initial_commit

author: root <root@artemis.panaceas.org> 2015-12-25 04:40:36 +0000
committer: root <root@artemis.panaceas.org> 2015-12-25 04:40:36 +0000
commit: 849369d6c66d3054688672f97d31fceb8e8230fb (patch)
tree: 6135abc790ca67dedbe07c39806591e70eda81ce /net
download: linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.gz
linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.bz2
linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.zip
1296 files changed, 694444 insertions, 0 deletions
diff --git a/net/802/Kconfig b/net/802/Kconfig
new file mode 100644
index 00000000..be33d27c
--- /dev/null
+++ b/net/802/Kconfig
@@ -0,0 +1,7 @@
+# Promptless tristate: can only be enabled by another option selecting it.
+# Selecting STP also selects LLC.  Builds stp.o (see net/802/Makefile).
+config STP
+	tristate
+	select LLC
+
+# Promptless tristate for the GARP code (garp.o); selecting it pulls in STP.
+config GARP
+	tristate
+	select STP
diff --git a/net/802/Makefile b/net/802/Makefile
new file mode 100644
index 00000000..7893d679
--- /dev/null
+++ b/net/802/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for the Linux 802.x protocol layers.
+#
+
+# Check the p8022 selections against net/core/Makefile.
+# p8022.o and psnap.o are listed under LLC, TR, IPX and ATALK alike, so
+# enabling any one of those symbols requests them.
+obj-$(CONFIG_LLC)	+= p8022.o psnap.o
+obj-$(CONFIG_TR)	+= p8022.o psnap.o tr.o
+obj-$(CONFIG_NET_FC)	+=                 fc.o
+obj-$(CONFIG_FDDI)	+=                 fddi.o
+obj-$(CONFIG_HIPPI)	+=                 hippi.o
+obj-$(CONFIG_IPX)	+= p8022.o psnap.o p8023.o
+obj-$(CONFIG_ATALK)	+= p8022.o psnap.o
+obj-$(CONFIG_STP)	+= stp.o
+obj-$(CONFIG_GARP)	+= garp.o
diff --git a/net/802/fc.c b/net/802/fc.c
new file mode 100644
index 00000000..1e49f2d4
--- /dev/null
+++ b/net/802/fc.c
@@ -0,0 +1,131 @@
+/*
+ * NET3:	Fibre Channel device handling subroutines
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *		Vineet Abraham <vma@iol.unh.edu>
+ *		v 1.0 03/22/99
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/fcdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <net/arp.h>
+
+/*
+ *	Build the Fibre Channel link-level header in front of the payload.
+ *
+ *	IP and ARP frames additionally carry an 802.2 SNAP header, because
+ *	the IPv4 code calls dev->hard_header directly.  A NULL saddr means
+ *	"use the device's own address".  The header length is returned,
+ *	negated when no destination address was supplied.
+ */
+
+static int fc_header(struct sk_buff *skb, struct net_device *dev,
+		     unsigned short type,
+		     const void *daddr, const void *saddr, unsigned len)
+{
+	const int with_snap = (type == ETH_P_IP || type == ETH_P_ARP);
+	int hdr_len = sizeof(struct fch_hdr);
+	struct fch_hdr *fch;
+
+	if (with_snap)
+		hdr_len += sizeof(struct fcllc);
+
+	fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+
+	if (with_snap) {
+		/* The LLC/SNAP part sits directly behind the FC header. */
+		struct fcllc *snap = (struct fcllc *)(fch + 1);
+
+		snap->ssap = EXTENDED_SAP;
+		snap->dsap = EXTENDED_SAP;
+		snap->llc = UI_CMD;
+		snap->protid[0] = 0x00;
+		snap->protid[1] = 0x00;
+		snap->protid[2] = 0x00;
+		snap->ethertype = htons(type);
+	}
+
+	memcpy(fch->saddr, saddr ? saddr : (const void *)dev->dev_addr,
+	       dev->addr_len);
+
+	if (!daddr)
+		return -hdr_len;
+
+	memcpy(fch->daddr, daddr, dev->addr_len);
+	return hdr_len;
+}
+
+/*
+ *	Called once neighbour discovery (e.g. ARP) has finished so that the
+ *	destination address can be filled in.  Only IP is handled; without
+ *	CONFIG_INET this is a no-op returning 0.
+ */
+
+static int fc_rebuild_header(struct sk_buff *skb)
+{
+#ifdef CONFIG_INET
+	struct fch_hdr *fch = (struct fch_hdr *)skb->data;
+	/* The LLC/SNAP header follows the FC header immediately. */
+	struct fcllc *snap = (struct fcllc *)(fch + 1);
+
+	if (snap->ethertype == htons(ETH_P_IP))
+		return arp_find(fch->daddr, skb);
+
+	printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(snap->ethertype));
+	return 0;
+#else
+	return 0;
+#endif
+}
+
+/* Link-layer header callbacks installed on every FC device by fc_setup(). */
+static const struct header_ops fc_header_ops = {
+	.rebuild = fc_rebuild_header,
+	.create	 = fc_header,
+};
+
+/* Fill a freshly allocated net_device with Fibre Channel defaults. */
+static void fc_setup(struct net_device *dev)
+{
+	/* Addressing: FC_ALEN-byte addresses, all-ones broadcast. */
+	dev->addr_len		= FC_ALEN;
+	memset(dev->broadcast, 0xFF, FC_ALEN);
+
+	/* Framing. */
+	dev->type		= ARPHRD_IEEE802;
+	dev->hard_header_len	= FC_HLEN;
+	dev->header_ops		= &fc_header_ops;
+	dev->mtu		= 2024;
+
+	dev->flags		= IFF_BROADCAST;
+	dev->tx_queue_len	= 100; /* Long queues on fc */
+}
+
+/**
+ * alloc_fcdev - Allocate and set up a fibre channel device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ *	for this fibre channel device
+ *
+ * Constructs a new net device named "fc%d" through alloc_netdev(), with
+ * fc_setup() filling in the fibre channel-generic values and a private
+ * data area of size @sizeof_priv.  A 32-byte (not bit) alignment is
+ * enforced for this private data area.
+ *
+ * Returns whatever alloc_netdev() returns.
+ */
+struct net_device *alloc_fcdev(int sizeof_priv)
+{
+	struct net_device *dev;
+
+	dev = alloc_netdev(sizeof_priv, "fc%d", fc_setup);
+	return dev;
+}
+EXPORT_SYMBOL(alloc_fcdev);
diff --git a/net/802/fddi.c b/net/802/fddi.c
new file mode 100644
index 00000000..94b3ad08
--- /dev/null
+++ b/net/802/fddi.c
@@ -0,0 +1,215 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		FDDI-type device handling.
+ *
+ * Version:	@(#)fddi.c	1.0.0	08/12/96
+ *
+ * Authors:	Lawrence V. Stefani, <stefani@lkg.dec.com>
+ *
+ *		fddi.c is based on previous eth.c and tr.c work by
+ *			Ross Biro
+ *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *			Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *			Florian La Roche, <rzsfl@rz.uni-sb.de>
+ *			Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Changes
+ *		Alan Cox		:	New arp/rebuild header
+ *		Maciej W. Rozycki	:	IPv6 support
+ */
+
+#include <linux/module.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <net/arp.h>
+#include <net/sock.h>
+
+/*
+ * Build the FDDI MAC header for an outgoing frame.
+ *
+ * IP, IPv6 and ARP get a full 802.2 SNAP header; any other type gets the
+ * shorter header (FDDI_K_8022_HLEN - 3) with no LLC fields filled in.
+ *
+ * saddr=NULL	means use device source address
+ * daddr=NULL	means leave destination address (eg unresolved arp)
+ *
+ * Returns the header length, negated when daddr was NULL.
+ */
+
+static int fddi_header(struct sk_buff *skb, struct net_device *dev,
+		       unsigned short type,
+		       const void *daddr, const void *saddr, unsigned len)
+{
+	const int snap = (type == ETH_P_IP || type == ETH_P_IPV6 ||
+			  type == ETH_P_ARP);
+	const int hl = snap ? FDDI_K_SNAP_HLEN : FDDI_K_8022_HLEN - 3;
+	struct fddihdr *fddi;
+
+	fddi = (struct fddihdr *)skb_push(skb, hl);
+	fddi->fc = FDDI_FC_K_ASYNC_LLC_DEF;
+
+	if (snap) {
+		fddi->hdr.llc_snap.dsap	     = FDDI_EXTENDED_SAP;
+		fddi->hdr.llc_snap.ssap	     = FDDI_EXTENDED_SAP;
+		fddi->hdr.llc_snap.ctrl	     = FDDI_UI_CMD;
+		fddi->hdr.llc_snap.oui[0]    = 0x00;
+		fddi->hdr.llc_snap.oui[1]    = 0x00;
+		fddi->hdr.llc_snap.oui[2]    = 0x00;
+		fddi->hdr.llc_snap.ethertype = htons(type);
+	}
+
+	/* Source address: caller supplied, or the device's own. */
+	memcpy(fddi->saddr, saddr != NULL ? saddr : (const void *)dev->dev_addr,
+	       dev->addr_len);
+
+	/* Without a destination the header is incomplete: signal with -hl. */
+	if (daddr == NULL)
+		return -hl;
+
+	memcpy(fddi->daddr, daddr, dev->addr_len);
+	return hl;
+}
+
+
+/*
+ * Complete the FDDI MAC header after address resolution.  With
+ * CONFIG_INET, IP frames are handed to arp_find() to fill in the
+ * destination address; every other case is reported and 0 is returned.
+ */
+
+static int fddi_rebuild_header(struct sk_buff	*skb)
+{
+	struct fddihdr *fddi = (struct fddihdr *)skb->data;
+
+#ifdef CONFIG_INET
+	/* Try to get ARP to resolve the header and fill destination address */
+	if (fddi->hdr.llc_snap.ethertype == htons(ETH_P_IP))
+		return arp_find(fddi->daddr, skb);
+#endif
+
+	printk("%s: Don't know how to resolve type %04X addresses.\n",
+	       skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
+	return 0;
+}
+
+
+/*
+ * Classify a received FDDI frame before it is passed up the stack.
+ *
+ * Records the device and MAC header position in the skb, strips the link
+ * header, sets skb->pkt_type for broadcast/multicast/other-host frames and
+ * returns the protocol ID.  A DSAP of 0xe0 is treated as plain 802.2;
+ * everything else is assumed to be 802.2 SNAP for now.
+ */
+
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+	struct fddihdr *fh = (struct fddihdr *)skb->data;
+	__be16 proto;
+
+	skb->dev = dev;
+	skb_reset_mac_header(skb);	/* point to frame control (FC) */
+
+	if (fh->hdr.llc_8022_1.dsap == 0xe0) {
+		skb_pull(skb, FDDI_K_8022_HLEN - 3);
+		proto = htons(ETH_P_802_2);
+	} else {
+		proto = fh->hdr.llc_snap.ethertype;
+		skb_pull(skb, FDDI_K_SNAP_HLEN);	/* adjust for 21 byte header */
+	}
+
+	/* Set packet type based on destination address and flag settings */
+	if (*fh->daddr & 0x01) {
+		if (memcmp(fh->daddr, dev->broadcast, FDDI_K_ALEN) != 0)
+			skb->pkt_type = PACKET_MULTICAST;
+		else
+			skb->pkt_type = PACKET_BROADCAST;
+	} else if ((dev->flags & IFF_PROMISC) &&
+		   memcmp(fh->daddr, dev->dev_addr, FDDI_K_ALEN) != 0) {
+		skb->pkt_type = PACKET_OTHERHOST;
+	}
+
+	return proto;
+}
+
+EXPORT_SYMBOL(fddi_type_trans);
+
+/*
+ * Set a new MTU on an FDDI device.  Only values within
+ * [FDDI_K_SNAP_HLEN, FDDI_K_SNAP_DLEN] are accepted; anything else
+ * yields -EINVAL and leaves the device untouched.
+ */
+int fddi_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if ((new_mtu >= FDDI_K_SNAP_HLEN) && (new_mtu <= FDDI_K_SNAP_DLEN)) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(fddi_change_mtu);
+
+/* Link-layer header callbacks installed on every FDDI device by fddi_setup(). */
+static const struct header_ops fddi_header_ops = {
+	.rebuild	= fddi_rebuild_header,
+	.create		= fddi_header,
+};
+
+
+/* Fill a freshly allocated net_device with FDDI defaults. */
+static void fddi_setup(struct net_device *dev)
+{
+	/* Addressing: FDDI_K_ALEN-byte addresses, all-ones broadcast. */
+	dev->addr_len		= FDDI_K_ALEN;
+	memset(dev->broadcast, 0xFF, FDDI_K_ALEN);
+
+	/* Framing. */
+	dev->type		= ARPHRD_FDDI;
+	dev->header_ops		= &fddi_header_ops;
+	dev->hard_header_len	= FDDI_K_SNAP_HLEN+3;	/* Assume 802.2 SNAP hdr len + 3 pad bytes */
+	dev->mtu		= FDDI_K_SNAP_DLEN;	/* Assume max payload of 802.2 SNAP frame */
+
+	dev->flags		= IFF_BROADCAST | IFF_MULTICAST;
+	dev->tx_queue_len	= 100;			/* Long queues on FDDI */
+}
+
+/**
+ * alloc_fddidev - Allocate and set up an FDDI device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ *	for this FDDI device
+ *
+ * Constructs a new net device named "fddi%d" through alloc_netdev(), with
+ * fddi_setup() filling in the FDDI-generic values and a private data area
+ * of size @sizeof_priv.  A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ *
+ * Returns whatever alloc_netdev() returns.
+ */
+struct net_device *alloc_fddidev(int sizeof_priv)
+{
+	struct net_device *dev;
+
+	dev = alloc_netdev(sizeof_priv, "fddi%d", fddi_setup);
+	return dev;
+}
+EXPORT_SYMBOL(alloc_fddidev);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/garp.c b/net/802/garp.c
new file mode 100644
index 00000000..16102951
--- /dev/null
+++ b/net/802/garp.c
@@ -0,0 +1,635 @@
+/*
+ *	IEEE 802.1D Generic Attribute Registration Protocol (GARP)
+ *
+ *	Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/garp.h>
+#include <asm/unaligned.h>
+
+/*
+ * Join time in milliseconds, exposed as a module parameter (mode 0644).
+ * garp_join_timer_arm() uses it as the scale for the randomised delay
+ * before the next join timer expiry.
+ */
+static unsigned int garp_join_time __read_mostly = 200;
+module_param(garp_join_time, uint, 0644);
+MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)");
+MODULE_LICENSE("GPL");
+
+/*
+ * Applicant state machine, indexed as [current state][event].
+ *
+ * Each entry names the state to move to and, optionally, an action to run
+ * while doing so.  garp_attr_event() ignores the event entirely when the
+ * next state is GARP_APPLICANT_INVALID.  Entries that omit .action leave
+ * it zero-initialised (presumably GARP_ACTION_NONE - confirm against
+ * <net/garp.h>).
+ */
+static const struct garp_state_trans {
+	u8	state;
+	u8	action;
+} garp_applicant_state_table[GARP_APPLICANT_MAX + 1][GARP_EVENT_MAX + 1] = {
+	[GARP_APPLICANT_VA] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_AA,
+						    .action = GARP_ACTION_S_JOIN_IN },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_AA },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_LA },
+	},
+	[GARP_APPLICANT_AA] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_QA,
+						    .action = GARP_ACTION_S_JOIN_IN },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_QA },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_LA },
+	},
+	[GARP_APPLICANT_QA] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_QA },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_LA },
+	},
+	[GARP_APPLICANT_LA] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_VO,
+						    .action = GARP_ACTION_S_LEAVE_EMPTY },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_LA },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_LA },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_LA },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_VA },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_INVALID },
+	},
+	[GARP_APPLICANT_VP] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_AA,
+						    .action = GARP_ACTION_S_JOIN_IN },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_AP },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_VO },
+	},
+	[GARP_APPLICANT_AP] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_QA,
+						    .action = GARP_ACTION_S_JOIN_IN },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_QP },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_AO },
+	},
+	[GARP_APPLICANT_QP] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_QP },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_QO },
+	},
+	[GARP_APPLICANT_VO] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_AO },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_VP },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_INVALID },
+	},
+	[GARP_APPLICANT_AO] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_QO },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_AP },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_INVALID },
+	},
+	[GARP_APPLICANT_QO] = {
+		[GARP_EVENT_TRANSMIT_PDU]	= { .state = GARP_APPLICANT_INVALID },
+		[GARP_EVENT_R_JOIN_IN]		= { .state = GARP_APPLICANT_QO },
+		[GARP_EVENT_R_JOIN_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_EMPTY]		= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_LEAVE_IN]		= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_R_LEAVE_EMPTY]	= { .state = GARP_APPLICANT_VO },
+		[GARP_EVENT_REQ_JOIN]		= { .state = GARP_APPLICANT_QP },
+		[GARP_EVENT_REQ_LEAVE]		= { .state = GARP_APPLICANT_INVALID },
+	},
+};
+
+/*
+ * Three-way comparison of a stored attribute against (type, len, data).
+ * Ordering is by type first, then by data length, then by the raw data
+ * bytes.  Returns zero only when all three match.
+ */
+static int garp_attr_cmp(const struct garp_attr *attr,
+			 const void *data, u8 len, u8 type)
+{
+	int diff;
+
+	diff = attr->type - type;
+	if (diff != 0)
+		return diff;
+
+	diff = attr->dlen - len;
+	if (diff != 0)
+		return diff;
+
+	return memcmp(attr->data, data, len);
+}
+
+/*
+ * Find the attribute matching (type, len, data) in the applicant's GID
+ * rbtree.  Returns the attribute, or NULL when it is not registered.
+ */
+static struct garp_attr *garp_attr_lookup(const struct garp_applicant *app,
+					  const void *data, u8 len, u8 type)
+{
+	struct rb_node *cur;
+
+	for (cur = app->gid.rb_node; cur != NULL; ) {
+		struct garp_attr *cand = rb_entry(cur, struct garp_attr, node);
+		int d = garp_attr_cmp(cand, data, len, type);
+
+		if (d == 0)
+			return cand;
+		/* Negative comparisons descend left, positive ones right. */
+		cur = (d < 0) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Find or create the attribute (type, len, data) in the applicant's GID
+ * rbtree.
+ *
+ * The tree is walked first, using the same ordering as garp_attr_lookup()
+ * (negative comparison -> left, positive -> right).  If an equal attribute
+ * is already present it is returned unchanged instead of being inserted a
+ * second time; the previous separate insert helper had no branch for the
+ * "equal" case and would spin forever on a duplicate.
+ *
+ * A new attribute starts in state GARP_APPLICANT_VO and carries a private
+ * copy of @data.  Allocation uses GFP_ATOMIC.
+ *
+ * Returns the existing or newly created attribute, or NULL when the
+ * allocation fails.
+ */
+static struct garp_attr *garp_attr_create(struct garp_applicant *app,
+					  const void *data, u8 len, u8 type)
+{
+	struct rb_node *parent = NULL, **p = &app->gid.rb_node;
+	struct garp_attr *attr;
+	int d;
+
+	while (*p) {
+		parent = *p;
+		attr = rb_entry(parent, struct garp_attr, node);
+		d = garp_attr_cmp(attr, data, len, type);
+		if (d < 0)
+			p = &parent->rb_left;
+		else if (d > 0)
+			p = &parent->rb_right;
+		else
+			return attr;	/* already registered: reuse it */
+	}
+
+	attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC);
+	if (!attr)
+		return attr;
+	attr->state = GARP_APPLICANT_VO;
+	attr->type  = type;
+	attr->dlen  = len;
+	memcpy(attr->data, data, len);
+
+	/* Link at the leaf position found above, then rebalance. */
+	rb_link_node(&attr->node, parent, p);
+	rb_insert_color(&attr->node, &app->gid);
+	return attr;
+}
+
+/* Unlink @attr from the applicant's GID rbtree and free it. */
+static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr)
+{
+	rb_erase(&attr->node, &app->gid);
+	kfree(attr);
+}
+
+/*
+ * Start a new outgoing PDU for @app.
+ *
+ * Allocates an skb (GFP_ATOMIC) sized from the device MTU plus link-layer
+ * headroom, reserves room for the link-layer and LLC headers, writes the
+ * GARP PDU header and stores the skb in app->pdu.
+ *
+ * Returns 0 on success or -ENOMEM when the skb cannot be allocated.
+ */
+static int garp_pdu_init(struct garp_applicant *app)
+{
+	struct garp_pdu_hdr *hdr;
+	struct sk_buff *pdu;
+
+#define LLC_RESERVE	sizeof(struct llc_pdu_un)
+	pdu = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev),
+			GFP_ATOMIC);
+	if (pdu == NULL)
+		return -ENOMEM;
+
+	skb_reserve(pdu, LL_RESERVED_SPACE(app->dev) + LLC_RESERVE);
+	pdu->protocol = htons(ETH_P_802_2);
+	pdu->dev = app->dev;
+
+	/* The protocol field may be unaligned within the buffer. */
+	hdr = (struct garp_pdu_hdr *)__skb_put(pdu, sizeof(*hdr));
+	put_unaligned(htons(GARP_PROTOCOL_ID), &hdr->protocol);
+
+	app->pdu = pdu;
+	return 0;
+}
+
+/*
+ * Append a single GARP_END_MARK byte to the PDU under construction.
+ * Returns 0 on success, -1 when the PDU has no tailroom left.
+ */
+static int garp_pdu_append_end_mark(struct garp_applicant *app)
+{
+	u8 *mark;
+
+	if (skb_tailroom(app->pdu) < sizeof(u8))
+		return -1;
+
+	mark = (u8 *)__skb_put(app->pdu, sizeof(u8));
+	*mark = GARP_END_MARK;
+	return 0;
+}
+
+/*
+ * Finish the PDU under construction, if any, and move it to the
+ * applicant's transmit queue.
+ *
+ * Two end marks are appended on a best-effort basis (their return values
+ * are not checked), then the LLC UI command header and the MAC header
+ * addressed to the application's group address are added.  app->pdu is
+ * cleared afterwards so the next attribute starts a new PDU.
+ */
+static void garp_pdu_queue(struct garp_applicant *app)
+{
+	struct sk_buff *pdu = app->pdu;
+
+	if (pdu == NULL)
+		return;
+
+	garp_pdu_append_end_mark(app);
+	garp_pdu_append_end_mark(app);
+
+	llc_pdu_header_init(pdu, LLC_PDU_TYPE_U, LLC_SAP_BSPAN,
+			    LLC_SAP_BSPAN, LLC_PDU_CMD);
+	llc_pdu_init_as_ui_cmd(pdu);
+	llc_mac_hdr_init(pdu, app->dev->dev_addr,
+			 app->app->proto.group_address);
+
+	skb_queue_tail(&app->queue, pdu);
+	app->pdu = NULL;
+}
+
+/* Drain the applicant's transmit queue, handing each PDU to the device layer. */
+static void garp_queue_xmit(struct garp_applicant *app)
+{
+	struct sk_buff *pdu;
+
+	for (;;) {
+		pdu = skb_dequeue(&app->queue);
+		if (pdu == NULL)
+			break;
+		dev_queue_xmit(pdu);
+	}
+}
+
+/*
+ * Open a new message for @attrtype in the PDU under construction and
+ * remember that type in the skb control block, so following attributes of
+ * the same type can be appended to it.
+ * Returns 0 on success, -1 when the message header does not fit.
+ */
+static int garp_pdu_append_msg(struct garp_applicant *app, u8 attrtype)
+{
+	struct sk_buff *pdu = app->pdu;
+	struct garp_msg_hdr *msg;
+
+	if (skb_tailroom(pdu) < sizeof(*msg))
+		return -1;
+
+	msg = (struct garp_msg_hdr *)__skb_put(pdu, sizeof(*msg));
+	msg->attrtype = attrtype;
+	garp_cb(pdu)->cur_type = attrtype;
+	return 0;
+}
+
+/*
+ * Append @attr with the given @event to the PDU under construction.
+ *
+ * A PDU is started on demand.  If the attribute's type differs from the
+ * type of the message currently open in the PDU, the open message (if any)
+ * is closed with an end mark and a new message header is written.  When
+ * anything does not fit, the current PDU is queued and the whole procedure
+ * restarts with a fresh one.
+ *
+ * Returns 0 on success or the negative error from garp_pdu_init().
+ */
+static int garp_pdu_append_attr(struct garp_applicant *app,
+				const struct garp_attr *attr,
+				enum garp_attr_event event)
+{
+	struct garp_attr_hdr *ga;
+	unsigned int len;
+	int err;
+again:
+	if (!app->pdu) {
+		err = garp_pdu_init(app);
+		if (err < 0)
+			return err;
+	}
+
+	/* Switch messages when the attribute type changes. */
+	if (garp_cb(app->pdu)->cur_type != attr->type) {
+		/* A non-zero cur_type means a message is open: terminate it. */
+		if (garp_cb(app->pdu)->cur_type &&
+		    garp_pdu_append_end_mark(app) < 0)
+			goto queue;
+		if (garp_pdu_append_msg(app, attr->type) < 0)
+			goto queue;
+	}
+
+	/* The on-wire length covers the attribute header plus its data. */
+	len = sizeof(*ga) + attr->dlen;
+	if (skb_tailroom(app->pdu) < len)
+		goto queue;
+	ga = (struct garp_attr_hdr *)__skb_put(app->pdu, len);
+	ga->len   = len;
+	ga->event = event;
+	memcpy(ga->data, attr->data, attr->dlen);
+	return 0;
+
+queue:
+	/* Out of room: ship what we have and retry with a new PDU. */
+	garp_pdu_queue(app);
+	goto again;
+}
+
+/*
+ * Feed @event into the applicant state machine for @attr.
+ *
+ * The transition is looked up in garp_applicant_state_table.  Events whose
+ * next state is GARP_APPLICANT_INVALID are ignored.  Otherwise the
+ * associated action runs and the attribute moves to the new state.
+ * Note that the LEAVE_EMPTY action frees @attr, so callers must not touch
+ * it after this function returns.
+ */
+static void garp_attr_event(struct garp_applicant *app,
+			    struct garp_attr *attr, enum garp_event event)
+{
+	enum garp_applicant_state state;
+
+	state = garp_applicant_state_table[attr->state][event].state;
+	if (state == GARP_APPLICANT_INVALID)
+		return;
+
+	switch (garp_applicant_state_table[attr->state][event].action) {
+	case GARP_ACTION_NONE:
+		break;
+	case GARP_ACTION_S_JOIN_IN:
+		/* When appending the attribute fails, don't update state in
+		 * order to retry on next TRANSMIT_PDU event. */
+		if (garp_pdu_append_attr(app, attr, GARP_JOIN_IN) < 0)
+			return;
+		break;
+	case GARP_ACTION_S_LEAVE_EMPTY:
+		/* Best effort: the result of the append is not checked. */
+		garp_pdu_append_attr(app, attr, GARP_LEAVE_EMPTY);
+		/* As a pure applicant, sending a leave message implies that
+		 * the attribute was unregistered and can be destroyed. */
+		garp_attr_destroy(app, attr);
+		return;
+	default:
+		WARN_ON(1);
+	}
+
+	attr->state = state;
+}
+
+/*
+ * Declare the attribute (type, len, data) for application @appl on @dev.
+ *
+ * The port and applicant are fetched with rtnl_dereference().  Under the
+ * applicant lock the attribute is created and a REQ_JOIN event is fed
+ * into its state machine.
+ *
+ * Returns 0 on success or -ENOMEM when the attribute cannot be created.
+ */
+int garp_request_join(const struct net_device *dev,
+		      const struct garp_application *appl,
+		      const void *data, u8 len, u8 type)
+{
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+	struct garp_attr *attr;
+	int err = 0;
+
+	spin_lock_bh(&app->lock);
+	attr = garp_attr_create(app, data, len, type);
+	if (attr)
+		garp_attr_event(app, attr, GARP_EVENT_REQ_JOIN);
+	else
+		err = -ENOMEM;
+	spin_unlock_bh(&app->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(garp_request_join);
+
+/*
+ * Withdraw the attribute (type, len, data) for application @appl on @dev.
+ *
+ * Under the applicant lock the attribute is looked up and, if present,
+ * receives a REQ_LEAVE event.  Unknown attributes are silently ignored.
+ */
+void garp_request_leave(const struct net_device *dev,
+			const struct garp_application *appl,
+			const void *data, u8 len, u8 type)
+{
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+	struct garp_attr *attr;
+
+	spin_lock_bh(&app->lock);
+	attr = garp_attr_lookup(app, data, len, type);
+	if (attr != NULL)
+		garp_attr_event(app, attr, GARP_EVENT_REQ_LEAVE);
+	spin_unlock_bh(&app->lock);
+}
+EXPORT_SYMBOL_GPL(garp_request_leave);
+
+/*
+ * Deliver @event to every attribute in the applicant's GID tree.
+ *
+ * The successor is fetched before the event is delivered because
+ * garp_attr_event() may destroy the current attribute.
+ */
+static void garp_gid_event(struct garp_applicant *app, enum garp_event event)
+{
+	struct rb_node *cur = rb_first(&app->gid);
+
+	while (cur != NULL) {
+		struct rb_node *following = rb_next(cur);
+		struct garp_attr *attr = rb_entry(cur, struct garp_attr, node);
+
+		garp_attr_event(app, attr, event);
+		cur = following;
+	}
+}
+
+/*
+ * (Re)arm the join timer with a randomised delay.
+ *
+ * The join time in jiffies is widened to 64 bits, multiplied by
+ * net_random() and shifted right by 32.  Assuming net_random() yields a
+ * 32-bit value (TODO confirm), the delay lies in [0, join time).
+ */
+static void garp_join_timer_arm(struct garp_applicant *app)
+{
+	unsigned long delay;
+
+	delay = (u64)msecs_to_jiffies(garp_join_time) * net_random() >> 32;
+	mod_timer(&app->join_timer, jiffies + delay);
+}
+
+/*
+ * Join timer callback (installed by garp_init_applicant(), @data is the
+ * applicant).  Under the applicant lock it delivers TRANSMIT_PDU to all
+ * attributes and queues the resulting PDU; the queue is then transmitted
+ * outside the lock and the timer is re-armed.
+ */
+static void garp_join_timer(unsigned long data)
+{
+	struct garp_applicant *app = (struct garp_applicant *)data;
+
+	spin_lock(&app->lock);
+	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+	garp_pdu_queue(app);
+	spin_unlock(&app->lock);
+
+	garp_queue_xmit(app);
+	garp_join_timer_arm(app);
+}
+
+/*
+ * Check for an end mark at the current parse position.
+ *
+ * Returns -1 when parsing of the current list should stop: either no more
+ * data can be pulled, or an end mark was found (and consumed).  Returns 0
+ * when more items follow.
+ */
+static int garp_pdu_parse_end_mark(struct sk_buff *skb)
+{
+	if (!pskb_may_pull(skb, sizeof(u8)))
+		return -1;
+
+	if (*skb->data != GARP_END_MARK)
+		return 0;
+
+	skb_pull(skb, sizeof(u8));
+	return -1;
+}
+
+/*
+ * Parse one attribute of type @attrtype from @skb and feed the matching
+ * event into the applicant state machine.
+ *
+ * The attribute's length field covers its header plus data (see
+ * garp_pdu_append_attr()), so the data length is the length field minus
+ * the header size.  The previous code computed the difference the other
+ * way round, which wrapped for every attribute that actually carried
+ * data, so received attributes could never match a registered one.
+ *
+ * The header pointer is reloaded after the second pskb_may_pull(), which
+ * is documented to possibly move the skb header.
+ *
+ * Returns 0 when the attribute was consumed (including attribute types
+ * and events that are ignored), -1 on a malformed attribute.
+ */
+static int garp_pdu_parse_attr(struct garp_applicant *app, struct sk_buff *skb,
+			       u8 attrtype)
+{
+	const struct garp_attr_hdr *ga;
+	struct garp_attr *attr;
+	enum garp_event event;
+	unsigned int len, dlen;
+
+	if (!pskb_may_pull(skb, sizeof(*ga)))
+		return -1;
+	ga = (struct garp_attr_hdr *)skb->data;
+	len = ga->len;
+	if (len < sizeof(*ga))
+		return -1;
+
+	if (!pskb_may_pull(skb, len))
+		return -1;
+	/* The pull above may have reallocated the header: reload. */
+	ga = (struct garp_attr_hdr *)skb->data;
+	skb_pull(skb, len);
+	dlen = len - sizeof(*ga);
+
+	/* Attribute types the application does not know are skipped. */
+	if (attrtype > app->app->maxattr)
+		return 0;
+
+	switch (ga->event) {
+	case GARP_LEAVE_ALL:
+		/* LeaveAll carries no data and applies to every attribute. */
+		if (dlen != 0)
+			return -1;
+		garp_gid_event(app, GARP_EVENT_R_LEAVE_EMPTY);
+		return 0;
+	case GARP_JOIN_EMPTY:
+		event = GARP_EVENT_R_JOIN_EMPTY;
+		break;
+	case GARP_JOIN_IN:
+		event = GARP_EVENT_R_JOIN_IN;
+		break;
+	case GARP_LEAVE_EMPTY:
+		event = GARP_EVENT_R_LEAVE_EMPTY;
+		break;
+	case GARP_EMPTY:
+		event = GARP_EVENT_R_EMPTY;
+		break;
+	default:
+		return 0;
+	}
+
+	/* All remaining events refer to a specific attribute value. */
+	if (dlen == 0)
+		return -1;
+	attr = garp_attr_lookup(app, ga->data, dlen, attrtype);
+	if (attr == NULL)
+		return 0;
+	garp_attr_event(app, attr, event);
+	return 0;
+}
+
+/*
+ * Parse one message (an attribute type followed by a list of attributes
+ * terminated by an end mark) from @skb.
+ *
+ * The attribute type is copied out of the message header before any
+ * attribute is parsed: the attribute parser calls pskb_may_pull(), after
+ * which a pointer into the old header must not be dereferenced any more.
+ *
+ * Returns 0 on success, -1 when the message is malformed (including an
+ * attribute type of 0).
+ */
+static int garp_pdu_parse_msg(struct garp_applicant *app, struct sk_buff *skb)
+{
+	const struct garp_msg_hdr *gm;
+	u8 attrtype;
+
+	if (!pskb_may_pull(skb, sizeof(*gm)))
+		return -1;
+	gm = (struct garp_msg_hdr *)skb->data;
+	attrtype = gm->attrtype;
+	if (attrtype == 0)
+		return -1;
+	skb_pull(skb, sizeof(*gm));
+
+	while (skb->len > 0) {
+		if (garp_pdu_parse_attr(app, skb, attrtype) < 0)
+			return -1;
+		if (garp_pdu_parse_end_mark(skb) < 0)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Receive hook for GARP PDUs (installed as proto.rcv by
+ * garp_register_application(); proto->data is the application).
+ *
+ * Looks up the port and applicant for @dev with rcu_dereference(),
+ * validates the PDU header's protocol ID and then parses messages under
+ * the applicant lock until the data runs out, a message is malformed or
+ * an end mark is seen.  The skb is always freed before returning.
+ */
+static void garp_pdu_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+			 struct net_device *dev)
+{
+	struct garp_application *appl = proto->data;
+	struct garp_port *port;
+	struct garp_applicant *app;
+	const struct garp_pdu_hdr *gp;
+
+	/* Drop the PDU when GARP is not active for this device/application. */
+	port = rcu_dereference(dev->garp_port);
+	if (!port)
+		goto err;
+	app = rcu_dereference(port->applicants[appl->type]);
+	if (!app)
+		goto err;
+
+	if (!pskb_may_pull(skb, sizeof(*gp)))
+		goto err;
+	gp = (struct garp_pdu_hdr *)skb->data;
+	if (get_unaligned(&gp->protocol) != htons(GARP_PROTOCOL_ID))
+		goto err;
+	skb_pull(skb, sizeof(*gp));
+
+	spin_lock(&app->lock);
+	while (skb->len > 0) {
+		if (garp_pdu_parse_msg(app, skb) < 0)
+			break;
+		if (garp_pdu_parse_end_mark(skb) < 0)
+			break;
+	}
+	spin_unlock(&app->lock);
+err:
+	kfree_skb(skb);
+}
+
+/*
+ * Allocate a zeroed garp_port and publish it as dev->garp_port.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int garp_init_port(struct net_device *dev)
+{
+	struct garp_port *port;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return -ENOMEM;
+	rcu_assign_pointer(dev->garp_port, port);
+	return 0;
+}
+
+/*
+ * Release the device's garp_port once no applicant uses it any more.
+ * If any applicant slot is still populated the port is left alone;
+ * otherwise dev->garp_port is cleared and the port is freed via
+ * kfree_rcu().
+ */
+static void garp_release_port(struct net_device *dev)
+{
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	unsigned int idx = 0;
+
+	while (idx <= GARP_APPLICATION_MAX) {
+		if (rtnl_dereference(port->applicants[idx]) != NULL)
+			return;
+		idx++;
+	}
+
+	rcu_assign_pointer(dev->garp_port, NULL);
+	kfree_rcu(port, rcu);
+}
+
+/*
+ * Start GARP application @appl on @dev.  Must be called with RTNL held.
+ *
+ * Creates the device's garp_port on first use, allocates the applicant,
+ * subscribes the device to the application's group address, publishes the
+ * applicant in the port and arms the join timer.
+ *
+ * Returns 0 on success or a negative error; on failure everything set up
+ * so far is undone.
+ */
+int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
+{
+	struct garp_applicant *app;
+	int err;
+
+	ASSERT_RTNL();
+
+	if (!rtnl_dereference(dev->garp_port)) {
+		err = garp_init_port(dev);
+		if (err < 0)
+			goto err1;
+	}
+
+	err = -ENOMEM;
+	app = kzalloc(sizeof(*app), GFP_KERNEL);
+	if (!app)
+		goto err2;
+
+	err = dev_mc_add(dev, appl->proto.group_address);
+	if (err < 0)
+		goto err3;
+
+	app->dev = dev;
+	app->app = appl;
+	app->gid = RB_ROOT;
+	spin_lock_init(&app->lock);
+	skb_queue_head_init(&app->queue);
+	rcu_assign_pointer(dev->garp_port->applicants[appl->type], app);
+	setup_timer(&app->join_timer, garp_join_timer, (unsigned long)app);
+	garp_join_timer_arm(app);
+	return 0;
+
+err3:
+	kfree(app);
+err2:
+	/* Frees the port again only if no applicant is using it. */
+	garp_release_port(dev);
+err1:
+	return err;
+}
+EXPORT_SYMBOL_GPL(garp_init_applicant);
+
+/*
+ * Stop GARP application @appl on @dev.  Must be called with RTNL held.
+ *
+ * The applicant is unpublished from the port, its join timer is stopped
+ * and a final TRANSMIT_PDU event flushes all pending messages.  The flush
+ * walks and modifies the GID tree and the PDU under construction, so it
+ * runs under app->lock like every other user of that state: a receiver
+ * that fetched the applicant pointer before it was cleared may still be
+ * inside garp_pdu_rcv(), since the applicant is only freed via
+ * kfree_rcu().  Transmission itself happens outside the lock, as in
+ * garp_join_timer().
+ */
+void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl)
+{
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
+
+	ASSERT_RTNL();
+
+	rcu_assign_pointer(port->applicants[appl->type], NULL);
+
+	/* Delete timer and generate a final TRANSMIT_PDU event to flush out
+	 * all pending messages before the applicant is gone. */
+	del_timer_sync(&app->join_timer);
+
+	spin_lock_bh(&app->lock);
+	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+	garp_pdu_queue(app);
+	spin_unlock_bh(&app->lock);
+
+	garp_queue_xmit(app);
+
+	dev_mc_del(dev, appl->proto.group_address);
+	kfree_rcu(app, rcu);
+	garp_release_port(dev);
+}
+EXPORT_SYMBOL_GPL(garp_uninit_applicant);
+
+/*
+ * Register @appl with the STP protocol layer: garp_pdu_rcv() becomes its
+ * receive hook, with the application itself as the hook's private data.
+ * Returns the result of stp_proto_register().
+ */
+int garp_register_application(struct garp_application *appl)
+{
+	appl->proto.rcv = garp_pdu_rcv;
+	appl->proto.data = appl;
+	return stp_proto_register(&appl->proto);
+}
+EXPORT_SYMBOL_GPL(garp_register_application);
+
+/* Undo garp_register_application() by unregistering the STP protocol hook. */
+void garp_unregister_application(struct garp_application *appl)
+{
+	stp_proto_unregister(&appl->proto);
+}
+EXPORT_SYMBOL_GPL(garp_unregister_application);
diff --git a/net/802/hippi.c b/net/802/hippi.c
new file mode 100644
index 00000000..91aca878
--- /dev/null
+++ b/net/802/hippi.c
@@ -0,0 +1,235 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		HIPPI-type device handling.
+ *
+ * Version:	@(#)hippi.c	1.0.0	05/29/97
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Florian  La Roche, <rzsfl@rz.uni-sb.de>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Jes Sorensen, <Jes.Sorensen@cern.ch>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/hippidevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <net/arp.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+/*
+ * Create the HIPPI MAC header for an arbitrary protocol layer
+ *
+ * daddr=NULL	means leave destination address (eg unresolved arp)
+ *
+ * The source switch address is always taken from dev->dev_addr; the
+ * saddr argument is not read by this function.
+ *
+ * If len is 0 the payload length is derived from skb->len and a message
+ * is logged.
+ *
+ * Returns HIPPI_HLEN when a destination was supplied, -HIPPI_HLEN when
+ * daddr is NULL.
+ */
+
+static int hippi_header(struct sk_buff *skb, struct net_device *dev,
+			unsigned short type,
+			const void *daddr, const void *saddr, unsigned len)
+{
+	struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+	struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
+
+	if (!len){
+		/* skb->len already includes the header pushed above. */
+		len = skb->len - HIPPI_HLEN;
+		printk("hippi_header(): length not supplied\n");
+	}
+
+	/*
+	 * Due to the stupidity of the little endian byte-order we
+	 * have to set the fp field this way.
+	 */
+	hip->fp.fixed		= htonl(0x04800018);
+	hip->fp.d2_size		= htonl(len + 8);
+	hip->le.fc		= 0;
+	hip->le.double_wide	= 0;	/* only HIPPI 800 for the time being */
+	hip->le.message_type	= 0;	/* Data PDU */
+
+	hip->le.dest_addr_type	= 2;	/* 12 bit SC address */
+	hip->le.src_addr_type	= 2;	/* 12 bit SC address */
+
+	/* Last three bytes of the device address form the switch address. */
+	memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
+	/*
+	 * Clears 16 bytes starting at the reserved field.
+	 * NOTE(review): presumably this deliberately spans the fields that
+	 * follow 'reserved' in the LE header - confirm against the struct.
+	 */
+	memset(&hip->le.reserved, 0, 16);
+
+	hip->snap.dsap		= HIPPI_EXTENDED_SAP;
+	hip->snap.ssap		= HIPPI_EXTENDED_SAP;
+	hip->snap.ctrl		= HIPPI_UI_CMD;
+	hip->snap.oui[0]	= 0x00;
+	hip->snap.oui[1]	= 0x00;
+	hip->snap.oui[2]	= 0x00;
+	hip->snap.ethertype	= htons(type);
+
+	if (daddr)
+	{
+		memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
+		/* Bytes 2..5 of the address are stored as the I-FIELD. */
+		memcpy(&hcb->ifield, daddr + 2, 4);
+		return HIPPI_HLEN;
+	}
+	/* No destination yet: zero I-FIELD, negative length signals that. */
+	hcb->ifield = 0;
+	return -((int)HIPPI_HLEN);
+}
+
+
+/*
+ * Rebuild the HIPPI MAC header. This is called after an ARP has
+ * completed on this sk_buff. We now let ARP fill in the other fields.
+ *
+ * Returns 0 (after logging a debug message) for any ethertype other
+ * than IP, otherwise the return value of arp_find().
+ */
+
+static int hippi_rebuild_header(struct sk_buff *skb)
+{
+	struct hippi_hdr *hip = (struct hippi_hdr *)skb->data;
+
+	/*
+	 * Only IP is currently supported
+	 */
+
+	if(hip->snap.ethertype != htons(ETH_P_IP))
+	{
+		printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",skb->dev->name,ntohs(hip->snap.ethertype));
+		return 0;
+	}
+
+	/*
+	 * We don't support dynamic ARP on HIPPI, but we use the ARP
+	 * static ARP tables to hold the I-FIELDs.
+	 */
+	return arp_find(hip->le.daddr, skb);
+}
+
+
+/*
+ *	Determine the packet's protocol ID.
+ *
+ *	Binds the skb to @dev, records the current data pointer as the MAC
+ *	header, strips the HIPPI header and returns the SNAP ethertype as
+ *	found in the frame (network byte order). No packet-type
+ *	classification is done here.
+ */
+
+__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct hippi_hdr *hdr;
+
+	skb->dev = dev;
+	skb_reset_mac_header(skb);
+	hdr = (const struct hippi_hdr *)skb_mac_header(skb);
+
+	/* hdr keeps pointing at the header after the pull. */
+	skb_pull(skb, HIPPI_HLEN);
+
+	return hdr->snap.ethertype;
+}
+
+EXPORT_SYMBOL(hippi_type_trans);
+
+/*
+ * Set the device MTU. Accepts 68..65280 inclusive; anything else is
+ * rejected with -EINVAL and the MTU is left untouched.
+ */
+int hippi_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 65280) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(hippi_change_mtu);
+
+/*
+ * Set the hardware address from the struct sockaddr passed in @p.
+ * For HIPPI the lower 4 bytes of the hardware address are used as the
+ * I-FIELD rather than as an actual hardware address.
+ *
+ * Returns -EBUSY while the interface is running, 0 otherwise.
+ */
+int hippi_mac_addr(struct net_device *dev, void *p)
+{
+	const struct sockaddr *sa = p;
+
+	if (!netif_running(dev)) {
+		memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EBUSY;
+}
+EXPORT_SYMBOL(hippi_mac_addr);
+
+/*
+ * Adjust neighbour parameters for a HIPPI device: multicast probes are
+ * always switched off, unicast probes for every family except IPv6.
+ * Always returns 0.
+ */
+int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
+{
+	const int is_ipv6 = (p->tbl->family == AF_INET6);
+
+	/* Never send broadcast/multicast ARP messages */
+	p->mcast_probes = 0;
+
+	/*
+	 * IPv6 unicast probes are valid even on NBMA because they are
+	 * carried in normal IPv6 packets, so only the other families
+	 * lose them. Should be a generic flag.
+	 */
+	if (!is_ipv6)
+		p->ucast_probes = 0;
+
+	return 0;
+}
+EXPORT_SYMBOL(hippi_neigh_setup_dev);
+
+/* Link-layer header callbacks installed on every HIPPI device by hippi_setup(). */
+static const struct header_ops hippi_header_ops = {
+	.create		= hippi_header,
+	.rebuild	= hippi_rebuild_header,
+};
+
+
+/*
+ * Fill in the HIPPI-generic defaults of a freshly allocated net_device.
+ * Passed to alloc_netdev() as the setup callback by alloc_hippi_dev().
+ */
+static void hippi_setup(struct net_device *dev)
+{
+	dev->header_ops			= &hippi_header_ops;
+
+	/*
+	 * We don't support HIPPI `ARP' for the time being, and probably
+	 * never will unless someone else implements it. However we
+	 * still need a fake ARPHRD to make ifconfig and friends play ball.
+	 */
+	dev->type		= ARPHRD_HIPPI;
+	dev->hard_header_len 	= HIPPI_HLEN;
+	dev->mtu		= 65280;	/* same value as the upper bound in hippi_change_mtu() */
+	dev->addr_len		= HIPPI_ALEN;
+	dev->tx_queue_len	= 25 /* 5 */;
+	memset(dev->broadcast, 0xFF, HIPPI_ALEN);
+
+
+	/*
+	 * HIPPI doesn't support broadcast+multicast and we only use
+	 * static ARP tables. ARP is disabled by hippi_neigh_setup_dev.
+	 */
+	dev->flags = 0;
+}
+
+/**
+ * alloc_hippi_dev - Allocate a HIPPI device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ *	for this HIPPI device
+ *
+ * Fill in the fields of the device structure with HIPPI-generic values
+ * (done by hippi_setup()). The device is named from the "hip%d" template.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv.  A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ *
+ * Returns whatever alloc_netdev() returns; the device is allocated only,
+ * it is not registered here.
+ */
+
+struct net_device *alloc_hippi_dev(int sizeof_priv)
+{
+	return alloc_netdev(sizeof_priv, "hip%d", hippi_setup);
+}
+
+EXPORT_SYMBOL(alloc_hippi_dev);
diff --git a/net/802/p8022.c b/net/802/p8022.c
new file mode 100644
index 00000000..7f353c4f
--- /dev/null
+++ b/net/802/p8022.c
@@ -0,0 +1,67 @@
+/*
+ *	NET3:	Support for 802.2 demultiplexing off Ethernet (Token ring
+ *		is kept separate see p8022tr.c)
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *		Demultiplex 802.2 encoded protocols. We match the entry by the
+ *		SSAP/DSAP pair and then deliver to the registered datalink that
+ *		matches. The control byte is ignored and handling of such items
+ *		is up to the routine passed the frame.
+ *
+ *		Unlike the 802.3 datalink we have a list of 802.2 entries as
+ *		there are multiple protocols to demux. The list is currently
+ *		short (3 or 4 entries at most). The current demux assumes this.
+ */
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <net/llc.h>
+#include <net/p8022.h>
+
+/*
+ * Transmit hook of an 802.2 client: hands @skb to the LLC layer as a UI
+ * frame to @dest, using the client's own SAP value as the remote SAP as
+ * well. The result of the LLC call is not checked; always returns 0.
+ */
+static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
+			 unsigned char *dest)
+{
+	llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
+	return 0;
+}
+
+/*
+ * Create an 802.2 client for SAP @type whose incoming frames are handed
+ * to @func. Returns the new datalink_proto, or NULL if the allocation or
+ * opening the LLC SAP fails. Release with unregister_8022_client().
+ */
+struct datalink_proto *register_8022_client(unsigned char type,
+					    int (*func)(struct sk_buff *skb,
+							struct net_device *dev,
+							struct packet_type *pt,
+							struct net_device *orig_dev))
+{
+	struct datalink_proto *client;
+
+	client = kmalloc(sizeof(*client), GFP_ATOMIC);
+	if (!client)
+		return NULL;
+
+	client->type[0]		= type;
+	client->header_length	= 3;
+	client->request		= p8022_request;
+
+	client->sap = llc_sap_open(type, func);
+	if (client->sap)
+		return client;
+
+	/* SAP could not be opened: nothing to keep. */
+	kfree(client);
+	return NULL;
+}
+
+/*
+ * Undo register_8022_client(): drop the reference on the client's LLC SAP
+ * and free the client. @proto must not be NULL.
+ */
+void unregister_8022_client(struct datalink_proto *proto)
+{
+	llc_sap_put(proto->sap);
+	kfree(proto);
+}
+
+EXPORT_SYMBOL(register_8022_client);
+EXPORT_SYMBOL(unregister_8022_client);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/p8023.c b/net/802/p8023.c
new file mode 100644
index 00000000..1256a40d
--- /dev/null
+++ b/net/802/p8023.c
@@ -0,0 +1,64 @@
+/*
+ *	NET3:	802.3 data link hooks used for IPX 802.3
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	802.3 isn't really a protocol data link layer. Some old IPX stuff
+ *	uses it however. Note that there is only one 802.3 protocol layer
+ *	in the system. We don't currently support different protocols
+ *	running raw 802.3 on different devices. Thankfully nobody else
+ *	has done anything like the old IPX.
+ */
+
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/datalink.h>
+#include <net/p8022.h>
+
+/*
+ *	Place an 802.3 header on a packet. The driver will do the mac
+ *	addresses, we just need to give it the buffer length.
+ *
+ *	The header is built on skb->dev with no explicit source address;
+ *	the return value of dev_hard_header() is not checked. Returns the
+ *	result of dev_queue_xmit().
+ */
+static int p8023_request(struct datalink_proto *dl,
+			 struct sk_buff *skb, unsigned char *dest_node)
+{
+	struct net_device *dev = skb->dev;
+
+	dev_hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len);
+	return dev_queue_xmit(skb);
+}
+
+/*
+ *	Create an 802.3 client. Note there can be only one 802.3 client.
+ *	Only the header length (0) and the transmit hook are filled in.
+ *	Returns NULL when the allocation fails.
+ */
+struct datalink_proto *make_8023_client(void)
+{
+	struct datalink_proto *client;
+
+	client = kmalloc(sizeof(*client), GFP_ATOMIC);
+	if (!client)
+		return NULL;
+
+	client->request	      = p8023_request;
+	client->header_length = 0;
+	return client;
+}
+
+/*
+ *	Destroy the 802.3 client created by make_8023_client().
+ *	Passing NULL is harmless since kfree() accepts it.
+ */
+void destroy_8023_client(struct datalink_proto *dl)
+{
+	kfree(dl);
+}
+
+EXPORT_SYMBOL(destroy_8023_client);
+EXPORT_SYMBOL(make_8023_client);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/psnap.c b/net/802/psnap.c
new file mode 100644
index 00000000..db6baf7c
--- /dev/null
+++ b/net/802/psnap.c
@@ -0,0 +1,167 @@
+/*
+ *	SNAP data link layer. Derived from 802.2
+ *
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>,
+ *		from the 802.2 layer by Greg Page.
+ *		Merged in additions from Greg Page's psnap.c.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <net/llc.h>
+#include <net/psnap.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+
+/* Registered SNAP clients; traversed with the RCU list iterator, modified under snap_lock. */
+static LIST_HEAD(snap_list);
+/* Serialises additions to and removals from snap_list. */
+static DEFINE_SPINLOCK(snap_lock);
+/* LLC SAP 0xAA, opened in snap_init() and used for all SNAP traffic. */
+static struct llc_sap *snap_sap;
+
+/*
+ *	Look up the SNAP client whose 5-byte protocol descriptor equals
+ *	@desc. Returns the first match on snap_list or NULL if none.
+ */
+static struct datalink_proto *find_snap_client(const unsigned char *desc)
+{
+	struct datalink_proto *cur;
+
+	list_for_each_entry_rcu(cur, &snap_list, node) {
+		if (memcmp(cur->type, desc, 5) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ *	A SNAP packet has arrived.
+ *
+ *	The 5-byte SNAP descriptor at the transport header selects the
+ *	client. On a match the descriptor is pulled off and the client's
+ *	receive function decides the return value. Frames that are too
+ *	short or have no registered client are freed and 1 is returned.
+ */
+static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev)
+{
+	static struct packet_type snap_packet_type = {
+		.type = cpu_to_be16(ETH_P_SNAP),
+	};
+	struct datalink_proto *client;
+	int ret;
+
+	if (unlikely(!pskb_may_pull(skb, 5))) {
+		kfree_skb(skb);
+		return 1;
+	}
+
+	rcu_read_lock();
+	client = find_snap_client(skb_transport_header(skb));
+	if (client) {
+		/* Pass the frame on. */
+		skb->transport_header += 5;
+		skb_pull_rcsum(skb, 5);
+		ret = client->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
+		rcu_read_unlock();
+		return ret;
+	}
+	rcu_read_unlock();
+
+	/* Nobody registered for this descriptor. */
+	kfree_skb(skb);
+	return 1;
+}
+
+/*
+ *	Put a SNAP header on a frame and pass to 802.2
+ *
+ *	The client's 5-byte descriptor is pushed in front of the payload
+ *	and the frame is sent as an LLC UI frame through snap_sap. The
+ *	result of the LLC call is not checked; always returns 0.
+ */
+static int snap_request(struct datalink_proto *dl,
+			struct sk_buff *skb, u8 *dest)
+{
+	memcpy(skb_push(skb, 5), dl->type, 5);
+	llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap);
+	return 0;
+}
+
+/*
+ *	Set up the SNAP layer
+ */
+EXPORT_SYMBOL(register_snap_client);
+EXPORT_SYMBOL(unregister_snap_client);
+
+static const char snap_err_msg[] __initconst =
+	KERN_CRIT "SNAP - unable to register with 802.2\n";
+
+/*
+ * Module init: open LLC SAP 0xAA with snap_rcv() as its receive handler.
+ * Returns -EBUSY (after logging snap_err_msg) if the SAP cannot be opened.
+ */
+static int __init snap_init(void)
+{
+	snap_sap = llc_sap_open(0xAA, snap_rcv);
+	if (!snap_sap) {
+		printk(snap_err_msg);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+module_init(snap_init);
+
+/* Module exit: drop the reference on the SNAP SAP taken in snap_init(). */
+static void __exit snap_exit(void)
+{
+	llc_sap_put(snap_sap);
+}
+
+module_exit(snap_exit);
+
+
+/*
+ *	Register SNAP clients. We don't yet use this for IP.
+ *
+ *	@desc is the 5-byte SNAP descriptor, @rcvfunc the handler for
+ *	matching frames. Returns the new client, or NULL when a client
+ *	with the same descriptor already exists or the allocation fails.
+ */
+struct datalink_proto *register_snap_client(const unsigned char *desc,
+					    int (*rcvfunc)(struct sk_buff *,
+							   struct net_device *,
+							   struct packet_type *,
+							   struct net_device *))
+{
+	struct datalink_proto *client = NULL;
+
+	spin_lock_bh(&snap_lock);
+
+	/* Lookup and insertion happen under one lock hold to keep descriptors unique. */
+	if (!find_snap_client(desc)) {
+		client = kmalloc(sizeof(*client), GFP_ATOMIC);
+		if (client) {
+			memcpy(client->type, desc, 5);
+			client->rcvfunc		= rcvfunc;
+			client->header_length	= 5 + 3; /* snap + 802.2 */
+			client->request		= snap_request;
+			list_add_rcu(&client->node, &snap_list);
+		}
+	}
+
+	spin_unlock_bh(&snap_lock);
+
+	return client;
+}
+
+/*
+ *	Unregister SNAP clients. Protocols no longer want to play with us ...
+ *
+ *	The client is unlinked under snap_lock and freed only after
+ *	synchronize_net() has returned, so receivers that found it through
+ *	the RCU list walk have finished with it.
+ */
+void unregister_snap_client(struct datalink_proto *proto)
+{
+	spin_lock_bh(&snap_lock);
+	list_del_rcu(&proto->node);
+	spin_unlock_bh(&snap_lock);
+
+	synchronize_net();
+
+	kfree(proto);
+}
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/stp.c b/net/802/stp.c
new file mode 100644
index 00000000..978c30b1
--- /dev/null
+++ b/net/802/stp.c
@@ -0,0 +1,103 @@
+/*
+ *	STP SAP demux
+ *
+ *	Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 as published by the Free Software Foundation.
+ */
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/stp.h>
+
+/* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */
+#define GARP_ADDR_MIN	0x20
+#define GARP_ADDR_MAX	0x2F
+#define GARP_ADDR_RANGE	(GARP_ADDR_MAX - GARP_ADDR_MIN)
+
+/* Handlers for the GARP addresses, indexed by (last address byte - GARP_ADDR_MIN). */
+static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
+/* Handler for frames whose last destination byte is outside the GARP range. */
+static const struct stp_proto __rcu *stp_proto __read_mostly;
+
+/* LLC SAP shared by all handlers; opened when the first one registers. */
+static struct llc_sap *sap __read_mostly;
+/* Registration count; the SAP is released when it drops back to zero. */
+static unsigned int sap_registered;
+/* Serialises stp_proto_register() and stp_proto_unregister(). */
+static DEFINE_MUTEX(stp_proto_mutex);
+
+/*
+ * Called under rcu_read_lock from LLC
+ *
+ * Demultiplex a frame received on the bridge spanning tree SAP. Only
+ * unnumbered PDUs with both SSAP and DSAP set to LLC_SAP_BSPAN are
+ * accepted. The last byte of the destination MAC selects the handler:
+ * a GARP slot if it lies in the GARP range, the STP handler otherwise.
+ * Frames without a handler are freed. Always returns 0.
+ */
+static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev)
+{
+	const struct ethhdr *eh = eth_hdr(skb);
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+	const struct stp_proto *proto;
+
+	if (pdu->ssap != LLC_SAP_BSPAN ||
+	    pdu->dsap != LLC_SAP_BSPAN ||
+	    pdu->ctrl_1 != LLC_PDU_TYPE_U)
+		goto err;
+
+	if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
+		proto = rcu_dereference(garp_protos[eh->h_dest[5] -
+						    GARP_ADDR_MIN]);
+		/*
+		 * The slot was picked from the last byte only; drop the
+		 * frame unless the whole address matches the handler's.
+		 */
+		if (proto &&
+		    compare_ether_addr(eh->h_dest, proto->group_address))
+			goto err;
+	} else
+		proto = rcu_dereference(stp_proto);
+
+	if (!proto)
+		goto err;
+
+	/* The handler takes over the skb from here. */
+	proto->rcv(proto, skb, dev);
+	return 0;
+
+err:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Register @proto as the receive handler for its group address.
+ *
+ * A handler with an all-zero group address becomes the STP handler; any
+ * other handler is stored in the GARP slot selected by the last byte of
+ * its group address, which must lie in GARP_ADDR_MIN..GARP_ADDR_MAX.
+ * The shared LLC SAP is opened when the first handler registers.
+ *
+ * Returns 0 on success or -ENOMEM if the SAP cannot be opened. In the
+ * failure case nothing is registered and the registration count is left
+ * unchanged, so a later call retries opening the SAP.
+ */
+int stp_proto_register(const struct stp_proto *proto)
+{
+	int err = 0;
+
+	mutex_lock(&stp_proto_mutex);
+	if (sap_registered == 0) {
+		sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv);
+		if (!sap) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+	/* Count the registration only once the SAP is known to exist. */
+	sap_registered++;
+
+	if (is_zero_ether_addr(proto->group_address))
+		rcu_assign_pointer(stp_proto, proto);
+	else
+		rcu_assign_pointer(garp_protos[proto->group_address[5] -
+					       GARP_ADDR_MIN], proto);
+out:
+	mutex_unlock(&stp_proto_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(stp_proto_register);
+
+/*
+ * Remove the handler registered for @proto's group address.
+ *
+ * The slot is cleared and synchronize_rcu() waits for receivers still
+ * using it. When the last registration goes away the reference on the
+ * shared LLC SAP is dropped.
+ */
+void stp_proto_unregister(const struct stp_proto *proto)
+{
+	mutex_lock(&stp_proto_mutex);
+	if (is_zero_ether_addr(proto->group_address))
+		rcu_assign_pointer(stp_proto, NULL);
+	else
+		rcu_assign_pointer(garp_protos[proto->group_address[5] -
+					       GARP_ADDR_MIN], NULL);
+	synchronize_rcu();
+
+	if (--sap_registered == 0)
+		llc_sap_put(sap);
+	mutex_unlock(&stp_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(stp_proto_unregister);
+
+MODULE_LICENSE("GPL");
diff --git a/net/802/tr.c b/net/802/tr.c
new file mode 100644
index 00000000..5e20cf8a
--- /dev/null
+++ b/net/802/tr.c
@@ -0,0 +1,677 @@
+/*
+ * NET3:	Token ring device handling subroutines
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Fixes:       3 Feb 97 Paul Norton <pnorton@cts.com> Minor routing fixes.
+ *              Added rif table to /proc/net/tr_rif and rif timeout to
+ *              /proc/sys/net/token-ring/rif_timeout.
+ *              22 Jun 98 Paul Norton <p.norton@computer.org> Rearranged
+ *              tr_header and tr_type_trans to handle passing IPX SNAP and
+ *              802.2 through the correct layers. Eliminated tr_reformat.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <net/net_namespace.h>
+
+static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev);
+static void rif_check_expire(unsigned long dummy);
+
+#define TR_SR_DEBUG 0
+
+/*
+ *	Each RIF entry we learn is kept this way
+ */
+
+struct rif_cache {
+	unsigned char addr[TR_ALEN];	/* station address, the lookup key */
+	int iface;			/* ifindex of the device it was learned on */
+	__be16 rcf;			/* routing control field as received */
+	__be16 rseg[8];			/* route segments copied from the frame */
+	struct rif_cache *next;		/* next entry in the same hash bucket */
+	unsigned long last_used;	/* jiffies of last use, drives expiry */
+	unsigned char local_ring;	/* set when no route was stored for the station */
+};
+
+#define RIF_TABLE_SIZE 32
+
+/*
+ *	We hash the RIF cache 32 ways. We do after all have to look it
+ *	up a lot.
+ */
+
+static struct rif_cache *rif_table[RIF_TABLE_SIZE];
+
+static DEFINE_SPINLOCK(rif_lock);
+
+
+/*
+ *	Garbage disposal timer.
+ */
+
+static struct timer_list rif_timer;
+
+static int sysctl_tr_rif_timeout = 60*10*HZ;
+
+/*
+ * Hash a 6-byte station address into a rif_table bucket index: the bytes
+ * are folded together with shift-by-2 and XOR, the second byte of the
+ * result is mixed into the first and the value is masked to the table
+ * size.
+ */
+static inline unsigned long rif_hash(const unsigned char *addr)
+{
+	unsigned long h = addr[0];
+	int i;
+
+	for (i = 1; i < 6; i++)
+		h = (h << 2) ^ addr[i];
+
+	h ^= h >> 8;
+
+	return h & (RIF_TABLE_SIZE - 1);
+}
+
+/*
+ *	Put the headers on a token ring packet. Token ring source routing
+ *	makes this a little more exciting than on ethernet.
+ *
+ *	For IP, IPv6 and ARP an LLC/SNAP header is added behind the token
+ *	ring header. saddr=NULL means use the device address. The len
+ *	argument is not used.
+ *
+ *	Returns the header length when a destination was supplied (the
+ *	frame is then source routed as well), the negated header length
+ *	when daddr is NULL.
+ */
+
+static int tr_header(struct sk_buff *skb, struct net_device *dev,
+		     unsigned short type,
+		     const void *daddr, const void *saddr, unsigned len)
+{
+	struct trh_hdr *trh;
+	int hdr_len;
+
+	/*
+	 * Add the 802.2 SNAP header if IP as the IPv4/IPv6 code calls
+	 * dev->hard_header directly.
+	 */
+	if (type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP)
+	{
+		struct trllc *trllc;
+
+		hdr_len = sizeof(struct trh_hdr) + sizeof(struct trllc);
+		trh = (struct trh_hdr *)skb_push(skb, hdr_len);
+		/* The LLC/SNAP header sits directly behind the full-size TR header. */
+		trllc = (struct trllc *)(trh+1);
+		trllc->dsap = trllc->ssap = EXTENDED_SAP;
+		trllc->llc = UI_CMD;
+		trllc->protid[0] = trllc->protid[1] = trllc->protid[2] = 0x00;
+		trllc->ethertype = htons(type);
+	}
+	else
+	{
+		hdr_len = sizeof(struct trh_hdr);
+		trh = (struct trh_hdr *)skb_push(skb, hdr_len);
+	}
+
+	trh->ac=AC;
+	trh->fc=LLC_FRAME;
+
+	if(saddr)
+		memcpy(trh->saddr,saddr,dev->addr_len);
+	else
+		memcpy(trh->saddr,dev->dev_addr,dev->addr_len);
+
+	/*
+	 *	Build the destination and then source route the frame
+	 */
+
+	if(daddr)
+	{
+		memcpy(trh->daddr,daddr,dev->addr_len);
+		tr_source_route(skb, trh, dev);
+		return hdr_len;
+	}
+
+	/* Destination unknown: the negative length tells the caller so. */
+	return -hdr_len;
+}
+
+/*
+ *	A neighbour discovery of some species (eg arp) has completed. We
+ *	can now send the packet.
+ *
+ *	Returns 1 when arp_find() returns non-zero. Returns 0 once the
+ *	frame has been source routed, and also (after logging) for any
+ *	ethertype other than IP. Without CONFIG_INET the ARP lookup is
+ *	compiled out and the frame is always source routed.
+ */
+
+static int tr_rebuild_header(struct sk_buff *skb)
+{
+	struct trh_hdr *trh=(struct trh_hdr *)skb->data;
+	struct trllc *trllc=(struct trllc *)(skb->data+sizeof(struct trh_hdr));
+	struct net_device *dev = skb->dev;
+
+	/*
+	 *	FIXME: We don't yet support IPv6 over token rings
+	 */
+
+	if(trllc->ethertype != htons(ETH_P_IP)) {
+		printk("tr_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(trllc->ethertype));
+		return 0;
+	}
+
+#ifdef CONFIG_INET
+	if(arp_find(trh->daddr, skb)) {
+			return 1;
+	}
+	else
+#endif
+	{
+		tr_source_route(skb,trh,dev);
+		return 0;
+	}
+}
+
+/*
+ *	Classify a received frame: set skb->dev and skb->pkt_type, strip
+ *	the token ring header and return the protocol ID.
+ *
+ *	Some of this is a bit hackish. We intercept RIF information
+ *	used for source routing. We also grab IP directly and don't feed
+ *	it via SNAP: for IP, IPv6 and ARP the LLC/SNAP header is stripped
+ *	and the ethertype is returned, everything else is reported as
+ *	ETH_P_TR_802_2.
+ */
+
+__be16 tr_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+
+	struct trh_hdr *trh;
+	struct trllc *trllc;
+	unsigned riflen=0;
+
+	skb->dev = dev;
+	skb_reset_mac_header(skb);
+	trh = tr_hdr(skb);
+
+	/* A RIF is only present when the routing indicator bit is set. */
+	if(trh->saddr[0] & TR_RII)
+		riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8;
+
+	/* On the wire the header only carries riflen bytes of RIF. */
+	trllc = (struct trllc *)(skb->data+sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen);
+
+	skb_pull(skb,sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen);
+
+	if(*trh->daddr & 0x80)
+	{
+		if(!memcmp(trh->daddr,dev->broadcast,TR_ALEN))
+			skb->pkt_type=PACKET_BROADCAST;
+		else
+			skb->pkt_type=PACKET_MULTICAST;
+	}
+	/*
+	 * Destination starting with 01:00:5E. The previous test masked the
+	 * second byte with 0x00, which is always false, so this branch
+	 * could never be taken; compare the three prefix bytes exactly.
+	 */
+	else if (trh->daddr[0] == 0x01 && trh->daddr[1] == 0x00 &&
+		 trh->daddr[2] == 0x5E)
+	{
+		skb->pkt_type=PACKET_MULTICAST;
+	}
+	else if(dev->flags & IFF_PROMISC)
+	{
+		if(memcmp(trh->daddr, dev->dev_addr, TR_ALEN))
+			skb->pkt_type=PACKET_OTHERHOST;
+	}
+
+	/* Learn routes only from frames that are neither broadcast nor multicast. */
+	if ((skb->pkt_type != PACKET_BROADCAST) &&
+	    (skb->pkt_type != PACKET_MULTICAST))
+		tr_add_rif_info(trh,dev) ;
+
+	/*
+	 * Strip the SNAP header from ARP packets since we don't
+	 * pass them through to the 802.2/SNAP layers.
+	 */
+
+	if (trllc->dsap == EXTENDED_SAP &&
+	    (trllc->ethertype == htons(ETH_P_IP) ||
+	     trllc->ethertype == htons(ETH_P_IPV6) ||
+	     trllc->ethertype == htons(ETH_P_ARP)))
+	{
+		skb_pull(skb, sizeof(struct trllc));
+		return trllc->ethertype;
+	}
+
+	return htons(ETH_P_TR_802_2);
+}
+
+/*
+ *	We try to do source routing...
+ *
+ *	Fill in the routing control field and route segments of an
+ *	outgoing frame from the RIF cache. Frames to the device broadcast
+ *	address, to mcast_func_addr, or to a station with no cache entry
+ *	go out as limited broadcasts. Afterwards the unused part of the
+ *	RIF area is removed from the front of the header.
+ */
+
+void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,
+		     struct net_device *dev)
+{
+	int slack;
+	unsigned int hash;
+	struct rif_cache *entry;
+	unsigned char *olddata;
+	unsigned long flags;
+	static const unsigned char mcast_func_addr[]
+		= {0xC0,0x00,0x00,0x04,0x00,0x00};
+
+	spin_lock_irqsave(&rif_lock, flags);
+
+	/*
+	 *	Broadcasts are single route as stated in RFC 1042
+	 */
+	if( (!memcmp(&(trh->daddr[0]),&(dev->broadcast[0]),TR_ALEN)) ||
+	    (!memcmp(&(trh->daddr[0]),&(mcast_func_addr[0]), TR_ALEN))  )
+	{
+		trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK)
+			       | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST);
+		trh->saddr[0]|=TR_RII;
+	}
+	else
+	{
+		hash = rif_hash(trh->daddr);
+		/*
+		 *	Walk the hash table and look for an entry
+		 */
+		for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->daddr[0]),TR_ALEN);entry=entry->next);
+
+		/*
+		 *	If we found an entry we can route the frame.
+		 */
+		if(entry)
+		{
+#if TR_SR_DEBUG
+printk("source routing for %pM\n", trh->daddr);
+#endif
+			/* Only entries that are not local and carry a non-zero RIF length get a route. */
+			if(!entry->local_ring && (ntohs(entry->rcf) & TR_RCF_LEN_MASK) >> 8)
+			{
+				trh->rcf=entry->rcf;
+				memcpy(&trh->rseg[0],&entry->rseg[0],8*sizeof(unsigned short));
+				/* Flip the direction bit of the learned route, then mask off the top three rcf bits. */
+				trh->rcf^=htons(TR_RCF_DIR_BIT);
+				trh->rcf&=htons(0x1fff);	/* Issam Chehab <ichehab@madge1.demon.co.uk> */
+
+				trh->saddr[0]|=TR_RII;
+#if TR_SR_DEBUG
+				printk("entry found with rcf %04x\n", entry->rcf);
+			}
+			else
+			{
+				printk("entry found but without rcf length, local=%02x\n", entry->local_ring);
+#endif
+			}
+			entry->last_used=jiffies;
+		}
+		else
+		{
+			/*
+			 *	Without the information we simply have to shout
+			 *	on the wire. The replies should rapidly clean this
+			 *	situation up.
+			 */
+			trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK)
+				       | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST);
+			trh->saddr[0]|=TR_RII;
+#if TR_SR_DEBUG
+			printk("no entry in rif table found - broadcasting frame\n");
+#endif
+		}
+	}
+
+	/* Compress the RIF here so we don't have to do it in the driver(s) */
+	/* slack = unused RIF bytes: all 18 without a RIF, else 18 minus the RIF length. */
+	if (!(trh->saddr[0] & 0x80))
+		slack = 18;
+	else
+		slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8);
+	olddata = skb->data;
+	spin_unlock_irqrestore(&rif_lock, flags);
+
+	/* Drop the slack from the front and slide the used header bytes up against the payload. */
+	skb_pull(skb, slack);
+	memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack);
+}
+
+/*
+ *	We have learned some new RIF information for our source
+ *	routing.
+ *
+ *	Looks up the frame's source address in the RIF cache (with the
+ *	routing indicator bit masked off) and either creates an entry or
+ *	refreshes the existing one. The whole operation runs under
+ *	rif_lock; trh->saddr[0] is restored before returning.
+ */
+
+static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
+{
+	unsigned int hash, rii_p = 0;
+	unsigned long flags;
+	struct rif_cache *entry;
+	unsigned char saddr0;
+
+	spin_lock_irqsave(&rif_lock, flags);
+	saddr0 = trh->saddr[0];
+
+	/*
+	 *	Firstly see if the entry exists
+	 */
+
+	if(trh->saddr[0] & TR_RII)
+	{
+		trh->saddr[0]&=0x7f;
+		/* A RIF longer than the 2-byte control field carries route segments. */
+		if (((ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8) > 2)
+		{
+			rii_p = 1;
+		}
+	}
+
+	hash = rif_hash(trh->saddr);
+	for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN);entry=entry->next);
+
+	if(entry==NULL)
+	{
+#if TR_SR_DEBUG
+		printk("adding rif_entry: addr:%pM rcf:%04X\n",
+		       trh->saddr, ntohs(trh->rcf));
+#endif
+		/*
+		 *	Allocate our new entry. A failure to allocate just
+		 *	loses the information. This is harmless.
+		 *
+		 *	FIXME: We ought to keep some kind of cache size
+		 *	limiting and adjust the timers to suit.
+		 */
+		entry=kmalloc(sizeof(struct rif_cache),GFP_ATOMIC);
+
+		if(!entry)
+		{
+			printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n");
+			spin_unlock_irqrestore(&rif_lock, flags);
+			return;
+		}
+
+		/* Link the new entry at the head of its hash bucket. */
+		memcpy(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN);
+		entry->iface = dev->ifindex;
+		entry->next=rif_table[hash];
+		entry->last_used=jiffies;
+		rif_table[hash]=entry;
+
+		if (rii_p)
+		{
+			entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK);
+			memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short));
+			entry->local_ring = 0;
+		}
+		else
+		{
+			/* No route segments: rcf and rseg are left unset for local entries. */
+			entry->local_ring = 1;
+		}
+	}
+	else	/* Y. Tahara added */
+	{
+		/*
+		 *	Update existing entries
+		 */
+		/* Only replace the stored route when the frame is not a broadcast and its rcf differs. */
+		if (!entry->local_ring)
+		    if (entry->rcf != (trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK)) &&
+			 !(trh->rcf & htons(TR_RCF_BROADCAST_MASK)))
+		    {
+#if TR_SR_DEBUG
+printk("updating rif_entry: addr:%pM rcf:%04X\n",
+		trh->saddr, ntohs(trh->rcf));
+#endif
+			    entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK);
+			    memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short));
+		    }
+		entry->last_used=jiffies;
+	}
+	trh->saddr[0]=saddr0; /* put the routing indicator back for tcpdump */
+	spin_unlock_irqrestore(&rif_lock, flags);
+}
+
+/*
+ *	Scan the cache with a timer and see what we need to throw out.
+ *
+ *	Entries not used for sysctl_tr_rif_timeout jiffies are unlinked and
+ *	freed. The timer is then re-armed for the earliest remaining expiry,
+ *	but no later than half a timeout from now. The argument is unused.
+ */
+
+static void rif_check_expire(unsigned long dummy)
+{
+	int i;
+	unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2;
+
+	spin_lock_irqsave(&rif_lock, flags);
+
+	for(i =0; i < RIF_TABLE_SIZE; i++) {
+		struct rif_cache *entry, **pentry;
+
+		/* pentry tracks the link pointing at the current entry so it can be unlinked in place. */
+		pentry = rif_table+i;
+		while((entry=*pentry) != NULL) {
+			unsigned long expires
+				= entry->last_used + sysctl_tr_rif_timeout;
+
+			if (time_before_eq(expires, jiffies)) {
+				*pentry = entry->next;
+				kfree(entry);
+			} else {
+				pentry = &entry->next;
+
+				if (time_before(expires, next_interval))
+					next_interval = expires;
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&rif_lock, flags);
+
+	mod_timer(&rif_timer, next_interval);
+
+}
+
+/*
+ *	Generate the /proc/net information for the token ring RIF
+ *	routing.
+ */
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Return the @pos'th cache entry (0-based), counting bucket by bucket and
+ * along each chain, or NULL if the table holds fewer entries.
+ */
+static struct rif_cache *rif_get_idx(loff_t pos)
+{
+	struct rif_cache *cur;
+	loff_t seen = 0;
+	int bucket;
+
+	for (bucket = 0; bucket < RIF_TABLE_SIZE; bucket++) {
+		for (cur = rif_table[bucket]; cur; cur = cur->next) {
+			if (seen++ == pos)
+				return cur;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file start: takes rif_lock with interrupts disabled (released in
+ * rif_seq_stop()). Position 0 yields the header token, position n the
+ * (n-1)'th cache entry.
+ */
+static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(&rif_lock)
+{
+	spin_lock_irq(&rif_lock);
+
+	return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file next: advance *pos and return the entry following @v. After
+ * the header token the scan starts at bucket 0. Otherwise the rest of the
+ * current chain is used first, then the next non-empty bucket. Returns
+ * NULL at the end of the table.
+ */
+static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct rif_cache *cur = v;
+	int bucket;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN) {
+		bucket = -1;
+	} else {
+		if (cur->next)
+			return cur->next;
+		/* Chain exhausted: continue behind the entry's own bucket. */
+		bucket = rif_hash(cur->addr);
+	}
+
+	for (++bucket; bucket < RIF_TABLE_SIZE; ++bucket) {
+		cur = rif_table[bucket];
+		if (cur != NULL)
+			return cur;
+	}
+	return NULL;
+}
+
+/* seq_file stop: release rif_lock taken in rif_seq_start(). */
+static void rif_seq_stop(struct seq_file *seq, void *v)
+	__releases(&rif_lock)
+{
+	spin_unlock_irq(&rif_lock);
+}
+
+/*
+ * seq_file show: print the column header for the start token, otherwise
+ * one line for the cache entry @v: device name ("?" if the ifindex no
+ * longer resolves), station address, remaining lifetime in seconds and
+ * either "local" or the routing control field followed by the route
+ * segments. Always returns 0.
+ */
+static int rif_seq_show(struct seq_file *seq, void *v)
+{
+	int j, rcf_len, segment, brdgnmb;
+	struct rif_cache *entry = v;
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+		     "if     TR address       TTL   rcf   routing segments\n");
+	else {
+		struct net_device *dev = dev_get_by_index(&init_net, entry->iface);
+		long ttl = (long) (entry->last_used + sysctl_tr_rif_timeout)
+				- (long) jiffies;
+
+		seq_printf(seq, "%s %pM %7li ",
+			   dev?dev->name:"?",
+			   entry->addr,
+			   ttl/HZ);
+
+			if (entry->local_ring)
+				seq_puts(seq, "local\n");
+			else {
+
+				seq_printf(seq, "%04X", ntohs(entry->rcf));
+				/* RIF length minus the 2-byte control field, in 2-byte segments. */
+				rcf_len = ((ntohs(entry->rcf) & TR_RCF_LEN_MASK)>>8)-2;
+				if (rcf_len)
+					rcf_len >>= 1;
+				/*
+				 * The length was taken from a received frame and
+				 * can describe more segments than rseg[] holds;
+				 * never index past the end of the array.
+				 */
+				if (rcf_len > (int)ARRAY_SIZE(entry->rseg))
+					rcf_len = ARRAY_SIZE(entry->rseg);
+				for(j = 1; j < rcf_len; j++) {
+					if(j==1) {
+						segment=ntohs(entry->rseg[j-1])>>4;
+						seq_printf(seq,"  %03X",segment);
+					}
+
+					segment=ntohs(entry->rseg[j])>>4;
+					brdgnmb=ntohs(entry->rseg[j-1])&0x00f;
+					seq_printf(seq,"-%01X-%03X",brdgnmb,segment);
+				}
+				seq_putc(seq, '\n');
+			}
+
+		/* Drop the reference taken by dev_get_by_index(). */
+		if (dev)
+			dev_put(dev);
+		}
+	return 0;
+}
+
+
+/* seq_file iterator over the RIF cache; rif_lock is held from start to stop. */
+static const struct seq_operations rif_seq_ops = {
+	.start = rif_seq_start,
+	.next  = rif_seq_next,
+	.stop  = rif_seq_stop,
+	.show  = rif_seq_show,
+};
+
+/* open() handler of the proc file: attach the RIF cache iterator to @file. */
+static int rif_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rif_seq_ops);
+}
+
+/* File operations of the "tr_rif" proc entry created in rif_init(). */
+static const struct file_operations rif_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = rif_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+#endif
+
+/* Link-layer header callbacks installed on every token ring device by tr_setup(). */
+static const struct header_ops tr_header_ops = {
+	.create = tr_header,
+	.rebuild= tr_rebuild_header,
+};
+
+/*
+ * Fill in the token-ring-generic defaults of a freshly allocated
+ * net_device. Passed to alloc_netdev() as the setup callback by
+ * alloc_trdev().
+ */
+static void tr_setup(struct net_device *dev)
+{
+	/*
+	 *	Configure and register
+	 */
+
+	dev->header_ops	= &tr_header_ops;
+
+	dev->type		= ARPHRD_IEEE802_TR;
+	dev->hard_header_len	= TR_HLEN;
+	dev->mtu		= 2000;
+	dev->addr_len		= TR_ALEN;
+	dev->tx_queue_len	= 100;	/* Long queues on tr */
+
+	/* All-ones broadcast address. */
+	memset(dev->broadcast,0xFF, TR_ALEN);
+
+	/* New-style flags. */
+	dev->flags		= IFF_BROADCAST | IFF_MULTICAST ;
+}
+
+/**
+ * alloc_trdev - Allocate a token ring device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ *	for this token ring device
+ *
+ * Fill in the fields of the device structure with token ring-generic values
+ * (done by tr_setup()). The device is named from the "tr%d" template.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv.  A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ *
+ * Returns whatever alloc_netdev() returns; the device is allocated only,
+ * it is not registered here.
+ */
+struct net_device *alloc_trdev(int sizeof_priv)
+{
+	return alloc_netdev(sizeof_priv, "tr%d", tr_setup);
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Sysctl table with the single entry "rif_timeout", an int handled by
+ * proc_dointvec. The expiry code adds the value directly to jiffies, so
+ * it is expressed in jiffies rather than seconds.
+ */
+static struct ctl_table tr_table[] = {
+	{
+		.procname	= "rif_timeout",
+		.data		= &sysctl_tr_rif_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ },
+};
+
+/* Path the table is registered under in rif_init(): net / token-ring. */
+static __initdata struct ctl_path tr_path[] = {
+	{ .procname = "net", },
+	{ .procname = "token-ring", },
+	{ }
+};
+#endif
+
+/*
+ *	Called during bootup.  We don't actually have to initialise
+ *	too much for this.
+ *
+ *	Starts the RIF expiry timer, registers the sysctl table (when
+ *	CONFIG_SYSCTL is set) and creates the "tr_rif" proc entry.
+ *	The results of the sysctl and proc registrations are not checked;
+ *	always returns 0.
+ */
+
+static int __init rif_init(void)
+{
+	/* First expiry scan one full timeout from now. */
+	rif_timer.expires  = jiffies + sysctl_tr_rif_timeout;
+	setup_timer(&rif_timer, rif_check_expire, 0);
+	add_timer(&rif_timer);
+#ifdef CONFIG_SYSCTL
+	register_sysctl_paths(tr_path, tr_table);
+#endif
+	proc_net_fops_create(&init_net, "tr_rif", S_IRUGO, &rif_seq_fops);
+	return 0;
+}
+
+module_init(rif_init);
+
+EXPORT_SYMBOL(tr_type_trans);
+EXPORT_SYMBOL(alloc_trdev);
+
+MODULE_LICENSE("GPL");
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 00000000..fa073a54
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,29 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+	tristate "802.1Q VLAN Support"
+	---help---
+	  Select this and you will be able to create 802.1Q VLAN interfaces
+	  on your ethernet interfaces.  802.1Q VLAN supports almost
+	  everything a regular ethernet interface does, including
+	  firewalling, bridging, and of course IP traffic.  You will need
+	  the 'vconfig' tool from the VLAN project in order to effectively
+	  use VLANs.  See the VLAN web page for more information:
+	  <http://www.candelatech.com/~greear/vlan.html>
+
+	  To compile this code as a module, choose M here: the module
+	  will be called 8021q.
+
+	  If unsure, say N.
+
+config VLAN_8021Q_GVRP
+	bool "GVRP (GARP VLAN Registration Protocol) support"
+	depends on VLAN_8021Q
+	select GARP
+	help
+	  Select this to enable GVRP end-system support. GVRP is used for
+	  automatic propagation of registered VLANs to switches.
+
+	  If unsure, say N.
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
new file mode 100644
index 00000000..9f4f174e
--- /dev/null
+++ b/net/8021q/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Linux VLAN layer.
+#
+obj-$(subst m,y,$(CONFIG_VLAN_8021Q))	+= vlan_core.o
+obj-$(CONFIG_VLAN_8021Q)		+= 8021q.o
+
+8021q-y					:= vlan.o vlan_dev.o vlan_netlink.o
+8021q-$(CONFIG_VLAN_8021Q_GVRP)		+= vlan_gvrp.o
+8021q-$(CONFIG_PROC_FS)			+= vlanproc.o
+
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
new file mode 100644
index 00000000..917ecb93
--- /dev/null
+++ b/net/8021q/vlan.c
@@ -0,0 +1,722 @@
+/*
+ * INET		802.1Q VLAN
+ *		Ethernet-type device handling.
+ *
+ * Authors:	Ben Greear <greearb@candelatech.com>
+ *              Please send support related email to: netdev@vger.kernel.org
+ *              VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
+ *
+ * Fixes:
+ *              Fix for packet capture - Nick Eggleston <nick@dccinc.com>;
+ *		Add HW acceleration hooks - David S. Miller <davem@redhat.com>;
+ *		Correct all the locking - David S. Miller <davem@redhat.com>;
+ *		Use hash table for VLAN groups - David S. Miller <davem@redhat.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <net/p8022.h>
+#include <net/arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <asm/uaccess.h>
+
+#include <linux/if_vlan.h>
+#include "vlan.h"
+#include "vlanproc.h"
+
+#define DRV_VERSION "1.8"
+
+/* Global VLAN variables */
+
+int vlan_net_id __read_mostly;
+
+const char vlan_fullname[] = "802.1Q VLAN Support";
+const char vlan_version[] = DRV_VERSION;
+
+/* End of global variables definitions. */
+
+static void vlan_group_free(struct vlan_group *grp)
+{
+	int i;
+
+	for (i = 0; i < VLAN_GROUP_ARRAY_SPLIT_PARTS; i++)
+		kfree(grp->vlan_devices_arrays[i]);	/* NULL if never preallocated */
+	kfree(grp);	/* finally the group itself */
+}
+
+static struct vlan_group *vlan_group_alloc(struct net_device *real_dev)
+{
+	struct vlan_group *grp;
+
+	grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL);	/* all fields start zeroed */
+	if (!grp)
+		return NULL;
+
+	grp->real_dev = real_dev;	/* no reference is taken on real_dev here */
+	return grp;
+}
+
+static int vlan_group_prealloc_vid(struct vlan_group *vg, u16 vlan_id)
+{
+	struct net_device **array;
+	unsigned int size;
+
+	ASSERT_RTNL();
+
+	array = vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+	if (array != NULL)
+		return 0;	/* the part covering vlan_id already exists */
+
+	size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
+	array = kzalloc(size, GFP_KERNEL);	/* one zeroed part of device pointers */
+	if (array == NULL)
+		return -ENOBUFS;
+
+	vg->vlan_devices_arrays[vlan_id / VLAN_GROUP_ARRAY_PART_LEN] = array;
+	return 0;
+}
+
+static void vlan_rcu_free(struct rcu_head *rcu)
+{
+	vlan_group_free(container_of(rcu, struct vlan_group, rcu));	/* call_rcu() callback */
+}
+
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct net_device *real_dev = vlan->real_dev;
+	const struct net_device_ops *ops = real_dev->netdev_ops;
+	struct vlan_group *grp;
+	u16 vlan_id = vlan->vlan_id;
+
+	ASSERT_RTNL();
+
+	grp = rtnl_dereference(real_dev->vlgrp);
+	BUG_ON(!grp);
+
+	/* Take it out of our own structures, but be sure to interlock with
+	 * HW accelerating devices or SW vlan input packet processing if
+	 * VLAN is not 0 (leave it there for 802.1p).
+	 */
+	if (vlan_id && (real_dev->features & NETIF_F_HW_VLAN_FILTER))
+		ops->ndo_vlan_rx_kill_vid(real_dev, vlan_id);
+
+	grp->nr_vlans--;
+
+	if (vlan->flags & VLAN_FLAG_GVRP)
+		vlan_gvrp_request_leave(dev);
+
+	vlan_group_set_device(grp, vlan_id, NULL);
+	/* Because unregister_netdevice_queue() makes sure at least one RCU
+	 * grace period is respected before device freeing,
+	 * we don't need to call synchronize_net() here.
+	 */
+	unregister_netdevice_queue(dev, head);
+
+	/* If the group is now empty, kill off the group. */
+	if (grp->nr_vlans == 0) {
+		vlan_gvrp_uninit_applicant(real_dev);
+
+		rcu_assign_pointer(real_dev->vlgrp, NULL);
+		if (ops->ndo_vlan_rx_register)
+			ops->ndo_vlan_rx_register(real_dev, NULL);
+
+		/* Free the group via RCU, once all CPUs are done with it. */
+		call_rcu(&grp->rcu, vlan_rcu_free);
+	}
+
+	/* Get rid of the vlan's reference to real_dev */
+	dev_put(real_dev);
+}
+
+int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
+{
+	const char *name = real_dev->name;
+	const struct net_device_ops *ops = real_dev->netdev_ops;
+
+	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
+		pr_info("8021q: VLANs not supported on %s\n", name);
+		return -EOPNOTSUPP;
+	}
+
+	if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) &&
+	    (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) {
+		pr_info("8021q: Device %s has buggy VLAN hw accel\n", name);
+		return -EOPNOTSUPP;	/* filter flag set but an add/kill vid hook is missing */
+	}
+
+	if (vlan_find_dev(real_dev, vlan_id) != NULL)
+		return -EEXIST;	/* this VID already has a vlan device on real_dev */
+
+	return 0;
+}
+
+int register_vlan_dev(struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct net_device *real_dev = vlan->real_dev;
+	const struct net_device_ops *ops = real_dev->netdev_ops;
+	u16 vlan_id = vlan->vlan_id;
+	struct vlan_group *grp, *ngrp = NULL;
+	int err;
+
+	grp = rtnl_dereference(real_dev->vlgrp);
+	if (!grp) {
+		ngrp = grp = vlan_group_alloc(real_dev);	/* first vlan on real_dev */
+		if (!grp)
+			return -ENOBUFS;
+		err = vlan_gvrp_init_applicant(real_dev);
+		if (err < 0)
+			goto out_free_group;
+	}
+
+	err = vlan_group_prealloc_vid(grp, vlan_id);
+	if (err < 0)
+		goto out_uninit_applicant;
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto out_uninit_applicant;
+
+	/* Account for reference in struct vlan_dev_info */
+	dev_hold(real_dev);
+
+	netif_stacked_transfer_operstate(real_dev, dev);
+	linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
+
+	/* The device is registered; now record it in the group's
+	 * per-VID device table.
+	 */
+	vlan_group_set_device(grp, vlan_id, dev);
+	grp->nr_vlans++;
+
+	if (ngrp) {
+		if (ops->ndo_vlan_rx_register && (real_dev->features & NETIF_F_HW_VLAN_RX))
+			ops->ndo_vlan_rx_register(real_dev, ngrp);
+		rcu_assign_pointer(real_dev->vlgrp, ngrp);
+	}
+	if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
+		ops->ndo_vlan_rx_add_vid(real_dev, vlan_id);
+
+	return 0;
+
+out_uninit_applicant:
+	if (ngrp)
+		vlan_gvrp_uninit_applicant(real_dev);
+out_free_group:
+	if (ngrp) {
+		/* Free the newly allocated group via RCU. */
+		call_rcu(&ngrp->rcu, vlan_rcu_free);
+	}
+	return err;
+}
+
+/*  Create and register a VLAN device for @vlan_id on top of @real_dev,
+ *  named after the per-netns name_type.  Returns 0 or a negative error code.
+ */
+static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
+{
+	struct net_device *new_dev;
+	struct net *net = dev_net(real_dev);
+	struct vlan_net *vn = net_generic(net, vlan_net_id);
+	char name[IFNAMSIZ];
+	int err;
+
+	if (vlan_id >= VLAN_VID_MASK)
+		return -ERANGE;
+
+	err = vlan_check_real_dev(real_dev, vlan_id);
+	if (err < 0)
+		return err;
+
+	/* Gotta set up the fields for the device. */
+	switch (vn->name_type) {
+	case VLAN_NAME_TYPE_RAW_PLUS_VID:
+		/* name will look like:	 eth1.0005 */
+		snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id);
+		break;
+	case VLAN_NAME_TYPE_PLUS_VID_NO_PAD:
+		/* Put our vlan.VID in the name.
+		 * Name will look like:	 vlan5
+		 */
+		snprintf(name, IFNAMSIZ, "vlan%i", vlan_id);
+		break;
+	case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD:
+		/* Put our vlan.VID in the name.
+		 * Name will look like:	 eth0.5
+		 */
+		snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id);
+		break;
+	case VLAN_NAME_TYPE_PLUS_VID:
+		/* Put our vlan.VID in the name.
+		 * Name will look like:	 vlan0005 (shares the default branch)
+		 */
+	default:
+		snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id);
+	}
+
+	new_dev = alloc_netdev(sizeof(struct vlan_dev_info), name, vlan_setup);
+
+	if (new_dev == NULL)
+		return -ENOBUFS;
+
+	dev_net_set(new_dev, net);
+	/* need 4 bytes for extra VLAN header info,
+	 * hope the underlying device can handle it.
+	 */
+	new_dev->mtu = real_dev->mtu;
+
+	vlan_dev_info(new_dev)->vlan_id = vlan_id;
+	vlan_dev_info(new_dev)->real_dev = real_dev;
+	vlan_dev_info(new_dev)->dent = NULL;
+	vlan_dev_info(new_dev)->flags = VLAN_FLAG_REORDER_HDR;
+
+	new_dev->rtnl_link_ops = &vlan_link_ops;
+	err = register_vlan_dev(new_dev);
+	if (err < 0)
+		goto out_free_newdev;
+
+	return 0;
+
+out_free_newdev:
+	free_netdev(new_dev);
+	return err;
+}
+
+static void vlan_sync_address(struct net_device *dev,
+			      struct net_device *vlandev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(vlandev);
+
+	/* May be called without an actual change */
+	if (!compare_ether_addr(vlan->real_dev_addr, dev->dev_addr))
+		return;
+
+	/* vlan address was different from the old address and is equal to
+	 * the new address */
+	if (compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) &&
+	    !compare_ether_addr(vlandev->dev_addr, dev->dev_addr))
+		dev_uc_del(dev, vlandev->dev_addr);
+
+	/* vlan address was equal to the old address and is different from
+	 * the new address */
+	if (!compare_ether_addr(vlandev->dev_addr, vlan->real_dev_addr) &&
+	    compare_ether_addr(vlandev->dev_addr, dev->dev_addr))
+		dev_uc_add(dev, vlandev->dev_addr);
+
+	memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN);	/* becomes the "old" address next time */
+}
+
+static void vlan_transfer_features(struct net_device *dev,
+				   struct net_device *vlandev)
+{
+	vlandev->gso_max_size = dev->gso_max_size;
+
+	if (dev->features & NETIF_F_HW_VLAN_TX)
+		vlandev->hard_header_len = dev->hard_header_len;	/* presumably HW adds the tag */
+	else
+		vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN;	/* extra room for the tag */
+
+#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+	vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid;
+#endif
+
+	netdev_update_features(vlandev);
+}
+
+static void __vlan_device_event(struct net_device *dev, unsigned long event)
+{
+	switch (event) {
+	case NETDEV_CHANGENAME:
+		vlan_proc_rem_dev(dev);	/* drop and re-add the proc entry for the rename */
+		if (vlan_proc_add_dev(dev) < 0)
+			pr_warning("8021q: failed to change proc name for %s\n",
+					dev->name);
+		break;
+	case NETDEV_REGISTER:
+		if (vlan_proc_add_dev(dev) < 0)
+			pr_warning("8021q: failed to add proc entry for %s\n",
+					dev->name);
+		break;
+	case NETDEV_UNREGISTER:
+		vlan_proc_rem_dev(dev);
+		break;
+	}	/* all other events are ignored here */
+}
+
+static int vlan_device_event(struct notifier_block *unused, unsigned long event,
+			     void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct vlan_group *grp;
+	int i, flgs;
+	struct net_device *vlandev;
+	struct vlan_dev_info *vlan;
+	LIST_HEAD(list);
+
+	if (is_vlan_dev(dev))
+		__vlan_device_event(dev, event);
+
+	if ((event == NETDEV_UP) &&
+	    (dev->features & NETIF_F_HW_VLAN_FILTER) &&
+	    dev->netdev_ops->ndo_vlan_rx_add_vid) {
+		pr_info("8021q: adding VLAN 0 to HW filter on device %s\n",
+			dev->name);
+		dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0);
+	}
+
+	grp = rtnl_dereference(dev->vlgrp);
+	if (!grp)
+		goto out;
+
+	/* It is OK that we do not hold the group lock right now,
+	 * as we run under the RTNL lock.
+	 */
+
+	switch (event) {
+	case NETDEV_CHANGE:
+		/* Propagate real device state to vlan devices */
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			netif_stacked_transfer_operstate(dev, vlandev);
+		}
+		break;
+
+	case NETDEV_CHANGEADDR:
+		/* Adjust unicast filters on underlying device */
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			flgs = vlandev->flags;
+			if (!(flgs & IFF_UP))
+				continue;
+
+			vlan_sync_address(dev, vlandev);
+		}
+		break;
+
+	case NETDEV_CHANGEMTU:
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			if (vlandev->mtu <= dev->mtu)
+				continue;
+
+			dev_set_mtu(vlandev, dev->mtu);
+		}
+		break;
+
+	case NETDEV_FEAT_CHANGE:
+		/* Propagate real device features to the vlan devices */
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			vlan_transfer_features(dev, vlandev);
+		}
+
+		break;
+
+	case NETDEV_DOWN:
+		/* Put all VLANs for this dev in the down state too.  */
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			flgs = vlandev->flags;
+			if (!(flgs & IFF_UP))
+				continue;
+
+			vlan = vlan_dev_info(vlandev);
+			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+				dev_change_flags(vlandev, flgs & ~IFF_UP);
+			netif_stacked_transfer_operstate(dev, vlandev);
+		}
+		break;
+
+	case NETDEV_UP:
+		/* Put all VLANs for this dev in the up state too.  */
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			flgs = vlandev->flags;
+			if (flgs & IFF_UP)
+				continue;
+
+			vlan = vlan_dev_info(vlandev);
+			if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+				dev_change_flags(vlandev, flgs | IFF_UP);
+			netif_stacked_transfer_operstate(dev, vlandev);
+		}
+		break;
+
+	case NETDEV_UNREGISTER:
+		/* ignore netns moves: act only when dev is really unregistering */
+		if (dev->reg_state != NETREG_UNREGISTERING)
+			break;
+
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			/* unregistration of last vlan destroys group, abort
+			 * afterwards */
+			if (grp->nr_vlans == 1)
+				i = VLAN_N_VID;
+
+			unregister_vlan_dev(vlandev, &list);
+		}
+		unregister_netdevice_many(&list);
+		break;
+
+	case NETDEV_PRE_TYPE_CHANGE:
+		/* Forbid the underlying device to change its type. */
+		return NOTIFY_BAD;
+
+	case NETDEV_NOTIFY_PEERS:
+	case NETDEV_BONDING_FAILOVER:
+		/* Propagate to vlan devices */
+		for (i = 0; i < VLAN_N_VID; i++) {
+			vlandev = vlan_group_get_device(grp, i);
+			if (!vlandev)
+				continue;
+
+			call_netdevice_notifiers(event, vlandev);
+		}
+		break;
+	}
+
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block vlan_notifier_block __read_mostly = {
+	.notifier_call = vlan_device_event,	/* registered in vlan_proto_init() */
+};
+
+/*
+ *	VLAN IOCTL handler.
+ *	o copies a struct vlan_ioctl_args from @arg and runs args.cmd under RTNL
+ *	o returns 0 or a negative errno; unknown commands give -EOPNOTSUPP
+ */
+static int vlan_ioctl_handler(struct net *net, void __user *arg)
+{
+	int err;
+	struct vlan_ioctl_args args;
+	struct net_device *dev = NULL;
+
+	if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args)))
+		return -EFAULT;
+
+	/* Force a NUL at index 23 of both name fields, just in case. */
+	args.device1[23] = 0;
+	args.u.device2[23] = 0;
+
+	rtnl_lock();
+
+	switch (args.cmd) {
+	case SET_VLAN_INGRESS_PRIORITY_CMD:
+	case SET_VLAN_EGRESS_PRIORITY_CMD:
+	case SET_VLAN_FLAG_CMD:
+	case ADD_VLAN_CMD:
+	case DEL_VLAN_CMD:
+	case GET_VLAN_REALDEV_NAME_CMD:
+	case GET_VLAN_VID_CMD:
+		err = -ENODEV;
+		dev = __dev_get_by_name(net, args.device1);
+		if (!dev)
+			goto out;
+
+		err = -EINVAL;
+		if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev))
+			goto out;
+	}
+
+	switch (args.cmd) {
+	case SET_VLAN_INGRESS_PRIORITY_CMD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		vlan_dev_set_ingress_priority(dev,
+					      args.u.skb_priority,
+					      args.vlan_qos);
+		err = 0;
+		break;
+
+	case SET_VLAN_EGRESS_PRIORITY_CMD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		err = vlan_dev_set_egress_priority(dev,
+						   args.u.skb_priority,
+						   args.vlan_qos);
+		break;
+
+	case SET_VLAN_FLAG_CMD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		err = vlan_dev_change_flags(dev,
+					    args.vlan_qos ? args.u.flag : 0,
+					    args.u.flag);
+		break;
+
+	case SET_VLAN_NAME_TYPE_CMD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		if ((args.u.name_type >= 0) &&
+		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
+			struct vlan_net *vn;
+
+			vn = net_generic(net, vlan_net_id);
+			vn->name_type = args.u.name_type;
+			err = 0;
+		} else {
+			err = -EINVAL;
+		}
+		break;
+
+	case ADD_VLAN_CMD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		err = register_vlan_device(dev, args.u.VID);
+		break;
+
+	case DEL_VLAN_CMD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		unregister_vlan_dev(dev, NULL);
+		err = 0;
+		break;
+
+	case GET_VLAN_REALDEV_NAME_CMD:
+		err = 0;
+		vlan_dev_get_realdev_name(dev, args.u.device2);
+		if (copy_to_user(arg, &args,
+				 sizeof(struct vlan_ioctl_args)))
+			err = -EFAULT;
+		break;
+
+	case GET_VLAN_VID_CMD:
+		err = 0;
+		args.u.VID = vlan_dev_vlan_id(dev);
+		if (copy_to_user(arg, &args,
+				 sizeof(struct vlan_ioctl_args)))
+		      err = -EFAULT;
+		break;
+
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+out:
+	rtnl_unlock();
+	return err;
+}
+
+static int __net_init vlan_init_net(struct net *net)
+{
+	struct vlan_net *vn = net_generic(net, vlan_net_id);
+	int err;
+
+	vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD;	/* default names look like eth0.5 */
+
+	err = vlan_proc_init(net);
+
+	return err;
+}
+
+static void __net_exit vlan_exit_net(struct net *net)
+{
+	vlan_proc_cleanup(net);	/* counterpart of vlan_proc_init() in vlan_init_net() */
+}
+
+static struct pernet_operations vlan_net_ops = {
+	.init = vlan_init_net,
+	.exit = vlan_exit_net,
+	.id   = &vlan_net_id,
+	.size = sizeof(struct vlan_net),	/* looked up via net_generic(net, vlan_net_id) */
+};
+
+static int __init vlan_proto_init(void)
+{
+	int err;
+
+	pr_info("%s v%s\n", vlan_fullname, vlan_version);
+
+	err = register_pernet_subsys(&vlan_net_ops);
+	if (err < 0)
+		goto err0;
+
+	err = register_netdevice_notifier(&vlan_notifier_block);
+	if (err < 0)
+		goto err2;
+
+	err = vlan_gvrp_init();
+	if (err < 0)
+		goto err3;
+
+	err = vlan_netlink_init();
+	if (err < 0)
+		goto err4;
+
+	vlan_ioctl_set(vlan_ioctl_handler);	/* installed last, once everything else succeeded */
+	return 0;
+
+err4:	/* unwind in reverse order of the setup above */
+	vlan_gvrp_uninit();
+err3:
+	unregister_netdevice_notifier(&vlan_notifier_block);
+err2:
+	unregister_pernet_subsys(&vlan_net_ops);
+err0:
+	return err;
+}
+
+static void __exit vlan_cleanup_module(void)
+{
+	vlan_ioctl_set(NULL);	/* remove the ioctl hook first */
+	vlan_netlink_fini();
+
+	unregister_netdevice_notifier(&vlan_notifier_block);
+
+	unregister_pernet_subsys(&vlan_net_ops);
+	rcu_barrier(); /* Wait for outstanding call_rcu() callbacks to complete */
+
+	vlan_gvrp_uninit();
+}
+
+module_init(vlan_proto_init);
+module_exit(vlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
new file mode 100644
index 00000000..9da07e30
--- /dev/null
+++ b/net/8021q/vlan.h
@@ -0,0 +1,134 @@
+#ifndef __BEN_VLAN_802_1Q_INC__
+#define __BEN_VLAN_802_1Q_INC__
+
+#include <linux/if_vlan.h>
+#include <linux/u64_stats_sync.h>
+
+
+/**
+ *	struct vlan_priority_tci_mapping - one vlan egress priority mapping
+ *	@priority: skb priority
+ *	@vlan_qos: vlan priority: (skb->priority << 13) & 0xE000
+ *	@next: next mapping in the singly linked list
+ */
+struct vlan_priority_tci_mapping {
+	u32					priority;
+	u16					vlan_qos;
+	struct vlan_priority_tci_mapping	*next;
+};
+
+
+/**
+ *	struct vlan_pcpu_stats - VLAN percpu rx/tx stats
+ *	@rx_packets: number of received packets
+ *	@rx_bytes: number of received bytes
+ *	@rx_multicast: number of received multicast packets
+ *	@tx_packets: number of transmitted packets
+ *	@tx_bytes: number of transmitted bytes
+ *	@syncp: synchronization point for the 64bit counters above
+ *	@rx_errors: number of rx errors
+ *	@tx_dropped: number of tx drops
+ */
+struct vlan_pcpu_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+	u32			rx_errors;
+	u32			tx_dropped;
+};
+
+/**
+ *	struct vlan_dev_info - VLAN private device data
+ *	@nr_ingress_mappings: number of ingress priority mappings
+ *	@ingress_priority_map: ingress priority mappings
+ *	@nr_egress_mappings: number of egress priority mappings
+ *	@egress_priority_map: hash of egress priority mappings
+ *	@vlan_id: VLAN identifier
+ *	@flags: VLAN_FLAG_* device flags
+ *	@real_dev: underlying netdevice
+ *	@real_dev_addr: address of underlying netdevice
+ *	@dent: proc dir entry
+ *	@vlan_pcpu_stats: ptr to percpu rx/tx stats (struct vlan_pcpu_stats)
+ */
+struct vlan_dev_info {
+	unsigned int				nr_ingress_mappings;
+	u32					ingress_priority_map[8];
+	unsigned int				nr_egress_mappings;
+	struct vlan_priority_tci_mapping	*egress_priority_map[16];
+
+	u16					vlan_id;
+	u16					flags;
+
+	struct net_device			*real_dev;
+	unsigned char				real_dev_addr[ETH_ALEN];
+
+	struct proc_dir_entry			*dent;
+	struct vlan_pcpu_stats __percpu		*vlan_pcpu_stats;
+};
+
+static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
+{
+	return netdev_priv(dev);	/* assumes dev is a vlan device; not checked here */
+}
+
+/* found in vlan_dev.c */
+void vlan_dev_set_ingress_priority(const struct net_device *dev,
+				   u32 skb_prio, u16 vlan_prio);
+int vlan_dev_set_egress_priority(const struct net_device *dev,
+				 u32 skb_prio, u16 vlan_prio);
+int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
+
+int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id);
+void vlan_setup(struct net_device *dev);
+int register_vlan_dev(struct net_device *dev);
+void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+
+static inline u32 vlan_get_ingress_priority(struct net_device *dev,
+					    u16 vlan_tci)
+{
+	struct vlan_dev_info *vip = vlan_dev_info(dev);	/* map is indexed by 3 bits of the TCI */
+
+	return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7];
+}
+
+#ifdef CONFIG_VLAN_8021Q_GVRP
+extern int vlan_gvrp_request_join(const struct net_device *dev);
+extern void vlan_gvrp_request_leave(const struct net_device *dev);
+extern int vlan_gvrp_init_applicant(struct net_device *dev);
+extern void vlan_gvrp_uninit_applicant(struct net_device *dev);
+extern int vlan_gvrp_init(void);
+extern void vlan_gvrp_uninit(void);
+#else /* !CONFIG_VLAN_8021Q_GVRP: no-op stubs, the int ones report success */
+static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; }
+static inline void vlan_gvrp_request_leave(const struct net_device *dev) {}
+static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; }
+static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {}
+static inline int vlan_gvrp_init(void) { return 0; }
+static inline void vlan_gvrp_uninit(void) {}
+#endif /* CONFIG_VLAN_8021Q_GVRP */
+
+extern const char vlan_fullname[];
+extern const char vlan_version[];
+extern int vlan_netlink_init(void);
+extern void vlan_netlink_fini(void);
+
+extern struct rtnl_link_ops vlan_link_ops;
+
+extern int vlan_net_id;
+
+struct proc_dir_entry;
+
+struct vlan_net {
+	/* /proc/net/vlan */
+	struct proc_dir_entry *proc_vlan_dir;
+	/* /proc/net/vlan/config */
+	struct proc_dir_entry *proc_vlan_conf;
+	/* Interface naming scheme: one of the VLAN_NAME_TYPE_* values. */
+	unsigned short name_type;
+};
+
+#endif /* !(__BEN_VLAN_802_1Q_INC__) */
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
new file mode 100644
index 00000000..27263fb1
--- /dev/null
+++ b/net/8021q/vlan_core.c
@@ -0,0 +1,181 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/netpoll.h>
+#include "vlan.h"
+
+bool vlan_do_receive(struct sk_buff **skbp)
+{
+	struct sk_buff *skb = *skbp;
+	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
+	struct net_device *vlan_dev;
+	struct vlan_pcpu_stats *rx_stats;
+
+	vlan_dev = vlan_find_dev(skb->dev, vlan_id);
+	if (!vlan_dev) {
+		if (vlan_id)
+			skb->pkt_type = PACKET_OTHERHOST;
+		return false;	/* no vlan device for this VID on skb->dev */
+	}
+
+	skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return false;	/* *skbp is NULL in this case */
+
+	skb->dev = vlan_dev;
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		/* Our lower layer thinks this is not local, let's make sure.
+		 * This allows the VLAN to have a different MAC than the
+		 * underlying device, and still route correctly. */
+		if (!compare_ether_addr(eth_hdr(skb)->h_dest,
+					vlan_dev->dev_addr))
+			skb->pkt_type = PACKET_HOST;
+	}
+
+	if (!(vlan_dev_info(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+		unsigned int offset = skb->data - skb_mac_header(skb);
+
+		/*
+		 * vlan_insert_tag() expects skb->data to point to the mac
+		 * header, so move skb->data there before calling it and
+		 * restore the original position afterwards.
+		 */
+		skb_push(skb, offset);
+		skb = *skbp = vlan_insert_tag(skb, skb->vlan_tci);
+		if (!skb)
+			return false;
+		skb_pull(skb, offset + VLAN_HLEN);
+		skb_reset_mac_len(skb);
+	}
+
+	skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
+	skb->vlan_tci = 0;
+
+	rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_pcpu_stats);
+
+	u64_stats_update_begin(&rx_stats->syncp);
+	rx_stats->rx_packets++;
+	rx_stats->rx_bytes += skb->len;
+	if (skb->pkt_type == PACKET_MULTICAST)
+		rx_stats->rx_multicast++;
+	u64_stats_update_end(&rx_stats->syncp);
+
+	return true;
+}
+
+struct net_device *vlan_dev_real_dev(const struct net_device *dev)
+{
+	return vlan_dev_info(dev)->real_dev;	/* assumes dev is a vlan device; not checked here */
+}
+EXPORT_SYMBOL(vlan_dev_real_dev);
+
+u16 vlan_dev_vlan_id(const struct net_device *dev)
+{
+	return vlan_dev_info(dev)->vlan_id;	/* assumes dev is a vlan device; not checked here */
+}
+EXPORT_SYMBOL(vlan_dev_vlan_id);
+
+/* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
+int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
+		      u16 vlan_tci, int polling)	/* @grp is unused */
+{
+	__vlan_hwaccel_put_tag(skb, vlan_tci);
+	return polling ? netif_receive_skb(skb) : netif_rx(skb);
+}
+EXPORT_SYMBOL(__vlan_hwaccel_rx);
+
+gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
+			      unsigned int vlan_tci, struct sk_buff *skb)	/* @grp is unused */
+{
+	__vlan_hwaccel_put_tag(skb, vlan_tci);	/* record the tag, then hand over to GRO */
+	return napi_gro_receive(napi, skb);
+}
+EXPORT_SYMBOL(vlan_gro_receive);
+
+gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
+			    unsigned int vlan_tci)	/* @grp is unused */
+{
+	__vlan_hwaccel_put_tag(napi->skb, vlan_tci);	/* tag goes on napi->skb */
+	return napi_gro_frags(napi);
+}
+EXPORT_SYMBOL(vlan_gro_frags);
+
+static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
+{
+	if (skb_cow(skb, skb_headroom(skb)) < 0)
+		return NULL;	/* skb is NOT freed here; the caller still owns it */
+	memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);	/* move both MAC addresses up to skb->data */
+	skb->mac_header += VLAN_HLEN;
+	skb_reset_mac_len(skb);
+	return skb;	/* same pointer that was passed in */
+}
+
+static void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr)
+{
+	__be16 proto;
+	unsigned char *rawp;
+
+	/*
+	 * Was a VLAN packet, grab the encapsulated protocol, which the layer
+	 * three protocols care about.
+	 */
+
+	proto = vhdr->h_vlan_encapsulated_proto;
+	if (ntohs(proto) >= 1536) {	/* used as the protocol as-is; smaller values are classified below */
+		skb->protocol = proto;
+		return;
+	}
+
+	rawp = skb->data;
+	if (*(unsigned short *) rawp == 0xFFFF)
+		/*
+		 * This is a magic hack to spot IPX packets. Older Novell
+		 * breaks the protocol design and runs IPX over 802.3 without
+		 * an 802.2 LLC layer. We look for FFFF which isn't a used
+		 * 802.2 SSAP/DSAP. This won't work for fault tolerant netware
+		 * but does for the rest.
+		 */
+		skb->protocol = htons(ETH_P_802_3);
+	else
+		/*
+		 * Real 802.2 LLC
+		 */
+		skb->protocol = htons(ETH_P_802_2);
+}
+
+struct sk_buff *vlan_untag(struct sk_buff *skb)
+{
+	struct vlan_hdr *vhdr;
+	u16 vlan_tci;
+
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		/* vlan_tci is already set-up so leave this for another time */
+		return skb;
+	}
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		goto err_free;
+
+	if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+		goto err_free;
+
+	vhdr = (struct vlan_hdr *) skb->data;
+	vlan_tci = ntohs(vhdr->h_vlan_TCI);
+	__vlan_hwaccel_put_tag(skb, vlan_tci);	/* move the tag into skb metadata */
+
+	skb_pull_rcsum(skb, VLAN_HLEN);
+	vlan_set_encap_proto(skb, vhdr);
+
+	/* Keep skb on failure: the helper does not free it, err_free must. */
+	if (unlikely(!vlan_reorder_header(skb)))
+		goto err_free;
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	return skb;
+
+err_free:
+	kfree_skb(skb);	/* skb is NULL only after a failed skb_share_check() */
+	return NULL;
+}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
new file mode 100644
index 00000000..d5484561
--- /dev/null
+++ b/net/8021q/vlan_dev.c
@@ -0,0 +1,705 @@
+/* -*- linux-c -*-
+ * INET		802.1Q VLAN
+ *		Ethernet-type device handling.
+ *
+ * Authors:	Ben Greear <greearb@candelatech.com>
+ *              Please send support related email to: netdev@vger.kernel.org
+ *              VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
+ *
+ * Fixes:       Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com>
+ *                - reset skb->pkt_type on incoming packets when MAC was changed
+ *                - see that changed MAC is saddr for outgoing packets
+ *              Oct 20, 2001:  Ard van Breeman:
+ *                - Fix MC-list, finally.
+ *                - Flush MC-list on VLAN destroy.
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <net/arp.h>
+
+#include "vlan.h"
+#include "vlanproc.h"
+#include <linux/if_vlan.h>
+
+/*
+ *	Rebuild the Ethernet MAC header. This is called after an ARP
+ *	(or in future other address resolution) has completed on this
+ *	sk_buff. We now let ARP fill in the other fields.
+ *
+ *	This routine CANNOT use cached dst->neigh!
+ *	Really, it is used only when dst->neigh is wrong.
+ *
+ * TODO:  This needs a checkup, I'm ignorant here. --BLG
+ */
+static int vlan_dev_rebuild_header(struct sk_buff *skb)
+{
+	struct vlan_ethhdr *hdr = (struct vlan_ethhdr *)(skb->data);
+	struct net_device *vlan_dev = skb->dev;
+
+#ifdef CONFIG_INET
+	/* IPv4 is the only encapsulated protocol handed to arp_find(). */
+	if (hdr->h_vlan_encapsulated_proto == htons(ETH_P_IP)) {
+		/* TODO:  Confirm this will work with VLAN headers... */
+		return arp_find(hdr->h_dest, skb);
+	}
+#endif
+
+	/* Anything else: only the source address is filled in, with ours. */
+	pr_debug("%s: unable to resolve type %X addresses.\n",
+		 vlan_dev->name, ntohs(hdr->h_vlan_encapsulated_proto));
+	memcpy(hdr->h_source, vlan_dev->dev_addr, ETH_ALEN);
+
+	return 0;
+}
+
+/*
+ * Find the egress QoS bits mapped to skb->priority on this VLAN device.
+ * The map is a 16-bucket table of chains, indexed by the low four bits of
+ * the priority.  Returns 0 when no mapping matches.
+ */
+static inline u16
+vlan_dev_get_egress_qos_mask(struct net_device *dev, struct sk_buff *skb)
+{
+	const unsigned int bucket = skb->priority & 0xF;
+	struct vlan_priority_tci_mapping *entry;
+
+	for (entry = vlan_dev_info(dev)->egress_priority_map[bucket];
+	     entry; entry = entry->next) {
+		/*
+		 * vlan_qos should already be shifted to mask correctly
+		 * with the VLAN's TCI.
+		 */
+		if (entry->priority == skb->priority)
+			return entry->vlan_qos;
+	}
+	return 0;
+}
+
+/*
+ *	Create the VLAN header for an arbitrary protocol layer
+ *
+ *	saddr=NULL	means use device source address
+ *	daddr=NULL	means leave destination address (eg unresolved arp)
+ *
+ *  This is called when the SKB is moving down the stack towards the
+ *  physical devices.
+ *
+ *  Returns the result of dev_hard_header() on the real device; a positive
+ *  result is increased by VLAN_HLEN when a VLAN header was pushed here.
+ */
+static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
+				unsigned short type,
+				const void *daddr, const void *saddr,
+				unsigned int len)
+{
+	struct vlan_hdr *vhdr;
+	unsigned int vhdrlen = 0;
+	u16 vlan_tci = 0;
+	int rc;
+
+	/* The VLAN header is only built in-band when REORDER_HDR is off. */
+	if (!(vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+		vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN);
+
+		/* TCI = VLAN id combined with the egress QoS bits. */
+		vlan_tci = vlan_dev_info(dev)->vlan_id;
+		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
+		vhdr->h_vlan_TCI = htons(vlan_tci);
+
+		/*
+		 *  Set the protocol type. For a packet of type ETH_P_802_3/2 we
+		 *  put the length in here instead.
+		 */
+		if (type != ETH_P_802_3 && type != ETH_P_802_2)
+			vhdr->h_vlan_encapsulated_proto = htons(type);
+		else
+			vhdr->h_vlan_encapsulated_proto = htons(len);
+
+		/* From here on the outer frame is an 802.1Q frame. */
+		skb->protocol = htons(ETH_P_8021Q);
+		type = ETH_P_8021Q;
+		vhdrlen = VLAN_HLEN;
+	}
+
+	/* Before delegating work to the lower layer, enter our MAC-address */
+	if (saddr == NULL)
+		saddr = dev->dev_addr;
+
+	/* Now make the underlying real hard header */
+	dev = vlan_dev_info(dev)->real_dev;
+	rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen);
+	if (rc > 0)
+		rc += vhdrlen;
+	return rc;
+}
+
+/*
+ * ndo_start_xmit: tag the frame if needed, queue it on the real device and
+ * account the result in this VLAN device's per-cpu statistics.
+ *
+ * Returns the value of dev_queue_xmit().
+ */
+static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
+	unsigned int len;
+	int ret;
+
+	/* Handle non-VLAN frames if they are sent to us, for example by DHCP.
+	 *
+	 * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING
+	 * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
+	 */
+	if (veth->h_vlan_proto != htons(ETH_P_8021Q) ||
+	    vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR) {
+		u16 vlan_tci;
+		vlan_tci = vlan_dev_info(dev)->vlan_id;
+		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
+		skb = __vlan_hwaccel_put_tag(skb, vlan_tci);
+	}
+
+	/*
+	 * skb->len is saved before the skb is handed to dev_queue_xmit();
+	 * the skb itself is not dereferenced again afterwards.
+	 */
+	skb->dev = vlan_dev_info(dev)->real_dev;
+	len = skb->len;
+	ret = dev_queue_xmit(skb);
+
+	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
+		struct vlan_pcpu_stats *stats;
+
+		/* 64-bit counters are updated inside the syncp write section. */
+		stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats);
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped);
+	}
+
+	return ret;
+}
+
+/*
+ * ndo_change_mtu: accept @new_mtu unless it exceeds the MTU of the real
+ * device, in which case -ERANGE is returned and nothing changes.
+ */
+static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
+{
+	const struct net_device *lower = vlan_dev_info(dev)->real_dev;
+
+	/* TODO: gotta make sure the underlying layer can handle it,
+	 * maybe an IFF_VLAN_CAPABLE flag for devices?
+	 */
+	if (new_mtu > lower->mtu)
+		return -ERANGE;
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/*
+ * Map VLAN priority @vlan_prio (low three bits used) to skb priority
+ * @skb_prio for received frames.  nr_ingress_mappings counts the slots
+ * holding a non-zero value, so it is adjusted when a slot changes between
+ * zero and non-zero.
+ */
+void vlan_dev_set_ingress_priority(const struct net_device *dev,
+				   u32 skb_prio, u16 vlan_prio)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	const unsigned int slot = vlan_prio & 0x7;
+
+	if (vlan->ingress_priority_map[slot]) {
+		if (!skb_prio)
+			vlan->nr_ingress_mappings--;
+	} else if (skb_prio) {
+		vlan->nr_ingress_mappings++;
+	}
+
+	vlan->ingress_priority_map[slot] = skb_prio;
+}
+
+/*
+ * Map skb priority @skb_prio to VLAN priority @vlan_prio for transmitted
+ * frames.  An existing entry for @skb_prio is updated in place; otherwise a
+ * new entry is allocated and put at the head of its bucket's chain.
+ * nr_egress_mappings counts the entries with non-zero QoS bits.
+ *
+ * Returns 0 on success or -ENOBUFS if the allocation fails.
+ */
+int vlan_dev_set_egress_priority(const struct net_device *dev,
+				 u32 skb_prio, u16 vlan_prio)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
+	struct vlan_priority_tci_mapping **bucket;
+	struct vlan_priority_tci_mapping *cur, *fresh;
+
+	bucket = &vlan->egress_priority_map[skb_prio & 0xF];
+
+	/* See if a priority mapping exists.. */
+	for (cur = *bucket; cur; cur = cur->next) {
+		if (cur->priority != skb_prio)
+			continue;
+
+		if (cur->vlan_qos && !vlan_qos)
+			vlan->nr_egress_mappings--;
+		else if (!cur->vlan_qos && vlan_qos)
+			vlan->nr_egress_mappings++;
+		cur->vlan_qos = vlan_qos;
+		return 0;
+	}
+
+	/* Create a new mapping then. */
+	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+	if (!fresh)
+		return -ENOBUFS;
+
+	/* Fill the entry in completely before linking it into the chain. */
+	fresh->next = *bucket;
+	fresh->priority = skb_prio;
+	fresh->vlan_qos = vlan_qos;
+	*bucket = fresh;
+	if (vlan_qos)
+		vlan->nr_egress_mappings++;
+	return 0;
+}
+
+/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */
+int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
+{
+	const u32 supported = VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+			      VLAN_FLAG_LOOSE_BINDING;
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	u32 old_flags = vlan->flags;
+
+	/* Refuse any attempt to touch a bit outside the supported set. */
+	if (mask & ~supported)
+		return -EINVAL;
+
+	/* Only the bits selected by @mask take their value from @flags. */
+	vlan->flags = (old_flags & ~mask) | (flags & mask);
+
+	/* A GVRP toggle needs no action unless the device is running. */
+	if (!netif_running(dev))
+		return 0;
+	if (!((vlan->flags ^ old_flags) & VLAN_FLAG_GVRP))
+		return 0;
+
+	if (vlan->flags & VLAN_FLAG_GVRP)
+		vlan_gvrp_request_join(dev);
+	else
+		vlan_gvrp_request_leave(dev);
+	return 0;
+}
+
+/*
+ * Copy the name of the underlying real device into @result.
+ * strncpy() is given a length of 23, so @result must have room for at
+ * least 23 bytes.
+ */
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
+{
+	const struct net_device *lower = vlan_dev_info(dev)->real_dev;
+
+	strncpy(result, lower->name, 23);
+}
+
+/*
+ * ndo_open: bring the VLAN device up on top of its real device.
+ *
+ * Fails with -ENETDOWN when the real device is not IFF_UP, unless
+ * VLAN_FLAG_LOOSE_BINDING is set.  Otherwise adds our unicast address to the
+ * real device (when it differs from the real device's own address) and
+ * propagates IFF_ALLMULTI / IFF_PROMISC to it.  On error the steps already
+ * taken are undone in reverse order, the carrier is turned off and the error
+ * is returned.
+ */
+static int vlan_dev_open(struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct net_device *real_dev = vlan->real_dev;
+	int err;
+
+	if (!(real_dev->flags & IFF_UP) &&
+	    !(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
+		return -ENETDOWN;
+
+	/* compare_ether_addr() is non-zero when the two addresses differ. */
+	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) {
+		err = dev_uc_add(real_dev, dev->dev_addr);
+		if (err < 0)
+			goto out;
+	}
+
+	if (dev->flags & IFF_ALLMULTI) {
+		err = dev_set_allmulti(real_dev, 1);
+		if (err < 0)
+			goto del_unicast;
+	}
+	if (dev->flags & IFF_PROMISC) {
+		err = dev_set_promiscuity(real_dev, 1);
+		if (err < 0)
+			goto clear_allmulti;
+	}
+
+	/* Remember the real device's address as it was at open time. */
+	memcpy(vlan->real_dev_addr, real_dev->dev_addr, ETH_ALEN);
+
+	if (vlan->flags & VLAN_FLAG_GVRP)
+		vlan_gvrp_request_join(dev);
+
+	if (netif_carrier_ok(real_dev))
+		netif_carrier_on(dev);
+	return 0;
+
+	/* Error unwinding: each label undoes the step before the failing one. */
+clear_allmulti:
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(real_dev, -1);
+del_unicast:
+	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr))
+		dev_uc_del(real_dev, dev->dev_addr);
+out:
+	netif_carrier_off(dev);
+	return err;
+}
+
+/*
+ * ndo_stop: take back from the real device what this VLAN device put on it
+ * (synced multicast/unicast lists, allmulti and promiscuity references, and
+ * our unicast address when it differs from the real device's), then turn the
+ * carrier off.  Always returns 0.
+ */
+static int vlan_dev_stop(struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct net_device *real_dev = vlan->real_dev;
+
+	dev_mc_unsync(real_dev, dev);
+	dev_uc_unsync(real_dev, dev);
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(real_dev, -1);
+	if (dev->flags & IFF_PROMISC)
+		dev_set_promiscuity(real_dev, -1);
+
+	/* Mirrors the dev_uc_add() done in vlan_dev_open(). */
+	if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr))
+		dev_uc_del(real_dev, dev->dev_addr);
+
+	netif_carrier_off(dev);
+	return 0;
+}
+
+/*
+ * ndo_set_mac_address: change the VLAN device's MAC address.
+ *
+ * @p points to a struct sockaddr holding the new address.  While the device
+ * is up, the real device's unicast list is updated as well: the new address
+ * is added and the old one removed, each only when it differs from the real
+ * device's own address.
+ *
+ * Returns 0, -EADDRNOTAVAIL for an invalid address, or the dev_uc_add()
+ * error.
+ */
+static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	struct sockaddr *new_addr = p;
+	int err;
+
+	if (!is_valid_ether_addr(new_addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	if (dev->flags & IFF_UP) {
+		/* Add the new address first so a failure changes nothing. */
+		if (compare_ether_addr(new_addr->sa_data, lower->dev_addr)) {
+			err = dev_uc_add(lower, new_addr->sa_data);
+			if (err < 0)
+				return err;
+		}
+
+		if (compare_ether_addr(dev->dev_addr, lower->dev_addr))
+			dev_uc_del(lower, dev->dev_addr);
+	}
+
+	memcpy(dev->dev_addr, new_addr->sa_data, ETH_ALEN);
+	return 0;
+}
+
+/*
+ * ndo_do_ioctl: forward the MII ioctls (SIOCGMIIPHY, SIOCGMIIREG,
+ * SIOCSMIIREG) to the real device through a private copy of the request
+ * carrying the real device's name.  The payload is copied back to @ifr only
+ * when the real device's handler returns 0.
+ *
+ * Returns the handler's result, or -EOPNOTSUPP for any other command or when
+ * the real device is not present or has no ioctl handler.
+ */
+static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+	struct ifreq fwd;
+	int err = -EOPNOTSUPP;
+
+	strncpy(fwd.ifr_name, lower->name, IFNAMSIZ);
+	fwd.ifr_ifru = ifr->ifr_ifru;
+
+	if (cmd == SIOCGMIIPHY || cmd == SIOCGMIIREG || cmd == SIOCSMIIREG) {
+		if (netif_device_present(lower) && lower_ops->ndo_do_ioctl)
+			err = lower_ops->ndo_do_ioctl(lower, &fwd, cmd);
+	}
+
+	if (!err)
+		ifr->ifr_ifru = fwd.ifr_ifru;
+
+	return err;
+}
+
+/*
+ * ndo_neigh_setup: delegate to the real device's hook when that device is
+ * present and implements one; otherwise report success (0).
+ */
+static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!netif_device_present(lower) || !lower_ops->ndo_neigh_setup)
+		return 0;
+
+	return lower_ops->ndo_neigh_setup(lower, pa);
+}
+
+#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+/* Forward ndo_fcoe_ddp_setup to the real device; 0 if it has no such hook. */
+static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid,
+				   struct scatterlist *sgl, unsigned int sgc)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!lower_ops->ndo_fcoe_ddp_setup)
+		return 0;
+
+	return lower_ops->ndo_fcoe_ddp_setup(lower, xid, sgl, sgc);
+}
+
+/* Forward ndo_fcoe_ddp_done to the real device; 0 if it has no such hook. */
+static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!lower_ops->ndo_fcoe_ddp_done)
+		return 0;
+
+	return lower_ops->ndo_fcoe_ddp_done(lower, xid);
+}
+
+/* Forward ndo_fcoe_enable to the real device; -EINVAL if it has no such hook. */
+static int vlan_dev_fcoe_enable(struct net_device *dev)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!lower_ops->ndo_fcoe_enable)
+		return -EINVAL;
+
+	return lower_ops->ndo_fcoe_enable(lower);
+}
+
+/* Forward ndo_fcoe_disable to the real device; -EINVAL if it has no such hook. */
+static int vlan_dev_fcoe_disable(struct net_device *dev)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!lower_ops->ndo_fcoe_disable)
+		return -EINVAL;
+
+	return lower_ops->ndo_fcoe_disable(lower);
+}
+
+/* Forward ndo_fcoe_get_wwn to the real device; -EINVAL if it has no such hook. */
+static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!lower_ops->ndo_fcoe_get_wwn)
+		return -EINVAL;
+
+	return lower_ops->ndo_fcoe_get_wwn(lower, wwn, type);
+}
+
+/* Forward ndo_fcoe_ddp_target to the real device; 0 if it has no such hook. */
+static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
+				    struct scatterlist *sgl, unsigned int sgc)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	const struct net_device_ops *lower_ops = lower->netdev_ops;
+
+	if (!lower_ops->ndo_fcoe_ddp_target)
+		return 0;
+
+	return lower_ops->ndo_fcoe_ddp_target(lower, xid, sgl, sgc);
+}
+#endif
+
+/*
+ * ndo_change_rx_flags: for each of IFF_ALLMULTI / IFF_PROMISC present in
+ * @change, take (+1) or drop (-1) a reference on the real device according
+ * to the flag's current state on the VLAN device.
+ */
+static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+
+	if (change & IFF_ALLMULTI) {
+		if (dev->flags & IFF_ALLMULTI)
+			dev_set_allmulti(lower, 1);
+		else
+			dev_set_allmulti(lower, -1);
+	}
+
+	if (change & IFF_PROMISC) {
+		if (dev->flags & IFF_PROMISC)
+			dev_set_promiscuity(lower, 1);
+		else
+			dev_set_promiscuity(lower, -1);
+	}
+}
+
+/* Sync the VLAN device's multicast and unicast lists to the real device. */
+static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
+{
+	struct net_device *lower = vlan_dev_info(vlan_dev)->real_dev;
+
+	dev_mc_sync(lower, vlan_dev);
+	dev_uc_sync(lower, vlan_dev);
+}
+
+/*
+ * vlan network devices have devices nesting below it, and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key vlan_netdev_xmit_lock_key;
+static struct lock_class_key vlan_netdev_addr_lock_key;
+
+/*
+ * netdev_for_each_tx_queue() callback: put one TX queue's _xmit_lock into
+ * the VLAN xmit lock class.  @_subclass points to the int subclass to use.
+ */
+static void vlan_dev_set_lockdep_one(struct net_device *dev,
+				     struct netdev_queue *txq,
+				     void *_subclass)
+{
+	lockdep_set_class_and_subclass(&txq->_xmit_lock,
+				       &vlan_netdev_xmit_lock_key,
+				       *(int *)_subclass);
+}
+
+/*
+ * Assign the VLAN-specific lockdep classes, with the given @subclass, to the
+ * device's addr_list_lock and to the _xmit_lock of each of its TX queues.
+ */
+static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass)
+{
+	lockdep_set_class_and_subclass(&dev->addr_list_lock,
+				       &vlan_netdev_addr_lock_key,
+				       subclass);
+	netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass);
+}
+
+/*
+ * Header operations that build the VLAN header in software.  vlan_dev_init()
+ * installs them when the real device lacks NETIF_F_HW_VLAN_TX.
+ */
+static const struct header_ops vlan_header_ops = {
+	.create	 = vlan_dev_hard_header,
+	.rebuild = vlan_dev_rebuild_header,
+	.parse	 = eth_header_parse,
+};
+
+static const struct net_device_ops vlan_netdev_ops;
+
+/*
+ * ndo_init: derive the VLAN device's initial state from its real device
+ * (flags, link state, features, addresses, header handling), set up the
+ * lockdep classes and allocate the per-cpu statistics.
+ *
+ * Returns 0, or -ENOMEM if the per-cpu statistics cannot be allocated.
+ */
+static int vlan_dev_init(struct net_device *dev)
+{
+	struct net_device *real_dev = vlan_dev_info(dev)->real_dev;
+	int subclass = 0;
+
+	netif_carrier_off(dev);
+
+	/* IFF_BROADCAST|IFF_MULTICAST; ??? */
+	dev->flags  = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
+					  IFF_MASTER | IFF_SLAVE);
+	dev->iflink = real_dev->ifindex;
+	/* Take over NOCARRIER/DORMANT from the real device; always PRESENT. */
+	dev->state  = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) |
+					  (1<<__LINK_STATE_DORMANT))) |
+		      (1<<__LINK_STATE_PRESENT);
+
+	dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
+			   NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |
+			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
+			   NETIF_F_ALL_FCOE;
+
+	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
+	dev->gso_max_size = real_dev->gso_max_size;
+
+	/* ipv6 shared card related stuff */
+	dev->dev_id = real_dev->dev_id;
+
+	/* Addresses still all-zero at this point default to the real device's. */
+	if (is_zero_ether_addr(dev->dev_addr))
+		memcpy(dev->dev_addr, real_dev->dev_addr, dev->addr_len);
+	if (is_zero_ether_addr(dev->broadcast))
+		memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
+
+#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+	dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid;
+#endif
+
+	/*
+	 * With NETIF_F_HW_VLAN_TX the real device's header ops and length are
+	 * reused as they are; otherwise our own header ops are used and
+	 * VLAN_HLEN extra bytes of hard header are accounted for.
+	 */
+	dev->needed_headroom = real_dev->needed_headroom;
+	if (real_dev->features & NETIF_F_HW_VLAN_TX) {
+		dev->header_ops      = real_dev->header_ops;
+		dev->hard_header_len = real_dev->hard_header_len;
+	} else {
+		dev->header_ops      = &vlan_header_ops;
+		dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
+	}
+
+	dev->netdev_ops = &vlan_netdev_ops;
+
+	/* A VLAN stacked on another VLAN device gets lockdep subclass 1. */
+	if (is_vlan_dev(real_dev))
+		subclass = 1;
+
+	vlan_dev_set_lockdep_class(dev, subclass);
+
+	vlan_dev_info(dev)->vlan_pcpu_stats = alloc_percpu(struct vlan_pcpu_stats);
+	if (!vlan_dev_info(dev)->vlan_pcpu_stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * ndo_uninit: release the per-cpu statistics and free every entry of every
+ * egress priority chain.
+ */
+static void vlan_dev_uninit(struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct vlan_priority_tci_mapping **head;
+	struct vlan_priority_tci_mapping *victim;
+	int i;
+
+	free_percpu(vlan->vlan_pcpu_stats);
+	vlan->vlan_pcpu_stats = NULL;
+
+	for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+		head = &vlan->egress_priority_map[i];
+
+		/* Unlink each entry from the chain before freeing it. */
+		while (*head != NULL) {
+			victim = *head;
+			*head = victim->next;
+			kfree(victim);
+		}
+	}
+}
+
+/*
+ * ndo_fix_features: restrict the requested @features to those present in
+ * both real_dev->features and real_dev->vlan_features.  Requested
+ * NETIF_F_SOFT_FEATURES bits are kept regardless, NETIF_F_RXCSUM is added
+ * when dev_ethtool_get_rx_csum() is non-zero for the real device, and
+ * NETIF_F_LLTX is always set.
+ */
+static u32 vlan_dev_fix_features(struct net_device *dev, u32 features)
+{
+	struct net_device *lower = vlan_dev_info(dev)->real_dev;
+	u32 result;
+
+	result  = features & lower->features & lower->vlan_features;
+	result |= features & NETIF_F_SOFT_FEATURES;
+
+	if (dev_ethtool_get_rx_csum(lower))
+		result |= NETIF_F_RXCSUM;
+
+	return result | NETIF_F_LLTX;
+}
+
+/* ethtool get_settings: report the settings of the underlying real device. */
+static int vlan_ethtool_get_settings(struct net_device *dev,
+				     struct ethtool_cmd *cmd)
+{
+	return dev_ethtool_get_settings(vlan_dev_info(dev)->real_dev, cmd);
+}
+
+/*
+ * ethtool get_drvinfo: report the VLAN module's name and version; there is
+ * no firmware, so fw_version is "N/A".
+ *
+ * Each copy is bounded by the size of its destination field, so a longer
+ * source string is truncated instead of overrunning struct ethtool_drvinfo.
+ */
+static void vlan_ethtool_get_drvinfo(struct net_device *dev,
+				     struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, vlan_fullname, sizeof(info->driver));
+	strlcpy(info->version, vlan_version, sizeof(info->version));
+	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+}
+
+/*
+ * ndo_get_stats64: add up the per-cpu counters of this VLAN device into
+ * @stats and return @stats.  If the per-cpu area is not allocated, @stats is
+ * returned unmodified.
+ */
+static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+
+	if (vlan_dev_info(dev)->vlan_pcpu_stats) {
+		struct vlan_pcpu_stats *p;
+		u32 rx_errors = 0, tx_dropped = 0;
+		int i;
+
+		for_each_possible_cpu(i) {
+			u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes;
+			unsigned int start;
+
+			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats, i);
+			/*
+			 * Snapshot the 64-bit counters, repeating the read
+			 * for as long as u64_stats_fetch_retry_bh() asks.
+			 */
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rxpackets	= p->rx_packets;
+				rxbytes		= p->rx_bytes;
+				rxmulticast	= p->rx_multicast;
+				txpackets	= p->tx_packets;
+				txbytes		= p->tx_bytes;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+
+			stats->rx_packets	+= rxpackets;
+			stats->rx_bytes		+= rxbytes;
+			stats->multicast	+= rxmulticast;
+			stats->tx_packets	+= txpackets;
+			stats->tx_bytes		+= txbytes;
+			/* rx_errors & tx_dropped are u32 */
+			rx_errors	+= p->rx_errors;
+			tx_dropped	+= p->tx_dropped;
+		}
+		/* These two are assigned, not added to what @stats held. */
+		stats->rx_errors  = rx_errors;
+		stats->tx_dropped = tx_dropped;
+	}
+	return stats;
+}
+
+/* ethtool operations for VLAN devices; installed by vlan_setup(). */
+static const struct ethtool_ops vlan_ethtool_ops = {
+	.get_settings	        = vlan_ethtool_get_settings,
+	.get_drvinfo	        = vlan_ethtool_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+};
+
+/*
+ * Network device operations for VLAN devices; installed by vlan_setup() and
+ * vlan_dev_init().  The FCoE hooks exist only when FCoE support is
+ * configured.
+ */
+static const struct net_device_ops vlan_netdev_ops = {
+	.ndo_change_mtu		= vlan_dev_change_mtu,
+	.ndo_init		= vlan_dev_init,
+	.ndo_uninit		= vlan_dev_uninit,
+	.ndo_open		= vlan_dev_open,
+	.ndo_stop		= vlan_dev_stop,
+	.ndo_start_xmit =  vlan_dev_hard_start_xmit,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= vlan_dev_set_mac_address,
+	.ndo_set_rx_mode	= vlan_dev_set_rx_mode,
+	.ndo_set_multicast_list	= vlan_dev_set_rx_mode,
+	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
+	.ndo_do_ioctl		= vlan_dev_ioctl,
+	.ndo_neigh_setup	= vlan_dev_neigh_setup,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
+#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
+	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
+	.ndo_fcoe_enable	= vlan_dev_fcoe_enable,
+	.ndo_fcoe_disable	= vlan_dev_fcoe_disable,
+	.ndo_fcoe_get_wwn	= vlan_dev_fcoe_get_wwn,
+	.ndo_fcoe_ddp_target	= vlan_dev_fcoe_ddp_target,
+#endif
+	.ndo_fix_features	= vlan_dev_fix_features,
+};
+
+/*
+ * Set up a freshly allocated net_device as a VLAN device: start from
+ * ether_setup(), then apply the VLAN-specific flags and operation tables.
+ */
+void vlan_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->priv_flags		|= IFF_802_1Q_VLAN;
+	dev->priv_flags		&= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+	dev->tx_queue_len	= 0;
+
+	dev->netdev_ops		= &vlan_netdev_ops;
+	dev->destructor		= free_netdev;
+	dev->ethtool_ops	= &vlan_ethtool_ops;
+
+	/* vlan_dev_init() fills in a broadcast address that is still zero. */
+	memset(dev->broadcast, 0, ETH_ALEN);
+}
diff --git a/net/8021q/vlan_gvrp.c b/net/8021q/vlan_gvrp.c
new file mode 100644
index 00000000..061cecee
--- /dev/null
+++ b/net/8021q/vlan_gvrp.c
@@ -0,0 +1,66 @@
+/*
+ * 	IEEE 802.1Q GARP VLAN Registration Protocol (GVRP)
+ *
+ * 	Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 as published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/if_vlan.h>
+#include <net/garp.h>
+#include "vlan.h"
+
+#define GARP_GVRP_ADDRESS	{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 }
+
+/* Attribute types used with the GVRP application defined in this file. */
+enum gvrp_attributes {
+	GVRP_ATTR_INVALID,
+	GVRP_ATTR_VID,		/* used for the VLAN ID in join/leave requests */
+	__GVRP_ATTR_MAX		/* one past the last valid attribute type */
+};
+#define GVRP_ATTR_MAX	(__GVRP_ATTR_MAX - 1)
+
+/* GARP application descriptor for GVRP, passed to every garp_*() call below. */
+static struct garp_application vlan_gvrp_app __read_mostly = {
+	.proto.group_address	= GARP_GVRP_ADDRESS,
+	.maxattr		= GVRP_ATTR_MAX,
+	.type			= GARP_APPLICATION_GVRP,
+};
+
+/*
+ * Request a GVRP join for this VLAN device's VLAN ID on its real device.
+ * The ID is handed over in network byte order.  Returns the result of
+ * garp_request_join().
+ */
+int vlan_gvrp_request_join(const struct net_device *dev)
+{
+	const struct vlan_dev_info *info = vlan_dev_info(dev);
+	__be16 vid_be = htons(info->vlan_id);
+
+	return garp_request_join(info->real_dev, &vlan_gvrp_app,
+				 &vid_be, sizeof(vid_be), GVRP_ATTR_VID);
+}
+
+/*
+ * Request a GVRP leave for this VLAN device's VLAN ID on its real device.
+ * The ID is handed over in network byte order.
+ */
+void vlan_gvrp_request_leave(const struct net_device *dev)
+{
+	const struct vlan_dev_info *info = vlan_dev_info(dev);
+	__be16 vid_be = htons(info->vlan_id);
+
+	garp_request_leave(info->real_dev, &vlan_gvrp_app,
+			   &vid_be, sizeof(vid_be), GVRP_ATTR_VID);
+}
+
+/* Set up a GVRP applicant on @dev; returns garp_init_applicant()'s result. */
+int vlan_gvrp_init_applicant(struct net_device *dev)
+{
+	return garp_init_applicant(dev, &vlan_gvrp_app);
+}
+
+/* Tear down the GVRP applicant on @dev. */
+void vlan_gvrp_uninit_applicant(struct net_device *dev)
+{
+	garp_uninit_applicant(dev, &vlan_gvrp_app);
+}
+
+/* Register the GVRP application with GARP; returns the registration result. */
+int __init vlan_gvrp_init(void)
+{
+	return garp_register_application(&vlan_gvrp_app);
+}
+
+/* Unregister the GVRP application from GARP. */
+void vlan_gvrp_uninit(void)
+{
+	garp_unregister_application(&vlan_gvrp_app);
+}
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
new file mode 100644
index 00000000..be9a5c19
--- /dev/null
+++ b/net/8021q/vlan_netlink.c
@@ -0,0 +1,240 @@
+/*
+ *	VLAN netlink control interface
+ *
+ * 	Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include "vlan.h"
+
+
+/* Netlink attribute policy for the IFLA_VLAN_* attributes of a vlan link. */
+static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = {
+	[IFLA_VLAN_ID]		= { .type = NLA_U16 },
+	[IFLA_VLAN_FLAGS]	= { .len = sizeof(struct ifla_vlan_flags) },
+	[IFLA_VLAN_EGRESS_QOS]	= { .type = NLA_NESTED },
+	[IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED },
+};
+
+/* Policy for the attributes nested inside IFLA_VLAN_{EGRESS,INGRESS}_QOS. */
+static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = {
+	[IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) },
+};
+
+
+/*
+ * Validate one nested QoS map attribute against vlan_map_policy.
+ * A missing attribute (NULL) is accepted and yields 0.
+ */
+static inline int vlan_validate_qos_map(struct nlattr *attr)
+{
+	return attr ?
+	       nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy) :
+	       0;
+}
+
+/*
+ * rtnl_link_ops.validate: sanity-check the attributes of a vlan link request.
+ *
+ * Returns 0 if everything is acceptable, otherwise:
+ *   -EINVAL         wrong address length, no vlan data at all, or flag bits
+ *                   outside the supported set
+ *   -EADDRNOTAVAIL  the given address is not a valid Ethernet address
+ *   -ERANGE         VLAN ID not below VLAN_VID_MASK
+ *   or the negative result of validating a nested QoS map.
+ */
+static int vlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	const u32 supported = VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
+			      VLAN_FLAG_LOOSE_BINDING;
+	int err;
+
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VLAN_ID] &&
+	    nla_get_u16(data[IFLA_VLAN_ID]) >= VLAN_VID_MASK)
+		return -ERANGE;
+
+	if (data[IFLA_VLAN_FLAGS]) {
+		struct ifla_vlan_flags *req = nla_data(data[IFLA_VLAN_FLAGS]);
+
+		/* Only bits that are both set and selected by the mask count. */
+		if ((req->flags & req->mask) & ~supported)
+			return -EINVAL;
+	}
+
+	err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
+	if (err < 0)
+		return err;
+
+	err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
+	return err < 0 ? err : 0;
+}
+
+/*
+ * rtnl_link_ops.changelink: apply the optional flags and ingress/egress QoS
+ * mapping attributes to a vlan device.
+ *
+ * Returns 0 on success.  Errors from vlan_dev_change_flags() (-EINVAL) and
+ * vlan_dev_set_egress_priority() (-ENOBUFS) are passed on to the caller
+ * instead of being dropped; settings applied before the failing step stay
+ * in effect.
+ */
+static int vlan_changelink(struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ifla_vlan_flags *flags;
+	struct ifla_vlan_qos_mapping *m;
+	struct nlattr *attr;
+	int rem;
+	int err;
+
+	if (data[IFLA_VLAN_FLAGS]) {
+		flags = nla_data(data[IFLA_VLAN_FLAGS]);
+		err = vlan_dev_change_flags(dev, flags->flags, flags->mask);
+		if (err)
+			return err;
+	}
+	if (data[IFLA_VLAN_INGRESS_QOS]) {
+		/* Ingress maps VLAN priority (from) to skb priority (to). */
+		nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
+			m = nla_data(attr);
+			vlan_dev_set_ingress_priority(dev, m->to, m->from);
+		}
+	}
+	if (data[IFLA_VLAN_EGRESS_QOS]) {
+		/* Egress maps skb priority (from) to VLAN priority (to). */
+		nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
+			m = nla_data(attr);
+			err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/*
+ * rtnl_link_ops.newlink: create a vlan device on top of the device named by
+ * IFLA_LINK.
+ *
+ * Returns -EINVAL if IFLA_VLAN_ID or IFLA_LINK is missing or the requested
+ * MTU exceeds the real device's, -ENODEV if the real device does not exist,
+ * or the result of vlan_check_real_dev(), vlan_changelink() or
+ * register_vlan_dev().
+ */
+static int vlan_newlink(struct net *src_net, struct net_device *dev,
+			struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct net_device *real_dev;
+	int err;
+
+	if (!data[IFLA_VLAN_ID])
+		return -EINVAL;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+	real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (!real_dev)
+		return -ENODEV;
+
+	/* New devices start with header reordering switched on. */
+	vlan->vlan_id  = nla_get_u16(data[IFLA_VLAN_ID]);
+	vlan->real_dev = real_dev;
+	vlan->flags    = VLAN_FLAG_REORDER_HDR;
+
+	err = vlan_check_real_dev(real_dev, vlan->vlan_id);
+	if (err < 0)
+		return err;
+
+	/* Without an explicit IFLA_MTU the real device's MTU is inherited. */
+	if (!tb[IFLA_MTU])
+		dev->mtu = real_dev->mtu;
+	else if (dev->mtu > real_dev->mtu)
+		return -EINVAL;
+
+	err = vlan_changelink(dev, tb, data);
+	if (err < 0)
+		return err;
+
+	return register_vlan_dev(dev);
+}
+
+/*
+ * Netlink space reserved for a QoS map with @n mappings: one nest attribute
+ * (IFLA_VLAN_{EGRESS,INGRESS}_QOS) plus @n IFLA_VLAN_QOS_MAPPING attributes.
+ * An empty map takes no space.
+ */
+static inline size_t vlan_qos_map_size(unsigned int n)
+{
+	int nest_size, mapping_size;
+
+	if (!n)
+		return 0;
+
+	nest_size = nla_total_size(sizeof(struct nlattr));
+	mapping_size = nla_total_size(sizeof(struct ifla_vlan_qos_mapping));
+
+	return nest_size + mapping_size * n;
+}
+
+/*
+ * rtnl_link_ops.get_size: upper bound on the netlink space that
+ * vlan_fill_info() needs for this device.
+ *
+ * Every attribute is sized with nla_total_size() so that its header and
+ * padding are included, IFLA_VLAN_FLAGS too.
+ */
+static size_t vlan_get_size(const struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+
+	return nla_total_size(2) +	/* IFLA_VLAN_ID */
+	       nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */
+	       vlan_qos_map_size(vlan->nr_ingress_mappings) +
+	       vlan_qos_map_size(vlan->nr_egress_mappings);
+}
+
+/*
+ * rtnl_link_ops.fill_info: dump the VLAN ID and, when present, the flags and
+ * the ingress/egress QoS maps of @dev into @skb.
+ *
+ * Returns 0 on success or -EMSGSIZE via the nla_put_failure label.
+ * NOTE(review): the NLA_PUT*() macros presumably jump to nla_put_failure
+ * when @skb runs out of room -- confirm against net/netlink.h.
+ */
+static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct vlan_dev_info *vlan = vlan_dev_info(dev);
+	struct vlan_priority_tci_mapping *pm;
+	struct ifla_vlan_flags f;
+	struct ifla_vlan_qos_mapping m;
+	struct nlattr *nest;
+	unsigned int i;
+
+	NLA_PUT_U16(skb, IFLA_VLAN_ID, vlan_dev_info(dev)->vlan_id);
+	if (vlan->flags) {
+		f.flags = vlan->flags;
+		f.mask  = ~0;
+		NLA_PUT(skb, IFLA_VLAN_FLAGS, sizeof(f), &f);
+	}
+	if (vlan->nr_ingress_mappings) {
+		nest = nla_nest_start(skb, IFLA_VLAN_INGRESS_QOS);
+		if (nest == NULL)
+			goto nla_put_failure;
+
+		/* Only slots holding a non-zero skb priority are reported. */
+		for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) {
+			if (!vlan->ingress_priority_map[i])
+				continue;
+
+			m.from = i;
+			m.to   = vlan->ingress_priority_map[i];
+			NLA_PUT(skb, IFLA_VLAN_QOS_MAPPING,
+				sizeof(m), &m);
+		}
+		nla_nest_end(skb, nest);
+	}
+
+	if (vlan->nr_egress_mappings) {
+		nest = nla_nest_start(skb, IFLA_VLAN_EGRESS_QOS);
+		if (nest == NULL)
+			goto nla_put_failure;
+
+		/* Walk every bucket's chain; skip entries without QoS bits. */
+		for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
+			for (pm = vlan->egress_priority_map[i]; pm;
+			     pm = pm->next) {
+				if (!pm->vlan_qos)
+					continue;
+
+				/* Bits 13-15 of vlan_qos give the priority. */
+				m.from = pm->priority;
+				m.to   = (pm->vlan_qos >> 13) & 0x7;
+				NLA_PUT(skb, IFLA_VLAN_QOS_MAPPING,
+					sizeof(m), &m);
+			}
+		}
+		nla_nest_end(skb, nest);
+	}
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/* rtnetlink link operations for link kind "vlan". */
+struct rtnl_link_ops vlan_link_ops __read_mostly = {
+	.kind		= "vlan",
+	.maxtype	= IFLA_VLAN_MAX,
+	.policy		= vlan_policy,
+	.priv_size	= sizeof(struct vlan_dev_info),
+	.setup		= vlan_setup,
+	.validate	= vlan_validate,
+	.newlink	= vlan_newlink,
+	.changelink	= vlan_changelink,
+	.dellink	= unregister_vlan_dev,
+	.get_size	= vlan_get_size,
+	.fill_info	= vlan_fill_info,
+};
+
+/* Register vlan_link_ops with rtnetlink; returns the registration result. */
+int __init vlan_netlink_init(void)
+{
+	return rtnl_link_register(&vlan_link_ops);
+}
+
+/* Unregister vlan_link_ops from rtnetlink. */
+void __exit vlan_netlink_fini(void)
+{
+	rtnl_link_unregister(&vlan_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("vlan");
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
new file mode 100644
index 00000000..d940c49d
--- /dev/null
+++ b/net/8021q/vlanproc.c
@@ -0,0 +1,327 @@
+/******************************************************************************
+ * vlanproc.c	VLAN Module. /proc filesystem interface.
+ *
+ *		This module is completely hardware-independent and provides
+ *		access to the router using Linux /proc filesystem.
+ *
+ * Author:	Ben Greear, <greearb@candelatech.com> copied from wanproc.c
+ *               by: Gene Kozin	<genek@compuserve.com>
+ *
+ * Copyright:	(c) 1998 Ben Greear
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ * ============================================================================
+ * Jan 20, 1998        Ben Greear     Initial Version
+ *****************************************************************************/
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include "vlanproc.h"
+#include "vlan.h"
+
+/****** Function Prototypes *************************************************/
+
+/* Methods for preparing data for reading proc entries */
+static int vlan_seq_show(struct seq_file *seq, void *v);
+static void *vlan_seq_start(struct seq_file *seq, loff_t *pos);
+static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+static void vlan_seq_stop(struct seq_file *seq, void *);
+static int vlandev_seq_show(struct seq_file *seq, void *v);
+
+/*
+ *	Global Data
+ */
+
+
+/*
+ *	Names of the proc directory entries
+ */
+
+static const char name_root[]	 = "vlan";
+static const char name_conf[]	 = "config";
+
+/*
+ *	Structures for interfacing with the /proc filesystem.
+ *	VLAN creates its own directory /proc/net/vlan with the following
+ *	entries:
+ *	config		device status/configuration
+ *	<device>	entry for each  device
+ */
+
+/*
+ *	Generic /proc/net/vlan/<file> file and inode operations
+ */
+
+static const struct seq_operations vlan_seq_ops = {
+	.start = vlan_seq_start,
+	.next = vlan_seq_next,
+	.stop = vlan_seq_stop,
+	.show = vlan_seq_show,
+};
+
+static int vlan_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &vlan_seq_ops,
+			sizeof(struct seq_net_private));
+}
+
+static const struct file_operations vlan_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = vlan_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ *	/proc/net/vlan/<device> file and inode operations
+ */
+
+static int vlandev_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, vlandev_seq_show, PDE(inode)->data);
+}
+
+static const struct file_operations vlandev_fops = {
+	.owner = THIS_MODULE,
+	.open    = vlandev_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+/*
+ * Proc filesystem directory entries.
+ */
+
+/* Strings */
+static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = {
+    [VLAN_NAME_TYPE_RAW_PLUS_VID]        = "VLAN_NAME_TYPE_RAW_PLUS_VID",
+    [VLAN_NAME_TYPE_PLUS_VID_NO_PAD]	 = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD",
+    [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD",
+    [VLAN_NAME_TYPE_PLUS_VID]		 = "VLAN_NAME_TYPE_PLUS_VID",
+};
+/*
+ *	Interface functions
+ */
+
+/*
+ *	Clean up /proc/net/vlan entries
+ */
+
+void vlan_proc_cleanup(struct net *net)
+{
+	struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+	if (vn->proc_vlan_conf)
+		remove_proc_entry(name_conf, vn->proc_vlan_dir);
+
+	if (vn->proc_vlan_dir)
+		proc_net_remove(net, name_root);
+
+	/* Dynamically added entries should be cleaned up as their vlan_device
+	 * is removed, so we should not have to take care of it here...
+	 */
+}
+
+/*
+ *	Create /proc/net/vlan entries
+ */
+
+int __net_init vlan_proc_init(struct net *net)
+{
+	struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+	vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net);
+	if (!vn->proc_vlan_dir)
+		goto err;
+
+	vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
+				     vn->proc_vlan_dir, &vlan_fops);
+	if (!vn->proc_vlan_conf)
+		goto err;
+	return 0;
+
+err:
+	pr_err("%s: can't create entry in proc filesystem!\n", __func__);
+	vlan_proc_cleanup(net);
+	return -ENOBUFS;
+}
+
+/*
+ *	Add directory entry for VLAN device.
+ */
+
+int vlan_proc_add_dev(struct net_device *vlandev)
+{
+	struct vlan_dev_info *dev_info = vlan_dev_info(vlandev);
+	struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id);
+
+	dev_info->dent =
+		proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
+				 vn->proc_vlan_dir, &vlandev_fops, vlandev);
+	if (!dev_info->dent)
+		return -ENOBUFS;
+	return 0;
+}
+
+/*
+ *	Delete directory entry for VLAN device.
+ */
+int vlan_proc_rem_dev(struct net_device *vlandev)
+{
+	struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id);
+
+	/** NOTE:  This will consume the memory pointed to by dent, it seems. */
+	if (vlan_dev_info(vlandev)->dent) {
+		remove_proc_entry(vlan_dev_info(vlandev)->dent->name,
+				  vn->proc_vlan_dir);
+		vlan_dev_info(vlandev)->dent = NULL;
+	}
+	return 0;
+}
+
+/****** Proc filesystem entry points ****************************************/
+
+/*
+ * The following few functions build the content of /proc/net/vlan/config
+ */
+
+/* start read of /proc/net/vlan/config */
+static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	struct net_device *dev;
+	struct net *net = seq_file_net(seq);
+	loff_t i = 1;
+
+	rcu_read_lock();
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for_each_netdev_rcu(net, dev) {
+		if (!is_vlan_dev(dev))
+			continue;
+
+		if (i++ == *pos)
+			return dev;
+	}
+
+	return  NULL;
+}
+
+static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net_device *dev;
+	struct net *net = seq_file_net(seq);
+
+	++*pos;
+
+	dev = (struct net_device *)v;
+	if (v == SEQ_START_TOKEN)
+		dev = net_device_entry(&net->dev_base_head);
+
+	for_each_netdev_continue_rcu(net, dev) {
+		if (!is_vlan_dev(dev))
+			continue;
+
+		return dev;
+	}
+
+	return NULL;
+}
+
+static void vlan_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{
+	rcu_read_unlock();
+}
+
+static int vlan_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq_file_net(seq);
+	struct vlan_net *vn = net_generic(net, vlan_net_id);
+
+	if (v == SEQ_START_TOKEN) {
+		const char *nmtype = NULL;
+
+		seq_puts(seq, "VLAN Dev name	 | VLAN ID\n");
+
+		if (vn->name_type < ARRAY_SIZE(vlan_name_type_str))
+		    nmtype =  vlan_name_type_str[vn->name_type];
+
+		seq_printf(seq, "Name-Type: %s\n",
+			   nmtype ? nmtype :  "UNKNOWN");
+	} else {
+		const struct net_device *vlandev = v;
+		const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev);
+
+		seq_printf(seq, "%-15s| %d  | %s\n",  vlandev->name,
+			   dev_info->vlan_id,    dev_info->real_dev->name);
+	}
+	return 0;
+}
+
+static int vlandev_seq_show(struct seq_file *seq, void *offset)
+{
+	struct net_device *vlandev = (struct net_device *) seq->private;
+	const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev);
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *stats;
+	static const char fmt64[] = "%30s %12llu\n";
+	unsigned int i;
+
+	if (!is_vlan_dev(vlandev))
+		return 0;
+
+	stats = dev_get_stats(vlandev, &temp);
+	seq_printf(seq,
+		   "%s  VID: %d	 REORDER_HDR: %i  dev->priv_flags: %hx\n",
+		   vlandev->name, dev_info->vlan_id,
+		   (int)(dev_info->flags & 1), vlandev->priv_flags);
+
+	seq_printf(seq, fmt64, "total frames received", stats->rx_packets);
+	seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes);
+	seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast);
+	seq_puts(seq, "\n");
+	seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets);
+	seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes);
+	seq_printf(seq, "Device: %s", dev_info->real_dev->name);
+	/* now show all PRIORITY mappings relating to this VLAN */
+	seq_printf(seq, "\nINGRESS priority mappings: "
+			"0:%u  1:%u  2:%u  3:%u  4:%u  5:%u  6:%u 7:%u\n",
+		   dev_info->ingress_priority_map[0],
+		   dev_info->ingress_priority_map[1],
+		   dev_info->ingress_priority_map[2],
+		   dev_info->ingress_priority_map[3],
+		   dev_info->ingress_priority_map[4],
+		   dev_info->ingress_priority_map[5],
+		   dev_info->ingress_priority_map[6],
+		   dev_info->ingress_priority_map[7]);
+
+	seq_puts(seq, " EGRESS priority mappings: ");
+	for (i = 0; i < ARRAY_SIZE(dev_info->egress_priority_map); i++) {
+		const struct vlan_priority_tci_mapping *mp
+			= dev_info->egress_priority_map[i];
+		while (mp) {
+			seq_printf(seq, "%u:%hu ",
+				   mp->priority, ((mp->vlan_qos >> 13) & 0x7));
+			mp = mp->next;
+		}
+	}
+	seq_puts(seq, "\n");
+
+	return 0;
+}
diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h
new file mode 100644
index 00000000..063f60a3
--- /dev/null
+++ b/net/8021q/vlanproc.h
@@ -0,0 +1,20 @@
+#ifndef __BEN_VLAN_PROC_INC__
+#define __BEN_VLAN_PROC_INC__
+
+#ifdef CONFIG_PROC_FS
+struct net;
+
+int vlan_proc_init(struct net *net);
+int vlan_proc_rem_dev(struct net_device *vlandev);
+int vlan_proc_add_dev(struct net_device *vlandev);
+void vlan_proc_cleanup(struct net *net);
+
+#else /* No CONFIG_PROC_FS */
+
+#define vlan_proc_init(net)	(0)
+#define vlan_proc_cleanup(net)	do {} while (0)
+#define vlan_proc_add_dev(dev)	({(void)(dev), 0; })
+#define vlan_proc_rem_dev(dev)	({(void)(dev), 0; })
+#endif
+
+#endif /* !(__BEN_VLAN_PROC_INC__) */
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
new file mode 100644
index 00000000..d9ea09b1
--- /dev/null
+++ b/net/9p/Kconfig
@@ -0,0 +1,36 @@
+#
+# 9P protocol configuration
+#
+
+menuconfig NET_9P
+	depends on NET
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
+
+if NET_9P
+
+config NET_9P_VIRTIO
+	depends on VIRTIO
+	tristate "9P Virtio Transport"
+	help
+	  This builds support for a transport between
+	  guest partitions and a host partition.
+
+config NET_9P_RDMA
+	depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL
+	tristate "9P RDMA Transport (Experimental)"
+	help
+	  This builds support for an RDMA transport.
+
+config NET_9P_DEBUG
+	bool "Debug information"
+	help
+	  Say Y if you want the 9P subsystem to log debug information.
+
+endif
diff --git a/net/9p/Makefile b/net/9p/Makefile
new file mode 100644
index 00000000..a0874cc1
--- /dev/null
+++ b/net/9p/Makefile
@@ -0,0 +1,18 @@
+obj-$(CONFIG_NET_9P) := 9pnet.o
+obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
+obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
+
+9pnet-objs := \
+	mod.o \
+	client.o \
+	error.o \
+	util.o \
+	protocol.o \
+	trans_fd.o \
+	trans_common.o \
+
+9pnet_virtio-objs := \
+	trans_virtio.o \
+
+9pnet_rdma-objs := \
+	trans_rdma.o \
diff --git a/net/9p/client.c b/net/9p/client.c
new file mode 100644
index 00000000..5532710f
--- /dev/null
+++ b/net/9p/client.c
@@ -0,0 +1,1958 @@
+/*
+ * net/9p/client.c
+ *
+ * 9P Client
+ *
+ *  Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <net/9p/9p.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include "protocol.h"
+
+/*
+  * Client Option Parsing (code inspired by NFS code)
+  *  - a little lazy - parse all client options
+  */
+
+enum {
+	Opt_msize,
+	Opt_trans,
+	Opt_legacy,
+	Opt_version,
+	Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_msize, "msize=%u"},
+	{Opt_legacy, "noextend"},
+	{Opt_trans, "trans=%s"},
+	{Opt_version, "version=%s"},
+	{Opt_err, NULL},
+};
+
+inline int p9_is_proto_dotl(struct p9_client *clnt)
+{
+	return clnt->proto_version == p9_proto_2000L;
+}
+EXPORT_SYMBOL(p9_is_proto_dotl);
+
+inline int p9_is_proto_dotu(struct p9_client *clnt)
+{
+	return clnt->proto_version == p9_proto_2000u;
+}
+EXPORT_SYMBOL(p9_is_proto_dotu);
+
+/* Interpret mount option for protocol version */
+static int get_protocol_version(const substring_t *name)
+{
+	int version = -EINVAL;
+
+	if (!strncmp("9p2000", name->from, name->to-name->from)) {
+		version = p9_proto_legacy;
+		P9_DPRINTK(P9_DEBUG_9P, "Protocol version: Legacy\n");
+	} else if (!strncmp("9p2000.u", name->from, name->to-name->from)) {
+		version = p9_proto_2000u;
+		P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.u\n");
+	} else if (!strncmp("9p2000.L", name->from, name->to-name->from)) {
+		version = p9_proto_2000L;
+		P9_DPRINTK(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
+	} else {
+		P9_DPRINTK(P9_DEBUG_ERROR, "Unknown protocol version %s. ",
+							name->from);
+	}
+	return version;
+}
+
+/**
+ * parse_opts - parse mount options into client structure
+ * @opts: options string passed from mount
+ * @clnt: existing v9fs client information
+ *
+ * Return 0 upon success, -ERRNO upon failure
+ */
+
+static int parse_opts(char *opts, struct p9_client *clnt)
+{
+	char *options, *tmp_options, *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option, version, ret = 0;
+
+	clnt->proto_version = p9_proto_2000u;
+	clnt->msize = 8192;
+
+	if (!opts)
+		return 0;
+
+	tmp_options = kstrdup(opts, GFP_KERNEL);
+	if (!tmp_options) {
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"failed to allocate copy of option string\n");
+		return -ENOMEM;
+	}
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if (token < Opt_trans) {
+			int r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+					"integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+		}
+		switch (token) {
+		case Opt_msize:
+			clnt->msize = option;
+			break;
+		case Opt_trans:
+			clnt->trans_mod = v9fs_get_trans_by_name(&args[0]);
+			if(clnt->trans_mod == NULL) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+				   "Could not find request transport: %s\n",
+				   args[0].from);	/* text, not the struct */
+				ret = -EINVAL;
+				goto free_and_return;
+			}
+			break;
+		case Opt_legacy:
+			clnt->proto_version = p9_proto_legacy;
+			break;
+		case Opt_version:
+			version = get_protocol_version(&args[0]);
+			if (version == -EINVAL) {
+				ret = version;
+				goto free_and_return;
+			}
+			clnt->proto_version = version;	/* ret stays a status */
+			break;
+		default:
+			continue;
+		}
+	}
+
+free_and_return:
+	kfree(tmp_options);
+	return ret;
+}
+
+/**
+ * p9_tag_alloc - lookup/allocate a request by tag
+ * @c: client session to lookup tag within
+ * @tag: numeric id for transaction
+ *
+ * this is a simple array lookup, but will grow the
+ * request_slots as necessary to accommodate transaction
+ * ids which did not previously have a slot.
+ *
+ * this code relies on the client spinlock to manage locks, its
+ * possible we should switch to something else, but I'd rather
+ * stick with something low-overhead for the common case.
+ *
+ */
+
+static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag)
+{
+	unsigned long flags;
+	int row, col;
+	struct p9_req_t *req;
+
+	/* Slots are indexed by tag + 1 in u16 arithmetic, so the largest
+	 * tag value wraps around to slot 0. */
+	tag++;
+
+	if (tag >= c->max_tag) {
+		spin_lock_irqsave(&c->lock, flags);
+		/* check again since original check was outside of lock */
+		while (tag >= c->max_tag) {
+			row = (tag / P9_ROW_MAXTAG);
+			c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
+					sizeof(struct p9_req_t), GFP_ATOMIC);
+
+			if (!c->reqs[row]) {
+				printk(KERN_ERR "Couldn't grow tag array\n");
+				spin_unlock_irqrestore(&c->lock, flags);
+				return ERR_PTR(-ENOMEM);
+			}
+			for (col = 0; col < P9_ROW_MAXTAG; col++) {
+				c->reqs[row][col].status = REQ_STATUS_IDLE;
+				c->reqs[row][col].tc = NULL;
+			}
+			c->max_tag += P9_ROW_MAXTAG;
+		}
+		spin_unlock_irqrestore(&c->lock, flags);
+	}
+	row = tag / P9_ROW_MAXTAG;
+	col = tag % P9_ROW_MAXTAG;
+
+	req = &c->reqs[row][col];
+	if (!req->tc) {
+		int alloc_msize = c->msize;
+
+		req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS);
+		if (!req->wq) {
+			printk(KERN_ERR "Couldn't grow tag array\n");
+			return ERR_PTR(-ENOMEM);
+		}
+		init_waitqueue_head(req->wq);
+		/* P9_TRANS_PREF_PAYLOAD_SEP transports get capped buffers */
+		if ((c->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) ==
+				P9_TRANS_PREF_PAYLOAD_SEP)
+			alloc_msize = min(c->msize, 4096);
+		req->tc = kmalloc(sizeof(struct p9_fcall)+alloc_msize,
+				  GFP_NOFS);
+		req->rc = kmalloc(sizeof(struct p9_fcall)+alloc_msize,
+				  GFP_NOFS);
+		/*
+		 * Check both allocations before writing through either
+		 * pointer; kfree(NULL) is a no-op, so a partial failure
+		 * is unwound the same way as a total one.
+		 */
+		if ((!req->tc) || (!req->rc)) {
+			printk(KERN_ERR "Couldn't grow tag array\n");
+			kfree(req->tc);
+			kfree(req->rc);
+			kfree(req->wq);
+			req->tc = req->rc = NULL;
+			req->wq = NULL;
+			return ERR_PTR(-ENOMEM);
+		}
+		req->tc->capacity = alloc_msize;
+		req->rc->capacity = alloc_msize;
+		req->tc->sdata = (char *) req->tc + sizeof(struct p9_fcall);
+		req->rc->sdata = (char *) req->rc + sizeof(struct p9_fcall);
+	}
+
+	p9pdu_reset(req->tc);
+	p9pdu_reset(req->rc);
+
+	req->tc->tag = tag-1;
+	req->status = REQ_STATUS_ALLOC;
+
+	return &c->reqs[row][col];
+}
+
+/**
+ * p9_tag_lookup - lookup a request by tag
+ * @c: client session to lookup tag within
+ * @tag: numeric id for transaction
+ *
+ */
+
+struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
+{
+	int row, col;
+
+	/* Slots are indexed by tag + 1 (u16 arithmetic), mirroring the
+	 * indexing p9_tag_alloc() uses when it hands out requests. */
+	tag++;
+
+	if(tag >= c->max_tag)
+		return NULL;
+
+	row = tag / P9_ROW_MAXTAG;
+	col = tag % P9_ROW_MAXTAG;
+
+	return &c->reqs[row][col];
+}
+EXPORT_SYMBOL(p9_tag_lookup);
+
+/**
+ * p9_tag_init - setup tags structure and contents
+ * @c:  v9fs client struct
+ *
+ * This initializes the tags structure for each client instance.
+ *
+ */
+
+static int p9_tag_init(struct p9_client *c)
+{
+	int err = 0;
+
+	c->tagpool = p9_idpool_create();
+	if (IS_ERR(c->tagpool)) {
+		err = PTR_ERR(c->tagpool);
+		goto error;
+	}
+	err = p9_idpool_get(c->tagpool); /* reserve tag 0 */
+	if (err < 0) {
+		p9_idpool_destroy(c->tagpool);
+		goto error;
+	}
+	c->max_tag = 0;
+error:
+	return err;
+}
+
+/**
+ * p9_tag_cleanup - cleans up tags structure and reclaims resources
+ * @c:  v9fs client struct
+ *
+ * This frees resources associated with the tags structure
+ *
+ */
+static void p9_tag_cleanup(struct p9_client *c)
+{
+	int row, col;
+
+	/* check to insure all requests are idle */
+	for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
+		for (col = 0; col < P9_ROW_MAXTAG; col++) {
+			if (c->reqs[row][col].status != REQ_STATUS_IDLE) {
+				P9_DPRINTK(P9_DEBUG_MUX,
+				  "Attempting to cleanup non-free tag %d,%d\n",
+				  row, col);
+				/* TODO: delay execution of cleanup */
+				return;
+			}
+		}
+	}
+
+	if (c->tagpool) {
+		p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
+		p9_idpool_destroy(c->tagpool);
+	}
+
+	/* free requests associated with tags */
+	for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
+		for (col = 0; col < P9_ROW_MAXTAG; col++) {
+			kfree(c->reqs[row][col].wq);
+			kfree(c->reqs[row][col].tc);
+			kfree(c->reqs[row][col].rc);
+		}
+		kfree(c->reqs[row]);
+	}
+	c->max_tag = 0;
+}
+
+/**
+ * p9_free_req - free a request and clean-up as necessary
+ * @c: client state
+ * @r: request to release
+ *
+ */
+
+static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
+{
+	int tag = r->tc->tag;
+	P9_DPRINTK(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
+
+	r->status = REQ_STATUS_IDLE;
+	if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
+		p9_idpool_put(tag, c->tagpool);
+}
+
+/**
+ * p9_client_cb - call back from transport to client
+ * @c: client state
+ * @req: request received
+ *
+ */
+void p9_client_cb(struct p9_client *c, struct p9_req_t *req)
+{
+	P9_DPRINTK(P9_DEBUG_MUX, " tag %d\n", req->tc->tag);
+	wake_up(req->wq);
+	P9_DPRINTK(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
+}
+EXPORT_SYMBOL(p9_client_cb);
+
+/**
+ * p9_parse_header - parse header arguments out of a packet
+ * @pdu: packet to parse
+ * @size: size of packet
+ * @type: type of request
+ * @tag: tag of packet
+ * @rewind: set if we need to rewind offset afterwards
+ */
+
+int
+p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
+								int rewind)
+{
+	int8_t r_type;
+	int16_t r_tag;
+	int32_t r_size;
+	int offset = pdu->offset;
+	int err;
+
+	pdu->offset = 0;
+	if (pdu->size == 0)
+		pdu->size = 7;
+
+	err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag);
+	if (err)
+		goto rewind_and_exit;
+
+	pdu->size = r_size;
+	pdu->id = r_type;
+	pdu->tag = r_tag;
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", pdu->size,
+							pdu->id, pdu->tag);
+
+	if (type)
+		*type = r_type;
+	if (tag)
+		*tag = r_tag;
+	if (size)
+		*size = r_size;
+
+
+rewind_and_exit:
+	if (rewind)
+		pdu->offset = offset;
+	return err;
+}
+EXPORT_SYMBOL(p9_parse_header);
+
+/**
+ * p9_check_errors - check 9p packet for error return and process it
+ * @c: current client instance
+ * @req: request to parse and check for error conditions
+ *
+ * returns error code if one is discovered, otherwise returns 0
+ *
+ * this will have to be more complicated if we have multiple
+ * error packet types
+ */
+
+static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
+{
+	int8_t type;
+	int err;
+	int ecode;
+
+	err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
+		return err;
+	}
+
+	if (type != P9_RERROR && type != P9_RLERROR)
+		return 0;
+
+	if (!p9_is_proto_dotl(c)) {
+		char *ename;
+
+		if (req->tc->pbuf_size) {
+			/* Handle user buffers */
+			size_t len = req->rc->size - req->rc->offset;
+			if (req->tc->pubuf) {
+				/* User Buffer */
+				err = copy_from_user(
+					&req->rc->sdata[req->rc->offset],
+					req->tc->pubuf, len);
+				if (err) {
+					err = -EFAULT;
+					goto out_err;
+				}
+			} else {
+				/* Kernel Buffer */
+				memmove(&req->rc->sdata[req->rc->offset],
+						req->tc->pkbuf, len);
+			}
+		}
+		err = p9pdu_readf(req->rc, c->proto_version, "s?d",
+				&ename, &ecode);
+		if (err)
+			goto out_err;
+
+		if (p9_is_proto_dotu(c))
+			err = -ecode;
+
+		if (!err || !IS_ERR_VALUE(err)) {
+			err = p9_errstr2errno(ename, strlen(ename));
+
+			P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode,
+					ename);
+		}
+		/* ename was filled in by p9pdu_readf(); free it on every path */
+		kfree(ename);
+	} else {
+		err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
+		if (err)
+			goto out_err;	/* ecode was not filled in */
+		err = -ecode;
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
+	}
+
+	return err;
+
+out_err:
+	P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+
+	return err;
+}
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
+
+/**
+ * p9_client_flush - flush (cancel) a request
+ * @c: client state
+ * @oldreq: request to cancel
+ *
+ * This sends a flush for a particular request and links
+ * the flush request to the original request.  The current
+ * code only supports a single flush request although the protocol
+ * allows for multiple flush requests to be sent for a single request.
+ *
+ */
+
+static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
+{
+	struct p9_req_t *req;
+	int16_t oldtag;
+	int err;
+
+	err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1);
+	if (err)
+		return err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag);
+
+	req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+
+	/* if we haven't received a response for oldreq,
+	   remove it from the list. */
+	spin_lock(&c->lock);
+	if (oldreq->status == REQ_STATUS_FLSH)
+		list_del(&oldreq->req_list);
+	spin_unlock(&c->lock);
+
+	p9_free_req(c, req);
+	return 0;
+}
+
+/**
+ * p9_client_rpc - issue a request and wait for a response
+ * @c: client session
+ * @type: type of request
+ * @fmt: protocol format string (see protocol.c)
+ *
+ * Returns request structure (which client must free using p9_free_req)
+ */
+
+static struct p9_req_t *
+p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
+{
+	va_list ap;
+	int tag, err;
+	struct p9_req_t *req;
+	unsigned long flags;
+	int sigpending;
+
+	P9_DPRINTK(P9_DEBUG_MUX, "client %p op %d\n", c, type);
+
+	/* we allow for any status other than disconnected */
+	if (c->status == Disconnected)
+		return ERR_PTR(-EIO);
+
+	/* if status is begin_disconnected we allow only clunk request */
+	if ((c->status == BeginDisconnect) && (type != P9_TCLUNK))
+		return ERR_PTR(-EIO);
+
+	if (signal_pending(current)) {
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+	} else
+		sigpending = 0;
+
+	tag = P9_NOTAG;
+	if (type != P9_TVERSION) {
+		tag = p9_idpool_get(c->tagpool);
+		if (tag < 0)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	req = p9_tag_alloc(c, tag);
+	if (IS_ERR(req))
+		return req;
+
+	/* marshall the data */
+	p9pdu_prepare(req->tc, tag, type);
+	va_start(ap, fmt);
+	err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
+	va_end(ap);
+	if (err)
+		goto reterr;
+	p9pdu_finalize(req->tc);
+
+	err = c->trans_mod->request(c, req);
+	if (err < 0) {
+		if (err != -ERESTARTSYS && err != -EFAULT)
+			c->status = Disconnected;
+		goto reterr;
+	}
+
+	P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d\n", req->wq, tag);
+	err = wait_event_interruptible(*req->wq,
+						req->status >= REQ_STATUS_RCVD);
+	P9_DPRINTK(P9_DEBUG_MUX, "wait %p tag: %d returned %d\n",
+						req->wq, tag, err);
+
+	if (req->status == REQ_STATUS_ERROR) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
+		err = req->t_err;
+	}
+
+	if ((err == -ERESTARTSYS) && (c->status == Connected)) {
+		P9_DPRINTK(P9_DEBUG_MUX, "flushing\n");
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+
+		if (c->trans_mod->cancel(c, req))
+			p9_client_flush(c, req);
+
+		/* if we received the response anyway, don't signal error */
+		if (req->status == REQ_STATUS_RCVD)
+			err = 0;
+	}
+
+	if (sigpending) {
+		spin_lock_irqsave(&current->sighand->siglock, flags);
+		recalc_sigpending();
+		spin_unlock_irqrestore(&current->sighand->siglock, flags);
+	}
+
+	if (err < 0)
+		goto reterr;
+
+	err = p9_check_errors(c, req);
+	if (!err) {
+		P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d\n", c, type);
+		return req;
+	}
+
+reterr:
+	P9_DPRINTK(P9_DEBUG_MUX, "exit: client %p op %d error: %d\n", c, type,
+									err);
+	p9_free_req(c, req);
+	return ERR_PTR(err);
+}
+
+static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+{
+	int ret;
+	struct p9_fid *fid;
+	unsigned long flags;
+
+	P9_DPRINTK(P9_DEBUG_FID, "clnt %p\n", clnt);
+	fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
+	if (!fid)
+		return ERR_PTR(-ENOMEM);
+
+	ret = p9_idpool_get(clnt->fidpool);
+	if (ret < 0) {
+		ret = -ENOSPC;
+		goto error;
+	}
+	fid->fid = ret;
+
+	memset(&fid->qid, 0, sizeof(struct p9_qid));
+	fid->mode = -1;
+	fid->uid = current_fsuid();
+	fid->clnt = clnt;
+	fid->rdir = NULL;
+	spin_lock_irqsave(&clnt->lock, flags);
+	list_add(&fid->flist, &clnt->fidlist);
+	spin_unlock_irqrestore(&clnt->lock, flags);
+
+	return fid;
+
+error:
+	kfree(fid);
+	return ERR_PTR(ret);
+}
+
+static void p9_fid_destroy(struct p9_fid *fid)
+{
+	struct p9_client *clnt;
+	unsigned long flags;
+
+	P9_DPRINTK(P9_DEBUG_FID, "fid %d\n", fid->fid);
+	clnt = fid->clnt;
+	p9_idpool_put(fid->fid, clnt->fidpool);
+	spin_lock_irqsave(&clnt->lock, flags);
+	list_del(&fid->flist);
+	spin_unlock_irqrestore(&clnt->lock, flags);
+	kfree(fid->rdir);
+	kfree(fid);
+}
+
+static int p9_client_version(struct p9_client *c)
+{
+	int err = 0;
+	struct p9_req_t *req;
+	char *version = NULL;	/* kfree()d on every exit after the RPC */
+	int msize;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
+						c->msize, c->proto_version);
+
+	switch (c->proto_version) {
+	case p9_proto_2000L:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+					c->msize, "9P2000.L");
+		break;
+	case p9_proto_2000u:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+					c->msize, "9P2000.u");
+		break;
+	case p9_proto_legacy:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+					c->msize, "9P2000");
+		break;
+	default:
+		/* unknown version: no request was issued, nothing to free */
+		return -EINVAL;
+	}
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_9P, "version error %d\n", err);
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
+	if (!strncmp(version, "9P2000.L", 8))
+		c->proto_version = p9_proto_2000L;
+	else if (!strncmp(version, "9P2000.u", 8))
+		c->proto_version = p9_proto_2000u;
+	else if (!strncmp(version, "9P2000", 6))
+		c->proto_version = p9_proto_legacy;
+	else {
+		err = -EREMOTEIO;
+		goto error;
+	}
+
+	if (msize < c->msize)
+		c->msize = msize;
+
+error:
+	kfree(version);
+	p9_free_req(c, req);
+
+	return err;
+}
+
+struct p9_client *p9_client_create(const char *dev_name, char *options)
+{
+	int err;
+	struct p9_client *clnt;
+
+	err = 0;
+	clnt = kmalloc(sizeof(struct p9_client), GFP_KERNEL);
+	if (!clnt)
+		return ERR_PTR(-ENOMEM);
+
+	clnt->trans_mod = NULL;
+	clnt->trans = NULL;
+	spin_lock_init(&clnt->lock);
+	INIT_LIST_HEAD(&clnt->fidlist);
+
+	err = p9_tag_init(clnt);
+	if (err < 0)
+		goto free_client;
+
+	err = parse_opts(options, clnt);
+	if (err < 0)
+		goto put_trans;	/* a "trans=" option may already hold a ref */
+
+	if (!clnt->trans_mod)
+		clnt->trans_mod = v9fs_get_default_trans();
+
+	if (clnt->trans_mod == NULL) {
+		err = -EPROTONOSUPPORT;
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"No transport defined or default transport\n");
+		goto destroy_tagpool;
+	}
+
+	clnt->fidpool = p9_idpool_create();
+	if (IS_ERR(clnt->fidpool)) {
+		err = PTR_ERR(clnt->fidpool);
+		goto put_trans;
+	}
+
+	P9_DPRINTK(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
+		clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
+
+	err = clnt->trans_mod->create(clnt, dev_name, options);
+	if (err)
+		goto destroy_fidpool;
+
+	if (clnt->msize > clnt->trans_mod->maxsize)
+		clnt->msize = clnt->trans_mod->maxsize;
+
+	err = p9_client_version(clnt);
+	if (err)
+		goto close_trans;
+
+	return clnt;
+
+close_trans:
+	clnt->trans_mod->close(clnt);
+destroy_fidpool:
+	p9_idpool_destroy(clnt->fidpool);
+put_trans:
+	v9fs_put_trans(clnt->trans_mod);	/* may be NULL, as in destroy */
+destroy_tagpool:
+	p9_idpool_destroy(clnt->tagpool);
+free_client:
+	kfree(clnt);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_create);
+
+/**
+ * p9_client_destroy - tear down a client and free it
+ * @clnt: client to destroy; must not be used after this call
+ *
+ * Closes the transport, drops the transport module reference, destroys any
+ * fids still on the client's fid list (logging each one), releases the fid
+ * and tag pools and finally frees @clnt itself.
+ */
+void p9_client_destroy(struct p9_client *clnt)
+{
+	struct p9_fid *fid, *fidptr;
+
+	P9_DPRINTK(P9_DEBUG_MUX, "clnt %p\n", clnt);
+
+	if (clnt->trans_mod)
+		clnt->trans_mod->close(clnt);
+
+	/* NOTE(review): called even when trans_mod is NULL - confirm that
+	 * v9fs_put_trans() tolerates a NULL argument */
+	v9fs_put_trans(clnt->trans_mod);
+
+	/* _safe iteration: p9_fid_destroy() is called on the current entry */
+	list_for_each_entry_safe(fid, fidptr, &clnt->fidlist, flist) {
+		printk(KERN_INFO "Found fid %d not clunked\n", fid->fid);
+		p9_fid_destroy(fid);
+	}
+
+	if (clnt->fidpool)
+		p9_idpool_destroy(clnt->fidpool);
+
+	p9_tag_cleanup(clnt);
+
+	kfree(clnt);
+}
+EXPORT_SYMBOL(p9_client_destroy);
+
+/**
+ * p9_client_disconnect - mark a client as disconnected
+ * @clnt: client whose status is set to Disconnected
+ *
+ * Only the status field is updated here; nothing is torn down.
+ */
+void p9_client_disconnect(struct p9_client *clnt)
+{
+	P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt);
+	clnt->status = Disconnected;
+}
+EXPORT_SYMBOL(p9_client_disconnect);
+
+/**
+ * p9_client_begin_disconnect - mark a client as starting to disconnect
+ * @clnt: client whose status is set to BeginDisconnect
+ *
+ * Only the status field is updated here; nothing is torn down.
+ */
+void p9_client_begin_disconnect(struct p9_client *clnt)
+{
+	P9_DPRINTK(P9_DEBUG_9P, "clnt %p\n", clnt);
+	clnt->status = BeginDisconnect;
+}
+EXPORT_SYMBOL(p9_client_begin_disconnect);
+
+/**
+ * p9_client_attach - send TATTACH and return the resulting fid
+ * @clnt: client to attach on
+ * @afid: optional authentication fid; P9_NOFID is sent when NULL
+ * @uname: user name sent to the server
+ * @n_uname: numeric user id sent to the server
+ * @aname: name of the tree to attach to
+ *
+ * Returns a newly created fid whose qid is filled in from the reply, or an
+ * ERR_PTR() encoded negative errno.  On failure the new fid is destroyed.
+ */
+struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
+	char *uname, u32 n_uname, char *aname)
+{
+	struct p9_qid qid;
+	struct p9_req_t *req;
+	struct p9_fid *fid;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
+					afid ? afid->fid : -1, uname, aname);
+
+	fid = p9_fid_create(clnt);
+	if (IS_ERR(fid)) {
+		/* nothing was allocated, so there is nothing to undo */
+		err = PTR_ERR(fid);
+		return ERR_PTR(err);
+	}
+
+	req = p9_client_rpc(clnt, P9_TATTACH, "ddss?d", fid->fid,
+			afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto destroy_fid;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
+					qid.type,
+					(unsigned long long)qid.path,
+					qid.version);
+
+		fid->qid = qid;
+	}
+
+	p9_free_req(clnt, req);
+	if (!err)
+		return fid;
+
+destroy_fid:
+	p9_fid_destroy(fid);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_attach);
+
+/**
+ * p9_client_walk - send TWALK for a list of path elements
+ * @oldfid: fid to start the walk from
+ * @nwname: number of entries in @wnames
+ * @wnames: path element names to walk
+ * @clone: when non-zero a new fid is created (inheriting @oldfid's uid) and
+ *	used as the walk target; when zero @oldfid itself is the target
+ *
+ * Returns the target fid with its qid updated, or an ERR_PTR() encoded
+ * negative errno.  -ENOENT is returned when the server answered with a
+ * different number of qids than names were sent.
+ */
+struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
+		char **wnames, int clone)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_fid *fid;
+	struct p9_qid *wqids;
+	struct p9_req_t *req;
+	uint16_t nwqids, count;
+
+	err = 0;
+	wqids = NULL;
+	clnt = oldfid->clnt;
+	if (clone) {
+		fid = p9_fid_create(clnt);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			fid = NULL;
+			goto error;
+		}
+
+		fid->uid = oldfid->uid;
+	} else
+		fid = oldfid;
+
+
+	/* "%u" rather than "%ud": the latter printed a stray 'd' after nwname */
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %u wname[0] %s\n",
+		oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL);
+
+	req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid,
+								nwname, wnames);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		p9_free_req(clnt, req);
+		goto clunk_fid;
+	}
+	p9_free_req(clnt, req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids);
+
+	/* a short reply means some path element could not be walked */
+	if (nwqids != nwname) {
+		err = -ENOENT;
+		goto clunk_fid;
+	}
+
+	for (count = 0; count < nwqids; count++)
+		P9_DPRINTK(P9_DEBUG_9P, "<<<     [%d] %x.%llx.%x\n",
+			count, wqids[count].type,
+			(unsigned long long)wqids[count].path,
+			wqids[count].version);
+
+	/* the fid now refers to the last element walked (or to oldfid's file) */
+	if (nwname)
+		memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
+	else
+		fid->qid = oldfid->qid;
+
+	kfree(wqids);
+	return fid;
+
+clunk_fid:
+	kfree(wqids);
+	/*
+	 * NOTE(review): when clone is 0, fid aliases oldfid, so this clunks
+	 * the caller's own fid - confirm callers expect that.
+	 */
+	p9_client_clunk(fid);
+	fid = NULL;
+
+error:
+	if (fid && (fid != oldfid))
+		p9_fid_destroy(fid);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_walk);
+
+/**
+ * p9_client_open - open a fid (TLOPEN for 9P2000.L, TOPEN otherwise)
+ * @fid: fid to open; its mode must still be -1
+ * @mode: open mode sent to the server
+ *
+ * On success the fid's mode and iounit are recorded.  Returns 0, -EINVAL if
+ * the fid's mode is not -1, or another negative errno.
+ */
+int p9_client_open(struct p9_fid *fid, int mode)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	struct p9_qid qid;
+	int iounit;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> %s fid %d mode %d\n",
+		p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
+
+	if (fid->mode != -1)
+		return -EINVAL;
+
+	req = p9_is_proto_dotl(clnt) ?
+		p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode) :
+		p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n",
+			p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN",  qid.type,
+			(unsigned long long)qid.path, qid.version, iounit);
+
+		fid->mode = mode;
+		fid->iounit = iounit;
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_open);
+
+/**
+ * p9_client_create_dotl - send TLCREATE on a fid
+ * @ofid: fid to create through; its mode must still be -1
+ * @name: name of the file to create
+ * @flags: flags sent to the server
+ * @mode: mode sent to the server
+ * @gid: group id sent to the server
+ * @qid: filled in from the reply on success
+ *
+ * On success @ofid's mode and iounit are recorded.  Returns 0, -EINVAL if
+ * @ofid's mode is not -1, or another negative errno.
+ */
+int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
+		gid_t gid, struct p9_qid *qid)
+{
+	struct p9_client *clnt = ofid->clnt;
+	struct p9_req_t *req;
+	int iounit;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P,
+			">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
+			ofid->fid, name, flags, mode, gid);
+
+	if (ofid->mode != -1)
+		return -EINVAL;
+
+	req = p9_client_rpc(clnt, P9_TLCREATE, "dsddd", ofid->fid, name, flags,
+			mode, gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n",
+				qid->type,
+				(unsigned long long)qid->path,
+				qid->version, iounit);
+
+		ofid->mode = mode;
+		ofid->iounit = iounit;
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_create_dotl);
+
+/**
+ * p9_client_fcreate - send TCREATE on a fid
+ * @fid: fid to create through; its mode must still be -1
+ * @name: name of the file to create
+ * @perm: permissions sent to the server
+ * @mode: open mode sent to the server
+ * @extension: extension string, sent as an optional ("?s") field
+ *
+ * On success the fid's mode and iounit are recorded.  Returns 0, -EINVAL if
+ * the fid's mode is not -1, or another negative errno.
+ */
+int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
+		     char *extension)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	struct p9_qid qid;
+	int iounit;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n",
+						fid->fid, name, perm, mode);
+
+	if (fid->mode != -1)
+		return -EINVAL;
+
+	req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm,
+				mode, extension);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n",
+					qid.type,
+					(unsigned long long)qid.path,
+					qid.version, iounit);
+
+		fid->mode = mode;
+		fid->iounit = iounit;
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_fcreate);
+
+/**
+ * p9_client_symlink - send TSYMLINK
+ * @dfid: fid of the directory to create the link in
+ * @name: name of the new symlink
+ * @symtgt: target the symlink points to
+ * @gid: group id sent to the server
+ * @qid: filled in from the reply on success
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid,
+		struct p9_qid *qid)
+{
+	struct p9_client *clnt = dfid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s  symtgt %s\n",
+			dfid->fid, name, symtgt);
+
+	req = p9_client_rpc(clnt, P9_TSYMLINK, "dssd", dfid->fid, name, symtgt,
+			gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n",
+			qid->type, (unsigned long long)qid->path, qid->version);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_symlink);
+
+/**
+ * p9_client_link - send TLINK
+ * @dfid: fid of the directory to create the link in
+ * @oldfid: fid of the existing file
+ * @newname: name of the new link
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
+{
+	struct p9_client *clnt = dfid->clnt;
+	struct p9_req_t *req;
+	int err = 0;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n",
+			dfid->fid, oldfid->fid, newname);
+
+	req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid,
+			newname);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RLINK\n");
+		p9_free_req(clnt, req);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(p9_client_link);
+
+/**
+ * p9_client_fsync - send TFSYNC for a fid
+ * @fid: fid to sync
+ * @datasync: datasync flag sent to the server
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_fsync(struct p9_fid *fid, int datasync)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
+			fid->fid, datasync);
+
+	req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
+
+	p9_free_req(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL(p9_client_fsync);
+
+/**
+ * p9_client_clunk - send TCLUNK and destroy the fid
+ * @fid: fid to clunk; a NULL fid is logged (with a stack dump) and ignored
+ *
+ * The fid is destroyed whether or not the RPC succeeded, so the caller must
+ * not use it afterwards.  Returns 0 on success (or for a NULL fid), or the
+ * negative errno from the RPC.
+ */
+int p9_client_clunk(struct p9_fid *fid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	if (!fid) {
+		P9_EPRINTK(KERN_WARNING, "Trying to clunk with NULL fid\n");
+		dump_stack();
+		return 0;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid);
+	err = 0;
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid);
+
+	p9_free_req(clnt, req);
+error:
+	/*
+	 * Fid is not valid even after a failed clunk
+	 */
+	p9_fid_destroy(fid);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_clunk);
+
+/**
+ * p9_client_remove - send TREMOVE and destroy the fid
+ * @fid: fid of the file to remove
+ *
+ * The fid is destroyed whether or not the RPC succeeded.  Returns 0 on
+ * success or a negative errno.
+ */
+int p9_client_remove(struct p9_fid *fid)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err = 0;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid);
+		p9_free_req(clnt, req);
+	}
+
+	/* the fid goes away on both the success and the failure path */
+	p9_fid_destroy(fid);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_remove);
+
+/**
+ * p9_client_read - send TREAD and hand the returned data to the caller
+ * @fid: fid to read from
+ * @data: kernel destination buffer, or NULL to use @udata
+ * @udata: user-space destination buffer, used when @data is NULL
+ * @offset: file offset to read from
+ * @count: maximum number of bytes wanted
+ *
+ * A single request is issued.  Its size is limited by the fid's iounit, by
+ * msize - P9_IOHDRSZ and by @count.
+ *
+ * Returns the number of bytes read (never more than was requested),
+ * -EFAULT if copying to @udata failed, or another negative errno.
+ */
+int
+p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset,
+								u32 count)
+{
+	int err, rsize;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	char *dataptr;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid,
+					(long long unsigned) offset, count);
+	err = 0;
+	clnt = fid->clnt;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+		rsize = clnt->msize - P9_IOHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	/* Don't bother zerocopy for small IO (< 1024) */
+	if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) ==
+			P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) {
+		req = p9_client_rpc(clnt, P9_TREAD, "dqE", fid->fid, offset,
+				rsize, data, udata);
+	} else {
+		req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
+				rsize);
+	}
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto free_and_error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
+
+	/*
+	 * count now comes from the server.  Never trust it beyond what was
+	 * asked for, otherwise the copies below overrun the caller's buffer.
+	 */
+	if (count > rsize) {
+		P9_EPRINTK(KERN_ERR, "bogus RREAD count (%u > %d)\n",
+				count, rsize);
+		count = rsize;
+	}
+
+	/*
+	 * Only copy when the request carried no separate payload buffer;
+	 * presumably the transport has already placed the data otherwise.
+	 */
+	if (!req->tc->pbuf_size) {
+		if (data) {
+			memmove(data, dataptr, count);
+		} else {
+			err = copy_to_user(udata, dataptr, count);
+			if (err) {
+				err = -EFAULT;
+				goto free_and_error;
+			}
+		}
+	}
+	p9_free_req(clnt, req);
+	return count;
+
+free_and_error:
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_read);
+
+/**
+ * p9_client_write - send TWRITE with data from the caller
+ * @fid: fid to write to
+ * @data: kernel source buffer, or NULL to use @udata
+ * @udata: user-space source buffer, used when @data is NULL
+ * @offset: file offset to write at
+ * @count: number of bytes the caller wants written
+ *
+ * A single request is issued.  Its size is limited by the fid's iounit, by
+ * msize - P9_IOHDRSZ and by @count.
+ *
+ * Returns the number of bytes the server reports as written (never more
+ * than was sent), or a negative errno.
+ */
+int
+p9_client_write(struct p9_fid *fid, char *data, const char __user *udata,
+							u64 offset, u32 count)
+{
+	int err, rsize;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d\n",
+				fid->fid, (long long unsigned) offset, count);
+	err = 0;
+	clnt = fid->clnt;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+		rsize = clnt->msize - P9_IOHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	/* Don't bother zerocopy for small IO (< 1024) */
+	if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) ==
+				P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) {
+		req = p9_client_rpc(clnt, P9_TWRITE, "dqE", fid->fid, offset,
+				rsize, data, udata);
+	} else {
+
+		if (data)
+			req = p9_client_rpc(clnt, P9_TWRITE, "dqD", fid->fid,
+					offset, rsize, data);
+		else
+			req = p9_client_rpc(clnt, P9_TWRITE, "dqU", fid->fid,
+					offset, rsize, udata);
+	}
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto free_and_error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
+
+	/*
+	 * count now comes from the server.  It cannot legitimately exceed
+	 * what was sent, and callers advance their position by it.
+	 */
+	if (count > rsize) {
+		P9_EPRINTK(KERN_ERR, "bogus RWRITE count (%u > %d)\n",
+				count, rsize);
+		count = rsize;
+	}
+
+	p9_free_req(clnt, req);
+	return count;
+
+free_and_error:
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_write);
+
+/**
+ * p9_client_stat - send TSTAT and return the decoded stat structure
+ * @fid: fid to stat
+ *
+ * Returns a kmalloc()ed struct p9_wstat filled in from the reply (the
+ * caller is presumably responsible for freeing it), or an ERR_PTR()
+ * encoded negative errno.
+ */
+struct p9_wstat *p9_client_stat(struct p9_fid *fid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_wstat *ret = kmalloc(sizeof(struct p9_wstat), GFP_KERNEL);
+	struct p9_req_t *req;
+	u16 ignored;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid);
+
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	err = 0;
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	/* the leading 16-bit field of the reply is read and discarded */
+	err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		p9_free_req(clnt, req);
+		goto error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P,
+		"<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+		"<<<    mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+		"<<<    name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+		"<<<    uid=%d gid=%d n_muid=%d\n",
+		ret->size, ret->type, ret->dev, ret->qid.type,
+		(unsigned long long)ret->qid.path, ret->qid.version, ret->mode,
+		ret->atime, ret->mtime, (unsigned long long)ret->length,
+		ret->name, ret->uid, ret->gid, ret->muid, ret->extension,
+		ret->n_uid, ret->n_gid, ret->n_muid);
+
+	p9_free_req(clnt, req);
+	return ret;
+
+error:
+	kfree(ret);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_stat);
+
+/**
+ * p9_client_getattr_dotl - send TGETATTR and return the decoded attributes
+ * @fid: fid to query
+ * @request_mask: attribute mask sent to the server
+ *
+ * Returns a kmalloc()ed struct p9_stat_dotl filled in from the reply (the
+ * caller is presumably responsible for freeing it), or an ERR_PTR()
+ * encoded negative errno.
+ */
+struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
+							u64 request_mask)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl),
+								GFP_KERNEL);
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
+							fid->fid, request_mask);
+
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	err = 0;
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		p9_free_req(clnt, req);
+		goto error;
+	}
+
+	/* the last line ends in "\n" like the other debug messages here */
+	P9_DPRINTK(P9_DEBUG_9P,
+		"<<< RGETATTR st_result_mask=%lld\n"
+		"<<< qid=%x.%llx.%x\n"
+		"<<< st_mode=%8.8x st_nlink=%llu\n"
+		"<<< st_uid=%d st_gid=%d\n"
+		"<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
+		"<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
+		"<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
+		"<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
+		"<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
+		"<<< st_gen=%lld st_data_version=%lld\n",
+		ret->st_result_mask, ret->qid.type, ret->qid.path,
+		ret->qid.version, ret->st_mode, ret->st_nlink, ret->st_uid,
+		ret->st_gid, ret->st_rdev, ret->st_size, ret->st_blksize,
+		ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec,
+		ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec,
+		ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec,
+		ret->st_gen, ret->st_data_version);
+
+	p9_free_req(clnt, req);
+	return ret;
+
+error:
+	kfree(ret);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_getattr_dotl);
+
+/*
+ * Compute the encoded size of a wstat structure, not counting the leading
+ * size field itself.  NULL strings contribute only their 2-byte length
+ * prefix.  The .u and .L protocol versions add the extension string and
+ * three numeric ids.
+ */
+static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
+{
+	/* type[2] dev[4] qid[13] mode[4] atime[4] mtime[4] length[8] */
+	/* plus the 2-byte length prefixes of name, uid, gid and muid */
+	int total = 2+4+13+4+4+4+8+2+2+2+2;
+
+	total += wst->name ? strlen(wst->name) : 0;
+	total += wst->uid ? strlen(wst->uid) : 0;
+	total += wst->gid ? strlen(wst->gid) : 0;
+	total += wst->muid ? strlen(wst->muid) : 0;
+
+	if (proto_version != p9_proto_2000u &&
+		proto_version != p9_proto_2000L)
+		return total;
+
+	/* extension[s] n_uid[4] n_gid[4] n_muid[4] */
+	total += 2+4+4+4;
+	total += wst->extension ? strlen(wst->extension) : 0;
+
+	return total;
+}
+
+/**
+ * p9_client_wstat - send TWSTAT for a fid
+ * @fid: fid whose metadata is written
+ * @wst: stat structure to send; its size field is recomputed here
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+
+	wst->size = p9_client_statsize(wst, clnt->proto_version);
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid);
+	P9_DPRINTK(P9_DEBUG_9P,
+		"     sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+		"     mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+		"     name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+		"     uid=%d gid=%d n_muid=%d\n",
+		wst->size, wst->type, wst->dev, wst->qid.type,
+		(unsigned long long)wst->qid.path, wst->qid.version, wst->mode,
+		wst->atime, wst->mtime, (unsigned long long)wst->length,
+		wst->name, wst->uid, wst->gid, wst->muid, wst->extension,
+		wst->n_uid, wst->n_gid, wst->n_muid);
+
+	/* the outer 16-bit count also covers the stat's own size field */
+	req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size+2, wst);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid);
+
+	p9_free_req(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL(p9_client_wstat);
+
+/**
+ * p9_client_setattr - send TSETATTR for a fid
+ * @fid: fid whose attributes are changed
+ * @p9attr: attributes to send
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid);
+	P9_DPRINTK(P9_DEBUG_9P,
+		"    valid=%x mode=%x uid=%d gid=%d size=%lld\n"
+		"    atime_sec=%lld atime_nsec=%lld\n"
+		"    mtime_sec=%lld mtime_nsec=%lld\n",
+		p9attr->valid, p9attr->mode, p9attr->uid, p9attr->gid,
+		p9attr->size, p9attr->atime_sec, p9attr->atime_nsec,
+		p9attr->mtime_sec, p9attr->mtime_nsec);
+
+	req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid);
+	p9_free_req(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL(p9_client_setattr);
+
+/**
+ * p9_client_statfs - send TSTATFS and decode the reply
+ * @fid: fid on the file system to query
+ * @sb: filled in from the reply on success
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type,
+		&sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail,
+		&sb->files, &sb->ffree, &sb->fsid, &sb->namelen);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld "
+			"blocks %llu bfree %llu bavail %llu files %llu ffree %llu "
+			"fsid %llu namelen %ld\n",
+			fid->fid, (long unsigned int)sb->type, (long int)sb->bsize,
+			sb->blocks, sb->bfree, sb->bavail, sb->files,  sb->ffree,
+			sb->fsid, (long int)sb->namelen);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_statfs);
+
+/**
+ * p9_client_rename - send TRENAME
+ * @fid: fid of the file to rename
+ * @newdirfid: fid of the destination directory
+ * @name: new name
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, char *name)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n",
+			fid->fid, newdirfid->fid, name);
+
+	req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid,
+			newdirfid->fid, name);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid);
+
+	p9_free_req(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL(p9_client_rename);
+
+/*
+ * Send TXATTRWALK and return a new fid for the attribute, storing the size
+ * reported by the server in @attr_size.
+ *
+ * An xattrwalk without @attr_name gives the fid for the listxattr namespace.
+ *
+ * Returns the new fid or an ERR_PTR() encoded negative errno.  On failure
+ * the new fid is destroyed, or clunked if the reply could not be decoded.
+ */
+struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
+				const char *attr_name, u64 *attr_size)
+{
+	struct p9_client *clnt = file_fid->clnt;
+	struct p9_fid *attr_fid;
+	struct p9_req_t *req;
+	int err;
+
+	attr_fid = p9_fid_create(clnt);
+	if (IS_ERR(attr_fid)) {
+		err = PTR_ERR(attr_fid);
+		return ERR_PTR(err);
+	}
+	P9_DPRINTK(P9_DEBUG_9P,
+		">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n",
+		file_fid->fid, attr_fid->fid, attr_name);
+
+	req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds",
+			file_fid->fid, attr_fid->fid, attr_name);
+	if (IS_ERR(req)) {
+		/* no reply came back: just drop the local fid */
+		err = PTR_ERR(req);
+		if (attr_fid != file_fid)
+			p9_fid_destroy(attr_fid);
+		return ERR_PTR(err);
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size);
+	if (err) {
+		/* undecodable reply: clunk, which also destroys the fid */
+		p9pdu_dump(1, req->rc);
+		p9_free_req(clnt, req);
+		p9_client_clunk(attr_fid);
+		return ERR_PTR(err);
+	}
+
+	p9_free_req(clnt, req);
+	P9_DPRINTK(P9_DEBUG_9P, "<<<  RXATTRWALK fid %d size %llu\n",
+		attr_fid->fid, *attr_size);
+	return attr_fid;
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrwalk);
+
+/**
+ * p9_client_xattrcreate - send TXATTRCREATE on a fid
+ * @fid: fid to operate on
+ * @name: attribute name
+ * @attr_size: attribute size sent to the server
+ * @flags: flags sent to the server
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
+			u64 attr_size, int flags)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P,
+		">>> TXATTRCREATE fid %d name  %s size %lld flag %d\n",
+		fid->fid, name, (long long)attr_size, flags);
+
+	req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd",
+			fid->fid, name, attr_size, flags);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid);
+	p9_free_req(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
+
+/**
+ * p9_client_readdir - send TREADDIR and return raw directory data
+ * @fid: directory fid to read from
+ * @data: kernel destination buffer
+ * @count: maximum number of bytes wanted
+ * @offset: directory offset to read from
+ *
+ * A single request is issued.  Its size is limited by the fid's iounit, by
+ * msize - P9_READDIRHDRSZ and by @count.
+ *
+ * Returns the number of bytes received (never more than was requested) or
+ * a negative errno.
+ */
+int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
+{
+	int err, rsize;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	char *dataptr;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n",
+				fid->fid, (long long unsigned) offset, count);
+
+	err = 0;
+	clnt = fid->clnt;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize-P9_READDIRHDRSZ)
+		rsize = clnt->msize - P9_READDIRHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	if ((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) ==
+			P9_TRANS_PREF_PAYLOAD_SEP) {
+		req = p9_client_rpc(clnt, P9_TREADDIR, "dqF", fid->fid,
+				offset, rsize, data);
+	} else {
+		req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid,
+				offset, rsize);
+	}
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto free_and_error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
+
+	/*
+	 * count now comes from the server.  Never trust it beyond what was
+	 * asked for, otherwise the copy below overruns the caller's buffer.
+	 */
+	if (count > rsize) {
+		P9_EPRINTK(KERN_ERR, "bogus RREADDIR count (%u > %d)\n",
+				count, rsize);
+		count = rsize;
+	}
+
+	/*
+	 * Only copy when the request carried no separate payload buffer;
+	 * presumably the transport has already placed the data otherwise.
+	 */
+	if (!req->tc->pbuf_size && data)
+		memmove(data, dataptr, count);
+
+	p9_free_req(clnt, req);
+	return count;
+
+free_and_error:
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readdir);
+
+/**
+ * p9_client_mknod_dotl - send TMKNOD
+ * @fid: fid of the directory to create the node in
+ * @name: name of the new node
+ * @mode: mode sent to the server
+ * @rdev: device number; sent as separate major and minor values
+ * @gid: group id sent to the server
+ * @qid: filled in from the reply on success
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode,
+			dev_t rdev, gid_t gid, struct p9_qid *qid)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d "
+		"minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
+	req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddd", fid->fid, name, mode,
+		MAJOR(rdev), MINOR(rdev), gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type,
+				(unsigned long long)qid->path, qid->version);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_mknod_dotl);
+
+/**
+ * p9_client_mkdir_dotl - send TMKDIR
+ * @fid: fid of the parent directory
+ * @name: name of the new directory
+ * @mode: mode sent to the server
+ * @gid: group id sent to the server
+ * @qid: filled in from the reply on success
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
+				gid_t gid, struct p9_qid *qid)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n",
+		 fid->fid, name, mode, gid);
+	req = p9_client_rpc(clnt, P9_TMKDIR, "dsdd", fid->fid, name, mode,
+		gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type,
+				(unsigned long long)qid->path, qid->version);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_mkdir_dotl);
+
+/**
+ * p9_client_lock_dotl - send TLOCK
+ * @fid: fid to lock
+ * @flock: lock description sent to the server
+ * @status: receives the status byte from the reply on success
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d "
+			"start %lld length %lld proc_id %d client_id %s\n",
+			fid->fid, flock->type, flock->flags, flock->start,
+			flock->length, flock->proc_id, flock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
+				flock->flags, flock->start, flock->length,
+					flock->proc_id, flock->client_id);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "b", status);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_lock_dotl);
+
+/**
+ * p9_client_getlock_dotl - send TGETLOCK
+ * @fid: fid to query
+ * @glock: lock description; sent to the server and overwritten with the
+ *	values from the reply (including the client_id pointer)
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld "
+		"length %lld proc_id %d client_id %s\n", fid->fid, glock->type,
+		glock->start, glock->length, glock->proc_id, glock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid,  glock->type,
+		glock->start, glock->length, glock->proc_id, glock->client_id);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type,
+			&glock->start, &glock->length, &glock->proc_id,
+			&glock->client_id);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld "
+			"proc_id %d client_id %s\n", glock->type, glock->start,
+			glock->length, glock->proc_id, glock->client_id);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_getlock_dotl);
+
+/**
+ * p9_client_readlink - send TREADLINK
+ * @fid: fid of the symlink
+ * @target: receives the target string decoded from the reply on success
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int p9_client_readlink(struct p9_fid *fid, char **target)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int err;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "s", target);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+	} else {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
+	}
+
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readlink);
diff --git a/net/9p/error.c b/net/9p/error.c
new file mode 100644
index 00000000..52518512
--- /dev/null
+++ b/net/9p/error.c
@@ -0,0 +1,247 @@
+/*
+ * net/9p/error.c
+ *
+ * Error string handling
+ *
+ * Plan 9 uses error strings, Unix uses error numbers.  These functions
+ * try to help manage that and provide for dynamically adding error
+ * mappings.
+ *
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/errno.h>
+#include <net/9p/9p.h>
+
+/**
+ * struct errormap - map string errors from Plan 9 to Linux numeric ids
+ * @name: string sent over 9P
+ * @val: numeric id most closely representing @name (a positive errno value;
+ *	p9_errstr2errno() negates it before returning)
+ * @namelen: length of @name, computed by p9_error_init()
+ * @list: hash-table list for string lookup
+ */
+struct errormap {
+	char *name;
+	int val;
+
+	int namelen;
+	struct hlist_node list;
+};
+
+/* number of buckets in, and storage for, the error-string hash table */
+#define ERRHASHSZ		32
+static struct hlist_head hash_errmap[ERRHASHSZ];
+
+/* FixMe - reduce to a reasonable size */
+/*
+ * Static table of known error strings.  Each entry gives only the string
+ * and its errno value; namelen and list are filled in by p9_error_init().
+ * The table is terminated by an entry with a NULL name.
+ */
+static struct errormap errmap[] = {
+	{"Operation not permitted", EPERM},
+	{"wstat prohibited", EPERM},
+	{"No such file or directory", ENOENT},
+	{"directory entry not found", ENOENT},
+	{"file not found", ENOENT},
+	{"Interrupted system call", EINTR},
+	{"Input/output error", EIO},
+	{"No such device or address", ENXIO},
+	{"Argument list too long", E2BIG},
+	{"Bad file descriptor", EBADF},
+	{"Resource temporarily unavailable", EAGAIN},
+	{"Cannot allocate memory", ENOMEM},
+	{"Permission denied", EACCES},
+	{"Bad address", EFAULT},
+	{"Block device required", ENOTBLK},
+	{"Device or resource busy", EBUSY},
+	{"File exists", EEXIST},
+	{"Invalid cross-device link", EXDEV},
+	{"No such device", ENODEV},
+	{"Not a directory", ENOTDIR},
+	{"Is a directory", EISDIR},
+	{"Invalid argument", EINVAL},
+	{"Too many open files in system", ENFILE},
+	{"Too many open files", EMFILE},
+	{"Text file busy", ETXTBSY},
+	{"File too large", EFBIG},
+	{"No space left on device", ENOSPC},
+	{"Illegal seek", ESPIPE},
+	{"Read-only file system", EROFS},
+	{"Too many links", EMLINK},
+	{"Broken pipe", EPIPE},
+	{"Numerical argument out of domain", EDOM},
+	{"Numerical result out of range", ERANGE},
+	{"Resource deadlock avoided", EDEADLK},
+	{"File name too long", ENAMETOOLONG},
+	{"No locks available", ENOLCK},
+	{"Function not implemented", ENOSYS},
+	{"Directory not empty", ENOTEMPTY},
+	{"Too many levels of symbolic links", ELOOP},
+	{"No message of desired type", ENOMSG},
+	{"Identifier removed", EIDRM},
+	{"No data available", ENODATA},
+	{"Machine is not on the network", ENONET},
+	{"Package not installed", ENOPKG},
+	{"Object is remote", EREMOTE},
+	{"Link has been severed", ENOLINK},
+	{"Communication error on send", ECOMM},
+	{"Protocol error", EPROTO},
+	{"Bad message", EBADMSG},
+	{"File descriptor in bad state", EBADFD},
+	{"Streams pipe error", ESTRPIPE},
+	{"Too many users", EUSERS},
+	{"Socket operation on non-socket", ENOTSOCK},
+	{"Message too long", EMSGSIZE},
+	{"Protocol not available", ENOPROTOOPT},
+	{"Protocol not supported", EPROTONOSUPPORT},
+	{"Socket type not supported", ESOCKTNOSUPPORT},
+	{"Operation not supported", EOPNOTSUPP},
+	{"Protocol family not supported", EPFNOSUPPORT},
+	{"Network is down", ENETDOWN},
+	{"Network is unreachable", ENETUNREACH},
+	{"Network dropped connection on reset", ENETRESET},
+	{"Software caused connection abort", ECONNABORTED},
+	{"Connection reset by peer", ECONNRESET},
+	{"No buffer space available", ENOBUFS},
+	{"Transport endpoint is already connected", EISCONN},
+	{"Transport endpoint is not connected", ENOTCONN},
+	{"Cannot send after transport endpoint shutdown", ESHUTDOWN},
+	{"Connection timed out", ETIMEDOUT},
+	{"Connection refused", ECONNREFUSED},
+	{"Host is down", EHOSTDOWN},
+	{"No route to host", EHOSTUNREACH},
+	{"Operation already in progress", EALREADY},
+	{"Operation now in progress", EINPROGRESS},
+	{"Is a named type file", EISNAM},
+	{"Remote I/O error", EREMOTEIO},
+	{"Disk quota exceeded", EDQUOT},
+/* errors from fossil, vacfs, and u9fs */
+	{"fid unknown or out of range", EBADF},
+	{"permission denied", EACCES},
+	{"file does not exist", ENOENT},
+	{"authentication failed", ECONNREFUSED},
+	{"bad offset in directory read", ESPIPE},
+	{"bad use of fid", EBADF},
+	{"wstat can't convert between files and directories", EPERM},
+	{"directory is not empty", ENOTEMPTY},
+	{"file exists", EEXIST},
+	{"file already exists", EEXIST},
+	{"file or directory already exists", EEXIST},
+	{"fid already in use", EBADF},
+	{"file in use", ETXTBSY},
+	{"i/o error", EIO},
+	{"file already open for I/O", ETXTBSY},
+	{"illegal mode", EINVAL},
+	{"illegal name", ENAMETOOLONG},
+	{"not a directory", ENOTDIR},
+	{"not a member of proposed group", EPERM},
+	{"not owner", EACCES},
+	{"only owner can change group in wstat", EACCES},
+	{"read only file system", EROFS},
+	{"no access to special file", EPERM},
+	{"i/o count too large", EIO},
+	{"unknown group", EINVAL},
+	{"unknown user", EINVAL},
+	{"bogus wstat buffer", EPROTO},
+	{"exclusive use file already open", EAGAIN},
+	{"corrupted directory entry", EIO},
+	{"corrupted file entry", EIO},
+	{"corrupted block label", EIO},
+	{"corrupted meta data", EIO},
+	{"illegal offset", EINVAL},
+	{"illegal path element", ENOENT},
+	{"root of file system is corrupted", EIO},
+	{"corrupted super block", EIO},
+	{"protocol botch", EPROTO},
+	{"file system is full", ENOSPC},
+	{"file is in use", EAGAIN},
+	{"directory entry is not allocated", ENOENT},
+	{"file is read only", EROFS},
+	{"file has been removed", EIDRM},
+	{"only support truncation to zero length", EPERM},
+	{"cannot remove root", EPERM},
+	{"file too big", EFBIG},
+	{"venti i/o error", EIO},
+	/* these are not errors */
+	/* NOTE(review): p9_errstr2errno() treats a 0 result as "not found",
+	 * so these two strings are still reported as unknown - confirm intent */
+	{"u9fs rhostsauth: no authentication required", 0},
+	{"u9fs authnone: no authentication required", 0},
+	{NULL, -1}
+};
+
+/**
+ * p9_error_init - preload mappings into hash list
+ *
+ * Resets every bucket of the hash table, then hashes each errmap entry by
+ * its name and links it into the matching bucket.  Always returns 1.
+ */
+
+int p9_error_init(void)
+{
+	struct errormap *entry;
+	struct hlist_head *head;
+	int i;
+
+	/* start from an empty table */
+	for (i = 0; i < ERRHASHSZ; i++)
+		INIT_HLIST_HEAD(&hash_errmap[i]);
+
+	/* walk the static map up to its NULL-name terminator */
+	for (entry = &errmap[0]; entry->name; entry++) {
+		entry->namelen = strlen(entry->name);
+		head = &hash_errmap[jhash(entry->name, entry->namelen, 0) %
+								ERRHASHSZ];
+		INIT_HLIST_NODE(&entry->list);
+		hlist_add_head(&entry->list, head);
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(p9_error_init);
+
+/**
+ * p9_errstr2errno - convert error string to error number
+ * @errstr: error string; need not be NUL-terminated, only @len bytes are
+ *	examined and the buffer is not modified
+ * @len: length of error string
+ *
+ * Looks @errstr up in the hash table built by p9_error_init().  Returns the
+ * negated errno of the matching entry, or -ESERVERFAULT (after logging the
+ * string) when no entry matches or the matching entry maps to 0.
+ */
+
+int p9_errstr2errno(char *errstr, int len)
+{
+	int errno;
+	struct hlist_node *p;
+	struct errormap *c;
+	int bucket;
+
+	errno = 0;
+	p = NULL;
+	c = NULL;
+	bucket = jhash(errstr, len, 0) % ERRHASHSZ;
+	hlist_for_each_entry(c, p, &hash_errmap[bucket], list) {
+		if (c->namelen == len && !memcmp(c->name, errstr, len)) {
+			errno = c->val;
+			break;
+		}
+	}
+
+	if (errno == 0) {
+		/* TODO: if error isn't found, add it dynamically */
+		/*
+		 * Print with an explicit length instead of storing a NUL at
+		 * errstr[len]: that byte lies outside the @len bytes the
+		 * caller described.
+		 */
+		printk(KERN_ERR "%s: server reported unknown error %.*s\n",
+			__func__, len, errstr);
+		errno = ESERVERFAULT;
+	}
+
+	return -errno;
+}
+EXPORT_SYMBOL(p9_errstr2errno);
diff --git a/net/9p/mod.c b/net/9p/mod.c
new file mode 100644
index 00000000..72c39827
--- /dev/null
+++ b/net/9p/mod.c
@@ -0,0 +1,174 @@
+/*
+ *  net/9p/mod.c
+ *
+ *  9P entry point
+ *
+ *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <net/9p/9p.h>
+#include <linux/fs.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_NET_9P_DEBUG
+unsigned int p9_debug_level = 0;	/* mask of P9_DEBUG_* bits to log */
+EXPORT_SYMBOL(p9_debug_level);
+module_param_named(debug, p9_debug_level, uint, 0);
+MODULE_PARM_DESC(debug, "9P debugging level");
+#endif
+
+/*
+ * Dynamic Transport Registration Routines
+ *
+ * v9fs_trans_lock guards every access to v9fs_trans_list.
+ */
+static DEFINE_SPINLOCK(v9fs_trans_lock);
+static LIST_HEAD(v9fs_trans_list);
+
+/**
+ * v9fs_register_trans - register a new transport with 9p
+ * @m: structure describing the transport module and entry points;
+ *     it is linked onto the tail of v9fs_trans_list under v9fs_trans_lock
+ */
+void v9fs_register_trans(struct p9_trans_module *m)
+{
+	spin_lock(&v9fs_trans_lock);
+	list_add_tail(&m->list, &v9fs_trans_list);
+	spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_register_trans);
+
+/**
+ * v9fs_unregister_trans - unregister a 9p transport
+ * @m: the transport to remove; list_del_init() unlinks @m->list under
+ *     v9fs_trans_lock and leaves it as an empty, reinitialised node
+ */
+void v9fs_unregister_trans(struct p9_trans_module *m)
+{
+	spin_lock(&v9fs_trans_lock);
+	list_del_init(&m->list);
+	spin_unlock(&v9fs_trans_lock);
+}
+EXPORT_SYMBOL(v9fs_unregister_trans);
+
+/**
+ * v9fs_get_trans_by_name - get transport whose name is exactly @name
+ * @name: string identifying transport (whole name, not merely a prefix)
+ * Return: the transport with a reference on its module, or NULL
+ */
+struct p9_trans_module *v9fs_get_trans_by_name(const substring_t *name)
+{
+	struct p9_trans_module *t, *found = NULL;
+	size_t len = name->to - name->from;
+
+	spin_lock(&v9fs_trans_lock);
+	list_for_each_entry(t, &v9fs_trans_list, list)
+		if (strlen(t->name) == len &&
+		    strncmp(t->name, name->from, len) == 0 &&
+		    try_module_get(t->owner)) {
+			found = t;
+			break;
+		}
+	spin_unlock(&v9fs_trans_lock);
+	return found;
+}
+EXPORT_SYMBOL(v9fs_get_trans_by_name);
+
+/**
+ * v9fs_get_default_trans - get the default transport
+ *
+ * Prefers a ->def transport, else the first whose module can be pinned.
+ */
+struct p9_trans_module *v9fs_get_default_trans(void)
+{
+	struct p9_trans_module *pos, *chosen = NULL;
+
+	spin_lock(&v9fs_trans_lock);
+	list_for_each_entry(pos, &v9fs_trans_list, list) {
+		if (!pos->def || !try_module_get(pos->owner))
+			continue;
+		chosen = pos;
+		goto unlock;
+	}
+
+	list_for_each_entry(pos, &v9fs_trans_list, list) {
+		if (!try_module_get(pos->owner))
+			continue;
+		chosen = pos;
+		break;
+	}
+unlock:
+	spin_unlock(&v9fs_trans_lock);
+	return chosen;
+}
+EXPORT_SYMBOL(v9fs_get_default_trans);
+
+/**
+ * v9fs_put_trans - drop the module reference held on a transport
+ * @m: transport to put; NULL is accepted and ignored
+ */
+void v9fs_put_trans(struct p9_trans_module *m)
+{
+	if (!m)
+		return;
+	module_put(m->owner);
+}
+
+/**
+ * init_p9 - Initialize module
+ *
+ * Builds the error-string hash, logs a banner and runs p9_trans_fd_init().
+ * Neither call's outcome is checked, so this always returns 0.
+ */
+static int __init init_p9(void)
+{
+	p9_error_init();
+	printk(KERN_INFO "Installing 9P2000 support\n");
+	p9_trans_fd_init();
+
+	return 0;
+}
+
+/**
+ * exit_p9 - shutdown module
+ *
+ * Logs a banner, then runs p9_trans_fd_exit().
+ */
+static void __exit exit_p9(void)
+{
+	printk(KERN_INFO "Unloading 9P2000 support\n");
+
+	p9_trans_fd_exit();
+}
+
+module_init(init_p9)
+module_exit(exit_p9)
+
+MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
new file mode 100644
index 00000000..a873277c
--- /dev/null
+++ b/net/9p/protocol.c
@@ -0,0 +1,682 @@
+/*
+ * net/9p/protocol.c
+ *
+ * 9P Protocol Support Code
+ *
+ *  Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ *  Copyright (C) 2008 by IBM, Corp.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include "protocol.h"
+
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+
+#ifdef CONFIG_NET_9P_DEBUG
+/* Hex-dump @pdu via P9_DPRINTK(); scnprintf() truncates what overflows buf. */
+void
+p9pdu_dump(int way, struct p9_fcall *pdu)
+{
+	char buf[255];
+	int buflen = 255;
+	u8 *data = pdu->sdata;
+	int datalen = pdu->size;
+	int i, n = 0;
+
+	if (datalen > (buflen-16))
+		datalen = buflen-16;
+	for (i = 0; i < datalen; i++) {
+		n += scnprintf(buf + n, buflen - n, "%02x ", data[i]);
+		if (i%4 == 3)
+			n += scnprintf(buf + n, buflen - n, " ");
+		if (i%32 == 31)
+			n += scnprintf(buf + n, buflen - n, "\n");
+	}
+	n += scnprintf(buf + n, buflen - n, "\n");
+
+	/* @way only selects the prefix: "[[[" if nonzero, "]]]" if zero */
+	if (way)
+		P9_DPRINTK(P9_DEBUG_PKT, "[[[(%d) %s\n", datalen, buf);
+	else
+		P9_DPRINTK(P9_DEBUG_PKT, "]]](%d) %s\n", datalen, buf);
+}
+#else
+/* Debugging compiled out: an empty stub keeps the export below valid. */
+void
+p9pdu_dump(int way, struct p9_fcall *pdu)
+{
+}
+#endif
+EXPORT_SYMBOL(p9pdu_dump);
+/* Free the strings owned by @stbuf; @stbuf itself is not freed. */
+void p9stat_free(struct p9_wstat *stbuf)
+{
+	kfree(stbuf->name);
+	kfree(stbuf->uid);
+	kfree(stbuf->gid);
+	kfree(stbuf->muid);
+	kfree(stbuf->extension);
+}
+EXPORT_SYMBOL(p9stat_free);
+/* Read up to @size bytes at pdu->offset; returns the shortfall (0 = all). */
+static size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size)
+{
+	size_t len = min(pdu->size - pdu->offset, size);
+	memcpy(data, &pdu->sdata[pdu->offset], len);
+	pdu->offset += len;
+	return size - len;
+}
+/* Append up to @size bytes at pdu->size; returns the shortfall (0 = all). */
+static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size)
+{
+	size_t len = min(pdu->capacity - pdu->size, size);
+	memcpy(&pdu->sdata[pdu->size], data, len);
+	pdu->size += len;
+	return size - len;
+}
+
+/* As pdu_write() but from user memory; a faulting copy appends nothing. */
+static size_t
+pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size)
+{
+	size_t len = min(pdu->capacity - pdu->size, size);
+	if (copy_from_user(&pdu->sdata[pdu->size], udata, len))
+		len = 0;
+	pdu->size += len;
+	return size - len;
+}
+
+/* Record the user/kernel payload pointers and size in @pdu; no copy. */
+static size_t pdu_write_urw(struct p9_fcall *pdu, const char *kdata,
+			    const char __user *udata, size_t size)
+{
+	BUG_ON(pdu->size > P9_IOHDRSZ);
+	pdu->pubuf = (char __user *)udata;
+	pdu->pkbuf = (char *)kdata;
+	pdu->pbuf_size = size;
+	return 0;
+}
+/* Record a kernel buffer in @pdu (pkbuf/pbuf_size); nothing is copied. */
+static size_t
+pdu_write_readdir(struct p9_fcall *pdu, const char *kdata, size_t size)
+{
+	BUG_ON(pdu->size > P9_READDIRHDRSZ);
+	pdu->pkbuf = (char *)kdata;
+	pdu->pbuf_size = size;
+	return 0;
+}
+
+/*
+	b - int8_t
+	w - int16_t
+	d - int32_t
+	q - int64_t
+	s - string
+	S - stat
+	Q - qid
+	D - data blob (int32_t size followed by void *, results are not freed)
+	T - array of strings (int16_t count, followed by strings)
+	R - array of qids (int16_t count, followed by qids)
+	A - stat for 9p2000.L (p9_stat_dotl)
+	? - stop parsing here unless the protocol is 9P2000.u or 9P2000.L
+*/
+
+/*
+ * Decode fields from @pdu as directed by @fmt (codes above), storing through
+ * the pointers in @ap.  Returns 0, or the first failure's negative errno.
+ */
+static int
+p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
+	va_list ap)
+{
+	const char *ptr;
+	int errcode = 0;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':{
+				int8_t *val = va_arg(ap, int8_t *);
+				if (pdu_read(pdu, val, sizeof(*val))) {
+					errcode = -EFAULT;
+					break;
+				}
+			}
+			break;
+		case 'w':{
+				int16_t *val = va_arg(ap, int16_t *);
+				__le16 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*val = le16_to_cpu(le_val);
+			}
+			break;
+		case 'd':{
+				int32_t *val = va_arg(ap, int32_t *);
+				__le32 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*val = le32_to_cpu(le_val);
+			}
+			break;
+		case 'q':{
+				int64_t *val = va_arg(ap, int64_t *);
+				__le64 le_val;
+				if (pdu_read(pdu, &le_val, sizeof(le_val))) {
+					errcode = -EFAULT;
+					break;
+				}
+				*val = le64_to_cpu(le_val);
+			}
+			break;
+		case 's':{
+				char **sptr = va_arg(ap, char **);
+				uint16_t len;
+
+				errcode = p9pdu_readf(pdu, proto_version,
+								"w", &len);
+				if (errcode)
+					break;
+
+				*sptr = kmalloc(len + 1, GFP_NOFS);
+				if (*sptr == NULL) {
+					errcode = -ENOMEM;
+					break;
+				}
+				if (pdu_read(pdu, *sptr, len)) {
+					errcode = -EFAULT;
+					kfree(*sptr);
+					*sptr = NULL;
+				} else
+					(*sptr)[len] = 0;
+			}
+			break;
+		case 'Q':{
+				struct p9_qid *qid =
+				    va_arg(ap, struct p9_qid *);
+
+				errcode = p9pdu_readf(pdu, proto_version, "bdq",
+						      &qid->type, &qid->version,
+						      &qid->path);
+			}
+			break;
+		case 'S':{
+				struct p9_wstat *stbuf =
+				    va_arg(ap, struct p9_wstat *);
+
+				memset(stbuf, 0, sizeof(struct p9_wstat));
+				stbuf->n_uid = stbuf->n_gid = stbuf->n_muid =
+									-1;
+				errcode = p9pdu_readf(pdu, proto_version,
+						"wwdQdddqssss?sddd",
+						&stbuf->size, &stbuf->type,
+						&stbuf->dev, &stbuf->qid,
+						&stbuf->mode, &stbuf->atime,
+						&stbuf->mtime, &stbuf->length,
+						&stbuf->name, &stbuf->uid,
+						&stbuf->gid, &stbuf->muid,
+						&stbuf->extension,
+						&stbuf->n_uid, &stbuf->n_gid,
+						&stbuf->n_muid);
+				if (errcode)
+					p9stat_free(stbuf);
+			}
+			break;
+		case 'D':{
+				uint32_t *count = va_arg(ap, uint32_t *);
+				void **data = va_arg(ap, void **);
+
+				errcode =
+				    p9pdu_readf(pdu, proto_version, "d", count);
+				if (!errcode) {
+					*count = min_t(uint32_t, *count,
+						       pdu->size - pdu->offset);
+					*data = &pdu->sdata[pdu->offset];
+				}
+			}
+			break;
+		case 'T':{
+				uint16_t *nwname = va_arg(ap, uint16_t *);
+				char ***wnames = va_arg(ap, char ***);
+
+				/* the cleanup below must never see garbage */
+				*wnames = NULL;
+
+				errcode = p9pdu_readf(pdu, proto_version,
+								"w", nwname);
+				if (!errcode) {
+					/* zeroed: unread slots stay NULL */
+					*wnames = kcalloc(*nwname,
+							  sizeof(char *),
+							  GFP_NOFS);
+					if (!*wnames)
+						errcode = -ENOMEM;
+				}
+
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < *nwname; i++) {
+						errcode = p9pdu_readf(pdu,
+							proto_version, "s",
+							&(*wnames)[i]);
+						if (errcode)
+							break;
+					}
+				}
+
+				if (errcode) {
+					if (*wnames) {
+						int i;
+
+						for (i = 0; i < *nwname; i++)
+							kfree((*wnames)[i]);
+					}
+					kfree(*wnames);
+					*wnames = NULL;
+				}
+			}
+			break;
+		case 'R':{
+				int16_t *nwqid = va_arg(ap, int16_t *);
+				struct p9_qid **wqids =
+				    va_arg(ap, struct p9_qid **);
+
+				*wqids = NULL;
+
+				errcode =
+				    p9pdu_readf(pdu, proto_version, "w", nwqid);
+				if (!errcode) {
+					*wqids = kmalloc(*nwqid *
+							 sizeof(struct p9_qid),
+							 GFP_NOFS);
+					if (*wqids == NULL)
+						errcode = -ENOMEM;
+				}
+
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < *nwqid; i++) {
+						errcode = p9pdu_readf(pdu,
+							proto_version, "Q",
+							&(*wqids)[i]);
+						if (errcode)
+							break;
+					}
+				}
+
+				if (errcode) {
+					kfree(*wqids);
+					*wqids = NULL;
+				}
+			}
+			break;
+		case 'A': {
+				struct p9_stat_dotl *stbuf =
+				    va_arg(ap, struct p9_stat_dotl *);
+
+				memset(stbuf, 0, sizeof(struct p9_stat_dotl));
+				errcode = p9pdu_readf(pdu, proto_version,
+					"qQdddqqqqqqqqqqqqqqq",
+					&stbuf->st_result_mask,
+					&stbuf->qid,
+					&stbuf->st_mode,
+					&stbuf->st_uid, &stbuf->st_gid,
+					&stbuf->st_nlink,
+					&stbuf->st_rdev, &stbuf->st_size,
+					&stbuf->st_blksize, &stbuf->st_blocks,
+					&stbuf->st_atime_sec,
+					&stbuf->st_atime_nsec,
+					&stbuf->st_mtime_sec,
+					&stbuf->st_mtime_nsec,
+					&stbuf->st_ctime_sec,
+					&stbuf->st_ctime_nsec,
+					&stbuf->st_btime_sec,
+					&stbuf->st_btime_nsec,
+					&stbuf->st_gen,
+					&stbuf->st_data_version);
+			}
+			break;
+		case '?':
+			if ((proto_version != p9_proto_2000u) &&
+				(proto_version != p9_proto_2000L))
+				return 0;
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		if (errcode)
+			break;
+	}
+
+	return errcode;
+}
+
+/*
+ * Encode the values in @ap into @pdu as directed by @fmt, appending at
+ * pdu->size.  Returns 0, or the first failure's negative errno.
+ */
+int
+p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+	va_list ap)
+{
+	const char *ptr;
+	int errcode = 0;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':{
+				int8_t val = va_arg(ap, int);
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'w':{
+				__le16 val = cpu_to_le16(va_arg(ap, int));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'd':{
+				__le32 val = cpu_to_le32(va_arg(ap, int32_t));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'q':{
+				__le64 val = cpu_to_le64(va_arg(ap, int64_t));
+				if (pdu_write(pdu, &val, sizeof(val)))
+					errcode = -EFAULT;
+			}
+			break;
+		case 's':{
+				const char *sptr = va_arg(ap, const char *);
+				uint16_t len = 0;
+				/* min in size_t; a u16 cast would wrap first */
+				if (sptr)
+					len = min_t(size_t, strlen(sptr),
+								USHRT_MAX);
+
+				errcode = p9pdu_writef(pdu, proto_version,
+								"w", len);
+				if (!errcode && pdu_write(pdu, sptr, len))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'Q':{
+				const struct p9_qid *qid =
+				    va_arg(ap, const struct p9_qid *);
+				errcode =
+				    p9pdu_writef(pdu, proto_version, "bdq",
+						 qid->type, qid->version,
+						 qid->path);
+			} break;
+		case 'S':{
+				const struct p9_wstat *stbuf =
+				    va_arg(ap, const struct p9_wstat *);
+				errcode = p9pdu_writef(pdu, proto_version,
+						 "wwdQdddqssss?sddd",
+						 stbuf->size, stbuf->type,
+						 stbuf->dev, &stbuf->qid,
+						 stbuf->mode, stbuf->atime,
+						 stbuf->mtime, stbuf->length,
+						 stbuf->name, stbuf->uid,
+						 stbuf->gid, stbuf->muid,
+						 stbuf->extension, stbuf->n_uid,
+						 stbuf->n_gid, stbuf->n_muid);
+			} break;
+		case 'D':{
+				uint32_t count = va_arg(ap, uint32_t);
+				const void *data = va_arg(ap, const void *);
+
+				errcode = p9pdu_writef(pdu, proto_version, "d",
+									count);
+				if (!errcode && pdu_write(pdu, data, count))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'E':{
+				int32_t cnt = va_arg(ap, int32_t);
+				const char *k = va_arg(ap, const void *);
+				const char __user *u = va_arg(ap,
+							const void __user *);
+				errcode = p9pdu_writef(pdu, proto_version, "d",
+						 cnt);
+				if (!errcode && pdu_write_urw(pdu, k, u, cnt))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'F':{
+				int32_t cnt = va_arg(ap, int32_t);
+				const char *k = va_arg(ap, const void *);
+				errcode = p9pdu_writef(pdu, proto_version, "d",
+						 cnt);
+				if (!errcode && pdu_write_readdir(pdu, k, cnt))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'U':{
+				int32_t count = va_arg(ap, int32_t);
+				const char __user *udata =
+						va_arg(ap, const void __user *);
+				errcode = p9pdu_writef(pdu, proto_version, "d",
+									count);
+				if (!errcode && pdu_write_u(pdu, udata, count))
+					errcode = -EFAULT;
+			}
+			break;
+		case 'T':{
+				uint16_t nwname = va_arg(ap, int);
+				const char **wnames = va_arg(ap, const char **);
+
+				errcode = p9pdu_writef(pdu, proto_version, "w",
+									nwname);
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < nwname; i++) {
+						errcode = p9pdu_writef(pdu,
+							proto_version, "s",
+							wnames[i]);
+						if (errcode)
+							break;
+					}
+				}
+			}
+			break;
+		case 'R':{
+				int16_t nwqid = va_arg(ap, int);
+				struct p9_qid *wqids =
+				    va_arg(ap, struct p9_qid *);
+
+				errcode = p9pdu_writef(pdu, proto_version, "w",
+									nwqid);
+				if (!errcode) {
+					int i;
+
+					for (i = 0; i < nwqid; i++) {
+						errcode = p9pdu_writef(pdu,
+							proto_version, "Q",
+							&wqids[i]);
+						if (errcode)
+							break;
+					}
+				}
+			}
+			break;
+		case 'I':{
+				struct p9_iattr_dotl *p9attr = va_arg(ap,
+							struct p9_iattr_dotl *);
+
+				errcode = p9pdu_writef(pdu, proto_version,
+							"ddddqqqqq",
+							p9attr->valid,
+							p9attr->mode,
+							p9attr->uid,
+							p9attr->gid,
+							p9attr->size,
+							p9attr->atime_sec,
+							p9attr->atime_nsec,
+							p9attr->mtime_sec,
+							p9attr->mtime_nsec);
+			}
+			break;
+		case '?':
+			if ((proto_version != p9_proto_2000u) &&
+				(proto_version != p9_proto_2000L))
+				return 0;
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		if (errcode)
+			break;
+	}
+
+	return errcode;
+}
+
+/* Variadic front end to p9pdu_vreadf(). */
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+	va_list args;
+	int err;
+
+	va_start(args, fmt);
+	err = p9pdu_vreadf(pdu, proto_version, fmt, args);
+	va_end(args);
+	return err;
+}
+
+/* Variadic front end to p9pdu_vwritef(). */
+static int
+p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...)
+{
+	va_list args;
+	int err;
+
+	va_start(args, fmt);
+	err = p9pdu_vwritef(pdu, proto_version, fmt, args);
+	va_end(args);
+	return err;
+}
+
+/* Decode one 'S' stat record from @buf (@len bytes) into @st. */
+int p9stat_read(char *buf, int len, struct p9_wstat *st, int proto_version)
+{
+	struct p9_fcall fake_pdu;
+	int err;
+
+	/* a stack pdu lets the format reader walk @buf */
+	fake_pdu.sdata = buf;
+	fake_pdu.offset = 0;
+	fake_pdu.size = len;
+	fake_pdu.capacity = len;
+	err = p9pdu_readf(&fake_pdu, proto_version, "S", st);
+	if (err) {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", err);
+		p9pdu_dump(1, &fake_pdu);
+	}
+	return err;
+}
+EXPORT_SYMBOL(p9stat_read);
+/* Begin a message: a zero size placeholder, then @type and @tag. */
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type)
+{
+	pdu->id = type;
+	return p9pdu_writef(pdu, 0, "dbw", 0, type, tag);
+}
+
+/* Patch the final message length into the size field at offset 0. */
+int p9pdu_finalize(struct p9_fcall *pdu)
+{
+	int total = pdu->size;
+	int err;
+
+	/* rewind so the "d" lands at the start, then restore the length */
+	pdu->size = 0;
+	err = p9pdu_writef(pdu, 0, "d", total);
+	pdu->size = total;
+
+#ifdef CONFIG_NET_9P_DEBUG
+	if ((p9_debug_level & P9_DEBUG_PKT) == P9_DEBUG_PKT)
+		p9pdu_dump(0, pdu);
+#endif
+	P9_DPRINTK(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", pdu->size,
+							pdu->id, pdu->tag);
+	return err;
+}
+
+/* Empty @pdu: zero its cursors and forget any attached payload buffers. */
+void p9pdu_reset(struct p9_fcall *pdu)
+{
+	pdu->size = pdu->offset = 0;
+	pdu->pbuf_size = 0;
+	pdu->pubuf = NULL;
+	pdu->pkbuf = NULL;
+	pdu->private = NULL;
+}
+
+/* Decode a "Qqbs" dirent; returns bytes consumed, even if decoding failed. */
+int p9dirent_read(char *buf, int len, struct p9_dirent *dirent,
+						int proto_version)
+{
+	struct p9_fcall fake_pdu;
+	int ret;
+	char *nameptr;
+
+	fake_pdu.size = len;
+	fake_pdu.capacity = len;
+	fake_pdu.sdata = buf;
+	fake_pdu.offset = 0;
+	ret = p9pdu_readf(&fake_pdu, proto_version, "Qqbs", &dirent->qid,
+			&dirent->d_off, &dirent->d_type, &nameptr);
+	if (ret) {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
+		p9pdu_dump(1, &fake_pdu);
+		goto out;
+	}
+
+	/* the name length is server-controlled: truncate, never overrun */
+	strlcpy(dirent->d_name, nameptr, sizeof(dirent->d_name));
+	kfree(nameptr);
+out:
+	return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9dirent_read);
diff --git a/net/9p/protocol.h b/net/9p/protocol.h
new file mode 100644
index 00000000..2431c0f3
--- /dev/null
+++ b/net/9p/protocol.h
@@ -0,0 +1,34 @@
+/*
+ * net/9p/protocol.h
+ *
+ * 9P Protocol Support Code
+ *
+ *  Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *
+ *  Base on code from Anthony Liguori <aliguori@us.ibm.com>
+ *  Copyright (C) 2008 by IBM, Corp.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
+								va_list ap);
+int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
+int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type);
+int p9pdu_finalize(struct p9_fcall *pdu);
+void p9pdu_dump(int, struct p9_fcall *);
+void p9pdu_reset(struct p9_fcall *pdu);
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
new file mode 100644
index 00000000..9a70ebde
--- /dev/null
+++ b/net/9p/trans_common.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <linux/scatterlist.h>
+#include "trans_common.h"
+
+/**
+ *  p9_release_req_pages - Release pages after the transaction.
+ *  @rpinfo: page bookkeeping kept in the PDU's private area
+ */
+void p9_release_req_pages(struct trans_rpage_info *rpinfo)
+{
+	int i;
+
+	/* check the count before touching rp_data[i] */
+	for (i = 0; rpinfo->rp_nr_pages > 0 && rpinfo->rp_data[i]; i++) {
+		put_page(rpinfo->rp_data[i]);
+		rpinfo->rp_nr_pages--;
+	}
+}
+EXPORT_SYMBOL(p9_release_req_pages);
+
+/**
+ * p9_nr_pages - Return number of pages needed to accommodate the payload.
+ * @req: request whose tc->pubuf / tc->pbuf_size describe the user buffer
+ */
+int p9_nr_pages(struct p9_req_t *req)
+{
+	unsigned long addr = (unsigned long)req->tc->pubuf;
+	unsigned long end = addr + req->tc->pbuf_size + PAGE_SIZE - 1;
+
+	/* end is rounded up so a partial last page is counted */
+	return (end >> PAGE_SHIFT) - (addr >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(p9_nr_pages);
+
+/**
+ * p9_payload_gup - pin the user payload pages with get_user_pages_fast().
+ * Returns get_user_pages_fast()'s result when that is <= 0, otherwise 0.
+ * @req: Request to be sent to server.
+ * @pdata_off: data offset into the first page after translation (gup).
+ * @pdata_len: Total length of the IO. gup may not return requested # of pages.
+ * @nr_pages: number of pages to accommodate the payload
+ * @rw: Indicates if the pages are for read or write.
+ */
+int
+p9_payload_gup(struct p9_req_t *req, size_t *pdata_off, int *pdata_len,
+		int nr_pages, u8 rw)
+{
+	uint32_t first_page_bytes = 0;
+	int32_t pdata_mapped_pages;
+	struct trans_rpage_info  *rpinfo;
+
+	*pdata_off = (__force size_t)req->tc->pubuf & (PAGE_SIZE-1);
+
+	if (*pdata_off)
+		first_page_bytes = min(((size_t)PAGE_SIZE - *pdata_off),
+				       req->tc->pbuf_size);
+
+	rpinfo = req->tc->private;
+	pdata_mapped_pages = get_user_pages_fast((unsigned long)req->tc->pubuf,
+			nr_pages, rw, &rpinfo->rp_data[0]);
+	if (pdata_mapped_pages <= 0)
+		return pdata_mapped_pages;
+
+	rpinfo->rp_nr_pages = pdata_mapped_pages;
+	if (*pdata_off) {
+		*pdata_len = first_page_bytes;
+		*pdata_len += min((req->tc->pbuf_size - *pdata_len),
+				((size_t)pdata_mapped_pages - 1) << PAGE_SHIFT);
+	} else {
+		*pdata_len = min(req->tc->pbuf_size,
+				(size_t)pdata_mapped_pages << PAGE_SHIFT);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(p9_payload_gup);
diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h
new file mode 100644
index 00000000..76309223
--- /dev/null
+++ b/net/9p/trans_common.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/* TRUE if it is user context */
+#define P9_IS_USER_CONTEXT (!segment_eq(get_fs(), KERNEL_DS))
+
+/**
+ * struct trans_rpage_info - To store mapped page information in PDU.
+ * @rp_alloc: Set if allocated separately, clear if it reuses spare pdu space.
+ * @rp_nr_pages: Number of pages pinned by p9_payload_gup()
+ * @rp_data: Array of page pointers, released by p9_release_req_pages()
+ */
+struct trans_rpage_info {
+	u8 rp_alloc;
+	int rp_nr_pages;
+	struct page *rp_data[0];
+};
+
+void p9_release_req_pages(struct trans_rpage_info *);
+int p9_payload_gup(struct p9_req_t *, size_t *, int *, int, u8);
+int p9_nr_pages(struct p9_req_t *);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
new file mode 100644
index 00000000..fdfdb574
--- /dev/null
+++ b/net/9p/trans_fd.c
@@ -0,0 +1,1087 @@
+/*
+ * linux/net/9p/trans_fd.c
+ *
+ * Fd transport layer.  Includes deprecated socket layer.
+ *
+ *  Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ *  Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#include <linux/syscalls.h> /* killme */
+
+#define P9_PORT 564		/* default for the port= option */
+#define MAX_SOCK_BUF (64*1024)	/* .maxsize of the tcp, unix and fd transports */
+#define MAXPOLLWADDR	2	/* number of p9_conn.poll_wait slots */
+
+/**
+ * struct p9_fd_opts - per-transport options
+ * @rfd: file descriptor for reading (trans=fd); ~0 when not supplied
+ * @wfd: file descriptor for writing (trans=fd); ~0 when not supplied
+ * @port: port to connect to (trans=tcp); defaults to P9_PORT
+ *
+ */
+
+struct p9_fd_opts {
+	int rfd;
+	int wfd;
+	u16 port;
+};
+
+/**
+ * struct p9_trans_fd - transport state
+ * @rd: reference to file to read from
+ * @wr: reference to file to write to (same file as @rd for sockets)
+ * @conn: connection state reference
+ *
+ */
+
+struct p9_trans_fd {
+	struct file *rd;
+	struct file *wr;
+	struct p9_conn *conn;
+};
+
+/*
+ * Option Parsing (code inspired by NFS code)
+ *  - a little lazy - parse all fd-transport options
+ */
+
+enum {
+	/* Options that take integer arguments (all but Opt_err) */
+	Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_port, "port=%u"},		/* stored in p9_fd_opts.port */
+	{Opt_rfdno, "rfdno=%u"},	/* stored in p9_fd_opts.rfd */
+	{Opt_wfdno, "wfdno=%u"},	/* stored in p9_fd_opts.wfd */
+	{Opt_err, NULL},
+};
+
+enum {				/* bit numbers passed to set_bit() on p9_conn.wsched */
+	Rworksched = 1,		/* read work scheduled or running */
+	Rpending = 2,		/* can read */
+	Wworksched = 4,		/* write work scheduled or running */
+	Wpending = 8,		/* can write */
+};
+
+struct p9_poll_wait {		/* one slot of p9_conn.poll_wait */
+	struct p9_conn *conn;		/* owning connection, set by p9_pollwait() */
+	wait_queue_t wait;		/* entry whose wake function is p9_pollwake() */
+	wait_queue_head_t *wait_addr;	/* queue the entry sits on; NULL if slot is free */
+};
+
+/**
+ * struct p9_conn - fd mux connection state information
+ * @mux_list: list link; only initialised and debug-printed in this file
+ * @client: reference to client instance for this connection
+ * @err: error state; set once by p9_conn_cancel()
+ * @req_list: accounting for requests which have been sent
+ * @unsent_req_list: accounting for requests that haven't been sent
+ * @req: current request being processed (if any)
+ * @tmp_buf: temporary buffer to read in the 7-byte header
+ * @rsize: amount to read for current frame
+ * @rpos: read position in current frame
+ * @rbuf: current read buffer
+ * @wpos: write position for current frame
+ * @wsize: amount of data to write for current frame
+ * @wbuf: current write buffer
+ * @poll_pending_link: link on p9_poll_pending_list while a poll is queued
+ * @poll_wait: wait queue entries handed out by p9_pollwait()
+ * @pt: poll table whose queueing callback is p9_pollwait()
+ * @rq: work item running p9_read_work()
+ * @wq: work item running p9_write_work()
+ * @wsched: Rworksched/Rpending/Wworksched/Wpending bit flags
+ *
+ */
+
+struct p9_conn {
+	struct list_head mux_list;
+	struct p9_client *client;
+	int err;
+	struct list_head req_list;
+	struct list_head unsent_req_list;
+	struct p9_req_t *req;
+	char tmp_buf[7];
+	int rsize;
+	int rpos;
+	char *rbuf;
+	int wpos;
+	int wsize;
+	char *wbuf;
+	struct list_head poll_pending_link;
+	struct p9_poll_wait poll_wait[MAXPOLLWADDR];
+	poll_table pt;
+	struct work_struct rq;
+	struct work_struct wq;
+	unsigned long wsched;
+};
+
+static void p9_poll_workfn(struct work_struct *work);
+
+static DEFINE_SPINLOCK(p9_poll_lock);	/* guards p9_poll_pending_list */
+static LIST_HEAD(p9_poll_pending_list);	/* conns queued by p9_pollwake() */
+static DECLARE_WORK(p9_poll_work, p9_poll_workfn);
+/* Unhook m from every wait queue it was added to and unqueue its poll. */
+static void p9_mux_poll_stop(struct p9_conn *m)
+{
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+		struct p9_poll_wait *pwait = &m->poll_wait[i];
+
+		if (pwait->wait_addr) {	/* slot was filled by p9_pollwait() */
+			remove_wait_queue(pwait->wait_addr, &pwait->wait);
+			pwait->wait_addr = NULL;	/* slot is free again */
+		}
+	}
+
+	spin_lock_irqsave(&p9_poll_lock, flags);
+	list_del_init(&m->poll_pending_link);	/* leave p9_poll_pending_list */
+	spin_unlock_irqrestore(&p9_poll_lock, flags);
+}
+
+/**
+ * p9_conn_cancel - cancel all pending requests with error
+ * @m: mux data
+ * @err: error code
+ *
+ * Only the first call takes effect: once @m->err is non-zero, later calls return.
+ */
+static void p9_conn_cancel(struct p9_conn *m, int err)
+{
+	struct p9_req_t *req, *rtmp;
+	unsigned long flags;
+	LIST_HEAD(cancel_list);
+
+	P9_DPRINTK(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
+
+	spin_lock_irqsave(&m->client->lock, flags);
+
+	if (m->err) {	/* an error was already recorded */
+		spin_unlock_irqrestore(&m->client->lock, flags);
+		return;
+	}
+
+	m->err = err;
+
+	list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
+		req->status = REQ_STATUS_ERROR;
+		if (!req->t_err)	/* keep an error recorded earlier */
+			req->t_err = err;
+		list_move(&req->req_list, &cancel_list);
+	}
+	list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
+		req->status = REQ_STATUS_ERROR;
+		if (!req->t_err)	/* keep an error recorded earlier */
+			req->t_err = err;
+		list_move(&req->req_list, &cancel_list);
+	}
+	spin_unlock_irqrestore(&m->client->lock, flags);
+	/* the callbacks below run after client->lock has been dropped */
+	list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "call back req %p\n", req);
+		list_del(&req->req_list);
+		p9_client_cb(m->client, req);
+	}
+}
+
+/* Poll the transport's read and write files; returns a poll mask or -errno. */
+static int p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
+{
+	int ret, n;
+	struct p9_trans_fd *ts = NULL;
+
+	if (client && client->status == Connected)
+		ts = client->trans;
+
+	if (!ts)	/* no client, not connected, or no transport attached */
+		return -EREMOTEIO;
+
+	if (!ts->rd->f_op || !ts->rd->f_op->poll)
+		return -EIO;
+
+	if (!ts->wr->f_op || !ts->wr->f_op->poll)
+		return -EIO;
+
+	ret = ts->rd->f_op->poll(ts->rd, pt);
+	if (ret < 0)
+		return ret;
+
+	if (ts->rd != ts->wr) {	/* distinct files: merge the two masks */
+		n = ts->wr->f_op->poll(ts->wr, pt);
+		if (n < 0)
+			return n;
+		ret = (ret & ~POLLOUT) | (n & ~POLLIN);
+	}
+
+	return ret;
+}
+
+/**
+ * p9_fd_read - read from a fd
+ * @client: client instance
+ * @v: buffer to receive data into
+ * @len: size of receive buffer
+ *
+ * Returns the kernel_read() result, or -EREMOTEIO if there is no transport.
+ */
+static int p9_fd_read(struct p9_client *client, void *v, int len)
+{
+	int ret;
+	struct p9_trans_fd *ts = NULL;
+
+	if (client && client->status != Disconnected)
+		ts = client->trans;
+
+	if (!ts)
+		return -EREMOTEIO;
+
+	if (!(ts->rd->f_flags & O_NONBLOCK))	/* only logged, not refused */
+		P9_DPRINTK(P9_DEBUG_ERROR, "blocking read ...\n");
+
+	ret = kernel_read(ts->rd, ts->rd->f_pos, v, len);
+	if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+		client->status = Disconnected;	/* zero-length read or hard error */
+	return ret;
+}
+
+/**
+ * p9_read_work - called when there is some data to be read from a transport
+ * @work: container of work to be done
+ *
+ * Reads the 7-byte header first, then the rest of the frame into req->rc.
+ */
+static void p9_read_work(struct work_struct *work)
+{
+	int n, err;
+	struct p9_conn *m;
+
+	m = container_of(work, struct p9_conn, rq);
+
+	if (m->err < 0)
+		return;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos);
+
+	if (!m->rbuf) {	/* no frame in progress */
+		m->rbuf = m->tmp_buf;
+		m->rpos = 0;
+		m->rsize = 7; /* start by reading header */
+	}
+
+	clear_bit(Rpending, &m->wsched);
+	P9_DPRINTK(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", m,
+					m->rpos, m->rsize, m->rsize-m->rpos);
+	err = p9_fd_read(m->client, m->rbuf + m->rpos,
+						m->rsize - m->rpos);
+	P9_DPRINTK(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
+	if (err == -EAGAIN) {	/* nothing to read right now */
+		clear_bit(Rworksched, &m->wsched);
+		return;
+	}
+
+	if (err <= 0)	/* 0 or -errno: cancel the connection */
+		goto error;
+
+	m->rpos += err;
+
+	if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */
+		u16 tag;
+		P9_DPRINTK(P9_DEBUG_TRANS, "got new header\n");
+
+		n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */
+		if (n >= m->client->msize) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				"requested packet size too big: %d\n", n);
+			err = -EIO;
+			goto error;
+		}
+
+		tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */
+		P9_DPRINTK(P9_DEBUG_TRANS,
+			"mux %p pkt: size: %d bytes tag: %d\n", m, n, tag);
+
+		m->req = p9_tag_lookup(m->client, tag);
+		if (!m->req || (m->req->status != REQ_STATUS_SENT &&
+					m->req->status != REQ_STATUS_FLSH)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
+								 tag);
+			err = -EIO;
+			goto error;
+		}
+
+		if (m->req->rc == NULL) {	/* reply buffer not allocated yet */
+			m->req->rc = kmalloc(sizeof(struct p9_fcall) +
+						m->client->msize, GFP_NOFS);
+			if (!m->req->rc) {
+				m->req = NULL;
+				err = -ENOMEM;
+				goto error;
+			}
+		}
+		m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall);
+		memcpy(m->rbuf, m->tmp_buf, m->rsize);	/* header goes first */
+		m->rsize = n;	/* now read up to the full packet size */
+	}
+
+	/* not an else because some packets (like clunk) have no payload */
+	if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */
+		P9_DPRINTK(P9_DEBUG_TRANS, "got new packet\n");
+		spin_lock(&m->client->lock);
+		if (m->req->status != REQ_STATUS_ERROR)
+			m->req->status = REQ_STATUS_RCVD;
+		list_del(&m->req->req_list);
+		spin_unlock(&m->client->lock);
+		p9_client_cb(m->client, m->req);
+		m->rbuf = NULL;	/* next run starts with a fresh header */
+		m->rpos = 0;
+		m->rsize = 0;
+		m->req = NULL;
+	}
+
+	if (!list_empty(&m->req_list)) {	/* replies still outstanding */
+		if (test_and_clear_bit(Rpending, &m->wsched))
+			n = POLLIN;
+		else
+			n = p9_fd_poll(m->client, NULL);
+
+		if (n & POLLIN) {
+			P9_DPRINTK(P9_DEBUG_TRANS, "sched read work %p\n", m);
+			schedule_work(&m->rq);
+		} else
+			clear_bit(Rworksched, &m->wsched);
+	} else
+		clear_bit(Rworksched, &m->wsched);
+
+	return;
+error:
+	p9_conn_cancel(m, err);
+	clear_bit(Rworksched, &m->wsched);
+}
+
+/**
+ * p9_fd_write - write to a socket
+ * @client: client instance
+ * @v: buffer to send data from
+ * @len: size of send buffer
+ *
+ * Returns the vfs_write() result, or -EREMOTEIO if there is no transport.
+ */
+static int p9_fd_write(struct p9_client *client, void *v, int len)
+{
+	int ret;
+	mm_segment_t oldfs;
+	struct p9_trans_fd *ts = NULL;
+
+	if (client && client->status != Disconnected)
+		ts = client->trans;
+
+	if (!ts)
+		return -EREMOTEIO;
+
+	if (!(ts->wr->f_flags & O_NONBLOCK))	/* only logged, not refused */
+		P9_DPRINTK(P9_DEBUG_ERROR, "blocking write ...\n");
+
+	oldfs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos);
+	set_fs(oldfs);
+
+	if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
+		client->status = Disconnected;	/* zero-length write or hard error */
+	return ret;
+}
+
+/**
+ * p9_write_work - called when a transport can send some data
+ * @work: container for work to be done
+ *
+ * Sends (part of) one request per run, taking the next from unsent_req_list.
+ */
+static void p9_write_work(struct work_struct *work)
+{
+	int n, err;
+	struct p9_conn *m;
+	struct p9_req_t *req;
+
+	m = container_of(work, struct p9_conn, wq);
+
+	if (m->err < 0) {
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (!m->wsize) {	/* no frame in progress: pick the next request */
+		if (list_empty(&m->unsent_req_list)) {
+			clear_bit(Wworksched, &m->wsched);
+			return;
+		}
+
+		spin_lock(&m->client->lock);
+		req = list_entry(m->unsent_req_list.next, struct p9_req_t,
+			       req_list);
+		req->status = REQ_STATUS_SENT;
+		P9_DPRINTK(P9_DEBUG_TRANS, "move req %p\n", req);
+		list_move_tail(&req->req_list, &m->req_list);
+
+		m->wbuf = req->tc->sdata;
+		m->wsize = req->tc->size;
+		m->wpos = 0;
+		spin_unlock(&m->client->lock);
+	}
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", m, m->wpos,
+								m->wsize);
+	clear_bit(Wpending, &m->wsched);
+	err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos);
+	P9_DPRINTK(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err);
+	if (err == -EAGAIN) {	/* cannot write right now */
+		clear_bit(Wworksched, &m->wsched);
+		return;
+	}
+
+	if (err < 0)
+		goto error;
+	else if (err == 0) {	/* nothing written: reported as -EREMOTEIO */
+		err = -EREMOTEIO;
+		goto error;
+	}
+
+	m->wpos += err;
+	if (m->wpos == m->wsize)	/* frame completely written */
+		m->wpos = m->wsize = 0;
+
+	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
+		if (test_and_clear_bit(Wpending, &m->wsched))
+			n = POLLOUT;
+		else
+			n = p9_fd_poll(m->client, NULL);
+
+		if (n & POLLOUT) {
+			P9_DPRINTK(P9_DEBUG_TRANS, "sched write work %p\n", m);
+			schedule_work(&m->wq);
+		} else
+			clear_bit(Wworksched, &m->wsched);
+	} else
+		clear_bit(Wworksched, &m->wsched);
+
+	return;
+
+error:
+	p9_conn_cancel(m, err);
+	clear_bit(Wworksched, &m->wsched);
+}
+/* Wait-queue callback: queue the conn for polling and kick p9_poll_work. */
+static int p9_pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct p9_poll_wait *pwait =
+		container_of(wait, struct p9_poll_wait, wait);
+	struct p9_conn *m = pwait->conn;
+	unsigned long flags;
+
+	spin_lock_irqsave(&p9_poll_lock, flags);
+	if (list_empty(&m->poll_pending_link))	/* not queued yet */
+		list_add_tail(&m->poll_pending_link, &p9_poll_pending_list);
+	spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+	schedule_work(&p9_poll_work);
+	return 1;
+}
+
+/**
+ * p9_pollwait - add poll task to the wait queue
+ * @filp: file pointer being polled
+ * @wait_address: wait_q to block on
+ * @p: poll table; must be the @pt member of a struct p9_conn
+ *
+ * called by the file's poll operation; hooks p9_pollwake() into its wait queue
+ */
+
+static void
+p9_pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+	struct p9_conn *m = container_of(p, struct p9_conn, pt);
+	struct p9_poll_wait *pwait = NULL;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(m->poll_wait); i++) {
+		if (m->poll_wait[i].wait_addr == NULL) {	/* first free slot */
+			pwait = &m->poll_wait[i];
+			break;
+		}
+	}
+
+	if (!pwait) {	/* all MAXPOLLWADDR slots are taken */
+		P9_DPRINTK(P9_DEBUG_ERROR, "not enough wait_address slots\n");
+		return;
+	}
+
+	pwait->conn = m;
+	pwait->wait_addr = wait_address;
+	init_waitqueue_func_entry(&pwait->wait, p9_pollwake);
+	add_wait_queue(wait_address, &pwait->wait);
+}
+
+/**
+ * p9_conn_create - allocate and initialize the per-session mux data
+ * @client: client instance
+ *
+ * Does an initial poll with p9_pollwait() as the queueing callback.  Returns
+ * the new connection, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct p9_conn *p9_conn_create(struct p9_client *client)
+{
+	int n;
+	struct p9_conn *m;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "client %p msize %d\n", client,
+								client->msize);
+	m = kzalloc(sizeof(struct p9_conn), GFP_KERNEL);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&m->mux_list);
+	m->client = client;
+
+	INIT_LIST_HEAD(&m->req_list);
+	INIT_LIST_HEAD(&m->unsent_req_list);
+	INIT_WORK(&m->rq, p9_read_work);
+	INIT_WORK(&m->wq, p9_write_work);
+	INIT_LIST_HEAD(&m->poll_pending_link);
+	init_poll_funcptr(&m->pt, p9_pollwait);
+
+	n = p9_fd_poll(client, &m->pt);	/* NOTE(review): -errno is not checked */
+	if (n & POLLIN) {
+		P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can read\n", m);
+		set_bit(Rpending, &m->wsched);
+	}
+
+	if (n & POLLOUT) {
+		P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can write\n", m);
+		set_bit(Wpending, &m->wsched);
+	}
+
+	return m;
+}
+
+/**
+ * p9_poll_mux - polls a mux and schedules read or write works if necessary
+ * @m: connection to poll
+ *
+ * A poll failure or POLLERR/POLLHUP/POLLNVAL cancels the connection instead.
+ */
+static void p9_poll_mux(struct p9_conn *m)
+{
+	int n;
+
+	if (m->err < 0)
+		return;
+
+	n = p9_fd_poll(m->client, NULL);
+	if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) {
+		P9_DPRINTK(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n);
+		/* n may be -errno here: never test it as a poll mask below */
+		p9_conn_cancel(m, n < 0 ? n : -ECONNRESET);
+		return;
+	}
+
+	if (n & POLLIN) {
+		set_bit(Rpending, &m->wsched);
+		P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can read\n", m);
+		if (!test_and_set_bit(Rworksched, &m->wsched)) {
+			P9_DPRINTK(P9_DEBUG_TRANS, "sched read work %p\n", m);
+			schedule_work(&m->rq);
+		}
+	}
+
+	if (n & POLLOUT) {
+		set_bit(Wpending, &m->wsched);
+		P9_DPRINTK(P9_DEBUG_TRANS, "mux %p can write\n", m);
+		if ((m->wsize || !list_empty(&m->unsent_req_list)) &&
+		    !test_and_set_bit(Wworksched, &m->wsched)) {
+			P9_DPRINTK(P9_DEBUG_TRANS, "sched write work %p\n", m);
+			schedule_work(&m->wq);
+		}
+	}
+}
+
+/**
+ * p9_fd_request - send 9P request
+ * Queues @req as unsent and schedules the write work when the transport is
+ * writable.  Returns 0 once queued or the connection's error; a 0 return
+ * is not a guarantee that the request is sent successfully.
+ *
+ * @client: client instance
+ * @req: request to be sent
+ *
+ */
+
+static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
+{
+	int n;
+	struct p9_trans_fd *ts = client->trans;
+	struct p9_conn *m = ts->conn;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", m,
+						current, req->tc, req->tc->id);
+	if (m->err < 0)	/* connection already cancelled */
+		return m->err;
+
+	spin_lock(&client->lock);
+	req->status = REQ_STATUS_UNSENT;
+	list_add_tail(&req->req_list, &m->unsent_req_list);
+	spin_unlock(&client->lock);
+
+	if (test_and_clear_bit(Wpending, &m->wsched))
+		n = POLLOUT;
+	else
+		n = p9_fd_poll(m->client, NULL);
+
+	if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
+		schedule_work(&m->wq);	/* writable and no write work pending */
+
+	return 0;
+}
+/* Returns 0 if @req was still unsent and has been dequeued, 1 otherwise. */
+static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	int rc = 1;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+	spin_lock(&client->lock);
+	if (req->status == REQ_STATUS_SENT) {
+		/* handed to the writer: only mark it as being flushed */
+		req->status = REQ_STATUS_FLSH;
+	} else if (req->status == REQ_STATUS_UNSENT) {
+		list_del(&req->req_list);
+		req->status = REQ_STATUS_FLSHD;
+		rc = 0;
+	}
+	spin_unlock(&client->lock);
+
+	return rc;
+}
+
+/**
+ * parse_opts - parse mount options into p9_fd_opts structure
+ * @params: options string passed from mount
+ * @opts: fd transport-specific structure to parse options into
+ *
+ * Unrecognised options and non-integer values are skipped, not rejected.
+ * Returns 0 upon success, -ENOMEM if the option string cannot be copied.
+ */
+static int parse_opts(char *params, struct p9_fd_opts *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *options, *tmp_options;
+
+	opts->port = P9_PORT;
+	opts->rfd = ~0;	/* ~0 means "not supplied" */
+	opts->wfd = ~0;
+
+	if (!params)
+		return 0;
+
+	tmp_options = kstrdup(params, GFP_KERNEL);	/* strsep() modifies it */
+	if (!tmp_options) {
+		P9_DPRINTK(P9_DEBUG_ERROR,
+				"failed to allocate copy of option string\n");
+		return -ENOMEM;
+	}
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		int r;
+		if (!*p)	/* empty field, e.g. ",," */
+			continue;
+		token = match_token(p, tokens, args);
+		if (token != Opt_err) {
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				P9_DPRINTK(P9_DEBUG_ERROR,
+				"integer field, but no integer?\n");
+				continue;
+			}
+		}
+		switch (token) {
+		case Opt_port:
+			opts->port = option;	/* stored into a u16, no range check */
+			break;
+		case Opt_rfdno:
+			opts->rfd = option;
+			break;
+		case Opt_wfdno:
+			opts->wfd = option;
+			break;
+		default:
+			continue;
+		}
+	}
+
+	kfree(tmp_options);
+	return 0;
+}
+/* Take references on fds @rfd/@wfd and install them as @client's transport. */
+static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
+{
+	struct p9_trans_fd *ts;
+
+	ts = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
+	if (!ts)
+		return -ENOMEM;
+
+	ts->rd = fget(rfd);
+	ts->wr = fget(wfd);
+	if (ts->rd && ts->wr) {
+		client->trans = ts;
+		client->status = Connected;
+		return 0;
+	}
+	/* one or both lookups failed: drop whichever reference was taken */
+	if (ts->rd)
+		fput(ts->rd);
+	if (ts->wr)
+		fput(ts->wr);
+	kfree(ts);
+	return -EIO;
+}
+/* Wrap connected @csocket in a p9_trans_fd and attach a mux; -errno on error. */
+static int p9_socket_open(struct p9_client *client, struct socket *csocket)
+{
+	struct p9_trans_fd *p;
+	int ret, fd;
+
+	p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
+	if (!p) {
+		sock_release(csocket);	/* callers never release csocket */
+		return -ENOMEM;
+	}
+
+	csocket->sk->sk_allocation = GFP_NOIO;
+	fd = sock_map_fd(csocket, 0);
+	if (fd < 0) {
+		P9_EPRINTK(KERN_ERR, "p9_socket_open: failed to map fd\n");
+		sock_release(csocket);
+		kfree(p);
+		return fd;
+	}
+
+	get_file(csocket->file);	/* one reference each for ->rd and ->wr */
+	get_file(csocket->file);
+	p->wr = p->rd = csocket->file;
+	client->trans = p;	/* p9_conn_create() polls through client->trans */
+	client->status = Connected;
+	sys_close(fd);	/* still racy */
+	p->rd->f_flags |= O_NONBLOCK;
+
+	p->conn = p9_conn_create(client);
+	if (IS_ERR(p->conn)) {
+		ret = PTR_ERR(p->conn);
+		client->trans = NULL;	/* do not leave a pointer to freed p */
+		kfree(p);
+		sockfd_put(csocket);
+		sockfd_put(csocket);
+		return ret;
+	}
+	return 0;
+}
+
+/**
+ * p9_conn_destroy - cancels all pending requests and frees mux resources
+ * @m: mux to destroy
+ *
+ * Polling and both work items are stopped before the requests are cancelled.
+ */
+static void p9_conn_destroy(struct p9_conn *m)
+{
+	P9_DPRINTK(P9_DEBUG_TRANS, "mux %p prev %p next %p\n", m,
+		m->mux_list.prev, m->mux_list.next);
+
+	p9_mux_poll_stop(m);
+	cancel_work_sync(&m->rq);
+	cancel_work_sync(&m->wq);
+
+	p9_conn_cancel(m, -ECONNRESET);	/* pending requests get -ECONNRESET */
+
+	m->client = NULL;
+	kfree(m);
+}
+
+/**
+ * p9_fd_close - shutdown file descriptor transport
+ * @client: client instance
+ *
+ * Does nothing if @client is NULL or has no transport attached.
+ */
+static void p9_fd_close(struct p9_client *client)
+{
+	struct p9_trans_fd *ts;
+
+	if (!client)
+		return;
+
+	ts = client->trans;
+	if (!ts)
+		return;
+
+	client->status = Disconnected;	/* p9_fd_read/write now refuse I/O */
+
+	p9_conn_destroy(ts->conn);
+
+	if (ts->rd)
+		fput(ts->rd);
+	if (ts->wr)
+		fput(ts->wr);
+
+	kfree(ts);	/* NOTE(review): client->trans still points at ts here */
+}
+
+/*
+ * stolen from NFS; 0 if buf begins with four decimals in 0..255, else -EINVAL
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] < 0 || in[count] > 255)	/* %d also accepts "-1" */
+			return -EINVAL;
+	}
+	return 0;
+}
+/* Connect a TCP socket to the IPv4 address @addr and attach it to @client. */
+static int
+p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct socket *csocket;
+	struct sockaddr_in sin_server;
+	struct p9_fd_opts opts;
+
+	err = parse_opts(args, &opts);	/* only opts.port is used here */
+	if (err < 0)
+		return err;
+
+	if (valid_ipaddr4(addr) < 0)
+		return -EINVAL;
+
+	csocket = NULL;
+
+	sin_server.sin_family = AF_INET;
+	sin_server.sin_addr.s_addr = in_aton(addr);
+	sin_server.sin_port = htons(opts.port);
+	err = __sock_create(read_pnet(&current->nsproxy->net_ns), PF_INET,
+			    SOCK_STREAM, IPPROTO_TCP, &csocket, 1);
+	if (err) {
+		P9_EPRINTK(KERN_ERR, "p9_trans_tcp: problem creating socket\n");
+		return err;
+	}
+
+	err = csocket->ops->connect(csocket,
+				    (struct sockaddr *)&sin_server,
+				    sizeof(struct sockaddr_in), 0);
+	if (err < 0) {
+		P9_EPRINTK(KERN_ERR,
+			"p9_trans_tcp: problem connecting socket to %s\n",
+			addr);
+		sock_release(csocket);
+		return err;
+	}
+
+	return p9_socket_open(client, csocket);
+}
+/* Connect a unix stream socket to path @addr and attach it to @client. */
+static int
+p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct socket *csocket;
+	struct sockaddr_un sun_server;
+
+	csocket = NULL;
+
+	if (strlen(addr) >= UNIX_PATH_MAX) {	/* bounds the strcpy() below */
+		P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n",
+			addr);
+		return -ENAMETOOLONG;
+	}
+
+	sun_server.sun_family = PF_UNIX;
+	strcpy(sun_server.sun_path, addr);
+	err = __sock_create(read_pnet(&current->nsproxy->net_ns), PF_UNIX,
+			    SOCK_STREAM, 0, &csocket, 1);
+	if (err < 0) {
+		P9_EPRINTK(KERN_ERR, "p9_trans_unix: problem creating socket\n");
+		return err;
+	}
+	err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
+			sizeof(struct sockaddr_un) - 1, 0);
+	if (err < 0) {
+		P9_EPRINTK(KERN_ERR,
+			"p9_trans_unix: problem connecting socket: %s: %d\n",
+			addr, err);
+		sock_release(csocket);
+		return err;
+	}
+
+	return p9_socket_open(client, csocket);
+}
+/* Create a transport from pre-opened fds given as rfdno=/wfdno= options. */
+static int
+p9_fd_create(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct p9_fd_opts opts;
+	struct p9_trans_fd *p;
+
+	parse_opts(args, &opts);	/* on failure rfd/wfd stay ~0, rejected below */
+
+	if (opts.rfd == ~0 || opts.wfd == ~0) {
+		printk(KERN_ERR "v9fs: Insufficient options for proto=fd\n");
+		return -ENOPROTOOPT;
+	}
+
+	err = p9_fd_open(client, opts.rfd, opts.wfd);
+	if (err < 0)
+		return err;
+
+	p = (struct p9_trans_fd *) client->trans;
+	p->conn = p9_conn_create(client);
+	if (IS_ERR(p->conn)) {
+		err = PTR_ERR(p->conn);
+		client->trans = NULL;	/* p9_fd_open() published p; it is freed below */
+		fput(p->rd);
+		fput(p->wr);
+		kfree(p);
+		return err;
+	}
+	return 0;
+}
+/* Transport descriptors: tcp, unix and fd differ only in .name/.def/.create. */
+static struct p9_trans_module p9_tcp_trans = {
+	.name = "tcp",
+	.maxsize = MAX_SOCK_BUF,
+	.def = 1,
+	.create = p9_fd_create_tcp,
+	.close = p9_fd_close,
+	.request = p9_fd_request,
+	.cancel = p9_fd_cancel,
+	.owner = THIS_MODULE,
+};
+
+static struct p9_trans_module p9_unix_trans = {
+	.name = "unix",
+	.maxsize = MAX_SOCK_BUF,
+	.def = 0,
+	.create = p9_fd_create_unix,
+	.close = p9_fd_close,
+	.request = p9_fd_request,
+	.cancel = p9_fd_cancel,
+	.owner = THIS_MODULE,
+};
+
+static struct p9_trans_module p9_fd_trans = {
+	.name = "fd",
+	.maxsize = MAX_SOCK_BUF,
+	.def = 0,
+	.create = p9_fd_create,
+	.close = p9_fd_close,
+	.request = p9_fd_request,
+	.cancel = p9_fd_cancel,
+	.owner = THIS_MODULE,
+};
+
+/**
+ * p9_poll_workfn - poll work function
+ * @work: work item (not used by the function body)
+ *
+ * drains p9_poll_pending_list, polling each queued connection so that the
+ * appropriate read/write work is queued to the work queue
+ *
+ */
+
+static void p9_poll_workfn(struct work_struct *work)
+{
+	unsigned long flags;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "start %p\n", current);
+
+	spin_lock_irqsave(&p9_poll_lock, flags);
+	while (!list_empty(&p9_poll_pending_list)) {
+		struct p9_conn *conn = list_first_entry(&p9_poll_pending_list,
+							struct p9_conn,
+							poll_pending_link);
+		list_del_init(&conn->poll_pending_link);
+		spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+		p9_poll_mux(conn);	/* called with p9_poll_lock dropped */
+
+		spin_lock_irqsave(&p9_poll_lock, flags);
+	}
+	spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "finish\n");
+}
+/* Register the tcp, unix and fd transports; always returns 0. */
+int p9_trans_fd_init(void)
+{
+	v9fs_register_trans(&p9_tcp_trans);
+	v9fs_register_trans(&p9_unix_trans);
+	v9fs_register_trans(&p9_fd_trans);
+
+	return 0;
+}
+/* Flush the shared poll work, then unregister all three transports. */
+void p9_trans_fd_exit(void)
+{
+	flush_work_sync(&p9_poll_work);
+	v9fs_unregister_trans(&p9_tcp_trans);
+	v9fs_unregister_trans(&p9_unix_trans);
+	v9fs_unregister_trans(&p9_fd_trans);
+}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
new file mode 100644
index 00000000..159c50f1
--- /dev/null
+++ b/net/9p/trans_rdma.c
@@ -0,0 +1,719 @@
+/*
+ * linux/net/9p/trans_rdma.c
+ *
+ * RDMA transport layer based on the trans_fd.c implementation.
+ *
+ *  Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
+ *  Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
+ *  Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/parser.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#define P9_PORT			5640
+#define P9_RDMA_SQ_DEPTH	32
+#define P9_RDMA_RQ_DEPTH	32
+#define P9_RDMA_SEND_SGE	4
+#define P9_RDMA_RECV_SGE	4
+#define P9_RDMA_IRD		0
+#define P9_RDMA_ORD		0
+#define P9_RDMA_TIMEOUT		30000		/* 30 seconds */
+#define P9_RDMA_MAXSIZE		(4*4096)	/* Min SGE is 4, so we can
+						 * safely advertise a maxsize
+						 * of 16k (4 * 4096 bytes) */
+
+/**
+ * struct p9_trans_rdma - RDMA transport instance
+ *
+ * @state: tracks the transport state machine for connection setup and tear down
+ * @cm_id: The RDMA CM ID
+ * @pd: Protection Domain pointer
+ * @qp: Queue Pair pointer
+ * @cq: Completion Queue pointer
+ * @dma_mr: DMA Memory Region pointer
+ * @lkey: The local access only memory region key
+ * @timeout: Number of msecs to wait for connection management events
+ * @sq_depth: The depth of the Send Queue
+ * @sq_sem: Semaphore for the SQ
+ * @rq_depth: The depth of the Receive Queue.
+ * @rq_count: Count of requests in the Receive Queue.
+ * @addr: The remote peer's address
+ * @req_lock: Protects the active request list
+ * @cm_done: Completion event for connection management tracking
+ */
+struct p9_trans_rdma {
+	enum {
+		P9_RDMA_INIT,
+		P9_RDMA_ADDR_RESOLVED,
+		P9_RDMA_ROUTE_RESOLVED,
+		P9_RDMA_CONNECTED,
+		P9_RDMA_FLUSHING,
+		P9_RDMA_CLOSING,
+		P9_RDMA_CLOSED,
+	} state;	/* declaration order matters: rdma_request() tests state < P9_RDMA_CLOSING */
+	struct rdma_cm_id *cm_id;
+	struct ib_pd *pd;
+	struct ib_qp *qp;
+	struct ib_cq *cq;
+	struct ib_mr *dma_mr;	/* NULL when the device's own local_dma_lkey is used */
+	u32 lkey;
+	long timeout;
+	int sq_depth;
+	struct semaphore sq_sem;
+	int rq_depth;
+	atomic_t rq_count;
+	struct sockaddr_in addr;
+	spinlock_t req_lock;
+
+	struct completion cm_done;
+};
+
+/**
+ * struct p9_rdma_context - Keeps track of in-process WR
+ *
+ * @wc_op: The original WR op for when the CQE completes in error.
+ * @busa: Bus address to unmap when the WR completes
+ * @req: Keeps track of requests (send)
+ * @rc: Keeps track of replies (receive)
+ */
+struct p9_rdma_req;	/* NOTE(review): declared here but not used in the code visible in this file */
+struct p9_rdma_context {
+	enum ib_wc_opcode wc_op;
+	dma_addr_t busa;
+	union {	/* which member is valid follows wc_op: req for sends, rc for receives */
+		struct p9_req_t *req;
+		struct p9_fcall *rc;
+	};
+};
+
+/**
+ * struct p9_rdma_opts - Collection of mount options
+ * @port: port of connection
+ * @sq_depth: The requested depth of the SQ. This really doesn't need
+ * to be any deeper than the number of threads used in the client
+ * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
+ * @timeout: Time to wait in msecs for CM events
+ */
+struct p9_rdma_opts {
+	short port;	/* converted with htons() in rdma_create_trans() */
+	int sq_depth;
+	int rq_depth;	/* parse_opts() raises this to at least sq_depth */
+	long timeout;
+};
+
+/*
+ * Option Parsing (code inspired by NFS code)
+ */
+enum {
+	/* Options that take integer arguments */
+	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_port, "port=%u"},
+	{Opt_sq_depth, "sq=%u"},
+	{Opt_rq_depth, "rq=%u"},
+	{Opt_timeout, "timeout=%u"},
+	{Opt_err, NULL},
+};
+
+/**
+ * parse_opts - parse mount options into rdma options structure
+ * @params: options string passed from mount (NULL means defaults only)
+ * @opts: rdma transport-specific structure to parse options into
+ * Unknown options and non-integer values are skipped, keeping the default.
+ * Returns 0 upon success, -ENOMEM if the option string cannot be copied
+ */
+static int parse_opts(char *params, struct p9_rdma_opts *opts)
+{
+	char *p, *options, *tmp_options;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	opts->port = P9_PORT;
+	opts->sq_depth = P9_RDMA_SQ_DEPTH;
+	opts->rq_depth = P9_RDMA_RQ_DEPTH;
+	opts->timeout = P9_RDMA_TIMEOUT;
+
+	if (!params)
+		return 0;
+
+	tmp_options = kstrdup(params, GFP_KERNEL);
+	if (!tmp_options) {
+		P9_DPRINTK(P9_DEBUG_ERROR,
+			   "failed to allocate copy of option string\n");
+		return -ENOMEM;
+	}
+	options = tmp_options;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token, r;
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		if (token == Opt_err)	/* no pattern matched: args[0] is not valid */
+			continue;
+		r = match_int(&args[0], &option);
+		if (r < 0) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "integer field, but no integer?\n");
+			continue;
+		}
+		switch (token) {
+		case Opt_port:
+			opts->port = option;
+			break;
+		case Opt_sq_depth:
+			opts->sq_depth = option;
+			break;
+		case Opt_rq_depth:
+			opts->rq_depth = option;
+			break;
+		case Opt_timeout:
+			opts->timeout = option;
+			break;
+		default:
+			continue;
+		}
+	}
+	/* RQ must be at least as large as the SQ */
+	opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
+	kfree(tmp_options);
+	return 0;
+}
+
+static int
+p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct p9_client *c = id->context;	/* the client handed to rdma_create_id() */
+	struct p9_trans_rdma *rdma = c->trans;
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		BUG_ON(rdma->state != P9_RDMA_INIT);
+		rdma->state = P9_RDMA_ADDR_RESOLVED;
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
+		rdma->state = P9_RDMA_ROUTE_RESOLVED;
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
+		rdma->state = P9_RDMA_CONNECTED;
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		if (rdma)	/* NOTE(review): c is dereferenced above and rdma below, so these tests guard nothing */
+			rdma->state = P9_RDMA_CLOSED;
+		if (c)
+			c->status = Disconnected;
+		break;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		break;
+
+	case RDMA_CM_EVENT_ADDR_CHANGE:	/* everything in this group tears the connection down */
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+	case RDMA_CM_EVENT_CONNECT_RESPONSE:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+		c->status = Disconnected;
+		rdma_disconnect(rdma->cm_id);
+		break;
+	default:
+		BUG();	/* any event not listed above is treated as fatal */
+	}
+	complete(&rdma->cm_done);	/* wakes the cm_done waits in rdma_create_trans() */
+	return 0;
+}
+
+static void
+handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
+	    struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+{
+	struct p9_req_t *req;
+	int err = 0;
+	int16_t tag;
+
+	req = NULL;
+	ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
+							 DMA_FROM_DEVICE);	/* undoes the mapping made in post_recv() */
+
+	if (status != IB_WC_SUCCESS)
+		goto err_out;
+
+	err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);	/* only the tag is extracted here */
+	if (err)
+		goto err_out;
+
+	req = p9_tag_lookup(client, tag);
+	if (!req)
+		goto err_out;
+
+	req->rc = c->rc;	/* hand the received buffer to the request it answers */
+	req->status = REQ_STATUS_RCVD;
+	p9_client_cb(client, req);
+
+	return;
+
+ err_out:	/* NOTE(review): c->rc is not freed on this path - confirm who owns it */
+	P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n",
+		   req, err, status);
+	rdma->state = P9_RDMA_FLUSHING;
+	client->status = Disconnected;
+}
+
+static void handle_send(struct p9_client *client, struct p9_trans_rdma *rdma,
+			struct p9_rdma_context *c, enum ib_wc_status status,
+			u32 byte_len)
+{
+	struct ib_device *dev = rdma->cm_id->device;
+
+	ib_dma_unmap_single(dev, c->busa, c->req->tc->size, DMA_TO_DEVICE);
+}
+
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event,
+								context);	/* logged only; nothing else is done */
+}
+
+static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct p9_client *client = cq_context;	/* the client handed to ib_create_cq() */
+	struct p9_trans_rdma *rdma = client->trans;
+	int ret;
+	struct ib_wc wc;
+
+	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);	/* re-arm before draining the queue */
+	while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
+		struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id;	/* wr_id carries the context pointer */
+
+		switch (c->wc_op) {	/* keyed on the op recorded at post time, not on wc.opcode */
+		case IB_WC_RECV:
+			atomic_dec(&rdma->rq_count);
+			handle_recv(client, rdma, c, wc.status, wc.byte_len);
+			break;
+
+		case IB_WC_SEND:
+			handle_send(client, rdma, c, wc.status, wc.byte_len);
+			up(&rdma->sq_sem);	/* returns the SQ slot taken in rdma_request() */
+			break;
+
+		default:
+			printk(KERN_ERR "9prdma: unexpected completion type, "
+			       "c->wc_op=%d, wc.opcode=%d, status=%d\n",
+			       c->wc_op, wc.opcode, wc.status);
+			break;
+		}
+		kfree(c);	/* the per-WR context is freed once its completion is handled */
+	}
+}
+
+static void cq_event_handler(struct ib_event *e, void *v)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);	/* logged only */
+}
+
+static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
+{
+	/* Teardown order: MR, QP, PD, CQ, then the CM id; NULL is a no-op. */
+	if (rdma == NULL)
+		return;
+	if (!IS_ERR_OR_NULL(rdma->dma_mr))
+		ib_dereg_mr(rdma->dma_mr);
+
+	if (!IS_ERR_OR_NULL(rdma->qp))
+		ib_destroy_qp(rdma->qp);
+
+	if (!IS_ERR_OR_NULL(rdma->pd))
+		ib_dealloc_pd(rdma->pd);
+
+	if (!IS_ERR_OR_NULL(rdma->cq))
+		ib_destroy_cq(rdma->cq);
+
+	if (!IS_ERR_OR_NULL(rdma->cm_id))
+		rdma_destroy_id(rdma->cm_id);
+
+	kfree(rdma);
+}
+
+/* Map c->rc and queue it as a receive WR; 0, -EIO or ib_post_recv()'s error */
+static int post_recv(struct p9_client *client, struct p9_rdma_context *c)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	struct ib_recv_wr wr, *bad_wr;
+	struct ib_sge sge;
+	int ret;
+
+	c->busa = ib_dma_map_single(rdma->cm_id->device, c->rc->sdata,
+				    client->msize, DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
+		P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
+		return -EIO;
+	}
+	sge.addr = c->busa;
+	sge.length = client->msize;
+	sge.lkey = rdma->lkey;
+	wr.next = NULL;
+	c->wc_op = IB_WC_RECV;
+	wr.wr_id = (unsigned long) c;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+	ret = ib_post_recv(rdma->qp, &wr, &bad_wr);
+	if (ret)	/* not queued: no completion will ever unmap the buffer */
+		ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+				    client->msize, DMA_FROM_DEVICE);
+	return ret;
+}
+
+static int rdma_request(struct p9_client *client, struct p9_req_t *req)
+{
+	struct p9_trans_rdma *rdma = client->trans;
+	struct ib_send_wr wr, *bad_wr;
+	struct ib_sge sge;
+	int err = 0;
+	unsigned long flags;
+	struct p9_rdma_context *c = NULL, *rpl_context = NULL;
+
+	/* Allocate a context for the reply */
+	rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
+	if (!rpl_context) {
+		err = -ENOMEM;
+		goto err_close;
+	}
+
+	/* Steal the request's reply buffer, or allocate one if it has none */
+	if (!req->rc) {
+		req->rc = kmalloc(sizeof(*req->rc) + client->msize, GFP_NOFS);
+		if (req->rc) {
+			req->rc->sdata = (char *)req->rc + sizeof(*req->rc);
+			req->rc->capacity = client->msize;
+		}
+	}
+	rpl_context->rc = req->rc;
+	if (!rpl_context->rc) {
+		err = -ENOMEM;
+		goto err_free2;
+	}
+
+	/*
+	 * Post a receive buffer so that every outstanding request has a
+	 * reply buffer. A flushed request may never be answered, so posted
+	 * buffers are counted to avoid overflowing the RQ.
+	 */
+	if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
+		err = post_recv(client, rpl_context);
+		if (err)
+			goto err_free1;
+	} else {
+		atomic_dec(&rdma->rq_count);	/* RQ full: nothing posted, drop ours */
+		kfree(rpl_context->rc);
+		kfree(rpl_context);
+	}
+
+	/* the buffer now belongs to the RQ (or was freed just above) */
+	req->rc = NULL;
+
+	/* Post the request */
+	c = kmalloc(sizeof *c, GFP_NOFS);
+	if (!c) {
+		err = -ENOMEM;
+		goto err_close;	/* a posted reply context must not be freed */
+	}
+	c->req = req;
+	c->busa = ib_dma_map_single(rdma->cm_id->device,
+				    c->req->tc->sdata, c->req->tc->size,
+				    DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
+		goto error;
+
+	sge.addr = c->busa;
+	sge.length = c->req->tc->size;
+	sge.lkey = rdma->lkey;
+
+	wr.next = NULL;
+	c->wc_op = IB_WC_SEND;
+	wr.wr_id = (unsigned long) c;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+
+	if (down_interruptible(&rdma->sq_sem))
+		goto err_unmap;
+	err = ib_post_send(rdma->qp, &wr, &bad_wr);
+	if (!err)
+		return 0;
+	up(&rdma->sq_sem);	/* not queued: no completion will release it */
+	ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc->size,
+			    DMA_TO_DEVICE);
+	kfree(c);
+	return err;
+ err_unmap:
+	ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc->size,
+			    DMA_TO_DEVICE);
+ error:	/* send-side failure: the reply context stays with the RQ */
+	kfree(c);
+	P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
+	return -EIO;
+ err_free1:	/* post_recv() failed, so the reply buffer is still ours */
+	kfree(rpl_context->rc);
+	req->rc = NULL;	/* don't leave the request pointing at freed memory */
+ err_free2:
+	kfree(rpl_context);
+ err_close:
+	spin_lock_irqsave(&rdma->req_lock, flags);
+	if (rdma->state < P9_RDMA_CLOSING) {
+		rdma->state = P9_RDMA_CLOSING;
+		spin_unlock_irqrestore(&rdma->req_lock, flags);
+		rdma_disconnect(rdma->cm_id);
+	} else
+		spin_unlock_irqrestore(&rdma->req_lock, flags);
+	return err;
+}
+
+/*
+ * rdma_close - mark the client disconnected, then disconnect and free its
+ * transport; does nothing without a client or an attached transport.
+ */
+static void rdma_close(struct p9_client *client)
+{
+	struct p9_trans_rdma *rdma = client ? client->trans : NULL;
+
+	if (rdma == NULL)
+		return;
+
+	client->status = Disconnected;
+	rdma_disconnect(rdma->cm_id);
+	rdma_destroy_trans(rdma);
+}
+
+/**
+ * alloc_rdma - Allocate and initialize the rdma transport structure
+ * @opts: Mount options structure supplying the queue depths and CM timeout
+ */
+static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
+{
+	struct p9_trans_rdma *trans = kzalloc(sizeof(*trans), GFP_KERNEL);
+
+	if (trans == NULL)
+		return NULL;
+
+	/* settings copied from the mount options */
+	trans->timeout = opts->timeout;
+	trans->rq_depth = opts->rq_depth;
+	trans->sq_depth = opts->sq_depth;
+	/* synchronisation state; the SQ semaphore starts with sq_depth slots */
+	atomic_set(&trans->rq_count, 0);
+	sema_init(&trans->sq_sem, opts->sq_depth);
+	init_completion(&trans->cm_done);
+	spin_lock_init(&trans->req_lock);
+	return trans;
+}
+
+/* it's not clear to me we can do anything after send has been posted; always returns 1 */
+static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	return 1;
+}
+
+/**
+ * rdma_create_trans - Transport method for creating a transport instance
+ * @client: client instance
+ * @addr: IP address string
+ * @args: Mount options string
+ */
+static int
+rdma_create_trans(struct p9_client *client, const char *addr, char *args)
+{
+	int err;
+	struct p9_rdma_opts opts;
+	struct p9_trans_rdma *rdma;
+	struct rdma_conn_param conn_param;
+	struct ib_qp_init_attr qp_attr;
+	struct ib_device_attr devattr;
+
+	/* Parse the transport specific mount options */
+	err = parse_opts(args, &opts);
+	if (err < 0)
+		return err;
+
+	/* Create and initialize the RDMA transport structure */
+	rdma = alloc_rdma(&opts);
+	if (!rdma)
+		return -ENOMEM;
+
+	/* Create the RDMA CM ID; the client becomes the CM event handler's context */
+	rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(rdma->cm_id))
+		goto error;
+
+	/* Associate the client with the transport */
+	client->trans = rdma;
+
+	/* Resolve the server's address */
+	rdma->addr.sin_family = AF_INET;
+	rdma->addr.sin_addr.s_addr = in_aton(addr);
+	rdma->addr.sin_port = htons(opts.port);
+	err = rdma_resolve_addr(rdma->cm_id, NULL,
+				(struct sockaddr *)&rdma->addr,
+				rdma->timeout);
+	if (err)
+		goto error;
+	err = wait_for_completion_interruptible(&rdma->cm_done);	/* completed by p9_cm_event_handler() */
+	if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
+		goto error;
+
+	/* Resolve the route to the server */
+	err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
+	if (err)
+		goto error;
+	err = wait_for_completion_interruptible(&rdma->cm_done);
+	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
+		goto error;
+
+	/* Query the device attributes */
+	err = ib_query_device(rdma->cm_id->device, &devattr);
+	if (err)
+		goto error;
+
+	/* Create the Completion Queue, sized for both queues plus one entry */
+	rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
+				cq_event_handler, client,
+				opts.sq_depth + opts.rq_depth + 1, 0);
+	if (IS_ERR(rdma->cq))
+		goto error;
+	ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
+
+	/* Create the Protection Domain */
+	rdma->pd = ib_alloc_pd(rdma->cm_id->device);
+	if (IS_ERR(rdma->pd))
+		goto error;
+
+	/* Cache the DMA lkey in the transport */
+	rdma->dma_mr = NULL;
+	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+		rdma->lkey = rdma->cm_id->device->local_dma_lkey;
+	else {
+		rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(rdma->dma_mr))
+			goto error;
+		rdma->lkey = rdma->dma_mr->lkey;
+	}
+
+	/* Create the Queue Pair; one CQ serves both sends and receives */
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = qp_event_handler;
+	qp_attr.qp_context = client;
+	qp_attr.cap.max_send_wr = opts.sq_depth;
+	qp_attr.cap.max_recv_wr = opts.rq_depth;
+	qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
+	qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = rdma->cq;
+	qp_attr.recv_cq = rdma->cq;
+	err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
+	if (err)
+		goto error;
+	rdma->qp = rdma->cm_id->qp;
+
+	/* Request a connection */
+	memset(&conn_param, 0, sizeof(conn_param));
+	conn_param.private_data = NULL;
+	conn_param.private_data_len = 0;
+	conn_param.responder_resources = P9_RDMA_IRD;
+	conn_param.initiator_depth = P9_RDMA_ORD;
+	err = rdma_connect(rdma->cm_id, &conn_param);
+	if (err)
+		goto error;
+	err = wait_for_completion_interruptible(&rdma->cm_done);
+	if (err || (rdma->state != P9_RDMA_CONNECTED))
+		goto error;
+
+	client->status = Connected;
+
+	return 0;
+
+error:
+	rdma_destroy_trans(rdma);	/* NOTE(review): client->trans may still point at the freed rdma */
+	return -ENOTCONN;	/* every failure past alloc_rdma() is reported as -ENOTCONN, whatever err holds */
+}
+
+static struct p9_trans_module p9_rdma_trans = {
+	.name = "rdma",
+	.maxsize = P9_RDMA_MAXSIZE,
+	.def = 0,
+	.owner = THIS_MODULE,
+	.create = rdma_create_trans,
+	.close = rdma_close,
+	.request = rdma_request,
+	.cancel = rdma_cancel,
+};
+
+/**
+ * p9_trans_rdma_init - Register the 9P RDMA transport driver; always returns 0
+ */
+static int __init p9_trans_rdma_init(void)
+{
+	v9fs_register_trans(&p9_rdma_trans);
+	return 0;
+}
+
+static void __exit p9_trans_rdma_exit(void)
+{
+	v9fs_unregister_trans(&p9_rdma_trans);	/* undoes p9_trans_rdma_init() */
+}
+
+module_init(p9_trans_rdma_init);
+module_exit(p9_trans_rdma_exit);
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("RDMA Transport for 9P");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
new file mode 100644
index 00000000..e317583f
--- /dev/null
+++ b/net/9p/trans_virtio.c
@@ -0,0 +1,629 @@
+/*
+ * The Virtio 9p transport driver
+ *
+ * This is a block based transport driver based on the lguest block driver
+ * code.
+ *
+ *  Copyright (C) 2007, 2008 Eric Van Hensbergen, IBM Corporation
+ *
+ *  Based on virtio console driver
+ *  Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/un.h>
+#include <linux/uaccess.h>
+#include <linux/inet.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+#include <linux/parser.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+#include <linux/scatterlist.h>
+#include <linux/swap.h>
+#include <linux/virtio.h>
+#include <linux/virtio_9p.h>
+#include "trans_common.h"
+
+#define VIRTQUEUE_NUM	128
+
+/* a single mutex to manage channel initialization and attachment */
+static DEFINE_MUTEX(virtio_9p_lock);
+static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
+static atomic_t vp_pinned = ATOMIC_INIT(0);
+
+/**
+ * struct virtio_chan - per-instance transport information
+ * @inuse: whether the channel is in use
+ * @lock: protects multiple elements within this structure
+ * @client: client instance
+ * @vdev: virtio dev associated with this channel
+ * @vq: virtio queue associated with this channel
+ * @ring_bufs_avail: cleared when the ring is full, set again by req_done()
+ * @vc_wq: wait queue for requesters waiting on ring space
+ * @sg: scatter gather list which is used to pack a request (protected?)
+ * @tag: mount tag read from the device config; @tag_len bytes, no NUL
+ *
+ * One is allocated per device in p9_virtio_probe(); vdev->priv and the
+ * mounting client's ->trans both point at it.
+ */
+
+struct virtio_chan {
+	bool inuse;
+
+	spinlock_t lock;
+
+	struct p9_client *client;
+	struct virtio_device *vdev;
+	struct virtqueue *vq;
+	int ring_bufs_avail;
+	wait_queue_head_t *vc_wq;
+	/* This is global limit. Since we don't have a global structure,
+	 * will be placing it in each channel.
+	 */
+	int p9_max_pages;
+	/* Scatterlist: can be too big for stack. */
+	struct scatterlist sg[VIRTQUEUE_NUM];
+
+	int tag_len;
+	/*
+	 * tag name to identify a mount Non-null terminated
+	 */
+	char *tag;
+
+	struct list_head chan_list;	/* link in virtio_chan_list */
+};
+
+static struct list_head virtio_chan_list;
+
+/* Number of bytes from @data up to the end of the page that contains it. */
+static unsigned int rest_of_page(void *data)
+{
+	return PAGE_SIZE - ((unsigned long)data & (PAGE_SIZE - 1));
+}
+
+/**
+ * p9_virtio_close - release a channel for reuse
+ * @client: client instance
+ *
+ * This releases a channel by resetting its inuse flag under
+ * virtio_9p_lock; nothing is freed here.
+ *
+ */
+
+static void p9_virtio_close(struct p9_client *client)
+{
+	struct virtio_chan *chan = client->trans;
+
+	mutex_lock(&virtio_9p_lock);
+	if (chan)	/* a client with no channel attached is tolerated */
+		chan->inuse = false;
+	mutex_unlock(&virtio_9p_lock);
+}
+
+/**
+ * req_done - callback which signals activity from the server
+ * @vq: virtio queue activity was received on
+ *
+ * This notifies us that the server has triggered some activity
+ * on the virtio channel - most likely a response to request we
+ * sent.  Figure out which requests now have responses and wake up
+ * those threads.
+ *
+ * Bugs: could do with some additional sanity checking, but appears to work.
+ *
+ */
+
+static void req_done(struct virtqueue *vq)
+{
+	struct virtio_chan *chan = vq->vdev->priv;
+	struct p9_fcall *rc;
+	unsigned int len;
+	struct p9_req_t *req;
+	unsigned long flags;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n");
+
+	while (1) {
+		spin_lock_irqsave(&chan->lock, flags);
+		rc = virtqueue_get_buf(chan->vq, &len);	/* the token queued by p9_virtio_request() */
+
+		if (rc == NULL) {
+			spin_unlock_irqrestore(&chan->lock, flags);
+			break;
+		}
+
+		chan->ring_bufs_avail = 1;	/* a buffer came back, so ring space exists again */
+		spin_unlock_irqrestore(&chan->lock, flags);
+		/* Wakeup if anyone waiting for VirtIO ring space. */
+		wake_up(chan->vc_wq);
+		P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc);
+		P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag);
+		req = p9_tag_lookup(chan->client, rc->tag);	/* NOTE(review): result is used without a NULL check */
+		if (req->tc->private) {
+			struct trans_rpage_info *rp = req->tc->private;
+			int p = rp->rp_nr_pages;
+			/*Release pages */
+			p9_release_req_pages(rp);
+			atomic_sub(p, &vp_pinned);
+			wake_up(&vp_wq);	/* requesters blocked on the pinned-page limit may proceed */
+			if (rp->rp_alloc)	/* rp was kmalloc'ed rather than carved out of tc->sdata */
+				kfree(rp);
+			req->tc->private = NULL;
+		}
+		req->status = REQ_STATUS_RCVD;
+		p9_client_cb(chan->client, req);
+	}
+}
+
+/**
+ * pack_sg_list - pack a scatter gather list from a linear buffer
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: number of entries in @sg; entry @limit itself is never written
+ * @data: data to pack into scatter/gather list
+ * @count: amount of data to pack into the scatter/gather list
+ *
+ * sg_lists have multiple segments of various sizes.  This will pack
+ * arbitrary data into an existing scatter gather list, segmenting the
+ * data as necessary within constraints.
+ * Returns the number of entries used; BUGs rather than overrun @sg.
+ */
+
+static int
+pack_sg_list(struct scatterlist *sg, int start, int limit, char *data,
+								int count)
+{
+	int s;
+	int index = start;
+
+	while (count) {
+		s = rest_of_page(data);
+		if (s > count)
+			s = count;
+		BUG_ON(index >= limit);	/* check before the write, not after */
+		sg_set_buf(&sg[index++], data, s);
+		count -= s;
+		data += s;
+	}
+
+	return index-start;
+}
+
+/* We don't currently allow canceling of virtio requests: this always returns 1 */
+static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	return 1;
+}
+
+/**
+ * pack_sg_list_p - like pack_sg_list, but packs a list of pages
+ * @sg: scatter/gather list to pack into
+ * @start: which segment of the sg_list to start at
+ * @limit: number of entries in @sg; entry @limit itself is never written
+ * @pdata_off: Offset into the first page
+ * @pdata: a list of pages to add into sg.
+ * @count: amount of data to pack into the scatter/gather list
+ */
+static int
+pack_sg_list_p(struct scatterlist *sg, int start, int limit, size_t pdata_off,
+		struct page **pdata, int count)
+{
+	int s, i = 0;
+	int index = start;
+
+	if (pdata_off) {
+		BUG_ON(index >= limit);	/* the partial first page needs a slot too */
+		s = min((int)(PAGE_SIZE - pdata_off), count);
+		sg_set_page(&sg[index++], pdata[i++], s, pdata_off);
+		count -= s;
+	}
+
+	while (count) {
+		BUG_ON(index >= limit);
+		s = min((int)PAGE_SIZE, count);
+		sg_set_page(&sg[index++], pdata[i++], s, 0);
+		count -= s;
+	}
+	return index-start;
+}
+
+/**
+ * p9_virtio_request - issue a request
+ * @client: client instance issuing the request
+ * @req: request to be issued
+ * Returns 0 once the request is queued and kicked, else a negative errno.
+ */
+
+static int
+p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
+{
+	int in, out, inp, outp;
+	struct virtio_chan *chan = client->trans;
+	unsigned long flags;
+	size_t pdata_off = 0;
+	struct trans_rpage_info *rpinfo = NULL;
+	int err, pdata_len = 0;
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n");
+
+	req->status = REQ_STATUS_SENT;
+
+	if (req->tc->pbuf_size && (req->tc->pubuf && P9_IS_USER_CONTEXT)) {
+		int nr_pages = p9_nr_pages(req);
+		int rpinfo_size = sizeof(struct trans_rpage_info) +
+			sizeof(struct page *) * nr_pages;
+
+		if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
+			err = wait_event_interruptible(vp_wq,
+				atomic_read(&vp_pinned) < chan->p9_max_pages);
+			if (err  == -ERESTARTSYS)
+				return err;
+			P9_DPRINTK(P9_DEBUG_TRANS, "9p: May gup pages now.\n");
+		}
+
+		if (rpinfo_size <= (req->tc->capacity - req->tc->size)) {
+			/* We can use sdata */
+			req->tc->private = req->tc->sdata + req->tc->size;
+			rpinfo = (struct trans_rpage_info *)req->tc->private;
+			rpinfo->rp_alloc = 0;
+		} else {
+			req->tc->private = kmalloc(rpinfo_size, GFP_NOFS);
+			if (!req->tc->private) {
+				P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: "
+					"private kmalloc returned NULL");
+				return -ENOMEM;
+			}
+			rpinfo = (struct trans_rpage_info *)req->tc->private;
+			rpinfo->rp_alloc = 1;
+		}
+
+		err = p9_payload_gup(req, &pdata_off, &pdata_len, nr_pages,
+				req->tc->id == P9_TREAD ? 1 : 0);
+		if (err < 0) {
+			if (rpinfo->rp_alloc)
+				kfree(rpinfo);	/* NOTE(review): req->tc->private still holds this pointer */
+			return err;
+		} else {
+			atomic_add(rpinfo->rp_nr_pages, &vp_pinned);
+		}
+	}
+
+req_retry_pinned:
+	spin_lock_irqsave(&chan->lock, flags);
+
+	/* Handle out VirtIO ring buffers */
+	out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata,
+			req->tc->size);
+
+	if (req->tc->pbuf_size && (req->tc->id == P9_TWRITE)) {
+		/* We have additional write payload buffer to take care */
+		if (req->tc->pubuf && P9_IS_USER_CONTEXT) {
+			outp = pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
+					pdata_off, rpinfo->rp_data, pdata_len);
+		} else {
+			char *pbuf;
+			if (req->tc->pubuf)
+				pbuf = (__force char *) req->tc->pubuf;
+			else
+				pbuf = req->tc->pkbuf;
+			outp = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM, pbuf,
+					req->tc->pbuf_size);
+		}
+		out += outp;
+	}
+
+	/* Handle in VirtIO ring buffers */
+	if (req->tc->pbuf_size &&
+		((req->tc->id == P9_TREAD) || (req->tc->id == P9_TREADDIR))) {
+		/*
+		 * Take care of additional Read payload.
+		 * 11 is the read/write header = PDU Header(7) + IO Size (4).
+		 * Arrange in such a way that server places header in the
+		 * alloced memory and payload onto the user buffer.
+		 */
+		inp = pack_sg_list(chan->sg, out,
+				   VIRTQUEUE_NUM, req->rc->sdata, 11);
+		/*
+		 * Running executables in the filesystem may result in
+		 * a read request with kernel buffer as opposed to user buffer.
+		 */
+		if (req->tc->pubuf && P9_IS_USER_CONTEXT) {
+			in = pack_sg_list_p(chan->sg, out+inp, VIRTQUEUE_NUM,
+					pdata_off, rpinfo->rp_data, pdata_len);
+		} else {
+			char *pbuf;
+			if (req->tc->pubuf)
+				pbuf = (__force char *) req->tc->pubuf;
+			else
+				pbuf = req->tc->pkbuf;
+
+			in = pack_sg_list(chan->sg, out+inp, VIRTQUEUE_NUM,
+					pbuf, req->tc->pbuf_size);
+		}
+		in += inp;
+	} else {
+		in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM,
+				  req->rc->sdata, req->rc->capacity);
+	}
+
+	err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc);	/* req->tc is the token req_done() gets back */
+	if (err < 0) {
+		if (err == -ENOSPC) {
+			chan->ring_bufs_avail = 0;
+			spin_unlock_irqrestore(&chan->lock, flags);
+			err = wait_event_interruptible(*chan->vc_wq,
+							chan->ring_bufs_avail);
+			if (err  == -ERESTARTSYS)
+				return err;	/* NOTE(review): pages pinned above are not released here - confirm */
+
+			P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n");
+			goto req_retry_pinned;	/* the sg list is rebuilt from scratch on retry */
+		} else {
+			spin_unlock_irqrestore(&chan->lock, flags);
+			P9_DPRINTK(P9_DEBUG_TRANS,
+					"9p debug: "
+					"virtio rpc add_buf returned failure");
+			if (rpinfo && rpinfo->rp_alloc)
+				kfree(rpinfo);
+			return -EIO;	/* NOTE(review): vp_pinned is not reduced on this path - confirm */
+		}
+	}
+
+	virtqueue_kick(chan->vq);
+	spin_unlock_irqrestore(&chan->lock, flags);
+
+	P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n");
+	return 0;
+}
+
+/* sysfs "mount_tag" show: copy the channel's mount tag into @buf */
+static ssize_t p9_mount_tag_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct virtio_chan *chan = dev_to_virtio(dev)->priv;
+
+	/* chan->tag has no terminating NUL, so the precision bounds the
+	 * read; a plain "%s" would run past the end of the allocation. */
+	return snprintf(buf, chan->tag_len + 1, "%.*s",
+			chan->tag_len, chan->tag);
+}
+
+static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
+
+/**
+ * p9_virtio_probe - probe for existence of 9P virtio channels
+ * @vdev: virtio device to probe
+ *
+ * Sets up a channel for @vdev (virtqueue, mount tag, sysfs attribute) and
+ * adds it to virtio_chan_list.  Returns 0 or a negative errno.
+ */
+
+static int p9_virtio_probe(struct virtio_device *vdev)
+{
+	__u16 tag_len;
+	char *tag;
+	int err;
+	struct virtio_chan *chan;
+
+	chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
+	if (!chan) {
+		printk(KERN_ERR "9p: Failed to allocate virtio 9P channel\n");
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	chan->vdev = vdev;
+
+	/* We expect one virtqueue, for requests. */
+	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
+	if (IS_ERR(chan->vq)) {
+		err = PTR_ERR(chan->vq);
+		goto out_free_vq;
+	}
+	chan->vq->vdev->priv = chan;
+	spin_lock_init(&chan->lock);
+
+	sg_init_table(chan->sg, VIRTQUEUE_NUM);
+
+	chan->inuse = false;
+	if (virtio_has_feature(vdev, VIRTIO_9P_MOUNT_TAG)) {
+		vdev->config->get(vdev,
+				offsetof(struct virtio_9p_config, tag_len),
+				&tag_len, sizeof(tag_len));
+	} else {
+		err = -EINVAL;
+		goto out_free_vq;
+	}
+	tag = kmalloc(tag_len, GFP_KERNEL);	/* exactly tag_len bytes: no NUL */
+	if (!tag) {
+		err = -ENOMEM;
+		goto out_free_vq;
+	}
+	vdev->config->get(vdev, offsetof(struct virtio_9p_config, tag),
+			tag, tag_len);
+	chan->tag = tag;
+	chan->tag_len = tag_len;
+	err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+	if (err)
+		goto out_free_tag;
+	chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!chan->vc_wq) {
+		sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+		err = -ENOMEM;
+		goto out_free_tag;
+	}
+	init_waitqueue_head(chan->vc_wq);
+	chan->ring_bufs_avail = 1;
+	/* Ceiling limit to avoid denial of service attacks */
+	chan->p9_max_pages = nr_free_buffer_pages()/4;
+
+	mutex_lock(&virtio_9p_lock);
+	list_add_tail(&chan->chan_list, &virtio_chan_list);
+	mutex_unlock(&virtio_9p_lock);
+	return 0;
+
+out_free_tag:
+	kfree(tag);
+out_free_vq:
+	vdev->config->del_vqs(vdev);
+	kfree(chan);
+fail:
+	return err;
+}
+
+
+/**
+ * p9_virtio_create - allocate a new virtio channel
+ * @client: client instance invoking this transport
+ * @devname: mount tag identifying the channel to connect to
+ * @args: args passed from sys_mount() for per-transport options (unused)
+ *
+ * This sets up a transport channel for 9p communication by claiming a
+ * probed channel whose tag equals @devname exactly.  The inuse flag
+ * ensures that only a single mount has a channel open at a time.
+ *
+ * Returns 0 on success, -EBUSY if every matching channel is in use,
+ * -ENOENT if no channel carries that tag.
+ */
+
+static int
+p9_virtio_create(struct p9_client *client, const char *devname, char *args)
+{
+	struct virtio_chan *chan;
+	int ret = -ENOENT;
+	int found = 0;
+
+	mutex_lock(&virtio_9p_lock);
+	list_for_each_entry(chan, &virtio_chan_list, chan_list) {
+		if (!strncmp(devname, chan->tag, chan->tag_len) &&
+		    strlen(devname) == chan->tag_len) {	/* the tag has no NUL, so lengths are compared too */
+			if (!chan->inuse) {
+				chan->inuse = true;
+				found = 1;
+				break;
+			}
+			ret = -EBUSY;	/* matched but taken; keep looking for a free one */
+		}
+	}
+	mutex_unlock(&virtio_9p_lock);
+
+	if (!found) {
+		printk(KERN_ERR "9p: no channels available\n");
+		return ret;
+	}
+
+	client->trans = (void *)chan;
+	client->status = Connected;
+	chan->client = client;
+
+	return 0;
+}
+
+/**
+ * p9_virtio_remove - clean up resources associated with a virtio device
+ * @vdev: virtio device to remove
+ *
+ */
+
+static void p9_virtio_remove(struct virtio_device *vdev)
+{
+	struct virtio_chan *chan = vdev->priv;
+
+	BUG_ON(chan->inuse);	/* removing a channel that is still mounted is fatal */
+	vdev->config->del_vqs(vdev);
+
+	mutex_lock(&virtio_9p_lock);
+	list_del(&chan->chan_list);	/* p9_virtio_create() can no longer find it */
+	mutex_unlock(&virtio_9p_lock);
+	sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
+	kfree(chan->tag);
+	kfree(chan->vc_wq);
+	kfree(chan);
+
+}
+
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static unsigned int features[] = {
+	VIRTIO_9P_MOUNT_TAG,
+};
+
+/* virtio driver glue: binds probe/remove to devices matching id_table */
+static struct virtio_driver p9_virtio_drv = {
+	.feature_table  = features,
+	.feature_table_size = ARRAY_SIZE(features),
+	.driver.name    = KBUILD_MODNAME,
+	.driver.owner	= THIS_MODULE,
+	.id_table	= id_table,
+	.probe		= p9_virtio_probe,
+	.remove		= p9_virtio_remove,
+};
+
+static struct p9_trans_module p9_virtio_trans = {
+	.name = "virtio",
+	.create = p9_virtio_create,
+	.close = p9_virtio_close,
+	.request = p9_virtio_request,
+	.cancel = p9_virtio_cancel,
+
+	/*
+	 * We leave one entry for input and one entry for response
+	 * headers. We also skip one more entry to accommodate addresses
+	 * that are not at a page boundary, which can result in an extra
+	 * page in zero copy.
+	 */
+	.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
+	.pref = P9_TRANS_PREF_PAYLOAD_SEP,
+	.def = 0,
+	.owner = THIS_MODULE,
+};
+
+/* Module init. NOTE(review): the transport stays registered if register_virtio_driver() fails */
+static int __init p9_virtio_init(void)
+{
+	INIT_LIST_HEAD(&virtio_chan_list);	/* must be ready before any probe can add to it */
+
+	v9fs_register_trans(&p9_virtio_trans);
+	return register_virtio_driver(&p9_virtio_drv);
+}
+
+static void __exit p9_virtio_cleanup(void)
+{
+	unregister_virtio_driver(&p9_virtio_drv);	/* reverse of the order used in p9_virtio_init() */
+	v9fs_unregister_trans(&p9_virtio_trans);
+}
+
+module_init(p9_virtio_init);
+module_exit(p9_virtio_cleanup);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Virtio 9p Transport");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/util.c b/net/9p/util.c
new file mode 100644
index 00000000..9c1c9348
--- /dev/null
+++ b/net/9p/util.c
@@ -0,0 +1,146 @@
+/*
+ *  net/9p/util.c
+ *
+ *  This file contains some helper functions
+ *
+ *  Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
+ *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
+ *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/parser.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <net/9p/9p.h>
+
+/**
+ * struct p9_idpool - per-connection accounting for tag idpool
+ * @lock: protects the pool; taken with interrupts disabled by
+ *        p9_idpool_get() and p9_idpool_put()
+ * @pool: idr to allocate tag id from
+ */
+
+struct p9_idpool {
+	spinlock_t lock;
+	struct idr pool;
+};
+
+/**
+ * p9_idpool_create - allocate and initialise an empty id pool
+ *
+ * Returns the new pool, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+
+struct p9_idpool *p9_idpool_create(void)
+{
+	struct p9_idpool *idpool = kmalloc(sizeof(*idpool), GFP_KERNEL);
+
+	if (idpool == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&idpool->lock);
+	idr_init(&idpool->pool);
+
+	return idpool;
+}
+EXPORT_SYMBOL(p9_idpool_create);
+
+/**
+ * p9_idpool_destroy - destroy an id pool and free its memory
+ * @p: idpool to destroy
+ */
+
+void p9_idpool_destroy(struct p9_idpool *p)
+{
+	idr_destroy(&p->pool);
+	kfree(p);
+}
+EXPORT_SYMBOL(p9_idpool_destroy);
+
+/**
+ * p9_idpool_get - allocate numeric id from pool
+ * @p: pool to allocate from
+ *
+ * Returns the newly allocated id, or -1 if idr_pre_get() fails or
+ * idr_get_new() reports an error other than -EAGAIN (which is retried).
+ *
+ * Bugs: This seems to be an awfully generic function, should it be in idr.c
+ *            with the lock included in struct idr?
+ */
+
+int p9_idpool_get(struct p9_idpool *p)
+{
+	unsigned long flags;
+	int id = 0;
+	int err;
+
+	do {
+		if (!idr_pre_get(&p->pool, GFP_NOFS))
+			return -1;
+
+		spin_lock_irqsave(&p->lock, flags);
+		/* no need to store exactly p, we just need something non-null */
+		err = idr_get_new(&p->pool, p, &id);
+		spin_unlock_irqrestore(&p->lock, flags);
+	} while (err == -EAGAIN);
+
+	if (err)
+		return -1;
+
+	P9_DPRINTK(P9_DEBUG_MUX, " id %d pool %p\n", id, p);
+	return id;
+}
+EXPORT_SYMBOL(p9_idpool_get);
+
+/**
+ * p9_idpool_put - release numeric id from pool
+ * @id: numeric id which is being released
+ * @p: pool to release id into
+ *
+ * Removes @id from the idr while holding the pool lock with interrupts
+ * disabled.
+ *
+ * Bugs: This seems to be an awfully generic function, should it be in idr.c
+ *            with the lock included in struct idr?
+ */
+
+void p9_idpool_put(int id, struct p9_idpool *p)
+{
+	unsigned long flags;
+
+	P9_DPRINTK(P9_DEBUG_MUX, " id %d pool %p\n", id, p);
+
+	spin_lock_irqsave(&p->lock, flags);
+	idr_remove(&p->pool, id);
+	spin_unlock_irqrestore(&p->lock, flags);
+}
+EXPORT_SYMBOL(p9_idpool_put);
+
+/**
+ * p9_idpool_check - check whether the specified id is present in the pool
+ * @id: id to check
+ * @p: pool to check
+ *
+ * Returns non-zero if idr_find() finds an entry for @id (i.e. the id is
+ * currently held in the pool), zero otherwise.  The pool lock is not taken.
+ */
+
+int p9_idpool_check(int id, struct p9_idpool *p)
+{
+	return idr_find(&p->pool, id) != NULL;
+}
+EXPORT_SYMBOL(p9_idpool_check);
+
diff --git a/net/Kconfig b/net/Kconfig
new file mode 100644
index 00000000..919cf9a8
--- /dev/null
+++ b/net/Kconfig
@@ -0,0 +1,341 @@
+#
+# Network configuration
+#
+
+menuconfig NET
+	bool "Networking support"
+	select NLATTR
+	---help---
+	  Unless you really know what you are doing, you should say Y here.
+	  The reason is that some programs need kernel networking support even
+	  when running on a stand-alone machine that isn't connected to any
+	  other computer.
+	  
+	  If you are upgrading from an older kernel, you
+	  should consider updating your networking tools too because changes
+	  in the kernel and the tools often go hand in hand. The tools are
+	  contained in the package net-tools, the location and version number
+	  of which are given in <file:Documentation/Changes>.
+
+	  For a general introduction to Linux networking, it is highly
+	  recommended to read the NET-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+if NET
+
+config WANT_COMPAT_NETLINK_MESSAGES
+	bool
+	help
+	  This option can be selected by other options that need compat
+	  netlink messages.
+
+config COMPAT_NETLINK_MESSAGES
+	def_bool y
+	depends on COMPAT
+	depends on WEXT_CORE || WANT_COMPAT_NETLINK_MESSAGES
+	help
+	  This option makes it possible to send different netlink messages
+	  to tasks depending on whether the task is a compat task or not. To
+	  achieve this, you need to set skb_shinfo(skb)->frag_list to the
+	  compat skb before sending the skb, the netlink code will sort out
+	  which message to actually pass to the task.
+
+	  Newly written code should NEVER need this option but do
+	  compat-independent messages instead!
+
+menu "Networking options"
+
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
+source "net/iucv/Kconfig"
+
+config INET
+	bool "TCP/IP networking"
+	---help---
+	  These are the protocols used on the Internet and on most local
+	  Ethernets. It is highly recommended to say Y here (this will enlarge
+	  your kernel by about 400 KB), since some programs (e.g. the X window
+	  system) use TCP/IP even if your machine is not connected to any
+	  other computer. You will get the so-called loopback device which
+	  allows you to ping yourself (great fun, that!).
+
+	  For an excellent introduction to Linux networking, please read the
+	  Linux Networking HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  If you say Y here and also to "/proc file system support" and
+	  "Sysctl support" below, you can change various aspects of the
+	  behavior of the TCP/IP code by writing to the (virtual) files in
+	  /proc/sys/net/ipv4/*; the options are explained in the file
+	  <file:Documentation/networking/ip-sysctl.txt>.
+
+	  Short answer: say Y.
+
+if INET
+source "net/ipv4/Kconfig"
+source "net/ipv6/Kconfig"
+source "net/netlabel/Kconfig"
+
+endif # if INET
+
+config ANDROID_PARANOID_NETWORK
+	bool "Only allow certain groups to create sockets"
+	default y
+	help
+		none
+
+config NET_ACTIVITY_STATS
+	bool "Network activity statistics tracking"
+	default y
+	help
+	 Network activity statistics are useful for tracking wireless
+	 modem activity on 2G, 3G, 4G wireless networks. Counts number of
+	 transmissions and groups them in specified time buckets.
+
+config NETWORK_SECMARK
+	bool "Security Marking"
+	help
+	  This enables security marking of network packets, similar
+	  to nfmark, but designated for security purposes.
+	  If you are unsure how to answer this question, answer N.
+
+config NETWORK_PHY_TIMESTAMPING
+	bool "Timestamping in PHY devices"
+	depends on EXPERIMENTAL
+	help
+	  This allows timestamping of network packets by PHYs with
+	  hardware timestamping capabilities. This option adds some
+	  overhead in the transmit and receive paths.
+
+	  If you are unsure how to answer this question, answer N.
+
+menuconfig NETFILTER
+	bool "Network packet filtering framework (Netfilter)"
+	---help---
+	  Netfilter is a framework for filtering and mangling network packets
+	  that pass through your Linux box.
+
+	  The most common use of packet filtering is to run your Linux box as
+	  a firewall protecting a local network from the Internet. The type of
+	  firewall provided by this kernel support is called a "packet
+	  filter", which means that it can reject individual network packets
+	  based on type, source, destination etc. The other kind of firewall,
+	  a "proxy-based" one, is more secure but more intrusive and more
+	  bothersome to set up; it inspects the network traffic much more
+	  closely, modifies it and has knowledge about the higher level
+	  protocols, which a packet filter lacks. Moreover, proxy-based
+	  firewalls often require changes to the programs running on the local
+	  clients. Proxy-based firewalls don't need support by the kernel, but
+	  they are often combined with a packet filter, which only works if
+	  you say Y here.
+
+	  You should also say Y here if you intend to use your Linux box as
+	  the gateway to the Internet for a local network of machines without
+	  globally valid IP addresses. This is called "masquerading": if one
+	  of the computers on your local network wants to send something to
+	  the outside, your box can "masquerade" as that computer, i.e. it
+	  forwards the traffic to the intended outside destination, but
+	  modifies the packets to make it look like they came from the
+	  firewall box itself. It works both ways: if the outside host
+	  replies, the Linux box will silently forward the traffic to the
+	  correct local computer. This way, the computers on your local net
+	  are completely invisible to the outside world, even though they can
+	  reach the outside and can receive replies. It is even possible to
+	  run globally visible servers from within a masqueraded local network
+	  using a mechanism called portforwarding. Masquerading is also often
+	  called NAT (Network Address Translation).
+
+	  Another use of Netfilter is in transparent proxying: if a machine on
+	  the local network tries to connect to an outside host, your Linux
+	  box can transparently forward the traffic to a local server,
+	  typically a caching proxy server.
+
+	  Yet another use of Netfilter is building a bridging firewall. Using
+	  a bridge with Network packet filtering enabled makes iptables "see"
+	  the bridged traffic. For filtering on the lower network and Ethernet
+	  protocols over the bridge, use ebtables (under bridge netfilter
+	  configuration).
+
+	  Various modules exist for netfilter which replace the previous
+	  masquerading (ipmasqadm), packet filtering (ipchains), transparent
+	  proxying, and portforwarding mechanisms. Please see
+	  <file:Documentation/Changes> under "iptables" for the location of
+	  these packages.
+
+if NETFILTER
+
+config NETFILTER_DEBUG
+	bool "Network packet filtering debugging"
+	depends on NETFILTER
+	help
+	  You can say Y here if you want to get additional messages useful in
+	  debugging the netfilter code.
+
+config NETFILTER_ADVANCED
+	bool "Advanced netfilter configuration"
+	depends on NETFILTER
+	default y
+	help
+	  If you say Y here you can select between all the netfilter modules.
+	  If you say N the more unusual ones will not be shown and the
+	  basic ones needed by most people will default to 'M'.
+
+	  If unsure, say Y.
+
+config BRIDGE_NETFILTER
+	bool "Bridged IP/ARP packets filtering"
+	depends on BRIDGE && NETFILTER && INET
+	depends on NETFILTER_ADVANCED
+	default y
+	---help---
+	  Enabling this option will let arptables resp. iptables see bridged
+	  ARP resp. IP traffic. If you want a bridging firewall, you probably
+	  want this option enabled.
+	  Enabling or disabling this option doesn't enable or disable
+	  ebtables.
+
+	  If unsure, say N.
+
+source "net/netfilter/Kconfig"
+source "net/ipv4/netfilter/Kconfig"
+source "net/ipv6/netfilter/Kconfig"
+source "net/decnet/netfilter/Kconfig"
+source "net/bridge/netfilter/Kconfig"
+
+endif
+
+source "net/dccp/Kconfig"
+source "net/sctp/Kconfig"
+source "net/rds/Kconfig"
+source "net/tipc/Kconfig"
+source "net/atm/Kconfig"
+source "net/l2tp/Kconfig"
+source "net/802/Kconfig"
+source "net/bridge/Kconfig"
+source "net/dsa/Kconfig"
+source "net/8021q/Kconfig"
+source "net/decnet/Kconfig"
+source "net/llc/Kconfig"
+source "net/ipx/Kconfig"
+source "drivers/net/appletalk/Kconfig"
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
+source "net/econet/Kconfig"
+source "net/wanrouter/Kconfig"
+source "net/phonet/Kconfig"
+source "net/ieee802154/Kconfig"
+source "net/sched/Kconfig"
+source "net/dcb/Kconfig"
+source "net/dns_resolver/Kconfig"
+source "net/batman-adv/Kconfig"
+
+config RPS
+	boolean "RPS"
+	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
+	default y
+
+config RFS_ACCEL
+	boolean
+	depends on RPS && GENERIC_HARDIRQS
+	select CPU_RMAP
+	default y
+
+config XPS
+	boolean
+	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
+	default y
+
+config HAVE_BPF_JIT
+	bool
+
+config BPF_JIT
+	bool "enable BPF Just In Time compiler"
+	depends on HAVE_BPF_JIT
+	depends on MODULES
+	---help---
+	  Berkeley Packet Filter filtering capabilities are normally handled
+	  by an interpreter. This option allows the kernel to generate native
+	  code when a filter is loaded in memory. This should speed up
+	  packet sniffing (libpcap/tcpdump). Note: the admin should enable
+	  this feature by changing /proc/sys/net/core/bpf_jit_enable
+
+menu "Network testing"
+
+config NET_PKTGEN
+	tristate "Packet Generator (USE WITH CAUTION)"
+	depends on PROC_FS
+	---help---
+	  This module will inject preconfigured packets, at a configurable
+	  rate, out of a given interface.  It is used for network interface
+	  stress testing and performance analysis.  If you don't understand
+	  what was just said, you don't need it: say N.
+
+	  Documentation on how to use the packet generator can be found
+	  at <file:Documentation/networking/pktgen.txt>.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called pktgen.
+
+config NET_TCPPROBE
+	tristate "TCP connection probing"
+	depends on INET && EXPERIMENTAL && PROC_FS && KPROBES
+	---help---
+	This module allows for capturing the changes to TCP connection
+	state in response to incoming packets. It is used for debugging
+	TCP congestion avoidance modules. If you don't understand
+	what was just said, you don't need it: say N.
+
+	Documentation on how to use TCP connection probing can be found
+	at:
+	
+	  http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
+
+	To compile this code as a module, choose M here: the
+	module will be called tcp_probe.
+
+config NET_DROP_MONITOR
+	boolean "Network packet drop alerting service"
+	depends on INET && EXPERIMENTAL && TRACEPOINTS
+	---help---
+	This feature provides an alerting service to userspace in the
+	event that packets are discarded in the network stack.  Alerts
+	are broadcast via netlink socket to any listening user space
+	process.  If you don't need network drop alerts, or if you are ok
+	just checking the various proc files and other utilities for
+	drop statistics, say N here.
+
+endmenu
+
+endmenu
+
+source "net/ax25/Kconfig"
+source "net/can/Kconfig"
+source "net/irda/Kconfig"
+source "net/bluetooth/Kconfig"
+source "net/rxrpc/Kconfig"
+
+config FIB_RULES
+	bool
+
+menuconfig WIRELESS
+	bool "Wireless"
+	depends on !S390
+	default y
+
+if WIRELESS
+
+source "net/wireless/Kconfig"
+source "net/mac80211/Kconfig"
+
+endif # WIRELESS
+
+source "net/wimax/Kconfig"
+
+source "net/rfkill/Kconfig"
+source "net/9p/Kconfig"
+source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
+
+
+endif   # if NET
diff --git a/net/Makefile b/net/Makefile
new file mode 100644
index 00000000..54808aba
--- /dev/null
+++ b/net/Makefile
@@ -0,0 +1,71 @@
+#
+# Makefile for the linux networking.
+#
+# 2 Sep 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-y	:= nonet.o
+
+obj-$(CONFIG_NET)		:= socket.o core/
+
+tmp-$(CONFIG_COMPAT) 		:= compat.o
+obj-$(CONFIG_NET)		+= $(tmp-y)
+
+# LLC has to be linked before the files in net/802/
+obj-$(CONFIG_LLC)		+= llc/
+obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/
+obj-$(CONFIG_NETFILTER)		+= netfilter/
+obj-$(CONFIG_INET)		+= ipv4/
+obj-$(CONFIG_XFRM)		+= xfrm/
+obj-$(CONFIG_UNIX)		+= unix/
+obj-$(CONFIG_NET)		+= ipv6/
+obj-$(CONFIG_PACKET)		+= packet/
+obj-$(CONFIG_NET_KEY)		+= key/
+obj-$(CONFIG_BRIDGE)		+= bridge/
+obj-$(CONFIG_NET_DSA)		+= dsa/
+obj-$(CONFIG_IPX)		+= ipx/
+obj-$(CONFIG_ATALK)		+= appletalk/
+obj-$(CONFIG_WAN_ROUTER)	+= wanrouter/
+obj-$(CONFIG_X25)		+= x25/
+obj-$(CONFIG_LAPB)		+= lapb/
+obj-$(CONFIG_NETROM)		+= netrom/
+obj-$(CONFIG_ROSE)		+= rose/
+obj-$(CONFIG_AX25)		+= ax25/
+obj-$(CONFIG_CAN)		+= can/
+obj-$(CONFIG_IRDA)		+= irda/
+obj-$(CONFIG_BT)		+= bluetooth/
+obj-$(CONFIG_SUNRPC)		+= sunrpc/
+obj-$(CONFIG_AF_RXRPC)		+= rxrpc/
+obj-$(CONFIG_ATM)		+= atm/
+obj-$(CONFIG_L2TP)		+= l2tp/
+obj-$(CONFIG_DECNET)		+= decnet/
+obj-$(CONFIG_ECONET)		+= econet/
+obj-$(CONFIG_PHONET)		+= phonet/
+ifneq ($(CONFIG_VLAN_8021Q),)
+obj-y				+= 8021q/
+endif
+obj-$(CONFIG_IP_DCCP)		+= dccp/
+obj-$(CONFIG_IP_SCTP)		+= sctp/
+obj-$(CONFIG_RDS)		+= rds/
+obj-$(CONFIG_WIRELESS)		+= wireless/
+obj-$(CONFIG_MAC80211)		+= mac80211/
+obj-$(CONFIG_TIPC)		+= tipc/
+obj-$(CONFIG_NETLABEL)		+= netlabel/
+obj-$(CONFIG_IUCV)		+= iucv/
+obj-$(CONFIG_RFKILL)		+= rfkill/
+obj-$(CONFIG_NET_9P)		+= 9p/
+obj-$(CONFIG_CAIF)		+= caif/
+ifneq ($(CONFIG_DCB),)
+obj-y				+= dcb/
+endif
+obj-$(CONFIG_IEEE802154)	+= ieee802154/
+
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
+endif
+obj-$(CONFIG_WIMAX)		+= wimax/
+obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
+obj-$(CONFIG_CEPH_LIB)		+= ceph/
+obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
+obj-$(CONFIG_NET_ACTIVITY_STATS)		+= activity_stats.o
diff --git a/net/TUNABLE b/net/TUNABLE
new file mode 100644
index 00000000..9913211f
--- /dev/null
+++ b/net/TUNABLE
@@ -0,0 +1,50 @@
+The following parameters should be tunable at compile time. Some of them
+exist as sysctls too.
+
+This list is far from complete.
+
+Item			Description
+----------------------------------------------------------------------------
+MAX_LINKS		Maximum number of netlink minor devices. (1-32)
+RIF_TABLE_SIZE		Token ring RIF cache size (tunable)
+AARP_HASH_SIZE		Size of Appletalk hash table (tunable)
+AX25_DEF_T1		AX.25 parameters. These are all tunable via
+AX25_DEF_T2		SIOCAX25SETPARMS
+AX25_DEF_T3		T1-T3,N2 have the meanings in the specification
+AX25_DEF_N2
+AX25_DEF_AXDEFMODE	8 = normal 128 is PE1CHL extended
+AX25_DEF_IPDEFMODE	'D' - datagram  'V' - virtual connection
+AX25_DEF_BACKOFF	'E'xponential 'L'inear
+AX25_DEF_NETROM		Allow netrom 1=Y
+AX25_DF_TEXT		Allow PID=Text 1=Y
+AX25_DEF_WINDOW		Window for normal mode
+AX25_DEF_EWINDOW	Window for PE1CHL mode
+AX25_DEF_DIGI		1 for inband 2 for cross band 3 for both
+AX25_DEF_CONMODE	Allow connected modes 1=Yes
+AX25_ROUTE_MAX		AX.25 route cache size - not currently tunable
+Unnamed (16)		Number of protocol hash slots (tunable)
+DEV_NUMBUFFS		Number of priority levels (not easily tunable)
+Unnamed (300)		Maximum packet backlog queue (tunable)
+MAX_IOVEC		Maximum number of iovecs in a message (tunable)
+MIN_WINDOW		Offered minimum window (tunable)
+MAX_WINDOW		Offered maximum window (tunable)
+MAX_HEADER		Largest physical header (tunable)
+MAX_ADDR_LEN		Largest physical address (tunable)
+SOCK_ARRAY_SIZE		IP socket array hash size (tunable)
+IP_MAX_MEMBERSHIPS	Largest number of groups per socket (BSD style) (tunable)
+16			Hard coded constant for amount of room allowed for
+			cache align and faster forwarding (tunable)
+IP_FRAG_TIME		Time we hold a fragment for. (tunable)
+PORT_MASQ_BEGIN		First port reserved for masquerade (tunable)
+PORT_MASQ_END		Last port used for masquerade	(tunable)
+MASQUERADE_EXPIRE_TCP_FIN	Time we keep a masquerade for after a FIN
+MASQUERADE_EXPIRE_UDP	Time we keep a UDP masquerade for (tunable)
+MAXVIFS			Maximum mrouted vifs (1-32)
+MFC_LINES		Lines in the multicast router cache (tunable)
+
+NetROM parameters are tunable via an ioctl passing a struct
+
+4000			Size a Unix domain socket malloc falls back to 
+			(tunable) should be 8K - a bit for 8K machines like
+			the ALPHA
+
diff --git a/net/activity_stats.c b/net/activity_stats.c
new file mode 100644
index 00000000..8a3e9347
--- /dev/null
+++ b/net/activity_stats.c
@@ -0,0 +1,115 @@
+/* net/activity_stats.c
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mike Chan (mike@android.com)
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/suspend.h>
+#include <net/net_namespace.h>
+
+/*
+ * Track transmission rates in buckets (power of 2).
+ * 1,2,4,8...512 seconds.
+ *
+ * Buckets represent the count of network transmissions at least
+ * N seconds apart, where N is 1 << bucket index.
+ */
+#define BUCKET_MAX 10
+
+/* Track network activity frequency */
+static unsigned long activity_stats[BUCKET_MAX];
+static ktime_t last_transmit;	/* ktime_get() time of the last counted transmission */
+static ktime_t suspend_time;	/* PM notifier scratch: suspend start, then suspend duration */
+static DEFINE_SPINLOCK(activity_lock);	/* taken by activity_stats_update() */
+
+/*
+ * Record one network transmission: find the largest bucket whose minimum
+ * gap (1 << index seconds) has elapsed since last_transmit, count it and
+ * restart the gap measurement.  Gaps shorter than one second are not
+ * counted and leave last_transmit untouched.
+ */
+void activity_stats_update(void)
+{
+	unsigned long flags;
+	ktime_t now;
+	s64 delta;
+	int bucket;
+
+	spin_lock_irqsave(&activity_lock, flags);
+
+	now = ktime_get();
+	delta = ktime_to_ns(ktime_sub(now, last_transmit));
+
+	/* Walk from the widest bucket down; the first one that fits wins. */
+	for (bucket = BUCKET_MAX - 1; bucket >= 0; bucket--) {
+		if (delta >= (1000000000ULL << bucket)) {
+			activity_stats[bucket]++;
+			last_transmit = now;
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&activity_lock, flags);
+}
+
+/*
+ * /proc read handler: print a header line followed by one
+ * "<min seconds> <count>" row per bucket.
+ *
+ * The whole table is produced in a single call, so the read is refused
+ * with -ENOMEM unless it starts at offset 0 and the buffer holds at least
+ * (30 * BUCKET_MAX + 22) bytes.  Otherwise returns the number of bytes
+ * stored in @page and sets *@eof.
+ */
+static int activity_stats_read_proc(char *page, char **start, off_t off,
+					int count, int *eof, void *data)
+{
+	int i;
+	int len;
+	char *p = page;
+
+	/* Only print if offset is 0 and we have enough buffer space */
+	if (off || count < (30 * BUCKET_MAX + 22))
+		return -ENOMEM;
+
+	/*
+	 * scnprintf() returns the number of characters actually stored, so
+	 * p can never be advanced past the end of the buffer (and count can
+	 * never go negative) even if a row has to be truncated.
+	 */
+	len = scnprintf(p, count, "Min Bucket(sec) Count\n");
+	count -= len;
+	p += len;
+
+	for (i = 0; i < BUCKET_MAX; i++) {
+		len = scnprintf(p, count, "%15d %lu\n", 1 << i, activity_stats[i]);
+		count -= len;
+		p += len;
+	}
+	*eof = 1;
+
+	return p - page;
+}
+
+/*
+ * PM notifier: remember the wall-clock time when suspend starts and, after
+ * resume, move last_transmit back by the time spent suspended.  Other
+ * events are ignored.  Always returns 0.
+ */
+static int activity_stats_notifier(struct notifier_block *nb,
+					unsigned long event, void *dummy)
+{
+	if (event == PM_SUSPEND_PREPARE) {
+		suspend_time = ktime_get_real();
+	} else if (event == PM_POST_SUSPEND) {
+		/* suspend_time now becomes the suspend duration */
+		suspend_time = ktime_sub(ktime_get_real(), suspend_time);
+		last_transmit = ktime_sub(last_transmit, suspend_time);
+	}
+
+	return 0;
+}
+
+static struct notifier_block activity_stats_notifier_block = {
+	.notifier_call = activity_stats_notifier,	/* registered by activity_stats_init() */
+};
+
+static int  __init activity_stats_init(void)
+{
+	create_proc_read_entry("activity", S_IRUGO,	/* NOTE(review): return value is not checked */
+			init_net.proc_net_stat, activity_stats_read_proc, NULL);
+	return register_pm_notifier(&activity_stats_notifier_block);	/* becomes the initcall result */
+}
+
+subsys_initcall(activity_stats_init);
+
diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile
new file mode 100644
index 00000000..5cda56ed
--- /dev/null
+++ b/net/appletalk/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux AppleTalk layer.
+#
+
+obj-$(CONFIG_ATALK) += appletalk.o
+
+appletalk-y			:= aarp.o ddp.o dev.o
+appletalk-$(CONFIG_PROC_FS)	+= atalk_proc.o
+appletalk-$(CONFIG_SYSCTL)	+= sysctl_net_atalk.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
new file mode 100644
index 00000000..50dce798
--- /dev/null
+++ b/net/appletalk/aarp.c
@@ -0,0 +1,1063 @@
+/*
+ *	AARP:		An implementation of the AppleTalk AARP protocol for
+ *			Ethernet 'ELAP'.
+ *
+ *		Alan Cox  <Alan.Cox@linux.org>
+ *
+ *	This doesn't fit cleanly with the IP arp. Potentially we can use
+ *	the generic neighbour discovery code to clean this up.
+ *
+ *	FIXME:
+ *		We ought to handle the retransmits with a single list and a
+ *	separate fast timer for when it is needed.
+ *		Use neighbour discovery code.
+ *		Token Ring Support.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *
+ *	References:
+ *		Inside AppleTalk (2nd Ed).
+ *	Fixes:
+ *		Jaume Grau	-	flush caches on AARP_PROBE
+ *		Rob Newberry	-	Added proxy AARP and AARP proc fs,
+ *					moved probing from DDP module.
+ *		Arnaldo C. Melo -	don't mangle rx packets
+ *
+ */
+
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <linux/atalk.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME;	/* timer period while nothing is unresolved */
+int sysctl_aarp_tick_time = AARP_TICK_TIME;	/* timer period while entries are unresolved */
+int sysctl_aarp_retransmit_limit = AARP_RETRANSMIT_LIMIT;	/* xmit_count at which __aarp_kick() gives up */
+int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME;
+
+/* Lists of aarp entries */
+/**
+ *	struct aarp_entry - AARP entry
+ *	@last_sent: Last time we xmitted the aarp request
+ *	@packet_queue: Queue of frames waiting for resolution
+ *	@status: Used for proxy AARP
+ *	@expires_at: Entry expiry time
+ *	@target_addr: DDP Address
+ *	@dev: Device to use
+ *	@hwaddr: Physical i/f address of target/router
+ *	@xmit_count: Queries sent so far; the entry is dropped once this
+ *		     reaches sysctl_aarp_retransmit_limit
+ *	@next: Next entry in chain
+ */
+struct aarp_entry {
+	/* These first two are only used for unresolved entries */
+	unsigned long		last_sent;
+	struct sk_buff_head	packet_queue;
+	int			status;
+	unsigned long		expires_at;
+	struct atalk_addr	target_addr;
+	struct net_device	*dev;
+	char			hwaddr[6];
+	unsigned short		xmit_count;
+	struct aarp_entry	*next;
+};
+
+/* Hashed lists of resolved, unresolved and proxy entries */
+static struct aarp_entry *resolved[AARP_HASH_SIZE];
+static struct aarp_entry *unresolved[AARP_HASH_SIZE];
+static struct aarp_entry *proxies[AARP_HASH_SIZE];
+static int unresolved_count;	/* non-zero selects the fast tick in aarp_expire_timeout() */
+
+/* One lock protects it all. */
+static DEFINE_RWLOCK(aarp_lock);
+
+/* Used to walk the lists and purge/kick entries.  */
+static struct timer_list aarp_timer;
+
+/*
+ *	Free an aarp entry, purging its packet queue first.
+ *
+ *	Must run under aarp_lock.
+ */
+static void __aarp_expire(struct aarp_entry *a)
+{
+	skb_queue_purge(&a->packet_queue);
+	kfree(a);
+}
+
+/*
+ *	Build an AARP request for the entry's target address and hand it to
+ *	aarp_dl for the AARP multicast address, then bump the entry's
+ *	xmit_count and last_sent.  Nothing is sent, and the counters are left
+ *	alone, if the skb allocation fails or atalk_find_dev_addr() finds no
+ *	address for the device.
+ *
+ *	Must run under aarp_lock.
+ */
+static void __aarp_send_query(struct aarp_entry *a)
+{
+	static unsigned char aarp_eth_multicast[ETH_ALEN] =
+					{ 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF };
+	struct net_device *dev = a->dev;
+	struct elapaarp *eah;
+	int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length;
+	struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+	struct atalk_addr *sat = atalk_find_dev_addr(dev);
+
+	if (!skb)
+		return;
+
+	if (!sat) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* Set up the buffer: leave headroom for the link and datalink headers */
+	skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_put(skb, sizeof(*eah));
+	skb->protocol    = htons(ETH_P_ATALK);
+	skb->dev	 = dev;
+	eah		 = aarp_hdr(skb);
+
+	/* Set up the ARP */
+	eah->hw_type	 = htons(AARP_HW_TYPE_ETHERNET);
+	eah->pa_type	 = htons(ETH_P_ATALK);
+	eah->hw_len	 = ETH_ALEN;
+	eah->pa_len	 = AARP_PA_ALEN;
+	eah->function	 = htons(AARP_REQUEST);
+
+	memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN);
+
+	eah->pa_src_zero = 0;
+	eah->pa_src_net	 = sat->s_net;
+	eah->pa_src_node = sat->s_node;
+
+	memset(eah->hw_dst, '\0', ETH_ALEN);
+
+	eah->pa_dst_zero = 0;
+	eah->pa_dst_net	 = a->target_addr.s_net;
+	eah->pa_dst_node = a->target_addr.s_node;
+
+	/* Send it */
+	aarp_dl->request(aarp_dl, skb, aarp_eth_multicast);
+	/* Update the sending count */
+	a->xmit_count++;
+	a->last_sent = jiffies;
+}
+
+/* Build and send an AARP reply carrying @us as the source and @them as the
+ * destination protocol address.  @sha is copied into the frame's hw_dst
+ * (zeroed when NULL) and is also passed to aarp_dl->request() as the
+ * destination.
+ *
+ * This runs under aarp_lock and in softint context, so only atomic memory
+ * allocations can be used. */
+static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us,
+			    struct atalk_addr *them, unsigned char *sha)
+{
+	struct elapaarp *eah;
+	int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length;
+	struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+
+	if (!skb)
+		return;
+
+	/* Set up the buffer */
+	skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_put(skb, sizeof(*eah));
+	skb->protocol    = htons(ETH_P_ATALK);
+	skb->dev	 = dev;
+	eah		 = aarp_hdr(skb);
+
+	/* Set up the ARP */
+	eah->hw_type	 = htons(AARP_HW_TYPE_ETHERNET);
+	eah->pa_type	 = htons(ETH_P_ATALK);
+	eah->hw_len	 = ETH_ALEN;
+	eah->pa_len	 = AARP_PA_ALEN;
+	eah->function	 = htons(AARP_REPLY);
+
+	memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN);
+
+	eah->pa_src_zero = 0;
+	eah->pa_src_net	 = us->s_net;
+	eah->pa_src_node = us->s_node;
+
+	if (!sha)
+		memset(eah->hw_dst, '\0', ETH_ALEN);
+	else
+		memcpy(eah->hw_dst, sha, ETH_ALEN);
+
+	eah->pa_dst_zero = 0;
+	eah->pa_dst_net	 = them->s_net;
+	eah->pa_dst_node = them->s_node;
+
+	/* Send it; NOTE(review): sha may be NULL here - confirm the datalink copes */
+	aarp_dl->request(aarp_dl, skb, sha);
+}
+
+/*
+ *	Send probe frames. Called from aarp_probe_network and
+ *	aarp_proxy_probe_network.
+ *
+ *	The probe carries @us as both the source and the destination
+ *	protocol address and is handed to aarp_dl for the AARP multicast
+ *	address.  Silently does nothing if the skb allocation fails.
+ */
+
+static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us)
+{
+	struct elapaarp *eah;
+	int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length;
+	struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);
+	static unsigned char aarp_eth_multicast[ETH_ALEN] =
+					{ 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF };
+
+	if (!skb)
+		return;
+
+	/* Set up the buffer */
+	skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_put(skb, sizeof(*eah));
+	skb->protocol    = htons(ETH_P_ATALK);
+	skb->dev	 = dev;
+	eah		 = aarp_hdr(skb);
+
+	/* Set up the ARP */
+	eah->hw_type	 = htons(AARP_HW_TYPE_ETHERNET);
+	eah->pa_type	 = htons(ETH_P_ATALK);
+	eah->hw_len	 = ETH_ALEN;
+	eah->pa_len	 = AARP_PA_ALEN;
+	eah->function	 = htons(AARP_PROBE);
+
+	memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN);
+
+	eah->pa_src_zero = 0;
+	eah->pa_src_net	 = us->s_net;
+	eah->pa_src_node = us->s_node;
+
+	memset(eah->hw_dst, '\0', ETH_ALEN);
+
+	eah->pa_dst_zero = 0;
+	eah->pa_dst_net	 = us->s_net;
+	eah->pa_dst_node = us->s_node;
+
+	/* Send it */
+	aarp_dl->request(aarp_dl, skb, aarp_eth_multicast);
+}
+
+/*
+ *	Walk one hash chain and free every entry whose expires_at has passed.
+ *
+ *	Must run under the aarp_lock.
+ */
+
+static void __aarp_expire_timer(struct aarp_entry **n)
+{
+	struct aarp_entry *t;
+
+	while ((t = *n) != NULL) {
+		if (!time_after(jiffies, t->expires_at)) {
+			/* Still valid: step to the next link */
+			n = &t->next;
+			continue;
+		}
+
+		/* Expired: unlink it, then free it */
+		*n = t->next;
+		__aarp_expire(t);
+	}
+}
+
+/*
+ *	Kick all pending requests on one hash chain: re-send the query for
+ *	each entry, or drop the entry once its xmit_count has reached
+ *	sysctl_aarp_retransmit_limit.
+ *
+ *	Must run under the aarp_lock.
+ */
+static void __aarp_kick(struct aarp_entry **n)
+{
+	struct aarp_entry *t;
+
+	while ((t = *n) != NULL) {
+		if (t->xmit_count >= sysctl_aarp_retransmit_limit) {
+			/* Out of retries: unlink and free instead of sending */
+			*n = t->next;
+			__aarp_expire(t);
+			continue;
+		}
+
+		__aarp_send_query(t);
+		n = &t->next;
+	}
+}
+
+/*
+ *	A device has gone down. Remove and free every entry on this hash
+ *	chain that refers to the device.
+ *
+ *	Must run under the aarp_lock.
+ */
+static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev)
+{
+	struct aarp_entry *t;
+
+	while ((t = *n) != NULL) {
+		if (t->dev != dev) {
+			/* Belongs to another device: keep it */
+			n = &t->next;
+			continue;
+		}
+
+		*n = t->next;
+		__aarp_expire(t);
+	}
+}
+
+/*
+ * Handle the timer event: expire stale entries on every chain, re-send
+ * queries for unresolved ones, then re-arm the timer.  The next period is
+ * sysctl_aarp_tick_time while unresolved_count is non-zero and
+ * sysctl_aarp_expiry_time otherwise.
+ */
+static void aarp_expire_timeout(unsigned long unused)
+{
+	unsigned long period;
+	int bucket;
+
+	write_lock_bh(&aarp_lock);
+
+	for (bucket = 0; bucket < AARP_HASH_SIZE; bucket++) {
+		__aarp_expire_timer(&resolved[bucket]);
+		__aarp_kick(&unresolved[bucket]);
+		__aarp_expire_timer(&unresolved[bucket]);
+		__aarp_expire_timer(&proxies[bucket]);
+	}
+
+	write_unlock_bh(&aarp_lock);
+
+	if (unresolved_count)
+		period = sysctl_aarp_tick_time;
+	else
+		period = sysctl_aarp_expiry_time;
+
+	mod_timer(&aarp_timer, jiffies + period);
+}
+
+/*
+ * Network device notifier chain handler.  On NETDEV_DOWN for a device in
+ * init_net, drop every AARP entry that refers to the device.  Always
+ * returns NOTIFY_DONE.
+ */
+static int aarp_device_event(struct notifier_block *this, unsigned long event,
+			     void *ptr)
+{
+	struct net_device *dev = ptr;
+	int bucket;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	/* Only a device going down is of interest */
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	write_lock_bh(&aarp_lock);
+
+	for (bucket = 0; bucket < AARP_HASH_SIZE; bucket++) {
+		__aarp_expire_device(&resolved[bucket], dev);
+		__aarp_expire_device(&unresolved[bucket], dev);
+		__aarp_expire_device(&proxies[bucket], dev);
+	}
+
+	write_unlock_bh(&aarp_lock);
+
+	return NOTIFY_DONE;
+}
+
+/* Empty one hash chain, passing every entry to __aarp_expire(). */
+static void __aarp_expire_all(struct aarp_entry **n)
+{
+	struct aarp_entry *head;
+
+	for (head = *n; head; head = *n) {
+		/* Pop the head before it is expired. */
+		*n = head->next;
+		__aarp_expire(head);
+	}
+}
+
+/* Cleanup all hash chains -- module unloading */
+static void aarp_purge(void)
+{
+	int ct;
+
+	write_lock_bh(&aarp_lock);
+	for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
+		__aarp_expire_all(&resolved[ct]);
+		__aarp_expire_all(&unresolved[ct]);
+		__aarp_expire_all(&proxies[ct]);
+	}
+	write_unlock_bh(&aarp_lock);
+}
+
+/*
+ *	Allocate and initialise a new AARP entry.
+ *
+ *	GFP_ATOMIC is required because aarp_send_ddp() calls this while
+ *	holding aarp_lock.
+ *
+ *	The entry is returned zero-filled.  Not every caller assigns every
+ *	field (aarp_proxy_probe_network() never sets hwaddr, xmit_count or
+ *	last_sent; aarp_send_ddp() never sets status), while the /proc
+ *	listing prints hwaddr for entries of all three tables.  Zeroing
+ *	keeps stale heap contents from being exposed that way.
+ *
+ *	Returns the new entry with an empty packet queue, or NULL when the
+ *	allocation fails.
+ */
+static struct aarp_entry *aarp_alloc(void)
+{
+	struct aarp_entry *a = kzalloc(sizeof(*a), GFP_ATOMIC);
+
+	if (a)
+		skb_queue_head_init(&a->packet_queue);
+	return a;
+}
+
+/*
+ * Look up (dev, net:node) on one hash chain.
+ *
+ * The match does not look at expires_at, so an entry that has expired
+ * but has not been purged yet may be returned.
+ *
+ * Returns the matching entry or NULL.  Caller must hold aarp_lock.
+ */
+static struct aarp_entry *__aarp_find_entry(struct aarp_entry *list,
+					    struct net_device *dev,
+					    struct atalk_addr *sat)
+{
+	struct aarp_entry *cur;
+
+	for (cur = list; cur; cur = cur->next) {
+		if (cur->target_addr.s_net != sat->s_net)
+			continue;
+		if (cur->target_addr.s_node != sat->s_node)
+			continue;
+		if (cur->dev == dev)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Retire the proxy entry for (dev, sa), if there is one.  The entry is not
+ * freed here: its expiry time is moved into the past so that the next
+ * timer run removes it.  Called from the DDP code.
+ */
+void aarp_proxy_remove(struct net_device *dev, struct atalk_addr *sa)
+{
+	struct aarp_entry *proxy;
+	int bucket;
+
+	bucket = sa->s_node % (AARP_HASH_SIZE - 1);
+
+	write_lock_bh(&aarp_lock);
+
+	proxy = __aarp_find_entry(proxies[bucket], dev, sa);
+	if (proxy != NULL)
+		proxy->expires_at = jiffies - 1;
+
+	write_unlock_bh(&aarp_lock);
+}
+
+/*
+ * Report whether (dev, sa) has an entry in the proxy table.  Returns the
+ * caller's own sa pointer when it does, NULL otherwise.
+ *
+ * Caller must hold aarp_lock.
+ */
+static struct atalk_addr *__aarp_proxy_find(struct net_device *dev,
+					    struct atalk_addr *sa)
+{
+	int bucket = sa->s_node % (AARP_HASH_SIZE - 1);
+
+	if (__aarp_find_entry(proxies[bucket], dev, sa) == NULL)
+		return NULL;
+
+	return sa;
+}
+
+/*
+ * Probe a Phase 1 device or a device that requires its Net:Node to
+ * be set via an ioctl.
+ *
+ * The wanted address is pushed to the driver with SIOCSIFADDR and read
+ * back with SIOCGIFADDR.  If the driver reports a different address,
+ * ATIF_PROBE_FAIL is set and the interface adopts the reported one.
+ * When the driver has no ioctl handler, or either ioctl fails, the
+ * interface is left untouched.
+ */
+static void aarp_send_probe_phase1(struct atalk_iface *iface)
+{
+	struct ifreq atreq;
+	struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr;
+	const struct net_device_ops *ops = iface->dev->netdev_ops;
+
+	/* A driver without an ioctl entry point cannot be asked. */
+	if (!ops->ndo_do_ioctl)
+		return;
+
+	/* Do not hand uninitialised stack bytes to the driver. */
+	memset(&atreq, 0, sizeof(atreq));
+
+	sa->sat_addr.s_node = iface->address.s_node;
+	sa->sat_addr.s_net = ntohs(iface->address.s_net);
+
+	/* We pass the Net:Node to the drivers/cards by a Device ioctl. */
+	if (ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))
+		return;
+
+	/* A failed read-back leaves nothing trustworthy to compare. */
+	if (ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR))
+		return;
+
+	if (iface->address.s_net != htons(sa->sat_addr.s_net) ||
+	    iface->address.s_node != sa->sat_addr.s_node)
+		iface->status |= ATIF_PROBE_FAIL;
+
+	iface->address.s_net  = htons(sa->sat_addr.s_net);
+	iface->address.s_node = sa->sat_addr.s_node;
+}
+
+
+/*
+ * Probe for the address held in atif.  LocalTalk and PPP devices go through
+ * the ioctl based phase 1 probe; anything else sends up to
+ * AARP_RETRANSMIT_LIMIT probes 100ms apart and stops early once
+ * ATIF_PROBE_FAIL shows up in atif->status.  Sleeps, so it must be called
+ * from a context that may sleep.
+ */
+void aarp_probe_network(struct atalk_iface *atif)
+{
+	unsigned int attempt = 0;
+
+	if (atif->dev->type == ARPHRD_LOCALTLK ||
+	    atif->dev->type == ARPHRD_PPP) {
+		aarp_send_probe_phase1(atif);
+		return;
+	}
+
+	while (attempt < AARP_RETRANSMIT_LIMIT) {
+		aarp_send_probe(atif->dev, &atif->address);
+
+		/* Defer 1/10th */
+		msleep(100);
+
+		if (atif->status & ATIF_PROBE_FAIL)
+			return;
+
+		attempt++;
+	}
+}
+
+/*
+ * Probe for address sa on atif's device on behalf of a proxy entry.
+ *
+ * A new entry marked ATIF_PROBE is linked into the proxy table and up to
+ * AARP_RETRANSMIT_LIMIT probes are sent 100ms apart.  aarp_rcv() sets
+ * ATIF_PROBE_FAIL on the entry when it sees a frame whose destination
+ * matches the probed address.
+ *
+ * Returns 1 when the probing flag was cleared without a failure being
+ * recorded, -EADDRINUSE when the probe failed, -ENOMEM when no entry
+ * could be allocated and -EPROTONOSUPPORT for LocalTalk and PPP devices.
+ *
+ * Sleeps (msleep), so the caller must be in a context that may sleep.
+ *
+ * NOTE(review): aarp_lock is released around each msleep() while "entry"
+ * is already reachable through proxies[].  Other paths in this file
+ * (__aarp_expire_device(), __aarp_expire_all()) unlink entries from that
+ * table under the lock; confirm that "entry" cannot be released by one of
+ * them while this function is asleep.
+ */
+int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
+{
+	int hash, retval = -EPROTONOSUPPORT;
+	struct aarp_entry *entry;
+	unsigned int count;
+
+	/*
+	 * we don't currently support LocalTalk or PPP for proxy AARP;
+	 * if someone wants to try and add it, have fun
+	 */
+	if (atif->dev->type == ARPHRD_LOCALTLK ||
+	    atif->dev->type == ARPHRD_PPP)
+		goto out;
+
+	/*
+	 * create a new AARP entry with the flags set to be published --
+	 * we need this one to hang around even if it's in use
+	 */
+	entry = aarp_alloc();
+	retval = -ENOMEM;
+	if (!entry)
+		goto out;
+
+	/*
+	 * NOTE(review): -1 looks intended as "never expires", but the timer
+	 * path tests expiry with time_after(jiffies, expires_at), which is a
+	 * wrapping comparison -- confirm this value behaves as intended.
+	 */
+	entry->expires_at = -1;
+	entry->status = ATIF_PROBE;
+	entry->target_addr.s_node = sa->s_node;
+	entry->target_addr.s_net = sa->s_net;
+	entry->dev = atif->dev;
+
+	write_lock_bh(&aarp_lock);
+
+	/* Publish the entry at the head of its proxy hash chain. */
+	hash = sa->s_node % (AARP_HASH_SIZE - 1);
+	entry->next = proxies[hash];
+	proxies[hash] = entry;
+
+	for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
+		aarp_send_probe(atif->dev, sa);
+
+		/* Defer 1/10th */
+		/* The lock is dropped because msleep() sleeps. */
+		write_unlock_bh(&aarp_lock);
+		msleep(100);
+		write_lock_bh(&aarp_lock);
+
+		if (entry->status & ATIF_PROBE_FAIL)
+			break;
+	}
+
+	if (entry->status & ATIF_PROBE_FAIL) {
+		entry->expires_at = jiffies - 1; /* free the entry */
+		retval = -EADDRINUSE; /* return network full */
+	} else { /* clear the probing flag */
+		entry->status &= ~ATIF_PROBE;
+		retval = 1;
+	}
+
+	write_unlock_bh(&aarp_lock);
+out:
+	return retval;
+}
+
+/*
+ * Send a DDP frame to AppleTalk address sa through dev.
+ *
+ *  - LocalTalk: a 3 byte LLAP header is prepended (after shortening the
+ *    DDP header when source and destination net allow it) and the frame
+ *    is queued on the device directly.
+ *  - PPP: the frame is queued on the device as ETH_P_PPPTALK.
+ *  - Ethernet: broadcasts go to the AppleTalk multicast address; unicast
+ *    frames are sent to the cached hardware address, or parked on an
+ *    unresolved entry (created on demand) until AARP resolves it.
+ *  - Any other device type: the frame is freed.
+ *
+ * hwaddr is not used by this function.
+ *
+ * Returns NET_XMIT_SUCCESS when the frame was sent or queued, and
+ * NET_XMIT_DROP when it was dropped.  The skb is consumed on all paths.
+ */
+int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
+		  struct atalk_addr *sa, void *hwaddr)
+{
+	static char ddp_eth_multicast[ETH_ALEN] =
+		{ 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF };
+	int hash;
+	struct aarp_entry *a;
+
+	skb_reset_network_header(skb);
+
+	/* Check for LocalTalk first */
+	if (dev->type == ARPHRD_LOCALTLK) {
+		struct atalk_addr *at = atalk_find_dev_addr(dev);
+		struct ddpehdr *ddp = (struct ddpehdr *)skb->data;
+		int ft = 2;
+
+		/*
+		 * Without an AppleTalk address on this device there is no
+		 * source node to put in the LLAP header; drop the frame
+		 * rather than dereference a NULL address.
+		 */
+		if (!at)
+			goto free_it;
+
+		/*
+		 * Compressible ?
+		 *
+		 * IFF: src_net == dest_net == device_net
+		 * (zero matches anything)
+		 */
+
+		if ((!ddp->deh_snet || at->s_net == ddp->deh_snet) &&
+		    (!ddp->deh_dnet || at->s_net == ddp->deh_dnet)) {
+			skb_pull(skb, sizeof(*ddp) - 4);
+
+			/*
+			 *	The upper two remaining bytes are the port
+			 *	numbers	we just happen to need. Now put the
+			 *	length in the lower two.
+			 */
+			*((__be16 *)skb->data) = htons(skb->len);
+			ft = 1;
+		}
+		/*
+		 * Nice and easy. No AARP type protocols occur here so we can
+		 * just shovel it out with a 3 byte LLAP header
+		 */
+
+		skb_push(skb, 3);
+		skb->data[0] = sa->s_node;
+		skb->data[1] = at->s_node;
+		skb->data[2] = ft;
+		skb->dev     = dev;
+		goto sendit;
+	}
+
+	/* On a PPP link we neither compress nor aarp.  */
+	if (dev->type == ARPHRD_PPP) {
+		skb->protocol = htons(ETH_P_PPPTALK);
+		skb->dev = dev;
+		goto sendit;
+	}
+
+	/* Non ELAP we cannot do. */
+	if (dev->type != ARPHRD_ETHER)
+		goto free_it;
+
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_ATALK);
+	hash = sa->s_node % (AARP_HASH_SIZE - 1);
+
+	/* Broadcasts need no resolution: use the multicast address. */
+	if (sa->s_node == ATADDR_BCAST) {
+		/* Send it */
+		ddp_dl->request(ddp_dl, skb, ddp_eth_multicast);
+		goto sent;
+	}
+
+	/* Do we have a resolved entry? */
+	write_lock_bh(&aarp_lock);
+	a = __aarp_find_entry(resolved[hash], dev, sa);
+
+	if (a) { /* Known address: refresh the entry and transmit */
+		a->expires_at = jiffies + (sysctl_aarp_expiry_time * 10);
+		ddp_dl->request(ddp_dl, skb, a->hwaddr);
+		write_unlock_bh(&aarp_lock);
+		goto sent;
+	}
+
+	/* Do we have an unresolved entry: This is the less common path */
+	a = __aarp_find_entry(unresolved[hash], dev, sa);
+	if (a) { /* Queue onto the unresolved queue */
+		skb_queue_tail(&a->packet_queue, skb);
+		goto out_unlock;
+	}
+
+	/* Allocate a new entry */
+	a = aarp_alloc();
+	if (!a) {
+		/* Whoops slipped... good job it's an unreliable protocol 8) */
+		write_unlock_bh(&aarp_lock);
+		goto free_it;
+	}
+
+	/* Set up the queue */
+	skb_queue_tail(&a->packet_queue, skb);
+	a->expires_at	 = jiffies + sysctl_aarp_resolve_time;
+	a->dev		 = dev;
+	a->next		 = unresolved[hash];
+	a->target_addr	 = *sa;
+	a->xmit_count	 = 0;
+	unresolved[hash] = a;
+	unresolved_count++;
+
+	/* Send an initial request for the address */
+	__aarp_send_query(a);
+
+	/*
+	 * Switch to fast timer if needed (That is if this is the first
+	 * unresolved entry to get added)
+	 */
+
+	if (unresolved_count == 1)
+		mod_timer(&aarp_timer, jiffies + sysctl_aarp_tick_time);
+
+	/* Now finally, it is safe to drop the lock. */
+out_unlock:
+	write_unlock_bh(&aarp_lock);
+
+	/* Tell the ddp layer we have taken over for this frame. */
+	goto sent;
+
+sendit:
+	if (skb->sk)
+		skb->priority = skb->sk->sk_priority;
+	/* The skb has been handed to dev_queue_xmit(); do not free it here. */
+	if (dev_queue_xmit(skb))
+		goto drop;
+sent:
+	return NET_XMIT_SUCCESS;
+free_it:
+	kfree_skb(skb);
+drop:
+	return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(aarp_send_ddp);
+
+/*
+ *	Entry a on the given unresolved chain now has a hardware address:
+ *	move it to the head of resolved[hash] and transmit every frame
+ *	that was queued on it.
+ *
+ *	Caller must hold aarp_lock.
+ */
+static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a,
+			    int hash)
+{
+	struct sk_buff *pending;
+
+	while (*list) {
+		if (*list != a) {
+			list = &(*list)->next;
+			continue;
+		}
+
+		/* Take it off the unresolved chain ... */
+		unresolved_count--;
+		*list = a->next;
+
+		/* ... and push it onto the resolved one. */
+		a->next = resolved[hash];
+		resolved[hash] = a;
+
+		/* Flush the backlog, refreshing the expiry per frame. */
+		for (;;) {
+			pending = skb_dequeue(&a->packet_queue);
+			if (pending == NULL)
+				break;
+
+			a->expires_at = jiffies +
+					sysctl_aarp_expiry_time * 10;
+			ddp_dl->request(ddp_dl, pending, a->hwaddr);
+		}
+	}
+}
+
+/*
+ *	This is called by the SNAP driver whenever we see an AARP SNAP
+ *	frame. We currently only support Ethernet.
+ *
+ *	Processing order:
+ *	  1. reject frames from other namespaces, non-Ethernet devices,
+ *	     short frames and frames with implausible header fields;
+ *	  2. if the interface itself is probing for the frame's destination
+ *	     address, fail that probe;
+ *	  3. if a proxy entry is probing for it, fail that probe;
+ *	  4. otherwise handle the frame as a reply (resolve a pending
+ *	     entry) or as a request/probe (answer if the address is ours
+ *	     or one we proxy for).
+ *
+ *	The skb is always freed here.  Returns 0 when the frame was
+ *	rejected in step 1, 1 in every other case.
+ */
+static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct elapaarp *ea = aarp_hdr(skb);
+	int hash, ret = 0;
+	__u16 function;
+	struct aarp_entry *a;
+	struct atalk_addr sa, *ma, da;
+	struct atalk_iface *ifa;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto out0;
+
+	/* We only do Ethernet SNAP AARP. */
+	if (dev->type != ARPHRD_ETHER)
+		goto out0;
+
+	/* Frame size ok? */
+	if (!skb_pull(skb, sizeof(*ea)))
+		goto out0;
+
+	function = ntohs(ea->function);
+
+	/* Sanity check fields. */
+	if (function < AARP_REQUEST || function > AARP_PROBE ||
+	    ea->hw_len != ETH_ALEN || ea->pa_len != AARP_PA_ALEN ||
+	    ea->pa_src_zero || ea->pa_dst_zero)
+		goto out0;
+
+	/* Looks good. */
+	hash = ea->pa_src_node % (AARP_HASH_SIZE - 1);
+
+	/* Build an address. */
+	sa.s_node = ea->pa_src_node;
+	sa.s_net = ea->pa_src_net;
+
+	/* Process the packet. Check for replies of me. */
+	ifa = atalk_find_dev(dev);
+	if (!ifa)
+		goto out1;
+
+	if (ifa->status & ATIF_PROBE &&
+	    ifa->address.s_node == ea->pa_dst_node &&
+	    ifa->address.s_net == ea->pa_dst_net) {
+		ifa->status |= ATIF_PROBE_FAIL; /* Fail the probe (in use) */
+		goto out1;
+	}
+
+	/* Check for replies of proxy AARP entries */
+	da.s_node = ea->pa_dst_node;
+	da.s_net  = ea->pa_dst_net;
+
+	write_lock_bh(&aarp_lock);
+	/*
+	 * NOTE(review): the bucket index was derived from the source node
+	 * above, while the proxy lookup key is the destination address --
+	 * confirm this is the intended bucket.
+	 */
+	a = __aarp_find_entry(proxies[hash], dev, &da);
+
+	if (a && a->status & ATIF_PROBE) {
+		a->status |= ATIF_PROBE_FAIL;
+		/*
+		 * we do not respond to probe or request packets for
+		 * this address while we are probing this address
+		 */
+		goto unlock;
+	}
+
+	switch (function) {
+		case AARP_REPLY:
+			if (!unresolved_count)	/* Speed up */
+				break;
+
+			/* Find the entry.  */
+			a = __aarp_find_entry(unresolved[hash], dev, &sa);
+			if (!a || dev != a->dev)
+				break;
+
+			/* We can fill one in - this is good. */
+			memcpy(a->hwaddr, ea->hw_src, ETH_ALEN);
+			__aarp_resolved(&unresolved[hash], a, hash);
+			/* Nothing pending any more: use the expiry interval. */
+			if (!unresolved_count)
+				mod_timer(&aarp_timer,
+					  jiffies + sysctl_aarp_expiry_time);
+			break;
+
+		case AARP_REQUEST:
+		case AARP_PROBE:
+
+			/*
+			 * If it is my address set ma to my address and reply.
+			 * We can treat probe and request the same.  Probe
+			 * simply means we shouldn't cache the querying host,
+			 * as in a probe they are proposing an address not
+			 * using one.
+			 *
+			 * Support for proxy-AARP added. We check if the
+			 * address is one of our proxies before we toss the
+			 * packet out.
+			 */
+
+			/* From here on sa holds the address being asked for. */
+			sa.s_node = ea->pa_dst_node;
+			sa.s_net  = ea->pa_dst_net;
+
+			/* See if we have a matching proxy. */
+			ma = __aarp_proxy_find(dev, &sa);
+			if (!ma)
+				ma = &ifa->address;
+			else { /* We need to make a copy of the entry. */
+				da.s_node = sa.s_node;
+				da.s_net = sa.s_net;
+				ma = &da;
+			}
+
+			if (function == AARP_PROBE) {
+				/*
+				 * A probe implies someone trying to get an
+				 * address. So as a precaution flush any
+				 * entries we have for this address.
+				 */
+				a = __aarp_find_entry(resolved[sa.s_node %
+							  (AARP_HASH_SIZE - 1)],
+						      skb->dev, &sa);
+
+				/*
+				 * Make it expire next tick - that avoids us
+				 * getting into a probe/flush/learn/probe/
+				 * flush/learn cycle during probing of a slow
+				 * to respond host addr.
+				 */
+				if (a) {
+					a->expires_at = jiffies - 1;
+					mod_timer(&aarp_timer, jiffies +
+							sysctl_aarp_tick_time);
+				}
+			}
+
+			/* Not asking for our node: nothing to answer. */
+			if (sa.s_node != ma->s_node)
+				break;
+
+			/* A zero net on either side matches any net. */
+			if (sa.s_net && ma->s_net && sa.s_net != ma->s_net)
+				break;
+
+			/* Switch sa back to the sender, the reply's target. */
+			sa.s_node = ea->pa_src_node;
+			sa.s_net = ea->pa_src_net;
+
+			/* ma now holds the address to answer as. */
+			aarp_send_reply(dev, ma, &sa, ea->hw_src);
+			break;
+	}
+
+unlock:
+	write_unlock_bh(&aarp_lock);
+out1:
+	ret = 1;
+out0:
+	kfree_skb(skb);
+	return ret;
+}
+
+/* Netdevice notifier; registered in aarp_proto_init(). */
+static struct notifier_block aarp_notifier = {
+	.notifier_call = aarp_device_event,
+};
+
+/* SNAP protocol id under which aarp_rcv() is registered. */
+static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 };
+
+/*
+ * Bring AARP up: register with SNAP, start the table ageing timer and
+ * subscribe to netdevice events.  A failed SNAP registration is only
+ * logged; initialisation carries on regardless.
+ */
+void __init aarp_proto_init(void)
+{
+	aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
+	if (aarp_dl == NULL)
+		printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
+
+	/* First run is one expiry interval from now. */
+	setup_timer(&aarp_timer, aarp_expire_timeout, 0);
+	mod_timer(&aarp_timer, jiffies + sysctl_aarp_expiry_time);
+
+	register_netdevice_notifier(&aarp_notifier);
+}
+
+/* Remove the AARP entries associated with a device. */
+void aarp_device_down(struct net_device *dev)
+{
+	int ct;
+
+	write_lock_bh(&aarp_lock);
+
+	for (ct = 0; ct < AARP_HASH_SIZE; ct++) {
+		__aarp_expire_device(&resolved[ct], dev);
+		__aarp_expire_device(&unresolved[ct], dev);
+		__aarp_expire_device(&proxies[ct], dev);
+	}
+
+	write_unlock_bh(&aarp_lock);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Cursor of the /proc AARP listing; lives in seq_file->private. */
+struct aarp_iter_state {
+	int bucket;			/* hash bucket currently being walked */
+	struct aarp_entry **table;	/* resolved, unresolved or proxies */
+};
+
+/*
+ * Find an entry starting from the table and bucket stored in the iterator,
+ * moving on from resolved to unresolved to proxies as each table runs out.
+ *
+ * With pos == NULL the first entry found is returned.  Otherwise entries
+ * are counted from 1 and the one whose count equals *pos is returned.
+ * On success the iterator is updated to the table and bucket of the entry;
+ * NULL is returned when no such entry exists.
+ */
+static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos)
+{
+	struct aarp_entry **table = iter->table;
+	struct aarp_entry *entry;
+	int ct = iter->bucket;
+	loff_t off = 0;
+
+	for (;;) {
+		for (; ct < AARP_HASH_SIZE; ct++) {
+			for (entry = table[ct]; entry; entry = entry->next) {
+				/* Counting mode: skip until the index hits. */
+				if (pos && ++off != *pos)
+					continue;
+
+				iter->table = table;
+				iter->bucket = ct;
+				return entry;
+			}
+		}
+
+		/* Table exhausted: continue with the next one, if any. */
+		if (table == resolved)
+			table = unresolved;
+		else if (table == unresolved)
+			table = proxies;
+		else
+			return NULL;
+
+		ct = 0;
+	}
+}
+
+/*
+ * seq_file start: take aarp_lock for reading (released in aarp_seq_stop())
+ * and position the iterator.  Position 0 is the header line; position N
+ * is the N-th entry counted across the three tables.
+ */
+static void *aarp_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(aarp_lock)
+{
+	struct aarp_iter_state *iter = seq->private;
+
+	read_lock_bh(&aarp_lock);
+
+	/* Every walk restarts from the first bucket of "resolved". */
+	iter->bucket = 0;
+	iter->table = resolved;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return iter_next(iter, pos);
+}
+
+/*
+ * seq_file next: advance *pos and return the entry following v, or NULL
+ * when all three tables have been walked.
+ */
+static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct aarp_iter_state *iter = seq->private;
+	struct aarp_entry *cur = v;
+
+	++*pos;
+
+	/* The header was just shown: fetch the very first entry. */
+	if (v == SEQ_START_TOKEN)
+		return iter_next(iter, NULL);
+
+	/* More entries on the current chain. */
+	if (cur->next)
+		return cur->next;
+
+	/* Chain finished: resume the search from the following bucket. */
+	++iter->bucket;
+	return iter_next(iter, NULL);
+}
+
+/* seq_file stop: release the read lock taken in aarp_seq_start(). */
+static void aarp_seq_stop(struct seq_file *seq, void *v)
+	__releases(aarp_lock)
+{
+	read_unlock_bh(&aarp_lock);
+}
+
+/*
+ * Format a jiffies delta as seconds with two decimals ("S.CC").
+ *
+ * aarp_seq_show() passes the signed difference "expires_at - now", which
+ * is negative for an entry that is already past its expiry.  The value is
+ * therefore interpreted as signed and printed with a leading '-' instead
+ * of as a huge unsigned number.
+ *
+ * Returns a pointer to a static buffer that the next call overwrites.
+ * NOTE(review): the buffer is shared by every caller; confirm that
+ * concurrent readers of the /proc file cannot interleave here.
+ */
+static const char *dt2str(unsigned long ticks)
+{
+	static char buf[32];
+	int negative = (long)ticks < 0;
+	/* Unsigned negation is well defined, even for LONG_MIN. */
+	unsigned long mag = negative ? -ticks : ticks;
+
+	snprintf(buf, sizeof(buf), "%s%lu.%02lu", negative ? "-" : "",
+		 mag / HZ, ((mag % HZ) * 100) / HZ);
+
+	return buf;
+}
+
+/*
+ * seq_file show: print the column header for SEQ_START_TOKEN, otherwise one
+ * line for the entry v.  The last-send time and retry count are printed
+ * only for entries of the unresolved table.  Always returns 0.
+ */
+static int aarp_seq_show(struct seq_file *seq, void *v)
+{
+	struct aarp_iter_state *iter = seq->private;
+	struct aarp_entry *entry = v;
+	unsigned long now = jiffies;
+	const char *state;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "Address  Interface   Hardware Address"
+			 "   Expires LastSend  Retry Status\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%04X:%02X  %-12s",
+		   ntohs(entry->target_addr.s_net),
+		   (unsigned int) entry->target_addr.s_node,
+		   entry->dev ? entry->dev->name : "????");
+	seq_printf(seq, "%pM", entry->hwaddr);
+	seq_printf(seq, " %8s",
+		   dt2str((long)entry->expires_at - (long)now));
+
+	/* Pad the two retry columns for tables that do not use them. */
+	if (iter->table != unresolved)
+		seq_puts(seq, "                ");
+	else
+		seq_printf(seq, " %8s %6hu",
+			   dt2str(now - entry->last_sent),
+			   entry->xmit_count);
+
+	/* The status column names the table the entry was found in. */
+	if (iter->table == resolved)
+		state = "resolved";
+	else if (iter->table == unresolved)
+		state = "unresolved";
+	else if (iter->table == proxies)
+		state = "proxies";
+	else
+		state = "unknown";
+
+	seq_printf(seq, " %s\n", state);
+	return 0;
+}
+
+/* seq_file iterator for the AARP table listing. */
+static const struct seq_operations aarp_seq_ops = {
+	.start  = aarp_seq_start,
+	.next   = aarp_seq_next,
+	.stop   = aarp_seq_stop,
+	.show   = aarp_seq_show,
+};
+
+/*
+ * open() handler: set up the seq_file with a private struct aarp_iter_state
+ * sized allocation (freed again through seq_release_private).
+ */
+static int aarp_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &aarp_seq_ops,
+			sizeof(struct aarp_iter_state));
+}
+
+/* File operations of the AARP listing; not static, used outside this file. */
+const struct file_operations atalk_seq_arp_fops = {
+	.owner		= THIS_MODULE,
+	.open           = aarp_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+#endif
+
+/*
+ * General module cleanup. Called from cleanup_module() in ddp.c.
+ *
+ * Stops the ageing timer, unregisters the netdevice notifier and the SNAP
+ * client, then empties all three tables.
+ *
+ * NOTE(review): aarp_rcv() can call mod_timer() on aarp_timer and is still
+ * registered with SNAP at the point the timer is deleted -- confirm the
+ * timer cannot be re-armed between the first and third step.
+ */
+void aarp_cleanup_module(void)
+{
+	del_timer_sync(&aarp_timer);
+	unregister_netdevice_notifier(&aarp_notifier);
+	unregister_snap_client(aarp_dl);
+	aarp_purge();
+}
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
new file mode 100644
index 00000000..6ef0e761
--- /dev/null
+++ b/net/appletalk/atalk_proc.c
@@ -0,0 +1,301 @@
+/*
+ * 	atalk_proc.c - proc support for Appletalk
+ *
+ * 	Copyright(c) Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation, version 2.
+ */
+
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <linux/atalk.h>
+
+
+/*
+ * Return the interface at zero-based index pos of the atalk_interfaces
+ * list, or NULL when the list is shorter than that.
+ */
+static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos)
+{
+	struct atalk_iface *iface = atalk_interfaces;
+
+	while (pos && iface) {
+		iface = iface->next;
+		--pos;
+	}
+
+	return iface;
+}
+
+/*
+ * seq_file start for the interface listing: take atalk_interfaces_lock for
+ * reading.  Position 0 is the header; position N is list index N - 1.
+ */
+static void *atalk_seq_interface_start(struct seq_file *seq, loff_t *pos)
+	__acquires(atalk_interfaces_lock)
+{
+	loff_t where = *pos;
+
+	read_lock_bh(&atalk_interfaces_lock);
+
+	if (where == 0)
+		return SEQ_START_TOKEN;
+
+	return atalk_get_interface_idx(where - 1);
+}
+
+/*
+ * seq_file next for the interface listing: the list head follows the
+ * header, otherwise the successor of v is returned (NULL at the end).
+ */
+static void *atalk_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct atalk_iface *cur;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return atalk_interfaces;
+
+	cur = v;
+	return cur->next;
+}
+
+/* seq_file stop: drop the lock taken in atalk_seq_interface_start(). */
+static void atalk_seq_interface_stop(struct seq_file *seq, void *v)
+	__releases(atalk_interfaces_lock)
+{
+	read_unlock_bh(&atalk_interfaces_lock);
+}
+
+/*
+ * seq_file show for the interface listing: the header for SEQ_START_TOKEN,
+ * otherwise device name, address, network range and status of interface v.
+ * Always returns 0.
+ */
+static int atalk_seq_interface_show(struct seq_file *seq, void *v)
+{
+	struct atalk_iface *iface = v;
+
+	if (v != SEQ_START_TOKEN)
+		seq_printf(seq, "%-16s %04X:%02X  %04X-%04X  %d\n",
+			   iface->dev->name, ntohs(iface->address.s_net),
+			   iface->address.s_node,
+			   ntohs(iface->nets.nr_firstnet),
+			   ntohs(iface->nets.nr_lastnet), iface->status);
+	else
+		seq_puts(seq, "Interface        Address   Networks  "
+			      "Status\n");
+
+	return 0;
+}
+
+/*
+ * Return the route at zero-based index pos of the atalk_routes list, or
+ * NULL when the list is shorter than that.
+ */
+static __inline__ struct atalk_route *atalk_get_route_idx(loff_t pos)
+{
+	struct atalk_route *route = atalk_routes;
+
+	while (pos && route) {
+		route = route->next;
+		--pos;
+	}
+
+	return route;
+}
+
+/*
+ * seq_file start for the route listing: take atalk_routes_lock for reading.
+ * Position 0 is the header; position N is list index N - 1.
+ */
+static void *atalk_seq_route_start(struct seq_file *seq, loff_t *pos)
+	__acquires(atalk_routes_lock)
+{
+	loff_t where = *pos;
+
+	read_lock_bh(&atalk_routes_lock);
+
+	if (where == 0)
+		return SEQ_START_TOKEN;
+
+	return atalk_get_route_idx(where - 1);
+}
+
+/*
+ * seq_file next for the route listing: the list head follows the header,
+ * otherwise the successor of v is returned (NULL at the end).
+ */
+static void *atalk_seq_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct atalk_route *cur;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return atalk_routes;
+
+	cur = v;
+	return cur->next;
+}
+
+/* seq_file stop: drop the lock taken in atalk_seq_route_start(). */
+static void atalk_seq_route_stop(struct seq_file *seq, void *v)
+	__releases(atalk_routes_lock)
+{
+	read_unlock_bh(&atalk_routes_lock);
+}
+
+/*
+ * seq_file show for the route listing.
+ *
+ * For SEQ_START_TOKEN the column header is printed, followed by the
+ * default route when one is configured (atrtr_default.dev set).  Emitting
+ * the default route here rather than in the per-entry path means it shows
+ * up exactly once, and also when the route list itself is empty.  Every
+ * other v is a struct atalk_route printed on a line of its own.
+ *
+ * Runs between atalk_seq_route_start() and atalk_seq_route_stop(), i.e.
+ * with atalk_routes_lock held for reading.  Always returns 0.
+ */
+static int atalk_seq_route_show(struct seq_file *seq, void *v)
+{
+	struct atalk_route *rt;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Target        Router  Flags Dev\n");
+
+		if (atrtr_default.dev) {
+			rt = &atrtr_default;
+			seq_printf(seq, "Default     %04X:%02X  %-4d  %s\n",
+				       ntohs(rt->gateway.s_net),
+				       rt->gateway.s_node,
+				       rt->flags, rt->dev->name);
+		}
+		return 0;
+	}
+
+	rt = v;
+	seq_printf(seq, "%04X:%02X     %04X:%02X  %-4d  %s\n",
+		   ntohs(rt->target.s_net), rt->target.s_node,
+		   ntohs(rt->gateway.s_net), rt->gateway.s_node,
+		   rt->flags, rt->dev->name);
+
+	return 0;
+}
+
+/*
+ * seq_file start for the socket listing: take atalk_sockets_lock for
+ * reading and let seq_hlist_start_head() map *pos onto the atalk_sockets
+ * list.
+ */
+static void *atalk_seq_socket_start(struct seq_file *seq, loff_t *pos)
+	__acquires(atalk_sockets_lock)
+{
+	read_lock_bh(&atalk_sockets_lock);
+	return seq_hlist_start_head(&atalk_sockets, *pos);
+}
+
+/* seq_file next for the socket listing; delegates to seq_hlist_next(). */
+static void *atalk_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &atalk_sockets, pos);
+}
+
+/* seq_file stop: drop the lock taken in atalk_seq_socket_start(). */
+static void atalk_seq_socket_stop(struct seq_file *seq, void *v)
+	__releases(atalk_sockets_lock)
+{
+	read_unlock_bh(&atalk_sockets_lock);
+}
+
+/*
+ * seq_file show for the socket listing: the header for SEQ_START_TOKEN,
+ * otherwise type, local and remote address, queue usage, state and owner
+ * uid of the socket behind v.  Always returns 0.
+ *
+ * The uid is obtained with sock_i_uid() rather than by dereferencing
+ * s->sk_socket directly: that pointer is NULL for a socket that has been
+ * orphaned, and sock_i_uid() copes with that case.
+ */
+static int atalk_seq_socket_show(struct seq_file *seq, void *v)
+{
+	struct sock *s;
+	struct atalk_sock *at;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Type Local_addr  Remote_addr Tx_queue "
+			      "Rx_queue St UID\n");
+		return 0;
+	}
+
+	s = sk_entry(v);
+	at = at_sk(s);
+
+	seq_printf(seq, "%02X   %04X:%02X:%02X  %04X:%02X:%02X  %08X:%08X "
+			"%02X %d\n",
+		   s->sk_type, ntohs(at->src_net), at->src_node, at->src_port,
+		   ntohs(at->dest_net), at->dest_node, at->dest_port,
+		   sk_wmem_alloc_get(s),
+		   sk_rmem_alloc_get(s),
+		   s->sk_state, sock_i_uid(s));
+
+	return 0;
+}
+
+/* seq_file iterator for the interface listing. */
+static const struct seq_operations atalk_seq_interface_ops = {
+	.start  = atalk_seq_interface_start,
+	.next   = atalk_seq_interface_next,
+	.stop   = atalk_seq_interface_stop,
+	.show   = atalk_seq_interface_show,
+};
+
+/* seq_file iterator for the route listing. */
+static const struct seq_operations atalk_seq_route_ops = {
+	.start  = atalk_seq_route_start,
+	.next   = atalk_seq_route_next,
+	.stop   = atalk_seq_route_stop,
+	.show   = atalk_seq_route_show,
+};
+
+/* seq_file iterator for the socket listing. */
+static const struct seq_operations atalk_seq_socket_ops = {
+	.start  = atalk_seq_socket_start,
+	.next   = atalk_seq_socket_next,
+	.stop   = atalk_seq_socket_stop,
+	.show   = atalk_seq_socket_show,
+};
+
+/* open() handler of the "interface" proc file. */
+static int atalk_seq_interface_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &atalk_seq_interface_ops);
+}
+
+/* open() handler of the "route" proc file. */
+static int atalk_seq_route_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &atalk_seq_route_ops);
+}
+
+/* open() handler of the "socket" proc file. */
+static int atalk_seq_socket_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &atalk_seq_socket_ops);
+}
+
+/* File operations of the "interface" proc file. */
+static const struct file_operations atalk_seq_interface_fops = {
+	.owner		= THIS_MODULE,
+	.open		= atalk_seq_interface_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/* File operations of the "route" proc file. */
+static const struct file_operations atalk_seq_route_fops = {
+	.owner		= THIS_MODULE,
+	.open		= atalk_seq_route_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/* File operations of the "socket" proc file. */
+static const struct file_operations atalk_seq_socket_fops = {
+	.owner		= THIS_MODULE,
+	.open		= atalk_seq_socket_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/* The "atalk" proc directory; set by atalk_proc_init(). */
+static struct proc_dir_entry *atalk_proc_dir;
+
+/*
+ * Create the "atalk" proc directory under init_net.proc_net together with
+ * its "interface", "route", "socket" and "arp" files.
+ *
+ * Returns 0 on success.  If any step fails, everything created so far is
+ * removed again and -ENOMEM is returned.
+ */
+int __init atalk_proc_init(void)
+{
+	atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net);
+	if (!atalk_proc_dir)
+		return -ENOMEM;
+
+	if (!proc_create("interface", S_IRUGO, atalk_proc_dir,
+			 &atalk_seq_interface_fops))
+		goto out_interface;
+
+	if (!proc_create("route", S_IRUGO, atalk_proc_dir,
+			 &atalk_seq_route_fops))
+		goto out_route;
+
+	if (!proc_create("socket", S_IRUGO, atalk_proc_dir,
+			 &atalk_seq_socket_fops))
+		goto out_socket;
+
+	if (!proc_create("arp", S_IRUGO, atalk_proc_dir, &atalk_seq_arp_fops))
+		goto out_arp;
+
+	return 0;
+
+	/* Unwind in reverse order of creation. */
+out_arp:
+	remove_proc_entry("socket", atalk_proc_dir);
+out_socket:
+	remove_proc_entry("route", atalk_proc_dir);
+out_route:
+	remove_proc_entry("interface", atalk_proc_dir);
+out_interface:
+	remove_proc_entry("atalk", init_net.proc_net);
+	return -ENOMEM;
+}
+
+/* Remove the four proc files and then the "atalk" directory itself. */
+void __exit atalk_proc_exit(void)
+{
+	const char *const children[] = {
+		"interface", "route", "socket", "arp",
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(children) / sizeof(children[0]); idx++)
+		remove_proc_entry(children[idx], atalk_proc_dir);
+
+	remove_proc_entry("atalk", init_net.proc_net);
+}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
new file mode 100644
index 00000000..956a5302
--- /dev/null
+++ b/net/appletalk/ddp.c
@@ -0,0 +1,1977 @@
+/*
+ *	DDP:	An implementation of the AppleTalk DDP protocol for
+ *		Ethernet 'ELAP'.
+ *
+ *		Alan Cox  <alan@lxorguk.ukuu.org.uk>
+ *
+ *		With more than a little assistance from
+ *
+ *		Wesley Craig <netatalk@umich.edu>
+ *
+ *	Fixes:
+ *		Neil Horman		:	Added missing device ioctls
+ *		Michael Callahan	:	Made routing work
+ *		Wesley Craig		:	Fix probing to listen to a
+ *						passed node id.
+ *		Alan Cox		:	Added send/recvmsg support
+ *		Alan Cox		:	Moved at. to protinfo in
+ *						socket.
+ *		Alan Cox		:	Added firewall hooks.
+ *		Alan Cox		:	Supports new ARPHRD_LOOPBACK
+ *		Christer Weinigel	: 	Routing and /proc fixes.
+ *		Bradford Johnson	:	LocalTalk.
+ *		Tom Dyas		:	Module support.
+ *		Alan Cox		:	Hooks for PPP (based on the
+ *						LocalTalk hook).
+ *		Alan Cox		:	Posix bits
+ *		Alan Cox/Mike Freeman	:	Possible fix to NBP problems
+ *		Bradford Johnson	:	IP-over-DDP (experimental)
+ *		Jay Schulist		:	Moved IP-over-DDP to its own
+ *						driver file. (ipddp.c & ipddp.h)
+ *		Jay Schulist		:	Made work as module with
+ *						AppleTalk drivers, cleaned it.
+ *		Rob Newberry		:	Added proxy AARP and AARP
+ *						procfs, moved probing to AARP
+ *						module.
+ *              Adrian Sun/
+ *              Michael Zuelsdorff      :       fix for net.0 packets. don't
+ *                                              allow illegal ether/tokentalk
+ *                                              port assignment. we lose a
+ *                                              valid localtalk port as a
+ *                                              result.
+ *		Arnaldo C. de Melo	:	Cleanup, in preparation for
+ *						shared skb support 8)
+ *		Arnaldo C. de Melo	:	Move proc stuff to atalk_proc.c,
+ *						use seq_file
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/termios.h>	/* For TIOCOUTQ/INQ */
+#include <linux/compat.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/route.h>
+#include <linux/atalk.h>
+#include "../core/kmap_skb.h"
+
+/*
+ * Datalink protocol handles; going by the names, one for DDP and one for
+ * AARP (TODO confirm where they are registered -- not in this part of the
+ * file).
+ */
+struct datalink_proto *ddp_dl, *aarp_dl;
+/* Forward declaration: atalk_create() installs this table as sock->ops. */
+static const struct proto_ops atalk_dgram_ops;
+
+/**************************************************************************\
+*                                                                          *
+* Handlers for the socket list.                                            *
+*                                                                          *
+\**************************************************************************/
+
+/*
+ * List of AppleTalk sockets that have been given an address.  The socket
+ * helpers below take atalk_sockets_lock around every walk or modification
+ * of the list.
+ */
+HLIST_HEAD(atalk_sockets);
+DEFINE_RWLOCK(atalk_sockets_lock);
+
+/*
+ * Link sk into atalk_sockets.  Takes no lock itself: the callers in this
+ * file hold atalk_sockets_lock for writing around the call.
+ */
+static inline void __atalk_insert_socket(struct sock *sk)
+{
+	sk_add_node(sk, &atalk_sockets);
+}
+
+/* Unlink sk from the socket list while holding the list's write lock. */
+static inline void atalk_remove_socket(struct sock *sk)
+{
+	write_lock_bh(&atalk_sockets_lock);
+	sk_del_node_init(sk);
+	write_unlock_bh(&atalk_sockets_lock);
+}
+
+/*
+ * Look up the socket that should receive a packet addressed to @to which
+ * arrived on interface @atif.
+ *
+ * A socket matches when its source port equals the destination port and
+ * one of the following holds:
+ *  - the destination is the any-net broadcast address;
+ *  - the destination net is the socket's net and the destination node is
+ *    the socket's node, the broadcast node or the any-node value;
+ *  - the destination is "net.0" (any-node on a specific net) and the
+ *    socket's node is @atif's node; in that case the node in @to is
+ *    rewritten to @atif's node.
+ *
+ * Returns the first matching socket, or NULL.  No reference is taken on it.
+ */
+static struct sock *atalk_search_socket(struct sockaddr_at *to,
+					struct atalk_iface *atif)
+{
+	struct sock *cur;
+	struct sock *match = NULL;
+	struct hlist_node *node;
+
+	read_lock_bh(&atalk_sockets_lock);
+	sk_for_each(cur, node, &atalk_sockets) {
+		struct atalk_sock *at = at_sk(cur);
+		int hit;
+
+		if (at->src_port != to->sat_port)
+			continue;
+
+		hit = (to->sat_addr.s_net == ATADDR_ANYNET &&
+		       to->sat_addr.s_node == ATADDR_BCAST) ||
+		      (to->sat_addr.s_net == at->src_net &&
+		       (to->sat_addr.s_node == at->src_node ||
+			to->sat_addr.s_node == ATADDR_BCAST ||
+			to->sat_addr.s_node == ATADDR_ANYNODE));
+
+		/* XXXX.0 -- a request for this router: pin the node to ours. */
+		if (!hit &&
+		    to->sat_addr.s_node == ATADDR_ANYNODE &&
+		    to->sat_addr.s_net != ATADDR_ANYNET &&
+		    atif->address.s_node == at->src_node) {
+			to->sat_addr.s_node = atif->address.s_node;
+			hit = 1;
+		}
+
+		if (hit) {
+			match = cur;
+			break;
+		}
+	}
+	read_unlock_bh(&atalk_sockets_lock);
+
+	return match;
+}
+
+/**
+ * atalk_find_or_insert_socket - claim an address for a socket unless taken
+ * @sk: socket to add to the socket list when the address is free
+ * @sat: address (net, node and port) to look for
+ *
+ * Scans the socket list for a socket whose source net, node and port all
+ * equal @sat.  If one exists it is returned and the list is left untouched;
+ * otherwise @sk is inserted and NULL is returned.
+ *
+ * Lookup and insertion happen under a single hold of the write lock, so the
+ * operation is atomic with respect to the other users of the list.
+ */
+static struct sock *atalk_find_or_insert_socket(struct sock *sk,
+						struct sockaddr_at *sat)
+{
+	struct sock *cur;
+	struct sock *owner = NULL;
+	struct hlist_node *node;
+
+	write_lock_bh(&atalk_sockets_lock);
+	sk_for_each(cur, node, &atalk_sockets) {
+		struct atalk_sock *at = at_sk(cur);
+
+		if (at->src_net == sat->sat_addr.s_net &&
+		    at->src_node == sat->sat_addr.s_node &&
+		    at->src_port == sat->sat_port) {
+			owner = cur;
+			break;
+		}
+	}
+
+	/* Nobody owns the address yet: it is ours, so insert. */
+	if (!owner)
+		__atalk_insert_socket(sk);
+	write_unlock_bh(&atalk_sockets_lock);
+
+	return owner;
+}
+
+/*
+ * Timer callback armed by atalk_destroy_socket(); @data is the socket.
+ * While the socket still has allocations outstanding the timer is re-armed
+ * for another SOCK_DESTROY_TIME; once they are gone the socket reference
+ * is dropped.
+ */
+static void atalk_destroy_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	if (!sk_has_allocations(sk)) {
+		sock_put(sk);
+		return;
+	}
+
+	sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
+	add_timer(&sk->sk_timer);
+}
+
+/*
+ * Tear a socket down: unlink it from the socket list and purge its receive
+ * queue.  If nothing is allocated against it any more the reference is
+ * dropped straight away; otherwise atalk_destroy_timer() is scheduled to
+ * retry after SOCK_DESTROY_TIME.
+ */
+static inline void atalk_destroy_socket(struct sock *sk)
+{
+	atalk_remove_socket(sk);
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	if (!sk_has_allocations(sk)) {
+		sock_put(sk);
+		return;
+	}
+
+	setup_timer(&sk->sk_timer, atalk_destroy_timer, (unsigned long)sk);
+	sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
+	add_timer(&sk->sk_timer);
+}
+
+/**************************************************************************\
+*                                                                          *
+* Routing tables for the AppleTalk socket layer.                           *
+*                                                                          *
+\**************************************************************************/
+
+/* Anti-deadlock ordering is atalk_routes_lock --> iface_lock -DaveM */
+/* Head of the singly linked route list (chained through ->next). */
+struct atalk_route *atalk_routes;
+DEFINE_RWLOCK(atalk_routes_lock);
+
+/* Head of the singly linked interface list (chained through ->next). */
+struct atalk_iface *atalk_interfaces;
+DEFINE_RWLOCK(atalk_interfaces_lock);
+
+/*
+ * For probing devices or in a routerless network.  atrtr_find() falls back
+ * to this entry when no route matches and its dev field is set.
+ */
+struct atalk_route atrtr_default;
+
+/* AppleTalk interface control */
+/*
+ * Remove every interface record that refers to dev: unlink it from
+ * atalk_interfaces, drop the device reference it held, free it and clear
+ * dev->atalk_ptr.  Routes through the device are left alone -- removing
+ * those is the caller's job.  Used when the interface goes down or its
+ * address is deleted.
+ */
+static void atif_drop_device(struct net_device *dev)
+{
+	struct atalk_iface **link;
+	struct atalk_iface *cur;
+
+	write_lock_bh(&atalk_interfaces_lock);
+	link = &atalk_interfaces;
+	while ((cur = *link) != NULL) {
+		if (cur->dev != dev) {
+			link = &cur->next;
+			continue;
+		}
+
+		/* Unlink in place; link already points at the successor slot. */
+		*link = cur->next;
+		dev_put(dev);
+		kfree(cur);
+		dev->atalk_ptr = NULL;
+	}
+	write_unlock_bh(&atalk_interfaces_lock);
+}
+
+/*
+ * Allocate an interface record for dev with address *sa, take a reference
+ * on dev, point dev->atalk_ptr at the record and push it onto the head of
+ * atalk_interfaces.
+ *
+ * Returns the new record, or NULL if the allocation failed.
+ */
+static struct atalk_iface *atif_add_device(struct net_device *dev,
+					   struct atalk_addr *sa)
+{
+	struct atalk_iface *atif;
+
+	atif = kzalloc(sizeof(*atif), GFP_KERNEL);
+	if (!atif)
+		return NULL;
+
+	dev_hold(dev);
+	atif->dev = dev;
+	dev->atalk_ptr = atif;
+	atif->address = *sa;
+	atif->status = 0;
+
+	write_lock_bh(&atalk_interfaces_lock);
+	atif->next = atalk_interfaces;
+	atalk_interfaces = atif;
+	write_unlock_bh(&atalk_interfaces_lock);
+
+	return atif;
+}
+
+/*
+ * Perform phase 2 AARP probing on our tentative address.
+ *
+ * Starts from atif's current net/node (or from jiffies-derived values when
+ * those are the any-net / any-node wildcards) and tries nodes 1..253 on each
+ * net of the interface's range in turn, writing each candidate into
+ * atif->address and handing it to aarp_probe_network().  ATIF_PROBE is set
+ * in atif->status for the duration of the scan.
+ *
+ * Returns 0 with atif->address holding the first candidate for which
+ * ATIF_PROBE_FAIL was not raised, or -EADDRINUSE if every candidate failed.
+ */
+static int atif_probe_device(struct atalk_iface *atif)
+{
+	int netrange = ntohs(atif->nets.nr_lastnet) -
+			ntohs(atif->nets.nr_firstnet) + 1;
+	int probe_net = ntohs(atif->address.s_net);
+	int probe_node = atif->address.s_node;
+	int netct, nodect;
+
+	/* Offset the network we start probing with */
+	if (probe_net == ATADDR_ANYNET) {
+		probe_net = ntohs(atif->nets.nr_firstnet);
+		if (netrange)
+			probe_net += jiffies % netrange;
+	}
+	if (probe_node == ATADDR_ANYNODE)
+		probe_node = jiffies & 0xFF;
+
+	/* Scan the networks */
+	atif->status |= ATIF_PROBE;
+	for (netct = 0; netct <= netrange; netct++) {
+		/* Sweep the available nodes from a given start */
+		atif->address.s_net = htons(probe_net);
+		for (nodect = 0; nodect < 256; nodect++) {
+			atif->address.s_node = (nodect + probe_node) & 0xFF;
+			/* Node numbers 0, 254 and 255 are never probed */
+			if (atif->address.s_node > 0 &&
+			    atif->address.s_node < 254) {
+				/* Probe a proposed address */
+				aarp_probe_network(atif);
+
+				if (!(atif->status & ATIF_PROBE_FAIL)) {
+					atif->status &= ~ATIF_PROBE;
+					return 0;
+				}
+			}
+			/* Clear the failure mark before the next candidate */
+			atif->status &= ~ATIF_PROBE_FAIL;
+		}
+		/* Advance to the next net, wrapping back to the first */
+		probe_net++;
+		if (probe_net > ntohs(atif->nets.nr_lastnet))
+			probe_net = ntohs(atif->nets.nr_firstnet);
+	}
+	atif->status &= ~ATIF_PROBE;
+
+	return -EADDRINUSE;	/* Network is full... */
+}
+
+
+/*
+ * Perform AARP probing for a proxy address.
+ *
+ * Sweeps nodes 1..253 on each net of atif's range, starting from atif's own
+ * net and a jiffies-derived node, writing each candidate into *proxy_addr
+ * and passing it to aarp_proxy_probe_network().  Whatever net/node the
+ * caller put in *proxy_addr is overwritten.
+ *
+ * Returns the first result from aarp_proxy_probe_network() that is not
+ * -EADDRINUSE (with *proxy_addr holding that candidate), or -EADDRINUSE if
+ * every candidate was in use.
+ */
+static int atif_proxy_probe_device(struct atalk_iface *atif,
+				   struct atalk_addr* proxy_addr)
+{
+	int netrange = ntohs(atif->nets.nr_lastnet) -
+			ntohs(atif->nets.nr_firstnet) + 1;
+	/* we probe the interface's network */
+	int probe_net = ntohs(atif->address.s_net);
+	int probe_node = ATADDR_ANYNODE;	    /* we'll take anything */
+	int netct, nodect;
+
+	/* Offset the network we start probing with */
+	if (probe_net == ATADDR_ANYNET) {
+		probe_net = ntohs(atif->nets.nr_firstnet);
+		if (netrange)
+			probe_net += jiffies % netrange;
+	}
+
+	/* Always true here: probe_node was initialised to ATADDR_ANYNODE */
+	if (probe_node == ATADDR_ANYNODE)
+		probe_node = jiffies & 0xFF;
+
+	/* Scan the networks */
+	for (netct = 0; netct <= netrange; netct++) {
+		/* Sweep the available nodes from a given start */
+		proxy_addr->s_net = htons(probe_net);
+		for (nodect = 0; nodect < 256; nodect++) {
+			proxy_addr->s_node = (nodect + probe_node) & 0xFF;
+			/* Node numbers 0, 254 and 255 are never probed */
+			if (proxy_addr->s_node > 0 &&
+			    proxy_addr->s_node < 254) {
+				/* Tell AARP to probe a proposed address */
+				int ret = aarp_proxy_probe_network(atif,
+								    proxy_addr);
+
+				if (ret != -EADDRINUSE)
+					return ret;
+			}
+		}
+		/* Advance to the next net, wrapping back to the first */
+		probe_net++;
+		if (probe_net > ntohs(atif->nets.nr_lastnet))
+			probe_net = ntohs(atif->nets.nr_firstnet);
+	}
+
+	return -EADDRINUSE;	/* Network is full... */
+}
+
+
+/*
+ * Return the AppleTalk address recorded for dev, or NULL when the device
+ * has no AppleTalk interface record attached (dev->atalk_ptr is NULL).
+ */
+struct atalk_addr *atalk_find_dev_addr(struct net_device *dev)
+{
+	struct atalk_iface *atif = dev->atalk_ptr;
+
+	if (!atif)
+		return NULL;
+
+	return &atif->address;
+}
+
+/*
+ * Pick the address to use when the caller did not name one.
+ *
+ * Preference order:
+ *  1. the first interface that is neither loopback nor point-to-point;
+ *  2. the first non-loopback interface (i.e. a point-to-point one);
+ *  3. the first interface of any kind.
+ *
+ * Returns a pointer into the chosen interface record, or NULL when no
+ * interface is configured.  The interface lock is released before
+ * returning.
+ */
+static struct atalk_addr *atalk_find_primary(void)
+{
+	struct atalk_iface *fallback = NULL;
+	struct atalk_iface *cur;
+	struct atalk_addr *addr = NULL;
+
+	read_lock_bh(&atalk_interfaces_lock);
+	for (cur = atalk_interfaces; cur; cur = cur->next) {
+		/* Remember the first non-loopback interface as second choice */
+		if (!fallback && !(cur->dev->flags & IFF_LOOPBACK))
+			fallback = cur;
+
+		if (!(cur->dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) {
+			addr = &cur->address;
+			break;
+		}
+	}
+
+	/* cur is NULL only when the loop ran off the end without a hit */
+	if (!cur) {
+		if (fallback)
+			addr = &fallback->address;
+		else if (atalk_interfaces)
+			addr = &atalk_interfaces->address;
+	}
+	read_unlock_bh(&atalk_interfaces_lock);
+
+	return addr;
+}
+
+/*
+ * Match for 'any network': only the node number has to fit, and only the
+ * interface attached to dev is considered.
+ *
+ * Returns dev's interface record when it exists, is not being probed, and
+ * node is either the broadcast node, the any-node value or the interface's
+ * own node; NULL otherwise.
+ */
+static struct atalk_iface *atalk_find_anynet(int node, struct net_device *dev)
+{
+	struct atalk_iface *atif = dev->atalk_ptr;
+
+	if (!atif || atif->status & ATIF_PROBE)
+		return NULL;
+
+	if (node != ATADDR_BCAST &&
+	    node != ATADDR_ANYNODE &&
+	    atif->address.s_node != node)
+		return NULL;
+
+	return atif;
+}
+
+/*
+ * Find the interface for a specific network:node pair.
+ *
+ * An interface is accepted when either
+ *  - its net equals net, it is not being probed, and node is the broadcast
+ *    node, the any-node value or the interface's own node; or
+ *  - node is the any-node value, net is not the any-net value, and net lies
+ *    inside the interface's net range ("net.0" addressing).
+ *
+ * Returns the first such interface in list order, or NULL.
+ */
+static struct atalk_iface *atalk_find_interface(__be16 net, int node)
+{
+	struct atalk_iface *cur;
+
+	read_lock_bh(&atalk_interfaces_lock);
+	for (cur = atalk_interfaces; cur; cur = cur->next) {
+		int node_ok = node == ATADDR_BCAST ||
+			      node == ATADDR_ANYNODE ||
+			      cur->address.s_node == node;
+
+		if (node_ok &&
+		    cur->address.s_net == net &&
+		    !(cur->status & ATIF_PROBE))
+			break;
+
+		/* XXXX.0 -- net.0 returns the iface associated with net */
+		if (node == ATADDR_ANYNODE && net != ATADDR_ANYNET &&
+		    ntohs(cur->nets.nr_firstnet) <= ntohs(net) &&
+		    ntohs(net) <= ntohs(cur->nets.nr_lastnet))
+			break;
+	}
+	read_unlock_bh(&atalk_interfaces_lock);
+
+	return cur;
+}
+
+
+/*
+ * Find a route for an AppleTalk packet. This ought to get cached in
+ * the socket (later on...).
+ *
+ * Only routes that are up and whose target net equals the target's net are
+ * considered.  A host route for exactly the target node wins immediately.
+ * Otherwise the whole table is scanned, because a host route may follow an
+ * overlapping network route, and the last matching network route is used.
+ * With no match at all the default route is returned if it has a device.
+ *
+ * Returns the chosen route or NULL when no route can be found.
+ */
+static struct atalk_route *atrtr_find(struct atalk_addr *target)
+{
+	struct atalk_route *fallback = NULL;
+	struct atalk_route *r;
+
+	read_lock_bh(&atalk_routes_lock);
+	for (r = atalk_routes; r; r = r->next) {
+		if (!(r->flags & RTF_UP) ||
+		    r->target.s_net != target->s_net)
+			continue;
+
+		if (!(r->flags & RTF_HOST)) {
+			/* Usable unless a direct host route turns up later */
+			fallback = r;
+			continue;
+		}
+
+		/* A host route for the target itself: then we're done */
+		if (r->target.s_node == target->s_node)
+			break;
+	}
+
+	/* r is NULL only when no exact host route was found */
+	if (!r) {
+		if (fallback)
+			r = fallback;
+		else if (atrtr_default.dev)
+			r = &atrtr_default;
+	}
+	read_unlock_bh(&atalk_routes_lock);
+
+	return r;
+}
+
+
+/*
+ * Map an AppleTalk address to the device to send through: the device of
+ * the route atrtr_find() selects, or NULL when there is no route.
+ */
+struct net_device *atrtr_get_dev(struct atalk_addr *sa)
+{
+	struct atalk_route *route = atrtr_find(sa);
+
+	if (!route)
+		return NULL;
+
+	return route->dev;
+}
+
+/*
+ * Point the default route at dev: marked up, with a zero gateway.  Passing
+ * NULL clears the device, which makes atrtr_find() ignore the entry.
+ */
+static void atrtr_set_default(struct net_device *dev)
+{
+	struct atalk_route *def = &atrtr_default;
+
+	def->dev = dev;
+	def->flags = RTF_UP;
+	def->gateway.s_net = htons(0);
+	def->gateway.s_node = 0;
+}
+
+/*
+ * Add a router. Basically make sure it looks valid and stuff the
+ * entry in the list. While it uses netranges we always set them to one
+ * entry to work like netatalk.
+ *
+ * @r:       routing request; rt_dst is the target, rt_gateway the gateway
+ *           and rt_flags the route flags.
+ * @devhint: device to route through, or NULL to pick the device of the
+ *           interface the gateway belongs to.
+ *
+ * If a route with the same flags and target already exists it is updated
+ * in place, otherwise a new entry is pushed onto the head of atalk_routes.
+ * The route holds a reference on its device; when an existing route is
+ * updated, the reference it held on its previous device is released.
+ *
+ * Returns 0 on success, -EINVAL for a non-AppleTalk target (or gateway,
+ * when no device hint is given), -ENETUNREACH when no interface matches
+ * the gateway, and -ENOBUFS when the route cannot be allocated.
+ */
+static int atrtr_create(struct rtentry *r, struct net_device *devhint)
+{
+	struct sockaddr_at *ta = (struct sockaddr_at *)&r->rt_dst;
+	struct sockaddr_at *ga = (struct sockaddr_at *)&r->rt_gateway;
+	struct atalk_route *rt;
+	struct atalk_iface *iface, *riface;
+	struct net_device *old_dev;
+	int retval = -EINVAL;
+
+	/*
+	 * Fixme: Raise/Lower a routing change semaphore for these
+	 * operations.
+	 */
+
+	/* Validate the request */
+	if (ta->sat_family != AF_APPLETALK ||
+	    (!devhint && ga->sat_family != AF_APPLETALK))
+		goto out;
+
+	/* Now walk the routing table and make our decisions */
+	write_lock_bh(&atalk_routes_lock);
+	for (rt = atalk_routes; rt; rt = rt->next) {
+		if (r->rt_flags != rt->flags)
+			continue;
+
+		if (ta->sat_addr.s_net == rt->target.s_net) {
+			if (!(rt->flags & RTF_HOST))
+				break;
+			if (ta->sat_addr.s_node == rt->target.s_node)
+				break;
+		}
+	}
+
+	if (!devhint) {
+		riface = NULL;
+
+		/*
+		 * Prefer the interface whose own address is the gateway;
+		 * failing that, take the first one whose net range covers
+		 * the gateway's net.
+		 */
+		read_lock_bh(&atalk_interfaces_lock);
+		for (iface = atalk_interfaces; iface; iface = iface->next) {
+			if (!riface &&
+			    ntohs(ga->sat_addr.s_net) >=
+					ntohs(iface->nets.nr_firstnet) &&
+			    ntohs(ga->sat_addr.s_net) <=
+					ntohs(iface->nets.nr_lastnet))
+				riface = iface;
+
+			if (ga->sat_addr.s_net == iface->address.s_net &&
+			    ga->sat_addr.s_node == iface->address.s_node)
+				riface = iface;
+		}
+		read_unlock_bh(&atalk_interfaces_lock);
+
+		retval = -ENETUNREACH;
+		if (!riface)
+			goto out_unlock;
+
+		devhint = riface->dev;
+	}
+
+	if (!rt) {
+		/* GFP_ATOMIC: we are holding atalk_routes_lock */
+		rt = kzalloc(sizeof(*rt), GFP_ATOMIC);
+
+		retval = -ENOBUFS;
+		if (!rt)
+			goto out_unlock;
+
+		rt->next = atalk_routes;
+		atalk_routes = rt;
+	}
+
+	/*
+	 * Fill in the routing entry.  Take the new device reference before
+	 * dropping the old one so the count never dips when both are the
+	 * same device.
+	 */
+	old_dev     = rt->dev;
+	rt->target  = ta->sat_addr;
+	dev_hold(devhint);
+	rt->dev     = devhint;
+	rt->flags   = r->rt_flags;
+	rt->gateway = ga->sat_addr;
+
+	/*
+	 * An updated route still holds a reference on its previous device;
+	 * release it or the device can never be freed.  A freshly zeroed
+	 * route has no device yet.
+	 */
+	if (old_dev)
+		dev_put(old_dev);
+
+	retval = 0;
+out_unlock:
+	write_unlock_bh(&atalk_routes_lock);
+out:
+	return retval;
+}
+
+/*
+ * Delete a route.  The first route whose target net equals addr's net is
+ * removed; for a gateway route the target node has to match as well.  The
+ * route's device reference is dropped and the entry freed.
+ *
+ * Returns 0 when a route was removed, -ENOENT when none matched.
+ */
+static int atrtr_delete(struct atalk_addr * addr)
+{
+	struct atalk_route **link;
+	struct atalk_route *cur;
+	int retval = -ENOENT;
+
+	write_lock_bh(&atalk_routes_lock);
+	for (link = &atalk_routes; (cur = *link) != NULL; link = &cur->next) {
+		if (cur->target.s_net != addr->s_net)
+			continue;
+		if ((cur->flags & RTF_GATEWAY) &&
+		    cur->target.s_node != addr->s_node)
+			continue;
+
+		*link = cur->next;
+		dev_put(cur->dev);
+		kfree(cur);
+		retval = 0;
+		break;
+	}
+	write_unlock_bh(&atalk_routes_lock);
+
+	return retval;
+}
+
+/*
+ * Called when a device is downed.  Every route through dev is unlinked,
+ * its device reference dropped and the entry freed; if the default route
+ * points at dev it is cleared as well.
+ */
+static void atrtr_device_down(struct net_device *dev)
+{
+	struct atalk_route **link;
+	struct atalk_route *cur;
+
+	write_lock_bh(&atalk_routes_lock);
+	link = &atalk_routes;
+	while ((cur = *link) != NULL) {
+		if (cur->dev != dev) {
+			link = &cur->next;
+			continue;
+		}
+
+		*link = cur->next;
+		dev_put(dev);
+		kfree(cur);
+	}
+	write_unlock_bh(&atalk_routes_lock);
+
+	if (atrtr_default.dev == dev)
+		atrtr_set_default(NULL);
+}
+
+/*
+ * Actually down the interface: drop its routes, then its AARP entries and
+ * finally the interface record itself.
+ */
+static inline void atalk_dev_down(struct net_device *dev)
+{
+	atrtr_device_down(dev);	/* Remove all routes for the device */
+	aarp_device_down(dev);	/* Remove AARP entries for the device */
+	atif_drop_device(dev);	/* Remove the device */
+}
+
+/*
+ * Netdevice notifier callback.  The only event acted upon is NETDEV_DOWN
+ * for a device in init_net, in which case our interface record and routes
+ * for the device are discarded.  Always returns NOTIFY_DONE.
+ */
+static int ddp_device_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	/* Discard any use of a device that is going down */
+	if (net_eq(dev_net(dev), &init_net) && event == NETDEV_DOWN)
+		atalk_dev_down(dev);
+
+	return NOTIFY_DONE;
+}
+
+/* ioctl calls. Shouldn't even need touching */
+/*
+ * Device configuration ioctl calls.
+ *
+ * Copies a struct ifreq from arg, looks the named device up in init_net
+ * and dispatches on cmd:
+ *   SIOCSIFADDR                    - set the interface address (probing it
+ *                                    first on devices that are neither
+ *                                    loopback nor point-to-point) and
+ *                                    install the direct routes
+ *   SIOCGIFADDR / SIOCGIFBRDADDR   - report the address / broadcast address
+ *   SIOCATALKDIFADDR / SIOCDIFADDR - drop the interface together with its
+ *                                    routes and AARP entries
+ *   SIOCSARP / SIOCDARP            - add / remove a proxy AARP entry
+ *
+ * The modifying commands require CAP_NET_ADMIN.  Cases that end in "break"
+ * copy the (possibly updated) ifreq back to arg; SIOCSIFADDR and SIOCDARP
+ * return 0 without copying anything back.  A cmd that matches no case also
+ * ends up at the copy-back.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int atif_ioctl(int cmd, void __user *arg)
+{
+	static char aarp_mcast[6] = { 0x09, 0x00, 0x00, 0xFF, 0xFF, 0xFF };
+	struct ifreq atreq;
+	struct atalk_netrange *nr;
+	struct sockaddr_at *sa;
+	struct net_device *dev;
+	struct atalk_iface *atif;
+	int ct;
+	int limit;
+	struct rtentry rtdef;
+	int add_route;
+
+	if (copy_from_user(&atreq, arg, sizeof(atreq)))
+		return -EFAULT;
+
+	dev = __dev_get_by_name(&init_net, atreq.ifr_name);
+	if (!dev)
+		return -ENODEV;
+
+	sa = (struct sockaddr_at *)&atreq.ifr_addr;
+	atif = atalk_find_dev(dev);
+
+	switch (cmd) {
+		case SIOCSIFADDR:
+			if (!capable(CAP_NET_ADMIN))
+				return -EPERM;
+			if (sa->sat_family != AF_APPLETALK)
+				return -EINVAL;
+			if (dev->type != ARPHRD_ETHER &&
+			    dev->type != ARPHRD_LOOPBACK &&
+			    dev->type != ARPHRD_LOCALTLK &&
+			    dev->type != ARPHRD_PPP)
+				return -EPROTONOSUPPORT;
+
+			/* The net range travels in the sockaddr's sat_zero bytes */
+			nr = (struct atalk_netrange *)&sa->sat_zero[0];
+			add_route = 1;
+
+			/*
+			 * if this is a point-to-point iface, and we already
+			 * have an iface for this AppleTalk address, then we
+			 * should not add a route
+			 */
+			if ((dev->flags & IFF_POINTOPOINT) &&
+			    atalk_find_interface(sa->sat_addr.s_net,
+						 sa->sat_addr.s_node)) {
+				printk(KERN_DEBUG "AppleTalk: point-to-point "
+						  "interface added with "
+						  "existing address\n");
+				add_route = 0;
+			}
+
+			/*
+			 * Phase 1 is fine on LocalTalk but we don't do
+			 * EtherTalk phase 1. Anyone wanting to add it go ahead.
+			 */
+			if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
+				return -EPROTONOSUPPORT;
+			if (sa->sat_addr.s_node == ATADDR_BCAST ||
+			    sa->sat_addr.s_node == 254)
+				return -EINVAL;
+			if (atif) {
+				/* Already setting address */
+				if (atif->status & ATIF_PROBE)
+					return -EBUSY;
+
+				atif->address.s_net  = sa->sat_addr.s_net;
+				atif->address.s_node = sa->sat_addr.s_node;
+				atrtr_device_down(dev);	/* Flush old routes */
+			} else {
+				atif = atif_add_device(dev, &sa->sat_addr);
+				if (!atif)
+					return -ENOMEM;
+			}
+			atif->nets = *nr;
+
+			/*
+			 * Check if the chosen address is used. If so we
+			 * error and atalkd will try another.
+			 */
+
+			if (!(dev->flags & IFF_LOOPBACK) &&
+			    !(dev->flags & IFF_POINTOPOINT) &&
+			    atif_probe_device(atif) < 0) {
+				atif_drop_device(dev);
+				return -EADDRINUSE;
+			}
+
+			/* Hey it worked - add the direct routes */
+			sa = (struct sockaddr_at *)&rtdef.rt_gateway;
+			sa->sat_family = AF_APPLETALK;
+			sa->sat_addr.s_net  = atif->address.s_net;
+			sa->sat_addr.s_node = atif->address.s_node;
+			sa = (struct sockaddr_at *)&rtdef.rt_dst;
+			rtdef.rt_flags = RTF_UP;
+			sa->sat_family = AF_APPLETALK;
+			sa->sat_addr.s_node = ATADDR_ANYNODE;
+			if (dev->flags & IFF_LOOPBACK ||
+			    dev->flags & IFF_POINTOPOINT)
+				rtdef.rt_flags |= RTF_HOST;
+
+			/* Routerless initial state */
+			if (nr->nr_firstnet == htons(0) &&
+			    nr->nr_lastnet == htons(0xFFFE)) {
+				sa->sat_addr.s_net = atif->address.s_net;
+				atrtr_create(&rtdef, dev);
+				atrtr_set_default(dev);
+			} else {
+				limit = ntohs(nr->nr_lastnet);
+				/*
+				 * NOTE(review): this bails out after the
+				 * interface has been added and probed, leaving
+				 * it configured without any routes.
+				 */
+				if (limit - ntohs(nr->nr_firstnet) > 4096) {
+					printk(KERN_WARNING "Too many routes/"
+							    "iface.\n");
+					return -EINVAL;
+				}
+				/* One route per net in the range */
+				if (add_route)
+					for (ct = ntohs(nr->nr_firstnet);
+					     ct <= limit; ct++) {
+						sa->sat_addr.s_net = htons(ct);
+						atrtr_create(&rtdef, dev);
+					}
+			}
+			dev_mc_add_global(dev, aarp_mcast);
+			return 0;
+
+		case SIOCGIFADDR:
+			if (!atif)
+				return -EADDRNOTAVAIL;
+
+			sa->sat_family = AF_APPLETALK;
+			sa->sat_addr = atif->address;
+			break;
+
+		case SIOCGIFBRDADDR:
+			if (!atif)
+				return -EADDRNOTAVAIL;
+
+			sa->sat_family = AF_APPLETALK;
+			sa->sat_addr.s_net = atif->address.s_net;
+			sa->sat_addr.s_node = ATADDR_BCAST;
+			break;
+
+		case SIOCATALKDIFADDR:
+		case SIOCDIFADDR:
+			if (!capable(CAP_NET_ADMIN))
+				return -EPERM;
+			if (sa->sat_family != AF_APPLETALK)
+				return -EINVAL;
+			atalk_dev_down(dev);
+			break;
+
+		case SIOCSARP:
+			if (!capable(CAP_NET_ADMIN))
+				return -EPERM;
+			if (sa->sat_family != AF_APPLETALK)
+				return -EINVAL;
+			/*
+			 * for now, we only support proxy AARP on ELAP;
+			 * we should be able to do it for LocalTalk, too.
+			 */
+			if (dev->type != ARPHRD_ETHER)
+				return -EPROTONOSUPPORT;
+
+			/*
+			 * atif points to the current interface on this network;
+			 * we aren't concerned about its current status (at
+			 * least for now), but it has all the settings about
+			 * the network we're going to probe. Consequently, it
+			 * must exist.
+			 */
+			if (!atif)
+				return -EADDRNOTAVAIL;
+
+			nr = (struct atalk_netrange *)&(atif->nets);
+			/*
+			 * Phase 1 is fine on Localtalk but we don't do
+			 * Ethertalk phase 1. Anyone wanting to add it go ahead.
+			 */
+			if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
+				return -EPROTONOSUPPORT;
+
+			if (sa->sat_addr.s_node == ATADDR_BCAST ||
+			    sa->sat_addr.s_node == 254)
+				return -EINVAL;
+
+			/*
+			 * Check if the chosen address is used. If so we
+			 * error and ATCP will try another.
+			 */
+			if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0)
+				return -EADDRINUSE;
+
+			/*
+			 * We now have an address on the local network, and
+			 * the AARP code will defend it for us until we take it
+			 * down. We don't set up any routes right now, because
+			 * ATCP will install them manually via SIOCADDRT.
+			 */
+			break;
+
+		case SIOCDARP:
+			if (!capable(CAP_NET_ADMIN))
+				return -EPERM;
+			if (sa->sat_family != AF_APPLETALK)
+				return -EINVAL;
+			if (!atif)
+				return -EADDRNOTAVAIL;
+
+			/* give to aarp module to remove proxy entry */
+			aarp_proxy_remove(atif->dev, &(sa->sat_addr));
+			return 0;
+	}
+
+	/* Hand the (possibly updated) request back to the caller */
+	return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0;
+}
+
+/*
+ * Routing ioctl() calls.
+ *
+ * Copies a struct rtentry from arg and handles:
+ *   SIOCDELRT - delete the route to rt_dst (must be an AppleTalk address);
+ *   SIOCADDRT - create a route, optionally through the device named by the
+ *               user-space string rt_dev.
+ *
+ * Returns the result of the delete/create, -EFAULT when user memory cannot
+ * be read, -ENODEV for an unknown device name and -EINVAL for any other
+ * cmd or a non-AppleTalk destination on delete.
+ */
+static int atrtr_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct rtentry rt;
+	struct net_device *dev = NULL;
+
+	if (copy_from_user(&rt, arg, sizeof(rt)))
+		return -EFAULT;
+
+	if (cmd == SIOCDELRT) {
+		struct sockaddr_at *dst = (struct sockaddr_at *)&rt.rt_dst;
+
+		if (rt.rt_dst.sa_family != AF_APPLETALK)
+			return -EINVAL;
+		return atrtr_delete(&dst->sat_addr);
+	}
+
+	if (cmd != SIOCADDRT)
+		return -EINVAL;
+
+	if (rt.rt_dev) {
+		char name[IFNAMSIZ];
+
+		/* Copy at most IFNAMSIZ-1 bytes and terminate by hand */
+		if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1))
+			return -EFAULT;
+		name[IFNAMSIZ-1] = '\0';
+
+		dev = __dev_get_by_name(&init_net, name);
+		if (!dev)
+			return -ENODEV;
+	}
+
+	return atrtr_create(&rt, dev);
+}
+
+/**************************************************************************\
+*                                                                          *
+* Handling for system calls applied via the various interfaces to an       *
+* AppleTalk socket object.                                                 *
+*                                                                          *
+\**************************************************************************/
+
+/*
+ * Checksum: This is 'optional'. It's quite likely also a good
+ * candidate for assembler hackery 8)
+ *
+ * Folds len bytes starting at data into the running checksum sum: each
+ * byte is added and the result is then rotated left by one bit as a 16-bit
+ * quantity.  Returns the updated checksum.
+ */
+static unsigned long atalk_sum_partial(const unsigned char *data,
+				       int len, unsigned long sum)
+{
+	const unsigned char *p = data;
+
+	/* This ought to be unwrapped neatly. I'll trust gcc for now */
+	for (; len != 0; len--) {
+		sum += *p++;
+		sum = rol16(sum, 1);
+	}
+
+	return sum;
+}
+
+/*
+ * Checksum skb data --  similar to skb_checksum
+ *
+ * Folds the len bytes of skb that start at offset into the running
+ * checksum sum.  The data is visited in three stages: the linear header
+ * area, then the page fragments, then the skbs on the frag list (handled
+ * by recursion).  In the loops, start and end bracket the byte range of
+ * the piece being looked at, measured from the beginning of the skb data.
+ *
+ * Returns the updated checksum.  Hits BUG_ON() if len bytes could not be
+ * found in the skb.
+ */
+static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
+				   int len, unsigned long sum)
+{
+	int start = skb_headlen(skb);
+	struct sk_buff *frag_iter;
+	int i, copy;
+
+	/* checksum stuff in header space */
+	if ( (copy = start - offset) > 0) {
+		if (copy > len)
+			copy = len;
+		sum = atalk_sum_partial(skb->data + offset, copy, sum);
+		if ( (len -= copy) == 0)
+			return sum;
+
+		offset += copy;
+	}
+
+	/* checksum stuff in frags */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		/* copy > 0 means this fragment overlaps the wanted range */
+		if ((copy = end - offset) > 0) {
+			u8 *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+			if (copy > len)
+				copy = len;
+			/* Map the fragment's page for the duration of the sum */
+			vaddr = kmap_skb_frag(frag);
+			sum = atalk_sum_partial(vaddr + frag->page_offset +
+						  offset - start, copy, sum);
+			kunmap_skb_frag(vaddr);
+
+			if (!(len -= copy))
+				return sum;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	/* checksum stuff on the frag list, one skb at a time */
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			/* Recurse with the offset made relative to this skb */
+			sum = atalk_sum_skb(frag_iter, offset - start,
+					    copy, sum);
+			if ((len -= copy) == 0)
+				return sum;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	/* Anything left over means the caller asked for more than exists */
+	BUG_ON(len > 0);
+
+	return sum;
+}
+
+/*
+ * Compute the checksum over the first len bytes of skb, leaving out the
+ * first four bytes of header.  A computed value of 0 is reported as 0xFFFF
+ * because 0 itself means "no checksum".  The result is in network byte
+ * order.
+ */
+static __be16 atalk_checksum(const struct sk_buff *skb, int len)
+{
+	/* skip header 4 bytes */
+	unsigned long sum = atalk_sum_skb(skb, 4, len-4, 0);
+
+	/* Use 0xFFFF for 0. 0 itself means none */
+	if (!sum)
+		return htons(0xFFFF);
+
+	return htons((unsigned short)sum);
+}
+
+/*
+ * Protocol descriptor handed to sk_alloc() in atalk_create(); obj_size
+ * makes each allocated socket large enough for a struct atalk_sock.
+ */
+static struct proto ddp_proto = {
+	.name	  = "DDP",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct atalk_sock),
+};
+
+/*
+ * Create a socket. Initialise the socket, blank the addresses
+ * set the state.
+ *
+ * Only init_net is supported, and only SOCK_DGRAM and SOCK_RAW socket
+ * types.  Because a raw socket hands out the full ELAP frame, creating one
+ * from user space requires CAP_NET_RAW; sockets created by the kernel
+ * itself (kern != 0) are exempt.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT outside init_net, -ESOCKTNOSUPPORT
+ * for other socket types, -EPERM for an unprivileged raw socket and
+ * -ENOMEM when the sock cannot be allocated.
+ */
+static int atalk_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
+{
+	struct sock *sk;
+	int rc = -ESOCKTNOSUPPORT;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	/*
+	 * We permit SOCK_DGRAM and RAW is an extension. It is trivial to do
+	 * and gives you the full ELAP frame. Should be handy for CAP 8)
+	 */
+	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+		goto out;
+
+	/* Raw access to the wire is a privileged operation */
+	rc = -EPERM;
+	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+		goto out;
+
+	rc = -ENOMEM;
+	sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto);
+	if (!sk)
+		goto out;
+	rc = 0;
+	sock->ops = &atalk_dgram_ops;
+	sock_init_data(sock, sk);
+
+	/*
+	 * SOCK_ZAPPED marks the socket as not yet bound: atalk_bind() only
+	 * accepts sockets that carry it, and bind/autobind clear it once an
+	 * address has been assigned.
+	 */
+	sock_set_flag(sk, SOCK_ZAPPED);
+out:
+	return rc;
+}
+
+/*
+ * Release a socket.  If the struct socket still has a sock attached, the
+ * sock is orphaned, detached from the struct socket and handed to
+ * atalk_destroy_socket().  An extra reference is held across the teardown
+ * so the sock stays valid until release_sock() has run.  Always returns 0.
+ */
+static int atalk_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return 0;
+
+	sock_hold(sk);
+	lock_sock(sk);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+	atalk_destroy_socket(sk);
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+/**
+ * atalk_pick_and_bind_port - choose a free source port and bind to it
+ * @sk: socket to insert into the socket list
+ * @sat: address to bind; its net and node are used for the search and its
+ *       sat_port field is used as the loop counter
+ *
+ * Tries each port from ATPORT_RESERVED up to, but not including,
+ * ATPORT_LAST.  The first port that no listed socket uses with the same
+ * net and node is stored in the socket and @sk is inserted into the list.
+ * On success @sat->sat_port holds the chosen port.
+ *
+ * Search and insertion run under one hold of the write lock, so the whole
+ * operation is atomic.
+ *
+ * Returns 0 on success, -EBUSY when every port in the range is taken.
+ */
+static int atalk_pick_and_bind_port(struct sock *sk, struct sockaddr_at *sat)
+{
+	int retval = -EBUSY;
+
+	write_lock_bh(&atalk_sockets_lock);
+
+	for (sat->sat_port = ATPORT_RESERVED;
+	     sat->sat_port < ATPORT_LAST;
+	     sat->sat_port++) {
+		struct sock *cur;
+		struct hlist_node *node;
+		int in_use = 0;
+
+		sk_for_each(cur, node, &atalk_sockets) {
+			struct atalk_sock *at = at_sk(cur);
+
+			if (at->src_net == sat->sat_addr.s_net &&
+			    at->src_node == sat->sat_addr.s_node &&
+			    at->src_port == sat->sat_port) {
+				in_use = 1;
+				break;
+			}
+		}
+
+		if (in_use)
+			continue;
+
+		/* Wheee, it's free, assign and insert. */
+		__atalk_insert_socket(sk);
+		at_sk(sk)->src_port = sat->sat_port;
+		retval = 0;
+		break;
+	}
+
+	write_unlock_bh(&atalk_sockets_lock);
+	return retval;
+}
+
+/*
+ * Bind an unbound socket to the primary interface address and a free port.
+ *
+ * Returns 0 on success (SOCK_ZAPPED is then cleared), -EADDRNOTAVAIL when
+ * there is no primary address or its net is still the any-net value, or
+ * the error from atalk_pick_and_bind_port().
+ */
+static int atalk_autobind(struct sock *sk)
+{
+	struct atalk_addr *primary = atalk_find_primary();
+	struct atalk_sock *at = at_sk(sk);
+	struct sockaddr_at sat;
+	int err;
+
+	if (!primary || primary->s_net == htons(ATADDR_ANYNET))
+		return -EADDRNOTAVAIL;
+
+	sat.sat_addr.s_net  = primary->s_net;
+	sat.sat_addr.s_node = primary->s_node;
+	at->src_net  = sat.sat_addr.s_net;
+	at->src_node = sat.sat_addr.s_node;
+
+	err = atalk_pick_and_bind_port(sk, &sat);
+	if (err == 0)
+		sock_reset_flag(sk, SOCK_ZAPPED);
+
+	return err;
+}
+
+/*
+ * Set the address 'our end' of the connection.
+ *
+ * A net of ATADDR_ANYNET selects the primary interface address, which is
+ * also written back into uaddr; any other net/node must belong to one of
+ * our interfaces.  A port of ATADDR_ANYPORT asks for a free port to be
+ * picked, otherwise the given port is claimed if nobody else has it.
+ *
+ * Returns 0 on success, -EINVAL if the socket is already bound or addr_len
+ * is wrong, -EAFNOSUPPORT for a non-AppleTalk family, -EADDRNOTAVAIL when
+ * the address is not ours, -EADDRINUSE when the address/port is taken, or
+ * the error from atalk_pick_and_bind_port().
+ */
+static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_at *addr = (struct sockaddr_at *)uaddr;
+	struct sock *sk = sock->sk;
+	struct atalk_sock *at = at_sk(sk);
+	int err;
+
+	if (addr_len != sizeof(struct sockaddr_at) ||
+	    !sock_flag(sk, SOCK_ZAPPED))
+		return -EINVAL;
+
+	if (addr->sat_family != AF_APPLETALK)
+		return -EAFNOSUPPORT;
+
+	lock_sock(sk);
+
+	err = -EADDRNOTAVAIL;
+	if (addr->sat_addr.s_net == htons(ATADDR_ANYNET)) {
+		struct atalk_addr *primary = atalk_find_primary();
+
+		if (!primary)
+			goto out;
+
+		/* Report the address we picked back through uaddr */
+		addr->sat_addr.s_net  = primary->s_net;
+		addr->sat_addr.s_node = primary->s_node;
+	} else if (!atalk_find_interface(addr->sat_addr.s_net,
+					 addr->sat_addr.s_node)) {
+		goto out;
+	}
+
+	at->src_net  = addr->sat_addr.s_net;
+	at->src_node = addr->sat_addr.s_node;
+
+	if (addr->sat_port == ATADDR_ANYPORT) {
+		err = atalk_pick_and_bind_port(sk, addr);
+		if (err < 0)
+			goto out;
+	} else {
+		at->src_port = addr->sat_port;
+
+		err = -EADDRINUSE;
+		if (atalk_find_or_insert_socket(sk, addr))
+			goto out;
+	}
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Set the address we talk to.
+ *
+ * The socket is first put back into the closed/unconnected state, so a
+ * failed call leaves it unconnected.  After the address has been validated
+ * an unbound socket is autobound, a route to the destination is looked up,
+ * and the destination is stored in the socket, which is then marked
+ * connected.
+ *
+ * Connecting to the broadcast node without SO_BROADCAST set only logs a
+ * warning; the -EACCES alternative is compiled out.
+ *
+ * Returns 0 on success, -EINVAL for a wrong addr_len, -EAFNOSUPPORT for a
+ * non-AppleTalk family, -EBUSY when autobinding fails and -ENETUNREACH when
+ * there is no route to the destination.
+ */
+static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
+			 int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct atalk_sock *at = at_sk(sk);
+	struct sockaddr_at *addr;
+	int err;
+
+	sk->sk_state   = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	if (addr_len != sizeof(*addr))
+		return -EINVAL;
+
+	addr = (struct sockaddr_at *)uaddr;
+
+	if (addr->sat_family != AF_APPLETALK)
+		return -EAFNOSUPPORT;
+
+	if (addr->sat_addr.s_node == ATADDR_BCAST &&
+	    !sock_flag(sk, SOCK_BROADCAST)) {
+#if 1
+		printk(KERN_WARNING "%s is broken and did not set "
+				    "SO_BROADCAST. It will break when 2.2 is "
+				    "released.\n",
+			current->comm);
+#else
+		return -EACCES;
+#endif
+	}
+
+	lock_sock(sk);
+	err = -EBUSY;
+	/* SOCK_ZAPPED still set means the socket has no local address yet */
+	if (sock_flag(sk, SOCK_ZAPPED))
+		if (atalk_autobind(sk) < 0)
+			goto out;
+
+	err = -ENETUNREACH;
+	if (!atrtr_get_dev(&addr->sat_addr))
+		goto out;
+
+	at->dest_port = addr->sat_port;
+	at->dest_net  = addr->sat_addr.s_net;
+	at->dest_node = addr->sat_addr.s_node;
+
+	sock->state  = SS_CONNECTED;
+	sk->sk_state = TCP_ESTABLISHED;
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Find the name of an AppleTalk socket. Just copy the right
+ * fields into the sockaddr.
+ *
+ * @sock:      socket being queried
+ * @uaddr:     buffer that receives a struct sockaddr_at
+ * @uaddr_len: set to sizeof(struct sockaddr_at)
+ * @peer:      non-zero to report the connected destination, zero to report
+ *             the local (source) address
+ *
+ * An unbound socket is autobound first.  Returns 0 on success, -ENOBUFS if
+ * autobinding fails, or -ENOTCONN if @peer is set while the socket is not
+ * in the TCP_ESTABLISHED state.
+ */
+static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
+			 int *uaddr_len, int peer)
+{
+	struct sockaddr_at sat;
+	struct sock *sk = sock->sk;
+	struct atalk_sock *at = at_sk(sk);
+	int err;
+
+	lock_sock(sk);
+	err = -ENOBUFS;
+	if (sock_flag(sk, SOCK_ZAPPED))
+		if (atalk_autobind(sk) < 0)
+			goto out;
+
+	*uaddr_len = sizeof(struct sockaddr_at);
+	/*
+	 * Clear the whole structure, not just sat_zero: the complete object
+	 * is copied out below, so any padding between the members must not
+	 * carry leftover stack contents.
+	 */
+	memset(&sat, 0, sizeof(sat));
+
+	if (peer) {
+		err = -ENOTCONN;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+
+		sat.sat_addr.s_net  = at->dest_net;
+		sat.sat_addr.s_node = at->dest_node;
+		sat.sat_port	    = at->dest_port;
+	} else {
+		sat.sat_addr.s_net  = at->src_net;
+		sat.sat_addr.s_node = at->src_node;
+		sat.sat_port	    = at->src_port;
+	}
+
+	err = 0;
+	sat.sat_family = AF_APPLETALK;
+	memcpy(uaddr, &sat, sizeof(sat));
+
+out:
+	release_sock(sk);
+	return err;
+}
+
+#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE)
+/*
+ * Test byte 12 of the packet data against the value 22.
+ * NOTE(review): presumably this is the DDP type byte and 22 marks
+ * IP-over-DDP traffic - confirm against the ipddp driver.
+ */
+static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
+{
+	return skb->data[12] == 22;
+}
+
+/*
+ * Hand an IP-over-DDP packet to the "ipddp0" device: strip the first 13
+ * bytes, mark the skb as ETH_P_IP, account it and pass it to netif_rx().
+ * The skb is dropped (NET_RX_DROP) when no "ipddp0" device exists.
+ */
+static int handle_ip_over_ddp(struct sk_buff *skb)
+{
+	struct net_device *dev = __dev_get_by_name(&init_net, "ipddp0");
+	struct net_device_stats *stats;
+
+	/* This needs to be able to handle ipddp"N" devices */
+	if (!dev) {
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	skb->protocol = htons(ETH_P_IP);
+	skb_pull(skb, 13);
+	skb->dev   = dev;
+	skb_reset_transport_header(skb);
+
+	/*
+	 * NOTE(review): treats the device private area as a
+	 * struct net_device_stats - confirm against the ipddp driver.
+	 */
+	stats = netdev_priv(dev);
+	stats->rx_packets++;
+	/* Count the 13 bytes that were pulled above as well. */
+	stats->rx_bytes += skb->len + 13;
+	return netif_rx(skb);  /* Send the SKB up to a higher place. */
+}
+#else
+/* make it easy for gcc to optimize this test out, i.e. kill the code */
+#define is_ip_over_ddp(skb) 0
+#define handle_ip_over_ddp(skb) 0
+#endif
+
+/*
+ * Forward a packet that is not addressed to one of our interfaces.
+ *
+ * @skb:      the packet; it is consumed on every path
+ * @dev:      device the packet arrived on
+ * @ddp:      DDP header inside @skb
+ * @len_hops: deh_len_hops in host byte order (low 10 bits are the length,
+ *            the four bits above them the hop count, as the masks below
+ *            show)
+ * @origlen:  skb->len before atalk_rcv() trimmed the buffer
+ *
+ * Returns NET_RX_SUCCESS if the packet was handed to aarp_send_ddp(),
+ * NET_RX_DROP otherwise.
+ */
+static int atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
+			      struct ddpehdr *ddp, __u16 len_hops, int origlen)
+{
+	struct atalk_route *rt;
+	struct atalk_addr ta;
+
+	/*
+	 * Don't route multicast, etc., packets, or packets sent to "this
+	 * network"
+	 */
+	if (skb->pkt_type != PACKET_HOST || !ddp->deh_dnet) {
+		/*
+		 * FIXME:
+		 *
+		 * Can it ever happen that a packet is from a PPP iface and
+		 * needs to be broadcast onto the default network?
+		 */
+		if (dev->type == ARPHRD_PPP)
+			printk(KERN_DEBUG "AppleTalk: didn't forward broadcast "
+					  "packet received from PPP iface\n");
+		goto free_it;
+	}
+
+	ta.s_net  = ddp->deh_dnet;
+	ta.s_node = ddp->deh_dnode;
+
+	/* Route the packet */
+	rt = atrtr_find(&ta);
+	/* increment hops count */
+	len_hops += 1 << 10;
+	/* Drop if there is no route or the 4-bit hop count wrapped to zero. */
+	if (!rt || !(len_hops & (15 << 10)))
+		goto free_it;
+
+	/* FIXME: use skb->cb to be able to use shared skbs */
+
+	/*
+	 * Route goes through another gateway, so set the target to the
+	 * gateway instead.
+	 */
+
+	if (rt->flags & RTF_GATEWAY) {
+		ta.s_net  = rt->gateway.s_net;
+		ta.s_node = rt->gateway.s_node;
+	}
+
+	/* Fix up skb->len field */
+	skb_trim(skb, min_t(unsigned int, origlen,
+			    (rt->dev->hard_header_len +
+			     ddp_dl->header_length + (len_hops & 1023))));
+
+	/* FIXME: use skb->cb to be able to use shared skbs */
+	ddp->deh_len_hops = htons(len_hops);
+
+	/*
+	 * Send the buffer onwards
+	 *
+	 * Now we must always be careful. If it's come from LocalTalk to
+	 * EtherTalk it might not fit
+	 *
+	 * Order matters here: If a packet has to be copied to make a new
+	 * headroom (rare hopefully) then it won't need unsharing.
+	 *
+	 * Note. ddp-> becomes invalid at the realloc.
+	 */
+	if (skb_headroom(skb) < 22) {
+		/* 22 bytes - 12 ether, 2 len, 3 802.2 5 snap */
+		struct sk_buff *nskb = skb_realloc_headroom(skb, 32);
+		/* The original is released whether or not the copy succeeded. */
+		kfree_skb(skb);
+		skb = nskb;
+	} else
+		skb = skb_unshare(skb, GFP_ATOMIC);
+
+	/*
+	 * If the buffer didn't vanish into the lack of space bitbucket we can
+	 * send it.
+	 */
+	if (skb == NULL)
+		goto drop;
+
+	if (aarp_send_ddp(rt->dev, skb, &ta, NULL) == NET_XMIT_DROP)
+		return NET_RX_DROP;
+	return NET_RX_SUCCESS;
+free_it:
+	kfree_skb(skb);
+drop:
+	return NET_RX_DROP;
+}
+
+/**
+ *	atalk_rcv - Receive a packet (in skb) from device dev
+ *	@skb: packet received
+ *	@dev: network device where the packet comes from
+ *	@pt: packet type
+ *	@orig_dev: not used by this function
+ *
+ *	Receive a packet (in skb) from device dev. This has come from the SNAP
+ *	decoder, and on entry skb->transport_header is the DDP header and
+ *	skb->len is the DDP length. The physical headers
+ *	have been extracted. PPP should probably pass frames marked as for this
+ *	layer.  [ie ARPHRD_ETHERTALK]
+ *
+ *	Packets for other hosts are passed to atalk_route_packet(); packets
+ *	for us are queued on the matching socket. Returns NET_RX_SUCCESS or
+ *	NET_RX_DROP (or whatever the routing / IP-over-DDP path returns).
+ */
+static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
+		     struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct ddpehdr *ddp;
+	struct sock *sock;
+	struct atalk_iface *atif;
+	struct sockaddr_at tosat;
+	int origlen;
+	__u16 len_hops;
+
+	/* Only the initial network namespace is handled. */
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	/* Don't mangle buffer if shared */
+	if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
+		goto out;
+
+	/* Size check and make sure header is contiguous */
+	if (!pskb_may_pull(skb, sizeof(*ddp)))
+		goto drop;
+
+	ddp = ddp_hdr(skb);
+
+	/* Low 10 bits: packet length; the bits above carry the hop count. */
+	len_hops = ntohs(ddp->deh_len_hops);
+
+	/* Trim buffer in case of stray trailing data */
+	origlen = skb->len;
+	skb_trim(skb, min_t(unsigned int, skb->len, len_hops & 1023));
+
+	/*
+	 * Size check to see if ddp->deh_len was crap
+	 * (Otherwise we'll detonate most spectacularly
+	 * in the middle of atalk_checksum() or recvmsg()).
+	 */
+	if (skb->len < sizeof(*ddp) || skb->len < (len_hops & 1023)) {
+		pr_debug("AppleTalk: dropping corrupted frame (deh_len=%u, "
+			 "skb->len=%u)\n", len_hops & 1023, skb->len);
+		goto drop;
+	}
+
+	/*
+	 * Any checksums. Note we don't do htons() on this == is assumed to be
+	 * valid for net byte orders all over the networking code...
+	 * A zero deh_sum means the packet carries no checksum.
+	 */
+	if (ddp->deh_sum &&
+	    atalk_checksum(skb, len_hops & 1023) != ddp->deh_sum)
+		/* Not a valid AppleTalk frame - dustbin time */
+		goto drop;
+
+	/* Check the packet is aimed at us */
+	if (!ddp->deh_dnet)	/* Net 0 is 'this network' */
+		atif = atalk_find_anynet(ddp->deh_dnode, dev);
+	else
+		atif = atalk_find_interface(ddp->deh_dnet, ddp->deh_dnode);
+
+	if (!atif) {
+		/* Not ours, so we route the packet via the correct
+		 * AppleTalk iface
+		 */
+		return atalk_route_packet(skb, dev, ddp, len_hops, origlen);
+	}
+
+	/* if IP over DDP is not selected this code will be optimized out */
+	if (is_ip_over_ddp(skb))
+		return handle_ip_over_ddp(skb);
+	/*
+	 * Which socket - atalk_search_socket() looks for a *full match*
+	 * of the <net, node, port> tuple.
+	 */
+	tosat.sat_addr.s_net  = ddp->deh_dnet;
+	tosat.sat_addr.s_node = ddp->deh_dnode;
+	tosat.sat_port	      = ddp->deh_dport;
+
+	sock = atalk_search_socket(&tosat, atif);
+	if (!sock) /* But not one of our sockets */
+		goto drop;
+
+	/* Queue packet (standard) */
+	skb->sk = sock;
+
+	if (sock_queue_rcv_skb(sock, skb) < 0)
+		goto drop;
+
+	return NET_RX_SUCCESS;
+
+drop:
+	kfree_skb(skb);
+out:
+	/* Reached directly when skb_share_check() already disposed of the skb. */
+	return NET_RX_DROP;
+
+}
+
+/*
+ * Receive a LocalTalk frame. We make some demands on the caller here.
+ * Caller must provide enough headroom on the packet to pull the short
+ * header and append a long one.
+ *
+ * Short-form frames are rewritten in place into long-form (struct ddpehdr)
+ * frames and everything is then handed to atalk_rcv().  Frames that are
+ * dropped here return 0.
+ */
+static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
+		     struct packet_type *pt, struct net_device *orig_dev)
+{
+	/* Only the initial network namespace is handled. */
+	if (!net_eq(dev_net(dev), &init_net))
+		goto freeit;
+
+	/* Expand any short form frames */
+	if (skb_mac_header(skb)[2] == 1) {
+		struct ddpehdr *ddp;
+		/* Find our address */
+		struct atalk_addr *ap = atalk_find_dev_addr(dev);
+
+		/* Need an address on this device and a plausible length. */
+		if (!ap || skb->len < sizeof(__be16) || skb->len > 1023)
+			goto freeit;
+
+		/* Don't mangle buffer if shared */
+		if (!(skb = skb_share_check(skb, GFP_ATOMIC)))
+			return 0;
+
+		/*
+		 * The push leaves us with a ddpehdr not an shdr, and
+		 * handily the port bytes in the right place preset.
+		 */
+		ddp = (struct ddpehdr *) skb_push(skb, sizeof(*ddp) - 4);
+
+		/* Now fill in the long header */
+
+		/*
+		 * These two first. The mac overlays the new source/dest
+		 * network information so we MUST copy these before
+		 * we write the network numbers !
+		 */
+
+		ddp->deh_dnode = skb_mac_header(skb)[0];     /* From physical header */
+		ddp->deh_snode = skb_mac_header(skb)[1];     /* From physical header */
+
+		/* Source and destination both use this device's network number. */
+		ddp->deh_dnet  = ap->s_net;	/* Network number */
+		ddp->deh_snet  = ap->s_net;
+		ddp->deh_sum   = 0;		/* No checksum */
+		/*
+		 * Not sure about this bit...
+		 */
+		/* Non routable, so force a drop if we slip up later */
+		ddp->deh_len_hops = htons(skb->len + (DDP_MAXHOPS << 10));
+	}
+	skb_reset_transport_header(skb);
+
+	return atalk_rcv(skb, dev, pt, orig_dev);
+freeit:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Send one DDP datagram.
+ *
+ * @iocb: not used by this function
+ * @sock: sending socket
+ * @msg:  message; msg_name optionally carries the destination sockaddr_at,
+ *        otherwise the connected destination is used
+ * @len:  payload length, at most DDP_MAXSZ
+ *
+ * Returns @len on success or a negative errno: -EINVAL (bad flags or
+ * address), -EMSGSIZE, -EBUSY (autobind failed), -EPERM (broadcast without
+ * SO_BROADCAST), -ENOTCONN, -ENETUNREACH (no route), -EFAULT, or the error
+ * reported by sock_alloc_send_skb().
+ */
+static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+			 size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct atalk_sock *at = at_sk(sk);
+	struct sockaddr_at *usat = (struct sockaddr_at *)msg->msg_name;
+	int flags = msg->msg_flags;
+	int loopback = 0;
+	struct sockaddr_at local_satalk, gsat;
+	struct sk_buff *skb;
+	struct net_device *dev;
+	struct ddpehdr *ddp;
+	int size;
+	struct atalk_route *rt;
+	int err;
+
+	if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	if (len > DDP_MAXSZ)
+		return -EMSGSIZE;
+
+	lock_sock(sk);
+	if (usat) {
+		/* Explicit destination: make sure we have a source address. */
+		err = -EBUSY;
+		if (sock_flag(sk, SOCK_ZAPPED))
+			if (atalk_autobind(sk) < 0)
+				goto out;
+
+		err = -EINVAL;
+		if (msg->msg_namelen < sizeof(*usat) ||
+		    usat->sat_family != AF_APPLETALK)
+			goto out;
+
+		err = -EPERM;
+		/* netatalk didn't implement this check */
+		if (usat->sat_addr.s_node == ATADDR_BCAST &&
+		    !sock_flag(sk, SOCK_BROADCAST)) {
+			goto out;
+		}
+	} else {
+		/* No destination given: fall back to the connected peer. */
+		err = -ENOTCONN;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+		usat = &local_satalk;
+		usat->sat_family      = AF_APPLETALK;
+		usat->sat_port	      = at->dest_port;
+		usat->sat_addr.s_node = at->dest_node;
+		usat->sat_addr.s_net  = at->dest_net;
+	}
+
+	/* Build a packet */
+	SOCK_DEBUG(sk, "SK %p: Got address.\n", sk);
+
+	/* For headers */
+	size = sizeof(struct ddpehdr) + len + ddp_dl->header_length;
+
+	if (usat->sat_addr.s_net || usat->sat_addr.s_node == ATADDR_ANYNODE) {
+		rt = atrtr_find(&usat->sat_addr);
+	} else {
+		/* Net 0 with a specific node: route via our own network. */
+		struct atalk_addr at_hint;
+
+		at_hint.s_node = 0;
+		at_hint.s_net  = at->src_net;
+
+		rt = atrtr_find(&at_hint);
+	}
+	/*
+	 * Must be negative: the exit path returns "err ? : len", so a
+	 * positive value would be reported to the caller as a byte count.
+	 */
+	err = -ENETUNREACH;
+	if (!rt)
+		goto out;
+
+	dev = rt->dev;
+
+	SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n",
+			sk, size, dev->name);
+
+	size += dev->hard_header_len;
+	/* The socket lock is dropped around the allocation. */
+	release_sock(sk);
+	skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err);
+	lock_sock(sk);
+	if (!skb)
+		goto out;
+
+	skb->sk = sk;
+	skb_reserve(skb, ddp_dl->header_length);
+	skb_reserve(skb, dev->hard_header_len);
+	skb->dev = dev;
+
+	SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk);
+
+	ddp = (struct ddpehdr *)skb_put(skb, sizeof(struct ddpehdr));
+	ddp->deh_len_hops  = htons(len + sizeof(*ddp));
+	ddp->deh_dnet  = usat->sat_addr.s_net;
+	ddp->deh_snet  = at->src_net;
+	ddp->deh_dnode = usat->sat_addr.s_node;
+	ddp->deh_snode = at->src_node;
+	ddp->deh_dport = usat->sat_port;
+	ddp->deh_sport = at->src_port;
+
+	SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len);
+
+	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (err) {
+		kfree_skb(skb);
+		err = -EFAULT;
+		goto out;
+	}
+
+	/* sk_no_check == 1 requests that no checksum is generated. */
+	if (sk->sk_no_check == 1)
+		ddp->deh_sum = 0;
+	else
+		ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp));
+
+	/*
+	 * Loopback broadcast packets to non gateway targets (ie routes
+	 * to group we are in)
+	 */
+	if (ddp->deh_dnode == ATADDR_BCAST &&
+	    !(rt->flags & RTF_GATEWAY) && !(dev->flags & IFF_LOOPBACK)) {
+		struct sk_buff *skb2 = skb_copy(skb, GFP_KERNEL);
+
+		if (skb2) {
+			loopback = 1;
+			SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk);
+			/*
+			 * If it fails it is queued/sent above in the aarp queue
+			 */
+			aarp_send_ddp(dev, skb2, &usat->sat_addr, NULL);
+		}
+	}
+
+	if (dev->flags & IFF_LOOPBACK || loopback) {
+		SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk);
+		/* loop back */
+		skb_orphan(skb);
+		if (ddp->deh_dnode == ATADDR_BCAST) {
+			/* Broadcasts are looped through the route for 0.0. */
+			struct atalk_addr at_lo;
+
+			at_lo.s_node = 0;
+			at_lo.s_net  = 0;
+
+			rt = atrtr_find(&at_lo);
+			if (!rt) {
+				kfree_skb(skb);
+				err = -ENETUNREACH;
+				goto out;
+			}
+			dev = rt->dev;
+			skb->dev = dev;
+		}
+		ddp_dl->request(ddp_dl, skb, dev->dev_addr);
+	} else {
+		SOCK_DEBUG(sk, "SK %p: send out.\n", sk);
+		if (rt->flags & RTF_GATEWAY) {
+		    gsat.sat_addr = rt->gateway;
+		    usat = &gsat;
+		}
+
+		/*
+		 * If it fails it is queued/sent above in the aarp queue
+		 */
+		aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
+	}
+	SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len);
+
+out:
+	release_sock(sk);
+	/* err is 0 on the success path, so this yields len. */
+	return err ? : len;
+}
+
+/*
+ * Receive one DDP datagram.
+ *
+ * @iocb:  not used by this function
+ * @sock:  receiving socket
+ * @msg:   destination iovec; if msg_name is set it receives the sender's
+ *         sockaddr_at and msg_namelen is updated accordingly
+ * @size:  room available in the iovec
+ * @flags: MSG_* flags; MSG_DONTWAIT selects non-blocking operation
+ *
+ * For sockets other than SOCK_RAW the DDP header is skipped.  A datagram
+ * longer than @size is truncated and MSG_TRUNC is set.  Returns the number
+ * of bytes copied or a negative errno.
+ */
+static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+			 size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_at *sat = (struct sockaddr_at *)msg->msg_name;
+	struct ddpehdr *ddp;
+	int copied = 0;
+	int offset = 0;
+	int err = 0;
+	struct sk_buff *skb;
+
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+						flags & MSG_DONTWAIT, &err);
+	lock_sock(sk);
+
+	if (!skb)
+		goto out;
+
+	/* FIXME: use skb->cb to be able to use shared skbs */
+	ddp = ddp_hdr(skb);
+	/* The low 10 bits of deh_len_hops hold the datagram length. */
+	copied = ntohs(ddp->deh_len_hops) & 1023;
+
+	if (sk->sk_type != SOCK_RAW) {
+		offset = sizeof(*ddp);
+		copied -= offset;
+	}
+
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+	err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied);
+
+	/*
+	 * Report the sender only when the caller supplied a name buffer.
+	 * The buffer is cleared first so that padding and sat_zero do not
+	 * hand back whatever the buffer happened to contain, and
+	 * msg_namelen is only claimed when the name was really written.
+	 */
+	if (!err && sat) {
+		memset(sat, 0, sizeof(*sat));
+		sat->sat_family      = AF_APPLETALK;
+		sat->sat_port        = ddp->deh_sport;
+		sat->sat_addr.s_node = ddp->deh_snode;
+		sat->sat_addr.s_net  = ddp->deh_snet;
+		msg->msg_namelen     = sizeof(*sat);
+	}
+
+	skb_free_datagram(sk, skb);	/* Free the datagram. */
+
+out:
+	release_sock(sk);
+	return err ? : copied;
+}
+
+
+/*
+ * AppleTalk ioctl calls.
+ */
+/*
+ * Dispatch the ioctls understood by AppleTalk sockets.
+ *
+ * @sock: socket the ioctl was issued on
+ * @cmd:  ioctl number
+ * @arg:  user-space argument, interpreted according to @cmd
+ *
+ * Returns the result of the selected handler, -EPERM for routing changes
+ * without CAP_NET_ADMIN, or -ENOIOCTLCMD for commands not listed here.
+ */
+static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int rc = -ENOIOCTLCMD;
+	struct sock *sk = sock->sk;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+		/* Protocol layer */
+		case TIOCOUTQ: {
+			/* Free send-buffer space, clamped at zero. */
+			long amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+
+			if (amount < 0)
+				amount = 0;
+			rc = put_user(amount, (int __user *)argp);
+			break;
+		}
+		case TIOCINQ: {
+			/*
+			 * Payload size of the first queued datagram.  The
+			 * queue lock is held while the head skb is examined
+			 * so that it cannot be dequeued and freed under us.
+			 */
+			struct sk_buff *skb;
+			long amount = 0;
+
+			spin_lock_irq(&sk->sk_receive_queue.lock);
+			skb = skb_peek(&sk->sk_receive_queue);
+			if (skb)
+				amount = skb->len - sizeof(struct ddpehdr);
+			spin_unlock_irq(&sk->sk_receive_queue.lock);
+			rc = put_user(amount, (int __user *)argp);
+			break;
+		}
+		case SIOCGSTAMP:
+			rc = sock_get_timestamp(sk, argp);
+			break;
+		case SIOCGSTAMPNS:
+			rc = sock_get_timestampns(sk, argp);
+			break;
+		/* Routing */
+		case SIOCADDRT:
+		case SIOCDELRT:
+			/* Changing routes requires CAP_NET_ADMIN. */
+			rc = -EPERM;
+			if (capable(CAP_NET_ADMIN))
+				rc = atrtr_ioctl(cmd, argp);
+			break;
+		/* Interface */
+		case SIOCGIFADDR:
+		case SIOCSIFADDR:
+		case SIOCGIFBRDADDR:
+		case SIOCATALKDIFADDR:
+		case SIOCDIFADDR:
+		case SIOCSARP:		/* proxy AARP */
+		case SIOCDARP:		/* proxy AARP */
+			/* Interface ioctls run under the RTNL lock. */
+			rtnl_lock();
+			rc = atif_ioctl(cmd, argp);
+			rtnl_unlock();
+			break;
+	}
+
+	return rc;
+}
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point.  Only SIOCATALKDIFADDR is handled here;
+ * everything else is reported as -ENOIOCTLCMD.
+ */
+static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd != SIOCATALKDIFADDR)
+		return -ENOIOCTLCMD;
+
+	/*
+	 * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so common
+	 * code cannot translate it.  The ifreq data it touches has a
+	 * compatible layout, so the native handler can be called directly.
+	 */
+	return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+
+/* Address-family registration: socket creation goes to atalk_create(). */
+static const struct net_proto_family atalk_family_ops = {
+	.family		= PF_APPLETALK,
+	.create		= atalk_create,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Socket operations for AppleTalk sockets.  Operations AppleTalk does not
+ * provide are wired to the generic sock_no_*() stubs.
+ */
+static const struct proto_ops atalk_dgram_ops = {
+	.family		= PF_APPLETALK,
+	.owner		= THIS_MODULE,
+	.release	= atalk_release,
+	.bind		= atalk_bind,
+	.connect	= atalk_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= atalk_getname,
+	.poll		= datagram_poll,
+	.ioctl		= atalk_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= atalk_compat_ioctl,
+#endif
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.sendmsg	= atalk_sendmsg,
+	.recvmsg	= atalk_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+/* Network-device event notifier; events are handled by ddp_device_event(). */
+static struct notifier_block ddp_notifier = {
+	.notifier_call	= ddp_device_event,
+};
+
+/* ETH_P_LOCALTALK frames are received by ltalk_rcv(). */
+static struct packet_type ltalk_packet_type __read_mostly = {
+	.type		= cpu_to_be16(ETH_P_LOCALTALK),
+	.func		= ltalk_rcv,
+};
+
+/* ETH_P_PPPTALK frames go straight to atalk_rcv(). */
+static struct packet_type ppptalk_packet_type __read_mostly = {
+	.type		= cpu_to_be16(ETH_P_PPPTALK),
+	.func		= atalk_rcv,
+};
+
+/* SNAP protocol identifier passed to register_snap_client() for DDP. */
+static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B };
+
+/* Export symbols for use by drivers when AppleTalk is a module */
+EXPORT_SYMBOL(atrtr_get_dev);
+EXPORT_SYMBOL(atalk_find_dev_addr);
+
+static const char atalk_err_snap[] __initconst =
+	KERN_CRIT "Unable to register DDP with SNAP.\n";
+
+/*
+ * Module/boot-time initialisation of the AppleTalk protocol family.
+ *
+ * Registers the protocol, the socket family, the SNAP client and the
+ * packet handlers, then brings up AARP, /proc and sysctl support.
+ *
+ * Returns 0 on success or a negative errno.  If the socket family or the
+ * SNAP client cannot be registered, everything registered so far is undone
+ * and initialisation fails: ddp_dl is dereferenced unconditionally on the
+ * send and routing paths, so continuing with a NULL ddp_dl is not safe.
+ */
+static int __init atalk_init(void)
+{
+	int rc = proto_register(&ddp_proto, 0);
+
+	if (rc != 0)
+		goto out;
+
+	rc = sock_register(&atalk_family_ops);
+	if (rc != 0)
+		goto out_proto;
+
+	ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv);
+	if (!ddp_dl) {
+		printk(atalk_err_snap);
+		rc = -ENOMEM;
+		goto out_sock;
+	}
+
+	dev_add_pack(&ltalk_packet_type);
+	dev_add_pack(&ppptalk_packet_type);
+
+	/*
+	 * NOTE(review): the results of the remaining set-up calls are still
+	 * not checked; their signatures are not visible here - confirm
+	 * before adding further unwinding.
+	 */
+	register_netdevice_notifier(&ddp_notifier);
+	aarp_proto_init();
+	atalk_proc_init();
+	atalk_register_sysctl();
+out:
+	return rc;
+
+out_sock:
+	sock_unregister(PF_APPLETALK);
+out_proto:
+	proto_unregister(&ddp_proto);
+	goto out;
+}
+module_init(atalk_init);
+
+/*
+ * No explicit module reference count manipulation is needed in the
+ * protocol. Socket layer sets module reference count for us
+ * and interfaces reference counting is done
+ * by the network device layer.
+ *
+ * Ergo, before the AppleTalk module can be removed, all AppleTalk
+ * sockets must be closed from user space.
+ *
+ * Teardown runs in the reverse order of atalk_init().
+ */
+static void __exit atalk_exit(void)
+{
+#ifdef CONFIG_SYSCTL
+	atalk_unregister_sysctl();
+#endif /* CONFIG_SYSCTL */
+	atalk_proc_exit();
+	aarp_cleanup_module();	/* General aarp clean-up. */
+	unregister_netdevice_notifier(&ddp_notifier);
+	dev_remove_pack(&ltalk_packet_type);
+	dev_remove_pack(&ppptalk_packet_type);
+	unregister_snap_client(ddp_dl);
+	sock_unregister(PF_APPLETALK);
+	proto_unregister(&ddp_proto);
+}
+module_exit(atalk_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Alan Cox <alan@lxorguk.ukuu.org.uk>");
+MODULE_DESCRIPTION("AppleTalk 0.20\n");
+MODULE_ALIAS_NETPROTO(PF_APPLETALK);
diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c
new file mode 100644
index 00000000..6c8016f6
--- /dev/null
+++ b/net/appletalk/dev.c
@@ -0,0 +1,44 @@
+/*
+ * Moved here from drivers/net/net_init.c, which is:
+ *	Written 1993,1994,1995 by Donald Becker.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/if_ltalk.h>
+
+/* Initialise @dev with the generic LocalTalk link parameters. */
+static void ltalk_setup(struct net_device *dev)
+{
+	/* Link type and behaviour flags. */
+	dev->type		= ARPHRD_LOCALTLK;
+	dev->flags		= IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP;
+
+	/* Framing and addressing sizes. */
+	dev->hard_header_len 	= LTALK_HLEN;
+	dev->addr_len		= LTALK_ALEN;
+	dev->mtu		= LTALK_MTU;
+
+	/* First byte of the link-level broadcast address. */
+	dev->broadcast[0]	= 0xFF;
+
+	dev->tx_queue_len	= 10;
+}
+
+/**
+ * alloc_ltalkdev - Allocates and sets up a LocalTalk device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ *	for this LocalTalk device
+ *
+ * Fill in the fields of the device structure with LocalTalk-generic
+ * values. Basically does everything except registering the device.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv.  A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ *
+ * The device name template is "lt%d" and ltalk_setup() is used as the
+ * setup callback. Returns whatever alloc_netdev() returns.
+ */
+
+struct net_device *alloc_ltalkdev(int sizeof_priv)
+{
+	return alloc_netdev(sizeof_priv, "lt%d", ltalk_setup);
+}
+EXPORT_SYMBOL(alloc_ltalkdev);
diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c
new file mode 100644
index 00000000..04e9c0da
--- /dev/null
+++ b/net/appletalk/sysctl_net_atalk.c
@@ -0,0 +1,61 @@
+/*
+ * sysctl_net_atalk.c: sysctl interface to net AppleTalk subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/atalk directory entry (empty =) ). [MS]
+ * Dynamic registration, added aarp entries. (5/30/97 Chris Horn)
+ */
+
+#include <linux/sysctl.h>
+#include <net/sock.h>
+#include <linux/atalk.h>
+
+/*
+ * AARP tunables.  Each entry exposes one int-sized variable with mode 0644;
+ * the *_time entries go through proc_dointvec_jiffies, the retransmit limit
+ * through proc_dointvec.  The empty entry terminates the table.
+ */
+static struct ctl_table atalk_table[] = {
+	{
+		.procname	= "aarp-expiry-time",
+		.data		= &sysctl_aarp_expiry_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "aarp-tick-time",
+		.data		= &sysctl_aarp_tick_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "aarp-retransmit-limit",
+		.data		= &sysctl_aarp_retransmit_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "aarp-resolve-time",
+		.data		= &sysctl_aarp_resolve_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ },
+};
+
+/* Path components under which atalk_table is registered: net/appletalk. */
+static struct ctl_path atalk_path[] = {
+	{ .procname = "net", },
+	{ .procname = "appletalk", },
+	{ }
+};
+
+/* Handle returned by register_sysctl_paths(); needed for unregistering. */
+static struct ctl_table_header *atalk_table_header;
+
+/*
+ * Register the AARP sysctl table and remember the returned header.
+ * The result of register_sysctl_paths() is stored without being checked.
+ */
+void atalk_register_sysctl(void)
+{
+	atalk_table_header = register_sysctl_paths(atalk_path, atalk_table);
+}
+
+/* Remove the sysctl table registered by atalk_register_sysctl(). */
+void atalk_unregister_sysctl(void)
+{
+	unregister_sysctl_table(atalk_table_header);
+}
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 00000000..754ea103
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,73 @@
+#
+# Asynchronous Transfer Mode (ATM)
+#
+
+config ATM
+	tristate "Asynchronous Transfer Mode (ATM)"
+	---help---
+	  ATM is a high-speed networking technology for Local Area Networks
+	  and Wide Area Networks.  It uses a fixed packet size and is
+	  connection oriented, allowing for the negotiation of minimum
+	  bandwidth requirements.
+
+	  In order to participate in an ATM network, your Linux box needs an
+	  ATM networking card. If you have that, say Y here and to the driver
+	  of your ATM card below.
+
+	  Note that you need a set of user-space programs to actually make use
+	  of ATM.  See the file <file:Documentation/networking/atm.txt> for
+	  further details.
+
+config ATM_CLIP
+	tristate "Classical IP over ATM"
+	depends on ATM && INET
+	help
+	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
+	  ATMARP. If you want to communicate with other IP hosts on your ATM
+	  network, you will typically either say Y here or to "LAN Emulation
+	  (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+	bool "Do NOT send ICMP if no neighbour"
+	depends on ATM_CLIP
+	help
+	  Normally, an "ICMP host unreachable" message is sent if a neighbour
+	  cannot be reached because there is no VC to it in the kernel's
+	  ATMARP table. This may cause problems when ATMARP table entries are
+	  briefly removed during revalidation. If you say Y here, packets to
+	  such neighbours are silently discarded instead.
+
+config ATM_LANE
+	tristate "LAN Emulation (LANE) support"
+	depends on ATM
+	help
+	  LAN Emulation emulates services of existing LANs across an ATM
+	  network. Besides operating as a normal ATM end station client, Linux
+	  LANE client can also act as a proxy client bridging packets between
+	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+	tristate "Multi-Protocol Over ATM (MPOA) support"
+	depends on ATM && INET && ATM_LANE!=n
+	help
+	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
+	  bridges and ATM attached hosts to establish direct ATM VCs across
+	  subnetwork boundaries. These shortcut connections bypass routers
+	  enhancing overall network performance.
+
+config ATM_BR2684
+	tristate "RFC1483/2684 Bridged protocols"
+	depends on ATM && INET
+	help
+	  ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483)
+	  This device will act like an ethernet from the kernel's point of view,
+	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+	  This is sometimes used over DSL lines.  If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+	bool "Per-VC IP filter kludge"
+	depends on ATM_BR2684
+	help
+	  This is an experimental mechanism for users who need to terminate a
+	  large number of IP-only vcc's.  Do not enable this unless you are sure
+	  you know what you are doing.
diff --git a/net/atm/Makefile b/net/atm/Makefile
new file mode 100644
index 00000000..cc50bd1f
--- /dev/null
+++ b/net/atm/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the ATM Protocol Families.
+#
+
+atm-y		:= addr.o pvc.o signaling.o svc.o ioctl.o common.o atm_misc.o raw.o resources.o atm_sysfs.o
+mpoa-objs	:= mpc.o mpoa_caches.o mpoa_proc.o
+
+obj-$(CONFIG_ATM) += atm.o
+obj-$(CONFIG_ATM_CLIP) += clip.o
+obj-$(CONFIG_ATM_BR2684) += br2684.o
+atm-$(CONFIG_PROC_FS) += proc.o
+
+obj-$(CONFIG_ATM_LANE) += lec.o
+obj-$(CONFIG_ATM_MPOA) += mpoa.o
+obj-$(CONFIG_PPPOATM) += pppoatm.o
diff --git a/net/atm/addr.c b/net/atm/addr.c
new file mode 100644
index 00000000..dcda35c6
--- /dev/null
+++ b/net/atm/addr.c
@@ -0,0 +1,161 @@
+/* net/atm/addr.c - Local ATM address registry */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "signaling.h"
+#include "addr.h"
+
+/*
+ * Validate an SVC address supplied by a caller.
+ *
+ * Returns 0 if the address is usable, -EAFNOSUPPORT for the wrong address
+ * family, or -EINVAL if both address parts are empty or the public part is
+ * not NUL-terminated within its buffer.
+ */
+static int check_addr(const struct sockaddr_atmsvc *addr)
+{
+	const char *pub = addr->sas_addr.pub;
+	int pos;
+
+	if (addr->sas_family != AF_ATMSVC)
+		return -EAFNOSUPPORT;
+
+	/* No public part: the private part has to be present instead. */
+	if (!*pub) {
+		if (*addr->sas_addr.prv)
+			return 0;
+		return -EINVAL;
+	}
+
+	/* make sure it's \0-terminated */
+	pos = 1;
+	while (pos <= ATM_E164_LEN) {
+		if (!pub[pos])
+			return 0;
+		pos++;
+	}
+	return -EINVAL;
+}
+
+/*
+ * Compare two SVC addresses.  Returns non-zero if they match.
+ *
+ * The private parts are compared only when @a has one.  The public parts
+ * match if both are empty or both hold the same string.
+ */
+static int identical(const struct sockaddr_atmsvc *a, const struct sockaddr_atmsvc *b)
+{
+	int a_has_pub = *a->sas_addr.pub != 0;
+	int b_has_pub = *b->sas_addr.pub != 0;
+
+	if (*a->sas_addr.prv &&
+	    memcmp(a->sas_addr.prv, b->sas_addr.prv, ATM_ESA_LEN) != 0)
+		return 0;
+
+	/* With at least one public part missing, both must be missing. */
+	if (!a_has_pub || !b_has_pub)
+		return !a_has_pub && !b_has_pub;
+
+	return strcmp(a->sas_addr.pub, b->sas_addr.pub) == 0;
+}
+
+/*
+ * Send an as_itf_notify message for @dev through sigd_enq().
+ *
+ * Only the interface number is meaningful here; the rest of the PVC
+ * address is zeroed so that no uninitialised stack bytes are handed on.
+ * NOTE(review): assumes sigd_enq() may read or forward the whole
+ * structure - confirm against its implementation.
+ */
+static void notify_sigd(const struct atm_dev *dev)
+{
+	struct sockaddr_atmpvc pvc;
+
+	memset(&pvc, 0, sizeof(pvc));
+	pvc.sap_addr.itf = dev->number;
+	sigd_enq(NULL, as_itf_notify, NULL, &pvc, NULL);
+}
+
+/*
+ * Drop every address on one of @dev's address lists: the LECS list for
+ * ATM_ADDR_LECS, the local list otherwise.  notify_sigd() is called after
+ * the local list has been cleared.
+ */
+void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t atype)
+{
+	struct list_head *head = (atype == ATM_ADDR_LECS) ? &dev->lecs
+							  : &dev->local;
+	struct atm_dev_addr *cur, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->lock, flags);
+	list_for_each_entry_safe(cur, next, head, entry) {
+		list_del(&cur->entry);
+		kfree(cur);
+	}
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	if (head != &dev->local)
+		return;
+	notify_sigd(dev);
+}
+
+/*
+ * Add @addr to one of @dev's address lists (LECS list for ATM_ADDR_LECS,
+ * local list otherwise).
+ *
+ * Returns 0 on success, the check_addr() error for an invalid address,
+ * -EEXIST if an identical address is already listed, or -ENOMEM.
+ * notify_sigd() is called after a successful addition to the local list.
+ */
+int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+		 enum atm_addr_type_t atype)
+{
+	struct list_head *head = (atype == ATM_ADDR_LECS) ? &dev->lecs
+							  : &dev->local;
+	struct atm_dev_addr *this;
+	unsigned long flags;
+	int error;
+
+	error = check_addr(addr);
+	if (error)
+		return error;
+
+	spin_lock_irqsave(&dev->lock, flags);
+
+	/* Refuse duplicates. */
+	list_for_each_entry(this, head, entry) {
+		if (identical(&this->addr, addr)) {
+			error = -EEXIST;
+			goto unlock;
+		}
+	}
+
+	/* Allocation happens under the spinlock, hence GFP_ATOMIC. */
+	this = kmalloc(sizeof(struct atm_dev_addr), GFP_ATOMIC);
+	if (!this) {
+		error = -ENOMEM;
+		goto unlock;
+	}
+	this->addr = *addr;
+	list_add(&this->entry, head);
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	if (head == &dev->local)
+		notify_sigd(dev);
+	return 0;
+
+unlock:
+	spin_unlock_irqrestore(&dev->lock, flags);
+	return error;
+}
+
+/*
+ * Remove the first entry identical to @addr from one of @dev's address
+ * lists (LECS list for ATM_ADDR_LECS, local list otherwise).
+ *
+ * Returns 0 on success, the check_addr() error for an invalid address, or
+ * -ENOENT if no matching entry exists.  notify_sigd() is called after a
+ * removal from the local list.
+ */
+int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+		 enum atm_addr_type_t atype)
+{
+	struct list_head *head = (atype == ATM_ADDR_LECS) ? &dev->lecs
+							  : &dev->local;
+	struct atm_dev_addr *this, *victim = NULL;
+	unsigned long flags;
+	int error;
+
+	error = check_addr(addr);
+	if (error)
+		return error;
+
+	/* Unlink the match under the lock; free it afterwards. */
+	spin_lock_irqsave(&dev->lock, flags);
+	list_for_each_entry(this, head, entry) {
+		if (identical(&this->addr, addr)) {
+			list_del(&this->entry);
+			victim = this;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	if (!victim)
+		return -ENOENT;
+
+	kfree(victim);
+	if (head == &dev->local)
+		notify_sigd(dev);
+	return 0;
+}
+
+/*
+ * Copy one of @dev's address lists (LECS list for ATM_ADDR_LECS, local
+ * list otherwise) to the user buffer @buf of @size bytes.
+ *
+ * The list is snapshotted into a temporary buffer under the device lock
+ * and copied to user space after the lock is released.  At most @size
+ * bytes are copied.
+ *
+ * Returns the total size of the list in bytes, -E2BIG if that exceeds
+ * @size, -ENOMEM, or -EFAULT if the copy to user space fails.
+ */
+int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user * buf,
+		 size_t size, enum atm_addr_type_t atype)
+{
+	struct list_head *head = (atype == ATM_ADDR_LECS) ? &dev->lecs
+							  : &dev->local;
+	struct sockaddr_atmsvc *snapshot, *slot;
+	struct atm_dev_addr *this;
+	unsigned long flags;
+	size_t to_copy;
+	int total = 0, error;
+
+	spin_lock_irqsave(&dev->lock, flags);
+
+	/* First pass: work out how much room the snapshot needs. */
+	list_for_each_entry(this, head, entry)
+		total += sizeof(struct sockaddr_atmsvc);
+
+	snapshot = kmalloc(total, GFP_ATOMIC);
+	if (!snapshot) {
+		spin_unlock_irqrestore(&dev->lock, flags);
+		return -ENOMEM;
+	}
+
+	/* Second pass: copy every entry into the snapshot. */
+	slot = snapshot;
+	list_for_each_entry(this, head, entry) {
+		memcpy(slot, &this->addr, sizeof(struct sockaddr_atmsvc));
+		slot++;
+	}
+	spin_unlock_irqrestore(&dev->lock, flags);
+
+	if (total > size) {
+		error = -E2BIG;
+		to_copy = size;
+	} else {
+		error = total;
+		to_copy = total;
+	}
+	if (copy_to_user(buf, snapshot, to_copy))
+		error = -EFAULT;
+
+	kfree(snapshot);
+	return error;
+}
diff --git a/net/atm/addr.h b/net/atm/addr.h
new file mode 100644
index 00000000..6837e9e7
--- /dev/null
+++ b/net/atm/addr.h
@@ -0,0 +1,20 @@
+/* net/atm/addr.h - Local ATM address registry */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_ADDR_H
+#define NET_ATM_ADDR_H
+
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+
+/*
+ * In all four functions, ATM_ADDR_LECS selects the device's LECS address
+ * list and any other type the local address list.
+ */
+
+/* Remove every address from the selected list. */
+void atm_reset_addr(struct atm_dev *dev, enum atm_addr_type_t type);
+/* Add @addr to the selected list; 0 or a negative errno. */
+int atm_add_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+		 enum atm_addr_type_t type);
+/* Remove @addr from the selected list; 0 or a negative errno. */
+int atm_del_addr(struct atm_dev *dev, const struct sockaddr_atmsvc *addr,
+		 enum atm_addr_type_t type);
+/* Copy the selected list to @buf; byte count or a negative errno. */
+int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user *buf,
+		 size_t size, enum atm_addr_type_t type);
+
+#endif
diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c
new file mode 100644
index 00000000..fc63526d
--- /dev/null
+++ b/net/atm/atm_misc.c
@@ -0,0 +1,101 @@
+/* net/atm/atm_misc.c - Various functions for use by ATM drivers */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL ICA */
+
+#include <linux/module.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/skbuff.h>
+#include <linux/sonet.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <asm/atomic.h>
+/* Charge @truesize to @vcc's socket; returns 1 if within sk_rcvbuf, else 0. */
+int atm_charge(struct atm_vcc *vcc, int truesize)
+{
+	atm_force_charge(vcc, truesize);
+	if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf)
+		return 1;
+	atm_return(vcc, truesize);	/* presumably reverses atm_force_charge() */
+	atomic_inc(&vcc->stats->rx_drop);	/* rejected charges count as rx drops */
+	return 0;
+}
+EXPORT_SYMBOL(atm_charge);
+/* Allocate an skb for a @pdu_size PDU charged to @vcc; NULL if over budget. */
+struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc, int pdu_size,
+				 gfp_t gfp_flags)
+{
+	struct sock *sk = sk_atm(vcc);
+	int guess = atm_guess_pdu2truesize(pdu_size);
+	struct sk_buff *skb = NULL;
+
+	atm_force_charge(vcc, guess);
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		skb = alloc_skb(pdu_size, gfp_flags);
+	if (!skb) {
+		/* over sk_rcvbuf, or the allocation itself failed */
+		atm_return(vcc, guess);
+		atomic_inc(&vcc->stats->rx_drop);
+		return NULL;
+	}
+	/* swap the estimated charge for the skb's real truesize */
+	atomic_add(skb->truesize - guess, &sk->sk_rmem_alloc);
+	return skb;
+}
+EXPORT_SYMBOL(atm_alloc_charge);
+
+
+/*
+ * atm_pcr_goal returns the positive PCR if it should be rounded up, the
+ * negative PCR if it should be rounded down, and zero if the maximum available
+ * bandwidth should be used.
+ *
+ * The rules are as follows (* = maximum, - = absent (0), x = value "x",
+ * (x+ = x or next value above x, x- = x or next value below):
+ *
+ *	min max pcr	result		min max pcr	result
+ *	-   -   -	* (UBR only)	x   -   -	x+
+ *	-   -   *	*		x   -   *	*
+ *	-   -   z	z-		x   -   z	z-
+ *	-   *   -	*		x   *   -	x+
+ *	-   *   *	*		x   *   *	*
+ *	-   *   z	z-		x   *   z	z-
+ *	-   y   -	y-		x   y   -	x+
+ *	-   y   *	y-		x   y   *	y-
+ *	-   y   z	z-		x   y   z	z-
+ *
+ * All non-error cases can be converted with the following simple set of rules:
+ *
+ *   if pcr == z then z-
+ *   else if min == x && pcr == - then x+
+ *     else if max == y then y-
+ *	 else *
+ */
+
+int atm_pcr_goal(const struct atm_trafprm *tp)
+{
+	if (tp->pcr != 0 && tp->pcr != ATM_MAX_PCR)
+		return -tp->pcr;	/* explicit PCR: round down */
+	if (tp->pcr == 0 && tp->min_pcr != 0)
+		return tp->min_pcr;	/* only a minimum: round up */
+	if (tp->max_pcr == ATM_MAX_PCR)
+		return 0;		/* use the maximum available */
+	return -tp->max_pcr;
+}
+EXPORT_SYMBOL(atm_pcr_goal);
+/* Read the counters listed by __SONET_ITEMS atomically from @from into @to. */
+void sonet_copy_stats(struct k_sonet_stats *from, struct sonet_stats *to)
+{
+#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i)
+	__SONET_ITEMS	/* presumably one __HANDLE_ITEM() per counter -- confirm */
+#undef __HANDLE_ITEM
+}
+EXPORT_SYMBOL(sonet_copy_stats);
+/* Subtract each value in @to from the matching atomic counter in @from. */
+void sonet_subtract_stats(struct k_sonet_stats *from, struct sonet_stats *to)
+{
+#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i)
+	__SONET_ITEMS	/* presumably one __HANDLE_ITEM() per counter -- confirm */
+#undef __HANDLE_ITEM
+}
+EXPORT_SYMBOL(sonet_subtract_stats);
diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c
new file mode 100644
index 00000000..f49da581
--- /dev/null
+++ b/net/atm/atm_sysfs.c
@@ -0,0 +1,200 @@
+/* ATM driver model support. */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/atmdev.h>
+#include "common.h"
+#include "resources.h"
+
+#define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev)
+
+static ssize_t show_type(struct device *cdev,
+			 struct device_attribute *attr, char *buf)
+{
+	/* sysfs "type": the device's type string plus a newline */
+	return sprintf(buf, "%s\n", to_atm_dev(cdev)->type);
+}
+
+static ssize_t show_address(struct device *cdev,
+			    struct device_attribute *attr, char *buf)
+{
+	const struct atm_dev *adev = to_atm_dev(cdev);
+	ssize_t len = 0;
+	int idx;
+
+	/* every ESI byte but the last is followed by ':' */
+	for (idx = 0; idx < ESI_LEN - 1; idx++)
+		len += sprintf(buf + len, "%02x:", adev->esi[idx]);
+	len += sprintf(buf + len, "%02x\n", adev->esi[ESI_LEN - 1]);
+	return len;
+}
+/* sysfs "atmaddress": one dotted-hex line per entry of adev->local. */
+static ssize_t show_atmaddress(struct device *cdev,
+			       struct device_attribute *attr, char *buf)
+{
+	static const int bin[] = { 1, 2, 10, 6, 1 };	/* bytes per dotted group */
+	struct atm_dev *adev = to_atm_dev(cdev);
+	struct atm_dev_addr *aaddr;
+	unsigned long flags;
+	const int *fmt;	/* restarted per address so it never runs off bin[] */
+	ssize_t len = 0;	/* output is clamped to the PAGE_SIZE sysfs buffer */
+	int i, j;
+
+	spin_lock_irqsave(&adev->lock, flags);
+	list_for_each_entry(aaddr, &adev->local, entry) {
+		for (i = 0, j = 0, fmt = bin; i < ATM_ESA_LEN; ++i, ++j) {
+			if (j == *fmt) {
+				len += scnprintf(buf + len, PAGE_SIZE - len, ".");
+				++fmt;
+				j = 0;
+			}
+			len += scnprintf(buf + len, PAGE_SIZE - len, "%02x",
+					 aaddr->addr.sas_addr.prv[i]);
+		}
+		len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+	}
+	spin_unlock_irqrestore(&adev->lock, flags);
+	return len;
+}
+
+static ssize_t show_atmindex(struct device *cdev,
+			     struct device_attribute *attr, char *buf)
+{
+	const int itf = to_atm_dev(cdev)->number;	/* sysfs "atmindex" */
+
+	return sprintf(buf, "%d\n", itf);
+}
+
+static ssize_t show_carrier(struct device *cdev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct atm_dev *adev = to_atm_dev(cdev);
+	int carrier_up;
+
+	/* anything other than "signal lost" is reported as carrier present */
+	carrier_up = adev->signal != ATM_PHY_SIG_LOST;
+
+	return sprintf(buf, "%d\n", carrier_up);
+}
+
+static ssize_t show_link_rate(struct device *cdev,
+			      struct device_attribute *attr, char *buf)
+{
+	const struct atm_dev *adev = to_atm_dev(cdev);
+	const int pcr = adev->link_rate;
+	int bps;
+
+	/*
+	 * Show the link rate, not the data rate.
+	 *
+	 * The well-known PCR constants map to their nominal line rates;
+	 * any other value is multiplied by 8 * 53 (presumably bits per
+	 * 53-byte cell, i.e. link_rate is a cell rate -- TODO confirm).
+	 */
+	if (pcr == ATM_OC3_PCR)
+		bps = 155520000;
+	else if (pcr == ATM_OC12_PCR)
+		bps = 622080000;
+	else if (pcr == ATM_25_PCR)
+		bps = 25600000;
+	else
+		bps = pcr * 8 * 53;
+
+	return sprintf(buf, "%d\n", bps);
+}
+/* All attributes are world-readable (S_IRUGO) and have no store handler. */
+static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
+static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL);
+static DEVICE_ATTR(atmindex, S_IRUGO, show_atmindex, NULL);
+static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL);
+static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
+static DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL);
+
+static struct device_attribute *atm_attrs[] = {
+	&dev_attr_atmaddress,
+	&dev_attr_address,
+	&dev_attr_atmindex,
+	&dev_attr_carrier,
+	&dev_attr_type,
+	&dev_attr_link_rate,
+	NULL	/* terminator: atm_register_sysfs() stops at the first NULL */
+};
+
+
+static int atm_uevent(struct device *cdev, struct kobj_uevent_env *env)
+{
+	struct atm_dev *adev = cdev ? to_atm_dev(cdev) : NULL;
+	int err;
+
+	/* no device (or no enclosing atm_dev) to describe */
+	if (!adev)
+		return -ENODEV;
+
+	/* export NAME=<type><number>; any failure is reported as -ENOMEM */
+	err = add_uevent_var(env, "NAME=%s%d", adev->type, adev->number);
+	if (err)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void atm_release(struct device *cdev)
+{
+	/* class_dev is embedded in struct atm_dev (see to_atm_dev), */
+	/* so releasing the device means freeing the whole container. */
+	kfree(to_atm_dev(cdev));
+}
+/* The "atm" device class; wires in the release and uevent callbacks above. */
+static struct class atm_class = {
+	.name		= "atm",
+	.dev_release	= atm_release,
+	.dev_uevent		= atm_uevent,
+};
+/* Register adev->class_dev in the "atm" class and create its attribute files. */
+int atm_register_sysfs(struct atm_dev *adev, struct device *parent)
+{
+	struct device *cdev = &adev->class_dev;
+	int i, j, err;
+
+	cdev->class = &atm_class;
+	cdev->parent = parent;
+	dev_set_drvdata(cdev, adev);
+
+	dev_set_name(cdev, "%s%d", adev->type, adev->number);	/* result is not checked */
+	err = device_register(cdev);
+	if (err < 0)
+		return err;
+
+	for (i = 0; atm_attrs[i]; i++) {
+		err = device_create_file(cdev, atm_attrs[i]);
+		if (err)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	for (j = 0; j < i; j++)	/* remove only the files created so far */
+		device_remove_file(cdev, atm_attrs[j]);
+	device_del(cdev);
+	return err;
+}
+
+void atm_unregister_sysfs(struct atm_dev *adev)
+{
+	/* counterpart of atm_register_sysfs(): take the embedded device */
+	/* back out of the driver model */
+	device_del(&adev->class_dev);
+}
+/* Register the "atm" class; returns class_register()'s result. */
+int __init atm_sysfs_init(void)
+{
+	return class_register(&atm_class);
+}
+/* Unregister the "atm" class registered by atm_sysfs_init(). */
+void __exit atm_sysfs_exit(void)
+{
+	class_unregister(&atm_class);
+}
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
new file mode 100644
index 00000000..d07223c8
--- /dev/null
+++ b/net/atm/br2684.c
@@ -0,0 +1,847 @@
+/*
+ * Ethernet netdevice using ATM AAL5 as underlying carrier
+ * (RFC1483 obsoleted by RFC2684) for Linux
+ *
+ * Authors: Marcell GAL, 2000, XDSL Ltd, Hungary
+ *          Eric Kinzie, 2006-2007, US Naval Research Laboratory
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+
+#include <linux/atmbr2684.h>
+
+#include "common.h"
+/* Hex-dump the start of @skb at KERN_DEBUG; an empty function unless SKB_DEBUG. */
+static void skb_debug(const struct sk_buff *skb)
+{
+#ifdef SKB_DEBUG
+#define NUM2PRINT 50
+	print_hex_dump(KERN_DEBUG, "br2684: skb: ", DUMP_OFFSET,
+		       16, 1, skb->data, min(NUM2PRINT, skb->len), true);
+#endif
+}
+/* Header lengths and byte patterns for the encapsulations handled below. */
+#define BR2684_ETHERTYPE_LEN	2
+#define BR2684_PAD_LEN		2
+/* Comma-separated byte lists, spliced into the array initializers below. */
+#define LLC		0xaa, 0xaa, 0x03
+#define SNAP_BRIDGED	0x00, 0x80, 0xc2
+#define SNAP_ROUTED	0x00, 0x00, 0x00
+#define PID_ETHERNET	0x00, 0x07
+#define ETHERTYPE_IPV4	0x08, 0x00
+#define ETHERTYPE_IPV6	0x86, 0xdd
+#define PAD_BRIDGED	0x00, 0x00
+/* Prebuilt headers used by br2684_xmit_vcc() and br2684_push(). */
+static const unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 };
+static const unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 };
+static const unsigned char llc_oui_pid_pad[] =
+			{ LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED };
+static const unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 };
+static const unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 };
+/* Per-VCC encapsulation, mirroring the BR2684_ENCAPS_* ioctl values. */
+enum br2684_encaps {
+	e_vc = BR2684_ENCAPS_VC,	/* no LLC/SNAP header on the wire */
+	e_llc = BR2684_ENCAPS_LLC,	/* LLC/SNAP header in front of the payload */
+};
+/* Per-VCC state; reachable from the atm_vcc through user_back (see BR2684_VCC). */
+struct br2684_vcc {
+	struct atm_vcc *atmvcc;
+	struct net_device *device;
+	/* keep old push, pop functions for chaining */
+	void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb);
+	void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb);
+	enum br2684_encaps encaps;
+	struct list_head brvccs;	/* entry in br2684_dev.brvccs */
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+	struct br2684_filter filter;	/* netmask == 0 means no filter */
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+	unsigned copies_needed, copies_failed;	/* headroom reallocations in xmit */
+};
+/* Private area of each br2684 net_device (what BRPRIV() returns). */
+struct br2684_dev {
+	struct net_device *net_dev;	/* back-pointer, set by the setup functions */
+	struct list_head br2684_devs;	/* entry in the global br2684_devs list */
+	int number;	/* assigned in br2684_create(), starting at 1 */
+	struct list_head brvccs;	/* one device <=> one vcc (before xmas) */
+	int mac_was_set;	/* set once br2684_mac_addr() succeeded */
+	enum br2684_payload payload;
+};
+
+/*
+ * This lock should be held for writing any time the list of devices or
+ * their attached vcc's could be altered.  It should be held for reading
+ * any time these are being queried.  Note that we sometimes need to
+ * do read-locking under interrupt context, so write locking must block
+ * the current CPU's interrupts
+ */
+static DEFINE_RWLOCK(devs_lock);
+
+static LIST_HEAD(br2684_devs);
+/* net_device -> its br2684_dev, which lives in the netdev private area. */
+static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev)
+{
+	return netdev_priv(net_dev);
+}
+/* Node of the global br2684_devs list -> the owning net_device. */
+static inline struct net_device *list_entry_brdev(const struct list_head *le)
+{
+	return list_entry(le, struct br2684_dev, br2684_devs)->net_dev;
+}
+/* The br2684 state of a VCC is kept in its user_back pointer. */
+static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc)
+{
+	return atmvcc->user_back;
+}
+/* Node of a br2684_dev.brvccs list -> the br2684_vcc containing it. */
+static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le)
+{
+	return list_entry(le, struct br2684_vcc, brvccs);
+}
+
+/*
+ * Look up a br2684 interface by number or by name, as selected by
+ * s->method.  Caller should hold read_lock(&devs_lock).
+ */
+static struct net_device *br2684_find_dev(const struct br2684_if_spec *s)
+{
+	struct br2684_dev *brdev;
+
+	if (s->method == BR2684_FIND_BYNUM) {
+		list_for_each_entry(brdev, &br2684_devs, br2684_devs) {
+			if (brdev->number == s->spec.devnum)
+				return brdev->net_dev;
+		}
+	} else if (s->method == BR2684_FIND_BYIFNAME) {
+		list_for_each_entry(brdev, &br2684_devs, br2684_devs) {
+			if (!strncmp(brdev->net_dev->name, s->spec.ifname,
+				     IFNAMSIZ))
+				return brdev->net_dev;
+		}
+	}
+
+	/* unknown method or no match */
+	return NULL;
+}
+/* ATM device notifier: mirror the PHY signal state onto attached netdevs. */
+static int atm_dev_event(struct notifier_block *this, unsigned long event,
+		 void *arg)
+{
+	struct atm_dev *atm_dev = arg;
+	struct list_head *lh;
+	struct net_device *net_dev;
+	struct br2684_vcc *brvcc;
+	struct atm_vcc *atm_vcc;
+	unsigned long flags;
+
+	pr_debug("event=%ld dev=%p\n", event, atm_dev);
+
+	read_lock_irqsave(&devs_lock, flags);
+	list_for_each(lh, &br2684_devs) {
+		net_dev = list_entry_brdev(lh);
+
+		list_for_each_entry(brvcc, &BRPRIV(net_dev)->brvccs, brvccs) {
+			atm_vcc = brvcc->atmvcc;
+			if (atm_vcc && brvcc->atmvcc->dev == atm_dev) {	/* VCC on this device */
+
+				if (atm_vcc->dev->signal == ATM_PHY_SIG_LOST)
+					netif_carrier_off(net_dev);
+				else
+					netif_carrier_on(net_dev);
+
+			}
+		}
+	}
+	read_unlock_irqrestore(&devs_lock, flags);
+
+	return NOTIFY_DONE;	/* @event itself is only logged, never inspected */
+}
+/* Registered in br2684_init(), unregistered in br2684_exit(). */
+static struct notifier_block atm_dev_notifier = {
+	.notifier_call = atm_dev_event,
+};
+
+/* chained vcc->pop function.  Check if we should wake the netif_queue */
+static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct br2684_vcc *brvcc = BR2684_VCC(vcc);
+	struct net_device *net_dev = skb->dev;	/* read before old_pop() gets the skb */
+
+	pr_debug("(vcc %p ; net_dev %p )\n", vcc, net_dev);
+	brvcc->old_pop(vcc, skb);
+
+	if (!net_dev)	/* skb carried no device: nothing to wake */
+		return;
+
+	if (atm_may_send(vcc, 0))
+		netif_wake_queue(net_dev);
+
+}
+/*
+ * Send a packet out a particular vcc.  Not too useful right now, but paves
+ * the way for multiple vcc's per itf.  Returns true if we can send,
+ * otherwise false; the skb has been freed or handed to send() either way.
+ */
+static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev,
+			   struct br2684_vcc *brvcc)
+{
+	struct br2684_dev *brdev = BRPRIV(dev);
+	struct atm_vcc *atmvcc;
+	int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2;	/* largest header pushed below */
+
+	if (skb_headroom(skb) < minheadroom) {
+		struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom);
+		brvcc->copies_needed++;
+		dev_kfree_skb(skb);	/* the original is dropped whether or not the copy worked */
+		if (skb2 == NULL) {
+			brvcc->copies_failed++;
+			return 0;
+		}
+		skb = skb2;
+	}
+
+	if (brvcc->encaps == e_llc) {
+		if (brdev->payload == p_bridged) {
+			skb_push(skb, sizeof(llc_oui_pid_pad));
+			skb_copy_to_linear_data(skb, llc_oui_pid_pad,
+						sizeof(llc_oui_pid_pad));
+		} else if (brdev->payload == p_routed) {
+			unsigned short prot = ntohs(skb->protocol);
+
+			skb_push(skb, sizeof(llc_oui_ipv4));	/* same size as llc_oui_ipv6 */
+			switch (prot) {
+			case ETH_P_IP:
+				skb_copy_to_linear_data(skb, llc_oui_ipv4,
+							sizeof(llc_oui_ipv4));
+				break;
+			case ETH_P_IPV6:
+				skb_copy_to_linear_data(skb, llc_oui_ipv6,
+							sizeof(llc_oui_ipv6));
+				break;
+			default:	/* only IPv4 and IPv6 can be sent routed */
+				dev_kfree_skb(skb);
+				return 0;
+			}
+		}
+	} else { /* e_vc */
+		if (brdev->payload == p_bridged) {
+			skb_push(skb, 2);	/* two zero pad bytes in front of the frame */
+			memset(skb->data, 0, 2);
+		}
+	}
+	skb_debug(skb);
+
+	ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
+	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
+	atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
+	ATM_SKB(skb)->atm_options = atmvcc->atm_options;
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+	atmvcc->send(atmvcc, skb);	/* return value of send() is not checked */
+
+	if (!atm_may_send(atmvcc, 0)) {
+		netif_stop_queue(brvcc->device);
+		/*check for race with br2684_pop*/
+		if (atm_may_send(atmvcc, 0))
+			netif_start_queue(brvcc->device);
+	}
+
+	return 1;
+}
+/* Return the first VCC attached to @brdev, or NULL if none; @skb is unused. */
+static inline struct br2684_vcc *pick_outgoing_vcc(const struct sk_buff *skb,
+						   const struct br2684_dev *brdev)
+{
+	return list_empty(&brdev->brvccs) ? NULL : list_entry_brvcc(brdev->brvccs.next);	/* 1 vcc/dev right now */
+}
+/* ndo_start_xmit: always returns NETDEV_TX_OK; failures only bump error stats. */
+static netdev_tx_t br2684_start_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
+{
+	struct br2684_dev *brdev = BRPRIV(dev);
+	struct br2684_vcc *brvcc;
+
+	pr_debug("skb_dst(skb)=%p\n", skb_dst(skb));
+	read_lock(&devs_lock);	/* keeps the VCC list stable while sending */
+	brvcc = pick_outgoing_vcc(skb, brdev);
+	if (brvcc == NULL) {
+		pr_debug("no vcc attached to dev %s\n", dev->name);
+		dev->stats.tx_errors++;
+		dev->stats.tx_carrier_errors++;
+		/* netif_stop_queue(dev); */
+		dev_kfree_skb(skb);
+		read_unlock(&devs_lock);
+		return NETDEV_TX_OK;
+	}
+	if (!br2684_xmit_vcc(skb, dev, brvcc)) {
+		/*
+		 * We should probably use netif_*_queue() here, but that
+		 * involves added complication.  We need to walk before
+		 * we can run.
+		 *
+		 * Don't free here! this pointer might be no longer valid!
+		 */
+		dev->stats.tx_errors++;
+		dev->stats.tx_fifo_errors++;
+	}
+	read_unlock(&devs_lock);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Note explicit MAC changes so the first VC's ESI never overrides them.
+ */
+static int br2684_mac_addr(struct net_device *dev, void *p)
+{
+	const int rc = eth_mac_addr(dev, p);
+
+	if (rc == 0)
+		BRPRIV(dev)->mac_was_set = 1;
+	return rc;
+}
+
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+/* Experimental ioctl: copy the filter in @arg onto @atmvcc's VCC or, when an */
+/* interface is named, onto that interface's only VCC (-ESRCH otherwise). */
+static int br2684_setfilt(struct atm_vcc *atmvcc, void __user * arg)
+{
+	struct br2684_vcc *brvcc;
+	struct br2684_filter_set fs;
+
+	if (copy_from_user(&fs, arg, sizeof fs))
+		return -EFAULT;
+	if (fs.ifspec.method != BR2684_FIND_BYNOTHING) {
+		struct net_device *net_dev;
+		struct br2684_dev *brdev;
+		read_lock(&devs_lock);
+		net_dev = br2684_find_dev(&fs.ifspec);
+		/* BRPRIV(NULL) is not NULL, so test the lookup result itself */
+		brdev = net_dev ? BRPRIV(net_dev) : NULL;
+		if (brdev == NULL || list_empty(&brdev->brvccs) ||
+		    brdev->brvccs.next != brdev->brvccs.prev)	/* >1 VCC */
+			brvcc = NULL;
+		else
+			brvcc = list_entry_brvcc(brdev->brvccs.next);
+		read_unlock(&devs_lock);
+		if (brvcc == NULL)
+			return -ESRCH;
+	} else
+		brvcc = BR2684_VCC(atmvcc);
+	memcpy(&brvcc->filter, &fs.filter, sizeof(brvcc->filter));
+	return 0;
+}
+
+/* Returns 1 if packet should be dropped */
+static inline int
+packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb)
+{
+	const struct br2684_filter *flt = &brvcc->filter;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+
+	if (flt->netmask == 0)
+		return 0;	/* no filter in place */
+	/*
+	 * TODO: we should probably filter ARPs too.. don't want to have
+	 * them returning values that don't make sense, or is that ok?
+	 */
+	if (type != htons(ETH_P_IP))
+		return type != htons(ETH_P_ARP);	/* ARP passes, rest drops */
+	/* IP passes only when daddr falls inside prefix/netmask */
+	return (iph->daddr & flt->netmask) != flt->prefix;
+}
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+/* Unlink @brvcc from its device, notify the previous push handler, free it. */
+static void br2684_close_vcc(struct br2684_vcc *brvcc)
+{
+	pr_debug("removing VCC %p from dev %p\n", brvcc, brvcc->device);
+	write_lock_irq(&devs_lock);
+	list_del(&brvcc->brvccs);
+	write_unlock_irq(&devs_lock);
+	brvcc->atmvcc->user_back = NULL;	/* what about vcc->recvq ??? */
+	brvcc->old_push(brvcc->atmvcc, NULL);	/* pass on the bad news */
+	kfree(brvcc);
+	module_put(THIS_MODULE);	/* pairs with __module_get() in br2684_regvcc() */
+}
+
+/* when AAL5 PDU comes in (skb == NULL instead means the VCC is going away): */
+static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
+{
+	struct br2684_vcc *brvcc = BR2684_VCC(atmvcc);
+	struct net_device *net_dev = brvcc->device;
+	struct br2684_dev *brdev = BRPRIV(net_dev);
+
+	pr_debug("\n");
+
+	if (unlikely(skb == NULL)) {
+		/* skb==NULL means VCC is being destroyed */
+		br2684_close_vcc(brvcc);
+		if (list_empty(&brdev->brvccs)) {	/* last VCC gone: drop the netdev too */
+			write_lock_irq(&devs_lock);
+			list_del(&brdev->br2684_devs);
+			write_unlock_irq(&devs_lock);
+			unregister_netdev(net_dev);
+			free_netdev(net_dev);
+		}
+		return;
+	}
+
+	skb_debug(skb);
+	atm_return(atmvcc, skb->truesize);
+	pr_debug("skb from brdev %p\n", brdev);
+	if (brvcc->encaps == e_llc) {
+
+		if (skb->len > 7 && skb->data[7] == 0x01)	/* presumably the FCS-preserved PID */
+			__skb_trim(skb, skb->len - 4);	/* cut the last 4 bytes */
+
+		/* accept packets that have "ipv[46]" in the snap header */
+		if ((skb->len >= (sizeof(llc_oui_ipv4))) &&
+		    (memcmp(skb->data, llc_oui_ipv4,
+			    sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) {
+			if (memcmp(skb->data + 6, ethertype_ipv6,
+				   sizeof(ethertype_ipv6)) == 0)
+				skb->protocol = htons(ETH_P_IPV6);
+			else if (memcmp(skb->data + 6, ethertype_ipv4,
+					sizeof(ethertype_ipv4)) == 0)
+				skb->protocol = htons(ETH_P_IP);
+			else
+				goto error;
+			skb_pull(skb, sizeof(llc_oui_ipv4));
+			skb_reset_network_header(skb);
+			skb->pkt_type = PACKET_HOST;
+		/*
+		 * Let us waste some time for checking the encapsulation.
+		 * Note, that only 7 char is checked so frames with a valid FCS
+		 * are also accepted (but FCS is not checked of course).
+		 */
+		} else if ((skb->len >= sizeof(llc_oui_pid_pad)) &&
+			   (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
+			skb_pull(skb, sizeof(llc_oui_pid_pad));
+			skb->protocol = eth_type_trans(skb, net_dev);
+		} else
+			goto error;
+
+	} else { /* e_vc */
+		if (brdev->payload == p_routed) {
+			struct iphdr *iph;
+
+			skb_reset_network_header(skb);
+			iph = ip_hdr(skb);	/* NOTE(review): skb->len is not checked first */
+			if (iph->version == 4)
+				skb->protocol = htons(ETH_P_IP);
+			else if (iph->version == 6)
+				skb->protocol = htons(ETH_P_IPV6);
+			else
+				goto error;
+			skb->pkt_type = PACKET_HOST;
+		} else { /* p_bridged */
+			/* first 2 chars should be 0 */
+			if (*((u16 *) (skb->data)) != 0)	/* NOTE(review): no skb->len check */
+				goto error;
+			skb_pull(skb, BR2684_PAD_LEN);
+			skb->protocol = eth_type_trans(skb, net_dev);
+		}
+	}
+
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+	if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb)))
+		goto dropped;
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+	skb->dev = net_dev;
+	ATM_SKB(skb)->vcc = atmvcc;	/* needed ? */
+	pr_debug("received packet's protocol: %x\n", ntohs(skb->protocol));
+	skb_debug(skb);
+	/* sigh, interface is down? */
+	if (unlikely(!(net_dev->flags & IFF_UP)))
+		goto dropped;
+	net_dev->stats.rx_packets++;
+	net_dev->stats.rx_bytes += skb->len;
+	memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+	netif_rx(skb);	/* the skb must not be touched after this */
+	return;
+
+dropped:
+	net_dev->stats.rx_dropped++;
+	goto free_skb;
+error:
+	net_dev->stats.rx_errors++;
+free_skb:
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Assign a vcc to a dev and re-deliver PDUs already queued on its socket.
+ * Note: we do not have explicit unassign, but look at _push()
+ */
+static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
+{
+	struct sk_buff_head queue;
+	int err;
+	struct br2684_vcc *brvcc;
+	struct sk_buff *skb, *tmp;
+	struct sk_buff_head *rq;
+	struct br2684_dev *brdev;
+	struct net_device *net_dev;
+	struct atm_backend_br2684 be;
+	unsigned long flags;
+
+	if (copy_from_user(&be, arg, sizeof be))
+		return -EFAULT;
+	brvcc = kzalloc(sizeof(struct br2684_vcc), GFP_KERNEL);
+	if (!brvcc)
+		return -ENOMEM;
+	write_lock_irq(&devs_lock);
+	net_dev = br2684_find_dev(&be.ifspec);
+	if (net_dev == NULL) {
+		pr_err("tried to attach to non-existent device\n");
+		err = -ENXIO;
+		goto error;
+	}
+	brdev = BRPRIV(net_dev);
+	if (atmvcc->push == NULL) {
+		err = -EBADFD;
+		goto error;
+	}
+	if (!list_empty(&brdev->brvccs)) {
+		/* Only 1 VCC/dev right now */
+		err = -EEXIST;
+		goto error;
+	}
+	if (be.fcs_in != BR2684_FCSIN_NO ||
+	    be.fcs_out != BR2684_FCSOUT_NO ||
+	    be.fcs_auto || be.has_vpiid || be.send_padding ||
+	    (be.encaps != BR2684_ENCAPS_VC &&
+	     be.encaps != BR2684_ENCAPS_LLC) ||
+	    be.min_size != 0) {
+		err = -EINVAL;
+		goto error;
+	}
+	pr_debug("vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, brvcc);
+	if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) {
+		unsigned char *esi = atmvcc->dev->esi;
+		if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5])
+			memcpy(net_dev->dev_addr, esi, net_dev->addr_len);
+		else
+			net_dev->dev_addr[2] = 1;
+	}
+	/* fill in brvcc completely before the list makes it reachable */
+	brvcc->device = net_dev;
+	brvcc->atmvcc = atmvcc;
+	brvcc->encaps = (enum br2684_encaps)be.encaps;
+	brvcc->old_push = atmvcc->push;
+	brvcc->old_pop = atmvcc->pop;
+	list_add(&brvcc->brvccs, &brdev->brvccs);
+	write_unlock_irq(&devs_lock);
+	atmvcc->user_back = brvcc;
+	barrier();
+	atmvcc->push = br2684_push;
+	atmvcc->pop = br2684_pop;
+
+	__skb_queue_head_init(&queue);
+	rq = &sk_atm(atmvcc)->sk_receive_queue;
+	spin_lock_irqsave(&rq->lock, flags);
+	skb_queue_splice_init(rq, &queue);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	/*
+	 * Replay PDUs that reached the socket before the push hook was
+	 * swapped.  br2684_push() consumes the skb (it may already be freed
+	 * on return), so unlink it first and never touch it afterwards.
+	 */
+	skb_queue_walk_safe(&queue, skb, tmp) {
+		__skb_unlink(skb, &queue);
+		br2684_push(atmvcc, skb);
+	}
+
+	/* initialize netdev carrier state */
+	if (atmvcc->dev->signal == ATM_PHY_SIG_LOST)
+		netif_carrier_off(net_dev);
+	else
+		netif_carrier_on(net_dev);
+
+	__module_get(THIS_MODULE);
+	return 0;
+
+error:
+	write_unlock_irq(&devs_lock);
+	kfree(brvcc);
+	return err;
+}
+/* Ops installed by br2684_setup() (Ethernet-style interfaces). */
+static const struct net_device_ops br2684_netdev_ops = {
+	.ndo_start_xmit 	= br2684_start_xmit,
+	.ndo_set_mac_address	= br2684_mac_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+/* Ops installed by br2684_setup_routed(); no ndo_validate_addr hook. */
+static const struct net_device_ops br2684_netdev_ops_routed = {
+	.ndo_start_xmit 	= br2684_start_xmit,
+	.ndo_set_mac_address	= br2684_mac_addr,
+	.ndo_change_mtu		= eth_change_mtu
+};
+
+static void br2684_setup(struct net_device *netdev)
+{
+	struct br2684_dev *priv = BRPRIV(netdev);
+
+	/* private state first: back-pointer and an empty VCC list */
+	priv->net_dev = netdev;
+	INIT_LIST_HEAD(&priv->brvccs);
+	/* then the generic Ethernet defaults plus our ops */
+	ether_setup(netdev);
+	netdev->netdev_ops = &br2684_netdev_ops;
+}
+/* alloc_netdev() setup hook for routed interfaces: no link-layer header or address. */
+static void br2684_setup_routed(struct net_device *netdev)
+{
+	struct br2684_dev *brdev = BRPRIV(netdev);
+
+	brdev->net_dev = netdev;
+	netdev->hard_header_len = 0;
+	netdev->netdev_ops = &br2684_netdev_ops_routed;
+	netdev->addr_len = 0;	/* makes the ESI memcpy in br2684_regvcc() a no-op */
+	netdev->mtu = 1500;	/* the only MTU br2684_create() accepts */
+	netdev->type = ARPHRD_PPP;
+	netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+	netdev->tx_queue_len = 100;
+	INIT_LIST_HEAD(&brdev->brvccs);
+}
+/* ATM_NEWBACKENDIF: create and register the br2684 netdev described by @arg. */
+static int br2684_create(void __user *arg)
+{
+	int err;
+	struct net_device *netdev;
+	struct br2684_dev *brdev;
+	struct atm_newif_br2684 ni;
+	enum br2684_payload payload;
+
+	pr_debug("\n");
+
+	if (copy_from_user(&ni, arg, sizeof ni))
+		return -EFAULT;
+
+	if (ni.media & BR2684_FLAG_ROUTED)
+		payload = p_routed;
+	else
+		payload = p_bridged;
+	ni.media &= 0xffff;	/* strip flags */
+	ni.ifname[sizeof(ni.ifname) - 1] = '\0';	/* user copy may be unterminated */
+	if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500)
+		return -EINVAL;
+
+	netdev = alloc_netdev(sizeof(struct br2684_dev),
+			      ni.ifname[0] ? ni.ifname : "nas%d",
+			      (payload == p_routed) ?
+			      br2684_setup_routed : br2684_setup);
+	if (!netdev)
+		return -ENOMEM;
+
+	brdev = BRPRIV(netdev);
+
+	pr_debug("registered netdev %s\n", netdev->name);
+	/* open, stop, do_ioctl ? */
+	err = register_netdev(netdev);
+	if (err < 0) {
+		pr_err("register_netdev failed\n");
+		free_netdev(netdev);
+		return err;
+	}
+
+	write_lock_irq(&devs_lock);
+
+	brdev->payload = payload;
+
+	if (list_empty(&br2684_devs)) {
+		/* 1st br2684 device */
+		brdev->number = 1;
+	} else
+		brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
+
+	list_add_tail(&brdev->br2684_devs, &br2684_devs);
+	write_unlock_irq(&devs_lock);
+	return 0;
+}
+
+/*
+ * This handles ioctls actually performed on our vcc - we must return
+ * -ENOIOCTLCMD for any unrecognized ioctl
+ */
+static int br2684_ioctl(struct socket *sock, unsigned int cmd,
+			unsigned long arg)
+{
+	struct atm_vcc *atmvcc = ATM_SD(sock);
+	void __user *argp = (void __user *)arg;
+	atm_backend_t b;
+
+	int err;
+	switch (cmd) {
+	case ATM_SETBACKEND:
+	case ATM_NEWBACKENDIF:
+		err = get_user(b, (atm_backend_t __user *) argp);	/* backend id leads the arg */
+		if (err)
+			return -EFAULT;
+		if (b != ATM_BACKEND_BR2684)	/* meant for some other backend */
+			return -ENOIOCTLCMD;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (cmd == ATM_SETBACKEND)
+			return br2684_regvcc(atmvcc, argp);
+		else
+			return br2684_create(argp);
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+	case BR2684_SETFILT:
+		if (atmvcc->push != br2684_push)	/* VCC is not attached to br2684 */
+			return -ENOIOCTLCMD;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		err = br2684_setfilt(atmvcc, argp);
+
+		return err;
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+	}
+	return -ENOIOCTLCMD;
+}
+/* Registered in br2684_init(), deregistered in br2684_exit(). */
+static struct atm_ioctl br2684_ioctl_ops = {
+	.owner = THIS_MODULE,
+	.ioctl = br2684_ioctl,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *br2684_seq_start(struct seq_file *seq, loff_t * pos)
+	__acquires(devs_lock)
+{
+	read_lock(&devs_lock);	/* released again in br2684_seq_stop() */
+	return seq_list_start(&br2684_devs, *pos);
+}
+/* seq_file iterator step over the global br2684_devs list. */
+static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t * pos)
+{
+	return seq_list_next(v, &br2684_devs, pos);
+}
+/* Ends the walk: drops the read lock taken in br2684_seq_start(). */
+static void br2684_seq_stop(struct seq_file *seq, void *v)
+	__releases(devs_lock)
+{
+	read_unlock(&devs_lock);
+}
+/* Print one device line, then one line (plus optional filter) per attached VCC. */
+static int br2684_seq_show(struct seq_file *seq, void *v)
+{
+	const struct br2684_dev *brdev = list_entry(v, struct br2684_dev,
+						    br2684_devs);
+	const struct net_device *net_dev = brdev->net_dev;
+	const struct br2684_vcc *brvcc;
+
+	seq_printf(seq, "dev %.16s: num=%d, mac=%pM (%s)\n",
+		   net_dev->name,
+		   brdev->number,
+		   net_dev->dev_addr,
+		   brdev->mac_was_set ? "set" : "auto");
+
+	list_for_each_entry(brvcc, &brdev->brvccs, brvccs) {
+		seq_printf(seq, "  vcc %d.%d.%d: encaps=%s payload=%s"
+			   ", failed copies %u/%u"
+			   "\n", brvcc->atmvcc->dev->number,
+			   brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
+			   (brvcc->encaps == e_llc) ? "LLC" : "VC",
+			   (brdev->payload == p_bridged) ? "bridged" : "routed",
+			   brvcc->copies_failed, brvcc->copies_needed);
+#ifdef CONFIG_ATM_BR2684_IPFILTER
+#define b1(var, byte)	((u8 *) &brvcc->filter.var)[byte]
+#define bs(var)		b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
+		if (brvcc->filter.netmask != 0)	/* zero netmask: no filter installed */
+			seq_printf(seq, "    filter=%d.%d.%d.%d/"
+				   "%d.%d.%d.%d\n", bs(prefix), bs(netmask));
+#undef bs
+#undef b1
+#endif /* CONFIG_ATM_BR2684_IPFILTER */
+	}
+	return 0;
+}
+/* seq_file iterator handed to seq_open() by br2684_proc_open(). */
+static const struct seq_operations br2684_seq_ops = {
+	.start = br2684_seq_start,
+	.next = br2684_seq_next,
+	.stop = br2684_seq_stop,
+	.show = br2684_seq_show,
+};
+/* open() handler of the proc entry: attach the seq_file iterator. */
+static int br2684_proc_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &br2684_seq_ops);
+}
+/* File operations of the "br2684" proc entry created in br2684_init(). */
+static const struct file_operations br2684_proc_ops = {
+	.owner = THIS_MODULE,
+	.open = br2684_proc_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+extern struct proc_dir_entry *atm_proc_root;	/* from proc.c */
+#endif /* CONFIG_PROC_FS */
+/* Module init: create the proc entry, then hook the ioctl and device notifier. */
+static int __init br2684_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *p;
+	p = proc_create("br2684", 0, atm_proc_root, &br2684_proc_ops);
+	if (p == NULL)
+		return -ENOMEM;
+#endif
+	register_atm_ioctl(&br2684_ioctl_ops);
+	register_atmdevice_notifier(&atm_dev_notifier);	/* result is not checked */
+	return 0;
+}
+/* Module exit: unhook everything, then close all VCCs and free all devices. */
+static void __exit br2684_exit(void)
+{
+	struct net_device *net_dev;
+	struct br2684_dev *brdev;
+	struct br2684_vcc *brvcc;
+	deregister_atm_ioctl(&br2684_ioctl_ops);
+
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("br2684", atm_proc_root);
+#endif
+
+
+	unregister_atmdevice_notifier(&atm_dev_notifier);
+
+	while (!list_empty(&br2684_devs)) {
+		net_dev = list_entry_brdev(br2684_devs.next);
+		brdev = BRPRIV(net_dev);
+		while (!list_empty(&brdev->brvccs)) {
+			brvcc = list_entry_brvcc(brdev->brvccs.next);
+			br2684_close_vcc(brvcc);	/* unlinks and frees brvcc */
+		}
+
+		list_del(&brdev->br2684_devs);	/* devs_lock is not taken here */
+		unregister_netdev(net_dev);
+		free_netdev(net_dev);
+	}
+}
+
+module_init(br2684_init);
+module_exit(br2684_exit);
+
+MODULE_AUTHOR("Marcell GAL");
+MODULE_DESCRIPTION("RFC2684 bridged protocols over ATM/AAL5");
+MODULE_LICENSE("GPL");
diff --git a/net/atm/clip.c b/net/atm/clip.c
new file mode 100644
index 00000000..5889074e
--- /dev/null
+++ b/net/atm/clip.c
@@ -0,0 +1,1009 @@
+/* net/atm/clip.c - RFC1577 Classical IP over ATM */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h> /* for UINT_MAX */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/if_arp.h> /* for some manifest constants */
+#include <linux/notifier.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmclip.h>
+#include <linux/atmarp.h>
+#include <linux/capability.h>
+#include <linux/ip.h> /* for net/route.h */
+#include <linux/in.h> /* for struct sockaddr_in */
+#include <linux/if.h> /* for IFF_UP */
+#include <linux/inetdevice.h>
+#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <net/route.h> /* for struct rtable and routing */
+#include <net/icmp.h> /* icmp_send */
+#include <linux/param.h> /* for HZ */
+#include <linux/uaccess.h>
+#include <asm/byteorder.h> /* for htons etc. */
+#include <asm/system.h> /* save/restore_flags */
+#include <asm/atomic.h>
+
+#include "common.h"
+#include "resources.h"
+#include <net/atmclip.h>
+
+static struct net_device *clip_devs;	/* list head, chained via PRIV(dev)->next */
+static struct atm_vcc *atmarpd;	/* control VCC set by atm_init_atmarp(), else NULL */
+static struct neigh_table clip_tbl;	/* tentative definition; initialised further down */
+static struct timer_list idle_timer;	/* runs idle_timer_check() */
+
+static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip)
+{
+	struct atmarp_ctrl *msg;
+	struct sk_buff *skb;
+	struct sock *sk;
+
+	pr_debug("(%d)\n", type);
+	if (atmarpd == NULL)
+		return -EUNATCH;
+	skb = alloc_skb(sizeof(*msg), GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	msg = (struct atmarp_ctrl *)skb_put(skb, sizeof(*msg));
+	msg->type = type;
+	msg->itf_num = itf;
+	msg->ip = ip;
+	/* Charge the buffer to the atmarpd VCC, queue it and notify the socket. */
+	sk = sk_atm(atmarpd);
+	atm_force_charge(atmarpd, skb->truesize);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, skb->len);
+	return 0;
+}
+
+static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry)
+{
+	pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh);
+	clip_vcc->entry = entry;
+	clip_vcc->xoff = 0;	/* @@@ may overrun buffer by one packet */
+	clip_vcc->next = entry->vccs;	/* push onto the head of the entry's VCC list */
+	entry->vccs = clip_vcc;
+	entry->neigh->used = jiffies;	/* refresh the neighbour's "used" stamp */
+}
+
+static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
+{
+	struct atmarp_entry *entry = clip_vcc->entry;
+	struct clip_vcc **walk;
+	/* Detach clip_vcc from its ATMARP entry; expire the entry if no VCC is left. */
+	if (!entry) {
+		pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
+		return;
+	}
+	netif_tx_lock_bh(entry->neigh->dev);	/* block clip_start_xmit() */
+	entry->neigh->used = jiffies;
+	for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
+		if (*walk == clip_vcc) {
+			int error;
+			/* Unlink it, and wake the queue if this VCC was marked XOFF. */
+			*walk = clip_vcc->next;	/* atomic */
+			clip_vcc->entry = NULL;
+			if (clip_vcc->xoff)
+				netif_wake_queue(entry->neigh->dev);
+			if (entry->vccs)
+				goto out;	/* other VCCs still serve this entry */
+			entry->expires = jiffies - 1;
+			/* force resolution or expiration */
+			error = neigh_update(entry->neigh, NULL, NUD_NONE,
+					     NEIGH_UPDATE_F_ADMIN);
+			if (error)
+				pr_crit("neigh_update failed with %d\n", error);
+			goto out;
+		}
+	pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);	/* not on the list */
+out:
+	netif_tx_unlock_bh(entry->neigh->dev);
+}
+
+/* Table-walk callback; n->lock is held. Returns 1 for an expired neighbour, else 0. */
+static int neigh_check_cb(struct neighbour *n)
+{
+	struct atmarp_entry *entry = NEIGH2ENTRY(n);
+	struct clip_vcc *cv;
+	/* Request release of every VCC that has been idle longer than its timeout. */
+	for (cv = entry->vccs; cv; cv = cv->next) {
+		unsigned long exp = cv->last_use + cv->idle_timeout;
+
+		if (cv->idle_timeout && time_after(jiffies, exp)) {
+			pr_debug("releasing vcc %p->%p of entry %p\n",
+				 cv, cv->vcc, entry);
+			vcc_release_async(cv->vcc, -ETIMEDOUT);
+		}
+	}
+	/* Keep entries that still have VCCs or whose expiry time is not reached. */
+	if (entry->vccs || time_before(jiffies, entry->expires))
+		return 0;
+	/* More than one reference left: only drop the packets queued on n. */
+	if (atomic_read(&n->refcnt) > 1) {
+		struct sk_buff *skb;
+
+		pr_debug("destruction postponed with ref %d\n",
+			 atomic_read(&n->refcnt));
+
+		while ((skb = skb_dequeue(&n->arp_queue)) != NULL)
+			dev_kfree_skb(skb);
+
+		return 0;
+	}
+
+	pr_debug("expired neigh %p\n", n);
+	return 1;
+}
+
+static void idle_timer_check(unsigned long dummy)	/* idle_timer handler; dummy unused */
+{
+	write_lock(&clip_tbl.lock);
+	__neigh_for_each_release(&clip_tbl, neigh_check_cb);
+	mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);	/* re-arm */
+	write_unlock(&clip_tbl.lock);
+}
+
+static int clip_arp_rcv(struct sk_buff *skb)
+{
+	struct atm_vcc *vcc = ATM_SKB(skb)->vcc;
+
+	pr_debug("\n");
+	/* Pass the frame to the VCC's saved push handler if it can be charged. */
+	if (vcc && atm_charge(vcc, skb->truesize)) {
+		pr_debug("pushing to %p\n", vcc);
+		pr_debug("using %p\n", CLIP_VCC(vcc)->old_push);
+		CLIP_VCC(vcc)->old_push(vcc, skb);
+	} else {
+		dev_kfree_skb_any(skb);
+	}
+	return 0;
+}
+
+static const unsigned char llc_oui[] = {	/* prefix matched on receive, copied on transmit */
+	0xaa,	/* DSAP: non-ISO */
+	0xaa,	/* SSAP: non-ISO */
+	0x03,	/* Ctrl: Unnumbered Information Command PDU */
+	0x00,	/* OUI: EtherType */
+	0x00,
+	0x00
+};
+
+static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
+	/* Receive handler installed by clip_mkip(); a NULL skb means the VCC goes away. */
+	pr_debug("\n");
+	if (!skb) {
+		pr_debug("removing VCC %p\n", clip_vcc);
+		if (clip_vcc->entry)
+			unlink_clip_vcc(clip_vcc);
+		clip_vcc->old_push(vcc, NULL);	/* pass on the bad news */
+		kfree(clip_vcc);
+		return;
+	}
+	atm_return(vcc, skb->truesize);
+	skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs;
+	/* clip_vcc->entry == NULL if we don't have an IP address yet */
+	if (!skb->dev) {
+		dev_kfree_skb_any(skb);
+		return;
+	}
+	ATM_SKB(skb)->vcc = vcc;
+	skb_reset_mac_header(skb);
+	if (!clip_vcc->encap ||
+	    skb->len < RFC1483LLC_LEN ||
+	    memcmp(skb->data, llc_oui, sizeof(llc_oui)))
+		skb->protocol = htons(ETH_P_IP);	/* no LLC prefix: take it as IP */
+	else {
+		skb->protocol = ((__be16 *)skb->data)[3];	/* bytes 6-7 of the header */
+		skb_pull(skb, RFC1483LLC_LEN);
+		if (skb->protocol == htons(ETH_P_ARP)) {
+			skb->dev->stats.rx_packets++;
+			skb->dev->stats.rx_bytes += skb->len;
+			clip_arp_rcv(skb);	/* ARP goes to the saved push handler */
+			return;
+		}
+	}
+	clip_vcc->last_use = jiffies;
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+	memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+	netif_rx(skb);
+}
+
+/*
+ * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that
+ * clip_pop is atomic with respect to the critical section in clip_start_xmit.
+ */
+
+static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct clip_vcc *clip_vcc = CLIP_VCC(vcc);
+	struct net_device *dev = skb->dev;	/* read before old_pop() gets the skb */
+	int old;
+	unsigned long flags;
+	/* Chain to the saved pop handler, then lift XOFF if the VCC may send again. */
+	pr_debug("(vcc %p)\n", vcc);
+	clip_vcc->old_pop(vcc, skb);
+	/* skb->dev == NULL in outbound ARP packets */
+	if (!dev)
+		return;
+	spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags);
+	if (atm_may_send(vcc, 0)) {
+		old = xchg(&clip_vcc->xoff, 0);
+		if (old)
+			netif_wake_queue(dev);	/* only when XOFF was actually set */
+	}
+	spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags);
+}
+
+static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	pr_debug("(neigh %p, skb %p)\n", neigh, skb);
+	to_atmarpd(act_need, PRIV(neigh->dev)->number, NEIGH2ENTRY(neigh)->ip);	/* result ignored */
+}
+
+static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb)
+{
+#ifndef CONFIG_ATM_CLIP_NO_ICMP
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+#endif
+	kfree_skb(skb);	/* the skb is freed in either configuration */
+}
+
+static const struct neigh_ops clip_neigh_ops = {	/* installed by clip_constructor() */
+	.family =		AF_INET,
+	.solicit =		clip_neigh_solicit,
+	.error_report =		clip_neigh_error,
+	.output =		dev_queue_xmit,
+	.connected_output =	dev_queue_xmit,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+static int clip_constructor(struct neighbour *neigh)
+{
+	struct atmarp_entry *entry = NEIGH2ENTRY(neigh);
+	struct net_device *dev = neigh->dev;
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
+	/* clip_tbl.constructor: set up a new neighbour and its ATMARP entry. */
+	pr_debug("(neigh %p, entry %p)\n", neigh, entry);
+	neigh->type = inet_addr_type(&init_net, entry->ip);
+	if (neigh->type != RTN_UNICAST)
+		return -EINVAL;	/* only unicast addresses are accepted */
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	/* Replace the current parms with a clone of the device's ARP parms. */
+	parms = in_dev->arp_parms;
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
+	rcu_read_unlock();
+
+	neigh->ops = &clip_neigh_ops;
+	neigh->output = neigh->nud_state & NUD_VALID ?
+	    neigh->ops->connected_output : neigh->ops->output;
+	entry->neigh = neigh;
+	entry->vccs = NULL;
+	entry->expires = jiffies - 1;	/* in the past, so clip_start_xmit() asks atmarpd at once */
+	return 0;
+}
+
+static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
+{
+	return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd);	/* key is read as one 32-bit word */
+}
+
+static struct neigh_table clip_tbl = {
+	.family 	= AF_INET,
+	.entry_size 	= sizeof(struct neighbour)+sizeof(struct atmarp_entry),
+	.key_len 	= 4,	/* matches the 32-bit key read by clip_hash() */
+	.hash 		= clip_hash,
+	.constructor 	= clip_constructor,
+	.id 		= "clip_arp_cache",
+
+	/* parameters are copied from ARP ... */
+	.parms = {
+		.tbl 			= &clip_tbl,
+		.base_reachable_time 	= 30 * HZ,
+		.retrans_time 		= 1 * HZ,
+		.gc_staletime 		= 60 * HZ,
+		.reachable_time 	= 30 * HZ,
+		.delay_probe_time 	= 5 * HZ,
+		.queue_len 		= 3,
+		.ucast_probes 		= 3,
+		.mcast_probes 		= 3,
+		.anycast_delay 		= 1 * HZ,
+		.proxy_delay 		= (8 * HZ) / 10,
+		.proxy_qlen 		= 64,
+		.locktime 		= 1 * HZ,
+	},
+	.gc_interval 	= 30 * HZ,
+	.gc_thresh1 	= 128,
+	.gc_thresh2 	= 512,
+	.gc_thresh3 	= 1024,
+};
+
+/* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */
+
+/*
+ * Stale: this describes the "resolve" flag of clip_find_neighbour(), which is
+ * not defined in this file. 0 and 1 have the usual meaning, but -1 means to
+ * allocate the neighbour entry but not to ask atmarpd for resolution, and not
+ * to increment the usage count. This is used to create entries in clip_setentry.
+ */
+
+static int clip_encap(struct atm_vcc *vcc, int mode)
+{
+	CLIP_VCC(vcc)->encap = mode;	/* tested as a boolean in clip_push()/clip_start_xmit() */
+	return 0;
+}
+
+/* ndo_start_xmit: send skb on the first VCC of its neighbour's ATMARP entry. */
+static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
+				   struct net_device *dev)
+{
+	struct clip_priv *clip_priv = PRIV(dev);
+	struct dst_entry *dst = skb_dst(skb);
+	struct atmarp_entry *entry;
+	struct neighbour *n;
+	struct atm_vcc *vcc;
+	int old;
+	unsigned long flags;
+
+	pr_debug("(skb %p)\n", skb);
+	if (!dst) {
+		pr_err("skb_dst(skb) == NULL\n");
+		dev_kfree_skb(skb);
+		dev->stats.tx_dropped++;
+		return NETDEV_TX_OK;
+	}
+	n = dst_get_neighbour(dst);
+	if (!n) {
+		pr_err("NO NEIGHBOUR !\n");
+		dev_kfree_skb(skb);
+		dev->stats.tx_dropped++;
+		return NETDEV_TX_OK;
+	}
+	entry = NEIGH2ENTRY(n);
+	if (!entry->vccs) {
+		/* No VCC yet: ask atmarpd (once per retry delay), park the skb. */
+		if (time_after(jiffies, entry->expires)) {
+			/* should be resolved */
+			entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ;
+			to_atmarpd(act_need, PRIV(dev)->number, entry->ip);
+		}
+		if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS)
+			skb_queue_tail(&entry->neigh->arp_queue, skb);
+		else {
+			dev_kfree_skb(skb);
+			dev->stats.tx_dropped++;
+		}
+		return NETDEV_TX_OK;
+	}
+	pr_debug("neigh %p, vccs %p\n", entry, entry->vccs);
+	ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc;
+	pr_debug("using neighbour %p, vcc %p\n", n, vcc);
+	if (entry->vccs->encap) {
+		void *here;
+
+		here = skb_push(skb, RFC1483LLC_LEN);
+		memcpy(here, llc_oui, sizeof(llc_oui));
+		((__be16 *) here)[3] = skb->protocol;
+	}
+	atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+	ATM_SKB(skb)->atm_options = vcc->atm_options;
+	entry->vccs->last_use = jiffies;
+	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev);
+	old = xchg(&entry->vccs->xoff, 1);	/* assume XOFF ... */
+	if (old) {
+		/*
+		 * The skb is not sent, and NETDEV_TX_OK tells the caller it
+		 * was consumed: undo the charge above and drop it here.
+		 */
+		pr_warning("XOFF->XOFF transition\n");
+		atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+		dev_kfree_skb(skb);
+		dev->stats.tx_dropped++;
+		return NETDEV_TX_OK;
+	}
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+	vcc->send(vcc, skb);
+	if (atm_may_send(vcc, 0)) {
+		entry->vccs->xoff = 0;
+		return NETDEV_TX_OK;
+	}
+	spin_lock_irqsave(&clip_priv->xoff_lock, flags);
+	netif_stop_queue(dev);	/* XOFF -> throttle immediately */
+	barrier();
+	if (!entry->vccs->xoff)
+		netif_start_queue(dev);
+	/* Oh, we just raced with clip_pop. netif_start_queue should be
+	   good enough, because nothing should really be asleep because
+	   of the brief netif_stop_queue. If this isn't true or if it
+	   changes, use netif_wake_queue instead. */
+	spin_unlock_irqrestore(&clip_priv->xoff_lock, flags);
+	return NETDEV_TX_OK;
+}
+
+/* ATMARP_MKIP: turn vcc into a CLIP VCC; timeout is the idle timeout in seconds. */
+static int clip_mkip(struct atm_vcc *vcc, int timeout)
+{
+	struct sk_buff_head *rq, queue;
+	struct clip_vcc *clip_vcc;
+	struct sk_buff *skb, *tmp;
+	unsigned long flags;
+
+	if (!vcc->push)
+		return -EBADFD;
+	if (vcc->push == clip_push)	/* already CLIP: old_push would recurse */
+		return -EINVAL;
+	clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL);
+	if (!clip_vcc)
+		return -ENOMEM;
+	pr_debug("%p vcc %p\n", clip_vcc, vcc);
+	clip_vcc->vcc = vcc;
+	vcc->user_back = clip_vcc;
+	set_bit(ATM_VF_IS_CLIP, &vcc->flags);
+	clip_vcc->entry = NULL;
+	clip_vcc->xoff = 0;
+	clip_vcc->encap = 1;
+	clip_vcc->last_use = jiffies;
+	clip_vcc->idle_timeout = timeout * HZ;
+	clip_vcc->old_push = vcc->push;	/* saved so clip_push()/clip_pop() can chain */
+	clip_vcc->old_pop = vcc->pop;
+	vcc->push = clip_push;
+	vcc->pop = clip_pop;
+	__skb_queue_head_init(&queue);
+	rq = &sk_atm(vcc)->sk_receive_queue;
+	spin_lock_irqsave(&rq->lock, flags);
+	skb_queue_splice_init(rq, &queue);
+	spin_unlock_irqrestore(&rq->lock, flags);
+	/* re-process everything received between connection setup and MKIP */
+	skb_queue_walk_safe(&queue, skb, tmp) {
+		if (!clip_devs) {
+			atm_return(vcc, skb->truesize);
+			kfree_skb(skb);
+		} else {
+			struct net_device *dev = skb->dev;	/* NOTE(review): read before clip_push() assigns skb->dev - confirm */
+			unsigned int len = skb->len;
+
+			skb_get(skb);	/* keep skb valid across clip_push() */
+			clip_push(vcc, skb);
+			dev->stats.rx_packets--;
+			dev->stats.rx_bytes -= len;
+			kfree_skb(skb);
+		}
+	}
+	return 0;
+}
+
+static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
+{
+	struct neighbour *neigh;
+	struct atmarp_entry *entry;
+	int error;
+	struct clip_vcc *clip_vcc;
+	struct rtable *rt;
+	/* ATMARP_SETENTRY: bind vcc to the ATMARP entry for ip; ip == 0 unbinds it. */
+	if (vcc->push != clip_push) {
+		pr_warning("non-CLIP VCC\n");
+		return -EBADF;
+	}
+	clip_vcc = CLIP_VCC(vcc);
+	if (!ip) {
+		if (!clip_vcc->entry) {
+			pr_err("hiding hidden ATMARP entry\n");
+			return 0;
+		}
+		pr_debug("remove\n");
+		unlink_clip_vcc(clip_vcc);
+		return 0;
+	}
+	rt = ip_route_output(&init_net, ip, 0, 1, 0);	/* only rt->dst.dev is used below */
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+	neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1);	/* final 1: presumably "create if missing" - confirm */
+	ip_rt_put(rt);
+	if (!neigh)
+		return -ENOMEM;
+	entry = NEIGH2ENTRY(neigh);
+	if (entry != clip_vcc->entry) {
+		if (!clip_vcc->entry)
+			pr_debug("add\n");
+		else {
+			pr_debug("update\n");
+			unlink_clip_vcc(clip_vcc);	/* leave the old entry first */
+		}
+		link_vcc(clip_vcc, entry);
+	}
+	error = neigh_update(neigh, llc_oui, NUD_PERMANENT,
+			     NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN);
+	neigh_release(neigh);
+	return error;
+}
+
+static const struct net_device_ops clip_netdev_ops = {	/* also identifies CLIP devices in clip_device_event() */
+	.ndo_start_xmit = clip_start_xmit,
+};
+
+static void clip_setup(struct net_device *dev)	/* setup callback passed to alloc_netdev() */
+{
+	dev->netdev_ops = &clip_netdev_ops;
+	dev->type = ARPHRD_ATM;
+	dev->hard_header_len = RFC1483LLC_LEN;
+	dev->mtu = RFC1626_MTU;
+	dev->tx_queue_len = 100;	/* "normal" queue (packets) */
+	/* When using a "real" qdisc, the qdisc determines the queue */
+	/* length. tx_queue_len is only used for the default case, */
+	/* without any more elaborate queuing. 100 is a reasonable */
+	/* compromise between decent burst-tolerance and protection */
+	/* against memory hogs. */
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;	/* clip_start_xmit() reads skb_dst() */
+}
+
+static int clip_create(int number)
+{
+	struct clip_priv *priv;
+	struct net_device *dev;
+	int err;
+
+	if (number == -1) {	/* auto-assign: highest number in use + 1 */
+		number = 0;
+		for (dev = clip_devs; dev; dev = PRIV(dev)->next)
+			if (PRIV(dev)->number >= number)
+				number = PRIV(dev)->number + 1;
+	} else {
+		for (dev = clip_devs; dev; dev = PRIV(dev)->next)
+			if (PRIV(dev)->number == number)
+				return -EEXIST;
+	}
+	dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup);
+	if (!dev)
+		return -ENOMEM;
+	priv = PRIV(dev);
+	priv->number = number;
+	spin_lock_init(&priv->xoff_lock);
+	sprintf(dev->name, "atm%d", number);
+	err = register_netdev(dev);
+	if (err) {
+		free_netdev(dev);
+		return err;
+	}
+	priv->next = clip_devs;	/* push onto the head of the device list */
+	clip_devs = dev;
+	pr_debug("registered (net:%s)\n", dev->name);
+	return number;
+}
+
+static int clip_device_event(struct notifier_block *this, unsigned long event,
+			     void *arg)
+{
+	struct net_device *dev = arg;
+	/* Netdevice notifier: relays state changes of CLIP devices to atmarpd. */
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+	/* This runs before the CLIP-device filter below, i.e. for every device. */
+	if (event == NETDEV_UNREGISTER) {
+		neigh_ifdown(&clip_tbl, dev);
+		return NOTIFY_DONE;
+	}
+
+	/* ignore non-CLIP devices */
+	if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		pr_debug("NETDEV_UP\n");
+		to_atmarpd(act_up, PRIV(dev)->number, 0);
+		break;
+	case NETDEV_GOING_DOWN:
+		pr_debug("NETDEV_DOWN\n");
+		to_atmarpd(act_down, PRIV(dev)->number, 0);
+		break;
+	case NETDEV_CHANGE:
+	case NETDEV_CHANGEMTU:
+		pr_debug("NETDEV_CHANGE*\n");
+		to_atmarpd(act_change, PRIV(dev)->number, 0);
+		break;
+	}
+	return NOTIFY_DONE;	/* all other events are ignored */
+}
+
+static int clip_inet_event(struct notifier_block *this, unsigned long event,
+			   void *ifa)
+{
+	struct in_device *in_dev;
+
+	in_dev = ((struct in_ifaddr *)ifa)->ifa_dev;
+	/*
+	 * Transitions are of the down-change-up type, so it's sufficient to
+	 * handle the change on up.
+	 */
+	if (event != NETDEV_UP)
+		return NOTIFY_DONE;
+	return clip_device_event(this, NETDEV_CHANGE, in_dev->dev);	/* reported as a device change */
+}
+
+static struct notifier_block clip_dev_notifier = {	/* registered in atm_clip_init() */
+	.notifier_call = clip_device_event,
+};
+
+
+
+static struct notifier_block clip_inet_notifier = {	/* registered in atm_clip_init() */
+	.notifier_call = clip_inet_event,
+};
+
+
+
+static void atmarpd_close(struct atm_vcc *vcc)
+{
+	pr_debug("\n");
+	/* Detach the daemon and discard the messages still queued for it. */
+	rtnl_lock();
+	atmarpd = NULL;
+	skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
+	rtnl_unlock();
+
+	pr_debug("(done)\n");
+	module_put(THIS_MODULE);	/* pairs with __module_get() in clip_ioctl() */
+}
+
+static struct atmdev_ops atmarpd_dev_ops = {	/* ops of the atmarpd_dev pseudo device */
+	.close = atmarpd_close
+};
+
+
+static struct atm_dev atmarpd_dev = {	/* assigned to vcc->dev in atm_init_atmarp() */
+	.ops =			&atmarpd_dev_ops,
+	.type =			"arpd",
+	.number = 		999,
+	.lock =			__SPIN_LOCK_UNLOCKED(atmarpd_dev.lock)
+};
+
+
+static int atm_init_atmarp(struct atm_vcc *vcc)	/* ATMARPD_CTRL: make vcc the atmarpd channel */
+{
+	rtnl_lock();
+	if (atmarpd) {	/* only one control VCC at a time */
+		rtnl_unlock();
+		return -EADDRINUSE;
+	}
+	/* Arm the periodic idle check, see idle_timer_check(). */
+	mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
+	/* From here on to_atmarpd() queues its messages on this VCC. */
+	atmarpd = vcc;
+	set_bit(ATM_VF_META, &vcc->flags);
+	set_bit(ATM_VF_READY, &vcc->flags);
+	    /* allow replies and avoid getting closed if signaling dies */
+	vcc->dev = &atmarpd_dev;
+	vcc_insert_socket(sk_atm(vcc));
+	vcc->push = NULL;
+	vcc->pop = NULL; /* crash */
+	vcc->push_oam = NULL; /* crash */
+	rtnl_unlock();
+	return 0;
+}
+
+static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int err;
+
+	/* Commands that are not listed here are not handled by this function. */
+	switch (cmd) {
+	case SIOCMKCLIP:
+	case ATMARPD_CTRL:
+	case ATMARP_MKIP:
+	case ATMARP_SETENTRY:
+	case ATMARP_ENCAP:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	/* Every CLIP command requires CAP_NET_ADMIN. */
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case SIOCMKCLIP:
+		return clip_create(arg);
+	case ATMARP_MKIP:
+		return clip_mkip(vcc, arg);
+	case ATMARP_SETENTRY:
+		return clip_setentry(vcc, (__force __be32)arg);
+	case ATMARP_ENCAP:
+		return clip_encap(vcc, arg);
+	case ATMARPD_CTRL:
+		break;
+	}
+	/* ATMARPD_CTRL: this socket becomes the atmarpd control channel. */
+	err = atm_init_atmarp(vcc);
+	if (err)
+		return err;
+	/* Pin the module; atmarpd_close() drops this reference again. */
+	sock->state = SS_CONNECTED;
+	__module_get(THIS_MODULE);
+	return 0;
+}
+
+static struct atm_ioctl clip_ioctl_ops = {	/* passed to register_atm_ioctl() at init */
+	.owner = THIS_MODULE,
+	.ioctl = clip_ioctl,
+};
+
+#ifdef CONFIG_PROC_FS
+
+static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr)
+{
+	static int code[] = { 1, 2, 10, 6, 1, 0 };
+	static int e164[] = { 1, 8, 4, 6, 1, 0 };
+	unsigned char *prv = addr->sas_addr.prv;
+	int *group;
+	int left;
+
+	if (*addr->sas_addr.pub) {
+		/* Public part first; '+' joins it to a private part. */
+		seq_printf(seq, "%s", addr->sas_addr.pub);
+		if (*prv)
+			seq_putc(seq, '+');
+	} else if (!*prv) {
+		seq_printf(seq, "%s", "(none)");
+		return;
+	}
+	if (!*prv)
+		return;
+	/* Hex-dump the private part in dot-separated groups (0 ends the table). */
+	for (group = *prv == ATM_AFI_E164 ? e164 : code; *group; group++) {
+		for (left = *group; left; left--)
+			seq_printf(seq, "%02X", *prv++);
+		if (group[1])
+			seq_putc(seq, '.');
+	}
+}
+
+/* This means the neighbour entry has no attached VCC objects. */
+#define SEQ_NO_VCC_TOKEN	((void *) 2)
+
+static void atmarp_info(struct seq_file *seq, struct net_device *dev,
+			struct atmarp_entry *entry, struct clip_vcc *clip_vcc)
+{
+	unsigned long exp;
+	char buf[17];
+	int svc, llc, off;
+	/* Print one table line; clip_vcc may be SEQ_NO_VCC_TOKEN (entry without VCC). */
+	svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) ||
+	       (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC));
+
+	llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap);
+
+	if (clip_vcc == SEQ_NO_VCC_TOKEN)
+		exp = entry->neigh->used;
+	else
+		exp = clip_vcc->last_use;
+
+	exp = (jiffies - exp) / HZ;	/* elapsed time in seconds */
+
+	seq_printf(seq, "%-6s%-4s%-4s%5ld ",
+		   dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp);
+	/* IP address, padded with spaces to 16 characters. */
+	off = scnprintf(buf, sizeof(buf) - 1, "%pI4",
+			&entry->ip);
+	while (off < 16)
+		buf[off++] = ' ';
+	buf[off] = '\0';
+	seq_printf(seq, "%s", buf);
+
+	if (clip_vcc == SEQ_NO_VCC_TOKEN) {
+		if (time_before(jiffies, entry->expires))
+			seq_printf(seq, "(resolving)\n");
+		else
+			seq_printf(seq, "(expired, ref %d)\n",
+				   atomic_read(&entry->neigh->refcnt));
+	} else if (!svc) {
+		seq_printf(seq, "%d.%d.%d\n",
+			   clip_vcc->vcc->dev->number,
+			   clip_vcc->vcc->vpi, clip_vcc->vcc->vci);
+	} else {
+		svc_addr(seq, &clip_vcc->vcc->remote);
+		seq_putc(seq, '\n');
+	}
+}
+
+struct clip_seq_state {
+	/* This member must be first. */
+	struct neigh_seq_state ns;
+
+	/* Local to clip specific iteration. */
+	struct clip_vcc *vcc;	/* cursor kept by clip_seq_vcc_walk(); may be SEQ_NO_VCC_TOKEN */
+};
+
+static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e,
+					  struct clip_vcc *curr)
+{
+	struct clip_vcc *head;
+
+	/* The token was already reported for a VCC-less entry: done. */
+	if (curr == SEQ_NO_VCC_TOKEN)
+		return NULL;
+	/* In the middle of the list: just advance. */
+	if (curr)
+		return curr->next;
+
+	/* First step: list head, or the token when the list is empty. */
+	head = e->vccs;
+	return head ? head : SEQ_NO_VCC_TOKEN;
+}
+
+static void *clip_seq_vcc_walk(struct clip_seq_state *state,
+			       struct atmarp_entry *e, loff_t * pos)
+{
+	struct clip_vcc *cur = clip_seq_next_vcc(e, state->vcc);
+
+	/* With a position given, step on while *pos is non-zero, counting it down. */
+	if (pos != NULL) {
+		while (cur && *pos) {
+			cur = clip_seq_next_vcc(e, cur);
+			if (!cur)
+				break;
+			--(*pos);
+		}
+	}
+	state->vcc = cur;
+
+	return cur;
+}
+
+static void *clip_seq_sub_iter(struct neigh_seq_state *_state,
+			       struct neighbour *n, loff_t * pos)
+{
+	/* The cast relies on ns being the first member of clip_seq_state. */
+	return clip_seq_vcc_walk((struct clip_seq_state *)_state,
+				 NEIGH2ENTRY(n), pos);
+}
+
+static void *clip_seq_start(struct seq_file *seq, loff_t * pos)
+{
+	struct clip_seq_state *state = seq->private;
+	state->ns.neigh_sub_iter = clip_seq_sub_iter;	/* per-neighbour step over the VCC list */
+	return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY);
+}
+
+static int clip_seq_show(struct seq_file *seq, void *v)
+{
+	static char atm_arp_banner[] =
+	    "IPitf TypeEncp Idle IP address      ATM address\n";
+	struct clip_seq_state *state = seq->private;
+	struct neighbour *n = v;
+
+	/* The start token stands for the header line, not for a neighbour. */
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, atm_arp_banner);
+		return 0;
+	}
+
+	atmarp_info(seq, n->dev, NEIGH2ENTRY(n), state->vcc);
+	return 0;
+}
+
+static const struct seq_operations arp_seq_ops = {	/* used by arp_seq_open() */
+	.start	= clip_seq_start,
+	.next	= neigh_seq_next,
+	.stop	= neigh_seq_stop,
+	.show	= clip_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &arp_seq_ops,
+			    sizeof(struct clip_seq_state));	/* size of the state read from seq->private */
+}
+
+static const struct file_operations arp_seq_fops = {	/* the "arp" entry created in atm_clip_init() */
+	.open		= arp_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+	.owner		= THIS_MODULE
+};
+#endif
+
+static void atm_clip_exit_noproc(void);
+
+static int __init atm_clip_init(void)
+{
+	neigh_table_init_no_netlink(&clip_tbl);
+	clip_tbl_hook = &clip_tbl;
+	/* Set the timer up before the ioctl handler is registered: */
+	/* ATMARPD_CTRL reaches mod_timer(&idle_timer) via atm_init_atmarp(). */
+	setup_timer(&idle_timer, idle_timer_check, 0);
+	register_atm_ioctl(&clip_ioctl_ops);
+	register_netdevice_notifier(&clip_dev_notifier);
+	register_inetaddr_notifier(&clip_inet_notifier);
+
+#ifdef CONFIG_PROC_FS
+	{
+		struct proc_dir_entry *p;
+
+		p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops);
+		if (!p) {
+			pr_err("Unable to initialize /proc/net/atm/arp\n");
+			atm_clip_exit_noproc();	/* undo everything done above */
+			return -ENOMEM;
+		}
+	}
+#endif
+
+	return 0;
+}
+
+static void atm_clip_exit_noproc(void)
+{
+	struct net_device *dev, *next;
+	/* Teardown shared by module exit and the init error path (no proc entry). */
+	unregister_inetaddr_notifier(&clip_inet_notifier);
+	unregister_netdevice_notifier(&clip_dev_notifier);
+
+	deregister_atm_ioctl(&clip_ioctl_ops);
+
+	/* First, stop the idle timer, so it stops banging
+	 * on the table.
+	 */
+	del_timer_sync(&idle_timer);
+
+	/* Next, purge the table, so that the device
+	 * unregister loop below does not hang due to
+	 * device references remaining in the table.
+	 */
+	neigh_ifdown(&clip_tbl, NULL);
+
+	dev = clip_devs;
+	while (dev) {
+		next = PRIV(dev)->next;	/* fetch before dev is freed */
+		unregister_netdev(dev);
+		free_netdev(dev);
+		dev = next;
+	}
+
+	/* Now it is safe to fully shutdown whole table. */
+	neigh_table_clear(&clip_tbl);
+
+	clip_tbl_hook = NULL;
+}
+
+static void __exit atm_clip_exit(void)
+{
+	remove_proc_entry("arp", atm_proc_root);
+	/* Everything except the proc entry is undone by the shared helper. */
+	atm_clip_exit_noproc();
+}
+
+module_init(atm_clip_init);
+module_exit(atm_clip_exit);
+MODULE_AUTHOR("Werner Almesberger");
+MODULE_DESCRIPTION("Classical/IP over ATM interface");
+MODULE_LICENSE("GPL");
diff --git a/net/atm/common.c b/net/atm/common.c
new file mode 100644
index 00000000..22b963d0
--- /dev/null
+++ b/net/atm/common.c
@@ -0,0 +1,870 @@
+/* net/atm/common.c - ATM sockets (common part for PVC and SVC) */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/net.h>		/* struct socket, struct proto_ops */
+#include <linux/atm.h>		/* ATM stuff */
+#include <linux/atmdev.h>
+#include <linux/socket.h>	/* SOL_SOCKET */
+#include <linux/errno.h>	/* error codes */
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/time.h>		/* struct timeval */
+#include <linux/skbuff.h>
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/sock.h>		/* struct sock */
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+
+#include <asm/atomic.h>
+
+#include "resources.h"		/* atm_find_dev */
+#include "common.h"		/* prototypes */
+#include "protocols.h"		/* atm_init_<transport> */
+#include "addr.h"		/* address registry */
+#include "signaling.h"		/* for WAITING and sigd_attach */
+
+/* Hash table of VCC sockets; the bucket is vci & (VCC_HTABLE_SIZE - 1). */
+struct hlist_head vcc_hash[VCC_HTABLE_SIZE];
+EXPORT_SYMBOL(vcc_hash);
+
+/* Taken for writing around every insertion into / removal from vcc_hash. */
+DEFINE_RWLOCK(vcc_sklist_lock);
+EXPORT_SYMBOL(vcc_sklist_lock);
+
+/* Chain called by atm_dev_signal_change() when a device's signal changes. */
+static ATOMIC_NOTIFIER_HEAD(atm_dev_notify_chain);
+
+/*
+ * Hash @sk into vcc_hash according to its VCI and record the bucket in
+ * sk->sk_hash.  Takes no lock itself; the callers in this file hold
+ * vcc_sklist_lock for writing.
+ */
+static void __vcc_insert_socket(struct sock *sk)
+{
+	struct atm_vcc *vcc = atm_sk(sk);
+	int bucket = vcc->vci & (VCC_HTABLE_SIZE - 1);
+
+	sk->sk_hash = bucket;
+	sk_add_node(sk, &vcc_hash[bucket]);
+}
+
+/* Locked wrapper: insert @sk into vcc_hash under vcc_sklist_lock. */
+void vcc_insert_socket(struct sock *sk)
+{
+	write_lock_irq(&vcc_sklist_lock);
+	__vcc_insert_socket(sk);
+	write_unlock_irq(&vcc_sklist_lock);
+}
+EXPORT_SYMBOL(vcc_insert_socket);
+
+/* Unhash @sk from vcc_hash under vcc_sklist_lock. */
+static void vcc_remove_socket(struct sock *sk)
+{
+	write_lock_irq(&vcc_sklist_lock);
+	sk_del_node_init(sk);
+	write_unlock_irq(&vcc_sklist_lock);
+}
+
+/*
+ * Allocate a transmit buffer of @size bytes for @vcc and charge its
+ * truesize to the socket's sk_wmem_alloc.
+ *
+ * Returns NULL without allocating when write memory is already in use and
+ * atm_may_send() refuses another @size bytes.  Otherwise the allocation is
+ * retried, yielding the CPU in between, until it succeeds.
+ */
+static struct sk_buff *alloc_tx(struct atm_vcc *vcc, unsigned int size)
+{
+	struct sock *sk = sk_atm(vcc);
+	struct sk_buff *skb;
+
+	if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) {
+		pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n",
+			 sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
+		return NULL;
+	}
+
+	for (;;) {
+		skb = alloc_skb(size, GFP_KERNEL);
+		if (skb)
+			break;
+		schedule();
+	}
+
+	pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+	return skb;
+}
+
+/*
+ * Socket destructor (installed as sk->sk_destruct by vcc_create()).
+ * Only reports receive/write memory that is still accounted to the
+ * socket; it does not release anything itself.
+ */
+static void vcc_sock_destruct(struct sock *sk)
+{
+	if (atomic_read(&sk->sk_rmem_alloc))
+		printk(KERN_DEBUG "%s: rmem leakage (%d bytes) detected.\n",
+		       __func__, atomic_read(&sk->sk_rmem_alloc));
+
+	if (atomic_read(&sk->sk_wmem_alloc))
+		printk(KERN_DEBUG "%s: wmem leakage (%d bytes) detected.\n",
+		       __func__, atomic_read(&sk->sk_wmem_alloc));
+}
+
+/*
+ * Default sk_state_change callback (see vcc_create()): wake up anybody
+ * sleeping on the socket's wait queue.  sk->sk_wq is read under RCU.
+ */
+static void vcc_def_wakeup(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up(&wq->wait);
+	rcu_read_unlock();
+}
+
+/*
+ * Non-zero when one more maximum-size SDU on top of the write memory
+ * already accounted to @sk still fits within sk_sndbuf.
+ */
+static inline int vcc_writable(struct sock *sk)
+{
+	struct atm_vcc *vcc = atm_sk(sk);
+	int needed = vcc->qos.txtp.max_sdu + atomic_read(&sk->sk_wmem_alloc);
+
+	return needed <= sk->sk_sndbuf;
+}
+
+/*
+ * sk_write_space callback (see vcc_create()): once the socket is writable
+ * again, wake interruptible sleepers on its wait queue and send the
+ * asynchronous SOCK_WAKE_SPACE/POLL_OUT notification.
+ */
+static void vcc_write_space(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+
+	if (!vcc_writable(sk))
+		goto unlock;
+
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible(&wq->wait);
+
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+
+unlock:
+	rcu_read_unlock();
+}
+
+/*
+ * Protocol descriptor passed to sk_alloc() in vcc_create(); obj_size makes
+ * each allocated socket large enough to hold a struct atm_vcc.
+ */
+static struct proto vcc_proto = {
+	.name	  = "VCC",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct atm_vcc),
+};
+
+/*
+ * Allocate and initialise the sock/atm_vcc pair behind @sock.
+ *
+ * sock->sk is cleared first, so it is NULL on every failure path.
+ * Returns -EINVAL for SOCK_STREAM sockets, -ENOMEM when sk_alloc() fails
+ * and 0 on success.  @protocol is not used here.
+ */
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
+{
+	struct atm_vcc *vcc;
+	struct sock *sk;
+
+	sock->sk = NULL;
+	if (sock->type == SOCK_STREAM)
+		return -EINVAL;
+
+	sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	/* Generic socket state and the ATM-specific callbacks. */
+	sock_init_data(sock, sk);
+	sk->sk_state_change = vcc_def_wakeup;
+	sk->sk_write_space = vcc_write_space;
+	sk->sk_destruct = vcc_sock_destruct;
+	atomic_set(&sk->sk_wmem_alloc, 1);
+	atomic_set(&sk->sk_rmem_alloc, 0);
+
+	/* A fresh VCC: no device, no addresses, no handlers. */
+	vcc = atm_sk(sk);
+	vcc->dev = NULL;
+	memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc));
+	memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc));
+	vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
+	vcc->push = NULL;
+	vcc->pop = NULL;
+	vcc->push_oam = NULL;
+	vcc->vpi = 0;
+	vcc->vci = 0; /* no VCI/VPI yet */
+	vcc->atm_options = 0;
+	vcc->aal_options = 0;
+
+	return 0;
+}
+
+/*
+ * Tear down the VCC behind @sk.
+ *
+ * Marks the VCC as closing and no longer ready.  If it is attached to a
+ * device: call the driver's close operation (when provided), call the
+ * push handler with a NULL skb (when one is set), drop and uncharge every
+ * buffer left on the receive queue, and release the module and device
+ * references.  Finally the socket is removed from vcc_hash.
+ *
+ * vcc_release() calls this with the socket locked.
+ */
+static void vcc_destroy_socket(struct sock *sk)
+{
+	struct atm_vcc *vcc = atm_sk(sk);
+	struct sk_buff *skb;
+
+	set_bit(ATM_VF_CLOSE, &vcc->flags);
+	clear_bit(ATM_VF_READY, &vcc->flags);
+	if (vcc->dev) {
+		if (vcc->dev->ops->close)
+			vcc->dev->ops->close(vcc);
+		if (vcc->push)
+			vcc->push(vcc, NULL); /* atmarpd has no push */
+
+		/* Return the receive-memory charge of each discarded skb. */
+		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+			atm_return(vcc, skb->truesize);
+			kfree_skb(skb);
+		}
+
+		module_put(vcc->dev->ops->owner);
+		atm_dev_put(vcc->dev);
+	}
+
+	vcc_remove_socket(sk);
+}
+
+/*
+ * Release handler: destroy the VCC under the socket lock and drop the
+ * socket reference.  A socket without an attached sock is a no-op.
+ * Always returns 0.
+ */
+int vcc_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return 0;
+
+	lock_sock(sk);
+	vcc_destroy_socket(sock->sk);
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ * Flag @vcc as closed without tearing it down: set ATM_VF_CLOSE, shut down
+ * the receive side, record the error, clear ATM_VF_WAITING and run the
+ * socket's state-change callback.
+ *
+ * @reply is negated before being stored in sk_err, so callers pass a
+ * negative errno (atm_dev_release_vccs() passes -EPIPE).
+ */
+void vcc_release_async(struct atm_vcc *vcc, int reply)
+{
+	struct sock *sk = sk_atm(vcc);
+
+	set_bit(ATM_VF_CLOSE, &vcc->flags);
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	sk->sk_err = -reply;
+	clear_bit(ATM_VF_WAITING, &vcc->flags);
+	sk->sk_state_change(sk);
+}
+EXPORT_SYMBOL(vcc_release_async);
+
+/*
+ * Record a new signal state for @dev and notify atm_dev_notify_chain.
+ *
+ * A value outside [ATM_PHY_SIG_LOST, ATM_PHY_SIG_FOUND] triggers a warning
+ * but is still processed.  Nothing happens when @signal equals the state
+ * already stored in the device.
+ */
+void atm_dev_signal_change(struct atm_dev *dev, char signal)
+{
+	pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n",
+		__func__, signal, dev, dev->number, dev->signal);
+
+	/* atm driver sending invalid signal */
+	WARN_ON(signal < ATM_PHY_SIG_LOST || signal > ATM_PHY_SIG_FOUND);
+
+	if (dev->signal == signal)
+		return; /* no change */
+
+	dev->signal = signal;
+
+	atomic_notifier_call_chain(&atm_dev_notify_chain, signal, dev);
+}
+EXPORT_SYMBOL(atm_dev_signal_change);
+
+/*
+ * Asynchronously release every VCC bound to @dev: each one is flagged via
+ * vcc_release_async() with -EPIPE and removed from vcc_hash.  The whole
+ * table is walked with vcc_sklist_lock held for writing.
+ */
+void atm_dev_release_vccs(struct atm_dev *dev)
+{
+	int bucket;
+
+	write_lock_irq(&vcc_sklist_lock);
+	for (bucket = 0; bucket < VCC_HTABLE_SIZE; bucket++) {
+		struct hlist_node *node, *tmp;
+		struct sock *s;
+
+		sk_for_each_safe(s, node, tmp, &vcc_hash[bucket]) {
+			struct atm_vcc *vcc = atm_sk(s);
+
+			if (vcc->dev != dev)
+				continue;
+			vcc_release_async(vcc, -EPIPE);
+			sk_del_node_init(s);
+		}
+	}
+	write_unlock_irq(&vcc_sklist_lock);
+}
+EXPORT_SYMBOL(atm_dev_release_vccs);
+
+/*
+ * Fill in defaults for the traffic parameters @tp according to @aal.
+ *
+ * Parameters with no traffic class are left alone.  An unset max_sdu is
+ * replaced by the AAL's limit (an unrecognised AAL is warned about and
+ * treated like AAL5); a max_sdu above that limit yields -EINVAL.  An
+ * unset max_cdv becomes ATM_MAX_CDV.  Returns 0 otherwise.
+ */
+static int adjust_tp(struct atm_trafprm *tp, unsigned char aal)
+{
+	int limit;
+
+	if (!tp->traffic_class)
+		return 0;
+
+	switch (aal) {
+	case ATM_AAL0:
+		limit = ATM_CELL_SIZE-1;
+		break;
+	case ATM_AAL34:
+		limit = ATM_MAX_AAL34_PDU;
+		break;
+	default:
+		pr_warning("AAL problems ... (%d)\n", aal);
+		/* fall through */
+	case ATM_AAL5:
+		limit = ATM_MAX_AAL5_PDU;
+		break;
+	}
+
+	if (tp->max_sdu == 0)
+		tp->max_sdu = limit;
+	else if (tp->max_sdu > limit)
+		return -EINVAL;
+
+	if (tp->max_cdv == 0)
+		tp->max_cdv = ATM_MAX_CDV;
+
+	return 0;
+}
+
+/*
+ * Check whether @vpi/@vci may be used by @vcc on its device.
+ *
+ * VCCs with the same VPI/VCI are allowed iff they do not collide on TX or
+ * RX (such sharing may still be refused for other reasons, e.g. if the
+ * protocol requires both channels).  Returns -EADDRINUSE on a collision
+ * and 0 otherwise.
+ */
+static int check_ci(const struct atm_vcc *vcc, short vpi, int vci)
+{
+	struct hlist_head *bucket = &vcc_hash[vci & (VCC_HTABLE_SIZE - 1)];
+	struct hlist_node *node;
+	struct sock *s;
+
+	sk_for_each(s, node, bucket) {
+		struct atm_vcc *other = atm_sk(s);
+		int tx_clash, rx_clash;
+
+		if (other->dev != vcc->dev)
+			continue;
+		if (!test_bit(ATM_VF_ADDR, &other->flags))
+			continue;
+		if (other->vpi != vpi || other->vci != vci)
+			continue;
+
+		tx_clash = other->qos.txtp.traffic_class != ATM_NONE &&
+			   vcc->qos.txtp.traffic_class != ATM_NONE;
+		rx_clash = other->qos.rxtp.traffic_class != ATM_NONE &&
+			   vcc->qos.rxtp.traffic_class != ATM_NONE;
+		if (tx_clash || rx_clash)
+			return -EADDRINUSE;
+	}
+
+	return 0;
+}
+
+/*
+ * Choose the VPI/VCI pair for @vcc.
+ *
+ * When both *vpi and *vci are concrete values (neither ATM_VPI_ANY nor
+ * ATM_VCI_ANY) only check_ci() is consulted and its result returned.
+ * Otherwise the wildcard component(s) are searched, starting from the
+ * values left in the static p/c by the previous call, until check_ci()
+ * accepts a pair or the search is back where it started.
+ *
+ * On success the chosen pair is stored through @vpi/@vci and 0 is
+ * returned; -EADDRINUSE is returned when no pair is free.
+ *
+ * The caller in this file, __vcc_connect(), holds vcc_sklist_lock for
+ * writing across the call.
+ */
+static int find_ci(const struct atm_vcc *vcc, short *vpi, int *vci)
+{
+	static short p;        /* poor man's per-device cache */
+	static int c;
+	short old_p;
+	int old_c;
+	int err;
+
+	if (*vpi != ATM_VPI_ANY && *vci != ATM_VCI_ANY) {
+		err = check_ci(vcc, *vpi, *vci);
+		return err;
+	}
+	/* last scan may have left values out of bounds for current device */
+	if (*vpi != ATM_VPI_ANY)
+		p = *vpi;
+	else if (p >= 1 << vcc->dev->ci_range.vpi_bits)
+		p = 0;
+	if (*vci != ATM_VCI_ANY)
+		c = *vci;
+	else if (c < ATM_NOT_RSV_VCI || c >= 1 << vcc->dev->ci_range.vci_bits)
+			c = ATM_NOT_RSV_VCI;
+	/* Remember the starting point so a full wrap-around can be detected. */
+	old_p = p;
+	old_c = c;
+	do {
+		if (!check_ci(vcc, p, c)) {
+			*vpi = p;
+			*vci = c;
+			return 0;
+		}
+		/* Wildcard VCI: advance it, wrapping back to ATM_NOT_RSV_VCI. */
+		if (*vci == ATM_VCI_ANY) {
+			c++;
+			if (c >= 1 << vcc->dev->ci_range.vci_bits)
+				c = ATM_NOT_RSV_VCI;
+		}
+		/*
+		 * Wildcard VPI: advance it whenever the VCI has just wrapped
+		 * or is fixed, wrapping back to 0.
+		 */
+		if ((c == ATM_NOT_RSV_VCI || *vci != ATM_VCI_ANY) &&
+		    *vpi == ATM_VPI_ANY) {
+			p++;
+			if (p >= 1 << vcc->dev->ci_range.vpi_bits)
+				p = 0;
+		}
+	} while (old_p != p || old_c != c);
+	return -EADDRINUSE;
+}
+
+/*
+ * Bind @vcc to @dev with the given (possibly wildcard) @vpi/@vci and open
+ * it on the device.
+ *
+ * Steps: validate the identifiers against the device's ci_range, require
+ * CAP_NET_BIND_SERVICE for a VCI in (0, ATM_NOT_RSV_VCI), take a reference
+ * on the driver module, pick the final VPI/VCI and hash the socket under
+ * vcc_sklist_lock, initialise the AAL, fill in traffic parameter defaults
+ * and finally call the driver's open operation (when provided).
+ *
+ * Returns 0 on success or a negative errno.  On failure the socket is
+ * unhashed again (if it was hashed), the module reference is dropped and
+ * vcc->dev is reset to NULL.  The reference on @dev itself is not touched
+ * here; vcc_connect() drops it when this function fails.
+ */
+static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
+			 int vci)
+{
+	struct sock *sk = sk_atm(vcc);
+	int error;
+
+	/* A concrete value with bits above the device's range is invalid. */
+	if ((vpi != ATM_VPI_UNSPEC && vpi != ATM_VPI_ANY &&
+	    vpi >> dev->ci_range.vpi_bits) || (vci != ATM_VCI_UNSPEC &&
+	    vci != ATM_VCI_ANY && vci >> dev->ci_range.vci_bits))
+		return -EINVAL;
+	if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE))
+		return -EPERM;
+	error = -ENODEV;
+	if (!try_module_get(dev->ops->owner))
+		return error;
+	vcc->dev = dev;
+	write_lock_irq(&vcc_sklist_lock);
+	/* error is still -ENODEV here if the device has been removed. */
+	if (test_bit(ATM_DF_REMOVED, &dev->flags) ||
+	    (error = find_ci(vcc, &vpi, &vci))) {
+		write_unlock_irq(&vcc_sklist_lock);
+		goto fail_module_put;
+	}
+	vcc->vpi = vpi;
+	vcc->vci = vci;
+	__vcc_insert_socket(sk);
+	write_unlock_irq(&vcc_sklist_lock);
+	switch (vcc->qos.aal) {
+	case ATM_AAL0:
+		error = atm_init_aal0(vcc);
+		vcc->stats = &dev->stats.aal0;
+		break;
+	case ATM_AAL34:
+		error = atm_init_aal34(vcc);
+		vcc->stats = &dev->stats.aal34;
+		break;
+	case ATM_NO_AAL:
+		/* ATM_AAL5 is also used in the "0 for default" case */
+		vcc->qos.aal = ATM_AAL5;
+		/* fall through */
+	case ATM_AAL5:
+		error = atm_init_aal5(vcc);
+		vcc->stats = &dev->stats.aal5;
+		break;
+	default:
+		error = -EPROTOTYPE;
+	}
+	if (!error)
+		error = adjust_tp(&vcc->qos.txtp, vcc->qos.aal);
+	if (!error)
+		error = adjust_tp(&vcc->qos.rxtp, vcc->qos.aal);
+	if (error)
+		goto fail;
+	pr_debug("VCC %d.%d, AAL %d\n", vpi, vci, vcc->qos.aal);
+	pr_debug("  TX: %d, PCR %d..%d, SDU %d\n",
+		 vcc->qos.txtp.traffic_class,
+		 vcc->qos.txtp.min_pcr,
+		 vcc->qos.txtp.max_pcr,
+		 vcc->qos.txtp.max_sdu);
+	pr_debug("  RX: %d, PCR %d..%d, SDU %d\n",
+		 vcc->qos.rxtp.traffic_class,
+		 vcc->qos.rxtp.min_pcr,
+		 vcc->qos.rxtp.max_pcr,
+		 vcc->qos.rxtp.max_sdu);
+
+	if (dev->ops->open) {
+		error = dev->ops->open(vcc);
+		if (error)
+			goto fail;
+	}
+	return 0;
+
+fail:
+	/* The socket was hashed above; take it out again. */
+	vcc_remove_socket(sk);
+fail_module_put:
+	module_put(dev->ops->owner);
+	/* ensure we get dev module ref count correct */
+	vcc->dev = NULL;
+	return error;
+}
+
+/*
+ * Connect @sock to interface @itf using @vpi/@vci.
+ *
+ * Returns -EISCONN if the socket is already connected, -EINVAL if it is
+ * in any other state than unconnected, if both @vpi and @vci are zero, if
+ * an unspecified identifier is combined with a VCC already marked
+ * ATM_VF_PARTIAL, or if a traffic class is still ATM_ANYCLASS; -EBADFD if
+ * no QoS has been set; -ENODEV if no matching device is found; otherwise
+ * whatever __vcc_connect() returns.
+ *
+ * With ATM_ITF_ANY the first device on atm_devs is used.  The device
+ * reference obtained here is dropped again if __vcc_connect() fails.
+ */
+int vcc_connect(struct socket *sock, int itf, short vpi, int vci)
+{
+	struct atm_dev *dev;
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int error;
+
+	pr_debug("(vpi %d, vci %d)\n", vpi, vci);
+	if (sock->state == SS_CONNECTED)
+		return -EISCONN;
+	if (sock->state != SS_UNCONNECTED)
+		return -EINVAL;
+	if (!(vpi || vci))
+		return -EINVAL;
+
+	if (vpi != ATM_VPI_UNSPEC && vci != ATM_VCI_UNSPEC)
+		clear_bit(ATM_VF_PARTIAL, &vcc->flags);
+	else
+		if (test_bit(ATM_VF_PARTIAL, &vcc->flags))
+			return -EINVAL;
+	pr_debug("(TX: cl %d,bw %d-%d,sdu %d; "
+		 "RX: cl %d,bw %d-%d,sdu %d,AAL %s%d)\n",
+		 vcc->qos.txtp.traffic_class, vcc->qos.txtp.min_pcr,
+		 vcc->qos.txtp.max_pcr, vcc->qos.txtp.max_sdu,
+		 vcc->qos.rxtp.traffic_class, vcc->qos.rxtp.min_pcr,
+		 vcc->qos.rxtp.max_pcr, vcc->qos.rxtp.max_sdu,
+		 vcc->qos.aal == ATM_AAL5 ? "" :
+		 vcc->qos.aal == ATM_AAL0 ? "" : " ??? code ",
+		 vcc->qos.aal == ATM_AAL0 ? 0 : vcc->qos.aal);
+	if (!test_bit(ATM_VF_HASQOS, &vcc->flags))
+		return -EBADFD;
+	if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS ||
+	    vcc->qos.rxtp.traffic_class == ATM_ANYCLASS)
+		return -EINVAL;
+	if (likely(itf != ATM_ITF_ANY)) {
+		/* Look the device up; on a miss try loading its module. */
+		dev = try_then_request_module(atm_dev_lookup(itf),
+					      "atm-device-%d", itf);
+	} else {
+		/* Any interface: take the first registered device, if any. */
+		dev = NULL;
+		mutex_lock(&atm_dev_mutex);
+		if (!list_empty(&atm_devs)) {
+			dev = list_entry(atm_devs.next,
+					 struct atm_dev, dev_list);
+			atm_dev_hold(dev);
+		}
+		mutex_unlock(&atm_dev_mutex);
+	}
+	if (!dev)
+		return -ENODEV;
+	error = __vcc_connect(vcc, dev, vpi, vci);
+	if (error) {
+		atm_dev_put(dev);
+		return error;
+	}
+	if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC)
+		set_bit(ATM_VF_PARTIAL, &vcc->flags);
+	if (test_bit(ATM_VF_READY, &ATM_SD(sock)->flags))
+		sock->state = SS_CONNECTED;
+	return 0;
+}
+
+/*
+ * Receive one datagram from a connected VCC socket.
+ *
+ * Only MSG_DONTWAIT is accepted in @flags (-EOPNOTSUPP otherwise).
+ * Returns -ENOTCONN if the socket is not connected, 0 if the VCC has been
+ * released or closed or is not ready, the error reported by
+ * skb_recv_datagram() or by the copy to user space, and otherwise the
+ * number of bytes copied.  A datagram longer than @size is truncated and
+ * MSG_TRUNC is set in msg->msg_flags.
+ */
+int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc;
+	struct sk_buff *skb;
+	int copied, error = -EINVAL;
+
+	if (sock->state != SS_CONNECTED)
+		return -ENOTCONN;
+	if (flags & ~MSG_DONTWAIT)		/* only handle MSG_DONTWAIT */
+		return -EOPNOTSUPP;
+	vcc = ATM_SD(sock);
+	if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+	    test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+	    !test_bit(ATM_VF_READY, &vcc->flags))
+		return 0;
+
+	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error);
+	if (!skb)
+		return error;
+
+	copied = skb->len;
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (!error)
+		sock_recv_ts_and_drops(msg, sk, skb);
+
+	/*
+	 * The skb has already been taken off the receive queue, so it must
+	 * be uncharged and freed whether or not the copy succeeded;
+	 * returning early on a copy error would leak it.
+	 */
+	pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc), skb->truesize);
+	atm_return(vcc, skb->truesize);
+	skb_free_datagram(sk, skb);
+
+	return error ? error : copied;
+}
+
+/*
+ * Send one datagram on a connected VCC socket.
+ *
+ * The message must consist of exactly one iovec (-ENOSYS otherwise) and
+ * must not carry a destination address (-EISCONN).  Returns -ENOTCONN if
+ * the socket is not connected, -EPIPE (and raises SIGPIPE) if the VCC is
+ * released, closed or not ready, 0 for an empty message, -EMSGSIZE if the
+ * length is negative or exceeds the TX max_sdu, -EAGAIN if no buffer is
+ * available and MSG_DONTWAIT is set, -ERESTARTSYS if a signal arrives
+ * while waiting, -EFAULT if the user buffer cannot be read, the driver's
+ * error if its send operation fails, and otherwise the message length.
+ *
+ * The socket lock is held for the whole call, including the wait for
+ * buffer space.
+ */
+int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+		size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	DEFINE_WAIT(wait);
+	struct atm_vcc *vcc;
+	struct sk_buff *skb;
+	int eff, error;
+	const void __user *buff;
+	int size;
+
+	lock_sock(sk);
+	if (sock->state != SS_CONNECTED) {
+		error = -ENOTCONN;
+		goto out;
+	}
+	if (m->msg_name) {
+		error = -EISCONN;
+		goto out;
+	}
+	if (m->msg_iovlen != 1) {
+		error = -ENOSYS; /* fix this later @@@ */
+		goto out;
+	}
+	buff = m->msg_iov->iov_base;
+	size = m->msg_iov->iov_len;
+	vcc = ATM_SD(sock);
+	if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+	    test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+	    !test_bit(ATM_VF_READY, &vcc->flags)) {
+		error = -EPIPE;
+		send_sig(SIGPIPE, current, 0);
+		goto out;
+	}
+	if (!size) {
+		error = 0;
+		goto out;
+	}
+	if (size < 0 || size > vcc->qos.txtp.max_sdu) {
+		error = -EMSGSIZE;
+		goto out;
+	}
+
+	eff = (size+3) & ~3; /* align to word boundary */
+	/*
+	 * Queue ourselves on the wait queue before each allocation attempt
+	 * so that a wake-up arriving between a failed attempt and
+	 * schedule() is not lost.
+	 */
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	error = 0;
+	while (!(skb = alloc_tx(vcc, eff))) {
+		if (m->msg_flags & MSG_DONTWAIT) {
+			error = -EAGAIN;
+			break;
+		}
+		schedule();
+		if (signal_pending(current)) {
+			error = -ERESTARTSYS;
+			break;
+		}
+		/* The VCC may have gone away while we slept. */
+		if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+		    test_bit(ATM_VF_CLOSE, &vcc->flags) ||
+		    !test_bit(ATM_VF_READY, &vcc->flags)) {
+			error = -EPIPE;
+			send_sig(SIGPIPE, current, 0);
+			break;
+		}
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (error)
+		goto out;
+	skb->dev = NULL; /* for paths shared with net_device interfaces */
+	ATM_SKB(skb)->atm_options = vcc->atm_options;
+	/*
+	 * NOTE(review): alloc_tx() charged skb->truesize to sk_wmem_alloc;
+	 * no matching uncharge is visible on this failure path - confirm
+	 * whether kfree_skb() alone returns the charge.
+	 */
+	if (copy_from_user(skb_put(skb, size), buff, size)) {
+		kfree_skb(skb);
+		error = -EFAULT;
+		goto out;
+	}
+	/* Zero the padding added by the word alignment above. */
+	if (eff != size)
+		memset(skb->data + size, 0, eff-size);
+	error = vcc->dev->ops->send(vcc, skb);
+	error = error ? error : size;
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * poll() support for VCC sockets.
+ *
+ * Reports POLLERR when an error is pending, POLLHUP when the VCC has been
+ * released or closed, POLLIN | POLLRDNORM when the receive queue is not
+ * empty, and the POLLOUT family when the VCC has a TX traffic class and
+ * vcc_writable() holds.  While a connect is still waiting, writability is
+ * never reported.
+ */
+unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc;
+	unsigned int mask = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	vcc = ATM_SD(sock);
+
+	/* exceptional events */
+	if (sk->sk_err)
+		mask |= POLLERR;
+	if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+	    test_bit(ATM_VF_CLOSE, &vcc->flags))
+		mask |= POLLHUP;
+
+	/* readable? */
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	/* a connect that is still pending is never writable */
+	if (sock->state == SS_CONNECTING &&
+	    test_bit(ATM_VF_WAITING, &vcc->flags))
+		return mask;
+
+	/* writable? */
+	if (vcc->qos.txtp.traffic_class != ATM_NONE && vcc_writable(sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+}
+
+/*
+ * Apply new QoS parameters @qos to an already connected @vcc.
+ *
+ * Returns -EINVAL if @qos would change the AAL or either traffic class,
+ * the error from adjust_tp() if the parameters cannot be adjusted, and
+ * -EOPNOTSUPP if the driver has no change_qos operation.  Otherwise the
+ * change is handed to the driver directly for AF_ATMPVC sockets and to
+ * svc_change_qos() for all others.
+ *
+ * Note that adjust_tp() may fill in defaults in the caller's @qos.
+ */
+static int atm_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
+{
+	int error;
+
+	/*
+	 * Don't let the QoS change the already connected AAL type nor the
+	 * traffic class.
+	 */
+	if (qos->aal != vcc->qos.aal ||
+	    qos->rxtp.traffic_class != vcc->qos.rxtp.traffic_class ||
+	    qos->txtp.traffic_class != vcc->qos.txtp.traffic_class)
+		return -EINVAL;
+	error = adjust_tp(&qos->txtp, qos->aal);
+	if (!error)
+		error = adjust_tp(&qos->rxtp, qos->aal);
+	if (error)
+		return error;
+	if (!vcc->dev->ops->change_qos)
+		return -EOPNOTSUPP;
+	if (sk_atm(vcc)->sk_family == AF_ATMPVC)
+		return vcc->dev->ops->change_qos(vcc, qos, ATM_MF_SET);
+	return svc_change_qos(vcc, qos);
+}
+
+/*
+ * Sanity-check one direction's traffic parameters.
+ *
+ * Parameters with no traffic class or with ATM_ANYCLASS always pass.
+ * Otherwise -EINVAL is returned when a non-UBR class specifies no cell
+ * rate at all, when min_pcr is ATM_MAX_PCR, or when min_pcr exceeds a
+ * max_pcr that is set and is not ATM_MAX_PCR.  Returns 0 otherwise.
+ */
+static int check_tp(const struct atm_trafprm *tp)
+{
+	int no_rate_given;
+
+	/* @@@ Should be merged with adjust_tp */
+	if (!tp->traffic_class || tp->traffic_class == ATM_ANYCLASS)
+		return 0;
+
+	no_rate_given = !tp->min_pcr && !tp->pcr && !tp->max_pcr;
+	if (tp->traffic_class != ATM_UBR && no_rate_given)
+		return -EINVAL;
+
+	if (tp->min_pcr == ATM_MAX_PCR)
+		return -EINVAL;
+
+	/*
+	 * We allow pcr to be outside [min_pcr,max_pcr], because later
+	 * adjustment may still push it in the valid range.
+	 */
+	if (tp->min_pcr && tp->max_pcr && tp->max_pcr != ATM_MAX_PCR &&
+	    tp->min_pcr > tp->max_pcr)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Validate a complete QoS specification.
+ *
+ * At least one direction needs a traffic class, and when both directions
+ * have a specific class (neither unset nor ATM_ANYCLASS) the two classes
+ * must be equal.  Each direction is then checked with check_tp().
+ * Returns 0 if everything is acceptable, -EINVAL otherwise.
+ */
+static int check_qos(const struct atm_qos *qos)
+{
+	const struct atm_trafprm *tx = &qos->txtp;
+	const struct atm_trafprm *rx = &qos->rxtp;
+	int error;
+
+	if (!tx->traffic_class && !rx->traffic_class)
+		return -EINVAL;
+
+	if (tx->traffic_class != rx->traffic_class &&
+	    tx->traffic_class && rx->traffic_class &&
+	    tx->traffic_class != ATM_ANYCLASS &&
+	    rx->traffic_class != ATM_ANYCLASS)
+		return -EINVAL;
+
+	error = check_tp(tx);
+	return error ? error : check_tp(rx);
+}
+
+/*
+ * setsockopt() for VCC sockets.
+ *
+ * SO_ATMQOS validates the supplied QoS; on a connected socket it is
+ * applied through atm_change_qos(), on an unconnected one it is stored in
+ * the VCC and ATM_VF_HASQOS is set (-EBADFD in any other state).
+ * SO_SETCLP sets or clears ATM_ATMOPT_CLP in the VCC's atm_options.
+ * Other options at SOL_SOCKET level yield -EINVAL; everything else is
+ * passed to the driver's setsockopt operation, or fails with -EINVAL when
+ * there is no device or no such operation.
+ */
+int vcc_setsockopt(struct socket *sock, int level, int optname,
+		   char __user *optval, unsigned int optlen)
+{
+	struct atm_vcc *vcc;
+	unsigned long value;
+	int error;
+
+	/* Options at this level encode their expected size; enforce it. */
+	if (__SO_LEVEL_MATCH(optname, level) && optlen != __SO_SIZE(optname))
+		return -EINVAL;
+
+	vcc = ATM_SD(sock);
+	switch (optname) {
+	case SO_ATMQOS:
+	{
+		struct atm_qos qos;
+
+		if (copy_from_user(&qos, optval, sizeof(qos)))
+			return -EFAULT;
+		error = check_qos(&qos);
+		if (error)
+			return error;
+		if (sock->state == SS_CONNECTED)
+			return atm_change_qos(vcc, &qos);
+		if (sock->state != SS_UNCONNECTED)
+			return -EBADFD;
+		vcc->qos = qos;
+		set_bit(ATM_VF_HASQOS, &vcc->flags);
+		return 0;
+	}
+	case SO_SETCLP:
+		if (get_user(value, (unsigned long __user *)optval))
+			return -EFAULT;
+		if (value)
+			vcc->atm_options |= ATM_ATMOPT_CLP;
+		else
+			vcc->atm_options &= ~ATM_ATMOPT_CLP;
+		return 0;
+	default:
+		if (level == SOL_SOCKET)
+			return -EINVAL;
+		break;
+	}
+	if (!vcc->dev || !vcc->dev->ops->setsockopt)
+		return -EINVAL;
+	return vcc->dev->ops->setsockopt(vcc, level, optname, optval, optlen);
+}
+
+/*
+ * getsockopt() for VCC sockets.
+ *
+ * SO_ATMQOS returns the stored QoS (-EINVAL if none has been set),
+ * SO_SETCLP returns 1 or 0 depending on ATM_ATMOPT_CLP, and SO_ATMPVC
+ * returns the PVC address of a VCC that has a device and an address
+ * (-ENOTCONN otherwise).  Other options at SOL_SOCKET level yield
+ * -EINVAL; everything else is passed to the driver's getsockopt
+ * operation, or fails with -EINVAL when there is no device or no such
+ * operation.  -EFAULT is returned when user memory cannot be accessed.
+ */
+int vcc_getsockopt(struct socket *sock, int level, int optname,
+		   char __user *optval, int __user *optlen)
+{
+	struct atm_vcc *vcc;
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	/* Options at this level encode their expected size; enforce it. */
+	if (__SO_LEVEL_MATCH(optname, level) && len != __SO_SIZE(optname))
+		return -EINVAL;
+
+	vcc = ATM_SD(sock);
+	switch (optname) {
+	case SO_ATMQOS:
+		if (!test_bit(ATM_VF_HASQOS, &vcc->flags))
+			return -EINVAL;
+		return copy_to_user(optval, &vcc->qos, sizeof(vcc->qos))
+			? -EFAULT : 0;
+	case SO_SETCLP:
+		return put_user(vcc->atm_options & ATM_ATMOPT_CLP ? 1 : 0,
+				(unsigned long __user *)optval) ? -EFAULT : 0;
+	case SO_ATMPVC:
+	{
+		struct sockaddr_atmpvc pvc;
+
+		if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
+			return -ENOTCONN;
+		/*
+		 * The whole structure is copied to user space, so clear it
+		 * first: padding and any member not assigned below must
+		 * not expose stale kernel stack contents.
+		 */
+		memset(&pvc, 0, sizeof(pvc));
+		pvc.sap_family = AF_ATMPVC;
+		pvc.sap_addr.itf = vcc->dev->number;
+		pvc.sap_addr.vpi = vcc->vpi;
+		pvc.sap_addr.vci = vcc->vci;
+		return copy_to_user(optval, &pvc, sizeof(pvc)) ? -EFAULT : 0;
+	}
+	default:
+		if (level == SOL_SOCKET)
+			return -EINVAL;
+		break;
+	}
+	if (!vcc->dev || !vcc->dev->ops->getsockopt)
+		return -EINVAL;
+	return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len);
+}
+
+/*
+ * Add @nb to the chain that atm_dev_signal_change() calls.  Returns the
+ * result of atomic_notifier_chain_register().
+ */
+int register_atmdevice_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_atmdevice_notifier);
+
+/* Remove @nb from the ATM device notifier chain. */
+void unregister_atmdevice_notifier(struct notifier_block *nb)
+{
+	atomic_notifier_chain_unregister(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_atmdevice_notifier);
+
+/*
+ * Subsystem initialisation: register the VCC protocol, then bring up the
+ * PVC and SVC socket families, the proc interface and the sysfs
+ * interface, in that order.
+ *
+ * Returns 0 on success.  On failure the steps that already succeeded are
+ * undone in reverse order and the negative error code of the failing step
+ * is returned.
+ */
+static int __init atm_init(void)
+{
+	int error;
+
+	error = proto_register(&vcc_proto, 0);
+	if (error < 0)
+		goto out;
+	error = atmpvc_init();
+	if (error < 0) {
+		pr_err("atmpvc_init() failed with %d\n", error);
+		goto out_unregister_vcc_proto;
+	}
+	error = atmsvc_init();
+	if (error < 0) {
+		pr_err("atmsvc_init() failed with %d\n", error);
+		goto out_atmpvc_exit;
+	}
+	error = atm_proc_init();
+	if (error < 0) {
+		pr_err("atm_proc_init() failed with %d\n", error);
+		goto out_atmsvc_exit;
+	}
+	error = atm_sysfs_init();
+	if (error < 0) {
+		pr_err("atm_sysfs_init() failed with %d\n", error);
+		goto out_atmproc_exit;
+	}
+out:
+	return error;
+out_atmproc_exit:
+	atm_proc_exit();
+out_atmsvc_exit:
+	atmsvc_exit();
+out_atmpvc_exit:
+	/* Undo atmpvc_init(); SVC has been handled above where needed. */
+	atmpvc_exit();
+out_unregister_vcc_proto:
+	proto_unregister(&vcc_proto);
+	goto out;
+}
+
+/*
+ * Subsystem teardown: shut down the proc and sysfs interfaces and the
+ * SVC and PVC socket families, then unregister the VCC protocol.
+ */
+static void __exit atm_exit(void)
+{
+	atm_proc_exit();
+	atm_sysfs_exit();
+	atmsvc_exit();
+	atmpvc_exit();
+	proto_unregister(&vcc_proto);
+}
+
+/* atm_init() is registered as a subsystem-level initcall. */
+subsys_initcall(atm_init);
+
+module_exit(atm_exit);
+
+MODULE_LICENSE("GPL");
+/* Module aliases for the two ATM socket families. */
+MODULE_ALIAS_NETPROTO(PF_ATMPVC);
+MODULE_ALIAS_NETPROTO(PF_ATMSVC);
diff --git a/net/atm/common.h b/net/atm/common.h
new file mode 100644
index 00000000..f48a76b6
--- /dev/null
+++ b/net/atm/common.h
@@ -0,0 +1,55 @@
+/* net/atm/common.h - ATM sockets (common part for PVC and SVC) */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_COMMON_H
+#define NET_ATM_COMMON_H
+
+#include <linux/net.h>
+#include <linux/poll.h> /* for poll_table */
+
+
+/* Socket operations shared by the PVC and SVC socket families. */
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family);
+int vcc_release(struct socket *sock);
+int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
+int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int flags);
+int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+		size_t total_len);
+unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
+int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int vcc_setsockopt(struct socket *sock, int level, int optname,
+		   char __user *optval, unsigned int optlen);
+int vcc_getsockopt(struct socket *sock, int level, int optname,
+		   char __user *optval, int __user *optlen);
+
+/* Init/exit hooks of the individual ATM parts. */
+int atmpvc_init(void);
+void atmpvc_exit(void);
+int atmsvc_init(void);
+void atmsvc_exit(void);
+int atm_sysfs_init(void);
+void atm_sysfs_exit(void);
+
+/* Without CONFIG_PROC_FS the proc hooks become no-op stubs. */
+#ifdef CONFIG_PROC_FS
+int atm_proc_init(void);
+void atm_proc_exit(void);
+#else
+static inline int atm_proc_init(void)
+{
+	return 0;
+}
+
+static inline void atm_proc_exit(void)
+{
+	/* nothing */
+}
+#endif /* CONFIG_PROC_FS */
+
+/* SVC */
+int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos);
+
+/* Asynchronously release every VCC bound to @dev. */
+void atm_dev_release_vccs(struct atm_dev *dev);
+
+#endif
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
new file mode 100644
index 00000000..62dc8bfe
--- /dev/null
+++ b/net/atm/ioctl.c
@@ -0,0 +1,371 @@
+/* ATM ioctl handling */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+/* 2003 John Levon  <levon@movementarian.org> */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/net.h>		/* struct socket, struct proto_ops */
+#include <linux/atm.h>		/* ATM stuff */
+#include <linux/atmdev.h>
+#include <linux/atmclip.h>	/* CLIP_*ENCAP */
+#include <linux/atmarp.h>	/* manifest constants */
+#include <linux/capability.h>
+#include <linux/sonet.h>	/* for ioctls */
+#include <linux/atmsvc.h>
+#include <linux/atmmpc.h>
+#include <net/atmclip.h>
+#include <linux/atmlec.h>
+#include <linux/mutex.h>
+#include <asm/ioctls.h>
+#include <net/compat.h>
+
+#include "resources.h"
+#include "signaling.h"		/* for WAITING and sigd_attach */
+#include "common.h"
+
+
+/* List of registered struct atm_ioctl handlers, protected by ioctl_mutex. */
+static DEFINE_MUTEX(ioctl_mutex);
+static LIST_HEAD(ioctl_list);
+
+
+/* Append @ioctl to the list of handlers consulted by do_vcc_ioctl(). */
+void register_atm_ioctl(struct atm_ioctl *ioctl)
+{
+	mutex_lock(&ioctl_mutex);
+	list_add_tail(&ioctl->list, &ioctl_list);
+	mutex_unlock(&ioctl_mutex);
+}
+EXPORT_SYMBOL(register_atm_ioctl);
+
+/* Remove @ioctl from the list of registered handlers. */
+void deregister_atm_ioctl(struct atm_ioctl *ioctl)
+{
+	mutex_lock(&ioctl_mutex);
+	list_del(&ioctl->list);
+	mutex_unlock(&ioctl_mutex);
+}
+EXPORT_SYMBOL(deregister_atm_ioctl);
+
+/*
+ * Common ioctl handler for VCC sockets; @compat is non-zero when the
+ * caller is a 32-bit task.
+ *
+ * Generic commands (queue sizes, timestamps, ATM_SETSC, ATMSIGD_CTRL) are
+ * handled here directly.  For commands that belong to a loadable backend
+ * the corresponding module is requested first.  Everything not handled
+ * directly is offered to the handlers on ioctl_list, in order, until one
+ * returns something other than -ENOIOCTLCMD; if none does, the command is
+ * passed on to atm_dev_ioctl().
+ *
+ * Returns 0 or a negative errno.
+ */
+static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
+			unsigned long arg, int compat)
+{
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc;
+	int error;
+	struct list_head *pos;
+	void __user *argp = (void __user *)arg;
+
+	vcc = ATM_SD(sock);
+	switch (cmd) {
+	case SIOCOUTQ:
+		if (sock->state != SS_CONNECTED ||
+		    !test_bit(ATM_VF_READY, &vcc->flags)) {
+			error =  -EINVAL;
+			goto done;
+		}
+		error = put_user(sk->sk_sndbuf - sk_wmem_alloc_get(sk),
+				 (int __user *)argp) ? -EFAULT : 0;
+		goto done;
+	case SIOCINQ:
+	{
+		struct sk_buff *skb;
+		int amount;
+
+		if (sock->state != SS_CONNECTED) {
+			error = -EINVAL;
+			goto done;
+		}
+		/*
+		 * Hold the queue lock while looking at the head skb so
+		 * that a concurrent receiver cannot dequeue and free it
+		 * between the peek and the read of its length.
+		 */
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		skb = skb_peek(&sk->sk_receive_queue);
+		amount = skb ? skb->len : 0;
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		error = put_user(amount, (int __user *)argp) ? -EFAULT : 0;
+		goto done;
+	}
+	case SIOCGSTAMP: /* borrowed from IP */
+#ifdef CONFIG_COMPAT
+		if (compat)
+			error = compat_sock_get_timestamp(sk, argp);
+		else
+#endif
+			error = sock_get_timestamp(sk, argp);
+		goto done;
+	case SIOCGSTAMPNS: /* borrowed from IP */
+#ifdef CONFIG_COMPAT
+		if (compat)
+			error = compat_sock_get_timestampns(sk, argp);
+		else
+#endif
+			error = sock_get_timestampns(sk, argp);
+		goto done;
+	case ATM_SETSC:
+		if (net_ratelimit())
+			pr_warning("ATM_SETSC is obsolete; used by %s:%d\n",
+				   current->comm, task_pid_nr(current));
+		error = 0;
+		goto done;
+	case ATMSIGD_CTRL:
+		if (!capable(CAP_NET_ADMIN)) {
+			error = -EPERM;
+			goto done;
+		}
+		/*
+		 * The user/kernel protocol for exchanging signalling
+		 * info uses kernel pointers as opaque references,
+		 * so the holder of the file descriptor can scribble
+		 * on the kernel... so we should make sure that we
+		 * have the same privileges that /proc/kcore needs
+		 */
+		if (!capable(CAP_SYS_RAWIO)) {
+			error = -EPERM;
+			goto done;
+		}
+#ifdef CONFIG_COMPAT
+		/* WTF? I don't even want to _think_ about making this
+		   work for 32-bit userspace. TBH I don't really want
+		   to think about it at all. dwmw2. */
+		if (compat) {
+			if (net_ratelimit())
+				pr_warning("32-bit task cannot be atmsigd\n");
+			error = -EINVAL;
+			goto done;
+		}
+#endif
+		error = sigd_attach(vcc);
+		if (!error)
+			sock->state = SS_CONNECTED;
+		goto done;
+	case ATM_SETBACKEND:
+	case ATM_NEWBACKENDIF:
+	{
+		atm_backend_t backend;
+		error = get_user(backend, (atm_backend_t __user *)argp);
+		if (error)
+			goto done;
+		/* Make sure the backend's module is loaded before dispatch. */
+		switch (backend) {
+		case ATM_BACKEND_PPP:
+			request_module("pppoatm");
+			break;
+		case ATM_BACKEND_BR2684:
+			request_module("br2684");
+			break;
+		}
+		break;
+	}
+	case ATMMPC_CTRL:
+	case ATMMPC_DATA:
+		request_module("mpoa");
+		break;
+	case ATMARPD_CTRL:
+		request_module("clip");
+		break;
+	case ATMLEC_CTRL:
+		request_module("lec");
+		break;
+	}
+
+	error = -ENOIOCTLCMD;
+
+	/* Offer the command to each registered handler in turn. */
+	mutex_lock(&ioctl_mutex);
+	list_for_each(pos, &ioctl_list) {
+		struct atm_ioctl *ic = list_entry(pos, struct atm_ioctl, list);
+		if (try_module_get(ic->owner)) {
+			error = ic->ioctl(sock, cmd, arg);
+			module_put(ic->owner);
+			if (error != -ENOIOCTLCMD)
+				break;
+		}
+	}
+	mutex_unlock(&ioctl_mutex);
+
+	if (error != -ENOIOCTLCMD)
+		goto done;
+
+	error = atm_dev_ioctl(cmd, argp, compat);
+
+done:
+	return error;
+}
+
+/* Native ioctl entry point: do_vcc_ioctl() with compat handling disabled. */
+int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	return do_vcc_ioctl(sock, cmd, arg, 0);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * FIXME:
+ * The compat_ioctl handling is duplicated, using both these conversion
+ * routines and the compat argument to the actual handlers. Both
+ * versions are somewhat incomplete and should be merged, e.g. by
+ * moving the ioctl number translation into the actual handlers and
+ * killing the conversion code.
+ *
+ * -arnd, November 2009
+ */
+/*
+ * ioctl numbers built from the compat (32-bit) argument structures; they
+ * are mapped to the native commands through atm_ioctl_map below.
+ */
+#define ATM_GETLINKRATE32 _IOW('a', ATMIOC_ITF+1, struct compat_atmif_sioc)
+#define ATM_GETNAMES32    _IOW('a', ATMIOC_ITF+3, struct compat_atm_iobuf)
+#define ATM_GETTYPE32     _IOW('a', ATMIOC_ITF+4, struct compat_atmif_sioc)
+#define ATM_GETESI32	  _IOW('a', ATMIOC_ITF+5, struct compat_atmif_sioc)
+#define ATM_GETADDR32	  _IOW('a', ATMIOC_ITF+6, struct compat_atmif_sioc)
+#define ATM_RSTADDR32	  _IOW('a', ATMIOC_ITF+7, struct compat_atmif_sioc)
+#define ATM_ADDADDR32	  _IOW('a', ATMIOC_ITF+8, struct compat_atmif_sioc)
+#define ATM_DELADDR32	  _IOW('a', ATMIOC_ITF+9, struct compat_atmif_sioc)
+#define ATM_GETCIRANGE32  _IOW('a', ATMIOC_ITF+10, struct compat_atmif_sioc)
+#define ATM_SETCIRANGE32  _IOW('a', ATMIOC_ITF+11, struct compat_atmif_sioc)
+#define ATM_SETESI32      _IOW('a', ATMIOC_ITF+12, struct compat_atmif_sioc)
+#define ATM_SETESIF32     _IOW('a', ATMIOC_ITF+13, struct compat_atmif_sioc)
+#define ATM_GETSTAT32     _IOW('a', ATMIOC_SARCOM+0, struct compat_atmif_sioc)
+#define ATM_GETSTATZ32    _IOW('a', ATMIOC_SARCOM+1, struct compat_atmif_sioc)
+#define ATM_GETLOOP32	  _IOW('a', ATMIOC_SARCOM+2, struct compat_atmif_sioc)
+#define ATM_SETLOOP32	  _IOW('a', ATMIOC_SARCOM+3, struct compat_atmif_sioc)
+#define ATM_QUERYLOOP32	  _IOW('a', ATMIOC_SARCOM+4, struct compat_atmif_sioc)
+
+/*
+ * Translation table from 32-bit ioctl numbers (cmd32) to their native
+ * equivalents (cmd); searched linearly by do_atm_ioctl().
+ */
+static struct {
+	unsigned int cmd32;
+	unsigned int cmd;
+} atm_ioctl_map[] = {
+	{ ATM_GETLINKRATE32, ATM_GETLINKRATE },
+	{ ATM_GETNAMES32,    ATM_GETNAMES },
+	{ ATM_GETTYPE32,     ATM_GETTYPE },
+	{ ATM_GETESI32,	     ATM_GETESI },
+	{ ATM_GETADDR32,     ATM_GETADDR },
+	{ ATM_RSTADDR32,     ATM_RSTADDR },
+	{ ATM_ADDADDR32,     ATM_ADDADDR },
+	{ ATM_DELADDR32,     ATM_DELADDR },
+	{ ATM_GETCIRANGE32,  ATM_GETCIRANGE },
+	{ ATM_SETCIRANGE32,  ATM_SETCIRANGE },
+	{ ATM_SETESI32,	     ATM_SETESI },
+	{ ATM_SETESIF32,     ATM_SETESIF },
+	{ ATM_GETSTAT32,     ATM_GETSTAT },
+	{ ATM_GETSTATZ32,    ATM_GETSTATZ },
+	{ ATM_GETLOOP32,     ATM_GETLOOP },
+	{ ATM_SETLOOP32,     ATM_SETLOOP },
+	{ ATM_QUERYLOOP32,   ATM_QUERYLOOP },
+};
+
+/* Number of entries in atm_ioctl_map. */
+#define NR_ATM_IOCTL ARRAY_SIZE(atm_ioctl_map)
+
+/*
+ * Compat wrapper for ioctls whose argument is a struct atm_iobuf.
+ *
+ * Builds a native struct atm_iobuf in user space from the 32-bit one at
+ * @arg (length copied, buffer pointer widened with compat_ptr()), runs the
+ * native handler on it and, on success, copies the resulting length back
+ * into the 32-bit structure.  Returns the handler's result, or -EFAULT
+ * when user memory cannot be accessed.
+ */
+static int do_atm_iobuf(struct socket *sock, unsigned int cmd,
+			unsigned long arg)
+{
+	struct atm_iobuf __user *iobuf;
+	struct compat_atm_iobuf __user *iobuf32;
+	u32 data;
+	void __user *datap;
+	int len, err;
+
+	iobuf = compat_alloc_user_space(sizeof(*iobuf));
+	iobuf32 = compat_ptr(arg);
+
+	if (get_user(len, &iobuf32->length) ||
+	    get_user(data, &iobuf32->buffer))
+		return -EFAULT;
+	datap = compat_ptr(data);
+	if (put_user(len, &iobuf->length) ||
+	    put_user(datap, &iobuf->buffer))
+		return -EFAULT;
+
+	err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0);
+
+	if (!err) {
+		if (copy_in_user(&iobuf32->length, &iobuf->length,
+				 sizeof(int)))
+			err = -EFAULT;
+	}
+
+	return err;
+}
+
+/*
+ * Compat wrapper for ioctls whose argument is a struct atmif_sioc.
+ *
+ * Builds a native struct atmif_sioc in user space from the 32-bit one at
+ * @arg, runs the native handler on it and, on success, copies the
+ * resulting length back into the 32-bit structure.  Returns the handler's
+ * result, or -EFAULT when user memory cannot be accessed.
+ */
+static int do_atmif_sioc(struct socket *sock, unsigned int cmd,
+			 unsigned long arg)
+{
+	struct atmif_sioc __user *sioc;
+	struct compat_atmif_sioc __user *sioc32;
+	u32 data;
+	void __user *datap;
+	int err;
+
+	sioc = compat_alloc_user_space(sizeof(*sioc));
+	sioc32 = compat_ptr(arg);
+
+	/*
+	 * Copies two ints starting at ->number in one go - presumably the
+	 * number and length members; verify against the struct layout.
+	 */
+	if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
+	    get_user(data, &sioc32->arg))
+		return -EFAULT;
+	datap = compat_ptr(data);
+	if (put_user(datap, &sioc->arg))
+		return -EFAULT;
+
+	err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0);
+
+	if (!err) {
+		if (copy_in_user(&sioc32->length, &sioc->length,
+				 sizeof(int)))
+			err = -EFAULT;
+	}
+	return err;
+}
+
+/*
+ * Translate a 32-bit ATM ioctl and run it through the matching compat
+ * wrapper.
+ *
+ * The SONET commands listed below go to do_atmif_sioc() under their
+ * original number.  Every other command must appear in atm_ioctl_map;
+ * its native number is then dispatched to do_atm_iobuf() (ATM_GETNAMES)
+ * or do_atmif_sioc() (all other mapped commands).  Returns -EINVAL for
+ * commands that are not recognised.
+ */
+static int do_atm_ioctl(struct socket *sock, unsigned int cmd32,
+			unsigned long arg)
+{
+	unsigned int cmd;
+	int i;
+
+	switch (cmd32) {
+	case SONET_GETSTAT:
+	case SONET_GETSTATZ:
+	case SONET_GETDIAG:
+	case SONET_SETDIAG:
+	case SONET_CLRDIAG:
+	case SONET_SETFRAMING:
+	case SONET_GETFRAMING:
+	case SONET_GETFRSENSE:
+		return do_atmif_sioc(sock, cmd32, arg);
+	}
+
+	/* Find the table entry for this 32-bit command, if there is one. */
+	for (i = 0; i < NR_ATM_IOCTL; i++)
+		if (atm_ioctl_map[i].cmd32 == cmd32)
+			break;
+	if (i == NR_ATM_IOCTL)
+		return -EINVAL;
+	cmd = atm_ioctl_map[i].cmd;
+
+	if (cmd == ATM_GETNAMES)
+		return do_atm_iobuf(sock, cmd, arg);
+
+	switch (cmd) {
+	case ATM_GETLINKRATE:
+	case ATM_GETTYPE:
+	case ATM_GETESI:
+	case ATM_GETADDR:
+	case ATM_RSTADDR:
+	case ATM_ADDADDR:
+	case ATM_DELADDR:
+	case ATM_GETCIRANGE:
+	case ATM_SETCIRANGE:
+	case ATM_SETESI:
+	case ATM_SETESIF:
+	case ATM_GETSTAT:
+	case ATM_GETSTATZ:
+	case ATM_GETLOOP:
+	case ATM_SETLOOP:
+	case ATM_QUERYLOOP:
+		return do_atmif_sioc(sock, cmd, arg);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Compat ioctl entry point: try the common handler in compat mode first
+ * and fall back to the 32-bit translation in do_atm_ioctl() only when the
+ * command was not recognised (-ENOIOCTLCMD).
+ */
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd,
+		     unsigned long arg)
+{
+	int ret;
+
+	ret = do_vcc_ioctl(sock, cmd, arg, 1);
+	if (ret != -ENOIOCTLCMD)
+		return ret;
+
+	return do_atm_ioctl(sock, cmd, arg);
+}
+#endif
diff --git a/net/atm/lec.c b/net/atm/lec.c
new file mode 100644
index 00000000..ba48daa6
--- /dev/null
+++ b/net/atm/lec.c
@@ -0,0 +1,2409 @@
+/*
+ * lec.c: Lan Emulation driver
+ *
+ * Marko Kiiskila <mkiiskila@yahoo.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+
+/* We are ethernet device */
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <asm/byteorder.h>
+#include <linux/uaccess.h>
+#include <net/arp.h>
+#include <net/dst.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+
+/* TokenRing if needed */
+#ifdef CONFIG_TR
+#include <linux/trdevice.h>
+#endif
+
+/* And atm device */
+#include <linux/atmdev.h>
+#include <linux/atmlec.h>
+
+/* Proxy LEC knows about bridging */
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+#include "../bridge/br_private.h"
+
+static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
+#endif
+
+/* Modular too */
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "lec.h"
+#include "lec_arpc.h"
+#include "resources.h"
+
+#define DUMP_PACKETS 0		/*
+				 * 0 = None,
+				 * 1 = 30 first bytes
+				 * 2 = Whole packet
+				 */
+
+#define LEC_UNRES_QUE_LEN 8	/*
+				 * number of tx packets to queue for a
+				 * single destination while waiting for SVC
+				 */
+
+static int lec_open(struct net_device *dev);
+static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
+				  struct net_device *dev);
+static int lec_close(struct net_device *dev);
+static struct lec_arp_table *lec_arp_find(struct lec_priv *priv,
+					  const unsigned char *mac_addr);
+static int lec_arp_remove(struct lec_priv *priv,
+			  struct lec_arp_table *to_remove);
+/* LANE2 functions */
+static void lane2_associate_ind(struct net_device *dev, const u8 *mac_address,
+				const u8 *tlvs, u32 sizeoftlvs);
+static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force,
+			 u8 **tlvs, u32 *sizeoftlvs);
+static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst,
+			       const u8 *tlvs, u32 sizeoftlvs);
+
+static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr,
+			   unsigned long permanent);
+static void lec_arp_check_empties(struct lec_priv *priv,
+				  struct atm_vcc *vcc, struct sk_buff *skb);
+static void lec_arp_destroy(struct lec_priv *priv);
+static void lec_arp_init(struct lec_priv *priv);
+static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
+				       const unsigned char *mac_to_find,
+				       int is_rdesc,
+				       struct lec_arp_table **ret_entry);
+static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
+			   const unsigned char *atm_addr,
+			   unsigned long remoteflag,
+			   unsigned int targetless_le_arp);
+static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id);
+static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc);
+static void lec_set_flush_tran_id(struct lec_priv *priv,
+				  const unsigned char *atm_addr,
+				  unsigned long tran_id);
+static void lec_vcc_added(struct lec_priv *priv,
+			  const struct atmlec_ioc *ioc_data,
+			  struct atm_vcc *vcc,
+			  void (*old_push)(struct atm_vcc *vcc,
+					   struct sk_buff *skb));
+static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc);
+
+/*
+ * Take an extra reference on an LE_ARP table entry by incrementing its
+ * usage counter.  The reference is released with lec_arp_put().
+ *
+ * must be done under lec_arp_lock
+ */
+static inline void lec_arp_hold(struct lec_arp_table *entry)
+{
+	atomic_inc(&entry->usage);
+}
+
+/*
+ * Drop one reference on an LE_ARP table entry.  The entry is kfree()d
+ * once its usage counter reaches zero.
+ */
+static inline void lec_arp_put(struct lec_arp_table *entry)
+{
+	if (!atomic_dec_and_test(&entry->usage))
+		return;
+
+	kfree(entry);
+}
+
+/*
+ * LANE2 operations table.  lec_atm_send() installs a pointer to it in
+ * priv->lane2_ops when the daemon configures a lane_version above 1.
+ * The third slot is left NULL here.
+ */
+static struct lane2_ops lane2_ops = {
+	lane2_resolve,		/* resolve,             spec 3.1.3 */
+	lane2_associate_req,	/* associate_req,       spec 3.1.4 */
+	NULL			/* associate indicator, spec 3.1.5 */
+};
+
+static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+/* Device structures */
+static struct net_device *dev_lec[MAX_LEC_ITF];
+
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+/*
+ * Inspect an outgoing frame for a bridge BPDU and, if one is found,
+ * queue an l_topology_change message for the LANE daemon.
+ *
+ * @skb: frame being transmitted; only read, never consumed here
+ * @dev: LEC net device; its private data supplies the daemon vcc
+ *
+ * The caller must have verified that priv->lecd is set.  Allocation
+ * failure is silent: the notification is simply not sent.
+ */
+static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
+{
+	char *buff;
+	struct lec_priv *priv;
+
+	/*
+	 * Check if this is a BPDU. If so, ask zeppelin to send
+	 * LE_TOPOLOGY_REQUEST with the same value of Topology Change bit
+	 * as the Config BPDU has
+	 */
+	buff = skb->data + skb->dev->hard_header_len;
+	if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) {
+		struct sock *sk;
+		struct sk_buff *skb2;
+		struct atmlec_msg *mesg;
+
+		skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+		if (skb2 == NULL)
+			return;
+		skb2->len = sizeof(struct atmlec_msg);
+		mesg = (struct atmlec_msg *)skb2->data;
+		/*
+		 * The whole message is handed to the daemon, so clear it
+		 * first; otherwise every field not set below would carry
+		 * stale heap contents (send_to_lecd() does the same).
+		 */
+		memset(mesg, 0, sizeof(struct atmlec_msg));
+		mesg->type = l_topology_change;
+		/* three bytes were consumed above; skip four more */
+		buff += 4;
+		mesg->content.normal.flag = *buff & 0x01;
+					/* 0x01 is topology change */
+
+		priv = netdev_priv(dev);
+		atm_force_charge(priv->lecd, skb2->truesize);
+		sk = sk_atm(priv->lecd);
+		skb_queue_tail(&sk->sk_receive_queue, skb2);
+		sk->sk_data_ready(sk, skb2->len);
+	}
+}
+#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+
+/*
+ * Modelled after tr_type_trans
+ * All multicast and ARE or STE frames go to BUS.
+ * Non source routed frames go by destination address.
+ * Last hop source routed frames go by destination address.
+ * Not last hop source routed frames go by _next_ route descriptor.
+ * Returns pointer to destination MAC address or fills in rdesc
+ * and returns NULL.
+ */
+#ifdef CONFIG_TR
+/*
+ * Choose the LE_ARP lookup key for a Token Ring frame.
+ *
+ * @packet: start of the Token Ring header (interpreted as struct trh_hdr)
+ * @rdesc:  ETH_ALEN byte buffer; written only on the NULL-return path
+ *
+ * Returns bus_mac, a pointer to the destination address inside @packet,
+ * or NULL after zeroing @rdesc and storing a route descriptor in
+ * rdesc[4] and rdesc[5].
+ */
+static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
+{
+	struct trh_hdr *trh;
+	unsigned int riflen, num_rdsc;
+
+	trh = (struct trh_hdr *)packet;
+	if (trh->daddr[0] & (uint8_t) 0x80)
+		return bus_mac;	/* multicast */
+
+	if (trh->saddr[0] & TR_RII) {
+		/* length field of the routing control word */
+		riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8;
+		/* any of the top three rcf bits set */
+		if ((ntohs(trh->rcf) >> 13) != 0)
+			return bus_mac;	/* ARE or STE */
+	} else
+		return trh->daddr;	/* not source routed */
+
+	if (riflen < 6)
+		return trh->daddr;	/* last hop, source routed */
+
+	/* riflen is 6 or more, packet has more than one route descriptor */
+	num_rdsc = (riflen / 2) - 1;
+	memset(rdesc, 0, ETH_ALEN);
+	/* offset 4 comes from LAN destination field in LE control frames */
+	if (trh->rcf & htons((uint16_t) TR_RCF_DIR_BIT))
+		/* direction bit set: take the descriptor at rseg[num_rdsc - 2] */
+		memcpy(&rdesc[4], &trh->rseg[num_rdsc - 2], sizeof(__be16));
+	else {
+		/*
+		 * direction bit clear: start from rseg[1], then replace the
+		 * low nibble of rdesc[5] with the low nibble of rseg[0]
+		 */
+		memcpy(&rdesc[4], &trh->rseg[1], sizeof(__be16));
+		rdesc[5] = ((ntohs(trh->rseg[0]) & 0x000f) | (rdesc[5] & 0xf0));
+	}
+
+	return NULL;
+}
+#endif /* CONFIG_TR */
+
+/*
+ * Open/initialize the netdevice. This is called (in the current kernel)
+ * sometime after booting when the 'ifconfig' program is run.
+ *
+ * This routine should set everything up anew at each open, even
+ * registers that "should" only need to be set once at boot, so that
+ * there is a non-reboot way to recover if something goes wrong.
+ *
+ * For LEC the only work is to start the transmit queue; always
+ * returns 0.
+ */
+
+static int lec_open(struct net_device *dev)
+{
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+/*
+ * Hand one frame to the ATM layer on @vcc and update the device
+ * transmit statistics.
+ *
+ * @vcc: virtual circuit to transmit on
+ * @skb: frame to send; ownership passes to vcc->send()
+ *
+ * Everything needed from @skb (owning device, length) is captured
+ * before calling vcc->send(), because the driver may free the buffer
+ * before that call returns.
+ */
+static void
+lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int len = skb->len;
+
+	ATM_SKB(skb)->vcc = vcc;
+	ATM_SKB(skb)->atm_options = vcc->atm_options;
+
+	atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+	if (vcc->send(vcc, skb) < 0) {
+		dev->stats.tx_dropped++;
+		return;
+	}
+
+	/* skb must not be touched any more: use the saved length */
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += len;
+}
+
+/*
+ * Transmit watchdog handler: log the device name, refresh trans_start
+ * and wake the transmit queue.
+ */
+static void lec_tx_timeout(struct net_device *dev)
+{
+	pr_info("%s\n", dev->name);
+	dev->trans_start = jiffies;
+	netif_wake_queue(dev);
+}
+
+/*
+ * Transmit entry point of the LEC net device.
+ *
+ * Prepends the two byte LEC id, pads the frame to the LANE minimum
+ * size, resolves the destination to a vcc through the LE_ARP cache and
+ * either sends the frame, queues it on the ARP entry while the circuit
+ * is not ready (at most LEC_UNRES_QUE_LEN frames), or drops it.
+ *
+ * @skb: frame to send; always consumed
+ * @dev: LEC net device
+ *
+ * Always returns NETDEV_TX_OK.
+ */
+static netdev_tx_t lec_start_xmit(struct sk_buff *skb,
+				  struct net_device *dev)
+{
+	struct sk_buff *skb2;
+	struct lec_priv *priv = netdev_priv(dev);
+	struct lecdatahdr_8023 *lec_h;
+	struct atm_vcc *vcc;
+	struct lec_arp_table *entry;
+	unsigned char *dst;
+	int min_frame_size;
+#ifdef CONFIG_TR
+	unsigned char rdesc[ETH_ALEN];	/* Token Ring route descriptor */
+#endif
+	int is_rdesc;
+
+	pr_debug("called\n");
+	if (!priv->lecd) {
+		pr_info("%s:No lecd attached\n", dev->name);
+		dev->stats.tx_errors++;
+		netif_stop_queue(dev);
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n",
+		 (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb),
+		 (long)skb_end_pointer(skb));
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+	if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0)
+		lec_handle_bridge(skb, dev);
+#endif
+
+	/* Make sure we have room for lec_id */
+	if (skb_headroom(skb) < 2) {
+		pr_debug("reallocating skb\n");
+		skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN);
+		kfree_skb(skb);
+		if (skb2 == NULL)
+			return NETDEV_TX_OK;
+		skb = skb2;
+	}
+	skb_push(skb, 2);
+
+	/* Put le header to place, works for TokenRing too */
+	lec_h = (struct lecdatahdr_8023 *)skb->data;
+	lec_h->le_header = htons(priv->lecid);
+
+#ifdef CONFIG_TR
+	/*
+	 * Ugly. Use this to realign Token Ring packets for
+	 * e.g. PCA-200E driver.
+	 */
+	if (priv->is_trdev) {
+		skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN);
+		kfree_skb(skb);
+		if (skb2 == NULL)
+			return NETDEV_TX_OK;
+		skb = skb2;
+	}
+#endif
+
+#if DUMP_PACKETS >= 2
+#define MAX_DUMP_SKB 99
+#elif DUMP_PACKETS >= 1
+#define MAX_DUMP_SKB 30
+#endif
+#if DUMP_PACKETS >= 1
+	printk(KERN_DEBUG "%s: send datalen:%ld lecid:%4.4x\n",
+	       dev->name, skb->len, priv->lecid);
+	print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1,
+		       skb->data, min(skb->len, MAX_DUMP_SKB), true);
+#endif /* DUMP_PACKETS >= 1 */
+
+	/* Minimum ethernet-frame size */
+#ifdef CONFIG_TR
+	if (priv->is_trdev)
+		min_frame_size = LEC_MINIMUM_8025_SIZE;
+	else
+#endif
+		min_frame_size = LEC_MINIMUM_8023_SIZE;
+	if (skb->len < min_frame_size) {
+		unsigned int pad = min_frame_size - skb->len;
+
+		if ((skb->len + skb_tailroom(skb)) < min_frame_size) {
+			/*
+			 * The extra tailroom needed is the number of pad
+			 * bytes.  It must be derived from skb->len, not
+			 * skb->truesize: truesize also counts allocation
+			 * overhead, which made the request negative.
+			 */
+			skb2 = skb_copy_expand(skb, 0, pad, GFP_ATOMIC);
+			dev_kfree_skb(skb);
+			if (skb2 == NULL) {
+				dev->stats.tx_dropped++;
+				return NETDEV_TX_OK;
+			}
+			skb = skb2;
+		}
+		/* zero the padding so stale memory never reaches the wire */
+		memset(skb_put(skb, pad), 0, pad);
+	}
+
+	/*
+	 * skb may have been replaced by one of the reallocations above, in
+	 * which case the buffer lec_h pointed into has been freed.  Point
+	 * it at the live buffer before reading the header again.
+	 */
+	lec_h = (struct lecdatahdr_8023 *)skb->data;
+
+	/* Send to right vcc */
+	is_rdesc = 0;
+	dst = lec_h->h_dest;
+#ifdef CONFIG_TR
+	if (priv->is_trdev) {
+		dst = get_tr_dst(skb->data + 2, rdesc);
+		if (dst == NULL) {
+			dst = rdesc;
+			is_rdesc = 1;
+		}
+	}
+#endif
+	entry = NULL;
+	vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry);
+	pr_debug("%s:vcc:%p vcc_flags:%lx, entry:%p\n",
+		 dev->name, vcc, vcc ? vcc->flags : 0, entry);
+	if (!vcc || !test_bit(ATM_VF_READY, &vcc->flags)) {
+		if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) {
+			pr_debug("%s:queuing packet, MAC address %pM\n",
+				 dev->name, lec_h->h_dest);
+			skb_queue_tail(&entry->tx_wait, skb);
+		} else {
+			pr_debug("%s:tx queue full or no arp entry, dropping, MAC address: %pM\n",
+				 dev->name, lec_h->h_dest);
+			dev->stats.tx_dropped++;
+			dev_kfree_skb(skb);
+		}
+		goto out;
+	}
+#if DUMP_PACKETS > 0
+	printk(KERN_DEBUG "%s:sending to vpi:%d vci:%d\n",
+	       dev->name, vcc->vpi, vcc->vci);
+#endif /* DUMP_PACKETS > 0 */
+
+	/* flush frames that were parked on the entry before this one */
+	while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) {
+		pr_debug("emptying tx queue, MAC address %pM\n", lec_h->h_dest);
+		lec_send(vcc, skb2);
+	}
+
+	lec_send(vcc, skb);
+
+	if (!atm_may_send(vcc, 0)) {
+		struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+
+		vpriv->xoff = 1;
+		netif_stop_queue(dev);
+
+		/*
+		 * vcc->pop() might have occurred in between, making
+		 * the vcc usable again.  Since xmit is serialized,
+		 * this is the only situation we have to re-test.
+		 */
+
+		if (atm_may_send(vcc, 0))
+			netif_wake_queue(dev);
+	}
+
+out:
+	/* drop the reference lec_arp_resolve() handed back with the entry */
+	if (entry)
+		lec_arp_put(entry);
+	dev->trans_start = jiffies;
+	return NETDEV_TX_OK;
+}
+
+/*
+ * The inverse routine to lec_open(): stop the transmit queue.
+ * Always returns 0.
+ */
+static int lec_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	return 0;
+}
+
+/*
+ * "send" operation of the dummy lec ATM device: handles a control
+ * message written by the LANE daemon (zeppelin) on its vcc.
+ *
+ * @vcc: daemon vcc; vcc->proto_data holds the LEC net device
+ * @skb: buffer whose data begins with a struct atmlec_msg, optionally
+ *       followed by TLVs; always freed before returning
+ *
+ * Returns 0 when the message type was recognised, -EINVAL otherwise.
+ *
+ * NOTE(review): skb->len is never compared against
+ * sizeof(struct atmlec_msg) or against mesg->sizeoftlvs before the
+ * message and the trailing TLV area are read - confirm the daemon is
+ * trusted to supply well-formed messages.
+ */
+static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	unsigned long flags;
+	struct net_device *dev = (struct net_device *)vcc->proto_data;
+	struct lec_priv *priv = netdev_priv(dev);
+	struct atmlec_msg *mesg;
+	struct lec_arp_table *entry;
+	int i;
+	char *tmp;		/* FIXME */
+
+	/* undo the write-memory accounting charged for this buffer */
+	atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+	mesg = (struct atmlec_msg *)skb->data;
+	/* tmp points just past the fixed message, where TLVs would start */
+	tmp = skb->data;
+	tmp += sizeof(struct atmlec_msg);
+	pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type);
+	switch (mesg->type) {
+	case l_set_mac_addr:
+		for (i = 0; i < 6; i++)
+			dev->dev_addr[i] = mesg->content.normal.mac_addr[i];
+		break;
+	case l_del_mac_addr:
+		for (i = 0; i < 6; i++)
+			dev->dev_addr[i] = 0;
+		break;
+	case l_addr_delete:
+		lec_addr_delete(priv, mesg->content.normal.atm_addr,
+				mesg->content.normal.flag);
+		break;
+	case l_topology_change:
+		priv->topology_change = mesg->content.normal.flag;
+		break;
+	case l_flush_complete:
+		lec_flush_complete(priv, mesg->content.normal.flag);
+		break;
+	case l_narp_req:	/* LANE2: see 7.1.35 in the lane2 spec */
+		/* remove any existing entry for this MAC before updating */
+		spin_lock_irqsave(&priv->lec_arp_lock, flags);
+		entry = lec_arp_find(priv, mesg->content.normal.mac_addr);
+		lec_arp_remove(priv, entry);
+		spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+		if (mesg->content.normal.no_source_le_narp)
+			break;
+		/* FALL THROUGH */
+	case l_arp_update:
+		lec_arp_update(priv, mesg->content.normal.mac_addr,
+			       mesg->content.normal.atm_addr,
+			       mesg->content.normal.flag,
+			       mesg->content.normal.targetless_le_arp);
+		pr_debug("in l_arp_update\n");
+		if (mesg->sizeoftlvs != 0) {	/* LANE2 3.1.5 */
+			pr_debug("LANE2 3.1.5, got tlvs, size %d\n",
+				 mesg->sizeoftlvs);
+			lane2_associate_ind(dev, mesg->content.normal.mac_addr,
+					    tmp, mesg->sizeoftlvs);
+		}
+		break;
+	case l_config:
+		/* time values from the daemon are multiplied by HZ */
+		priv->maximum_unknown_frame_count =
+		    mesg->content.config.maximum_unknown_frame_count;
+		priv->max_unknown_frame_time =
+		    (mesg->content.config.max_unknown_frame_time * HZ);
+		priv->max_retry_count = mesg->content.config.max_retry_count;
+		priv->aging_time = (mesg->content.config.aging_time * HZ);
+		priv->forward_delay_time =
+		    (mesg->content.config.forward_delay_time * HZ);
+		priv->arp_response_time =
+		    (mesg->content.config.arp_response_time * HZ);
+		priv->flush_timeout = (mesg->content.config.flush_timeout * HZ);
+		priv->path_switching_delay =
+		    (mesg->content.config.path_switching_delay * HZ);
+		priv->lane_version = mesg->content.config.lane_version;
+					/* LANE2 */
+		/* the LANE2 ops table is only exposed for versions above 1 */
+		priv->lane2_ops = NULL;
+		if (priv->lane_version > 1)
+			priv->lane2_ops = &lane2_ops;
+		if (dev_set_mtu(dev, mesg->content.config.mtu))
+			pr_info("%s: change_mtu to %d failed\n",
+				dev->name, mesg->content.config.mtu);
+		priv->is_proxy = mesg->content.config.is_proxy;
+		break;
+	case l_flush_tran_id:
+		lec_set_flush_tran_id(priv, mesg->content.normal.atm_addr,
+				      mesg->content.normal.flag);
+		break;
+	case l_set_lecid:
+		/* only the low 16 bits of flag carry the LEC id */
+		priv->lecid =
+		    (unsigned short)(0xffff & mesg->content.normal.flag);
+		break;
+	case l_should_bridge:
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+	{
+		pr_debug("%s: bridge zeppelin asks about %pM\n",
+			 dev->name, mesg->content.proxy.mac_addr);
+
+		if (br_fdb_test_addr_hook == NULL)
+			break;
+
+		if (br_fdb_test_addr_hook(dev, mesg->content.proxy.mac_addr)) {
+			/* hit from bridge table, send LE_ARP_RESPONSE */
+			struct sk_buff *skb2;
+			struct sock *sk;
+
+			pr_debug("%s: entry found, responding to zeppelin\n",
+				 dev->name);
+			skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+			if (skb2 == NULL)
+				break;
+			skb2->len = sizeof(struct atmlec_msg);
+			/* the reply is a copy of the request message */
+			skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg));
+			atm_force_charge(priv->lecd, skb2->truesize);
+			sk = sk_atm(priv->lecd);
+			skb_queue_tail(&sk->sk_receive_queue, skb2);
+			sk->sk_data_ready(sk, skb2->len);
+		}
+	}
+#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */
+		break;
+	default:
+		pr_info("%s: Unknown message type %d\n", dev->name, mesg->type);
+		dev_kfree_skb(skb);
+		return -EINVAL;
+	}
+	dev_kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * "close" operation of the dummy lec ATM device, run when the daemon
+ * vcc goes away: detach the daemon, stop the queue, destroy the LE_ARP
+ * state, discard whatever is still queued for the daemon and drop the
+ * module reference.
+ */
+static void lec_atm_close(struct atm_vcc *vcc)
+{
+	struct net_device *dev = (struct net_device *)vcc->proto_data;
+	struct lec_priv *priv = netdev_priv(dev);
+	struct sk_buff_head *rxq = &sk_atm(vcc)->sk_receive_queue;
+	struct sk_buff *pending;
+
+	priv->lecd = NULL;
+	/* Do something needful? */
+
+	netif_stop_queue(dev);
+	lec_arp_destroy(priv);
+
+	if (skb_peek(rxq))
+		pr_info("%s closing with messages pending\n", dev->name);
+
+	/* return the accounting for, and free, every unread message */
+	for (pending = skb_dequeue(rxq); pending; pending = skb_dequeue(rxq)) {
+		atm_return(vcc, pending->truesize);
+		dev_kfree_skb(pending);
+	}
+
+	pr_info("%s: Shut down!\n", dev->name);
+	module_put(THIS_MODULE);
+}
+
+/* Operations of the dummy ATM device the daemon vcc is bound to. */
+static struct atmdev_ops lecdev_ops = {
+	.close = lec_atm_close,
+	.send = lec_atm_send
+};
+
+/*
+ * Dummy ATM device; lecd_attach() points the daemon vcc's ->dev at it
+ * so that data sent on that vcc reaches lec_atm_send().
+ */
+static struct atm_dev lecatm_dev = {
+	.ops = &lecdev_ops,
+	.type = "lec",
+	.number = 999,		/* dummy device number */
+	.lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock)
+};
+
+/*
+ * Queue a control message for the LANE daemon.
+ *
+ * @priv:     LEC private data; the message goes to priv->lecd
+ * @type:     message type
+ * @mac_addr: optional MAC address; when NULL the message is flagged as
+ *            targetless_le_arp instead
+ * @atm_addr: optional ATM address
+ * @data:     optional buffer with LE_ARP based TLVs (LANE2), queued
+ *            right behind the message; ownership passes to the queue
+ *
+ * Returns 0 on success, -1 when no daemon is attached or the message
+ * buffer cannot be allocated (in which case @data is not consumed).
+ */
+static int
+send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
+	     const unsigned char *mac_addr, const unsigned char *atm_addr,
+	     struct sk_buff *data)
+{
+	struct sock *sk;
+	struct sk_buff *skb;
+	struct atmlec_msg *mesg;
+	unsigned int len;
+
+	if (!priv || !priv->lecd)
+		return -1;
+	skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC);
+	if (!skb)
+		return -1;
+	skb->len = sizeof(struct atmlec_msg);
+	mesg = (struct atmlec_msg *)skb->data;
+	memset(mesg, 0, sizeof(struct atmlec_msg));
+	mesg->type = type;
+	if (data != NULL)
+		mesg->sizeoftlvs = data->len;
+	if (mac_addr)
+		memcpy(&mesg->content.normal.mac_addr, mac_addr, ETH_ALEN);
+	else
+		mesg->content.normal.targetless_le_arp = 1;
+	if (atm_addr)
+		memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN);
+
+	atm_force_charge(priv->lecd, skb->truesize);
+	sk = sk_atm(priv->lecd);
+	/*
+	 * Once a buffer is on the receive queue the reader may dequeue and
+	 * free it at any time, so take its length beforehand.
+	 */
+	len = skb->len;
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, len);
+
+	if (data != NULL) {
+		pr_debug("about to send %d bytes of data\n", data->len);
+		atm_force_charge(priv->lecd, data->truesize);
+		/* report the TLV buffer's own length, not the message's */
+		len = data->len;
+		skb_queue_tail(&sk->sk_receive_queue, data);
+		sk->sk_data_ready(sk, len);
+	}
+
+	return 0;
+}
+
+/*
+ * shamelessly stolen from drivers/net/net_init.c
+ *
+ * Accept an MTU in the inclusive range 68..18190 and store it in the
+ * device; anything else is rejected with -EINVAL.
+ */
+static int lec_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 18190) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Multicast list callback; intentionally does nothing (see below). */
+static void lec_set_multicast_list(struct net_device *dev)
+{
+	/*
+	 * by default, all multicast frames arrive over the bus.
+	 * eventually support selective multicast service
+	 */
+}
+
+/* Net device callbacks; installed on each device by lecd_attach(). */
+static const struct net_device_ops lec_netdev_ops = {
+	.ndo_open		= lec_open,
+	.ndo_stop		= lec_close,
+	.ndo_start_xmit		= lec_start_xmit,
+	.ndo_change_mtu		= lec_change_mtu,
+	.ndo_tx_timeout		= lec_tx_timeout,
+	.ndo_set_multicast_list	= lec_set_multicast_list,
+};
+
+/*
+ * First four bytes of a control frame; lec_push() queues any received
+ * frame starting with this pattern to the daemon socket.
+ */
+static const unsigned char lec_ctrl_magic[] = { 0xff, 0x00, 0x01, 0x01 };
+
+#define LEC_DATA_DIRECT_8023  2
+#define LEC_DATA_DIRECT_8025  3
+
+/*
+ * Tell whether @vcc is a Data Direct circuit, judged by byte 4 of the
+ * SNAP field in its first BLLI.  Returns 1 if so, 0 otherwise.
+ */
+static int lec_is_data_direct(struct atm_vcc *vcc)
+{
+	switch (vcc->sap.blli[0].l3.tr9577.snap[4]) {
+	case LEC_DATA_DIRECT_8023:
+	case LEC_DATA_DIRECT_8025:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Receive handler installed on LEC data vccs by lec_vcc_attach().
+ *
+ * @vcc: circuit the frame arrived on; vcc->proto_data is the net device
+ * @skb: received frame, or NULL, which is treated as a request to close
+ *       the vcc
+ *
+ * Control frames (recognised by lec_ctrl_magic) are queued to the vcc's
+ * socket for the daemon.  Data frames are filtered, stripped of the two
+ * byte LEC id and passed to the network stack with netif_rx().
+ *
+ * NOTE(review): the frame header is read without first checking
+ * skb->len - confirm the lower layer guarantees a minimum length.
+ */
+static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	unsigned long flags;
+	struct net_device *dev = (struct net_device *)vcc->proto_data;
+	struct lec_priv *priv = netdev_priv(dev);
+
+#if DUMP_PACKETS > 0
+	printk(KERN_DEBUG "%s: vcc vpi:%d vci:%d\n",
+	       dev->name, vcc->vpi, vcc->vci);
+#endif
+	if (!skb) {
+		pr_debug("%s: null skb\n", dev->name);
+		lec_vcc_close(priv, vcc);
+		return;
+	}
+#if DUMP_PACKETS >= 2
+#define MAX_SKB_DUMP 99
+#elif DUMP_PACKETS >= 1
+#define MAX_SKB_DUMP 30
+#endif
+#if DUMP_PACKETS > 0
+	printk(KERN_DEBUG "%s: rcv datalen:%ld lecid:%4.4x\n",
+	       dev->name, skb->len, priv->lecid);
+	print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1,
+		       skb->data, min(MAX_SKB_DUMP, skb->len), true);
+#endif /* DUMP_PACKETS > 0 */
+	if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) {
+				/* Control frame, to daemon */
+		struct sock *sk = sk_atm(vcc);
+
+		pr_debug("%s: To daemon\n", dev->name);
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		sk->sk_data_ready(sk, skb->len);
+	} else {		/* Data frame, queue to protocol handlers */
+		struct lec_arp_table *entry;
+		unsigned char *src, *dst;
+
+		/* data frames do not stay on the socket: return the charge */
+		atm_return(vcc, skb->truesize);
+		/* the first two bytes of the frame carry the sender's LEC id */
+		if (*(__be16 *) skb->data == htons(priv->lecid) ||
+		    !priv->lecd || !(dev->flags & IFF_UP)) {
+			/*
+			 * Probably looping back, or if lecd is missing,
+			 * lecd has gone down
+			 */
+			pr_debug("Ignoring frame...\n");
+			dev_kfree_skb(skb);
+			return;
+		}
+#ifdef CONFIG_TR
+		if (priv->is_trdev)
+			dst = ((struct lecdatahdr_8025 *)skb->data)->h_dest;
+		else
+#endif
+			dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest;
+
+		/*
+		 * If this is a Data Direct VCC, and the VCC does not match
+		 * the LE_ARP cache entry, delete the LE_ARP cache entry.
+		 */
+		spin_lock_irqsave(&priv->lec_arp_lock, flags);
+		if (lec_is_data_direct(vcc)) {
+#ifdef CONFIG_TR
+			if (priv->is_trdev)
+				src =
+				    ((struct lecdatahdr_8025 *)skb->data)->
+				    h_source;
+			else
+#endif
+				src =
+				    ((struct lecdatahdr_8023 *)skb->data)->
+				    h_source;
+			entry = lec_arp_find(priv, src);
+			if (entry && entry->vcc != vcc) {
+				lec_arp_remove(priv, entry);
+				lec_arp_put(entry);
+			}
+		}
+		spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+		/* drop unicast frames that are not addressed to this device */
+		if (!(dst[0] & 0x01) &&	/* Never filter Multi/Broadcast */
+		    !priv->is_proxy &&	/* Proxy wants all the packets */
+		    memcmp(dst, dev->dev_addr, dev->addr_len)) {
+			dev_kfree_skb(skb);
+			return;
+		}
+		if (!hlist_empty(&priv->lec_arp_empty_ones))
+			lec_arp_check_empties(priv, vcc, skb);
+		skb_pull(skb, 2);	/* skip lec_id */
+#ifdef CONFIG_TR
+		if (priv->is_trdev)
+			skb->protocol = tr_type_trans(skb, dev);
+		else
+#endif
+			skb->protocol = eth_type_trans(skb, dev);
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += skb->len;
+		/* clear the ATM per-skb data before handing the frame up */
+		memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+		netif_rx(skb);
+	}
+}
+
+/*
+ * Transmit-done callback installed on LEC data vccs.
+ *
+ * Chains to the pop routine the vcc had before LEC took over and, if
+ * transmission had been throttled (xoff) and the vcc can accept data
+ * again, wakes the device queue.
+ */
+static void lec_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	/* fetch the device now: skb is handed to old_pop() below */
+	struct net_device *dev = skb->dev;
+	struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+
+	if (!vpriv) {
+		pr_info("vpriv = NULL!?!?!?\n");
+		return;
+	}
+
+	vpriv->old_pop(vcc, skb);
+
+	if (!vpriv->xoff || !atm_may_send(vcc, 0))
+		return;
+
+	vpriv->xoff = 0;
+	if (!netif_running(dev))
+		return;
+	if (netif_queue_stopped(dev))
+		netif_wake_queue(dev);
+}
+
+/*
+ * ATMLEC_DATA: attach a data vcc to an existing LEC interface.
+ *
+ * @vcc: vcc to attach
+ * @arg: user pointer to a struct atmlec_ioc naming the interface
+ *
+ * Installs lec_pop()/lec_push() on the vcc, saving the previous pop
+ * handler in a freshly allocated lec_vcc_priv.
+ *
+ * Returns 0 on success, -EFAULT if the ioctl data cannot be read,
+ * -EINVAL for an unknown interface and -ENOMEM on allocation failure.
+ */
+static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
+{
+	struct lec_vcc_priv *vpriv;
+	struct atmlec_ioc ioc_data;
+
+	/* Lecd must be up in this case */
+	/*
+	 * A short copy leaves part of ioc_data as uninitialised stack
+	 * contents, so fail instead of carrying on with it.
+	 */
+	if (copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)))
+		return -EFAULT;
+	if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF ||
+	    !dev_lec[ioc_data.dev_num])
+		return -EINVAL;
+	vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
+	if (!vpriv)
+		return -ENOMEM;
+	vpriv->xoff = 0;
+	vpriv->old_pop = vcc->pop;
+	vcc->user_back = vpriv;
+	vcc->pop = lec_pop;
+	/* pass the vcc's current push handler before it is replaced */
+	lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]),
+		      &ioc_data, vcc, vcc->push);
+	vcc->proto_data = dev_lec[ioc_data.dev_num];
+	vcc->push = lec_push;
+	return 0;
+}
+
+/*
+ * ATMLEC_MCAST: bind @vcc to LEC interface number @arg and hand it to
+ * lec_mcast_make().  Returns -EINVAL when @arg does not name an
+ * existing interface, otherwise the result of lec_mcast_make().
+ */
+static int lec_mcast_attach(struct atm_vcc *vcc, int arg)
+{
+	struct net_device *dev;
+
+	if (arg < 0 || arg >= MAX_LEC_ITF)
+		return -EINVAL;
+
+	dev = dev_lec[arg];
+	if (!dev)
+		return -EINVAL;
+
+	vcc->proto_data = dev;
+	return lec_mcast_make(netdev_priv(dev), vcc);
+}
+
+/*
+ * Initialize device.
+ *
+ * ATMLEC_CTRL: attach the LANE daemon's control vcc to interface @arg
+ * (a negative @arg selects interface 0), creating and registering the
+ * net device on first use, and load default LANE timing parameters.
+ *
+ * Returns the interface number on success, -EINVAL for an out-of-range
+ * interface or a failed registration, -ENOMEM if the device cannot be
+ * allocated, -EADDRINUSE if a daemon is already attached.
+ */
+static int lecd_attach(struct atm_vcc *vcc, int arg)
+{
+	int i;
+	struct lec_priv *priv;
+
+	if (arg < 0)
+		i = 0;
+	else
+		i = arg;
+#ifdef CONFIG_TR
+	if (arg >= MAX_LEC_ITF)
+		return -EINVAL;
+#else				/* Reserve the top NUM_TR_DEVS for TR */
+	if (arg >= (MAX_LEC_ITF - NUM_TR_DEVS))
+		return -EINVAL;
+#endif
+	if (!dev_lec[i]) {
+		int is_trdev, size;
+
+		is_trdev = 0;
+		if (i >= (MAX_LEC_ITF - NUM_TR_DEVS))
+			is_trdev = 1;
+
+		size = sizeof(struct lec_priv);
+#ifdef CONFIG_TR
+		if (is_trdev)
+			dev_lec[i] = alloc_trdev(size);
+		else
+#endif
+			dev_lec[i] = alloc_etherdev(size);
+		if (!dev_lec[i])
+			return -ENOMEM;
+		dev_lec[i]->netdev_ops = &lec_netdev_ops;
+		snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
+		if (register_netdev(dev_lec[i])) {
+			free_netdev(dev_lec[i]);
+			/*
+			 * Clear the slot: a stale pointer here would make
+			 * the next attach (and every other dev_lec[] user)
+			 * operate on freed memory.
+			 */
+			dev_lec[i] = NULL;
+			return -EINVAL;
+		}
+
+		priv = netdev_priv(dev_lec[i]);
+		priv->is_trdev = is_trdev;
+	} else {
+		priv = netdev_priv(dev_lec[i]);
+		if (priv->lecd)
+			return -EADDRINUSE;
+	}
+	lec_arp_init(priv);
+	priv->itfnum = i;	/* LANE2 addition */
+	priv->lecd = vcc;
+	vcc->dev = &lecatm_dev;
+	vcc_insert_socket(sk_atm(vcc));
+
+	vcc->proto_data = dev_lec[i];
+	set_bit(ATM_VF_META, &vcc->flags);
+	set_bit(ATM_VF_READY, &vcc->flags);
+
+	/* Set default values to these variables */
+	priv->maximum_unknown_frame_count = 1;
+	priv->max_unknown_frame_time = (1 * HZ);
+	priv->vcc_timeout_period = (1200 * HZ);
+	priv->max_retry_count = 1;
+	priv->aging_time = (300 * HZ);
+	priv->forward_delay_time = (15 * HZ);
+	priv->topology_change = 0;
+	priv->arp_response_time = (1 * HZ);
+	priv->flush_timeout = (4 * HZ);
+	priv->path_switching_delay = (6 * HZ);
+
+	if (dev_lec[i]->flags & IFF_UP)
+		netif_start_queue(dev_lec[i]);
+	/* released again in lec_atm_close() */
+	__module_get(THIS_MODULE);
+	return i;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Map an LE_ARP entry status to a fixed-width label for /proc output.
+ * Values above ESI_FORWARD_DIRECT are shown as "<Undefined>".
+ */
+static const char *lec_arp_get_status_string(unsigned char status)
+{
+	static const char *const lec_arp_status_string[] = {
+		"ESI_UNKNOWN       ",
+		"ESI_ARP_PENDING   ",
+		"ESI_VC_PENDING    ",
+		"<Undefined>       ",
+		"ESI_FLUSH_PENDING ",
+		"ESI_FORWARD_DIRECT"
+	};
+	/* slot 3 is the ESI_UNDEFINED placeholder */
+	unsigned char slot = (status > ESI_FORWARD_DIRECT) ? 3 : status;
+
+	return lec_arp_status_string[slot];
+}
+
+/*
+ * Emit one line describing an LE_ARP entry: MAC address, ATM address,
+ * status label, flags, and the VPI/VCI of the send and receive vccs
+ * where present.
+ */
+static void lec_info(struct seq_file *seq, struct lec_arp_table *entry)
+{
+	int idx;
+
+	idx = 0;
+	while (idx < ETH_ALEN) {
+		seq_printf(seq, "%2.2x", entry->mac_addr[idx] & 0xff);
+		idx++;
+	}
+	seq_puts(seq, " ");
+
+	idx = 0;
+	while (idx < ATM_ESA_LEN) {
+		seq_printf(seq, "%2.2x", entry->atm_addr[idx] & 0xff);
+		idx++;
+	}
+
+	seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status),
+		   entry->flags & 0xffff);
+
+	/* keep the column width when there is no send vcc */
+	if (!entry->vcc)
+		seq_puts(seq, "        ");
+	else
+		seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci);
+
+	if (entry->recv_vcc)
+		seq_printf(seq, "     %3d %3d", entry->recv_vcc->vpi,
+			   entry->recv_vcc->vci);
+
+	seq_putc(seq, '\n');
+}
+
+/* Iterator state for the /proc "lec" seq_file, kept in seq->private. */
+struct lec_state {
+	unsigned long flags;		/* IRQ flags saved when lec_arp_lock was taken */
+	struct lec_priv *locked;	/* priv whose lec_arp_lock is held, or NULL */
+	struct hlist_node *node;	/* current list position, or SEQ_START_TOKEN */
+	struct net_device *dev;		/* device currently being walked, or NULL */
+	int itf;			/* index into dev_lec[] */
+	int arp_table;			/* index into priv->lec_arp_tables[] */
+	int misc_table;			/* index into the list built by lec_misc_walk() */
+};
+
+/*
+ * Advance the iterator within one hash list.
+ *
+ * @state: iterator state; state->node is the resume position
+ * @tbl:   list to walk
+ * @l:     number of entries still to step over; decremented in place
+ *
+ * Returns @state if the counter dropped below zero inside this list
+ * (state->node then points at the selected entry), or NULL when the
+ * list ran out first (state->node is then NULL).
+ */
+static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
+			  loff_t *l)
+{
+	struct hlist_node *e = state->node;
+	struct lec_arp_table *tmp;
+
+	/* no saved position: start at the head of this list */
+	if (!e)
+		e = tbl->first;
+	/* first call after lec_seq_start(): the token itself uses one step */
+	if (e == SEQ_START_TOKEN) {
+		e = tbl->first;
+		--*l;
+	}
+
+	hlist_for_each_entry_from(tmp, e, next) {
+		if (--*l < 0)
+			break;
+	}
+	state->node = e;
+
+	return (*l < 0) ? state : NULL;
+}
+
+/*
+ * Continue the walk through priv->lec_arp_tables[], resuming at bucket
+ * state->arp_table.  The bucket index reached is stored back in @state.
+ * Returns the first non-NULL lec_tbl_walk() result, or NULL once every
+ * bucket is exhausted.
+ */
+static void *lec_arp_walk(struct lec_state *state, loff_t *l,
+			  struct lec_priv *priv)
+{
+	void *found = NULL;
+	int bucket = state->arp_table;
+
+	while (bucket < LEC_ARP_TABLE_SIZE) {
+		found = lec_tbl_walk(state, &priv->lec_arp_tables[bucket], l);
+		if (found)
+			break;
+		bucket++;
+	}
+
+	state->arp_table = bucket;
+	return found;
+}
+
+/*
+ * Continue the walk through the three auxiliary lists (empty ones,
+ * no-forward, multicast forwards), resuming at state->misc_table.  The
+ * list index reached is stored back in @state.  Returns the first
+ * non-NULL lec_tbl_walk() result, or NULL when all lists are done.
+ */
+static void *lec_misc_walk(struct lec_state *state, loff_t *l,
+			   struct lec_priv *priv)
+{
+	struct hlist_head *lec_misc_tables[] = {
+		&priv->lec_arp_empty_ones,
+		&priv->lec_no_forward,
+		&priv->mcast_fwds
+	};
+	void *found = NULL;
+	int which = state->misc_table;
+
+	while (which < ARRAY_SIZE(lec_misc_tables)) {
+		found = lec_tbl_walk(state, lec_misc_tables[which], l);
+		if (found)
+			break;
+		which++;
+	}
+
+	state->misc_table = which;
+	return found;
+}
+
+/*
+ * Walk all tables of one LEC instance.  lec_arp_lock is taken on first
+ * entry and stays held while an entry is selected; it is released here
+ * only once both the ARP tables and the auxiliary lists are exhausted.
+ * Returns @priv while positioned on an entry, NULL otherwise.
+ */
+static void *lec_priv_walk(struct lec_state *state, loff_t *l,
+			   struct lec_priv *priv)
+{
+	if (!state->locked) {
+		state->locked = priv;
+		spin_lock_irqsave(&priv->lec_arp_lock, state->flags);
+	}
+
+	if (lec_arp_walk(state, l, priv) || lec_misc_walk(state, l, priv))
+		return state->locked;
+
+	spin_unlock_irqrestore(&priv->lec_arp_lock, state->flags);
+	state->locked = NULL;
+	/* Partial state reset for the next time we get called */
+	state->arp_table = state->misc_table = 0;
+
+	return NULL;
+}
+
+/*
+ * Walk the tables of the interface selected by @state: the device
+ * already in progress if there is one, otherwise dev_lec[state->itf].
+ *
+ * Returns the lec_priv_walk() result; state->dev is left pointing at
+ * the device while an entry is selected and reset to NULL otherwise.
+ *
+ * The device pointer comes straight from dev_lec[] without dev_hold(),
+ * so no dev_put() may be issued when leaving it: doing so would drop a
+ * reference this code never took.
+ */
+static void *lec_itf_walk(struct lec_state *state, loff_t *l)
+{
+	struct net_device *dev;
+	void *v;
+
+	dev = state->dev ? state->dev : dev_lec[state->itf];
+	v = (dev && netdev_priv(dev)) ?
+		lec_priv_walk(state, l, netdev_priv(dev)) : NULL;
+	if (!v && dev) {
+		/* Partial state reset for the next time we get called */
+		dev = NULL;
+	}
+	state->dev = dev;
+	return v;
+}
+
+/*
+ * Step the iterator forward by @l positions, moving on to the next
+ * interface whenever the current one is exhausted.  Returns non-NULL
+ * when an entry was reached, NULL after the last interface.
+ */
+static void *lec_get_idx(struct lec_state *state, loff_t l)
+{
+	void *hit = NULL;
+
+	while (state->itf < MAX_LEC_ITF) {
+		hit = lec_itf_walk(state, &l);
+		if (hit)
+			break;
+		state->itf++;
+	}
+
+	return hit;
+}
+
+/*
+ * seq_file start: reset the iterator, then either return the banner
+ * token (position 0) or walk forward to entry *pos.
+ */
+static void *lec_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct lec_state *state = seq->private;
+
+	state->node = SEQ_START_TOKEN;
+	state->misc_table = 0;
+	state->arp_table = 0;
+	state->locked = NULL;
+	state->dev = NULL;
+	state->itf = 0;
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return lec_get_idx(state, *pos);
+}
+
+/*
+ * seq_file stop: if the walk ended while positioned on an entry, the
+ * owning instance's lec_arp_lock is still held and is released here.
+ *
+ * No dev_put() is issued: the walk obtains its device pointer from
+ * dev_lec[] without taking a reference, so there is none to drop.
+ */
+static void lec_seq_stop(struct seq_file *seq, void *v)
+{
+	struct lec_state *state = seq->private;
+
+	if (state->dev) {
+		spin_unlock_irqrestore(&state->locked->lec_arp_lock,
+				       state->flags);
+	}
+}
+
+/*
+ * seq_file next: advance by one entry.  *pos is incremented only when
+ * another entry was actually found.
+ */
+static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct lec_state *state = seq->private;
+	void *next = lec_get_idx(state, 1);
+
+	if (next)
+		++*pos;
+
+	return next;
+}
+
+/*
+ * seq_file show: print the column banner for the start token, otherwise
+ * the interface name followed by the current LE_ARP entry.
+ */
+static int lec_seq_show(struct seq_file *seq, void *v)
+{
+	static const char lec_banner[] =
+	    "Itf  MAC          ATM destination"
+	    "                          Status            Flags "
+	    "VPI/VCI Recv VPI/VCI\n";
+	struct lec_state *state;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, lec_banner);
+		return 0;
+	}
+
+	state = seq->private;
+	seq_printf(seq, "%s ", state->dev->name);
+	lec_info(seq, hlist_entry(state->node, struct lec_arp_table, next));
+
+	return 0;
+}
+
+/* seq_file iterator callbacks for the "lec" proc entry. */
+static const struct seq_operations lec_seq_ops = {
+	.start = lec_seq_start,
+	.next = lec_seq_next,
+	.stop = lec_seq_stop,
+	.show = lec_seq_show,
+};
+
+/*
+ * Open handler for the proc entry: sets up the seq_file with a private
+ * struct lec_state as iterator state.
+ */
+static int lec_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &lec_seq_ops, sizeof(struct lec_state));
+}
+
+/* File operations of the "lec" proc entry created in lane_module_init(). */
+static const struct file_operations lec_seq_fops = {
+	.owner = THIS_MODULE,
+	.open = lec_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_private,	/* also frees the private lec_state */
+};
+#endif
+
+/*
+ * ioctl handler registered with the ATM core for the LANE commands.
+ *
+ * Returns -ENOIOCTLCMD for commands that are not ours, -EPERM when the
+ * caller lacks CAP_NET_ADMIN, otherwise the result of the matching
+ * attach routine.  A successful ATMLEC_CTRL also marks the socket as
+ * connected.
+ */
+static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int rc;
+
+	switch (cmd) {
+	case ATMLEC_CTRL:
+	case ATMLEC_MCAST:
+	case ATMLEC_DATA:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == ATMLEC_CTRL) {
+		rc = lecd_attach(vcc, (int)arg);
+		if (rc >= 0)
+			sock->state = SS_CONNECTED;
+	} else if (cmd == ATMLEC_MCAST) {
+		rc = lec_mcast_attach(vcc, (int)arg);
+	} else {
+		rc = lec_vcc_attach(vcc, (void __user *)arg);
+	}
+
+	return rc;
+}
+
+/* Registered in lane_module_init(), removed in lane_module_cleanup(). */
+static struct atm_ioctl lane_ioctl_ops = {
+	.owner = THIS_MODULE,
+	.ioctl = lane_ioctl,
+};
+
+/*
+ * Module init: create the "lec" proc entry (when procfs is configured)
+ * and register the LANE ioctl handler.  Returns -ENOMEM if the proc
+ * entry cannot be created, 0 otherwise.
+ */
+static int __init lane_module_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops)) {
+		pr_err("Unable to initialize /proc/net/atm/lec\n");
+		return -ENOMEM;
+	}
+#endif
+
+	register_atm_ioctl(&lane_ioctl_ops);
+	pr_info("lec.c: initialized\n");
+
+	return 0;
+}
+
+/*
+ * Module exit: remove the proc entry and the ioctl handler, then
+ * unregister and free every LEC net device that was created.
+ */
+static void __exit lane_module_cleanup(void)
+{
+	int itf;
+
+	remove_proc_entry("lec", atm_proc_root);
+
+	deregister_atm_ioctl(&lane_ioctl_ops);
+
+	for (itf = 0; itf < MAX_LEC_ITF; itf++) {
+		if (dev_lec[itf] == NULL)
+			continue;
+
+		unregister_netdev(dev_lec[itf]);
+		free_netdev(dev_lec[itf]);
+		dev_lec[itf] = NULL;
+	}
+}
+
+module_init(lane_module_init);
+module_exit(lane_module_cleanup);
+
+/*
+ * LANE2: 3.1.3, LE_RESOLVE.request
+ * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs.
+ * If sizeoftlvs == NULL the default TLVs associated with this
+ * lec will be used.
+ * If dst_mac == NULL, targetless LE_ARP will be sent
+ *
+ * Returns 0 on success.  In the non-force case -1 is returned when no
+ * cache entry exists for dst_mac or the copy cannot be allocated; in the
+ * force case the result of send_to_lecd() is returned (-1 if the skb
+ * cannot be allocated).
+ */
+static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force,
+			 u8 **tlvs, u32 *sizeoftlvs)
+{
+	unsigned long flags;
+	struct lec_priv *priv = netdev_priv(dev);
+	struct lec_arp_table *table;
+	struct sk_buff *skb;
+	int retval;
+
+	if (force == 0) {
+		/*
+		 * lec_arp_find() does not take a reference on the entry, so
+		 * the TLVs are copied before lec_arp_lock is released.
+		 */
+		retval = -1;
+		spin_lock_irqsave(&priv->lec_arp_lock, flags);
+		table = lec_arp_find(priv, dst_mac);
+		if (table != NULL) {
+			*tlvs = kmemdup(table->tlvs, table->sizeoftlvs,
+					GFP_ATOMIC);
+			if (*tlvs != NULL) {
+				*sizeoftlvs = table->sizeoftlvs;
+				retval = 0;
+			}
+		}
+		spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+		return retval;
+	}
+
+	if (sizeoftlvs == NULL)
+		retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, NULL);
+
+	else {
+		/* Hand the caller's TLVs to lecd in a freshly allocated skb. */
+		skb = alloc_skb(*sizeoftlvs, GFP_ATOMIC);
+		if (skb == NULL)
+			return -1;
+		skb->len = *sizeoftlvs;
+		skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs);
+		retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb);
+	}
+	return retval;
+}
+
+/*
+ * LANE2: 3.1.4, LE_ASSOCIATE.request
+ * Associate the *tlvs with the *lan_dst address.
+ * Will overwrite any previous association
+ * Returns 1 for success, 0 for failure (out of memory, or lan_dst is
+ * not this device's own MAC address).  A failing send_to_lecd() is only
+ * logged and still counts as success.
+ */
+static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst,
+			       const u8 *tlvs, u32 sizeoftlvs)
+{
+	int retval;
+	struct sk_buff *skb;
+	struct lec_priv *priv = netdev_priv(dev);
+
+	if (compare_ether_addr(lan_dst, dev->dev_addr))
+		return 0;	/* not our mac address */
+
+	kfree(priv->tlvs);	/* NULL if there was no previous association */
+	priv->sizeoftlvs = 0;	/* stays 0 if the copy below fails */
+
+	priv->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL);
+	if (priv->tlvs == NULL)
+		return 0;
+	priv->sizeoftlvs = sizeoftlvs;
+
+	/* A second copy travels to lecd inside an skb. */
+	skb = alloc_skb(sizeoftlvs, GFP_ATOMIC);
+	if (skb == NULL)
+		return 0;
+	skb->len = sizeoftlvs;
+	skb_copy_to_linear_data(skb, tlvs, sizeoftlvs);
+	retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb);
+	if (retval != 0)
+		pr_info("lec.c: lane2_associate_req() failed\n");
+	/*
+	 * If the previous association has changed we must
+	 * somehow notify other LANE entities about the change
+	 */
+	return 1;
+}
+
+/*
+ * LANE2: 3.1.5, LE_ASSOCIATE.indication
+ *
+ * The TLVs are only forwarded to the registered associate_indicator
+ * callback.  They are deliberately not stored in the LE_ARP entry because
+ * nothing uses them there; should that ever be added, the stored TLVs
+ * must be freed when the entry is killed.
+ */
+static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr,
+				const u8 *tlvs, u32 sizeoftlvs)
+{
+	struct lec_priv *priv = netdev_priv(dev);
+
+	/* Nobody to tell unless MPOA has installed its callback. */
+	if (!priv->lane2_ops || !priv->lane2_ops->associate_indicator)
+		return;
+
+	/* tell MPOA about the TLVs we saw */
+	priv->lane2_ops->associate_indicator(dev, mac_addr, tlvs, sizeoftlvs);
+}
+
+/*
+ * Here starts what used to be lec_arpc.c
+ *
+ * lec_arpc.c was added here when making
+ * lane client modular. October 1997
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/param.h>
+#include <asm/atomic.h>
+#include <linux/inetdevice.h>
+#include <net/route.h>
+
+#if 0
+#define pr_debug(format, args...)
+/*
+  #define pr_debug printk
+*/
+#endif
+#define DEBUG_ARP_TABLE 0
+
+#define LEC_ARP_REFRESH_INTERVAL (3*HZ)
+
+static void lec_arp_check_expire(struct work_struct *work);
+static void lec_arp_expire_arp(unsigned long data);
+
+/*
+ * Arp table funcs
+ */
+
+/* Bucket index for a MAC byte; the argument is parenthesized so any expression is safe. */
+#define HASH(ch) ((ch) & (LEC_ARP_TABLE_SIZE - 1))
+
+/*
+ * Set up an empty arp-cache: lock, hash buckets, the three auxiliary
+ * lists, and the delayed work that runs lec_arp_check_expire() after
+ * LEC_ARP_REFRESH_INTERVAL.
+ */
+static void lec_arp_init(struct lec_priv *priv)
+{
+	int bucket;
+
+	spin_lock_init(&priv->lec_arp_lock);
+
+	for (bucket = 0; bucket < LEC_ARP_TABLE_SIZE; bucket++)
+		INIT_HLIST_HEAD(&priv->lec_arp_tables[bucket]);
+	INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
+	INIT_HLIST_HEAD(&priv->lec_no_forward);
+	INIT_HLIST_HEAD(&priv->mcast_fwds);
+
+	/* Everything is initialized; only now start the periodic check. */
+	INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire);
+	schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL);
+}
+
+/*
+ * Detach entry from its send and receive VCCs: restore the saved
+ * pop/push handlers, free the per-VCC private data of the send VCC and
+ * release each VCC asynchronously with -EPIPE.
+ */
+static void lec_arp_clear_vccs(struct lec_arp_table *entry)
+{
+	struct atm_vcc *send_vcc = entry->vcc;
+	struct atm_vcc *rcv_vcc;
+
+	if (send_vcc) {
+		struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(send_vcc);
+		struct net_device *dev =
+			(struct net_device *)send_vcc->proto_data;
+
+		send_vcc->pop = vpriv->old_pop;
+		/* Restart the device queue if this VCC had stopped it. */
+		if (vpriv->xoff)
+			netif_wake_queue(dev);
+		kfree(vpriv);
+		send_vcc->user_back = NULL;
+		send_vcc->push = entry->old_push;
+		vcc_release_async(send_vcc, -EPIPE);
+		entry->vcc = NULL;
+	}
+
+	rcv_vcc = entry->recv_vcc;
+	if (rcv_vcc) {
+		rcv_vcc->push = entry->old_recv_push;
+		vcc_release_async(rcv_vcc, -EPIPE);
+		entry->recv_vcc = NULL;
+	}
+}
+
+/*
+ * Insert entry into lec_arp_table, in the bucket selected by the last
+ * byte of its MAC address.
+ * NOTE(review): this comment used to say the entry is added to the end
+ * of the list to satisfy LANE2 8.1.13, but hlist_add_head() inserts at
+ * the front of the chain -- confirm which is intended.
+ */
+static inline void
+lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry)
+{
+	unsigned char key = entry->mac_addr[ETH_ALEN - 1];
+
+	hlist_add_head(&entry->next, &priv->lec_arp_tables[HASH(key)]);
+
+	pr_debug("Added entry:%pM\n", entry->mac_addr);
+}
+
+/*
+ * Unlink to_remove from its list and stop its timer.  For entries in
+ * ESI_FLUSH_PENDING or ESI_FORWARD_DIRECT the VCCs are torn down too,
+ * unless some entry still in the table has the same ATM address.
+ * Queued tx_wait frames are dropped.  The entry itself is not freed.
+ * Returns 0, or -1 when to_remove is NULL.
+ */
+static int
+lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove)
+{
+	struct hlist_node *pos;
+	struct lec_arp_table *other;
+	bool shared = false;
+	int bucket;
+
+	if (to_remove == NULL)
+		return -1;
+
+	hlist_del(&to_remove->next);
+	del_timer(&to_remove->timer);
+
+	if (to_remove->status >= ESI_FLUSH_PENDING) {
+		/* Is any remaining MAC still using this ATM address? */
+		for (bucket = 0; bucket < LEC_ARP_TABLE_SIZE; bucket++) {
+			hlist_for_each_entry(other, pos,
+					     &priv->lec_arp_tables[bucket],
+					     next) {
+				if (!memcmp(to_remove->atm_addr,
+					    other->atm_addr, ATM_ESA_LEN)) {
+					shared = true;
+					break;
+				}
+			}
+		}
+		if (!shared)
+			lec_arp_clear_vccs(to_remove);
+	}
+	skb_queue_purge(&to_remove->tx_wait);	/* FIXME: good place for this? */
+
+	pr_debug("Removed entry:%pM\n", to_remove->mac_addr);
+	return 0;
+}
+
+#if DEBUG_ARP_TABLE
+/* Printable name of an ESI_* status value; "<UNKNOWN>" for any other. */
+static const char *get_status_string(unsigned char st)
+{
+	if (st == ESI_UNKNOWN)
+		return "ESI_UNKNOWN";
+	if (st == ESI_ARP_PENDING)
+		return "ESI_ARP_PENDING";
+	if (st == ESI_VC_PENDING)
+		return "ESI_VC_PENDING";
+	if (st == ESI_FLUSH_PENDING)
+		return "ESI_FLUSH_PENDING";
+	if (st == ESI_FORWARD_DIRECT)
+		return "ESI_FORWARD_DIRECT";
+
+	return "<UNKNOWN>";
+}
+
+/*
+ * Append a textual description of one LE_ARP entry to buf, starting at
+ * offset, and log the result with pr_info().  Every write is bounded by
+ * len, so an over-long line is truncated instead of overrunning buf.
+ */
+static void dump_arp_entry(const struct lec_arp_table *rulla,
+			   char *buf, size_t len, int offset)
+{
+	int j;
+
+	offset += scnprintf(buf + offset, len - offset, "Mac: %pM",
+			    rulla->mac_addr);
+	offset += scnprintf(buf + offset, len - offset, " Atm:");
+	for (j = 0; j < ATM_ESA_LEN; j++) {
+		offset += scnprintf(buf + offset, len - offset, "%2.2x ",
+				    rulla->atm_addr[j] & 0xff);
+	}
+	offset += scnprintf(buf + offset, len - offset,
+			    "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
+			    rulla->vcc ? rulla->vcc->vpi : 0,
+			    rulla->vcc ? rulla->vcc->vci : 0,
+			    rulla->recv_vcc ? rulla->recv_vcc->vpi : 0,
+			    rulla->recv_vcc ? rulla->recv_vcc->vci : 0,
+			    rulla->last_used,
+			    rulla->timestamp, rulla->no_tries);
+	scnprintf(buf + offset, len - offset,
+		  "Flags:%x, Packets_flooded:%x, Status: %s ",
+		  rulla->flags, rulla->packets_flooded,
+		  get_status_string(rulla->status));
+	pr_info("%s\n", buf);
+}
+
+/*
+ * Debug aid: log every entry of the hash table and of the
+ * lec_no_forward, lec_arp_empty_ones and mcast_fwds lists.
+ * Hash table entries are prefixed with their bucket index and address.
+ */
+static void dump_arp_table(struct lec_priv *priv)
+{
+	struct hlist_node *node;
+	struct lec_arp_table *rulla;
+	char buf[256];
+	int i, offset;
+
+	pr_info("Dump %p:\n", priv);
+	for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+		hlist_for_each_entry(rulla, node,
+				     &priv->lec_arp_tables[i], next) {
+			offset = scnprintf(buf, sizeof(buf), "%d: %p\n",
+					   i, rulla);
+			dump_arp_entry(rulla, buf, sizeof(buf), offset);
+		}
+	}
+
+	if (!hlist_empty(&priv->lec_no_forward))
+		pr_info("No forward\n");
+	hlist_for_each_entry(rulla, node, &priv->lec_no_forward, next)
+		dump_arp_entry(rulla, buf, sizeof(buf), 0);
+
+	if (!hlist_empty(&priv->lec_arp_empty_ones))
+		pr_info("Empty ones\n");
+	hlist_for_each_entry(rulla, node, &priv->lec_arp_empty_ones, next)
+		dump_arp_entry(rulla, buf, sizeof(buf), 0);
+
+	if (!hlist_empty(&priv->mcast_fwds))
+		pr_info("Multicast Forward VCCs\n");
+	hlist_for_each_entry(rulla, node, &priv->mcast_fwds, next)
+		dump_arp_entry(rulla, buf, sizeof(buf), 0);
+}
+#else
+#define dump_arp_table(priv) do { } while (0)
+#endif
+
+/*
+ * Destruction of arp-cache
+ *
+ * Cancels the periodic expiry work and then, under lec_arp_lock, empties
+ * the hash table and the lec_arp_empty_ones, lec_no_forward and
+ * mcast_fwds lists, dropping one reference on every entry, and clears
+ * priv->mcast_vcc.
+ *
+ * NOTE(review): del_timer_sync() is called below with lec_arp_lock held,
+ * and lec_arp_expire_vcc() takes the same lock -- confirm that this
+ * cannot deadlock against a concurrently running timer handler.
+ */
+static void lec_arp_destroy(struct lec_priv *priv)
+{
+	unsigned long flags;
+	struct hlist_node *node, *next;
+	struct lec_arp_table *entry;
+	int i;
+
+	cancel_delayed_work_sync(&priv->lec_arp_work);	/* lec_arp_check_expire() re-arms itself */
+
+	/*
+	 * Remove all entries
+	 */
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+		hlist_for_each_entry_safe(entry, node, next,
+					  &priv->lec_arp_tables[i], next) {
+			lec_arp_remove(priv, entry);
+			lec_arp_put(entry);
+		}
+		INIT_HLIST_HEAD(&priv->lec_arp_tables[i]);
+	}
+
+	hlist_for_each_entry_safe(entry, node, next,
+				  &priv->lec_arp_empty_ones, next) {
+		del_timer_sync(&entry->timer);
+		lec_arp_clear_vccs(entry);
+		hlist_del(&entry->next);
+		lec_arp_put(entry);
+	}
+	INIT_HLIST_HEAD(&priv->lec_arp_empty_ones);
+
+	hlist_for_each_entry_safe(entry, node, next,
+				  &priv->lec_no_forward, next) {
+		del_timer_sync(&entry->timer);
+		lec_arp_clear_vccs(entry);
+		hlist_del(&entry->next);
+		lec_arp_put(entry);
+	}
+	INIT_HLIST_HEAD(&priv->lec_no_forward);
+
+	hlist_for_each_entry_safe(entry, node, next, &priv->mcast_fwds, next) {
+		/* No timer, LANEv2 7.1.20 and 2.3.5.3 */
+		lec_arp_clear_vccs(entry);
+		hlist_del(&entry->next);
+		lec_arp_put(entry);
+	}
+	INIT_HLIST_HEAD(&priv->mcast_fwds);
+	priv->mcast_vcc = NULL;
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+/*
+ * Look up the table entry whose MAC address equals mac_addr.
+ * Only the bucket chosen by the last address byte is searched.
+ * Returns the entry without taking a reference, or NULL if none matches.
+ */
+static struct lec_arp_table *lec_arp_find(struct lec_priv *priv,
+					  const unsigned char *mac_addr)
+{
+	struct lec_arp_table *candidate;
+	struct hlist_head *bucket;
+	struct hlist_node *pos;
+
+	pr_debug("%pM\n", mac_addr);
+
+	bucket = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])];
+	hlist_for_each_entry(candidate, pos, bucket, next) {
+		if (compare_ether_addr(mac_addr, candidate->mac_addr) == 0)
+			return candidate;
+	}
+	return NULL;
+}
+
+/*
+ * Allocate (GFP_ATOMIC) a zeroed arp entry for mac_addr.  The entry
+ * starts with a usage count of 1, an empty tx_wait queue and a timer
+ * prepared to call lec_arp_expire_arp(), but is not linked into any list.
+ * Returns NULL if the allocation fails.
+ */
+static struct lec_arp_table *make_entry(struct lec_priv *priv,
+					const unsigned char *mac_addr)
+{
+	struct lec_arp_table *fresh;
+
+	fresh = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC);
+	if (fresh == NULL) {
+		pr_info("LEC: Arp entry kmalloc failed\n");
+		return NULL;
+	}
+
+	fresh->priv = priv;
+	fresh->last_used = jiffies;
+	memcpy(fresh->mac_addr, mac_addr, ETH_ALEN);
+	atomic_set(&fresh->usage, 1);
+
+	INIT_HLIST_NODE(&fresh->next);
+	skb_queue_head_init(&fresh->tx_wait);
+	setup_timer(&fresh->timer, lec_arp_expire_arp,
+			(unsigned long)fresh);
+	return fresh;
+}
+
+/*
+ * Arp sent timer expired.
+ * While the entry is still ESI_ARP_PENDING the timer is re-armed for one
+ * second later; the request is re-sent only while no_tries has not
+ * exceeded max_retry_count.
+ */
+static void lec_arp_expire_arp(unsigned long data)
+{
+	struct lec_arp_table *entry = (struct lec_arp_table *)data;
+
+	pr_debug("\n");
+	if (entry->status != ESI_ARP_PENDING)
+		return;
+
+	if (entry->no_tries <= entry->priv->max_retry_count) {
+		send_to_lecd(entry->priv,
+			     entry->is_rdesc ? l_rdesc_arp_xmt : l_arp_xmt,
+			     entry->mac_addr, NULL, NULL);
+		entry->no_tries++;
+	}
+	mod_timer(&entry->timer, jiffies + (1 * HZ));
+}
+
+/*
+ * Unknown/unused vcc expire, remove associated entry.
+ * Timer handler: unlinks the entry from its list under lec_arp_lock,
+ * tears down its VCCs and drops one reference.
+ */
+static void lec_arp_expire_vcc(unsigned long data)
+{
+	unsigned long flags;
+	struct lec_arp_table *to_remove = (struct lec_arp_table *)data;
+	struct lec_priv *priv = (struct lec_priv *)to_remove->priv;
+
+	del_timer(&to_remove->timer);
+
+	/*
+	 * Test the pointer that is dereferenced: an entry may have vcc set
+	 * while recv_vcc is NULL.
+	 */
+	pr_debug("%p %p: vpi:%d vci:%d\n",
+		 to_remove, priv,
+		 to_remove->recv_vcc ? to_remove->recv_vcc->vpi : 0,
+		 to_remove->recv_vcc ? to_remove->recv_vcc->vci : 0);
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	hlist_del(&to_remove->next);
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+	lec_arp_clear_vccs(to_remove);
+	lec_arp_put(to_remove);
+}
+
+/*
+ * Age one table entry.  An entry past its time limit that is neither
+ * permanent nor multicast is removed and its reference dropped.
+ * Otherwise the flood counter of a pending entry is reset once
+ * max_unknown_frame_time has passed (re-requesting SVC setup for
+ * ESI_VC_PENDING).
+ * Returns true, with a reference taken on entry, when an
+ * ESI_FLUSH_PENDING entry has outlived path_switching_delay; false in
+ * every other case.
+ */
+static bool __lec_arp_check_expire(struct lec_arp_table *entry,
+				   unsigned long now,
+				   struct lec_priv *priv)
+{
+	unsigned long limit = priv->aging_time;
+	bool pending;
+
+	if ((entry->flags & LEC_REMOTE_FLAG) && priv->topology_change)
+		limit = priv->forward_delay_time;
+
+	pr_debug("About to expire: %lx - %lx > %lx\n",
+		 now, entry->last_used, limit);
+	if (time_after(now, entry->last_used + limit) &&
+	    !(entry->flags & LEC_PERMANENT_FLAG) &&
+	    !(entry->mac_addr[0] & 0x01)) {	/* LANE2: 7.1.20 */
+		pr_debug("Entry timed out\n");
+		lec_arp_remove(priv, entry);
+		lec_arp_put(entry);
+		return false;
+	}
+
+	pending = entry->status == ESI_VC_PENDING ||
+		  entry->status == ESI_ARP_PENDING;
+	if (pending &&
+	    time_after_eq(now, entry->timestamp +
+			       priv->max_unknown_frame_time)) {
+		entry->timestamp = jiffies;
+		entry->packets_flooded = 0;
+		if (entry->status == ESI_VC_PENDING)
+			send_to_lecd(priv, l_svc_setup,
+				     entry->mac_addr,
+				     entry->atm_addr,
+				     NULL);
+	}
+
+	if (entry->status != ESI_FLUSH_PENDING)
+		return false;
+	if (!time_after_eq(now, entry->timestamp +
+				priv->path_switching_delay))
+		return false;
+
+	/* The caller finishes the flush and drops this reference. */
+	lec_arp_hold(entry);
+	return true;
+}
+/*
+ * Expire entries.
+ * 1. Re-set timer
+ * 2. For each entry, delete entries that have aged past the age limit.
+ * 3. For each entry, depending on the status of the entry, perform
+ *    the following maintenance.
+ *    a. If status is ESI_VC_PENDING or ESI_ARP_PENDING then if the
+ *       tick_count is above the max_unknown_frame_time, clear
+ *       the tick_count to zero and clear the packets_flooded counter
+ *       to zero. This supports the packet rate limit per address
+ *       while flooding unknowns.
+ *    b. If the status is ESI_FLUSH_PENDING and the tick_count is greater
+ *       than or equal to the path_switching_delay, change the status
+ *       to ESI_FORWARD_DIRECT. This causes the flush period to end
+ *       regardless of the progress of the flush protocol.
+ *
+ * Locking: the table is walked under lec_arp_lock.  When
+ * __lec_arp_check_expire() returns true it has taken a reference on the
+ * entry; the lock is then dropped, the frames queued on tx_wait are sent
+ * with lec_send(), the entry is switched to ESI_FORWARD_DIRECT, the
+ * reference is released and the whole scan restarts from the top.
+ * The work re-schedules itself after LEC_ARP_REFRESH_INTERVAL.
+ */
+static void lec_arp_check_expire(struct work_struct *work)
+{
+	unsigned long flags;
+	struct lec_priv *priv =
+		container_of(work, struct lec_priv, lec_arp_work.work);
+	struct hlist_node *node, *next;
+	struct lec_arp_table *entry;
+	unsigned long now;
+	int i;
+
+	pr_debug("%p\n", priv);
+	now = jiffies;	/* sampled once; reused across restarts */
+restart:
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+		hlist_for_each_entry_safe(entry, node, next,
+					  &priv->lec_arp_tables[i], next) {
+			if (__lec_arp_check_expire(entry, now, priv)) {
+				struct sk_buff *skb;
+				struct atm_vcc *vcc = entry->vcc;	/* read while still locked */
+
+				spin_unlock_irqrestore(&priv->lec_arp_lock,
+						       flags);
+				while ((skb = skb_dequeue(&entry->tx_wait)))
+					lec_send(vcc, skb);
+				entry->last_used = jiffies;
+				entry->status = ESI_FORWARD_DIRECT;
+				lec_arp_put(entry);	/* pairs with the hold in __lec_arp_check_expire() */
+
+				goto restart;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+	schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL);
+}
+
+/*
+ * Try to find vcc where mac_address is attached.
+ *
+ * Multicast destinations return priv->mcast_vcc directly for LANE
+ * version 1, and for version 2 when the address equals bus_mac; other
+ * cases fall through to the cache lookup under lec_arp_lock.
+ *
+ * Returns the entry's vcc for an ESI_FORWARD_DIRECT hit, priv->mcast_vcc
+ * when the frame may be flooded (or when a new entry was just created or
+ * could not be allocated), and NULL when the entry is ESI_FLUSH_PENDING
+ * or its flood limit has been reached.
+ *
+ * *ret_entry is written only on the ESI_FORWARD_DIRECT and NULL-return
+ * paths; in both a reference has been taken with lec_arp_hold(), which
+ * the caller presumably has to release.
+ */
+static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
+				       const unsigned char *mac_to_find,
+				       int is_rdesc,
+				       struct lec_arp_table **ret_entry)
+{
+	unsigned long flags;
+	struct lec_arp_table *entry;
+	struct atm_vcc *found;
+
+	if (mac_to_find[0] & 0x01) {	/* group (multicast/broadcast) address */
+		switch (priv->lane_version) {
+		case 1:
+			return priv->mcast_vcc;
+		case 2:	/* LANE2 wants arp for multicast addresses */
+			if (!compare_ether_addr(mac_to_find, bus_mac))
+				return priv->mcast_vcc;
+			break;
+		default:
+			break;
+		}
+	}
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	entry = lec_arp_find(priv, mac_to_find);
+
+	if (entry) {
+		if (entry->status == ESI_FORWARD_DIRECT) {
+			/* Connection Ok */
+			entry->last_used = jiffies;
+			lec_arp_hold(entry);
+			*ret_entry = entry;
+			found = entry->vcc;
+			goto out;
+		}
+		/*
+		 * If the LE_ARP cache entry is still pending, reset count to 0
+		 * so another LE_ARP request can be made for this frame.
+		 */
+		if (entry->status == ESI_ARP_PENDING)
+			entry->no_tries = 0;
+		/*
+		 * Data direct VC not yet set up, check to see if the unknown
+		 * frame count is greater than the limit. If the limit has
+		 * not been reached, allow the caller to send packet to
+		 * BUS.
+		 */
+		if (entry->status != ESI_FLUSH_PENDING &&
+		    entry->packets_flooded <
+		    priv->maximum_unknown_frame_count) {
+			entry->packets_flooded++;
+			pr_debug("Flooding..\n");
+			found = priv->mcast_vcc;
+			goto out;
+		}
+		/*
+		 * We got here because entry->status == ESI_FLUSH_PENDING
+		 * or BUS flood limit was reached for an entry which is
+		 * in ESI_ARP_PENDING or ESI_VC_PENDING state.
+		 */
+		lec_arp_hold(entry);
+		*ret_entry = entry;
+		pr_debug("entry->status %d entry->vcc %p\n", entry->status,
+			 entry->vcc);
+		found = NULL;
+	} else {
+		/* No matching entry was found */
+		entry = make_entry(priv, mac_to_find);
+		pr_debug("Making entry\n");
+		if (!entry) {
+			found = priv->mcast_vcc;
+			goto out;
+		}
+		lec_arp_add(priv, entry);
+		/* We want arp-request(s) to be sent */
+		entry->packets_flooded = 1;
+		entry->status = ESI_ARP_PENDING;
+		entry->no_tries = 1;
+		entry->last_used = entry->timestamp = jiffies;
+		entry->is_rdesc = is_rdesc;
+		if (entry->is_rdesc)
+			send_to_lecd(priv, l_rdesc_arp_xmt, mac_to_find, NULL,
+				     NULL);
+		else
+			send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL);
+		entry->timer.expires = jiffies + (1 * HZ);	/* retry via lec_arp_expire_arp() */
+		entry->timer.function = lec_arp_expire_arp;
+		add_timer(&entry->timer);
+		found = priv->mcast_vcc;
+	}
+
+out:
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+	return found;
+}
+
+/*
+ * Remove the first table entry whose ATM address equals atm_addr.
+ * An entry carrying LEC_PERMANENT_FLAG is only eligible when permanent
+ * is non-zero.  The scan covers every bucket and every entry.
+ *
+ * Returns 0 if an entry was removed, -1 if none qualified.
+ */
+static int
+lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr,
+		unsigned long permanent)
+{
+	unsigned long flags;
+	struct hlist_node *node, *next;
+	struct lec_arp_table *entry;
+	int i;
+
+	pr_debug("\n");
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+		hlist_for_each_entry_safe(entry, node, next,
+					  &priv->lec_arp_tables[i], next) {
+			if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) &&
+			    (permanent ||
+			     !(entry->flags & LEC_PERMANENT_FLAG))) {
+				lec_arp_remove(priv, entry);
+				lec_arp_put(entry);
+				/* Leave only once a match has been removed. */
+				spin_unlock_irqrestore(&priv->lec_arp_lock,
+						       flags);
+				return 0;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+	return -1;
+}
+
+/*
+ * Notifies:  Response to arp_request (atm_addr != NULL)
+ *
+ * Records the mac_addr -> atm_addr binding in the cache, all under
+ * lec_arp_lock:
+ *  - a targetless LE_ARP for an unknown MAC is ignored;
+ *  - if an entry on lec_arp_empty_ones already has this ATM address, it
+ *    is taken off that list and either merged into the existing entry
+ *    for mac_addr (its vcc and old_push are handed over and one
+ *    reference on it is dropped) or added to the table itself, in both
+ *    cases as ESI_FORWARD_DIRECT;
+ *  - otherwise the entry for mac_addr is looked up or created, atm_addr
+ *    is stored, and the status (plus vcc for ESI_FLUSH_PENDING and
+ *    ESI_FORWARD_DIRECT) is copied from another table entry with the
+ *    same ATM address, if any.  An entry left in ESI_ARP_PENDING or
+ *    ESI_UNKNOWN moves to ESI_VC_PENDING and l_svc_setup is sent.
+ * remoteflag sets or clears LEC_REMOTE_FLAG on the resulting entry.
+ */
+static void
+lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
+	       const unsigned char *atm_addr, unsigned long remoteflag,
+	       unsigned int targetless_le_arp)
+{
+	unsigned long flags;
+	struct hlist_node *node, *next;
+	struct lec_arp_table *entry, *tmp;
+	int i;
+
+	pr_debug("%smac:%pM\n",
+		 (targetless_le_arp) ? "targetless " : "", mac_addr);
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	entry = lec_arp_find(priv, mac_addr);
+	if (entry == NULL && targetless_le_arp)
+		goto out;	/*
+				 * LANE2: ignore targetless LE_ARPs for which
+				 * we have no entry in the cache. 7.1.30
+				 */
+	if (!hlist_empty(&priv->lec_arp_empty_ones)) {
+		hlist_for_each_entry_safe(entry, node, next,
+					  &priv->lec_arp_empty_ones, next) {
+			if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) {
+				hlist_del(&entry->next);
+				del_timer(&entry->timer);
+				tmp = lec_arp_find(priv, mac_addr);
+				if (tmp) {
+					del_timer(&tmp->timer);
+					tmp->status = ESI_FORWARD_DIRECT;
+					memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN);
+					tmp->vcc = entry->vcc;
+					tmp->old_push = entry->old_push;
+					tmp->last_used = jiffies;
+					del_timer(&entry->timer);
+					lec_arp_put(entry);
+					entry = tmp;	/* flag update below applies to the surviving entry */
+				} else {
+					entry->status = ESI_FORWARD_DIRECT;
+					memcpy(entry->mac_addr, mac_addr, ETH_ALEN);
+					entry->last_used = jiffies;
+					lec_arp_add(priv, entry);
+				}
+				if (remoteflag)
+					entry->flags |= LEC_REMOTE_FLAG;
+				else
+					entry->flags &= ~LEC_REMOTE_FLAG;
+				pr_debug("After update\n");
+				dump_arp_table(priv);
+				goto out;
+			}
+		}
+	}
+
+	entry = lec_arp_find(priv, mac_addr);	/* the loop above reused 'entry' as its cursor */
+	if (!entry) {
+		entry = make_entry(priv, mac_addr);
+		if (!entry)
+			goto out;
+		entry->status = ESI_UNKNOWN;
+		lec_arp_add(priv, entry);
+		/* Temporary, changes before end of function */
+	}
+	memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN);
+	del_timer(&entry->timer);
+	for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+		hlist_for_each_entry(tmp, node,
+				     &priv->lec_arp_tables[i], next) {
+			if (entry != tmp &&
+			    !memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) {
+				/* Vcc to this host exists */
+				if (tmp->status > ESI_VC_PENDING) {
+					/*
+					 * ESI_FLUSH_PENDING,
+					 * ESI_FORWARD_DIRECT
+					 */
+					entry->vcc = tmp->vcc;
+					entry->old_push = tmp->old_push;
+				}
+				entry->status = tmp->status;
+				break;	/* leaves the current bucket only */
+			}
+		}
+	}
+	if (remoteflag)
+		entry->flags |= LEC_REMOTE_FLAG;
+	else
+		entry->flags &= ~LEC_REMOTE_FLAG;
+	if (entry->status == ESI_ARP_PENDING || entry->status == ESI_UNKNOWN) {
+		entry->status = ESI_VC_PENDING;
+		send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL);
+	}
+	pr_debug("After update2\n");
+	dump_arp_table(priv);
+out:
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+/*
+ * Notifies: Vcc setup ready
+ *
+ * ioc_data->receive selects what happens to the new vcc (all under
+ * lec_arp_lock):
+ *   2         - a new entry holding it as recv_vcc is put on mcast_fwds,
+ *               with no timer;
+ *   1         - a new entry holding it as recv_vcc is put on
+ *               lec_no_forward, with lec_arp_expire_vcc() armed after
+ *               vcc_timeout_period;
+ *   otherwise - it becomes the vcc of every table entry whose ATM address
+ *               equals ioc_data->atm_addr; if there is none, a new entry
+ *               is parked on lec_arp_empty_ones with the same expiry
+ *               timer.
+ * old_push is saved in the entry (as old_push or old_recv_push).
+ * A failed make_entry() makes the function return without any change.
+ */
+static void
+lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
+	      struct atm_vcc *vcc,
+	      void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb))
+{
+	unsigned long flags;
+	struct hlist_node *node;
+	struct lec_arp_table *entry;
+	int i, found_entry = 0;
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	/* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */
+	if (ioc_data->receive == 2) {
+		pr_debug("LEC_ARP: Attaching mcast forward\n");
+#if 0
+		entry = lec_arp_find(priv, bus_mac);
+		if (!entry) {
+			pr_info("LEC_ARP: Multicast entry not found!\n");
+			goto out;
+		}
+		memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+		entry->recv_vcc = vcc;
+		entry->old_recv_push = old_push;
+#endif
+		entry = make_entry(priv, bus_mac);
+		if (entry == NULL)
+			goto out;
+		del_timer(&entry->timer);
+		memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+		entry->recv_vcc = vcc;
+		entry->old_recv_push = old_push;
+		hlist_add_head(&entry->next, &priv->mcast_fwds);
+		goto out;
+	} else if (ioc_data->receive == 1) {
+		/*
+		 * Vcc which we don't want to make default vcc,
+		 * attach it anyway.
+		 */
+		pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
+			 ioc_data->atm_addr[0], ioc_data->atm_addr[1],
+			 ioc_data->atm_addr[2], ioc_data->atm_addr[3],
+			 ioc_data->atm_addr[4], ioc_data->atm_addr[5],
+			 ioc_data->atm_addr[6], ioc_data->atm_addr[7],
+			 ioc_data->atm_addr[8], ioc_data->atm_addr[9],
+			 ioc_data->atm_addr[10], ioc_data->atm_addr[11],
+			 ioc_data->atm_addr[12], ioc_data->atm_addr[13],
+			 ioc_data->atm_addr[14], ioc_data->atm_addr[15],
+			 ioc_data->atm_addr[16], ioc_data->atm_addr[17],
+			 ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+		entry = make_entry(priv, bus_mac);
+		if (entry == NULL)
+			goto out;
+		memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+		memset(entry->mac_addr, 0, ETH_ALEN);	/* MAC is not known for this vcc */
+		entry->recv_vcc = vcc;
+		entry->old_recv_push = old_push;
+		entry->status = ESI_UNKNOWN;
+		entry->timer.expires = jiffies + priv->vcc_timeout_period;
+		entry->timer.function = lec_arp_expire_vcc;
+		hlist_add_head(&entry->next, &priv->lec_no_forward);
+		add_timer(&entry->timer);
+		dump_arp_table(priv);
+		goto out;
+	}
+	pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
+		 ioc_data->atm_addr[0], ioc_data->atm_addr[1],
+		 ioc_data->atm_addr[2], ioc_data->atm_addr[3],
+		 ioc_data->atm_addr[4], ioc_data->atm_addr[5],
+		 ioc_data->atm_addr[6], ioc_data->atm_addr[7],
+		 ioc_data->atm_addr[8], ioc_data->atm_addr[9],
+		 ioc_data->atm_addr[10], ioc_data->atm_addr[11],
+		 ioc_data->atm_addr[12], ioc_data->atm_addr[13],
+		 ioc_data->atm_addr[14], ioc_data->atm_addr[15],
+		 ioc_data->atm_addr[16], ioc_data->atm_addr[17],
+		 ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+	for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
+		hlist_for_each_entry(entry, node,
+				     &priv->lec_arp_tables[i], next) {
+			if (memcmp
+			    (ioc_data->atm_addr, entry->atm_addr,
+			     ATM_ESA_LEN) == 0) {
+				pr_debug("LEC_ARP: Attaching data direct\n");
+				pr_debug("Currently -> Vcc: %d, Rvcc:%d\n",
+					 entry->vcc ? entry->vcc->vci : 0,
+					 entry->recv_vcc ? entry->recv_vcc->
+					 vci : 0);
+				found_entry = 1;	/* keep scanning: several MACs may share this ATM address */
+				del_timer(&entry->timer);
+				entry->vcc = vcc;
+				entry->old_push = old_push;
+				if (entry->status == ESI_VC_PENDING) {
+					if (priv->maximum_unknown_frame_count
+					    == 0)
+						entry->status =
+						    ESI_FORWARD_DIRECT;
+					else {
+						entry->timestamp = jiffies;
+						entry->status =
+						    ESI_FLUSH_PENDING;
+#if 0
+						send_to_lecd(priv, l_flush_xmt,
+							     NULL,
+							     entry->atm_addr,
+							     NULL);
+#endif
+					}
+				} else {
+					/*
+					 * They were forming a connection
+					 * to us, and we to them. Our
+					 * ATM address is numerically lower
+					 * than theirs, so we make connection
+					 * we formed into default VCC (8.1.11).
+					 * Connection they made gets torn
+					 * down. This might confuse some
+					 * clients. Can be changed if
+					 * someone reports trouble...
+					 */
+					;
+				}
+			}
+		}
+	}
+	if (found_entry) {
+		pr_debug("After vcc was added\n");
+		dump_arp_table(priv);
+		goto out;
+	}
+	/*
+	 * Not found, snatch address from first data packet that arrives
+	 * from this vcc
+	 */
+	entry = make_entry(priv, bus_mac);
+	if (!entry)
+		goto out;
+	entry->vcc = vcc;
+	entry->old_push = old_push;
+	memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
+	memset(entry->mac_addr, 0, ETH_ALEN);	/* filled in later by lec_arp_check_empties() */
+	entry->status = ESI_UNKNOWN;
+	hlist_add_head(&entry->next, &priv->lec_arp_empty_ones);
+	entry->timer.expires = jiffies + priv->vcc_timeout_period;
+	entry->timer.function = lec_arp_expire_vcc;
+	add_timer(&entry->timer);
+	pr_debug("After vcc was added\n");
+	dump_arp_table(priv);
+out:
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+/*
+ * End the flush period of every table entry that is ESI_FLUSH_PENDING
+ * with a flush transaction id equal to tran_id: the frames queued on
+ * tx_wait are sent and the entry becomes ESI_FORWARD_DIRECT.  The lock
+ * is dropped while sending, so the scan restarts after each entry.
+ */
+static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id)
+{
+	unsigned long flags;
+	struct hlist_node *pos;
+	struct lec_arp_table *entry;
+	struct atm_vcc *vcc;
+	struct sk_buff *skb;
+	int bucket;
+
+	pr_debug("%lx\n", tran_id);
+restart:
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	for (bucket = 0; bucket < LEC_ARP_TABLE_SIZE; bucket++) {
+		hlist_for_each_entry(entry, pos,
+				     &priv->lec_arp_tables[bucket], next) {
+			if (entry->flush_tran_id != tran_id ||
+			    entry->status != ESI_FLUSH_PENDING)
+				continue;
+
+			/* Sample the vcc and pin the entry before unlocking. */
+			vcc = entry->vcc;
+			lec_arp_hold(entry);
+			spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+
+			skb = skb_dequeue(&entry->tx_wait);
+			while (skb) {
+				lec_send(vcc, skb);
+				skb = skb_dequeue(&entry->tx_wait);
+			}
+			entry->last_used = jiffies;
+			entry->status = ESI_FORWARD_DIRECT;
+			lec_arp_put(entry);
+			pr_debug("LEC_ARP: Flushed\n");
+			goto restart;
+		}
+	}
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+	dump_arp_table(priv);
+}
+
+/*
+ * Store tran_id as the flush transaction id of every table entry whose
+ * ATM address equals atm_addr.
+ */
+static void
+lec_set_flush_tran_id(struct lec_priv *priv,
+		      const unsigned char *atm_addr, unsigned long tran_id)
+{
+	unsigned long flags;
+	struct hlist_node *pos;
+	struct lec_arp_table *entry;
+	int bucket;
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	for (bucket = 0; bucket < LEC_ARP_TABLE_SIZE; bucket++) {
+		hlist_for_each_entry(entry, pos,
+				     &priv->lec_arp_tables[bucket], next) {
+			if (memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) != 0)
+				continue;
+			entry->flush_tran_id = tran_id;
+			pr_debug("Set flush transaction id to %lx for %p\n",
+				 tran_id, entry);
+		}
+	}
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+/*
+ * Make vcc the multicast VCC of this LEC (priv->mcast_vcc).
+ * The VCC's pop and push handlers are replaced by lec_pop/lec_push, with
+ * the old ones saved, and a permanent ESI_FORWARD_DIRECT entry for the
+ * all-ones MAC address is added to the table.
+ *
+ * Returns 0 on success or -ENOMEM; on failure the VCC's pop handler is
+ * restored and user_back is cleared.
+ */
+static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc)
+{
+	unsigned long flags;
+	unsigned char mac_addr[] = {
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+	};
+	struct lec_arp_table *to_add;
+	struct lec_vcc_priv *vpriv;
+	int err = 0;
+
+	vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
+	if (!vpriv)
+		return -ENOMEM;
+	vpriv->xoff = 0;
+	vpriv->old_pop = vcc->pop;
+	vcc->user_back = vpriv;
+	vcc->pop = lec_pop;
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	to_add = make_entry(priv, mac_addr);
+	if (!to_add) {
+		/* vpriv is about to be freed: do not leave it reachable. */
+		vcc->pop = vpriv->old_pop;
+		vcc->user_back = NULL;
+		kfree(vpriv);
+		err = -ENOMEM;
+		goto out;
+	}
+	memcpy(to_add->atm_addr, vcc->remote.sas_addr.prv, ATM_ESA_LEN);
+	to_add->status = ESI_FORWARD_DIRECT;
+	to_add->flags |= LEC_PERMANENT_FLAG;
+	to_add->vcc = vcc;
+	to_add->old_push = vcc->push;
+	vcc->push = lec_push;
+	priv->mcast_vcc = vcc;
+	lec_arp_add(priv, to_add);
+out:
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+	return err;
+}
+
+/*
+ * Drop every cache entry that refers to vcc: table and
+ * lec_arp_empty_ones entries are matched on their send vcc,
+ * lec_no_forward and mcast_fwds entries on their recv_vcc.
+ * priv->mcast_vcc is cleared if it was this vcc.
+ */
+static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc)
+{
+	unsigned long flags;
+	struct hlist_node *pos, *tmp;
+	struct lec_arp_table *entry;
+	int bucket;
+
+	pr_debug("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci);
+	dump_arp_table(priv);
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+
+	for (bucket = 0; bucket < LEC_ARP_TABLE_SIZE; bucket++) {
+		hlist_for_each_entry_safe(entry, pos, tmp,
+					  &priv->lec_arp_tables[bucket],
+					  next) {
+			if (entry->vcc != vcc)
+				continue;
+			lec_arp_remove(priv, entry);
+			lec_arp_put(entry);
+			if (priv->mcast_vcc == vcc)
+				priv->mcast_vcc = NULL;
+		}
+	}
+
+	hlist_for_each_entry_safe(entry, pos, tmp,
+				  &priv->lec_arp_empty_ones, next) {
+		if (entry->vcc != vcc)
+			continue;
+		lec_arp_clear_vccs(entry);
+		del_timer(&entry->timer);
+		hlist_del(&entry->next);
+		lec_arp_put(entry);
+	}
+
+	hlist_for_each_entry_safe(entry, pos, tmp,
+				  &priv->lec_no_forward, next) {
+		if (entry->recv_vcc != vcc)
+			continue;
+		lec_arp_clear_vccs(entry);
+		del_timer(&entry->timer);
+		hlist_del(&entry->next);
+		lec_arp_put(entry);
+	}
+
+	hlist_for_each_entry_safe(entry, pos, tmp, &priv->mcast_fwds, next) {
+		if (entry->recv_vcc != vcc)
+			continue;
+		lec_arp_clear_vccs(entry);
+		/* No timer, LANEv2 7.1.20 and 2.3.5.3 */
+		hlist_del(&entry->next);
+		lec_arp_put(entry);
+	}
+
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+	dump_arp_table(priv);
+}
+
+/*
+ * A frame arrived on vcc: if an entry for that vcc is waiting on
+ * lec_arp_empty_ones, give it the frame's source MAC address, mark it
+ * ESI_FORWARD_DIRECT and move it into the table, replacing any entry
+ * already present for that MAC.
+ */
+static void
+lec_arp_check_empties(struct lec_priv *priv,
+		      struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	unsigned long flags;
+	struct hlist_node *pos, *tmp;
+	struct lec_arp_table *entry, *stale;
+	struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data;
+	unsigned char *src;
+#ifdef CONFIG_TR
+	struct lecdatahdr_8025 *tr_hdr = (struct lecdatahdr_8025 *)skb->data;
+
+	/* Token ring devices carry the source address in the 802.5 header. */
+	if (priv->is_trdev)
+		src = tr_hdr->h_source;
+	else
+#endif
+		src = hdr->h_source;
+
+	spin_lock_irqsave(&priv->lec_arp_lock, flags);
+	hlist_for_each_entry_safe(entry, pos, tmp,
+				  &priv->lec_arp_empty_ones, next) {
+		if (entry->vcc != vcc)
+			continue;
+
+		del_timer(&entry->timer);
+		memcpy(entry->mac_addr, src, ETH_ALEN);
+		entry->status = ESI_FORWARD_DIRECT;
+		entry->last_used = jiffies;
+		/* We might have got an entry */
+		stale = lec_arp_find(priv, src);
+		if (stale) {
+			lec_arp_remove(priv, stale);
+			lec_arp_put(stale);
+		}
+		hlist_del(&entry->next);
+		lec_arp_add(priv, entry);
+		goto out;
+	}
+	pr_debug("LEC_ARP: Arp_check_empties: entry not found!\n");
+out:
+	spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
+}
+
+MODULE_LICENSE("GPL");
diff --git a/net/atm/lec.h b/net/atm/lec.h
new file mode 100644
index 00000000..dfc07196
--- /dev/null
+++ b/net/atm/lec.h
@@ -0,0 +1,155 @@
+/*
+ * Lan Emulation client header file
+ *
+ * Marko Kiiskila <mkiiskila@yahoo.com>
+ */
+
+#ifndef _LEC_H_
+#define _LEC_H_
+
+#include <linux/atmdev.h>
+#include <linux/netdevice.h>
+#include <linux/atmlec.h>
+
+#define LEC_HEADER_LEN 16
+
+struct lecdatahdr_8023 {		/* header of a LANE data frame, 802.3 variant */
+	__be16 le_header;
+	unsigned char h_dest[ETH_ALEN];		/* destination MAC */
+	unsigned char h_source[ETH_ALEN];	/* source MAC */
+	__be16 h_type;		/* EtherType, LLC length or 0: see the encoding rules below */
+};
+
+struct lecdatahdr_8025 {		/* header variant read when priv->is_trdev is set */
+	__be16 le_header;
+	unsigned char ac_pad;
+	unsigned char fc;
+	unsigned char h_dest[ETH_ALEN];		/* destination MAC */
+	unsigned char h_source[ETH_ALEN];	/* source MAC */
+};
+
+#define LEC_MINIMUM_8023_SIZE   62
+#define LEC_MINIMUM_8025_SIZE   16
+
+/*
+ * Operations that LANE2 capable device can do. Two first functions
+ * are used to make the device do things. See spec 3.1.3 and 3.1.4.
+ *
+ * The third function is intended for the MPOA component sitting on
+ * top of the LANE device. The MPOA component assigns its own function
+ * to (*associate_indicator)() and the LANE device will use that
+ * function to tell about TLVs it sees floating through.
+ *
+ */
+struct lane2_ops {
+	int (*resolve) (struct net_device *dev, const u8 *dst_mac, int force,
+			u8 **tlvs, u32 *sizeoftlvs);
+	int (*associate_req) (struct net_device *dev, const u8 *lan_dst,
+			      const u8 *tlvs, u32 sizeoftlvs);
+	void (*associate_indicator) (struct net_device *dev, const u8 *mac_addr,
+				     const u8 *tlvs, u32 sizeoftlvs);	/* installed and cleared by the MPOA client */
+};
+
+/*
+ * ATM LAN Emulation supports both LLC & Dix Ethernet EtherType
+ * frames.
+ *
+ * 1. Dix Ethernet EtherType frames encoded by placing EtherType
+ *    field in h_type field. Data follows immediately after header.
+ * 2. LLC Data frames whose total length, including LLC field and data,
+ *    but not padding required to meet the minimum data frame length,
+ *    is less than 1536(0x0600) MUST be encoded by placing that length
+ *    in the h_type field. The LLC field follows header immediately.
+ * 3. LLC data frames longer than this maximum MUST be encoded by placing
+ *    the value 0 in the h_type field.
+ *
+ */
+
+/* Hash table size */
+#define LEC_ARP_TABLE_SIZE 16
+
+struct lec_priv {	/* NOTE: from maximum_unknown_frame_count on, each block comment sits ABOVE its field */
+	unsigned short lecid;			/* Lecid of this client */
+	struct hlist_head lec_arp_empty_ones;
+						/* lec_arp_empty_ones: VCC's that don't have a MAC address attached yet */
+	struct hlist_head lec_arp_tables[LEC_ARP_TABLE_SIZE];
+						/* lec_arp_tables: actual LE ARP table (hash buckets) */
+	struct hlist_head lec_no_forward;
+						/*
+						 * lec_no_forward: VCC's (and forward packets from) which are to
+						 * age out by not using them to forward packets.
+						 * This is because to some LE clients there will be 2 VCCs. Only
+						 * one of them gets used.
+						 */
+	struct hlist_head mcast_fwds;
+						/*
+						 * mcast_fwds: with LANEv2 it is possible that BUS (or a special
+						 * multicast server) establishes multiple Multicast Forward VCCs
+						 * to us. This list collects all those VCCs. LANEv1 client has
+						 * only one item in this list. These entries are not aged out.
+						 */
+	spinlock_t lec_arp_lock;
+	struct atm_vcc *mcast_vcc;		/* Default Multicast Send VCC */
+	struct atm_vcc *lecd;
+	struct delayed_work lec_arp_work;
+	unsigned int maximum_unknown_frame_count;	/* C10 */
+						/*
+						 * max_unknown_frame_time (C11): within this period of time the client
+						 * will send no more than C10 frames to BUS for a given unicast destination.
+						 */
+	unsigned long max_unknown_frame_time;
+						/*
+						 * vcc_timeout_period (C12): if no traffic has been sent in this vcc
+						 * for this period of time, vcc will be torn down.
+						 */
+	unsigned long vcc_timeout_period;
+						/*
+						 * max_retry_count (C13): an LE Client MUST not retry an
+						 * LE_ARP_REQUEST for a given frame's LAN Destination more than
+						 * this many times, after the first LEC_ARP_REQUEST.
+						 */
+	unsigned short max_retry_count;
+						/*
+						 * aging_time (C17): max time the client will maintain an entry in
+						 * its arp cache in absence of a verification of that relationship.
+						 */
+	unsigned long aging_time;
+						/*
+						 * forward_delay_time (C18): max time the client will maintain an
+						 * entry in cache when topology change flag is true.
+						 */
+	unsigned long forward_delay_time;
+	int topology_change;			/* Topology change flag (C19) */
+						/*
+						 * arp_response_time (C20): max time the client expects an
+						 * LE_ARP_REQUEST/LE_ARP_RESPONSE cycle to take.
+						 */
+	unsigned long arp_response_time;
+						/*
+						 * flush_timeout (C21): time limit to wait to receive an LE_FLUSH_RESPONSE
+						 * after the LE_FLUSH_REQUEST has been sent before taking recover action.
+						 */
+	unsigned long flush_timeout;
+						/* path_switching_delay (C22): the time since sending a frame to
+						 * the bus after which the LE Client may assume that the frame has
+						 * been either discarded or delivered to the recipient.
+						 */
+	unsigned long path_switching_delay;
+
+	u8 *tlvs;				/* LANE2: TLVs are new */
+	u32 sizeoftlvs;				/* The size of the tlv array in bytes */
+	int lane_version;			/* LANE2 */
+	int itfnum;				/* e.g. 2 for lec2, 5 for lec5 */
+	struct lane2_ops *lane2_ops;		/* can be NULL for LANE v1 */
+	int is_proxy;				/* bridge between ATM and Ethernet */
+	int is_trdev;				/* Device type, 0 = Ethernet, 1 = TokenRing */
+};
+
+struct lec_vcc_priv {		/* per-VCC data, reached through vcc->user_back */
+	void (*old_pop) (struct atm_vcc *vcc, struct sk_buff *skb);
+	int xoff;
+};
+
+#define LEC_VCC_PRIV(vcc)	((struct lec_vcc_priv *)((vcc)->user_back))	/* typed view of user_back */
+
+#endif				/* _LEC_H_ */
diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h
new file mode 100644
index 00000000..ec67435a
--- /dev/null
+++ b/net/atm/lec_arpc.h
@@ -0,0 +1,96 @@
+/*
+ * Lec arp cache
+ *
+ * Marko Kiiskila <mkiiskila@yahoo.com>
+ */
+#ifndef _LEC_ARP_H_
+#define _LEC_ARP_H_
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/if_ether.h>
+#include <linux/atmlec.h>
+
+struct lec_arp_table {		/* one entry of the LEC ARP cache */
+	struct hlist_node next;		/* Linked entry list */
+	unsigned char atm_addr[ATM_ESA_LEN];	/* Atm address */
+	unsigned char mac_addr[ETH_ALEN];	/* Mac address */
+	int is_rdesc;			/* Mac address is a route descriptor */
+	struct atm_vcc *vcc;		/* Vcc this entry is attached */
+	struct atm_vcc *recv_vcc;	/* Vcc we receive data from */
+
+	void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb);
+					/* Push that leads to daemon */
+
+	void (*old_recv_push) (struct atm_vcc *vcc, struct sk_buff *skb);
+					/* Push that leads to daemon */
+
+	unsigned long last_used;	/* For expiry */
+	unsigned long timestamp;	/* Used for various timestamping things:
+					 * 1. FLUSH started
+					 *    (status=ESI_FLUSH_PENDING)
+					 * 2. Counting to
+					 *    max_unknown_frame_time
+					 *    (status=ESI_ARP_PENDING||
+					 *     status=ESI_VC_PENDING)
+					 */
+	unsigned char no_tries;		/* No of times arp retry has been tried */
+	unsigned char status;		/* Status of this entry, one of ESI_* */
+	unsigned short flags;		/* Flags for this entry, LEC_*_FLAG bits */
+	unsigned short packets_flooded;	/* Data packets flooded */
+	unsigned long flush_tran_id;	/* Transaction id in flush protocol */
+	struct timer_list timer;	/* Arping timer */
+	struct lec_priv *priv;		/* Pointer back */
+	u8 *tlvs;
+	u32 sizeoftlvs;			/*
+					 * LANE2: Each MAC address can have TLVs
+					 * associated with it.  sizeoftlvs tells
+					 * the length of the tlvs array
+					 */
+	struct sk_buff_head tx_wait;	/* wait queue for outgoing packets */
+	atomic_t usage;			/* usage count */
+};
+
+/*
+ * LANE2: Template tlv struct for accessing
+ * the tlvs in the lec_arp_table->tlvs array
+ */
+struct tlv {
+	u32 type;
+	u8 length;		/* presumably the number of bytes used in value[] - confirm */
+	u8 value[255];		/* 255 is the largest count a u8 length can express */
+};
+
+/* Status fields */
+#define ESI_UNKNOWN 0		/*
+				 * Next packet sent to this mac address
+				 * causes ARP-request to be sent
+				 */
+#define ESI_ARP_PENDING 1	/*
+				 * There is no ATM address associated with this
+				 * 48-bit address.  The LE-ARP protocol is in
+				 * progress.
+				 */
+#define ESI_VC_PENDING 2	/*
+				 * There is a valid ATM address associated with
+				 * this 48-bit address but there is no VC set
+				 * up to that ATM address.  The signaling
+				 * protocol is in process.
+				 */
+#define ESI_FLUSH_PENDING 4	/*
+				 * The LEC has been notified of the FLUSH_START
+				 * status and it is assumed that the flush
+				 * protocol is in process.
+				 */
+#define ESI_FORWARD_DIRECT 5	/*
+				 * Either the Path Switching Delay (C22) has
+				 * elapsed or the LEC has notified the Mapping
+				 * that the flush protocol has completed.  In
+				 * either case, it is safe to forward packets
+				 * to this address via the data direct VC.
+				 */
+
+/* Flag values */
+#define LEC_REMOTE_FLAG      0x0001
+#define LEC_PERMANENT_FLAG   0x0002
+
+#endif /* _LEC_ARP_H_ */
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
new file mode 100644
index 00000000..3ccca42e
--- /dev/null
+++ b/net/atm/mpc.c
@@ -0,0 +1,1537 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+
+/* We are an ethernet device */
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/uaccess.h>
+#include <asm/byteorder.h>
+#include <net/checksum.h>   /* for ip_fast_csum() */
+#include <net/arp.h>
+#include <net/dst.h>
+#include <linux/proc_fs.h>
+
+/* And atm device */
+#include <linux/atmdev.h>
+#include <linux/atmlec.h>
+#include <linux/atmmpc.h>
+/* Modular too */
+#include <linux/module.h>
+
+#include "lec.h"
+#include "mpc.h"
+#include "resources.h"
+
+/*
+ * mpc.c: Implementation of MPOA client kernel part
+ */
+
+#if 0	/* change to 1 to compile in the dprintk() debug output */
+#define dprintk(format, args...) \
+	printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args)
+#define dprintk_cont(format, args...) printk(KERN_CONT format, ##args)
+#else	/* disabled: "if (0)" never prints, but the printk call is still compiled */
+#define dprintk(format, args...)					\
+	do { if (0)							\
+		printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\
+	} while (0)
+#define dprintk_cont(format, args...)			\
+	do { if (0) printk(KERN_CONT format, ##args); } while (0)
+#endif
+
+#if 0	/* separate switch for the second debug macro pair, ddprintk() */
+#define ddprintk(format, args...) \
+	printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args)
+#define ddprintk_cont(format, args...) printk(KERN_CONT format, ##args)
+#else	/* disabled in the same way as dprintk() above */
+#define ddprintk(format, args...)					\
+	do { if (0)							\
+		printk(KERN_DEBUG "mpoa:%s: " format, __func__, ##args);\
+	} while (0)
+#define ddprintk_cont(format, args...)			\
+	do { if (0) printk(KERN_CONT format, ##args); } while (0)
+#endif
+
+/* mpc_daemon -> kernel */
+static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc);
+static void mps_death(struct k_message *msg, struct mpoa_client *mpc);
+static void clean_up(struct k_message *msg, struct mpoa_client *mpc,
+		     int action);
+static void MPOA_cache_impos_rcvd(struct k_message *msg,
+				  struct mpoa_client *mpc);
+static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg,
+				   struct mpoa_client *mpc);
+static void set_mps_mac_addr_rcvd(struct k_message *mesg,
+				  struct mpoa_client *mpc);
+
+static const uint8_t *copy_macs(struct mpoa_client *mpc,
+				const uint8_t *router_mac,
+				const uint8_t *tlvs, uint8_t mps_macs,
+				uint8_t device_type);
+static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry);
+
+static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc);
+static void mpoad_close(struct atm_vcc *vcc);
+static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb);
+
+static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb);
+static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
+				   struct net_device *dev);
+static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
+			       unsigned long event, void *dev);
+static void mpc_timer_refresh(void);
+static void mpc_cache_check(unsigned long checking_time);
+
+static struct llc_snap_hdr llc_snap_mpoa_ctrl = {
+	0xaa, 0xaa, 0x03,
+	{0x00, 0x00, 0x5e},
+	{0x00, 0x03}         /* For MPOA control PDUs */
+};
+static struct llc_snap_hdr llc_snap_mpoa_data = {
+	0xaa, 0xaa, 0x03,
+	{0x00, 0x00, 0x00},
+	{0x08, 0x00}         /* This is for IP PDUs only */
+};
+static struct llc_snap_hdr llc_snap_mpoa_data_tagged = {
+	0xaa, 0xaa, 0x03,
+	{0x00, 0x00, 0x00},
+	{0x88, 0x4c}         /* This is for tagged data PDUs */
+};
+
+static struct notifier_block mpoa_notifier = {
+	.notifier_call	= mpoa_event_listener,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+struct mpoa_client *mpcs = NULL; /* FIXME */
+static struct atm_mpoa_qos *qos_head = NULL;
+static DEFINE_TIMER(mpc_timer, NULL, 0, 0);
+
+
+/* Find the MPOA client whose dev_num equals itf; NULL when there is none. */
+static struct mpoa_client *find_mpc_by_itfnum(int itf)
+{
+	struct mpoa_client *cur;
+
+	/* mpcs is the head of the global client list */
+	for (cur = mpcs; cur != NULL; cur = cur->next) {
+		if (cur->dev_num == itf)
+			break;
+	}
+
+	return cur;
+}
+
+/* Find the MPOA client whose mpoad_vcc is vcc; NULL when there is none. */
+static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc)
+{
+	struct mpoa_client *cur;
+
+	/* mpcs is the head of the global client list */
+	for (cur = mpcs; cur != NULL; cur = cur->next) {
+		if (cur->mpoad_vcc == vcc)
+			break;
+	}
+
+	return cur;
+}
+
+/* Find the MPOA client whose dev pointer is dev; NULL when there is none. */
+static struct mpoa_client *find_mpc_by_lec(struct net_device *dev)
+{
+	struct mpoa_client *cur;
+
+	/* mpcs is the head of the global client list */
+	for (cur = mpcs; cur != NULL; cur = cur->next) {
+		if (cur->dev == dev)
+			break;
+	}
+
+	return cur;
+}
+
+/*
+ * Functions for managing QoS list
+ */
+
+/*
+ * Record qos for dst_ip. An existing entry is overwritten in place;
+ * otherwise a new entry is allocated and put at the head of the list.
+ * Returns the entry, or NULL if the allocation fails.
+ */
+struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos)
+{
+	struct atm_mpoa_qos *node = atm_mpoa_search_qos(dst_ip);
+
+	if (node == NULL) {
+		node = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL);
+		if (node == NULL) {
+			pr_info("mpoa: out of memory\n");
+			return NULL;
+		}
+		node->ipaddr = dst_ip;
+		node->qos = *qos;
+
+		/* link in front of the current head */
+		node->next = qos_head;
+		qos_head = node;
+	} else {
+		node->qos = *qos;
+	}
+
+	return node;
+}
+
+/* Linear search of the QoS list; returns the entry for dst_ip or NULL. */
+struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip)
+{
+	struct atm_mpoa_qos *cur;
+
+	for (cur = qos_head; cur != NULL; cur = cur->next) {
+		if (cur->ipaddr == dst_ip)
+			return cur;
+	}
+
+	/* no entry for this address */
+	return NULL;
+}
+
+/*
+ * Unlink entry from the QoS list and free it.
+ * Returns 1 on success, 0 if entry is NULL or is not on the list.
+ */
+int atm_mpoa_delete_qos(struct atm_mpoa_qos *entry)
+{
+	struct atm_mpoa_qos **link;
+
+	if (entry == NULL)
+		return 0;
+
+	/*
+	 * Walk the "next" links themselves (starting with qos_head) so
+	 * the head and interior entries are unlinked the same way.
+	 */
+	for (link = &qos_head; *link != NULL; link = &(*link)->next) {
+		if (*link != entry)
+			continue;
+
+		*link = entry->next;
+		kfree(entry);
+		return 1;
+	}
+
+	/* entry was not found */
+	return 0;
+}
+
+/*
+ * Dump the shortcut QoS list to m: an address line, then TX and RX rows.
+ * this is buggered - we need locking for qos_head
+ */
+void atm_mpoa_disp_qos(struct seq_file *m)
+{
+	struct atm_mpoa_qos *cur;
+
+	/* table heading */
+	seq_printf(m, "QoS entries for shortcuts:\n");
+	seq_printf(m, "IP address\n  TX:max_pcr pcr     min_pcr max_cdv max_sdu\n  RX:max_pcr pcr     min_pcr max_cdv max_sdu\n");
+
+	for (cur = qos_head; cur != NULL; cur = cur->next) {
+		seq_printf(m, "%pI4\n     %-7d %-7d %-7d %-7d %-7d\n     %-7d %-7d %-7d %-7d %-7d\n",
+			   &cur->ipaddr,
+			   /* TX row */
+			   cur->qos.txtp.max_pcr, cur->qos.txtp.pcr,
+			   cur->qos.txtp.min_pcr, cur->qos.txtp.max_cdv,
+			   cur->qos.txtp.max_sdu,
+			   /* RX row */
+			   cur->qos.rxtp.max_pcr, cur->qos.rxtp.pcr,
+			   cur->qos.rxtp.min_pcr, cur->qos.rxtp.max_cdv,
+			   cur->qos.rxtp.max_sdu);
+	}
+}
+
+/* Look up the net device named "lec<itf>" in init_net. */
+static struct net_device *find_lec_by_itfnum(int itf)
+{
+	char ifname[IFNAMSIZ];
+
+	sprintf(ifname, "lec%d", itf);
+
+	/* whatever dev_get_by_name() finds is returned unchanged */
+	return dev_get_by_name(&init_net, ifname);
+}
+
+/*
+ * Allocate a zeroed client with default MPC_P* parameters and make it the
+ * head of the global mpcs list.  Returns NULL if the allocation fails.
+ */
+static struct mpoa_client *alloc_mpc(void)
+{
+	struct mpoa_client *client = kzalloc(sizeof(*client), GFP_KERNEL);
+
+	if (!client)
+		return NULL;
+	rwlock_init(&client->ingress_lock);
+	rwlock_init(&client->egress_lock);
+	client->next = mpcs;
+	atm_mpoa_init_cache(client);
+	client->parameters.mpc_p1 = MPC_P1;
+	client->parameters.mpc_p2 = MPC_P2;
+	memset(client->parameters.mpc_p3, 0, sizeof(client->parameters.mpc_p3));
+	client->parameters.mpc_p4 = MPC_P4;
+	client->parameters.mpc_p5 = MPC_P5;
+	client->parameters.mpc_p6 = MPC_P6;
+	mpcs = client;	/* publish as the new list head */
+	return client;
+}
+
+/*
+ * start_mpc() puts the MPC on line: dev's netdev_ops are swapped for a
+ * private copy whose ndo_start_xmit is mpc_send_packet(), so packets sent
+ * through the lec underneath us are monitored and shortcuts can be
+ * established.  The original ops are remembered in mpc->old_ops.
+ */
+static void start_mpc(struct mpoa_client *mpc, struct net_device *dev)
+{
+	dprintk("(%s)\n", mpc->dev->name);
+
+	if (dev->netdev_ops == NULL) {
+		pr_info("(%s) not starting\n", dev->name);
+		return;
+	}
+
+	mpc->old_ops = dev->netdev_ops;
+	mpc->new_ops = *mpc->old_ops;	/* copy, then override xmit only */
+	mpc->new_ops.ndo_start_xmit = mpc_send_packet;
+	dev->netdev_ops = &mpc->new_ops;
+}
+
+/* Undo start_mpc(): give the lec device its original netdev_ops back. */
+static void stop_mpc(struct mpoa_client *mpc)
+{
+	struct net_device *lec_dev = mpc->dev;
+
+	dprintk("(%s)", mpc->dev->name);
+	/* Only restore if our ops are still the ones installed on the device */
+	if (lec_dev->netdev_ops == &mpc->new_ops) {
+		dprintk_cont("\n");
+		lec_dev->netdev_ops = mpc->old_ops;
+		mpc->old_ops = NULL;
+	} else {
+		dprintk_cont(" mpc already stopped, not fatal\n");
+	}
+
+	/* close_shortcuts(mpc);    ??? FIXME */
+}
+
+static const char *mpoa_device_type_string(char type) __attribute__ ((unused));
+
+/* Human-readable name of an MPOA device type value, for debug output. */
+static const char *mpoa_device_type_string(char type)
+{
+	if (type == NON_MPOA)
+		return "non-MPOA device";
+	if (type == MPS)
+		return "MPS";
+	if (type == MPC)
+		return "MPC";
+	if (type == MPS_AND_MPC)
+		return "both MPS and MPC";
+
+	/* any other value */
+	return "unspecified (non-MPOA) device";
+}
+
+/*
+ * lec device calls this via its netdev_priv(dev)->lane2_ops
+ * ->associate_indicator() when it sees a TLV in LE_ARP packet.
+ * We fill in the pointer above when we see a LANE2 lec initializing
+ * See LANE2 spec 3.1.5
+ *
+ * Walks the TLVs (4-byte type, 1-byte length, value) in tlvs[0..sizeoftlvs)
+ * looking for MPOA Device Type TLVs.  For each one that describes an MPS
+ * the MPS control address is passed to the daemon and the MPS MAC
+ * address(es) are stored with copy_macs().  Every TLV that is skipped or
+ * rejected is stepped over as a whole, so that the next header is read
+ * from the right place.  mac_addr is the advertising router's MAC.
+ */
+static void lane2_assoc_ind(struct net_device *dev, const u8 *mac_addr,
+			    const u8 *tlvs, u32 sizeoftlvs)
+{
+	uint32_t type;
+	uint8_t length, mpoa_device_type, number_of_mps_macs;
+	const uint8_t *end_of_tlvs, *next_tlv;
+	struct mpoa_client *mpc;
+
+	mpoa_device_type = number_of_mps_macs = 0; /* silence gcc */
+	dprintk("(%s) received TLV(s), ", dev->name);
+	dprintk("total length of all TLVs %d\n", sizeoftlvs);
+	mpc = find_mpc_by_lec(dev); /* Sampo-Fix: moved here from below */
+	if (mpc == NULL) {
+		pr_info("(%s) no mpc\n", dev->name);
+		return;
+	}
+	end_of_tlvs = tlvs + sizeoftlvs;
+	for (; end_of_tlvs - tlvs >= 5; tlvs = next_tlv) { /* continue => next TLV */
+		type = (((uint32_t)tlvs[0] << 24) | (tlvs[1] << 16) |
+			(tlvs[2] << 8) | tlvs[3]); /* cast keeps bit 31 out of int */
+		length = tlvs[4];
+		tlvs += 5;	/* now at the first value byte */
+		dprintk("    type 0x%x length %02x\n", type, length);
+		if (tlvs + length > end_of_tlvs) {
+			pr_info("TLV value extends past its buffer, aborting parse\n");
+			return;
+		}
+		next_tlv = tlvs + length;
+
+		if (type == 0) {
+			pr_info("mpoa: (%s) TLV type was 0, returning\n",
+				dev->name);
+			return;
+		}
+
+		if (type != TLV_MPOA_DEVICE_TYPE)
+			continue;  /* skip other TLVs */
+		if (length < 2) { /* no room for device type + MAC count */
+			pr_info("(%s) short MPOA Device Type TLV\n", dev->name);
+			continue;
+		}
+		mpoa_device_type = *tlvs++;
+		number_of_mps_macs = *tlvs++;
+		dprintk("(%s) MPOA device type '%s', ",
+			dev->name, mpoa_device_type_string(mpoa_device_type));
+		if (mpoa_device_type == MPS_AND_MPC &&
+		    length < (42 + number_of_mps_macs*ETH_ALEN)) { /* :) */
+			pr_info("(%s) short MPOA Device Type TLV\n",
+				dev->name);
+			continue;
+		}
+		if ((mpoa_device_type == MPS || mpoa_device_type == MPC) &&
+		    length < 22 + number_of_mps_macs*ETH_ALEN) {
+			pr_info("(%s) short MPOA Device Type TLV\n", dev->name);
+			continue;
+		}
+		if (mpoa_device_type != MPS &&
+		    mpoa_device_type != MPS_AND_MPC) {
+			dprintk("ignoring non-MPS device ");
+			continue;  /* we are only interested in MPSs */
+		}
+		if (number_of_mps_macs == 0 &&
+		    mpoa_device_type == MPS_AND_MPC) {
+			pr_info("(%s) MPS_AND_MPC has zero MACs\n", dev->name);
+			continue;  /* someone should read the spec */
+		}
+		dprintk_cont("this MPS has %d MAC addresses\n",
+			     number_of_mps_macs);
+
+		/*
+		 * ok, now we can go and tell our daemon
+		 * the control address of MPS
+		 */
+		send_set_mps_ctrl_addr(tlvs, mpc);
+
+		if (copy_macs(mpc, mac_addr, tlvs, number_of_mps_macs,
+			      mpoa_device_type) == NULL)
+			return;	/* MAC table could not be allocated */
+	}
+	if (end_of_tlvs - tlvs != 0)
+		pr_info("(%s) ignoring %Zd bytes of trailing TLV garbage\n",
+			dev->name, end_of_tlvs - tlvs);
+}
+
+/*
+ * Fill mpc->mps_macs from an MPOA Device Type TLV: router_mac when the TLV
+ * lists no MACs, otherwise the mps_macs addresses found in the TLV.
+ * Returns the position after the consumed bytes, NULL if kmalloc() fails.
+ */
+static const uint8_t *copy_macs(struct mpoa_client *mpc,
+				const uint8_t *router_mac,
+				const uint8_t *tlvs, uint8_t mps_macs,
+				uint8_t device_type)
+{
+	int wanted = (mps_macs > 1) ? mps_macs : 1;
+
+	if (mpc->number_of_mps_macs != wanted) { /* table size changes */
+		if (mpc->number_of_mps_macs != 0)
+			kfree(mpc->mps_macs);
+		mpc->number_of_mps_macs = 0;
+		mpc->mps_macs = kmalloc(wanted * ETH_ALEN, GFP_KERNEL);
+		if (mpc->mps_macs == NULL) {
+			pr_info("(%s) out of mem\n", mpc->dev->name);
+			return NULL;
+		}
+	}
+	/* default entry; overwritten below when the TLV carries MACs */
+	memcpy(mpc->mps_macs, router_mac, ETH_ALEN);
+
+	/* step over 20 bytes, or 40 for a combined MPS and MPC */
+	tlvs += (device_type == MPS_AND_MPC) ? 40 : 20;
+	if (mps_macs > 0)
+		memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN);
+	mpc->number_of_mps_macs = wanted;
+	return tlvs + mps_macs*ETH_ALEN;
+}
+
+/*
+ * Try to send an IP packet over the shortcut VCC of its ingress cache
+ * entry.  Returns 0 when the skb was handed to the shortcut, 1 when the
+ * caller must send it the normal way (the skb is then left unmodified).
+ */
+static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
+{
+	in_cache_entry *entry;
+	struct iphdr *iph;
+	__be32 ipaddr = 0;
+
+	/* on the stack: a static copy would be shared by concurrent senders */
+	struct {
+		struct llc_snap_hdr hdr;
+		__be32 tag;
+	} tagged_llc_snap_hdr = {
+		{0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x88, 0x4c}},
+		0
+	};
+
+	iph = (struct iphdr *)(skb->data + mpc->dev->hard_header_len);
+	ipaddr = iph->daddr;
+
+	ddprintk("(%s) ipaddr 0x%x\n", mpc->dev->name, ipaddr);
+
+	entry = mpc->in_ops->get(ipaddr, mpc);
+	if (entry == NULL) {
+		entry = mpc->in_ops->add_entry(ipaddr, mpc);
+		if (entry != NULL)
+			mpc->in_ops->put(entry);
+		return 1;
+	}
+	/* threshold not exceeded or VCC not ready */
+	if (mpc->in_ops->cache_hit(entry, mpc) != OPEN) {
+		ddprintk("(%s) cache_hit: returns != OPEN\n", mpc->dev->name);
+		mpc->in_ops->put(entry);
+		return 1;
+	}
+
+	ddprintk("(%s) using shortcut\n", mpc->dev->name);
+	/* MPOA spec A.1.4, MPOA client must decrement IP ttl at least by one */
+	if (iph->ttl <= 1) {
+		ddprintk("(%s) IP ttl = %u, using LANE\n", mpc->dev->name, iph->ttl);
+		mpc->in_ops->put(entry);
+		return 1;
+	}
+	iph->ttl--;
+	iph->check = 0;	/* zero the field before recomputing the checksum */
+	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+	if (entry->ctrl_info.tag != 0) {
+		ddprintk("(%s) adding tag 0x%x\n",
+			 mpc->dev->name, entry->ctrl_info.tag);
+		tagged_llc_snap_hdr.tag = entry->ctrl_info.tag;
+		skb_pull(skb, ETH_HLEN);	/* get rid of Eth header */
+		skb_push(skb, sizeof(tagged_llc_snap_hdr));
+						/* add LLC/SNAP header + tag */
+		skb_copy_to_linear_data(skb, &tagged_llc_snap_hdr,
+					sizeof(tagged_llc_snap_hdr));
+	} else {
+		skb_pull(skb, ETH_HLEN);	/* get rid of Eth header */
+		skb_push(skb, sizeof(struct llc_snap_hdr));
+						/* add LLC/SNAP header */
+		skb_copy_to_linear_data(skb, &llc_snap_mpoa_data,
+					sizeof(struct llc_snap_hdr));
+	}
+
+	atomic_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc);
+	ATM_SKB(skb)->atm_options = entry->shortcut->atm_options;
+	entry->shortcut->send(entry->shortcut, skb);
+	entry->packets_fwded++;
+	mpc->in_ops->put(entry);
+
+	return 0;
+}
+
+/*
+ * ndo_start_xmit installed by start_mpc(): IP packets addressed to a known
+ * MPS MAC are offered to the shortcut, the rest go to the lec's own xmit.
+ */
+static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	struct mpoa_client *mpc;
+	struct ethhdr *eth;
+	int i;
+
+	mpc = find_mpc_by_lec(dev); /* this should NEVER fail */
+	if (mpc == NULL) {
+		pr_info("(%s) no MPC found\n", dev->name);
+		dev_kfree_skb_any(skb);	/* no mpc, so no old_ops to pass it to */
+		return NETDEV_TX_OK;
+	}
+
+	eth = (struct ethhdr *)skb->data;
+	if (eth->h_proto != htons(ETH_P_IP))
+		goto non_ip; /* Multi-Protocol Over ATM :-) */
+
+	/* Weed out funny packets (e.g., AF_PACKET or raw). */
+	if (skb->len < ETH_HLEN + sizeof(struct iphdr))
+		goto non_ip;
+	skb_set_network_header(skb, ETH_HLEN);
+	if (skb->len < ETH_HLEN + ip_hdr(skb)->ihl * 4 || ip_hdr(skb)->ihl < 5)
+		goto non_ip;
+
+	for (i = 0; i < mpc->number_of_mps_macs; i++) {
+		if (!compare_ether_addr(eth->h_dest, mpc->mps_macs + i*ETH_ALEN) &&
+		    send_via_shortcut(skb, mpc) == 0) /* try shortcut */
+			return NETDEV_TX_OK;
+	}
+
+non_ip:
+	return mpc->old_ops->ndo_start_xmit(skb, dev);
+}
+
+/*
+ * Attach a shortcut SVC to the MPOA client selected by the atmmpc_ioc in
+ * arg.  Returns 0, -EFAULT if arg is unreadable, else -EINVAL.
+ */
+static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
+{
+	int bytes_left;
+	struct mpoa_client *mpc;
+	struct atmmpc_ioc ioc_data;
+	in_cache_entry *in_entry;
+
+	bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmmpc_ioc));
+	if (bytes_left != 0) {
+		pr_info("mpoa:Short read (missed %d bytes) from userland\n",
+			bytes_left);
+		return -EFAULT;
+	}
+	if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF)
+		return -EINVAL;
+
+	mpc = find_mpc_by_itfnum(ioc_data.dev_num);
+	if (mpc == NULL || mpc->dev == NULL)	/* dev is used below and by mpc_push() */
+		return -EINVAL;
+
+	if (ioc_data.type == MPC_SOCKET_INGRESS) {
+		in_entry = mpc->in_ops->get(ioc_data.ipaddr, mpc);
+		if (in_entry == NULL || in_entry->entry_state < INGRESS_RESOLVED) {
+			pr_info("(%s) did not find RESOLVED entry from ingress cache\n",
+				mpc->dev->name);
+			if (in_entry != NULL)
+				mpc->in_ops->put(in_entry);
+			return -EINVAL;
+		}
+		pr_info("(%s) attaching ingress SVC, entry = %pI4\n",
+			mpc->dev->name, &in_entry->ctrl_info.in_dst_ip);
+		in_entry->shortcut = vcc;
+		mpc->in_ops->put(in_entry);
+	} else {
+		pr_info("(%s) attaching egress SVC\n", mpc->dev->name);
+	}
+
+	vcc->proto_data = mpc->dev;	/* mpc_push() reads the device back from here */
+	vcc->push = mpc_push;
+	return 0;
+}
+
+/*
+ * A shortcut VCC is closing: clear it from the cache entries that use it.
+ */
+static void mpc_vcc_close(struct atm_vcc *vcc, struct net_device *dev)
+{
+	struct mpoa_client *mpc = find_mpc_by_lec(dev);
+	in_cache_entry *ingress;
+	eg_cache_entry *egress;
+
+	if (mpc == NULL) {
+		pr_info("(%s) close for unknown MPC\n", dev->name);
+		return;
+	}
+
+	dprintk("(%s)\n", dev->name);
+	/* forget the vcc in the ingress cache ... */
+	ingress = mpc->in_ops->get_by_vcc(vcc, mpc);
+	if (ingress != NULL) {
+		dprintk("(%s) ingress SVC closed ip = %pI4\n",
+			mpc->dev->name, &ingress->ctrl_info.in_dst_ip);
+		ingress->shortcut = NULL;
+		mpc->in_ops->put(ingress);
+	}
+	/* ... and in the egress cache */
+	egress = mpc->eg_ops->get_by_vcc(vcc, mpc);
+	if (egress != NULL) {
+		dprintk("(%s) egress SVC closed\n", mpc->dev->name);
+		egress->shortcut = NULL;
+		mpc->eg_ops->put(egress);
+	}
+	if (ingress == NULL && egress == NULL)
+		dprintk("(%s) unused vcc closed\n", dev->name);
+}
+
+/*
+ * push() handler for shortcut VCCs (set in atm_mpoa_vcc_attach()).  A NULL
+ * skb means the VCC is closing.  Control PDUs are queued on the socket;
+ * tagged data PDUs get their egress entry's MAC header and go to netif_rx().
+ */
+static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct net_device *dev = (struct net_device *)vcc->proto_data;
+	struct sk_buff *new_skb;
+	eg_cache_entry *eg;
+	struct mpoa_client *mpc;
+	__be32 tag;
+
+	ddprintk("(%s)\n", dev->name);
+	if (skb == NULL) {
+		dprintk("(%s) null skb, closing VCC\n", dev->name);
+		mpc_vcc_close(vcc, dev);
+		return;
+	}
+
+	skb->dev = dev;
+	if (memcmp(skb->data, &llc_snap_mpoa_ctrl,
+		   sizeof(struct llc_snap_hdr)) == 0) {
+		struct sock *sk = sk_atm(vcc);
+
+		dprintk("(%s) control packet arrived\n", dev->name);
+		/* Pass control packets to daemon */
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		sk->sk_data_ready(sk, skb->len);
+		return;
+	}
+
+	/* data coming over the shortcut */
+	atm_return(vcc, skb->truesize);
+
+	mpc = find_mpc_by_lec(dev);
+	if (mpc == NULL) {
+		pr_info("(%s) unknown MPC\n", dev->name);
+		dev_kfree_skb_any(skb);	/* free it here like the other drop paths */
+		return;
+	}
+
+	if (memcmp(skb->data, &llc_snap_mpoa_data_tagged,
+		   sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */
+		ddprintk("(%s) tagged data packet arrived\n", dev->name);
+	} else if (memcmp(skb->data, &llc_snap_mpoa_data,
+			  sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */
+		pr_info("(%s) Unsupported non-tagged data packet arrived.  Purging\n",
+			dev->name);
+		dev_kfree_skb_any(skb);
+		return;
+	} else {
+		pr_info("(%s) garbage arrived, purging\n", dev->name);
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	tag = *(__be32 *)(skb->data + sizeof(struct llc_snap_hdr));
+	eg = mpc->eg_ops->get_by_tag(tag, mpc);
+	if (eg == NULL) {
+		pr_info("mpoa: (%s) Didn't find egress cache entry, tag = %u\n",
+			dev->name, tag);
+		purge_egress_shortcut(vcc, NULL);
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	/*
+	 * See if ingress MPC is using shortcut we opened as a return channel.
+	 * This means we have a bi-directional vcc opened by us.
+	 */
+	if (eg->shortcut == NULL) {
+		eg->shortcut = vcc;
+		pr_info("(%s) egress SVC in use\n", dev->name);
+	}
+
+	/* drop LLC/SNAP + tag; the MAC header that replaces them is longer */
+	skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag));
+	new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length);
+	dev_kfree_skb_any(skb);
+	if (new_skb == NULL) {
+		mpc->eg_ops->put(eg);
+		return;
+	}
+	skb_push(new_skb, eg->ctrl_info.DH_length);     /* add MAC header */
+	skb_copy_to_linear_data(new_skb, eg->ctrl_info.DLL_header,
+				eg->ctrl_info.DH_length);
+	new_skb->protocol = eth_type_trans(new_skb, dev);
+	skb_reset_network_header(new_skb);
+
+	eg->latest_ip_addr = ip_hdr(new_skb)->saddr;
+	eg->packets_rcvd++;
+	mpc->eg_ops->put(eg);
+	memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
+	netif_rx(new_skb);
+}
+
+static struct atmdev_ops mpc_ops = { /* only send is required */
+	.close	= mpoad_close,
+	.send	= msg_from_mpoad
+};
+
+static struct atm_dev mpc_dev = {	/* atm_mpoa_mpoad_attach() points the mpoad vcc's ->dev here */
+	.ops	= &mpc_ops,
+	.type	= "mpc",
+	.number	= 42,
+	.lock	= __SPIN_LOCK_UNLOCKED(mpc_dev.lock)
+	/* members not explicitly initialised will be 0 */
+};
+
+/*
+ * Bind the mpoad control socket vcc to the MPOA client of lec interface arg.
+ */
+static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg)
+{
+	struct mpoa_client *mpc;
+	struct lec_priv *priv;
+	int err;
+
+	if (mpcs == NULL) {	/* first client: start the timer and notifier */
+		init_timer(&mpc_timer);
+		mpc_timer_refresh();
+		/* This lets us know how our LECs are doing */
+		err = register_netdevice_notifier(&mpoa_notifier);
+		if (err < 0) {
+			del_timer(&mpc_timer);
+			return err;
+		}
+	}
+
+	mpc = find_mpc_by_itfnum(arg);
+	if (mpc == NULL) {
+		dprintk("allocating new mpc for itf %d\n", arg);
+		mpc = alloc_mpc();
+		if (mpc == NULL) {
+			if (mpcs == NULL) { /* undo the first-client setup above */
+				unregister_netdevice_notifier(&mpoa_notifier);
+				del_timer(&mpc_timer);
+			}
+			return -ENOMEM;
+		}
+		mpc->dev_num = arg;
+		mpc->dev = find_lec_by_itfnum(arg); /* NULL if there was no lec */
+	}
+	if (mpc->mpoad_vcc) {
+		pr_info("mpoad is already present for itf %d\n", arg);
+		return -EADDRINUSE;
+	}
+	if (mpc->dev) { /* check if the lec is LANE2 capable */
+		priv = netdev_priv(mpc->dev);
+		if (priv->lane_version < 2) {
+			dev_put(mpc->dev);
+			mpc->dev = NULL;
+		} else
+			priv->lane2_ops->associate_indicator = lane2_assoc_ind;
+	}
+
+	mpc->mpoad_vcc = vcc;
+	vcc->dev = &mpc_dev;
+	vcc_insert_socket(sk_atm(vcc));
+	set_bit(ATM_VF_META, &vcc->flags);
+	set_bit(ATM_VF_READY, &vcc->flags);
+	if (mpc->dev) {
+		char empty[ATM_ESA_LEN];
+		memset(empty, 0, ATM_ESA_LEN);
+		start_mpc(mpc, mpc->dev);
+		/* resend a known address after an mpcd restart, don't wait for LE_ARP */
+		if (memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0)
+			send_set_mps_ctrl_addr(mpc->mps_ctrl_addr, mpc);
+	}
+	__module_get(THIS_MODULE);	/* dropped again in mpoad_close() */
+	return arg;	/* the interface number on success, negative errno above */
+}
+
+static void send_set_mps_ctrl_addr(const char *addr, struct mpoa_client *mpc)
+{	/* Record addr as mpc's MPS control address and report it to mpoad. */
+	struct k_message mesg;
+
+	memmove(mpc->mps_ctrl_addr, addr, ATM_ESA_LEN);	/* addr may be this very buffer */
+	memset(&mesg, 0, sizeof(mesg));	/* no stack garbage in the unused fields */
+	mesg.type = SET_MPS_CTRL_ADDR;
+	memcpy(mesg.MPS_ctrl, addr, ATM_ESA_LEN);
+	msg_to_mpoad(&mesg, mpc);
+}
+
+/*
+ * Close handler of the daemon's control VCC.
+ *
+ * Detaches the daemon from its MPC, stops the MPC on its lec device,
+ * empties both caches, discards messages still queued for the daemon and
+ * drops the module reference taken when the daemon attached.
+ */
+static void mpoad_close(struct atm_vcc *vcc)
+{
+	struct mpoa_client *mpc;
+	struct sk_buff *skb;
+
+	mpc = find_mpc_by_vcc(vcc);
+	if (mpc == NULL) {
+		pr_info("did not find MPC\n");
+		return;
+	}
+	if (!mpc->mpoad_vcc) {
+		pr_info("close for non-present mpoad\n");
+		return;
+	}
+
+	mpc->mpoad_vcc = NULL;
+	if (mpc->dev) {
+		struct lec_priv *priv = netdev_priv(mpc->dev);
+		priv->lane2_ops->associate_indicator = NULL;
+		stop_mpc(mpc);
+		/*
+		 * NOTE(review): mpc->dev is not cleared here and is still
+		 * dereferenced by the pr_info() below - confirm another
+		 * reference keeps the device alive.
+		 */
+		dev_put(mpc->dev);
+	}
+
+	mpc->in_ops->destroy_cache(mpc);
+	mpc->eg_ops->destroy_cache(mpc);
+
+	/* throw away messages the daemon never read, returning their charge */
+	while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) {
+		atm_return(vcc, skb->truesize);
+		kfree_skb(skb);
+	}
+
+	pr_info("(%s) going down\n",
+		(mpc->dev) ? mpc->dev->name : "<unknown>");
+	module_put(THIS_MODULE);
+}
+
+/*
+ * Send handler of the daemon's control VCC: the daemon wrote a k_message.
+ *
+ * The skb is uncharged from the socket's write allocation, the message is
+ * dispatched on its type and the skb is freed. The skb is consumed on
+ * every path, including the one where no MPC belongs to 'vcc'.
+ * Always returns 0.
+ */
+static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+
+	struct mpoa_client *mpc = find_mpc_by_vcc(vcc);
+	struct k_message *mesg = (struct k_message *)skb->data;
+	atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+
+	if (mpc == NULL) {
+		pr_info("no mpc found\n");
+		/* we report success, so nobody else will free the skb */
+		kfree_skb(skb);
+		return 0;
+	}
+	dprintk("(%s)", mpc->dev ? mpc->dev->name : "<unknown>");
+	switch (mesg->type) {
+	case MPOA_RES_REPLY_RCVD:
+		dprintk_cont("mpoa_res_reply_rcvd\n");
+		MPOA_res_reply_rcvd(mesg, mpc);
+		break;
+	case MPOA_TRIGGER_RCVD:
+		dprintk_cont("mpoa_trigger_rcvd\n");
+		MPOA_trigger_rcvd(mesg, mpc);
+		break;
+	case INGRESS_PURGE_RCVD:
+		dprintk_cont("nhrp_purge_rcvd\n");
+		ingress_purge_rcvd(mesg, mpc);
+		break;
+	case EGRESS_PURGE_RCVD:
+		dprintk_cont("egress_purge_reply_rcvd\n");
+		egress_purge_rcvd(mesg, mpc);
+		break;
+	case MPS_DEATH:
+		dprintk_cont("mps_death\n");
+		mps_death(mesg, mpc);
+		break;
+	case CACHE_IMPOS_RCVD:
+		dprintk_cont("cache_impos_rcvd\n");
+		MPOA_cache_impos_rcvd(mesg, mpc);
+		break;
+	case SET_MPC_CTRL_ADDR:
+		dprintk_cont("set_mpc_ctrl_addr\n");
+		set_mpc_ctrl_addr_rcvd(mesg, mpc);
+		break;
+	case SET_MPS_MAC_ADDR:
+		dprintk_cont("set_mps_mac_addr\n");
+		set_mps_mac_addr_rcvd(mesg, mpc);
+		break;
+	case CLEAN_UP_AND_EXIT:
+		dprintk_cont("clean_up_and_exit\n");
+		clean_up(mesg, mpc, DIE);
+		break;
+	case RELOAD:
+		dprintk_cont("reload\n");
+		clean_up(mesg, mpc, RELOAD);
+		break;
+	case SET_MPC_PARAMS:
+		dprintk_cont("set_mpc_params\n");
+		mpc->parameters = mesg->content.params;
+		break;
+	default:
+		dprintk_cont("unknown message %d\n", mesg->type);
+		break;
+	}
+	kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Queue a copy of 'mesg' on the receive queue of the daemon attached to
+ * 'mpc' and wake the daemon up.
+ *
+ * Must not sleep (hence the GFP_ATOMIC allocation).
+ *
+ * Returns 0 on success, -ENXIO when there is no MPC or no daemon, and
+ * -ENOMEM when no skb could be allocated.
+ */
+int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc)
+{
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	if (mpc == NULL || !mpc->mpoad_vcc) {
+		pr_info("mesg %d to a non-existent mpoad\n", mesg->type);
+		return -ENXIO;
+	}
+
+	skb = alloc_skb(sizeof(*mesg), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	/* the payload is exactly one k_message */
+	skb_put(skb, sizeof(*mesg));
+	skb_copy_to_linear_data(skb, mesg, sizeof(*mesg));
+	atm_force_charge(mpc->mpoad_vcc, skb->truesize);
+
+	sk = sk_atm(mpc->mpoad_vcc);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, skb->len);
+
+	return 0;
+}
+
+/*
+ * Netdevice notifier callback that keeps the MPCs in step with the lec
+ * devices of the initial network namespace.
+ *
+ * Only devices whose name starts with "lec" are considered:
+ *   NETDEV_REGISTER    - a LANE2 capable lec gets an MPC (allocated if
+ *                        needed) which takes a reference on the device
+ *   NETDEV_UNREGISTER  - the MPC is stopped and releases the device
+ *   NETDEV_UP / _DOWN  - the MPC is started / stopped if a daemon is
+ *                        attached
+ * All other events are ignored. Always returns NOTIFY_DONE.
+ */
+static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
+			       unsigned long event, void *dev_ptr)
+{
+	struct net_device *dev;
+	struct mpoa_client *mpc;
+	struct lec_priv *priv;
+
+	dev = (struct net_device *)dev_ptr;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	if (dev->name == NULL || strncmp(dev->name, "lec", 3))
+		return NOTIFY_DONE; /* we are only interested in lec:s */
+
+	switch (event) {
+	case NETDEV_REGISTER:       /* a new lec device was allocated */
+		priv = netdev_priv(dev);
+		if (priv->lane_version < 2)
+			break;
+		priv->lane2_ops->associate_indicator = lane2_assoc_ind;
+		mpc = find_mpc_by_itfnum(priv->itfnum);
+		if (mpc == NULL) {
+			dprintk("allocating new mpc for %s\n", dev->name);
+			mpc = alloc_mpc();
+			if (mpc == NULL) {
+				/* terminate the line so later output is not appended */
+				pr_info("no new mpc\n");
+				break;
+			}
+		}
+		mpc->dev_num = priv->itfnum;
+		mpc->dev = dev;
+		dev_hold(dev);
+		dprintk("(%s) was initialized\n", dev->name);
+		break;
+	case NETDEV_UNREGISTER:
+		/* the lec device was deallocated */
+		mpc = find_mpc_by_lec(dev);
+		if (mpc == NULL)
+			break;
+		dprintk("device (%s) was deallocated\n", dev->name);
+		stop_mpc(mpc);
+		dev_put(mpc->dev);
+		mpc->dev = NULL;
+		break;
+	case NETDEV_UP:
+		/* the dev was ifconfig'ed up */
+		mpc = find_mpc_by_lec(dev);
+		if (mpc == NULL)
+			break;
+		if (mpc->mpoad_vcc != NULL)
+			start_mpc(mpc, dev);
+		break;
+	case NETDEV_DOWN:
+		/* the dev was ifconfig'ed down */
+		/* this means that the flow of packets from the
+		 * upper layer stops
+		 */
+		mpc = find_mpc_by_lec(dev);
+		if (mpc == NULL)
+			break;
+		if (mpc->mpoad_vcc != NULL)
+			stop_mpc(mpc);
+		break;
+	case NETDEV_REBOOT:
+	case NETDEV_CHANGE:
+	case NETDEV_CHANGEMTU:
+	case NETDEV_CHANGEADDR:
+	case NETDEV_GOING_DOWN:
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Functions which are called after a message is received from mpcd.
+ * Msg is reused on purpose.
+ */
+
+
+/*
+ * The daemon asks us to resolve the destination named in 'msg'.
+ *
+ * A missing ingress entry is created first. Unless the entry is already
+ * being resolved, it is moved to INGRESS_RESOLVING and 'msg' is reused to
+ * ask the daemon for an MPOA resolution request.
+ */
+static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+	__be32 dst_ip = msg->content.in_info.in_dst_ip;
+	in_cache_entry *entry;
+
+	entry = mpc->in_ops->get(dst_ip, mpc);
+	if (entry == NULL) {
+		entry = mpc->in_ops->add_entry(dst_ip, mpc);
+		/* add_entry() returns NULL when it cannot allocate memory */
+		if (entry == NULL)
+			return;
+	} else if (entry->entry_state != INGRESS_INVALID) {
+		pr_info("(%s) entry already in resolving state\n",
+			(mpc->dev) ? mpc->dev->name : "<unknown>");
+		mpc->in_ops->put(entry);
+		return;
+	}
+
+	/* new or invalid entry: start resolving it */
+	entry->entry_state = INGRESS_RESOLVING;
+	msg->type = SND_MPOA_RES_RQST;
+	msg->content.in_info = entry->ctrl_info;
+	msg_to_mpoad(msg, mpc);
+	do_gettimeofday(&(entry->reply_wait));
+	mpc->in_ops->put(entry);
+}
+
+/*
+ * Pick a shortcut VCC for the ingress 'entry' whose resolution reply is
+ * in 'msg'.
+ *
+ * Things get complicated because we have to check if there's an egress
+ * shortcut with suitable traffic parameters we could use. If there is,
+ * the entry simply shares that VCC; otherwise 'msg' is turned into an
+ * OPEN_INGRESS_SVC request for the daemon, carrying the configured QoS
+ * when its traffic class matches the one in the reply.
+ */
+static void check_qos_and_open_shortcut(struct k_message *msg,
+					struct mpoa_client *client,
+					in_cache_entry *entry)
+{
+	__be32 dst_ip = msg->content.in_info.in_dst_ip;
+	struct atm_mpoa_qos *qos = atm_mpoa_search_qos(dst_ip);
+	eg_cache_entry *eg_entry = client->eg_ops->get_by_src_ip(dst_ip, client);
+
+	if (eg_entry && eg_entry->shortcut) {
+		if (eg_entry->shortcut->qos.txtp.traffic_class &
+		    msg->qos.txtp.traffic_class &
+		    (qos ? qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)) {
+			if (eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR)
+				entry->shortcut = eg_entry->shortcut;
+			else if (eg_entry->shortcut->qos.txtp.max_pcr > 0)
+				entry->shortcut = eg_entry->shortcut;
+		}
+		if (entry->shortcut) {
+			dprintk("(%s) using egress SVC to reach %pI4\n",
+				client->dev ? client->dev->name : "<unknown>",
+				&dst_ip);
+			client->eg_ops->put(eg_entry);
+			return;
+		}
+	}
+	if (eg_entry != NULL)
+		client->eg_ops->put(eg_entry);
+
+	/* No luck in the egress cache we must open an ingress SVC */
+	msg->type = OPEN_INGRESS_SVC;
+	if (qos &&
+	    (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class)) {
+		msg->qos = qos->qos;
+		/* the MPC may exist without a lec device, so guard the name */
+		pr_info("(%s) trying to get a CBR shortcut\n",
+			client->dev ? client->dev->name : "<unknown>");
+	} else
+		memset(&msg->qos, 0, sizeof(struct atm_qos));
+	msg_to_mpoad(msg, client);
+}
+
+/*
+ * The daemon received an MPOA resolution reply for the destination in
+ * 'msg'.
+ *
+ * The matching ingress entry takes over the control information and its
+ * timers are restarted. If the entry has no shortcut yet, one is chosen
+ * or requested through check_qos_and_open_shortcut(). The entry ends up
+ * in INGRESS_RESOLVED.
+ */
+static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+	/* an MPC may have no lec device attached; never print through NULL */
+	const char *name = mpc->dev ? mpc->dev->name : "<unknown>";
+	__be32 dst_ip = msg->content.in_info.in_dst_ip;
+	in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc);
+
+	dprintk("(%s) ip %pI4\n",
+		name, &dst_ip);
+	ddprintk("(%s) entry = %p",
+		 name, entry);
+	if (entry == NULL) {
+		pr_info("(%s) ARGH, received res. reply for an entry that doesn't exist.\n",
+			name);
+		return;
+	}
+	ddprintk_cont(" entry_state = %d ", entry->entry_state);
+
+	if (entry->entry_state == INGRESS_RESOLVED) {
+		pr_info("(%s) RESOLVED entry!\n", name);
+		mpc->in_ops->put(entry);
+		return;
+	}
+
+	entry->ctrl_info = msg->content.in_info;
+	do_gettimeofday(&(entry->tv));
+	do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */
+	entry->refresh_time = 0;
+	ddprintk_cont("entry->shortcut = %p\n", entry->shortcut);
+
+	if (entry->entry_state == INGRESS_RESOLVING &&
+	    entry->shortcut != NULL) {
+		entry->entry_state = INGRESS_RESOLVED;
+		mpc->in_ops->put(entry);
+		return; /* Shortcut already open... */
+	}
+
+	if (entry->shortcut != NULL) {
+		pr_info("(%s) entry->shortcut != NULL, impossible!\n",
+			name);
+		mpc->in_ops->put(entry);
+		return;
+	}
+
+	check_qos_and_open_shortcut(msg, mpc, entry);
+	entry->entry_state = INGRESS_RESOLVED;
+	mpc->in_ops->put(entry);
+}
+
+/*
+ * Remove every ingress entry whose destination matches the address and
+ * mask carried in 'msg'.
+ */
+static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+	/* an MPC may have no lec device attached; never print through NULL */
+	const char *name = mpc->dev ? mpc->dev->name : "<unknown>";
+	__be32 dst_ip = msg->content.in_info.in_dst_ip;
+	__be32 mask = msg->ip_mask;
+	in_cache_entry *entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask);
+
+	if (entry == NULL) {
+		pr_info("(%s) purge for a non-existing entry, ip = %pI4\n",
+			name, &dst_ip);
+		return;
+	}
+
+	do {
+		dprintk("(%s) removing an ingress entry, ip = %pI4\n",
+			name, &dst_ip);
+		/* remove_entry() expects the write lock to be held */
+		write_lock_bh(&mpc->ingress_lock);
+		mpc->in_ops->remove_entry(entry, mpc);
+		write_unlock_bh(&mpc->ingress_lock);
+		/* drop the reference taken by get_with_mask() */
+		mpc->in_ops->put(entry);
+		entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask);
+	} while (entry != NULL);
+}
+
+/*
+ * Remove the egress entry identified by the cache id carried in 'msg',
+ * if there is one.
+ */
+static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc)
+{
+	__be32 cache_id = msg->content.eg_info.cache_id;
+	eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(cache_id, mpc);
+
+	if (entry == NULL) {
+		/* same NULL-safe device name as the other message handlers */
+		dprintk("(%s) purge for a non-existing entry\n",
+			mpc->dev ? mpc->dev->name : "<unknown>");
+		return;
+	}
+
+	/* remove_entry() expects the write lock to be held */
+	write_lock_irq(&mpc->egress_lock);
+	mpc->eg_ops->remove_entry(entry, mpc);
+	write_unlock_irq(&mpc->egress_lock);
+
+	/* drop the reference taken by get_by_cache_id() */
+	mpc->eg_ops->put(entry);
+}
+
+/*
+ * Queue a DATA_PLANE_PURGE message on the receive queue of 'vcc'.
+ * When 'entry' is given, its control information is included.
+ * Allocates atomically; failures are only logged.
+ */
+static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry)
+{
+	struct k_message *purge_msg;
+	struct sk_buff *skb;
+	struct sock *sk;
+
+	dprintk("entering\n");
+	if (!vcc) {
+		pr_info("vcc == NULL\n");
+		return;
+	}
+
+	skb = alloc_skb(sizeof(*purge_msg), GFP_ATOMIC);
+	if (!skb) {
+		pr_info("out of memory\n");
+		return;
+	}
+
+	/* the payload is one zeroed k_message with only the needed fields set */
+	skb_put(skb, sizeof(*purge_msg));
+	purge_msg = (struct k_message *)skb->data;
+	memset(purge_msg, 0, sizeof(*purge_msg));
+	purge_msg->type = DATA_PLANE_PURGE;
+	if (entry)
+		purge_msg->content.eg_info = entry->ctrl_info;
+
+	atm_force_charge(vcc, skb->truesize);
+
+	sk = sk_atm(vcc);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, skb->len);
+	dprintk("exiting\n");
+}
+
+/*
+ * Our MPS died. Tell our daemon to send NHRP data plane purge to each
+ * of the egress shortcuts we have, then empty both caches.
+ *
+ * The message is ignored when it names an MPS other than the one this
+ * MPC is using.
+ */
+static void mps_death(struct k_message *msg, struct mpoa_client *mpc)
+{
+	/* an MPC may have no lec device attached; never print through NULL */
+	const char *name = mpc->dev ? mpc->dev->name : "<unknown>";
+	eg_cache_entry *entry;
+
+	dprintk("(%s)\n", name);
+
+	if (memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)) {
+		pr_info("(%s) wrong MPS\n", name);
+		return;
+	}
+
+	/* FIXME: This knows too much of the cache structure */
+	read_lock_irq(&mpc->egress_lock);
+	entry = mpc->eg_cache;
+	while (entry != NULL) {
+		purge_egress_shortcut(entry->shortcut, entry);
+		entry = entry->next;
+	}
+	read_unlock_irq(&mpc->egress_lock);
+
+	mpc->in_ops->destroy_cache(mpc);
+	mpc->eg_ops->destroy_cache(mpc);
+}
+
+/*
+ * Handle a cache imposition forwarded by the daemon.
+ *
+ * With a non-zero holding time the egress entry named by the cache id is
+ * created or refreshed; with a zero holding time it is removed. Every
+ * reference obtained here is released again, and a missing entry or a
+ * failed allocation is tolerated.
+ */
+static void MPOA_cache_impos_rcvd(struct k_message *msg,
+				  struct mpoa_client *mpc)
+{
+	uint16_t holding_time = msg->content.eg_info.holding_time;
+	eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(msg->content.eg_info.cache_id, mpc);
+
+	dprintk("(%s) entry = %p, holding_time = %u\n",
+		mpc->dev ? mpc->dev->name : "<unknown>", entry, holding_time);
+
+	if (entry == NULL) {
+		/* removal request for something we do not have */
+		if (!holding_time)
+			return;
+		entry = mpc->eg_ops->add_entry(msg, mpc);
+		/* add_entry() returns NULL when it cannot allocate memory */
+		if (entry != NULL)
+			mpc->eg_ops->put(entry);
+		return;
+	}
+
+	if (holding_time) {
+		mpc->eg_ops->update(entry, holding_time);
+		/* drop the reference taken by get_by_cache_id() */
+		mpc->eg_ops->put(entry);
+		return;
+	}
+
+	/* remove_entry() expects the write lock to be held */
+	write_lock_irq(&mpc->egress_lock);
+	mpc->eg_ops->remove_entry(entry, mpc);
+	write_unlock_irq(&mpc->egress_lock);
+
+	mpc->eg_ops->put(entry);
+}
+
+/*
+ * The daemon tells us the control ATM address of this MPC.
+ *
+ * The address is stored in the MPC. If a lec device is present, it is
+ * also wrapped into an MPOA device type TLV that is associated with the
+ * device, followed by a targetless LE_ARP request.
+ */
+static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg,
+				   struct mpoa_client *mpc)
+{
+	uint8_t tlv[4 + 1 + 1 + 1 + ATM_ESA_LEN] = {
+		0x00, 0xa0, 0x3e, 0x2a,	/* type                            */
+		1 + 1 + ATM_ESA_LEN,	/* length                          */
+		0x02,			/* MPOA client                     */
+		0x00,			/* number of MPS MAC addresses     */
+	};
+	struct lec_priv *priv;
+	int i, retval;
+
+	/* MPC ctrl ATM addr follows the fixed TLV header */
+	memcpy(&tlv[7], mesg->MPS_ctrl, ATM_ESA_LEN);
+	memcpy(mpc->our_ctrl_addr, mesg->MPS_ctrl, ATM_ESA_LEN);
+
+	dprintk("(%s) setting MPC ctrl ATM address to",
+		mpc->dev ? mpc->dev->name : "<unknown>");
+	for (i = 7; i < sizeof(tlv); i++)
+		dprintk_cont(" %02x", tlv[i]);
+	dprintk_cont("\n");
+
+	if (!mpc->dev)
+		return;
+
+	priv = netdev_priv(mpc->dev);
+	retval = priv->lane2_ops->associate_req(mpc->dev,
+						mpc->dev->dev_addr,
+						tlv, sizeof(tlv));
+	if (retval == 0)
+		pr_info("(%s) MPOA device type TLV association failed\n",
+			mpc->dev->name);
+
+	retval = priv->lane2_ops->resolve(mpc->dev, NULL, 1, NULL, NULL);
+	if (retval < 0)
+		pr_info("(%s) targetless LE_ARP request failed\n",
+			mpc->dev->name);
+}
+
+/*
+ * Replace the client's list of MPS MAC addresses with the single address
+ * carried in the first ETH_ALEN bytes of msg->MPS_ctrl. If the copy
+ * cannot be allocated, the list is left empty.
+ */
+static void set_mps_mac_addr_rcvd(struct k_message *msg,
+				  struct mpoa_client *client)
+{
+	uint8_t *mac_copy;
+
+	/* forget whatever was stored before */
+	if (client->number_of_mps_macs)
+		kfree(client->mps_macs);
+	client->number_of_mps_macs = 0;
+
+	mac_copy = kmemdup(msg->MPS_ctrl, ETH_ALEN, GFP_KERNEL);
+	client->mps_macs = mac_copy;
+	if (!mac_copy) {
+		pr_info("out of memory\n");
+		return;
+	}
+
+	client->number_of_mps_macs = 1;
+}
+
+/*
+ * Ask the daemon to send an egress purge for every egress cache entry,
+ * then tell it to perform 'action' (DIE or RELOAD). 'msg' is reused for
+ * all messages sent.
+ */
+static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action)
+{
+	eg_cache_entry *cur;
+
+	msg->type = SND_EGRESS_PURGE;
+
+	/* FIXME: This knows too much of the cache structure */
+	read_lock_irq(&mpc->egress_lock);
+	for (cur = mpc->eg_cache; cur != NULL; cur = cur->next) {
+		msg->content.eg_info = cur->ctrl_info;
+		dprintk("cache_id %u\n", cur->ctrl_info.cache_id);
+		msg_to_mpoad(msg, mpc);
+	}
+	read_unlock_irq(&mpc->egress_lock);
+
+	msg->type = action;
+	msg_to_mpoad(msg, mpc);
+}
+
+/*
+ * (Re)arm the cache maintenance timer to fire MPC_P2 seconds from now.
+ * The expiry time is also passed to the handler as its argument.
+ */
+static void mpc_timer_refresh(void)
+{
+	unsigned long when = jiffies + (MPC_P2 * HZ);
+
+	mpc_timer.function = mpc_cache_check;
+	mpc_timer.expires = when;
+	mpc_timer.data = when;
+	add_timer(&mpc_timer);
+}
+
+/*
+ * Timer handler: periodic maintenance of the caches of every MPC.
+ *
+ * On each run the ingress hit counters are cleared and expired entries
+ * of both caches are dropped. The resolving check and the refresh run
+ * only once more than mpc_p4 resp. mpc_p5 seconds have passed since they
+ * last ran. The timer re-arms itself at the end.
+ *
+ * 'checking_time' is the jiffies value the timer was set to expire at.
+ */
+static void mpc_cache_check(unsigned long checking_time)
+{
+	/* shared by all MPCs and kept across invocations */
+	static unsigned long previous_resolving_check_time;
+	static unsigned long previous_refresh_time;
+	struct mpoa_client *client;
+
+	for (client = mpcs; client != NULL; client = client->next) {
+		client->in_ops->clear_count(client);
+		client->eg_ops->clear_expired(client);
+
+		if (checking_time - previous_resolving_check_time >
+		    client->parameters.mpc_p4 * HZ) {
+			client->in_ops->check_resolving(client);
+			previous_resolving_check_time = checking_time;
+		}
+
+		if (checking_time - previous_refresh_time >
+		    client->parameters.mpc_p5 * HZ) {
+			client->in_ops->refresh(client);
+			previous_refresh_time = checking_time;
+		}
+	}
+
+	mpc_timer_refresh();
+}
+
+/*
+ * ioctl entry point for the MPOA commands.
+ *
+ * ATMMPC_CTRL attaches the calling socket as the daemon's control
+ * channel for interface 'arg' and marks the socket connected on success.
+ * ATMMPC_DATA attaches a data VCC described by the user pointer 'arg'.
+ *
+ * Returns -ENOIOCTLCMD for any other command, -EPERM without
+ * CAP_NET_ADMIN, otherwise the result of the attach function.
+ */
+static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct atm_vcc *vcc = ATM_SD(sock);
+	const int is_ctrl = (cmd == ATMMPC_CTRL);
+	int err;
+
+	if (!is_ctrl && cmd != ATMMPC_DATA)
+		return -ENOIOCTLCMD;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!is_ctrl)
+		return atm_mpoa_vcc_attach(vcc, (void __user *)arg);
+
+	err = atm_mpoa_mpoad_attach(vcc, (int)arg);
+	if (err >= 0)
+		sock->state = SS_CONNECTED;
+	return err;
+}
+
+/* ioctl hook registered with the ATM core while the module is loaded */
+static struct atm_ioctl atm_ioctl_ops = {
+	.ioctl	= atm_mpoa_ioctl,
+	.owner	= THIS_MODULE,
+};
+
+/*
+ * Module init: register the MPOA ioctl handler and create the proc
+ * entry. A proc failure is only logged. Always returns 0.
+ */
+static __init int atm_mpoa_init(void)
+{
+	register_atm_ioctl(&atm_ioctl_ops);
+
+	if (mpc_proc_init())
+		pr_info("failed to initialize /proc/mpoa\n");
+
+	pr_info("mpc.c: initialized\n");
+	return 0;
+}
+
+/*
+ * Module exit: undo atm_mpoa_init() and free all state.
+ *
+ * Removes the proc entry, stops the cache timer, unregisters the
+ * netdevice notifier and the ioctl handler, then tears down every MPC
+ * (stopping it on its lec, clearing the LANE2 indicator, destroying both
+ * caches) and finally frees the QoS list.
+ */
+static void __exit atm_mpoa_cleanup(void)
+{
+	struct mpoa_client *mpc, *tmp;
+	struct atm_mpoa_qos *qos, *nextqos;
+	struct lec_priv *priv;
+
+	mpc_proc_clean();
+
+	/*
+	 * NOTE(review): the handler re-arms the timer itself, and del_timer()
+	 * does not wait for a handler that is currently running - confirm
+	 * the timer cannot fire after the module text is gone.
+	 */
+	del_timer(&mpc_timer);
+	unregister_netdevice_notifier(&mpoa_notifier);
+	deregister_atm_ioctl(&atm_ioctl_ops);
+
+	/* detach the list first so nothing finds the MPCs while they are freed */
+	mpc = mpcs;
+	mpcs = NULL;
+	while (mpc != NULL) {
+		tmp = mpc->next;
+		if (mpc->dev != NULL) {
+			stop_mpc(mpc);
+			priv = netdev_priv(mpc->dev);
+			if (priv->lane2_ops != NULL)
+				priv->lane2_ops->associate_indicator = NULL;
+			/*
+			 * NOTE(review): no dev_put() here although the
+			 * notifier takes a reference with dev_hold() -
+			 * confirm the reference is released elsewhere.
+			 */
+		}
+		ddprintk("about to clear caches\n");
+		mpc->in_ops->destroy_cache(mpc);
+		mpc->eg_ops->destroy_cache(mpc);
+		ddprintk("caches cleared\n");
+		kfree(mpc->mps_macs);
+		memset(mpc, 0, sizeof(struct mpoa_client));
+		ddprintk("about to kfree %p\n", mpc);
+		kfree(mpc);
+		ddprintk("next mpc is at %p\n", tmp);
+		mpc = tmp;
+	}
+
+	qos = qos_head;
+	qos_head = NULL;
+	while (qos != NULL) {
+		nextqos = qos->next;
+		dprintk("freeing qos entry %p\n", qos);
+		kfree(qos);
+		qos = nextqos;
+	}
+}
+
+module_init(atm_mpoa_init);
+module_exit(atm_mpoa_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
new file mode 100644
index 00000000..0919a88b
--- /dev/null
+++ b/net/atm/mpc.h
@@ -0,0 +1,64 @@
+#ifndef _MPC_H_
+#define _MPC_H_
+
+#include <linux/types.h>
+#include <linux/atm.h>
+#include <linux/atmmpc.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include "mpoa_caches.h"
+
+/* kernel -> mpc-daemon */
+int msg_to_mpoad(struct k_message *msg, struct mpoa_client *mpc);
+
+/*
+ * State of one MPOA client (MPC). MPCs are chained through 'next'; each
+ * one serves a single lec interface and owns an ingress and an egress
+ * cache, each with its own lock and operations table.
+ */
+struct mpoa_client {
+	struct mpoa_client *next;
+	struct net_device *dev;      /* lec in question                     */
+	int dev_num;                 /* e.g. 2 for lec2                     */
+
+	struct atm_vcc *mpoad_vcc;   /* control channel to mpoad            */
+	uint8_t mps_ctrl_addr[ATM_ESA_LEN];  /* MPS control ATM address     */
+	uint8_t our_ctrl_addr[ATM_ESA_LEN];  /* MPC's control ATM address   */
+
+	rwlock_t ingress_lock;       /* protects the in_cache list          */
+	struct in_cache_ops *in_ops; /* ingress cache operations            */
+	in_cache_entry *in_cache;    /* the ingress cache of this MPC       */
+
+	rwlock_t egress_lock;        /* protects the eg_cache list          */
+	struct eg_cache_ops *eg_ops; /* egress cache operations             */
+	eg_cache_entry *eg_cache;    /* the egress  cache of this MPC       */
+
+	uint8_t *mps_macs;           /* array of MPS MAC addresses, >=1     */
+	int number_of_mps_macs;      /* number of the above MAC addresses   */
+	struct mpc_parameters parameters;  /* parameters for this client    */
+
+	/* NOTE(review): presumably the lec's original netdev ops and the
+	 * replacement installed while the MPC runs - confirm in mpc.c */
+	const struct net_device_ops *old_ops;
+	struct net_device_ops new_ops;
+};
+
+
+/* One node of the singly linked list of per-destination QoS settings. */
+struct atm_mpoa_qos {
+	struct atm_mpoa_qos *next;   /* next list node, NULL at the end     */
+	__be32 ipaddr;               /* address the entry is stored under   */
+	struct atm_qos qos;          /* QoS used for shortcuts to ipaddr    */
+};
+
+
+/* MPOA QoS operations */
+struct atm_mpoa_qos *atm_mpoa_add_qos(__be32 dst_ip, struct atm_qos *qos);
+struct atm_mpoa_qos *atm_mpoa_search_qos(__be32 dst_ip);
+int atm_mpoa_delete_qos(struct atm_mpoa_qos *qos);
+
+/* Display QoS entries. This is for the procfs */
+struct seq_file;
+void atm_mpoa_disp_qos(struct seq_file *m);
+
+#ifdef CONFIG_PROC_FS
+int mpc_proc_init(void);
+void mpc_proc_clean(void);
+#else
+#define mpc_proc_init() (0)
+#define mpc_proc_clean() do { } while(0)
+#endif
+
+#endif /* _MPC_H_ */
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
new file mode 100644
index 00000000..d1b2d9a0
--- /dev/null
+++ b/net/atm/mpoa_caches.c
@@ -0,0 +1,569 @@
+#include <linux/types.h>
+#include <linux/atmmpc.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include "mpoa_caches.h"
+#include "mpc.h"
+
+/*
+ * mpoa_caches.c: Implementation of ingress and egress cache
+ * handling functions
+ */
+
+/*
+ * Debug output helpers. Change the "#if 0" to "#if 1" to enable a macro.
+ * The disabled variants keep the printk() call inside "if (0)", so the
+ * format string and arguments are still seen by the compiler although
+ * the call is never executed.
+ */
+#if 0
+#define dprintk(format, args...)					\
+	printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args)  /* debug */
+#else
+#define dprintk(format, args...)					\
+	do { if (0)							\
+		printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+	} while (0)
+#endif
+
+/* second, independently switchable debug channel */
+#if 0
+#define ddprintk(format, args...)					\
+	printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args)  /* debug */
+#else
+#define ddprintk(format, args...)					\
+	do { if (0)							\
+		printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+	} while (0)
+#endif
+
+/*
+ * Look up the ingress entry for destination 'dst_ip'.
+ *
+ * Returns the entry with its use count raised (release it with the
+ * ingress put operation), or NULL when there is no such entry.
+ */
+static in_cache_entry *in_cache_get(__be32 dst_ip,
+				    struct mpoa_client *client)
+{
+	in_cache_entry *cur;
+
+	read_lock_bh(&client->ingress_lock);
+	for (cur = client->in_cache; cur != NULL; cur = cur->next) {
+		if (cur->ctrl_info.in_dst_ip != dst_ip)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_bh(&client->ingress_lock);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Look up the first ingress entry whose destination equals 'dst_ip' in
+ * all bits selected by 'mask'.
+ *
+ * Returns the entry with its use count raised, or NULL when none matches.
+ */
+static in_cache_entry *in_cache_get_with_mask(__be32 dst_ip,
+					      struct mpoa_client *client,
+					      __be32 mask)
+{
+	const __be32 wanted = dst_ip & mask;
+	in_cache_entry *cur;
+
+	read_lock_bh(&client->ingress_lock);
+	for (cur = client->in_cache; cur != NULL; cur = cur->next) {
+		if ((cur->ctrl_info.in_dst_ip & mask) != wanted)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_bh(&client->ingress_lock);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Look up the first ingress entry that uses 'vcc' as its shortcut.
+ *
+ * Returns the entry with its use count raised, or NULL when none does.
+ */
+static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc,
+					   struct mpoa_client *client)
+{
+	in_cache_entry *cur;
+
+	read_lock_bh(&client->ingress_lock);
+	for (cur = client->in_cache; cur != NULL; cur = cur->next) {
+		if (cur->shortcut != vcc)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_bh(&client->ingress_lock);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Create an ingress entry for 'dst_ip' and put it at the head of the
+ * client's ingress cache.
+ *
+ * The new entry starts as INGRESS_INVALID with a hit count of 1 and a
+ * use count of 2: one reference for the cache list and one for the
+ * caller, who must release it with the ingress put operation.
+ *
+ * Returns the entry, or NULL when memory could not be allocated.
+ */
+static in_cache_entry *in_cache_add_entry(__be32 dst_ip,
+					  struct mpoa_client *client)
+{
+	in_cache_entry *fresh;
+
+	fresh = kzalloc(sizeof(in_cache_entry), GFP_KERNEL);
+	if (!fresh) {
+		pr_info("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n");
+		return NULL;
+	}
+
+	dprintk("adding an ingress entry, ip = %pI4\n", &dst_ip);
+
+	atomic_set(&fresh->use, 1);	/* reference held by the list */
+	dprintk("new_in_cache_entry: about to lock\n");
+	write_lock_bh(&client->ingress_lock);
+
+	/* link in front of the current head */
+	fresh->prev = NULL;
+	fresh->next = client->in_cache;
+	if (fresh->next != NULL)
+		fresh->next->prev = fresh;
+	client->in_cache = fresh;
+
+	memcpy(fresh->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
+	fresh->ctrl_info.in_dst_ip = dst_ip;
+	fresh->ctrl_info.holding_time = HOLDING_TIME_DEFAULT;
+	do_gettimeofday(&(fresh->tv));
+	fresh->retry_time = client->parameters.mpc_p4;
+	fresh->count = 1;
+	fresh->entry_state = INGRESS_INVALID;
+	atomic_inc(&fresh->use);	/* reference handed to the caller */
+
+	write_unlock_bh(&client->ingress_lock);
+	dprintk("new_in_cache_entry: unlocked\n");
+
+	return fresh;
+}
+
+/*
+ * Ask the daemon to send an MPOA resolution request for 'entry', restart
+ * the entry's reply timer and move it to INGRESS_RESOLVING.
+ */
+static void in_cache_send_res_rqst(in_cache_entry *entry,
+				   struct mpoa_client *mpc)
+{
+	struct atm_mpoa_qos *qos;
+	struct k_message msg;
+
+	/*
+	 * The whole message is copied to the daemon, so start from zeroes:
+	 * msg.qos in particular is only filled in when a QoS entry exists.
+	 */
+	memset(&msg, 0, sizeof(msg));
+	msg.type = SND_MPOA_RES_RQST;
+	memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN);
+	msg.content.in_info = entry->ctrl_info;
+	qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
+	if (qos != NULL)
+		msg.qos = qos->qos;
+	msg_to_mpoad(&msg, mpc);
+	do_gettimeofday(&(entry->reply_wait));
+	entry->entry_state = INGRESS_RESOLVING;
+}
+
+/*
+ * Account one more packet for 'entry' and report whether a shortcut can
+ * be used for it.
+ *
+ * Once the hit count exceeds the mpc_p1 threshold, an INGRESS_INVALID or
+ * INGRESS_REFRESHING entry triggers a resolution request to the daemon.
+ *
+ * Returns OPEN when the entry has a shortcut and is resolved, refreshing
+ * or resolving; CLOSED otherwise.
+ */
+static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
+{
+	entry->count++;
+	if (entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL)
+		return OPEN;
+
+	if (entry->entry_state == INGRESS_REFRESHING) {
+		if (entry->count > mpc->parameters.mpc_p1)
+			in_cache_send_res_rqst(entry, mpc);
+		if (entry->shortcut != NULL)
+			return OPEN;
+		return CLOSED;
+	}
+
+	if (entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL)
+		return OPEN;
+
+	if (entry->count > mpc->parameters.mpc_p1 &&
+	    entry->entry_state == INGRESS_INVALID) {
+		dprintk("(%s) threshold exceeded for ip %pI4, sending MPOA res req\n",
+			mpc->dev->name, &entry->ctrl_info.in_dst_ip);
+		in_cache_send_res_rqst(entry, mpc);
+	}
+
+	return CLOSED;
+}
+
+/*
+ * Release one reference on an ingress entry. The entry is wiped and
+ * freed when the last reference goes away.
+ */
+static void in_cache_put(in_cache_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->use))
+		return;
+
+	memset(entry, 0, sizeof(in_cache_entry));
+	kfree(entry);
+}
+
+/*
+ * Unlink 'entry' from the ingress cache and drop the list's reference.
+ *
+ * This should be called with write lock on.
+ *
+ * When both caches are empty afterwards, the daemon is told to stop its
+ * keep-alive state machine. The entry's shortcut VCC is released unless
+ * an egress entry still uses it.
+ */
+static void in_cache_remove_entry(in_cache_entry *entry,
+				  struct mpoa_client *client)
+{
+	/* read before put(): the entry may be freed by it */
+	struct atm_vcc *vcc = entry->shortcut;
+
+	dprintk("removing an ingress entry, ip = %pI4\n",
+		&entry->ctrl_info.in_dst_ip);
+
+	if (entry->prev != NULL)
+		entry->prev->next = entry->next;
+	else
+		client->in_cache = entry->next;
+	if (entry->next != NULL)
+		entry->next->prev = entry->prev;
+	client->in_ops->put(entry);
+	if (client->in_cache == NULL && client->eg_cache == NULL) {
+		struct k_message msg;
+
+		/* the whole message reaches the daemon; do not send stack junk */
+		memset(&msg, 0, sizeof(msg));
+		msg.type = STOP_KEEP_ALIVE_SM;
+		msg_to_mpoad(&msg, client);
+	}
+
+	/* Check if the egress side still uses this VCC */
+	if (vcc != NULL) {
+		eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc,
+								      client);
+		if (eg_entry != NULL) {
+			client->eg_ops->put(eg_entry);
+			return;
+		}
+		vcc_release_async(vcc, -EPIPE);
+	}
+}
+
+/*
+ * Reset the hit counter of every ingress entry and remove the entries
+ * whose holding time has run out.
+ *
+ * Call this every MPC-p2 seconds... Not exactly correct solution,
+ * but an easy one...
+ */
+static void clear_count_and_expired(struct mpoa_client *client)
+{
+	in_cache_entry *cur, *following;
+	struct timeval now;
+
+	do_gettimeofday(&now);
+
+	write_lock_bh(&client->ingress_lock);
+	for (cur = client->in_cache; cur != NULL; cur = following) {
+		/* remove_entry() may free cur, so fetch the link first */
+		following = cur->next;
+		cur->count = 0;
+		if ((now.tv_sec - cur->tv.tv_sec)
+		   <= cur->ctrl_info.holding_time)
+			continue;
+		dprintk("holding time expired, ip = %pI4\n",
+			&cur->ctrl_info.in_dst_ip);
+		client->in_ops->remove_entry(cur, client);
+	}
+	write_unlock_bh(&client->ingress_lock);
+}
+
+/*
+ * Retry resolution for ingress entries stuck in INGRESS_RESOLVING.
+ *
+ * Call this every MPC-p4 seconds.
+ *
+ * An entry whose reply did not arrive within its retry time gets that
+ * time multiplied by MPC_C1. If the result exceeds mpc_p5, the entry is
+ * put in hold down (for mpc_p6 seconds) with the retry time reset to
+ * mpc_p4; otherwise the daemon is asked to retry the resolution request.
+ */
+static void check_resolving_entries(struct mpoa_client *client)
+{
+
+	struct atm_mpoa_qos *qos;
+	in_cache_entry *entry;
+	struct timeval now;
+	struct k_message msg;
+
+	do_gettimeofday(&now);
+
+	read_lock_bh(&client->ingress_lock);
+	for (entry = client->in_cache; entry != NULL; entry = entry->next) {
+		if (entry->entry_state != INGRESS_RESOLVING)
+			continue;
+
+		/* Entry in hold down */
+		if ((now.tv_sec - entry->hold_down.tv_sec) <
+		    client->parameters.mpc_p6)
+			continue;
+
+		/* still within the time we give the reply */
+		if ((now.tv_sec - entry->reply_wait.tv_sec) <=
+		    entry->retry_time)
+			continue;
+
+		entry->retry_time = MPC_C1 * (entry->retry_time);
+		/*
+		 * Retry time maximum exceeded,
+		 * put entry in hold down.
+		 */
+		if (entry->retry_time > client->parameters.mpc_p5) {
+			do_gettimeofday(&(entry->hold_down));
+			entry->retry_time = client->parameters.mpc_p4;
+			continue;
+		}
+
+		/* Ask daemon to send a resolution request. */
+		memset(&(entry->hold_down), 0, sizeof(struct timeval));
+		/*
+		 * Start from a zeroed message for every entry: the whole
+		 * structure is copied to the daemon and msg.qos must not
+		 * carry stack junk or the QoS of a previous entry.
+		 */
+		memset(&msg, 0, sizeof(msg));
+		msg.type = SND_MPOA_RES_RTRY;
+		memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN);
+		msg.content.in_info = entry->ctrl_info;
+		qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip);
+		if (qos != NULL)
+			msg.qos = qos->qos;
+		msg_to_mpoad(&msg, client);
+		do_gettimeofday(&(entry->reply_wait));
+	}
+	read_unlock_bh(&client->ingress_lock);
+}
+
+/*
+ * Move resolved ingress entries to INGRESS_REFRESHING once they are older
+ * than their refresh time, which defaults to two thirds of the holding
+ * time.
+ *
+ * Call this every MPC-p5 seconds.
+ */
+static void refresh_entries(struct mpoa_client *client)
+{
+	struct timeval now;
+	struct in_cache_entry *entry;
+
+	ddprintk("refresh_entries\n");
+	do_gettimeofday(&now);
+
+	read_lock_bh(&client->ingress_lock);
+	/* the list head may only be sampled while the lock is held */
+	for (entry = client->in_cache; entry != NULL; entry = entry->next) {
+		if (entry->entry_state != INGRESS_RESOLVED)
+			continue;
+		if (!(entry->refresh_time))
+			entry->refresh_time = (2 * (entry->ctrl_info.holding_time))/3;
+		if ((now.tv_sec - entry->reply_wait.tv_sec) >
+		    entry->refresh_time) {
+			dprintk("refreshing an entry.\n");
+			entry->entry_state = INGRESS_REFRESHING;
+		}
+	}
+	read_unlock_bh(&client->ingress_lock);
+}
+
+/* Remove every entry from the ingress cache of 'mpc'. */
+static void in_destroy_cache(struct mpoa_client *mpc)
+{
+	in_cache_entry *head;
+
+	write_lock_irq(&mpc->ingress_lock);
+	/* remove_entry() unlinks the head, so keep taking the new head */
+	while ((head = mpc->in_cache) != NULL)
+		mpc->in_ops->remove_entry(head, mpc);
+	write_unlock_irq(&mpc->ingress_lock);
+}
+
+/*
+ * Look up the egress entry with the given cache id.
+ *
+ * Returns the entry with its use count raised (release it with the
+ * egress put operation), or NULL when there is no such entry.
+ */
+static eg_cache_entry *eg_cache_get_by_cache_id(__be32 cache_id,
+						struct mpoa_client *mpc)
+{
+	eg_cache_entry *cur;
+
+	read_lock_irq(&mpc->egress_lock);
+	for (cur = mpc->eg_cache; cur != NULL; cur = cur->next) {
+		if (cur->ctrl_info.cache_id != cache_id)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_irq(&mpc->egress_lock);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Look up the egress entry with the given tag.
+ *
+ * This can be called from any context since it saves CPU flags.
+ *
+ * Returns the entry with its use count raised, or NULL when none matches.
+ */
+static eg_cache_entry *eg_cache_get_by_tag(__be32 tag, struct mpoa_client *mpc)
+{
+	eg_cache_entry *cur;
+	unsigned long irq_state;
+
+	read_lock_irqsave(&mpc->egress_lock, irq_state);
+	for (cur = mpc->eg_cache; cur != NULL; cur = cur->next) {
+		if (cur->ctrl_info.tag != tag)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_irqrestore(&mpc->egress_lock, irq_state);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Look up the first egress entry that uses 'vcc' as its shortcut.
+ *
+ * This can be called from any context since it saves CPU flags.
+ *
+ * Returns the entry with its use count raised, or NULL when none does.
+ */
+static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc,
+					   struct mpoa_client *mpc)
+{
+	eg_cache_entry *cur;
+	unsigned long irq_state;
+
+	read_lock_irqsave(&mpc->egress_lock, irq_state);
+	for (cur = mpc->eg_cache; cur != NULL; cur = cur->next) {
+		if (cur->shortcut != vcc)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_irqrestore(&mpc->egress_lock, irq_state);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Look up the first egress entry whose latest source IP address equals
+ * 'ipaddr'.
+ *
+ * Returns the entry with its use count raised, or NULL when none matches.
+ */
+static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
+					      struct mpoa_client *mpc)
+{
+	eg_cache_entry *cur;
+
+	read_lock_irq(&mpc->egress_lock);
+	for (cur = mpc->eg_cache; cur != NULL; cur = cur->next) {
+		if (cur->latest_ip_addr != ipaddr)
+			continue;
+		atomic_inc(&cur->use);
+		break;
+	}
+	read_unlock_irq(&mpc->egress_lock);
+
+	/* NULL when the walk fell off the end of the list */
+	return cur;
+}
+
+/*
+ * Release one reference on an egress entry. The entry is wiped and freed
+ * when the last reference goes away.
+ */
+static void eg_cache_put(eg_cache_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->use))
+		return;
+
+	memset(entry, 0, sizeof(eg_cache_entry));
+	kfree(entry);
+}
+
+/*
+ * Unlink 'entry' from the egress cache and drop the list's reference.
+ *
+ * This should be called with write lock on.
+ *
+ * When both caches are empty afterwards, the daemon is told to stop its
+ * keep-alive state machine. The entry's shortcut VCC is released unless
+ * an ingress entry still uses it.
+ */
+static void eg_cache_remove_entry(eg_cache_entry *entry,
+				  struct mpoa_client *client)
+{
+	/* read before put(): the entry may be freed by it */
+	struct atm_vcc *vcc = entry->shortcut;
+
+	dprintk("removing an egress entry.\n");
+	if (entry->prev != NULL)
+		entry->prev->next = entry->next;
+	else
+		client->eg_cache = entry->next;
+	if (entry->next != NULL)
+		entry->next->prev = entry->prev;
+	client->eg_ops->put(entry);
+	if (client->in_cache == NULL && client->eg_cache == NULL) {
+		struct k_message msg;
+
+		/* the whole message reaches the daemon; do not send stack junk */
+		memset(&msg, 0, sizeof(msg));
+		msg.type = STOP_KEEP_ALIVE_SM;
+		msg_to_mpoad(&msg, client);
+	}
+
+	/* Check if the ingress side still uses this VCC */
+	if (vcc != NULL) {
+		in_cache_entry *in_entry = client->in_ops->get_by_vcc(vcc, client);
+		if (in_entry != NULL) {
+			client->in_ops->put(in_entry);
+			return;
+		}
+		vcc_release_async(vcc, -EPIPE);
+	}
+}
+
+/*
+ * Create an egress entry from the egress control information in 'msg'
+ * and put it at the head of the client's egress cache.
+ *
+ * The new entry starts as EGRESS_RESOLVED with a use count of 2: one
+ * reference for the cache list and one for the caller, who must release
+ * it with the egress put operation.
+ *
+ * Returns the entry, or NULL when memory could not be allocated.
+ */
+static eg_cache_entry *eg_cache_add_entry(struct k_message *msg,
+					  struct mpoa_client *client)
+{
+	eg_cache_entry *fresh;
+
+	fresh = kzalloc(sizeof(eg_cache_entry), GFP_KERNEL);
+	if (!fresh) {
+		pr_info("out of memory\n");
+		return NULL;
+	}
+
+	dprintk("adding an egress entry, ip = %pI4, this should be our IP\n",
+		&msg->content.eg_info.eg_dst_ip);
+
+	atomic_set(&fresh->use, 1);	/* reference held by the list */
+	dprintk("new_eg_cache_entry: about to lock\n");
+	write_lock_irq(&client->egress_lock);
+
+	/* link in front of the current head */
+	fresh->prev = NULL;
+	fresh->next = client->eg_cache;
+	if (fresh->next != NULL)
+		fresh->next->prev = fresh;
+	client->eg_cache = fresh;
+
+	memcpy(fresh->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN);
+	fresh->ctrl_info = msg->content.eg_info;
+	do_gettimeofday(&(fresh->tv));
+	fresh->entry_state = EGRESS_RESOLVED;
+	dprintk("new_eg_cache_entry cache_id %u\n",
+		ntohl(fresh->ctrl_info.cache_id));
+	dprintk("mps_ip = %pI4\n", &fresh->ctrl_info.mps_ip);
+	atomic_inc(&fresh->use);	/* reference handed to the caller */
+
+	write_unlock_irq(&client->egress_lock);
+	dprintk("new_eg_cache_entry: unlocked\n");
+
+	return fresh;
+}
+
+/*
+ * Refresh an egress entry: store the new holding time, mark the entry
+ * resolved and restart its age from now.
+ */
+static void update_eg_cache_entry(eg_cache_entry *entry, uint16_t holding_time)
+{
+	entry->ctrl_info.holding_time = holding_time;
+	entry->entry_state = EGRESS_RESOLVED;
+	do_gettimeofday(&(entry->tv));
+}
+
+/*
+ * Remove every egress entry whose holding time has run out, asking the
+ * daemon to send an egress purge for each of them first.
+ */
+static void clear_expired(struct mpoa_client *client)
+{
+	eg_cache_entry *entry, *next_entry;
+	struct timeval now;
+	struct k_message msg;
+
+	do_gettimeofday(&now);
+
+	/*
+	 * The whole message is copied to the daemon; only type and eg_info
+	 * are set below, so clear the rest once up front.
+	 */
+	memset(&msg, 0, sizeof(msg));
+
+	write_lock_irq(&client->egress_lock);
+	entry = client->eg_cache;
+	while (entry != NULL) {
+		/* remove_entry() may free entry, so fetch the link first */
+		next_entry = entry->next;
+		if ((now.tv_sec - entry->tv.tv_sec)
+		   > entry->ctrl_info.holding_time) {
+			msg.type = SND_EGRESS_PURGE;
+			msg.content.eg_info = entry->ctrl_info;
+			dprintk("egress_cache: holding time expired, cache_id = %u.\n",
+				ntohl(entry->ctrl_info.cache_id));
+			msg_to_mpoad(&msg, client);
+			client->eg_ops->remove_entry(entry, client);
+		}
+		entry = next_entry;
+	}
+	write_unlock_irq(&client->egress_lock);
+}
+
+/* Remove every entry from the egress cache of 'mpc'. */
+static void eg_destroy_cache(struct mpoa_client *mpc)
+{
+	eg_cache_entry *head;
+
+	write_lock_irq(&mpc->egress_lock);
+	/* remove_entry() unlinks the head, so keep taking the new head */
+	while ((head = mpc->eg_cache) != NULL)
+		mpc->eg_ops->remove_entry(head, mpc);
+	write_unlock_irq(&mpc->egress_lock);
+}
+
+
+/* Ingress cache operations installed by atm_mpoa_init_cache(). */
+static struct in_cache_ops ingress_ops = {
+	.add_entry	 = in_cache_add_entry,
+	.get		 = in_cache_get,
+	.get_with_mask	 = in_cache_get_with_mask,
+	.get_by_vcc	 = in_cache_get_by_vcc,
+	.put		 = in_cache_put,
+	.remove_entry	 = in_cache_remove_entry,
+	.cache_hit	 = cache_hit,
+	.clear_count	 = clear_count_and_expired,
+	.check_resolving = check_resolving_entries,
+	.refresh	 = refresh_entries,
+	.destroy_cache	 = in_destroy_cache,
+};
+
+/*
+ * Egress cache operations installed by atm_mpoa_init_cache().
+ * The initialisers are positional, so their order must follow the member
+ * order of struct eg_cache_ops; the trailing comments name the member
+ * each function is meant for.
+ */
+static struct eg_cache_ops egress_ops = {
+	eg_cache_add_entry,               /* add_entry        */
+	eg_cache_get_by_cache_id,         /* get_by_cache_id  */
+	eg_cache_get_by_tag,              /* get_by_tag       */
+	eg_cache_get_by_vcc,              /* get_by_vcc       */
+	eg_cache_get_by_src_ip,           /* get_by_src_ip    */
+	eg_cache_put,                     /* put              */
+	eg_cache_remove_entry,            /* remove_entry     */
+	update_eg_cache_entry,            /* update           */
+	clear_expired,                    /* clear_expired    */
+	eg_destroy_cache                  /* destroy_cache    */
+};
+
+
+/* Install the ingress and egress cache operation tables in 'mpc'. */
+void atm_mpoa_init_cache(struct mpoa_client *mpc)
+{
+	mpc->eg_ops = &egress_ops;
+	mpc->in_ops = &ingress_ops;
+}
diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h
new file mode 100644
index 00000000..8e5f78cf
--- /dev/null
+++ b/net/atm/mpoa_caches.h
@@ -0,0 +1,96 @@
+#ifndef MPOA_CACHES_H
+#define MPOA_CACHES_H
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmmpc.h>
+
+struct mpoa_client;
+
+void atm_mpoa_init_cache(struct mpoa_client *mpc);
+
+typedef struct in_cache_entry {
+	struct in_cache_entry *next;
+	struct in_cache_entry *prev;
+	struct timeval  tv;
+	struct timeval  reply_wait;
+	struct timeval  hold_down;
+	uint32_t  packets_fwded;
+	uint16_t  entry_state;
+	uint32_t retry_time;
+	uint32_t refresh_time;
+	uint32_t count;
+	struct   atm_vcc *shortcut;
+	uint8_t  MPS_ctrl_ATM_addr[ATM_ESA_LEN];
+	struct   in_ctrl_info ctrl_info;
+	atomic_t use;
+} in_cache_entry;
+
+struct in_cache_ops{
+	in_cache_entry *(*add_entry)(__be32 dst_ip,
+				      struct mpoa_client *client);
+	in_cache_entry *(*get)(__be32 dst_ip, struct mpoa_client *client);
+	in_cache_entry *(*get_with_mask)(__be32 dst_ip,
+					 struct mpoa_client *client,
+					 __be32 mask);
+	in_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc,
+				      struct mpoa_client *client);
+	void            (*put)(in_cache_entry *entry);
+	void            (*remove_entry)(in_cache_entry *delEntry,
+					struct mpoa_client *client );
+	int             (*cache_hit)(in_cache_entry *entry,
+				     struct mpoa_client *client);
+	void            (*clear_count)(struct mpoa_client *client);
+	void            (*check_resolving)(struct mpoa_client *client);
+	void            (*refresh)(struct mpoa_client *client);
+	void            (*destroy_cache)(struct mpoa_client *mpc);
+};
+
+typedef struct eg_cache_entry{
+	struct               eg_cache_entry *next;
+	struct               eg_cache_entry *prev;
+	struct               timeval  tv;
+	uint8_t              MPS_ctrl_ATM_addr[ATM_ESA_LEN];
+	struct atm_vcc       *shortcut;
+	uint32_t             packets_rcvd;
+	uint16_t             entry_state;
+	__be32             latest_ip_addr;    /* The src IP address of the last packet */
+	struct eg_ctrl_info  ctrl_info;
+	atomic_t             use;
+} eg_cache_entry;
+
+struct eg_cache_ops{
+	eg_cache_entry *(*add_entry)(struct k_message *msg, struct mpoa_client *client);
+	eg_cache_entry *(*get_by_cache_id)(__be32 cache_id, struct mpoa_client *client);
+	eg_cache_entry *(*get_by_tag)(__be32 cache_id, struct mpoa_client *client);
+	eg_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, struct mpoa_client *client);
+	eg_cache_entry *(*get_by_src_ip)(__be32 ipaddr, struct mpoa_client *client);
+	void            (*put)(eg_cache_entry *entry);
+	void            (*remove_entry)(eg_cache_entry *entry, struct mpoa_client *client);
+	void            (*update)(eg_cache_entry *entry, uint16_t holding_time);
+	void            (*clear_expired)(struct mpoa_client *client);
+	void            (*destroy_cache)(struct mpoa_client *mpc);
+};
+
+
+/* Ingress cache entry states */
+
+#define INGRESS_REFRESHING 3
+#define INGRESS_RESOLVED   2
+#define INGRESS_RESOLVING  1
+#define INGRESS_INVALID    0
+
+/* VCC states */
+
+#define OPEN   1
+#define CLOSED 0
+
+/* Egress cache entry states */
+
+#define EGRESS_RESOLVED 2
+#define EGRESS_PURGE    1
+#define EGRESS_INVALID  0
+
+#endif
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
new file mode 100644
index 00000000..53e50029
--- /dev/null
+++ b/net/atm/mpoa_proc.c
@@ -0,0 +1,312 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#ifdef CONFIG_PROC_FS
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/atmmpc.h>
+#include <linux/atm.h>
+#include <linux/gfp.h>
+#include "mpc.h"
+#include "mpoa_caches.h"
+
+/*
+ * mpoa_proc.c: Implementation of the MPOA client's proc
+ * file system statistics
+ */
+
+#if 1
+#define dprintk(format, args...)					\
+	printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args)  /* debug */
+#else
+#define dprintk(format, args...)					\
+	do { if (0)							\
+		printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+	} while (0)
+#endif
+
+#if 0
+#define ddprintk(format, args...)					\
+	printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args)  /* debug */
+#else
+#define ddprintk(format, args...)					\
+	do { if (0)							\
+		printk(KERN_DEBUG "mpoa:%s: " format, __FILE__, ##args);\
+	} while (0)
+#endif
+
+#define STAT_FILE_NAME "mpc"     /* Our statistic file's name */
+
+extern struct mpoa_client *mpcs;
+extern struct proc_dir_entry *atm_proc_root;  /* from proc.c. */
+
+static int proc_mpc_open(struct inode *inode, struct file *file);
+static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
+			      size_t nbytes, loff_t *ppos);
+
+static int parse_qos(const char *buff);
+
+/*
+ *   Define allowed FILE OPERATIONS
+ */
+static const struct file_operations mpc_file_operations = {
+	.owner =	THIS_MODULE,
+	.open =		proc_mpc_open,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+	.write =	proc_mpc_write,
+	.release =	seq_release,
+};
+
+/*
+ * Returns the state of an ingress cache entry as a string
+ */
+static const char *ingress_state_string(int state)
+{
+	/* Column-padded labels, indexed by the INGRESS_* state code. */
+	static const char *const label[] = {
+		[INGRESS_INVALID]    = "invalid    ",
+		[INGRESS_RESOLVING]  = "resolving  ",
+		[INGRESS_RESOLVED]   = "resolved   ",
+		[INGRESS_REFRESHING] = "refreshing ",
+	};
+
+	if (state < 0 || state >= (int)ARRAY_SIZE(label))
+		return "";	/* unknown state: empty label */
+
+	return label[state];
+}
+
+/*
+ * Returns the state of an egress cache entry as a string
+ */
+static const char *egress_state_string(int state)
+{
+	/* Column-padded labels, indexed by the EGRESS_* state code. */
+	static const char *const label[] = {
+		[EGRESS_INVALID]  = "invalid    ",
+		[EGRESS_PURGE]    = "purge      ",
+		[EGRESS_RESOLVED] = "resolved   ",
+	};
+
+	if (state < 0 || state >= (int)ARRAY_SIZE(label))
+		return "";	/* unknown state: empty label */
+	return label[state];
+}
+
+/*
+ * FIXME: mpcs (and per-mpc lists) have no locking whatsoever.
+ */
+
+static void *mpc_start(struct seq_file *m, loff_t *pos)
+{
+	struct mpoa_client *mpc = mpcs;
+	loff_t skip = *pos;
+
+	/* Position 0 is the header token; client k sits at position k + 1. */
+	if (skip == 0)
+		return SEQ_START_TOKEN;
+	while (mpc && --skip)
+		mpc = mpc->next;
+	return mpc;
+}
+
+static void *mpc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct mpoa_client *p = v;
+	(*pos)++;	/* position advances on every call */
+	return v == SEQ_START_TOKEN ? mpcs : p->next;	/* token -> list head */
+}
+
+static void mpc_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * READING function - called when the /proc/net/atm/mpc file is read.
+ */
+static int mpc_show(struct seq_file *m, void *v)
+{
+	struct mpoa_client *mpc = v;
+	int i;
+	in_cache_entry *in_entry;
+	eg_cache_entry *eg_entry;
+	struct timeval now;
+	unsigned char ip_string[16];
+
+	if (v == SEQ_START_TOKEN) {
+		atm_mpoa_disp_qos(m);
+		return 0;
+	}
+
+	seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num);
+	seq_printf(m, "Ingress Entries:\nIP address      State      Holding time  Packets fwded  VPI  VCI\n");
+	do_gettimeofday(&now);
+
+	for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) {
+		sprintf(ip_string, "%pI4", &in_entry->ctrl_info.in_dst_ip);
+		seq_printf(m, "%-16s%s%-14lu%-12u",
+			   ip_string,
+			   ingress_state_string(in_entry->entry_state),
+			   in_entry->ctrl_info.holding_time -
+			   (now.tv_sec-in_entry->tv.tv_sec),
+			   in_entry->packets_fwded);
+		if (in_entry->shortcut)
+			seq_printf(m, "   %-3d  %-3d",
+				   in_entry->shortcut->vpi,
+				   in_entry->shortcut->vci);
+		seq_printf(m, "\n");
+	}
+
+	seq_printf(m, "\n");
+	seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id        State      Holding time  Packets recvd  Latest IP addr   VPI VCI\n");
+	for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) {
+		unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr;
+		for (i = 0; i < ATM_ESA_LEN; i++)
+			seq_printf(m, "%02x", p[i]);
+		seq_printf(m, "\n%-16lu%s%-14lu%-15u",
+			   (unsigned long)ntohl(eg_entry->ctrl_info.cache_id),
+			   egress_state_string(eg_entry->entry_state),
+			   (eg_entry->ctrl_info.holding_time -
+			    (now.tv_sec-eg_entry->tv.tv_sec)),
+			   eg_entry->packets_rcvd);
+
+		/* latest IP address */
+		sprintf(ip_string, "%pI4", &eg_entry->latest_ip_addr);
+		seq_printf(m, "%-16s", ip_string);
+
+		if (eg_entry->shortcut)
+			seq_printf(m, " %-3d %-3d",
+				   eg_entry->shortcut->vpi,
+				   eg_entry->shortcut->vci);
+		seq_printf(m, "\n");
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static const struct seq_operations mpc_op = {
+	.start =	mpc_start,
+	.next =		mpc_next,
+	.stop =		mpc_stop,
+	.show =		mpc_show
+};
+
+static int proc_mpc_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &mpc_op);
+}
+
+static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
+			      size_t nbytes, loff_t *ppos)
+{
+	char *page, *p;
+	unsigned len;
+
+	if (nbytes == 0)
+		return 0;
+	if (nbytes >= PAGE_SIZE)	/* leave room for the terminating NUL */
+		nbytes = PAGE_SIZE-1;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* Copy one command: up to and including the first NUL or newline. */
+	for (p = page, len = 0; len < nbytes; p++) {
+		if (get_user(*p, buff++)) {
+			free_page((unsigned long)page);
+			return -EFAULT;
+		}
+		/* Count the terminator as consumed so a write never returns 0. */
+		len++;
+		if (*p == '\0' || *p == '\n')
+			break;
+	}
+	*p = '\0';	/* overwrites the terminator, or lands just past the data */
+
+	if (!parse_qos(page))
+		printk("mpoa: proc_mpc_write: could not parse '%s'\n", page);
+
+	free_page((unsigned long)page);
+	return len;
+}
+
+static int parse_qos(const char *buff)
+{
+	/* possible lines look like this (rx=tx and "del <ip>" also work)
+	 * add 130.230.54.142 tx=max_pcr,max_sdu rx=max_pcr,max_sdu
+	 */
+	unsigned char ip[4];
+	int tx_pcr, tx_sdu, rx_pcr, rx_sdu;
+	__be32 ipaddr;
+	struct atm_qos qos;
+
+	memset(&qos, 0, sizeof(struct atm_qos));
+	if (sscanf(buff, "del %hhu.%hhu.%hhu.%hhu",
+			ip, ip+1, ip+2, ip+3) == 4) {
+		memcpy(&ipaddr, ip, sizeof(ipaddr));	/* ip[] may be unaligned */
+		return atm_mpoa_delete_qos(atm_mpoa_search_qos(ipaddr));
+	}
+
+	/* 8-field form first: "rx=tx" would also convert 6 fields of it */
+	if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=%d,%d", ip, ip+1,
+		   ip+2, ip+3, &tx_pcr, &tx_sdu, &rx_pcr, &rx_sdu) != 8) {
+		if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=tx",
+				ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu) != 6)
+			return 0;	/* not a recognised command */
+		rx_pcr = tx_pcr;
+		rx_sdu = tx_sdu;
+	}
+
+	memcpy(&ipaddr, ip, sizeof(ipaddr));	/* ip[] may be unaligned */
+	qos.txtp.traffic_class = ATM_CBR;
+	qos.txtp.max_pcr = tx_pcr;
+	qos.txtp.max_sdu = tx_sdu;
+	qos.rxtp.traffic_class = ATM_CBR;
+	qos.rxtp.max_pcr = rx_pcr;
+	qos.rxtp.max_sdu = rx_sdu;
+	qos.aal = ATM_AAL5;
+	dprintk("parse_qos(): setting qos paramameters to tx=%d,%d rx=%d,%d\n",
+		qos.txtp.max_pcr, qos.txtp.max_sdu,
+		qos.rxtp.max_pcr, qos.rxtp.max_sdu);
+	atm_mpoa_add_qos(ipaddr, &qos);
+	return 1;	/* entry added */
+}
+
+/*
+ * INITIALIZATION function - called when module is initialized/loaded.
+ */
+int mpc_proc_init(void)
+{
+	/* Publish the statistics file in the ATM proc directory. */
+	if (proc_create(STAT_FILE_NAME, 0, atm_proc_root,
+			&mpc_file_operations) != NULL)
+		return 0;
+
+	/* Creation failed: log it and report the failure as -ENOMEM. */
+	pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
+	return -ENOMEM;
+}
+
+/*
+ * DELETING function - called when module is removed.
+ */
+void mpc_proc_clean(void)
+{
+	remove_proc_entry(STAT_FILE_NAME, atm_proc_root);
+}
+
+#endif /* CONFIG_PROC_FS */
+
+
+
+
+
+
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
new file mode 100644
index 00000000..e9aced0e
--- /dev/null
+++ b/net/atm/pppoatm.c
@@ -0,0 +1,362 @@
+/* net/atm/pppoatm.c - RFC2364 PPP over ATM/AAL5 */
+
+/* Copyright 1999-2000 by Mitchell Blank Jr */
+/* Based on clip.c; 1995-1999 by Werner Almesberger, EPFL LRC/ICA */
+/* And on ppp_async.c; Copyright 1999 Paul Mackerras */
+/* And help from Jens Axboe */
+
+/*
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ * This driver provides the encapsulation and framing for sending
+ * and receiving PPP frames in ATM AAL5 PDUs.
+ */
+
+/*
+ * One shortcoming of this driver is that it does not comply with
+ * section 8 of RFC2364 - we are supposed to detect a change
+ * in encapsulation and immediately abort the connection (in order
+ * to avoid a black-hole being created if our peer loses state
+ * and changes encapsulation unilaterally).  However, since the
+ * ppp_generic layer actually does the decapsulation, we need
+ * a way of notifying it when we _think_ there might be a problem.
+ * There are two cases:
+ *   1.	LLC-encapsulation was missing when it was enabled.  In
+ *	this case, we should tell the upper layer "tear down
+ *	this session if this skb looks ok to you"
+ *   2.	LLC-encapsulation was present when it was disabled.  Then
+ *	we need to tell the upper layer "this packet may be
+ *	ok, but if it's in error tear down the session"
+ * These hooks are not yet available in ppp_generic
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/capability.h>
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/ppp_channel.h>
+#include <linux/atmppp.h>
+
+#include "common.h"
+
+enum pppoatm_encaps {
+	e_autodetect = PPPOATM_ENCAPS_AUTODETECT,
+	e_vc = PPPOATM_ENCAPS_VC,
+	e_llc = PPPOATM_ENCAPS_LLC,
+};
+
+struct pppoatm_vcc {
+	struct atm_vcc	*atmvcc;	/* VCC descriptor */
+	void (*old_push)(struct atm_vcc *, struct sk_buff *);
+	void (*old_pop)(struct atm_vcc *, struct sk_buff *);
+					/* keep old push/pop for detaching */
+	enum pppoatm_encaps encaps;
+	int flags;			/* SC_COMP_PROT - compress protocol */
+	struct ppp_channel chan;	/* interface to generic ppp layer */
+	struct tasklet_struct wakeup_tasklet;
+};
+
+/*
+ * Header used for LLC Encapsulated PPP (4 bytes) followed by the LCP protocol
+ * ID (0xC021) used in autodetection
+ */
+static const unsigned char pppllc[6] = { 0xFE, 0xFE, 0x03, 0xCF, 0xC0, 0x21 };
+#define LLC_LEN		(4)
+
+static inline struct pppoatm_vcc *atmvcc_to_pvcc(const struct atm_vcc *atmvcc)
+{
+	return (struct pppoatm_vcc *) (atmvcc->user_back);
+}
+
+static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan)
+{
+	return (struct pppoatm_vcc *) (chan->private);
+}
+
+/*
+ * We can't do this directly from our _pop handler, since the ppp code
+ * doesn't want to be called in interrupt context, so we do it from
+ * a tasklet
+ */
+static void pppoatm_wakeup_sender(unsigned long arg)
+{
+	ppp_output_wakeup((struct ppp_channel *) arg);
+}
+
+/*
+ * This gets called every time the ATM card has finished sending our
+ * skb.  The ->old_pop will take care of normal ATM flow control,
+ * but we also need to wake up the device if we blocked it
+ */
+static void pppoatm_pop(struct atm_vcc *atmvcc, struct sk_buff *skb)
+{
+	struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+	pvcc->old_pop(atmvcc, skb);
+	/*
+	 * We don't really always want to do this since it's
+	 * really inefficient - it would be much better if we could
+	 * test if we had actually throttled the generic layer.
+	 * Unfortunately then there would be a nasty SMP race where
+	 * we could clear that flag just as we refuse another packet.
+	 * For now we do the safe thing.
+	 */
+	tasklet_schedule(&pvcc->wakeup_tasklet);
+}
+
+/*
+ * Unbind from PPP - currently we only do this when closing the socket,
+ * but we could put this into an ioctl if need be
+ */
+static void pppoatm_unassign_vcc(struct atm_vcc *atmvcc)
+{
+	struct pppoatm_vcc *pvcc;
+	pvcc = atmvcc_to_pvcc(atmvcc);
+	atmvcc->push = pvcc->old_push;
+	atmvcc->pop = pvcc->old_pop;
+	tasklet_kill(&pvcc->wakeup_tasklet);
+	ppp_unregister_channel(&pvcc->chan);
+	atmvcc->user_back = NULL;
+	kfree(pvcc);
+	/* Gee, I hope we have the big kernel lock here... */
+	module_put(THIS_MODULE);
+}
+
+/* Called when an AAL5 PDU comes in */
+static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
+{
+	struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc);
+	pr_debug("\n");
+	if (skb == NULL) {			/* VCC was closed */
+		pr_debug("removing ATMPPP VCC %p\n", pvcc);
+		pppoatm_unassign_vcc(atmvcc);
+		atmvcc->push(atmvcc, NULL);	/* Pass along bad news */
+		return;
+	}
+	atm_return(atmvcc, skb->truesize);
+	switch (pvcc->encaps) {
+	case e_llc:
+		if (skb->len < LLC_LEN ||
+		    memcmp(skb->data, pppllc, LLC_LEN))
+			goto error;
+		skb_pull(skb, LLC_LEN);
+		break;
+	case e_autodetect:
+		if (pvcc->chan.ppp == NULL) {	/* Not bound yet! */
+			kfree_skb(skb);
+			return;
+		}
+		if (skb->len >= sizeof(pppllc) &&
+		    !memcmp(skb->data, pppllc, sizeof(pppllc))) {
+			pvcc->encaps = e_llc;
+			skb_pull(skb, LLC_LEN);
+			break;
+		}
+		if (skb->len >= (sizeof(pppllc) - LLC_LEN) &&
+		    !memcmp(skb->data, &pppllc[LLC_LEN],
+		    sizeof(pppllc) - LLC_LEN)) {
+			pvcc->encaps = e_vc;
+			pvcc->chan.mtu += LLC_LEN;
+			break;
+		}
+		pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n",
+			 skb->data[0], skb->data[1], skb->data[2],
+			 skb->data[3], skb->data[4], skb->data[5]);
+		goto error;
+	case e_vc:
+		break;
+	}
+	ppp_input(&pvcc->chan, skb);
+	return;
+
+error:
+	kfree_skb(skb);
+	ppp_input_error(&pvcc->chan, 0);
+}
+
+/*
+ * Called by the ppp_generic.c to send a packet - returns true if packet
+ * was accepted.  If we return false, then it's our job to call
+ * ppp_output_wakeup(chan) when we're feeling more up to it.
+ * Note that in the ENOMEM case (as opposed to the !atm_may_send case)
+ * we should really drop the packet, but the generic layer doesn't
+ * support this yet.  We just return 'DROP_PACKET' which we actually define
+ * as success, just to be clear what we're really doing.
+ */
+#define DROP_PACKET 1
+static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb)
+{
+	struct pppoatm_vcc *pvcc = chan_to_pvcc(chan);
+	ATM_SKB(skb)->vcc = pvcc->atmvcc;
+	pr_debug("(skb=0x%p, vcc=0x%p)\n", skb, pvcc->atmvcc);
+	if (skb->data[0] == '\0' && (pvcc->flags & SC_COMP_PROT))
+		(void) skb_pull(skb, 1);
+	switch (pvcc->encaps) {		/* LLC encapsulation needed */
+	case e_llc:
+		if (skb_headroom(skb) < LLC_LEN) {
+			struct sk_buff *n;
+			n = skb_realloc_headroom(skb, LLC_LEN);
+			if (n != NULL &&
+			    !atm_may_send(pvcc->atmvcc, n->truesize)) {
+				kfree_skb(n);
+				goto nospace;
+			}
+			kfree_skb(skb);
+			skb = n;
+			if (skb == NULL)
+				return DROP_PACKET;
+		} else if (!atm_may_send(pvcc->atmvcc, skb->truesize))
+			goto nospace;
+		memcpy(skb_push(skb, LLC_LEN), pppllc, LLC_LEN);
+		break;
+	case e_vc:
+		if (!atm_may_send(pvcc->atmvcc, skb->truesize))
+			goto nospace;
+		break;
+	case e_autodetect:
+		pr_debug("Trying to send without setting encaps!\n");
+		kfree_skb(skb);
+		return 1;
+	}
+
+	atomic_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc);
+	ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options;
+	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n",
+		 skb, ATM_SKB(skb)->vcc, ATM_SKB(skb)->vcc->dev);
+	return ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb)
+	    ? DROP_PACKET : 1;
+nospace:
+	/*
+	 * We don't have space to send this SKB now, but we might have
+	 * already applied SC_COMP_PROT compression, so may need to undo
+	 */
+	if ((pvcc->flags & SC_COMP_PROT) && skb_headroom(skb) > 0 &&
+	    skb->data[-1] == '\0')
+		(void) skb_push(skb, 1);
+	return 0;
+}
+
+/* This handles ioctls sent to the /dev/ppp interface */
+static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd,
+	unsigned long arg)
+{
+	struct pppoatm_vcc *pvcc = chan_to_pvcc(chan);
+	int __user *uflags = (int __user *) arg;
+
+	/* Only the flags word can be read or written through /dev/ppp. */
+	if (cmd == PPPIOCGFLAGS)
+		return put_user(pvcc->flags, uflags) ? -EFAULT : 0;
+	if (cmd == PPPIOCSFLAGS)
+		return get_user(pvcc->flags, uflags) ? -EFAULT : 0;
+	return -ENOTTY;
+}
+
+static const struct ppp_channel_ops pppoatm_ops = {
+	.start_xmit = pppoatm_send,
+	.ioctl = pppoatm_devppp_ioctl,
+};
+
+static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
+{
+	struct atm_backend_ppp be;
+	struct pppoatm_vcc *pvcc;
+	int err;
+	/*
+	 * Each PPPoATM instance has its own tasklet - this is just a
+	 * prototypical one used to initialize them
+	 */
+	static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0);
+	if (copy_from_user(&be, arg, sizeof be))
+		return -EFAULT;
+	if (be.encaps != PPPOATM_ENCAPS_AUTODETECT &&
+	    be.encaps != PPPOATM_ENCAPS_VC && be.encaps != PPPOATM_ENCAPS_LLC)
+		return -EINVAL;
+	pvcc = kzalloc(sizeof(*pvcc), GFP_KERNEL);
+	if (pvcc == NULL)
+		return -ENOMEM;
+	pvcc->atmvcc = atmvcc;
+	pvcc->old_push = atmvcc->push;
+	pvcc->old_pop = atmvcc->pop;
+	pvcc->encaps = (enum pppoatm_encaps) be.encaps;
+	pvcc->chan.private = pvcc;
+	pvcc->chan.ops = &pppoatm_ops;
+	pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN -
+	    (be.encaps == e_vc ? 0 : LLC_LEN);
+	pvcc->wakeup_tasklet = tasklet_proto;
+	pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan;
+	err = ppp_register_channel(&pvcc->chan);
+	if (err != 0) {
+		kfree(pvcc);
+		return err;
+	}
+	atmvcc->user_back = pvcc;
+	atmvcc->push = pppoatm_push;
+	atmvcc->pop = pppoatm_pop;
+	__module_get(THIS_MODULE);
+	return 0;
+}
+
+/*
+ * This handles ioctls actually performed on our vcc - we must return
+ * -ENOIOCTLCMD for any unrecognized ioctl
+ */
+static int pppoatm_ioctl(struct socket *sock, unsigned int cmd,
+	unsigned long arg)
+{
+	struct atm_vcc *atmvcc = ATM_SD(sock);
+	void __user *argp = (void __user *)arg;
+
+	if (cmd != ATM_SETBACKEND && atmvcc->push != pppoatm_push)
+		return -ENOIOCTLCMD;
+	switch (cmd) {
+	case ATM_SETBACKEND: {
+		atm_backend_t b;
+		if (get_user(b, (atm_backend_t __user *) argp))
+			return -EFAULT;
+		if (b != ATM_BACKEND_PPP)
+			return -ENOIOCTLCMD;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		return pppoatm_assign_vcc(atmvcc, argp);
+		}
+	case PPPIOCGCHAN:
+		return put_user(ppp_channel_index(&atmvcc_to_pvcc(atmvcc)->
+		    chan), (int __user *) argp) ? -EFAULT : 0;
+	case PPPIOCGUNIT:
+		return put_user(ppp_unit_number(&atmvcc_to_pvcc(atmvcc)->
+		    chan), (int __user *) argp) ? -EFAULT : 0;
+	}
+	return -ENOIOCTLCMD;
+}
+
+static struct atm_ioctl pppoatm_ioctl_ops = {
+	.owner	= THIS_MODULE,
+	.ioctl	= pppoatm_ioctl,
+};
+
+static int __init pppoatm_init(void)
+{
+	register_atm_ioctl(&pppoatm_ioctl_ops);
+	return 0;
+}
+
+static void __exit pppoatm_exit(void)
+{
+	deregister_atm_ioctl(&pppoatm_ioctl_ops);
+}
+
+module_init(pppoatm_init);
+module_exit(pppoatm_exit);
+
+MODULE_AUTHOR("Mitchell Blank Jr <mitch@sfgoth.com>");
+MODULE_DESCRIPTION("RFC2364 PPP over ATM/AAL5");
+MODULE_LICENSE("GPL");
diff --git a/net/atm/proc.c b/net/atm/proc.c
new file mode 100644
index 00000000..be3afdef
--- /dev/null
+++ b/net/atm/proc.c
@@ -0,0 +1,497 @@
+/* net/atm/proc.c - ATM /proc interface
+ *
+ * Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA
+ *
+ * seq_file api usage by romieu@fr.zoreil.com
+ *
+ * Evaluating the efficiency of the whole thing is left as an exercise to
+ * the reader.
+ */
+
+#include <linux/module.h> /* for EXPORT_SYMBOL */
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/errno.h>
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/netdevice.h>
+#include <linux/atmclip.h>
+#include <linux/init.h> /* for __init */
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/atmclip.h>
+#include <linux/uaccess.h>
+#include <linux/param.h> /* for HZ */
+#include <asm/atomic.h>
+#include "resources.h"
+#include "common.h" /* atm_proc_init prototype */
+#include "signaling.h" /* to get sigd - ugly too */
+
+static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *pos);
+
+static const struct file_operations proc_atm_dev_ops = {
+	.owner =	THIS_MODULE,
+	.read =		proc_dev_atm_read,
+	.llseek =	noop_llseek,
+};
+
+static void add_stats(struct seq_file *seq, const char *aal,
+  const struct k_atm_aal_stats *stats)
+{
+	seq_printf(seq, "%s ( %d %d %d %d %d )", aal,
+		   atomic_read(&stats->tx), atomic_read(&stats->tx_err),
+		   atomic_read(&stats->rx), atomic_read(&stats->rx_err),
+		   atomic_read(&stats->rx_drop));
+}
+
+static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev)
+{
+	int i;
+
+	seq_printf(seq, "%3d %-8s", dev->number, dev->type);
+	for (i = 0; i < ESI_LEN; i++)
+		seq_printf(seq, "%02x", dev->esi[i]);
+	seq_puts(seq, "  ");
+	add_stats(seq, "0", &dev->stats.aal0);
+	seq_puts(seq, "  ");
+	add_stats(seq, "5", &dev->stats.aal5);
+	seq_printf(seq, "\t[%d]", atomic_read(&dev->refcnt));
+	seq_putc(seq, '\n');
+}
+
+struct vcc_state {
+	int bucket;
+	struct sock *sk;
+	int family;
+};
+
+static inline int compare_family(struct sock *sk, int family)
+{
+	return family == 0 || sk->sk_family == family;	/* 0 matches all */
+}
+
+static int __vcc_walk(struct sock **sock, int family, int *bucket, loff_t l)
+{
+	struct sock *sk = *sock;
+
+	if (sk == SEQ_START_TOKEN) {
+		for (*bucket = 0; *bucket < VCC_HTABLE_SIZE; ++*bucket) {
+			struct hlist_head *head = &vcc_hash[*bucket];
+
+			sk = hlist_empty(head) ? NULL : __sk_head(head);
+			if (sk)
+				break;
+		}
+		l--;
+	}
+try_again:
+	for (; sk; sk = sk_next(sk)) {
+		l -= compare_family(sk, family);
+		if (l < 0)
+			goto out;
+	}
+	if (!sk && ++*bucket < VCC_HTABLE_SIZE) {
+		sk = sk_head(&vcc_hash[*bucket]);
+		goto try_again;
+	}
+	sk = SEQ_START_TOKEN;
+out:
+	*sock = sk;
+	return (l < 0);
+}
+
+static inline void *vcc_walk(struct vcc_state *state, loff_t l)
+{
+	return __vcc_walk(&state->sk, state->family, &state->bucket, l) ?
+	       state : NULL;
+}
+
+static int __vcc_seq_open(struct inode *inode, struct file *file,
+	int family, const struct seq_operations *ops)
+{
+	struct vcc_state *state;
+
+	state = __seq_open_private(file, ops, sizeof(*state));
+	if (state == NULL)
+		return -ENOMEM;
+
+	state->family = family;
+	return 0;
+}
+
+static void *vcc_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(vcc_sklist_lock)
+{
+	struct vcc_state *state = seq->private;
+	loff_t left = *pos;
+
+	read_lock(&vcc_sklist_lock);
+	state->sk = SEQ_START_TOKEN;
+	return left ? vcc_walk(state, left) : SEQ_START_TOKEN;
+}
+
+static void vcc_seq_stop(struct seq_file *seq, void *v)
+	__releases(vcc_sklist_lock)
+{
+	read_unlock(&vcc_sklist_lock);
+}
+
+static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct vcc_state *state = seq->private;
+
+	v = vcc_walk(state, 1);		/* step to the next matching socket */
+	*pos += !!PTR_ERR(v);		/* i.e. +1 only when v is non-NULL */
+	return v;
+}
+
+static void pvc_info(struct seq_file *seq, struct atm_vcc *vcc)
+{
+	static const char *const class_name[] = {	/* by traffic_class */
+		"off", "UBR", "CBR", "VBR", "ABR"};
+	static const char *const aal_name[] = {
+		"---",	"1",	"2",	"3/4",	/*  0- 3 */
+		"???",	"5",	"???",	"???",	/*  4- 7 */
+		"???",	"???",	"???",	"???",	/*  8-11 */
+		"???",	"0",	"???",	"???"};	/* 12-15 */
+	unsigned int rxc = vcc->qos.rxtp.traffic_class;	/* table index, */
+	unsigned int txc = vcc->qos.txtp.traffic_class;	/* range-checked */
+
+	seq_printf(seq, "%3d %3d %5d %-3s %7d %-5s %7d %-6s",
+		   vcc->dev->number, vcc->vpi, vcc->vci,
+		   vcc->qos.aal >= ARRAY_SIZE(aal_name) ? "err" :
+		   aal_name[vcc->qos.aal], vcc->qos.rxtp.min_pcr,
+		   rxc >= ARRAY_SIZE(class_name) ? "err" : class_name[rxc],
+		   vcc->qos.txtp.min_pcr,
+		   txc >= ARRAY_SIZE(class_name) ? "err" : class_name[txc]);
+	if (test_bit(ATM_VF_IS_CLIP, &vcc->flags)) {	/* append CLIP details */
+		struct clip_vcc *clip = CLIP_VCC(vcc);
+		struct net_device *dev;
+		dev = clip->entry ? clip->entry->neigh->dev : NULL;
+		seq_printf(seq, "CLIP, Itf:%s, ", dev ? dev->name : "none?");
+		seq_printf(seq, "Encap:%s", clip->encap ? "LLC/SNAP" : "None");
+	}
+	seq_putc(seq, '\n');
+}
+
+static const char *vcc_state(struct atm_vcc *vcc)
+{
+	static const char *const map[] = { ATM_VS2TXT_MAP };
+
+	return map[ATM_VF2VS(vcc->flags)];
+}
+
+static void vcc_info(struct seq_file *seq, struct atm_vcc *vcc)
+{
+	struct sock *sk = sk_atm(vcc);
+
+	seq_printf(seq, "%pK ", vcc);
+	if (!vcc->dev)
+		seq_printf(seq, "Unassigned    ");
+	else
+		seq_printf(seq, "%3d %3d %5d ", vcc->dev->number, vcc->vpi,
+			vcc->vci);
+	switch (sk->sk_family) {
+	case AF_ATMPVC:
+		seq_printf(seq, "PVC");
+		break;
+	case AF_ATMSVC:
+		seq_printf(seq, "SVC");
+		break;
+	default:
+		seq_printf(seq, "%3d", sk->sk_family);
+	}
+	seq_printf(seq, " %04lx  %5d %7d/%7d %7d/%7d [%d]\n",
+		   vcc->flags, sk->sk_err,
+		   sk_wmem_alloc_get(sk), sk->sk_sndbuf,
+		   sk_rmem_alloc_get(sk), sk->sk_rcvbuf,
+		   atomic_read(&sk->sk_refcnt));
+}
+
+static void svc_info(struct seq_file *seq, struct atm_vcc *vcc)
+{
+	if (!vcc->dev)
+		seq_printf(seq, sizeof(void *) == 4 ?
+			   "N/A@%pK%10s" : "N/A@%pK%2s", vcc, "");
+	else
+		seq_printf(seq, "%3d %3d %5d         ",
+			   vcc->dev->number, vcc->vpi, vcc->vci);
+	seq_printf(seq, "%-10s ", vcc_state(vcc));
+	seq_printf(seq, "%s%s", vcc->remote.sas_addr.pub,
+	    *vcc->remote.sas_addr.pub && *vcc->remote.sas_addr.prv ? "+" : "");
+	if (*vcc->remote.sas_addr.prv) {
+		int i;
+
+		for (i = 0; i < ATM_ESA_LEN; i++)
+			seq_printf(seq, "%02x", vcc->remote.sas_addr.prv[i]);
+	}
+	seq_putc(seq, '\n');
+}
+
+static int atm_dev_seq_show(struct seq_file *seq, void *v)
+{
+	static char atm_dev_banner[] =
+		"Itf Type    ESI/\"MAC\"addr "
+		"AAL(TX,err,RX,err,drop) ...               [refcnt]\n";
+
+	if (v == &atm_devs)
+		seq_puts(seq, atm_dev_banner);
+	else {
+		struct atm_dev *dev = list_entry(v, struct atm_dev, dev_list);
+
+		atm_dev_info(seq, dev);
+	}
+	return 0;
+}
+
+static const struct seq_operations atm_dev_seq_ops = {
+	.start	= atm_dev_seq_start,
+	.next	= atm_dev_seq_next,
+	.stop	= atm_dev_seq_stop,
+	.show	= atm_dev_seq_show,
+};
+
+static int atm_dev_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &atm_dev_seq_ops);
+}
+
+static const struct file_operations devices_seq_fops = {
+	.open		= atm_dev_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int pvc_seq_show(struct seq_file *seq, void *v)
+{
+	static char atm_pvc_banner[] =
+		"Itf VPI VCI   AAL RX(PCR,Class) TX(PCR,Class)\n";
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, atm_pvc_banner);
+	else {
+		struct vcc_state *state = seq->private;
+		struct atm_vcc *vcc = atm_sk(state->sk);
+
+		pvc_info(seq, vcc);
+	}
+	return 0;
+}
+
+static const struct seq_operations pvc_seq_ops = {
+	.start	= vcc_seq_start,
+	.next	= vcc_seq_next,
+	.stop	= vcc_seq_stop,
+	.show	= pvc_seq_show,
+};
+
+static int pvc_seq_open(struct inode *inode, struct file *file)
+{
+	return __vcc_seq_open(inode, file, PF_ATMPVC, &pvc_seq_ops);
+}
+
+static const struct file_operations pvc_seq_fops = {
+	.open		= pvc_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int vcc_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, sizeof(void *) == 4 ? "%-8s%s" : "%-16s%s",
+			"Address ", "Itf VPI VCI   Fam Flags Reply "
+			"Send buffer     Recv buffer      [refcnt]\n");
+	} else {
+		struct vcc_state *state = seq->private;
+		struct atm_vcc *vcc = atm_sk(state->sk);
+
+		vcc_info(seq, vcc);
+	}
+	return 0;
+}
+
+static const struct seq_operations vcc_seq_ops = {
+	.start	= vcc_seq_start,
+	.next	= vcc_seq_next,
+	.stop	= vcc_seq_stop,
+	.show	= vcc_seq_show,
+};
+
+static int vcc_seq_open(struct inode *inode, struct file *file)
+{
+	return __vcc_seq_open(inode, file, 0, &vcc_seq_ops);
+}
+
+static const struct file_operations vcc_seq_fops = {
+	.open		= vcc_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int svc_seq_show(struct seq_file *seq, void *v)
+{
+	static const char atm_svc_banner[] =
+		"Itf VPI VCI           State      Remote\n";
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, atm_svc_banner);
+	else {
+		struct vcc_state *state = seq->private;
+		struct atm_vcc *vcc = atm_sk(state->sk);
+
+		svc_info(seq, vcc);
+	}
+	return 0;
+}
+
+static const struct seq_operations svc_seq_ops = {
+	.start	= vcc_seq_start,
+	.next	= vcc_seq_next,
+	.stop	= vcc_seq_stop,
+	.show	= svc_seq_show,
+};
+
+static int svc_seq_open(struct inode *inode, struct file *file)
+{
+	return __vcc_seq_open(inode, file, PF_ATMSVC, &svc_seq_ops);
+}
+
+static const struct file_operations svc_seq_fops = {
+	.open		= svc_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *pos)
+{
+	struct atm_dev *dev;
+	unsigned long page;
+	int length;
+
+	if (count == 0)
+		return 0;
+	page = get_zeroed_page(GFP_KERNEL);	/* scratch buffer for proc_read */
+	if (!page)
+		return -ENOMEM;
+	dev = PDE(file->f_path.dentry->d_inode)->data;
+	if (!dev->ops->proc_read)
+		length = -EINVAL;
+	else {
+		length = dev->ops->proc_read(dev, pos, (char *)page);
+		if (length > count)	/* size_t compare: < 0 matches too */
+			length = -EINVAL;
+	}
+	if (length >= 0) {
+		if (copy_to_user(buf, (char *)page, length))
+			length = -EFAULT;
+		(*pos)++;	/* advanced even if the copy-out failed */
+	}
+	free_page(page);
+	return length;
+}
+
+struct proc_dir_entry *atm_proc_root;
+EXPORT_SYMBOL(atm_proc_root);
+
+
+int atm_proc_dev_register(struct atm_dev *dev)
+{
+	int error;
+
+	/* No proc info */
+	if (!dev->ops->proc_read)
+		return 0;
+
+	error = -ENOMEM;
+	dev->proc_name = kasprintf(GFP_KERNEL, "%s:%d", dev->type, dev->number);
+	if (!dev->proc_name)
+		goto err_out;
+
+	dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root,
+					   &proc_atm_dev_ops, dev);
+	if (!dev->proc_entry)
+		goto err_free_name;
+	return 0;
+
+err_free_name:
+	kfree(dev->proc_name);
+err_out:
+	return error;
+}
+
+void atm_proc_dev_deregister(struct atm_dev *dev)
+{
+	if (!dev->ops->proc_read)
+		return;
+
+	remove_proc_entry(dev->proc_name, atm_proc_root);
+	kfree(dev->proc_name);
+}
+
+static struct atm_proc_entry {
+	char *name;
+	const struct file_operations *proc_fops;
+	struct proc_dir_entry *dirent;
+} atm_proc_ents[] = {
+	{ .name = "devices",	.proc_fops = &devices_seq_fops },
+	{ .name = "pvc",	.proc_fops = &pvc_seq_fops },
+	{ .name = "svc",	.proc_fops = &svc_seq_fops },
+	{ .name = "vc",		.proc_fops = &vcc_seq_fops },
+	{ .name = NULL,		.proc_fops = NULL }
+};
+
+static void atm_proc_dirs_remove(void)
+{
+	struct atm_proc_entry *e;	/* iterator only; no static storage */
+
+	for (e = atm_proc_ents; e->name; e++) {
+		if (e->dirent)	/* skip entries that were never created */
+			remove_proc_entry(e->name, atm_proc_root);
+	}
+	proc_net_remove(&init_net, "atm");
+}
+
+int __init atm_proc_init(void)
+{
+	struct atm_proc_entry *e;
+	int ret = -ENOMEM;
+
+	/* Create the "atm" directory under the init namespace's proc/net. */
+	atm_proc_root = proc_net_mkdir(&init_net, "atm", init_net.proc_net);
+	if (!atm_proc_root)
+		goto out;
+	/* Populate it from the NULL-name-terminated atm_proc_ents table. */
+	for (e = atm_proc_ents; e->name; e++) {
+		struct proc_dir_entry *dirent;
+
+		dirent = proc_create(e->name, S_IRUGO,
+				     atm_proc_root, e->proc_fops);
+		if (!dirent)
+			goto err_out_remove;
+		e->dirent = dirent;	/* used on removal */
+	}
+	return 0;
+
+err_out_remove:
+	/* Undo the entries created so far together with the directory. */
+	atm_proc_dirs_remove();
+out:
+	return ret;
+}
+
+void atm_proc_exit(void)
+{
+	atm_proc_dirs_remove();
+}
diff --git a/net/atm/protocols.h b/net/atm/protocols.h
new file mode 100644
index 00000000..acdfc856
--- /dev/null
+++ b/net/atm/protocols.h
@@ -0,0 +1,13 @@
+/* net/atm/protocols.h - ATM protocol handler entry points */
+
+/* Written 1995-1997 by Werner Almesberger, EPFL LRC */
+
+
+#ifndef NET_ATM_PROTOCOLS_H
+#define NET_ATM_PROTOCOLS_H
+
+int atm_init_aal0(struct atm_vcc *vcc);	/* "raw" AAL0 */
+int atm_init_aal34(struct atm_vcc *vcc);/* "raw" AAL3/4 transport */
+int atm_init_aal5(struct atm_vcc *vcc);	/* "raw" AAL5 transport */
+
+#endif
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
new file mode 100644
index 00000000..437ee70c
--- /dev/null
+++ b/net/atm/pvc.c
@@ -0,0 +1,160 @@
+/* net/atm/pvc.c - ATM PVC sockets */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#include <linux/net.h>		/* struct socket, struct proto_ops */
+#include <linux/atm.h>		/* ATM stuff */
+#include <linux/atmdev.h>	/* ATM devices */
+#include <linux/errno.h>	/* error codes */
+#include <linux/kernel.h>	/* printk */
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/bitops.h>
+#include <net/sock.h>		/* for sock_no_* */
+
+#include "resources.h"		/* devs and vccs */
+#include "common.h"		/* common for PVCs and SVCs */
+
+
+/* shutdown() on a PVC socket does nothing and always reports success. */
+static int pvc_shutdown(struct socket *sock, int how)
+{
+	return 0;
+}
+
+/*
+ * Bind a PVC socket to the interface/VPI/VCI given in @sockaddr.
+ *
+ * The address must be exactly a struct sockaddr_atmpvc of family
+ * AF_ATMPVC.  The VCC must already have its QoS set (ATM_VF_HASQOS),
+ * otherwise -EBADFD is returned.  For a partially specified VCC
+ * (ATM_VF_PARTIAL) any VPI/VCI already fixed on the VCC overrides the
+ * value supplied by the caller.  The result of vcc_connect() is returned.
+ */
+static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr,
+		    int sockaddr_len)
+{
+	struct sockaddr_atmpvc *pvc_addr = (struct sockaddr_atmpvc *)sockaddr;
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc;
+	int rc = -EBADFD;
+
+	if (sockaddr_len != sizeof(struct sockaddr_atmpvc))
+		return -EINVAL;
+	if (pvc_addr->sap_family != AF_ATMPVC)
+		return -EAFNOSUPPORT;
+
+	lock_sock(sk);
+	vcc = ATM_SD(sock);
+	if (test_bit(ATM_VF_HASQOS, &vcc->flags)) {
+		if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) {
+			if (vcc->vpi != ATM_VPI_UNSPEC)
+				pvc_addr->sap_addr.vpi = vcc->vpi;
+			if (vcc->vci != ATM_VCI_UNSPEC)
+				pvc_addr->sap_addr.vci = vcc->vci;
+		}
+		rc = vcc_connect(sock, pvc_addr->sap_addr.itf,
+				 pvc_addr->sap_addr.vpi,
+				 pvc_addr->sap_addr.vci);
+	}
+	release_sock(sk);
+	return rc;
+}
+
+/* connect() on a PVC socket is identical to bind(); @flags is ignored. */
+static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr,
+		       int sockaddr_len, int flags)
+{
+	return pvc_bind(sock, sockaddr, sockaddr_len);
+}
+
+/* Run vcc_setsockopt() with the socket lock held and return its result. */
+static int pvc_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	rc = vcc_setsockopt(sock, level, optname, optval, optlen);
+	release_sock(sk);
+
+	return rc;
+}
+
+/* Run vcc_getsockopt() with the socket lock held and return its result. */
+static int pvc_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	rc = vcc_getsockopt(sock, level, optname, optval, optlen);
+	release_sock(sk);
+
+	return rc;
+}
+
+/*
+ * Report the address (interface number, VPI, VCI) of a PVC socket.
+ *
+ * Returns -ENOTCONN if the VCC has no device or no address assigned
+ * (ATM_VF_ADDR clear); otherwise fills @sockaddr, sets *@sockaddr_len to
+ * sizeof(struct sockaddr_atmpvc) and returns 0.  @peer is not used.
+ */
+static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr,
+		       int *sockaddr_len, int peer)
+{
+	struct sockaddr_atmpvc *addr;
+	struct atm_vcc *vcc = ATM_SD(sock);
+
+	if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
+		return -ENOTCONN;
+	*sockaddr_len = sizeof(struct sockaddr_atmpvc);
+	addr = (struct sockaddr_atmpvc *)sockaddr;
+	/*
+	 * Clear the whole structure first: only individual fields are set
+	 * below, so any padding would otherwise carry stale kernel memory
+	 * out to the caller.
+	 */
+	memset(addr, 0, sizeof(*addr));
+	addr->sap_family = AF_ATMPVC;
+	addr->sap_addr.itf = vcc->dev->number;
+	addr->sap_addr.vpi = vcc->vpi;
+	addr->sap_addr.vci = vcc->vci;
+	return 0;
+}
+
+/*
+ * Socket operations for PF_ATMPVC.  bind/connect/getname/shutdown and the
+ * sockopt calls are implemented in this file, data transfer, poll, ioctl
+ * and release use the vcc_* helpers, and socketpair/accept/listen/mmap/
+ * sendpage use the sock_no_* stubs.
+ */
+static const struct proto_ops pvc_proto_ops = {
+	.family =	PF_ATMPVC,
+	.owner =	THIS_MODULE,
+
+	.release =	vcc_release,
+	.bind =		pvc_bind,
+	.connect =	pvc_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	pvc_getname,
+	.poll =		vcc_poll,
+	.ioctl =	vcc_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = vcc_compat_ioctl,
+#endif
+	.listen =	sock_no_listen,
+	.shutdown =	pvc_shutdown,
+	.setsockopt =	pvc_setsockopt,
+	.getsockopt =	pvc_getsockopt,
+	.sendmsg =	vcc_sendmsg,
+	.recvmsg =	vcc_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+
+/*
+ * Create a PF_ATMPVC socket.  Only the initial network namespace is
+ * accepted (-EAFNOSUPPORT otherwise).  Installs pvc_proto_ops and returns
+ * the result of vcc_create().  @kern is not used.
+ */
+static int pvc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	if (net != &init_net)
+		return -EAFNOSUPPORT;
+
+	sock->ops = &pvc_proto_ops;
+	return vcc_create(net, sock, protocol, PF_ATMPVC);
+}
+
+/* Protocol family descriptor passed to sock_register() for PF_ATMPVC. */
+static const struct net_proto_family pvc_family_ops = {
+	.family = PF_ATMPVC,
+	.create = pvc_create,
+	.owner = THIS_MODULE,
+};
+
+
+/*
+ *	Initialize the ATM PVC protocol family: register pvc_family_ops with
+ *	the socket layer.  Returns the result of sock_register().
+ */
+
+
+int __init atmpvc_init(void)
+{
+	return sock_register(&pvc_family_ops);
+}
+
+/* Unregister the PF_ATMPVC protocol family from the socket layer. */
+void atmpvc_exit(void)
+{
+	sock_unregister(PF_ATMPVC);
+}
diff --git a/net/atm/raw.c b/net/atm/raw.c
new file mode 100644
index 00000000..b4f7b9ff
--- /dev/null
+++ b/net/atm/raw.c
@@ -0,0 +1,85 @@
+/* net/atm/raw.c - Raw AAL0 and AAL5 transports */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/atmdev.h>
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "common.h"
+#include "protocols.h"
+
+/*
+ * SKB == NULL indicates that the link is being closed
+ */
+
+/*
+ * Receive hook for raw VCCs: append @skb to the socket's receive queue and
+ * signal the socket that data is ready.  A NULL @skb is ignored.
+ */
+static void atm_push_raw(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct sock *sk;
+
+	if (!skb)
+		return;
+
+	sk = sk_atm(vcc);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, skb->len);
+}
+
+/*
+ * Transmit-completion hook for raw VCCs: give the skb's truesize back to
+ * the socket's write-memory accounting, free the skb and notify the socket
+ * that write space may be available.
+ */
+static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct sock *sk = sk_atm(vcc);
+
+	pr_debug("(%d) %d -= %d\n",
+		 vcc->vci, sk_wmem_alloc_get(sk), skb->truesize);
+	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+	dev_kfree_skb_any(skb);
+	sk->sk_write_space(sk);
+}
+
+/*
+ * Send hook for AAL0.  Callers without CAP_NET_ADMIN may only send cells
+ * whose header VPI/VCI bits match the VCC; anything else is freed and
+ * rejected with -EADDRNOTAVAIL.  Accepted cells are handed to the
+ * device's send op.
+ */
+static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	/*
+	 * Note that if vpi/vci are _ANY or _UNSPEC the below will
+	 * still work
+	 */
+	if (!capable(CAP_NET_ADMIN)) {
+		u32 hdr_addr = ((u32 *)skb->data)[0] &
+			       (ATM_HDR_VPI_MASK | ATM_HDR_VCI_MASK);
+		u32 vcc_addr = (vcc->vpi << ATM_HDR_VPI_SHIFT) |
+			       (vcc->vci << ATM_HDR_VCI_SHIFT);
+
+		if (hdr_addr != vcc_addr) {
+			kfree_skb(skb);
+			return -EADDRNOTAVAIL;
+		}
+	}
+	return vcc->dev->ops->send(vcc, skb);
+}
+
+/*
+ * Set up a VCC for raw AAL0: raw push/pop hooks, no OAM handler, and the
+ * header-checking AAL0 send hook.  Always returns 0.
+ */
+int atm_init_aal0(struct atm_vcc *vcc)
+{
+	vcc->send = atm_send_aal0;
+	vcc->push_oam = NULL;
+	vcc->pop = atm_pop_raw;
+	vcc->push = atm_push_raw;
+
+	return 0;
+}
+
+/*
+ * Set up a VCC for raw AAL3/4: raw push/pop hooks, no OAM handler, and the
+ * device's own send op used directly.  Always returns 0.
+ */
+int atm_init_aal34(struct atm_vcc *vcc)
+{
+	vcc->send = vcc->dev->ops->send;
+	vcc->push_oam = NULL;
+	vcc->pop = atm_pop_raw;
+	vcc->push = atm_push_raw;
+
+	return 0;
+}
+
+/*
+ * Set up a VCC for raw AAL5: raw push/pop hooks, no OAM handler, and the
+ * device's own send op used directly.  Always returns 0.
+ */
+int atm_init_aal5(struct atm_vcc *vcc)
+{
+	vcc->send = vcc->dev->ops->send;
+	vcc->push_oam = NULL;
+	vcc->pop = atm_pop_raw;
+	vcc->push = atm_push_raw;
+
+	return 0;
+}
+EXPORT_SYMBOL(atm_init_aal5);
diff --git a/net/atm/resources.c b/net/atm/resources.c
new file mode 100644
index 00000000..23f45ce6
--- /dev/null
+++ b/net/atm/resources.c
@@ -0,0 +1,463 @@
+/* net/atm/resources.c - Statically allocated resources */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+/* Fixes
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * 2002/01 - don't free the whole struct sock on sk->destruct time,
+ * 	     use the default destruct function initialized by sock_init_data */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/atmdev.h>
+#include <linux/sonet.h>
+#include <linux/kernel.h> /* for barrier */
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>	 /* for struct sock */
+
+#include "common.h"
+#include "resources.h"
+#include "addr.h"
+
+
+LIST_HEAD(atm_devs);
+DEFINE_MUTEX(atm_dev_mutex);
+
+/*
+ * Allocate a zeroed atm_dev and set its defaults: the given type string,
+ * unknown PHY signal state, OC3 link rate, an initialised lock and empty
+ * local/LECS address lists.  Returns NULL if the allocation fails.
+ */
+static struct atm_dev *__alloc_atm_dev(const char *type)
+{
+	struct atm_dev *new_dev = kzalloc(sizeof(*new_dev), GFP_KERNEL);
+
+	if (new_dev) {
+		new_dev->type = type;
+		new_dev->signal = ATM_PHY_SIG_UNKNOWN;
+		new_dev->link_rate = ATM_OC3_PCR;
+		spin_lock_init(&new_dev->lock);
+		INIT_LIST_HEAD(&new_dev->local);
+		INIT_LIST_HEAD(&new_dev->lecs);
+	}
+	return new_dev;
+}
+
+/*
+ * Find the device with the given number on atm_devs.  On a match a
+ * reference is taken with atm_dev_hold() and the device is returned;
+ * otherwise NULL.  The callers in this file invoke it with atm_dev_mutex
+ * held.
+ */
+static struct atm_dev *__atm_dev_lookup(int number)
+{
+	struct atm_dev *dev;
+
+	list_for_each_entry(dev, &atm_devs, dev_list) {
+		if (dev->number != number)
+			continue;
+		atm_dev_hold(dev);
+		return dev;
+	}
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __atm_dev_lookup(): returns the device with the
+ * given number with a reference held, or NULL if there is none.
+ */
+struct atm_dev *atm_dev_lookup(int number)
+{
+	struct atm_dev *dev;
+
+	mutex_lock(&atm_dev_mutex);
+	dev = __atm_dev_lookup(number);
+	mutex_unlock(&atm_dev_mutex);
+	return dev;
+}
+EXPORT_SYMBOL(atm_dev_lookup);
+
+/*
+ * Allocate and register a new ATM device.
+ *
+ * @type:   type string stored in the device
+ * @parent: passed through to atm_register_sysfs()
+ * @ops:    device operations
+ * @number: requested device number, or -1 to take the lowest unused
+ *          number starting at 0
+ * @flags:  optional initial flags word; cleared if NULL
+ *
+ * Returns the new device with a reference count of 1, or NULL if the
+ * allocation fails, the requested number is already in use, or the proc or
+ * sysfs registration fails.
+ */
+struct atm_dev *atm_dev_register(const char *type, struct device *parent,
+				 const struct atmdev_ops *ops, int number,
+				 unsigned long *flags)
+{
+	struct atm_dev *dev, *inuse;
+
+	dev = __alloc_atm_dev(type);
+	if (!dev) {
+		pr_err("no space for dev %s\n", type);
+		return NULL;
+	}
+	mutex_lock(&atm_dev_mutex);
+	if (number != -1) {
+		inuse = __atm_dev_lookup(number);
+		if (inuse) {
+			/* drop the reference the lookup took */
+			atm_dev_put(inuse);
+			mutex_unlock(&atm_dev_mutex);
+			kfree(dev);
+			return NULL;
+		}
+		dev->number = number;
+	} else {
+		/* probe upwards from 0 until a free number is found */
+		dev->number = 0;
+		while ((inuse = __atm_dev_lookup(dev->number))) {
+			atm_dev_put(inuse);
+			dev->number++;
+		}
+	}
+
+	dev->ops = ops;
+	if (flags)
+		dev->flags = *flags;
+	else
+		memset(&dev->flags, 0, sizeof(dev->flags));
+	memset(&dev->stats, 0, sizeof(dev->stats));
+	atomic_set(&dev->refcnt, 1);
+
+	if (atm_proc_dev_register(dev) < 0) {
+		pr_err("atm_proc_dev_register failed for dev %s\n", type);
+		goto out_fail;
+	}
+
+	if (atm_register_sysfs(dev, parent) < 0) {
+		pr_err("atm_register_sysfs failed for dev %s\n", type);
+		/* undo the proc registration done just above */
+		atm_proc_dev_deregister(dev);
+		goto out_fail;
+	}
+
+	/* only now does the device become visible to lookups */
+	list_add_tail(&dev->dev_list, &atm_devs);
+
+out:
+	mutex_unlock(&atm_dev_mutex);
+	return dev;
+
+out_fail:
+	kfree(dev);
+	dev = NULL;
+	goto out;
+}
+EXPORT_SYMBOL(atm_dev_register);
+
+/*
+ * Unregister an ATM device: mark it removed, take it off atm_devs, release
+ * its VCCs, remove its sysfs and proc entries, and drop one reference.
+ * Calling this on a device already marked ATM_DF_REMOVED is a BUG.
+ */
+void atm_dev_deregister(struct atm_dev *dev)
+{
+	BUG_ON(test_bit(ATM_DF_REMOVED, &dev->flags));
+	set_bit(ATM_DF_REMOVED, &dev->flags);
+
+	/*
+	 * Once the current device is removed from the atm_devs list, a new
+	 * device with the same number can appear, so we need to deregister
+	 * proc, asynchronously release all vccs and remove them from the
+	 * vccs list too.
+	 */
+	mutex_lock(&atm_dev_mutex);
+	list_del(&dev->dev_list);
+	mutex_unlock(&atm_dev_mutex);
+
+	atm_dev_release_vccs(dev);
+	atm_unregister_sysfs(dev);
+	atm_proc_dev_deregister(dev);
+
+	atm_dev_put(dev);
+}
+EXPORT_SYMBOL(atm_dev_deregister);
+
+/*
+ * Snapshot the atomic counters in @from into the plain structure @to.
+ * __AAL_STAT_ITEMS is defined outside this file; it is expected to invoke
+ * __HANDLE_ITEM() once per counter field.
+ */
+static void copy_aal_stats(struct k_atm_aal_stats *from,
+    struct atm_aal_stats *to)
+{
+#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i)
+	__AAL_STAT_ITEMS
+#undef __HANDLE_ITEM
+}
+
+/*
+ * Subtract the values in @to from the atomic counters in @from, field by
+ * field.  Note the direction: @from is the structure being modified.
+ * __AAL_STAT_ITEMS is defined outside this file; it is expected to invoke
+ * __HANDLE_ITEM() once per counter field.
+ */
+static void subtract_aal_stats(struct k_atm_aal_stats *from,
+    struct atm_aal_stats *to)
+{
+#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i)
+	__AAL_STAT_ITEMS
+#undef __HANDLE_ITEM
+}
+
+/*
+ * Snapshot the AAL0/AAL3/4/AAL5 statistics of @dev and, if @arg is
+ * non-NULL, copy the snapshot to user space.  When @zero is set and the
+ * copy did not fail, the snapshot is subtracted from the live counters, so
+ * that increments made since the snapshot are kept.
+ *
+ * Returns 0 on success or -EFAULT if the copy to user space failed.
+ */
+static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg,
+		       int zero)
+{
+	struct atm_dev_stats snap;
+
+	copy_aal_stats(&dev->stats.aal0, &snap.aal0);
+	copy_aal_stats(&dev->stats.aal34, &snap.aal34);
+	copy_aal_stats(&dev->stats.aal5, &snap.aal5);
+
+	if (arg && copy_to_user(arg, &snap, sizeof(snap)))
+		return -EFAULT;
+
+	if (zero) {
+		subtract_aal_stats(&dev->stats.aal0, &snap.aal0);
+		subtract_aal_stats(&dev->stats.aal34, &snap.aal34);
+		subtract_aal_stats(&dev->stats.aal5, &snap.aal5);
+	}
+	return 0;
+}
+
+/*
+ * Handle ATM device ioctls.
+ *
+ * ATM_GETNAMES copies the numbers of all registered devices into the
+ * buffer described by the user's atm_iobuf and writes back the number of
+ * bytes used.  Every other command takes an atmif_sioc naming a device
+ * number; the device is looked up (requesting module "atm-device-%d" if
+ * it is not found) and the command is either handled here or passed on to
+ * the driver's ioctl/compat_ioctl hook.
+ *
+ * @cmd:    ioctl command
+ * @arg:    user pointer to an atm_iobuf or atmif_sioc (or the compat
+ *          variant of either)
+ * @compat: non-zero to use the compat structure layouts; forced to 0 when
+ *          CONFIG_COMPAT is not set
+ *
+ * Returns 0 or a positive value on success (for example ESI_LEN for
+ * ATM_SETESI/ATM_SETESIF) and a negative errno on failure.
+ */
+int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
+{
+	void __user *buf;
+	int error, len, number, size = 0;
+	struct atm_dev *dev;
+	struct list_head *p;
+	int *tmp_buf, *tmp_p;
+	int __user *sioc_len;
+	int __user *iobuf_len;
+
+#ifndef CONFIG_COMPAT
+	compat = 0; /* Just so the compiler _knows_ */
+#endif
+
+	switch (cmd) {
+	case ATM_GETNAMES:
+		if (compat) {
+#ifdef CONFIG_COMPAT
+			struct compat_atm_iobuf __user *ciobuf = arg;
+			compat_uptr_t cbuf;
+			iobuf_len = &ciobuf->length;
+			if (get_user(cbuf, &ciobuf->buffer))
+				return -EFAULT;
+			buf = compat_ptr(cbuf);
+#endif
+		} else {
+			struct atm_iobuf __user *iobuf = arg;
+			iobuf_len = &iobuf->length;
+			if (get_user(buf, &iobuf->buffer))
+				return -EFAULT;
+		}
+		if (get_user(len, iobuf_len))
+			return -EFAULT;
+		mutex_lock(&atm_dev_mutex);
+		/* first pass: one int per registered device */
+		list_for_each(p, &atm_devs)
+			size += sizeof(int);
+		if (size > len) {
+			mutex_unlock(&atm_dev_mutex);
+			return -E2BIG;
+		}
+		tmp_buf = kmalloc(size, GFP_ATOMIC);
+		if (!tmp_buf) {
+			mutex_unlock(&atm_dev_mutex);
+			return -ENOMEM;
+		}
+		/* second pass: collect the numbers while the list is stable */
+		tmp_p = tmp_buf;
+		list_for_each(p, &atm_devs) {
+			dev = list_entry(p, struct atm_dev, dev_list);
+			*tmp_p++ = dev->number;
+		}
+		mutex_unlock(&atm_dev_mutex);
+		/* copy out after dropping the mutex */
+		error = ((copy_to_user(buf, tmp_buf, size)) ||
+			 put_user(size, iobuf_len))
+			? -EFAULT : 0;
+		kfree(tmp_buf);
+		return error;
+	default:
+		break;
+	}
+
+	/* all remaining commands address one device through an atmif_sioc */
+	if (compat) {
+#ifdef CONFIG_COMPAT
+		struct compat_atmif_sioc __user *csioc = arg;
+		compat_uptr_t carg;
+
+		sioc_len = &csioc->length;
+		if (get_user(carg, &csioc->arg))
+			return -EFAULT;
+		buf = compat_ptr(carg);
+
+		if (get_user(len, &csioc->length))
+			return -EFAULT;
+		if (get_user(number, &csioc->number))
+			return -EFAULT;
+#endif
+	} else {
+		struct atmif_sioc __user *sioc = arg;
+
+		sioc_len = &sioc->length;
+		if (get_user(buf, &sioc->arg))
+			return -EFAULT;
+		if (get_user(len, &sioc->length))
+			return -EFAULT;
+		if (get_user(number, &sioc->number))
+			return -EFAULT;
+	}
+
+	/* holds a device reference from here on; dropped at "done" */
+	dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d",
+				      number);
+	if (!dev)
+		return -ENODEV;
+
+	switch (cmd) {
+	case ATM_GETTYPE:
+		size = strlen(dev->type) + 1;
+		if (copy_to_user(buf, dev->type, size)) {
+			error = -EFAULT;
+			goto done;
+		}
+		break;
+	case ATM_GETESI:
+		size = ESI_LEN;
+		if (copy_to_user(buf, dev->esi, size)) {
+			error = -EFAULT;
+			goto done;
+		}
+		break;
+	case ATM_SETESI:
+	{
+		int i;
+
+		/* refuse to overwrite an ESI that has any non-zero byte */
+		for (i = 0; i < ESI_LEN; i++)
+			if (dev->esi[i]) {
+				error = -EEXIST;
+				goto done;
+			}
+	}
+	/* fall through */
+	case ATM_SETESIF:
+	{
+		unsigned char esi[ESI_LEN];
+
+		if (!capable(CAP_NET_ADMIN)) {
+			error = -EPERM;
+			goto done;
+		}
+		if (copy_from_user(esi, buf, ESI_LEN)) {
+			error = -EFAULT;
+			goto done;
+		}
+		memcpy(dev->esi, esi, ESI_LEN);
+		error =  ESI_LEN;
+		goto done;
+	}
+	case ATM_GETSTATZ:
+		if (!capable(CAP_NET_ADMIN)) {
+			error = -EPERM;
+			goto done;
+		}
+		/* fall through */
+	case ATM_GETSTAT:
+		size = sizeof(struct atm_dev_stats);
+		error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ);
+		if (error)
+			goto done;
+		break;
+	case ATM_GETCIRANGE:
+		size = sizeof(struct atm_cirange);
+		if (copy_to_user(buf, &dev->ci_range, size)) {
+			error = -EFAULT;
+			goto done;
+		}
+		break;
+	case ATM_GETLINKRATE:
+		size = sizeof(int);
+		if (copy_to_user(buf, &dev->link_rate, size)) {
+			error = -EFAULT;
+			goto done;
+		}
+		break;
+	case ATM_RSTADDR:
+		if (!capable(CAP_NET_ADMIN)) {
+			error = -EPERM;
+			goto done;
+		}
+		atm_reset_addr(dev, ATM_ADDR_LOCAL);
+		break;
+	case ATM_ADDADDR:
+	case ATM_DELADDR:
+	case ATM_ADDLECSADDR:
+	case ATM_DELLECSADDR:
+	{
+		struct sockaddr_atmsvc addr;
+
+		if (!capable(CAP_NET_ADMIN)) {
+			error = -EPERM;
+			goto done;
+		}
+
+		if (copy_from_user(&addr, buf, sizeof(addr))) {
+			error = -EFAULT;
+			goto done;
+		}
+		if (cmd == ATM_ADDADDR || cmd == ATM_ADDLECSADDR)
+			error = atm_add_addr(dev, &addr,
+					     (cmd == ATM_ADDADDR ?
+					      ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+		else
+			error = atm_del_addr(dev, &addr,
+					     (cmd == ATM_DELADDR ?
+					      ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+		goto done;
+	}
+	case ATM_GETADDR:
+	case ATM_GETLECSADDR:
+		error = atm_get_addr(dev, buf, len,
+				     (cmd == ATM_GETADDR ?
+				      ATM_ADDR_LOCAL : ATM_ADDR_LECS));
+		if (error < 0)
+			goto done;
+		size = error;
+		/* may return 0, but later on size == 0 means "don't
+		   write the length" */
+		error = put_user(size, sioc_len) ? -EFAULT : 0;
+		goto done;
+	case ATM_SETLOOP:
+		if (__ATM_LM_XTRMT((int) (unsigned long) buf) &&
+		    __ATM_LM_XTLOC((int) (unsigned long) buf) >
+		    __ATM_LM_XTRMT((int) (unsigned long) buf)) {
+			error = -EINVAL;
+			goto done;
+		}
+		/* fall through */
+	case ATM_SETCIRANGE:
+	case SONET_GETSTATZ:
+	case SONET_SETDIAG:
+	case SONET_CLRDIAG:
+	case SONET_SETFRAMING:
+		if (!capable(CAP_NET_ADMIN)) {
+			error = -EPERM;
+			goto done;
+		}
+		/* fall through */
+	default:
+		/* everything not handled above goes to the driver */
+		if (compat) {
+#ifdef CONFIG_COMPAT
+			if (!dev->ops->compat_ioctl) {
+				error = -EINVAL;
+				goto done;
+			}
+			size = dev->ops->compat_ioctl(dev, cmd, buf);
+#endif
+		} else {
+			if (!dev->ops->ioctl) {
+				error = -EINVAL;
+				goto done;
+			}
+			size = dev->ops->ioctl(dev, cmd, buf);
+		}
+		if (size < 0) {
+			error = (size == -ENOIOCTLCMD ? -EINVAL : size);
+			goto done;
+		}
+	}
+
+	/* a non-zero size is reported back through the sioc length field */
+	if (size)
+		error = put_user(size, sioc_len) ? -EFAULT : 0;
+	else
+		error = 0;
+done:
+	atm_dev_put(dev);
+	return error;
+}
+
+/*
+ * seq_file start hook for the device list: takes atm_dev_mutex (released
+ * in atm_dev_seq_stop()) and positions the iterator at *pos.
+ */
+void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	mutex_lock(&atm_dev_mutex);
+	return seq_list_start_head(&atm_devs, *pos);
+}
+
+/* seq_file stop hook: releases atm_dev_mutex taken in atm_dev_seq_start(). */
+void atm_dev_seq_stop(struct seq_file *seq, void *v)
+{
+	mutex_unlock(&atm_dev_mutex);
+}
+
+/* seq_file next hook: advance to the following element of atm_devs. */
+void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &atm_devs, pos);
+}
diff --git a/net/atm/resources.h b/net/atm/resources.h
new file mode 100644
index 00000000..521431e3
--- /dev/null
+++ b/net/atm/resources.h
@@ -0,0 +1,47 @@
+/* net/atm/resources.h - ATM-related resources */
+
+/* Written 1995-1998 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_RESOURCES_H
+#define NET_ATM_RESOURCES_H
+
+#include <linux/atmdev.h>
+#include <linux/mutex.h>
+
+
+extern struct list_head atm_devs;
+extern struct mutex atm_dev_mutex;
+
+int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat);
+
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/proc_fs.h>
+
+void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos);
+void atm_dev_seq_stop(struct seq_file *seq, void *v);
+void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+
+
+int atm_proc_dev_register(struct atm_dev *dev);
+void atm_proc_dev_deregister(struct atm_dev *dev);
+
+#else
+
+/* Without CONFIG_PROC_FS there is nothing to register; report success. */
+static inline int atm_proc_dev_register(struct atm_dev *dev)
+{
+	return 0;
+}
+
+/* Without CONFIG_PROC_FS there is no proc entry to remove. */
+static inline void atm_proc_dev_deregister(struct atm_dev *dev)
+{
+	/* nothing */
+}
+
+#endif /* CONFIG_PROC_FS */
+
+int atm_register_sysfs(struct atm_dev *adev, struct device *parent);
+void atm_unregister_sysfs(struct atm_dev *adev);
+#endif
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
new file mode 100644
index 00000000..509c8ac0
--- /dev/null
+++ b/net/atm/signaling.c
@@ -0,0 +1,269 @@
+/* net/atm/signaling.c - ATM signaling */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/errno.h>	/* error codes */
+#include <linux/kernel.h>	/* printk */
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/sched.h>	/* jiffies and HZ */
+#include <linux/atm.h>		/* ATM stuff */
+#include <linux/atmsap.h>
+#include <linux/atmsvc.h>
+#include <linux/atmdev.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+
+#include "resources.h"
+#include "signaling.h"
+
+#undef WAIT_FOR_DEMON		/* #define this if system calls on SVC sockets
+				   should block until the demon runs.
+				   Danger: may cause nasty hangs if the demon
+				   crashes. */
+
+struct atm_vcc *sigd = NULL;
+#ifdef WAIT_FOR_DEMON
+static DECLARE_WAIT_QUEUE_HEAD(sigd_sleep);
+#endif
+
+/*
+ * Queue @skb on the signaling daemon's receive queue and notify the
+ * daemon's socket.
+ *
+ * With WAIT_FOR_DEMON defined this blocks uninterruptibly until a daemon
+ * has attached.  Otherwise (the default, see the #undef above) the skb is
+ * freed and dropped when no daemon is attached.
+ */
+static void sigd_put_skb(struct sk_buff *skb)
+{
+#ifdef WAIT_FOR_DEMON
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&sigd_sleep, &wait);
+	while (!sigd) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		pr_debug("atmsvc: waiting for signaling daemon...\n");
+		schedule();
+	}
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&sigd_sleep, &wait);
+#else
+	if (!sigd) {
+		pr_debug("atmsvc: no signaling daemon\n");
+		kfree_skb(skb);
+		return;
+	}
+#endif
+	/* charge the skb to the daemon's socket before queueing it */
+	atm_force_charge(sigd, skb->truesize);
+	skb_queue_tail(&sk_atm(sigd)->sk_receive_queue, skb);
+	sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len);
+}
+
+/*
+ * Handle an as_modify request for @vcc.  Ignored if the VCC has been
+ * released or is not ready.  Otherwise the device's change_qos op is
+ * called (or -EOPNOTSUPP is used when the op is missing), and @msg is
+ * rewritten into an as_okay/as_error reply that is copied into a fresh
+ * skb and queued for the signaling daemon.
+ */
+static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
+{
+	struct sk_buff *skb;
+
+	if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
+	    !test_bit(ATM_VF_READY, &vcc->flags))
+		return;
+	/* assume failure; switched to as_okay below on a zero reply */
+	msg->type = as_error;
+	if (!vcc->dev->ops->change_qos)
+		msg->reply = -EOPNOTSUPP;
+	else {
+		/* should lock VCC */
+		msg->reply = vcc->dev->ops->change_qos(vcc, &msg->qos,
+						       msg->reply);
+		if (!msg->reply)
+			msg->type = as_okay;
+	}
+	/*
+	 * Should probably just turn around the old skb. But then, the buffer
+	 * space accounting needs to follow the change too. Maybe later.
+	 */
+	/* retry the allocation until it succeeds, yielding in between */
+	while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
+		schedule();
+	*(struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg)) = *msg;
+	sigd_put_skb(skb);
+}
+
+/*
+ * Send op of the signaling pseudo-device: process one message written by
+ * the signaling daemon.
+ *
+ * @vcc: the daemon's VCC; used only to uncharge the skb from its write
+ *       accounting, then replaced by the VCC pointer carried in the message
+ * @skb: buffer holding a struct atmsvc_msg
+ *
+ * The skb is consumed on every path: it is either freed here or, for an
+ * accepted as_indicate, queued on the listening socket.  Returns 0, or
+ * -EINVAL for an unknown message type.
+ */
+static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct atmsvc_msg *msg;
+	struct atm_vcc *session_vcc;
+	struct sock *sk;
+
+	msg = (struct atmsvc_msg *) skb->data;
+	atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+	/* from here on "vcc" is the VCC the message refers to */
+	vcc = *(struct atm_vcc **) &msg->vcc;
+	pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc);
+	sk = sk_atm(vcc);
+
+	switch (msg->type) {
+	case as_okay:
+		sk->sk_err = -msg->reply;
+		clear_bit(ATM_VF_WAITING, &vcc->flags);
+		/* adopt the daemon's local address if none is set yet */
+		if (!*vcc->local.sas_addr.prv && !*vcc->local.sas_addr.pub) {
+			vcc->local.sas_family = AF_ATMSVC;
+			memcpy(vcc->local.sas_addr.prv,
+			       msg->local.sas_addr.prv, ATM_ESA_LEN);
+			memcpy(vcc->local.sas_addr.pub,
+			       msg->local.sas_addr.pub, ATM_E164_LEN + 1);
+		}
+		session_vcc = vcc->session ? vcc->session : vcc;
+		/* keep an already assigned VPI/VCI */
+		if (session_vcc->vpi || session_vcc->vci)
+			break;
+		session_vcc->itf = msg->pvc.sap_addr.itf;
+		session_vcc->vpi = msg->pvc.sap_addr.vpi;
+		session_vcc->vci = msg->pvc.sap_addr.vci;
+		if (session_vcc->vpi || session_vcc->vci)
+			session_vcc->qos = msg->qos;
+		break;
+	case as_error:
+		clear_bit(ATM_VF_REGIS, &vcc->flags);
+		clear_bit(ATM_VF_READY, &vcc->flags);
+		sk->sk_err = -msg->reply;
+		clear_bit(ATM_VF_WAITING, &vcc->flags);
+		break;
+	case as_indicate:
+		/* incoming call: the target is the listening VCC */
+		vcc = *(struct atm_vcc **)&msg->listen_vcc;
+		sk = sk_atm(vcc);
+		pr_debug("as_indicate!!!\n");
+		lock_sock(sk);
+		if (sk_acceptq_is_full(sk)) {
+			sigd_enq(NULL, as_reject, vcc, NULL, NULL);
+			dev_kfree_skb(skb);
+			goto as_indicate_complete;
+		}
+		sk->sk_ack_backlog++;
+		/* ownership of the skb passes to the listen queue */
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk));
+		sk->sk_state_change(sk);
+as_indicate_complete:
+		release_sock(sk);
+		return 0;
+	case as_close:
+		set_bit(ATM_VF_RELEASED, &vcc->flags);
+		vcc_release_async(vcc, msg->reply);
+		goto out;
+	case as_modify:
+		modify_qos(vcc, msg);
+		break;
+	case as_addparty:
+	case as_dropparty:
+		sk->sk_err_soft = msg->reply;
+					/* < 0 failure, otherwise ep_ref */
+		clear_bit(ATM_VF_WAITING, &vcc->flags);
+		break;
+	default:
+		pr_alert("bad message type %d\n", (int)msg->type);
+		/* the skb is ours to release even when rejecting it */
+		dev_kfree_skb(skb);
+		return -EINVAL;
+	}
+	sk->sk_state_change(sk);
+out:
+	dev_kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Build an atmsvc_msg and queue it for the signaling daemon.
+ *
+ * The message is zeroed and then filled from whichever of @qos, @svc and
+ * @pvc are non-NULL; when @vcc is non-NULL its sap and local address are
+ * copied too.  An as_connect on a VCC with ATM_VF_SESSION set is given the
+ * next session number.  After queueing, ATM_VF_REGIS is set on @vcc (if
+ * any).  The skb allocation is retried until it succeeds.
+ */
+void sigd_enq2(struct atm_vcc *vcc, enum atmsvc_msg_type type,
+	       struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc,
+	       const struct sockaddr_atmsvc *svc, const struct atm_qos *qos,
+	       int reply)
+{
+	static unsigned session = 0;
+	struct atmsvc_msg *msg;
+	struct sk_buff *skb;
+
+	pr_debug("%d (0x%p)\n", (int)type, vcc);
+	for (;;) {
+		skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL);
+		if (skb)
+			break;
+		schedule();
+	}
+	msg = (struct atmsvc_msg *)skb_put(skb, sizeof(struct atmsvc_msg));
+	memset(msg, 0, sizeof(*msg));
+
+	msg->type = type;
+	msg->reply = reply;
+	*(struct atm_vcc **) &msg->vcc = vcc;
+	*(struct atm_vcc **) &msg->listen_vcc = listen_vcc;
+
+	if (qos)
+		msg->qos = *qos;
+	if (svc)
+		msg->svc = *svc;
+	if (pvc)
+		msg->pvc = *pvc;
+	if (vcc) {
+		msg->sap = vcc->sap;
+		msg->local = vcc->local;
+		/* every new pmp connect gets the next session number */
+		if (type == as_connect && test_bit(ATM_VF_SESSION, &vcc->flags))
+			msg->session = ++session;
+	}
+
+	sigd_put_skb(skb);
+	if (vcc)
+		set_bit(ATM_VF_REGIS, &vcc->flags);
+}
+
+/*
+ * Convenience wrapper around sigd_enq2(): uses the QoS of @vcc (or none
+ * when @vcc is NULL) and a reply value of 0.
+ */
+void sigd_enq(struct atm_vcc *vcc, enum atmsvc_msg_type type,
+	      struct atm_vcc *listen_vcc, const struct sockaddr_atmpvc *pvc,
+	      const struct sockaddr_atmsvc *svc)
+{
+	sigd_enq2(vcc, type, listen_vcc, pvc, svc, vcc ? &vcc->qos : NULL, 0);
+	/* other ISP applications may use "reply" */
+}
+
+/*
+ * Asynchronously release @vcc with -EUNATCH if it is an SVC socket and not
+ * a meta VCC (ATM_VF_META clear); any other VCC is left untouched.
+ */
+static void purge_vcc(struct atm_vcc *vcc)
+{
+	if (sk_atm(vcc)->sk_family != PF_ATMSVC)
+		return;
+	if (test_bit(ATM_VF_META, &vcc->flags))
+		return;
+
+	set_bit(ATM_VF_RELEASED, &vcc->flags);
+	clear_bit(ATM_VF_REGIS, &vcc->flags);
+	vcc_release_async(vcc, -EUNATCH);
+}
+
+/*
+ * Close op of the signaling pseudo-device, run when the daemon's VCC is
+ * closed: clear the global sigd pointer, discard any requests still
+ * queued for the daemon (logging an error if there were any) and purge
+ * every VCC in the hash table under vcc_sklist_lock.
+ */
+static void sigd_close(struct atm_vcc *vcc)
+{
+	struct sk_buff_head *pending = &sk_atm(vcc)->sk_receive_queue;
+	struct hlist_node *node;
+	struct sock *s;
+	int bucket;
+
+	pr_debug("\n");
+	sigd = NULL;
+	if (skb_peek(pending))
+		pr_err("closing with requests pending\n");
+	skb_queue_purge(pending);
+
+	read_lock(&vcc_sklist_lock);
+	for (bucket = 0; bucket < VCC_HTABLE_SIZE; ++bucket) {
+		sk_for_each(s, node, &vcc_hash[bucket])
+			purge_vcc(atm_sk(s));
+	}
+	read_unlock(&vcc_sklist_lock);
+}
+
+/* Operations of the signaling pseudo-device: only close and send are set. */
+static struct atmdev_ops sigd_dev_ops = {
+	.close = sigd_close,
+	.send =	sigd_send
+};
+
+/*
+ * Statically allocated pseudo-device (type "sig", number 999) that
+ * sigd_attach() assigns to the signaling daemon's VCC.
+ */
+static struct atm_dev sigd_dev = {
+	.ops =		&sigd_dev_ops,
+	.type =		"sig",
+	.number =	999,
+	.lock =		__SPIN_LOCK_UNLOCKED(sigd_dev.lock)
+};
+
+/*
+ * Make @vcc the signaling daemon's VCC.  Only one daemon can be attached
+ * at a time: returns -EADDRINUSE if one already is.  Otherwise the VCC is
+ * bound to the signaling pseudo-device, inserted into the socket table
+ * and marked META and READY; returns 0.
+ */
+int sigd_attach(struct atm_vcc *vcc)
+{
+	if (sigd)
+		return -EADDRINUSE;
+	pr_debug("\n");
+	sigd = vcc;
+	vcc->dev = &sigd_dev;
+	vcc_insert_socket(sk_atm(vcc));
+	set_bit(ATM_VF_META, &vcc->flags);
+	set_bit(ATM_VF_READY, &vcc->flags);
+#ifdef WAIT_FOR_DEMON
+	/* release callers blocked in sigd_put_skb() waiting for a daemon */
+	wake_up(&sigd_sleep);
+#endif
+	return 0;
+}
diff --git a/net/atm/signaling.h b/net/atm/signaling.h
new file mode 100644
index 00000000..08b2a69c
--- /dev/null
+++ b/net/atm/signaling.h
@@ -0,0 +1,30 @@
+/* net/atm/signaling.h - ATM signaling */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+
+#ifndef NET_ATM_SIGNALING_H
+#define NET_ATM_SIGNALING_H
+
+#include <linux/atm.h>
+#include <linux/atmdev.h>
+#include <linux/atmsvc.h>
+
+
+extern struct atm_vcc *sigd; /* needed in svc_release */
+
+
+/*
+ * sigd_enq is a wrapper for sigd_enq2, covering the more common cases, and
+ * avoiding huge lists of null values.
+ */
+
+void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+    struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
+    const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply);
+void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type,
+    struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc,
+    const struct sockaddr_atmsvc *svc);
+int sigd_attach(struct atm_vcc *vcc);
+
+#endif
diff --git a/net/atm/svc.c b/net/atm/svc.c
new file mode 100644
index 00000000..754ee479
--- /dev/null
+++ b/net/atm/svc.c
@@ -0,0 +1,691 @@
+/* net/atm/svc.c - ATM SVC sockets */
+
+/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/net.h>		/* struct socket, struct proto_ops */
+#include <linux/errno.h>	/* error codes */
+#include <linux/kernel.h>	/* printk */
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/sched.h>	/* jiffies and HZ */
+#include <linux/fcntl.h>	/* O_NONBLOCK */
+#include <linux/init.h>
+#include <linux/atm.h>		/* ATM stuff */
+#include <linux/atmsap.h>
+#include <linux/atmsvc.h>
+#include <linux/atmdev.h>
+#include <linux/bitops.h>
+#include <net/sock.h>		/* for sock_no_* */
+#include <linux/uaccess.h>
+
+#include "resources.h"
+#include "common.h"		/* common for PVCs and SVCs */
+#include "signaling.h"
+#include "addr.h"
+
+/* Forward declaration: svc_accept() uses it to set up the new socket. */
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern);
+
+/*
+ * Note: since all this is still nicely synchronized with the signaling demon,
+ *       there's no need to protect sleep loops with clis. If signaling is
+ *       moved into the kernel, that would change.
+ */
+
+
+/*
+ * shutdown() handler for SVC sockets: accepted but does nothing.
+ * Both @sock and @how are ignored; always returns 0.
+ */
+static int svc_shutdown(struct socket *sock, int how)
+{
+	return 0;
+}
+
+/*
+ * Tear down the signaling state of @vcc.
+ *
+ * If ATM_VF_REGIS is set, an as_close message is queued and the caller
+ * sleeps uninterruptibly until ATM_VF_RELEASED is set or the signaling
+ * demon goes away.  Every skb still sitting on the socket's receive queue
+ * is then answered with as_reject and freed, and ATM_VF_REGIS is cleared.
+ */
+static void svc_disconnect(struct atm_vcc *vcc)
+{
+	DEFINE_WAIT(wait);
+	struct sk_buff *skb;
+	struct sock *sk = sk_atm(vcc);
+
+	pr_debug("%p\n", vcc);
+	if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
+		/* get on the wait queue before sending, then test the flag */
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+		sigd_enq(vcc, as_close, NULL, NULL, NULL);
+		while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
+			schedule();
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_UNINTERRUPTIBLE);
+		}
+		finish_wait(sk_sleep(sk), &wait);
+	}
+	/* beware - socket is still in use by atmsigd until the last
+	   as_indicate has been answered */
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		/* give back the receive-buffer accounting for this skb */
+		atm_return(vcc, skb->truesize);
+		pr_debug("LISTEN REL\n");
+		sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0);
+		dev_kfree_skb(skb);
+	}
+	clear_bit(ATM_VF_REGIS, &vcc->flags);
+	/* ... may retry later */
+}
+
+/*
+ * release() handler for SVC sockets.
+ *
+ * Marks the VCC as no longer ready, disconnects it from the signaling demon
+ * and then releases it.  Does nothing if the socket has no sock attached.
+ * Always returns 0.
+ */
+static int svc_release(struct socket *sock)
+{
+	struct atm_vcc *vcc;
+
+	if (!sock->sk)
+		return 0;
+
+	vcc = ATM_SD(sock);
+	pr_debug("%p\n", vcc);
+	clear_bit(ATM_VF_READY, &vcc->flags);
+
+	/*
+	 * The VCC pointer serves as a reference, so it must not be freed
+	 * (and thereby become subject to re-use) before all pending
+	 * connections have been closed.
+	 */
+	svc_disconnect(vcc);
+	vcc_release(sock);
+
+	return 0;
+}
+
+/*
+ * bind() handler: bind an SVC socket to a local ATM address.
+ *
+ * The address is copied into vcc->local and an as_bind request is queued;
+ * the caller then sleeps uninterruptibly until ATM_VF_WAITING is cleared or
+ * the signaling demon disappears.
+ *
+ * Returns 0 on success, -EINVAL for a wrong address length or socket state,
+ * -EISCONN if the socket is connected, -EAFNOSUPPORT if the address family
+ * is not AF_ATMSVC, -EBADFD if ATM_VF_HASQOS is not set, -EUNATCH if the
+ * demon is gone, or the negated sk->sk_err otherwise.
+ */
+static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
+		    int sockaddr_len)
+{
+	DEFINE_WAIT(wait);
+	struct sock *sk = sock->sk;
+	struct sockaddr_atmsvc *addr;
+	struct atm_vcc *vcc;
+	int error;
+
+	if (sockaddr_len != sizeof(struct sockaddr_atmsvc))
+		return -EINVAL;
+	lock_sock(sk);
+	if (sock->state == SS_CONNECTED) {
+		error = -EISCONN;
+		goto out;
+	}
+	if (sock->state != SS_UNCONNECTED) {
+		error = -EINVAL;
+		goto out;
+	}
+	vcc = ATM_SD(sock);
+	addr = (struct sockaddr_atmsvc *) sockaddr;
+	if (addr->sas_family != AF_ATMSVC) {
+		error = -EAFNOSUPPORT;
+		goto out;
+	}
+	clear_bit(ATM_VF_BOUND, &vcc->flags);
+	    /* failing rebind will kill old binding */
+	/* @@@ check memory (de)allocation on rebind */
+	if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
+		error = -EBADFD;
+		goto out;
+	}
+	vcc->local = *addr;
+	set_bit(ATM_VF_WAITING, &vcc->flags);
+	/* get on the wait queue before the request can be answered */
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+	sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
+	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+		schedule();
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
+	if (!sigd) {
+		error = -EUNATCH;
+		goto out;
+	}
+	/* only a binding without error is marked as bound */
+	if (!sk->sk_err)
+		set_bit(ATM_VF_BOUND, &vcc->flags);
+	error = -sk->sk_err;
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * connect() handler for SVC sockets.
+ *
+ * From SS_UNCONNECTED the remote address is validated and stored, an
+ * as_connect request is queued and - unless O_NONBLOCK is set in @flags -
+ * the caller sleeps interruptibly until ATM_VF_WAITING is cleared or the
+ * signaling demon disappears.  A signal aborts the attempt with an as_close
+ * handshake and -EINTR.  From SS_CONNECTING (a previous non-blocking
+ * attempt) the outcome of that attempt is picked up instead.  On success the
+ * VCC is finally connected with vcc_connect().
+ *
+ * Returns 0 on success, -EINPROGRESS for a non-blocking attempt that has
+ * been started, -EALREADY while such an attempt is still pending, -EISCONN,
+ * -EINVAL, -EAFNOSUPPORT, -EBADFD, -EINTR, -EUNATCH, the negated sk->sk_err,
+ * or the error returned by vcc_connect().
+ */
+static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
+		       int sockaddr_len, int flags)
+{
+	DEFINE_WAIT(wait);
+	struct sock *sk = sock->sk;
+	struct sockaddr_atmsvc *addr;
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int error;
+
+	pr_debug("%p\n", vcc);
+	lock_sock(sk);
+	if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	switch (sock->state) {
+	default:
+		error = -EINVAL;
+		goto out;
+	case SS_CONNECTED:
+		error = -EISCONN;
+		goto out;
+	case SS_CONNECTING:
+		/* a non-blocking connect was started earlier */
+		if (test_bit(ATM_VF_WAITING, &vcc->flags)) {
+			error = -EALREADY;
+			goto out;
+		}
+		sock->state = SS_UNCONNECTED;
+		if (sk->sk_err) {
+			error = -sk->sk_err;
+			goto out;
+		}
+		break;
+	case SS_UNCONNECTED:
+		addr = (struct sockaddr_atmsvc *) sockaddr;
+		if (addr->sas_family != AF_ATMSVC) {
+			error = -EAFNOSUPPORT;
+			goto out;
+		}
+		if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
+			error = -EBADFD;
+			goto out;
+		}
+		/* neither direction may use ATM_ANYCLASS ... */
+		if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS ||
+		    vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) {
+			error = -EINVAL;
+			goto out;
+		}
+		/* ... and at least one direction needs a traffic class */
+		if (!vcc->qos.txtp.traffic_class &&
+		    !vcc->qos.rxtp.traffic_class) {
+			error = -EINVAL;
+			goto out;
+		}
+		vcc->remote = *addr;
+		set_bit(ATM_VF_WAITING, &vcc->flags);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
+		if (flags & O_NONBLOCK) {
+			finish_wait(sk_sleep(sk), &wait);
+			sock->state = SS_CONNECTING;
+			error = -EINPROGRESS;
+			goto out;
+		}
+		error = 0;
+		while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+			schedule();
+			if (!signal_pending(current)) {
+				prepare_to_wait(sk_sleep(sk), &wait,
+						TASK_INTERRUPTIBLE);
+				continue;
+			}
+			pr_debug("*ABORT*\n");
+			/*
+			 * This is tricky:
+			 *   Kernel ---close--> Demon
+			 *   Kernel <--close--- Demon
+			 * or
+			 *   Kernel ---close--> Demon
+			 *   Kernel <--error--- Demon
+			 * or
+			 *   Kernel ---close--> Demon
+			 *   Kernel <--okay---- Demon
+			 *   Kernel <--close--- Demon
+			 */
+			sigd_enq(vcc, as_close, NULL, NULL, NULL);
+			/*
+			 * NOTE(review): the two loops below test the flag
+			 * before calling prepare_to_wait(), unlike the outer
+			 * loop - confirm that no wakeup can be missed here.
+			 */
+			while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+				prepare_to_wait(sk_sleep(sk), &wait,
+						TASK_INTERRUPTIBLE);
+				schedule();
+			}
+			if (!sk->sk_err)
+				while (!test_bit(ATM_VF_RELEASED, &vcc->flags) &&
+				       sigd) {
+					prepare_to_wait(sk_sleep(sk), &wait,
+							TASK_INTERRUPTIBLE);
+					schedule();
+				}
+			clear_bit(ATM_VF_REGIS, &vcc->flags);
+			clear_bit(ATM_VF_RELEASED, &vcc->flags);
+			clear_bit(ATM_VF_CLOSE, &vcc->flags);
+			    /* we're gone now but may connect later */
+			error = -EINTR;
+			break;
+		}
+		finish_wait(sk_sleep(sk), &wait);
+		if (error)
+			goto out;
+		if (!sigd) {
+			error = -EUNATCH;
+			goto out;
+		}
+		if (sk->sk_err) {
+			error = -sk->sk_err;
+			goto out;
+		}
+	}
+/*
+ * Not supported yet
+ *
+ * #ifndef CONFIG_SINGLE_SIGITF
+ */
+	vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp);
+	vcc->qos.txtp.pcr = 0;
+	vcc->qos.txtp.min_pcr = 0;
+/*
+ * #endif
+ */
+	error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci);
+	if (!error)
+		sock->state = SS_CONNECTED;
+	else
+		(void)svc_disconnect(vcc);
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * listen() handler for SVC sockets.
+ *
+ * Queues an as_listen request for vcc->local and sleeps uninterruptibly
+ * until ATM_VF_WAITING is cleared or the signaling demon disappears.  The
+ * VCC is then marked ATM_VF_LISTEN, inserted into the socket table and its
+ * backlog limit is set (ATM_BACKLOG_DEFAULT when @backlog is not positive).
+ *
+ * Returns -EINVAL if ATM_VF_SESSION is set, -EADDRINUSE if the VCC is
+ * already listening, -EUNATCH if the demon is gone, or the negated
+ * sk->sk_err (0 on success).
+ */
+static int svc_listen(struct socket *sock, int backlog)
+{
+	DEFINE_WAIT(wait);
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int error;
+
+	pr_debug("%p\n", vcc);
+	lock_sock(sk);
+	/* let server handle listen on unbound sockets */
+	if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
+		error = -EINVAL;
+		goto out;
+	}
+	if (test_bit(ATM_VF_LISTEN, &vcc->flags)) {
+		error = -EADDRINUSE;
+		goto out;
+	}
+	set_bit(ATM_VF_WAITING, &vcc->flags);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+	sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
+	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+		schedule();
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (!sigd) {
+		error = -EUNATCH;
+		goto out;
+	}
+	/*
+	 * NOTE(review): the listen state is set up even when sk->sk_err is
+	 * non-zero - confirm that this is intended.
+	 */
+	set_bit(ATM_VF_LISTEN, &vcc->flags);
+	vcc_insert_socket(sk);
+	sk->sk_max_ack_backlog = backlog > 0 ? backlog : ATM_BACKLOG_DEFAULT;
+	error = -sk->sk_err;
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * accept() handler for SVC sockets.
+ *
+ * Creates the new socket, then repeatedly: waits (interruptibly, dropping
+ * the socket lock while asleep) for an skb on the listening socket's receive
+ * queue, copies QoS, addresses and SAP from the atmsvc_msg it carries into
+ * the new VCC, connects the new VCC and sends as_accept to the signaling
+ * demon, waiting uninterruptibly for the answer.  The whole sequence is
+ * retried when the new VCC's sk_err is ERESTARTSYS.
+ *
+ * Returns 0 on success, -EAGAIN for an empty queue with O_NONBLOCK,
+ * -ERESTARTSYS on a signal, -EUNATCH if the demon is gone, -EBUSY if
+ * vcc_connect() returned -EAGAIN, or another negative error code.
+ */
+static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	struct atmsvc_msg *msg;
+	struct atm_vcc *old_vcc = ATM_SD(sock);
+	struct atm_vcc *new_vcc;
+	int error;
+
+	lock_sock(sk);
+
+	error = svc_create(sock_net(sk), newsock, 0, 0);
+	if (error)
+		goto out;
+
+	new_vcc = ATM_SD(newsock);
+
+	pr_debug("%p -> %p\n", old_vcc, new_vcc);
+	while (1) {
+		DEFINE_WAIT(wait);
+
+		/* wait for an skb on the listening socket's receive queue */
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		while (!(skb = skb_dequeue(&sk->sk_receive_queue)) &&
+		       sigd) {
+			if (test_bit(ATM_VF_RELEASED, &old_vcc->flags))
+				break;
+			if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) {
+				error = -sk->sk_err;
+				break;
+			}
+			if (flags & O_NONBLOCK) {
+				error = -EAGAIN;
+				break;
+			}
+			/* do not hold the socket lock while asleep */
+			release_sock(sk);
+			schedule();
+			lock_sock(sk);
+			if (signal_pending(current)) {
+				error = -ERESTARTSYS;
+				break;
+			}
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_INTERRUPTIBLE);
+		}
+		finish_wait(sk_sleep(sk), &wait);
+		if (error)
+			goto out;
+		/* left the loop without an skb and without an error code */
+		if (!skb) {
+			error = -EUNATCH;
+			goto out;
+		}
+		/* the skb carries an atmsvc_msg describing the connection */
+		msg = (struct atmsvc_msg *)skb->data;
+		new_vcc->qos = msg->qos;
+		set_bit(ATM_VF_HASQOS, &new_vcc->flags);
+		new_vcc->remote = msg->svc;
+		new_vcc->local = msg->local;
+		new_vcc->sap = msg->sap;
+		error = vcc_connect(newsock, msg->pvc.sap_addr.itf,
+				    msg->pvc.sap_addr.vpi,
+				    msg->pvc.sap_addr.vci);
+		dev_kfree_skb(skb);
+		sk->sk_ack_backlog--;
+		if (error) {
+			/* tell the demon, and report -EAGAIN as -EBUSY */
+			sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL,
+				  &old_vcc->qos, error);
+			error = error == -EAGAIN ? -EBUSY : error;
+			goto out;
+		}
+		/* wait should be short, so we ignore the non-blocking flag */
+		set_bit(ATM_VF_WAITING, &new_vcc->flags);
+		prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+				TASK_UNINTERRUPTIBLE);
+		sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
+		while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) {
+			release_sock(sk);
+			schedule();
+			lock_sock(sk);
+			prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+					TASK_UNINTERRUPTIBLE);
+		}
+		finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
+		if (!sigd) {
+			error = -EUNATCH;
+			goto out;
+		}
+		if (!sk_atm(new_vcc)->sk_err)
+			break;
+		/* ERESTARTSYS on the new VCC means: take the next request */
+		if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) {
+			error = -sk_atm(new_vcc)->sk_err;
+			goto out;
+		}
+	}
+	newsock->state = SS_CONNECTED;
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * getname() handler: copy the VCC's remote (@peer != 0) or local
+ * (@peer == 0) SVC address into @sockaddr and store its size in
+ * *@sockaddr_len.  Always returns 0.
+ */
+static int svc_getname(struct socket *sock, struct sockaddr *sockaddr,
+		       int *sockaddr_len, int peer)
+{
+	struct atm_vcc *vcc = ATM_SD(sock);
+	const struct sockaddr_atmsvc *src;
+
+	if (peer)
+		src = &vcc->remote;
+	else
+		src = &vcc->local;
+
+	*sockaddr_len = sizeof(*src);
+	memcpy(sockaddr, src, sizeof(*src));
+	return 0;
+}
+
+/*
+ * Ask the signaling demon to change the QoS of @vcc to @qos.
+ *
+ * Queues an as_modify request and sleeps uninterruptibly until
+ * ATM_VF_WAITING is cleared, ATM_VF_RELEASED is set, or the demon
+ * disappears.
+ *
+ * Returns -EUNATCH if the demon is gone, otherwise the negated sk->sk_err
+ * (0 on success).
+ */
+int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
+{
+	struct sock *sk = sk_atm(vcc);
+	DEFINE_WAIT(wait);
+
+	set_bit(ATM_VF_WAITING, &vcc->flags);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+	sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
+	while (test_bit(ATM_VF_WAITING, &vcc->flags) &&
+	       !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
+		schedule();
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (!sigd)
+		return -EUNATCH;
+	return -sk->sk_err;
+}
+
+/*
+ * setsockopt() handler for SVC sockets.
+ *
+ * SO_ATMSAP copies a struct atm_sap into the VCC and sets ATM_VF_HASSAP.
+ * SO_MULTIPOINT takes an int: 1 sets ATM_VF_SESSION, 0 clears it.  Both
+ * require level SOL_ATM and an exactly matching option length.  Every other
+ * option is passed on to vcc_setsockopt().
+ *
+ * Returns 0 on success, -EINVAL for a bad level, length or value, -EFAULT
+ * if the user buffer cannot be read, or the result of vcc_setsockopt().
+ */
+static int svc_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int error = 0;
+	int value;
+
+	lock_sock(sk);
+	switch (optname) {
+	case SO_ATMSAP:
+		if (level != SOL_ATM || optlen != sizeof(struct atm_sap))
+			error = -EINVAL;
+		else if (copy_from_user(&vcc->sap, optval, optlen))
+			error = -EFAULT;
+		else
+			set_bit(ATM_VF_HASSAP, &vcc->flags);
+		break;
+	case SO_MULTIPOINT:
+		if (level != SOL_ATM || optlen != sizeof(int)) {
+			error = -EINVAL;
+			break;
+		}
+		if (get_user(value, (int __user *)optval)) {
+			error = -EFAULT;
+			break;
+		}
+		switch (value) {
+		case 1:
+			set_bit(ATM_VF_SESSION, &vcc->flags);
+			break;
+		case 0:
+			clear_bit(ATM_VF_SESSION, &vcc->flags);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		break;
+	default:
+		error = vcc_setsockopt(sock, level, optname, optval, optlen);
+		break;
+	}
+	release_sock(sk);
+
+	return error;
+}
+
+/*
+ * getsockopt() handler for SVC sockets.
+ *
+ * Only SO_ATMSAP at a matching level is handled here: the VCC's struct
+ * atm_sap is copied to @optval, provided the length found in *@optlen is
+ * exactly sizeof(struct atm_sap).  Everything else is passed on to
+ * vcc_getsockopt().
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed, -EINVAL
+ * for a wrong length, or the result of vcc_getsockopt().
+ */
+static int svc_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int len;
+	int error = 0;
+
+	lock_sock(sk);
+	if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP)
+		error = vcc_getsockopt(sock, level, optname, optval, optlen);
+	else if (get_user(len, optlen))
+		error = -EFAULT;
+	else if (len != sizeof(struct atm_sap))
+		error = -EINVAL;
+	else if (copy_to_user(optval, &ATM_SD(sock)->sap,
+			      sizeof(struct atm_sap)))
+		error = -EFAULT;
+	release_sock(sk);
+
+	return error;
+}
+
+/*
+ * Add a party to a multipoint session (used by the ATM_ADDPARTY ioctl).
+ *
+ * Queues an as_addparty request carrying @sockaddr.  With O_NONBLOCK set in
+ * @flags the function returns -EINPROGRESS right away; otherwise it sleeps
+ * interruptibly until ATM_VF_WAITING is cleared or the signaling demon
+ * disappears.
+ *
+ * Returns -EINPROGRESS for a non-blocking request, -EUNATCH if the demon is
+ * gone (as svc_dropparty() does; previously whatever happened to be in
+ * sk->sk_err_soft was returned in that case), or otherwise the value found
+ * in sk->sk_err_soft, which is reset to 0.
+ */
+static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
+			int sockaddr_len, int flags)
+{
+	DEFINE_WAIT(wait);
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int error;
+
+	lock_sock(sk);
+	set_bit(ATM_VF_WAITING, &vcc->flags);
+	/* get on the wait queue before the request can be answered */
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	sigd_enq(vcc, as_addparty, NULL, NULL,
+		 (struct sockaddr_atmsvc *) sockaddr);
+	if (flags & O_NONBLOCK) {
+		finish_wait(sk_sleep(sk), &wait);
+		error = -EINPROGRESS;
+		goto out;
+	}
+	pr_debug("added wait queue\n");
+	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+		schedule();
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	/* without a demon there is no reply to report */
+	if (!sigd) {
+		error = -EUNATCH;
+		goto out;
+	}
+	error = xchg(&sk->sk_err_soft, 0);
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * Drop the party @ep_ref from a multipoint session (used by the
+ * ATM_DROPPARTY ioctl).
+ *
+ * Queues an as_dropparty request and sleeps interruptibly until
+ * ATM_VF_WAITING is cleared or the signaling demon disappears.
+ *
+ * Returns -EUNATCH if the demon is gone, otherwise the value found in
+ * sk->sk_err_soft, which is reset to 0.
+ */
+static int svc_dropparty(struct socket *sock, int ep_ref)
+{
+	DEFINE_WAIT(wait);
+	struct sock *sk = sock->sk;
+	struct atm_vcc *vcc = ATM_SD(sock);
+	int error;
+
+	lock_sock(sk);
+	set_bit(ATM_VF_WAITING, &vcc->flags);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	/* the endpoint reference travels in the reply argument */
+	sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
+	while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
+		schedule();
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (!sigd) {
+		error = -EUNATCH;
+		goto out;
+	}
+	error = xchg(&sk->sk_err_soft, 0);
+out:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * ioctl() handler for SVC sockets.
+ *
+ * ATM_ADDPARTY and ATM_DROPPARTY are only valid on a VCC with
+ * ATM_VF_SESSION set; their argument (a struct sockaddr_atmsvc and an int
+ * respectively) is copied in from user space and handed to svc_addparty()
+ * or svc_dropparty().  All other commands go to vcc_ioctl().
+ *
+ * Returns -EINVAL without ATM_VF_SESSION, -EFAULT if the argument cannot be
+ * read, or the result of the function the command was handed to.
+ */
+static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct atm_vcc *vcc = ATM_SD(sock);
+	struct sockaddr_atmsvc sa;
+	int ep_ref;
+
+	switch (cmd) {
+	case ATM_ADDPARTY:
+		if (!test_bit(ATM_VF_SESSION, &vcc->flags))
+			return -EINVAL;
+		if (copy_from_user(&sa, (void __user *) arg, sizeof(sa)))
+			return -EFAULT;
+		return svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa),
+				    0);
+	case ATM_DROPPARTY:
+		if (!test_bit(ATM_VF_SESSION, &vcc->flags))
+			return -EINVAL;
+		if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int)))
+			return -EFAULT;
+		return svc_dropparty(sock, ep_ref);
+	default:
+		return vcc_ioctl(sock, cmd, arg);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * compat ioctl() handler for SVC sockets.
+ *
+ * ATM_ADDPARTY is defined in terms of the size of struct atm_iobuf although
+ * it really takes a struct sockaddr_atmsvc, which needs no compat
+ * translation; only the command number has to be fixed up.  The party
+ * commands are served by svc_ioctl(), all others by vcc_compat_ioctl().
+ */
+static int svc_compat_ioctl(struct socket *sock, unsigned int cmd,
+			    unsigned long arg)
+{
+	unsigned int native_cmd;
+
+	native_cmd = (cmd == COMPAT_ATM_ADDPARTY) ? ATM_ADDPARTY : cmd;
+
+	if (native_cmd != ATM_ADDPARTY && native_cmd != ATM_DROPPARTY)
+		return vcc_compat_ioctl(sock, native_cmd, arg);
+
+	return svc_ioctl(sock, native_cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+/*
+ * Socket operations of PF_ATMSVC sockets.  Connection handling is provided
+ * by the svc_* functions in this file, data transfer and poll by the common
+ * vcc_* functions; socketpair, mmap and sendpage are not supported.
+ */
+static const struct proto_ops svc_proto_ops = {
+	.family =	PF_ATMSVC,
+	.owner =	THIS_MODULE,
+
+	.release =	svc_release,
+	.bind =		svc_bind,
+	.connect =	svc_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	svc_accept,
+	.getname =	svc_getname,
+	.poll =		vcc_poll,
+	.ioctl =	svc_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl =	svc_compat_ioctl,
+#endif
+	.listen =	svc_listen,
+	.shutdown =	svc_shutdown,
+	.setsockopt =	svc_setsockopt,
+	.getsockopt =	svc_getsockopt,
+	.sendmsg =	vcc_sendmsg,
+	.recvmsg =	vcc_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+
+/*
+ * Create an SVC socket: install svc_proto_ops, let vcc_create() set up the
+ * VCC and preset the address family of its local and remote address.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT outside the initial network
+ * namespace, or the error returned by vcc_create().  @kern is not used.
+ */
+static int svc_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct atm_vcc *vcc;
+	int error;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	sock->ops = &svc_proto_ops;
+	error = vcc_create(net, sock, protocol, AF_ATMSVC);
+	if (error)
+		return error;
+
+	vcc = ATM_SD(sock);
+	vcc->local.sas_family = AF_ATMSVC;
+	vcc->remote.sas_family = AF_ATMSVC;
+
+	return 0;
+}
+
+/* Protocol family descriptor registered by atmsvc_init(). */
+static const struct net_proto_family svc_family_ops = {
+	.family = PF_ATMSVC,
+	.create = svc_create,
+	.owner = THIS_MODULE,
+};
+
+
+/*
+ *	Initialize the ATM SVC protocol family
+ *
+ *	Registers svc_family_ops and returns the result of sock_register().
+ */
+
+int __init atmsvc_init(void)
+{
+	return sock_register(&svc_family_ops);
+}
+
+/*
+ *	Unregister the ATM SVC protocol family again
+ */
+void atmsvc_exit(void)
+{
+	sock_unregister(PF_ATMSVC);
+}
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
new file mode 100644
index 00000000..705e53ef
--- /dev/null
+++ b/net/ax25/Kconfig
@@ -0,0 +1,121 @@
+#
+# Amateur Radio protocols and AX.25 device configuration
+#
+
+menuconfig HAMRADIO
+	depends on NET && !S390
+	bool "Amateur Radio support"
+	help
+	  If you want to connect your Linux box to an amateur radio, answer Y
+	  here. You may want to read <http://www.tapr.org/>
+	  and more specifically about AX.25 on Linux
+	  <http://www.linux-ax25.org/>.
+
+	  Note that the answer to this question won't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about amateur radio.
+
+comment "Packet Radio protocols"
+	depends on HAMRADIO
+
+config AX25
+	tristate "Amateur Radio AX.25 Level 2 protocol"
+	depends on HAMRADIO
+	help
+	  This is the protocol used for computer communication over amateur
+	  radio. It is either used by itself for point-to-point links, or to
+	  carry other protocols such as tcp/ip. To use it, you need a device
+	  that connects your Linux box to your amateur radio. You can either
+	  use a low speed TNC (a Terminal Node Controller acts as a kind of
+	  modem connecting your computer's serial port to your radio's
+	  microphone input and speaker output) supporting the KISS protocol or
+	  one of the various SCC cards that are supported by the generic Z8530
+	  or the DMA SCC driver. Other options are the Baycom modem serial
+	  and parallel port hacks or the sound card modem (supported by their
+	  own drivers). If you say Y here, you also have to say Y to one of
+	  those drivers.
+
+	  Information about where to get supporting software for Linux amateur
+	  radio as well as information about how to configure an AX.25 port is
+	  contained in the AX25-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>. You might also want to
+	  check out the file <file:Documentation/networking/ax25.txt> in the
+	  kernel source. More information about digital amateur radio in
+	  general is on the WWW at
+	  <http://www.tapr.org/>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ax25.
+
+config AX25_DAMA_SLAVE
+	bool "AX.25 DAMA Slave support"
+	default y
+	depends on AX25
+	help
+	  DAMA is a mechanism to prevent collisions when doing AX.25
+	  networking. A DAMA server (called "master") accepts incoming traffic
+	  from clients (called "slaves") and redistributes it to other slaves.
+	  If you say Y here, your Linux box will act as a DAMA slave; this is
+	  transparent in that you don't have to do any special DAMA
+	  configuration. Linux cannot yet act as a DAMA server.  This option
+	  only compiles DAMA slave support into the kernel.  It still needs to
+	  be enabled at runtime.  For more about DAMA see
+	  <http://www.linux-ax25.org>.  If unsure, say Y.
+
+# placeholder until implemented
+config AX25_DAMA_MASTER
+	bool 'AX.25 DAMA Master support'
+	depends on AX25_DAMA_SLAVE && BROKEN
+	help
+	  DAMA is a mechanism to prevent collisions when doing AX.25
+	  networking. A DAMA server (called "master") accepts incoming traffic
+	  from clients (called "slaves") and redistributes it to other slaves.
+	  If you say Y here, your Linux box will act as a DAMA master; this is
+	  transparent in that you don't have to do any special DAMA
+	  configuration. Linux cannot yet act as a DAMA server.  This option
+	  only compiles DAMA master support into the kernel.  It still needs to
+	  be explicitly enabled, so if unsure, say Y.
+
+config NETROM
+	tristate "Amateur Radio NET/ROM protocol"
+	depends on AX25
+	help
+	  NET/ROM is a network layer protocol on top of AX.25 useful for
+	  routing.
+
+	  A comprehensive listing of all the software for Linux amateur radio
+	  users as well as information about how to configure an AX.25 port is
+	  contained in the Linux Ham Wiki, available from
+	  <http://www.linux-ax25.org>. You also might want to check out the
+	  file <file:Documentation/networking/ax25.txt>. More information about
+	  digital amateur radio in general is on the WWW at
+	  <http://www.tapr.org/>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called netrom.
+
+config ROSE
+	tristate "Amateur Radio X.25 PLP (Rose)"
+	depends on AX25
+	help
+	  The Packet Layer Protocol (PLP) is a way to route packets over X.25
+	  connections in general and amateur radio AX.25 connections in
+	  particular, essentially an alternative to NET/ROM.
+
+	  A comprehensive listing of all the software for Linux amateur radio
+	  users as well as information about how to configure an AX.25 port is
+	  contained in the Linux Ham Wiki, available from
+	  <http://www.linux-ax25.org>.  You also might want to check out the
+	  file <file:Documentation/networking/ax25.txt>. More information about
+	  digital amateur radio in general is on the WWW at
+	  <http://www.tapr.org/>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called rose.
+
+menu "AX.25 network device drivers"
+	depends on HAMRADIO && AX25
+
+source "drivers/net/hamradio/Kconfig"
+
+endmenu
diff --git a/net/ax25/Makefile b/net/ax25/Makefile
new file mode 100644
index 00000000..43c46d2c
--- /dev/null
+++ b/net/ax25/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the Linux AX.25 layer.
+#
+
+obj-$(CONFIG_AX25) += ax25.o
+
+ax25-y	 := ax25_addr.o ax25_dev.o ax25_iface.o ax25_in.o ax25_ip.o ax25_out.o \
+	    ax25_route.o ax25_std_in.o ax25_std_subr.o ax25_std_timer.o \
+	    ax25_subr.o ax25_timer.o ax25_uid.o af_ax25.o
+ax25-$(CONFIG_AX25_DAMA_SLAVE) += ax25_ds_in.o ax25_ds_subr.o ax25_ds_timer.o
+ax25-$(CONFIG_SYSCTL) += sysctl_net_ax25.o
diff --git a/net/ax25/TODO b/net/ax25/TODO
new file mode 100644
index 00000000..69fb4e36
--- /dev/null
+++ b/net/ax25/TODO
@@ -0,0 +1,20 @@
+Do the ax25_list_lock, ax25_dev_lock, linkfail_lock, ax25_frag_lock and
+listen_lock have to be bh-safe?
+
+Do the netrom and rose locks have to be bh-safe?
+
+A device might be deleted after lookup in the SIOCADDRT ioctl but before it's
+being used.
+
+Routes to a device being taken down might be deleted by ax25_rt_device_down
+but added by somebody else before the device has been deleted fully.
+
+The ax25_rt_find_route synopsis is perverse but I somehow had to deal with
+the race caused by the static variable in its previous implementation.
+
+Implement proper socket locking in netrom and rose.
+
+Check socket locking when ax25_rcv is sending to raw sockets.  In particular
+ax25_send_to_raw() seems fishy.  Heck - ax25_rcv is fishy.
+
+Handle XID and TEST frames properly.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
new file mode 100644
index 00000000..b04a6ef4
--- /dev/null
+++ b/net/ax25/af_ax25.c
@@ -0,0 +1,2022 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ * Copyright (C) Hans Alblas PE1AYX (hans@esrac.ele.tue.nl)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <net/net_namespace.h>
+#include <net/tcp_states.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+
+
+/* List of AX.25 control blocks; modified with ax25_list_lock held. */
+HLIST_HEAD(ax25_list);
+DEFINE_SPINLOCK(ax25_list_lock);
+
+/* Forward declaration of the socket operations table. */
+static const struct proto_ops ax25_proto_ops;
+
+/*
+ *	Drop a reference on the AX.25 control block belonging to a socket.
+ */
+static void ax25_free_sock(struct sock *sk)
+{
+	ax25_cb_put(ax25_sk(sk));
+}
+
+/*
+ *	Take a control block off ax25_list and drop the reference that
+ *	ax25_cb_add() took for the list.  Nothing happens if the block is
+ *	not on the list.  Socket removal during an interrupt is now safe.
+ */
+static void ax25_cb_del(ax25_cb *ax25)
+{
+	if (hlist_unhashed(&ax25->ax25_node))
+		return;
+
+	spin_lock_bh(&ax25_list_lock);
+	hlist_del_init(&ax25->ax25_node);
+	spin_unlock_bh(&ax25_list_lock);
+
+	ax25_cb_put(ax25);
+}
+
+/*
+ *	Kill all bound sockets on a dropped device.
+ *
+ *	Every control block on ax25_list that uses the ax25_dev of @dev is
+ *	detached from it and disconnected with ENETUNREACH.  Nothing is done
+ *	if @dev has no ax25_dev.
+ */
+static void ax25_kill_by_device(struct net_device *dev)
+{
+	ax25_dev *ax25_dev;
+	ax25_cb *s;
+	struct hlist_node *node;
+
+	if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+		return;
+
+	spin_lock_bh(&ax25_list_lock);
+again:
+	ax25_for_each(s, node, &ax25_list) {
+		if (s->ax25_dev == ax25_dev) {
+			s->ax25_dev = NULL;
+			/* ax25_disconnect() is called without the list lock */
+			spin_unlock_bh(&ax25_list_lock);
+			ax25_disconnect(s, ENETUNREACH);
+			spin_lock_bh(&ax25_list_lock);
+
+			/* The entry could have been deleted from the
+			 * list meanwhile and thus the next pointer is
+			 * no longer valid.  Play it safe and restart
+			 * the scan.  Forward progress is ensured
+			 * because we set s->ax25_dev to NULL and we
+			 * are never passed a NULL 'dev' argument.
+			 */
+			goto again;
+		}
+	}
+	spin_unlock_bh(&ax25_list_lock);
+}
+
+/*
+ *	Network device notifier.  Only AX.25 devices in the initial network
+ *	namespace are handled: NETDEV_UP brings the AX.25 device up,
+ *	NETDEV_DOWN kills the sockets bound to it, takes down its routes
+ *	and then the AX.25 device itself.  Always returns NOTIFY_DONE.
+ */
+static int ax25_device_event(struct notifier_block *this, unsigned long event,
+	void *ptr)
+{
+	struct net_device *dev = (struct net_device *)ptr;
+
+	/* Reject foreign namespaces and non AX.25 devices */
+	if (!net_eq(dev_net(dev), &init_net) || dev->type != ARPHRD_AX25)
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_UP) {
+		ax25_dev_device_up(dev);
+	} else if (event == NETDEV_DOWN) {
+		ax25_kill_by_device(dev);
+		ax25_rt_device_down(dev);
+		ax25_dev_device_down(dev);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ *	Add a socket to the bound sockets list.
+ *
+ *	A reference on the control block is taken for the list; it is
+ *	dropped again by ax25_cb_del().
+ */
+void ax25_cb_add(ax25_cb *ax25)
+{
+	spin_lock_bh(&ax25_list_lock);
+	ax25_cb_hold(ax25);
+	hlist_add_head(&ax25->ax25_node, &ax25_list);
+	spin_unlock_bh(&ax25_list_lock);
+}
+
+/*
+ *	Find a socket that wants to accept the SABM we have just
+ *	received: a socket of the given type in state TCP_LISTEN whose
+ *	source address is @addr, whose iamdigi setting agrees with @digi
+ *	and which is bound to @dev or to no device at all.
+ *
+ *	Returns the socket with a reference held, or NULL.
+ */
+struct sock *ax25_find_listener(ax25_address *addr, int digi,
+	struct net_device *dev, int type)
+{
+	struct sock *found = NULL;
+	struct hlist_node *node;
+	ax25_cb *cb;
+
+	spin_lock(&ax25_list_lock);
+	ax25_for_each(cb, node, &ax25_list) {
+		/* iamdigi and digi must either both be set or both clear */
+		if (!cb->iamdigi != !digi)
+			continue;
+		if (!cb->sk || ax25cmp(&cb->source_addr, addr) != 0)
+			continue;
+		if (cb->sk->sk_type != type || cb->sk->sk_state != TCP_LISTEN)
+			continue;
+		/* If device is null we match any device */
+		if (cb->ax25_dev != NULL && cb->ax25_dev->dev != dev)
+			continue;
+
+		found = cb->sk;
+		sock_hold(found);
+		break;
+	}
+	spin_unlock(&ax25_list_lock);
+
+	return found;
+}
+
+/*
+ *	Find an AX.25 socket of the given type by its source and
+ *	destination address.  Returns the socket with a reference held,
+ *	or NULL if there is none.
+ */
+struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr,
+	int type)
+{
+	struct sock *found = NULL;
+	struct hlist_node *node;
+	ax25_cb *cb;
+
+	spin_lock(&ax25_list_lock);
+	ax25_for_each(cb, node, &ax25_list) {
+		if (!cb->sk)
+			continue;
+		if (ax25cmp(&cb->source_addr, my_addr) != 0)
+			continue;
+		if (ax25cmp(&cb->dest_addr, dest_addr) != 0)
+			continue;
+		if (cb->sk->sk_type != type)
+			continue;
+
+		found = cb->sk;
+		sock_hold(found);
+		break;
+	}
+	spin_unlock(&ax25_list_lock);
+
+	return found;
+}
+
+/*
+ *	Find an AX.25 control block given both ends. It will only pick up
+ *	floating AX.25 control blocks or non Raw socket bound control blocks.
+ *
+ *	The block must be bound to @dev, and its digipeater path must equal
+ *	@digi (or be empty when @digi is NULL or empty).  Returns the block
+ *	with a reference held, or NULL.
+ */
+ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr,
+	ax25_digi *digi, struct net_device *dev)
+{
+	ax25_cb *found = NULL;
+	struct hlist_node *node;
+	ax25_cb *cb;
+
+	spin_lock_bh(&ax25_list_lock);
+	ax25_for_each(cb, node, &ax25_list) {
+		if (cb->sk && cb->sk->sk_type != SOCK_SEQPACKET)
+			continue;
+		if (cb->ax25_dev == NULL)
+			continue;
+		if (ax25cmp(&cb->source_addr, src_addr) != 0 ||
+		    ax25cmp(&cb->dest_addr, dest_addr) != 0 ||
+		    cb->ax25_dev->dev != dev)
+			continue;
+
+		if (digi != NULL && digi->ndigi != 0) {
+			/* a digipeater path was given: it has to match */
+			if (cb->digipeat == NULL ||
+			    ax25digicmp(cb->digipeat, digi) != 0)
+				continue;
+		} else if (cb->digipeat != NULL && cb->digipeat->ndigi != 0) {
+			/* no path given: the block must not have one */
+			continue;
+		}
+
+		ax25_cb_hold(cb);
+		found = cb;
+		break;
+	}
+	spin_unlock_bh(&ax25_list_lock);
+
+	return found;
+}
+
+EXPORT_SYMBOL(ax25_find_cb);
+
+/*
+ *	Deliver a clone of @skb to every raw socket on ax25_list that has
+ *	source address @addr and protocol @proto, is bound to the device the
+ *	skb arrived on and has not exceeded its receive buffer limit.
+ *
+ *	Control blocks without a device are skipped: ax25_kill_by_device()
+ *	resets ax25_dev to NULL while the block is still on the list, so the
+ *	pointer must be checked before it is dereferenced.
+ */
+void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto)
+{
+	ax25_cb *s;
+	struct sk_buff *copy;
+	struct hlist_node *node;
+
+	spin_lock(&ax25_list_lock);
+	ax25_for_each(s, node, &ax25_list) {
+		if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 &&
+		    s->sk->sk_type == SOCK_RAW &&
+		    s->sk->sk_protocol == proto &&
+		    s->ax25_dev != NULL &&
+		    s->ax25_dev->dev == skb->dev &&
+		    atomic_read(&s->sk->sk_rmem_alloc) <= s->sk->sk_rcvbuf) {
+			/* a failed clone only affects this one receiver */
+			if ((copy = skb_clone(skb, GFP_ATOMIC)) == NULL)
+				continue;
+			if (sock_queue_rcv_skb(s->sk, copy) != 0)
+				kfree_skb(copy);
+		}
+	}
+	spin_unlock(&ax25_list_lock);
+}
+
+/*
+ *	Deferred destroy.
+ */
+void ax25_destroy_socket(ax25_cb *);
+
+/*
+ *	Handler for deferred kills.
+ *
+ *	@data is the ax25_cb whose destruction was postponed by
+ *	ax25_destroy_socket(); the destruction is attempted again with the
+ *	socket locked.
+ */
+static void ax25_destroy_timer(unsigned long data)
+{
+	ax25_cb *ax25=(ax25_cb *)data;
+	struct sock *sk;
+
+	sk=ax25->sk;
+
+	bh_lock_sock(sk);
+	/* presumably keeps sk valid until it has been unlocked below */
+	sock_hold(sk);
+	ax25_destroy_socket(ax25);
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	This is called from user mode and the timers. Thus it protects itself
+ *	against interrupt users but doesn't worry about being called during
+ *	work. Once it is removed from the queue no interrupt or bottom half
+ *	will touch it and we are (fairly 8-) ) safe.
+ *
+ *	The control block is taken off the list, its timers are stopped and
+ *	its queues flushed.  If a socket is attached and still has
+ *	allocations outstanding, the final release is retried from a timer
+ *	two seconds later.
+ */
+void ax25_destroy_socket(ax25_cb *ax25)
+{
+	struct sk_buff *skb;
+
+	ax25_cb_del(ax25);
+
+	ax25_stop_heartbeat(ax25);
+	ax25_stop_t1timer(ax25);
+	ax25_stop_t2timer(ax25);
+	ax25_stop_t3timer(ax25);
+	ax25_stop_idletimer(ax25);
+
+	ax25_clear_queues(ax25);	/* Flush the queues */
+
+	if (ax25->sk != NULL) {
+		while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) {
+			if (skb->sk != ax25->sk) {
+				/* A pending connection */
+				ax25_cb *sax25 = ax25_sk(skb->sk);
+
+				/* Queue the unaccepted socket for death */
+				sock_orphan(skb->sk);
+
+				/* 9A4GL: hack to release unaccepted sockets */
+				skb->sk->sk_state = TCP_LISTEN;
+
+				ax25_start_heartbeat(sax25);
+				sax25->state = AX25_STATE_0;
+			}
+
+			kfree_skb(skb);
+		}
+		skb_queue_purge(&ax25->sk->sk_write_queue);
+	}
+
+	if (ax25->sk != NULL) {
+		if (sk_has_allocations(ax25->sk)) {
+			/* Defer: outstanding buffers */
+			setup_timer(&ax25->dtimer, ax25_destroy_timer,
+					(unsigned long)ax25);
+			ax25->dtimer.expires  = jiffies + 2 * HZ;
+			add_timer(&ax25->dtimer);
+		} else {
+			/* detach the socket and drop the reference to it */
+			struct sock *sk=ax25->sk;
+			ax25->sk=NULL;
+			sock_put(sk);
+		}
+	} else {
+		/* no socket attached: drop a control block reference */
+		ax25_cb_put(ax25);
+	}
+}
+
+/*
+ * dl1bke 960311: set parameters for existing AX.25 connections,
+ *		  includes a KILL command to abort any connection.
+ *		  VERY useful for debugging ;-)
+ *
+ * 'cmd' is not used in the body; 'arg' points to a user-space
+ * struct ax25_ctl_struct that names the connection (port, source,
+ * destination and digipeater list) together with the sub-command and
+ * its argument. Timer arguments are scaled by HZ (seconds), the idle
+ * argument by 60 * HZ (minutes).
+ *
+ * Returns 0 on success, -EFAULT if the request cannot be copied in,
+ * -ENODEV if the port address matches no AX.25 device, -ENOTCONN if no
+ * matching connection exists, and -EINVAL for a bad digipeater count, an
+ * out-of-range argument or an unknown sub-command.
+ */
+static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
+{
+	struct ax25_ctl_struct ax25_ctl;
+	ax25_digi digi;
+	ax25_dev *ax25_dev;
+	ax25_cb *ax25;
+	unsigned int k;
+	int ret = 0;
+
+	if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl)))
+		return -EFAULT;
+
+	if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL)
+		return -ENODEV;
+
+	if (ax25_ctl.digi_count > AX25_MAX_DIGIS)
+		return -EINVAL;
+
+	/* Guards every "arg * HZ" below; KILL does not use the argument */
+	if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL)
+		return -EINVAL;
+
+	digi.ndigi = ax25_ctl.digi_count;
+	for (k = 0; k < digi.ndigi; k++)
+		digi.calls[k] = ax25_ctl.digi_addr[k];
+
+	/* On success ax25_find_cb() returns the cb with a reference held */
+	if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL)
+		return -ENOTCONN;
+
+	switch (ax25_ctl.cmd) {
+	case AX25_KILL:
+		ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+#ifdef CONFIG_AX25_DAMA_SLAVE
+		if (ax25_dev->dama.slave && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE)
+			ax25_dama_off(ax25);
+#endif
+		ax25_disconnect(ax25, ENETRESET);
+		break;
+
+	case AX25_WINDOW:
+		/* Window limit depends on the sequence number modulus */
+		if (ax25->modulus == AX25_MODULUS) {
+			if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7)
+				goto einval_put;
+		} else {
+			if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63)
+				goto einval_put;
+		}
+		ax25->window = ax25_ctl.arg;
+		break;
+
+	case AX25_T1:
+		if (ax25_ctl.arg < 1)
+			goto einval_put;
+		ax25->rtt = (ax25_ctl.arg * HZ) / 2;
+		ax25->t1  = ax25_ctl.arg * HZ;
+		break;
+
+	case AX25_T2:
+		if (ax25_ctl.arg < 1)
+			goto einval_put;
+		ax25->t2 = ax25_ctl.arg * HZ;
+		break;
+
+	case AX25_N2:
+		if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31)
+			goto einval_put;
+		ax25->n2count = 0;
+		ax25->n2 = ax25_ctl.arg;
+		break;
+
+	case AX25_T3:
+		ax25->t3 = ax25_ctl.arg * HZ;
+		break;
+
+	case AX25_IDLE:
+		/*
+		 * The common check above only bounds arg * HZ; the idle
+		 * timeout is scaled by a further factor of 60 and would
+		 * otherwise wrap around.
+		 */
+		if (ax25_ctl.arg > ULONG_MAX / (60 * HZ))
+			goto einval_put;
+		ax25->idle = ax25_ctl.arg * 60 * HZ;
+		break;
+
+	case AX25_PACLEN:
+		if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535)
+			goto einval_put;
+		ax25->paclen = ax25_ctl.arg;
+		break;
+
+	default:
+		goto einval_put;
+	}
+
+out_put:
+	/* Drop the reference taken by ax25_find_cb() */
+	ax25_cb_put(ax25);
+	return ret;
+
+einval_put:
+	ret = -EINVAL;
+	goto out_put;
+}
+
+/*
+ *	Copy the tunables of 'ax25_dev' into the control block. Timer values
+ *	are passed through msecs_to_jiffies(); the round trip time starts
+ *	out as half of T1.
+ */
+static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev)
+{
+	/* Timers */
+	ax25->t1      = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]);
+	ax25->rtt     = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2;
+	ax25->t2      = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T2]);
+	ax25->t3      = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T3]);
+	ax25->idle    = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_IDLE]);
+
+	/* Plain counters and sizes */
+	ax25->n2      = ax25_dev->values[AX25_VALUES_N2];
+	ax25->paclen  = ax25_dev->values[AX25_VALUES_PACLEN];
+	ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF];
+
+	/* Modulus and the matching window size */
+	if (!ax25_dev->values[AX25_VALUES_AXDEFMODE]) {
+		ax25->modulus = AX25_MODULUS;
+		ax25->window  = ax25_dev->values[AX25_VALUES_WINDOW];
+	} else {
+		ax25->modulus = AX25_EMODULUS;
+		ax25->window  = ax25_dev->values[AX25_VALUES_EWINDOW];
+	}
+}
+
+/*
+ *	Fill in a newly created AX.25 control block with the default values
+ *	for a particular device. 'ax25_dev' may be NULL, in which case the
+ *	compile-time AX25_DEF_* defaults are used instead.
+ */
+void ax25_fillin_cb(ax25_cb *ax25, ax25_dev *ax25_dev)
+{
+	ax25->ax25_dev = ax25_dev;
+
+	if (ax25->ax25_dev == NULL) {
+		/*
+		 * No device, use kernel / AX.25 spec default values
+		 */
+		ax25->t1      = msecs_to_jiffies(AX25_DEF_T1);
+		ax25->rtt     = msecs_to_jiffies(AX25_DEF_T1) / 2;
+		ax25->t2      = msecs_to_jiffies(AX25_DEF_T2);
+		ax25->t3      = msecs_to_jiffies(AX25_DEF_T3);
+		ax25->idle    = msecs_to_jiffies(AX25_DEF_IDLE);
+		ax25->n2      = AX25_DEF_N2;
+		ax25->paclen  = AX25_DEF_PACLEN;
+		ax25->backoff = AX25_DEF_BACKOFF;
+
+		if (!AX25_DEF_AXDEFMODE) {
+			ax25->modulus = AX25_MODULUS;
+			ax25->window  = AX25_DEF_WINDOW;
+		} else {
+			ax25->modulus = AX25_EMODULUS;
+			ax25->window  = AX25_DEF_EWINDOW;
+		}
+	} else {
+		ax25_fillin_cb_from_dev(ax25, ax25_dev);
+	}
+}
+
+/*
+ * Allocate a zeroed AX.25 control block with one reference held, its four
+ * queues and its timers initialised, device-less default parameters
+ * loaded and the state set to AX25_STATE_0. Uses GFP_ATOMIC.
+ *
+ * Returns the new control block, or NULL if the allocation fails.
+ */
+ax25_cb *ax25_create_cb(void)
+{
+	ax25_cb *cb = kzalloc(sizeof(*cb), GFP_ATOMIC);
+
+	if (cb == NULL)
+		return NULL;
+
+	atomic_set(&cb->refcount, 1);
+
+	skb_queue_head_init(&cb->write_queue);
+	skb_queue_head_init(&cb->frag_queue);
+	skb_queue_head_init(&cb->ack_queue);
+	skb_queue_head_init(&cb->reseq_queue);
+
+	ax25_setup_timers(cb);
+
+	/* No device yet, so this loads the AX25_DEF_* values */
+	ax25_fillin_cb(cb, NULL);
+
+	cb->state = AX25_STATE_0;
+
+	return cb;
+}
+
+/*
+ *	Handling for system calls applied via the various interfaces to an
+ *	AX25 socket object
+ */
+
+/*
+ *	Set a SOL_AX25 socket option. The first sizeof(int) bytes of
+ *	'optval' are always read as the integer option value; for
+ *	SO_BINDTODEVICE 'optval' is read again as an interface name.
+ *
+ *	Timer options are scaled by HZ (seconds), AX25_IDLE by 60 * HZ
+ *	(minutes). Values whose scaled result does not fit in an unsigned
+ *	long are rejected with -EINVAL.
+ *
+ *	Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown
+ *	option, -EINVAL for a short or out-of-range value, -EFAULT on a
+ *	failed user copy, and -EADDRNOTAVAIL / -ENODEV from the
+ *	SO_BINDTODEVICE path.
+ */
+static int ax25_setsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	ax25_cb *ax25;
+	struct net_device *dev;
+	char devname[IFNAMSIZ];
+	int opt, res = 0;
+
+	if (level != SOL_AX25)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(opt, (int __user *)optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+	ax25 = ax25_sk(sk);
+
+	switch (optname) {
+	case AX25_WINDOW:
+		/* Window limit depends on the sequence number modulus */
+		if (ax25->modulus == AX25_MODULUS) {
+			if (opt < 1 || opt > 7) {
+				res = -EINVAL;
+				break;
+			}
+		} else {
+			if (opt < 1 || opt > 63) {
+				res = -EINVAL;
+				break;
+			}
+		}
+		ax25->window = opt;
+		break;
+
+	case AX25_T1:
+		/* Do the scaling in unsigned long and refuse what would wrap */
+		if (opt < 1 || (unsigned long)opt > ULONG_MAX / HZ) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->rtt = ((unsigned long)opt * HZ) / 2;
+		ax25->t1  = (unsigned long)opt * HZ;
+		break;
+
+	case AX25_T2:
+		if (opt < 1 || (unsigned long)opt > ULONG_MAX / HZ) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->t2 = (unsigned long)opt * HZ;
+		break;
+
+	case AX25_N2:
+		if (opt < 1 || opt > 31) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->n2 = opt;
+		break;
+
+	case AX25_T3:
+		if (opt < 1 || (unsigned long)opt > ULONG_MAX / HZ) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->t3 = (unsigned long)opt * HZ;
+		break;
+
+	case AX25_IDLE:
+		if (opt < 0 || (unsigned long)opt > ULONG_MAX / (60 * HZ)) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->idle = (unsigned long)opt * 60 * HZ;
+		break;
+
+	case AX25_BACKOFF:
+		if (opt < 0 || opt > 2) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->backoff = opt;
+		break;
+
+	case AX25_EXTSEQ:
+		ax25->modulus = opt ? AX25_EMODULUS : AX25_MODULUS;
+		break;
+
+	case AX25_PIDINCL:
+		ax25->pidincl = opt ? 1 : 0;
+		break;
+
+	case AX25_IAMDIGI:
+		ax25->iamdigi = opt ? 1 : 0;
+		break;
+
+	case AX25_PACLEN:
+		if (opt < 16 || opt > 65535) {
+			res = -EINVAL;
+			break;
+		}
+		ax25->paclen = opt;
+		break;
+
+	case SO_BINDTODEVICE:
+		/*
+		 * Leave room for the terminator and pre-zero the buffer so
+		 * that the name handed to dev_get_by_name() is always a
+		 * NUL-terminated string, whatever user space supplied.
+		 */
+		if (optlen > IFNAMSIZ - 1)
+			optlen = IFNAMSIZ - 1;
+
+		memset(devname, 0, sizeof(devname));
+
+		if (copy_from_user(devname, optval, optlen)) {
+			res = -EFAULT;
+			break;
+		}
+
+		/* A connected or listening SEQPACKET socket cannot be rebound */
+		if (sk->sk_type == SOCK_SEQPACKET &&
+		   (sock->state != SS_UNCONNECTED ||
+		    sk->sk_state == TCP_LISTEN)) {
+			res = -EADDRNOTAVAIL;
+			break;
+		}
+
+		dev = dev_get_by_name(&init_net, devname);
+		if (!dev) {
+			res = -ENODEV;
+			break;
+		}
+
+		ax25->ax25_dev = ax25_dev_ax25dev(dev);
+		ax25_fillin_cb(ax25, ax25->ax25_dev);
+		/* Drop the reference taken by dev_get_by_name() */
+		dev_put(dev);
+		break;
+
+	default:
+		res = -ENOPROTOOPT;
+	}
+	release_sock(sk);
+
+	return res;
+}
+
+/*
+ *	Read a SOL_AX25 socket option. Integer options are copied out as an
+ *	int, truncated to the caller's buffer length; SO_BINDTODEVICE copies
+ *	out the bound interface name (or an empty string). The number of
+ *	bytes produced is written back through 'optlen'.
+ *
+ *	Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown
+ *	option and -EFAULT for a failed user access or a length below 1.
+ */
+static int ax25_getsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct ax25_dev *bound_dev;
+	char devname[IFNAMSIZ];
+	ax25_cb *ax25;
+	void *src;
+	int user_len, out_len;
+	int val = 0;
+
+	if (level != SOL_AX25)
+		return -ENOPROTOOPT;
+
+	if (get_user(user_len, optlen))
+		return -EFAULT;
+
+	if (user_len < 1)
+		return -EFAULT;
+
+	/* Default: hand back (part of) the integer in 'val' */
+	src = (void *) &val;
+	out_len = min_t(unsigned int, user_len, sizeof(int));
+
+	lock_sock(sk);
+	ax25 = ax25_sk(sk);
+
+	switch (optname) {
+	case AX25_WINDOW:
+		val = ax25->window;
+		break;
+
+	case AX25_T1:
+		val = ax25->t1 / HZ;
+		break;
+
+	case AX25_T2:
+		val = ax25->t2 / HZ;
+		break;
+
+	case AX25_N2:
+		val = ax25->n2;
+		break;
+
+	case AX25_T3:
+		val = ax25->t3 / HZ;
+		break;
+
+	case AX25_IDLE:
+		val = ax25->idle / (60 * HZ);
+		break;
+
+	case AX25_BACKOFF:
+		val = ax25->backoff;
+		break;
+
+	case AX25_EXTSEQ:
+		val = (ax25->modulus == AX25_EMODULUS);
+		break;
+
+	case AX25_PIDINCL:
+		val = ax25->pidincl;
+		break;
+
+	case AX25_IAMDIGI:
+		val = ax25->iamdigi;
+		break;
+
+	case AX25_PACLEN:
+		val = ax25->paclen;
+		break;
+
+	case SO_BINDTODEVICE:
+		bound_dev = ax25->ax25_dev;
+
+		/*
+		 * NOTE(review): out_len is replaced here without regard to
+		 * user_len, so more bytes than the caller announced may be
+		 * copied out - confirm whether that is intended.
+		 */
+		if (bound_dev == NULL || bound_dev->dev == NULL) {
+			*devname = '\0';
+			out_len = 1;
+		} else {
+			strlcpy(devname, bound_dev->dev->name, sizeof(devname));
+			out_len = strlen(devname) + 1;
+		}
+
+		src = (void *) devname;
+		break;
+
+	default:
+		release_sock(sk);
+		return -ENOPROTOOPT;
+	}
+	release_sock(sk);
+
+	if (put_user(out_len, optlen))
+		return -EFAULT;
+
+	if (copy_to_user(optval, src, out_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *	Put a SOCK_SEQPACKET socket that is not yet listening into
+ *	TCP_LISTEN and record 'backlog'. Any other socket type, or a socket
+ *	already in TCP_LISTEN, gets -EOPNOTSUPP.
+ */
+static int ax25_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int res;
+
+	lock_sock(sk);
+	if (sk->sk_type != SOCK_SEQPACKET || sk->sk_state == TCP_LISTEN) {
+		res = -EOPNOTSUPP;
+	} else {
+		sk->sk_max_ack_backlog = backlog;
+		sk->sk_state           = TCP_LISTEN;
+		res = 0;
+	}
+	release_sock(sk);
+
+	return res;
+}
+
+/*
+ * Protocol descriptor passed to sk_alloc() by ax25_create().
+ *
+ * XXX: .obj_size is the size of a bare struct sock; ax25_create() keeps
+ * the AX.25 state in a separately allocated ax25_cb reached through
+ * sk->sk_protinfo. When creating an ax25_sock structure we should update
+ * the .obj_size setting below.
+ */
+static struct proto ax25_proto = {
+	.name	  = "AX25",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct sock),
+};
+
+/*
+ *	Create an AX.25 socket of type SOCK_DGRAM, SOCK_SEQPACKET or
+ *	SOCK_RAW in the initial network namespace and attach a fresh
+ *	control block to it.
+ *
+ *	For DGRAM and SEQPACKET a protocol of 0 or PF_AX25 is mapped to
+ *	AX25_P_TEXT. SEQPACKET sockets are refused for AX25_P_SEGMENT, for
+ *	the protocols of built-in configured layers, and for the protocols
+ *	of modular layers that are currently registered.
+ *
+ *	Returns 0 on success, -EAFNOSUPPORT for a foreign namespace,
+ *	-ESOCKTNOSUPPORT for an unsupported type/protocol combination and
+ *	-ENOMEM if an allocation fails.
+ */
+static int ax25_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	struct sock *sk;
+	ax25_cb *ax25;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	switch (sock->type) {
+	case SOCK_DGRAM:
+		if (protocol == 0 || protocol == PF_AX25)
+			protocol = AX25_P_TEXT;
+		break;
+
+	case SOCK_SEQPACKET:
+		switch (protocol) {
+		case 0:
+		case PF_AX25:	/* For CLX */
+			protocol = AX25_P_TEXT;
+			break;
+		case AX25_P_SEGMENT:
+#ifdef CONFIG_INET
+		case AX25_P_ARP:
+		case AX25_P_IP:
+#endif
+#ifdef CONFIG_NETROM
+		case AX25_P_NETROM:
+#endif
+#ifdef CONFIG_ROSE
+		case AX25_P_ROSE:
+#endif
+			return -ESOCKTNOSUPPORT;
+#ifdef CONFIG_NETROM_MODULE
+		case AX25_P_NETROM:
+			if (ax25_protocol_is_registered(AX25_P_NETROM))
+				return -ESOCKTNOSUPPORT;
+			/*
+			 * Must not fall through: the ROSE registration
+			 * state says nothing about a NET/ROM socket.
+			 */
+			break;
+#endif
+#ifdef CONFIG_ROSE_MODULE
+		case AX25_P_ROSE:
+			if (ax25_protocol_is_registered(AX25_P_ROSE))
+				return -ESOCKTNOSUPPORT;
+			break;
+#endif
+		default:
+			break;
+		}
+		break;
+
+	case SOCK_RAW:
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	ax25 = sk->sk_protinfo = ax25_create_cb();
+	if (!ax25) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	sock_init_data(sock, sk);
+
+	sk->sk_destruct = ax25_free_sock;
+	sock->ops    = &ax25_proto_ops;
+	sk->sk_protocol = protocol;
+
+	/* Back pointer from the control block to its socket */
+	ax25->sk    = sk;
+
+	return 0;
+}
+
+/*
+ *	Build a new socket plus control block modelled on 'osk': socket
+ *	settings, AX.25 parameters, the source address and a private copy
+ *	of the digipeater list are inherited from 'osk', the device is
+ *	taken from 'ax25_dev', and the new socket starts out in
+ *	TCP_ESTABLISHED. Only SOCK_DGRAM and SOCK_SEQPACKET parents are
+ *	supported.
+ *
+ *	Returns the new socket, or NULL on an unsupported type or
+ *	allocation failure.
+ */
+struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
+{
+	ax25_cb *child, *parent;
+	struct sock *sk;
+
+	sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot);
+	if (sk == NULL)
+		return NULL;
+
+	child = ax25_create_cb();
+	if (child == NULL) {
+		sk_free(sk);
+		return NULL;
+	}
+
+	if (osk->sk_type != SOCK_DGRAM && osk->sk_type != SOCK_SEQPACKET)
+		goto fail;
+
+	sock_init_data(NULL, sk);
+
+	/* Socket level settings inherited from the parent */
+	sk->sk_type     = osk->sk_type;
+	sk->sk_priority = osk->sk_priority;
+	sk->sk_protocol = osk->sk_protocol;
+	sk->sk_rcvbuf   = osk->sk_rcvbuf;
+	sk->sk_sndbuf   = osk->sk_sndbuf;
+	sk->sk_state    = TCP_ESTABLISHED;
+	sock_copy_flags(sk, osk);
+
+	parent = ax25_sk(osk);
+
+	/* AX.25 parameters inherited from the parent */
+	child->modulus = parent->modulus;
+	child->backoff = parent->backoff;
+	child->pidincl = parent->pidincl;
+	child->iamdigi = parent->iamdigi;
+	child->rtt     = parent->rtt;
+	child->t1      = parent->t1;
+	child->t2      = parent->t2;
+	child->t3      = parent->t3;
+	child->n2      = parent->n2;
+	child->idle    = parent->idle;
+	child->paclen  = parent->paclen;
+	child->window  = parent->window;
+
+	child->ax25_dev    = ax25_dev;
+	child->source_addr = parent->source_addr;
+
+	if (parent->digipeat != NULL) {
+		child->digipeat = kmemdup(parent->digipeat, sizeof(ax25_digi),
+					  GFP_ATOMIC);
+		if (child->digipeat == NULL)
+			goto fail;
+	}
+
+	/* Link socket and control block to each other */
+	sk->sk_protinfo = child;
+	sk->sk_destruct = ax25_free_sock;
+	child->sk    = sk;
+
+	return sk;
+
+fail:
+	sk_free(sk);
+	ax25_cb_put(child);
+	return NULL;
+}
+
+/*
+ *	Release an AX.25 socket. The socket is orphaned and, depending on
+ *	type and link state, either destroyed at once or moved into the
+ *	disconnecting state and flagged SOCK_DESTROY. Always returns 0.
+ */
+static int ax25_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	ax25_cb *ax25;
+
+	if (sk == NULL)
+		return 0;
+
+	/* Extra reference, dropped by the sock_put() at the end */
+	sock_hold(sk);
+	sock_orphan(sk);
+	lock_sock(sk);
+	ax25 = ax25_sk(sk);
+
+	if (sk->sk_type == SOCK_SEQPACKET) {
+		switch (ax25->state) {
+		case AX25_STATE_0:
+			/* The socket lock is dropped around ax25_disconnect() */
+			release_sock(sk);
+			ax25_disconnect(ax25, 0);
+			lock_sock(sk);
+			ax25_destroy_socket(ax25);
+			break;
+
+		case AX25_STATE_1:
+		case AX25_STATE_2:
+			/* Send a DISC first, then tear down as for state 0 */
+			ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+			release_sock(sk);
+			ax25_disconnect(ax25, 0);
+			lock_sock(sk);
+			ax25_destroy_socket(ax25);
+			break;
+
+		case AX25_STATE_3:
+		case AX25_STATE_4:
+			/*
+			 * Flush queued data, send DISC where the protocol
+			 * variant calls for it, restart T1 and move to
+			 * state 2. The socket is not destroyed here, only
+			 * flagged SOCK_DESTROY.
+			 */
+			ax25_clear_queues(ax25);
+			ax25->n2count = 0;
+
+			switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+			case AX25_PROTO_STD_SIMPLEX:
+			case AX25_PROTO_STD_DUPLEX:
+				ax25_send_control(ax25,
+						  AX25_DISC,
+						  AX25_POLLON,
+						  AX25_COMMAND);
+				ax25_stop_t2timer(ax25);
+				ax25_stop_t3timer(ax25);
+				ax25_stop_idletimer(ax25);
+				break;
+#ifdef CONFIG_AX25_DAMA_SLAVE
+			case AX25_PROTO_DAMA_SLAVE:
+				ax25_stop_t3timer(ax25);
+				ax25_stop_idletimer(ax25);
+				break;
+#endif
+			}
+			ax25_calculate_t1(ax25);
+			ax25_start_t1timer(ax25);
+			ax25->state = AX25_STATE_2;
+			sk->sk_state                = TCP_CLOSE;
+			sk->sk_shutdown            |= SEND_SHUTDOWN;
+			sk->sk_state_change(sk);
+			sock_set_flag(sk, SOCK_DESTROY);
+			break;
+
+		default:
+			break;
+		}
+	} else {
+		/* Other socket types: mark closed and destroy at once */
+		sk->sk_state     = TCP_CLOSE;
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		sk->sk_state_change(sk);
+		ax25_destroy_socket(ax25);
+	}
+
+	sock->sk   = NULL;
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ *	We support a funny extension here so you can (as root) give any callsign
+ *	digipeated via a local address as source. This hack is obsolete now
+ *	that we've implemented support for SO_BINDTODEVICE. It is however small
+ *	and trivially backward compatible.
+ *
+ *	Returns 0 on success; -EINVAL for a bad address length or family or
+ *	when the socket no longer has SOCK_ZAPPED set; -EACCES when the
+ *	caller has no uid/callsign association, ax25_uid_policy is set and
+ *	the caller lacks CAP_NET_ADMIN; -EADDRNOTAVAIL when no AX.25 device
+ *	matches the requested address.
+ */
+static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
+	ax25_dev *ax25_dev = NULL;
+	ax25_uid_assoc *user;
+	ax25_address call;
+	ax25_cb *ax25;
+	int err = 0;
+
+	/*
+	 * Accepted lengths: exactly sockaddr_ax25, exactly
+	 * full_sockaddr_ax25, or anything from the old six-digipeater
+	 * layout up to full_sockaddr_ax25.
+	 */
+	if (addr_len != sizeof(struct sockaddr_ax25) &&
+	    addr_len != sizeof(struct full_sockaddr_ax25))
+		/* support for old structure may go away some time
+		 * ax25_bind(): uses old (6 digipeater) socket structure.
+		 */
+		if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
+		    (addr_len > sizeof(struct full_sockaddr_ax25)))
+			return -EINVAL;
+
+	if (addr->fsa_ax25.sax25_family != AF_AX25)
+		return -EINVAL;
+
+	/*
+	 * A callsign associated with the caller's effective uid takes
+	 * precedence over the one supplied in the address.
+	 */
+	user = ax25_findbyuid(current_euid());
+	if (user) {
+		call = user->call;
+		ax25_uid_put(user);
+	} else {
+		if (ax25_uid_policy && !capable(CAP_NET_ADMIN))
+			return -EACCES;
+
+		call = addr->fsa_ax25.sax25_call;
+	}
+
+	lock_sock(sk);
+
+	ax25 = ax25_sk(sk);
+	/* SOCK_ZAPPED is cleared at 'done' below, so a second bind fails */
+	if (!sock_flag(sk, SOCK_ZAPPED)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ax25->source_addr = call;
+
+	/*
+	 * User already set interface with SO_BINDTODEVICE
+	 */
+	if (ax25->ax25_dev != NULL)
+		goto done;
+
+	/*
+	 * With exactly one digipeater given, that address selects the
+	 * device (a null address leaves the device unset); otherwise the
+	 * device is looked up from the supplied callsign.
+	 */
+	if (addr_len > sizeof(struct sockaddr_ax25) && addr->fsa_ax25.sax25_ndigis == 1) {
+		if (ax25cmp(&addr->fsa_digipeater[0], &null_ax25_address) != 0 &&
+		    (ax25_dev = ax25_addr_ax25dev(&addr->fsa_digipeater[0])) == NULL) {
+			err = -EADDRNOTAVAIL;
+			goto out;
+		}
+	} else {
+		if ((ax25_dev = ax25_addr_ax25dev(&addr->fsa_ax25.sax25_call)) == NULL) {
+			err = -EADDRNOTAVAIL;
+			goto out;
+		}
+	}
+
+	if (ax25_dev != NULL)
+		ax25_fillin_cb(ax25, ax25_dev);
+
+done:
+	ax25_cb_add(ax25);
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Connect an AX.25 socket to the address in 'uaddr'.
+ *
+ *	Non-SEQPACKET sockets only record the destination and digipeater
+ *	path. SEQPACKET sockets start link establishment and, unless
+ *	O_NONBLOCK is set in 'flags', sleep until the socket leaves
+ *	TCP_SYN_SENT or a signal arrives.
+ *
+ *	Returns 0 on success or a negative errno: -EINVAL (bad length,
+ *	family or digipeater count), -EINPROGRESS, -ECONNREFUSED, -EISCONN,
+ *	-ENOBUFS, -EHOSTUNREACH, -EADDRINUSE, -ERESTARTSYS, an autobind
+ *	error, or the pending socket error when establishment fails.
+ *
+ *	FIXME: nonblock behaviour looks like it may have a bug.
+ */
+static int __must_check ax25_connect(struct socket *sock,
+	struct sockaddr *uaddr, int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	ax25_cb *ax25 = ax25_sk(sk), *ax25t;
+	struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
+	ax25_digi *digi = NULL;
+	int ct = 0, err = 0;
+
+	/*
+	 * some sanity checks. code further down depends on this
+	 *
+	 * Accepted lengths: exactly sockaddr_ax25 (obsolete), exactly
+	 * full_sockaddr_ax25, or anything from the old six-digipeater
+	 * layout up to full_sockaddr_ax25.
+	 */
+	if (addr_len != sizeof(struct sockaddr_ax25) &&
+	    addr_len != sizeof(struct full_sockaddr_ax25)) {
+		if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
+		    (addr_len > sizeof(struct full_sockaddr_ax25)))
+			return -EINVAL;
+	}
+
+	if (fsa->fsa_ax25.sax25_family != AF_AX25)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	/* deal with restarts */
+	if (sock->state == SS_CONNECTING) {
+		switch (sk->sk_state) {
+		case TCP_SYN_SENT: /* still trying */
+			err = -EINPROGRESS;
+			goto out_release;
+
+		case TCP_ESTABLISHED: /* connection established */
+			sock->state = SS_CONNECTED;
+			goto out_release;
+
+		case TCP_CLOSE: /* connection refused */
+			sock->state = SS_UNCONNECTED;
+			err = -ECONNREFUSED;
+			goto out_release;
+		}
+	}
+
+	if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) {
+		err = -EISCONN;	/* No reconnect on a seqpacket socket */
+		goto out_release;
+	}
+
+	sk->sk_state   = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	/* Forget any digipeater path left over from an earlier connect */
+	kfree(ax25->digipeat);
+	ax25->digipeat = NULL;
+
+	/*
+	 *	Handle digi-peaters to be used.
+	 */
+	if (addr_len > sizeof(struct sockaddr_ax25) &&
+	    fsa->fsa_ax25.sax25_ndigis != 0) {
+		/*
+		 * Valid number of digipeaters ? The count must also fit in
+		 * the address the caller actually supplied: the old
+		 * six-digipeater layout is accepted above, and a count of
+		 * 7 or 8 with it would read past the supplied address.
+		 */
+		if (fsa->fsa_ax25.sax25_ndigis < 1 ||
+		    fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS ||
+		    addr_len < sizeof(struct sockaddr_ax25) +
+		    sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) {
+			err = -EINVAL;
+			goto out_release;
+		}
+
+		if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) {
+			err = -ENOBUFS;
+			goto out_release;
+		}
+
+		digi->ndigi      = fsa->fsa_ax25.sax25_ndigis;
+		digi->lastrepeat = -1;
+
+		while (ct < fsa->fsa_ax25.sax25_ndigis) {
+			/*
+			 * An entry counts as already repeated only when its
+			 * H bit is set and the socket has iamdigi enabled.
+			 */
+			if ((fsa->fsa_digipeater[ct].ax25_call[6] &
+			     AX25_HBIT) && ax25->iamdigi) {
+				digi->repeated[ct] = 1;
+				digi->lastrepeat   = ct;
+			} else {
+				digi->repeated[ct] = 0;
+			}
+			digi->calls[ct] = fsa->fsa_digipeater[ct];
+			ct++;
+		}
+	}
+
+	/*
+	 *	Must bind first - autobinding in this may or may not work. If
+	 *	the socket is already bound, check to see if the device has
+	 *	been filled in, error if it hasn't.
+	 */
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		/* check if we can remove this feature. It is broken. */
+		printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n",
+			current->comm);
+		if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) {
+			kfree(digi);
+			goto out_release;
+		}
+
+		ax25_fillin_cb(ax25, ax25->ax25_dev);
+		ax25_cb_add(ax25);
+	} else {
+		if (ax25->ax25_dev == NULL) {
+			kfree(digi);
+			err = -EHOSTUNREACH;
+			goto out_release;
+		}
+	}
+
+	if (sk->sk_type == SOCK_SEQPACKET &&
+	    (ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi,
+			 ax25->ax25_dev->dev))) {
+		kfree(digi);
+		err = -EADDRINUSE;		/* Already such a connection */
+		/* Drop the reference ax25_find_cb() took on the match */
+		ax25_cb_put(ax25t);
+		goto out_release;
+	}
+
+	ax25->dest_addr = fsa->fsa_ax25.sax25_call;
+	/* The control block now owns 'digi' */
+	ax25->digipeat  = digi;
+
+	/* First the easy one */
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		sock->state = SS_CONNECTED;
+		sk->sk_state   = TCP_ESTABLISHED;
+		goto out_release;
+	}
+
+	/* Move to connecting socket, ax.25 lapb WAIT_UA.. */
+	sock->state        = SS_CONNECTING;
+	sk->sk_state          = TCP_SYN_SENT;
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_establish_data_link(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		ax25->modulus = AX25_MODULUS;
+		ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		if (ax25->ax25_dev->dama.slave)
+			ax25_ds_establish_data_link(ax25);
+		else
+			ax25_std_establish_data_link(ax25);
+		break;
+#endif
+	}
+
+	ax25->state = AX25_STATE_1;
+
+	ax25_start_heartbeat(ax25);
+
+	/* Now the loop */
+	if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
+		err = -EINPROGRESS;
+		goto out_release;
+	}
+
+	if (sk->sk_state == TCP_SYN_SENT) {
+		DEFINE_WAIT(wait);
+
+		for (;;) {
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_INTERRUPTIBLE);
+			if (sk->sk_state != TCP_SYN_SENT)
+				break;
+			if (!signal_pending(current)) {
+				/* Sleep without holding the socket lock */
+				release_sock(sk);
+				schedule();
+				lock_sock(sk);
+				continue;
+			}
+			err = -ERESTARTSYS;
+			break;
+		}
+		finish_wait(sk_sleep(sk), &wait);
+
+		if (err)
+			goto out_release;
+	}
+
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		/* Not in ABM, not in WAIT_UA -> failed */
+		sock->state = SS_UNCONNECTED;
+		err = sock_error(sk);	/* Always set at this point */
+		goto out_release;
+	}
+
+	sock->state = SS_CONNECTED;
+
+	err = 0;
+out_release:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Accept a pending connection on a listening SOCK_SEQPACKET socket
+ *	and graft the already created child socket onto 'newsock'.
+ *
+ *	Returns 0 on success, -EINVAL if the socket is not unconnected,
+ *	has no sock or is not listening, -EOPNOTSUPP for other socket
+ *	types, -EWOULDBLOCK when O_NONBLOCK is set and nothing is queued,
+ *	and -ERESTARTSYS when a signal interrupts the wait.
+ */
+static int ax25_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sk_buff *skb;
+	struct sock *newsk;
+	DEFINE_WAIT(wait);
+	struct sock *sk;
+	int err = 0;
+
+	if (sock->state != SS_UNCONNECTED)
+		return -EINVAL;
+
+	if ((sk = sock->sk) == NULL)
+		return -EINVAL;
+
+	lock_sock(sk);
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (sk->sk_state != TCP_LISTEN) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 *	The read queue this time is holding sockets ready to use
+	 *	hooked into the SABM we saved
+	 */
+	for (;;) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (skb)
+			break;
+
+		if (flags & O_NONBLOCK) {
+			err = -EWOULDBLOCK;
+			break;
+		}
+		if (!signal_pending(current)) {
+			/* Sleep without holding the socket lock */
+			release_sock(sk);
+			schedule();
+			lock_sock(sk);
+			continue;
+		}
+		err = -ERESTARTSYS;
+		break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	if (err)
+		goto out;
+
+	/* The dequeued skb carries the new connection's socket in skb->sk */
+	newsk		 = skb->sk;
+	sock_graft(newsk, newsock);
+
+	/* Now attach up the new socket */
+	kfree_skb(skb);
+	sk->sk_ack_backlog--;
+	newsock->state = SS_CONNECTED;
+
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Report the local (peer == 0) or remote (peer != 0) address of the
+ *	socket as a zero-initialised struct full_sockaddr_ax25.
+ *
+ *	For the local address the single digipeater slot carries the
+ *	hardware address of the bound device, or the null address when no
+ *	device is bound. For the peer address the slots carry the
+ *	connection's digipeater list.
+ *
+ *	Returns 0, or -ENOTCONN when the peer is requested while the socket
+ *	is not in TCP_ESTABLISHED.
+ */
+static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
+	int *uaddr_len, int peer)
+{
+	struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
+	struct sock *sk = sock->sk;
+	unsigned char count, idx;
+	ax25_cb *ax25;
+	int err = 0;
+
+	memset(fsa, 0, sizeof(*fsa));
+	lock_sock(sk);
+	ax25 = ax25_sk(sk);
+
+	if (peer == 0) {
+		fsa->fsa_ax25.sax25_family = AF_AX25;
+		fsa->fsa_ax25.sax25_call   = ax25->source_addr;
+		fsa->fsa_ax25.sax25_ndigis = 1;
+		if (ax25->ax25_dev == NULL) {
+			fsa->fsa_digipeater[0] = null_ax25_address;
+		} else {
+			memcpy(&fsa->fsa_digipeater[0],
+			       ax25->ax25_dev->dev->dev_addr, AX25_ADDR_LEN);
+		}
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			err = -ENOTCONN;
+			goto out;
+		}
+
+		fsa->fsa_ax25.sax25_family = AF_AX25;
+		fsa->fsa_ax25.sax25_call   = ax25->dest_addr;
+
+		if (ax25->digipeat != NULL) {
+			count = ax25->digipeat->ndigi;
+			fsa->fsa_ax25.sax25_ndigis = count;
+			for (idx = 0; idx < count; idx++)
+				fsa->fsa_digipeater[idx] =
+						ax25->digipeat->calls[idx];
+		}
+	}
+	*uaddr_len = sizeof (struct full_sockaddr_ax25);
+
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Send one message on an AX.25 socket.
+ *
+ *	SEQPACKET sockets hand the data to ax25_output(); other socket
+ *	types build an AX.25 header in front of the data and transmit it
+ *	directly as a UI frame. An optional destination (and digipeater
+ *	path) may be supplied in msg->msg_name.
+ *
+ *	Returns the number of bytes accepted, or a negative errno:
+ *	-EINVAL (unsupported flags, bad address length, family or
+ *	digipeater count), -EADDRNOTAVAIL (socket not bound), -EPIPE,
+ *	-ENETUNREACH, -EMSGSIZE, -EISCONN, -ENOTCONN, -EFAULT, or the error
+ *	reported by sock_alloc_send_skb().
+ */
+static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t len)
+{
+	struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name;
+	struct sock *sk = sock->sk;
+	struct sockaddr_ax25 sax;
+	struct sk_buff *skb;
+	ax25_digi dtmp, *dp;
+	ax25_cb *ax25;
+	size_t size;
+	int lv, err, addr_len = msg->msg_namelen;
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	lock_sock(sk);
+	ax25 = ax25_sk(sk);
+
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		err = -EADDRNOTAVAIL;
+		goto out;
+	}
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		send_sig(SIGPIPE, current, 0);
+		err = -EPIPE;
+		goto out;
+	}
+
+	if (ax25->ax25_dev == NULL) {
+		err = -ENETUNREACH;
+		goto out;
+	}
+
+	if (len > ax25->ax25_dev->dev->mtu) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	if (usax != NULL) {
+		if (usax->sax25_family != AF_AX25) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * Accepted lengths: exactly sockaddr_ax25 (obsolete),
+		 * exactly full_sockaddr_ax25, or anything from the old
+		 * six-digipeater layout up to full_sockaddr_ax25.
+		 */
+		if (addr_len != sizeof(struct sockaddr_ax25) &&
+		    addr_len != sizeof(struct full_sockaddr_ax25)) {
+			if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) ||
+			    (addr_len > sizeof(struct full_sockaddr_ax25))) {
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		/*
+		 * No digipeater path unless one is parsed below. A short
+		 * sockaddr_ax25 carries no digipeater array, so a non-zero
+		 * sax25_ndigis in it must not make us use the (then
+		 * uninitialised) dtmp.
+		 */
+		dp = NULL;
+
+		if (addr_len > sizeof(struct sockaddr_ax25) && usax->sax25_ndigis != 0) {
+			int ct           = 0;
+			struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax;
+
+			/*
+			 * Valid number of digipeaters ? The count must also
+			 * fit in the address the caller actually supplied.
+			 */
+			if (usax->sax25_ndigis < 1 ||
+			    usax->sax25_ndigis > AX25_MAX_DIGIS ||
+			    addr_len < sizeof(struct sockaddr_ax25) +
+			    sizeof(ax25_address) * usax->sax25_ndigis) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			dtmp.ndigi      = usax->sax25_ndigis;
+
+			while (ct < usax->sax25_ndigis) {
+				dtmp.repeated[ct] = 0;
+				dtmp.calls[ct]    = fsa->fsa_digipeater[ct];
+				ct++;
+			}
+
+			dtmp.lastrepeat = 0;
+			dp = &dtmp;
+		}
+
+		sax = *usax;
+		/* A connected socket may only name its own peer */
+		if (sk->sk_type == SOCK_SEQPACKET &&
+		    ax25cmp(&ax25->dest_addr, &sax.sax25_call)) {
+			err = -EISCONN;
+			goto out;
+		}
+	} else {
+		/*
+		 *	FIXME: 1003.1g - if the socket is like this because
+		 *	it has become closed (not started closed) and is VC
+		 *	we ought to SIGPIPE, EPIPE
+		 */
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			err = -ENOTCONN;
+			goto out;
+		}
+		sax.sax25_family = AF_AX25;
+		sax.sax25_call   = ax25->dest_addr;
+		dp = ax25->digipeat;
+	}
+
+	/* Build a packet */
+	/* Assume the worst case */
+	size = len + ax25->ax25_dev->dev->hard_header_len;
+
+	skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto out;
+
+	/* Keep hard_header_len bytes of headroom in front of the data */
+	skb_reserve(skb, size - len);
+
+	/* User data follows immediately after the AX.25 data */
+	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+		err = -EFAULT;
+		kfree_skb(skb);
+		goto out;
+	}
+
+	skb_reset_network_header(skb);
+
+	/* Add the PID if one is not supplied by the user in the skb */
+	if (!ax25->pidincl)
+		*skb_push(skb, 1) = sk->sk_protocol;
+
+	if (sk->sk_type == SOCK_SEQPACKET) {
+		/* Connected mode sockets go via the LAPB machine */
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			kfree_skb(skb);
+			err = -ENOTCONN;
+			goto out;
+		}
+
+		/* Shove it onto the queue and kick */
+		ax25_output(ax25, ax25->paclen, skb);
+
+		err = len;
+		goto out;
+	}
+
+	/* Room for the address field plus one control byte */
+	skb_push(skb, 1 + ax25_addr_size(dp));
+
+	/* Build an AX.25 header */
+	lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call,
+			     dp, AX25_COMMAND, AX25_MODULUS);
+
+	skb_set_transport_header(skb, lv);
+
+	*skb_transport_header(skb) = AX25_UI;
+
+	/* Datagram frames go straight out of the door as UI */
+	ax25_queue_xmit(skb, ax25->ax25_dev->dev);
+
+	err = len;
+
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Receive one datagram from an AX.25 socket.
+ *
+ *	Unless the socket has pidincl set, the leading PID byte is removed
+ *	before the data is copied out. Data beyond 'size' is dropped and
+ *	MSG_TRUNC is set. When the caller supplied a name buffer, it is
+ *	filled with the sender's address and digipeater list as a
+ *	struct full_sockaddr_ax25.
+ *
+ *	Returns the number of bytes copied, -ENOTCONN for a SEQPACKET
+ *	socket that is not established, or the error from
+ *	skb_recv_datagram() / skb_copy_datagram_iovec().
+ */
+static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
+	struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int copied;
+	int err = 0;
+
+	lock_sock(sk);
+	/*
+	 * 	This works for seqpacket too. The receiver has ordered the
+	 *	queue for us! We do one quick check first though
+	 */
+	if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_ESTABLISHED) {
+		err =  -ENOTCONN;
+		goto out;
+	}
+
+	/* Now we can treat all alike */
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+				flags & MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto out;
+
+	if (!ax25_sk(sk)->pidincl)
+		skb_pull(skb, 1);		/* Remove PID */
+
+	skb_reset_transport_header(skb);
+	copied = skb->len;
+
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Do not report success if the copy to the caller failed */
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err < 0) {
+		skb_free_datagram(sk, skb);
+		goto out;
+	}
+
+	if (msg->msg_namelen != 0) {
+		struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name;
+		ax25_digi digi;
+		ax25_address src;
+		const unsigned char *mac = skb_mac_header(skb);
+
+		/*
+		 * msg_namelen below reports a complete full_sockaddr_ax25,
+		 * but only the family, call, digipeater count and the used
+		 * digipeater slots are written. Clear the whole structure
+		 * first so that no stale kernel memory is handed back.
+		 */
+		memset(sax, 0, sizeof(struct full_sockaddr_ax25));
+
+		ax25_addr_parse(mac + 1, skb->data - mac - 1, &src, NULL,
+				&digi, NULL, NULL);
+		sax->sax25_family = AF_AX25;
+		/* We set this correctly, even though we may not let the
+		   application know the digi calls further down (because it
+		   did NOT ask to know them).  This could get political... */
+		sax->sax25_ndigis = digi.ndigi;
+		sax->sax25_call   = src;
+
+		if (sax->sax25_ndigis != 0) {
+			int ct;
+			struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)sax;
+
+			for (ct = 0; ct < digi.ndigi; ct++)
+				fsa->fsa_digipeater[ct] = digi.calls[ct];
+		}
+		msg->msg_namelen = sizeof(struct full_sockaddr_ax25);
+	}
+
+	skb_free_datagram(sk, skb);
+	err = copied;
+
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	shutdown() is not implemented for AX.25 sockets: both arguments are
+ *	ignored and the call always fails with -EOPNOTSUPP.
+ */
+static int ax25_shutdown(struct socket *sk, int how)
+{
+	/* FIXME - generate DM and RNR states */
+	return -EOPNOTSUPP;
+}
+
+/*
+ *	ioctl handler for PF_AX25 sockets.
+ *
+ *	The socket lock is held for the whole call.  Returns 0 on success or
+ *	a negative errno:
+ *	  -EFAULT	a user pointer could not be read or written
+ *	  -EPERM	a privileged command was issued without CAP_NET_ADMIN
+ *	  -EINVAL	bad argument, or an interface ioctl that AX.25 rejects
+ *	  -ENOIOCTLCMD	the command is not handled here
+ *	Commands delegated to ax25_uid_ioctl(), ax25_rt_ioctl(),
+ *	ax25_ctl_ioctl() and ax25_fwd_ioctl() return whatever those return.
+ */
+static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	void __user *argp = (void __user *)arg;
+	int res = 0;
+
+	lock_sock(sk);
+	switch (cmd) {
+	case TIOCOUTQ: {
+		/* Free space left in the send buffer, clamped at zero. */
+		long amount;
+
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (amount < 0)
+			amount = 0;
+		res = put_user(amount, (int __user *)argp);
+		break;
+	}
+
+	case TIOCINQ: {
+		/* Length of the first queued receive buffer, 0 if none. */
+		struct sk_buff *skb;
+		long amount = 0L;
+		/* These two are safe on a single CPU system as only user tasks fiddle here */
+		if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+			amount = skb->len;
+		res = put_user(amount, (int __user *) argp);
+		break;
+	}
+
+	case SIOCGSTAMP:
+		res = sock_get_timestamp(sk, argp);
+		break;
+
+	case SIOCGSTAMPNS:
+		res = sock_get_timestampns(sk, argp);
+		break;
+
+	case SIOCAX25ADDUID:	/* Add a uid to the uid/call map table */
+	case SIOCAX25DELUID:	/* Delete a uid from the uid/call map table */
+	case SIOCAX25GETUID: {
+		struct sockaddr_ax25 sax25;
+		if (copy_from_user(&sax25, argp, sizeof(sax25))) {
+			res = -EFAULT;
+			break;
+		}
+		res = ax25_uid_ioctl(cmd, &sax25);
+		break;
+	}
+
+	case SIOCAX25NOUID: {	/* Set the default policy (default/bar) */
+		long amount;
+		if (!capable(CAP_NET_ADMIN)) {
+			res = -EPERM;
+			break;
+		}
+		if (get_user(amount, (long __user *)argp)) {
+			res = -EFAULT;
+			break;
+		}
+		/*
+		 * The value is signed, so reject negative policies as well
+		 * as those above the highest known one.
+		 */
+		if (amount < 0 || amount > AX25_NOUID_BLOCK) {
+			res = -EINVAL;
+			break;
+		}
+		ax25_uid_policy = amount;
+		res = 0;
+		break;
+	}
+
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCAX25OPTRT:
+		if (!capable(CAP_NET_ADMIN)) {
+			res = -EPERM;
+			break;
+		}
+		res = ax25_rt_ioctl(cmd, argp);
+		break;
+
+	case SIOCAX25CTLCON:
+		if (!capable(CAP_NET_ADMIN)) {
+			res = -EPERM;
+			break;
+		}
+		res = ax25_ctl_ioctl(cmd, argp);
+		break;
+
+	case SIOCAX25GETINFO:
+	case SIOCAX25GETINFOOLD: {
+		ax25_cb *ax25 = ax25_sk(sk);
+		struct ax25_info_struct ax25_info;
+
+		/* Timer values are reported in seconds, idle values in minutes. */
+		ax25_info.t1        = ax25->t1   / HZ;
+		ax25_info.t2        = ax25->t2   / HZ;
+		ax25_info.t3        = ax25->t3   / HZ;
+		ax25_info.idle      = ax25->idle / (60 * HZ);
+		ax25_info.n2        = ax25->n2;
+		ax25_info.t1timer   = ax25_display_timer(&ax25->t1timer)   / HZ;
+		ax25_info.t2timer   = ax25_display_timer(&ax25->t2timer)   / HZ;
+		ax25_info.t3timer   = ax25_display_timer(&ax25->t3timer)   / HZ;
+		ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ);
+		ax25_info.n2count   = ax25->n2count;
+		ax25_info.state     = ax25->state;
+		ax25_info.rcv_q     = sk_rmem_alloc_get(sk);
+		ax25_info.snd_q     = sk_wmem_alloc_get(sk);
+		ax25_info.vs        = ax25->vs;
+		ax25_info.vr        = ax25->vr;
+		ax25_info.va        = ax25->va;
+		ax25_info.vs_max    = ax25->vs; /* reserved */
+		ax25_info.paclen    = ax25->paclen;
+		ax25_info.window    = ax25->window;
+
+		/* old structure? */
+		if (cmd == SIOCAX25GETINFOOLD) {
+			static int warned = 0;
+			if (!warned) {
+				printk(KERN_INFO "%s uses old SIOCAX25GETINFO\n",
+					current->comm);
+				warned=1;
+			}
+
+			/* Only the leading, deprecated-sized part is copied out. */
+			if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct_deprecated))) {
+				res = -EFAULT;
+				break;
+			}
+		} else {
+			/* A faulting user pointer is -EFAULT, as in the old branch. */
+			if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct))) {
+				res = -EFAULT;
+				break;
+			}
+		}
+		res = 0;
+		break;
+	}
+
+	case SIOCAX25ADDFWD:
+	case SIOCAX25DELFWD: {
+		struct ax25_fwd_struct ax25_fwd;
+		if (!capable(CAP_NET_ADMIN)) {
+			res = -EPERM;
+			break;
+		}
+		if (copy_from_user(&ax25_fwd, argp, sizeof(ax25_fwd))) {
+			res = -EFAULT;
+			break;
+		}
+		res = ax25_fwd_ioctl(cmd, &ax25_fwd);
+		break;
+	}
+
+	/* Interface address ioctls are explicitly rejected on AX.25 sockets. */
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+		res = -EINVAL;
+		break;
+
+	default:
+		res = -ENOIOCTLCMD;
+		break;
+	}
+	release_sock(sk);
+
+	return res;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ *	seq_file start callback for the "ax25" proc entry: takes
+ *	ax25_list_lock with bottom halves disabled and returns the ax25_list
+ *	entry at position *pos (as found by seq_hlist_start()).
+ */
+static void *ax25_info_start(struct seq_file *seq, loff_t *pos)
+	__acquires(ax25_list_lock)
+{
+	spin_lock_bh(&ax25_list_lock);
+	return seq_hlist_start(&ax25_list, *pos);
+}
+
+/*
+ *	seq_file next callback: advances from entry v to the following
+ *	ax25_list entry via seq_hlist_next(), which also updates *pos.
+ */
+static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &ax25_list, pos);
+}
+
+/*
+ *	seq_file stop callback: drops ax25_list_lock and re-enables bottom
+ *	halves.
+ */
+static void ax25_info_stop(struct seq_file *seq, void *v)
+	__releases(ax25_list_lock)
+{
+	spin_unlock_bh(&ax25_list_lock);
+}
+
+/*
+ *	seq_file show callback: prints one line describing the AX.25 control
+ *	block that contains hlist node v.  Always returns 0.
+ */
+static int ax25_info_show(struct seq_file *seq, void *v)
+{
+	ax25_cb *ax25 = hlist_entry(v, struct ax25_cb, ax25_node);
+	char buf[11];	/* scratch space for ax2asc(), reused for every call */
+	int k;
+
+
+	/*
+	 * New format:
+	 * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode
+	 */
+
+	/* "magic" is the control block address; "*" marks iamdigi. */
+	seq_printf(seq, "%8.8lx %s %s%s ",
+		   (long) ax25,
+		   ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
+		   ax2asc(buf, &ax25->source_addr),
+		   ax25->iamdigi? "*":"");
+	seq_printf(seq, "%s", ax2asc(buf, &ax25->dest_addr));
+
+	/* Digipeater path, each entry suffixed with "*" once repeated. */
+	for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) {
+		seq_printf(seq, ",%s%s",
+			   ax2asc(buf, &ax25->digipeat->calls[k]),
+			   ax25->digipeat->repeated[k]? "*":"");
+	}
+
+	/*
+	 * Each timer is printed as a pair: the value from
+	 * ax25_display_timer() followed by the configured value.  T1-T3 and
+	 * rtt are divided by HZ, the idle values by 60 * HZ.
+	 */
+	seq_printf(seq, " %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %d %d",
+		   ax25->state,
+		   ax25->vs, ax25->vr, ax25->va,
+		   ax25_display_timer(&ax25->t1timer) / HZ, ax25->t1 / HZ,
+		   ax25_display_timer(&ax25->t2timer) / HZ, ax25->t2 / HZ,
+		   ax25_display_timer(&ax25->t3timer) / HZ, ax25->t3 / HZ,
+		   ax25_display_timer(&ax25->idletimer) / (60 * HZ),
+		   ax25->idle / (60 * HZ),
+		   ax25->n2count, ax25->n2,
+		   ax25->rtt / HZ,
+		   ax25->window,
+		   ax25->paclen);
+
+	/* Queue sizes and inode exist only when a socket is attached. */
+	if (ax25->sk != NULL) {
+		seq_printf(seq, " %d %d %lu\n",
+			   sk_wmem_alloc_get(ax25->sk),
+			   sk_rmem_alloc_get(ax25->sk),
+			   sock_i_ino(ax25->sk));
+	} else {
+		seq_puts(seq, " * * *\n");
+	}
+	return 0;
+}
+
+/* seq_file iterator over ax25_list, one output line per control block. */
+static const struct seq_operations ax25_info_seqops = {
+	.start = ax25_info_start,
+	.next = ax25_info_next,
+	.stop = ax25_info_stop,
+	.show = ax25_info_show,
+};
+
+/* open() for the "ax25" proc entry: attaches the seq_file iterator. */
+static int ax25_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ax25_info_seqops);
+}
+
+/* File operations of the "ax25" proc entry (registered in ax25_init()). */
+static const struct file_operations ax25_info_fops = {
+	.owner = THIS_MODULE,
+	.open = ax25_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#endif
+
+/* Address family descriptor: PF_AX25 sockets are created by ax25_create(). */
+static const struct net_proto_family ax25_family_ops = {
+	.family =	PF_AX25,
+	.create =	ax25_create,
+	.owner	=	THIS_MODULE,
+};
+
+/*
+ *	Socket operations for PF_AX25.  socketpair, mmap and sendpage use the
+ *	generic sock_no_*() stubs; poll uses the generic datagram_poll().
+ */
+static const struct proto_ops ax25_proto_ops = {
+	.family		= PF_AX25,
+	.owner		= THIS_MODULE,
+	.release	= ax25_release,
+	.bind		= ax25_bind,
+	.connect	= ax25_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= ax25_accept,
+	.getname	= ax25_getname,
+	.poll		= datagram_poll,
+	.ioctl		= ax25_ioctl,
+	.listen		= ax25_listen,
+	.shutdown	= ax25_shutdown,
+	.setsockopt	= ax25_setsockopt,
+	.getsockopt	= ax25_getsockopt,
+	.sendmsg	= ax25_sendmsg,
+	.recvmsg	= ax25_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+/*
+ *	Packet handler for ETH_P_AX25 frames: they are delivered to
+ *	ax25_kiss_rcv().  Added with dev_add_pack() in ax25_init().
+ */
+static struct packet_type ax25_packet_type __read_mostly = {
+	.type	=	cpu_to_be16(ETH_P_AX25),
+	.func	=	ax25_kiss_rcv,
+};
+
+/* Netdevice notifier: device events are routed to ax25_device_event(). */
+static struct notifier_block ax25_dev_notifier = {
+	.notifier_call =ax25_device_event,
+};
+
+/*
+ *	Module initialisation.  Registers the AX.25 protocol, then the socket
+ *	family, packet handler, netdevice notifier, sysctl tables and the
+ *	three proc entries.
+ *
+ *	Returns the result of proto_register(); if that fails nothing else is
+ *	registered.  The results of the later registration calls are not
+ *	checked.
+ */
+static int __init ax25_init(void)
+{
+	int rc = proto_register(&ax25_proto, 0);
+
+	if (rc != 0)
+		goto out;
+
+	sock_register(&ax25_family_ops);
+	dev_add_pack(&ax25_packet_type);
+	register_netdevice_notifier(&ax25_dev_notifier);
+	ax25_register_sysctl();
+
+	proc_net_fops_create(&init_net, "ax25_route", S_IRUGO, &ax25_route_fops);
+	proc_net_fops_create(&init_net, "ax25", S_IRUGO, &ax25_info_fops);
+	proc_net_fops_create(&init_net, "ax25_calls", S_IRUGO, &ax25_uid_fops);
+out:
+	return rc;
+}
+module_init(ax25_init);
+
+
+MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The amateur radio AX.25 link layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_AX25);
+
+/*
+ *	Module teardown: removes the proc entries, unregisters the notifier,
+ *	sysctl tables, packet handler, socket family and protocol, and finally
+ *	frees the routing, uid and device tables.
+ */
+static void __exit ax25_exit(void)
+{
+	proc_net_remove(&init_net, "ax25_route");
+	proc_net_remove(&init_net, "ax25");
+	proc_net_remove(&init_net, "ax25_calls");
+
+	unregister_netdevice_notifier(&ax25_dev_notifier);
+	ax25_unregister_sysctl();
+
+	dev_remove_pack(&ax25_packet_type);
+
+	sock_unregister(PF_AX25);
+	proto_unregister(&ax25_proto);
+
+	ax25_rt_free();
+	ax25_uid_free();
+	ax25_dev_free();
+}
+module_exit(ax25_exit);
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
new file mode 100644
index 00000000..7e7964dd
--- /dev/null
+++ b/net/ax25/ax25_addr.c
@@ -0,0 +1,306 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * The default broadcast address of an interface is QST-0; the default address
+ * is LINUX-1.  The null address is defined as a callsign of all spaces with
+ * an SSID of zero.
+ *
+ * Each address is seven bytes: six callsign characters (space padded)
+ * followed by the SSID, with every byte stored shifted left by one bit.
+ */
+
+const ax25_address ax25_bcast =
+	{{'Q' << 1, 'S' << 1, 'T' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}};
+const ax25_address ax25_defaddr =
+	{{'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, 1 << 1}};
+const ax25_address null_ax25_address =
+	{{' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}};
+
+EXPORT_SYMBOL_GPL(ax25_bcast);
+EXPORT_SYMBOL_GPL(ax25_defaddr);
+EXPORT_SYMBOL(null_ax25_address);
+
+/*
+ *	Convert an AX.25 address into its printable "CALL-SSID" form.
+ *
+ *	buf receives the text and must hold at least 10 bytes (six callsign
+ *	characters, '-', up to two SSID digits and the terminator).  Spaces in
+ *	the callsign are dropped.  Returns buf, or the constant string "*"
+ *	when the callsign part came out empty.
+ */
+char *ax2asc(char *buf, const ax25_address *a)
+{
+	char *out = buf;
+	int ssid;
+	int i;
+
+	/* Callsign characters are stored shifted left by one bit. */
+	for (i = 0; i < 6; i++) {
+		char ch = (a->ax25_call[i] >> 1) & 0x7F;
+
+		if (ch == ' ')
+			continue;
+		*out++ = ch;
+	}
+
+	*out++ = '-';
+
+	/* SSID is 0..15; emit the leading '1' by hand for 10..15. */
+	ssid = (a->ax25_call[6] >> 1) & 0x0F;
+	if (ssid > 9) {
+		*out++ = '1';
+		ssid -= 10;
+	}
+
+	*out++ = ssid + '0';
+	*out++ = '\0';
+
+	if (*buf == '\0' || *buf == '-')
+		return "*";
+
+	return buf;
+
+}
+
+EXPORT_SYMBOL(ax2asc);
+
+/*
+ *	Convert a printable callsign ("CALL", "CALL-N" or "CALL-NN") into an
+ *	AX.25 address.
+ *
+ *	Up to six callsign characters are taken from the string and padded
+ *	with spaces; every byte is stored shifted left by one bit.  A missing
+ *	or empty SSID yields SSID 0.  The SSID digits are not validated; only
+ *	the low four bits of the resulting number are kept.
+ *
+ *	The string is never read beyond its terminating NUL.
+ */
+void asc2ax(ax25_address *addr, const char *callsign)
+{
+	const char *s;
+	int n;
+
+	for (s = callsign, n = 0; n < 6; n++) {
+		if (*s != '\0' && *s != '-')
+			addr->ax25_call[n] = *s++;
+		else
+			addr->ax25_call[n] = ' ';
+		addr->ax25_call[n] <<= 1;
+		addr->ax25_call[n] &= 0xFE;
+	}
+
+	/* No separator at all: plain callsign, SSID 0. */
+	if (*s == '\0') {
+		addr->ax25_call[6] = 0x00;
+		return;
+	}
+
+	/* Step over the separator character. */
+	s++;
+
+	/*
+	 * Separator with nothing after it ("CALL-"): treat as SSID 0 rather
+	 * than consuming the terminator and reading the byte behind it.
+	 */
+	if (*s == '\0') {
+		addr->ax25_call[6] = 0x00;
+		return;
+	}
+
+	addr->ax25_call[6] = *s++ - '0';
+
+	if (*s != '\0') {
+		addr->ax25_call[6] *= 10;
+		addr->ax25_call[6] += *s++ - '0';
+	}
+
+	addr->ax25_call[6] <<= 1;
+	addr->ax25_call[6] &= 0x1E;
+}
+
+EXPORT_SYMBOL(asc2ax);
+
+/*
+ *	Compare two AX.25 addresses.
+ *
+ *	Returns 0 when callsign and SSID match, 1 when the callsigns differ
+ *	and 2 when only the SSID differs (partial match).  The low bit of each
+ *	callsign byte and the non-SSID bits of the last byte are ignored.
+ */
+int ax25cmp(const ax25_address *a, const ax25_address *b)
+{
+	int i;
+
+	/* Callsign: compare everything but the low (repeater) bit. */
+	for (i = 0; i < 6; i++) {
+		if ((a->ax25_call[i] ^ b->ax25_call[i]) & 0xFE)
+			return 1;
+	}
+
+	/* SSID without control bits; a mismatch is only a partial match. */
+	if ((a->ax25_call[6] ^ b->ax25_call[6]) & 0x1E)
+		return 2;
+
+	return 0;
+}
+
+EXPORT_SYMBOL(ax25cmp);
+
+/*
+ *	Compare two AX.25 digipeater paths.
+ *
+ *	Returns 0 when both paths have the same length, the same lastrepeat
+ *	index and pairwise equal callsigns (per ax25cmp()); 1 otherwise.
+ */
+int ax25digicmp(const ax25_digi *digi1, const ax25_digi *digi2)
+{
+	int idx = 0;
+
+	if (digi1->ndigi != digi2->ndigi ||
+	    digi1->lastrepeat != digi2->lastrepeat)
+		return 1;
+
+	while (idx < digi1->ndigi) {
+		if (ax25cmp(&digi1->calls[idx], &digi2->calls[idx]))
+			return 1;
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ *	Given an AX.25 address field, pull off the to and from addresses, the
+ *	digipeater list, the command/response type and the start of the data.
+ *
+ *	buf/len	address field and the number of bytes available in it
+ *	src,dest	filled with the from/to addresses; either may be NULL
+ *	digi	filled with the digipeater list; dereferenced
+ *		unconditionally, so unlike the others it must not be NULL
+ *	flags	set to AX25_COMMAND, AX25_RESPONSE or 0; may be NULL
+ *	dama	set non-zero when the DAMA flag bit in the source SSID byte
+ *		is clear; may be NULL
+ *
+ *	Returns a pointer to the first byte after the address field, or NULL
+ *	when the field is too short or lists more than AX25_MAX_DIGIS
+ *	digipeaters.
+ */
+const unsigned char *ax25_addr_parse(const unsigned char *buf, int len,
+	ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags,
+	int *dama)
+{
+	int d = 0;
+
+	/* Need at least the destination and source addresses. */
+	if (len < 14) return NULL;
+
+	if (flags != NULL) {
+		*flags = 0;
+
+		/* buf[6] is the destination SSID byte, buf[13] the source one. */
+		if (buf[6] & AX25_CBIT)
+			*flags = AX25_COMMAND;
+		if (buf[13] & AX25_CBIT)
+			*flags = AX25_RESPONSE;
+	}
+
+	if (dama != NULL)
+		*dama = ~buf[13] & AX25_DAMA_FLAG;
+
+	/* Copy to, from */
+	if (dest != NULL)
+		memcpy(dest, buf + 0, AX25_ADDR_LEN);
+	if (src != NULL)
+		memcpy(src,  buf + 7, AX25_ADDR_LEN);
+
+	buf += 2 * AX25_ADDR_LEN;
+	len -= 2 * AX25_ADDR_LEN;
+
+	digi->lastrepeat = -1;
+	digi->ndigi      = 0;
+
+	/* The last address in the field has AX25_EBIT set in its SSID byte. */
+	while (!(buf[-1] & AX25_EBIT)) {
+		if (d >= AX25_MAX_DIGIS)  return NULL;	/* Max of 6 digis */
+		if (len < 7) return NULL;	/* Short packet */
+
+		memcpy(&digi->calls[d], buf, AX25_ADDR_LEN);
+		digi->ndigi = d + 1;
+
+		/* AX25_HBIT marks a digipeater that has already repeated. */
+		if (buf[6] & AX25_HBIT) {
+			digi->repeated[d] = 1;
+			digi->lastrepeat  = d;
+		} else {
+			digi->repeated[d] = 0;
+		}
+
+		buf += AX25_ADDR_LEN;
+		len -= AX25_ADDR_LEN;
+		d++;
+	}
+
+	return buf;
+}
+
+/*
+ *	Assemble an AX.25 header from the bits
+ *
+ *	Writes destination, source and (optionally) digipeater addresses to
+ *	buf, which must have room for AX25_ADDR_LEN * (2 + d->ndigi) bytes.
+ *
+ *	flag	AX25_COMMAND sets the C bit on the destination, AX25_RESPONSE
+ *		sets it on the source
+ *	modulus	selects which spare-bit pattern is stored in the source SSID
+ *	d	digipeater list; NULL or ndigi == 0 means none
+ *
+ *	Returns the number of bytes written.
+ */
+int ax25_addr_build(unsigned char *buf, const ax25_address *src,
+	const ax25_address *dest, const ax25_digi *d, int flag, int modulus)
+{
+	int len = 0;
+	int ct  = 0;
+
+	/* Destination address: clear E and C bits, set the spare bits. */
+	memcpy(buf, dest, AX25_ADDR_LEN);
+	buf[6] &= ~(AX25_EBIT | AX25_CBIT);
+	buf[6] |= AX25_SSSID_SPARE;
+
+	if (flag == AX25_COMMAND) buf[6] |= AX25_CBIT;
+
+	buf += AX25_ADDR_LEN;
+	len += AX25_ADDR_LEN;
+
+	/* Source address: spare bits depend on the modulus in use. */
+	memcpy(buf, src, AX25_ADDR_LEN);
+	buf[6] &= ~(AX25_EBIT | AX25_CBIT);
+	buf[6] &= ~AX25_SSSID_SPARE;
+
+	if (modulus == AX25_MODULUS)
+		buf[6] |= AX25_SSSID_SPARE;
+	else
+		buf[6] |= AX25_ESSID_SPARE;
+
+	if (flag == AX25_RESPONSE) buf[6] |= AX25_CBIT;
+
+	/*
+	 *	Fast path the normal digiless path
+	 */
+	if (d == NULL || d->ndigi == 0) {
+		buf[6] |= AX25_EBIT;
+		return 2 * AX25_ADDR_LEN;
+	}
+
+	buf += AX25_ADDR_LEN;
+	len += AX25_ADDR_LEN;
+
+	while (ct < d->ndigi) {
+		memcpy(buf, &d->calls[ct], AX25_ADDR_LEN);
+
+		/* H bit mirrors the repeated[] flag of this digipeater. */
+		if (d->repeated[ct])
+			buf[6] |= AX25_HBIT;
+		else
+			buf[6] &= ~AX25_HBIT;
+
+		buf[6] &= ~AX25_EBIT;
+		buf[6] |= AX25_SSSID_SPARE;
+
+		buf += AX25_ADDR_LEN;
+		len += AX25_ADDR_LEN;
+		ct++;
+	}
+
+	/* Mark the last digipeater as the end of the address field. */
+	buf[-1] |= AX25_EBIT;
+
+	return len;
+}
+
+/*
+ *	Size in bytes of the address field for a frame using digipeater list
+ *	dp: source and destination plus one address per digipeater.  A NULL
+ *	dp counts as an empty list.
+ */
+int ax25_addr_size(const ax25_digi *dp)
+{
+	int addresses = 2;
+
+	if (dp != NULL)
+		addresses += dp->ndigi;
+
+	return AX25_ADDR_LEN * addresses;
+}
+
+/*
+ *	Reverse Digipeat List. May not pass both parameters as same struct
+ *
+ *	out receives the calls of in in reverse order.  out->lastrepeat is
+ *	computed as in->ndigi - in->lastrepeat - 2, and every entry of out up
+ *	to and including that index is flagged as repeated (both in
+ *	repeated[] and in the H bit of the stored call); the rest are cleared.
+ */
+void ax25_digi_invert(const ax25_digi *in, ax25_digi *out)
+{
+	int ct;
+
+	out->ndigi      = in->ndigi;
+	out->lastrepeat = in->ndigi - in->lastrepeat - 2;
+
+	/* Invert the digipeaters */
+	for (ct = 0; ct < in->ndigi; ct++) {
+		out->calls[ct] = in->calls[in->ndigi - ct - 1];
+
+		if (ct <= out->lastrepeat) {
+			out->calls[ct].ax25_call[6] |= AX25_HBIT;
+			out->repeated[ct]            = 1;
+		} else {
+			out->calls[ct].ax25_call[6] &= ~AX25_HBIT;
+			out->repeated[ct]            = 0;
+		}
+	}
+}
+
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
new file mode 100644
index 00000000..c1cb982f
--- /dev/null
+++ b/net/ax25/ax25_dev.c
@@ -0,0 +1,205 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/spinlock.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+/*
+ * Singly linked list (via ->next) of the per-device AX.25 structures.  The
+ * functions in this file walk and modify it with ax25_dev_lock held.
+ */
+ax25_dev *ax25_dev_list;
+DEFINE_SPINLOCK(ax25_dev_lock);
+
+/*
+ *	Find the AX.25 device whose hardware address equals addr (as judged
+ *	by ax25cmp()).  Returns NULL when no device matches.  The whole list
+ *	is always scanned, so with several matches the one furthest down the
+ *	list is returned.
+ */
+ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
+{
+	ax25_dev *found = NULL;
+	ax25_dev *cur;
+
+	spin_lock_bh(&ax25_dev_lock);
+	cur = ax25_dev_list;
+	while (cur != NULL) {
+		/* No early exit: a later match overrides an earlier one. */
+		if (!ax25cmp(addr, (ax25_address *)cur->dev->dev_addr))
+			found = cur;
+		cur = cur->next;
+	}
+	spin_unlock_bh(&ax25_dev_lock);
+
+	return found;
+}
+
+/*
+ *	This is called when an interface is brought up. These are
+ *	reasonable defaults.
+ *
+ *	Allocates a zeroed ax25_dev for dev, takes a reference on dev, fills
+ *	in the default parameter values and pushes the new entry onto the
+ *	head of ax25_dev_list.  The sysctl tables are unregistered before and
+ *	registered again after the change.  On allocation failure an error is
+ *	logged and nothing else happens.
+ */
+void ax25_dev_device_up(struct net_device *dev)
+{
+	ax25_dev *ax25_dev;
+
+	if ((ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_ATOMIC)) == NULL) {
+		printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n");
+		return;
+	}
+
+	ax25_unregister_sysctl();
+
+	dev->ax25_ptr     = ax25_dev;
+	ax25_dev->dev     = dev;
+	dev_hold(dev);
+	ax25_dev->forward = NULL;
+
+	ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE;
+	ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE;
+	ax25_dev->values[AX25_VALUES_BACKOFF]   = AX25_DEF_BACKOFF;
+	ax25_dev->values[AX25_VALUES_CONMODE]   = AX25_DEF_CONMODE;
+	ax25_dev->values[AX25_VALUES_WINDOW]    = AX25_DEF_WINDOW;
+	ax25_dev->values[AX25_VALUES_EWINDOW]   = AX25_DEF_EWINDOW;
+	ax25_dev->values[AX25_VALUES_T1]        = AX25_DEF_T1;
+	ax25_dev->values[AX25_VALUES_T2]        = AX25_DEF_T2;
+	ax25_dev->values[AX25_VALUES_T3]        = AX25_DEF_T3;
+	ax25_dev->values[AX25_VALUES_IDLE]	= AX25_DEF_IDLE;
+	ax25_dev->values[AX25_VALUES_N2]        = AX25_DEF_N2;
+	ax25_dev->values[AX25_VALUES_PACLEN]	= AX25_DEF_PACLEN;
+	ax25_dev->values[AX25_VALUES_PROTOCOL]  = AX25_DEF_PROTOCOL;
+	ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT;
+
+#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
+	ax25_ds_setup_timer(ax25_dev);
+#endif
+
+	spin_lock_bh(&ax25_dev_lock);
+	ax25_dev->next = ax25_dev_list;
+	ax25_dev_list  = ax25_dev;
+	spin_unlock_bh(&ax25_dev_lock);
+
+	ax25_register_sysctl();
+}
+
+/*
+ *	Called when an interface goes down: tears down the AX.25 state
+ *	attached to dev.
+ *
+ *	Stops the DAMA slave timer (when configured), drops any forwarding
+ *	that points at dev, unlinks the device's entry from ax25_dev_list and
+ *	frees it, releasing the reference on dev taken in
+ *	ax25_dev_device_up().  dev->ax25_ptr is always cleared so that no
+ *	pointer to the freed structure stays reachable through the device.
+ *	The sysctl tables are unregistered before and registered again after
+ *	the change.  Does nothing if dev has no AX.25 state.
+ */
+void ax25_dev_device_down(struct net_device *dev)
+{
+	ax25_dev *s, *ax25_dev;
+	int unlinked = 0;
+
+	if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+		return;
+
+	ax25_unregister_sysctl();
+
+	spin_lock_bh(&ax25_dev_lock);
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	ax25_ds_del_timer(ax25_dev);
+#endif
+
+	/*
+	 *	Remove any packet forwarding that points to this device.
+	 */
+	for (s = ax25_dev_list; s != NULL; s = s->next)
+		if (s->forward == dev)
+			s->forward = NULL;
+
+	/* Unlink the entry, whether it is the head or further down. */
+	if (ax25_dev_list == ax25_dev) {
+		ax25_dev_list = ax25_dev->next;
+		unlinked = 1;
+	} else {
+		for (s = ax25_dev_list; s != NULL && s->next != NULL; s = s->next) {
+			if (s->next == ax25_dev) {
+				s->next = ax25_dev->next;
+				unlinked = 1;
+				break;
+			}
+		}
+	}
+	spin_unlock_bh(&ax25_dev_lock);
+
+	/*
+	 * Clear the back pointer on every path; leaving it set after the
+	 * kfree() below would hand out a dangling pointer.
+	 */
+	dev->ax25_ptr = NULL;
+
+	/* Only an entry that was really on the list is released and freed. */
+	if (unlinked) {
+		dev_put(dev);
+		kfree(ax25_dev);
+	}
+
+	ax25_register_sysctl();
+}
+
+/*
+ *	Add or delete packet forwarding between two AX.25 ports.
+ *
+ *	SIOCAX25ADDFWD makes the port addressed by fwd->port_from forward to
+ *	the port addressed by fwd->port_to; SIOCAX25DELFWD removes the
+ *	forwarding from port_from.
+ *
+ *	Returns 0 on success, -EINVAL when a port is unknown, when adding over
+ *	an existing forward, when deleting a forward that does not exist, or
+ *	for any other command.
+ */
+int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
+{
+	ax25_dev *from, *to;
+
+	from = ax25_addr_ax25dev(&fwd->port_from);
+	if (from == NULL)
+		return -EINVAL;
+
+	if (cmd == SIOCAX25ADDFWD) {
+		to = ax25_addr_ax25dev(&fwd->port_to);
+		if (to == NULL || from->forward != NULL)
+			return -EINVAL;
+		from->forward = to->dev;
+		return 0;
+	}
+
+	if (cmd == SIOCAX25DELFWD) {
+		if (from->forward == NULL)
+			return -EINVAL;
+		from->forward = NULL;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ *	Return the device that frames for dev should go out on: the
+ *	configured forward device if there is one, otherwise dev itself.
+ */
+struct net_device *ax25_fwd_dev(struct net_device *dev)
+{
+	ax25_dev *port = ax25_dev_ax25dev(dev);
+
+	if (port != NULL && port->forward != NULL)
+		return port->forward;
+
+	return dev;
+}
+
+/*
+ *	Free all memory associated with device structures.
+ *
+ *	Walks ax25_dev_list under ax25_dev_lock, drops the reference held on
+ *	each net_device, frees every entry and leaves the list empty.
+ */
+void __exit ax25_dev_free(void)
+{
+	ax25_dev *cur, *next;
+
+	spin_lock_bh(&ax25_dev_lock);
+	for (cur = ax25_dev_list; cur != NULL; cur = next) {
+		/* Fetch the successor before the entry is freed. */
+		next = cur->next;
+		dev_put(cur->dev);
+		kfree(cur);
+	}
+	ax25_dev_list = NULL;
+	spin_unlock_bh(&ax25_dev_lock);
+}
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
new file mode 100644
index 00000000..8273b120
--- /dev/null
+++ b/net/ax25/ax25_ds_in.c
@@ -0,0 +1,303 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ *	State machine for state 1, Awaiting Connection State.
+ *	The handling of the timer(s) is in file ax25_ds_timer.c.
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	frametype is the decoded frame type and pf its poll/final bit; skb
+ *	and type are not used in this state.  Always returns 0.
+ */
+static int ax25_ds_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+	switch (frametype) {
+	case AX25_SABM:
+		/* Peer asks for a connection too: adopt its modulus, answer UA. */
+		ax25->modulus = AX25_MODULUS;
+		ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		break;
+
+	case AX25_SABME:
+		ax25->modulus = AX25_EMODULUS;
+		ax25->window  =  ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE);
+		break;
+
+	case AX25_UA:
+		/* Our request was accepted: reset the link and enter state 3. */
+		ax25_calculate_rtt(ax25);
+		ax25_stop_t1timer(ax25);
+		ax25_start_t3timer(ax25);
+		ax25_start_idletimer(ax25);
+		ax25->vs      = 0;
+		ax25->va      = 0;
+		ax25->vr      = 0;
+		ax25->state   = AX25_STATE_3;
+		ax25->n2count = 0;
+		if (ax25->sk != NULL) {
+			bh_lock_sock(ax25->sk);
+			ax25->sk->sk_state = TCP_ESTABLISHED;
+			/*
+			 * For WAIT_SABM connections we will produce an accept
+			 * ready socket here
+			 */
+			if (!sock_flag(ax25->sk, SOCK_DEAD))
+				ax25->sk->sk_state_change(ax25->sk);
+			bh_unlock_sock(ax25->sk);
+		}
+		ax25_dama_on(ax25);
+
+		/* according to DK4EG's spec we are required to
+		 * send a RR RESPONSE FINAL NR=0.
+		 */
+
+		ax25_std_enquiry_response(ax25);
+		break;
+
+	case AX25_DM:
+		/* Connection refused, but only when the final bit is set. */
+		if (pf)
+			ax25_disconnect(ax25, ECONNREFUSED);
+		break;
+
+	default:
+		/* Any other polled frame: repeat the connection request. */
+		if (pf)
+			ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ *	State machine for state 2, Awaiting Release State.
+ *	The handling of the timer(s) is in file ax25_ds_timer.c
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	frametype is the decoded frame type and pf its poll/final bit; skb
+ *	and type are not used in this state.  Always returns 0.
+ */
+static int ax25_ds_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+	switch (frametype) {
+	case AX25_SABM:
+	case AX25_SABME:
+		/* We are releasing: answer a connect attempt with DISC. */
+		ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+		ax25_dama_off(ax25);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_dama_off(ax25);
+		ax25_disconnect(ax25, 0);
+		break;
+
+	case AX25_DM:
+	case AX25_UA:
+		/* Release confirmed, but only when the final bit is set. */
+		if (pf) {
+			ax25_dama_off(ax25);
+			ax25_disconnect(ax25, 0);
+		}
+		break;
+
+	case AX25_I:
+	case AX25_REJ:
+	case AX25_RNR:
+	case AX25_RR:
+		/* Polled data/supervisory frame: repeat the DISC. */
+		if (pf) {
+			ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+			ax25_dama_off(ax25);
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ *	State machine for state 3, Connected State.
+ *	The handling of the timer(s) is in file ax25_timer.c
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	frametype, ns, nr and pf come from the decoded frame; type is the
+ *	command/response indication of the frame.  Returns the result of
+ *	ax25_rx_iframe() when an in-sequence I frame was passed up, 0 in all
+ *	other cases.
+ */
+static int ax25_ds_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type)
+{
+	int queued = 0;
+
+	switch (frametype) {
+	case AX25_SABM:
+	case AX25_SABME:
+		/* Link reset by the peer: acknowledge and restart from zero. */
+		if (frametype == AX25_SABM) {
+			ax25->modulus   = AX25_MODULUS;
+			ax25->window    = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		} else {
+			ax25->modulus   = AX25_EMODULUS;
+			ax25->window    = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+		}
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_stop_t1timer(ax25);
+		ax25_start_t3timer(ax25);
+		ax25_start_idletimer(ax25);
+		ax25->condition = 0x00;
+		ax25->vs        = 0;
+		ax25->va        = 0;
+		ax25->vr        = 0;
+		ax25_requeue_frames(ax25);
+		ax25_dama_on(ax25);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_dama_off(ax25);
+		ax25_disconnect(ax25, 0);
+		break;
+
+	case AX25_DM:
+		ax25_dama_off(ax25);
+		ax25_disconnect(ax25, ECONNRESET);
+		break;
+
+	case AX25_RR:
+	case AX25_RNR:
+		/* RR clears the peer-busy condition, RNR sets it. */
+		if (frametype == AX25_RR)
+			ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+		else
+			ax25->condition |= AX25_COND_PEER_RX_BUSY;
+
+		if (ax25_validate_nr(ax25, nr)) {
+			if (ax25_check_iframes_acked(ax25, nr))
+				ax25->n2count=0;
+			if (type == AX25_COMMAND && pf)
+				ax25_ds_enquiry_response(ax25);
+		} else {
+			/* Invalid N(R): re-establish the link, back to state 1. */
+			ax25_ds_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+		}
+		break;
+
+	case AX25_REJ:
+		ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+
+		if (ax25_validate_nr(ax25, nr)) {
+			if (ax25->va != nr)
+				ax25->n2count=0;
+
+			ax25_frames_acked(ax25, nr);
+			ax25_calculate_rtt(ax25);
+			ax25_stop_t1timer(ax25);
+			ax25_start_t3timer(ax25);
+			ax25_requeue_frames(ax25);
+
+			if (type == AX25_COMMAND && pf)
+				ax25_ds_enquiry_response(ax25);
+		} else {
+			ax25_ds_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+		}
+		break;
+
+	case AX25_I:
+		if (!ax25_validate_nr(ax25, nr)) {
+			ax25_ds_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+			break;
+		}
+		/* Process the acknowledgement carried in N(R). */
+		if (ax25->condition & AX25_COND_PEER_RX_BUSY) {
+			ax25_frames_acked(ax25, nr);
+			ax25->n2count = 0;
+		} else {
+			if (ax25_check_iframes_acked(ax25, nr))
+				ax25->n2count = 0;
+		}
+		/* While we are busy the data is dropped; only polls are answered. */
+		if (ax25->condition & AX25_COND_OWN_RX_BUSY) {
+			if (pf) ax25_ds_enquiry_response(ax25);
+			break;
+		}
+		if (ns == ax25->vr) {
+			/* In-sequence frame: advance V(R) and pass the data up. */
+			ax25->vr = (ax25->vr + 1) % ax25->modulus;
+			queued = ax25_rx_iframe(ax25, skb);
+			/* Busy after ax25_rx_iframe(): undo the V(R) increment. */
+			if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+				ax25->vr = ns;	/* ax25->vr - 1 */
+			ax25->condition &= ~AX25_COND_REJECT;
+			if (pf) {
+				ax25_ds_enquiry_response(ax25);
+			} else {
+				/* Not polled: delay the acknowledgement via T2. */
+				if (!(ax25->condition & AX25_COND_ACK_PENDING)) {
+					ax25->condition |= AX25_COND_ACK_PENDING;
+					ax25_start_t2timer(ax25);
+				}
+			}
+		} else {
+			/* Out-of-sequence frame. */
+			if (ax25->condition & AX25_COND_REJECT) {
+				if (pf) ax25_ds_enquiry_response(ax25);
+			} else {
+				ax25->condition |= AX25_COND_REJECT;
+				ax25_ds_enquiry_response(ax25);
+				ax25->condition &= ~AX25_COND_ACK_PENDING;
+			}
+		}
+		break;
+
+	case AX25_FRMR:
+	case AX25_ILLEGAL:
+		ax25_ds_establish_data_link(ax25);
+		ax25->state = AX25_STATE_1;
+		break;
+
+	default:
+		break;
+	}
+
+	return queued;
+}
+
+/*
+ *	Higher level upcall for a LAPB frame
+ *
+ *	Decodes the frame with ax25_decode() and hands it to the state
+ *	machine for the connection's current state (1, 2 or 3).  In any other
+ *	state the frame is ignored.  Returns the value from the state machine
+ *	that ran, or 0 if none did.
+ */
+int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
+{
+	int queued = 0, frametype, ns, nr, pf;
+
+	frametype = ax25_decode(ax25, skb, &ns, &nr, &pf);
+
+	switch (ax25->state) {
+	case AX25_STATE_1:
+		queued = ax25_ds_state1_machine(ax25, skb, frametype, pf, type);
+		break;
+	case AX25_STATE_2:
+		queued = ax25_ds_state2_machine(ax25, skb, frametype, pf, type);
+		break;
+	case AX25_STATE_3:
+		queued = ax25_ds_state3_machine(ax25, skb, frametype, ns, nr, pf, type);
+		break;
+	}
+
+	return queued;
+}
+
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
new file mode 100644
index 00000000..85816e61
--- /dev/null
+++ b/net/ax25/ax25_ds_subr.c
@@ -0,0 +1,211 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/spinlock.h>
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ *	Recovery from an invalid N(R): simply re-establishes the data link.
+ *	The callers in the DAMA state machine set the state to AX25_STATE_1
+ *	themselves afterwards.
+ */
+void ax25_ds_nr_error_recovery(ax25_cb *ax25)
+{
+	ax25_ds_establish_data_link(ax25);
+}
+
+/*
+ *	dl1bke 960114: transmit I frames on DAMA poll
+ *
+ *	Answers a poll on connection ax25 and then services every other
+ *	connection on the same device.  The walk over ax25_list runs with
+ *	ax25_list_lock held.
+ */
+void ax25_ds_enquiry_response(ax25_cb *ax25)
+{
+	ax25_cb *ax25o;
+	struct hlist_node *node;
+
+	/* Please note that neither DK4EG's nor DG2FEF's
+	 * DAMA spec mention the following behaviour as seen
+	 * with TheFirmware:
+	 *
+	 * 	DB0ACH->DL1BKE <RR C P R0> [DAMA]
+	 *	DL1BKE->DB0ACH <I NR=0 NS=0>
+	 *	DL1BKE-7->DB0PRA-6 DB0ACH <I C S3 R5>
+	 *	DL1BKE->DB0ACH <RR R F R0>
+	 *
+	 * The Flexnet DAMA Master implementation apparently
+	 * insists on the "proper" AX.25 behaviour:
+	 *
+	 * 	DB0ACH->DL1BKE <RR C P R0> [DAMA]
+	 *	DL1BKE->DB0ACH <RR R F R0>
+	 *	DL1BKE->DB0ACH <I NR=0 NS=0>
+	 *	DL1BKE-7->DB0PRA-6 DB0ACH <I C S3 R5>
+	 *
+	 * Flexnet refuses to send us *any* I frame if we send
+	 * a REJ in case AX25_COND_REJECT is set. It is superfluous in
+	 * this mode anyway (a RR or RNR invokes the retransmission).
+	 * Is this a Flexnet bug?
+	 */
+
+	ax25_std_enquiry_response(ax25);
+
+	/* Peer can receive: requeue the unacknowledged frames and send. */
+	if (!(ax25->condition & AX25_COND_PEER_RX_BUSY)) {
+		ax25_requeue_frames(ax25);
+		ax25_kick(ax25);
+	}
+
+	/*
+	 * Connecting, disconnecting or still holding unacknowledged frames:
+	 * run the T1 timeout handler; otherwise reset the retry counter.
+	 */
+	if (ax25->state == AX25_STATE_1 || ax25->state == AX25_STATE_2 || skb_peek(&ax25->ack_queue) != NULL)
+		ax25_ds_t1_timeout(ax25);
+	else
+		ax25->n2count = 0;
+
+	ax25_start_t3timer(ax25);
+	ax25_ds_set_timer(ax25->ax25_dev);
+
+	spin_lock(&ax25_list_lock);
+	ax25_for_each(ax25o, node, &ax25_list) {
+		if (ax25o == ax25)
+			continue;
+
+		/* Only connections on the polled device take part. */
+		if (ax25o->ax25_dev != ax25->ax25_dev)
+			continue;
+
+		if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2) {
+			ax25_ds_t1_timeout(ax25o);
+			continue;
+		}
+
+		if (!(ax25o->condition & AX25_COND_PEER_RX_BUSY) && ax25o->state == AX25_STATE_3) {
+			ax25_requeue_frames(ax25o);
+			ax25_kick(ax25o);
+		}
+
+		if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2 || skb_peek(&ax25o->ack_queue) != NULL)
+			ax25_ds_t1_timeout(ax25o);
+
+		/* do not start T3 for listening sockets (tnx DD8NE) */
+
+		if (ax25o->state != AX25_STATE_0)
+			ax25_start_t3timer(ax25o);
+	}
+	spin_unlock(&ax25_list_lock);
+}
+
+/*
+ *	Prepare connection ax25 for (re-)establishing the data link: clears
+ *	every condition flag except AX25_COND_DAMA_MODE, resets the retry
+ *	counter, recalculates and starts T1, stops T2 and starts T3.  No frame
+ *	is sent and the state is not changed here.
+ */
+void ax25_ds_establish_data_link(ax25_cb *ax25)
+{
+	ax25->condition &= AX25_COND_DAMA_MODE;
+	ax25->n2count    = 0;
+	ax25_calculate_t1(ax25);
+	ax25_start_t1timer(ax25);
+	ax25_stop_t2timer(ax25);
+	ax25_start_t3timer(ax25);
+}
+
+/*
+ *	:::FIXME:::
+ *	This is a kludge. Not all drivers recognize KISS commands.
+ *	We need a driver-level request to switch duplex mode, that does
+ *	either SCC changing, PI config or KISS as required. Currently
+ *	this request isn't reliable.
+ *
+ *	Builds a two-byte packet holding cmd and param and queues it for
+ *	transmission on the device.  Silently does nothing when the device
+ *	pointer is NULL or the buffer cannot be allocated.
+ */
+static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char param)
+{
+	struct sk_buff *skb;
+	unsigned char *p;
+
+	if (ax25_dev->dev == NULL)
+		return;
+
+	if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL)
+		return;
+
+	skb_reset_network_header(skb);
+	p = skb_put(skb, 2);
+
+	*p++ = cmd;
+	*p++ = param;
+
+	skb->protocol = ax25_type_trans(skb, ax25_dev->dev);
+
+	dev_queue_xmit(skb);
+}
+
+/*
+ *	Counting the DAMA connections wrongly causes nasty problems,
+ *	especially when connections on the device already existed and our
+ *	network node (or the sysop) decides to turn on DAMA Master mode.
+ *	The "real" slave connections are therefore flagged individually and
+ *	this function is asked on every disconnect whether any are left.
+ *
+ *	Returns 1 if some connection on ax25_dev has AX25_COND_DAMA_MODE set
+ *	and is in a state above AX25_STATE_1, 0 otherwise.
+ */
+static int ax25_check_dama_slave(ax25_dev *ax25_dev)
+{
+	ax25_cb *cb;
+	struct hlist_node *pos;
+	int found = 0;
+
+	spin_lock(&ax25_list_lock);
+	ax25_for_each(cb, pos, &ax25_list) {
+		if (cb->ax25_dev != ax25_dev)
+			continue;
+		if (!(cb->condition & AX25_COND_DAMA_MODE))
+			continue;
+		if (cb->state <= AX25_STATE_1)
+			continue;
+
+		found = 1;
+		break;
+	}
+	spin_unlock(&ax25_list_lock);
+
+	return found;
+}
+
+/*
+ *	Put a device into DAMA slave mode.  The KISS command (5, 1) is only
+ *	sent on the transition from "not slave"; the slave timer is re-armed
+ *	on every call.  A NULL device is ignored.
+ */
+static void ax25_dev_dama_on(ax25_dev *ax25_dev)
+{
+	if (!ax25_dev)
+		return;
+
+	if (!ax25_dev->dama.slave)
+		ax25_kiss_cmd(ax25_dev, 5, 1);
+
+	ax25_dev->dama.slave = 1;
+	ax25_ds_set_timer(ax25_dev);
+}
+
+/*
+ *	Take a device out of DAMA slave mode, but only if it is in slave mode
+ *	and ax25_check_dama_slave() finds no remaining slave connection on it.
+ *	Sends the KISS command (5, 0), clears the slave flag and deletes the
+ *	slave timer.  A NULL device is ignored.
+ */
+void ax25_dev_dama_off(ax25_dev *ax25_dev)
+{
+	if (!ax25_dev)
+		return;
+
+	if (!ax25_dev->dama.slave)
+		return;
+
+	if (ax25_check_dama_slave(ax25_dev))
+		return;
+
+	ax25_kiss_cmd(ax25_dev, 5, 0);
+	ax25_dev->dama.slave = 0;
+	ax25_ds_del_timer(ax25_dev);
+}
+
+/*
+ *	Mark connection ax25 as a DAMA slave connection and make sure its
+ *	device is in DAMA slave mode.
+ */
+void ax25_dama_on(ax25_cb *ax25)
+{
+	ax25_dev_dama_on(ax25->ax25_dev);
+	ax25->condition |= AX25_COND_DAMA_MODE;
+}
+
+/*
+ *	Clear the DAMA slave mark on connection ax25, then let the device
+ *	leave DAMA slave mode if no slave connection remains.  The flag is
+ *	cleared first so that ax25_check_dama_slave() no longer counts this
+ *	connection.
+ */
+void ax25_dama_off(ax25_cb *ax25)
+{
+	ax25->condition &= ~AX25_COND_DAMA_MODE;
+	ax25_dev_dama_off(ax25->ax25_dev);
+}
+
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
new file mode 100644
index 00000000..c7d81436
--- /dev/null
+++ b/net/ax25/ax25_ds_timer.c
@@ -0,0 +1,238 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/tcp_states.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static void ax25_ds_timeout(unsigned long);
+
+/*
+ *	Add DAMA slave timeout timer to timer list.
+ *	Unlike the connection based timers the timeout function gets
+ *	triggered every second. Please note that NET_AX25_DAMA_SLAVE_TIMEOUT
+ *	(aka /proc/sys/net/ax25/{dev}/dama_slave_timeout) is still in
+ *	1/10th of a second.
+ */
+
+void ax25_ds_setup_timer(ax25_dev *ax25_dev)
+{
+	setup_timer(&ax25_dev->dama.slave_timer, ax25_ds_timeout,
+		    (unsigned long)ax25_dev);	/* the device is handed to the handler as its argument */
+}
+
+void ax25_ds_del_timer(ax25_dev *ax25_dev)
+{
+	if (ax25_dev)	/* a NULL device is silently ignored */
+		del_timer(&ax25_dev->dama.slave_timer);
+}
+
+void ax25_ds_set_timer(ax25_dev *ax25_dev)
+{
+	if (ax25_dev == NULL)		/* paranoia */
+		return;
+
+	ax25_dev->dama.slave_timeout =	/* reloads the countdown on every call */
+		msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10;
+	mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ);	/* next tick in one second */
+}
+
+/*
+ *	DAMA Slave Timeout
+ *	Silently discard all (slave) connections in case our master forgot us...
+ *	Runs once a second; dama.slave_timeout is the remaining tick count and
+ *	a value of zero means that the countdown is not running.
+ */
+static void ax25_ds_timeout(unsigned long arg)
+{
+	ax25_dev *ax25_dev = (struct ax25_dev *) arg;
+	ax25_cb *ax25;
+	struct hlist_node *node;
+
+	if (ax25_dev == NULL || !ax25_dev->dama.slave)
+		return;			/* Yikes! */
+
+	if (!ax25_dev->dama.slave_timeout || --ax25_dev->dama.slave_timeout) {
+		/* only re-arm: ax25_ds_set_timer() would reload the countdown */
+		mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ);
+		return;
+	}
+	spin_lock(&ax25_list_lock);
+	ax25_for_each(ax25, node, &ax25_list) {
+		if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE))
+			continue;
+		ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+		ax25_disconnect(ax25, ETIMEDOUT);
+	}
+	spin_unlock(&ax25_list_lock);
+
+	ax25_dev_dama_off(ax25_dev);
+}
+
+void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
+{
+	struct sock *sk = ax25->sk;
+	int reap;
+
+	if (sk != NULL)
+		bh_lock_sock(sk);
+
+	switch (ax25->state) {
+	case AX25_STATE_0:
+		/*
+		 * A link that dies on a listen()ing socket before it was
+		 * accept()ed is not 'dead', so it does not get removed.
+		 */
+		reap = !sk || sock_flag(sk, SOCK_DESTROY) ||
+		       (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD));
+		if (!reap)
+			break;
+
+		if (sk == NULL) {
+			ax25_destroy_socket(ax25);
+			return;
+		}
+
+		/* take a reference across the destroy, drop it after unlocking */
+		sock_hold(sk);
+		ax25_destroy_socket(ax25);
+		bh_unlock_sock(sk);
+		sock_put(sk);
+		return;
+
+	case AX25_STATE_3:
+		/* Receive buffer below half of sk_rcvbuf: stop being busy. */
+		if (sk != NULL &&
+		    atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf >> 1) &&
+		    (ax25->condition & AX25_COND_OWN_RX_BUSY)) {
+			ax25->condition &= ~AX25_COND_OWN_RX_BUSY;
+			ax25->condition &= ~AX25_COND_ACK_PENDING;
+		}
+		break;
+	}
+
+	if (sk != NULL)
+		bh_unlock_sock(sk);
+
+	ax25_start_heartbeat(ax25);
+}
+
+/* dl1bke 960114: T3 works much like the IDLE timeout, but
+ *                gets reloaded with every frame for this
+ *		  connection.
+ */
+void ax25_ds_t3timer_expiry(ax25_cb *ax25)
+{
+	ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);	/* DISC command with poll set */
+	ax25_dama_off(ax25);	/* drop the DAMA flag of this connection */
+	ax25_disconnect(ax25, ETIMEDOUT);	/* the loss is reported as a timeout */
+}
+
+/* dl1bke 960228: close the connection when IDLE expires.
+ *		  unlike T3 this timer gets reloaded only on
+ *		  I frames.
+ */
+void ax25_ds_idletimer_expiry(ax25_cb *ax25)
+{
+	/* Flush the queues and put the link into state 2 with T1 running. */
+	ax25_clear_queues(ax25);
+	ax25->n2count = 0;
+	ax25->state = AX25_STATE_2;
+	ax25_calculate_t1(ax25);
+	ax25_start_t1timer(ax25);
+	ax25_stop_t3timer(ax25);
+
+	if (!ax25->sk)
+		return;
+
+	bh_lock_sock(ax25->sk);
+	ax25->sk->sk_state = TCP_CLOSE;
+	ax25->sk->sk_err = 0;
+	ax25->sk->sk_shutdown |= SEND_SHUTDOWN;
+	if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+		ax25->sk->sk_state_change(ax25->sk);
+		sock_set_flag(ax25->sk, SOCK_DEAD);
+	}
+	bh_unlock_sock(ax25->sk);
+}
+
+/* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC
+ *                within the poll of any connected channel. Remember
+ *                that we are not allowed to send anything unless we
+ *                get polled by the Master.
+ *
+ *                Thus we'll have to do parts of our T1 handling in
+ *                ax25_enquiry_response().
+ */
+void ax25_ds_t1_timeout(ax25_cb *ax25)
+{
+	const int exhausted = (ax25->n2count == ax25->n2);
+
+	switch (ax25->state) {
+	case AX25_STATE_1:
+		if (!exhausted) {
+			/* repeat the connect request that matches the modulus */
+			ax25->n2count++;
+			ax25_send_control(ax25, ax25->modulus == AX25_MODULUS ?
+					  AX25_SABM : AX25_SABME, AX25_POLLOFF, AX25_COMMAND);
+			break;
+		}
+		if (ax25->modulus == AX25_MODULUS) {
+			ax25_disconnect(ax25, ETIMEDOUT);
+			return;
+		}
+		/* retries used up with another modulus: retry with AX25_MODULUS */
+		ax25->modulus = AX25_MODULUS;
+		ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		ax25->n2count = 0;
+		ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND);
+		break;
+
+	case AX25_STATE_2:
+		if (exhausted) {
+			ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+			ax25_disconnect(ax25, ETIMEDOUT);
+			return;
+		}
+		ax25->n2count++;
+		break;
+
+	case AX25_STATE_3:
+		if (exhausted) {
+			ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE);
+			ax25_disconnect(ax25, ETIMEDOUT);
+			return;
+		}
+		ax25->n2count++;
+		break;
+	}
+
+	/* no disconnect happened: recompute and restart T1 */
+	ax25_calculate_t1(ax25);
+	ax25_start_t1timer(ax25);
+}
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
new file mode 100644
index 00000000..60b545e2
--- /dev/null
+++ b/net/ax25/ax25_iface.c
@@ -0,0 +1,219 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static struct ax25_protocol *protocol_list;	/* singly linked list of registered PID handlers */
+static DEFINE_RWLOCK(protocol_list_lock);	/* guards protocol_list */
+
+static HLIST_HEAD(ax25_linkfail_list);		/* registered link failure callbacks */
+static DEFINE_SPINLOCK(linkfail_lock);		/* guards ax25_linkfail_list */
+
+static struct listen_struct {
+	struct listen_struct *next;
+	ax25_address  callsign;
+	struct net_device *dev;		/* NULL matches any device in ax25_listen_mine() */
+} *listen_list = NULL;			/* singly linked list of callsigns listened for */
+static DEFINE_SPINLOCK(listen_lock);		/* guards listen_list */
+
+/*
+ * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT,
+ * AX25_P_IP or AX25_P_ARP ...
+ */
+void ax25_register_pid(struct ax25_protocol *ap)
+{
+	write_lock_bh(&protocol_list_lock);
+	ap->next = protocol_list;	/* push onto the head; duplicates are not checked */
+	protocol_list = ap;
+	write_unlock_bh(&protocol_list_lock);
+}
+
+EXPORT_SYMBOL_GPL(ax25_register_pid);
+
+/*
+ * Unlink the first protocol handler registered for @pid, if there is one.
+ * The entry itself is not freed here and its ->next pointer is left
+ * untouched.
+ */
+void ax25_protocol_release(unsigned int pid)
+{
+	struct ax25_protocol **link;
+
+	write_lock_bh(&protocol_list_lock);
+
+	/*
+	 * Iterate over the link fields instead of the entries, which
+	 * removes the special case for the head of the list.
+	 */
+	for (link = &protocol_list; *link != NULL; link = &(*link)->next) {
+		if ((*link)->pid != pid)
+			continue;
+
+		*link = (*link)->next;
+		break;
+	}
+
+	write_unlock_bh(&protocol_list_lock);
+}
+
+EXPORT_SYMBOL(ax25_protocol_release);
+
+void ax25_linkfail_register(struct ax25_linkfail *lf)
+{
+	spin_lock_bh(&linkfail_lock);
+	hlist_add_head(&lf->lf_node, &ax25_linkfail_list);	/* lf->func gets called by ax25_link_failed() */
+	spin_unlock_bh(&linkfail_lock);
+}
+
+EXPORT_SYMBOL(ax25_linkfail_register);
+
+void ax25_linkfail_release(struct ax25_linkfail *lf)
+{
+	spin_lock_bh(&linkfail_lock);
+	hlist_del_init(&lf->lf_node);	/* unlink the callback; lf itself is not freed here */
+	spin_unlock_bh(&linkfail_lock);
+}
+
+EXPORT_SYMBOL(ax25_linkfail_release);
+
+int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
+{
+	struct listen_struct *entry;
+
+	/* A callsign that is already listened for is reported as success. */
+	if (ax25_listen_mine(callsign, dev))
+		return 0;
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+	entry->callsign = *callsign;
+	entry->dev      = dev;
+
+	spin_lock_bh(&listen_lock);
+	entry->next = listen_list;
+	listen_list = entry;
+	spin_unlock_bh(&listen_lock);
+	return 0;
+}
+
+EXPORT_SYMBOL(ax25_listen_register);
+
+/*
+ * Remove the first listen entry that matches both @callsign and @dev
+ * exactly and free it. Nothing happens when no such entry exists.
+ */
+void ax25_listen_release(ax25_address *callsign, struct net_device *dev)
+{
+	struct listen_struct *victim = NULL;
+	struct listen_struct **link;
+
+	spin_lock_bh(&listen_lock);
+
+	/*
+	 * Iterate over the link fields instead of the entries, which
+	 * removes the special case for the head of the list.
+	 */
+	for (link = &listen_list; *link != NULL; link = &(*link)->next) {
+		struct listen_struct *cur = *link;
+
+		if (ax25cmp(&cur->callsign, callsign) != 0 || cur->dev != dev)
+			continue;
+
+		victim = cur;
+		*link = cur->next;
+		break;
+	}
+
+	spin_unlock_bh(&listen_lock);
+
+	/* freeing is done outside the lock; kfree(NULL) is a no-op */
+	kfree(victim);
+}
+
+EXPORT_SYMBOL(ax25_listen_release);
+
+int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *)
+{
+	int (*handler)(struct sk_buff *, ax25_cb *) = NULL;
+	struct ax25_protocol *entry;
+
+	/* Returns the handler of the first entry registered for pid, or NULL. */
+	read_lock(&protocol_list_lock);
+	entry = protocol_list;
+	while (entry != NULL && entry->pid != pid)
+		entry = entry->next;
+	if (entry != NULL)
+		handler = entry->func;
+	read_unlock(&protocol_list_lock);
+	return handler;
+}
+
+int ax25_listen_mine(ax25_address *callsign, struct net_device *dev)
+{
+	struct listen_struct *entry;
+	int mine = 0;
+
+	/* An entry registered with a NULL device matches on every device. */
+	spin_lock_bh(&listen_lock);
+	for (entry = listen_list; entry != NULL && !mine; entry = entry->next) {
+		if (ax25cmp(&entry->callsign, callsign) != 0)
+			continue;
+		mine = (entry->dev == dev || entry->dev == NULL);
+	}
+	spin_unlock_bh(&listen_lock);
+	return mine;
+}
+
+void ax25_link_failed(ax25_cb *ax25, int reason)
+{
+	struct ax25_linkfail *lf;
+	struct hlist_node *node;
+
+	spin_lock_bh(&linkfail_lock);	/* the callbacks below run with this lock held */
+	hlist_for_each_entry(lf, node, &ax25_linkfail_list, lf_node)
+		lf->func(ax25, reason);	/* notify every registered listener */
+	spin_unlock_bh(&linkfail_lock);
+}
+
+int ax25_protocol_is_registered(unsigned int pid)
+{
+	struct ax25_protocol *entry;
+	int found;
+
+	/* Returns 1 when some handler is registered for pid, else 0. */
+	read_lock_bh(&protocol_list_lock);
+	entry = protocol_list;
+	while (entry != NULL && entry->pid != pid)
+		entry = entry->next;
+	found = (entry != NULL);
+	read_unlock_bh(&protocol_list_lock);
+
+	return found;
+}
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
new file mode 100644
index 00000000..9bb77654
--- /dev/null
+++ b/net/ax25/ax25_in.c
@@ -0,0 +1,456 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ *	Given a fragment, queue it on the fragment queue and if the fragment
+ *	is complete, send it back to ax25_rx_iframe.
+ */
+static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb)
+{
+	struct sk_buff *skbn, *skbo;
+
+	if (ax25->fragno != 0) {	/* a reassembly is already in progress */
+		if (!(*skb->data & AX25_SEG_FIRST)) {
+			if ((ax25->fragno - 1) == (*skb->data & AX25_SEG_REM)) {	/* the next expected segment */
+				/* Enqueue fragment */
+				ax25->fragno = *skb->data & AX25_SEG_REM;
+				skb_pull(skb, 1);	/* skip fragno */
+				ax25->fraglen += skb->len;
+				skb_queue_tail(&ax25->frag_queue, skb);
+
+				/* Last fragment received ? */
+				if (ax25->fragno == 0) {
+					skbn = alloc_skb(AX25_MAX_HEADER_LEN +
+							 ax25->fraglen,
+							 GFP_ATOMIC);
+					if (!skbn) {
+						skb_queue_purge(&ax25->frag_queue);	/* all queued segments are dropped */
+						return 1;
+					}
+
+					skb_reserve(skbn, AX25_MAX_HEADER_LEN);
+
+					skbn->dev   = ax25->ax25_dev->dev;
+					skb_reset_network_header(skbn);
+					skb_reset_transport_header(skbn);
+
+					/* Copy data from the fragments */
+					while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) {
+						skb_copy_from_linear_data(skbo,
+							  skb_put(skbn, skbo->len),
+									  skbo->len);
+						kfree_skb(skbo);
+					}
+
+					ax25->fraglen = 0;
+
+					if (ax25_rx_iframe(ax25, skbn) == 0)
+						kfree_skb(skbn);	/* the reassembled frame was not queued */
+				}
+
+				return 1;	/* the segment was consumed */
+			}
+		}
+	} else {
+		/* First fragment received */
+		if (*skb->data & AX25_SEG_FIRST) {
+			skb_queue_purge(&ax25->frag_queue);
+			ax25->fragno = *skb->data & AX25_SEG_REM;
+			skb_pull(skb, 1);		/* skip fragno */
+			ax25->fraglen = skb->len;
+			skb_queue_tail(&ax25->frag_queue, skb);
+			return 1;
+		}
+	}
+
+	return 0;	/* skb was not queued here */
+}
+
+/*
+ *	This is where all valid I frames are sent to, to be dispatched to
+ *	whichever protocol requires them.
+ */
+int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
+{
+	int (*handler)(struct sk_buff *, ax25_cb *);
+	struct sk_buff *copy;
+	unsigned char pid;
+
+	if (!skb)
+		return 0;
+	ax25_start_idletimer(ax25);
+
+	pid = skb->data[0];	/* the frame starts with the protocol id */
+	if (pid == AX25_P_IP) {
+		/*
+		 * Work around a TCP bug to keep additional listeners happy:
+		 * TCP re-uses the buffer and destroys the original content,
+		 * so pass up a private copy whenever one can be allocated.
+		 */
+		copy = skb_copy(skb, GFP_ATOMIC);
+		if (copy) {
+			kfree_skb(skb);
+			skb = copy;
+		}
+
+		skb_pull(skb, 1);	/* strip the PID */
+		skb->mac_header = skb->network_header;
+		skb_reset_network_header(skb);
+		skb->dev = ax25->ax25_dev->dev;
+		skb->pkt_type = PACKET_HOST;
+		skb->protocol = htons(ETH_P_IP);
+		netif_rx(skb);
+		return 1;
+	}
+
+	if (pid == AX25_P_SEGMENT) {
+		skb_pull(skb, 1);	/* strip the PID */
+		return ax25_rx_fragment(ax25, skb);
+	}
+
+	handler = ax25_protocol_function(pid);
+	if (handler) {
+		skb_pull(skb, 1);	/* strip the PID */
+		return handler(skb, ax25);
+	}
+
+	/* No handler took the PID: offer the frame to the owning socket. */
+	if (!ax25->sk || ax25->ax25_dev->values[AX25_VALUES_CONMODE] != 2)
+		return 0;
+	if (!ax25->pidincl && ax25->sk->sk_protocol != pid)
+		return 0;
+	if (sock_queue_rcv_skb(ax25->sk, skb) == 0)
+		return 1;
+	ax25->condition |= AX25_COND_OWN_RX_BUSY;
+	return 0;
+}
+
+/*
+ *	Higher level upcall for a LAPB frame
+ */
+static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, int dama)
+{
+	/* Frames for a control block in state 0 are never queued. */
+	if (ax25->state == AX25_STATE_0)
+		return 0;
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		return ax25_std_frame_in(ax25, skb, type);
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		/* DAMA handling needs a DAMA frame or a device in slave mode */
+		if (dama || ax25->ax25_dev->dama.slave)
+			return ax25_ds_frame_in(ax25, skb, type);
+		return ax25_std_frame_in(ax25, skb, type);
+#endif
+	default:
+		break;
+	}
+
+	/* any other protocol setting: the frame is not queued */
+	return 0;
+}
+
+static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
+	ax25_address *dev_addr, struct packet_type *ptype)
+{
+	ax25_address src, dest, *next_digi = NULL;
+	int type = 0, mine = 0, dama;
+	struct sock *make, *sk;
+	ax25_digi dp, reverse_dp;
+	ax25_cb *ax25;
+	ax25_dev *ax25_dev;
+
+	/*
+	 *	Process the AX.25/LAPB frame. Every path returns 0; the skb
+	 *	is freed below unless it was queued or passed on.
+	 */
+
+	skb_reset_transport_header(skb);
+
+	if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+		goto free;
+
+	/*
+	 *	Parse the address header.
+	 */
+
+	if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL)
+		goto free;
+
+	/*
+	 *	Ours perhaps ?
+	 */
+	if (dp.lastrepeat + 1 < dp.ndigi)		/* Not yet digipeated completely */
+		next_digi = &dp.calls[dp.lastrepeat + 1];
+
+	/*
+	 *	Pull off the AX.25 headers leaving the CTRL/PID bytes
+	 */
+	skb_pull(skb, ax25_addr_size(&dp));
+
+	/* For our port addresses ? */
+	if (ax25cmp(&dest, dev_addr) == 0 && dp.lastrepeat + 1 == dp.ndigi)
+		mine = 1;
+
+	/* Also match on any registered callsign from L3/4 */
+	if (!mine && ax25_listen_mine(&dest, dev) && dp.lastrepeat + 1 == dp.ndigi)
+		mine = 1;
+
+	/* UI frame - bypass LAPB processing */
+	if ((*skb->data & ~0x10) == AX25_UI && dp.lastrepeat + 1 == dp.ndigi) {
+		skb_set_transport_header(skb, 2); /* skip control and pid */
+
+		ax25_send_to_raw(&dest, skb, skb->data[1]);
+
+		if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0)
+			goto free;
+
+		/* Now we are pointing at the pid byte */
+		switch (skb->data[1]) {
+		case AX25_P_IP:
+			skb_pull(skb,2);		/* drop PID/CTRL */
+			skb_reset_transport_header(skb);
+			skb_reset_network_header(skb);
+			skb->dev      = dev;
+			skb->pkt_type = PACKET_HOST;
+			skb->protocol = htons(ETH_P_IP);
+			netif_rx(skb);
+			break;
+
+		case AX25_P_ARP:
+			skb_pull(skb,2);		/* drop PID/CTRL */
+			skb_reset_transport_header(skb);
+			skb_reset_network_header(skb);
+			skb->dev      = dev;
+			skb->pkt_type = PACKET_HOST;
+			skb->protocol = htons(ETH_P_ARP);
+			netif_rx(skb);
+			break;
+		case AX25_P_TEXT:
+			/* Now find a suitable dgram socket */
+			sk = ax25_get_socket(&dest, &src, SOCK_DGRAM);
+			if (sk != NULL) {
+				bh_lock_sock(sk);
+				if (atomic_read(&sk->sk_rmem_alloc) >=
+				    sk->sk_rcvbuf) {
+					kfree_skb(skb);	/* receive buffer is full */
+				} else {
+					/*
+					 *	Remove the control and PID.
+					 */
+					skb_pull(skb, 2);
+					if (sock_queue_rcv_skb(sk, skb) != 0)
+						kfree_skb(skb);
+				}
+				bh_unlock_sock(sk);
+				sock_put(sk);
+			} else {
+				kfree_skb(skb);
+			}
+			break;
+
+		default:
+			kfree_skb(skb);	/* Will scan SOCK_AX25 RAW sockets */
+			break;
+		}
+
+		return 0;
+	}
+
+	/*
+	 *	Is connected mode supported on this device ?
+	 *	If not, should we DM the incoming frame (except DMs) or
+	 *	silently ignore them. For now we stay quiet.
+	 */
+	if (ax25_dev->values[AX25_VALUES_CONMODE] == 0)
+		goto free;
+
+	/* LAPB */
+
+	/* AX.25 state 1-4 */
+
+	ax25_digi_invert(&dp, &reverse_dp);
+
+	if ((ax25 = ax25_find_cb(&dest, &src, &reverse_dp, dev)) != NULL) {
+		/*
+		 *	Process the frame. If it is queued up internally it
+		 *	returns one otherwise we free it immediately. This
+		 *	routine itself wakes the user context layers so we do
+		 *	no further work
+		 */
+		if (ax25_process_rx_frame(ax25, skb, type, dama) == 0)
+			kfree_skb(skb);
+
+		ax25_cb_put(ax25);
+		return 0;
+	}
+
+	/* AX.25 state 0 (disconnected) */
+
+	/* a) received not a SABM(E) */
+
+	if ((*skb->data & ~AX25_PF) != AX25_SABM &&
+	    (*skb->data & ~AX25_PF) != AX25_SABME) {
+		/*
+		 *	Never reply to a DM. Also ignore any connects for
+		 *	addresses that are not our interfaces and not a socket.
+		 */
+		if ((*skb->data & ~AX25_PF) != AX25_DM && mine)
+			ax25_return_dm(dev, &src, &dest, &dp);
+
+		goto free;
+	}
+
+	/* b) received SABM(E) */
+
+	if (dp.lastrepeat + 1 == dp.ndigi)
+		sk = ax25_find_listener(&dest, 0, dev, SOCK_SEQPACKET);
+	else
+		sk = ax25_find_listener(next_digi, 1, dev, SOCK_SEQPACKET);
+
+	if (sk != NULL) {
+		bh_lock_sock(sk);
+		if (sk_acceptq_is_full(sk) ||
+		    (make = ax25_make_new(sk, ax25_dev)) == NULL) {
+			if (mine)
+				ax25_return_dm(dev, &src, &dest, &dp);
+			kfree_skb(skb);
+			bh_unlock_sock(sk);
+			sock_put(sk);
+
+			return 0;
+		}
+
+		ax25 = ax25_sk(make);
+		skb_set_owner_r(skb, make);
+		skb_queue_head(&sk->sk_receive_queue, skb);	/* the SABM(E) is queued on the listener */
+
+		make->sk_state = TCP_ESTABLISHED;
+
+		sk->sk_ack_backlog++;
+		bh_unlock_sock(sk);
+	} else {
+		if (!mine)
+			goto free;
+
+		if ((ax25 = ax25_create_cb()) == NULL) {
+			ax25_return_dm(dev, &src, &dest, &dp);
+			goto free;
+		}
+
+		ax25_fillin_cb(ax25, ax25_dev);
+	}
+
+	ax25->source_addr = dest;	/* our end is the frame's destination */
+	ax25->dest_addr   = src;
+
+	/*
+	 *	Sort out any digipeated paths.
+	 */
+	if (dp.ndigi && !ax25->digipeat &&
+	    (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
+		kfree_skb(skb);	/* NOTE(review): with a listener the skb is already on sk_receive_queue here - confirm */
+		ax25_destroy_socket(ax25);
+		if (sk)
+			sock_put(sk);
+		return 0;
+	}
+
+	if (dp.ndigi == 0) {
+		kfree(ax25->digipeat);
+		ax25->digipeat = NULL;
+	} else {
+		/* Reverse the source SABM's path */
+		memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi));
+	}
+
+	if ((*skb->data & ~AX25_PF) == AX25_SABME) {
+		ax25->modulus = AX25_EMODULUS;
+		ax25->window  = ax25_dev->values[AX25_VALUES_EWINDOW];
+	} else {
+		ax25->modulus = AX25_MODULUS;
+		ax25->window  = ax25_dev->values[AX25_VALUES_WINDOW];
+	}
+
+	ax25_send_control(ax25, AX25_UA, AX25_POLLON, AX25_RESPONSE);
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	if (dama && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE)
+		ax25_dama_on(ax25);
+#endif
+
+	ax25->state = AX25_STATE_3;
+
+	ax25_cb_add(ax25);
+
+	ax25_start_heartbeat(ax25);
+	ax25_start_t3timer(ax25);
+	ax25_start_idletimer(ax25);
+
+	if (sk) {
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_data_ready(sk, skb->len);
+		sock_put(sk);
+	} else {
+free:	/* every 'goto free' above lands here, inside the else branch */
+		kfree_skb(skb);
+	}
+	return 0;
+}
+
+/*
+ *	Receive an AX.25 frame via a SLIP interface.
+ */
+int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
+		  struct packet_type *ptype, struct net_device *orig_dev)
+{
+	skb_orphan(skb);
+
+	/*
+	 * Frames that arrive outside the initial network namespace and
+	 * frames whose low command nibble is not 0, i.e. not KISS data
+	 * frames, are dropped.
+	 */
+	if (!net_eq(dev_net(dev), &init_net) || (*skb->data & 0x0F) != 0) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	skb_pull(skb, AX25_KISS_HEADER_LEN);	/* Remove the KISS byte */
+
+	return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype);
+}
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
new file mode 100644
index 00000000..cf0c47a2
--- /dev/null
+++ b/net/ax25/ax25_ip.c
@@ -0,0 +1,243 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+/*
+ *	IP over AX.25 encapsulation.
+ */
+
+/*
+ *	Shove an AX.25 UI header on an IP packet and handle ARP
+ */
+
+#ifdef CONFIG_INET
+
+int ax25_hard_header(struct sk_buff *skb, struct net_device *dev,
+		     unsigned short type, const void *daddr,
+		     const void *saddr, unsigned len)
+{
+	unsigned char *buff;
+
+	/* they sometimes come back to us... */
+	if (type == ETH_P_AX25)
+		return 0;
+
+	/* header is an AX.25 UI frame from us to them */
+	buff = skb_push(skb, AX25_HEADER_LEN);
+	*buff++ = 0x00;	/* KISS DATA */
+
+	if (daddr != NULL)
+		memcpy(buff, daddr, dev->addr_len);	/* Address specified */
+
+	buff[6] &= ~AX25_CBIT;	/* NOTE(review): with daddr == NULL these bytes were not written above - confirm */
+	buff[6] &= ~AX25_EBIT;
+	buff[6] |= AX25_SSSID_SPARE;
+	buff    += AX25_ADDR_LEN;
+
+	if (saddr != NULL)
+		memcpy(buff, saddr, dev->addr_len);
+	else
+		memcpy(buff, dev->dev_addr, dev->addr_len);	/* default to the device address */
+
+	buff[6] &= ~AX25_CBIT;
+	buff[6] |= AX25_EBIT;	/* unlike the destination, the source gets AX25_EBIT set */
+	buff[6] |= AX25_SSSID_SPARE;
+	buff    += AX25_ADDR_LEN;
+
+	*buff++  = AX25_UI;	/* UI */
+
+	/* Append a suitable AX.25 PID */
+	switch (type) {
+	case ETH_P_IP:
+		*buff++ = AX25_P_IP;
+		break;
+	case ETH_P_ARP:
+		*buff++ = AX25_P_ARP;
+		break;
+	default:
+		printk(KERN_ERR "AX.25: ax25_hard_header - wrong protocol type 0x%2.2x\n", type);
+		*buff++ = 0;	/* the header is still completed, with a PID of 0 */
+		break;
+	}
+
+	if (daddr != NULL)
+		return AX25_HEADER_LEN;
+
+	return -AX25_HEADER_LEN;	/* Unfinished header */
+}
+
+/*
+ *	Finish an unresolved AX.25 header (see ax25_hard_header()) and send
+ *	the frame. Always returns 1 so that the caller does not transmit the
+ *	skb itself; if arp_find() returns non-zero ARP presumably keeps the skb.
+ */
+int ax25_rebuild_header(struct sk_buff *skb)
+{
+	struct sk_buff *ourskb;
+	unsigned char *bp  = skb->data;
+	ax25_route *route;
+	struct net_device *dev = NULL;
+	ax25_address *src, *dst;
+	ax25_digi *digipeat = NULL;
+	ax25_dev *ax25_dev;
+	ax25_cb *ax25;
+	char ip_mode = ' ';
+
+	dst = (ax25_address *)(bp + 1);
+	src = (ax25_address *)(bp + 8);
+
+	if (arp_find(bp + 1, skb))
+		return 1;
+
+	route = ax25_get_route(dst, NULL);
+	if (route) {
+		digipeat = route->digipeat;
+		dev = route->dev;
+		ip_mode = route->ip_mode;
+	}
+
+	if (dev == NULL)
+		dev = skb->dev;
+
+	if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
+		kfree_skb(skb);		/* we report 1, so nobody else frees it */
+		goto put;
+	}
+
+	if (bp[16] == AX25_P_IP) {
+		if (ip_mode == 'V' || (ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) {
+			/*
+			 *	We copy the buffer and release the original,
+			 *	thereby keeping it straight.
+			 *
+			 *	Note: we report 1 back so that the caller will
+			 *	not feed the frame directly to the physical device.
+			 *
+			 *	NB: TCP modifies buffers that are still on a device
+			 *	queue, thus skb_copy() is used instead of skb_clone().
+			 */
+
+			ax25_address src_c;
+			ax25_address dst_c;
+
+			if ((ourskb = skb_copy(skb, GFP_ATOMIC)) == NULL) {
+				kfree_skb(skb);
+				goto put;
+			}
+
+			if (skb->sk != NULL)
+				skb_set_owner_w(ourskb, skb->sk);
+
+			kfree_skb(skb);
+			/* dl9sau: bugfix
+			 * dst and src point into the data of the skb that was
+			 * just freed, so take copies of both addresses from
+			 * ourskb before its header is pulled.
+			 */
+			bp  = ourskb->data;
+			dst_c = *(ax25_address *)(bp + 1);
+			src_c = *(ax25_address *)(bp + 8);
+
+			skb_pull(ourskb, AX25_HEADER_LEN - 1);	/* Keep PID */
+			skb_reset_network_header(ourskb);
+
+			ax25=ax25_send_frame(
+			    ourskb,
+			    ax25_dev->values[AX25_VALUES_PACLEN],
+			    &src_c,
+			    &dst_c, digipeat, dev);
+			if (ax25) {
+				ax25_cb_put(ax25);
+			}
+			goto put;
+		}
+	}
+
+	bp[7]  &= ~AX25_CBIT;
+	bp[7]  &= ~AX25_EBIT;
+	bp[7]  |= AX25_SSSID_SPARE;
+
+	bp[14] &= ~AX25_CBIT;
+	bp[14] |= AX25_EBIT;
+	bp[14] |= AX25_SSSID_SPARE;
+
+	skb_pull(skb, AX25_KISS_HEADER_LEN);
+
+	if (digipeat != NULL) {
+		if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) {
+			kfree_skb(skb);
+			goto put;
+		}
+
+		skb = ourskb;
+	}
+
+	ax25_queue_xmit(skb, dev);
+
+put:
+	if (route)
+		ax25_put_route(route);
+
+	return 1;
+}
+
+#else	/* INET */
+
+int ax25_hard_header(struct sk_buff *skb, struct net_device *dev,
+		     unsigned short type, const void *daddr,
+		     const void *saddr, unsigned len)
+{
+	return -AX25_HEADER_LEN;	/* stub without CONFIG_INET: nothing is written to skb */
+}
+
+int ax25_rebuild_header(struct sk_buff *skb)
+{
+	return 1;	/* stub without CONFIG_INET: skb is left untouched */
+}
+
+#endif
+
+const struct header_ops ax25_header_ops = {
+	.create = ax25_hard_header,	/* writes the KISS byte and the AX.25 UI header */
+	.rebuild = ax25_rebuild_header,	/* always returns 1 */
+};
+
+EXPORT_SYMBOL(ax25_hard_header);
+EXPORT_SYMBOL(ax25_rebuild_header);
+EXPORT_SYMBOL(ax25_header_ops);
+
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
new file mode 100644
index 00000000..37507d80
--- /dev/null
+++ b/net/ax25/ax25_out.c
@@ -0,0 +1,399 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/spinlock.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * Taken by ax25_output() around the allocation of each fragment skb and
+ * the charging of that fragment to the sending socket.
+ */
+static DEFINE_SPINLOCK(ax25_frag_lock);
+
+/*
+ * Hand skb to the AX.25 connection src -> dest (optionally via digi) on
+ * dev.  If no such connection exists a control block is created, link
+ * establishment is started according to the device's protocol setting and
+ * the frame is queued on the new connection.
+ *
+ * A paclen of zero selects the device's AX25_VALUES_PACLEN value.
+ *
+ * Returns the control block; the caller is expected to drop it again with
+ * ax25_cb_put().  Returns NULL when dev has no AX.25 device data or an
+ * allocation fails.
+ */
+ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev)
+{
+	ax25_dev *ax25_dev;
+	ax25_cb *ax25;
+
+	/* Zero means "use the default packet length of the device". */
+	if (paclen == 0) {
+		ax25_dev = ax25_dev_ax25dev(dev);
+		if (ax25_dev == NULL)
+			return NULL;
+
+		paclen = ax25_dev->values[AX25_VALUES_PACLEN];
+	}
+
+	/* An existing connection simply gets the frame queued. */
+	ax25 = ax25_find_cb(src, dest, digi, dev);
+	if (ax25 != NULL) {
+		ax25_output(ax25, paclen, skb);
+		return ax25;
+	}
+
+	/* Otherwise build a fresh control block for this path. */
+	ax25_dev = ax25_dev_ax25dev(dev);
+	if (ax25_dev == NULL)
+		return NULL;
+
+	ax25 = ax25_create_cb();
+	if (ax25 == NULL)
+		return NULL;
+
+	ax25_fillin_cb(ax25, ax25_dev);
+
+	ax25->source_addr = *src;
+	ax25->dest_addr   = *dest;
+
+	if (digi != NULL) {
+		/* The connection keeps its own copy of the digipeater path. */
+		ax25->digipeat = kmemdup(digi, sizeof(*digi), GFP_ATOMIC);
+		if (ax25->digipeat == NULL) {
+			ax25_cb_put(ax25);
+			return NULL;
+		}
+	}
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_establish_data_link(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		if (ax25_dev->dama.slave)
+			ax25_ds_establish_data_link(ax25);
+		else
+			ax25_std_establish_data_link(ax25);
+		break;
+#endif
+	}
+
+	/*
+	 * The state machine owns one reference; take a second one for the
+	 * caller so that both the "found" and the "created" case are put
+	 * back the same way.
+	 */
+	ax25_cb_hold(ax25);
+
+	ax25_cb_add(ax25);
+
+	ax25->state = AX25_STATE_1;
+
+	ax25_start_heartbeat(ax25);
+
+	ax25_output(ax25, paclen, skb);
+
+	return ax25;
+}
+
+EXPORT_SYMBOL(ax25_send_frame);
+
+/*
+ *	All outgoing AX.25 I frames pass via this routine. Therefore this is
+ *	where the fragmentation of frames takes place.
+ *
+ *	skb starts with the PID byte.  If its payload exceeds paclen it is
+ *	split: AX25_P_TEXT frames are cut into plain pieces that each get
+ *	their own AX25_P_TEXT PID, everything else is sent as AX25_P_SEGMENT
+ *	fragments carrying a one byte countdown (with AX25_SEG_FIRST set on
+ *	the first piece), which costs two bytes of paclen per fragment.
+ *
+ *	The skb is always consumed: it is either queued on the connection's
+ *	write queue or freed here.  A paclen below 16 is rejected with a
+ *	one-time warning.
+ */
+void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb)
+{
+	struct sk_buff *skbn;
+	unsigned char *p;
+	int frontlen, len, fragno, ka9qfrag, first = 1;
+
+	if (paclen < 16) {
+		WARN_ON_ONCE(1);
+		kfree_skb(skb);
+		return;
+	}
+
+	if ((skb->len - 1) > paclen) {
+		if (*skb->data == AX25_P_TEXT) {
+			skb_pull(skb, 1); /* skip PID */
+			ka9qfrag = 0;
+		} else {
+			paclen -= 2;	/* Allow for fragment control info */
+			ka9qfrag = 1;
+		}
+
+		/* Number of fragments that follow the first one. */
+		fragno = skb->len / paclen;
+		if (skb->len % paclen == 0) fragno--;
+
+		frontlen = skb_headroom(skb);	/* Address space + CTRL */
+
+		while (skb->len > 0) {
+			spin_lock_bh(&ax25_frag_lock);
+			if ((skbn = alloc_skb(paclen + 2 + frontlen, GFP_ATOMIC)) == NULL) {
+				spin_unlock_bh(&ax25_frag_lock);
+				printk(KERN_CRIT "AX.25: ax25_output - out of memory\n");
+				/*
+				 * We own the original frame; drop what is
+				 * left of it instead of leaking it.
+				 */
+				kfree_skb(skb);
+				return;
+			}
+
+			if (skb->sk != NULL)
+				skb_set_owner_w(skbn, skb->sk);
+
+			spin_unlock_bh(&ax25_frag_lock);
+
+			len = (paclen > skb->len) ? skb->len : paclen;
+
+			if (ka9qfrag == 1) {
+				skb_reserve(skbn, frontlen + 2);
+				skb_set_network_header(skbn,
+						      skb_network_offset(skb));
+				skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+				p = skb_push(skbn, 2);
+
+				*p++ = AX25_P_SEGMENT;
+
+				*p = fragno--;
+				if (first) {
+					*p |= AX25_SEG_FIRST;
+					first = 0;
+				}
+			} else {
+				skb_reserve(skbn, frontlen + 1);
+				skb_set_network_header(skbn,
+						      skb_network_offset(skb));
+				skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+				p = skb_push(skbn, 1);
+				*p = AX25_P_TEXT;
+			}
+
+			skb_pull(skb, len);
+			skb_queue_tail(&ax25->write_queue, skbn); /* Throw it on the queue */
+		}
+
+		/* Everything has been copied into fragments. */
+		kfree_skb(skb);
+	} else {
+		skb_queue_tail(&ax25->write_queue, skb);	  /* Throw it on the queue */
+	}
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_kick(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	/*
+	 * A DAMA slave is _required_ to work as normal AX.25L2V2
+	 * if no DAMA master is available.
+	 */
+	case AX25_PROTO_DAMA_SLAVE:
+		if (!ax25->ax25_dev->dama.slave) ax25_kick(ax25);
+		break;
+#endif
+	}
+}
+
+/*
+ *  Prepend the I frame control field to skb (one byte for the standard
+ *  modulus, two bytes otherwise), filled in from V(S), V(R) and the poll
+ *  bit, restart the idle timer and pass the frame on as a command.
+ *  A NULL skb is ignored.
+ */
+static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit)
+{
+	unsigned char *ctrl;
+
+	if (!skb)
+		return;
+
+	skb_reset_network_header(skb);
+
+	if (ax25->modulus != AX25_MODULUS) {
+		/* Extended control field: N(S) in byte 0, P and N(R) in byte 1. */
+		ctrl = skb_push(skb, 2);
+
+		ctrl[0] = AX25_I | (ax25->vs << 1);
+		ctrl[1] = ((poll_bit) ? AX25_EPF : 0) | (ax25->vr << 1);
+	} else {
+		/* Single byte control field holding N(R), P and N(S). */
+		ctrl = skb_push(skb, 1);
+
+		ctrl[0] = AX25_I | ((poll_bit) ? AX25_PF : 0) |
+			  (ax25->vr << 5) | (ax25->vs << 1);
+	}
+
+	ax25_start_idletimer(ax25);
+
+	ax25_transmit_buffer(ax25, skb, AX25_COMMAND);
+}
+
+/*
+ * Send queued I frames for a connection that is in state 3 or 4.
+ *
+ * Nothing is sent while the peer is flagged busy, while the write queue
+ * is empty or when the send window is already used up.  Otherwise frames
+ * are taken from write_queue one by one; a clone of each is transmitted
+ * and the original is moved to ack_queue.  The loop ends when the queue
+ * runs dry, when a clone cannot be allocated or when V(S) reaches the end
+ * of the window.  Afterwards the ACK-pending condition is cleared and, if
+ * T1 is not running yet, T3 is stopped and T1 is recalculated and started.
+ */
+void ax25_kick(ax25_cb *ax25)
+{
+	struct sk_buff *skb, *skbn;
+	int last = 1;
+	unsigned short start, end, next;
+
+	if (ax25->state != AX25_STATE_3 && ax25->state != AX25_STATE_4)
+		return;
+
+	if (ax25->condition & AX25_COND_PEER_RX_BUSY)
+		return;
+
+	if (skb_peek(&ax25->write_queue) == NULL)
+		return;
+
+	/*
+	 * With nothing waiting for an acknowledgement sending restarts at
+	 * V(A); otherwise it carries on from V(S).  The window ends at
+	 * V(A) + window, taken modulo the sequence modulus.
+	 */
+	start = (skb_peek(&ax25->ack_queue) == NULL) ? ax25->va : ax25->vs;
+	end   = (ax25->va + ax25->window) % ax25->modulus;
+
+	if (start == end)
+		return;
+
+	/*
+	 * Transmit data until either we're out of data to send or
+	 * the window is full. Send a poll on the final I frame if
+	 * the window is filled.
+	 */
+
+	/*
+	 * Dequeue the frame and copy it.
+	 * Check for race with ax25_clear_queues().
+	 */
+	skb  = skb_dequeue(&ax25->write_queue);
+	if (!skb)
+		return;
+
+	ax25->vs = start;
+
+	do {
+		/* On allocation failure put the frame back for a later try. */
+		if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+			skb_queue_head(&ax25->write_queue, skb);
+			break;
+		}
+
+		if (skb->sk != NULL)
+			skb_set_owner_w(skbn, skb->sk);
+
+		next = (ax25->vs + 1) % ax25->modulus;
+		last = (next == end);
+
+		/*
+		 * Transmit the frame copy.
+		 * bke 960114: do not set the Poll bit on the last frame
+		 * in DAMA mode.
+		 */
+		switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
+		case AX25_PROTO_STD_SIMPLEX:
+		case AX25_PROTO_STD_DUPLEX:
+			ax25_send_iframe(ax25, skbn, (last) ? AX25_POLLON : AX25_POLLOFF);
+			break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+		case AX25_PROTO_DAMA_SLAVE:
+			ax25_send_iframe(ax25, skbn, AX25_POLLOFF);
+			break;
+#endif
+		}
+
+		ax25->vs = next;
+
+		/*
+		 * Requeue the original data frame.
+		 */
+		skb_queue_tail(&ax25->ack_queue, skb);
+
+	} while (!last && (skb = skb_dequeue(&ax25->write_queue)) != NULL);
+
+	ax25->condition &= ~AX25_COND_ACK_PENDING;
+
+	if (!ax25_t1timer_running(ax25)) {
+		ax25_stop_t3timer(ax25);
+		ax25_calculate_t1(ax25);
+		ax25_start_t1timer(ax25);
+	}
+}
+
+/*
+ * Prepend the AX.25 address field (source, destination and any
+ * digipeaters of the connection) to skb and queue it on the connection's
+ * device.  type is passed through to ax25_addr_build().
+ *
+ * The skb is consumed on every path.  If the connection has no device the
+ * frame is dropped and the connection is disconnected with ENETUNREACH;
+ * if more headroom is needed and cannot be allocated the frame is dropped
+ * as well.
+ */
+void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
+{
+	struct sk_buff *skbn;
+	unsigned char *ptr;
+	int headroom;
+
+	if (ax25->ax25_dev == NULL) {
+		/* Nobody else will free this frame, so do it here. */
+		kfree_skb(skb);
+		ax25_disconnect(ax25, ENETUNREACH);
+		return;
+	}
+
+	headroom = ax25_addr_size(ax25->digipeat);
+
+	if (skb_headroom(skb) < headroom) {
+		if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) {
+			printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n");
+			kfree_skb(skb);
+			return;
+		}
+
+		/* Keep the copy charged to the same socket as the original. */
+		if (skb->sk != NULL)
+			skb_set_owner_w(skbn, skb->sk);
+
+		kfree_skb(skb);
+		skb = skbn;
+	}
+
+	ptr = skb_push(skb, headroom);
+
+	ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus);
+
+	ax25_queue_xmit(skb, ax25->ax25_dev->dev);
+}
+
+/*
+ *	Last stop before dev_queue_xmit(): set the skb protocol via
+ *	ax25_type_trans() using the forwarding device that ax25_fwd_dev()
+ *	returns for dev, then put the KISS control byte (0x00) in front.
+ */
+void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device *fwd = ax25_fwd_dev(dev);
+
+	skb->protocol = ax25_type_trans(skb, fwd);
+
+	*skb_push(skb, 1) = 0x00;	/* KISS */
+
+	dev_queue_xmit(skb);
+}
+
+/*
+ * Process a received N(R).
+ *
+ * If nr equals V(S) the frames are acknowledged, the round trip time is
+ * updated, T1 is stopped and T3 started.  If nr differs from V(S) and from
+ * V(A), the frames up to nr are acknowledged and T1 is recalculated and
+ * restarted.  Returns 1 in both of these cases and 0 when nr equals V(A)
+ * (but not V(S)), in which case nothing is done.
+ */
+int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr)
+{
+	if (ax25->vs == nr) {
+		ax25_frames_acked(ax25, nr);
+		ax25_calculate_rtt(ax25);
+		ax25_stop_t1timer(ax25);
+		ax25_start_t3timer(ax25);
+		return 1;
+	}
+
+	if (ax25->va == nr)
+		return 0;
+
+	ax25_frames_acked(ax25, nr);
+	ax25_calculate_t1(ax25);
+	ax25_start_t1timer(ax25);
+	return 1;
+}
+
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
new file mode 100644
index 00000000..a1690845
--- /dev/null
+++ b/net/ax25/ax25_route.c
@@ -0,0 +1,505 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+
+/*
+ * Head of the singly linked list of AX.25 routes (chained through ->next)
+ * and the rwlock the functions in this file take around walking or
+ * changing it.
+ */
+static ax25_route *ax25_route_list;
+static DEFINE_RWLOCK(ax25_route_lock);
+
+/*
+ * Remove every route that uses dev from the routing list.
+ *
+ * Each unlinked entry is released through ax25_put_route(), the same way
+ * ax25_rt_del() does it, rather than being freed unconditionally.  A route
+ * that ax25_get_route() handed out with ax25_hold_route() and that is
+ * still in use is therefore not freed underneath its user.
+ */
+void ax25_rt_device_down(struct net_device *dev)
+{
+	ax25_route **link, *rt;
+
+	write_lock_bh(&ax25_route_lock);
+
+	/* link always addresses the pointer that leads to the current entry. */
+	link = &ax25_route_list;
+	while ((rt = *link) != NULL) {
+		if (rt->dev == dev) {
+			/* Unlink first, then give up the list's reference. */
+			*link = rt->next;
+			ax25_put_route(rt);
+		} else {
+			link = &rt->next;
+		}
+	}
+
+	write_unlock_bh(&ax25_route_lock);
+}
+
+/*
+ * Allocate a digipeater list and fill it from the digi_count/digi_addr
+ * fields of a route request.  Returns NULL if the allocation fails.
+ */
+static ax25_digi *ax25_rt_digi_from_request(const struct ax25_routes_struct *route)
+{
+	ax25_digi *digi;
+	int i;
+
+	digi = kmalloc(sizeof(ax25_digi), GFP_ATOMIC);
+	if (digi == NULL)
+		return NULL;
+
+	digi->lastrepeat = -1;
+	digi->ndigi      = route->digi_count;
+	for (i = 0; i < route->digi_count; i++) {
+		digi->repeated[i] = 0;
+		digi->calls[i]    = route->digi_addr[i];
+	}
+
+	return digi;
+}
+
+/*
+ * Add a route, or replace the digipeater path of an existing route with
+ * the same destination callsign and device.
+ *
+ * Returns 0 on success, -EINVAL if the port address does not resolve to an
+ * AX.25 device or digi_count exceeds AX25_MAX_DIGIS, and -ENOMEM if an
+ * allocation fails.  When updating an existing route the old digipeater
+ * path is released before the new one is allocated, so after -ENOMEM that
+ * route is left without a digipeater path.
+ */
+static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
+{
+	ax25_route *ax25_rt;
+	ax25_dev *ax25_dev;
+
+	if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
+		return -EINVAL;
+	if (route->digi_count > AX25_MAX_DIGIS)
+		return -EINVAL;
+
+	write_lock_bh(&ax25_route_lock);
+
+	/* First look for an entry to update in place. */
+	ax25_rt = ax25_route_list;
+	while (ax25_rt != NULL) {
+		if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 &&
+			    ax25_rt->dev == ax25_dev->dev) {
+			kfree(ax25_rt->digipeat);
+			ax25_rt->digipeat = NULL;
+			if (route->digi_count != 0) {
+				ax25_rt->digipeat = ax25_rt_digi_from_request(route);
+				if (ax25_rt->digipeat == NULL) {
+					write_unlock_bh(&ax25_route_lock);
+					return -ENOMEM;
+				}
+			}
+			write_unlock_bh(&ax25_route_lock);
+			return 0;
+		}
+		ax25_rt = ax25_rt->next;
+	}
+
+	/* No match: create a new entry and put it at the head of the list. */
+	if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) {
+		write_unlock_bh(&ax25_route_lock);
+		return -ENOMEM;
+	}
+
+	atomic_set(&ax25_rt->refcount, 1);
+	ax25_rt->callsign     = route->dest_addr;
+	ax25_rt->dev          = ax25_dev->dev;
+	ax25_rt->digipeat     = NULL;
+	ax25_rt->ip_mode      = ' ';
+	if (route->digi_count != 0) {
+		ax25_rt->digipeat = ax25_rt_digi_from_request(route);
+		if (ax25_rt->digipeat == NULL) {
+			write_unlock_bh(&ax25_route_lock);
+			kfree(ax25_rt);
+			return -ENOMEM;
+		}
+	}
+	ax25_rt->next   = ax25_route_list;
+	ax25_route_list = ax25_rt;
+	write_unlock_bh(&ax25_route_lock);
+
+	return 0;
+}
+
+/*
+ * Free a route entry together with its digipeater list.  The entry is
+ * freed unconditionally; no reference count is looked at here.
+ */
+void __ax25_put_route(ax25_route *ax25_rt)
+{
+	kfree(ax25_rt->digipeat);
+	kfree(ax25_rt);
+}
+
+/*
+ * Delete every route whose device and destination callsign match the
+ * request.  Matching entries are unlinked from the list and released with
+ * ax25_put_route().
+ *
+ * Returns -EINVAL if the port address does not resolve to an AX.25
+ * device, otherwise 0 (also when nothing matched).
+ */
+static int ax25_rt_del(struct ax25_routes_struct *route)
+{
+	ax25_route **link, *rt;
+	ax25_dev *ax25_dev;
+
+	ax25_dev = ax25_addr_ax25dev(&route->port_addr);
+	if (ax25_dev == NULL)
+		return -EINVAL;
+
+	write_lock_bh(&ax25_route_lock);
+
+	/* link addresses whichever pointer currently leads to rt. */
+	for (link = &ax25_route_list; (rt = *link) != NULL; ) {
+		if (rt->dev != ax25_dev->dev ||
+		    ax25cmp(&route->dest_addr, &rt->callsign) != 0) {
+			link = &rt->next;
+			continue;
+		}
+
+		*link = rt->next;
+		ax25_put_route(rt);
+	}
+
+	write_unlock_bh(&ax25_route_lock);
+
+	return 0;
+}
+
+/*
+ * Apply a route option to every route matching the request's device and
+ * destination callsign.  The only command handled is AX25_SET_RT_IPMODE
+ * with an argument of ' ', 'D' or 'V'.
+ *
+ * Returns -EINVAL if the port address does not resolve to an AX.25 device
+ * or if a matching route is found but the command or argument is not one
+ * of the above; otherwise 0.  The request is only validated once a
+ * matching route has been found.
+ */
+static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option)
+{
+	ax25_route *rt;
+	ax25_dev *ax25_dev;
+	int err = 0;
+
+	ax25_dev = ax25_addr_ax25dev(&rt_option->port_addr);
+	if (ax25_dev == NULL)
+		return -EINVAL;
+
+	write_lock_bh(&ax25_route_lock);
+
+	for (rt = ax25_route_list; rt != NULL; rt = rt->next) {
+		if (rt->dev != ax25_dev->dev ||
+		    ax25cmp(&rt_option->dest_addr, &rt->callsign) != 0)
+			continue;
+
+		if (rt_option->cmd != AX25_SET_RT_IPMODE) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (rt_option->arg != ' ' &&
+		    rt_option->arg != 'D' &&
+		    rt_option->arg != 'V') {
+			err = -EINVAL;
+			break;
+		}
+
+		rt->ip_mode = rt_option->arg;
+	}
+
+	write_unlock_bh(&ax25_route_lock);
+	return err;
+}
+
+/*
+ * Routing ioctl entry point.  Copies the request structure from user
+ * space and dispatches SIOCADDRT, SIOCDELRT and SIOCAX25OPTRT to
+ * ax25_rt_add(), ax25_rt_del() and ax25_rt_opt().
+ *
+ * Returns the handler's result, -EFAULT if the copy from user space
+ * fails, or -EINVAL for any other command.
+ */
+int ax25_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+	if (cmd == SIOCAX25OPTRT) {
+		struct ax25_route_opt_struct rt_option;
+
+		if (copy_from_user(&rt_option, arg, sizeof(rt_option)))
+			return -EFAULT;
+		return ax25_rt_opt(&rt_option);
+	}
+
+	if (cmd == SIOCADDRT || cmd == SIOCDELRT) {
+		struct ax25_routes_struct route;
+
+		if (copy_from_user(&route, arg, sizeof(route)))
+			return -EFAULT;
+		if (cmd == SIOCADDRT)
+			return ax25_rt_add(&route);
+		return ax25_rt_del(&route);
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * seq_file start handler: takes the route lock for reading (it is dropped
+ * again in ax25_rt_seq_stop()) and returns the header token for position
+ * 0, the route at 1-based position *pos otherwise, or NULL when *pos lies
+ * beyond the end of the list.
+ */
+static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(ax25_route_lock)
+{
+	struct ax25_route *rt;
+	loff_t skip;
+
+	read_lock(&ax25_route_lock);
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	/* Position 1 is the list head, so step forward *pos - 1 times. */
+	skip = *pos - 1;
+	for (rt = ax25_route_list; rt != NULL && skip != 0; rt = rt->next)
+		skip--;
+
+	return rt;
+}
+
+/*
+ * seq_file next handler: advances *pos and returns the list head after
+ * the header token, or the successor of the current route otherwise.
+ */
+static void *ax25_rt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ax25_route *cur = v;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return ax25_route_list;
+
+	return cur->next;
+}
+
+/* seq_file stop handler: drops the read lock taken in ax25_rt_seq_start(). */
+static void ax25_rt_seq_stop(struct seq_file *seq, void *v)
+	__releases(ax25_route_lock)
+{
+	read_unlock(&ax25_route_lock);
+}
+
+/*
+ * seq_file show handler: prints the column header for the start token,
+ * otherwise one line per route with the callsign ("default" for the null
+ * address), the device name ("???" if the route has no device), the IP
+ * mode and the digipeater callsigns.  Always returns 0.
+ */
+static int ax25_rt_seq_show(struct seq_file *seq, void *v)
+{
+	struct ax25_route *rt = v;
+	const char *name, *mode;
+	char buf[11];
+	int n;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "callsign  dev  mode digipeaters\n");
+		return 0;
+	}
+
+	if (ax25cmp(&rt->callsign, &null_ax25_address) != 0)
+		name = ax2asc(buf, &rt->callsign);
+	else
+		name = "default";
+
+	seq_printf(seq, "%-9s %-4s", name, rt->dev ? rt->dev->name : "???");
+
+	switch (rt->ip_mode) {
+	case 'V':
+		mode = "   vc";
+		break;
+	case 'D':
+		mode = "   dg";
+		break;
+	default:
+		mode = "    *";
+		break;
+	}
+	seq_puts(seq, mode);
+
+	if (rt->digipeat != NULL) {
+		for (n = 0; n < rt->digipeat->ndigi; n++)
+			seq_printf(seq, " %s",
+				   ax2asc(buf, &rt->digipeat->calls[n]));
+	}
+
+	seq_puts(seq, "\n");
+	return 0;
+}
+
+/* seq_file iterator over the route list, built from the handlers above. */
+static const struct seq_operations ax25_rt_seqops = {
+	.start = ax25_rt_seq_start,
+	.next = ax25_rt_seq_next,
+	.stop = ax25_rt_seq_stop,
+	.show = ax25_rt_seq_show,
+};
+
+/* Open handler: attaches the route list iterator to the file via seq_open(). */
+static int ax25_rt_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ax25_rt_seqops);
+}
+
+/*
+ * File operations for the route listing (only built with CONFIG_PROC_FS):
+ * open goes through ax25_rt_info_open(), everything else uses the generic
+ * seq_file helpers.
+ */
+const struct file_operations ax25_route_fops = {
+	.owner = THIS_MODULE,
+	.open = ax25_rt_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#endif
+
+/*
+ *	Look up the route to addr.
+ *
+ *	With dev set only routes on that device are considered; with dev
+ *	NULL any route that has a device qualifies.  A route whose callsign
+ *	equals addr wins over a default route (null callsign); within each
+ *	kind the last match in the list is used.
+ *
+ *	The returned route has been passed through ax25_hold_route() and
+ *	must be released with ax25_put_route().  Returns NULL when neither
+ *	a specific nor a default route exists.
+ */
+ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
+{
+	ax25_route *specific = NULL;
+	ax25_route *fallback = NULL;
+	ax25_route *rt, *chosen;
+
+	read_lock(&ax25_route_lock);
+
+	for (rt = ax25_route_list; rt != NULL; rt = rt->next) {
+		/* Skip routes that do not fit the requested device. */
+		if (dev == NULL ? rt->dev == NULL : rt->dev != dev)
+			continue;
+
+		if (ax25cmp(&rt->callsign, addr) == 0)
+			specific = rt;
+		if (ax25cmp(&rt->callsign, &null_ax25_address) == 0)
+			fallback = rt;
+	}
+
+	chosen = (specific != NULL) ? specific : fallback;
+
+	if (chosen != NULL)
+		ax25_hold_route(chosen);
+
+	read_unlock(&ax25_route_lock);
+
+	return chosen;
+}
+
+/*
+ *	Truncate a digipeater path at the target: if addr itself appears in
+ *	the list, the path is cut just before it, so that a target reached
+ *	through a default route is not used as its own digipeater.  The
+ *	path is left unchanged when addr is not part of it.
+ */
+static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
+{
+	int keep = 0;
+
+	while (keep < digipeat->ndigi &&
+	       ax25cmp(addr, &digipeat->calls[keep]) != 0)
+		keep++;
+
+	digipeat->ndigi = keep;
+}
+
+
+/*
+ *	Bind a connection to the interface that routes to addr.
+ *
+ *	Sets ax25->ax25_dev from the route's device and picks the source
+ *	address: the callsign associated with the caller's effective uid if
+ *	there is one, otherwise the device address (refused with -EPERM when
+ *	ax25_uid_policy is set and the caller lacks CAP_NET_BIND_SERVICE).
+ *	A digipeater path on the route is copied and truncated at addr.
+ *	Finally SOCK_ZAPPED is cleared on the attached socket, if any.
+ *
+ *	Returns 0, -EHOSTUNREACH (no route or no AX.25 device), -EPERM or
+ *	-ENOMEM.
+ */
+int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
+{
+	ax25_uid_assoc *assoc;
+	ax25_route *rt;
+	int rc = 0;
+
+	rt = ax25_get_route(addr, NULL);
+	if (rt == NULL)
+		return -EHOSTUNREACH;
+
+	ax25->ax25_dev = ax25_dev_ax25dev(rt->dev);
+	if (ax25->ax25_dev == NULL) {
+		rc = -EHOSTUNREACH;
+		goto out_put;
+	}
+
+	assoc = ax25_findbyuid(current_euid());
+	if (assoc != NULL) {
+		ax25->source_addr = assoc->call;
+		ax25_uid_put(assoc);
+	} else if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
+		rc = -EPERM;
+		goto out_put;
+	} else {
+		ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
+	}
+
+	if (rt->digipeat != NULL) {
+		ax25->digipeat = kmemdup(rt->digipeat, sizeof(ax25_digi),
+					 GFP_ATOMIC);
+		if (ax25->digipeat == NULL) {
+			rc = -ENOMEM;
+			goto out_put;
+		}
+		ax25_adjust_path(addr, ax25->digipeat);
+	}
+
+	if (ax25->sk != NULL) {
+		bh_lock_sock(ax25->sk);
+		sock_reset_flag(ax25->sk, SOCK_ZAPPED);
+		bh_unlock_sock(ax25->sk);
+	}
+
+out_put:
+	ax25_put_route(rt);
+
+	return rc;
+}
+
+/*
+ * Make room in front of skb for the digipeaters in digi and write the
+ * address field for src, dest and digi there with ax25_addr_build().
+ *
+ * If the headroom is too small the skb is replaced by a reallocated copy
+ * (charged to the same socket) and the original is freed.  Returns the skb
+ * to continue with, or NULL if that reallocation fails; in the failure
+ * case the skb passed in is not freed here.
+ */
+struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
+	ax25_address *dest, ax25_digi *digi)
+{
+	int extra = digi->ndigi * AX25_ADDR_LEN;
+
+	if (skb_headroom(skb) < extra) {
+		struct sk_buff *bigger = skb_realloc_headroom(skb, extra);
+
+		if (bigger == NULL) {
+			printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n");
+			return NULL;
+		}
+
+		if (skb->sk != NULL)
+			skb_set_owner_w(bigger, skb->sk);
+
+		kfree_skb(skb);
+		skb = bigger;
+	}
+
+	ax25_addr_build(skb_push(skb, extra), src, dest, digi,
+			AX25_COMMAND, AX25_MODULUS);
+
+	return skb;
+}
+
+/*
+ *	Free all memory associated with routing structures.
+ *
+ *	The list is detached from ax25_route_list while the lock is held, so
+ *	the head is read under the lock and no stale pointer is left behind
+ *	in the global once the entries have been freed.
+ */
+void __exit ax25_rt_free(void)
+{
+	ax25_route *s, *ax25_rt;
+
+	write_lock_bh(&ax25_route_lock);
+	ax25_rt = ax25_route_list;
+	ax25_route_list = NULL;
+	while (ax25_rt != NULL) {
+		s       = ax25_rt;
+		ax25_rt = ax25_rt->next;
+
+		kfree(s->digipeat);
+		kfree(s);
+	}
+	write_unlock_bh(&ax25_route_lock);
+}
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
new file mode 100644
index 00000000..a8eef88d
--- /dev/null
+++ b/net/ax25/ax25_std_in.c
@@ -0,0 +1,447 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
+ *
+ * Most of this code is based on the SDL diagrams published in the 7th ARRL
+ * Computer Networking Conference papers. The diagrams have mistakes in them,
+ * but are mostly correct. Before you modify the code could you read the SDL
+ * diagrams as the code is not obvious and probably very easy to break.
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ *	State machine for state 1, Awaiting Connection State.
+ *	The handling of the timer(s) is in file ax25_std_timer.c.
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	Handles one decoded frame of type frametype with poll/final bit pf.
+ *	skb and type are not used in this state.  Always returns 0.
+ */
+static int ax25_std_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+	switch (frametype) {
+	/* Peer asks for a connection too: adopt its modulus and answer UA. */
+	case AX25_SABM:
+		ax25->modulus = AX25_MODULUS;
+		ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		break;
+
+	case AX25_SABME:
+		ax25->modulus = AX25_EMODULUS;
+		ax25->window  = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE);
+		break;
+
+	/* UA with the final bit set: the link is up, enter state 3. */
+	case AX25_UA:
+		if (pf) {
+			ax25_calculate_rtt(ax25);
+			ax25_stop_t1timer(ax25);
+			ax25_start_t3timer(ax25);
+			ax25_start_idletimer(ax25);
+			ax25->vs      = 0;
+			ax25->va      = 0;
+			ax25->vr      = 0;
+			ax25->state   = AX25_STATE_3;
+			ax25->n2count = 0;
+			if (ax25->sk != NULL) {
+				bh_lock_sock(ax25->sk);
+				ax25->sk->sk_state = TCP_ESTABLISHED;
+				/* For WAIT_SABM connections we will produce an accept ready socket here */
+				if (!sock_flag(ax25->sk, SOCK_DEAD))
+					ax25->sk->sk_state_change(ax25->sk);
+				bh_unlock_sock(ax25->sk);
+			}
+		}
+		break;
+
+	/*
+	 * DM with the final bit set: with the standard modulus the
+	 * connection is refused; with the extended modulus fall back to
+	 * the standard modulus and window instead of disconnecting.
+	 */
+	case AX25_DM:
+		if (pf) {
+			if (ax25->modulus == AX25_MODULUS) {
+				ax25_disconnect(ax25, ECONNREFUSED);
+			} else {
+				ax25->modulus = AX25_MODULUS;
+				ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+			}
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ *	State machine for state 2, Awaiting Release State.
+ *	The handling of the timer(s) is in file ax25_std_timer.c
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	Handles one decoded frame of type frametype with poll/final bit pf.
+ *	skb and type are not used in this state.  Always returns 0.
+ */
+static int ax25_std_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type)
+{
+	switch (frametype) {
+	/* Connection requests are answered with DM while releasing. */
+	case AX25_SABM:
+	case AX25_SABME:
+		ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_disconnect(ax25, 0);
+		break;
+
+	/* DM or UA with the final bit set completes the release. */
+	case AX25_DM:
+	case AX25_UA:
+		if (pf)
+			ax25_disconnect(ax25, 0);
+		break;
+
+	/* Polled I and supervisory frames get a DM with the poll bit on. */
+	case AX25_I:
+	case AX25_REJ:
+	case AX25_RNR:
+	case AX25_RR:
+		if (pf) ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE);
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ *	State machine for state 3, Connected State.
+ *	The handling of the timer(s) is in file ax25_std_timer.c
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	Handles one decoded frame: frametype, its N(S)/N(R) numbers ns and
+ *	nr, the poll/final bit pf and the command/response indication type.
+ *	Returns the result of ax25_rx_iframe() when an in-sequence I frame
+ *	was passed on, and 0 in every other case.
+ */
+static int ax25_std_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type)
+{
+	int queued = 0;
+
+	switch (frametype) {
+	/*
+	 * The peer resets the link: adopt its modulus, answer UA, restart
+	 * the timers, zero the sequence variables and requeue the frames.
+	 */
+	case AX25_SABM:
+	case AX25_SABME:
+		if (frametype == AX25_SABM) {
+			ax25->modulus = AX25_MODULUS;
+			ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		} else {
+			ax25->modulus = AX25_EMODULUS;
+			ax25->window  = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+		}
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_stop_t1timer(ax25);
+		ax25_stop_t2timer(ax25);
+		ax25_start_t3timer(ax25);
+		ax25_start_idletimer(ax25);
+		ax25->condition = 0x00;
+		ax25->vs        = 0;
+		ax25->va        = 0;
+		ax25->vr        = 0;
+		ax25_requeue_frames(ax25);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_disconnect(ax25, 0);
+		break;
+
+	case AX25_DM:
+		ax25_disconnect(ax25, ECONNRESET);
+		break;
+
+	/*
+	 * RR clears and RNR sets the peer-busy condition.  A polled command
+	 * is answered, then N(R) is processed; an invalid N(R) restarts
+	 * link establishment and moves the connection to state 1.
+	 */
+	case AX25_RR:
+	case AX25_RNR:
+		if (frametype == AX25_RR)
+			ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+		else
+			ax25->condition |= AX25_COND_PEER_RX_BUSY;
+		if (type == AX25_COMMAND && pf)
+			ax25_std_enquiry_response(ax25);
+		if (ax25_validate_nr(ax25, nr)) {
+			ax25_check_iframes_acked(ax25, nr);
+		} else {
+			ax25_std_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+		}
+		break;
+
+	/*
+	 * REJ: acknowledge up to N(R) and requeue the remaining frames so
+	 * that they are sent again.
+	 */
+	case AX25_REJ:
+		ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+		if (type == AX25_COMMAND && pf)
+			ax25_std_enquiry_response(ax25);
+		if (ax25_validate_nr(ax25, nr)) {
+			ax25_frames_acked(ax25, nr);
+			ax25_calculate_rtt(ax25);
+			ax25_stop_t1timer(ax25);
+			ax25_start_t3timer(ax25);
+			ax25_requeue_frames(ax25);
+		} else {
+			ax25_std_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+		}
+		break;
+
+	case AX25_I:
+		if (!ax25_validate_nr(ax25, nr)) {
+			ax25_std_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+			break;
+		}
+		if (ax25->condition & AX25_COND_PEER_RX_BUSY) {
+			ax25_frames_acked(ax25, nr);
+		} else {
+			ax25_check_iframes_acked(ax25, nr);
+		}
+		/* While we are busy the data is dropped; only a poll is answered. */
+		if (ax25->condition & AX25_COND_OWN_RX_BUSY) {
+			if (pf) ax25_std_enquiry_response(ax25);
+			break;
+		}
+		if (ns == ax25->vr) {
+			/* In sequence: advance V(R) and hand the frame up. */
+			ax25->vr = (ax25->vr + 1) % ax25->modulus;
+			queued = ax25_rx_iframe(ax25, skb);
+			/* Undo the advance if delivery left us busy. */
+			if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+				ax25->vr = ns;	/* ax25->vr - 1 */
+			ax25->condition &= ~AX25_COND_REJECT;
+			if (pf) {
+				ax25_std_enquiry_response(ax25);
+			} else {
+				/* Delay the acknowledgement using T2. */
+				if (!(ax25->condition & AX25_COND_ACK_PENDING)) {
+					ax25->condition |= AX25_COND_ACK_PENDING;
+					ax25_start_t2timer(ax25);
+				}
+			}
+		} else {
+			/* Out of sequence: send a single REJ per gap. */
+			if (ax25->condition & AX25_COND_REJECT) {
+				if (pf) ax25_std_enquiry_response(ax25);
+			} else {
+				ax25->condition |= AX25_COND_REJECT;
+				ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE);
+				ax25->condition &= ~AX25_COND_ACK_PENDING;
+			}
+		}
+		break;
+
+	/* Frame reject or undecodable frame: set the link up again. */
+	case AX25_FRMR:
+	case AX25_ILLEGAL:
+		ax25_std_establish_data_link(ax25);
+		ax25->state = AX25_STATE_1;
+		break;
+
+	default:
+		break;
+	}
+
+	return queued;
+}
+
+/*
+ *	State machine for state 4, Timer Recovery State.
+ *	The handling of the timer(s) is in file ax25_std_timer.c
+ *	Handling of state 0 and connection release is in ax25.c.
+ *
+ *	Handles one decoded frame: frametype, its N(S)/N(R) numbers ns and
+ *	nr, the poll/final bit pf and the command/response indication type.
+ *	Returns the result of ax25_rx_iframe() when an in-sequence I frame
+ *	was passed on, and 0 in every other case.
+ */
+static int ax25_std_state4_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type)
+{
+	int queued = 0;
+
+	switch (frametype) {
+	/*
+	 * The peer resets the link: adopt its modulus, answer UA, restart
+	 * the timers, zero the sequence variables, return to state 3 and
+	 * requeue the frames.
+	 */
+	case AX25_SABM:
+	case AX25_SABME:
+		if (frametype == AX25_SABM) {
+			ax25->modulus = AX25_MODULUS;
+			ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+		} else {
+			ax25->modulus = AX25_EMODULUS;
+			ax25->window  = ax25->ax25_dev->values[AX25_VALUES_EWINDOW];
+		}
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_stop_t1timer(ax25);
+		ax25_stop_t2timer(ax25);
+		ax25_start_t3timer(ax25);
+		ax25_start_idletimer(ax25);
+		ax25->condition = 0x00;
+		ax25->vs        = 0;
+		ax25->va        = 0;
+		ax25->vr        = 0;
+		ax25->state     = AX25_STATE_3;
+		ax25->n2count   = 0;
+		ax25_requeue_frames(ax25);
+		break;
+
+	case AX25_DISC:
+		ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE);
+		ax25_disconnect(ax25, 0);
+		break;
+
+	case AX25_DM:
+		ax25_disconnect(ax25, ECONNRESET);
+		break;
+
+	case AX25_RR:
+	case AX25_RNR:
+		if (frametype == AX25_RR)
+			ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+		else
+			ax25->condition |= AX25_COND_PEER_RX_BUSY;
+		/*
+		 * A response with the final bit set: stop T1 and either go
+		 * back to state 3 (everything acknowledged) or requeue what
+		 * is still outstanding.
+		 */
+		if (type == AX25_RESPONSE && pf) {
+			ax25_stop_t1timer(ax25);
+			ax25->n2count = 0;
+			if (ax25_validate_nr(ax25, nr)) {
+				ax25_frames_acked(ax25, nr);
+				if (ax25->vs == ax25->va) {
+					ax25_start_t3timer(ax25);
+					ax25->state   = AX25_STATE_3;
+				} else {
+					ax25_requeue_frames(ax25);
+				}
+			} else {
+				ax25_std_nr_error_recovery(ax25);
+				ax25->state = AX25_STATE_1;
+			}
+			break;
+		}
+		if (type == AX25_COMMAND && pf)
+			ax25_std_enquiry_response(ax25);
+		if (ax25_validate_nr(ax25, nr)) {
+			ax25_frames_acked(ax25, nr);
+		} else {
+			ax25_std_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+		}
+		break;
+
+	case AX25_REJ:
+		ax25->condition &= ~AX25_COND_PEER_RX_BUSY;
+		/* Same handling of a final response as for RR/RNR above. */
+		if (pf && type == AX25_RESPONSE) {
+			ax25_stop_t1timer(ax25);
+			ax25->n2count = 0;
+			if (ax25_validate_nr(ax25, nr)) {
+				ax25_frames_acked(ax25, nr);
+				if (ax25->vs == ax25->va) {
+					ax25_start_t3timer(ax25);
+					ax25->state   = AX25_STATE_3;
+				} else {
+					ax25_requeue_frames(ax25);
+				}
+			} else {
+				ax25_std_nr_error_recovery(ax25);
+				ax25->state = AX25_STATE_1;
+			}
+			break;
+		}
+		if (type == AX25_COMMAND && pf)
+			ax25_std_enquiry_response(ax25);
+		if (ax25_validate_nr(ax25, nr)) {
+			ax25_frames_acked(ax25, nr);
+			ax25_requeue_frames(ax25);
+		} else {
+			ax25_std_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+		}
+		break;
+
+	case AX25_I:
+		if (!ax25_validate_nr(ax25, nr)) {
+			ax25_std_nr_error_recovery(ax25);
+			ax25->state = AX25_STATE_1;
+			break;
+		}
+		ax25_frames_acked(ax25, nr);
+		/* While we are busy the data is dropped; only a poll is answered. */
+		if (ax25->condition & AX25_COND_OWN_RX_BUSY) {
+			if (pf)
+				ax25_std_enquiry_response(ax25);
+			break;
+		}
+		if (ns == ax25->vr) {
+			/* In sequence: advance V(R) and hand the frame up. */
+			ax25->vr = (ax25->vr + 1) % ax25->modulus;
+			queued = ax25_rx_iframe(ax25, skb);
+			/* Undo the advance if delivery left us busy. */
+			if (ax25->condition & AX25_COND_OWN_RX_BUSY)
+				ax25->vr = ns;	/* ax25->vr - 1 */
+			ax25->condition &= ~AX25_COND_REJECT;
+			if (pf) {
+				ax25_std_enquiry_response(ax25);
+			} else {
+				/* Delay the acknowledgement using T2. */
+				if (!(ax25->condition & AX25_COND_ACK_PENDING)) {
+					ax25->condition |= AX25_COND_ACK_PENDING;
+					ax25_start_t2timer(ax25);
+				}
+			}
+		} else {
+			/* Out of sequence: send a single REJ per gap. */
+			if (ax25->condition & AX25_COND_REJECT) {
+				if (pf) ax25_std_enquiry_response(ax25);
+			} else {
+				ax25->condition |= AX25_COND_REJECT;
+				ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE);
+				ax25->condition &= ~AX25_COND_ACK_PENDING;
+			}
+		}
+		break;
+
+	/* Frame reject or undecodable frame: set the link up again. */
+	case AX25_FRMR:
+	case AX25_ILLEGAL:
+		ax25_std_establish_data_link(ax25);
+		ax25->state = AX25_STATE_1;
+		break;
+
+	default:
+		break;
+	}
+
+	return queued;
+}
+
+/*
+ *	Entry point for a received LAPB frame on a standard (non-DAMA)
+ *	connection: decode the control field, run the handler for the
+ *	connection's current state and then try to send pending I frames.
+ *
+ *	Returns what the state handler returned (0 when the connection is
+ *	in none of the states 1 to 4).
+ */
+int ax25_std_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
+{
+	int ns, nr, pf;
+	int frametype;
+	int queued = 0;
+
+	frametype = ax25_decode(ax25, skb, &ns, &nr, &pf);
+
+	switch (ax25->state) {
+	case AX25_STATE_1:
+		queued = ax25_std_state1_machine(ax25, skb, frametype, pf, type);
+		break;
+
+	case AX25_STATE_2:
+		queued = ax25_std_state2_machine(ax25, skb, frametype, pf, type);
+		break;
+
+	case AX25_STATE_3:
+		queued = ax25_std_state3_machine(ax25, skb, frametype, ns, nr, pf, type);
+		break;
+
+	case AX25_STATE_4:
+		queued = ax25_std_state4_machine(ax25, skb, frametype, ns, nr, pf, type);
+		break;
+	}
+
+	/* The frame may have opened the window or changed the state. */
+	ax25_kick(ax25);
+
+	return queued;
+}
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
new file mode 100644
index 00000000..277f81bb
--- /dev/null
+++ b/net/ax25/ax25_std_subr.c
@@ -0,0 +1,87 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void ax25_std_nr_error_recovery(ax25_cb *ax25)	/* recovery is simply a fresh link setup */
+{
+	ax25_std_establish_data_link(ax25);
+}
+
+void ax25_std_establish_data_link(ax25_cb *ax25)
+{
+	/* Modulo-8 links are opened with SABM, all others with SABME. */
+	int setup = (ax25->modulus == AX25_MODULUS) ? AX25_SABM : AX25_SABME;
+
+	/* Start from a clean slate: no conditions set, no retries counted. */
+	ax25->n2count   = 0;
+	ax25->condition = 0x00;
+	ax25_send_control(ax25, setup, AX25_POLLON, AX25_COMMAND);
+
+	ax25_calculate_t1(ax25);
+	ax25_stop_idletimer(ax25);
+	ax25_stop_t3timer(ax25);
+	ax25_stop_t2timer(ax25);
+	ax25_start_t1timer(ax25);
+}
+
+void ax25_std_transmit_enquiry(ax25_cb *ax25)
+{
+	/* Poll the peer with RNR while our receiver is busy, RR otherwise. */
+	int poll = (ax25->condition & AX25_COND_OWN_RX_BUSY) ? AX25_RNR : AX25_RR;
+
+	ax25_send_control(ax25, poll, AX25_POLLON, AX25_COMMAND);
+	ax25->condition &= ~AX25_COND_ACK_PENDING;
+
+	/* Re-arm T1 with a freshly computed interval. */
+	ax25_calculate_t1(ax25);
+	ax25_start_t1timer(ax25);
+}
+
+void ax25_std_enquiry_response(ax25_cb *ax25)
+{
+	/* Reply with the P/F bit set: RNR if our receiver is busy, else RR. */
+	int reply = (ax25->condition & AX25_COND_OWN_RX_BUSY) ? AX25_RNR :
+								AX25_RR;
+
+	ax25_send_control(ax25, reply, AX25_POLLON, AX25_RESPONSE);
+	ax25->condition &= ~AX25_COND_ACK_PENDING;
+}
+
+void ax25_std_timeout_response(ax25_cb *ax25)
+{
+	int busy = ax25->condition & AX25_COND_OWN_RX_BUSY;
+
+	/* Status report sent with the P/F bit clear: RNR if busy, else RR. */
+	ax25_send_control(ax25, busy ? AX25_RNR : AX25_RR, AX25_POLLOFF,
+			  AX25_RESPONSE);
+	ax25->condition &= ~AX25_COND_ACK_PENDING;
+}
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
new file mode 100644
index 00000000..96e4b927
--- /dev/null
+++ b/net/ax25/ax25_std_timer.c
@@ -0,0 +1,176 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+void ax25_std_heartbeat_expiry(ax25_cb *ax25)	/* heartbeat work for a standard-mode link */
+{
+	struct sock *sk = ax25->sk;
+
+	if (sk)
+		bh_lock_sock(sk);	/* held until the matching unlock on every path below */
+
+	switch (ax25->state) {
+	case AX25_STATE_0:
+		/* Magic here: If we listen() and a new link dies before it
+		   is accepted() it isn't 'dead' so doesn't get removed. */
+		if (!sk || sock_flag(sk, SOCK_DESTROY) ||
+		    (sk->sk_state == TCP_LISTEN &&
+		     sock_flag(sk, SOCK_DEAD))) {
+			if (sk) {
+				sock_hold(sk);	/* keep sk alive so it can still be unlocked after the destroy */
+				ax25_destroy_socket(ax25);
+				bh_unlock_sock(sk);
+				sock_put(sk);
+			} else
+				ax25_destroy_socket(ax25);
+			return;	/* heartbeat is deliberately not re-armed here */
+		}
+		break;
+
+	case AX25_STATE_3:
+	case AX25_STATE_4:
+		/*
+		 * Busy and receive memory now below half of sk_rcvbuf: clear busy, send RR.
+		 */
+		if (sk != NULL) {
+			if (atomic_read(&sk->sk_rmem_alloc) <
+			    (sk->sk_rcvbuf >> 1) &&
+			    (ax25->condition & AX25_COND_OWN_RX_BUSY)) {
+				ax25->condition &= ~AX25_COND_OWN_RX_BUSY;
+				ax25->condition &= ~AX25_COND_ACK_PENDING;
+				ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE);
+				break;
+			}
+		}
+	}
+
+	if (sk)
+		bh_unlock_sock(sk);
+
+	ax25_start_heartbeat(ax25);	/* schedule the next heartbeat */
+}
+
+void ax25_std_t2timer_expiry(ax25_cb *ax25)
+{
+	if (!(ax25->condition & AX25_COND_ACK_PENDING))
+		return;		/* no acknowledgement is owed */
+	ax25->condition &= ~AX25_COND_ACK_PENDING;
+	ax25_std_timeout_response(ax25);
+}
+
+void ax25_std_t3timer_expiry(ax25_cb *ax25)	/* T3 expired: poll the peer and enter state 4 */
+{
+	ax25->n2count = 0;	/* reset first: the enquiry's T1 calculation reads it */
+	ax25_std_transmit_enquiry(ax25);
+	ax25->state   = AX25_STATE_4;
+}
+
+void ax25_std_idletimer_expiry(ax25_cb *ax25)
+{
+	struct sock *sk;
+
+	ax25_clear_queues(ax25);	/* drop everything still queued */
+	ax25->n2count = 0;
+	ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+	ax25->state   = AX25_STATE_2;
+	ax25_calculate_t1(ax25);
+	ax25_start_t1timer(ax25);
+	ax25_stop_t2timer(ax25);
+	ax25_stop_t3timer(ax25);
+
+	if ((sk = ax25->sk) == NULL)
+		return;		/* no socket attached: nothing to notify */
+	bh_lock_sock(sk);
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = 0;
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+	bh_unlock_sock(sk);
+}
+
+void ax25_std_t1timer_expiry(ax25_cb *ax25)	/* T1 expired: retry or give up, by link state */
+{
+	switch (ax25->state) {
+	case AX25_STATE_1:	/* resend the link setup frame until n2 retries are used */
+		if (ax25->n2count == ax25->n2) {
+			if (ax25->modulus == AX25_MODULUS) {
+				ax25_disconnect(ax25, ETIMEDOUT);
+				return;
+			} else {	/* non-modulo-8 setup went unanswered: retry as modulo 8 */
+				ax25->modulus = AX25_MODULUS;
+				ax25->window  = ax25->ax25_dev->values[AX25_VALUES_WINDOW];
+				ax25->n2count = 0;
+				ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+			}
+		} else {
+			ax25->n2count++;
+			if (ax25->modulus == AX25_MODULUS)
+				ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND);
+			else
+				ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND);
+		}
+		break;
+
+	case AX25_STATE_2:	/* resend DISC; after n2 retries send it once more and drop the link */
+		if (ax25->n2count == ax25->n2) {
+			ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+			ax25_disconnect(ax25, ETIMEDOUT);
+			return;
+		} else {
+			ax25->n2count++;
+			ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND);
+		}
+		break;
+
+	case AX25_STATE_3:	/* first expiry: poll the peer and move to state 4 */
+		ax25->n2count = 1;
+		ax25_std_transmit_enquiry(ax25);
+		ax25->state   = AX25_STATE_4;
+		break;
+
+	case AX25_STATE_4:	/* keep polling; after n2 retries send DM and drop the link */
+		if (ax25->n2count == ax25->n2) {
+			ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE);
+			ax25_disconnect(ax25, ETIMEDOUT);
+			return;
+		} else {
+			ax25->n2count++;
+			ax25_std_transmit_enquiry(ax25);
+		}
+		break;
+	}
+
+	ax25_calculate_t1(ax25);	/* reached only by paths that did not disconnect */
+	ax25_start_t1timer(ax25);
+}
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
new file mode 100644
index 00000000..c6715ee4
--- /dev/null
+++ b/net/ax25/ax25_subr.c
@@ -0,0 +1,290 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+/*
+ *	This routine purges all the queues of frames.
+ */
+void ax25_clear_queues(ax25_cb *ax25)	/* empties all four per-connection skb queues */
+{
+	skb_queue_purge(&ax25->write_queue);
+	skb_queue_purge(&ax25->ack_queue);
+	skb_queue_purge(&ax25->reseq_queue);
+	skb_queue_purge(&ax25->frag_queue);
+}
+
+/*
+ * This routine purges the ack queue of those frames that have been
+ * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the
+ * SDL diagram.
+ */
+void ax25_frames_acked(ax25_cb *ax25, unsigned short nr)
+{
+	struct sk_buff *acked;
+
+	/*
+	 * Step V(a) up to nr, freeing the head of the ack queue each step.
+	 */
+	while (ax25->va != nr) {
+		if (skb_peek(&ax25->ack_queue) == NULL)
+			break;		/* queue ran dry before V(a) reached nr */
+		acked = skb_dequeue(&ax25->ack_queue);
+		kfree_skb(acked);
+		ax25->va = (ax25->va + 1) % ax25->modulus;
+	}
+}
+
+void ax25_requeue_frames(ax25_cb *ax25)
+{
+	struct sk_buff *pending = skb_dequeue_tail(&ax25->ack_queue);
+
+	/*
+	 * Move the unacknowledged frames back to the front of the write
+	 * queue. Taking them from the tail and inserting at the head keeps
+	 * their original order.
+	 */
+	for (; pending != NULL; pending = skb_dequeue_tail(&ax25->ack_queue))
+		skb_queue_head(&ax25->write_queue, pending);
+}
+
+/*
+ *	Validate that the value of nr is between va and vs. Return true or
+ *	false for testing.
+ */
+int ax25_validate_nr(ax25_cb *ax25, unsigned short nr)
+{
+	unsigned short seq;
+
+	/* Walk the window from V(a) up to, but not including, V(s). */
+	for (seq = ax25->va; seq != ax25->vs; seq = (seq + 1) % ax25->modulus) {
+		if (seq == nr)
+			return 1;
+	}
+
+	/* V(s) itself is also an acceptable N(R). */
+	return nr == ax25->vs;
+}
+
+/*
+ *	This routine is the centralised routine for parsing the control
+ *	information for the different frame formats.
+ */
+int ax25_decode(ax25_cb *ax25, struct sk_buff *skb, int *ns, int *nr, int *pf)
+{
+	unsigned char *frame;
+	int frametype = AX25_ILLEGAL;	/* returned unchanged when no pattern below matches */
+
+	frame = skb->data;	/* NOTE(review): skb->len is not checked here; presumably callers guarantee enough octets - confirm */
+	*ns = *nr = *pf = 0;
+
+	if (ax25->modulus == AX25_MODULUS) {	/* one-octet control field */
+		if ((frame[0] & AX25_S) == 0) {
+			frametype = AX25_I;			/* I frame - carries NR/NS/PF */
+			*ns = (frame[0] >> 1) & 0x07;
+			*nr = (frame[0] >> 5) & 0x07;
+			*pf = frame[0] & AX25_PF;
+		} else if ((frame[0] & AX25_U) == 1) { 	/* S frame - take out PF/NR */
+			frametype = frame[0] & 0x0F;
+			*nr = (frame[0] >> 5) & 0x07;
+			*pf = frame[0] & AX25_PF;
+		} else if ((frame[0] & AX25_U) == 3) { 	/* U frame - take out PF */
+			frametype = frame[0] & ~AX25_PF;
+			*pf = frame[0] & AX25_PF;
+		}
+		skb_pull(skb, 1);	/* the octet is consumed whether or not a type was recognised */
+	} else {	/* other modulus: I and S frames use two control octets, U frames one */
+		if ((frame[0] & AX25_S) == 0) {
+			frametype = AX25_I;			/* I frame - carries NR/NS/PF */
+			*ns = (frame[0] >> 1) & 0x7F;
+			*nr = (frame[1] >> 1) & 0x7F;
+			*pf = frame[1] & AX25_EPF;
+			skb_pull(skb, 2);
+		} else if ((frame[0] & AX25_U) == 1) { 	/* S frame - take out PF/NR */
+			frametype = frame[0] & 0x0F;
+			*nr = (frame[1] >> 1) & 0x7F;
+			*pf = frame[1] & AX25_EPF;
+			skb_pull(skb, 2);
+		} else if ((frame[0] & AX25_U) == 3) { 	/* U frame - take out PF */
+			frametype = frame[0] & ~AX25_PF;
+			*pf = frame[0] & AX25_PF;
+			skb_pull(skb, 1);
+		}
+	}
+
+	return frametype;	/* *pf is the masked bit itself, not normalised to 0/1 */
+}
+
+/*
+ *	This routine is called when the HDLC layer internally  generates a
+ *	command or  response  for  the remote machine ( eg. RR, UA etc. ).
+ *	Only supervisory or unnumbered frames are processed.
+ */
+void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type)
+{
+	struct net_device *dev = ax25->ax25_dev->dev;
+	struct sk_buff *skb;
+	unsigned char *ctl;
+
+	/* Room for the link-layer header plus at most two control octets. */
+	skb = alloc_skb(dev->hard_header_len + 2, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, dev->hard_header_len);
+	skb_reset_network_header(skb);
+
+	if (ax25->modulus == AX25_MODULUS) {
+		/* Modulo 8: one octet; S frames carry N(R) in the top bits. */
+		ctl = skb_put(skb, 1);
+		*ctl = frametype | (poll_bit ? AX25_PF : 0);
+		if ((frametype & AX25_U) == AX25_S)
+			*ctl |= (ax25->vr << 5);
+	} else if ((frametype & AX25_U) == AX25_U) {
+		/* Other modulus, U frame: still a single octet. */
+		ctl = skb_put(skb, 1);
+		*ctl = frametype | (poll_bit ? AX25_PF : 0);
+	} else {
+		/* Other modulus otherwise: N(R) and P/F go in a second octet. */
+		ctl = skb_put(skb, 2);
+		ctl[0] = frametype;
+		ctl[1] = (ax25->vr << 1) | (poll_bit ? AX25_EPF : 0);
+	}
+
+	/* type is handed to the transmit routine unchanged. */
+	ax25_transmit_buffer(ax25, skb, type);
+}
+
+/*
+ *	Send a 'DM' to an unknown connection attempt, or an invalid caller.
+ *
+ *	Note: src here is the sender, thus it's the target of the DM
+ */
+void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *dest, ax25_digi *digi)
+{
+	ax25_digi reverse_path;
+	struct sk_buff *skb;
+	char *p;
+
+	if (dev == NULL)
+		return;
+
+	skb = alloc_skb(dev->hard_header_len + 1, GFP_ATOMIC);
+	if (skb == NULL)
+		return;	/* Next SABM will get DM'd */
+
+	skb_reserve(skb, dev->hard_header_len);
+	skb_reset_network_header(skb);
+
+	/* reverse_path is what ax25_digi_invert() produces from digi. */
+	ax25_digi_invert(digi, &reverse_path);
+
+	/* Control field: DM with the P/F bit set. */
+	p = skb_put(skb, 1);
+	*p = AX25_DM | AX25_PF;
+
+	/* Do the address ourselves, in front of the control octet. */
+	p = skb_push(skb, ax25_addr_size(digi));
+	ax25_addr_build(p, dest, src, &reverse_path, AX25_RESPONSE, AX25_MODULUS);
+
+	ax25_queue_xmit(skb, dev);
+}
+
+/*
+ *	Exponential backoff for AX.25
+ */
+void ax25_calculate_t1(ax25_cb *ax25)
+{
+	int n, t = 2;	/* t: multiplier applied to rtt to obtain T1 */
+
+	switch (ax25->backoff) {
+	case 0:		/* no backoff: T1 is always 2 * rtt */
+		break;
+
+	case 1:		/* linear: multiplier grows by 2 per retry */
+		t += 2 * ax25->n2count;
+		break;
+
+	case 2:		/* exponential: doubles per retry, capped at 8 */
+		/* Stop at the cap so a large n2count cannot overflow t. */
+		for (n = 0; n < ax25->n2count && t < 8; n++)
+			t *= 2;
+		break;
+	}
+
+	ax25->t1 = t * ax25->rtt;
+}
+
+/*
+ *	Calculate the Round Trip Time
+ */
+void ax25_calculate_rtt(ax25_cb *ax25)
+{
+	if (ax25->backoff == 0)
+		return;	/* with backoff type 0 the rtt is left untouched */
+
+	if (ax25_t1timer_running(ax25) && ax25->n2count == 0)	/* sample = t1 minus time left on T1, weighted 1/10 */
+		ax25->rtt = (9 * ax25->rtt + ax25->t1 - ax25_display_timer(&ax25->t1timer)) / 10;
+
+	if (ax25->rtt < AX25_T1CLAMPLO)	/* clamp, even when no new sample was taken */
+		ax25->rtt = AX25_T1CLAMPLO;
+
+	if (ax25->rtt > AX25_T1CLAMPHI)
+		ax25->rtt = AX25_T1CLAMPHI;
+}
+
+void ax25_disconnect(ax25_cb *ax25, int reason)
+{
+	struct sock *sk;
+
+	ax25_clear_queues(ax25);
+	ax25_stop_t1timer(ax25);
+	ax25_stop_t2timer(ax25);
+	ax25_stop_t3timer(ax25);
+	ax25_stop_idletimer(ax25);
+
+	ax25->state = AX25_STATE_0;
+	ax25_link_failed(ax25, reason);
+
+	if ((sk = ax25->sk) == NULL)
+		return;		/* no socket attached: nothing to notify */
+	local_bh_disable();
+	bh_lock_sock(sk);
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = reason;	/* reported to the socket owner */
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+	bh_unlock_sock(sk);
+	local_bh_enable();
+}
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
new file mode 100644
index 00000000..db29ea71
--- /dev/null
+++ b/net/ax25/ax25_timer.c
@@ -0,0 +1,227 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
+ * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr)
+ * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+
+static void ax25_heartbeat_expiry(unsigned long);
+static void ax25_t1timer_expiry(unsigned long);
+static void ax25_t2timer_expiry(unsigned long);
+static void ax25_t3timer_expiry(unsigned long);
+static void ax25_idletimer_expiry(unsigned long);
+
+void ax25_setup_timers(ax25_cb *ax25)	/* binds each of the five timers to its handler; none is started here */
+{
+	setup_timer(&ax25->timer, ax25_heartbeat_expiry, (unsigned long)ax25);
+	setup_timer(&ax25->t1timer, ax25_t1timer_expiry, (unsigned long)ax25);
+	setup_timer(&ax25->t2timer, ax25_t2timer_expiry, (unsigned long)ax25);
+	setup_timer(&ax25->t3timer, ax25_t3timer_expiry, (unsigned long)ax25);
+	setup_timer(&ax25->idletimer, ax25_idletimer_expiry,
+		    (unsigned long)ax25);
+}
+
+void ax25_start_heartbeat(ax25_cb *ax25)
+{
+	mod_timer(&ax25->timer, jiffies + 5 * HZ);	/* fixed five-second period */
+}
+
+void ax25_start_t1timer(ax25_cb *ax25)
+{
+	mod_timer(&ax25->t1timer, jiffies + ax25->t1);	/* (re)arm T1; ax25->t1 is used as a jiffies delta */
+}
+
+void ax25_start_t2timer(ax25_cb *ax25)
+{
+	mod_timer(&ax25->t2timer, jiffies + ax25->t2);	/* (re)arm T2; ax25->t2 is used as a jiffies delta */
+}
+
+void ax25_start_t3timer(ax25_cb *ax25)
+{
+	if (!(ax25->t3 > 0))	/* no positive interval: make sure T3 is not pending */
+		del_timer(&ax25->t3timer);
+	else
+		mod_timer(&ax25->t3timer, jiffies + ax25->t3);
+}
+
+void ax25_start_idletimer(ax25_cb *ax25)
+{
+	if (!(ax25->idle > 0))	/* no positive interval: make sure the timer is not pending */
+		del_timer(&ax25->idletimer);
+	else
+		mod_timer(&ax25->idletimer, jiffies + ax25->idle);
+}
+
+void ax25_stop_heartbeat(ax25_cb *ax25)
+{
+	del_timer(&ax25->timer);	/* non-synchronous: a handler already running is not waited for */
+}
+
+void ax25_stop_t1timer(ax25_cb *ax25)
+{
+	del_timer(&ax25->t1timer);	/* cancel T1 if pending */
+}
+
+void ax25_stop_t2timer(ax25_cb *ax25)
+{
+	del_timer(&ax25->t2timer);	/* cancel T2 if pending */
+}
+
+void ax25_stop_t3timer(ax25_cb *ax25)
+{
+	del_timer(&ax25->t3timer);	/* cancel T3 if pending */
+}
+
+void ax25_stop_idletimer(ax25_cb *ax25)
+{
+	del_timer(&ax25->idletimer);	/* cancel the idle timer if pending */
+}
+
+int ax25_t1timer_running(ax25_cb *ax25)
+{
+	return timer_pending(&ax25->t1timer);	/* non-zero while T1 is armed and has not fired */
+}
+
+unsigned long ax25_display_timer(struct timer_list *timer)
+{
+	long delta = timer->expires - jiffies;	/* negative once overdue */
+
+	/* Idle timers report 0; so do pending ones already past their deadline. */
+	return timer_pending(timer) ? max(0L, delta) : 0;
+}
+
+EXPORT_SYMBOL(ax25_display_timer);
+
+static void ax25_heartbeat_expiry(unsigned long param)
+{
+	int proto = AX25_PROTO_STD_SIMPLEX;	/* used when no device is attached */
+	ax25_cb *ax25 = (ax25_cb *)param;	/* cookie registered in ax25_setup_timers() */
+
+	if (ax25->ax25_dev)
+		proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL];
+
+	switch (proto) {
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_heartbeat_expiry(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		if (ax25->ax25_dev->dama.slave)	/* DAMA handler only while the device is in slave mode */
+			ax25_ds_heartbeat_expiry(ax25);
+		else
+			ax25_std_heartbeat_expiry(ax25);
+		break;
+#endif
+	}
+}
+
+static void ax25_t1timer_expiry(unsigned long param)
+{
+	ax25_cb *ax25 = (ax25_cb *)param;
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {	/* NOTE(review): ax25_dev is not NULL-checked here, unlike the heartbeat handler */
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_t1timer_expiry(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		if (!ax25->ax25_dev->dama.slave)	/* in slave mode this handler does nothing */
+			ax25_std_t1timer_expiry(ax25);
+		break;
+#endif
+	}
+}
+
+static void ax25_t2timer_expiry(unsigned long param)
+{
+	ax25_cb *ax25 = (ax25_cb *)param;
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {	/* NOTE(review): ax25_dev is not NULL-checked here, unlike the heartbeat handler */
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_t2timer_expiry(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		if (!ax25->ax25_dev->dama.slave)	/* in slave mode this handler does nothing */
+			ax25_std_t2timer_expiry(ax25);
+		break;
+#endif
+	}
+}
+
+static void ax25_t3timer_expiry(unsigned long param)
+{
+	ax25_cb *ax25 = (ax25_cb *)param;
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {	/* NOTE(review): ax25_dev is not NULL-checked here, unlike the heartbeat handler */
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_t3timer_expiry(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		if (ax25->ax25_dev->dama.slave)	/* DAMA handler only while the device is in slave mode */
+			ax25_ds_t3timer_expiry(ax25);
+		else
+			ax25_std_t3timer_expiry(ax25);
+		break;
+#endif
+	}
+}
+
+static void ax25_idletimer_expiry(unsigned long param)
+{
+	ax25_cb *ax25 = (ax25_cb *)param;
+
+	switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {	/* NOTE(review): ax25_dev is not NULL-checked here, unlike the heartbeat handler */
+	case AX25_PROTO_STD_SIMPLEX:
+	case AX25_PROTO_STD_DUPLEX:
+		ax25_std_idletimer_expiry(ax25);
+		break;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	case AX25_PROTO_DAMA_SLAVE:
+		if (ax25->ax25_dev->dama.slave)	/* DAMA handler only while the device is in slave mode */
+			ax25_ds_idletimer_expiry(ax25);
+		else
+			ax25_std_idletimer_expiry(ax25);
+		break;
+#endif
+	}
+}
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
new file mode 100644
index 00000000..d349be95
--- /dev/null
+++ b/net/ax25/ax25_uid.c
@@ -0,0 +1,218 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+/*
+ *	Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
+ */
+
+static HLIST_HEAD(ax25_uid_list);
+static DEFINE_RWLOCK(ax25_uid_lock);
+
+int ax25_uid_policy;
+
+EXPORT_SYMBOL(ax25_uid_policy);
+
+ax25_uid_assoc *ax25_findbyuid(uid_t uid)
+{
+	ax25_uid_assoc *entry, *match = NULL;
+	struct hlist_node *pos;
+
+	read_lock(&ax25_uid_lock);
+	ax25_uid_for_each(entry, pos, &ax25_uid_list) {
+		if (entry->uid != uid)
+			continue;
+		ax25_uid_hold(entry);	/* reference handed to the caller */
+		match = entry;
+		break;
+	}
+	read_unlock(&ax25_uid_lock);
+
+	return match;
+}
+
+EXPORT_SYMBOL(ax25_findbyuid);
+
+int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
+{
+	ax25_uid_assoc *ax25_uid;
+	struct hlist_node *node;
+	ax25_uid_assoc *user;
+	unsigned long res;
+
+	switch (cmd) {
+	case SIOCAX25GETUID:	/* return the uid mapped to sax->sax25_call */
+		res = -ENOENT;
+		read_lock(&ax25_uid_lock);
+		ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+			if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
+				res = ax25_uid->uid;
+				break;
+			}
+		}
+		read_unlock(&ax25_uid_lock);
+
+		return res;
+
+	case SIOCAX25ADDUID:	/* add a uid/callsign pair; needs CAP_NET_ADMIN */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		user = ax25_findbyuid(sax->sax25_uid);
+		if (user) {
+			ax25_uid_put(user);	/* drop the lookup reference */
+			return -EEXIST;
+		}
+		if (sax->sax25_uid == 0)
+			return -EINVAL;
+		if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
+			return -ENOMEM;
+
+		atomic_set(&ax25_uid->refcount, 1);
+		ax25_uid->uid  = sax->sax25_uid;
+		ax25_uid->call = sax->sax25_call;
+
+		write_lock(&ax25_uid_lock);
+		hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list);
+		write_unlock(&ax25_uid_lock);
+
+		return 0;
+
+	case SIOCAX25DELUID:	/* remove the pair for a callsign; needs CAP_NET_ADMIN */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		/* Track the hit in 'user': the loop cursor need not be NULL
+		 * when the walk ends without a match. */
+		user = NULL;
+		write_lock(&ax25_uid_lock);
+		ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+			if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
+				user = ax25_uid;
+				break;
+			}
+		}
+		if (user == NULL) {
+			write_unlock(&ax25_uid_lock);
+			return -ENOENT;
+		}
+		hlist_del_init(&user->uid_node);
+		ax25_uid_put(user);	/* drop the list's reference */
+		write_unlock(&ax25_uid_lock);
+
+		return 0;
+	}
+
+	return -EINVAL;	/* unknown command */
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(ax25_uid_lock)
+{
+	read_lock(&ax25_uid_lock);	/* released in ax25_uid_seq_stop() */
+	return seq_hlist_start_head(&ax25_uid_list, *pos);
+}
+
+static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &ax25_uid_list, pos);	/* advance one list node */
+}
+
+static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
+	__releases(ax25_uid_lock)
+{
+	read_unlock(&ax25_uid_lock);	/* pairs with the read_lock in ax25_uid_seq_start() */
+}
+
+static int ax25_uid_seq_show(struct seq_file *seq, void *v)
+{
+	struct ax25_uid_assoc *pt;
+	char buf[11];
+
+	if (v == SEQ_START_TOKEN) {	/* first record: the policy line */
+		seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
+		return 0;
+	}
+
+	pt = hlist_entry(v, struct ax25_uid_assoc, uid_node);
+	seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(buf, &pt->call));
+	return 0;
+}
+
+static const struct seq_operations ax25_uid_seqops = {	/* iterator over ax25_uid_list */
+	.start = ax25_uid_seq_start,
+	.next = ax25_uid_seq_next,
+	.stop = ax25_uid_seq_stop,
+	.show = ax25_uid_seq_show,
+};
+
+static int ax25_uid_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ax25_uid_seqops);	/* attach the iterator above to this file */
+}
+
+const struct file_operations ax25_uid_fops = {	/* not static: presumably installed as a proc entry elsewhere - confirm */
+	.owner = THIS_MODULE,
+	.open = ax25_uid_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#endif
+
+/*
+ *	Free all memory associated with UID/Callsign structures.
+ */
+void __exit ax25_uid_free(void)
+{
+	ax25_uid_assoc *entry;
+
+	write_lock(&ax25_uid_lock);
+	/* Pop entries off the head until the list is drained. */
+	while (!hlist_empty(&ax25_uid_list)) {
+		entry = hlist_entry(ax25_uid_list.first, ax25_uid_assoc,
+				    uid_node);
+		hlist_del_init(&entry->uid_node);
+		ax25_uid_put(entry);
+	}
+	write_unlock(&ax25_uid_lock);
+}
diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c
new file mode 100644
index 00000000..ebe0ef3f
--- /dev/null
+++ b/net/ax25/sysctl_net_ax25.c
@@ -0,0 +1,210 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+#include <net/ax25.h>
+
+static int min_ipdefmode[1],    	max_ipdefmode[] = {1};	/* min_* without an initialiser are 0 */
+static int min_axdefmode[1],            max_axdefmode[] = {1};
+static int min_backoff[1],		max_backoff[] = {2};
+static int min_conmode[1],		max_conmode[] = {2};
+static int min_window[] = {1},		max_window[] = {7};
+static int min_ewindow[] = {1},		max_ewindow[] = {63};
+static int min_t1[] = {1},		max_t1[] = {30000};
+static int min_t2[] = {1},		max_t2[] = {20000};
+static int min_t3[1],			max_t3[] = {3600000};
+static int min_idle[1],			max_idle[] = {65535000};
+static int min_n2[] = {1},		max_n2[] = {31};
+static int min_paclen[] = {1},		max_paclen[] = {512};
+static int min_proto[1],		max_proto[] = { AX25_PROTO_MAX };
+#ifdef CONFIG_AX25_DAMA_SLAVE
+static int min_ds_timeout[1],		max_ds_timeout[] = {65535000};
+#endif
+
+static struct ctl_table_header *ax25_table_header;
+
+static ctl_table *ax25_table;
+static int ax25_table_size;
+
+static struct ctl_path ax25_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ax25", },
+	{ }
+};
+
+static const ctl_table ax25_param_table[] = {	/* per-device template; ax25_register_sysctl() binds entry k to values[k], so order matters */
+	{
+		.procname	= "ip_default_mode",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ipdefmode,
+		.extra2		= &max_ipdefmode
+	},
+	{
+		.procname	= "ax25_default_mode",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_axdefmode,
+		.extra2		= &max_axdefmode
+	},
+	{
+		.procname	= "backoff_type",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_backoff,
+		.extra2		= &max_backoff
+	},
+	{
+		.procname	= "connect_mode",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_conmode,
+		.extra2		= &max_conmode
+	},
+	{
+		.procname	= "standard_window_size",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_window,
+		.extra2		= &max_window
+	},
+	{
+		.procname	= "extended_window_size",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ewindow,
+		.extra2		= &max_ewindow
+	},
+	{
+		.procname	= "t1_timeout",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_t1,
+		.extra2		= &max_t1
+	},
+	{
+		.procname	= "t2_timeout",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_t2,
+		.extra2		= &max_t2
+	},
+	{
+		.procname	= "t3_timeout",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_t3,
+		.extra2		= &max_t3
+	},
+	{
+		.procname	= "idle_timeout",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_idle,
+		.extra2		= &max_idle
+	},
+	{
+		.procname	= "maximum_retry_count",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_n2,
+		.extra2		= &max_n2
+	},
+	{
+		.procname	= "maximum_packet_length",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_paclen,
+		.extra2		= &max_paclen
+	},
+	{
+		.procname	= "protocol",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_proto,
+		.extra2		= &max_proto
+	},
+#ifdef CONFIG_AX25_DAMA_SLAVE
+	{
+		.procname	= "dama_slave_timeout",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ds_timeout,
+		.extra2		= &max_ds_timeout
+	},
+#endif
+
+	{ }	/* terminator: empty entry ends the table */
+};
+
+void ax25_register_sysctl(void)
+{
+	ax25_dev *ax25_dev;
+	int n, k;
+
+	ax25_table_header = NULL;	/* stays NULL unless registration is reached */
+	spin_lock_bh(&ax25_dev_lock);
+	for (ax25_table_size = sizeof(ctl_table), ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
+		ax25_table_size += sizeof(ctl_table);
+
+	if ((ax25_table = kzalloc(ax25_table_size, GFP_ATOMIC)) == NULL) {
+		spin_unlock_bh(&ax25_dev_lock);
+		return;
+	}
+
+	for (n = 0, ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) {
+		struct ctl_table *child = kmemdup(ax25_param_table,
+						  sizeof(ax25_param_table),
+						  GFP_ATOMIC);
+		if (!child) {	/* undo everything built so far */
+			while (n--)
+				kfree(ax25_table[n].child);
+			kfree(ax25_table);
+			ax25_table = NULL;	/* do not leave a dangling table behind */
+			spin_unlock_bh(&ax25_dev_lock);
+			return;
+		}
+		ax25_table[n].child = ax25_dev->systable = child;
+		ax25_table[n].procname     = ax25_dev->dev->name;
+		ax25_table[n].mode         = 0555;
+
+		for (k = 0; k < AX25_MAX_VALUES; k++)	/* template entry k edits values[k] */
+			child[k].data = &ax25_dev->values[k];
+		n++;
+	}
+	spin_unlock_bh(&ax25_dev_lock);
+
+	ax25_table_header = register_sysctl_paths(ax25_path, ax25_table);
+}
+
+void ax25_unregister_sysctl(void)
+{
+	ctl_table *p = ax25_table;	/* NULL if the last registration failed */
+	unregister_sysctl_table(ax25_table_header);
+	ax25_table_header = NULL;	/* never unregister the same header twice */
+	for (; p && p->procname; p++)
+		kfree(p->child);
+	kfree(ax25_table);
+}
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
new file mode 100644
index 00000000..6c051ad8
--- /dev/null
+++ b/net/batman-adv/Kconfig
@@ -0,0 +1,25 @@
+#
# B.A.T.M.A.N. meshing protocol
+#
+
+config BATMAN_ADV
+	tristate "B.A.T.M.A.N. Advanced Meshing Protocol"
+	depends on NET
+        default n
+	---help---
+
+        B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
+        a routing protocol for multi-hop ad-hoc mesh networks. The
+        networks may be wired or wireless. See
+        http://www.open-mesh.org/ for more information and user space
+        tools.
+
+config BATMAN_ADV_DEBUG
+	bool "B.A.T.M.A.N. debugging"
+	depends on BATMAN_ADV != n
+	---help---
+
+	  This is an option for use by developers; most people should
+	  say N here. This enables compilation of support for
+	  outputting debugging information to the kernel log. The
+	  output is controlled via the module parameter debug.
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
new file mode 100644
index 00000000..2de93d00
--- /dev/null
+++ b/net/batman-adv/Makefile
@@ -0,0 +1,39 @@
+#
+# Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+#
+# Marek Lindner, Simon Wunderlich
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301, USA
+#
+
+obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
+batman-adv-y += aggregation.o
+batman-adv-y += bat_debugfs.o
+batman-adv-y += bat_sysfs.o
+batman-adv-y += bitarray.o
+batman-adv-y += gateway_client.o
+batman-adv-y += gateway_common.o
+batman-adv-y += hard-interface.o
+batman-adv-y += hash.o
+batman-adv-y += icmp_socket.o
+batman-adv-y += main.o
+batman-adv-y += originator.o
+batman-adv-y += ring_buffer.o
+batman-adv-y += routing.o
+batman-adv-y += send.o
+batman-adv-y += soft-interface.o
+batman-adv-y += translation-table.o
+batman-adv-y += unicast.o
+batman-adv-y += vis.o
diff --git a/net/batman-adv/aggregation.c b/net/batman-adv/aggregation.c
new file mode 100644
index 00000000..a8c32030
--- /dev/null
+++ b/net/batman-adv/aggregation.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "aggregation.h"
+#include "send.h"
+#include "routing.h"
+#include "hard-interface.h"
+
+/* number of bytes taken by the tt entries announced in the given packet */
+static int tt_len(struct batman_packet *batman_packet)
+{
+	return ETH_ALEN * batman_packet->num_tt;
+}
+
+/* return true if new_packet can be aggregated with forw_packet
+ *
+ * new_batman_packet: header of the packet that is looking for a slot
+ * packet_len:        length of that packet in bytes
+ * send_time:         jiffies value at which the packet is due
+ * directlink:        direct link state of the new packet, as determined
+ *                    by the caller
+ * if_incoming:       interface associated with the new packet
+ * forw_packet:       queued aggregate that is tested as a candidate
+ *
+ * Only the first batman header stored in forw_packet->skb is looked at.
+ */
+static bool can_aggregate_with(struct batman_packet *new_batman_packet,
+			       int packet_len,
+			       unsigned long send_time,
+			       bool directlink,
+			       struct hard_iface *if_incoming,
+			       struct forw_packet *forw_packet)
+{
+	struct batman_packet *batman_packet =
+		(struct batman_packet *)forw_packet->skb->data;
+	int aggregated_bytes = forw_packet->packet_len + packet_len;
+
+	/**
+	 * we can aggregate the current packet to this aggregated packet
+	 * if:
+	 *
+	 * - the new packet is due before the aggregated packet, but by
+	 *   no more than MAX_AGGREGATION_MS
+	 * - the resulting packet won't be bigger than
+	 *   MAX_AGGREGATION_BYTES
+	 */
+
+	if (time_before(send_time, forw_packet->send_time) &&
+	    time_after_eq(send_time + msecs_to_jiffies(MAX_AGGREGATION_MS),
+					forw_packet->send_time) &&
+	    (aggregated_bytes <= MAX_AGGREGATION_BYTES)) {
+
+		/**
+		 * check aggregation compatibility
+		 * -> direct link packets are broadcasted on
+		 *    their interface only
+		 * -> aggregate packet if the current packet is
+		 *    a "global" packet as well as the base
+		 *    packet
+		 */
+
+		/* packets without direct link flag and high TTL
+		 * are flooded through the net; both the new packet and
+		 * the base packet of the aggregate have to qualify */
+		if ((!directlink) &&
+		    (!(batman_packet->flags & DIRECTLINK)) &&
+		    (batman_packet->ttl != 1) &&
+
+		    /* own packets originating non-primary
+		     * interfaces leave only that interface
+		     * (if_num 0 is treated as the primary one here) */
+		    ((!forw_packet->own) ||
+		     (forw_packet->if_incoming->if_num == 0)))
+			return true;
+
+		/* if the incoming packet is sent via this one
+		 * interface only - we still can aggregate, provided the
+		 * aggregate is bound to the very same interface */
+		if ((directlink) &&
+		    (new_batman_packet->ttl == 1) &&
+		    (forw_packet->if_incoming == if_incoming) &&
+
+		    /* packets from direct neighbors or
+		     * own secondary interface packets
+		     * (= secondary interface packets in general) */
+		    (batman_packet->flags & DIRECTLINK ||
+		     (forw_packet->own &&
+		      forw_packet->if_incoming->if_num != 0)))
+			return true;
+	}
+
+	return false;
+}
+
+/* create a new aggregated packet and add this packet to it
+ *
+ * A reference on if_incoming is taken first; if that fails the packet is
+ * dropped silently. For packets that are not our own, one slot of
+ * bat_priv->batman_queue_left is consumed as well. Both are handed back
+ * on the error paths below. After successful queueing they stay with the
+ * new forw_packet (presumably until send_outstanding_bat_packet() has
+ * handled it - TODO confirm).
+ */
+static void new_aggregated_packet(unsigned char *packet_buff, int packet_len,
+				  unsigned long send_time, bool direct_link,
+				  struct hard_iface *if_incoming,
+				  int own_packet)
+{
+	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct forw_packet *forw_packet_aggr;
+	unsigned char *skb_buff;
+
+	if (!atomic_inc_not_zero(&if_incoming->refcount))
+		return;
+
+	/* own packet should always be scheduled, so only foreign packets
+	 * are limited by the queue counter */
+	if (!own_packet) {
+		if (!atomic_dec_not_zero(&bat_priv->batman_queue_left)) {
+			bat_dbg(DBG_BATMAN, bat_priv,
+				"batman packet queue full\n");
+			goto out;
+		}
+	}
+
+	forw_packet_aggr = kmalloc(sizeof(struct forw_packet), GFP_ATOMIC);
+	if (!forw_packet_aggr) {
+		if (!own_packet)
+			atomic_inc(&bat_priv->batman_queue_left);
+		goto out;
+	}
+
+	if ((atomic_read(&bat_priv->aggregated_ogms)) &&
+	    (packet_len < MAX_AGGREGATION_BYTES)) /* room for later packets */
+		forw_packet_aggr->skb = dev_alloc_skb(MAX_AGGREGATION_BYTES +
+						      sizeof(struct ethhdr));
+	else
+		forw_packet_aggr->skb = dev_alloc_skb(packet_len +
+						      sizeof(struct ethhdr));
+
+	if (!forw_packet_aggr->skb) {
+		if (!own_packet)
+			atomic_inc(&bat_priv->batman_queue_left);
+		kfree(forw_packet_aggr);
+		goto out;
+	}
+	skb_reserve(forw_packet_aggr->skb, sizeof(struct ethhdr));
+
+	INIT_HLIST_NODE(&forw_packet_aggr->list);
+
+	skb_buff = skb_put(forw_packet_aggr->skb, packet_len);
+	forw_packet_aggr->packet_len = packet_len;
+	memcpy(skb_buff, packet_buff, packet_len);
+
+	forw_packet_aggr->own = own_packet;
+	forw_packet_aggr->if_incoming = if_incoming;
+	forw_packet_aggr->num_packets = 0;
+	forw_packet_aggr->direct_link_flags = 0;
+	forw_packet_aggr->send_time = send_time;
+
+	/* save packet direct link flag status; bit 0 belongs to this first
+	 * packet, aggregate() sets the bits of the packets appended later */
+	if (direct_link)
+		forw_packet_aggr->direct_link_flags |= 1;
+
+	/* add new packet to packet list */
+	spin_lock_bh(&bat_priv->forw_bat_list_lock);
+	hlist_add_head(&forw_packet_aggr->list, &bat_priv->forw_bat_list);
+	spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+	/* start timer for this packet
+	 * NOTE(review): send_time - jiffies is an unsigned subtraction; a
+	 * send_time that already lies in the past would wrap to a very long
+	 * delay - confirm that callers always pass a future send_time */
+	INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
+			  send_outstanding_bat_packet);
+	queue_delayed_work(bat_event_workqueue,
+			   &forw_packet_aggr->delayed_work,
+			   send_time - jiffies);
+
+	return;
+out:
+	hardif_free_ref(if_incoming);
+}
+
+/* append one more packet to an aggregate that is already queued */
+static void aggregate(struct forw_packet *forw_packet_aggr,
+		      unsigned char *packet_buff,
+		      int packet_len,
+		      bool direct_link)
+{
+	unsigned char *tail = skb_put(forw_packet_aggr->skb, packet_len);
+
+	memcpy(tail, packet_buff, packet_len);
+	forw_packet_aggr->num_packets++;
+	forw_packet_aggr->packet_len += packet_len;
+
+	if (!direct_link)
+		return;
+
+	/* the bit position equals the index of the packet just appended */
+	forw_packet_aggr->direct_link_flags |=
+		(1 << forw_packet_aggr->num_packets);
+}
+
+/*
+ * Queue a batman packet for sending.
+ *
+ * With aggregation enabled, a packet that is not our own is appended to
+ * the first queued aggregate accepted by can_aggregate_with(). Otherwise a
+ * new aggregate is created; for foreign packets with aggregation enabled
+ * its send time is pushed back by MAX_AGGREGATION_MS so that later packets
+ * may still join it.
+ */
+void add_bat_packet_to_list(struct bat_priv *bat_priv,
+			    unsigned char *packet_buff, int packet_len,
+			    struct hard_iface *if_incoming, char own_packet,
+			    unsigned long send_time)
+{
+	struct batman_packet *batman_packet;
+	struct forw_packet *candidate = NULL, *target = NULL;
+	struct hlist_node *node;
+	bool direct_link;
+
+	batman_packet = (struct batman_packet *)packet_buff;
+	direct_link = batman_packet->flags & DIRECTLINK ? 1 : 0;
+
+	/* search the forward queue under its lock */
+	spin_lock_bh(&bat_priv->forw_bat_list_lock);
+
+	/* own packets are not to be aggregated */
+	if ((atomic_read(&bat_priv->aggregated_ogms)) && (!own_packet)) {
+		hlist_for_each_entry(candidate, node,
+				     &bat_priv->forw_bat_list, list) {
+			if (!can_aggregate_with(batman_packet, packet_len,
+						send_time, direct_link,
+						if_incoming, candidate))
+				continue;
+
+			target = candidate;
+			break;
+		}
+	}
+
+	if (target) {
+		/* append while the lock is still held */
+		aggregate(target, packet_buff, packet_len, direct_link);
+		spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+		return;
+	}
+
+	/* nothing to aggregate with - the rest can run without the lock */
+	spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+	/* hold a foreign packet back for a while, so that it might be
+	 * aggregated with packets arriving later on */
+	if ((!own_packet) && (atomic_read(&bat_priv->aggregated_ogms)))
+		send_time += msecs_to_jiffies(MAX_AGGREGATION_MS);
+
+	new_aggregated_packet(packet_buff, packet_len, send_time,
+			      direct_link, if_incoming, own_packet);
+}
+
+/*
+ * unpack the aggregated packets and process them one by one
+ *
+ * ethhdr:      ethernet header, passed on to receive_bat_packet() as is
+ * packet_buff: start of the first batman packet
+ * packet_len:  number of bytes available at packet_buff
+ * if_incoming: interface, passed on to receive_bat_packet() as is
+ *
+ * The first packet is processed unconditionally, so the caller has to make
+ * sure that the buffer holds at least one packet. Every further packet is
+ * only touched after its header is known to lie inside the buffer.
+ */
+void receive_aggr_bat_packet(struct ethhdr *ethhdr, unsigned char *packet_buff,
+			     int packet_len, struct hard_iface *if_incoming)
+{
+	struct batman_packet *batman_packet;
+	int buff_pos = 0;
+	unsigned char *tt_buff;
+
+	batman_packet = (struct batman_packet *)packet_buff;
+
+	do {
+		/* convert our 32bit seqno from network to host order,
+		 * in place */
+		batman_packet->seqno = ntohl(batman_packet->seqno);
+
+		tt_buff = packet_buff + buff_pos + BAT_PACKET_LEN;
+		receive_bat_packet(ethhdr, batman_packet,
+				   tt_buff, tt_len(batman_packet),
+				   if_incoming);
+
+		buff_pos += BAT_PACKET_LEN + tt_len(batman_packet);
+		batman_packet = (struct batman_packet *)
+			(packet_buff + buff_pos);
+
+		/* num_tt is a field of the next header: that header has to
+		 * be inside the buffer before the field may be read */
+	} while ((buff_pos + BAT_PACKET_LEN <= packet_len) &&
+		 aggregated_packet(buff_pos, packet_len,
+				   batman_packet->num_tt));
+}
diff --git a/net/batman-adv/aggregation.h b/net/batman-adv/aggregation.h
new file mode 100644
index 00000000..7e6d72fb
--- /dev/null
+++ b/net/batman-adv/aggregation.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_AGGREGATION_H_
+#define _NET_BATMAN_ADV_AGGREGATION_H_
+
+#include "main.h"
+
+/* is there another aggregated packet here? i.e. do a header plus num_tt
+ * tt entries starting at buff_pos fit into the received data and into
+ * the aggregation limit */
+static inline int aggregated_packet(int buff_pos, int packet_len, int num_tt)
+{
+	int packet_end = buff_pos + BAT_PACKET_LEN + (num_tt * ETH_ALEN);
+
+	if (packet_end > packet_len)
+		return 0;
+
+	return packet_end <= MAX_AGGREGATION_BYTES;
+}
+
+void add_bat_packet_to_list(struct bat_priv *bat_priv,
+			    unsigned char *packet_buff, int packet_len,
+			    struct hard_iface *if_incoming, char own_packet,
+			    unsigned long send_time);
+void receive_aggr_bat_packet(struct ethhdr *ethhdr, unsigned char *packet_buff,
+			     int packet_len, struct hard_iface *if_incoming);
+
+#endif /* _NET_BATMAN_ADV_AGGREGATION_H_ */
diff --git a/net/batman-adv/bat_debugfs.c b/net/batman-adv/bat_debugfs.c
new file mode 100644
index 00000000..abaeec5f
--- /dev/null
+++ b/net/batman-adv/bat_debugfs.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+
+#include <linux/debugfs.h>
+
+#include "bat_debugfs.h"
+#include "translation-table.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "soft-interface.h"
+#include "vis.h"
+#include "icmp_socket.h"
+
+static struct dentry *bat_debugfs;
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+#define LOG_BUFF_MASK (log_buff_len-1)	/* NOTE(review): needs 2^n length */
+#define LOG_BUFF(idx) (debug_log->log_buff[(idx) & LOG_BUFF_MASK]) /* uses local debug_log */
+
+static int log_buff_len = LOG_BUF_LEN;	/* capacity of the ring buffer */
+
+/*
+ * Store one character at the write position of the ring buffer and advance
+ * that position. Once more than log_buff_len characters are pending, the
+ * read position is moved forward, i.e. the oldest characters are dropped.
+ */
+static void emit_log_char(struct debug_log *debug_log, char c)
+{
+	LOG_BUFF(debug_log->log_end) = c;
+	debug_log->log_end++;
+
+	/* everything that is still unread fits into the buffer */
+	if (debug_log->log_end - debug_log->log_start <= log_buff_len)
+		return;
+
+	debug_log->log_start = debug_log->log_end - log_buff_len;
+}
+
+/*
+ * Format a message, append it to the given log ring buffer and wake up
+ * everybody waiting on the log. Does nothing if no log has been set up.
+ * Output beyond 255 characters is cut off. Always returns 0.
+ *
+ * NOTE(review): debug_log_buf is static and therefore shared by all
+ * callers, while the lock taken here belongs to a single debug_log
+ * instance - confirm that two instances can never log concurrently.
+ */
+static int fdebug_log(struct debug_log *debug_log, char *fmt, ...)
+{
+	static char debug_log_buf[256];
+	va_list args;
+	char *pos = debug_log_buf;
+
+	if (!debug_log)
+		return 0;
+
+	spin_lock_bh(&debug_log->lock);
+
+	va_start(args, fmt);
+	vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
+	va_end(args);
+
+	while (*pos != 0) {
+		emit_log_char(debug_log, *pos);
+		pos++;
+	}
+
+	spin_unlock_bh(&debug_log->lock);
+
+	wake_up(&debug_log->queue_wait);
+
+	return 0;
+}
+
+/*
+ * Write a message to the debug log of a mesh interface.
+ *
+ * The message is formatted into a 256 byte buffer on the stack (longer
+ * output is cut off) and prefixed with the value of jiffies / HZ.
+ * Always returns 0.
+ */
+int debug_log(struct bat_priv *bat_priv, char *fmt, ...)
+{
+	va_list args;
+	char tmp_log_buf[256];
+
+	va_start(args, fmt);
+	vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
+	va_end(args);
+
+	/* jiffies / HZ is an unsigned long and has to be printed as such */
+	fdebug_log(bat_priv->debug_log, "[%10lu] %s",
+		   (jiffies / HZ), tmp_log_buf);
+
+	return 0;
+}
+
+/* open handler of the "log" file: mark the file as not seekable, hand the
+ * inode's private pointer to the file and bump the module count */
+static int log_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	nonseekable_open(inode, file);
+	file->private_data = priv;
+	inc_module_count();
+
+	return 0;
+}
+
+static int log_release(struct inode *inode, struct file *file)
+{
+	dec_module_count();	/* pairs with inc_module_count() in log_open() */
+	return 0;
+}
+
+/*
+ * read handler of the "log" file
+ *
+ * Waits (interruptibly) until the ring buffer holds data. If the file is
+ * in O_NONBLOCK mode and the buffer is empty, -EAGAIN is returned instead.
+ * Then up to count characters are consumed from the ring buffer and copied
+ * to user space one at a time.
+ *
+ * Returns the number of characters handed out or a negative error code.
+ */
+static ssize_t log_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct bat_priv *bat_priv = file->private_data;
+	struct debug_log *debug_log = bat_priv->debug_log;
+	int error, i = 0;
+	char c;
+
+	if ((file->f_flags & O_NONBLOCK) &&
+	    !(debug_log->log_end - debug_log->log_start))
+		return -EAGAIN;
+
+	/* count is a size_t and can never be negative, only buf needs
+	 * to be checked here */
+	if (!buf)
+		return -EINVAL;
+
+	if (count == 0)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	error = wait_event_interruptible(debug_log->queue_wait,
+				(debug_log->log_start - debug_log->log_end));
+
+	if (error)
+		return error;
+
+	spin_lock_bh(&debug_log->lock);
+
+	while ((!error) && (i < count) &&
+	       (debug_log->log_start != debug_log->log_end)) {
+		c = LOG_BUFF(debug_log->log_start);
+
+		debug_log->log_start++;
+
+		/* the copy to user space is done without holding the lock */
+		spin_unlock_bh(&debug_log->lock);
+
+		error = __put_user(c, buf);
+
+		spin_lock_bh(&debug_log->lock);
+
+		buf++;
+		i++;
+
+	}
+
+	spin_unlock_bh(&debug_log->lock);
+
+	if (!error)
+		return i;
+
+	return error;
+}
+
+/* poll handler of the "log" file: the file is reported readable as soon
+ * as the ring buffer is not empty */
+static unsigned int log_poll(struct file *file, poll_table *wait)
+{
+	struct bat_priv *bat_priv = file->private_data;
+	struct debug_log *debug_log = bat_priv->debug_log;
+	unsigned int mask = 0;
+
+	poll_wait(file, &debug_log->queue_wait, wait);
+
+	if (debug_log->log_end - debug_log->log_start)
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static const struct file_operations log_fops = {
+	.open           = log_open,	/* calls inc_module_count() */
+	.release        = log_release,	/* calls dec_module_count() */
+	.read           = log_read,	/* consumes what it returns */
+	.poll           = log_poll,
+	.llseek         = no_llseek,	/* log_open() uses nonseekable_open() */
+};
+
+/*
+ * Allocate the log ring buffer of a mesh interface and expose it as the
+ * "log" file inside the interface's debugfs directory.
+ *
+ * Returns 0 on success and 1 if the debugfs directory is missing, the
+ * allocation failed or the file could not be created. A buffer that has
+ * been allocated stays attached to bat_priv in every case and is released
+ * by debug_log_cleanup().
+ */
+static int debug_log_setup(struct bat_priv *bat_priv)
+{
+	struct dentry *d;
+
+	if (!bat_priv->debug_dir)
+		goto err;
+
+	bat_priv->debug_log = kzalloc(sizeof(struct debug_log), GFP_ATOMIC);
+	if (!bat_priv->debug_log)
+		goto err;
+
+	spin_lock_init(&bat_priv->debug_log->lock);
+	init_waitqueue_head(&bat_priv->debug_log->queue_wait);
+
+	d = debugfs_create_file("log", S_IFREG | S_IRUSR,
+				bat_priv->debug_dir, bat_priv, &log_fops);
+	/* a NULL dentry means that the file could not be created */
+	if (!d)
+		goto err;
+
+	return 0;
+
+err:
+	return 1;
+}
+
+static void debug_log_cleanup(struct bat_priv *bat_priv)
+{
+	kfree(bat_priv->debug_log);	/* kfree(NULL) is a no-op */
+	bat_priv->debug_log = NULL;	/* fdebug_log() ignores a NULL log */
+}
+#else /* CONFIG_BATMAN_ADV_DEBUG */
+static int debug_log_setup(struct bat_priv *bat_priv)
+{
+	bat_priv->debug_log = NULL;	/* no log without CONFIG_BATMAN_ADV_DEBUG */
+	return 0;
+}
+
+/* nothing to release when the debug log is compiled out */
+static void debug_log_cleanup(struct bat_priv *bat_priv)
+{
+}
+#endif
+
+/* open handler of the "originators" debugfs file */
+static int originators_open(struct inode *inode, struct file *file)
+{
+	struct net_device *net_dev = inode->i_private;
+
+	return single_open(file, orig_seq_print_text, net_dev);
+}
+
+/* open handler of the "gateways" debugfs file */
+static int gateways_open(struct inode *inode, struct file *file)
+{
+	struct net_device *net_dev = inode->i_private;
+
+	return single_open(file, gw_client_seq_print_text, net_dev);
+}
+
+/* open handler of the "softif_neigh" debugfs file */
+static int softif_neigh_open(struct inode *inode, struct file *file)
+{
+	struct net_device *net_dev = inode->i_private;
+
+	return single_open(file, softif_neigh_seq_print_text, net_dev);
+}
+
+/* open handler of the "transtable_global" debugfs file */
+static int transtable_global_open(struct inode *inode, struct file *file)
+{
+	struct net_device *net_dev = inode->i_private;
+
+	return single_open(file, tt_global_seq_print_text, net_dev);
+}
+
+/* open handler of the "transtable_local" debugfs file */
+static int transtable_local_open(struct inode *inode, struct file *file)
+{
+	struct net_device *net_dev = inode->i_private;
+
+	return single_open(file, tt_local_seq_print_text, net_dev);
+}
+
+/* open handler of the "vis_data" debugfs file */
+static int vis_data_open(struct inode *inode, struct file *file)
+{
+	struct net_device *net_dev = inode->i_private;
+
+	return single_open(file, vis_seq_print_text, net_dev);
+}
+
+struct bat_debuginfo {
+	struct attribute attr;	/* name and mode of the debugfs file */
+	const struct file_operations fops;	/* its file operations */
+};
+
+#define BAT_DEBUGINFO(_name, _mode, _open)	\
+struct bat_debuginfo bat_debuginfo_##_name = {	\
+	.attr = { .name = __stringify(_name),	\
+		  .mode = _mode, },		\
+	.fops = { .owner = THIS_MODULE,		\
+		  .open = _open,		\
+		  .read	= seq_read,		\
+		  .llseek = seq_lseek,		\
+		  .release = single_release,	\
+		}				\
+};
+
+static BAT_DEBUGINFO(originators, S_IRUGO, originators_open);
+static BAT_DEBUGINFO(gateways, S_IRUGO, gateways_open);
+static BAT_DEBUGINFO(softif_neigh, S_IRUGO, softif_neigh_open);
+static BAT_DEBUGINFO(transtable_global, S_IRUGO, transtable_global_open);
+static BAT_DEBUGINFO(transtable_local, S_IRUGO, transtable_local_open);
+static BAT_DEBUGINFO(vis_data, S_IRUGO, vis_data_open);
+
+static struct bat_debuginfo *mesh_debuginfos[] = {
+	&bat_debuginfo_originators,
+	&bat_debuginfo_gateways,
+	&bat_debuginfo_softif_neigh,
+	&bat_debuginfo_transtable_global,
+	&bat_debuginfo_transtable_local,
+	&bat_debuginfo_vis_data,
+	NULL,	/* end marker, debugfs_add_meshif() stops here */
+};
+
+/* create the module wide debugfs directory; bat_debugfs ends up NULL if
+ * debugfs_create_dir() reports -ENODEV */
+void debugfs_init(void)
+{
+	struct dentry *dir = debugfs_create_dir(DEBUGFS_BAT_SUBDIR, NULL);
+
+	bat_debugfs = (dir == ERR_PTR(-ENODEV)) ? NULL : dir;
+}
+
+/* remove the module wide debugfs directory and everything below it */
+void debugfs_destroy(void)
+{
+	if (!bat_debugfs)
+		return;
+
+	debugfs_remove_recursive(bat_debugfs);
+	bat_debugfs = NULL;
+}
+
+/*
+ * Create the debugfs directory of a mesh interface and populate it with
+ * the socket file, the log file and all files listed in mesh_debuginfos.
+ *
+ * Returns 0 on success. If the directory or one of the mesh_debuginfos
+ * files cannot be created, the directory is removed again (if it exists)
+ * and -ENOMEM is returned - or 0 when CONFIG_DEBUG_FS is not set. The
+ * results of bat_socket_setup() and debug_log_setup() are not evaluated.
+ */
+int debugfs_add_meshif(struct net_device *dev)
+{
+	struct bat_priv *bat_priv = netdev_priv(dev);
+	struct bat_debuginfo *info;
+	struct dentry *file;
+	int i;
+
+	if (!bat_debugfs)
+		goto out;
+
+	bat_priv->debug_dir = debugfs_create_dir(dev->name, bat_debugfs);
+	if (!bat_priv->debug_dir)
+		goto out;
+
+	bat_socket_setup(bat_priv);
+	debug_log_setup(bat_priv);
+
+	for (i = 0; mesh_debuginfos[i]; i++) {
+		info = mesh_debuginfos[i];
+
+		file = debugfs_create_file(info->attr.name,
+					   S_IFREG | info->attr.mode,
+					   bat_priv->debug_dir, dev,
+					   &info->fops);
+		if (file)
+			continue;
+
+		bat_err(dev, "Can't add debugfs file: %s/%s\n",
+			dev->name, info->attr.name);
+		goto rem_attr;
+	}
+
+	return 0;
+
+rem_attr:
+	debugfs_remove_recursive(bat_priv->debug_dir);
+	bat_priv->debug_dir = NULL;
+out:
+#ifdef CONFIG_DEBUG_FS
+	return -ENOMEM;
+#else
+	return 0;
+#endif /* CONFIG_DEBUG_FS */
+}
+
+/* release the debug log of a mesh interface and, if the module wide
+ * directory exists, remove the interface's debugfs directory */
+void debugfs_del_meshif(struct net_device *dev)
+{
+	struct bat_priv *bat_priv = netdev_priv(dev);
+
+	debug_log_cleanup(bat_priv);
+
+	if (!bat_debugfs)
+		return;
+
+	debugfs_remove_recursive(bat_priv->debug_dir);
+	bat_priv->debug_dir = NULL;
+}
diff --git a/net/batman-adv/bat_debugfs.h b/net/batman-adv/bat_debugfs.h
new file mode 100644
index 00000000..bc9cda3f
--- /dev/null
+++ b/net/batman-adv/bat_debugfs.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+
+#ifndef _NET_BATMAN_ADV_DEBUGFS_H_
+#define _NET_BATMAN_ADV_DEBUGFS_H_
+
+#define DEBUGFS_BAT_SUBDIR "batman_adv"
+
+void debugfs_init(void);
+void debugfs_destroy(void);
+int debugfs_add_meshif(struct net_device *dev);
+void debugfs_del_meshif(struct net_device *dev);
+
+#endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */
diff --git a/net/batman-adv/bat_sysfs.c b/net/batman-adv/bat_sysfs.c
new file mode 100644
index 00000000..497a0700
--- /dev/null
+++ b/net/batman-adv/bat_sysfs.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "bat_sysfs.h"
+#include "translation-table.h"
+#include "originator.h"
+#include "hard-interface.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "vis.h"
+
+/* helpers to get from a sysfs kobject to the net_device / bat_priv behind
+ * its parent kobject; the argument is parenthesized before it is
+ * dereferenced so that any pointer expression may be passed */
+#define to_dev(obj)		container_of(obj, struct device, kobj)
+#define kobj_to_netdev(obj)	to_net_dev(to_dev((obj)->parent))
+#define kobj_to_batpriv(obj)	netdev_priv(kobj_to_netdev(obj))
+
+/* Use this, if you have customized show and store functions.
+ * Defines a struct bat_attribute called bat_attr_<_name>; the name of the
+ * attribute is the stringified _name. The storage class is left to the
+ * user of the macro, e.g. "static BAT_ATTR(...)". */
+#define BAT_ATTR(_name, _mode, _show, _store)	\
+struct bat_attribute bat_attr_##_name = {	\
+	.attr = {.name = __stringify(_name),	\
+		 .mode = _mode },		\
+	.show   = _show,			\
+	.store  = _store,			\
+};
+
+#define BAT_ATTR_STORE_BOOL(_name, _post_func)				\
+ssize_t store_##_name(struct kobject *kobj, struct attribute *attr,	\
+		      char *buff, size_t count)				\
+{									\
+	struct net_device *net_dev = kobj_to_netdev(kobj);		\
+	struct bat_priv *bat_priv = netdev_priv(net_dev);		\
+	return __store_bool_attr(buff, count, _post_func, attr,		\
+				 &bat_priv->_name, net_dev);		\
+}
+
+#define BAT_ATTR_SHOW_BOOL(_name)					\
+ssize_t show_##_name(struct kobject *kobj, struct attribute *attr,	\
+			    char *buff)					\
+{									\
+	struct bat_priv *bat_priv = kobj_to_batpriv(kobj);		\
+	return sprintf(buff, "%s\n",					\
+		       atomic_read(&bat_priv->_name) == 0 ?		\
+		       "disabled" : "enabled");				\
+}									\
+
+/* Use this, if you are going to turn a [name] in bat_priv on or off.
+ * Expands to the static handlers store_<name>() and show_<name>(), which
+ * work on the atomic_t bat_priv-><name>, and to the matching
+ * bat_attr_<name>. _post_func may be NULL. */
+#define BAT_ATTR_BOOL(_name, _mode, _post_func)				\
+	static BAT_ATTR_STORE_BOOL(_name, _post_func)			\
+	static BAT_ATTR_SHOW_BOOL(_name)				\
+	static BAT_ATTR(_name, _mode, show_##_name, store_##_name)
+
+
+#define BAT_ATTR_STORE_UINT(_name, _min, _max, _post_func)		\
+ssize_t store_##_name(struct kobject *kobj, struct attribute *attr,	\
+			     char *buff, size_t count)			\
+{									\
+	struct net_device *net_dev = kobj_to_netdev(kobj);		\
+	struct bat_priv *bat_priv = netdev_priv(net_dev);		\
+	return __store_uint_attr(buff, count, _min, _max, _post_func,	\
+				 attr, &bat_priv->_name, net_dev);	\
+}
+
+#define BAT_ATTR_SHOW_UINT(_name)					\
+ssize_t show_##_name(struct kobject *kobj, struct attribute *attr,	\
+			    char *buff)					\
+{									\
+	struct bat_priv *bat_priv = kobj_to_batpriv(kobj);		\
+	return sprintf(buff, "%i\n", atomic_read(&bat_priv->_name));	\
+}									\
+
+/* Use this, if you are going to set [name] in bat_priv to unsigned integer
+ * values only.
+ * _min and _max are the inclusive limits checked by store_uint_attr();
+ * _post_func may be NULL. */
+#define BAT_ATTR_UINT(_name, _mode, _min, _max, _post_func)		\
+	static BAT_ATTR_STORE_UINT(_name, _min, _max, _post_func)	\
+	static BAT_ATTR_SHOW_UINT(_name)				\
+	static BAT_ATTR(_name, _mode, show_##_name, store_##_name)
+
+
+/*
+ * Parse a boolean sysfs value and store it in attr.
+ *
+ * Accepted are "1", "enable", "enabled" and "0", "disable", "disabled".
+ * A newline at the very end of the input is cut off in place first.
+ * Returns count if the input was accepted (also when the value did not
+ * change) and -EINVAL otherwise.
+ */
+static int store_bool_attr(char *buff, size_t count,
+			   struct net_device *net_dev,
+			   char *attr_name, atomic_t *attr)
+{
+	int enabled = -1;
+
+	if (buff[count - 1] == '\n')
+		buff[count - 1] = '\0';
+
+	if (!strncmp(buff, "1", 2) ||
+	    !strncmp(buff, "enable", 7) ||
+	    !strncmp(buff, "enabled", 8))
+		enabled = 1;
+
+	if (!strncmp(buff, "0", 2) ||
+	    !strncmp(buff, "disable", 8) ||
+	    !strncmp(buff, "disabled", 9))
+		enabled = 0;
+
+	if (enabled < 0) {
+		bat_info(net_dev,
+			 "%s: Invalid parameter received: %s\n",
+			 attr_name, buff);
+		return -EINVAL;
+	}
+
+	/* only log and write when the value really changes */
+	if (atomic_read(attr) != enabled) {
+		bat_info(net_dev, "%s: Changing from: %s to: %s\n", attr_name,
+			 atomic_read(attr) == 1 ? "enabled" : "disabled",
+			 enabled == 1 ? "enabled" : "disabled");
+
+		atomic_set(attr, (unsigned)enabled);
+	}
+
+	return count;
+}
+
+/*
+ * Common body of the store handlers generated by BAT_ATTR_STORE_BOOL:
+ * store the value via store_bool_attr() and run the optional post_func.
+ *
+ * post_func is only called when the input was accepted. A rejected write
+ * (negative return value of store_bool_attr()) does not trigger it.
+ * Returns whatever store_bool_attr() returned.
+ */
+static inline ssize_t __store_bool_attr(char *buff, size_t count,
+			void (*post_func)(struct net_device *),
+			struct attribute *attr,
+			atomic_t *attr_store, struct net_device *net_dev)
+{
+	int ret;
+
+	ret = store_bool_attr(buff, count, net_dev, (char *)attr->name,
+			      attr_store);
+	if (post_func && ret > 0)
+		post_func(net_dev);
+
+	return ret;
+}
+
+/*
+ * Parse a decimal number from buff and store it in attr if it lies within
+ * [min, max].
+ *
+ * Returns count if the value was accepted (also when it did not change)
+ * and -EINVAL if the input is not a number or out of range.
+ */
+static int store_uint_attr(char *buff, size_t count,
+			   struct net_device *net_dev, char *attr_name,
+			   unsigned int min, unsigned int max, atomic_t *attr)
+{
+	unsigned long parsed;
+
+	if (strict_strtoul(buff, 10, &parsed)) {
+		bat_info(net_dev,
+			 "%s: Invalid parameter received: %s\n",
+			 attr_name, buff);
+		return -EINVAL;
+	}
+
+	if (parsed < min) {
+		bat_info(net_dev, "%s: Value is too small: %lu min: %u\n",
+			 attr_name, parsed, min);
+		return -EINVAL;
+	}
+
+	if (parsed > max) {
+		bat_info(net_dev, "%s: Value is too big: %lu max: %u\n",
+			 attr_name, parsed, max);
+		return -EINVAL;
+	}
+
+	/* only log and write when the value really changes */
+	if (atomic_read(attr) != parsed) {
+		bat_info(net_dev, "%s: Changing from: %i to: %lu\n",
+			 attr_name, atomic_read(attr), parsed);
+
+		atomic_set(attr, parsed);
+	}
+
+	return count;
+}
+
+/*
+ * Common body of the store handlers generated by BAT_ATTR_STORE_UINT:
+ * store the value via store_uint_attr() and run the optional post_func.
+ *
+ * post_func is only called when the input was accepted. A rejected write
+ * (negative return value of store_uint_attr()) does not trigger it.
+ * Returns whatever store_uint_attr() returned.
+ */
+static inline ssize_t __store_uint_attr(char *buff, size_t count,
+			int min, int max,
+			void (*post_func)(struct net_device *),
+			struct attribute *attr,
+			atomic_t *attr_store, struct net_device *net_dev)
+{
+	int ret;
+
+	ret = store_uint_attr(buff, count, net_dev, (char *)attr->name,
+			      min, max, attr_store);
+	if (post_func && ret > 0)
+		post_func(net_dev);
+
+	return ret;
+}
+
+/* sysfs show handler of "vis_mode": prints "client" for
+ * VIS_TYPE_CLIENT_UPDATE and "server" for every other value */
+static ssize_t show_vis_mode(struct kobject *kobj, struct attribute *attr,
+			     char *buff)
+{
+	struct bat_priv *bat_priv = kobj_to_batpriv(kobj);
+	const char *mode_name = "server";
+
+	if (atomic_read(&bat_priv->vis_mode) == VIS_TYPE_CLIENT_UPDATE)
+		mode_name = "client";
+
+	return sprintf(buff, "%s\n", mode_name);
+}
+
+/*
+ * sysfs store handler of "vis_mode"
+ *
+ * Client mode is selected by input starting with "client" or "off", server
+ * mode by input starting with "server". A write of exactly two bytes that
+ * parses as the number VIS_TYPE_CLIENT_UPDATE or VIS_TYPE_SERVER_SYNC
+ * selects the respective mode as well.
+ * Returns count, or -EINVAL for anything else.
+ */
+static ssize_t store_vis_mode(struct kobject *kobj, struct attribute *attr,
+			      char *buff, size_t count)
+{
+	struct net_device *net_dev = kobj_to_netdev(kobj);
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	unsigned long val;
+	int vis_mode_tmp = -1;
+	int ret = strict_strtoul(buff, 10, &val);
+	/* val may only be looked at if this is true */
+	bool is_number = (count == 2) && (!ret);
+
+	if ((is_number && (val == VIS_TYPE_CLIENT_UPDATE)) ||
+	    (strncmp(buff, "client", 6) == 0) ||
+	    (strncmp(buff, "off", 3) == 0))
+		vis_mode_tmp = VIS_TYPE_CLIENT_UPDATE;
+
+	if ((is_number && (val == VIS_TYPE_SERVER_SYNC)) ||
+	    (strncmp(buff, "server", 6) == 0))
+		vis_mode_tmp = VIS_TYPE_SERVER_SYNC;
+
+	if (vis_mode_tmp < 0) {
+		/* cut off the newline for the sake of the log message */
+		if (buff[count - 1] == '\n')
+			buff[count - 1] = '\0';
+
+		bat_info(net_dev,
+			 "Invalid parameter for 'vis mode' setting received: "
+			 "%s\n", buff);
+		return -EINVAL;
+	}
+
+	if (atomic_read(&bat_priv->vis_mode) != vis_mode_tmp) {
+		bat_info(net_dev, "Changing vis mode from: %s to: %s\n",
+			 atomic_read(&bat_priv->vis_mode) ==
+			 VIS_TYPE_CLIENT_UPDATE ? "client" : "server",
+			 vis_mode_tmp == VIS_TYPE_CLIENT_UPDATE ?
+			 "client" : "server");
+
+		atomic_set(&bat_priv->vis_mode, (unsigned)vis_mode_tmp);
+	}
+
+	return count;
+}
+
+/* post_func adapter: calls gw_deselect() for the bat_priv of net_dev */
+static void post_gw_deselect(struct net_device *net_dev)
+{
+	gw_deselect(netdev_priv(net_dev));
+}
+
+/* sysfs show handler of "gw_mode": prints the name of the current mode,
+ * every value other than client or server is shown as off */
+static ssize_t show_gw_mode(struct kobject *kobj, struct attribute *attr,
+			    char *buff)
+{
+	struct bat_priv *bat_priv = kobj_to_batpriv(kobj);
+	const char *mode_name;
+
+	switch (atomic_read(&bat_priv->gw_mode)) {
+	case GW_MODE_CLIENT:
+		mode_name = GW_MODE_CLIENT_NAME;
+		break;
+	case GW_MODE_SERVER:
+		mode_name = GW_MODE_SERVER_NAME;
+		break;
+	default:
+		mode_name = GW_MODE_OFF_NAME;
+		break;
+	}
+
+	return sprintf(buff, "%s\n", mode_name);
+}
+
+/*
+ * sysfs store handler of "gw_mode"
+ *
+ * The input only has to start with one of the three mode names. A newline
+ * at the very end is cut off in place. If the mode changes, gw_deselect()
+ * is called before the new mode is stored.
+ * Returns count, or -EINVAL for unknown input.
+ */
+static ssize_t store_gw_mode(struct kobject *kobj, struct attribute *attr,
+			     char *buff, size_t count)
+{
+	struct net_device *net_dev = kobj_to_netdev(kobj);
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	char *old_mode_name;
+	int new_mode = -1;
+
+	if (buff[count - 1] == '\n')
+		buff[count - 1] = '\0';
+
+	if (!strncmp(buff, GW_MODE_OFF_NAME, strlen(GW_MODE_OFF_NAME)))
+		new_mode = GW_MODE_OFF;
+
+	if (!strncmp(buff, GW_MODE_CLIENT_NAME, strlen(GW_MODE_CLIENT_NAME)))
+		new_mode = GW_MODE_CLIENT;
+
+	if (!strncmp(buff, GW_MODE_SERVER_NAME, strlen(GW_MODE_SERVER_NAME)))
+		new_mode = GW_MODE_SERVER;
+
+	if (new_mode < 0) {
+		bat_info(net_dev,
+			 "Invalid parameter for 'gw mode' setting received: "
+			 "%s\n", buff);
+		return -EINVAL;
+	}
+
+	if (atomic_read(&bat_priv->gw_mode) == new_mode)
+		return count;
+
+	/* name of the mode that is about to be replaced, for the log */
+	switch (atomic_read(&bat_priv->gw_mode)) {
+	case GW_MODE_SERVER:
+		old_mode_name = GW_MODE_SERVER_NAME;
+		break;
+	case GW_MODE_CLIENT:
+		old_mode_name = GW_MODE_CLIENT_NAME;
+		break;
+	default:
+		old_mode_name = GW_MODE_OFF_NAME;
+		break;
+	}
+
+	bat_info(net_dev, "Changing gw mode from: %s to: %s\n",
+		 old_mode_name, buff);
+
+	gw_deselect(bat_priv);
+	atomic_set(&bat_priv->gw_mode, (unsigned)new_mode);
+
+	return count;
+}
+
+/* sysfs show handler: prints the configured gateway bandwidth as
+ * "<down><unit>/<up><unit>"; values above 2048 are divided by 1024 and
+ * labelled "MBit", all others are labelled "KBit" */
+static ssize_t show_gw_bwidth(struct kobject *kobj, struct attribute *attr,
+			      char *buff)
+{
+	struct bat_priv *bat_priv = kobj_to_batpriv(kobj);
+	int down, up;
+	int down_in_mbit, up_in_mbit;
+
+	gw_bandwidth_to_kbit(atomic_read(&bat_priv->gw_bandwidth), &down, &up);
+
+	down_in_mbit = down > 2048;
+	up_in_mbit = up > 2048;
+
+	return sprintf(buff, "%i%s/%i%s\n",
+		       down_in_mbit ? down / 1024 : down,
+		       down_in_mbit ? "MBit" : "KBit",
+		       up_in_mbit ? up / 1024 : up,
+		       up_in_mbit ? "MBit" : "KBit");
+}
+
+/* sysfs store handler: strips a trailing newline from buff and forwards the
+ * string to gw_bandwidth_set(), whose return value is passed on */
+static ssize_t store_gw_bwidth(struct kobject *kobj, struct attribute *attr,
+			       char *buff, size_t count)
+{
+	size_t last = count - 1;
+
+	if (buff[last] == '\n')
+		buff[last] = '\0';
+
+	return gw_bandwidth_set(kobj_to_netdev(kobj), buff, count);
+}
+
+/* attributes of the mesh interface; each invocation below presumably declares
+ * the bat_attr_<name> object that mesh_attrs[] refers to (the macros are
+ * defined earlier in this file) */
+BAT_ATTR_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL);
+BAT_ATTR_BOOL(bonding, S_IRUGO | S_IWUSR, NULL);
+BAT_ATTR_BOOL(fragmentation, S_IRUGO | S_IWUSR, update_min_mtu);
+static BAT_ATTR(vis_mode, S_IRUGO | S_IWUSR, show_vis_mode, store_vis_mode);
+static BAT_ATTR(gw_mode, S_IRUGO | S_IWUSR, show_gw_mode, store_gw_mode);
+BAT_ATTR_UINT(orig_interval, S_IRUGO | S_IWUSR, 2 * JITTER, INT_MAX, NULL);
+BAT_ATTR_UINT(hop_penalty, S_IRUGO | S_IWUSR, 0, TQ_MAX_VALUE, NULL);
+BAT_ATTR_UINT(gw_sel_class, S_IRUGO | S_IWUSR, 1, TQ_MAX_VALUE,
+	      post_gw_deselect);
+static BAT_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, show_gw_bwidth,
+		store_gw_bwidth);
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+BAT_ATTR_UINT(log_level, S_IRUGO | S_IWUSR, 0, 3, NULL);
+#endif
+
+/* NULL-terminated list of the files created below SYSFS_IF_MESH_SUBDIR;
+ * walked by sysfs_add_meshif() and sysfs_del_meshif() */
+static struct bat_attribute *mesh_attrs[] = {
+	&bat_attr_aggregated_ogms,
+	&bat_attr_bonding,
+	&bat_attr_fragmentation,
+	&bat_attr_vis_mode,
+	&bat_attr_gw_mode,
+	&bat_attr_orig_interval,
+	&bat_attr_hop_penalty,
+	&bat_attr_gw_sel_class,
+	&bat_attr_gw_bandwidth,
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+	&bat_attr_log_level,
+#endif
+	NULL,
+};
+
+/* creates the SYSFS_IF_MESH_SUBDIR directory below the net device and
+ * populates it with every attribute listed in mesh_attrs[].
+ *
+ * Returns 0 on success. On any failure everything created so far is removed
+ * again, bat_priv->mesh_obj is left NULL and -ENOMEM is returned. */
+int sysfs_add_meshif(struct net_device *dev)
+{
+	struct bat_priv *bat_priv = netdev_priv(dev);
+	struct bat_attribute **entry;
+
+	bat_priv->mesh_obj = kobject_create_and_add(SYSFS_IF_MESH_SUBDIR,
+						    &dev->dev.kobj);
+	if (!bat_priv->mesh_obj) {
+		bat_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
+			SYSFS_IF_MESH_SUBDIR);
+		return -ENOMEM;
+	}
+
+	for (entry = mesh_attrs; *entry; ++entry) {
+		if (!sysfs_create_file(bat_priv->mesh_obj, &((*entry)->attr)))
+			continue;
+
+		bat_err(dev, "Can't add sysfs file: %s/%s/%s\n",
+			dev->name, SYSFS_IF_MESH_SUBDIR,
+			((*entry)->attr).name);
+		goto rem_attr;
+	}
+
+	return 0;
+
+rem_attr:
+	/* the whole list is walked, including files that were never added */
+	for (entry = mesh_attrs; *entry; ++entry)
+		sysfs_remove_file(bat_priv->mesh_obj, &((*entry)->attr));
+
+	kobject_put(bat_priv->mesh_obj);
+	bat_priv->mesh_obj = NULL;
+	return -ENOMEM;
+}
+
+/* removes all mesh_attrs[] files and releases the directory kobject that
+ * sysfs_add_meshif() stored in bat_priv->mesh_obj */
+void sysfs_del_meshif(struct net_device *dev)
+{
+	struct bat_priv *bat_priv = netdev_priv(dev);
+	struct bat_attribute **entry = mesh_attrs;
+
+	while (*entry) {
+		sysfs_remove_file(bat_priv->mesh_obj, &((*entry)->attr));
+		++entry;
+	}
+
+	kobject_put(bat_priv->mesh_obj);
+	bat_priv->mesh_obj = NULL;
+}
+
+/* sysfs show handler: prints the name of the soft interface this hard
+ * interface belongs to, or "none" if its status is IF_NOT_IN_USE. Returns 0
+ * (nothing written) if the net device has no hard_iface entry. */
+static ssize_t show_mesh_iface(struct kobject *kobj, struct attribute *attr,
+			       char *buff)
+{
+	struct hard_iface *hard_iface;
+	const char *ifname;
+	ssize_t length;
+
+	hard_iface = hardif_get_by_netdev(kobj_to_netdev(kobj));
+	if (!hard_iface)
+		return 0;
+
+	if (hard_iface->if_status == IF_NOT_IN_USE)
+		ifname = "none";
+	else
+		ifname = hard_iface->soft_iface->name;
+
+	length = sprintf(buff, "%s\n", ifname);
+
+	/* release the reference taken by hardif_get_by_netdev() */
+	hardif_free_ref(hard_iface);
+
+	return length;
+}
+
+/* sysfs store handler of the "mesh_iface" attribute.
+ *
+ * A value starting with "none" disables the hard interface, any other value
+ * is passed to hardif_enable_interface() as the name of the soft interface to
+ * join (an interface that is already in use is disabled first).
+ *
+ * Returns count if nothing had to be done or "none" was handled, -EINVAL for
+ * a name of IFNAMSIZ or more characters, -ERESTARTSYS if the rtnl lock could
+ * not be taken, otherwise the result of hardif_enable_interface(). */
+static ssize_t store_mesh_iface(struct kobject *kobj, struct attribute *attr,
+				char *buff, size_t count)
+{
+	struct net_device *net_dev = kobj_to_netdev(kobj);
+	struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev);
+	int status_tmp = -1;
+	int ret = count;
+
+	/* no hard_iface entry for this device: the write is accepted but
+	 * ignored */
+	if (!hard_iface)
+		return count;
+
+	/* strip a trailing newline */
+	if (buff[count - 1] == '\n')
+		buff[count - 1] = '\0';
+
+	if (strlen(buff) >= IFNAMSIZ) {
+		pr_err("Invalid parameter for 'mesh_iface' setting received: "
+		       "interface name too long '%s'\n", buff);
+		hardif_free_ref(hard_iface);
+		return -EINVAL;
+	}
+
+	/* note: prefix match, only the first 4 characters are compared */
+	if (strncmp(buff, "none", 4) == 0)
+		status_tmp = IF_NOT_IN_USE;
+	else
+		status_tmp = IF_I_WANT_YOU;
+
+	/* already in the requested state */
+	if (hard_iface->if_status == status_tmp)
+		goto out;
+
+	/* already attached to the requested soft interface */
+	if ((hard_iface->soft_iface) &&
+	    (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0))
+		goto out;
+
+	/* do not block on the rtnl lock; report -ERESTARTSYS instead */
+	if (!rtnl_trylock()) {
+		ret = -ERESTARTSYS;
+		goto out;
+	}
+
+	if (status_tmp == IF_NOT_IN_USE) {
+		hardif_disable_interface(hard_iface);
+		goto unlock;
+	}
+
+	/* if the interface already is in use */
+	if (hard_iface->if_status != IF_NOT_IN_USE)
+		hardif_disable_interface(hard_iface);
+
+	ret = hardif_enable_interface(hard_iface, buff);
+
+unlock:
+	rtnl_unlock();
+out:
+	/* release the reference taken by hardif_get_by_netdev() */
+	hardif_free_ref(hard_iface);
+	return ret;
+}
+
+/* sysfs show handler: prints a textual form of hard_iface->if_status.
+ * Returns 0 (nothing written) if the net device has no hard_iface entry. */
+static ssize_t show_iface_status(struct kobject *kobj, struct attribute *attr,
+				 char *buff)
+{
+	struct hard_iface *hard_iface;
+	const char *status_text;
+	ssize_t length;
+
+	hard_iface = hardif_get_by_netdev(kobj_to_netdev(kobj));
+	if (!hard_iface)
+		return 0;
+
+	switch (hard_iface->if_status) {
+	case IF_TO_BE_REMOVED:
+		status_text = "disabling\n";
+		break;
+	case IF_INACTIVE:
+		status_text = "inactive\n";
+		break;
+	case IF_ACTIVE:
+		status_text = "active\n";
+		break;
+	case IF_TO_BE_ACTIVATED:
+		status_text = "enabling\n";
+		break;
+	case IF_NOT_IN_USE:
+	default:
+		status_text = "not in use\n";
+		break;
+	}
+
+	length = sprintf(buff, "%s", status_text);
+
+	/* release the reference taken by hardif_get_by_netdev() */
+	hardif_free_ref(hard_iface);
+
+	return length;
+}
+
+/* attributes of a hard interface: mesh_iface is readable and writable,
+ * iface_status has no store handler */
+static BAT_ATTR(mesh_iface, S_IRUGO | S_IWUSR,
+		show_mesh_iface, store_mesh_iface);
+static BAT_ATTR(iface_status, S_IRUGO, show_iface_status, NULL);
+
+/* NULL-terminated list of the files created below SYSFS_IF_BAT_SUBDIR by
+ * sysfs_add_hardif() */
+static struct bat_attribute *batman_attrs[] = {
+	&bat_attr_mesh_iface,
+	&bat_attr_iface_status,
+	NULL,
+};
+
+/* creates the SYSFS_IF_BAT_SUBDIR directory below the net device and
+ * populates it with every attribute listed in batman_attrs[].
+ *
+ * @hardif_obj: receives the directory kobject on success; it is left NULL
+ *		on failure
+ * @dev: net device the directory is attached to
+ *
+ * Returns 0 on success, -ENOMEM on any failure. On failure everything that
+ * was created is released again, so the caller has nothing to clean up. */
+int sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev)
+{
+	struct kobject *hardif_kobject = &dev->dev.kobj;
+	struct bat_attribute **bat_attr;
+	int err;
+
+	*hardif_obj = kobject_create_and_add(SYSFS_IF_BAT_SUBDIR,
+						    hardif_kobject);
+
+	if (!*hardif_obj) {
+		bat_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
+			SYSFS_IF_BAT_SUBDIR);
+		goto out;
+	}
+
+	for (bat_attr = batman_attrs; *bat_attr; ++bat_attr) {
+		err = sysfs_create_file(*hardif_obj, &((*bat_attr)->attr));
+		if (err) {
+			bat_err(dev, "Can't add sysfs file: %s/%s/%s\n",
+				dev->name, SYSFS_IF_BAT_SUBDIR,
+				((*bat_attr)->attr).name);
+			goto rem_attr;
+		}
+	}
+
+	return 0;
+
+rem_attr:
+	for (bat_attr = batman_attrs; *bat_attr; ++bat_attr)
+		sysfs_remove_file(*hardif_obj, &((*bat_attr)->attr));
+
+	/* drop the directory kobject as well (as sysfs_add_meshif() does),
+	 * otherwise it is leaked and the caller keeps a dangling pointer */
+	kobject_put(*hardif_obj);
+	*hardif_obj = NULL;
+out:
+	return -ENOMEM;
+}
+
+/* releases the directory kobject created by sysfs_add_hardif() and clears
+ * the caller's pointer to it */
+void sysfs_del_hardif(struct kobject **hardif_obj)
+{
+	kobject_put(*hardif_obj);
+	*hardif_obj = NULL;
+}
diff --git a/net/batman-adv/bat_sysfs.h b/net/batman-adv/bat_sysfs.h
new file mode 100644
index 00000000..02f1fa7a
--- /dev/null
+++ b/net/batman-adv/bat_sysfs.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+
+#ifndef _NET_BATMAN_ADV_SYSFS_H_
+#define _NET_BATMAN_ADV_SYSFS_H_
+
+#define SYSFS_IF_MESH_SUBDIR "mesh"
+#define SYSFS_IF_BAT_SUBDIR "batman_adv"
+
+/* a sysfs attribute together with its batman-adv specific handlers */
+struct bat_attribute {
+	/* generic attribute, the part handed to sysfs_create_file() */
+	struct attribute attr;
+	/* read handler: fills buf and returns the number of bytes written */
+	ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
+			char *buf);
+	/* write handler: receives count bytes in buf; the handlers in
+	 * bat_sysfs.c return count on success or a negative errno */
+	ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
+			 char *buf, size_t count);
+};
+
+int sysfs_add_meshif(struct net_device *dev);
+void sysfs_del_meshif(struct net_device *dev);
+int sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev);
+void sysfs_del_hardif(struct kobject **hardif_obj);
+
+#endif /* _NET_BATMAN_ADV_SYSFS_H_ */
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
new file mode 100644
index 00000000..ad2ca925
--- /dev/null
+++ b/net/batman-adv/bitarray.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "bitarray.h"
+
+#include <linux/bitops.h>
+
+/* returns 1 if curr_seqno lies inside the window that ends at last_seqno and
+ * its bit in seq_bits is set, 0 otherwise */
+uint8_t get_bit_status(unsigned long *seq_bits, uint32_t last_seqno,
+		       uint32_t curr_seqno)
+{
+	int32_t diff = last_seqno - curr_seqno;
+	int32_t word_num, word_offset;
+
+	/* newer than last_seqno or too old for the window */
+	if (diff < 0 || diff >= TQ_LOCAL_WINDOW_SIZE)
+		return 0;
+
+	/* which word, and which position inside that word */
+	word_num = diff / WORD_BIT_SIZE;
+	word_offset = diff % WORD_BIT_SIZE;
+
+	return test_bit(word_offset, &seq_bits[word_num]) ? 1 : 0;
+}
+
+/* sets bit n of the window so we remember that the packet was received;
+ * positions outside of the window are silently ignored */
+void bit_mark(unsigned long *seq_bits, int32_t n)
+{
+	int32_t word_num, word_offset;
+
+	if (n >= 0 && n < TQ_LOCAL_WINDOW_SIZE) {
+		word_num = n / WORD_BIT_SIZE;
+		word_offset = n % WORD_BIT_SIZE;
+
+		set_bit(word_offset, &seq_bits[word_num]);
+	}
+}
+
+/* shift the packet array by n places: bit k moves to bit k + n, the lowest n
+ * bits become 0 and bits shifted beyond the last word are dropped.
+ *
+ * Nothing happens if n is not within 1 .. TQ_LOCAL_WINDOW_SIZE - 1. */
+static void bit_shift(unsigned long *seq_bits, int32_t n)
+{
+	int32_t word_offset, word_num;
+	int32_t i;
+	unsigned long carry;
+
+	if (n <= 0 || n >= TQ_LOCAL_WINDOW_SIZE)
+		return;
+
+	word_offset = n % WORD_BIT_SIZE;/* shift how much inside each word */
+	word_num = n / WORD_BIT_SIZE;	/* shift over how much (full) words */
+
+	/* walk from the highest word down so that no source word is
+	 * overwritten before it has been read */
+	for (i = NUM_WORDS - 1; i > word_num; i--) {
+		/* the upper bits of the next lower source word spill into the
+		 * bottom of this word. If n is a multiple of the word size
+		 * nothing spills over; shifting by WORD_BIT_SIZE would be
+		 * undefined behaviour, so that case is handled separately. */
+		carry = 0;
+		if (word_offset)
+			carry = seq_bits[i - word_num - 1] >>
+				(WORD_BIT_SIZE - word_offset);
+
+		seq_bits[i] = (seq_bits[i - word_num] << word_offset) | carry;
+	}
+
+	/* i == word_num now: this word only receives the shifted lowest
+	 * source word, there is nothing below it to carry in */
+	seq_bits[i] = (seq_bits[i - word_num] << word_offset);
+
+	/* pad the rest with 0, if there is anything */
+	i--;
+
+	for (; i >= 0; i--)
+		seq_bits[i] = 0;
+}
+
+/* clears all NUM_WORDS words of the window */
+static void bit_reset_window(unsigned long *seq_bits)
+{
+	unsigned long *word = seq_bits;
+	unsigned long *end = seq_bits + NUM_WORDS;
+
+	while (word < end)
+		*word++ = 0;
+}
+
+
+/* receive and process one packet within the sequence number window.
+ *
+ * priv: the struct bat_priv of the mesh, only used for debug logging
+ * seq_bits: the window to update
+ * seq_num_diff: received seqno minus the last seqno of the window
+ * set_mark: if non-zero the bit of the received packet is set
+ *
+ * returns:
+ *  1 if the window was moved (either new or very old)
+ *  0 if the window was not moved/shifted.
+ */
+char bit_get_packet(void *priv, unsigned long *seq_bits,
+		    int32_t seq_num_diff, int8_t set_mark)
+{
+	struct bat_priv *bat_priv = (struct bat_priv *)priv;
+
+	/* sequence number is slightly older. We already got a sequence number
+	 * higher than this one, so we just mark it. */
+
+	if ((seq_num_diff <= 0) && (seq_num_diff > -TQ_LOCAL_WINDOW_SIZE)) {
+		if (set_mark)
+			bit_mark(seq_bits, -seq_num_diff);
+		return 0;
+	}
+
+	/* sequence number is slightly newer, so we shift the window and
+	 * set the mark if required */
+
+	if ((seq_num_diff > 0) && (seq_num_diff < TQ_LOCAL_WINDOW_SIZE)) {
+		bit_shift(seq_bits, seq_num_diff);
+
+		if (set_mark)
+			bit_mark(seq_bits, 0);
+		return 1;
+	}
+
+	/* sequence number is much newer, probably missed a lot of packets.
+	 * Both bounds have to hold: combined with "||" this test was true for
+	 * every remaining value and the restart case below was never
+	 * reached. */
+
+	if ((seq_num_diff >= TQ_LOCAL_WINDOW_SIZE)
+		&& (seq_num_diff < EXPECTED_SEQNO_RANGE)) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"We missed a lot of packets (%i) !\n",
+			seq_num_diff - 1);
+		bit_reset_window(seq_bits);
+		if (set_mark)
+			bit_mark(seq_bits, 0);
+		return 1;
+	}
+
+	/* received a much older packet. The other host either restarted
+	 * or the old packet got delayed somewhere in the network. The
+	 * packet should be dropped without calling this function if the
+	 * seqno window is protected. */
+
+	if ((seq_num_diff <= -TQ_LOCAL_WINDOW_SIZE)
+		|| (seq_num_diff >= EXPECTED_SEQNO_RANGE)) {
+
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Other host probably restarted!\n");
+
+		bit_reset_window(seq_bits);
+		if (set_mark)
+			bit_mark(seq_bits, 0);
+
+		return 1;
+	}
+
+	/* never reached */
+	return 0;
+}
+
+/* returns the number of set bits in the window, i.e. how many good packets
+ * were received */
+int bit_packet_count(unsigned long *seq_bits)
+{
+	int word = NUM_WORDS, total = 0;
+
+	while (word-- > 0)
+		total += hweight_long(seq_bits[word]);
+
+	return total;
+}
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
new file mode 100644
index 00000000..769c246d
--- /dev/null
+++ b/net/batman-adv/bitarray.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_BITARRAY_H_
+#define _NET_BATMAN_ADV_BITARRAY_H_
+
+#define WORD_BIT_SIZE (sizeof(unsigned long) * 8)
+
+/* returns true if the corresponding bit in the given seq_bits indicates true
+ * and curr_seqno is within range of last_seqno */
+uint8_t get_bit_status(unsigned long *seq_bits, uint32_t last_seqno,
+					   uint32_t curr_seqno);
+
+/* turn corresponding bit on, so we can remember that we got the packet */
+void bit_mark(unsigned long *seq_bits, int32_t n);
+
+
+/* receive and process one packet, returns 1 if the window was moved (the
+ * seq_num is new, or so old that the window was reset), 0 if it was not */
+char bit_get_packet(void *priv, unsigned long *seq_bits,
+		    int32_t seq_num_diff, int8_t set_mark);
+
+/* count the hamming weight, how many good packets did we receive? */
+int  bit_packet_count(unsigned long *seq_bits);
+
+#endif /* _NET_BATMAN_ADV_BITARRAY_H_ */
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
new file mode 100644
index 00000000..61605a0f
--- /dev/null
+++ b/net/batman-adv/gateway_client.c
@@ -0,0 +1,561 @@
+/*
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "gateway_client.h"
+#include "gateway_common.h"
+#include "hard-interface.h"
+#include "originator.h"
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/if_vlan.h>
+
+/* drops one reference on gw_node; the last reference frees it after an RCU
+ * grace period */
+static void gw_node_free_ref(struct gw_node *gw_node)
+{
+	if (!atomic_dec_and_test(&gw_node->refcount))
+		return;
+
+	kfree_rcu(gw_node, rcu);
+}
+
+/* returns the currently selected gateway with its refcount incremented, or
+ * NULL if no gateway is selected or its refcount already dropped to zero.
+ * The caller has to release the reference with gw_node_free_ref(). */
+static struct gw_node *gw_get_selected_gw_node(struct bat_priv *bat_priv)
+{
+	struct gw_node *selected;
+
+	rcu_read_lock();
+	selected = rcu_dereference(bat_priv->curr_gw);
+	if (selected && !atomic_inc_not_zero(&selected->refcount))
+		selected = NULL;
+	rcu_read_unlock();
+
+	return selected;
+}
+
+/* returns the originator of the currently selected gateway with its refcount
+ * incremented, or NULL if there is none (or it is about to be freed) */
+struct orig_node *gw_get_selected_orig(struct bat_priv *bat_priv)
+{
+	struct gw_node *selected = gw_get_selected_gw_node(bat_priv);
+	struct orig_node *orig_node;
+
+	if (!selected)
+		return NULL;
+
+	rcu_read_lock();
+	orig_node = selected->orig_node;
+	if (orig_node && !atomic_inc_not_zero(&orig_node->refcount))
+		orig_node = NULL;
+	rcu_read_unlock();
+
+	/* the gateway node itself is not needed any longer */
+	gw_node_free_ref(selected);
+
+	return orig_node;
+}
+
+/* makes new_gw_node (may be NULL) the selected gateway. A reference is taken
+ * on the new node and the reference held on the previous one is dropped;
+ * a new node whose refcount already reached zero is treated like NULL. */
+static void gw_select(struct bat_priv *bat_priv, struct gw_node *new_gw_node)
+{
+	struct gw_node *prev_gw_node;
+
+	spin_lock_bh(&bat_priv->gw_list_lock);
+
+	if (new_gw_node) {
+		if (!atomic_inc_not_zero(&new_gw_node->refcount))
+			new_gw_node = NULL;
+	}
+
+	prev_gw_node = bat_priv->curr_gw;
+	rcu_assign_pointer(bat_priv->curr_gw, new_gw_node);
+
+	if (prev_gw_node)
+		gw_node_free_ref(prev_gw_node);
+
+	spin_unlock_bh(&bat_priv->gw_list_lock);
+}
+
+/* clears the gateway selection by selecting NULL; gw_select() drops the
+ * reference held on the previously selected gateway */
+void gw_deselect(struct bat_priv *bat_priv)
+{
+	gw_select(bat_priv, NULL);
+}
+
+/* selects a gateway if we are in client mode and none is selected yet.
+ *
+ * All non-deleted entries of bat_priv->gw_list that have a router are
+ * compared according to gw_sel_class: class 1 ranks by a factor built from
+ * the router's tq_avg and the announced download bandwidth, every other
+ * class ranks by tq_avg alone. The winner is handed to gw_select(). */
+void gw_election(struct bat_priv *bat_priv)
+{
+	struct hlist_node *node;
+	struct gw_node *gw_node, *curr_gw = NULL, *curr_gw_tmp = NULL;
+	struct neigh_node *router;
+	uint8_t max_tq = 0;
+	uint32_t max_gw_factor = 0, tmp_gw_factor = 0;
+	int down, up;
+
+	/**
+	 * The batman daemon checks here if we already passed a full originator
+	 * cycle in order to make sure we don't choose the first gateway we
+	 * hear about. This check is based on the daemon's uptime which we
+	 * don't have.
+	 **/
+	if (atomic_read(&bat_priv->gw_mode) != GW_MODE_CLIENT)
+		return;
+
+	/* keep an already selected gateway; only the reference is dropped */
+	curr_gw = gw_get_selected_gw_node(bat_priv);
+	if (curr_gw)
+		goto out;
+
+	rcu_read_lock();
+	if (hlist_empty(&bat_priv->gw_list)) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Removing selected gateway - "
+			"no gateway in range\n");
+		gw_deselect(bat_priv);
+		goto unlock;
+	}
+
+	hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw_list, list) {
+		if (gw_node->deleted)
+			continue;
+
+		router = orig_node_get_router(gw_node->orig_node);
+		if (!router)
+			continue;
+
+		switch (atomic_read(&bat_priv->gw_sel_class)) {
+		case 1: /* fast connection */
+			gw_bandwidth_to_kbit(gw_node->orig_node->gw_flags,
+					     &down, &up);
+
+			/* NOTE(review): the product is evaluated in int and
+			 * looks like it can exceed INT_MAX for large values
+			 * of down - confirm the range returned by
+			 * gw_bandwidth_to_kbit() */
+			tmp_gw_factor = (router->tq_avg * router->tq_avg *
+					 down * 100 * 100) /
+					 (TQ_LOCAL_WINDOW_SIZE *
+					 TQ_LOCAL_WINDOW_SIZE * 64);
+
+			/* a higher factor wins, tq_avg breaks ties */
+			if ((tmp_gw_factor > max_gw_factor) ||
+			    ((tmp_gw_factor == max_gw_factor) &&
+			     (router->tq_avg > max_tq)))
+				curr_gw_tmp = gw_node;
+			break;
+
+		default: /**
+			  * 2:  stable connection (use best statistic)
+			  * 3:  fast-switch (use best statistic but change as
+			  *     soon as a better gateway appears)
+			  * XX: late-switch (use best statistic but change as
+			  *     soon as a better gateway appears which has
+			  *     $routing_class more tq points)
+			  **/
+			if (router->tq_avg > max_tq)
+				curr_gw_tmp = gw_node;
+			break;
+		}
+
+		/* remember the best values seen so far */
+		if (router->tq_avg > max_tq)
+			max_tq = router->tq_avg;
+
+		if (tmp_gw_factor > max_gw_factor)
+			max_gw_factor = tmp_gw_factor;
+
+		neigh_node_free_ref(router);
+	}
+
+	/* curr_gw is always NULL at this point (a selected gateway jumped to
+	 * out above), so this block runs exactly when a candidate was found
+	 * and only the "Adding route" message below can be printed */
+	if (curr_gw != curr_gw_tmp) {
+		router = orig_node_get_router(curr_gw_tmp->orig_node);
+		if (!router)
+			goto unlock;
+
+		if ((curr_gw) && (!curr_gw_tmp))
+			bat_dbg(DBG_BATMAN, bat_priv,
+				"Removing selected gateway - "
+				"no gateway in range\n");
+		else if ((!curr_gw) && (curr_gw_tmp))
+			bat_dbg(DBG_BATMAN, bat_priv,
+				"Adding route to gateway %pM "
+				"(gw_flags: %i, tq: %i)\n",
+				curr_gw_tmp->orig_node->orig,
+				curr_gw_tmp->orig_node->gw_flags,
+				router->tq_avg);
+		else
+			bat_dbg(DBG_BATMAN, bat_priv,
+				"Changing route to gateway %pM "
+				"(gw_flags: %i, tq: %i)\n",
+				curr_gw_tmp->orig_node->orig,
+				curr_gw_tmp->orig_node->gw_flags,
+				router->tq_avg);
+
+		neigh_node_free_ref(router);
+		gw_select(bat_priv, curr_gw_tmp);
+	}
+
+unlock:
+	rcu_read_unlock();
+out:
+	if (curr_gw)
+		gw_node_free_ref(curr_gw);
+}
+
+/* drops the current gateway selection if no usable gateway is selected or if
+ * orig_node offers a better link quality than the selected one, so that a
+ * later gw_election() run can pick a new gateway */
+void gw_check_election(struct bat_priv *bat_priv, struct orig_node *orig_node)
+{
+	struct neigh_node *gw_router = NULL, *orig_router = NULL;
+	struct orig_node *selected_orig;
+	uint8_t tq_gw, tq_orig;
+
+	selected_orig = gw_get_selected_orig(bat_priv);
+	if (!selected_orig)
+		goto deselect;
+
+	gw_router = orig_node_get_router(selected_orig);
+	if (!gw_router)
+		goto deselect;
+
+	/* this node already is the gateway */
+	if (selected_orig == orig_node)
+		goto out;
+
+	orig_router = orig_node_get_router(orig_node);
+	if (!orig_router)
+		goto out;
+
+	tq_gw = gw_router->tq_avg;
+	tq_orig = orig_router->tq_avg;
+
+	/* the TQ value has to be better */
+	if (tq_orig < tq_gw)
+		goto out;
+
+	/**
+	 * if the routing class is greater than 3 the value tells us how much
+	 * greater the TQ value of the new gateway must be
+	 **/
+	if ((atomic_read(&bat_priv->gw_sel_class) > 3) &&
+	    (tq_orig - tq_gw < atomic_read(&bat_priv->gw_sel_class)))
+		goto out;
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Restarting gateway selection: better gateway found (tq curr: "
+		"%i, tq new: %i)\n",
+		tq_gw, tq_orig);
+
+deselect:
+	gw_deselect(bat_priv);
+out:
+	/* release whatever references were taken above */
+	if (selected_orig)
+		orig_node_free_ref(selected_orig);
+	if (gw_router)
+		neigh_node_free_ref(gw_router);
+	if (orig_router)
+		neigh_node_free_ref(orig_router);
+}
+
+/* allocates a gateway entry for orig_node and puts it at the head of
+ * bat_priv->gw_list; does nothing if the allocation fails. new_gwflags is
+ * only used for the debug message. */
+static void gw_node_add(struct bat_priv *bat_priv,
+			struct orig_node *orig_node, uint8_t new_gwflags)
+{
+	struct gw_node *new_node;
+	int down, up;
+
+	/* zeroed allocation, may be called from atomic context */
+	new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC);
+	if (!new_node)
+		return;
+
+	INIT_HLIST_NODE(&new_node->list);
+	new_node->orig_node = orig_node;
+	atomic_set(&new_node->refcount, 1);
+
+	spin_lock_bh(&bat_priv->gw_list_lock);
+	hlist_add_head_rcu(&new_node->list, &bat_priv->gw_list);
+	spin_unlock_bh(&bat_priv->gw_list_lock);
+
+	gw_bandwidth_to_kbit(new_gwflags, &down, &up);
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Found new gateway %pM -> gw_class: %i - %i%s/%i%s\n",
+		orig_node->orig, new_gwflags,
+		(down > 2048 ? down / 1024 : down),
+		(down > 2048 ? "MBit" : "KBit"),
+		(up > 2048 ? up / 1024 : up),
+		(up > 2048 ? "MBit" : "KBit"));
+}
+
+/* updates the gateway list for orig_node announcing new_gwflags.
+ *
+ * A known gateway is marked deleted (with the current jiffies) if
+ * new_gwflags is 0 and un-deleted otherwise; if the gateway marked deleted
+ * is the selected one, the selection is dropped. An unknown originator
+ * announcing non-zero flags is added to the list. */
+void gw_node_update(struct bat_priv *bat_priv,
+		    struct orig_node *orig_node, uint8_t new_gwflags)
+{
+	struct hlist_node *node;
+	struct gw_node *gw_node, *curr_gw;
+
+	/**
+	 * Note: We don't need a NULL check here, since curr_gw never gets
+	 * dereferenced. If curr_gw is NULL we also should not exit as we may
+	 * have this gateway in our list (duplication check!) even though we
+	 * have no currently selected gateway.
+	 */
+	curr_gw = gw_get_selected_gw_node(bat_priv);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw_list, list) {
+		if (gw_node->orig_node != orig_node)
+			continue;
+
+		/* found the entry of this originator */
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Gateway class of originator %pM changed from "
+			"%i to %i\n",
+			orig_node->orig, gw_node->orig_node->gw_flags,
+			new_gwflags);
+
+		gw_node->deleted = 0;
+
+		if (new_gwflags == 0) {
+			/* the timestamp is what gw_node_purge() compares
+			 * against its timeout */
+			gw_node->deleted = jiffies;
+			bat_dbg(DBG_BATMAN, bat_priv,
+				"Gateway %pM removed from gateway list\n",
+				orig_node->orig);
+
+			if (gw_node == curr_gw)
+				goto deselect;
+		}
+
+		goto unlock;
+	}
+
+	/* unknown originator that does not announce a gateway: ignore */
+	if (new_gwflags == 0)
+		goto unlock;
+
+	gw_node_add(bat_priv, orig_node, new_gwflags);
+	goto unlock;
+
+deselect:
+	gw_deselect(bat_priv);
+unlock:
+	rcu_read_unlock();
+
+	if (curr_gw)
+		gw_node_free_ref(curr_gw);
+}
+
+/* marks the gateway entry of orig_node as deleted; this is gw_node_update()
+ * with gateway flags of 0 */
+void gw_node_delete(struct bat_priv *bat_priv, struct orig_node *orig_node)
+{
+	/* no "return <expr>;" here: a void function must not return an
+	 * expression in ISO C */
+	gw_node_update(bat_priv, orig_node, 0);
+}
+
+/* removes gateway entries from bat_priv->gw_list.
+ *
+ * While the mesh is active only entries marked deleted for at least
+ * 2 * PURGE_TIMEOUT seconds are removed; in any other mesh state every entry
+ * is removed. The selection is dropped if the selected gateway was among
+ * the removed entries. */
+void gw_node_purge(struct bat_priv *bat_priv)
+{
+	struct gw_node *gw_node, *selected;
+	struct hlist_node *node, *node_tmp;
+	unsigned long timeout = 2 * PURGE_TIMEOUT * HZ;
+	char selected_removed = 0;
+	int still_valid;
+
+	selected = gw_get_selected_gw_node(bat_priv);
+
+	spin_lock_bh(&bat_priv->gw_list_lock);
+
+	hlist_for_each_entry_safe(gw_node, node, node_tmp,
+				  &bat_priv->gw_list, list) {
+		still_valid = !gw_node->deleted ||
+			time_before(jiffies, gw_node->deleted + timeout);
+
+		if (still_valid &&
+		    atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE)
+			continue;
+
+		if (gw_node == selected)
+			selected_removed = 1;
+
+		hlist_del_rcu(&gw_node->list);
+		gw_node_free_ref(gw_node);
+	}
+
+	spin_unlock_bh(&bat_priv->gw_list_lock);
+
+	/* gw_deselect() needs to acquire the gw_list_lock */
+	if (selected_removed)
+		gw_deselect(bat_priv);
+
+	if (selected)
+		gw_node_free_ref(selected);
+}
+
+/**
+ * prints one line describing gw_node into seq; the currently selected
+ * gateway is prefixed with "=>".
+ *
+ * returns the result of seq_printf(), or -1 if orig_node has no router
+ */
+static int _write_buffer_text(struct bat_priv *bat_priv,
+			      struct seq_file *seq, struct gw_node *gw_node)
+{
+	struct gw_node *selected;
+	struct neigh_node *router;
+	int down, up, ret;
+
+	gw_bandwidth_to_kbit(gw_node->orig_node->gw_flags, &down, &up);
+
+	router = orig_node_get_router(gw_node->orig_node);
+	if (!router)
+		return -1;
+
+	selected = gw_get_selected_gw_node(bat_priv);
+
+	ret = seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %3i - %i%s/%i%s\n",
+			 (selected == gw_node ? "=>" : "  "),
+			 gw_node->orig_node->orig,
+			 router->tq_avg, router->addr,
+			 router->if_incoming->net_dev->name,
+			 gw_node->orig_node->gw_flags,
+			 (down > 2048 ? down / 1024 : down),
+			 (down > 2048 ? "MBit" : "KBit"),
+			 (up > 2048 ? up / 1024 : up),
+			 (up > 2048 ? "MBit" : "KBit"));
+
+	neigh_node_free_ref(router);
+	if (selected)
+		gw_node_free_ref(selected);
+
+	return ret;
+}
+
+/* seq_file handler printing the gateway list of the mesh interface stored in
+ * seq->private: a header line followed by one line per non-deleted gateway
+ * that has a router. If the mesh has no active primary interface only a
+ * notice is printed and the result of that seq_printf() is returned;
+ * otherwise 0 is returned. */
+int gw_client_seq_print_text(struct seq_file *seq, void *offset)
+{
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	struct hard_iface *primary_if;
+	struct gw_node *gw_node;
+	struct hlist_node *node;
+	int printed = 0, ret = 0;
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		return seq_printf(seq, "BATMAN mesh %s disabled - please "
+				  "specify interfaces to enable it\n",
+				  net_dev->name);
+
+	if (primary_if->if_status != IF_ACTIVE) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - "
+				 "primary interface not active\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	seq_printf(seq, "      %-12s (%s/%i) %17s [%10s]: gw_class ... "
+		   "[B.A.T.M.A.N. adv %s%s, MainIF/MAC: %s/%pM (%s)]\n",
+		   "Gateway", "#", TQ_MAX_VALUE, "Nexthop",
+		   "outgoingIF", SOURCE_VERSION, REVISION_VERSION_STR,
+		   primary_if->net_dev->name,
+		   primary_if->net_dev->dev_addr, net_dev->name);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(gw_node, node, &bat_priv->gw_list, list) {
+		if (gw_node->deleted)
+			continue;
+
+		/* fails if orig_node has no router */
+		if (_write_buffer_text(bat_priv, seq, gw_node) >= 0)
+			printed++;
+	}
+	rcu_read_unlock();
+
+	if (!printed)
+		seq_printf(seq, "No gateways in range ...\n");
+
+out:
+	hardif_free_ref(primary_if);
+	return ret;
+}
+
+/* checks whether skb is a UDP packet addressed to port 67 (IPv4) or port 547
+ * (IPv6), optionally behind one VLAN header.
+ *
+ * returns:
+ *   0 if gateway handling is off, the packet does not match, or we are a
+ *     client without a selected gateway
+ *  -1 if the packet matches and we are in server mode
+ *   1 if the packet matches and a gateway is selected
+ */
+int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb)
+{
+	struct ethhdr *ethhdr;
+	struct iphdr *iphdr;
+	struct ipv6hdr *ipv6hdr;
+	struct udphdr *udphdr;
+	struct gw_node *curr_gw;
+	unsigned int header_len = 0;
+	uint16_t proto;
+
+	if (atomic_read(&bat_priv->gw_mode) == GW_MODE_OFF)
+		return 0;
+
+	/* check for ethernet header */
+	if (!pskb_may_pull(skb, header_len + ETH_HLEN))
+		return 0;
+	ethhdr = (struct ethhdr *)skb->data;
+	proto = ntohs(ethhdr->h_proto);
+	header_len += ETH_HLEN;
+
+	/* check for initial vlan header */
+	if (proto == ETH_P_8021Q) {
+		if (!pskb_may_pull(skb, header_len + VLAN_HLEN))
+			return 0;
+		ethhdr = (struct ethhdr *)(skb->data + VLAN_HLEN);
+		proto = ntohs(ethhdr->h_proto);
+		header_len += VLAN_HLEN;
+	}
+
+	/* pskb_may_pull() may reallocate the skb head, so ethhdr must not be
+	 * dereferenced after any of the pulls below - the ethertype is kept
+	 * in proto instead */
+
+	/* check for ip header */
+	switch (proto) {
+	case ETH_P_IP:
+		if (!pskb_may_pull(skb, header_len + sizeof(struct iphdr)))
+			return 0;
+		iphdr = (struct iphdr *)(skb->data + header_len);
+		header_len += iphdr->ihl * 4;
+
+		/* check for udp header */
+		if (iphdr->protocol != IPPROTO_UDP)
+			return 0;
+
+		break;
+	case ETH_P_IPV6:
+		if (!pskb_may_pull(skb, header_len + sizeof(struct ipv6hdr)))
+			return 0;
+		ipv6hdr = (struct ipv6hdr *)(skb->data + header_len);
+		header_len += sizeof(struct ipv6hdr);
+
+		/* check for udp header */
+		if (ipv6hdr->nexthdr != IPPROTO_UDP)
+			return 0;
+
+		break;
+	default:
+		return 0;
+	}
+
+	if (!pskb_may_pull(skb, header_len + sizeof(struct udphdr)))
+		return 0;
+	udphdr = (struct udphdr *)(skb->data + header_len);
+	header_len += sizeof(struct udphdr);
+
+	/* check for bootp port */
+	if ((proto == ETH_P_IP) &&
+	     (ntohs(udphdr->dest) != 67))
+		return 0;
+
+	if ((proto == ETH_P_IPV6) &&
+	    (ntohs(udphdr->dest) != 547))
+		return 0;
+
+	if (atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER)
+		return -1;
+
+	/* client mode: only a target if a gateway is currently selected */
+	curr_gw = gw_get_selected_gw_node(bat_priv);
+	if (!curr_gw)
+		return 0;
+
+	gw_node_free_ref(curr_gw);
+	return 1;
+}
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
new file mode 100644
index 00000000..1ce8c606
--- /dev/null
+++ b/net/batman-adv/gateway_client.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+#define _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
+
+void gw_deselect(struct bat_priv *bat_priv);
+void gw_election(struct bat_priv *bat_priv);
+struct orig_node *gw_get_selected_orig(struct bat_priv *bat_priv);
+void gw_check_election(struct bat_priv *bat_priv, struct orig_node *orig_node);
+void gw_node_update(struct bat_priv *bat_priv,
+		    struct orig_node *orig_node, uint8_t new_gwflags);
+void gw_node_delete(struct bat_priv *bat_priv, struct orig_node *orig_node);
+void gw_node_purge(struct bat_priv *bat_priv);
+int gw_client_seq_print_text(struct seq_file *seq, void *offset);
+int gw_is_target(struct bat_priv *bat_priv, struct sk_buff *skb);
+
+#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
new file mode 100644
index 00000000..50d3a59a
--- /dev/null
+++ b/net/batman-adv/gateway_common.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+
+/**
+ * kbit_to_gw_bandwidth - find the gateway class closest to the given speeds
+ * @down: requested download speed in kbit
+ * @up: requested upload speed in kbit
+ * @gw_srv_class: output, receives the encoded class
+ *
+ * Class layout: bit 7 selects the base factor (2 or 3), bits 3-6 hold the
+ * power-of-two exponent of the download speed and bits 0-2 encode the upload
+ * speed as (n + 1) eighths of the chosen download speed.  On a tie the
+ * candidate that was tried first is kept.
+ */
+static void kbit_to_gw_bandwidth(int down, int up, long *gw_srv_class)
+{
+	int best_down = 0, best_delta, candidate, delta;
+	uint8_t base, exp;
+
+	*gw_srv_class = 0;
+
+	/* pick the encodable download speed nearest to the request */
+	best_delta = 0x0FFFFFFF;
+	for (base = 0; base < 2; base++) {
+		for (exp = 0; exp < 16; exp++) {
+			candidate = 32 * (base + 2) * (1 << exp);
+			delta = abs(candidate - down);
+
+			if (delta >= best_delta)
+				continue;
+
+			best_delta = delta;
+			best_down = candidate;
+			*gw_srv_class = (base << 7) + (exp << 3);
+		}
+	}
+
+	/* pick the upload fraction of that download speed nearest to 'up' */
+	best_delta = 0x0FFFFFFF;
+	for (exp = 0; exp < 8; exp++) {
+		candidate = ((exp + 1) * best_down) / 8;
+		delta = abs(candidate - up);
+
+		if (delta >= best_delta)
+			continue;
+
+		best_delta = delta;
+		*gw_srv_class = (*gw_srv_class & 0xF8) | exp;
+	}
+}
+
+/**
+ * gw_bandwidth_to_kbit - decode a gateway class into kbit speeds
+ * @gw_srv_class: encoded class (0 means "no bandwidth")
+ * @down: output, download speed in kbit
+ * @up: output, upload speed in kbit
+ *
+ * Inverse of the encoding produced by kbit_to_gw_bandwidth(): bit 7 selects
+ * the base factor, bits 3-6 the power-of-two exponent and bits 0-2 the
+ * upload share in eighths of the download speed.
+ */
+void gw_bandwidth_to_kbit(uint8_t gw_srv_class, int *down, int *up)
+{
+	int base, exp, share;
+
+	if (gw_srv_class == 0) {
+		*down = 0;
+		*up = 0;
+		return;
+	}
+
+	base = (gw_srv_class >> 7) & 0x01;
+	exp = (gw_srv_class >> 3) & 0x0F;
+	share = gw_srv_class & 0x07;
+
+	*down = 32 * (base + 2) * (1 << exp);
+	*up = ((share + 1) * (*down)) / 8;
+}
+
+/**
+ * gw_speed_strip_unit - cut a trailing "kbit"/"mbit" unit off a speed string
+ * @speed: NUL-terminated speed string, modified in place
+ *
+ * The unit is matched case-insensitively and is only recognised when at
+ * least one character precedes it.  Returns the factor that converts the
+ * remaining number to kbit: 1024 for "mbit", 1 otherwise.
+ */
+static int gw_speed_strip_unit(char *speed)
+{
+	size_t len = strlen(speed);
+	char *unit;
+	int multi = 1;
+
+	if (len <= 4)
+		return multi;
+
+	unit = speed + len - 4;
+
+	if (strnicmp(unit, "mbit", 4) == 0)
+		multi = 1024;
+
+	if ((multi > 1) || (strnicmp(unit, "kbit", 4) == 0))
+		*unit = '\0';
+
+	return multi;
+}
+
+/**
+ * parse_gw_bandwidth - parse a "<down>[kbit|mbit][/<up>[kbit|mbit]]" string
+ * @net_dev: device used as prefix for error messages
+ * @buff: input string; modified in place ('/' and unit suffixes are cut)
+ * @up: output, upload speed in kbit; left untouched if no "/<up>" is given
+ * @down: output, download speed in kbit
+ *
+ * Returns true on success, false if one of the numbers could not be parsed
+ * (an error is logged in that case).
+ */
+static bool parse_gw_bandwidth(struct net_device *net_dev, char *buff,
+			       long *up, long *down)
+{
+	int ret, multi;
+	char *slash_ptr, *up_str;
+
+	/* split "down/up" into two separate strings */
+	slash_ptr = strchr(buff, '/');
+	if (slash_ptr)
+		*slash_ptr = 0;
+
+	multi = gw_speed_strip_unit(buff);
+
+	ret = strict_strtoul(buff, 10, down);
+	if (ret) {
+		bat_err(net_dev,
+			"Download speed of gateway mode invalid: %s\n",
+			buff);
+		return false;
+	}
+
+	*down *= multi;
+
+	/* no upload info given */
+	if (!slash_ptr)
+		return true;
+
+	up_str = slash_ptr + 1;
+	multi = gw_speed_strip_unit(up_str);
+
+	ret = strict_strtoul(up_str, 10, up);
+	if (ret) {
+		bat_err(net_dev,
+			"Upload speed of gateway mode invalid: "
+			"%s\n", up_str);
+		return false;
+	}
+
+	*up *= multi;
+
+	return true;
+}
+
+/**
+ * gw_bandwidth_set - parse and store a new gateway bandwidth
+ * @net_dev: soft interface whose private data holds gw_bandwidth
+ * @buff: user supplied string, see parse_gw_bandwidth(); modified in place
+ * @count: number of bytes handed in by the caller
+ *
+ * A download speed below 256 kbit (including 0) is replaced by 2000 kbit and
+ * a missing upload speed defaults to a fifth of the download speed.  The
+ * speeds are then rounded to the nearest encodable gateway class, the
+ * currently selected gateway is deselected and the class is stored.
+ *
+ * Returns @count in every case, also when parsing failed.
+ */
+ssize_t gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count)
+{
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	long gw_bandwidth_tmp = 0, up = 0, down = 0;
+	int down_kbit = 0, up_kbit = 0;
+	bool ret;
+
+	ret = parse_gw_bandwidth(net_dev, buff, &up, &down);
+	if (!ret)
+		goto end;
+
+	if (down < 256)
+		down = 2000;
+
+	if (!up)
+		up = down / 5;
+
+	kbit_to_gw_bandwidth(down, up, &gw_bandwidth_tmp);
+
+	/**
+	 * the gw bandwidth we guessed above might not match the given
+	 * speeds, hence we need to calculate it back to show the number
+	 * that is going to be propagated
+	 *
+	 * gw_bandwidth_to_kbit() stores plain ints, so hand it real ints
+	 * instead of casting the addresses of the longs: on 64 bit that
+	 * would only update half of each long.
+	 **/
+	gw_bandwidth_to_kbit((uint8_t)gw_bandwidth_tmp,
+			     &down_kbit, &up_kbit);
+	down = down_kbit;
+	up = up_kbit;
+
+	gw_deselect(bat_priv);
+	bat_info(net_dev, "Changing gateway bandwidth from: '%i' to: '%ld' "
+		 "(propagating: %ld%s/%ld%s)\n",
+		 atomic_read(&bat_priv->gw_bandwidth), gw_bandwidth_tmp,
+		 (down > 2048 ? down / 1024 : down),
+		 (down > 2048 ? "MBit" : "KBit"),
+		 (up > 2048 ? up / 1024 : up),
+		 (up > 2048 ? "MBit" : "KBit"));
+
+	atomic_set(&bat_priv->gw_bandwidth, gw_bandwidth_tmp);
+
+end:
+	return count;
+}
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
new file mode 100644
index 00000000..55e527a4
--- /dev/null
+++ b/net/batman-adv/gateway_common.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+#define _NET_BATMAN_ADV_GATEWAY_COMMON_H_
+
+/* gateway operation modes of a mesh interface */
+enum gw_modes {
+	GW_MODE_OFF,
+	GW_MODE_CLIENT,
+	GW_MODE_SERVER,
+};
+
+/* textual names of the modes above
+ * NOTE(review): presumably the strings exchanged with user space - confirm */
+#define GW_MODE_OFF_NAME	"off"
+#define GW_MODE_CLIENT_NAME	"client"
+#define GW_MODE_SERVER_NAME	"server"
+
+void gw_bandwidth_to_kbit(uint8_t gw_class, int *down, int *up);
+ssize_t gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count);
+
+#endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
new file mode 100644
index 00000000..dfbfccc9
--- /dev/null
+++ b/net/batman-adv/hard-interface.c
@@ -0,0 +1,682 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "hard-interface.h"
+#include "soft-interface.h"
+#include "send.h"
+#include "translation-table.h"
+#include "routing.h"
+#include "bat_sysfs.h"
+#include "originator.h"
+#include "hash.h"
+
+#include <linux/if_arp.h>
+
+
+static int batman_skb_recv(struct sk_buff *skb,
+			   struct net_device *dev,
+			   struct packet_type *ptype,
+			   struct net_device *orig_dev);
+
+/* RCU callback (see hardif_free_ref()): drops the reference held on the
+ * underlying net_device and frees the hard_iface embedding @rcu */
+void hardif_free_rcu(struct rcu_head *rcu)
+{
+	struct hard_iface *iface = container_of(rcu, struct hard_iface, rcu);
+
+	dev_put(iface->net_dev);
+	kfree(iface);
+}
+
+/* looks up the hard_iface wrapping @net_dev in hardif_list.
+ * Returns it with its refcount increased (release with hardif_free_ref())
+ * or NULL if there is no entry whose refcount could still be taken. */
+struct hard_iface *hardif_get_by_netdev(struct net_device *net_dev)
+{
+	struct hard_iface *entry, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &hardif_list, list) {
+		if (entry->net_dev != net_dev)
+			continue;
+
+		/* entry is on its way out, keep searching */
+		if (!atomic_inc_not_zero(&entry->refcount))
+			continue;
+
+		found = entry;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* returns 1 if @net_dev may be used as a hard interface, 0 otherwise */
+static int is_valid_iface(struct net_device *net_dev)
+{
+	/* only non-loopback ethernet devices with 6 byte addresses qualify */
+	if ((net_dev->flags & IFF_LOOPBACK) ||
+	    (net_dev->type != ARPHRD_ETHER) ||
+	    (net_dev->addr_len != ETH_ALEN))
+		return 0;
+
+	/* no batman over batman */
+	if (softif_is_valid(net_dev))
+		return 0;
+
+	/* devices that are bridge ports are accepted: the
+	 * IFF_BRIDGE_PORT check is intentionally left disabled */
+
+	return 1;
+}
+
+/* returns the first IF_ACTIVE hard interface attached to @soft_iface with
+ * its refcount increased, or NULL if there is none */
+static struct hard_iface *hardif_get_active(struct net_device *soft_iface)
+{
+	struct hard_iface *entry, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &hardif_list, list) {
+		if ((entry->soft_iface != soft_iface) ||
+		    (entry->if_status != IF_ACTIVE))
+			continue;
+
+		if (atomic_inc_not_zero(&entry->refcount)) {
+			found = entry;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* copies the mac address of the currently selected primary interface into
+ * the vis_orig and sender_orig fields of our own vis packet; does nothing
+ * if no primary interface is selected */
+static void primary_if_update_addr(struct bat_priv *bat_priv)
+{
+	struct hard_iface *primary_if = primary_if_get_selected(bat_priv);
+	struct vis_packet *vis_packet;
+	const void *mac;
+
+	if (!primary_if)
+		return;
+
+	mac = primary_if->net_dev->dev_addr;
+	vis_packet = (struct vis_packet *)
+				bat_priv->my_vis_info->skb_packet->data;
+
+	memcpy(vis_packet->vis_orig, mac, ETH_ALEN);
+	memcpy(vis_packet->sender_orig, mac, ETH_ALEN);
+
+	hardif_free_ref(primary_if);
+}
+
+/**
+ * primary_if_select - replace the primary interface of a mesh
+ * @bat_priv: private data of the soft interface
+ * @new_hard_iface: new primary interface, NULL to just drop the current one
+ *
+ * Must be called with the rtnl lock held (see ASSERT_RTNL()).  Takes its own
+ * reference on @new_hard_iface for bat_priv->primary_if and releases the one
+ * that was held on the previous primary interface.
+ */
+static void primary_if_select(struct bat_priv *bat_priv,
+			      struct hard_iface *new_hard_iface)
+{
+	struct hard_iface *curr_hard_iface;
+	struct batman_packet *batman_packet;
+
+	ASSERT_RTNL();
+
+	/* an interface whose refcount already hit zero is treated as "none" */
+	if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount))
+		new_hard_iface = NULL;
+
+	/* publish the new pointer before dropping the old reference */
+	curr_hard_iface = bat_priv->primary_if;
+	rcu_assign_pointer(bat_priv->primary_if, new_hard_iface);
+
+	if (curr_hard_iface)
+		hardif_free_ref(curr_hard_iface);
+
+	if (!new_hard_iface)
+		return;
+
+	/* mark the packet template of the new primary interface */
+	batman_packet = (struct batman_packet *)(new_hard_iface->packet_buff);
+	batman_packet->flags = PRIMARIES_FIRST_HOP;
+	batman_packet->ttl = TTL;
+
+	primary_if_update_addr(bat_priv);
+
+	/***
+	 * hacky trick to make sure that we send the TT information via
+	 * our new primary interface
+	 */
+	atomic_set(&bat_priv->tt_local_changed, 1);
+}
+
+/* true if the net_device behind @hard_iface has IFF_UP set */
+static bool hardif_is_iface_up(struct hard_iface *hard_iface)
+{
+	return (hard_iface->net_dev->flags & IFF_UP) != 0;
+}
+
+/* writes the current mac address of the net_device into the orig and
+ * prev_sender fields of the interface's batman packet template */
+static void update_mac_addresses(struct hard_iface *hard_iface)
+{
+	struct batman_packet *batman_packet;
+
+	batman_packet = (struct batman_packet *)(hard_iface->packet_buff);
+
+	memcpy(batman_packet->orig,
+	       hard_iface->net_dev->dev_addr, ETH_ALEN);
+	memcpy(batman_packet->prev_sender,
+	       hard_iface->net_dev->dev_addr, ETH_ALEN);
+}
+
+/* warns if another active (or about to be activated) hard interface already
+ * uses the mac address of @net_dev; purely informational */
+static void check_known_mac_addr(struct net_device *net_dev)
+{
+	struct hard_iface *other;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(other, &hardif_list, list) {
+		switch (other->if_status) {
+		case IF_ACTIVE:
+		case IF_TO_BE_ACTIVATED:
+			break;
+		default:
+			continue;
+		}
+
+		/* do not compare the device with itself */
+		if (other->net_dev == net_dev)
+			continue;
+
+		if (compare_eth(other->net_dev->dev_addr,
+				net_dev->dev_addr)) {
+			pr_warning("The newly added mac address (%pM) already "
+				   "exists on: %s\n", net_dev->dev_addr,
+				   other->net_dev->name);
+			pr_warning("It is strongly recommended to keep mac "
+				   "addresses unique to avoid problems!\n");
+		}
+	}
+	rcu_read_unlock();
+}
+
+/* returns the MTU the soft interface can offer: ETH_DATA_LEN, lowered to the
+ * smallest (mtu - BAT_HEADER_LEN) of its active or to-be-activated hard
+ * interfaces.  With fragmentation enabled ETH_DATA_LEN is returned as is. */
+int hardif_min_mtu(struct net_device *soft_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+	struct hard_iface *hard_iface;
+	int lowest = ETH_DATA_LEN;
+	int usable;
+
+	if (atomic_read(&bat_priv->fragmentation))
+		return lowest;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+		if (hard_iface->soft_iface != soft_iface)
+			continue;
+
+		if ((hard_iface->if_status != IF_ACTIVE) &&
+		    (hard_iface->if_status != IF_TO_BE_ACTIVATED))
+			continue;
+
+		/* payload left once the batman header is accounted for */
+		usable = hard_iface->net_dev->mtu - BAT_HEADER_LEN;
+		if (usable < lowest)
+			lowest = usable;
+	}
+	rcu_read_unlock();
+
+	return lowest;
+}
+
+/* sets the MTU of the soft interface to the value computed by
+ * hardif_min_mtu() if it differs from the current one */
+void update_min_mtu(struct net_device *soft_iface)
+{
+	int wanted = hardif_min_mtu(soft_iface);
+
+	if (soft_iface->mtu == wanted)
+		return;
+
+	soft_iface->mtu = wanted;
+}
+
+/* moves an IF_INACTIVE hard interface to IF_TO_BE_ACTIVATED, refreshes the
+ * mac addresses in its packet template and makes it the primary interface
+ * if the mesh has none.  Interfaces in any other state are left alone. */
+static void hardif_activate_interface(struct hard_iface *hard_iface)
+{
+	struct bat_priv *bat_priv;
+	struct hard_iface *primary_if;
+
+	if (hard_iface->if_status != IF_INACTIVE)
+		return;
+
+	bat_priv = netdev_priv(hard_iface->soft_iface);
+
+	update_mac_addresses(hard_iface);
+	hard_iface->if_status = IF_TO_BE_ACTIVATED;
+
+	/**
+	 * the first active interface becomes our primary interface or
+	 * the next active interface after the old primary interface was
+	 * removed
+	 */
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		primary_if_select(bat_priv, hard_iface);
+
+	bat_info(hard_iface->soft_iface, "Interface activated: %s\n",
+		 hard_iface->net_dev->name);
+
+	update_min_mtu(hard_iface->soft_iface);
+
+	/* drop the reference taken by primary_if_get_selected() */
+	if (primary_if)
+		hardif_free_ref(primary_if);
+}
+
+/* puts an active or to-be-activated hard interface into IF_INACTIVE and
+ * recomputes the MTU of its soft interface; other states are ignored */
+static void hardif_deactivate_interface(struct hard_iface *hard_iface)
+{
+	switch (hard_iface->if_status) {
+	case IF_ACTIVE:
+	case IF_TO_BE_ACTIVATED:
+		break;
+	default:
+		return;
+	}
+
+	hard_iface->if_status = IF_INACTIVE;
+
+	bat_info(hard_iface->soft_iface, "Interface deactivated: %s\n",
+		 hard_iface->net_dev->name);
+
+	update_min_mtu(hard_iface->soft_iface);
+}
+
+/**
+ * hardif_enable_interface - attach a hard interface to a mesh interface
+ * @hard_iface: interface to enable; only acted upon in state IF_NOT_IN_USE
+ * @iface_name: name of the soft interface, created if no such device exists
+ *
+ * On success the hard interface holds one reference on itself and one on
+ * the soft interface; both are released by hardif_disable_interface().
+ *
+ * Returns 0 on success and also when nothing was done (interface already in
+ * use or its refcount already dropped to zero), -ENOMEM or -EINVAL on error.
+ */
+int hardif_enable_interface(struct hard_iface *hard_iface, char *iface_name)
+{
+	struct bat_priv *bat_priv;
+	struct batman_packet *batman_packet;
+	struct net_device *soft_iface;
+	int ret;
+
+	if (hard_iface->if_status != IF_NOT_IN_USE)
+		goto out;
+
+	if (!atomic_inc_not_zero(&hard_iface->refcount))
+		goto out;
+
+	soft_iface = dev_get_by_name(&init_net, iface_name);
+
+	if (!soft_iface) {
+		soft_iface = softif_create(iface_name);
+
+		if (!soft_iface) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		/* dev_get_by_name() increases the reference counter for us */
+		dev_hold(soft_iface);
+	}
+
+	if (!softif_is_valid(soft_iface)) {
+		pr_err("Can't create batman mesh interface %s: "
+		       "already exists as regular interface\n",
+		       soft_iface->name);
+		ret = -EINVAL;
+		goto err_dev;
+	}
+
+	hard_iface->soft_iface = soft_iface;
+	bat_priv = netdev_priv(hard_iface->soft_iface);
+	hard_iface->packet_len = BAT_PACKET_LEN;
+	hard_iface->packet_buff = kmalloc(hard_iface->packet_len, GFP_ATOMIC);
+
+	if (!hard_iface->packet_buff) {
+		bat_err(hard_iface->soft_iface, "Can't add interface packet "
+			"(%s): out of memory\n", hard_iface->net_dev->name);
+		/* the interface stays IF_NOT_IN_USE: do not keep a pointer
+		 * to (or a reference on) the soft interface */
+		hard_iface->soft_iface = NULL;
+		ret = -ENOMEM;
+		goto err_dev;
+	}
+
+	/* template for the originator messages sent on this interface */
+	batman_packet = (struct batman_packet *)(hard_iface->packet_buff);
+	batman_packet->packet_type = BAT_PACKET;
+	batman_packet->version = COMPAT_VERSION;
+	batman_packet->flags = 0;
+	batman_packet->ttl = 2;
+	batman_packet->tq = TQ_MAX_VALUE;
+	batman_packet->num_tt = 0;
+
+	hard_iface->if_num = bat_priv->num_ifaces;
+	bat_priv->num_ifaces++;
+	hard_iface->if_status = IF_INACTIVE;
+	orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
+
+	/* start receiving frames with the batman ethertype on this device */
+	hard_iface->batman_adv_ptype.type = __constant_htons(ETH_P_BATMAN);
+	hard_iface->batman_adv_ptype.func = batman_skb_recv;
+	hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
+	dev_add_pack(&hard_iface->batman_adv_ptype);
+
+	atomic_set(&hard_iface->seqno, 1);
+	atomic_set(&hard_iface->frag_seqno, 1);
+	bat_info(hard_iface->soft_iface, "Adding interface: %s\n",
+		 hard_iface->net_dev->name);
+
+	if (atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu <
+		ETH_DATA_LEN + BAT_HEADER_LEN)
+		bat_info(hard_iface->soft_iface,
+			"The MTU of interface %s is too small (%i) to handle "
+			"the transport of batman-adv packets. Packets going "
+			"over this interface will be fragmented on layer2 "
+			"which could impact the performance. Setting the MTU "
+			"to %zi would solve the problem.\n",
+			hard_iface->net_dev->name, hard_iface->net_dev->mtu,
+			ETH_DATA_LEN + BAT_HEADER_LEN);
+
+	if (!atomic_read(&bat_priv->fragmentation) && hard_iface->net_dev->mtu <
+		ETH_DATA_LEN + BAT_HEADER_LEN)
+		bat_info(hard_iface->soft_iface,
+			"The MTU of interface %s is too small (%i) to handle "
+			"the transport of batman-adv packets. If you experience"
+			" problems getting traffic through try increasing the "
+			"MTU to %zi.\n",
+			hard_iface->net_dev->name, hard_iface->net_dev->mtu,
+			ETH_DATA_LEN + BAT_HEADER_LEN);
+
+	if (hardif_is_iface_up(hard_iface))
+		hardif_activate_interface(hard_iface);
+	else
+		bat_err(hard_iface->soft_iface, "Not using interface %s "
+			"(retrying later): interface not active\n",
+			hard_iface->net_dev->name);
+
+	/* begin scheduling originator messages on that interface */
+	schedule_own_packet(hard_iface);
+
+out:
+	return 0;
+
+err_dev:
+	/* release the soft interface reference obtained above */
+	dev_put(soft_iface);
+err:
+	hardif_free_ref(hard_iface);
+	return ret;
+}
+
+/**
+ * hardif_disable_interface - detach a hard interface from its mesh interface
+ * @hard_iface: interface to disable; its soft_iface pointer is dereferenced
+ *  unconditionally, so it must currently be attached to a soft interface
+ *
+ * Undoes hardif_enable_interface(): stops packet reception, selects a new
+ * primary interface if this one was the primary, frees the packet template,
+ * purges state referring to the interface and destroys the soft interface
+ * once its last hard interface is gone.
+ */
+void hardif_disable_interface(struct hard_iface *hard_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct hard_iface *primary_if = NULL;
+
+	/* NOTE(review): IF_TO_BE_ACTIVATED is not deactivated here, so such
+	 * an interface leaves through the early exit below - confirm this
+	 * is intended */
+	if (hard_iface->if_status == IF_ACTIVE)
+		hardif_deactivate_interface(hard_iface);
+
+	if (hard_iface->if_status != IF_INACTIVE)
+		goto out;
+
+	bat_info(hard_iface->soft_iface, "Removing interface: %s\n",
+		 hard_iface->net_dev->name);
+	dev_remove_pack(&hard_iface->batman_adv_ptype);
+
+	bat_priv->num_ifaces--;
+	orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
+
+	/* hand the primary role to another active interface, if any */
+	primary_if = primary_if_get_selected(bat_priv);
+	if (hard_iface == primary_if) {
+		struct hard_iface *new_if;
+
+		new_if = hardif_get_active(hard_iface->soft_iface);
+		primary_if_select(bat_priv, new_if);
+
+		/* primary_if_select() took its own reference */
+		if (new_if)
+			hardif_free_ref(new_if);
+	}
+
+	kfree(hard_iface->packet_buff);
+	hard_iface->packet_buff = NULL;
+	hard_iface->if_status = IF_NOT_IN_USE;
+
+	/* delete all references to this hard_iface */
+	purge_orig_ref(bat_priv);
+	purge_outstanding_packets(bat_priv, hard_iface);
+	dev_put(hard_iface->soft_iface);
+
+	/* nobody uses this interface anymore */
+	if (!bat_priv->num_ifaces)
+		softif_destroy(hard_iface->soft_iface);
+
+	hard_iface->soft_iface = NULL;
+	/* drop the reference taken in hardif_enable_interface() */
+	hardif_free_ref(hard_iface);
+
+out:
+	/* drop the reference taken by primary_if_get_selected() */
+	if (primary_if)
+		hardif_free_ref(primary_if);
+}
+
+/**
+ * hardif_add_interface - start tracking a net_device as hard interface
+ * @net_dev: device to wrap; must pass is_valid_iface()
+ *
+ * Must be called with the rtnl lock held (see ASSERT_RTNL()).  Takes a
+ * reference on @net_dev, creates the sysfs entries and appends the new
+ * object to hardif_list in state IF_NOT_IN_USE.
+ *
+ * Returns the new hard_iface with a refcount of 2 (the second reference is
+ * for the caller) or NULL on failure.
+ */
+static struct hard_iface *hardif_add_interface(struct net_device *net_dev)
+{
+	struct hard_iface *hard_iface;
+	int ret;
+
+	ASSERT_RTNL();
+
+	ret = is_valid_iface(net_dev);
+	if (ret != 1)
+		goto out;
+
+	/* released in hardif_free_rcu() or in the error path below */
+	dev_hold(net_dev);
+
+	/* NOTE(review): kmalloc() does not zero the object; only the fields
+	 * assigned below are initialised here */
+	hard_iface = kmalloc(sizeof(struct hard_iface), GFP_ATOMIC);
+	if (!hard_iface) {
+		pr_err("Can't add interface (%s): out of memory\n",
+		       net_dev->name);
+		goto release_dev;
+	}
+
+	ret = sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
+	if (ret)
+		goto free_if;
+
+	hard_iface->if_num = -1;
+	hard_iface->net_dev = net_dev;
+	hard_iface->soft_iface = NULL;
+	hard_iface->if_status = IF_NOT_IN_USE;
+	INIT_LIST_HEAD(&hard_iface->list);
+	/* extra reference for return */
+	atomic_set(&hard_iface->refcount, 2);
+
+	check_known_mac_addr(hard_iface->net_dev);
+	list_add_tail_rcu(&hard_iface->list, &hardif_list);
+
+	return hard_iface;
+
+free_if:
+	kfree(hard_iface);
+release_dev:
+	dev_put(net_dev);
+out:
+	return NULL;
+}
+
+/* disables @hard_iface if it is still in use, removes its sysfs entries and
+ * drops one reference on it.  Must be called with the rtnl lock held.
+ * Nothing is removed if the interface could not be brought to
+ * IF_NOT_IN_USE. */
+static void hardif_remove_interface(struct hard_iface *hard_iface)
+{
+	ASSERT_RTNL();
+
+	if (hard_iface->if_status != IF_NOT_IN_USE) {
+		/* first deactivate interface */
+		hardif_disable_interface(hard_iface);
+
+		/* disabling did not succeed, leave the object alone */
+		if (hard_iface->if_status != IF_NOT_IN_USE)
+			return;
+	}
+
+	hard_iface->if_status = IF_TO_BE_REMOVED;
+	sysfs_del_hardif(&hard_iface->hardif_obj);
+	hardif_free_ref(hard_iface);
+}
+
+/* unlinks every entry from hardif_list and removes it via
+ * hardif_remove_interface(); takes the rtnl lock for the whole walk */
+void hardif_remove_interfaces(void)
+{
+	struct hard_iface *hard_iface, *hard_iface_tmp;
+
+	rtnl_lock();
+	/* _safe variant: the current entry is unlinked inside the loop */
+	list_for_each_entry_safe(hard_iface, hard_iface_tmp,
+				 &hardif_list, list) {
+		list_del_rcu(&hard_iface->list);
+		hardif_remove_interface(hard_iface);
+	}
+	rtnl_unlock();
+}
+
+/**
+ * hard_if_event - netdevice notifier callback
+ * @this: notifier block (unused)
+ * @event: NETDEV_* event code
+ * @ptr: the net_device the event refers to
+ *
+ * Creates a hard_iface for newly registered devices and keeps the state of
+ * known hard interfaces in sync with the events of their net_device.
+ * Always returns NOTIFY_DONE.
+ */
+static int hard_if_event(struct notifier_block *this,
+			 unsigned long event, void *ptr)
+{
+	struct net_device *net_dev = (struct net_device *)ptr;
+	/* lookup returns with a reference held, dropped at hardif_put */
+	struct hard_iface *hard_iface = hardif_get_by_netdev(net_dev);
+	struct hard_iface *primary_if = NULL;
+	struct bat_priv *bat_priv;
+
+	/* hardif_add_interface() also returns with a reference for us */
+	if (!hard_iface && event == NETDEV_REGISTER)
+		hard_iface = hardif_add_interface(net_dev);
+
+	if (!hard_iface)
+		goto out;
+
+	switch (event) {
+	case NETDEV_UP:
+		hardif_activate_interface(hard_iface);
+		break;
+	case NETDEV_GOING_DOWN:
+	case NETDEV_DOWN:
+		hardif_deactivate_interface(hard_iface);
+		break;
+	case NETDEV_UNREGISTER:
+		/* unlink first so that lookups no longer find the entry */
+		list_del_rcu(&hard_iface->list);
+
+		hardif_remove_interface(hard_iface);
+		break;
+	case NETDEV_CHANGEMTU:
+		if (hard_iface->soft_iface)
+			update_min_mtu(hard_iface->soft_iface);
+		break;
+	case NETDEV_CHANGEADDR:
+		/* no packet template / soft interface to update */
+		if (hard_iface->if_status == IF_NOT_IN_USE)
+			goto hardif_put;
+
+		check_known_mac_addr(hard_iface->net_dev);
+		update_mac_addresses(hard_iface);
+
+		bat_priv = netdev_priv(hard_iface->soft_iface);
+		primary_if = primary_if_get_selected(bat_priv);
+		if (!primary_if)
+			goto hardif_put;
+
+		/* the vis packet carries the primary interface's address */
+		if (hard_iface == primary_if)
+			primary_if_update_addr(bat_priv);
+		break;
+	default:
+		break;
+	};
+
+hardif_put:
+	hardif_free_ref(hard_iface);
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	return NOTIFY_DONE;
+}
+
+/**
+ * batman_skb_recv - receive a packet with the batman ethertype coming in on
+ *  a hard interface
+ * @skb: the received buffer
+ * @dev: receiving device (unused)
+ * @ptype: packet_type embedded in the receiving hard_iface
+ * @orig_dev: original receiving device (unused)
+ *
+ * Validates the frame and hands it to the handler matching its packet type.
+ * Returns NET_RX_DROP if the frame was rejected before dispatching and
+ * NET_RX_SUCCESS otherwise, even when the handler dropped the packet.
+ */
+static int batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
+			   struct packet_type *ptype,
+			   struct net_device *orig_dev)
+{
+	struct bat_priv *bat_priv;
+	struct batman_packet *batman_packet;
+	struct hard_iface *hard_iface;
+	int ret;
+
+	hard_iface = container_of(ptype, struct hard_iface, batman_adv_ptype);
+	skb = skb_share_check(skb, GFP_ATOMIC);
+
+	/* skb was released by skb_share_check() */
+	if (!skb)
+		goto err_out;
+
+	/* packet should hold at least type and version */
+	if (unlikely(!pskb_may_pull(skb, 2)))
+		goto err_free;
+
+	/* expect a valid ethernet header here. */
+	if (unlikely(skb->mac_len != sizeof(struct ethhdr)
+				|| !skb_mac_header(skb)))
+		goto err_free;
+
+	/* interface is not attached to a mesh interface */
+	if (!hard_iface->soft_iface)
+		goto err_free;
+
+	bat_priv = netdev_priv(hard_iface->soft_iface);
+
+	if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
+		goto err_free;
+
+	/* discard frames on not active interfaces */
+	if (hard_iface->if_status != IF_ACTIVE)
+		goto err_free;
+
+	/* only packet_type and version are read through this pointer here */
+	batman_packet = (struct batman_packet *)skb->data;
+
+	if (batman_packet->version != COMPAT_VERSION) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: incompatible batman version (%i)\n",
+			batman_packet->version);
+		goto err_free;
+	}
+
+	/* all receive handlers return whether they received or reused
+	 * the supplied skb. if not, we have to free the skb. */
+
+	switch (batman_packet->packet_type) {
+		/* batman originator packet */
+	case BAT_PACKET:
+		ret = recv_bat_packet(skb, hard_iface);
+		break;
+
+		/* batman icmp packet */
+	case BAT_ICMP:
+		ret = recv_icmp_packet(skb, hard_iface);
+		break;
+
+		/* unicast packet */
+	case BAT_UNICAST:
+		ret = recv_unicast_packet(skb, hard_iface);
+		break;
+
+		/* fragmented unicast packet */
+	case BAT_UNICAST_FRAG:
+		ret = recv_ucast_frag_packet(skb, hard_iface);
+		break;
+
+		/* broadcast packet */
+	case BAT_BCAST:
+		ret = recv_bcast_packet(skb, hard_iface);
+		break;
+
+		/* vis packet */
+	case BAT_VIS:
+		ret = recv_vis_packet(skb, hard_iface);
+		break;
+	default:
+		/* unknown packet type */
+		ret = NET_RX_DROP;
+	}
+
+	/* the handler did not take ownership of the skb */
+	if (ret == NET_RX_DROP)
+		kfree_skb(skb);
+
+	/* return NET_RX_SUCCESS in any case as we
+	 * most probably dropped the packet for
+	 * routing-logical reasons. */
+
+	return NET_RX_SUCCESS;
+
+err_free:
+	kfree_skb(skb);
+err_out:
+	return NET_RX_DROP;
+}
+
+/* notifier block that routes events to hard_if_event()
+ * NOTE(review): presumably registered as netdevice notifier elsewhere -
+ * confirm */
+struct notifier_block hard_if_notifier = {
+	.notifier_call = hard_if_event,
+};
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
new file mode 100644
index 00000000..64265991
--- /dev/null
+++ b/net/batman-adv/hard-interface.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
+#define _NET_BATMAN_ADV_HARD_INTERFACE_H_
+
+/* values of hard_iface->if_status */
+#define IF_NOT_IN_USE 0
+#define IF_TO_BE_REMOVED 1
+#define IF_INACTIVE 2
+#define IF_ACTIVE 3
+#define IF_TO_BE_ACTIVATED 4
+#define IF_I_WANT_YOU 5
+
+extern struct notifier_block hard_if_notifier;
+
+struct hard_iface *hardif_get_by_netdev(struct net_device *net_dev);
+int hardif_enable_interface(struct hard_iface *hard_iface, char *iface_name);
+void hardif_disable_interface(struct hard_iface *hard_iface);
+void hardif_remove_interfaces(void);
+int hardif_min_mtu(struct net_device *soft_iface);
+void update_min_mtu(struct net_device *soft_iface);
+void hardif_free_rcu(struct rcu_head *rcu);
+
+/* drops one reference on @hard_iface; when the last one is gone the object
+ * is handed to hardif_free_rcu() through call_rcu() */
+static inline void hardif_free_ref(struct hard_iface *hard_iface)
+{
+	if (!atomic_dec_and_test(&hard_iface->refcount))
+		return;
+
+	call_rcu(&hard_iface->rcu, hardif_free_rcu);
+}
+
+/* returns the primary interface of the mesh with its refcount increased
+ * (release with hardif_free_ref()), or NULL if none is selected or its
+ * refcount already dropped to zero */
+static inline struct hard_iface *primary_if_get_selected(
+						struct bat_priv *bat_priv)
+{
+	struct hard_iface *primary;
+
+	rcu_read_lock();
+	primary = rcu_dereference(bat_priv->primary_if);
+	if (primary && !atomic_inc_not_zero(&primary->refcount))
+		primary = NULL;
+	rcu_read_unlock();
+
+	return primary;
+}
+
+#endif /* _NET_BATMAN_ADV_HARD_INTERFACE_H_ */
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
new file mode 100644
index 00000000..c5213d8f
--- /dev/null
+++ b/net/batman-adv/hash.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "hash.h"
+
+/* resets every bucket to an empty list and initialises its lock */
+static void hash_init(struct hashtable_t *hash)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < hash->size; bucket++) {
+		struct hlist_head *head = &hash->table[bucket];
+		spinlock_t *lock = &hash->list_locks[bucket];
+
+		INIT_HLIST_HEAD(head);
+		spin_lock_init(lock);
+	}
+}
+
+/* releases the bucket array, the lock array and the hash object itself;
+ * elements still linked into the buckets are not touched */
+void hash_destroy(struct hashtable_t *hash)
+{
+	kfree(hash->table);
+	kfree(hash->list_locks);
+	kfree(hash);
+}
+
+/**
+ * hash_new - allocate and initialise an empty hash table
+ * @size: number of buckets
+ *
+ * Returns the new table or NULL if one of the allocations failed.  Release
+ * it with hash_destroy() or hash_delete().
+ */
+struct hashtable_t *hash_new(int size)
+{
+	struct hashtable_t *hash;
+
+	hash = kmalloc(sizeof(*hash), GFP_ATOMIC);
+	if (!hash)
+		return NULL;
+
+	/* size the arrays by what they actually hold; the bucket array used
+	 * to be sized with an unrelated pointer type */
+	hash->table = kmalloc(sizeof(*hash->table) * size, GFP_ATOMIC);
+	if (!hash->table)
+		goto free_hash;
+
+	hash->list_locks = kmalloc(sizeof(*hash->list_locks) * size,
+				   GFP_ATOMIC);
+	if (!hash->list_locks)
+		goto free_table;
+
+	hash->size = size;
+	hash_init(hash);
+	return hash;
+
+free_table:
+	kfree(hash->table);
+free_hash:
+	kfree(hash);
+	return NULL;
+}
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
new file mode 100644
index 00000000..434822b2
--- /dev/null
+++ b/net/batman-adv/hash.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2006-2011 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_HASH_H_
+#define _NET_BATMAN_ADV_HASH_H_
+
+#include <linux/list.h>
+
+/* callback to a compare function.  should
+ * compare 2 element datas for their keys,
+ * return non-zero if same and 0 if not
+ * same */
+typedef int (*hashdata_compare_cb)(struct hlist_node *, void *);
+
+/* the hashfunction, should return an index
+ * based on the key in the data of the first
+ * argument and the size the second */
+typedef int (*hashdata_choose_cb)(void *, int);
+typedef void (*hashdata_free_cb)(struct hlist_node *, void *);
+
+/* hash table with one list and one lock per bucket; writers take
+ * list_locks[i] before modifying table[i] */
+struct hashtable_t {
+	struct hlist_head *table;   /* the hashtable itself with the buckets */
+	spinlock_t *list_locks;     /* spinlock for each hash list entry */
+	int size;		    /* size of hashtable */
+};
+
+/* allocates and clears the hash */
+struct hashtable_t *hash_new(int size);
+
+/* free only the hashtable and the hash itself. */
+void hash_destroy(struct hashtable_t *hash);
+
+/* remove the hash structure. if hashdata_free_cb != NULL, this function will be
+ * called to remove the elements inside of the hash.  if you don't remove the
+ * elements, memory might be leaked.
+ *
+ * free_cb is invoked with the lock of the element's bucket held and receives
+ * the already unlinked node together with arg. */
+static inline void hash_delete(struct hashtable_t *hash,
+			       hashdata_free_cb free_cb, void *arg)
+{
+	struct hlist_head *head;
+	struct hlist_node *node, *node_tmp;
+	spinlock_t *list_lock; /* spinlock to protect write access */
+	int i;
+
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+		list_lock = &hash->list_locks[i];
+
+		spin_lock_bh(list_lock);
+		/* _safe variant: nodes are unlinked while walking the list */
+		hlist_for_each_safe(node, node_tmp, head) {
+			hlist_del_rcu(node);
+
+			if (free_cb)
+				free_cb(node, arg);
+		}
+		spin_unlock_bh(list_lock);
+	}
+
+	/* finally release the table itself */
+	hash_destroy(hash);
+}
+
+/**
+ * hash_add - add an element to the hash table unless its key already exists
+ * @hash: the table, may be NULL
+ * @compare: returns non-zero if a stored node matches @data
+ * @choose: maps @data to a bucket index for the given table size
+ * @data: key (or structure carrying the key) of the new element
+ * @data_node: list node of the new element
+ *
+ * The duplicate check and the insertion are done under the bucket lock so
+ * that two concurrent callers cannot both insert the same key.
+ *
+ * Returns 0 on success, -1 if @hash is NULL or the key is already present.
+ */
+static inline int hash_add(struct hashtable_t *hash,
+			   hashdata_compare_cb compare,
+			   hashdata_choose_cb choose,
+			   void *data, struct hlist_node *data_node)
+{
+	int index;
+	int ret = -1;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	spinlock_t *list_lock; /* spinlock to protect write access */
+
+	if (!hash)
+		goto out;
+
+	index = choose(data, hash->size);
+	head = &hash->table[index];
+	list_lock = &hash->list_locks[index];
+
+	spin_lock_bh(list_lock);
+
+	hlist_for_each(node, head) {
+		/* duplicate found, refuse to add */
+		if (compare(node, data))
+			goto unlock;
+	}
+
+	/* no duplicate found in list, add new element */
+	hlist_add_head_rcu(data_node, head);
+	ret = 0;
+
+unlock:
+	spin_unlock_bh(list_lock);
+out:
+	return ret;
+}
+
+/* removes data from hash, if found. returns pointer to the removed list node
+ * on success, so you can free the structure embedding it yourself, or NULL
+ * if no element matched.  data could be the structure you use with just the
+ * key filled, we just need the key for comparing. */
+static inline void *hash_remove(struct hashtable_t *hash,
+				hashdata_compare_cb compare,
+				hashdata_choose_cb choose, void *data)
+{
+	size_t index = choose(data, hash->size);
+	struct hlist_head *head = &hash->table[index];
+	spinlock_t *list_lock = &hash->list_locks[index];
+	struct hlist_node *node;
+	void *removed = NULL;
+
+	spin_lock_bh(list_lock);
+	hlist_for_each(node, head) {
+		if (compare(node, data)) {
+			/* first match wins */
+			removed = node;
+			hlist_del_rcu(node);
+			break;
+		}
+	}
+	spin_unlock_bh(list_lock);
+
+	return removed;
+}
+
+#endif /* _NET_BATMAN_ADV_HASH_H_ */
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
new file mode 100644
index 00000000..fa22ba2b
--- /dev/null
+++ b/net/batman-adv/icmp_socket.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include "icmp_socket.h"
+#include "send.h"
+#include "hash.h"
+#include "originator.h"
+#include "hard-interface.h"
+
+static struct socket_client *socket_client_hash[256];
+
+static void bat_socket_add_packet(struct socket_client *socket_client,
+				  struct icmp_packet_rr *icmp_packet,
+				  size_t icmp_len);
+
+/* marks every slot of the client table as unused */
+void bat_socket_init(void)
+{
+	unsigned int slot;
+
+	for (slot = 0; slot < ARRAY_SIZE(socket_client_hash); slot++)
+		socket_client_hash[slot] = NULL;
+}
+
+/**
+ * bat_socket_open - register a new client of the icmp socket file
+ * @inode: inode of the opened file; i_private is stored as the client's
+ *	   bat_priv
+ * @file: file being opened; private_data is set to the new client
+ *
+ * Returns 0 on success, -ENOMEM if the client cannot be allocated or
+ * -EXFULL if all slots of socket_client_hash are in use.
+ */
+static int bat_socket_open(struct inode *inode, struct file *file)
+{
+	unsigned int i;
+	struct socket_client *socket_client;
+
+	nonseekable_open(inode, file);
+
+	socket_client = kmalloc(sizeof(struct socket_client), GFP_KERNEL);
+
+	if (!socket_client)
+		return -ENOMEM;
+
+	/* set the client up completely before it becomes reachable through
+	 * socket_client_hash: bat_socket_receive_packet() takes the lock and
+	 * touches the queue of whatever client it finds in the table */
+	INIT_LIST_HEAD(&socket_client->queue_list);
+	socket_client->queue_len = 0;
+	socket_client->bat_priv = inode->i_private;
+	spin_lock_init(&socket_client->lock);
+	init_waitqueue_head(&socket_client->queue_wait);
+
+	for (i = 0; i < ARRAY_SIZE(socket_client_hash); i++) {
+		if (!socket_client_hash[i]) {
+			socket_client->index = i;
+			socket_client_hash[i] = socket_client;
+			break;
+		}
+	}
+
+	if (i == ARRAY_SIZE(socket_client_hash)) {
+		pr_err("Error - can't add another packet client: "
+		       "maximum number of clients reached\n");
+		kfree(socket_client);
+		return -EXFULL;
+	}
+
+	file->private_data = socket_client;
+
+	inc_module_count();
+	return 0;
+}
+
+/* called when the icmp socket file is closed: frees all packets still
+ * queued for the client, clears its slot in socket_client_hash and frees
+ * the client itself. always returns 0. */
+static int bat_socket_release(struct inode *inode, struct file *file)
+{
+	struct socket_client *client = file->private_data;
+	struct socket_packet *pkt, *pkt_next;
+
+	spin_lock_bh(&client->lock);
+
+	/* drop everything that is still waiting to be read */
+	list_for_each_entry_safe(pkt, pkt_next, &client->queue_list, list) {
+		list_del(&pkt->list);
+		kfree(pkt);
+	}
+
+	/* give the table slot back */
+	socket_client_hash[client->index] = NULL;
+	spin_unlock_bh(&client->lock);
+
+	kfree(client);
+	dec_module_count();
+
+	return 0;
+}
+
+/**
+ * bat_socket_read - hand the oldest queued icmp packet to userspace
+ * @file: opened icmp socket file; private_data holds the socket client
+ * @buf: userspace buffer to copy the packet into
+ * @count: size of @buf, has to be at least sizeof(struct icmp_packet)
+ * @ppos: unused
+ *
+ * Blocks until a packet is queued unless O_NONBLOCK is set.
+ *
+ * Returns the number of bytes copied (never more than @count), -EAGAIN if
+ * the file is non-blocking and the queue is empty, -EINVAL for a missing or
+ * too small buffer, -EFAULT if @buf is not writable, or the error reported
+ * by wait_event_interruptible().
+ */
+static ssize_t bat_socket_read(struct file *file, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct socket_client *socket_client = file->private_data;
+	struct socket_packet *socket_packet;
+	size_t packet_len;
+	int error;
+
+	if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0))
+		return -EAGAIN;
+
+	if ((!buf) || (count < sizeof(struct icmp_packet)))
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	error = wait_event_interruptible(socket_client->queue_wait,
+					 socket_client->queue_len);
+
+	if (error)
+		return error;
+
+	spin_lock_bh(&socket_client->lock);
+
+	socket_packet = list_first_entry(&socket_client->queue_list,
+					 struct socket_packet, list);
+	list_del(&socket_packet->list);
+	socket_client->queue_len--;
+
+	spin_unlock_bh(&socket_client->lock);
+
+	/* never write more than the caller asked for: only count bytes of
+	 * buf were verified by access_ok() above, while the queued packet
+	 * can be longer than that (e.g. a struct icmp_packet_rr) */
+	packet_len = socket_packet->icmp_len;
+	if (packet_len > count)
+		packet_len = count;
+
+	error = __copy_to_user(buf, &socket_packet->icmp_packet, packet_len);
+
+	kfree(socket_packet);
+
+	if (error)
+		return -EFAULT;
+
+	return packet_len;
+}
+
+/**
+ * bat_socket_write - send a batman icmp echo request written by userspace
+ * @file: opened icmp socket file; private_data holds the socket client
+ * @buff: userspace buffer containing the icmp packet
+ * @len: number of bytes offered by the caller
+ * @off: unused
+ *
+ * Only sizeof(struct icmp_packet) or, if len is large enough,
+ * sizeof(struct icmp_packet_rr) bytes are copied from buff. Packets with the
+ * wrong compat version or without a usable route are not transmitted;
+ * instead a PARAMETER_PROBLEM or DESTINATION_UNREACHABLE answer is queued
+ * for the writing client itself.
+ *
+ * Returns len on those paths and after sending, -EINVAL for a too short
+ * packet or a wrong packet/message type, -EFAULT if no primary interface is
+ * selected or buff cannot be read and -ENOMEM if no skb could be allocated.
+ */
+static ssize_t bat_socket_write(struct file *file, const char __user *buff,
+				size_t len, loff_t *off)
+{
+	struct socket_client *socket_client = file->private_data;
+	struct bat_priv *bat_priv = socket_client->bat_priv;
+	struct hard_iface *primary_if = NULL;
+	struct sk_buff *skb;
+	struct icmp_packet_rr *icmp_packet;
+
+	struct orig_node *orig_node = NULL;
+	struct neigh_node *neigh_node = NULL;
+	size_t packet_len = sizeof(struct icmp_packet);
+
+	if (len < sizeof(struct icmp_packet)) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Error - can't send packet from char device: "
+			"invalid packet size\n");
+		return -EINVAL;
+	}
+
+	primary_if = primary_if_get_selected(bat_priv);
+
+	if (!primary_if) {
+		len = -EFAULT;
+		goto out;
+	}
+
+	/* enough data for the record route variant of the packet */
+	if (len >= sizeof(struct icmp_packet_rr))
+		packet_len = sizeof(struct icmp_packet_rr);
+
+	skb = dev_alloc_skb(packet_len + sizeof(struct ethhdr));
+	if (!skb) {
+		len = -ENOMEM;
+		goto out;
+	}
+
+	/* leave headroom for the ethernet header */
+	skb_reserve(skb, sizeof(struct ethhdr));
+	icmp_packet = (struct icmp_packet_rr *)skb_put(skb, packet_len);
+
+	if (!access_ok(VERIFY_READ, buff, packet_len)) {
+		len = -EFAULT;
+		goto free_skb;
+	}
+
+	if (__copy_from_user(icmp_packet, buff, packet_len)) {
+		len = -EFAULT;
+		goto free_skb;
+	}
+
+	if (icmp_packet->packet_type != BAT_ICMP) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Error - can't send packet from char device: "
+			"got bogus packet type (expected: BAT_ICMP)\n");
+		len = -EINVAL;
+		goto free_skb;
+	}
+
+	if (icmp_packet->msg_type != ECHO_REQUEST) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Error - can't send packet from char device: "
+			"got bogus message type (expected: ECHO_REQUEST)\n");
+		len = -EINVAL;
+		goto free_skb;
+	}
+
+	/* the uid is the client's slot in socket_client_hash; it is what
+	 * bat_socket_receive_packet() uses to find the client again */
+	icmp_packet->uid = socket_client->index;
+
+	/* version mismatch: answer the client locally, nothing is sent */
+	if (icmp_packet->version != COMPAT_VERSION) {
+		icmp_packet->msg_type = PARAMETER_PROBLEM;
+		icmp_packet->ttl = COMPAT_VERSION;
+		bat_socket_add_packet(socket_client, icmp_packet, packet_len);
+		goto free_skb;
+	}
+
+	if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
+		goto dst_unreach;
+
+	orig_node = orig_hash_find(bat_priv, icmp_packet->dst);
+	if (!orig_node)
+		goto dst_unreach;
+
+	neigh_node = orig_node_get_router(orig_node);
+	if (!neigh_node)
+		goto dst_unreach;
+
+	if (!neigh_node->if_incoming)
+		goto dst_unreach;
+
+	if (neigh_node->if_incoming->if_status != IF_ACTIVE)
+		goto dst_unreach;
+
+	memcpy(icmp_packet->orig,
+	       primary_if->net_dev->dev_addr, ETH_ALEN);
+
+	if (packet_len == sizeof(struct icmp_packet_rr))
+		memcpy(icmp_packet->rr,
+		       neigh_node->if_incoming->net_dev->dev_addr, ETH_ALEN);
+
+	/* the skb is not freed here on the send path */
+	send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+	goto out;
+
+dst_unreach:
+	icmp_packet->msg_type = DESTINATION_UNREACHABLE;
+	bat_socket_add_packet(socket_client, icmp_packet, packet_len);
+free_skb:
+	kfree_skb(skb);
+out:
+	/* drop whatever references were taken above */
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	if (neigh_node)
+		neigh_node_free_ref(neigh_node);
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	return len;
+}
+
+/* poll handler: registers the client's wait queue with the poll table and
+ * reports the file as readable as soon as at least one packet is queued */
+static unsigned int bat_socket_poll(struct file *file, poll_table *wait)
+{
+	struct socket_client *client = file->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(file, &client->queue_wait, wait);
+
+	if (client->queue_len > 0)
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/* file operations of the icmp socket file that bat_socket_setup() creates;
+ * seeking is not supported (no_llseek) */
+static const struct file_operations fops = {
+	.owner = THIS_MODULE,
+	.open = bat_socket_open,
+	.release = bat_socket_release,
+	.read = bat_socket_read,
+	.write = bat_socket_write,
+	.poll = bat_socket_poll,
+	.llseek = no_llseek,
+};
+
+/**
+ * bat_socket_setup - create the icmp socket file in the mesh's debugfs dir
+ * @bat_priv: mesh instance; its debug_dir is the parent directory and the
+ *	      instance itself is passed as the file's private data
+ *
+ * Returns 0 on success, 1 if there is no debug directory or the file could
+ * not be created.
+ */
+int bat_socket_setup(struct bat_priv *bat_priv)
+{
+	struct dentry *d;
+
+	if (!bat_priv->debug_dir)
+		goto err;
+
+	d = debugfs_create_file(ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR,
+				bat_priv->debug_dir, bat_priv, &fops);
+	/* a NULL dentry means the file was not created */
+	if (!d)
+		goto err;
+
+	return 0;
+
+err:
+	return 1;
+}
+
+/* queues a copy of icmp_packet (icmp_len bytes) for socket_client and wakes
+ * up readers waiting on it. the packet is silently dropped if no memory is
+ * available or if the slot addressed by the packet's uid is empty. the
+ * queue is limited to 100 packets - the oldest one is discarded when the
+ * limit is exceeded. */
+static void bat_socket_add_packet(struct socket_client *socket_client,
+				  struct icmp_packet_rr *icmp_packet,
+				  size_t icmp_len)
+{
+	struct socket_packet *entry, *oldest;
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return;
+
+	INIT_LIST_HEAD(&entry->list);
+	memcpy(&entry->icmp_packet, icmp_packet, icmp_len);
+	entry->icmp_len = icmp_len;
+
+	spin_lock_bh(&socket_client->lock);
+
+	/* the socket_client could have been deleted while we were waiting
+	 * for the lock */
+	if (!socket_client_hash[icmp_packet->uid]) {
+		spin_unlock_bh(&socket_client->lock);
+		kfree(entry);
+		return;
+	}
+
+	list_add_tail(&entry->list, &socket_client->queue_list);
+	socket_client->queue_len++;
+
+	/* queue too long - throw away the packet at the head */
+	if (socket_client->queue_len > 100) {
+		oldest = list_first_entry(&socket_client->queue_list,
+					  struct socket_packet, list);
+
+		list_del(&oldest->list);
+		kfree(oldest);
+		socket_client->queue_len--;
+	}
+
+	spin_unlock_bh(&socket_client->lock);
+
+	wake_up(&socket_client->queue_wait);
+}
+
+/* hands a received icmp packet to the client registered in the slot given
+ * by the packet's uid; does nothing if that slot is empty */
+void bat_socket_receive_packet(struct icmp_packet_rr *icmp_packet,
+			       size_t icmp_len)
+{
+	struct socket_client *client;
+
+	client = socket_client_hash[icmp_packet->uid];
+	if (!client)
+		return;
+
+	bat_socket_add_packet(client, icmp_packet, icmp_len);
+}
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
new file mode 100644
index 00000000..462b190f
--- /dev/null
+++ b/net/batman-adv/icmp_socket.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
+#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
+
+#define ICMP_SOCKET "socket"
+
+void bat_socket_init(void);
+int bat_socket_setup(struct bat_priv *bat_priv);
+void bat_socket_receive_packet(struct icmp_packet_rr *icmp_packet,
+			       size_t icmp_len);
+
+#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
new file mode 100644
index 00000000..0a7cee00
--- /dev/null
+++ b/net/batman-adv/main.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "bat_sysfs.h"
+#include "bat_debugfs.h"
+#include "routing.h"
+#include "send.h"
+#include "originator.h"
+#include "soft-interface.h"
+#include "icmp_socket.h"
+#include "translation-table.h"
+#include "hard-interface.h"
+#include "gateway_client.h"
+#include "vis.h"
+#include "hash.h"
+
+
+/* List manipulations on hardif_list have to be rtnl_lock()'ed,
+ * list traversals just rcu-locked */
+struct list_head hardif_list;
+
+unsigned char broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+struct workqueue_struct *bat_event_workqueue;
+
+/* module entry point: initialises the hard interface list, creates the
+ * event workqueue, calls bat_socket_init() and debugfs_init() and registers
+ * the netdevice notifier. returns 0 on success or -ENOMEM if the workqueue
+ * could not be created. */
+static int __init batman_init(void)
+{
+	INIT_LIST_HEAD(&hardif_list);
+
+	/* the name should not be longer than 10 chars - see
+	 * http://lwn.net/Articles/23634/ */
+	bat_event_workqueue = create_singlethread_workqueue("bat_events");
+
+	if (!bat_event_workqueue)
+		return -ENOMEM;
+
+	bat_socket_init();
+	debugfs_init();
+
+	/* the return value of the notifier registration is not checked */
+	register_netdevice_notifier(&hard_if_notifier);
+
+	pr_info("B.A.T.M.A.N. advanced %s%s (compatibility version %i) "
+		"loaded\n", SOURCE_VERSION, REVISION_VERSION_STR,
+		COMPAT_VERSION);
+
+	return 0;
+}
+
+/* module exit point: undoes batman_init() - removes debugfs entries, the
+ * netdevice notifier and all hard interfaces, then flushes and destroys the
+ * event workqueue */
+static void __exit batman_exit(void)
+{
+	debugfs_destroy();
+	unregister_netdevice_notifier(&hard_if_notifier);
+	hardif_remove_interfaces();
+
+	flush_workqueue(bat_event_workqueue);
+	destroy_workqueue(bat_event_workqueue);
+	bat_event_workqueue = NULL;
+
+	/* wait for pending rcu callbacks before the module goes away */
+	rcu_barrier();
+}
+
+/* initialises the locks, lists and tables of the mesh instance behind
+ * soft_iface and marks the mesh as active. returns 0 on success; if one of
+ * the init helpers fails the instance is torn down again via mesh_free()
+ * and -1 is returned. */
+int mesh_init(struct net_device *soft_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+
+	spin_lock_init(&bat_priv->forw_bat_list_lock);
+	spin_lock_init(&bat_priv->forw_bcast_list_lock);
+	spin_lock_init(&bat_priv->tt_lhash_lock);
+	spin_lock_init(&bat_priv->tt_ghash_lock);
+	spin_lock_init(&bat_priv->gw_list_lock);
+	spin_lock_init(&bat_priv->vis_hash_lock);
+	spin_lock_init(&bat_priv->vis_list_lock);
+	spin_lock_init(&bat_priv->softif_neigh_lock);
+	spin_lock_init(&bat_priv->softif_neigh_vid_lock);
+
+	INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
+	INIT_HLIST_HEAD(&bat_priv->forw_bcast_list);
+	INIT_HLIST_HEAD(&bat_priv->gw_list);
+	INIT_HLIST_HEAD(&bat_priv->softif_neigh_vids);
+
+	/* evaluated left to right, stops at the first helper that fails */
+	if (originator_init(bat_priv) < 1 ||
+	    tt_local_init(bat_priv) < 1 ||
+	    tt_global_init(bat_priv) < 1)
+		goto err;
+
+	tt_local_add(soft_iface, soft_iface->dev_addr);
+
+	if (vis_init(bat_priv) < 1)
+		goto err;
+
+	atomic_set(&bat_priv->mesh_state, MESH_ACTIVE);
+	return 0;
+
+err:
+	pr_err("Unable to allocate memory for mesh information structures: "
+	       "out of mem ?\n");
+	mesh_free(soft_iface);
+	return -1;
+}
+
+/* tears down the mesh instance behind soft_iface. the mesh state is set to
+ * MESH_DEACTIVATING while the individual parts are released and to
+ * MESH_INACTIVE once everything is gone. */
+void mesh_free(struct net_device *soft_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+
+	atomic_set(&bat_priv->mesh_state, MESH_DEACTIVATING);
+
+	purge_outstanding_packets(bat_priv, NULL);
+
+	vis_quit(bat_priv);
+
+	gw_node_purge(bat_priv);
+	originator_free(bat_priv);
+
+	tt_local_free(bat_priv);
+	tt_global_free(bat_priv);
+
+	softif_neigh_purge(bat_priv);
+
+	atomic_set(&bat_priv->mesh_state, MESH_INACTIVE);
+}
+
+/* takes a reference on this module; the result of try_module_get() is not
+ * checked */
+void inc_module_count(void)
+{
+	try_module_get(THIS_MODULE);
+}
+
+/* drops a module reference, the counterpart of inc_module_count() */
+void dec_module_count(void)
+{
+	module_put(THIS_MODULE);
+}
+
+/* returns 1 if addr is the mac address of one of the active hard interfaces
+ * in hardif_list, 0 otherwise */
+int is_my_mac(uint8_t *addr)
+{
+	struct hard_iface *hard_iface;
+	int found = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+		if (hard_iface->if_status == IF_ACTIVE &&
+		    compare_eth(hard_iface->net_dev->dev_addr, addr)) {
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+module_init(batman_init);
+module_exit(batman_exit);
+
+MODULE_LICENSE("GPL");
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_SUPPORTED_DEVICE(DRIVER_DEVICE);
+#ifdef REVISION_VERSION
+MODULE_VERSION(SOURCE_VERSION "-" REVISION_VERSION);
+#else
+MODULE_VERSION(SOURCE_VERSION);
+#endif
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
new file mode 100644
index 00000000..148b49e0
--- /dev/null
+++ b/net/batman-adv/main.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_MAIN_H_
+#define _NET_BATMAN_ADV_MAIN_H_
+
+#define DRIVER_AUTHOR "Marek Lindner <lindner_marek@yahoo.de>, " \
+		      "Simon Wunderlich <siwu@hrz.tu-chemnitz.de>"
+#define DRIVER_DESC   "B.A.T.M.A.N. advanced"
+#define DRIVER_DEVICE "batman-adv"
+
+#define SOURCE_VERSION "next"
+
+
+/* B.A.T.M.A.N. parameters */
+
+#define TQ_MAX_VALUE 255
+#define JITTER 20
+
+ /* Time To Live of broadcast messages */
+#define TTL 50
+
+/* purge originators after time in seconds if no valid packet comes in
+ * -> TODO: check influence on TQ_LOCAL_WINDOW_SIZE */
+#define PURGE_TIMEOUT 200
+#define TT_LOCAL_TIMEOUT 3600 /* in seconds */
+
+/* sliding packet range of received originator messages in sequence numbers
+ * (should be a multiple of our word size) */
+#define TQ_LOCAL_WINDOW_SIZE 64
+#define TQ_GLOBAL_WINDOW_SIZE 5
+#define TQ_LOCAL_BIDRECT_SEND_MINIMUM 1
+#define TQ_LOCAL_BIDRECT_RECV_MINIMUM 1
+#define TQ_TOTAL_BIDRECT_LIMIT 1
+
+#define NUM_WORDS (TQ_LOCAL_WINDOW_SIZE / WORD_BIT_SIZE)
+
+#define LOG_BUF_LEN 8192	  /* has to be a power of 2 */
+
+#define VIS_INTERVAL 5000	/* 5 seconds */
+
+/* how much worse secondary interfaces may be to be considered as bonding
+ * candidates */
+#define BONDING_TQ_THRESHOLD	50
+
+/* should not be bigger than 512 bytes or change the size of
+ * forw_packet->direct_link_flags */
+#define MAX_AGGREGATION_BYTES 512
+#define MAX_AGGREGATION_MS 100
+
+#define SOFTIF_NEIGH_TIMEOUT 180000 /* 3 minutes */
+
+/* don't reset again within 30 seconds */
+#define RESET_PROTECTION_MS 30000
+#define EXPECTED_SEQNO_RANGE	65536
+
+#define MESH_INACTIVE 0
+#define MESH_ACTIVE 1
+#define MESH_DEACTIVATING 2
+
+#define BCAST_QUEUE_LEN		256
+#define BATMAN_QUEUE_LEN	256
+
+/*
+ * Debug Messages
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+/* Prepend 'batman-adv: ' to kernel messages */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+/* all messages related to routing / flooding / broadcasting / etc */
+#define DBG_BATMAN 1
+/* route or tt entry added / changed / deleted */
+#define DBG_ROUTES 2
+#define DBG_ALL 3
+
+
+/*
+ *  Vis
+ */
+
+/*
+ * Kernel headers
+ */
+
+#include <linux/mutex.h>	/* mutex */
+#include <linux/module.h>	/* needed by all modules */
+#include <linux/netdevice.h>	/* netdevice */
+#include <linux/etherdevice.h>  /* ethernet address classification */
+#include <linux/if_ether.h>	/* ethernet header */
+#include <linux/poll.h>		/* poll_table */
+#include <linux/kthread.h>	/* kernel threads */
+#include <linux/pkt_sched.h>	/* schedule types */
+#include <linux/workqueue.h>	/* workqueue */
+#include <linux/slab.h>
+#include <net/sock.h>		/* struct sock */
+#include <linux/jiffies.h>
+#include <linux/seq_file.h>
+#include "types.h"
+
+#ifndef REVISION_VERSION
+#define REVISION_VERSION_STR ""
+#else
+#define REVISION_VERSION_STR " "REVISION_VERSION
+#endif
+
+extern struct list_head hardif_list;
+
+extern unsigned char broadcast_addr[];
+extern struct workqueue_struct *bat_event_workqueue;
+
+int mesh_init(struct net_device *soft_iface);
+void mesh_free(struct net_device *soft_iface);
+void inc_module_count(void);
+void dec_module_count(void);
+int is_my_mac(uint8_t *addr);
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+int debug_log(struct bat_priv *bat_priv, char *fmt, ...);
+
+/* passes the message to debug_log() if one of the bits in type is set in
+ * the log_level of bat_priv. the bat_priv and type arguments are
+ * parenthesized so that expressions can be passed for them without
+ * changing the meaning of the test. */
+#define bat_dbg(type, bat_priv, fmt, arg...)			\
+	do {							\
+		if (atomic_read(&(bat_priv)->log_level) & (type))	\
+			debug_log(bat_priv, fmt, ## arg);	\
+	}							\
+	while (0)
+#else /* !CONFIG_BATMAN_ADV_DEBUG */
+/* debugging disabled: empty stub that still accepts the same arguments */
+static inline void bat_dbg(char type __always_unused,
+			   struct bat_priv *bat_priv __always_unused,
+			   char *fmt __always_unused, ...)
+{
+}
+#endif
+
+/* prints an info message prefixed with the name of net_dev via pr_info()
+ * and also passes it to bat_dbg() with DBG_ALL for the bat_priv of net_dev.
+ * net_dev is evaluated only once. */
+#define bat_info(net_dev, fmt, arg...)					\
+	do {								\
+		struct net_device *_netdev = (net_dev);                 \
+		struct bat_priv *_batpriv = netdev_priv(_netdev);       \
+		bat_dbg(DBG_ALL, _batpriv, fmt, ## arg);		\
+		pr_info("%s: " fmt, _netdev->name, ## arg);		\
+	} while (0)
+/* prints an error message prefixed with the name of net_dev via pr_err()
+ * and also passes it to bat_dbg() with DBG_ALL for the bat_priv of net_dev.
+ * net_dev is evaluated only once. */
+#define bat_err(net_dev, fmt, arg...)					\
+	do {								\
+		struct net_device *_netdev = (net_dev);                 \
+		struct bat_priv *_batpriv = netdev_priv(_netdev);       \
+		bat_dbg(DBG_ALL, _batpriv, fmt, ## arg);		\
+		pr_err("%s: " fmt, _netdev->name, ## arg);		\
+	} while (0)
+
+/**
+ * compares the ETH_ALEN bytes at data1 and data2 and returns 1 if they are
+ * equal, 0 if they differ
+ *
+ * note: can't use compare_ether_addr() as it requires aligned memory
+ */
+static inline int compare_eth(void *data1, void *data2)
+{
+	return memcmp(data1, data2, ETH_ALEN) == 0;
+}
+
+/* decrements v by one unless it is already 0 (atomic_add_unless() with -1) */
+#define atomic_dec_not_zero(v)	atomic_add_unless((v), -1, 0)
+
+#endif /* _NET_BATMAN_ADV_MAIN_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
new file mode 100644
index 00000000..40a30bbc
--- /dev/null
+++ b/net/batman-adv/originator.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright (C) 2009-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "originator.h"
+#include "hash.h"
+#include "translation-table.h"
+#include "routing.h"
+#include "gateway_client.h"
+#include "hard-interface.h"
+#include "unicast.h"
+#include "soft-interface.h"
+
+static void purge_orig(struct work_struct *work);
+
+/* (re)arms the delayed work that runs purge_orig() for this mesh instance
+ * after one second (1 * HZ) on the batman event workqueue */
+static void start_purge_timer(struct bat_priv *bat_priv)
+{
+	INIT_DELAYED_WORK(&bat_priv->orig_work, purge_orig);
+	queue_delayed_work(bat_event_workqueue, &bat_priv->orig_work, 1 * HZ);
+}
+
+/* creates the originator hash (1024 buckets) of the mesh instance and
+ * starts the periodic purge work. returns 1 on success or if the hash
+ * already exists, 0 if the hash could not be allocated. */
+int originator_init(struct bat_priv *bat_priv)
+{
+	/* already set up - nothing to do */
+	if (bat_priv->orig_hash)
+		return 1;
+
+	bat_priv->orig_hash = hash_new(1024);
+	if (!bat_priv->orig_hash)
+		return 0;
+
+	start_purge_timer(bat_priv);
+	return 1;
+}
+
+/* drops one reference of neigh_node; when the last reference is gone the
+ * node is freed via kfree_rcu() */
+void neigh_node_free_ref(struct neigh_node *neigh_node)
+{
+	if (!atomic_dec_and_test(&neigh_node->refcount))
+		return;
+
+	kfree_rcu(neigh_node, rcu);
+}
+
+/* returns the current router of orig_node with its refcounter increased, or
+ * NULL if there is no router or its refcounter already dropped to zero */
+struct neigh_node *orig_node_get_router(struct orig_node *orig_node)
+{
+	struct neigh_node *router;
+
+	rcu_read_lock();
+
+	router = rcu_dereference(orig_node->router);
+	if (router) {
+		/* a router with refcount 0 is about to be freed */
+		if (!atomic_inc_not_zero(&router->refcount))
+			router = NULL;
+	}
+
+	rcu_read_unlock();
+
+	return router;
+}
+
+/* allocates a neighbor entry for the address neigh seen on if_incoming and
+ * links it into the neighbor list of orig_node. the new entry starts with a
+ * refcount of 2 (one of them being the extra reference for the returned
+ * pointer). returns NULL if the allocation failed. */
+struct neigh_node *create_neighbor(struct orig_node *orig_node,
+				   struct orig_node *orig_neigh_node,
+				   uint8_t *neigh,
+				   struct hard_iface *if_incoming)
+{
+	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct neigh_node *new_neigh;
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Creating new last-hop neighbor of originator\n");
+
+	new_neigh = kzalloc(sizeof(*new_neigh), GFP_ATOMIC);
+	if (!new_neigh)
+		return NULL;
+
+	new_neigh->orig_node = orig_neigh_node;
+	new_neigh->if_incoming = if_incoming;
+	memcpy(new_neigh->addr, neigh, ETH_ALEN);
+
+	INIT_HLIST_NODE(&new_neigh->list);
+	INIT_LIST_HEAD(&new_neigh->bonding_list);
+	spin_lock_init(&new_neigh->tq_lock);
+
+	/* extra reference for return */
+	atomic_set(&new_neigh->refcount, 2);
+
+	spin_lock_bh(&orig_node->neigh_list_lock);
+	hlist_add_head_rcu(&new_neigh->list, &orig_node->neigh_list);
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+
+	return new_neigh;
+}
+
+/* rcu callback scheduled by orig_node_free_ref(): releases everything that
+ * hangs off the originator (bonding members, neighbors, fragment list,
+ * global translation table entries, broadcast counters) and finally the
+ * originator itself */
+static void orig_node_free_rcu(struct rcu_head *rcu)
+{
+	struct hlist_node *node, *node_tmp;
+	struct neigh_node *neigh_node, *tmp_neigh_node;
+	struct orig_node *orig_node;
+
+	orig_node = container_of(rcu, struct orig_node, rcu);
+
+	spin_lock_bh(&orig_node->neigh_list_lock);
+
+	/* for all bonding members ... */
+	list_for_each_entry_safe(neigh_node, tmp_neigh_node,
+				 &orig_node->bond_list, bonding_list) {
+		list_del_rcu(&neigh_node->bonding_list);
+		neigh_node_free_ref(neigh_node);
+	}
+
+	/* for all neighbors towards this originator ... */
+	hlist_for_each_entry_safe(neigh_node, node, node_tmp,
+				  &orig_node->neigh_list, list) {
+		hlist_del_rcu(&neigh_node->list);
+		neigh_node_free_ref(neigh_node);
+	}
+
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+
+	frag_list_free(&orig_node->frag_list);
+	tt_global_del_orig(orig_node->bat_priv, orig_node,
+			    "originator timed out");
+
+	kfree(orig_node->bcast_own);
+	kfree(orig_node->bcast_own_sum);
+	kfree(orig_node);
+}
+
+/* drops one reference of orig_node; when the last reference is gone
+ * orig_node_free_rcu() is scheduled through call_rcu() */
+void orig_node_free_ref(struct orig_node *orig_node)
+{
+	if (!atomic_dec_and_test(&orig_node->refcount))
+		return;
+
+	call_rcu(&orig_node->rcu, orig_node_free_rcu);
+}
+
+/* stops the purge work, detaches the originator hash from bat_priv, drops
+ * the hash's reference on every originator in it and destroys the hash.
+ * does nothing if there is no hash. */
+void originator_free(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct hlist_node *pos, *next;
+	struct orig_node *orig_node;
+	int bucket;
+
+	if (!hash)
+		return;
+
+	cancel_delayed_work_sync(&bat_priv->orig_work);
+
+	bat_priv->orig_hash = NULL;
+
+	for (bucket = 0; bucket < hash->size; bucket++) {
+		/* each bucket has its own write lock */
+		spin_lock_bh(&hash->list_locks[bucket]);
+		hlist_for_each_entry_safe(orig_node, pos, next,
+					  &hash->table[bucket], hash_entry) {
+			hlist_del_rcu(pos);
+			orig_node_free_ref(orig_node);
+		}
+		spin_unlock_bh(&hash->list_locks[bucket]);
+	}
+
+	hash_destroy(hash);
+}
+
+/* this function finds or creates an originator entry for the given
+ * address if it does not exist. returns the entry found by orig_hash_find()
+ * or the newly created one; returns NULL if memory could not be allocated
+ * or the new entry could not be added to the originator hash. */
+struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr)
+{
+	struct orig_node *orig_node;
+	int size;
+	int hash_added;
+
+	orig_node = orig_hash_find(bat_priv, addr);
+	if (orig_node)
+		return orig_node;
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Creating new originator: %pM\n", addr);
+
+	orig_node = kzalloc(sizeof(struct orig_node), GFP_ATOMIC);
+	if (!orig_node)
+		return NULL;
+
+	INIT_HLIST_HEAD(&orig_node->neigh_list);
+	INIT_LIST_HEAD(&orig_node->bond_list);
+	spin_lock_init(&orig_node->ogm_cnt_lock);
+	spin_lock_init(&orig_node->bcast_seqno_lock);
+	spin_lock_init(&orig_node->neigh_list_lock);
+
+	/* extra reference for return */
+	atomic_set(&orig_node->refcount, 2);
+
+	orig_node->bat_priv = bat_priv;
+	memcpy(orig_node->orig, addr, ETH_ALEN);
+	orig_node->router = NULL;
+	orig_node->tt_buff = NULL;
+	/* start with reset timestamps that lie more than RESET_PROTECTION_MS
+	 * in the past */
+	orig_node->bcast_seqno_reset = jiffies - 1
+					- msecs_to_jiffies(RESET_PROTECTION_MS);
+	orig_node->batman_seqno_reset = jiffies - 1
+					- msecs_to_jiffies(RESET_PROTECTION_MS);
+
+	atomic_set(&orig_node->bond_candidates, 0);
+
+	/* NUM_WORDS unsigned longs per interface */
+	size = bat_priv->num_ifaces * sizeof(unsigned long) * NUM_WORDS;
+
+	orig_node->bcast_own = kzalloc(size, GFP_ATOMIC);
+	if (!orig_node->bcast_own)
+		goto free_orig_node;
+
+	/* one byte per interface */
+	size = bat_priv->num_ifaces * sizeof(uint8_t);
+	orig_node->bcast_own_sum = kzalloc(size, GFP_ATOMIC);
+
+	INIT_LIST_HEAD(&orig_node->frag_list);
+	orig_node->last_frag_packet = 0;
+
+	if (!orig_node->bcast_own_sum)
+		goto free_bcast_own;
+
+	hash_added = hash_add(bat_priv->orig_hash, compare_orig,
+			      choose_orig, orig_node, &orig_node->hash_entry);
+	if (hash_added < 0)
+		goto free_bcast_own_sum;
+
+	return orig_node;
+free_bcast_own_sum:
+	kfree(orig_node->bcast_own_sum);
+free_bcast_own:
+	kfree(orig_node->bcast_own);
+free_orig_node:
+	kfree(orig_node);
+	return NULL;
+}
+
+/* removes all neighbors of orig_node that were not heard of for
+ * PURGE_TIMEOUT seconds or whose incoming interface is inactive, not in use
+ * or about to be removed. among the neighbors that are kept, the one with
+ * the highest tq_avg is stored in *best_neigh_node (NULL if none is left).
+ * returns true if at least one neighbor was purged. */
+static bool purge_orig_neighbors(struct bat_priv *bat_priv,
+				 struct orig_node *orig_node,
+				 struct neigh_node **best_neigh_node)
+{
+	struct hlist_node *node, *node_tmp;
+	struct neigh_node *neigh_node;
+	bool neigh_purged = false;
+
+	*best_neigh_node = NULL;
+
+	spin_lock_bh(&orig_node->neigh_list_lock);
+
+	/* for all neighbors towards this originator ... */
+	hlist_for_each_entry_safe(neigh_node, node, node_tmp,
+				  &orig_node->neigh_list, list) {
+
+		if ((time_after(jiffies,
+			neigh_node->last_valid + PURGE_TIMEOUT * HZ)) ||
+		    (neigh_node->if_incoming->if_status == IF_INACTIVE) ||
+		    (neigh_node->if_incoming->if_status == IF_NOT_IN_USE) ||
+		    (neigh_node->if_incoming->if_status == IF_TO_BE_REMOVED)) {
+
+			/* the interface state is checked again only to pick
+			 * the matching debug message */
+			if ((neigh_node->if_incoming->if_status ==
+								IF_INACTIVE) ||
+			    (neigh_node->if_incoming->if_status ==
+							IF_NOT_IN_USE) ||
+			    (neigh_node->if_incoming->if_status ==
+							IF_TO_BE_REMOVED))
+				bat_dbg(DBG_BATMAN, bat_priv,
+					"neighbor purge: originator %pM, "
+					"neighbor: %pM, iface: %s\n",
+					orig_node->orig, neigh_node->addr,
+					neigh_node->if_incoming->net_dev->name);
+			else
+				bat_dbg(DBG_BATMAN, bat_priv,
+					"neighbor timeout: originator %pM, "
+					"neighbor: %pM, last_valid: %lu\n",
+					orig_node->orig, neigh_node->addr,
+					(neigh_node->last_valid / HZ));
+
+			neigh_purged = true;
+
+			hlist_del_rcu(&neigh_node->list);
+			bonding_candidate_del(orig_node, neigh_node);
+			neigh_node_free_ref(neigh_node);
+		} else {
+			/* neighbor stays - remember it if it is the best one
+			 * seen so far */
+			if ((!*best_neigh_node) ||
+			    (neigh_node->tq_avg > (*best_neigh_node)->tq_avg))
+				*best_neigh_node = neigh_node;
+		}
+	}
+
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+	return neigh_purged;
+}
+
+/* returns true if orig_node was not heard of for 2 * PURGE_TIMEOUT seconds
+ * and has to be removed by the caller. otherwise its outdated neighbors are
+ * purged, the routes are updated with the best remaining neighbor if any
+ * neighbor was removed, and false is returned. */
+static bool purge_orig_node(struct bat_priv *bat_priv,
+			    struct orig_node *orig_node)
+{
+	struct neigh_node *best_neigh_node;
+
+	if (time_after(jiffies,
+		       orig_node->last_valid + 2 * PURGE_TIMEOUT * HZ)) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Originator timeout: originator %pM, last_valid %lu\n",
+			orig_node->orig, (orig_node->last_valid / HZ));
+		return true;
+	}
+
+	/* nothing purged - the routes stay as they are */
+	if (!purge_orig_neighbors(bat_priv, orig_node, &best_neigh_node))
+		return false;
+
+	update_routes(bat_priv, orig_node, best_neigh_node,
+		      orig_node->tt_buff, orig_node->tt_buff_len);
+
+	return false;
+}
+
+/* walks the whole originator hash: originators for which purge_orig_node()
+ * returns true are unlinked and released, for the others the fragment list
+ * is freed once FRAG_TIMEOUT ms passed since the last fragment. afterwards
+ * the gateway list and the soft interface neighbors are purged and a
+ * gateway election is run. does nothing if there is no hash. */
+static void _purge_orig(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct hlist_node *node, *node_tmp;
+	struct hlist_head *head;
+	spinlock_t *list_lock; /* spinlock to protect write access */
+	struct orig_node *orig_node;
+	int i;
+
+	if (!hash)
+		return;
+
+	/* for all origins... */
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+		list_lock = &hash->list_locks[i];
+
+		spin_lock_bh(list_lock);
+		hlist_for_each_entry_safe(orig_node, node, node_tmp,
+					  head, hash_entry) {
+			if (purge_orig_node(bat_priv, orig_node)) {
+				/* originator announced gateway flags - remove
+				 * its gateway entry as well */
+				if (orig_node->gw_flags)
+					gw_node_delete(bat_priv, orig_node);
+				hlist_del_rcu(node);
+				orig_node_free_ref(orig_node);
+				continue;
+			}
+
+			if (time_after(jiffies, orig_node->last_frag_packet +
+						msecs_to_jiffies(FRAG_TIMEOUT)))
+				frag_list_free(&orig_node->frag_list);
+		}
+		spin_unlock_bh(list_lock);
+	}
+
+	gw_node_purge(bat_priv);
+	gw_election(bat_priv);
+
+	softif_neigh_purge(bat_priv);
+}
+
+/* work handler of bat_priv->orig_work: purges the originator table of the
+ * mesh instance the work item belongs to and re-arms the work */
+static void purge_orig(struct work_struct *work)
+{
+	struct delayed_work *dwork;
+	struct bat_priv *bat_priv;
+
+	dwork = container_of(work, struct delayed_work, work);
+	bat_priv = container_of(dwork, struct bat_priv, orig_work);
+
+	_purge_orig(bat_priv);
+	start_purge_timer(bat_priv);
+}
+
+/* runs one originator purge pass immediately; unlike purge_orig() it does
+ * not re-arm the purge work */
+void purge_orig_ref(struct bat_priv *bat_priv)
+{
+	_purge_orig(bat_priv);
+}
+
+int orig_seq_print_text(struct seq_file *seq, void *offset)
+{
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct hlist_node *node, *node_tmp;
+	struct hlist_head *head;
+	struct hard_iface *primary_if;
+	struct orig_node *orig_node;
+	struct neigh_node *neigh_node, *neigh_node_tmp;
+	int batman_count = 0;
+	int last_seen_secs;
+	int last_seen_msecs;
+	int i, ret = 0;
+	/* writes a text dump of the originator table into seq */
+	primary_if = primary_if_get_selected(bat_priv);
+
+	if (!primary_if) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - "
+				 "please specify interfaces to enable it\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	if (primary_if->if_status != IF_ACTIVE) {
+		ret = seq_printf(seq, "BATMAN mesh %s "
+				 "disabled - primary interface not active\n",
+				 net_dev->name);
+		goto out;
+	}
+	/* table header: version/interface banner, then the column titles */
+	seq_printf(seq, "[B.A.T.M.A.N. adv %s%s, MainIF/MAC: %s/%pM (%s)]\n",
+		   SOURCE_VERSION, REVISION_VERSION_STR,
+		   primary_if->net_dev->name,
+		   primary_if->net_dev->dev_addr, net_dev->name);
+	seq_printf(seq, "  %-15s %s (%s/%i) %17s [%10s]: %20s ...\n",
+		   "Originator", "last-seen", "#", TQ_MAX_VALUE, "Nexthop",
+		   "outgoingIF", "Potential nexthops");
+
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+		/* each bucket is walked under RCU */
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+			neigh_node = orig_node_get_router(orig_node);
+			if (!neigh_node)
+				continue;
+			/* skip originators whose router has a TQ of 0 */
+			if (neigh_node->tq_avg == 0)
+				goto next;
+			/* time since last_valid, split into s and ms */
+			last_seen_secs = jiffies_to_msecs(jiffies -
+						orig_node->last_valid) / 1000;
+			last_seen_msecs = jiffies_to_msecs(jiffies -
+						orig_node->last_valid) % 1000;
+
+			seq_printf(seq, "%pM %4i.%03is   (%3i) %pM [%10s]:",
+				   orig_node->orig, last_seen_secs,
+				   last_seen_msecs, neigh_node->tq_avg,
+				   neigh_node->addr,
+				   neigh_node->if_incoming->net_dev->name);
+			/* followed by every neighbor of this originator */
+			hlist_for_each_entry_rcu(neigh_node_tmp, node_tmp,
+						 &orig_node->neigh_list, list) {
+				seq_printf(seq, " %pM (%3i)",
+					   neigh_node_tmp->addr,
+					   neigh_node_tmp->tq_avg);
+			}
+
+			seq_printf(seq, "\n");
+			batman_count++;
+			/* release the router fetched at the loop start */
+next:
+			neigh_node_free_ref(neigh_node);
+		}
+		rcu_read_unlock();
+	}
+
+	if (batman_count == 0)
+		seq_printf(seq, "No batman nodes in range ...\n");
+
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	return ret;
+}
+
+/* Grow bcast_own / bcast_own_sum of one originator to max_if_num slots.
+ * The first max_if_num - 1 slots are copied; the slot of the new interface
+ * starts zeroed (kzalloc). Returns 0 on success, -1 on allocation failure. */
+static int orig_node_add_if(struct orig_node *orig_node, int max_if_num)
+{
+	size_t chunk_size = sizeof(unsigned long) * NUM_WORDS;
+	void *data_ptr;
+
+	data_ptr = kzalloc(max_if_num * chunk_size, GFP_ATOMIC);
+	if (!data_ptr) {
+		pr_err("Can't resize orig: out of memory\n");
+		return -1;
+	}
+	memcpy(data_ptr, orig_node->bcast_own, (max_if_num - 1) * chunk_size);
+	kfree(orig_node->bcast_own);
+	orig_node->bcast_own = data_ptr;
+
+	data_ptr = kzalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC);
+	if (!data_ptr) {
+		pr_err("Can't resize orig: out of memory\n");
+		return -1;
+	}
+	memcpy(data_ptr, orig_node->bcast_own_sum,
+	       (max_if_num - 1) * sizeof(uint8_t));
+	kfree(orig_node->bcast_own_sum);
+	orig_node->bcast_own_sum = data_ptr;
+
+	return 0;
+}
+
+/* Resize the per-interface counters of every known originator so that
+ * they offer max_if_num slots.
+ * Returns 0 on success or -ENOMEM as soon as one resize fails. */
+int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num)
+{
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct orig_node *orig_node;
+	struct hlist_node *pos;
+	int bucket, err;
+
+	/* orig_node->bcast_own(_sum) depend on if_num, hence every
+	 * originator has to be resized */
+	for (bucket = 0; bucket < hash->size; bucket++) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, pos,
+					 &hash->table[bucket], hash_entry) {
+			spin_lock_bh(&orig_node->ogm_cnt_lock);
+			err = orig_node_add_if(orig_node, max_if_num);
+			spin_unlock_bh(&orig_node->ogm_cnt_lock);
+
+			if (err != -1)
+				continue;
+			/* leave the RCU section before bailing out */
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+		rcu_read_unlock();
+	}
+
+	return 0;
+}
+
+/* Shrink bcast_own / bcast_own_sum to max_if_num slots, leaving out the
+ * slot of interface del_if_num (max_if_num == 0: free both, set NULL).
+ * Returns 0 on success, -1 if an allocation failed. */
+static int orig_node_del_if(struct orig_node *orig_node,
+		     int max_if_num, int del_if_num)
+{
+	void *data_ptr = NULL;
+	int chunk_size;
+
+	/* last interface was removed */
+	if (max_if_num == 0)
+		goto free_bcast_own;
+
+	chunk_size = sizeof(unsigned long) * NUM_WORDS;
+	data_ptr = kmalloc(max_if_num * chunk_size, GFP_ATOMIC);
+	if (!data_ptr) {
+		pr_err("Can't resize orig: out of memory\n");
+		return -1;
+	}
+
+	/* copy first part */
+	memcpy(data_ptr, orig_node->bcast_own, del_if_num * chunk_size);
+	/* copy second part - chunk_size counts bytes, so offset a byte
+	 * pointer: bcast_own itself is an array of unsigned long */
+	memcpy((char *)data_ptr + del_if_num * chunk_size,
+	       (char *)orig_node->bcast_own + (del_if_num + 1) * chunk_size,
+	       (max_if_num - del_if_num) * chunk_size);
+
+free_bcast_own:
+	kfree(orig_node->bcast_own);
+	orig_node->bcast_own = data_ptr;
+
+	if (max_if_num == 0)
+		goto free_own_sum;
+
+	data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC);
+	if (!data_ptr) {
+		pr_err("Can't resize orig: out of memory\n");
+		return -1;
+	}
+	memcpy(data_ptr, orig_node->bcast_own_sum,
+	       del_if_num * sizeof(uint8_t));
+	memcpy(data_ptr + del_if_num * sizeof(uint8_t),
+	       orig_node->bcast_own_sum + ((del_if_num + 1) * sizeof(uint8_t)),
+	       (max_if_num - del_if_num) * sizeof(uint8_t));
+
+free_own_sum:
+	kfree(orig_node->bcast_own_sum);
+	orig_node->bcast_own_sum = data_ptr;
+	return 0;
+}
+
+/* Drop the slot of hard_iface from the per-interface counters of every
+ * known originator, then close the gap in the if_num numbering of the
+ * other interfaces attached to the same soft interface.
+ * Returns 0 on success or -ENOMEM as soon as one resize fails. */
+int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num)
+{
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct hard_iface *other;
+	struct orig_node *orig_node;
+	struct hlist_node *pos;
+	int bucket, err;
+
+	/* orig_node->bcast_own(_sum) depend on if_num, hence every
+	 * originator has to be resized */
+	for (bucket = 0; bucket < hash->size; bucket++) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, pos,
+					 &hash->table[bucket], hash_entry) {
+			spin_lock_bh(&orig_node->ogm_cnt_lock);
+			err = orig_node_del_if(orig_node, max_if_num,
+					       hard_iface->if_num);
+			spin_unlock_bh(&orig_node->ogm_cnt_lock);
+
+			if (err != -1)
+				continue;
+
+			/* leave the RCU section before bailing out */
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+		rcu_read_unlock();
+	}
+
+	/* renumber: every in-use sibling interface that was numbered above
+	 * the removed one moves down by one */
+	rcu_read_lock();
+	list_for_each_entry_rcu(other, &hardif_list, list) {
+		if (other == hard_iface ||
+		    other->if_status == IF_NOT_IN_USE ||
+		    other->soft_iface != hard_iface->soft_iface)
+			continue;
+
+		if (other->if_num > hard_iface->if_num)
+			other->if_num--;
+	}
+	rcu_read_unlock();
+
+	hard_iface->if_num = -1;
+
+	return 0;
+}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
new file mode 100644
index 00000000..e1d641f2
--- /dev/null
+++ b/net/batman-adv/originator.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_
+#define _NET_BATMAN_ADV_ORIGINATOR_H_
+
+#include "hash.h"
+
+int originator_init(struct bat_priv *bat_priv);
+void originator_free(struct bat_priv *bat_priv);
+void purge_orig_ref(struct bat_priv *bat_priv);
+void orig_node_free_ref(struct orig_node *orig_node);
+struct orig_node *get_orig_node(struct bat_priv *bat_priv, uint8_t *addr);
+struct neigh_node *create_neighbor(struct orig_node *orig_node,
+				   struct orig_node *orig_neigh_node,
+				   uint8_t *neigh,
+				   struct hard_iface *if_incoming);
+void neigh_node_free_ref(struct neigh_node *neigh_node);
+struct neigh_node *orig_node_get_router(struct orig_node *orig_node);
+int orig_seq_print_text(struct seq_file *seq, void *offset);
+int orig_hash_add_if(struct hard_iface *hard_iface, int max_if_num);
+int orig_hash_del_if(struct hard_iface *hard_iface, int max_if_num);
+
+
+/* returns 1 if node's orig_node begins with the ETH_ALEN bytes at data2 */
+static inline int compare_orig(struct hlist_node *node, void *data2)
+{
+	void *entry = container_of(node, struct orig_node, hash_entry);
+
+	return !memcmp(entry, data2, ETH_ALEN);
+}
+
+/* Hash the 6 bytes at data and reduce the result modulo size to pick a
+ * hash table entry (algorithm: http://en.wikipedia.org/wiki/Hash_table). */
+static inline int choose_orig(void *data, int32_t size)
+{
+	const unsigned char *byte = data;
+	const unsigned char *end = byte + 6;
+	uint32_t h = 0;
+
+	while (byte < end) {
+		h += *byte++;
+		h += h << 10;
+		h ^= h >> 6;
+	}
+
+	/* final mixing rounds */
+	h += h << 3;
+	h ^= h >> 11;
+	h += h << 15;
+	return h % size;
+}
+
+/* Search the originator hash for an entry that compare_eth() matches to
+ * data. A hit is returned with its refcount incremented; NULL otherwise. */
+static inline struct orig_node *orig_hash_find(struct bat_priv *bat_priv,
+					       void *data)
+{
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct orig_node *cursor, *found = NULL;
+	struct hlist_node *pos;
+	int bucket;
+
+	if (!hash)
+		return NULL;
+
+	bucket = choose_orig(data, hash->size);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(cursor, pos, &hash->table[bucket],
+				 hash_entry) {
+		/* a match whose refcount is already zero cannot be taken;
+		 * the search simply continues */
+		if (compare_eth(cursor, data) &&
+		    atomic_inc_not_zero(&cursor->refcount)) {
+			found = cursor;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+#endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
new file mode 100644
index 00000000..eda99650
--- /dev/null
+++ b/net/batman-adv/packet.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_PACKET_H_
+#define _NET_BATMAN_ADV_PACKET_H_
+
+#define ETH_P_BATMAN  0x4305	/* unofficial/not registered Ethertype */
+/* packet types - presumably the values of the packet_type fields (confirm) */
+#define BAT_PACKET       0x01
+#define BAT_ICMP         0x02
+#define BAT_UNICAST      0x03
+#define BAT_BCAST        0x04
+#define BAT_VIS          0x05
+#define BAT_UNICAST_FRAG 0x06
+
+/* this file is included by batctl which needs these defines */
+#define COMPAT_VERSION 12
+#define DIRECTLINK 0x40		/* bit in batman_packet.flags */
+#define VIS_SERVER 0x20		/* bit in batman_packet.flags */
+#define PRIMARIES_FIRST_HOP 0x10	/* bit in batman_packet.flags */
+
+/* ICMP message types (see the msg_type field of the icmp packets) */
+#define ECHO_REPLY 0
+#define DESTINATION_UNREACHABLE 3
+#define ECHO_REQUEST 8
+#define TTL_EXCEEDED 11
+#define PARAMETER_PROBLEM 12
+
+/* vis defines - presumably values of vis_packet.vis_type (confirm) */
+#define VIS_TYPE_SERVER_SYNC		0
+#define VIS_TYPE_CLIENT_UPDATE		1
+
+/* fragmentation defines - presumably unicast_frag_packet.flags (confirm) */
+#define UNI_FRAG_HEAD 0x01
+#define UNI_FRAG_LARGETAIL 0x02
+
+struct batman_packet {
+	uint8_t  packet_type;
+	uint8_t  version;  /* batman version field */
+	uint8_t  flags;    /* 0x40: DIRECTLINK flag, 0x20 VIS_SERVER flag... */
+	uint8_t  tq;       /* transmission quality */
+	uint32_t seqno;    /* sequence number */
+	uint8_t  orig[6];  /* address of the originator */
+	uint8_t  prev_sender[6];
+	uint8_t  ttl;
+	uint8_t  num_tt;   /* number of ETH_ALEN sized tt entries */
+	uint8_t  gw_flags;  /* flags related to gateway class */
+	uint8_t  align;    /* presumably padding only - confirm */
+} __packed;
+
+#define BAT_PACKET_LEN sizeof(struct batman_packet)	/* in bytes */
+
+struct icmp_packet {
+	uint8_t  packet_type;
+	uint8_t  version;  /* batman version field */
+	uint8_t  msg_type; /* see ICMP message types above */
+	uint8_t  ttl;
+	uint8_t  dst[6];   /* presumably the destination address - confirm */
+	uint8_t  orig[6];  /* presumably the sender's address - confirm */
+	uint16_t seqno;    /* sequence number */
+	uint8_t  uid;
+} __packed;
+
+#define BAT_RR_LEN 16	/* number of entries in icmp_packet_rr.rr */
+
+/* icmp_packet_rr must start with all fields from icmp_packet
+ * as this is assumed by code that handles ICMP packets */
+struct icmp_packet_rr {
+	uint8_t  packet_type;
+	uint8_t  version;  /* batman version field */
+	uint8_t  msg_type; /* see ICMP message types above */
+	uint8_t  ttl;
+	uint8_t  dst[6];
+	uint8_t  orig[6];
+	uint16_t seqno;
+	uint8_t  uid;
+	uint8_t  rr_cur;   /* presumably the rr[] entry in use - confirm */
+	uint8_t  rr[BAT_RR_LEN][ETH_ALEN]; /* BAT_RR_LEN addresses */
+} __packed;
+
+struct unicast_packet {
+	uint8_t  packet_type;
+	uint8_t  version;  /* batman version field */
+	uint8_t  dest[6];  /* presumably the destination address - confirm */
+	uint8_t  ttl;
+} __packed;
+
+struct unicast_frag_packet {
+	uint8_t  packet_type;
+	uint8_t  version;  /* batman version field */
+	uint8_t  dest[6];  /* presumably the destination address - confirm */
+	uint8_t  ttl;
+	uint8_t  flags;    /* presumably UNI_FRAG_* bits - confirm */
+	uint8_t  orig[6];  /* presumably the sender's address - confirm */
+	uint16_t seqno;    /* sequence number */
+} __packed;
+
+struct bcast_packet {
+	uint8_t  packet_type;
+	uint8_t  version;  /* batman version field */
+	uint8_t  orig[6];  /* presumably the sender's address - confirm */
+	uint8_t  ttl;
+	uint32_t seqno;    /* sequence number */
+} __packed;
+
+struct vis_packet {
+	uint8_t  packet_type;	 /* presumably BAT_VIS - confirm */
+	uint8_t  version;        /* batman version field */
+	uint8_t  vis_type;	 /* which type of vis-participant sent this? */
+	uint8_t  entries;	 /* number of entries behind this struct */
+	uint32_t seqno;		 /* sequence number */
+	uint8_t  ttl;		 /* TTL */
+	uint8_t  vis_orig[6];	 /* originator that announces its neighbors */
+	uint8_t  target_orig[6]; /* who should receive this packet */
+	uint8_t  sender_orig[6]; /* who sent or rebroadcasted this packet */
+} __packed;
+
+#endif /* _NET_BATMAN_ADV_PACKET_H_ */
diff --git a/net/batman-adv/ring_buffer.c b/net/batman-adv/ring_buffer.c
new file mode 100644
index 00000000..5bb6a619
--- /dev/null
+++ b/net/batman-adv/ring_buffer.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "ring_buffer.h"
+
+void ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, uint8_t value)
+{
+	lq_recv[*lq_index] = value;	/* overwrite the slot at the index */
+	*lq_index = (*lq_index + 1) % TQ_GLOBAL_WINDOW_SIZE;
+}
+
+/* Average over the non-zero entries among the first
+ * TQ_GLOBAL_WINDOW_SIZE slots of lq_recv; 0 if every slot is zero. */
+uint8_t ring_buffer_avg(uint8_t lq_recv[])
+{
+	uint16_t used = 0, total = 0;
+	uint16_t slot;
+
+	for (slot = 0; slot < TQ_GLOBAL_WINDOW_SIZE; slot++) {
+		/* zero slots do not take part in the average */
+		if (lq_recv[slot] == 0)
+			continue;
+
+		used++;
+		total += lq_recv[slot];
+	}
+
+	/* nothing but zeros */
+	if (used == 0)
+		return 0;
+
+	return (uint8_t)(total / used);
+}
diff --git a/net/batman-adv/ring_buffer.h b/net/batman-adv/ring_buffer.h
new file mode 100644
index 00000000..0395b274
--- /dev/null
+++ b/net/batman-adv/ring_buffer.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_RING_BUFFER_H_
+#define _NET_BATMAN_ADV_RING_BUFFER_H_
+
+void ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, uint8_t value);
+uint8_t ring_buffer_avg(uint8_t lq_recv[]);
+
+#endif /* _NET_BATMAN_ADV_RING_BUFFER_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
new file mode 100644
index 00000000..bb1c3ec7
--- /dev/null
+++ b/net/batman-adv/routing.c
@@ -0,0 +1,1535 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "routing.h"
+#include "send.h"
+#include "hash.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "icmp_socket.h"
+#include "translation-table.h"
+#include "originator.h"
+#include "ring_buffer.h"
+#include "vis.h"
+#include "aggregation.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "unicast.h"
+
+/* For every known originator: run bit_get_packet() (seqno diff 1, no mark)
+ * on hard_iface's bcast_own words and refresh its bcast_own_sum entry. */
+void slide_own_bcast_window(struct hard_iface *hard_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct orig_node *orig_node;
+	struct hlist_node *pos;
+	unsigned long *window;
+	size_t first_word;
+	int bucket;
+
+	for (bucket = 0; bucket < hash->size; bucket++) {
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, pos,
+					 &hash->table[bucket], hash_entry) {
+			spin_lock_bh(&orig_node->ogm_cnt_lock);
+			/* each interface owns NUM_WORDS consecutive words */
+			first_word = hard_iface->if_num * NUM_WORDS;
+			window = &orig_node->bcast_own[first_word];
+			bit_get_packet(bat_priv, window, 1, 0);
+			orig_node->bcast_own_sum[hard_iface->if_num] =
+				bit_packet_count(window);
+			spin_unlock_bh(&orig_node->ogm_cnt_lock);
+		}
+		rcu_read_unlock();
+	}
+}
+
+/* replace the global tt entries of orig_node if the tt buffer changed */
+static void update_TT(struct bat_priv *bat_priv, struct orig_node *orig_node,
+		       unsigned char *tt_buff, int tt_buff_len)
+{
+	/* same length: unchanged if empty or byte-wise identical */
+	if (tt_buff_len == orig_node->tt_buff_len &&
+	    (tt_buff_len <= 0 ||
+	     memcmp(orig_node->tt_buff, tt_buff, tt_buff_len) == 0))
+		return;
+
+	if (orig_node->tt_buff_len > 0)
+		tt_global_del_orig(bat_priv, orig_node,
+				    "originator changed tt");
+
+	if (tt_buff_len > 0 && tt_buff)
+		tt_global_add_orig(bat_priv, orig_node, tt_buff, tt_buff_len);
+}
+
+static void update_route(struct bat_priv *bat_priv,
+			 struct orig_node *orig_node,
+			 struct neigh_node *neigh_node,
+			 unsigned char *tt_buff, int tt_buff_len)
+{
+	struct neigh_node *curr_router;
+	/* installs neigh_node (may be NULL) as the router of orig_node */
+	curr_router = orig_node_get_router(orig_node);
+
+	/* route deleted */
+	if ((curr_router) && (!neigh_node)) {
+
+		bat_dbg(DBG_ROUTES, bat_priv, "Deleting route towards: %pM\n",
+			orig_node->orig);
+		tt_global_del_orig(bat_priv, orig_node,
+				    "originator timed out");
+
+	/* route added */
+	} else if ((!curr_router) && (neigh_node)) {
+
+		bat_dbg(DBG_ROUTES, bat_priv,
+			"Adding route towards: %pM (via %pM)\n",
+			orig_node->orig, neigh_node->addr);
+		tt_global_add_orig(bat_priv, orig_node,
+				    tt_buff, tt_buff_len);
+
+	/* route changed; NOTE(review): expects both pointers non-NULL */
+	} else {
+		bat_dbg(DBG_ROUTES, bat_priv,
+			"Changing route towards: %pM "
+			"(now via %pM - was via %pM)\n",
+			orig_node->orig, neigh_node->addr,
+			curr_router->addr);
+	}
+	/* presumably drops the reference obtained by the lookup above */
+	if (curr_router)
+		neigh_node_free_ref(curr_router);
+
+	/* increase refcount of new best neighbor */
+	if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount))
+		neigh_node = NULL;
+
+	spin_lock_bh(&orig_node->neigh_list_lock);
+	rcu_assign_pointer(orig_node->router, neigh_node);
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+
+	/* decrease refcount of previous best neighbor */
+	if (curr_router)
+		neigh_node_free_ref(curr_router);
+}
+
+
+/* Compare neigh_node with the router currently set for orig_node: a
+ * different neighbor goes through update_route(), an unchanged one only
+ * has its tt buffer checked by update_TT(). No-op for a NULL orig_node. */
+void update_routes(struct bat_priv *bat_priv, struct orig_node *orig_node,
+		   struct neigh_node *neigh_node, unsigned char *tt_buff,
+		   int tt_buff_len)
+{
+	struct neigh_node *curr;
+
+	if (!orig_node)
+		return;
+
+	curr = orig_node_get_router(orig_node);
+	if (curr == neigh_node)
+		update_TT(bat_priv, orig_node, tt_buff, tt_buff_len);
+	else
+		update_route(bat_priv, orig_node, neigh_node,
+			     tt_buff, tt_buff_len);
+
+	if (curr)
+		neigh_node_free_ref(curr);
+}
+
+static int is_bidirectional_neigh(struct orig_node *orig_node,
+				struct orig_node *orig_neigh_node,
+				struct batman_packet *batman_packet,
+				struct hard_iface *if_incoming)
+{
+	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct neigh_node *neigh_node = NULL, *tmp_neigh_node;
+	struct hlist_node *node;
+	unsigned char total_count;
+	uint8_t orig_eq_count, neigh_rq_count, tq_own;
+	int tq_asym_penalty, ret = 0;
+	/* rescales batman_packet->tq; returns 1 if the link is bidirectional */
+	/* find corresponding one hop neighbor */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tmp_neigh_node, node,
+				 &orig_neigh_node->neigh_list, list) {
+
+		if (!compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig))
+			continue;
+
+		if (tmp_neigh_node->if_incoming != if_incoming)
+			continue;
+
+		if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+			continue;
+
+		neigh_node = tmp_neigh_node;
+		break;
+	}
+	rcu_read_unlock();
+	/* no usable one hop neighbor entry found: create one */
+	if (!neigh_node)
+		neigh_node = create_neighbor(orig_neigh_node,
+					     orig_neigh_node,
+					     orig_neigh_node->orig,
+					     if_incoming);
+
+	if (!neigh_node)
+		goto out;
+
+	/* if orig_node is direct neighbour update neigh_node last_valid */
+	if (orig_node == orig_neigh_node)
+		neigh_node->last_valid = jiffies;
+
+	orig_node->last_valid = jiffies;
+	/* NOTE(review): locks orig_node but reads orig_neigh_node - confirm */
+	/* find packet count of corresponding one hop neighbor */
+	spin_lock_bh(&orig_node->ogm_cnt_lock);
+	orig_eq_count = orig_neigh_node->bcast_own_sum[if_incoming->if_num];
+	neigh_rq_count = neigh_node->real_packet_count;
+	spin_unlock_bh(&orig_node->ogm_cnt_lock);
+
+	/* pay attention to not get a value bigger than 100 % */
+	total_count = (orig_eq_count > neigh_rq_count ?
+		       neigh_rq_count : orig_eq_count);
+
+	/* if we have too few packets (too little data) we set tq_own to zero */
+	/* if we receive too few packets it is not considered bidirectional */
+	if ((total_count < TQ_LOCAL_BIDRECT_SEND_MINIMUM) ||
+	    (neigh_rq_count < TQ_LOCAL_BIDRECT_RECV_MINIMUM))
+		tq_own = 0;
+	else
+		/* neigh_node->real_packet_count is never zero as we
+		 * only purge old information when getting new
+		 * information */
+		tq_own = (TQ_MAX_VALUE * total_count) /	neigh_rq_count;
+
+	/*
+	 * 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does
+	 * affect the nearly-symmetric links only a little, but
+	 * punishes asymmetric links more.  This will give a value
+	 * between 0 and TQ_MAX_VALUE
+	 */
+	tq_asym_penalty = TQ_MAX_VALUE - (TQ_MAX_VALUE *
+				(TQ_LOCAL_WINDOW_SIZE - neigh_rq_count) *
+				(TQ_LOCAL_WINDOW_SIZE - neigh_rq_count) *
+				(TQ_LOCAL_WINDOW_SIZE - neigh_rq_count)) /
+					(TQ_LOCAL_WINDOW_SIZE *
+					 TQ_LOCAL_WINDOW_SIZE *
+					 TQ_LOCAL_WINDOW_SIZE);
+	/* received tq, scaled by the local tq and the asymmetry penalty */
+	batman_packet->tq = ((batman_packet->tq * tq_own * tq_asym_penalty) /
+						(TQ_MAX_VALUE * TQ_MAX_VALUE));
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"bidirectional: "
+		"orig = %-15pM neigh = %-15pM => own_bcast = %2i, "
+		"real recv = %2i, local tq: %3i, asym_penalty: %3i, "
+		"total tq: %3i\n",
+		orig_node->orig, orig_neigh_node->orig, total_count,
+		neigh_rq_count, tq_own,	tq_asym_penalty, batman_packet->tq);
+
+	/* if link has the minimum required transmission quality
+	 * consider it bidirectional */
+	if (batman_packet->tq >= TQ_TOTAL_BIDRECT_LIMIT)
+		ret = 1;
+
+out:
+	if (neigh_node)
+		neigh_node_free_ref(neigh_node);
+	return ret;
+}
+
+/* remove neigh_node from the bond list (if on it) and drop the list's ref;
+ * caller must hold the neigh_list_lock */
+void bonding_candidate_del(struct orig_node *orig_node,
+			   struct neigh_node *neigh_node)
+{
+	struct list_head *entry = &neigh_node->bonding_list;
+
+	/* this neighbor is not part of our candidate list */
+	if (list_empty(entry))
+		return;
+
+	list_del_rcu(entry);
+	INIT_LIST_HEAD(entry);
+	neigh_node_free_ref(neigh_node);
+	atomic_dec(&orig_node->bond_candidates);
+}
+
+static void bonding_candidate_add(struct orig_node *orig_node,
+				  struct neigh_node *neigh_node)
+{
+	struct hlist_node *node;
+	struct neigh_node *tmp_neigh_node, *router = NULL;
+	uint8_t interference_candidate = 0;
+	/* (re)evaluates neigh_node as bonding candidate of orig_node */
+	spin_lock_bh(&orig_node->neigh_list_lock);
+
+	/* only consider if it has the same primary address ...  */
+	if (!compare_eth(orig_node->orig,
+			 neigh_node->orig_node->primary_addr))
+		goto candidate_del;
+
+	router = orig_node_get_router(orig_node);
+	if (!router)
+		goto candidate_del;
+
+	/* ... and is good enough to be considered */
+	if (neigh_node->tq_avg < router->tq_avg - BONDING_TQ_THRESHOLD)
+		goto candidate_del;
+
+	/**
+	 * check if we have another candidate with the same mac address or
+	 * interface. If we do, we won't select this candidate because of
+	 * possible interference.
+	 */
+	hlist_for_each_entry_rcu(tmp_neigh_node, node,
+				 &orig_node->neigh_list, list) {
+
+		if (tmp_neigh_node == neigh_node)
+			continue;
+
+		/* we only care if the other candidate is even
+		* considered as candidate. */
+		if (list_empty(&tmp_neigh_node->bonding_list))
+			continue;
+
+		if ((neigh_node->if_incoming == tmp_neigh_node->if_incoming) ||
+		    (compare_eth(neigh_node->addr, tmp_neigh_node->addr))) {
+			interference_candidate = 1;
+			break;
+		}
+	}
+
+	/* don't care further if it is an interference candidate */
+	if (interference_candidate)
+		goto candidate_del;
+
+	/* this neighbor already is part of our candidate list */
+	if (!list_empty(&neigh_node->bonding_list))
+		goto out;
+	/* the bond list holds its own reference on the neighbor */
+	if (!atomic_inc_not_zero(&neigh_node->refcount))
+		goto out;
+
+	list_add_rcu(&neigh_node->bonding_list, &orig_node->bond_list);
+	atomic_inc(&orig_node->bond_candidates);
+	goto out;
+
+candidate_del:
+	bonding_candidate_del(orig_node, neigh_node);
+
+out:
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+	/* release the router obtained above, if any */
+	if (router)
+		neigh_node_free_ref(router);
+}
+
+/* copy primary address for bonding: only packets carrying the
+ * PRIMARIES_FIRST_HOP flag update orig_neigh_node->primary_addr */
+static void bonding_save_primary(struct orig_node *orig_node,
+				 struct orig_node *orig_neigh_node,
+				 struct batman_packet *batman_packet)
+{
+	if (batman_packet->flags & PRIMARIES_FIRST_HOP)
+		memcpy(orig_neigh_node->primary_addr, orig_node->orig,
+		       ETH_ALEN);
+}
+
+static void update_orig(struct bat_priv *bat_priv,
+			struct orig_node *orig_node,
+			struct ethhdr *ethhdr,
+			struct batman_packet *batman_packet,
+			struct hard_iface *if_incoming,
+			unsigned char *tt_buff, int tt_buff_len,
+			char is_duplicate)
+{
+	struct neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL;
+	struct neigh_node *router = NULL;
+	struct orig_node *orig_node_tmp;
+	struct hlist_node *node;
+	int tmp_tt_buff_len;
+	uint8_t bcast_own_sum_orig, bcast_own_sum_neigh;
+	/* records a received batman packet and re-evaluates the router */
+	bat_dbg(DBG_BATMAN, bat_priv, "update_originator(): "
+		"Searching and updating originator entry of received packet\n");
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tmp_neigh_node, node,
+				 &orig_node->neigh_list, list) {
+		if (compare_eth(tmp_neigh_node->addr, ethhdr->h_source) &&
+		    (tmp_neigh_node->if_incoming == if_incoming) &&
+		     atomic_inc_not_zero(&tmp_neigh_node->refcount)) {
+			if (neigh_node)
+				neigh_node_free_ref(neigh_node);
+			neigh_node = tmp_neigh_node;
+			continue;
+		}
+
+		if (is_duplicate)
+			continue;
+		/* all other neighbors get a zero TQ sample for this packet */
+		spin_lock_bh(&tmp_neigh_node->tq_lock);
+		ring_buffer_set(tmp_neigh_node->tq_recv,
+				&tmp_neigh_node->tq_index, 0);
+		tmp_neigh_node->tq_avg =
+			ring_buffer_avg(tmp_neigh_node->tq_recv);
+		spin_unlock_bh(&tmp_neigh_node->tq_lock);
+	}
+	/* no entry for this last hop yet: create one */
+	if (!neigh_node) {
+		struct orig_node *orig_tmp;
+
+		orig_tmp = get_orig_node(bat_priv, ethhdr->h_source);
+		if (!orig_tmp)
+			goto unlock;
+
+		neigh_node = create_neighbor(orig_node, orig_tmp,
+					     ethhdr->h_source, if_incoming);
+
+		orig_node_free_ref(orig_tmp);
+		if (!neigh_node)
+			goto unlock;
+	} else
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Updating existing last-hop neighbor of originator\n");
+
+	rcu_read_unlock();
+
+	orig_node->flags = batman_packet->flags;
+	neigh_node->last_valid = jiffies;
+	/* the sending neighbor gets the packet's tq as its newest sample */
+	spin_lock_bh(&neigh_node->tq_lock);
+	ring_buffer_set(neigh_node->tq_recv,
+			&neigh_node->tq_index,
+			batman_packet->tq);
+	neigh_node->tq_avg = ring_buffer_avg(neigh_node->tq_recv);
+	spin_unlock_bh(&neigh_node->tq_lock);
+
+	if (!is_duplicate) {
+		orig_node->last_ttl = batman_packet->ttl;
+		neigh_node->last_ttl = batman_packet->ttl;
+	}
+
+	bonding_candidate_add(orig_node, neigh_node);
+	/* never use more tt bytes than num_tt entries account for */
+	tmp_tt_buff_len = (tt_buff_len > batman_packet->num_tt * ETH_ALEN ?
+			    batman_packet->num_tt * ETH_ALEN : tt_buff_len);
+
+	/* if this neighbor already is our next hop there is nothing
+	 * to change */
+	router = orig_node_get_router(orig_node);
+	if (router == neigh_node)
+		goto update_tt;
+
+	/* if this neighbor does not offer a better TQ we won't consider it */
+	if (router && (router->tq_avg > neigh_node->tq_avg))
+		goto update_tt;
+
+	/* if the TQ is the same and the link not more symmetric we
+	 * won't consider it either */
+	if (router && (neigh_node->tq_avg == router->tq_avg)) {
+		orig_node_tmp = router->orig_node;
+		spin_lock_bh(&orig_node_tmp->ogm_cnt_lock);
+		bcast_own_sum_orig =
+			orig_node_tmp->bcast_own_sum[if_incoming->if_num];
+		spin_unlock_bh(&orig_node_tmp->ogm_cnt_lock);
+
+		orig_node_tmp = neigh_node->orig_node;
+		spin_lock_bh(&orig_node_tmp->ogm_cnt_lock);
+		bcast_own_sum_neigh =
+			orig_node_tmp->bcast_own_sum[if_incoming->if_num];
+		spin_unlock_bh(&orig_node_tmp->ogm_cnt_lock);
+
+		if (bcast_own_sum_orig >= bcast_own_sum_neigh)
+			goto update_tt;
+	}
+	/* the sending neighbor becomes the new router */
+	update_routes(bat_priv, orig_node, neigh_node,
+		      tt_buff, tmp_tt_buff_len);
+	goto update_gw;
+
+update_tt:
+	update_routes(bat_priv, orig_node, router,
+		      tt_buff, tmp_tt_buff_len);
+
+update_gw:
+	if (orig_node->gw_flags != batman_packet->gw_flags)
+		gw_node_update(bat_priv, orig_node, batman_packet->gw_flags);
+
+	orig_node->gw_flags = batman_packet->gw_flags;
+
+	/* restart gateway selection if fast or late switching was enabled */
+	if ((orig_node->gw_flags) &&
+	    (atomic_read(&bat_priv->gw_mode) == GW_MODE_CLIENT) &&
+	    (atomic_read(&bat_priv->gw_sel_class) > 2))
+		gw_check_election(bat_priv, orig_node);
+
+	goto out;
+
+unlock:
+	rcu_read_unlock();
+out:
+	if (neigh_node)
+		neigh_node_free_ref(neigh_node);
+	if (router)
+		neigh_node_free_ref(router);
+}
+
+/* rejects out-of-window seqno jumps while the reset protection is active.
+ * returns:
+ *  0 if the packet is to be accepted
+ *  1 if the packet is to be ignored.
+ */
+static int window_protected(struct bat_priv *bat_priv,
+			    int32_t seq_num_diff,
+			    unsigned long *last_reset)
+{
+	/* inside the expected range: nothing to protect against */
+	if ((seq_num_diff > -TQ_LOCAL_WINDOW_SIZE) &&
+	    (seq_num_diff < EXPECTED_SEQNO_RANGE))
+		return 0;
+
+	/* RESET_PROTECTION_MS since the last reset not over yet: ignore */
+	if (!time_after(jiffies, *last_reset +
+			msecs_to_jiffies(RESET_PROTECTION_MS)))
+		return 1;
+
+	*last_reset = jiffies;
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"old packet received, start protection\n");
+	return 0;
+}
+
+/* processes a batman packet for all interfaces, adjusts the sequence number and
+ * finds out whether it is a duplicate.
+ * returns:
+ *   1 the packet is a duplicate
+ *   0 the packet has not yet been received (or no orig_node was obtained)
+ *  -1 the packet is old and has been received while the seqno window
+ *     was protected. Caller should drop it.
+ */
+static char count_real_packets(struct ethhdr *ethhdr,
+			       struct batman_packet *batman_packet,
+			       struct hard_iface *if_incoming)
+{
+	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct orig_node *orig_node;
+	struct neigh_node *tmp_neigh_node;
+	struct hlist_node *node;
+	char is_duplicate = 0;
+	int32_t seq_diff;
+	int need_update = 0;
+	int set_mark, ret = -1;
+	/* NOTE(review): -1 returned as plain char - unsigned-char arches? */
+	orig_node = get_orig_node(bat_priv, batman_packet->orig);
+	if (!orig_node)
+		return 0;
+
+	spin_lock_bh(&orig_node->ogm_cnt_lock);
+	seq_diff = batman_packet->seqno - orig_node->last_real_seqno;
+
+	/* signalize caller that the packet is to be dropped. */
+	if (window_protected(bat_priv, seq_diff,
+			     &orig_node->batman_seqno_reset))
+		goto out;
+	/* update every neighbor's window; only the sender gets marked */
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tmp_neigh_node, node,
+				 &orig_node->neigh_list, list) {
+
+		is_duplicate |= get_bit_status(tmp_neigh_node->real_bits,
+					       orig_node->last_real_seqno,
+					       batman_packet->seqno);
+
+		if (compare_eth(tmp_neigh_node->addr, ethhdr->h_source) &&
+		    (tmp_neigh_node->if_incoming == if_incoming))
+			set_mark = 1;
+		else
+			set_mark = 0;
+
+		/* if the window moved, set the update flag. */
+		need_update |= bit_get_packet(bat_priv,
+					      tmp_neigh_node->real_bits,
+					      seq_diff, set_mark);
+
+		tmp_neigh_node->real_packet_count =
+			bit_packet_count(tmp_neigh_node->real_bits);
+	}
+	rcu_read_unlock();
+
+	if (need_update) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"updating last_seqno: old %d, new %d\n",
+			orig_node->last_real_seqno, batman_packet->seqno);
+		orig_node->last_real_seqno = batman_packet->seqno;
+	}
+
+	ret = is_duplicate;
+
+out:
+	spin_unlock_bh(&orig_node->ogm_cnt_lock);
+	orig_node_free_ref(orig_node);
+	return ret;
+}
+/* handles one received originator message: drops our own, echoed and
+ * otherwise unusable packets, updates the route towards the originator and
+ * schedules the rebroadcast of the packet where appropriate */
+void receive_bat_packet(struct ethhdr *ethhdr,
+			struct batman_packet *batman_packet,
+			unsigned char *tt_buff, int tt_buff_len,
+			struct hard_iface *if_incoming)
+{
+	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct hard_iface *hard_iface;
+	struct orig_node *orig_neigh_node, *orig_node;
+	struct neigh_node *router = NULL, *router_router = NULL;
+	struct neigh_node *orig_neigh_router = NULL;
+	char has_directlink_flag;
+	char is_my_addr = 0, is_my_orig = 0, is_my_oldorig = 0;
+	char is_broadcast = 0, is_bidirectional, is_single_hop_neigh;
+	signed char is_duplicate; /* has to be able to hold -1 */
+	uint32_t if_incoming_seqno;
+
+	/* Silently drop when the batman packet is actually not a
+	 * correct packet.
+	 * This might happen if a packet is padded (e.g. Ethernet has a
+	 * minimum frame length of 64 byte) and the aggregation interprets
+	 * it as an additional length.
+	 * TODO: A more sane solution would be to have a bit in the
+	 * batman_packet to detect whether the packet is the last
+	 * packet in an aggregation.  Here we expect that the padding
+	 * is always zero (or not 0x01)
+	 */
+	if (batman_packet->packet_type != BAT_PACKET)
+		return;
+
+	/* could be changed by schedule_own_packet() */
+	if_incoming_seqno = atomic_read(&if_incoming->seqno);
+
+	has_directlink_flag = (batman_packet->flags & DIRECTLINK ? 1 : 0);
+
+	is_single_hop_neigh = (compare_eth(ethhdr->h_source,
+					   batman_packet->orig) ? 1 : 0);
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Received BATMAN packet via NB: %pM, IF: %s [%pM] "
+		"(from OG: %pM, via prev OG: %pM, seqno %d, tq %d, "
+		"TTL %d, V %d, IDF %d)\n",
+		ethhdr->h_source, if_incoming->net_dev->name,
+		if_incoming->net_dev->dev_addr, batman_packet->orig,
+		batman_packet->prev_sender, batman_packet->seqno,
+		batman_packet->tq, batman_packet->ttl, batman_packet->version,
+		has_directlink_flag);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+		if (hard_iface->if_status != IF_ACTIVE)
+			continue;
+
+		if (hard_iface->soft_iface != if_incoming->soft_iface)
+			continue;
+
+		if (compare_eth(ethhdr->h_source,
+				hard_iface->net_dev->dev_addr))
+			is_my_addr = 1;
+
+		if (compare_eth(batman_packet->orig,
+				hard_iface->net_dev->dev_addr))
+			is_my_orig = 1;
+
+		if (compare_eth(batman_packet->prev_sender,
+				hard_iface->net_dev->dev_addr))
+			is_my_oldorig = 1;
+
+		if (compare_eth(ethhdr->h_source, broadcast_addr))
+			is_broadcast = 1;
+	}
+	rcu_read_unlock();
+
+	if (batman_packet->version != COMPAT_VERSION) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: incompatible batman version (%i)\n",
+			batman_packet->version);
+		return;
+	}
+
+	if (is_my_addr) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: received my own broadcast (sender: %pM"
+			")\n",
+			ethhdr->h_source);
+		return;
+	}
+
+	if (is_broadcast) {
+		bat_dbg(DBG_BATMAN, bat_priv, "Drop packet: "
+		"ignoring all packets with broadcast source addr (sender: %pM"
+		")\n", ethhdr->h_source);
+		return;
+	}
+
+	if (is_my_orig) {
+		unsigned long *word;
+		int offset;
+
+		orig_neigh_node = get_orig_node(bat_priv, ethhdr->h_source);
+		if (!orig_neigh_node)
+			return;
+
+		/* neighbor has to indicate direct link and it has to come via
+		 * the corresponding interface; if received seqno equals last
+		 * send seqno save new seqno for bidirectional check */
+		if (has_directlink_flag &&
+		    compare_eth(if_incoming->net_dev->dev_addr,
+				batman_packet->orig) &&
+		    (batman_packet->seqno - if_incoming_seqno + 2 == 0)) {
+			offset = if_incoming->if_num * NUM_WORDS;
+
+			spin_lock_bh(&orig_neigh_node->ogm_cnt_lock);
+			word = &(orig_neigh_node->bcast_own[offset]);
+			bit_mark(word, 0);
+			orig_neigh_node->bcast_own_sum[if_incoming->if_num] =
+				bit_packet_count(word);
+			spin_unlock_bh(&orig_neigh_node->ogm_cnt_lock);
+		}
+
+		bat_dbg(DBG_BATMAN, bat_priv, "Drop packet: "
+			"originator packet from myself (via neighbor)\n");
+		orig_node_free_ref(orig_neigh_node);
+		return;
+	}
+
+	if (is_my_oldorig) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: ignoring all rebroadcast echos (sender: "
+			"%pM)\n", ethhdr->h_source);
+		return;
+	}
+
+	orig_node = get_orig_node(bat_priv, batman_packet->orig);
+	if (!orig_node)
+		return;
+
+	/* count_real_packets() hands back -1 through a plain char; keeping
+	 * the result in a signed char preserves that value on architectures
+	 * where char is unsigned, so the check below can actually match */
+	is_duplicate = count_real_packets(ethhdr, batman_packet, if_incoming);
+
+	if (is_duplicate == -1) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: packet within seqno protection time "
+			"(sender: %pM)\n", ethhdr->h_source);
+		goto out;
+	}
+
+	if (batman_packet->tq == 0) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: originator packet with tq equal 0\n");
+		goto out;
+	}
+
+	router = orig_node_get_router(orig_node);
+	if (router)
+		router_router = orig_node_get_router(router->orig_node);
+
+	/* avoid temporary routing loops */
+	if (router && router_router &&
+	    (compare_eth(router->addr, batman_packet->prev_sender)) &&
+	    !(compare_eth(batman_packet->orig, batman_packet->prev_sender)) &&
+	    (compare_eth(router->addr, router_router->addr))) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: ignoring all rebroadcast packets that "
+			"may make me loop (sender: %pM)\n", ethhdr->h_source);
+		goto out;
+	}
+
+	/* if sender is a direct neighbor the sender mac equals originator mac */
+	orig_neigh_node = (is_single_hop_neigh ?
+			   orig_node :
+			   get_orig_node(bat_priv, ethhdr->h_source));
+	if (!orig_neigh_node)
+		goto out;
+
+	orig_neigh_router = orig_node_get_router(orig_neigh_node);
+
+	/* drop packet if sender is not a direct neighbor and if we
+	 * don't route towards it */
+	if (!is_single_hop_neigh && (!orig_neigh_router)) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: OGM via unknown neighbor!\n");
+		goto out_neigh;
+	}
+
+	is_bidirectional = is_bidirectional_neigh(orig_node, orig_neigh_node,
+						batman_packet, if_incoming);
+
+	bonding_save_primary(orig_node, orig_neigh_node, batman_packet);
+
+	/* update ranking if it is not a duplicate or has the same
+	 * seqno and similar ttl as the non-duplicate */
+	if (is_bidirectional &&
+	    (!is_duplicate ||
+	     ((orig_node->last_real_seqno == batman_packet->seqno) &&
+	      (orig_node->last_ttl - 3 <= batman_packet->ttl))))
+		update_orig(bat_priv, orig_node, ethhdr, batman_packet,
+			    if_incoming, tt_buff, tt_buff_len, is_duplicate);
+
+	/* is single hop (direct) neighbor */
+	if (is_single_hop_neigh) {
+		/* mark direct link on incoming interface */
+		schedule_forward_packet(orig_node, ethhdr, batman_packet,
+					1, tt_buff_len, if_incoming);
+
+		bat_dbg(DBG_BATMAN, bat_priv, "Forwarding packet: "
+			"rebroadcast neighbor packet with direct link flag\n");
+		goto out_neigh;
+	}
+
+	/* multihop originator */
+	if (!is_bidirectional) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: not received via bidirectional link\n");
+		goto out_neigh;
+	}
+
+	if (is_duplicate) {
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"Drop packet: duplicate packet received\n");
+		goto out_neigh;
+	}
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Forwarding packet: rebroadcast originator packet\n");
+	schedule_forward_packet(orig_node, ethhdr, batman_packet,
+				0, tt_buff_len, if_incoming);
+
+out_neigh:
+	if ((orig_neigh_node) && (!is_single_hop_neigh))
+		orig_node_free_ref(orig_neigh_node);
+out:
+	if (router)
+		neigh_node_free_ref(router);
+	if (router_router)
+		neigh_node_free_ref(router_router);
+	if (orig_neigh_router)
+		neigh_node_free_ref(orig_neigh_router);
+
+	orig_node_free_ref(orig_node);
+}
+/* sanity checks a received skb and passes it to receive_aggr_bat_packet() */
+int recv_bat_packet(struct sk_buff *skb, struct hard_iface *hard_iface)
+{
+	struct ethhdr *ethhdr;
+
+	/* drop packet if it has not necessary minimum size */
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct batman_packet))))
+		return NET_RX_DROP;
+
+	ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+	/* packet with broadcast indication but unicast recipient */
+	if (!is_broadcast_ether_addr(ethhdr->h_dest))
+		return NET_RX_DROP;
+
+	/* packet with broadcast sender address */
+	if (is_broadcast_ether_addr(ethhdr->h_source))
+		return NET_RX_DROP;
+
+	/* create a copy of the skb, if needed, to modify it. */
+	if (skb_cow(skb, 0) < 0)
+		return NET_RX_DROP;
+
+	/* keep skb linear: skb->data and skb_headlen() are handed over below */
+	if (skb_linearize(skb) < 0)
+		return NET_RX_DROP;
+
+	ethhdr = (struct ethhdr *)skb_mac_header(skb); /* re-read the header */
+
+	receive_aggr_bat_packet(ethhdr,
+				skb->data,
+				skb_headlen(skb),
+				hard_iface);
+
+	kfree_skb(skb); /* only the success path frees the skb here */
+	return NET_RX_SUCCESS;
+}
+/* icmp packet for us: reply to echo requests, hand others to bat_socket */
+static int recv_my_icmp_packet(struct bat_priv *bat_priv,
+			       struct sk_buff *skb, size_t icmp_len)
+{
+	struct hard_iface *primary_if = NULL;
+	struct orig_node *orig_node = NULL;
+	struct neigh_node *router = NULL;
+	struct icmp_packet_rr *icmp_packet;
+	int ret = NET_RX_DROP;
+
+	icmp_packet = (struct icmp_packet_rr *)skb->data;
+
+	/* add data to device queue; ret stays NET_RX_DROP on this path */
+	if (icmp_packet->msg_type != ECHO_REQUEST) {
+		bat_socket_receive_packet(icmp_packet, icmp_len);
+		goto out;
+	}
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto out;
+
+	/* answer echo request (ping) */
+	/* get routing information for the sender of the request */
+	orig_node = orig_hash_find(bat_priv, icmp_packet->orig);
+	if (!orig_node)
+		goto out;
+
+	router = orig_node_get_router(orig_node);
+	if (!router)
+		goto out;
+
+	/* create a copy of the skb, if needed, to modify it. */
+	if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+		goto out;
+
+	icmp_packet = (struct icmp_packet_rr *)skb->data; /* re-read */
+
+	memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
+	memcpy(icmp_packet->orig, primary_if->net_dev->dev_addr, ETH_ALEN);
+	icmp_packet->msg_type = ECHO_REPLY;
+	icmp_packet->ttl = TTL;
+
+	send_skb_packet(skb, router->if_incoming, router->addr);
+	ret = NET_RX_SUCCESS;
+
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	if (router)
+		neigh_node_free_ref(router);
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	return ret;
+}
+/* ttl ran out: answer echo requests with TTL_EXCEEDED sent back to orig */
+static int recv_icmp_ttl_exceeded(struct bat_priv *bat_priv,
+				  struct sk_buff *skb)
+{
+	struct hard_iface *primary_if = NULL;
+	struct orig_node *orig_node = NULL;
+	struct neigh_node *router = NULL;
+	struct icmp_packet *icmp_packet;
+	int ret = NET_RX_DROP;
+
+	icmp_packet = (struct icmp_packet *)skb->data;
+
+	/* send TTL exceeded if packet is an echo request (traceroute) */
+	if (icmp_packet->msg_type != ECHO_REQUEST) {
+		pr_debug("Warning - can't forward icmp packet from %pM to "
+			 "%pM: ttl exceeded\n", icmp_packet->orig,
+			 icmp_packet->dst);
+		goto out;
+	}
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto out;
+
+	/* get routing information for the sender of the request */
+	orig_node = orig_hash_find(bat_priv, icmp_packet->orig);
+	if (!orig_node)
+		goto out;
+
+	router = orig_node_get_router(orig_node);
+	if (!router)
+		goto out;
+
+	/* create a copy of the skb, if needed, to modify it. */
+	if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+		goto out;
+
+	icmp_packet = (struct icmp_packet *)skb->data; /* re-read */
+
+	memcpy(icmp_packet->dst, icmp_packet->orig, ETH_ALEN);
+	memcpy(icmp_packet->orig, primary_if->net_dev->dev_addr, ETH_ALEN);
+	icmp_packet->msg_type = TTL_EXCEEDED;
+	icmp_packet->ttl = TTL;
+
+	send_skb_packet(skb, router->if_incoming, router->addr);
+	ret = NET_RX_SUCCESS;
+
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	if (router)
+		neigh_node_free_ref(router);
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	return ret;
+}
+
+/* receives an icmp packet: records the route if there is room for it, then
+ * delivers it locally, answers an expired ttl or forwards it to its dst */
+int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if)
+{
+	struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+	struct icmp_packet_rr *icmp_packet;
+	struct ethhdr *ethhdr;
+	struct orig_node *orig_node = NULL;
+	struct neigh_node *router = NULL;
+	int hdr_size = sizeof(struct icmp_packet);
+	int ret = NET_RX_DROP;
+
+	/* truncate all incoming icmp packets that don't match our size */
+	if (skb->len >= sizeof(struct icmp_packet_rr))
+		hdr_size = sizeof(struct icmp_packet_rr);
+
+	/* drop packet if it has not necessary minimum size */
+	if (unlikely(!pskb_may_pull(skb, hdr_size)))
+		goto out;
+
+	ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+	/* unicast packet sent to, or coming from, the broadcast address */
+	if (is_broadcast_ether_addr(ethhdr->h_dest) ||
+	    is_broadcast_ether_addr(ethhdr->h_source))
+		goto out;
+
+	/* not for me */
+	if (!is_my_mac(ethhdr->h_dest))
+		goto out;
+
+	icmp_packet = (struct icmp_packet_rr *)skb->data;
+
+	/* add record route information if not full */
+	if ((hdr_size == sizeof(struct icmp_packet_rr)) &&
+	    (icmp_packet->rr_cur < BAT_RR_LEN)) {
+		/* get a private copy before writing; pointers may move */
+		if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+			goto out;
+		icmp_packet = (struct icmp_packet_rr *)skb->data;
+		ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+		memcpy(&(icmp_packet->rr[icmp_packet->rr_cur]),
+			ethhdr->h_dest, ETH_ALEN);
+		icmp_packet->rr_cur++;
+	}
+
+	/* packet for me */
+	if (is_my_mac(icmp_packet->dst))
+		return recv_my_icmp_packet(bat_priv, skb, hdr_size);
+
+	/* TTL exceeded */
+	if (icmp_packet->ttl < 2)
+		return recv_icmp_ttl_exceeded(bat_priv, skb);
+
+	/* get routing information */
+	orig_node = orig_hash_find(bat_priv, icmp_packet->dst);
+	if (!orig_node)
+		goto out;
+
+	router = orig_node_get_router(orig_node);
+	if (!router)
+		goto out;
+
+	/* create a copy of the skb, if needed, to modify it. */
+	if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+		goto out;
+
+	icmp_packet = (struct icmp_packet_rr *)skb->data;
+
+	/* decrement ttl and route it */
+	icmp_packet->ttl--;
+	send_skb_packet(skb, router->if_incoming, router->addr);
+	ret = NET_RX_SUCCESS;
+
+out:
+	if (router)
+		neigh_node_free_ref(router);
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	return ret;
+}
+
+/* In the bonding case, send the packets in a round
+ * robin fashion over the remaining interfaces.
+ * Returns NULL if the list is empty or no reference could be taken.
+ * This method rotates the bonding list and increases the
+ * returned router's refcount. */
+static struct neigh_node *find_bond_router(struct orig_node *primary_orig,
+					   struct hard_iface *recv_if)
+{
+	struct neigh_node *tmp_neigh_node;
+	struct neigh_node *router = NULL, *first_candidate = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_neigh_node, &primary_orig->bond_list,
+				bonding_list) {
+		if (!first_candidate)
+			first_candidate = tmp_neigh_node;
+
+		/* recv_if == NULL on the first node. */
+		if (tmp_neigh_node->if_incoming == recv_if)
+			continue;
+
+		if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+			continue;
+
+		router = tmp_neigh_node; /* first usable candidate wins */
+		break;
+	}
+
+	/* use the first candidate if nothing was found. */
+	if (!router && first_candidate &&
+	    atomic_inc_not_zero(&first_candidate->refcount))
+		router = first_candidate;
+
+	if (!router)
+		goto out;
+
+	/* selected should point to the next element
+	 * after the current router */
+	spin_lock_bh(&primary_orig->neigh_list_lock);
+	/* this is a list_move(), which unfortunately
+	 * does not exist as rcu version */
+	list_del_rcu(&primary_orig->bond_list);
+	list_add_rcu(&primary_orig->bond_list,
+		     &router->bonding_list);
+	spin_unlock_bh(&primary_orig->neigh_list_lock);
+
+out:
+	rcu_read_unlock();
+	return router;
+}
+
+/* Interface Alternating: Use the best of the
+ * remaining candidates which are not using
+ * this interface ("best" = highest tq_avg).
+ *
+ * Increases the returned router's refcount */
+static struct neigh_node *find_ifalter_router(struct orig_node *primary_orig,
+					      struct hard_iface *recv_if)
+{
+	struct neigh_node *tmp_neigh_node;
+	struct neigh_node *router = NULL, *first_candidate = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_neigh_node, &primary_orig->bond_list,
+				bonding_list) {
+		if (!first_candidate)
+			first_candidate = tmp_neigh_node;
+
+		/* recv_if == NULL on the first node. */
+		if (tmp_neigh_node->if_incoming == recv_if)
+			continue;
+
+		if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+			continue;
+
+		/* no router yet or this one is better: choose it
+		 * and take a second reference on its behalf */
+		if ((!router) ||
+		    (tmp_neigh_node->tq_avg > router->tq_avg)) {
+			/* decrement refcount of
+			 * previously selected router */
+			if (router)
+				neigh_node_free_ref(router);
+
+			router = tmp_neigh_node;
+			atomic_inc_not_zero(&router->refcount);
+		}
+
+		neigh_node_free_ref(tmp_neigh_node); /* drop loop reference */
+	}
+
+	/* use the first candidate if nothing was found. */
+	if (!router && first_candidate &&
+	    atomic_inc_not_zero(&first_candidate->refcount))
+		router = first_candidate;
+
+	rcu_read_unlock();
+	return router;
+}
+
+/* find a suitable router for this originator, and use bonding or
+ * interface alternating if possible. increases the found neighbor's
+ * refcount; returns NULL if no router is available. */
+struct neigh_node *find_router(struct bat_priv *bat_priv,
+			       struct orig_node *orig_node,
+			       struct hard_iface *recv_if)
+{
+	struct orig_node *primary_orig_node;
+	struct orig_node *router_orig;
+	struct neigh_node *router;
+	static uint8_t zero_mac[ETH_ALEN] = {0, 0, 0, 0, 0, 0};
+	int bonding_enabled;
+
+	if (!orig_node)
+		return NULL;
+
+	router = orig_node_get_router(orig_node);
+	if (!router)
+		goto err;
+
+	/* without bonding, the first node should
+	 * always choose the default router. */
+	bonding_enabled = atomic_read(&bat_priv->bonding);
+
+	rcu_read_lock();
+	/* select default router to output */
+	router_orig = router->orig_node;
+	if (!router_orig)
+		goto err_unlock;
+
+	if ((!recv_if) && (!bonding_enabled))
+		goto return_router;
+
+	/* only if we have something in the primary_addr, we can search
+	 * for a potential bonding candidate. */
+	if (compare_eth(router_orig->primary_addr, zero_mac))
+		goto return_router;
+
+	/* find the orig_node which has the primary interface. might
+	 * even be the same as our router_orig in many cases */
+
+	if (compare_eth(router_orig->primary_addr, router_orig->orig)) {
+		primary_orig_node = router_orig;
+	} else {
+		primary_orig_node = orig_hash_find(bat_priv,
+						   router_orig->primary_addr);
+		if (!primary_orig_node)
+			goto return_router;
+
+		orig_node_free_ref(primary_orig_node);
+	}
+
+	/* fewer than 2 candidates: no bonding, keep the original router.
+	 * NOTE(review): primary_orig_node's ref may be dropped - confirm */
+	if (atomic_read(&primary_orig_node->bond_candidates) < 2)
+		goto return_router;
+
+	/* all nodes between should choose a candidate which
+	 * is not on the interface where the packet came
+	 * in. */
+
+	neigh_node_free_ref(router); /* replaced by the candidate below */
+
+	if (bonding_enabled)
+		router = find_bond_router(primary_orig_node, recv_if);
+	else
+		router = find_ifalter_router(primary_orig_node, recv_if);
+
+return_router:
+	rcu_read_unlock();
+	return router;
+err_unlock:
+	rcu_read_unlock();
+err:
+	if (router)
+		neigh_node_free_ref(router);
+	return NULL;
+}
+/* basic sanity checks shared by the unicast receive handlers.
+ * returns 0 if the packet may be processed further,
+ * -1 if the packet has to be dropped */
+static int check_unicast_packet(struct sk_buff *skb, int hdr_size)
+{
+	struct ethhdr *mac_hdr;
+	int is_bogus;
+
+	/* drop packet if it has not necessary minimum size */
+	if (unlikely(!pskb_may_pull(skb, hdr_size)))
+		return -1;
+
+	mac_hdr = (struct ethhdr *)skb_mac_header(skb);
+
+	/* a unicast packet must neither be sent to
+	 * nor come from the broadcast address */
+	is_bogus = is_broadcast_ether_addr(mac_hdr->h_dest) ||
+		   is_broadcast_ether_addr(mac_hdr->h_source);
+
+	/* bogus addresses, or not for me */
+	if (is_bogus || !is_my_mac(mac_hdr->h_dest))
+		return -1;
+
+	return 0;
+}
+/* forwards a unicast (or fragment) packet towards unicast_packet->dest */
+int route_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if)
+{
+	struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+	struct orig_node *orig_node = NULL;
+	struct neigh_node *neigh_node = NULL;
+	struct unicast_packet *unicast_packet;
+	struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+	int ret = NET_RX_DROP;
+	struct sk_buff *new_skb;
+
+	unicast_packet = (struct unicast_packet *)skb->data;
+
+	/* TTL exceeded */
+	if (unicast_packet->ttl < 2) {
+		pr_debug("Warning - can't forward unicast packet from %pM to "
+			 "%pM: ttl exceeded\n", ethhdr->h_source,
+			 unicast_packet->dest);
+		goto out;
+	}
+
+	/* get routing information */
+	orig_node = orig_hash_find(bat_priv, unicast_packet->dest);
+
+	if (!orig_node)
+		goto out;
+
+	/* find_router() increases neigh_nodes refcount if found. */
+	neigh_node = find_router(bat_priv, orig_node, recv_if);
+
+	if (!neigh_node)
+		goto out;
+
+	/* create a copy of the skb, if needed, to modify it. */
+	if (skb_cow(skb, sizeof(struct ethhdr)) < 0)
+		goto out;
+
+	unicast_packet = (struct unicast_packet *)skb->data; /* re-read */
+
+	if (unicast_packet->packet_type == BAT_UNICAST &&
+	    atomic_read(&bat_priv->fragmentation) &&
+	    skb->len > neigh_node->if_incoming->net_dev->mtu) {
+		ret = frag_send_skb(skb, bat_priv,
+				    neigh_node->if_incoming, neigh_node->addr);
+		goto out;
+	}
+
+	if (unicast_packet->packet_type == BAT_UNICAST_FRAG &&
+	    frag_can_reassemble(skb, neigh_node->if_incoming->net_dev->mtu)) {
+
+		ret = frag_reassemble_skb(skb, bat_priv, &new_skb);
+
+		if (ret == NET_RX_DROP)
+			goto out;
+
+		/* packet was buffered for late merge */
+		if (!new_skb) {
+			ret = NET_RX_SUCCESS;
+			goto out;
+		}
+
+		skb = new_skb; /* continue with the skb handed back above */
+		unicast_packet = (struct unicast_packet *)skb->data;
+	}
+
+	/* decrement ttl */
+	unicast_packet->ttl--;
+
+	/* route it */
+	send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+	ret = NET_RX_SUCCESS;
+
+out:
+	if (neigh_node)
+		neigh_node_free_ref(neigh_node);
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	return ret;
+}
+/* receive handler for unicast packets: deliver locally or route onwards */
+int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if)
+{
+	int hdr_size = sizeof(struct unicast_packet);
+	struct unicast_packet *hdr;
+
+	if (check_unicast_packet(skb, hdr_size) < 0)
+		return NET_RX_DROP;
+
+	hdr = (struct unicast_packet *)skb->data;
+
+	/* not for me: pass it on towards its destination */
+	if (!is_my_mac(hdr->dest))
+		return route_unicast_packet(skb, recv_if);
+
+	/* packet for me */
+	interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size);
+	return NET_RX_SUCCESS;
+}
+/* fragment for us: try to reassemble and deliver; otherwise route it on */
+int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if)
+{
+	struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+	struct unicast_frag_packet *unicast_packet;
+	int hdr_size = sizeof(struct unicast_frag_packet);
+	struct sk_buff *new_skb = NULL;
+	int ret;
+
+	if (check_unicast_packet(skb, hdr_size) < 0)
+		return NET_RX_DROP;
+
+	unicast_packet = (struct unicast_frag_packet *)skb->data;
+
+	/* packet for me */
+	if (is_my_mac(unicast_packet->dest)) {
+
+		ret = frag_reassemble_skb(skb, bat_priv, &new_skb);
+
+		if (ret == NET_RX_DROP)
+			return NET_RX_DROP;
+
+		/* packet was buffered for late merge */
+		if (!new_skb)
+			return NET_RX_SUCCESS;
+
+		interface_rx(recv_if->soft_iface, new_skb, recv_if,
+			     sizeof(struct unicast_packet));
+		return NET_RX_SUCCESS;
+	}
+
+	return route_unicast_packet(skb, recv_if);
+}
+
+/* receives a broadcast: drops duplicates, rebroadcasts and delivers it */
+int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if)
+{
+	struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+	struct orig_node *orig_node = NULL;
+	struct bcast_packet *bcast_packet;
+	struct ethhdr *ethhdr;
+	int hdr_size = sizeof(struct bcast_packet);
+	int ret = NET_RX_DROP;
+	int32_t seq_diff;
+
+	/* drop packet if it has not necessary minimum size */
+	if (unlikely(!pskb_may_pull(skb, hdr_size)))
+		goto out;
+
+	ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+	/* packet with broadcast indication but unicast recipient */
+	if (!is_broadcast_ether_addr(ethhdr->h_dest))
+		goto out;
+
+	/* packet with broadcast sender address */
+	if (is_broadcast_ether_addr(ethhdr->h_source))
+		goto out;
+
+	/* ignore broadcasts sent by myself */
+	if (is_my_mac(ethhdr->h_source))
+		goto out;
+
+	bcast_packet = (struct bcast_packet *)skb->data;
+
+	/* ignore broadcasts originated by myself */
+	if (is_my_mac(bcast_packet->orig))
+		goto out;
+
+	if (bcast_packet->ttl < 2) /* ttl used up, neither deliver nor relay */
+		goto out;
+
+	orig_node = orig_hash_find(bat_priv, bcast_packet->orig);
+
+	if (!orig_node)
+		goto out;
+
+	spin_lock_bh(&orig_node->bcast_seqno_lock);
+
+	/* check whether the packet is a duplicate */
+	if (get_bit_status(orig_node->bcast_bits, orig_node->last_bcast_seqno,
+			   ntohl(bcast_packet->seqno)))
+		goto spin_unlock;
+
+	seq_diff = ntohl(bcast_packet->seqno) - orig_node->last_bcast_seqno;
+
+	/* check whether the packet is old and the host just restarted. */
+	if (window_protected(bat_priv, seq_diff,
+			     &orig_node->bcast_seqno_reset))
+		goto spin_unlock;
+
+	/* mark broadcast in flood history, update window position
+	 * if required. */
+	if (bit_get_packet(bat_priv, orig_node->bcast_bits, seq_diff, 1))
+		orig_node->last_bcast_seqno = ntohl(bcast_packet->seqno);
+
+	spin_unlock_bh(&orig_node->bcast_seqno_lock);
+
+	/* rebroadcast packet */
+	add_bcast_packet_to_list(bat_priv, skb);
+
+	/* broadcast for me */
+	interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size);
+	ret = NET_RX_SUCCESS;
+	goto out;
+
+spin_unlock:
+	spin_unlock_bh(&orig_node->bcast_seqno_lock);
+out:
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	return ret;
+}
+/* dispatches a received vis packet by vis_type; always returns NET_RX_DROP */
+int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if)
+{
+	struct vis_packet *vis_packet;
+	struct ethhdr *ethhdr;
+	struct bat_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+	int hdr_size = sizeof(struct vis_packet);
+
+	/* keep skb linear */
+	if (skb_linearize(skb) < 0)
+		return NET_RX_DROP;
+
+	if (unlikely(!pskb_may_pull(skb, hdr_size))) /* too short */
+		return NET_RX_DROP;
+
+	vis_packet = (struct vis_packet *)skb->data;
+	ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+	/* not for me */
+	if (!is_my_mac(ethhdr->h_dest))
+		return NET_RX_DROP;
+
+	/* ignore own packets */
+	if (is_my_mac(vis_packet->vis_orig))
+		return NET_RX_DROP;
+
+	if (is_my_mac(vis_packet->sender_orig))
+		return NET_RX_DROP;
+
+	switch (vis_packet->vis_type) {
+	case VIS_TYPE_SERVER_SYNC:
+		receive_server_sync_packet(bat_priv, vis_packet,
+					   skb_headlen(skb));
+		break;
+
+	case VIS_TYPE_CLIENT_UPDATE:
+		receive_client_update_packet(bat_priv, vis_packet,
+					     skb_headlen(skb));
+		break;
+
+	default:	/* ignore unknown packet */
+		break;
+	}
+
+	/* We take a copy of the data in the packet, so we should
+	   always free the skbuf (hence NET_RX_DROP even on success). */
+	return NET_RX_DROP;
+}
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
new file mode 100644
index 00000000..870f2984
--- /dev/null
+++ b/net/batman-adv/routing.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_ROUTING_H_
+#define _NET_BATMAN_ADV_ROUTING_H_
+
+void slide_own_bcast_window(struct hard_iface *hard_iface);
+void receive_bat_packet(struct ethhdr *ethhdr,
+				struct batman_packet *batman_packet,
+				unsigned char *tt_buff, int tt_buff_len,
+				struct hard_iface *if_incoming);
+void update_routes(struct bat_priv *bat_priv, struct orig_node *orig_node,
+		   struct neigh_node *neigh_node, unsigned char *tt_buff,
+		   int tt_buff_len);
+int route_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+int recv_bat_packet(struct sk_buff *skb, struct hard_iface *recv_if);
+struct neigh_node *find_router(struct bat_priv *bat_priv,
+			       struct orig_node *orig_node,
+			       struct hard_iface *recv_if);
+void bonding_candidate_del(struct orig_node *orig_node,
+			   struct neigh_node *neigh_node);
+
+#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
new file mode 100644
index 00000000..33779278
--- /dev/null
+++ b/net/batman-adv/send.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "send.h"
+#include "routing.h"
+#include "translation-table.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "vis.h"
+#include "aggregation.h"
+#include "gateway_common.h"
+#include "originator.h"
+
+static void send_outstanding_bcast_packet(struct work_struct *work);
+
+/* scale tq by (TQ_MAX_VALUE - configured hop penalty) / TQ_MAX_VALUE */
+static uint8_t hop_penalty(const uint8_t tq, struct bat_priv *bat_priv)
+{
+	const int keep = TQ_MAX_VALUE - atomic_read(&bat_priv->hop_penalty);
+	return (tq * keep) / TQ_MAX_VALUE;
+}
+
+/* when do we schedule our own packet: now + orig_interval +- JITTER msecs */
+static unsigned long own_send_time(struct bat_priv *bat_priv)
+{
+	return jiffies + msecs_to_jiffies(
+		   atomic_read(&bat_priv->orig_interval) -
+		   JITTER + (random32() % (2 * JITTER))); /* not (x % 2) * J */
+}
+
+/* forwarded packets go out after a random delay of [0, JITTER/2) msecs */
+static unsigned long forward_send_time(void)
+{
+	return msecs_to_jiffies(random32() % (JITTER / 2)) + jiffies;
+}
+
+/* Prepend an ethernet header (our mac -> dst_addr, ETH_P_BATMAN) to skb and
+ * queue it on hard_iface's net_device. On every early-exit path the skb is
+ * freed here and NET_XMIT_DROP is returned; otherwise the result of
+ * dev_queue_xmit() is passed on. */
+int send_skb_packet(struct sk_buff *skb,
+				struct hard_iface *hard_iface,
+				uint8_t *dst_addr)
+{
+	struct ethhdr *eth;
+
+	/* only an active interface with a net_device attached may send */
+	if ((hard_iface->if_status != IF_ACTIVE) ||
+	    unlikely(!hard_iface->net_dev))
+		goto send_skb_err;
+
+	if (!(hard_iface->net_dev->flags & IFF_UP)) {
+		pr_warning("Interface %s is not up - can't send packet via "
+			   "that interface!\n", hard_iface->net_dev->name);
+		goto send_skb_err;
+	}
+
+	/* make room for the ethernet header in front of the payload */
+	if (my_skb_head_push(skb, sizeof(struct ethhdr)) < 0)
+		goto send_skb_err;
+
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, ETH_HLEN);
+
+	eth = (struct ethhdr *) skb_mac_header(skb);
+	memcpy(eth->h_dest, dst_addr, ETH_ALEN);
+	memcpy(eth->h_source, hard_iface->net_dev->dev_addr, ETH_ALEN);
+	eth->h_proto = __constant_htons(ETH_P_BATMAN);
+
+	skb->dev = hard_iface->net_dev;
+	skb->protocol = __constant_htons(ETH_P_BATMAN);
+	skb->priority = TC_PRIO_CONTROL;
+
+	/* a negative result of dev_queue_xmit() is an error. A drop caused by
+	 * congestion or traffic shaping yields NET_XMIT_DROP (which is > 0)
+	 * and is deliberately not treated as an error. */
+	return dev_queue_xmit(skb);
+
+send_skb_err:
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+}
+
+/* Update DIRECTLINK in each aggregated OGM, then send a clone via hard_iface */
+static void send_packet_to_if(struct forw_packet *forw_packet,
+			      struct hard_iface *hard_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	char *fwd_str;
+	uint8_t packet_num;
+	int16_t buff_pos;
+	struct batman_packet *batman_packet;
+	struct sk_buff *skb;
+
+	if (hard_iface->if_status != IF_ACTIVE)
+		return;
+
+	packet_num = 0;
+	buff_pos = 0;
+	batman_packet = (struct batman_packet *)forw_packet->skb->data;
+
+	/* walk all packets aggregated in the skb: adjust flags and log them */
+	while (aggregated_packet(buff_pos,
+				 forw_packet->packet_len,
+				 batman_packet->num_tt)) {
+
+		/* DIRECTLINK is kept only for packets marked in
+		 * direct_link_flags, and only on their incoming interface */
+		if ((forw_packet->direct_link_flags & (1 << packet_num)) &&
+		    (forw_packet->if_incoming == hard_iface))
+			batman_packet->flags |= DIRECTLINK;
+		else
+			batman_packet->flags &= ~DIRECTLINK;
+
+		fwd_str = (packet_num > 0 ? "Forwarding" : (forw_packet->own ?
+							    "Sending own" :
+							    "Forwarding"));
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"%s %spacket (originator %pM, seqno %d, TQ %d, TTL %d,"
+			" IDF %s) on interface %s [%pM]\n",
+			fwd_str, (packet_num > 0 ? "aggregated " : ""),
+			batman_packet->orig, ntohl(batman_packet->seqno),
+			batman_packet->tq, batman_packet->ttl,
+			(batman_packet->flags & DIRECTLINK ?
+			 "on" : "off"),
+			hard_iface->net_dev->name,
+			hard_iface->net_dev->dev_addr);
+		/* next packet starts after header + num_tt * ETH_ALEN bytes */
+		buff_pos += sizeof(struct batman_packet) +
+			(batman_packet->num_tt * ETH_ALEN);
+		packet_num++;
+		batman_packet = (struct batman_packet *)
+			(forw_packet->skb->data + buff_pos);
+	}
+
+	/* send a clone: the caller passes the same skb for every interface */
+	skb = skb_clone(forw_packet->skb, GFP_ATOMIC);
+	if (skb)
+		send_skb_packet(skb, hard_iface, broadcast_addr);
+}
+
+/* send a queued packet on its incoming interface only or on all of them */
+static void send_packet(struct forw_packet *forw_packet)
+{
+	struct hard_iface *hard_iface;
+	struct net_device *soft_iface;
+	struct bat_priv *bat_priv;
+	struct batman_packet *batman_packet =
+		(struct batman_packet *)(forw_packet->skb->data);
+	unsigned char directlink = (batman_packet->flags & DIRECTLINK ? 1 : 0);
+
+	if (!forw_packet->if_incoming) {
+		pr_err("Error - can't forward packet: incoming iface not "
+		       "specified\n");
+		return;
+	}
+
+	soft_iface = forw_packet->if_incoming->soft_iface;
+	bat_priv = netdev_priv(soft_iface);
+
+	if (forw_packet->if_incoming->if_status != IF_ACTIVE)
+		return;
+
+	/* direct link packet with ttl 1: multihomed peer assumed */
+	/* own OGMs of non-primary interfaces only go out on that interface */
+	if ((directlink && (batman_packet->ttl == 1)) ||
+	    (forw_packet->own && (forw_packet->if_incoming->if_num > 0))) {
+
+		/* FIXME: what about aggregated packets ? */
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"%s packet (originator %pM, seqno %d, TTL %d) "
+			"on interface %s [%pM]\n",
+			(forw_packet->own ? "Sending own" : "Forwarding"),
+			batman_packet->orig, ntohl(batman_packet->seqno),
+			batman_packet->ttl,
+			forw_packet->if_incoming->net_dev->name,
+			forw_packet->if_incoming->net_dev->dev_addr);
+
+		/* the skb is handed over here; forw_packet must not free it */
+		send_skb_packet(forw_packet->skb, forw_packet->if_incoming,
+				broadcast_addr);
+		forw_packet->skb = NULL;
+
+		return;
+	}
+
+	/* broadcast on every interface attached to the same soft interface */
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+		if (hard_iface->soft_iface != soft_iface)
+			continue;
+
+		send_packet_to_if(forw_packet, hard_iface);
+	}
+	rcu_read_unlock();
+}
+
+/* Re-create hard_iface's packet buffer: the existing batman_packet header
+ * followed by whatever tt_local_fill_buffer() writes for num_local_tt slots */
+static void rebuild_batman_packet(struct bat_priv *bat_priv,
+				  struct hard_iface *hard_iface)
+{
+	const int hdr_len = sizeof(struct batman_packet);
+	int tt_len = bat_priv->num_local_tt * ETH_ALEN;
+	struct batman_packet *new_packet;
+	unsigned char *buff;
+
+	buff = kmalloc(hdr_len + tt_len, GFP_ATOMIC);
+
+	/* allocation failed: keep using the old buffer */
+	if (!buff)
+		return;
+
+	/* carry the header over, then let the tt code fill the remainder */
+	memcpy(buff, hard_iface->packet_buff, hdr_len);
+	new_packet = (struct batman_packet *)buff;
+	new_packet->num_tt = tt_local_fill_buffer(bat_priv, buff + hdr_len,
+						  tt_len);
+
+	kfree(hard_iface->packet_buff);
+	hard_iface->packet_buff = buff;
+	hard_iface->packet_len = hdr_len + tt_len;
+}
+/* finalize hard_iface's own packet (seqno, flags) and queue it for sending */
+void schedule_own_packet(struct hard_iface *hard_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+	struct hard_iface *primary_if;
+	unsigned long send_time;
+	struct batman_packet *batman_packet;
+	int vis_server;
+
+	if ((hard_iface->if_status == IF_NOT_IN_USE) ||
+	    (hard_iface->if_status == IF_TO_BE_REMOVED))
+		return;
+
+	vis_server = atomic_read(&bat_priv->vis_mode);
+	primary_if = primary_if_get_selected(bat_priv);
+
+	/**
+	 * the interface gets activated here to avoid race conditions between
+	 * the moment of activating the interface in
+	 * hardif_activate_interface() where the originator mac is set and
+	 * outdated packets (especially uninitialized mac addresses) in the
+	 * packet queue
+	 */
+	if (hard_iface->if_status == IF_TO_BE_ACTIVATED)
+		hard_iface->if_status = IF_ACTIVE;
+
+	/* only the primary interface carries the local tt: rebuild on change */
+	if ((atomic_read(&bat_priv->tt_local_changed)) &&
+	    (hard_iface == primary_if))
+		rebuild_batman_packet(bat_priv, hard_iface);
+
+	/**
+	 * NOTE: packet_buff might just have been re-allocated in
+	 * rebuild_batman_packet()
+	 */
+	batman_packet = (struct batman_packet *)hard_iface->packet_buff;
+
+	/* store the current sequence number in network byte order */
+	batman_packet->seqno =
+		htonl((uint32_t)atomic_read(&hard_iface->seqno));
+
+	if (vis_server == VIS_TYPE_SERVER_SYNC)
+		batman_packet->flags |= VIS_SERVER;
+	else
+		batman_packet->flags &= ~VIS_SERVER;
+	/* gw_bandwidth is only announced by the primary if of a gw server */
+	if ((hard_iface == primary_if) &&
+	    (atomic_read(&bat_priv->gw_mode) == GW_MODE_SERVER))
+		batman_packet->gw_flags =
+				(uint8_t)atomic_read(&bat_priv->gw_bandwidth);
+	else
+		batman_packet->gw_flags = 0;
+
+	atomic_inc(&hard_iface->seqno); /* for the next own packet */
+
+	slide_own_bcast_window(hard_iface);
+	send_time = own_send_time(bat_priv);
+	add_bat_packet_to_list(bat_priv,
+			       hard_iface->packet_buff,
+			       hard_iface->packet_len,
+			       hard_iface, 1, send_time);
+
+	if (primary_if)
+		hardif_free_ref(primary_if);
+}
+/* adjust ttl, tq and flags of a received packet and queue it for forwarding */
+void schedule_forward_packet(struct orig_node *orig_node,
+			     struct ethhdr *ethhdr,
+			     struct batman_packet *batman_packet,
+			     uint8_t directlink, int tt_buff_len,
+			     struct hard_iface *if_incoming)
+{
+	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct neigh_node *router;
+	unsigned char in_tq, in_ttl, tq_avg = 0;
+	unsigned long send_time;
+
+	if (batman_packet->ttl <= 1) {
+		bat_dbg(DBG_BATMAN, bat_priv, "ttl exceeded\n");
+		return;
+	}
+
+	router = orig_node_get_router(orig_node);
+
+	in_tq = batman_packet->tq;
+	in_ttl = batman_packet->ttl;
+
+	batman_packet->ttl--;
+	memcpy(batman_packet->prev_sender, ethhdr->h_source, ETH_ALEN);
+
+	/* rebroadcast tq of our best ranking neighbor to ensure the rebroadcast
+	 * of our best tq value */
+	if (router && router->tq_avg != 0) {
+
+		/* not sent by our best neighbor: use that one's tq and ttl */
+		if (!compare_eth(router->addr, ethhdr->h_source)) {
+			batman_packet->tq = router->tq_avg;
+
+			if (router->last_ttl)
+				batman_packet->ttl = router->last_ttl - 1;
+		}
+
+		tq_avg = router->tq_avg;
+	}
+
+	if (router)
+		neigh_node_free_ref(router);
+
+	/* apply hop penalty */
+	batman_packet->tq = hop_penalty(batman_packet->tq, bat_priv);
+
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Forwarding packet: tq_orig: %i, tq_avg: %i, "
+		"tq_forw: %i, ttl_orig: %i, ttl_forw: %i\n",
+		in_tq, tq_avg, batman_packet->tq, in_ttl - 1,
+		batman_packet->ttl);
+	/* NOTE(review): assumes the caller left seqno in host order - confirm */
+	batman_packet->seqno = htonl(batman_packet->seqno);
+
+	/* switch off primaries first hop flag when forwarding */
+	batman_packet->flags &= ~PRIMARIES_FIRST_HOP;
+	if (directlink)
+		batman_packet->flags |= DIRECTLINK;
+	else
+		batman_packet->flags &= ~DIRECTLINK;
+
+	send_time = forward_send_time();
+	add_bat_packet_to_list(bat_priv,
+			       (unsigned char *)batman_packet,
+			       sizeof(struct batman_packet) + tt_buff_len,
+			       if_incoming, 0, send_time);
+}
+/* free forw_packet together with the skb and interface reference it holds */
+static void forw_packet_free(struct forw_packet *forw_packet)
+{
+	if (forw_packet->skb) /* NULL once the skb was handed to the sender */
+		kfree_skb(forw_packet->skb);
+	if (forw_packet->if_incoming)
+		hardif_free_ref(forw_packet->if_incoming);
+	kfree(forw_packet);
+}
+
+/* link forw_packet into forw_bcast_list and arm its delayed work */
+static void _add_bcast_packet_to_list(struct bat_priv *bat_priv,
+				      struct forw_packet *forw_packet,
+				      unsigned long send_time)
+{
+	struct delayed_work *dwork = &forw_packet->delayed_work;
+
+	INIT_HLIST_NODE(&forw_packet->list);
+	INIT_DELAYED_WORK(dwork, send_outstanding_bcast_packet);
+
+	/* the work function unlinks the packet, so list it before queueing */
+	spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+	hlist_add_head(&forw_packet->list, &bat_priv->forw_bcast_list);
+	spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+	queue_delayed_work(bat_event_workqueue, dwork, send_time);
+}
+
+/* add a broadcast packet to the queue and setup timers. broadcast packets
+ * are sent multiple times to increase the probability of being received.
+ *
+ * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on
+ * errors (queue full, no primary interface, allocation failure).
+ *
+ * The skb is not consumed (a copy is queued), so the caller should make
+ * sure that the skb is freed. */
+int add_bcast_packet_to_list(struct bat_priv *bat_priv, struct sk_buff *skb)
+{
+	struct hard_iface *primary_if = NULL;
+	struct forw_packet *forw_packet;
+	struct bcast_packet *bcast_packet;
+
+	if (!atomic_dec_not_zero(&bat_priv->bcast_queue_left)) {
+		bat_dbg(DBG_BATMAN, bat_priv, "bcast packet queue full\n");
+		goto out;
+	}
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto out_and_inc;
+
+	forw_packet = kmalloc(sizeof(struct forw_packet), GFP_ATOMIC);
+
+	if (!forw_packet)
+		goto out_and_inc;
+
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (!skb)
+		goto packet_free;
+
+	/* as we have a copy now, it is safe to decrease the TTL */
+	bcast_packet = (struct bcast_packet *)skb->data;
+	bcast_packet->ttl--;
+
+	skb_reset_mac_header(skb);
+	/* the primary_if reference is handed over to forw_packet */
+	forw_packet->skb = skb;
+	forw_packet->if_incoming = primary_if;
+
+	/* how often did we send the bcast packet ? */
+	forw_packet->num_packets = 0;
+
+	_add_bcast_packet_to_list(bat_priv, forw_packet, 1); /* 1 jiffy */
+	return NETDEV_TX_OK;
+
+packet_free:
+	kfree(forw_packet);
+out_and_inc:
+	atomic_inc(&bat_priv->bcast_queue_left);
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	return NETDEV_TX_BUSY;
+}
+/* delayed work: (re)broadcast a queued bcast packet on all interfaces */
+static void send_outstanding_bcast_packet(struct work_struct *work)
+{
+	struct hard_iface *hard_iface;
+	struct delayed_work *delayed_work =
+		container_of(work, struct delayed_work, work);
+	struct forw_packet *forw_packet =
+		container_of(delayed_work, struct forw_packet, delayed_work);
+	struct sk_buff *skb1;
+	struct net_device *soft_iface = forw_packet->if_incoming->soft_iface;
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+
+	spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+	hlist_del(&forw_packet->list);
+	spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+	if (atomic_read(&bat_priv->mesh_state) == MESH_DEACTIVATING)
+		goto out;
+
+	/* rebroadcast packet on every interface of this soft interface */
+	rcu_read_lock();
+	list_for_each_entry_rcu(hard_iface, &hardif_list, list) {
+		if (hard_iface->soft_iface != soft_iface)
+			continue;
+
+		/* send a copy of the saved skb */
+		skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
+		if (skb1)
+			send_skb_packet(skb1, hard_iface, broadcast_addr);
+	}
+	rcu_read_unlock();
+
+	forw_packet->num_packets++;
+
+	/* up to 3 sends, 5 msecs apart ((5 * HZ) / 1000 is 0 for HZ < 200) */
+	if (forw_packet->num_packets < 3) {
+		_add_bcast_packet_to_list(bat_priv, forw_packet,
+					  msecs_to_jiffies(5));
+		return;
+	}
+
+out:
+	forw_packet_free(forw_packet);
+	atomic_inc(&bat_priv->bcast_queue_left);
+}
+/* delayed work: send a queued batman packet, re-arm our own one, free it */
+void send_outstanding_bat_packet(struct work_struct *work)
+{
+	struct delayed_work *delayed_work =
+		container_of(work, struct delayed_work, work);
+	struct forw_packet *forw_packet =
+		container_of(delayed_work, struct forw_packet, delayed_work);
+	struct bat_priv *bat_priv;
+
+	bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
+	spin_lock_bh(&bat_priv->forw_bat_list_lock);
+	hlist_del(&forw_packet->list);
+	spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+	if (atomic_read(&bat_priv->mesh_state) == MESH_DEACTIVATING)
+		goto out;
+
+	send_packet(forw_packet);
+
+	/**
+	 * we have to have at least one packet in the queue
+	 * to determine the queues wake up time unless we are
+	 * shutting down
+	 */
+	if (forw_packet->own)
+		schedule_own_packet(forw_packet->if_incoming);
+
+out:
+	/* own packets were not counted against batman_queue_left */
+	if (!forw_packet->own)
+		atomic_inc(&bat_priv->batman_queue_left);
+
+	forw_packet_free(forw_packet);
+}
+/* cancel the queued packets of hard_iface, or all of them if it is NULL */
+void purge_outstanding_packets(struct bat_priv *bat_priv,
+			       struct hard_iface *hard_iface)
+{
+	struct forw_packet *forw_packet;
+	struct hlist_node *tmp_node, *safe_tmp_node;
+	bool pending;
+
+	if (hard_iface)
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"purge_outstanding_packets(): %s\n",
+			hard_iface->net_dev->name);
+	else
+		bat_dbg(DBG_BATMAN, bat_priv,
+			"purge_outstanding_packets()\n");
+
+	/* free bcast list */
+	spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+	hlist_for_each_entry_safe(forw_packet, tmp_node, safe_tmp_node,
+				  &bat_priv->forw_bcast_list, list) {
+
+		/**
+		 * if purge_outstanding_packets() was called with an argument
+		 * we delete only packets belonging to the given interface
+		 */
+		if ((hard_iface) &&
+		    (forw_packet->if_incoming != hard_iface))
+			continue;
+
+		spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+		/**
+		 * send_outstanding_bcast_packet() locks the list to delete the
+		 * item and cancel_delayed_work_sync() may have to wait for it
+		 */
+		pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
+		spin_lock_bh(&bat_priv->forw_bcast_list_lock);
+		/* cancelled before it ran: nobody else frees this packet */
+		if (pending) {
+			hlist_del(&forw_packet->list);
+			forw_packet_free(forw_packet);
+		}
+	}
+	spin_unlock_bh(&bat_priv->forw_bcast_list_lock);
+
+	/* free batman packet list */
+	spin_lock_bh(&bat_priv->forw_bat_list_lock);
+	hlist_for_each_entry_safe(forw_packet, tmp_node, safe_tmp_node,
+				  &bat_priv->forw_bat_list, list) {
+
+		/**
+		 * if purge_outstanding_packets() was called with an argument
+		 * we delete only packets belonging to the given interface
+		 */
+		if ((hard_iface) &&
+		    (forw_packet->if_incoming != hard_iface))
+			continue;
+
+		spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+
+		/**
+		 * send_outstanding_bat_packet() locks the list to delete the
+		 * item and cancel_delayed_work_sync() may have to wait for it
+		 */
+		pending = cancel_delayed_work_sync(&forw_packet->delayed_work);
+		spin_lock_bh(&bat_priv->forw_bat_list_lock);
+		/* cancelled before it ran: nobody else frees this packet */
+		if (pending) {
+			hlist_del(&forw_packet->list);
+			forw_packet_free(forw_packet);
+		}
+	}
+	spin_unlock_bh(&bat_priv->forw_bat_list_lock);
+}
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
new file mode 100644
index 00000000..247172d7
--- /dev/null
+++ b/net/batman-adv/send.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_SEND_H_
+#define _NET_BATMAN_ADV_SEND_H_
+
+int send_skb_packet(struct sk_buff *skb,
+				struct hard_iface *hard_iface,
+				uint8_t *dst_addr);
+void schedule_own_packet(struct hard_iface *hard_iface);
+void schedule_forward_packet(struct orig_node *orig_node,
+			     struct ethhdr *ethhdr,
+			     struct batman_packet *batman_packet,
+			     uint8_t directlink, int tt_buff_len,
+			     struct hard_iface *if_incoming);
+int add_bcast_packet_to_list(struct bat_priv *bat_priv, struct sk_buff *skb);
+void send_outstanding_bat_packet(struct work_struct *work);
+void purge_outstanding_packets(struct bat_priv *bat_priv,
+			       struct hard_iface *hard_iface);
+
+#endif /* _NET_BATMAN_ADV_SEND_H_ */
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
new file mode 100644
index 00000000..d5aa6099
--- /dev/null
+++ b/net/batman-adv/soft-interface.c
@@ -0,0 +1,927 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "routing.h"
+#include "send.h"
+#include "bat_debugfs.h"
+#include "translation-table.h"
+#include "hash.h"
+#include "gateway_common.h"
+#include "gateway_client.h"
+#include "bat_sysfs.h"
+#include <linux/slab.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include "unicast.h"
+
+/* forward declarations of the ethtool callbacks used in bat_ethtool_ops */
+static int bat_get_settings(struct net_device *dev, struct ethtool_cmd *cmd);
+static void bat_get_drvinfo(struct net_device *dev,
+			    struct ethtool_drvinfo *info);
+static u32 bat_get_msglevel(struct net_device *dev);
+static void bat_set_msglevel(struct net_device *dev, u32 value);
+static u32 bat_get_link(struct net_device *dev);
+
+static const struct ethtool_ops bat_ethtool_ops = {
+	.get_settings = bat_get_settings,
+	.get_drvinfo = bat_get_drvinfo,
+	.get_msglevel = bat_get_msglevel,
+	.set_msglevel = bat_set_msglevel,
+	.get_link = bat_get_link,
+};
+
+/* skb_cow_head() + skb_push() for len bytes; 0 or the negative cow result */
+int my_skb_head_push(struct sk_buff *skb, unsigned int len)
+{
+	/**
+	 * TODO: We must check if we can release all references to non-payload
+	 * data using skb_header_release in our skbs to allow skb_cow_header to
+	 * work optimally. This means that those skbs are not allowed to read
+	 * or write any data which is before the current position of skb->data
+	 * after that call and thus allow other skbs with the same data buffer
+	 * to write freely in that area.
+	 */
+	int err = skb_cow_head(skb, len);
+
+	if (err >= 0) {
+		skb_push(skb, len);
+		err = 0;
+	}
+	return err;
+}
+/* put one reference; the final put frees the neighbor via kfree_rcu() */
+static void softif_neigh_free_ref(struct softif_neigh *softif_neigh)
+{
+	if (atomic_dec_and_test(&softif_neigh->refcount))
+		kfree_rcu(softif_neigh, rcu);
+}
+/* RCU callback: unlink and put every neighbor of a vid entry, then free it */
+static void softif_neigh_vid_free_rcu(struct rcu_head *rcu)
+{
+	struct softif_neigh_vid *softif_neigh_vid;
+	struct softif_neigh *softif_neigh;
+	struct hlist_node *node, *node_tmp;
+	struct bat_priv *bat_priv;
+
+	softif_neigh_vid = container_of(rcu, struct softif_neigh_vid, rcu);
+	bat_priv = softif_neigh_vid->bat_priv;
+
+	spin_lock_bh(&bat_priv->softif_neigh_lock);
+	hlist_for_each_entry_safe(softif_neigh, node, node_tmp,
+				  &softif_neigh_vid->softif_neigh_list, list) {
+		hlist_del_rcu(&softif_neigh->list);
+		softif_neigh_free_ref(softif_neigh); /* the list's reference */
+	}
+	spin_unlock_bh(&bat_priv->softif_neigh_lock);
+
+	kfree(softif_neigh_vid);
+}
+/* put one reference; the final put schedules softif_neigh_vid_free_rcu() */
+static void softif_neigh_vid_free_ref(struct softif_neigh_vid *softif_neigh_vid)
+{
+	if (atomic_dec_and_test(&softif_neigh_vid->refcount))
+		call_rcu(&softif_neigh_vid->rcu, softif_neigh_vid_free_rcu);
+}
+/* find or create the entry for vid; returned with a reference held or NULL */
+static struct softif_neigh_vid *softif_neigh_vid_get(struct bat_priv *bat_priv,
+						     short vid)
+{
+	struct softif_neigh_vid *softif_neigh_vid;
+	struct hlist_node *node;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(softif_neigh_vid, node,
+				 &bat_priv->softif_neigh_vids, list) {
+		if (softif_neigh_vid->vid != vid)
+			continue;
+
+		if (!atomic_inc_not_zero(&softif_neigh_vid->refcount))
+			continue;
+
+		goto out;
+	}
+	/* no usable entry found: allocate a new one and add it to the list */
+	softif_neigh_vid = kzalloc(sizeof(struct softif_neigh_vid),
+				   GFP_ATOMIC);
+	if (!softif_neigh_vid)
+		goto out;
+
+	softif_neigh_vid->vid = vid;
+	softif_neigh_vid->bat_priv = bat_priv;
+
+	/* refcount 2: one for the list and one the caller has to put */
+	atomic_set(&softif_neigh_vid->refcount, 2);
+	INIT_HLIST_HEAD(&softif_neigh_vid->softif_neigh_list);
+	INIT_HLIST_NODE(&softif_neigh_vid->list);
+	spin_lock_bh(&bat_priv->softif_neigh_vid_lock);
+	hlist_add_head_rcu(&softif_neigh_vid->list,
+			   &bat_priv->softif_neigh_vids);
+	spin_unlock_bh(&bat_priv->softif_neigh_vid_lock);
+
+out:
+	rcu_read_unlock();
+	return softif_neigh_vid;
+}
+/* find or create neighbor addr on vid; returned with a reference or NULL */
+static struct softif_neigh *softif_neigh_get(struct bat_priv *bat_priv,
+					     uint8_t *addr, short vid)
+{
+	struct softif_neigh_vid *softif_neigh_vid;
+	struct softif_neigh *softif_neigh = NULL;
+	struct hlist_node *node;
+
+	softif_neigh_vid = softif_neigh_vid_get(bat_priv, vid);
+	if (!softif_neigh_vid)
+		goto out;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(softif_neigh, node,
+				 &softif_neigh_vid->softif_neigh_list,
+				 list) {
+		if (!compare_eth(softif_neigh->addr, addr))
+			continue;
+
+		if (!atomic_inc_not_zero(&softif_neigh->refcount))
+			continue;
+
+		softif_neigh->last_seen = jiffies; /* known neighbor: refresh */
+		goto unlock;
+	}
+	/* unknown neighbor: allocate a new entry and add it to the vid list */
+	softif_neigh = kzalloc(sizeof(struct softif_neigh), GFP_ATOMIC);
+	if (!softif_neigh)
+		goto unlock;
+
+	memcpy(softif_neigh->addr, addr, ETH_ALEN);
+	softif_neigh->last_seen = jiffies;
+	/* refcount 2: one for the list and one the caller has to put */
+	atomic_set(&softif_neigh->refcount, 2);
+
+	INIT_HLIST_NODE(&softif_neigh->list);
+	spin_lock_bh(&bat_priv->softif_neigh_lock);
+	hlist_add_head_rcu(&softif_neigh->list,
+			   &softif_neigh_vid->softif_neigh_list);
+	spin_unlock_bh(&bat_priv->softif_neigh_lock);
+
+unlock:
+	rcu_read_unlock();
+out:
+	if (softif_neigh_vid)
+		softif_neigh_vid_free_ref(softif_neigh_vid);
+	return softif_neigh;
+}
+
+/* return the selected neighbor of this vid with a reference held, or NULL */
+static struct softif_neigh *softif_neigh_get_selected(
+				struct softif_neigh_vid *softif_neigh_vid)
+{
+	struct softif_neigh *selected;
+
+	rcu_read_lock();
+	selected = rcu_dereference(softif_neigh_vid->softif_neigh);
+	/* a refcount that already hit zero cannot be taken anymore */
+	if (selected && !atomic_inc_not_zero(&selected->refcount))
+		selected = NULL;
+	rcu_read_unlock();
+	return selected;
+}
+
+/* fetch the entry for vid via softif_neigh_vid_get() and return its selected
+ * neighbor with a reference held; NULL if there is none or the get failed */
+static struct softif_neigh *softif_neigh_vid_get_selected(
+						struct bat_priv *bat_priv,
+						short vid)
+{
+	struct softif_neigh_vid *vid_entry;
+	struct softif_neigh *selected;
+
+	vid_entry = softif_neigh_vid_get(bat_priv, vid);
+	if (!vid_entry)
+		return NULL;
+
+	selected = softif_neigh_get_selected(vid_entry);
+	softif_neigh_vid_free_ref(vid_entry);
+	return selected;
+}
+/* make new_neigh (may be NULL) the selected neighbor of the entry for vid */
+static void softif_neigh_vid_select(struct bat_priv *bat_priv,
+				    struct softif_neigh *new_neigh,
+				    short vid)
+{
+	struct softif_neigh_vid *softif_neigh_vid;
+	struct softif_neigh *curr_neigh;
+
+	softif_neigh_vid = softif_neigh_vid_get(bat_priv, vid);
+	if (!softif_neigh_vid)
+		goto out;
+
+	spin_lock_bh(&bat_priv->softif_neigh_lock);
+	/* the selection holds its own reference on new_neigh */
+	if (new_neigh && !atomic_inc_not_zero(&new_neigh->refcount))
+		new_neigh = NULL;
+
+	curr_neigh = softif_neigh_vid->softif_neigh;
+	rcu_assign_pointer(softif_neigh_vid->softif_neigh, new_neigh);
+
+	if ((curr_neigh) && (!new_neigh))
+		bat_dbg(DBG_ROUTES, bat_priv,
+			"Removing mesh exit point on vid: %d (prev: %pM).\n",
+			vid, curr_neigh->addr);
+	else if ((curr_neigh) && (new_neigh))
+		bat_dbg(DBG_ROUTES, bat_priv,
+			"Changing mesh exit point on vid: %d from %pM "
+			"to %pM.\n", vid, curr_neigh->addr, new_neigh->addr);
+	else if ((!curr_neigh) && (new_neigh))
+		bat_dbg(DBG_ROUTES, bat_priv,
+			"Setting mesh exit point on vid: %d to %pM.\n",
+			vid, new_neigh->addr);
+	/* put the reference that was held for the previous selection */
+	if (curr_neigh)
+		softif_neigh_free_ref(curr_neigh);
+
+	spin_unlock_bh(&bat_priv->softif_neigh_lock);
+
+out:
+	if (softif_neigh_vid)
+		softif_neigh_vid_free_ref(softif_neigh_vid);
+}
+/* replace the selected neighbor of a vid by another listed one, or by none */
+static void softif_neigh_vid_deselect(struct bat_priv *bat_priv,
+				      struct softif_neigh_vid *softif_neigh_vid)
+{
+	struct softif_neigh *curr_neigh;
+	struct softif_neigh *softif_neigh = NULL, *softif_neigh_tmp;
+	struct hard_iface *primary_if = NULL;
+	struct hlist_node *node;
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto out;
+
+	/* find new softif_neigh immediately to avoid temporary loops */
+	rcu_read_lock();
+	curr_neigh = rcu_dereference(softif_neigh_vid->softif_neigh);
+
+	hlist_for_each_entry_rcu(softif_neigh_tmp, node,
+				 &softif_neigh_vid->softif_neigh_list,
+				 list) {
+		if (softif_neigh_tmp == curr_neigh)
+			continue;
+
+		/* we got a neighbor but its mac is 'bigger' than ours  */
+		if (memcmp(primary_if->net_dev->dev_addr,
+			   softif_neigh_tmp->addr, ETH_ALEN) < 0)
+			continue;
+
+		if (!atomic_inc_not_zero(&softif_neigh_tmp->refcount))
+			continue;
+
+		softif_neigh = softif_neigh_tmp; /* first match is taken */
+		goto unlock;
+	}
+
+unlock:
+	rcu_read_unlock();
+out:
+	softif_neigh_vid_select(bat_priv, softif_neigh, softif_neigh_vid->vid);
+
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	if (softif_neigh)
+		softif_neigh_free_ref(softif_neigh);
+}
+/* seq_file show: print per vid all softif neighbors, "=>" marks the selected */
+int softif_neigh_seq_print_text(struct seq_file *seq, void *offset)
+{
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	struct softif_neigh_vid *softif_neigh_vid;
+	struct softif_neigh *softif_neigh, *curr_softif_neigh;
+	struct hard_iface *primary_if;
+	struct hlist_node *node, *node_tmp;
+	unsigned int age_msecs;
+	int ret = 0, last_seen_secs, last_seen_msecs;
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - "
+				 "please specify interfaces to enable it\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	if (primary_if->if_status != IF_ACTIVE) {
+		ret = seq_printf(seq, "BATMAN mesh %s "
+				 "disabled - primary interface not active\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	seq_printf(seq, "Softif neighbor list (%s)\n", net_dev->name);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(softif_neigh_vid, node,
+				 &bat_priv->softif_neigh_vids, list) {
+		seq_printf(seq, "     %-15s %s on vid: %d\n",
+			   "Originator", "last-seen", softif_neigh_vid->vid);
+
+		curr_softif_neigh = softif_neigh_get_selected(softif_neigh_vid);
+
+		hlist_for_each_entry_rcu(softif_neigh, node_tmp,
+					 &softif_neigh_vid->softif_neigh_list,
+					 list) {
+			/* one jiffies snapshot keeps secs and msecs coherent */
+			age_msecs = jiffies_to_msecs(jiffies -
+						     softif_neigh->last_seen);
+			last_seen_secs = age_msecs / 1000;
+			last_seen_msecs = age_msecs % 1000;
+			seq_printf(seq, "%s %pM  %3i.%03is\n",
+				   curr_softif_neigh == softif_neigh
+				   ? "=>" : "  ", softif_neigh->addr,
+				   last_seen_secs, last_seen_msecs);
+		}
+		if (curr_softif_neigh)
+			softif_neigh_free_ref(curr_softif_neigh);
+
+		seq_printf(seq, "\n");
+	}
+	rcu_read_unlock();
+
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	return ret;
+}
+/* purge timed out neighbors (all, if the mesh is not active) and empty vids */
+void softif_neigh_purge(struct bat_priv *bat_priv)
+{
+	struct softif_neigh *softif_neigh, *curr_softif_neigh;
+	struct softif_neigh_vid *softif_neigh_vid;
+	struct hlist_node *node, *node_tmp, *node_tmp2;
+	char do_deselect;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(softif_neigh_vid, node,
+				 &bat_priv->softif_neigh_vids, list) {
+		if (!atomic_inc_not_zero(&softif_neigh_vid->refcount))
+			continue;
+
+		curr_softif_neigh = softif_neigh_get_selected(softif_neigh_vid);
+		do_deselect = 0;
+
+		spin_lock_bh(&bat_priv->softif_neigh_lock);
+		hlist_for_each_entry_safe(softif_neigh, node_tmp, node_tmp2,
+					  &softif_neigh_vid->softif_neigh_list,
+					  list) {
+			if ((!time_after(jiffies, softif_neigh->last_seen +
+				msecs_to_jiffies(SOFTIF_NEIGH_TIMEOUT))) &&
+			    (atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE))
+				continue;
+
+			if (curr_softif_neigh == softif_neigh) {
+				bat_dbg(DBG_ROUTES, bat_priv,
+					"Current mesh exit point on vid: %d "
+					"'%pM' vanished.\n",
+					softif_neigh_vid->vid,
+					softif_neigh->addr);
+				do_deselect = 1;
+			}
+
+			hlist_del_rcu(&softif_neigh->list);
+			softif_neigh_free_ref(softif_neigh);
+		}
+		spin_unlock_bh(&bat_priv->softif_neigh_lock);
+
+		/* softif_neigh_vid_deselect() ends up taking the
+		 * softif_neigh_lock, so it is called after the unlock */
+		if (do_deselect)
+			softif_neigh_vid_deselect(bat_priv, softif_neigh_vid);
+
+		if (curr_softif_neigh)
+			softif_neigh_free_ref(curr_softif_neigh);
+
+		softif_neigh_vid_free_ref(softif_neigh_vid);
+	}
+	rcu_read_unlock();
+	/* second pass: drop vid entries that are left without neighbors */
+	spin_lock_bh(&bat_priv->softif_neigh_vid_lock);
+	hlist_for_each_entry_safe(softif_neigh_vid, node, node_tmp,
+				  &bat_priv->softif_neigh_vids, list) {
+		if (!hlist_empty(&softif_neigh_vid->softif_neigh_list))
+			continue;
+
+		hlist_del_rcu(&softif_neigh_vid->list);
+		softif_neigh_vid_free_ref(softif_neigh_vid);
+	}
+	spin_unlock_bh(&bat_priv->softif_neigh_vid_lock);
+
+}
+
+/*
+ * Evaluate a batman packet that arrived on the soft interface's transmit
+ * path (see interface_tx()) and update the selected soft-interface neighbor
+ * for @vid. The skb is always consumed.
+ *
+ * A neighbor is only considered when the packet is a BAT_PACKET of our
+ * COMPAT_VERSION carrying PRIMARIES_FIRST_HOP and not originated by us.
+ * It becomes the selected neighbor when its MAC does not compare greater
+ * than our primary interface's MAC and either nothing is selected yet or
+ * its MAC compares smaller than the currently selected one.
+ *
+ * NOTE(review): the packet fields are read without checking that the skb
+ * is long enough -- confirm callers guarantee this.
+ */
+static void softif_batman_recv(struct sk_buff *skb, struct net_device *dev,
+			       short vid)
+{
+	struct bat_priv *bat_priv = netdev_priv(dev);
+	struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
+	struct softif_neigh *neigh = NULL, *selected = NULL;
+	struct hard_iface *primary = NULL;
+	struct batman_packet *bat_pkt;
+	int hdr_len = ETH_HLEN;
+
+	/* the batman header follows the (optionally VLAN tagged) ethernet
+	 * header */
+	if (ntohs(ethhdr->h_proto) == ETH_P_8021Q)
+		hdr_len += VLAN_HLEN;
+
+	bat_pkt = (struct batman_packet *)(skb->data + hdr_len);
+
+	if (bat_pkt->version != COMPAT_VERSION ||
+	    bat_pkt->packet_type != BAT_PACKET ||
+	    !(bat_pkt->flags & PRIMARIES_FIRST_HOP))
+		goto out;
+
+	if (is_my_mac(bat_pkt->orig))
+		goto out;
+
+	neigh = softif_neigh_get(bat_priv, bat_pkt->orig, vid);
+	if (!neigh)
+		goto out;
+
+	/* nothing to do if this neighbor is already the selected one */
+	selected = softif_neigh_vid_get_selected(bat_priv, vid);
+	if (selected == neigh)
+		goto out;
+
+	primary = primary_if_get_selected(bat_priv);
+	if (!primary)
+		goto out;
+
+	/* we got a neighbor but its mac is 'bigger' than ours  */
+	if (memcmp(primary->net_dev->dev_addr, neigh->addr, ETH_ALEN) < 0)
+		goto out;
+
+	/* either close own batX device and use the neighbor as exit node
+	 * (nothing selected so far) or switch to the new 'smallest
+	 * neighbor' */
+	if (!selected ||
+	    memcmp(neigh->addr, selected->addr, ETH_ALEN) < 0)
+		softif_neigh_vid_select(bat_priv, neigh, vid);
+
+out:
+	kfree_skb(skb);
+	if (neigh)
+		softif_neigh_free_ref(neigh);
+	if (selected)
+		softif_neigh_free_ref(selected);
+	if (primary)
+		hardif_free_ref(primary);
+	return;
+}
+
+/* open callback of the soft interface: enable the transmit queue.
+ * Always returns 0. */
+static int interface_open(struct net_device *dev)
+{
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+/* stop callback of the soft interface: disable the transmit queue.
+ * Always returns 0. */
+static int interface_release(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+
+	return 0;
+}
+
+/* returns the statistics block stored in the soft interface's private
+ * data */
+static struct net_device_stats *interface_stats(struct net_device *dev)
+{
+	struct bat_priv *priv = netdev_priv(dev);
+
+	return &priv->stats;
+}
+
+/*
+ * Change the MAC address of the soft interface.
+ *
+ * @p points to a struct sockaddr holding the new address. Returns
+ * -EADDRNOTAVAIL if that address is not a valid ethernet address and 0
+ * otherwise. While the mesh is active the local translation table entry of
+ * the old address is replaced by one for the new address.
+ */
+static int interface_set_mac_addr(struct net_device *dev, void *p)
+{
+	struct sockaddr *new_addr = p;
+	struct bat_priv *bat_priv = netdev_priv(dev);
+	int mesh_active;
+
+	if (!is_valid_ether_addr(new_addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	/* only modify transtable if it has been initialised before */
+	mesh_active = atomic_read(&bat_priv->mesh_state) == MESH_ACTIVE;
+	if (mesh_active) {
+		tt_local_remove(bat_priv, dev->dev_addr,
+				 "mac address changed");
+		tt_local_add(dev, new_addr->sa_data);
+	}
+
+	memcpy(dev->dev_addr, new_addr->sa_data, ETH_ALEN);
+	return 0;
+}
+
+/*
+ * Set a new MTU on the soft interface.
+ *
+ * Returns -EINVAL when @new_mtu is below 68 or above the value reported by
+ * hardif_min_mtu() for this device, 0 on success.
+ */
+static int interface_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* lower bound */
+	if (new_mtu < 68)
+		return -EINVAL;
+
+	/* upper bound */
+	if (new_mtu > hardif_min_mtu(dev))
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/*
+ * Transmit handler of the soft interface.
+ *
+ * Batman packets (plain or VLAN encapsulated) are passed to
+ * softif_batman_recv(). All other frames are dropped when the mesh is not
+ * active or when another soft-interface neighbor is selected for the VLAN.
+ * Otherwise the source MAC is added to the local translation table and the
+ * frame is sent either as a batman broadcast or via unicast_send_skb().
+ *
+ * The skb is consumed on every path and NETDEV_TX_OK is always returned.
+ *
+ * NOTE(review): the VLAN header is read without a prior length check --
+ * confirm the stack guarantees it is present in linear data.
+ */
+int interface_tx(struct sk_buff *skb, struct net_device *soft_iface)
+{
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+	struct ethhdr *eth = (struct ethhdr *)skb->data;
+	struct softif_neigh *exit_neigh = NULL;
+	struct hard_iface *primary = NULL;
+	struct bcast_packet *bcast;
+	struct vlan_ethhdr *vlan;
+	bool broadcast = false;
+	/* remember the payload size before any header gets pushed */
+	int payload_len = skb->len;
+	short vid = -1;
+	int rc;
+
+	if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
+		goto dropped;
+
+	soft_iface->trans_start = jiffies;
+
+	switch (ntohs(eth->h_proto)) {
+	case ETH_P_8021Q:
+		vlan = (struct vlan_ethhdr *)skb->data;
+		vid = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK;
+
+		if (ntohs(vlan->h_vlan_encapsulated_proto) != ETH_P_BATMAN)
+			break;
+
+		/* fall through */
+	case ETH_P_BATMAN:
+		/* softif_batman_recv() consumes the skb */
+		softif_batman_recv(skb, soft_iface, vid);
+		goto end;
+	}
+
+	/*
+	 * if we have a another chosen mesh exit node in range
+	 * it will transport the packets to the mesh
+	 */
+	exit_neigh = softif_neigh_vid_get_selected(bat_priv, vid);
+	if (exit_neigh)
+		goto dropped;
+
+	/* TODO: check this for locks */
+	tt_local_add(soft_iface, eth->h_source);
+
+	if (is_multicast_ether_addr(eth->h_dest)) {
+		rc = gw_is_target(bat_priv, skb);
+		if (rc < 0)
+			goto dropped;
+
+		if (rc == 0)
+			broadcast = true;
+	}
+
+	if (!broadcast) {
+		/* unicast packet; the skb is not freed here on failure */
+		rc = unicast_send_skb(skb, bat_priv);
+		if (rc != 0)
+			goto dropped_freed;
+	} else {
+		/* ethernet packet should be broadcasted */
+		primary = primary_if_get_selected(bat_priv);
+		if (!primary)
+			goto dropped;
+
+		if (my_skb_head_push(skb, sizeof(struct bcast_packet)) < 0)
+			goto dropped;
+
+		bcast = (struct bcast_packet *)skb->data;
+		bcast->version = COMPAT_VERSION;
+		bcast->ttl = TTL;
+
+		/* batman packet type: broadcast */
+		bcast->packet_type = BAT_BCAST;
+
+		/* hw address of first interface is the orig mac because only
+		 * this mac is known throughout the mesh */
+		memcpy(bcast->orig, primary->net_dev->dev_addr, ETH_ALEN);
+
+		/* set broadcast sequence number */
+		bcast->seqno =
+			htonl(atomic_inc_return(&bat_priv->bcast_seqno));
+
+		add_bcast_packet_to_list(bat_priv, skb);
+
+		/* a copy is stored in the bcast list, therefore removing
+		 * the original skb. */
+		kfree_skb(skb);
+	}
+
+	bat_priv->stats.tx_bytes += payload_len;
+	bat_priv->stats.tx_packets++;
+	goto end;
+
+dropped:
+	kfree_skb(skb);
+dropped_freed:
+	bat_priv->stats.tx_dropped++;
+end:
+	if (exit_neigh)
+		softif_neigh_free_ref(exit_neigh);
+	if (primary)
+		hardif_free_ref(primary);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Deliver a payload frame received from the mesh to the soft interface.
+ *
+ * @soft_iface: soft interface the frame is delivered to
+ * @skb: received packet, starting with a batman header of @hdr_size bytes
+ * @recv_if: hard interface the packet was received on
+ * @hdr_size: size of the batman header in front of the ethernet frame
+ *
+ * The batman header is pulled and the encapsulated ethernet frame is
+ * inspected. Encapsulated batman frames and frames too short to carry the
+ * headers that are inspected are dropped. If another soft-interface
+ * neighbor is selected for the VLAN, unicast packets are re-addressed to
+ * that neighbor and handed to route_unicast_packet(); otherwise the frame
+ * is passed to the network stack via netif_rx(). The skb is consumed.
+ */
+void interface_rx(struct net_device *soft_iface,
+		  struct sk_buff *skb, struct hard_iface *recv_if,
+		  int hdr_size)
+{
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+	struct unicast_packet *unicast_packet;
+	struct ethhdr *ethhdr;
+	struct vlan_ethhdr *vhdr;
+	struct softif_neigh *curr_softif_neigh = NULL;
+	short vid = -1;
+	int ret;
+
+	/* check if enough space is available for pulling, and pull */
+	if (!pskb_may_pull(skb, hdr_size))
+		goto dropped;
+
+	skb_pull_rcsum(skb, hdr_size);
+	skb_reset_mac_header(skb);
+
+	/* the ethernet header is read below, so it has to be available in
+	 * the linear data area before it is touched */
+	if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
+		goto dropped;
+
+	ethhdr = (struct ethhdr *)skb_mac_header(skb);
+
+	switch (ntohs(ethhdr->h_proto)) {
+	case ETH_P_8021Q:
+		/* same for the VLAN header; the pull may relocate the
+		 * data, hence vhdr is fetched afterwards and ethhdr is not
+		 * used anymore */
+		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+			goto dropped;
+
+		vhdr = (struct vlan_ethhdr *)skb->data;
+		vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK;
+
+		if (ntohs(vhdr->h_vlan_encapsulated_proto) != ETH_P_BATMAN)
+			break;
+
+		/* fall through */
+	case ETH_P_BATMAN:
+		goto dropped;
+	}
+
+	/*
+	 * if we have a another chosen mesh exit node in range
+	 * it will transport the packets to the non-mesh network
+	 */
+	curr_softif_neigh = softif_neigh_vid_get_selected(bat_priv, vid);
+	if (curr_softif_neigh) {
+		/* restore the batman header that was pulled above */
+		skb_push(skb, hdr_size);
+		unicast_packet = (struct unicast_packet *)skb->data;
+
+		if ((unicast_packet->packet_type != BAT_UNICAST) &&
+		    (unicast_packet->packet_type != BAT_UNICAST_FRAG))
+			goto dropped;
+
+		skb_reset_mac_header(skb);
+
+		memcpy(unicast_packet->dest,
+		       curr_softif_neigh->addr, ETH_ALEN);
+		ret = route_unicast_packet(skb, recv_if);
+		if (ret == NET_RX_DROP)
+			goto dropped;
+
+		goto out;
+	}
+
+	/* skb->dev & skb->pkt_type are set here; the presence of the
+	 * ethernet header was already verified above */
+	skb->protocol = eth_type_trans(skb, soft_iface);
+
+	/* should not be necessary anymore as we use skb_pull_rcsum()
+	 * TODO: please verify this and remove this TODO
+	 * -- Dec 21st 2009, Simon Wunderlich */
+
+/*	skb->ip_summed = CHECKSUM_UNNECESSARY;*/
+
+	bat_priv->stats.rx_packets++;
+	bat_priv->stats.rx_bytes += skb->len + sizeof(struct ethhdr);
+
+	soft_iface->last_rx = jiffies;
+
+	netif_rx(skb);
+	goto out;
+
+dropped:
+	kfree_skb(skb);
+out:
+	if (curr_softif_neigh)
+		softif_neigh_free_ref(curr_softif_neigh);
+	return;
+}
+
+#ifdef HAVE_NET_DEVICE_OPS
+/* net_device callbacks of the soft interface; only used when the kernel
+ * provides struct net_device_ops (see interface_setup() for the fallback) */
+static const struct net_device_ops bat_netdev_ops = {
+	.ndo_open		= interface_open,
+	.ndo_stop		= interface_release,
+	.ndo_start_xmit		= interface_tx,
+	.ndo_get_stats		= interface_stats,
+	.ndo_set_mac_address	= interface_set_mac_addr,
+	.ndo_change_mtu		= interface_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr
+};
+#endif
+
+/*
+ * Setup callback passed to alloc_netdev() by softif_create().
+ *
+ * Initialises @dev as an ethernet device, installs the soft interface
+ * callbacks and ethtool operations, assigns a random MAC address and
+ * zeroes the private struct bat_priv area.
+ */
+static void interface_setup(struct net_device *dev)
+{
+	char random_mac[ETH_ALEN];
+	struct bat_priv *bat_priv = netdev_priv(dev);
+
+	ether_setup(dev);
+
+#ifdef HAVE_NET_DEVICE_OPS
+	dev->netdev_ops = &bat_netdev_ops;
+#else
+	dev->open = interface_open;
+	dev->stop = interface_release;
+	dev->get_stats = interface_stats;
+	dev->set_mac_address = interface_set_mac_addr;
+	dev->change_mtu = interface_change_mtu;
+	dev->hard_start_xmit = interface_tx;
+#endif
+	/* the device structure is released through free_netdev() */
+	dev->destructor = free_netdev;
+	dev->tx_queue_len = 0;
+
+	/*
+	 * can't call min_mtu, because the needed variables
+	 * have not been initialized yet
+	 */
+	dev->mtu = ETH_DATA_LEN;
+
+	/* reserve more space in the skbuff for our header */
+	dev->hard_header_len = BAT_HEADER_LEN;
+
+	/* generate random address */
+	random_ether_addr(random_mac);
+	memcpy(dev->dev_addr, random_mac, ETH_ALEN);
+
+	SET_ETHTOOL_OPS(dev, &bat_ethtool_ops);
+
+	memset(bat_priv, 0, sizeof(struct bat_priv));
+}
+
+/*
+ * Allocate, register and initialise a new batman soft interface.
+ *
+ * @name: name of the network device to create
+ *
+ * The device is registered with register_netdevice(), so the caller has to
+ * hold the RTNL lock. After registration the mesh defaults are set and the
+ * sysfs entries, debugfs entries and mesh state are created.
+ *
+ * Returns the new net_device, or NULL if any step failed; in that case
+ * everything set up so far has been torn down again.
+ */
+struct net_device *softif_create(char *name)
+{
+	struct net_device *soft_iface;
+	struct bat_priv *bat_priv;
+	int ret;
+
+	soft_iface = alloc_netdev(sizeof(struct bat_priv) , name,
+				   interface_setup);
+
+	if (!soft_iface) {
+		pr_err("Unable to allocate the batman interface: %s\n", name);
+		goto out;
+	}
+
+	ret = register_netdevice(soft_iface);
+	if (ret < 0) {
+		pr_err("Unable to register the batman interface '%s': %i\n",
+		       name, ret);
+		goto free_soft_iface;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	/* default settings of a freshly created mesh interface */
+	atomic_set(&bat_priv->aggregated_ogms, 1);
+	atomic_set(&bat_priv->bonding, 0);
+	atomic_set(&bat_priv->vis_mode, VIS_TYPE_CLIENT_UPDATE);
+	atomic_set(&bat_priv->gw_mode, GW_MODE_OFF);
+	atomic_set(&bat_priv->gw_sel_class, 20);
+	atomic_set(&bat_priv->gw_bandwidth, 41);
+	atomic_set(&bat_priv->orig_interval, 1000);
+	atomic_set(&bat_priv->hop_penalty, 10);
+	atomic_set(&bat_priv->log_level, 0);
+	atomic_set(&bat_priv->fragmentation, 1);
+	atomic_set(&bat_priv->bcast_queue_left, BCAST_QUEUE_LEN);
+	atomic_set(&bat_priv->batman_queue_left, BATMAN_QUEUE_LEN);
+
+	atomic_set(&bat_priv->mesh_state, MESH_INACTIVE);
+	atomic_set(&bat_priv->bcast_seqno, 1);
+	atomic_set(&bat_priv->tt_local_changed, 0);
+
+	bat_priv->primary_if = NULL;
+	bat_priv->num_ifaces = 0;
+
+	ret = sysfs_add_meshif(soft_iface);
+	if (ret < 0)
+		goto unreg_soft_iface;
+
+	ret = debugfs_add_meshif(soft_iface);
+	if (ret < 0)
+		goto unreg_sysfs;
+
+	ret = mesh_init(soft_iface);
+	if (ret < 0)
+		goto unreg_debugfs;
+
+	return soft_iface;
+
+unreg_debugfs:
+	debugfs_del_meshif(soft_iface);
+unreg_sysfs:
+	sysfs_del_meshif(soft_iface);
+unreg_soft_iface:
+	/* the RTNL lock is already held (register_netdevice() above requires
+	 * it), therefore the variant that does not take the lock itself has
+	 * to be used - unregister_netdev() would deadlock here */
+	unregister_netdevice(soft_iface);
+	return NULL;
+
+free_soft_iface:
+	/* never registered, so the device has to be freed by hand */
+	free_netdev(soft_iface);
+out:
+	return NULL;
+}
+
+/*
+ * Tear down a soft interface: remove its debugfs and sysfs entries, free
+ * the mesh state and unregister the device. unregister_netdevice() is
+ * used, so the caller is expected to hold the RTNL lock.
+ */
+void softif_destroy(struct net_device *soft_iface)
+{
+	debugfs_del_meshif(soft_iface);
+	sysfs_del_meshif(soft_iface);
+
+	mesh_free(soft_iface);
+
+	unregister_netdevice(soft_iface);
+}
+
+/* returns 1 if @net_dev is a batman soft interface, i.e. its transmit
+ * callback is interface_tx(), and 0 otherwise */
+int softif_is_valid(struct net_device *net_dev)
+{
+#ifdef HAVE_NET_DEVICE_OPS
+	return net_dev->netdev_ops->ndo_start_xmit == interface_tx;
+#else
+	return net_dev->hard_start_xmit == interface_tx;
+#endif
+}
+
+/* ethtool */
+/* ethtool get_settings: report fixed link settings for the virtual
+ * interface (10 Mbit/s, full duplex, no autonegotiation). Always returns
+ * 0. */
+static int bat_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	/* link modes */
+	cmd->supported = 0;
+	cmd->advertising = 0;
+	cmd->autoneg = AUTONEG_DISABLE;
+
+	/* speed and duplex */
+	ethtool_cmd_speed_set(cmd, SPEED_10);
+	cmd->duplex = DUPLEX_FULL;
+
+	/* port description */
+	cmd->port = PORT_TP;
+	cmd->phy_address = 0;
+	cmd->transceiver = XCVR_INTERNAL;
+
+	/* interrupt coalescing is not applicable */
+	cmd->maxtxpkt = 0;
+	cmd->maxrxpkt = 0;
+
+	return 0;
+}
+
+/*
+ * ethtool get_drvinfo: fill in driver name, version, firmware version and
+ * bus info. Every copy is bounded by the size of the destination field so
+ * that an overlong string (e.g. a long SOURCE_VERSION) is truncated rather
+ * than written past the end of the field.
+ */
+static void bat_get_drvinfo(struct net_device *dev,
+			    struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
+	strlcpy(info->version, SOURCE_VERSION, sizeof(info->version));
+	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+	strlcpy(info->bus_info, "batman", sizeof(info->bus_info));
+}
+
+/* ethtool get_msglevel: not supported.
+ * NOTE(review): the return type is u32, so -EOPNOTSUPP reaches the caller
+ * as a large positive message level rather than as an error code. */
+static u32 bat_get_msglevel(struct net_device *dev)
+{
+	return -EOPNOTSUPP;
+}
+
+/* ethtool set_msglevel: intentionally a no-op, @value is ignored */
+static void bat_set_msglevel(struct net_device *dev, u32 value)
+{
+}
+
+/* ethtool get_link: the virtual interface always reports link up (1) */
+static u32 bat_get_link(struct net_device *dev)
+{
+	return 1;
+}
+
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
new file mode 100644
index 00000000..4789b6f2
--- /dev/null
+++ b/net/batman-adv/soft-interface.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_
+#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_
+
+/* skb helper; its definition is not part of this header */
+int my_skb_head_push(struct sk_buff *skb, unsigned int len);
+
+/* soft-interface neighbor table: debug output and purging of old entries */
+int softif_neigh_seq_print_text(struct seq_file *seq, void *offset);
+void softif_neigh_purge(struct bat_priv *bat_priv);
+
+/* data path of the soft interface */
+int interface_tx(struct sk_buff *skb, struct net_device *soft_iface);
+void interface_rx(struct net_device *soft_iface, struct sk_buff *skb,
+		  struct hard_iface *recv_if, int hdr_size);
+
+/* creation, removal and identification of soft interfaces */
+struct net_device *softif_create(char *name);
+void softif_destroy(struct net_device *soft_iface);
+int softif_is_valid(struct net_device *net_dev);
+
+#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
new file mode 100644
index 00000000..7b729660
--- /dev/null
+++ b/net/batman-adv/translation-table.c
@@ -0,0 +1,638 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "translation-table.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "originator.h"
+
+/* forward declarations of helpers that are used before their definition */
+static void tt_local_purge(struct work_struct *work);
+static void _tt_global_del_orig(struct bat_priv *bat_priv,
+				 struct tt_global_entry *tt_global_entry,
+				 char *message);
+
+/* hash compare callback for local tt entries: returns 1 if the first
+ * ETH_ALEN bytes of the entry containing @node equal the mac address at
+ * @data2, 0 otherwise (presumably the address is the first member of
+ * struct tt_local_entry - verify against types.h) */
+static int compare_ltt(struct hlist_node *node, void *data2)
+{
+	struct tt_local_entry *entry;
+
+	entry = container_of(node, struct tt_local_entry, hash_entry);
+
+	return memcmp(entry, data2, ETH_ALEN) == 0;
+}
+
+/* hash compare callback for global tt entries: returns 1 if the first
+ * ETH_ALEN bytes of the entry containing @node equal the mac address at
+ * @data2, 0 otherwise (presumably the address is the first member of
+ * struct tt_global_entry - verify against types.h) */
+static int compare_gtt(struct hlist_node *node, void *data2)
+{
+	struct tt_global_entry *entry;
+
+	entry = container_of(node, struct tt_global_entry, hash_entry);
+
+	return memcmp(entry, data2, ETH_ALEN) == 0;
+}
+
+/* (re)arm the delayed work that runs tt_local_purge() in 10 seconds */
+static void tt_local_start_timer(struct bat_priv *bat_priv)
+{
+	struct delayed_work *purge_work = &bat_priv->tt_work;
+
+	INIT_DELAYED_WORK(purge_work, tt_local_purge);
+	queue_delayed_work(bat_event_workqueue, purge_work, 10 * HZ);
+}
+
+/*
+ * Look up the local tt entry for the mac address at @data.
+ *
+ * Returns the entry or NULL if there is no local hash or no match. No
+ * reference is taken; the callers in this file hold tt_lhash_lock around
+ * the lookup.
+ */
+static struct tt_local_entry *tt_local_hash_find(struct bat_priv *bat_priv,
+						   void *data)
+{
+	struct hashtable_t *hash = bat_priv->tt_local_hash;
+	struct tt_local_entry *entry, *found = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+
+	if (!hash)
+		return NULL;
+
+	bucket = &hash->table[choose_orig(data, hash->size)];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(entry, node, bucket, hash_entry) {
+		if (compare_eth(entry, data)) {
+			found = entry;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Look up the global tt entry for the mac address at @data.
+ *
+ * Returns the entry or NULL if there is no global hash or no match. No
+ * reference is taken; the callers in this file hold tt_ghash_lock around
+ * the lookup.
+ */
+static struct tt_global_entry *tt_global_hash_find(struct bat_priv *bat_priv,
+						     void *data)
+{
+	struct hashtable_t *hash = bat_priv->tt_global_hash;
+	struct tt_global_entry *entry, *found = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+
+	if (!hash)
+		return NULL;
+
+	bucket = &hash->table[choose_orig(data, hash->size)];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(entry, node, bucket, hash_entry) {
+		if (compare_eth(entry, data)) {
+			found = entry;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Create the local translation table hash and start the purge timer.
+ *
+ * Returns 1 on success or if the hash already exists, 0 if the hash could
+ * not be allocated.
+ */
+int tt_local_init(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash;
+
+	/* already initialised */
+	if (bat_priv->tt_local_hash)
+		return 1;
+
+	hash = hash_new(1024);
+	bat_priv->tt_local_hash = hash;
+	if (!hash)
+		return 0;
+
+	atomic_set(&bat_priv->tt_local_changed, 0);
+	tt_local_start_timer(bat_priv);
+
+	return 1;
+}
+
+/*
+ * Announce @addr as a locally reachable client of @soft_iface.
+ *
+ * If the address is already in the local translation table only its
+ * last_seen timestamp is refreshed. Otherwise a new entry is created,
+ * unless the table would no longer fit into a batman packet, and a
+ * global entry for the same address is removed. The entry for the soft
+ * interface's own mac address is marked as never to be purged.
+ *
+ * Allocation failures are silently ignored (the address is not added).
+ */
+void tt_local_add(struct net_device *soft_iface, uint8_t *addr)
+{
+	struct bat_priv *bat_priv = netdev_priv(soft_iface);
+	struct tt_local_entry *tt_local_entry;
+	struct tt_global_entry *tt_global_entry;
+	int required_bytes;
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+	tt_local_entry = tt_local_hash_find(bat_priv, addr);
+
+	/* refresh the timestamp while the lock is still held: entries are
+	 * freed under tt_lhash_lock (purge/remove), so touching the entry
+	 * after the unlock could hit freed memory */
+	if (tt_local_entry)
+		tt_local_entry->last_seen = jiffies;
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+
+	/* only the pointer value is tested here, it is not dereferenced */
+	if (tt_local_entry)
+		return;
+
+	/* only announce as many hosts as possible in the batman-packet and
+	   space in batman_packet->num_tt That also should give a limit to
+	   MAC-flooding. */
+	required_bytes = (bat_priv->num_local_tt + 1) * ETH_ALEN;
+	required_bytes += BAT_PACKET_LEN;
+
+	if ((required_bytes > ETH_DATA_LEN) ||
+	    (atomic_read(&bat_priv->aggregated_ogms) &&
+	     required_bytes > MAX_AGGREGATION_BYTES) ||
+	    (bat_priv->num_local_tt + 1 > 255)) {
+		bat_dbg(DBG_ROUTES, bat_priv,
+			"Can't add new local tt entry (%pM): "
+			"number of local tt entries exceeds packet size\n",
+			addr);
+		return;
+	}
+
+	bat_dbg(DBG_ROUTES, bat_priv,
+		"Creating new local tt entry: %pM\n", addr);
+
+	tt_local_entry = kmalloc(sizeof(struct tt_local_entry), GFP_ATOMIC);
+	if (!tt_local_entry)
+		return;
+
+	memcpy(tt_local_entry->addr, addr, ETH_ALEN);
+	tt_local_entry->last_seen = jiffies;
+
+	/* the batman interface mac address should never be purged */
+	if (compare_eth(addr, soft_iface->dev_addr))
+		tt_local_entry->never_purge = 1;
+	else
+		tt_local_entry->never_purge = 0;
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+
+	hash_add(bat_priv->tt_local_hash, compare_ltt, choose_orig,
+		 tt_local_entry, &tt_local_entry->hash_entry);
+	bat_priv->num_local_tt++;
+	atomic_set(&bat_priv->tt_local_changed, 1);
+
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+
+	/* remove address from global hash if present */
+	spin_lock_bh(&bat_priv->tt_ghash_lock);
+
+	tt_global_entry = tt_global_hash_find(bat_priv, addr);
+
+	if (tt_global_entry)
+		_tt_global_del_orig(bat_priv, tt_global_entry,
+				     "local tt received");
+
+	spin_unlock_bh(&bat_priv->tt_ghash_lock);
+}
+
+/*
+ * Copy the mac addresses of the local translation table into @buff.
+ *
+ * At most @buff_len / ETH_ALEN addresses are written. If all local
+ * entries fitted, the tt_local_changed flag is cleared. Returns the
+ * number of addresses copied.
+ */
+int tt_local_fill_buffer(struct bat_priv *bat_priv,
+			  unsigned char *buff, int buff_len)
+{
+	struct hashtable_t *hash = bat_priv->tt_local_hash;
+	struct tt_local_entry *entry;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+	int copied = 0;
+	int i;
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+
+	for (i = 0; i < hash->size; i++) {
+		bucket = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(entry, node, bucket, hash_entry) {
+			/* stop with this bucket once the buffer is full */
+			if ((copied + 1) * ETH_ALEN > buff_len)
+				break;
+
+			memcpy(buff + (copied * ETH_ALEN), entry->addr,
+			       ETH_ALEN);
+			copied++;
+		}
+		rcu_read_unlock();
+	}
+
+	/* if we did not get all new local tts see you next time  ;-) */
+	if (copied == bat_priv->num_local_tt)
+		atomic_set(&bat_priv->tt_local_changed, 0);
+
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+	return copied;
+}
+
+/*
+ * seq_file show callback listing the local translation table.
+ *
+ * The table is rendered into a temporary buffer while tt_lhash_lock is
+ * held and written to @seq after the lock has been dropped. If no active
+ * primary interface exists only a notice is printed. Returns 0, the
+ * result of seq_printf() for the notices, or -ENOMEM if the temporary
+ * buffer could not be allocated.
+ */
+int tt_local_seq_print_text(struct seq_file *seq, void *offset)
+{
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	struct hashtable_t *hash = bat_priv->tt_local_hash;
+	struct hard_iface *primary;
+	struct tt_local_entry *entry;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+	size_t alloc_len, written;
+	char *text;
+	int ret = 0;
+	int i;
+
+	primary = primary_if_get_selected(bat_priv);
+	if (!primary) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - "
+				 "please specify interfaces to enable it\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	if (primary->if_status != IF_ACTIVE) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - "
+				 "primary interface not active\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	seq_printf(seq, "Locally retrieved addresses (from %s) "
+		   "announced via TT:\n",
+		   net_dev->name);
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+
+	/* Estimate length for: " * xx:xx:xx:xx:xx:xx\n", i.e. 21 characters
+	 * per entry plus one byte for the terminating '\0' */
+	alloc_len = 1;
+	for (i = 0; i < hash->size; i++) {
+		bucket = &hash->table[i];
+
+		rcu_read_lock();
+		__hlist_for_each_rcu(node, bucket)
+			alloc_len += 21;
+		rcu_read_unlock();
+	}
+
+	text = kmalloc(alloc_len, GFP_ATOMIC);
+	if (!text) {
+		spin_unlock_bh(&bat_priv->tt_lhash_lock);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	written = 0;
+	text[0] = '\0';
+
+	for (i = 0; i < hash->size; i++) {
+		bucket = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(entry, node, bucket, hash_entry) {
+			written += snprintf(text + written, 22, " * %pM\n",
+					    entry->addr);
+		}
+		rcu_read_unlock();
+	}
+
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+
+	seq_printf(seq, "%s", text);
+	kfree(text);
+out:
+	if (primary)
+		hardif_free_ref(primary);
+	return ret;
+}
+
+/* free the local tt entry containing @node and account for its removal;
+ * @arg is the struct bat_priv the entry belongs to. Also used as the
+ * per-entry callback of hash_delete() in tt_local_free(). */
+static void _tt_local_del(struct hlist_node *node, void *arg)
+{
+	struct bat_priv *bat_priv = arg;
+	struct tt_local_entry *entry;
+
+	entry = container_of(node, struct tt_local_entry, hash_entry);
+	kfree(entry);
+
+	bat_priv->num_local_tt--;
+	atomic_set(&bat_priv->tt_local_changed, 1);
+}
+
+/* remove @tt_local_entry from the local hash and free it; @message is
+ * only used for the debug log. The callers in this file hold
+ * tt_lhash_lock. */
+static void tt_local_del(struct bat_priv *bat_priv,
+			  struct tt_local_entry *tt_local_entry,
+			  char *message)
+{
+	struct hashtable_t *hash = bat_priv->tt_local_hash;
+
+	bat_dbg(DBG_ROUTES, bat_priv, "Deleting local tt entry (%pM): %s\n",
+		tt_local_entry->addr, message);
+
+	hash_remove(hash, compare_ltt, choose_orig, tt_local_entry->addr);
+	_tt_local_del(&tt_local_entry->hash_entry, bat_priv);
+}
+
+/* delete the local tt entry of @addr, if there is one; @message is passed
+ * on to the debug log */
+void tt_local_remove(struct bat_priv *bat_priv,
+		      uint8_t *addr, char *message)
+{
+	struct tt_local_entry *entry;
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+
+	entry = tt_local_hash_find(bat_priv, addr);
+	if (entry)
+		tt_local_del(bat_priv, entry, message);
+
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+}
+
+/*
+ * Delayed work handler: delete every local tt entry that is not marked
+ * never_purge and was last seen at least TT_LOCAL_TIMEOUT seconds ago,
+ * then re-arm the timer.
+ */
+static void tt_local_purge(struct work_struct *work)
+{
+	struct delayed_work *dwork =
+		container_of(work, struct delayed_work, work);
+	struct bat_priv *bat_priv =
+		container_of(dwork, struct bat_priv, tt_work);
+	struct hashtable_t *hash = bat_priv->tt_local_hash;
+	struct tt_local_entry *entry;
+	struct hlist_node *pos, *next;
+	struct hlist_head *bucket;
+	unsigned long expiry;
+	int i;
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+
+	for (i = 0; i < hash->size; i++) {
+		bucket = &hash->table[i];
+
+		/* entries are deleted while iterating: use the safe walk */
+		hlist_for_each_entry_safe(entry, pos, next,
+					  bucket, hash_entry) {
+			if (entry->never_purge)
+				continue;
+
+			expiry = entry->last_seen;
+			expiry += TT_LOCAL_TIMEOUT * HZ;
+
+			if (!time_before(jiffies, expiry))
+				tt_local_del(bat_priv, entry,
+					      "address timed out");
+		}
+	}
+
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+	tt_local_start_timer(bat_priv);
+}
+
+/* stop the purge timer and free the local translation table, if it
+ * exists */
+void tt_local_free(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash = bat_priv->tt_local_hash;
+
+	if (!hash)
+		return;
+
+	/* make sure the purge work is neither queued nor running anymore */
+	cancel_delayed_work_sync(&bat_priv->tt_work);
+
+	hash_delete(hash, _tt_local_del, bat_priv);
+	bat_priv->tt_local_hash = NULL;
+}
+
+/* create the global translation table hash; returns 1 on success or if
+ * the hash already exists, 0 if it could not be allocated */
+int tt_global_init(struct bat_priv *bat_priv)
+{
+	if (!bat_priv->tt_global_hash)
+		bat_priv->tt_global_hash = hash_new(1024);
+
+	return bat_priv->tt_global_hash ? 1 : 0;
+}
+
+/*
+ * Register the addresses announced by @orig_node in the global table.
+ *
+ * @tt_buff holds @tt_buff_len bytes of consecutive mac addresses. Each
+ * address gets a global tt entry pointing at @orig_node (an existing
+ * entry is re-pointed) and a local entry for the same address is deleted.
+ * Processing stops early if an entry cannot be allocated.
+ *
+ * Finally a private copy of @tt_buff is stored in @orig_node; if that
+ * allocation fails the originator is left without a buffer. The previous
+ * tt_buff pointer is overwritten without being freed here.
+ */
+void tt_global_add_orig(struct bat_priv *bat_priv,
+			 struct orig_node *orig_node,
+			 unsigned char *tt_buff, int tt_buff_len)
+{
+	struct tt_global_entry *global;
+	struct tt_local_entry *local;
+	unsigned char *mac;
+	int idx;
+
+	for (idx = 0; (idx + 1) * ETH_ALEN <= tt_buff_len; idx++) {
+		mac = tt_buff + (idx * ETH_ALEN);
+
+		spin_lock_bh(&bat_priv->tt_ghash_lock);
+		global = tt_global_hash_find(bat_priv, mac);
+
+		if (!global) {
+			/* the lock is dropped for the allocation and the
+			 * debug output */
+			spin_unlock_bh(&bat_priv->tt_ghash_lock);
+
+			global = kmalloc(sizeof(struct tt_global_entry),
+					 GFP_ATOMIC);
+			if (!global)
+				break;
+
+			memcpy(global->addr, mac, ETH_ALEN);
+
+			bat_dbg(DBG_ROUTES, bat_priv,
+				"Creating new global tt entry: "
+				"%pM (via %pM)\n",
+				global->addr, orig_node->orig);
+
+			spin_lock_bh(&bat_priv->tt_ghash_lock);
+			hash_add(bat_priv->tt_global_hash, compare_gtt,
+				 choose_orig, global, &global->hash_entry);
+		}
+
+		global->orig_node = orig_node;
+		spin_unlock_bh(&bat_priv->tt_ghash_lock);
+
+		/* remove address from local hash if present */
+		spin_lock_bh(&bat_priv->tt_lhash_lock);
+
+		local = tt_local_hash_find(bat_priv, mac);
+		if (local)
+			tt_local_del(bat_priv, local, "global tt received");
+
+		spin_unlock_bh(&bat_priv->tt_lhash_lock);
+	}
+
+	/* initialize, and overwrite if malloc succeeds */
+	orig_node->tt_buff_len = 0;
+	orig_node->tt_buff = NULL;
+
+	if (tt_buff_len <= 0)
+		return;
+
+	orig_node->tt_buff = kmalloc(tt_buff_len, GFP_ATOMIC);
+	if (!orig_node->tt_buff)
+		return;
+
+	memcpy(orig_node->tt_buff, tt_buff, tt_buff_len);
+	orig_node->tt_buff_len = tt_buff_len;
+}
+
+/*
+ * seq_file show callback listing the global translation table.
+ *
+ * The table is rendered into a temporary buffer while tt_ghash_lock is
+ * held and written to @seq after the lock has been dropped. If no active
+ * primary interface exists only a notice is printed. Returns 0, the
+ * result of seq_printf() for the notices, or -ENOMEM if the temporary
+ * buffer could not be allocated.
+ */
+int tt_global_seq_print_text(struct seq_file *seq, void *offset)
+{
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	struct hashtable_t *hash = bat_priv->tt_global_hash;
+	struct hard_iface *primary;
+	struct tt_global_entry *entry;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+	size_t alloc_len, written;
+	char *text;
+	int ret = 0;
+	int i;
+
+	primary = primary_if_get_selected(bat_priv);
+	if (!primary) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - please "
+				 "specify interfaces to enable it\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	if (primary->if_status != IF_ACTIVE) {
+		ret = seq_printf(seq, "BATMAN mesh %s disabled - "
+				 "primary interface not active\n",
+				 net_dev->name);
+		goto out;
+	}
+
+	seq_printf(seq,
+		   "Globally announced TT entries received via the mesh %s\n",
+		   net_dev->name);
+
+	spin_lock_bh(&bat_priv->tt_ghash_lock);
+
+	/* Estimate length for: " * xx:xx:xx:xx:xx:xx via xx:xx:xx:xx:xx:xx\n"
+	 * i.e. 43 characters per entry plus one byte for the final '\0' */
+	alloc_len = 1;
+	for (i = 0; i < hash->size; i++) {
+		bucket = &hash->table[i];
+
+		rcu_read_lock();
+		__hlist_for_each_rcu(node, bucket)
+			alloc_len += 43;
+		rcu_read_unlock();
+	}
+
+	text = kmalloc(alloc_len, GFP_ATOMIC);
+	if (!text) {
+		spin_unlock_bh(&bat_priv->tt_ghash_lock);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	written = 0;
+	text[0] = '\0';
+
+	for (i = 0; i < hash->size; i++) {
+		bucket = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(entry, node, bucket, hash_entry) {
+			written += snprintf(text + written, 44,
+					    " * %pM via %pM\n",
+					    entry->addr,
+					    entry->orig_node->orig);
+		}
+		rcu_read_unlock();
+	}
+
+	spin_unlock_bh(&bat_priv->tt_ghash_lock);
+
+	seq_printf(seq, "%s", text);
+	kfree(text);
+out:
+	if (primary)
+		hardif_free_ref(primary);
+	return ret;
+}
+
+/* remove @tt_global_entry from the global hash and free it; @message is
+ * only used for the debug log. The callers in this file hold
+ * tt_ghash_lock. */
+static void _tt_global_del_orig(struct bat_priv *bat_priv,
+				 struct tt_global_entry *tt_global_entry,
+				 char *message)
+{
+	struct hashtable_t *hash = bat_priv->tt_global_hash;
+
+	bat_dbg(DBG_ROUTES, bat_priv,
+		"Deleting global tt entry %pM (via %pM): %s\n",
+		tt_global_entry->addr, tt_global_entry->orig_node->orig,
+		message);
+
+	hash_remove(hash, compare_gtt, choose_orig, tt_global_entry->addr);
+	kfree(tt_global_entry);
+}
+
+/*
+ * Withdraw all addresses announced by @orig_node.
+ *
+ * Every address stored in the originator's tt_buff whose global entry
+ * still points at @orig_node is deleted from the global table; entries
+ * that meanwhile belong to another originator are left alone. Afterwards
+ * the originator's tt_buff is freed. Nothing happens if tt_buff_len is 0.
+ */
+void tt_global_del_orig(struct bat_priv *bat_priv,
+			 struct orig_node *orig_node, char *message)
+{
+	struct tt_global_entry *entry;
+	unsigned char *mac;
+	int idx;
+
+	if (orig_node->tt_buff_len == 0)
+		return;
+
+	spin_lock_bh(&bat_priv->tt_ghash_lock);
+
+	for (idx = 0; (idx + 1) * ETH_ALEN <= orig_node->tt_buff_len; idx++) {
+		mac = orig_node->tt_buff + (idx * ETH_ALEN);
+		entry = tt_global_hash_find(bat_priv, mac);
+
+		if (entry && entry->orig_node == orig_node)
+			_tt_global_del_orig(bat_priv, entry, message);
+	}
+
+	spin_unlock_bh(&bat_priv->tt_ghash_lock);
+
+	kfree(orig_node->tt_buff);
+	orig_node->tt_buff = NULL;
+	orig_node->tt_buff_len = 0;
+}
+
+/* per-entry callback of hash_delete() in tt_global_free(): free the
+ * global tt entry containing @node; @arg is unused */
+static void tt_global_del(struct hlist_node *node, void *arg)
+{
+	struct tt_global_entry *entry;
+
+	entry = container_of(node, struct tt_global_entry, hash_entry);
+	kfree(entry);
+}
+
+/* free the global translation table including all entries, if it exists */
+void tt_global_free(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash = bat_priv->tt_global_hash;
+
+	if (!hash)
+		return;
+
+	hash_delete(hash, tt_global_del, NULL);
+	bat_priv->tt_global_hash = NULL;
+}
+
+/*
+ * Find the originator that announced @addr in the global table.
+ *
+ * Returns the originator with its refcount increased, or NULL if the
+ * address is unknown or the originator's refcount already dropped to
+ * zero.
+ */
+struct orig_node *transtable_search(struct bat_priv *bat_priv, uint8_t *addr)
+{
+	struct tt_global_entry *entry;
+	struct orig_node *found = NULL;
+
+	spin_lock_bh(&bat_priv->tt_ghash_lock);
+
+	entry = tt_global_hash_find(bat_priv, addr);
+	if (entry && atomic_inc_not_zero(&entry->orig_node->refcount))
+		found = entry->orig_node;
+
+	spin_unlock_bh(&bat_priv->tt_ghash_lock);
+	return found;
+}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
new file mode 100644
index 00000000..46152c38
--- /dev/null
+++ b/net/batman-adv/translation-table.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+#define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
+
+int tt_local_init(struct bat_priv *bat_priv);
+void tt_local_add(struct net_device *soft_iface, uint8_t *addr);
+void tt_local_remove(struct bat_priv *bat_priv,
+		      uint8_t *addr, char *message);
+int tt_local_fill_buffer(struct bat_priv *bat_priv,
+			  unsigned char *buff, int buff_len);
+int tt_local_seq_print_text(struct seq_file *seq, void *offset);
+void tt_local_free(struct bat_priv *bat_priv);
+int tt_global_init(struct bat_priv *bat_priv);
+void tt_global_add_orig(struct bat_priv *bat_priv,
+			 struct orig_node *orig_node,
+			 unsigned char *tt_buff, int tt_buff_len);
+int tt_global_seq_print_text(struct seq_file *seq, void *offset);
+/* removes the global entries listed in orig_node->tt_buff that still point
+ * at orig_node and frees that buffer */
+void tt_global_del_orig(struct bat_priv *bat_priv,
+			 struct orig_node *orig_node, char *message);
+/* destroys bat_priv->tt_global_hash; does nothing if it is NULL */
+void tt_global_free(struct bat_priv *bat_priv);
+/* returns the orig_node registered for addr with its refcount incremented,
+ * or NULL */
+struct orig_node *transtable_search(struct bat_priv *bat_priv, uint8_t *addr);
+
+#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
new file mode 100644
index 00000000..fab70e8b
--- /dev/null
+++ b/net/batman-adv/types.h
@@ -0,0 +1,291 @@
+/*
+ * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner, Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+
+
+#ifndef _NET_BATMAN_ADV_TYPES_H_
+#define _NET_BATMAN_ADV_TYPES_H_
+
+#include "packet.h"
+#include "bitarray.h"
+
+/* size of an ethernet header plus the larger one of the unicast and the
+ * broadcast packet header */
+#define BAT_HEADER_LEN (sizeof(struct ethhdr) + \
+	((sizeof(struct unicast_packet) > sizeof(struct bcast_packet) ? \
+	 sizeof(struct unicast_packet) : \
+	 sizeof(struct bcast_packet))))
+
+
+/* state kept for one hard interface; references obtained on it are dropped
+ * with hardif_free_ref() */
+struct hard_iface {
+	struct list_head list;
+	int16_t if_num;
+	/* unicast_send_skb() only transmits while this equals IF_ACTIVE */
+	char if_status;
+	struct net_device *net_dev;
+	atomic_t seqno;
+	/* frag_send_skb() takes two consecutive values per split packet */
+	atomic_t frag_seqno;
+	unsigned char *packet_buff;
+	int packet_len;
+	struct kobject *hardif_obj;
+	atomic_t refcount;
+	struct packet_type batman_adv_ptype;
+	struct net_device *soft_iface;
+	struct rcu_head rcu;
+};
+
+/**
+ *	orig_node - structure for orig_list maintaining nodes of mesh
+ *	@primary_addr: hosts primary interface address
+ *	@last_valid: when last packet from this node was received
+ *	@bcast_seqno_reset: time when the broadcast seqno window was reset
+ *	@batman_seqno_reset: time when the batman seqno window was reset
+ *	@gw_flags: flags related to gateway class
+ *	@flags: for now only VIS_SERVER flag
+ *	@tt_buff: buffer of ETH_ALEN sized addresses, walked and freed by
+ *		  tt_global_del_orig()
+ *	@tt_buff_len: length of @tt_buff in bytes
+ *	@last_real_seqno: last and best known sequence number
+ *	@last_ttl: ttl of last received packet
+ *	@last_bcast_seqno: last broadcast sequence number received by this host
+ *	@frag_list: fragment reassembly slots (struct frag_packet_list_entry),
+ *		    allocated on demand by frag_reassemble_skb()
+ *	@last_frag_packet: jiffies value stored when the last fragment from
+ *			   this node was processed
+ *
+ *	@candidates: how many candidates are available
+ *	@selected: next bonding candidate
+ */
+struct orig_node {
+	uint8_t orig[ETH_ALEN];
+	uint8_t primary_addr[ETH_ALEN];
+	struct neigh_node __rcu *router; /* rcu protected pointer */
+	unsigned long *bcast_own;
+	uint8_t *bcast_own_sum;
+	unsigned long last_valid;
+	unsigned long bcast_seqno_reset;
+	unsigned long batman_seqno_reset;
+	uint8_t gw_flags;
+	uint8_t flags;
+	unsigned char *tt_buff;
+	int16_t tt_buff_len;
+	uint32_t last_real_seqno;
+	uint8_t last_ttl;
+	unsigned long bcast_bits[NUM_WORDS];
+	uint32_t last_bcast_seqno;
+	struct hlist_head neigh_list;
+	struct list_head frag_list;
+	spinlock_t neigh_list_lock; /* protects neigh_list and router */
+	atomic_t refcount;
+	struct rcu_head rcu;
+	struct hlist_node hash_entry;
+	struct bat_priv *bat_priv;
+	unsigned long last_frag_packet;
+	/* ogm_cnt_lock protects: bcast_own, bcast_own_sum,
+	 * neigh_node->real_bits, neigh_node->real_packet_count */
+	spinlock_t ogm_cnt_lock;
+	/* bcast_seqno_lock protects bcast_bits, last_bcast_seqno */
+	spinlock_t bcast_seqno_lock;
+	atomic_t bond_candidates;
+	struct list_head bond_list;
+};
+
+/* refcounted list element referring to an orig_node
+ * NOTE(review): presumably an element of bat_priv->gw_list - confirm */
+struct gw_node {
+	struct hlist_node list;
+	struct orig_node *orig_node;
+	unsigned long deleted;
+	atomic_t refcount;
+	struct rcu_head rcu;
+};
+
+/**
+ *	neigh_node
+ *	@addr: address unicast_send_skb() passes to send_skb_packet() as
+ *	       destination when sending via this neighbor
+ *	@last_valid: when last packet via this neighbor was received
+ *	@if_incoming: interface unicast_send_skb() transmits on when sending
+ *		      via this neighbor
+ */
+struct neigh_node {
+	struct hlist_node list;
+	uint8_t addr[ETH_ALEN];
+	uint8_t real_packet_count;
+	uint8_t tq_recv[TQ_GLOBAL_WINDOW_SIZE];
+	uint8_t tq_index;
+	uint8_t tq_avg;
+	uint8_t last_ttl;
+	struct list_head bonding_list;
+	unsigned long last_valid;
+	unsigned long real_bits[NUM_WORDS];
+	atomic_t refcount;
+	struct rcu_head rcu;
+	struct orig_node *orig_node;
+	struct hard_iface *if_incoming;
+	spinlock_t tq_lock;	/* protects: tq_recv, tq_index */
+};
+
+
+/* private data of one soft interface (obtained via netdev_priv() in
+ * vis_seq_print_text()) */
+struct bat_priv {
+	atomic_t mesh_state;
+	struct net_device_stats stats;
+	atomic_t aggregated_ogms;	/* boolean */
+	atomic_t bonding;		/* boolean */
+	atomic_t fragmentation;		/* boolean */
+	atomic_t vis_mode;		/* VIS_TYPE_* */
+	atomic_t gw_mode;		/* GW_MODE_* */
+	atomic_t gw_sel_class;		/* uint */
+	atomic_t gw_bandwidth;		/* gw bandwidth */
+	atomic_t orig_interval;		/* uint */
+	atomic_t hop_penalty;		/* uint */
+	atomic_t log_level;		/* uint */
+	atomic_t bcast_seqno;
+	atomic_t bcast_queue_left;
+	atomic_t batman_queue_left;
+	char num_ifaces;
+	struct debug_log *debug_log;
+	struct kobject *mesh_obj;
+	struct dentry *debug_dir;
+	struct hlist_head forw_bat_list;
+	struct hlist_head forw_bcast_list;
+	struct hlist_head gw_list;
+	struct hlist_head softif_neigh_vids;
+	struct list_head vis_send_list;
+	struct hashtable_t *orig_hash;
+	struct hashtable_t *tt_local_hash;
+	struct hashtable_t *tt_global_hash;
+	struct hashtable_t *vis_hash;
+	spinlock_t forw_bat_list_lock; /* protects forw_bat_list */
+	spinlock_t forw_bcast_list_lock; /* presumably protects forw_bcast_list - confirm */
+	spinlock_t tt_lhash_lock; /* protects tt_local_hash */
+	spinlock_t tt_ghash_lock; /* protects tt_global_hash */
+	spinlock_t gw_list_lock; /* protects gw_list and curr_gw */
+	spinlock_t vis_hash_lock; /* protects vis_hash */
+	spinlock_t vis_list_lock; /* protects vis_info::recv_list */
+	spinlock_t softif_neigh_lock; /* protects soft-interface neigh list */
+	spinlock_t softif_neigh_vid_lock; /* protects soft-interface vid list */
+	int16_t num_local_tt;
+	atomic_t tt_local_changed;
+	struct delayed_work tt_work;
+	struct delayed_work orig_work;
+	struct delayed_work vis_work;
+	struct gw_node __rcu *curr_gw;  /* rcu protected pointer */
+	struct hard_iface __rcu *primary_if;  /* rcu protected pointer */
+	struct vis_info *my_vis_info;
+};
+
+/* per client packet queue with its lock and wait queue
+ * NOTE(review): presumably one instance per opened batman socket - confirm */
+struct socket_client {
+	struct list_head queue_list;
+	unsigned int queue_len;
+	unsigned char index;
+	spinlock_t lock; /* protects queue_list, queue_len, index */
+	wait_queue_head_t queue_wait;
+	struct bat_priv *bat_priv;
+};
+
+/* list element holding one icmp_packet_rr together with a length field
+ * NOTE(review): presumably queued on socket_client::queue_list - confirm */
+struct socket_packet {
+	struct list_head list;
+	size_t icmp_len;
+	struct icmp_packet_rr icmp_packet;
+};
+
+/* hash element describing one address
+ * NOTE(review): presumably stored in bat_priv->tt_local_hash - confirm */
+struct tt_local_entry {
+	uint8_t addr[ETH_ALEN];
+	unsigned long last_seen;
+	char never_purge;
+	struct hlist_node hash_entry;
+};
+
+/* element of bat_priv->tt_global_hash: maps addr to the orig_node that
+ * transtable_search() returns for it */
+struct tt_global_entry {
+	uint8_t addr[ETH_ALEN];
+	struct orig_node *orig_node;
+	struct hlist_node hash_entry;
+};
+
+/**
+ *	forw_packet - structure for forw_list maintaining packets to be
+ *	              sent/forwarded
+ */
+struct forw_packet {
+	struct hlist_node list;
+	unsigned long send_time;
+	uint8_t own;
+	struct sk_buff *skb;
+	uint16_t packet_len;
+	uint32_t direct_link_flags;
+	uint8_t num_packets;
+	struct delayed_work delayed_work;
+	struct hard_iface *if_incoming;
+};
+
+/* While scanning for vis-entries of a particular vis-originator
+ * this list collects its interfaces to create a subgraph/cluster
+ * out of them later
+ */
+struct if_list_entry {
+	uint8_t addr[ETH_ALEN];
+	/* vis_seq_print_text() sets this when addr equals the vis_orig of
+	 * the packet being scanned */
+	bool primary;
+	struct hlist_node list;
+};
+
+/* log buffer of LOG_BUF_LEN bytes with start/end positions
+ * NOTE(review): presumably used as a ring buffer - confirm */
+struct debug_log {
+	char log_buff[LOG_BUF_LEN];
+	unsigned long log_start;
+	unsigned long log_end;
+	spinlock_t lock; /* protects log_buff, log_start and log_end */
+	wait_queue_head_t queue_wait;
+};
+
+/* one slot of an orig_node's frag_list: skb is the buffered fragment (NULL
+ * while the slot is free), seqno its sequence number in host byte order */
+struct frag_packet_list_entry {
+	struct list_head list;
+	uint16_t seqno;
+	struct sk_buff *skb;
+};
+
+/* one vis packet kept in vis_hash; the final kref_put() on refcount runs
+ * free_info() */
+struct vis_info {
+	unsigned long       first_seen;
+	struct list_head    recv_list;
+			    /* list of server-neighbors we received a vis-packet
+			     * from.  we should not reply to them. */
+	struct list_head send_list;
+	struct kref refcount;
+	struct hlist_node hash_entry;
+	struct bat_priv *bat_priv;
+	/* this packet might be part of the vis send queue. */
+	/* its data is read as a struct vis_packet */
+	struct sk_buff *skb_packet;
+	/* vis_info may follow here*/
+} __packed;
+
+/* one record of a vis packet; an array of these directly follows struct
+ * vis_packet in the packet data (see vis_seq_print_text()) */
+struct vis_info_entry {
+	uint8_t  src[ETH_ALEN];
+	uint8_t  dest[ETH_ALEN];
+	uint8_t  quality;	/* quality = 0 client */
+} __packed;
+
+/* element of a vis receive list: recv_list_add() stores one address per
+ * node, recv_list_is_in() searches for it */
+struct recvlist_node {
+	struct list_head list;
+	uint8_t mac[ETH_ALEN];
+};
+
+/* refcounted per-vid element owning a list of softif_neigh entries
+ * NOTE(review): presumably linked into bat_priv->softif_neigh_vids -
+ * confirm */
+struct softif_neigh_vid {
+	struct hlist_node list;
+	struct bat_priv *bat_priv;
+	short vid;
+	atomic_t refcount;
+	struct softif_neigh __rcu *softif_neigh;
+	struct rcu_head rcu;
+	struct hlist_head softif_neigh_list;
+};
+
+/* refcounted neighbor record identified by addr
+ * NOTE(review): presumably an element of
+ * softif_neigh_vid::softif_neigh_list - confirm */
+struct softif_neigh {
+	struct hlist_node list;
+	uint8_t addr[ETH_ALEN];
+	unsigned long last_seen;
+	atomic_t refcount;
+	struct rcu_head rcu;
+};
+
+#endif /* _NET_BATMAN_ADV_TYPES_H_ */
diff --git a/net/batman-adv/unicast.c b/net/batman-adv/unicast.c
new file mode 100644
index 00000000..19c3daf3
--- /dev/null
+++ b/net/batman-adv/unicast.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
+ *
+ * Andreas Langer
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "unicast.h"
+#include "send.h"
+#include "soft-interface.h"
+#include "gateway_client.h"
+#include "originator.h"
+#include "hash.h"
+#include "translation-table.h"
+#include "routing.h"
+#include "hard-interface.h"
+
+
+/*
+ * Join the fragment in @skb with its buffered counterpart held by @tfp.
+ *
+ * The fragment flagged UNI_FRAG_HEAD becomes the front of the merged
+ * packet; the other fragment's payload is appended to it and that skb is
+ * freed.  On success the slot @tfp is emptied and moved to the tail of
+ * @head, the fragment header is shrunk to a plain unicast header and the
+ * merged skb is returned.
+ *
+ * On failure NULL is returned.  The buffered skb is freed and the slot is
+ * emptied so that it does not keep a pointer to freed memory; the skb
+ * passed in by the caller is not freed here.
+ */
+static struct sk_buff *frag_merge_packet(struct list_head *head,
+					 struct frag_packet_list_entry *tfp,
+					 struct sk_buff *skb)
+{
+	struct unicast_frag_packet *up =
+		(struct unicast_frag_packet *)skb->data;
+	struct sk_buff *tmp_skb;
+	struct unicast_packet *unicast_packet;
+	int hdr_len = sizeof(struct unicast_packet);
+	int uni_diff = sizeof(struct unicast_frag_packet) - hdr_len;
+
+	/* set skb to the first part and tmp_skb to the second part */
+	if (up->flags & UNI_FRAG_HEAD) {
+		tmp_skb = tfp->skb;
+	} else {
+		tmp_skb = skb;
+		skb = tfp->skb;
+	}
+
+	if (skb_linearize(skb) < 0 || skb_linearize(tmp_skb) < 0)
+		goto err;
+
+	/* strip the second part's header, then make room for its payload */
+	skb_pull(tmp_skb, sizeof(struct unicast_frag_packet));
+	if (pskb_expand_head(skb, 0, tmp_skb->len, GFP_ATOMIC) < 0)
+		goto err;
+
+	/* move free entry to end */
+	tfp->skb = NULL;
+	tfp->seqno = 0;
+	list_move_tail(&tfp->list, head);
+
+	memcpy(skb_put(skb, tmp_skb->len), tmp_skb->data, tmp_skb->len);
+	kfree_skb(tmp_skb);
+
+	/* slide the common header fields up so that the extra bytes of the
+	 * fragment header can be pulled off the front */
+	memmove(skb->data + uni_diff, skb->data, hdr_len);
+	unicast_packet = (struct unicast_packet *) skb_pull(skb, uni_diff);
+	unicast_packet->packet_type = BAT_UNICAST;
+
+	return skb;
+
+err:
+	/* free buffered skb, skb will be freed later */
+	kfree_skb(tfp->skb);
+	/* empty the slot exactly as on success: a stale pointer would be
+	 * dereferenced by frag_search_packet() and freed a second time by
+	 * frag_create_entry() or frag_list_free() */
+	tfp->skb = NULL;
+	tfp->seqno = 0;
+	list_move_tail(&tfp->list, head);
+	return NULL;
+}
+
+/* Buffer the fragment @skb: the slot at the tail of @head (free and oldest
+ * slots stand at the end) gives up whatever it held, takes over @skb and
+ * its sequence number and is moved to the front of the list. */
+static void frag_create_entry(struct list_head *head, struct sk_buff *skb)
+{
+	struct unicast_frag_packet *frag_hdr =
+		(struct unicast_frag_packet *)skb->data;
+	struct frag_packet_list_entry *slot;
+
+	slot = list_entry(head->prev, struct frag_packet_list_entry, list);
+	kfree_skb(slot->skb);
+
+	slot->skb = skb;
+	slot->seqno = ntohs(frag_hdr->seqno);
+	list_move(&slot->list, head);
+}
+
+/* Populate @head with FRAG_BUFFER_SIZE empty slots.
+ * Returns 0 on success; if an allocation fails everything on the list is
+ * released again through frag_list_free() and -ENOMEM is returned. */
+static int frag_create_buffer(struct list_head *head)
+{
+	struct frag_packet_list_entry *slot;
+	int remaining;
+
+	for (remaining = FRAG_BUFFER_SIZE; remaining > 0; remaining--) {
+		slot = kmalloc(sizeof(struct frag_packet_list_entry),
+			       GFP_ATOMIC);
+		if (!slot)
+			goto err_free;
+
+		slot->seqno = 0;
+		slot->skb = NULL;
+		INIT_LIST_HEAD(&slot->list);
+		list_add(&slot->list, head);
+	}
+
+	return 0;
+
+err_free:
+	frag_list_free(head);
+	return -ENOMEM;
+}
+
+/*
+ * Search @head for the buffered counterpart of the fragment @up.
+ *
+ * The counterpart of a head fragment has the next sequence number, the
+ * counterpart of a tail fragment the previous one, and exactly one of the
+ * two may carry UNI_FRAG_HEAD.  Such a slot is returned.
+ *
+ * A slot with the same sequence number as @up, or with the wanted sequence
+ * number but the same UNI_FRAG_HEAD state, ends the search: it is moved to
+ * the tail of the list and NULL is returned.  NULL is also returned when
+ * nothing matches.
+ */
+static struct frag_packet_list_entry *frag_search_packet(struct list_head *head,
+						 struct unicast_frag_packet *up)
+{
+	struct frag_packet_list_entry *tfp, *demote = NULL;
+	struct unicast_frag_packet *buffered;
+	uint16_t own_seqno = ntohs(up->seqno);
+	uint16_t search_seqno;
+	int is_head = up->flags & UNI_FRAG_HEAD;
+
+	search_seqno = is_head ? own_seqno + 1 : own_seqno - 1;
+
+	list_for_each_entry(tfp, head, list) {
+		/* free slot */
+		if (!tfp->skb)
+			continue;
+
+		if (tfp->seqno == own_seqno) {
+			demote = tfp;
+			break;
+		}
+
+		if (tfp->seqno != search_seqno)
+			continue;
+
+		buffered = (struct unicast_frag_packet *)tfp->skb->data;
+		if ((buffered->flags & UNI_FRAG_HEAD) != is_head)
+			return tfp;
+
+		demote = tfp;
+		break;
+	}
+
+	if (demote)
+		list_move_tail(&demote->list, head);
+
+	return NULL;
+}
+
+/* Release every slot on @head together with the skb it may still hold;
+ * an empty list is left untouched. */
+void frag_list_free(struct list_head *head)
+{
+	struct frag_packet_list_entry *slot, *next;
+
+	list_for_each_entry_safe(slot, next, head, list) {
+		kfree_skb(slot->skb);
+		list_del(&slot->list);
+		kfree(slot);
+	}
+}
+
+/* frag_reassemble_skb():
+ * buffers the fragment in skb or merges it with its buffered counterpart
+ *
+ * returns NET_RX_DROP if the originator is unknown, the fragment buffer
+ * could not be created or the merge failed (*new_skb is NULL)
+ * returns NET_RX_SUCCESS if the fragment was buffered (*new_skb is NULL)
+ * or the packet could be reassembled (*new_skb points to the new packet);
+ * in both cases skb must not be used by the caller anymore
+ */
+int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
+			struct sk_buff **new_skb)
+{
+	struct unicast_frag_packet *frag_hdr =
+		(struct unicast_frag_packet *)skb->data;
+	struct frag_packet_list_entry *partner;
+	struct list_head *frag_list;
+	struct orig_node *orig_node;
+	int ret = NET_RX_DROP;
+
+	*new_skb = NULL;
+
+	orig_node = orig_hash_find(bat_priv, frag_hdr->orig);
+	if (!orig_node)
+		return ret;
+
+	orig_node->last_frag_packet = jiffies;
+	frag_list = &orig_node->frag_list;
+
+	/* the slots are allocated when the first fragment shows up */
+	if (list_empty(frag_list) && frag_create_buffer(frag_list)) {
+		pr_debug("couldn't create frag buffer\n");
+		goto out;
+	}
+
+	partner = frag_search_packet(frag_list, frag_hdr);
+	if (partner) {
+		*new_skb = frag_merge_packet(frag_list, partner, skb);
+		/* NULL means the merge failed */
+		if (*new_skb)
+			ret = NET_RX_SUCCESS;
+	} else {
+		frag_create_entry(frag_list, skb);
+		ret = NET_RX_SUCCESS;
+	}
+
+out:
+	orig_node_free_ref(orig_node);
+	return ret;
+}
+
+/*
+ * Split the unicast packet in @skb into two unicast_frag_packets and send
+ * both via @hard_iface to @dstaddr.
+ *
+ * The payload behind the unicast header is cut at data_len / 2: @skb keeps
+ * the header and the first part, a newly allocated skb takes the rest (one
+ * byte more if data_len is odd, which UNI_FRAG_LARGETAIL signals).  Both
+ * fragments carry the primary interface's address as originator and two
+ * consecutive sequence numbers taken from hard_iface->frag_seqno.
+ *
+ * Returns NET_RX_SUCCESS after both fragments were handed to
+ * send_skb_packet(), NET_RX_DROP otherwise.  On the NET_RX_DROP paths @skb
+ * is freed here, so the caller must not free it again.
+ */
+int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
+		  struct hard_iface *hard_iface, uint8_t dstaddr[])
+{
+	struct unicast_packet tmp_uc, *unicast_packet;
+	struct hard_iface *primary_if;
+	struct sk_buff *frag_skb;
+	struct unicast_frag_packet *frag1, *frag2;
+	int uc_hdr_len = sizeof(struct unicast_packet);
+	int ucf_hdr_len = sizeof(struct unicast_frag_packet);
+	int data_len = skb->len - uc_hdr_len;
+	int large_tail = 0, ret = NET_RX_DROP;
+	uint16_t seqno;
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto dropped;
+
+	/* room for the larger part of the payload plus a fragment header */
+	frag_skb = dev_alloc_skb(data_len - (data_len / 2) + ucf_hdr_len);
+	if (!frag_skb)
+		goto dropped;
+	skb_reserve(frag_skb, ucf_hdr_len);
+
+	/* keep a copy of the unicast header before the skb is reshaped */
+	unicast_packet = (struct unicast_packet *) skb->data;
+	memcpy(&tmp_uc, unicast_packet, uc_hdr_len);
+	skb_split(skb, frag_skb, data_len / 2 + uc_hdr_len);
+
+	/* the first part only needs the difference between both headers,
+	 * the second part needs a complete fragment header */
+	if (my_skb_head_push(skb, ucf_hdr_len - uc_hdr_len) < 0 ||
+	    my_skb_head_push(frag_skb, ucf_hdr_len) < 0)
+		goto drop_frag;
+
+	frag1 = (struct unicast_frag_packet *)skb->data;
+	frag2 = (struct unicast_frag_packet *)frag_skb->data;
+
+	memcpy(frag1, &tmp_uc, sizeof(struct unicast_packet));
+
+	frag1->ttl--;
+	frag1->version = COMPAT_VERSION;
+	frag1->packet_type = BAT_UNICAST_FRAG;
+
+	memcpy(frag1->orig, primary_if->net_dev->dev_addr, ETH_ALEN);
+	/* the second header starts out as a copy of the first one */
+	memcpy(frag2, frag1, sizeof(struct unicast_frag_packet));
+
+	if (data_len & 1)
+		large_tail = UNI_FRAG_LARGETAIL;
+
+	frag1->flags = UNI_FRAG_HEAD | large_tail;
+	frag2->flags = large_tail;
+
+	/* reserve two sequence numbers: head gets seqno - 1, tail seqno */
+	seqno = atomic_add_return(2, &hard_iface->frag_seqno);
+	frag1->seqno = htons(seqno - 1);
+	frag2->seqno = htons(seqno);
+
+	send_skb_packet(skb, hard_iface, dstaddr);
+	send_skb_packet(frag_skb, hard_iface, dstaddr);
+	ret = NET_RX_SUCCESS;
+	goto out;
+
+drop_frag:
+	kfree_skb(frag_skb);
+dropped:
+	kfree_skb(skb);
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	return ret;
+}
+
+/*
+ * Wrap the ethernet frame in @skb into a unicast packet and send it to the
+ * router of the responsible originator.
+ *
+ * For multicast destinations the selected gateway is used if there is one;
+ * in all other cases the originator is looked up with transtable_search().
+ * If fragmentation is enabled and the packet exceeds the MTU of the
+ * outgoing interface it is handed to frag_send_skb() instead of being sent
+ * directly.
+ *
+ * Returns 0 after a plain send, the result of frag_send_skb() after a
+ * fragmented send and 1 if the packet was dropped here.  @skb is consumed
+ * in every case; the caller must not free it.
+ */
+int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv)
+{
+	struct ethhdr *ethhdr = (struct ethhdr *)skb->data;
+	struct unicast_packet *unicast_packet;
+	struct orig_node *orig_node;
+	struct neigh_node *neigh_node;
+	int data_len = skb->len;
+	int ret = 1;
+
+	/* get routing information */
+	if (is_multicast_ether_addr(ethhdr->h_dest)) {
+		orig_node = (struct orig_node *)gw_get_selected_orig(bat_priv);
+		if (orig_node)
+			goto find_router;
+	}
+
+	/* check for tt host - increases orig_node refcount */
+	orig_node = transtable_search(bat_priv, ethhdr->h_dest);
+
+find_router:
+	/**
+	 * find_router():
+	 *  - if orig_node is NULL it returns NULL
+	 *  - increases neigh_nodes refcount if found.
+	 */
+	neigh_node = find_router(bat_priv, orig_node, NULL);
+
+	if (!neigh_node)
+		goto out;
+
+	if (neigh_node->if_incoming->if_status != IF_ACTIVE)
+		goto out;
+
+	if (my_skb_head_push(skb, sizeof(struct unicast_packet)) < 0)
+		goto out;
+
+	unicast_packet = (struct unicast_packet *)skb->data;
+
+	unicast_packet->version = COMPAT_VERSION;
+	/* batman packet type: unicast */
+	unicast_packet->packet_type = BAT_UNICAST;
+	/* set unicast ttl */
+	unicast_packet->ttl = TTL;
+	/* copy the destination for faster routing */
+	memcpy(unicast_packet->dest, orig_node->orig, ETH_ALEN);
+
+	if (atomic_read(&bat_priv->fragmentation) &&
+	    data_len + sizeof(struct unicast_packet) >
+				neigh_node->if_incoming->net_dev->mtu) {
+		/* send frag skb decreases ttl */
+		unicast_packet->ttl++;
+		ret = frag_send_skb(skb, bat_priv,
+				    neigh_node->if_incoming, neigh_node->addr);
+		/* frag_send_skb() either passed skb on or freed it itself;
+		 * forget the pointer so that a NET_RX_DROP result does not
+		 * make the cleanup below free it a second time */
+		skb = NULL;
+		goto out;
+	}
+
+	send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+	ret = 0;
+
+out:
+	if (neigh_node)
+		neigh_node_free_ref(neigh_node);
+	if (orig_node)
+		orig_node_free_ref(orig_node);
+	/* still 1 only when the packet was not handed on; kfree_skb(NULL)
+	 * is a no-op for the fragmentation case */
+	if (ret == 1)
+		kfree_skb(skb);
+	return ret;
+}
diff --git a/net/batman-adv/unicast.h b/net/batman-adv/unicast.h
new file mode 100644
index 00000000..16ad7a92
--- /dev/null
+++ b/net/batman-adv/unicast.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2010-2011 B.A.T.M.A.N. contributors:
+ *
+ * Andreas Langer
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_UNICAST_H_
+#define _NET_BATMAN_ADV_UNICAST_H_
+
+#include "packet.h"
+
+#define FRAG_TIMEOUT 10000	/* purge frag list entries after time in ms */
+#define FRAG_BUFFER_SIZE 6	/* number of list elements in buffer */
+
+int frag_reassemble_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
+			struct sk_buff **new_skb);
+void frag_list_free(struct list_head *head);
+int unicast_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv);
+int frag_send_skb(struct sk_buff *skb, struct bat_priv *bat_priv,
+		  struct hard_iface *hard_iface, uint8_t dstaddr[]);
+
+/*
+ * Returns non-zero if the packet that the fragment in @skb belongs to would
+ * fit into @mtu once merged.
+ *
+ * The merged size is taken as twice this fragment's payload plus a unicast
+ * header.  If UNI_FRAG_LARGETAIL is set the two payloads differ by one
+ * byte: a head fragment then accounts for one byte more, a tail fragment
+ * for one byte less.
+ */
+static inline int frag_can_reassemble(struct sk_buff *skb, int mtu)
+{
+	struct unicast_frag_packet *frag_hdr =
+		(struct unicast_frag_packet *)skb->data;
+	unsigned int payload_len = skb->len - sizeof(*frag_hdr);
+	unsigned int merged_size;
+
+	merged_size = 2 * payload_len + sizeof(struct unicast_packet);
+
+	if (frag_hdr->flags & UNI_FRAG_LARGETAIL)
+		merged_size += (frag_hdr->flags & UNI_FRAG_HEAD) ? 1 : -1;
+
+	return merged_size <= mtu;
+}
+
+#endif /* _NET_BATMAN_ADV_UNICAST_H_ */
diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c
new file mode 100644
index 00000000..c39f20cc
--- /dev/null
+++ b/net/batman-adv/vis.c
@@ -0,0 +1,993 @@
+/*
+ * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#include "main.h"
+#include "send.h"
+#include "translation-table.h"
+#include "vis.h"
+#include "soft-interface.h"
+#include "hard-interface.h"
+#include "hash.h"
+#include "originator.h"
+
+#define MAX_VIS_PACKET_SIZE 1000
+
+/* Returns the smallest signed integer in two's complement with the sizeof x */
+#define smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u)))
+
+/* Checks if a sequence number x is a predecessor/successor of y.
+ * they handle overflows/underflows and can correctly check for a
+ * predecessor/successor unless the variable sequence number has grown by
+ * more than 2**(bitwidth(x)-1)-1.
+ * This means that for a uint8_t with the maximum value 255, it would think:
+ *  - when adding nothing - it is neither a predecessor nor a successor
+ *  - before adding more than 127 to the starting value - it is a predecessor,
+ *  - when adding 128 - it is neither a predecessor nor a successor,
+ *  - after adding more than 127 to the starting value - it is a successor
+ *
+ * The arguments are parenthesized so that expressions such as "a + 1" or
+ * "c ? a : b" are subtracted as a whole. */
+#define seq_before(x, y) ({typeof(x) _dummy = ((x) - (y)); \
+			_dummy > smallest_signed_int(_dummy); })
+#define seq_after(x, y) seq_before(y, x)
+
+static void start_vis_timer(struct bat_priv *bat_priv);
+
+/* kref release function of a vis_info: unlinks it from the send list,
+ * empties its recv_list under vis_list_lock and frees the packet skb and
+ * the info itself */
+static void free_info(struct kref *ref)
+{
+	struct vis_info *info = container_of(ref, struct vis_info, refcount);
+	struct bat_priv *bat_priv = info->bat_priv;
+	struct recvlist_node *first;
+
+	list_del_init(&info->send_list);
+
+	spin_lock_bh(&bat_priv->vis_list_lock);
+	while (!list_empty(&info->recv_list)) {
+		first = list_first_entry(&info->recv_list,
+					 struct recvlist_node, list);
+		list_del(&first->list);
+		kfree(first);
+	}
+	spin_unlock_bh(&bat_priv->vis_list_lock);
+
+	kfree_skb(info->skb_packet);
+	kfree(info);
+}
+
+/* Compare two vis packets, used by the hashing algorithm: @node belongs to
+ * a stored vis_info, @data2 is the vis_info searched for.  Returns the
+ * result of compare_eth() on the vis_orig fields of both packets. */
+static int vis_info_cmp(struct hlist_node *node, void *data2)
+{
+	struct vis_info *stored = container_of(node, struct vis_info,
+					       hash_entry);
+	struct vis_info *wanted = data2;
+	struct vis_packet *stored_packet, *wanted_packet;
+
+	stored_packet = (struct vis_packet *)stored->skb_packet->data;
+	wanted_packet = (struct vis_packet *)wanted->skb_packet->data;
+
+	return compare_eth(stored_packet->vis_orig, wanted_packet->vis_orig);
+}
+
+/* hash function to choose an entry in a hash table of given size:
+ * mixes the ETH_ALEN bytes of the packet's vis_orig address and reduces
+ * the result modulo size */
+/* hash algorithm from http://en.wikipedia.org/wiki/Hash_table */
+static int vis_info_choose(void *data, int size)
+{
+	struct vis_info *info = data;
+	struct vis_packet *vis_packet;
+	unsigned char *byte, *end;
+	uint32_t hash = 0;
+
+	vis_packet = (struct vis_packet *)info->skb_packet->data;
+	end = vis_packet->vis_orig + ETH_ALEN;
+
+	for (byte = vis_packet->vis_orig; byte < end; byte++) {
+		hash += *byte;
+		hash += (hash << 10);
+		hash ^= (hash >> 6);
+	}
+
+	/* final mixing */
+	hash += (hash << 3);
+	hash ^= (hash >> 11);
+	hash += (hash << 15);
+
+	return hash % size;
+}
+
+/* Look up the vis_info whose packet has the same vis_orig as @data, which
+ * must itself be a struct vis_info with a valid skb_packet.  Returns NULL
+ * if there is no vis hash or no matching entry.  No reference is taken on
+ * the returned entry. */
+static struct vis_info *vis_hash_find(struct bat_priv *bat_priv,
+				      void *data)
+{
+	struct hashtable_t *hash = bat_priv->vis_hash;
+	struct vis_info *candidate, *match = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+
+	if (!hash)
+		return NULL;
+
+	bucket = &hash->table[vis_info_choose(data, hash->size)];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(candidate, node, bucket, hash_entry) {
+		if (vis_info_cmp(node, data)) {
+			match = candidate;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return match;
+}
+
+/* add the address interface to if_list together with its primary flag,
+ * unless the list already contains that address; an allocation failure is
+ * silently ignored */
+static void vis_data_insert_interface(const uint8_t *interface,
+				      struct hlist_head *if_list,
+				      bool primary)
+{
+	struct if_list_entry *known, *fresh;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(known, pos, if_list, list)
+		if (compare_eth(known->addr, (void *)interface))
+			return;
+
+	/* not on the list yet */
+	fresh = kmalloc(sizeof(*fresh), GFP_ATOMIC);
+	if (!fresh)
+		return;
+
+	fresh->primary = primary;
+	memcpy(fresh->addr, interface, ETH_ALEN);
+	hlist_add_head(&fresh->list, if_list);
+}
+
+/* write one "PRIMARY, " or "SEC <addr>, " token per element of if_list to
+ * buff and return the number of characters written; buff must be large
+ * enough, see vis_data_count_prim_sec() */
+static ssize_t vis_data_read_prim_sec(char *buff, struct hlist_head *if_list)
+{
+	struct if_list_entry *iface;
+	struct hlist_node *pos;
+	char *cursor = buff;
+
+	hlist_for_each_entry(iface, pos, if_list, list) {
+		if (!iface->primary)
+			cursor += sprintf(cursor,  "SEC %pM, ", iface->addr);
+		else
+			cursor += sprintf(cursor, "PRIMARY, ");
+	}
+
+	return cursor - buff;
+}
+
+/* number of characters vis_data_read_prim_sec() writes for if_list:
+ * 9 for each "PRIMARY, " and 4 + 17 + 2 = 23 for each "SEC <addr>, " */
+static size_t vis_data_count_prim_sec(struct hlist_head *if_list)
+{
+	struct if_list_entry *iface;
+	struct hlist_node *pos;
+	size_t total = 0;
+
+	hlist_for_each_entry(iface, pos, if_list, list)
+		total += iface->primary ? 9 : 23;
+
+	return total;
+}
+
+/* print one vis entry to buff and return the number of characters written:
+ *  - "TT <dest>, " for a quality 0 entry if primary is set
+ *    (3 + 17 + 2 characters)
+ *  - "TQ <dest> <quality>, " if the entry's source equals src
+ *    (at most 3 + 17 + 1 + 3 + 2 == 26 characters)
+ *  - nothing (0) otherwise */
+static ssize_t vis_data_read_entry(char *buff, struct vis_info_entry *entry,
+				   uint8_t *src, bool primary)
+{
+	if (primary && entry->quality == 0)
+		return sprintf(buff, "TT %pM, ", entry->dest);
+
+	if (!compare_eth(entry->src, src))
+		return 0;
+
+	return sprintf(buff, "TQ %pM %d, ", entry->dest, entry->quality);
+}
+
+/*
+ * Print the vis table of the soft interface stored in seq->private.
+ *
+ * For every vis packet one line is produced per interface address found in
+ * it: the address, the "TT"/"TQ" records written by vis_data_read_entry()
+ * and, for the address equal to the packet's vis_orig, the "PRIMARY"/"SEC"
+ * summary.
+ *
+ * The text is built in two passes over vis_hash while holding
+ * vis_hash_lock: the first pass computes an upper bound for the buffer
+ * size, the second one fills the buffer, which is finally emitted with a
+ * single seq_printf().  Nothing is printed if no primary interface is
+ * selected or if vis_mode is VIS_TYPE_CLIENT_UPDATE.
+ *
+ * Returns 0, or -ENOMEM if the text buffer could not be allocated.
+ *
+ * NOTE(review): the estimate is only valid if vis_data_insert_interface()
+ * builds the same interface list in both passes; should its GFP_ATOMIC
+ * allocation fail in the first pass only, the buffer would be too small
+ * for the unbounded sprintf() calls of the second pass - confirm and
+ * harden.
+ */
+int vis_seq_print_text(struct seq_file *seq, void *offset)
+{
+	struct hard_iface *primary_if;
+	struct hlist_node *node;
+	struct hlist_head *head;
+	struct vis_info *info;
+	struct vis_packet *packet;
+	struct vis_info_entry *entries;
+	struct net_device *net_dev = (struct net_device *)seq->private;
+	struct bat_priv *bat_priv = netdev_priv(net_dev);
+	struct hashtable_t *hash = bat_priv->vis_hash;
+	HLIST_HEAD(vis_if_list);
+	struct if_list_entry *entry;
+	struct hlist_node *pos, *n;
+	int i, j, ret = 0;
+	int vis_server = atomic_read(&bat_priv->vis_mode);
+	size_t buff_pos, buf_size;
+	char *buff;
+	int compare;
+
+	primary_if = primary_if_get_selected(bat_priv);
+	if (!primary_if)
+		goto out;
+
+	if (vis_server == VIS_TYPE_CLIENT_UPDATE)
+		goto out;
+
+	/* one byte for the terminating '\0' */
+	buf_size = 1;
+	/* Estimate length */
+	spin_lock_bh(&bat_priv->vis_hash_lock);
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(info, node, head, hash_entry) {
+			packet = (struct vis_packet *)info->skb_packet->data;
+			/* the entries directly follow the vis packet header */
+			entries = (struct vis_info_entry *)
+				((char *)packet + sizeof(struct vis_packet));
+
+			/* collect the distinct source addresses of all
+			 * entries with a quality other than 0 */
+			for (j = 0; j < packet->entries; j++) {
+				if (entries[j].quality == 0)
+					continue;
+				compare =
+				 compare_eth(entries[j].src, packet->vis_orig);
+				vis_data_insert_interface(entries[j].src,
+							  &vis_if_list,
+							  compare);
+			}
+
+			hlist_for_each_entry(entry, pos, &vis_if_list, list) {
+				/* "%pM," takes 18 characters, every entry
+				 * record at most 26 */
+				buf_size += 18 + 26 * packet->entries;
+
+				/* add primary/secondary records */
+				if (compare_eth(entry->addr, packet->vis_orig))
+					buf_size +=
+					  vis_data_count_prim_sec(&vis_if_list);
+
+				/* the "\n" ending the line */
+				buf_size += 1;
+			}
+
+			/* the interface list is rebuilt for every packet */
+			hlist_for_each_entry_safe(entry, pos, n, &vis_if_list,
+						  list) {
+				hlist_del(&entry->list);
+				kfree(entry);
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	buff = kmalloc(buf_size, GFP_ATOMIC);
+	if (!buff) {
+		spin_unlock_bh(&bat_priv->vis_hash_lock);
+		ret = -ENOMEM;
+		goto out;
+	}
+	buff[0] = '\0';
+	buff_pos = 0;
+
+	/* second pass: same walk as above, now producing the text */
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(info, node, head, hash_entry) {
+			packet = (struct vis_packet *)info->skb_packet->data;
+			entries = (struct vis_info_entry *)
+				((char *)packet + sizeof(struct vis_packet));
+
+			for (j = 0; j < packet->entries; j++) {
+				if (entries[j].quality == 0)
+					continue;
+				compare =
+				 compare_eth(entries[j].src, packet->vis_orig);
+				vis_data_insert_interface(entries[j].src,
+							  &vis_if_list,
+							  compare);
+			}
+
+			hlist_for_each_entry(entry, pos, &vis_if_list, list) {
+				buff_pos += sprintf(buff + buff_pos, "%pM,",
+						entry->addr);
+
+				for (j = 0; j < packet->entries; j++)
+					buff_pos += vis_data_read_entry(
+							buff + buff_pos,
+							&entries[j],
+							entry->addr,
+							entry->primary);
+
+				/* add primary/secondary records */
+				if (compare_eth(entry->addr, packet->vis_orig))
+					buff_pos +=
+					 vis_data_read_prim_sec(buff + buff_pos,
+								&vis_if_list);
+
+				buff_pos += sprintf(buff + buff_pos, "\n");
+			}
+
+			hlist_for_each_entry_safe(entry, pos, n, &vis_if_list,
+						  list) {
+				hlist_del(&entry->list);
+				kfree(entry);
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+
+	seq_printf(seq, "%s", buff);
+	kfree(buff);
+
+out:
+	if (primary_if)
+		hardif_free_ref(primary_if);
+	return ret;
+}
+
+/* queue the info packet on bat_priv->vis_send_list and take a reference
+ * for the list; nothing happens if it is already linked in. */
+static void send_list_add(struct bat_priv *bat_priv, struct vis_info *info)
+{
+	if (!list_empty(&info->send_list))
+		return;
+
+	kref_get(&info->refcount);
+	list_add_tail(&info->send_list, &bat_priv->vis_send_list);
+}
+
+/* take the info packet off the send list and drop the reference held for
+ * the list; nothing happens if it was not linked in. */
+static void send_list_del(struct vis_info *info)
+{
+	if (list_empty(&info->send_list))
+		return;
+
+	list_del_init(&info->send_list);
+	kref_put(&info->refcount, free_info);
+}
+
+/* Append a copy of @mac to @recv_list. The node is allocated with GFP_ATOMIC;
+ * if the allocation fails the address is silently not recorded. */
+static void recv_list_add(struct bat_priv *bat_priv,
+			  struct list_head *recv_list, char *mac)
+{
+	struct recvlist_node *new_node;
+
+	new_node = kmalloc(sizeof(*new_node), GFP_ATOMIC);
+	if (new_node) {
+		memcpy(new_node->mac, mac, ETH_ALEN);
+
+		/* list manipulation happens under vis_list_lock */
+		spin_lock_bh(&bat_priv->vis_list_lock);
+		list_add_tail(&new_node->list, recv_list);
+		spin_unlock_bh(&bat_priv->vis_list_lock);
+	}
+}
+
+/* Look @mac up in @recv_list (under vis_list_lock).
+ * Returns 1 if an entry with this address exists, 0 otherwise. */
+static int recv_list_is_in(struct bat_priv *bat_priv,
+			   struct list_head *recv_list, char *mac)
+{
+	struct recvlist_node *cur;
+	int found = 0;
+
+	spin_lock_bh(&bat_priv->vis_list_lock);
+	list_for_each_entry(cur, recv_list, list) {
+		if (!compare_eth(cur->mac, mac))
+			continue;
+
+		found = 1;
+		break;
+	}
+	spin_unlock_bh(&bat_priv->vis_list_lock);
+
+	return found;
+}
+
+/* Try to store a received vis packet in the vis_hash.
+ *
+ * @bat_priv: mesh instance whose vis_hash is used
+ * @vis_packet: received packet; sizeof(struct vis_packet) + vis_info_len bytes
+ *	are copied from it
+ * @vis_info_len: number of bytes of entry data following the packet header
+ * @is_new: reset to 0 on entry, set to 1 once a fresh vis_info has been built
+ *	for this packet
+ * @make_broadcast: if non-zero, target_orig of the stored copy is overwritten
+ *	with broadcast_addr
+ *
+ * Returns the vis_info now representing this originator: the already stored
+ * entry if it carries the same seqno, otherwise the newly inserted one.
+ * Returns NULL if there is no vis_hash, if a newer packet is already stored,
+ * if an allocation fails or if the hash insert fails.
+ *
+ * vis hash must be locked outside.
+ *
+ * NOTE(review): *is_new stays 1 when hash_add() fails and NULL is returned;
+ * the callers visible in this file check the return value first.
+ * NOTE(review): the old entry is removed before the new one is allocated, so
+ * an allocation failure leaves no entry for this originator. */
+static struct vis_info *add_packet(struct bat_priv *bat_priv,
+				   struct vis_packet *vis_packet,
+				   int vis_info_len, int *is_new,
+				   int make_broadcast)
+{
+	struct vis_info *info, *old_info;
+	struct vis_packet *search_packet, *old_packet;
+	struct vis_info search_elem;
+	struct vis_packet *packet;
+	int hash_added;
+
+	*is_new = 0;
+	/* sanity check */
+	if (!bat_priv->vis_hash)
+		return NULL;
+
+	/* see if the packet is already in vis_hash; the temporary search
+	 * element only gets vis_orig filled in and is freed right after the
+	 * lookup */
+	search_elem.skb_packet = dev_alloc_skb(sizeof(struct vis_packet));
+	if (!search_elem.skb_packet)
+		return NULL;
+	search_packet = (struct vis_packet *)skb_put(search_elem.skb_packet,
+						     sizeof(struct vis_packet));
+
+	memcpy(search_packet->vis_orig, vis_packet->vis_orig, ETH_ALEN);
+	old_info = vis_hash_find(bat_priv, &search_elem);
+	kfree_skb(search_elem.skb_packet);
+
+	if (old_info) {
+		old_packet = (struct vis_packet *)old_info->skb_packet->data;
+		/* incoming packet is not newer than the stored one */
+		if (!seq_after(ntohl(vis_packet->seqno),
+			       ntohl(old_packet->seqno))) {
+			if (old_packet->seqno == vis_packet->seqno) {
+				/* same seqno: only remember the sender */
+				recv_list_add(bat_priv, &old_info->recv_list,
+					      vis_packet->sender_orig);
+				return old_info;
+			} else {
+				/* newer packet is already in hash. */
+				return NULL;
+			}
+		}
+		/* remove old entry */
+		hash_remove(bat_priv->vis_hash, vis_info_cmp, vis_info_choose,
+			    old_info);
+		send_list_del(old_info);
+		kref_put(&old_info->refcount, free_info);
+	}
+
+	info = kmalloc(sizeof(struct vis_info), GFP_ATOMIC);
+	if (!info)
+		return NULL;
+
+	info->skb_packet = dev_alloc_skb(sizeof(struct vis_packet) +
+					 vis_info_len + sizeof(struct ethhdr));
+	if (!info->skb_packet) {
+		kfree(info);
+		return NULL;
+	}
+	/* keep sizeof(struct ethhdr) bytes of headroom in front of the vis
+	 * packet */
+	skb_reserve(info->skb_packet, sizeof(struct ethhdr));
+	packet = (struct vis_packet *)skb_put(info->skb_packet,
+					      sizeof(struct vis_packet) +
+					      vis_info_len);
+
+	kref_init(&info->refcount);
+	INIT_LIST_HEAD(&info->send_list);
+	INIT_LIST_HEAD(&info->recv_list);
+	info->first_seen = jiffies;
+	info->bat_priv = bat_priv;
+	memcpy(packet, vis_packet, sizeof(struct vis_packet) + vis_info_len);
+
+	/* initialize and add new packet. */
+	*is_new = 1;
+
+	/* Make it a broadcast packet, if required */
+	if (make_broadcast)
+		memcpy(packet->target_orig, broadcast_addr, ETH_ALEN);
+
+	/* repair if entries is longer than packet. */
+	if (packet->entries * sizeof(struct vis_info_entry) > vis_info_len)
+		packet->entries = vis_info_len / sizeof(struct vis_info_entry);
+
+	recv_list_add(bat_priv, &info->recv_list, packet->sender_orig);
+
+	/* try to add it */
+	hash_added = hash_add(bat_priv->vis_hash, vis_info_cmp, vis_info_choose,
+			      info, &info->hash_entry);
+	if (hash_added < 0) {
+		/* did not work (for some reason) */
+		kref_put(&info->refcount, free_info);
+		info = NULL;
+	}
+
+	return info;
+}
+
+/* Store an incoming server sync packet in the vis hash (under
+ * vis_hash_lock). If this node runs in VIS_TYPE_SERVER_SYNC mode the stored
+ * copy is made a broadcast packet and, when it is new, queued for sending. */
+void receive_server_sync_packet(struct bat_priv *bat_priv,
+				struct vis_packet *vis_packet,
+				int vis_info_len)
+{
+	int mode = atomic_read(&bat_priv->vis_mode);
+	int we_are_server = (mode == VIS_TYPE_SERVER_SYNC);
+	struct vis_info *stored;
+	int is_new;
+
+	spin_lock_bh(&bat_priv->vis_hash_lock);
+
+	stored = add_packet(bat_priv, vis_packet, vis_info_len,
+			    &is_new, we_are_server);
+
+	/* forward only if we are a server ourselves and the packet is newer
+	 * than the one we had in the hash */
+	if (stored && we_are_server && is_new)
+		send_list_add(bat_priv, stored);
+
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+}
+
+/* Store an incoming client update packet in the vis hash and queue it for
+ * sending when needed: either because we are the addressed vis server and the
+ * packet is new (it is then upgraded to a server sync packet), or because the
+ * stored packet is not addressed to us and has to be forwarded.
+ * Client updates with a broadcast target are ignored. */
+void receive_client_update_packet(struct bat_priv *bat_priv,
+				  struct vis_packet *vis_packet,
+				  int vis_info_len)
+{
+	int mode = atomic_read(&bat_priv->vis_mode);
+	struct vis_packet *stored_packet;
+	struct vis_info *stored;
+	int are_target = 0;
+	int upgrade;
+	int is_new;
+
+	/* clients shall not broadcast. */
+	if (is_broadcast_ether_addr(vis_packet->target_orig))
+		return;
+
+	/* Are we the target for this VIS packet? */
+	if (mode == VIS_TYPE_SERVER_SYNC && is_my_mac(vis_packet->target_orig))
+		are_target = 1;
+
+	spin_lock_bh(&bat_priv->vis_hash_lock);
+
+	/* note that outdated packets are dropped here (NULL is returned) */
+	stored = add_packet(bat_priv, vis_packet, vis_info_len,
+			    &is_new, are_target);
+	if (stored) {
+		stored_packet = (struct vis_packet *)stored->skb_packet->data;
+
+		/* we are the target server and the data is fresh */
+		upgrade = are_target && is_new;
+		if (upgrade)
+			stored_packet->vis_type = VIS_TYPE_SERVER_SYNC;
+
+		/* send if upgraded, or if we are not the recipient and thus
+		 * need to forward */
+		if (upgrade || !is_my_mac(stored_packet->target_orig))
+			send_list_add(bat_priv, stored);
+	}
+
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+}
+
+/* Scan all originators for the vis server (VIS_SERVER flag set) whose current
+ * router has the highest tq_avg. Whenever a better candidate is found, its
+ * address is copied into target_orig of @info's packet.
+ *
+ * Each hash bucket is walked inside its own rcu_read_lock() section.
+ *
+ * Returns the best tq_avg seen, or -1 if no vis server with a router exists. */
+static int find_best_vis_server(struct bat_priv *bat_priv,
+				struct vis_info *info)
+{
+	struct vis_packet *packet =
+		(struct vis_packet *)info->skb_packet->data;
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct orig_node *orig_node;
+	struct neigh_node *router;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+	int best_tq = -1;
+	int idx;
+
+	for (idx = 0; idx < hash->size; idx++) {
+		bucket = &hash->table[idx];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, node, bucket, hash_entry) {
+			router = orig_node_get_router(orig_node);
+			if (router) {
+				if ((orig_node->flags & VIS_SERVER) &&
+				    (router->tq_avg > best_tq)) {
+					best_tq = router->tq_avg;
+					memcpy(packet->target_orig,
+					       orig_node->orig, ETH_ALEN);
+				}
+				neigh_node_free_ref(router);
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return best_tq;
+}
+
+/* Return true if @info's packet cannot take one more entry, i.e. if
+ * entries + 1 exceeds MAX_VIS_PACKET_SIZE / sizeof(struct vis_info_entry). */
+static bool vis_packet_full(struct vis_info *info)
+{
+	struct vis_packet *packet =
+		(struct vis_packet *)info->skb_packet->data;
+
+	return MAX_VIS_PACKET_SIZE / sizeof(struct vis_info_entry)
+		< packet->entries + 1;
+}
+
+/* Rebuild bat_priv->my_vis_info with this node's current vis data.
+ *
+ * The header is refreshed (first_seen, vis_type taken from vis_mode,
+ * broadcast target, TTL, seqno + 1, entries = 0) and the skb is trimmed back
+ * to the bare header. In VIS_TYPE_CLIENT_UPDATE mode the target is replaced
+ * by the best vis server found by find_best_vis_server(). Then one entry is
+ * appended per originator whose router address equals the originator address,
+ * whose incoming interface is IF_ACTIVE and whose tq_avg is at least 1,
+ * followed by one quality-0 entry per element of tt_local_hash. Filling stops
+ * as soon as vis_packet_full() reports the packet as full.
+ *
+ * Returns 0 on success, -1 if no packet could be generated (client update
+ * mode and no vis server found; the header has already been modified then). */
+static int generate_vis_packet(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct hlist_node *node;
+	struct hlist_head *head;
+	struct orig_node *orig_node;
+	struct neigh_node *router;
+	struct vis_info *info = (struct vis_info *)bat_priv->my_vis_info;
+	struct vis_packet *packet = (struct vis_packet *)info->skb_packet->data;
+	struct vis_info_entry *entry;
+	struct tt_local_entry *tt_local_entry;
+	int best_tq = -1, i;
+
+	info->first_seen = jiffies;
+	packet->vis_type = atomic_read(&bat_priv->vis_mode);
+
+	memcpy(packet->target_orig, broadcast_addr, ETH_ALEN);
+	packet->ttl = TTL;
+	packet->seqno = htonl(ntohl(packet->seqno) + 1);
+	packet->entries = 0;
+	/* drop all entries of the previous round, keep only the header */
+	skb_trim(info->skb_packet, sizeof(struct vis_packet));
+
+	if (packet->vis_type == VIS_TYPE_CLIENT_UPDATE) {
+		/* overwrites target_orig with the best server's address */
+		best_tq = find_best_vis_server(bat_priv, info);
+
+		if (best_tq < 0)
+			return -1;
+	}
+
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+			router = orig_node_get_router(orig_node);
+			if (!router)
+				continue;
+
+			/* only originators whose router address is their own
+			 * address are listed */
+			if (!compare_eth(router->addr, orig_node->orig))
+				goto next;
+
+			if (router->if_incoming->if_status != IF_ACTIVE)
+				goto next;
+
+			if (router->tq_avg < 1)
+				goto next;
+
+			/* fill one entry into buffer. */
+			entry = (struct vis_info_entry *)
+				      skb_put(info->skb_packet, sizeof(*entry));
+			memcpy(entry->src,
+			       router->if_incoming->net_dev->dev_addr,
+			       ETH_ALEN);
+			memcpy(entry->dest, orig_node->orig, ETH_ALEN);
+			entry->quality = router->tq_avg;
+			packet->entries++;
+
+next:
+			neigh_node_free_ref(router);
+
+			/* leaves through the unlock label, which drops the
+			 * RCU read lock still held here */
+			if (vis_packet_full(info))
+				goto unlock;
+		}
+		rcu_read_unlock();
+	}
+
+	hash = bat_priv->tt_local_hash;
+
+	spin_lock_bh(&bat_priv->tt_lhash_lock);
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		hlist_for_each_entry(tt_local_entry, node, head, hash_entry) {
+			entry = (struct vis_info_entry *)
+					skb_put(info->skb_packet,
+						sizeof(*entry));
+			memset(entry->src, 0, ETH_ALEN);
+			memcpy(entry->dest, tt_local_entry->addr, ETH_ALEN);
+			entry->quality = 0; /* 0 means TT */
+			packet->entries++;
+
+			if (vis_packet_full(info)) {
+				spin_unlock_bh(&bat_priv->tt_lhash_lock);
+				return 0;
+			}
+		}
+	}
+
+	spin_unlock_bh(&bat_priv->tt_lhash_lock);
+	return 0;
+
+unlock:
+	/* only reached from inside the originator loop */
+	rcu_read_unlock();
+	return 0;
+}
+
+/* Drop every vis packet whose first_seen is more than VIS_TIMEOUT seconds in
+ * the past: unlink it from the hash, take it off the send list and put its
+ * reference. Our own vis info is never purged.
+ * Must be called with the vis_hash_lock held. */
+static void purge_vis_packets(struct bat_priv *bat_priv)
+{
+	struct hashtable_t *hash = bat_priv->vis_hash;
+	struct hlist_node *node, *next_node;
+	struct hlist_head *bucket;
+	struct vis_info *info;
+	int idx;
+
+	for (idx = 0; idx < hash->size; idx++) {
+		bucket = &hash->table[idx];
+
+		hlist_for_each_entry_safe(info, node, next_node,
+					  bucket, hash_entry) {
+			/* never purge own data. */
+			if (info == bat_priv->my_vis_info)
+				continue;
+
+			/* not timed out yet */
+			if (!time_after(jiffies,
+					info->first_seen + VIS_TIMEOUT * HZ))
+				continue;
+
+			hlist_del(node);
+			send_list_del(info);
+			kref_put(&info->refcount, free_info);
+		}
+	}
+}
+
+/* Send a clone of @info's packet to every originator that is flagged
+ * VIS_SERVER, has a router and is not listed in @info's recv_list.
+ * target_orig of the packet is rewritten for each recipient before the skb is
+ * cloned; a failed clone is silently skipped.
+ *
+ * NOTE(review): the clones are presumably sharing the packet data with
+ * info->skb_packet, so a later target_orig rewrite may be visible in clones
+ * that are still queued - confirm against skb_clone() semantics.
+ * NOTE(review): hard_iface is used after the router reference was dropped;
+ * this happens inside the rcu_read_lock() section. */
+static void broadcast_vis_packet(struct bat_priv *bat_priv,
+				 struct vis_info *info)
+{
+	struct neigh_node *router;
+	struct hashtable_t *hash = bat_priv->orig_hash;
+	struct hlist_node *node;
+	struct hlist_head *head;
+	struct orig_node *orig_node;
+	struct vis_packet *packet;
+	struct sk_buff *skb;
+	struct hard_iface *hard_iface;
+	uint8_t dstaddr[ETH_ALEN];
+	int i;
+
+
+	packet = (struct vis_packet *)info->skb_packet->data;
+
+	/* send to all routers in range. */
+	for (i = 0; i < hash->size; i++) {
+		head = &hash->table[i];
+
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(orig_node, node, head, hash_entry) {
+			/* if it's a vis server and reachable, send it. */
+			if (!(orig_node->flags & VIS_SERVER))
+				continue;
+
+			router = orig_node_get_router(orig_node);
+			if (!router)
+				continue;
+
+			/* don't send it if we already received the packet from
+			 * this node. */
+			if (recv_list_is_in(bat_priv, &info->recv_list,
+					    orig_node->orig)) {
+				neigh_node_free_ref(router);
+				continue;
+			}
+
+			/* copy what is needed from the router before its
+			 * reference is released */
+			memcpy(packet->target_orig, orig_node->orig, ETH_ALEN);
+			hard_iface = router->if_incoming;
+			memcpy(dstaddr, router->addr, ETH_ALEN);
+
+			neigh_node_free_ref(router);
+
+			skb = skb_clone(info->skb_packet, GFP_ATOMIC);
+			if (skb)
+				send_skb_packet(skb, hard_iface, dstaddr);
+
+		}
+		rcu_read_unlock();
+	}
+}
+
+/* Send a clone of @info's packet towards the originator named in its
+ * target_orig, via that originator's current router. Nothing is sent if the
+ * originator is unknown, has no router, or the skb cannot be cloned. */
+static void unicast_vis_packet(struct bat_priv *bat_priv,
+			       struct vis_info *info)
+{
+	struct vis_packet *packet =
+		(struct vis_packet *)info->skb_packet->data;
+	struct orig_node *target;
+	struct neigh_node *router;
+	struct sk_buff *clone;
+
+	target = orig_hash_find(bat_priv, packet->target_orig);
+	if (!target)
+		return;
+
+	router = orig_node_get_router(target);
+	if (router) {
+		clone = skb_clone(info->skb_packet, GFP_ATOMIC);
+		if (clone)
+			send_skb_packet(clone, router->if_incoming,
+					router->addr);
+
+		neigh_node_free_ref(router);
+	}
+
+	orig_node_free_ref(target);
+}
+
+/* Send exactly one vis packet; called from send_vis_packets().
+ * The selected primary interface's address is written into sender_orig, the
+ * TTL is decremented for the transmission and restored afterwards. Packets
+ * with a TTL below 2 are not sent. */
+static void send_vis_packet(struct bat_priv *bat_priv, struct vis_info *info)
+{
+	struct hard_iface *primary_if = primary_if_get_selected(bat_priv);
+	struct vis_packet *packet;
+
+	if (!primary_if)
+		return;
+
+	packet = (struct vis_packet *)info->skb_packet->data;
+	if (packet->ttl >= 2) {
+		memcpy(packet->sender_orig, primary_if->net_dev->dev_addr,
+		       ETH_ALEN);
+		packet->ttl--;
+
+		if (is_broadcast_ether_addr(packet->target_orig))
+			broadcast_vis_packet(bat_priv, info);
+		else
+			unicast_vis_packet(bat_priv, info);
+
+		packet->ttl++; /* restore TTL */
+	} else {
+		pr_debug("Error - can't send vis packet: ttl exceeded\n");
+	}
+
+	hardif_free_ref(primary_if);
+}
+
+/* Delayed-work handler (see start_vis_timer()): purge outdated vis packets,
+ * regenerate our own vis packet, transmit everything queued on
+ * vis_send_list and finally re-arm the work via start_vis_timer(). */
+static void send_vis_packets(struct work_struct *work)
+{
+	struct delayed_work *delayed_work =
+		container_of(work, struct delayed_work, work);
+	struct bat_priv *bat_priv =
+		container_of(delayed_work, struct bat_priv, vis_work);
+	struct vis_info *info;
+
+	spin_lock_bh(&bat_priv->vis_hash_lock);
+	purge_vis_packets(bat_priv);
+
+	if (generate_vis_packet(bat_priv) == 0) {
+		/* schedule if generation was successful */
+		send_list_add(bat_priv, bat_priv->my_vis_info);
+	}
+
+	while (!list_empty(&bat_priv->vis_send_list)) {
+		info = list_first_entry(&bat_priv->vis_send_list,
+					typeof(*info), send_list);
+
+		/* take an extra reference for the time vis_hash_lock is
+		 * dropped during the actual transmission */
+		kref_get(&info->refcount);
+		spin_unlock_bh(&bat_priv->vis_hash_lock);
+
+		send_vis_packet(bat_priv, info);
+
+		spin_lock_bh(&bat_priv->vis_hash_lock);
+		/* dequeue (does nothing if no longer linked) and give the
+		 * extra reference back */
+		send_list_del(info);
+		kref_put(&info->refcount, free_info);
+	}
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+	start_vis_timer(bat_priv);
+}
+
+/* init the vis server. this may only be called when if_list is already
+ * initialized (e.g. bat0 is initialized, interfaces have been added)
+ *
+ * Creates the vis hash, allocates and prefills our own vis info, inserts it
+ * into the hash and starts the periodic vis work.
+ *
+ * Returns 1 on success or if a vis_hash already exists, 0 on failure (in
+ * which case vis_quit() has been called to clean up). */
+int vis_init(struct bat_priv *bat_priv)
+{
+	struct vis_packet *packet;
+	int hash_added;
+
+	/* already initialized */
+	if (bat_priv->vis_hash)
+		return 1;
+
+	spin_lock_bh(&bat_priv->vis_hash_lock);
+
+	bat_priv->vis_hash = hash_new(256);
+	if (!bat_priv->vis_hash) {
+		pr_err("Can't initialize vis_hash\n");
+		goto err;
+	}
+
+	/* NOTE(review): the vis_info object is allocated with
+	 * MAX_VIS_PACKET_SIZE bytes rather than sizeof(struct vis_info);
+	 * the packet data itself lives in skb_packet below - confirm whether
+	 * the larger size is still needed */
+	bat_priv->my_vis_info = kmalloc(MAX_VIS_PACKET_SIZE, GFP_ATOMIC);
+	if (!bat_priv->my_vis_info) {
+		pr_err("Can't initialize vis packet\n");
+		goto err;
+	}
+
+	bat_priv->my_vis_info->skb_packet = dev_alloc_skb(
+						sizeof(struct vis_packet) +
+						MAX_VIS_PACKET_SIZE +
+						sizeof(struct ethhdr));
+	if (!bat_priv->my_vis_info->skb_packet)
+		goto free_info;
+
+	/* keep sizeof(struct ethhdr) bytes of headroom; only the packet
+	 * header is put for now */
+	skb_reserve(bat_priv->my_vis_info->skb_packet, sizeof(struct ethhdr));
+	packet = (struct vis_packet *)skb_put(
+					bat_priv->my_vis_info->skb_packet,
+					sizeof(struct vis_packet));
+
+	/* prefill the vis info */
+	bat_priv->my_vis_info->first_seen = jiffies -
+						msecs_to_jiffies(VIS_INTERVAL);
+	INIT_LIST_HEAD(&bat_priv->my_vis_info->recv_list);
+	INIT_LIST_HEAD(&bat_priv->my_vis_info->send_list);
+	kref_init(&bat_priv->my_vis_info->refcount);
+	bat_priv->my_vis_info->bat_priv = bat_priv;
+	packet->version = COMPAT_VERSION;
+	packet->packet_type = BAT_VIS;
+	packet->ttl = TTL;
+	packet->seqno = 0;
+	packet->entries = 0;
+
+	INIT_LIST_HEAD(&bat_priv->vis_send_list);
+
+	hash_added = hash_add(bat_priv->vis_hash, vis_info_cmp, vis_info_choose,
+			      bat_priv->my_vis_info,
+			      &bat_priv->my_vis_info->hash_entry);
+	if (hash_added < 0) {
+		pr_err("Can't add own vis packet into hash\n");
+		/* not in hash, need to remove it manually. */
+		kref_put(&bat_priv->my_vis_info->refcount, free_info);
+		goto err;
+	}
+
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+	start_vis_timer(bat_priv);
+	return 1;
+
+free_info:
+	/* the refcount was not initialized yet, so free directly */
+	kfree(bat_priv->my_vis_info);
+	bat_priv->my_vis_info = NULL;
+err:
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+	/* vis_quit() takes vis_hash_lock itself, hence the unlock above */
+	vis_quit(bat_priv);
+	return 0;
+}
+
+/* Decrease the reference count on a hash item info.
+ * Passed to hash_delete() by vis_quit(): @node is the hash_entry member of a
+ * vis_info, which is taken off the send list before its reference is put.
+ * @arg is not used. */
+static void free_info_ref(struct hlist_node *node, void *arg)
+{
+	struct vis_info *info;
+
+	info = container_of(node, struct vis_info, hash_entry);
+	send_list_del(info);
+	kref_put(&info->refcount, free_info);
+}
+
+/* shutdown vis-server: cancel the periodic vis work, release every entry of
+ * the vis hash through free_info_ref() and clear the vis_hash and
+ * my_vis_info pointers. Does nothing if no vis_hash exists. */
+void vis_quit(struct bat_priv *bat_priv)
+{
+	if (!bat_priv->vis_hash)
+		return;
+
+	/* done before taking vis_hash_lock: the work handler takes that lock
+	 * itself */
+	cancel_delayed_work_sync(&bat_priv->vis_work);
+
+	spin_lock_bh(&bat_priv->vis_hash_lock);
+	/* properly remove, kill timers ... */
+	hash_delete(bat_priv->vis_hash, free_info_ref, NULL);
+	bat_priv->vis_hash = NULL;
+	bat_priv->my_vis_info = NULL;
+	spin_unlock_bh(&bat_priv->vis_hash_lock);
+}
+
+/* schedule packets for (re)transmission: (re)initialize vis_work with
+ * send_vis_packets() as handler and queue it on bat_event_workqueue to run
+ * after VIS_INTERVAL milliseconds. */
+static void start_vis_timer(struct bat_priv *bat_priv)
+{
+	INIT_DELAYED_WORK(&bat_priv->vis_work, send_vis_packets);
+	queue_delayed_work(bat_event_workqueue, &bat_priv->vis_work,
+			   msecs_to_jiffies(VIS_INTERVAL));
+}
diff --git a/net/batman-adv/vis.h b/net/batman-adv/vis.h
new file mode 100644
index 00000000..31b820d0
--- /dev/null
+++ b/net/batman-adv/vis.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2008-2011 B.A.T.M.A.N. contributors:
+ *
+ * Simon Wunderlich, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ *
+ */
+
+#ifndef _NET_BATMAN_ADV_VIS_H_
+#define _NET_BATMAN_ADV_VIS_H_
+
+#define VIS_TIMEOUT		200	/* timeout of vis packets in seconds */
+
+/* print the collected vis data into @seq */
+int vis_seq_print_text(struct seq_file *seq, void *offset);
+/* handle a server sync packet; @vis_info_len is the number of bytes of entry
+ * data following the vis packet header */
+void receive_server_sync_packet(struct bat_priv *bat_priv,
+				struct vis_packet *vis_packet,
+				int vis_info_len);
+/* handle a client update packet; @vis_info_len is the number of bytes of
+ * entry data following the vis packet header */
+void receive_client_update_packet(struct bat_priv *bat_priv,
+				  struct vis_packet *vis_packet,
+				  int vis_info_len);
+/* returns 1 on success (or if already initialized), 0 on failure */
+int vis_init(struct bat_priv *bat_priv);
+/* shut the vis server down; does nothing if it was not initialized */
+void vis_quit(struct bat_priv *bat_priv);
+
+#endif /* _NET_BATMAN_ADV_VIS_H_ */
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
new file mode 100644
index 00000000..bfb3dc03
--- /dev/null
+++ b/net/bluetooth/Kconfig
@@ -0,0 +1,70 @@
+#
+# Bluetooth subsystem configuration
+#
+
+menuconfig BT
+	tristate "Bluetooth subsystem support"
+	depends on NET && !S390
+	depends on RFKILL || !RFKILL
+	select CRYPTO
+	help
+	  Bluetooth is a low-cost, low-power, short-range wireless technology.
+	  It was designed as a replacement for cables and other short-range
+	  technologies like IrDA.  Bluetooth operates in the personal area
+	  range, which typically extends up to 10 meters.  More information
+	  about Bluetooth can be found at <http://www.bluetooth.com/>.
+
+	  The Linux Bluetooth subsystem consists of several layers:
+	     Bluetooth Core (HCI device and connection manager, scheduler)
+	     HCI Device drivers (Interface to the hardware)
+	     SCO Module (SCO audio links)
+	     L2CAP Module (Logical Link Control and Adaptation Protocol)
+	     RFCOMM Module (RFCOMM Protocol)  
+	     BNEP Module (Bluetooth Network Encapsulation Protocol)
+	     CMTP Module (CAPI Message Transport Protocol)
+	     HIDP Module (Human Interface Device Protocol)
+	     SMP Module (Security Manager Protocol)
+
+	  Say Y here to compile Bluetooth support into the kernel or say M to
+	  compile it as a module (bluetooth).
+
+	  To use the Linux Bluetooth subsystem, you will need several
+	  user-space utilities like hciconfig and bluetoothd.  These utilities
+	  and updates to the Bluetooth kernel modules are provided in the BlueZ
+	  packages.  For more information, see <http://www.bluez.org/>.
+
+if BT != n
+
+config BT_L2CAP
+	bool "L2CAP protocol support"
+	select CRC16
+	select CRYPTO
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ECB
+	help
+	  L2CAP (Logical Link Control and Adaptation Protocol) provides
+	  connection oriented and connection-less data transport.  L2CAP
+	  support is required for most Bluetooth applications.
+
+	  Also included is support for SMP (Security Manager Protocol) which
+	  is the security layer on top of LE (Low Energy) links.
+
+config BT_SCO
+	bool "SCO links support"
+	help
+	  SCO link provides voice transport over Bluetooth.  SCO support is
+	  required for voice applications like Headset and Audio.
+
+endif
+
+source "net/bluetooth/rfcomm/Kconfig"
+
+source "net/bluetooth/bnep/Kconfig"
+
+source "net/bluetooth/cmtp/Kconfig"
+
+source "net/bluetooth/hidp/Kconfig"
+
+source "drivers/bluetooth/Kconfig"
+
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
new file mode 100644
index 00000000..9b67f3d0
--- /dev/null
+++ b/net/bluetooth/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the Linux Bluetooth subsystem.
+#
+
+obj-$(CONFIG_BT)	+= bluetooth.o
+obj-$(CONFIG_BT_RFCOMM)	+= rfcomm/
+obj-$(CONFIG_BT_BNEP)	+= bnep/
+obj-$(CONFIG_BT_CMTP)	+= cmtp/
+obj-$(CONFIG_BT_HIDP)	+= hidp/
+
+bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o hci_sock.o hci_sysfs.o lib.o
+bluetooth-$(CONFIG_BT_L2CAP)	+= l2cap_core.o l2cap_sock.o smp.o
+bluetooth-$(CONFIG_BT_SCO)	+= sco.o
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
new file mode 100644
index 00000000..7c73a10d
--- /dev/null
+++ b/net/bluetooth/af_bluetooth.c
@@ -0,0 +1,633 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth address family and sockets. */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+#include <asm/ioctls.h>
+#include <linux/kmod.h>
+
+#include <net/bluetooth/bluetooth.h>
+
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+#endif
+
+#ifndef CONFIG_BT_SOCK_DEBUG
+#undef  BT_DBG
+#define BT_DBG(D...)
+#endif
+
+#define VERSION "2.16"
+
+/* Bluetooth sockets */
+#define BT_MAX_PROTO	8
+/* registered protocol families, indexed by protocol number; modified under
+ * bt_proto_lock by bt_sock_register() / bt_sock_unregister() */
+static const struct net_proto_family *bt_proto[BT_MAX_PROTO];
+static DEFINE_RWLOCK(bt_proto_lock);
+
+/* lock class keys and names, one per protocol number, handed to
+ * sock_lock_init_class_and_name() by bt_sock_reclassify_lock() */
+static struct lock_class_key bt_lock_key[BT_MAX_PROTO];
+static const char *const bt_key_strings[BT_MAX_PROTO] = {
+	"sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_HCI",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_SCO",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_BNEP",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_CMTP",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_HIDP",
+	"sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP",
+};
+
+/* same as above for the second key/name pair passed to
+ * sock_lock_init_class_and_name() */
+static struct lock_class_key bt_slock_key[BT_MAX_PROTO];
+static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
+	"slock-AF_BLUETOOTH-BTPROTO_L2CAP",
+	"slock-AF_BLUETOOTH-BTPROTO_HCI",
+	"slock-AF_BLUETOOTH-BTPROTO_SCO",
+	"slock-AF_BLUETOOTH-BTPROTO_RFCOMM",
+	"slock-AF_BLUETOOTH-BTPROTO_BNEP",
+	"slock-AF_BLUETOOTH-BTPROTO_CMTP",
+	"slock-AF_BLUETOOTH-BTPROTO_HIDP",
+	"slock-AF_BLUETOOTH-BTPROTO_AVDTP",
+};
+
+/* Give the socket's locks the lock class and name belonging to @proto.
+ * Does nothing if @sock has no sk attached; the sock must not be owned by a
+ * user context at this point (BUG otherwise). */
+static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk) {
+		BUG_ON(sock_owned_by_user(sk));
+
+		sock_lock_init_class_and_name(sk,
+				bt_slock_key_strings[proto],
+				&bt_slock_key[proto],
+				bt_key_strings[proto],
+				&bt_lock_key[proto]);
+	}
+}
+
+/* Register @ops as the socket family handler for Bluetooth protocol @proto.
+ * Returns 0 on success, -EINVAL if @proto is out of range and -EEXIST if a
+ * handler is already registered for it. */
+int bt_sock_register(int proto, const struct net_proto_family *ops)
+{
+	int err;
+
+	if (proto < 0 || proto >= BT_MAX_PROTO)
+		return -EINVAL;
+
+	write_lock(&bt_proto_lock);
+
+	if (!bt_proto[proto]) {
+		bt_proto[proto] = ops;
+		err = 0;
+	} else {
+		err = -EEXIST;
+	}
+
+	write_unlock(&bt_proto_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(bt_sock_register);
+
+/* Remove the socket family handler registered for Bluetooth protocol @proto.
+ * Returns 0 on success, -EINVAL if @proto is out of range and -ENOENT if no
+ * handler is registered for it. */
+int bt_sock_unregister(int proto)
+{
+	int err;
+
+	if (proto < 0 || proto >= BT_MAX_PROTO)
+		return -EINVAL;
+
+	write_lock(&bt_proto_lock);
+
+	if (bt_proto[proto]) {
+		bt_proto[proto] = NULL;
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+
+	write_unlock(&bt_proto_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(bt_sock_unregister);
+
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+/* Non-zero if the current task has effective uid 0 or is in the
+ * AID_NET_BT_ADMIN group. */
+static inline int current_has_bt_admin(void)
+{
+	return !current_euid() || in_egroup_p(AID_NET_BT_ADMIN);
+}
+
+/* Non-zero if the current task passes current_has_bt_admin() or is in the
+ * AID_NET_BT group. */
+static inline int current_has_bt(void)
+{
+	return current_has_bt_admin() || in_egroup_p(AID_NET_BT);
+}
+#else
+/* Without CONFIG_ANDROID_PARANOID_NETWORK both checks always succeed. */
+static inline int current_has_bt_admin(void)
+{
+	return 1;
+}
+
+static inline int current_has_bt(void)
+{
+	return 1;
+}
+#endif
+
+/* Create a Bluetooth socket of protocol @proto by delegating to the handler
+ * registered with bt_sock_register().
+ *
+ * RFCOMM, SCO and L2CAP sockets require current_has_bt(), every other
+ * protocol requires current_has_bt_admin(). If no handler is registered yet,
+ * loading of module "bt-proto-<proto>" is requested first.
+ *
+ * Returns the handler's create() result, or -EPERM (permission check
+ * failed), -EAFNOSUPPORT (@net is not init_net), -EINVAL (@proto out of
+ * range) or -EPROTONOSUPPORT (no usable handler). */
+static int bt_sock_create(struct net *net, struct socket *sock, int proto,
+			  int kern)
+{
+	int err;
+
+	if (proto == BTPROTO_RFCOMM || proto == BTPROTO_SCO ||
+			proto == BTPROTO_L2CAP) {
+		if (!current_has_bt())
+			return -EPERM;
+	} else if (!current_has_bt_admin())
+		return -EPERM;
+
+	if (net != &init_net)
+		return -EAFNOSUPPORT;
+
+	if (proto < 0 || proto >= BT_MAX_PROTO)
+		return -EINVAL;
+
+	if (!bt_proto[proto])
+		request_module("bt-proto-%d", proto);
+
+	err = -EPROTONOSUPPORT;
+
+	read_lock(&bt_proto_lock);
+
+	if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) {
+		err = bt_proto[proto]->create(net, sock, proto, kern);
+		/* only touch the sock's locks if create() reported success;
+		 * after a failure the state of sock->sk is not ours to use */
+		if (!err)
+			bt_sock_reclassify_lock(sock, proto);
+		module_put(bt_proto[proto]->owner);
+	}
+
+	read_unlock(&bt_proto_lock);
+
+	return err;
+}
+
+/* Add @sk to the socket list @l, holding the list's write lock with bottom
+ * halves disabled. */
+void bt_sock_link(struct bt_sock_list *l, struct sock *sk)
+{
+	write_lock_bh(&l->lock);
+	sk_add_node(sk, &l->head);
+	write_unlock_bh(&l->lock);
+}
+EXPORT_SYMBOL(bt_sock_link);
+
+/* Remove @sk from its socket list, holding the write lock of @l with bottom
+ * halves disabled. */
+void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk)
+{
+	write_lock_bh(&l->lock);
+	sk_del_node_init(sk);
+	write_unlock_bh(&l->lock);
+}
+EXPORT_SYMBOL(bt_sock_unlink);
+
+/* Append @sk to @parent's accept queue: a reference on @sk is taken, @parent
+ * is recorded as its parent and the parent's sk_ack_backlog is incremented.
+ * Undone by bt_accept_unlink(). */
+void bt_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+	BT_DBG("parent %p, sk %p", parent, sk);
+
+	sock_hold(sk);
+	list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
+	bt_sk(sk)->parent = parent;
+	parent->sk_ack_backlog++;
+}
+EXPORT_SYMBOL(bt_accept_enqueue);
+
+/* Take @sk off its parent's accept queue: the parent's sk_ack_backlog is
+ * decremented, the parent pointer is cleared and the reference taken by
+ * bt_accept_enqueue() is put. */
+void bt_accept_unlink(struct sock *sk)
+{
+	BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+	list_del_init(&bt_sk(sk)->accept_q);
+	bt_sk(sk)->parent->sk_ack_backlog--;
+	bt_sk(sk)->parent = NULL;
+	/* last: this may drop the final reference on sk */
+	sock_put(sk);
+}
+EXPORT_SYMBOL(bt_accept_unlink);
+
+/* Pick the first usable child socket from @parent's accept queue.
+ *
+ * Children in state BT_CLOSED are unlinked and skipped. A child is returned
+ * if it is BT_CONNECTED, if @newsock is NULL, or if the parent has
+ * defer_setup set; it is unlinked from the queue and, when @newsock is given,
+ * grafted onto it. The queue is walked with bottom halves disabled and each
+ * child is inspected under bh_lock_sock().
+ *
+ * Returns the dequeued sock, or NULL if no child qualifies. */
+struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
+{
+	struct list_head *p, *n;
+	struct sock *sk;
+
+	BT_DBG("parent %p", parent);
+
+	local_bh_disable();
+	list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
+		sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
+
+		bh_lock_sock(sk);
+
+		/* FIXME: Is this check still needed */
+		if (sk->sk_state == BT_CLOSED) {
+			/* unlock first: bt_accept_unlink() puts a reference
+			 * on sk */
+			bh_unlock_sock(sk);
+			bt_accept_unlink(sk);
+			continue;
+		}
+
+		if (sk->sk_state == BT_CONNECTED || !newsock ||
+						bt_sk(parent)->defer_setup) {
+			bt_accept_unlink(sk);
+			if (newsock)
+				sock_graft(sk, newsock);
+
+			bh_unlock_sock(sk);
+			local_bh_enable();
+			return sk;
+		}
+
+		bh_unlock_sock(sk);
+	}
+	local_bh_enable();
+
+	return NULL;
+}
+EXPORT_SYMBOL(bt_accept_dequeue);
+
+/* Receive one datagram from @sock into @msg.
+ *
+ * At most @len bytes are copied; if the datagram is longer, MSG_TRUNC is set
+ * in msg->msg_flags and the rest is discarded with the skb. msg_namelen is
+ * always reported as 0 (no source address is returned).
+ *
+ * Returns the number of bytes copied, 0 if no datagram is available and the
+ * socket is shut down for receiving, -EOPNOTSUPP for MSG_OOB, or the error
+ * from skb_recv_datagram() / skb_copy_datagram_iovec(). */
+int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *msg, size_t len, int flags)
+{
+	int noblock = flags & MSG_DONTWAIT;
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	size_t copied;
+	int err;
+
+	BT_DBG("sock %p sk %p len %zu", sock, sk, len);
+
+	if (flags & (MSG_OOB))
+		return -EOPNOTSUPP;
+
+	/* Set before any path that can return success: the RCV_SHUTDOWN case
+	 * below returns 0, and the caller must not see a stale msg_namelen
+	 * then (cf. CVE-2013-3224). */
+	msg->msg_namelen = 0;
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb) {
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			return 0;
+		return err;
+	}
+
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	skb_reset_transport_header(skb);
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err == 0)
+		sock_recv_ts_and_drops(msg, sk, skb);
+
+	skb_free_datagram(sk, skb);
+
+	return err ? : copied;
+}
+EXPORT_SYMBOL(bt_sock_recvmsg);
+
+/* Sleep interruptibly until the receive queue of @sk is non-empty, an error
+ * or receive shutdown is flagged, a signal is pending or @timeo runs out.
+ * The socket lock is released while sleeping and re-taken afterwards, so the
+ * caller is expected to hold it (see bt_sock_stream_recvmsg()).
+ *
+ * Returns the remaining timeout. */
+static long bt_sock_data_wait(struct sock *sk, long timeo)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	for (;;) {
+		/* set the task state before testing the conditions so that a
+		 * wake-up arriving in between is not lost */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (!skb_queue_empty(&sk->sk_receive_queue))
+			break;
+
+		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
+			break;
+
+		if (signal_pending(current) || !timeo)
+			break;
+
+		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	}
+
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return timeo;
+}
+
+/* Stream-style receive: copy up to @size bytes from the receive queue of
+ * @sock into @msg, consuming skbs partially where needed.
+ *
+ * The loop ends after the first skb that is only partially consumed, after
+ * the first skb when MSG_PEEK is set, when @size bytes were copied, or when
+ * the queue is empty and either the low-water mark is reached, an error or
+ * receive shutdown is pending, the timeout is zero or a signal arrived.
+ * msg_namelen is always reported as 0.
+ *
+ * Returns the number of bytes copied if any, otherwise the pending error
+ * (-EOPNOTSUPP for MSG_OOB, -EFAULT if the copy to user space failed,
+ * -EAGAIN on a zero timeout, the socket error, the signal errno) or 0. */
+int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	size_t target, copied = 0;
+	long timeo;
+
+	if (flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	msg->msg_namelen = 0;
+
+	BT_DBG("sk %p size %zu", sk, size);
+
+	lock_sock(sk);
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+	timeo  = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	do {
+		struct sk_buff *skb;
+		int chunk;
+
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (!skb) {
+			if (copied >= target)
+				break;
+
+			err = sock_error(sk);
+			if (err)
+				break;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			err = -EAGAIN;
+			if (!timeo)
+				break;
+
+			timeo = bt_sock_data_wait(sk, timeo);
+
+			if (signal_pending(current)) {
+				err = sock_intr_errno(timeo);
+				goto out;
+			}
+			continue;
+		}
+
+		chunk = min_t(unsigned int, skb->len, size);
+		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+			/* nothing of this skb was consumed: put it back */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			/* report the fault through err rather than storing a
+			 * negative errno in the unsigned byte counter; it is
+			 * only returned when nothing was copied before */
+			if (!copied)
+				err = -EFAULT;
+			break;
+		}
+		copied += chunk;
+		size   -= chunk;
+
+		sock_recv_ts_and_drops(msg, sk, skb);
+
+		if (!(flags & MSG_PEEK)) {
+			skb_pull(skb, chunk);
+			if (skb->len) {
+				/* partially consumed: keep the remainder at
+				 * the head of the queue */
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				break;
+			}
+			kfree_skb(skb);
+
+		} else {
+			/* put message back and return */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			break;
+		}
+	} while (size);
+
+out:
+	release_sock(sk);
+	return copied ? : err;
+}
+EXPORT_SYMBOL(bt_sock_stream_recvmsg);
+
+static inline unsigned int bt_accept_poll(struct sock *parent)
+{
+	struct list_head *pos, *next;
+	/* Readable once a queued child is connected, or is in CONNECT2 with deferred setup on. */
+	list_for_each_safe(pos, next, &bt_sk(parent)->accept_q) {
+		struct sock *child;
+
+		child = (struct sock *) list_entry(pos, struct bt_sock, accept_q);
+		if (child->sk_state == BT_CONNECTED)
+			return POLLIN | POLLRDNORM;
+		if (bt_sk(parent)->defer_setup && child->sk_state == BT_CONNECT2)
+			return POLLIN | POLLRDNORM;
+	}
+	return 0;
+}
+
+unsigned int bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int mask = 0;
+	/* Build the poll event mask for a Bluetooth socket. */
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	poll_wait(file, sk_sleep(sk), wait);
+	/* Listening sockets report only what bt_accept_poll() finds on the accept queue. */
+	if (sk->sk_state == BT_LISTEN)
+		return bt_accept_poll(sk);
+
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		mask |= POLLHUP;
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sk->sk_state == BT_CLOSED)
+		mask |= POLLHUP;
+	/* In these three setup states the mask is returned without any write bits. */
+	if (sk->sk_state == BT_CONNECT ||
+			sk->sk_state == BT_CONNECT2 ||
+			sk->sk_state == BT_CONFIG)
+		return mask;
+
+	if (sock_writeable(sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+	else
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	return mask;
+}
+EXPORT_SYMBOL(bt_sock_poll);
+
+int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	long amount;
+	int err;
+	/* Shared socket ioctls; any command not listed below yields -ENOIOCTLCMD. */
+	BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg);
+
+	switch (cmd) {
+	case TIOCOUTQ:	/* sk_sndbuf minus allocated write memory, clamped at 0 */
+		if (sk->sk_state == BT_LISTEN)
+			return -EINVAL;
+
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (amount < 0)
+			amount = 0;
+		err = put_user(amount, (int __user *) arg);	/* stored as an int at arg */
+		break;
+
+	case TIOCINQ:	/* length of the first queued skb only, not of the whole queue */
+		if (sk->sk_state == BT_LISTEN)
+			return -EINVAL;
+
+		lock_sock(sk);
+		skb = skb_peek(&sk->sk_receive_queue);
+		amount = skb ? skb->len : 0;
+		release_sock(sk);
+		err = put_user(amount, (int __user *) arg);
+		break;
+
+	case SIOCGSTAMP:
+		err = sock_get_timestamp(sk, (struct timeval __user *) arg);
+		break;
+
+	case SIOCGSTAMPNS:
+		err = sock_get_timestampns(sk, (struct timespec __user *) arg);
+		break;
+
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(bt_sock_ioctl);
+
+int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int err = 0;
+	/* Sleep until sk->sk_state == state; 0 on success, -EINPROGRESS when timeo runs out, else an error. */
+	BT_DBG("sk %p", sk);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (sk->sk_state != state) {
+		if (!timeo) {
+			err = -EINPROGRESS;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+		/* the socket lock is released for the sleep, so the caller is expected to hold it on entry */
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		err = sock_error(sk);	/* a socket error ends the wait with that error */
+		if (err)
+			break;
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return err;
+}
+EXPORT_SYMBOL(bt_sock_wait_state);
+
+static struct net_proto_family bt_sock_family_ops = {	/* passed to sock_register() in bt_init() */
+	.owner	= THIS_MODULE,
+	.family	= PF_BLUETOOTH,
+	.create	= bt_sock_create,	/* defined outside this part of the file */
+};
+
+static int __init bt_init(void)
+{
+	int err;
+	/* Set up sysfs, the PF_BLUETOOTH family, HCI sockets, L2CAP and SCO; unwind on any failure. */
+	BT_INFO("Core ver %s", VERSION);
+
+	err = bt_sysfs_init();
+	if (err < 0)
+		return err;
+
+	err = sock_register(&bt_sock_family_ops);
+	if (err < 0) {
+		bt_sysfs_cleanup();
+		return err;
+	}
+
+	BT_INFO("HCI device and connection manager initialized");
+
+	err = hci_sock_init();
+	if (err < 0)
+		goto error;
+
+	err = l2cap_init();
+	if (err < 0)
+		goto sock_err;
+
+	err = sco_init();
+	if (err < 0) {
+		l2cap_exit();	/* undo l2cap_init() before joining the shared unwind path */
+		goto sock_err;
+	}
+
+	return 0;
+
+sock_err:
+	hci_sock_cleanup();
+
+error:
+	sock_unregister(PF_BLUETOOTH);
+	bt_sysfs_cleanup();
+
+	return err;
+}
+
+static void __exit bt_exit(void)
+{
+	/* Tear everything down in the reverse order of bt_init(). */
+	sco_exit();
+
+	l2cap_exit();
+
+	hci_sock_cleanup();
+
+	sock_unregister(PF_BLUETOOTH);
+
+	bt_sysfs_cleanup();
+}
+
+subsys_initcall(bt_init);	/* registered as a subsys initcall, not through module_init() */
+module_exit(bt_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);	/* alias keyed on the protocol family number */
diff --git a/net/bluetooth/bnep/Kconfig b/net/bluetooth/bnep/Kconfig
new file mode 100644
index 00000000..35158b03
--- /dev/null
+++ b/net/bluetooth/bnep/Kconfig
@@ -0,0 +1,24 @@
+config BT_BNEP
+	tristate "BNEP protocol support"
+	depends on BT && BT_L2CAP
+	select CRC32
+	help
+	  BNEP (Bluetooth Network Encapsulation Protocol) is an Ethernet
+	  emulation layer on top of Bluetooth.  BNEP is required for
+	  Bluetooth PAN (Personal Area Network).
+
+	  Say Y here to compile BNEP support into the kernel or say M to
+	  compile it as a module (bnep).
+
+config BT_BNEP_MC_FILTER
+	bool "Multicast filter support"
+	depends on BT_BNEP
+	help
+	  This option enables the multicast filter support for BNEP.
+
+config BT_BNEP_PROTO_FILTER
+	bool "Protocol filter support"
+	depends on BT_BNEP
+	help
+	  This option enables the protocol filter support for BNEP.
+
diff --git a/net/bluetooth/bnep/Makefile b/net/bluetooth/bnep/Makefile
new file mode 100644
index 00000000..c7821e76
--- /dev/null
+++ b/net/bluetooth/bnep/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Bluetooth BNEP layer.
+#
+
+obj-$(CONFIG_BT_BNEP) += bnep.o
+
+bnep-objs := core.o sock.o netdev.o
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
new file mode 100644
index 00000000..e7ee5314
--- /dev/null
+++ b/net/bluetooth/bnep/bnep.h
@@ -0,0 +1,180 @@
+/*
+  BNEP protocol definition for Linux Bluetooth stack (BlueZ).
+  Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License, version 2, as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+*/
+
+#ifndef _BNEP_H
+#define _BNEP_H
+
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <net/bluetooth/bluetooth.h>
+
+/* Limits */
+#define BNEP_MAX_PROTO_FILTERS		5	/* entries in bnep_session.proto_filter[] */
+#define BNEP_MAX_MULTICAST_FILTERS	20	/* cap on multicast addresses put in one filter request */
+
+/* UUIDs */
+#define BNEP_BASE_UUID	0x0000000000001000800000805F9B34FB	/* NOTE(review): wider than 64 bits; confirm it is never used as an integer */
+#define BNEP_UUID16	0x02
+#define BNEP_UUID32	0x04
+#define BNEP_UUID128	0x16	/* NOTE(review): if these are byte sizes like 2 and 4 above, 16 bytes would be 0x10 - confirm */
+
+#define BNEP_SVC_PANU	0x1115
+#define BNEP_SVC_NAP	0x1116
+#define BNEP_SVC_GN	0x1117
+
+/* Packet types: first byte of a packet masked with BNEP_TYPE_MASK */
+#define BNEP_GENERAL			0x00
+#define BNEP_CONTROL			0x01
+#define BNEP_COMPRESSED			0x02
+#define BNEP_COMPRESSED_SRC_ONLY	0x03
+#define BNEP_COMPRESSED_DST_ONLY	0x04
+
+/* Control types: first byte of a control payload */
+#define BNEP_CMD_NOT_UNDERSTOOD		0x00
+#define BNEP_SETUP_CONN_REQ		0x01
+#define BNEP_SETUP_CONN_RSP		0x02
+#define BNEP_FILTER_NET_TYPE_SET	0x03
+#define BNEP_FILTER_NET_TYPE_RSP	0x04
+#define BNEP_FILTER_MULTI_ADDR_SET	0x05
+#define BNEP_FILTER_MULTI_ADDR_RSP	0x06
+
+/* Extension types: extension header type byte masked with BNEP_TYPE_MASK */
+#define BNEP_EXT_CONTROL 0x00
+
+/* Response messages: values for the 16-bit resp field of struct bnep_control_rsp */
+#define BNEP_SUCCESS 0x00
+
+#define BNEP_CONN_INVALID_DST 0x01
+#define BNEP_CONN_INVALID_SRC 0x02
+#define BNEP_CONN_INVALID_SVC 0x03
+#define BNEP_CONN_NOT_ALLOWED 0x04
+
+#define BNEP_FILTER_UNSUPPORTED_REQ	0x01
+#define BNEP_FILTER_INVALID_RANGE	0x02
+#define BNEP_FILTER_INVALID_MCADDR	0x02	/* same value as BNEP_FILTER_INVALID_RANGE; presumably the multicast-filter name for it */
+#define BNEP_FILTER_LIMIT_REACHED	0x03
+#define BNEP_FILTER_DENIED_SECURITY	0x04
+
+/* L2CAP settings */
+#define BNEP_MTU	1691
+#define BNEP_PSM	0x0f
+#define BNEP_FLUSH_TO	0xffff
+#define BNEP_CONNECT_TO	15
+#define BNEP_FILTER_TO	15
+
+/* Headers: layout of the first byte of a packet or extension header */
+#define BNEP_TYPE_MASK	0x7f	/* low seven bits select the type */
+#define BNEP_EXT_HEADER	0x80	/* top bit: an extension header follows */
+
+struct bnep_setup_conn_req {	/* packed layout with a variable-length tail */
+	__u8 type;
+	__u8 ctrl;
+	__u8 uuid_size;	/* presumably the size of each UUID in service[] - confirm against the BNEP spec */
+	__u8 service[0];
+} __packed;
+
+struct bnep_set_filter_req {	/* filter request as built by bnep_net_set_mc_list() */
+	__u8 type;	/* BNEP_CONTROL */
+	__u8 ctrl;	/* which filter is being set */
+	__be16 len;	/* big-endian number of bytes in list[] */
+	__u8 list[0];	/* pairs of range bounds */
+} __packed;
+
+struct bnep_control_rsp {	/* response packet as filled in by bnep_send_rsp() */
+	__u8 type;	/* BNEP_CONTROL */
+	__u8 ctrl;	/* control type of the response */
+	__be16 resp;	/* response code in network byte order */
+} __packed;
+
+struct bnep_ext_hdr {	/* extension header as parsed by bnep_rx_extension() */
+	__u8 type;	/* extension type, plus BNEP_EXT_HEADER if another extension follows */
+	__u8 len;	/* number of bytes in data[] */
+	__u8 data[0];
+} __packed;
+
+/* BNEP ioctl defines (presumably dispatched by bnep_sock_ioctl() - confirm) */
+#define BNEPCONNADD	_IOW('B', 200, int)
+#define BNEPCONNDEL	_IOW('B', 201, int)
+#define BNEPGETCONNLIST	_IOR('B', 210, int)
+#define BNEPGETCONNINFO	_IOR('B', 211, int)
+
+struct bnep_connadd_req {	/* argument of bnep_add_connection() */
+	int   sock;		/* Connected socket */
+	__u32 flags;
+	__u16 role;		/* stored in the session's role field */
+	char  device[16];	/* Name of the Ethernet device; empty selects "bnep%d", final name is written back */
+};
+
+struct bnep_conndel_req {	/* argument of bnep_del_connection() */
+	__u32 flags;
+	__u8  dst[ETH_ALEN];	/* remote address used to look the session up */
+};
+
+struct bnep_conninfo {	/* session snapshot filled in by __bnep_copy_ci() */
+	__u32 flags;
+	__u16 role;
+	__u16 state;
+	__u8  dst[ETH_ALEN];	/* remote address of the session */
+	char  device[16];	/* network interface name */
+};
+
+struct bnep_connlist_req {	/* argument of bnep_get_connlist() */
+	__u32  cnum;	/* in: capacity of ci[]; out: entries written */
+	struct bnep_conninfo __user *ci;	/* user-space array receiving the entries */
+};
+
+struct bnep_proto_filter {	/* inclusive protocol range, kept in host byte order */
+	__u16 start;
+	__u16 end;	/* 0 ends the table when it is scanned in bnep_net_proto_filter() */
+};
+
+int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock);	/* create netdev + session thread */
+int bnep_del_connection(struct bnep_conndel_req *req);	/* ask the session for req->dst to terminate */
+int bnep_get_connlist(struct bnep_connlist_req *req);	/* copy session entries to user space */
+int bnep_get_conninfo(struct bnep_conninfo *ci);	/* fill *ci for the session matching ci->dst */
+
+/* BNEP sessions: one per connection, stored in the private area of its net_device */
+struct bnep_session {
+	struct list_head list;	/* link in bnep_session_list */
+
+	unsigned int  role;
+	unsigned long state;
+	unsigned long flags;
+	atomic_t      terminate;	/* non-zero asks the session thread to exit */
+	struct task_struct *task;	/* the session thread */
+
+	struct ethhdr eh;	/* rx-side header: h_dest is the local address, h_source the remote one */
+	struct msghdr msg;	/* reused for every kernel_sendmsg() on sock */
+
+	struct bnep_proto_filter proto_filter[BNEP_MAX_PROTO_FILTERS];
+	unsigned long long mc_filter;	/* 64-bit multicast hash, bit index from bnep_mc_hash() */
+
+	struct socket    *sock;
+	struct net_device *dev;
+};
+
+void bnep_net_setup(struct net_device *dev);	/* setup callback handed to alloc_netdev() */
+int bnep_sock_init(void);	/* called from the module init function */
+void bnep_sock_cleanup(void);	/* called from the module exit function */
+
+static inline int bnep_mc_hash(__u8 *addr)	/* maps an Ethernet address to a bit index for mc_filter */
+{
+	return crc32_be(~0, addr, ETH_ALEN) >> 26;	/* keep the top 6 bits of the CRC */
+}
+
+#endif
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
new file mode 100644
index 00000000..d9edfe8b
--- /dev/null
+++ b/net/bluetooth/bnep/core.c
@@ -0,0 +1,751 @@
+/*
+   BNEP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2001-2002 Inventel Systemes
+   Written 2001-2002 by
+	Clément Moreau <clement.moreau@inventel.fr>
+	David Libault  <david.libault@inventel.fr>
+
+   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/freezer.h>
+#include <linux/errno.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <linux/socket.h>
+#include <linux/file.h>
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "bnep.h"
+
+#define VERSION "1.3"
+/* Module parameters: allow bnep_tx_frame() to omit an address that matches the session's. */
+static int compress_src = 1;
+static int compress_dst = 1;
+/* Every linked session; the functions in this file take bnep_session_sem around list access. */
+static LIST_HEAD(bnep_session_list);
+static DECLARE_RWSEM(bnep_session_sem);
+
+static struct bnep_session *__bnep_get_session(u8 *dst)
+{
+	struct bnep_session *session;
+
+	BT_DBG("");
+
+	/* Sessions are matched on the remote address kept in eh.h_source. */
+	list_for_each_entry(session, &bnep_session_list, list) {
+		if (compare_ether_addr(dst, session->eh.h_source) == 0)
+			return session;
+	}
+	/* No match; callers in this file hold bnep_session_sem around the lookup. */
+	return NULL;
+}
+
+static void __bnep_link_session(struct bnep_session *s)
+{
+	/* It's safe to call __module_get() here because sessions are added
+	   by the socket layer which has to hold the reference to this module.
+	 */
+	__module_get(THIS_MODULE);	/* one module reference per linked session */
+	list_add(&s->list, &bnep_session_list);	/* inserted at the head of the list */
+}
+
+static void __bnep_unlink_session(struct bnep_session *s)
+{
+	list_del(&s->list);	/* remove from bnep_session_list */
+	module_put(THIS_MODULE);	/* drop the reference taken in __bnep_link_session() */
+}
+
+static int bnep_send(struct bnep_session *s, void *data, size_t len)
+{
+	/* The whole buffer is handed to kernel_sendmsg() as a single kvec. */
+	struct kvec vec = { .iov_base = data, .iov_len = len };
+
+	return kernel_sendmsg(s->sock, &s->msg, &vec, 1, len);
+}
+
+static int bnep_send_rsp(struct bnep_session *s, u8 ctrl, u16 resp)
+{
+	/* Control response: packet type, control type, response code in big-endian. */
+	struct bnep_control_rsp pkt = {
+		.type = BNEP_CONTROL, .ctrl = ctrl, .resp = htons(resp),
+	};
+	return bnep_send(s, &pkt, sizeof(pkt));
+}
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+static inline void bnep_set_default_proto_filter(struct bnep_session *s)
+{	/* Fills the first three proto_filter[] ranges; the remaining slots are not touched. */
+	/* (IPv4, ARP)  */
+	s->proto_filter[0].start = ETH_P_IP;
+	s->proto_filter[0].end   = ETH_P_ARP;
+	/* (RARP, AppleTalk) */
+	s->proto_filter[1].start = ETH_P_RARP;
+	s->proto_filter[1].end   = ETH_P_AARP;
+	/* (IPX, IPv6) */
+	s->proto_filter[2].start = ETH_P_IPX;
+	s->proto_filter[2].end   = ETH_P_IPV6;
+}
+#endif
+
+static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len)
+{
+	int n;
+	/* Handle a net-type filter request: a 16-bit byte count, then (start, end) pairs; always answers. */
+	if (len < 2)
+		return -EILSEQ;
+
+	n = get_unaligned_be16(data);
+	data++;
+	len -= 2;
+
+	if (len < n)	/* the announced list must fit in what was received */
+		return -EILSEQ;
+
+	BT_DBG("filter len %d", n);
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+	n /= 4;	/* bytes -> number of (start, end) pairs */
+	if (n <= BNEP_MAX_PROTO_FILTERS) {
+		struct bnep_proto_filter *f = s->proto_filter;
+		int i;
+
+		for (i = 0; i < n; i++) {
+			f[i].start = get_unaligned_be16(data++);
+			f[i].end   = get_unaligned_be16(data++);
+
+			BT_DBG("proto filter start %d end %d",
+				f[i].start, f[i].end);
+		}
+		/* zero the slot after the last pair; an end of 0 stops the scan in bnep_net_proto_filter() */
+		if (i < BNEP_MAX_PROTO_FILTERS)
+			memset(f + i, 0, sizeof(*f));
+		/* an empty list restores the default ranges */
+		if (n == 0)
+			bnep_set_default_proto_filter(s);
+
+		bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_SUCCESS);
+	} else {
+		bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_LIMIT_REACHED);
+	}
+#else
+	bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_UNSUPPORTED_REQ);
+#endif
+	return 0;
+}
+
+static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len)
+{
+	int n;
+	/* Handle a multicast filter request: a 16-bit byte count, then address ranges; always answers. */
+	if (len < 2)
+		return -EILSEQ;
+
+	n = get_unaligned_be16(data);
+	data += 2;
+	len -= 2;
+
+	if (len < n)	/* the announced list must fit in what was received */
+		return -EILSEQ;
+
+	BT_DBG("filter len %d", n);
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+	n /= (ETH_ALEN * 2);	/* bytes -> number of (first, last) address pairs */
+
+	if (n > 0) {	/* with no pairs the current mc_filter is left unchanged */
+		int i;
+
+		s->mc_filter = 0;
+
+		/* Always send broadcast */
+		set_bit(bnep_mc_hash(s->dev->broadcast), (ulong *) &s->mc_filter);
+
+		/* Add address ranges to the multicast hash */
+		for (; n > 0; n--) {
+			u8 a1[6], *a2;
+
+			memcpy(a1, data, ETH_ALEN);	/* working copy: a1 is incremented below */
+			data += ETH_ALEN;
+			a2 = data;
+			data += ETH_ALEN;
+
+			BT_DBG("mc filter %s -> %s",
+				batostr((void *) a1), batostr((void *) a2));
+
+			/* Iterate from a1 to a2 */
+			set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter);
+			while (memcmp(a1, a2, 6) < 0 && s->mc_filter != ~0LL) {	/* also stops once every hash bit is set */
+				/* Increment a1 */
+				i = 5;
+				while (i >= 0 && ++a1[i--] == 0)	/* add one, carrying from the last byte upwards */
+					;
+
+				set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter);
+			}
+		}
+	}
+
+	BT_DBG("mc filter hash 0x%llx", s->mc_filter);
+
+	bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_SUCCESS);
+#else
+	bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_FILTER_UNSUPPORTED_REQ);
+#endif
+	return 0;
+}
+
+static int bnep_rx_control(struct bnep_session *s, void *data, int len)
+{
+	int err = 0;
+	u8  cmd;
+	/* Handle one control message of len bytes at data: a type byte followed by its arguments. */
+	if (len < 1)	/* empty payload: never read a type byte that is not there */
+		return -EILSEQ;
+	cmd = *(u8 *)data;
+	data++;
+	len--;
+
+	switch (cmd) {
+	case BNEP_CMD_NOT_UNDERSTOOD:
+	case BNEP_SETUP_CONN_RSP:
+	case BNEP_FILTER_NET_TYPE_RSP:
+	case BNEP_FILTER_MULTI_ADDR_RSP:
+		/* Ignore these for now */
+		break;
+
+	case BNEP_FILTER_NET_TYPE_SET:
+		err = bnep_ctrl_set_netfilter(s, data, len);
+		break;
+
+	case BNEP_FILTER_MULTI_ADDR_SET:
+		err = bnep_ctrl_set_mcfilter(s, data, len);
+		break;
+
+	case BNEP_SETUP_CONN_REQ:
+		err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, BNEP_CONN_NOT_ALLOWED);
+		break;
+
+	default: {	/* unknown type: answer "command not understood" and echo the type back */
+			u8 pkt[3] = { BNEP_CONTROL, BNEP_CMD_NOT_UNDERSTOOD, cmd };
+			bnep_send(s, pkt, sizeof(pkt));
+		}
+		break;
+	}
+
+	return err;	/* handler result; the callers in this file ignore it */
+}
+
+static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb)
+{
+	struct bnep_ext_hdr *h;
+	int err = 0;
+	/* Consume the chain of extension headers at the front of skb; -EILSEQ if one does not fit. */
+	do {
+		h = (void *) skb->data;
+		if (!skb_pull(skb, sizeof(*h))) {
+			err = -EILSEQ;
+			break;
+		}
+
+		BT_DBG("type 0x%x len %d", h->type, h->len);
+
+		switch (h->type & BNEP_TYPE_MASK) {
+		case BNEP_EXT_CONTROL:
+			bnep_rx_control(s, skb->data, skb->len);	/* result ignored; is given the rest of the skb, not h->len */
+			break;
+
+		default:
+			/* Unknown extension, skip it. */
+			break;
+		}
+
+		if (!skb_pull(skb, h->len)) {	/* step over this extension's payload */
+			err = -EILSEQ;
+			break;
+		}
+	} while (!err && (h->type & BNEP_EXT_HEADER));	/* flag bit set: another extension follows */
+
+	return err;
+}
+
+static u8 __bnep_rx_hlen[] = {	/* bytes pulled after the type byte, indexed by packet type */
+	ETH_HLEN,     /* BNEP_GENERAL */
+	0,            /* BNEP_CONTROL */
+	2,            /* BNEP_COMPRESSED */
+	ETH_ALEN + 2, /* BNEP_COMPRESSED_SRC_ONLY */
+	ETH_ALEN + 2  /* BNEP_COMPRESSED_DST_ONLY */
+};
+
+static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
+{
+	struct net_device *dev = s->dev;
+	struct sk_buff *nskb;
+	u8 type;
+	/* Turn one received BNEP packet into an Ethernet frame for dev; skb is freed on every path. */
+	dev->stats.rx_bytes += skb->len;
+	/* NOTE(review): the type byte is read without checking skb->len - confirm skb is never empty */
+	type = *(u8 *) skb->data;
+	skb_pull(skb, 1);
+
+	if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen))
+		goto badframe;
+
+	if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) {
+		bnep_rx_control(s, skb->data, skb->len);	/* result ignored */
+		kfree_skb(skb);
+		return 0;
+	}
+
+	skb_reset_mac_header(skb);	/* remembers where the (possibly compressed) header starts */
+
+	/* Verify and pull out header */
+	if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK]))
+		goto badframe;
+	/* the two bytes just before the payload are read as the protocol field */
+	s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
+
+	if (type & BNEP_EXT_HEADER) {
+		if (bnep_rx_extension(s, skb) < 0)
+			goto badframe;
+	}
+
+	/* Strip 802.1p header */
+	if (ntohs(s->eh.h_proto) == 0x8100) {
+		if (!skb_pull(skb, 4))
+			goto badframe;
+		s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2));
+	}
+
+	/* We have to alloc new skb and copy data here :(. Because original skb
+	 * may not be modified and because of the alignment requirements. */
+	nskb = alloc_skb(2 + ETH_HLEN + skb->len, GFP_KERNEL);
+	if (!nskb) {
+		dev->stats.rx_dropped++;
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	skb_reserve(nskb, 2);	/* 2 bytes of headroom ahead of the 14-byte Ethernet header */
+
+	/* Decompress header and construct ether frame */
+	switch (type & BNEP_TYPE_MASK) {
+	case BNEP_COMPRESSED:	/* both addresses come from the session header */
+		memcpy(__skb_put(nskb, ETH_HLEN), &s->eh, ETH_HLEN);
+		break;
+
+	case BNEP_COMPRESSED_SRC_ONLY:	/* destination from the session, source from the packet */
+		memcpy(__skb_put(nskb, ETH_ALEN), s->eh.h_dest, ETH_ALEN);
+		memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb), ETH_ALEN);
+		put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2));
+		break;
+
+	case BNEP_COMPRESSED_DST_ONLY:	/* destination from the packet; source and protocol copied from s->eh in one go */
+		memcpy(__skb_put(nskb, ETH_ALEN), skb_mac_header(skb),
+								ETH_ALEN);
+		memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source,
+								ETH_ALEN + 2);
+		break;
+
+	case BNEP_GENERAL:	/* both addresses come from the packet */
+		memcpy(__skb_put(nskb, ETH_ALEN * 2), skb_mac_header(skb),
+								ETH_ALEN * 2);
+		put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2));
+		break;
+	}
+
+	skb_copy_from_linear_data(skb, __skb_put(nskb, skb->len), skb->len);
+	kfree_skb(skb);
+
+	dev->stats.rx_packets++;
+	nskb->ip_summed = CHECKSUM_NONE;
+	nskb->protocol  = eth_type_trans(nskb, dev);
+	netif_rx_ni(nskb);
+	return 0;
+
+badframe:
+	dev->stats.rx_errors++;
+	kfree_skb(skb);
+	return 0;	/* a malformed frame is counted and dropped, not reported as an error */
+}
+
+static u8 __bnep_tx_types[] = {	/* indexed by the flags built in bnep_tx_frame(): 0x01 and 0x02 */
+	BNEP_GENERAL,
+	BNEP_COMPRESSED_SRC_ONLY,
+	BNEP_COMPRESSED_DST_ONLY,
+	BNEP_COMPRESSED
+};
+
+static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
+{
+	struct ethhdr *eh = (void *) skb->data;
+	struct socket *sock = s->sock;
+	struct kvec iv[3];
+	int len = 0, il = 0;
+	u8 type = 0;
+	/* Send one queued skb on the session socket, leaving out addresses that match the session's; frees skb. */
+	BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type);
+
+	if (!skb->dev) {
+		/* Control frame sent by us */
+		goto send;
+	}
+
+	iv[il++] = (struct kvec) { &type, 1 };	/* points at type, whose final value is set below */
+	len++;
+
+	if (compress_src && !compare_ether_addr(eh->h_dest, s->eh.h_source))
+		type |= 0x01;
+
+	if (compress_dst && !compare_ether_addr(eh->h_source, s->eh.h_dest))
+		type |= 0x02;
+
+	if (type)
+		skb_pull(skb, ETH_ALEN * 2);	/* skip both addresses; eh still points at them */
+
+	type = __bnep_tx_types[type];	/* flag bits -> BNEP packet type */
+	switch (type) {
+	case BNEP_COMPRESSED_SRC_ONLY:
+		iv[il++] = (struct kvec) { eh->h_source, ETH_ALEN };
+		len += ETH_ALEN;
+		break;
+
+	case BNEP_COMPRESSED_DST_ONLY:
+		iv[il++] = (struct kvec) { eh->h_dest, ETH_ALEN };
+		len += ETH_ALEN;
+		break;
+	}
+
+send:
+	iv[il++] = (struct kvec) { skb->data, skb->len };
+	len += skb->len;
+
+	/* FIXME: linearize skb */
+	{
+		len = kernel_sendmsg(sock, &s->msg, iv, il, len);
+	}
+	kfree_skb(skb);
+
+	if (len > 0) {
+		s->dev->stats.tx_bytes += len;
+		s->dev->stats.tx_packets++;
+		return 0;
+	}
+
+	return len;	/* zero or negative result of kernel_sendmsg() */
+}
+
+static int bnep_session(void *arg)
+{
+	struct bnep_session *s = arg;
+	struct net_device *dev = s->dev;
+	struct sock *sk = s->sock->sk;
+	struct sk_buff *skb;
+	wait_queue_t wait;
+	/* Session thread: moves RX and TX traffic until terminated or disconnected, then tears down. */
+	BT_DBG("");
+
+	set_user_nice(current, -15);
+
+	init_waitqueue_entry(&wait, current);
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);	/* set before looking for work, ahead of schedule() below */
+
+		if (atomic_read(&s->terminate))
+			break;
+		/* RX */
+		while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+			skb_orphan(skb);
+			bnep_rx_frame(s, skb);
+		}
+
+		if (sk->sk_state != BT_CONNECTED)
+			break;
+
+		/* TX */
+		while ((skb = skb_dequeue(&sk->sk_write_queue)))
+			if (bnep_tx_frame(s, skb))	/* non-zero means the send failed: stop draining for now */
+				break;
+		netif_wake_queue(dev);
+
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	/* Cleanup session */
+	down_write(&bnep_session_sem);
+
+	/* Delete network device */
+	unregister_netdev(dev);
+
+	/* Wakeup user-space polling for socket errors */
+	s->sock->sk->sk_err = EUNATCH;
+
+	wake_up_interruptible(sk_sleep(s->sock->sk));
+
+	/* Release the socket */
+	fput(s->sock->file);
+
+	__bnep_unlink_session(s);
+
+	up_write(&bnep_session_sem);
+	free_netdev(dev);	/* s lives in the netdev private area, so it is gone after this */
+	return 0;
+}
+
+static struct device *bnep_get_device(struct bnep_session *session)
+{
+	bdaddr_t *src = &bt_sk(session->sock->sk)->src;
+	bdaddr_t *dst = &bt_sk(session->sock->sk)->dst;
+	struct hci_dev *hdev;
+	struct hci_conn *conn;
+	/* Return the device of the ACL connection to the peer, or NULL if no route or connection is found. */
+	hdev = hci_get_route(dst, src);
+	if (!hdev)
+		return NULL;
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
+
+	hci_dev_put(hdev);	/* releases hdev; NOTE(review): conn->dev is returned after this - confirm its lifetime */
+
+	return conn ? &conn->dev : NULL;
+}
+
+static struct device_type bnep_type = {	/* attached to each BNEP netdev via SET_NETDEV_DEVTYPE() */
+	.name	= "bluetooth",
+};
+
+int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
+{
+	struct net_device *dev;
+	struct bnep_session *s, *ss;
+	u8 dst[ETH_ALEN], src[ETH_ALEN];
+	int err;
+	/* Create the netdev and session thread for sock; 0 on success, else -ENOMEM, -EEXIST or another error. */
+	BT_DBG("");
+
+	baswap((void *) dst, &bt_sk(sock->sk)->dst);
+	baswap((void *) src, &bt_sk(sock->sk)->src);
+
+	/* session struct allocated as private part of net_device */
+	dev = alloc_netdev(sizeof(struct bnep_session),
+				(*req->device) ? req->device : "bnep%d",
+				bnep_net_setup);
+	if (!dev)
+		return -ENOMEM;
+
+	down_write(&bnep_session_sem);
+
+	ss = __bnep_get_session(dst);
+	if (ss && ss->state == BT_CONNECTED) {	/* a connected session to this peer already exists */
+		err = -EEXIST;
+		goto failed;
+	}
+
+	s = netdev_priv(dev);
+
+	/* This is rx header therefore addresses are swapped.
+	 * ie. eh.h_dest is our local address. */
+	memcpy(s->eh.h_dest,   &src, ETH_ALEN);
+	memcpy(s->eh.h_source, &dst, ETH_ALEN);
+	memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN);
+
+	s->dev   = dev;
+	s->sock  = sock;
+	s->role  = req->role;
+	s->state = BT_CONNECTED;
+
+	s->msg.msg_flags = MSG_NOSIGNAL;
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+	/* Set default mc filter */
+	set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter);
+#endif
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+	/* Set default protocol filter */
+	bnep_set_default_proto_filter(s);
+#endif
+
+	SET_NETDEV_DEV(dev, bnep_get_device(s));
+	SET_NETDEV_DEVTYPE(dev, &bnep_type);
+
+	err = register_netdev(dev);
+	if (err)
+		goto failed;
+
+	__bnep_link_session(s);
+
+	s->task = kthread_run(bnep_session, s, "kbnepd %s", dev->name);
+	if (IS_ERR(s->task)) {
+		/* Session thread start failed, gotta cleanup. */
+		unregister_netdev(dev);
+		__bnep_unlink_session(s);
+		err = PTR_ERR(s->task);
+		goto failed;
+	}
+
+	up_write(&bnep_session_sem);
+	strcpy(req->device, dev->name);	/* report the final interface name back to the caller */
+	return 0;
+
+failed:
+	up_write(&bnep_session_sem);
+	free_netdev(dev);	/* also frees s, which lives in the netdev private area */
+	return err;
+}
+
+int bnep_del_connection(struct bnep_conndel_req *req)
+{
+	struct bnep_session *session;
+	int ret = -ENOENT;
+
+	BT_DBG("");
+
+	/* Only flag the session and wake its thread; the thread does the teardown. */
+	down_read(&bnep_session_sem);
+	session = __bnep_get_session(req->dst);
+	if (session != NULL) {
+		atomic_inc(&session->terminate);
+		wake_up_process(session->task);
+		ret = 0;
+	}
+	up_read(&bnep_session_sem);
+
+	return ret;
+}
+
+static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s)
+{
+	memset(ci, 0, sizeof(*ci));	/* clear everything first, then fill each field from the session */
+	ci->role  = s->role;
+	ci->state = s->state;
+	ci->flags = s->flags;
+	strcpy(ci->device, s->dev->name);
+	memcpy(ci->dst, s->eh.h_source, ETH_ALEN);	/* remote address */
+}
+
+int bnep_get_connlist(struct bnep_connlist_req *req)
+{
+	struct list_head *p;
+	int err = 0, n = 0;
+	/* Copy up to req->cnum session entries to the user array req->ci; req->cnum becomes the count written. */
+	down_read(&bnep_session_sem);
+
+	list_for_each(p, &bnep_session_list) {
+		struct bnep_session *s;
+		struct bnep_conninfo ci;
+
+		s = list_entry(p, struct bnep_session, list);
+
+		__bnep_copy_ci(&ci, s);
+
+		if (copy_to_user(req->ci, &ci, sizeof(ci))) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (++n >= req->cnum)	/* tested after the copy, so one entry is written even if cnum is 0 */
+			break;
+
+		req->ci++;	/* advance to the next user-space slot */
+	}
+	req->cnum = n;
+
+	up_read(&bnep_session_sem);
+	return err;
+}
+
+int bnep_get_conninfo(struct bnep_conninfo *ci)
+{
+	struct bnep_session *session;
+	int ret = -ENOENT;
+
+	/* Find the session by ci->dst and overwrite *ci with its details. */
+	down_read(&bnep_session_sem);
+	session = __bnep_get_session(ci->dst);
+	if (session != NULL) {
+		__bnep_copy_ci(ci, session);
+		ret = 0;
+	}
+	up_read(&bnep_session_sem);
+
+	return ret;
+}
+
+static int __init bnep_init(void)
+{
+	char flt[50] = "";
+	/* Module init: log the compiled-in filters, then return the result of bnep_sock_init(). */
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+	strcat(flt, "protocol ");
+#endif
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+	strcat(flt, "multicast");
+#endif
+
+	BT_INFO("BNEP (Ethernet Emulation) ver %s", VERSION);
+	if (flt[0])
+		BT_INFO("BNEP filters: %s", flt);
+
+	/* Propagate a failure so the module does not stay loaded with socket setup undone. */
+	return bnep_sock_init();
+}
+
+static void __exit bnep_exit(void)
+{
+	bnep_sock_cleanup();	/* counterpart of the bnep_sock_init() call in bnep_init() */
+}
+
+module_init(bnep_init);
+module_exit(bnep_exit);
+
+module_param(compress_src, bool, 0644);	/* NOTE(review): declared bool but compress_src is an int - confirm this kernel accepts that */
+MODULE_PARM_DESC(compress_src, "Compress sources headers");
+
+module_param(compress_dst, bool, 0644);	/* same remark as for compress_src */
+MODULE_PARM_DESC(compress_dst, "Compress destination headers");
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth BNEP ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-4");
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
new file mode 100644
index 00000000..d4f5dff7
--- /dev/null
+++ b/net/bluetooth/bnep/netdev.c
@@ -0,0 +1,238 @@
+/*
+   BNEP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2001-2002 Inventel Systemes
+   Written 2001-2002 by
+	Clément Moreau <clement.moreau@inventel.fr>
+	David Libault  <david.libault@inventel.fr>
+
+   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "bnep.h"
+
+#define BNEP_TX_QUEUE_LEN 20	/* write-queue length at which bnep_net_xmit() stops the net queue */
+
+static int bnep_net_open(struct net_device *dev)
+{
+	netif_start_queue(dev);	/* ndo_open: only enables the transmit queue */
+	return 0;
+}
+
+static int bnep_net_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);	/* ndo_stop: only disables the transmit queue */
+	return 0;
+}
+
+static void bnep_net_set_mc_list(struct net_device *dev)
+{
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+	struct bnep_session *s = netdev_priv(dev);
+	struct sock *sk = s->sock->sk;
+	struct bnep_set_filter_req *r;
+	struct sk_buff *skb;
+	int size;
+	/* Build a multicast filter request from dev's state and queue it for the session thread. */
+	BT_DBG("%s mc_count %d", dev->name, netdev_mc_count(dev));
+	/* room for the header, the maximum number of ranges, and one extra range for broadcast */
+	size = sizeof(*r) + (BNEP_MAX_MULTICAST_FILTERS + 1) * ETH_ALEN * 2;
+	skb  = alloc_skb(size, GFP_ATOMIC);
+	if (!skb) {
+		BT_ERR("%s Multicast list allocation failed", dev->name);
+		return;
+	}
+
+	r = (void *) skb->data;
+	__skb_put(skb, sizeof(*r));
+
+	r->type = BNEP_CONTROL;
+	r->ctrl = BNEP_FILTER_MULTI_ADDR_SET;
+
+	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
+		u8 start[ETH_ALEN] = { 0x01 };	/* 01:00:00:00:00:00, the remaining bytes are zero */
+
+		/* Request all addresses */
+		memcpy(__skb_put(skb, ETH_ALEN), start, ETH_ALEN);
+		memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+		r->len = htons(ETH_ALEN * 2);
+	} else {
+		struct netdev_hw_addr *ha;
+		int i, len = skb->len;	/* len: size of the header alone, before any range is added */
+
+		if (dev->flags & IFF_BROADCAST) {	/* single-address range covering broadcast */
+			memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+			memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN);
+		}
+
+		/* FIXME: We should group addresses here. */
+
+		i = 0;
+		netdev_for_each_mc_addr(ha, dev) {
+			if (i == BNEP_MAX_MULTICAST_FILTERS)	/* further addresses are silently left out */
+				break;
+			memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN);
+			memcpy(__skb_put(skb, ETH_ALEN), ha->addr, ETH_ALEN);
+
+			i++;
+		}
+		r->len = htons(skb->len - len);	/* bytes of range data that follow the header */
+	}
+
+	skb_queue_tail(&sk->sk_write_queue, skb);	/* picked up by the session thread's TX loop */
+	wake_up_interruptible(sk_sleep(sk));
+#endif
+}
+
+static int bnep_net_set_mac_addr(struct net_device *dev, void *arg)
+{
+	BT_DBG("%s", dev->name);
+	return 0;	/* reports success without changing the address; arg is not used */
+}
+
+static void bnep_net_timeout(struct net_device *dev)
+{
+	BT_DBG("net_timeout");
+	netif_wake_queue(dev);	/* ndo_tx_timeout: just re-enable the transmit queue */
+}
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+static inline int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s)
+{
+	struct ethhdr *hdr = (void *) skb->data;
+	/* Returns 1 to drop: low bit of the first destination byte set and hash bit clear. */
+	if (!(hdr->h_dest[0] & 1))
+		return 0;
+	return !test_bit(bnep_mc_hash(hdr->h_dest), (ulong *) &s->mc_filter);
+}
+#endif
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+/* Determine ether protocol. Based on eth_type_trans. */
+static inline u16 bnep_net_eth_proto(struct sk_buff *skb)
+{
+	struct ethhdr *eh = (void *) skb->data;
+	u16 proto = ntohs(eh->h_proto);
+
+	if (proto >= 1536)	/* smaller values are not returned as-is but classified below */
+		return proto;
+	/* NOTE(review): skb->data is the header start (same pointer as eh), so this tests the first two destination bytes - confirm */
+	if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF))
+		return ETH_P_802_3;
+
+	return ETH_P_802_2;
+}
+
+static inline int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s)
+{
+	struct bnep_proto_filter *range = s->proto_filter;
+	struct bnep_proto_filter *last = range + BNEP_MAX_PROTO_FILTERS;
+	u16 proto = bnep_net_eth_proto(skb);
+
+	/* Ranges are inclusive; an entry whose end is 0 terminates the table. */
+	for (; range < last && range->end; range++)
+		if (range->start <= proto && proto <= range->end)
+			return 0;
+
+	BT_DBG("BNEP: filtered skb %p, proto 0x%.4x", skb, proto);
+	return 1;
+}
+#endif
+
+static netdev_tx_t bnep_net_xmit(struct sk_buff *skb,
+				 struct net_device *dev)
+{
+	struct bnep_session *s = netdev_priv(dev);
+	struct sock *sk = s->sock->sk;
+	/* ndo_start_xmit: drop filtered frames, queue the rest for the session thread; always NETDEV_TX_OK. */
+	BT_DBG("skb %p, dev %p", skb, dev);
+
+#ifdef CONFIG_BT_BNEP_MC_FILTER
+	if (bnep_net_mc_filter(skb, s)) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+
+#ifdef CONFIG_BT_BNEP_PROTO_FILTER
+	if (bnep_net_proto_filter(skb, s)) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+
+	/*
+	 * We cannot send L2CAP packets from here as we are potentially in a bh.
+	 * So we have to queue them and wake up session thread which is sleeping
+	 * on the sk_sleep(sk).
+	 */
+	dev->trans_start = jiffies;
+	skb_queue_tail(&sk->sk_write_queue, skb);
+	wake_up_interruptible(sk_sleep(sk));
+
+	if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) {	/* checked after queueing this skb */
+		BT_DBG("tx queue is full");
+
+		/* Stop queuing.
+		 * Session thread will do netif_wake_queue() */
+		netif_stop_queue(dev);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops bnep_netdev_ops = {	/* installed by bnep_net_setup() */
+	.ndo_open            = bnep_net_open,
+	.ndo_stop            = bnep_net_close,
+	.ndo_start_xmit	     = bnep_net_xmit,
+	.ndo_validate_addr   = eth_validate_addr,
+	.ndo_set_multicast_list = bnep_net_set_mc_list,
+	.ndo_set_mac_address = bnep_net_set_mac_addr,	/* accepts the call but changes nothing */
+	.ndo_tx_timeout      = bnep_net_timeout,
+	.ndo_change_mtu	     = eth_change_mtu,
+
+};
+
+void bnep_net_setup(struct net_device *dev)
+{
+	/* Netdev setup callback: Ethernet defaults from ether_setup() plus the BNEP ops table. */
+	memset(dev->broadcast, 0xff, ETH_ALEN);
+	dev->addr_len = ETH_ALEN;
+
+	ether_setup(dev);
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;	/* cleared after ether_setup() */
+	dev->netdev_ops = &bnep_netdev_ops;
+
+	dev->watchdog_timeo  = HZ * 2;	/* two seconds' worth of jiffies */
+}
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
new file mode 100644
index 00000000..17800b1d
--- /dev/null
+++ b/net/bluetooth/bnep/sock.c
@@ -0,0 +1,259 @@
+/*
+   BNEP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2001-2002 Inventel Systemes
+   Written 2001-2002 by
+	David Libault  <david.libault@inventel.fr>
+
+   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/gfp.h>
+#include <linux/uaccess.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+#include "bnep.h"
+
+static int bnep_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	/* Nothing to tear down unless a sock is still attached. */
+	if (sk) {
+		sock_orphan(sk);
+		sock_put(sk);
+	}
+	return 0;
+}
+
+static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	void __user *uptr = (void __user *)arg;
+	struct bnep_connlist_req cl;
+	struct bnep_connadd_req  ca;
+	struct bnep_conndel_req  cd;
+	struct bnep_conninfo ci;
+	struct socket *nsock;
+	int err;
+
+	BT_DBG("cmd %x arg %lx", cmd, arg);
+
+	switch (cmd) {
+	case BNEPCONNADD:
+		/* Bind an already connected socket to a new BNEP connection. */
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+
+		if (copy_from_user(&ca, uptr, sizeof(ca)))
+			return -EFAULT;
+
+		nsock = sockfd_lookup(ca.sock, &err);
+		if (!nsock)
+			return err;
+
+		if (nsock->sk->sk_state != BT_CONNECTED) {
+			sockfd_put(nsock);
+			return -EBADFD;
+		}
+		/* The name came from user space: force NUL termination. */
+		ca.device[sizeof(ca.device)-1] = 0;
+
+		err = bnep_add_connection(&ca, nsock);
+		if (err) {
+			/* The add failed, so drop the reference taken above. */
+			sockfd_put(nsock);
+			return err;
+		}
+		/* Copy the request structure back to the caller. */
+		return copy_to_user(uptr, &ca, sizeof(ca)) ? -EFAULT : 0;
+
+	case BNEPCONNDEL:
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+
+		if (copy_from_user(&cd, uptr, sizeof(cd)))
+			return -EFAULT;
+
+		return bnep_del_connection(&cd);
+
+	case BNEPGETCONNLIST:
+		if (copy_from_user(&cl, uptr, sizeof(cl)))
+			return -EFAULT;
+
+		if (cl.cnum <= 0)
+			return -EINVAL;
+
+		err = bnep_get_connlist(&cl);
+		if (err)
+			return err;
+
+		return copy_to_user(uptr, &cl, sizeof(cl)) ? -EFAULT : 0;
+
+	case BNEPGETCONNINFO:
+		if (copy_from_user(&ci, uptr, sizeof(ci)))
+			return -EFAULT;
+
+		err = bnep_get_conninfo(&ci);
+		if (err)
+			return err;
+
+		return copy_to_user(uptr, &ci, sizeof(ci)) ? -EFAULT : 0;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct bnep_connlist_req cl;
+	uint32_t uci;
+	int err;
+
+	/* Every other command goes straight to the native handler. */
+	if (cmd != BNEPGETCONNLIST)
+		return bnep_sock_ioctl(sock, cmd, arg);
+
+	/* Compat layout as read here: u32 count at arg, u32 pointer at arg + 4. */
+	if (get_user(cl.cnum, (uint32_t __user *) arg) ||
+			get_user(uci, (u32 __user *) (arg + 4)))
+		return -EFAULT;
+
+	cl.ci = compat_ptr(uci);
+
+	if (cl.cnum <= 0)
+		return -EINVAL;
+
+	err = bnep_get_connlist(&cl);
+	if (err)
+		return err;
+
+	return put_user(cl.cnum, (uint32_t __user *) arg) ? -EFAULT : 0;
+}
+#endif
+
+static const struct proto_ops bnep_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= bnep_sock_release,
+	.ioctl		= bnep_sock_ioctl,	/* connection add/del/list/info */
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= bnep_sock_compat_ioctl,
+#endif
+	.bind		= sock_no_bind,		/* from here on: sock_no_* handlers only */
+	.getname	= sock_no_getname,
+	.sendmsg	= sock_no_sendmsg,
+	.recvmsg	= sock_no_recvmsg,
+	.poll		= sock_no_poll,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.mmap		= sock_no_mmap
+};
+
+static struct proto bnep_proto = {	/* handed to sk_alloc() in bnep_sock_create() */
+	.name		= "BNEP",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct bt_sock)
+};
+
+static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+
+	/* Only raw sockets are accepted for BNEP. */
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	/* struct socket side: BNEP operations, not connected. */
+	sock_init_data(sock, sk);
+	sock->state = SS_UNCONNECTED;
+	sock->ops = &bnep_sock_ops;
+
+	/* struct sock side: clear SOCK_ZAPPED, record protocol and state. */
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	sk->sk_state	= BT_OPEN;
+	sk->sk_protocol = protocol;
+
+	return 0;
+}
+
+static const struct net_proto_family bnep_sock_family_ops = {	/* registered for BTPROTO_BNEP */
+	.family = PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create = bnep_sock_create
+};
+
+int __init bnep_sock_init(void)
+{
+	int ret;
+
+	/* Register the BNEP proto first, then the BTPROTO_BNEP family. */
+	ret = proto_register(&bnep_proto, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops);
+	if (ret >= 0)
+		return 0;
+
+	/* Family registration failed: log it and undo the proto step. */
+	BT_ERR("Can't register BNEP socket");
+	proto_unregister(&bnep_proto);
+
+	return ret;
+}
+
+void __exit bnep_sock_cleanup(void)
+{
+	if (bt_sock_unregister(BTPROTO_BNEP) < 0)
+		BT_ERR("Can't unregister BNEP socket");
+	/* the proto is unregistered even when the call above failed */
+	proto_unregister(&bnep_proto);
+}
diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig
new file mode 100644
index 00000000..d6b0382f
--- /dev/null
+++ b/net/bluetooth/cmtp/Kconfig
@@ -0,0 +1,11 @@
+config BT_CMTP
+	tristate "CMTP protocol support"
+	depends on BT && BT_L2CAP && ISDN_CAPI
+	help
+	  CMTP (CAPI Message Transport Protocol) is a transport layer
+	  for CAPI messages.  CMTP is required for the Bluetooth Common
+	  ISDN Access Profile.
+
+	  Say Y here to compile CMTP support into the kernel or say M to
+	  compile it as a module (cmtp).
+
diff --git a/net/bluetooth/cmtp/Makefile b/net/bluetooth/cmtp/Makefile
new file mode 100644
index 00000000..890a9a5a
--- /dev/null
+++ b/net/bluetooth/cmtp/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Bluetooth CMTP layer
+#
+
+obj-$(CONFIG_BT_CMTP) += cmtp.o
+
+cmtp-objs := core.o sock.o capi.o
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
new file mode 100644
index 00000000..040f67b1
--- /dev/null
+++ b/net/bluetooth/cmtp/capi.c
@@ -0,0 +1,623 @@
+/*
+   CMTP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <linux/isdn/capilli.h>
+#include <linux/isdn/capicmd.h>
+#include <linux/isdn/capiutil.h>
+
+#include "cmtp.h"
+
+#define CAPI_INTEROPERABILITY		0x20
+/* Command/subcommand pairs for the interoperability command */
+#define CAPI_INTEROPERABILITY_REQ	CAPICMD(CAPI_INTEROPERABILITY, CAPI_REQ)
+#define CAPI_INTEROPERABILITY_CONF	CAPICMD(CAPI_INTEROPERABILITY, CAPI_CONF)
+#define CAPI_INTEROPERABILITY_IND	CAPICMD(CAPI_INTEROPERABILITY, CAPI_IND)
+#define CAPI_INTEROPERABILITY_RESP	CAPICMD(CAPI_INTEROPERABILITY, CAPI_RESP)
+/* Per-subcommand lengths: the CAPI base header plus a few bytes */
+#define CAPI_INTEROPERABILITY_REQ_LEN	(CAPI_MSG_BASELEN + 2)
+#define CAPI_INTEROPERABILITY_CONF_LEN	(CAPI_MSG_BASELEN + 4)
+#define CAPI_INTEROPERABILITY_IND_LEN	(CAPI_MSG_BASELEN + 2)
+#define CAPI_INTEROPERABILITY_RESP_LEN	(CAPI_MSG_BASELEN + 2)
+/* Function codes carried inside interoperability messages */
+#define CAPI_FUNCTION_REGISTER		0
+#define CAPI_FUNCTION_RELEASE		1
+#define CAPI_FUNCTION_GET_PROFILE	2
+#define CAPI_FUNCTION_GET_MANUFACTURER	3
+#define CAPI_FUNCTION_GET_VERSION	4
+#define CAPI_FUNCTION_GET_SERIAL_NUMBER	5
+#define CAPI_FUNCTION_MANUFACTURER	6
+#define CAPI_FUNCTION_LOOPBACK		7
+
+/* Match criteria accepted by cmtp_application_get() */
+#define CMTP_MSGNUM	1
+#define CMTP_APPLID	2
+#define CMTP_MAPPING	3
+
+static struct cmtp_application *cmtp_application_add(struct cmtp_session *session, __u16 appl)
+{
+	struct cmtp_application *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	BT_DBG("session %p application %p appl %d", session, entry, appl);
+
+	if (entry == NULL)
+		return NULL;
+
+	/* New entries start in BT_OPEN at the tail of the session's list. */
+	entry->appl = appl;
+	entry->state = BT_OPEN;
+	list_add_tail(&entry->list, &session->applications);
+	return entry;
+}
+
+static void cmtp_application_del(struct cmtp_session *session, struct cmtp_application *app)
+{
+	BT_DBG("session %p application %p", session, app);
+
+	if (!app)
+		return;		/* a NULL entry is accepted and ignored */
+	list_del(&app->list);
+	kfree(app);
+}
+
+static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value)
+{
+	struct cmtp_application *cur;
+
+	/*
+	 * 'pattern' selects the field compared with 'value'; unknown patterns match nothing.
+	 */
+	list_for_each_entry(cur, &session->applications, list) {
+		__u16 key;
+
+		if (pattern == CMTP_MSGNUM)
+			key = cur->msgnum;
+		else if (pattern == CMTP_APPLID)
+			key = cur->appl;
+		else if (pattern == CMTP_MAPPING)
+			key = cur->mapping;
+		else
+			continue;
+
+		if (key == value)
+			return cur;
+	}
+
+	return NULL;
+}
+
+static int cmtp_msgnum_get(struct cmtp_session *session)
+{
+	int next = session->msgnum + 1;
+
+	if ((next & 0xff) > 200)	/* low byte past 200: restart the sequence */
+		next = CMTP_INITIAL_MSGNUM + 1;
+	session->msgnum = next;
+	return next;
+}
+
+static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb)
+{
+	struct cmtp_scb *cb = (void *) skb->cb;
+
+	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+	/* Tag the skb: is it a DATA_B3 message, and no id assigned yet (-1). */
+	cb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3);
+	cb->id = -1;
+	/* Queue it for transmission and wake whoever sleeps on the socket. */
+	skb_queue_tail(&session->transmit, skb);
+	wake_up_interruptible(sk_sleep(session->sock->sk));
+}
+
+static void cmtp_send_interopmsg(struct cmtp_session *session,
+					__u8 subcmd, __u16 appl, __u16 msgnum,
+					__u16 function, unsigned char *buf, int len)
+{
+	const int total = CAPI_MSG_BASELEN + 6 + len;
+	unsigned char *msg;
+	struct sk_buff *skb;
+
+	BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum);
+
+	skb = alloc_skb(total, GFP_ATOMIC);
+	if (!skb) {
+		BT_ERR("Can't allocate memory for interoperability packet");
+		return;
+	}
+
+	msg = skb_put(skb, total);
+	/* Header: total length, application, command, subcommand, number */
+	capimsg_setu16(msg, 0, total);
+	capimsg_setu16(msg, 2, appl);
+	capimsg_setu8 (msg, 4, CAPI_INTEROPERABILITY);
+	capimsg_setu8 (msg, 5, subcmd);
+	capimsg_setu16(msg, 6, msgnum);
+
+	/* Interoperability selector (Bluetooth Device Management) */
+	capimsg_setu16(msg, 8, 0x0001);
+
+	/* Parameter block: its length, the function code, length-prefixed data */
+	capimsg_setu8 (msg, 10, 3 + len);
+	capimsg_setu16(msg, 11, function);
+	capimsg_setu8 (msg, 13, len);
+	if (len > 0)
+		memcpy(msg + 14, buf, len);
+	cmtp_send_capimsg(session, skb);
+}
+
+static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *skb)
+{
+	struct capi_ctr *ctrl = &session->ctrl;
+	struct cmtp_application *application;
+	__u16 appl, msgnum, func, info;
+	__u32 controller;
+
+	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+	/* Handle one interoperability message; skb is always freed on return. */
+	switch (CAPIMSG_SUBCOMMAND(skb->data)) {
+	case CAPI_CONF:
+		if (skb->len < CAPI_MSG_BASELEN + 10)
+			break;
+
+		func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 5);
+		info = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 8);
+
+		switch (func) {
+		case CAPI_FUNCTION_REGISTER:
+			msgnum = CAPIMSG_MSGID(skb->data);
+			/* match the pending registration by its message number */
+			application = cmtp_application_get(session, CMTP_MSGNUM, msgnum);
+			if (application) {
+				application->state = BT_CONNECTED;
+				application->msgnum = 0;
+				application->mapping = CAPIMSG_APPID(skb->data);
+				wake_up_interruptible(&session->wait);
+			}
+
+			break;
+
+		case CAPI_FUNCTION_RELEASE:
+			appl = CAPIMSG_APPID(skb->data);
+			/* match the application by its remote (mapped) id */
+			application = cmtp_application_get(session, CMTP_MAPPING, appl);
+			if (application) {
+				application->state = BT_CLOSED;
+				application->msgnum = 0;
+				wake_up_interruptible(&session->wait);
+			}
+
+			break;
+
+		case CAPI_FUNCTION_GET_PROFILE:
+			if (skb->len < CAPI_MSG_BASELEN + 11 + sizeof(capi_profile))
+				break;
+
+			controller = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 11);
+			msgnum = CAPIMSG_MSGID(skb->data);
+			/* the reply to the initial probe only yields the controller count */
+			if (!info && (msgnum == CMTP_INITIAL_MSGNUM)) {
+				session->ncontroller = controller;
+				wake_up_interruptible(&session->wait);
+				break;
+			}
+
+			if (!info && ctrl) {
+				memcpy(&ctrl->profile,
+					skb->data + CAPI_MSG_BASELEN + 11,
+					sizeof(capi_profile));
+				session->state = BT_CONNECTED;
+				capi_ctr_ready(ctrl);
+			}
+
+			break;
+
+		case CAPI_FUNCTION_GET_MANUFACTURER:
+			if (skb->len < CAPI_MSG_BASELEN + 15)
+				break;
+
+			controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 10);
+			/* clamp the length byte to the field size and to the bytes received */
+			if (!info && ctrl) {
+				int len = min_t(uint, CAPI_MANUFACTURER_LEN,
+						skb->data[CAPI_MSG_BASELEN + 14]);
+				len = min_t(uint, len, skb->len - CAPI_MSG_BASELEN - 15);
+				memset(ctrl->manu, 0, CAPI_MANUFACTURER_LEN);
+				strncpy(ctrl->manu,
+					skb->data + CAPI_MSG_BASELEN + 15, len);
+			}
+
+			break;
+
+		case CAPI_FUNCTION_GET_VERSION:
+			if (skb->len < CAPI_MSG_BASELEN + 32)
+				break;
+
+			controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12);
+
+			if (!info && ctrl) {
+				ctrl->version.majorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 16);
+				ctrl->version.minorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 20);
+				ctrl->version.majormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 24);
+				ctrl->version.minormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 28);
+			}
+
+			break;
+
+		case CAPI_FUNCTION_GET_SERIAL_NUMBER:
+			if (skb->len < CAPI_MSG_BASELEN + 17)
+				break;
+
+			controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12);
+			/* clamp the length byte to the field size and to the bytes received */
+			if (!info && ctrl) {
+				int len = min_t(uint, CAPI_SERIAL_LEN,
+						skb->data[CAPI_MSG_BASELEN + 16]);
+				len = min_t(uint, len, skb->len - CAPI_MSG_BASELEN - 17);
+				memset(ctrl->serial, 0, CAPI_SERIAL_LEN);
+				strncpy(ctrl->serial,
+					skb->data + CAPI_MSG_BASELEN + 17, len);
+			}
+
+			break;
+		}
+
+		break;
+
+	case CAPI_IND:
+		if (skb->len < CAPI_MSG_BASELEN + 6)
+			break;
+
+		func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 3);
+		/* LOOPBACK indication: send the payload back as a response */
+		if (func == CAPI_FUNCTION_LOOPBACK) {
+			int len = min_t(uint, skb->len - CAPI_MSG_BASELEN - 6,
+						skb->data[CAPI_MSG_BASELEN + 5]);
+			appl = CAPIMSG_APPID(skb->data);
+			msgnum = CAPIMSG_MSGID(skb->data);
+			cmtp_send_interopmsg(session, CAPI_RESP, appl, msgnum, func,
+						skb->data + CAPI_MSG_BASELEN + 6, len);
+		}
+
+		break;
+	}
+
+	kfree_skb(skb);
+}
+
+void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
+{
+	struct capi_ctr *ctrl = &session->ctrl;
+	struct cmtp_application *application;
+	__u16 appl;
+	__u32 contr;
+
+	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+	/* A too-short message is freed like every other rejected message. */
+	if (skb->len < CAPI_MSG_BASELEN)
+		goto drop;
+
+	if (CAPIMSG_COMMAND(skb->data) == CAPI_INTEROPERABILITY) {
+		cmtp_recv_interopmsg(session, skb);
+		return;
+	}
+
+	if (session->flags & (1 << CMTP_LOOPBACK))
+		goto drop;
+
+	appl = CAPIMSG_APPID(skb->data);
+	contr = CAPIMSG_CONTROL(skb->data);
+	/* Translate the remote (mapped) application id to the local one. */
+	application = cmtp_application_get(session, CMTP_MAPPING, appl);
+	if (!application) {
+		BT_ERR("Can't find application with id %d", appl);
+		goto drop;
+	}
+	appl = application->appl;
+	CAPIMSG_SETAPPID(skb->data, appl);
+
+	if ((contr & 0x7f) == 0x01) {
+		contr = (contr & 0xffffff80) | session->num;
+		CAPIMSG_SETCONTROL(skb->data, contr);
+	}
+
+	if (!ctrl) {
+		BT_ERR("Can't find controller %d for message", session->num);
+		goto drop;
+	}
+
+	capi_ctr_handle_message(ctrl, appl, skb);
+	return;
+
+drop:
+	kfree_skb(skb);
+}
+
+static int cmtp_load_firmware(struct capi_ctr *ctrl, capiloaddata *data)
+{
+	BT_DBG("ctrl %p data %p", ctrl, data);
+	/* Nothing is loaded here; success is always reported. */
+	return 0;
+}
+
+static void cmtp_reset_ctr(struct capi_ctr *ctrl)
+{
+	struct cmtp_session *session = ctrl->driverdata;
+
+	BT_DBG("ctrl %p", ctrl);
+
+	capi_ctr_down(ctrl);
+	/* then stop the session's kernel thread */
+	kthread_stop(session->task);
+}
+
+static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_params *rp)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct cmtp_session *session = ctrl->driverdata;
+	struct cmtp_application *application;
+	unsigned long timeo = CMTP_INTEROP_TIMEOUT;
+	unsigned char buf[8];
+	int err = 0, nconn, want = rp->level3cnt;
+
+	BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d",
+		ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen);
+	/* Track the application locally, then register it with the remote side. */
+	application = cmtp_application_add(session, appl);
+	if (!application) {
+		BT_ERR("Can't allocate memory for new application");
+		return;
+	}
+	/* A negative level3cnt asks for that many connections per B channel. */
+	if (want < 0)
+		nconn = ctrl->profile.nbchannel * -want;
+	else
+		nconn = want;
+	/* Zero falls back to one connection per B channel. */
+	if (nconn == 0)
+		nconn = ctrl->profile.nbchannel;
+	/* REGISTER payload: connections, data block count, data block length */
+	capimsg_setu16(buf, 0, nconn);
+	capimsg_setu16(buf, 2, rp->datablkcnt);
+	capimsg_setu16(buf, 4, rp->datablklen);
+
+	application->state = BT_CONFIG;
+	application->msgnum = cmtp_msgnum_get(session);
+
+	cmtp_send_interopmsg(session, CAPI_REQ, 0x0000, application->msgnum,
+				CAPI_FUNCTION_REGISTER, buf, 6);
+	/* Sleep until the state changes, a signal arrives or the timeout expires. */
+	add_wait_queue(&session->wait, &wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		if (application->state == BT_CLOSED) {
+			err = -application->err;
+			break;
+		}
+
+		if (application->state == BT_CONNECTED)
+			break;
+
+		if (signal_pending(current)) {
+			err = -EINTR;
+			break;
+		}
+
+		timeo = schedule_timeout(timeo);
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&session->wait, &wait);
+	/* On any failure the local entry is removed again. */
+	if (err) {
+		cmtp_application_del(session, application);
+		return;
+	}
+}
+
+static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl)
+{
+	struct cmtp_session *session = ctrl->driverdata;
+	struct cmtp_application *app;
+
+	BT_DBG("ctrl %p appl %d", ctrl, appl);
+
+	app = cmtp_application_get(session, CMTP_APPLID, appl);
+	if (app == NULL) {
+		BT_ERR("Can't find application");
+		return;
+	}
+
+	/* Send a RELEASE request addressed to the remote (mapped) id. */
+	app->msgnum = cmtp_msgnum_get(session);
+	cmtp_send_interopmsg(session, CAPI_REQ, app->mapping, app->msgnum,
+				CAPI_FUNCTION_RELEASE, NULL, 0);
+
+	/* The entry is removed whether or not BT_CLOSED was reached in time. */
+	wait_event_interruptible_timeout(session->wait,
+			(app->state == BT_CLOSED), CMTP_INTEROP_TIMEOUT);
+	cmtp_application_del(session, app);
+}
+
+static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
+{
+	struct cmtp_session *session = ctrl->driverdata;
+	__u32 contr = CAPIMSG_CONTROL(skb->data);
+	__u16 appl = CAPIMSG_APPID(skb->data);
+	struct cmtp_application *app;
+
+	BT_DBG("ctrl %p skb %p", ctrl, skb);
+
+	/* Only applications that reached BT_CONNECTED may send. */
+	app = cmtp_application_get(session, CMTP_APPLID, appl);
+	if (app == NULL || app->state != BT_CONNECTED) {
+		BT_ERR("Can't find application with id %d", appl);
+		return CAPI_ILLAPPNR;
+	}
+
+	/* Outgoing messages carry the remote (mapped) application id... */
+	CAPIMSG_SETAPPID(skb->data, app->mapping);
+
+	/* ...and the local controller number is rewritten to 1. */
+	if ((contr & 0x7f) == session->num) {
+		contr = (contr & 0xffffff80) | 0x01;
+		CAPIMSG_SETCONTROL(skb->data, contr);
+	}
+
+	cmtp_send_capimsg(session, skb);
+
+	return CAPI_NOERROR;
+}
+
+static char *cmtp_procinfo(struct capi_ctr *ctrl)
+{
+	return "CAPI Message Transport Protocol";	/* same string for any ctrl */
+}
+
+static int cmtp_proc_show(struct seq_file *m, void *v)
+{
+	struct capi_ctr *ctrl = m->private;
+	struct cmtp_session *session = ctrl->driverdata;
+	struct cmtp_application *entry;
+
+	/* Header: description string, session name, controller number. */
+	seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl));
+	seq_printf(m, "addr %s\n", session->name);
+	seq_printf(m, "ctrl %d\n", session->num);
+
+	/* One line per application: local id -> remote mapping. */
+	list_for_each_entry(entry, &session->applications, list)
+		seq_printf(m, "appl %d -> %d\n",
+				entry->appl, entry->mapping);
+
+	return 0;
+}
+
+static int cmtp_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cmtp_proc_show, PDE(inode)->data);
+}
+/* File operations referenced by session->ctrl.proc_fops in cmtp_attach_device(). */
+static const struct file_operations cmtp_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cmtp_proc_open,	/* binds cmtp_proc_show() via single_open() */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+int cmtp_attach_device(struct cmtp_session *session)
+{
+	unsigned char buf[4];
+	long ret;
+
+	BT_DBG("session %p", session);
+	/* Probe: GET_PROFILE with a zero payload and the initial message number. */
+	capimsg_setu32(buf, 0, 0);
+
+	cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, CMTP_INITIAL_MSGNUM,
+				CAPI_FUNCTION_GET_PROFILE, buf, 4);
+	/* cmtp_recv_interopmsg() stores the controller count and wakes us. */
+	ret = wait_event_interruptible_timeout(session->wait,
+			session->ncontroller, CMTP_INTEROP_TIMEOUT);
+
+	BT_INFO("Found %d CAPI controller(s) on device %s", session->ncontroller, session->name);
+
+	if (!ret)
+		return -ETIMEDOUT;
+
+	if (!session->ncontroller)
+		return -ENODEV;
+
+	if (session->ncontroller > 1)
+		BT_INFO("Setting up only CAPI controller 1");
+	/* Fill in the CAPI controller that is backed by this session. */
+	session->ctrl.owner      = THIS_MODULE;
+	session->ctrl.driverdata = session;
+	strcpy(session->ctrl.name, session->name);
+
+	session->ctrl.driver_name   = "cmtp";
+	session->ctrl.load_firmware = cmtp_load_firmware;
+	session->ctrl.reset_ctr     = cmtp_reset_ctr;
+	session->ctrl.register_appl = cmtp_register_appl;
+	session->ctrl.release_appl  = cmtp_release_appl;
+	session->ctrl.send_message  = cmtp_send_message;
+
+	session->ctrl.procinfo      = cmtp_procinfo;
+	session->ctrl.proc_fops = &cmtp_proc_fops;
+
+	if (attach_capi_ctr(&session->ctrl) < 0) {
+		BT_ERR("Can't attach new controller");
+		return -EBUSY;
+	}
+	/* NOTE(review): ctrl.cnr is presumably assigned by attach_capi_ctr() - confirm */
+	session->num = session->ctrl.cnr;
+
+	BT_DBG("session %p num %d", session, session->num);
+	/* Query controller 1; this function does not wait for the replies. */
+	capimsg_setu32(buf, 0, 1);
+
+	cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+				CAPI_FUNCTION_GET_MANUFACTURER, buf, 4);
+
+	cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+				CAPI_FUNCTION_GET_VERSION, buf, 4);
+
+	cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+				CAPI_FUNCTION_GET_SERIAL_NUMBER, buf, 4);
+
+	cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session),
+				CAPI_FUNCTION_GET_PROFILE, buf, 4);
+
+	return 0;
+}
+
+void cmtp_detach_device(struct cmtp_session *session)
+{
+	BT_DBG("session %p", session);
+	/* counterpart of the attach_capi_ctr() call in cmtp_attach_device() */
+	detach_capi_ctr(&session->ctrl);
+}
diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h
new file mode 100644
index 00000000..db43b54a
--- /dev/null
+++ b/net/bluetooth/cmtp/cmtp.h
@@ -0,0 +1,128 @@
+/*
+   CMTP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#ifndef __CMTP_H
+#define __CMTP_H
+
+#include <linux/types.h>
+#include <net/bluetooth/bluetooth.h>
+
+#define BTNAMSIZ 18
+
+/* CMTP ioctl defines */
+#define CMTPCONNADD	_IOW('C', 200, int)
+#define CMTPCONNDEL	_IOW('C', 201, int)
+#define CMTPGETCONNLIST	_IOR('C', 210, int)
+#define CMTPGETCONNINFO	_IOR('C', 211, int)
+
+#define CMTP_LOOPBACK	0	/* bit number, tested as (1 << CMTP_LOOPBACK) in session flags */
+
+struct cmtp_connadd_req {
+	int   sock;	/* Connected socket */
+	__u32 flags;	/* NOTE(review): presumably CMTP_* flag bits such as CMTP_LOOPBACK - confirm */
+};
+
+struct cmtp_conndel_req {
+	bdaddr_t bdaddr;	/* NOTE(review): presumably selects the session to delete - confirm */
+	__u32    flags;
+};
+
+struct cmtp_conninfo {
+	bdaddr_t bdaddr;
+	__u32    flags;
+	__u16    state;
+	int      num;	/* NOTE(review): presumably the CAPI controller number - confirm */
+};
+
+struct cmtp_connlist_req {
+	__u32  cnum;	/* NOTE(review): presumably the number of entries in ci - confirm */
+	struct cmtp_conninfo __user *ci;	/* user-space array of entries */
+};
+
+int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock);
+int cmtp_del_connection(struct cmtp_conndel_req *req);
+int cmtp_get_connlist(struct cmtp_connlist_req *req);
+int cmtp_get_conninfo(struct cmtp_conninfo *ci);
+
+/* CMTP session defines */
+#define CMTP_INTEROP_TIMEOUT	(HZ * 5)	/* five seconds, in jiffies */
+#define CMTP_INITIAL_MSGNUM	0xff00		/* message number of the first GET_PROFILE probe */
+
+struct cmtp_session {
+	struct list_head list;
+
+	struct socket *sock;
+
+	bdaddr_t bdaddr;
+
+	unsigned long state;
+	unsigned long flags;	/* bit numbers such as CMTP_LOOPBACK */
+
+	uint mtu;
+
+	char name[BTNAMSIZ];	/* also copied into ctrl.name */
+
+	struct task_struct *task;	/* kernel thread, stopped by cmtp_reset_ctr() */
+
+	wait_queue_head_t wait;	/* woken when interoperability replies arrive */
+
+	int ncontroller;	/* controller count from the GET_PROFILE probe */
+	int num;		/* local controller number (ctrl.cnr) */
+	struct capi_ctr ctrl;
+
+	struct list_head applications;	/* list of struct cmtp_application */
+
+	unsigned long blockids;
+	int msgnum;		/* last number returned by cmtp_msgnum_get() */
+
+	struct sk_buff_head transmit;	/* CAPI messages queued for sending */
+
+	struct sk_buff *reassembly[16];
+};
+
+struct cmtp_application {
+	struct list_head list;	/* linked into cmtp_session.applications */
+
+	unsigned long state;	/* BT_OPEN, BT_CONFIG, BT_CONNECTED or BT_CLOSED */
+	int err;		/* negated and used as the register error once BT_CLOSED */
+
+	__u16 appl;		/* local application id */
+	__u16 mapping;		/* application id used on the remote side */
+
+	__u16 msgnum;		/* number of the pending request, 0 once answered */
+};
+
+struct cmtp_scb {	/* overlaid on skb->cb by cmtp_send_capimsg() */
+	int id;		/* initialised to -1 when the skb is queued */
+	int data;	/* non-zero when the message is CAPI_DATA_B3 */
+};
+
+int  cmtp_attach_device(struct cmtp_session *session);
+void cmtp_detach_device(struct cmtp_session *session);
+
+void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb);
+
+/* CMTP init defines */
+int cmtp_init_sockets(void);
+void cmtp_cleanup_sockets(void);
+
+#endif /* __CMTP_H */
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
new file mode 100644
index 00000000..c5b11af9
--- /dev/null
+++ b/net/bluetooth/cmtp/core.c
@@ -0,0 +1,497 @@
+/*
+   CMTP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/freezer.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <linux/isdn/capilli.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "cmtp.h"
+
+#define VERSION "1.0"
+
+static DECLARE_RWSEM(cmtp_session_sem);	/* taken around every access to cmtp_session_list */
+static LIST_HEAD(cmtp_session_list);	/* all currently linked sessions */
+
+static struct cmtp_session *__cmtp_get_session(bdaddr_t *bdaddr)
+{
+	struct cmtp_session *s;
+
+	BT_DBG("");
+
+	/* Linear scan for the session whose peer address equals @bdaddr
+	 * (bacmp() is zero on a match).  Takes no lock of its own. */
+	list_for_each_entry(s, &cmtp_session_list, list) {
+		if (bacmp(bdaddr, &s->bdaddr) == 0)
+			return s;
+	}
+	return NULL;
+}
+
+static void __cmtp_link_session(struct cmtp_session *session)
+{
+	__module_get(THIS_MODULE);	/* one module reference per linked session */
+	list_add(&session->list, &cmtp_session_list);	/* takes no lock itself */
+}
+
+static void __cmtp_unlink_session(struct cmtp_session *session)
+{
+	list_del(&session->list);	/* takes no lock itself */
+	module_put(THIS_MODULE);	/* drops the reference taken when the session was linked */
+}
+
+static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_conninfo *ci)
+{
+	memset(ci, 0, sizeof(*ci));	/* clear first so padding is not copied out uninitialised */
+	bacpy(&ci->bdaddr, &session->bdaddr);
+	/* Snapshot the session for user space; state is narrowed to __u16. */
+	ci->flags = session->flags;
+	ci->state = session->state;
+
+	ci->num = session->num;
+}
+
+
+static inline int cmtp_alloc_block_id(struct cmtp_session *session)
+{
+	int bit;
+
+	/* Claim the lowest clear bit among the 16 block ids; the atomic
+	 * test_and_set_bit() both checks and reserves the slot. */
+	for (bit = 0; bit < 16; bit++) {
+		if (!test_and_set_bit(bit, &session->blockids))
+			return bit;
+	}
+	return -1;	/* all 16 ids are in use */
+}
+
+static inline void cmtp_free_block_id(struct cmtp_session *session, int id)
+{
+	clear_bit(id, &session->blockids);	/* make block id @id available again */
+}
+
+static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const unsigned char *buf, int count)
+{
+	struct sk_buff *old = session->reassembly[id];
+	struct sk_buff *merged;
+	int total = count;
+
+	BT_DBG("session %p buf %p count %d", session, buf, count);
+
+	/* The new buffer holds whatever was gathered so far plus this part. */
+	if (old)
+		total += old->len;
+
+	merged = alloc_skb(total, GFP_ATOMIC);
+	if (!merged) {
+		/* On failure the previous partial buffer stays in its slot. */
+		BT_ERR("Can't allocate memory for CAPI message");
+		return;
+	}
+	if (old && old->len > 0)
+		skb_copy_from_linear_data(old, skb_put(merged, old->len), old->len);
+	memcpy(skb_put(merged, count), buf, count);
+	session->reassembly[id] = merged;
+	kfree_skb(old);	/* replaced by the merged copy */
+}
+
+static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb)
+{
+	__u8 hdr, hdrlen, id;
+	__u16 len;
+
+	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+	/* Split @skb into CMTP blocks, reassemble and deliver them; consumes @skb. */
+	while (skb->len > 0) {
+		hdr = skb->data[0];
+		/* Bits 7-6 give the header size; the length bytes are read later. */
+		switch (hdr & 0xc0) {
+		case 0x40:
+			hdrlen = 2;
+			break;
+		case 0x80:
+			hdrlen = 3;
+			break;
+		default:
+			hdrlen = 1;
+			break;
+		}
+		/* Do not read length bytes that lie beyond the received data. */
+		if (hdrlen > skb->len) {
+			BT_ERR("Wrong size or header information in CMTP frame");
+			break;
+		}
+		len = (hdrlen > 1) ? skb->data[1] : 0;
+		if (hdrlen > 2)
+			len |= skb->data[2] << 8;	/* 16-bit length, low byte first */
+		id = (hdr & 0x3c) >> 2;	/* bits 5-2: reassembly slot */
+
+		BT_DBG("hdr 0x%02x hdrlen %d len %d id %d", hdr, hdrlen, len, id);
+
+		if (hdrlen + len > skb->len) {
+			BT_ERR("Wrong size or header information in CMTP frame");
+			break;
+		}
+		if (len == 0) {
+			skb_pull(skb, hdrlen);
+			continue;
+		}
+		switch (hdr & 0x03) {
+		case 0x00:	/* last part: finish the message and hand it on */
+			cmtp_add_msgpart(session, id, skb->data + hdrlen, len);
+			cmtp_recv_capimsg(session, session->reassembly[id]);
+			session->reassembly[id] = NULL;
+			break;
+		case 0x01:	/* more parts follow: keep collecting */
+			cmtp_add_msgpart(session, id, skb->data + hdrlen, len);
+			break;
+		default:	/* any other value discards the partial message */
+			kfree_skb(session->reassembly[id]);
+			session->reassembly[id] = NULL;
+			break;
+		}
+		skb_pull(skb, hdrlen + len);
+	}
+	kfree_skb(skb);
+	return 0;
+}
+
+static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, int len)
+{
+	struct kvec vec = { .iov_base = data, .iov_len = len };
+	struct msghdr hdr;
+
+	BT_DBG("session %p data %p len %d", session, data, len);
+
+	/* Nothing buffered: report success without touching the socket. */
+	if (len == 0)
+		return 0;
+
+	/* All-zero msghdr; the payload travels in the single kvec. */
+	memset(&hdr, 0, sizeof(hdr));
+	return kernel_sendmsg(session->sock, &hdr, &vec, 1, len);
+}
+
+static void cmtp_process_transmit(struct cmtp_session *session)
+{
+	struct sk_buff *skb, *nskb;
+	unsigned char *hdr;
+	unsigned int size, tail;
+
+	BT_DBG("session %p", session);
+	/* Pack queued messages into frames of at most session->mtu bytes and send them. */
+	nskb = alloc_skb(session->mtu, GFP_ATOMIC);	/* staging buffer for one outgoing frame */
+	if (!nskb) {
+		BT_ERR("Can't allocate memory for new frame");
+		return;
+	}
+
+	while ((skb = skb_dequeue(&session->transmit))) {
+		struct cmtp_scb *scb = (void *) skb->cb;	/* block id and flush flag kept in skb->cb */
+
+		tail = session->mtu - nskb->len;	/* room left in the staging buffer */
+		if (tail < 5) {	/* too little room: send what is staged and start afresh */
+			cmtp_send_frame(session, nskb->data, nskb->len);
+			skb_trim(nskb, 0);
+			tail = session->mtu;
+		}
+		/* Leave room for the 2- or 3-byte block header chosen below. */
+		size = min_t(uint, ((tail < 258) ? (tail - 2) : (tail - 3)), skb->len);
+
+		if (scb->id < 0) {	/* first part of this message: it needs a block id */
+			scb->id = cmtp_alloc_block_id(session);
+			if (scb->id < 0) {	/* none free: put the message back and stop for now */
+				skb_queue_head(&session->transmit, skb);
+				break;
+			}
+		}
+
+		if (size < 256) {	/* short header: one length byte */
+			hdr = skb_put(nskb, 2);
+			hdr[0] = 0x40
+				| ((scb->id << 2) & 0x3c)
+				| ((skb->len == size) ? 0x00 : 0x01);
+			hdr[1] = size;
+		} else {	/* long header: two length bytes, low byte first */
+			hdr = skb_put(nskb, 3);
+			hdr[0] = 0x80
+				| ((scb->id << 2) & 0x3c)
+				| ((skb->len == size) ? 0x00 : 0x01);
+			hdr[1] = size & 0xff;
+			hdr[2] = size >> 8;
+		}
+		/* Low header bits: 0x00 when this part ends the message, 0x01 otherwise. */
+		skb_copy_from_linear_data(skb, skb_put(nskb, size), size);
+		skb_pull(skb, size);
+
+		if (skb->len > 0) {	/* remainder goes out with the next frame */
+			skb_queue_head(&session->transmit, skb);
+		} else {
+			cmtp_free_block_id(session, scb->id);
+			if (scb->data) {	/* flagged message: flush the frame right away */
+				cmtp_send_frame(session, nskb->data, nskb->len);
+				skb_trim(nskb, 0);
+			}
+			kfree_skb(skb);
+		}
+	}
+	/* Send whatever is still staged; cmtp_send_frame() ignores a zero length. */
+	cmtp_send_frame(session, nskb->data, nskb->len);
+
+	kfree_skb(nskb);
+}
+
+static int cmtp_session(void *arg)
+{
+	struct cmtp_session *session = arg;
+	struct sock *sk = session->sock->sk;
+	struct sk_buff *skb;
+	wait_queue_t wait;
+
+	BT_DBG("session %p", session);
+	/* Session kthread: pump received frames and queued transmissions until stopped. */
+	set_user_nice(current, -15);
+
+	init_waitqueue_entry(&wait, current);
+	add_wait_queue(sk_sleep(sk), &wait);	/* woken through the socket's wait queue */
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);	/* set before the checks, so schedule() below can sleep */
+
+		if (sk->sk_state != BT_CONNECTED)	/* socket no longer connected: end the session */
+			break;
+
+		while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+			skb_orphan(skb);
+			cmtp_recv_frame(session, skb);	/* consumes skb */
+		}
+
+		cmtp_process_transmit(session);
+
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	/* Teardown runs under the write lock so list readers never see a dying session. */
+	down_write(&cmtp_session_sem);
+
+	if (!(session->flags & (1 << CMTP_LOOPBACK)))
+		cmtp_detach_device(session);
+
+	fput(session->sock->file);	/* drops a reference on the socket's file */
+
+	__cmtp_unlink_session(session);
+
+	up_write(&cmtp_session_sem);
+
+	kfree(session);	/* the thread frees its own session */
+	return 0;
+}
+
+int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
+{
+	struct cmtp_session *session, *s;
+	int i, err;
+
+	BT_DBG("");
+	/* Create a session for @sock, link it and start its kthread; 0 or -errno. */
+	session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL);
+	if (!session)
+		return -ENOMEM;
+
+	down_write(&cmtp_session_sem);
+
+	s = __cmtp_get_session(&bt_sk(sock->sk)->dst);
+	if (s && s->state == BT_CONNECTED) {	/* refuse a second connected session to this peer */
+		err = -EEXIST;
+		goto failed;
+	}
+
+	bacpy(&session->bdaddr, &bt_sk(sock->sk)->dst);
+	/* Use the smaller of the channel's outgoing and incoming MTU. */
+	session->mtu = min_t(uint, l2cap_pi(sock->sk)->chan->omtu,
+					l2cap_pi(sock->sk)->chan->imtu);
+
+	BT_DBG("mtu %d", session->mtu);
+
+	sprintf(session->name, "%s", batostr(&bt_sk(sock->sk)->dst));
+
+	session->sock  = sock;
+	session->state = BT_CONFIG;
+
+	init_waitqueue_head(&session->wait);
+
+	session->msgnum = CMTP_INITIAL_MSGNUM;
+
+	INIT_LIST_HEAD(&session->applications);
+
+	skb_queue_head_init(&session->transmit);
+
+	for (i = 0; i < 16; i++)
+		session->reassembly[i] = NULL;
+
+	session->flags = req->flags;
+
+	__cmtp_link_session(session);
+	/* session->num is not assigned in this function before here: still the kzalloc() zero. */
+	session->task = kthread_run(cmtp_session, session, "kcmtpd_ctr_%d",
+								session->num);
+	if (IS_ERR(session->task)) {
+		err = PTR_ERR(session->task);
+		goto unlink;
+	}
+
+	if (!(session->flags & (1 << CMTP_LOOPBACK))) {	/* loopback sessions get no CAPI device */
+		err = cmtp_attach_device(session);
+		if (err < 0)
+			goto detach;
+	}
+
+	up_write(&cmtp_session_sem);
+	return 0;
+	/* NOTE(review): on the detach path the kthread is already running when session is freed - confirm. */
+detach:
+	cmtp_detach_device(session);
+
+unlink:
+	__cmtp_unlink_session(session);
+
+failed:
+	up_write(&cmtp_session_sem);
+	kfree(session);
+	return err;
+}
+
+int cmtp_del_connection(struct cmtp_conndel_req *req)
+{
+	struct task_struct *task;
+	struct cmtp_session *session;
+
+	BT_DBG("");
+	/* Stop the session for req->bdaddr; 0 on success, -ENOENT if there is none. */
+	down_read(&cmtp_session_sem);
+	session = __cmtp_get_session(&req->bdaddr);
+	if (!session) {
+		up_read(&cmtp_session_sem);
+		return -ENOENT;
+	}
+	skb_queue_purge(&session->transmit);	/* flush the transmit queue */
+	task = session->task;
+	get_task_struct(task);	/* keep the task valid once the lock is dropped */
+	up_read(&cmtp_session_sem);
+	/* The exiting thread takes the semaphore for writing: stop it unlocked. */
+	kthread_stop(task);
+	put_task_struct(task);
+	return 0;
+}
+
+int cmtp_get_connlist(struct cmtp_connlist_req *req)
+{
+	struct cmtp_session *session;
+	struct cmtp_conninfo snap;
+	int copied = 0;
+	int err = 0;
+
+	BT_DBG("");
+
+	/* Hold the list stable for the whole walk. */
+	down_read(&cmtp_session_sem);
+
+	list_for_each_entry(session, &cmtp_session_list, list) {
+		__cmtp_copy_session(session, &snap);
+
+		/* A failed copy ends the walk; that entry is not counted. */
+		if (copy_to_user(req->ci, &snap, sizeof(snap))) {
+			err = -EFAULT;
+			break;
+		}
+
+		copied++;
+		if (copied >= req->cnum)
+			break;	/* the caller's array is full */
+		/* req->ci itself is advanced, so the caller sees the moved pointer. */
+		req->ci++;
+	}
+	req->cnum = copied;	/* report how many entries were written */
+
+	up_read(&cmtp_session_sem);
+	return err;
+}
+
+int cmtp_get_conninfo(struct cmtp_conninfo *ci)
+{
+	struct cmtp_session *found;
+	int err = -ENOENT;
+
+	down_read(&cmtp_session_sem);
+	/* ci->bdaddr is the lookup key; on a hit *ci is overwritten with the
+	 * session's snapshot, otherwise it is left untouched. */
+	found = __cmtp_get_session(&ci->bdaddr);
+	if (found) {
+		__cmtp_copy_session(found, ci);
+		err = 0;
+	}
+	up_read(&cmtp_session_sem);
+	return err;
+}
+
+
+static int __init cmtp_init(void)
+{
+	BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION);
+
+	/* Module entry point.  Propagate a socket-layer registration failure
+	 * so that a half-initialised module is never left loaded. */
+	return cmtp_init_sockets();
+}
+
+static void __exit cmtp_exit(void)
+{
+	cmtp_cleanup_sockets();	/* module exit: unregister the CMTP socket family and proto */
+}
+
+module_init(cmtp_init);
+module_exit(cmtp_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth CMTP ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-5");	/* NOTE(review): 5 is presumably the value of BTPROTO_CMTP - confirm */
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
new file mode 100644
index 00000000..3f2dd5c2
--- /dev/null
+++ b/net/bluetooth/cmtp/sock.c
@@ -0,0 +1,253 @@
+/*
+   CMTP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/gfp.h>
+#include <linux/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/isdn/capilli.h>
+
+#include <asm/system.h>
+
+#include "cmtp.h"
+
+static int cmtp_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	/* With a sock attached: orphan it and drop a reference.  Either
+	 * way the release reports success. */
+	if (sk) {
+		sock_orphan(sk);
+		sock_put(sk);
+	}
+	return 0;
+}
+
+static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct cmtp_connadd_req ca;
+	struct cmtp_conndel_req cd;
+	struct cmtp_connlist_req cl;
+	struct cmtp_conninfo ci;
+	struct socket *nsock;
+	void __user *argp = (void __user *)arg;
+	int err;
+
+	BT_DBG("cmd %x arg %lx", cmd, arg);
+	/* Dispatch the four CMTP ioctls; any other command yields -EINVAL. */
+	switch (cmd) {
+	case CMTPCONNADD:
+		if (!capable(CAP_NET_ADMIN))	/* adding a session needs CAP_NET_ADMIN */
+			return -EACCES;
+
+		if (copy_from_user(&ca, argp, sizeof(ca)))
+			return -EFAULT;
+
+		nsock = sockfd_lookup(ca.sock, &err);	/* takes a reference on the socket's file */
+		if (!nsock)
+			return err;
+		/* NOTE(review): only sk_state is checked, yet cmtp_add_connection() uses l2cap_pi() on it - confirm. */
+		if (nsock->sk->sk_state != BT_CONNECTED) {
+			sockfd_put(nsock);
+			return -EBADFD;
+		}
+
+		err = cmtp_add_connection(&ca, nsock);
+		if (!err) {	/* success: the file reference is kept for the session */
+			if (copy_to_user(argp, &ca, sizeof(ca)))
+				err = -EFAULT;
+		} else
+			sockfd_put(nsock);
+
+		return err;
+
+	case CMTPCONNDEL:
+		if (!capable(CAP_NET_ADMIN))	/* removing a session needs CAP_NET_ADMIN */
+			return -EACCES;
+
+		if (copy_from_user(&cd, argp, sizeof(cd)))
+			return -EFAULT;
+
+		return cmtp_del_connection(&cd);
+
+	case CMTPGETCONNLIST:
+		if (copy_from_user(&cl, argp, sizeof(cl)))
+			return -EFAULT;
+
+		if (cl.cnum <= 0)	/* cnum is unsigned, so this rejects only zero */
+			return -EINVAL;
+
+		err = cmtp_get_connlist(&cl);
+		if (!err && copy_to_user(argp, &cl, sizeof(cl)))	/* writes back the whole request */
+			return -EFAULT;
+
+		return err;
+
+	case CMTPGETCONNINFO:
+		if (copy_from_user(&ci, argp, sizeof(ci)))
+			return -EFAULT;
+
+		err = cmtp_get_conninfo(&ci);
+		if (!err && copy_to_user(argp, &ci, sizeof(ci)))
+			return -EFAULT;
+
+		return err;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == CMTPGETCONNLIST) {	/* the only request rebuilt by hand: it carries a user pointer */
+		struct cmtp_connlist_req cl;
+		uint32_t uci;
+		int err;
+		/* Compat layout read here: 32-bit count at offset 0, 32-bit pointer at offset 4. */
+		if (get_user(cl.cnum, (uint32_t __user *) arg) ||
+				get_user(uci, (u32 __user *) (arg + 4)))
+			return -EFAULT;
+
+		cl.ci = compat_ptr(uci);
+
+		if (cl.cnum <= 0)	/* cnum is unsigned, so this rejects only zero */
+			return -EINVAL;
+
+		err = cmtp_get_connlist(&cl);
+		/* Only the count is written back to the caller. */
+		if (!err && put_user(cl.cnum, (uint32_t __user *) arg))
+			err = -EFAULT;
+
+		return err;
+	}
+
+	return cmtp_sock_ioctl(sock, cmd, arg);	/* every other command goes to the native handler */
+}
+#endif
+
+static const struct proto_ops cmtp_sock_ops = {	/* only release and ioctl are implemented */
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= cmtp_sock_release,
+	.ioctl		= cmtp_sock_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= cmtp_sock_compat_ioctl,
+#endif
+	.bind		= sock_no_bind,	/* this and all entries below are the generic sock_no_* stubs */
+	.getname	= sock_no_getname,
+	.sendmsg	= sock_no_sendmsg,
+	.recvmsg	= sock_no_recvmsg,
+	.poll		= sock_no_poll,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.mmap		= sock_no_mmap
+};
+
+static struct proto cmtp_proto = {	/* registered by cmtp_init_sockets() */
+	.name		= "CMTP",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct bt_sock)	/* each sock is allocated as a struct bt_sock */
+};
+
+static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+	/* Create handler for the CMTP family: only SOCK_RAW sockets are accepted. */
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	sock->ops = &cmtp_sock_ops;
+
+	sock->state = SS_UNCONNECTED;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	sk->sk_protocol = protocol;
+	sk->sk_state    = BT_OPEN;	/* initial state of every new CMTP socket */
+
+	return 0;
+}
+
+static const struct net_proto_family cmtp_sock_family_ops = {	/* passed to bt_sock_register() */
+	.family	= PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create	= cmtp_sock_create
+};
+
+int cmtp_init_sockets(void)
+{
+	int ret = proto_register(&cmtp_proto, 0);
+
+	if (ret < 0)
+		return ret;
+
+	/* The proto is registered; now hook the CMTP family into the
+	 * Bluetooth socket layer. */
+	ret = bt_sock_register(BTPROTO_CMTP, &cmtp_sock_family_ops);
+	if (ret >= 0)
+		return 0;
+
+	/* Second step failed: report it and undo the first one. */
+	BT_ERR("Can't register CMTP socket");
+	proto_unregister(&cmtp_proto);
+
+	return ret;
+}
+
+void cmtp_cleanup_sockets(void)
+{
+	if (bt_sock_unregister(BTPROTO_CMTP) < 0)	/* a failure is logged, not propagated */
+		BT_ERR("Can't unregister CMTP socket");
+	/* The proto is unregistered regardless of the result above. */
+	proto_unregister(&cmtp_proto);
+}
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
new file mode 100644
index 00000000..33c4e0cd
--- /dev/null
+++ b/net/bluetooth/hci_conn.c
@@ -0,0 +1,991 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI connection handling. */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+static void hci_le_connect(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_create_conn cp;
+	/* Mark @conn as an outgoing master connection and send LE Create Connection. */
+	conn->state = BT_CONNECT;
+	conn->out = 1;
+	conn->link_mode |= HCI_LM_MASTER;
+	conn->sec_level = BT_SECURITY_LOW;
+
+	memset(&cp, 0, sizeof(cp));	/* fields not set below stay zero */
+	cp.scan_interval = cpu_to_le16(0x0004);
+	cp.scan_window = cpu_to_le16(0x0004);
+	bacpy(&cp.peer_addr, &conn->dst);
+	cp.peer_addr_type = conn->dst_type;
+	cp.conn_interval_min = cpu_to_le16(0x0008);
+	cp.conn_interval_max = cpu_to_le16(0x0100);
+	cp.supervision_timeout = cpu_to_le16(0x0064);
+	cp.min_ce_len = cpu_to_le16(0x0001);
+	cp.max_ce_len = cpu_to_le16(0x0001);
+
+	hci_send_cmd(hdev, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+}
+
+static void hci_le_connect_cancel(struct hci_conn *conn)
+{
+	hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL);	/* command has no parameters */
+}
+
+void hci_acl_connect(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct inquiry_entry *ie;
+	struct hci_cp_create_conn cp;
+
+	BT_DBG("%p", conn);
+	/* Mark @conn as an outgoing connection attempt and send Create Connection. */
+	conn->state = BT_CONNECT;
+	conn->out = 1;
+
+	conn->link_mode = HCI_LM_MASTER;
+
+	conn->attempt++;
+
+	conn->link_policy = hdev->link_policy;
+
+	memset(&cp, 0, sizeof(cp));
+	bacpy(&cp.bdaddr, &conn->dst);
+	cp.pscan_rep_mode = 0x02;	/* default, replaced below by a fresh inquiry-cache value */
+
+	ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+	if (ie) {
+		if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {	/* page-scan data only from a recent entry */
+			cp.pscan_rep_mode = ie->data.pscan_rep_mode;
+			cp.pscan_mode     = ie->data.pscan_mode;
+			cp.clock_offset   = ie->data.clock_offset |
+							cpu_to_le16(0x8000);
+		}
+		/* Device class and SSP mode are taken from the entry whatever its age. */
+		memcpy(conn->dev_class, ie->data.dev_class, 3);
+		conn->ssp_mode = ie->data.ssp_mode;
+	}
+
+	cp.pkt_type = cpu_to_le16(conn->pkt_type);
+	if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
+		cp.role_switch = 0x01;
+	else
+		cp.role_switch = 0x00;
+
+	hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp);
+}
+
+static void hci_acl_connect_cancel(struct hci_conn *conn)
+{
+	struct hci_cp_create_conn_cancel cp;
+
+	BT_DBG("%p", conn);
+	/* NOTE(review): hci_ver below 2 presumably lacks Create Connection Cancel - confirm. */
+	if (conn->hdev->hci_ver < 2)
+		return;
+
+	bacpy(&cp.bdaddr, &conn->dst);
+	hci_send_cmd(conn->hdev, HCI_OP_CREATE_CONN_CANCEL, sizeof(cp), &cp);
+}
+
+void hci_acl_disconn(struct hci_conn *conn, __u8 reason)
+{
+	struct hci_cp_disconnect cp;
+
+	BT_DBG("%p", conn);
+	/* Move @conn to BT_DISCONN and send Disconnect with @reason. */
+	conn->state = BT_DISCONN;
+
+	cp.handle = cpu_to_le16(conn->handle);
+	cp.reason = reason;
+	hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp);
+}
+
+void hci_add_sco(struct hci_conn *conn, __u16 handle)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_add_sco cp;
+
+	BT_DBG("%p", conn);
+	/* Start an outgoing SCO attempt on @conn using Add SCO Connection. */
+	conn->state = BT_CONNECT;
+	conn->out = 1;
+
+	conn->attempt++;
+
+	cp.handle   = cpu_to_le16(handle);	/* @handle is supplied by the caller, not conn->handle */
+	cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+	hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp);
+}
+
+void hci_setup_sync(struct hci_conn *conn, __u16 handle)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_setup_sync_conn cp;
+
+	BT_DBG("%p", conn);
+	/* Start an outgoing attempt on @conn using Setup Synchronous Connection. */
+	conn->state = BT_CONNECT;
+	conn->out = 1;
+
+	conn->attempt++;
+
+	cp.handle   = cpu_to_le16(handle);	/* @handle is supplied by the caller, not conn->handle */
+	cp.pkt_type = cpu_to_le16(conn->pkt_type);
+
+	cp.tx_bandwidth   = cpu_to_le32(0x00001f40);	/* 0x1f40 == 8000 */
+	cp.rx_bandwidth   = cpu_to_le32(0x00001f40);	/* 0x1f40 == 8000 */
+	cp.max_latency    = cpu_to_le16(0xffff);
+	cp.voice_setting  = cpu_to_le16(hdev->voice_setting);
+	cp.retrans_effort = 0xff;
+
+	hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp);
+}
+
+void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max,
+					u16 latency, u16 to_multiplier)
+{
+	struct hci_cp_le_conn_update cp;
+	struct hci_dev *hdev = conn->hdev;
+	/* Send LE Connection Update for @conn with the caller's parameters. */
+	memset(&cp, 0, sizeof(cp));
+
+	cp.handle		= cpu_to_le16(conn->handle);
+	cp.conn_interval_min	= cpu_to_le16(min);
+	cp.conn_interval_max	= cpu_to_le16(max);
+	cp.conn_latency		= cpu_to_le16(latency);
+	cp.supervision_timeout	= cpu_to_le16(to_multiplier);
+	cp.min_ce_len		= cpu_to_le16(0x0001);	/* fixed value, not taken from the caller */
+	cp.max_ce_len		= cpu_to_le16(0x0001);	/* fixed value, not taken from the caller */
+
+	hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp);
+}
+EXPORT_SYMBOL(hci_le_conn_update);
+
+void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __u8 rand[8],
+							__u8 ltk[16])
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_start_enc cp;
+
+	BT_DBG("%p", conn);
+	/* Send LE Start Encryption for @conn with the given EDIV, Rand and LTK. */
+	memset(&cp, 0, sizeof(cp));
+
+	cp.handle = cpu_to_le16(conn->handle);
+	memcpy(cp.ltk, ltk, sizeof(cp.ltk));
+	cp.ediv = ediv;	/* already little-endian (__le16) */
+	/* @rand is a pointer here despite the [8], so size the copy by the field. */
+	memcpy(cp.rand, rand, sizeof(cp.rand));
+	hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp);
+}
+EXPORT_SYMBOL(hci_le_start_enc);
+
+void hci_le_ltk_reply(struct hci_conn *conn, u8 ltk[16])
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_ltk_reply cp;
+
+	BT_DBG("%p", conn);
+	/* Answer a Long Term Key request on @conn with the key in @ltk. */
+	memset(&cp, 0, sizeof(cp));
+
+	cp.handle = cpu_to_le16(conn->handle);
+	/* @ltk is a pointer here despite the [16], so size the copy by the field. */
+	memcpy(cp.ltk, ltk, sizeof(cp.ltk));
+	hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp);
+}
+EXPORT_SYMBOL(hci_le_ltk_reply);
+
+void hci_le_ltk_neg_reply(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_ltk_neg_reply cp;
+
+	BT_DBG("%p", conn);
+	/* Send the negative Long Term Key reply for @conn; it carries only the handle. */
+	memset(&cp, 0, sizeof(cp));
+
+	cp.handle = cpu_to_le16(conn->handle);
+
+	hci_send_cmd(hdev, HCI_OP_LE_LTK_NEG_REPLY, sizeof(cp), &cp);
+}
+
+/* Device _must_ be locked */
+void hci_sco_setup(struct hci_conn *conn, __u8 status)
+{
+	struct hci_conn *link = conn->link;
+
+	BT_DBG("%p", conn);
+
+	if (!link)
+		return;	/* nothing is linked to this connection */
+	if (status) {
+		/* Non-zero status: confirm the failure upwards and delete the link. */
+		hci_proto_connect_cfm(link, status);
+		hci_conn_del(link);
+		return;
+	}
+	if (lmp_esco_capable(conn->hdev))
+		hci_setup_sync(link, conn->handle);
+	else
+		hci_add_sco(link, conn->handle);
+}
+
+static void hci_conn_timeout(unsigned long arg)
+{
+	struct hci_conn *conn = (void *) arg;	/* timer argument is the connection itself */
+	struct hci_dev *hdev = conn->hdev;
+	__u8 reason;
+
+	BT_DBG("conn %p state %d", conn, conn->state);
+	/* Disconnect-timer handler: does nothing while the connection is still referenced. */
+	if (atomic_read(&conn->refcnt))
+		return;
+
+	hci_dev_lock(hdev);
+
+	switch (conn->state) {
+	case BT_CONNECT:
+	case BT_CONNECT2:
+		if (conn->out) {	/* only outgoing attempts are cancelled */
+			if (conn->type == ACL_LINK)
+				hci_acl_connect_cancel(conn);
+			else if (conn->type == LE_LINK)
+				hci_le_connect_cancel(conn);
+		}
+		break;
+	case BT_CONFIG:
+	case BT_CONNECTED:
+		reason = hci_proto_disconn_ind(conn);	/* the upper protocol supplies the reason code */
+		hci_acl_disconn(conn, reason);
+		break;
+	default:
+		conn->state = BT_CLOSED;
+		break;
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_conn_idle(unsigned long arg)
+{
+	struct hci_conn *conn = (void *) arg;	/* timer argument is the connection itself */
+
+	BT_DBG("conn %p mode %d", conn, conn->mode);
+	/* Idle-timer handler: ask for sniff mode on the connection. */
+	hci_conn_enter_sniff_mode(conn);
+}
+
+static void hci_conn_auto_accept(unsigned long arg)
+{
+	struct hci_conn *conn = (void *) arg;	/* timer argument is the connection itself */
+	struct hci_dev *hdev = conn->hdev;
+
+	hci_dev_lock(hdev);
+	/* Auto-accept timer handler: send User Confirmation Reply carrying the peer address. */
+	hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst),
+								&conn->dst);
+
+	hci_dev_unlock(hdev);
+}
+
+struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type,
+					__u16 pkt_type, bdaddr_t *dst)
+{
+	struct hci_conn *conn;
+
+	BT_DBG("%s dst %s", hdev->name, batostr(dst));
+	/* Allocate a connection of @type to @dst and add it to @hdev; NULL if allocation fails. */
+	conn = kzalloc(sizeof(struct hci_conn), GFP_ATOMIC);
+	if (!conn)
+		return NULL;
+
+	bacpy(&conn->dst, dst);
+	conn->hdev  = hdev;
+	conn->type  = type;
+	conn->mode  = HCI_CM_ACTIVE;
+	conn->state = BT_OPEN;
+	conn->auth_type = HCI_AT_GENERAL_BONDING;
+	conn->io_capability = hdev->io_capability;
+	conn->remote_auth = 0xff;
+	conn->key_type = 0xff;	/* hci_conn_auth() compares key_type against 0xff */
+
+	conn->power_save = 1;
+	conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+	switch (type) {	/* types not listed here keep the kzalloc()-zeroed pkt_type */
+	case ACL_LINK:
+		conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
+		break;
+	case SCO_LINK:
+		if (!pkt_type)
+			pkt_type = SCO_ESCO_MASK;	/* no break: continues into the ESCO_LINK case */
+	case ESCO_LINK:
+		if (!pkt_type)
+			pkt_type = ALL_ESCO_MASK;
+		if (lmp_esco_capable(hdev)) {
+			/* HCI Setup Synchronous Connection Command uses
+			   reverse logic on the EDR_ESCO_MASK bits */
+			conn->pkt_type = (pkt_type ^ EDR_ESCO_MASK) &
+					hdev->esco_type;
+		} else {
+			/* Legacy HCI Add Sco Connection Command uses a
+			   shifted bitmask */
+			conn->pkt_type = (pkt_type << 5) & hdev->pkt_type &
+					SCO_PTYPE_MASK;
+		}
+		break;
+	}
+
+	skb_queue_head_init(&conn->data_q);
+	/* Three timers: disconnect timeout, idle, and auto-accept. */
+	setup_timer(&conn->disc_timer, hci_conn_timeout, (unsigned long)conn);
+	setup_timer(&conn->idle_timer, hci_conn_idle, (unsigned long)conn);
+	setup_timer(&conn->auto_accept_timer, hci_conn_auto_accept,
+							(unsigned long) conn);
+
+	atomic_set(&conn->refcnt, 0);	/* the new connection starts with no holders */
+
+	hci_dev_hold(hdev);	/* the connection keeps its device referenced */
+
+	tasklet_disable(&hdev->tx_task);	/* TX tasklet stays disabled until tasklet_enable() below */
+
+	hci_conn_hash_add(hdev, conn);
+	if (hdev->notify)
+		hdev->notify(hdev, HCI_NOTIFY_CONN_ADD);
+
+	atomic_set(&conn->devref, 0);
+
+	hci_conn_init_sysfs(conn);
+
+	tasklet_enable(&hdev->tx_task);
+
+	return conn;
+}
+
+int hci_conn_del(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+
+	BT_DBG("%s conn %p handle %d", hdev->name, conn, conn->handle);
+	/* Tear @conn down: stop its timers, unlink it from its peer and from @hdev. Returns 0. */
+	del_timer(&conn->idle_timer);
+
+	del_timer(&conn->disc_timer);
+
+	del_timer(&conn->auto_accept_timer);
+
+	if (conn->type == ACL_LINK) {
+		struct hci_conn *sco = conn->link;
+		if (sco)
+			sco->link = NULL;
+
+		/* Unacked frames */
+		hdev->acl_cnt += conn->sent;
+	} else if (conn->type == LE_LINK) {
+		if (hdev->le_pkts)	/* credit LE counter if le_pkts is set, else the ACL counter */
+			hdev->le_cnt += conn->sent;
+		else
+			hdev->acl_cnt += conn->sent;
+	} else {	/* remaining link types: detach from the linked ACL and put it */
+		struct hci_conn *acl = conn->link;
+		if (acl) {
+			acl->link = NULL;
+			hci_conn_put(acl);
+		}
+	}
+
+	tasklet_disable(&hdev->tx_task);	/* TX tasklet stays disabled while the hash is changed */
+
+	hci_conn_hash_del(hdev, conn);
+	if (hdev->notify)
+		hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
+
+	tasklet_enable(&hdev->tx_task);
+
+	skb_queue_purge(&conn->data_q);
+
+	hci_conn_put_device(conn);
+
+	hci_dev_put(hdev);
+	/* NOTE(review): freed here only when handle is 0; other cases are presumably freed elsewhere - confirm. */
+	if (conn->handle == 0)
+		kfree(conn);
+
+	return 0;
+}
+
+struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src)
+{
+	const int have_src = bacmp(src, BDADDR_ANY) != 0;
+	struct hci_dev *found = NULL;
+	struct hci_dev *d;
+
+	BT_DBG("%s -> %s", batostr(src), batostr(dst));
+
+	read_lock_bh(&hci_dev_list_lock);
+
+	list_for_each_entry(d, &hci_dev_list, list) {
+		int match;
+
+		/* Devices that are down or in raw mode are never chosen. */
+		if (!test_bit(HCI_UP, &d->flags) || test_bit(HCI_RAW, &d->flags))
+			continue;
+
+		/* Simple routing:
+		 *   No source address - first interface with bdaddr != dst
+		 *   Source address    - first interface with bdaddr == src
+		 */
+		if (have_src)
+			match = !bacmp(&d->bdaddr, src);
+		else
+			match = bacmp(&d->bdaddr, dst) != 0;
+		if (match) {
+			found = d;
+			break;
+		}
+	}
+
+	/* The device is returned held; the hold is taken under the list lock. */
+	if (found)
+		found = hci_dev_hold(found);
+
+	read_unlock_bh(&hci_dev_list_lock);
+	return found;
+}
+EXPORT_SYMBOL(hci_get_route);
+
+/* Create SCO, ACL or LE connection.
+ * Device _must_ be locked */
+struct hci_conn *hci_connect(struct hci_dev *hdev, int type,
+					__u16 pkt_type, bdaddr_t *dst,
+					__u8 sec_level, __u8 auth_type)
+{
+	struct hci_conn *acl;
+	struct hci_conn *sco;
+	struct hci_conn *le;
+
+	BT_DBG("%s dst %s", hdev->name, batostr(dst));
+	/* NOTE(review): LE failures return ERR_PTR(), ACL/SCO failures return NULL - confirm callers handle both. */
+	if (type == LE_LINK) {
+		struct adv_entry *entry;
+
+		le = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst);
+		if (le)	/* an LE connection to this address already exists */
+			return ERR_PTR(-EBUSY);
+
+		entry = hci_find_adv_entry(hdev, dst);
+		if (!entry)	/* no advertising entry recorded for this address */
+			return ERR_PTR(-EHOSTUNREACH);
+
+		le = hci_conn_add(hdev, LE_LINK, 0, dst);
+		if (!le)
+			return ERR_PTR(-ENOMEM);
+
+		le->dst_type = entry->bdaddr_type;
+
+		hci_le_connect(le);
+
+		hci_conn_hold(le);
+
+		return le;
+	}
+	/* ACL and SCO/eSCO requests both start from an ACL link: reuse or create it. */
+	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
+	if (!acl) {
+		acl = hci_conn_add(hdev, ACL_LINK, 0, dst);
+		if (!acl)
+			return NULL;
+	}
+
+	hci_conn_hold(acl);
+
+	if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {	/* not yet connecting: start now */
+		acl->sec_level = BT_SECURITY_LOW;
+		acl->pending_sec_level = sec_level;
+		acl->auth_type = auth_type;
+		hci_acl_connect(acl);
+	}
+
+	if (type == ACL_LINK)
+		return acl;
+	/* Synchronous link requested: reuse or create it and pair it with the ACL. */
+	sco = hci_conn_hash_lookup_ba(hdev, type, dst);
+	if (!sco) {
+		sco = hci_conn_add(hdev, type, pkt_type, dst);
+		if (!sco) {
+			hci_conn_put(acl);	/* undo the hold taken above */
+			return NULL;
+		}
+	}
+
+	acl->link = sco;
+	sco->link = acl;
+
+	hci_conn_hold(sco);
+	/* If the ACL is not connected yet, nothing more is sent from here. */
+	if (acl->state == BT_CONNECTED &&
+			(sco->state == BT_OPEN || sco->state == BT_CLOSED)) {
+		acl->power_save = 1;
+		hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON);
+
+		if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->pend)) {
+			/* defer SCO setup until mode change completed */
+			set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->pend);
+			return sco;
+		}
+
+		hci_sco_setup(acl, 0x00);
+	}
+
+	return sco;
+}
+EXPORT_SYMBOL(hci_connect);
+
+/* Check link security requirement */
+int hci_conn_check_link_mode(struct hci_conn *conn)
+{
+	int both_ssp, encrypted;
+
+	BT_DBG("conn %p", conn);
+	/* Fails (0) only when both sides have SSP on yet the link is unencrypted. */
+	both_ssp = conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0;
+	encrypted = (conn->link_mode & HCI_LM_ENCRYPT) != 0;
+	return !(both_ssp && !encrypted);
+}
+EXPORT_SYMBOL(hci_conn_check_link_mode);
+
+/* Authenticate remote device */
+static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
+{
+	BT_DBG("conn %p", conn);
+	/* Returns 1 when the link is already authenticated at a sufficient level, else 0. */
+	if (conn->pending_sec_level > sec_level)	/* never go below an already pending level */
+		sec_level = conn->pending_sec_level;
+
+	if (sec_level > conn->sec_level)
+		conn->pending_sec_level = sec_level;
+	else if (conn->link_mode & HCI_LM_AUTH)
+		return 1;
+
+	/* Make sure we preserve an existing MITM requirement */
+	auth_type |= (conn->auth_type & 0x01);
+
+	conn->auth_type = auth_type;
+
+	if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {	/* skip if a request is already pending */
+		struct hci_cp_auth_requested cp;
+
+		/* encrypt must be pending if auth is also pending */
+		set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend);
+
+		cp.handle = cpu_to_le16(conn->handle);
+		hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
+							sizeof(cp), &cp);
+		if (conn->key_type != 0xff)	/* 0xff is the value hci_conn_add() initialises key_type to */
+			set_bit(HCI_CONN_REAUTH_PEND, &conn->pend);
+	}
+
+	return 0;
+}
+
+/* Encrypt the link.
+ *
+ * Sends HCI_OP_SET_CONN_ENCRYPT (encrypt = 0x01) unless
+ * HCI_CONN_ENCRYPT_PEND was already set on the connection. */
+static void hci_conn_encrypt(struct hci_conn *conn)
+{
+	struct hci_cp_set_conn_encrypt cp;
+
+	BT_DBG("conn %p", conn);
+
+	if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
+		return;
+
+	cp.handle  = cpu_to_le16(conn->handle);
+	cp.encrypt = 0x01;
+	hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp);
+}
+
+/* Enable security
+ *
+ * Returns 1 when the link already satisfies the requested sec_level and
+ * nothing had to be started. Returns 0 when authentication and/or
+ * encryption is still pending or has just been requested. */
+int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
+{
+	BT_DBG("conn %p", conn);
+
+	/* For sdp we don't need the link key. */
+	if (sec_level == BT_SECURITY_SDP)
+		return 1;
+
+	/* For non 2.1 devices and low security level we don't need the link
+	   key. */
+	if (sec_level == BT_SECURITY_LOW &&
+				(!conn->ssp_mode || !conn->hdev->ssp_mode))
+		return 1;
+
+	/* For other security levels we need the link key. */
+	if (!(conn->link_mode & HCI_LM_AUTH))
+		goto auth;
+
+	/* An authenticated combination key has sufficient security for any
+	   security level. */
+	if (conn->key_type == HCI_LK_AUTH_COMBINATION)
+		goto encrypt;
+
+	/* An unauthenticated combination key has sufficient security for
+	   security level 1 and 2. */
+	if (conn->key_type == HCI_LK_UNAUTH_COMBINATION &&
+			(sec_level == BT_SECURITY_MEDIUM ||
+			sec_level == BT_SECURITY_LOW))
+		goto encrypt;
+
+	/* A combination key has always sufficient security for the security
+	   levels 1 or 2. High security level requires the combination key
+	   is generated using maximum PIN code length (16).
+	   For pre 2.1 units. */
+	if (conn->key_type == HCI_LK_COMBINATION &&
+			(sec_level != BT_SECURITY_HIGH ||
+			conn->pin_length == 16))
+		goto encrypt;
+
+	/* Reached when the link is not authenticated, or when none of the
+	   key-type checks above accepted the current key. */
+auth:
+	/* An encryption request is already in flight; wait for it. */
+	if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
+		return 0;
+
+	/* hci_conn_auth() returns 0 when authentication is pending; only a
+	   non-zero result falls through to the encryption step. */
+	if (!hci_conn_auth(conn, sec_level, auth_type))
+		return 0;
+
+encrypt:
+	if (conn->link_mode & HCI_LM_ENCRYPT)
+		return 1;
+
+	hci_conn_encrypt(conn);
+	return 0;
+}
+EXPORT_SYMBOL(hci_conn_security);
+
+/* Check secure link requirement.
+ *
+ * Returns 0 (reject) only when BT_SECURITY_HIGH is requested and the
+ * connection's current sec_level is not BT_SECURITY_HIGH; 1 otherwise. */
+int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level)
+{
+	BT_DBG("conn %p", conn);
+
+	if (sec_level == BT_SECURITY_HIGH &&
+			conn->sec_level != BT_SECURITY_HIGH)
+		return 0; /* Reject not secure link */
+
+	return 1; /* Non-secure is acceptable, or link is already secure */
+}
+EXPORT_SYMBOL(hci_conn_check_secure);
+
+/* Change link key.
+ *
+ * Sends HCI_OP_CHANGE_CONN_LINK_KEY unless HCI_CONN_AUTH_PEND was already
+ * set. Always returns 0. */
+int hci_conn_change_link_key(struct hci_conn *conn)
+{
+	struct hci_cp_change_conn_link_key cp;
+
+	BT_DBG("conn %p", conn);
+
+	if (test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend))
+		return 0;
+
+	cp.handle = cpu_to_le16(conn->handle);
+	hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_conn_change_link_key);
+
+/* Switch role.
+ *
+ * Returns 1 when role is 0 and the link already has HCI_LM_MASTER set.
+ * Otherwise sends HCI_OP_SWITCH_ROLE (unless HCI_CONN_RSWITCH_PEND was
+ * already set) and returns 0. */
+int hci_conn_switch_role(struct hci_conn *conn, __u8 role)
+{
+	struct hci_cp_switch_role cp;
+
+	BT_DBG("conn %p", conn);
+
+	if (role == 0 && (conn->link_mode & HCI_LM_MASTER))
+		return 1;
+
+	if (test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->pend))
+		return 0;
+
+	bacpy(&cp.bdaddr, &conn->dst);
+	cp.role = role;
+	hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_conn_switch_role);
+
+/* Enter active mode.
+ *
+ * Does nothing on HCI_RAW devices. If the connection is in sniff mode and
+ * either power_save is set or force_active is non-zero, an exit-sniff
+ * command is sent (once, guarded by HCI_CONN_MODE_CHANGE_PEND). In all
+ * non-raw cases the idle timer is re-armed when idle_timeout is positive. */
+void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_exit_sniff_mode cp;
+
+	BT_DBG("conn %p mode %d", conn, conn->mode);
+
+	if (test_bit(HCI_RAW, &hdev->flags))
+		return;
+
+	if (conn->mode == HCI_CM_SNIFF &&
+			(conn->power_save || force_active) &&
+			!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend)) {
+		cp.handle = cpu_to_le16(conn->handle);
+		hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp);
+	}
+
+	if (hdev->idle_timeout <= 0)
+		return;
+
+	mod_timer(&conn->idle_timer,
+			jiffies + msecs_to_jiffies(hdev->idle_timeout));
+}
+
+/* Enter sniff mode.
+ *
+ * Bails out on HCI_RAW devices, when either side lacks sniff capability,
+ * when the connection is not in active mode, or when the link policy does
+ * not allow sniff. Otherwise optionally sends a sniff-subrate command
+ * (both sides capable) followed by the sniff-mode command, the latter
+ * guarded by HCI_CONN_MODE_CHANGE_PEND. */
+void hci_conn_enter_sniff_mode(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_sniff_subrate subrate;
+	struct hci_cp_sniff_mode sniff;
+
+	BT_DBG("conn %p mode %d", conn, conn->mode);
+
+	if (test_bit(HCI_RAW, &hdev->flags))
+		return;
+
+	if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn))
+		return;
+
+	if (conn->mode != HCI_CM_ACTIVE)
+		return;
+
+	if (!(conn->link_policy & HCI_LP_SNIFF))
+		return;
+
+	if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) {
+		subrate.handle             = cpu_to_le16(conn->handle);
+		subrate.max_latency        = cpu_to_le16(0);
+		subrate.min_remote_timeout = cpu_to_le16(0);
+		subrate.min_local_timeout  = cpu_to_le16(0);
+		hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(subrate),
+								&subrate);
+	}
+
+	if (test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend))
+		return;
+
+	sniff.handle       = cpu_to_le16(conn->handle);
+	sniff.max_interval = cpu_to_le16(hdev->sniff_max_interval);
+	sniff.min_interval = cpu_to_le16(hdev->sniff_min_interval);
+	sniff.attempt      = cpu_to_le16(4);
+	sniff.timeout      = cpu_to_le16(1);
+	hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(sniff), &sniff);
+}
+
+/* Drop all connections on the device.
+ *
+ * Each connection is marked BT_CLOSED, reported through
+ * hci_proto_disconn_cfm() with reason 0x16 and then deleted. The "safe"
+ * iterator is required because hci_conn_del() is called on the current
+ * entry while walking the list. */
+void hci_conn_hash_flush(struct hci_dev *hdev)
+{
+	struct hci_conn *c, *tmp;
+
+	BT_DBG("hdev %s", hdev->name);
+
+	list_for_each_entry_safe(c, tmp, &hdev->conn_hash.list, list) {
+		c->state = BT_CLOSED;
+
+		hci_proto_disconn_cfm(c, 0x16);
+		hci_conn_del(c);
+	}
+}
+
+/* Check pending connect attempts.
+ *
+ * Under the device lock, looks up an ACL connection in state BT_CONNECT2
+ * and, if one exists, starts the ACL connect for it. */
+void hci_conn_check_pending(struct hci_dev *hdev)
+{
+	struct hci_conn *pending;
+
+	BT_DBG("hdev %s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	pending = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
+	if (pending != NULL)
+		hci_acl_connect(pending);
+
+	hci_dev_unlock(hdev);
+}
+
+/* Take one reference on the connection's devref counter. Paired with
+ * hci_conn_put_device(). */
+void hci_conn_hold_device(struct hci_conn *conn)
+{
+	atomic_inc(&conn->devref);
+}
+EXPORT_SYMBOL(hci_conn_hold_device);
+
+/* Drop one reference on the connection's devref counter; when the count
+ * reaches zero the connection's sysfs entry is removed. */
+void hci_conn_put_device(struct hci_conn *conn)
+{
+	if (atomic_dec_and_test(&conn->devref))
+		hci_conn_del_sysfs(conn);
+}
+EXPORT_SYMBOL(hci_conn_put_device);
+
+/* Copy a snapshot of a device's connection list to user space.
+ *
+ * @arg: user pointer to a struct hci_conn_list_req; on input dev_id and
+ *       conn_num (capacity of the trailing conn_info array) are read, on
+ *       output dev_id, conn_num (entries actually filled) and that many
+ *       conn_info records are written back.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL when
+ * conn_num is zero or exceeds the two-page limit, -ENOMEM on allocation
+ * failure and -ENODEV when the device index does not exist.
+ */
+int hci_get_conn_list(void __user *arg)
+{
+	struct hci_conn_list_req req, *cl;
+	struct hci_conn_info *ci;
+	struct hci_dev *hdev;
+	struct hci_conn *c;
+	int n = 0, size, err;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci))
+		return -EINVAL;
+
+	size = sizeof(req) + req.conn_num * sizeof(*ci);
+
+	/* Zero the buffer, as hci_get_dev_list() does: it is copied to user
+	 * space below, so no byte of it (including any structure padding)
+	 * may carry stale heap contents. */
+	cl = kzalloc(size, GFP_KERNEL);
+	if (!cl)
+		return -ENOMEM;
+
+	hdev = hci_dev_get(req.dev_id);
+	if (!hdev) {
+		kfree(cl);
+		return -ENODEV;
+	}
+
+	ci = cl->conn_info;
+
+	hci_dev_lock_bh(hdev);
+	list_for_each_entry(c, &hdev->conn_hash.list, list) {
+		bacpy(&ci[n].bdaddr, &c->dst);
+		ci[n].handle = c->handle;
+		ci[n].type  = c->type;
+		ci[n].out   = c->out;
+		ci[n].state = c->state;
+		ci[n].link_mode = c->link_mode;
+
+		/* Buffer figures come from the device, chosen by link type */
+		if (c->type == SCO_LINK) {
+			ci[n].mtu = hdev->sco_mtu;
+			ci[n].cnt = hdev->sco_cnt;
+			ci[n].pkts = hdev->sco_pkts;
+		} else {
+			ci[n].mtu = hdev->acl_mtu;
+			ci[n].cnt = hdev->acl_cnt;
+			ci[n].pkts = hdev->acl_pkts;
+		}
+
+		/* Stop once the caller-provided capacity is used up */
+		if (++n >= req.conn_num)
+			break;
+	}
+	hci_dev_unlock_bh(hdev);
+
+	cl->dev_id = hdev->id;
+	cl->conn_num = n;
+
+	/* Only the header plus the entries actually filled are returned */
+	size = sizeof(req) + n * sizeof(*ci);
+
+	hci_dev_put(hdev);
+
+	err = copy_to_user(arg, cl, size);
+	kfree(cl);
+
+	return err ? -EFAULT : 0;
+}
+
+/* Look up one connection by type and address and copy its info record to
+ * user space, directly after the request header in the caller's buffer.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy and -ENOENT when no
+ * matching connection exists. */
+int hci_get_conn_info(struct hci_dev *hdev, void __user *arg)
+{
+	struct hci_conn_info_req req;
+	struct hci_conn_info ci;
+	struct hci_conn *conn;
+	char __user *ptr = arg + sizeof(req);
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	hci_dev_lock_bh(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr);
+	if (!conn) {
+		hci_dev_unlock_bh(hdev);
+		return -ENOENT;
+	}
+
+	bacpy(&ci.bdaddr, &conn->dst);
+	ci.handle = conn->handle;
+	ci.type  = conn->type;
+	ci.out   = conn->out;
+	ci.state = conn->state;
+	ci.link_mode = conn->link_mode;
+
+	/* Buffer figures are selected by the requested link type */
+	if (req.type == SCO_LINK) {
+		ci.mtu = hdev->sco_mtu;
+		ci.cnt = hdev->sco_cnt;
+		ci.pkts = hdev->sco_pkts;
+	} else {
+		ci.mtu = hdev->acl_mtu;
+		ci.cnt = hdev->acl_cnt;
+		ci.pkts = hdev->acl_pkts;
+	}
+
+	hci_dev_unlock_bh(hdev);
+
+	if (copy_to_user(ptr, &ci, sizeof(ci)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Report the auth_type of the ACL connection to the given address by
+ * writing it into the type field of the caller's request structure.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy and -ENOENT when
+ * there is no ACL connection to that address. */
+int hci_get_auth_info(struct hci_dev *hdev, void __user *arg)
+{
+	struct hci_auth_info_req req;
+	struct hci_conn *conn;
+	int found = 0;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	hci_dev_lock_bh(hdev);
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr);
+	if (conn) {
+		req.type = conn->auth_type;
+		found = 1;
+	}
+	hci_dev_unlock_bh(hdev);
+
+	if (!found)
+		return -ENOENT;
+
+	if (copy_to_user(arg, &req, sizeof(req)))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
new file mode 100644
index 00000000..f38e633c
--- /dev/null
+++ b/net/bluetooth/hci_core.c
@@ -0,0 +1,2427 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI core. */
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/rfkill.h>
+#include <linux/timer.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#define AUTO_OFF_TIMEOUT 2000
+
+static void hci_cmd_task(unsigned long arg);
+static void hci_rx_task(unsigned long arg);
+static void hci_tx_task(unsigned long arg);
+
+static DEFINE_RWLOCK(hci_task_lock);
+
+/* HCI device list */
+LIST_HEAD(hci_dev_list);
+DEFINE_RWLOCK(hci_dev_list_lock);
+
+/* HCI callback list */
+LIST_HEAD(hci_cb_list);
+DEFINE_RWLOCK(hci_cb_list_lock);
+
+/* HCI protocols */
+#define HCI_MAX_PROTO	2
+struct hci_proto *hci_proto[HCI_MAX_PROTO];
+
+/* HCI notifiers list */
+static ATOMIC_NOTIFIER_HEAD(hci_notifier);
+
+/* ---- HCI notifications ---- */
+
+/* Add @nb to the HCI atomic notifier chain; returns the result of
+ * atomic_notifier_chain_register(). */
+int hci_register_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&hci_notifier, nb);
+}
+
+/* Remove @nb from the HCI atomic notifier chain; returns the result of
+ * atomic_notifier_chain_unregister(). */
+int hci_unregister_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&hci_notifier, nb);
+}
+
+/* Invoke every registered HCI notifier with @event, passing @hdev as the
+ * notifier data. */
+static void hci_notify(struct hci_dev *hdev, int event)
+{
+	atomic_notifier_call_chain(&hci_notifier, event, hdev);
+}
+
+/* ---- HCI requests ---- */
+
+/* Mark the pending synchronous request as done with @result and wake the
+ * waiter. While HCI_INIT is set, completions of any command other than
+ * hdev->init_last_cmd are ignored. */
+void hci_req_complete(struct hci_dev *hdev, __u16 cmd, int result)
+{
+	BT_DBG("%s command 0x%04x result 0x%2.2x", hdev->name, cmd, result);
+
+	/* During the init phase only the last init command completes the
+	 * request; everything else is skipped.
+	 */
+	if (test_bit(HCI_INIT, &hdev->flags) && hdev->init_last_cmd != cmd)
+		return;
+
+	if (hdev->req_status != HCI_REQ_PEND)
+		return;
+
+	hdev->req_result = result;
+	hdev->req_status = HCI_REQ_DONE;
+	wake_up_interruptible(&hdev->req_wait_q);
+}
+
+/* Cancel the pending synchronous request, if any: store @err as its
+ * result, mark it HCI_REQ_CANCELED and wake the waiter. */
+static void hci_req_cancel(struct hci_dev *hdev, int err)
+{
+	BT_DBG("%s err 0x%2.2x", hdev->name, err);
+
+	if (hdev->req_status != HCI_REQ_PEND)
+		return;
+
+	hdev->req_result = err;
+	hdev->req_status = HCI_REQ_CANCELED;
+	wake_up_interruptible(&hdev->req_wait_q);
+}
+
+/* Execute request and wait for completion.
+ *
+ * @req is called with (@hdev, @opt) to issue the request; the caller then
+ * sleeps for up to @timeout jiffies on hdev->req_wait_q.
+ *
+ * Returns -EINTR if a signal is pending after the sleep, otherwise a
+ * value derived from hdev->req_status: the negated bt_to_errno() of the
+ * result for HCI_REQ_DONE, the negated result for HCI_REQ_CANCELED, and
+ * -ETIMEDOUT for any other status.
+ *
+ * NOTE(review): callers in this file hold hci_req_lock() around this
+ * call; the function itself takes no lock. */
+static int __hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt),
+					unsigned long opt, __u32 timeout)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int err = 0;
+
+	BT_DBG("%s start", hdev->name);
+
+	hdev->req_status = HCI_REQ_PEND;
+
+	/* Queue ourselves and set the task state before issuing the request,
+	 * presumably so a wake-up that arrives while req() runs is not lost. */
+	add_wait_queue(&hdev->req_wait_q, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	req(hdev, opt);
+	schedule_timeout(timeout);
+
+	remove_wait_queue(&hdev->req_wait_q, &wait);
+
+	/* On a pending signal req_status/req_result are left untouched */
+	if (signal_pending(current))
+		return -EINTR;
+
+	switch (hdev->req_status) {
+	case HCI_REQ_DONE:
+		err = -bt_to_errno(hdev->req_result);
+		break;
+
+	case HCI_REQ_CANCELED:
+		err = -hdev->req_result;
+		break;
+
+	default:
+		/* Still HCI_REQ_PEND: nobody completed or cancelled it */
+		err = -ETIMEDOUT;
+		break;
+	}
+
+	hdev->req_status = hdev->req_result = 0;
+
+	BT_DBG("%s end: err %d", hdev->name, err);
+
+	return err;
+}
+
+/* Run a synchronous request on a device that is up.
+ *
+ * Returns -ENETDOWN when HCI_UP is not set; otherwise the result of
+ * __hci_request(), executed under the request lock. */
+static inline int hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt),
+					unsigned long opt, __u32 timeout)
+{
+	int err = -ENETDOWN;
+
+	if (test_bit(HCI_UP, &hdev->flags)) {
+		/* Serialize all requests */
+		hci_req_lock(hdev);
+		err = __hci_request(hdev, req, opt, timeout);
+		hci_req_unlock(hdev);
+	}
+
+	return err;
+}
+
+/* Request callback: flag the device as resetting and send HCI_OP_RESET.
+ * @opt is only used for the debug trace. */
+static void hci_reset_req(struct hci_dev *hdev, unsigned long opt)
+{
+	BT_DBG("%s %ld", hdev->name, opt);
+
+	/* Reset device */
+	set_bit(HCI_RESET, &hdev->flags);
+	hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL);
+}
+
+/* Request callback that queues the controller initialisation sequence:
+ * driver-supplied commands first, then the mandatory reads, then the
+ * optional settings. @opt is only used for the debug trace. */
+static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
+{
+	struct hci_cp_delete_stored_link_key del_keys;
+	struct sk_buff *skb;
+	__le16 ca_timeout;
+	__u8 evt_filter;
+
+	BT_DBG("%s %ld", hdev->name, opt);
+
+	/* Driver initialization */
+
+	/* Special commands: move each queued skb onto the command queue */
+	for (;;) {
+		skb = skb_dequeue(&hdev->driver_init);
+		if (!skb)
+			break;
+
+		bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
+		skb->dev = (void *) hdev;
+
+		skb_queue_tail(&hdev->cmd_q, skb);
+		tasklet_schedule(&hdev->cmd_task);
+	}
+	skb_queue_purge(&hdev->driver_init);
+
+	/* Mandatory initialization */
+
+	/* Reset, unless the controller is quirked not to */
+	if (!test_bit(HCI_QUIRK_NO_RESET, &hdev->quirks)) {
+		set_bit(HCI_RESET, &hdev->flags);
+		hci_send_cmd(hdev, HCI_OP_RESET, 0, NULL);
+	}
+
+	/* Read Local Supported Features */
+	hci_send_cmd(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL);
+
+	/* Read Local Version */
+	hci_send_cmd(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
+
+	/* Read Buffer Size (ACL mtu, max pkt, etc.) */
+	hci_send_cmd(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL);
+
+#if 0
+	/* Host buffer size */
+	{
+		struct hci_cp_host_buffer_size cp;
+		cp.acl_mtu = cpu_to_le16(HCI_MAX_ACL_SIZE);
+		cp.sco_mtu = HCI_MAX_SCO_SIZE;
+		cp.acl_max_pkt = cpu_to_le16(0xffff);
+		cp.sco_max_pkt = cpu_to_le16(0xffff);
+		hci_send_cmd(hdev, HCI_OP_HOST_BUFFER_SIZE, sizeof(cp), &cp);
+	}
+#endif
+
+	/* Read BD Address */
+	hci_send_cmd(hdev, HCI_OP_READ_BD_ADDR, 0, NULL);
+
+	/* Read Class of Device */
+	hci_send_cmd(hdev, HCI_OP_READ_CLASS_OF_DEV, 0, NULL);
+
+	/* Read Local Name */
+	hci_send_cmd(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL);
+
+	/* Read Voice Setting */
+	hci_send_cmd(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL);
+
+	/* Optional initialization */
+
+	/* Clear Event Filters */
+	evt_filter = HCI_FLT_CLEAR_ALL;
+	hci_send_cmd(hdev, HCI_OP_SET_EVENT_FLT, sizeof(evt_filter),
+								&evt_filter);
+
+	/* Connection accept timeout ~20 secs */
+	ca_timeout = cpu_to_le16(0x7d00);
+	hci_send_cmd(hdev, HCI_OP_WRITE_CA_TIMEOUT, sizeof(ca_timeout),
+								&ca_timeout);
+
+	/* Delete all link keys stored in the controller */
+	bacpy(&del_keys.bdaddr, BDADDR_ANY);
+	del_keys.delete_all = 1;
+	hci_send_cmd(hdev, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(del_keys),
+								&del_keys);
+}
+
+/* Request callback for LE initialisation: currently only reads the LE
+ * buffer size. @opt is unused. */
+static void hci_le_init_req(struct hci_dev *hdev, unsigned long opt)
+{
+	BT_DBG("%s", hdev->name);
+
+	/* Read LE buffer size */
+	hci_send_cmd(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL);
+}
+
+/* Request callback: write the scan-enable setting. Only the low byte of
+ * @opt is used. */
+static void hci_scan_req(struct hci_dev *hdev, unsigned long opt)
+{
+	__u8 scan_mode = (__u8) opt;
+
+	BT_DBG("%s %x", hdev->name, scan_mode);
+
+	/* Inquiry and Page scans */
+	hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, sizeof(scan_mode),
+								&scan_mode);
+}
+
+/* Request callback: write the authentication-enable setting. Only the
+ * low byte of @opt is used. */
+static void hci_auth_req(struct hci_dev *hdev, unsigned long opt)
+{
+	__u8 auth_enable = (__u8) opt;
+
+	BT_DBG("%s %x", hdev->name, auth_enable);
+
+	/* Authentication */
+	hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(auth_enable),
+								&auth_enable);
+}
+
+/* Request callback: write the encryption mode. Only the low byte of
+ * @opt is used. */
+static void hci_encrypt_req(struct hci_dev *hdev, unsigned long opt)
+{
+	__u8 encrypt_mode = (__u8) opt;
+
+	BT_DBG("%s %x", hdev->name, encrypt_mode);
+
+	/* Encryption */
+	hci_send_cmd(hdev, HCI_OP_WRITE_ENCRYPT_MODE, sizeof(encrypt_mode),
+								&encrypt_mode);
+}
+
+/* Request callback: write the default link policy. The low 16 bits of
+ * @opt are sent to the controller in little-endian byte order. */
+static void hci_linkpol_req(struct hci_dev *hdev, unsigned long opt)
+{
+	__le16 policy = cpu_to_le16(opt);
+
+	/* Trace the host-order value; printing the __le16 would show a
+	 * byte-swapped number on big-endian hosts. */
+	BT_DBG("%s %x", hdev->name, (__u16) opt);
+
+	/* Default link policy */
+	hci_send_cmd(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy);
+}
+
+/* Get HCI device by index.
+ * Returns NULL for a negative or unknown index.
+ * Device is held on return. */
+struct hci_dev *hci_dev_get(int index)
+{
+	struct hci_dev *d, *found = NULL;
+
+	BT_DBG("%d", index);
+
+	if (index < 0)
+		return NULL;
+
+	read_lock(&hci_dev_list_lock);
+	list_for_each_entry(d, &hci_dev_list, list) {
+		if (d->id != index)
+			continue;
+
+		found = hci_dev_hold(d);
+		break;
+	}
+	read_unlock(&hci_dev_list_lock);
+
+	return found;
+}
+
+/* ---- Inquiry support ---- */
+/* Empty the inquiry cache: detach the singly linked list from the cache
+ * head, then free every entry. */
+static void inquiry_cache_flush(struct hci_dev *hdev)
+{
+	struct inquiry_cache *cache = &hdev->inq_cache;
+	struct inquiry_entry *e = cache->list;
+
+	BT_DBG("cache %p", cache);
+
+	cache->list = NULL;
+
+	while (e) {
+		struct inquiry_entry *following = e->next;
+
+		kfree(e);
+		e = following;
+	}
+}
+
+/* Find the inquiry cache entry for @bdaddr; returns NULL when the address
+ * is not cached. */
+struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct inquiry_cache *cache = &hdev->inq_cache;
+	struct inquiry_entry *e = cache->list;
+
+	BT_DBG("cache %p, %s", cache, batostr(bdaddr));
+
+	while (e) {
+		if (bacmp(&e->data.bdaddr, bdaddr) == 0)
+			return e;
+		e = e->next;
+	}
+
+	return NULL;
+}
+
+/* Insert or refresh the inquiry cache entry for data->bdaddr.
+ *
+ * A new entry is allocated with GFP_ATOMIC and pushed at the list head
+ * when the address is not cached yet; on allocation failure the update is
+ * silently dropped. Entry and cache timestamps are set to jiffies. */
+void hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data)
+{
+	struct inquiry_cache *cache = &hdev->inq_cache;
+	struct inquiry_entry *entry;
+
+	BT_DBG("cache %p, %s", cache, batostr(&data->bdaddr));
+
+	entry = hci_inquiry_cache_lookup(hdev, &data->bdaddr);
+	if (entry == NULL) {
+		/* Entry not in the cache. Add new one. */
+		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+		if (entry == NULL)
+			return;
+
+		entry->next = cache->list;
+		cache->list = entry;
+	}
+
+	memcpy(&entry->data, data, sizeof(*data));
+	entry->timestamp = jiffies;
+	cache->timestamp = jiffies;
+}
+
+/* Copy up to @num cached inquiry results into @buf, laid out as an array
+ * of struct inquiry_info. Returns the number of entries written. */
+static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf)
+{
+	struct inquiry_cache *cache = &hdev->inq_cache;
+	struct inquiry_info *out = (struct inquiry_info *) buf;
+	struct inquiry_entry *e = cache->list;
+	int copied = 0;
+
+	while (e && copied < num) {
+		struct inquiry_data *src = &e->data;
+		struct inquiry_info *dst = &out[copied];
+
+		bacpy(&dst->bdaddr, &src->bdaddr);
+		dst->pscan_rep_mode	= src->pscan_rep_mode;
+		dst->pscan_period_mode	= src->pscan_period_mode;
+		dst->pscan_mode		= src->pscan_mode;
+		memcpy(dst->dev_class, src->dev_class, 3);
+		dst->clock_offset	= src->clock_offset;
+
+		copied++;
+		e = e->next;
+	}
+
+	BT_DBG("cache %p, copied %d", cache, copied);
+	return copied;
+}
+
+/* Request callback: start an inquiry described by the struct
+ * hci_inquiry_req that @opt points to. Does nothing when HCI_INQUIRY is
+ * already set on the device. */
+static void hci_inq_req(struct hci_dev *hdev, unsigned long opt)
+{
+	struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt;
+	struct hci_cp_inquiry inq;
+
+	BT_DBG("%s", hdev->name);
+
+	if (test_bit(HCI_INQUIRY, &hdev->flags))
+		return;
+
+	/* Start Inquiry */
+	inq.num_rsp = ir->num_rsp;
+	inq.length  = ir->length;
+	memcpy(&inq.lap, &ir->lap, 3);
+
+	hci_send_cmd(hdev, HCI_OP_INQUIRY, sizeof(inq), &inq);
+}
+
+/* Inquiry ioctl helper.
+ *
+ * @arg points to a struct hci_inquiry_req followed by room for the
+ * results. A fresh inquiry is run when the cache is too old, empty, or
+ * the caller set IREQ_CACHE_FLUSH; afterwards the cache contents are
+ * copied back, with ir.num_rsp updated to the number of entries returned.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -ENODEV for an
+ * unknown device, -ENOMEM on allocation failure, or the negative error
+ * from hci_request(). */
+int hci_inquiry(void __user *arg)
+{
+	__u8 __user *ptr = arg;
+	struct hci_inquiry_req ir;
+	struct hci_dev *hdev;
+	int err = 0, do_inquiry = 0, max_rsp;
+	long timeo;
+	__u8 *buf;
+
+	if (copy_from_user(&ir, ptr, sizeof(ir)))
+		return -EFAULT;
+
+	hdev = hci_dev_get(ir.dev_id);
+	if (!hdev)
+		return -ENODEV;
+
+	hci_dev_lock_bh(hdev);
+	if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX ||
+				inquiry_cache_empty(hdev) ||
+				ir.flags & IREQ_CACHE_FLUSH) {
+		inquiry_cache_flush(hdev);
+		do_inquiry = 1;
+	}
+	hci_dev_unlock_bh(hdev);
+
+	/* Wait budget: 2000 ms worth of jiffies per unit of ir.length */
+	timeo = ir.length * msecs_to_jiffies(2000);
+
+	if (do_inquiry) {
+		err = hci_request(hdev, hci_inq_req, (unsigned long)&ir, timeo);
+		if (err < 0)
+			goto done;
+	}
+
+	/* for unlimited number of responses we will use buffer with 255 entries */
+	max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp;
+
+	/* cache_dump can't sleep. Therefore we allocate temp buffer and then
+	 * copy it to the user space.
+	 */
+	buf = kmalloc(sizeof(struct inquiry_info) * max_rsp, GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	hci_dev_lock_bh(hdev);
+	ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf);
+	hci_dev_unlock_bh(hdev);
+
+	BT_DBG("num_rsp %d", ir.num_rsp);
+
+	/* Header first, then only the entries that were actually dumped */
+	if (!copy_to_user(ptr, &ir, sizeof(ir))) {
+		ptr += sizeof(ir);
+		if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) *
+					ir.num_rsp))
+			err = -EFAULT;
+	} else
+		err = -EFAULT;
+
+	kfree(buf);
+
+done:
+	hci_dev_put(hdev);
+	return err;
+}
+
+/* ---- HCI ioctl helpers ---- */
+
+/* Bring an HCI device up.
+ *
+ * @dev: device index.
+ *
+ * Opens the driver, runs the init request sequence on non-raw devices
+ * and, on success, sets HCI_UP while keeping an extra reference on the
+ * device. On init failure all queues are purged, the driver is closed
+ * again and the device flags are cleared.
+ *
+ * Returns 0 on success, -ENODEV (unknown or unregistering device),
+ * -ERFKILL (blocked by rfkill), -EALREADY (already up), -EIO (driver open
+ * failed) or the negative error of the init request.
+ */
+int hci_dev_open(__u16 dev)
+{
+	struct hci_dev *hdev;
+	int ret = 0;
+
+	hdev = hci_dev_get(dev);
+	if (!hdev)
+		return -ENODEV;
+
+	BT_DBG("%s %p", hdev->name, hdev);
+
+	hci_req_lock(hdev);
+
+	if (test_bit(HCI_UNREGISTER, &hdev->flags)) {
+		ret = -ENODEV;
+		goto done;
+	}
+
+	if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) {
+		ret = -ERFKILL;
+		goto done;
+	}
+
+	if (test_bit(HCI_UP, &hdev->flags)) {
+		ret = -EALREADY;
+		goto done;
+	}
+
+	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+		set_bit(HCI_RAW, &hdev->flags);
+
+	/* Treat all non BR/EDR controllers as raw devices for now */
+	if (hdev->dev_type != HCI_BREDR)
+		set_bit(HCI_RAW, &hdev->flags);
+
+	if (hdev->open(hdev)) {
+		ret = -EIO;
+		goto done;
+	}
+
+	if (!test_bit(HCI_RAW, &hdev->flags)) {
+		atomic_set(&hdev->cmd_cnt, 1);
+		set_bit(HCI_INIT, &hdev->flags);
+		hdev->init_last_cmd = 0;
+
+		ret = __hci_request(hdev, hci_init_req, 0,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+
+		/* Run the LE init only after a successful base init, so that
+		 * a base-init failure cannot be overwritten by a later LE
+		 * success and leave a half-initialised device marked UP. */
+		if (!ret && lmp_host_le_capable(hdev))
+			ret = __hci_request(hdev, hci_le_init_req, 0,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+
+		clear_bit(HCI_INIT, &hdev->flags);
+	}
+
+	if (!ret) {
+		/* Extra reference kept for as long as the device is up */
+		hci_dev_hold(hdev);
+		set_bit(HCI_UP, &hdev->flags);
+		hci_notify(hdev, HCI_DEV_UP);
+		if (!test_bit(HCI_SETUP, &hdev->flags))
+			mgmt_powered(hdev->id, 1);
+	} else {
+		/* Init failed, cleanup */
+		tasklet_kill(&hdev->rx_task);
+		tasklet_kill(&hdev->tx_task);
+		tasklet_kill(&hdev->cmd_task);
+
+		skb_queue_purge(&hdev->cmd_q);
+		skb_queue_purge(&hdev->rx_q);
+
+		if (hdev->flush)
+			hdev->flush(hdev);
+
+		if (hdev->sent_cmd) {
+			kfree_skb(hdev->sent_cmd);
+			hdev->sent_cmd = NULL;
+		}
+
+		hdev->close(hdev);
+		hdev->flags = 0;
+	}
+
+done:
+	hci_req_unlock(hdev);
+	hci_dev_put(hdev);
+	return ret;
+}
+
+/* Take a device down.
+ *
+ * Cancels any pending synchronous request, then, under the request lock,
+ * stops the tasklets, flushes caches, connections and queues, resets the
+ * controller (non-raw devices only) and calls the driver's close hook.
+ * If the device was not up only the command timer is stopped.
+ * Always returns 0. */
+static int hci_dev_do_close(struct hci_dev *hdev)
+{
+	BT_DBG("%s %p", hdev->name, hdev);
+
+	/* Cancel before taking the lock: the request being cancelled is
+	 * presumably held by whoever currently owns the request lock. */
+	hci_req_cancel(hdev, ENODEV);
+	hci_req_lock(hdev);
+
+	if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
+		del_timer_sync(&hdev->cmd_timer);
+		hci_req_unlock(hdev);
+		return 0;
+	}
+
+	/* Kill RX and TX tasks */
+	tasklet_kill(&hdev->rx_task);
+	tasklet_kill(&hdev->tx_task);
+
+	hci_dev_lock_bh(hdev);
+	inquiry_cache_flush(hdev);
+	hci_conn_hash_flush(hdev);
+	hci_dev_unlock_bh(hdev);
+
+	hci_notify(hdev, HCI_DEV_DOWN);
+
+	if (hdev->flush)
+		hdev->flush(hdev);
+
+	/* Reset device */
+	skb_queue_purge(&hdev->cmd_q);
+	atomic_set(&hdev->cmd_cnt, 1);
+	if (!test_bit(HCI_RAW, &hdev->flags)) {
+		set_bit(HCI_INIT, &hdev->flags);
+		/* Result deliberately ignored; the device goes down anyway */
+		__hci_request(hdev, hci_reset_req, 0,
+					msecs_to_jiffies(250));
+		clear_bit(HCI_INIT, &hdev->flags);
+	}
+
+	/* Kill cmd task */
+	tasklet_kill(&hdev->cmd_task);
+
+	/* Drop queues */
+	skb_queue_purge(&hdev->rx_q);
+	skb_queue_purge(&hdev->cmd_q);
+	skb_queue_purge(&hdev->raw_q);
+
+	/* Drop last sent command */
+	if (hdev->sent_cmd) {
+		del_timer_sync(&hdev->cmd_timer);
+		kfree_skb(hdev->sent_cmd);
+		hdev->sent_cmd = NULL;
+	}
+
+	/* After this point our queues are empty
+	 * and no tasks are scheduled. */
+	hdev->close(hdev);
+
+	mgmt_powered(hdev->id, 0);
+
+	/* Clear flags */
+	hdev->flags = 0;
+
+	hci_req_unlock(hdev);
+
+	/* Presumably balances the hci_dev_hold() taken by hci_dev_open()
+	 * when the device came up. */
+	hci_dev_put(hdev);
+	return 0;
+}
+
+/* Take the device with index @dev down. Returns -ENODEV for an unknown
+ * index, otherwise the result of hci_dev_do_close(). */
+int hci_dev_close(__u16 dev)
+{
+	struct hci_dev *hdev = hci_dev_get(dev);
+	int err;
+
+	if (hdev == NULL)
+		return -ENODEV;
+
+	err = hci_dev_do_close(hdev);
+
+	/* Drop the lookup reference taken above */
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/* Reset a running device.
+ *
+ * With the TX tasklet disabled and the request lock held: purges the RX
+ * and command queues, flushes the inquiry cache and all connections,
+ * zeroes the flow-control counters and, on non-raw devices, issues an
+ * HCI reset. A device that is not up is left untouched.
+ *
+ * Returns -ENODEV for an unknown index, the result of the reset request
+ * on non-raw devices, and 0 otherwise. */
+int hci_dev_reset(__u16 dev)
+{
+	struct hci_dev *hdev;
+	int ret = 0;
+
+	hdev = hci_dev_get(dev);
+	if (!hdev)
+		return -ENODEV;
+
+	hci_req_lock(hdev);
+	tasklet_disable(&hdev->tx_task);
+
+	if (!test_bit(HCI_UP, &hdev->flags))
+		goto done;
+
+	/* Drop queues */
+	skb_queue_purge(&hdev->rx_q);
+	skb_queue_purge(&hdev->cmd_q);
+
+	hci_dev_lock_bh(hdev);
+	inquiry_cache_flush(hdev);
+	hci_conn_hash_flush(hdev);
+	hci_dev_unlock_bh(hdev);
+
+	if (hdev->flush)
+		hdev->flush(hdev);
+
+	/* One command credit; ACL/SCO/LE counters restart from zero */
+	atomic_set(&hdev->cmd_cnt, 1);
+	hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0;
+
+	if (!test_bit(HCI_RAW, &hdev->flags))
+		ret = __hci_request(hdev, hci_reset_req, 0,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+
+done:
+	tasklet_enable(&hdev->tx_task);
+	hci_req_unlock(hdev);
+	hci_dev_put(hdev);
+	return ret;
+}
+
+/* Zero the statistics of the device with index @dev. Returns -ENODEV for
+ * an unknown index, 0 otherwise. */
+int hci_dev_reset_stat(__u16 dev)
+{
+	struct hci_dev *hdev = hci_dev_get(dev);
+
+	if (hdev == NULL)
+		return -ENODEV;
+
+	memset(&hdev->stat, 0, sizeof(struct hci_dev_stats));
+
+	hci_dev_put(hdev);
+
+	return 0;
+}
+
+/* Handle the device-setting ioctls that carry a struct hci_dev_req.
+ *
+ * @cmd: one of the HCISET* ioctl numbers handled below.
+ * @arg: user pointer to a struct hci_dev_req (dev_id + dev_opt).
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -ENODEV for an
+ * unknown device, -EOPNOTSUPP when encryption is requested on a
+ * controller without encryption support, -EINVAL for an unknown @cmd, or
+ * the error returned by hci_request(). */
+int hci_dev_cmd(unsigned int cmd, void __user *arg)
+{
+	struct hci_dev *hdev;
+	struct hci_dev_req dr;
+	int err = 0;
+
+	if (copy_from_user(&dr, arg, sizeof(dr)))
+		return -EFAULT;
+
+	hdev = hci_dev_get(dr.dev_id);
+	if (!hdev)
+		return -ENODEV;
+
+	switch (cmd) {
+	case HCISETAUTH:
+		err = hci_request(hdev, hci_auth_req, dr.dev_opt,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+		break;
+
+	case HCISETENCRYPT:
+		if (!lmp_encrypt_capable(hdev)) {
+			err = -EOPNOTSUPP;
+			break;
+		}
+
+		if (!test_bit(HCI_AUTH, &hdev->flags)) {
+			/* Auth must be enabled first */
+			err = hci_request(hdev, hci_auth_req, dr.dev_opt,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+			if (err)
+				break;
+		}
+
+		err = hci_request(hdev, hci_encrypt_req, dr.dev_opt,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+		break;
+
+	case HCISETSCAN:
+		err = hci_request(hdev, hci_scan_req, dr.dev_opt,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+		break;
+
+	case HCISETLINKPOL:
+		err = hci_request(hdev, hci_linkpol_req, dr.dev_opt,
+					msecs_to_jiffies(HCI_INIT_TIMEOUT));
+		break;
+
+	case HCISETLINKMODE:
+		/* Only the MASTER and ACCEPT bits may be set from here */
+		hdev->link_mode = ((__u16) dr.dev_opt) &
+					(HCI_LM_MASTER | HCI_LM_ACCEPT);
+		break;
+
+	case HCISETPTYPE:
+		hdev->pkt_type = (__u16) dr.dev_opt;
+		break;
+
+	case HCISETACLMTU:
+		/* dev_opt is read as two 16-bit words in memory order:
+		 * word 1 is the MTU, word 0 the packet count. */
+		hdev->acl_mtu  = *((__u16 *) &dr.dev_opt + 1);
+		hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0);
+		break;
+
+	case HCISETSCOMTU:
+		/* Same word layout as HCISETACLMTU */
+		hdev->sco_mtu  = *((__u16 *) &dr.dev_opt + 1);
+		hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	hci_dev_put(hdev);
+	return err;
+}
+
+/* Copy the list of registered HCI devices (id and flags) to user space.
+ *
+ * The first 16-bit word at @arg gives the capacity of the caller's
+ * array. For every device visited the auto-off timer is cancelled and,
+ * unless HCI_MGMT is set, HCI_PAIRABLE is set.
+ *
+ * Returns 0 on success, -EFAULT on a failed user access, -EINVAL for a
+ * zero or oversized capacity and -ENOMEM on allocation failure. */
+int hci_get_dev_list(void __user *arg)
+{
+	struct hci_dev_list_req *dl;
+	struct hci_dev_req *dr;
+	struct hci_dev *hdev;
+	int n = 0, size, err;
+	__u16 dev_num;
+
+	if (get_user(dev_num, (__u16 __user *) arg))
+		return -EFAULT;
+
+	if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr))
+		return -EINVAL;
+
+	size = sizeof(*dl) + dev_num * sizeof(*dr);
+
+	dl = kzalloc(size, GFP_KERNEL);
+	if (!dl)
+		return -ENOMEM;
+
+	dr = dl->dev_req;
+
+	read_lock_bh(&hci_dev_list_lock);
+	list_for_each_entry(hdev, &hci_dev_list, list) {
+		hci_del_off_timer(hdev);
+
+		if (!test_bit(HCI_MGMT, &hdev->flags))
+			set_bit(HCI_PAIRABLE, &hdev->flags);
+
+		dr[n].dev_id  = hdev->id;
+		dr[n].dev_opt = hdev->flags;
+
+		/* Stop once the caller's array is full */
+		if (++n >= dev_num)
+			break;
+	}
+	read_unlock_bh(&hci_dev_list_lock);
+
+	/* Return only the header plus the entries actually filled */
+	dl->dev_num = n;
+	size = sizeof(*dl) + n * sizeof(*dr);
+
+	err = copy_to_user(arg, dl, size);
+	kfree(dl);
+
+	if (err)
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Fill in and return a struct hci_dev_info for the device whose index
+ * the caller placed in di.dev_id.
+ *
+ * As a side effect the auto-off timer is cancelled and, unless HCI_MGMT
+ * is set, HCI_PAIRABLE is set on the device.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy and -ENODEV for an
+ * unknown device. */
+int hci_get_dev_info(void __user *arg)
+{
+	struct hci_dev_info di;
+	struct hci_dev *hdev;
+	int err;
+
+	if (copy_from_user(&di, arg, sizeof(di)))
+		return -EFAULT;
+
+	hdev = hci_dev_get(di.dev_id);
+	if (hdev == NULL)
+		return -ENODEV;
+
+	hci_del_off_timer(hdev);
+
+	if (!test_bit(HCI_MGMT, &hdev->flags))
+		set_bit(HCI_PAIRABLE, &hdev->flags);
+
+	/* Identity */
+	strcpy(di.name, hdev->name);
+	di.bdaddr   = hdev->bdaddr;
+	/* Low nibble: bus type, high nibble: device type */
+	di.type     = (hdev->bus & 0x0f) | (hdev->dev_type << 4);
+	di.flags    = hdev->flags;
+
+	/* Packet types and buffer sizes */
+	di.pkt_type = hdev->pkt_type;
+	di.acl_mtu  = hdev->acl_mtu;
+	di.acl_pkts = hdev->acl_pkts;
+	di.sco_mtu  = hdev->sco_mtu;
+	di.sco_pkts = hdev->sco_pkts;
+
+	/* Link settings */
+	di.link_policy = hdev->link_policy;
+	di.link_mode   = hdev->link_mode;
+
+	memcpy(&di.stat, &hdev->stat, sizeof(di.stat));
+	memcpy(&di.features, &hdev->features, sizeof(di.features));
+
+	err = copy_to_user(arg, &di, sizeof(di)) ? -EFAULT : 0;
+
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/* ---- Interface to HCI drivers ---- */
+
+/* rfkill callback: close the device when it becomes blocked; unblocking
+ * does nothing here. Always returns 0. */
+static int hci_rfkill_set_block(void *data, bool blocked)
+{
+	struct hci_dev *hdev = data;
+
+	BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked);
+
+	if (blocked)
+		hci_dev_do_close(hdev);
+
+	return 0;
+}
+
+/* rfkill operations for HCI devices; only the block/unblock hook is
+ * provided. */
+static const struct rfkill_ops hci_rfkill_ops = {
+	.set_block = hci_rfkill_set_block,
+};
+
+/* Alloc HCI device.
+ *
+ * Returns a zero-initialised struct hci_dev with its driver_init queue
+ * set up, or NULL when the allocation fails. */
+struct hci_dev *hci_alloc_dev(void)
+{
+	struct hci_dev *hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
+
+	if (hdev != NULL)
+		skb_queue_head_init(&hdev->driver_init);
+
+	return hdev;
+}
+EXPORT_SYMBOL(hci_alloc_dev);
+
+/* Free HCI device.
+ *
+ * Drops any skbs still queued on driver_init and releases one reference
+ * on the embedded struct device. */
+void hci_free_dev(struct hci_dev *hdev)
+{
+	skb_queue_purge(&hdev->driver_init);
+
+	/* will free via device release */
+	put_device(&hdev->dev);
+}
+EXPORT_SYMBOL(hci_free_dev);
+
+/* Work handler: open the device. On success, arm the auto-off timer when
+ * HCI_AUTO_OFF is set, and announce the index to the management
+ * interface the first time (HCI_SETUP is cleared in the process). */
+static void hci_power_on(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev, power_on);
+	int err;
+
+	BT_DBG("%s", hdev->name);
+
+	err = hci_dev_open(hdev->id);
+	if (err < 0)
+		return;
+
+	if (test_bit(HCI_AUTO_OFF, &hdev->flags)) {
+		unsigned long expires;
+
+		expires = jiffies + msecs_to_jiffies(AUTO_OFF_TIMEOUT);
+		mod_timer(&hdev->off_timer, expires);
+	}
+
+	if (test_and_clear_bit(HCI_SETUP, &hdev->flags))
+		mgmt_index_added(hdev->id);
+}
+
+/* Work handler: close the device that embeds this power_off work item.
+ * The result of hci_dev_close() is ignored. */
+static void hci_power_off(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev, power_off);
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_close(hdev->id);
+}
+
+/* Timer callback (@data is the struct hci_dev pointer): clear
+ * HCI_AUTO_OFF and queue the power_off work on the device's workqueue
+ * rather than closing the device from the timer itself. */
+static void hci_auto_off(unsigned long data)
+{
+	struct hci_dev *hdev = (struct hci_dev *) data;
+
+	BT_DBG("%s", hdev->name);
+
+	clear_bit(HCI_AUTO_OFF, &hdev->flags);
+
+	queue_work(hdev->workqueue, &hdev->power_off);
+}
+
+/* Cancel a pending automatic power-off: clear HCI_AUTO_OFF and delete
+ * the off timer. */
+void hci_del_off_timer(struct hci_dev *hdev)
+{
+	BT_DBG("%s", hdev->name);
+
+	clear_bit(HCI_AUTO_OFF, &hdev->flags);
+	del_timer(&hdev->off_timer);
+}
+
+/* Unlink and free every entry on the device's UUID list. Always
+ * returns 0. */
+int hci_uuids_clear(struct hci_dev *hdev)
+{
+	struct bt_uuid *uuid, *tmp;
+
+	list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) {
+		list_del(&uuid->list);
+		kfree(uuid);
+	}
+
+	return 0;
+}
+
+/* Unlink and free every entry on hdev->link_keys. Always returns 0. */
+int hci_link_keys_clear(struct hci_dev *hdev)
+{
+	struct link_key *key, *next;
+
+	list_for_each_entry_safe(key, next, &hdev->link_keys, list) {
+		list_del(&key->list);
+		kfree(key);
+	}
+
+	return 0;
+}
+
+/* Return the first link key stored for @bdaddr, or NULL if none exists. */
+struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct link_key *key;
+
+	list_for_each_entry(key, &hdev->link_keys, list) {
+		if (!bacmp(bdaddr, &key->bdaddr))
+			return key;
+	}
+
+	return NULL;
+}
+
+/* Decide whether a link key should be stored persistently.
+ *
+ * Returns 1 if the key should be kept and 0 if it should not. @conn may
+ * be NULL. */
+static int hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn,
+						u8 key_type, u8 old_key_type)
+{
+	/* Legacy key */
+	if (key_type < 0x03)
+		return 1;
+
+	switch (key_type) {
+	case HCI_LK_DEBUG_COMBINATION:
+		/* Debug keys are insecure so don't store them persistently */
+		return 0;
+	case HCI_LK_CHANGED_COMBINATION:
+		/* Changed combination key and there's no previous one */
+		if (old_key_type == 0xff)
+			return 0;
+		break;
+	default:
+		break;
+	}
+
+	/* Security mode 3 case */
+	if (!conn)
+		return 1;
+
+	/* Neither local nor remote side had no-bonding as requirement */
+	if (conn->auth_type > 0x01 && conn->remote_auth > 0x01)
+		return 1;
+
+	/* Either side had dedicated bonding as requirement */
+	if (conn->auth_type == 0x02 || conn->auth_type == 0x03 ||
+			conn->remote_auth == 0x02 || conn->remote_auth == 0x03)
+		return 1;
+
+	/* If none of the above criteria match, then don't store the key
+	 * persistently */
+	return 0;
+}
+
+/* Look up an SMP long term key by its @ediv / @rand identifier.
+ * Returns the matching entry from hdev->link_keys, or NULL. */
+struct link_key *hci_find_ltk(struct hci_dev *hdev, __le16 ediv, u8 rand[8])
+{
+	struct link_key *k;
+
+	list_for_each_entry(k, &hdev->link_keys, list) {
+		struct key_master_id *id = (void *) &k->data;
+
+		/* The id is only inspected once type and dlen say the
+		 * entry really carries a key_master_id */
+		if (k->type == HCI_LK_SMP_LTK && k->dlen == sizeof(*id) &&
+				id->ediv == ediv &&
+				memcmp(rand, id->rand, sizeof(id->rand)) == 0)
+			return k;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(hci_find_ltk);
+
+/* Return the first key for @bdaddr whose type equals @type, or NULL. */
+struct link_key *hci_find_link_key_type(struct hci_dev *hdev,
+					bdaddr_t *bdaddr, u8 type)
+{
+	struct link_key *key;
+
+	list_for_each_entry(key, &hdev->link_keys, list) {
+		if (key->type != type)
+			continue;
+
+		if (!bacmp(bdaddr, &key->bdaddr))
+			return key;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(hci_find_link_key_type);
+
+/* Store or update the link key for @bdaddr.
+ *
+ * An existing entry for the address is reused; otherwise a new one is
+ * allocated and linked into hdev->link_keys. When @new_key is non-zero the
+ * key is reported through mgmt_new_key() and, if hci_persistent_key()
+ * returns 0, unlinked and freed again.
+ *
+ * Returns 0 on success or -ENOMEM if a new entry cannot be allocated. */
+int hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, int new_key,
+				bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len)
+{
+	struct link_key *key, *old_key;
+	u8 old_key_type, persistent;
+
+	old_key = hci_find_link_key(hdev, bdaddr);
+	if (old_key) {
+		old_key_type = old_key->type;
+		key = old_key;
+	} else {
+		/* No stored key: fall back to the connection's key type,
+		 * or 0xff when there is no connection either */
+		old_key_type = conn ? conn->key_type : 0xff;
+		key = kzalloc(sizeof(*key), GFP_ATOMIC);
+		if (!key)
+			return -ENOMEM;
+		list_add(&key->list, &hdev->link_keys);
+	}
+
+	BT_DBG("%s key for %s type %u", hdev->name, batostr(bdaddr), type);
+
+	/* Some buggy controller combinations generate a changed
+	 * combination key for legacy pairing even when there's no
+	 * previous key */
+	if (type == HCI_LK_CHANGED_COMBINATION &&
+					(!conn || conn->remote_auth == 0xff) &&
+					old_key_type == 0xff) {
+		type = HCI_LK_COMBINATION;
+		if (conn)
+			conn->key_type = type;
+	}
+
+	bacpy(&key->bdaddr, bdaddr);
+	memcpy(key->val, val, 16);
+	key->pin_len = pin_len;
+
+	/* A changed combination key keeps the type of the key it replaces */
+	if (type == HCI_LK_CHANGED_COMBINATION)
+		key->type = old_key_type;
+	else
+		key->type = type;
+
+	if (!new_key)
+		return 0;
+
+	persistent = hci_persistent_key(hdev, conn, type, old_key_type);
+
+	mgmt_new_key(hdev->id, key, persistent);
+
+	/* Non-persistent keys are reported but not kept on the list */
+	if (!persistent) {
+		list_del(&key->list);
+		kfree(key);
+	}
+
+	return 0;
+}
+
+/* Store or update the SMP long term key for @bdaddr.
+ *
+ * The entry carries a trailing key_master_id holding @ediv and @rand;
+ * @key_size is stored in pin_len. When @new_key is non-zero the key is
+ * reported through mgmt_new_key().
+ *
+ * Returns 0 on success or -ENOMEM if a new entry cannot be allocated. */
+int hci_add_ltk(struct hci_dev *hdev, int new_key, bdaddr_t *bdaddr,
+			u8 key_size, __le16 ediv, u8 rand[8], u8 ltk[16])
+{
+	struct link_key *key, *old_key;
+	struct key_master_id *id;
+	u8 old_key_type;
+
+	BT_DBG("%s addr %s", hdev->name, batostr(bdaddr));
+
+	old_key = hci_find_link_key_type(hdev, bdaddr, HCI_LK_SMP_LTK);
+	if (old_key) {
+		key = old_key;
+		old_key_type = old_key->type;
+	} else {
+		/* Room for the key_master_id is allocated behind the key */
+		key = kzalloc(sizeof(*key) + sizeof(*id), GFP_ATOMIC);
+		if (!key)
+			return -ENOMEM;
+		list_add(&key->list, &hdev->link_keys);
+		old_key_type = 0xff;
+	}
+
+	key->dlen = sizeof(*id);
+
+	bacpy(&key->bdaddr, bdaddr);
+	memcpy(key->val, ltk, sizeof(key->val));
+	key->type = HCI_LK_SMP_LTK;
+	key->pin_len = key_size;
+
+	id = (void *) &key->data;
+	id->ediv = ediv;
+	memcpy(id->rand, rand, sizeof(id->rand));
+
+	/* NOTE(review): the third argument here is old_key_type, whereas
+	 * hci_add_link_key() passes its persistent flag in that position -
+	 * confirm this is intended */
+	if (new_key)
+		mgmt_new_key(hdev->id, key, old_key_type);
+
+	return 0;
+}
+
+/* Remove and free the link key stored for @bdaddr.
+ * Returns 0 on success or -ENOENT if no key is stored. */
+int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct link_key *key = hci_find_link_key(hdev, bdaddr);
+
+	if (key) {
+		BT_DBG("%s removing %s", hdev->name, batostr(bdaddr));
+
+		list_del(&key->list);
+		kfree(key);
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/* HCI command timer function
+ *
+ * @arg is the hci_dev pointer. Logs the timeout, resets cmd_cnt to 1 and
+ * schedules the command tasklet. */
+static void hci_cmd_timer(unsigned long arg)
+{
+	struct hci_dev *hdev = (void *) arg;
+
+	BT_ERR("%s command tx timeout", hdev->name);
+	atomic_set(&hdev->cmd_cnt, 1);
+	tasklet_schedule(&hdev->cmd_task);
+}
+
+/* Return the remote OOB data stored for @bdaddr, or NULL if none. */
+struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev,
+							bdaddr_t *bdaddr)
+{
+	struct oob_data *entry;
+
+	list_for_each_entry(entry, &hdev->remote_oob_data, list) {
+		if (bacmp(bdaddr, &entry->bdaddr))
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/* Remove and free the remote OOB data stored for @bdaddr.
+ * Returns 0 on success or -ENOENT if nothing is stored. */
+int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct oob_data *entry = hci_find_remote_oob_data(hdev, bdaddr);
+
+	if (entry) {
+		BT_DBG("%s removing %s", hdev->name, batostr(bdaddr));
+
+		list_del(&entry->list);
+		kfree(entry);
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/* Unlink and free every entry on hdev->remote_oob_data. Always returns 0. */
+int hci_remote_oob_data_clear(struct hci_dev *hdev)
+{
+	struct list_head *head = &hdev->remote_oob_data;
+
+	/* Pop entries off the front until the list is empty */
+	while (!list_empty(head)) {
+		struct oob_data *entry;
+
+		entry = list_first_entry(head, struct oob_data, list);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	return 0;
+}
+
+/* Store @hash and @randomizer as remote OOB data for @bdaddr, creating
+ * the entry if none exists yet.
+ * Returns 0 on success or -ENOMEM if a new entry cannot be allocated. */
+int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *hash,
+								u8 *randomizer)
+{
+	struct oob_data *entry = hci_find_remote_oob_data(hdev, bdaddr);
+
+	if (entry == NULL) {
+		entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+		if (entry == NULL)
+			return -ENOMEM;
+
+		bacpy(&entry->bdaddr, bdaddr);
+		list_add(&entry->list, &hdev->remote_oob_data);
+	}
+
+	/* New and existing entries both get the latest values */
+	memcpy(entry->hash, hash, sizeof(entry->hash));
+	memcpy(entry->randomizer, randomizer, sizeof(entry->randomizer));
+
+	BT_DBG("%s for %s", hdev->name, batostr(bdaddr));
+
+	return 0;
+}
+
+/* Return the blacklist entry for @bdaddr, or NULL if it is not listed. */
+struct bdaddr_list *hci_blacklist_lookup(struct hci_dev *hdev,
+						bdaddr_t *bdaddr)
+{
+	struct bdaddr_list *entry;
+
+	list_for_each_entry(entry, &hdev->blacklist, list) {
+		if (!bacmp(bdaddr, &entry->bdaddr))
+			return entry;
+	}
+
+	return NULL;
+}
+
+/* Unlink and free every entry on hdev->blacklist. Always returns 0. */
+int hci_blacklist_clear(struct hci_dev *hdev)
+{
+	struct bdaddr_list *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &hdev->blacklist, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	return 0;
+}
+
+/* Add @bdaddr to the device blacklist.
+ *
+ * Returns 0 on success, -EBADF if @bdaddr is BDADDR_ANY, -EEXIST if the
+ * address is already listed, or -ENOMEM if the entry cannot be
+ * allocated. */
+int hci_blacklist_add(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct bdaddr_list *entry;
+	int err;
+
+	if (bacmp(bdaddr, BDADDR_ANY) == 0)
+		return -EBADF;
+
+	hci_dev_lock_bh(hdev);
+
+	if (hci_blacklist_lookup(hdev, bdaddr)) {
+		err = -EEXIST;
+		goto err;
+	}
+
+	/* The device lock is held with bottom halves disabled, so the
+	 * allocation must not sleep */
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		/* Leave through the unlock path; returning directly here
+		 * would leak the device lock */
+		err = -ENOMEM;
+		goto err;
+	}
+
+	bacpy(&entry->bdaddr, bdaddr);
+
+	list_add(&entry->list, &hdev->blacklist);
+
+	err = 0;
+
+err:
+	hci_dev_unlock_bh(hdev);
+	return err;
+}
+
+/* Remove @bdaddr from the blacklist; BDADDR_ANY empties the whole list.
+ * Returns 0 on success or -ENOENT if the address is not listed. */
+int hci_blacklist_del(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct bdaddr_list *entry;
+	int err = 0;
+
+	hci_dev_lock_bh(hdev);
+
+	if (bacmp(bdaddr, BDADDR_ANY) == 0) {
+		hci_blacklist_clear(hdev);
+	} else {
+		entry = hci_blacklist_lookup(hdev, bdaddr);
+		if (entry) {
+			list_del(&entry->list);
+			kfree(entry);
+		} else {
+			err = -ENOENT;
+		}
+	}
+
+	hci_dev_unlock_bh(hdev);
+	return err;
+}
+
+/* adv_timer callback; @arg is the hci_dev pointer. Empties the
+ * advertising entry list under the device lock. */
+static void hci_clear_adv_cache(unsigned long arg)
+{
+	struct hci_dev *hdev = (void *) arg;
+
+	hci_dev_lock(hdev);
+
+	hci_adv_entries_clear(hdev);
+
+	hci_dev_unlock(hdev);
+}
+
+/* Unlink and free every entry on hdev->adv_entries. Always returns 0.
+ * Takes no lock itself; the callers visible in this file hold the device
+ * lock around it. */
+int hci_adv_entries_clear(struct hci_dev *hdev)
+{
+	struct adv_entry *entry, *tmp;
+
+	list_for_each_entry_safe(entry, tmp, &hdev->adv_entries, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	BT_DBG("%s adv cache cleared", hdev->name);
+
+	return 0;
+}
+
+/* Return the cached advertising entry for @bdaddr, or NULL if none. */
+struct adv_entry *hci_find_adv_entry(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct adv_entry *cur;
+
+	list_for_each_entry(cur, &hdev->adv_entries, list) {
+		if (bacmp(bdaddr, &cur->bdaddr))
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Returns 1 for ADV_IND and ADV_DIRECT_IND event types, 0 otherwise. */
+static inline int is_connectable_adv(u8 evt_type)
+{
+	return evt_type == ADV_IND || evt_type == ADV_DIRECT_IND;
+}
+
+/* Cache the address and address type from advertising report @ev.
+ *
+ * Returns 0 if the entry was added or already present, -EINVAL if the
+ * event type is not a connectable one, or -ENOMEM on allocation
+ * failure. */
+int hci_add_adv_entry(struct hci_dev *hdev,
+					struct hci_ev_le_advertising_info *ev)
+{
+	struct adv_entry *entry;
+
+	if (!is_connectable_adv(ev->evt_type))
+		return -EINVAL;
+
+	/* Only new entries should be added to adv_entries. So, if
+	 * bdaddr was found, don't add it. */
+	if (hci_find_adv_entry(hdev, &ev->bdaddr))
+		return 0;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return -ENOMEM;
+
+	bacpy(&entry->bdaddr, &ev->bdaddr);
+	entry->bdaddr_type = ev->bdaddr_type;
+
+	list_add(&entry->list, &hdev->adv_entries);
+
+	BT_DBG("%s adv entry added: address %s type %u", hdev->name,
+				batostr(&entry->bdaddr), entry->bdaddr_type);
+
+	return 0;
+}
+
+/* Register HCI device
+ *
+ * Assigns the lowest free device id, names the device "hci<id>", links it
+ * into hci_dev_list and initialises its state, tasklets, queues, timers
+ * and work items. Then creates the workqueue, sysfs entry and rfkill
+ * switch, and queues the initial power-on work.
+ *
+ * The driver must have set the open, close and destruct callbacks.
+ *
+ * Returns the device id on success, -EINVAL if a required callback is
+ * missing, or -ENOMEM if the workqueue cannot be created. */
+int hci_register_dev(struct hci_dev *hdev)
+{
+	struct list_head *head = &hci_dev_list, *p;
+	int i, id = 0;
+
+	BT_DBG("%p name %s bus %d owner %p", hdev, hdev->name,
+						hdev->bus, hdev->owner);
+
+	if (!hdev->open || !hdev->close || !hdev->destruct)
+		return -EINVAL;
+
+	write_lock_bh(&hci_dev_list_lock);
+
+	/* Find first available device id */
+	list_for_each(p, &hci_dev_list) {
+		if (list_entry(p, struct hci_dev, list)->id != id)
+			break;
+		head = p; id++;
+	}
+
+	sprintf(hdev->name, "hci%d", id);
+	hdev->id = id;
+	/* Insert after the last consecutive id found by the scan above */
+	list_add(&hdev->list, head);
+
+	atomic_set(&hdev->refcnt, 1);
+	spin_lock_init(&hdev->lock);
+
+	hdev->flags = 0;
+	hdev->pkt_type  = (HCI_DM1 | HCI_DH1 | HCI_HV1);
+	hdev->esco_type = (ESCO_HV1);
+	hdev->link_mode = (HCI_LM_ACCEPT);
+	hdev->io_capability = 0x03; /* No Input No Output */
+
+	hdev->idle_timeout = 0;
+	hdev->sniff_max_interval = 800;
+	hdev->sniff_min_interval = 80;
+
+	tasklet_init(&hdev->cmd_task, hci_cmd_task, (unsigned long) hdev);
+	tasklet_init(&hdev->rx_task, hci_rx_task, (unsigned long) hdev);
+	tasklet_init(&hdev->tx_task, hci_tx_task, (unsigned long) hdev);
+
+	skb_queue_head_init(&hdev->rx_q);
+	skb_queue_head_init(&hdev->cmd_q);
+	skb_queue_head_init(&hdev->raw_q);
+
+	setup_timer(&hdev->cmd_timer, hci_cmd_timer, (unsigned long) hdev);
+
+	for (i = 0; i < NUM_REASSEMBLY; i++)
+		hdev->reassembly[i] = NULL;
+
+	init_waitqueue_head(&hdev->req_wait_q);
+	mutex_init(&hdev->req_lock);
+
+	inquiry_cache_init(hdev);
+
+	hci_conn_hash_init(hdev);
+
+	INIT_LIST_HEAD(&hdev->blacklist);
+
+	INIT_LIST_HEAD(&hdev->uuids);
+
+	INIT_LIST_HEAD(&hdev->link_keys);
+
+	INIT_LIST_HEAD(&hdev->remote_oob_data);
+
+	INIT_LIST_HEAD(&hdev->adv_entries);
+	setup_timer(&hdev->adv_timer, hci_clear_adv_cache,
+						(unsigned long) hdev);
+
+	INIT_WORK(&hdev->power_on, hci_power_on);
+	INIT_WORK(&hdev->power_off, hci_power_off);
+	setup_timer(&hdev->off_timer, hci_auto_off, (unsigned long) hdev);
+
+	memset(&hdev->stat, 0, sizeof(struct hci_dev_stats));
+
+	atomic_set(&hdev->promisc, 0);
+
+	write_unlock_bh(&hci_dev_list_lock);
+
+	hdev->workqueue = create_singlethread_workqueue(hdev->name);
+	if (!hdev->workqueue)
+		goto nomem;
+
+	/* A failed transform allocation is only logged; hdev->tfm then
+	 * holds the error pointer, which hci_unregister_dev() checks with
+	 * IS_ERR() before freeing */
+	hdev->tfm = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(hdev->tfm))
+		BT_INFO("Failed to load transform for ecb(aes): %ld",
+							PTR_ERR(hdev->tfm));
+
+	hci_register_sysfs(hdev);
+
+	/* rfkill is optional: on failure the device carries on without it */
+	hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev,
+				RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev);
+	if (hdev->rfkill) {
+		if (rfkill_register(hdev->rfkill) < 0) {
+			rfkill_destroy(hdev->rfkill);
+			hdev->rfkill = NULL;
+		}
+	}
+
+	/* Both flags are consumed by hci_power_on() */
+	set_bit(HCI_AUTO_OFF, &hdev->flags);
+	set_bit(HCI_SETUP, &hdev->flags);
+	queue_work(hdev->workqueue, &hdev->power_on);
+
+	hci_notify(hdev, HCI_DEV_REG);
+
+	return id;
+
+nomem:
+	/* Undo the list insertion done above */
+	write_lock_bh(&hci_dev_list_lock);
+	list_del(&hdev->list);
+	write_unlock_bh(&hci_dev_list_lock);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(hci_register_dev);
+
+/* Unregister HCI device
+ *
+ * Marks the device as unregistering, unlinks it from hci_dev_list, closes
+ * it and releases what hci_register_dev() set up: reassembly buffers,
+ * crypto transform, rfkill switch, sysfs entry, timers, workqueue and the
+ * per-device lists. Finally drops a device reference. Always returns 0. */
+int hci_unregister_dev(struct hci_dev *hdev)
+{
+	int i;
+
+	BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+
+	set_bit(HCI_UNREGISTER, &hdev->flags);
+
+	write_lock_bh(&hci_dev_list_lock);
+	list_del(&hdev->list);
+	write_unlock_bh(&hci_dev_list_lock);
+
+	hci_dev_do_close(hdev);
+
+	for (i = 0; i < NUM_REASSEMBLY; i++)
+		kfree_skb(hdev->reassembly[i]);
+
+	/* mgmt is told about the removal only when neither HCI_INIT nor
+	 * HCI_SETUP is set */
+	if (!test_bit(HCI_INIT, &hdev->flags) &&
+					!test_bit(HCI_SETUP, &hdev->flags))
+		mgmt_index_removed(hdev->id);
+
+	/* tfm holds an error pointer if allocation failed at registration */
+	if (!IS_ERR(hdev->tfm))
+		crypto_free_blkcipher(hdev->tfm);
+
+	hci_notify(hdev, HCI_DEV_UNREG);
+
+	if (hdev->rfkill) {
+		rfkill_unregister(hdev->rfkill);
+		rfkill_destroy(hdev->rfkill);
+	}
+
+	hci_unregister_sysfs(hdev);
+
+	hci_del_off_timer(hdev);
+	del_timer(&hdev->adv_timer);
+
+	destroy_workqueue(hdev->workqueue);
+
+	hci_dev_lock_bh(hdev);
+	hci_blacklist_clear(hdev);
+	hci_uuids_clear(hdev);
+	hci_link_keys_clear(hdev);
+	hci_remote_oob_data_clear(hdev);
+	hci_adv_entries_clear(hdev);
+	hci_dev_unlock_bh(hdev);
+
+	__hci_dev_put(hdev);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_unregister_dev);
+
+/* Suspend HCI device
+ *
+ * Only issues the HCI_DEV_SUSPEND notification. Always returns 0. */
+int hci_suspend_dev(struct hci_dev *hdev)
+{
+	hci_notify(hdev, HCI_DEV_SUSPEND);
+	return 0;
+}
+EXPORT_SYMBOL(hci_suspend_dev);
+
+/* Resume HCI device
+ *
+ * Only issues the HCI_DEV_RESUME notification. Always returns 0. */
+int hci_resume_dev(struct hci_dev *hdev)
+{
+	hci_notify(hdev, HCI_DEV_RESUME);
+	return 0;
+}
+EXPORT_SYMBOL(hci_resume_dev);
+
+/* Receive frame from HCI drivers
+ *
+ * Queues @skb for the rx tasklet. The frame is freed and -ENXIO returned
+ * when skb->dev is unset or the device is neither up nor initialising;
+ * otherwise returns 0. */
+int hci_recv_frame(struct sk_buff *skb)
+{
+	struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+
+	if (!hdev || !(test_bit(HCI_UP, &hdev->flags) ||
+				test_bit(HCI_INIT, &hdev->flags))) {
+		kfree_skb(skb);
+		return -ENXIO;
+	}
+
+	/* Mark the skb as incoming and time stamp it */
+	bt_cb(skb)->incoming = 1;
+	__net_timestamp(skb);
+
+	/* Hand the frame over to the rx task */
+	skb_queue_tail(&hdev->rx_q, skb);
+	tasklet_schedule(&hdev->rx_task);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_recv_frame);
+
+/* Append @count bytes from @data to the packet being reassembled in
+ * hdev->reassembly[@index], starting a new skb if none is pending.
+ *
+ * The packet header is collected first; once it is complete, its length
+ * field gives the number of payload bytes still expected. A finished
+ * frame is passed to hci_recv_frame() and the slot is cleared.
+ *
+ * Returns the number of input bytes left unconsumed, -EILSEQ for an
+ * invalid @type or @index, or -ENOMEM if the skb cannot be allocated or
+ * the announced payload does not fit into it. */
+static int hci_reassembly(struct hci_dev *hdev, int type, void *data,
+						  int count, __u8 index)
+{
+	int len = 0;
+	int hlen = 0;
+	int remain = count;
+	struct sk_buff *skb;
+	struct bt_skb_cb *scb;
+
+	if ((type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) ||
+				index >= NUM_REASSEMBLY)
+		return -EILSEQ;
+
+	skb = hdev->reassembly[index];
+
+	if (!skb) {
+		/* New packet: size the buffer for the largest frame of this
+		 * type and expect the header first */
+		switch (type) {
+		case HCI_ACLDATA_PKT:
+			len = HCI_MAX_FRAME_SIZE;
+			hlen = HCI_ACL_HDR_SIZE;
+			break;
+		case HCI_EVENT_PKT:
+			len = HCI_MAX_EVENT_SIZE;
+			hlen = HCI_EVENT_HDR_SIZE;
+			break;
+		case HCI_SCODATA_PKT:
+			len = HCI_MAX_SCO_SIZE;
+			hlen = HCI_SCO_HDR_SIZE;
+			break;
+		}
+
+		skb = bt_skb_alloc(len, GFP_ATOMIC);
+		if (!skb)
+			return -ENOMEM;
+
+		scb = (void *) skb->cb;
+		scb->expect = hlen;
+		scb->pkt_type = type;
+
+		skb->dev = (void *) hdev;
+		hdev->reassembly[index] = skb;
+	}
+
+	while (count) {
+		scb = (void *) skb->cb;
+		/* Compare at full width: truncating count to 16 bits gives
+		 * a zero length whenever its low 16 bits are zero, and the
+		 * loop would then never make progress */
+		len = min_t(unsigned int, scb->expect, count);
+
+		memcpy(skb_put(skb, len), data, len);
+
+		count -= len;
+		data += len;
+		scb->expect -= len;
+		remain = count;
+
+		/* Once exactly the header has arrived, read the payload
+		 * length from it and make sure the buffer can hold it */
+		switch (type) {
+		case HCI_EVENT_PKT:
+			if (skb->len == HCI_EVENT_HDR_SIZE) {
+				struct hci_event_hdr *h = hci_event_hdr(skb);
+				scb->expect = h->plen;
+
+				if (skb_tailroom(skb) < scb->expect) {
+					kfree_skb(skb);
+					hdev->reassembly[index] = NULL;
+					return -ENOMEM;
+				}
+			}
+			break;
+
+		case HCI_ACLDATA_PKT:
+			if (skb->len  == HCI_ACL_HDR_SIZE) {
+				struct hci_acl_hdr *h = hci_acl_hdr(skb);
+				scb->expect = __le16_to_cpu(h->dlen);
+
+				if (skb_tailroom(skb) < scb->expect) {
+					kfree_skb(skb);
+					hdev->reassembly[index] = NULL;
+					return -ENOMEM;
+				}
+			}
+			break;
+
+		case HCI_SCODATA_PKT:
+			if (skb->len == HCI_SCO_HDR_SIZE) {
+				struct hci_sco_hdr *h = hci_sco_hdr(skb);
+				scb->expect = h->dlen;
+
+				if (skb_tailroom(skb) < scb->expect) {
+					kfree_skb(skb);
+					hdev->reassembly[index] = NULL;
+					return -ENOMEM;
+				}
+			}
+			break;
+		}
+
+		if (scb->expect == 0) {
+			/* Complete frame */
+
+			bt_cb(skb)->pkt_type = type;
+			hci_recv_frame(skb);
+
+			hdev->reassembly[index] = NULL;
+			return remain;
+		}
+	}
+
+	return remain;
+}
+
+/* Feed @count bytes of a packet of the given @type into reassembly.
+ * The reassembly slot used is @type - 1.
+ * Returns the last value from hci_reassembly() (0 once all input has been
+ * consumed) or a negative error code. */
+int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count)
+{
+	int rem = 0;
+
+	if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT)
+		return -EILSEQ;
+
+	/* Each pass consumes (count - rem) bytes; carry on with the rest */
+	for (; count; count = rem) {
+		rem = hci_reassembly(hdev, type, data, count, type - 1);
+		if (rem < 0)
+			return rem;
+
+		data += (count - rem);
+	}
+
+	return rem;
+}
+EXPORT_SYMBOL(hci_recv_fragment);
+
+#define STREAM_REASSEMBLY 0
+
+/* Feed @count bytes of a byte stream into reassembly slot
+ * STREAM_REASSEMBLY. When no packet is pending, the first byte of the
+ * input is taken as the packet type; otherwise the type recorded on the
+ * pending skb is used.
+ * Returns the last value from hci_reassembly() or a negative error
+ * code. */
+int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count)
+{
+	int type;
+	int rem = 0;
+
+	while (count) {
+		struct sk_buff *skb = hdev->reassembly[STREAM_REASSEMBLY];
+
+		if (!skb) {
+			struct { char type; } *pkt;
+
+			/* Start of the frame */
+			pkt = data;
+			type = pkt->type;
+
+			/* Skip over the type byte */
+			data++;
+			count--;
+		} else
+			type = bt_cb(skb)->pkt_type;
+
+		rem = hci_reassembly(hdev, type, data, count,
+							STREAM_REASSEMBLY);
+		if (rem < 0)
+			return rem;
+
+		/* Advance past the bytes hci_reassembly() consumed */
+		data += (count - rem);
+		count = rem;
+	};
+
+	return rem;
+}
+EXPORT_SYMBOL(hci_recv_stream_fragment);
+
+/* ---- Interface to upper protocols ---- */
+
+/* Register/Unregister protocols.
+ * hci_task_lock is used to ensure that no tasks are running.
+ *
+ * Returns 0 on success, -EINVAL if hp->id is out of range, or -EEXIST if
+ * the slot is already taken. */
+int hci_register_proto(struct hci_proto *hp)
+{
+	int err = -EEXIST;
+
+	BT_DBG("%p name %s id %d", hp, hp->name, hp->id);
+
+	if (hp->id >= HCI_MAX_PROTO)
+		return -EINVAL;
+
+	write_lock_bh(&hci_task_lock);
+
+	/* Claim the slot only if nobody owns it yet */
+	if (hci_proto[hp->id] == NULL) {
+		hci_proto[hp->id] = hp;
+		err = 0;
+	}
+
+	write_unlock_bh(&hci_task_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(hci_register_proto);
+
+/* Clear the protocol slot for hp->id.
+ * Returns 0 on success, -EINVAL if hp->id is out of range, or -ENOENT if
+ * the slot is already empty. */
+int hci_unregister_proto(struct hci_proto *hp)
+{
+	int err = -ENOENT;
+
+	BT_DBG("%p name %s id %d", hp, hp->name, hp->id);
+
+	if (hp->id >= HCI_MAX_PROTO)
+		return -EINVAL;
+
+	write_lock_bh(&hci_task_lock);
+
+	if (hci_proto[hp->id] != NULL) {
+		hci_proto[hp->id] = NULL;
+		err = 0;
+	}
+
+	write_unlock_bh(&hci_task_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(hci_unregister_proto);
+
+/* Add @cb to hci_cb_list under hci_cb_list_lock. Always returns 0. */
+int hci_register_cb(struct hci_cb *cb)
+{
+	BT_DBG("%p name %s", cb, cb->name);
+
+	write_lock_bh(&hci_cb_list_lock);
+	list_add(&cb->list, &hci_cb_list);
+	write_unlock_bh(&hci_cb_list_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_register_cb);
+
+/* Remove @cb from hci_cb_list under hci_cb_list_lock. Always returns 0. */
+int hci_unregister_cb(struct hci_cb *cb)
+{
+	BT_DBG("%p name %s", cb, cb->name);
+
+	write_lock_bh(&hci_cb_list_lock);
+	list_del(&cb->list);
+	write_unlock_bh(&hci_cb_list_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_unregister_cb);
+
+/* Pass @skb to the driver's send callback.
+ *
+ * When hdev->promisc is non-zero the frame is time stamped and also
+ * handed to hci_send_to_sock(). Returns -ENODEV (after freeing the skb)
+ * if skb->dev is unset, otherwise whatever hdev->send() returns. */
+static int hci_send_frame(struct sk_buff *skb)
+{
+	struct hci_dev *hdev = (struct hci_dev *) skb->dev;
+
+	if (!hdev) {
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
+
+	if (atomic_read(&hdev->promisc)) {
+		/* Time stamp */
+		__net_timestamp(skb);
+
+		hci_send_to_sock(hdev, skb, NULL);
+	}
+
+	/* Get rid of skb owner, prior to sending to the driver. */
+	skb_orphan(skb);
+
+	return hdev->send(skb);
+}
+
+/* Send HCI command
+ *
+ * Builds a command packet from @opcode and @plen bytes of @param, queues
+ * it on cmd_q and schedules the command tasklet. During HCI_INIT the
+ * opcode is also recorded in init_last_cmd.
+ *
+ * Returns 0 on success or -ENOMEM if no skb can be allocated. */
+int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, void *param)
+{
+	int len = HCI_COMMAND_HDR_SIZE + plen;
+	struct hci_command_hdr *hdr;
+	struct sk_buff *skb;
+
+	BT_DBG("%s opcode 0x%x plen %d", hdev->name, opcode, plen);
+
+	skb = bt_skb_alloc(len, GFP_ATOMIC);
+	if (!skb) {
+		BT_ERR("%s no memory for command", hdev->name);
+		return -ENOMEM;
+	}
+
+	hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE);
+	hdr->opcode = cpu_to_le16(opcode);
+	/* NOTE(review): plen is a __u32 here; the width of hdr->plen is
+	 * not visible in this file - confirm callers stay within it */
+	hdr->plen   = plen;
+
+	if (plen)
+		memcpy(skb_put(skb, plen), param, plen);
+
+	BT_DBG("skb len %d", skb->len);
+
+	bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
+	skb->dev = (void *) hdev;
+
+	if (test_bit(HCI_INIT, &hdev->flags))
+		hdev->init_last_cmd = opcode;
+
+	skb_queue_tail(&hdev->cmd_q, skb);
+	tasklet_schedule(&hdev->cmd_task);
+
+	return 0;
+}
+
+/* Get data from the previously sent command
+ *
+ * Returns a pointer to the parameters that follow the command header in
+ * hdev->sent_cmd, or NULL if no command is recorded or its opcode differs
+ * from @opcode. */
+void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+{
+	struct hci_command_hdr *hdr;
+
+	if (!hdev->sent_cmd)
+		return NULL;
+
+	hdr = (void *) hdev->sent_cmd->data;
+
+	/* The header stores the opcode little-endian */
+	if (hdr->opcode != cpu_to_le16(opcode))
+		return NULL;
+
+	BT_DBG("%s opcode 0x%x", hdev->name, opcode);
+
+	return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
+}
+
+/* Send ACL data */
+
+/* Prepend an ACL header to @skb. The header carries @handle packed with
+ * @flags and, as data length, the skb length from before the push. */
+static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags)
+{
+	struct hci_acl_hdr *hdr;
+	int len = skb->len;
+
+	skb_push(skb, HCI_ACL_HDR_SIZE);
+	skb_reset_transport_header(skb);
+	hdr = (struct hci_acl_hdr *)skb_transport_header(skb);
+	hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags));
+	hdr->dlen   = cpu_to_le16(len);
+}
+
+/* Queue ACL data on @conn and schedule the tx tasklet.
+ *
+ * @skb gets an ACL header built from conn->handle and @flags. If it
+ * carries a frag_list, every fragment is queued as its own ACL packet
+ * with ACL_START cleared and ACL_CONT set. */
+void hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct sk_buff *list;
+
+	BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags);
+
+	skb->dev = (void *) hdev;
+	bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
+	hci_add_acl_hdr(skb, conn->handle, flags);
+
+	list = skb_shinfo(skb)->frag_list;
+	if (!list) {
+		/* Non fragmented */
+		BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len);
+
+		skb_queue_tail(&conn->data_q, skb);
+	} else {
+		/* Fragmented */
+		BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+		/* Detach the fragments; they are queued individually below */
+		skb_shinfo(skb)->frag_list = NULL;
+
+		/* Queue all fragments atomically */
+		spin_lock_bh(&conn->data_q.lock);
+
+		__skb_queue_tail(&conn->data_q, skb);
+
+		flags &= ~ACL_START;
+		flags |= ACL_CONT;
+		do {
+			skb = list; list = list->next;
+
+			skb->dev = (void *) hdev;
+			bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
+			hci_add_acl_hdr(skb, conn->handle, flags);
+
+			BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+			__skb_queue_tail(&conn->data_q, skb);
+		} while (list);
+
+		spin_unlock_bh(&conn->data_q.lock);
+	}
+
+	tasklet_schedule(&hdev->tx_task);
+}
+EXPORT_SYMBOL(hci_send_acl);
+
+/* Send SCO data
+ *
+ * Prepends a SCO header (connection handle plus the payload length taken
+ * before the push) to @skb, queues it on @conn and schedules the tx
+ * tasklet. */
+void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_sco_hdr hdr;
+
+	BT_DBG("%s len %d", hdev->name, skb->len);
+
+	hdr.handle = cpu_to_le16(conn->handle);
+	hdr.dlen   = skb->len;
+
+	skb_push(skb, HCI_SCO_HDR_SIZE);
+	skb_reset_transport_header(skb);
+	memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE);
+
+	skb->dev = (void *) hdev;
+	bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
+
+	skb_queue_tail(&conn->data_q, skb);
+	tasklet_schedule(&hdev->tx_task);
+}
+EXPORT_SYMBOL(hci_send_sco);
+
+/* ---- HCI TX task (outgoing data) ---- */
+
+/* HCI Connection scheduler
+ *
+ * Among connections of link type @type that have queued data and are in
+ * state BT_CONNECTED or BT_CONFIG, picks the one with the lowest sent
+ * counter. *@quote is set to the number of packets it may send: the
+ * available buffer count divided by the number of eligible connections,
+ * but at least 1. Returns NULL with *@quote = 0 if nothing is eligible. */
+static inline struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn *conn = NULL;
+	/* NOTE(review): min starts as ~0 in an int; selecting the first
+	 * candidate relies on how "c->sent < min" is evaluated, which
+	 * depends on the type of c->sent (not visible here) - confirm */
+	int num = 0, min = ~0;
+	struct list_head *p;
+
+	/* We don't have to lock device here. Connections are always
+	 * added and removed with TX task disabled. */
+	list_for_each(p, &h->list) {
+		struct hci_conn *c;
+		c = list_entry(p, struct hci_conn, list);
+
+		if (c->type != type || skb_queue_empty(&c->data_q))
+			continue;
+
+		if (c->state != BT_CONNECTED && c->state != BT_CONFIG)
+			continue;
+
+		num++;
+
+		if (c->sent < min) {
+			min  = c->sent;
+			conn = c;
+		}
+	}
+
+	if (conn) {
+		int cnt, q;
+
+		switch (conn->type) {
+		case ACL_LINK:
+			cnt = hdev->acl_cnt;
+			break;
+		case SCO_LINK:
+		case ESCO_LINK:
+			cnt = hdev->sco_cnt;
+			break;
+		case LE_LINK:
+			/* LE uses the ACL count when le_mtu is zero */
+			cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+			break;
+		default:
+			cnt = 0;
+			BT_ERR("Unknown link type");
+		}
+
+		/* num is at least 1 here because conn was counted above */
+		q = cnt / num;
+		*quote = q ? q : 1;
+	} else
+		*quote = 0;
+
+	BT_DBG("conn %p quote %d", conn, *quote);
+	return conn;
+}
+
+/* Link tx timeout handling: disconnects, with reason 0x13, every
+ * connection of link type @type that has a non-zero sent counter. */
+static inline void hci_link_tx_to(struct hci_dev *hdev, __u8 type)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn *c;
+
+	BT_ERR("%s link tx timeout", hdev->name);
+
+	/* Kill stalled connections */
+	list_for_each_entry(c, &h->list, list) {
+		if (c->type != type || !c->sent)
+			continue;
+
+		BT_ERR("%s killing stalled connection %s",
+			hdev->name, batostr(&c->dst));
+		hci_acl_disconn(c, 0x13);
+	}
+}
+
+/* Schedule ACL: sends queued ACL packets while acl_cnt is non-zero,
+ * giving each connection picked by hci_low_sent() its quote per round. */
+static inline void hci_sched_acl(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote;
+
+	BT_DBG("%s", hdev->name);
+
+	if (!test_bit(HCI_RAW, &hdev->flags)) {
+		/* ACL tx timeout must be longer than maximum
+		 * link supervision timeout (40.9 seconds) */
+		if (!hdev->acl_cnt && time_after(jiffies, hdev->acl_last_tx + HZ * 45))
+			hci_link_tx_to(hdev, ACL_LINK);
+	}
+
+	while (hdev->acl_cnt && (conn = hci_low_sent(hdev, ACL_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+
+			hci_conn_enter_active_mode(conn, bt_cb(skb)->force_active);
+
+			hci_send_frame(skb);
+			hdev->acl_last_tx = jiffies;
+
+			/* One buffer credit used, one more packet in flight */
+			hdev->acl_cnt--;
+			conn->sent++;
+		}
+	}
+}
+
+/* Schedule SCO
+ *
+ * Sends queued SCO_LINK packets while sco_cnt is non-zero.
+ * NOTE(review): sco_cnt is not decremented in this loop, unlike acl_cnt
+ * in hci_sched_acl() - confirm this is intended. */
+static inline void hci_sched_sco(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote;
+
+	BT_DBG("%s", hdev->name);
+
+	while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+			hci_send_frame(skb);
+
+			/* Reset the counter to 0 once it reaches ~0 */
+			conn->sent++;
+			if (conn->sent == ~0)
+				conn->sent = 0;
+		}
+	}
+}
+
+/* Schedule eSCO: same loop as hci_sched_sco() but for ESCO_LINK
+ * connections; it is gated on the same sco_cnt counter. */
+static inline void hci_sched_esco(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote;
+
+	BT_DBG("%s", hdev->name);
+
+	while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+			hci_send_frame(skb);
+
+			/* Reset the counter to 0 once it reaches ~0 */
+			conn->sent++;
+			if (conn->sent == ~0)
+				conn->sent = 0;
+		}
+	}
+}
+
+/* Schedule LE: sends queued LE_LINK packets. The buffer count comes from
+ * le_cnt when le_pkts is non-zero and from acl_cnt otherwise; the
+ * remaining count is written back to the same field afterwards. */
+static inline void hci_sched_le(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote, cnt;
+
+	BT_DBG("%s", hdev->name);
+
+	if (!test_bit(HCI_RAW, &hdev->flags)) {
+		/* LE tx timeout must be longer than maximum
+		 * link supervision timeout (40.9 seconds) */
+		if (!hdev->le_cnt && hdev->le_pkts &&
+				time_after(jiffies, hdev->le_last_tx + HZ * 45))
+			hci_link_tx_to(hdev, LE_LINK);
+	}
+
+	/* NOTE(review): this selects on le_pkts while hci_low_sent()
+	 * selects on le_mtu for LE_LINK - confirm the two always agree */
+	cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt;
+	while (cnt && (conn = hci_low_sent(hdev, LE_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+
+			hci_send_frame(skb);
+			hdev->le_last_tx = jiffies;
+
+			cnt--;
+			conn->sent++;
+		}
+	}
+	if (hdev->le_pkts)
+		hdev->le_cnt = cnt;
+	else
+		hdev->acl_cnt = cnt;
+}
+
+/* TX tasklet; @arg is the hci_dev pointer. Runs the ACL, SCO, eSCO and LE
+ * schedulers in that order, then sends everything on raw_q, all under the
+ * hci_task_lock read lock. */
+static void hci_tx_task(unsigned long arg)
+{
+	struct hci_dev *hdev = (struct hci_dev *) arg;
+	struct sk_buff *skb;
+
+	read_lock(&hci_task_lock);
+
+	BT_DBG("%s acl %d sco %d le %d", hdev->name, hdev->acl_cnt,
+		hdev->sco_cnt, hdev->le_cnt);
+
+	/* Schedule queues and send stuff to HCI driver */
+
+	hci_sched_acl(hdev);
+
+	hci_sched_sco(hdev);
+
+	hci_sched_esco(hdev);
+
+	hci_sched_le(hdev);
+
+	/* Send next queued raw (unknown type) packet */
+	while ((skb = skb_dequeue(&hdev->raw_q)))
+		hci_send_frame(skb);
+
+	read_unlock(&hci_task_lock);
+}
+
+/* ----- HCI RX task (incoming data processing) ----- */
+
+/* ACL data packet
+ *
+ * Strips the ACL header, looks up the connection by handle and passes the
+ * payload to the L2CAP protocol's recv_acldata hook. The skb is freed
+ * here if the connection is unknown or no such hook is registered. */
+static inline void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_acl_hdr *hdr = (void *) skb->data;
+	struct hci_conn *conn;
+	__u16 handle, flags;
+
+	skb_pull(skb, HCI_ACL_HDR_SIZE);
+
+	/* The header field packs both the flags and the handle */
+	handle = __le16_to_cpu(hdr->handle);
+	flags  = hci_flags(handle);
+	handle = hci_handle(handle);
+
+	BT_DBG("%s len %d handle 0x%x flags 0x%x", hdev->name, skb->len, handle, flags);
+
+	hdev->stat.acl_rx++;
+
+	hci_dev_lock(hdev);
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	hci_dev_unlock(hdev);
+
+	if (conn) {
+		register struct hci_proto *hp;
+
+		hci_conn_enter_active_mode(conn, bt_cb(skb)->force_active);
+
+		/* Send to upper protocol */
+		hp = hci_proto[HCI_PROTO_L2CAP];
+		if (hp && hp->recv_acldata) {
+			hp->recv_acldata(conn, skb, flags);
+			return;
+		}
+	} else {
+		BT_ERR("%s ACL packet for unknown connection handle %d",
+			hdev->name, handle);
+	}
+
+	kfree_skb(skb);
+}
+
+/* SCO data packet
+ *
+ * Strips the SCO header, looks up the connection by handle and passes the
+ * payload to the SCO protocol's recv_scodata hook. The skb is freed here
+ * if the connection is unknown or no such hook is registered. */
+static inline void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_sco_hdr *hdr = (void *) skb->data;
+	struct hci_conn *conn;
+	__u16 handle;
+
+	skb_pull(skb, HCI_SCO_HDR_SIZE);
+
+	handle = __le16_to_cpu(hdr->handle);
+
+	BT_DBG("%s len %d handle 0x%x", hdev->name, skb->len, handle);
+
+	hdev->stat.sco_rx++;
+
+	hci_dev_lock(hdev);
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	hci_dev_unlock(hdev);
+
+	if (conn) {
+		register struct hci_proto *hp;
+
+		/* Send to upper protocol */
+		hp = hci_proto[HCI_PROTO_SCO];
+		if (hp && hp->recv_scodata) {
+			hp->recv_scodata(conn, skb);
+			return;
+		}
+	} else {
+		BT_ERR("%s SCO packet for unknown connection handle %d",
+			hdev->name, handle);
+	}
+
+	kfree_skb(skb);
+}
+
+/* RX tasklet; @arg is the hci_dev pointer. Drains rx_q under the
+ * hci_task_lock read lock: frames are copied to sockets when promisc is
+ * non-zero, dropped when the device is in HCI_RAW mode, and otherwise
+ * dispatched by packet type. */
+static void hci_rx_task(unsigned long arg)
+{
+	struct hci_dev *hdev = (struct hci_dev *) arg;
+	struct sk_buff *skb;
+
+	BT_DBG("%s", hdev->name);
+
+	read_lock(&hci_task_lock);
+
+	while ((skb = skb_dequeue(&hdev->rx_q))) {
+		if (atomic_read(&hdev->promisc)) {
+			/* Send copy to the sockets */
+			hci_send_to_sock(hdev, skb, NULL);
+		}
+
+		if (test_bit(HCI_RAW, &hdev->flags)) {
+			kfree_skb(skb);
+			continue;
+		}
+
+		if (test_bit(HCI_INIT, &hdev->flags)) {
+			/* Don't process data packets in this state. */
+			switch (bt_cb(skb)->pkt_type) {
+			case HCI_ACLDATA_PKT:
+			case HCI_SCODATA_PKT:
+				kfree_skb(skb);
+				continue;
+			}
+		}
+
+		/* Process frame */
+		switch (bt_cb(skb)->pkt_type) {
+		case HCI_EVENT_PKT:
+			hci_event_packet(hdev, skb);
+			break;
+
+		case HCI_ACLDATA_PKT:
+			BT_DBG("%s ACL data packet", hdev->name);
+			hci_acldata_packet(hdev, skb);
+			break;
+
+		case HCI_SCODATA_PKT:
+			BT_DBG("%s SCO data packet", hdev->name);
+			hci_scodata_packet(hdev, skb);
+			break;
+
+		default:
+			kfree_skb(skb);
+			break;
+		}
+	}
+
+	read_unlock(&hci_task_lock);
+}
+
+/* Command tasklet; @arg is the hci_dev pointer. When cmd_cnt is non-zero
+ * it sends the next queued command, keeping a clone of it in
+ * hdev->sent_cmd. */
+static void hci_cmd_task(unsigned long arg)
+{
+	struct hci_dev *hdev = (struct hci_dev *) arg;
+	struct sk_buff *skb;
+
+	BT_DBG("%s cmd %d", hdev->name, atomic_read(&hdev->cmd_cnt));
+
+	/* Send queued commands */
+	if (atomic_read(&hdev->cmd_cnt)) {
+		skb = skb_dequeue(&hdev->cmd_q);
+		if (!skb)
+			return;
+
+		/* Drop the record of the previously sent command */
+		kfree_skb(hdev->sent_cmd);
+
+		hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC);
+		if (hdev->sent_cmd) {
+			atomic_dec(&hdev->cmd_cnt);
+			hci_send_frame(skb);
+			/* The command timer is stopped while HCI_RESET is
+			 * set and re-armed otherwise */
+			if (test_bit(HCI_RESET, &hdev->flags))
+				del_timer(&hdev->cmd_timer);
+			else
+				mod_timer(&hdev->cmd_timer,
+				  jiffies + msecs_to_jiffies(HCI_CMD_TIMEOUT));
+		} else {
+			/* Clone failed: put the command back at the head
+			 * of the queue and try again later */
+			skb_queue_head(&hdev->cmd_q, skb);
+			tasklet_schedule(&hdev->cmd_task);
+		}
+	}
+}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
new file mode 100755
index 00000000..5a7074a7
--- /dev/null
+++ b/net/bluetooth/hci_event.c
@@ -0,0 +1,3118 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI event handling. */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+static int enable_le;
+
+/* Handle HCI Event packets */
+
+/*
+ * Handle the reply to HCI_OP_INQUIRY_CANCEL.
+ *
+ * The first byte of skb->data is the status. A non-zero status is only
+ * traced. Otherwise HCI_INQUIRY is cleared (mgmt is told discovery stopped
+ * when the bit was set and HCI_MGMT is set), the request is completed and
+ * pending connections are re-checked.
+ */
+static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 result = skb->data[0];
+
+	BT_DBG("%s status 0x%x", hdev->name, result);
+
+	if (result != 0)
+		return;
+
+	if (test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) {
+		if (test_bit(HCI_MGMT, &hdev->flags))
+			mgmt_discovering(hdev->id, 0);
+	}
+
+	hci_req_complete(hdev, HCI_OP_INQUIRY_CANCEL, result);
+
+	hci_conn_check_pending(hdev);
+}
+
+/*
+ * Handle the reply named for Exit Periodic Inquiry.
+ *
+ * On a zero status byte, HCI_INQUIRY is cleared (mgmt is told discovery
+ * stopped when the bit was set and HCI_MGMT is set) and pending
+ * connections are re-checked. A non-zero status is only traced.
+ */
+static void hci_cc_exit_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 result = skb->data[0];
+
+	BT_DBG("%s status 0x%x", hdev->name, result);
+
+	if (result != 0)
+		return;
+
+	if (test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) {
+		if (test_bit(HCI_MGMT, &hdev->flags))
+			mgmt_discovering(hdev->id, 0);
+	}
+
+	hci_conn_check_pending(hdev);
+}
+
+static void hci_cc_remote_name_req_cancel(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Only traces the device name; skb is not examined. */
+	BT_DBG("%s", hdev->name);
+}
+
+/*
+ * Handle a Role Discovery reply (struct hci_rp_role_discovery).
+ *
+ * On success the connection matching the reply's handle is looked up under
+ * the device lock; its HCI_LM_MASTER link-mode bit is set when the reported
+ * role is zero and cleared otherwise.
+ */
+static void hci_cc_role_discovery(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_rp_role_discovery *reply = (void *) skb->data;
+	struct hci_conn *link;
+
+	BT_DBG("%s status 0x%x", hdev->name, reply->status);
+
+	if (reply->status)
+		return;
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(reply->handle));
+	if (!link)
+		goto unlock;
+
+	if (reply->role == 0)
+		link->link_mode |= HCI_LM_MASTER;
+	else
+		link->link_mode &= ~HCI_LM_MASTER;
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cc_read_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a Read Link Policy reply: on success, store the reported
+	 * policy in the connection identified by rp->handle (if any), under
+	 * the device lock.
+	 */
+	struct hci_rp_read_link_policy *rp = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+	if (conn)
+		conn->link_policy = __le16_to_cpu(rp->policy);
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cc_write_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_LINK_POLICY: on success, copy the
+	 * policy that was sent into the connection identified by rp->handle.
+	 * Nothing is done if the sent command data is unavailable.
+	 */
+	struct hci_rp_write_link_policy *rp = (void *) skb->data;
+	struct hci_conn *conn;
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LINK_POLICY);
+	if (!sent)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+	if (conn)	/* policy is the 16-bit LE value at offset 2 of the sent data */
+		conn->link_policy = get_unaligned_le16(sent + 2);
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Handle a Read Default Link Policy reply: on success, cache the reported
+ * policy (converted from little endian) in hdev->link_policy.
+ */
+static void hci_cc_read_def_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_rp_read_def_link_policy *reply = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, reply->status);
+
+	if (!reply->status)
+		hdev->link_policy = __le16_to_cpu(reply->policy);
+}
+
+static void hci_cc_write_def_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_DEF_LINK_POLICY: on success, cache
+	 * the policy that was sent in hdev->link_policy, then complete the
+	 * request with the reply status. Returns early, without completing
+	 * the request, when the sent command data is unavailable.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_LINK_POLICY);
+	if (!sent)
+		return;
+
+	if (!status)
+		hdev->link_policy = get_unaligned_le16(sent);
+
+	hci_req_complete(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, status);
+}
+
+static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_RESET: HCI_RESET is cleared whatever the
+	 * status, then the request is completed with that status.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	clear_bit(HCI_RESET, &hdev->flags);
+
+	hci_req_complete(hdev, HCI_OP_RESET, status);
+}
+
+static void hci_cc_write_local_name(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_LOCAL_NAME. When HCI_MGMT is set,
+	 * mgmt is told the outcome whether it succeeded or failed;
+	 * hdev->dev_name is updated from the sent name only on success.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LOCAL_NAME);
+	if (!sent)
+		return;
+
+	if (test_bit(HCI_MGMT, &hdev->flags))
+		mgmt_set_local_name_complete(hdev->id, sent, status);
+
+	if (status)
+		return;
+
+	memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH);
+}
+
+/*
+ * Handle a Read Local Name reply: on success, copy HCI_MAX_NAME_LENGTH
+ * bytes of the reported name into hdev->dev_name.
+ */
+static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_rp_read_local_name *reply = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, reply->status);
+
+	if (!reply->status)
+		memcpy(hdev->dev_name, reply->name, HCI_MAX_NAME_LENGTH);
+}
+
+/*
+ * Handle the reply to HCI_OP_WRITE_AUTH_ENABLE.
+ *
+ * On success HCI_AUTH is set when the parameter byte that was sent equals
+ * AUTH_ENABLED and cleared otherwise. The request is completed with the
+ * reply status unless the sent command data is unavailable.
+ */
+static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 result = skb->data[0];
+	__u8 *params;
+
+	BT_DBG("%s status 0x%x", hdev->name, result);
+
+	params = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_ENABLE);
+	if (!params)
+		return;
+
+	if (result)
+		goto complete;
+
+	if (*params == AUTH_ENABLED)
+		set_bit(HCI_AUTH, &hdev->flags);
+	else
+		clear_bit(HCI_AUTH, &hdev->flags);
+
+complete:
+	hci_req_complete(hdev, HCI_OP_WRITE_AUTH_ENABLE, result);
+}
+
+static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_ENCRYPT_MODE: on success,
+	 * HCI_ENCRYPT mirrors whether the parameter byte that was sent is
+	 * non-zero; the request is then completed with the reply status.
+	 * Returns early if the sent command data is unavailable.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_ENCRYPT_MODE);
+	if (!sent)
+		return;
+
+	if (!status) {
+		__u8 param = *((__u8 *) sent);
+
+		if (param)
+			set_bit(HCI_ENCRYPT, &hdev->flags);
+		else
+			clear_bit(HCI_ENCRYPT, &hdev->flags);
+	}
+
+	hci_req_complete(hdev, HCI_OP_WRITE_ENCRYPT_MODE, status);
+}
+
+static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_SCAN_ENABLE. On success the
+	 * HCI_PSCAN/HCI_ISCAN flags are rebuilt from the SCAN_PAGE and
+	 * SCAN_INQUIRY bits of the parameter that was sent, and
+	 * mgmt_connectable()/mgmt_discoverable() are called only for a bit
+	 * whose state changed. The request is then completed with status.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SCAN_ENABLE);
+	if (!sent)
+		return;
+
+	if (!status) {
+		__u8 param = *((__u8 *) sent);
+		int old_pscan, old_iscan;
+
+		old_pscan = test_and_clear_bit(HCI_PSCAN, &hdev->flags);	/* remember, then clear */
+		old_iscan = test_and_clear_bit(HCI_ISCAN, &hdev->flags);
+
+		if (param & SCAN_INQUIRY) {
+			set_bit(HCI_ISCAN, &hdev->flags);
+			if (!old_iscan)
+				mgmt_discoverable(hdev->id, 1);
+		} else if (old_iscan)
+			mgmt_discoverable(hdev->id, 0);
+
+		if (param & SCAN_PAGE) {
+			set_bit(HCI_PSCAN, &hdev->flags);
+			if (!old_pscan)
+				mgmt_connectable(hdev->id, 1);
+		} else if (old_pscan)
+			mgmt_connectable(hdev->id, 0);
+	}
+
+	hci_req_complete(hdev, HCI_OP_WRITE_SCAN_ENABLE, status);
+}
+
+static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a Read Class of Device reply: on success, copy the 3-byte
+	 * class into hdev->dev_class and trace it (byte 2 first).
+	 */
+	struct hci_rp_read_class_of_dev *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	memcpy(hdev->dev_class, rp->dev_class, 3);
+
+	BT_DBG("%s class 0x%.2x%.2x%.2x", hdev->name,
+		hdev->dev_class[2], hdev->dev_class[1], hdev->dev_class[0]);
+}
+
+static void hci_cc_write_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_CLASS_OF_DEV: on success, copy
+	 * the first 3 bytes of the sent parameters into hdev->dev_class.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_CLASS_OF_DEV);
+	if (!sent)
+		return;
+
+	memcpy(hdev->dev_class, sent, 3);
+}
+
+static void hci_cc_read_voice_setting(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a Read Voice Setting reply: on success, if the reported
+	 * setting differs from hdev->voice_setting, cache it and call
+	 * hdev->notify (when set) with HCI_NOTIFY_VOICE_SETTING.
+	 */
+	struct hci_rp_read_voice_setting *rp = (void *) skb->data;
+	__u16 setting;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	setting = __le16_to_cpu(rp->voice_setting);
+
+	if (hdev->voice_setting == setting)	/* unchanged: nothing to notify */
+		return;
+
+	hdev->voice_setting = setting;
+
+	BT_DBG("%s voice setting 0x%04x", hdev->name, setting);
+
+	if (hdev->notify) {	/* tx_task is kept disabled around the callback */
+		tasklet_disable(&hdev->tx_task);
+		hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+		tasklet_enable(&hdev->tx_task);
+	}
+}
+
+static void hci_cc_write_voice_setting(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_VOICE_SETTING: on success, if the
+	 * setting that was sent differs from hdev->voice_setting, cache it
+	 * and call hdev->notify (when set) with HCI_NOTIFY_VOICE_SETTING.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	__u16 setting;
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_VOICE_SETTING);
+	if (!sent)
+		return;
+
+	setting = get_unaligned_le16(sent);
+
+	if (hdev->voice_setting == setting)	/* unchanged: nothing to notify */
+		return;
+
+	hdev->voice_setting = setting;
+
+	BT_DBG("%s voice setting 0x%04x", hdev->name, setting);
+
+	if (hdev->notify) {	/* tx_task is kept disabled around the callback */
+		tasklet_disable(&hdev->tx_task);
+		hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+		tasklet_enable(&hdev->tx_task);
+	}
+}
+
+/*
+ * Handle the reply to HCI_OP_HOST_BUFFER_SIZE: trace the status byte and
+ * complete the request with it.
+ */
+static void hci_cc_host_buffer_size(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 result = skb->data[0];
+
+	BT_DBG("%s status 0x%x", hdev->name, result);
+
+	hci_req_complete(hdev, HCI_OP_HOST_BUFFER_SIZE, result);
+}
+
+/*
+ * Handle a Read SSP Mode reply: on success, cache the reported mode in
+ * hdev->ssp_mode.
+ */
+static void hci_cc_read_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_rp_read_ssp_mode *reply = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, reply->status);
+
+	if (!reply->status)
+		hdev->ssp_mode = reply->mode;
+}
+
+static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_WRITE_SSP_MODE: on success, cache the
+	 * mode byte that was sent in hdev->ssp_mode.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+	void *sent;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_MODE);
+	if (!sent)
+		return;
+
+	hdev->ssp_mode = *((__u8 *) sent);
+}
+
+/*
+ * Pick the inquiry mode value for this controller.
+ *
+ * Returns 2 when LMP_EXT_INQ is present in features[6], 1 when
+ * LMP_RSSI_INQ is present in features[3] or the controller matches one of
+ * the listed manufacturer / hci_rev / lmp_subver combinations, else 0.
+ */
+static u8 hci_get_inquiry_mode(struct hci_dev *hdev)
+{
+	if (hdev->features[6] & LMP_EXT_INQ)
+		return 2;
+
+	if (hdev->features[3] & LMP_RSSI_INQ)
+		return 1;
+
+	/* Controllers that get mode 1 without advertising LMP_RSSI_INQ */
+	switch (hdev->manufacturer) {
+	case 11:
+		if (hdev->hci_rev == 0x00 && hdev->lmp_subver == 0x0757)
+			return 1;
+		break;
+
+	case 15:
+		if (hdev->lmp_subver == 0x6963 &&
+				(hdev->hci_rev == 0x03 || hdev->hci_rev == 0x09))
+			return 1;
+		if (hdev->lmp_subver == 0x6965 && hdev->hci_rev == 0x00)
+			return 1;
+		break;
+
+	case 31:
+		if (hdev->hci_rev == 0x2005 && hdev->lmp_subver == 0x1805)
+			return 1;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Send HCI_OP_WRITE_INQUIRY_MODE with the one-byte mode chosen by
+ * hci_get_inquiry_mode().
+ */
+static void hci_setup_inquiry_mode(struct hci_dev *hdev)
+{
+	u8 inq_mode = hci_get_inquiry_mode(hdev);
+
+	hci_send_cmd(hdev, HCI_OP_WRITE_INQUIRY_MODE, sizeof(inq_mode),
+								&inq_mode);
+}
+
+/*
+ * Build the 8-byte event mask and send it with HCI_OP_SET_EVENT_MASK.
+ *
+ * Nothing is sent when hdev->lmp_ver <= 1. Otherwise a fixed set of bits
+ * is always enabled and further bits are added according to the feature
+ * bits in hdev->features[].
+ */
+static void hci_setup_event_mask(struct hci_dev *hdev)
+{
+	/* The second byte is 0xff instead of 0x9f (two reserved bits
+	 * disabled) since a Broadcom 1.2 dongle doesn't respond to the
+	 * command otherwise */
+	u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 };
+
+	/* CSR 1.1 dongles do not accept any bitfield so don't try to set
+	 * any event mask for pre 1.2 devices */
+	if (hdev->lmp_ver <= 1)
+		return;
+
+	events[4] |= 0x01; /* Flow Specification Complete */
+	events[4] |= 0x02; /* Inquiry Result with RSSI */
+	events[4] |= 0x04; /* Read Remote Extended Features Complete */
+	events[5] |= 0x08; /* Synchronous Connection Complete */
+	events[5] |= 0x10; /* Synchronous Connection Changed */
+
+	/* This branch used to OR in 0x04, which is the Read Remote Extended
+	 * Features Complete bit, not the RSSI one. The resulting mask is
+	 * unchanged because 0x02 is also set unconditionally above. */
+	if (hdev->features[3] & LMP_RSSI_INQ)
+		events[4] |= 0x02; /* Inquiry Result with RSSI */
+
+	if (hdev->features[5] & LMP_SNIFF_SUBR)
+		events[5] |= 0x20; /* Sniff Subrating */
+
+	if (hdev->features[5] & LMP_PAUSE_ENC)
+		events[5] |= 0x80; /* Encryption Key Refresh Complete */
+
+	if (hdev->features[6] & LMP_EXT_INQ)
+		events[5] |= 0x40; /* Extended Inquiry Result */
+
+	if (hdev->features[6] & LMP_NO_FLUSH)
+		events[7] |= 0x01; /* Enhanced Flush Complete */
+
+	if (hdev->features[7] & LMP_LSTO)
+		events[6] |= 0x80; /* Link Supervision Timeout Changed */
+
+	if (hdev->features[6] & LMP_SIMPLE_PAIR) {
+		events[6] |= 0x01;	/* IO Capability Request */
+		events[6] |= 0x02;	/* IO Capability Response */
+		events[6] |= 0x04;	/* User Confirmation Request */
+		events[6] |= 0x08;	/* User Passkey Request */
+		events[6] |= 0x10;	/* Remote OOB Data Request */
+		events[6] |= 0x20;	/* Simple Pairing Complete */
+		events[7] |= 0x04;	/* User Passkey Notification */
+		events[7] |= 0x08;	/* Keypress Notification */
+		events[7] |= 0x10;	/* Remote Host Supported
+					 * Features Notification */
+	}
+
+	if (hdev->features[4] & LMP_LE)
+		events[7] |= 0x20;	/* LE Meta-Event */
+
+	hci_send_cmd(hdev, HCI_OP_SET_EVENT_MASK, sizeof(events), events);
+}
+
+static void hci_set_le_support(struct hci_dev *hdev)
+{	/* Send HCI_OP_WRITE_LE_HOST_SUPPORTED. cp is zeroed first, so both
+	 * fields go out as 0 unless the file-scope enable_le is non-zero.
+	 */
+	struct hci_cp_write_le_host_supported cp;
+
+	memset(&cp, 0, sizeof(cp));
+
+	if (enable_le) {
+		cp.le = 1;
+		cp.simul = !!(hdev->features[6] & LMP_SIMUL_LE_BR);	/* normalised to 0/1 */
+	}
+
+	hci_send_cmd(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp);
+}
+
+static void hci_setup(struct hci_dev *hdev)
+{	/* Queue the setup commands selected by hdev->lmp_ver and the feature
+	 * bits in hdev->features[]. Called from hci_cc_read_local_version()
+	 * while HCI_INIT is set.
+	 */
+	hci_setup_event_mask(hdev);
+
+	if (hdev->lmp_ver > 1)
+		hci_send_cmd(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL);
+
+	if (hdev->features[6] & LMP_SIMPLE_PAIR) {
+		u8 mode = 0x01;
+		hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode);
+	}
+
+	if (hdev->features[3] & LMP_RSSI_INQ)
+		hci_setup_inquiry_mode(hdev);
+
+	if (hdev->features[7] & LMP_INQ_TX_PWR)
+		hci_send_cmd(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL);
+
+	if (hdev->features[7] & LMP_EXTFEATURES) {
+		struct hci_cp_read_local_ext_features cp;
+
+		cp.page = 0x01;	/* request extended features page 1 */
+		hci_send_cmd(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES,
+							sizeof(cp), &cp);
+	}
+
+	if (hdev->features[4] & LMP_LE)
+		hci_set_le_support(hdev);
+}
+
+static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a Read Local Version reply: on success, cache the version
+	 * fields in hdev (16-bit ones converted from little endian) and, if
+	 * HCI_INIT is set, continue with hci_setup().
+	 */
+	struct hci_rp_read_local_version *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hdev->hci_ver = rp->hci_ver;
+	hdev->hci_rev = __le16_to_cpu(rp->hci_rev);
+	hdev->lmp_ver = rp->lmp_ver;
+	hdev->manufacturer = __le16_to_cpu(rp->manufacturer);
+	hdev->lmp_subver = __le16_to_cpu(rp->lmp_subver);
+
+	BT_DBG("%s manufacturer %d hci ver %d:%d", hdev->name,
+					hdev->manufacturer,
+					hdev->hci_ver, hdev->hci_rev);
+
+	if (test_bit(HCI_INIT, &hdev->flags))
+		hci_setup(hdev);
+}
+
+/*
+ * Derive the default link policy from the local feature bits and send it
+ * with HCI_OP_WRITE_DEF_LINK_POLICY.
+ *
+ * The policy is accumulated in host byte order and converted into a
+ * separate __le16 for transmission, so the little-endian value is never
+ * stored in a host-endian variable.
+ */
+static void hci_setup_link_policy(struct hci_dev *hdev)
+{
+	u16 link_policy = 0;
+	__le16 wire_policy;
+
+	if (hdev->features[0] & LMP_RSWITCH)
+		link_policy |= HCI_LP_RSWITCH;
+	if (hdev->features[0] & LMP_HOLD)
+		link_policy |= HCI_LP_HOLD;
+	if (hdev->features[0] & LMP_SNIFF)
+		link_policy |= HCI_LP_SNIFF;
+	if (hdev->features[1] & LMP_PARK)
+		link_policy |= HCI_LP_PARK;
+
+	wire_policy = cpu_to_le16(link_policy);
+	hci_send_cmd(hdev, HCI_OP_WRITE_DEF_LINK_POLICY,
+					sizeof(wire_policy), &wire_policy);
+}
+
+/*
+ * Handle the reply to HCI_OP_READ_LOCAL_COMMANDS.
+ *
+ * On success the command bitmap is cached in hdev->commands and, when
+ * HCI_INIT is set and bit 0x10 of commands[5] is present, the default link
+ * policy is set up. The request is completed with the reply status in
+ * every case.
+ */
+static void hci_cc_read_local_commands(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_rp_read_local_commands *reply = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, reply->status);
+
+	if (!reply->status) {
+		memcpy(hdev->commands, reply->commands, sizeof(hdev->commands));
+
+		if (test_bit(HCI_INIT, &hdev->flags) &&
+						(hdev->commands[5] & 0x10))
+			hci_setup_link_policy(hdev);
+	}
+
+	hci_req_complete(hdev, HCI_OP_READ_LOCAL_COMMANDS, reply->status);
+}
+
+static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a Read Local Features reply: on success, cache the 8
+	 * feature bytes and OR the matching packet types into
+	 * hdev->pkt_type and hdev->esco_type (bits are only ever added).
+	 */
+	struct hci_rp_read_local_features *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	memcpy(hdev->features, rp->features, 8);
+
+	/* Adjust default settings according to features
+	 * supported by device. */
+
+	if (hdev->features[0] & LMP_3SLOT)
+		hdev->pkt_type |= (HCI_DM3 | HCI_DH3);
+
+	if (hdev->features[0] & LMP_5SLOT)
+		hdev->pkt_type |= (HCI_DM5 | HCI_DH5);
+
+	if (hdev->features[1] & LMP_HV2) {
+		hdev->pkt_type  |= (HCI_HV2);
+		hdev->esco_type |= (ESCO_HV2);
+	}
+
+	if (hdev->features[1] & LMP_HV3) {
+		hdev->pkt_type  |= (HCI_HV3);
+		hdev->esco_type |= (ESCO_HV3);
+	}
+
+	if (hdev->features[3] & LMP_ESCO)
+		hdev->esco_type |= (ESCO_EV3);
+
+	if (hdev->features[4] & LMP_EV4)
+		hdev->esco_type |= (ESCO_EV4);
+
+	if (hdev->features[4] & LMP_EV5)
+		hdev->esco_type |= (ESCO_EV5);
+
+	if (hdev->features[5] & LMP_EDR_ESCO_2M)
+		hdev->esco_type |= (ESCO_2EV3);
+
+	if (hdev->features[5] & LMP_EDR_ESCO_3M)
+		hdev->esco_type |= (ESCO_3EV3);
+
+	if (hdev->features[5] & LMP_EDR_3S_ESCO)
+		hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
+
+	BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name,
+					hdev->features[0], hdev->features[1],
+					hdev->features[2], hdev->features[3],
+					hdev->features[4], hdev->features[5],
+					hdev->features[6], hdev->features[7]);
+}
+
+static void hci_cc_read_local_ext_features(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_READ_LOCAL_EXT_FEATURES: on success,
+	 * cache the 8 feature bytes in hdev->extfeatures and complete the
+	 * request.
+	 * NOTE(review): on a non-zero status this returns before
+	 * hci_req_complete(), so the request is not completed from here -
+	 * confirm that this is intended.
+	 */
+	struct hci_rp_read_local_ext_features *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	memcpy(hdev->extfeatures, rp->features, 8);
+
+	hci_req_complete(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, rp->status);
+}
+
+static void hci_cc_read_buffer_size(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a Read Buffer Size reply: on success, cache the ACL/SCO
+	 * MTUs and packet counts and reset the acl_cnt/sco_cnt counters to
+	 * the packet counts.
+	 */
+	struct hci_rp_read_buffer_size *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hdev->acl_mtu  = __le16_to_cpu(rp->acl_mtu);
+	hdev->sco_mtu  = rp->sco_mtu;	/* taken as-is, no byte swap */
+	hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt);
+	hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt);
+
+	if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) {	/* quirk: override reported SCO values */
+		hdev->sco_mtu  = 64;
+		hdev->sco_pkts = 8;
+	}
+
+	hdev->acl_cnt = hdev->acl_pkts;
+	hdev->sco_cnt = hdev->sco_pkts;
+
+	BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name,
+					hdev->acl_mtu, hdev->acl_pkts,
+					hdev->sco_mtu, hdev->sco_pkts);
+}
+
+static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_READ_BD_ADDR: on success, copy the
+	 * reported address into hdev->bdaddr; the request is completed with
+	 * the reply status in either case.
+	 */
+	struct hci_rp_read_bd_addr *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (!rp->status)
+		bacpy(&hdev->bdaddr, &rp->bdaddr);
+
+	hci_req_complete(hdev, HCI_OP_READ_BD_ADDR, rp->status);
+}
+
+/*
+ * Handle the reply to HCI_OP_WRITE_CA_TIMEOUT: trace the status byte and
+ * complete the request with it.
+ */
+static void hci_cc_write_ca_timeout(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 result = skb->data[0];
+
+	BT_DBG("%s status 0x%x", hdev->name, result);
+
+	hci_req_complete(hdev, HCI_OP_WRITE_CA_TIMEOUT, result);
+}
+
+static void hci_cc_delete_stored_link_key(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Completes HCI_OP_DELETE_STORED_LINK_KEY with the status byte at
+	 * the start of skb->data.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	hci_req_complete(hdev, HCI_OP_DELETE_STORED_LINK_KEY, status);
+}
+
+static void hci_cc_set_event_mask(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Completes HCI_OP_SET_EVENT_MASK with the status byte at the start
+	 * of skb->data.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	hci_req_complete(hdev, HCI_OP_SET_EVENT_MASK, status);
+}
+
+static void hci_cc_write_inquiry_mode(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Completes HCI_OP_WRITE_INQUIRY_MODE with the status byte at the
+	 * start of skb->data.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	hci_req_complete(hdev, HCI_OP_WRITE_INQUIRY_MODE, status);
+}
+
+static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Completes HCI_OP_READ_INQ_RSP_TX_POWER with the status byte at the
+	 * start of skb->data; nothing else in the reply is read here.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	hci_req_complete(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, status);
+}
+
+static void hci_cc_set_event_flt(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Completes HCI_OP_SET_EVENT_FLT with the status byte at the start
+	 * of skb->data.
+	 */
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	hci_req_complete(hdev, HCI_OP_SET_EVENT_FLT, status);
+}
+
+/*
+ * Handle the reply to HCI_OP_PIN_CODE_REPLY.
+ *
+ * mgmt is told the outcome when HCI_MGMT is set. On success the PIN length
+ * from the command that was sent is recorded in the matching ACL
+ * connection. The connection lookup and update run under hci_dev_lock(),
+ * like the other connection-hash lookups in this file.
+ */
+static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_rp_pin_code_reply *rp = (void *) skb->data;
+	struct hci_cp_pin_code_reply *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (test_bit(HCI_MGMT, &hdev->flags))
+		mgmt_pin_code_reply_complete(hdev->id, &rp->bdaddr, rp->status);
+
+	if (rp->status != 0)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_PIN_CODE_REPLY);
+	if (!cp)
+		return;
+
+	/* Hold the device lock while the connection hash is searched and
+	 * the connection is updated */
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+	if (conn)
+		conn->pin_length = cp->pin_len;
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cc_pin_code_neg_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a PIN Code Negative Reply result: when HCI_MGMT is set,
+	 * forward the address and status from the reply to mgmt.
+	 */
+	struct hci_rp_pin_code_neg_reply *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (test_bit(HCI_MGMT, &hdev->flags))
+		mgmt_pin_code_neg_reply_complete(hdev->id, &rp->bdaddr,
+								rp->status);
+}
+static void hci_cc_le_read_buffer_size(struct hci_dev *hdev,
+				       struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_LE_READ_BUFFER_SIZE: on success, cache
+	 * the LE MTU and packet count, reset le_cnt to the packet count and
+	 * complete the request. A non-zero status returns before
+	 * hci_req_complete() is reached.
+	 */
+	struct hci_rp_le_read_buffer_size *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hdev->le_mtu = __le16_to_cpu(rp->le_mtu);
+	hdev->le_pkts = rp->le_max_pkt;
+
+	hdev->le_cnt = hdev->le_pkts;
+
+	BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts);
+
+	hci_req_complete(hdev, HCI_OP_LE_READ_BUFFER_SIZE, rp->status);
+}
+
+static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle a User Confirmation Reply result: when HCI_MGMT is set,
+	 * forward the address and status from the reply to mgmt.
+	 */
+	struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (test_bit(HCI_MGMT, &hdev->flags))
+		mgmt_user_confirm_reply_complete(hdev->id, &rp->bdaddr,
+								rp->status);
+}
+
+static void hci_cc_user_confirm_neg_reply(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Handle a User Confirmation Negative Reply result: when HCI_MGMT is
+	 * set, forward the address and status to mgmt. The reply is parsed
+	 * with the same struct as the positive reply.
+	 */
+	struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (test_bit(HCI_MGMT, &hdev->flags))
+		mgmt_user_confirm_neg_reply_complete(hdev->id, &rp->bdaddr,
+								rp->status);
+}
+
+static void hci_cc_read_local_oob_data_reply(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Handle a Read Local OOB Data reply: pass the hash, randomizer and
+	 * status to mgmt. Unlike the neighbouring handlers, HCI_MGMT is not
+	 * tested before calling into mgmt.
+	 */
+	struct hci_rp_read_local_oob_data *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	mgmt_read_local_oob_data_reply_complete(hdev->id, rp->hash,
+						rp->randomizer, rp->status);
+}
+
+/*
+ * Handle the reply to HCI_OP_LE_SET_SCAN_ENABLE.
+ *
+ * On success, under the device lock: if the sent 'enable' value was 0x01
+ * the adv_timer is stopped and the advertising entries are cleared; if it
+ * was 0x00 the adv_timer is armed ADV_CLEAR_TIMEOUT jiffies ahead. Any
+ * other value changes nothing.
+ */
+static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
+					struct sk_buff *skb)
+{
+	struct hci_cp_le_set_scan_enable *sent;
+	__u8 result = skb->data[0];
+
+	BT_DBG("%s status 0x%x", hdev->name, result);
+
+	if (result)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
+	if (!sent)
+		return;
+
+	hci_dev_lock(hdev);
+
+	switch (sent->enable) {
+	case 0x01:
+		del_timer(&hdev->adv_timer);
+		hci_adv_entries_clear(hdev);
+		break;
+
+	case 0x00:
+		mod_timer(&hdev->adv_timer, jiffies + ADV_CLEAR_TIMEOUT);
+		break;
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_ltk_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_LE_LTK_REPLY: the request is completed
+	 * only when the status is zero; a failure is just traced.
+	 */
+	struct hci_rp_le_ltk_reply *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hci_req_complete(hdev, HCI_OP_LE_LTK_REPLY, rp->status);
+}
+
+static void hci_cc_le_ltk_neg_reply(struct hci_dev *hdev, struct sk_buff *skb)
+{	/* Handle the reply to HCI_OP_LE_LTK_NEG_REPLY: the request is
+	 * completed only when the status is zero; a failure is just traced.
+	 */
+	struct hci_rp_le_ltk_neg_reply *rp = (void *) skb->data;
+
+	BT_DBG("%s status 0x%x", hdev->name, rp->status);
+
+	if (rp->status)
+		return;
+
+	hci_req_complete(hdev, HCI_OP_LE_LTK_NEG_REPLY, rp->status);
+}
+
+static inline void hci_cc_write_le_host_supported(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{	/* Handle a Write LE Host Supported result: on success, send
+	 * HCI_OP_READ_LOCAL_EXT_FEATURES for page 0x01.
+	 */
+	struct hci_cp_read_local_ext_features cp;
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (status)
+		return;
+
+	cp.page = 0x01;
+	hci_send_cmd(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), &cp);
+}
+
+/*
+ * Handle the status of HCI_OP_INQUIRY.
+ *
+ * Zero status: set HCI_INQUIRY and, if it was not already set and HCI_MGMT
+ * is set, tell mgmt that discovery started.
+ * Non-zero status: complete the request with that status and re-check
+ * pending connections.
+ */
+static inline void hci_cs_inquiry(struct hci_dev *hdev, __u8 status)
+{
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status) {
+		if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) {
+			if (test_bit(HCI_MGMT, &hdev->flags))
+				mgmt_discovering(hdev->id, 1);
+		}
+		return;
+	}
+
+	hci_req_complete(hdev, HCI_OP_INQUIRY, status);
+	hci_conn_check_pending(hdev);
+}
+
+static inline void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
+{	/* Handle the status of HCI_OP_CREATE_CONN for the ACL connection to
+	 * the address in the sent command.
+	 * Failure: a connection in BT_CONNECT is closed, reported through
+	 * hci_proto_connect_cfm() and deleted, unless status is 0x0c and
+	 * conn->attempt is at most 2, in which case it moves to BT_CONNECT2.
+	 * Success: if no connection object exists, one is added and marked
+	 * outgoing with HCI_LM_MASTER set.
+	 */
+	struct hci_cp_create_conn *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_CONN);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+
+	BT_DBG("%s bdaddr %s conn %p", hdev->name, batostr(&cp->bdaddr), conn);
+
+	if (status) {
+		if (conn && conn->state == BT_CONNECT) {
+			if (status != 0x0c || conn->attempt > 2) {	/* 0x0c: presumably "Command Disallowed" - confirm against the HCI spec */
+				conn->state = BT_CLOSED;
+				hci_proto_connect_cfm(conn, status);
+				hci_conn_del(conn);
+			} else
+				conn->state = BT_CONNECT2;
+		}
+	} else {
+		if (!conn) {
+			conn = hci_conn_add(hdev, ACL_LINK, 0, &cp->bdaddr);
+			if (conn) {
+				conn->out = 1;
+				conn->link_mode |= HCI_LM_MASTER;
+			} else
+				BT_ERR("No memory for new connection");
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_ADD_SCO (a zero status returns at once):
+	 * find the connection for the handle that was sent and, if it has a
+	 * linked connection (acl->link), close that one, report the failure
+	 * through hci_proto_connect_cfm() and delete it.
+	 */
+	struct hci_cp_add_sco *cp;
+	struct hci_conn *acl, *sco;
+	__u16 handle;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_ADD_SCO);
+	if (!cp)
+		return;
+
+	handle = __le16_to_cpu(cp->handle);
+
+	BT_DBG("%s handle %d", hdev->name, handle);
+
+	hci_dev_lock(hdev);
+
+	acl = hci_conn_hash_lookup_handle(hdev, handle);
+	if (acl) {
+		sco = acl->link;
+		if (sco) {
+			sco->state = BT_CLOSED;
+
+			hci_proto_connect_cfm(sco, status);
+			hci_conn_del(sco);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Handle a failed HCI_OP_AUTH_REQUESTED (a zero status returns at once).
+ *
+ * If the connection for the handle that was sent is still in BT_CONFIG,
+ * the failure is reported through hci_proto_connect_cfm() and a reference
+ * on the connection is dropped.
+ */
+static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status)
+{
+	struct hci_cp_auth_requested *sent;
+	struct hci_conn *link;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	sent = hci_sent_cmd_data(hdev, HCI_OP_AUTH_REQUESTED);
+	if (!sent)
+		return;
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(sent->handle));
+	if (link && link->state == BT_CONFIG) {
+		hci_proto_connect_cfm(link, status);
+		hci_conn_put(link);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_SET_CONN_ENCRYPT (a zero status returns at
+	 * once): if the connection for the sent handle is in BT_CONFIG,
+	 * report the failure through hci_proto_connect_cfm() and drop a
+	 * reference with hci_conn_put().
+	 */
+	struct hci_cp_set_conn_encrypt *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_SET_CONN_ENCRYPT);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (conn) {
+		if (conn->state == BT_CONFIG) {
+			hci_proto_connect_cfm(conn, status);
+			hci_conn_put(conn);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Decide whether authentication should be requested for a connection.
+ *
+ * Returns 1 only for an outgoing connection in BT_CONFIG whose pending
+ * security level is not BT_SECURITY_SDP and for which either both sides
+ * have SSP enabled or the pending level is BT_SECURITY_HIGH; 0 otherwise.
+ */
+static int hci_outgoing_auth_needed(struct hci_dev *hdev,
+							struct hci_conn *conn)
+{
+	int both_ssp;
+
+	if (conn->state != BT_CONFIG)
+		return 0;
+
+	if (!conn->out)
+		return 0;
+
+	if (conn->pending_sec_level == BT_SECURITY_SDP)
+		return 0;
+
+	/* Only request authentication for SSP connections or non-SSP
+	 * devices with sec_level HIGH */
+	both_ssp = hdev->ssp_mode > 0 && conn->ssp_mode > 0;
+
+	return both_ssp || conn->pending_sec_level == BT_SECURITY_HIGH;
+}
+
+/*
+ * Handle a failed HCI_OP_REMOTE_NAME_REQ.
+ *
+ * When the name request could not be started there will be no completion
+ * event to trigger authentication, so it is decided here: if the ACL
+ * connection to the requested address needs outgoing authentication and
+ * none is pending yet, HCI_OP_AUTH_REQUESTED is sent for it.
+ */
+static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status)
+{
+	struct hci_cp_remote_name_req *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	/* If successful wait for the name req complete event before
+	 * checking for the need to do authentication */
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_REMOTE_NAME_REQ);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+	if (!conn)
+		goto unlock;
+
+	if (!hci_outgoing_auth_needed(hdev, conn))
+		goto unlock;
+
+	if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
+		/* Named differently from the outer 'cp' so the sent-command
+		 * pointer is not shadowed */
+		struct hci_cp_auth_requested auth_cp;
+
+		auth_cp.handle = __cpu_to_le16(conn->handle);
+		hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(auth_cp),
+								&auth_cp);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_READ_REMOTE_FEATURES (a zero status returns
+	 * at once): if the connection for the sent handle is in BT_CONFIG,
+	 * report the failure through hci_proto_connect_cfm() and drop a
+	 * reference with hci_conn_put().
+	 */
+	struct hci_cp_read_remote_features *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_FEATURES);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (conn) {
+		if (conn->state == BT_CONFIG) {
+			hci_proto_connect_cfm(conn, status);
+			hci_conn_put(conn);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_READ_REMOTE_EXT_FEATURES (a zero status
+	 * returns at once): if the connection for the sent handle is in
+	 * BT_CONFIG, report the failure through hci_proto_connect_cfm() and
+	 * drop a reference with hci_conn_put().
+	 */
+	struct hci_cp_read_remote_ext_features *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (conn) {
+		if (conn->state == BT_CONFIG) {
+			hci_proto_connect_cfm(conn, status);
+			hci_conn_put(conn);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_SETUP_SYNC_CONN (a zero status returns at
+	 * once): find the connection for the handle that was sent and, if it
+	 * has a linked connection (acl->link), close that one, report the
+	 * failure through hci_proto_connect_cfm() and delete it.
+	 */
+	struct hci_cp_setup_sync_conn *cp;
+	struct hci_conn *acl, *sco;
+	__u16 handle;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
+	if (!cp)
+		return;
+
+	handle = __le16_to_cpu(cp->handle);
+
+	BT_DBG("%s handle %d", hdev->name, handle);
+
+	hci_dev_lock(hdev);
+
+	acl = hci_conn_hash_lookup_handle(hdev, handle);
+	if (acl) {
+		sco = acl->link;
+		if (sco) {
+			sco->state = BT_CLOSED;
+
+			hci_proto_connect_cfm(sco, status);
+			hci_conn_del(sco);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_SNIFF_MODE (a zero status returns at once):
+	 * for the connection with the sent handle, clear
+	 * HCI_CONN_MODE_CHANGE_PEND and, if HCI_CONN_SCO_SETUP_PEND was set,
+	 * clear it and call hci_sco_setup() with the failure status.
+	 */
+	struct hci_cp_sniff_mode *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_SNIFF_MODE);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (conn) {
+		clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend);
+
+		if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend))
+			hci_sco_setup(conn, status);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status)
+{	/* Handle a failed HCI_OP_EXIT_SNIFF_MODE (a zero status returns at
+	 * once): for the connection with the sent handle, clear
+	 * HCI_CONN_MODE_CHANGE_PEND and, if HCI_CONN_SCO_SETUP_PEND was set,
+	 * clear it and call hci_sco_setup() with the failure status.
+	 */
+	struct hci_cp_exit_sniff_mode *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	if (!status)
+		return;
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_EXIT_SNIFF_MODE);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+	if (conn) {
+		clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend);
+
+		if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend))
+			hci_sco_setup(conn, status);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_le_create_conn(struct hci_dev *hdev, __u8 status)
+{	/* Handle the status of HCI_OP_LE_CREATE_CONN for the LE connection
+	 * to the peer address in the sent command.
+	 * Failure: a connection in BT_CONNECT is closed, reported through
+	 * hci_proto_connect_cfm() and deleted.
+	 * Success: if no connection object exists, one is added, marked
+	 * outgoing, and given the peer address type from the command.
+	 */
+	struct hci_cp_le_create_conn *cp;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status 0x%x", hdev->name, status);
+
+	cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CONN);
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->peer_addr);
+
+	BT_DBG("%s bdaddr %s conn %p", hdev->name, batostr(&cp->peer_addr),
+		conn);
+
+	if (status) {
+		if (conn && conn->state == BT_CONNECT) {
+			conn->state = BT_CLOSED;
+			hci_proto_connect_cfm(conn, status);
+			hci_conn_del(conn);
+		}
+	} else {
+		if (!conn) {
+			conn = hci_conn_add(hdev, LE_LINK, 0, &cp->peer_addr);
+			if (conn) {
+				conn->dst_type = cp->peer_addr_type;
+				conn->out = 1;
+			} else {
+				BT_ERR("No memory for new connection");
+			}
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+/*
+ * Command Status handler for HCI_OP_LE_START_ENC: only logs the status
+ * through BT_DBG and takes no further action.
+ */
+static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status)
+{
+	BT_DBG("%s status 0x%x", hdev->name, status);
+}
+
+/*
+ * Inquiry Complete event handler.
+ *
+ * The first byte of the event payload is the status.  Clears the
+ * HCI_INQUIRY flag (telling mgmt that discovery stopped when HCI_MGMT is
+ * set and the flag was previously set), completes the pending
+ * HCI_OP_INQUIRY request and then calls hci_conn_check_pending().
+ */
+static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 status = ((__u8 *) skb->data)[0];
+
+	BT_DBG("%s status %d", hdev->name, status);
+
+	if (test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) {
+		if (test_bit(HCI_MGMT, &hdev->flags))
+			mgmt_discovering(hdev->id, 0);
+	}
+
+	hci_req_complete(hdev, HCI_OP_INQUIRY, status);
+
+	hci_conn_check_pending(hdev);
+}
+/*
+ * Inquiry Result event handler.
+ *
+ * The first payload byte is the number of responses; it is followed by
+ * that many struct inquiry_info records.  Each record is copied into a
+ * struct inquiry_data, stored with hci_inquiry_cache_update() and reported
+ * with mgmt_device_found().  If HCI_INQUIRY was not yet set it is set here
+ * and, when HCI_MGMT is set, mgmt is told that discovery started.
+ *
+ * NOTE(review): num_rsp is not checked against skb->len in this function;
+ * confirm the packet length is validated elsewhere.
+ */
+static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct inquiry_data data;
+	struct inquiry_info *info = (void *) (skb->data + 1); /* records follow the count byte */
+	int num_rsp = *((__u8 *) skb->data);
+
+	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+	if (!num_rsp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) {
+
+		if (test_bit(HCI_MGMT, &hdev->flags))
+			mgmt_discovering(hdev->id, 1);
+	}
+
+	for (; num_rsp; num_rsp--, info++) {
+		bacpy(&data.bdaddr, &info->bdaddr);
+		data.pscan_rep_mode	= info->pscan_rep_mode;
+		data.pscan_period_mode	= info->pscan_period_mode;
+		data.pscan_mode		= info->pscan_mode;
+		memcpy(data.dev_class, info->dev_class, 3);
+		data.clock_offset	= info->clock_offset;
+		data.rssi		= 0x00; /* nothing is read from the record for rssi */
+		data.ssp_mode		= 0x00;
+		hci_inquiry_cache_update(hdev, &data);
+		mgmt_device_found(hdev->id, &info->bdaddr, info->dev_class, 0,
+									NULL);
+	}
+
+	hci_dev_unlock(hdev);
+}
+/*
+ * Connection Complete event handler.
+ *
+ * Looks the connection up by link type and peer address.  When a SCO_LINK
+ * event has no match, the lookup is retried as ESCO_LINK and the found
+ * connection is re-typed to SCO_LINK.
+ *
+ * On success the handle is stored, ACL links move to BT_CONFIG (other
+ * links go straight to BT_CONNECTED), the link mode inherits the device's
+ * HCI_AUTH / HCI_ENCRYPT flags, and follow-up commands are sent (read
+ * remote features for ACL; change packet type for incoming links when
+ * hdev->hci_ver < 3).  On failure the connection is marked BT_CLOSED,
+ * reported via hci_proto_connect_cfm() and deleted.
+ *
+ * hci_conn_check_pending() is always called after the lock is dropped.
+ */
+static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_conn_complete *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+	if (!conn) {
+		if (ev->link_type != SCO_LINK)
+			goto unlock;
+
+		conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); /* retry as eSCO */
+		if (!conn)
+			goto unlock;
+
+		conn->type = SCO_LINK;
+	}
+
+	if (!ev->status) {
+		conn->handle = __le16_to_cpu(ev->handle);
+
+		if (conn->type == ACL_LINK) {
+			conn->state = BT_CONFIG;
+			hci_conn_hold(conn); /* NOTE(review): matching put appears to happen when BT_CONFIG is left - confirm */
+			conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+			mgmt_connected(hdev->id, &ev->bdaddr);
+		} else
+			conn->state = BT_CONNECTED;
+
+		hci_conn_hold_device(conn);
+		hci_conn_add_sysfs(conn);
+
+		if (test_bit(HCI_AUTH, &hdev->flags))
+			conn->link_mode |= HCI_LM_AUTH;
+
+		if (test_bit(HCI_ENCRYPT, &hdev->flags))
+			conn->link_mode |= HCI_LM_ENCRYPT;
+
+		/* Get remote features */
+		if (conn->type == ACL_LINK) {
+			struct hci_cp_read_remote_features cp;
+			cp.handle = ev->handle; /* copied as received, no byte-order conversion */
+			hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES,
+							sizeof(cp), &cp);
+		}
+
+		/* Set packet type for incoming connection */
+		if (!conn->out && hdev->hci_ver < 3) {
+			struct hci_cp_change_conn_ptype cp;
+			cp.handle = ev->handle;
+			cp.pkt_type = cpu_to_le16(conn->pkt_type);
+			hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE,
+							sizeof(cp), &cp);
+		}
+	} else {
+		conn->state = BT_CLOSED;
+		if (conn->type == ACL_LINK)
+			mgmt_connect_failed(hdev->id, &ev->bdaddr, ev->status);
+	}
+
+	if (conn->type == ACL_LINK)
+		hci_sco_setup(conn, ev->status);
+
+	if (ev->status) {
+		hci_proto_connect_cfm(conn, ev->status);
+		hci_conn_del(conn); /* conn must not be used after this point */
+	} else if (ev->link_type != ACL_LINK)
+		hci_proto_connect_cfm(conn, ev->status); /* successful ACL links are not confirmed here */
+
+unlock:
+	hci_dev_unlock(hdev);
+
+	hci_conn_check_pending(hdev);
+}
+
+/*
+ * Return true when the device has at least one SCO or eSCO connection in
+ * the BT_CONNECTED state, false otherwise.
+ */
+static inline bool is_sco_active(struct hci_dev *hdev)
+{
+	if (hci_conn_hash_lookup_state(hdev, SCO_LINK, BT_CONNECTED))
+		return true;
+
+	if (hci_conn_hash_lookup_state(hdev, ESCO_LINK, BT_CONNECTED))
+		return true;
+
+	return false;
+}
+/*
+ * Connection Request event handler.
+ *
+ * The accept mask is hdev->link_mode OR-ed with the result of
+ * hci_proto_connect_ind().  When HCI_LM_ACCEPT is set and the peer is not
+ * blacklisted, a connection object is found or created, put into
+ * BT_CONNECT, and either HCI_OP_ACCEPT_CONN_REQ (ACL links, or any link
+ * when the controller is not eSCO capable) or HCI_OP_ACCEPT_SYNC_CONN_REQ
+ * is sent.  Otherwise HCI_OP_REJECT_CONN_REQ is sent with reason 0x0f.
+ */
+static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_conn_request *ev = (void *) skb->data;
+	int mask = hdev->link_mode;
+
+	BT_DBG("%s bdaddr %s type 0x%x", hdev->name,
+					batostr(&ev->bdaddr), ev->link_type);
+
+	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type);
+
+	if ((mask & HCI_LM_ACCEPT) &&
+			!hci_blacklist_lookup(hdev, &ev->bdaddr)) {
+		/* Connection accepted */
+		struct inquiry_entry *ie;
+		struct hci_conn *conn;
+
+		hci_dev_lock(hdev);
+
+		ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+		if (ie)
+			memcpy(ie->data.dev_class, ev->dev_class, 3); /* refresh cached class of device */
+
+		conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+		if (!conn) {
+			/* pkt_type not yet used for incoming connections */
+			conn = hci_conn_add(hdev, ev->link_type, 0, &ev->bdaddr);
+			if (!conn) {
+				BT_ERR("No memory for new connection");
+				hci_dev_unlock(hdev);
+				return; /* NOTE(review): no accept or reject is sent on this path */
+			}
+		}
+
+		memcpy(conn->dev_class, ev->dev_class, 3);
+		conn->state = BT_CONNECT;
+
+		hci_dev_unlock(hdev);
+
+		if (ev->link_type == ACL_LINK || !lmp_esco_capable(hdev)) {
+			struct hci_cp_accept_conn_req cp;
+
+			bacpy(&cp.bdaddr, &ev->bdaddr);
+
+			if (lmp_rswitch_capable(hdev) && ((mask & HCI_LM_MASTER)
+						|| is_sco_active(hdev)))
+				cp.role = 0x00; /* Become master */
+			else
+				cp.role = 0x01; /* Remain slave */
+
+			hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ,
+							sizeof(cp), &cp);
+		} else {
+			struct hci_cp_accept_sync_conn_req cp;
+
+			bacpy(&cp.bdaddr, &ev->bdaddr);
+			cp.pkt_type = cpu_to_le16(conn->pkt_type); /* conn is read here after the lock was dropped */
+
+			cp.tx_bandwidth   = cpu_to_le32(0x00001f40);
+			cp.rx_bandwidth   = cpu_to_le32(0x00001f40);
+			cp.max_latency    = cpu_to_le16(0xffff);
+			cp.content_format = cpu_to_le16(hdev->voice_setting);
+			cp.retrans_effort = 0xff;
+
+			hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ,
+							sizeof(cp), &cp);
+		}
+	} else {
+		/* Connection rejected */
+		struct hci_cp_reject_conn_req cp;
+
+		bacpy(&cp.bdaddr, &ev->bdaddr);
+		cp.reason = 0x0f;
+		hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp);
+	}
+}
+
+/*
+ * Disconnection Complete event handler.
+ *
+ * A non-zero status is reported with mgmt_disconnect_failed() and nothing
+ * else happens.  Otherwise the connection for the event's handle is marked
+ * BT_CLOSED, mgmt is told about the disconnect for ACL and LE links, the
+ * protocol layer is notified with the reason and the connection is
+ * deleted.
+ */
+static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_disconn_complete *ev = (void *) skb->data;
+	struct hci_conn *link;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	if (ev->status != 0) {
+		mgmt_disconnect_failed(hdev->id);
+		return;
+	}
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (link) {
+		link->state = BT_CLOSED;
+
+		switch (link->type) {
+		case ACL_LINK:
+		case LE_LINK:
+			mgmt_disconnected(hdev->id, &link->dst);
+			break;
+		default:
+			break;
+		}
+
+		hci_proto_disconn_cfm(link, ev->reason);
+		hci_conn_del(link);
+	}
+
+	hci_dev_unlock(hdev);
+}
+/*
+ * Authentication Complete event handler.
+ *
+ * On success the link gains HCI_LM_AUTH and its pending security level
+ * becomes current, except when a re-authentication was pending and SSP is
+ * not enabled on both the connection and the device.  On failure mgmt is
+ * notified.  The AUTH and REAUTH pending bits are always cleared.
+ *
+ * In BT_CONFIG the handler either requests encryption (success with SSP on
+ * both sides) or finishes the connection setup; in other states it calls
+ * hci_auth_cfm() and refreshes the disconnect timeout.  A pending
+ * encryption request is then started on success or failed on error.
+ */
+static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_auth_complete *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!conn)
+		goto unlock;
+
+	if (!ev->status) {
+		if (!(conn->ssp_mode > 0 && hdev->ssp_mode > 0) &&
+				test_bit(HCI_CONN_REAUTH_PEND,	&conn->pend)) {
+			BT_INFO("re-auth of legacy device is not possible.");
+		} else {
+			conn->link_mode |= HCI_LM_AUTH;
+			conn->sec_level = conn->pending_sec_level;
+		}
+	} else {
+		mgmt_auth_failed(hdev->id, &conn->dst, ev->status);
+	}
+
+	clear_bit(HCI_CONN_AUTH_PEND, &conn->pend);
+	clear_bit(HCI_CONN_REAUTH_PEND, &conn->pend);
+
+	if (conn->state == BT_CONFIG) {
+		if (!ev->status && hdev->ssp_mode > 0 && conn->ssp_mode > 0) {
+			struct hci_cp_set_conn_encrypt cp;
+			cp.handle  = ev->handle;
+			cp.encrypt = 0x01;
+			hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+									&cp);
+		} else {
+			conn->state = BT_CONNECTED; /* set even when ev->status is non-zero */
+			hci_proto_connect_cfm(conn, ev->status);
+			hci_conn_put(conn);
+		}
+	} else {
+		hci_auth_cfm(conn, ev->status);
+
+		hci_conn_hold(conn); /* hold/put pair around the timeout update */
+		conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+		hci_conn_put(conn);
+	}
+
+	if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) {
+		if (!ev->status) {
+			struct hci_cp_set_conn_encrypt cp;
+			cp.handle  = ev->handle;
+			cp.encrypt = 0x01;
+			hci_send_cmd(hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp),
+									&cp);
+		} else {
+			clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend);
+			hci_encrypt_cfm(conn, ev->status, 0x00);
+		}
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Remote Name Request Complete event handler.
+ *
+ * Runs hci_conn_check_pending() first, then, under the device lock,
+ * forwards a successfully read name to mgmt (when HCI_MGMT is set) and
+ * requests authentication on the peer's ACL connection if
+ * hci_outgoing_auth_needed() says so and no authentication is pending yet.
+ */
+static inline void hci_remote_name_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_remote_name *ev = (void *) skb->data;
+	struct hci_conn *acl;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_conn_check_pending(hdev);
+
+	hci_dev_lock(hdev);
+
+	if (ev->status == 0 && test_bit(HCI_MGMT, &hdev->flags))
+		mgmt_remote_name(hdev->id, &ev->bdaddr, ev->name);
+
+	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+
+	/* Short-circuit order matters: the pending bit is only set when
+	 * authentication is actually needed. */
+	if (acl && hci_outgoing_auth_needed(hdev, acl) &&
+			!test_and_set_bit(HCI_CONN_AUTH_PEND, &acl->pend)) {
+		struct hci_cp_auth_requested req;
+
+		req.handle = __cpu_to_le16(acl->handle);
+		hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(req), &req);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Encryption Change event handler.
+ *
+ * On success the link mode is updated (encryption on also sets
+ * HCI_LM_AUTH and promotes the pending security level; encryption off
+ * clears HCI_LM_ENCRYPT).  The encrypt-pending bit is always cleared.  A
+ * connection still in BT_CONFIG is completed (moving to BT_CONNECTED only
+ * on success); otherwise hci_encrypt_cfm() is called.
+ */
+static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_encrypt_change *ev = (void *) skb->data;
+	struct hci_conn *link;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!link)
+		goto unlock;
+
+	if (ev->status == 0 && ev->encrypt) {
+		/* Encryption implies authentication */
+		link->link_mode |= HCI_LM_AUTH;
+		link->link_mode |= HCI_LM_ENCRYPT;
+		link->sec_level = link->pending_sec_level;
+	} else if (ev->status == 0) {
+		link->link_mode &= ~HCI_LM_ENCRYPT;
+	}
+
+	clear_bit(HCI_CONN_ENCRYPT_PEND, &link->pend);
+
+	if (link->state != BT_CONFIG) {
+		hci_encrypt_cfm(link, ev->status, ev->encrypt);
+		goto unlock;
+	}
+
+	if (ev->status == 0)
+		link->state = BT_CONNECTED;
+
+	hci_proto_connect_cfm(link, ev->status);
+	hci_conn_put(link);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Change Connection Link Key Complete event handler.
+ *
+ * For a known connection: sets HCI_LM_SECURE on success, clears the
+ * auth-pending bit and passes the status to hci_key_change_cfm().
+ */
+static inline void hci_change_link_key_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_change_link_key_complete *ev = (void *) skb->data;
+	struct hci_conn *link;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!link)
+		goto unlock;
+
+	if (ev->status == 0)
+		link->link_mode |= HCI_LM_SECURE;
+
+	clear_bit(HCI_CONN_AUTH_PEND, &link->pend);
+
+	hci_key_change_cfm(link, ev->status);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+/*
+ * Read Remote Supported Features Complete event handler.
+ *
+ * On success the 8 feature bytes are stored in conn->features.  Further
+ * processing only happens while the connection is in BT_CONFIG: if both
+ * sides are SSP capable the extended features (page 1) are requested and
+ * the handler returns; otherwise, on success, a remote name request is
+ * sent, and the connection is completed right away when
+ * hci_outgoing_auth_needed() reports that no authentication is needed.
+ */
+static inline void hci_remote_features_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_remote_features *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!conn)
+		goto unlock;
+
+	if (!ev->status)
+		memcpy(conn->features, ev->features, 8);
+
+	if (conn->state != BT_CONFIG)
+		goto unlock;
+
+	if (!ev->status && lmp_ssp_capable(hdev) && lmp_ssp_capable(conn)) {
+		struct hci_cp_read_remote_ext_features cp;
+		cp.handle = ev->handle;
+		cp.page = 0x01;
+		hci_send_cmd(hdev, HCI_OP_READ_REMOTE_EXT_FEATURES,
+							sizeof(cp), &cp);
+		goto unlock; /* setup continues when the extended features arrive */
+	}
+
+	if (!ev->status) {
+		struct hci_cp_remote_name_req cp;
+		memset(&cp, 0, sizeof(cp));
+		bacpy(&cp.bdaddr, &conn->dst);
+		cp.pscan_rep_mode = 0x02;
+		hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+	}
+
+	if (!hci_outgoing_auth_needed(hdev, conn)) {
+		conn->state = BT_CONNECTED; /* reached for failed status as well */
+		hci_proto_connect_cfm(conn, ev->status);
+		hci_conn_put(conn);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+/*
+ * Remote version event handler: placeholder that only emits a debug
+ * message; the event payload in skb is not examined.
+ */
+static inline void hci_remote_version_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	BT_DBG("%s", hdev->name);
+}
+/*
+ * QoS Setup Complete event handler: placeholder that only emits a debug
+ * message; the event payload in skb is not examined.
+ */
+static inline void hci_qos_setup_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	BT_DBG("%s", hdev->name);
+}
+/*
+ * Command Complete event handler.
+ *
+ * Strips the event header from skb and dispatches on the opcode to the
+ * matching hci_cc_*() handler, which therefore sees only the command's
+ * return parameters.  Unknown opcodes are just logged.
+ *
+ * Afterwards the command timer is stopped (unless the opcode is
+ * HCI_OP_NOP) and, if the controller reports ncmd != 0, the command
+ * counter is set to 1 and the command tasklet is scheduled when commands
+ * are queued.
+ */
+static inline void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_cmd_complete *ev = (void *) skb->data;
+	__u16 opcode;
+
+	skb_pull(skb, sizeof(*ev)); /* ev still points at the header just pulled */
+
+	opcode = __le16_to_cpu(ev->opcode);
+
+	switch (opcode) {
+	case HCI_OP_INQUIRY_CANCEL:
+		hci_cc_inquiry_cancel(hdev, skb);
+		break;
+
+	case HCI_OP_EXIT_PERIODIC_INQ:
+		hci_cc_exit_periodic_inq(hdev, skb);
+		break;
+
+	case HCI_OP_REMOTE_NAME_REQ_CANCEL:
+		hci_cc_remote_name_req_cancel(hdev, skb);
+		break;
+
+	case HCI_OP_ROLE_DISCOVERY:
+		hci_cc_role_discovery(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LINK_POLICY:
+		hci_cc_read_link_policy(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_LINK_POLICY:
+		hci_cc_write_link_policy(hdev, skb);
+		break;
+
+	case HCI_OP_READ_DEF_LINK_POLICY:
+		hci_cc_read_def_link_policy(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_DEF_LINK_POLICY:
+		hci_cc_write_def_link_policy(hdev, skb);
+		break;
+
+	case HCI_OP_RESET:
+		hci_cc_reset(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_LOCAL_NAME:
+		hci_cc_write_local_name(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LOCAL_NAME:
+		hci_cc_read_local_name(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_AUTH_ENABLE:
+		hci_cc_write_auth_enable(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_ENCRYPT_MODE:
+		hci_cc_write_encrypt_mode(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_SCAN_ENABLE:
+		hci_cc_write_scan_enable(hdev, skb);
+		break;
+
+	case HCI_OP_READ_CLASS_OF_DEV:
+		hci_cc_read_class_of_dev(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_CLASS_OF_DEV:
+		hci_cc_write_class_of_dev(hdev, skb);
+		break;
+
+	case HCI_OP_READ_VOICE_SETTING:
+		hci_cc_read_voice_setting(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_VOICE_SETTING:
+		hci_cc_write_voice_setting(hdev, skb);
+		break;
+
+	case HCI_OP_HOST_BUFFER_SIZE:
+		hci_cc_host_buffer_size(hdev, skb);
+		break;
+
+	case HCI_OP_READ_SSP_MODE:
+		hci_cc_read_ssp_mode(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_SSP_MODE:
+		hci_cc_write_ssp_mode(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LOCAL_VERSION:
+		hci_cc_read_local_version(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LOCAL_COMMANDS:
+		hci_cc_read_local_commands(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LOCAL_FEATURES:
+		hci_cc_read_local_features(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LOCAL_EXT_FEATURES:
+		hci_cc_read_local_ext_features(hdev, skb);
+		break;
+
+	case HCI_OP_READ_BUFFER_SIZE:
+		hci_cc_read_buffer_size(hdev, skb);
+		break;
+
+	case HCI_OP_READ_BD_ADDR:
+		hci_cc_read_bd_addr(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_CA_TIMEOUT:
+		hci_cc_write_ca_timeout(hdev, skb);
+		break;
+
+	case HCI_OP_DELETE_STORED_LINK_KEY:
+		hci_cc_delete_stored_link_key(hdev, skb);
+		break;
+
+	case HCI_OP_SET_EVENT_MASK:
+		hci_cc_set_event_mask(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_INQUIRY_MODE:
+		hci_cc_write_inquiry_mode(hdev, skb);
+		break;
+
+	case HCI_OP_READ_INQ_RSP_TX_POWER:
+		hci_cc_read_inq_rsp_tx_power(hdev, skb);
+		break;
+
+	case HCI_OP_SET_EVENT_FLT:
+		hci_cc_set_event_flt(hdev, skb);
+		break;
+
+	case HCI_OP_PIN_CODE_REPLY:
+		hci_cc_pin_code_reply(hdev, skb);
+		break;
+
+	case HCI_OP_PIN_CODE_NEG_REPLY:
+		hci_cc_pin_code_neg_reply(hdev, skb);
+		break;
+
+	case HCI_OP_READ_LOCAL_OOB_DATA:
+		hci_cc_read_local_oob_data_reply(hdev, skb);
+		break;
+
+	case HCI_OP_LE_READ_BUFFER_SIZE:
+		hci_cc_le_read_buffer_size(hdev, skb);
+		break;
+
+	case HCI_OP_USER_CONFIRM_REPLY:
+		hci_cc_user_confirm_reply(hdev, skb);
+		break;
+
+	case HCI_OP_USER_CONFIRM_NEG_REPLY:
+		hci_cc_user_confirm_neg_reply(hdev, skb);
+		break;
+
+	case HCI_OP_LE_SET_SCAN_ENABLE:
+		hci_cc_le_set_scan_enable(hdev, skb);
+		break;
+
+	case HCI_OP_LE_LTK_REPLY:
+		hci_cc_le_ltk_reply(hdev, skb);
+		break;
+
+	case HCI_OP_LE_LTK_NEG_REPLY:
+		hci_cc_le_ltk_neg_reply(hdev, skb);
+		break;
+
+	case HCI_OP_WRITE_LE_HOST_SUPPORTED:
+		hci_cc_write_le_host_supported(hdev, skb);
+		break;
+
+	default:
+		BT_DBG("%s opcode 0x%x", hdev->name, opcode);
+		break;
+	}
+
+	if (ev->opcode != HCI_OP_NOP) /* raw wire-order opcode is compared here, not the converted one */
+		del_timer(&hdev->cmd_timer);
+
+	if (ev->ncmd) {
+		atomic_set(&hdev->cmd_cnt, 1);
+		if (!skb_queue_empty(&hdev->cmd_q))
+			tasklet_schedule(&hdev->cmd_task);
+	}
+}
+/*
+ * Command Status event handler.
+ *
+ * Strips the event header from skb and dispatches on the opcode to the
+ * matching hci_cs_*() handler, passing only the status byte.  A failed
+ * HCI_OP_DISCONNECT is reported to mgmt directly.  Unknown opcodes are
+ * just logged.
+ *
+ * Afterwards the command timer is stopped (unless the opcode is
+ * HCI_OP_NOP) and, if ncmd != 0 and no HCI_RESET is in progress, the
+ * command counter is set to 1 and the command tasklet is scheduled when
+ * commands are queued.
+ */
+static inline void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_cmd_status *ev = (void *) skb->data;
+	__u16 opcode;
+
+	skb_pull(skb, sizeof(*ev)); /* ev still points at the header just pulled */
+
+	opcode = __le16_to_cpu(ev->opcode);
+
+	switch (opcode) {
+	case HCI_OP_INQUIRY:
+		hci_cs_inquiry(hdev, ev->status);
+		break;
+
+	case HCI_OP_CREATE_CONN:
+		hci_cs_create_conn(hdev, ev->status);
+		break;
+
+	case HCI_OP_ADD_SCO:
+		hci_cs_add_sco(hdev, ev->status);
+		break;
+
+	case HCI_OP_AUTH_REQUESTED:
+		hci_cs_auth_requested(hdev, ev->status);
+		break;
+
+	case HCI_OP_SET_CONN_ENCRYPT:
+		hci_cs_set_conn_encrypt(hdev, ev->status);
+		break;
+
+	case HCI_OP_REMOTE_NAME_REQ:
+		hci_cs_remote_name_req(hdev, ev->status);
+		break;
+
+	case HCI_OP_READ_REMOTE_FEATURES:
+		hci_cs_read_remote_features(hdev, ev->status);
+		break;
+
+	case HCI_OP_READ_REMOTE_EXT_FEATURES:
+		hci_cs_read_remote_ext_features(hdev, ev->status);
+		break;
+
+	case HCI_OP_SETUP_SYNC_CONN:
+		hci_cs_setup_sync_conn(hdev, ev->status);
+		break;
+
+	case HCI_OP_SNIFF_MODE:
+		hci_cs_sniff_mode(hdev, ev->status);
+		break;
+
+	case HCI_OP_EXIT_SNIFF_MODE:
+		hci_cs_exit_sniff_mode(hdev, ev->status);
+		break;
+
+	case HCI_OP_DISCONNECT:
+		if (ev->status != 0)
+			mgmt_disconnect_failed(hdev->id);
+		break;
+
+	case HCI_OP_LE_CREATE_CONN:
+		hci_cs_le_create_conn(hdev, ev->status);
+		break;
+
+	case HCI_OP_LE_START_ENC:
+		hci_cs_le_start_enc(hdev, ev->status);
+		break;
+
+	default:
+		BT_DBG("%s opcode 0x%x", hdev->name, opcode);
+		break;
+	}
+
+	if (ev->opcode != HCI_OP_NOP) /* raw wire-order opcode is compared here, not the converted one */
+		del_timer(&hdev->cmd_timer);
+
+	if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags)) {
+		atomic_set(&hdev->cmd_cnt, 1);
+		if (!skb_queue_empty(&hdev->cmd_q))
+			tasklet_schedule(&hdev->cmd_task);
+	}
+}
+
+/*
+ * Role Change event handler.
+ *
+ * For a known ACL connection: on success HCI_LM_MASTER is set when the
+ * reported role is zero and cleared otherwise.  The role-switch-pending
+ * bit is always cleared and hci_role_switch_cfm() is called with the
+ * status and role.
+ */
+static inline void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_role_change *ev = (void *) skb->data;
+	struct hci_conn *acl;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+	if (!acl)
+		goto unlock;
+
+	if (ev->status == 0) {
+		if (!ev->role)
+			acl->link_mode |= HCI_LM_MASTER;
+		else
+			acl->link_mode &= ~HCI_LM_MASTER;
+	}
+
+	clear_bit(HCI_CONN_RSWITCH_PEND, &acl->pend);
+
+	hci_role_switch_cfm(acl, ev->status, ev->role);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+/*
+ * Number Of Completed Packets event handler.
+ *
+ * After the header, the payload holds ev->num_hndl entries, each read as
+ * two little-endian 16-bit values: a connection handle and a packet
+ * count.  The event is dropped when the payload is shorter than
+ * num_hndl * 4 bytes.  For every known handle the count is subtracted
+ * from conn->sent and added back to the matching device credit counter
+ * (ACL, LE - falling back to the ACL counter when le_pkts is zero - or
+ * SCO), capped at the corresponding *_pkts limit.
+ *
+ * The TX tasklet is disabled while the counters are updated and scheduled
+ * afterwards.
+ */
+static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_num_comp_pkts *ev = (void *) skb->data;
+	__le16 *ptr;
+	int i;
+
+	skb_pull(skb, sizeof(*ev));
+
+	BT_DBG("%s num_hndl %d", hdev->name, ev->num_hndl);
+
+	if (skb->len < ev->num_hndl * 4) { /* 4 bytes per (handle, count) entry */
+		BT_DBG("%s bad parameters", hdev->name);
+		return;
+	}
+
+	tasklet_disable(&hdev->tx_task);
+
+	for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) {
+		struct hci_conn *conn;
+		__u16  handle, count;
+
+		handle = get_unaligned_le16(ptr++);
+		count  = get_unaligned_le16(ptr++);
+
+		conn = hci_conn_hash_lookup_handle(hdev, handle);
+		if (conn) {
+			conn->sent -= count;
+
+			if (conn->type == ACL_LINK) {
+				hdev->acl_cnt += count;
+				if (hdev->acl_cnt > hdev->acl_pkts)
+					hdev->acl_cnt = hdev->acl_pkts;
+			} else if (conn->type == LE_LINK) {
+				if (hdev->le_pkts) {
+					hdev->le_cnt += count;
+					if (hdev->le_cnt > hdev->le_pkts)
+						hdev->le_cnt = hdev->le_pkts;
+				} else { /* le_pkts is zero: LE credits go to the ACL counter */
+					hdev->acl_cnt += count;
+					if (hdev->acl_cnt > hdev->acl_pkts)
+						hdev->acl_cnt = hdev->acl_pkts;
+				}
+			} else {
+				hdev->sco_cnt += count;
+				if (hdev->sco_cnt > hdev->sco_pkts)
+					hdev->sco_cnt = hdev->sco_pkts;
+			}
+		}
+	}
+
+	tasklet_schedule(&hdev->tx_task);
+
+	tasklet_enable(&hdev->tx_task);
+}
+
+/*
+ * Mode Change event handler.
+ *
+ * Records the new mode and interval on the connection.  When the change
+ * was not one we were waiting for (mode-change-pending bit was clear),
+ * power_save is set to 1 if the new mode is HCI_CM_ACTIVE and 0 otherwise.
+ * A SCO setup that was waiting on the mode change is then continued.
+ */
+static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_mode_change *ev = (void *) skb->data;
+	struct hci_conn *link;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!link)
+		goto unlock;
+
+	link->mode = ev->mode;
+	link->interval = __le16_to_cpu(ev->interval);
+
+	if (!test_and_clear_bit(HCI_CONN_MODE_CHANGE_PEND, &link->pend))
+		link->power_save = (link->mode == HCI_CM_ACTIVE) ? 1 : 0;
+
+	if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &link->pend))
+		hci_sco_setup(link, ev->status);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * PIN Code Request event handler.
+ *
+ * If an ACL connection to the peer exists and is in BT_CONNECTED, its
+ * disconnect timeout is raised to HCI_PAIRING_TIMEOUT.  When the device is
+ * not pairable a negative PIN reply is sent; otherwise, when HCI_MGMT is
+ * set, the request is forwarded to mgmt together with a "secure" hint
+ * that is 1 when the connection's pending security level is
+ * BT_SECURITY_HIGH.
+ *
+ * The ACL lookup may fail, so the hint is computed without dereferencing
+ * a NULL connection; in that case it is reported as 0.
+ */
+static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_pin_code_req *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+	if (conn && conn->state == BT_CONNECTED) {
+		/* hold/put pair re-arms the timer with the new timeout */
+		hci_conn_hold(conn);
+		conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+		hci_conn_put(conn);
+	}
+
+	if (!test_bit(HCI_PAIRABLE, &hdev->flags))
+		hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
+					sizeof(ev->bdaddr), &ev->bdaddr);
+	else if (test_bit(HCI_MGMT, &hdev->flags)) {
+		u8 secure = 0;
+
+		/* conn is NULL when no ACL link to the peer is known */
+		if (conn && conn->pending_sec_level == BT_SECURITY_HIGH)
+			secure = 1;
+
+		mgmt_pin_code_request(hdev->id, &ev->bdaddr, secure);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Link Key Request event handler.
+ *
+ * Does nothing unless HCI_LINK_KEYS is set.  Looks up a stored key for
+ * the peer and answers with HCI_OP_LINK_KEY_REPLY, or with
+ * HCI_OP_LINK_KEY_NEG_REPLY when
+ *   - no key is stored,
+ *   - the key is a debug combination key and HCI_DEBUG_KEYS is not set,
+ *   - the key is an unauthenticated combination key but the connection's
+ *     auth_type (when known, i.e. not 0xff) has bit 0 set, or
+ *   - the key is a combination key from a PIN shorter than 16 while the
+ *     connection is pending BT_SECURITY_HIGH.
+ * When a connection exists and the key is accepted, the key type and PIN
+ * length are recorded on it.
+ */
+static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_link_key_req *ev = (void *) skb->data;
+	struct hci_cp_link_key_reply cp;
+	struct hci_conn *conn;
+	struct link_key *key;
+
+	BT_DBG("%s", hdev->name);
+
+	if (!test_bit(HCI_LINK_KEYS, &hdev->flags))
+		return;
+
+	hci_dev_lock(hdev);
+
+	key = hci_find_link_key(hdev, &ev->bdaddr);
+	if (!key) {
+		BT_DBG("%s link key not found for %s", hdev->name,
+							batostr(&ev->bdaddr));
+		goto not_found;
+	}
+
+	BT_DBG("%s found key type %u for %s", hdev->name, key->type,
+							batostr(&ev->bdaddr));
+
+	if (!test_bit(HCI_DEBUG_KEYS, &hdev->flags) &&
+				key->type == HCI_LK_DEBUG_COMBINATION) {
+		BT_DBG("%s ignoring debug key", hdev->name);
+		goto not_found;
+	}
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+	if (conn) {
+		if (key->type == HCI_LK_UNAUTH_COMBINATION &&
+				conn->auth_type != 0xff &&
+				(conn->auth_type & 0x01)) {
+			BT_DBG("%s ignoring unauthenticated key", hdev->name);
+			goto not_found;
+		}
+
+		if (key->type == HCI_LK_COMBINATION && key->pin_len < 16 &&
+				conn->pending_sec_level == BT_SECURITY_HIGH) {
+			/* Adjacent literals instead of a backslash line
+			 * continuation, which pulled the indentation tabs
+			 * into the message text. */
+			BT_DBG("%s ignoring key unauthenticated for high "
+						"security", hdev->name);
+			goto not_found;
+		}
+
+		conn->key_type = key->type;
+		conn->pin_length = key->pin_len;
+	}
+
+	bacpy(&cp.bdaddr, &ev->bdaddr);
+	memcpy(cp.link_key, key->val, 16);
+
+	hci_send_cmd(hdev, HCI_OP_LINK_KEY_REPLY, sizeof(cp), &cp);
+
+	hci_dev_unlock(hdev);
+
+	return;
+
+not_found:
+	/* The negative reply carries only the peer address */
+	hci_send_cmd(hdev, HCI_OP_LINK_KEY_NEG_REPLY, sizeof(ev->bdaddr),
+							&ev->bdaddr);
+	hci_dev_unlock(hdev);
+}
+/*
+ * Link Key Notification event handler.
+ *
+ * For a known ACL connection the disconnect timeout is reset to
+ * HCI_DISCONN_TIMEOUT, the stored PIN length is picked up and the key
+ * type is recorded unless it is HCI_LK_CHANGED_COMBINATION.  When
+ * HCI_LINK_KEYS is set the new key is stored via hci_add_link_key().
+ */
+static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_link_key_notify *ev = (void *) skb->data;
+	struct hci_conn *conn;
+	u8 pin_len = 0; /* used as-is when no connection is found */
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+	if (conn) {
+		hci_conn_hold(conn); /* hold/put pair around the timeout update */
+		conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+		pin_len = conn->pin_length;
+
+		if (ev->key_type != HCI_LK_CHANGED_COMBINATION)
+			conn->key_type = ev->key_type;
+
+		hci_conn_put(conn);
+	}
+
+	if (test_bit(HCI_LINK_KEYS, &hdev->flags))
+		hci_add_link_key(hdev, conn, 1, &ev->bdaddr, ev->link_key, /* conn may be NULL here */
+							ev->key_type, pin_len);
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Read Clock Offset Complete event handler.
+ *
+ * On success, and if the connection and an inquiry cache entry for its
+ * peer both exist, the cached clock offset is replaced with the reported
+ * one and the entry's timestamp is refreshed.
+ */
+static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_clock_offset *ev = (void *) skb->data;
+	struct inquiry_entry *entry;
+	struct hci_conn *link;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!link || ev->status)
+		goto unlock;
+
+	entry = hci_inquiry_cache_lookup(hdev, &link->dst);
+	if (!entry)
+		goto unlock;
+
+	entry->data.clock_offset = ev->clock_offset;
+	entry->timestamp = jiffies;
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Connection Packet Type Changed event handler.
+ *
+ * On success, stores the reported packet type (converted from little
+ * endian) on the connection identified by the event's handle.
+ */
+static inline void hci_pkt_type_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_pkt_type_change *ev = (void *) skb->data;
+	struct hci_conn *link;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	link = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (link) {
+		if (ev->status == 0)
+			link->pkt_type = __le16_to_cpu(ev->pkt_type);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Page Scan Repetition Mode Change event handler.
+ *
+ * Updates the page scan repetition mode of the peer's inquiry cache
+ * entry, if one exists, and refreshes the entry's timestamp.
+ */
+static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_pscan_rep_mode *ev = (void *) skb->data;
+	struct inquiry_entry *entry;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	entry = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+	if (!entry)
+		goto unlock;
+
+	entry->data.pscan_rep_mode = ev->pscan_rep_mode;
+	entry->timestamp = jiffies;
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+/*
+ * Inquiry Result with RSSI event handler.
+ *
+ * The first payload byte is the number of responses.  The size of one
+ * record is derived as (skb->len - 1) / num_rsp: when it equals
+ * sizeof(struct inquiry_info_with_rssi) the records are parsed as that
+ * type (pscan_mode recorded as 0), otherwise they are parsed as
+ * struct inquiry_info_with_rssi_and_pscan_mode.  Each record updates the
+ * inquiry cache and is reported through mgmt_device_found().
+ *
+ * NOTE(review): beyond the record-size test, num_rsp is not checked
+ * against skb->len here; confirm the length is validated elsewhere.
+ */
+static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct inquiry_data data;
+	int num_rsp = *((__u8 *) skb->data);
+
+	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+	if (!num_rsp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) {
+
+		if (test_bit(HCI_MGMT, &hdev->flags))
+			mgmt_discovering(hdev->id, 1);
+	}
+
+	if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) { /* num_rsp is non-zero here */
+		struct inquiry_info_with_rssi_and_pscan_mode *info;
+		info = (void *) (skb->data + 1);
+
+		for (; num_rsp; num_rsp--, info++) {
+			bacpy(&data.bdaddr, &info->bdaddr);
+			data.pscan_rep_mode	= info->pscan_rep_mode;
+			data.pscan_period_mode	= info->pscan_period_mode;
+			data.pscan_mode		= info->pscan_mode;
+			memcpy(data.dev_class, info->dev_class, 3);
+			data.clock_offset	= info->clock_offset;
+			data.rssi		= info->rssi;
+			data.ssp_mode		= 0x00;
+			hci_inquiry_cache_update(hdev, &data);
+			mgmt_device_found(hdev->id, &info->bdaddr,
+						info->dev_class, info->rssi,
+						NULL);
+		}
+	} else {
+		struct inquiry_info_with_rssi *info = (void *) (skb->data + 1);
+
+		for (; num_rsp; num_rsp--, info++) {
+			bacpy(&data.bdaddr, &info->bdaddr);
+			data.pscan_rep_mode	= info->pscan_rep_mode;
+			data.pscan_period_mode	= info->pscan_period_mode;
+			data.pscan_mode		= 0x00; /* not read from this record type */
+			memcpy(data.dev_class, info->dev_class, 3);
+			data.clock_offset	= info->clock_offset;
+			data.rssi		= info->rssi;
+			data.ssp_mode		= 0x00;
+			hci_inquiry_cache_update(hdev, &data);
+			mgmt_device_found(hdev->id, &info->bdaddr,
+						info->dev_class, info->rssi,
+						NULL);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+/*
+ * Read Remote Extended Features Complete event handler.
+ *
+ * For a successful read of page 1, bit 0 of the first feature byte is
+ * stored as the SSP mode of both the connection and the peer's inquiry
+ * cache entry (if any).  While the connection is in BT_CONFIG the setup
+ * continues: on success a remote name request is sent, and the connection
+ * is completed right away when hci_outgoing_auth_needed() reports that no
+ * authentication is needed.
+ */
+static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_remote_ext_features *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (!conn)
+		goto unlock;
+
+	if (!ev->status && ev->page == 0x01) {
+		struct inquiry_entry *ie;
+
+		ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
+		if (ie)
+			ie->data.ssp_mode = (ev->features[0] & 0x01);
+
+		conn->ssp_mode = (ev->features[0] & 0x01);
+	}
+
+	if (conn->state != BT_CONFIG)
+		goto unlock;
+
+	if (!ev->status) {
+		struct hci_cp_remote_name_req cp;
+		memset(&cp, 0, sizeof(cp));
+		bacpy(&cp.bdaddr, &conn->dst);
+		cp.pscan_rep_mode = 0x02;
+		hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
+	}
+
+	if (!hci_outgoing_auth_needed(hdev, conn)) {
+		conn->state = BT_CONNECTED; /* reached for failed status as well */
+		hci_proto_connect_cfm(conn, ev->status);
+		hci_conn_put(conn);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Synchronous Connection Complete event handler.
+ *
+ * Looks the connection up by link type and peer address.  When a non-eSCO
+ * event has no match, the lookup is retried as ESCO_LINK and the found
+ * connection is re-typed to SCO_LINK.
+ *
+ * Status 0x00 stores the handle and moves the connection to BT_CONNECTED.
+ * For the listed failure codes an outgoing connection with fewer than two
+ * attempts is retried through hci_setup_sync() with a packet type built
+ * from hdev->esco_type.  Every other failure closes the connection,
+ * reports it via hci_proto_connect_cfm() and deletes it.
+ *
+ * The retry needs the handle of the linked connection, so it is skipped
+ * (and the connection is closed instead) when conn->link is NULL rather
+ * than dereferencing a NULL pointer.
+ */
+static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_sync_conn_complete *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+	if (!conn) {
+		if (ev->link_type == ESCO_LINK)
+			goto unlock;
+
+		/* Retry as eSCO and downgrade the match to SCO */
+		conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
+		if (!conn)
+			goto unlock;
+
+		conn->type = SCO_LINK;
+	}
+
+	switch (ev->status) {
+	case 0x00:
+		conn->handle = __le16_to_cpu(ev->handle);
+		conn->state  = BT_CONNECTED;
+
+		hci_conn_hold_device(conn);
+		hci_conn_add_sysfs(conn);
+		break;
+
+	case 0x10:	/* Connection Accept Timeout */
+	case 0x11:	/* Unsupported Feature or Parameter Value */
+	case 0x1c:	/* SCO interval rejected */
+	case 0x1a:	/* Unsupported Remote Feature */
+	case 0x1f:	/* Unspecified error */
+		/* Retry only while the linked connection is still there */
+		if (conn->out && conn->attempt < 2 && conn->link) {
+			conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
+					(hdev->esco_type & EDR_ESCO_MASK);
+			hci_setup_sync(conn, conn->link->handle);
+			goto unlock;
+		}
+		/* fall through */
+
+	default:
+		conn->state = BT_CLOSED;
+		break;
+	}
+
+	hci_proto_connect_cfm(conn, ev->status);
+	if (ev->status)
+		hci_conn_del(conn);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+/*
+ * Synchronous Connection Changed event handler: placeholder that only
+ * emits a debug message; the event payload in skb is not examined.
+ */
+static inline void hci_sync_conn_changed_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	BT_DBG("%s", hdev->name);
+}
+/*
+ * Sniff Subrating event handler: only logs the status field of the
+ * event; no connection state is changed.
+ */
+static inline void hci_sniff_subrate_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_sniff_subrate *ev = (void *) skb->data;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+}
+/*
+ * Extended Inquiry Result event handler.
+ *
+ * The first payload byte is the number of responses, followed by that
+ * many struct extended_inquiry_info records.  Each record updates the
+ * inquiry cache (with ssp_mode recorded as 0x01 and pscan_mode as 0x00)
+ * and is reported through mgmt_device_found() together with the record's
+ * data field.
+ *
+ * NOTE(review): unlike the other inquiry result handlers in this file,
+ * HCI_INQUIRY is updated before hci_dev_lock() is taken; confirm this is
+ * intentional.  num_rsp is also not checked against skb->len here.
+ */
+static inline void hci_extended_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct inquiry_data data;
+	struct extended_inquiry_info *info = (void *) (skb->data + 1); /* records follow the count byte */
+	int num_rsp = *((__u8 *) skb->data);
+
+	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+	if (!num_rsp)
+		return;
+
+	if (!test_and_set_bit(HCI_INQUIRY, &hdev->flags)) {
+
+		if (test_bit(HCI_MGMT, &hdev->flags))
+			mgmt_discovering(hdev->id, 1);
+	}
+
+	hci_dev_lock(hdev);
+
+	for (; num_rsp; num_rsp--, info++) {
+		bacpy(&data.bdaddr, &info->bdaddr);
+		data.pscan_rep_mode	= info->pscan_rep_mode;
+		data.pscan_period_mode	= info->pscan_period_mode;
+		data.pscan_mode		= 0x00;
+		memcpy(data.dev_class, info->dev_class, 3);
+		data.clock_offset	= info->clock_offset;
+		data.rssi		= info->rssi;
+		data.ssp_mode		= 0x01;
+		hci_inquiry_cache_update(hdev, &data);
+		mgmt_device_found(hdev->id, &info->bdaddr, info->dev_class,
+						info->rssi, info->data);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/*
+ * Compute the authentication requirement to use for a connection from
+ * the remote side's requirement (conn->remote_auth), both sides' IO
+ * capabilities and the locally requested conn->auth_type.
+ *
+ * Returns the value to store in conn->auth_type and send to the
+ * controller.
+ */
+static inline u8 hci_get_auth_req(struct hci_conn *conn)
+{
+	switch (conn->remote_auth) {
+	case 0x02:
+	case 0x03:
+		/* Remote requests dedicated bonding: follow that lead.  If
+		 * both remote and local IO capabilities allow MITM
+		 * protection then require it, otherwise don't */
+		if (conn->remote_cap == 0x03 || conn->io_capability == 0x03)
+			return 0x02;
+		return 0x03;
+
+	case 0x00:
+	case 0x01:
+		/* Remote requests no-bonding: follow that lead, keeping the
+		 * low bit of the local auth_type */
+		return conn->remote_auth | (conn->auth_type & 0x01);
+
+	default:
+		return conn->auth_type;
+	}
+}
+/*
+ * IO Capability Request event handler.
+ *
+ * Takes a reference on the peer's ACL connection.  When HCI_MGMT is set,
+ * the request is answered: with HCI_OP_IO_CAPABILITY_REPLY if the device
+ * is pairable or the remote asked for no bonding (the authentication
+ * requirement comes from hci_get_auth_req() and is stored in
+ * conn->auth_type), otherwise with HCI_OP_IO_CAPABILITY_NEG_REPLY and
+ * reason 0x18.  Without HCI_MGMT no reply is sent from here.
+ */
+static inline void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_io_capa_request *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+	if (!conn)
+		goto unlock;
+
+	hci_conn_hold(conn); /* NOTE(review): no matching put in this function; presumably dropped by a later pairing event - confirm */
+
+	if (!test_bit(HCI_MGMT, &hdev->flags))
+		goto unlock;
+
+	if (test_bit(HCI_PAIRABLE, &hdev->flags) ||
+			(conn->remote_auth & ~0x01) == HCI_AT_NO_BONDING) {
+		struct hci_cp_io_capability_reply cp;
+
+		bacpy(&cp.bdaddr, &ev->bdaddr);
+		cp.capability = conn->io_capability;
+		conn->auth_type = hci_get_auth_req(conn);
+		cp.authentication = conn->auth_type;
+
+		if ((conn->out == 0x01 || conn->remote_oob == 0x01) &&
+				hci_find_remote_oob_data(hdev, &conn->dst))
+			cp.oob_data = 0x01; /* stored OOB data exists for this peer */
+		else
+			cp.oob_data = 0x00;
+
+		hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_REPLY,
+							sizeof(cp), &cp);
+	} else {
+		struct hci_cp_io_capability_neg_reply cp;
+
+		bacpy(&cp.bdaddr, &ev->bdaddr);
+		cp.reason = 0x18; /* Pairing not allowed */
+
+		hci_send_cmd(hdev, HCI_OP_IO_CAPABILITY_NEG_REPLY,
+							sizeof(cp), &cp);
+	}
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/* IO Capability Reply event: remember the remote side's IO capability,
+ * OOB flag and authentication requirements on the matching ACL connection.
+ * The event is ignored when no such connection exists. */
+static inline void hci_io_capa_reply_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_io_capa_reply *reply = (void *) skb->data;
+	struct hci_conn *acl;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &reply->bdaddr);
+	if (acl) {
+		acl->remote_cap = reply->capability;
+		acl->remote_oob = reply->oob_data;
+		acl->remote_auth = reply->authentication;
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/* User Confirmation Request event (only handled when HCI_MGMT is set).
+ *
+ * Depending on the local and remote MITM bits and IO capabilities the
+ * request is rejected, auto-accepted (immediately or via
+ * conn->auto_accept_timer), or forwarded to user space through
+ * mgmt_user_confirm_request(). */
+static inline void hci_user_confirm_request_evt(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{
+	struct hci_ev_user_confirm_req *ev = (void *) skb->data;
+	int loc_mitm, rem_mitm, confirm_hint = 0;
+	struct hci_conn *conn;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	if (!test_bit(HCI_MGMT, &hdev->flags))
+		goto unlock;
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+	if (!conn)
+		goto unlock;
+
+	/* Bit 0 of each side's authentication requirements is used as the
+	 * MITM flag */
+	loc_mitm = (conn->auth_type & 0x01);
+	rem_mitm = (conn->remote_auth & 0x01);
+
+	/* If we require MITM but the remote device can't provide that
+	 * (it has NoInputNoOutput) then reject the confirmation
+	 * request. The only exception is when we're dedicated bonding
+	 * initiators (connect_cfm_cb set) since then we always have the MITM
+	 * bit set. */
+	if (!conn->connect_cfm_cb && loc_mitm && conn->remote_cap == 0x03) {
+		BT_DBG("Rejecting request: remote device can't provide MITM");
+		hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY,
+					sizeof(ev->bdaddr), &ev->bdaddr);
+		goto unlock;
+	}
+
+	/* If no side requires MITM protection; auto-accept */
+	if ((!loc_mitm || conn->remote_cap == 0x03) &&
+				(!rem_mitm || conn->io_capability == 0x03)) {
+
+		/* If we're not the initiators request authorization to
+		 * proceed from user space (mgmt_user_confirm with
+		 * confirm_hint set to 1). */
+		if (!test_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
+			BT_DBG("Confirming auto-accept as acceptor");
+			confirm_hint = 1;
+			goto confirm;
+		}
+
+		BT_DBG("Auto-accept of user confirmation with %ums delay",
+						hdev->auto_accept_delay);
+
+		/* With a configured delay the reply is not sent here; the
+		 * connection's auto_accept_timer is armed instead */
+		if (hdev->auto_accept_delay > 0) {
+			int delay = msecs_to_jiffies(hdev->auto_accept_delay);
+			mod_timer(&conn->auto_accept_timer, jiffies + delay);
+			goto unlock;
+		}
+
+		hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_REPLY,
+						sizeof(ev->bdaddr), &ev->bdaddr);
+		goto unlock;
+	}
+
+confirm:
+	/* Reached either by falling through (confirm_hint 0) or from the
+	 * acceptor auto-accept path (confirm_hint 1) */
+	mgmt_user_confirm_request(hdev->id, &ev->bdaddr, ev->passkey,
+								confirm_hint);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/* Simple Pairing Complete event: report a failed pairing to the
+ * management interface when appropriate and drop one reference on the ACL
+ * connection. Nothing happens if the connection cannot be found. */
+static inline void hci_simple_pair_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_simple_pair_complete *done = (void *) skb->data;
+	struct hci_conn *acl;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &done->bdaddr);
+	if (acl) {
+		/* HCI_CONN_AUTH_PEND is set when we started the
+		 * authentication ourselves. In that case an auth_complete
+		 * event follows and is itself mapped to mgmt_auth_failed,
+		 * so reporting here too would give user space a duplicate. */
+		if (done->status != 0 &&
+				!test_bit(HCI_CONN_AUTH_PEND, &acl->pend))
+			mgmt_auth_failed(hdev->id, &acl->dst, done->status);
+
+		hci_conn_put(acl);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/* Remote Host Supported Features event: if the peer is in the inquiry
+ * cache, store bit 0 of the first features byte as its ssp_mode. */
+static inline void hci_remote_host_features_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_remote_host_features *feat = (void *) skb->data;
+	struct inquiry_entry *cached;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	cached = hci_inquiry_cache_lookup(hdev, &feat->bdaddr);
+	if (cached != NULL)
+		cached->data.ssp_mode = feat->features[0] & 0x01;
+
+	hci_dev_unlock(hdev);
+}
+
+/* Remote OOB Data Request event (only handled when HCI_MGMT is set).
+ *
+ * Replies with the stored hash and randomizer for the peer when
+ * hci_find_remote_oob_data() has an entry for it, and with a negative
+ * reply otherwise. */
+static inline void hci_remote_oob_data_request_evt(struct hci_dev *hdev,
+							struct sk_buff *skb)
+{
+	struct hci_ev_remote_oob_data_request *req = (void *) skb->data;
+	struct oob_data *stored;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	if (test_bit(HCI_MGMT, &hdev->flags)) {
+		stored = hci_find_remote_oob_data(hdev, &req->bdaddr);
+		if (!stored) {
+			struct hci_cp_remote_oob_data_neg_reply neg;
+
+			bacpy(&neg.bdaddr, &req->bdaddr);
+			hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_NEG_REPLY,
+							sizeof(neg), &neg);
+		} else {
+			struct hci_cp_remote_oob_data_reply rsp;
+
+			bacpy(&rsp.bdaddr, &req->bdaddr);
+			memcpy(rsp.hash, stored->hash, sizeof(rsp.hash));
+			memcpy(rsp.randomizer, stored->randomizer,
+						sizeof(rsp.randomizer));
+
+			hci_send_cmd(hdev, HCI_OP_REMOTE_OOB_DATA_REPLY,
+							sizeof(rsp), &rsp);
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/* LE Connection Complete meta event.
+ *
+ * Finds the LE connection for the peer address, creating one if none
+ * exists. On a non-zero status the failure is reported and the connection
+ * is deleted; otherwise the connection is marked BT_CONNECTED and
+ * announced to mgmt, sysfs and the protocol layer. */
+static inline void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_le_conn_complete *ev = (void *) skb->data;
+	struct hci_conn *conn;
+
+	BT_DBG("%s status %d", hdev->name, ev->status);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &ev->bdaddr);
+	if (!conn) {
+		conn = hci_conn_add(hdev, LE_LINK, 0, &ev->bdaddr);
+		if (!conn) {
+			BT_ERR("No memory for new connection");
+			hci_dev_unlock(hdev);
+			return;
+		}
+
+		/* The address type is only recorded on connections created
+		 * here, not on ones found by the lookup above */
+		conn->dst_type = ev->bdaddr_type;
+	}
+
+	if (ev->status) {
+		/* Failure: notify mgmt and the protocol layer before the
+		 * connection object is deleted */
+		mgmt_connect_failed(hdev->id, &ev->bdaddr, ev->status);
+		hci_proto_connect_cfm(conn, ev->status);
+		conn->state = BT_CLOSED;
+		hci_conn_del(conn);
+		goto unlock;
+	}
+
+	mgmt_connected(hdev->id, &ev->bdaddr);
+
+	conn->sec_level = BT_SECURITY_LOW;
+	conn->handle = __le16_to_cpu(ev->handle);
+	conn->state = BT_CONNECTED;
+
+	hci_conn_hold_device(conn);
+	hci_conn_add_sysfs(conn);
+
+	/* ev->status is zero on this path */
+	hci_proto_connect_cfm(conn, ev->status);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/* LE Advertising Report meta event.
+ *
+ * Byte 0 of the payload is the number of reports; the variable-length
+ * hci_ev_le_advertising_info records follow back to back. Each record is
+ * handed to hci_add_adv_entry() under the device lock. A report count of
+ * zero results in no entries being processed. */
+static inline void hci_le_adv_report_evt(struct hci_dev *hdev,
+						struct sk_buff *skb)
+{
+	struct hci_ev_le_advertising_info *ev;
+	u8 num_reports;
+
+	num_reports = skb->data[0];
+	ev = (void *) &skb->data[1];
+
+	hci_dev_lock(hdev);
+
+	/* Test the count before decrementing it. Pre-decrementing an
+	 * unsigned count of zero wraps to 255 and would walk far beyond
+	 * the received packet. */
+	while (num_reports--) {
+		hci_add_adv_entry(hdev, ev);
+
+		/* Step over this report's data plus one trailing byte
+		 * (presumably the RSSI - confirm against the event layout) */
+		ev = (void *) (ev->data + ev->length + 1);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/* LE Long Term Key Request meta event.
+ *
+ * Looks up the connection by handle and a stored key by the event's
+ * EDIV/random pair. If both are found the key is returned with an LTK
+ * reply and its pin_len is recorded on the connection; otherwise a
+ * negative reply is sent for the handle taken from the event. */
+static inline void hci_le_ltk_request_evt(struct hci_dev *hdev,
+						struct sk_buff *skb)
+{
+	struct hci_ev_le_ltk_req *ev = (void *) skb->data;
+	struct hci_cp_le_ltk_reply cp;
+	struct hci_cp_le_ltk_neg_reply neg;
+	struct hci_conn *conn;
+	struct link_key *ltk;
+
+	/* ev->handle is little endian as received: convert it to CPU order
+	 * for printing (cpu_to_le16() is the helper for the opposite
+	 * direction) */
+	BT_DBG("%s handle %d", hdev->name, __le16_to_cpu(ev->handle));
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+	if (conn == NULL)
+		goto not_found;
+
+	ltk = hci_find_ltk(hdev, ev->ediv, ev->random);
+	if (ltk == NULL)
+		goto not_found;
+
+	memcpy(cp.ltk, ltk->val, sizeof(ltk->val));
+	cp.handle = cpu_to_le16(conn->handle);
+	conn->pin_length = ltk->pin_len;
+
+	hci_send_cmd(hdev, HCI_OP_LE_LTK_REPLY, sizeof(cp), &cp);
+
+	hci_dev_unlock(hdev);
+
+	return;
+
+not_found:
+	/* The handle is echoed back unconverted: it is already in the
+	 * little-endian form the command expects */
+	neg.handle = ev->handle;
+	hci_send_cmd(hdev, HCI_OP_LE_LTK_NEG_REPLY, sizeof(neg), &neg);
+	hci_dev_unlock(hdev);
+}
+
+/* LE Meta event: strip the meta header and hand the remaining payload to
+ * the handler for the subevent. Unknown subevents are silently ignored. */
+static inline void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_le_meta *meta = (void *) skb->data;
+
+	/* After the pull, skb->data points at the subevent parameters;
+	 * meta still points at the header that was skipped */
+	skb_pull(skb, sizeof(*meta));
+
+	if (meta->subevent == HCI_EV_LE_CONN_COMPLETE)
+		hci_le_conn_complete_evt(hdev, skb);
+	else if (meta->subevent == HCI_EV_LE_ADVERTISING_REPORT)
+		hci_le_adv_report_evt(hdev, skb);
+	else if (meta->subevent == HCI_EV_LE_LTK_REQ)
+		hci_le_ltk_request_evt(hdev, skb);
+}
+
+/* Entry point for a received HCI event packet.
+ *
+ * Reads the event code from the header, strips the header from the skb and
+ * dispatches to the handler for that event. Events without a handler are
+ * only logged through BT_DBG. In every case the skb is freed here and the
+ * device's evt_rx counter is incremented. */
+void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_event_hdr *hdr = (void *) skb->data;
+	__u8 event = hdr->evt;
+
+	/* Handlers see skb->data pointing at the event parameters */
+	skb_pull(skb, HCI_EVENT_HDR_SIZE);
+
+	switch (event) {
+	case HCI_EV_INQUIRY_COMPLETE:
+		hci_inquiry_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_INQUIRY_RESULT:
+		hci_inquiry_result_evt(hdev, skb);
+		break;
+
+	case HCI_EV_CONN_COMPLETE:
+		hci_conn_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_CONN_REQUEST:
+		hci_conn_request_evt(hdev, skb);
+		break;
+
+	case HCI_EV_DISCONN_COMPLETE:
+		hci_disconn_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_AUTH_COMPLETE:
+		hci_auth_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_REMOTE_NAME:
+		hci_remote_name_evt(hdev, skb);
+		break;
+
+	case HCI_EV_ENCRYPT_CHANGE:
+		hci_encrypt_change_evt(hdev, skb);
+		break;
+
+	case HCI_EV_CHANGE_LINK_KEY_COMPLETE:
+		hci_change_link_key_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_REMOTE_FEATURES:
+		hci_remote_features_evt(hdev, skb);
+		break;
+
+	case HCI_EV_REMOTE_VERSION:
+		hci_remote_version_evt(hdev, skb);
+		break;
+
+	case HCI_EV_QOS_SETUP_COMPLETE:
+		hci_qos_setup_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_CMD_COMPLETE:
+		hci_cmd_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_CMD_STATUS:
+		hci_cmd_status_evt(hdev, skb);
+		break;
+
+	case HCI_EV_ROLE_CHANGE:
+		hci_role_change_evt(hdev, skb);
+		break;
+
+	case HCI_EV_NUM_COMP_PKTS:
+		hci_num_comp_pkts_evt(hdev, skb);
+		break;
+
+	case HCI_EV_MODE_CHANGE:
+		hci_mode_change_evt(hdev, skb);
+		break;
+
+	case HCI_EV_PIN_CODE_REQ:
+		hci_pin_code_request_evt(hdev, skb);
+		break;
+
+	case HCI_EV_LINK_KEY_REQ:
+		hci_link_key_request_evt(hdev, skb);
+		break;
+
+	case HCI_EV_LINK_KEY_NOTIFY:
+		hci_link_key_notify_evt(hdev, skb);
+		break;
+
+	case HCI_EV_CLOCK_OFFSET:
+		hci_clock_offset_evt(hdev, skb);
+		break;
+
+	case HCI_EV_PKT_TYPE_CHANGE:
+		hci_pkt_type_change_evt(hdev, skb);
+		break;
+
+	case HCI_EV_PSCAN_REP_MODE:
+		hci_pscan_rep_mode_evt(hdev, skb);
+		break;
+
+	case HCI_EV_INQUIRY_RESULT_WITH_RSSI:
+		hci_inquiry_result_with_rssi_evt(hdev, skb);
+		break;
+
+	case HCI_EV_REMOTE_EXT_FEATURES:
+		hci_remote_ext_features_evt(hdev, skb);
+		break;
+
+	case HCI_EV_SYNC_CONN_COMPLETE:
+		hci_sync_conn_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_SYNC_CONN_CHANGED:
+		hci_sync_conn_changed_evt(hdev, skb);
+		break;
+
+	case HCI_EV_SNIFF_SUBRATE:
+		hci_sniff_subrate_evt(hdev, skb);
+		break;
+
+	case HCI_EV_EXTENDED_INQUIRY_RESULT:
+		hci_extended_inquiry_result_evt(hdev, skb);
+		break;
+
+	case HCI_EV_IO_CAPA_REQUEST:
+		hci_io_capa_request_evt(hdev, skb);
+		break;
+
+	case HCI_EV_IO_CAPA_REPLY:
+		hci_io_capa_reply_evt(hdev, skb);
+		break;
+
+	case HCI_EV_USER_CONFIRM_REQUEST:
+		hci_user_confirm_request_evt(hdev, skb);
+		break;
+
+	case HCI_EV_SIMPLE_PAIR_COMPLETE:
+		hci_simple_pair_complete_evt(hdev, skb);
+		break;
+
+	case HCI_EV_REMOTE_HOST_FEATURES:
+		hci_remote_host_features_evt(hdev, skb);
+		break;
+
+	case HCI_EV_LE_META:
+		hci_le_meta_evt(hdev, skb);
+		break;
+
+	case HCI_EV_REMOTE_OOB_DATA_REQUEST:
+		hci_remote_oob_data_request_evt(hdev, skb);
+		break;
+
+	default:
+		BT_DBG("%s event 0x%x", hdev->name, event);
+		break;
+	}
+
+	/* The skb is consumed here no matter which branch ran */
+	kfree_skb(skb);
+	hdev->stat.evt_rx++;
+}
+
+/* Build a stack-internal HCI event (HCI_EV_STACK_INTERNAL) carrying
+ * @dlen bytes of @data tagged with @type, and deliver it to the HCI
+ * sockets through hci_send_to_sock(). The event is silently dropped when
+ * no skb can be allocated. */
+void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
+{
+	struct hci_ev_stack_internal *body;
+	struct hci_event_hdr *head;
+	struct sk_buff *evt_skb;
+
+	evt_skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*body) + dlen,
+								GFP_ATOMIC);
+	if (evt_skb == NULL)
+		return;
+
+	/* Event header first, then the internal event and its payload */
+	head = (void *) skb_put(evt_skb, HCI_EVENT_HDR_SIZE);
+	head->evt  = HCI_EV_STACK_INTERNAL;
+	head->plen = sizeof(*body) + dlen;
+
+	body = (void *) skb_put(evt_skb, sizeof(*body) + dlen);
+	body->type = type;
+	memcpy(body->data, data, dlen);
+
+	bt_cb(evt_skb)->incoming = 1;
+	bt_cb(evt_skb)->pkt_type = HCI_EVENT_PKT;
+	__net_timestamp(evt_skb);
+	evt_skb->dev = (void *) hdev;
+
+	/* Receivers get their own clones, so our copy is released here */
+	hci_send_to_sock(hdev, evt_skb, NULL);
+	kfree_skb(evt_skb);
+}
+
+/* Expose enable_le as a boolean module parameter with mode 0444 */
+module_param(enable_le, bool, 0444);
+MODULE_PARM_DESC(enable_le, "Enable LE support");
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
new file mode 100644
index 00000000..ff02cf5e
--- /dev/null
+++ b/net/bluetooth/hci_sock.c
@@ -0,0 +1,817 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI sockets. */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/compat.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+/* Set through the enable_mgmt module parameter; while it is zero
+ * hci_sock_bind() refuses to bind to HCI_CHANNEL_CONTROL */
+static int enable_mgmt;
+
+/* ----- HCI socket interface ----- */
+
+/* Test bit @nr in a bitmap stored as an array of 32-bit words at @addr.
+ * Returns non-zero when the bit is set. */
+static inline int hci_test_bit(int nr, void *addr)
+{
+	__u32 *words = addr;
+	__u32 mask = (__u32) 1 << (nr & 31);
+
+	return words[nr >> 5] & mask;
+}
+
+/* Security filter
+ *
+ * Limits what callers without CAP_NET_RAW may do on an HCI socket:
+ * hci_sock_setsockopt() masks a requested receive filter with the packet
+ * type and event masks below, and hci_sock_sendmsg() only lets such
+ * callers send commands whose OCF bit is set in the row for their OGF. */
+static struct hci_sec_filter hci_sec_filter = {
+	/* Packet types */
+	0x10,
+	/* Events */
+	{ 0x1000d9fe, 0x0000b00c },
+	/* Commands */
+	{
+		/* Row for OGF 0: no commands allowed */
+		{ 0x0 },
+		/* OGF_LINK_CTL */
+		{ 0xbe000006, 0x00000001, 0x00000000, 0x00 },
+		/* OGF_LINK_POLICY */
+		{ 0x00005200, 0x00000000, 0x00000000, 0x00 },
+		/* OGF_HOST_CTL */
+		{ 0xaab00200, 0x2b402aaa, 0x05220154, 0x00 },
+		/* OGF_INFO_PARAM */
+		{ 0x000002be, 0x00000000, 0x00000000, 0x00 },
+		/* OGF_STATUS_PARAM */
+		{ 0x000000ea, 0x00000000, 0x00000000, 0x00 }
+	}
+};
+
+/* List of all HCI sockets: sockets are linked in hci_sock_create() and
+ * unlinked in hci_sock_release(); readers walk it under the read lock */
+static struct bt_sock_list hci_sk_list = {
+	.lock = __RW_LOCK_UNLOCKED(hci_sk_list.lock)
+};
+
+/* Send frame to RAW socket
+ *
+ * Walks every HCI socket and queues a clone of @skb on each one that is
+ * bound to @hdev on the frame's channel and, for non-control channels,
+ * whose filter accepts the frame. @skip_sk and the socket the frame came
+ * from (skb->sk) never receive it. The caller keeps ownership of @skb. */
+void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb,
+							struct sock *skip_sk)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	BT_DBG("hdev %p len %d", hdev, skb->len);
+
+	read_lock(&hci_sk_list.lock);
+	sk_for_each(sk, node, &hci_sk_list.head) {
+		struct hci_filter *flt;
+		struct sk_buff *nskb;
+
+		if (sk == skip_sk)
+			continue;
+
+		if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev)
+			continue;
+
+		/* Don't send frame to the socket it came from */
+		if (skb->sk == sk)
+			continue;
+
+		if (bt_cb(skb)->channel != hci_pi(sk)->channel)
+			continue;
+
+		/* Control channel frames bypass the per-socket filter */
+		if (bt_cb(skb)->channel == HCI_CHANNEL_CONTROL)
+			goto clone;
+
+		/* Apply filter */
+		flt = &hci_pi(sk)->filter;
+
+		/* Vendor packets are tested against bit 0 of the type mask */
+		if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ?
+				0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
+			continue;
+
+		if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) {
+			register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
+
+			if (!hci_test_bit(evt, &flt->event_mask))
+				continue;
+
+			/* With an opcode filter set, Command Complete and
+			 * Command Status events must match it; the opcode is
+			 * read at offset 3 and offset 4 respectively */
+			if (flt->opcode &&
+			    ((evt == HCI_EV_CMD_COMPLETE &&
+			      flt->opcode !=
+			      get_unaligned((__le16 *)(skb->data + 3))) ||
+			     (evt == HCI_EV_CMD_STATUS &&
+			      flt->opcode !=
+			      get_unaligned((__le16 *)(skb->data + 4)))))
+				continue;
+		}
+
+clone:
+		/* A failed clone only skips this receiver */
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (!nskb)
+			continue;
+
+		/* Put type byte before the data */
+		if (bt_cb(skb)->channel == HCI_CHANNEL_RAW)
+			memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1);
+
+		if (sock_queue_rcv_skb(sk, nskb))
+			kfree_skb(nskb);
+	}
+	read_unlock(&hci_sk_list.lock);
+}
+
+/* Release an HCI socket: unlink it from hci_sk_list, give back the
+ * reference and promisc count held on a bound device, drop any queued
+ * frames and put the sock. Always returns 0. */
+static int hci_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct hci_dev *bound;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	if (sk == NULL)
+		return 0;
+
+	bound = hci_pi(sk)->hdev;
+
+	bt_sock_unlink(&hci_sk_list, sk);
+
+	/* Undo what hci_sock_bind() took on the device */
+	if (bound != NULL) {
+		atomic_dec(&bound->promisc);
+		hci_dev_put(bound);
+	}
+
+	sock_orphan(sk);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+
+	sock_put(sk);
+
+	return 0;
+}
+
+/* Copy a bdaddr_t from user space and add it to @hdev's blacklist.
+ * Returns -EFAULT if the copy fails, else hci_blacklist_add()'s result. */
+static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg)
+{
+	bdaddr_t peer;
+
+	return copy_from_user(&peer, arg, sizeof(peer)) ?
+				-EFAULT : hci_blacklist_add(hdev, &peer);
+}
+
+/* Copy a bdaddr_t from user space and remove it from @hdev's blacklist.
+ * Returns -EFAULT if the copy fails, else hci_blacklist_del()'s result. */
+static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg)
+{
+	bdaddr_t peer;
+
+	return copy_from_user(&peer, arg, sizeof(peer)) ?
+				-EFAULT : hci_blacklist_del(hdev, &peer);
+}
+
+/* ioctls that operate on the device the socket is bound to.
+ *
+ * Returns -EBADFD when the socket has no device. HCISETRAW and the two
+ * blacklist requests need CAP_NET_ADMIN (-EACCES otherwise). Requests not
+ * known here are passed to the driver's ioctl hook if it has one, else
+ * -EINVAL is returned. */
+static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+	struct hci_dev *hdev = hci_pi(sk)->hdev;
+	void __user *uarg = (void __user *) arg;
+
+	if (hdev == NULL)
+		return -EBADFD;
+
+	switch (cmd) {
+	case HCIGETCONNINFO:
+		return hci_get_conn_info(hdev, uarg);
+
+	case HCIGETAUTHINFO:
+		return hci_get_auth_info(hdev, uarg);
+
+	case HCIBLOCKADDR:
+		return capable(CAP_NET_ADMIN) ?
+			hci_sock_blacklist_add(hdev, uarg) : -EACCES;
+
+	case HCIUNBLOCKADDR:
+		return capable(CAP_NET_ADMIN) ?
+			hci_sock_blacklist_del(hdev, uarg) : -EACCES;
+
+	case HCISETRAW:
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+
+		/* Devices with the raw quirk may not have the flag changed */
+		if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+			return -EPERM;
+
+		if (arg != 0)
+			set_bit(HCI_RAW, &hdev->flags);
+		else
+			clear_bit(HCI_RAW, &hdev->flags);
+
+		return 0;
+
+	default:
+		return hdev->ioctl ? hdev->ioctl(hdev, cmd, arg) : -EINVAL;
+	}
+}
+
+/* ioctl() entry point for HCI sockets.
+ *
+ * Device-independent requests are served directly; the ones that change
+ * device state need CAP_NET_ADMIN and fail with -EACCES without it. Any
+ * other request is treated as a bound-socket ioctl and handled by
+ * hci_sock_bound_ioctl() with the socket locked. */
+static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	void __user *uarg = (void __user *) arg;
+	struct sock *sk = sock->sk;
+	int ret;
+
+	BT_DBG("cmd %x arg %lx", cmd, arg);
+
+	switch (cmd) {
+	case HCIGETDEVLIST:
+		return hci_get_dev_list(uarg);
+
+	case HCIGETDEVINFO:
+		return hci_get_dev_info(uarg);
+
+	case HCIGETCONNLIST:
+		return hci_get_conn_list(uarg);
+
+	case HCIINQUIRY:
+		return hci_inquiry(uarg);
+
+	case HCIDEVUP:
+		return capable(CAP_NET_ADMIN) ? hci_dev_open(arg) : -EACCES;
+
+	case HCIDEVDOWN:
+		return capable(CAP_NET_ADMIN) ? hci_dev_close(arg) : -EACCES;
+
+	case HCIDEVRESET:
+		return capable(CAP_NET_ADMIN) ? hci_dev_reset(arg) : -EACCES;
+
+	case HCIDEVRESTAT:
+		return capable(CAP_NET_ADMIN) ?
+				hci_dev_reset_stat(arg) : -EACCES;
+
+	case HCISETSCAN:
+	case HCISETAUTH:
+	case HCISETENCRYPT:
+	case HCISETPTYPE:
+	case HCISETLINKPOL:
+	case HCISETLINKMODE:
+	case HCISETACLMTU:
+	case HCISETSCOMTU:
+		return capable(CAP_NET_ADMIN) ?
+				hci_dev_cmd(cmd, uarg) : -EACCES;
+
+	default:
+		break;
+	}
+
+	/* Everything else needs the device this socket is bound to */
+	lock_sock(sk);
+	ret = hci_sock_bound_ioctl(sk, cmd, arg);
+	release_sock(sk);
+
+	return ret;
+}
+
+/* bind() for HCI sockets.
+ *
+ * Validates the address (family, channel range, control channel only when
+ * enable_mgmt is set), then attaches the socket to the requested device,
+ * or to no device for HCI_DEV_NONE, and moves it to BT_BOUND.
+ * Returns 0, -EINVAL for a bad address, -EALREADY if the socket is already
+ * bound, or -ENODEV if the device does not exist. */
+static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct sockaddr_hci haddr;
+	struct sock *sk = sock->sk;
+	struct hci_dev *hdev = NULL;
+	int len, err = 0;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	if (!addr)
+		return -EINVAL;
+
+	/* Copy at most sizeof(haddr) bytes; fields beyond a shorter
+	 * addr_len stay zero */
+	memset(&haddr, 0, sizeof(haddr));
+	len = min_t(unsigned int, sizeof(haddr), addr_len);
+	memcpy(&haddr, addr, len);
+
+	if (haddr.hci_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	if (haddr.hci_channel > HCI_CHANNEL_CONTROL)
+		return -EINVAL;
+
+	if (haddr.hci_channel == HCI_CHANNEL_CONTROL && !enable_mgmt)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == BT_BOUND || hci_pi(sk)->hdev) {
+		err = -EALREADY;
+		goto done;
+	}
+
+	if (haddr.hci_dev != HCI_DEV_NONE) {
+		hdev = hci_dev_get(haddr.hci_dev);
+		if (!hdev) {
+			err = -ENODEV;
+			goto done;
+		}
+
+		/* Released again together with the device reference in
+		 * hci_sock_release() */
+		atomic_inc(&hdev->promisc);
+	}
+
+	hci_pi(sk)->channel = haddr.hci_channel;
+	hci_pi(sk)->hdev = hdev;
+	sk->sk_state = BT_BOUND;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* getsockname() for HCI sockets.
+ *
+ * Fills in the family, device id and channel of the socket and sets
+ * *addr_len to sizeof(struct sockaddr_hci). Returns -EBADFD when the
+ * socket is not attached to a device. The peer argument is ignored. */
+static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer)
+{
+	struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr;
+	struct sock *sk = sock->sk;
+	struct hci_dev *hdev = hci_pi(sk)->hdev;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	if (!hdev)
+		return -EBADFD;
+
+	lock_sock(sk);
+
+	*addr_len = sizeof(*haddr);
+	haddr->hci_family = AF_BLUETOOTH;
+	haddr->hci_dev    = hdev->id;
+	/* Every field covered by *addr_len must be written: leaving
+	 * hci_channel untouched would hand back whatever the caller's
+	 * buffer happened to contain */
+	haddr->hci_channel = hci_pi(sk)->channel;
+
+	release_sock(sk);
+	return 0;
+}
+
+/* Attach the ancillary data the socket asked for (see cmsg_mask) to a
+ * received message: the frame direction for HCI_CMSG_DIR and the skb
+ * timestamp for HCI_CMSG_TSTAMP. */
+static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
+{
+	__u32 mask = hci_pi(sk)->cmsg_mask;
+
+	if (mask & HCI_CMSG_DIR) {
+		int incoming = bt_cb(skb)->incoming;
+		put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming);
+	}
+
+	if (mask & HCI_CMSG_TSTAMP) {
+#ifdef CONFIG_COMPAT
+		struct compat_timeval ctv;
+#endif
+		struct timeval tv;
+		void *data;
+		int len;
+
+		skb_get_timestamp(skb, &tv);
+
+		/* Default to the native timeval layout */
+		data = &tv;
+		len = sizeof(tv);
+#ifdef CONFIG_COMPAT
+		/* Callers flagged MSG_CMSG_COMPAT get the compat_timeval
+		 * layout instead */
+		if (msg->msg_flags & MSG_CMSG_COMPAT) {
+			ctv.tv_sec = tv.tv_sec;
+			ctv.tv_usec = tv.tv_usec;
+			data = &ctv;
+			len = sizeof(ctv);
+		}
+#endif
+
+		put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, len, data);
+	}
+}
+
+/* recvmsg() for HCI sockets.
+ *
+ * Dequeues one frame, copies up to @len bytes of it to the caller (setting
+ * MSG_TRUNC when the frame is longer) and attaches any requested control
+ * messages. Returns the number of bytes copied, 0 on a closed socket,
+ * -EOPNOTSUPP for MSG_OOB, or the error from the dequeue or copy step. */
+static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *frame;
+	int nbytes, ret;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (flags & (MSG_OOB))
+		return -EOPNOTSUPP;
+
+	if (sk->sk_state == BT_CLOSED)
+		return 0;
+
+	frame = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &ret);
+	if (frame == NULL)
+		return ret;
+
+	/* No source address is reported for HCI frames */
+	msg->msg_namelen = 0;
+
+	nbytes = frame->len;
+	if (len < nbytes) {
+		msg->msg_flags |= MSG_TRUNC;
+		nbytes = len;
+	}
+
+	skb_reset_transport_header(frame);
+	ret = skb_copy_datagram_iovec(frame, 0, msg->msg_iov, nbytes);
+
+	hci_sock_cmsg(sk, msg, frame);
+
+	skb_free_datagram(sk, frame);
+
+	if (ret)
+		return ret;
+
+	return nbytes;
+}
+
+/* sendmsg() for HCI sockets.
+ *
+ * On the control channel the message is handed to mgmt_control(). On the
+ * raw channel the first byte is taken as the HCI packet type and the rest
+ * is queued to the bound device: commands go to the command queue (or the
+ * raw queue, see below), everything else to the raw queue. Callers without
+ * CAP_NET_RAW may only send commands permitted by hci_sec_filter.
+ * Returns @len on success or a negative errno. */
+static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+			    struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct hci_dev *hdev;
+	struct sk_buff *skb;
+	int err;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE))
+		return -EINVAL;
+
+	/* Frames shorter than 4 bytes or longer than HCI_MAX_FRAME_SIZE
+	 * are rejected up front */
+	if (len < 4 || len > HCI_MAX_FRAME_SIZE)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	switch (hci_pi(sk)->channel) {
+	case HCI_CHANNEL_RAW:
+		break;
+	case HCI_CHANNEL_CONTROL:
+		err = mgmt_control(sk, msg, len);
+		goto done;
+	default:
+		err = -EINVAL;
+		goto done;
+	}
+
+	hdev = hci_pi(sk)->hdev;
+	if (!hdev) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = -ENETDOWN;
+		goto done;
+	}
+
+	/* On failure bt_skb_send_alloc() has stored the error in err */
+	skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		goto done;
+
+	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+		err = -EFAULT;
+		goto drop;
+	}
+
+	/* The leading byte is the packet type; it is recorded in the
+	 * control block and removed from the data */
+	bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
+	skb_pull(skb, 1);
+	skb->dev = (void *) hdev;
+
+	if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
+		u16 opcode = get_unaligned_le16(skb->data);
+		u16 ogf = hci_opcode_ogf(opcode);
+		u16 ocf = hci_opcode_ocf(opcode);
+
+		/* Commands outside the security filter need CAP_NET_RAW.
+		 * The OGF range check comes first so ocf_mask[] is never
+		 * indexed beyond HCI_SFLT_MAX_OGF */
+		if (((ogf > HCI_SFLT_MAX_OGF) ||
+				!hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) &&
+					!capable(CAP_NET_RAW)) {
+			err = -EPERM;
+			goto drop;
+		}
+
+		/* Raw-mode devices and OGF 0x3f (presumably the
+		 * vendor-specific group - confirm) bypass the command
+		 * queue and go out through the raw queue */
+		if (test_bit(HCI_RAW, &hdev->flags) || (ogf == 0x3f)) {
+			skb_queue_tail(&hdev->raw_q, skb);
+			tasklet_schedule(&hdev->tx_task);
+		} else {
+			skb_queue_tail(&hdev->cmd_q, skb);
+			tasklet_schedule(&hdev->cmd_task);
+		}
+	} else {
+		/* Any non-command packet requires CAP_NET_RAW */
+		if (!capable(CAP_NET_RAW)) {
+			err = -EPERM;
+			goto drop;
+		}
+
+		skb_queue_tail(&hdev->raw_q, skb);
+		tasklet_schedule(&hdev->tx_task);
+	}
+
+	err = len;
+
+done:
+	release_sock(sk);
+	return err;
+
+drop:
+	kfree_skb(skb);
+	goto done;
+}
+
+/* setsockopt() for HCI sockets: HCI_DATA_DIR, HCI_TIME_STAMP and
+ * HCI_FILTER.
+ *
+ * The first two switch the matching bit in cmsg_mask on or off. HCI_FILTER
+ * replaces the socket's receive filter; for callers without CAP_NET_RAW
+ * the requested masks are restricted to what hci_sec_filter allows.
+ * Returns 0, -EFAULT on a failed user copy, or -ENOPROTOOPT for an unknown
+ * option. */
+static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int len)
+{
+	struct hci_ufilter uf = { .opcode = 0 };
+	struct sock *sk = sock->sk;
+	int err = 0, opt = 0;
+
+	BT_DBG("sk %p, opt %d", sk, optname);
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case HCI_DATA_DIR:
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (opt)
+			hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR;
+		else
+			hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR;
+		break;
+
+	case HCI_TIME_STAMP:
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (opt)
+			hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP;
+		else
+			hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP;
+		break;
+
+	case HCI_FILTER:
+		/* Start from the current filter so that a short optval
+		 * only overrides the leading bytes */
+		{
+			struct hci_filter *f = &hci_pi(sk)->filter;
+
+			uf.type_mask = f->type_mask;
+			uf.opcode    = f->opcode;
+			uf.event_mask[0] = *((u32 *) f->event_mask + 0);
+			uf.event_mask[1] = *((u32 *) f->event_mask + 1);
+		}
+
+		len = min_t(unsigned int, len, sizeof(uf));
+		if (copy_from_user(&uf, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		/* Unprivileged callers are limited to the packet types and
+		 * events enabled in the security filter */
+		if (!capable(CAP_NET_RAW)) {
+			uf.type_mask &= hci_sec_filter.type_mask;
+			uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0);
+			uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1);
+		}
+
+		{
+			struct hci_filter *f = &hci_pi(sk)->filter;
+
+			f->type_mask = uf.type_mask;
+			f->opcode    = uf.opcode;
+			*((u32 *) f->event_mask + 0) = uf.event_mask[0];
+			*((u32 *) f->event_mask + 1) = uf.event_mask[1];
+		}
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* getsockopt() for HCI sockets: HCI_DATA_DIR, HCI_TIME_STAMP and
+ * HCI_FILTER.
+ *
+ * The two flag options report 1 or 0 depending on the matching bit in
+ * cmsg_mask. HCI_FILTER copies out at most sizeof(struct hci_ufilter)
+ * bytes of the current receive filter. Returns 0, -EFAULT on a failed user
+ * access, or -ENOPROTOOPT for an unknown option. */
+static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+	struct hci_ufilter uf;
+	struct sock *sk = sock->sk;
+	int len, opt;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	switch (optname) {
+	case HCI_DATA_DIR:
+		if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR)
+			opt = 1;
+		else
+			opt = 0;
+
+		if (put_user(opt, optval))
+			return -EFAULT;
+		break;
+
+	case HCI_TIME_STAMP:
+		if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP)
+			opt = 1;
+		else
+			opt = 0;
+
+		if (put_user(opt, optval))
+			return -EFAULT;
+		break;
+
+	case HCI_FILTER:
+		/* Clear the whole structure first so that bytes not covered
+		 * by the field assignments below (compiler padding) never
+		 * carry uninitialized kernel stack to user space */
+		memset(&uf, 0, sizeof(uf));
+		{
+			struct hci_filter *f = &hci_pi(sk)->filter;
+
+			uf.type_mask = f->type_mask;
+			uf.opcode    = f->opcode;
+			uf.event_mask[0] = *((u32 *) f->event_mask + 0);
+			uf.event_mask[1] = *((u32 *) f->event_mask + 1);
+		}
+
+		/* A negative user length becomes a large unsigned value
+		 * here and is therefore clamped to sizeof(uf) as well */
+		len = min_t(unsigned int, len, sizeof(uf));
+		if (copy_to_user(optval, &uf, len))
+			return -EFAULT;
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	return 0;
+}
+
+/* Socket operations for HCI sockets. Operations without an HCI-specific
+ * implementation use the generic sock_no_*() handlers; poll uses
+ * datagram_poll. */
+static const struct proto_ops hci_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= hci_sock_release,
+	.bind		= hci_sock_bind,
+	.getname	= hci_sock_getname,
+	.sendmsg	= hci_sock_sendmsg,
+	.recvmsg	= hci_sock_recvmsg,
+	.ioctl		= hci_sock_ioctl,
+	.poll		= datagram_poll,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= hci_sock_setsockopt,
+	.getsockopt	= hci_sock_getsockopt,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.mmap		= sock_no_mmap
+};
+
+/* Protocol descriptor for HCI sockets; obj_size makes each sock large
+ * enough to hold a struct hci_pinfo. Registered in hci_sock_init(). */
+static struct proto hci_sk_proto = {
+	.name		= "HCI",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct hci_pinfo)
+};
+
+/* Create a new HCI socket. Only SOCK_RAW is accepted
+ * (-ESOCKTNOSUPPORT otherwise). The new sock starts out in BT_OPEN and is
+ * linked into hci_sk_list. Returns 0 or -ENOMEM. */
+static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
+			   int kern)
+{
+	struct sock *newsk;
+
+	BT_DBG("sock %p", sock);
+
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	sock->ops = &hci_sock_ops;
+
+	newsk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto);
+	if (newsk == NULL)
+		return -ENOMEM;
+
+	sock_init_data(sock, newsk);
+	sock_reset_flag(newsk, SOCK_ZAPPED);
+
+	newsk->sk_protocol = protocol;
+	newsk->sk_state = BT_OPEN;
+	sock->state = SS_UNCONNECTED;
+
+	bt_sock_link(&hci_sk_list, newsk);
+
+	return 0;
+}
+
+/* HCI device notifier callback.
+ *
+ * Every device event is forwarded to the HCI sockets as a stack-internal
+ * HCI_EV_SI_DEVICE event. When a device is unregistered, all sockets bound
+ * to it are additionally detached: they drop back to BT_OPEN with sk_err
+ * set to EPIPE and give up their device reference. Always returns
+ * NOTIFY_DONE. */
+static int hci_sock_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct hci_dev *hdev = (struct hci_dev *) ptr;
+	struct hci_ev_si_device ev;
+
+	BT_DBG("hdev %s event %ld", hdev->name, event);
+
+	/* Send event to sockets */
+	ev.event  = event;
+	ev.dev_id = hdev->id;
+	/* The event is sent with a NULL device, so hci_send_to_sock() only
+	 * matches sockets whose hdev is NULL as well */
+	hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev);
+
+	if (event == HCI_DEV_UNREG) {
+		struct sock *sk;
+		struct hlist_node *node;
+
+		/* Detach sockets from device */
+		read_lock(&hci_sk_list.lock);
+		sk_for_each(sk, node, &hci_sk_list.head) {
+			local_bh_disable();
+			bh_lock_sock_nested(sk);
+			if (hci_pi(sk)->hdev == hdev) {
+				hci_pi(sk)->hdev = NULL;
+				sk->sk_err = EPIPE;
+				sk->sk_state = BT_OPEN;
+				sk->sk_state_change(sk);
+
+				/* Drop the reference this socket was
+				 * holding on the device */
+				hci_dev_put(hdev);
+			}
+			bh_unlock_sock(sk);
+			local_bh_enable();
+		}
+		read_unlock(&hci_sk_list.lock);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Socket family descriptor registered for BTPROTO_HCI in hci_sock_init();
+ * socket creation goes through hci_sock_create() */
+static const struct net_proto_family hci_sock_family_ops = {
+	.family	= PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create	= hci_sock_create,
+};
+
+/* Notifier block that routes HCI device events to hci_sock_dev_event();
+ * registered in hci_sock_init() and removed in hci_sock_cleanup() */
+static struct notifier_block hci_sock_nblock = {
+	.notifier_call = hci_sock_dev_event
+};
+
+/* Register the HCI socket layer: the proto, the BTPROTO_HCI socket family
+ * and the device notifier. Returns 0 on success or the negative error of
+ * the registration step that failed. */
+int __init hci_sock_init(void)
+{
+	int err = proto_register(&hci_sk_proto, 0);
+
+	if (err < 0)
+		return err;
+
+	err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops);
+	if (err < 0) {
+		/* Undo the proto registration done above */
+		BT_ERR("HCI socket registration failed");
+		proto_unregister(&hci_sk_proto);
+		return err;
+	}
+
+	hci_register_notifier(&hci_sock_nblock);
+
+	BT_INFO("HCI socket layer initialized");
+
+	return 0;
+}
+
+/* Tear down the HCI socket layer. A failure to unregister the socket
+ * family is logged, and the remaining steps still run. */
+void hci_sock_cleanup(void)
+{
+	int err = bt_sock_unregister(BTPROTO_HCI);
+
+	if (err < 0)
+		BT_ERR("HCI socket unregistration failed");
+
+	hci_unregister_notifier(&hci_sock_nblock);
+	proto_unregister(&hci_sk_proto);
+}
+
+/* Expose enable_mgmt as a boolean module parameter with mode 0644; it
+ * gates binding to HCI_CHANNEL_CONTROL in hci_sock_bind() */
+module_param(enable_mgmt, bool, 0644);
+MODULE_PARM_DESC(enable_mgmt, "Enable Management interface");
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
new file mode 100644
index 00000000..a6c3aa8b
--- /dev/null
+++ b/net/bluetooth/hci_sysfs.c
@@ -0,0 +1,607 @@
+/* Bluetooth HCI driver model support. */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+static struct class *bt_class;
+
+struct dentry *bt_debugfs;
+EXPORT_SYMBOL_GPL(bt_debugfs);
+
+/* Map an HCI link type constant to the label printed by show_link_type(). */
+static inline char *link_typetostr(int type)
+{
+	if (type == ACL_LINK)
+		return "ACL";
+	if (type == SCO_LINK)
+		return "SCO";
+	if (type == ESCO_LINK)
+		return "eSCO";
+
+	/* Any other value is reported generically. */
+	return "UNKNOWN";
+}
+
+static ssize_t show_link_type(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_conn *link = dev_get_drvdata(dev);	/* hci_conn stored as drvdata by add_conn() */
+	return sprintf(buf, "%s\n", link_typetostr(link->type));	/* sysfs "type": link type label */
+}
+
+static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_conn *link = dev_get_drvdata(dev);	/* hci_conn stored as drvdata by add_conn() */
+	return sprintf(buf, "%s\n", batostr(&link->dst));	/* sysfs "address": conn->dst via batostr() */
+}
+
+/* sysfs "features": features[0..7] of the connection as one hex string. */
+static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_conn *link = dev_get_drvdata(dev);
+
+	return sprintf(buf, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+			link->features[0], link->features[1], link->features[2],
+			link->features[3], link->features[4], link->features[5],
+			link->features[6], link->features[7]);
+}
+
+#define LINK_ATTR(_name, _mode, _show, _store) \
+struct device_attribute link_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+static LINK_ATTR(type, S_IRUGO, show_link_type, NULL);
+static LINK_ATTR(address, S_IRUGO, show_link_address, NULL);
+static LINK_ATTR(features, S_IRUGO, show_link_features, NULL);
+
+static struct attribute *bt_link_attrs[] = {
+	&link_attr_type.attr,
+	&link_attr_address.attr,
+	&link_attr_features.attr,
+	NULL
+};
+
+static struct attribute_group bt_link_group = {
+	.attrs = bt_link_attrs,
+};
+
+static const struct attribute_group *bt_link_groups[] = {
+	&bt_link_group,
+	NULL
+};
+
+/* Release hook of the "link" device type: free the object kept as drvdata. */
+static void bt_link_release(struct device *dev)
+{
+	kfree(dev_get_drvdata(dev));
+}
+
+static struct device_type bt_link = {
+	.name    = "link",
+	.groups  = bt_link_groups,
+	.release = bt_link_release,
+};
+
+/* Work item: name the connection device "<hdev>:<handle>" and add it. */
+static void add_conn(struct work_struct *work)
+{
+	struct hci_conn *conn = container_of(work, struct hci_conn, work_add);
+	struct hci_dev *hdev = conn->hdev;
+	int err;
+
+	dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);
+	dev_set_drvdata(&conn->dev, conn);
+
+	err = device_add(&conn->dev);
+	if (err < 0)
+		BT_ERR("Failed to register connection device");
+	else
+		hci_dev_hold(hdev);	/* matched by hci_dev_put() in del_conn() */
+}
+
+/*
+ * An rfcomm tty device may remain after its connection has gone down, and
+ * sysfs does not support moving a zombie device, so such children must be
+ * moved before the conn device is destroyed.
+ */
+static int __match_tty(struct device *dev, void *data)
+{
+	return strncmp(dev_name(dev), "rfcomm", 6) == 0;
+}
+
+/* Work item: remove a connection device that add_conn() registered. */
+static void del_conn(struct work_struct *work)
+{
+	struct hci_conn *conn = container_of(work, struct hci_conn, work_del);
+	struct hci_dev *hdev = conn->hdev;
+	struct device *child;
+
+	/* Nothing to do unless the device is currently registered. */
+	if (!device_is_registered(&conn->dev))
+		return;
+
+	/* Move each child matched by __match_tty() to a NULL parent first. */
+	while ((child = device_find_child(&conn->dev, NULL, __match_tty))) {
+		device_move(child, NULL, DPM_ORDER_DEV_LAST);
+		put_device(child);
+	}
+
+	device_del(&conn->dev);
+	put_device(&conn->dev);
+
+	/* Balances the hci_dev_hold() done in add_conn(). */
+	hci_dev_put(hdev);
+}
+
+/* Initialise conn->dev and the add/del work items; device_add() happens later. */
+void hci_conn_init_sysfs(struct hci_conn *conn)
+{
+	struct device *dev = &conn->dev;
+
+	BT_DBG("conn %p", conn);
+
+	dev->type = &bt_link;
+	dev->class = bt_class;
+	dev->parent = &conn->hdev->dev;
+	device_initialize(dev);
+
+	INIT_WORK(&conn->work_add, add_conn);
+	INIT_WORK(&conn->work_del, del_conn);
+}
+
+/* Queue add_conn() for this connection on its controller's workqueue. */
+void hci_conn_add_sysfs(struct hci_conn *conn)
+{
+	BT_DBG("conn %p", conn);
+	queue_work(conn->hdev->workqueue, &conn->work_add);
+}
+
+/* Queue del_conn() for this connection on its controller's workqueue. */
+void hci_conn_del_sysfs(struct hci_conn *conn)
+{
+	BT_DBG("conn %p", conn);
+	queue_work(conn->hdev->workqueue, &conn->work_del);
+}
+
+/* Translate an HCI bus constant into the string printed by show_bus(). */
+static inline char *host_bustostr(int bus)
+{
+	if (bus == HCI_VIRTUAL)
+		return "VIRTUAL";
+	if (bus == HCI_USB)
+		return "USB";
+	if (bus == HCI_PCCARD)
+		return "PCCARD";
+	if (bus == HCI_UART)
+		return "UART";
+	if (bus == HCI_RS232)
+		return "RS232";
+	if (bus == HCI_PCI)
+		return "PCI";
+	if (bus == HCI_SDIO)
+		return "SDIO";
+
+	/* Bus values without an entry above are not named. */
+	return "UNKNOWN";
+}
+
+/* Translate a controller type constant into the string printed by show_type(). */
+static inline char *host_typetostr(int type)
+{
+	if (type == HCI_BREDR)
+		return "BR/EDR";
+	if (type == HCI_AMP)
+		return "AMP";
+
+	/* Any other value is reported generically. */
+	return "UNKNOWN";
+}
+
+static ssize_t show_bus(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%s\n", host_bustostr(host->bus));	/* sysfs "bus": bus label */
+}
+
+static ssize_t show_type(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%s\n", host_typetostr(host->dev_type));	/* sysfs "type": controller type label */
+}
+
+/* sysfs "name": dev_name copied into a local buffer with a NUL appended. */
+static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);
+	char text[HCI_MAX_NAME_LENGTH + 1];
+	int n;
+
+	for (n = 0; n < HCI_MAX_NAME_LENGTH; n++)
+		text[n] = host->dev_name[n];
+	text[HCI_MAX_NAME_LENGTH] = '\0';
+	return sprintf(buf, "%s\n", text);
+}
+
+/* sysfs "class": the three dev_class bytes in hex, dev_class[2] first. */
+static ssize_t show_class(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);
+	return sprintf(buf, "0x%.2x%.2x%.2x\n", host->dev_class[2], host->dev_class[1], host->dev_class[0]);
+}
+
+static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%s\n", batostr(&host->bdaddr));	/* sysfs "address": hdev->bdaddr via batostr() */
+}
+
+/* sysfs "features": features[0..7] of the controller as one hex string. */
+static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);
+
+	return sprintf(buf, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+			host->features[0], host->features[1], host->features[2],
+			host->features[3], host->features[4], host->features[5],
+			host->features[6], host->features[7]);
+}
+
+static ssize_t show_manufacturer(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%d\n", host->manufacturer);	/* sysfs "manufacturer": decimal value */
+}
+
+static ssize_t show_hci_version(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%d\n", host->hci_ver);	/* sysfs "hci_version": decimal hci_ver */
+}
+
+static ssize_t show_hci_revision(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%d\n", host->hci_rev);	/* sysfs "hci_revision": decimal hci_rev */
+}
+
+static ssize_t show_idle_timeout(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%d\n", host->idle_timeout);	/* value last accepted by store_idle_timeout() or set elsewhere */
+}
+
+/* sysfs store for "idle_timeout": accepts 0 or a value in 500..3600000. */
+static ssize_t store_idle_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct hci_dev *hdev = dev_get_drvdata(dev);
+	unsigned int timeout;
+	int err = kstrtouint(buf, 0, &timeout);
+
+	if (err < 0)
+		return err;
+
+	/* Zero always passes; any other value must lie inside the range. */
+	if (timeout && (timeout < 500 || timeout > 3600000))
+		return -EINVAL;
+
+	hdev->idle_timeout = timeout;
+	return count;
+}
+
+static ssize_t show_sniff_max_interval(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%d\n", host->sniff_max_interval);	/* sysfs "sniff_max_interval" in decimal */
+}
+
+/* sysfs store for "sniff_max_interval": non-zero, even, not below the minimum. */
+static ssize_t store_sniff_max_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct hci_dev *hdev = dev_get_drvdata(dev);
+	u16 interval;
+	int err = kstrtou16(buf, 0, &interval);
+
+	if (err < 0)
+		return err;
+
+	/* Reject 0, odd values and anything under sniff_min_interval. */
+	if (!interval || (interval & 1) || interval < hdev->sniff_min_interval)
+		return -EINVAL;
+
+	hdev->sniff_max_interval = interval;
+	return count;
+}
+
+static ssize_t show_sniff_min_interval(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hci_dev *host = dev_get_drvdata(dev);	/* hci_dev stored by hci_register_sysfs() */
+	return sprintf(buf, "%d\n", host->sniff_min_interval);	/* sysfs "sniff_min_interval" in decimal */
+}
+
+/* sysfs store for "sniff_min_interval": non-zero, even, not above the maximum. */
+static ssize_t store_sniff_min_interval(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct hci_dev *hdev = dev_get_drvdata(dev);
+	u16 interval;
+	int err = kstrtou16(buf, 0, &interval);
+
+	if (err < 0)
+		return err;
+
+	/* Reject 0, odd values and anything over sniff_max_interval. */
+	if (!interval || (interval & 1) || interval > hdev->sniff_max_interval)
+		return -EINVAL;
+
+	hdev->sniff_min_interval = interval;
+	return count;
+}
+
+static DEVICE_ATTR(bus, S_IRUGO, show_bus, NULL);
+static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
+static DEVICE_ATTR(name, S_IRUGO, show_name, NULL);
+static DEVICE_ATTR(class, S_IRUGO, show_class, NULL);
+static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
+static DEVICE_ATTR(features, S_IRUGO, show_features, NULL);
+static DEVICE_ATTR(manufacturer, S_IRUGO, show_manufacturer, NULL);
+static DEVICE_ATTR(hci_version, S_IRUGO, show_hci_version, NULL);
+static DEVICE_ATTR(hci_revision, S_IRUGO, show_hci_revision, NULL);
+
+static DEVICE_ATTR(idle_timeout, S_IRUGO | S_IWUSR,
+				show_idle_timeout, store_idle_timeout);
+static DEVICE_ATTR(sniff_max_interval, S_IRUGO | S_IWUSR,
+				show_sniff_max_interval, store_sniff_max_interval);
+static DEVICE_ATTR(sniff_min_interval, S_IRUGO | S_IWUSR,
+				show_sniff_min_interval, store_sniff_min_interval);
+
+static struct attribute *bt_host_attrs[] = {
+	&dev_attr_bus.attr,
+	&dev_attr_type.attr,
+	&dev_attr_name.attr,
+	&dev_attr_class.attr,
+	&dev_attr_address.attr,
+	&dev_attr_features.attr,
+	&dev_attr_manufacturer.attr,
+	&dev_attr_hci_version.attr,
+	&dev_attr_hci_revision.attr,
+	&dev_attr_idle_timeout.attr,
+	&dev_attr_sniff_max_interval.attr,
+	&dev_attr_sniff_min_interval.attr,
+	NULL
+};
+
+static struct attribute_group bt_host_group = {
+	.attrs = bt_host_attrs,
+};
+
+static const struct attribute_group *bt_host_groups[] = {
+	&bt_host_group,
+	NULL
+};
+
+/* Release hook of the "host" device type: free the object kept as drvdata. */
+static void bt_host_release(struct device *dev)
+{
+	kfree(dev_get_drvdata(dev));
+}
+
+static struct device_type bt_host = {
+	.name    = "host",
+	.groups  = bt_host_groups,
+	.release = bt_host_release,
+};
+
+/* debugfs "inquiry_cache": one line per entry of hdev->inq_cache. */
+static int inquiry_cache_show(struct seq_file *f, void *p)
+{
+	struct hci_dev *hdev = f->private;
+	struct inquiry_entry *e;
+
+	hci_dev_lock_bh(hdev);
+
+	/* Walk the cache list through e->next while the device lock is held. */
+	for (e = hdev->inq_cache.list; e; e = e->next) {
+		struct inquiry_data *data = &e->data;
+		seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n",
+			   batostr(&data->bdaddr),
+			   data->pscan_rep_mode, data->pscan_period_mode,
+			   data->pscan_mode, data->dev_class[2],
+			   data->dev_class[1], data->dev_class[0],
+			   __le16_to_cpu(data->clock_offset),
+			   data->rssi, data->ssp_mode, e->timestamp);
+	}
+
+	hci_dev_unlock_bh(hdev);
+	return 0;
+}
+
+static int inquiry_cache_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, inquiry_cache_show, inode->i_private);	/* i_private is handed to inquiry_cache_show() as f->private */
+}
+
+static const struct file_operations inquiry_cache_fops = {
+	.open		= inquiry_cache_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*
+ * debugfs "blacklist": print every address on hdev->blacklist, one per
+ * line, while holding the device lock.
+ */
+static int blacklist_show(struct seq_file *f, void *p)
+{
+	struct hci_dev *hdev = f->private;
+	struct bdaddr_list *entry;
+
+	hci_dev_lock_bh(hdev);
+
+	/* list_for_each_entry() folds the list_entry() lookup into the walk. */
+	list_for_each_entry(entry, &hdev->blacklist, list)
+		seq_printf(f, "%s\n", batostr(&entry->bdaddr));
+
+	hci_dev_unlock_bh(hdev);
+
+	return 0;
+}
+
+static int blacklist_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, blacklist_show, inode->i_private);	/* i_private is handed to blacklist_show() as f->private */
+}
+
+static const struct file_operations blacklist_fops = {
+	.open		= blacklist_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Print 16 UUID bytes as 8-4-4-4-12 hex digits, each field read via ntohl/ntohs. */
+static void print_bt_uuid(struct seq_file *f, u8 *uuid)
+{
+	u32 w0, w4;
+	u16 h1, h2, h3, h5;
+
+	memcpy(&w0, &uuid[0], 4);
+	memcpy(&h1, &uuid[4], 2);
+	memcpy(&h2, &uuid[6], 2);
+	memcpy(&h3, &uuid[8], 2);
+	memcpy(&w4, &uuid[10], 4);
+	memcpy(&h5, &uuid[14], 2);
+
+	seq_printf(f, "%.8x-%.4x-%.4x-%.4x-%.8x%.4x\n", ntohl(w0), ntohs(h1),
+				ntohs(h2), ntohs(h3), ntohl(w4), ntohs(h5));
+}
+
+/*
+ * debugfs "uuids": print each entry of hdev->uuids through
+ * print_bt_uuid() while holding the device lock.
+ */
+static int uuids_show(struct seq_file *f, void *p)
+{
+	struct hci_dev *hdev = f->private;
+	struct bt_uuid *entry;
+
+	hci_dev_lock_bh(hdev);
+
+	/* list_for_each_entry() folds the list_entry() lookup into the walk. */
+	list_for_each_entry(entry, &hdev->uuids, list)
+		print_bt_uuid(f, entry->uuid);
+
+	hci_dev_unlock_bh(hdev);
+
+	return 0;
+}
+
+static int uuids_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uuids_show, inode->i_private);	/* i_private is handed to uuids_show() as f->private */
+}
+
+static const struct file_operations uuids_fops = {
+	.open		= uuids_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* debugfs setter: store val in hdev->auto_accept_delay while holding the
+ * device lock.  Always returns 0. */
+static int auto_accept_delay_set(void *data, u64 val)
+{
+	struct hci_dev *hdev = data;
+
+	hci_dev_lock_bh(hdev);
+	hdev->auto_accept_delay = val;
+	hci_dev_unlock_bh(hdev);
+
+	return 0;
+}
+
+/* debugfs getter: read hdev->auto_accept_delay into *val while holding the
+ * device lock.  Always returns 0. */
+static int auto_accept_delay_get(void *data, u64 *val)
+{
+	struct hci_dev *hdev = data;
+
+	hci_dev_lock_bh(hdev);
+	*val = hdev->auto_accept_delay;
+	hci_dev_unlock_bh(hdev);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get,
+					auto_accept_delay_set, "%llu\n");
+
+/* Register hdev->dev as a "host" device and create its debugfs files.
+ * Only a device_register() failure is reported to the caller. */
+int hci_register_sysfs(struct hci_dev *hdev)
+{
+	struct device *dev = &hdev->dev;
+	int err;
+
+	BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+
+	dev->type = &bt_host;
+	dev->class = bt_class;
+	dev->parent = hdev->parent;
+	dev_set_name(dev, "%s", hdev->name);
+	dev_set_drvdata(dev, hdev);
+
+	err = device_register(dev);
+	if (err < 0)
+		return err;
+
+	/* The debugfs entries are optional: their absence is not an error. */
+	if (!bt_debugfs)
+		return 0;
+
+	hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs);
+	if (!hdev->debugfs)
+		return 0;
+
+	/* Every file is created with mode 0444 and gets hdev as private data. */
+	debugfs_create_file("inquiry_cache", 0444, hdev->debugfs,
+						hdev, &inquiry_cache_fops);
+	debugfs_create_file("blacklist", 0444, hdev->debugfs,
+						hdev, &blacklist_fops);
+	debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops);
+	debugfs_create_file("auto_accept_delay", 0444, hdev->debugfs, hdev,
+						&auto_accept_delay_fops);
+
+	return 0;
+}
+
+/* Remove the controller's debugfs directory, then delete its device. */
+void hci_unregister_sysfs(struct hci_dev *hdev)
+{
+	BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+
+	debugfs_remove_recursive(hdev->debugfs);
+	device_del(&hdev->dev);
+}
+
+int __init bt_sysfs_init(void)
+{
+	/* Create the class first so that a failure leaves no debugfs dir behind. */
+	bt_class = class_create(THIS_MODULE, "bluetooth");
+	if (IS_ERR(bt_class))
+		return PTR_ERR(bt_class);
+
+	bt_debugfs = debugfs_create_dir("bluetooth", NULL);
+	return 0;
+}
+
+/* Undo bt_sysfs_init(): destroy the class, then remove the debugfs tree. */
+void bt_sysfs_cleanup(void)
+{
+	class_destroy(bt_class);
+	debugfs_remove_recursive(bt_debugfs);
+}
diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig
new file mode 100644
index 00000000..86a91543
--- /dev/null
+++ b/net/bluetooth/hidp/Kconfig
@@ -0,0 +1,12 @@
+config BT_HIDP
+	tristate "HIDP protocol support"
+	depends on BT && BT_L2CAP && INPUT && HID_SUPPORT
+	select HID
+	help
+	  HIDP (Human Interface Device Protocol) is a transport layer
+	  for HID reports.  HIDP is required for the Bluetooth Human
+	  Interface Device Profile.
+
+	  Say Y here to compile HIDP support into the kernel or say M to
+	  compile it as module (hidp).
+
diff --git a/net/bluetooth/hidp/Makefile b/net/bluetooth/hidp/Makefile
new file mode 100644
index 00000000..a9ee1156
--- /dev/null
+++ b/net/bluetooth/hidp/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Bluetooth HIDP layer
+#
+
+obj-$(CONFIG_BT_HIDP) += hidp.o
+
+hidp-objs := core.o sock.o
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
new file mode 100644
index 00000000..fb68f344
--- /dev/null
+++ b/net/bluetooth/hidp/core.c
@@ -0,0 +1,1220 @@
+/*
+   HIDP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/freezer.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <linux/input.h>
+#include <linux/hid.h>
+#include <linux/hidraw.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+
+#include "hidp.h"
+
+#define VERSION "1.2"
+
+static DECLARE_RWSEM(hidp_session_sem);
+static LIST_HEAD(hidp_session_list);
+
+static unsigned char hidp_keycode[256] = {
+	  0,   0,   0,   0,  30,  48,  46,  32,  18,  33,  34,  35,  23,  36,
+	 37,  38,  50,  49,  24,  25,  16,  19,  31,  20,  22,  47,  17,  45,
+	 21,  44,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  28,   1,
+	 14,  15,  57,  12,  13,  26,  27,  43,  43,  39,  40,  41,  51,  52,
+	 53,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  87,  88,
+	 99,  70, 119, 110, 102, 104, 111, 107, 109, 106, 105, 108, 103,  69,
+	 98,  55,  74,  78,  96,  79,  80,  81,  75,  76,  77,  71,  72,  73,
+	 82,  83,  86, 127, 116, 117, 183, 184, 185, 186, 187, 188, 189, 190,
+	191, 192, 193, 194, 134, 138, 130, 132, 128, 129, 131, 137, 133, 135,
+	136, 113, 115, 114,   0,   0,   0, 121,   0,  89,  93, 124,  92,  94,
+	 95,   0,   0,   0, 122, 123,  90,  91,  85,   0,   0,   0,   0,   0,
+	  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+	 29,  42,  56, 125,  97,  54, 100, 126, 164, 166, 165, 163, 161, 115,
+	114, 113, 150, 158, 159, 128, 136, 177, 178, 176, 142, 152, 173, 140
+};
+
+static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 };
+
+/* Find the session whose bdaddr matches on hidp_session_list; NULL if none. */
+static struct hidp_session *__hidp_get_session(bdaddr_t *bdaddr)
+{
+	struct hidp_session *session;
+
+	BT_DBG("");
+
+	list_for_each_entry(session, &hidp_session_list, list) {
+		if (bacmp(bdaddr, &session->bdaddr) == 0)
+			return session;
+	}
+
+	return NULL;
+}
+
+/* Put the session on hidp_session_list; takes a module ref and holds the conn device. */
+static void __hidp_link_session(struct hidp_session *session)
+{
+	__module_get(THIS_MODULE);
+	list_add(&session->list, &hidp_session_list);
+	hci_conn_hold_device(session->conn);
+}
+
+/* Reverse of __hidp_link_session(): put the conn device, unlist, drop the module ref. */
+static void __hidp_unlink_session(struct hidp_session *session)
+{
+	hci_conn_put_device(session->conn);
+	list_del(&session->list);
+	module_put(THIS_MODULE);
+}
+
+/* Fill *ci with a snapshot of the session; when both devices exist the
+ * HID device's ids and name overwrite the input device's. */
+static void __hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci)
+{
+	/* Zeroing also clears vendor/product/version and the whole name. */
+	memset(ci, 0, sizeof(*ci));
+	bacpy(&ci->bdaddr, &session->bdaddr);
+
+	ci->flags = session->flags;
+	ci->state = session->state;
+
+	/* Names are copied with a 127-byte limit so the NUL left by the memset
+	 * is never overwritten (ci->name is presumably char[128], matching the
+	 * 128-byte limit used here before — confirm against hidp.h). */
+	if (session->input) {
+		ci->vendor  = session->input->id.vendor;
+		ci->product = session->input->id.product;
+		ci->version = session->input->id.version;
+		strncpy(ci->name, session->input->name ?
+				session->input->name : "HID Boot Device", 127);
+	}
+
+	if (session->hid) {
+		ci->vendor  = session->hid->vendor;
+		ci->product = session->hid->product;
+		ci->version = session->hid->version;
+		strncpy(ci->name, session->hid->name, 127);
+	}
+}
+
+/* Handle an EV_LED event: pack the LED state into one byte and queue it on
+ * intr_transmit as an output report.  Other event types return -1. */
+static int hidp_queue_event(struct hidp_session *session, struct input_dev *dev,
+				unsigned int type, unsigned int code, int value)
+{
+	unsigned char newleds;
+	struct sk_buff *skb;
+
+	BT_DBG("session %p type %d code %d value %d", session, type, code, value);
+
+	if (type != EV_LED)
+		return -1;
+
+	/* HID boot keyboard LED bits: 0 Num, 1 Caps, 2 Scroll, 3 Compose, 4 Kana. */
+	newleds = (!!test_bit(LED_KANA,    dev->led) << 4) |
+		  (!!test_bit(LED_COMPOSE, dev->led) << 3) |
+		  (!!test_bit(LED_SCROLLL, dev->led) << 2) |
+		  (!!test_bit(LED_CAPSL,   dev->led) << 1) |
+		  (!!test_bit(LED_NUML,    dev->led));
+
+	if (session->leds == newleds)
+		return 0;
+
+	session->leds = newleds;
+	skb = alloc_skb(3, GFP_ATOMIC);
+	if (!skb) {
+		BT_ERR("Can't allocate memory for new frame");
+		return -ENOMEM;
+	}
+
+	*skb_put(skb, 1) = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT;
+	*skb_put(skb, 1) = 0x01;
+	*skb_put(skb, 1) = newleds;
+	skb_queue_tail(&session->intr_transmit, skb);
+	hidp_schedule(session);
+
+	return 0;
+}
+
+/* Event hook where the input drvdata is a hid_device; the session is its driver_data. */
+static int hidp_hidinput_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
+{
+	struct hid_device *hid = input_get_drvdata(dev);
+
+	return hidp_queue_event(hid->driver_data, dev, type, code, value);
+}
+
+/* Event hook where the input drvdata is the session itself; forwards to
+ * hidp_queue_event() unchanged. */
+static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
+{
+	return hidp_queue_event(input_get_drvdata(dev), dev, type, code, value);
+}
+
+/* Turn a keyboard (id 0x01) or mouse (id 0x02) report into input events.
+ * Frames too short for the fixed layout are dropped rather than over-read. */
+static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb)
+{
+	struct input_dev *dev = session->input;
+	unsigned char *keys = session->keys;
+	unsigned char *udata = skb->data + 1;
+	signed char *sdata = skb->data + 1;
+	int i, size = skb->len - 1;
+
+	/* The keyboard path reads 8 payload bytes, the mouse path at least 3. */
+	if (size < 3 || (skb->data[0] == 0x01 && size < 8))
+		return;
+
+	switch (skb->data[0]) {
+	case 0x01:	/* Keyboard report */
+		for (i = 0; i < 8; i++)
+			input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1);
+		/* If all the key codes have been set to 0x01, it means
+		 * too many keys were pressed at the same time. */
+		if (!memcmp(udata + 2, hidp_mkeyspat, 6))
+			break;
+		for (i = 2; i < 8; i++) {
+			if (keys[i] > 3 && memscan(udata + 2, keys[i], 6) == udata + 8) {
+				if (hidp_keycode[keys[i]])
+					input_report_key(dev, hidp_keycode[keys[i]], 0);
+				else
+					BT_ERR("Unknown key (scancode %#x) released.", keys[i]);
+			}
+			if (udata[i] > 3 && memscan(keys + 2, udata[i], 6) == keys + 8) {
+				if (hidp_keycode[udata[i]])
+					input_report_key(dev, hidp_keycode[udata[i]], 1);
+				else
+					BT_ERR("Unknown key (scancode %#x) pressed.", udata[i]);
+			}
+		}
+		memcpy(keys, udata, 8);
+		break;
+
+	case 0x02:	/* Mouse report */
+		input_report_key(dev, BTN_LEFT,   sdata[0] & 0x01);
+		input_report_key(dev, BTN_RIGHT,  sdata[0] & 0x02);
+		input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04);
+		input_report_key(dev, BTN_SIDE,   sdata[0] & 0x08);
+		input_report_key(dev, BTN_EXTRA,  sdata[0] & 0x10);
+		input_report_rel(dev, REL_X, sdata[1]);
+		input_report_rel(dev, REL_Y, sdata[2]);
+		if (size > 3)
+			input_report_rel(dev, REL_WHEEL, sdata[3]);
+		break;
+	}
+
+	input_sync(dev);
+}
+
+/* Build a frame of hdr plus optional payload and append it to ctrl_transmit.
+ * Returns 0, or -ENOMEM when no skb could be allocated. */
+static int __hidp_send_ctrl_message(struct hidp_session *session,
+			unsigned char hdr, unsigned char *data, int size)
+{
+	struct sk_buff *frame;
+
+	BT_DBG("session %p data %p size %d", session, data, size);
+
+	frame = alloc_skb(size + 1, GFP_ATOMIC);
+	if (!frame) {
+		BT_ERR("Can't allocate memory for new frame");
+		return -ENOMEM;
+	}
+
+	*skb_put(frame, 1) = hdr;
+	if (size > 0 && data)
+		memcpy(skb_put(frame, size), data, size);
+	skb_queue_tail(&session->ctrl_transmit, frame);
+	return 0;
+}
+
+/* Queue a control message and call hidp_schedule() regardless of the
+ * result; returns what __hidp_send_ctrl_message() returned. */
+static inline int hidp_send_ctrl_message(struct hidp_session *session,
+			unsigned char hdr, unsigned char *data, int size)
+{
+	int err = __hidp_send_ctrl_message(session, hdr, data, size);
+
+	hidp_schedule(session);
+
+	return err;
+}
+
+/* Queue data, prefixed with header byte 0xa2, on intr_transmit. */
+static int hidp_queue_report(struct hidp_session *session,
+				unsigned char *data, int size)
+{
+	struct sk_buff *frame;
+
+	BT_DBG("session %p hid %p data %p size %d", session, session->hid, data, size);
+
+	frame = alloc_skb(size + 1, GFP_ATOMIC);
+	if (!frame) {
+		BT_ERR("Can't allocate memory for new frame");
+		return -ENOMEM;
+	}
+
+	/* 0xa2: presumably HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT — confirm */
+	*skb_put(frame, 1) = 0xa2;
+	if (size > 0)
+		memcpy(skb_put(frame, size), data, size);
+
+	skb_queue_tail(&session->intr_transmit, frame);
+	hidp_schedule(session);
+	return 0;
+}
+
+/* Render a HID report into a 32-byte stack buffer and queue it; -EIO if it does not fit. */
+static int hidp_send_report(struct hidp_session *session, struct hid_report *report)
+{
+	unsigned char buf[32];
+	int rsize;
+
+	/* report->size (presumably bits) rounded up to bytes, plus one if id > 0. */
+	rsize = ((report->size - 1) >> 3) + 1 + (report->id > 0);
+	if (rsize > sizeof(buf))
+		return -EIO;
+	hid_output_report(report, buf);
+	return hidp_queue_report(session, buf, rsize);
+}
+
+static int hidp_get_raw_report(struct hid_device *hid,
+		unsigned char report_number,
+		unsigned char *data, size_t count,
+		unsigned char report_type)
+{
+	struct hidp_session *session = hid->driver_data;
+	struct sk_buff *skb;
+	size_t len;	/* NOTE(review): also carries -EIO below and is returned as int */
+	int numbered_reports = hid->report_enum[report_type].numbered;	/* NOTE(review): indexed before report_type is validated */
+	/* Map the HID report type to a GET_REPORT header byte; unknown types fail with -EINVAL. */
+	switch (report_type) {
+	case HID_FEATURE_REPORT:
+		report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+		break;
+	case HID_INPUT_REPORT:
+		report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_INPUT;
+		break;
+	case HID_OUTPUT_REPORT:
+		report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* report_mutex is held from here until the function returns. */
+	if (mutex_lock_interruptible(&session->report_mutex))
+		return -ERESTARTSYS;
+
+	/* Record what we are waiting for, then send the request (one byte: the report number). */
+	session->waiting_report_type = report_type & HIDP_DATA_RTYPE_MASK;
+	session->waiting_report_number = numbered_reports ? report_number : -1;	/* -1: accept any report number */
+	set_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	data[0] = report_number;	/* NOTE(review): written without checking count > 0 */
+	if (hidp_send_ctrl_message(hid->driver_data, report_type, data, 1))
+		goto err_eio;
+
+	/* Sleep until HIDP_WAITING_FOR_RETURN is cleared; hidp_process_data()
+	   stores the matching report in session->report_return.  */
+	while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) {
+		int res;
+
+		res = wait_event_interruptible_timeout(session->report_queue,
+			!test_bit(HIDP_WAITING_FOR_RETURN, &session->flags),
+			5*HZ);
+		if (res == 0) {
+			/* timeout */
+			goto err_eio;
+		}
+		if (res < 0) {
+			/* signal */
+			goto err_restartsys;
+		}
+	}
+
+	skb = session->report_return;
+	if (skb) {
+		len = skb->len < count ? skb->len : count;	/* copy at most count bytes */
+		memcpy(data, skb->data, len);
+
+		kfree_skb(skb);
+		session->report_return = NULL;
+	} else {
+		/* Device returned a HANDSHAKE, indicating a protocol error. */
+		len = -EIO;
+	}
+
+	clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	mutex_unlock(&session->report_mutex);
+
+	return len;	/* bytes copied, or -EIO from the branch above */
+
+err_restartsys:
+	clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	mutex_unlock(&session->report_mutex);
+	return -ERESTARTSYS;
+err_eio:
+	clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	mutex_unlock(&session->report_mutex);
+	return -EIO;
+}
+
+static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count,
+		unsigned char report_type)
+{
+	struct hidp_session *session = hid->driver_data;
+	int ret;
+	/* Map the HID report type to a SET_REPORT header byte; other types fail with -EINVAL. */
+	switch (report_type) {
+	case HID_FEATURE_REPORT:
+		report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+		break;
+	case HID_OUTPUT_REPORT:
+		report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* report_mutex is held from here until the err label releases it. */
+	if (mutex_lock_interruptible(&session->report_mutex))
+		return -ERESTARTSYS;
+
+	/* Mark that an ACK is expected, then send the report to the device. */
+	set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+	if (hidp_send_ctrl_message(hid->driver_data, report_type,
+			data, count)) {
+		ret = -ENOMEM;	/* any send failure is reported as -ENOMEM */
+		goto err;
+	}
+
+	/* Sleep until hidp_process_handshake() clears HIDP_WAITING_FOR_SEND_ACK. */
+	while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) {
+		int res;
+
+		res = wait_event_interruptible_timeout(session->report_queue,
+			!test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags),
+			10*HZ);
+		if (res == 0) {
+			/* timeout */
+			ret = -EIO;
+			goto err;
+		}
+		if (res < 0) {
+			/* signal */
+			ret = -ERESTARTSYS;
+			goto err;
+		}
+	}
+
+	if (!session->output_report_success) {	/* set to 1 only for HIDP_HSHK_SUCCESSFUL */
+		ret = -EIO;
+		goto err;
+	}
+
+	ret = count;	/* success: report the full length as written */
+
+err:	/* shared exit for success and failure */
+	clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+	mutex_unlock(&session->report_mutex);
+	return ret;
+}
+
+/* Idle timer callback (presumably for session->timer): request termination and wake the task. */
+static void hidp_idle_timeout(unsigned long arg)
+{
+	struct hidp_session *session = (struct hidp_session *) arg;
+	atomic_inc(&session->terminate);
+	wake_up_process(session->task);
+}
+
+static void hidp_set_timer(struct hidp_session *session)
+{
+	if (session->idle_to > 0)	/* idle_to <= 0: the timer is left untouched */
+		mod_timer(&session->timer, jiffies + HZ * session->idle_to);	/* (re)arm to fire idle_to seconds from now */
+}
+
+static inline void hidp_del_timer(struct hidp_session *session)
+{
+	if (session->idle_to > 0)	/* same condition under which hidp_set_timer() arms it */
+		del_timer(&session->timer);
+}
+
+static void hidp_process_handshake(struct hidp_session *session,
+					unsigned char param)
+{
+	BT_DBG("session %p param 0x%02x", session, param);
+	session->output_report_success = 0; /* default condition */
+	/* Dispatch on the handshake result code carried in param. */
+	switch (param) {
+	case HIDP_HSHK_SUCCESSFUL:
+		/* FIXME: Call into SET_ GET_ handlers here */
+		session->output_report_success = 1;
+		break;
+
+	case HIDP_HSHK_NOT_READY:
+	case HIDP_HSHK_ERR_INVALID_REPORT_ID:
+	case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST:
+	case HIDP_HSHK_ERR_INVALID_PARAMETER:
+		if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) {	/* a hidp_get_raw_report() call is pending */
+			clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+			wake_up_interruptible(&session->report_queue);
+		}
+		/* FIXME: Call into SET_ GET_ handlers here */
+		break;
+
+	case HIDP_HSHK_ERR_UNKNOWN:
+		break;
+
+	case HIDP_HSHK_ERR_FATAL:
+		/* Device requests a reboot, as this is the only way this error
+		 * can be recovered. */
+		__hidp_send_ctrl_message(session,
+			HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0);
+		break;
+
+	default:	/* unrecognised code: answer with ERR_INVALID_PARAMETER */
+		__hidp_send_ctrl_message(session,
+			HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
+		break;
+	}
+
+	/* Wake up a hidp_output_raw_report() caller waiting for its ACK, whatever the result code. */
+	if (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) {
+		clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+		wake_up_interruptible(&session->report_queue);
+	}
+}
+
+/* Handle a HID_CONTROL message; only VIRTUAL_CABLE_UNPLUG is acted upon. */
+static void hidp_process_hid_control(struct hidp_session *session,
+					unsigned char param)
+{
+	BT_DBG("session %p param 0x%02x", session, param);
+
+	if (param != HIDP_CTRL_VIRTUAL_CABLE_UNPLUG)
+		return;
+	/* Flush the transmit queues */
+	skb_queue_purge(&session->ctrl_transmit);
+	skb_queue_purge(&session->intr_transmit);
+	atomic_inc(&session->terminate);
+	wake_up_process(current);
+}
+
+/* Returns true if the passed-in skb should be freed by the caller; false once it has been stored in session->report_return. */
+static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
+				unsigned char param)
+{
+	int done_with_skb = 1;
+	BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
+	/* param is the report type taken from the frame header by the caller. */
+	switch (param) {
+	case HIDP_DATA_RTYPE_INPUT:
+		hidp_set_timer(session);
+
+		if (session->input)
+			hidp_input_report(session, skb);
+
+		if (session->hid)
+			hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 0);
+		break;
+
+	case HIDP_DATA_RTYPE_OTHER:
+	case HIDP_DATA_RTYPE_OUPUT:
+	case HIDP_DATA_RTYPE_FEATURE:
+		break;
+
+	default:	/* unknown report type: answer with ERR_INVALID_PARAMETER */
+		__hidp_send_ctrl_message(session,
+			HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
+	}
+	/* Hand the skb to a pending hidp_get_raw_report() when type and number match. */
+	if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) &&
+				param == session->waiting_report_type) {
+		if (session->waiting_report_number < 0 ||
+		    session->waiting_report_number == skb->data[0]) {
+			/* hidp_get_raw_report() is waiting on this report. */
+			session->report_return = skb;
+			done_with_skb = 0;
+			clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+			wake_up_interruptible(&session->report_queue);
+		}
+	}
+
+	return done_with_skb;
+}
+
+/*
+ * Dispatch one frame received on the control channel.
+ *
+ * The first byte is the HIDP header: HIDP_HEADER_TRANS_MASK selects the
+ * transaction type and HIDP_HEADER_PARAM_MASK its parameter. The header
+ * byte is pulled off before the payload is handed on. The skb is freed
+ * here unless hidp_process_data() reports that it kept it.
+ */
+static void hidp_recv_ctrl_frame(struct hidp_session *session,
+					struct sk_buff *skb)
+{
+	unsigned char header, trans, arg;
+	int release = 1;
+
+	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+	header = skb->data[0];
+	skb_pull(skb, 1);
+
+	trans = header & HIDP_HEADER_TRANS_MASK;
+	arg = header & HIDP_HEADER_PARAM_MASK;
+
+	if (trans == HIDP_TRANS_HANDSHAKE)
+		hidp_process_handshake(session, arg);
+	else if (trans == HIDP_TRANS_HID_CONTROL)
+		hidp_process_hid_control(session, arg);
+	else if (trans == HIDP_TRANS_DATA)
+		release = hidp_process_data(session, skb, arg);
+	else
+		__hidp_send_ctrl_message(session,
+			HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0);
+
+	if (release)
+		kfree_skb(skb);
+}
+
+/*
+ * Handle one frame received on the interrupt channel.
+ *
+ * Only DATA/INPUT frames are accepted; they are passed to the input
+ * and/or HID device of the session. Anything else is logged and
+ * dropped. The skb is always freed here.
+ */
+static void hidp_recv_intr_frame(struct hidp_session *session,
+				struct sk_buff *skb)
+{
+	unsigned char header;
+
+	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+
+	header = skb->data[0];
+	skb_pull(skb, 1);
+
+	if (header != (HIDP_TRANS_DATA | HIDP_DATA_RTYPE_INPUT)) {
+		BT_DBG("Unsupported protocol header 0x%02x", header);
+		kfree_skb(skb);
+		return;
+	}
+
+	hidp_set_timer(session);
+
+	if (session->input)
+		hidp_input_report(session, skb);
+
+	if (session->hid) {
+		hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 1);
+		BT_DBG("report len %d", skb->len);
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ * Send @len bytes starting at @data over @sock as a single message.
+ *
+ * Returns 0 without touching the socket when @len is zero, otherwise
+ * whatever kernel_sendmsg() returns.
+ */
+static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
+{
+	struct msghdr msg;
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len  = len,
+	};
+
+	BT_DBG("sock %p data %p len %d", sock, data, len);
+
+	if (len == 0)
+		return 0;
+
+	memset(&msg, 0, sizeof(msg));
+
+	return kernel_sendmsg(sock, &msg, &vec, 1, len);
+}
+
+/*
+ * Send every skb queued on @queue through @sock. A frame that fails to
+ * send is put back at the head of the queue and draining stops; each
+ * frame that was sent re-arms the session timer and is freed.
+ */
+static void hidp_drain_queue(struct hidp_session *session,
+				struct sk_buff_head *queue, struct socket *sock)
+{
+	struct sk_buff *skb;
+
+	for (;;) {
+		skb = skb_dequeue(queue);
+		if (!skb)
+			return;
+
+		if (hidp_send_frame(sock, skb->data, skb->len) < 0) {
+			skb_queue_head(queue, skb);
+			return;
+		}
+
+		hidp_set_timer(session);
+		kfree_skb(skb);
+	}
+}
+
+/*
+ * Push out pending frames: first the control transmit queue, then the
+ * interrupt transmit queue.
+ */
+static void hidp_process_transmit(struct hidp_session *session)
+{
+	BT_DBG("session %p", session);
+
+	hidp_drain_queue(session, &session->ctrl_transmit, session->ctrl_sock);
+	hidp_drain_queue(session, &session->intr_transmit, session->intr_sock);
+}
+
+/*
+ * Session kernel thread (started by kthread_run() in
+ * hidp_add_connection()).
+ *
+ * Moves frames between the two sockets of the session and the input/HID
+ * devices until session->terminate becomes non-zero or either socket
+ * leaves BT_CONNECTED. It then unregisters the devices, drops the socket
+ * file references, unlinks the session and frees it.
+ *
+ * @arg: the struct hidp_session to run.
+ *
+ * Always returns 0.
+ */
+static int hidp_session(void *arg)
+{
+	struct hidp_session *session = arg;
+	struct sock *ctrl_sk = session->ctrl_sock->sk;
+	struct sock *intr_sk = session->intr_sock->sk;
+	struct sk_buff *skb;
+	wait_queue_t ctrl_wait, intr_wait;
+
+	BT_DBG("session %p", session);
+
+	set_user_nice(current, -15);
+
+	/* Sleep on both sockets so activity on either one wakes this thread. */
+	init_waitqueue_entry(&ctrl_wait, current);
+	init_waitqueue_entry(&intr_wait, current);
+	add_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait);
+	add_wait_queue(sk_sleep(intr_sk), &intr_wait);
+	/* Release hidp_add_connection(), which waits on startup_queue. */
+	session->waiting_for_startup = 0;
+	wake_up_interruptible(&session->startup_queue);
+	/*
+	 * The task state is set before looking for work, so a wake-up that
+	 * arrives while frames are being processed is not lost by the
+	 * schedule() at the bottom of the loop.
+	 */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!atomic_read(&session->terminate)) {
+		if (ctrl_sk->sk_state != BT_CONNECTED ||
+				intr_sk->sk_state != BT_CONNECTED)
+			break;
+
+		/* Drain everything received on the control channel. */
+		while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) {
+			skb_orphan(skb);
+			hidp_recv_ctrl_frame(session, skb);
+		}
+
+		/* Drain everything received on the interrupt channel. */
+		while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) {
+			skb_orphan(skb);
+			hidp_recv_intr_frame(session, skb);
+		}
+
+		hidp_process_transmit(session);
+
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(intr_sk), &intr_wait);
+	remove_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait);
+
+	/* Teardown runs with the session semaphore held for writing. */
+	down_write(&hidp_session_sem);
+
+	hidp_del_timer(session);
+
+	if (session->input) {
+		input_unregister_device(session->input);
+		session->input = NULL;
+	}
+
+	if (session->hid) {
+		hid_destroy_device(session->hid);
+		session->hid = NULL;
+	}
+
+	/* Wakeup user-space polling for socket errors */
+	session->intr_sock->sk->sk_err = EUNATCH;
+	session->ctrl_sock->sk->sk_err = EUNATCH;
+
+	hidp_schedule(session);
+
+	/*
+	 * Drop the file references held on the sockets (presumably the ones
+	 * taken by sockfd_lookup() in the HIDPCONNADD ioctl - confirm).
+	 */
+	fput(session->intr_sock->file);
+
+	/* Give the control socket up to 500 ms to reach BT_CLOSED. */
+	wait_event_timeout(*(sk_sleep(ctrl_sk)),
+		(ctrl_sk->sk_state == BT_CLOSED), msecs_to_jiffies(500));
+
+	fput(session->ctrl_sock->file);
+
+	__hidp_unlink_session(session);
+
+	up_write(&hidp_session_sem);
+
+	kfree(session->rd_data);
+	kfree(session);
+	return 0;
+}
+
+/*
+ * Look up the ACL connection that carries this session and return its
+ * struct device, for use as the parent of the input/HID device.
+ *
+ * As a side effect session->conn is set to the result of the lookup.
+ * Returns NULL when no HCI route or no ACL connection is found.
+ */
+static struct device *hidp_get_device(struct hidp_session *session)
+{
+	struct sock *sk = session->ctrl_sock->sk;
+	bdaddr_t *local = &bt_sk(sk)->src;
+	bdaddr_t *remote = &bt_sk(sk)->dst;
+	struct device *parent;
+	struct hci_dev *hdev;
+
+	hdev = hci_get_route(remote, local);
+	if (!hdev)
+		return NULL;
+
+	session->conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, remote);
+	parent = session->conn ? &session->conn->dev : NULL;
+
+	hci_dev_put(hdev);
+
+	return parent;
+}
+
+/*
+ * Allocate, describe and register the boot-protocol input device of a
+ * session.
+ *
+ * Capabilities are chosen from req->subclass: bit 0x40 enables keys,
+ * LEDs and autorepeat; bit 0x80 enables mouse buttons and relative
+ * axes. The 0x80 branch assigns evbit[0] and the BTN_MOUSE key word
+ * outright, replacing what the 0x40 branch stored there.
+ *
+ * Returns 0 on success, -ENOMEM if the device cannot be allocated, or
+ * the negative error from input_register_device(). On failure
+ * session->input is left NULL.
+ */
+static int hidp_setup_input(struct hidp_session *session,
+				struct hidp_connadd_req *req)
+{
+	struct input_dev *dev;
+	int ret, n;
+
+	dev = input_allocate_device();
+	if (!dev)
+		return -ENOMEM;
+
+	session->input = dev;
+
+	input_set_drvdata(dev, session);
+
+	dev->name = "Bluetooth HID Boot Protocol Device";
+
+	dev->id.bustype = BUS_BLUETOOTH;
+	dev->id.vendor  = req->vendor;
+	dev->id.product = req->product;
+	dev->id.version = req->version;
+
+	if (req->subclass & 0x40) {
+		set_bit(EV_KEY, dev->evbit);
+		set_bit(EV_LED, dev->evbit);
+		set_bit(EV_REP, dev->evbit);
+
+		set_bit(LED_NUML,    dev->ledbit);
+		set_bit(LED_CAPSL,   dev->ledbit);
+		set_bit(LED_SCROLLL, dev->ledbit);
+		set_bit(LED_COMPOSE, dev->ledbit);
+		set_bit(LED_KANA,    dev->ledbit);
+
+		/* Every code in the boot keymap, except the 0 placeholder. */
+		for (n = 0; n < sizeof(hidp_keycode); n++)
+			set_bit(hidp_keycode[n], dev->keybit);
+		clear_bit(0, dev->keybit);
+	}
+
+	if (req->subclass & 0x80) {
+		dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL);
+		dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) |
+			BIT_MASK(BTN_RIGHT) | BIT_MASK(BTN_MIDDLE) |
+			BIT_MASK(BTN_SIDE) | BIT_MASK(BTN_EXTRA);
+		dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y) |
+			BIT_MASK(REL_WHEEL);
+	}
+
+	dev->dev.parent = hidp_get_device(session);
+
+	dev->event = hidp_input_event;
+
+	ret = input_register_device(dev);
+	if (ret >= 0)
+		return 0;
+
+	input_free_device(dev);
+	session->input = NULL;
+	return ret;
+}
+
+/* hid_ll_driver .open callback: nothing to do, always reports success. */
+static int hidp_open(struct hid_device *hid)
+{
+	return 0;
+}
+
+/* hid_ll_driver .close callback: intentionally empty. */
+static void hidp_close(struct hid_device *hid)
+{
+}
+
+/*
+ * hid_ll_driver .parse callback: feed the report descriptor stored in
+ * the session (rd_data / rd_size) to the HID core.
+ *
+ * Returns the result of hid_parse_report().
+ */
+static int hidp_parse(struct hid_device *hid)
+{
+	struct hidp_session *owner = hid->driver_data;
+	struct hid_device *target = owner->hid;
+
+	return hid_parse_report(target, owner->rd_data, owner->rd_size);
+}
+
+/*
+ * hid_ll_driver .start callback: pass every known input report, then
+ * every known feature report, to hidp_send_report().
+ *
+ * Always returns 0.
+ */
+static int hidp_start(struct hid_device *hid)
+{
+	static const unsigned int kinds[] = {
+		HID_INPUT_REPORT,
+		HID_FEATURE_REPORT,
+	};
+	struct hidp_session *session = hid->driver_data;
+	struct hid_report *report;
+	unsigned int k;
+
+	for (k = 0; k < sizeof(kinds) / sizeof(kinds[0]); k++) {
+		list_for_each_entry(report,
+				&hid->report_enum[kinds[k]].report_list, list)
+			hidp_send_report(session, report);
+	}
+
+	return 0;
+}
+
+/*
+ * hid_ll_driver .stop callback: discard everything still queued for
+ * transmission on both channels and clear hid->claimed.
+ */
+static void hidp_stop(struct hid_device *hid)
+{
+	struct hidp_session *session = hid->driver_data;
+
+	skb_queue_purge(&session->ctrl_transmit);
+	skb_queue_purge(&session->intr_transmit);
+
+	hid->claimed = 0;
+}
+
+/* Low-level HID transport callbacks installed on every HIDP hid_device. */
+static struct hid_ll_driver hidp_hid_driver = {
+	.open			= hidp_open,
+	.close			= hidp_close,
+	.start			= hidp_start,
+	.stop			= hidp_stop,
+	.parse			= hidp_parse,
+	.hidinput_input_event	= hidp_hidinput_event,
+};
+
+/*
+ * Set up the hid device of a session. It does not add the device to
+ * the HID system; that is done in hidp_add_connection().
+ *
+ * The report descriptor is copied from user space (req->rd_data,
+ * req->rd_size bytes) into session->rd_data, a hid_device is allocated
+ * and its identity fields are filled in from @req and from the
+ * addresses of the control socket.
+ *
+ * Returns 0 on success, -ENOMEM if the descriptor buffer cannot be
+ * allocated, -EFAULT if the descriptor cannot be read from user space,
+ * or the error carried by hid_allocate_device(). On failure
+ * session->rd_data is freed and reset to NULL.
+ */
+static int hidp_setup_hid(struct hidp_session *session,
+				struct hidp_connadd_req *req)
+{
+	struct hid_device *hid;
+	int err;
+
+	session->rd_data = kzalloc(req->rd_size, GFP_KERNEL);
+	if (!session->rd_data)
+		return -ENOMEM;
+
+	if (copy_from_user(session->rd_data, req->rd_data, req->rd_size)) {
+		err = -EFAULT;
+		goto fault;
+	}
+	session->rd_size = req->rd_size;
+
+	hid = hid_allocate_device();
+	if (IS_ERR(hid)) {
+		err = PTR_ERR(hid);
+		goto fault;
+	}
+
+	session->hid = hid;
+
+	hid->driver_data = session;
+
+	hid->bus     = BUS_BLUETOOTH;
+	hid->vendor  = req->vendor;
+	hid->product = req->product;
+	hid->version = req->version;
+	hid->country = req->country;
+
+	/*
+	 * req->name is filled in by user space and need not be
+	 * NUL-terminated. Copy at most sizeof(req->name) - 1 bytes and
+	 * terminate explicitly so hid->name is always a valid string.
+	 */
+	strncpy(hid->name, req->name, sizeof(req->name) - 1);
+	hid->name[sizeof(req->name) - 1] = '\0';
+	strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64);
+	strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64);
+
+	hid->dev.parent = hidp_get_device(session);
+	hid->ll_driver = &hidp_hid_driver;
+
+	hid->hid_get_raw_report = hidp_get_raw_report;
+	hid->hid_output_raw_report = hidp_output_raw_report;
+
+	return 0;
+
+fault:
+	kfree(session->rd_data);
+	session->rd_data = NULL;
+
+	return err;
+}
+
+/*
+ * Create a HIDP session on top of two connected sockets.
+ *
+ * @req:       connection parameters copied from user space.
+ * @ctrl_sock: control channel socket.
+ * @intr_sock: interrupt channel socket.
+ *
+ * A hid device is set up when a report descriptor was supplied
+ * (req->rd_size > 0). Otherwise, or when hidp_setup_hid() returns
+ * -ENODEV, a boot-protocol input device is registered instead. The
+ * session thread is then started and, once it has signalled start-up,
+ * the hid device (if any) is added to the HID core.
+ *
+ * Returns 0 on success or a negative errno: -ENOTUNIQ if the two
+ * sockets do not share the same source and destination address,
+ * -ENOMEM, -EEXIST if a connected session to the same peer already
+ * exists, or the error of the failing setup step. If hid_add_device()
+ * fails the running session thread is asked to terminate and performs
+ * the teardown itself; on every other failure the session is freed
+ * here.
+ */
+int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock)
+{
+	struct hidp_session *session, *s;
+	int vendor, product;
+	int err;
+
+	BT_DBG("");
+
+	if (bacmp(&bt_sk(ctrl_sock->sk)->src, &bt_sk(intr_sock->sk)->src) ||
+			bacmp(&bt_sk(ctrl_sock->sk)->dst, &bt_sk(intr_sock->sk)->dst))
+		return -ENOTUNIQ;
+
+	session = kzalloc(sizeof(struct hidp_session), GFP_KERNEL);
+	if (!session)
+		return -ENOMEM;
+
+	BT_DBG("rd_data %p rd_size %d", req->rd_data, req->rd_size);
+
+	down_write(&hidp_session_sem);
+
+	s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst);
+	if (s && s->state == BT_CONNECTED) {
+		err = -EEXIST;
+		goto failed;
+	}
+
+	bacpy(&session->bdaddr, &bt_sk(ctrl_sock->sk)->dst);
+
+	session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl_sock->sk)->chan->omtu,
+					l2cap_pi(ctrl_sock->sk)->chan->imtu);
+	session->intr_mtu = min_t(uint, l2cap_pi(intr_sock->sk)->chan->omtu,
+					l2cap_pi(intr_sock->sk)->chan->imtu);
+
+	BT_DBG("ctrl mtu %d intr mtu %d", session->ctrl_mtu, session->intr_mtu);
+
+	session->ctrl_sock = ctrl_sock;
+	session->intr_sock = intr_sock;
+	session->state     = BT_CONNECTED;
+
+	setup_timer(&session->timer, hidp_idle_timeout, (unsigned long)session);
+
+	skb_queue_head_init(&session->ctrl_transmit);
+	skb_queue_head_init(&session->intr_transmit);
+
+	mutex_init(&session->report_mutex);
+	init_waitqueue_head(&session->report_queue);
+	init_waitqueue_head(&session->startup_queue);
+	session->waiting_for_startup = 1;
+	/* Only the vendor-id flag is taken over from the request. */
+	session->flags   = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID);
+	session->idle_to = req->idle_to;
+
+	if (req->rd_size > 0) {
+		err = hidp_setup_hid(session, req);
+		if (err && err != -ENODEV)
+			goto purge;
+	}
+
+	/* No hid device: fall back to a boot-protocol input device. */
+	if (!session->hid) {
+		err = hidp_setup_input(session, req);
+		if (err < 0)
+			goto purge;
+	}
+
+	__hidp_link_session(session);
+
+	hidp_set_timer(session);
+
+	if (session->hid) {
+		vendor  = session->hid->vendor;
+		product = session->hid->product;
+	} else if (session->input) {
+		vendor  = session->input->id.vendor;
+		product = session->input->id.product;
+	} else {
+		vendor = 0x0000;
+		product = 0x0000;
+	}
+
+	session->task = kthread_run(hidp_session, session, "khidpd_%04x%04x",
+							vendor, product);
+	if (IS_ERR(session->task)) {
+		err = PTR_ERR(session->task);
+		goto unlink;
+	}
+
+	/* Wait until the session thread has signalled start-up. */
+	while (session->waiting_for_startup) {
+		wait_event_interruptible(session->startup_queue,
+			!session->waiting_for_startup);
+	}
+
+	/*
+	 * session->hid is NULL on the boot-protocol input path, so the
+	 * device may only be added to the HID core when it exists.
+	 */
+	if (session->hid) {
+		err = hid_add_device(session->hid);
+		if (err < 0) {
+			/* The running thread does the teardown. */
+			atomic_inc(&session->terminate);
+			wake_up_process(session->task);
+			up_write(&hidp_session_sem);
+			return err;
+		}
+	}
+
+	if (session->input) {
+		hidp_send_ctrl_message(session,
+			HIDP_TRANS_SET_PROTOCOL | HIDP_PROTO_BOOT, NULL, 0);
+		session->flags |= (1 << HIDP_BOOT_PROTOCOL_MODE);
+
+		session->leds = 0xff;
+		hidp_input_event(session->input, EV_LED, 0, 0);
+	}
+
+	up_write(&hidp_session_sem);
+	return 0;
+
+unlink:
+	hidp_del_timer(session);
+
+	__hidp_unlink_session(session);
+
+	if (session->input) {
+		input_unregister_device(session->input);
+		session->input = NULL;
+	}
+
+	if (session->hid) {
+		hid_destroy_device(session->hid);
+		session->hid = NULL;
+	}
+
+	kfree(session->rd_data);
+	session->rd_data = NULL;
+
+purge:
+	skb_queue_purge(&session->ctrl_transmit);
+	skb_queue_purge(&session->intr_transmit);
+
+failed:
+	up_write(&hidp_session_sem);
+
+	kfree(session);
+	return err;
+}
+
+/*
+ * Tear down the session to the peer given in req->bdaddr.
+ *
+ * With the HIDP_VIRTUAL_CABLE_UNPLUG flag set, a VIRTUAL_CABLE_UNPLUG
+ * control message is queued for the peer. Otherwise the transmit
+ * queues are flushed and the session thread is asked to terminate.
+ *
+ * Returns 0 on success or -ENOENT if no such session exists.
+ */
+int hidp_del_connection(struct hidp_conndel_req *req)
+{
+	struct hidp_session *session;
+	int err = 0;
+
+	BT_DBG("");
+
+	down_read(&hidp_session_sem);
+
+	session = __hidp_get_session(&req->bdaddr);
+	if (!session) {
+		err = -ENOENT;
+		goto unlock;
+	}
+
+	if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) {
+		hidp_send_ctrl_message(session,
+			HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0);
+		goto unlock;
+	}
+
+	/* Flush the transmit queues */
+	skb_queue_purge(&session->ctrl_transmit);
+	skb_queue_purge(&session->intr_transmit);
+
+	atomic_inc(&session->terminate);
+	wake_up_process(session->task);
+
+unlock:
+	up_read(&hidp_session_sem);
+	return err;
+}
+
+/*
+ * Copy a description of up to req->cnum sessions to the user buffer
+ * req->ci.
+ *
+ * On return req->cnum holds the number of entries written, and req->ci
+ * has been advanced once per entry except after the one that reached
+ * the caller's limit.
+ *
+ * Returns 0 on success or -EFAULT if a copy to user space fails.
+ */
+int hidp_get_connlist(struct hidp_connlist_req *req)
+{
+	struct hidp_session *session;
+	int err = 0, n = 0;
+
+	BT_DBG("");
+
+	down_read(&hidp_session_sem);
+
+	list_for_each_entry(session, &hidp_session_list, list) {
+		struct hidp_conninfo ci;
+
+		__hidp_copy_session(session, &ci);
+
+		if (copy_to_user(req->ci, &ci, sizeof(ci))) {
+			err = -EFAULT;
+			break;
+		}
+
+		n++;
+		if (n >= req->cnum)
+			break;
+
+		req->ci++;
+	}
+	req->cnum = n;
+
+	up_read(&hidp_session_sem);
+	return err;
+}
+
+/*
+ * Fill @ci with the description of the session whose peer address is
+ * ci->bdaddr.
+ *
+ * Returns 0 on success or -ENOENT if there is no such session.
+ */
+int hidp_get_conninfo(struct hidp_conninfo *ci)
+{
+	struct hidp_session *found;
+	int ret = 0;
+
+	down_read(&hidp_session_sem);
+
+	found = __hidp_get_session(&ci->bdaddr);
+	if (!found)
+		ret = -ENOENT;
+	else
+		__hidp_copy_session(found, ci);
+
+	up_read(&hidp_session_sem);
+	return ret;
+}
+
+/* Match table of hidp_driver: any Bluetooth HID device, any vendor/product. */
+static const struct hid_device_id hidp_table[] = {
+	{ HID_BLUETOOTH_DEVICE(HID_ANY_ID, HID_ANY_ID) },
+	{ }
+};
+
+/* HID driver registered in hidp_init() and unregistered in hidp_exit(). */
+static struct hid_driver hidp_driver = {
+	.name = "generic-bluetooth",
+	.id_table = hidp_table,
+};
+
+/*
+ * Module entry point: register the HID driver, then the HIDP socket
+ * layer. If the socket layer fails the HID driver is unregistered
+ * again.
+ *
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int __init hidp_init(void)
+{
+	int ret;
+
+	BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION);
+
+	ret = hid_register_driver(&hidp_driver);
+	if (ret)
+		return ret;
+
+	ret = hidp_init_sockets();
+	if (ret)
+		hid_unregister_driver(&hidp_driver);
+
+	return ret;
+}
+
+/* Module exit: undo hidp_init() in reverse order. */
+static void __exit hidp_exit(void)
+{
+	hidp_cleanup_sockets();
+	hid_unregister_driver(&hidp_driver);
+}
+
+module_init(hidp_init);
+module_exit(hidp_exit);
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-6");
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
new file mode 100644
index 00000000..af1bcc82
--- /dev/null
+++ b/net/bluetooth/hidp/hidp.h
@@ -0,0 +1,191 @@
+/*
+   HIDP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#ifndef __HIDP_H
+#define __HIDP_H
+
+#include <linux/types.h>
+#include <net/bluetooth/bluetooth.h>
+
+/* HIDP header masks */
+#define HIDP_HEADER_TRANS_MASK			0xf0
+#define HIDP_HEADER_PARAM_MASK			0x0f
+
+/* HIDP transaction types */
+#define HIDP_TRANS_HANDSHAKE			0x00
+#define HIDP_TRANS_HID_CONTROL			0x10
+#define HIDP_TRANS_GET_REPORT			0x40
+#define HIDP_TRANS_SET_REPORT			0x50
+#define HIDP_TRANS_GET_PROTOCOL			0x60
+#define HIDP_TRANS_SET_PROTOCOL			0x70
+#define HIDP_TRANS_GET_IDLE			0x80
+#define HIDP_TRANS_SET_IDLE			0x90
+#define HIDP_TRANS_DATA				0xa0
+#define HIDP_TRANS_DATC				0xb0
+
+/* HIDP handshake results */
+#define HIDP_HSHK_SUCCESSFUL			0x00
+#define HIDP_HSHK_NOT_READY			0x01
+#define HIDP_HSHK_ERR_INVALID_REPORT_ID		0x02
+#define HIDP_HSHK_ERR_UNSUPPORTED_REQUEST	0x03
+#define HIDP_HSHK_ERR_INVALID_PARAMETER		0x04
+#define HIDP_HSHK_ERR_UNKNOWN			0x0e
+#define HIDP_HSHK_ERR_FATAL			0x0f
+
+/* HIDP control operation parameters */
+#define HIDP_CTRL_NOP				0x00
+#define HIDP_CTRL_HARD_RESET			0x01
+#define HIDP_CTRL_SOFT_RESET			0x02
+#define HIDP_CTRL_SUSPEND			0x03
+#define HIDP_CTRL_EXIT_SUSPEND			0x04
+#define HIDP_CTRL_VIRTUAL_CABLE_UNPLUG		0x05
+
+/*
+ * HIDP data transaction headers.
+ * NOTE: "OUPUT" (sic) is the spelling code refers to; renaming it
+ * would break every user of the constant.
+ */
+#define HIDP_DATA_RTYPE_MASK			0x03
+#define HIDP_DATA_RSRVD_MASK			0x0c
+#define HIDP_DATA_RTYPE_OTHER			0x00
+#define HIDP_DATA_RTYPE_INPUT			0x01
+#define HIDP_DATA_RTYPE_OUPUT			0x02
+#define HIDP_DATA_RTYPE_FEATURE			0x03
+
+/* HIDP protocol header parameters */
+#define HIDP_PROTO_BOOT				0x00
+#define HIDP_PROTO_REPORT			0x01
+
+/* HIDP ioctl defines */
+#define HIDPCONNADD	_IOW('H', 200, int)
+#define HIDPCONNDEL	_IOW('H', 201, int)
+#define HIDPGETCONNLIST	_IOR('H', 210, int)
+#define HIDPGETCONNINFO	_IOR('H', 211, int)
+
+/*
+ * Bit numbers (not masks) for the flags words of the request structures
+ * and of struct hidp_session: used as (1 << BIT) or with
+ * test_bit()/clear_bit().
+ */
+#define HIDP_VIRTUAL_CABLE_UNPLUG	0
+#define HIDP_BOOT_PROTOCOL_MODE		1
+#define HIDP_BLUETOOTH_VENDOR_ID	9
+#define HIDP_WAITING_FOR_RETURN		10
+#define HIDP_WAITING_FOR_SEND_ACK	11
+
+/* Argument of the HIDPCONNADD ioctl. */
+struct hidp_connadd_req {
+	int   ctrl_sock;	/* Connected control socket */
+	int   intr_sock;	/* Connected interrupt socket */
+	__u16 parser;
+	__u16 rd_size;		/* Size of the report descriptor in bytes */
+	__u8 __user *rd_data;	/* Report descriptor, in user memory */
+	__u8  country;
+	__u8  subclass;		/* Bits 0x40 / 0x80 select boot-input capabilities */
+	__u16 vendor;
+	__u16 product;
+	__u16 version;
+	__u32 flags;		/* HIDP_* bit numbers */
+	__u32 idle_to;		/* Copied to the session's idle_to */
+	char  name[128];	/* Device name; not guaranteed NUL-terminated */
+};
+
+/* Argument of the HIDPCONNDEL ioctl. */
+struct hidp_conndel_req {
+	bdaddr_t bdaddr;	/* Peer address of the session to remove */
+	__u32    flags;		/* HIDP_VIRTUAL_CABLE_UNPLUG bit requests an unplug message */
+};
+
+/*
+ * Description of one session, as returned by HIDPGETCONNINFO and, in an
+ * array, by HIDPGETCONNLIST. For HIDPGETCONNINFO bdaddr is also the
+ * lookup key supplied by the caller.
+ */
+struct hidp_conninfo {
+	bdaddr_t bdaddr;
+	__u32    flags;
+	__u16    state;
+	__u16    vendor;
+	__u16    product;
+	__u16    version;
+	char     name[128];
+};
+
+/* Argument of the HIDPGETCONNLIST ioctl. */
+struct hidp_connlist_req {
+	__u32  cnum;		/* In: capacity of ci[]; out: entries written */
+	struct hidp_conninfo __user *ci;	/* User buffer receiving the entries */
+};
+
+int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
+int hidp_del_connection(struct hidp_conndel_req *req);
+int hidp_get_connlist(struct hidp_connlist_req *req);
+int hidp_get_conninfo(struct hidp_conninfo *ci);
+
+/* HIDP session defines */
+struct hidp_session {
+	struct list_head list;		/* Link in the global session list */
+
+	struct hci_conn *conn;		/* ACL connection, set by hidp_get_device() */
+
+	struct socket *ctrl_sock;	/* Control channel */
+	struct socket *intr_sock;	/* Interrupt channel */
+
+	bdaddr_t bdaddr;		/* Peer address */
+
+	unsigned long state;
+	unsigned long flags;		/* HIDP_* bit numbers */
+	unsigned long idle_to;
+
+	uint ctrl_mtu;
+	uint intr_mtu;
+
+	atomic_t terminate;		/* Non-zero asks the session thread to exit */
+	struct task_struct *task;	/* The session thread */
+
+	unsigned char keys[8];
+	unsigned char leds;
+
+	struct input_dev *input;	/* Boot-protocol input device, or NULL */
+
+	struct hid_device *hid;		/* HID device, or NULL */
+
+	struct timer_list timer;
+
+	struct sk_buff_head ctrl_transmit;	/* Frames queued for the control channel */
+	struct sk_buff_head intr_transmit;	/* Frames queued for the interrupt channel */
+
+	/* Used in hidp_get_raw_report() */
+	int waiting_report_type; /* HIDP_DATA_RTYPE_* */
+	int waiting_report_number; /* -1 for not numbered */
+	struct mutex report_mutex;
+	struct sk_buff *report_return;
+	wait_queue_head_t report_queue;
+
+	/* Used in hidp_output_raw_report() */
+	int output_report_success; /* boolean */
+
+	/* Report descriptor */
+	__u8 *rd_data;
+	uint rd_size;
+
+	/* Start-up handshake between hidp_add_connection() and the thread */
+	wait_queue_head_t startup_queue;
+	int waiting_for_startup;
+};
+
+/*
+ * Wake everything sleeping on the wait queues of the session's control
+ * and interrupt sockets, in that order.
+ */
+static inline void hidp_schedule(struct hidp_session *session)
+{
+	struct sock *ctrl = session->ctrl_sock->sk;
+	struct sock *intr = session->intr_sock->sk;
+	wait_queue_head_t *ctrl_wq = sk_sleep(ctrl);
+
+	wake_up_interruptible(ctrl_wq);
+	wake_up_interruptible(sk_sleep(intr));
+}
+
+/* HIDP init defines */
+extern int __init hidp_init_sockets(void);
+extern void __exit hidp_cleanup_sockets(void);
+
+#endif /* __HIDP_H */
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
new file mode 100644
index 00000000..178ac7f1
--- /dev/null
+++ b/net/bluetooth/hidp/sock.c
@@ -0,0 +1,305 @@
+/*
+   HIDP implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/ioctl.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+
+#include "hidp.h"
+
+/*
+ * proto_ops .release: orphan the sock attached to @sock, if any, and
+ * drop a reference on it. Always returns 0.
+ */
+static int hidp_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BT_DBG("sock %p sk %p", sock, sk);
+
+	if (sk) {
+		sock_orphan(sk);
+		sock_put(sk);
+	}
+
+	return 0;
+}
+
+/*
+ * proto_ops .ioctl for HIDP control sockets.
+ *
+ * HIDPCONNADD     - create a session from two connected sockets
+ *                   (requires CAP_NET_ADMIN).
+ * HIDPCONNDEL     - remove a session (requires CAP_NET_ADMIN).
+ * HIDPGETCONNLIST - list the existing sessions.
+ * HIDPGETCONNINFO - describe one session.
+ *
+ * Returns 0 on success or a negative errno; unknown commands yield
+ * -EINVAL.
+ */
+static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *) arg;
+	struct hidp_connadd_req ca;
+	struct hidp_conndel_req cd;
+	struct hidp_connlist_req cl;
+	struct hidp_conninfo ci;
+	struct socket *csock;
+	struct socket *isock;
+	int err;
+
+	BT_DBG("cmd %x arg %lx", cmd, arg);
+
+	switch (cmd) {
+	case HIDPCONNADD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+
+		if (copy_from_user(&ca, argp, sizeof(ca)))
+			return -EFAULT;
+
+		/*
+		 * The name is supplied by user space and may lack a
+		 * terminator; force one so it can be handled as a string.
+		 */
+		ca.name[sizeof(ca.name) - 1] = 0;
+
+		csock = sockfd_lookup(ca.ctrl_sock, &err);
+		if (!csock)
+			return err;
+
+		isock = sockfd_lookup(ca.intr_sock, &err);
+		if (!isock) {
+			sockfd_put(csock);
+			return err;
+		}
+
+		if (csock->sk->sk_state != BT_CONNECTED ||
+				isock->sk->sk_state != BT_CONNECTED) {
+			sockfd_put(csock);
+			sockfd_put(isock);
+			return -EBADFD;
+		}
+
+		/*
+		 * On success the session keeps the socket references; they
+		 * are only dropped here when the session was not created.
+		 */
+		err = hidp_add_connection(&ca, csock, isock);
+		if (!err) {
+			if (copy_to_user(argp, &ca, sizeof(ca)))
+				err = -EFAULT;
+		} else {
+			sockfd_put(csock);
+			sockfd_put(isock);
+		}
+
+		return err;
+
+	case HIDPCONNDEL:
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+
+		if (copy_from_user(&cd, argp, sizeof(cd)))
+			return -EFAULT;
+
+		return hidp_del_connection(&cd);
+
+	case HIDPGETCONNLIST:
+		if (copy_from_user(&cl, argp, sizeof(cl)))
+			return -EFAULT;
+
+		if (cl.cnum <= 0)
+			return -EINVAL;
+
+		err = hidp_get_connlist(&cl);
+		if (!err && copy_to_user(argp, &cl, sizeof(cl)))
+			return -EFAULT;
+
+		return err;
+
+	case HIDPGETCONNINFO:
+		if (copy_from_user(&ci, argp, sizeof(ci)))
+			return -EFAULT;
+
+		err = hidp_get_conninfo(&ci);
+		if (!err && copy_to_user(argp, &ci, sizeof(ci)))
+			return -EFAULT;
+
+		return err;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Layout of struct hidp_connadd_req as passed by a compat caller: the
+ * same fields, with rd_data carried as a compat_uptr_t instead of a
+ * native pointer.
+ */
+struct compat_hidp_connadd_req {
+	int   ctrl_sock;	/* Connected control socket */
+	int   intr_sock;	/* Connected interrupt socket */
+	__u16 parser;
+	__u16 rd_size;
+	compat_uptr_t rd_data;
+	__u8  country;
+	__u8  subclass;
+	__u16 vendor;
+	__u16 product;
+	__u16 version;
+	__u32 flags;
+	__u32 idle_to;
+	char  name[128];
+};
+
+/*
+ * proto_ops .compat_ioctl: translate the two requests that embed a user
+ * pointer and pass everything else straight to hidp_sock_ioctl().
+ *
+ * HIDPGETCONNLIST is handled completely here; only cnum is written back
+ * to the caller. HIDPCONNADD is rebuilt as a native struct
+ * hidp_connadd_req in compat user space and then falls through to the
+ * native handler.
+ */
+static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == HIDPGETCONNLIST) {
+		struct hidp_connlist_req cl;
+		uint32_t uci;
+		int err;
+
+		/* Compat layout: u32 cnum followed by a 32-bit pointer. */
+		if (get_user(cl.cnum, (uint32_t __user *) arg) ||
+				get_user(uci, (u32 __user *) (arg + 4)))
+			return -EFAULT;
+
+		cl.ci = compat_ptr(uci);
+
+		if (cl.cnum <= 0)
+			return -EINVAL;
+
+		err = hidp_get_connlist(&cl);
+
+		if (!err && put_user(cl.cnum, (uint32_t __user *) arg))
+			err = -EFAULT;
+
+		return err;
+	} else if (cmd == HIDPCONNADD) {
+		struct compat_hidp_connadd_req ca;
+		struct hidp_connadd_req __user *uca;
+
+		uca = compat_alloc_user_space(sizeof(*uca));
+
+		if (copy_from_user(&ca, (void __user *) arg, sizeof(ca)))
+			return -EFAULT;
+
+		/* Rebuild the request field by field in the native layout. */
+		if (put_user(ca.ctrl_sock, &uca->ctrl_sock) ||
+				put_user(ca.intr_sock, &uca->intr_sock) ||
+				put_user(ca.parser, &uca->parser) ||
+				put_user(ca.rd_size, &uca->rd_size) ||
+				put_user(compat_ptr(ca.rd_data), &uca->rd_data) ||
+				put_user(ca.country, &uca->country) ||
+				put_user(ca.subclass, &uca->subclass) ||
+				put_user(ca.vendor, &uca->vendor) ||
+				put_user(ca.product, &uca->product) ||
+				put_user(ca.version, &uca->version) ||
+				put_user(ca.flags, &uca->flags) ||
+				put_user(ca.idle_to, &uca->idle_to) ||
+				copy_to_user(&uca->name[0], &ca.name[0], 128))
+			return -EFAULT;
+
+		arg = (unsigned long) uca;
+
+		/* Fall through. We don't actually write back any _changes_
+		   to the structure anyway, so there's no need to copy back
+		   into the original compat version */
+	}
+
+	return hidp_sock_ioctl(sock, cmd, arg);
+}
+#endif
+
+/*
+ * Socket operations of HIDP control sockets: only release and ioctl
+ * (plus compat_ioctl under CONFIG_COMPAT) are implemented, every other
+ * operation is routed to the generic sock_no_*() handler.
+ */
+static const struct proto_ops hidp_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= hidp_sock_release,
+	.ioctl		= hidp_sock_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= hidp_sock_compat_ioctl,
+#endif
+	.bind		= sock_no_bind,
+	.getname	= sock_no_getname,
+	.sendmsg	= sock_no_sendmsg,
+	.recvmsg	= sock_no_recvmsg,
+	.poll		= sock_no_poll,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.mmap		= sock_no_mmap
+};
+
+/* Protocol descriptor passed to sk_alloc(); each sock is a struct bt_sock. */
+static struct proto hidp_proto = {
+	.name		= "HIDP",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct bt_sock)
+};
+
+/*
+ * net_proto_family .create: set up a new HIDP control socket.
+ *
+ * Only SOCK_RAW sockets are accepted. Returns 0 on success,
+ * -ESOCKTNOSUPPORT for any other socket type, or -ENOMEM if the sock
+ * cannot be allocated.
+ */
+static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	/* Per-sock state; has to follow sock_init_data(). */
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	sk->sk_protocol = protocol;
+	sk->sk_state	= BT_OPEN;
+
+	/* Per-socket state. */
+	sock->ops = &hidp_sock_ops;
+	sock->state = SS_UNCONNECTED;
+
+	return 0;
+}
+
+/* Socket family hooks registered for BTPROTO_HIDP in hidp_init_sockets(). */
+static const struct net_proto_family hidp_sock_family_ops = {
+	.family	= PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create	= hidp_sock_create
+};
+
+/*
+ * Register the HIDP proto and the BTPROTO_HIDP socket family.
+ *
+ * Returns 0 on success or a negative errno. If the socket family cannot
+ * be registered the proto is unregistered again.
+ */
+int __init hidp_init_sockets(void)
+{
+	int err;
+
+	err = proto_register(&hidp_proto, 0);
+	if (err < 0)
+		return err;
+
+	err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops);
+	if (err >= 0)
+		return 0;
+
+	BT_ERR("Can't register HIDP socket");
+	proto_unregister(&hidp_proto);
+	return err;
+}
+
+/*
+ * Undo hidp_init_sockets(): unregister the socket family (logging a
+ * failure) and then the proto.
+ */
+void __exit hidp_cleanup_sockets(void)
+{
+	int err = bt_sock_unregister(BTPROTO_HIDP);
+
+	if (err < 0)
+		BT_ERR("Can't unregister HIDP socket");
+
+	proto_unregister(&hidp_proto);
+}
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
new file mode 100644
index 00000000..5a0ce738
--- /dev/null
+++ b/net/bluetooth/l2cap_core.c
@@ -0,0 +1,4378 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+   Copyright (C) 2010 Google Inc.
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth L2CAP core. */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/crc16.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/smp.h>
+
+/* When non-zero, ERTM and streaming are not added to the locally
+ * supported feature mask (see l2cap_mode_supported()). */
+int disable_ertm;
+
+/* Base local feature mask; l2cap_fixed_chan is presumably the bitmap of
+ * supported fixed channels -- confirm at its users. */
+static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN;
+static u8 l2cap_fixed_chan[8] = { 0x02, };
+
+/* Global list of channels (linked through ->global_l), guarded by
+ * chan_list_lock. */
+static LIST_HEAD(chan_list);
+static DEFINE_RWLOCK(chan_list_lock);
+
+/* Forward declarations for helpers that are used before they are
+ * defined in this file. */
+static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
+				u8 code, u8 ident, u16 dlen, void *data);
+static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
+								void *data);
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data);
+static void l2cap_send_disconn_req(struct l2cap_conn *conn,
+				struct l2cap_chan *chan, int err);
+
+static int l2cap_ertm_data_rcv(struct sock *sk, struct sk_buff *skb);
+
+/* ---- L2CAP channels ---- */
+
+/* Take one additional reference on channel @c. */
+static inline void chan_hold(struct l2cap_chan *c)
+{
+	atomic_inc(&c->refcnt);
+}
+
+/* Drop one reference on channel @c and free it once the count hits zero. */
+static inline void chan_put(struct l2cap_chan *c)
+{
+	if (!atomic_dec_and_test(&c->refcnt))
+		return;
+
+	kfree(c);
+}
+
+/* Return the channel on @conn whose destination CID equals @cid, or NULL.
+ * Walks conn->chan_l without taking any lock itself. */
+static struct l2cap_chan *__l2cap_get_chan_by_dcid(struct l2cap_conn *conn, u16 cid)
+{
+	struct l2cap_chan *chan;
+
+	list_for_each_entry(chan, &conn->chan_l, list)
+		if (chan->dcid == cid)
+			return chan;
+
+	return NULL;
+}
+
+/* Return the channel on @conn whose source CID equals @cid, or NULL.
+ * Walks conn->chan_l without taking any lock itself. */
+static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn, u16 cid)
+{
+	struct l2cap_chan *chan;
+
+	list_for_each_entry(chan, &conn->chan_l, list)
+		if (chan->scid == cid)
+			return chan;
+
+	return NULL;
+}
+
+/* Find the channel with the given SCID under conn->chan_lock.
+ * On a hit the channel's socket is returned bh-locked; the caller is
+ * responsible for unlocking it.  Returns NULL when nothing matches. */
+static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, u16 cid)
+{
+	struct l2cap_chan *chan;
+
+	read_lock(&conn->chan_lock);
+
+	chan = __l2cap_get_chan_by_scid(conn, cid);
+	if (chan != NULL)
+		bh_lock_sock(chan->sk);
+
+	read_unlock(&conn->chan_lock);
+
+	return chan;
+}
+
+/* Return the channel on @conn whose stored command identifier equals
+ * @ident, or NULL.  Walks conn->chan_l without taking any lock itself. */
+static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn, u8 ident)
+{
+	struct l2cap_chan *chan;
+
+	list_for_each_entry(chan, &conn->chan_l, list)
+		if (chan->ident == ident)
+			return chan;
+
+	return NULL;
+}
+
+/* Find the channel with the given command identifier under
+ * conn->chan_lock.  On a hit the channel's socket is returned bh-locked;
+ * returns NULL when nothing matches. */
+static inline struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn, u8 ident)
+{
+	struct l2cap_chan *chan;
+
+	read_lock(&conn->chan_lock);
+
+	chan = __l2cap_get_chan_by_ident(conn, ident);
+	if (chan != NULL)
+		bh_lock_sock(chan->sk);
+
+	read_unlock(&conn->chan_lock);
+
+	return chan;
+}
+
+/* Search the global channel list for a channel bound to source port
+ * @psm on source address @src.  Takes no lock itself; l2cap_add_psm()
+ * calls it with chan_list_lock held.  Returns the channel or NULL. */
+static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src)
+{
+	struct l2cap_chan *chan;
+
+	list_for_each_entry(chan, &chan_list, global_l) {
+		if (chan->sport != psm)
+			continue;
+
+		if (bacmp(&bt_sk(chan->sk)->src, src) == 0)
+			return chan;
+	}
+
+	return NULL;
+}
+
+/*
+ * Bind @chan to a PSM on source address @src.
+ *
+ * With a non-zero @psm the exact value is claimed, or -EADDRINUSE is
+ * returned when another channel already uses it on @src.  With @psm == 0
+ * the first free odd value in 0x1001..0x10ff is picked; -EINVAL is
+ * returned when none is free.  Returns 0 on success.
+ */
+int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
+{
+	int err = 0;
+
+	write_lock_bh(&chan_list_lock);
+
+	if (psm) {
+		if (__l2cap_global_chan_by_addr(psm, src)) {
+			err = -EADDRINUSE;
+		} else {
+			chan->psm = psm;
+			chan->sport = psm;
+		}
+	} else {
+		u16 p;
+
+		/* Stays -EINVAL unless a free dynamic PSM is found below. */
+		err = -EINVAL;
+		for (p = 0x1001; p < 0x1100; p += 2) {
+			__le16 candidate = cpu_to_le16(p);
+
+			if (__l2cap_global_chan_by_addr(candidate, src))
+				continue;
+
+			chan->psm   = candidate;
+			chan->sport = candidate;
+			err = 0;
+			break;
+		}
+	}
+
+	write_unlock_bh(&chan_list_lock);
+	return err;
+}
+
+/* Store @scid as the source CID of @chan while holding chan_list_lock.
+ * No uniqueness check is performed; always returns 0. */
+int l2cap_add_scid(struct l2cap_chan *chan,  __u16 scid)
+{
+	write_lock_bh(&chan_list_lock);
+
+	chan->scid = scid;
+
+	write_unlock_bh(&chan_list_lock);
+
+	return 0;
+}
+
+/* Return the lowest CID in [L2CAP_CID_DYN_START, L2CAP_CID_DYN_END) that
+ * is not used as a source CID on @conn, or 0 when all are taken.
+ * Uses the unlocked lookup, so it takes no lock itself. */
+static u16 l2cap_alloc_cid(struct l2cap_conn *conn)
+{
+	u16 cid;
+
+	for (cid = L2CAP_CID_DYN_START; cid < L2CAP_CID_DYN_END; cid++)
+		if (__l2cap_get_chan_by_scid(conn, cid) == NULL)
+			return cid;
+
+	return 0;
+}
+
+/*
+ * (Re)arm @timer to fire @timeout from now; @timeout is converted with
+ * msecs_to_jiffies(), i.e. it is interpreted as milliseconds.
+ *
+ * A channel reference is taken only when mod_timer() reports that the
+ * timer was not already pending, so each pending timer owns exactly one
+ * reference (dropped in l2cap_clear_timer() or by the timer handler).
+ */
+static void l2cap_set_timer(struct l2cap_chan *chan, struct timer_list *timer, long timeout)
+{
+	/* Print the channel itself, matching the "chan %p" label and the
+	 * sibling l2cap_clear_timer() debug line. */
+	BT_DBG("chan %p state %d timeout %ld", chan, chan->state, timeout);
+
+	if (!mod_timer(timer, jiffies + msecs_to_jiffies(timeout)))
+		chan_hold(chan);
+}
+
+/* Cancel @timer if it is pending and, when the cancellation actually
+ * removed it, drop the channel reference the pending timer was holding. */
+static void l2cap_clear_timer(struct l2cap_chan *chan, struct timer_list *timer)
+{
+	BT_DBG("chan %p state %d", chan, chan->state);
+
+	if (!timer_pending(timer))
+		return;
+
+	if (del_timer(timer))
+		chan_put(chan);
+}
+
+/* Record the new channel state and notify the channel owner through
+ * its ops->state_change callback. */
+static void l2cap_state_change(struct l2cap_chan *chan, int state)
+{
+	chan->state = state;
+	chan->ops->state_change(chan->data, state);
+}
+
+/*
+ * Handler for chan->chan_timer (installed in l2cap_chan_create()); @arg
+ * is the l2cap_chan pointer.
+ *
+ * Closes the channel with an errno derived from its current state, then
+ * invokes the owner's close callback.  Every exit path drops one channel
+ * reference with chan_put().
+ */
+static void l2cap_chan_timeout(unsigned long arg)
+{
+	struct l2cap_chan *chan = (struct l2cap_chan *) arg;
+	struct sock *sk = chan->sk;
+	int reason;
+
+	BT_DBG("chan %p state %d", chan, chan->state);
+
+	bh_lock_sock(sk);
+
+	if (sock_owned_by_user(sk)) {
+		/* sk is owned by user. Try again later */
+		/* NOTE(review): if __set_chan_timer() ends up in
+		 * l2cap_set_timer(), this HZ / 5 is converted with
+		 * msecs_to_jiffies() and so treated as milliseconds, not
+		 * jiffies -- confirm the intended unit. */
+		__set_chan_timer(chan, HZ / 5);
+		bh_unlock_sock(sk);
+		chan_put(chan);
+		return;
+	}
+
+	/* Connected/configuring channels and non-SDP connect attempts report
+	 * ECONNREFUSED; everything else is a plain timeout. */
+	if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG)
+		reason = ECONNREFUSED;
+	else if (chan->state == BT_CONNECT &&
+					chan->sec_level != BT_SECURITY_SDP)
+		reason = ECONNREFUSED;
+	else
+		reason = ETIMEDOUT;
+
+	l2cap_chan_close(chan, reason);
+
+	bh_unlock_sock(sk);
+
+	/* The close callback is invoked after the socket lock is released. */
+	chan->ops->close(chan->data);
+	chan_put(chan);
+}
+
+/*
+ * Allocate a zeroed channel for socket @sk, link it on the global
+ * channel list, set up its channel timer and start it in BT_OPEN with a
+ * reference count of one.  Returns NULL if the allocation fails.
+ */
+struct l2cap_chan *l2cap_chan_create(struct sock *sk)
+{
+	struct l2cap_chan *chan;
+
+	chan = kzalloc(sizeof(*chan), GFP_ATOMIC);
+	if (!chan)
+		return NULL;
+
+	chan->sk = sk;
+
+	write_lock_bh(&chan_list_lock);
+	list_add(&chan->global_l, &chan_list);
+	write_unlock_bh(&chan_list_lock);
+
+	/* The timer is only initialised here; it is armed elsewhere. */
+	setup_timer(&chan->chan_timer, l2cap_chan_timeout, (unsigned long) chan);
+
+	chan->state = BT_OPEN;
+
+	/* Initial reference, released by l2cap_chan_destroy(). */
+	atomic_set(&chan->refcnt, 1);
+
+	return chan;
+}
+
+/* Unlink @chan from the global channel list and drop one reference;
+ * the channel is freed when that was the last one (see chan_put()). */
+void l2cap_chan_destroy(struct l2cap_chan *chan)
+{
+	write_lock_bh(&chan_list_lock);
+	list_del(&chan->global_l);
+	write_unlock_bh(&chan_list_lock);
+
+	chan_put(chan);
+}
+
+/*
+ * Attach @chan to @conn: assign CIDs and the outgoing MTU according to
+ * the channel type, take a channel reference and link the channel on
+ * conn->chan_l.  Takes no lock itself; l2cap_chan_add() wraps it with
+ * conn->chan_lock held for writing.
+ */
+static void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
+{
+	BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn,
+			chan->psm, chan->dcid);
+
+	conn->disc_reason = 0x13;
+
+	chan->conn = conn;
+
+	switch (chan->chan_type) {
+	case L2CAP_CHAN_CONN_ORIENTED:
+		if (conn->hcon->type == LE_LINK) {
+			/* LE link: fixed data CID in both directions */
+			chan->omtu = L2CAP_LE_DEFAULT_MTU;
+			chan->scid = L2CAP_CID_LE_DATA;
+			chan->dcid = L2CAP_CID_LE_DATA;
+		} else {
+			/* Dynamically allocated source CID */
+			chan->scid = l2cap_alloc_cid(conn);
+			chan->omtu = L2CAP_DEFAULT_MTU;
+		}
+		break;
+
+	case L2CAP_CHAN_CONN_LESS:
+		/* Connectionless channel */
+		chan->scid = L2CAP_CID_CONN_LESS;
+		chan->dcid = L2CAP_CID_CONN_LESS;
+		chan->omtu = L2CAP_DEFAULT_MTU;
+		break;
+
+	default:
+		/* Raw channel: signalling messages only */
+		chan->scid = L2CAP_CID_SIGNALING;
+		chan->dcid = L2CAP_CID_SIGNALING;
+		chan->omtu = L2CAP_DEFAULT_MTU;
+		break;
+	}
+
+	/* Reference owned by the connection's channel list. */
+	chan_hold(chan);
+
+	list_add(&chan->list, &conn->chan_l);
+}
+
+/* Delete channel.
+ * Must be called on the locked socket.
+ *
+ * Detaches @chan from its connection (if any), moves it to BT_CLOSED,
+ * marks the socket zapped, records @err in sk->sk_err when non-zero and
+ * wakes either the listening parent or the socket itself.  Queued
+ * transmit data and ERTM state are only torn down when both
+ * configuration directions had completed. */
+static void l2cap_chan_del(struct l2cap_chan *chan, int err)
+{
+	struct sock *sk = chan->sk;
+	struct l2cap_conn *conn = chan->conn;
+	struct sock *parent = bt_sk(sk)->parent;
+
+	__clear_chan_timer(chan);
+
+	BT_DBG("chan %p, conn %p, err %d", chan, conn, err);
+
+	if (conn) {
+		/* Delete from channel list */
+		write_lock_bh(&conn->chan_lock);
+		list_del(&chan->list);
+		write_unlock_bh(&conn->chan_lock);
+		/* Drops the reference taken in __l2cap_chan_add(). */
+		chan_put(chan);
+
+		chan->conn = NULL;
+		hci_conn_put(conn->hcon);
+	}
+
+	l2cap_state_change(chan, BT_CLOSED);
+	sock_set_flag(sk, SOCK_ZAPPED);
+
+	if (err)
+		sk->sk_err = err;
+
+	if (parent) {
+		/* Not yet accepted: remove from the parent's accept queue
+		 * and wake the parent. */
+		bt_accept_unlink(sk);
+		parent->sk_data_ready(parent, 0);
+	} else
+		sk->sk_state_change(sk);
+
+	/* Nothing more to clean up unless both the output and the input
+	 * configuration were done. */
+	if (!(test_bit(CONF_OUTPUT_DONE, &chan->conf_state) &&
+			test_bit(CONF_INPUT_DONE, &chan->conf_state)))
+		return;
+
+	skb_queue_purge(&chan->tx_q);
+
+	if (chan->mode == L2CAP_MODE_ERTM) {
+		struct srej_list *l, *tmp;
+
+		__clear_retrans_timer(chan);
+		__clear_monitor_timer(chan);
+		__clear_ack_timer(chan);
+
+		skb_queue_purge(&chan->srej_q);
+
+		/* Free every entry still queued on the SREJ list. */
+		list_for_each_entry_safe(l, tmp, &chan->srej_l, list) {
+			list_del(&l->list);
+			kfree(l);
+		}
+	}
+}
+
+/* Drain the accept queue of listening socket @parent: every channel
+ * that was never accepted is closed with ECONNRESET and its owner's
+ * close callback is invoked. */
+static void l2cap_chan_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	BT_DBG("parent %p", parent);
+
+	/* Close not yet accepted channels */
+	while ((sk = bt_accept_dequeue(parent, NULL))) {
+		struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+		__clear_chan_timer(chan);
+		lock_sock(sk);
+		l2cap_chan_close(chan, ECONNRESET);
+		release_sock(sk);
+		/* Callback runs after the child socket lock is released. */
+		chan->ops->close(chan->data);
+	}
+}
+
+/*
+ * Close @chan according to its current state, using @reason as the
+ * error reported on the socket:
+ *   BT_LISTEN            - close pending children, mark closed
+ *   BT_CONNECTED/CONFIG  - send a disconnect request on connection-
+ *                          oriented ACL channels, otherwise delete
+ *   BT_CONNECT2          - reject the pending incoming connection on
+ *                          connection-oriented ACL channels, then delete
+ *   BT_CONNECT/DISCONN   - delete the channel
+ *   anything else        - only mark the socket zapped
+ */
+void l2cap_chan_close(struct l2cap_chan *chan, int reason)
+{
+	struct l2cap_conn *conn = chan->conn;
+	struct sock *sk = chan->sk;
+
+	BT_DBG("chan %p state %d socket %p", chan, chan->state, sk->sk_socket);
+
+	switch (chan->state) {
+	case BT_LISTEN:
+		l2cap_chan_cleanup_listen(sk);
+
+		l2cap_state_change(chan, BT_CLOSED);
+		sock_set_flag(sk, SOCK_ZAPPED);
+		break;
+
+	case BT_CONNECTED:
+	case BT_CONFIG:
+		if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED &&
+					conn->hcon->type == ACL_LINK) {
+			/* Restart the channel timer around the disconnect
+			 * request; the channel is not deleted here. */
+			__clear_chan_timer(chan);
+			__set_chan_timer(chan, sk->sk_sndtimeo);
+			l2cap_send_disconn_req(conn, chan, reason);
+		} else
+			l2cap_chan_del(chan, reason);
+		break;
+
+	case BT_CONNECT2:
+		if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED &&
+					conn->hcon->type == ACL_LINK) {
+			struct l2cap_conn_rsp rsp;
+			__u16 result;
+
+			/* Result code depends on whether setup was deferred
+			 * to user space. */
+			if (bt_sk(sk)->defer_setup)
+				result = L2CAP_CR_SEC_BLOCK;
+			else
+				result = L2CAP_CR_BAD_PSM;
+			l2cap_state_change(chan, BT_DISCONN);
+
+			/* The response carries our dcid as scid and vice
+			 * versa. */
+			rsp.scid   = cpu_to_le16(chan->dcid);
+			rsp.dcid   = cpu_to_le16(chan->scid);
+			rsp.result = cpu_to_le16(result);
+			rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+			l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+							sizeof(rsp), &rsp);
+		}
+
+		l2cap_chan_del(chan, reason);
+		break;
+
+	case BT_CONNECT:
+	case BT_DISCONN:
+		l2cap_chan_del(chan, reason);
+		break;
+
+	default:
+		sock_set_flag(sk, SOCK_ZAPPED);
+		break;
+	}
+}
+
+/*
+ * Map the channel's type, PSM and security level to an HCI
+ * authentication type.
+ *
+ * Side effect: for PSM 0x0001 a sec_level of BT_SECURITY_LOW is
+ * rewritten to BT_SECURITY_SDP before the result is chosen.
+ */
+static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan)
+{
+	if (chan->chan_type == L2CAP_CHAN_RAW) {
+		/* Raw channels use dedicated bonding. */
+		if (chan->sec_level == BT_SECURITY_HIGH)
+			return HCI_AT_DEDICATED_BONDING_MITM;
+		if (chan->sec_level == BT_SECURITY_MEDIUM)
+			return HCI_AT_DEDICATED_BONDING;
+		return HCI_AT_NO_BONDING;
+	}
+
+	if (chan->psm == cpu_to_le16(0x0001)) {
+		/* PSM 0x0001 never bonds. */
+		if (chan->sec_level == BT_SECURITY_LOW)
+			chan->sec_level = BT_SECURITY_SDP;
+
+		return (chan->sec_level == BT_SECURITY_HIGH) ?
+				HCI_AT_NO_BONDING_MITM : HCI_AT_NO_BONDING;
+	}
+
+	/* Everything else uses general bonding. */
+	if (chan->sec_level == BT_SECURITY_HIGH)
+		return HCI_AT_GENERAL_BONDING_MITM;
+	if (chan->sec_level == BT_SECURITY_MEDIUM)
+		return HCI_AT_GENERAL_BONDING;
+	return HCI_AT_NO_BONDING;
+}
+
+/* Service level security: ask the HCI layer whether the link of @chan
+ * satisfies the channel's security level.  Returns the value of
+ * hci_conn_security(). */
+static inline int l2cap_check_security(struct l2cap_chan *chan)
+{
+	/* Must be evaluated in its own statement: l2cap_get_auth_type()
+	 * may adjust chan->sec_level, which is read right below. */
+	__u8 auth_type = l2cap_get_auth_type(chan);
+
+	return hci_conn_security(chan->conn->hcon, chan->sec_level, auth_type);
+}
+
+/* Hand out the next signalling command identifier for @conn, cycling
+ * through 1..128 under conn->lock. */
+static u8 l2cap_get_ident(struct l2cap_conn *conn)
+{
+	u8 ident;
+
+	/* Get next available identifier.
+	 *    1 - 128 are used by kernel.
+	 *  129 - 199 are reserved.
+	 *  200 - 254 are used by utilities like l2ping, etc.
+	 */
+
+	spin_lock_bh(&conn->lock);
+
+	conn->tx_ident++;
+	if (conn->tx_ident > 128)
+		conn->tx_ident = 1;
+
+	ident = conn->tx_ident;
+
+	spin_unlock_bh(&conn->lock);
+
+	return ident;
+}
+
+/* Build a signalling command with l2cap_build_cmd() and send it on the
+ * connection's ACL link.  Silently does nothing if the command could
+ * not be built.  Signalling frames always force the link active. */
+static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data)
+{
+	struct sk_buff *cmd = l2cap_build_cmd(conn, code, ident, len, data);
+	u8 flags;
+
+	BT_DBG("code 0x%2.2x", code);
+
+	if (cmd == NULL)
+		return;
+
+	/* Prefer the non-flushable start flag when the controller
+	 * supports it. */
+	flags = lmp_no_flush_capable(conn->hcon->hdev) ?
+			ACL_START_NO_FLUSH : ACL_START;
+
+	bt_cb(cmd)->force_active = BT_POWER_FORCE_ACTIVE_ON;
+
+	hci_send_acl(conn->hcon, cmd, flags);
+}
+
+/*
+ * Build and send a supervisory frame carrying @control on @chan.
+ *
+ * Nothing is sent unless the channel is BT_CONNECTED.  The frame-type
+ * bit is always set; the Final and Poll bits are added when the
+ * corresponding CONN_SEND_FBIT / CONN_SEND_PBIT flags were pending
+ * (both flags are cleared here).  A CRC16 FCS is appended when the
+ * channel uses L2CAP_FCS_CRC16.  Allocation failure is silently ignored.
+ */
+static inline void l2cap_send_sframe(struct l2cap_chan *chan, u16 control)
+{
+	struct sk_buff *skb;
+	struct l2cap_hdr *lh;
+	struct l2cap_conn *conn = chan->conn;
+	/* Basic header plus the 2-byte control field. */
+	int count, hlen = L2CAP_HDR_SIZE + 2;
+	u8 flags;
+
+	if (chan->state != BT_CONNECTED)
+		return;
+
+	/* Two more bytes for the trailing FCS. */
+	if (chan->fcs == L2CAP_FCS_CRC16)
+		hlen += 2;
+
+	BT_DBG("chan %p, control 0x%2.2x", chan, control);
+
+	/* NOTE(review): the skb is sized with min(conn->mtu, hlen) but hlen
+	 * bytes are appended below -- this assumes conn->mtu >= hlen. */
+	count = min_t(unsigned int, conn->mtu, hlen);
+	control |= L2CAP_CTRL_FRAME_TYPE;
+
+	if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+		control |= L2CAP_CTRL_FINAL;
+
+	if (test_and_clear_bit(CONN_SEND_PBIT, &chan->conn_state))
+		control |= L2CAP_CTRL_POLL;
+
+	skb = bt_skb_alloc(count, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+	/* Length field covers everything after the basic header. */
+	lh->len = cpu_to_le16(hlen - L2CAP_HDR_SIZE);
+	lh->cid = cpu_to_le16(chan->dcid);
+	put_unaligned_le16(control, skb_put(skb, 2));
+
+	if (chan->fcs == L2CAP_FCS_CRC16) {
+		/* CRC over header and control, i.e. all but the FCS itself. */
+		u16 fcs = crc16(0, (u8 *)lh, count - 2);
+		put_unaligned_le16(fcs, skb_put(skb, 2));
+	}
+
+	if (lmp_no_flush_capable(conn->hcon->hdev))
+		flags = ACL_START_NO_FLUSH;
+	else
+		flags = ACL_START;
+
+	bt_cb(skb)->force_active = chan->force_active;
+
+	hci_send_acl(chan->conn->hcon, skb, flags);
+}
+
+/* Send an RNR S-frame when the channel is locally busy (recording that
+ * in CONN_RNR_SENT), otherwise an RR S-frame.  The frame acknowledges
+ * chan->buffer_seq; @control supplies any additional control bits. */
+static inline void l2cap_send_rr_or_rnr(struct l2cap_chan *chan, u16 control)
+{
+	u16 super;
+
+	if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+		set_bit(CONN_RNR_SENT, &chan->conn_state);
+		super = L2CAP_SUPER_RCV_NOT_READY;
+	} else {
+		super = L2CAP_SUPER_RCV_READY;
+	}
+
+	control |= super;
+	control |= chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+
+	l2cap_send_sframe(chan, control);
+}
+
+/* Non-zero when no connect request is outstanding for @chan, i.e. the
+ * CONF_CONNECT_PEND bit is clear. */
+static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan)
+{
+	return !test_bit(CONF_CONNECT_PEND, &chan->conf_state);
+}
+
+/*
+ * Advance connection setup for @chan.
+ *
+ * If the feature-mask information request has not been sent on the
+ * connection yet, send it and arm the info timer.  Otherwise, once that
+ * exchange is marked done, send the connect request -- but only if the
+ * security check passes and no connect request is already pending.
+ */
+static void l2cap_do_start(struct l2cap_chan *chan)
+{
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_conn_req conn_req;
+
+	if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)) {
+		struct l2cap_info_req info_req;
+
+		info_req.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+
+		conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT;
+		conn->info_ident = l2cap_get_ident(conn);
+
+		mod_timer(&conn->info_timer, jiffies +
+					msecs_to_jiffies(L2CAP_INFO_TIMEOUT));
+
+		l2cap_send_cmd(conn, conn->info_ident,
+					L2CAP_INFO_REQ, sizeof(info_req), &info_req);
+		return;
+	}
+
+	/* Information exchange still in flight. */
+	if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
+		return;
+
+	if (!l2cap_check_security(chan) || !__l2cap_no_conn_pending(chan))
+		return;
+
+	conn_req.scid = cpu_to_le16(chan->scid);
+	conn_req.psm  = chan->psm;
+
+	chan->ident = l2cap_get_ident(conn);
+	set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+	l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ,
+					sizeof(conn_req), &conn_req);
+}
+
+/* Return non-zero when @mode (ERTM or streaming) is present in both the
+ * remote @feat_mask and the local feature mask; the local mask only
+ * includes those modes while disable_ertm is zero.  Any other mode
+ * yields 0. */
+static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask)
+{
+	u32 local_mask = l2cap_feat_mask;
+	u32 wanted;
+
+	if (!disable_ertm)
+		local_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING;
+
+	if (mode == L2CAP_MODE_ERTM)
+		wanted = L2CAP_FEAT_ERTM;
+	else if (mode == L2CAP_MODE_STREAMING)
+		wanted = L2CAP_FEAT_STREAMING;
+	else
+		return 0x00;
+
+	return wanted & feat_mask & local_mask;
+}
+
+/*
+ * Send a disconnect request for @chan on @conn, move the channel to
+ * BT_DISCONN and store @err in the socket's sk_err.  For ERTM channels
+ * the retransmission, monitor and ack timers are cleared first.
+ * Does nothing when @conn is NULL.
+ */
+static void l2cap_send_disconn_req(struct l2cap_conn *conn, struct l2cap_chan *chan, int err)
+{
+	struct sock *sk;
+	struct l2cap_disconn_req req;
+
+	if (!conn)
+		return;
+
+	sk = chan->sk;
+
+	if (chan->mode == L2CAP_MODE_ERTM) {
+		__clear_retrans_timer(chan);
+		__clear_monitor_timer(chan);
+		__clear_ack_timer(chan);
+	}
+
+	req.dcid = cpu_to_le16(chan->dcid);
+	req.scid = cpu_to_le16(chan->scid);
+	l2cap_send_cmd(conn, l2cap_get_ident(conn),
+			L2CAP_DISCONN_REQ, sizeof(req), &req);
+
+	l2cap_state_change(chan, BT_DISCONN);
+	sk->sk_err = err;
+}
+
+/* ---- L2CAP connections ---- */
+/*
+ * Walk every connection-oriented channel on @conn and push its setup
+ * forward:
+ *   BT_CONNECT  - send the connect request once security is satisfied
+ *                 and none is pending; close the channel instead if its
+ *                 mode is unsupported and CONF_STATE2_DEVICE is set.
+ *   BT_CONNECT2 - answer the pending incoming connect request and, on
+ *                 success, send the first configuration request.
+ * Each channel's socket is bh-locked while it is processed.
+ */
+static void l2cap_conn_start(struct l2cap_conn *conn)
+{
+	struct l2cap_chan *chan, *tmp;
+
+	BT_DBG("conn %p", conn);
+
+	read_lock(&conn->chan_lock);
+
+	list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+		struct sock *sk = chan->sk;
+
+		bh_lock_sock(sk);
+
+		if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+			bh_unlock_sock(sk);
+			continue;
+		}
+
+		if (chan->state == BT_CONNECT) {
+			struct l2cap_conn_req req;
+
+			/* Not ready yet, or a request is already in flight. */
+			if (!l2cap_check_security(chan) ||
+					!__l2cap_no_conn_pending(chan)) {
+				bh_unlock_sock(sk);
+				continue;
+			}
+
+			if (!l2cap_mode_supported(chan->mode, conn->feat_mask)
+					&& test_bit(CONF_STATE2_DEVICE,
+					&chan->conf_state)) {
+				/* l2cap_chan_close() calls list_del(chan)
+				 * so release the lock */
+				read_unlock(&conn->chan_lock);
+				l2cap_chan_close(chan, ECONNRESET);
+				read_lock(&conn->chan_lock);
+				bh_unlock_sock(sk);
+				continue;
+			}
+
+			req.scid = cpu_to_le16(chan->scid);
+			req.psm  = chan->psm;
+
+			chan->ident = l2cap_get_ident(conn);
+			set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+			l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ,
+							sizeof(req), &req);
+
+		} else if (chan->state == BT_CONNECT2) {
+			struct l2cap_conn_rsp rsp;
+			/* Scratch buffer for the configuration request. */
+			char buf[128];
+			rsp.scid = cpu_to_le16(chan->dcid);
+			rsp.dcid = cpu_to_le16(chan->scid);
+
+			if (l2cap_check_security(chan)) {
+				if (bt_sk(sk)->defer_setup) {
+					/* Authorisation is left to user space:
+					 * report pending and wake the parent. */
+					struct sock *parent = bt_sk(sk)->parent;
+					rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+					rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND);
+					if (parent)
+						parent->sk_data_ready(parent, 0);
+
+				} else {
+					l2cap_state_change(chan, BT_CONFIG);
+					rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+					rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+				}
+			} else {
+				rsp.result = cpu_to_le16(L2CAP_CR_PEND);
+				rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND);
+			}
+
+			l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+							sizeof(rsp), &rsp);
+
+			/* NOTE(review): rsp.result is little-endian while the
+			 * constant is host order; this is only equivalent if
+			 * L2CAP_CR_SUCCESS is 0 -- confirm. */
+			if (test_bit(CONF_REQ_SENT, &chan->conf_state) ||
+					rsp.result != L2CAP_CR_SUCCESS) {
+				bh_unlock_sock(sk);
+				continue;
+			}
+
+			set_bit(CONF_REQ_SENT, &chan->conf_state);
+			l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+						l2cap_build_conf_req(chan, buf), buf);
+			chan->num_conf_req++;
+		}
+
+		bh_unlock_sock(sk);
+	}
+
+	read_unlock(&conn->chan_lock);
+}
+
+/* Find channel with source CID @cid and source bdaddr @src on the
+ * global channel list, optionally restricted to channels in @state
+ * (0 matches any state).
+ * Returns an exact address match if there is one, otherwise the last
+ * channel bound to BDADDR_ANY, otherwise NULL.  The returned channel's
+ * socket is not locked by this function.
+ */
+static struct l2cap_chan *l2cap_global_chan_by_scid(int state, __le16 cid, bdaddr_t *src)
+{
+	struct l2cap_chan *chan, *wildcard = NULL;
+
+	read_lock(&chan_list_lock);
+
+	list_for_each_entry(chan, &chan_list, global_l) {
+		bdaddr_t *bound = &bt_sk(chan->sk)->src;
+
+		if (state && chan->state != state)
+			continue;
+
+		if (chan->scid != cid)
+			continue;
+
+		/* Exact match. */
+		if (!bacmp(bound, src)) {
+			read_unlock(&chan_list_lock);
+			return chan;
+		}
+
+		/* Closest match */
+		if (!bacmp(bound, BDADDR_ANY))
+			wildcard = chan;
+	}
+
+	read_unlock(&chan_list_lock);
+
+	return wildcard;
+}
+
+/*
+ * Handle a newly established LE connection (l2cap_conn_ready() calls
+ * this only for LE links where hcon->out is clear): if a channel is
+ * listening on the LE data CID, create a child channel through the
+ * listener's new_connection callback, attach it to @conn, queue it on
+ * the listener's accept queue and mark it BT_CONNECTED.
+ * Silently returns when there is no listener, the backlog is full or
+ * the child cannot be created.
+ */
+static void l2cap_le_conn_ready(struct l2cap_conn *conn)
+{
+	struct sock *parent, *sk;
+	struct l2cap_chan *chan, *pchan;
+
+	BT_DBG("");
+
+	/* Check if we have socket listening on cid */
+	pchan = l2cap_global_chan_by_scid(BT_LISTEN, L2CAP_CID_LE_DATA,
+							conn->src);
+	if (!pchan)
+		return;
+
+	parent = pchan->sk;
+
+	bh_lock_sock(parent);
+
+	/* Check for backlog size */
+	if (sk_acceptq_is_full(parent)) {
+		BT_DBG("backlog full %d", parent->sk_ack_backlog);
+		goto clean;
+	}
+
+	chan = pchan->ops->new_connection(pchan->data);
+	if (!chan)
+		goto clean;
+
+	sk = chan->sk;
+
+	write_lock_bh(&conn->chan_lock);
+
+	/* Reference on the HCI connection for the new channel; released
+	 * with hci_conn_put() in l2cap_chan_del(). */
+	hci_conn_hold(conn->hcon);
+
+	bacpy(&bt_sk(sk)->src, conn->src);
+	bacpy(&bt_sk(sk)->dst, conn->dst);
+
+	bt_accept_enqueue(parent, sk);
+
+	__l2cap_chan_add(conn, chan);
+
+	__set_chan_timer(chan, sk->sk_sndtimeo);
+
+	l2cap_state_change(chan, BT_CONNECTED);
+	/* Wake the listener so it can accept the new child. */
+	parent->sk_data_ready(parent, 0);
+
+	write_unlock_bh(&conn->chan_lock);
+
+clean:
+	bh_unlock_sock(parent);
+}
+
+/* Mark the channel behind @sk as fully connected: reset its
+ * configuration state, stop the channel timer, switch to BT_CONNECTED
+ * and notify the socket and, if present, its listening parent. */
+static void l2cap_chan_ready(struct sock *sk)
+{
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct sock *parent = bt_sk(sk)->parent;
+
+	BT_DBG("sk %p, parent %p", sk, parent);
+
+	chan->conf_state = 0;
+	__clear_chan_timer(chan);
+
+	l2cap_state_change(chan, BT_CONNECTED);
+	sk->sk_state_change(sk);
+
+	if (parent)
+		parent->sk_data_ready(parent, 0);
+}
+
+/*
+ * Called once the underlying link is up.  For LE links where hcon->out
+ * is clear a listening channel may first spawn a child (see
+ * l2cap_le_conn_ready()).  Then every channel on the connection is
+ * advanced:
+ *   LE link                 - ready once smp_conn_security() allows it
+ *   not connection-oriented - connected immediately
+ *   BT_CONNECT              - start the connect sequence
+ */
+static void l2cap_conn_ready(struct l2cap_conn *conn)
+{
+	struct l2cap_chan *chan;
+
+	BT_DBG("conn %p", conn);
+
+	if (!conn->hcon->out && conn->hcon->type == LE_LINK)
+		l2cap_le_conn_ready(conn);
+
+	read_lock(&conn->chan_lock);
+
+	list_for_each_entry(chan, &conn->chan_l, list) {
+		struct sock *sk = chan->sk;
+
+		bh_lock_sock(sk);
+
+		if (conn->hcon->type == LE_LINK) {
+			if (smp_conn_security(conn, chan->sec_level))
+				l2cap_chan_ready(sk);
+
+		} else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+			__clear_chan_timer(chan);
+			l2cap_state_change(chan, BT_CONNECTED);
+			sk->sk_state_change(sk);
+
+		} else if (chan->state == BT_CONNECT)
+			l2cap_do_start(chan);
+
+		bh_unlock_sock(sk);
+	}
+
+	read_unlock(&conn->chan_lock);
+}
+
+/* Notify sockets that we cannot guarantee reliability anymore: every
+ * channel on @conn with force_reliable set gets @err stored in its
+ * socket's sk_err. */
+static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
+{
+	struct l2cap_chan *chan;
+
+	BT_DBG("conn %p", conn);
+
+	read_lock(&conn->chan_lock);
+
+	list_for_each_entry(chan, &conn->chan_l, list) {
+		if (!chan->force_reliable)
+			continue;
+
+		chan->sk->sk_err = err;
+	}
+
+	read_unlock(&conn->chan_lock);
+}
+
+/* Handler for conn->info_timer; @arg is the l2cap_conn pointer.  Marks
+ * the feature-mask exchange as done, forgets the pending request
+ * identifier and lets the waiting channels proceed via
+ * l2cap_conn_start(). */
+static void l2cap_info_timeout(unsigned long arg)
+{
+	struct l2cap_conn *conn = (void *) arg;
+
+	conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+	conn->info_ident = 0;
+
+	l2cap_conn_start(conn);
+}
+
+/*
+ * Tear down the L2CAP connection attached to @hcon, if any: free a
+ * partially received frame, delete every channel with @err and invoke
+ * its owner's close callback, stop the connection timers, then detach
+ * and free the connection object.
+ */
+static void l2cap_conn_del(struct hci_conn *hcon, int err)
+{
+	struct l2cap_conn *conn = hcon->l2cap_data;
+	struct l2cap_chan *chan, *l;
+	struct sock *sk;
+
+	if (!conn)
+		return;
+
+	BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+	kfree_skb(conn->rx_skb);
+
+	/* Kill channels */
+	list_for_each_entry_safe(chan, l, &conn->chan_l, list) {
+		sk = chan->sk;
+		bh_lock_sock(sk);
+		l2cap_chan_del(chan, err);
+		bh_unlock_sock(sk);
+		/* Callback runs after the socket lock is released. */
+		chan->ops->close(chan->data);
+	}
+
+	/* The info timer can only have been armed once the request was
+	 * sent (see l2cap_do_start()). */
+	if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
+		del_timer_sync(&conn->info_timer);
+
+	/* Non-sync variant: security_timeout() itself calls this function. */
+	if (test_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend))
+		del_timer(&conn->security_timer);
+
+	hcon->l2cap_data = NULL;
+	kfree(conn);
+}
+
+/* Handler for conn->security_timer (set up for LE links in
+ * l2cap_conn_add()); @arg is the l2cap_conn pointer.  Tears the whole
+ * connection down with ETIMEDOUT. */
+static void security_timeout(unsigned long arg)
+{
+	struct l2cap_conn *conn = (void *) arg;
+
+	l2cap_conn_del(conn->hcon, ETIMEDOUT);
+}
+
+/*
+ * Return the L2CAP connection attached to @hcon, creating it if needed.
+ *
+ * An existing connection is returned as is.  When none exists and
+ * @status is non-zero, NULL is returned without allocating.  Otherwise a
+ * new connection is allocated and initialised (MTU taken from the LE or
+ * ACL controller MTU, addresses, locks, channel list and the timer that
+ * matches the link type).  Returns NULL on allocation failure.
+ */
+static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status)
+{
+	struct l2cap_conn *conn = hcon->l2cap_data;
+	struct hci_dev *hdev;
+
+	if (conn || status)
+		return conn;
+
+	conn = kzalloc(sizeof(*conn), GFP_ATOMIC);
+	if (!conn)
+		return NULL;
+
+	hdev = hcon->hdev;
+
+	hcon->l2cap_data = conn;
+	conn->hcon = hcon;
+
+	BT_DBG("hcon %p conn %p", hcon, conn);
+
+	/* Use the LE MTU only on LE links whose controller reports one. */
+	conn->mtu = (hdev->le_mtu && hcon->type == LE_LINK) ?
+			hdev->le_mtu : hdev->acl_mtu;
+
+	conn->src = &hdev->bdaddr;
+	conn->dst = &hcon->dst;
+
+	conn->feat_mask = 0;
+
+	spin_lock_init(&conn->lock);
+	rwlock_init(&conn->chan_lock);
+
+	INIT_LIST_HEAD(&conn->chan_l);
+
+	/* LE links get the security timer, all others the info timer. */
+	if (hcon->type == LE_LINK) {
+		setup_timer(&conn->security_timer, security_timeout,
+						(unsigned long) conn);
+	} else {
+		setup_timer(&conn->info_timer, l2cap_info_timeout,
+						(unsigned long) conn);
+	}
+
+	conn->disc_reason = 0x13;
+
+	return conn;
+}
+
+/* Locked wrapper around __l2cap_chan_add(): attaches @chan to @conn
+ * while holding conn->chan_lock for writing. */
+static inline void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
+{
+	write_lock_bh(&conn->chan_lock);
+	__l2cap_chan_add(conn, chan);
+	write_unlock_bh(&conn->chan_lock);
+}
+
+/* ---- Socket interface ---- */
+
+/* Find channel with PSM @psm and source bdaddr @src on the global
+ * channel list, optionally restricted to channels in @state (0 matches
+ * any state).
+ * Returns an exact address match if there is one, otherwise the last
+ * channel bound to BDADDR_ANY, otherwise NULL.
+ */
+static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr_t *src)
+{
+	struct l2cap_chan *chan, *wildcard = NULL;
+
+	read_lock(&chan_list_lock);
+
+	list_for_each_entry(chan, &chan_list, global_l) {
+		bdaddr_t *bound = &bt_sk(chan->sk)->src;
+
+		if (state && chan->state != state)
+			continue;
+
+		if (chan->psm != psm)
+			continue;
+
+		/* Exact match. */
+		if (!bacmp(bound, src)) {
+			read_unlock(&chan_list_lock);
+			return chan;
+		}
+
+		/* Closest match */
+		if (!bacmp(bound, BDADDR_ANY))
+			wildcard = chan;
+	}
+
+	read_unlock(&chan_list_lock);
+
+	return wildcard;
+}
+
+/*
+ * Start an outgoing connection for @chan using the source/destination
+ * addresses stored in its socket.
+ *
+ * An LE link is requested when the channel's dcid is L2CAP_CID_LE_DATA,
+ * an ACL link otherwise.  The channel is attached to the (possibly new)
+ * L2CAP connection, moved to BT_CONNECT and its timer armed; if the
+ * link is already up the next setup step is taken immediately.
+ *
+ * Returns 0 on success, -EHOSTUNREACH when no route exists, the error
+ * from hci_connect(), or -ENOMEM when no L2CAP connection is available.
+ */
+int l2cap_chan_connect(struct l2cap_chan *chan)
+{
+	struct sock *sk = chan->sk;
+	bdaddr_t *src = &bt_sk(sk)->src;
+	bdaddr_t *dst = &bt_sk(sk)->dst;
+	struct l2cap_conn *conn;
+	struct hci_conn *hcon;
+	struct hci_dev *hdev;
+	__u8 auth_type;
+	int link_type;
+	int err = 0;
+
+	BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst),
+							chan->psm);
+
+	hdev = hci_get_route(dst, src);
+	if (!hdev)
+		return -EHOSTUNREACH;
+
+	hci_dev_lock_bh(hdev);
+
+	/* Computed first: it may adjust chan->sec_level, used just below. */
+	auth_type = l2cap_get_auth_type(chan);
+	link_type = (chan->dcid == L2CAP_CID_LE_DATA) ? LE_LINK : ACL_LINK;
+
+	hcon = hci_connect(hdev, link_type, 0, dst,
+					chan->sec_level, auth_type);
+	if (IS_ERR(hcon)) {
+		err = PTR_ERR(hcon);
+		goto unlock;
+	}
+
+	conn = l2cap_conn_add(hcon, 0);
+	if (!conn) {
+		hci_conn_put(hcon);
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	/* Update source addr of the socket */
+	bacpy(src, conn->src);
+
+	l2cap_chan_add(conn, chan);
+
+	l2cap_state_change(chan, BT_CONNECT);
+	__set_chan_timer(chan, sk->sk_sndtimeo);
+
+	if (hcon->state == BT_CONNECTED) {
+		if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
+			l2cap_do_start(chan);
+		} else {
+			__clear_chan_timer(chan);
+			if (l2cap_check_security(chan))
+				l2cap_state_change(chan, BT_CONNECTED);
+		}
+	}
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+/*
+ * Sleep until all transmitted frames of the channel behind @sk have
+ * been acknowledged or the channel has lost its connection.
+ *
+ * The socket lock is released around each sleep and re-taken
+ * afterwards, so the caller presumably holds it on entry -- confirm at
+ * the call sites.  Returns 0 on success, the signal-derived errno when
+ * interrupted, or the pending socket error.
+ */
+int __l2cap_wait_ack(struct sock *sk)
+{
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	DECLARE_WAITQUEUE(wait, current);
+	int err = 0;
+	int timeo = HZ/5;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (chan->unacked_frames > 0 && chan->conn) {
+		/* Previous slice fully elapsed: start a new one. */
+		if (!timeo)
+			timeo = HZ/5;
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		/* Re-arm the sleeping state before re-checking the loop
+		 * condition. */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		err = sock_error(sk);
+		if (err)
+			break;
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return err;
+}
+
+/* ERTM monitor timer handler; @arg is the l2cap_chan pointer.  Once
+ * retry_count has reached remote_max_tx the channel is disconnected
+ * with ECONNABORTED; otherwise the retry counter is bumped, the monitor
+ * timer re-armed and the peer polled again with an RR/RNR frame. */
+static void l2cap_monitor_timeout(unsigned long arg)
+{
+	struct l2cap_chan *chan = (void *) arg;
+	struct sock *sk = chan->sk;
+
+	BT_DBG("chan %p", chan);
+
+	bh_lock_sock(sk);
+	if (chan->retry_count >= chan->remote_max_tx) {
+		l2cap_send_disconn_req(chan->conn, chan, ECONNABORTED);
+		bh_unlock_sock(sk);
+		return;
+	}
+
+	chan->retry_count++;
+	__set_monitor_timer(chan);
+
+	l2cap_send_rr_or_rnr(chan, L2CAP_CTRL_POLL);
+	bh_unlock_sock(sk);
+}
+
+/* ERTM retransmission timer handler; @arg is the l2cap_chan pointer.
+ * Resets the retry counter to 1, starts the monitor timer, sets
+ * CONN_WAIT_F and polls the peer with an RR/RNR frame. */
+static void l2cap_retrans_timeout(unsigned long arg)
+{
+	struct l2cap_chan *chan = (void *) arg;
+	struct sock *sk = chan->sk;
+
+	BT_DBG("chan %p", chan);
+
+	bh_lock_sock(sk);
+	chan->retry_count = 1;
+	__set_monitor_timer(chan);
+
+	set_bit(CONN_WAIT_F, &chan->conn_state);
+
+	l2cap_send_rr_or_rnr(chan, L2CAP_CTRL_POLL);
+	bh_unlock_sock(sk);
+}
+
+/* Free frames from the head of the transmit queue until the frame
+ * whose tx_seq equals expected_ack_seq is reached, the queue is empty
+ * or no unacknowledged frames remain.  The retransmission timer is
+ * stopped once nothing is left unacknowledged. */
+static void l2cap_drop_acked_frames(struct l2cap_chan *chan)
+{
+	struct sk_buff *head;
+
+	while (chan->unacked_frames) {
+		head = skb_peek(&chan->tx_q);
+		if (head == NULL)
+			break;
+
+		/* First frame the peer has not acknowledged yet. */
+		if (bt_cb(head)->tx_seq == chan->expected_ack_seq)
+			break;
+
+		kfree_skb(skb_dequeue(&chan->tx_q));
+
+		chan->unacked_frames--;
+	}
+
+	if (!chan->unacked_frames)
+		__clear_retrans_timer(chan);
+}
+
+/* Hand @skb to the HCI layer on the channel's link.  The non-flushable
+ * start flag is used when the channel is not flushable and the
+ * controller supports it; the channel's force_active setting is copied
+ * into the skb control block. */
+void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+	struct hci_conn *hcon = chan->conn->hcon;
+	u16 flags;
+
+	BT_DBG("chan %p, skb %p len %d", chan, skb, skb->len);
+
+	flags = (!chan->flushable && lmp_no_flush_capable(hcon->hdev)) ?
+			ACL_START_NO_FLUSH : ACL_START;
+
+	bt_cb(skb)->force_active = chan->force_active;
+	hci_send_acl(hcon, skb, flags);
+}
+
+/* Drain the transmit queue in streaming mode: each frame gets the next
+ * transmit sequence number merged into its control field, its CRC16 FCS
+ * rewritten when the channel uses one, and is then sent.  Sequence
+ * numbers wrap modulo 64. */
+void l2cap_streaming_send(struct l2cap_chan *chan)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&chan->tx_q)) != NULL) {
+		u8 *ctrl_field = skb->data + L2CAP_HDR_SIZE;
+		u16 control = get_unaligned_le16(ctrl_field);
+
+		control |= chan->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT;
+		put_unaligned_le16(control, ctrl_field);
+
+		if (chan->fcs == L2CAP_FCS_CRC16) {
+			/* FCS lives in the last two bytes and covers all
+			 * bytes before it. */
+			u8 *fcs_field = skb->data + skb->len - 2;
+			u16 fcs = crc16(0, (u8 *)skb->data, skb->len - 2);
+
+			put_unaligned_le16(fcs, fcs_field);
+		}
+
+		l2cap_do_send(chan, skb);
+
+		chan->next_tx_seq = (chan->next_tx_seq + 1) % 64;
+	}
+}
+
+/*
+ * Retransmit the single queued I-frame whose sequence number is
+ * @tx_seq.
+ *
+ * Does nothing when the transmit queue is empty or holds no such frame.
+ * If the frame has already been sent remote_max_tx times (and that
+ * limit is non-zero) the channel is disconnected with ECONNABORTED
+ * instead.  If the frame cannot be cloned the retransmission is
+ * skipped and the frame stays queued.
+ */
+static void l2cap_retransmit_one_frame(struct l2cap_chan *chan, u8 tx_seq)
+{
+	struct sk_buff *skb, *tx_skb;
+	u16 control, fcs;
+
+	skb = skb_peek(&chan->tx_q);
+	if (!skb)
+		return;
+
+	/* Locate the frame with the requested sequence number. */
+	do {
+		if (bt_cb(skb)->tx_seq == tx_seq)
+			break;
+
+		if (skb_queue_is_last(&chan->tx_q, skb))
+			return;
+
+	} while ((skb = skb_queue_next(&chan->tx_q, skb)));
+
+	if (chan->remote_max_tx &&
+			bt_cb(skb)->retries == chan->remote_max_tx) {
+		l2cap_send_disconn_req(chan->conn, chan, ECONNABORTED);
+		return;
+	}
+
+	/* skb_clone() can fail under memory pressure; bail out before the
+	 * clone is dereferenced and before the retry is accounted. */
+	tx_skb = skb_clone(skb, GFP_ATOMIC);
+	if (!tx_skb)
+		return;
+
+	bt_cb(skb)->retries++;
+	/* Keep only the SAR bits, then rebuild the rest of the control
+	 * field. */
+	control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
+	control &= L2CAP_CTRL_SAR;
+
+	if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+		control |= L2CAP_CTRL_FINAL;
+
+	control |= (chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT)
+			| (tx_seq << L2CAP_CTRL_TXSEQ_SHIFT);
+
+	put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE);
+
+	if (chan->fcs == L2CAP_FCS_CRC16) {
+		fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2);
+		put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2);
+	}
+
+	l2cap_do_send(chan, tx_skb);
+}
+
+/*
+ * Send as many queued ERTM I-frames as the transmit window allows,
+ * starting at chan->tx_send_head.
+ *
+ * Each frame is cloned, its control field rebuilt (SAR bits kept, Final
+ * bit if pending, current ReqSeq/TxSeq), the FCS refreshed when CRC16 is
+ * in use, and the clone handed to l2cap_do_send(); the original stays on
+ * the queue for possible retransmission.  Sending stops early when a
+ * frame has reached the remote_max_tx retry limit (the channel is then
+ * disconnected with ECONNABORTED) or when a frame cannot be cloned (it
+ * stays at tx_send_head and is picked up by a later call).
+ *
+ * Returns the number of frames sent, or -ENOTCONN when the channel is
+ * not BT_CONNECTED.
+ */
+int l2cap_ertm_send(struct l2cap_chan *chan)
+{
+	struct sk_buff *skb, *tx_skb;
+	u16 control, fcs;
+	int nsent = 0;
+
+	if (chan->state != BT_CONNECTED)
+		return -ENOTCONN;
+
+	while ((skb = chan->tx_send_head) && (!l2cap_tx_window_full(chan))) {
+
+		if (chan->remote_max_tx &&
+				bt_cb(skb)->retries == chan->remote_max_tx) {
+			l2cap_send_disconn_req(chan->conn, chan, ECONNABORTED);
+			break;
+		}
+
+		/* skb_clone() can fail under memory pressure; stop before
+		 * the clone is dereferenced and leave the channel state
+		 * untouched so the frame is retried later. */
+		tx_skb = skb_clone(skb, GFP_ATOMIC);
+		if (!tx_skb)
+			break;
+
+		bt_cb(skb)->retries++;
+
+		control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
+		control &= L2CAP_CTRL_SAR;
+
+		if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state))
+			control |= L2CAP_CTRL_FINAL;
+
+		control |= (chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT)
+				| (chan->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT);
+		put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE);
+
+		/* Compute and store the FCS through the skb that is actually
+		 * transmitted, consistently with the length that is used. */
+		if (chan->fcs == L2CAP_FCS_CRC16) {
+			fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2);
+			put_unaligned_le16(fcs,
+					tx_skb->data + tx_skb->len - 2);
+		}
+
+		l2cap_do_send(chan, tx_skb);
+
+		__set_retrans_timer(chan);
+
+		bt_cb(skb)->tx_seq = chan->next_tx_seq;
+		chan->next_tx_seq = (chan->next_tx_seq + 1) % 64;
+
+		/* Only a first transmission adds an unacknowledged frame. */
+		if (bt_cb(skb)->retries == 1)
+			chan->unacked_frames++;
+
+		chan->frames_sent++;
+
+		if (skb_queue_is_last(&chan->tx_q, skb))
+			chan->tx_send_head = NULL;
+		else
+			chan->tx_send_head = skb_queue_next(&chan->tx_q, skb);
+
+		nsent++;
+	}
+
+	return nsent;
+}
+
+/*
+ * Rewind the send pointer to the head of tx_q (when the queue is not empty)
+ * and the next TxSeq to expected_ack_seq, then resend through
+ * l2cap_ertm_send().  Returns whatever l2cap_ertm_send() returns.
+ */
+static int l2cap_retransmit_frames(struct l2cap_chan *chan)
+{
+	struct sk_buff_head *queue = &chan->tx_q;
+
+	if (!skb_queue_empty(queue))
+		chan->tx_send_head = queue->next;
+
+	chan->next_tx_seq = chan->expected_ack_seq;
+
+	return l2cap_ertm_send(chan);
+}
+
+/*
+ * Acknowledge received frames up to chan->buffer_seq.
+ *
+ * While CONN_LOCAL_BUSY is set an RNR S-frame is sent and CONN_RNR_SENT is
+ * recorded.  Otherwise pending I-frames are tried first via
+ * l2cap_ertm_send(); only if none went out is an explicit RR S-frame sent.
+ */
+static void l2cap_send_ack(struct l2cap_chan *chan)
+{
+	u16 control = chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+
+	if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+		/* At least one I-frame went out: no separate S-frame */
+		if (l2cap_ertm_send(chan) > 0)
+			return;
+
+		control |= L2CAP_SUPER_RCV_READY;
+	} else {
+		control |= L2CAP_SUPER_RCV_NOT_READY;
+		set_bit(CONN_RNR_SENT, &chan->conn_state);
+	}
+
+	l2cap_send_sframe(chan, control);
+}
+
+/*
+ * Send an SREJ S-frame with the F-bit set whose ReqSeq is the tx_seq of the
+ * last entry on chan->srej_l.
+ */
+static void l2cap_send_srejtail(struct l2cap_chan *chan)
+{
+	struct srej_list *last;
+	u16 control = L2CAP_SUPER_SELECT_REJECT | L2CAP_CTRL_FINAL;
+
+	last = list_entry(chan->srej_l.prev, struct srej_list, list);
+	control |= last->tx_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+
+	l2cap_send_sframe(chan, control);
+}
+
+/*
+ * Copy @len bytes of user data from @msg into @skb.
+ *
+ * The first @count bytes go into @skb itself; the remainder is split into
+ * continuation fragments of at most conn->mtu bytes each, chained on
+ * @skb's frag_list.
+ *
+ * Returns the number of bytes copied, -EFAULT if a copy from the iovec
+ * fails, or the error reported by bt_skb_send_alloc() if a fragment cannot
+ * be allocated.
+ */
+static inline int l2cap_skbuff_fromiovec(struct sock *sk, struct msghdr *msg, int len, int count, struct sk_buff *skb)
+{
+	struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
+	struct sk_buff **link;
+	int err, copied;
+
+	if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count))
+		return -EFAULT;
+
+	copied = count;
+	len -= count;
+
+	/* Continuation fragments (no L2CAP header) */
+	for (link = &skb_shinfo(skb)->frag_list; len; link = &(*link)->next) {
+		count = min_t(unsigned int, conn->mtu, len);
+
+		*link = bt_skb_send_alloc(sk, count,
+				msg->msg_flags & MSG_DONTWAIT, &err);
+		if (!*link)
+			return err;
+
+		if (memcpy_fromiovec(skb_put(*link, count), msg->msg_iov, count))
+			return -EFAULT;
+
+		copied += count;
+		len -= count;
+	}
+
+	return copied;
+}
+
+/*
+ * Build a connectionless PDU: L2CAP header, 2-byte PSM, then @len bytes of
+ * payload taken from @msg.
+ *
+ * Returns the new skb, or an ERR_PTR() carrying the allocation or copy
+ * error.
+ */
+struct sk_buff *l2cap_create_connless_pdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = chan->sk;
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_hdr *hdr;
+	struct sk_buff *skb;
+	int err, first_len;
+	int overhead = L2CAP_HDR_SIZE + 2;
+
+	BT_DBG("sk %p len %d", sk, (int)len);
+
+	/* The first buffer holds the headers plus as much payload as fits */
+	first_len = min_t(unsigned int, (conn->mtu - overhead), len);
+	skb = bt_skb_send_alloc(sk, first_len + overhead,
+			msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return ERR_PTR(err);
+
+	/* Create L2CAP header */
+	hdr = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+	hdr->cid = cpu_to_le16(chan->dcid);
+	hdr->len = cpu_to_le16(len + (overhead - L2CAP_HDR_SIZE));
+	put_unaligned_le16(chan->psm, skb_put(skb, 2));
+
+	err = l2cap_skbuff_fromiovec(sk, msg, len, first_len, skb);
+	if (err >= 0)
+		return skb;
+
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+/*
+ * Build a basic-mode PDU: L2CAP header followed by @len bytes of payload
+ * taken from @msg.
+ *
+ * Returns the new skb, or an ERR_PTR() carrying the allocation or copy
+ * error.
+ */
+struct sk_buff *l2cap_create_basic_pdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = chan->sk;
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_hdr *hdr;
+	struct sk_buff *skb;
+	int err, first_len;
+	int overhead = L2CAP_HDR_SIZE;
+
+	BT_DBG("sk %p len %d", sk, (int)len);
+
+	/* The first buffer holds the header plus as much payload as fits */
+	first_len = min_t(unsigned int, (conn->mtu - overhead), len);
+	skb = bt_skb_send_alloc(sk, first_len + overhead,
+			msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return ERR_PTR(err);
+
+	/* Create L2CAP header */
+	hdr = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+	hdr->cid = cpu_to_le16(chan->dcid);
+	hdr->len = cpu_to_le16(len + (overhead - L2CAP_HDR_SIZE));
+
+	err = l2cap_skbuff_fromiovec(sk, msg, len, first_len, skb);
+	if (err >= 0)
+		return skb;
+
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+/*
+ * Build an I-frame PDU for ERTM/streaming mode.
+ *
+ * Layout: L2CAP header, 2-byte @control, optional 2-byte @sdulen (only when
+ * @sdulen is non-zero), @len bytes of payload from @msg, and a 2-byte FCS
+ * placeholder (written as 0) when the channel uses CRC16.  The skb's retry
+ * counter starts at 0.
+ *
+ * Returns the new skb, ERR_PTR(-ENOTCONN) if the channel has no
+ * connection, or an ERR_PTR() carrying the allocation or copy error.
+ */
+struct sk_buff *l2cap_create_iframe_pdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len, u16 control, u16 sdulen)
+{
+	struct sock *sk = chan->sk;
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_hdr *hdr;
+	struct sk_buff *skb;
+	int err, first_len;
+	int overhead = L2CAP_HDR_SIZE + 2;
+
+	BT_DBG("sk %p len %d", sk, (int)len);
+
+	if (!conn)
+		return ERR_PTR(-ENOTCONN);
+
+	/* Optional SDU length field and FCS trailer add two bytes each */
+	if (sdulen)
+		overhead += 2;
+
+	if (chan->fcs == L2CAP_FCS_CRC16)
+		overhead += 2;
+
+	first_len = min_t(unsigned int, (conn->mtu - overhead), len);
+	skb = bt_skb_send_alloc(sk, first_len + overhead,
+			msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return ERR_PTR(err);
+
+	/* Create L2CAP header */
+	hdr = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+	hdr->cid = cpu_to_le16(chan->dcid);
+	hdr->len = cpu_to_le16(len + (overhead - L2CAP_HDR_SIZE));
+	put_unaligned_le16(control, skb_put(skb, 2));
+	if (sdulen)
+		put_unaligned_le16(sdulen, skb_put(skb, 2));
+
+	err = l2cap_skbuff_fromiovec(sk, msg, len, first_len, skb);
+	if (err < 0) {
+		kfree_skb(skb);
+		return ERR_PTR(err);
+	}
+
+	/* Reserve the FCS field; the real value is filled in at send time */
+	if (chan->fcs == L2CAP_FCS_CRC16)
+		put_unaligned_le16(0, skb_put(skb, 2));
+
+	bt_cb(skb)->retries = 0;
+	return skb;
+}
+
+/*
+ * Split an SDU of @len bytes into I-frames of at most chan->remote_mps
+ * bytes and append them to chan->tx_q.
+ *
+ * The first frame is tagged SDU_START and carries the total SDU length,
+ * middle frames are SDU_CONTINUE and the last is SDU_END.  Frames are
+ * collected on a local queue so that a failure part-way through leaves
+ * chan->tx_q untouched.
+ *
+ * The unsigned subtraction of remote_mps from @len presumes
+ * @len >= remote_mps; the caller visible in this file only gets here when
+ * len > remote_mps.
+ *
+ * Returns the number of payload bytes queued, or a negative errno from
+ * l2cap_create_iframe_pdu().
+ */
+int l2cap_sar_segment_sdu(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
+{
+	struct sk_buff_head pending;
+	struct sk_buff *skb;
+	size_t queued;
+
+	skb_queue_head_init(&pending);
+
+	skb = l2cap_create_iframe_pdu(chan, msg, chan->remote_mps,
+						L2CAP_SDU_START, len);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	__skb_queue_tail(&pending, skb);
+	len -= chan->remote_mps;
+	queued = chan->remote_mps;
+
+	while (len > 0) {
+		u16 control = L2CAP_SDU_END;
+		size_t buflen = len;
+
+		/* More than one PDU still to go: this is a middle segment */
+		if (len > chan->remote_mps) {
+			control = L2CAP_SDU_CONTINUE;
+			buflen = chan->remote_mps;
+		}
+
+		skb = l2cap_create_iframe_pdu(chan, msg, buflen, control, 0);
+		if (IS_ERR(skb)) {
+			skb_queue_purge(&pending);
+			return PTR_ERR(skb);
+		}
+
+		__skb_queue_tail(&pending, skb);
+		len -= buflen;
+		queued += buflen;
+	}
+
+	skb_queue_splice_tail(&pending, &chan->tx_q);
+	if (chan->tx_send_head == NULL)
+		chan->tx_send_head = pending.next;
+
+	return queued;
+}
+
+/*
+ * Send @len bytes from @msg on @chan, dispatching on channel type and mode.
+ *
+ *  - Connectionless channels: one connectionless PDU is built and sent.
+ *  - Basic mode: @len is checked against chan->omtu, then one basic PDU is
+ *    built and sent.
+ *  - ERTM / streaming mode: the SDU becomes a single unsegmented I-frame if
+ *    it fits in chan->remote_mps, otherwise it is segmented.  The frames
+ *    are queued on chan->tx_q and then pushed out by the mode's send
+ *    routine.
+ *
+ * Returns @len on success, -EMSGSIZE if a basic-mode SDU exceeds omtu,
+ * -EBADFD for an unknown mode, or the negative errno from PDU creation or
+ * l2cap_ertm_send().
+ */
+int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
+{
+	struct sk_buff *skb;
+	u16 control;
+	int err;
+
+	/* Connectionless channel */
+	if (chan->chan_type == L2CAP_CHAN_CONN_LESS) {
+		skb = l2cap_create_connless_pdu(chan, msg, len);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		l2cap_do_send(chan, skb);
+		return len;
+	}
+
+	switch (chan->mode) {
+	case L2CAP_MODE_BASIC:
+		/* Check outgoing MTU */
+		if (len > chan->omtu)
+			return -EMSGSIZE;
+
+		/* Create a basic PDU */
+		skb = l2cap_create_basic_pdu(chan, msg, len);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		l2cap_do_send(chan, skb);
+		err = len;
+		break;
+
+	case L2CAP_MODE_ERTM:
+	case L2CAP_MODE_STREAMING:
+		/* Entire SDU fits into one PDU */
+		if (len <= chan->remote_mps) {
+			control = L2CAP_SDU_UNSEGMENTED;
+			skb = l2cap_create_iframe_pdu(chan, msg, len, control,
+									0);
+			if (IS_ERR(skb))
+				return PTR_ERR(skb);
+
+			__skb_queue_tail(&chan->tx_q, skb);
+
+			/* Nothing was waiting to be sent: start from this frame */
+			if (chan->tx_send_head == NULL)
+				chan->tx_send_head = skb;
+
+		} else {
+			/* Segment SDU into multiples PDUs */
+			err = l2cap_sar_segment_sdu(chan, msg, len);
+			if (err < 0)
+				return err;
+		}
+
+		if (chan->mode == L2CAP_MODE_STREAMING) {
+			l2cap_streaming_send(chan);
+			err = len;
+			break;
+		}
+
+		/* With both CONN_REMOTE_BUSY and CONN_WAIT_F set the frames
+		 * stay queued; the data is still reported as accepted. */
+		if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state) &&
+				test_bit(CONN_WAIT_F, &chan->conn_state)) {
+			err = len;
+			break;
+		}
+
+		/* A non-negative frame count is reported to the caller as
+		 * the full SDU length. */
+		err = l2cap_ertm_send(chan);
+		if (err >= 0)
+			err = len;
+
+		break;
+
+	default:
+		BT_DBG("bad state %1.1x", chan->mode);
+		err = -EBADFD;
+	}
+
+	return err;
+}
+
+/*
+ * Deliver a clone of @skb to every raw channel on @conn except the one
+ * whose socket the frame came from.  A clone is freed here when the
+ * channel's recv callback returns non-zero; channels for which cloning
+ * fails are skipped.
+ */
+static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct l2cap_chan *chan;
+
+	BT_DBG("conn %p", conn);
+
+	read_lock(&conn->chan_lock);
+	list_for_each_entry(chan, &conn->chan_l, list) {
+		struct sk_buff *copy;
+
+		/* Only raw channels; don't send frame to the socket it came from */
+		if (chan->chan_type != L2CAP_CHAN_RAW || skb->sk == chan->sk)
+			continue;
+
+		copy = skb_clone(skb, GFP_ATOMIC);
+		if (copy && chan->ops->recv(chan->data, copy))
+			kfree_skb(copy);
+	}
+	read_unlock(&conn->chan_lock);
+}
+
+/* ---- L2CAP signalling commands ---- */
+/*
+ * Build a signalling command skb: L2CAP header (signalling CID chosen by
+ * link type), command header (@code, @ident, @dlen), then @dlen bytes of
+ * @data.  Anything that does not fit within conn->mtu in the first buffer
+ * is placed in continuation fragments chained on frag_list.
+ *
+ * Allocations use GFP_ATOMIC.  Returns the skb, or NULL if any allocation
+ * fails.
+ *
+ * NOTE(review): when @dlen is non-zero, count has both header sizes
+ * subtracted; this looks like it assumes conn->mtu is at least
+ * L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE - confirm against where mtu is set.
+ */
+static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
+				u8 code, u8 ident, u16 dlen, void *data)
+{
+	struct sk_buff *skb, **frag;
+	struct l2cap_cmd_hdr *cmd;
+	struct l2cap_hdr *lh;
+	int len, count;
+
+	BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %d",
+			conn, code, ident, dlen);
+
+	/* Total wire length, and the part of it that goes in the first buffer */
+	len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen;
+	count = min_t(unsigned int, conn->mtu, len);
+
+	skb = bt_skb_alloc(count, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+	lh->len = cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen);
+
+	if (conn->hcon->type == LE_LINK)
+		lh->cid = cpu_to_le16(L2CAP_CID_LE_SIGNALING);
+	else
+		lh->cid = cpu_to_le16(L2CAP_CID_SIGNALING);
+
+	cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE);
+	cmd->code  = code;
+	cmd->ident = ident;
+	cmd->len   = cpu_to_le16(dlen);
+
+	if (dlen) {
+		/* Payload room left in the first buffer after both headers */
+		count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE;
+		memcpy(skb_put(skb, count), data, count);
+		/* Arithmetic on void * (GNU C extension): advances by bytes */
+		data += count;
+	}
+
+	/* Bytes still to be carried by continuation fragments */
+	len -= skb->len;
+
+	/* Continuation fragments (no L2CAP header) */
+	frag = &skb_shinfo(skb)->frag_list;
+	while (len) {
+		count = min_t(unsigned int, conn->mtu, len);
+
+		*frag = bt_skb_alloc(count, GFP_ATOMIC);
+		if (!*frag)
+			goto fail;
+
+		memcpy(skb_put(*frag, count), data, count);
+
+		len  -= count;
+		data += count;
+
+		frag = &(*frag)->next;
+	}
+
+	return skb;
+
+fail:
+	kfree_skb(skb);
+	return NULL;
+}
+
+/*
+ * Decode the configuration option at *@ptr and advance *@ptr past it.
+ *
+ * @type and @olen receive the option's type and payload length.  @val
+ * receives the decoded value for 1-, 2- and 4-byte options; for any other
+ * length it receives the address of the option payload cast to unsigned
+ * long.
+ *
+ * Returns the number of bytes consumed (option header plus payload).  No
+ * bounds checking is done against the enclosing buffer here.
+ */
+static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, unsigned long *val)
+{
+	struct l2cap_conf_opt *opt = *ptr;
+	int consumed = L2CAP_CONF_OPT_SIZE + opt->len;
+
+	*ptr += consumed;
+
+	*type = opt->type;
+	*olen = opt->len;
+
+	if (opt->len == 1)
+		*val = *((u8 *) opt->val);
+	else if (opt->len == 2)
+		*val = get_unaligned_le16(opt->val);
+	else if (opt->len == 4)
+		*val = get_unaligned_le32(opt->val);
+	else
+		/* Larger options are handed back by reference */
+		*val = (unsigned long) opt->val;
+
+	BT_DBG("type 0x%2.2x len %d val 0x%lx", *type, opt->len, *val);
+	return consumed;
+}
+
+/*
+ * Append one configuration option at *@ptr and advance *@ptr past it.
+ *
+ * For @len of 1, 2 or 4, @val is the value itself (stored little-endian
+ * for 2 and 4).  For any other @len, @val is treated as a pointer to @len
+ * bytes that are copied into the option.  No bounds checking is done
+ * against the destination buffer here.
+ */
+static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
+{
+	struct l2cap_conf_opt *opt = *ptr;
+
+	BT_DBG("type 0x%2.2x len %d val 0x%lx", type, len, val);
+
+	opt->type = type;
+	opt->len  = len;
+
+	if (len == 1)
+		*((u8 *) opt->val)  = val;
+	else if (len == 2)
+		put_unaligned_le16(val, opt->val);
+	else if (len == 4)
+		put_unaligned_le32(val, opt->val);
+	else
+		/* val carries the address of the option payload */
+		memcpy(opt->val, (void *) val, len);
+
+	*ptr += L2CAP_CONF_OPT_SIZE + len;
+}
+
+/*
+ * Timer callback for chan->ack_timer: @arg is the channel pointer.  Sends
+ * an acknowledgement with the channel's socket bh-locked.
+ */
+static void l2cap_ack_timeout(unsigned long arg)
+{
+	struct l2cap_chan *chan = (struct l2cap_chan *) arg;
+	struct sock *sk = chan->sk;
+
+	bh_lock_sock(sk);
+	l2cap_send_ack(chan);
+	bh_unlock_sock(sk);
+}
+
+/*
+ * Reset the ERTM counters of @chan, set up its retransmission, monitor and
+ * ack timers, initialise the SREJ queue and list, and route the socket's
+ * backlog receive handler to l2cap_ertm_data_rcv.
+ */
+static inline void l2cap_ertm_init(struct l2cap_chan *chan)
+{
+	unsigned long cookie = (unsigned long) chan;
+
+	/* Sequence and acknowledgement bookkeeping starts from zero */
+	chan->expected_ack_seq = 0;
+	chan->unacked_frames = 0;
+	chan->buffer_seq = 0;
+	chan->num_acked = 0;
+	chan->frames_sent = 0;
+
+	/* Every timer callback receives the channel as its argument */
+	setup_timer(&chan->retrans_timer, l2cap_retrans_timeout, cookie);
+	setup_timer(&chan->monitor_timer, l2cap_monitor_timeout, cookie);
+	setup_timer(&chan->ack_timer, l2cap_ack_timeout, cookie);
+
+	skb_queue_head_init(&chan->srej_q);
+	INIT_LIST_HEAD(&chan->srej_l);
+
+	chan->sk->sk_backlog_rcv = l2cap_ertm_data_rcv;
+}
+
+/*
+ * Return @mode if it is ERTM or streaming and l2cap_mode_supported() says
+ * the remote feature mask allows it; otherwise fall back to basic mode.
+ */
+static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
+{
+	if ((mode == L2CAP_MODE_STREAMING || mode == L2CAP_MODE_ERTM) &&
+			l2cap_mode_supported(mode, remote_feat_mask))
+		return mode;
+
+	return L2CAP_MODE_BASIC;
+}
+
+/*
+ * Build a Configure Request for @chan into @data (laid out as a
+ * struct l2cap_conf_req followed by options).
+ *
+ * On the first request of a negotiation (no requests or responses counted
+ * yet) the channel mode may be downgraded through l2cap_select_mode(),
+ * unless the channel is ERTM/streaming with CONF_STATE2_DEVICE set.  The
+ * request then carries an MTU option when imtu differs from the default,
+ * an RFC option describing the chosen mode, and an FCS option when the
+ * channel ends up using no FCS and the remote feature mask has
+ * L2CAP_FEAT_FCS.
+ *
+ * Returns the number of bytes written to @data.  The caller supplies a
+ * buffer that is large enough; no bounds checking is done here.
+ */
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data)
+{
+	struct l2cap_conf_req *req = data;
+	struct l2cap_conf_rfc rfc = { .mode = chan->mode };
+	void *ptr = req->data;
+
+	BT_DBG("chan %p", chan);
+
+	/* Mode selection happens only before any request/response exchange */
+	if (chan->num_conf_req || chan->num_conf_rsp)
+		goto done;
+
+	switch (chan->mode) {
+	case L2CAP_MODE_STREAMING:
+	case L2CAP_MODE_ERTM:
+		if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state))
+			break;
+
+		/* fall through */
+	default:
+		chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask);
+		break;
+	}
+
+done:
+	if (chan->imtu != L2CAP_DEFAULT_MTU)
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+
+	switch (chan->mode) {
+	case L2CAP_MODE_BASIC:
+		/* No RFC option when the remote feature mask shows neither
+		 * ERTM nor streaming */
+		if (!(chan->conn->feat_mask & L2CAP_FEAT_ERTM) &&
+				!(chan->conn->feat_mask & L2CAP_FEAT_STREAMING))
+			break;
+
+		rfc.mode            = L2CAP_MODE_BASIC;
+		rfc.txwin_size      = 0;
+		rfc.max_transmit    = 0;
+		rfc.retrans_timeout = 0;
+		rfc.monitor_timeout = 0;
+		rfc.max_pdu_size    = 0;
+
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+							(unsigned long) &rfc);
+		break;
+
+	case L2CAP_MODE_ERTM:
+		rfc.mode            = L2CAP_MODE_ERTM;
+		rfc.txwin_size      = chan->tx_win;
+		rfc.max_transmit    = chan->max_tx;
+		rfc.retrans_timeout = 0;
+		rfc.monitor_timeout = 0;
+		rfc.max_pdu_size    = cpu_to_le16(L2CAP_DEFAULT_MAX_PDU_SIZE);
+		/* Cap the PDU size at the connection MTU minus 10 bytes */
+		if (L2CAP_DEFAULT_MAX_PDU_SIZE > chan->conn->mtu - 10)
+			rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10);
+
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+							(unsigned long) &rfc);
+
+		if (!(chan->conn->feat_mask & L2CAP_FEAT_FCS))
+			break;
+
+		/* Ask for no FCS if we don't use one or the peer asked for none */
+		if (chan->fcs == L2CAP_FCS_NONE ||
+				test_bit(CONF_NO_FCS_RECV, &chan->conf_state)) {
+			chan->fcs = L2CAP_FCS_NONE;
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, chan->fcs);
+		}
+		break;
+
+	case L2CAP_MODE_STREAMING:
+		rfc.mode            = L2CAP_MODE_STREAMING;
+		rfc.txwin_size      = 0;
+		rfc.max_transmit    = 0;
+		rfc.retrans_timeout = 0;
+		rfc.monitor_timeout = 0;
+		rfc.max_pdu_size    = cpu_to_le16(L2CAP_DEFAULT_MAX_PDU_SIZE);
+		/* Cap the PDU size at the connection MTU minus 10 bytes */
+		if (L2CAP_DEFAULT_MAX_PDU_SIZE > chan->conn->mtu - 10)
+			rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10);
+
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+							(unsigned long) &rfc);
+
+		if (!(chan->conn->feat_mask & L2CAP_FEAT_FCS))
+			break;
+
+		/* Ask for no FCS if we don't use one or the peer asked for none */
+		if (chan->fcs == L2CAP_FCS_NONE ||
+				test_bit(CONF_NO_FCS_RECV, &chan->conf_state)) {
+			chan->fcs = L2CAP_FCS_NONE;
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, chan->fcs);
+		}
+		break;
+	}
+
+	req->dcid  = cpu_to_le16(chan->dcid);
+	req->flags = cpu_to_le16(0);
+
+	return ptr - data;
+}
+
+/*
+ * Parse the Configure Request accumulated in chan->conf_req (conf_len
+ * bytes) and build the matching Configure Response into @data (laid out
+ * as a struct l2cap_conf_rsp followed by options).
+ *
+ * Recognised options are MTU, flush timeout, QoS (ignored), RFC and FCS.
+ * An unknown option that is not a hint sets the result to
+ * L2CAP_CONF_UNKNOWN and its type byte is echoed in the response.  On the
+ * first exchange the channel mode may be adjusted to what the peer asked
+ * for; a remaining mismatch yields L2CAP_CONF_UNACCEPT with our mode in an
+ * RFC option.
+ *
+ * Returns the number of bytes written to @data, or -ECONNREFUSED when the
+ * modes cannot be reconciled.  The caller supplies a buffer that is large
+ * enough; no bounds checking is done here.
+ */
+static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data)
+{
+	struct l2cap_conf_rsp *rsp = data;
+	void *ptr = rsp->data;
+	void *req = chan->conf_req;
+	int len = chan->conf_len;
+	int type, hint, olen;
+	unsigned long val;
+	struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+	u16 mtu = L2CAP_DEFAULT_MTU;
+	u16 result = L2CAP_CONF_SUCCESS;
+
+	BT_DBG("chan %p", chan);
+
+	while (len >= L2CAP_CONF_OPT_SIZE) {
+		len -= l2cap_get_conf_opt(&req, &type, &olen, &val);
+
+		hint  = type & L2CAP_CONF_HINT;
+		type &= L2CAP_CONF_MASK;
+
+		switch (type) {
+		case L2CAP_CONF_MTU:
+			mtu = val;
+			break;
+
+		case L2CAP_CONF_FLUSH_TO:
+			chan->flush_to = val;
+			break;
+
+		case L2CAP_CONF_QOS:
+			break;
+
+		case L2CAP_CONF_RFC:
+			/* val is the payload address for multi-byte options */
+			if (olen == sizeof(rfc))
+				memcpy(&rfc, (void *) val, olen);
+			break;
+
+		case L2CAP_CONF_FCS:
+			if (val == L2CAP_FCS_NONE)
+				set_bit(CONF_NO_FCS_RECV, &chan->conf_state);
+
+			break;
+
+		default:
+			/* Unknown hints are silently ignored */
+			if (hint)
+				break;
+
+			result = L2CAP_CONF_UNKNOWN;
+			*((u8 *) ptr++) = type;
+			break;
+		}
+	}
+
+	/* Mode adjustment is only allowed on the first exchange */
+	if (chan->num_conf_rsp || chan->num_conf_req > 1)
+		goto done;
+
+	switch (chan->mode) {
+	case L2CAP_MODE_STREAMING:
+	case L2CAP_MODE_ERTM:
+		if (!test_bit(CONF_STATE2_DEVICE, &chan->conf_state)) {
+			chan->mode = l2cap_select_mode(rfc.mode,
+					chan->conn->feat_mask);
+			break;
+		}
+
+		if (chan->mode != rfc.mode)
+			return -ECONNREFUSED;
+
+		break;
+	}
+
+done:
+	if (chan->mode != rfc.mode) {
+		result = L2CAP_CONF_UNACCEPT;
+		rfc.mode = chan->mode;
+
+		if (chan->num_conf_rsp == 1)
+			return -ECONNREFUSED;
+
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+					sizeof(rfc), (unsigned long) &rfc);
+	}
+
+
+	if (result == L2CAP_CONF_SUCCESS) {
+		/* Configure output options and let the other side know
+		 * which ones we don't like. */
+
+		if (mtu < L2CAP_DEFAULT_MIN_MTU)
+			result = L2CAP_CONF_UNACCEPT;
+		else {
+			chan->omtu = mtu;
+			set_bit(CONF_MTU_DONE, &chan->conf_state);
+		}
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu);
+
+		switch (rfc.mode) {
+		case L2CAP_MODE_BASIC:
+			chan->fcs = L2CAP_FCS_NONE;
+			set_bit(CONF_MODE_DONE, &chan->conf_state);
+			break;
+
+		case L2CAP_MODE_ERTM:
+			chan->remote_tx_win = rfc.txwin_size;
+			chan->remote_max_tx = rfc.max_transmit;
+
+			if (le16_to_cpu(rfc.max_pdu_size) > chan->conn->mtu - 10)
+				rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10);
+
+			chan->remote_mps = le16_to_cpu(rfc.max_pdu_size);
+
+			/* Host-order constants going into little-endian wire
+			 * fields: the conversion is cpu_to_le16(). */
+			rfc.retrans_timeout =
+				cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+			rfc.monitor_timeout =
+				cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
+
+			set_bit(CONF_MODE_DONE, &chan->conf_state);
+
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+					sizeof(rfc), (unsigned long) &rfc);
+
+			break;
+
+		case L2CAP_MODE_STREAMING:
+			if (le16_to_cpu(rfc.max_pdu_size) > chan->conn->mtu - 10)
+				rfc.max_pdu_size = cpu_to_le16(chan->conn->mtu - 10);
+
+			chan->remote_mps = le16_to_cpu(rfc.max_pdu_size);
+
+			set_bit(CONF_MODE_DONE, &chan->conf_state);
+
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+					sizeof(rfc), (unsigned long) &rfc);
+
+			break;
+
+		default:
+			result = L2CAP_CONF_UNACCEPT;
+
+			memset(&rfc, 0, sizeof(rfc));
+			rfc.mode = chan->mode;
+		}
+
+		if (result == L2CAP_CONF_SUCCESS)
+			set_bit(CONF_OUTPUT_DONE, &chan->conf_state);
+	}
+	rsp->scid   = cpu_to_le16(chan->dcid);
+	rsp->result = cpu_to_le16(result);
+	rsp->flags  = cpu_to_le16(0x0000);
+
+	return ptr - data;
+}
+
+/*
+ * Parse the options of a Configure Response (@rsp, @len bytes) and build
+ * a follow-up Configure Request into @data (laid out as a
+ * struct l2cap_conf_req followed by options).
+ *
+ * MTU, flush timeout and RFC options from the response are applied to the
+ * channel and echoed into the new request.  An MTU below the minimum sets
+ * *@result to L2CAP_CONF_UNACCEPT.  When *@result is still
+ * L2CAP_CONF_SUCCESS afterwards, the ERTM/streaming parameters from the
+ * RFC option are stored on the channel.
+ *
+ * Returns the number of bytes written to @data, or -ECONNREFUSED on a
+ * mode mismatch that cannot be accepted.  The caller supplies a buffer
+ * that is large enough; no bounds checking is done here.
+ */
+static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, void *data, u16 *result)
+{
+	struct l2cap_conf_req *req = data;
+	void *ptr = req->data;
+	int type, olen;
+	unsigned long val;
+	/* Start from a defined value: the peer may send no RFC option, or
+	 * one of the wrong length, and rfc.mode is read after the loop
+	 * either way. */
+	struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
+
+	BT_DBG("chan %p, rsp %p, len %d, req %p", chan, rsp, len, data);
+
+	while (len >= L2CAP_CONF_OPT_SIZE) {
+		len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+
+		switch (type) {
+		case L2CAP_CONF_MTU:
+			if (val < L2CAP_DEFAULT_MIN_MTU) {
+				*result = L2CAP_CONF_UNACCEPT;
+				chan->imtu = L2CAP_DEFAULT_MIN_MTU;
+			} else
+				chan->imtu = val;
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+			break;
+
+		case L2CAP_CONF_FLUSH_TO:
+			chan->flush_to = val;
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO,
+							2, chan->flush_to);
+			break;
+
+		case L2CAP_CONF_RFC:
+			/* val is the payload address for multi-byte options */
+			if (olen == sizeof(rfc))
+				memcpy(&rfc, (void *)val, olen);
+
+			if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state) &&
+							rfc.mode != chan->mode)
+				return -ECONNREFUSED;
+
+			chan->fcs = 0;
+
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
+					sizeof(rfc), (unsigned long) &rfc);
+			break;
+		}
+	}
+
+	/* A basic-mode channel never accepts being moved to another mode */
+	if (chan->mode == L2CAP_MODE_BASIC && chan->mode != rfc.mode)
+		return -ECONNREFUSED;
+
+	chan->mode = rfc.mode;
+
+	if (*result == L2CAP_CONF_SUCCESS) {
+		switch (rfc.mode) {
+		case L2CAP_MODE_ERTM:
+			chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
+			chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
+			chan->mps    = le16_to_cpu(rfc.max_pdu_size);
+			break;
+		case L2CAP_MODE_STREAMING:
+			chan->mps    = le16_to_cpu(rfc.max_pdu_size);
+		}
+	}
+
+	req->dcid   = cpu_to_le16(chan->dcid);
+	req->flags  = cpu_to_le16(0x0000);
+
+	return ptr - data;
+}
+
+/*
+ * Fill @data with an option-less Configure Response carrying @result and
+ * @flags for @chan.  Returns the size of the response header, i.e. the
+ * offset of its (empty) option area.
+ */
+static int l2cap_build_conf_rsp(struct l2cap_chan *chan, void *data, u16 result, u16 flags)
+{
+	struct l2cap_conf_rsp *rsp = data;
+	void *opts = rsp->data;
+
+	BT_DBG("chan %p", chan);
+
+	rsp->flags  = cpu_to_le16(flags);
+	rsp->result = cpu_to_le16(result);
+	rsp->scid   = cpu_to_le16(chan->dcid);
+
+	/* No options are appended, so the length is just the header */
+	return opts - data;
+}
+
+/*
+ * Send the successful Connection Response for @chan (using the ident
+ * stored on the channel) and, unless a Configure Request has already been
+ * sent, follow up with the first Configure Request.
+ */
+void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
+{
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_conn_rsp rsp;
+	u8 buf[128];
+
+	/* Our source CID is the peer's destination CID and vice versa */
+	rsp.scid   = cpu_to_le16(chan->dcid);
+	rsp.dcid   = cpu_to_le16(chan->scid);
+	rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+	rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
+	l2cap_send_cmd(conn, chan->ident,
+				L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+
+	if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
+		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+				l2cap_build_conf_req(chan, buf), buf);
+		chan->num_conf_req++;
+	}
+}
+
+/*
+ * Extract the RFC option from a successful Configure Response (@rsp,
+ * @len bytes) and store the ERTM/streaming parameters on @chan.
+ *
+ * Does nothing for channels that are not in ERTM or streaming mode.  If
+ * the peer sent no RFC option, or one of the wrong length, defaults are
+ * applied instead: the channel's current mode, the default retransmission
+ * and monitor timeouts, and chan->imtu as maximum PDU size.
+ */
+static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len)
+{
+	int type, olen;
+	unsigned long val;
+	struct l2cap_conf_rfc rfc;
+
+	BT_DBG("chan %p, rsp %p, len %d", chan, rsp, len);
+
+	if ((chan->mode != L2CAP_MODE_ERTM) && (chan->mode != L2CAP_MODE_STREAMING))
+		return;
+
+	/* Use sane default values in case a misbehaving remote device
+	 * did not send a (valid) RFC option; otherwise the switch below
+	 * would read uninitialised stack memory. */
+	memset(&rfc, 0, sizeof(rfc));
+	rfc.mode            = chan->mode;
+	rfc.retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+	rfc.monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
+	rfc.max_pdu_size    = cpu_to_le16(chan->imtu);
+
+	while (len >= L2CAP_CONF_OPT_SIZE) {
+		len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+
+		switch (type) {
+		case L2CAP_CONF_RFC:
+			/* val is the payload address for multi-byte options */
+			if (olen == sizeof(rfc))
+				memcpy(&rfc, (void *)val, olen);
+			goto done;
+		}
+	}
+
+done:
+	switch (rfc.mode) {
+	case L2CAP_MODE_ERTM:
+		chan->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
+		chan->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
+		chan->mps    = le16_to_cpu(rfc.max_pdu_size);
+		break;
+	case L2CAP_MODE_STREAMING:
+		chan->mps    = le16_to_cpu(rfc.max_pdu_size);
+	}
+}
+
+/*
+ * Handle a Command Reject.  Only reason 0x0000 is acted on, and only when
+ * it answers our outstanding feature-mask Information Request: the info
+ * timer is stopped, the feature exchange is marked done and pending
+ * channels are started.  Always returns 0.
+ */
+static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_cmd_rej *rej = (struct l2cap_cmd_rej *) data;
+
+	if (rej->reason != 0x0000)
+		return 0;
+
+	/* Not a reject of our pending feature-mask request: nothing to do */
+	if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) ||
+					cmd->ident != conn->info_ident)
+		return 0;
+
+	del_timer(&conn->info_timer);
+
+	conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+	conn->info_ident = 0;
+
+	l2cap_conn_start(conn);
+
+	return 0;
+}
+
+/*
+ * Handle an incoming Connection Request.
+ *
+ * Looks up a listening channel for the requested PSM, checks link
+ * security (skipped for PSM 0x0001, SDP) and the accept backlog, creates a
+ * new child channel and adds it to @conn.  A Connection Response is
+ * always sent with one of: L2CAP_CR_BAD_PSM, L2CAP_CR_SEC_BLOCK,
+ * L2CAP_CR_NO_MEM, L2CAP_CR_PEND or L2CAP_CR_SUCCESS.
+ *
+ * If the response is "pending, no info" a feature-mask Information
+ * Request is sent as well; if it is "success" the first Configure Request
+ * follows immediately.  Always returns 0.
+ */
+static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
+	struct l2cap_conn_rsp rsp;
+	struct l2cap_chan *chan = NULL, *pchan;
+	struct sock *parent, *sk = NULL;
+	int result, status = L2CAP_CS_NO_INFO;
+
+	u16 dcid = 0, scid = __le16_to_cpu(req->scid);
+	/* psm is kept in wire (little-endian) order */
+	__le16 psm = req->psm;
+
+	BT_DBG("psm 0x%2.2x scid 0x%4.4x", psm, scid);
+
+	/* Check if we have socket listening on psm */
+	pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, conn->src);
+	if (!pchan) {
+		result = L2CAP_CR_BAD_PSM;
+		goto sendresp;
+	}
+
+	parent = pchan->sk;
+
+	bh_lock_sock(parent);
+
+	/* Check if the ACL is secure enough (if not SDP) */
+	if (psm != cpu_to_le16(0x0001) &&
+				!hci_conn_check_link_mode(conn->hcon)) {
+		conn->disc_reason = 0x05;
+		result = L2CAP_CR_SEC_BLOCK;
+		goto response;
+	}
+
+	/* Default result for every failure path below */
+	result = L2CAP_CR_NO_MEM;
+
+	/* Check for backlog size */
+	if (sk_acceptq_is_full(parent)) {
+		BT_DBG("backlog full %d", parent->sk_ack_backlog);
+		goto response;
+	}
+
+	chan = pchan->ops->new_connection(pchan->data);
+	if (!chan)
+		goto response;
+
+	sk = chan->sk;
+
+	write_lock_bh(&conn->chan_lock);
+
+	/* Check if we already have channel with that dcid */
+	if (__l2cap_get_chan_by_dcid(conn, scid)) {
+		write_unlock_bh(&conn->chan_lock);
+		sock_set_flag(sk, SOCK_ZAPPED);
+		chan->ops->close(chan->data);
+		goto response;
+	}
+
+	hci_conn_hold(conn->hcon);
+
+	bacpy(&bt_sk(sk)->src, conn->src);
+	bacpy(&bt_sk(sk)->dst, conn->dst);
+	chan->psm  = psm;
+	/* The peer's source CID is our destination CID */
+	chan->dcid = scid;
+
+	bt_accept_enqueue(parent, sk);
+
+	__l2cap_chan_add(conn, chan);
+
+	dcid = chan->scid;
+
+	__set_chan_timer(chan, sk->sk_sndtimeo);
+
+	/* Remembered so a deferred response can reuse the request's ident */
+	chan->ident = cmd->ident;
+
+	if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) {
+		if (l2cap_check_security(chan)) {
+			if (bt_sk(sk)->defer_setup) {
+				l2cap_state_change(chan, BT_CONNECT2);
+				result = L2CAP_CR_PEND;
+				status = L2CAP_CS_AUTHOR_PEND;
+				parent->sk_data_ready(parent, 0);
+			} else {
+				l2cap_state_change(chan, BT_CONFIG);
+				result = L2CAP_CR_SUCCESS;
+				status = L2CAP_CS_NO_INFO;
+			}
+		} else {
+			l2cap_state_change(chan, BT_CONNECT2);
+			result = L2CAP_CR_PEND;
+			status = L2CAP_CS_AUTHEN_PEND;
+		}
+	} else {
+		/* Feature mask exchange not finished yet: answer "pending" */
+		l2cap_state_change(chan, BT_CONNECT2);
+		result = L2CAP_CR_PEND;
+		status = L2CAP_CS_NO_INFO;
+	}
+
+	write_unlock_bh(&conn->chan_lock);
+
+response:
+	bh_unlock_sock(parent);
+
+sendresp:
+	rsp.scid   = cpu_to_le16(scid);
+	rsp.dcid   = cpu_to_le16(dcid);
+	rsp.result = cpu_to_le16(result);
+	rsp.status = cpu_to_le16(status);
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+
+	/* "Pending, no info": request the remote feature mask */
+	if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) {
+		struct l2cap_info_req info;
+		info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+
+		conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT;
+		conn->info_ident = l2cap_get_ident(conn);
+
+		mod_timer(&conn->info_timer, jiffies +
+					msecs_to_jiffies(L2CAP_INFO_TIMEOUT));
+
+		l2cap_send_cmd(conn, conn->info_ident,
+					L2CAP_INFO_REQ, sizeof(info), &info);
+	}
+
+	/* Connection accepted outright: start configuration right away */
+	if (chan && !test_bit(CONF_REQ_SENT, &chan->conf_state) &&
+				result == L2CAP_CR_SUCCESS) {
+		u8 buf[128];
+		set_bit(CONF_REQ_SENT, &chan->conf_state);
+		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+					l2cap_build_conf_req(chan, buf), buf);
+		chan->num_conf_req++;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle a Connection Response.
+ *
+ * The channel is looked up by source CID when the response carries one,
+ * otherwise by the command ident.  On success the channel moves to
+ * BT_CONFIG and the first Configure Request is sent (once).  A pending
+ * result is recorded in conf_state.  Any other result tears the channel
+ * down, or schedules that through the channel timer when the socket is
+ * currently owned by user context.
+ *
+ * Returns 0, or -EFAULT if no matching channel exists.
+ */
+static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_conn_rsp *rsp = (struct l2cap_conn_rsp *) data;
+	u16 scid   = __le16_to_cpu(rsp->scid);
+	u16 dcid   = __le16_to_cpu(rsp->dcid);
+	u16 result = __le16_to_cpu(rsp->result);
+	u16 status = __le16_to_cpu(rsp->status);
+	struct l2cap_chan *chan;
+	struct sock *sk;
+	u8 req[128];
+
+	BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status);
+
+	if (scid)
+		chan = l2cap_get_chan_by_scid(conn, scid);
+	else
+		chan = l2cap_get_chan_by_ident(conn, cmd->ident);
+
+	if (!chan)
+		return -EFAULT;
+
+	sk = chan->sk;
+
+	if (result == L2CAP_CR_SUCCESS) {
+		l2cap_state_change(chan, BT_CONFIG);
+		chan->ident = 0;
+		chan->dcid = dcid;
+		clear_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+		/* Only the first success triggers a Configure Request */
+		if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
+			l2cap_send_cmd(conn, l2cap_get_ident(conn),
+					L2CAP_CONF_REQ,
+					l2cap_build_conf_req(chan, req), req);
+			chan->num_conf_req++;
+		}
+	} else if (result == L2CAP_CR_PEND) {
+		set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+	} else if (sock_owned_by_user(sk)) {
+		/* don't delete l2cap channel if sk is owned by user */
+		l2cap_state_change(chan, BT_DISCONN);
+		__clear_chan_timer(chan);
+		__set_chan_timer(chan, HZ / 5);
+	} else {
+		l2cap_chan_del(chan, ECONNREFUSED);
+	}
+
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Settle the channel's FCS once configuration is complete: no FCS outside
+ * ERTM/streaming mode; CRC16 in those modes unless CONF_NO_FCS_RECV is
+ * set, in which case chan->fcs is left as it is.
+ */
+static inline void set_default_fcs(struct l2cap_chan *chan)
+{
+	if (chan->mode == L2CAP_MODE_ERTM ||
+			chan->mode == L2CAP_MODE_STREAMING) {
+		/* FCS is enabled if one or both sides request it */
+		if (!test_bit(CONF_NO_FCS_RECV, &chan->conf_state))
+			chan->fcs = L2CAP_FCS_CRC16;
+		return;
+	}
+
+	chan->fcs = L2CAP_FCS_NONE;
+}
+
+/*
+ * Handle an incoming Configure Request of @cmd_len bytes.
+ *
+ * Options are accumulated in chan->conf_req across continuation requests
+ * (flag bit 0x0001).  Once the request is complete it is parsed and a
+ * Configure Response is sent.  When both directions are configured the
+ * channel becomes BT_CONNECTED; otherwise our own Configure Request is
+ * sent if that has not happened yet.
+ *
+ * Returns 0, or -ENOENT if no channel matches the destination CID.
+ * NOTE(review): every path after the lookup ends in bh_unlock_sock(), so
+ * l2cap_get_chan_by_scid() presumably returns with the socket locked -
+ * confirm against its definition.
+ */
+static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
+{
+	struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
+	u16 dcid, flags;
+	u8 rsp[64];
+	struct l2cap_chan *chan;
+	struct sock *sk;
+	int len;
+
+	dcid  = __le16_to_cpu(req->dcid);
+	flags = __le16_to_cpu(req->flags);
+
+	BT_DBG("dcid 0x%4.4x flags 0x%2.2x", dcid, flags);
+
+	chan = l2cap_get_chan_by_scid(conn, dcid);
+	if (!chan)
+		return -ENOENT;
+
+	sk = chan->sk;
+
+	/* Configuration is only valid in BT_CONFIG or BT_CONNECT2 */
+	if (sk->sk_state != BT_CONFIG && sk->sk_state != BT_CONNECT2) {
+		struct l2cap_cmd_rej rej;
+
+		rej.reason = cpu_to_le16(0x0002);
+		l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
+				sizeof(rej), &rej);
+		goto unlock;
+	}
+
+	/* Reject if config buffer is too small. */
+	len = cmd_len - sizeof(*req);
+	if (len < 0 || chan->conf_len + len > sizeof(chan->conf_req)) {
+		l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
+				l2cap_build_conf_rsp(chan, rsp,
+					L2CAP_CONF_REJECT, flags), rsp);
+		goto unlock;
+	}
+
+	/* Store config. */
+	memcpy(chan->conf_req + chan->conf_len, req->data, len);
+	chan->conf_len += len;
+
+	if (flags & 0x0001) {
+		/* Incomplete config. Send empty response. */
+		l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP,
+				l2cap_build_conf_rsp(chan, rsp,
+					L2CAP_CONF_SUCCESS, 0x0001), rsp);
+		goto unlock;
+	}
+
+	/* Complete config. */
+	len = l2cap_parse_conf_req(chan, rsp);
+	if (len < 0) {
+		l2cap_send_disconn_req(conn, chan, ECONNRESET);
+		goto unlock;
+	}
+
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp);
+	chan->num_conf_rsp++;
+
+	/* Reset config buffer. */
+	chan->conf_len = 0;
+
+	/* Our output side is not agreed yet: wait for another request */
+	if (!test_bit(CONF_OUTPUT_DONE, &chan->conf_state))
+		goto unlock;
+
+	/* Both directions configured: the channel is ready for data */
+	if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) {
+		set_default_fcs(chan);
+
+		l2cap_state_change(chan, BT_CONNECTED);
+
+		chan->next_tx_seq = 0;
+		chan->expected_tx_seq = 0;
+		skb_queue_head_init(&chan->tx_q);
+		if (chan->mode == L2CAP_MODE_ERTM)
+			l2cap_ertm_init(chan);
+
+		l2cap_chan_ready(sk);
+		goto unlock;
+	}
+
+	if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
+		/* NOTE(review): the other l2cap_build_conf_req() callers in
+		 * this file use a 128-byte buffer; confirm 64 is enough. */
+		u8 buf[64];
+		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+					l2cap_build_conf_req(chan, buf), buf);
+		chan->num_conf_req++;
+	}
+
+unlock:
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Handle an incoming Configure Response.
+ *
+ *  - SUCCESS: the RFC parameters are taken from the response.
+ *  - UNACCEPT: while the response budget lasts, the peer's
+ *    counter-proposal is parsed and a new Configure Request is sent;
+ *    beyond the budget this is handled like any other result.
+ *  - anything else: the channel is disconnected with ECONNRESET.
+ *
+ * Unless the response is marked as continued (flag bit 0x01), the input
+ * side is then marked configured, and if the output side is configured as
+ * well the channel becomes BT_CONNECTED.
+ *
+ * Returns 0 in all cases, including when no channel matches the source
+ * CID.
+ */
+static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_conf_rsp *rsp = (struct l2cap_conf_rsp *)data;
+	u16 scid, flags, result;
+	struct l2cap_chan *chan;
+	struct sock *sk;
+	/* cmd->len is a little-endian wire field; convert it before doing
+	 * arithmetic so the length is also right on big-endian hosts. */
+	int len = le16_to_cpu(cmd->len) - sizeof(*rsp);
+
+	scid   = __le16_to_cpu(rsp->scid);
+	flags  = __le16_to_cpu(rsp->flags);
+	result = __le16_to_cpu(rsp->result);
+
+	BT_DBG("scid 0x%4.4x flags 0x%2.2x result 0x%2.2x",
+			scid, flags, result);
+
+	chan = l2cap_get_chan_by_scid(conn, scid);
+	if (!chan)
+		return 0;
+
+	sk = chan->sk;
+
+	switch (result) {
+	case L2CAP_CONF_SUCCESS:
+		l2cap_conf_rfc_get(chan, rsp->data, len);
+		break;
+
+	case L2CAP_CONF_UNACCEPT:
+		if (chan->num_conf_rsp <= L2CAP_CONF_MAX_CONF_RSP) {
+			char req[64];
+
+			/* The counter-proposal must fit in our request
+			 * buffer; a negative len compares as a huge unsigned
+			 * value here and is rejected too. */
+			if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) {
+				l2cap_send_disconn_req(conn, chan, ECONNRESET);
+				goto done;
+			}
+
+			/* throw out any old stored conf requests */
+			result = L2CAP_CONF_SUCCESS;
+			len = l2cap_parse_conf_rsp(chan, rsp->data, len,
+								req, &result);
+			if (len < 0) {
+				l2cap_send_disconn_req(conn, chan, ECONNRESET);
+				goto done;
+			}
+
+			l2cap_send_cmd(conn, l2cap_get_ident(conn),
+						L2CAP_CONF_REQ, len, req);
+			chan->num_conf_req++;
+			if (result != L2CAP_CONF_SUCCESS)
+				goto done;
+			break;
+		}
+
+		/* fall through - too many rejected proposals */
+	default:
+		sk->sk_err = ECONNRESET;
+		__set_chan_timer(chan, HZ * 5);
+		l2cap_send_disconn_req(conn, chan, ECONNRESET);
+		goto done;
+	}
+
+	/* More response fragments to come: wait for the final one */
+	if (flags & 0x01)
+		goto done;
+
+	set_bit(CONF_INPUT_DONE, &chan->conf_state);
+
+	/* Both directions configured: the channel is ready for data */
+	if (test_bit(CONF_OUTPUT_DONE, &chan->conf_state)) {
+		set_default_fcs(chan);
+
+		l2cap_state_change(chan, BT_CONNECTED);
+		chan->next_tx_seq = 0;
+		chan->expected_tx_seq = 0;
+		skb_queue_head_init(&chan->tx_q);
+		if (chan->mode ==  L2CAP_MODE_ERTM)
+			l2cap_ertm_init(chan);
+
+		l2cap_chan_ready(sk);
+	}
+
+done:
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Handle an incoming Disconnection Request.
+ *
+ * The channel is looked up using the request's dcid.  A Disconnection
+ * Response carrying our scid/dcid pair is sent, the socket is marked
+ * SHUTDOWN_MASK and the channel is deleted with ECONNRESET.  If the socket
+ * is currently owned by user context the channel is not deleted; it is
+ * only moved to BT_DISCONN and the channel timer is re-armed with HZ / 5.
+ *
+ * Always returns 0; a request for an unknown channel is ignored.
+ */
+static inline int l2cap_disconnect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_disconn_req *req = (struct l2cap_disconn_req *) data;
+	struct l2cap_disconn_rsp rsp;
+	u16 dcid, scid;
+	struct l2cap_chan *chan;
+	struct sock *sk;
+
+	scid = __le16_to_cpu(req->scid);
+	dcid = __le16_to_cpu(req->dcid);
+
+	BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid);
+
+	/* NOTE(review): the lookup presumably returns with sk locked, since
+	 * every path below calls bh_unlock_sock() - confirm */
+	chan = l2cap_get_chan_by_scid(conn, dcid);
+	if (!chan)
+		return 0;
+
+	sk = chan->sk;
+
+	rsp.dcid = cpu_to_le16(chan->scid);
+	rsp.scid = cpu_to_le16(chan->dcid);
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	/* don't delete l2cap channel if sk is owned by user */
+	if (sock_owned_by_user(sk)) {
+		l2cap_state_change(chan, BT_DISCONN);
+		__clear_chan_timer(chan);
+		__set_chan_timer(chan, HZ / 5);
+		bh_unlock_sock(sk);
+		return 0;
+	}
+
+	l2cap_chan_del(chan, ECONNRESET);
+	bh_unlock_sock(sk);
+
+	/* the close callback is invoked after the socket lock is dropped */
+	chan->ops->close(chan->data);
+	return 0;
+}
+
+/*
+ * Handle an incoming Disconnection Response.
+ *
+ * The channel is looked up using the response's scid and deleted with
+ * error 0.  If the socket is currently owned by user context the channel
+ * is not deleted; it is only moved to BT_DISCONN and the channel timer is
+ * re-armed with HZ / 5.
+ *
+ * Always returns 0; a response for an unknown channel is ignored.
+ */
+static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_disconn_rsp *rsp = (struct l2cap_disconn_rsp *) data;
+	u16 dcid, scid;
+	struct l2cap_chan *chan;
+	struct sock *sk;
+
+	scid = __le16_to_cpu(rsp->scid);
+	dcid = __le16_to_cpu(rsp->dcid);
+
+	BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid);
+
+	/* NOTE(review): the lookup presumably returns with sk locked, since
+	 * every path below calls bh_unlock_sock() - confirm */
+	chan = l2cap_get_chan_by_scid(conn, scid);
+	if (!chan)
+		return 0;
+
+	sk = chan->sk;
+
+	/* don't delete l2cap channel if sk is owned by user */
+	if (sock_owned_by_user(sk)) {
+		l2cap_state_change(chan,BT_DISCONN);
+		__clear_chan_timer(chan);
+		__set_chan_timer(chan, HZ / 5);
+		bh_unlock_sock(sk);
+		return 0;
+	}
+
+	l2cap_chan_del(chan, 0);
+	bh_unlock_sock(sk);
+
+	/* the close callback is invoked after the socket lock is dropped */
+	chan->ops->close(chan->data);
+	return 0;
+}
+
+/*
+ * Answer an Information Request.
+ *
+ * L2CAP_IT_FEAT_MASK is answered with the local feature mask (extended
+ * by the ERTM, streaming and FCS bits unless disable_ertm is set),
+ * L2CAP_IT_FIXED_CHAN with the 8-byte l2cap_fixed_chan table, and any
+ * other type with L2CAP_IR_NOTSUPP.  Always returns 0.
+ */
+static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_info_req *req = (struct l2cap_info_req *) data;
+	u16 type = __le16_to_cpu(req->type);
+
+	BT_DBG("type 0x%4.4x", type);
+
+	switch (type) {
+	case L2CAP_IT_FEAT_MASK: {
+		/* 4 bytes of response header + 32-bit feature mask */
+		u8 buf[8];
+		struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+		u32 feat_mask = l2cap_feat_mask;
+
+		if (!disable_ertm)
+			feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING
+							 | L2CAP_FEAT_FCS;
+
+		rsp->type   = cpu_to_le16(L2CAP_IT_FEAT_MASK);
+		rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+		put_unaligned_le32(feat_mask, rsp->data);
+		l2cap_send_cmd(conn, cmd->ident,
+					L2CAP_INFO_RSP, sizeof(buf), buf);
+		break;
+	}
+
+	case L2CAP_IT_FIXED_CHAN: {
+		/* 4 bytes of response header + 8 bytes of channel map */
+		u8 buf[12];
+		struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
+
+		rsp->type   = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+		rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
+		memcpy(buf + 4, l2cap_fixed_chan, 8);
+		l2cap_send_cmd(conn, cmd->ident,
+					L2CAP_INFO_RSP, sizeof(buf), buf);
+		break;
+	}
+
+	default: {
+		struct l2cap_info_rsp rsp;
+
+		rsp.type   = cpu_to_le16(type);
+		rsp.result = cpu_to_le16(L2CAP_IR_NOTSUPP);
+		l2cap_send_cmd(conn, cmd->ident,
+					L2CAP_INFO_RSP, sizeof(rsp), &rsp);
+		break;
+	}
+	}
+
+	return 0;
+}
+
+/*
+ * Handle an Information Response.
+ *
+ * Responses whose ident does not match the outstanding request, or that
+ * arrive after the feature-mask exchange was already marked done, are
+ * ignored.  A successful feature-mask response that advertises fixed
+ * channels triggers a follow-up L2CAP_IT_FIXED_CHAN request; a failed
+ * response, a feature mask without fixed channels, or a fixed-channel
+ * response completes the exchange and starts the connection.
+ * Always returns 0.
+ */
+static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data;
+	u16 type = __le16_to_cpu(rsp->type);
+	u16 result = __le16_to_cpu(rsp->result);
+	int finished;
+
+	BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
+
+	/* L2CAP Info req/rsp are unbound to channels, add extra checks */
+	if (cmd->ident != conn->info_ident)
+		return 0;
+	if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)
+		return 0;
+
+	del_timer(&conn->info_timer);
+
+	if (result != L2CAP_IR_SUCCESS) {
+		finished = 1;
+	} else if (type == L2CAP_IT_FEAT_MASK) {
+		conn->feat_mask = get_unaligned_le32(rsp->data);
+		finished = !(conn->feat_mask & L2CAP_FEAT_FIXED_CHAN);
+
+		if (!finished) {
+			/* peer has fixed channels: ask which ones */
+			struct l2cap_info_req req;
+
+			req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
+			conn->info_ident = l2cap_get_ident(conn);
+			l2cap_send_cmd(conn, conn->info_ident,
+					L2CAP_INFO_REQ, sizeof(req), &req);
+		}
+	} else {
+		finished = (type == L2CAP_IT_FIXED_CHAN);
+	}
+
+	if (finished) {
+		conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+		conn->info_ident = 0;
+
+		l2cap_conn_start(conn);
+	}
+
+	return 0;
+}
+
+/*
+ * Validate a set of connection parameters.
+ *
+ * Accepts only 6 <= min <= max <= 3200, 10 <= to_multiplier <= 3200,
+ * max < to_multiplier * 8, and a latency that is at most 499 and at most
+ * (to_multiplier * 8 / max) - 1.
+ *
+ * Returns 0 when all checks pass, -EINVAL otherwise.
+ */
+static inline int l2cap_check_conn_param(u16 min, u16 max, u16 latency,
+							u16 to_multiplier)
+{
+	u16 latency_limit;
+
+	/* interval range */
+	if (min < 6 || max > 3200 || min > max)
+		return -EINVAL;
+
+	/* supervision timeout range */
+	if (to_multiplier > 3200 || to_multiplier < 10)
+		return -EINVAL;
+
+	/* the timeout has to be longer than the largest interval */
+	if (to_multiplier * 8 <= max)
+		return -EINVAL;
+
+	latency_limit = (to_multiplier * 8 / max) - 1;
+	if (latency > latency_limit || latency > 499)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Handle a Connection Parameter Update Request.
+ *
+ * Only processed when the local side has HCI_LM_MASTER set on the link.
+ * The parameters are validated, an accept/reject response is sent and,
+ * when accepted, the update is passed on to hci_le_conn_update().
+ *
+ * Returns -EINVAL without HCI_LM_MASTER, -EPROTO on a wrong command
+ * length, 0 otherwise (also when the parameters were rejected).
+ */
+static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
+					struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	struct l2cap_conn_param_update_req *req =
+			(struct l2cap_conn_param_update_req *) data;
+	struct hci_conn *hcon = conn->hcon;
+	struct l2cap_conn_param_update_rsp rsp;
+	u16 min, max, latency, to_multiplier;
+	int rejected;
+
+	if (!(hcon->link_mode & HCI_LM_MASTER))
+		return -EINVAL;
+
+	if (__le16_to_cpu(cmd->len) != sizeof(struct l2cap_conn_param_update_req))
+		return -EPROTO;
+
+	min		= __le16_to_cpu(req->min);
+	max		= __le16_to_cpu(req->max);
+	latency		= __le16_to_cpu(req->latency);
+	to_multiplier	= __le16_to_cpu(req->to_multiplier);
+
+	BT_DBG("min 0x%4.4x max 0x%4.4x latency: 0x%4.4x Timeout: 0x%4.4x",
+						min, max, latency, to_multiplier);
+
+	rejected = l2cap_check_conn_param(min, max, latency, to_multiplier);
+
+	memset(&rsp, 0, sizeof(rsp));
+	rsp.result = rejected ? cpu_to_le16(L2CAP_CONN_PARAM_REJECTED) :
+				cpu_to_le16(L2CAP_CONN_PARAM_ACCEPTED);
+
+	/* the response goes out before the controller is reconfigured */
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP,
+							sizeof(rsp), &rsp);
+
+	if (!rejected)
+		hci_le_conn_update(hcon, min, max, latency, to_multiplier);
+
+	return 0;
+}
+
+/*
+ * Dispatch one BR/EDR signalling command to its handler.
+ *
+ * Echo requests are answered in place by echoing the payload; echo
+ * responses are ignored.  Returns the handler's result, 0 for commands
+ * handled here, or -EINVAL for an unknown command code.
+ */
+static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
+			struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
+{
+	switch (cmd->code) {
+	case L2CAP_COMMAND_REJ:
+		/* the handler's result is deliberately not propagated */
+		l2cap_command_rej(conn, cmd, data);
+		return 0;
+
+	case L2CAP_CONN_REQ:
+		return l2cap_connect_req(conn, cmd, data);
+
+	case L2CAP_CONN_RSP:
+		return l2cap_connect_rsp(conn, cmd, data);
+
+	case L2CAP_CONF_REQ:
+		return l2cap_config_req(conn, cmd, cmd_len, data);
+
+	case L2CAP_CONF_RSP:
+		return l2cap_config_rsp(conn, cmd, data);
+
+	case L2CAP_DISCONN_REQ:
+		return l2cap_disconnect_req(conn, cmd, data);
+
+	case L2CAP_DISCONN_RSP:
+		return l2cap_disconnect_rsp(conn, cmd, data);
+
+	case L2CAP_ECHO_REQ:
+		l2cap_send_cmd(conn, cmd->ident, L2CAP_ECHO_RSP, cmd_len, data);
+		return 0;
+
+	case L2CAP_ECHO_RSP:
+		return 0;
+
+	case L2CAP_INFO_REQ:
+		return l2cap_information_req(conn, cmd, data);
+
+	case L2CAP_INFO_RSP:
+		return l2cap_information_rsp(conn, cmd, data);
+
+	default:
+		BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code);
+		return -EINVAL;
+	}
+}
+
+/*
+ * Dispatch one LE signalling command.
+ *
+ * Command rejects and parameter-update responses are accepted and
+ * ignored; parameter-update requests are handed to their handler.
+ * Returns the handler's result, 0, or -EINVAL for an unknown code.
+ */
+static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn,
+					struct l2cap_cmd_hdr *cmd, u8 *data)
+{
+	int err = 0;
+
+	switch (cmd->code) {
+	case L2CAP_COMMAND_REJ:
+	case L2CAP_CONN_PARAM_UPDATE_RSP:
+		break;
+
+	case L2CAP_CONN_PARAM_UPDATE_REQ:
+		err = l2cap_conn_param_update_req(conn, cmd, data);
+		break;
+
+	default:
+		BT_ERR("Unknown LE signaling command 0x%2.2x", cmd->code);
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Process every signalling command contained in one frame.
+ *
+ * The frame is first handed to l2cap_raw_recv().  Commands are then
+ * parsed back to back; parsing stops at the first command whose length
+ * exceeds the remaining data or whose ident is 0.  A handler error is
+ * answered with a Command Reject.  The skb is freed before returning.
+ */
+static inline void l2cap_sig_channel(struct l2cap_conn *conn,
+							struct sk_buff *skb)
+{
+	u8 *pos = skb->data;
+	int remaining = skb->len;
+	struct l2cap_cmd_hdr cmd;
+
+	l2cap_raw_recv(conn, skb);
+
+	while (remaining >= L2CAP_CMD_HDR_SIZE) {
+		u8 *payload = pos + L2CAP_CMD_HDR_SIZE;
+		u16 cmd_len;
+		int err;
+
+		/* copy the header out; it may be unaligned inside the skb */
+		memcpy(&cmd, pos, L2CAP_CMD_HDR_SIZE);
+		remaining -= L2CAP_CMD_HDR_SIZE;
+
+		cmd_len = le16_to_cpu(cmd.len);
+
+		BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd_len, cmd.ident);
+
+		if (!cmd.ident || cmd_len > remaining) {
+			BT_DBG("corrupted command");
+			break;
+		}
+
+		err = (conn->hcon->type == LE_LINK) ?
+			l2cap_le_sig_cmd(conn, &cmd, payload) :
+			l2cap_bredr_sig_cmd(conn, &cmd, cmd_len, payload);
+
+		if (err) {
+			struct l2cap_cmd_rej rej;
+
+			BT_ERR("Wrong link type (%d)", err);
+
+			/* FIXME: Map err to a valid reason */
+			rej.reason = cpu_to_le16(0);
+			l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
+		}
+
+		pos = payload + cmd_len;
+		remaining -= cmd_len;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ * Verify and strip the trailing frame check sequence of a received frame.
+ *
+ * Does nothing unless the channel uses L2CAP_FCS_CRC16.  Otherwise the
+ * last two bytes are trimmed off the skb and compared with a CRC16
+ * computed over the bytes from hdr_size before skb->data up to the end
+ * of the trimmed payload.
+ *
+ * Returns 0 when no FCS is in use or the FCS matches, -EBADMSG when the
+ * frame is too short to contain an FCS or the FCS does not match.
+ */
+static int l2cap_check_fcs(struct l2cap_chan *chan,  struct sk_buff *skb)
+{
+	u16 our_fcs, rcv_fcs;
+	int hdr_size = L2CAP_HDR_SIZE + 2;
+
+	if (chan->fcs == L2CAP_FCS_CRC16) {
+		/* without this, skb->len - 2 wraps around and the FCS would
+		 * be read from beyond the received payload */
+		if (skb->len < 2)
+			return -EBADMSG;
+
+		skb_trim(skb, skb->len - 2);
+		rcv_fcs = get_unaligned_le16(skb->data + skb->len);
+		our_fcs = crc16(0, skb->data - hdr_size, skb->len + hdr_size);
+
+		if (our_fcs != rcv_fcs)
+			return -EBADMSG;
+	}
+	return 0;
+}
+
+/*
+ * Answer the peer with whatever is appropriate for the current state:
+ * an RNR S-frame while CONN_LOCAL_BUSY is set, otherwise pending I-frames
+ * and, if none were sent, an RR S-frame.  Frames are retransmitted first
+ * when CONN_REMOTE_BUSY is set.
+ */
+static inline void l2cap_send_i_or_rr_or_rnr(struct l2cap_chan *chan)
+{
+	u16 control = 0;
+
+	/* reset so we can tell below whether l2cap_ertm_send() sent anything */
+	chan->frames_sent = 0;
+
+	control |= chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+
+	if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+		control |= L2CAP_SUPER_RCV_NOT_READY;
+		l2cap_send_sframe(chan, control);
+		set_bit(CONN_RNR_SENT, &chan->conn_state);
+	}
+
+	if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
+		l2cap_retransmit_frames(chan);
+
+	l2cap_ertm_send(chan);
+
+	/* nothing went out and we are not busy: acknowledge with a plain RR */
+	if (!test_bit(CONN_LOCAL_BUSY, &chan->conn_state) &&
+			chan->frames_sent == 0) {
+		control |= L2CAP_SUPER_RCV_READY;
+		l2cap_send_sframe(chan, control);
+	}
+}
+
+/*
+ * Insert an out-of-sequence I-frame into chan->srej_q.
+ *
+ * tx_seq and sar are stored in the skb control block.  The queue is kept
+ * ordered by the distance (modulo 64) of each frame's tx_seq from
+ * chan->buffer_seq.
+ *
+ * Returns 0 when the skb was queued, -EINVAL when a frame with the same
+ * tx_seq is already queued (the skb is then neither queued nor freed).
+ */
+static int l2cap_add_to_srej_queue(struct l2cap_chan *chan, struct sk_buff *skb, u8 tx_seq, u8 sar)
+{
+	struct sk_buff *next_skb;
+	int tx_seq_offset, next_tx_seq_offset;
+
+	bt_cb(skb)->tx_seq = tx_seq;
+	bt_cb(skb)->sar = sar;
+
+	next_skb = skb_peek(&chan->srej_q);
+	if (!next_skb) {
+		__skb_queue_tail(&chan->srej_q, skb);
+		return 0;
+	}
+
+	/* distance of the new frame from buffer_seq, wrapped into 0..63 */
+	tx_seq_offset = (tx_seq - chan->buffer_seq) % 64;
+	if (tx_seq_offset < 0)
+		tx_seq_offset += 64;
+
+	do {
+		if (bt_cb(next_skb)->tx_seq == tx_seq)
+			return -EINVAL;
+
+		next_tx_seq_offset = (bt_cb(next_skb)->tx_seq -
+						chan->buffer_seq) % 64;
+		if (next_tx_seq_offset < 0)
+			next_tx_seq_offset += 64;
+
+		/* first queued frame that lies further ahead: insert before it */
+		if (next_tx_seq_offset > tx_seq_offset) {
+			__skb_queue_before(&chan->srej_q, next_skb, skb);
+			return 0;
+		}
+
+		if (skb_queue_is_last(&chan->srej_q, next_skb))
+			break;
+
+	} while ((next_skb = skb_queue_next(&chan->srej_q, next_skb)));
+
+	/* no queued frame lies further ahead: append */
+	__skb_queue_tail(&chan->srej_q, skb);
+
+	return 0;
+}
+
+/*
+ * Reassemble an SDU from ERTM I-frames according to the SAR bits in
+ * @control and deliver complete SDUs through chan->ops->recv().
+ *
+ * Unsegmented frames are delivered directly.  A start fragment allocates
+ * chan->sdu with the announced length, continuation and end fragments
+ * are appended to it, and the end fragment delivers a clone of the
+ * buffer.  Protocol violations lead to a disconnect request; on those
+ * paths the skb is freed and 0 is returned.
+ *
+ * Returns 0, the (negative) result of recv(), or -ENOMEM.
+ * NOTE(review): the non-zero return paths do not free @skb here; confirm
+ * what the callers expect regarding ownership.
+ */
+static int l2cap_ertm_reassembly_sdu(struct l2cap_chan *chan, struct sk_buff *skb, u16 control)
+{
+	struct sk_buff *_skb;
+	int err;
+
+	switch (control & L2CAP_CTRL_SAR) {
+	case L2CAP_SDU_UNSEGMENTED:
+		if (test_bit(CONN_SAR_SDU, &chan->conn_state))
+			goto drop;
+
+		return chan->ops->recv(chan->data, skb);
+
+	case L2CAP_SDU_START:
+		if (test_bit(CONN_SAR_SDU, &chan->conn_state))
+			goto drop;
+
+		chan->sdu_len = get_unaligned_le16(skb->data);
+
+		if (chan->sdu_len > chan->imtu)
+			goto disconnect;
+
+		/* the first fragment (without its 2-byte length field) must
+		 * fit into the buffer allocated below, otherwise skb_put()
+		 * would run past the end of that buffer */
+		if (skb->len > chan->sdu_len + 2)
+			goto disconnect;
+
+		chan->sdu = bt_skb_alloc(chan->sdu_len, GFP_ATOMIC);
+		if (!chan->sdu)
+			return -ENOMEM;
+
+		/* pull sdu_len bytes only after alloc, because of Local Busy
+		 * condition we have to be sure that this will be executed
+		 * only once, i.e., when alloc does not fail */
+		skb_pull(skb, 2);
+
+		memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len);
+
+		set_bit(CONN_SAR_SDU, &chan->conn_state);
+		chan->partial_sdu_len = skb->len;
+		break;
+
+	case L2CAP_SDU_CONTINUE:
+		if (!test_bit(CONN_SAR_SDU, &chan->conn_state))
+			goto disconnect;
+
+		if (!chan->sdu)
+			goto disconnect;
+
+		/* length is checked before the copy so the buffer cannot
+		 * be overrun */
+		chan->partial_sdu_len += skb->len;
+		if (chan->partial_sdu_len > chan->sdu_len)
+			goto drop;
+
+		memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len);
+
+		break;
+
+	case L2CAP_SDU_END:
+		if (!test_bit(CONN_SAR_SDU, &chan->conn_state))
+			goto disconnect;
+
+		if (!chan->sdu)
+			goto disconnect;
+
+		chan->partial_sdu_len += skb->len;
+
+		if (chan->partial_sdu_len > chan->imtu)
+			goto drop;
+
+		/* the fragments must add up to exactly the announced length */
+		if (chan->partial_sdu_len != chan->sdu_len)
+			goto drop;
+
+		memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len);
+
+		_skb = skb_clone(chan->sdu, GFP_ATOMIC);
+		if (!_skb) {
+			return -ENOMEM;
+		}
+
+		err = chan->ops->recv(chan->data, _skb);
+		if (err < 0) {
+			kfree_skb(_skb);
+			return err;
+		}
+
+		clear_bit(CONN_SAR_SDU, &chan->conn_state);
+
+		kfree_skb(chan->sdu);
+		/* do not leave a dangling pointer to the freed buffer */
+		chan->sdu = NULL;
+		break;
+	}
+
+	kfree_skb(skb);
+	return 0;
+
+drop:
+	kfree_skb(chan->sdu);
+	chan->sdu = NULL;
+
+disconnect:
+	l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Enter the ERTM local busy condition: set CONN_LOCAL_BUSY, send an RNR
+ * S-frame carrying the current buffer_seq, record that it was sent and
+ * stop the ack timer.
+ */
+static void l2cap_ertm_enter_local_busy(struct l2cap_chan *chan)
+{
+	u16 control;
+
+	BT_DBG("chan %p, Enter local busy", chan);
+
+	set_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+
+	control = chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+	control |= L2CAP_SUPER_RCV_NOT_READY;
+	l2cap_send_sframe(chan, control);
+
+	set_bit(CONN_RNR_SENT, &chan->conn_state);
+
+	__clear_ack_timer(chan);
+}
+
+/*
+ * Leave the ERTM local busy condition.
+ *
+ * If an RNR was sent while busy, an RR S-frame with the poll bit is
+ * sent, the retransmission timer is replaced by the monitor timer and
+ * CONN_WAIT_F is set.  In all cases CONN_LOCAL_BUSY and CONN_RNR_SENT
+ * are cleared.
+ */
+static void l2cap_ertm_exit_local_busy(struct l2cap_chan *chan)
+{
+	u16 control;
+
+	/* the peer was never told we were busy: nothing to announce */
+	if (!test_bit(CONN_RNR_SENT, &chan->conn_state))
+		goto done;
+
+	control = chan->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+	control |= L2CAP_SUPER_RCV_READY | L2CAP_CTRL_POLL;
+	l2cap_send_sframe(chan, control);
+	chan->retry_count = 1;
+
+	__clear_retrans_timer(chan);
+	__set_monitor_timer(chan);
+
+	set_bit(CONN_WAIT_F, &chan->conn_state);
+
+done:
+	clear_bit(CONN_LOCAL_BUSY, &chan->conn_state);
+	clear_bit(CONN_RNR_SENT, &chan->conn_state);
+
+	BT_DBG("chan %p, Exit local busy", chan);
+}
+
+/*
+ * Switch the local busy condition of a channel on (@busy != 0) or off.
+ * Only channels in ERTM mode are affected; other modes are left alone.
+ */
+void l2cap_chan_busy(struct l2cap_chan *chan, int busy)
+{
+	if (chan->mode != L2CAP_MODE_ERTM)
+		return;
+
+	if (!busy) {
+		l2cap_ertm_exit_local_busy(chan);
+		return;
+	}
+
+	l2cap_ertm_enter_local_busy(chan);
+}
+
+/*
+ * Throw away a partially reassembled streaming-mode SDU and return the
+ * reassembly state to idle, so that no later fragment can touch the
+ * freed buffer.
+ */
+static void l2cap_streaming_sdu_reset(struct l2cap_chan *chan)
+{
+	kfree_skb(chan->sdu);
+	chan->sdu = NULL;
+	clear_bit(CONN_SAR_SDU, &chan->conn_state);
+}
+
+/*
+ * Reassemble an SDU from Streaming Mode I-frames according to the SAR
+ * bits in @control and deliver complete SDUs through chan->ops->recv().
+ *
+ * Streaming mode has no retransmission, so any fragment that does not
+ * fit the current reassembly state discards the partial SDU.  @skb is
+ * consumed in every case (either handed to recv() or freed).
+ *
+ * Returns 0 when the frame was accepted, otherwise a negative errno
+ * (-EINVAL, -EMSGSIZE, -ENOMEM or the error returned by recv() for an
+ * unsegmented frame).
+ */
+static int l2cap_streaming_reassembly_sdu(struct l2cap_chan *chan, struct sk_buff *skb, u16 control)
+{
+	struct sk_buff *_skb;
+	int err = -EINVAL;
+
+	/*
+	 * TODO: We have to notify the userland if some data is lost with the
+	 * Streaming Mode.
+	 */
+
+	switch (control & L2CAP_CTRL_SAR) {
+	case L2CAP_SDU_UNSEGMENTED:
+		if (test_bit(CONN_SAR_SDU, &chan->conn_state)) {
+			l2cap_streaming_sdu_reset(chan);
+			break;
+		}
+
+		err = chan->ops->recv(chan->data, skb);
+		if (!err)
+			return 0;
+
+		break;
+
+	case L2CAP_SDU_START:
+		if (test_bit(CONN_SAR_SDU, &chan->conn_state)) {
+			l2cap_streaming_sdu_reset(chan);
+			break;
+		}
+
+		chan->sdu_len = get_unaligned_le16(skb->data);
+		skb_pull(skb, 2);
+
+		if (chan->sdu_len > chan->imtu) {
+			err = -EMSGSIZE;
+			break;
+		}
+
+		/* the fragment must fit into the buffer allocated below,
+		 * otherwise skb_put() would run past its end */
+		if (skb->len > chan->sdu_len)
+			break;
+
+		chan->sdu = bt_skb_alloc(chan->sdu_len, GFP_ATOMIC);
+		if (!chan->sdu) {
+			err = -ENOMEM;
+			break;
+		}
+
+		memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len);
+
+		set_bit(CONN_SAR_SDU, &chan->conn_state);
+		chan->partial_sdu_len = skb->len;
+		err = 0;
+		break;
+
+	case L2CAP_SDU_CONTINUE:
+		if (!test_bit(CONN_SAR_SDU, &chan->conn_state))
+			break;
+
+		/* check the length before copying into the SDU buffer */
+		chan->partial_sdu_len += skb->len;
+		if (chan->partial_sdu_len > chan->sdu_len) {
+			l2cap_streaming_sdu_reset(chan);
+			break;
+		}
+
+		memcpy(skb_put(chan->sdu, skb->len), skb->data, skb->len);
+		err = 0;
+		break;
+
+	case L2CAP_SDU_END:
+		if (!test_bit(CONN_SAR_SDU, &chan->conn_state))
+			break;
+
+		chan->partial_sdu_len += skb->len;
+
+		if (chan->partial_sdu_len <= chan->imtu) {
+			/* only a complete SDU is copied and delivered; a
+			 * length mismatch silently discards it */
+			if (chan->partial_sdu_len == chan->sdu_len) {
+				memcpy(skb_put(chan->sdu, skb->len),
+						skb->data, skb->len);
+
+				_skb = skb_clone(chan->sdu, GFP_ATOMIC);
+				if (_skb &&
+				    chan->ops->recv(chan->data, _skb) < 0)
+					kfree_skb(_skb);
+			}
+			err = 0;
+		}
+
+		/* reassembly is over whether or not the SDU was delivered */
+		l2cap_streaming_sdu_reset(chan);
+		break;
+	}
+
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Deliver frames from the head of chan->srej_q for as long as they carry
+ * consecutive sequence numbers starting at @tx_seq and the channel is
+ * not locally busy.  buffer_seq_srej is advanced for every delivered
+ * frame.  A reassembly error sends a disconnect request and stops.
+ */
+static void l2cap_check_srej_gap(struct l2cap_chan *chan, u8 tx_seq)
+{
+	struct sk_buff *skb;
+	u16 control;
+
+	while ((skb = skb_peek(&chan->srej_q)) &&
+			!test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
+		int err;
+
+		/* head of the queue is not the next frame in sequence */
+		if (bt_cb(skb)->tx_seq != tx_seq)
+			break;
+
+		skb = skb_dequeue(&chan->srej_q);
+		/* rebuild the SAR part of the control field stored earlier */
+		control = bt_cb(skb)->sar << L2CAP_CTRL_SAR_SHIFT;
+		err = l2cap_ertm_reassembly_sdu(chan, skb, control);
+
+		if (err < 0) {
+			l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+			break;
+		}
+
+		chan->buffer_seq_srej =
+			(chan->buffer_seq_srej + 1) % 64;
+		tx_seq = (tx_seq + 1) % 64;
+	}
+}
+
+/*
+ * Walk chan->srej_l up to the entry for @tx_seq.  For every entry before
+ * it an SREJ S-frame is sent again and the entry is moved to the tail of
+ * the list; the entry matching @tx_seq is removed and freed, which ends
+ * the walk.
+ */
+static void l2cap_resend_srejframe(struct l2cap_chan *chan, u8 tx_seq)
+{
+	struct srej_list *l, *tmp;
+	u16 control;
+
+	list_for_each_entry_safe(l, tmp, &chan->srej_l, list) {
+		if (l->tx_seq == tx_seq) {
+			list_del(&l->list);
+			kfree(l);
+			return;
+		}
+		control = L2CAP_SUPER_SELECT_REJECT;
+		control |= l->tx_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+		l2cap_send_sframe(chan, control);
+		list_del(&l->list);
+		list_add_tail(&l->list, &chan->srej_l);
+	}
+}
+
+/*
+ * Request retransmission of every frame between chan->expected_tx_seq
+ * and @tx_seq (exclusive): an SREJ S-frame is sent for each missing
+ * sequence number and the number is recorded in chan->srej_l.
+ * Afterwards expected_tx_seq is moved past @tx_seq.
+ *
+ * If a list entry cannot be allocated the outstanding SREJs can no
+ * longer be tracked, so the channel is disconnected with ECONNRESET.
+ */
+static void l2cap_send_srejframe(struct l2cap_chan *chan, u8 tx_seq)
+{
+	struct srej_list *new;
+	u16 control;
+
+	while (tx_seq != chan->expected_tx_seq) {
+		/* allocate first so no SREJ goes out that we cannot track */
+		new = kzalloc(sizeof(struct srej_list), GFP_ATOMIC);
+		if (!new) {
+			l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+			return;
+		}
+
+		control = L2CAP_SUPER_SELECT_REJECT;
+		control |= chan->expected_tx_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+		l2cap_send_sframe(chan, control);
+
+		new->tx_seq = chan->expected_tx_seq;
+		chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64;
+		list_add_tail(&new->list, &chan->srej_l);
+	}
+	/* skip over tx_seq itself, which has just been received */
+	chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64;
+}
+
+/*
+ * Process a received ERTM I-frame.
+ *
+ * The acknowledgement carried in the frame is applied first.  A frame
+ * whose tx_seq lies outside the transmit window disconnects the channel;
+ * frames arriving while locally busy are dropped.  The expected frame is
+ * reassembled (or queued while an SREJ recovery is in progress).  An
+ * unexpected frame either continues an ongoing SREJ recovery or starts
+ * one.
+ *
+ * Returns 0, or the negative error from reassembly of an expected frame.
+ */
+static inline int l2cap_data_channel_iframe(struct l2cap_chan *chan, u16 rx_control, struct sk_buff *skb)
+{
+	u8 tx_seq = __get_txseq(rx_control);
+	u8 req_seq = __get_reqseq(rx_control);
+	u8 sar = rx_control >> L2CAP_CTRL_SAR_SHIFT;
+	int tx_seq_offset, expected_tx_seq_offset;
+	int num_to_ack = (chan->tx_win/6) + 1;
+	int err = 0;
+
+	BT_DBG("chan %p len %d tx_seq %d rx_control 0x%4.4x", chan, skb->len,
+							tx_seq, rx_control);
+
+	/* an F-bit we were waiting for ends the monitor phase */
+	if (L2CAP_CTRL_FINAL & rx_control &&
+			test_bit(CONN_WAIT_F, &chan->conn_state)) {
+		__clear_monitor_timer(chan);
+		if (chan->unacked_frames > 0)
+			__set_retrans_timer(chan);
+		clear_bit(CONN_WAIT_F, &chan->conn_state);
+	}
+
+	chan->expected_ack_seq = req_seq;
+	l2cap_drop_acked_frames(chan);
+
+	/* distance of tx_seq from buffer_seq, wrapped into 0..63 */
+	tx_seq_offset = (tx_seq - chan->buffer_seq) % 64;
+	if (tx_seq_offset < 0)
+		tx_seq_offset += 64;
+
+	/* invalid tx_seq */
+	if (tx_seq_offset >= chan->tx_win) {
+		l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+		goto drop;
+	}
+
+	if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state))
+		goto drop;
+
+	if (tx_seq == chan->expected_tx_seq)
+		goto expected;
+
+	if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) {
+		struct srej_list *first;
+
+		first = list_first_entry(&chan->srej_l,
+				struct srej_list, list);
+		if (tx_seq == first->tx_seq) {
+			/* this is the oldest frame we asked for again */
+			l2cap_add_to_srej_queue(chan, skb, tx_seq, sar);
+			l2cap_check_srej_gap(chan, tx_seq);
+
+			list_del(&first->list);
+			kfree(first);
+
+			if (list_empty(&chan->srej_l)) {
+				chan->buffer_seq = chan->buffer_seq_srej;
+				clear_bit(CONN_SREJ_SENT, &chan->conn_state);
+				l2cap_send_ack(chan);
+				BT_DBG("chan %p, Exit SREJ_SENT", chan);
+			}
+		} else {
+			struct srej_list *l;
+
+			/* duplicated tx_seq */
+			if (l2cap_add_to_srej_queue(chan, skb, tx_seq, sar) < 0)
+				goto drop;
+
+			list_for_each_entry(l, &chan->srej_l, list) {
+				if (l->tx_seq == tx_seq) {
+					l2cap_resend_srejframe(chan, tx_seq);
+					return 0;
+				}
+			}
+			l2cap_send_srejframe(chan, tx_seq);
+		}
+	} else {
+		expected_tx_seq_offset =
+			(chan->expected_tx_seq - chan->buffer_seq) % 64;
+		if (expected_tx_seq_offset < 0)
+			expected_tx_seq_offset += 64;
+
+		/* duplicated tx_seq */
+		if (tx_seq_offset < expected_tx_seq_offset)
+			goto drop;
+
+		/* a frame is missing: start SREJ recovery */
+		set_bit(CONN_SREJ_SENT, &chan->conn_state);
+
+		BT_DBG("chan %p, Enter SREJ", chan);
+
+		INIT_LIST_HEAD(&chan->srej_l);
+		chan->buffer_seq_srej = chan->buffer_seq;
+
+		__skb_queue_head_init(&chan->srej_q);
+		l2cap_add_to_srej_queue(chan, skb, tx_seq, sar);
+
+		set_bit(CONN_SEND_PBIT, &chan->conn_state);
+
+		l2cap_send_srejframe(chan, tx_seq);
+
+		__clear_ack_timer(chan);
+	}
+	return 0;
+
+expected:
+	chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64;
+
+	/* during SREJ recovery in-order frames are only queued */
+	if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) {
+		bt_cb(skb)->tx_seq = tx_seq;
+		bt_cb(skb)->sar = sar;
+		__skb_queue_tail(&chan->srej_q, skb);
+		return 0;
+	}
+
+	err = l2cap_ertm_reassembly_sdu(chan, skb, rx_control);
+	chan->buffer_seq = (chan->buffer_seq + 1) % 64;
+	if (err < 0) {
+		l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+		return err;
+	}
+
+	if (rx_control & L2CAP_CTRL_FINAL) {
+		if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state))
+			l2cap_retransmit_frames(chan);
+	}
+
+	__set_ack_timer(chan);
+
+	/* send an explicit ack once every num_to_ack received frames */
+	chan->num_acked = (chan->num_acked + 1) % num_to_ack;
+	if (chan->num_acked == num_to_ack - 1)
+		l2cap_send_ack(chan);
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process a received RR (Receiver Ready) S-frame: apply its
+ * acknowledgement, then react depending on the poll and final bits.
+ */
+static inline void l2cap_data_channel_rrframe(struct l2cap_chan *chan, u16 rx_control)
+{
+	BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, __get_reqseq(rx_control),
+						rx_control);
+
+	chan->expected_ack_seq = __get_reqseq(rx_control);
+	l2cap_drop_acked_frames(chan);
+
+	if (rx_control & L2CAP_CTRL_POLL) {
+		/* a poll has to be answered with the F-bit set */
+		set_bit(CONN_SEND_FBIT, &chan->conn_state);
+		if (test_bit(CONN_SREJ_SENT, &chan->conn_state)) {
+			if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state) &&
+					(chan->unacked_frames > 0))
+				__set_retrans_timer(chan);
+
+			clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+			l2cap_send_srejtail(chan);
+		} else {
+			l2cap_send_i_or_rr_or_rnr(chan);
+		}
+
+	} else if (rx_control & L2CAP_CTRL_FINAL) {
+		clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+		/* retransmit unless CONN_REJ_ACT was set (it is cleared here) */
+		if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state))
+			l2cap_retransmit_frames(chan);
+
+	} else {
+		if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state) &&
+				(chan->unacked_frames > 0))
+			__set_retrans_timer(chan);
+
+		clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+		if (test_bit(CONN_SREJ_SENT, &chan->conn_state))
+			l2cap_send_ack(chan);
+		else
+			l2cap_ertm_send(chan);
+	}
+}
+
+/*
+ * Process a received REJ (Reject) S-frame: clear the remote busy state,
+ * apply the acknowledgement and retransmit the unacknowledged frames.
+ * With the final bit set, retransmission is skipped when CONN_REJ_ACT
+ * was set (the bit is cleared in that case).
+ */
+static inline void l2cap_data_channel_rejframe(struct l2cap_chan *chan, u16 rx_control)
+{
+	u8 tx_seq = __get_reqseq(rx_control);
+
+	BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, tx_seq, rx_control);
+
+	clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+	chan->expected_ack_seq = tx_seq;
+	l2cap_drop_acked_frames(chan);
+
+	if (rx_control & L2CAP_CTRL_FINAL) {
+		if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state))
+			l2cap_retransmit_frames(chan);
+	} else {
+		l2cap_retransmit_frames(chan);
+
+		/* remember the REJ while we are waiting for an F-bit */
+		if (test_bit(CONN_WAIT_F, &chan->conn_state))
+			set_bit(CONN_REJ_ACT, &chan->conn_state);
+	}
+}
+/*
+ * Process a received SREJ (Selective Reject) S-frame: clear the remote
+ * busy state and retransmit the single requested frame, taking the poll
+ * and final bits into account.
+ */
+static inline void l2cap_data_channel_srejframe(struct l2cap_chan *chan, u16 rx_control)
+{
+	u8 tx_seq = __get_reqseq(rx_control);
+
+	BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, tx_seq, rx_control);
+
+	clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+
+	if (rx_control & L2CAP_CTRL_POLL) {
+		/* only in the poll case is req_seq applied as an ack */
+		chan->expected_ack_seq = tx_seq;
+		l2cap_drop_acked_frames(chan);
+
+		set_bit(CONN_SEND_FBIT, &chan->conn_state);
+		l2cap_retransmit_one_frame(chan, tx_seq);
+
+		l2cap_ertm_send(chan);
+
+		if (test_bit(CONN_WAIT_F, &chan->conn_state)) {
+			chan->srej_save_reqseq = tx_seq;
+			set_bit(CONN_SREJ_ACT, &chan->conn_state);
+		}
+	} else if (rx_control & L2CAP_CTRL_FINAL) {
+		/* skip the retransmission if this frame was already resent
+		 * for a saved SREJ with the same req_seq */
+		if (test_bit(CONN_SREJ_ACT, &chan->conn_state) &&
+				chan->srej_save_reqseq == tx_seq)
+			clear_bit(CONN_SREJ_ACT, &chan->conn_state);
+		else
+			l2cap_retransmit_one_frame(chan, tx_seq);
+	} else {
+		l2cap_retransmit_one_frame(chan, tx_seq);
+		if (test_bit(CONN_WAIT_F, &chan->conn_state)) {
+			chan->srej_save_reqseq = tx_seq;
+			set_bit(CONN_SREJ_ACT, &chan->conn_state);
+		}
+	}
+}
+
+/*
+ * Process a received RNR (Receiver Not Ready) S-frame: mark the peer as
+ * busy, apply the acknowledgement and answer a poll if one was set.
+ */
+static inline void l2cap_data_channel_rnrframe(struct l2cap_chan *chan, u16 rx_control)
+{
+	u8 tx_seq = __get_reqseq(rx_control);
+
+	BT_DBG("chan %p, req_seq %d ctrl 0x%4.4x", chan, tx_seq, rx_control);
+
+	set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
+	chan->expected_ack_seq = tx_seq;
+	l2cap_drop_acked_frames(chan);
+
+	if (rx_control & L2CAP_CTRL_POLL)
+		set_bit(CONN_SEND_FBIT, &chan->conn_state);
+
+	/* no SREJ recovery in progress: stop retransmitting to a busy peer */
+	if (!test_bit(CONN_SREJ_SENT, &chan->conn_state)) {
+		__clear_retrans_timer(chan);
+		if (rx_control & L2CAP_CTRL_POLL)
+			l2cap_send_rr_or_rnr(chan, L2CAP_CTRL_FINAL);
+		return;
+	}
+
+	if (rx_control & L2CAP_CTRL_POLL)
+		l2cap_send_srejtail(chan);
+	else
+		l2cap_send_sframe(chan, L2CAP_SUPER_RCV_READY);
+}
+
+/*
+ * Process a received ERTM S-frame: handle an awaited F-bit, dispatch on
+ * the supervisory function and free the skb.  Always returns 0.
+ */
+static inline int l2cap_data_channel_sframe(struct l2cap_chan *chan, u16 rx_control, struct sk_buff *skb)
+{
+	BT_DBG("chan %p rx_control 0x%4.4x len %d", chan, rx_control, skb->len);
+
+	/* an F-bit we were waiting for ends the monitor phase */
+	if (L2CAP_CTRL_FINAL & rx_control &&
+			test_bit(CONN_WAIT_F, &chan->conn_state)) {
+		__clear_monitor_timer(chan);
+		if (chan->unacked_frames > 0)
+			__set_retrans_timer(chan);
+		clear_bit(CONN_WAIT_F, &chan->conn_state);
+	}
+
+	switch (rx_control & L2CAP_CTRL_SUPERVISE) {
+	case L2CAP_SUPER_RCV_READY:
+		l2cap_data_channel_rrframe(chan, rx_control);
+		break;
+
+	case L2CAP_SUPER_REJECT:
+		l2cap_data_channel_rejframe(chan, rx_control);
+		break;
+
+	case L2CAP_SUPER_SELECT_REJECT:
+		l2cap_data_channel_srejframe(chan, rx_control);
+		break;
+
+	case L2CAP_SUPER_RCV_NOT_READY:
+		l2cap_data_channel_rnrframe(chan, rx_control);
+		break;
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Validate a frame received on an ERTM channel and hand it to the
+ * I-frame or S-frame handler.
+ *
+ * The control field is pulled off, the FCS is checked, the payload
+ * length is compared against the MPS and the req_seq is checked against
+ * the range of frames still awaiting acknowledgement.  Frames with a bad
+ * FCS are dropped silently; the other violations also send a disconnect
+ * request.  Always returns 0.
+ */
+static int l2cap_ertm_data_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	u16 control;
+	u8 req_seq;
+	int len, next_tx_seq_offset, req_seq_offset;
+
+	control = get_unaligned_le16(skb->data);
+	skb_pull(skb, 2);
+	len = skb->len;
+
+	/*
+	 * We can just drop the corrupted I-frame here.
+	 * Receiver will miss it and start proper recovery
+	 * procedures and ask retransmission.
+	 */
+	if (l2cap_check_fcs(chan, skb))
+		goto drop;
+
+	/* len was taken before the FCS was trimmed: subtract the SDU length
+	 * field and the FCS to get the payload size */
+	if (__is_sar_start(control) && __is_iframe(control))
+		len -= 2;
+
+	if (chan->fcs == L2CAP_FCS_CRC16)
+		len -= 2;
+
+	if (len > chan->mps) {
+		l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+		goto drop;
+	}
+
+	req_seq = __get_reqseq(control);
+	req_seq_offset = (req_seq - chan->expected_ack_seq) % 64;
+	if (req_seq_offset < 0)
+		req_seq_offset += 64;
+
+	next_tx_seq_offset =
+		(chan->next_tx_seq - chan->expected_ack_seq) % 64;
+	if (next_tx_seq_offset < 0)
+		next_tx_seq_offset += 64;
+
+	/* check for invalid req-seq */
+	if (req_seq_offset > next_tx_seq_offset) {
+		l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+		goto drop;
+	}
+
+	if (__is_iframe(control)) {
+		if (len < 0) {
+			l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+			goto drop;
+		}
+
+		l2cap_data_channel_iframe(chan, control, skb);
+	} else {
+		/* an S-frame carries no payload */
+		if (len != 0) {
+			BT_ERR("%d", len);
+			l2cap_send_disconn_req(chan->conn, chan, ECONNRESET);
+			goto drop;
+		}
+
+		l2cap_data_channel_sframe(chan, control, skb);
+	}
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Deliver a data frame received on a dynamically allocated channel.
+ *
+ * The channel is looked up by @cid and must be in BT_CONNECTED.  Basic
+ * mode frames go straight to the channel's recv callback, ERTM frames
+ * are processed or put on the socket backlog when the socket is owned by
+ * user context, and streaming frames are validated and reassembled here.
+ * Frames that cannot be delivered are freed.  Always returns 0.
+ */
+static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb)
+{
+	struct l2cap_chan *chan;
+	struct sock *sk = NULL;
+	u16 control;
+	u8 tx_seq;
+	int len;
+
+	/* NOTE(review): the lookup presumably returns with sk locked, since
+	 * the done label unlocks it whenever a channel was found - confirm */
+	chan = l2cap_get_chan_by_scid(conn, cid);
+	if (!chan) {
+		BT_DBG("unknown cid 0x%4.4x", cid);
+		goto drop;
+	}
+
+	sk = chan->sk;
+
+	BT_DBG("chan %p, len %d", chan, skb->len);
+
+	if (chan->state != BT_CONNECTED)
+		goto drop;
+
+	switch (chan->mode) {
+	case L2CAP_MODE_BASIC:
+		/* If socket recv buffers overflows we drop data here
+		 * which is *bad* because L2CAP has to be reliable.
+		 * But we don't have any other choice. L2CAP doesn't
+		 * provide flow control mechanism. */
+
+		if (chan->imtu < skb->len)
+			goto drop;
+
+		/* recv returning 0 means the skb was taken over */
+		if (!chan->ops->recv(chan->data, skb))
+			goto done;
+		break;
+
+	case L2CAP_MODE_ERTM:
+		if (!sock_owned_by_user(sk)) {
+			l2cap_ertm_data_rcv(sk, skb);
+		} else {
+			if (sk_add_backlog(sk, skb))
+				goto drop;
+		}
+
+		goto done;
+
+	case L2CAP_MODE_STREAMING:
+		control = get_unaligned_le16(skb->data);
+		skb_pull(skb, 2);
+		len = skb->len;
+
+		if (l2cap_check_fcs(chan, skb))
+			goto drop;
+
+		/* len was taken before the FCS was trimmed: subtract the SDU
+		 * length field and the FCS to get the payload size */
+		if (__is_sar_start(control))
+			len -= 2;
+
+		if (chan->fcs == L2CAP_FCS_CRC16)
+			len -= 2;
+
+		if (len > chan->mps || len < 0 || __is_sframe(control))
+			goto drop;
+
+		tx_seq = __get_txseq(control);
+
+		/* lost frames are not recovered: resynchronise on tx_seq */
+		if (chan->expected_tx_seq == tx_seq)
+			chan->expected_tx_seq = (chan->expected_tx_seq + 1) % 64;
+		else
+			chan->expected_tx_seq = (tx_seq + 1) % 64;
+
+		l2cap_streaming_reassembly_sdu(chan, skb, control);
+
+		goto done;
+
+	default:
+		BT_DBG("chan %p: bad mode 0x%2.2x", chan, chan->mode);
+		break;
+	}
+
+drop:
+	kfree_skb(skb);
+
+done:
+	if (sk)
+		bh_unlock_sock(sk);
+
+	return 0;
+}
+
+/*
+ * Deliver a frame received on the connectionless channel to the channel
+ * registered for @psm on the local address.  The frame is freed when no
+ * such channel exists, the channel is neither bound nor connected, the
+ * frame exceeds the channel's incoming MTU, or recv() refuses it.
+ * Always returns 0.
+ */
+static inline int l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm, struct sk_buff *skb)
+{
+	struct l2cap_chan *chan = l2cap_global_chan_by_psm(0, psm, conn->src);
+	struct sock *sk;
+
+	if (!chan) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = chan->sk;
+
+	bh_lock_sock(sk);
+
+	BT_DBG("sk %p, len %d", sk, skb->len);
+
+	/* recv() returning 0 means it took ownership of the skb */
+	if ((chan->state == BT_BOUND || chan->state == BT_CONNECTED) &&
+			chan->imtu >= skb->len &&
+			!chan->ops->recv(chan->data, skb))
+		goto unlock;
+
+	kfree_skb(skb);
+
+unlock:
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Deliver a frame received on a fixed channel to the channel registered
+ * for @cid on the local address.  The frame is freed when no such
+ * channel exists, the channel is neither bound nor connected, the frame
+ * exceeds the channel's incoming MTU, or recv() refuses it.
+ * Always returns 0.
+ */
+static inline int l2cap_att_channel(struct l2cap_conn *conn, __le16 cid, struct sk_buff *skb)
+{
+	struct l2cap_chan *chan = l2cap_global_chan_by_scid(0, cid, conn->src);
+	struct sock *sk;
+
+	if (!chan) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	sk = chan->sk;
+
+	bh_lock_sock(sk);
+
+	BT_DBG("sk %p, len %d", sk, skb->len);
+
+	/* recv() returning 0 means it took ownership of the skb */
+	if ((chan->state == BT_BOUND || chan->state == BT_CONNECTED) &&
+			chan->imtu >= skb->len &&
+			!chan->ops->recv(chan->data, skb))
+		goto unlock;
+
+	kfree_skb(skb);
+
+unlock:
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Entry point for one complete L2CAP frame: strip the basic header,
+ * check that the length field matches the remaining data and dispatch
+ * on the channel id.  A frame whose length does not match is freed.
+ */
+static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	/* lh keeps pointing at the header after it has been pulled */
+	struct l2cap_hdr *lh = (void *) skb->data;
+	u16 cid, len;
+	__le16 psm;
+
+	skb_pull(skb, L2CAP_HDR_SIZE);
+	cid = __le16_to_cpu(lh->cid);
+	len = __le16_to_cpu(lh->len);
+
+	if (len != skb->len) {
+		kfree_skb(skb);
+		return;
+	}
+
+	BT_DBG("len %d, cid 0x%4.4x", len, cid);
+
+	switch (cid) {
+	case L2CAP_CID_LE_SIGNALING:
+	case L2CAP_CID_SIGNALING:
+		l2cap_sig_channel(conn, skb);
+		break;
+
+	case L2CAP_CID_CONN_LESS:
+		/* the payload starts with the PSM, kept in wire byte order */
+		psm = get_unaligned_le16(skb->data);
+		skb_pull(skb, 2);
+		l2cap_conless_channel(conn, psm, skb);
+		break;
+
+	case L2CAP_CID_LE_DATA:
+		l2cap_att_channel(conn, cid, skb);
+		break;
+
+	case L2CAP_CID_SMP:
+		/* a non-zero result from SMP tears the connection down */
+		if (smp_sig_channel(conn, skb))
+			l2cap_conn_del(conn->hcon, EACCES);
+		break;
+
+	default:
+		l2cap_data_channel(conn, cid, skb);
+		break;
+	}
+}
+
+/* ---- L2CAP interface with lower layer (HCI) ---- */
+
+/*
+ * Incoming connection indication from HCI.
+ *
+ * Scans the global channel list for listening channels and builds a link
+ * mode mask: HCI_LM_ACCEPT if any listener exists, plus HCI_LM_MASTER if
+ * such a listener has role_switch set.  Listeners bound to this adapter's
+ * address take precedence over listeners bound to BDADDR_ANY.
+ *
+ * Returns the link mode mask, or -EINVAL for a non-ACL link.
+ */
+static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
+{
+	/* lm1: listeners on this adapter's address, lm2: BDADDR_ANY */
+	int exact = 0, lm1 = 0, lm2 = 0;
+	struct l2cap_chan *c;
+
+	if (type != ACL_LINK)
+		return -EINVAL;
+
+	BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
+
+	/* Find listening sockets and check their link_mode */
+	read_lock(&chan_list_lock);
+	list_for_each_entry(c, &chan_list, global_l) {
+		struct sock *sk = c->sk;
+
+		if (c->state != BT_LISTEN)
+			continue;
+
+		if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) {
+			lm1 |= HCI_LM_ACCEPT;
+			if (c->role_switch)
+				lm1 |= HCI_LM_MASTER;
+			exact++;
+		} else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
+			lm2 |= HCI_LM_ACCEPT;
+			if (c->role_switch)
+				lm2 |= HCI_LM_MASTER;
+		}
+	}
+	read_unlock(&chan_list_lock);
+
+	return exact ? lm1 : lm2;
+}
+
+/*
+ * Connection complete confirmation from HCI.
+ *
+ * On success an L2CAP connection object is added for the link and, if
+ * that worked, marked ready; on failure the L2CAP connection is deleted
+ * with the errno matching @status.
+ *
+ * Returns 0, or -EINVAL for links that are neither ACL nor LE.
+ */
+static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
+{
+	struct l2cap_conn *conn;
+
+	BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status);
+
+	if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+		return -EINVAL;
+
+	if (status) {
+		l2cap_conn_del(hcon, bt_to_errno(status));
+		return 0;
+	}
+
+	conn = l2cap_conn_add(hcon, status);
+	if (conn)
+		l2cap_conn_ready(conn);
+
+	return 0;
+}
+
+/*
+ * HCI callback: report the disconnect reason to use for @hcon.
+ *
+ * Returns the reason recorded on the L2CAP connection, or 0x13 when the link
+ * is neither ACL nor LE or has no L2CAP connection attached.
+ * NOTE(review): 0x13 is presumably the HCI "remote user terminated
+ * connection" code - confirm against the HCI error table.
+ */
+static int l2cap_disconn_ind(struct hci_conn *hcon)
+{
+	struct l2cap_conn *conn = hcon->l2cap_data;
+
+	BT_DBG("hcon %p", hcon);
+
+	if (!conn)
+		return 0x13;
+
+	if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+		return 0x13;
+
+	return conn->disc_reason;
+}
+
+/*
+ * HCI callback: @hcon has been disconnected for @reason.
+ *
+ * Deletes the L2CAP connection attached to the link, passing on the errno
+ * derived from @reason.  Only ACL and LE links are handled.
+ */
+static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
+{
+	int err;
+
+	BT_DBG("hcon %p reason %d", hcon, reason);
+
+	if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+		return -EINVAL;
+
+	err = bt_to_errno(reason);
+	l2cap_conn_del(hcon, err);
+
+	return 0;
+}
+
+/*
+ * React to an encryption change on a connection-oriented channel.
+ *
+ * Medium security: the channel timer is cleared, and re-armed for five
+ * seconds when encryption was switched off.  High security: the channel is
+ * closed with ECONNREFUSED when encryption was switched off.  Other channel
+ * types and security levels are left alone.
+ */
+static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt)
+{
+	if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED)
+		return;
+
+	switch (chan->sec_level) {
+	case BT_SECURITY_MEDIUM:
+		__clear_chan_timer(chan);
+		if (!encrypt)
+			__set_chan_timer(chan, HZ * 5);
+		break;
+
+	case BT_SECURITY_HIGH:
+		if (!encrypt)
+			l2cap_chan_close(chan, ECONNREFUSED);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * HCI callback: a security (authentication/encryption) procedure finished.
+ *
+ * @hcon:    HCI connection the event refers to
+ * @status:  zero is treated as success, anything else as failure
+ * @encrypt: zero is treated as "link not encrypted"
+ *
+ * Walks every channel of the attached L2CAP connection under conn->chan_lock,
+ * locking each channel's socket in turn, and advances or aborts the channel
+ * depending on its state.  Always returns 0.
+ */
+static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
+{
+	struct l2cap_conn *conn = hcon->l2cap_data;
+	struct l2cap_chan *chan;
+
+	if (!conn)
+		return 0;
+
+	BT_DBG("conn %p", conn);
+
+	read_lock(&conn->chan_lock);
+
+	list_for_each_entry(chan, &conn->chan_l, list) {
+		struct sock *sk = chan->sk;
+
+		bh_lock_sock(sk);
+
+		BT_DBG("chan->scid %d", chan->scid);
+
+		/* LE data channel: becomes ready once the link is encrypted. */
+		if (chan->scid == L2CAP_CID_LE_DATA) {
+			if (!status && encrypt) {
+				chan->sec_level = hcon->sec_level;
+				del_timer(&conn->security_timer);
+				l2cap_chan_ready(sk);
+				smp_distribute_keys(conn, 0);
+			}
+
+			bh_unlock_sock(sk);
+			continue;
+		}
+
+		/* A connect request is already outstanding; nothing to do. */
+		if (test_bit(CONF_CONNECT_PEND, &chan->conf_state)) {
+			bh_unlock_sock(sk);
+			continue;
+		}
+
+		/* Established or configuring channel: only re-check encryption. */
+		if (!status && (chan->state == BT_CONNECTED ||
+						chan->state == BT_CONFIG)) {
+			l2cap_check_encryption(chan, encrypt);
+			bh_unlock_sock(sk);
+			continue;
+		}
+
+		if (chan->state == BT_CONNECT) {
+			/* Outgoing channel that was waiting for security. */
+			if (!status) {
+				struct l2cap_conn_req req;
+				req.scid = cpu_to_le16(chan->scid);
+				req.psm  = chan->psm;
+
+				chan->ident = l2cap_get_ident(conn);
+				set_bit(CONF_CONNECT_PEND, &chan->conf_state);
+
+				l2cap_send_cmd(conn, chan->ident,
+					L2CAP_CONN_REQ, sizeof(req), &req);
+			} else {
+				/* Security failed: re-arm the channel timer with a short timeout. */
+				__clear_chan_timer(chan);
+				__set_chan_timer(chan, HZ / 10);
+			}
+		} else if (chan->state == BT_CONNECT2) {
+			/* Incoming channel: answer the peer's connect request. */
+			struct l2cap_conn_rsp rsp;
+			__u16 res, stat;
+
+			if (!status) {
+				if (bt_sk(sk)->defer_setup) {
+					/* Leave the decision to userspace; wake the listener. */
+					struct sock *parent = bt_sk(sk)->parent;
+					res = L2CAP_CR_PEND;
+					stat = L2CAP_CS_AUTHOR_PEND;
+					if (parent)
+						parent->sk_data_ready(parent, 0);
+				} else {
+					l2cap_state_change(chan, BT_CONFIG);
+					res = L2CAP_CR_SUCCESS;
+					stat = L2CAP_CS_NO_INFO;
+				}
+			} else {
+				/* Security failed: refuse and schedule the disconnect. */
+				l2cap_state_change(chan, BT_DISCONN);
+				__set_chan_timer(chan, HZ / 10);
+				res = L2CAP_CR_SEC_BLOCK;
+				stat = L2CAP_CS_NO_INFO;
+			}
+
+			rsp.scid   = cpu_to_le16(chan->dcid);
+			rsp.dcid   = cpu_to_le16(chan->scid);
+			rsp.result = cpu_to_le16(res);
+			rsp.status = cpu_to_le16(stat);
+			l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_RSP,
+							sizeof(rsp), &rsp);
+		}
+
+		bh_unlock_sock(sk);
+	}
+
+	read_unlock(&conn->chan_lock);
+
+	return 0;
+}
+
+/*
+ * HCI callback: reassemble ACL fragments into complete L2CAP frames.
+ *
+ * @hcon:  HCI connection the fragment arrived on
+ * @skb:   ACL payload fragment
+ * @flags: ACL packet flags; ACL_CONT marks a continuation fragment
+ *
+ * A start fragment that already holds the whole frame is passed straight to
+ * l2cap_recv_frame().  Otherwise fragments are copied into conn->rx_skb until
+ * conn->rx_len drops to zero, at which point the reassembled frame is
+ * delivered.  Every path except the direct delivery frees @skb.  Always
+ * returns 0.
+ */
+static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+{
+	struct l2cap_conn *conn = hcon->l2cap_data;
+
+	/* Data may arrive before an L2CAP connection object exists. */
+	if (!conn)
+		conn = l2cap_conn_add(hcon, 0);
+
+	if (!conn)
+		goto drop;
+
+	BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags);
+
+	if (!(flags & ACL_CONT)) {
+		struct l2cap_hdr *hdr;
+		struct l2cap_chan *chan;
+		u16 cid;
+		int len;
+
+		/* A new start while reassembling: discard the partial frame. */
+		if (conn->rx_len) {
+			BT_ERR("Unexpected start frame (len %d)", skb->len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+			l2cap_conn_unreliable(conn, ECOMM);
+		}
+
+		/* Start fragment always begin with Basic L2CAP header */
+		if (skb->len < L2CAP_HDR_SIZE) {
+			BT_ERR("Frame is too short (len %d)", skb->len);
+			l2cap_conn_unreliable(conn, ECOMM);
+			goto drop;
+		}
+
+		hdr = (struct l2cap_hdr *) skb->data;
+		/* Total frame length, header included. */
+		len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
+		cid = __le16_to_cpu(hdr->cid);
+
+		if (len == skb->len) {
+			/* Complete frame received */
+			l2cap_recv_frame(conn, skb);
+			return 0;
+		}
+
+		BT_DBG("Start: total len %d, frag len %d", len, skb->len);
+
+		if (skb->len > len) {
+			BT_ERR("Frame is too long (len %d, expected len %d)",
+				skb->len, len);
+			l2cap_conn_unreliable(conn, ECOMM);
+			goto drop;
+		}
+
+		/*
+		 * NOTE(review): the lookup presumably returns with the socket
+		 * locked, hence the unlocks on both paths below - confirm
+		 * against l2cap_get_chan_by_scid().
+		 */
+		chan = l2cap_get_chan_by_scid(conn, cid);
+
+		if (chan && chan->sk) {
+			struct sock *sk = chan->sk;
+
+			/* Reject early instead of buffering an oversized frame. */
+			if (chan->imtu < len - L2CAP_HDR_SIZE) {
+				BT_ERR("Frame exceeding recv MTU (len %d, "
+							"MTU %d)", len,
+							chan->imtu);
+				bh_unlock_sock(sk);
+				l2cap_conn_unreliable(conn, ECOMM);
+				goto drop;
+			}
+			bh_unlock_sock(sk);
+		}
+
+		/* Allocate skb for the complete frame (with header) */
+		conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC);
+		if (!conn->rx_skb)
+			goto drop;
+
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+								skb->len);
+		/* Bytes still missing from the frame. */
+		conn->rx_len = len - skb->len;
+	} else {
+		BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
+
+		if (!conn->rx_len) {
+			BT_ERR("Unexpected continuation frame (len %d)", skb->len);
+			l2cap_conn_unreliable(conn, ECOMM);
+			goto drop;
+		}
+
+		/* Fragment would overrun the reassembly buffer: abort it. */
+		if (skb->len > conn->rx_len) {
+			BT_ERR("Fragment is too long (len %d, expected %d)",
+					skb->len, conn->rx_len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+			l2cap_conn_unreliable(conn, ECOMM);
+			goto drop;
+		}
+
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+								skb->len);
+		conn->rx_len -= skb->len;
+
+		if (!conn->rx_len) {
+			/* Complete frame received */
+			l2cap_recv_frame(conn, conn->rx_skb);
+			conn->rx_skb = NULL;
+		}
+	}
+
+	/* The fragment has been copied (or rejected); release it. */
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * seq_file show routine: print one line per channel on the global list with
+ * source and destination addresses, state, PSM, CIDs, MTUs, security level
+ * and mode.  The list is walked under chan_list_lock.
+ */
+static int l2cap_debugfs_show(struct seq_file *f, void *p)
+{
+	struct l2cap_chan *chan;
+
+	read_lock_bh(&chan_list_lock);
+
+	list_for_each_entry(chan, &chan_list, global_l) {
+		struct bt_sock *bsk = bt_sk(chan->sk);
+
+		seq_printf(f, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d %d\n",
+			   batostr(&bsk->src), batostr(&bsk->dst),
+			   chan->state, __le16_to_cpu(chan->psm),
+			   chan->scid, chan->dcid,
+			   chan->imtu, chan->omtu,
+			   chan->sec_level, chan->mode);
+	}
+
+	read_unlock_bh(&chan_list_lock);
+
+	return 0;
+}
+
+/* Open handler: bind the file to l2cap_debugfs_show() via single_open(). */
+static int l2cap_debugfs_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	return single_open(file, l2cap_debugfs_show, priv);
+}
+
+/* File operations for the "l2cap" debugfs entry (seq_file, single-shot). */
+static const struct file_operations l2cap_debugfs_fops = {
+	.open		= l2cap_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Dentry of the debugfs file created in l2cap_init(), removed in l2cap_exit(). */
+static struct dentry *l2cap_debugfs;
+
+/* Callback table through which the HCI core drives the L2CAP layer. */
+static struct hci_proto l2cap_hci_proto = {
+	.name		= "L2CAP",
+	.id		= HCI_PROTO_L2CAP,
+	.connect_ind	= l2cap_connect_ind,
+	.connect_cfm	= l2cap_connect_cfm,
+	.disconn_ind	= l2cap_disconn_ind,
+	.disconn_cfm	= l2cap_disconn_cfm,
+	.security_cfm	= l2cap_security_cfm,
+	.recv_acldata	= l2cap_recv_acldata
+};
+
+/*
+ * Bring up the L2CAP layer: socket interface, HCI protocol hooks and the
+ * optional debugfs entry.
+ *
+ * Returns 0 on success or the negative error of the step that failed.  A
+ * failure to create the debugfs file is only logged.
+ */
+int __init l2cap_init(void)
+{
+	int err;
+
+	err = l2cap_init_sockets();
+	if (err < 0)
+		return err;
+
+	err = hci_register_proto(&l2cap_hci_proto);
+	if (err < 0) {
+		BT_ERR("L2CAP protocol registration failed");
+		/*
+		 * No bt_sock_unregister() here: l2cap_cleanup_sockets() below
+		 * already unregisters the socket family, so doing it twice
+		 * would be redundant.
+		 */
+		goto error;
+	}
+
+	if (bt_debugfs) {
+		l2cap_debugfs = debugfs_create_file("l2cap", 0444,
+					bt_debugfs, NULL, &l2cap_debugfs_fops);
+		if (!l2cap_debugfs)
+			BT_ERR("Failed to create L2CAP debug file");
+	}
+
+	return 0;
+
+error:
+	l2cap_cleanup_sockets();
+	return err;
+}
+
+/*
+ * Tear down the L2CAP layer: debugfs entry, then the HCI protocol hooks,
+ * then the socket interface.  A failed HCI unregistration is only logged.
+ */
+void l2cap_exit(void)
+{
+	int err;
+
+	debugfs_remove(l2cap_debugfs);
+
+	err = hci_unregister_proto(&l2cap_hci_proto);
+	if (err < 0)
+		BT_ERR("L2CAP protocol unregistration failed");
+
+	l2cap_cleanup_sockets();
+}
+
+/* Expose disable_ertm as a boolean module parameter with mode 0644. */
+module_param(disable_ertm, bool, 0644);
+MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode");
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
new file mode 100644
index 00000000..61f1f623
--- /dev/null
+++ b/net/bluetooth/l2cap_sock.c
@@ -0,0 +1,1094 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+   Copyright (C) 2010 Google Inc.
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth L2CAP sockets. */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/smp.h>
+
+/* Forward declarations: these are used before their definitions below. */
+static const struct proto_ops l2cap_sock_ops;
+static void l2cap_sock_init(struct sock *sk, struct sock *parent);
+static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio);
+
+/*
+ * bind() handler: attach a local address and either a PSM or a fixed CID to
+ * the socket's channel.
+ *
+ * @sock: socket being bound
+ * @addr: caller-supplied address, expected to be a struct sockaddr_l2
+ * @alen: number of valid bytes in @addr
+ *
+ * Returns 0 on success, -EINVAL for a malformed address or PSM, -EBADFD when
+ * the socket is not in BT_OPEN, -EACCES for a PSM below 0x1001 without
+ * CAP_NET_BIND_SERVICE, or the error from registering the PSM/CID.
+ */
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct sockaddr_l2 la;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	/* sa_family is only meaningful when the caller supplied it. */
+	if (!addr || alen < sizeof(addr->sa_family) ||
+	    addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	/* Short addresses are zero-extended, long ones truncated. */
+	memset(&la, 0, sizeof(la));
+	len = min_t(unsigned int, sizeof(la), alen);
+	memcpy(&la, addr, len);
+
+	/* A socket binds to a PSM or to a fixed CID, never both. */
+	if (la.l2_cid && la.l2_psm)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_OPEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (la.l2_psm) {
+		__u16 psm = __le16_to_cpu(la.l2_psm);
+
+		/* PSM must be odd and lsb of upper byte must be 0 */
+		if ((psm & 0x0101) != 0x0001) {
+			err = -EINVAL;
+			goto done;
+		}
+
+		/* Restrict usage of well-known PSMs */
+		if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) {
+			err = -EACCES;
+			goto done;
+		}
+	}
+
+	if (la.l2_cid)
+		err = l2cap_add_scid(chan, la.l2_cid);
+	else
+		err = l2cap_add_psm(chan, &la.l2_bdaddr, la.l2_psm);
+
+	if (err < 0)
+		goto done;
+
+	/* PSMs 0x0001 and 0x0003 get the dedicated SDP security level. */
+	if (__le16_to_cpu(la.l2_psm) == 0x0001 ||
+				__le16_to_cpu(la.l2_psm) == 0x0003)
+		chan->sec_level = BT_SECURITY_SDP;
+
+	bacpy(&bt_sk(sk)->src, &la.l2_bdaddr);
+
+	chan->state = BT_BOUND;
+	sk->sk_state = BT_BOUND;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * connect() handler: start (or wait for) an outgoing L2CAP connection.
+ *
+ * @sock:  socket to connect
+ * @addr:  destination, expected to be a struct sockaddr_l2
+ * @alen:  number of valid bytes in @addr
+ * @flags: file flags; O_NONBLOCK selects a non-blocking wait
+ *
+ * Returns 0 once the socket reaches BT_CONNECTED, or a negative errno:
+ * -EINVAL for a malformed address/PSM, -ENOTSUPP for an unsupported channel
+ * mode, -EISCONN when already connected, -EBADFD for an unsuitable socket
+ * state, or whatever l2cap_chan_connect()/bt_sock_wait_state() report.
+ */
+static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct sockaddr_l2 la;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (!addr || alen < sizeof(addr->sa_family) ||
+	    addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	/* Short addresses are zero-extended, long ones truncated. */
+	memset(&la, 0, sizeof(la));
+	len = min_t(unsigned int, sizeof(la), alen);
+	memcpy(&la, addr, len);
+
+	/* The destination is a PSM or a fixed CID, never both. */
+	if (la.l2_cid && la.l2_psm)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	/* Connection-oriented channels need one of the two. */
+	if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED
+			&& !(la.l2_psm || la.l2_cid)) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	/* ERTM and streaming mode are refused when disable_ertm is set. */
+	switch (chan->mode) {
+	case L2CAP_MODE_BASIC:
+		break;
+	case L2CAP_MODE_ERTM:
+	case L2CAP_MODE_STREAMING:
+		if (!disable_ertm)
+			break;
+		/* fall through */
+	default:
+		err = -ENOTSUPP;
+		goto done;
+	}
+
+	switch (sk->sk_state) {
+	case BT_CONNECT:
+	case BT_CONNECT2:
+	case BT_CONFIG:
+		/* Already connecting */
+		goto wait;
+
+	case BT_CONNECTED:
+		/* Already connected */
+		err = -EISCONN;
+		goto done;
+
+	case BT_OPEN:
+	case BT_BOUND:
+		/* Can connect */
+		break;
+
+	default:
+		err = -EBADFD;
+		goto done;
+	}
+
+	/* PSM must be odd and lsb of upper byte must be 0 */
+	if ((__le16_to_cpu(la.l2_psm) & 0x0101) != 0x0001 && !la.l2_cid &&
+					chan->chan_type != L2CAP_CHAN_RAW) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	/* Set destination address and psm */
+	bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
+	chan->psm = la.l2_psm;
+	chan->dcid = la.l2_cid;
+
+	err = l2cap_chan_connect(l2cap_pi(sk)->chan);
+	if (err)
+		goto done;
+
+	/* Wait for BT_CONNECTED, bounded by the socket's send timeout. */
+wait:
+	err = bt_sock_wait_state(sk, BT_CONNECTED,
+			sock_sndtimeo(sk, flags & O_NONBLOCK));
+done:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * listen() handler: move a bound SEQPACKET/STREAM socket to BT_LISTEN.
+ *
+ * Returns -EBADFD for the wrong socket type or state, -ENOTSUPP when the
+ * channel mode is unknown or is ERTM/streaming while disable_ertm is set,
+ * and 0 on success.
+ */
+static int l2cap_sock_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	int err = 0;
+
+	BT_DBG("sk %p backlog %d", sk, backlog);
+
+	lock_sock(sk);
+
+	if (sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) {
+		err = -EBADFD;
+		goto unlock;
+	}
+
+	if (sk->sk_state != BT_BOUND) {
+		err = -EBADFD;
+		goto unlock;
+	}
+
+	if (chan->mode != L2CAP_MODE_BASIC) {
+		bool ertm_like = chan->mode == L2CAP_MODE_ERTM ||
+				 chan->mode == L2CAP_MODE_STREAMING;
+
+		if (!ertm_like || disable_ertm) {
+			err = -ENOTSUPP;
+			goto unlock;
+		}
+	}
+
+	sk->sk_ack_backlog = 0;
+	sk->sk_max_ack_backlog = backlog;
+
+	chan->state = BT_LISTEN;
+	sk->sk_state = BT_LISTEN;
+
+unlock:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * accept() handler: wait for and dequeue one incoming connection.
+ *
+ * @sock:    listening socket
+ * @newsock: socket object the accepted connection is attached to
+ * @flags:   file flags; O_NONBLOCK makes the call return -EAGAIN at once
+ *
+ * Returns 0 on success, -EBADFD when the socket is not listening, -EAGAIN
+ * when the receive timeout expires, or the errno for an interrupting signal.
+ */
+static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct sock *sk = sock->sk, *nsk;
+	long timeo;
+	int err = 0;
+
+	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+	BT_DBG("sk %p timeo %ld", sk, timeo);
+
+	/* Wait for an incoming connection. (wake-one). */
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (1) {
+		/* Mark the task sleeping before checking, so no wakeup is lost. */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (sk->sk_state != BT_LISTEN) {
+			err = -EBADFD;
+			break;
+		}
+
+		nsk = bt_accept_dequeue(sk, newsock);
+		if (nsk)
+			break;
+
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+
+		/* Drop the socket lock while sleeping, retake it afterwards. */
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (err)
+		goto done;
+
+	newsock->state = SS_CONNECTED;
+
+	BT_DBG("new socket %p", nsk);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * getsockname()/getpeername() handler.
+ *
+ * @sock: socket being queried
+ * @addr: output buffer, filled in as a struct sockaddr_l2
+ * @len:  set to sizeof(struct sockaddr_l2)
+ * @peer: non-zero for the remote address, zero for the local one
+ *
+ * Always returns 0.
+ */
+static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+{
+	struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	/*
+	 * Clear the whole structure first so that padding and any field not
+	 * assigned below cannot carry stale kernel memory out to the caller.
+	 */
+	memset(la, 0, sizeof(struct sockaddr_l2));
+	addr->sa_family = AF_BLUETOOTH;
+	*len = sizeof(struct sockaddr_l2);
+
+	if (peer) {
+		la->l2_psm = chan->psm;
+		bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst);
+		la->l2_cid = cpu_to_le16(chan->dcid);
+	} else {
+		la->l2_psm = chan->sport;
+		bacpy(&la->l2_bdaddr, &bt_sk(sk)->src);
+		la->l2_cid = cpu_to_le16(chan->scid);
+	}
+
+	return 0;
+}
+
+/*
+ * getsockopt() handler for the legacy SOL_L2CAP level.
+ *
+ * Supports L2CAP_OPTIONS, L2CAP_LM and L2CAP_CONNINFO; any other option
+ * yields -ENOPROTOOPT.  Structured results are truncated to the length the
+ * caller passed in.  Returns 0 on success, -EFAULT when user memory cannot
+ * be accessed, or -ENOTCONN for L2CAP_CONNINFO on an unconnected socket.
+ */
+static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct l2cap_conninfo info;
+	struct l2cap_options o;
+	int len, err = 0;
+	u32 lm;
+
+	BT_DBG("sk %p", sk);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	if (optname == L2CAP_OPTIONS) {
+		memset(&o, 0, sizeof(o));
+		o.imtu       = chan->imtu;
+		o.omtu       = chan->omtu;
+		o.flush_to   = chan->flush_to;
+		o.mode       = chan->mode;
+		o.fcs        = chan->fcs;
+		o.max_tx     = chan->max_tx;
+		o.txwin_size = (__u16)chan->tx_win;
+
+		len = min_t(unsigned int, len, sizeof(o));
+		if (copy_to_user(optval, (char *) &o, len))
+			err = -EFAULT;
+	} else if (optname == L2CAP_LM) {
+		/* Translate the security level into legacy link-mode bits. */
+		if (chan->sec_level == BT_SECURITY_LOW)
+			lm = L2CAP_LM_AUTH;
+		else if (chan->sec_level == BT_SECURITY_MEDIUM)
+			lm = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT;
+		else if (chan->sec_level == BT_SECURITY_HIGH)
+			lm = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
+			     L2CAP_LM_SECURE;
+		else
+			lm = 0;
+
+		if (chan->role_switch)
+			lm |= L2CAP_LM_MASTER;
+
+		if (chan->force_reliable)
+			lm |= L2CAP_LM_RELIABLE;
+
+		if (put_user(lm, (u32 __user *) optval))
+			err = -EFAULT;
+	} else if (optname == L2CAP_CONNINFO) {
+		/* Valid once connected, or while a deferred setup is pending. */
+		if (sk->sk_state != BT_CONNECTED &&
+		    !(sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup)) {
+			err = -ENOTCONN;
+		} else {
+			memset(&info, 0, sizeof(info));
+			info.hci_handle = chan->conn->hcon->handle;
+			memcpy(info.dev_class, chan->conn->hcon->dev_class, 3);
+
+			len = min_t(unsigned int, len, sizeof(info));
+			if (copy_to_user(optval, (char *) &info, len))
+				err = -EFAULT;
+		}
+	} else {
+		err = -ENOPROTOOPT;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * getsockopt() entry point.
+ *
+ * SOL_L2CAP requests are forwarded to the legacy handler; SOL_BLUETOOTH
+ * supports BT_SECURITY, BT_DEFER_SETUP, BT_FLUSHABLE and BT_POWER.  Any
+ * other level or option yields -ENOPROTOOPT.  Returns 0 on success, -EFAULT
+ * when user memory cannot be accessed, or -EINVAL when the option does not
+ * apply to this socket.
+ */
+static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct bt_security security;
+	struct bt_power power;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (level == SOL_L2CAP)
+		return l2cap_sock_getsockopt_old(sock, optname, optval, optlen);
+
+	if (level != SOL_BLUETOOTH)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_SECURITY:
+		if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+		    chan->chan_type != L2CAP_CHAN_RAW) {
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		memset(&security, 0, sizeof(security));
+		security.level = chan->sec_level;
+
+		/* The key size is only reported for a connected socket. */
+		if (sk->sk_state == BT_CONNECTED)
+			security.key_size = chan->conn->hcon->enc_key_size;
+
+		len = min_t(unsigned int, len, sizeof(security));
+		if (copy_to_user(optval, (char *) &security, len))
+			err = -EFAULT;
+		goto unlock;
+
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
+			err = -EFAULT;
+		goto unlock;
+
+	case BT_FLUSHABLE:
+		if (put_user(chan->flushable, (u32 __user *) optval))
+			err = -EFAULT;
+		goto unlock;
+
+	case BT_POWER:
+		if (sk->sk_type != SOCK_SEQPACKET &&
+		    sk->sk_type != SOCK_STREAM &&
+		    sk->sk_type != SOCK_RAW) {
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		power.force_active = chan->force_active;
+
+		len = min_t(unsigned int, len, sizeof(power));
+		if (copy_to_user(optval, (char *) &power, len))
+			err = -EFAULT;
+		goto unlock;
+
+	default:
+		err = -ENOPROTOOPT;
+		goto unlock;
+	}
+
+unlock:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * setsockopt() handler for the legacy SOL_L2CAP level.
+ *
+ * Supports L2CAP_OPTIONS (not on a connected socket) and L2CAP_LM; any other
+ * option yields -ENOPROTOOPT.  Returns 0 on success, -EFAULT when user
+ * memory cannot be read, or -EINVAL for rejected option values.  A rejected
+ * L2CAP_OPTIONS request leaves the channel configuration untouched.
+ */
+static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct l2cap_options opts;
+	int len, err = 0;
+	u32 opt;
+
+	BT_DBG("sk %p", sk);
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case L2CAP_OPTIONS:
+		if (sk->sk_state == BT_CONNECTED) {
+			err = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Start from the current settings so that a short user buffer
+		 * only overrides the leading fields it actually covers.
+		 */
+		opts.imtu     = chan->imtu;
+		opts.omtu     = chan->omtu;
+		opts.flush_to = chan->flush_to;
+		opts.mode     = chan->mode;
+		opts.fcs      = chan->fcs;
+		opts.max_tx   = chan->max_tx;
+		opts.txwin_size = (__u16)chan->tx_win;
+
+		len = min_t(unsigned int, sizeof(opts), optlen);
+		if (copy_from_user((char *) &opts, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (opts.txwin_size > L2CAP_DEFAULT_TX_WINDOW) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Validate the requested mode before committing anything. */
+		switch (opts.mode) {
+		case L2CAP_MODE_BASIC:
+			break;
+		case L2CAP_MODE_ERTM:
+		case L2CAP_MODE_STREAMING:
+			if (!disable_ertm)
+				break;
+			/* fall through */
+		default:
+			err = -EINVAL;
+			break;
+		}
+
+		if (err)
+			break;
+
+		chan->mode = opts.mode;
+		if (chan->mode == L2CAP_MODE_BASIC)
+			clear_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+
+		chan->imtu = opts.imtu;
+		chan->omtu = opts.omtu;
+		chan->fcs  = opts.fcs;
+		chan->max_tx = opts.max_tx;
+		chan->tx_win = (__u8)opts.txwin_size;
+		break;
+
+	case L2CAP_LM:
+		if (get_user(opt, (u32 __user *) optval)) {
+			err = -EFAULT;
+			break;
+		}
+
+		/* The strongest requested link-mode bit wins. */
+		if (opt & L2CAP_LM_AUTH)
+			chan->sec_level = BT_SECURITY_LOW;
+		if (opt & L2CAP_LM_ENCRYPT)
+			chan->sec_level = BT_SECURITY_MEDIUM;
+		if (opt & L2CAP_LM_SECURE)
+			chan->sec_level = BT_SECURITY_HIGH;
+
+		chan->role_switch    = (opt & L2CAP_LM_MASTER);
+		chan->force_reliable = (opt & L2CAP_LM_RELIABLE);
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * setsockopt() entry point.
+ *
+ * SOL_L2CAP requests are forwarded to the legacy handler; SOL_BLUETOOTH
+ * supports BT_SECURITY, BT_DEFER_SETUP, BT_FLUSHABLE and BT_POWER.  Any
+ * other level or option yields -ENOPROTOOPT.  Returns 0 on success, -EFAULT
+ * when user memory cannot be read, or -EINVAL when the value or the socket
+ * state/type does not allow the option.
+ */
+static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	struct bt_security sec;
+	struct bt_power pwr;
+	struct l2cap_conn *conn;
+	int len, err = 0;
+	u32 opt;
+
+	BT_DBG("sk %p", sk);
+
+	if (level == SOL_L2CAP)
+		return l2cap_sock_setsockopt_old(sock, optname, optval, optlen);
+
+	if (level != SOL_BLUETOOTH)
+		return -ENOPROTOOPT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_SECURITY:
+		if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+					chan->chan_type != L2CAP_CHAN_RAW) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Default used when the caller's buffer is too short. */
+		sec.level = BT_SECURITY_LOW;
+
+		len = min_t(unsigned int, sizeof(sec), optlen);
+		if (copy_from_user((char *) &sec, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (sec.level < BT_SECURITY_LOW ||
+					sec.level > BT_SECURITY_HIGH) {
+			err = -EINVAL;
+			break;
+		}
+
+		chan->sec_level = sec.level;
+
+		/* On an existing LE data channel, start SMP security now. */
+		conn = chan->conn;
+		if (conn && chan->scid == L2CAP_CID_LE_DATA) {
+			if (!conn->hcon->out) {
+				err = -EINVAL;
+				break;
+			}
+
+			/* Non-zero result: no state change is made here. */
+			if (smp_conn_security(conn, sec.level))
+				break;
+
+			err = 0;
+			sk->sk_state = BT_CONFIG;
+		}
+		break;
+
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (get_user(opt, (u32 __user *) optval)) {
+			err = -EFAULT;
+			break;
+		}
+
+		bt_sk(sk)->defer_setup = opt;
+		break;
+
+	case BT_FLUSHABLE:
+		if (get_user(opt, (u32 __user *) optval)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (opt > BT_FLUSHABLE_ON) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (opt == BT_FLUSHABLE_OFF) {
+			/* Note: this local shadows the function-scope conn. */
+			struct l2cap_conn *conn = chan->conn;
+			/* proceed further only when we have l2cap_conn and
+			   No Flush support in the LM */
+			if (!conn || !lmp_no_flush_capable(conn->hcon->hdev)) {
+				err = -EINVAL;
+				break;
+			}
+		}
+
+		chan->flushable = opt;
+		break;
+
+	case BT_POWER:
+		if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED &&
+					chan->chan_type != L2CAP_CHAN_RAW) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Default used when the caller's buffer is too short. */
+		pwr.force_active = BT_POWER_FORCE_ACTIVE_ON;
+
+		len = min_t(unsigned int, sizeof(pwr), optlen);
+		if (copy_from_user((char *) &pwr, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+		chan->force_active = pwr.force_active;
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * sendmsg() handler: transmit @len bytes from @msg on a connected socket.
+ *
+ * Returns a pending socket error if there is one, -EOPNOTSUPP for MSG_OOB,
+ * -ENOTCONN when the socket is not in BT_CONNECTED, and otherwise whatever
+ * l2cap_chan_send() returns.
+ */
+static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+	int ret;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	ret = sock_error(sk);
+	if (ret)
+		return ret;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == BT_CONNECTED)
+		ret = l2cap_chan_send(chan, msg, len);
+	else
+		ret = -ENOTCONN;
+
+	release_sock(sk);
+	return ret;
+}
+
+/*
+ * recvmsg() handler.
+ *
+ * The first read on a socket whose setup was deferred (BT_CONNECT2 with
+ * defer_setup) moves it to BT_CONFIG, sends the deferred connect response
+ * and returns 0 without data.  Otherwise data is read through the generic
+ * Bluetooth receive helpers.  For ERTM channels in the local-busy condition
+ * the parked skb, if any, is then queued and data flow is resumed once at
+ * most half of the receive buffer is in use.
+ */
+static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
+	int ret;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) {
+		sk->sk_state = BT_CONFIG;
+
+		__l2cap_connect_rsp_defer(pi->chan);
+		release_sock(sk);
+		return 0;
+	}
+
+	release_sock(sk);
+
+	if (sock->type != SOCK_STREAM)
+		ret = bt_sock_recvmsg(iocb, sock, msg, len, flags);
+	else
+		ret = bt_sock_stream_recvmsg(iocb, sock, msg, len, flags);
+
+	if (pi->chan->mode != L2CAP_MODE_ERTM)
+		return ret;
+
+	/* Try to move rx data that was held back into the socket buffer. */
+
+	lock_sock(sk);
+
+	if (!test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state))
+		goto unlock;
+
+	if (pi->rx_busy_skb) {
+		/* Still no room for the parked skb: stay busy. */
+		if (sock_queue_rcv_skb(sk, pi->rx_busy_skb))
+			goto unlock;
+
+		pi->rx_busy_skb = NULL;
+	}
+
+	/*
+	 * Only leave the busy condition when at least half of the receive
+	 * buffer is free again, so the peer does not end up resending large
+	 * numbers of frames.
+	 */
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
+		l2cap_chan_busy(pi->chan, 0);
+
+unlock:
+	release_sock(sk);
+	return ret;
+}
+
+/*
+ * Destroy a socket, but only when it is both zapped and orphaned (no
+ * struct socket attached any more).
+ * The caller must not hold the socket lock.
+ */
+static void l2cap_sock_kill(struct sock *sk)
+{
+	if (sk->sk_socket || !sock_flag(sk, SOCK_ZAPPED))
+		return;
+
+	BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+	/* Nobody owns it any more: release the channel and our reference. */
+
+	l2cap_chan_destroy(l2cap_pi(sk)->chan);
+	sock_set_flag(sk, SOCK_DEAD);
+	sock_put(sk);
+}
+
+/*
+ * shutdown() handler: close the channel behind @sock.
+ *
+ * @sock: socket to shut down; a socket without a struct sock is a no-op
+ * @how:  unused, both directions are always shut down
+ *
+ * For ERTM channels outstanding acknowledgements are waited for first.  When
+ * SO_LINGER is set with a non-zero linger time, the call waits up to that
+ * time for the socket to reach BT_CLOSED.  Returns 0 or a negative errno.
+ */
+static int l2cap_sock_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct l2cap_chan *chan;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	/* Only look at the private data once sk is known to be valid. */
+	chan = l2cap_pi(sk)->chan;
+
+	lock_sock(sk);
+	if (!sk->sk_shutdown) {
+		if (chan->mode == L2CAP_MODE_ERTM)
+			err = __l2cap_wait_ack(sk);
+
+		sk->sk_shutdown = SHUTDOWN_MASK;
+		l2cap_chan_close(chan, 0);
+
+		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+			err = bt_sock_wait_state(sk, BT_CLOSED,
+							sk->sk_lingertime);
+	}
+
+	/* Report a pending socket error unless we already have one. */
+	if (!err && sk->sk_err)
+		err = -sk->sk_err;
+
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * release() handler: shut the socket down in both directions, detach it
+ * from its struct socket and destroy it if nothing else keeps it alive.
+ * Returns the result of the shutdown step.
+ */
+static int l2cap_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	int ret = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (sk) {
+		ret = l2cap_sock_shutdown(sock, 2);
+
+		sock_orphan(sk);
+		l2cap_sock_kill(sk);
+	}
+
+	return ret;
+}
+
+/*
+ * Channel callback: create a child socket for an incoming connection.
+ *
+ * @data: the parent (listening) socket
+ *
+ * Returns the channel of the newly allocated socket, initialised from the
+ * parent, or NULL when the allocation fails.
+ */
+static struct l2cap_chan *l2cap_sock_new_connection_cb(void *data)
+{
+	struct sock *parent = data;
+	struct sock *child;
+
+	child = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP,
+				 GFP_ATOMIC);
+	if (child == NULL)
+		return NULL;
+
+	l2cap_sock_init(child, parent);
+
+	return l2cap_pi(child)->chan;
+}
+
+/*
+ * Channel callback: deliver a received skb to the socket's receive queue.
+ *
+ * @data: the socket
+ * @skb:  payload to queue
+ *
+ * Returns -ENOMEM while an earlier skb is still parked, 0 when the skb was
+ * queued or parked, or the error from sock_queue_rcv_skb() otherwise.
+ */
+static int l2cap_sock_recv_cb(void *data, struct sk_buff *skb)
+{
+	struct sock *sk = data;
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
+	int ret;
+
+	if (pi->rx_busy_skb)
+		return -ENOMEM;
+
+	ret = sock_queue_rcv_skb(sk, skb);
+	if (ret >= 0 || pi->chan->mode != L2CAP_MODE_ERTM)
+		return ret;
+
+	/*
+	 * ERTM only: the frames making up this skb have already been
+	 * acknowledged, so it must not be dropped.  Park the one skb that
+	 * did not fit and tell the core the receive buffer is full, which
+	 * puts the channel into LOCAL_BUSY so that nothing further is acked
+	 * or reassembled until space becomes available.
+	 */
+	pi->rx_busy_skb = skb;
+	l2cap_chan_busy(pi->chan, 1);
+
+	return 0;
+}
+
+/* Channel callback: the channel was closed; @data is its socket. */
+static void l2cap_sock_close_cb(void *data)
+{
+	l2cap_sock_kill(data);
+}
+
+/* Channel callback: mirror the new channel state into the socket in @data. */
+static void l2cap_sock_state_change_cb(void *data, int state)
+{
+	((struct sock *) data)->sk_state = state;
+}
+
+/* Callbacks the L2CAP core invokes on channels owned by the socket layer. */
+static struct l2cap_ops l2cap_chan_ops = {
+	.name		= "L2CAP Socket Interface",
+	.new_connection	= l2cap_sock_new_connection_cb,
+	.recv		= l2cap_sock_recv_cb,
+	.close		= l2cap_sock_close_cb,
+	.state_change	= l2cap_sock_state_change_cb,
+};
+
+/*
+ * sk_destruct hook: free the parked rx skb, if any, and drop everything
+ * still sitting on the receive and write queues.
+ */
+static void l2cap_sock_destruct(struct sock *sk)
+{
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
+
+	BT_DBG("sk %p", sk);
+
+	if (pi->rx_busy_skb) {
+		kfree_skb(pi->rx_busy_skb);
+		pi->rx_busy_skb = NULL;
+	}
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+}
+
+/*
+ * Initialise the channel behind @sk.
+ *
+ * With a @parent the settings are inherited from the parent's channel (used
+ * for sockets created for incoming connections); without one the defaults
+ * for the socket type are applied.  In both cases the flush timeout is reset
+ * to its default and the channel is wired up to the socket callbacks.
+ */
+static void l2cap_sock_init(struct sock *sk, struct sock *parent)
+{
+	struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+
+	BT_DBG("sk %p", sk);
+
+	if (!parent) {
+		/* Fresh socket: pick the channel type from the socket type. */
+		if (sk->sk_type == SOCK_RAW)
+			chan->chan_type = L2CAP_CHAN_RAW;
+		else if (sk->sk_type == SOCK_DGRAM)
+			chan->chan_type = L2CAP_CHAN_CONN_LESS;
+		else if (sk->sk_type == SOCK_SEQPACKET ||
+			 sk->sk_type == SOCK_STREAM)
+			chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
+
+		chan->imtu = L2CAP_DEFAULT_MTU;
+		chan->omtu = 0;
+
+		/* Stream sockets default to ERTM unless it is disabled. */
+		if (sk->sk_type == SOCK_STREAM && !disable_ertm) {
+			chan->mode = L2CAP_MODE_ERTM;
+			set_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+		} else {
+			chan->mode = L2CAP_MODE_BASIC;
+		}
+
+		chan->max_tx = L2CAP_DEFAULT_MAX_TX;
+		chan->fcs  = L2CAP_FCS_CRC16;
+		chan->tx_win = L2CAP_DEFAULT_TX_WINDOW;
+		chan->sec_level = BT_SECURITY_LOW;
+		chan->role_switch = 0;
+		chan->force_reliable = 0;
+		chan->flushable = BT_FLUSHABLE_OFF;
+		chan->force_active = BT_POWER_FORCE_ACTIVE_ON;
+	} else {
+		struct l2cap_chan *from = l2cap_pi(parent)->chan;
+
+		sk->sk_type = parent->sk_type;
+		bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup;
+
+		chan->chan_type = from->chan_type;
+		chan->imtu = from->imtu;
+		chan->omtu = from->omtu;
+		chan->conf_state = from->conf_state;
+		chan->mode = from->mode;
+		chan->fcs  = from->fcs;
+		chan->max_tx = from->max_tx;
+		chan->tx_win = from->tx_win;
+		chan->sec_level = from->sec_level;
+		chan->role_switch = from->role_switch;
+		chan->force_reliable = from->force_reliable;
+		chan->flushable = from->flushable;
+		chan->force_active = from->force_active;
+	}
+
+	/* Settings that do not depend on where the socket came from. */
+	chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
+
+	chan->data = sk;
+	chan->ops = &l2cap_chan_ops;
+}
+
+/* Protocol descriptor; obj_size makes sk_alloc() reserve a struct l2cap_pinfo. */
+static struct proto l2cap_proto = {
+	.name		= "L2CAP",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct l2cap_pinfo)
+};
+
+/*
+ * Allocate an L2CAP socket together with its channel.
+ *
+ * @net:   network namespace
+ * @sock:  struct socket to attach to, or NULL for kernel-created children
+ * @proto: protocol number stored in the socket
+ * @prio:  allocation flags
+ *
+ * Returns the new socket in BT_OPEN state, or NULL when either the socket
+ * or the channel cannot be allocated.
+ */
+static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+{
+	struct sock *sk;
+	struct l2cap_chan *chan;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk);
+	INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+	sk->sk_destruct = l2cap_sock_destruct;
+	sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	sk->sk_protocol = proto;
+	sk->sk_state = BT_OPEN;
+
+	chan = l2cap_chan_create(sk);
+	if (!chan) {
+		/*
+		 * l2cap_sock_kill() would do nothing here: SOCK_ZAPPED was
+		 * just cleared and no channel exists yet.  Free the socket
+		 * directly and do not leave a stale pointer behind in @sock.
+		 */
+		sk_free(sk);
+		if (sock)
+			sock->sk = NULL;
+		return NULL;
+	}
+
+	l2cap_pi(sk)->chan = chan;
+
+	return sk;
+}
+
+/*
+ * socket() handler for BTPROTO_L2CAP.
+ *
+ * Accepts SEQPACKET, STREAM, DGRAM and RAW sockets; RAW requires
+ * CAP_NET_RAW unless the socket is created by the kernel.  Returns 0,
+ * -ESOCKTNOSUPPORT, -EPERM or -ENOMEM.
+ */
+static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
+			     int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+
+	sock->state = SS_UNCONNECTED;
+
+	switch (sock->type) {
+	case SOCK_SEQPACKET:
+	case SOCK_STREAM:
+	case SOCK_DGRAM:
+		break;
+
+	case SOCK_RAW:
+		if (!kern && !capable(CAP_NET_RAW))
+			return -EPERM;
+		break;
+
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	sock->ops = &l2cap_sock_ops;
+
+	sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	l2cap_sock_init(sk, NULL);
+
+	return 0;
+}
+
+/*
+ * Socket operations for L2CAP sockets.  poll and ioctl use the generic
+ * Bluetooth helpers; mmap and socketpair are not supported.
+ */
+static const struct proto_ops l2cap_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= l2cap_sock_release,
+	.bind		= l2cap_sock_bind,
+	.connect	= l2cap_sock_connect,
+	.listen		= l2cap_sock_listen,
+	.accept		= l2cap_sock_accept,
+	.getname	= l2cap_sock_getname,
+	.sendmsg	= l2cap_sock_sendmsg,
+	.recvmsg	= l2cap_sock_recvmsg,
+	.poll		= bt_sock_poll,
+	.ioctl		= bt_sock_ioctl,
+	.mmap		= sock_no_mmap,
+	.socketpair	= sock_no_socketpair,
+	.shutdown	= l2cap_sock_shutdown,
+	.setsockopt	= l2cap_sock_setsockopt,
+	.getsockopt	= l2cap_sock_getsockopt
+};
+
+/* Family entry registered with bt_sock_register() for BTPROTO_L2CAP. */
+static const struct net_proto_family l2cap_sock_family_ops = {
+	.family	= PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create	= l2cap_sock_create,
+};
+
+/*
+ * Register the L2CAP protocol descriptor and the BTPROTO_L2CAP socket
+ * family.  Returns 0 on success; on failure everything registered so far is
+ * undone and the negative error is returned.
+ */
+int __init l2cap_init_sockets(void)
+{
+	int ret;
+
+	ret = proto_register(&l2cap_proto, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops);
+	if (ret < 0) {
+		BT_ERR("L2CAP socket registration failed");
+		proto_unregister(&l2cap_proto);
+		return ret;
+	}
+
+	BT_INFO("L2CAP socket layer initialized");
+
+	return 0;
+}
+
+/*
+ * Undo l2cap_init_sockets(): unregister the socket family (a failure is
+ * only logged) and then the protocol descriptor.
+ */
+void l2cap_cleanup_sockets(void)
+{
+	int ret = bt_sock_unregister(BTPROTO_L2CAP);
+
+	if (ret < 0)
+		BT_ERR("L2CAP socket unregistration failed");
+
+	proto_unregister(&l2cap_proto);
+}
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
new file mode 100644
index 00000000..86a6bed2
--- /dev/null
+++ b/net/bluetooth/lib.c
@@ -0,0 +1,171 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth kernel library. */
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <asm/errno.h>
+
+#include <net/bluetooth/bluetooth.h>
+
+/*
+ * Copy the six address bytes of @src into @dst in reverse order.
+ * Bytes are written from dst[0] upwards, reading src[5] downwards.
+ */
+void baswap(bdaddr_t *dst, bdaddr_t *src)
+{
+	unsigned char *out = (unsigned char *) dst;
+	unsigned char *in = (unsigned char *) src + 6;
+	unsigned char *end = out + 6;
+
+	while (out < end)
+		*out++ = *--in;
+}
+EXPORT_SYMBOL(baswap);
+
+/*
+ * Format @ba as "XX:XX:XX:XX:XX:XX", most significant byte (b[5]) first.
+ *
+ * The result lives in one of two static buffers that are used alternately,
+ * so only the two most recent results remain valid and the function is not
+ * reentrant.
+ */
+char *batostr(bdaddr_t *ba)
+{
+	static char str[2][18];
+	static int i = 1;
+	char *out;
+
+	/* Flip to the buffer that was not handed out last time */
+	i ^= 1;
+	out = str[i];
+
+	sprintf(out, "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X",
+		ba->b[5], ba->b[4], ba->b[3],
+		ba->b[2], ba->b[1], ba->b[0]);
+
+	return out;
+}
+EXPORT_SYMBOL(batostr);
+
+/*
+ * Bluetooth error codes to Unix errno mapping.
+ *
+ * Returns 0 for code 0, the (positive) errno value for every mapped code
+ * and ENOSYS for any code without a mapping.
+ */
+int bt_to_errno(__u16 code)
+{
+	/* Indexed by Bluetooth status code; a zero entry means "unmapped". */
+	static const int errno_map[] = {
+		[0x01] = EBADRQC,
+		[0x02] = ENOTCONN,
+		[0x03] = EIO,
+		[0x04] = EHOSTDOWN,
+		[0x05] = EACCES,
+		[0x06] = EBADE,
+		[0x07] = ENOMEM,
+		[0x08] = ETIMEDOUT,
+		[0x09] = EMLINK,
+		[0x0a] = EMLINK,
+		[0x0b] = EALREADY,
+		[0x0c] = EBUSY,
+		[0x0d] = ECONNREFUSED,
+		[0x0e] = ECONNREFUSED,
+		[0x0f] = ECONNREFUSED,
+		[0x10] = ETIMEDOUT,
+		[0x11] = EOPNOTSUPP,
+		[0x12] = EINVAL,
+		[0x13] = ECONNRESET,
+		[0x14] = ECONNRESET,
+		[0x15] = ECONNRESET,
+		[0x16] = ECONNABORTED,
+		[0x17] = ELOOP,
+		[0x18] = EACCES,
+		[0x19] = EPROTO,
+		[0x1a] = EPROTONOSUPPORT,
+		[0x1b] = ECONNREFUSED,
+		[0x1e] = EPROTO,
+		[0x20] = EOPNOTSUPP,
+		[0x23] = EPROTO,
+		[0x24] = EPROTO,
+		[0x25] = EPROTO,
+		[0x27] = EOPNOTSUPP,
+		[0x29] = EOPNOTSUPP,
+	};
+
+	if (code == 0)
+		return 0;
+
+	if (code < ARRAY_SIZE(errno_map) && errno_map[code] != 0)
+		return errno_map[code];
+
+	return ENOSYS;
+}
+EXPORT_SYMBOL(bt_to_errno);
+
+/*
+ * printk() wrapper that prefixes the message with @level and "Bluetooth: "
+ * and appends a newline. @format and its arguments are forwarded through
+ * the %pV extension. Returns whatever printk() returns.
+ */
+int bt_printk(const char *level, const char *format, ...)
+{
+	struct va_format vaf;
+	va_list ap;
+	int ret;
+
+	vaf.fmt = format;
+
+	va_start(ap, format);
+	vaf.va = &ap;
+	ret = printk("%sBluetooth: %pV\n", level, &vaf);
+	va_end(ap);
+
+	return ret;
+}
+EXPORT_SYMBOL(bt_printk);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
new file mode 100644
index 00000000..98327213
--- /dev/null
+++ b/net/bluetooth/mgmt.c
@@ -0,0 +1,2288 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2010  Nokia Corporation
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI Management interface */
+
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#define MGMT_VERSION	0
+#define MGMT_REVISION	1
+
+/*
+ * A management command that has been accepted but not yet answered.
+ * Instances are created by mgmt_pending_add() and kept on cmd_list.
+ */
+struct pending_cmd {
+	struct list_head list;	/* link in cmd_list */
+	__u16 opcode;		/* management opcode of the command */
+	int index;		/* controller index the command targets */
+	void *param;		/* kmalloc'ed copy of the command parameters */
+	struct sock *sk;	/* requesting socket; a reference is held */
+	void *user_data;	/* not set by mgmt_pending_add(); find_pairing()
+				 * compares it against a struct hci_conn */
+};
+
+static LIST_HEAD(cmd_list);
+
+/*
+ * Queue a MGMT_EV_CMD_STATUS event carrying @status for command @cmd on
+ * controller @index to socket @sk.
+ *
+ * Returns -ENOMEM if the skb cannot be allocated and 0 otherwise; a failure
+ * to queue the skb only drops the event.
+ */
+static int cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
+{
+	struct mgmt_ev_cmd_status *ev;
+	struct mgmt_hdr *hdr;
+	struct sk_buff *skb;
+
+	BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status);
+
+	skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	/* Header first ... */
+	hdr = (void *) skb_put(skb, sizeof(*hdr));
+	hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS);
+	hdr->index = cpu_to_le16(index);
+	hdr->len = cpu_to_le16(sizeof(*ev));
+
+	/* ... then the event payload */
+	ev = (void *) skb_put(skb, sizeof(*ev));
+	put_unaligned_le16(cmd, &ev->opcode);
+	ev->status = status;
+
+	if (sock_queue_rcv_skb(sk, skb) < 0)
+		kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Queue a MGMT_EV_CMD_COMPLETE event for command @cmd on controller @index
+ * to socket @sk, followed by @rp_len bytes of return parameters copied from
+ * @rp (nothing is copied when @rp is NULL).
+ *
+ * Returns -ENOMEM if the skb cannot be allocated and 0 otherwise; a failure
+ * to queue the skb only drops the event.
+ */
+static int cmd_complete(struct sock *sk, u16 index, u16 cmd, void *rp,
+								size_t rp_len)
+{
+	struct mgmt_ev_cmd_complete *ev;
+	struct mgmt_hdr *hdr;
+	struct sk_buff *skb;
+
+	BT_DBG("sock %p", sk);
+
+	skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	hdr = (void *) skb_put(skb, sizeof(*hdr));
+	hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE);
+	hdr->index = cpu_to_le16(index);
+	hdr->len = cpu_to_le16(sizeof(*ev) + rp_len);
+
+	/* Event header and return parameters share one contiguous area */
+	ev = (void *) skb_put(skb, sizeof(*ev) + rp_len);
+	put_unaligned_le16(cmd, &ev->opcode);
+	if (rp != NULL)
+		memcpy(ev->data, rp, rp_len);
+
+	if (sock_queue_rcv_skb(sk, skb) < 0)
+		kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Answer MGMT_OP_READ_VERSION with MGMT_VERSION / MGMT_REVISION.
+ * Returns the result of cmd_complete().
+ */
+static int read_version(struct sock *sk)
+{
+	struct mgmt_rp_read_version reply;
+
+	BT_DBG("sock %p", sk);
+
+	put_unaligned_le16(MGMT_REVISION, &reply.revision);
+	reply.version = MGMT_VERSION;
+
+	return cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, &reply,
+							sizeof(reply));
+}
+
+/*
+ * Answer MGMT_OP_READ_INDEX_LIST with the ids of all registered controllers
+ * that are not in the HCI_SETUP state.
+ *
+ * As a side effect every registered controller (including those still in
+ * setup) has its off timer deleted and HCI_MGMT set.
+ *
+ * Returns -ENOMEM if the reply buffer cannot be allocated, otherwise the
+ * result of cmd_complete().
+ */
+static int read_index_list(struct sock *sk)
+{
+	struct mgmt_rp_read_index_list *rp;
+	struct list_head *p;
+	size_t rp_len;
+	u16 count;
+	int i, err;
+
+	BT_DBG("sock %p", sk);
+
+	read_lock(&hci_dev_list_lock);
+
+	/* Upper bound for the allocation: every registered controller */
+	count = 0;
+	list_for_each(p, &hci_dev_list) {
+		count++;
+	}
+
+	rp_len = sizeof(*rp) + (2 * count);
+	rp = kmalloc(rp_len, GFP_ATOMIC);
+	if (!rp) {
+		read_unlock(&hci_dev_list_lock);
+		return -ENOMEM;
+	}
+
+	i = 0;
+	list_for_each(p, &hci_dev_list) {
+		struct hci_dev *d = list_entry(p, struct hci_dev, list);
+
+		hci_del_off_timer(d);
+
+		set_bit(HCI_MGMT, &d->flags);
+
+		if (test_bit(HCI_SETUP, &d->flags))
+			continue;
+
+		put_unaligned_le16(d->id, &rp->index[i++]);
+		BT_DBG("Added hci%u", d->id);
+	}
+
+	/*
+	 * Controllers in HCI_SETUP were skipped above, so report only the
+	 * entries actually written. Using the raw list count here would
+	 * overstate num_controllers and send uninitialised heap bytes.
+	 */
+	put_unaligned_le16(i, &rp->num_controllers);
+	rp_len = sizeof(*rp) + (2 * i);
+
+	read_unlock(&hci_dev_list_lock);
+
+	err = cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_INDEX_LIST, rp,
+									rp_len);
+
+	kfree(rp);
+
+	return err;
+}
+
+/*
+ * Answer MGMT_OP_READ_INFO for controller @index.
+ *
+ * Deletes the controller's off timer, sets HCI_MGMT and reports a snapshot
+ * of its type, mode flags, security mode, address, features, class,
+ * manufacturer, HCI version/revision and name.
+ *
+ * Sends an ENODEV status if the controller does not exist. Returns the
+ * result of cmd_status()/cmd_complete().
+ */
+static int read_controller_info(struct sock *sk, u16 index)
+{
+	struct mgmt_rp_read_info rp;
+	struct hci_dev *hdev;
+
+	BT_DBG("sock %p hci%u", sk, index);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_READ_INFO, ENODEV);
+
+	hci_del_off_timer(hdev);
+
+	hci_dev_lock_bh(hdev);
+
+	set_bit(HCI_MGMT, &hdev->flags);
+
+	memset(&rp, 0, sizeof(rp));
+
+	rp.type = hdev->dev_type;
+
+	rp.powered = test_bit(HCI_UP, &hdev->flags);
+	rp.connectable = test_bit(HCI_PSCAN, &hdev->flags);
+	rp.discoverable = test_bit(HCI_ISCAN, &hdev->flags);
+	/* Pairable state is tracked by HCI_PAIRABLE (see set_pairable()) */
+	rp.pairable = test_bit(HCI_PAIRABLE, &hdev->flags);
+
+	if (test_bit(HCI_AUTH, &hdev->flags))
+		rp.sec_mode = 3;
+	else if (hdev->ssp_mode > 0)
+		rp.sec_mode = 4;
+	else
+		rp.sec_mode = 2;
+
+	bacpy(&rp.bdaddr, &hdev->bdaddr);
+	memcpy(rp.features, hdev->features, 8);
+	memcpy(rp.dev_class, hdev->dev_class, 3);
+	put_unaligned_le16(hdev->manufacturer, &rp.manufacturer);
+	rp.hci_ver = hdev->hci_ver;
+	put_unaligned_le16(hdev->hci_rev, &rp.hci_rev);
+
+	/* NOTE(review): assumes rp.name is at least as large as dev_name */
+	memcpy(rp.name, hdev->dev_name, sizeof(hdev->dev_name));
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return cmd_complete(sk, index, MGMT_OP_READ_INFO, &rp, sizeof(rp));
+}
+
+/*
+ * Release a pending command: free its parameter copy, drop the socket
+ * reference it holds and free the command itself. Does not unlink it from
+ * cmd_list.
+ */
+static void mgmt_pending_free(struct pending_cmd *cmd)
+{
+	kfree(cmd->param);
+	sock_put(cmd->sk);
+	kfree(cmd);
+}
+
+/*
+ * Record a new pending command for @sk on cmd_list.
+ *
+ * A private copy of @len bytes is allocated for the parameters and filled
+ * from @data when @data is non-NULL. A reference on @sk is taken. The
+ * user_data member is left unset for the caller to fill in.
+ *
+ * Returns the new entry, or NULL if either allocation fails.
+ */
+static struct pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+						u16 index, void *data, u16 len)
+{
+	struct pending_cmd *cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC);
+
+	if (cmd == NULL)
+		return NULL;
+
+	cmd->param = kmalloc(len, GFP_ATOMIC);
+	if (cmd->param == NULL) {
+		kfree(cmd);
+		return NULL;
+	}
+
+	if (data != NULL)
+		memcpy(cmd->param, data, len);
+
+	cmd->opcode = opcode;
+	cmd->index = index;
+
+	/* The entry keeps the socket alive until mgmt_pending_free() */
+	cmd->sk = sk;
+	sock_hold(sk);
+
+	list_add(&cmd->list, &cmd_list);
+
+	return cmd;
+}
+
+/*
+ * Invoke @cb(cmd, @data) for every pending command whose opcode is @opcode
+ * and, when @index is non-negative, whose controller index is @index.
+ * The walk tolerates @cb removing the current entry.
+ */
+static void mgmt_pending_foreach(u16 opcode, int index,
+				void (*cb)(struct pending_cmd *cmd, void *data),
+				void *data)
+{
+	struct pending_cmd *cmd, *next;
+
+	list_for_each_entry_safe(cmd, next, &cmd_list, list) {
+		if (cmd->opcode != opcode)
+			continue;
+
+		/* A negative index matches any controller */
+		if (index >= 0 && cmd->index != index)
+			continue;
+
+		cb(cmd, data);
+	}
+}
+
+/*
+ * Return the first pending command with opcode @opcode and, when @index is
+ * non-negative, controller index @index; NULL if there is none.
+ */
+static struct pending_cmd *mgmt_pending_find(u16 opcode, int index)
+{
+	struct pending_cmd *cmd;
+
+	list_for_each_entry(cmd, &cmd_list, list) {
+		if (cmd->opcode != opcode)
+			continue;
+
+		/* A negative index matches any controller */
+		if (index < 0 || cmd->index == index)
+			return cmd;
+	}
+
+	return NULL;
+}
+
+/* Unlink @cmd from cmd_list and release it via mgmt_pending_free(). */
+static void mgmt_pending_remove(struct pending_cmd *cmd)
+{
+	list_del(&cmd->list);
+	mgmt_pending_free(cmd);
+}
+
+/*
+ * Handle MGMT_OP_SET_POWERED for controller @index.
+ *
+ * Status replies: EINVAL for a wrong parameter length, ENODEV for an
+ * unknown controller, EALREADY if the controller is already in the
+ * requested state, EBUSY if a SET_POWERED command is still pending.
+ * Otherwise the command is recorded as pending and the controller's
+ * power_on or power_off work is queued.
+ *
+ * Returns 0, -ENOMEM, or the result of cmd_status().
+ */
+static int set_powered(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct mgmt_mode *cp = (void *) data;
+	struct pending_cmd *cmd;
+	struct hci_dev *hdev;
+	int err = 0;
+	int up;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_POWERED, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_SET_POWERED, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	/* Requested state equals current state: nothing to do */
+	up = test_bit(HCI_UP, &hdev->flags);
+	if (!cp->val == !up) {
+		err = cmd_status(sk, index, MGMT_OP_SET_POWERED, EALREADY);
+		goto unlock;
+	}
+
+	if (mgmt_pending_find(MGMT_OP_SET_POWERED, index) != NULL) {
+		err = cmd_status(sk, index, MGMT_OP_SET_POWERED, EBUSY);
+		goto unlock;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, index, data, len);
+	if (cmd == NULL) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	queue_work(hdev->workqueue,
+			cp->val ? &hdev->power_on : &hdev->power_off);
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_SET_DISCOVERABLE for controller @index.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller),
+ * ENETDOWN (controller not up), EBUSY (a SET_DISCOVERABLE or
+ * SET_CONNECTABLE command is pending), EALREADY (see below).
+ * Otherwise the command is recorded as pending and a Write Scan Enable
+ * command is sent; no reply is generated here in that case.
+ *
+ * Returns -ENOMEM, the result of cmd_status(), or the result of
+ * hci_send_cmd().
+ */
+static int set_discoverable(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_mode *cp;
+	struct hci_dev *hdev;
+	struct pending_cmd *cmd;
+	u8 scan;
+	int err;
+
+	cp = (void *) data;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, ENETDOWN);
+		goto failed;
+	}
+
+	/* Both commands write the same scan-enable setting, so serialise them */
+	if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, index) ||
+			mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, index)) {
+		err = cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, EBUSY);
+		goto failed;
+	}
+
+	/*
+	 * EALREADY only if inquiry scan already matches the request AND page
+	 * scan is on, since the command below always enables page scan.
+	 * NOTE(review): cp->val is compared raw against test_bit()'s result,
+	 * so a val other than 0/1 is unlikely ever to match - confirm intent.
+	 */
+	if (cp->val == test_bit(HCI_ISCAN, &hdev->flags) &&
+					test_bit(HCI_PSCAN, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_SET_DISCOVERABLE, EALREADY);
+		goto failed;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, index, data, len);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	/* Page scan is always requested; inquiry scan only when enabling */
+	scan = SCAN_PAGE;
+
+	if (cp->val)
+		scan |= SCAN_INQUIRY;
+
+	err = hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+	if (err < 0)
+		mgmt_pending_remove(cmd);
+
+failed:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_SET_CONNECTABLE for controller @index.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller),
+ * ENETDOWN (controller not up), EBUSY (a SET_DISCOVERABLE or
+ * SET_CONNECTABLE command is pending), EALREADY (page scan already in the
+ * requested state). Otherwise the command is recorded as pending and a
+ * Write Scan Enable command is sent; no reply is generated here in that
+ * case.
+ *
+ * Returns -ENOMEM, the result of cmd_status(), or the result of
+ * hci_send_cmd().
+ */
+static int set_connectable(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_mode *cp;
+	struct hci_dev *hdev;
+	struct pending_cmd *cmd;
+	u8 scan;
+	int err;
+
+	cp = (void *) data;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, ENETDOWN);
+		goto failed;
+	}
+
+	/* Both commands write the same scan-enable setting, so serialise them */
+	if (mgmt_pending_find(MGMT_OP_SET_DISCOVERABLE, index) ||
+			mgmt_pending_find(MGMT_OP_SET_CONNECTABLE, index)) {
+		err = cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, EBUSY);
+		goto failed;
+	}
+
+	/* NOTE(review): raw cp->val is compared with test_bit()'s result */
+	if (cp->val == test_bit(HCI_PSCAN, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_SET_CONNECTABLE, EALREADY);
+		goto failed;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, index, data, len);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	/* Disabling connectable writes 0, which also turns inquiry scan off */
+	if (cp->val)
+		scan = SCAN_PAGE;
+	else
+		scan = 0;
+
+	err = hci_send_cmd(hdev, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+	if (err < 0)
+		mgmt_pending_remove(cmd);
+
+failed:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Build a management event @event for controller @index with @data_len
+ * bytes of payload copied from @data (skipped when @data is NULL) and pass
+ * it to hci_send_to_sock() on the control channel, excluding @skip_sk.
+ *
+ * Returns -ENOMEM if the skb cannot be allocated, 0 otherwise.
+ */
+static int mgmt_event(u16 event, u16 index, void *data, u16 data_len,
+							struct sock *skip_sk)
+{
+	struct mgmt_hdr *hdr;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(*hdr) + data_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	bt_cb(skb)->channel = HCI_CHANNEL_CONTROL;
+
+	hdr = (void *) skb_put(skb, sizeof(*hdr));
+	hdr->len = cpu_to_le16(data_len);
+	hdr->index = cpu_to_le16(index);
+	hdr->opcode = cpu_to_le16(event);
+
+	if (data != NULL)
+		memcpy(skb_put(skb, data_len), data, data_len);
+
+	/* Our own skb is released here once it has been handed over */
+	hci_send_to_sock(NULL, skb, skip_sk);
+	kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Complete command @opcode on controller @index towards @sk with a
+ * struct mgmt_mode reply whose val is @val. Returns cmd_complete()'s result.
+ */
+static int send_mode_rsp(struct sock *sk, u16 opcode, u16 index, u8 val)
+{
+	struct mgmt_mode rp;
+
+	rp.val = val;
+
+	return cmd_complete(sk, index, opcode, &rp, sizeof(rp));
+}
+
+/*
+ * Handle MGMT_OP_SET_PAIRABLE for controller @index.
+ *
+ * Sets or clears HCI_PAIRABLE according to the request, completes the
+ * command towards @sk and then sends MGMT_EV_PAIRABLE to the other
+ * management sockets.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller).
+ * Returns the result of the last reply/event call made.
+ */
+static int set_pairable(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_mode *cp = (void *) data;
+	struct mgmt_mode ev;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_PAIRABLE, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_SET_PAIRABLE, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (cp->val)
+		set_bit(HCI_PAIRABLE, &hdev->flags);
+	else
+		clear_bit(HCI_PAIRABLE, &hdev->flags);
+
+	err = send_mode_rsp(sk, MGMT_OP_SET_PAIRABLE, index, cp->val);
+	if (err >= 0) {
+		/* The requester already got its reply; tell everyone else */
+		ev.val = cp->val;
+		err = mgmt_event(MGMT_EV_PAIRABLE, index, &ev, sizeof(ev), sk);
+	}
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+#define EIR_FLAGS		0x01 /* flags */
+#define EIR_UUID16_SOME		0x02 /* 16-bit UUID, more available */
+#define EIR_UUID16_ALL		0x03 /* 16-bit UUID, all listed */
+#define EIR_UUID32_SOME		0x04 /* 32-bit UUID, more available */
+#define EIR_UUID32_ALL		0x05 /* 32-bit UUID, all listed */
+#define EIR_UUID128_SOME	0x06 /* 128-bit UUID, more available */
+#define EIR_UUID128_ALL		0x07 /* 128-bit UUID, all listed */
+#define EIR_NAME_SHORT		0x08 /* shortened local name */
+#define EIR_NAME_COMPLETE	0x09 /* complete local name */
+#define EIR_TX_POWER		0x0A /* transmit power level */
+#define EIR_DEVICE_ID		0x10 /* device ID */
+
+#define PNP_INFO_SVCLASS_ID		0x1200
+
+/*
+ * Base UUID pattern, stored least significant byte first. get_uuid16()
+ * compares the first 12 bytes of a 128-bit UUID against this table and
+ * reads the remaining four bytes as the short UUID value.
+ */
+static u8 bluetooth_base_uuid[] = {
+			0xFB, 0x34, 0x9B, 0x5F, 0x80, 0x00, 0x00, 0x80,
+			0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/*
+ * Extract the 16-bit short form of @uuid128.
+ *
+ * Returns 0 when the first 12 bytes differ from bluetooth_base_uuid or when
+ * the little-endian 32-bit value in bytes 12..15 does not fit in 16 bits;
+ * otherwise returns that value.
+ */
+static u16 get_uuid16(u8 *uuid128)
+{
+	u32 val;
+
+	if (memcmp(uuid128, bluetooth_base_uuid, 12) != 0)
+		return 0;
+
+	val = get_unaligned_le32(&uuid128[12]);
+
+	return val > 0xffff ? 0 : (u16) val;
+}
+
+/*
+ * Build extended inquiry response data for @hdev in @data.
+ *
+ * Writes, in order: a local-name field (if dev_name is non-empty) and a
+ * list of the distinct 16-bit service UUIDs found in hdev->uuids. Each
+ * field is laid out as <length><type><payload>, where length counts the
+ * type byte plus the payload.
+ *
+ * Bytes not written are left untouched; the only caller visible here,
+ * update_eir(), passes a zeroed buffer.
+ * NOTE(review): assumes @data holds at least HCI_MAX_EIR_LENGTH bytes.
+ */
+static void create_eir(struct hci_dev *hdev, u8 *data)
+{
+	u8 *ptr = data;
+	u16 eir_len = 0;
+	u16 uuid16_list[HCI_MAX_EIR_LENGTH / sizeof(u16)];
+	int i, truncated = 0;
+	struct list_head *p;
+	size_t name_len;
+
+	name_len = strlen(hdev->dev_name);
+
+	if (name_len > 0) {
+		/* EIR Data type */
+		if (name_len > 48) {
+			/* Names longer than 48 bytes are cut and marked short */
+			name_len = 48;
+			ptr[1] = EIR_NAME_SHORT;
+		} else
+			ptr[1] = EIR_NAME_COMPLETE;
+
+		/* EIR Data length */
+		ptr[0] = name_len + 1;
+
+		memcpy(ptr + 2, hdev->dev_name, name_len);
+
+		eir_len += (name_len + 2);
+		ptr += (name_len + 2);
+	}
+
+	/* A zero entry terminates the list, so start from all zeroes */
+	memset(uuid16_list, 0, sizeof(uuid16_list));
+
+	/* Group all UUID16 types */
+	list_for_each(p, &hdev->uuids) {
+		struct bt_uuid *uuid = list_entry(p, struct bt_uuid, list);
+		u16 uuid16;
+
+		uuid16 = get_uuid16(uuid->uuid);
+		/*
+		 * A UUID without a 16-bit form aborts the whole function:
+		 * no UUID field is emitted, only the name written above.
+		 * NOTE(review): confirm this early return is intended.
+		 */
+		if (uuid16 == 0)
+			return;
+
+		if (uuid16 < 0x1100)
+			continue;
+
+		if (uuid16 == PNP_INFO_SVCLASS_ID)
+			continue;
+
+		/* Stop if not enough space to put next UUID */
+		if (eir_len + 2 + sizeof(u16) > HCI_MAX_EIR_LENGTH) {
+			truncated = 1;
+			break;
+		}
+
+		/* Check for duplicates */
+		for (i = 0; uuid16_list[i] != 0; i++)
+			if (uuid16_list[i] == uuid16)
+				break;
+
+		/* i stopped on the terminator: not seen before, append it */
+		if (uuid16_list[i] == 0) {
+			uuid16_list[i] = uuid16;
+			eir_len += sizeof(u16);
+		}
+	}
+
+	if (uuid16_list[0] != 0) {
+		/* Remember where the length byte goes; it is filled in last */
+		u8 *length = ptr;
+
+		/* EIR Data type */
+		ptr[1] = truncated ? EIR_UUID16_SOME : EIR_UUID16_ALL;
+
+		ptr += 2;
+		eir_len += 2;
+
+		/* Each UUID is stored low byte first */
+		for (i = 0; uuid16_list[i] != 0; i++) {
+			*ptr++ = (uuid16_list[i] & 0x00ff);
+			*ptr++ = (uuid16_list[i] & 0xff00) >> 8;
+		}
+
+		/* EIR Data length */
+		*length = (i * sizeof(u16)) + 1;
+	}
+}
+
+/*
+ * Regenerate the extended inquiry response for @hdev and send it to the
+ * controller if it changed.
+ *
+ * Nothing is done (and 0 is returned) when the controller lacks
+ * LMP_EXT_INQ, when ssp_mode is 0, when HCI_SERVICE_CACHE is set, or when
+ * the new data equals hdev->eir. Otherwise hdev->eir is updated and the
+ * result of hci_send_cmd() is returned.
+ */
+static int update_eir(struct hci_dev *hdev)
+{
+	struct hci_cp_write_eir cp;
+
+	if (!(hdev->features[6] & LMP_EXT_INQ) || hdev->ssp_mode == 0 ||
+				test_bit(HCI_SERVICE_CACHE, &hdev->flags))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+	create_eir(hdev, cp.data);
+
+	/* Skip the HCI round trip when the data did not change */
+	if (!memcmp(cp.data, hdev->eir, sizeof(cp.data)))
+		return 0;
+
+	memcpy(hdev->eir, cp.data, sizeof(cp.data));
+
+	return hci_send_cmd(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
+}
+
+/* Return the bitwise OR of svc_hint over every entry in hdev->uuids. */
+static u8 get_service_classes(struct hci_dev *hdev)
+{
+	struct bt_uuid *uuid;
+	u8 hints = 0;
+
+	list_for_each_entry(uuid, &hdev->uuids, list)
+		hints |= uuid->svc_hint;
+
+	return hints;
+}
+
+/*
+ * Recompute the class of device for @hdev from its minor class, major
+ * class and accumulated service hints, and send it to the controller if it
+ * differs from hdev->dev_class.
+ *
+ * Returns 0 when HCI_SERVICE_CACHE is set or nothing changed, otherwise the
+ * result of hci_send_cmd().
+ */
+static int update_class(struct hci_dev *hdev)
+{
+	u8 cod[3];
+
+	BT_DBG("%s", hdev->name);
+
+	if (test_bit(HCI_SERVICE_CACHE, &hdev->flags))
+		return 0;
+
+	cod[2] = get_service_classes(hdev);
+	cod[1] = hdev->major_class;
+	cod[0] = hdev->minor_class;
+
+	if (!memcmp(cod, hdev->dev_class, sizeof(cod)))
+		return 0;
+
+	return hci_send_cmd(hdev, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod);
+}
+
+/*
+ * Handle MGMT_OP_ADD_UUID for controller @index.
+ *
+ * Adds a new entry (UUID plus service hint) to hdev->uuids, then refreshes
+ * the class of device and the EIR data and completes the command. The new
+ * entry stays on the list even if one of the refresh steps fails.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller).
+ * Returns -ENOMEM, a negative error from the refresh steps, or the result
+ * of cmd_status()/cmd_complete().
+ */
+static int add_uuid(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct mgmt_cp_add_uuid *cp = (void *) data;
+	struct bt_uuid *entry;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_ADD_UUID, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_ADD_UUID, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	entry->svc_hint = cp->svc_hint;
+	memcpy(entry->uuid, cp->uuid, 16);
+
+	list_add(&entry->list, &hdev->uuids);
+
+	/* Each step runs only if the previous one did not fail */
+	err = update_class(hdev);
+	if (err >= 0)
+		err = update_eir(hdev);
+	if (err >= 0)
+		err = cmd_complete(sk, index, MGMT_OP_ADD_UUID, NULL, 0);
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_REMOVE_UUID for controller @index.
+ *
+ * An all-zero UUID clears the whole list through hci_uuids_clear().
+ * Otherwise every entry matching the given UUID is unlinked and freed,
+ * after which the class of device and EIR data are refreshed and the
+ * command is completed.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller),
+ * ENOENT (no matching entry).
+ * NOTE(review): the clear-all path returns hci_uuids_clear()'s result
+ * without sending any reply or refreshing class/EIR - confirm intent.
+ */
+static int remove_uuid(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct list_head *p, *n;
+	struct mgmt_cp_remove_uuid *cp;
+	struct hci_dev *hdev;
+	u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	int err, found;
+
+	cp = (void *) data;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_REMOVE_UUID, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_REMOVE_UUID, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (memcmp(cp->uuid, bt_uuid_any, 16) == 0) {
+		err = hci_uuids_clear(hdev);
+		goto unlock;
+	}
+
+	found = 0;
+
+	list_for_each_safe(p, n, &hdev->uuids) {
+		struct bt_uuid *match = list_entry(p, struct bt_uuid, list);
+
+		if (memcmp(match->uuid, cp->uuid, 16) != 0)
+			continue;
+
+		/* Entries are kmalloc'ed in add_uuid(); free them when unlinked */
+		list_del(&match->list);
+		kfree(match);
+		found++;
+	}
+
+	if (found == 0) {
+		err = cmd_status(sk, index, MGMT_OP_REMOVE_UUID, ENOENT);
+		goto unlock;
+	}
+
+	err = update_class(hdev);
+	if (err < 0)
+		goto unlock;
+
+	err = update_eir(hdev);
+	if (err < 0)
+		goto unlock;
+
+	err = cmd_complete(sk, index, MGMT_OP_REMOVE_UUID, NULL, 0);
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_SET_DEV_CLASS for controller @index: store the requested
+ * major/minor class, push the resulting class of device through
+ * update_class() and complete the command if that returned 0.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller).
+ */
+static int set_dev_class(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_cp_set_dev_class *cp = (void *) data;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("request for hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_DEV_CLASS, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_SET_DEV_CLASS, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	hdev->minor_class = cp->minor;
+	hdev->major_class = cp->major;
+
+	err = update_class(hdev);
+	if (!err)
+		err = cmd_complete(sk, index, MGMT_OP_SET_DEV_CLASS, NULL, 0);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_SET_SERVICE_CACHE for controller @index.
+ *
+ * Enabling only sets HCI_SERVICE_CACHE (which makes update_class() and
+ * update_eir() no-ops). Disabling clears the flag and then pushes the
+ * current class of device and EIR data. The command is completed when no
+ * step returned non-zero.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller).
+ */
+static int set_service_cache(struct sock *sk, u16 index,  unsigned char *data,
+									u16 len)
+{
+	struct mgmt_cp_set_service_cache *cp = (void *) data;
+	struct hci_dev *hdev;
+	int err = 0;
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_SERVICE_CACHE, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_SET_SERVICE_CACHE, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	BT_DBG("hci%u enable %d", index, cp->enable);
+
+	if (cp->enable) {
+		set_bit(HCI_SERVICE_CACHE, &hdev->flags);
+	} else {
+		clear_bit(HCI_SERVICE_CACHE, &hdev->flags);
+
+		/* Flush whatever was held back while the cache was on */
+		err = update_class(hdev);
+		if (!err)
+			err = update_eir(hdev);
+	}
+
+	if (!err)
+		err = cmd_complete(sk, index, MGMT_OP_SET_SERVICE_CACHE, NULL,
+									0);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_LOAD_KEYS for controller @index.
+ *
+ * The message is a struct mgmt_cp_load_keys followed by a sequence of
+ * variable-length entries, each a struct mgmt_key_info plus dlen bytes of
+ * extra data. The whole sequence is validated against @len before any
+ * state is touched, so a malformed message is rejected with -EINVAL
+ * instead of being parsed past the end of the buffer.
+ *
+ * On success all existing link keys are cleared, HCI_LINK_KEYS is set,
+ * HCI_DEBUG_KEYS follows cp->debug_keys, and each entry is stored either
+ * as a long term key (type HCI_LK_SMP_LTK with a key_master_id payload;
+ * LTK entries with another payload size are skipped) or as a link key.
+ *
+ * Returns -EINVAL for a malformed message, otherwise the result of
+ * cmd_status() (ENODEV) or cmd_complete().
+ */
+static int load_keys(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct hci_dev *hdev;
+	struct mgmt_cp_load_keys *cp;
+	struct mgmt_key_info *key;
+	u16 key_count;
+	size_t expected_len, left;
+	int i, err;
+
+	cp = (void *) data;
+
+	if (len < sizeof(*cp))
+		return -EINVAL;
+
+	key_count = get_unaligned_le16(&cp->key_count);
+
+	/* Computed as size_t so that a large key_count cannot wrap around */
+	expected_len = sizeof(*cp) + key_count * sizeof(struct mgmt_key_info);
+	if (expected_len > len) {
+		BT_ERR("load_keys: expected at least %zu bytes, got %u bytes",
+							expected_len, len);
+		return -EINVAL;
+	}
+
+	/* From here on len is the size of the key area only */
+	len -= sizeof(*cp);
+
+	/*
+	 * Walk the entries once without side effects: every header and its
+	 * dlen bytes of payload must lie inside the received buffer.
+	 */
+	i = 0;
+	while (i < len) {
+		key = (void *) cp->keys + i;
+		left = len - i;
+
+		if (left < sizeof(*key) || key->dlen > left - sizeof(*key)) {
+			BT_ERR("load_keys: malformed key entry at offset %d",
+									i);
+			return -EINVAL;
+		}
+
+		i += sizeof(*key) + key->dlen;
+	}
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_LOAD_KEYS, ENODEV);
+
+	BT_DBG("hci%u debug_keys %u key_count %u", index, cp->debug_keys,
+								key_count);
+
+	hci_dev_lock_bh(hdev);
+
+	hci_link_keys_clear(hdev);
+
+	set_bit(HCI_LINK_KEYS, &hdev->flags);
+
+	if (cp->debug_keys)
+		set_bit(HCI_DEBUG_KEYS, &hdev->flags);
+	else
+		clear_bit(HCI_DEBUG_KEYS, &hdev->flags);
+
+	i = 0;
+
+	while (i < len) {
+		key = (void *) cp->keys + i;
+
+		i += sizeof(*key) + key->dlen;
+
+		if (key->type == HCI_LK_SMP_LTK) {
+			struct key_master_id *id = (void *) key->data;
+
+			/* An LTK entry must carry exactly one key_master_id */
+			if (key->dlen != sizeof(struct key_master_id))
+				continue;
+
+			hci_add_ltk(hdev, 0, &key->bdaddr, key->pin_len,
+						id->ediv, id->rand, key->val);
+
+			continue;
+		}
+
+		hci_add_link_key(hdev, NULL, 0, &key->bdaddr, key->val, key->type,
+								key->pin_len);
+	}
+
+	err = cmd_complete(sk, index, MGMT_OP_LOAD_KEYS, NULL, 0);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_REMOVE_KEY for controller @index.
+ *
+ * Removes the link key stored for cp->bdaddr. If the controller is up and
+ * cp->disconnect is set, an existing ACL connection to that address is
+ * also disconnected.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller), or the
+ * negated error from hci_remove_link_key().
+ * NOTE(review): no reply is sent from here on the success paths - confirm
+ * that this is intended.
+ */
+static int remove_key(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct hci_dev *hdev;
+	struct mgmt_cp_remove_key *cp;
+	struct hci_conn *conn;
+	int err;
+
+	cp = (void *) data;
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_REMOVE_KEY, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_REMOVE_KEY, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	err = hci_remove_link_key(hdev, &cp->bdaddr);
+	if (err < 0) {
+		/* cmd_status() takes a positive errno value */
+		err = cmd_status(sk, index, MGMT_OP_REMOVE_KEY, -err);
+		goto unlock;
+	}
+
+	err = 0;
+
+	if (!test_bit(HCI_UP, &hdev->flags) || !cp->disconnect)
+		goto unlock;
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+	if (conn) {
+		struct hci_cp_disconnect dc;
+
+		put_unaligned_le16(conn->handle, &dc.handle);
+		dc.reason = 0x13; /* Remote User Terminated Connection */
+		err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc);
+	}
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_DISCONNECT for controller @index.
+ *
+ * Looks up an ACL connection to cp->bdaddr, falling back to an LE
+ * connection, records the command as pending and sends an HCI Disconnect
+ * for the connection handle. No reply is generated here when the HCI
+ * command was sent.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller),
+ * ENETDOWN (controller not up), EBUSY (a DISCONNECT is already pending on
+ * this controller), ENOTCONN (no such connection).
+ * Returns -ENOMEM, the result of cmd_status(), or the result of
+ * hci_send_cmd().
+ */
+static int disconnect(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct hci_dev *hdev;
+	struct mgmt_cp_disconnect *cp;
+	struct hci_cp_disconnect dc;
+	struct pending_cmd *cmd;
+	struct hci_conn *conn;
+	int err;
+
+	BT_DBG("");
+
+	cp = (void *) data;
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_DISCONNECT, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_DISCONNECT, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_DISCONNECT, ENETDOWN);
+		goto failed;
+	}
+
+	/* Only one outstanding disconnect per controller */
+	if (mgmt_pending_find(MGMT_OP_DISCONNECT, index)) {
+		err = cmd_status(sk, index, MGMT_OP_DISCONNECT, EBUSY);
+		goto failed;
+	}
+
+	/* Prefer an ACL link, then try LE */
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+	if (!conn)
+		conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->bdaddr);
+
+	if (!conn) {
+		err = cmd_status(sk, index, MGMT_OP_DISCONNECT, ENOTCONN);
+		goto failed;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_DISCONNECT, index, data, len);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	put_unaligned_le16(conn->handle, &dc.handle);
+	dc.reason = 0x13; /* Remote User Terminated Connection */
+
+	/* Drop the pending entry again if the command could not be sent */
+	err = hci_send_cmd(hdev, HCI_OP_DISCONNECT, sizeof(dc), &dc);
+	if (err < 0)
+		mgmt_pending_remove(cmd);
+
+failed:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_GET_CONNECTIONS for controller @index: reply with the
+ * number of entries on hdev->conn_hash.list and the destination address of
+ * each of them.
+ *
+ * Sends an ENODEV status for an unknown controller. Returns -ENOMEM if the
+ * reply buffer cannot be allocated, otherwise the result of
+ * cmd_status()/cmd_complete().
+ */
+static int get_connections(struct sock *sk, u16 index)
+{
+	struct mgmt_rp_get_connections *rp;
+	struct hci_dev *hdev;
+	struct list_head *p;
+	size_t rp_len;
+	u16 count;
+	int i, err;
+
+	BT_DBG("");
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_GET_CONNECTIONS, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	/* First pass: count the entries to size the reply */
+	count = 0;
+	list_for_each(p, &hdev->conn_hash.list) {
+		count++;
+	}
+
+	rp_len = sizeof(*rp) + (count * sizeof(bdaddr_t));
+	rp = kmalloc(rp_len, GFP_ATOMIC);
+	if (!rp) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	put_unaligned_le16(count, &rp->conn_count);
+
+	/* Second pass: copy the addresses; the device lock is still held */
+	i = 0;
+	list_for_each(p, &hdev->conn_hash.list) {
+		struct hci_conn *c = list_entry(p, struct hci_conn, list);
+
+		bacpy(&rp->conn[i++], &c->dst);
+	}
+
+	err = cmd_complete(sk, index, MGMT_OP_GET_CONNECTIONS, rp, rp_len);
+
+unlock:
+	/* rp is NULL here when the allocation failed */
+	kfree(rp);
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+/*
+ * Record a pending MGMT_OP_PIN_CODE_NEG_REPLY for @sk and send the HCI PIN
+ * Code Negative Reply for cp->bdaddr on @hdev. The pending entry is removed
+ * again if sending fails.
+ *
+ * Returns -ENOMEM or the result of hci_send_cmd().
+ */
+static int send_pin_code_neg_reply(struct sock *sk, u16 index,
+		struct hci_dev *hdev, struct mgmt_cp_pin_code_neg_reply *cp)
+{
+	struct pending_cmd *pending;
+	int ret;
+
+	pending = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_NEG_REPLY, index, cp,
+								sizeof(*cp));
+	if (pending == NULL)
+		return -ENOMEM;
+
+	ret = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, sizeof(cp->bdaddr),
+								&cp->bdaddr);
+	if (ret < 0)
+		mgmt_pending_remove(pending);
+
+	return ret;
+}
+
+/*
+ * Handle MGMT_OP_PIN_CODE_REPLY for controller @index.
+ *
+ * Forwards the supplied PIN to the controller for the ACL connection to
+ * cp->bdaddr. If that connection is waiting for BT_SECURITY_HIGH and the
+ * PIN is not 16 bytes long, a negative reply is sent to the controller
+ * instead and the command is answered with an EINVAL status.
+ *
+ * Status replies: EINVAL (bad length or PIN too short), ENODEV (unknown
+ * controller), ENETDOWN (controller not up), ENOTCONN (no ACL connection).
+ * Returns -ENOMEM, the result of cmd_status(), or the result of
+ * hci_send_cmd().
+ */
+static int pin_code_reply(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct hci_dev *hdev;
+	struct hci_conn *conn;
+	struct mgmt_cp_pin_code_reply *cp;
+	struct mgmt_cp_pin_code_neg_reply ncp;
+	struct hci_cp_pin_code_reply reply;
+	struct pending_cmd *cmd;
+	int err;
+
+	BT_DBG("");
+
+	cp = (void *) data;
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, ENETDOWN);
+		goto failed;
+	}
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+	if (!conn) {
+		err = cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY, ENOTCONN);
+		goto failed;
+	}
+
+	/* High security demands a full 16 byte PIN; reject anything else */
+	if (conn->pending_sec_level == BT_SECURITY_HIGH && cp->pin_len != 16) {
+		bacpy(&ncp.bdaddr, &cp->bdaddr);
+
+		BT_ERR("PIN code is not 16 bytes long");
+
+		err = send_pin_code_neg_reply(sk, index, hdev, &ncp);
+		if (err >= 0)
+			err = cmd_status(sk, index, MGMT_OP_PIN_CODE_REPLY,
+								EINVAL);
+
+		goto failed;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_REPLY, index, data, len);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	bacpy(&reply.bdaddr, &cp->bdaddr);
+	reply.pin_len = cp->pin_len;
+	memcpy(reply.pin_code, cp->pin_code, sizeof(reply.pin_code));
+
+	/* Drop the pending entry again if the command could not be sent */
+	err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_REPLY, sizeof(reply), &reply);
+	if (err < 0)
+		mgmt_pending_remove(cmd);
+
+failed:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_PIN_CODE_NEG_REPLY for controller @index by forwarding a
+ * negative PIN reply through send_pin_code_neg_reply().
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller),
+ * ENETDOWN (controller not up).
+ */
+static int pin_code_neg_reply(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_cp_pin_code_neg_reply *cp = (void *) data;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("");
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_PIN_CODE_NEG_REPLY,
+									EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_PIN_CODE_NEG_REPLY,
+									ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (test_bit(HCI_UP, &hdev->flags))
+		err = send_pin_code_neg_reply(sk, index, hdev, cp);
+	else
+		err = cmd_status(sk, index, MGMT_OP_PIN_CODE_NEG_REPLY,
+								ENETDOWN);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_SET_IO_CAPABILITY for controller @index: store the
+ * requested value in hdev->io_capability and complete the command.
+ *
+ * Status replies: EINVAL (bad length), ENODEV (unknown controller).
+ */
+static int set_io_capability(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_cp_set_io_capability *cp = (void *) data;
+	struct hci_dev *hdev;
+
+	BT_DBG("");
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_SET_IO_CAPABILITY, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_SET_IO_CAPABILITY, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	hdev->io_capability = cp->io_capability;
+	BT_DBG("%s IO capability set to 0x%02x", hdev->name,
+							hdev->io_capability);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	/* The reply is sent after the device lock and reference are dropped */
+	return cmd_complete(sk, index, MGMT_OP_SET_IO_CAPABILITY, NULL, 0);
+}
+
+/*
+ * Return the pending MGMT_OP_PAIR_DEVICE command on @conn's controller
+ * whose user_data is @conn, or NULL if there is none.
+ */
+static inline struct pending_cmd *find_pairing(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct pending_cmd *cmd;
+
+	list_for_each_entry(cmd, &cmd_list, list) {
+		if (cmd->opcode == MGMT_OP_PAIR_DEVICE &&
+				cmd->index == hdev->id &&
+				cmd->user_data == conn)
+			return cmd;
+	}
+
+	return NULL;
+}
+
+/*
+ * Finish a pending MGMT_OP_PAIR_DEVICE command.
+ *
+ * Sends the pair-device reply (peer address plus @status) to the socket
+ * that issued the command, detaches the pairing callbacks from the
+ * connection, drops the connection reference with hci_conn_put() and
+ * removes the pending command.
+ */
+static void pairing_complete(struct pending_cmd *cmd, u8 status)
+{
+	struct hci_conn *conn = cmd->user_data;
+	struct mgmt_rp_pair_device reply;
+
+	reply.status = status;
+	bacpy(&reply.bdaddr, &conn->dst);
+
+	cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE, &reply,
+								sizeof(reply));
+
+	/* So we don't get further callbacks for this connection */
+	conn->disconn_cfm_cb = NULL;
+	conn->security_cfm_cb = NULL;
+	conn->connect_cfm_cb = NULL;
+
+	hci_conn_put(conn);
+
+	mgmt_pending_remove(cmd);
+}
+
+/*
+ * Connection callback installed by pair_device(): completes the pending
+ * pair-device command of @conn with @status, or only logs a debug message
+ * when no such command is pending.
+ */
+static void pairing_complete_cb(struct hci_conn *conn, u8 status)
+{
+	struct pending_cmd *pending;
+
+	BT_DBG("status %u", status);
+
+	pending = find_pairing(conn);
+	if (pending) {
+		pairing_complete(pending, status);
+	} else {
+		BT_DBG("Unable to find a pending command");
+	}
+}
+
+/*
+ * Handle MGMT_OP_PAIR_DEVICE.
+ *
+ * Obtains an ACL connection to the requested address through hci_connect()
+ * and registers a pending command that is completed from the connection's
+ * connect/security/disconnect callbacks (pairing_complete_cb).
+ *
+ * Returns 0 once pairing has been started or already completed, the result
+ * of cmd_status() for a malformed request, unknown index or busy
+ * connection, or a negative errno (hci_connect() failure, -ENOMEM).
+ */
+static int pair_device(struct sock *sk, u16 index, unsigned char *data, u16 len)
+{
+	struct hci_dev *hdev;
+	struct mgmt_cp_pair_device *cp;
+	struct pending_cmd *cmd;
+	u8 sec_level, auth_type;
+	struct hci_conn *conn;
+	int err;
+
+	BT_DBG("");
+
+	cp = (void *) data;
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_PAIR_DEVICE, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_PAIR_DEVICE, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	/*
+	 * io_cap 0x03 selects medium security without MITM protection; any
+	 * other value requests high security with MITM.
+	 * NOTE(review): 0x03 is presumably NoInputNoOutput - confirm.
+	 */
+	if (cp->io_cap == 0x03) {
+		sec_level = BT_SECURITY_MEDIUM;
+		auth_type = HCI_AT_DEDICATED_BONDING;
+	} else {
+		sec_level = BT_SECURITY_HIGH;
+		auth_type = HCI_AT_DEDICATED_BONDING_MITM;
+	}
+
+	conn = hci_connect(hdev, ACL_LINK, 0, &cp->bdaddr, sec_level, auth_type);
+	if (IS_ERR(conn)) {
+		err = PTR_ERR(conn);
+		goto unlock;
+	}
+
+	/* A connect callback is already installed on this connection */
+	if (conn->connect_cfm_cb) {
+		hci_conn_put(conn);
+		err = cmd_status(sk, index, MGMT_OP_PAIR_DEVICE, EBUSY);
+		goto unlock;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_PAIR_DEVICE, index, data, len);
+	if (!cmd) {
+		err = -ENOMEM;
+		hci_conn_put(conn);
+		goto unlock;
+	}
+
+	conn->connect_cfm_cb = pairing_complete_cb;
+	conn->security_cfm_cb = pairing_complete_cb;
+	conn->disconn_cfm_cb = pairing_complete_cb;
+	conn->io_capability = cp->io_cap;
+	/* find_pairing() matches the pending command through user_data */
+	cmd->user_data = conn;
+
+	/*
+	 * Already connected and hci_conn_security() reports success: nothing
+	 * is left to wait for, so complete the command right away.
+	 */
+	if (conn->state == BT_CONNECTED &&
+				hci_conn_security(conn, sec_level, auth_type))
+		pairing_complete(cmd, 0);
+
+	err = 0;
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_USER_CONFIRM_REPLY (@success != 0) and
+ * MGMT_OP_USER_CONFIRM_NEG_REPLY (@success == 0).
+ *
+ * Registers a pending command and sends the matching HCI user confirm
+ * (negative) reply carrying the peer address.  The pending command is
+ * removed again if hci_send_cmd() fails.
+ *
+ * Returns the result of cmd_status() for rejected requests, -ENOMEM, or
+ * the result of hci_send_cmd().
+ */
+static int user_confirm_reply(struct sock *sk, u16 index, unsigned char *data,
+							u16 len, int success)
+{
+	struct mgmt_cp_user_confirm_reply *cp = (void *) data;
+	struct pending_cmd *pending;
+	struct hci_dev *hdev;
+	u16 mgmt_op = MGMT_OP_USER_CONFIRM_NEG_REPLY;
+	u16 hci_op = HCI_OP_USER_CONFIRM_NEG_REPLY;
+	int err;
+
+	BT_DBG("");
+
+	if (success) {
+		mgmt_op = MGMT_OP_USER_CONFIRM_REPLY;
+		hci_op = HCI_OP_USER_CONFIRM_REPLY;
+	}
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, mgmt_op, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, mgmt_op, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, mgmt_op, ENETDOWN);
+		goto unlock;
+	}
+
+	pending = mgmt_pending_add(sk, mgmt_op, index, data, len);
+	if (pending == NULL) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	err = hci_send_cmd(hdev, hci_op, sizeof(cp->bdaddr), &cp->bdaddr);
+	if (err < 0)
+		mgmt_pending_remove(pending);
+
+unlock:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_SET_LOCAL_NAME.
+ *
+ * Registers a pending command and sends HCI_OP_WRITE_LOCAL_NAME with the
+ * name copied from the request.  The pending command is removed again if
+ * hci_send_cmd() fails.
+ *
+ * Returns the result of cmd_status() for rejected requests, -ENOMEM, or
+ * the result of hci_send_cmd().
+ */
+static int set_local_name(struct sock *sk, u16 index, unsigned char *data,
+								u16 len)
+{
+	struct mgmt_cp_set_local_name *mgmt_cp = (void *) data;
+	struct hci_cp_write_local_name hci_cp;
+	struct pending_cmd *pending;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("");
+
+	if (len != sizeof(*mgmt_cp))
+		return cmd_status(sk, index, MGMT_OP_SET_LOCAL_NAME, EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_SET_LOCAL_NAME, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	pending = mgmt_pending_add(sk, MGMT_OP_SET_LOCAL_NAME, index, data,
+									len);
+	if (pending) {
+		memcpy(hci_cp.name, mgmt_cp->name, sizeof(hci_cp.name));
+		err = hci_send_cmd(hdev, HCI_OP_WRITE_LOCAL_NAME,
+						sizeof(hci_cp), &hci_cp);
+		if (err < 0)
+			mgmt_pending_remove(pending);
+	} else {
+		err = -ENOMEM;
+	}
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_READ_LOCAL_OOB_DATA.
+ *
+ * The request is rejected through cmd_status() when the index is unknown
+ * (ENODEV), the controller is down (ENETDOWN), the LMP_SIMPLE_PAIR bit is
+ * not set in features[6] (EOPNOTSUPP) or the same command is already
+ * pending for this index (EBUSY).  Otherwise a pending command is
+ * registered and HCI_OP_READ_LOCAL_OOB_DATA is sent; the pending command is
+ * removed again if sending fails.
+ */
+static int read_local_oob_data(struct sock *sk, u16 index)
+{
+	struct pending_cmd *pending;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("hci%u", index);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA,
+									ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA,
+								ENETDOWN);
+	} else if (!(hdev->features[6] & LMP_SIMPLE_PAIR)) {
+		err = cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA,
+								EOPNOTSUPP);
+	} else if (mgmt_pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, index)) {
+		err = cmd_status(sk, index, MGMT_OP_READ_LOCAL_OOB_DATA, EBUSY);
+	} else {
+		pending = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_DATA,
+							index, NULL, 0);
+		if (pending == NULL) {
+			err = -ENOMEM;
+		} else {
+			err = hci_send_cmd(hdev, HCI_OP_READ_LOCAL_OOB_DATA,
+								0, NULL);
+			if (err < 0)
+				mgmt_pending_remove(pending);
+		}
+	}
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_ADD_REMOTE_OOB_DATA.
+ *
+ * Passes the address, hash and randomizer from the request to
+ * hci_add_remote_oob_data() under the device lock.  A negative result is
+ * reported via cmd_status() with the sign flipped; success is answered with
+ * an empty cmd_complete().
+ */
+static int add_remote_oob_data(struct sock *sk, u16 index, unsigned char *data,
+									u16 len)
+{
+	struct mgmt_cp_add_remote_oob_data *cp = (void *) data;
+	struct hci_dev *hdev;
+	int status, ret;
+
+	BT_DBG("hci%u ", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA,
+									EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA,
+									ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	status = hci_add_remote_oob_data(hdev, &cp->bdaddr, cp->hash,
+								cp->randomizer);
+	if (status >= 0)
+		ret = cmd_complete(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA, NULL,
+									0);
+	else
+		ret = cmd_status(sk, index, MGMT_OP_ADD_REMOTE_OOB_DATA,
+								-status);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return ret;
+}
+
+/*
+ * Handle MGMT_OP_REMOVE_REMOTE_OOB_DATA.
+ *
+ * Calls hci_remove_remote_oob_data() for the address in the request under
+ * the device lock.  A negative result is reported via cmd_status() with the
+ * sign flipped; success is answered with an empty cmd_complete().
+ */
+static int remove_remote_oob_data(struct sock *sk, u16 index,
+						unsigned char *data, u16 len)
+{
+	struct mgmt_cp_remove_remote_oob_data *cp = (void *) data;
+	struct hci_dev *hdev;
+	int status, ret;
+
+	BT_DBG("hci%u ", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+									EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+									ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	status = hci_remove_remote_oob_data(hdev, &cp->bdaddr);
+	if (status >= 0)
+		ret = cmd_complete(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+								NULL, 0);
+	else
+		ret = cmd_status(sk, index, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
+								-status);
+
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return ret;
+}
+
+/*
+ * Handle MGMT_OP_START_DISCOVERY: start an HCI Inquiry on the controller.
+ *
+ * The request is rejected with ENODEV for an unknown index and with
+ * ENETDOWN when the controller is not up, matching the other handlers in
+ * this file that send HCI commands.  Otherwise a pending command is
+ * registered and HCI_OP_INQUIRY is sent; the pending command is removed
+ * again if sending fails.
+ *
+ * Returns the result of cmd_status() for rejected requests, -ENOMEM, or
+ * the result of hci_send_cmd().
+ */
+static int start_discovery(struct sock *sk, u16 index)
+{
+	/* General Inquiry Access Code (0x9e8b33), least significant byte first */
+	u8 lap[3] = { 0x33, 0x8b, 0x9e };
+	struct hci_cp_inquiry cp;
+	struct pending_cmd *cmd;
+	struct hci_dev *hdev;
+	int err;
+
+	BT_DBG("hci%u", index);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_START_DISCOVERY, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	/*
+	 * Do not queue an Inquiry on a controller that is down: no completion
+	 * would arrive and the pending command would stay on the list.
+	 */
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_START_DISCOVERY, ENETDOWN);
+		goto failed;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_START_DISCOVERY, index, NULL, 0);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	memset(&cp, 0, sizeof(cp));
+	memcpy(&cp.lap, lap, 3);
+	/* Inquiry length in units of 1.28 s; 0 responses means unlimited */
+	cp.length  = 0x08;
+	cp.num_rsp = 0x00;
+
+	err = hci_send_cmd(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp);
+	if (err < 0)
+		mgmt_pending_remove(cmd);
+
+failed:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_STOP_DISCOVERY: cancel a running HCI Inquiry.
+ *
+ * The request is rejected with ENODEV for an unknown index and with
+ * ENETDOWN when the controller is not up, matching the other handlers in
+ * this file that send HCI commands.  Otherwise a pending command is
+ * registered and HCI_OP_INQUIRY_CANCEL is sent; the pending command is
+ * removed again if sending fails.
+ *
+ * Returns the result of cmd_status() for rejected requests, -ENOMEM, or
+ * the result of hci_send_cmd().
+ */
+static int stop_discovery(struct sock *sk, u16 index)
+{
+	struct hci_dev *hdev;
+	struct pending_cmd *cmd;
+	int err;
+
+	BT_DBG("hci%u", index);
+
+	hdev = hci_dev_get(index);
+	if (!hdev)
+		return cmd_status(sk, index, MGMT_OP_STOP_DISCOVERY, ENODEV);
+
+	hci_dev_lock_bh(hdev);
+
+	/*
+	 * A controller that is down cannot answer the cancel request, so the
+	 * pending command would never be completed.
+	 */
+	if (!test_bit(HCI_UP, &hdev->flags)) {
+		err = cmd_status(sk, index, MGMT_OP_STOP_DISCOVERY, ENETDOWN);
+		goto failed;
+	}
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, index, NULL, 0);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	err = hci_send_cmd(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL);
+	if (err < 0)
+		mgmt_pending_remove(cmd);
+
+failed:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/*
+ * Handle MGMT_OP_BLOCK_DEVICE.
+ *
+ * Passes the address from the request to hci_blacklist_add().  A negative
+ * result is reported via cmd_status() with the sign flipped; success is
+ * answered with an empty cmd_complete().
+ */
+static int block_device(struct sock *sk, u16 index, unsigned char *data,
+								u16 len)
+{
+	struct mgmt_cp_block_device *cp = (void *) data;
+	struct hci_dev *hdev;
+	int status, ret;
+
+	BT_DBG("hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_BLOCK_DEVICE,
+							EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_BLOCK_DEVICE,
+							ENODEV);
+
+	status = hci_blacklist_add(hdev, &cp->bdaddr);
+	if (status >= 0)
+		ret = cmd_complete(sk, index, MGMT_OP_BLOCK_DEVICE,
+							NULL, 0);
+	else
+		ret = cmd_status(sk, index, MGMT_OP_BLOCK_DEVICE, -status);
+
+	hci_dev_put(hdev);
+
+	return ret;
+}
+
+/*
+ * Handle MGMT_OP_UNBLOCK_DEVICE.
+ *
+ * Passes the address from the request to hci_blacklist_del().  A negative
+ * result is reported via cmd_status() with the sign flipped; success is
+ * answered with an empty cmd_complete().
+ */
+static int unblock_device(struct sock *sk, u16 index, unsigned char *data,
+								u16 len)
+{
+	struct mgmt_cp_unblock_device *cp = (void *) data;
+	struct hci_dev *hdev;
+	int status, ret;
+
+	BT_DBG("hci%u", index);
+
+	if (len != sizeof(*cp))
+		return cmd_status(sk, index, MGMT_OP_UNBLOCK_DEVICE,
+								EINVAL);
+
+	hdev = hci_dev_get(index);
+	if (hdev == NULL)
+		return cmd_status(sk, index, MGMT_OP_UNBLOCK_DEVICE,
+								ENODEV);
+
+	status = hci_blacklist_del(hdev, &cp->bdaddr);
+	if (status >= 0)
+		ret = cmd_complete(sk, index, MGMT_OP_UNBLOCK_DEVICE,
+								NULL, 0);
+	else
+		ret = cmd_status(sk, index, MGMT_OP_UNBLOCK_DEVICE, -status);
+
+	hci_dev_put(hdev);
+
+	return ret;
+}
+
+/*
+ * Entry point for a management message written to a control socket.
+ *
+ * Copies @msglen bytes from the message iovec into a kernel buffer, parses
+ * the little-endian mgmt header (opcode, controller index, payload length)
+ * and dispatches to the handler for the opcode.
+ *
+ * Returns @msglen when the handler returns a non-negative value, otherwise
+ * a negative errno: -EINVAL if the message is shorter than the header or
+ * the header length does not match the payload size, -ENOMEM, -EFAULT, or
+ * the handler's own negative result.
+ */
+int mgmt_control(struct sock *sk, struct msghdr *msg, size_t msglen)
+{
+	unsigned char *buf;
+	struct mgmt_hdr *hdr;
+	u16 opcode, index, len;
+	int err;
+
+	BT_DBG("got %zu bytes", msglen);
+
+	if (msglen < sizeof(*hdr))
+		return -EINVAL;
+
+	buf = kmalloc(msglen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (memcpy_fromiovec(buf, msg->msg_iov, msglen)) {
+		err = -EFAULT;
+		goto done;
+	}
+
+	hdr = (struct mgmt_hdr *) buf;
+	opcode = get_unaligned_le16(&hdr->opcode);
+	index = get_unaligned_le16(&hdr->index);
+	len = get_unaligned_le16(&hdr->len);
+
+	/* The declared payload length must cover exactly the rest of the message */
+	if (len != msglen - sizeof(*hdr)) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	/* Handlers that take a payload receive the bytes following the header */
+	switch (opcode) {
+	case MGMT_OP_READ_VERSION:
+		err = read_version(sk);
+		break;
+	case MGMT_OP_READ_INDEX_LIST:
+		err = read_index_list(sk);
+		break;
+	case MGMT_OP_READ_INFO:
+		err = read_controller_info(sk, index);
+		break;
+	case MGMT_OP_SET_POWERED:
+		err = set_powered(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_SET_DISCOVERABLE:
+		err = set_discoverable(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_SET_CONNECTABLE:
+		err = set_connectable(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_SET_PAIRABLE:
+		err = set_pairable(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_ADD_UUID:
+		err = add_uuid(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_REMOVE_UUID:
+		err = remove_uuid(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_SET_DEV_CLASS:
+		err = set_dev_class(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_SET_SERVICE_CACHE:
+		err = set_service_cache(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_LOAD_KEYS:
+		err = load_keys(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_REMOVE_KEY:
+		err = remove_key(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_DISCONNECT:
+		err = disconnect(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_GET_CONNECTIONS:
+		err = get_connections(sk, index);
+		break;
+	case MGMT_OP_PIN_CODE_REPLY:
+		err = pin_code_reply(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_PIN_CODE_NEG_REPLY:
+		err = pin_code_neg_reply(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_SET_IO_CAPABILITY:
+		err = set_io_capability(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_PAIR_DEVICE:
+		err = pair_device(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_USER_CONFIRM_REPLY:
+		err = user_confirm_reply(sk, index, buf + sizeof(*hdr), len, 1);
+		break;
+	case MGMT_OP_USER_CONFIRM_NEG_REPLY:
+		err = user_confirm_reply(sk, index, buf + sizeof(*hdr), len, 0);
+		break;
+	case MGMT_OP_SET_LOCAL_NAME:
+		err = set_local_name(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_READ_LOCAL_OOB_DATA:
+		err = read_local_oob_data(sk, index);
+		break;
+	case MGMT_OP_ADD_REMOTE_OOB_DATA:
+		err = add_remote_oob_data(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_REMOVE_REMOTE_OOB_DATA:
+		err = remove_remote_oob_data(sk, index, buf + sizeof(*hdr),
+									len);
+		break;
+	case MGMT_OP_START_DISCOVERY:
+		err = start_discovery(sk, index);
+		break;
+	case MGMT_OP_STOP_DISCOVERY:
+		err = stop_discovery(sk, index);
+		break;
+	case MGMT_OP_BLOCK_DEVICE:
+		err = block_device(sk, index, buf + sizeof(*hdr), len);
+		break;
+	case MGMT_OP_UNBLOCK_DEVICE:
+		err = unblock_device(sk, index, buf + sizeof(*hdr), len);
+		break;
+	default:
+		BT_DBG("Unknown op %u", opcode);
+		/* NOTE(review): 0x01 is presumably "unknown command" - confirm */
+		err = cmd_status(sk, index, opcode, 0x01);
+		break;
+	}
+
+	if (err < 0)
+		goto done;
+
+	/* Report the whole message as consumed */
+	err = msglen;
+
+done:
+	kfree(buf);
+	return err;
+}
+
+/*
+ * Emit MGMT_EV_INDEX_ADDED for controller @index with no payload.
+ * Returns the result of mgmt_event().
+ */
+int mgmt_index_added(u16 index)
+{
+	return mgmt_event(MGMT_EV_INDEX_ADDED, index, NULL, 0, NULL);
+}
+
+/*
+ * Emit MGMT_EV_INDEX_REMOVED for controller @index with no payload.
+ * Returns the result of mgmt_event().
+ */
+int mgmt_index_removed(u16 index)
+{
+	return mgmt_event(MGMT_EV_INDEX_REMOVED, index, NULL, 0, NULL);
+}
+
+/*
+ * Context handed to mode_rsp() through mgmt_pending_foreach().
+ */
+struct cmd_lookup {
+	/* mode value a pending command must carry to be answered */
+	u8 val;
+	/* first socket answered by mode_rsp(); a reference is held on it */
+	struct sock *sk;
+};
+
+/*
+ * mgmt_pending_foreach() callback for the mode setting commands.
+ *
+ * A pending command whose requested value equals the value in the
+ * struct cmd_lookup passed as @data is answered with send_mode_rsp(),
+ * unlinked and freed.  The first socket answered is stored in the lookup
+ * with an extra reference taken via sock_hold().
+ */
+static void mode_rsp(struct pending_cmd *cmd, void *data)
+{
+	struct cmd_lookup *lookup = data;
+	struct mgmt_mode *mode = cmd->param;
+
+	if (mode->val == lookup->val) {
+		send_mode_rsp(cmd->sk, cmd->opcode, cmd->index, mode->val);
+
+		list_del(&cmd->list);
+
+		if (!lookup->sk) {
+			lookup->sk = cmd->sk;
+			sock_hold(lookup->sk);
+		}
+
+		mgmt_pending_free(cmd);
+	}
+}
+
+/*
+ * Report a power state change of controller @index.
+ *
+ * Pending MGMT_OP_SET_POWERED commands that asked for @powered are answered
+ * through mode_rsp(); then MGMT_EV_POWERED is emitted with the socket
+ * recorded by mode_rsp() passed as the last mgmt_event() argument.
+ * Returns the result of mgmt_event().
+ */
+int mgmt_powered(u16 index, u8 powered)
+{
+	struct cmd_lookup lookup = { .val = powered, .sk = NULL };
+	struct mgmt_mode ev = { .val = powered };
+	int err;
+
+	mgmt_pending_foreach(MGMT_OP_SET_POWERED, index, mode_rsp, &lookup);
+
+	err = mgmt_event(MGMT_EV_POWERED, index, &ev, sizeof(ev), lookup.sk);
+
+	if (lookup.sk != NULL)
+		sock_put(lookup.sk);
+
+	return err;
+}
+
+/*
+ * Report a discoverable state change of controller @index.
+ *
+ * Pending MGMT_OP_SET_DISCOVERABLE commands that asked for @discoverable
+ * are answered through mode_rsp(); then MGMT_EV_DISCOVERABLE is emitted
+ * with the socket recorded by mode_rsp() passed as the last mgmt_event()
+ * argument.  Returns the result of mgmt_event().
+ */
+int mgmt_discoverable(u16 index, u8 discoverable)
+{
+	struct cmd_lookup lookup = { .val = discoverable, .sk = NULL };
+	struct mgmt_mode ev = { .val = discoverable };
+	int err;
+
+	mgmt_pending_foreach(MGMT_OP_SET_DISCOVERABLE, index, mode_rsp,
+								&lookup);
+
+	err = mgmt_event(MGMT_EV_DISCOVERABLE, index, &ev, sizeof(ev),
+								lookup.sk);
+
+	if (lookup.sk != NULL)
+		sock_put(lookup.sk);
+
+	return err;
+}
+
+/*
+ * Report a connectable state change of controller @index.
+ *
+ * Pending MGMT_OP_SET_CONNECTABLE commands that asked for @connectable are
+ * answered through mode_rsp(); then MGMT_EV_CONNECTABLE is emitted with the
+ * socket recorded by mode_rsp() passed as the last mgmt_event() argument.
+ * Returns the result of mgmt_event().
+ */
+int mgmt_connectable(u16 index, u8 connectable)
+{
+	struct cmd_lookup lookup = { .val = connectable, .sk = NULL };
+	struct mgmt_mode ev = { .val = connectable };
+	int err;
+
+	mgmt_pending_foreach(MGMT_OP_SET_CONNECTABLE, index, mode_rsp,
+								&lookup);
+
+	err = mgmt_event(MGMT_EV_CONNECTABLE, index, &ev, sizeof(ev),
+								lookup.sk);
+
+	if (lookup.sk != NULL)
+		sock_put(lookup.sk);
+
+	return err;
+}
+
+/*
+ * Emit MGMT_EV_NEW_KEY for a link key of controller @index.
+ *
+ * The event is the fixed header followed by key->dlen bytes of key data;
+ * @persistent is copied into the store_hint field.  The buffer is
+ * allocated with GFP_ATOMIC and freed before returning.
+ *
+ * Returns -ENOMEM if the allocation fails, otherwise the result of
+ * mgmt_event().
+ */
+int mgmt_new_key(u16 index, struct link_key *key, u8 persistent)
+{
+	struct mgmt_ev_new_key *ev;
+	int total = sizeof(*ev) + key->dlen;
+	int err;
+
+	ev = kzalloc(total, GFP_ATOMIC);
+	if (ev == NULL)
+		return -ENOMEM;
+
+	ev->store_hint = persistent;
+
+	bacpy(&ev->key.bdaddr, &key->bdaddr);
+	ev->key.type = key->type;
+	ev->key.pin_len = key->pin_len;
+	memcpy(ev->key.val, key->val, 16);
+
+	ev->key.dlen = key->dlen;
+	memcpy(ev->key.data, key->data, key->dlen);
+
+	err = mgmt_event(MGMT_EV_NEW_KEY, index, ev, total, NULL);
+
+	kfree(ev);
+
+	return err;
+}
+
+/*
+ * Emit MGMT_EV_CONNECTED for controller @index carrying the peer address
+ * @bdaddr.  Returns the result of mgmt_event().
+ */
+int mgmt_connected(u16 index, bdaddr_t *bdaddr)
+{
+	struct mgmt_ev_connected ev;
+
+	bacpy(&ev.bdaddr, bdaddr);
+
+	return mgmt_event(MGMT_EV_CONNECTED, index, &ev, sizeof(ev), NULL);
+}
+
+/*
+ * mgmt_pending_foreach() callback for pending MGMT_OP_DISCONNECT commands.
+ *
+ * Completes the command with the address taken from its own parameters and
+ * removes it.  @data points to a struct sock pointer owned by the caller:
+ * the first socket answered is stored there with a reference taken via
+ * sock_hold(), which the caller drops with a single sock_put().
+ */
+static void disconnect_rsp(struct pending_cmd *cmd, void *data)
+{
+	struct mgmt_cp_disconnect *cp = cmd->param;
+	struct sock **sk = data;
+	struct mgmt_rp_disconnect rp;
+
+	bacpy(&rp.bdaddr, &cp->bdaddr);
+
+	cmd_complete(cmd->sk, cmd->index, MGMT_OP_DISCONNECT, &rp, sizeof(rp));
+
+	/*
+	 * Hold only the first socket.  The caller releases one reference, so
+	 * taking one per pending command would leak a reference for every
+	 * command after the first.
+	 */
+	if (*sk == NULL) {
+		*sk = cmd->sk;
+		sock_hold(*sk);
+	}
+
+	mgmt_pending_remove(cmd);
+}
+
+/*
+ * Report that the connection to @bdaddr on controller @index is gone.
+ *
+ * Pending MGMT_OP_DISCONNECT commands for the index are completed through
+ * disconnect_rsp(); then MGMT_EV_DISCONNECTED is emitted with the socket
+ * recorded by disconnect_rsp() passed as the last mgmt_event() argument.
+ * Returns the result of mgmt_event().
+ */
+int mgmt_disconnected(u16 index, bdaddr_t *bdaddr)
+{
+	struct mgmt_ev_disconnected ev;
+	struct sock *answered = NULL;
+	int ret;
+
+	mgmt_pending_foreach(MGMT_OP_DISCONNECT, index, disconnect_rsp,
+								&answered);
+
+	bacpy(&ev.bdaddr, bdaddr);
+
+	ret = mgmt_event(MGMT_EV_DISCONNECTED, index, &ev, sizeof(ev),
+								answered);
+
+	if (answered != NULL)
+		sock_put(answered);
+
+	return ret;
+}
+
+/*
+ * Fail the pending MGMT_OP_DISCONNECT command of controller @index with
+ * EIO and remove it.
+ *
+ * Returns -ENOENT when no such command is pending, otherwise the result of
+ * cmd_status().
+ */
+int mgmt_disconnect_failed(u16 index)
+{
+	struct pending_cmd *pending;
+	int ret;
+
+	pending = mgmt_pending_find(MGMT_OP_DISCONNECT, index);
+	if (pending == NULL)
+		return -ENOENT;
+
+	ret = cmd_status(pending->sk, index, MGMT_OP_DISCONNECT, EIO);
+	mgmt_pending_remove(pending);
+
+	return ret;
+}
+
+/*
+ * Emit MGMT_EV_CONNECT_FAILED for controller @index carrying the peer
+ * address @bdaddr and @status.  Returns the result of mgmt_event().
+ */
+int mgmt_connect_failed(u16 index, bdaddr_t *bdaddr, u8 status)
+{
+	struct mgmt_ev_connect_failed ev;
+
+	bacpy(&ev.bdaddr, bdaddr);
+	ev.status = status;
+
+	return mgmt_event(MGMT_EV_CONNECT_FAILED, index, &ev, sizeof(ev), NULL);
+}
+
+/*
+ * Emit MGMT_EV_PIN_CODE_REQUEST for controller @index carrying the peer
+ * address @bdaddr and the @secure value.  Returns the result of
+ * mgmt_event().
+ */
+int mgmt_pin_code_request(u16 index, bdaddr_t *bdaddr, u8 secure)
+{
+	struct mgmt_ev_pin_code_request ev;
+
+	bacpy(&ev.bdaddr, bdaddr);
+	ev.secure = secure;
+
+	return mgmt_event(MGMT_EV_PIN_CODE_REQUEST, index, &ev, sizeof(ev),
+									NULL);
+}
+
+/*
+ * Complete the pending MGMT_OP_PIN_CODE_REPLY command of controller @index
+ * with @bdaddr and @status, then remove it.
+ *
+ * Returns -ENOENT when no such command is pending, otherwise the result of
+ * cmd_complete().
+ */
+int mgmt_pin_code_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status)
+{
+	struct mgmt_rp_pin_code_reply reply;
+	struct pending_cmd *pending;
+	int ret;
+
+	pending = mgmt_pending_find(MGMT_OP_PIN_CODE_REPLY, index);
+	if (pending == NULL)
+		return -ENOENT;
+
+	reply.status = status;
+	bacpy(&reply.bdaddr, bdaddr);
+
+	ret = cmd_complete(pending->sk, index, MGMT_OP_PIN_CODE_REPLY, &reply,
+								sizeof(reply));
+	mgmt_pending_remove(pending);
+
+	return ret;
+}
+
+/*
+ * Complete the pending MGMT_OP_PIN_CODE_NEG_REPLY command of controller
+ * @index with @bdaddr and @status, then remove it.  The reply uses the
+ * same layout as the positive PIN code reply.
+ *
+ * Returns -ENOENT when no such command is pending, otherwise the result of
+ * cmd_complete().
+ */
+int mgmt_pin_code_neg_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status)
+{
+	struct mgmt_rp_pin_code_reply reply;
+	struct pending_cmd *pending;
+	int ret;
+
+	pending = mgmt_pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, index);
+	if (pending == NULL)
+		return -ENOENT;
+
+	reply.status = status;
+	bacpy(&reply.bdaddr, bdaddr);
+
+	ret = cmd_complete(pending->sk, index, MGMT_OP_PIN_CODE_NEG_REPLY,
+							&reply, sizeof(reply));
+	mgmt_pending_remove(pending);
+
+	return ret;
+}
+
+/*
+ * Emit MGMT_EV_USER_CONFIRM_REQUEST for controller @index.
+ *
+ * @bdaddr:       peer address copied into the event
+ * @value:        value to confirm, already in little-endian byte order
+ * @confirm_hint: copied unchanged into the event
+ *
+ * Returns the result of mgmt_event().
+ */
+int mgmt_user_confirm_request(u16 index, bdaddr_t *bdaddr, __le32 value,
+							u8 confirm_hint)
+{
+	struct mgmt_ev_user_confirm_request ev;
+
+	BT_DBG("hci%u", index);
+
+	bacpy(&ev.bdaddr, bdaddr);
+	ev.confirm_hint = confirm_hint;
+	/*
+	 * @value is already little-endian, so its bytes are copied as they
+	 * are.  put_unaligned_le32() takes a CPU-order value and would swap
+	 * the bytes a second time on big-endian hosts.
+	 */
+	memcpy(&ev.value, &value, sizeof(value));
+
+	return mgmt_event(MGMT_EV_USER_CONFIRM_REQUEST, index, &ev, sizeof(ev),
+									NULL);
+}
+
+/*
+ * Shared completion for the user confirm (negative) reply commands.
+ *
+ * Completes the pending command with @opcode of controller @index using
+ * @bdaddr and @status, then removes it.
+ *
+ * Returns -ENOENT when no such command is pending, otherwise the result of
+ * cmd_complete().
+ */
+static int confirm_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status,
+								u8 opcode)
+{
+	struct mgmt_rp_user_confirm_reply reply;
+	struct pending_cmd *pending;
+	int ret;
+
+	pending = mgmt_pending_find(opcode, index);
+	if (pending == NULL)
+		return -ENOENT;
+
+	reply.status = status;
+	bacpy(&reply.bdaddr, bdaddr);
+
+	ret = cmd_complete(pending->sk, index, opcode, &reply, sizeof(reply));
+	mgmt_pending_remove(pending);
+
+	return ret;
+}
+
+/*
+ * Complete the pending MGMT_OP_USER_CONFIRM_REPLY command; see
+ * confirm_reply_complete() for the return value.
+ */
+int mgmt_user_confirm_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status)
+{
+	return confirm_reply_complete(index, bdaddr, status,
+						MGMT_OP_USER_CONFIRM_REPLY);
+}
+
+/*
+ * Complete the pending MGMT_OP_USER_CONFIRM_NEG_REPLY command; see
+ * confirm_reply_complete() for the return value.
+ */
+int mgmt_user_confirm_neg_reply_complete(u16 index, bdaddr_t *bdaddr, u8 status)
+{
+	return confirm_reply_complete(index, bdaddr, status,
+					MGMT_OP_USER_CONFIRM_NEG_REPLY);
+}
+
+/*
+ * Emit MGMT_EV_AUTH_FAILED for controller @index carrying the peer address
+ * @bdaddr and @status.  Returns the result of mgmt_event().
+ */
+int mgmt_auth_failed(u16 index, bdaddr_t *bdaddr, u8 status)
+{
+	struct mgmt_ev_auth_failed ev;
+
+	bacpy(&ev.bdaddr, bdaddr);
+	ev.status = status;
+
+	return mgmt_event(MGMT_EV_AUTH_FAILED, index, &ev, sizeof(ev), NULL);
+}
+
+/*
+ * Called when a local name write has finished on controller @index.
+ *
+ * @name:   new name; HCI_MAX_NAME_LENGTH bytes are copied from it
+ * @status: non-zero means the write failed
+ *
+ * Without a pending MGMT_OP_SET_LOCAL_NAME command only the
+ * MGMT_EV_LOCAL_NAME_CHANGED event is sent.  With a pending command, a
+ * failure is answered with EIO and no event; on success update_eir() is
+ * run, the command is completed and the event is sent with the requesting
+ * socket passed as the last mgmt_event() argument.  The pending command is
+ * removed on every path that found one.
+ *
+ * Returns the result of the last cmd_status(), cmd_complete() or
+ * mgmt_event() call made.
+ */
+int mgmt_set_local_name_complete(u16 index, u8 *name, u8 status)
+{
+	struct pending_cmd *cmd;
+	struct hci_dev *hdev;
+	struct mgmt_cp_set_local_name ev;
+	int err;
+
+	memset(&ev, 0, sizeof(ev));
+	memcpy(ev.name, name, HCI_MAX_NAME_LENGTH);
+
+	cmd = mgmt_pending_find(MGMT_OP_SET_LOCAL_NAME, index);
+	if (!cmd)
+		goto send_event;
+
+	if (status) {
+		err = cmd_status(cmd->sk, index, MGMT_OP_SET_LOCAL_NAME, EIO);
+		goto failed;
+	}
+
+	/* A controller that has gone away in the meantime is skipped silently */
+	hdev = hci_dev_get(index);
+	if (hdev) {
+		hci_dev_lock_bh(hdev);
+		update_eir(hdev);
+		hci_dev_unlock_bh(hdev);
+		hci_dev_put(hdev);
+	}
+
+	err = cmd_complete(cmd->sk, index, MGMT_OP_SET_LOCAL_NAME, &ev,
+								sizeof(ev));
+	if (err < 0)
+		goto failed;
+
+send_event:
+	err = mgmt_event(MGMT_EV_LOCAL_NAME_CHANGED, index, &ev, sizeof(ev),
+							cmd ? cmd->sk : NULL);
+
+failed:
+	if (cmd)
+		mgmt_pending_remove(cmd);
+	return err;
+}
+
+/*
+ * Finish the pending MGMT_OP_READ_LOCAL_OOB_DATA command of controller
+ * @index.
+ *
+ * A zero @status completes the command with @hash and @randomizer; any
+ * other value answers it with EIO.  The pending command is removed in both
+ * cases.
+ *
+ * Returns -ENOENT when no such command is pending, otherwise the result of
+ * cmd_complete() or cmd_status().
+ */
+int mgmt_read_local_oob_data_reply_complete(u16 index, u8 *hash, u8 *randomizer,
+								u8 status)
+{
+	struct pending_cmd *pending;
+	int ret;
+
+	BT_DBG("hci%u status %u", index, status);
+
+	pending = mgmt_pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, index);
+	if (pending == NULL)
+		return -ENOENT;
+
+	if (!status) {
+		struct mgmt_rp_read_local_oob_data reply;
+
+		memcpy(reply.hash, hash, sizeof(reply.hash));
+		memcpy(reply.randomizer, randomizer, sizeof(reply.randomizer));
+
+		ret = cmd_complete(pending->sk, index,
+					MGMT_OP_READ_LOCAL_OOB_DATA,
+					&reply, sizeof(reply));
+	} else {
+		ret = cmd_status(pending->sk, index,
+					MGMT_OP_READ_LOCAL_OOB_DATA, EIO);
+	}
+
+	mgmt_pending_remove(pending);
+
+	return ret;
+}
+
+/*
+ * Emit MGMT_EV_DEVICE_FOUND for controller @index.
+ *
+ * @bdaddr:    address of the device that was found
+ * @dev_class: class of device; sizeof(ev.dev_class) bytes are copied
+ * @rssi:      signal strength copied into the event
+ * @eir:       EIR data or NULL; when given, sizeof(ev.eir) bytes are copied,
+ *             so the buffer must be at least that large
+ *
+ * Returns the result of mgmt_event().
+ */
+int mgmt_device_found(u16 index, bdaddr_t *bdaddr, u8 *dev_class, s8 rssi,
+								u8 *eir)
+{
+	struct mgmt_ev_device_found ev;
+
+	/* Zeroing first leaves the EIR field empty when @eir is NULL */
+	memset(&ev, 0, sizeof(ev));
+
+	bacpy(&ev.bdaddr, bdaddr);
+	memcpy(ev.dev_class, dev_class, sizeof(ev.dev_class));
+	ev.rssi = rssi;
+
+	if (eir)
+		memcpy(ev.eir, eir, sizeof(ev.eir));
+
+	return mgmt_event(MGMT_EV_DEVICE_FOUND, index, &ev, sizeof(ev), NULL);
+}
+
+/*
+ * Emit MGMT_EV_REMOTE_NAME for controller @index carrying the peer address
+ * @bdaddr and its name.  HCI_MAX_NAME_LENGTH bytes are copied from @name,
+ * so the buffer must be at least that large.  Returns the result of
+ * mgmt_event().
+ */
+int mgmt_remote_name(u16 index, bdaddr_t *bdaddr, u8 *name)
+{
+	struct mgmt_ev_remote_name ev;
+
+	memset(&ev, 0, sizeof(ev));
+
+	bacpy(&ev.bdaddr, bdaddr);
+	memcpy(ev.name, name, HCI_MAX_NAME_LENGTH);
+
+	return mgmt_event(MGMT_EV_REMOTE_NAME, index, &ev, sizeof(ev), NULL);
+}
+
+/*
+ * Emit MGMT_EV_DISCOVERING for controller @index; the one-byte payload is
+ * the @discovering value.  Returns the result of mgmt_event().
+ */
+int mgmt_discovering(u16 index, u8 discovering)
+{
+	return mgmt_event(MGMT_EV_DISCOVERING, index, &discovering,
+						sizeof(discovering), NULL);
+}
diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig
new file mode 100644
index 00000000..405a0e61
--- /dev/null
+++ b/net/bluetooth/rfcomm/Kconfig
@@ -0,0 +1,17 @@
+config BT_RFCOMM
+	tristate "RFCOMM protocol support"
+	depends on BT && BT_L2CAP
+	help
+	  RFCOMM provides connection oriented stream transport.  RFCOMM
+	  support is required for Dialup Networking, OBEX and other Bluetooth
+	  applications.
+
+	  Say Y here to compile RFCOMM support into the kernel or say M to
+	  compile it as module (rfcomm).
+
+config BT_RFCOMM_TTY
+	bool "RFCOMM TTY support"
+	depends on BT_RFCOMM
+	help
+	  This option enables TTY emulation support for RFCOMM channels.
+
diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile
new file mode 100644
index 00000000..fe07988a
--- /dev/null
+++ b/net/bluetooth/rfcomm/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux Bluetooth RFCOMM layer.
+#
+
+obj-$(CONFIG_BT_RFCOMM) += rfcomm.o
+
+rfcomm-y			:= core.o sock.o
+rfcomm-$(CONFIG_BT_RFCOMM_TTY)	+= tty.o
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
new file mode 100644
index 00000000..c2486a53
--- /dev/null
+++ b/net/bluetooth/rfcomm/core.c
@@ -0,0 +1,2227 @@
+/*
+   RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+   Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * Bluetooth RFCOMM core.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/signal.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/net.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/rfcomm.h>
+
+#define VERSION "1.11"
+
+static int disable_cfc;
+static int l2cap_ertm;
+static int channel_mtu = -1;
+static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU;
+
+static struct task_struct *rfcomm_thread;
+
+static DEFINE_MUTEX(rfcomm_mutex);
+#define rfcomm_lock()	mutex_lock(&rfcomm_mutex)
+#define rfcomm_unlock()	mutex_unlock(&rfcomm_mutex)
+
+
+static LIST_HEAD(session_list);
+
+static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len);
+static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci);
+static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci);
+static int rfcomm_queue_disc(struct rfcomm_dlc *d);
+static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type);
+static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d);
+static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig);
+static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len);
+static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits);
+static void rfcomm_make_uih(struct sk_buff *skb, u8 addr);
+
+static void rfcomm_process_connect(struct rfcomm_session *s);
+
+static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
+							bdaddr_t *dst,
+							u8 sec_level,
+							int *err);
+static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst);
+static void rfcomm_session_del(struct rfcomm_session *s);
+
+/* ---- RFCOMM frame parsing macros ---- */
+/*
+ * Every macro argument is parenthesized so that an expression passed as an
+ * argument keeps its meaning after expansion.
+ */
+#define __get_dlci(b)     (((b) & 0xfc) >> 2)
+#define __get_channel(b)  (((b) & 0xf8) >> 3)
+#define __get_dir(b)      (((b) & 0x04) >> 2)
+#define __get_type(b)     (((b) & 0xef))
+
+#define __test_ea(b)      (((b) & 0x01))
+#define __test_cr(b)      (((b) & 0x02))
+#define __test_pf(b)      (((b) & 0x10))
+
+#define __addr(cr, dlci)       ((((dlci) & 0x3f) << 2) | ((cr) << 1) | 0x01)
+#define __ctrl(type, pf)       ((((type) & 0xef) | ((pf) << 4)))
+#define __dlci(dir, chn)       ((((chn) & 0x1f) << 1) | (dir))
+#define __srv_channel(dlci)    ((dlci) >> 1)
+#define __dir(dlci)            ((dlci) & 0x01)
+
+#define __len8(len)       (((len) << 1) | 1)
+#define __len16(len)      ((len) << 1)
+
+/* MCC macros */
+#define __mcc_type(cr, type)   ((((type) << 2) | ((cr) << 1) | 0x01))
+#define __get_mcc_type(b) (((b) & 0xfc) >> 2)
+#define __get_mcc_len(b)  (((b) & 0xfe) >> 1)
+
+/* RPN macros */
+#define __rpn_line_settings(data, stop, parity)  (((data) & 0x3) | (((stop) & 0x1) << 2) | (((parity) & 0x7) << 3))
+#define __get_rpn_data_bits(line) ((line) & 0x3)
+#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
+#define __get_rpn_parity(line)    (((line) >> 3) & 0x7)
+
+/* Wake the RFCOMM kernel thread; does nothing while no thread is set. */
+static inline void rfcomm_schedule(void)
+{
+	if (rfcomm_thread)
+		wake_up_process(rfcomm_thread);
+}
+
+/*
+ * Drop one reference on session @s; the session is deleted through
+ * rfcomm_session_del() when the count reaches zero.
+ */
+static inline void rfcomm_session_put(struct rfcomm_session *s)
+{
+	if (atomic_dec_and_test(&s->refcnt))
+		rfcomm_session_del(s);
+}
+
+/* ---- RFCOMM FCS computation ---- */
+
+/*
+ * reversed, 8-bit, poly=0x07
+ *
+ * Lookup table for the FCS helpers.  It is only ever read, so it is
+ * declared const.
+ */
+static const unsigned char rfcomm_crc_table[256] = {
+	0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75,
+	0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b,
+	0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69,
+	0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67,
+
+	0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d,
+	0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43,
+	0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51,
+	0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f,
+
+	0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05,
+	0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b,
+	0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19,
+	0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17,
+
+	0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d,
+	0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33,
+	0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21,
+	0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f,
+
+	0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95,
+	0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b,
+	0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89,
+	0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87,
+
+	0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad,
+	0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3,
+	0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1,
+	0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf,
+
+	0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5,
+	0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb,
+	0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9,
+	0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7,
+
+	0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd,
+	0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3,
+	0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1,
+	0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf
+};
+
+/* CRC on 2 bytes; the argument is parenthesized so any pointer expression works */
+#define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ (data)[0]] ^ (data)[1]])
+
+/* FCS on 2 bytes: 0xff minus the CRC of data[0] and data[1] */
+static inline u8 __fcs(u8 *data)
+{
+	u8 crc = __crc(data);
+
+	return 0xff - crc;
+}
+
+/* FCS on 3 bytes: the 2-byte CRC extended by data[2], subtracted from 0xff */
+static inline u8 __fcs2(u8 *data)
+{
+	u8 crc = rfcomm_crc_table[__crc(data) ^ data[2]];
+
+	return 0xff - crc;
+}
+
+/*
+ * Check FCS
+ *
+ * The CRC covers the first two bytes of @data for RFCOMM_UIH frames and
+ * the first three bytes for every other @type.  Returns 0 when folding
+ * @fcs into the CRC yields 0xcf, non-zero otherwise.
+ */
+static inline int __check_fcs(u8 *data, int type, u8 fcs)
+{
+	u8 crc = (type == RFCOMM_UIH) ? __crc(data) :
+				rfcomm_crc_table[__crc(data) ^ data[2]];
+
+	return rfcomm_crc_table[crc ^ fcs] != 0xcf;
+}
+
+/* ---- L2CAP callbacks ---- */
+/*
+ * sk_state_change hook installed by rfcomm_l2sock_create(): logs the socket
+ * state and wakes the RFCOMM thread.
+ */
+static void rfcomm_l2state_change(struct sock *sk)
+{
+	BT_DBG("%p state %d", sk, sk->sk_state);
+	rfcomm_schedule();
+}
+
+/*
+ * sk_data_ready hook installed by rfcomm_l2sock_create(): logs the byte
+ * count and wakes the RFCOMM thread.
+ */
+static void rfcomm_l2data_ready(struct sock *sk, int bytes)
+{
+	BT_DBG("%p bytes %d", sk, bytes);
+	rfcomm_schedule();
+}
+
+/*
+ * Create a kernel L2CAP SOCK_SEQPACKET socket in *@sock and point its
+ * data-ready and state-change hooks at the RFCOMM handlers.
+ *
+ * Returns 0 on success or the error from sock_create_kern(), in which case
+ * no hooks are installed.
+ */
+static int rfcomm_l2sock_create(struct socket **sock)
+{
+	struct sock *sk;
+	int err;
+
+	BT_DBG("");
+
+	err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
+	if (err)
+		return err;
+
+	sk = (*sock)->sk;
+	sk->sk_data_ready   = rfcomm_l2data_ready;
+	sk->sk_state_change = rfcomm_l2state_change;
+
+	return err;
+}
+
+/*
+ * Run hci_conn_security() on the HCI connection underneath DLC @d.
+ *
+ * The authentication type follows the DLC security level: general bonding
+ * with MITM for BT_SECURITY_HIGH, general bonding for BT_SECURITY_MEDIUM
+ * and no bonding for everything else.
+ *
+ * Returns the result of hci_conn_security().
+ */
+static inline int rfcomm_check_security(struct rfcomm_dlc *d)
+{
+	struct sock *sk = d->session->sock->sk;
+	struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
+	__u8 auth_type;
+
+	if (d->sec_level == BT_SECURITY_HIGH)
+		auth_type = HCI_AT_GENERAL_BONDING_MITM;
+	else if (d->sec_level == BT_SECURITY_MEDIUM)
+		auth_type = HCI_AT_GENERAL_BONDING;
+	else
+		auth_type = HCI_AT_NO_BONDING;
+
+	return hci_conn_security(conn->hcon, d->sec_level, auth_type);
+}
+
+/*
+ * Session timer callback; @arg is the struct rfcomm_session pointer.
+ *
+ * Only marks the session as timed out and wakes the RFCOMM thread.  The
+ * reference taken by rfcomm_session_set_timer() is not dropped here.
+ * NOTE(review): presumably the thread releases it while handling
+ * RFCOMM_TIMED_OUT - confirm.
+ */
+static void rfcomm_session_timeout(unsigned long arg)
+{
+	struct rfcomm_session *s = (void *) arg;
+
+	BT_DBG("session %p state %ld", s, s->state);
+
+	set_bit(RFCOMM_TIMED_OUT, &s->flags);
+	rfcomm_schedule();
+}
+
+/* (Re)arm the session timer to fire "timeout" jiffies from now.  A session
+ * reference is taken only when the timer was not already pending, so each
+ * armed timer owns exactly one reference. */
+static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout)
+{
+	int was_pending;
+
+	BT_DBG("session %p state %ld timeout %ld", s, s->state, timeout);
+
+	was_pending = mod_timer(&s->timer, jiffies + timeout);
+	if (was_pending)
+		return;
+
+	rfcomm_session_hold(s);
+}
+
+/* Cancel the session timer.  If a pending timer was actually removed, drop
+ * the session reference that was taken when it was armed. */
+static void rfcomm_session_clear_timer(struct rfcomm_session *s)
+{
+	BT_DBG("session %p state %ld", s, s->state);
+
+	if (!timer_pending(&s->timer))
+		return;
+
+	if (del_timer(&s->timer))
+		rfcomm_session_put(s);
+}
+
+/* ---- RFCOMM DLCs ---- */
+/* DLC timer callback.  arg is the struct rfcomm_dlc pointer registered with
+ * setup_timer() in rfcomm_dlc_alloc().  Marks the DLC RFCOMM_TIMED_OUT,
+ * drops the reference taken when the timer was armed (see
+ * rfcomm_dlc_set_timer) and calls rfcomm_schedule(). */
+static void rfcomm_dlc_timeout(unsigned long arg)
+{
+	struct rfcomm_dlc *d = (void *) arg;
+
+	BT_DBG("dlc %p state %ld", d, d->state);
+
+	set_bit(RFCOMM_TIMED_OUT, &d->flags);
+	rfcomm_dlc_put(d);
+	rfcomm_schedule();
+}
+
+/* (Re)arm the DLC timer to fire "timeout" jiffies from now.  A DLC reference
+ * is taken only when the timer was not already pending; the timer callback
+ * or rfcomm_dlc_clear_timer() gives it back. */
+static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout)
+{
+	int was_pending;
+
+	BT_DBG("dlc %p state %ld timeout %ld", d, d->state, timeout);
+
+	was_pending = mod_timer(&d->timer, jiffies + timeout);
+	if (was_pending)
+		return;
+
+	rfcomm_dlc_hold(d);
+}
+
+/* Cancel the DLC timer.  If a pending timer was actually removed, drop the
+ * DLC reference that was taken when it was armed. */
+static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d)
+{
+	BT_DBG("dlc %p state %ld", d, d->state);
+
+	if (!timer_pending(&d->timer))
+		return;
+
+	if (del_timer(&d->timer))
+		rfcomm_dlc_put(d);
+}
+
+/* Reset the per-connection fields of a DLC to their defaults.  Only the
+ * fields assigned below are touched. */
+static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
+{
+	BT_DBG("%p", d);
+
+	/* Connection state */
+	d->state = BT_OPEN;
+	d->flags = 0;
+	d->mscex = 0;
+
+	/* Negotiable parameters */
+	d->sec_level = BT_SECURITY_LOW;
+	d->mtu = RFCOMM_DEFAULT_MTU;
+	d->v24_sig = RFCOMM_V24_RTC | RFCOMM_V24_RTR | RFCOMM_V24_DV;
+
+	/* Flow control */
+	d->cfc = RFCOMM_CFC_DISABLED;
+	d->rx_credits = RFCOMM_DEFAULT_CREDITS;
+}
+
+/* Allocate a zeroed DLC with allocation flags "prio".  The new DLC has its
+ * timer, tx queue and lock initialised, a reference count of 1 and default
+ * state.  Returns NULL if the allocation fails. */
+struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
+{
+	struct rfcomm_dlc *d;
+
+	d = kzalloc(sizeof(*d), prio);
+	if (!d)
+		return NULL;
+
+	atomic_set(&d->refcnt, 1);
+	spin_lock_init(&d->lock);
+	skb_queue_head_init(&d->tx_queue);
+	setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d);
+
+	rfcomm_dlc_clear_state(d);
+
+	BT_DBG("%p", d);
+
+	return d;
+}
+
+/* Release a DLC: drop every skb still on its tx queue, then free the
+ * structure.  Does not touch the timer or the session link. */
+void rfcomm_dlc_free(struct rfcomm_dlc *d)
+{
+	BT_DBG("%p", d);
+
+	skb_queue_purge(&d->tx_queue);
+	kfree(d);
+}
+
+/* Attach DLC d to session s: takes a reference on both, cancels the session
+ * timer, adds d to s->dlcs and records s in d->session. */
+static void rfcomm_dlc_link(struct rfcomm_session *s, struct rfcomm_dlc *d)
+{
+	BT_DBG("dlc %p session %p", d, s);
+
+	/* Hold first: clearing the timer below may drop a session reference */
+	rfcomm_session_hold(s);
+
+	rfcomm_session_clear_timer(s);
+	rfcomm_dlc_hold(d);
+	list_add(&d->list, &s->dlcs);
+	d->session = s;
+}
+
+/* Detach DLC d from its session: removes it from the session's list, clears
+ * d->session and drops one DLC reference.  If that leaves the session with
+ * no DLCs the idle timer is armed.  Finally one session reference is
+ * dropped. */
+static void rfcomm_dlc_unlink(struct rfcomm_dlc *d)
+{
+	/* Cached up front: d->session is cleared below */
+	struct rfcomm_session *s = d->session;
+
+	BT_DBG("dlc %p refcnt %d session %p", d, atomic_read(&d->refcnt), s);
+
+	list_del(&d->list);
+	d->session = NULL;
+	rfcomm_dlc_put(d);
+
+	if (list_empty(&s->dlcs))
+		rfcomm_session_set_timer(s, RFCOMM_IDLE_TIMEOUT);
+
+	rfcomm_session_put(s);
+}
+
+/* Look up the DLC with the given dlci on session s.
+ * Returns the first match, or NULL if there is none.  No reference is
+ * taken on the returned DLC. */
+static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_dlc *d;
+
+	list_for_each_entry(d, &s->dlcs, list)
+		if (d->dlci == dlci)
+			return d;
+
+	return NULL;
+}
+
+/* Start opening DLC d towards dst on the given server channel.
+ *
+ * Reuses an existing session for (src, dst) or creates one, links d to it
+ * in BT_CONFIG state and arms the connect timeout.  If the session is
+ * already connected, PN negotiation starts immediately, or is deferred
+ * until authentication completes.
+ *
+ * Returns 0 on success (also when d is in neither BT_OPEN nor BT_CLOSED, in
+ * which case nothing is done), -EINVAL for a channel outside 1..30, -EBUSY
+ * if the DLCI is already in use on the session, or the error reported by
+ * rfcomm_session_create(). */
+static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel)
+{
+	struct rfcomm_session *s;
+	int err = 0;
+	u8 dlci;
+
+	BT_DBG("dlc %p state %ld %s %s channel %d",
+			d, d->state, batostr(src), batostr(dst), channel);
+
+	if (channel < 1 || channel > 30)
+		return -EINVAL;
+
+	if (d->state != BT_OPEN && d->state != BT_CLOSED)
+		return 0;
+
+	s = rfcomm_session_get(src, dst);
+	if (!s) {
+		s = rfcomm_session_create(src, dst, d->sec_level, &err);
+		if (!s)
+			return err;
+	}
+
+	dlci = __dlci(!s->initiator, channel);
+
+	/* Check if DLCI already exists */
+	if (rfcomm_dlc_get(s, dlci))
+		return -EBUSY;
+
+	rfcomm_dlc_clear_state(d);
+
+	d->dlci     = dlci;
+	d->addr     = __addr(s->initiator, dlci);
+	d->priority = 7;
+
+	d->state = BT_CONFIG;
+	rfcomm_dlc_link(s, d);
+
+	d->out = 1;
+
+	/* Start from the session's values; an unknown CFC state counts as off */
+	d->mtu = s->mtu;
+	d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc;
+
+	/* On a session that is not connected yet, rfcomm_process_connect()
+	 * sends the PN once it is */
+	if (s->state == BT_CONNECTED) {
+		if (rfcomm_check_security(d))
+			rfcomm_send_pn(s, 1, d);
+		else
+			set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+	}
+
+	rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
+
+	return 0;
+}
+
+/* Locked wrapper: runs __rfcomm_dlc_open() between rfcomm_lock() and
+ * rfcomm_unlock() and returns its result unchanged. */
+int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel)
+{
+	int err;
+
+	rfcomm_lock();
+	err = __rfcomm_dlc_open(d, src, dst, channel);
+	rfcomm_unlock();
+
+	return err;
+}
+
+/* Close DLC d, reporting err to its owner through d->state_change().
+ *
+ * What happens depends on d->state:
+ *  - BT_CONNECT / BT_OPEN / BT_CONNECT2 with RFCOMM_DEFER_SETUP set: the
+ *    flag is cleared, RFCOMM_AUTH_REJECT is set and rfcomm_schedule() is
+ *    called; nothing else is done here.
+ *  - BT_CONNECT / BT_CONNECTED: the DLC moves to BT_DISCONN and a DISC is
+ *    sent now, or queued behind any data still waiting in tx_queue.
+ *  - anything else: the DLC is torn down immediately and unlinked from its
+ *    session.
+ *
+ * Always returns 0; a DLC without a session is left untouched. */
+static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
+{
+	struct rfcomm_session *s = d->session;
+	if (!s)
+		return 0;
+
+	BT_DBG("dlc %p state %ld dlci %d err %d session %p",
+			d, d->state, d->dlci, err, s);
+
+	switch (d->state) {
+	case BT_CONNECT:
+		if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+			set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+			rfcomm_schedule();
+			break;
+		}
+		/* Fall through */
+
+	case BT_CONNECTED:
+		d->state = BT_DISCONN;
+		if (skb_queue_empty(&d->tx_queue)) {
+			rfcomm_send_disc(s, d->dlci);
+			rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT);
+		} else {
+			/* Queue the DISC behind pending data and allow twice
+			 * the usual timeout */
+			rfcomm_queue_disc(d);
+			rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2);
+		}
+		break;
+
+	case BT_OPEN:
+	case BT_CONNECT2:
+		if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+			set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+			rfcomm_schedule();
+			break;
+		}
+		/* Fall through */
+
+	default:
+		rfcomm_dlc_clear_timer(d);
+
+		rfcomm_dlc_lock(d);
+		d->state = BT_CLOSED;
+		d->state_change(d, err);
+		rfcomm_dlc_unlock(d);
+
+		skb_queue_purge(&d->tx_queue);
+		rfcomm_dlc_unlink(d);
+	}
+
+	return 0;
+}
+
+/* Locked wrapper: runs __rfcomm_dlc_close() between rfcomm_lock() and
+ * rfcomm_unlock() and returns its result unchanged. */
+int rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
+{
+	int ret;
+
+	rfcomm_lock();
+	ret = __rfcomm_dlc_close(d, err);
+	rfcomm_unlock();
+
+	return ret;
+}
+
+/* Queue skb for transmission on DLC d.
+ * The payload is wrapped into a UIH frame in place and appended to the
+ * DLC's tx queue; rfcomm_schedule() is called unless the DLC is TX
+ * throttled.
+ * Returns the payload length on success, -ENOTCONN if the DLC is not
+ * connected, or -EINVAL if the payload exceeds the DLC's MTU.  On error the
+ * skb is neither modified nor queued. */
+int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+	/* Sampled before rfcomm_make_uih() grows the skb */
+	int len = skb->len;
+
+	if (d->state != BT_CONNECTED)
+		return -ENOTCONN;
+
+	BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
+
+	if (len > d->mtu)
+		return -EINVAL;
+
+	rfcomm_make_uih(skb, d->addr);
+	skb_queue_tail(&d->tx_queue, skb);
+
+	if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+		rfcomm_schedule();
+	return len;
+}
+
+/* Throttle reception on DLC d.  Without credit based flow control this sets
+ * the FC bit in the local V.24 signals and flags an MSC as pending; with it
+ * nothing is changed here.  rfcomm_schedule() is called in both cases. */
+void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
+{
+	BT_DBG("dlc %p state %ld", d, d->state);
+
+	if (!d->cfc) {
+		d->v24_sig |= RFCOMM_V24_FC;
+		set_bit(RFCOMM_MSC_PENDING, &d->flags);
+	}
+	rfcomm_schedule();
+}
+
+/* Resume reception on DLC d.  Without credit based flow control this clears
+ * the FC bit in the local V.24 signals and flags an MSC as pending; with it
+ * nothing is changed here.  rfcomm_schedule() is called in both cases. */
+void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
+{
+	BT_DBG("dlc %p state %ld", d, d->state);
+
+	if (!d->cfc) {
+		d->v24_sig &= ~RFCOMM_V24_FC;
+		set_bit(RFCOMM_MSC_PENDING, &d->flags);
+	}
+	rfcomm_schedule();
+}
+
+/*
+   Set/get modem status functions use _local_ status i.e. what we report
+   to the other side.
+   Remote status is provided by dlc->modem_status() callback.
+ */
+/* Store v24_sig as the DLC's local modem status and flag an MSC as pending.
+ * The caller's FC bit is ignored: it is forced on when the DLC is RX
+ * throttled and off otherwise.  Always returns 0. */
+int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig)
+{
+	BT_DBG("dlc %p state %ld v24_sig 0x%x",
+			d, d->state, v24_sig);
+
+	if (test_bit(RFCOMM_RX_THROTTLED, &d->flags))
+		v24_sig |= RFCOMM_V24_FC;
+	else
+		v24_sig &= ~RFCOMM_V24_FC;
+
+	d->v24_sig = v24_sig;
+
+	/* Only call rfcomm_schedule() if no MSC was pending already */
+	if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags))
+		rfcomm_schedule();
+
+	return 0;
+}
+
+/* Copy the DLC's local modem status into *v24_sig.  Always returns 0. */
+int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig)
+{
+	BT_DBG("dlc %p state %ld v24_sig 0x%x",
+			d, d->state, d->v24_sig);
+
+	*v24_sig = d->v24_sig;
+	return 0;
+}
+
+/* ---- RFCOMM sessions ---- */
+/* Allocate a session around the given L2CAP socket, initialise it in the
+ * given state and put it on session_list.
+ * Returns the new session, or NULL if the allocation fails or (for
+ * non-listening sessions) a module reference cannot be taken. */
+static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state)
+{
+	struct rfcomm_session *s;
+
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return NULL;
+
+	BT_DBG("session %p sock %p", s, sock);
+
+	setup_timer(&s->timer, rfcomm_session_timeout, (unsigned long) s);
+	INIT_LIST_HEAD(&s->dlcs);
+
+	s->sock  = sock;
+	s->state = state;
+	s->mtu   = RFCOMM_DEFAULT_MTU;
+
+	if (disable_cfc)
+		s->cfc = RFCOMM_CFC_DISABLED;
+	else
+		s->cfc = RFCOMM_CFC_UNKNOWN;
+
+	/* Listening sessions must not pin the module, or it could never be
+	 * unloaded. */
+	if (state != BT_LISTEN && !try_module_get(THIS_MODULE)) {
+		kfree(s);
+		return NULL;
+	}
+
+	list_add(&s->list, &session_list);
+
+	return s;
+}
+
+/* Destroy a session: take it off session_list, send a DISC on the control
+ * channel if it was connected, cancel its timer, release the L2CAP socket
+ * and free it.  The module reference taken in rfcomm_session_add() is
+ * returned for non-listening sessions. */
+static void rfcomm_session_del(struct rfcomm_session *s)
+{
+	/* Saved because it is still needed after s has been freed */
+	int state = s->state;
+
+	BT_DBG("session %p state %ld", s, s->state);
+
+	list_del(&s->list);
+
+	if (state == BT_CONNECTED)
+		rfcomm_send_disc(s, 0);
+
+	rfcomm_session_clear_timer(s);
+	sock_release(s->sock);
+	kfree(s);
+
+	if (state != BT_LISTEN)
+		module_put(THIS_MODULE);
+}
+
+/* Find the session whose L2CAP socket is connected to dst and, unless src
+ * is BDADDR_ANY, bound to src.
+ * Returns the first match on session_list or NULL.  No reference is taken. */
+static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst)
+{
+	struct rfcomm_session *s;
+
+	list_for_each_entry(s, &session_list, list) {
+		struct bt_sock *sk = bt_sk(s->sock->sk);
+
+		if (bacmp(&sk->dst, dst))
+			continue;
+
+		if (!bacmp(src, BDADDR_ANY) || !bacmp(&sk->src, src))
+			return s;
+	}
+
+	return NULL;
+}
+
+/* Mark session s closed and close every DLC on it, reporting err to each.
+ * The session timer is cancelled afterwards.  A temporary session reference
+ * is held for the duration of the call. */
+static void rfcomm_session_close(struct rfcomm_session *s, int err)
+{
+	struct rfcomm_dlc *d;
+	struct list_head *p, *n;
+
+	BT_DBG("session %p state %ld err %d", s, s->state, err);
+
+	rfcomm_session_hold(s);
+
+	s->state = BT_CLOSED;
+
+	/* Close all dlcs */
+	list_for_each_safe(p, n, &s->dlcs) {
+		d = list_entry(p, struct rfcomm_dlc, list);
+		/* BT_CLOSED makes __rfcomm_dlc_close() take its immediate
+		 * teardown branch, which unlinks d from this list */
+		d->state = BT_CLOSED;
+		__rfcomm_dlc_close(d, err);
+	}
+
+	rfcomm_session_clear_timer(s);
+	rfcomm_session_put(s);
+}
+
+/* Create an outgoing (initiator) session from src to dst.
+ *
+ * Creates an L2CAP socket, binds it to src, applies the module's L2CAP
+ * options together with sec_level, wraps it in a session and starts a
+ * non-blocking connect to the RFCOMM PSM on dst.
+ *
+ * Returns the session on success; a connect still in progress
+ * (-EINPROGRESS) counts as success.  On failure returns NULL with *err set
+ * to a negative errno and the socket released. */
+static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
+							bdaddr_t *dst,
+							u8 sec_level,
+							int *err)
+{
+	struct rfcomm_session *s = NULL;
+	struct sockaddr_l2 addr;
+	struct socket *sock;
+	struct sock *sk;
+
+	BT_DBG("%s %s", batostr(src), batostr(dst));
+
+	*err = rfcomm_l2sock_create(&sock);
+	if (*err < 0)
+		return NULL;
+
+	bacpy(&addr.l2_bdaddr, src);
+	addr.l2_family = AF_BLUETOOTH;
+	addr.l2_psm    = 0;
+	addr.l2_cid    = 0;
+	*err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (*err < 0)
+		goto failed;
+
+	/* Set L2CAP options */
+	sk = sock->sk;
+	lock_sock(sk);
+	l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+	l2cap_pi(sk)->chan->sec_level = sec_level;
+	if (l2cap_ertm)
+		l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM;
+	release_sock(sk);
+
+	s = rfcomm_session_add(sock, BT_BOUND);
+	if (!s) {
+		*err = -ENOMEM;
+		goto failed;
+	}
+
+	s->initiator = 1;
+
+	bacpy(&addr.l2_bdaddr, dst);
+	addr.l2_family = AF_BLUETOOTH;
+	addr.l2_psm    = cpu_to_le16(RFCOMM_PSM);
+	addr.l2_cid    = 0;
+	*err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
+	if (*err == 0 || *err == -EINPROGRESS)
+		return s;
+
+	/* The session owns the socket now; deleting it releases the socket */
+	rfcomm_session_del(s);
+	return NULL;
+
+failed:
+	sock_release(sock);
+	return NULL;
+}
+
+/* Copy the session socket's local and/or remote address into src / dst.
+ * Either output pointer may be NULL to skip that address. */
+void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst)
+{
+	struct bt_sock *bt = bt_sk(s->sock->sk);
+
+	if (src)
+		bacpy(src, &bt->src);
+
+	if (dst)
+		bacpy(dst, &bt->dst);
+}
+
+/* ---- RFCOMM frame sending ---- */
+/* Send len bytes at data as one message on the session's L2CAP socket.
+ * Returns the kernel_sendmsg() result. */
+static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len)
+{
+	struct msghdr msg;
+	struct kvec iv;
+
+	BT_DBG("session %p len %d", s, len);
+
+	memset(&msg, 0, sizeof(msg));
+
+	iv.iov_base = data;
+	iv.iov_len  = len;
+
+	return kernel_sendmsg(s->sock, &msg, &iv, 1, len);
+}
+
+/* Send a SABM command (P/F = 1, zero length) for dlci.  The address is
+ * built with the session's own initiator flag.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_cmd cmd = {
+		.addr = __addr(s->initiator, dlci),
+		.ctrl = __ctrl(RFCOMM_SABM, 1),
+		.len  = __len8(0),
+	};
+
+	BT_DBG("%p dlci %d", s, dlci);
+
+	/* The FCS covers the three header octets filled in above */
+	cmd.fcs = __fcs2((u8 *) &cmd);
+
+	return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+}
+
+/* Send a UA response (P/F = 1, zero length) for dlci.  The address is built
+ * with the inverted initiator flag.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_cmd cmd = {
+		.addr = __addr(!s->initiator, dlci),
+		.ctrl = __ctrl(RFCOMM_UA, 1),
+		.len  = __len8(0),
+	};
+
+	BT_DBG("%p dlci %d", s, dlci);
+
+	/* The FCS covers the three header octets filled in above */
+	cmd.fcs = __fcs2((u8 *) &cmd);
+
+	return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+}
+
+/* Send a DISC command (P/F = 1, zero length) for dlci.  The address is
+ * built with the session's own initiator flag.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_cmd cmd = {
+		.addr = __addr(s->initiator, dlci),
+		.ctrl = __ctrl(RFCOMM_DISC, 1),
+		.len  = __len8(0),
+	};
+
+	BT_DBG("%p dlci %d", s, dlci);
+
+	/* The FCS covers the three header octets filled in above */
+	cmd.fcs = __fcs2((u8 *) &cmd);
+
+	return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+}
+
+/* Build a DISC command for DLC d and append it to the DLC's tx queue, so it
+ * goes out after any data already queued, then call rfcomm_schedule().
+ * Returns 0, or -ENOMEM if the skb cannot be allocated. */
+static int rfcomm_queue_disc(struct rfcomm_dlc *d)
+{
+	struct rfcomm_cmd *cmd;
+	struct sk_buff *skb;
+
+	BT_DBG("dlc %p dlci %d", d, d->dlci);
+
+	skb = alloc_skb(sizeof(*cmd), GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	cmd = (void *) __skb_put(skb, sizeof(*cmd));
+	/* d->addr was precomputed when the DLC was set up */
+	cmd->addr = d->addr;
+	cmd->ctrl = __ctrl(RFCOMM_DISC, 1);
+	cmd->len  = __len8(0);
+	cmd->fcs  = __fcs2((u8 *) cmd);
+
+	skb_queue_tail(&d->tx_queue, skb);
+	rfcomm_schedule();
+	return 0;
+}
+
+/* Send a DM response (P/F = 1, zero length) for dlci.  The address is built
+ * with the inverted initiator flag.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_dm(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_cmd cmd = {
+		.addr = __addr(!s->initiator, dlci),
+		.ctrl = __ctrl(RFCOMM_DM, 1),
+		.len  = __len8(0),
+	};
+
+	BT_DBG("%p dlci %d", s, dlci);
+
+	/* The FCS covers the three header octets filled in above */
+	cmd.fcs = __fcs2((u8 *) &cmd);
+
+	return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd));
+}
+
+/* Send an NSC control message on DLCI 0, echoing the control type that was
+ * not recognised.  The frame is assembled in a stack buffer as UIH header,
+ * MCC header, one value octet and FCS.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d type %d", s, cr, type);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc) + 1);
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_NSC);
+	mcc->len  = __len8(1);
+
+	/* Type that we didn't like */
+	*ptr = __mcc_type(cr, type); ptr++;
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send a PN control message for DLC d on DLCI 0.
+ * cr is non-zero for a request and zero for a response.  The frame carries
+ * the DLC's dlci and priority, zero ack timer and retransmission count,
+ * flow control settings derived from the session's cfc state and an MTU.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	struct rfcomm_pn  *pn;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d dlci %d mtu %d", s, cr, d->dlci, d->mtu);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc) + sizeof(*pn));
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_PN);
+	mcc->len  = __len8(sizeof(*pn));
+
+	pn = (void *) ptr; ptr += sizeof(*pn);
+	pn->dlci        = d->dlci;
+	pn->priority    = d->priority;
+	pn->ack_timer   = 0;
+	pn->max_retrans = 0;
+
+	/* 0xf0 in a request and 0xe0 in a response when the session's cfc
+	 * field is non-zero; otherwise no flow control and no credits */
+	if (s->cfc) {
+		pn->flow_ctrl = cr ? 0xf0 : 0xe0;
+		pn->credits = RFCOMM_DEFAULT_CREDITS;
+	} else {
+		pn->flow_ctrl = 0;
+		pn->credits   = 0;
+	}
+
+	/* A non-negative channel_mtu overrides the DLC's MTU in requests */
+	if (cr && channel_mtu >= 0)
+		pn->mtu = cpu_to_le16(channel_mtu);
+	else
+		pn->mtu = cpu_to_le16(d->mtu);
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send an RPN control message for dlci on DLCI 0, carrying the given port
+ * settings and parameter mask.  cr is forwarded to __mcc_type() as the
+ * command/response flag.
+ * Returns the rfcomm_send_frame() result. */
+int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
+			u8 bit_rate, u8 data_bits, u8 stop_bits,
+			u8 parity, u8 flow_ctrl_settings,
+			u8 xon_char, u8 xoff_char, u16 param_mask)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	struct rfcomm_rpn *rpn;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
+			" flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
+		s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
+		flow_ctrl_settings, xon_char, xoff_char, param_mask);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc) + sizeof(*rpn));
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_RPN);
+	mcc->len  = __len8(sizeof(*rpn));
+
+	rpn = (void *) ptr; ptr += sizeof(*rpn);
+	rpn->dlci          = __addr(1, dlci);
+	rpn->bit_rate      = bit_rate;
+	rpn->line_settings = __rpn_line_settings(data_bits, stop_bits, parity);
+	rpn->flow_ctrl     = flow_ctrl_settings;
+	rpn->xon_char      = xon_char;
+	rpn->xoff_char     = xoff_char;
+	rpn->param_mask    = cpu_to_le16(param_mask);
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send an RLS control message for dlci on DLCI 0 carrying the given status
+ * octet.  Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_rls(struct rfcomm_session *s, int cr, u8 dlci, u8 status)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	struct rfcomm_rls *rls;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d status 0x%x", s, cr, status);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc) + sizeof(*rls));
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_RLS);
+	mcc->len  = __len8(sizeof(*rls));
+
+	rls = (void *) ptr; ptr += sizeof(*rls);
+	rls->dlci   = __addr(1, dlci);
+	rls->status = status;
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send an MSC control message for dlci on DLCI 0 carrying the V.24 signals
+ * in v24_sig.  Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	struct rfcomm_msc *msc;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d v24 0x%x", s, cr, v24_sig);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc) + sizeof(*msc));
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_MSC);
+	mcc->len  = __len8(sizeof(*msc));
+
+	msc = (void *) ptr; ptr += sizeof(*msc);
+	msc->dlci    = __addr(1, dlci);
+	/* Bit 0 is always set in the signal octet that is sent */
+	msc->v24_sig = v24_sig | 0x01;
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send an FCOFF control message (no value octets) on DLCI 0.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_fcoff(struct rfcomm_session *s, int cr)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d", s, cr);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc));
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_FCOFF);
+	mcc->len  = __len8(0);
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send an FCON control message (no value octets) on DLCI 0.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_fcon(struct rfcomm_session *s, int cr)
+{
+	struct rfcomm_hdr *hdr;
+	struct rfcomm_mcc *mcc;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p cr %d", s, cr);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = __addr(s->initiator, 0);
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+	hdr->len  = __len8(sizeof(*mcc));
+
+	mcc = (void *) ptr; ptr += sizeof(*mcc);
+	mcc->type = __mcc_type(cr, RFCOMM_FCON);
+	mcc->len  = __len8(0);
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Send a TEST control message on DLCI 0 carrying len bytes of pattern.
+ * The header and FCS live on the stack and the pattern is sent straight
+ * from the caller's buffer via a three-element kvec.
+ * Returns the kernel_sendmsg() result, or -EINVAL if len exceeds 125. */
+static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len)
+{
+	struct socket *sock = s->sock;
+	struct kvec iv[3];
+	struct msghdr msg;
+	unsigned char hdr[5], crc[1];
+
+	/* len + 2 has to fit the 7 bit length field of hdr[2] */
+	if (len > 125)
+		return -EINVAL;
+
+	BT_DBG("%p cr %d", s, cr);
+
+	hdr[0] = __addr(s->initiator, 0);
+	hdr[1] = __ctrl(RFCOMM_UIH, 0);
+	hdr[2] = 0x01 | ((len + 2) << 1);
+	hdr[3] = 0x01 | ((cr & 0x01) << 1) | (RFCOMM_TEST << 2);
+	hdr[4] = 0x01 | (len << 1);
+
+	crc[0] = __fcs(hdr);
+
+	iv[0].iov_base = hdr;
+	iv[0].iov_len  = 5;
+	iv[1].iov_base = pattern;
+	iv[1].iov_len  = len;
+	iv[2].iov_base = crc;
+	iv[2].iov_len  = 1;
+
+	memset(&msg, 0, sizeof(msg));
+
+	/* 5 header octets + pattern + 1 FCS octet */
+	return kernel_sendmsg(sock, &msg, iv, 3, 6 + len);
+}
+
+/* Send a zero-length UIH frame with P/F = 1 to addr, followed by one octet
+ * holding the number of credits granted and then the FCS.
+ * Returns the rfcomm_send_frame() result. */
+static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits)
+{
+	struct rfcomm_hdr *hdr;
+	u8 buf[16], *ptr = buf;
+
+	BT_DBG("%p addr %d credits %d", s, addr, credits);
+
+	hdr = (void *) ptr; ptr += sizeof(*hdr);
+	hdr->addr = addr;
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 1);
+	hdr->len  = __len8(0);
+
+	*ptr = credits; ptr++;
+
+	*ptr = __fcs(buf); ptr++;
+
+	return rfcomm_send_frame(s, buf, ptr - buf);
+}
+
+/* Turn the payload in skb into a UIH frame in place: a header addressed to
+ * addr is pushed in front and a one-octet FCS is appended.  Payloads longer
+ * than 127 bytes get the two-octet length form (4 byte header), shorter ones
+ * the one-octet form (3 byte header).
+ * NOTE(review): assumes the skb was allocated with enough headroom and
+ * tailroom for the header and FCS -- confirm against the callers. */
+static void rfcomm_make_uih(struct sk_buff *skb, u8 addr)
+{
+	struct rfcomm_hdr *hdr;
+	int len = skb->len;
+	u8 *crc;
+
+	if (len > 127) {
+		hdr = (void *) skb_push(skb, 4);
+		put_unaligned(cpu_to_le16(__len16(len)), (__le16 *) &hdr->len);
+	} else {
+		hdr = (void *) skb_push(skb, 3);
+		hdr->len = __len8(len);
+	}
+	hdr->addr = addr;
+	hdr->ctrl = __ctrl(RFCOMM_UIH, 0);
+
+	crc = skb_put(skb, 1);
+	*crc = __fcs((void *) hdr);
+}
+
+/* ---- RFCOMM frame reception ---- */
+/* Handle a received UA frame for dlci.
+ *
+ * Data channel: a UA for an unknown DLC is answered with DM.  For a DLC in
+ * BT_CONNECT it completes the connection and an MSC is sent; for a DLC in
+ * BT_DISCONN it completes the close, and if that was the session's last DLC
+ * the session itself is disconnected.
+ *
+ * Control channel (dlci 0): completes session setup in BT_CONNECT, or
+ * finishes a session disconnect in BT_DISCONN.
+ *
+ * Always returns 0. */
+static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci)
+{
+	BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+	if (dlci) {
+		/* Data channel */
+		struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci);
+		if (!d) {
+			rfcomm_send_dm(s, dlci);
+			return 0;
+		}
+
+		switch (d->state) {
+		case BT_CONNECT:
+			rfcomm_dlc_clear_timer(d);
+
+			rfcomm_dlc_lock(d);
+			d->state = BT_CONNECTED;
+			d->state_change(d, 0);
+			rfcomm_dlc_unlock(d);
+
+			rfcomm_send_msc(s, 1, dlci, d->v24_sig);
+			break;
+
+		case BT_DISCONN:
+			/* BT_CLOSED selects the immediate teardown branch of
+			 * __rfcomm_dlc_close() */
+			d->state = BT_CLOSED;
+			__rfcomm_dlc_close(d, 0);
+
+			if (list_empty(&s->dlcs)) {
+				s->state = BT_DISCONN;
+				rfcomm_send_disc(s, 0);
+			}
+
+			break;
+		}
+	} else {
+		/* Control channel */
+		switch (s->state) {
+		case BT_CONNECT:
+			s->state = BT_CONNECTED;
+			rfcomm_process_connect(s);
+			break;
+
+		case BT_DISCONN:
+			/* When socket is closed and we are not RFCOMM
+			 * initiator rfcomm_process_rx already calls
+			 * rfcomm_session_put() */
+			if (s->sock->sk->sk_state != BT_CLOSED)
+				if (list_empty(&s->dlcs))
+					rfcomm_session_put(s);
+			break;
+		}
+	}
+	return 0;
+}
+
+/* Handle a received DM frame for dlci.
+ * A DM for a known data DLC closes that DLC; a DM on the control channel
+ * (dlci 0) closes the whole session.  The error reported is ECONNREFUSED
+ * when the DM arrives while still connecting and ECONNRESET otherwise.
+ * A DM for an unknown data DLC is ignored.  Always returns 0. */
+static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_dlc *d;
+	int err;
+
+	BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+	if (!dlci) {
+		err = (s->state == BT_CONNECT) ? ECONNREFUSED : ECONNRESET;
+
+		s->state = BT_CLOSED;
+		rfcomm_session_close(s, err);
+		return 0;
+	}
+
+	/* Data DLC */
+	d = rfcomm_dlc_get(s, dlci);
+	if (!d)
+		return 0;
+
+	if (d->state == BT_CONNECT || d->state == BT_CONFIG)
+		err = ECONNREFUSED;
+	else
+		err = ECONNRESET;
+
+	d->state = BT_CLOSED;
+	__rfcomm_dlc_close(d, err);
+
+	return 0;
+}
+
+/* Handle a received DISC frame for dlci.
+ * For a known data DLC a UA is sent and the DLC is closed; an unknown data
+ * DLC is answered with DM.  On the control channel (dlci 0) a UA is sent
+ * and the whole session is closed.  The error reported is ECONNREFUSED when
+ * the DISC arrives while still connecting and ECONNRESET otherwise.
+ * Always returns 0. */
+static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
+{
+	int err = 0;
+
+	BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+	if (dlci) {
+		struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci);
+		if (d) {
+			rfcomm_send_ua(s, dlci);
+
+			if (d->state == BT_CONNECT || d->state == BT_CONFIG)
+				err = ECONNREFUSED;
+			else
+				err = ECONNRESET;
+
+			/* BT_CLOSED selects the immediate teardown branch of
+			 * __rfcomm_dlc_close() */
+			d->state = BT_CLOSED;
+			__rfcomm_dlc_close(d, err);
+		} else
+			rfcomm_send_dm(s, dlci);
+
+	} else {
+		rfcomm_send_ua(s, 0);
+
+		if (s->state == BT_CONNECT)
+			err = ECONNREFUSED;
+		else
+			err = ECONNRESET;
+
+		s->state = BT_CLOSED;
+		rfcomm_session_close(s, err);
+	}
+
+	return 0;
+}
+
+/* Accept an incoming DLC: send UA, cancel the DLC timer, move the DLC to
+ * BT_CONNECTED and notify its owner, optionally request an HCI role switch,
+ * and finally send the local modem status. */
+void rfcomm_dlc_accept(struct rfcomm_dlc *d)
+{
+	struct sock *sk = d->session->sock->sk;
+	struct l2cap_conn *conn = l2cap_pi(sk)->chan->conn;
+
+	BT_DBG("dlc %p", d);
+
+	rfcomm_send_ua(d->session, d->dlci);
+
+	rfcomm_dlc_clear_timer(d);
+
+	rfcomm_dlc_lock(d);
+	d->state = BT_CONNECTED;
+	d->state_change(d, 0);
+	rfcomm_dlc_unlock(d);
+
+	/* NOTE(review): 0x00 is presumably the "master" role -- confirm
+	 * against hci_conn_switch_role() */
+	if (d->role_switch)
+		hci_conn_switch_role(conn->hcon, 0x00);
+
+	rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
+}
+
+/* Decide what to do with an incoming DLC.
+ * If the security check does not pass yet, the DLC is marked
+ * RFCOMM_AUTH_PENDING and the auth timeout is armed.  Otherwise the DLC is
+ * accepted right away, unless its owner asked for deferred setup, in which
+ * case it is parked in BT_CONNECT2 under the auth timeout. */
+static void rfcomm_check_accept(struct rfcomm_dlc *d)
+{
+	if (!rfcomm_check_security(d)) {
+		set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+		rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+		return;
+	}
+
+	if (!d->defer_setup) {
+		rfcomm_dlc_accept(d);
+		return;
+	}
+
+	set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+	rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+	rfcomm_dlc_lock(d);
+	d->state = BT_CONNECT2;
+	d->state_change(d, 0);
+	rfcomm_dlc_unlock(d);
+}
+
+/* Handle a received SABM frame for dlci.
+ * On the control channel (dlci 0) a UA is sent and a session in BT_OPEN
+ * becomes connected.  For a data channel an existing DLC in BT_OPEN goes
+ * through the accept checks (other states are ignored); for an unknown DLC
+ * rfcomm_connect_ind() is asked for a new DLC, and DM is sent if it
+ * returns zero.  Always returns 0. */
+static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
+{
+	struct rfcomm_dlc *d;
+	u8 channel;
+
+	BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+	if (!dlci) {
+		rfcomm_send_ua(s, 0);
+
+		if (s->state == BT_OPEN) {
+			s->state = BT_CONNECTED;
+			rfcomm_process_connect(s);
+		}
+		return 0;
+	}
+
+	/* Check if DLC exists */
+	d = rfcomm_dlc_get(s, dlci);
+	if (d) {
+		if (d->state == BT_OPEN) {
+			/* DLC was previously opened by PN request */
+			rfcomm_check_accept(d);
+		}
+		return 0;
+	}
+
+	/* Notify socket layer about incoming connection */
+	channel = __srv_channel(dlci);
+	if (rfcomm_connect_ind(s, channel, &d)) {
+		d->dlci = dlci;
+		d->addr = __addr(s->initiator, dlci);
+		rfcomm_dlc_link(s, d);
+
+		rfcomm_check_accept(d);
+	} else {
+		rfcomm_send_dm(s, dlci);
+	}
+
+	return 0;
+}
+
+/* Apply the parameters of a received PN message to DLC d.
+ * Selects credit based flow control or disables it, latches the session's
+ * cfc state the first time it becomes known, and copies priority and MTU.
+ * For a request (cr set) the MTU is capped at the session MTU.
+ * Always returns 0. */
+static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn)
+{
+	struct rfcomm_session *s = d->session;
+
+	BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d",
+			d, d->state, d->dlci, pn->mtu, pn->flow_ctrl, pn->credits);
+
+	/* 0xf0 enables CFC only if the session has not disabled it; 0xe0
+	 * enables it unconditionally */
+	if ((pn->flow_ctrl == 0xf0 && s->cfc != RFCOMM_CFC_DISABLED) ||
+						pn->flow_ctrl == 0xe0) {
+		d->cfc = RFCOMM_CFC_ENABLED;
+		d->tx_credits = pn->credits;
+	} else {
+		d->cfc = RFCOMM_CFC_DISABLED;
+		set_bit(RFCOMM_TX_THROTTLED, &d->flags);
+	}
+
+	if (s->cfc == RFCOMM_CFC_UNKNOWN)
+		s->cfc = d->cfc;
+
+	d->priority = pn->priority;
+
+	d->mtu = __le16_to_cpu(pn->mtu);
+
+	if (cr && d->mtu > s->mtu)
+		d->mtu = s->mtu;
+
+	return 0;
+}
+
+/* Handle a received PN control message.
+ *
+ * skb->data points at the PN value octets (the MCC header has already been
+ * pulled).  A request for a known DLC is applied and answered; a response
+ * is applied only while the DLC is in BT_CONFIG, after which a SABM is
+ * sent.  A request for an unknown DLC is treated as an incoming connection,
+ * and DM is sent if rfcomm_connect_ind() returns zero.
+ *
+ * Returns 0, or -EINVAL if the message is too short to hold a PN. */
+static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb)
+{
+	struct rfcomm_pn *pn = (void *) skb->data;
+	struct rfcomm_dlc *d;
+	u8 dlci;
+
+	/* The payload comes from the remote side: make sure the whole PN
+	 * structure is really there before reading any of its fields */
+	if (skb->len < sizeof(*pn))
+		return -EINVAL;
+
+	dlci = pn->dlci;
+
+	BT_DBG("session %p state %ld dlci %d", s, s->state, dlci);
+
+	if (!dlci)
+		return 0;
+
+	d = rfcomm_dlc_get(s, dlci);
+	if (d) {
+		if (cr) {
+			/* PN request */
+			rfcomm_apply_pn(d, cr, pn);
+			rfcomm_send_pn(s, 0, d);
+		} else {
+			/* PN response */
+			switch (d->state) {
+			case BT_CONFIG:
+				rfcomm_apply_pn(d, cr, pn);
+
+				d->state = BT_CONNECT;
+				rfcomm_send_sabm(s, d->dlci);
+				break;
+			}
+		}
+	} else {
+		u8 channel = __srv_channel(dlci);
+
+		if (!cr)
+			return 0;
+
+		/* PN request for non existing DLC.
+		 * Assume incoming connection. */
+		if (rfcomm_connect_ind(s, channel, &d)) {
+			d->dlci = dlci;
+			d->addr = __addr(s->initiator, dlci);
+			rfcomm_dlc_link(s, d);
+
+			rfcomm_apply_pn(d, cr, pn);
+
+			d->state = BT_OPEN;
+			rfcomm_send_pn(s, 0, d);
+		} else {
+			rfcomm_send_dm(s, dlci);
+		}
+	}
+	return 0;
+}
+
+/* Handle a received RPN control message.
+ *
+ * skb->data points at the RPN value octets and len is the length announced
+ * in the MCC header.  Responses (cr == 0) are ignored.  A one-octet request
+ * is answered with the default port settings; a full request is checked
+ * field by field, and every parameter we cannot honour is replaced by the
+ * default and has its bit cleared in the parameter mask of the reply.
+ *
+ * Returns 0, or -EINVAL if the message is shorter than the form it claims
+ * to be. */
+static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb)
+{
+	struct rfcomm_rpn *rpn = (void *) skb->data;
+	u8 dlci;
+
+	u8 bit_rate  = 0;
+	u8 data_bits = 0;
+	u8 stop_bits = 0;
+	u8 parity    = 0;
+	u8 flow_ctrl = 0;
+	u8 xon_char  = 0;
+	u8 xoff_char = 0;
+	u16 rpn_mask = RFCOMM_RPN_PM_ALL;
+
+	/* Even the short request form carries the DLCI octet */
+	if (skb->len < 1)
+		return -EINVAL;
+
+	dlci = __get_dlci(rpn->dlci);
+
+	BT_DBG("dlci %d cr %d len 0x%x", dlci, cr, len);
+
+	if (!cr)
+		return 0;
+
+	if (len == 1) {
+		/* This is a request, return default (according to ETSI TS 07.10) settings */
+		bit_rate  = RFCOMM_RPN_BR_9600;
+		data_bits = RFCOMM_RPN_DATA_8;
+		stop_bits = RFCOMM_RPN_STOP_1;
+		parity    = RFCOMM_RPN_PARITY_NONE;
+		flow_ctrl = RFCOMM_RPN_FLOW_NONE;
+		xon_char  = RFCOMM_RPN_XON_CHAR;
+		xoff_char = RFCOMM_RPN_XOFF_CHAR;
+		goto rpn_out;
+	}
+
+	/* The remaining fields are only present in the full form; do not
+	 * read past what the remote side actually sent */
+	if (skb->len < sizeof(*rpn))
+		return -EINVAL;
+
+	BT_DBG("bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
+		rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
+		rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+
+	/* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
+	 * no parity, no flow control lines, normal XON/XOFF chars */
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) {
+		bit_rate = rpn->bit_rate;
+		if (bit_rate > RFCOMM_RPN_BR_230400) {
+			BT_DBG("RPN bit rate mismatch 0x%x", bit_rate);
+			bit_rate = RFCOMM_RPN_BR_9600;
+			rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
+		}
+	}
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_DATA)) {
+		data_bits = __get_rpn_data_bits(rpn->line_settings);
+		if (data_bits != RFCOMM_RPN_DATA_8) {
+			BT_DBG("RPN data bits mismatch 0x%x", data_bits);
+			data_bits = RFCOMM_RPN_DATA_8;
+			rpn_mask ^= RFCOMM_RPN_PM_DATA;
+		}
+	}
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_STOP)) {
+		stop_bits = __get_rpn_stop_bits(rpn->line_settings);
+		if (stop_bits != RFCOMM_RPN_STOP_1) {
+			BT_DBG("RPN stop bits mismatch 0x%x", stop_bits);
+			stop_bits = RFCOMM_RPN_STOP_1;
+			rpn_mask ^= RFCOMM_RPN_PM_STOP;
+		}
+	}
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_PARITY)) {
+		parity = __get_rpn_parity(rpn->line_settings);
+		if (parity != RFCOMM_RPN_PARITY_NONE) {
+			BT_DBG("RPN parity mismatch 0x%x", parity);
+			parity = RFCOMM_RPN_PARITY_NONE;
+			rpn_mask ^= RFCOMM_RPN_PM_PARITY;
+		}
+	}
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_FLOW)) {
+		flow_ctrl = rpn->flow_ctrl;
+		if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) {
+			BT_DBG("RPN flow ctrl mismatch 0x%x", flow_ctrl);
+			flow_ctrl = RFCOMM_RPN_FLOW_NONE;
+			rpn_mask ^= RFCOMM_RPN_PM_FLOW;
+		}
+	}
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XON)) {
+		xon_char = rpn->xon_char;
+		if (xon_char != RFCOMM_RPN_XON_CHAR) {
+			BT_DBG("RPN XON char mismatch 0x%x", xon_char);
+			xon_char = RFCOMM_RPN_XON_CHAR;
+			rpn_mask ^= RFCOMM_RPN_PM_XON;
+		}
+	}
+
+	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_XOFF)) {
+		xoff_char = rpn->xoff_char;
+		if (xoff_char != RFCOMM_RPN_XOFF_CHAR) {
+			BT_DBG("RPN XOFF char mismatch 0x%x", xoff_char);
+			xoff_char = RFCOMM_RPN_XOFF_CHAR;
+			rpn_mask ^= RFCOMM_RPN_PM_XOFF;
+		}
+	}
+
+rpn_out:
+	rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits,
+			parity, flow_ctrl, xon_char, xoff_char, rpn_mask);
+
+	return 0;
+}
+
+/* Handle a received RLS control message.
+ * skb->data points at the RLS value octets.  Requests are answered by
+ * echoing the reported status back; responses are ignored.
+ * Returns 0, or -EINVAL if the message is too short to hold an RLS. */
+static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb)
+{
+	struct rfcomm_rls *rls = (void *) skb->data;
+	u8 dlci;
+
+	/* Remote supplied payload: both octets must be present */
+	if (skb->len < sizeof(*rls))
+		return -EINVAL;
+
+	dlci = __get_dlci(rls->dlci);
+
+	BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
+
+	if (!cr)
+		return 0;
+
+	/* We should probably do something with this information here. But
+	 * for now it's sufficient just to reply -- Bluetooth 1.1 says it's
+	 * mandatory to recognise and respond to RLS */
+
+	rfcomm_send_rls(s, 0, dlci, rls->status);
+
+	return 0;
+}
+
+/* Handle a received MSC control message.
+ * skb->data points at the MSC value octets.  For a request the remote
+ * signals are recorded, TX throttling is updated from the FC bit (only when
+ * credit based flow control is off), the owner's modem_status() callback is
+ * invoked if set, and the signals are echoed back as the response.  For a
+ * response only the exchange state is updated.  Messages for unknown DLCs
+ * are ignored.
+ * Returns 0, or -EINVAL if the message is too short to hold an MSC. */
+static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb)
+{
+	struct rfcomm_msc *msc = (void *) skb->data;
+	struct rfcomm_dlc *d;
+	u8 dlci;
+
+	/* Remote supplied payload: DLCI and signal octets must be present */
+	if (skb->len < sizeof(*msc))
+		return -EINVAL;
+
+	dlci = __get_dlci(msc->dlci);
+
+	BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
+
+	d = rfcomm_dlc_get(s, dlci);
+	if (!d)
+		return 0;
+
+	if (cr) {
+		if (msc->v24_sig & RFCOMM_V24_FC && !d->cfc)
+			set_bit(RFCOMM_TX_THROTTLED, &d->flags);
+		else
+			clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
+
+		rfcomm_dlc_lock(d);
+
+		d->remote_v24_sig = msc->v24_sig;
+
+		if (d->modem_status)
+			d->modem_status(d, msc->v24_sig);
+
+		rfcomm_dlc_unlock(d);
+
+		rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
+
+		d->mscex |= RFCOMM_MSCEX_RX;
+	} else
+		d->mscex |= RFCOMM_MSCEX_TX;
+
+	return 0;
+}
+
+/* Dispatch a multiplexer control message received on DLCI 0.
+ * skb->data points at the MCC header (type and length octets); the header
+ * is pulled before the per-type handler runs, so handlers see only the
+ * value octets.  Unknown types are answered with NSC.  The skb is not
+ * freed here.
+ * Returns 0, or -EINVAL if the frame is too short to hold an MCC header. */
+static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb)
+{
+	struct rfcomm_mcc *mcc = (void *) skb->data;
+	u8 type, cr, len;
+
+	/* Remote supplied payload: do not parse a header that is not there */
+	if (skb->len < sizeof(*mcc)) {
+		BT_ERR("Truncated control message (len %u)", skb->len);
+		return -EINVAL;
+	}
+
+	cr   = __test_cr(mcc->type);
+	type = __get_mcc_type(mcc->type);
+	len  = __get_mcc_len(mcc->len);
+
+	BT_DBG("%p type 0x%x cr %d", s, type, cr);
+
+	skb_pull(skb, sizeof(*mcc));
+
+	switch (type) {
+	case RFCOMM_PN:
+		rfcomm_recv_pn(s, cr, skb);
+		break;
+
+	case RFCOMM_RPN:
+		rfcomm_recv_rpn(s, cr, len, skb);
+		break;
+
+	case RFCOMM_RLS:
+		rfcomm_recv_rls(s, cr, skb);
+		break;
+
+	case RFCOMM_MSC:
+		rfcomm_recv_msc(s, cr, skb);
+		break;
+
+	case RFCOMM_FCOFF:
+		if (cr) {
+			set_bit(RFCOMM_TX_THROTTLED, &s->flags);
+			rfcomm_send_fcoff(s, 0);
+		}
+		break;
+
+	case RFCOMM_FCON:
+		if (cr) {
+			clear_bit(RFCOMM_TX_THROTTLED, &s->flags);
+			rfcomm_send_fcon(s, 0);
+		}
+		break;
+
+	case RFCOMM_TEST:
+		/* Echo whatever pattern is left in the skb */
+		if (cr)
+			rfcomm_send_test(s, 0, skb->data, skb->len);
+		break;
+
+	case RFCOMM_NSC:
+		break;
+
+	default:
+		BT_ERR("Unknown control type 0x%02x", type);
+		rfcomm_send_nsc(s, cr, type);
+		break;
+	}
+	return 0;
+}
+
+/* Handle a received UIH data frame for dlci.
+ * Takes ownership of skb: it is either handed to the DLC owner through
+ * d->data_ready() or freed here.  Frames for unknown DLCs are answered
+ * with DM.  With credit based flow control and the P/F bit set, the first
+ * payload octet carries credits granted by the peer; a frame that claims
+ * credits but has no payload at all is dropped.  Always returns 0. */
+static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk_buff *skb)
+{
+	struct rfcomm_dlc *d;
+
+	BT_DBG("session %p state %ld dlci %d pf %d", s, s->state, dlci, pf);
+
+	d = rfcomm_dlc_get(s, dlci);
+	if (!d) {
+		rfcomm_send_dm(s, dlci);
+		goto drop;
+	}
+
+	if (pf && d->cfc) {
+		u8 credits;
+
+		/* The credit octet must actually be present before it is
+		 * read and pulled */
+		if (!skb->len)
+			goto drop;
+
+		credits = *(u8 *) skb->data; skb_pull(skb, 1);
+
+		d->tx_credits += credits;
+		if (d->tx_credits)
+			clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
+	}
+
+	if (skb->len && d->state == BT_CONNECTED) {
+		rfcomm_dlc_lock(d);
+		d->rx_credits--;
+		d->data_ready(d, skb);
+		rfcomm_dlc_unlock(d);
+		return 0;
+	}
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Parse one RFCOMM frame received from L2CAP and dispatch it by type.
+ *
+ * Takes ownership of @skb: it is freed here unless it is passed on to
+ * rfcomm_recv_data(), which then owns it.
+ *
+ * Returns 0 on success (including unknown frame types, which are only
+ * logged) or -EILSEQ for a truncated frame or a bad checksum.
+ */
+static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb)
+{
+	struct rfcomm_hdr *hdr = (void *) skb->data;
+	u8 type, dlci, fcs;
+
+	/* The shortest frame is the header (address, control, one length
+	 * byte) followed by the FCS.  Reject anything shorter before the
+	 * header is read and before the FCS trim can underflow skb->len. */
+	if (skb->len < sizeof(*hdr) + 1) {
+		BT_ERR("Too short frame");
+		kfree_skb(skb);
+		return -EILSEQ;
+	}
+
+	dlci = __get_dlci(hdr->addr);
+	type = __get_type(hdr->ctrl);
+
+	/* Trim FCS */
+	skb->len--; skb->tail--;
+	fcs = *(u8 *)skb_tail_pointer(skb);
+
+	if (__check_fcs(skb->data, type, fcs)) {
+		BT_ERR("bad checksum in packet");
+		kfree_skb(skb);
+		return -EILSEQ;
+	}
+
+	/* The length field is one byte when its EA bit is set, two
+	 * otherwise; skip the header accordingly. */
+	if (__test_ea(hdr->len))
+		skb_pull(skb, 3);
+	else
+		skb_pull(skb, 4);
+
+	switch (type) {
+	case RFCOMM_SABM:
+		if (__test_pf(hdr->ctrl))
+			rfcomm_recv_sabm(s, dlci);
+		break;
+
+	case RFCOMM_DISC:
+		if (__test_pf(hdr->ctrl))
+			rfcomm_recv_disc(s, dlci);
+		break;
+
+	case RFCOMM_UA:
+		if (__test_pf(hdr->ctrl))
+			rfcomm_recv_ua(s, dlci);
+		break;
+
+	case RFCOMM_DM:
+		rfcomm_recv_dm(s, dlci);
+		break;
+
+	case RFCOMM_UIH:
+		/* Data for a DLC; rfcomm_recv_data() consumes the skb. */
+		if (dlci)
+			return rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb);
+
+		/* DLCI 0 carries multiplexer control commands. */
+		rfcomm_recv_mcc(s, skb);
+		break;
+
+	default:
+		BT_ERR("Unknown packet type 0x%02x", type);
+		break;
+	}
+	kfree_skb(skb);
+	return 0;
+}
+
+/* ---- Connection and data processing ---- */
+
+/* Start channel setup for every DLC of @s that is in BT_CONFIG state.
+ *
+ * A DLC that already passes rfcomm_check_security() gets its PN sent
+ * right away; otherwise it is marked RFCOMM_AUTH_PENDING and its timer
+ * is armed with RFCOMM_AUTH_TIMEOUT.
+ */
+static void rfcomm_process_connect(struct rfcomm_session *s)
+{
+	struct rfcomm_dlc *d, *next;
+
+	BT_DBG("session %p state %ld", s, s->state);
+
+	list_for_each_entry_safe(d, next, &s->dlcs, list) {
+		if (d->state != BT_CONFIG)
+			continue;
+
+		d->mtu = s->mtu;
+
+		if (!rfcomm_check_security(d)) {
+			set_bit(RFCOMM_AUTH_PENDING, &d->flags);
+			rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+			continue;
+		}
+
+		rfcomm_send_pn(s, 1, d);
+	}
+}
+
+/* Transmit as much of a DLC's queued data as flow control allows.
+ *
+ * Also flushes a pending MSC and, with credit based flow control (CFC)
+ * enabled, tops up the peer's credits once our outstanding RX credits
+ * have dropped to a quarter of d->cfc or less.
+ *
+ * Returns the number of frames left in the TX queue.
+ */
+static inline int rfcomm_process_tx(struct rfcomm_dlc *d)
+{
+	struct sk_buff *frame;
+
+	BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d",
+			d, d->state, d->cfc, d->rx_credits, d->tx_credits);
+
+	/* Send pending MSC */
+	if (test_and_clear_bit(RFCOMM_MSC_PENDING, &d->flags))
+		rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
+
+	if (!d->cfc) {
+		/* CFC disabled.
+		 * Give ourselves some credits */
+		d->tx_credits = 5;
+	} else if (!test_bit(RFCOMM_RX_THROTTLED, &d->flags) &&
+			d->rx_credits <= (d->cfc >> 2)) {
+		/* CFC enabled.
+		 * Give them some credits */
+		rfcomm_send_credits(d->session, d->addr, d->cfc - d->rx_credits);
+		d->rx_credits = d->cfc;
+	}
+
+	if (test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+		return skb_queue_len(&d->tx_queue);
+
+	while (d->tx_credits) {
+		frame = skb_dequeue(&d->tx_queue);
+		if (!frame)
+			break;
+
+		if (rfcomm_send_frame(d->session, frame->data, frame->len) < 0) {
+			/* Could not send: requeue at the head for a later pass. */
+			skb_queue_head(&d->tx_queue, frame);
+			break;
+		}
+
+		kfree_skb(frame);
+		d->tx_credits--;
+	}
+
+	/* We're out of TX credits.
+	 * Set TX_THROTTLED flag to avoid unnecessary wakeups by dlc_send. */
+	if (d->cfc && !d->tx_credits)
+		set_bit(RFCOMM_TX_THROTTLED, &d->flags);
+
+	return skb_queue_len(&d->tx_queue);
+}
+
+/* Run the per-DLC state machine for every DLC of session @s:
+ * close timed-out DLCs, act on authentication results posted by
+ * rfcomm_security_cfm(), and transmit queued data where allowed.
+ */
+static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
+{
+	struct rfcomm_dlc *d, *next;
+
+	BT_DBG("session %p state %ld", s, s->state);
+
+	/* The _safe iterator is needed: closing a DLC may unlink it. */
+	list_for_each_entry_safe(d, next, &s->dlcs, list) {
+		if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) {
+			__rfcomm_dlc_close(d, ETIMEDOUT);
+			continue;
+		}
+
+		if (test_and_clear_bit(RFCOMM_AUTH_ACCEPT, &d->flags)) {
+			rfcomm_dlc_clear_timer(d);
+
+			if (d->out) {
+				/* Outgoing: continue with parameter negotiation. */
+				rfcomm_send_pn(s, 1, d);
+				rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
+			} else if (d->defer_setup) {
+				/* Incoming, deferred: report BT_CONNECT2 to the
+				 * owner instead of accepting right away. */
+				set_bit(RFCOMM_DEFER_SETUP, &d->flags);
+				rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+
+				rfcomm_dlc_lock(d);
+				d->state = BT_CONNECT2;
+				d->state_change(d, 0);
+				rfcomm_dlc_unlock(d);
+			} else {
+				rfcomm_dlc_accept(d);
+			}
+			continue;
+		}
+
+		if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) {
+			rfcomm_dlc_clear_timer(d);
+
+			if (d->out)
+				d->state = BT_CLOSED;
+			else
+				rfcomm_send_dm(s, d->dlci);
+
+			__rfcomm_dlc_close(d, ECONNREFUSED);
+			continue;
+		}
+
+		/* No transmission while security is pending on the DLC or
+		 * the whole session is throttled. */
+		if (test_bit(RFCOMM_SEC_PENDING, &d->flags) ||
+				test_bit(RFCOMM_TX_THROTTLED, &s->flags))
+			continue;
+
+		if (d->mscex != RFCOMM_MSCEX_OK)
+			continue;
+
+		if (d->state == BT_CONNECTED || d->state == BT_DISCONN)
+			rfcomm_process_tx(d);
+	}
+}
+
+/* Drain the session's L2CAP socket receive queue, feeding every frame
+ * to rfcomm_recv_frame(), then close the session if the L2CAP socket
+ * has gone to BT_CLOSED.
+ */
+static inline void rfcomm_process_rx(struct rfcomm_session *s)
+{
+	struct sock *sk = s->sock->sk;
+	struct sk_buff *skb;
+
+	BT_DBG("session %p state %ld qlen %d", s, s->state, skb_queue_len(&sk->sk_receive_queue));
+
+	/* Get data directly from socket receive queue without copying it. */
+	for (;;) {
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (!skb)
+			break;
+
+		skb_orphan(skb);
+		rfcomm_recv_frame(s, skb);
+	}
+
+	if (sk->sk_state != BT_CLOSED)
+		return;
+
+	/* Non-initiator sessions drop one reference before the close. */
+	if (!s->initiator)
+		rfcomm_session_put(s);
+
+	rfcomm_session_close(s, sk->sk_err);
+}
+
+/* Accept one pending L2CAP connection on the listening session @s and
+ * wrap it in a new RFCOMM session in BT_OPEN state.
+ */
+static inline void rfcomm_accept_connection(struct rfcomm_session *s)
+{
+	struct socket *lsock = s->sock, *nsock;
+	struct rfcomm_session *ns;
+
+	/* Fast check for a new connection.
+	 * Avoids unnecessary socket allocations. */
+	if (list_empty(&bt_sk(lsock->sk)->accept_q))
+		return;
+
+	BT_DBG("session %p", s);
+
+	if (kernel_accept(lsock, &nsock, O_NONBLOCK) < 0)
+		return;
+
+	/* Set our callbacks */
+	nsock->sk->sk_data_ready   = rfcomm_l2data_ready;
+	nsock->sk->sk_state_change = rfcomm_l2state_change;
+
+	ns = rfcomm_session_add(nsock, BT_OPEN);
+	if (!ns) {
+		sock_release(nsock);
+		return;
+	}
+
+	rfcomm_session_hold(ns);
+
+	/* We should adjust MTU on incoming sessions.
+	 * L2CAP MTU minus UIH header and FCS. */
+	ns->mtu = min(l2cap_pi(nsock->sk)->chan->omtu,
+			l2cap_pi(nsock->sk)->chan->imtu) - 5;
+
+	rfcomm_schedule();
+}
+
+/* Follow up on a session in BT_BOUND state: once its L2CAP socket is
+ * connected send SABM on DLCI 0, or close the session if the socket
+ * has been closed.  Any other socket state is left alone.
+ */
+static inline void rfcomm_check_connection(struct rfcomm_session *s)
+{
+	struct sock *l2sk = s->sock->sk;
+
+	BT_DBG("%p state %ld", s, s->state);
+
+	switch (l2sk->sk_state) {
+	case BT_CONNECTED:
+		/* We can adjust MTU on outgoing sessions.
+		 * L2CAP MTU minus UIH header and FCS. */
+		s->mtu = min(l2cap_pi(l2sk)->chan->omtu, l2cap_pi(l2sk)->chan->imtu) - 5;
+		s->state = BT_CONNECT;
+
+		rfcomm_send_sabm(s, 0);
+		break;
+
+	case BT_CLOSED:
+		s->state = BT_CLOSED;
+		rfcomm_session_close(s, l2sk->sk_err);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/* One pass of the krfcommd work loop: walk every session under
+ * rfcomm_lock(), disconnect timed-out ones, accept new connections on
+ * listening ones, and process RX and DLCs for the rest.
+ */
+static inline void rfcomm_process_sessions(void)
+{
+	struct rfcomm_session *s, *next;
+
+	rfcomm_lock();
+
+	list_for_each_entry_safe(s, next, &session_list, list) {
+		if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) {
+			s->state = BT_DISCONN;
+			rfcomm_send_disc(s, 0);
+			rfcomm_session_put(s);
+			continue;
+		}
+
+		if (s->state == BT_LISTEN) {
+			rfcomm_accept_connection(s);
+			continue;
+		}
+
+		/* Hold a reference across the processing below
+		 * (presumably so the session cannot go away under us). */
+		rfcomm_session_hold(s);
+
+		if (s->state == BT_BOUND)
+			rfcomm_check_connection(s);
+		else
+			rfcomm_process_rx(s);
+
+		rfcomm_process_dlcs(s);
+
+		rfcomm_session_put(s);
+	}
+
+	rfcomm_unlock();
+}
+
+/* Create the L2CAP socket that listens on RFCOMM_PSM for address @ba
+ * and register it as a session in BT_LISTEN state.
+ *
+ * Returns 0 on success or a negative errno; on failure the socket is
+ * released again.
+ */
+static int rfcomm_add_listener(bdaddr_t *ba)
+{
+	struct sockaddr_l2 addr;
+	struct socket *sock;
+	struct sock *sk;
+	struct rfcomm_session *s;
+	int    err = 0;
+
+	/* Create socket */
+	err = rfcomm_l2sock_create(&sock);
+	if (err < 0) {
+		BT_ERR("Create socket failed %d", err);
+		return err;
+	}
+
+	/* Bind socket */
+	bacpy(&addr.l2_bdaddr, ba);
+	addr.l2_family = AF_BLUETOOTH;
+	addr.l2_psm    = cpu_to_le16(RFCOMM_PSM);
+	addr.l2_cid    = 0;
+	err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (err < 0) {
+		BT_ERR("Bind failed %d", err);
+		goto failed;
+	}
+
+	/* Set L2CAP options */
+	sk = sock->sk;
+	lock_sock(sk);
+	l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+	release_sock(sk);
+
+	/* Start listening on the socket */
+	err = kernel_listen(sock, 10);
+	if (err) {
+		BT_ERR("Listen failed %d", err);
+		goto failed;
+	}
+
+	/* Add listening session */
+	s = rfcomm_session_add(sock, BT_LISTEN);
+	if (!s) {
+		/* err still holds the successful kernel_listen() result
+		 * here; without this the failure would be reported as 0. */
+		err = -ENOMEM;
+		goto failed;
+	}
+
+	rfcomm_session_hold(s);
+	return 0;
+failed:
+	sock_release(sock);
+	return err;
+}
+
+/* Delete every session on session_list (not only the listener).
+ * Called by rfcomm_run() once the thread has been asked to stop.
+ */
+static void rfcomm_kill_listener(void)
+{
+	struct rfcomm_session *s, *next;
+
+	BT_DBG("");
+
+	list_for_each_entry_safe(s, next, &session_list, list)
+		rfcomm_session_del(s);
+}
+
+/* Main function of the krfcommd kernel thread.
+ *
+ * Sets up the listening session, then alternates between processing
+ * all sessions and sleeping until woken, until kthread_stop() is
+ * called.  All sessions are deleted before the thread exits.
+ *
+ * Always returns 0.
+ */
+static int rfcomm_run(void *unused)
+{
+	BT_DBG("");
+
+	set_user_nice(current, -10);
+
+	rfcomm_add_listener(BDADDR_ANY);
+
+	/* The task state is set to TASK_INTERRUPTIBLE before the stop
+	 * check and before work is processed, so a wakeup that arrives
+	 * in between makes the following schedule() return at once. */
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		/* Process stuff */
+		rfcomm_process_sessions();
+
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	rfcomm_kill_listener();
+
+	return 0;
+}
+
+/* HCI security confirmation callback (registered through rfcomm_cb).
+ *
+ * @conn:    HCI connection the result applies to
+ * @status:  non-zero when the security procedure failed
+ * @encrypt: 0x00 when the link is not encrypted
+ *
+ * Updates every DLC of the session matching @conn and wakes krfcommd,
+ * which acts on the RFCOMM_AUTH_ACCEPT/RFCOMM_AUTH_REJECT flags in
+ * rfcomm_process_dlcs().
+ */
+static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
+{
+	struct rfcomm_session *s;
+	struct rfcomm_dlc *d, *next;
+	const int enc_off = (encrypt == 0x00);
+
+	BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt);
+
+	s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst);
+	if (!s)
+		return;
+
+	rfcomm_session_hold(s);
+
+	list_for_each_entry_safe(d, next, &s->dlcs, list) {
+		if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
+			rfcomm_dlc_clear_timer(d);
+			if (status || enc_off) {
+				__rfcomm_dlc_close(d, ECONNREFUSED);
+				continue;
+			}
+		}
+
+		/* Encryption was switched off on an established DLC. */
+		if (d->state == BT_CONNECTED && !status && enc_off) {
+			if (d->sec_level == BT_SECURITY_HIGH) {
+				__rfcomm_dlc_close(d, ECONNREFUSED);
+				continue;
+			}
+			if (d->sec_level == BT_SECURITY_MEDIUM) {
+				set_bit(RFCOMM_SEC_PENDING, &d->flags);
+				rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
+				continue;
+			}
+		}
+
+		if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
+			continue;
+
+		if (status || !hci_conn_check_secure(conn, d->sec_level))
+			set_bit(RFCOMM_AUTH_REJECT, &d->flags);
+		else
+			set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
+	}
+
+	rfcomm_session_put(s);
+
+	rfcomm_schedule();
+}
+
+/* HCI callbacks; registered in rfcomm_init(), removed in rfcomm_exit(). */
+static struct hci_cb rfcomm_cb = {
+	.name = "RFCOMM",
+	.security_cfm = rfcomm_security_cfm,
+};
+
+/* seq_file show callback for the "rfcomm_dlc" debugfs file.
+ *
+ * Prints one line per DLC of every session: source and destination
+ * address of the session's socket, then the DLC's state, dlci, mtu,
+ * rx_credits and tx_credits.  Always returns 0.
+ */
+static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
+{
+	struct rfcomm_session *s;
+	struct rfcomm_dlc *d;
+
+	rfcomm_lock();
+
+	list_for_each_entry(s, &session_list, list) {
+		list_for_each_entry(d, &s->dlcs, list) {
+			struct sock *sk = s->sock->sk;
+
+			seq_printf(f, "%s %s %ld %d %d %d %d\n",
+						batostr(&bt_sk(sk)->src),
+						batostr(&bt_sk(sk)->dst),
+						d->state, d->dlci, d->mtu,
+						d->rx_credits, d->tx_credits);
+		}
+	}
+
+	rfcomm_unlock();
+
+	return 0;
+}
+
+/* open() for the debugfs file: bind rfcomm_dlc_debugfs_show() to a
+ * single-record seq_file, passing the inode's private data along.
+ */
+static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	return single_open(file, rfcomm_dlc_debugfs_show, priv);
+}
+
+static const struct file_operations rfcomm_dlc_debugfs_fops = {
+	.open		= rfcomm_dlc_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+/* debugfs entry created in rfcomm_init() and removed in rfcomm_exit(). */
+static struct dentry *rfcomm_dlc_debugfs;
+
+/* ---- Initialization ---- */
+/* Module initialisation: register the HCI callbacks, start krfcommd,
+ * then bring up the TTY and socket layers and finally the debugfs file.
+ *
+ * Returns 0 on success or a negative errno, in which case everything
+ * set up so far has been torn down again.
+ */
+static int __init rfcomm_init(void)
+{
+	int err;
+
+	hci_register_cb(&rfcomm_cb);
+
+	rfcomm_thread = kthread_run(rfcomm_run, NULL, "krfcommd");
+	if (IS_ERR(rfcomm_thread)) {
+		err = PTR_ERR(rfcomm_thread);
+		goto unregister;
+	}
+
+	err = rfcomm_init_ttys();
+	if (err < 0)
+		goto stop;
+
+	err = rfcomm_init_sockets();
+	if (err < 0)
+		goto cleanup;
+
+	BT_INFO("RFCOMM ver %s", VERSION);
+
+	/* Create the debugfs file last: nothing after this point can
+	 * fail, so no error path has to remove it again.  Failing to
+	 * create it is only logged. */
+	if (bt_debugfs) {
+		rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444,
+				bt_debugfs, NULL, &rfcomm_dlc_debugfs_fops);
+		if (!rfcomm_dlc_debugfs)
+			BT_ERR("Failed to create RFCOMM debug file");
+	}
+
+	return 0;
+
+cleanup:
+	rfcomm_cleanup_ttys();
+
+stop:
+	kthread_stop(rfcomm_thread);
+
+unregister:
+	hci_unregister_cb(&rfcomm_cb);
+
+	return err;
+}
+
+static void __exit rfcomm_exit(void)
+{
+	debugfs_remove(rfcomm_dlc_debugfs);
+	/* Stop receiving HCI security confirmations (rfcomm_security_cfm). */
+	hci_unregister_cb(&rfcomm_cb);
+	/* Stops krfcommd; rfcomm_run() deletes all sessions before returning. */
+	kthread_stop(rfcomm_thread);
+
+	rfcomm_cleanup_ttys();
+
+	rfcomm_cleanup_sockets();
+}
+
+module_init(rfcomm_init);
+module_exit(rfcomm_exit);
+/* Module parameters; the variables are defined outside this section. */
+module_param(disable_cfc, bool, 0644);
+MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control");
+
+module_param(channel_mtu, int, 0644);
+MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel");
+
+module_param(l2cap_mtu, uint, 0644);
+MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection");
+
+module_param(l2cap_ertm, bool, 0644);
+MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection");
+
+MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth RFCOMM ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-3");
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
new file mode 100644
index 00000000..b02f0d47
--- /dev/null
+++ b/net/bluetooth/rfcomm/sock.c
@@ -0,0 +1,1072 @@
+/*
+   RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+   Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * RFCOMM sockets.
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/rfcomm.h>
+
+static const struct proto_ops rfcomm_sock_ops;
+/* All RFCOMM sockets, linked in by rfcomm_sock_alloc(); guarded by .lock. */
+static struct bt_sock_list rfcomm_sk_list = {
+	.lock = __RW_LOCK_UNLOCKED(rfcomm_sk_list.lock)
+};
+/* Forward declarations: both are called before their definitions below. */
+static void rfcomm_sock_close(struct sock *sk);
+static void rfcomm_sock_kill(struct sock *sk);
+
+/* ---- DLC callbacks ----
+ *
+ * called under rfcomm_dlc_lock()
+ */
+/* DLC data_ready callback: queue @skb on the owning socket's receive
+ * queue and notify the socket.  Throttles the DLC once the socket's
+ * receive accounting reaches sk_rcvbuf.
+ *
+ * Does nothing when the DLC has no owner socket.
+ */
+static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
+{
+	struct sock *sk = d->owner;
+	unsigned int len;
+
+	if (!sk)
+		return;
+
+	/* Sample the length before queueing: once the skb is on the
+	 * receive queue a reader may dequeue and free it, so it must
+	 * not be touched afterwards. */
+	len = skb->len;
+
+	atomic_add(len, &sk->sk_rmem_alloc);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, len);
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		rfcomm_dlc_throttle(d);
+}
+
+static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
+{
+	struct sock *sk = d->owner, *parent;
+	unsigned long flags;
+	/* DLC callback: mirror the DLC's state (and error) into its socket. */
+	if (!sk)
+		return;
+
+	BT_DBG("dlc %p state %ld err %d", d, d->state, err);
+
+	local_irq_save(flags);
+	bh_lock_sock(sk);
+
+	if (err)
+		sk->sk_err = err;
+
+	sk->sk_state = d->state;
+	/* With a parent set, wake the parent rather than sk itself. */
+	parent = bt_sk(sk)->parent;
+	if (parent) {
+		if (d->state == BT_CLOSED) {
+			sock_set_flag(sk, SOCK_ZAPPED);
+			bt_accept_unlink(sk);
+		}
+		parent->sk_data_ready(parent, 0);
+	} else {
+		if (d->state == BT_CONNECTED)
+			rfcomm_session_getaddr(d->session, &bt_sk(sk)->src, NULL);
+		sk->sk_state_change(sk);
+	}
+
+	bh_unlock_sock(sk);
+	local_irq_restore(flags);
+	/* A zapped child was unlinked from its parent above; kill it now. */
+	if (parent && sock_flag(sk, SOCK_ZAPPED)) {
+		/* We have to drop DLC lock here, otherwise
+		 * rfcomm_sock_destruct() will dead lock. */
+		rfcomm_dlc_unlock(d);
+		rfcomm_sock_kill(sk);
+		rfcomm_dlc_lock(d);
+	}
+}
+
+/* ---- Socket functions ---- */
+/* Find the socket with exactly this @channel and source address @src.
+ *
+ * The callers in this file hold rfcomm_sk_list.lock around the call.
+ * Returns the socket or NULL when there is none.
+ */
+static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src)
+{
+	struct sock *sk = NULL;
+	struct hlist_node *node;
+
+	sk_for_each(sk, node, &rfcomm_sk_list.head) {
+		if (rfcomm_pi(sk)->channel != channel)
+			continue;
+
+		if (!bacmp(&bt_sk(sk)->src, src))
+			return sk;
+	}
+
+	return NULL;
+}
+
+/* Find socket with channel and source bdaddr.
+ *
+ * Only sockets in @state are considered when @state is non-zero.  A
+ * socket whose source address equals @src wins; failing that, the last
+ * matching socket bound to BDADDR_ANY is returned (closest match).
+ * Returns NULL when nothing matches.
+ */
+static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src)
+{
+	struct sock *sk = NULL, *exact = NULL, *wildcard = NULL;
+	struct hlist_node *node;
+
+	read_lock(&rfcomm_sk_list.lock);
+
+	sk_for_each(sk, node, &rfcomm_sk_list.head) {
+		if (state && sk->sk_state != state)
+			continue;
+
+		if (rfcomm_pi(sk)->channel != channel)
+			continue;
+
+		/* Exact match. */
+		if (!bacmp(&bt_sk(sk)->src, src)) {
+			exact = sk;
+			break;
+		}
+
+		/* Closest match */
+		if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
+			wildcard = sk;
+	}
+
+	read_unlock(&rfcomm_sk_list.lock);
+
+	return exact ? exact : wildcard;
+}
+
+/* sk_destruct callback: purge the socket's queues, detach the socket
+ * from its DLC and drop the socket's reference on that DLC.
+ */
+static void rfcomm_sock_destruct(struct sock *sk)
+{
+	struct rfcomm_pinfo *pi = rfcomm_pi(sk);
+	struct rfcomm_dlc *d = pi->dlc;
+
+	BT_DBG("sk %p dlc %p", sk, d);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+
+	rfcomm_dlc_lock(d);
+
+	pi->dlc = NULL;
+
+	/* Detach DLC if it's owned by this socket */
+	if (d->owner == sk)
+		d->owner = NULL;
+
+	rfcomm_dlc_unlock(d);
+
+	rfcomm_dlc_put(d);
+}
+
+/* Tear down a listening socket: close and kill every child still
+ * waiting on its accept queue, then mark the listener closed and zapped.
+ */
+static void rfcomm_sock_cleanup_listen(struct sock *parent)
+{
+	struct sock *child;
+
+	BT_DBG("parent %p", parent);
+
+	/* Close not yet accepted dlcs */
+	for (;;) {
+		child = bt_accept_dequeue(parent, NULL);
+		if (!child)
+			break;
+
+		rfcomm_sock_close(child);
+		rfcomm_sock_kill(child);
+	}
+
+	parent->sk_state  = BT_CLOSED;
+	sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan): unlink it from the global
+ * socket list, mark it dead and drop a reference.
+ * Must be called on unlocked socket.
+ */
+static void rfcomm_sock_kill(struct sock *sk)
+{
+	/* Not zapped yet: still in use. */
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		return;
+
+	/* Still attached to a struct socket: not an orphan. */
+	if (sk->sk_socket)
+		return;
+
+	BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, atomic_read(&sk->sk_refcnt));
+
+	/* Kill poor orphan */
+	bt_sock_unlink(&rfcomm_sk_list, sk);
+	sock_set_flag(sk, SOCK_DEAD);
+	sock_put(sk);
+}
+
+/* Close @sk according to its state.  rfcomm_sock_close() and
+ * rfcomm_sock_shutdown() call this with the socket locked.
+ *
+ * A listening socket has its accept queue cleaned up; a socket with a
+ * DLC being set up or connected has that DLC closed.  In every case
+ * the socket ends up flagged SOCK_ZAPPED.
+ */
+static void __rfcomm_sock_close(struct sock *sk)
+{
+	struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+
+	BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+	switch (sk->sk_state) {
+	case BT_LISTEN:
+		/* Sets SOCK_ZAPPED on the listener itself. */
+		rfcomm_sock_cleanup_listen(sk);
+		return;
+
+	case BT_CONNECT:
+	case BT_CONNECT2:
+	case BT_CONFIG:
+	case BT_CONNECTED:
+		rfcomm_dlc_close(d, 0);
+		break;
+
+	default:
+		break;
+	}
+
+	sock_set_flag(sk, SOCK_ZAPPED);
+}
+
+/* Close socket: runs __rfcomm_sock_close() under the socket lock.
+ * Must be called on unlocked socket.
+ */
+static void rfcomm_sock_close(struct sock *sk)
+{
+	lock_sock(sk);
+	__rfcomm_sock_close(sk);
+	release_sock(sk);
+}
+
+/* Initialise the per-socket RFCOMM settings of @sk.
+ *
+ * With a @parent (a listening socket) the socket type, defer_setup,
+ * security level and role switch setting are inherited from it;
+ * otherwise the defaults are used (no deferred setup, BT_SECURITY_LOW,
+ * no role switch).  The result is mirrored into the socket's DLC.
+ */
+static void rfcomm_sock_init(struct sock *sk, struct sock *parent)
+{
+	struct rfcomm_pinfo *pi = rfcomm_pi(sk);
+	struct rfcomm_dlc *d = pi->dlc;
+
+	BT_DBG("sk %p", sk);
+
+	if (!parent) {
+		d->defer_setup = 0;
+
+		pi->sec_level = BT_SECURITY_LOW;
+		pi->role_switch = 0;
+	} else {
+		sk->sk_type = parent->sk_type;
+		d->defer_setup = bt_sk(parent)->defer_setup;
+
+		pi->sec_level = rfcomm_pi(parent)->sec_level;
+		pi->role_switch = rfcomm_pi(parent)->role_switch;
+	}
+
+	d->sec_level = pi->sec_level;
+	d->role_switch = pi->role_switch;
+}
+
+/* Protocol descriptor passed to sk_alloc() by rfcomm_sock_alloc(). */
+static struct proto rfcomm_proto = {
+	.name = "RFCOMM",
+	.owner = THIS_MODULE,
+	.obj_size = sizeof(struct rfcomm_pinfo),
+};
+
+/* Allocate an RFCOMM socket together with its DLC.
+ *
+ * @sock may be NULL (rfcomm_connect_ind() passes NULL for sockets that
+ * have not been accepted yet).  The new socket starts in BT_OPEN state
+ * and is linked into rfcomm_sk_list.
+ *
+ * Returns the socket, or NULL if either allocation fails.
+ */
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+{
+	struct rfcomm_dlc *dlc;
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk);
+	INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+	dlc = rfcomm_dlc_alloc(prio);
+	if (!dlc) {
+		sk_free(sk);
+		return NULL;
+	}
+
+	/* Tie socket and DLC together. */
+	dlc->data_ready   = rfcomm_sk_data_ready;
+	dlc->state_change = rfcomm_sk_state_change;
+	dlc->owner        = sk;
+	rfcomm_pi(sk)->dlc = dlc;
+
+	/* Override the defaults installed by sock_init_data(). */
+	sk->sk_destruct = rfcomm_sock_destruct;
+	sk->sk_sndtimeo = RFCOMM_CONN_TIMEOUT;
+	sk->sk_sndbuf   = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
+	sk->sk_rcvbuf   = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	sk->sk_protocol = proto;
+	sk->sk_state    = BT_OPEN;
+
+	bt_sock_link(&rfcomm_sk_list, sk);
+
+	BT_DBG("sk %p", sk);
+	return sk;
+}
+
+/* Create handler for RFCOMM sockets.
+ *
+ * Only SOCK_STREAM and SOCK_RAW are supported.  Returns 0 on success,
+ * -ESOCKTNOSUPPORT for any other type or -ENOMEM when the socket cannot
+ * be allocated.
+ */
+static int rfcomm_sock_create(struct net *net, struct socket *sock,
+			      int protocol, int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+
+	sock->state = SS_UNCONNECTED;
+
+	switch (sock->type) {
+	case SOCK_STREAM:
+	case SOCK_RAW:
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	sock->ops = &rfcomm_sock_ops;
+
+	sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	if (!sk)
+		return -ENOMEM;
+
+	rfcomm_sock_init(sk, NULL);
+
+	return 0;
+}
+
+/* bind() handler: record the source address and channel of an
+ * unbound SOCK_STREAM socket and move it to BT_BOUND.
+ *
+ * A channel of 0 is accepted; rfcomm_sock_listen() then picks a free
+ * one.  An address shorter than struct sockaddr_rc is zero-extended.
+ *
+ * Returns 0 on success, -EINVAL for a bad address or socket type,
+ * -EBADFD if the socket is not in BT_OPEN state, or -EADDRINUSE if
+ * another socket already uses this channel/address pair.
+ */
+static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct sockaddr_rc sa;
+	struct sock *sk = sock->sk;
+	int len, err = 0;
+
+	/* Validate before touching *addr, including for the debug
+	 * message further down. */
+	if (!addr || addr_len < (int) sizeof(addr->sa_family) ||
+	    addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	/* Work on a zero-padded local copy so that no more than
+	 * addr_len bytes of the caller's address are ever read. */
+	memset(&sa, 0, sizeof(sa));
+	len = min_t(unsigned int, sizeof(sa), addr_len);
+	memcpy(&sa, addr, len);
+
+	BT_DBG("sk %p %s", sk, batostr(&sa.rc_bdaddr));
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_OPEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_STREAM) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	write_lock_bh(&rfcomm_sk_list.lock);
+
+	if (sa.rc_channel && __rfcomm_get_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) {
+		err = -EADDRINUSE;
+	} else {
+		/* Save source address */
+		bacpy(&bt_sk(sk)->src, &sa.rc_bdaddr);
+		rfcomm_pi(sk)->channel = sa.rc_channel;
+		sk->sk_state = BT_BOUND;
+	}
+
+	write_unlock_bh(&rfcomm_sk_list.lock);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* connect() handler: open the socket's DLC towards the given address
+ * and channel, then wait for BT_CONNECTED (bounded by the send
+ * timeout, which O_NONBLOCK in @flags affects).
+ *
+ * Returns 0 on success, -EINVAL for a bad address or socket type,
+ * -EBADFD if the socket is neither BT_OPEN nor BT_BOUND, or the error
+ * from rfcomm_dlc_open() / bt_sock_wait_state().
+ */
+static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+{
+	struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+	struct sock *sk = sock->sk;
+	struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+	int err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (alen < sizeof(struct sockaddr_rc))
+		return -EINVAL;
+
+	if (addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_STREAM) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	sk->sk_state = BT_CONNECT;
+	bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr);
+	rfcomm_pi(sk)->channel = sa->rc_channel;
+
+	/* Pick up security settings changed since the socket was set up. */
+	d->sec_level = rfcomm_pi(sk)->sec_level;
+	d->role_switch = rfcomm_pi(sk)->role_switch;
+
+	err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel);
+	if (err)
+		goto done;
+
+	err = bt_sock_wait_state(sk, BT_CONNECTED,
+			sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* listen() handler: move a bound SOCK_STREAM socket to BT_LISTEN.
+ *
+ * If the socket was bound without a channel, the lowest channel in
+ * 1..30 not already used with the same source address is assigned.
+ *
+ * Returns 0 on success, -EBADFD if the socket is not in BT_BOUND state,
+ * or -EINVAL for a wrong socket type or when no channel is free.
+ */
+static int rfcomm_sock_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sk %p backlog %d", sk, backlog);
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_BOUND) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_STREAM) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	if (!rfcomm_pi(sk)->channel) {
+		bdaddr_t *src = &bt_sk(sk)->src;
+		u8 ch = 1;
+
+		write_lock_bh(&rfcomm_sk_list.lock);
+
+		/* Skip over channels that are already taken. */
+		while (ch < 31 && __rfcomm_get_sock_by_addr(ch, src))
+			ch++;
+
+		if (ch < 31)
+			rfcomm_pi(sk)->channel = ch;
+
+		write_unlock_bh(&rfcomm_sk_list.lock);
+
+		if (ch >= 31) {
+			err = -EINVAL;
+			goto done;
+		}
+	}
+
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+	sk->sk_state = BT_LISTEN;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct sock *sk = sock->sk, *nsk;
+	long timeo;
+	int err = 0;
+	/* accept(): wait for a queued child; returns 0 or a negative errno. */
+	lock_sock(sk);
+
+	if (sk->sk_type != SOCK_STREAM) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+	BT_DBG("sk %p timeo %ld", sk, timeo);
+
+	/* Wait for an incoming connection. (wake-one). */
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		/* The socket may have left BT_LISTEN while we were asleep. */
+		if (sk->sk_state != BT_LISTEN) {
+			err = -EBADFD;
+			break;
+		}
+
+		nsk = bt_accept_dequeue(sk, newsock);
+		if (nsk)
+			break;
+		/* Nothing queued and no time (left) to wait. */
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+		/* Sleep with the socket lock released; re-take it afterwards. */
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (err)
+		goto done;
+
+	newsock->state = SS_CONNECTED;
+
+	BT_DBG("new socket %p", nsk);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* getsockname()/getpeername() handler.
+ *
+ * Fills @addr with the socket's channel and either the destination
+ * (@peer != 0) or the source address, and sets *@len to the size of
+ * struct sockaddr_rc.  Always returns 0.
+ */
+static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+{
+	struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
+	struct sock *sk = sock->sk;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	/* Zero the whole structure first: any padding would otherwise
+	 * carry uninitialised kernel memory out to user space. */
+	memset(sa, 0, sizeof(*sa));
+
+	sa->rc_family  = AF_BLUETOOTH;
+	sa->rc_channel = rfcomm_pi(sk)->channel;
+	if (peer)
+		bacpy(&sa->rc_bdaddr, &bt_sk(sk)->dst);
+	else
+		bacpy(&sa->rc_bdaddr, &bt_sk(sk)->src);
+
+	*len = sizeof(struct sockaddr_rc);
+	return 0;
+}
+
+/* sendmsg() handler: split the message into chunks of at most d->mtu
+ * bytes and queue each one on the DLC.
+ *
+ * Returns the number of bytes queued.  If nothing could be queued, the
+ * error that stopped the transfer is returned instead.  Fails up front
+ * with -ENOTCONN while setup is deferred, -EOPNOTSUPP for MSG_OOB and
+ * -EPIPE after a send shutdown.
+ */
+static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+	struct sk_buff *skb;
+	int sent = 0;
+	int err = 0;
+
+	if (test_bit(RFCOMM_DEFER_SETUP, &d->flags))
+		return -ENOTCONN;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN)
+		return -EPIPE;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	lock_sock(sk);
+
+	while (len) {
+		size_t chunk = min_t(size_t, len, d->mtu);
+
+		skb = sock_alloc_send_skb(sk, chunk + RFCOMM_SKB_RESERVE,
+				msg->msg_flags & MSG_DONTWAIT, &err);
+		if (!skb)
+			break;
+
+		skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
+
+		err = memcpy_fromiovec(skb_put(skb, chunk), msg->msg_iov, chunk);
+		if (err) {
+			kfree_skb(skb);
+			break;
+		}
+
+		err = rfcomm_dlc_send(d, skb);
+		if (err < 0) {
+			kfree_skb(skb);
+			break;
+		}
+
+		/* Chunk queued; only failures may leave err non-zero. */
+		err = 0;
+
+		sent += chunk;
+		len  -= chunk;
+	}
+
+	/* Report the error only if no data was queued at all. */
+	if (err && sent == 0)
+		sent = err;
+
+	release_sock(sk);
+
+	return sent;
+}
+
+/* recvmsg() handler.
+ *
+ * The first call on a socket whose setup was deferred accepts the DLC
+ * and returns 0 without reading data.  Otherwise data is read through
+ * bt_sock_stream_recvmsg(), the receive accounting is reduced by the
+ * bytes consumed (not for MSG_PEEK), and the DLC is unthrottled once
+ * usage is at or below a quarter of sk_rcvbuf.
+ *
+ * Returns the result of bt_sock_stream_recvmsg().
+ */
+static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
+	int copied;
+
+	if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
+		rfcomm_dlc_accept(d);
+		return 0;
+	}
+
+	copied = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags);
+
+	lock_sock(sk);
+
+	if (copied > 0 && !(flags & MSG_PEEK))
+		atomic_sub(copied, &sk->sk_rmem_alloc);
+
+	if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
+		rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
+
+	release_sock(sk);
+
+	return copied;
+}
+
+/* Legacy SOL_RFCOMM setsockopt() handler.
+ *
+ * Only RFCOMM_LM is supported: the link mode bits select the security
+ * level (the strongest requested bit wins) and the role switch setting.
+ *
+ * Returns 0, -EFAULT if the option value cannot be read, or
+ * -ENOPROTOOPT for any other option.
+ */
+static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	u32 lm;
+
+	BT_DBG("sk %p", sk);
+
+	lock_sock(sk);
+
+	if (optname != RFCOMM_LM) {
+		err = -ENOPROTOOPT;
+		goto unlock;
+	}
+
+	if (get_user(lm, (u32 __user *) optval)) {
+		err = -EFAULT;
+		goto unlock;
+	}
+
+	/* Later checks override earlier ones. */
+	if (lm & RFCOMM_LM_AUTH)
+		rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW;
+	if (lm & RFCOMM_LM_ENCRYPT)
+		rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
+	if (lm & RFCOMM_LM_SECURE)
+		rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH;
+
+	rfcomm_pi(sk)->role_switch = (lm & RFCOMM_LM_MASTER);
+
+unlock:
+	release_sock(sk);
+	return err;
+}
+
+static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct bt_security sec;
+	int err = 0;
+	size_t len;
+	u32 opt;
+
+	BT_DBG("sk %p", sk);
+	/* Legacy SOL_RFCOMM options have their own handler. */
+	if (level == SOL_RFCOMM)
+		return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen);
+
+	if (level != SOL_BLUETOOTH)
+		return -ENOPROTOOPT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_SECURITY:
+		if (sk->sk_type != SOCK_STREAM) {
+			err = -EINVAL;
+			break;
+		}
+		/* Default that stays in effect if optlen is 0. */
+		sec.level = BT_SECURITY_LOW;
+
+		len = min_t(unsigned int, sizeof(sec), optlen);
+		if (copy_from_user((char *) &sec, optval, len)) {
+			err = -EFAULT;
+			break;
+		}
+
+		if (sec.level > BT_SECURITY_HIGH) {
+			err = -EINVAL;
+			break;
+		}
+
+		rfcomm_pi(sk)->sec_level = sec.level;
+		break;
+
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			break;
+		}
+		/* NOTE(review): optlen is not checked before this u32 read. */
+		if (get_user(opt, (u32 __user *) optval)) {
+			err = -EFAULT;
+			break;
+		}
+
+		bt_sk(sk)->defer_setup = opt;
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* Legacy SOL_RFCOMM getsockopt() handler.
+ *
+ * RFCOMM_LM reports the link mode bits derived from the socket's
+ * security level and role switch setting.  RFCOMM_CONNINFO reports the
+ * HCI handle and device class of the underlying connection; it needs a
+ * connected socket (or one with deferred setup).
+ *
+ * Returns 0, -EFAULT on a user copy failure, -ENOTCONN, or
+ * -ENOPROTOOPT for an unknown option.
+ */
+static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct sock *l2cap_sk;
+	struct rfcomm_conninfo cinfo;
+	struct l2cap_conn *conn;
+	int len, err = 0;
+	u32 opt;
+
+	BT_DBG("sk %p", sk);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case RFCOMM_LM:
+		switch (rfcomm_pi(sk)->sec_level) {
+		case BT_SECURITY_LOW:
+			opt = RFCOMM_LM_AUTH;
+			break;
+		case BT_SECURITY_MEDIUM:
+			opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;
+			break;
+		case BT_SECURITY_HIGH:
+			opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
+							RFCOMM_LM_SECURE;
+			break;
+		default:
+			opt = 0;
+			break;
+		}
+
+		if (rfcomm_pi(sk)->role_switch)
+			opt |= RFCOMM_LM_MASTER;
+
+		if (put_user(opt, (u32 __user *) optval))
+			err = -EFAULT;
+		break;
+
+	case RFCOMM_CONNINFO:
+		if (sk->sk_state != BT_CONNECTED &&
+					!rfcomm_pi(sk)->dlc->defer_setup) {
+			err = -ENOTCONN;
+			break;
+		}
+
+		/* sk is an RFCOMM socket, so l2cap_pi() must not be applied
+		 * to it.  The L2CAP connection belongs to the socket of the
+		 * session that carries this socket's DLC. */
+		l2cap_sk = rfcomm_pi(sk)->dlc->session->sock->sk;
+		conn = l2cap_pi(l2cap_sk)->chan->conn;
+
+		memset(&cinfo, 0, sizeof(cinfo));
+		cinfo.hci_handle = conn->hcon->handle;
+		memcpy(cinfo.dev_class, conn->hcon->dev_class, 3);
+
+		len = min_t(unsigned int, len, sizeof(cinfo));
+		if (copy_to_user(optval, (char *) &cinfo, len))
+			err = -EFAULT;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct bt_security sec;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+	/* Legacy SOL_RFCOMM options have their own handler. */
+	if (level == SOL_RFCOMM)
+		return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen);
+
+	if (level != SOL_BLUETOOTH)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_SECURITY:
+		if (sk->sk_type != SOCK_STREAM) {
+			err = -EINVAL;
+			break;
+		}
+
+		sec.level = rfcomm_pi(sk)->sec_level;
+		/* Copy out no more than the caller's buffer or sec can hold. */
+		len = min_t(unsigned int, len, sizeof(sec));
+		if (copy_to_user(optval, (char *) &sec, len))
+			err = -EFAULT;
+		/* NOTE(review): the copied length is not written to *optlen. */
+		break;
+
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
+			err = -EFAULT;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* ioctl() handler.
+ *
+ * Tries the generic Bluetooth socket ioctls first.  Commands those do
+ * not know (-ENOIOCTLCMD) are passed to the RFCOMM TTY layer under the
+ * socket lock when CONFIG_BT_RFCOMM_TTY is enabled, and fail with
+ * -EOPNOTSUPP otherwise.
+ */
+static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk __maybe_unused = sock->sk;
+	int err;
+
+	BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg);
+
+	err = bt_sock_ioctl(sock, cmd, arg);
+	if (err != -ENOIOCTLCMD)
+		return err;
+
+#ifdef CONFIG_BT_RFCOMM_TTY
+	lock_sock(sk);
+	err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg);
+	release_sock(sk);
+
+	return err;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+/*
+ * shutdown() handler.  The @how argument is not consulted: the first call
+ * always shuts down both directions and closes the socket.  When SO_LINGER
+ * is set with a non-zero timeout, waits for the socket to reach BT_CLOSED.
+ *
+ * Returns 0, or the result of bt_sock_wait_state() when lingering.
+ */
+static int rfcomm_sock_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	lock_sock(sk);
+
+	/* Already shut down: nothing more to do */
+	if (sk->sk_shutdown)
+		goto unlock;
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	__rfcomm_sock_close(sk);
+
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+
+unlock:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * release() handler: shut the socket down, detach it from its struct
+ * socket and hand it to rfcomm_sock_kill().
+ *
+ * Returns the result of the shutdown step (0 if there is no sock).
+ */
+static int rfcomm_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	int ret;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	ret = rfcomm_sock_shutdown(sock, 2);
+
+	sock_orphan(sk);
+	rfcomm_sock_kill(sk);
+
+	return ret;
+}
+
+/* ---- RFCOMM core layer callbacks ----
+ *
+ * called under rfcomm_lock()
+ */
+
+/*
+ * Incoming connection indication for @channel on session @s.
+ *
+ * Looks for a listening socket bound to the channel; if one exists and its
+ * accept queue has room, a child socket is allocated, put into BT_CONFIG
+ * state and queued on the parent.  On acceptance *@d is set to the child's
+ * DLC.
+ *
+ * Returns 1 when the connection was accepted, 0 otherwise (no listener,
+ * backlog full, or allocation failure).
+ */
+int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d)
+{
+	struct sock *sk, *parent;
+	bdaddr_t src, dst;
+	int result = 0;
+
+	BT_DBG("session %p channel %d", s, channel);
+
+	rfcomm_session_getaddr(s, &src, &dst);
+
+	/* Check if we have socket listening on channel */
+	parent = rfcomm_get_sock_by_channel(BT_LISTEN, channel, &src);
+	if (!parent)
+		return 0;
+
+	bh_lock_sock(parent);
+
+	/* Check for backlog size */
+	if (sk_acceptq_is_full(parent)) {
+		BT_DBG("backlog full %d", parent->sk_ack_backlog);
+		goto done;
+	}
+
+	/* GFP_ATOMIC: the parent is held with bh_lock_sock() here */
+	sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC);
+	if (!sk)
+		goto done;
+
+	rfcomm_sock_init(sk, parent);
+	bacpy(&bt_sk(sk)->src, &src);
+	bacpy(&bt_sk(sk)->dst, &dst);
+	rfcomm_pi(sk)->channel = channel;
+
+	sk->sk_state = BT_CONFIG;
+	bt_accept_enqueue(parent, sk);
+
+	/* Accept connection and return socket DLC */
+	*d = rfcomm_pi(sk)->dlc;
+	result = 1;
+
+done:
+	bh_unlock_sock(parent);
+
+	/* With deferred setup the listener is notified on every exit path
+	 * that found a parent, including the failure ones above. */
+	if (bt_sk(parent)->defer_setup)
+		parent->sk_state_change(parent);
+
+	return result;
+}
+
+/*
+ * seq_file show callback for the "rfcomm" debugfs file: prints one line
+ * per socket on rfcomm_sk_list as "<src> <dst> <state> <channel>".
+ */
+static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
+{
+	struct hlist_node *pos;
+	struct sock *s;
+
+	read_lock_bh(&rfcomm_sk_list.lock);
+
+	sk_for_each(s, pos, &rfcomm_sk_list.head)
+		seq_printf(f, "%s %s %d %d\n",
+				batostr(&bt_sk(s)->src),
+				batostr(&bt_sk(s)->dst),
+				s->sk_state, rfcomm_pi(s)->channel);
+
+	read_unlock_bh(&rfcomm_sk_list.lock);
+
+	return 0;
+}
+
+/* open() for the debugfs file: binds rfcomm_sock_debugfs_show() as the
+ * single-shot seq_file producer, passing the inode's private data along. */
+static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rfcomm_sock_debugfs_show, inode->i_private);
+}
+
+/* File operations of the RFCOMM socket debugfs entry; everything except
+ * open() is delegated to the generic seq_file helpers. */
+static const struct file_operations rfcomm_sock_debugfs_fops = {
+	.open		= rfcomm_sock_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *rfcomm_sock_debugfs;
+
+/* Socket-layer operations for PF_BLUETOOTH / RFCOMM sockets.  poll() uses
+ * the shared Bluetooth implementation; socketpair() and mmap() are
+ * explicitly unsupported via the sock_no_* stubs. */
+static const struct proto_ops rfcomm_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= rfcomm_sock_release,
+	.bind		= rfcomm_sock_bind,
+	.connect	= rfcomm_sock_connect,
+	.listen		= rfcomm_sock_listen,
+	.accept		= rfcomm_sock_accept,
+	.getname	= rfcomm_sock_getname,
+	.sendmsg	= rfcomm_sock_sendmsg,
+	.recvmsg	= rfcomm_sock_recvmsg,
+	.shutdown	= rfcomm_sock_shutdown,
+	.setsockopt	= rfcomm_sock_setsockopt,
+	.getsockopt	= rfcomm_sock_getsockopt,
+	.ioctl		= rfcomm_sock_ioctl,
+	.poll		= bt_sock_poll,
+	.socketpair	= sock_no_socketpair,
+	.mmap		= sock_no_mmap
+};
+
+/* Protocol-family hook registered with bt_sock_register() for
+ * BTPROTO_RFCOMM; rfcomm_sock_create() builds new sockets. */
+static const struct net_proto_family rfcomm_sock_family_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.create		= rfcomm_sock_create
+};
+
+/*
+ * Register the RFCOMM socket layer: the proto, the Bluetooth socket
+ * family entry and (when bt_debugfs exists) the "rfcomm" debugfs file.
+ * A debugfs failure is logged but is not fatal.
+ *
+ * Returns 0 on success or the negative error from the failing
+ * registration call.
+ */
+int __init rfcomm_init_sockets(void)
+{
+	int err = proto_register(&rfcomm_proto, 0);
+
+	if (err < 0)
+		return err;
+
+	err = bt_sock_register(BTPROTO_RFCOMM, &rfcomm_sock_family_ops);
+	if (err < 0) {
+		/* Undo the proto registration before bailing out */
+		BT_ERR("RFCOMM socket layer registration failed");
+		proto_unregister(&rfcomm_proto);
+		return err;
+	}
+
+	if (bt_debugfs) {
+		rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444,
+				bt_debugfs, NULL, &rfcomm_sock_debugfs_fops);
+		if (!rfcomm_sock_debugfs)
+			BT_ERR("Failed to create RFCOMM debug file");
+	}
+
+	BT_INFO("RFCOMM socket layer initialized");
+
+	return 0;
+}
+
+/* Tear down the RFCOMM socket layer: remove the debugfs file, unregister
+ * the socket family (failure is only logged) and unregister the proto. */
+void __exit rfcomm_cleanup_sockets(void)
+{
+	debugfs_remove(rfcomm_sock_debugfs);
+
+	if (bt_sock_unregister(BTPROTO_RFCOMM) < 0)
+		BT_ERR("RFCOMM socket layer unregistration failed");
+
+	proto_unregister(&rfcomm_proto);
+}
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
new file mode 100644
index 00000000..c2587963
--- /dev/null
+++ b/net/bluetooth/rfcomm/tty.c
@@ -0,0 +1,1190 @@
+/*
+   RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
+   Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * RFCOMM TTY.
+ */
+
+#include <linux/module.h>
+
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
+
+#include <linux/capability.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/rfcomm.h>
+
+#define RFCOMM_TTY_MAGIC 0x6d02		/* magic number for rfcomm struct */
+#define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV	/* whole lotta rfcomm devices */
+#define RFCOMM_TTY_MAJOR 216		/* device node major id of the usb/bluetooth.c driver */
+#define RFCOMM_TTY_MINOR 0
+
+static struct tty_driver *rfcomm_tty_driver;
+
+/* Per-device state of one RFCOMM TTY (/dev/rfcommN). */
+struct rfcomm_dev {
+	/* Link in rfcomm_dev_list, protected by rfcomm_dev_lock */
+	struct list_head	list;
+	/* Reference count; rfcomm_dev_destruct() runs when it hits zero */
+	atomic_t		refcnt;
+
+	/* "rfcomm<id>" */
+	char			name[12];
+	/* Device index, also used as the TTY minor offset */
+	int			id;
+	/* RFCOMM_* flag bits (RELEASE_ONHUP, REUSE_DLC, TTY_ATTACHED, ...) */
+	unsigned long		flags;
+	/* Number of outstanding opens of the TTY */
+	atomic_t		opened;
+	/* Last error reported through the DLC state-change callback */
+	int			err;
+
+	bdaddr_t		src;
+	bdaddr_t		dst;
+	u8			channel;
+
+	/* TIOCM_* bits derived from the remote V.24 signals */
+	uint			modem_status;
+
+	struct rfcomm_dlc	*dlc;
+	/* Attached TTY, NULL while the device is not open */
+	struct tty_struct	*tty;
+	/* Woken on DLC state changes; open() sleeps on it */
+	wait_queue_head_t       wait;
+	/* Defers tty_wakeup() after a transmit skb is freed */
+	struct tasklet_struct   wakeup_task;
+
+	/* Device returned by tty_register_device() */
+	struct device		*tty_dev;
+
+	/* Bytes (skb truesize) of transmit buffers currently allocated */
+	atomic_t		wmem_alloc;
+
+	/* Received skbs queued while no TTY is attached */
+	struct sk_buff_head	pending;
+};
+
+static LIST_HEAD(rfcomm_dev_list);
+static DEFINE_RWLOCK(rfcomm_dev_lock);
+
+static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb);
+static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err);
+static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig);
+
+static void rfcomm_tty_wakeup(unsigned long arg);
+
+/* ---- Device functions ---- */
+/*
+ * Final teardown of @dev, called from rfcomm_dev_put() once the reference
+ * count reaches zero: detach and release the DLC, unregister the TTY
+ * device, free the structure and drop the module reference.
+ */
+static void rfcomm_dev_destruct(struct rfcomm_dev *dev)
+{
+	struct rfcomm_dlc *dlc = dev->dlc;
+
+	BT_DBG("dev %p dlc %p", dev, dlc);
+
+	/* Refcount should only hit zero when called from rfcomm_dev_del()
+	   which will have taken us off the list. Everything else are
+	   refcounting bugs. */
+	BUG_ON(!list_empty(&dev->list));
+
+	rfcomm_dlc_lock(dlc);
+	/* Detach DLC if it's owned by this dev */
+	if (dlc->owner == dev)
+		dlc->owner = NULL;
+	rfcomm_dlc_unlock(dlc);
+
+	rfcomm_dlc_put(dlc);
+
+	tty_unregister_device(rfcomm_tty_driver, dev->id);
+
+	kfree(dev);
+
+	/* It's safe to call module_put() here because socket still
+	   holds reference to this module. */
+	module_put(THIS_MODULE);
+}
+
+/* Take an additional reference on @dev; pair with rfcomm_dev_put(). */
+static inline void rfcomm_dev_hold(struct rfcomm_dev *dev)
+{
+	atomic_inc(&dev->refcnt);
+}
+
+/*
+ * Drop one reference on @dev and destroy it when that was the last one.
+ *
+ * This looks racy against lookups but is not: the count is only expected
+ * to reach zero after rfcomm_dev_del() has unlinked the device from the
+ * list.  Should that assumption ever be violated, the BUG_ON() in
+ * rfcomm_dev_destruct() fires.
+ */
+static inline void rfcomm_dev_put(struct rfcomm_dev *dev)
+{
+	if (!atomic_dec_and_test(&dev->refcnt))
+		return;
+
+	rfcomm_dev_destruct(dev);
+}
+
+/*
+ * Find the device with the given @id on rfcomm_dev_list.  No locking and
+ * no reference counting is done here; see rfcomm_dev_get() for the locked,
+ * reference-taking variant.  Returns NULL when no device matches.
+ */
+static struct rfcomm_dev *__rfcomm_dev_get(int id)
+{
+	struct rfcomm_dev *entry;
+
+	list_for_each_entry(entry, &rfcomm_dev_list, list) {
+		if (entry->id == id)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up device @id under rfcomm_dev_lock and take a reference on it.
+ * Devices already marked RFCOMM_TTY_RELEASED are treated as absent.
+ *
+ * Returns the held device, or NULL.  The caller must balance a non-NULL
+ * result with rfcomm_dev_put().
+ */
+static inline struct rfcomm_dev *rfcomm_dev_get(int id)
+{
+	struct rfcomm_dev *dev;
+
+	read_lock(&rfcomm_dev_lock);
+
+	dev = __rfcomm_dev_get(id);
+	if (dev && test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
+		dev = NULL;
+
+	if (dev)
+		rfcomm_dev_hold(dev);
+
+	read_unlock(&rfcomm_dev_lock);
+
+	return dev;
+}
+
+/*
+ * Return the struct device of the ACL connection to @dev's peer, or NULL
+ * when there is no route to the peer or no such connection exists.
+ */
+static struct device *rfcomm_get_device(struct rfcomm_dev *dev)
+{
+	struct hci_conn *conn;
+	struct hci_dev *hdev = hci_get_route(&dev->dst, &dev->src);
+
+	if (!hdev)
+		return NULL;
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst);
+
+	/* Release the reference obtained from hci_get_route() */
+	hci_dev_put(hdev);
+
+	if (!conn)
+		return NULL;
+
+	return &conn->dev;
+}
+
+/* sysfs "address" attribute: prints the remote Bluetooth address of the
+ * rfcomm_dev stored as driver data of @tty_dev, followed by a newline. */
+static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf)
+{
+	struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
+	return sprintf(buf, "%s\n", batostr(&dev->dst));
+}
+
+/* sysfs "channel" attribute: prints the RFCOMM channel number of the
+ * rfcomm_dev stored as driver data of @tty_dev, followed by a newline. */
+static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf)
+{
+	struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
+	return sprintf(buf, "%d\n", dev->channel);
+}
+
+static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
+static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL);
+
+/*
+ * Create a new RFCOMM TTY device, insert it into rfcomm_dev_list (which is
+ * kept sorted by id), bind it to @dlc and register the TTY device node.
+ *
+ * @req: creation request.  dev_id < 0 selects the lowest unused id,
+ *       otherwise exactly that id is claimed.
+ * @dlc: DLC to attach.  Its callbacks and owner are redirected to the new
+ *       device.  On success the caller's DLC reference is taken over by
+ *       the device (dropped in rfcomm_dev_destruct()); on failure the
+ *       caller keeps it.
+ *
+ * Returns the device id (>= 0) on success or a negative errno:
+ *   -ENOMEM      device allocation failed
+ *   -EADDRINUSE  the requested id is already in use
+ *   -ENFILE      the id is outside [0, RFCOMM_MAX_DEV - 1]
+ *   otherwise the error reported by tty_register_device().
+ */
+static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc)
+{
+	struct rfcomm_dev *dev;
+	struct list_head *head = &rfcomm_dev_list, *p;
+	int err = 0;
+
+	BT_DBG("id %d channel %d", req->dev_id, req->channel);
+
+	dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	write_lock_bh(&rfcomm_dev_lock);
+
+	if (req->dev_id < 0) {
+		/* Pick the first gap in the sorted id sequence */
+		dev->id = 0;
+
+		list_for_each(p, &rfcomm_dev_list) {
+			if (list_entry(p, struct rfcomm_dev, list)->id != dev->id)
+				break;
+
+			dev->id++;
+			head = p;
+		}
+	} else {
+		/* Claim the requested id, keeping the list sorted */
+		dev->id = req->dev_id;
+
+		list_for_each(p, &rfcomm_dev_list) {
+			struct rfcomm_dev *entry = list_entry(p, struct rfcomm_dev, list);
+
+			if (entry->id == dev->id) {
+				err = -EADDRINUSE;
+				goto out;
+			}
+
+			if (entry->id > dev->id - 1)
+				break;
+
+			head = p;
+		}
+	}
+
+	if ((dev->id < 0) || (dev->id > RFCOMM_MAX_DEV - 1)) {
+		err = -ENFILE;
+		goto out;
+	}
+
+	sprintf(dev->name, "rfcomm%d", dev->id);
+
+	list_add(&dev->list, head);
+	atomic_set(&dev->refcnt, 1);
+
+	bacpy(&dev->src, &req->src);
+	bacpy(&dev->dst, &req->dst);
+	dev->channel = req->channel;
+
+	/* Only these two flags may be set by the requester */
+	dev->flags = req->flags &
+		((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC));
+
+	atomic_set(&dev->opened, 0);
+
+	init_waitqueue_head(&dev->wait);
+	tasklet_init(&dev->wakeup_task, rfcomm_tty_wakeup, (unsigned long) dev);
+
+	skb_queue_head_init(&dev->pending);
+
+	rfcomm_dlc_lock(dlc);
+
+	if (req->flags & (1 << RFCOMM_REUSE_DLC)) {
+		struct sock *sk = dlc->owner;
+		struct sk_buff *skb;
+
+		BUG_ON(!sk);
+
+		rfcomm_dlc_throttle(dlc);
+
+		/* Move data already received on the socket over to the
+		 * device so it is delivered once the TTY is opened */
+		while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+			skb_orphan(skb);
+			skb_queue_tail(&dev->pending, skb);
+			atomic_sub(skb->len, &sk->sk_rmem_alloc);
+		}
+	}
+
+	dlc->data_ready   = rfcomm_dev_data_ready;
+	dlc->state_change = rfcomm_dev_state_change;
+	dlc->modem_status = rfcomm_dev_modem_status;
+
+	dlc->owner = dev;
+	dev->dlc   = dlc;
+
+	rfcomm_dev_modem_status(dlc, dlc->remote_v24_sig);
+
+	rfcomm_dlc_unlock(dlc);
+
+	/* It's safe to call __module_get() here because socket already
+	   holds reference to this module. */
+	__module_get(THIS_MODULE);
+
+out:
+	write_unlock_bh(&rfcomm_dev_lock);
+
+	if (err < 0)
+		goto free;
+
+	dev->tty_dev = tty_register_device(rfcomm_tty_driver, dev->id, NULL);
+
+	if (IS_ERR(dev->tty_dev)) {
+		err = PTR_ERR(dev->tty_dev);
+
+		/* The device is already visible on the list: unlink it
+		 * under the list lock like every other list update. */
+		write_lock_bh(&rfcomm_dev_lock);
+		list_del(&dev->list);
+		write_unlock_bh(&rfcomm_dev_lock);
+
+		/* Do not leave the DLC pointing at memory we are about to
+		 * free; the DLC callbacks ignore a NULL owner.
+		 * NOTE(review): for RFCOMM_REUSE_DLC the socket's original
+		 * owner/callbacks are not restored here - confirm whether
+		 * the socket must remain usable after this failure. */
+		rfcomm_dlc_lock(dlc);
+		if (dlc->owner == dev)
+			dlc->owner = NULL;
+		rfcomm_dlc_unlock(dlc);
+
+		/* Free any skbs moved over from the socket receive queue */
+		skb_queue_purge(&dev->pending);
+
+		/* Balance the __module_get() above */
+		module_put(THIS_MODULE);
+		goto free;
+	}
+
+	dev_set_drvdata(dev->tty_dev, dev);
+
+	/* Missing sysfs attributes are not fatal */
+	if (device_create_file(dev->tty_dev, &dev_attr_address) < 0)
+		BT_ERR("Failed to create address attribute");
+
+	if (device_create_file(dev->tty_dev, &dev_attr_channel) < 0)
+		BT_ERR("Failed to create channel attribute");
+
+	return dev->id;
+
+free:
+	kfree(dev);
+	return err;
+}
+
+/*
+ * Mark @dev as released and, unless the TTY is still open, unlink it from
+ * rfcomm_dev_list and drop one reference.  When the TTY is still open the
+ * unlink and put are performed later by rfcomm_tty_close().
+ *
+ * Releasing the same device twice is a bug and trips the BUG_ON().
+ */
+static void rfcomm_dev_del(struct rfcomm_dev *dev)
+{
+	BT_DBG("dev %p", dev);
+
+	/* Note: the test_and_set_bit() side effect is intentional */
+	BUG_ON(test_and_set_bit(RFCOMM_TTY_RELEASED, &dev->flags));
+
+	if (atomic_read(&dev->opened) > 0)
+		return;
+
+	write_lock_bh(&rfcomm_dev_lock);
+	list_del_init(&dev->list);
+	write_unlock_bh(&rfcomm_dev_lock);
+
+	rfcomm_dev_put(dev);
+}
+
+/* ---- Send buffer ---- */
+/*
+ * Transmit budget for @dlc in bytes: MTU times the number of TX credits,
+ * with a floor of one MTU.  The GNU "?:" form reads tx_credits only once.
+ */
+static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc)
+{
+	/* We can't let it be zero, because we don't get a callback
+	   when tx_credits becomes nonzero, hence we'd never wake up */
+	return dlc->mtu * (dlc->tx_credits?:1);
+}
+
+/*
+ * skb destructor for transmit buffers set up by rfcomm_set_owner_w():
+ * returns the skb's truesize to the device's write budget, schedules a
+ * TTY wakeup if a TTY is attached, and drops the device reference that
+ * the skb was holding (skb->sk carries the rfcomm_dev pointer).
+ */
+static void rfcomm_wfree(struct sk_buff *skb)
+{
+	struct rfcomm_dev *dev = (void *) skb->sk;
+	atomic_sub(skb->truesize, &dev->wmem_alloc);
+	if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags))
+		tasklet_schedule(&dev->wakeup_task);
+	/* Must be last: this may free dev */
+	rfcomm_dev_put(dev);
+}
+
+/*
+ * Charge @skb to @dev's write budget: takes a device reference, adds the
+ * skb's truesize to wmem_alloc and installs rfcomm_wfree() as destructor.
+ * The rfcomm_dev pointer is stashed in skb->sk (it is not a struct sock).
+ */
+static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev)
+{
+	rfcomm_dev_hold(dev);
+	atomic_add(skb->truesize, &dev->wmem_alloc);
+	skb->sk = (void *) dev;
+	skb->destructor = rfcomm_wfree;
+}
+
+/*
+ * Allocate a transmit skb of @size bytes charged to @dev.
+ *
+ * Returns NULL when the device has already used up its write budget
+ * (see rfcomm_room()) or when the allocation itself fails.
+ */
+static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	/* Over budget: refuse without touching the allocator */
+	if (atomic_read(&dev->wmem_alloc) >= rfcomm_room(dev->dlc))
+		return NULL;
+
+	skb = alloc_skb(size, priority);
+	if (!skb)
+		return NULL;
+
+	rfcomm_set_owner_w(skb, dev);
+	return skb;
+}
+
+/* ---- Device IOCTLs ---- */
+
+#define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP))
+
+/*
+ * RFCOMMCREATEDEV: create a TTY device from the request at @arg.
+ *
+ * With RFCOMM_REUSE_DLC the connected socket @sk donates its DLC to the
+ * device and is moved to BT_CLOSED on success; otherwise a fresh DLC is
+ * allocated.  Requests whose flags are not exactly NOCAP_FLAGS require
+ * CAP_NET_ADMIN.
+ *
+ * Returns the new device id, or a negative errno (-EFAULT, -EPERM,
+ * -EBADFD, -ENOMEM, or the error from rfcomm_dev_add()).
+ */
+static int rfcomm_create_dev(struct sock *sk, void __user *arg)
+{
+	struct rfcomm_dev_req req;
+	struct rfcomm_dlc *dlc;
+	int id;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	BT_DBG("sk %p dev_id %d flags 0x%x", sk, req.dev_id, req.flags);
+
+	if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (req.flags & (1 << RFCOMM_REUSE_DLC)) {
+		/* Socket must be connected */
+		if (sk->sk_state != BT_CONNECTED)
+			return -EBADFD;
+
+		dlc = rfcomm_pi(sk)->dlc;
+		/* Extra reference for the device; dropped below on failure */
+		rfcomm_dlc_hold(dlc);
+	} else {
+		dlc = rfcomm_dlc_alloc(GFP_KERNEL);
+		if (!dlc)
+			return -ENOMEM;
+	}
+
+	id = rfcomm_dev_add(&req, dlc);
+	if (id < 0) {
+		rfcomm_dlc_put(dlc);
+		return id;
+	}
+
+	if (req.flags & (1 << RFCOMM_REUSE_DLC)) {
+		/* DLC is now used by device.
+		 * Socket must be disconnected */
+		sk->sk_state = BT_CLOSED;
+	}
+
+	return id;
+}
+
+/*
+ * RFCOMMRELEASEDEV: release the device named in the request at @arg.
+ *
+ * Optionally closes the DLC first (RFCOMM_HANGUP_NOW), hangs up an
+ * attached TTY synchronously, and deletes the device unless it is flagged
+ * RFCOMM_RELEASE_ONHUP.  Devices whose flags are not exactly NOCAP_FLAGS
+ * require CAP_NET_ADMIN.
+ *
+ * Returns 0, or -EFAULT / -ENODEV / -EPERM.
+ */
+static int rfcomm_release_dev(void __user *arg)
+{
+	struct rfcomm_dev_req req;
+	struct rfcomm_dev *dev;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	BT_DBG("dev_id %d flags 0x%x", req.dev_id, req.flags);
+
+	dev = rfcomm_dev_get(req.dev_id);
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) {
+		rfcomm_dev_put(dev);
+		return -EPERM;
+	}
+
+	if (req.flags & (1 << RFCOMM_HANGUP_NOW))
+		rfcomm_dlc_close(dev->dlc, 0);
+
+	/* Shut down TTY synchronously before freeing rfcomm_dev */
+	if (dev->tty)
+		tty_vhangup(dev->tty);
+
+	if (!test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags))
+		rfcomm_dev_del(dev);
+	/* Drop the lookup reference taken by rfcomm_dev_get() above */
+	rfcomm_dev_put(dev);
+	return 0;
+}
+
+/*
+ * RFCOMMGETDEVLIST: copy a snapshot of the device list to user space.
+ *
+ * @arg points to a struct rfcomm_dev_list_req whose leading u16 holds the
+ * number of entries the caller has room for.  At most that many entries
+ * are filled in (released devices are skipped) and dev_num is rewritten
+ * with the number actually returned.
+ *
+ * Returns 0 on success, -EFAULT on a bad user pointer, -EINVAL when the
+ * requested count is zero or too large, -ENOMEM on allocation failure.
+ */
+static int rfcomm_get_dev_list(void __user *arg)
+{
+	struct rfcomm_dev_list_req *dl;
+	struct rfcomm_dev_info *di;
+	struct list_head *p;
+	int n = 0, size, err;
+	u16 dev_num;
+
+	BT_DBG("");
+
+	if (get_user(dev_num, (u16 __user *) arg))
+		return -EFAULT;
+
+	/* Bound the kernel buffer to four pages worth of entries */
+	if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di))
+		return -EINVAL;
+
+	size = sizeof(*dl) + dev_num * sizeof(*di);
+
+	/* Zeroed allocation: the buffer is copied to user space below and
+	 * any structure padding must not expose stale kernel heap data. */
+	dl = kzalloc(size, GFP_KERNEL);
+	if (!dl)
+		return -ENOMEM;
+
+	di = dl->dev_info;
+
+	read_lock_bh(&rfcomm_dev_lock);
+
+	list_for_each(p, &rfcomm_dev_list) {
+		struct rfcomm_dev *dev = list_entry(p, struct rfcomm_dev, list);
+		if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags))
+			continue;
+		(di + n)->id      = dev->id;
+		(di + n)->flags   = dev->flags;
+		(di + n)->state   = dev->dlc->state;
+		(di + n)->channel = dev->channel;
+		bacpy(&(di + n)->src, &dev->src);
+		bacpy(&(di + n)->dst, &dev->dst);
+		if (++n >= dev_num)
+			break;
+	}
+
+	read_unlock_bh(&rfcomm_dev_lock);
+
+	/* Only the entries actually filled in are copied out */
+	dl->dev_num = n;
+	size = sizeof(*dl) + n * sizeof(*di);
+
+	err = copy_to_user(arg, dl, size);
+	kfree(dl);
+
+	return err ? -EFAULT : 0;
+}
+
+/*
+ * RFCOMMGETDEVINFO: fill in the struct rfcomm_dev_info at @arg for the
+ * device whose id the caller stored in it.
+ *
+ * Returns 0 on success, -EFAULT on a bad user pointer, -ENODEV when no
+ * such (unreleased) device exists.
+ */
+static int rfcomm_get_dev_info(void __user *arg)
+{
+	struct rfcomm_dev_info info;
+	struct rfcomm_dev *dev;
+	int ret;
+
+	BT_DBG("");
+
+	if (copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	dev = rfcomm_dev_get(info.id);
+	if (!dev)
+		return -ENODEV;
+
+	bacpy(&info.src, &dev->src);
+	bacpy(&info.dst, &dev->dst);
+	info.channel = dev->channel;
+	info.state   = dev->dlc->state;
+	info.flags   = dev->flags;
+
+	ret = copy_to_user(arg, &info, sizeof(info)) ? -EFAULT : 0;
+
+	/* Release the reference taken by the lookup */
+	rfcomm_dev_put(dev);
+	return ret;
+}
+
+/*
+ * Dispatch the RFCOMM TTY device ioctls issued on socket @sk.
+ * Returns the handler's result, or -EINVAL for an unknown command.
+ */
+int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+	int err = -EINVAL;
+
+	BT_DBG("cmd %d arg %p", cmd, arg);
+
+	switch (cmd) {
+	case RFCOMMCREATEDEV:
+		err = rfcomm_create_dev(sk, arg);
+		break;
+
+	case RFCOMMRELEASEDEV:
+		err = rfcomm_release_dev(arg);
+		break;
+
+	case RFCOMMGETDEVLIST:
+		err = rfcomm_get_dev_list(arg);
+		break;
+
+	case RFCOMMGETDEVINFO:
+		err = rfcomm_get_dev_info(arg);
+		break;
+	}
+
+	return err;
+}
+
+/* ---- DLC callbacks ---- */
+/*
+ * DLC data_ready callback: deliver received @skb to the TTY.
+ *
+ * The skb is dropped when the DLC has no owner.  If no TTY is attached,
+ * or older data is still waiting on the pending queue, the skb is queued
+ * there instead (preserving order); otherwise its payload is pushed into
+ * the TTY flip buffer and the skb is freed.
+ */
+static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb)
+{
+	struct rfcomm_dev *dev = dlc->owner;
+	struct tty_struct *tty;
+
+	if (!dev) {
+		kfree_skb(skb);
+		return;
+	}
+
+	tty = dev->tty;
+	if (!tty || !skb_queue_empty(&dev->pending)) {
+		skb_queue_tail(&dev->pending, skb);
+		return;
+	}
+
+	BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len);
+
+	tty_insert_flip_string(tty, skb->data, skb->len);
+	tty_flip_buffer_push(tty);
+
+	kfree_skb(skb);
+}
+
+/*
+ * DLC state_change callback.  Records @err, wakes anyone sleeping in
+ * open(), and on BT_CLOSED either hangs up the attached TTY or, when no
+ * TTY is attached and the device is RFCOMM_RELEASE_ONHUP, deletes the
+ * device.
+ *
+ * The unlock/lock pairs below imply the caller holds the DLC lock; it is
+ * held again on every return path.
+ */
+static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err)
+{
+	struct rfcomm_dev *dev = dlc->owner;
+	if (!dev)
+		return;
+
+	BT_DBG("dlc %p dev %p err %d", dlc, dev, err);
+
+	dev->err = err;
+	wake_up_interruptible(&dev->wait);
+
+	if (dlc->state == BT_CLOSED) {
+		if (!dev->tty) {
+			if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
+				/* Drop DLC lock here to avoid deadlock
+				 * 1. rfcomm_dev_get will take rfcomm_dev_lock
+				 *    but in rfcomm_dev_add there's lock order:
+				 *    rfcomm_dev_lock -> dlc lock
+				 * 2. rfcomm_dev_put will deadlock if it's
+				 *    the last reference
+				 */
+				rfcomm_dlc_unlock(dlc);
+				/* NULL means the device is already released
+				 * (or gone); on success we now hold a ref */
+				if (rfcomm_dev_get(dev->id) == NULL) {
+					rfcomm_dlc_lock(dlc);
+					return;
+				}
+
+				rfcomm_dev_del(dev);
+				rfcomm_dev_put(dev);
+				rfcomm_dlc_lock(dlc);
+			}
+		} else
+			tty_hangup(dev->tty);
+	}
+}
+
+/*
+ * DLC modem_status callback: translate the remote V.24 signal bits in
+ * @v24_sig into TIOCM_* bits stored in dev->modem_status.
+ *
+ * If carrier was previously present and the new signals drop DV, an
+ * attached TTY that is not CLOCAL is hung up first.
+ */
+static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
+{
+	struct rfcomm_dev *dev = dlc->owner;
+	uint status = 0;
+
+	if (!dev)
+		return;
+
+	BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
+
+	/* Carrier lost: compare against the status recorded last time */
+	if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
+		if (dev->tty && !C_CLOCAL(dev->tty))
+			tty_hangup(dev->tty);
+	}
+
+	if (v24_sig & RFCOMM_V24_RTC)
+		status |= TIOCM_DSR | TIOCM_DTR;
+	if (v24_sig & RFCOMM_V24_RTR)
+		status |= TIOCM_RTS | TIOCM_CTS;
+	if (v24_sig & RFCOMM_V24_IC)
+		status |= TIOCM_RI;
+	if (v24_sig & RFCOMM_V24_DV)
+		status |= TIOCM_CD;
+
+	dev->modem_status = status;
+}
+
+/* ---- TTY functions ---- */
+/*
+ * Tasklet body (scheduled from rfcomm_wfree()): wake up writers on the
+ * TTY attached to the device passed in @arg, if there is one.
+ */
+static void rfcomm_tty_wakeup(unsigned long arg)
+{
+	struct rfcomm_dev *dev = (void *) arg;
+	struct tty_struct *tty = dev->tty;
+
+	if (tty) {
+		BT_DBG("dev %p tty %p", dev, tty);
+		tty_wakeup(tty);
+	}
+}
+
+/*
+ * Drain the device's pending receive queue into the attached TTY's flip
+ * buffer.  Runs under the DLC lock; the flip buffer is pushed once at the
+ * end, and only if at least one byte was inserted.  No-op without a TTY.
+ */
+static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev)
+{
+	struct tty_struct *tty = dev->tty;
+	struct sk_buff *skb;
+	int inserted = 0;
+
+	if (!tty)
+		return;
+
+	BT_DBG("dev %p tty %p", dev, tty);
+
+	rfcomm_dlc_lock(dev->dlc);
+
+	while ((skb = skb_dequeue(&dev->pending))) {
+		inserted += tty_insert_flip_string(tty, skb->data, skb->len);
+		kfree_skb(skb);
+	}
+
+	rfcomm_dlc_unlock(dev->dlc);
+
+	if (inserted > 0)
+		tty_flip_buffer_push(tty);
+}
+
+/*
+ * TTY open(): look up the rfcomm_dev for tty->index and, on the first
+ * open, attach the TTY, open the DLC and sleep until it is connected,
+ * closed, or a signal arrives.  Subsequent opens only bump the open count.
+ *
+ * Returns 0 on success, -ENODEV when no such device exists, the error
+ * from rfcomm_dlc_open(), -EINTR on a signal, or the negated dev->err
+ * when the DLC ends up closed.
+ */
+static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct rfcomm_dev *dev;
+	struct rfcomm_dlc *dlc;
+	int err, id;
+
+	id = tty->index;
+
+	BT_DBG("tty %p id %d", tty, id);
+
+	/* We don't leak this refcount. For reasons which are not entirely
+	   clear, the TTY layer will call our ->close() method even if the
+	   open fails. We decrease the refcount there, and decreasing it
+	   here too would cause breakage. */
+	dev = rfcomm_dev_get(id);
+	if (!dev)
+		return -ENODEV;
+
+	BT_DBG("dev %p dst %s channel %d opened %d", dev, batostr(&dev->dst),
+				dev->channel, atomic_read(&dev->opened));
+
+	/* Not the first opener: the DLC is already being handled */
+	if (atomic_inc_return(&dev->opened) > 1)
+		return 0;
+
+	dlc = dev->dlc;
+
+	/* Attach TTY and open DLC */
+
+	rfcomm_dlc_lock(dlc);
+	tty->driver_data = dev;
+	dev->tty = tty;
+	rfcomm_dlc_unlock(dlc);
+	set_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
+
+	err = rfcomm_dlc_open(dlc, &dev->src, &dev->dst, dev->channel);
+	if (err < 0)
+		return err;
+
+	/* Wait for DLC to connect */
+	add_wait_queue(&dev->wait, &wait);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (dlc->state == BT_CLOSED) {
+			err = -dev->err;
+			break;
+		}
+
+		if (dlc->state == BT_CONNECTED)
+			break;
+
+		if (signal_pending(current)) {
+			err = -EINTR;
+			break;
+		}
+
+		/* Release the TTY lock while sleeping */
+		tty_unlock();
+		schedule();
+		tty_lock();
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&dev->wait, &wait);
+
+	/* Re-parent the TTY device under the ACL connection's device */
+	if (err == 0)
+		device_move(dev->tty_dev, rfcomm_get_device(dev),
+			    DPM_ORDER_DEV_AFTER_PARENT);
+
+	/* Deliver data queued before the TTY was attached, then let the
+	 * DLC receive again */
+	rfcomm_tty_copy_pending(dev);
+
+	rfcomm_dlc_unthrottle(dev->dlc);
+
+	return err;
+}
+
+/*
+ * TTY close(): drop one open count and the reference taken in open().
+ * On the last close the DLC is closed, the TTY is detached, and, if the
+ * device was released while still open, the deferred unlink and put from
+ * rfcomm_dev_del() are carried out here.
+ */
+static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+	if (!dev)
+		return;
+
+	BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc,
+						atomic_read(&dev->opened));
+
+	if (atomic_dec_and_test(&dev->opened)) {
+		/* Undo the device_move() done in open() */
+		if (dev->tty_dev->parent)
+			device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST);
+
+		/* Close DLC and detach TTY */
+		rfcomm_dlc_close(dev->dlc, 0);
+
+		/* Stop rfcomm_wfree() from scheduling new wakeups, then
+		 * wait for a running one to finish */
+		clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags);
+		tasklet_kill(&dev->wakeup_task);
+
+		rfcomm_dlc_lock(dev->dlc);
+		tty->driver_data = NULL;
+		dev->tty = NULL;
+		rfcomm_dlc_unlock(dev->dlc);
+
+		if (test_bit(RFCOMM_TTY_RELEASED, &dev->flags)) {
+			write_lock_bh(&rfcomm_dev_lock);
+			list_del_init(&dev->list);
+			write_unlock_bh(&rfcomm_dev_lock);
+
+			rfcomm_dev_put(dev);
+		}
+	}
+
+	/* Matches the rfcomm_dev_get() in rfcomm_tty_open() */
+	rfcomm_dev_put(dev);
+}
+
+/*
+ * TTY write(): split @buf into chunks of at most one DLC MTU, wrap each
+ * in an skb charged to the device, and hand it to rfcomm_dlc_send().
+ * Stops early when the write budget is exhausted or a send fails.
+ *
+ * Returns the number of bytes queued; if nothing was queued, returns the
+ * send error (or 0 when no buffer could be allocated).
+ */
+static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+	struct rfcomm_dlc *dlc = dev->dlc;
+	struct sk_buff *skb;
+	int err = 0, sent = 0, size;
+
+	BT_DBG("tty %p count %d", tty, count);
+
+	while (count) {
+		size = min_t(uint, count, dlc->mtu);
+
+		skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC);
+
+		if (!skb)
+			break;
+
+		skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
+
+		memcpy(skb_put(skb, size), buf + sent, size);
+
+		err = rfcomm_dlc_send(dlc, skb);
+		if (err < 0) {
+			/* Not queued: free it here (also uncharges dev) */
+			kfree_skb(skb);
+			break;
+		}
+
+		sent  += size;
+		count -= size;
+	}
+
+	return sent ? sent : err;
+}
+
+/*
+ * TTY write_room(): bytes of write budget still available, i.e.
+ * rfcomm_room() minus what is currently allocated, clamped at zero.
+ * Returns 0 when the TTY has no device or DLC attached.
+ */
+static int rfcomm_tty_write_room(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+	int room;
+
+	BT_DBG("tty %p", tty);
+
+	if (!dev || !dev->dlc)
+		return 0;
+
+	room = rfcomm_room(dev->dlc) - atomic_read(&dev->wmem_alloc);
+
+	return room < 0 ? 0 : room;
+}
+
+/*
+ * TTY ioctl(): no command is handled by this driver.  A handful of
+ * commands are logged for diagnostics; every path returns -ENOIOCTLCMD.
+ */
+static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
+{
+	BT_DBG("tty %p cmd 0x%02x", tty, cmd);
+
+	switch (cmd) {
+	case TCGETS:
+		BT_DBG("TCGETS is not supported");
+		break;
+
+	case TCSETS:
+		BT_DBG("TCSETS is not supported");
+		break;
+
+	case TIOCMIWAIT:
+		BT_DBG("TIOCMIWAIT");
+		break;
+
+	case TIOCGSERIAL:
+		BT_ERR("TIOCGSERIAL is not supported");
+		break;
+
+	case TIOCSSERIAL:
+		BT_ERR("TIOCSSERIAL is not supported");
+		break;
+
+	case TIOCSERGSTRUCT:
+		BT_ERR("TIOCSERGSTRUCT is not supported");
+		break;
+
+	case TIOCSERGETLSR:
+		BT_ERR("TIOCSERGETLSR is not supported");
+		break;
+
+	case TIOCSERCONFIG:
+		BT_ERR("TIOCSERCONFIG is not supported");
+		break;
+
+	default:
+		/* ioctls which we must ignore */
+		break;
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/*
+ * TTY set_termios(): compare the new termios against @old and, if any
+ * of parity, XON/XOFF characters, stop bits, data bits or baud rate
+ * changed, send an RPN (remote port negotiation) frame carrying the new
+ * settings with a mask of the changed parameters.
+ *
+ * Does nothing when the TTY has no device, DLC or session attached.
+ */
+static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
+{
+	struct ktermios *new = tty->termios;
+	int old_baud_rate = tty_termios_baud_rate(old);
+	int new_baud_rate = tty_termios_baud_rate(new);
+
+	u8 baud, data_bits, stop_bits, parity, x_on, x_off;
+	u16 changes = 0;
+
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p termios %p", tty, old);
+
+	if (!dev || !dev->dlc || !dev->dlc->session)
+		return;
+
+	/* Handle turning off CRTSCTS */
+	if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
+		BT_DBG("Turning off CRTSCTS unsupported");
+
+	/* Parity on/off and when on, odd/even */
+	if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
+			((old->c_cflag & PARODD) != (new->c_cflag & PARODD))) {
+		changes |= RFCOMM_RPN_PM_PARITY;
+		BT_DBG("Parity change detected.");
+	}
+
+	/* Mark and space parity are not supported! */
+	if (new->c_cflag & PARENB) {
+		if (new->c_cflag & PARODD) {
+			BT_DBG("Parity is ODD");
+			parity = RFCOMM_RPN_PARITY_ODD;
+		} else {
+			BT_DBG("Parity is EVEN");
+			parity = RFCOMM_RPN_PARITY_EVEN;
+		}
+	} else {
+		BT_DBG("Parity is OFF");
+		parity = RFCOMM_RPN_PARITY_NONE;
+	}
+
+	/* Setting the x_on / x_off characters */
+	/* NOTE(review): VSTOP feeds x_on / PM_XON and VSTART feeds x_off /
+	 * PM_XOFF below, which looks swapped relative to the usual termios
+	 * meaning (VSTART = XON, VSTOP = XOFF) - confirm before changing. */
+	if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) {
+		BT_DBG("XOFF custom");
+		x_on = new->c_cc[VSTOP];
+		changes |= RFCOMM_RPN_PM_XON;
+	} else {
+		BT_DBG("XOFF default");
+		x_on = RFCOMM_RPN_XON_CHAR;
+	}
+
+	if (old->c_cc[VSTART] != new->c_cc[VSTART]) {
+		BT_DBG("XON custom");
+		x_off = new->c_cc[VSTART];
+		changes |= RFCOMM_RPN_PM_XOFF;
+	} else {
+		BT_DBG("XON default");
+		x_off = RFCOMM_RPN_XOFF_CHAR;
+	}
+
+	/* Handle setting of stop bits */
+	if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB))
+		changes |= RFCOMM_RPN_PM_STOP;
+
+	/* POSIX does not support 1.5 stop bits and RFCOMM does not
+	 * support 2 stop bits. So a request for 2 stop bits gets
+	 * translated to 1.5 stop bits */
+	if (new->c_cflag & CSTOPB)
+		stop_bits = RFCOMM_RPN_STOP_15;
+	else
+		stop_bits = RFCOMM_RPN_STOP_1;
+
+	/* Handle number of data bits [5-8] */
+	if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
+		changes |= RFCOMM_RPN_PM_DATA;
+
+	switch (new->c_cflag & CSIZE) {
+	case CS5:
+		data_bits = RFCOMM_RPN_DATA_5;
+		break;
+	case CS6:
+		data_bits = RFCOMM_RPN_DATA_6;
+		break;
+	case CS7:
+		data_bits = RFCOMM_RPN_DATA_7;
+		break;
+	case CS8:
+		data_bits = RFCOMM_RPN_DATA_8;
+		break;
+	default:
+		data_bits = RFCOMM_RPN_DATA_8;
+		break;
+	}
+
+	/* Handle baudrate settings */
+	if (old_baud_rate != new_baud_rate)
+		changes |= RFCOMM_RPN_PM_BITRATE;
+
+	switch (new_baud_rate) {
+	case 2400:
+		baud = RFCOMM_RPN_BR_2400;
+		break;
+	case 4800:
+		baud = RFCOMM_RPN_BR_4800;
+		break;
+	case 7200:
+		baud = RFCOMM_RPN_BR_7200;
+		break;
+	case 9600:
+		baud = RFCOMM_RPN_BR_9600;
+		break;
+	case 19200:
+		baud = RFCOMM_RPN_BR_19200;
+		break;
+	case 38400:
+		baud = RFCOMM_RPN_BR_38400;
+		break;
+	case 57600:
+		baud = RFCOMM_RPN_BR_57600;
+		break;
+	case 115200:
+		baud = RFCOMM_RPN_BR_115200;
+		break;
+	case 230400:
+		baud = RFCOMM_RPN_BR_230400;
+		break;
+	default:
+		/* 9600 is standard according to the RFCOMM specification */
+		baud = RFCOMM_RPN_BR_9600;
+		break;
+
+	}
+
+	/* Only talk to the peer when something actually changed */
+	if (changes)
+		rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
+				data_bits, stop_bits, parity,
+				RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
+}
+
+/* TTY throttle(): ask the DLC to stop delivering received data.
+ * Note: tty->driver_data is dereferenced without a NULL check here. */
+static void rfcomm_tty_throttle(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p dev %p", tty, dev);
+
+	rfcomm_dlc_throttle(dev->dlc);
+}
+
+/* TTY unthrottle(): allow the DLC to deliver received data again.
+ * Note: tty->driver_data is dereferenced without a NULL check here. */
+static void rfcomm_tty_unthrottle(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p dev %p", tty, dev);
+
+	rfcomm_dlc_unthrottle(dev->dlc);
+}
+
+/*
+ * TTY chars_in_buffer(): coarse estimate only.  Reports one MTU when the
+ * DLC transmit queue holds anything, and 0 when it is empty or when no
+ * device/DLC is attached.
+ */
+static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p dev %p", tty, dev);
+
+	if (!dev || !dev->dlc)
+		return 0;
+
+	return skb_queue_empty(&dev->dlc->tx_queue) ? 0 : dev->dlc->mtu;
+}
+
+/*
+ * TTY flush_buffer(): discard everything queued for transmission on the
+ * DLC and wake up writers.  No-op when no device/DLC is attached.
+ */
+static void rfcomm_tty_flush_buffer(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p dev %p", tty, dev);
+
+	if (dev && dev->dlc) {
+		skb_queue_purge(&dev->dlc->tx_queue);
+		tty_wakeup(tty);
+	}
+}
+
+/* TTY send_xchar(): intentionally a stub; the request is only logged. */
+static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch)
+{
+	BT_DBG("tty %p ch %c", tty, ch);
+}
+
+/* TTY wait_until_sent(): a stub that only logs; it does not wait. */
+static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	BT_DBG("tty %p timeout %d", tty, timeout);
+}
+
+/*
+ * TTY hangup(): flush the transmit queue and, for devices flagged
+ * RFCOMM_RELEASE_ONHUP, delete the device.
+ */
+static void rfcomm_tty_hangup(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p dev %p", tty, dev);
+
+	if (!dev)
+		return;
+
+	rfcomm_tty_flush_buffer(tty);
+
+	if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) {
+		/* NULL means the device was already released; on success
+		 * this takes a reference that is dropped again below */
+		if (rfcomm_dev_get(dev->id) == NULL)
+			return;
+		rfcomm_dev_del(dev);
+		rfcomm_dev_put(dev);
+	}
+}
+
+/* TTY tiocmget(): return the cached TIOCM_* modem status bits that
+ * rfcomm_dev_modem_status() derived from the remote V.24 signals. */
+static int rfcomm_tty_tiocmget(struct tty_struct *tty)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p dev %p", tty, dev);
+
+	return dev->modem_status;
+}
+
+/*
+ * TTY tiocmset(): apply the TIOCM_* bits in @set and @clear to the DLC's
+ * local V.24 signals.  Bits are set first and cleared afterwards, so a
+ * bit present in both masks ends up cleared.  Always returns 0.
+ *
+ * Mapping: DSR/DTR <-> RTC, RTS/CTS <-> RTR, RI <-> IC, CD <-> DV.
+ */
+static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
+{
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+	struct rfcomm_dlc *dlc = dev->dlc;
+	u8 v24_sig;
+
+	BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear);
+
+	rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+	/* Raise requested signals */
+	if (set & (TIOCM_DSR | TIOCM_DTR))
+		v24_sig |= RFCOMM_V24_RTC;
+	if (set & (TIOCM_RTS | TIOCM_CTS))
+		v24_sig |= RFCOMM_V24_RTR;
+	if (set & TIOCM_RI)
+		v24_sig |= RFCOMM_V24_IC;
+	if (set & TIOCM_CD)
+		v24_sig |= RFCOMM_V24_DV;
+
+	/* Drop requested signals */
+	if (clear & (TIOCM_DSR | TIOCM_DTR))
+		v24_sig &= ~RFCOMM_V24_RTC;
+	if (clear & (TIOCM_RTS | TIOCM_CTS))
+		v24_sig &= ~RFCOMM_V24_RTR;
+	if (clear & TIOCM_RI)
+		v24_sig &= ~RFCOMM_V24_IC;
+	if (clear & TIOCM_CD)
+		v24_sig &= ~RFCOMM_V24_DV;
+
+	rfcomm_dlc_set_modem_status(dlc, v24_sig);
+
+	return 0;
+}
+
+/* ---- TTY structure ---- */
+
+/* TTY driver operations for RFCOMM devices, installed by
+ * rfcomm_init_ttys().  send_xchar and wait_until_sent are logging stubs. */
+static const struct tty_operations rfcomm_ops = {
+	.open			= rfcomm_tty_open,
+	.close			= rfcomm_tty_close,
+	.write			= rfcomm_tty_write,
+	.write_room		= rfcomm_tty_write_room,
+	.chars_in_buffer	= rfcomm_tty_chars_in_buffer,
+	.flush_buffer		= rfcomm_tty_flush_buffer,
+	.ioctl			= rfcomm_tty_ioctl,
+	.throttle		= rfcomm_tty_throttle,
+	.unthrottle		= rfcomm_tty_unthrottle,
+	.set_termios		= rfcomm_tty_set_termios,
+	.send_xchar		= rfcomm_tty_send_xchar,
+	.hangup			= rfcomm_tty_hangup,
+	.wait_until_sent	= rfcomm_tty_wait_until_sent,
+	.tiocmget		= rfcomm_tty_tiocmget,
+	.tiocmset		= rfcomm_tty_tiocmset,
+};
+
+/*
+ * Allocate, configure and register the RFCOMM TTY driver.
+ *
+ * Device nodes are created on demand (TTY_DRIVER_DYNAMIC_DEV) by
+ * rfcomm_dev_add().  The default termios is 9600 8N1 style raw mode with
+ * CLOCAL set and ICANON cleared.
+ *
+ * Returns 0 on success, -ENOMEM when the driver cannot be allocated, or
+ * the error reported by tty_register_driver().
+ */
+int __init rfcomm_init_ttys(void)
+{
+	int err;
+
+	rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS);
+	if (!rfcomm_tty_driver)
+		return -ENOMEM;
+
+	rfcomm_tty_driver->owner	= THIS_MODULE;
+	rfcomm_tty_driver->driver_name	= "rfcomm";
+	rfcomm_tty_driver->name		= "rfcomm";
+	rfcomm_tty_driver->major	= RFCOMM_TTY_MAJOR;
+	rfcomm_tty_driver->minor_start	= RFCOMM_TTY_MINOR;
+	rfcomm_tty_driver->type		= TTY_DRIVER_TYPE_SERIAL;
+	rfcomm_tty_driver->subtype	= SERIAL_TYPE_NORMAL;
+	rfcomm_tty_driver->flags	= TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
+	rfcomm_tty_driver->init_termios	= tty_std_termios;
+	rfcomm_tty_driver->init_termios.c_cflag	= B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON;
+	tty_set_operations(rfcomm_tty_driver, &rfcomm_ops);
+
+	/* Propagate the real error code instead of a bare -1 */
+	err = tty_register_driver(rfcomm_tty_driver);
+	if (err) {
+		BT_ERR("Can't register RFCOMM TTY driver");
+		put_tty_driver(rfcomm_tty_driver);
+		return err;
+	}
+
+	BT_INFO("RFCOMM TTY layer initialized");
+
+	return 0;
+}
+
+/* Unregister the RFCOMM TTY driver and drop the reference obtained from
+ * alloc_tty_driver() in rfcomm_init_ttys(). */
+void rfcomm_cleanup_ttys(void)
+{
+	tty_unregister_driver(rfcomm_tty_driver);
+	put_tty_driver(rfcomm_tty_driver);
+}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
new file mode 100644
index 00000000..d3d48b5b
--- /dev/null
+++ b/net/bluetooth/sco.c
@@ -0,0 +1,1104 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2000-2001 Qualcomm Incorporated
+
+   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth SCO sockets. */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/sco.h>
+
+/* Module parameter: when non-zero, sco_connect() requests a plain
+ * SCO_LINK even if the controller is eSCO capable. */
+static int disable_esco;
+
+static const struct proto_ops sco_sock_ops;
+
+/* List of all SCO sockets; sockets are linked in sco_sock_alloc() and
+ * unlinked in sco_sock_kill(). Walkers in this file take .lock. */
+static struct bt_sock_list sco_sk_list = {
+	.lock = __RW_LOCK_UNLOCKED(sco_sk_list.lock)
+};
+
+static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent);
+static void sco_chan_del(struct sock *sk, int err);
+
+static int  sco_conn_del(struct hci_conn *conn, int err);
+
+static void sco_sock_close(struct sock *sk);
+static void sco_sock_kill(struct sock *sk);
+
+/* ---- SCO timers ---- */
+/* sk_timer callback, installed by setup_timer() in sco_sock_alloc().
+ * @arg: the owning struct sock, cast to unsigned long.
+ *
+ * Flags the socket with ETIMEDOUT and notifies waiters through
+ * sk_state_change(), then attempts to kill the socket.
+ */
+static void sco_sock_timeout(unsigned long arg)
+{
+	struct sock *sk = (struct sock *) arg;
+
+	BT_DBG("sock %p state %d", sk, sk->sk_state);
+
+	bh_lock_sock(sk);
+	sk->sk_err = ETIMEDOUT;
+	sk->sk_state_change(sk);
+	bh_unlock_sock(sk);
+
+	/* sco_sock_kill() must be called on an unlocked socket */
+	sco_sock_kill(sk);
+	/* NOTE(review): presumably balances the reference taken when the
+	 * timer was armed with sk_reset_timer() -- confirm. */
+	sock_put(sk);
+}
+
+/* (Re)arm the socket timer so that it expires @timeout jiffies from now. */
+static void sco_sock_set_timer(struct sock *sk, long timeout)
+{
+	BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
+}
+
+/* Stop the socket timer armed by sco_sock_set_timer(). */
+static void sco_sock_clear_timer(struct sock *sk)
+{
+	BT_DBG("sock %p state %d", sk, sk->sk_state);
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+/* ---- SCO connections ---- */
+/* Return the sco_conn attached to @hcon, creating one if needed.
+ *
+ * An already attached sco_conn is returned as is. With none attached,
+ * a non-zero @status or an allocation failure yields NULL. A new
+ * sco_conn borrows its src/dst addresses from the device and link and
+ * takes its MTU from the device (60 when the device reports none).
+ */
+static struct sco_conn *sco_conn_add(struct hci_conn *hcon, __u8 status)
+{
+	struct sco_conn *conn = hcon->sco_data;
+	struct hci_dev *hdev;
+
+	if (conn)
+		return conn;
+
+	if (status)
+		return NULL;
+
+	conn = kzalloc(sizeof(*conn), GFP_ATOMIC);
+	if (!conn)
+		return NULL;
+
+	hdev = hcon->hdev;
+
+	spin_lock_init(&conn->lock);
+
+	/* Cross-link the HCI link and the SCO connection */
+	conn->hcon = hcon;
+	hcon->sco_data = conn;
+
+	conn->dst = &hcon->dst;
+	conn->src = &hdev->bdaddr;
+
+	conn->mtu = (hdev->sco_mtu > 0) ? hdev->sco_mtu : 60;
+
+	BT_DBG("hcon %p conn %p", hcon, conn);
+
+	return conn;
+}
+
+/* Read the socket attached to @conn (may be NULL) under the conn lock. */
+static inline struct sock *sco_chan_get(struct sco_conn *conn)
+{
+	struct sock *attached;
+
+	sco_conn_lock(conn);
+	attached = conn->sk;
+	sco_conn_unlock(conn);
+
+	return attached;
+}
+
+/* Tear down the SCO connection state attached to @hcon.
+ * @err: error code handed to sco_chan_del() for the attached socket.
+ *
+ * Detaches and tries to kill the attached socket (if any), clears
+ * hcon->sco_data and frees the sco_conn. Always returns 0.
+ */
+static int sco_conn_del(struct hci_conn *hcon, int err)
+{
+	struct sco_conn *conn = hcon->sco_data;
+	struct sock *sk;
+
+	/* No SCO state attached to this link */
+	if (!conn)
+		return 0;
+
+	BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+	/* Kill socket */
+	sk = sco_chan_get(conn);
+	if (sk) {
+		bh_lock_sock(sk);
+		sco_sock_clear_timer(sk);
+		sco_chan_del(sk, err);
+		bh_unlock_sock(sk);
+		/* Must be called on an unlocked socket */
+		sco_sock_kill(sk);
+	}
+
+	hcon->sco_data = NULL;
+	kfree(conn);
+	return 0;
+}
+
+/* Attach @sk to @conn under the conn lock.
+ * Returns 0 on success or -EBUSY when @conn already has a socket.
+ */
+static inline int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent)
+{
+	int ret;
+
+	sco_conn_lock(conn);
+
+	if (!conn->sk) {
+		__sco_chan_add(conn, sk, parent);
+		ret = 0;
+	} else {
+		ret = -EBUSY;
+	}
+
+	sco_conn_unlock(conn);
+
+	return ret;
+}
+
+/* Start an outgoing SCO/eSCO connection for @sk.
+ *
+ * Uses the source/destination addresses and packet type stored in the
+ * socket. On success the socket is attached to the sco_conn and left
+ * in BT_CONNECTED (link already up) or BT_CONNECT with the connect
+ * timer armed.
+ *
+ * Returns 0 on success or a negative errno: -EHOSTUNREACH when no
+ * route exists, the hci_connect() error, -ENOMEM when no sco_conn can
+ * be set up, or the sco_chan_add() error.
+ */
+static int sco_connect(struct sock *sk)
+{
+	bdaddr_t *src = &bt_sk(sk)->src;
+	bdaddr_t *dst = &bt_sk(sk)->dst;
+	__u16 pkt_type = sco_pi(sk)->pkt_type;
+	struct sco_conn *conn;
+	struct hci_conn *hcon;
+	struct hci_dev  *hdev;
+	int err, type;
+
+	BT_DBG("%s -> %s", batostr(src), batostr(dst));
+
+	hdev = hci_get_route(dst, src);
+	if (!hdev)
+		return -EHOSTUNREACH;
+
+	hci_dev_lock_bh(hdev);
+
+	/* Prefer eSCO unless the controller lacks it or it was disabled
+	 * through the disable_esco module parameter. */
+	if (lmp_esco_capable(hdev) && !disable_esco)
+		type = ESCO_LINK;
+	else {
+		type = SCO_LINK;
+		pkt_type &= SCO_ESCO_MASK;
+	}
+
+	hcon = hci_connect(hdev, type, pkt_type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
+	if (IS_ERR(hcon)) {
+		err = PTR_ERR(hcon);
+		goto done;
+	}
+
+	conn = sco_conn_add(hcon, 0);
+	if (!conn) {
+		hci_conn_put(hcon);
+		err = -ENOMEM;
+		goto done;
+	}
+
+	/* Update source addr of the socket */
+	bacpy(src, conn->src);
+
+	err = sco_chan_add(conn, sk, NULL);
+	if (err) {
+		/* The socket was not attached, so nothing will later drop
+		 * the link reference obtained from hci_connect(); drop it
+		 * here, as the sco_conn_add() failure path does. */
+		hci_conn_put(hcon);
+		goto done;
+	}
+
+	if (hcon->state == BT_CONNECTED) {
+		sco_sock_clear_timer(sk);
+		sk->sk_state = BT_CONNECTED;
+	} else {
+		sk->sk_state = BT_CONNECT;
+		sco_sock_set_timer(sk, sk->sk_sndtimeo);
+	}
+
+done:
+	hci_dev_unlock_bh(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+/* Copy @len bytes from @msg into a new skb and hand it to hci_send_sco().
+ *
+ * Returns the number of bytes sent, -EINVAL when @len exceeds the
+ * connection MTU, -EFAULT when the user data cannot be copied, or the
+ * error reported by bt_skb_send_alloc().
+ */
+static inline int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sco_conn *conn = sco_pi(sk)->conn;
+	struct sk_buff *skb;
+	int err, count;
+
+	/* Check outgoing MTU */
+	if (len > conn->mtu)
+		return -EINVAL;
+
+	BT_DBG("sk %p len %d", sk, len);
+
+	/* len was already checked against the MTU above */
+	count = min_t(unsigned int, conn->mtu, len);
+	skb = bt_skb_send_alloc(sk, count,
+			msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return err;
+
+	if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+
+	hci_send_sco(conn->hcon, skb);
+
+	return count;
+}
+
+/* Deliver an incoming SCO frame to the socket attached to @conn.
+ * The skb is freed here unless it was queued on a connected socket.
+ */
+static inline void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
+{
+	struct sock *sk = sco_chan_get(conn);
+
+	if (sk) {
+		BT_DBG("sk %p len %d", sk, skb->len);
+
+		/* sock_queue_rcv_skb() returns 0 once the skb is queued */
+		if (sk->sk_state == BT_CONNECTED &&
+				!sock_queue_rcv_skb(sk, skb))
+			return;
+	}
+
+	kfree_skb(skb);
+}
+
+/* -------- Socket interface ---------- */
+/* Find the first SCO socket whose source address equals @ba.
+ * Returns NULL when none matches. Takes no lock itself; the caller in
+ * this file holds sco_sk_list.lock.
+ */
+static struct sock *__sco_get_sock_by_addr(bdaddr_t *ba)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	sk_for_each(sk, node, &sco_sk_list.head) {
+		if (bacmp(&bt_sk(sk)->src, ba) == 0)
+			return sk;
+	}
+
+	return NULL;
+}
+
+/* Find socket listening on source bdaddr.
+ * Returns closest match: a listening socket bound exactly to @src if
+ * there is one, otherwise a listening socket bound to BDADDR_ANY (the
+ * last such socket seen in the list), otherwise NULL.
+ */
+static struct sock *sco_get_sock_listen(bdaddr_t *src)
+{
+	struct sock *sk = NULL, *sk1 = NULL;
+	struct hlist_node *node;
+
+	read_lock(&sco_sk_list.lock);
+
+	sk_for_each(sk, node, &sco_sk_list.head) {
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		/* Exact match. */
+		if (!bacmp(&bt_sk(sk)->src, src))
+			break;
+
+		/* Closest match */
+		if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY))
+			sk1 = sk;
+	}
+
+	read_unlock(&sco_sk_list.lock);
+
+	/* node is non-NULL only when the loop was left through break */
+	return node ? sk : sk1;
+}
+
+/* sk_destruct callback (set in sco_sock_alloc()): drop every skb still
+ * sitting on the receive and write queues. */
+static void sco_sock_destruct(struct sock *sk)
+{
+	BT_DBG("sk %p", sk);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+}
+
+/* Shut down a listening socket: close and kill every child still
+ * waiting on the accept queue, then mark @parent closed and zapped. */
+static void sco_sock_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	BT_DBG("parent %p", parent);
+
+	/* Close not yet accepted channels */
+	while ((sk = bt_accept_dequeue(parent, NULL))) {
+		sco_sock_close(sk);
+		sco_sock_kill(sk);
+	}
+
+	parent->sk_state  = BT_CLOSED;
+	sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ *
+ * Does nothing unless SOCK_ZAPPED is set and sk->sk_socket is NULL.
+ * Otherwise unlinks the socket from sco_sk_list, marks it SOCK_DEAD
+ * and drops one reference.
+ */
+static void sco_sock_kill(struct sock *sk)
+{
+	if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+		return;
+
+	BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+	/* Kill poor orphan */
+	bt_sock_unlink(&sco_sk_list, sk);
+	sock_set_flag(sk, SOCK_DEAD);
+	sock_put(sk);
+}
+
+/* State-dependent close of @sk. Callers in this file invoke it with
+ * the socket locked.
+ *
+ * - BT_LISTEN: flush the accept queue via sco_sock_cleanup_listen().
+ * - BT_CONNECTED / BT_CONFIG: with a connection attached, move to
+ *   BT_DISCONN, arm the disconnect timer and drop the link reference;
+ *   without one, delete the channel with ECONNRESET.
+ * - BT_CONNECT / BT_DISCONN: delete the channel with ECONNRESET.
+ * - anything else: just mark the socket zapped.
+ */
+static void __sco_sock_close(struct sock *sk)
+{
+	BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+	switch (sk->sk_state) {
+	case BT_LISTEN:
+		sco_sock_cleanup_listen(sk);
+		break;
+
+	case BT_CONNECTED:
+	case BT_CONFIG:
+		if (sco_pi(sk)->conn) {
+			sk->sk_state = BT_DISCONN;
+			sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
+			hci_conn_put(sco_pi(sk)->conn->hcon);
+			/* Cleared so that sco_chan_del() does not put the
+			 * link a second time (it checks conn->hcon). */
+			sco_pi(sk)->conn->hcon = NULL;
+		} else
+			sco_chan_del(sk, ECONNRESET);
+		break;
+
+	case BT_CONNECT:
+	case BT_DISCONN:
+		sco_chan_del(sk, ECONNRESET);
+		break;
+
+	default:
+		sock_set_flag(sk, SOCK_ZAPPED);
+		break;
+	}
+}
+
+/* Close @sk: stop its timer, run the state-dependent close under the
+ * socket lock, then try to kill the socket.
+ * Must be called on unlocked socket. */
+static void sco_sock_close(struct sock *sk)
+{
+	sco_sock_clear_timer(sk);
+	lock_sock(sk);
+	__sco_sock_close(sk);
+	release_sock(sk);
+	sco_sock_kill(sk);
+}
+
+/* Initialise a freshly allocated socket. When @parent is given (child
+ * of a listening socket) the socket type is inherited from it. */
+static void sco_sock_init(struct sock *sk, struct sock *parent)
+{
+	BT_DBG("sk %p", sk);
+
+	if (parent)
+		sk->sk_type = parent->sk_type;
+}
+
+/* Protocol descriptor used by sk_alloc() in sco_sock_alloc(); obj_size
+ * makes each socket large enough to hold a struct sco_pinfo. */
+static struct proto sco_proto = {
+	.name		= "SCO",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct sco_pinfo)
+};
+
+/* Allocate and set up a SCO socket.
+ * @sock:  struct socket to bind the new sock to (NULL for children
+ *         created in sco_conn_ready()).
+ * @proto: value stored in sk_protocol.
+ * @prio:  allocation flags for sk_alloc().
+ *
+ * The socket starts in BT_OPEN with the connect timeout as send
+ * timeout, gets its timer callback installed and is linked into
+ * sco_sk_list. Returns NULL when allocation fails.
+ */
+static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+{
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk);
+	INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+	sk->sk_destruct = sco_sock_destruct;
+	sk->sk_sndtimeo = SCO_CONN_TIMEOUT;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	sk->sk_protocol = proto;
+	sk->sk_state    = BT_OPEN;
+
+	setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk);
+
+	bt_sock_link(&sco_sk_list, sk);
+	return sk;
+}
+
+/* .create handler of sco_sock_family_ops.
+ *
+ * Only SOCK_SEQPACKET sockets are supported. Returns 0 on success,
+ * -ESOCKTNOSUPPORT for any other socket type, or -ENOMEM when the
+ * sock cannot be allocated. @kern is not used.
+ */
+static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
+			   int kern)
+{
+	struct sock *sk;
+
+	BT_DBG("sock %p", sock);
+
+	sock->state = SS_UNCONNECTED;
+
+	if (sock->type != SOCK_SEQPACKET)
+		return -ESOCKTNOSUPPORT;
+
+	sock->ops = &sco_sock_ops;
+
+	sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	if (!sk)
+		return -ENOMEM;
+
+	sco_sock_init(sk, NULL);
+	return 0;
+}
+
+/* bind() handler: record the local address and packet type on @sock.
+ *
+ * At most sizeof(struct sockaddr_sco) bytes of @addr are used; fields
+ * not covered by a shorter @alen are treated as zero.
+ *
+ * Returns 0 on success, -EINVAL for a missing or non-Bluetooth
+ * address, -EBADFD when the socket is not in BT_OPEN, or -EADDRINUSE
+ * when another SCO socket already uses the (non-ANY) source address.
+ */
+static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+{
+	struct sockaddr_sco sa;
+	struct sock *sk = sock->sk;
+	bdaddr_t *src = &sa.sco_bdaddr;
+	int len, err = 0;
+
+	if (!addr || addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	memset(&sa, 0, sizeof(sa));
+	len = min_t(unsigned int, sizeof(sa), alen);
+	memcpy(&sa, addr, len);
+
+	/* Log only once sa has been filled in; formatting it earlier
+	 * would read uninitialised stack memory. */
+	BT_DBG("sk %p %s", sk, batostr(&sa.sco_bdaddr));
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_OPEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	write_lock_bh(&sco_sk_list.lock);
+
+	/* BDADDR_ANY may be shared; a specific address must be unique */
+	if (bacmp(src, BDADDR_ANY) && __sco_get_sock_by_addr(src)) {
+		err = -EADDRINUSE;
+	} else {
+		/* Save source address */
+		bacpy(&bt_sk(sk)->src, &sa.sco_bdaddr);
+		sco_pi(sk)->pkt_type = sa.sco_pkt_type;
+		sk->sk_state = BT_BOUND;
+	}
+
+	write_unlock_bh(&sco_sk_list.lock);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* connect() handler: start an outgoing SCO connection and wait for it.
+ *
+ * At most sizeof(struct sockaddr_sco) bytes of @addr are used; fields
+ * not covered by a shorter @alen are treated as zero.
+ *
+ * Returns 0 on success, -EINVAL for a bad address or socket type,
+ * -EBADFD when the socket is neither BT_OPEN nor BT_BOUND, or the
+ * error from sco_connect() / bt_sock_wait_state().
+ */
+static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_sco sa;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (!addr || addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	memset(&sa, 0, sizeof(sa));
+	len = min_t(unsigned int, sizeof(sa), alen);
+	memcpy(&sa, addr, len);
+
+	lock_sock(sk);
+
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	/* Set destination address and packet type */
+	bacpy(&bt_sk(sk)->dst, &sa.sco_bdaddr);
+	sco_pi(sk)->pkt_type = sa.sco_pkt_type;
+
+	err = sco_connect(sk);
+	if (err)
+		goto done;
+
+	/* Wait for BT_CONNECTED, bounded by the send timeout (which
+	 * sock_sndtimeo() derives from the O_NONBLOCK flag). */
+	err = bt_sock_wait_state(sk, BT_CONNECTED,
+			sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* listen() handler: move a bound SOCK_SEQPACKET socket to BT_LISTEN
+ * and record @backlog as the maximum accept backlog.
+ * Returns 0 on success or -EBADFD when the socket is not bound or is
+ * of another type.
+ */
+static int sco_sock_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sk %p backlog %d", sk, backlog);
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_BOUND || sock->type != SOCK_SEQPACKET) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_ack_backlog = 0;
+	sk->sk_state = BT_LISTEN;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* accept() handler: wait for a child connection on listening @sock
+ * and attach it to @newsock.
+ *
+ * Returns 0 on success, -EBADFD when the socket is not (or no longer)
+ * listening, -EAGAIN when the receive timeout is zero or has run out,
+ * or the sock_intr_errno() value when interrupted by a signal.
+ */
+static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct sock *sk = sock->sk, *ch;
+	long timeo;
+	int err = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+	BT_DBG("sk %p timeo %ld", sk, timeo);
+
+	/* Wait for an incoming connection. (wake-one). */
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (1) {
+		/* Set the task state before testing the conditions so a
+		 * wake-up arriving in between is not lost. */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (sk->sk_state != BT_LISTEN) {
+			err = -EBADFD;
+			break;
+		}
+
+		ch = bt_accept_dequeue(sk, newsock);
+		if (ch)
+			break;
+
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+
+		/* Sleep with the socket unlocked */
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (err)
+		goto done;
+
+	newsock->state = SS_CONNECTED;
+
+	BT_DBG("new socket %p", ch);
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* getsockname()/getpeername() handler.
+ * Fills @addr as a struct sockaddr_sco with the destination address
+ * when @peer is set, the source address otherwise, plus the socket's
+ * packet type. Stores the structure size in *@len and returns 0.
+ */
+static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+{
+	struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
+	struct sock *sk = sock->sk;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	addr->sa_family = AF_BLUETOOTH;
+	*len = sizeof(struct sockaddr_sco);
+
+	if (peer)
+		bacpy(&sa->sco_bdaddr, &bt_sk(sk)->dst);
+	else
+		bacpy(&sa->sco_bdaddr, &bt_sk(sk)->src);
+	sa->sco_pkt_type = sco_pi(sk)->pkt_type;
+
+	return 0;
+}
+
+/* sendmsg() handler.
+ * Returns the sco_send_frame() result for a connected socket, a
+ * pending socket error if one is set, -EOPNOTSUPP for MSG_OOB, or
+ * -ENOTCONN when the socket is not in BT_CONNECTED. @iocb is not used.
+ */
+static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+			    struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	int err;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	/* Report (and consume) any pending socket error first */
+	err = sock_error(sk);
+	if (err)
+		return err;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == BT_CONNECTED)
+		err = sco_send_frame(sk, msg, len);
+	else
+		err = -ENOTCONN;
+
+	release_sock(sk);
+	return err;
+}
+
+/* setsockopt() handler. No option is implemented: every @optname
+ * yields -ENOPROTOOPT. @level, @optval and @optlen are not examined. */
+static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	lock_sock(sk);
+
+	switch (optname) {
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* getsockopt() handler for level SOL_SCO.
+ *
+ * SCO_OPTIONS copies out the connection MTU, SCO_CONNINFO the HCI
+ * handle and device class; both require a connected socket. At most
+ * the user-supplied length (capped at the structure size) is copied.
+ *
+ * Returns 0 on success, -EFAULT on a user-copy failure, -ENOTCONN
+ * when the socket is not in BT_CONNECTED, or -ENOPROTOOPT for any
+ * other option.
+ *
+ * NOTE(review): the number of bytes actually copied is not written
+ * back through @optlen.
+ */
+static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct sco_options opts;
+	struct sco_conninfo cinfo;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case SCO_OPTIONS:
+		if (sk->sk_state != BT_CONNECTED) {
+			err = -ENOTCONN;
+			break;
+		}
+
+		opts.mtu = sco_pi(sk)->conn->mtu;
+
+		BT_DBG("mtu %d", opts.mtu);
+
+		len = min_t(unsigned int, len, sizeof(opts));
+		if (copy_to_user(optval, (char *)&opts, len))
+			err = -EFAULT;
+
+		break;
+
+	case SCO_CONNINFO:
+		if (sk->sk_state != BT_CONNECTED) {
+			err = -ENOTCONN;
+			break;
+		}
+
+		/* Zero first so unset bytes are not copied to user space */
+		memset(&cinfo, 0, sizeof(cinfo));
+		cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle;
+		memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3);
+
+		len = min_t(unsigned int, len, sizeof(cinfo));
+		if (copy_to_user(optval, (char *)&cinfo, len))
+			err = -EFAULT;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* getsockopt() handler.
+ * Level SOL_SCO is delegated to sco_sock_getsockopt_old(). For any
+ * other level no option is implemented: the result is -ENOPROTOOPT,
+ * or -EFAULT when @optlen cannot be read.
+ */
+static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int len, err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (level == SOL_SCO)
+		return sco_sock_getsockopt_old(sock, optname, optval, optlen);
+
+	/* len is fetched but not used by any option below */
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* shutdown() handler. @how is ignored: the first call always shuts
+ * down both directions (SHUTDOWN_MASK) and closes the channel; later
+ * calls do nothing. With SO_LINGER set and a non-zero linger time the
+ * call waits for BT_CLOSED and returns that wait's result.
+ */
+static int sco_sock_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	lock_sock(sk);
+	if (!sk->sk_shutdown) {
+		sk->sk_shutdown = SHUTDOWN_MASK;
+		sco_sock_clear_timer(sk);
+		__sco_sock_close(sk);
+
+		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+			err = bt_sock_wait_state(sk, BT_CLOSED,
+							sk->sk_lingertime);
+	}
+	release_sock(sk);
+	return err;
+}
+
+/* release() handler: close the socket, optionally linger until it
+ * reaches BT_CLOSED, then orphan it and try to kill it.
+ * Returns 0, or the bt_sock_wait_state() result when lingering.
+ */
+static int sco_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	sco_sock_close(sk);
+
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) {
+		lock_sock(sk);
+		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+		release_sock(sk);
+	}
+
+	/* sco_sock_kill() only acts on orphaned sockets (it checks
+	 * sk->sk_socket), so orphan first. */
+	sock_orphan(sk);
+	sco_sock_kill(sk);
+	return err;
+}
+
+/* Cross-link @sk and @conn; with a @parent, also queue @sk on the
+ * parent's accept queue. Callers in this file hold the conn lock. */
+static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent)
+{
+	BT_DBG("conn %p", conn);
+
+	sco_pi(sk)->conn = conn;
+	conn->sk = sk;
+
+	if (parent)
+		bt_accept_enqueue(parent, sk);
+}
+
+/* Delete channel.
+ * Must be called on the locked socket.
+ *
+ * Unlinks @sk from its sco_conn (if any) and drops the link reference
+ * when the conn still points at an hci_conn, then marks the socket
+ * BT_CLOSED with sk_err = @err, notifies waiters and sets SOCK_ZAPPED.
+ */
+static void sco_chan_del(struct sock *sk, int err)
+{
+	struct sco_conn *conn;
+
+	conn = sco_pi(sk)->conn;
+
+	BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+
+	if (conn) {
+		sco_conn_lock(conn);
+		conn->sk = NULL;
+		sco_pi(sk)->conn = NULL;
+		sco_conn_unlock(conn);
+
+		/* hcon is NULL when __sco_sock_close() already put it */
+		if (conn->hcon)
+			hci_conn_put(conn->hcon);
+	}
+
+	sk->sk_state = BT_CLOSED;
+	sk->sk_err   = err;
+	sk->sk_state_change(sk);
+
+	sock_set_flag(sk, SOCK_ZAPPED);
+}
+
+/* Called from sco_connect_cfm() once the link is up.
+ *
+ * If a socket is already attached (outgoing connection), stop its
+ * timer and move it to BT_CONNECTED. Otherwise (incoming connection)
+ * look for a listening socket on the local address, create a child
+ * socket for it, attach the child to @conn and wake the listener.
+ * Silently does nothing when no listener exists or the child cannot
+ * be allocated.
+ *
+ * NOTE(review): conn->sk is sampled before the conn lock is taken.
+ */
+static void sco_conn_ready(struct sco_conn *conn)
+{
+	struct sock *parent;
+	struct sock *sk = conn->sk;
+
+	BT_DBG("conn %p", conn);
+
+	sco_conn_lock(conn);
+
+	if (sk) {
+		sco_sock_clear_timer(sk);
+		bh_lock_sock(sk);
+		sk->sk_state = BT_CONNECTED;
+		sk->sk_state_change(sk);
+		bh_unlock_sock(sk);
+	} else {
+		parent = sco_get_sock_listen(conn->src);
+		if (!parent)
+			goto done;
+
+		bh_lock_sock(parent);
+
+		sk = sco_sock_alloc(sock_net(parent), NULL,
+				BTPROTO_SCO, GFP_ATOMIC);
+		if (!sk) {
+			bh_unlock_sock(parent);
+			goto done;
+		}
+
+		sco_sock_init(sk, parent);
+
+		bacpy(&bt_sk(sk)->src, conn->src);
+		bacpy(&bt_sk(sk)->dst, conn->dst);
+
+		/* The child socket keeps its own reference on the link */
+		hci_conn_hold(conn->hcon);
+		__sco_chan_add(conn, sk, parent);
+
+		sk->sk_state = BT_CONNECTED;
+
+		/* Wake up parent */
+		parent->sk_data_ready(parent, 1);
+
+		bh_unlock_sock(parent);
+	}
+
+done:
+	sco_conn_unlock(conn);
+}
+
+/* ----- SCO interface with lower layer (HCI) ----- */
+/* .connect_ind hook of sco_hci_proto: incoming connection request.
+ *
+ * Returns -EINVAL for link types other than SCO/eSCO. Otherwise
+ * returns a link-mode mask with HCI_LM_ACCEPT set when some SCO
+ * socket is listening on this device's address or on BDADDR_ANY,
+ * and 0 when none is. @bdaddr is used for the debug message only.
+ */
+static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type)
+{
+	register struct sock *sk;
+	struct hlist_node *node;
+	int lm = 0;
+
+	if (type != SCO_LINK && type != ESCO_LINK)
+		return -EINVAL;
+
+	BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
+
+	/* Find listening sockets */
+	read_lock(&sco_sk_list.lock);
+	sk_for_each(sk, node, &sco_sk_list.head) {
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) ||
+				!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
+			lm |= HCI_LM_ACCEPT;
+			break;
+		}
+	}
+	read_unlock(&sco_sk_list.lock);
+
+	return lm;
+}
+
+/* .connect_cfm hook of sco_hci_proto: a connection attempt finished.
+ * Returns -EINVAL for link types other than SCO/eSCO, otherwise 0.
+ * A failed attempt tears the SCO state down; a successful one makes
+ * sure a sco_conn exists and marks it ready.
+ */
+static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
+{
+	struct sco_conn *conn;
+
+	BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status);
+
+	if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+		return -EINVAL;
+
+	if (status) {
+		sco_conn_del(hcon, bt_to_errno(status));
+		return 0;
+	}
+
+	conn = sco_conn_add(hcon, status);
+	if (conn)
+		sco_conn_ready(conn);
+
+	return 0;
+}
+
+/* .disconn_cfm hook of sco_hci_proto: the link went down.
+ * Returns -EINVAL for link types other than SCO/eSCO; otherwise tears
+ * down the SCO state with @reason converted to an errno and returns 0.
+ */
+static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
+{
+	BT_DBG("hcon %p reason %d", hcon, reason);
+
+	if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+		return -EINVAL;
+
+	sco_conn_del(hcon, bt_to_errno(reason));
+
+	return 0;
+}
+
+/* .recv_scodata hook of sco_hci_proto: SCO data arrived on @hcon.
+ * Non-empty frames on a link with SCO state go to sco_recv_frame();
+ * everything else is freed here. Always returns 0.
+ */
+static int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+{
+	struct sco_conn *conn = hcon->sco_data;
+
+	if (conn) {
+		BT_DBG("conn %p len %d", conn, skb->len);
+
+		if (skb->len) {
+			/* sco_recv_frame() takes over the skb */
+			sco_recv_frame(conn, skb);
+			return 0;
+		}
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/* seq_file show callback for the "sco" debugfs file: prints one line
+ * per SCO socket with source address, destination address and state. */
+static int sco_debugfs_show(struct seq_file *f, void *p)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	read_lock_bh(&sco_sk_list.lock);
+
+	sk_for_each(sk, node, &sco_sk_list.head) {
+		seq_printf(f, "%s %s %d\n", batostr(&bt_sk(sk)->src),
+				batostr(&bt_sk(sk)->dst), sk->sk_state);
+	}
+
+	read_unlock_bh(&sco_sk_list.lock);
+
+	return 0;
+}
+
+/* open callback for the "sco" debugfs file: binds sco_debugfs_show()
+ * to @file through single_open(). */
+static int sco_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sco_debugfs_show, inode->i_private);
+}
+
+/* File operations of the "sco" debugfs file created in sco_init() */
+static const struct file_operations sco_debugfs_fops = {
+	.open		= sco_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *sco_debugfs;
+
+/* Socket operations installed on every SCO socket by sco_sock_create().
+ * recvmsg, poll and ioctl use the shared bt_sock_* helpers; mmap and
+ * socketpair are not supported. */
+static const struct proto_ops sco_sock_ops = {
+	.family		= PF_BLUETOOTH,
+	.owner		= THIS_MODULE,
+	.release	= sco_sock_release,
+	.bind		= sco_sock_bind,
+	.connect	= sco_sock_connect,
+	.listen		= sco_sock_listen,
+	.accept		= sco_sock_accept,
+	.getname	= sco_sock_getname,
+	.sendmsg	= sco_sock_sendmsg,
+	.recvmsg	= bt_sock_recvmsg,
+	.poll		= bt_sock_poll,
+	.ioctl		= bt_sock_ioctl,
+	.mmap		= sock_no_mmap,
+	.socketpair	= sock_no_socketpair,
+	.shutdown	= sco_sock_shutdown,
+	.setsockopt	= sco_sock_setsockopt,
+	.getsockopt	= sco_sock_getsockopt
+};
+
+/* Socket family entry registered for BTPROTO_SCO in sco_init() */
+static const struct net_proto_family sco_sock_family_ops = {
+	.family	= PF_BLUETOOTH,
+	.owner	= THIS_MODULE,
+	.create	= sco_sock_create,
+};
+
+/* Callbacks through which the HCI layer reports connection events and
+ * incoming SCO data; registered in sco_init(). */
+static struct hci_proto sco_hci_proto = {
+	.name		= "SCO",
+	.id		= HCI_PROTO_SCO,
+	.connect_ind	= sco_connect_ind,
+	.connect_cfm	= sco_connect_cfm,
+	.disconn_cfm	= sco_disconn_cfm,
+	.recv_scodata	= sco_recv_scodata
+};
+
+/* Register the SCO layer: the struct proto, the BTPROTO_SCO socket
+ * family and the HCI protocol callbacks, then create the "sco"
+ * debugfs file when bt_debugfs exists.
+ *
+ * Returns 0 on success or the negative error of the failing
+ * registration, after undoing the earlier steps. Failure to create
+ * the debugfs file is only logged.
+ */
+int __init sco_init(void)
+{
+	int err;
+
+	err = proto_register(&sco_proto, 0);
+	if (err < 0)
+		return err;
+
+	err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops);
+	if (err < 0) {
+		BT_ERR("SCO socket registration failed");
+		goto error;
+	}
+
+	err = hci_register_proto(&sco_hci_proto);
+	if (err < 0) {
+		BT_ERR("SCO protocol registration failed");
+		bt_sock_unregister(BTPROTO_SCO);
+		goto error;
+	}
+
+	if (bt_debugfs) {
+		sco_debugfs = debugfs_create_file("sco", 0444,
+					bt_debugfs, NULL, &sco_debugfs_fops);
+		if (!sco_debugfs)
+			BT_ERR("Failed to create SCO debug file");
+	}
+
+	BT_INFO("SCO socket layer initialized");
+
+	return 0;
+
+error:
+	proto_unregister(&sco_proto);
+	return err;
+}
+
+/* Undo sco_init(): remove the debugfs file and unregister the socket
+ * family, the HCI protocol callbacks and the struct proto.
+ * Unregistration failures are only logged. */
+void __exit sco_exit(void)
+{
+	debugfs_remove(sco_debugfs);
+
+	if (bt_sock_unregister(BTPROTO_SCO) < 0)
+		BT_ERR("SCO socket unregistration failed");
+
+	if (hci_unregister_proto(&sco_hci_proto) < 0)
+		BT_ERR("SCO protocol unregistration failed");
+
+	proto_unregister(&sco_proto);
+}
+
+module_param(disable_esco, bool, 0644);
+MODULE_PARM_DESC(disable_esco, "Disable eSCO connection creation");
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
new file mode 100644
index 00000000..391888b8
--- /dev/null
+++ b/net/bluetooth/smp.c
@@ -0,0 +1,702 @@
+/*
+   BlueZ - Bluetooth protocol stack for Linux
+   Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/smp.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <crypto/b128ops.h>
+
+#define SMP_TIMEOUT 30000 /* 30 seconds */
+
+/* Copy the 16 bytes of @src into @dst in reverse byte order */
+static inline void swap128(u8 src[16], u8 dst[16])
+{
+	int pos;
+
+	/* Fill dst from its last byte down while reading src upwards */
+	for (pos = 15; pos >= 0; pos--)
+		dst[pos] = src[15 - pos];
+}
+
+/* Copy the 7 bytes of @src into @dst in reverse byte order */
+static inline void swap56(u8 src[7], u8 dst[7])
+{
+	int pos;
+
+	/* Fill dst from its last byte down while reading src upwards */
+	for (pos = 6; pos >= 0; pos--)
+		dst[pos] = src[6 - pos];
+}
+
+/* Encrypt the 16-byte block @r in place with the 16-byte key @k using
+ * the block cipher @tfm.
+ *
+ * Returns 0 on success, -EINVAL when @tfm is NULL, or the error from
+ * crypto_blkcipher_setkey() / crypto_blkcipher_encrypt().
+ */
+static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
+{
+	struct blkcipher_desc desc;
+	struct scatterlist sg;
+	int err, iv_len;
+	unsigned char iv[128];
+
+	if (tfm == NULL) {
+		BT_ERR("tfm %p", tfm);
+		return -EINVAL;
+	}
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+
+	err = crypto_blkcipher_setkey(tfm, k, 16);
+	if (err) {
+		BT_ERR("cipher setkey failed: %d", err);
+		return err;
+	}
+
+	/* Same scatterlist is used as source and destination below */
+	sg_init_one(&sg, r, 16);
+
+	/* If the cipher uses an IV, set it to all 0xff bytes.
+	 * NOTE(review): iv_len is not checked against sizeof(iv). */
+	iv_len = crypto_blkcipher_ivsize(tfm);
+	if (iv_len) {
+		memset(&iv, 0xff, iv_len);
+		crypto_blkcipher_set_iv(tfm, iv, iv_len);
+	}
+
+	err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16);
+	if (err)
+		BT_ERR("Encrypt data error %d", err);
+
+	return err;
+}
+
+/* Compute the pairing confirm value
+ *   res = e(k, e(k, r XOR p1) XOR p2)
+ * where p1 is built from @pres, @preq, @_rat and @_iat and p2 from
+ * the byte-swapped addresses @ia and @ra (see the comments below).
+ *
+ * The result is written to @res. Returns 0 on success or the smp_e()
+ * error.
+ */
+static int smp_c1(struct crypto_blkcipher *tfm, u8 k[16], u8 r[16],
+		u8 preq[7], u8 pres[7], u8 _iat, bdaddr_t *ia,
+		u8 _rat, bdaddr_t *ra, u8 res[16])
+{
+	u8 p1[16], p2[16];
+	int err;
+
+	memset(p1, 0, 16);
+
+	/* p1 = pres || preq || _rat || _iat */
+	swap56(pres, p1);
+	swap56(preq, p1 + 7);
+	p1[14] = _rat;
+	p1[15] = _iat;
+
+	memset(p2, 0, 16);
+
+	/* p2 = padding || ia || ra */
+	baswap((bdaddr_t *) (p2 + 4), ia);
+	baswap((bdaddr_t *) (p2 + 10), ra);
+
+	/* res = r XOR p1 */
+	u128_xor((u128 *) res, (u128 *) r, (u128 *) p1);
+
+	/* res = e(k, res) */
+	err = smp_e(tfm, k, res);
+	if (err) {
+		BT_ERR("Encrypt data error");
+		return err;
+	}
+
+	/* res = res XOR p2 */
+	u128_xor((u128 *) res, (u128 *) res, (u128 *) p2);
+
+	/* res = e(k, res) */
+	err = smp_e(tfm, k, res);
+	if (err)
+		BT_ERR("Encrypt data error");
+
+	return err;
+}
+
+/* Compute _r = e(k, r1[8..15] || r2[8..15]).
+ *
+ * @_r is first filled with the two 8-byte halves and then encrypted
+ * in place, so when smp_e() fails it still holds the unencrypted
+ * halves. Returns 0 on success or the smp_e() error.
+ */
+static int smp_s1(struct crypto_blkcipher *tfm, u8 k[16],
+			u8 r1[16], u8 r2[16], u8 _r[16])
+{
+	int err;
+
+	/* Just least significant octets from r1 and r2 are considered */
+	memcpy(_r, r1 + 8, 8);
+	memcpy(_r + 8, r2 + 8, 8);
+
+	err = smp_e(tfm, k, _r);
+	if (err)
+		BT_ERR("Encrypt data error");
+
+	return err;
+}
+
+/* Fill the 16-byte buffer @buf with random bytes. Always returns 0. */
+static int smp_rand(u8 *buf)
+{
+	get_random_bytes(buf, 16);
+
+	return 0;
+}
+
+/* Build an skb holding one SMP command: L2CAP header addressed to
+ * L2CAP_CID_SMP, the one-byte @code and @dlen bytes of @data.
+ *
+ * Returns NULL when the frame would exceed conn->mtu or the skb
+ * cannot be allocated.
+ */
+static struct sk_buff *smp_build_cmd(struct l2cap_conn *conn, u8 code,
+						u16 dlen, void *data)
+{
+	struct sk_buff *skb;
+	struct l2cap_hdr *lh;
+	int len;
+
+	len = L2CAP_HDR_SIZE + sizeof(code) + dlen;
+
+	if (len > conn->mtu)
+		return NULL;
+
+	skb = bt_skb_alloc(len, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	/* The L2CAP length field covers the payload only, not the header */
+	lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
+	lh->len = cpu_to_le16(sizeof(code) + dlen);
+	lh->cid = cpu_to_le16(L2CAP_CID_SMP);
+
+	memcpy(skb_put(skb, sizeof(code)), &code, sizeof(code));
+
+	memcpy(skb_put(skb, dlen), data, dlen);
+
+	return skb;
+}
+
+/* Build an SMP command and send it over the connection's ACL link.
+ * The command is silently dropped when smp_build_cmd() fails. */
+static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
+{
+	struct sk_buff *skb = smp_build_cmd(conn, code, len, data);
+
+	BT_DBG("code 0x%2.2x", code);
+
+	if (!skb)
+		return;
+
+	hci_send_acl(conn->hcon, skb, 0);
+}
+
+/* Map a BT_SECURITY_* level to SMP authentication requirements:
+ * SMP_AUTH_MITM for BT_SECURITY_HIGH, SMP_AUTH_NONE for all others. */
+static __u8 seclevel_to_authreq(__u8 level)
+{
+	/* Right now we don't support bonding */
+	if (level == BT_SECURITY_HIGH)
+		return SMP_AUTH_MITM;
+
+	return SMP_AUTH_NONE;
+}
+
+/* Fill in a pairing request or response.
+ *
+ * With @rsp == NULL, @req is filled as an outgoing pairing request.
+ * Otherwise @rsp is filled as the response to the received @req, with
+ * its key distribution fields limited to what @req offered.
+ *
+ * When the device is pairable, all three key types are offered for
+ * distribution and SMP_AUTH_BONDING is added to @authreq; otherwise
+ * no keys are offered.
+ */
+static void build_pairing_cmd(struct l2cap_conn *conn,
+				struct smp_cmd_pairing *req,
+				struct smp_cmd_pairing *rsp,
+				__u8 authreq)
+{
+	u8 dist_keys;
+
+	dist_keys = 0;
+	if (test_bit(HCI_PAIRABLE, &conn->hcon->hdev->flags)) {
+		dist_keys = SMP_DIST_ENC_KEY | SMP_DIST_ID_KEY | SMP_DIST_SIGN;
+		authreq |= SMP_AUTH_BONDING;
+	}
+
+	if (rsp == NULL) {
+		req->io_capability = conn->hcon->io_capability;
+		req->oob_flag = SMP_OOB_NOT_PRESENT;
+		req->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+		req->init_key_dist = dist_keys;
+		req->resp_key_dist = dist_keys;
+		req->auth_req = authreq;
+		return;
+	}
+
+	rsp->io_capability = conn->hcon->io_capability;
+	rsp->oob_flag = SMP_OOB_NOT_PRESENT;
+	rsp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
+	/* Only accept key types that both sides are willing to handle */
+	rsp->init_key_dist = req->init_key_dist & dist_keys;
+	rsp->resp_key_dist = req->resp_key_dist & dist_keys;
+	rsp->auth_req = authreq;
+}
+
+/* Validate a negotiated encryption key size.
+ * A size within [SMP_MIN_ENC_KEY_SIZE, SMP_MAX_ENC_KEY_SIZE] is stored
+ * in conn->smp_key_size and 0 is returned; any other size leaves the
+ * connection untouched and returns SMP_ENC_KEY_SIZE.
+ */
+static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
+{
+	if (max_key_size >= SMP_MIN_ENC_KEY_SIZE &&
+			max_key_size <= SMP_MAX_ENC_KEY_SIZE) {
+		conn->smp_key_size = max_key_size;
+		return 0;
+	}
+
+	return SMP_ENC_KEY_SIZE;
+}
+
+/* Handle a received Pairing Request.
+ *
+ * Records the request in conn->preq, builds and records the response
+ * in conn->prsp, sends it and (re)arms the security timer.
+ *
+ * Returns 0 on success or an SMP failure reason: SMP_UNSPECIFIED for
+ * a truncated PDU, SMP_OOB_NOT_AVAIL when the peer asks for OOB data,
+ * SMP_ENC_KEY_SIZE when no acceptable key size can be agreed.
+ */
+static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct smp_cmd_pairing rsp, *req = (void *) skb->data;
+	u8 key_size;
+
+	BT_DBG("conn %p", conn);
+
+	/* The PDU comes from the remote device: make sure it really
+	 * carries a full request before reading it. */
+	if (skb->len < sizeof(*req))
+		return SMP_UNSPECIFIED;
+
+	conn->preq[0] = SMP_CMD_PAIRING_REQ;
+	memcpy(&conn->preq[1], req, sizeof(*req));
+	skb_pull(skb, sizeof(*req));
+
+	if (req->oob_flag)
+		return SMP_OOB_NOT_AVAIL;
+
+	/* We didn't start the pairing, so no requirements */
+	build_pairing_cmd(conn, req, &rsp, SMP_AUTH_NONE);
+
+	key_size = min(req->max_key_size, rsp.max_key_size);
+	if (check_enc_key_size(conn, key_size))
+		return SMP_ENC_KEY_SIZE;
+
+	/* Just works */
+	memset(conn->tk, 0, sizeof(conn->tk));
+
+	conn->prsp[0] = SMP_CMD_PAIRING_RSP;
+	memcpy(&conn->prsp[1], &rsp, sizeof(rsp));
+
+	smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp);
+
+	mod_timer(&conn->security_timer, jiffies +
+					msecs_to_jiffies(SMP_TIMEOUT));
+
+	return 0;
+}
+
+/* Handle a received Pairing Response (we sent the request).
+ *
+ * Validates the negotiated key size, records the response in
+ * conn->prsp, generates our random value in conn->prnd and sends the
+ * confirm value computed from it.
+ *
+ * Returns 0 on success or an SMP failure reason: SMP_UNSPECIFIED for
+ * a truncated PDU or a random/crypto failure, SMP_ENC_KEY_SIZE for an
+ * unacceptable key size, SMP_OOB_NOT_AVAIL when the peer asks for OOB
+ * data.
+ */
+static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct smp_cmd_pairing *req, *rsp = (void *) skb->data;
+	struct smp_cmd_pairing_confirm cp;
+	struct crypto_blkcipher *tfm = conn->hcon->hdev->tfm;
+	int ret;
+	u8 res[16], key_size;
+
+	BT_DBG("conn %p", conn);
+
+	/* The PDU comes from the remote device: make sure it really
+	 * carries a full response before reading it. */
+	if (skb->len < sizeof(*rsp))
+		return SMP_UNSPECIFIED;
+
+	skb_pull(skb, sizeof(*rsp));
+
+	/* The request we sent earlier, stored after its opcode byte */
+	req = (void *) &conn->preq[1];
+
+	key_size = min(req->max_key_size, rsp->max_key_size);
+	if (check_enc_key_size(conn, key_size))
+		return SMP_ENC_KEY_SIZE;
+
+	if (rsp->oob_flag)
+		return SMP_OOB_NOT_AVAIL;
+
+	/* Just works */
+	memset(conn->tk, 0, sizeof(conn->tk));
+
+	conn->prsp[0] = SMP_CMD_PAIRING_RSP;
+	memcpy(&conn->prsp[1], rsp, sizeof(*rsp));
+
+	ret = smp_rand(conn->prnd);
+	if (ret)
+		return SMP_UNSPECIFIED;
+
+	ret = smp_c1(tfm, conn->tk, conn->prnd, conn->preq, conn->prsp, 0,
+			conn->src, conn->hcon->dst_type, conn->dst, res);
+	if (ret)
+		return SMP_UNSPECIFIED;
+
+	swap128(res, cp.confirm_val);
+
+	smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp);
+
+	return 0;
+}
+
+/* Handle a received Pairing Confirm.
+ *
+ * Stores the peer's confirm value in conn->pcnf. On an outgoing link
+ * (hcon->out) we answer with our random value; otherwise we generate
+ * our random value and answer with our own confirm value. The
+ * security timer is (re)armed in both cases.
+ *
+ * Returns 0 on success or an SMP failure reason: SMP_UNSPECIFIED for
+ * a truncated PDU or a random-number failure, SMP_CONFIRM_FAILED when
+ * the confirm value cannot be computed.
+ */
+static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct crypto_blkcipher *tfm = conn->hcon->hdev->tfm;
+
+	BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave");
+
+	/* The PDU comes from the remote device: make sure it really
+	 * carries a full confirm value before copying it. */
+	if (skb->len < sizeof(conn->pcnf))
+		return SMP_UNSPECIFIED;
+
+	memcpy(conn->pcnf, skb->data, sizeof(conn->pcnf));
+	skb_pull(skb, sizeof(conn->pcnf));
+
+	if (conn->hcon->out) {
+		u8 random[16];
+
+		swap128(conn->prnd, random);
+		smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(random),
+								random);
+	} else {
+		struct smp_cmd_pairing_confirm cp;
+		int ret;
+		u8 res[16];
+
+		ret = smp_rand(conn->prnd);
+		if (ret)
+			return SMP_UNSPECIFIED;
+
+		ret = smp_c1(tfm, conn->tk, conn->prnd, conn->preq, conn->prsp,
+						conn->hcon->dst_type, conn->dst,
+						0, conn->src, res);
+		if (ret)
+			return SMP_CONFIRM_FAILED;
+
+		swap128(res, cp.confirm_val);
+
+		smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp);
+	}
+
+	mod_timer(&conn->security_timer, jiffies +
+					msecs_to_jiffies(SMP_TIMEOUT));
+
+	return 0;
+}
+
+/*
+ * Handle an incoming SMP Pairing Random.
+ *
+ * Recomputes the confirm value from the peer's random value and compares
+ * it with the one saved in conn->pcnf.  On a match the short term key is
+ * derived with smp_s1() and truncated to conn->smp_key_size; the
+ * connection creator (hcon->out) then starts encryption, the other side
+ * sends its own random value and stores the key through hci_add_ltk().
+ *
+ * Returns 0 on success or an SMP reason code.
+ */
+static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct hci_conn *hcon = conn->hcon;
+	struct crypto_blkcipher *tfm = hcon->hdev->tfm;
+	int ret;
+	u8 key[16], res[16], random[16], confirm[16];
+
+	/* The payload must hold a full random value before it is swapped. */
+	if (skb->len < sizeof(random))
+		return SMP_UNSPECIFIED;
+
+	swap128(skb->data, random);
+	skb_pull(skb, sizeof(random));
+
+	if (hcon->out)
+		ret = smp_c1(tfm, conn->tk, random, conn->preq, conn->prsp, 0,
+				conn->src, hcon->dst_type, conn->dst,
+				res);
+	else
+		ret = smp_c1(tfm, conn->tk, random, conn->preq, conn->prsp,
+				hcon->dst_type, conn->dst, 0, conn->src,
+				res);
+	if (ret)
+		return SMP_UNSPECIFIED;
+
+	BT_DBG("conn %p %s", conn, hcon->out ? "master" : "slave");
+
+	swap128(res, confirm);
+
+	if (memcmp(conn->pcnf, confirm, sizeof(conn->pcnf)) != 0) {
+		BT_ERR("Pairing failed (confirmation values mismatch)");
+		return SMP_CONFIRM_FAILED;
+	}
+
+	if (hcon->out) {
+		u8 stk[16], rand[8];
+		__le16 ediv;
+
+		memset(rand, 0, sizeof(rand));
+		ediv = 0;
+
+		smp_s1(tfm, conn->tk, random, conn->prnd, key);
+		swap128(key, stk);
+
+		/* Zero the bytes beyond the negotiated key size. */
+		memset(stk + conn->smp_key_size, 0,
+				SMP_MAX_ENC_KEY_SIZE - conn->smp_key_size);
+
+		hci_le_start_enc(hcon, ediv, rand, stk);
+		hcon->enc_key_size = conn->smp_key_size;
+	} else {
+		u8 stk[16], r[16], rand[8];
+		__le16 ediv;
+
+		memset(rand, 0, sizeof(rand));
+		ediv = 0;
+
+		swap128(conn->prnd, r);
+		smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(r), r);
+
+		smp_s1(tfm, conn->tk, conn->prnd, random, key);
+		swap128(key, stk);
+
+		/* Zero the bytes beyond the negotiated key size. */
+		memset(stk + conn->smp_key_size, 0,
+				SMP_MAX_ENC_KEY_SIZE - conn->smp_key_size);
+
+		hci_add_ltk(hcon->hdev, 0, conn->dst, conn->smp_key_size,
+							ediv, rand, stk);
+	}
+
+	return 0;
+}
+
+/*
+ * Handle an incoming SMP Security Request.
+ *
+ * Does nothing while encryption is already pending on the link.
+ * Otherwise builds a Pairing Request from the auth_req value carried in
+ * the request, stores it in conn->preq, sends it, re-arms the security
+ * timer and marks encryption as pending.
+ *
+ * Returns 0, or SMP_UNSPECIFIED when the PDU is truncated.
+ */
+static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct smp_cmd_security_req *rp = (void *) skb->data;
+	struct smp_cmd_pairing cp;
+	struct hci_conn *hcon = conn->hcon;
+
+	BT_DBG("conn %p", conn);
+
+	if (test_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend))
+		return 0;
+
+	/* rp->auth_req is read below; make sure it was actually received. */
+	if (skb->len < sizeof(*rp))
+		return SMP_UNSPECIFIED;
+
+	skb_pull(skb, sizeof(*rp));
+
+	memset(&cp, 0, sizeof(cp));
+	build_pairing_cmd(conn, &cp, NULL, rp->auth_req);
+
+	conn->preq[0] = SMP_CMD_PAIRING_REQ;
+	memcpy(&conn->preq[1], &cp, sizeof(cp));
+
+	smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+
+	mod_timer(&conn->security_timer, jiffies +
+					msecs_to_jiffies(SMP_TIMEOUT));
+
+	set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend);
+
+	return 0;
+}
+
+/*
+ * Request that the LE link reach at least @sec_level.
+ *
+ * Returns 1 without starting anything when LE is not supported by the
+ * host, when the crypto transform is unavailable, when BT_SECURITY_LOW is
+ * requested or when the link already has a sufficient level.  Returns 0
+ * when encryption is already pending or after a procedure was started.
+ * NOTE(review): callers presumably treat 1 as "nothing left to do" -
+ * confirm against the call sites.
+ */
+int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level)
+{
+	struct hci_conn *hcon = conn->hcon;
+	__u8 authreq;
+
+	BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level);
+
+	if (!lmp_host_le_capable(hcon->hdev))
+		return 1;
+
+	if (IS_ERR(hcon->hdev->tfm))
+		return 1;
+
+	if (test_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend))
+		return 0;
+
+	if (sec_level == BT_SECURITY_LOW)
+		return 1;
+
+	if (hcon->sec_level >= sec_level)
+		return 1;
+
+	authreq = seclevel_to_authreq(sec_level);
+
+	if (hcon->link_mode & HCI_LM_MASTER) {
+		struct smp_cmd_pairing cp;
+		struct link_key *key;
+
+		/* Reuse a stored LTK for this peer instead of pairing again. */
+		key = hci_find_link_key_type(hcon->hdev, conn->dst,
+							HCI_LK_SMP_LTK);
+		if (key) {
+			struct key_master_id *master = (void *) key->data;
+
+			hci_le_start_enc(hcon, master->ediv, master->rand,
+								key->val);
+			hcon->enc_key_size = key->pin_len;
+
+			goto done;
+		}
+
+		/* No stored key: start pairing with a Pairing Request. */
+		build_pairing_cmd(conn, &cp, NULL, authreq);
+		conn->preq[0] = SMP_CMD_PAIRING_REQ;
+		memcpy(&conn->preq[1], &cp, sizeof(cp));
+
+		mod_timer(&conn->security_timer, jiffies +
+					msecs_to_jiffies(SMP_TIMEOUT));
+
+		smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+	} else {
+		/* Not master: send a Security Request to the peer instead. */
+		struct smp_cmd_security_req cp;
+		cp.auth_req = authreq;
+		smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
+	}
+
+done:
+	hcon->pending_sec_level = sec_level;
+	set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->pend);
+
+	return 0;
+}
+
+/*
+ * Handle an SMP Encryption Information PDU: keep the received LTK in
+ * conn->tk until the matching Master Identification arrives.
+ *
+ * Returns 0, or SMP_UNSPECIFIED when the PDU is truncated.
+ */
+static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct smp_cmd_encrypt_info *rp = (void *) skb->data;
+
+	/* rp->ltk is copied below; reject PDUs too short to contain it. */
+	if (skb->len < sizeof(*rp))
+		return SMP_UNSPECIFIED;
+
+	skb_pull(skb, sizeof(*rp));
+
+	memcpy(conn->tk, rp->ltk, sizeof(conn->tk));
+
+	return 0;
+}
+
+/*
+ * Handle an SMP Master Identification PDU: store the LTK previously saved
+ * in conn->tk together with the received ediv/rand, then distribute our
+ * own keys.
+ *
+ * Returns 0, or SMP_UNSPECIFIED when the PDU is truncated.
+ */
+static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	struct smp_cmd_master_ident *rp = (void *) skb->data;
+
+	/* rp->ediv and rp->rand are read below. */
+	if (skb->len < sizeof(*rp))
+		return SMP_UNSPECIFIED;
+
+	skb_pull(skb, sizeof(*rp));
+
+	/*
+	 * File the key under the peer's address: smp_conn_security() looks
+	 * stored LTKs up by conn->dst, so a key stored under conn->src would
+	 * never be found for this peer.
+	 */
+	hci_add_ltk(conn->hcon->hdev, 1, conn->dst, conn->smp_key_size,
+						rp->ediv, rp->rand, conn->tk);
+
+	smp_distribute_keys(conn, 1);
+
+	return 0;
+}
+
+/*
+ * Entry point for PDUs received on the SMP channel.
+ *
+ * Dispatches on the opcode in the first byte.  A handler returning a
+ * non-zero SMP reason code causes a Pairing Failed PDU carrying that
+ * reason to be sent.  The skb is always consumed.
+ *
+ * Returns 0 or a negative errno.
+ */
+int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+	__u8 code;
+	__u8 reason;
+	int err = 0;
+
+	/* The opcode lives in skb->data[0]; an empty PDU has none. */
+	if (skb->len < 1) {
+		kfree_skb(skb);
+		return -EILSEQ;
+	}
+
+	if (!lmp_host_le_capable(conn->hcon->hdev)) {
+		err = -ENOTSUPP;
+		reason = SMP_PAIRING_NOTSUPP;
+		goto done;
+	}
+
+	if (IS_ERR(conn->hcon->hdev->tfm)) {
+		err = PTR_ERR(conn->hcon->hdev->tfm);
+		reason = SMP_PAIRING_NOTSUPP;
+		goto done;
+	}
+
+	code = skb->data[0];
+	skb_pull(skb, sizeof(code));
+
+	switch (code) {
+	case SMP_CMD_PAIRING_REQ:
+		reason = smp_cmd_pairing_req(conn, skb);
+		break;
+
+	case SMP_CMD_PAIRING_FAIL:
+		/* Nothing is sent back in answer to a failure report. */
+		reason = 0;
+		err = -EPERM;
+		break;
+
+	case SMP_CMD_PAIRING_RSP:
+		reason = smp_cmd_pairing_rsp(conn, skb);
+		break;
+
+	case SMP_CMD_SECURITY_REQ:
+		reason = smp_cmd_security_req(conn, skb);
+		break;
+
+	case SMP_CMD_PAIRING_CONFIRM:
+		reason = smp_cmd_pairing_confirm(conn, skb);
+		break;
+
+	case SMP_CMD_PAIRING_RANDOM:
+		reason = smp_cmd_pairing_random(conn, skb);
+		break;
+
+	case SMP_CMD_ENCRYPT_INFO:
+		reason = smp_cmd_encrypt_info(conn, skb);
+		break;
+
+	case SMP_CMD_MASTER_IDENT:
+		reason = smp_cmd_master_ident(conn, skb);
+		break;
+
+	case SMP_CMD_IDENT_INFO:
+	case SMP_CMD_IDENT_ADDR_INFO:
+	case SMP_CMD_SIGN_INFO:
+		/* Just ignored */
+		reason = 0;
+		break;
+
+	default:
+		BT_DBG("Unknown command code 0x%2.2x", code);
+
+		reason = SMP_CMD_NOTSUPP;
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+done:
+	if (reason)
+		smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason),
+								&reason);
+
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Send the keys this side agreed to distribute during pairing.
+ *
+ * The set of keys is the intersection of the request and response key
+ * distribution fields for our role; each bit is cleared in conn->prsp
+ * once the corresponding key has been sent.  Unless @force is set, the
+ * connection creator returns early while the responder still has keys
+ * to send.
+ *
+ * Returns 0, or the error encoded in hdev->tfm when no crypto transform
+ * is available.
+ */
+int smp_distribute_keys(struct l2cap_conn *conn, __u8 force)
+{
+	struct smp_cmd_pairing *req, *rsp;
+	__u8 *keydist;
+
+	BT_DBG("conn %p force %d", conn, force);
+
+	if (IS_ERR(conn->hcon->hdev->tfm))
+		return PTR_ERR(conn->hcon->hdev->tfm);
+
+	rsp = (void *) &conn->prsp[1];
+
+	/* The responder sends its keys first */
+	if (!force && conn->hcon->out && (rsp->resp_key_dist & 0x07))
+		return 0;
+
+	req = (void *) &conn->preq[1];
+
+	if (conn->hcon->out) {
+		keydist = &rsp->init_key_dist;
+		*keydist &= req->init_key_dist;
+	} else {
+		keydist = &rsp->resp_key_dist;
+		*keydist &= req->resp_key_dist;
+	}
+
+
+	BT_DBG("keydist 0x%x", *keydist);
+
+	if (*keydist & SMP_DIST_ENC_KEY) {
+		struct smp_cmd_encrypt_info enc;
+		struct smp_cmd_master_ident ident;
+		__le16 ediv;
+
+		get_random_bytes(enc.ltk, sizeof(enc.ltk));
+		get_random_bytes(&ediv, sizeof(ediv));
+		get_random_bytes(ident.rand, sizeof(ident.rand));
+
+		smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc);
+
+		hci_add_ltk(conn->hcon->hdev, 1, conn->dst, conn->smp_key_size,
+						ediv, ident.rand, enc.ltk);
+
+		/*
+		 * ediv is already __le16; converting it again would make the
+		 * value sent to the peer differ from the one just stored on
+		 * big-endian hosts.
+		 */
+		ident.ediv = ediv;
+
+		smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident);
+
+		*keydist &= ~SMP_DIST_ENC_KEY;
+	}
+
+	if (*keydist & SMP_DIST_ID_KEY) {
+		struct smp_cmd_ident_addr_info addrinfo;
+		struct smp_cmd_ident_info idinfo;
+
+		/* Send a dummy key */
+		get_random_bytes(idinfo.irk, sizeof(idinfo.irk));
+
+		smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo);
+
+		/* Just public address */
+		memset(&addrinfo, 0, sizeof(addrinfo));
+		bacpy(&addrinfo.bdaddr, conn->src);
+
+		smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo),
+								&addrinfo);
+
+		*keydist &= ~SMP_DIST_ID_KEY;
+	}
+
+	if (*keydist & SMP_DIST_SIGN) {
+		struct smp_cmd_sign_info sign;
+
+		/* Send a dummy key */
+		get_random_bytes(sign.csrk, sizeof(sign.csrk));
+
+		smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign);
+
+		*keydist &= ~SMP_DIST_SIGN;
+	}
+
+	return 0;
+}
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 00000000..6dee7bf6
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,48 @@
+#
+# 802.1d Ethernet Bridging
+#
+
+config BRIDGE
+	tristate "802.1d Ethernet Bridging"
+	select LLC
+	select STP
+	depends on IPV6 || IPV6=n
+	---help---
+	  If you say Y here, then your Linux box will be able to act as an
+	  Ethernet bridge, which means that the different Ethernet segments it
+	  is connected to will appear as one Ethernet to the participants.
+	  Several such bridges can work together to create even larger
+	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
+	  As this is a standard, Linux bridges will cooperate properly with
+	  other third party bridge products.
+
+	  In order to use the Ethernet bridge, you'll need the bridge
+	  configuration tools; see <file:Documentation/networking/bridge.txt>
+	  for location. Please read the Bridge mini-HOWTO for more
+	  information.
+
+	  If you enable iptables support along with the bridge support then you
+	  turn your bridge into a bridging IP firewall.
+	  iptables will then see the IP packets being bridged, so you need to
+	  take this into account when setting up your firewall rules.
+	  Enabling arptables support when bridging will let arptables see
+	  bridged ARP traffic in the arptables FORWARD chain.
+
+	  To compile this code as a module, choose M here: the module
+	  will be called bridge.
+
+	  If unsure, say N.
+
+config BRIDGE_IGMP_SNOOPING
+	bool "IGMP/MLD snooping"
+	depends on BRIDGE
+	depends on INET
+	default y
+	---help---
+	  If you say Y here, then the Ethernet bridge will be able to
+	  selectively forward multicast traffic based on IGMP/MLD traffic
+	  received from each port.
+
+	  Say N to exclude this support and reduce the binary size.
+
+	  If unsure, say Y.
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
new file mode 100644
index 00000000..d0359ea8
--- /dev/null
+++ b/net/bridge/Makefile
@@ -0,0 +1,17 @@
+#
+# Makefile for the IEEE 802.1d ethernet bridging layer.
+#
+
+obj-$(CONFIG_BRIDGE) += bridge.o
+
+bridge-y	:= br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
+			br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \
+			br_stp_if.o br_stp_timer.o br_netlink.o
+
+bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
+
+bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
+
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o
+
+obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/
diff --git a/net/bridge/br.c b/net/bridge/br.c
new file mode 100644
index 00000000..f20c4fd9
--- /dev/null
+++ b/net/bridge/br.c
@@ -0,0 +1,107 @@
+/*
+ *	Generic parts
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/llc.h>
+#include <net/llc.h>
+#include <net/stp.h>
+
+#include "br_private.h"
+
+/* STP protocol descriptor registered by br_init(); .rcv is br_stp_rcv(). */
+static const struct stp_proto br_stp_proto = {
+	.rcv	= br_stp_rcv,
+};
+
+/* Per-network-namespace operations; only an exit handler is supplied. */
+static struct pernet_operations br_net_ops = {
+	.exit	= br_net_exit,
+};
+
+/*
+ * Module initialisation.
+ *
+ * Registers, in order: the STP protocol handler, the forwarding database
+ * cache, the per-namespace operations, bridge netfilter, the netdevice
+ * notifier and the netlink interface, then installs the ioctl stub.
+ * A failure at any step undoes the steps already completed, in reverse
+ * order, and returns the error.
+ */
+static int __init br_init(void)
+{
+	int err;
+
+	err = stp_proto_register(&br_stp_proto);
+	if (err < 0) {
+		pr_err("bridge: can't register sap for STP\n");
+		return err;
+	}
+
+	err = br_fdb_init();
+	if (err)
+		goto err_out;
+
+	err = register_pernet_subsys(&br_net_ops);
+	if (err)
+		goto err_out1;
+
+	err = br_netfilter_init();
+	if (err)
+		goto err_out2;
+
+	err = register_netdevice_notifier(&br_device_notifier);
+	if (err)
+		goto err_out3;
+
+	err = br_netlink_init();
+	if (err)
+		goto err_out4;
+
+	brioctl_set(br_ioctl_deviceless_stub);
+
+#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+	br_fdb_test_addr_hook = br_fdb_test_addr;
+#endif
+
+	return 0;
+	/* Unwind ladder: each label undoes the step before the failing one. */
+err_out4:
+	unregister_netdevice_notifier(&br_device_notifier);
+err_out3:
+	br_netfilter_fini();
+err_out2:
+	unregister_pernet_subsys(&br_net_ops);
+err_out1:
+	br_fdb_fini();
+err_out:
+	stp_proto_unregister(&br_stp_proto);
+	return err;
+}
+
+/*
+ * Module teardown: unregisters everything br_init() set up.  The
+ * forwarding database cache is destroyed last, after rcu_barrier() has
+ * waited for outstanding call_rcu() callbacks.
+ */
+static void __exit br_deinit(void)
+{
+	stp_proto_unregister(&br_stp_proto);
+
+	br_netlink_fini();
+	unregister_netdevice_notifier(&br_device_notifier);
+	brioctl_set(NULL);
+
+	unregister_pernet_subsys(&br_net_ops);
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+	br_netfilter_fini();
+#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+	br_fdb_test_addr_hook = NULL;
+#endif
+
+	br_fdb_fini();
+}
+
+module_init(br_init)
+module_exit(br_deinit)
+MODULE_LICENSE("GPL");
+MODULE_VERSION(BR_VERSION);
+MODULE_ALIAS_RTNL_LINK("bridge");
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
new file mode 100644
index 00000000..dac6a214
--- /dev/null
+++ b/net/bridge/br_device.c
@@ -0,0 +1,371 @@
+/*
+ *	Device handling code
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/list.h>
+#include <linux/netfilter_bridge.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+
+/*
+ * Transmit handler of the bridge device itself (net device transmit is
+ * always called with BH disabled).
+ *
+ * Updates the per-CPU tx counters, then picks a delivery path from the
+ * destination MAC at the start of the frame: flood for broadcast, the
+ * multicast database for multicast, the forwarding database for unicast,
+ * with flooding as the fallback.  Always returns NETDEV_TX_OK.
+ */
+netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	const unsigned char *dest = skb->data;
+	struct net_bridge_fdb_entry *dst;
+	struct net_bridge_mdb_entry *mdst;
+	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+		br_nf_pre_routing_finish_bridge_slow(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+
+	BR_INPUT_SKB_CB(skb)->brdev = dev;
+
+	/* dest keeps pointing at the MAC header after the pull below. */
+	skb_reset_mac_header(skb);
+	skb_pull(skb, ETH_HLEN);
+
+	u64_stats_update_begin(&brstats->syncp);
+	brstats->tx_packets++;
+	/* Exclude ETH_HLEN from byte stats for consistency with Rx chain */
+	brstats->tx_bytes += skb->len;
+	u64_stats_update_end(&brstats->syncp);
+
+	rcu_read_lock();
+	if (is_broadcast_ether_addr(dest))
+		br_flood_deliver(br, skb);
+	else if (is_multicast_ether_addr(dest)) {
+		/* While netpoll is transmitting, just flood. */
+		if (unlikely(netpoll_tx_running(dev))) {
+			br_flood_deliver(br, skb);
+			goto out;
+		}
+		if (br_multicast_rcv(br, NULL, skb)) {
+			kfree_skb(skb);
+			goto out;
+		}
+
+		mdst = br_mdb_get(br, skb);
+		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb))
+			br_multicast_deliver(mdst, skb);
+		else
+			br_flood_deliver(br, skb);
+	} else if ((dst = __br_fdb_get(br, dest)) != NULL)
+		br_deliver(dst->dst, skb);
+	else
+		br_flood_deliver(br, skb);
+
+out:
+	rcu_read_unlock();
+	return NETDEV_TX_OK;
+}
+
+/* ndo_init: allocate the per-CPU statistics; -ENOMEM if that fails. */
+static int br_dev_init(struct net_device *dev)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+
+	bridge->stats = alloc_percpu(struct br_cpu_netstats);
+
+	return bridge->stats ? 0 : -ENOMEM;
+}
+
+/*
+ * ndo_open: refresh the device features, start the tx queue, then enable
+ * STP and multicast handling for the bridge.  Always returns 0.
+ */
+static int br_dev_open(struct net_device *dev)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+
+	netdev_update_features(dev);
+	netif_start_queue(dev);
+
+	br_stp_enable_bridge(bridge);
+	br_multicast_open(bridge);
+
+	return 0;
+}
+
+/* ndo_set_multicast_list hook; the body is empty, so this is a no-op. */
+static void br_dev_set_multicast_list(struct net_device *dev)
+{
+}
+
+/*
+ * ndo_stop: disable STP and multicast handling for the bridge, then stop
+ * the tx queue.  Always returns 0.
+ */
+static int br_dev_stop(struct net_device *dev)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+
+	br_stp_disable_bridge(bridge);
+	br_multicast_stop(bridge);
+	netif_stop_queue(dev);
+
+	return 0;
+}
+
+/*
+ * ndo_get_stats64: sum the per-CPU rx/tx packet and byte counters over
+ * all possible CPUs and store the totals in @stats, which is returned.
+ */
+static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
+						struct rtnl_link_stats64 *stats)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct br_cpu_netstats snap, total = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const struct br_cpu_netstats *pcpu
+			= per_cpu_ptr(br->stats, cpu);
+		unsigned int seq;
+
+		/* Retry until the copy was not disturbed by a writer. */
+		do {
+			seq = u64_stats_fetch_begin(&pcpu->syncp);
+			memcpy(&snap, pcpu, sizeof(snap));
+		} while (u64_stats_fetch_retry(&pcpu->syncp, seq));
+
+		total.rx_packets += snap.rx_packets;
+		total.rx_bytes   += snap.rx_bytes;
+		total.tx_packets += snap.tx_packets;
+		total.tx_bytes   += snap.tx_bytes;
+	}
+
+	stats->rx_packets = total.rx_packets;
+	stats->rx_bytes   = total.rx_bytes;
+	stats->tx_packets = total.tx_packets;
+	stats->tx_bytes   = total.tx_bytes;
+
+	return stats;
+}
+
+/*
+ * ndo_change_mtu: accept an MTU between 68 and br_min_mtu() inclusive,
+ * otherwise return -EINVAL.
+ */
+static int br_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+
+	if (new_mtu < 68)
+		return -EINVAL;
+	if (new_mtu > br_min_mtu(bridge))
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	/* remember the MTU in the rtable for PMTU */
+	dst_metric_set(&bridge->fake_rtable.dst, RTAX_MTU, new_mtu);
+#endif
+
+	return 0;
+}
+
+/*
+ * ndo_set_mac_address: accept any valid ethernet address.  Under
+ * br->lock the device address and the STP bridge id are updated and
+ * BR_SET_MAC_ADDR is recorded in the bridge flags.
+ */
+static int br_set_mac_address(struct net_device *dev, void *p)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+	struct sockaddr *sa = p;
+
+	if (!is_valid_ether_addr(sa->sa_data))
+		return -EINVAL;
+
+	spin_lock_bh(&bridge->lock);
+
+	memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
+	br_stp_change_bridge_id(bridge, sa->sa_data);
+	bridge->flags |= BR_SET_MAC_ADDR;
+
+	spin_unlock_bh(&bridge->lock);
+
+	return 0;
+}
+
+/*
+ * ethtool get_drvinfo: report driver name and version.  The copies are
+ * bounded by the size of each destination field so that a longer
+ * BR_VERSION can never overrun the structure.
+ */
+static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, "bridge", sizeof(info->driver));
+	strlcpy(info->version, BR_VERSION, sizeof(info->version));
+	strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+	strlcpy(info->bus_info, "N/A", sizeof(info->bus_info));
+}
+
+/* ndo_fix_features: defer to br_features_recompute() for this bridge. */
+static u32 br_fix_features(struct net_device *dev, u32 features)
+{
+	return br_features_recompute(netdev_priv(dev), features);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* ndo_poll_controller hook; the body is empty, so this is a no-op. */
+static void br_poll_controller(struct net_device *br_dev)
+{
+}
+
+/* ndo_netpoll_cleanup: disable netpoll on every port of the bridge. */
+static void br_netpoll_cleanup(struct net_device *dev)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+	struct net_bridge_port *port, *next;
+
+	list_for_each_entry_safe(port, next, &bridge->port_list, list)
+		br_netpoll_disable(port);
+}
+
+/*
+ * ndo_netpoll_setup: enable netpoll on every port that has a device.
+ * On the first failure netpoll is disabled again on all ports and the
+ * error is returned; otherwise returns 0.
+ */
+static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+	struct net_bridge_port *port, *next;
+	int err;
+
+	list_for_each_entry_safe(port, next, &bridge->port_list, list) {
+		if (!port->dev)
+			continue;
+
+		err = br_netpoll_enable(port);
+		if (err) {
+			br_netpoll_cleanup(dev);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a netpoll instance for port @p, bind it to the port's device
+ * and set it up.  p->np is only assigned on success.
+ *
+ * Returns 0, -ENOMEM, or the error from __netpoll_setup().
+ */
+int br_netpoll_enable(struct net_bridge_port *p)
+{
+	struct netpoll *np;
+	int err;
+
+	np = kzalloc(sizeof(*p->np), GFP_KERNEL);
+	if (!np)
+		return -ENOMEM;
+
+	np->dev = p->dev;
+	strlcpy(np->dev_name, p->dev->name, IFNAMSIZ);
+
+	err = __netpoll_setup(np);
+	if (err) {
+		kfree(np);
+		return err;
+	}
+
+	p->np = np;
+	return 0;
+}
+
+/*
+ * Detach and free the netpoll instance of port @p, if it has one.
+ * p->np is cleared first so the instance is no longer reachable through
+ * the port before it is cleaned up and freed.
+ */
+void br_netpoll_disable(struct net_bridge_port *p)
+{
+	struct netpoll *np = p->np;
+
+	if (!np)
+		return;
+
+	p->np = NULL;
+
+	/* Wait for transmitting packets to finish before freeing. */
+	synchronize_rcu_bh();
+
+	__netpoll_cleanup(np);
+	kfree(np);
+}
+
+#endif
+
+/* ndo_add_slave: add @slave_dev as a port of the bridge behind @dev. */
+static int br_add_slave(struct net_device *dev, struct net_device *slave_dev)
+{
+	return br_add_if(netdev_priv(dev), slave_dev);
+}
+
+/* ndo_del_slave: remove @slave_dev from the bridge behind @dev. */
+static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
+{
+	return br_del_if(netdev_priv(dev), slave_dev);
+}
+
+/* ethtool operations of the bridge device: driver info and link state. */
+static const struct ethtool_ops br_ethtool_ops = {
+	.get_drvinfo    = br_getinfo,
+	.get_link	= ethtool_op_get_link,
+};
+
+/*
+ * net_device operations of the bridge device, installed by
+ * br_dev_setup().  The netpoll hooks exist only with
+ * CONFIG_NET_POLL_CONTROLLER.
+ */
+static const struct net_device_ops br_netdev_ops = {
+	.ndo_open		 = br_dev_open,
+	.ndo_stop		 = br_dev_stop,
+	.ndo_init		 = br_dev_init,
+	.ndo_start_xmit		 = br_dev_xmit,
+	.ndo_get_stats64	 = br_get_stats64,
+	.ndo_set_mac_address	 = br_set_mac_address,
+	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
+	.ndo_change_mtu		 = br_change_mtu,
+	.ndo_do_ioctl		 = br_dev_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_setup	 = br_netpoll_setup,
+	.ndo_netpoll_cleanup	 = br_netpoll_cleanup,
+	.ndo_poll_controller	 = br_poll_controller,
+#endif
+	.ndo_add_slave		 = br_add_slave,
+	.ndo_del_slave		 = br_del_slave,
+	.ndo_fix_features        = br_fix_features,
+};
+
+/*
+ * Device destructor: release the per-CPU statistics allocated by
+ * br_dev_init(), then the net_device itself.
+ */
+static void br_dev_free(struct net_device *dev)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+
+	free_percpu(bridge->stats);
+	free_netdev(dev);
+}
+
+/* Device type attached to bridge devices via SET_NETDEV_DEVTYPE(). */
+static struct device_type br_type = {
+	.name	= "bridge",
+};
+
+/*
+ * Initialise a freshly allocated bridge net_device and its private
+ * struct net_bridge: random MAC address, ethernet defaults, operation
+ * tables, feature flags, locks, port list, default bridge id and STP
+ * timing values, followed by the netfilter, STP timer and multicast
+ * initialisers.
+ */
+void br_dev_setup(struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	random_ether_addr(dev->dev_addr);
+	ether_setup(dev);
+
+	dev->netdev_ops = &br_netdev_ops;
+	dev->destructor = br_dev_free;
+	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
+	SET_NETDEV_DEVTYPE(dev, &br_type);
+	dev->tx_queue_len = 0;
+	dev->priv_flags = IFF_EBRIDGE;
+
+	/* hw_features omits NETIF_F_LLTX and NETIF_F_NETNS_LOCAL. */
+	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
+			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
+			NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_TX;
+	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
+			   NETIF_F_GSO_MASK | NETIF_F_NO_CSUM |
+			   NETIF_F_HW_VLAN_TX;
+
+	br->dev = dev;
+	spin_lock_init(&br->lock);
+	INIT_LIST_HEAD(&br->port_list);
+	spin_lock_init(&br->hash_lock);
+
+	/* Default bridge priority bytes: 0x80, 0x00. */
+	br->bridge_id.prio[0] = 0x80;
+	br->bridge_id.prio[1] = 0x00;
+
+	memcpy(br->group_addr, br_group_address, ETH_ALEN);
+
+	/* STP starts disabled; timing defaults are expressed in jiffies. */
+	br->stp_enabled = BR_NO_STP;
+	br->designated_root = br->bridge_id;
+	br->bridge_max_age = br->max_age = 20 * HZ;
+	br->bridge_hello_time = br->hello_time = 2 * HZ;
+	br->bridge_forward_delay = br->forward_delay = 15 * HZ;
+	br->ageing_time = 300 * HZ;
+
+	br_netfilter_rtable_init(br);
+	br_stp_timer_init(br);
+	br_multicast_init(br);
+}
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
new file mode 100644
index 00000000..e0dfbc15
--- /dev/null
+++ b/net/bridge/br_fdb.c
@@ -0,0 +1,696 @@
+/*
+ *	Forwarding database
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/times.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <asm/atomic.h>
+#include <asm/unaligned.h>
+#include "br_private.h"
+
+static struct kmem_cache *br_fdb_cache __read_mostly;
+static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+		      const unsigned char *addr);
+static void fdb_notify(const struct net_bridge_fdb_entry *, int);
+
+static u32 fdb_salt __read_mostly;
+
+/*
+ * Create the slab cache for forwarding database entries and pick a
+ * random salt for the MAC hash.  Returns 0, or -ENOMEM when the cache
+ * cannot be created.
+ */
+int __init br_fdb_init(void)
+{
+	br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
+					 sizeof(struct net_bridge_fdb_entry),
+					 0,
+					 SLAB_HWCACHE_ALIGN, NULL);
+	if (br_fdb_cache) {
+		get_random_bytes(&fdb_salt, sizeof(fdb_salt));
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/* Destroy the slab cache created by br_fdb_init(). */
+void br_fdb_fini(void)
+{
+	kmem_cache_destroy(br_fdb_cache);
+}
+
+
+/*
+ * How long a learned entry is kept: forward_delay (default 15 sec)
+ * while the topology is changing, ageing_time (default 5 minutes)
+ * otherwise.
+ */
+static inline unsigned long hold_time(const struct net_bridge *br)
+{
+	if (br->topology_change)
+		return br->forward_delay;
+
+	return br->ageing_time;
+}
+
+/*
+ * Non-zero when @fdb is a non-static entry whose last update is at least
+ * hold_time() in the past.  Static entries never expire.
+ */
+static inline int has_expired(const struct net_bridge *br,
+				  const struct net_bridge_fdb_entry *fdb)
+{
+	if (fdb->is_static)
+		return 0;
+
+	return time_before_eq(fdb->updated + hold_time(br), jiffies);
+}
+
+/*
+ * Hash a MAC address into a bucket index in [0, BR_HASH_SIZE).  The key
+ * is the last four bytes of the address, mixed with the per-boot
+ * fdb_salt.
+ */
+static inline int br_mac_hash(const unsigned char *mac)
+{
+	/* use 1 byte of OUI and 3 bytes of NIC */
+	u32 key = get_unaligned((u32 *)(mac + 2));
+	return jhash_1word(key, fdb_salt) & (BR_HASH_SIZE - 1);
+}
+
+/* RCU callback: return the entry embedding @head to the fdb cache. */
+static void fdb_rcu_free(struct rcu_head *head)
+{
+	struct net_bridge_fdb_entry *entry;
+
+	entry = container_of(head, struct net_bridge_fdb_entry, rcu);
+	kmem_cache_free(br_fdb_cache, entry);
+}
+
+/*
+ * Remove @f from its hash chain: announce RTM_DELNEIGH, unlink the entry
+ * with the RCU-safe hlist primitive and defer the free to fdb_rcu_free()
+ * via call_rcu().
+ */
+static inline void fdb_delete(struct net_bridge_fdb_entry *f)
+{
+	fdb_notify(f, RTM_DELNEIGH);
+	hlist_del_rcu(&f->hlist);
+	call_rcu(&f->rcu, fdb_rcu_free);
+}
+
+/*
+ * Replace the local forwarding entry of port @p after its hardware
+ * address changed to @newaddr.
+ *
+ * The first local entry owned by @p is either handed to another port
+ * that has the same address or deleted; the new address is then
+ * inserted.  All of this runs under br->hash_lock.
+ */
+void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
+{
+	struct net_bridge *br = p->br;
+	int i;
+
+	spin_lock_bh(&br->hash_lock);
+
+	/* Search all chains since old address/hash is unknown */
+	for (i = 0; i < BR_HASH_SIZE; i++) {
+		struct hlist_node *h;
+		hlist_for_each(h, &br->hash[i]) {
+			struct net_bridge_fdb_entry *f;
+
+			f = hlist_entry(h, struct net_bridge_fdb_entry, hlist);
+			if (f->dst == p && f->is_local) {
+				/* maybe another port has same hw addr? */
+				struct net_bridge_port *op;
+				list_for_each_entry(op, &br->port_list, list) {
+					if (op != p &&
+					    !compare_ether_addr(op->dev->dev_addr,
+								f->addr.addr)) {
+						f->dst = op;
+						goto insert;
+					}
+				}
+
+				/* delete old one */
+				fdb_delete(f);
+				goto insert;
+			}
+		}
+	}
+ insert:
+	/* insert new address,  may fail if invalid address or dup. */
+	fdb_insert(br, p, newaddr);
+
+	spin_unlock_bh(&br->hash_lock);
+}
+
+/*
+ * Age out forwarding entries.  @_data is the struct net_bridge pointer
+ * cast to unsigned long (presumably the gc_timer callback argument -
+ * confirm where the timer is set up).
+ *
+ * Non-static entries not updated within hold_time() are deleted.
+ * br->gc_timer is then re-armed for the earliest remaining expiry, at
+ * most ageing_time from now.
+ */
+void br_fdb_cleanup(unsigned long _data)
+{
+	struct net_bridge *br = (struct net_bridge *)_data;
+	unsigned long delay = hold_time(br);
+	unsigned long next_timer = jiffies + br->ageing_time;
+	int i;
+
+	spin_lock_bh(&br->hash_lock);
+	for (i = 0; i < BR_HASH_SIZE; i++) {
+		struct net_bridge_fdb_entry *f;
+		struct hlist_node *h, *n;
+
+		hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) {
+			unsigned long this_timer;
+			if (f->is_static)
+				continue;
+			this_timer = f->updated + delay;
+			if (time_before_eq(this_timer, jiffies))
+				fdb_delete(f);
+			else if (time_before(this_timer, next_timer))
+				next_timer = this_timer;
+		}
+	}
+	spin_unlock_bh(&br->hash_lock);
+
+	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+}
+
+/* Delete every non-static forwarding entry, holding br->hash_lock. */
+void br_fdb_flush(struct net_bridge *br)
+{
+	int bucket;
+
+	spin_lock_bh(&br->hash_lock);
+	for (bucket = 0; bucket < BR_HASH_SIZE; bucket++) {
+		struct net_bridge_fdb_entry *entry;
+		struct hlist_node *pos, *tmp;
+
+		hlist_for_each_entry_safe(entry, pos, tmp,
+					  &br->hash[bucket], hlist) {
+			if (entry->is_static)
+				continue;
+
+			fdb_delete(entry);
+		}
+	}
+	spin_unlock_bh(&br->hash_lock);
+}
+
+/* Flush all entries referring to a specific port.
+ * if do_all is set also flush static entries
+ *
+ * A local entry is not deleted when another port of the bridge has the
+ * same device address; it is reassigned to that port instead.
+ */
+void br_fdb_delete_by_port(struct net_bridge *br,
+			   const struct net_bridge_port *p,
+			   int do_all)
+{
+	int i;
+
+	spin_lock_bh(&br->hash_lock);
+	for (i = 0; i < BR_HASH_SIZE; i++) {
+		struct hlist_node *h, *g;
+
+		hlist_for_each_safe(h, g, &br->hash[i]) {
+			struct net_bridge_fdb_entry *f
+				= hlist_entry(h, struct net_bridge_fdb_entry, hlist);
+			if (f->dst != p)
+				continue;
+
+			if (f->is_static && !do_all)
+				continue;
+			/*
+			 * if multiple ports all have the same device address
+			 * then when one port is deleted, assign
+			 * the local entry to other port
+			 */
+			if (f->is_local) {
+				struct net_bridge_port *op;
+				list_for_each_entry(op, &br->port_list, list) {
+					if (op != p &&
+					    !compare_ether_addr(op->dev->dev_addr,
+								f->addr.addr)) {
+						f->dst = op;
+						goto skip_delete;
+					}
+				}
+			}
+
+			fdb_delete(f);
+		skip_delete: ;
+		}
+	}
+	spin_unlock_bh(&br->hash_lock);
+}
+
+/*
+ * Look up @addr in the forwarding database.
+ * No locking or refcounting, assumes caller has rcu_read_lock.
+ * Returns the first matching entry, or NULL when there is none or when
+ * that entry has expired.
+ */
+struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
+					  const unsigned char *addr)
+{
+	struct hlist_head *chain = &br->hash[br_mac_hash(addr)];
+	struct net_bridge_fdb_entry *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry_rcu(entry, pos, chain, hlist) {
+		if (compare_ether_addr(entry->addr.addr, addr))
+			continue;
+
+		if (unlikely(has_expired(br, entry)))
+			return NULL;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+/*
+ * Interface used by ATM LANE hook to test if an addr is on some other
+ * bridge port.  Returns 1 when @addr is known on a different port that
+ * is in the forwarding state, 0 otherwise (including when @dev is not a
+ * bridge port).
+ */
+int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
+{
+	struct net_bridge_port *port;
+	int found = 0;
+
+	rcu_read_lock();
+	port = br_port_get_rcu(dev);
+	if (port) {
+		struct net_bridge_fdb_entry *entry;
+
+		entry = __br_fdb_get(port->br, addr);
+		if (entry && entry->dst->dev != dev &&
+		    entry->dst->state == BR_STATE_FORWARDING)
+			found = 1;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+#endif /* CONFIG_ATM_LANE */
+
+/*
+ * Fill buffer with forwarding table records in
+ * the API format.
+ *
+ * @buf is zeroed for @maxnum records.  Expired entries are ignored, the
+ * first @skip remaining entries are passed over, and at most @maxnum
+ * records are written.  Returns the number of records written.
+ */
+int br_fdb_fillbuf(struct net_bridge *br, void *buf,
+		   unsigned long maxnum, unsigned long skip)
+{
+	struct __fdb_entry *fe = buf;
+	int i, num = 0;
+	struct hlist_node *h;
+	struct net_bridge_fdb_entry *f;
+
+	memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
+
+	rcu_read_lock();
+	for (i = 0; i < BR_HASH_SIZE; i++) {
+		hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
+			if (num >= maxnum)
+				goto out;
+
+			if (has_expired(br, f))
+				continue;
+
+			if (skip) {
+				--skip;
+				continue;
+			}
+
+			/* convert from internal format to API */
+			memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN);
+
+			/* due to ABI compat need to split into hi/lo */
+			fe->port_no = f->dst->port_no;
+			fe->port_hi = f->dst->port_no >> 8;
+
+			fe->is_local = f->is_local;
+			/* Static entries keep the zero age from the memset. */
+			if (!f->is_static)
+				fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->updated);
+			++fe;
+			++num;
+		}
+	}
+
+ out:
+	rcu_read_unlock();
+
+	return num;
+}
+
+/*
+ * Find the entry for @addr on hash chain @head using the plain hlist
+ * iterator.  Returns NULL when no entry matches.
+ */
+static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
+					     const unsigned char *addr)
+{
+	struct net_bridge_fdb_entry *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(entry, pos, head, hlist) {
+		if (compare_ether_addr(entry->addr.addr, addr))
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Same lookup as fdb_find(), but walking the chain with the RCU hlist
+ * iterator.  Returns NULL when no entry matches.
+ */
+static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head,
+						 const unsigned char *addr)
+{
+	struct net_bridge_fdb_entry *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry_rcu(entry, pos, head, hlist) {
+		if (compare_ether_addr(entry->addr.addr, addr))
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate a dynamic (non-local, non-static) entry for @addr on port
+ * @source, add it to the front of chain @head and announce it with
+ * RTM_NEWNEIGH.  Returns the new entry, or NULL when the atomic
+ * allocation fails.
+ */
+static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
+					       struct net_bridge_port *source,
+					       const unsigned char *addr)
+{
+	struct net_bridge_fdb_entry *entry;
+
+	entry = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
+	if (!entry)
+		return NULL;
+
+	memcpy(entry->addr.addr, addr, ETH_ALEN);
+	entry->dst = source;
+	entry->is_local = 0;
+	entry->is_static = 0;
+	entry->updated = entry->used = jiffies;
+
+	hlist_add_head_rcu(&entry->hlist, head);
+	fdb_notify(entry, RTM_NEWNEIGH);
+
+	return entry;
+}
+
+/*
+ * Insert a local, static entry for @addr on port @source.  The callers
+ * in this file hold br->hash_lock.
+ *
+ * Returns 0 on success or when a local entry for @addr already exists,
+ * -EINVAL for an invalid ethernet address, -ENOMEM when the entry cannot
+ * be allocated.  An existing non-local entry for the address is replaced
+ * after a warning.
+ */
+static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+		  const unsigned char *addr)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
+	struct net_bridge_fdb_entry *fdb;
+
+	if (!is_valid_ether_addr(addr))
+		return -EINVAL;
+
+	fdb = fdb_find(head, addr);
+	if (fdb) {
+		/* it is okay to have multiple ports with same
+		 * address, just use the first one.
+		 */
+		if (fdb->is_local)
+			return 0;
+		br_warn(br, "adding interface %s with same address "
+		       "as a received packet\n",
+		       source->dev->name);
+		fdb_delete(fdb);
+	}
+
+	fdb = fdb_create(head, source, addr);
+	if (!fdb)
+		return -ENOMEM;
+
+	/* fdb_create() makes a dynamic entry; promote it to local/static. */
+	fdb->is_local = fdb->is_static = 1;
+	return 0;
+}
+
+/*
+ * Add entry for local address of interface: fdb_insert() wrapped in
+ * br->hash_lock.  Returns the result of fdb_insert().
+ */
+int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
+		  const unsigned char *addr)
+{
+	int err;
+
+	spin_lock_bh(&br->hash_lock);
+	err = fdb_insert(br, source, addr);
+	spin_unlock_bh(&br->hash_lock);
+
+	return err;
+}
+
+void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
+		   const unsigned char *addr)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
+	struct net_bridge_fdb_entry *fdb;
+
+	/* some users want to always flood. */
+	if (hold_time(br) == 0)
+		return;
+
+	/* ignore packets unless we are using this port */
+	if (!(source->state == BR_STATE_LEARNING ||
+	      source->state == BR_STATE_FORWARDING))
+		return;
+
+	fdb = fdb_find_rcu(head, addr);
+	if (likely(fdb)) {
+		/* attempt to update an entry for a local interface */
+		if (unlikely(fdb->is_local)) {
+			if (net_ratelimit())
+				br_warn(br, "received packet on %s with "
+					"own address as source address\n",
+					source->dev->name);
+		} else {
+			/* fastpath: update of existing entry */
+			fdb->dst = source;
+			fdb->updated = jiffies;
+		}
+	} else {
+		spin_lock(&br->hash_lock);
+		if (likely(!fdb_find(head, addr)))
+			fdb_create(head, source, addr);
+
+		/* else we lost the race and someone else inserted
+		 * it first; don't bother updating
+		 */
+		spin_unlock(&br->hash_lock);
+	}
+}
+
+static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb)
+{
+	/* Checked in priority order: local, then static, then expiry. */
+	if (fdb->is_local)
+		return NUD_PERMANENT;
+	if (fdb->is_static)
+		return NUD_NOARP;
+	if (has_expired(fdb->dst->br, fdb))
+		return NUD_STALE;
+	return NUD_REACHABLE;
+}
+
+static int fdb_fill_info(struct sk_buff *skb,
+			 const struct net_bridge_fdb_entry *fdb,
+			 u32 pid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family	 = AF_BRIDGE;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags	 = 0;
+	ndm->ndm_type	 = 0;
+	ndm->ndm_ifindex = fdb->dst->dev->ifindex;
+	ndm->ndm_state   = fdb_to_nud(fdb);
+
+	NLA_PUT(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr);
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+	NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t fdb_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
+{
+	struct net *net = dev_net(fdb->dst->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = fdb_fill_info(skb, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in fdb_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Dump information about entries, in response to GETNEIGH */
+int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	int idx = 0;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		struct net_bridge *br = netdev_priv(dev);
+		int i;
+
+		if (!(dev->priv_flags & IFF_EBRIDGE))
+			continue;
+
+		for (i = 0; i < BR_HASH_SIZE; i++) {
+			struct hlist_node *h;
+			struct net_bridge_fdb_entry *f;
+
+			hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
+				if (idx < cb->args[0])
+					goto skip;
+
+				/* -EMSGSIZE: no room left, stop the whole walk */
+				if (fdb_fill_info(skb, f,
+						  NETLINK_CB(cb->skb).pid,
+						  cb->nlh->nlmsg_seq,
+						  RTM_NEWNEIGH,
+						  NLM_F_MULTI) < 0)
+					goto out;
+skip:
+				++idx;
+			}
+		}
+	}
+out:
+	rcu_read_unlock();
+	cb->args[0] = idx;	/* entries below idx are skipped next time */
+	return skb->len;
+}
+
+/* Create new static fdb entry */
+static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr,
+			 __u16 state)
+{
+	struct net_bridge *br = source->br;
+	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
+	struct net_bridge_fdb_entry *fdb;
+
+	fdb = fdb_find(head, addr);
+	if (fdb)
+		return -EEXIST;
+
+	fdb = fdb_create(head, source, addr);
+	if (!fdb)
+		return -ENOMEM;
+
+	if (state & NUD_PERMANENT)
+		fdb->is_local = fdb->is_static = 1;
+	else if (state & NUD_NOARP)
+		fdb->is_static = 1;
+	return 0;
+}
+
+/* Add new permanent fdb entry with RTM_NEWNEIGH */
+int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ndmsg *ndm;
+	struct nlattr *tb[NDA_MAX+1];
+	struct net_device *dev;
+	struct net_bridge_port *p;
+	const __u8 *addr;
+	int err;
+
+	ASSERT_RTNL();
+	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	ndm = nlmsg_data(nlh);
+	if (ndm->ndm_ifindex == 0) {
+		pr_info("bridge: RTM_NEWNEIGH with invalid ifindex\n");
+		return -EINVAL;
+	}
+
+	dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+	if (dev == NULL) {
+		pr_info("bridge: RTM_NEWNEIGH with unknown ifindex\n");
+		return -ENODEV;
+	}
+
+	if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+		pr_info("bridge: RTM_NEWNEIGH with invalid address\n");
+		return -EINVAL;
+	}
+
+	addr = nla_data(tb[NDA_LLADDR]);
+	if (!is_valid_ether_addr(addr)) {
+		pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n");
+		return -EINVAL;
+	}
+
+	p = br_port_get_rtnl(dev);
+	if (p == NULL) {
+		pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n",
+			dev->name);
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&p->br->hash_lock);
+	err = fdb_add_entry(p, addr, ndm->ndm_state);
+	spin_unlock_bh(&p->br->hash_lock);
+
+	return err;
+}
+
+static int fdb_delete_by_addr(struct net_bridge_port *p, const u8 *addr)
+{
+	struct net_bridge *br = p->br;
+	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
+	struct net_bridge_fdb_entry *fdb;
+
+	fdb = fdb_find(head, addr);
+	if (!fdb)
+		return -ENOENT;
+
+	fdb_delete(fdb);
+	return 0;
+}
+
+/* Remove neighbor entry with RTM_DELNEIGH */
+int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ndmsg *ndm;
+	struct net_bridge_port *p;
+	struct nlattr *llattr;
+	const __u8 *addr;
+	struct net_device *dev;
+	int err;
+
+	ASSERT_RTNL();
+	if (nlmsg_len(nlh) < sizeof(*ndm))
+		return -EINVAL;
+
+	ndm = nlmsg_data(nlh);
+	if (ndm->ndm_ifindex == 0) {
+		pr_info("bridge: RTM_DELNEIGH with invalid ifindex\n");
+		return -EINVAL;
+	}
+
+	dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+	if (dev == NULL) {
+		pr_info("bridge: RTM_DELNEIGH with unknown ifindex\n");
+		return -ENODEV;
+	}
+
+	llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR);
+	if (llattr == NULL || nla_len(llattr) != ETH_ALEN) {
+		pr_info("bridge: RTM_DELNEIGH with invalid address\n");
+		return -EINVAL;
+	}
+
+	addr = nla_data(llattr);
+
+	p = br_port_get_rtnl(dev);
+	if (p == NULL) {
+		pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
+			dev->name);
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&p->br->hash_lock);
+	err = fdb_delete_by_addr(p, addr);
+	spin_unlock_bh(&p->br->hash_lock);
+
+	return err;
+}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
new file mode 100644
index 00000000..ee64287f
--- /dev/null
+++ b/net/bridge/br_forward.c
@@ -0,0 +1,271 @@
+/*
+ *	Forwarding decision
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/netfilter_bridge.h>
+#include "br_private.h"
+
+static int deliver_clone(const struct net_bridge_port *prev,
+			 struct sk_buff *skb,
+			 void (*__packet_hook)(const struct net_bridge_port *p,
+					       struct sk_buff *skb));
+
+/* Skip the ingress port (unless hairpin mode) and non-forwarding ports */
+static inline int should_deliver(const struct net_bridge_port *p,
+				 const struct sk_buff *skb)
+{
+	return p->state == BR_STATE_FORWARDING &&
+	       ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev);
+}
+
+static inline unsigned packet_length(const struct sk_buff *skb)
+{
+	return skb->len - (skb->protocol == htons(ETH_P_8021Q) ? VLAN_HLEN : 0);
+}
+
+int br_dev_queue_push_xmit(struct sk_buff *skb)
+{
+	/* ip_fragment doesn't copy the MAC header */
+	if (nf_bridge_maybe_copy_header(skb) ||
+	    (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) {
+		kfree_skb(skb);
+	} else {
+		skb_push(skb, ETH_HLEN);
+		dev_queue_xmit(skb);
+	}
+
+	return 0;
+}
+
+int br_forward_finish(struct sk_buff *skb)
+{
+	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
+		       br_dev_queue_push_xmit);
+
+}
+
+static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+	skb->dev = to->dev;
+
+	if (unlikely(netpoll_tx_running(to->dev))) {
+		if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
+			kfree_skb(skb);
+		else {
+			skb_push(skb, ETH_HLEN);
+			br_netpoll_send_skb(to, skb);
+		}
+		return;
+	}
+
+	NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
+		br_forward_finish);
+}
+
+static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+	struct net_device *indev;
+
+	if (skb_warn_if_lro(skb)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	indev = skb->dev;
+	skb->dev = to->dev;
+	skb_forward_csum(skb);
+
+	NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
+		br_forward_finish);
+}
+
+/* called with rcu_read_lock */
+void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+	if (!should_deliver(to, skb)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	__br_deliver(to, skb);
+}
+
+/* called with rcu_read_lock */
+void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
+{
+	if (!should_deliver(to, skb)) {
+		if (!skb0)
+			kfree_skb(skb);
+		return;
+	}
+	/* with skb0 set only a clone is forwarded; skb itself is left alone */
+	if (skb0)
+		deliver_clone(to, skb, __br_forward);
+	else
+		__br_forward(to, skb);
+}
+
+static int deliver_clone(const struct net_bridge_port *prev,
+			 struct sk_buff *skb,
+			 void (*__packet_hook)(const struct net_bridge_port *p,
+					       struct sk_buff *skb))
+{
+	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (!skb) {
+		dev->stats.tx_dropped++;
+		return -ENOMEM;
+	}
+
+	__packet_hook(prev, skb);
+	return 0;
+}
+
+static struct net_bridge_port *maybe_deliver(
+	struct net_bridge_port *prev, struct net_bridge_port *p,
+	struct sk_buff *skb,
+	void (*__packet_hook)(const struct net_bridge_port *p,
+			      struct sk_buff *skb))
+{
+	if (!should_deliver(p, skb))
+		return prev;
+
+	/* A port was already selected: hand it a clone and make p the
+	 * selected port instead; p itself is not delivered to here.
+	 */
+	if (prev) {
+		int err = deliver_clone(prev, skb, __packet_hook);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	return p;
+}
+
+/* called with rcu_read_lock */
+static void br_flood(struct net_bridge *br, struct sk_buff *skb,
+		     struct sk_buff *skb0,
+		     void (*__packet_hook)(const struct net_bridge_port *p,
+					   struct sk_buff *skb))
+{
+	struct net_bridge_port *p;
+	struct net_bridge_port *prev;
+
+	prev = NULL;
+
+	list_for_each_entry_rcu(p, &br->port_list, list) {
+		prev = maybe_deliver(prev, p, skb, __packet_hook);
+		if (IS_ERR(prev))
+			goto out;
+	}
+
+	if (!prev)
+		goto out;
+
+	if (skb0)
+		deliver_clone(prev, skb, __packet_hook);
+	else
+		__packet_hook(prev, skb);
+	return;
+
+out:
+	if (!skb0)
+		kfree_skb(skb);
+}
+
+
+/* called with rcu_read_lock */
+void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb)
+{
+	br_flood(br, skb, NULL, __br_deliver);
+}
+
+/* called with rcu_read_lock */
+void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
+		      struct sk_buff *skb2)
+{
+	br_flood(br, skb, skb2, __br_forward);
+}
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+/* called with rcu_read_lock */
+static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
+			       struct sk_buff *skb, struct sk_buff *skb0,
+			       void (*__packet_hook)(
+					const struct net_bridge_port *p,
+					struct sk_buff *skb))
+{
+	struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *prev = NULL;
+	struct net_bridge_port_group *p;
+	struct hlist_node *rp;
+
+	rp = rcu_dereference(hlist_first_rcu(&br->router_list));
+	p = mdst ? rcu_dereference(mdst->ports) : NULL;
+	while (p || rp) {
+		struct net_bridge_port *port, *lport, *rport;
+
+		lport = p ? p->port : NULL;
+		rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
+			     NULL;
+
+		port = (unsigned long)lport > (unsigned long)rport ?
+		       lport : rport;
+
+		prev = maybe_deliver(prev, port, skb, __packet_hook);
+		if (IS_ERR(prev))
+			goto out;
+
+		if ((unsigned long)lport >= (unsigned long)port)
+			p = rcu_dereference(p->next);
+		if ((unsigned long)rport >= (unsigned long)port)
+			rp = rcu_dereference(hlist_next_rcu(rp));
+	}
+
+	if (!prev)
+		goto out;
+
+	if (skb0)
+		deliver_clone(prev, skb, __packet_hook);
+	else
+		__packet_hook(prev, skb);
+	return;
+
+out:
+	if (!skb0)
+		kfree_skb(skb);
+}
+
+/* called with rcu_read_lock */
+void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
+			  struct sk_buff *skb)
+{
+	br_multicast_flood(mdst, skb, NULL, __br_deliver);
+}
+
+/* called with rcu_read_lock */
+void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
+			  struct sk_buff *skb, struct sk_buff *skb2)
+{
+	br_multicast_flood(mdst, skb, skb2, __br_forward);
+}
+#endif
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
new file mode 100644
index 00000000..44908737
--- /dev/null
+++ b/net/bridge/br_if.c
@@ -0,0 +1,454 @@
+/*
+ *	Userspace interface
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/ethtool.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+
+#include "br_private.h"
+
+/*
+ * Determine initial path cost based on speed,
+ * using recommendations from the 802.1d standard.
+ *
+ * Since the driver might sleep, the caller must not hold any locks.
+ */
+static int port_cost(struct net_device *dev)
+{
+	if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
+		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
+
+		if (!dev_ethtool_get_settings(dev, &ecmd)) {
+			switch (ethtool_cmd_speed(&ecmd)) {
+			case SPEED_10000:
+				return 2;
+			case SPEED_1000:
+				return 4;
+			case SPEED_100:
+				return 19;
+			case SPEED_10:
+				return 100;
+			}
+		}
+	}
+
+	/* Old silly heuristics based on name */
+	if (!strncmp(dev->name, "lec", 3))
+		return 7;
+
+	if (!strncmp(dev->name, "plip", 4))
+		return 2500;
+
+	return 100;	/* assume old 10Mbps */
+}
+
+
+/* Check for port carrier transitions. */
+void br_port_carrier_check(struct net_bridge_port *p)
+{
+	struct net_device *dev = p->dev;
+	struct net_bridge *br = p->br;
+
+	if (netif_running(dev) && netif_carrier_ok(dev))
+		p->path_cost = port_cost(dev);
+
+	if (!netif_running(br->dev))
+		return;
+
+	spin_lock_bh(&br->lock);
+	if (netif_running(dev) && netif_carrier_ok(dev)) {
+		if (p->state == BR_STATE_DISABLED)
+			br_stp_enable_port(p);
+	} else {
+		if (p->state != BR_STATE_DISABLED)
+			br_stp_disable_port(p);
+	}
+	spin_unlock_bh(&br->lock);
+}
+
+/* kobject release callback: frees the bridge port embedding @kobj. */
+static void release_nbp(struct kobject *kobj)
+{
+	/* kobj is the kobj member of a struct net_bridge_port */
+	kfree(container_of(kobj, struct net_bridge_port, kobj));
+}
+
+static struct kobj_type brport_ktype = {
+#ifdef CONFIG_SYSFS
+	.sysfs_ops = &brport_sysfs_ops,
+#endif
+	.release = release_nbp,
+};
+
+static void destroy_nbp(struct net_bridge_port *p)
+{
+	struct net_device *dev = p->dev;
+
+	p->br = NULL;
+	p->dev = NULL;
+	dev_put(dev);
+
+	kobject_put(&p->kobj);
+}
+
+/* RCU callback queued by del_nbp(): destroys the port owning @head. */
+static void destroy_nbp_rcu(struct rcu_head *head)
+{
+	/* head is the rcu member of a struct net_bridge_port */
+	destroy_nbp(container_of(head, struct net_bridge_port, rcu));
+}
+
+/* Deleting a port (interface) from the bridge is done in two steps
+ * via RCU. The first step marks the device as down. That deletes
+ * all the timers and stops new packets from flowing through.
+ *
+ * Final cleanup doesn't occur until after all CPUs have finished
+ * processing packets.
+ *
+ * Protected from multiple admin operations by RTNL mutex
+ */
+static void del_nbp(struct net_bridge_port *p)
+{
+	struct net_bridge *br = p->br;
+	struct net_device *dev = p->dev;
+
+	sysfs_remove_link(br->ifobj, p->dev->name);
+
+	dev_set_promiscuity(dev, -1);
+
+	spin_lock_bh(&br->lock);
+	br_stp_disable_port(p);
+	spin_unlock_bh(&br->lock);
+
+	br_ifinfo_notify(RTM_DELLINK, p);
+
+	br_fdb_delete_by_port(br, p, 1);
+
+	list_del_rcu(&p->list);
+
+	dev->priv_flags &= ~IFF_BRIDGE_PORT;
+
+	netdev_rx_handler_unregister(dev);
+	synchronize_net();
+
+	netdev_set_master(dev, NULL);
+
+	br_multicast_del_port(p);
+
+	kobject_uevent(&p->kobj, KOBJ_REMOVE);
+	kobject_del(&p->kobj);
+
+	br_netpoll_disable(p);
+
+	call_rcu(&p->rcu, destroy_nbp_rcu);
+}
+
+/* Delete bridge device */
+void br_dev_delete(struct net_device *dev, struct list_head *head)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *p, *n;
+
+	list_for_each_entry_safe(p, n, &br->port_list, list) {
+		del_nbp(p);
+	}
+
+	del_timer_sync(&br->gc_timer);
+
+	br_sysfs_delbr(br->dev);
+	unregister_netdevice_queue(br->dev, head);
+}
+
+/* find an available port number */
+static int find_portno(struct net_bridge *br)
+{
+	int index;
+	struct net_bridge_port *p;
+	unsigned long *inuse;
+
+	inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
+			GFP_KERNEL);
+	if (!inuse)
+		return -ENOMEM;
+
+	set_bit(0, inuse);	/* zero is reserved */
+	list_for_each_entry(p, &br->port_list, list) {
+		set_bit(p->port_no, inuse);
+	}
+	index = find_first_zero_bit(inuse, BR_MAX_PORTS);
+	kfree(inuse);
+
+	return (index >= BR_MAX_PORTS) ? -EXFULL : index;
+}
+
+/* called with RTNL but without bridge lock */
+static struct net_bridge_port *new_nbp(struct net_bridge *br,
+				       struct net_device *dev)
+{
+	int index;
+	struct net_bridge_port *p;
+
+	index = find_portno(br);
+	if (index < 0)
+		return ERR_PTR(index);
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (p == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	p->br = br;
+	dev_hold(dev);
+	p->dev = dev;
+	p->path_cost = port_cost(dev);
+	p->priority = 0x8000 >> BR_PORT_BITS;
+	p->port_no = index;
+	p->flags = 0;
+	br_init_port(p);
+	p->state = BR_STATE_DISABLED;
+	br_stp_port_timer_init(p);
+	br_multicast_add_port(p);
+
+	return p;
+}
+
+int br_add_bridge(struct net *net, const char *name)
+{
+	struct net_device *dev;
+	int res;
+
+	dev = alloc_netdev(sizeof(struct net_bridge), name,
+			   br_dev_setup);
+
+	if (!dev)
+		return -ENOMEM;
+
+	dev_net_set(dev, net);
+
+	res = register_netdev(dev);
+	if (res)
+		free_netdev(dev);
+	return res;
+}
+
+/* Delete the bridge device called @name in @net; takes RTNL itself.
+ * Fails if no such device exists, it is not a bridge, or it is still up.
+ */
+int br_del_bridge(struct net *net, const char *name)
+{
+	struct net_device *dev;
+	int ret = 0;
+
+	rtnl_lock();
+	dev = __dev_get_by_name(net, name);
+	if (dev == NULL) {
+		/* Could not find device */
+		ret = -ENXIO;
+	} else if (!(dev->priv_flags & IFF_EBRIDGE)) {
+		/* Attempt to delete non bridge device! */
+		ret = -EPERM;
+	} else if (dev->flags & IFF_UP) {
+		/* Not shutdown yet. */
+		ret = -EBUSY;
+	} else {
+		br_dev_delete(dev, NULL);
+	}
+	rtnl_unlock();
+
+	return ret;
+}
+
+/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
+int br_min_mtu(const struct net_bridge *br)
+{
+	const struct net_bridge_port *port;
+	int lowest = 0;
+
+	ASSERT_RTNL();
+
+	if (list_empty(&br->port_list))
+		return ETH_DATA_LEN;
+
+	list_for_each_entry(port, &br->port_list, list) {
+		/* zero doubles as "no MTU recorded yet" */
+		if (!lowest || port->dev->mtu < lowest)
+			lowest = port->dev->mtu;
+	}
+	return lowest;
+}
+
+/*
+ * Recomputes features using slave's features
+ */
+u32 br_features_recompute(struct net_bridge *br, u32 features)
+{
+	struct net_bridge_port *p;
+	u32 mask;
+
+	if (list_empty(&br->port_list))
+		return features;
+
+	mask = features;
+	features &= ~NETIF_F_ONE_FOR_ALL;
+
+	list_for_each_entry(p, &br->port_list, list) {
+		features = netdev_increment_features(features,
+						     p->dev->features, mask);
+	}
+
+	return features;
+}
+
+/* called with RTNL */
+int br_add_if(struct net_bridge *br, struct net_device *dev)
+{
+	struct net_bridge_port *p;
+	int err = 0;
+	bool changed_addr;
+
+	/* Don't allow bridging non-ethernet like devices */
+	if ((dev->flags & IFF_LOOPBACK) ||
+	    dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
+		return -EINVAL;
+
+	/* No bridging of bridges */
+	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
+		return -ELOOP;
+
+	/* Device is already being bridged */
+	if (br_port_exists(dev))
+		return -EBUSY;
+
+	/* No bridging devices that dislike that (e.g. wireless) */
+	if (dev->priv_flags & IFF_DONT_BRIDGE)
+		return -EOPNOTSUPP;
+
+	p = new_nbp(br, dev);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	call_netdevice_notifiers(NETDEV_JOIN, dev);
+
+	err = dev_set_promiscuity(dev, 1);
+	if (err)
+		goto put_back;
+
+	err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
+				   SYSFS_BRIDGE_PORT_ATTR);
+	if (err)
+		goto err0;
+
+	err = br_fdb_insert(br, p, dev->dev_addr);
+	if (err)
+		goto err1;
+
+	err = br_sysfs_addif(p);
+	if (err)
+		goto err2;
+
+	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
+		goto err3;
+
+	err = netdev_set_master(dev, br->dev);
+	if (err)
+		goto err3;
+
+	err = netdev_rx_handler_register(dev, br_handle_frame, p);
+	if (err)
+		goto err4;
+
+	dev->priv_flags |= IFF_BRIDGE_PORT;
+
+	dev_disable_lro(dev);
+
+	list_add_rcu(&p->list, &br->port_list);
+
+	netdev_update_features(br->dev);
+
+	spin_lock_bh(&br->lock);
+	changed_addr = br_stp_recalculate_bridge_id(br);
+
+	if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
+	    (br->dev->flags & IFF_UP))
+		br_stp_enable_port(p);
+	spin_unlock_bh(&br->lock);
+
+	br_ifinfo_notify(RTM_NEWLINK, p);
+
+	if (changed_addr)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+
+	dev_set_mtu(br->dev, br_min_mtu(br));
+
+	kobject_uevent(&p->kobj, KOBJ_ADD);
+
+	return 0;
+
+err4:
+	netdev_set_master(dev, NULL);
+err3:
+	sysfs_remove_link(br->ifobj, p->dev->name);
+err2:
+	br_fdb_delete_by_port(br, p, 1);
+err1:
+	kobject_put(&p->kobj);
+	p = NULL; /* kobject_put frees */
+err0:
+	dev_set_promiscuity(dev, -1);
+put_back:
+	dev_put(dev);
+	kfree(p);
+	return err;
+}
+
+/* called with RTNL */
+int br_del_if(struct net_bridge *br, struct net_device *dev)
+{
+	struct net_bridge_port *p;
+
+	p = br_port_get_rtnl(dev);
+	if (!p || p->br != br)
+		return -EINVAL;
+
+	del_nbp(p);
+
+	spin_lock_bh(&br->lock);
+	br_stp_recalculate_bridge_id(br);
+	spin_unlock_bh(&br->lock);
+
+	netdev_update_features(br->dev);
+
+	return 0;
+}
+
+void __net_exit br_net_exit(struct net *net)
+{
+	struct net_device *dev;
+	LIST_HEAD(list);
+
+	rtnl_lock();
+	for_each_netdev(net, dev)
+		if (dev->priv_flags & IFF_EBRIDGE)
+			br_dev_delete(dev, &list);
+
+	unregister_netdevice_many(&list);
+	rtnl_unlock();
+
+}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
new file mode 100644
index 00000000..f06ee39c
--- /dev/null
+++ b/net/bridge/br_input.c
@@ -0,0 +1,206 @@
+/*
+ *	Handle incoming frames
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/netfilter_bridge.h>
+#include "br_private.h"
+
+/* Bridge group multicast address 802.1d (pg 51). */
+const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
+
+/* Hook for brouter */
+br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
+EXPORT_SYMBOL(br_should_route_hook);
+
+static int br_pass_frame_up(struct sk_buff *skb)
+{
+	struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
+	struct net_bridge *br = netdev_priv(brdev);
+	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
+
+	u64_stats_update_begin(&brstats->syncp);
+	brstats->rx_packets++;
+	brstats->rx_bytes += skb->len;
+	u64_stats_update_end(&brstats->syncp);
+
+	indev = skb->dev;
+	skb->dev = brdev;
+
+	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
+		       netif_receive_skb);
+}
+
+/* note: already called with rcu_read_lock */
+int br_handle_frame_finish(struct sk_buff *skb)
+{
+	const unsigned char *dest = eth_hdr(skb)->h_dest;
+	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+	struct net_bridge *br;
+	struct net_bridge_fdb_entry *dst;
+	struct net_bridge_mdb_entry *mdst;
+	struct sk_buff *skb2;
+
+	if (!p || p->state == BR_STATE_DISABLED)
+		goto drop;
+
+	/* insert into forwarding database after filtering to avoid spoofing */
+	br = p->br;
+	br_fdb_update(br, p, eth_hdr(skb)->h_source);
+
+	if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
+	    br_multicast_rcv(br, p, skb))
+		goto drop;
+
+	if (p->state == BR_STATE_LEARNING)
+		goto drop;
+
+	BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+
+	/* The packet skb2 goes to the local host (NULL to skip). */
+	skb2 = NULL;
+
+	if (br->dev->flags & IFF_PROMISC)
+		skb2 = skb;
+
+	dst = NULL;
+
+	if (is_broadcast_ether_addr(dest))
+		skb2 = skb;
+	else if (is_multicast_ether_addr(dest)) {
+		mdst = br_mdb_get(br, skb);
+		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
+			if ((mdst && mdst->mglist) ||
+			    br_multicast_is_router(br))
+				skb2 = skb;
+			br_multicast_forward(mdst, skb, skb2);
+			skb = NULL;
+			if (!skb2)
+				goto out;
+		} else
+			skb2 = skb;
+
+		br->dev->stats.multicast++;
+	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
+		skb2 = skb;
+		/* Do not forward the packet since it's local. */
+		skb = NULL;
+	}
+
+	if (skb) {
+		if (dst) {
+			dst->used = jiffies;
+			br_forward(dst->dst, skb, skb2);
+		} else
+			br_flood_forward(br, skb, skb2);
+	}
+
+	if (skb2)
+		return br_pass_frame_up(skb2);
+
+out:
+	return 0;
+drop:
+	kfree_skb(skb);
+	goto out;
+}
+
+/* note: already called with rcu_read_lock */
+static int br_handle_local_finish(struct sk_buff *skb)
+{
+	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+	return 0;	 /* process further */
+}
+
+/* Does address match the link local multicast address.
+ * 01:80:c2:00:00:0X
+ */
+static inline int is_link_local(const unsigned char *dest)
+{
+	__be16 *a = (__be16 *)dest;
+	static const __be16 *b = (const __be16 *)br_group_address;
+	static const __be16 m = cpu_to_be16(0xfff0);
+
+	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
+}
+
+/*
+ * Return RX_HANDLER_CONSUMED if skb is handled, else RX_HANDLER_PASS
+ * note: already called with rcu_read_lock
+ */
+rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
+{
+	struct net_bridge_port *p;
+	struct sk_buff *skb = *pskb;
+	const unsigned char *dest = eth_hdr(skb)->h_dest;
+	br_should_route_hook_t *rhook;
+
+	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+		return RX_HANDLER_PASS;
+
+	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
+		goto drop;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		return RX_HANDLER_CONSUMED;
+
+	p = br_port_get_rcu(skb->dev);
+
+	if (unlikely(is_link_local(dest))) {
+		/* Pause frames shouldn't be passed up by driver anyway */
+		if (skb->protocol == htons(ETH_P_PAUSE))
+			goto drop;
+
+		/* If STP is turned off, then forward */
+		if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
+			goto forward;
+
+		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
+			    NULL, br_handle_local_finish)) {
+			return RX_HANDLER_CONSUMED; /* consumed by filter */
+		} else {
+			*pskb = skb;
+			return RX_HANDLER_PASS;	/* continue processing */
+		}
+	}
+
+forward:
+	switch (p->state) {
+	case BR_STATE_FORWARDING:
+		rhook = rcu_dereference(br_should_route_hook);
+		if (rhook) {
+			if ((*rhook)(skb)) {
+				*pskb = skb;
+				return RX_HANDLER_PASS;
+			}
+			dest = eth_hdr(skb)->h_dest;
+		}
+		/* fall through */
+	case BR_STATE_LEARNING:
+		if (!compare_ether_addr(p->br->dev->dev_addr, dest))
+			skb->pkt_type = PACKET_HOST;
+
+		NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
+			br_handle_frame_finish);
+		break;
+	default:
+drop:
+		kfree_skb(skb);
+	}
+	return RX_HANDLER_CONSUMED;
+}
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
new file mode 100644
index 00000000..7222fe1d
--- /dev/null
+++ b/net/bridge/br_ioctl.c
@@ -0,0 +1,395 @@
+/*
+ *	Ioctl handler
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/times.h>
+#include <net/net_namespace.h>
+#include <asm/uaccess.h>
+#include "br_private.h"
+
+/* called with RTNL */
+static int get_bridge_ifindices(struct net *net, int *indices, int num)
+{
+	struct net_device *dev;
+	int i = 0;
+
+	for_each_netdev(net, dev) {
+		if (i >= num)
+			break;
+		if (dev->priv_flags & IFF_EBRIDGE)
+			indices[i++] = dev->ifindex;
+	}
+
+	return i;
+}
+
+/* called with RTNL */
+static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num)
+{
+	struct net_bridge_port *p;
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if (p->port_no < num)
+			ifindices[p->port_no] = p->dev->ifindex;
+	}
+}
+
+/*
+ * Format up to a page worth of forwarding table entries
+ * userbuf -- where to copy result
+ * maxnum  -- maximum number of entries desired
+ *            (limited to a page for sanity)
+ * offset  -- number of records to skip
+ */
+static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
+			   unsigned long maxnum, unsigned long offset)
+{
+	int num;
+	void *buf;
+	size_t size;
+
+	/* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */
+	if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry))
+		maxnum = PAGE_SIZE/sizeof(struct __fdb_entry);
+
+	size = maxnum * sizeof(struct __fdb_entry);
+
+	buf = kmalloc(size, GFP_USER);
+	if (!buf)
+		return -ENOMEM;
+
+	num = br_fdb_fillbuf(br, buf, maxnum, offset);
+	if (num > 0) {
+		if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry)))
+			num = -EFAULT;
+	}
+	kfree(buf);
+
+	return num;
+}
+
+/* called with RTNL */
+static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
+{
+	struct net_device *dev;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	dev = __dev_get_by_index(dev_net(br->dev), ifindex);
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (isadd)
+		ret = br_add_if(br, dev);
+	else
+		ret = br_del_if(br, dev);
+
+	return ret;
+}
+
+/*
+ * Legacy ioctl's through SIOCDEVPRIVATE
+ * This interface is deprecated because it was too difficult
+ * to do the translation for 32/64bit ioctl compatibility.
+ */
+static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	unsigned long args[4];
+
+	if (copy_from_user(args, rq->ifr_data, sizeof(args)))
+		return -EFAULT;
+
+	switch (args[0]) {
+	case BRCTL_ADD_IF:
+	case BRCTL_DEL_IF:
+		return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF);
+
+	case BRCTL_GET_BRIDGE_INFO:
+	{
+		struct __bridge_info b;
+
+		memset(&b, 0, sizeof(struct __bridge_info));
+		rcu_read_lock();
+		memcpy(&b.designated_root, &br->designated_root, 8);
+		memcpy(&b.bridge_id, &br->bridge_id, 8);
+		b.root_path_cost = br->root_path_cost;
+		b.max_age = jiffies_to_clock_t(br->max_age);
+		b.hello_time = jiffies_to_clock_t(br->hello_time);
+		b.forward_delay = br->forward_delay;
+		b.bridge_max_age = br->bridge_max_age;
+		b.bridge_hello_time = br->bridge_hello_time;
+		b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay);
+		b.topology_change = br->topology_change;
+		b.topology_change_detected = br->topology_change_detected;
+		b.root_port = br->root_port;
+
+		b.stp_enabled = (br->stp_enabled != BR_NO_STP);
+		b.ageing_time = jiffies_to_clock_t(br->ageing_time);
+		b.hello_timer_value = br_timer_value(&br->hello_timer);
+		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
+		b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
+		b.gc_timer_value = br_timer_value(&br->gc_timer);
+		rcu_read_unlock();
+
+		if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case BRCTL_GET_PORT_LIST:
+	{
+		int num, *indices;
+
+		num = args[2];
+		if (num < 0)
+			return -EINVAL;
+		if (num == 0)
+			num = 256;
+		if (num > BR_MAX_PORTS)
+			num = BR_MAX_PORTS;
+
+		indices = kcalloc(num, sizeof(int), GFP_KERNEL);
+		if (indices == NULL)
+			return -ENOMEM;
+
+		get_port_ifindices(br, indices, num);
+		if (copy_to_user((void __user *)args[1], indices, num*sizeof(int)))
+			num =  -EFAULT;
+		kfree(indices);
+		return num;
+	}
+
+	case BRCTL_SET_BRIDGE_FORWARD_DELAY:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		return br_set_forward_delay(br, args[1]);
+
+	case BRCTL_SET_BRIDGE_HELLO_TIME:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		return br_set_hello_time(br, args[1]);
+
+	case BRCTL_SET_BRIDGE_MAX_AGE:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		return br_set_max_age(br, args[1]);
+
+	case BRCTL_SET_AGEING_TIME:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		br->ageing_time = clock_t_to_jiffies(args[1]);
+		return 0;
+
+	case BRCTL_GET_PORT_INFO:
+	{
+		struct __port_info p;
+		struct net_bridge_port *pt;
+
+		rcu_read_lock();
+		if ((pt = br_get_port(br, args[2])) == NULL) {
+			rcu_read_unlock();
+			return -EINVAL;
+		}
+
+		memset(&p, 0, sizeof(struct __port_info));
+		memcpy(&p.designated_root, &pt->designated_root, 8);
+		memcpy(&p.designated_bridge, &pt->designated_bridge, 8);
+		p.port_id = pt->port_id;
+		p.designated_port = pt->designated_port;
+		p.path_cost = pt->path_cost;
+		p.designated_cost = pt->designated_cost;
+		p.state = pt->state;
+		p.top_change_ack = pt->topology_change_ack;
+		p.config_pending = pt->config_pending;
+		p.message_age_timer_value = br_timer_value(&pt->message_age_timer);
+		p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer);
+		p.hold_timer_value = br_timer_value(&pt->hold_timer);
+
+		rcu_read_unlock();
+
+		if (copy_to_user((void __user *)args[1], &p, sizeof(p)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	case BRCTL_SET_BRIDGE_STP_STATE:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		br_stp_set_enabled(br, args[1]);
+		return 0;
+
+	case BRCTL_SET_BRIDGE_PRIORITY:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		spin_lock_bh(&br->lock);
+		br_stp_set_bridge_priority(br, args[1]);
+		spin_unlock_bh(&br->lock);
+		return 0;
+
+	case BRCTL_SET_PORT_PRIORITY:
+	{
+		struct net_bridge_port *p;
+		int ret;
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		spin_lock_bh(&br->lock);
+		if ((p = br_get_port(br, args[1])) == NULL)
+			ret = -EINVAL;
+		else
+			ret = br_stp_set_port_priority(p, args[2]);
+		spin_unlock_bh(&br->lock);
+		return ret;
+	}
+
+	case BRCTL_SET_PATH_COST:
+	{
+		struct net_bridge_port *p;
+		int ret;
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		spin_lock_bh(&br->lock);
+		if ((p = br_get_port(br, args[1])) == NULL)
+			ret = -EINVAL;
+		else
+			ret = br_stp_set_path_cost(p, args[2]);
+		spin_unlock_bh(&br->lock);
+
+		return ret;
+	}
+
+	case BRCTL_GET_FDB_ENTRIES:
+		return get_fdb_entries(br, (void __user *)args[1],
+				       args[2], args[3]);
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int old_deviceless(struct net *net, void __user *uarg)
+{
+	unsigned long args[3];
+
+	if (copy_from_user(args, uarg, sizeof(args)))
+		return -EFAULT;
+
+	switch (args[0]) {
+	case BRCTL_GET_VERSION:
+		return BRCTL_VERSION;
+
+	case BRCTL_GET_BRIDGES:
+	{
+		int *indices;
+		int ret = 0;
+
+		if (args[2] >= 2048)
+			return -ENOMEM;
+		indices = kcalloc(args[2], sizeof(int), GFP_KERNEL);
+		if (indices == NULL)
+			return -ENOMEM;
+
+		args[2] = get_bridge_ifindices(net, indices, args[2]);
+
+		ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int))
+			? -EFAULT : args[2];
+
+		kfree(indices);
+		return ret;
+	}
+
+	case BRCTL_ADD_BRIDGE:
+	case BRCTL_DEL_BRIDGE:
+	{
+		char buf[IFNAMSIZ];
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ))
+			return -EFAULT;
+
+		buf[IFNAMSIZ-1] = 0;
+
+		if (args[0] == BRCTL_ADD_BRIDGE)
+			return br_add_bridge(net, buf);
+
+		return br_del_bridge(net, buf);
+	}
+	}
+
+	return -EOPNOTSUPP;
+}
+
+int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
+{
+	switch (cmd) {
+	case SIOCGIFBR:
+	case SIOCSIFBR:
+		return old_deviceless(net, uarg);
+
+	case SIOCBRADDBR:
+	case SIOCBRDELBR:
+	{
+		char buf[IFNAMSIZ];
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(buf, uarg, IFNAMSIZ))
+			return -EFAULT;
+
+		buf[IFNAMSIZ-1] = 0;
+		if (cmd == SIOCBRADDBR)
+			return br_add_bridge(net, buf);
+
+		return br_del_bridge(net, buf);
+	}
+	}
+	return -EOPNOTSUPP;
+}
+
+int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	switch(cmd) {
+	case SIOCDEVPRIVATE:
+		return old_dev_ioctl(dev, rq, cmd);
+
+	case SIOCBRADDIF:
+	case SIOCBRDELIF:
+		return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
+
+	}
+
+	br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
+	return -EOPNOTSUPP;
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
new file mode 100644
index 00000000..e78269d7
--- /dev/null
+++ b/net/bridge/br_multicast.c
@@ -0,0 +1,1777 @@
+/*
+ * Bridge multicast support.
+ *
+ * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+#define mlock_dereference(X, br) \
+	rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline int ipv6_is_transient_multicast(const struct in6_addr *addr)
+{
+	if (ipv6_addr_is_multicast(addr) && IPV6_ADDR_MC_FLAG_TRANSIENT(addr))
+		return 1;
+	return 0;
+}
+#endif
+
+static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
+{
+	if (a->proto != b->proto)
+		return 0;
+	switch (a->proto) {
+	case htons(ETH_P_IP):
+		return a->u.ip4 == b->u.ip4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case htons(ETH_P_IPV6):
+		return ipv6_addr_equal(&a->u.ip6, &b->u.ip6);
+#endif
+	}
+	return 0;
+}
+
+static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip)
+{
+	return jhash_1word(mdb->secret, (__force u32)ip) & (mdb->max - 1);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb,
+				const struct in6_addr *ip)
+{
+	return jhash2((__force u32 *)ip->s6_addr32, 4, mdb->secret) & (mdb->max - 1);
+}
+#endif
+
+static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb,
+			     struct br_ip *ip)
+{
+	switch (ip->proto) {
+	case htons(ETH_P_IP):
+		return __br_ip4_hash(mdb, ip->u.ip4);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case htons(ETH_P_IPV6):
+		return __br_ip6_hash(mdb, &ip->u.ip6);
+#endif
+	}
+	return 0;
+}
+
+static struct net_bridge_mdb_entry *__br_mdb_ip_get(
+	struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash)
+{
+	struct net_bridge_mdb_entry *mp;
+	struct hlist_node *p;
+
+	hlist_for_each_entry_rcu(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) {
+		if (br_ip_equal(&mp->addr, dst))
+			return mp;
+	}
+
+	return NULL;
+}
+
+static struct net_bridge_mdb_entry *br_mdb_ip_get(
+	struct net_bridge_mdb_htable *mdb, struct br_ip *dst)
+{
+	if (!mdb)
+		return NULL;
+
+	return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst));
+}
+
+static struct net_bridge_mdb_entry *br_mdb_ip4_get(
+	struct net_bridge_mdb_htable *mdb, __be32 dst)
+{
+	struct br_ip br_dst;
+
+	br_dst.u.ip4 = dst;
+	br_dst.proto = htons(ETH_P_IP);
+
+	return br_mdb_ip_get(mdb, &br_dst);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static struct net_bridge_mdb_entry *br_mdb_ip6_get(
+	struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst)
+{
+	struct br_ip br_dst;
+
+	ipv6_addr_copy(&br_dst.u.ip6, dst);
+	br_dst.proto = htons(ETH_P_IPV6);
+
+	return br_mdb_ip_get(mdb, &br_dst);
+}
+#endif
+
+struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
+					struct sk_buff *skb)
+{
+	struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
+	struct br_ip ip;
+
+	if (br->multicast_disabled)
+		return NULL;
+
+	if (BR_INPUT_SKB_CB(skb)->igmp)
+		return NULL;
+
+	ip.proto = skb->protocol;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ip.u.ip4 = ip_hdr(skb)->daddr;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case htons(ETH_P_IPV6):
+		ipv6_addr_copy(&ip.u.ip6, &ipv6_hdr(skb)->daddr);
+		break;
+#endif
+	default:
+		return NULL;
+	}
+
+	return br_mdb_ip_get(mdb, &ip);
+}
+
+static void br_mdb_free(struct rcu_head *head)
+{
+	struct net_bridge_mdb_htable *mdb =
+		container_of(head, struct net_bridge_mdb_htable, rcu);
+	struct net_bridge_mdb_htable *old = mdb->old;
+
+	mdb->old = NULL;
+	kfree(old->mhash);
+	kfree(old);
+}
+
+static int br_mdb_copy(struct net_bridge_mdb_htable *new,
+		       struct net_bridge_mdb_htable *old,
+		       int elasticity)
+{
+	struct net_bridge_mdb_entry *mp;
+	struct hlist_node *p;
+	int maxlen;
+	int len;
+	int i;
+
+	for (i = 0; i < old->max; i++)
+		hlist_for_each_entry(mp, p, &old->mhash[i], hlist[old->ver])
+			hlist_add_head(&mp->hlist[new->ver],
+				       &new->mhash[br_ip_hash(new, &mp->addr)]);
+
+	if (!elasticity)
+		return 0;
+
+	maxlen = 0;
+	for (i = 0; i < new->max; i++) {
+		len = 0;
+		hlist_for_each_entry(mp, p, &new->mhash[i], hlist[new->ver])
+			len++;
+		if (len > maxlen)
+			maxlen = len;
+	}
+
+	return maxlen > elasticity ? -EINVAL : 0;
+}
+
+static void br_multicast_free_pg(struct rcu_head *head)
+{
+	struct net_bridge_port_group *p =
+		container_of(head, struct net_bridge_port_group, rcu);
+
+	kfree(p);
+}
+
+static void br_multicast_free_group(struct rcu_head *head)
+{
+	struct net_bridge_mdb_entry *mp =
+		container_of(head, struct net_bridge_mdb_entry, rcu);
+
+	kfree(mp);
+}
+
+static void br_multicast_group_expired(unsigned long data)
+{
+	struct net_bridge_mdb_entry *mp = (void *)data;
+	struct net_bridge *br = mp->br;
+	struct net_bridge_mdb_htable *mdb;
+
+	spin_lock(&br->multicast_lock);
+	if (!netif_running(br->dev) || timer_pending(&mp->timer))
+		goto out;
+
+	mp->mglist = false;
+
+	if (mp->ports)
+		goto out;
+
+	mdb = mlock_dereference(br->mdb, br);
+
+	hlist_del_rcu(&mp->hlist[mdb->ver]);
+	mdb->size--;
+
+	call_rcu_bh(&mp->rcu, br_multicast_free_group);
+
+out:
+	spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_del_pg(struct net_bridge *br,
+				struct net_bridge_port_group *pg)
+{
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_mdb_entry *mp;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
+
+	mdb = mlock_dereference(br->mdb, br);
+
+	mp = br_mdb_ip_get(mdb, &pg->addr);
+	if (WARN_ON(!mp))
+		return;
+
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (p != pg)
+			continue;
+
+		rcu_assign_pointer(*pp, p->next);
+		hlist_del_init(&p->mglist);
+		del_timer(&p->timer);
+		call_rcu_bh(&p->rcu, br_multicast_free_pg);
+
+		if (!mp->ports && !mp->mglist &&
+		    netif_running(br->dev))
+			mod_timer(&mp->timer, jiffies);
+
+		return;
+	}
+
+	WARN_ON(1);
+}
+
+static void br_multicast_port_group_expired(unsigned long data)
+{
+	struct net_bridge_port_group *pg = (void *)data;
+	struct net_bridge *br = pg->port->br;
+
+	spin_lock(&br->multicast_lock);
+	if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
+	    hlist_unhashed(&pg->mglist))
+		goto out;
+
+	br_multicast_del_pg(br, pg);
+
+out:
+	spin_unlock(&br->multicast_lock);
+}
+
+static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
+			 int elasticity)
+{
+	struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1);
+	struct net_bridge_mdb_htable *mdb;
+	int err;
+
+	mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC);
+	if (!mdb)
+		return -ENOMEM;
+
+	mdb->max = max;
+	mdb->old = old;
+
+	mdb->mhash = kzalloc(max * sizeof(*mdb->mhash), GFP_ATOMIC);
+	if (!mdb->mhash) {
+		kfree(mdb);
+		return -ENOMEM;
+	}
+
+	mdb->size = old ? old->size : 0;
+	mdb->ver = old ? old->ver ^ 1 : 0;
+
+	if (!old || elasticity)
+		get_random_bytes(&mdb->secret, sizeof(mdb->secret));
+	else
+		mdb->secret = old->secret;
+
+	if (!old)
+		goto out;
+
+	err = br_mdb_copy(mdb, old, elasticity);
+	if (err) {
+		kfree(mdb->mhash);
+		kfree(mdb);
+		return err;
+	}
+
+	call_rcu_bh(&mdb->rcu, br_mdb_free);
+
+out:
+	rcu_assign_pointer(*mdbp, mdb);
+
+	return 0;
+}
+
+static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
+						    __be32 group)
+{
+	struct sk_buff *skb;
+	struct igmphdr *ih;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+
+	skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
+						 sizeof(*ih) + 4);
+	if (!skb)
+		goto out;
+
+	skb->protocol = htons(ETH_P_IP);
+
+	skb_reset_mac_header(skb);
+	eth = eth_hdr(skb);
+
+	memcpy(eth->h_source, br->dev->dev_addr, 6);
+	eth->h_dest[0] = 1;
+	eth->h_dest[1] = 0;
+	eth->h_dest[2] = 0x5e;
+	eth->h_dest[3] = 0;
+	eth->h_dest[4] = 0;
+	eth->h_dest[5] = 1;
+	eth->h_proto = htons(ETH_P_IP);
+	skb_put(skb, sizeof(*eth));
+
+	skb_set_network_header(skb, skb->len);
+	iph = ip_hdr(skb);
+
+	iph->version = 4;
+	iph->ihl = 6;
+	iph->tos = 0xc0;
+	iph->tot_len = htons(sizeof(*iph) + sizeof(*ih) + 4);
+	iph->id = 0;
+	iph->frag_off = htons(IP_DF);
+	iph->ttl = 1;
+	iph->protocol = IPPROTO_IGMP;
+	iph->saddr = 0;
+	iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
+	((u8 *)&iph[1])[0] = IPOPT_RA;
+	((u8 *)&iph[1])[1] = 4;
+	((u8 *)&iph[1])[2] = 0;
+	((u8 *)&iph[1])[3] = 0;
+	ip_send_check(iph);
+	skb_put(skb, 24);
+
+	skb_set_transport_header(skb, skb->len);
+	ih = igmp_hdr(skb);
+	ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
+	ih->code = (group ? br->multicast_last_member_interval :
+			    br->multicast_query_response_interval) /
+		   (HZ / IGMP_TIMER_SCALE);
+	ih->group = group;
+	ih->csum = 0;
+	ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+	skb_put(skb, sizeof(*ih));
+
+	__skb_pull(skb, sizeof(*eth));
+
+out:
+	return skb;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
+						    const struct in6_addr *group)
+{
+	struct sk_buff *skb;
+	struct ipv6hdr *ip6h;
+	struct mld_msg *mldq;
+	struct ethhdr *eth;
+	u8 *hopopt;
+	unsigned long interval;
+
+	skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
+						 8 + sizeof(*mldq));
+	if (!skb)
+		goto out;
+
+	skb->protocol = htons(ETH_P_IPV6);
+
+	/* Ethernet header */
+	skb_reset_mac_header(skb);
+	eth = eth_hdr(skb);
+
+	memcpy(eth->h_source, br->dev->dev_addr, 6);
+	eth->h_proto = htons(ETH_P_IPV6);
+	skb_put(skb, sizeof(*eth));
+
+	/* IPv6 header + HbH option */
+	skb_set_network_header(skb, skb->len);
+	ip6h = ipv6_hdr(skb);
+
+	*(__force __be32 *)ip6h = htonl(0x60000000);
+	ip6h->payload_len = htons(8 + sizeof(*mldq));
+	ip6h->nexthdr = IPPROTO_HOPOPTS;
+	ip6h->hop_limit = 1;
+	ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
+	if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
+			       &ip6h->saddr)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+	ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
+
+	hopopt = (u8 *)(ip6h + 1);
+	hopopt[0] = IPPROTO_ICMPV6;		/* next hdr */
+	hopopt[1] = 0;				/* length of HbH */
+	hopopt[2] = IPV6_TLV_ROUTERALERT;	/* Router Alert */
+	hopopt[3] = 2;				/* Length of RA Option */
+	hopopt[4] = 0;				/* Type = 0x0000 (MLD) */
+	hopopt[5] = 0;
+	hopopt[6] = IPV6_TLV_PAD0;		/* Pad0 */
+	hopopt[7] = IPV6_TLV_PAD0;		/* Pad0 */
+
+	skb_put(skb, sizeof(*ip6h) + 8);
+
+	/* ICMPv6 */
+	skb_set_transport_header(skb, skb->len);
+	mldq = (struct mld_msg *) icmp6_hdr(skb);
+	/* General queries (::) get the query response interval, like IGMP */
+	interval = ipv6_addr_any(group) ? br->multicast_query_response_interval
+					: br->multicast_last_member_interval;
+
+	mldq->mld_type = ICMPV6_MGM_QUERY;
+	mldq->mld_code = 0;
+	mldq->mld_cksum = 0;
+	mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
+	mldq->mld_reserved = 0;
+	ipv6_addr_copy(&mldq->mld_mca, group);
+
+	/* checksum */
+	mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					  sizeof(*mldq), IPPROTO_ICMPV6,
+					  csum_partial(mldq,
+						       sizeof(*mldq), 0));
+	skb_put(skb, sizeof(*mldq));
+
+	__skb_pull(skb, sizeof(*eth));
+
+out:
+	return skb;
+}
+#endif
+
+static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
+						struct br_ip *addr)
+{
+	switch (addr->proto) {
+	case htons(ETH_P_IP):
+		return br_ip4_multicast_alloc_query(br, addr->u.ip4);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case htons(ETH_P_IPV6):
+		return br_ip6_multicast_alloc_query(br, &addr->u.ip6);
+#endif
+	}
+	return NULL;
+}
+
+static struct net_bridge_mdb_entry *br_multicast_get_group(
+	struct net_bridge *br, struct net_bridge_port *port,
+	struct br_ip *group, int hash)
+{
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_mdb_entry *mp;
+	struct hlist_node *p;
+	unsigned count = 0;
+	unsigned max;
+	int elasticity;
+	int err;
+
+	mdb = rcu_dereference_protected(br->mdb, 1);
+	hlist_for_each_entry(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) {
+		count++;
+		if (unlikely(br_ip_equal(group, &mp->addr)))
+			return mp;
+	}
+
+	elasticity = 0;
+	max = mdb->max;
+
+	if (unlikely(count > br->hash_elasticity && count)) {
+		if (net_ratelimit())
+			br_info(br, "Multicast hash table "
+				"chain limit reached: %s\n",
+				port ? port->dev->name : br->dev->name);
+
+		elasticity = br->hash_elasticity;
+	}
+
+	if (mdb->size >= max) {
+		max *= 2;
+		if (unlikely(max >= br->hash_max)) {
+			br_warn(br, "Multicast hash table maximum "
+				"reached, disabling snooping: %s, %d\n",
+				port ? port->dev->name : br->dev->name, max);
+			err = -E2BIG;
+disable:
+			br->multicast_disabled = 1;
+			goto err;
+		}
+	}
+
+	if (max > mdb->max || elasticity) {
+		if (mdb->old) {
+			if (net_ratelimit())
+				br_info(br, "Multicast hash table "
+					"on fire: %s\n",
+					port ? port->dev->name : br->dev->name);
+			err = -EEXIST;
+			goto err;
+		}
+
+		err = br_mdb_rehash(&br->mdb, max, elasticity);
+		if (err) {
+			br_warn(br, "Cannot rehash multicast "
+				"hash table, disabling snooping: %s, %d, %d\n",
+				port ? port->dev->name : br->dev->name,
+				mdb->size, err);
+			goto disable;
+		}
+
+		err = -EAGAIN;
+		goto err;
+	}
+
+	return NULL;
+
+err:
+	mp = ERR_PTR(err);
+	return mp;
+}
+
+static struct net_bridge_mdb_entry *br_multicast_new_group(
+	struct net_bridge *br, struct net_bridge_port *port,
+	struct br_ip *group)
+{
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_mdb_entry *mp;
+	int hash;
+	int err;
+
+	mdb = rcu_dereference_protected(br->mdb, 1);
+	if (!mdb) {
+		err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0);
+		if (err)
+			return ERR_PTR(err);
+		goto rehash;
+	}
+
+	hash = br_ip_hash(mdb, group);
+	mp = br_multicast_get_group(br, port, group, hash);
+	switch (PTR_ERR(mp)) {
+	case 0:
+		break;
+
+	case -EAGAIN:
+rehash:
+		mdb = rcu_dereference_protected(br->mdb, 1);
+		hash = br_ip_hash(mdb, group);
+		break;
+
+	default:
+		goto out;
+	}
+
+	mp = kzalloc(sizeof(*mp), GFP_ATOMIC);
+	if (unlikely(!mp))
+		return ERR_PTR(-ENOMEM);
+
+	mp->br = br;
+	mp->addr = *group;
+	setup_timer(&mp->timer, br_multicast_group_expired,
+		    (unsigned long)mp);
+
+	hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]);
+	mdb->size++;
+
+out:
+	return mp;
+}
+
+static int br_multicast_add_group(struct net_bridge *br,
+				  struct net_bridge_port *port,
+				  struct br_ip *group)
+{
+	struct net_bridge_mdb_entry *mp;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
+	unsigned long now = jiffies;
+	int err;
+
+	spin_lock(&br->multicast_lock);
+	if (!netif_running(br->dev) ||
+	    (port && port->state == BR_STATE_DISABLED))
+		goto out;
+
+	mp = br_multicast_new_group(br, port, group);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
+		goto err;
+
+	if (!port) {
+		mp->mglist = true;
+		mod_timer(&mp->timer, now + br->multicast_membership_interval);
+		goto out;
+	}
+
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (p->port == port)
+			goto found;
+		if ((unsigned long)p->port < (unsigned long)port)
+			break;
+	}
+
+	p = kzalloc(sizeof(*p), GFP_ATOMIC);
+	err = -ENOMEM;
+	if (unlikely(!p))
+		goto err;
+
+	p->addr = *group;
+	p->port = port;
+	p->next = *pp;
+	hlist_add_head(&p->mglist, &port->mglist);
+	setup_timer(&p->timer, br_multicast_port_group_expired,
+		    (unsigned long)p);
+
+	rcu_assign_pointer(*pp, p);
+
+found:
+	mod_timer(&p->timer, now + br->multicast_membership_interval);
+out:
+	err = 0;
+
+err:
+	spin_unlock(&br->multicast_lock);
+	return err;
+}
+
+static int br_ip4_multicast_add_group(struct net_bridge *br,
+				      struct net_bridge_port *port,
+				      __be32 group)
+{
+	struct br_ip br_group;
+
+	if (ipv4_is_local_multicast(group))
+		return 0;
+
+	br_group.u.ip4 = group;
+	br_group.proto = htons(ETH_P_IP);
+
+	return br_multicast_add_group(br, port, &br_group);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static int br_ip6_multicast_add_group(struct net_bridge *br,
+				      struct net_bridge_port *port,
+				      const struct in6_addr *group)
+{
+	struct br_ip br_group;
+
+	if (!ipv6_is_transient_multicast(group))
+		return 0;
+
+	ipv6_addr_copy(&br_group.u.ip6, group);
+	br_group.proto = htons(ETH_P_IPV6);
+
+	return br_multicast_add_group(br, port, &br_group);
+}
+#endif
+
+static void br_multicast_router_expired(unsigned long data)
+{
+	struct net_bridge_port *port = (void *)data;
+	struct net_bridge *br = port->br;
+
+	spin_lock(&br->multicast_lock);
+	if (port->multicast_router != 1 ||
+	    timer_pending(&port->multicast_router_timer) ||
+	    hlist_unhashed(&port->rlist))
+		goto out;
+
+	hlist_del_init_rcu(&port->rlist);
+
+out:
+	spin_unlock(&br->multicast_lock);
+}
+
+static void br_multicast_local_router_expired(unsigned long data)
+{
+}
+
+static void __br_multicast_send_query(struct net_bridge *br,
+				      struct net_bridge_port *port,
+				      struct br_ip *ip)
+{
+	struct sk_buff *skb;
+
+	skb = br_multicast_alloc_query(br, ip);
+	if (!skb)
+		return;
+
+	if (port) {
+		__skb_push(skb, sizeof(struct ethhdr));
+		skb->dev = port->dev;
+		NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
+			dev_queue_xmit);
+	} else
+		netif_rx(skb);
+}
+
+static void br_multicast_send_query(struct net_bridge *br,
+				    struct net_bridge_port *port, u32 sent)
+{
+	unsigned long time;
+	struct br_ip br_group;
+
+	if (!netif_running(br->dev) || br->multicast_disabled ||
+	    timer_pending(&br->multicast_querier_timer))
+		return;
+
+	memset(&br_group.u, 0, sizeof(br_group.u));
+
+	br_group.proto = htons(ETH_P_IP);
+	__br_multicast_send_query(br, port, &br_group);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	br_group.proto = htons(ETH_P_IPV6);
+	__br_multicast_send_query(br, port, &br_group);
+#endif
+
+	time = jiffies;
+	time += sent < br->multicast_startup_query_count ?
+		br->multicast_startup_query_interval :
+		br->multicast_query_interval;
+	mod_timer(port ? &port->multicast_query_timer :
+			 &br->multicast_query_timer, time);
+}
+
+static void br_multicast_port_query_expired(unsigned long data)
+{
+	struct net_bridge_port *port = (void *)data;
+	struct net_bridge *br = port->br;
+
+	spin_lock(&br->multicast_lock);
+	if (port->state == BR_STATE_DISABLED ||
+	    port->state == BR_STATE_BLOCKING)
+		goto out;
+
+	if (port->multicast_startup_queries_sent <
+	    br->multicast_startup_query_count)
+		port->multicast_startup_queries_sent++;
+
+	br_multicast_send_query(port->br, port,
+				port->multicast_startup_queries_sent);
+
+out:
+	spin_unlock(&br->multicast_lock);
+}
+
+void br_multicast_add_port(struct net_bridge_port *port)
+{
+	port->multicast_router = 1;
+
+	setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
+		    (unsigned long)port);
+	setup_timer(&port->multicast_query_timer,
+		    br_multicast_port_query_expired, (unsigned long)port);
+}
+
+void br_multicast_del_port(struct net_bridge_port *port)
+{
+	del_timer_sync(&port->multicast_router_timer);
+}
+
+static void __br_multicast_enable_port(struct net_bridge_port *port)
+{
+	port->multicast_startup_queries_sent = 0;
+
+	if (try_to_del_timer_sync(&port->multicast_query_timer) >= 0 ||
+	    del_timer(&port->multicast_query_timer))
+		mod_timer(&port->multicast_query_timer, jiffies);
+}
+
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+	struct net_bridge *br = port->br;
+
+	spin_lock(&br->multicast_lock);
+	if (br->multicast_disabled || !netif_running(br->dev))
+		goto out;
+
+	__br_multicast_enable_port(port);
+
+out:
+	spin_unlock(&br->multicast_lock);
+}
+
+void br_multicast_disable_port(struct net_bridge_port *port)
+{
+	struct net_bridge *br = port->br;
+	struct net_bridge_port_group *pg;
+	struct hlist_node *p, *n;
+
+	spin_lock(&br->multicast_lock);
+	hlist_for_each_entry_safe(pg, p, n, &port->mglist, mglist)
+		br_multicast_del_pg(br, pg);
+
+	if (!hlist_unhashed(&port->rlist))
+		hlist_del_init_rcu(&port->rlist);
+	del_timer(&port->multicast_router_timer);
+	del_timer(&port->multicast_query_timer);
+	spin_unlock(&br->multicast_lock);
+}
+
+static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
+					 struct net_bridge_port *port,
+					 struct sk_buff *skb)
+{
+	struct igmpv3_report *ih;
+	struct igmpv3_grec *grec;
+	int i;
+	int len;
+	int num;
+	int type;
+	int err = 0;
+	__be32 group;
+
+	if (!pskb_may_pull(skb, sizeof(*ih)))
+		return -EINVAL;
+
+	ih = igmpv3_report_hdr(skb);
+	num = ntohs(ih->ngrec);
+	len = sizeof(*ih);
+
+	for (i = 0; i < num; i++) {
+		len += sizeof(*grec);
+		if (!pskb_may_pull(skb, len))
+			return -EINVAL;
+
+		grec = (void *)(skb->data + len - sizeof(*grec));
+		group = grec->grec_mca;
+		type = grec->grec_type;
+
+		len += ntohs(grec->grec_nsrcs) * 4;
+		if (!pskb_may_pull(skb, len))
+			return -EINVAL;
+
+		/* We treat this as an IGMPv2 report for now. */
+		switch (type) {
+		case IGMPV3_MODE_IS_INCLUDE:
+		case IGMPV3_MODE_IS_EXCLUDE:
+		case IGMPV3_CHANGE_TO_INCLUDE:
+		case IGMPV3_CHANGE_TO_EXCLUDE:
+		case IGMPV3_ALLOW_NEW_SOURCES:
+		case IGMPV3_BLOCK_OLD_SOURCES:
+			break;
+
+		default:
+			continue;
+		}
+
+		err = br_ip4_multicast_add_group(br, port, group);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static int br_ip6_multicast_mld2_report(struct net_bridge *br,
+					struct net_bridge_port *port,
+					struct sk_buff *skb)
+{
+	struct icmp6hdr *icmp6h;
+	struct mld2_grec *grec;
+	int i;
+	int len;
+	int num;
+	int err = 0;
+
+	if (!pskb_may_pull(skb, sizeof(*icmp6h)))
+		return -EINVAL;
+
+	icmp6h = icmp6_hdr(skb);
+	num = ntohs(icmp6h->icmp6_dataun.un_data16[1]);
+	len = sizeof(*icmp6h);
+
+	for (i = 0; i < num; i++) {
+		__be16 *nsrcs, _nsrcs;
+
+		nsrcs = skb_header_pointer(skb,
+					   len + offsetof(struct mld2_grec,
+							  grec_nsrcs),
+					   sizeof(_nsrcs), &_nsrcs);
+		if (!nsrcs)
+			return -EINVAL;
+
+		if (!pskb_may_pull(skb,
+				   len + sizeof(*grec) +
+				   sizeof(struct in6_addr) * ntohs(*nsrcs)))
+			return -EINVAL;
+
+		grec = (struct mld2_grec *)(skb->data + len);
+		len += sizeof(*grec) +
+		       sizeof(struct in6_addr) * ntohs(*nsrcs);
+
+		/* We treat these as MLDv1 reports for now. */
+		switch (grec->grec_type) {
+		case MLD2_MODE_IS_INCLUDE:
+		case MLD2_MODE_IS_EXCLUDE:
+		case MLD2_CHANGE_TO_INCLUDE:
+		case MLD2_CHANGE_TO_EXCLUDE:
+		case MLD2_ALLOW_NEW_SOURCES:
+		case MLD2_BLOCK_OLD_SOURCES:
+			break;
+
+		default:
+			continue;
+		}
+
+		err = br_ip6_multicast_add_group(br, port, &grec->grec_mca);
+		if (err)	/* only bail out on failure */
+			break;
+	}
+
+	return err;
+}
+#endif
+
+/*
+ * Add port to router_list
+ *  The list is kept ordered by descending port pointer value
+ *  and is protected by br->multicast_lock and RCU.
+ *
+ *  A port that is already on the list is left where it is: linking the
+ *  same rlist node a second time would corrupt the list.
+ */
+static void br_multicast_add_router(struct net_bridge *br,
+				    struct net_bridge_port *port)
+{
+	struct net_bridge_port *p;
+	struct hlist_node *n, *slot = NULL;
+
+	/* Not every caller checks this before calling. */
+	if (!hlist_unhashed(&port->rlist))
+		return;
+
+	/* Find the last entry whose pointer value is above the new port's. */
+	hlist_for_each_entry(p, n, &br->router_list, rlist) {
+		if ((unsigned long) port >= (unsigned long) p)
+			break;
+		slot = n;
+	}
+
+	if (slot)
+		hlist_add_after_rcu(slot, &port->rlist);
+	else
+		hlist_add_head_rcu(&port->rlist, &br->router_list);
+}
+
+/*
+ * Refresh the multicast-router timer of @port, or of the bridge itself when
+ * @port is NULL.  Only acts when the relevant multicast_router setting is 1.
+ * A port that is not yet on the router list is added to it first.
+ */
+static void br_multicast_mark_router(struct net_bridge *br,
+				     struct net_bridge_port *port)
+{
+	unsigned long expires = jiffies + br->multicast_querier_interval;
+
+	if (!port) {
+		if (br->multicast_router == 1)
+			mod_timer(&br->multicast_router_timer, expires);
+		return;
+	}
+
+	if (port->multicast_router != 1)
+		return;
+
+	if (hlist_unhashed(&port->rlist))
+		br_multicast_add_router(br, port);
+
+	mod_timer(&port->multicast_router_timer, expires);
+}
+
+/*
+ * Bookkeeping for a received query.
+ *
+ * A non-zero @saddr re-arms the bridge's querier timer.  With a zero @saddr
+ * nothing is done while that timer is still pending.  In every other case
+ * the receiving side is marked as a multicast router.
+ */
+static void br_multicast_query_received(struct net_bridge *br,
+					struct net_bridge_port *port,
+					int saddr)
+{
+	if (!saddr) {
+		if (timer_pending(&br->multicast_querier_timer))
+			return;
+	} else {
+		mod_timer(&br->multicast_querier_timer,
+			  jiffies + br->multicast_querier_interval);
+	}
+
+	br_multicast_mark_router(br, port);
+}
+
+/*
+ * Process an IGMP membership query.  @port may be NULL.
+ *
+ * Runs under br->multicast_lock.  The query is first noted through
+ * br_multicast_query_received().  For a query that names a group, the
+ * timers of the matching mdb entry and of its port groups are then brought
+ * forward to now + max_delay * multicast_last_member_count.
+ *
+ * Returns 0, or -EINVAL when a query longer than struct igmphdr cannot be
+ * pulled as a struct igmpv3_query.
+ */
+static int br_ip4_multicast_query(struct net_bridge *br,
+				  struct net_bridge_port *port,
+				  struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct igmphdr *ih = igmp_hdr(skb);
+	struct net_bridge_mdb_entry *mp;
+	struct igmpv3_query *ih3;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
+	unsigned long max_delay;
+	unsigned long now = jiffies;
+	__be32 group;
+	int err = 0;
+
+	spin_lock(&br->multicast_lock);
+	if (!netif_running(br->dev) ||
+	    (port && port->state == BR_STATE_DISABLED))
+		goto out;
+
+	br_multicast_query_received(br, port, !!iph->saddr);
+
+	/* Read the group before any pull below can move the header. */
+	group = ih->group;
+
+	if (skb->len == sizeof(*ih)) {
+		/* Exactly one struct igmphdr: pre-IGMPv3 query layout. */
+		max_delay = ih->code * (HZ / IGMP_TIMER_SCALE);
+
+		/* A zero code gets a fixed 10 s delay and no group. */
+		if (!max_delay) {
+			max_delay = 10 * HZ;
+			group = 0;
+		}
+	} else {
+		/* Anything else is parsed as an IGMPv3 query. */
+		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		ih3 = igmpv3_query_hdr(skb);
+		/* Queries that carry a source list are ignored. */
+		if (ih3->nsrcs)
+			goto out;
+
+		max_delay = ih3->code ?
+			    IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1;
+	}
+
+	/* No group named: nothing more to do. */
+	if (!group)
+		goto out;
+
+	mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group);
+	if (!mp)
+		goto out;
+
+	max_delay *= br->multicast_last_member_count;
+
+	/*
+	 * Timers are only ever shortened here: re-arm when the timer is
+	 * pending with a later expiry, or when it is not pending and
+	 * try_to_del_timer_sync() does not report failure.
+	 */
+	if (mp->mglist &&
+	    (timer_pending(&mp->timer) ?
+	     time_after(mp->timer.expires, now + max_delay) :
+	     try_to_del_timer_sync(&mp->timer) >= 0))
+		mod_timer(&mp->timer, now + max_delay);
+
+	/* Same treatment for every port group of this entry. */
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (timer_pending(&p->timer) ?
+		    time_after(p->timer.expires, now + max_delay) :
+		    try_to_del_timer_sync(&p->timer) >= 0)
+			mod_timer(&p->timer, now + max_delay);
+	}
+
+out:
+	spin_unlock(&br->multicast_lock);
+	return err;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Process an MLD query.  @port may be NULL.
+ *
+ * Runs under br->multicast_lock.  The query is first noted through
+ * br_multicast_query_received().  For a query that names a group, the
+ * timers of the matching mdb entry and of its port groups are then brought
+ * forward to now + max_delay * multicast_last_member_count.
+ *
+ * Returns 0, or -EINVAL when the query header cannot be pulled.
+ */
+static int br_ip6_multicast_query(struct net_bridge *br,
+				  struct net_bridge_port *port,
+				  struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct mld_msg *mld = (struct mld_msg *) icmp6_hdr(skb);
+	struct net_bridge_mdb_entry *mp;
+	struct mld2_query *mld2q;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
+	unsigned long max_delay = 0;
+	unsigned long now = jiffies;
+	const struct in6_addr *group = NULL;
+	int err = 0;
+
+	spin_lock(&br->multicast_lock);
+	if (!netif_running(br->dev) ||
+	    (port && port->state == BR_STATE_DISABLED))
+		goto out;
+
+	br_multicast_query_received(br, port, !ipv6_addr_any(&ip6h->saddr));
+
+	if (skb->len == sizeof(*mld)) {
+		/* Exactly one struct mld_msg: MLDv1 query layout. */
+		if (!pskb_may_pull(skb, sizeof(*mld))) {
+			err = -EINVAL;
+			goto out;
+		}
+		mld = (struct mld_msg *) icmp6_hdr(skb);
+		/* Wire value is big-endian milliseconds. */
+		max_delay = msecs_to_jiffies(ntohs(mld->mld_maxdelay));
+		if (max_delay)
+			group = &mld->mld_mca;
+	} else if (skb->len >= sizeof(*mld2q)) {
+		/* Long enough for an MLDv2 query. */
+		if (!pskb_may_pull(skb, sizeof(*mld2q))) {
+			err = -EINVAL;
+			goto out;
+		}
+		mld2q = (struct mld2_query *)icmp6_hdr(skb);
+		/* Queries that carry a source list are ignored. */
+		if (!mld2q->mld2q_nsrcs)
+			group = &mld2q->mld2q_mca;
+		/*
+		 * The Maximum Response Code is a big-endian field that
+		 * decodes to milliseconds: byte-swap it before decoding and
+		 * convert the result to jiffies, as the MLDv1 branch does.
+		 * Never go below one jiffy.
+		 */
+		max_delay = max(msecs_to_jiffies(MLDV2_MRC(ntohs(mld2q->mld2q_mrc))),
+				1UL);
+	}
+
+	/* No group named (or unusable length): nothing more to do. */
+	if (!group)
+		goto out;
+
+	mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group);
+	if (!mp)
+		goto out;
+
+	max_delay *= br->multicast_last_member_count;
+
+	/*
+	 * Timers are only ever shortened here: re-arm when the timer is
+	 * pending with a later expiry, or when it is not pending and
+	 * try_to_del_timer_sync() does not report failure.
+	 */
+	if (mp->mglist &&
+	    (timer_pending(&mp->timer) ?
+	     time_after(mp->timer.expires, now + max_delay) :
+	     try_to_del_timer_sync(&mp->timer) >= 0))
+		mod_timer(&mp->timer, now + max_delay);
+
+	/* Same treatment for every port group of this entry. */
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
+		if (timer_pending(&p->timer) ?
+		    time_after(p->timer.expires, now + max_delay) :
+		    try_to_del_timer_sync(&p->timer) >= 0)
+			mod_timer(&p->timer, now + max_delay);
+	}
+
+out:
+	spin_unlock(&br->multicast_lock);
+	return err;
+}
+#endif
+
+/*
+ * Handle a leave for @group seen on @port (or on the bridge itself when
+ * @port is NULL).
+ *
+ * Nothing is done while the bridge is down, the port is disabled or the
+ * querier timer is pending.  Otherwise the membership timer of the matching
+ * entry is brought forward to
+ * now + multicast_last_member_count * multicast_last_member_interval.
+ * It is never pushed further out.
+ */
+static void br_multicast_leave_group(struct net_bridge *br,
+				     struct net_bridge_port *port,
+				     struct br_ip *group)
+{
+	struct net_bridge_mdb_entry *entry;
+	struct net_bridge_port_group *pg;
+	unsigned long deadline;
+	int rearm;
+
+	spin_lock(&br->multicast_lock);
+
+	if (!netif_running(br->dev))
+		goto unlock;
+	if (port && port->state == BR_STATE_DISABLED)
+		goto unlock;
+	if (timer_pending(&br->multicast_querier_timer))
+		goto unlock;
+
+	entry = br_mdb_ip_get(mlock_dereference(br->mdb, br), group);
+	if (!entry)
+		goto unlock;
+
+	deadline = jiffies + br->multicast_last_member_count *
+			     br->multicast_last_member_interval;
+
+	if (!port) {
+		/* Leave concerns the bridge's own membership. */
+		if (entry->mglist) {
+			if (timer_pending(&entry->timer))
+				rearm = time_after(entry->timer.expires,
+						   deadline);
+			else
+				rearm = try_to_del_timer_sync(&entry->timer) >= 0;
+
+			if (rearm)
+				mod_timer(&entry->timer, deadline);
+		}
+
+		goto unlock;
+	}
+
+	/* Locate the port group for this port; at most one is touched. */
+	for (pg = mlock_dereference(entry->ports, br);
+	     pg != NULL;
+	     pg = mlock_dereference(pg->next, br)) {
+		if (pg->port != port)
+			continue;
+
+		if (!hlist_unhashed(&pg->mglist)) {
+			if (timer_pending(&pg->timer))
+				rearm = time_after(pg->timer.expires,
+						   deadline);
+			else
+				rearm = try_to_del_timer_sync(&pg->timer) >= 0;
+
+			if (rearm)
+				mod_timer(&pg->timer, deadline);
+		}
+
+		break;
+	}
+
+unlock:
+	spin_unlock(&br->multicast_lock);
+}
+
+/*
+ * IPv4 front end for br_multicast_leave_group(): wraps @group in a br_ip
+ * key.  Groups for which ipv4_is_local_multicast() is true are ignored.
+ */
+static void br_ip4_multicast_leave_group(struct net_bridge *br,
+					 struct net_bridge_port *port,
+					 __be32 group)
+{
+	struct br_ip key;
+
+	if (!ipv4_is_local_multicast(group)) {
+		key.proto = htons(ETH_P_IP);
+		key.u.ip4 = group;
+
+		br_multicast_leave_group(br, port, &key);
+	}
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * IPv6 front end for br_multicast_leave_group(): wraps @group in a br_ip
+ * key.  Only groups for which ipv6_is_transient_multicast() is true are
+ * processed.
+ */
+static void br_ip6_multicast_leave_group(struct net_bridge *br,
+					 struct net_bridge_port *port,
+					 const struct in6_addr *group)
+{
+	struct br_ip key;
+
+	if (ipv6_is_transient_multicast(group)) {
+		key.proto = htons(ETH_P_IPV6);
+		ipv6_addr_copy(&key.u.ip6, group);
+
+		br_multicast_leave_group(br, port, &key);
+	}
+}
+#endif
+
+/*
+ * Snoop an IPv4 frame for IGMP.
+ *
+ * Validates the IP header, and for IGMP payloads verifies the checksum and
+ * dispatches on the IGMP type.  Results are reported through
+ * BR_INPUT_SKB_CB(skb): ->igmp is set for verified IGMP messages and
+ * ->mrouters_only for v1/v2 reports and for non-IGMP traffic whose
+ * destination is outside IGMP_LOCAL_GROUP.
+ *
+ * When the frame carries bytes beyond the IP total length, parsing is done
+ * on a trimmed clone which is freed before returning; @skb itself is left
+ * with its original data pointer.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int br_multicast_ipv4_rcv(struct net_bridge *br,
+				 struct net_bridge_port *port,
+				 struct sk_buff *skb)
+{
+	struct sk_buff *skb2 = skb;
+	const struct iphdr *iph;
+	struct igmphdr *ih;
+	unsigned len;
+	unsigned offset;
+	int err;
+
+	/* We treat OOM as packet loss for now. */
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+
+	if (iph->ihl < 5 || iph->version != 4)
+		return -EINVAL;
+
+	/* Make the whole header, options included, linear. */
+	if (!pskb_may_pull(skb, ip_hdrlen(skb)))
+		return -EINVAL;
+
+	/* Re-read: the pull may have moved the header. */
+	iph = ip_hdr(skb);
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		return -EINVAL;
+
+	if (iph->protocol != IPPROTO_IGMP) {
+		if ((iph->daddr & IGMP_LOCAL_GROUP_MASK) != IGMP_LOCAL_GROUP)
+			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+		return 0;
+	}
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < ip_hdrlen(skb))
+		return -EINVAL;
+
+	/* Trailing bytes past tot_len: work on a trimmed clone instead. */
+	if (skb->len > len) {
+		skb2 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb2)
+			return -ENOMEM;
+
+		err = pskb_trim_rcsum(skb2, len);
+		if (err)
+			goto err_out;
+	}
+
+	/* Advance to the IGMP header; undone at "out" by __skb_push(). */
+	len -= ip_hdrlen(skb2);
+	offset = skb_network_offset(skb2) + ip_hdrlen(skb2);
+	__skb_pull(skb2, offset);
+	skb_reset_transport_header(skb2);
+
+	err = -EINVAL;
+	if (!pskb_may_pull(skb2, sizeof(*ih)))
+		goto out;
+
+	/* Verify the IGMP checksum unless the device already did. */
+	switch (skb2->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (!csum_fold(skb2->csum))
+			break;
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb2->csum = 0;
+		if (skb_checksum_complete(skb2))
+			goto out;
+	}
+
+	err = 0;
+
+	/* Flags always go on the original skb, not on the clone. */
+	BR_INPUT_SKB_CB(skb)->igmp = 1;
+	ih = igmp_hdr(skb2);
+
+	switch (ih->type) {
+	case IGMP_HOST_MEMBERSHIP_REPORT:
+	case IGMPV2_HOST_MEMBERSHIP_REPORT:
+		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+		err = br_ip4_multicast_add_group(br, port, ih->group);
+		break;
+	case IGMPV3_HOST_MEMBERSHIP_REPORT:
+		err = br_ip4_multicast_igmp3_report(br, port, skb2);
+		break;
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		err = br_ip4_multicast_query(br, port, skb2);
+		break;
+	case IGMP_HOST_LEAVE_MESSAGE:
+		br_ip4_multicast_leave_group(br, port, ih->group);
+		break;
+	}
+
+out:
+	__skb_push(skb2, offset);
+err_out:
+	if (skb2 != skb)
+		kfree_skb(skb2);
+	return err;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Snoop an IPv6 frame for MLD.
+ *
+ * Frames that are not IPv6 with a hop-by-hop header followed (after any
+ * further extension headers) by ICMPv6 are passed over with 0.  MLD
+ * messages are parsed on a clone of @skb, checksum-verified and dispatched
+ * by ICMPv6 type.  Results are reported through BR_INPUT_SKB_CB(skb):
+ * ->igmp for verified MLD messages and ->mrouters_only for MLDv1 reports.
+ *
+ * The clone is always freed before returning.  Returns 0 on success or a
+ * negative errno.
+ */
+static int br_multicast_ipv6_rcv(struct net_bridge *br,
+				 struct net_bridge_port *port,
+				 struct sk_buff *skb)
+{
+	struct sk_buff *skb2;
+	const struct ipv6hdr *ip6h;
+	u8 icmp6_type;
+	u8 nexthdr;
+	unsigned len;
+	int offset;
+	int err;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h)))
+		return -EINVAL;
+
+	ip6h = ipv6_hdr(skb);
+
+	/*
+	 * We're interested in MLD messages only.
+	 *  - Version is 6
+	 *  - MLD always has the Router Alert hop-by-hop option
+	 *  - But we do not support jumbograms.
+	 */
+	if (ip6h->version != 6 ||
+	    ip6h->nexthdr != IPPROTO_HOPOPTS ||
+	    ip6h->payload_len == 0)
+		return 0;
+
+	/* Total IPv6 packet length according to the header. */
+	len = ntohs(ip6h->payload_len) + sizeof(*ip6h);
+	if (skb->len < len)
+		return -EINVAL;
+
+	nexthdr = ip6h->nexthdr;
+	offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr);
+
+	if (offset < 0 || nexthdr != IPPROTO_ICMPV6)
+		return 0;
+
+	/* Okay, we found ICMPv6 header */
+	skb2 = skb_clone(skb, GFP_ATOMIC);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = -EINVAL;
+	if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
+		goto out;
+
+	/* From here on len is the length of the ICMPv6 message. */
+	len -= offset - skb_network_offset(skb2);
+
+	__skb_pull(skb2, offset);
+	skb_reset_transport_header(skb2);
+
+	icmp6_type = icmp6_hdr(skb2)->icmp6_type;
+
+	switch (icmp6_type) {
+	case ICMPV6_MGM_QUERY:
+	case ICMPV6_MGM_REPORT:
+	case ICMPV6_MGM_REDUCTION:
+	case ICMPV6_MLD2_REPORT:
+		break;
+	default:
+		/* Some other ICMPv6 message: not an error, not for us. */
+		err = 0;
+		goto out;
+	}
+
+	/* Okay, we found MLD message. Check further. */
+	if (skb2->len > len) {
+		/* Drop bytes beyond the length the IPv6 header declares. */
+		err = pskb_trim_rcsum(skb2, len);
+		if (err)
+			goto out;
+		err = -EINVAL;
+	}
+
+	ip6h = ipv6_hdr(skb2);
+
+	/* Verify the ICMPv6 checksum, pseudo-header included. */
+	switch (skb2->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb2->len,
+					IPPROTO_ICMPV6, skb2->csum))
+			break;
+		/*FALLTHROUGH*/
+	case CHECKSUM_NONE:
+		skb2->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+							&ip6h->daddr,
+							skb2->len,
+							IPPROTO_ICMPV6, 0));
+		if (__skb_checksum_complete(skb2))
+			goto out;
+	}
+
+	err = 0;
+
+	/* Flags always go on the original skb, not on the clone. */
+	BR_INPUT_SKB_CB(skb)->igmp = 1;
+
+	switch (icmp6_type) {
+	case ICMPV6_MGM_REPORT:
+	    {
+		struct mld_msg *mld;
+		if (!pskb_may_pull(skb2, sizeof(*mld))) {
+			err = -EINVAL;
+			goto out;
+		}
+		mld = (struct mld_msg *)skb_transport_header(skb2);
+		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca);
+		break;
+	    }
+	case ICMPV6_MLD2_REPORT:
+		err = br_ip6_multicast_mld2_report(br, port, skb2);
+		break;
+	case ICMPV6_MGM_QUERY:
+		err = br_ip6_multicast_query(br, port, skb2);
+		break;
+	case ICMPV6_MGM_REDUCTION:
+	    {
+		struct mld_msg *mld;
+		if (!pskb_may_pull(skb2, sizeof(*mld))) {
+			err = -EINVAL;
+			goto out;
+		}
+		mld = (struct mld_msg *)skb_transport_header(skb2);
+		br_ip6_multicast_leave_group(br, port, &mld->mld_mca);
+	    }
+	}
+
+out:
+	kfree_skb(skb2);
+	return err;
+}
+#endif
+
+/*
+ * Entry point for multicast snooping on a received frame.
+ *
+ * Clears the igmp and mrouters_only flags in BR_INPUT_SKB_CB(skb), then,
+ * unless snooping is disabled on the bridge, hands the frame to the IPv4 or
+ * IPv6 handler according to skb->protocol.  Other protocols yield 0.
+ */
+int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
+		     struct sk_buff *skb)
+{
+	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
+	BR_INPUT_SKB_CB(skb)->igmp = 0;
+
+	if (br->multicast_disabled)
+		return 0;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return br_multicast_ipv4_rcv(br, port, skb);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return br_multicast_ipv6_rcv(br, port, skb);
+#endif
+
+	return 0;
+}
+
+/*
+ * Timer callback for br->multicast_query_timer; @data is the bridge.
+ *
+ * Counts startup queries up to multicast_startup_query_count and sends a
+ * query on behalf of the bridge (port == NULL), all under multicast_lock.
+ */
+static void br_multicast_query_expired(unsigned long data)
+{
+	struct net_bridge *br = (struct net_bridge *)data;
+
+	spin_lock(&br->multicast_lock);
+
+	if (br->multicast_startup_query_count >
+	    br->multicast_startup_queries_sent)
+		br->multicast_startup_queries_sent++;
+
+	br_multicast_send_query(br, NULL, br->multicast_startup_queries_sent);
+
+	spin_unlock(&br->multicast_lock);
+}
+
+/*
+ * Set the multicast snooping defaults of a bridge and initialise its lock
+ * and timers.  No timer is started here.
+ */
+void br_multicast_init(struct net_bridge *br)
+{
+	/* Hash table limits. */
+	br->hash_max = 512;
+	br->hash_elasticity = 4;
+
+	/* Counters and mode. */
+	br->multicast_router = 1;
+	br->multicast_startup_query_count = 2;
+	br->multicast_last_member_count = 2;
+
+	/* Intervals, in jiffies. */
+	br->multicast_last_member_interval = HZ;
+	br->multicast_query_response_interval = 10 * HZ;
+	br->multicast_startup_query_interval = 125 * HZ / 4;
+	br->multicast_query_interval = 125 * HZ;
+	br->multicast_querier_interval = 255 * HZ;
+	br->multicast_membership_interval = 260 * HZ;
+
+	spin_lock_init(&br->multicast_lock);
+
+	/* The two router-related timers share a handler and carry no data. */
+	setup_timer(&br->multicast_router_timer,
+		    br_multicast_local_router_expired, 0);
+	setup_timer(&br->multicast_querier_timer,
+		    br_multicast_local_router_expired, 0);
+	setup_timer(&br->multicast_query_timer,
+		    br_multicast_query_expired, (unsigned long)br);
+}
+
+/*
+ * Restart the startup-query sequence: reset the sent counter and, unless
+ * snooping is disabled, fire the query timer immediately.
+ */
+void br_multicast_open(struct net_bridge *br)
+{
+	br->multicast_startup_queries_sent = 0;
+
+	if (!br->multicast_disabled)
+		mod_timer(&br->multicast_query_timer, jiffies);
+}
+
+/*
+ * Shut down multicast snooping on a bridge: stop the bridge-level timers,
+ * detach the mdb hash table from the bridge and hand every entry, and then
+ * the table itself, to RCU for freeing.
+ */
+void br_multicast_stop(struct net_bridge *br)
+{
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_mdb_entry *mp;
+	struct hlist_node *p, *n;
+	u32 ver;
+	int i;
+
+	/* Done before taking the lock, which the query timer handler takes. */
+	del_timer_sync(&br->multicast_router_timer);
+	del_timer_sync(&br->multicast_querier_timer);
+	del_timer_sync(&br->multicast_query_timer);
+
+	spin_lock_bh(&br->multicast_lock);
+	mdb = mlock_dereference(br->mdb, br);
+	if (!mdb)
+		goto out;
+
+	/* Unpublish the table; from here on only this function refers to it. */
+	br->mdb = NULL;
+
+	/* Queue every entry of the current table version for freeing. */
+	ver = mdb->ver;
+	for (i = 0; i < mdb->max; i++) {
+		hlist_for_each_entry_safe(mp, p, n, &mdb->mhash[i],
+					  hlist[ver]) {
+			del_timer(&mp->timer);
+			call_rcu_bh(&mp->rcu, br_multicast_free_group);
+		}
+	}
+
+	/*
+	 * mdb->old is still set: wait for outstanding RCU-bh callbacks with
+	 * the lock dropped, after which it is expected to be clear.
+	 */
+	if (mdb->old) {
+		spin_unlock_bh(&br->multicast_lock);
+		rcu_barrier_bh();
+		spin_lock_bh(&br->multicast_lock);
+		WARN_ON(mdb->old);
+	}
+
+	/* Point old at the table itself before queueing br_mdb_free(). */
+	mdb->old = mdb;
+	call_rcu_bh(&mdb->rcu, br_mdb_free);
+
+out:
+	spin_unlock_bh(&br->multicast_lock);
+}
+
+/*
+ * Set the bridge-level multicast_router mode.
+ *
+ * @val must be 0, 1 or 2.  For 0 and 2 the bridge's router timer is
+ * stopped as well.  Returns 0, -ENOENT when the bridge device is not
+ * running, or -EINVAL for any other value.
+ */
+int br_multicast_set_router(struct net_bridge *br, unsigned long val)
+{
+	int err;
+
+	spin_lock_bh(&br->multicast_lock);
+
+	if (!netif_running(br->dev)) {
+		err = -ENOENT;
+	} else if (val > 2) {
+		err = -EINVAL;
+	} else {
+		if (val != 1)
+			del_timer(&br->multicast_router_timer);
+
+		br->multicast_router = val;
+		err = 0;
+	}
+
+	spin_unlock_bh(&br->multicast_lock);
+
+	return err;
+}
+
+/*
+ * Set the multicast_router mode of a port.
+ *
+ * @val must be 0, 1 or 2:
+ *   0 - take the port off the router list and stop its router timer;
+ *   1 - take the port off the router list, leave the timer alone;
+ *   2 - stop the router timer and put the port on the router list.
+ *
+ * Returns 0, -ENOENT when the bridge is down or the port is disabled, or
+ * -EINVAL for any other value.
+ */
+int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
+{
+	struct net_bridge *br = p->br;
+	int err;
+
+	spin_lock(&br->multicast_lock);
+
+	if (!netif_running(br->dev) || p->state == BR_STATE_DISABLED) {
+		err = -ENOENT;
+		goto unlock;
+	}
+
+	if (val > 2) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	p->multicast_router = val;
+	err = 0;
+
+	if (val != 2 && !hlist_unhashed(&p->rlist))
+		hlist_del_init_rcu(&p->rlist);
+
+	if (val != 1) {
+		del_timer(&p->multicast_router_timer);
+
+		if (val == 2)
+			br_multicast_add_router(br, p);
+	}
+
+unlock:
+	spin_unlock(&br->multicast_lock);
+
+	return err;
+}
+
+/*
+ * Enable (@val != 0) or disable (@val == 0) multicast snooping.
+ *
+ * Disabling only flips the flag.  Enabling on a running bridge also
+ * rehashes an existing mdb table, restarts the startup queries and
+ * re-enables snooping on every port that is neither disabled nor blocking.
+ * If the table cannot be rehashed the flag is rolled back.
+ *
+ * Returns 0, -EEXIST when the table still has an old version attached, or
+ * the error from br_mdb_rehash().
+ */
+int br_multicast_toggle(struct net_bridge *br, unsigned long val)
+{
+	struct net_bridge_mdb_htable *mdb;
+	struct net_bridge_port *port;
+	int err = 0;
+
+	spin_lock(&br->multicast_lock);
+
+	/* Already in the requested state. */
+	if (br->multicast_disabled == !val)
+		goto unlock;
+
+	br->multicast_disabled = !val;
+	if (br->multicast_disabled || !netif_running(br->dev))
+		goto unlock;
+
+	mdb = mlock_dereference(br->mdb, br);
+	if (mdb) {
+		if (mdb->old)
+			err = -EEXIST;
+		else
+			err = br_mdb_rehash(&br->mdb, mdb->max,
+					    br->hash_elasticity);
+
+		if (err) {
+			/* Roll the flag back to "disabled". */
+			br->multicast_disabled = !!val;
+			goto unlock;
+		}
+	}
+
+	br_multicast_open(br);
+	list_for_each_entry(port, &br->port_list, list) {
+		if (port->state != BR_STATE_DISABLED &&
+		    port->state != BR_STATE_BLOCKING)
+			__br_multicast_enable_port(port);
+	}
+
+unlock:
+	spin_unlock(&br->multicast_lock);
+
+	return err;
+}
+
+/*
+ * Change the upper bound of the mdb hash table size.
+ *
+ * @val must be a power of two and not smaller than the current number of
+ * entries (mdb->size).  An existing table is rehashed to the new bound;
+ * when that fails the old bound is restored.
+ *
+ * Returns 0, -ENOENT when the bridge is down, -EINVAL for an unacceptable
+ * value, -EEXIST when the table still has an old version attached, or the
+ * error from br_mdb_rehash().
+ */
+int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val)
+{
+	struct net_bridge_mdb_htable *mdb;
+	u32 saved;
+	int err;
+
+	spin_lock(&br->multicast_lock);
+
+	if (!netif_running(br->dev)) {
+		err = -ENOENT;
+		goto unlock;
+	}
+
+	if (!is_power_of_2(val)) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	mdb = mlock_dereference(br->mdb, br);
+	if (mdb && val < mdb->size) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	saved = br->hash_max;
+	br->hash_max = val;
+	err = 0;
+
+	if (mdb) {
+		if (mdb->old)
+			err = -EEXIST;
+		else
+			err = br_mdb_rehash(&br->mdb, br->hash_max,
+					    br->hash_elasticity);
+
+		/* Could not rehash: restore the previous bound. */
+		if (err)
+			br->hash_max = saved;
+	}
+
+unlock:
+	spin_unlock(&br->multicast_lock);
+
+	return err;
+}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
new file mode 100644
index 00000000..3dc7f544
--- /dev/null
+++ b/net/bridge/br_netfilter.c
@@ -0,0 +1,1037 @@
+/*
+ *	Handle firewalling
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *	Bart De Schuymer		<bdschuym@pandora.be>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/route.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+/*
+ * Original IPv4 destination address, kept in the bridge_skb_cb view of
+ * skb->nf_bridge->data so that a later comparison can tell whether the
+ * destination has been rewritten.
+ */
+#define skb_origaddr(skb)	 (((struct bridge_skb_cb *) \
+				 (skb->nf_bridge->data))->daddr.ipv4)
+/* Record the packet's current IPv4 destination as the original one. */
+#define store_orig_dstaddr(skb)	 (skb_origaddr(skb) = ip_hdr(skb)->daddr)
+/* True when the IPv4 destination no longer matches the recorded one. */
+#define dnat_took_place(skb)	 (skb_origaddr(skb) != ip_hdr(skb)->daddr)
+
+/*
+ * Feature switches.  With CONFIG_SYSCTL they are variables (the call_*
+ * switches start at 1, the filter_*_tagged switches at 0); without it they
+ * collapse to compile-time constants with the same values.
+ */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *brnf_sysctl_header;
+static int brnf_call_iptables __read_mostly = 1;
+static int brnf_call_ip6tables __read_mostly = 1;
+static int brnf_call_arptables __read_mostly = 1;
+static int brnf_filter_vlan_tagged __read_mostly = 0;
+static int brnf_filter_pppoe_tagged __read_mostly = 0;
+#else
+#define brnf_call_iptables 1
+#define brnf_call_ip6tables 1
+#define brnf_call_arptables 1
+#define brnf_filter_vlan_tagged 0
+#define brnf_filter_pppoe_tagged 0
+#endif
+
+/*
+ * Protocol carried inside a VLAN frame: skb->protocol when a VLAN tag is
+ * present in the skb metadata, the encapsulated protocol from the VLAN
+ * Ethernet header when skb->protocol is ETH_P_8021Q, and 0 otherwise.
+ */
+static inline __be16 vlan_proto(const struct sk_buff *skb)
+{
+	if (vlan_tx_tag_present(skb))
+		return skb->protocol;
+
+	if (skb->protocol != htons(ETH_P_8021Q))
+		return 0;
+
+	return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+}
+
+/*
+ * Tests for IPv4, IPv6 and ARP carried inside a VLAN frame.  All three are
+ * false unless brnf_filter_vlan_tagged is set.
+ */
+#define IS_VLAN_IP(skb) \
+	(vlan_proto(skb) == htons(ETH_P_IP) && \
+	 brnf_filter_vlan_tagged)
+
+#define IS_VLAN_IPV6(skb) \
+	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
+	 brnf_filter_vlan_tagged)
+
+#define IS_VLAN_ARP(skb) \
+	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
+	 brnf_filter_vlan_tagged)
+
+/*
+ * Read the 16-bit protocol field that follows the Ethernet and PPPoE
+ * headers, counted from the skb's MAC header.
+ */
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	const unsigned char *ppp = skb_mac_header(skb) + ETH_HLEN +
+				   sizeof(struct pppoe_hdr);
+
+	return *(const __be16 *)ppp;
+}
+
+/*
+ * Tests for IPv4 and IPv6 carried inside a PPPoE session frame.  Both are
+ * false unless brnf_filter_pppoe_tagged is set.
+ */
+#define IS_PPPOE_IP(skb) \
+	(skb->protocol == htons(ETH_P_PPP_SES) && \
+	 pppoe_proto(skb) == htons(PPP_IP) && \
+	 brnf_filter_pppoe_tagged)
+
+#define IS_PPPOE_IPV6(skb) \
+	(skb->protocol == htons(ETH_P_PPP_SES) && \
+	 pppoe_proto(skb) == htons(PPP_IPV6) && \
+	 brnf_filter_pppoe_tagged)
+
+/* update_pmtu hook of fake_dst_ops: does nothing. */
+static void fake_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+}
+
+/* cow_metrics hook of fake_dst_ops: never hands out a writable copy. */
+static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+	return NULL;
+}
+
+/*
+ * dst_ops installed on each bridge's fake_rtable by
+ * br_netfilter_rtable_init(); both hooks are the stubs defined above.
+ */
+static struct dst_ops fake_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		cpu_to_be16(ETH_P_IP),
+	.update_pmtu =		fake_update_pmtu,
+	.cow_metrics =		fake_cow_metrics,
+};
+
+/*
+ * Initialize bogus route table used to keep netfilter happy.
+ * Currently, we fill in the PMTU entry because netfilter
+ * refragmentation needs it, and the rt_flags entry because
+ * ipt_REJECT needs it.  Future netfilter modules might
+ * require us to fill additional fields.
+ *
+ * The default metrics below set only the MTU slot (1500); every other
+ * metric is left at zero.
+ */
+static const u32 br_dst_default_metrics[RTAX_MAX] = {
+	[RTAX_MTU - 1] = 1500,
+};
+
+/*
+ * Set up the per-bridge fake route entry: one reference, bound to the
+ * bridge device, its own path, read-only default metrics, DST_NOXFRM and
+ * the fake dst_ops.
+ */
+void br_netfilter_rtable_init(struct net_bridge *br)
+{
+	struct dst_entry *dst = &br->fake_rtable.dst;
+
+	atomic_set(&dst->__refcnt, 1);
+	dst->dev = br->dev;
+	dst->path = dst;
+	dst->ops = &fake_dst_ops;
+	dst->flags = DST_NOXFRM;
+	dst_init_metrics(dst, br_dst_default_metrics, true);
+}
+
+/*
+ * Fake route entry of the bridge that @dev is a port of, or NULL when
+ * br_port_get_rcu() finds no port for @dev.
+ */
+static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
+{
+	struct net_bridge_port *port = br_port_get_rcu(dev);
+
+	if (!port)
+		return NULL;
+
+	return &port->br->fake_rtable;
+}
+
+/*
+ * Bridge device that @dev is a port of, or NULL when br_port_get_rcu()
+ * finds no port for @dev.
+ */
+static inline struct net_device *bridge_parent(const struct net_device *dev)
+{
+	struct net_bridge_port *port = br_port_get_rcu(dev);
+
+	if (!port)
+		return NULL;
+
+	return port->br->dev;
+}
+
+/*
+ * Attach a freshly zeroed nf_bridge_info with a use count of 1 to @skb.
+ * On allocation failure skb->nf_bridge is set to NULL.  Returns the new
+ * skb->nf_bridge value.
+ */
+static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
+{
+	struct nf_bridge_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_ATOMIC);
+	skb->nf_bridge = info;
+	if (likely(info))
+		atomic_set(&info->use, 1);
+
+	return info;
+}
+
+/*
+ * Make sure @skb has an nf_bridge_info of its own.
+ *
+ * When the current one has more than one user, a copy with a use count of
+ * 1 is attached to @skb and the reference to the shared one is dropped.
+ * That reference is dropped even when the copy cannot be allocated, in
+ * which case NULL is returned.
+ */
+static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
+{
+	struct nf_bridge_info *shared = skb->nf_bridge;
+	struct nf_bridge_info *copy;
+
+	if (atomic_read(&shared->use) <= 1)
+		return shared;
+
+	copy = nf_bridge_alloc(skb);
+	if (copy) {
+		memcpy(copy, shared, sizeof(struct nf_bridge_info));
+		/* memcpy() also copied the use count; reset it. */
+		atomic_set(&copy->use, 1);
+	}
+	nf_bridge_put(shared);
+
+	return copy;
+}
+
+/*
+ * Re-expose the encapsulation header in front of the data and move the
+ * network header offset back by the same amount.
+ */
+static inline void nf_bridge_push_encap_header(struct sk_buff *skb)
+{
+	unsigned int encap_len = nf_bridge_encap_header_len(skb);
+
+	skb_push(skb, encap_len);
+	skb->network_header -= encap_len;
+}
+
+/*
+ * Step over the encapsulation header and move the network header offset
+ * forward by the same amount.
+ */
+static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
+{
+	unsigned int encap_len = nf_bridge_encap_header_len(skb);
+
+	skb_pull(skb, encap_len);
+	skb->network_header += encap_len;
+}
+
+/*
+ * Same as nf_bridge_pull_encap_header(), but pulls with skb_pull_rcsum().
+ */
+static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
+{
+	unsigned int encap_len = nf_bridge_encap_header_len(skb);
+
+	skb_pull_rcsum(skb, encap_len);
+	skb->network_header += encap_len;
+}
+
+/*
+ * Save the Ethernet header plus any encapsulation header, i.e. the bytes
+ * immediately in front of skb->data, into skb->nf_bridge->data.
+ */
+static inline void nf_bridge_save_header(struct sk_buff *skb)
+{
+	int saved_len = ETH_HLEN + nf_bridge_encap_header_len(skb);
+
+	skb_copy_from_linear_data_offset(skb, -saved_len,
+					 skb->nf_bridge->data, saved_len);
+}
+
+/*
+ * Restore skb->protocol to the encapsulating protocol recorded in the
+ * nf_bridge mask.  BRNF_8021Q takes precedence over BRNF_PPPoE; with
+ * neither flag set the protocol is left untouched.
+ */
+static inline void nf_bridge_update_protocol(struct sk_buff *skb)
+{
+	if (skb->nf_bridge->mask & BRNF_8021Q) {
+		skb->protocol = htons(ETH_P_8021Q);
+		return;
+	}
+
+	if (skb->nf_bridge->mask & BRNF_PPPoE)
+		skb->protocol = htons(ETH_P_PPP_SES);
+}
+
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ *
+ * Validates the IPv4 header (version, header length, checksum, total
+ * length), trims the skb to the IP total length, clears IPCB(skb) and, when
+ * options are present, compiles them and applies the source-route checks.
+ *
+ * Returns 0 when the packet is acceptable and -1 when it should be dropped.
+ * Header errors are counted as IPSTATS_MIB_INHDRERRORS; truncated packets
+ * and trim failures have their own counters.  The skb is not freed here.
+ */
+
+static int br_parse_ip_options(struct sk_buff *skb)
+{
+	struct ip_options *opt;
+	const struct iphdr *iph;
+	struct net_device *dev = skb->dev;
+	u32 len;
+
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+
+	/* Basic sanity checks */
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	/* Make the whole header, options included, linear. */
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	/* Re-read: the pull may have moved the header. */
+	iph = ip_hdr(skb);
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	/* Cut off anything beyond the IP total length. */
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	/* No options: nothing more to check. */
+	if (iph->ihl == 5)
+		return 0;
+
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+	if (ip_options_compile(dev_net(dev), opt, skb))
+		goto inhdr_error;
+
+	/* Check correct handling of SRR option */
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+		if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
+			goto drop;
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+
+inhdr_error:
+	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
+/* Fill in the header for fragmented IP packets handled by
+ * the IPv4 connection tracking code.
+ *
+ * Restores skb->protocol, makes sure there is writable headroom, copies the
+ * saved Ethernet + encapsulation header back in front of the data and
+ * re-exposes the encapsulation part.  Returns 0 or the skb_cow_head() error.
+ */
+int nf_bridge_copy_header(struct sk_buff *skb)
+{
+	unsigned int encap_len;
+	unsigned int header_size;
+	int err;
+
+	nf_bridge_update_protocol(skb);
+
+	encap_len = nf_bridge_encap_header_len(skb);
+	header_size = ETH_HLEN + encap_len;
+
+	err = skb_cow_head(skb, header_size);
+	if (err)
+		return err;
+
+	skb_copy_to_linear_data_offset(skb, -header_size,
+				       skb->nf_bridge->data, header_size);
+	__skb_push(skb, encap_len);
+
+	return 0;
+}
+
+/* PF_BRIDGE/PRE_ROUTING *********************************************/
+/* Undo the changes made for ip6tables PREROUTING and continue the
+ * bridge PRE_ROUTING hook.
+ *
+ * The skb is freed when the ingress device has no bridge route entry.
+ * Always returns 0.
+ */
+static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
+{
+	struct nf_bridge_info *info = skb->nf_bridge;
+	struct rtable *fake_rt;
+
+	/* Restore the packet type changed in setup_pre_routing(). */
+	if (info->mask & BRNF_PKT_TYPE) {
+		skb->pkt_type = PACKET_OTHERHOST;
+		info->mask ^= BRNF_PKT_TYPE;
+	}
+	info->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+
+	fake_rt = bridge_parent_rtable(info->physindev);
+	if (fake_rt == NULL) {
+		kfree_skb(skb);
+		return 0;
+	}
+	skb_dst_set_noref(skb, &fake_rt->dst);
+
+	/* Back to the physical ingress device and the original framing. */
+	skb->dev = info->physindev;
+	nf_bridge_update_protocol(skb);
+	nf_bridge_push_encap_header(skb);
+
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
+		       br_handle_frame_finish, 1);
+
+	return 0;
+}
+
+/* Obtain the correct destination MAC address, while preserving the original
+ * source MAC address. If we already know this address, we just copy it. If we
+ * don't, we use the neighbour framework to find out. In both cases, we make
+ * sure that br_handle_frame_finish() is called afterwards.
+ */
+static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct neighbour *neigh;
+	struct dst_entry *dst;
+
+	/* Switch to the bridge device; drop if the port has no bridge. */
+	skb->dev = bridge_parent(skb->dev);
+	if (!skb->dev)
+		goto free_skb;
+	dst = skb_dst(skb);
+	neigh = dst_get_neighbour(dst);
+	if (dst->hh) {
+		/* Cached hardware header available: use it, then go on. */
+		neigh_hh_bridge(dst->hh, skb);
+		skb->dev = nf_bridge->physindev;
+		return br_handle_frame_finish(skb);
+	} else if (neigh) {
+		/* the neighbour function below overwrites the complete
+		 * MAC header, so we save the Ethernet source address and
+		 * protocol number. */
+		skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), skb->nf_bridge->data, ETH_HLEN-ETH_ALEN);
+		/* tell br_dev_xmit to continue with forwarding */
+		nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+		return neigh->output(skb);
+	}
+	/* Neither a cached header nor a neighbour entry: drop. */
+free_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* This requires some explaining. If DNAT has taken place,
+ * we will need to fix up the destination Ethernet address.
+ *
+ * There are two cases to consider:
+ * 1. The packet was DNAT'ed to a device in the same bridge
+ *    port group as it was received on. We can still bridge
+ *    the packet.
+ * 2. The packet was DNAT'ed to a different device, either
+ *    a non-bridged device or another bridge port group.
+ *    The packet will need to be routed.
+ *
+ * The correct way of distinguishing between these two cases is to
+ * call ip_route_input() and to look at skb->dst->dev, which is
+ * changed to the destination device if ip_route_input() succeeds.
+ *
+ * Let's first consider the case that ip_route_input() succeeds:
+ *
+ * If the output device equals the logical bridge device the packet
+ * came in on, we can consider this bridging. The corresponding MAC
+ * address will be obtained in br_nf_pre_routing_finish_bridge.
+ * Otherwise, the packet is considered to be routed and we just
+ * change the destination MAC address so that the packet will
+ * later be passed up to the IP stack to be routed. For a redirected
+ * packet, ip_route_input() will give back the localhost as output device,
+ * which differs from the bridge device.
+ *
+ * Let's now consider the case that ip_route_input() fails:
+ *
+ * This can be because the destination address is martian, in which case
+ * the packet will be dropped.
+ * If IP forwarding is disabled, ip_route_input() will fail, while
+ * ip_route_output_key() can return success. The source
+ * address for ip_route_output_key() is set to zero, so ip_route_output_key()
+ * thinks we're handling a locally generated packet and won't care
+ * if IP forwarding is enabled. If the output device equals the logical bridge
+ * device, we proceed as if ip_route_input() succeeded. If it differs from the
+ * logical bridge port or if ip_route_output_key() fails we drop the packet.
+ */
+static int br_nf_pre_routing_finish(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct iphdr *iph = ip_hdr(skb);
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct rtable *rt;
+	int err;
+
+	/* Restore the packet type changed in setup_pre_routing(). */
+	if (nf_bridge->mask & BRNF_PKT_TYPE) {
+		skb->pkt_type = PACKET_OTHERHOST;
+		nf_bridge->mask ^= BRNF_PKT_TYPE;
+	}
+	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+	if (dnat_took_place(skb)) {
+		/* Destination was rewritten: ask the routing code. */
+		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+			struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+			/* If err equals -EHOSTUNREACH the error is due to a
+			 * martian destination or due to the fact that
+			 * forwarding is disabled. For most martian packets,
+			 * ip_route_output_key() will fail. It won't fail for 2 types of
+			 * martian destinations: loopback destinations and destination
+			 * 0.0.0.0. In both cases the packet will be dropped because the
+			 * destination is the loopback device and not the bridge. */
+			if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
+				goto free_skb;
+
+			/* Forwarding is off: retry as an output route lookup. */
+			rt = ip_route_output(dev_net(dev), iph->daddr, 0,
+					     RT_TOS(iph->tos), 0);
+			if (!IS_ERR(rt)) {
+				/* - Bridged-and-DNAT'ed traffic doesn't
+				 *   require ip_forwarding. */
+				if (rt->dst.dev == dev) {
+					skb_dst_set(skb, &rt->dst);
+					goto bridged_dnat;
+				}
+				/* Route leaves the bridge: release it and drop. */
+				ip_rt_put(rt);
+			}
+free_skb:
+			kfree_skb(skb);
+			return 0;
+		} else {
+			if (skb_dst(skb)->dev == dev) {
+				/* Output device is the bridge itself: keep bridging. */
+bridged_dnat:
+				skb->dev = nf_bridge->physindev;
+				nf_bridge_update_protocol(skb);
+				nf_bridge_push_encap_header(skb);
+				NF_HOOK_THRESH(NFPROTO_BRIDGE,
+					       NF_BR_PRE_ROUTING,
+					       skb, skb->dev, NULL,
+					       br_nf_pre_routing_finish_bridge,
+					       1);
+				return 0;
+			}
+			/* To be routed: address the frame to the bridge device. */
+			memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN);
+			skb->pkt_type = PACKET_HOST;
+		}
+	} else {
+		/* No DNAT: attach the bridge's fake route entry. */
+		rt = bridge_parent_rtable(nf_bridge->physindev);
+		if (!rt) {
+			kfree_skb(skb);
+			return 0;
+		}
+		skb_dst_set_noref(skb, &rt->dst);
+	}
+
+	/* Back to the physical ingress device and the original framing. */
+	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
+	nf_bridge_push_encap_header(skb);
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
+		       br_handle_frame_finish, 1);
+
+	return 0;
+}
+
+/* Preparation shared by the IPv4 and IPv6 PRE_ROUTING paths.
+ *
+ * Records the receiving port in nf_bridge->physindev, points skb->dev at
+ * whatever bridge_parent() returns for that port, and notes in
+ * nf_bridge->mask (a) that PRE_ROUTING is in progress, (b) whether the
+ * packet type was rewritten from PACKET_OTHERHOST to PACKET_HOST and
+ * (c) whether the frame carried an 802.1Q or PPPoE session protocol.
+ *
+ * Returns the new skb->dev. */
+static struct net_device *setup_pre_routing(struct sk_buff *skb)
+{
+	struct nf_bridge_info *info = skb->nf_bridge;
+	struct net_device *port_dev = skb->dev;
+
+	info->mask |= BRNF_NF_BRIDGE_PREROUTING;
+
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		info->mask |= BRNF_PKT_TYPE;
+		skb->pkt_type = PACKET_HOST;
+	}
+
+	/* Remember the encapsulation, if any. */
+	if (skb->protocol == htons(ETH_P_PPP_SES))
+		info->mask |= BRNF_PPPoE;
+	else if (skb->protocol == htons(ETH_P_8021Q))
+		info->mask |= BRNF_8021Q;
+
+	info->physindev = port_dev;
+	skb->dev = bridge_parent(port_dev);
+
+	return skb->dev;
+}
+
+/* Validate the length bookkeeping of the IPv6 hop-by-hop options header
+ * that directly follows the fixed IPv6 header.  We only check the length;
+ * a bridge shouldn't do any hop-by-hop stuff anyway.
+ *
+ * Every byte is bounds-checked before it is read: the two fixed bytes of
+ * the extension header must lie in the linear area before its length is
+ * looked at, and each option's type/length/value must fit in what is left
+ * of the header before it is examined.  A well-formed jumbo payload option
+ * additionally trims the skb to the length it announces.
+ *
+ * Returns 0 if the header is consistent, -1 otherwise. */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len;
+
+	/* raw[1] (the header length) may only be read once we know it is
+	 * part of the linear data. */
+	if ((raw + 2) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	len = (raw[1] + 1) << 3;
+
+	/* The whole extension header must be linear as well. */
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	/* Skip the next-header and length bytes. */
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen;
+
+		/* Pad1 is a lone type byte without a length field. */
+		if (nh[off] == IPV6_TLV_PAD0) {
+			off++;
+			len--;
+			continue;
+		}
+
+		/* Any other option has a length byte and must fit entirely
+		 * in the remainder of the header. */
+		if (len < 2)
+			goto bad;
+		optlen = nh[off + 1] + 2;
+		if (optlen > len)
+			goto bad;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			/* Trimming may have moved the header. */
+			nh = skb_network_header(skb);
+			break;
+		default:
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+
+	/* len can no longer go negative, so leaving the loop means the
+	 * options consumed the header exactly. */
+	return 0;
+bad:
+	return -1;
+
+}
+
+/* Replicate the checks that IPv6 does on packet reception and pass the packet
+ * to ip6tables, which doesn't support NAT, so things are fairly simple.
+ *
+ * Drops the packet (NF_DROP) when the header is too short, is not version
+ * 6, announces more payload than the skb holds, carries a malformed
+ * hop-by-hop header, or when the nf_bridge info / bridge device cannot be
+ * set up.  Otherwise the skb is handed to the IPv6 PRE_ROUTING hooks with
+ * br_nf_pre_routing_finish_ipv6 as continuation and NF_STOLEN is
+ * returned. */
+static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
+					   struct sk_buff *skb,
+					   const struct net_device *in,
+					   const struct net_device *out,
+					   int (*okfn)(struct sk_buff *))
+{
+	const struct ipv6hdr *hdr;
+	u32 pkt_len;
+
+	if (skb->len < sizeof(struct ipv6hdr))
+		return NF_DROP;
+
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		return NF_DROP;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		return NF_DROP;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	/* A zero payload length together with a hop-by-hop header is left
+	 * for check_hbh_len() to size (jumbo payload option). */
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+			return NF_DROP;
+		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+			return NF_DROP;
+		/* Trimming may have reallocated the header; reload the
+		 * pointer before dereferencing it again. */
+		hdr = ipv6_hdr(skb);
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		return NF_DROP;
+
+	/* Start from a fresh nf_bridge info block. */
+	nf_bridge_put(skb->nf_bridge);
+	if (!nf_bridge_alloc(skb))
+		return NF_DROP;
+	if (!setup_pre_routing(skb))
+		return NF_DROP;
+
+	skb->protocol = htons(ETH_P_IPV6);
+	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+		br_nf_pre_routing_finish_ipv6);
+
+	return NF_STOLEN;
+}
+
+/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
+ * Replicate the checks that IPv4 does on packet reception.
+ * Set skb->dev to the bridge device (i.e. parent of the
+ * receiving device) to make netfilter happy, the REDIRECT
+ * target in particular.  Save the original destination IP
+ * address to be able to detect DNAT afterwards.
+ *
+ * Returns NF_ACCEPT for traffic this hook does not handle (other protocols,
+ * or the relevant iptables/ip6tables switch is off both globally and on
+ * the bridge), NF_DROP on validation or setup failure, and NF_STOLEN once
+ * the skb has been passed to the inet PRE_ROUTING hooks. */
+static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	__u32 len = nf_bridge_encap_header_len(skb);
+
+	/* The encapsulation header must be linear before the IS_VLAN_xxx /
+	 * IS_PPPOE_xxx tests and the pull below look at it. */
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return NF_DROP;
+
+	/* Only frames received on a bridge port are handled. */
+	p = br_port_get_rcu(in);
+	if (p == NULL)
+		return NF_DROP;
+	br = p->br;
+
+	if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
+	    IS_PPPOE_IPV6(skb)) {
+		/* Skip unless ip6tables is enabled globally or per bridge. */
+		if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+			return NF_ACCEPT;
+
+		nf_bridge_pull_encap_header_rcsum(skb);
+		return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
+	}
+
+	/* Skip unless iptables is enabled globally or per bridge. */
+	if (!brnf_call_iptables && !br->nf_call_iptables)
+		return NF_ACCEPT;
+
+	if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) &&
+	    !IS_PPPOE_IP(skb))
+		return NF_ACCEPT;
+
+	nf_bridge_pull_encap_header_rcsum(skb);
+
+	/* A non-zero return from br_parse_ip_options() drops the packet. */
+	if (br_parse_ip_options(skb))
+		return NF_DROP;
+
+	/* Start from a fresh nf_bridge info block. */
+	nf_bridge_put(skb->nf_bridge);
+	if (!nf_bridge_alloc(skb))
+		return NF_DROP;
+	if (!setup_pre_routing(skb))
+		return NF_DROP;
+	store_orig_dstaddr(skb);
+	skb->protocol = htons(ETH_P_IP);
+
+	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+		br_nf_pre_routing_finish);
+
+	return NF_STOLEN;
+}
+
+
+/* PF_BRIDGE/LOCAL_IN ************************************************/
+/* A locally destined packet needs a real dst_entry, so the fake one
+ * belonging to the bridge is detached here.  On its way up the packet
+ * would traverse PRE_ROUTING a second time (it already did when it
+ * entered the bridge); the IPv4 PRE_ROUTING 'sabotage' hook we register
+ * prevents that.  Always returns NF_ACCEPT. */
+static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out,
+				   int (*okfn)(struct sk_buff *))
+{
+	const struct rtable *attached = skb_rtable(skb);
+
+	if (attached == NULL)
+		return NF_ACCEPT;
+
+	/* Only drop the dst if it is the bridge's own rtable. */
+	if (attached == bridge_parent_rtable(in))
+		skb_dst_drop(skb);
+
+	return NF_ACCEPT;
+}
+
+/* PF_BRIDGE/FORWARD *************************************************/
+/* Continuation run after the inet/ARP FORWARD hooks: undo the temporary
+ * changes made for them and resume the bridge FORWARD chain, ending in
+ * br_forward_finish.  Always returns 0. */
+static int br_nf_forward_finish(struct sk_buff *skb)
+{
+	struct nf_bridge_info *info = skb->nf_bridge;
+	struct net_device *indev;
+
+	if (skb->protocol == htons(ETH_P_ARP) || IS_VLAN_ARP(skb)) {
+		/* br_nf_forward_arp() parked the input device in skb->cb. */
+		indev = *((struct net_device **)(skb->cb));
+	} else {
+		indev = info->physindev;
+		if (info->mask & BRNF_PKT_TYPE) {
+			info->mask &= ~BRNF_PKT_TYPE;
+			skb->pkt_type = PACKET_OTHERHOST;
+		}
+		nf_bridge_update_protocol(skb);
+	}
+
+	nf_bridge_push_encap_header(skb);
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev,
+		       skb->dev, br_forward_finish, 1);
+
+	return 0;
+}
+
+/* This is the 'purely bridged' case.  For IP, we pass the packet to
+ * netfilter with indev and outdev set to the bridge device,
+ * but we are still able to filter on the 'real' indev/outdev
+ * because of the physdev module. For ARP, indev and outdev are the
+ * bridge ports.
+ *
+ * Returns NF_ACCEPT when the skb carries no nf_bridge info or is neither
+ * IPv4 nor IPv6, NF_DROP on failure, and NF_STOLEN once the skb has been
+ * passed to the inet FORWARD hooks (continuation: br_nf_forward_finish). */
+static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     int (*okfn)(struct sk_buff *))
+{
+	struct nf_bridge_info *nf_bridge;
+	struct net_device *parent;
+	u_int8_t pf;
+
+	if (!skb->nf_bridge)
+		return NF_ACCEPT;
+
+	/* Need exclusive nf_bridge_info since we might have multiple
+	 * different physoutdevs. */
+	if (!nf_bridge_unshare(skb))
+		return NF_DROP;
+
+	parent = bridge_parent(out);
+	if (!parent)
+		return NF_DROP;
+
+	/* Pick the netfilter family from the (possibly encapsulated)
+	 * payload protocol. */
+	if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) ||
+	    IS_PPPOE_IP(skb))
+		pf = PF_INET;
+	else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
+		 IS_PPPOE_IPV6(skb))
+		pf = PF_INET6;
+	else
+		return NF_ACCEPT;
+
+	nf_bridge_pull_encap_header(skb);
+
+	/* nf_bridge_unshare() may have replaced skb->nf_bridge, so read it
+	 * only now.  The packet type is restored in br_nf_forward_finish(). */
+	nf_bridge = skb->nf_bridge;
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	if (pf == PF_INET && br_parse_ip_options(skb))
+		return NF_DROP;
+
+	/* The physdev module checks on this */
+	nf_bridge->mask |= BRNF_BRIDGED;
+	nf_bridge->physoutdev = skb->dev;
+	if (pf == PF_INET)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
+
+	NF_HOOK(pf, NF_INET_FORWARD, skb, bridge_parent(in), parent,
+		br_nf_forward_finish);
+
+	return NF_STOLEN;
+}
+
+/* Bridge FORWARD hook for ARP (plain or VLAN-encapsulated).
+ *
+ * Returns NF_ACCEPT when @out is not a bridge port, when arptables is
+ * switched off both globally and on the bridge, when the frame is not ARP,
+ * or when the ARP protocol address length is not 4.  Otherwise the input
+ * device is parked in skb->cb for br_nf_forward_finish(), the skb goes to
+ * the ARP FORWARD hooks and NF_STOLEN is returned. */
+static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	struct net_device **saved_in = (struct net_device **)(skb->cb);
+	struct net_bridge_port *port = br_port_get_rcu(out);
+
+	if (!port)
+		return NF_ACCEPT;
+
+	if (!(brnf_call_arptables || port->br->nf_call_arptables))
+		return NF_ACCEPT;
+
+	if (skb->protocol != htons(ETH_P_ARP)) {
+		if (!IS_VLAN_ARP(skb))
+			return NF_ACCEPT;
+		nf_bridge_pull_encap_header(skb);
+	}
+
+	if (arp_hdr(skb)->ar_pln != 4) {
+		/* Not handled: put the VLAN header back before accepting. */
+		if (IS_VLAN_ARP(skb))
+			nf_bridge_push_encap_header(skb);
+		return NF_ACCEPT;
+	}
+
+	*saved_in = (struct net_device *)in;
+	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in,
+		(struct net_device *)out, br_nf_forward_finish);
+
+	return NF_STOLEN;
+}
+
+#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
+/* Continuation of the POST_ROUTING hooks: transmit @skb through
+ * br_dev_queue_push_xmit(), fragmenting it first when it is a
+ * connection-tracked, non-GSO IPv4 packet that no longer fits the MTU of
+ * the outgoing device.
+ *
+ * This is an okfn, not a netfilter hook: it owns the skb, so an invalid
+ * packet must be freed here rather than signalled with NF_DROP (nobody
+ * would free it).  Returns the result of the transmit/fragment call, or 0
+ * when the packet was dropped. */
+static int br_nf_dev_queue_xmit(struct sk_buff *skb)
+{
+	int ret;
+
+	if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		if (br_parse_ip_options(skb)) {
+			/* Drop invalid packet */
+			kfree_skb(skb);
+			return 0;
+		}
+		ret = ip_fragment(skb, br_dev_queue_push_xmit);
+	} else
+		ret = br_dev_queue_push_xmit(skb);
+
+	return ret;
+}
+#else
+/* Without IPv4 connection tracking nothing needs refragmenting. */
+static int br_nf_dev_queue_xmit(struct sk_buff *skb)
+{
+	return br_dev_queue_push_xmit(skb);
+}
+#endif
+
+/* PF_BRIDGE/POST_ROUTING ********************************************/
+/* Run the inet POST_ROUTING hooks for bridged IPv4/IPv6 traffic.
+ *
+ * Packets without nf_bridge info, without the BRNF_BRIDGED mark, or of
+ * another protocol are accepted untouched; a port without a parent bridge
+ * device causes a drop.  Otherwise the skb is handed to the inet hooks
+ * with br_nf_dev_queue_xmit as continuation and NF_STOLEN is returned. */
+static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	struct nf_bridge_info *info = skb->nf_bridge;
+	struct net_device *brdev = bridge_parent(skb->dev);
+	u_int8_t family;
+
+	if (info == NULL)
+		return NF_ACCEPT;
+	if ((info->mask & BRNF_BRIDGED) == 0)
+		return NF_ACCEPT;
+
+	if (brdev == NULL)
+		return NF_DROP;
+
+	if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) ||
+	    IS_PPPOE_IP(skb)) {
+		family = PF_INET;
+	} else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
+		   IS_PPPOE_IPV6(skb)) {
+		family = PF_INET6;
+	} else {
+		return NF_ACCEPT;
+	}
+
+	/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
+	 * about the value of skb->pkt_type. */
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		info->mask |= BRNF_PKT_TYPE;
+		skb->pkt_type = PACKET_HOST;
+	}
+
+	nf_bridge_pull_encap_header(skb);
+	nf_bridge_save_header(skb);
+	skb->protocol = (family == PF_INET) ? htons(ETH_P_IP)
+					    : htons(ETH_P_IPV6);
+
+	NF_HOOK(family, NF_INET_POST_ROUTING, skb, NULL, brdev,
+		br_nf_dev_queue_xmit);
+
+	return NF_STOLEN;
+}
+
+/* IP/SABOTAGE *****************************************************/
+/* Keep locally destined packets from traversing PF_INET(6)/PRE_ROUTING a
+ * second time: an skb that carries nf_bridge info but is not currently
+ * inside the bridge-initiated PRE_ROUTING run gets NF_STOP; everything
+ * else gets NF_ACCEPT. */
+static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out,
+				   int (*okfn)(struct sk_buff *))
+{
+	const struct nf_bridge_info *info = skb->nf_bridge;
+
+	if (info == NULL)
+		return NF_ACCEPT;
+
+	return (info->mask & BRNF_NF_BRIDGE_PREROUTING) ? NF_ACCEPT : NF_STOP;
+}
+
+/* Hook table registered as a whole by br_netfilter_init() and removed by
+ * br_netfilter_fini(): five PF_BRIDGE hooks plus the PRE_ROUTING
+ * 'sabotage' hook for PF_INET and PF_INET6.
+ *
+ * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
+ * br_dev_queue_push_xmit is called afterwards */
+static struct nf_hook_ops br_nf_ops[] __read_mostly = {
+	{
+		.hook = br_nf_pre_routing,
+		.owner = THIS_MODULE,
+		.pf = PF_BRIDGE,
+		.hooknum = NF_BR_PRE_ROUTING,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		.hook = br_nf_local_in,
+		.owner = THIS_MODULE,
+		.pf = PF_BRIDGE,
+		.hooknum = NF_BR_LOCAL_IN,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		/* Numerically one below the ARP hook's priority on the
+		 * same hook point. */
+		.hook = br_nf_forward_ip,
+		.owner = THIS_MODULE,
+		.pf = PF_BRIDGE,
+		.hooknum = NF_BR_FORWARD,
+		.priority = NF_BR_PRI_BRNF - 1,
+	},
+	{
+		.hook = br_nf_forward_arp,
+		.owner = THIS_MODULE,
+		.pf = PF_BRIDGE,
+		.hooknum = NF_BR_FORWARD,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		.hook = br_nf_post_routing,
+		.owner = THIS_MODULE,
+		.pf = PF_BRIDGE,
+		.hooknum = NF_BR_POST_ROUTING,
+		.priority = NF_BR_PRI_LAST,
+	},
+	{
+		.hook = ip_sabotage_in,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_PRE_ROUTING,
+		.priority = NF_IP_PRI_FIRST,
+	},
+	{
+		.hook = ip_sabotage_in,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_PRE_ROUTING,
+		.priority = NF_IP6_PRI_FIRST,
+	},
+};
+
+#ifdef CONFIG_SYSCTL
+/* sysctl handler shared by all bridge-nf-* entries: integer I/O is left to
+ * proc_dointvec(), after which any non-zero value that was written is
+ * normalised to 1.  Returns whatever proc_dointvec() returned. */
+static
+int brnf_sysctl_call_tables(ctl_table * ctl, int write,
+			    void __user * buffer, size_t * lenp, loff_t * ppos)
+{
+	int *value = ctl->data;
+	int rc = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (!write)
+		return rc;
+
+	if (*value != 0)
+		*value = 1;
+
+	return rc;
+}
+
+/* sysctl entries registered under the path in brnf_path.  Each one exposes
+ * an int switch with mode 0644 and goes through brnf_sysctl_call_tables(),
+ * so a written non-zero value is stored as 1.  The empty entry terminates
+ * the table. */
+static ctl_table brnf_table[] = {
+	{
+		.procname	= "bridge-nf-call-arptables",
+		.data		= &brnf_call_arptables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-call-iptables",
+		.data		= &brnf_call_iptables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-call-ip6tables",
+		.data		= &brnf_call_ip6tables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-filter-vlan-tagged",
+		.data		= &brnf_filter_vlan_tagged,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-filter-pppoe-tagged",
+		.data		= &brnf_filter_pppoe_tagged,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{ }
+};
+
+/* sysctl directory path ("net", then "bridge") under which brnf_table is
+ * registered; the empty entry terminates the path. */
+static struct ctl_path brnf_path[] = {
+	{ .procname = "net", },
+	{ .procname = "bridge", },
+	{ }
+};
+#endif
+
+/* Set up bridge netfilter: initialise the fake dst_ops counters, register
+ * the hook table and, with CONFIG_SYSCTL, the bridge-nf-* sysctls.
+ * Whatever has already been set up is torn down again on failure.
+ *
+ * Returns 0 on success, the error from dst_entries_init() or
+ * nf_register_hooks(), or -ENOMEM if sysctl registration fails. */
+int __init br_netfilter_init(void)
+{
+	int err;
+
+	err = dst_entries_init(&fake_dst_ops);
+	if (err < 0)
+		return err;
+
+	err = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	if (err < 0)
+		goto out_dst;
+
+#ifdef CONFIG_SYSCTL
+	brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
+	if (brnf_sysctl_header == NULL) {
+		printk(KERN_WARNING
+		       "br_netfilter: can't register to sysctl.\n");
+		err = -ENOMEM;
+		goto out_hooks;
+	}
+#endif
+	printk(KERN_NOTICE "Bridge firewalling registered\n");
+	return 0;
+
+#ifdef CONFIG_SYSCTL
+out_hooks:
+	nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+#endif
+out_dst:
+	dst_entries_destroy(&fake_dst_ops);
+	return err;
+}
+
+/* Undo br_netfilter_init(): unregister the hook table, then (with
+ * CONFIG_SYSCTL) the sysctl table, then destroy the fake dst_ops
+ * counters. */
+void br_netfilter_fini(void)
+{
+	nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(brnf_sysctl_header);
+#endif
+	dst_entries_destroy(&fake_dst_ops);
+}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
new file mode 100644
index 00000000..2c160552
--- /dev/null
+++ b/net/bridge/br_netlink.c
@@ -0,0 +1,252 @@
+/*
+ *	Bridge netlink control interface
+ *
+ *	Authors:
+ *	Stephen Hemminger		<shemminger@osdl.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/etherdevice.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "br_private.h"
+
+/* Upper bound on the size of one message built by br_fill_ifinfo(): the
+ * aligned ifinfomsg header plus room for every attribute it may emit. */
+static inline size_t br_nlmsg_size(void)
+{
+	size_t sz = NLMSG_ALIGN(sizeof(struct ifinfomsg));
+
+	sz += nla_total_size(IFNAMSIZ);		/* IFLA_IFNAME */
+	sz += nla_total_size(MAX_ADDR_LEN);	/* IFLA_ADDRESS */
+	sz += nla_total_size(4);		/* IFLA_MASTER */
+	sz += nla_total_size(4);		/* IFLA_MTU */
+	sz += nla_total_size(4);		/* IFLA_LINK */
+	sz += nla_total_size(1);		/* IFLA_OPERSTATE */
+	sz += nla_total_size(1);		/* IFLA_PROTINFO */
+
+	return sz;
+}
+
+/*
+ * Create one netlink message for one interface
+ * Contains port and master info as well as carrier and bridge state.
+ *
+ * @skb:   message buffer to append to
+ * @port:  bridge port to describe
+ * @pid, @seq, @event, @flags: passed through to nlmsg_put()
+ *
+ * Returns the result of nlmsg_end() on success, or -EMSGSIZE if the header
+ * or an attribute does not fit (the partial message is cancelled).
+ */
+static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port,
+			  u32 pid, u32 seq, int event, unsigned int flags)
+{
+	const struct net_bridge *br = port->br;
+	const struct net_device *dev = port->dev;
+	struct ifinfomsg *hdr;
+	struct nlmsghdr *nlh;
+	/* A device that is not running is reported as IF_OPER_DOWN. */
+	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+
+	br_debug(br, "br_fill_info event %d port %s master %s\n",
+		     event, dev->name, br->dev->name);
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	hdr = nlmsg_data(nlh);
+	hdr->ifi_family = AF_BRIDGE;
+	hdr->__ifi_pad = 0;
+	hdr->ifi_type = dev->type;
+	hdr->ifi_index = dev->ifindex;
+	hdr->ifi_flags = dev_get_flags(dev);
+	hdr->ifi_change = 0;
+
+	/* NOTE(review): the NLA_PUT* macros presumably jump to
+	 * nla_put_failure when the attribute does not fit - the label has
+	 * no explicit goto in this function. */
+	NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
+	NLA_PUT_U32(skb, IFLA_MASTER, br->dev->ifindex);
+	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U8(skb, IFLA_OPERSTATE, operstate);
+
+	if (dev->addr_len)
+		NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+
+	if (dev->ifindex != dev->iflink)
+		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+
+	/* The port state is only reported for RTM_NEWLINK. */
+	if (event == RTM_NEWLINK)
+		NLA_PUT_U8(skb, IFLA_PROTINFO, port->state);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Tell RTNLGRP_LINK listeners that something about @port changed.
+ *
+ * Builds one message with br_fill_ifinfo() and sends it with
+ * rtnl_notify().  If the message cannot be allocated (-ENOBUFS) or filled,
+ * the error is recorded on the group with rtnl_set_sk_err() instead.
+ */
+void br_ifinfo_notify(int event, struct net_bridge_port *port)
+{
+	struct net *net = dev_net(port->dev);
+	struct sk_buff *msg;
+	int rc;
+
+	br_debug(port->br, "port %u(%s) event %d\n",
+		 (unsigned)port->port_no, port->dev->name, event);
+
+	msg = nlmsg_new(br_nlmsg_size(), GFP_ATOMIC);
+	if (!msg) {
+		rtnl_set_sk_err(net, RTNLGRP_LINK, -ENOBUFS);
+		return;
+	}
+
+	rc = br_fill_ifinfo(msg, port, 0, 0, event, 0);
+	if (rc >= 0) {
+		rtnl_notify(msg, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in br_nlmsg_size() */
+	WARN_ON(rc == -EMSGSIZE);
+	kfree_skb(msg);
+	rtnl_set_sk_err(net, RTNLGRP_LINK, rc);
+}
+
+/*
+ * GETLINK dump: emit one RTM_NEWLINK message per bridge port in the
+ * caller's network namespace.
+ *
+ * cb->args[0] holds the index of the first device still to be examined,
+ * so a dump interrupted by a full buffer resumes where it stopped.
+ * Returns skb->len.
+ */
+static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	int idx = 0;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		struct net_bridge_port *port = br_port_get_rcu(dev);
+
+		/* Only bridge ports at or past the resume point are dumped;
+		 * stop as soon as one no longer fits. */
+		if (port && idx >= cb->args[0] &&
+		    br_fill_ifinfo(skb, port,
+				   NETLINK_CB(cb->skb).pid,
+				   cb->nlh->nlmsg_seq, RTM_NEWLINK,
+				   NLM_F_MULTI) < 0)
+			break;
+
+		++idx;
+	}
+	rcu_read_unlock();
+
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+/*
+ * Change state of port (ie from forwarding to blocking etc)
+ * Used by spanning tree in user space.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL       message too short, IFLA_PROTINFO missing/too short,
+ *                 state above BR_STATE_BLOCKING, or device is not a port
+ *   -EPFNOSUPPORT ifi_family is not AF_BRIDGE
+ *   -ENODEV       no device with the given ifindex
+ *   -EBUSY        the bridge runs kernel STP
+ *   -ENETDOWN     device not running, or no carrier while asking for a
+ *                 state other than BR_STATE_DISABLED
+ */
+static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ifinfomsg *ifm;
+	struct nlattr *protinfo;
+	struct net_device *dev;
+	struct net_bridge_port *p;
+	u8 new_state;
+
+	if (nlmsg_len(nlh) < sizeof(*ifm))
+		return -EINVAL;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_family != AF_BRIDGE)
+		return -EPFNOSUPPORT;
+
+	/* The requested state travels as a u8 in IFLA_PROTINFO. */
+	protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO);
+	if (!protinfo || nla_len(protinfo) < sizeof(u8))
+		return -EINVAL;
+
+	new_state = nla_get_u8(protinfo);
+	if (new_state > BR_STATE_BLOCKING)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, ifm->ifi_index);
+	if (!dev)
+		return -ENODEV;
+
+	p = br_port_get_rtnl(dev);
+	if (!p)
+		return -EINVAL;
+
+	/* if kernel STP is running, don't allow changes */
+	if (p->br->stp_enabled == BR_KERNEL_STP)
+		return -EBUSY;
+
+	/* Without carrier only BR_STATE_DISABLED may be requested. */
+	if (!netif_running(dev) ||
+	    (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED))
+		return -ENETDOWN;
+
+	p->state = new_state;
+	br_log_state(p);
+	return 0;
+}
+
+/* rtnl_link_ops validate callback: an IFLA_ADDRESS attribute, if present,
+ * must be ETH_ALEN bytes long (-EINVAL otherwise) and a valid Ethernet
+ * address (-EADDRNOTAVAIL otherwise).  Returns 0 when acceptable. */
+static int br_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	const struct nlattr *addr = tb[IFLA_ADDRESS];
+
+	if (!addr)
+		return 0;
+
+	if (nla_len(addr) != ETH_ALEN)
+		return -EINVAL;
+
+	if (!is_valid_ether_addr(nla_data(addr)))
+		return -EADDRNOTAVAIL;
+
+	return 0;
+}
+
+/* rtnetlink link type "bridge": private area sized for struct net_bridge,
+ * set up by br_dev_setup(), validated by br_validate() and deleted through
+ * br_dev_delete().  Registered in br_netlink_init(). */
+static struct rtnl_link_ops br_link_ops __read_mostly = {
+	.kind		= "bridge",
+	.priv_size	= sizeof(struct net_bridge),
+	.setup		= br_dev_setup,
+	.validate	= br_validate,
+	.dellink	= br_dev_delete,
+};
+
+/* Register the "bridge" link type and the PF_BRIDGE rtnetlink handlers
+ * (GETLINK dump, SETLINK, and NEWNEIGH/DELNEIGH/GETNEIGH for the fdb).
+ * Returns 0 on success; on failure everything registered so far is
+ * unregistered again and the error is returned. */
+int __init br_netlink_init(void)
+{
+	int err;
+
+	err = rtnl_link_register(&br_link_ops);
+	if (err < 0)
+		goto err1;
+
+	/* No PF_BRIDGE handler exists yet if this one fails, hence err2. */
+	err = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, br_dump_ifinfo);
+	if (err)
+		goto err2;
+	err = __rtnl_register(PF_BRIDGE, RTM_SETLINK, br_rtm_setlink, NULL);
+	if (err)
+		goto err3;
+	err = __rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, br_fdb_add, NULL);
+	if (err)
+		goto err3;
+	err = __rtnl_register(PF_BRIDGE, RTM_DELNEIGH, br_fdb_delete, NULL);
+	if (err)
+		goto err3;
+	err = __rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, br_fdb_dump);
+	if (err)
+		goto err3;
+
+	return 0;
+
+err3:
+	rtnl_unregister_all(PF_BRIDGE);
+err2:
+	rtnl_link_unregister(&br_link_ops);
+err1:
+	return err;
+}
+
+/* Undo br_netlink_init(): unregister the "bridge" link type and every
+ * PF_BRIDGE rtnetlink handler. */
+void __exit br_netlink_fini(void)
+{
+	rtnl_link_unregister(&br_link_ops);
+	rtnl_unregister_all(PF_BRIDGE);
+}
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
new file mode 100644
index 00000000..6545ee95
--- /dev/null
+++ b/net/bridge/br_notify.c
@@ -0,0 +1,108 @@
+/*
+ *	Device event handling
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+
+#include "br_private.h"
+
+static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr);
+
+/* Notifier block whose callback is br_device_event(). */
+struct notifier_block br_device_notifier = {
+	.notifier_call = br_device_event
+};
+
+/*
+ * Handle changes in state of network devices enslaved to a bridge.
+ *
+ * Note: don't care about up/down if bridge itself is down, because
+ *     port state is checked when bridge is brought up.
+ *
+ * @ptr is the struct net_device the event is about.  Returns NOTIFY_DONE,
+ * except that a failed sysfs rename is reported through
+ * notifier_from_errno() and NETDEV_PRE_TYPE_CHANGE on a port is vetoed
+ * with NOTIFY_BAD.
+ */
+static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	int err;
+
+	/* register of bridge completed, add sysfs entries */
+	if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) {
+		br_sysfs_addbr(dev);
+		return NOTIFY_DONE;
+	}
+
+	/* not a port of a bridge */
+	p = br_port_get_rtnl(dev);
+	if (!p)
+		return NOTIFY_DONE;
+
+	br = p->br;
+
+	switch (event) {
+	case NETDEV_CHANGEMTU:
+		/* Set the bridge MTU to what br_min_mtu() computes. */
+		dev_set_mtu(br->dev, br_min_mtu(br));
+		break;
+
+	case NETDEV_CHANGEADDR:
+		/* Update the fdb and the bridge id under the bridge lock. */
+		spin_lock_bh(&br->lock);
+		br_fdb_changeaddr(p, dev->dev_addr);
+		br_stp_recalculate_bridge_id(br);
+		spin_unlock_bh(&br->lock);
+		break;
+
+	case NETDEV_CHANGE:
+		br_port_carrier_check(p);
+		break;
+
+	case NETDEV_FEAT_CHANGE:
+		netdev_update_features(br->dev);
+		break;
+
+	case NETDEV_DOWN:
+		/* Only matters while the bridge itself is up. */
+		spin_lock_bh(&br->lock);
+		if (br->dev->flags & IFF_UP)
+			br_stp_disable_port(p);
+		spin_unlock_bh(&br->lock);
+		break;
+
+	case NETDEV_UP:
+		/* Enable the port only with carrier and the bridge up. */
+		if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) {
+			spin_lock_bh(&br->lock);
+			br_stp_enable_port(p);
+			spin_unlock_bh(&br->lock);
+		}
+		break;
+
+	case NETDEV_UNREGISTER:
+		/* The device is going away: remove it from the bridge. */
+		br_del_if(br, dev);
+		break;
+
+	case NETDEV_CHANGENAME:
+		err = br_sysfs_renameif(p);
+		if (err)
+			return notifier_from_errno(err);
+		break;
+
+	case NETDEV_PRE_TYPE_CHANGE:
+		/* Forbid underlying device to change its type. */
+		return NOTIFY_BAD;
+	}
+
+	/* Events that may cause spanning tree to refresh */
+	if (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+	    event == NETDEV_CHANGE || event == NETDEV_DOWN)
+		br_ifinfo_notify(RTM_NEWLINK, p);
+
+	return NOTIFY_DONE;
+}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
new file mode 100644
index 00000000..1ca1b1c7
--- /dev/null
+++ b/net/bridge/br_private.h
@@ -0,0 +1,554 @@
+/*
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_H
+#define _BR_PRIVATE_H
+
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/netpoll.h>
+#include <linux/u64_stats_sync.h>
+#include <net/route.h>
+
+/* struct net_bridge's hash[] array has BR_HASH_SIZE (2^BR_HASH_BITS)
+ * buckets. */
+#define BR_HASH_BITS 8
+#define BR_HASH_SIZE (1 << BR_HASH_BITS)
+
+/* One second's worth of ticks. */
+#define BR_HOLD_TIME (1*HZ)
+
+/* BR_MAX_PORTS is 2^BR_PORT_BITS. */
+#define BR_PORT_BITS	10
+#define BR_MAX_PORTS	(1<<BR_PORT_BITS)
+
+/* Version string of the bridge code. */
+#define BR_VERSION	"2.3"
+
+/* Path to usermode spanning tree program */
+#define BR_STP_PROG	"/sbin/bridge-stp"
+
+typedef struct bridge_id bridge_id;
+typedef struct mac_addr mac_addr;
+typedef __u16 port_id;
+
+/* Eight-byte identifier: two priority bytes followed by a six-byte
+ * address.  br_is_root_bridge() compares two of these with memcmp(). */
+struct bridge_id
+{
+	unsigned char	prio[2];
+	unsigned char	addr[6];
+};
+
+/* Six-byte address wrapped in a struct; used as the addr field of
+ * struct net_bridge_fdb_entry. */
+struct mac_addr
+{
+	unsigned char	addr[6];
+};
+
+/* An IPv4 address or, when IPv6 is configured, an IPv6 address.
+ * NOTE(review): proto presumably selects the valid union member - confirm
+ * against the users of this struct. */
+struct br_ip
+{
+	union {
+		__be32	ip4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		struct in6_addr ip6;
+#endif
+	} u;
+	__be16		proto;
+};
+
+/* One forwarding database entry: an address (addr) associated with a
+ * bridge port (dst).  The hlist node and rcu head suggest the entries sit
+ * in hash chains and are freed via RCU - presumably the chains of
+ * net_bridge.hash; confirm in br_fdb.c. */
+struct net_bridge_fdb_entry
+{
+	struct hlist_node		hlist;
+	struct net_bridge_port		*dst;
+
+	struct rcu_head			rcu;
+	unsigned long			updated;
+	unsigned long			used;
+	mac_addr			addr;
+	unsigned char			is_local;
+	unsigned char			is_static;
+};
+
+/* Pairs a bridge port with a group address (addr).  Entries are chained
+ * through the RCU-annotated next pointer; struct net_bridge_mdb_entry's
+ * ports field has the same pointer type. */
+struct net_bridge_port_group {
+	struct net_bridge_port		*port;
+	struct net_bridge_port_group __rcu *next;
+	struct hlist_node		mglist;
+	struct rcu_head			rcu;
+	struct timer_list		timer;
+	struct br_ip			addr;
+};
+
+/* One multicast database entry: a group address (addr), the bridge it
+ * belongs to and an RCU-annotated pointer to the first
+ * net_bridge_port_group (ports).  It carries two hlist nodes. */
+struct net_bridge_mdb_entry
+{
+	struct hlist_node		hlist[2];
+	struct net_bridge		*br;
+	struct net_bridge_port_group __rcu *ports;
+	struct rcu_head			rcu;
+	struct timer_list		timer;
+	struct br_ip			addr;
+	bool				mglist;
+};
+
+/* Hash table for the multicast database: mhash points at the bucket
+ * heads, old at another table of the same type.  net_bridge.mdb is an
+ * RCU-annotated pointer to one of these. */
+struct net_bridge_mdb_htable
+{
+	struct hlist_head		*mhash;
+	struct rcu_head			rcu;
+	struct net_bridge_mdb_htable	*old;
+	u32				size;
+	u32				max;
+	u32				secret;
+	u32				ver;
+};
+
+/* Per-port state of a device enslaved to a bridge.  br_port_get_rcu() /
+ * br_port_get_rtnl() obtain it from the device's rx_handler_data. */
+struct net_bridge_port
+{
+	struct net_bridge		*br;	/* bridge this port belongs to */
+	struct net_device		*dev;	/* the enslaved device */
+	struct list_head		list;
+
+	/* STP */
+	u8				priority;
+	u8				state;	/* BR_STATE_xxx; sent as IFLA_PROTINFO */
+	u16				port_no;
+	unsigned char			topology_change_ack;
+	unsigned char			config_pending;
+	port_id				port_id;
+	port_id				designated_port;
+	bridge_id			designated_root;
+	bridge_id			designated_bridge;
+	u32				path_cost;
+	u32				designated_cost;
+	unsigned long			designated_age;
+
+	struct timer_list		forward_delay_timer;
+	struct timer_list		hold_timer;
+	struct timer_list		message_age_timer;
+	struct kobject			kobj;
+	struct rcu_head			rcu;
+
+	unsigned long 			flags;
+#define BR_HAIRPIN_MODE		0x00000001
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	u32				multicast_startup_queries_sent;
+	unsigned char			multicast_router;
+	struct timer_list		multicast_router_timer;
+	struct timer_list		multicast_query_timer;
+	struct hlist_head		mglist;
+	struct hlist_node		rlist;
+#endif
+
+#ifdef CONFIG_SYSFS
+	char				sysfs_name[IFNAMSIZ];
+#endif
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct netpoll			*np;	/* used by br_netpoll_send_skb() */
+#endif
+};
+
+/* Non-zero if @dev has IFF_BRIDGE_PORT set in priv_flags.  The argument is
+ * parenthesised so that any pointer expression may be passed. */
+#define br_port_exists(dev) ((dev)->priv_flags & IFF_BRIDGE_PORT)
+
+/* Port data of @dev, fetched with rcu_dereference(); NULL if @dev is not
+ * flagged as a bridge port. */
+static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev)
+{
+	struct net_bridge_port *p = rcu_dereference(dev->rx_handler_data);
+
+	if (!br_port_exists(dev))
+		return NULL;
+
+	return p;
+}
+
+/* Port data of @dev, fetched with rtnl_dereference(); NULL if @dev is not
+ * flagged as a bridge port. */
+static inline struct net_bridge_port *br_port_get_rtnl(struct net_device *dev)
+{
+	if (!br_port_exists(dev))
+		return NULL;
+
+	return rtnl_dereference(dev->rx_handler_data);
+}
+
+/* 64-bit rx/tx packet and byte counters with their u64_stats_sync;
+ * struct net_bridge keeps a per-cpu pointer to these (stats). */
+struct br_cpu_netstats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Per-bridge state.  br_link_ops sizes the bridge device's private area
+ * for this structure. */
+struct net_bridge
+{
+	spinlock_t			lock;	/* taken with spin_lock_bh() around STP/fdb updates */
+	struct list_head		port_list;
+	struct net_device		*dev;	/* the bridge's own net_device */
+
+	struct br_cpu_netstats __percpu *stats;
+	spinlock_t			hash_lock;
+	struct hlist_head		hash[BR_HASH_SIZE];
+#ifdef CONFIG_BRIDGE_NETFILTER
+	struct rtable 			fake_rtable;
+	/* Per-bridge switches, checked together with the global
+	 * brnf_call_* ones by the bridge netfilter hooks. */
+	bool				nf_call_iptables;
+	bool				nf_call_ip6tables;
+	bool				nf_call_arptables;
+#endif
+	unsigned long			flags;
+#define BR_SET_MAC_ADDR		0x00000001
+
+	/* STP */
+	bridge_id			designated_root;
+	bridge_id			bridge_id;
+	u32				root_path_cost;
+	unsigned long			max_age;
+	unsigned long			hello_time;
+	unsigned long			forward_delay;
+	unsigned long			bridge_max_age;
+	unsigned long			ageing_time;
+	unsigned long			bridge_hello_time;
+	unsigned long			bridge_forward_delay;
+
+	u8				group_addr[ETH_ALEN];
+	u16				root_port;
+
+	/* br_rtm_setlink() refuses port state changes with -EBUSY while
+	 * this is BR_KERNEL_STP. */
+	enum {
+		BR_NO_STP, 		/* no spanning tree */
+		BR_KERNEL_STP,		/* old STP in kernel */
+		BR_USER_STP,		/* new RSTP in userspace */
+	} stp_enabled;
+
+	unsigned char			topology_change;
+	unsigned char			topology_change_detected;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	unsigned char			multicast_router;
+
+	u8				multicast_disabled:1;
+
+	u32				hash_elasticity;
+	u32				hash_max;
+
+	u32				multicast_last_member_count;
+	u32				multicast_startup_queries_sent;
+	u32				multicast_startup_query_count;
+
+	unsigned long			multicast_last_member_interval;
+	unsigned long			multicast_membership_interval;
+	unsigned long			multicast_querier_interval;
+	unsigned long			multicast_query_interval;
+	unsigned long			multicast_query_response_interval;
+	unsigned long			multicast_startup_query_interval;
+
+	spinlock_t			multicast_lock;
+	struct net_bridge_mdb_htable __rcu *mdb;
+	struct hlist_head		router_list;
+
+	struct timer_list		multicast_router_timer;
+	struct timer_list		multicast_querier_timer;
+	struct timer_list		multicast_query_timer;
+#endif
+
+	struct timer_list		hello_timer;
+	struct timer_list		tcn_timer;
+	struct timer_list		topology_change_timer;
+	struct timer_list		gc_timer;
+	struct kobject			*ifobj;
+};
+
+/* Layout the bridge input path gives to the skb control buffer; accessed
+ * through BR_INPUT_SKB_CB(). */
+struct br_input_skb_cb {
+	struct net_device *brdev;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+	int igmp;
+	int mrouters_only;
+#endif
+};
+
+/* View an skb's control buffer (cb) as struct br_input_skb_cb. */
+#define BR_INPUT_SKB_CB(__skb)	((struct br_input_skb_cb *)(__skb)->cb)
+
+/* The mrouters_only member of the control buffer; a constant 0 when IGMP
+ * snooping is not configured (the member does not exist then). */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)	(BR_INPUT_SKB_CB(__skb)->mrouters_only)
+#else
+# define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb)	(0)
+#endif
+
+/* printk() wrappers that prefix every message with the bridge device's
+ * name ("<name>: ").  br_err/br_warn/br_notice/br_info select the log
+ * level; br_debug goes through pr_debug() instead. */
+#define br_printk(level, br, format, args...)	\
+	printk(level "%s: " format, (br)->dev->name, ##args)
+
+#define br_err(__br, format, args...)			\
+	br_printk(KERN_ERR, __br, format, ##args)
+#define br_warn(__br, format, args...)			\
+	br_printk(KERN_WARNING, __br, format, ##args)
+#define br_notice(__br, format, args...)		\
+	br_printk(KERN_NOTICE, __br, format, ##args)
+#define br_info(__br, format, args...)			\
+	br_printk(KERN_INFO, __br, format, ##args)
+
+#define br_debug(br, format, args...)			\
+	pr_debug("%s: " format,  (br)->dev->name, ##args)
+
+extern struct notifier_block br_device_notifier;
+extern const u8 br_group_address[ETH_ALEN];
+
+/*
+ * Report whether @br currently considers itself the root bridge, i.e.
+ * whether its own bridge id is byte-for-byte equal to the designated
+ * root id it has recorded.  Returns non-zero when it is.
+ *
+ * called under bridge lock
+ */
+static inline int br_is_root_bridge(const struct net_bridge *br)
+{
+	return memcmp(&br->bridge_id, &br->designated_root, 8) == 0;
+}
+
+/* br_device.c */
+extern void br_dev_setup(struct net_device *dev);
+extern void br_dev_delete(struct net_device *dev, struct list_head *list);
+extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
+			       struct net_device *dev);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Return the netpoll_info attached to the bridge's own net_device. */
+static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
+{
+	return br->dev->npinfo;
+}
+
+/*
+ * Pass @skb to netpoll for transmission when port @p has a netpoll
+ * instance attached; otherwise do nothing.
+ */
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+				       struct sk_buff *skb)
+{
+	struct netpoll *port_np = p->np;
+
+	if (!port_np)
+		return;
+
+	netpoll_send_skb(port_np, skb);
+}
+
+extern int br_netpoll_enable(struct net_bridge_port *p);
+extern void br_netpoll_disable(struct net_bridge_port *p);
+#else
+/*
+ * Stubs used when CONFIG_NET_POLL_CONTROLLER is not set: there is no
+ * netpoll info, sending and disabling are no-ops, and enabling reports
+ * success (0).
+ */
+static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
+{
+	return NULL;
+}
+
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+				       struct sk_buff *skb)
+{
+}
+
+static inline int br_netpoll_enable(struct net_bridge_port *p)
+{
+	return 0;
+}
+
+static inline void br_netpoll_disable(struct net_bridge_port *p)
+{
+}
+#endif
+
+/* br_fdb.c */
+extern int br_fdb_init(void);
+extern void br_fdb_fini(void);
+extern void br_fdb_flush(struct net_bridge *br);
+extern void br_fdb_changeaddr(struct net_bridge_port *p,
+			      const unsigned char *newaddr);
+extern void br_fdb_cleanup(unsigned long arg);
+extern void br_fdb_delete_by_port(struct net_bridge *br,
+				  const struct net_bridge_port *p, int do_all);
+extern struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
+						 const unsigned char *addr);
+extern int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
+extern int br_fdb_fillbuf(struct net_bridge *br, void *buf,
+			  unsigned long count, unsigned long off);
+extern int br_fdb_insert(struct net_bridge *br,
+			 struct net_bridge_port *source,
+			 const unsigned char *addr);
+extern void br_fdb_update(struct net_bridge *br,
+			  struct net_bridge_port *source,
+			  const unsigned char *addr);
+extern int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb);
+extern int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg);
+extern int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg);
+
+/* br_forward.c */
+extern void br_deliver(const struct net_bridge_port *to,
+		struct sk_buff *skb);
+extern int br_dev_queue_push_xmit(struct sk_buff *skb);
+extern void br_forward(const struct net_bridge_port *to,
+		struct sk_buff *skb, struct sk_buff *skb0);
+extern int br_forward_finish(struct sk_buff *skb);
+extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
+extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
+			     struct sk_buff *skb2);
+
+/* br_if.c */
+extern void br_port_carrier_check(struct net_bridge_port *p);
+extern int br_add_bridge(struct net *net, const char *name);
+extern int br_del_bridge(struct net *net, const char *name);
+extern void br_net_exit(struct net *net);
+extern int br_add_if(struct net_bridge *br,
+	      struct net_device *dev);
+extern int br_del_if(struct net_bridge *br,
+	      struct net_device *dev);
+extern int br_min_mtu(const struct net_bridge *br);
+extern u32 br_features_recompute(struct net_bridge *br, u32 features);
+
+/* br_input.c */
+extern int br_handle_frame_finish(struct sk_buff *skb);
+extern rx_handler_result_t br_handle_frame(struct sk_buff **pskb);
+
+/* br_ioctl.c */
+extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+extern int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *arg);
+
+/* br_multicast.c */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+extern int br_multicast_rcv(struct net_bridge *br,
+			    struct net_bridge_port *port,
+			    struct sk_buff *skb);
+extern struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
+					       struct sk_buff *skb);
+extern void br_multicast_add_port(struct net_bridge_port *port);
+extern void br_multicast_del_port(struct net_bridge_port *port);
+extern void br_multicast_enable_port(struct net_bridge_port *port);
+extern void br_multicast_disable_port(struct net_bridge_port *port);
+extern void br_multicast_init(struct net_bridge *br);
+extern void br_multicast_open(struct net_bridge *br);
+extern void br_multicast_stop(struct net_bridge *br);
+extern void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
+				 struct sk_buff *skb);
+extern void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
+				 struct sk_buff *skb, struct sk_buff *skb2);
+extern int br_multicast_set_router(struct net_bridge *br, unsigned long val);
+extern int br_multicast_set_port_router(struct net_bridge_port *p,
+					unsigned long val);
+extern int br_multicast_toggle(struct net_bridge *br, unsigned long val);
+extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+
+/*
+ * Tell whether the bridge itself counts as a multicast router:
+ * multicast_router == 2 always does, multicast_router == 1 does only
+ * while the multicast router timer is pending, anything else does not.
+ */
+static inline bool br_multicast_is_router(struct net_bridge *br)
+{
+	switch (br->multicast_router) {
+	case 2:
+		return true;
+	case 1:
+		return timer_pending(&br->multicast_router_timer);
+	default:
+		return false;
+	}
+}
+#else
+/*
+ * Stubs used when CONFIG_BRIDGE_IGMP_SNOOPING is not set.  Every hook is
+ * a no-op: br_multicast_rcv() returns 0, br_mdb_get() never finds an
+ * entry (NULL) and br_multicast_is_router() is always false.
+ */
+static inline int br_multicast_rcv(struct net_bridge *br,
+				   struct net_bridge_port *port,
+				   struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
+						      struct sk_buff *skb)
+{
+	return NULL;
+}
+
+static inline void br_multicast_add_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_del_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_enable_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_disable_port(struct net_bridge_port *port)
+{
+}
+
+static inline void br_multicast_init(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_open(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_stop(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst,
+					struct sk_buff *skb)
+{
+}
+
+static inline void br_multicast_forward(struct net_bridge_mdb_entry *mdst,
+					struct sk_buff *skb,
+					struct sk_buff *skb2)
+{
+}
+static inline bool br_multicast_is_router(struct net_bridge *br)
+{
+	return 0;
+}
+#endif
+
+/* br_netfilter.c */
+#ifdef CONFIG_BRIDGE_NETFILTER
+extern int br_netfilter_init(void);
+extern void br_netfilter_fini(void);
+extern void br_netfilter_rtable_init(struct net_bridge *);
+#else
+/*
+ * Fallbacks without CONFIG_BRIDGE_NETFILTER: init evaluates to 0, fini
+ * is an empty statement and rtable_init expands to nothing at all.
+ */
+#define br_netfilter_init()	(0)
+#define br_netfilter_fini()	do { } while(0)
+#define br_netfilter_rtable_init(x)
+#endif
+
+/* br_stp.c */
+extern void br_log_state(const struct net_bridge_port *p);
+extern struct net_bridge_port *br_get_port(struct net_bridge *br,
+					   u16 port_no);
+extern void br_init_port(struct net_bridge_port *p);
+extern void br_become_designated_port(struct net_bridge_port *p);
+
+extern int br_set_forward_delay(struct net_bridge *br, unsigned long x);
+extern int br_set_hello_time(struct net_bridge *br, unsigned long x);
+extern int br_set_max_age(struct net_bridge *br, unsigned long x);
+
+
+/* br_stp_if.c */
+extern void br_stp_enable_bridge(struct net_bridge *br);
+extern void br_stp_disable_bridge(struct net_bridge *br);
+extern void br_stp_set_enabled(struct net_bridge *br, unsigned long val);
+extern void br_stp_enable_port(struct net_bridge_port *p);
+extern void br_stp_disable_port(struct net_bridge_port *p);
+extern bool br_stp_recalculate_bridge_id(struct net_bridge *br);
+extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
+extern void br_stp_set_bridge_priority(struct net_bridge *br,
+				       u16 newprio);
+extern int br_stp_set_port_priority(struct net_bridge_port *p,
+				    unsigned long newprio);
+extern int br_stp_set_path_cost(struct net_bridge_port *p,
+				unsigned long path_cost);
+extern ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id);
+
+/* br_stp_bpdu.c */
+struct stp_proto;
+extern void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+		       struct net_device *dev);
+
+/* br_stp_timer.c */
+extern void br_stp_timer_init(struct net_bridge *br);
+extern void br_stp_port_timer_init(struct net_bridge_port *p);
+extern unsigned long br_timer_value(const struct timer_list *timer);
+
+/* br.c */
+#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr);
+#endif
+
+/* br_netlink.c */
+extern int br_netlink_init(void);
+extern void br_netlink_fini(void);
+extern void br_ifinfo_notify(int event, struct net_bridge_port *port);
+
+#ifdef CONFIG_SYSFS
+/* br_sysfs_if.c */
+extern const struct sysfs_ops brport_sysfs_ops;
+extern int br_sysfs_addif(struct net_bridge_port *p);
+extern int br_sysfs_renameif(struct net_bridge_port *p);
+
+/* br_sysfs_br.c */
+extern int br_sysfs_addbr(struct net_device *dev);
+extern void br_sysfs_delbr(struct net_device *dev);
+
+#else
+
+/*
+ * Fallbacks without CONFIG_SYSFS: the add/rename hooks evaluate to 0 and
+ * br_sysfs_delbr() is an empty statement.
+ */
+#define br_sysfs_addif(p)	(0)
+#define br_sysfs_renameif(p)	(0)
+#define br_sysfs_addbr(dev)	(0)
+#define br_sysfs_delbr(dev)	do { } while(0)
+#endif /* CONFIG_SYSFS */
+
+#endif
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
new file mode 100644
index 00000000..642ef47a
--- /dev/null
+++ b/net/bridge/br_private_stp.h
@@ -0,0 +1,69 @@
+/*
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _BR_PRIVATE_STP_H
+#define _BR_PRIVATE_STP_H
+
+/* BPDU type byte, found right after the protocol id and version bytes */
+#define BPDU_TYPE_CONFIG 0
+#define BPDU_TYPE_TCN 0x80
+
+/* IEEE 802.1D-1998 timer values */
+/* all limits below are expressed in jiffies (multiples of HZ) */
+#define BR_MIN_HELLO_TIME	(1*HZ)
+#define BR_MAX_HELLO_TIME	(10*HZ)
+
+#define BR_MIN_FORWARD_DELAY	(2*HZ)
+#define BR_MAX_FORWARD_DELAY	(30*HZ)
+
+#define BR_MIN_MAX_AGE		(6*HZ)
+#define BR_MAX_MAX_AGE		(40*HZ)
+
+/* NOTE(review): presumably the accepted range for a port's path cost - confirm against the users */
+#define BR_MIN_PATH_COST	1
+#define BR_MAX_PATH_COST	65535
+
+/*
+ * Decoded, in-kernel form of a configuration BPDU.  The four time fields
+ * are held in jiffies; conversion to and from the 1/256 s wire units is
+ * done when the BPDU is serialised or parsed.
+ */
+struct br_config_bpdu
+{
+	unsigned	topology_change:1;	/* flag bit 0x01 on the wire */
+	unsigned	topology_change_ack:1;	/* flag bit 0x80 on the wire */
+	bridge_id	root;
+	int		root_path_cost;
+	bridge_id	bridge_id;
+	port_id		port_id;
+	int		message_age;
+	int		max_age;
+	int		hello_time;
+	int		forward_delay;
+};
+
+/*
+ * Report whether @p is the designated port for its segment: the
+ * designated port recorded on it must be the port's own id and the
+ * designated bridge must be the bridge the port belongs to.
+ *
+ * called under bridge lock
+ */
+static inline int br_is_designated_port(const struct net_bridge_port *p)
+{
+	if (p->designated_port != p->port_id)
+		return 0;
+
+	return memcmp(&p->designated_bridge, &p->br->bridge_id, 8) == 0;
+}
+
+
+/* br_stp.c */
+extern void br_become_root_bridge(struct net_bridge *br);
+extern void br_config_bpdu_generation(struct net_bridge *);
+extern void br_configuration_update(struct net_bridge *);
+extern void br_port_state_selection(struct net_bridge *);
+extern void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu);
+extern void br_received_tcn_bpdu(struct net_bridge_port *p);
+extern void br_transmit_config(struct net_bridge_port *p);
+extern void br_transmit_tcn(struct net_bridge *br);
+extern void br_topology_change_detection(struct net_bridge *br);
+
+/* br_stp_bpdu.c */
+extern void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
+extern void br_send_tcn_bpdu(struct net_bridge_port *);
+
+#endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
new file mode 100644
index 00000000..fcff6225
--- /dev/null
+++ b/net/bridge/br_stp.c
@@ -0,0 +1,534 @@
+/*
+ *	Spanning tree protocol; generic parts
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+/* Time values in a bpdu are kept in jiffies and scaled to units of
+ * 1/256 second before sending, so make sure the message age increment
+ * is never less than one jiffy.
+ */
+#define MESSAGE_AGE_INCR	((HZ < 256) ? 1 : (HZ/256))
+
+/* Human-readable name for each BR_STATE_* value, indexed by port state. */
+static const char *const br_port_state_names[] = {
+	[BR_STATE_DISABLED] = "disabled",
+	[BR_STATE_LISTENING] = "listening",
+	[BR_STATE_LEARNING] = "learning",
+	[BR_STATE_FORWARDING] = "forwarding",
+	[BR_STATE_BLOCKING] = "blocking",
+};
+
+/* Log, at info level, the STP state that port @p is now in. */
+void br_log_state(const struct net_bridge_port *p)
+{
+	const char *state_name = br_port_state_names[p->state];
+
+	br_info(p->br, "port %u(%s) entering %s state\n",
+		(unsigned) p->port_no, p->dev->name, state_name);
+}
+
+/*
+ * Look up the port of @br whose port number is @port_no.
+ * Returns the first matching port on the bridge's port list, or NULL
+ * when no port carries that number.
+ *
+ * called under bridge lock
+ */
+struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
+{
+	struct net_bridge_port *match = NULL;
+	struct net_bridge_port *cur;
+
+	list_for_each_entry_rcu(cur, &br->port_list, list) {
+		if (cur->port_no != port_no)
+			continue;
+
+		match = cur;
+		break;
+	}
+
+	return match;
+}
+
+/*
+ * Decide whether port @p is a better root port candidate than the port
+ * numbered @root_port (0 meaning that there is no candidate yet).
+ * Returns 1 if @p should take over, 0 otherwise.
+ *
+ * called under bridge lock
+ */
+static int br_should_become_root_port(const struct net_bridge_port *p,
+				      u16 root_port)
+{
+	struct net_bridge *br = p->br;
+	struct net_bridge_port *rp;
+	int t;
+
+	/* disabled and designated ports never qualify */
+	if (p->state == BR_STATE_DISABLED)
+		return 0;
+	if (br_is_designated_port(p))
+		return 0;
+
+	/* the root recorded on the port must compare below our own id */
+	if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0)
+		return 0;
+
+	/* nothing to compare against: first eligible port wins */
+	if (!root_port)
+		return 1;
+
+	rp = br_get_port(br, root_port);
+
+	/* tie-break in order: root id, total cost, bridge id, port ids */
+	t = memcmp(&p->designated_root, &rp->designated_root, 8);
+	if (t)
+		return t < 0;
+
+	if (p->designated_cost + p->path_cost !=
+	    rp->designated_cost + rp->path_cost)
+		return p->designated_cost + p->path_cost <
+		       rp->designated_cost + rp->path_cost;
+
+	t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8);
+	if (t)
+		return t < 0;
+
+	if (p->designated_port != rp->designated_port)
+		return p->designated_port < rp->designated_port;
+
+	return p->port_id < rp->port_id;
+}
+
+/*
+ * Pick the root port of @br and record the resulting designated root
+ * and root path cost.  When no port qualifies, the bridge records
+ * itself as root with a path cost of 0.
+ *
+ * called under bridge lock
+ */
+static void br_root_selection(struct net_bridge *br)
+{
+	struct net_bridge_port *port;
+	u16 best = 0;
+
+	list_for_each_entry(port, &br->port_list, list)
+		if (br_should_become_root_port(port, best))
+			best = port->port_no;
+
+	br->root_port = best;
+
+	if (best) {
+		port = br_get_port(br, best);
+		br->designated_root = port->designated_root;
+		br->root_path_cost = port->designated_cost + port->path_cost;
+		return;
+	}
+
+	br->designated_root = br->bridge_id;
+	br->root_path_cost = 0;
+}
+
+/*
+ * Switch @br over to acting as root bridge: adopt the locally configured
+ * timer values, run topology change detection and stop the tcn timer.
+ * If the bridge device is up, config BPDUs are generated right away and
+ * the hello timer is (re)armed.
+ *
+ * called under bridge lock
+ */
+void br_become_root_bridge(struct net_bridge *br)
+{
+	/* the bridge's own configured values become the active ones */
+	br->max_age = br->bridge_max_age;
+	br->hello_time = br->bridge_hello_time;
+	br->forward_delay = br->bridge_forward_delay;
+	br_topology_change_detection(br);
+	del_timer(&br->tcn_timer);
+
+	if (br->dev->flags & IFF_UP) {
+		br_config_bpdu_generation(br);
+		mod_timer(&br->hello_timer, jiffies + br->hello_time);
+	}
+}
+
+/*
+ * Build a configuration BPDU from the current bridge and port state and
+ * send it on port @p.
+ *
+ * While the port's hold timer is pending nothing is sent; the port is
+ * only marked config_pending.  A BPDU whose message age has reached the
+ * bridge's max_age is not sent either.
+ *
+ * called under bridge lock
+ */
+void br_transmit_config(struct net_bridge_port *p)
+{
+	struct br_config_bpdu bpdu;
+	struct net_bridge *br;
+
+
+	if (timer_pending(&p->hold_timer)) {
+		p->config_pending = 1;
+		return;
+	}
+
+	br = p->br;
+
+	bpdu.topology_change = br->topology_change;
+	bpdu.topology_change_ack = p->topology_change_ack;
+	bpdu.root = br->designated_root;
+	bpdu.root_path_cost = br->root_path_cost;
+	bpdu.bridge_id = br->bridge_id;
+	bpdu.port_id = p->port_id;
+	if (br_is_root_bridge(br))
+		bpdu.message_age = 0;
+	else {
+		/* NOTE(review): the lookup result is used without a NULL check */
+		struct net_bridge_port *root
+			= br_get_port(br, br->root_port);
+		/* derived from the root port's designated_age timestamp */
+		bpdu.message_age = (jiffies - root->designated_age)
+			+ MESSAGE_AGE_INCR;
+	}
+	bpdu.max_age = br->max_age;
+	bpdu.hello_time = br->hello_time;
+	bpdu.forward_delay = br->forward_delay;
+
+	if (bpdu.message_age < br->max_age) {
+		br_send_config_bpdu(p, &bpdu);
+		/* the ack and any pending request are consumed by this send */
+		p->topology_change_ack = 0;
+		p->config_pending = 0;
+		mod_timer(&p->hold_timer,
+			  round_jiffies(jiffies + BR_HOLD_TIME));
+	}
+}
+
+/*
+ * Store the priority vector carried by @bpdu on port @p and restart the
+ * port's message age timer so that it fires once the information has
+ * reached the bridge's max_age.
+ *
+ * called under bridge lock
+ */
+static inline void br_record_config_information(struct net_bridge_port *p,
+						const struct br_config_bpdu *bpdu)
+{
+	p->designated_root = bpdu->root;
+	p->designated_cost = bpdu->root_path_cost;
+	p->designated_bridge = bpdu->bridge_id;
+	p->designated_port = bpdu->port_id;
+	/*
+	 * designated_age is the point in time at which the information was
+	 * originated, so the age already accumulated by the BPDU has to be
+	 * subtracted.  br_transmit_config() later reports
+	 * (jiffies - designated_age) as message age; adding message_age
+	 * here would make the relayed age shrink as the received age
+	 * grows.
+	 */
+	p->designated_age = jiffies - bpdu->message_age;
+
+	mod_timer(&p->message_age_timer, jiffies
+		  + (p->br->max_age - bpdu->message_age));
+}
+
+/*
+ * Adopt the timer values and the topology change flag carried by @bpdu
+ * as the bridge's active values.
+ *
+ * called under bridge lock
+ */
+static inline void br_record_config_timeout_values(struct net_bridge *br,
+					    const struct br_config_bpdu *bpdu)
+{
+	br->max_age = bpdu->max_age;
+	br->hello_time = bpdu->hello_time;
+	br->forward_delay = bpdu->forward_delay;
+	br->topology_change = bpdu->topology_change;
+}
+
+/*
+ * Send a topology change notification BPDU out of the bridge's root
+ * port.  If the recorded root port number no longer resolves to a port,
+ * nothing is sent and a notice is logged instead of dereferencing NULL.
+ *
+ * called under bridge lock
+ */
+void br_transmit_tcn(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+
+	p = br_get_port(br, br->root_port);
+	if (p)
+		br_send_tcn_bpdu(p);
+	else
+		br_notice(br, "root port %u not found for topology notice\n",
+			  (unsigned) br->root_port);
+}
+
+/*
+ * Decide whether port @p should be (or remain) the designated port for
+ * its segment by comparing what the bridge would advertise with what is
+ * currently recorded on the port.  Returns 1 if so, 0 otherwise.
+ *
+ * called under bridge lock
+ */
+static int br_should_become_designated_port(const struct net_bridge_port *p)
+{
+	struct net_bridge *br = p->br;
+	int t;
+
+	if (br_is_designated_port(p))
+		return 1;
+
+	/* the port still records a different root than the bridge */
+	if (memcmp(&p->designated_root, &br->designated_root, 8) != 0)
+		return 1;
+
+	if (br->root_path_cost != p->designated_cost)
+		return br->root_path_cost < p->designated_cost;
+
+	t = memcmp(&br->bridge_id, &p->designated_bridge, 8);
+	if (t)
+		return t < 0;
+
+	return p->port_id < p->designated_port;
+}
+
+/*
+ * Walk all ports of @br and make every enabled port that qualifies a
+ * designated port.
+ *
+ * called under bridge lock
+ */
+static void br_designated_port_selection(struct net_bridge *br)
+{
+	struct net_bridge_port *port;
+
+	list_for_each_entry(port, &br->port_list, list) {
+		if (port->state == BR_STATE_DISABLED)
+			continue;
+
+		if (br_should_become_designated_port(port))
+			br_become_designated_port(port);
+	}
+}
+
+/*
+ * Compare the information in @bpdu with what is recorded on port @p.
+ * Returns 1 when the BPDU supersedes the recorded information, 0 when
+ * it does not.
+ *
+ * called under bridge lock
+ */
+static int br_supersedes_port_info(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
+{
+	int t;
+
+	t = memcmp(&bpdu->root, &p->designated_root, 8);
+	if (t)
+		return t < 0;
+
+	if (bpdu->root_path_cost != p->designated_cost)
+		return bpdu->root_path_cost < p->designated_cost;
+
+	t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8);
+	if (t)
+		return t < 0;
+
+	/* same designated bridge, and that bridge is not this one */
+	if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8) != 0)
+		return 1;
+
+	return bpdu->port_id <= p->designated_port;
+}
+
+/*
+ * A topology change notification has been acknowledged: clear the
+ * "detected" flag and stop the tcn timer.
+ *
+ * called under bridge lock
+ */
+static inline void br_topology_change_acknowledged(struct net_bridge *br)
+{
+	br->topology_change_detected = 0;
+	del_timer(&br->tcn_timer);
+}
+
+/*
+ * React to a detected topology change.  Only acts when the kernel STP
+ * implementation is enabled.  A root bridge raises its topology_change
+ * flag and arms the topology change timer; any other bridge sends a TCN
+ * BPDU and arms the tcn timer, unless a change was already detected.
+ * In both cases the change is then marked as detected.
+ *
+ * called under bridge lock
+ */
+void br_topology_change_detection(struct net_bridge *br)
+{
+	const char *action;
+	int isroot;
+
+	if (br->stp_enabled != BR_KERNEL_STP)
+		return;
+
+	isroot = br_is_root_bridge(br);
+	action = isroot ? "propagating" : "sending tcn bpdu";
+
+	br_info(br, "topology change detected, %s\n", action);
+
+	if (!isroot) {
+		if (!br->topology_change_detected) {
+			br_transmit_tcn(br);
+			mod_timer(&br->tcn_timer,
+				  jiffies + br->bridge_hello_time);
+		}
+	} else {
+		br->topology_change = 1;
+		mod_timer(&br->topology_change_timer, jiffies
+			  + br->bridge_forward_delay + br->bridge_max_age);
+	}
+
+	br->topology_change_detected = 1;
+}
+
+/*
+ * Transmit a configuration BPDU on every enabled designated port of
+ * @br.
+ *
+ * called under bridge lock
+ */
+void br_config_bpdu_generation(struct net_bridge *br)
+{
+	struct net_bridge_port *port;
+
+	list_for_each_entry(port, &br->port_list, list) {
+		if (port->state == BR_STATE_DISABLED)
+			continue;
+		if (!br_is_designated_port(port))
+			continue;
+
+		br_transmit_config(port);
+	}
+}
+
+/*
+ * Answer on port @p by transmitting our own configuration BPDU.
+ *
+ * called under bridge lock
+ */
+static inline void br_reply(struct net_bridge_port *p)
+{
+	br_transmit_config(p);
+}
+
+/*
+ * Recompute the spanning tree roles of @br: first select the root port,
+ * then select the designated ports.
+ *
+ * called under bridge lock
+ */
+void br_configuration_update(struct net_bridge *br)
+{
+	br_root_selection(br);
+	br_designated_port_selection(br);
+}
+
+/*
+ * Make @p the designated port of its segment by recording the bridge's
+ * own view (designated root, root path cost, bridge id) and the port's
+ * own id as the designated information.
+ *
+ * called under bridge lock
+ */
+void br_become_designated_port(struct net_bridge_port *p)
+{
+	const struct net_bridge *bridge = p->br;
+
+	p->designated_port = p->port_id;
+	p->designated_bridge = bridge->bridge_id;
+	p->designated_cost = bridge->root_path_cost;
+	p->designated_root = bridge->designated_root;
+}
+
+
+/*
+ * Move port @p to the blocking state.  Disabled and already blocking
+ * ports are left alone.  Leaving the forwarding or learning state counts
+ * as a topology change.
+ *
+ * called under bridge lock
+ */
+static void br_make_blocking(struct net_bridge_port *p)
+{
+	if (p->state == BR_STATE_DISABLED || p->state == BR_STATE_BLOCKING)
+		return;
+
+	switch (p->state) {
+	case BR_STATE_FORWARDING:
+	case BR_STATE_LEARNING:
+		br_topology_change_detection(p->br);
+		break;
+	default:
+		break;
+	}
+
+	p->state = BR_STATE_BLOCKING;
+	br_log_state(p);
+	del_timer(&p->forward_delay_timer);
+}
+
+/*
+ * Start moving a blocking port @p towards forwarding.  Ports in any
+ * other state are left untouched.
+ *
+ * Without STP, or with a zero forward delay, the port goes straight to
+ * forwarding.  With kernel STP it enters listening; otherwise it enters
+ * learning.
+ *
+ * called under bridge lock
+ */
+static void br_make_forwarding(struct net_bridge_port *p)
+{
+	struct net_bridge *br = p->br;
+
+	if (p->state != BR_STATE_BLOCKING)
+		return;
+
+	if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) {
+		p->state = BR_STATE_FORWARDING;
+		br_topology_change_detection(br);
+		del_timer(&p->forward_delay_timer);
+	}
+	else if (br->stp_enabled == BR_KERNEL_STP)
+		p->state = BR_STATE_LISTENING;
+	else
+		p->state = BR_STATE_LEARNING;
+
+	br_multicast_enable_port(p);
+
+	br_log_state(p);
+
+	/*
+	 * NOTE(review): with BR_NO_STP and a non-zero forward delay the
+	 * timer deleted above is armed again here - confirm that is intended.
+	 */
+	if (br->forward_delay != 0)
+		mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay);
+}
+
+/*
+ * Bring the state of every enabled port in line with its current role
+ * (root port, designated port, or neither) and update the carrier state
+ * of the bridge device: carrier is on when at least one port is
+ * forwarding, off otherwise.
+ *
+ * called under bridge lock
+ */
+void br_port_state_selection(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+	unsigned int liveports = 0;
+
+	/* Don't change port states if userspace is handling STP */
+	if (br->stp_enabled == BR_USER_STP)
+		return;
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if (p->state == BR_STATE_DISABLED)
+			continue;
+
+		if (p->port_no == br->root_port) {
+			/* root port: nothing left to send or acknowledge */
+			p->config_pending = 0;
+			p->topology_change_ack = 0;
+			br_make_forwarding(p);
+		} else if (br_is_designated_port(p)) {
+			del_timer(&p->message_age_timer);
+			br_make_forwarding(p);
+		} else {
+			/* neither root nor designated: block the port */
+			p->config_pending = 0;
+			p->topology_change_ack = 0;
+			br_make_blocking(p);
+		}
+
+		if (p->state == BR_STATE_FORWARDING)
+			++liveports;
+	}
+
+	if (liveports == 0)
+		netif_carrier_off(br->dev);
+	else
+		netif_carrier_on(br->dev);
+}
+
+/*
+ * Acknowledge a topology change on port @p: set the ack flag and
+ * transmit a configuration BPDU that carries it.
+ *
+ * called under bridge lock
+ */
+static inline void br_topology_change_acknowledge(struct net_bridge_port *p)
+{
+	p->topology_change_ack = 1;
+	br_transmit_config(p);
+}
+
+/*
+ * Process a configuration BPDU received on port @p.
+ *
+ * If the BPDU supersedes what the port has recorded, the information is
+ * stored and roles and port states are recomputed.  Otherwise, if @p is
+ * a designated port, our own configuration is sent in reply.
+ *
+ * called under bridge lock
+ */
+void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
+{
+	struct net_bridge *br;
+	int was_root;
+
+	br = p->br;
+	/* sampled before the update so a loss of root status can be seen */
+	was_root = br_is_root_bridge(br);
+
+	if (br_supersedes_port_info(p, bpdu)) {
+		br_record_config_information(p, bpdu);
+		br_configuration_update(br);
+		br_port_state_selection(br);
+
+		/* we just stopped being the root bridge */
+		if (!br_is_root_bridge(br) && was_root) {
+			del_timer(&br->hello_timer);
+			if (br->topology_change_detected) {
+				del_timer(&br->topology_change_timer);
+				br_transmit_tcn(br);
+
+				mod_timer(&br->tcn_timer,
+					  jiffies + br->bridge_hello_time);
+			}
+		}
+
+		/* BPDU arrived on the root port: adopt its timers and relay */
+		if (p->port_no == br->root_port) {
+			br_record_config_timeout_values(br, bpdu);
+			br_config_bpdu_generation(br);
+			if (bpdu->topology_change_ack)
+				br_topology_change_acknowledged(br);
+		}
+	} else if (br_is_designated_port(p)) {
+		br_reply(p);
+	}
+}
+
+/*
+ * Process a topology change notification received on port @p.  It is
+ * only acted upon when @p is a designated port: the change is then
+ * handled and acknowledged.
+ *
+ * called under bridge lock
+ */
+void br_received_tcn_bpdu(struct net_bridge_port *p)
+{
+	if (!br_is_designated_port(p))
+		return;
+
+	br_info(p->br, "port %u(%s) received tcn bpdu\n",
+		(unsigned) p->port_no, p->dev->name);
+
+	br_topology_change_detection(p->br);
+	br_topology_change_acknowledge(p);
+}
+
+/*
+ * Set the configured hello time of @br.  @val is given in clock_t units
+ * and converted to jiffies.  Returns -ERANGE when the converted value is
+ * outside [BR_MIN_HELLO_TIME, BR_MAX_HELLO_TIME], 0 on success.  On a
+ * root bridge the new value also becomes the active hello time.
+ */
+int br_set_hello_time(struct net_bridge *br, unsigned long val)
+{
+	const unsigned long hello = clock_t_to_jiffies(val);
+
+	if (hello < BR_MIN_HELLO_TIME)
+		return -ERANGE;
+	if (hello > BR_MAX_HELLO_TIME)
+		return -ERANGE;
+
+	spin_lock_bh(&br->lock);
+	br->bridge_hello_time = hello;
+	if (br_is_root_bridge(br))
+		br->hello_time = br->bridge_hello_time;
+	spin_unlock_bh(&br->lock);
+
+	return 0;
+}
+
+/*
+ * Set the configured max age of @br.  @val is given in clock_t units and
+ * converted to jiffies.  Returns -ERANGE when the converted value is
+ * outside [BR_MIN_MAX_AGE, BR_MAX_MAX_AGE], 0 on success.  On a root
+ * bridge the new value also becomes the active max age.
+ */
+int br_set_max_age(struct net_bridge *br, unsigned long val)
+{
+	const unsigned long age = clock_t_to_jiffies(val);
+
+	if (age < BR_MIN_MAX_AGE)
+		return -ERANGE;
+	if (age > BR_MAX_MAX_AGE)
+		return -ERANGE;
+
+	spin_lock_bh(&br->lock);
+	br->bridge_max_age = age;
+	if (br_is_root_bridge(br))
+		br->max_age = br->bridge_max_age;
+	spin_unlock_bh(&br->lock);
+
+	return 0;
+}
+
+/*
+ * Set the configured forward delay of @br.  @val is given in clock_t
+ * units and converted to jiffies.  The range
+ * [BR_MIN_FORWARD_DELAY, BR_MAX_FORWARD_DELAY] is only enforced while
+ * STP is enabled; a violation yields -ERANGE.  Returns 0 on success.
+ * On a root bridge the new value also becomes the active forward delay.
+ */
+int br_set_forward_delay(struct net_bridge *br, unsigned long val)
+{
+	const unsigned long delay = clock_t_to_jiffies(val);
+
+	if (br->stp_enabled != BR_NO_STP) {
+		if (delay < BR_MIN_FORWARD_DELAY ||
+		    delay > BR_MAX_FORWARD_DELAY)
+			return -ERANGE;
+	}
+
+	spin_lock_bh(&br->lock);
+	br->bridge_forward_delay = delay;
+	if (br_is_root_bridge(br))
+		br->forward_delay = br->bridge_forward_delay;
+	spin_unlock_bh(&br->lock);
+
+	return 0;
+}
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
new file mode 100644
index 00000000..289646ec
--- /dev/null
+++ b/net/bridge/br_stp_bpdu.c
@@ -0,0 +1,223 @@
+/*
+ *	Spanning tree protocol; BPDU handling
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/etherdevice.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/stp.h>
+#include <asm/unaligned.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+/* BPDU time fields count in units of 1/STP_HZ seconds */
+#define STP_HZ		256
+
+/* headroom reserved in front of the BPDU payload for the LLC header */
+#define LLC_RESERVE sizeof(struct llc_pdu_un)
+
+/*
+ * Wrap @length bytes of BPDU payload from @data in LLC and MAC headers
+ * and queue the frame for transmission on port @p, passing it through
+ * the bridge LOCAL_OUT netfilter hook.  The frame is sent from the port
+ * device's address to the bridge's group address.  If no skb can be
+ * allocated the BPDU is silently dropped.
+ */
+static void br_send_bpdu(struct net_bridge_port *p,
+			 const unsigned char *data, int length)
+{
+	struct sk_buff *skb;
+
+	skb = dev_alloc_skb(length+LLC_RESERVE);
+	if (!skb)
+		return;
+
+	skb->dev = p->dev;
+	skb->protocol = htons(ETH_P_802_2);
+
+	/* leave room in front of the payload for the LLC header */
+	skb_reserve(skb, LLC_RESERVE);
+	memcpy(__skb_put(skb, length), data, length);
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, LLC_SAP_BSPAN,
+			    LLC_SAP_BSPAN, LLC_PDU_CMD);
+	llc_pdu_init_as_ui_cmd(skb);
+
+	llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr);
+
+	skb_reset_mac_header(skb);
+
+	NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
+		dev_queue_xmit);
+}
+
+/*
+ * Convert @j jiffies to 1/STP_HZ second units and store the result at
+ * @dest as a big-endian 16-bit value.
+ */
+static inline void br_set_ticks(unsigned char *dest, int j)
+{
+	unsigned long wire_ticks;
+
+	wire_ticks = (STP_HZ * j) / HZ;
+	put_unaligned_be16(wire_ticks, dest);
+}
+
+/*
+ * Read a big-endian 16-bit value in 1/STP_HZ second units from @src and
+ * return it converted to jiffies, rounded up.
+ */
+static inline int br_get_ticks(const unsigned char *src)
+{
+	unsigned long wire_ticks;
+
+	wire_ticks = get_unaligned_be16(src);
+
+	return DIV_ROUND_UP(wire_ticks * HZ, STP_HZ);
+}
+
+/*
+ * Serialise @bpdu into the 35-byte configuration BPDU layout and send it
+ * on port @p.  Nothing is sent unless the kernel STP implementation is
+ * enabled on the bridge.
+ *
+ * called under bridge lock
+ */
+void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
+{
+	unsigned char buf[35];
+	unsigned char flags = 0;
+	int i;
+
+	if (p->br->stp_enabled != BR_KERNEL_STP)
+		return;
+
+	if (bpdu->topology_change)
+		flags |= 0x01;
+	if (bpdu->topology_change_ack)
+		flags |= 0x80;
+
+	/* bytes 0-2: protocol id and version, all zero */
+	buf[0] = buf[1] = buf[2] = 0;
+	buf[3] = BPDU_TYPE_CONFIG;
+	buf[4] = flags;
+
+	/* bytes 5-12: root id, priority first, then address */
+	for (i = 0; i < 2; i++)
+		buf[5 + i] = bpdu->root.prio[i];
+	for (i = 0; i < 6; i++)
+		buf[7 + i] = bpdu->root.addr[i];
+
+	/* bytes 13-16: root path cost, most significant byte first */
+	put_unaligned_be32(bpdu->root_path_cost, buf + 13);
+
+	/* bytes 17-24: id of the sending bridge */
+	for (i = 0; i < 2; i++)
+		buf[17 + i] = bpdu->bridge_id.prio[i];
+	for (i = 0; i < 6; i++)
+		buf[19 + i] = bpdu->bridge_id.addr[i];
+
+	/* bytes 25-26: port id, most significant byte first */
+	put_unaligned_be16(bpdu->port_id, buf + 25);
+
+	/* bytes 27-34: the four timer values */
+	br_set_ticks(buf + 27, bpdu->message_age);
+	br_set_ticks(buf + 29, bpdu->max_age);
+	br_set_ticks(buf + 31, bpdu->hello_time);
+	br_set_ticks(buf + 33, bpdu->forward_delay);
+
+	br_send_bpdu(p, buf, 35);
+}
+
+/*
+ * Send a 4-byte topology change notification BPDU on port @p.  Nothing
+ * is sent unless the kernel STP implementation is enabled on the bridge.
+ *
+ * called under bridge lock
+ */
+void br_send_tcn_bpdu(struct net_bridge_port *p)
+{
+	/* zeroed protocol id and version, followed by the BPDU type */
+	unsigned char buf[4] = { 0, 0, 0, BPDU_TYPE_TCN };
+
+	if (p->br->stp_enabled != BR_KERNEL_STP)
+		return;
+
+	br_send_bpdu(p, buf, 4);
+}
+
+/*
+ * Receive one BPDU for the bridge port attached to @dev, decode it and
+ * feed it to the spanning tree state machine.  The skb is always
+ * consumed.
+ *
+ * Frames are dropped when they are too short, carry a non-zero protocol
+ * id or version, arrive on a device that is not a bridge port, or when
+ * kernel STP is off, the bridge is down, the port is disabled or the
+ * destination is not the bridge's group address.  Configuration BPDUs
+ * whose message age already exceeds their max age are dropped as well.
+ *
+ * Called from llc.
+ *
+ * NO locks, but rcu_read_lock
+ */
+void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
+		struct net_device *dev)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	const unsigned char *buf;
+
+	if (!pskb_may_pull(skb, 4))
+		goto err;
+
+	/* compare of protocol id and version */
+	buf = skb->data;
+	if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0)
+		goto err;
+
+	p = br_port_get_rcu(dev);
+	if (!p)
+		goto err;
+
+	br = p->br;
+	spin_lock(&br->lock);
+
+	if (br->stp_enabled != BR_KERNEL_STP)
+		goto out;
+
+	if (!(br->dev->flags & IFF_UP))
+		goto out;
+
+	if (p->state == BR_STATE_DISABLED)
+		goto out;
+
+	/*
+	 * Fetch the destination address only now: pskb_may_pull() above may
+	 * have reallocated the header, so a pointer taken before that call
+	 * could be stale.
+	 */
+	if (compare_ether_addr(eth_hdr(skb)->h_dest, br->group_addr) != 0)
+		goto out;
+
+	buf = skb_pull(skb, 3);
+
+	if (buf[0] == BPDU_TYPE_CONFIG) {
+		struct br_config_bpdu bpdu;
+
+		if (!pskb_may_pull(skb, 32))
+			goto out;
+
+		buf = skb->data;
+		bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0;
+		bpdu.topology_change_ack = (buf[1] & 0x80) ? 1 : 0;
+
+		bpdu.root.prio[0] = buf[2];
+		bpdu.root.prio[1] = buf[3];
+		bpdu.root.addr[0] = buf[4];
+		bpdu.root.addr[1] = buf[5];
+		bpdu.root.addr[2] = buf[6];
+		bpdu.root.addr[3] = buf[7];
+		bpdu.root.addr[4] = buf[8];
+		bpdu.root.addr[5] = buf[9];
+		bpdu.root_path_cost =
+			(buf[10] << 24) |
+			(buf[11] << 16) |
+			(buf[12] << 8) |
+			buf[13];
+		bpdu.bridge_id.prio[0] = buf[14];
+		bpdu.bridge_id.prio[1] = buf[15];
+		bpdu.bridge_id.addr[0] = buf[16];
+		bpdu.bridge_id.addr[1] = buf[17];
+		bpdu.bridge_id.addr[2] = buf[18];
+		bpdu.bridge_id.addr[3] = buf[19];
+		bpdu.bridge_id.addr[4] = buf[20];
+		bpdu.bridge_id.addr[5] = buf[21];
+		bpdu.port_id = (buf[22] << 8) | buf[23];
+
+		/* wire values are in 1/256 s; convert to jiffies */
+		bpdu.message_age = br_get_ticks(buf+24);
+		bpdu.max_age = br_get_ticks(buf+26);
+		bpdu.hello_time = br_get_ticks(buf+28);
+		bpdu.forward_delay = br_get_ticks(buf+30);
+
+		/*
+		 * Information that is already older than its own max age
+		 * is bogus; using it would arm the message age timer with
+		 * an expiry in the past.
+		 */
+		if (bpdu.message_age > bpdu.max_age)
+			goto out;
+
+		br_received_config_bpdu(p, &bpdu);
+	}
+
+	else if (buf[0] == BPDU_TYPE_TCN) {
+		br_received_tcn_bpdu(p);
+	}
+ out:
+	spin_unlock(&br->lock);
+ err:
+	kfree_skb(skb);
+}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
new file mode 100644
index 00000000..6f615b81
--- /dev/null
+++ b/net/bridge/br_stp_if.c
@@ -0,0 +1,301 @@
+/*
+ *	Spanning tree protocol; interface code
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+
+/* A port id packs the priority above the low BR_PORT_BITS bits, which
+ * hold the port number.
+ * NB: some priority bits are dropped to make room for more ports.
+ */
+static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
+{
+	const u16 port_mask = (1 << BR_PORT_BITS) - 1;
+	return ((u16)priority << BR_PORT_BITS) | (port_no & port_mask);
+}
+
+#define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS)	/* max port priority */
+
+/* Reset port @p to its initial STP state; called under bridge lock. */
+void br_init_port(struct net_bridge_port *p)
+{
+	p->port_id = br_make_port_id(p->priority, p->port_no);
+	br_become_designated_port(p);
+	p->config_pending = 0;
+	p->topology_change_ack = 0;
+	p->state = BR_STATE_BLOCKING;
+}
+
+/* NO locks held; acquires br->lock (bh-disabling) internally */
+void br_stp_enable_bridge(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+
+	spin_lock_bh(&br->lock);
+	mod_timer(&br->hello_timer, jiffies + br->hello_time);
+	mod_timer(&br->gc_timer, jiffies + HZ/10);
+
+	br_config_bpdu_generation(br);
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if ((p->dev->flags & IFF_UP) && netif_carrier_ok(p->dev))
+			br_stp_enable_port(p);	/* only ports up with carrier */
+
+	}
+	spin_unlock_bh(&br->lock);
+}
+
+/* NO locks held: disables every active port, then stops the bridge timers. */
+void br_stp_disable_bridge(struct net_bridge *br)
+{
+	struct net_bridge_port *port;
+
+	spin_lock_bh(&br->lock);
+	list_for_each_entry(port, &br->port_list, list) {
+		if (port->state == BR_STATE_DISABLED)
+			continue;
+		br_stp_disable_port(port);
+	}
+
+	br->topology_change_detected = 0;
+	br->topology_change = 0;
+	spin_unlock_bh(&br->lock);
+
+	del_timer_sync(&br->hello_timer);
+	del_timer_sync(&br->topology_change_timer);
+	del_timer_sync(&br->tcn_timer);
+	del_timer_sync(&br->gc_timer);
+}
+
+/* called under bridge lock: re-inits @p, reruns state selection, logs state */
+void br_stp_enable_port(struct net_bridge_port *p)
+{
+	br_init_port(p);
+	br_port_state_selection(p->br);
+	br_log_state(p);
+}
+
+/* called under bridge lock; moves @p to BR_STATE_DISABLED and re-runs STP */
+void br_stp_disable_port(struct net_bridge_port *p)
+{
+	struct net_bridge *br = p->br;
+	int wasroot;
+
+	br_log_state(p);
+
+	wasroot = br_is_root_bridge(br);	/* sampled before any change */
+	br_become_designated_port(p);
+	p->state = BR_STATE_DISABLED;
+	p->topology_change_ack = 0;
+	p->config_pending = 0;
+
+	del_timer(&p->message_age_timer);
+	del_timer(&p->forward_delay_timer);
+	del_timer(&p->hold_timer);
+
+	br_fdb_delete_by_port(br, p, 0);
+	br_multicast_disable_port(p);
+
+	br_configuration_update(br);
+
+	br_port_state_selection(br);
+
+	if (br_is_root_bridge(br) && !wasroot)	/* root only after the change */
+		br_become_root_bridge(br);
+}
+
+/* Start STP on @br: userspace helper if it exits 0, else kernel STP. */
+static void br_stp_start(struct net_bridge *br)
+{
+	char *helper_env[] = { NULL };
+	char *helper_argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
+
+	if (!call_usermodehelper(BR_STP_PROG, helper_argv, helper_env,
+				 UMH_WAIT_PROC)) {
+		br->stp_enabled = BR_USER_STP;
+		br_debug(br, "userspace STP started\n");
+		return;
+	}
+
+	br->stp_enabled = BR_KERNEL_STP;
+	br_debug(br, "using kernel STP\n");
+	/* Start timers on any ports left in blocking */
+	spin_lock_bh(&br->lock);
+	br_port_state_selection(br);
+	spin_unlock_bh(&br->lock);
+}
+
+/* Stop STP on @br; the userspace helper is told only if it was in charge. */
+static void br_stp_stop(struct net_bridge *br)
+{
+	char *helper_env[] = { NULL };
+	char *helper_argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
+
+	if (br->stp_enabled == BR_USER_STP) {
+		int rc = call_usermodehelper(BR_STP_PROG, helper_argv,
+					     helper_env, UMH_WAIT_PROC);
+		br_info(br, "userspace STP stopped, return code %d\n", rc);
+		/* Start timers on any ports left in blocking */
+		spin_lock_bh(&br->lock);
+		br_port_state_selection(br);
+		spin_unlock_bh(&br->lock);
+	}
+
+	br->stp_enabled = BR_NO_STP;
+}
+
+/* Turn STP on (@val != 0) or off (@val == 0); asserts that RTNL is held. */
+void br_stp_set_enabled(struct net_bridge *br, unsigned long val)
+{
+	const int running = br->stp_enabled != BR_NO_STP;
+
+	ASSERT_RTNL();
+
+	if (val && !running)
+		br_stp_start(br);
+	else if (!val && running)
+		br_stp_stop(br);
+}
+
+/* Switch the bridge MAC address to @addr; called under bridge lock. */
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
+{
+	/* 2-byte aligned scratch copy, as compare_ether_addr() expects */
+	unsigned short prev_aligned[ETH_ALEN >> 1];
+	unsigned char *prev = (unsigned char *)prev_aligned;
+	const int was_root = br_is_root_bridge(br);
+	struct net_bridge_port *port;
+
+	/* Remember the old address, then install the new one. */
+	memcpy(prev, br->bridge_id.addr, ETH_ALEN);
+	memcpy(br->bridge_id.addr, addr, ETH_ALEN);
+	memcpy(br->dev->dev_addr, addr, ETH_ALEN);
+
+	/* Rewrite per-port designated ids that still carry the old address. */
+	list_for_each_entry(port, &br->port_list, list) {
+		if (!compare_ether_addr(port->designated_bridge.addr, prev))
+			memcpy(port->designated_bridge.addr, addr, ETH_ALEN);
+
+		if (!compare_ether_addr(port->designated_root.addr, prev))
+			memcpy(port->designated_root.addr, addr, ETH_ALEN);
+	}
+
+	br_configuration_update(br);
+	br_port_state_selection(br);
+	/* Only a bridge that was not root before gets promoted here. */
+	if (br_is_root_bridge(br) && !was_root)
+		br_become_root_bridge(br);
+}
+
+/* all-zero MAC; 2-byte aligned storage for compare_ether_addr() */
+static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1];
+
+/* Adopt the lowest port MAC as bridge address; called under bridge lock.
+ * Returns true only when the bridge id was actually changed.
+ */
+bool br_stp_recalculate_bridge_id(struct net_bridge *br)
+{
+	const unsigned char *zero_mac =
+			(const unsigned char *)br_mac_zero_aligned;
+	const unsigned char *lowest = zero_mac;
+	struct net_bridge_port *port;
+
+	/* An address chosen explicitly by the user is never overridden. */
+	if (br->flags & BR_SET_MAC_ADDR)
+		return false;
+
+	/* zero_mac doubles as the "nothing picked yet" marker. */
+	list_for_each_entry(port, &br->port_list, list) {
+		const unsigned char *mac = port->dev->dev_addr;
+
+		if (lowest == zero_mac || memcmp(mac, lowest, ETH_ALEN) < 0)
+			lowest = mac;
+	}
+
+	if (!compare_ether_addr(br->bridge_id.addr, lowest))
+		return false;	/* no change */
+
+	br_stp_change_bridge_id(br, lowest);
+	return true;
+}
+
+/* Set the bridge priority to @newprio; called under bridge lock. */
+void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio)
+{
+	const int prio_hi = (newprio >> 8) & 0xFF;
+	const int prio_lo = newprio & 0xFF;
+	const int was_root = br_is_root_bridge(br);
+	struct net_bridge_port *port;
+
+	/* Stamp the new priority on every enabled, designated port. */
+	list_for_each_entry(port, &br->port_list, list) {
+		if (port->state == BR_STATE_DISABLED ||
+		    !br_is_designated_port(port))
+			continue;
+		port->designated_bridge.prio[0] = prio_hi;
+		port->designated_bridge.prio[1] = prio_lo;
+	}
+
+	br->bridge_id.prio[0] = prio_hi;
+	br->bridge_id.prio[1] = prio_lo;
+	br_configuration_update(br);
+	br_port_state_selection(br);
+	if (br_is_root_bridge(br) && !was_root)
+		br_become_root_bridge(br);
+}
+
+/* called under bridge lock; returns 0, or -ERANGE if @newprio is too large */
+int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio)
+{
+	port_id new_port_id;
+
+	if (newprio > BR_MAX_PORT_PRIORITY)
+		return -ERANGE;
+
+	new_port_id = br_make_port_id(newprio, p->port_no);
+	if (br_is_designated_port(p))	/* tested before port_id is updated */
+		p->designated_port = new_port_id;
+
+	p->port_id = new_port_id;
+	p->priority = newprio;
+	if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) &&
+	    p->port_id < p->designated_port) {
+		br_become_designated_port(p);
+		br_port_state_selection(p->br);
+	}
+
+	return 0;
+}
+
+/* Set port @p's path cost; called under bridge lock.
+ * Returns 0, or -ERANGE when @path_cost is outside
+ * [BR_MIN_PATH_COST, BR_MAX_PATH_COST].
+ */
+int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost)
+{
+	struct net_bridge *br = p->br;
+
+	if (path_cost < BR_MIN_PATH_COST || path_cost > BR_MAX_PATH_COST)
+		return -ERANGE;
+
+	p->path_cost = path_cost;
+	br_configuration_update(br);
+	br_port_state_selection(br);
+	return 0;
+}
+
+/* Print @id into @buf as "pppp.aabbccddeeff\n" (all hex digits). */
+ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id)
+{
+	return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n",
+		       id->prio[0], id->prio[1], id->addr[0], id->addr[1],
+		       id->addr[2], id->addr[3], id->addr[4], id->addr[5]);
+}
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
new file mode 100644
index 00000000..3e965140
--- /dev/null
+++ b/net/bridge/br_stp_timer.c
@@ -0,0 +1,173 @@
+/*
+ *	Spanning tree protocol; timer-related code
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/times.h>
+
+#include "br_private.h"
+#include "br_private_stp.h"
+
+/* Is @br the designated bridge of any enabled port? Called under bridge lock */
+static int br_is_designated_for_some_port(const struct net_bridge *br)
+{
+	struct net_bridge_port *port;
+
+	list_for_each_entry(port, &br->port_list, list) {
+		if (port->state == BR_STATE_DISABLED)
+			continue;
+		if (memcmp(&port->designated_bridge, &br->bridge_id, 8) == 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Hello timer callback: while the bridge device is up, run config BPDU
+ * generation and re-arm the timer hello_time jiffies ahead (rounded).
+ */
+static void br_hello_timer_expired(unsigned long arg)
+{
+	struct net_bridge *br = (struct net_bridge *)arg;
+
+	br_debug(br, "hello timer expired\n");
+
+	spin_lock(&br->lock);
+	if (br->dev->flags & IFF_UP) {
+		br_config_bpdu_generation(br);
+		mod_timer(&br->hello_timer,
+			  round_jiffies(jiffies + br->hello_time));
+	}
+	spin_unlock(&br->lock);
+}
+
+static void br_message_age_timer_expired(unsigned long arg)
+{
+	struct net_bridge_port *p = (struct net_bridge_port *) arg;
+	struct net_bridge *br = p->br;
+	const bridge_id *id = &p->designated_bridge;
+	int was_root;
+
+	if (p->state == BR_STATE_DISABLED)	/* unlocked early-out */
+		return;
+
+	br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n",
+		(unsigned) p->port_no, p->dev->name,
+		id->prio[0], id->prio[1], &id->addr);
+
+	/*
+	 * According to the spec, the message age timer cannot be
+	 * running when we are the root bridge, so the was_root check
+	 * below is redundant. It is deliberately left in for now.
+	 */
+	spin_lock(&br->lock);
+	if (p->state == BR_STATE_DISABLED)	/* re-checked under br->lock */
+		goto unlock;
+	was_root = br_is_root_bridge(br);
+
+	br_become_designated_port(p);
+	br_configuration_update(br);
+	br_port_state_selection(br);
+	if (br_is_root_bridge(br) && !was_root)
+		br_become_root_bridge(br);
+ unlock:
+	spin_unlock(&br->lock);
+}
+
+/* Forward-delay timer: step the port listening -> learning -> forwarding. */
+static void br_forward_delay_timer_expired(unsigned long arg)
+{
+	struct net_bridge_port *p = (struct net_bridge_port *) arg;
+	struct net_bridge *br = p->br;
+
+	br_debug(br, "port %u(%s) forward delay timer\n",
+		 (unsigned) p->port_no, p->dev->name);
+
+	spin_lock(&br->lock);
+	if (p->state == BR_STATE_LISTENING) {
+		/* One more forward_delay period is spent learning. */
+		p->state = BR_STATE_LEARNING;
+		mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay);
+	} else if (p->state == BR_STATE_LEARNING) {
+		p->state = BR_STATE_FORWARDING;
+		if (br_is_designated_for_some_port(br))
+			br_topology_change_detection(br);
+		netif_carrier_on(br->dev);
+	}
+	br_log_state(p);
+	spin_unlock(&br->lock);
+}
+
+/* TCN timer callback: while the bridge device is up, call br_transmit_tcn()
+ * and re-arm the timer bridge_hello_time jiffies ahead.
+ */
+static void br_tcn_timer_expired(unsigned long arg)
+{
+	struct net_bridge *br = (struct net_bridge *) arg;
+
+	br_debug(br, "tcn timer expired\n");
+
+	spin_lock(&br->lock);
+	if (br->dev->flags & IFF_UP) {
+		br_transmit_tcn(br);
+		mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time);
+	}
+	spin_unlock(&br->lock);
+}
+
+/* Topology-change timer callback: clear both topology change flags. */
+static void br_topology_change_timer_expired(unsigned long arg)
+{
+	struct net_bridge *br = (struct net_bridge *) arg;
+
+	br_debug(br, "topo change timer expired\n");
+
+	spin_lock(&br->lock);
+	br->topology_change = 0;
+	br->topology_change_detected = 0;
+	spin_unlock(&br->lock);
+}
+
+/* Hold timer callback: transmit config for the port if one is pending. */
+static void br_hold_timer_expired(unsigned long arg)
+{
+	struct net_bridge_port *port = (struct net_bridge_port *) arg;
+
+	br_debug(port->br, "port %u(%s) hold timer expired\n",
+		 (unsigned) port->port_no, port->dev->name);
+	spin_lock(&port->br->lock);
+	if (port->config_pending)
+		br_transmit_config(port);
+	spin_unlock(&port->br->lock);
+}
+
+/* Bind the bridge-wide hello, tcn, topology-change and gc timer handlers. */
+void br_stp_timer_init(struct net_bridge *br)
+{
+	const unsigned long data = (unsigned long) br;
+
+	setup_timer(&br->hello_timer, br_hello_timer_expired, data);
+
+	setup_timer(&br->tcn_timer, br_tcn_timer_expired, data);
+
+	setup_timer(&br->topology_change_timer,
+		    br_topology_change_timer_expired, data);
+
+	setup_timer(&br->gc_timer, br_fdb_cleanup, data);
+}
+
+/* Bind the per-port message-age, forward-delay and hold timer handlers. */
+void br_stp_port_timer_init(struct net_bridge_port *p)
+{
+	const unsigned long data = (unsigned long) p;
+
+	setup_timer(&p->message_age_timer,
+		    br_message_age_timer_expired, data);
+	setup_timer(&p->forward_delay_timer,
+		    br_forward_delay_timer_expired, data);
+	setup_timer(&p->hold_timer, br_hold_timer_expired, data);
+}
+
+/* Time left on @timer in USER_HZ ticks, or 0 when it is not pending. */
+unsigned long br_timer_value(const struct timer_list *timer)
+{
+	const int armed = timer_pending(timer);
+	return armed ? jiffies_to_clock_t(timer->expires - jiffies) : 0;
+}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
new file mode 100644
index 00000000..68b893ea
--- /dev/null
+++ b/net/bridge/br_sysfs_br.c
@@ -0,0 +1,784 @@
+/*
+ *	Sysfs attributes of bridge
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Stephen Hemminger		<shemminger@osdl.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+#include <linux/times.h>
+
+#include "br_private.h"
+
+#define to_dev(obj)	container_of(obj, struct device, kobj)	/* kobj->dev */
+#define to_bridge(cd)	((struct net_bridge *)netdev_priv(to_net_dev(cd)))
+
+/*
+ * Shared store path for bridge parameters: requires CAP_NET_ADMIN, parses
+ * an unsigned long from @buf (base auto-detected) and passes it to @set.
+ * Returns @len on success, -EPERM, -EINVAL, or the error from @set.
+ */
+static ssize_t store_bridge_parm(struct device *d,
+				 const char *buf, size_t len,
+				 int (*set)(struct net_bridge *, unsigned long))
+{
+	unsigned long val;
+	char *endp;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+
+	err = set(to_bridge(d), val);
+	return err ? err : len;
+}
+
+
+/* forward_delay: read in clock_t units, written via br_set_forward_delay() */
+static ssize_t show_forward_delay(struct device *d,
+				  struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n",
+		       jiffies_to_clock_t(to_bridge(d)->forward_delay));
+}
+
+static ssize_t store_forward_delay(struct device *d,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_set_forward_delay);
+}
+static DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR,
+		   show_forward_delay, store_forward_delay);
+
+/* hello_time: read in clock_t units, written via br_set_hello_time() */
+static ssize_t show_hello_time(struct device *d, struct device_attribute *attr,
+			       char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->hello_time));
+}
+
+static ssize_t store_hello_time(struct device *d,
+				struct device_attribute *attr, const char *buf,
+				size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_set_hello_time);
+}
+static DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time,
+		   store_hello_time);
+
+/* max_age: read in clock_t units, written via br_set_max_age() */
+static ssize_t show_max_age(struct device *d, struct device_attribute *attr,
+			    char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->max_age));
+}
+
+static ssize_t store_max_age(struct device *d, struct device_attribute *attr,
+			     const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_set_max_age);
+}
+static DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, store_max_age);
+
+/* ageing_time: exchanged with userspace as clock_t, kept in jiffies */
+static ssize_t show_ageing_time(struct device *d,
+				struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n",
+		       jiffies_to_clock_t(to_bridge(d)->ageing_time));
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_ageing_time(struct net_bridge *br, unsigned long val)
+{
+	br->ageing_time = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_ageing_time(struct device *d,
+				 struct device_attribute *attr,
+				 const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_ageing_time);
+}
+static DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time,
+		   store_ageing_time);
+
+/* stp_state: reads back br->stp_enabled as a decimal number */
+static ssize_t show_stp_state(struct device *d,
+			      struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", to_bridge(d)->stp_enabled);
+}
+
+
+/*
+ * Writing stp_state needs CAP_NET_ADMIN. The parsed value is handed to
+ * br_stp_set_enabled() with RTNL held; if RTNL cannot be taken right
+ * away the syscall is restarted instead of blocking.
+ */
+static ssize_t store_stp_state(struct device *d,
+			       struct device_attribute *attr, const char *buf,
+			       size_t len)
+{
+	unsigned long val;
+	char *endp;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+	br_stp_set_enabled(to_bridge(d), val);
+	rtnl_unlock();
+
+	return len;
+}
+static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state,
+		   store_stp_state);
+
+/* priority: the two bridge_id.prio bytes combined, high byte first */
+static ssize_t show_priority(struct device *d, struct device_attribute *attr,
+			     char *buf)
+{
+	const bridge_id *id = &to_bridge(d)->bridge_id;
+
+	return sprintf(buf, "%d\n", (id->prio[0] << 8) | id->prio[1]);
+}
+
+/*
+ * store_bridge_parm() callback for the "priority" attribute.
+ *
+ * br_stp_set_bridge_priority() must be called under the bridge lock: it
+ * walks br->port_list and re-runs the STP configuration. store_bridge_parm()
+ * invokes this callback without any lock held, so br->lock is taken here,
+ * bh-disabling as in the other sysfs writers in this file.
+ *
+ * Only the low 16 bits of @val are used. Always returns 0.
+ */
+static int set_priority(struct net_bridge *br, unsigned long val)
+{
+	spin_lock_bh(&br->lock);
+	br_stp_set_bridge_priority(br, (u16) val);
+	spin_unlock_bh(&br->lock);
+	return 0;
+}
+
+/* priority: written through the common store path with set_priority() */
+static ssize_t store_priority(struct device *d, struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_priority);
+}
+static DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority, store_priority);
+
+/* root_id (read-only): br->designated_root formatted by br_show_bridge_id() */
+static ssize_t show_root_id(struct device *d, struct device_attribute *attr,
+			    char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return br_show_bridge_id(buf, &br->designated_root);
+}
+static DEVICE_ATTR(root_id, S_IRUGO, show_root_id, NULL);
+
+/* bridge_id (read-only): br->bridge_id formatted by br_show_bridge_id() */
+static ssize_t show_bridge_id(struct device *d, struct device_attribute *attr,
+			      char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return br_show_bridge_id(buf, &br->bridge_id);
+}
+static DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL);
+
+/* root_port (read-only): br->root_port as a decimal number */
+static ssize_t show_root_port(struct device *d, struct device_attribute *attr,
+			      char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return sprintf(buf, "%d\n", br->root_port);
+}
+static DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL);
+
+/* root_path_cost (read-only): br->root_path_cost as a decimal number */
+static ssize_t show_root_path_cost(struct device *d,
+				   struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return sprintf(buf, "%d\n", br->root_path_cost);
+}
+static DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL);
+
+/* topology_change (read-only): br->topology_change as a decimal number */
+static ssize_t show_topology_change(struct device *d,
+				    struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return sprintf(buf, "%d\n", br->topology_change);
+}
+static DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL);
+
+/* topology_change_detected (read-only): the flag as a decimal number */
+static ssize_t show_topology_change_detected(struct device *d,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	return sprintf(buf, "%d\n", to_bridge(d)->topology_change_detected);
+}
+static DEVICE_ATTR(topology_change_detected, S_IRUGO,
+		   show_topology_change_detected, NULL);
+
+/* hello_timer (read-only): br_timer_value() of the hello timer */
+static ssize_t show_hello_timer(struct device *d,
+				struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%ld\n",
+		       br_timer_value(&to_bridge(d)->hello_timer));
+}
+static DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL);
+
+/* tcn_timer (read-only): br_timer_value() of the tcn timer */
+static ssize_t show_tcn_timer(struct device *d, struct device_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "%ld\n",
+		       br_timer_value(&to_bridge(d)->tcn_timer));
+}
+static DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL);
+
+/* topology_change_timer (read-only): br_timer_value() of that timer */
+static ssize_t show_topology_change_timer(struct device *d,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	return sprintf(buf, "%ld\n",
+		       br_timer_value(&to_bridge(d)->topology_change_timer));
+}
+static DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer,
+		   NULL);
+
+/* gc_timer (read-only): br_timer_value() of the gc timer */
+static ssize_t show_gc_timer(struct device *d, struct device_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "%ld\n",
+		       br_timer_value(&to_bridge(d)->gc_timer));
+}
+static DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL);
+
+/* group_addr: six colon-separated hex octets, printed without zero padding */
+static ssize_t show_group_addr(struct device *d,
+			       struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+
+	return sprintf(buf, "%x:%x:%x:%x:%x:%x\n",
+		       br->group_addr[0], br->group_addr[1], br->group_addr[2],
+		       br->group_addr[3], br->group_addr[4], br->group_addr[5]);
+}
+
+/*
+ * Set the bridge group address. Requires CAP_NET_ADMIN. The first five
+ * octets must equal br_group_address, the last one must fit in four bits
+ * and must not be 1, 2 or 3. The new address is copied in under br->lock.
+ * Returns @len on success, -EPERM or -EINVAL otherwise.
+ */
+static ssize_t store_group_addr(struct device *d,
+				struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	struct net_bridge *br = to_bridge(d);
+	unsigned octet[6];
+	int i;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (sscanf(buf, "%x:%x:%x:%x:%x:%x",
+		   &octet[0], &octet[1], &octet[2],
+		   &octet[3], &octet[4], &octet[5]) != 6)
+		return -EINVAL;
+
+	/* Must be 01:80:c2:00:00:0X */
+	for (i = 0; i < 5; i++)
+		if (octet[i] != br_group_address[i])
+			return -EINVAL;
+
+	if (octet[5] & ~0xf)
+		return -EINVAL;
+
+	switch (octet[5]) {
+	case 1:		/* 802.3x Pause address */
+	case 2:		/* 802.3ad Slow protocols */
+	case 3:		/* 802.1X PAE address */
+		return -EINVAL;
+	default:
+		break;
+	}
+
+	spin_lock_bh(&br->lock);
+	for (i = 0; i < 6; i++)
+		br->group_addr[i] = octet[i];
+	spin_unlock_bh(&br->lock);
+	return len;
+}
+
+static DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR,
+		   show_group_addr, store_group_addr);
+
+/* flush (write-only): any write by a CAP_NET_ADMIN caller runs br_fdb_flush();
+ * the written data itself is ignored.
+ */
+static ssize_t store_flush(struct device *d,
+			   struct device_attribute *attr,
+			   const char *buf, size_t len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	br_fdb_flush(to_bridge(d));
+	return len;
+}
+static DEVICE_ATTR(flush, S_IWUSR, NULL, store_flush);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+/* multicast_router: read as decimal, written via br_multicast_set_router() */
+static ssize_t show_multicast_router(struct device *d,
+				     struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", to_bridge(d)->multicast_router);
+}
+
+static ssize_t store_multicast_router(struct device *d,
+				      struct device_attribute *attr,
+				      const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_multicast_set_router);
+}
+static DEVICE_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
+		   store_multicast_router);
+
+/* multicast_snooping: reads the inverse of br->multicast_disabled; writes go
+ * through br_multicast_toggle().
+ */
+static ssize_t show_multicast_snooping(struct device *d,
+				       struct device_attribute *attr,
+				       char *buf)
+{
+	return sprintf(buf, "%d\n", !to_bridge(d)->multicast_disabled);
+}
+
+static ssize_t store_multicast_snooping(struct device *d,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_multicast_toggle);
+}
+static DEVICE_ATTR(multicast_snooping, S_IRUGO | S_IWUSR,
+		   show_multicast_snooping, store_multicast_snooping);
+
+/* hash_elasticity: plain unsigned value, stored as written */
+static ssize_t show_hash_elasticity(struct device *d,
+				    struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->hash_elasticity);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_elasticity(struct net_bridge *br, unsigned long val)
+{
+	br->hash_elasticity = val;
+	return 0;
+}
+
+static ssize_t store_hash_elasticity(struct device *d,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_elasticity);
+}
+static DEVICE_ATTR(hash_elasticity, S_IRUGO | S_IWUSR, show_hash_elasticity,
+		   store_hash_elasticity);
+
+/* hash_max: read as unsigned, written via br_multicast_set_hash_max() */
+static ssize_t show_hash_max(struct device *d, struct device_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->hash_max);
+}
+
+static ssize_t store_hash_max(struct device *d, struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, br_multicast_set_hash_max);
+}
+static DEVICE_ATTR(hash_max, S_IRUGO | S_IWUSR, show_hash_max,
+		   store_hash_max);
+
+/* multicast_last_member_count: plain unsigned value, stored as written */
+static ssize_t show_multicast_last_member_count(struct device *d,
+						struct device_attribute *attr,
+						char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->multicast_last_member_count);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_last_member_count(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_last_member_count = val;
+	return 0;
+}
+
+static ssize_t store_multicast_last_member_count(struct device *d,
+						 struct device_attribute *attr,
+						 const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_last_member_count);
+}
+static DEVICE_ATTR(multicast_last_member_count, S_IRUGO | S_IWUSR,
+		   show_multicast_last_member_count,
+		   store_multicast_last_member_count);
+
+/* multicast_startup_query_count: plain unsigned value, stored as written */
+static ssize_t show_multicast_startup_query_count(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n",
+		       to_bridge(d)->multicast_startup_query_count);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_startup_query_count(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_startup_query_count = val;
+	return 0;
+}
+
+static ssize_t store_multicast_startup_query_count(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_startup_query_count);
+}
+static DEVICE_ATTR(multicast_startup_query_count, S_IRUGO | S_IWUSR,
+		   show_multicast_startup_query_count,
+		   store_multicast_startup_query_count);
+
+/* multicast_last_member_interval: clock_t to userspace, jiffies internally */
+static ssize_t show_multicast_last_member_interval(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+	clock_t t = jiffies_to_clock_t(br->multicast_last_member_interval);
+
+	return sprintf(buf, "%lu\n", t);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_last_member_interval(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_last_member_interval = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_multicast_last_member_interval(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_last_member_interval);
+}
+static DEVICE_ATTR(multicast_last_member_interval, S_IRUGO | S_IWUSR,
+		   show_multicast_last_member_interval,
+		   store_multicast_last_member_interval);
+
+/* multicast_membership_interval: clock_t to userspace, jiffies internally */
+static ssize_t show_multicast_membership_interval(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+	clock_t t = jiffies_to_clock_t(br->multicast_membership_interval);
+
+	return sprintf(buf, "%lu\n", t);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_membership_interval(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_membership_interval = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_multicast_membership_interval(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_membership_interval);
+}
+static DEVICE_ATTR(multicast_membership_interval, S_IRUGO | S_IWUSR,
+		   show_multicast_membership_interval,
+		   store_multicast_membership_interval);
+
+/* multicast_querier_interval: clock_t to userspace, jiffies internally */
+static ssize_t show_multicast_querier_interval(struct device *d,
+					       struct device_attribute *attr,
+					       char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+	clock_t t = jiffies_to_clock_t(br->multicast_querier_interval);
+
+	return sprintf(buf, "%lu\n", t);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_querier_interval(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_querier_interval = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_multicast_querier_interval(struct device *d,
+						struct device_attribute *attr,
+						const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_querier_interval);
+}
+static DEVICE_ATTR(multicast_querier_interval, S_IRUGO | S_IWUSR,
+		   show_multicast_querier_interval,
+		   store_multicast_querier_interval);
+
+/* multicast_query_interval: clock_t to userspace, jiffies internally */
+static ssize_t show_multicast_query_interval(struct device *d,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+	clock_t t = jiffies_to_clock_t(br->multicast_query_interval);
+
+	return sprintf(buf, "%lu\n", t);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_query_interval(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_query_interval = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_multicast_query_interval(struct device *d,
+					      struct device_attribute *attr,
+					      const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_query_interval);
+}
+static DEVICE_ATTR(multicast_query_interval, S_IRUGO | S_IWUSR,
+		   show_multicast_query_interval,
+		   store_multicast_query_interval);
+
+/* multicast_query_response_interval: clock_t to userspace, jiffies inside */
+static ssize_t show_multicast_query_response_interval(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+	clock_t t = jiffies_to_clock_t(br->multicast_query_response_interval);
+
+	return sprintf(buf, "%lu\n", t);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_query_response_interval(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_query_response_interval = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_multicast_query_response_interval(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_query_response_interval);
+}
+static DEVICE_ATTR(multicast_query_response_interval, S_IRUGO | S_IWUSR,
+		   show_multicast_query_response_interval,
+		   store_multicast_query_response_interval);
+
+/* multicast_startup_query_interval: clock_t to userspace, jiffies inside */
+static ssize_t show_multicast_startup_query_interval(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	const struct net_bridge *br = to_bridge(d);
+	clock_t t = jiffies_to_clock_t(br->multicast_startup_query_interval);
+
+	return sprintf(buf, "%lu\n", t);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_startup_query_interval(struct net_bridge *br, unsigned long val)
+{
+	br->multicast_startup_query_interval = clock_t_to_jiffies(val);
+	return 0;
+}
+
+static ssize_t store_multicast_startup_query_interval(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_startup_query_interval);
+}
+static DEVICE_ATTR(multicast_startup_query_interval, S_IRUGO | S_IWUSR,
+		   show_multicast_startup_query_interval,
+		   store_multicast_startup_query_interval);
+#endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+/* nf_call_iptables: boolean flag; any non-zero write stores true */
+static ssize_t show_nf_call_iptables(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->nf_call_iptables);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
+{
+	br->nf_call_iptables = (val != 0);
+	return 0;
+}
+
+static ssize_t store_nf_call_iptables(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_nf_call_iptables);
+}
+static DEVICE_ATTR(nf_call_iptables, S_IRUGO | S_IWUSR,
+		   show_nf_call_iptables, store_nf_call_iptables);
+
+/* nf_call_ip6tables: boolean flag; any non-zero write stores true */
+static ssize_t show_nf_call_ip6tables(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", to_bridge(d)->nf_call_ip6tables);
+}
+
+/* store_bridge_parm() callback; always succeeds */
+static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
+{
+	br->nf_call_ip6tables = (val != 0);
+	return 0;
+}
+
+static ssize_t store_nf_call_ip6tables(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_nf_call_ip6tables);
+}
+static DEVICE_ATTR(nf_call_ip6tables, S_IRUGO | S_IWUSR,
+		   show_nf_call_ip6tables, store_nf_call_ip6tables);
+
+static ssize_t show_nf_call_arptables(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%u\n", br->nf_call_arptables);
+}
+
+static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
+{
+	br->nf_call_arptables = (val != 0);	/* any non-zero input means on */
+	return 0;				/* nothing here can fail */
+}
+
+/* sysfs store hook: delegate to store_bridge_parm with set_nf_call_arptables. */
+static ssize_t store_nf_call_arptables(struct device *d,
+	struct device_attribute *attr, const char *buf, size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_nf_call_arptables);
+}
+static DEVICE_ATTR(nf_call_arptables, S_IRUGO | S_IWUSR,
+		   show_nf_call_arptables, store_nf_call_arptables);
+#endif
+
+static struct attribute *bridge_attrs[] = {	/* members of bridge_group */
+	&dev_attr_forward_delay.attr,
+	&dev_attr_hello_time.attr,
+	&dev_attr_max_age.attr,
+	&dev_attr_ageing_time.attr,
+	&dev_attr_stp_state.attr,
+	&dev_attr_priority.attr,
+	&dev_attr_bridge_id.attr,
+	&dev_attr_root_id.attr,
+	&dev_attr_root_path_cost.attr,
+	&dev_attr_root_port.attr,
+	&dev_attr_topology_change.attr,
+	&dev_attr_topology_change_detected.attr,
+	&dev_attr_hello_timer.attr,
+	&dev_attr_tcn_timer.attr,
+	&dev_attr_topology_change_timer.attr,
+	&dev_attr_gc_timer.attr,
+	&dev_attr_group_addr.attr,
+	&dev_attr_flush.attr,
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING	/* multicast entries need snooping built in */
+	&dev_attr_multicast_router.attr,
+	&dev_attr_multicast_snooping.attr,
+	&dev_attr_hash_elasticity.attr,
+	&dev_attr_hash_max.attr,
+	&dev_attr_multicast_last_member_count.attr,
+	&dev_attr_multicast_startup_query_count.attr,
+	&dev_attr_multicast_last_member_interval.attr,
+	&dev_attr_multicast_membership_interval.attr,
+	&dev_attr_multicast_querier_interval.attr,
+	&dev_attr_multicast_query_interval.attr,
+	&dev_attr_multicast_query_response_interval.attr,
+	&dev_attr_multicast_startup_query_interval.attr,
+#endif
+#ifdef CONFIG_BRIDGE_NETFILTER	/* nf_call_* entries need bridge netfilter built in */
+	&dev_attr_nf_call_iptables.attr,
+	&dev_attr_nf_call_ip6tables.attr,
+	&dev_attr_nf_call_arptables.attr,
+#endif
+	NULL	/* end-of-list marker */
+};
+
+static struct attribute_group bridge_group = {
+	.name = SYSFS_BRIDGE_ATTR,	/* name of the group handed to sysfs_create_group() */
+	.attrs = bridge_attrs,		/* NULL-terminated attribute list */
+};
+
+/*
+ * Binary sysfs read of the forwarding table; records are struct __fdb_entry.
+ * 'off' must be record-aligned (else -EINVAL); 'count' is rounded down to
+ * whole records.  Returns br_fdb_fillbuf()'s result scaled to bytes when it
+ * is positive, otherwise that result unchanged.
+ */
+static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buf, loff_t off, size_t count)
+{
+	struct device *cdev = to_dev(kobj);
+	struct net_bridge *br = to_bridge(cdev);
+	int nrec;
+
+	/* refuse offsets that fall inside a record */
+	if (off % sizeof(struct __fdb_entry) != 0)
+		return -EINVAL;
+
+	nrec = br_fdb_fillbuf(br, buf,
+			      count / sizeof(struct __fdb_entry),
+			      off / sizeof(struct __fdb_entry));
+	if (nrec <= 0)
+		return nrec;
+
+	nrec *= sizeof(struct __fdb_entry);
+	return nrec;
+}
+
+static struct bin_attribute bridge_forward = {
+	.attr = { .name = SYSFS_BRIDGE_FDB,
+		  .mode = S_IRUGO, },	/* read-only mode bits; no .write hook is set */
+	.read = brforward_read,		/* serves whole __fdb_entry records */
+};
+
+/*
+ * Add entries in sysfs onto the existing network class device for the bridge.
+ *   Adds an attribute group "bridge" containing tuning parameters.
+ *   Binary attribute containing the forward table
+ *   Sub directory to hold links to interfaces.
+ * Returns 0 on success, or a negative errno after undoing the earlier steps.
+ *
+ * Note: the ifobj exists only to be a subdirectory
+ *   to hold links.  The ifobj exists in same data structure
+ *   as its parent the bridge so reference counting works.
+ */
+int br_sysfs_addbr(struct net_device *dev)
+{
+	struct kobject *brobj = &dev->dev.kobj;
+	struct net_bridge *br = netdev_priv(dev);
+	int err;
+
+	err = sysfs_create_group(brobj, &bridge_group);
+	if (err) {
+		pr_info("%s: can't create group %s/%s\n",
+			__func__, dev->name, bridge_group.name);
+		goto out1;
+	}
+
+	err = sysfs_create_bin_file(brobj, &bridge_forward);
+	if (err) {
+		pr_info("%s: can't create attribute file %s/%s\n",
+			__func__, dev->name, bridge_forward.attr.name);
+		goto out2;
+	}
+
+	br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj);
+	if (!br->ifobj) {
+		pr_info("%s: can't add kobject (directory) %s/%s\n",
+			__func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR);
+		err = -ENOMEM;	/* err is still 0 here; don't report success */
+		goto out3;
+	}
+	return 0;
+ out3:
+	sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward);
+ out2:
+	sysfs_remove_group(&dev->dev.kobj, &bridge_group);
+ out1:
+	return err;
+}
+
+void br_sysfs_delbr(struct net_device *dev)
+{
+	struct net_bridge *bridge = netdev_priv(dev);
+	struct kobject *devobj = &dev->dev.kobj;
+
+	kobject_put(bridge->ifobj);			/* brif directory */
+	sysfs_remove_bin_file(devobj, &bridge_forward);	/* fdb binary file */
+	sysfs_remove_group(devobj, &bridge_group);	/* tunables group */
+}
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
new file mode 100644
index 00000000..6229b627
--- /dev/null
+++ b/net/bridge/br_sysfs_if.c
@@ -0,0 +1,283 @@
+/*
+ *	Sysfs attributes of bridge ports
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Stephen Hemminger		<shemminger@osdl.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+
+#include "br_private.h"
+
+struct brport_attribute {
+	struct attribute	attr;	/* embedded; to_brport_attr() maps it back to this struct */
+	ssize_t (*show)(struct net_bridge_port *, char *);	/* NULL for write-only entries */
+	int (*store)(struct net_bridge_port *, unsigned long);	/* NULL for read-only entries */
+};
+
+#define BRPORT_ATTR(_name,_mode,_show,_store)		        \
+struct brport_attribute brport_attr_##_name = { 	        \
+	.attr = {.name = __stringify(_name), 			\
+		 .mode = _mode },				\
+	.show	= _show,	/* may be NULL */		\
+	.store	= _store,	/* may be NULL */		\
+};
+
+static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->path_cost);	/* decimal, newline-terminated */
+}
+
+static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR,	/* writes go to br_stp_set_path_cost */
+		   show_path_cost, br_stp_set_path_cost);
+
+static ssize_t show_priority(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->priority);	/* decimal, newline-terminated */
+}
+
+static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR,	/* writes go to br_stp_set_port_priority */
+			 show_priority, br_stp_set_port_priority);
+
+static ssize_t show_designated_root(struct net_bridge_port *p, char *buf)
+{
+	return br_show_bridge_id(buf, &p->designated_root);	/* helper does the formatting */
+}
+static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL);	/* no store hook */
+
+static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf)
+{
+	return br_show_bridge_id(buf, &p->designated_bridge);	/* helper does the formatting */
+}
+static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL);	/* no store hook */
+
+static ssize_t show_designated_port(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->designated_port);	/* decimal, newline-terminated */
+}
+static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL);	/* no store hook */
+
+static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->designated_cost);	/* decimal, newline-terminated */
+}
+static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL);	/* no store hook */
+
+static ssize_t show_port_id(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "0x%x\n", p->port_id);	/* hex with 0x prefix */
+}
+static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL);	/* no store hook */
+
+static ssize_t show_port_no(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "0x%x\n", p->port_no);	/* hex with 0x prefix */
+}
+
+static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL);	/* no store hook */
+
+static ssize_t show_change_ack(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->topology_change_ack);	/* file is named change_ack */
+}
+static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL);	/* no store hook */
+
+static ssize_t show_config_pending(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->config_pending);	/* decimal, newline-terminated */
+}
+static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL);	/* no store hook */
+
+static ssize_t show_port_state(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->state);	/* raw numeric state, not a name */
+}
+static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL);	/* file is named "state" */
+
+/* sysfs show hook: prints br_timer_value() of message_age_timer as "%ld\n". */
+static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer));
+}
+static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL);
+
+/* sysfs show hook: prints br_timer_value() of forward_delay_timer as "%ld\n". */
+static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
+}
+static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL);
+
+/* sysfs show hook: prints br_timer_value() of hold_timer as "%ld\n". */
+static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer));
+}
+static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
+
+static int store_flush(struct net_bridge_port *p, unsigned long v)
+{
+	br_fdb_delete_by_port(p->br, p, 0); /* 0: don't delete local entry; v is not consulted */
+	return 0;
+}
+static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);	/* write-only: no show hook */
+
+/* sysfs show hook: prints 1 when BR_HAIRPIN_MODE is set in p->flags, else 0. */
+static ssize_t show_hairpin_mode(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", !!(p->flags & BR_HAIRPIN_MODE));
+}
+static int store_hairpin_mode(struct net_bridge_port *p, unsigned long v)
+{
+	if (!v)
+		p->flags &= ~BR_HAIRPIN_MODE;	/* zero clears the flag */
+	else
+		p->flags |= BR_HAIRPIN_MODE;	/* anything else sets it */
+	return 0;
+}
+static BRPORT_ATTR(hairpin_mode, S_IRUGO | S_IWUSR,
+		   show_hairpin_mode, store_hairpin_mode);
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
+{
+	return sprintf(buf, "%d\n", p->multicast_router);	/* raw numeric setting */
+}
+
+/* sysfs store hook: result comes straight from br_multicast_set_port_router. */
+static int store_multicast_router(struct net_bridge_port *p, unsigned long v)
+{
+	return br_multicast_set_port_router(p, v);
+}
+static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
+		   store_multicast_router);
+#endif
+
+static struct brport_attribute *brport_attrs[] = {	/* walked by br_sysfs_addif() */
+	&brport_attr_path_cost,
+	&brport_attr_priority,
+	&brport_attr_port_id,
+	&brport_attr_port_no,
+	&brport_attr_designated_root,
+	&brport_attr_designated_bridge,
+	&brport_attr_designated_port,
+	&brport_attr_designated_cost,
+	&brport_attr_state,
+	&brport_attr_change_ack,
+	&brport_attr_config_pending,
+	&brport_attr_message_age_timer,
+	&brport_attr_forward_delay_timer,
+	&brport_attr_hold_timer,
+	&brport_attr_flush,
+	&brport_attr_hairpin_mode,
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING	/* attribute is only defined with snooping */
+	&brport_attr_multicast_router,
+#endif
+	NULL	/* terminator the creation loop stops on */
+};
+
+#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)	/* attribute -> its wrapper */
+#define to_brport(obj)	container_of(obj, struct net_bridge_port, kobj)	/* kobject -> owning port */
+
+/* sysfs_ops .show: map kobj/attr back to port and wrapper, call its ->show. */
+static ssize_t brport_show(struct kobject *kobj,
+			   struct attribute *attr, char *buf)
+{
+	struct net_bridge_port *port = to_brport(kobj);
+
+	return to_brport_attr(attr)->show(port, buf);
+}
+
+/* sysfs_ops .store: CAP_NET_ADMIN only; runs ->store under RTNL + br->lock. */
+static ssize_t brport_store(struct kobject *kobj, struct attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct brport_attribute *battr = to_brport_attr(attr);
+	struct net_bridge_port *port = to_brport(kobj);
+	ssize_t rc = -EINVAL;
+	unsigned long val;
+	char *end;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	val = simple_strtoul(buf, &end, 0);
+	if (end == buf)
+		return rc;		/* nothing was parsed */
+	if (!rtnl_trylock())
+		return restart_syscall();
+	if (port->dev && port->br && battr->store) {
+		spin_lock_bh(&port->br->lock);
+		rc = battr->store(port, val);
+		spin_unlock_bh(&port->br->lock);
+	}
+	rtnl_unlock();
+	if (rc == 0)
+		rc = count;	/* success: whole write consumed */
+	return rc;
+}
+
+const struct sysfs_ops brport_sysfs_ops = {	/* not static: visible outside this file */
+	.show = brport_show,	/* dispatches to brport_attribute->show */
+	.store = brport_store,	/* dispatches to brport_attribute->store */
+};
+
+/*
+ * Add sysfs entries to ethernet device added to a bridge:
+ * a link named SYSFS_BRIDGE_PORT_LINK back to the bridge device, one file
+ * per brport_attrs entry, and a link in the bridge's brif subdirectory.
+ * NOTE(review): nothing created earlier is removed when a later step fails.
+ */
+int br_sysfs_addif(struct net_bridge_port *p)
+{
+	struct brport_attribute **cur = brport_attrs;
+	struct net_bridge *br = p->br;
+	int err;
+
+	err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj,
+				SYSFS_BRIDGE_PORT_LINK);
+
+	while (!err && *cur) {
+		err = sysfs_create_file(&p->kobj, &(*cur)->attr);
+		cur++;
+	}
+	if (err)
+		return err;
+
+	strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+	return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name);
+}
+
+/* Rename bridge's brif symlink; returns 0 or sysfs_rename_link()'s error. */
+int br_sysfs_renameif(struct net_bridge_port *p)
+{
+	struct net_bridge *br = p->br;
+	int err;
+
+	/* If a rename fails, the rollback will cause another
+	 * rename call with the existing name.
+	 */
+	if (!strncmp(p->sysfs_name, p->dev->name, IFNAMSIZ))
+		return 0;
+
+	err = sysfs_rename_link(br->ifobj, &p->kobj,
+				p->sysfs_name, p->dev->name);
+	if (err)
+		netdev_notice(br->dev, "unable to rename link %s to %s\n",
+			      p->sysfs_name, p->dev->name);
+	else
+		strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+
+	return err;
+}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
new file mode 100644
index 00000000..ba6f73eb
--- /dev/null
+++ b/net/bridge/netfilter/Kconfig
@@ -0,0 +1,221 @@
+#
+# Bridge netfilter configuration
+#
+
+menuconfig BRIDGE_NF_EBTABLES
+	tristate "Ethernet Bridge tables (ebtables) support"
+	depends on BRIDGE && BRIDGE_NETFILTER
+	select NETFILTER_XTABLES
+	help
+	  ebtables is a general, extensible frame/packet identification
+	  framework. Say 'Y' or 'M' here if you want to do Ethernet
+	  filtering/NAT/brouting on the Ethernet bridge.
+
+if BRIDGE_NF_EBTABLES
+
+#
+# tables
+#
+config BRIDGE_EBT_BROUTE
+	tristate "ebt: broute table support"
+	help
+	  The ebtables broute table is used to define rules that decide between
+	  bridging and routing frames, giving Linux the functionality of a
+	  brouter. See the man page for ebtables(8) and examples on the ebtables
+	  website.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_T_FILTER
+	tristate "ebt: filter table support"
+	help
+	  The ebtables filter table is used to define frame filtering rules at
+	  local input, forwarding and local output. See the man page for
+	  ebtables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_T_NAT
+	tristate "ebt: nat table support"
+	help
+	  The ebtables nat table is used to define rules that alter the MAC
+	  source address (MAC SNAT) or the MAC destination address (MAC DNAT).
+	  See the man page for ebtables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+#
+# matches
+#
+config BRIDGE_EBT_802_3
+	tristate "ebt: 802.3 filter support"
+	help
+	  This option adds matching support for 802.3 Ethernet frames.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_AMONG
+	tristate "ebt: among filter support"
+	help
+	  This option adds the among match, which allows matching the MAC source
+	  and/or destination address on a list of addresses. Optionally,
+	  MAC/IP address pairs can be matched, e.g. for anti-spoofing rules.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_ARP
+	tristate "ebt: ARP filter support"
+	help
+	  This option adds the ARP match, which allows ARP and RARP header field
+	  filtering.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_IP
+	tristate "ebt: IP filter support"
+	help
+	  This option adds the IP match, which allows basic IP header field
+	  filtering.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_IP6
+	tristate "ebt: IP6 filter support"
+	depends on BRIDGE_NF_EBTABLES && IPV6
+	help
+	  This option adds the IP6 match, which allows basic IPV6 header field
+	  filtering.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_LIMIT
+	tristate "ebt: limit match support"
+	help
+	  This option adds the limit match, which allows you to control
+	  the rate at which a rule can be matched. This match is the
+	  equivalent of the iptables limit match.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config BRIDGE_EBT_MARK
+	tristate "ebt: mark filter support"
+	help
+	  This option adds the mark match, which allows matching frames based on
+	  the 'nfmark' value in the frame. This can be set by the mark target.
+	  This value is the same as the one used in the iptables mark match and
+	  target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_PKTTYPE
+	tristate "ebt: packet type filter support"
+	help
+	  This option adds the packet type match, which allows matching on the
+	  type of packet based on its Ethernet "class" (as determined by
+	  the generic networking code): broadcast, multicast,
+	  for this host alone or for another host.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_STP
+	tristate "ebt: STP filter support"
+	help
+	  This option adds the Spanning Tree Protocol match, which
+	  allows STP header field filtering.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_VLAN
+	tristate "ebt: 802.1Q VLAN filter support"
+	help
+	  This option adds the 802.1Q vlan match, which allows the filtering of
+	  802.1Q vlan fields.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+#
+# targets
+#
+config BRIDGE_EBT_ARPREPLY
+	tristate "ebt: arp reply target support"
+	depends on BRIDGE_NF_EBTABLES && INET
+	help
+	  This option adds the arp reply target, which allows
+	  automatically sending arp replies to arp requests.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_DNAT
+	tristate "ebt: dnat target support"
+	help
+	  This option adds the MAC DNAT target, which allows altering the MAC
+	  destination address of frames.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_MARK_T
+	tristate "ebt: mark target support"
+	help
+	  This option adds the mark target, which allows marking frames by
+	  setting the 'nfmark' value in the frame.
+	  This value is the same as the one used in the iptables mark match and
+	  target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_REDIRECT
+	tristate "ebt: redirect target support"
+	help
+	  This option adds the MAC redirect target, which allows altering the MAC
+	  destination address of a frame to that of the device it arrived on.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_SNAT
+	tristate "ebt: snat target support"
+	help
+	  This option adds the MAC SNAT target, which allows altering the MAC
+	  source address of frames.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+#
+# watchers
+#
+config BRIDGE_EBT_LOG
+	tristate "ebt: log support"
+	help
+	  This option adds the log watcher, that you can use in any rule
+	  in any ebtables table. It records info about the frame header
+	  to the syslog.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_ULOG
+	tristate "ebt: ulog support (OBSOLETE)"
+	help
+	  This option enables the old bridge-specific "ebt_ulog" implementation
+	  which has been obsoleted by the new "nfnetlink_log" code (see
+	  CONFIG_NETFILTER_NETLINK_LOG).
+
+	  This option adds the ulog watcher, that you can use in any rule
+	  in any ebtables table. The packet is passed to a userspace
+	  logging daemon using netlink multicast sockets. This differs
+	  from the log watcher in the sense that the complete packet is
+	  sent to userspace instead of a descriptive text and that
+	  netlink multicast sockets are used instead of the syslog.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config BRIDGE_EBT_NFLOG
+	tristate "ebt: nflog support"
+	help
+	  This option enables the nflog watcher, which allows you to LOG
+	  messages through the netfilter logging API, which can use
+	  either the old LOG target, the old ULOG target or nfnetlink_log
+	  as backend.
+
+	  This option adds the nflog watcher, that you can use in any rule
+	  in any ebtables table.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+endif # BRIDGE_NF_EBTABLES
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
new file mode 100644
index 00000000..07186995
--- /dev/null
+++ b/net/bridge/netfilter/Makefile
@@ -0,0 +1,34 @@
+#
+# Makefile for the netfilter modules for Link Layer filtering on a bridge.
+#
+
+obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o
+
+# tables
+obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o
+obj-$(CONFIG_BRIDGE_EBT_T_FILTER) += ebtable_filter.o
+obj-$(CONFIG_BRIDGE_EBT_T_NAT) += ebtable_nat.o
+
+# matches
+obj-$(CONFIG_BRIDGE_EBT_802_3) += ebt_802_3.o
+obj-$(CONFIG_BRIDGE_EBT_AMONG) += ebt_among.o
+obj-$(CONFIG_BRIDGE_EBT_ARP) += ebt_arp.o
+obj-$(CONFIG_BRIDGE_EBT_IP) += ebt_ip.o
+obj-$(CONFIG_BRIDGE_EBT_IP6) += ebt_ip6.o
+obj-$(CONFIG_BRIDGE_EBT_LIMIT) += ebt_limit.o
+obj-$(CONFIG_BRIDGE_EBT_MARK) += ebt_mark_m.o
+obj-$(CONFIG_BRIDGE_EBT_PKTTYPE) += ebt_pkttype.o
+obj-$(CONFIG_BRIDGE_EBT_STP) += ebt_stp.o
+obj-$(CONFIG_BRIDGE_EBT_VLAN) += ebt_vlan.o
+
+# targets
+obj-$(CONFIG_BRIDGE_EBT_ARPREPLY) += ebt_arpreply.o
+obj-$(CONFIG_BRIDGE_EBT_MARK_T) += ebt_mark.o
+obj-$(CONFIG_BRIDGE_EBT_DNAT) += ebt_dnat.o
+obj-$(CONFIG_BRIDGE_EBT_REDIRECT) += ebt_redirect.o
+obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o
+
+# watchers
+obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o
+obj-$(CONFIG_BRIDGE_EBT_ULOG) += ebt_ulog.o
+obj-$(CONFIG_BRIDGE_EBT_NFLOG) += ebt_nflog.o
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
new file mode 100644
index 00000000..2a449b7a
--- /dev/null
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -0,0 +1,72 @@
+/*
+ * 802_3
+ *
+ * Author:
+ * Chris Vitale csv@bluetail.com
+ *
+ * May 2003
+ *
+ */
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_802_3.h>
+
+/*
+ * 802_3 match: an optional SAP test (ssap and dsap against info->sap) and an
+ * optional type test, each comparison passing through FWINV.
+ */
+static bool
+ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_802_3_info *info = par->matchinfo;
+	const struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb);
+	__be16 type = (hdr->llc.ui.ctrl & IS_UI) ? hdr->llc.ui.type :
+						   hdr->llc.ni.type;
+
+	if ((info->bitmask & EBT_802_3_SAP) &&
+	    ((FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP)) ||
+	     (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP))))
+		return false;
+
+	if (!(info->bitmask & EBT_802_3_TYPE))
+		return true;
+	if (hdr->llc.ui.dsap != CHECK_TYPE || hdr->llc.ui.ssap != CHECK_TYPE)
+		return false;
+	return !(FWINV(info->type != type, EBT_802_3_TYPE));
+}
+
+/* Rule check: -EINVAL if bitmask or invflags has a bit outside EBT_802_3_MASK. */
+static int ebt_802_3_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_802_3_info *info = par->matchinfo;
+
+	if ((info->bitmask | info->invflags) & ~EBT_802_3_MASK)
+		return -EINVAL;
+	return 0;
+}
+
+static struct xt_match ebt_802_3_mt_reg __read_mostly = {
+	.name		= "802_3",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_802_3_mt,		/* evaluates one skb */
+	.checkentry	= ebt_802_3_mt_check,	/* validates bitmask/invflags */
+	.matchsize	= sizeof(struct ebt_802_3_info),
+	.me		= THIS_MODULE,
+};
+
+static int __init ebt_802_3_init(void)
+{
+	return xt_register_match(&ebt_802_3_mt_reg);	/* result is the module init status */
+}
+
+static void __exit ebt_802_3_fini(void)
+{
+	xt_unregister_match(&ebt_802_3_mt_reg);	/* undoes the registration made at init */
+}
+
+module_init(ebt_802_3_init);
+module_exit(ebt_802_3_fini);
+MODULE_DESCRIPTION("Ebtables: DSAP/SSAP field and SNAP type matching");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
new file mode 100644
index 00000000..8b84c581
--- /dev/null
+++ b/net/bridge/netfilter/ebt_among.c
@@ -0,0 +1,229 @@
+/*
+ *  ebt_among
+ *
+ *	Authors:
+ *	Grzegorz Borowiak <grzes@gnu.univ.gda.pl>
+ *
+ *  August, 2003
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/ip.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_among.h>
+
+/*
+ * Test whether (mac, ip) is present in a wormhash.
+ * The last MAC byte selects the slice table[key] .. table[key + 1] of
+ * wh->pool.  An entry matches when both of its cmp[] words equal the MAC
+ * staged at byte offset 2 of a zeroed two-word buffer and its ip is 0 or
+ * equal to the ip passed in.  The layout trick is explained in
+ * 	include/linux/netfilter_bridge/ebt_among.h
+ */
+static bool ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh,
+				      const char *mac, __be32 ip)
+{
+	const int key = ((const unsigned char *)mac)[5];
+	const struct ebt_mac_wormhash_tuple *entry;
+	uint32_t cmp[2] = { 0, 0 };
+	int idx, end;
+
+	memcpy(((char *) cmp) + 2, mac, 6);
+	end = wh->table[key + 1];
+
+	/*
+	 * With ip == 0 the "entry->ip == ip" test degenerates to
+	 * "entry->ip == 0", so one loop serves both kinds of lookup.
+	 */
+	for (idx = wh->table[key]; idx < end; idx++) {
+		entry = &wh->pool[idx];
+		if (cmp[0] != entry->cmp[0] || cmp[1] != entry->cmp[1])
+			continue;
+		if (entry->ip == 0 || entry->ip == ip)
+			return true;
+	}
+
+	return false;
+}
+
+static int
+ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash *wh)
+{
+	int slot;
+
+	for (slot = 0; slot < 256; slot++) {
+		if (wh->table[slot] > wh->table[slot + 1])
+			return -(0x100 + slot);	/* slot exceeds its successor */
+		if (wh->table[slot] < 0)
+			return -(0x200 + slot);	/* negative index */
+		if (wh->table[slot] > wh->poolsize)
+			return -(0x300 + slot);	/* index beyond poolsize */
+	}
+	if (wh->table[256] > wh->poolsize)
+		return -0xc00;			/* last bound beyond poolsize */
+	return 0;
+}
+
+/* Store an IP/ARP frame's destination IPv4 in *addr; -1 on a bad header. */
+static int get_ip_dst(const struct sk_buff *skb, __be32 *addr)
+{
+	if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) {
+		struct iphdr iph_buf;
+		const struct iphdr *iph;
+
+		iph = skb_header_pointer(skb, 0, sizeof(iph_buf), &iph_buf);
+		if (!iph)
+			return -1;
+		*addr = iph->daddr;
+	} else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
+		struct arphdr arp_buf;
+		const struct arphdr *arp;
+		__be32 ip_buf;
+		const __be32 *ip;
+
+		arp = skb_header_pointer(skb, 0, sizeof(arp_buf), &arp_buf);
+		if (!arp || arp->ar_pln != sizeof(__be32) ||
+		    arp->ar_hln != ETH_ALEN)
+			return -1;
+		ip = skb_header_pointer(skb, sizeof(struct arphdr) +
+					2 * ETH_ALEN + sizeof(__be32),
+					sizeof(ip_buf), &ip_buf);
+		if (!ip)
+			return -1;
+		*addr = *ip;
+	}
+	return 0;
+}
+
+/* Store an IP/ARP frame's source IPv4 in *addr; -1 on a bad header. */
+static int get_ip_src(const struct sk_buff *skb, __be32 *addr)
+{
+	if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) {
+		struct iphdr iph_buf;
+		const struct iphdr *iph;
+
+		iph = skb_header_pointer(skb, 0, sizeof(iph_buf), &iph_buf);
+		if (!iph)
+			return -1;
+		*addr = iph->saddr;
+	} else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) {
+		struct arphdr arp_buf;
+		const struct arphdr *arp;
+		__be32 ip_buf;
+		const __be32 *ip;
+
+		arp = skb_header_pointer(skb, 0, sizeof(arp_buf), &arp_buf);
+		if (!arp || arp->ar_pln != sizeof(__be32) ||
+		    arp->ar_hln != ETH_ALEN)
+			return -1;
+		ip = skb_header_pointer(skb, sizeof(struct arphdr) +
+					ETH_ALEN, sizeof(ip_buf), &ip_buf);
+		if (!ip)
+			return -1;
+		*addr = *ip;
+	}
+	return 0;
+}
+
+/*
+ * among match: for each wormhash present in the rule, look up the frame's
+ * source (resp. destination) MAC together with the IPv4 address returned by
+ * get_ip_src()/get_ip_dst().  Without the *_NEG bit the pair must be in the
+ * set; with it, the pair must be absent.  A failed address fetch rejects.
+ */
+static bool
+ebt_among_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_among_info *info = par->matchinfo;
+	const struct ebt_mac_wormhash *wh_dst, *wh_src;
+	__be32 dip = 0, sip = 0;
+	bool found, negate;
+
+	wh_dst = ebt_among_wh_dst(info);
+	wh_src = ebt_among_wh_src(info);
+
+	if (wh_src) {
+		if (get_ip_src(skb, &sip))
+			return false;
+
+		negate = !!(info->bitmask & EBT_AMONG_SRC_NEG);
+		found = ebt_mac_wormhash_contains(wh_src,
+						  eth_hdr(skb)->h_source, sip);
+		/* reject when membership and the NEG request coincide */
+		if (found == negate)
+			return false;
+	}
+
+	if (wh_dst) {
+		if (get_ip_dst(skb, &dip))
+			return false;
+
+		negate = !!(info->bitmask & EBT_AMONG_DST_NEG);
+		found = ebt_mac_wormhash_contains(wh_dst,
+						  eth_hdr(skb)->h_dest, dip);
+		/* reject when membership and the NEG request coincide */
+		if (found == negate)
+			return false;
+	}
+
+	return true;
+}
+
+static int ebt_among_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_among_info *info = par->matchinfo;
+	const struct ebt_entry_match *em =
+		container_of(par->matchinfo, const struct ebt_entry_match, data);
+	int expected_length = sizeof(struct ebt_among_info);
+	const struct ebt_mac_wormhash *wh_dst, *wh_src;
+	int err;
+
+	wh_dst = ebt_among_wh_dst(info);	/* NOTE(review): both wormhashes are located */
+	wh_src = ebt_among_wh_src(info);	/* and sized before match_size is checked; */
+	expected_length += ebt_mac_wormhash_size(wh_dst);	/* presumably from data in info */
+	expected_length += ebt_mac_wormhash_size(wh_src);	/* -- confirm it is bounded */
+
+	if (em->match_size != EBT_ALIGN(expected_length)) {	/* size must match exactly */
+		pr_info("wrong size: %d against expected %d, rounded to %Zd\n",
+			em->match_size, expected_length,
+			EBT_ALIGN(expected_length));
+		return -EINVAL;
+	}
+	if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) {
+		pr_info("dst integrity fail: %x\n", -err);	/* err is negative; log it positive */
+		return -EINVAL;
+	}
+	if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) {
+		pr_info("src integrity fail: %x\n", -err);	/* err is negative; log it positive */
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_match ebt_among_mt_reg __read_mostly = {
+	.name		= "among",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_among_mt,		/* evaluates one skb */
+	.checkentry	= ebt_among_mt_check,	/* does its own size validation */
+	.matchsize	= -1, /* special case */
+	.me		= THIS_MODULE,
+};
+
+static int __init ebt_among_init(void)
+{
+	return xt_register_match(&ebt_among_mt_reg);	/* result is the module init status */
+}
+
+static void __exit ebt_among_fini(void)
+{
+	xt_unregister_match(&ebt_among_mt_reg);	/* undoes the registration made at init */
+}
+
+module_init(ebt_among_init);
+module_exit(ebt_among_fini);
+MODULE_DESCRIPTION("Ebtables: Combined MAC/IP address list matching");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
new file mode 100644
index 00000000..cd457b89
--- /dev/null
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -0,0 +1,140 @@
+/*
+ *  ebt_arp
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *	Tim Gardner <timg@tpi.com>
+ *
+ *  April, 2002
+ *
+ */
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arp.h>
+
+static bool
+ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_arp_info *info = par->matchinfo;	/* FWINV() relies on this name */
+	const struct arphdr *ah;
+	struct arphdr _arph;
+
+	ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+	if (ah == NULL)
+		return false;	/* fixed ARP header not readable */
+	if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode !=
+	   ah->ar_op, EBT_ARP_OPCODE))
+		return false;
+	if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype !=
+	   ah->ar_hrd, EBT_ARP_HTYPE))
+		return false;
+	if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype !=
+	   ah->ar_pro, EBT_ARP_PTYPE))
+		return false;
+
+	if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) {
+		const __be32 *sap, *dap;
+		__be32 saddr, daddr;
+
+		if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP))
+			return false;	/* address tests need 4-byte ETH_P_IP addresses */
+		sap = skb_header_pointer(skb, sizeof(struct arphdr) +
+					ah->ar_hln, sizeof(saddr),
+					&saddr);	/* first address: after one hw address */
+		if (sap == NULL)
+			return false;
+		dap = skb_header_pointer(skb, sizeof(struct arphdr) +
+					2*ah->ar_hln+sizeof(saddr),
+					sizeof(daddr), &daddr);	/* second: after both hw addrs + first */
+		if (dap == NULL)
+			return false;
+		if (info->bitmask & EBT_ARP_SRC_IP &&
+		    FWINV(info->saddr != (*sap & info->smsk), EBT_ARP_SRC_IP))
+			return false;
+		if (info->bitmask & EBT_ARP_DST_IP &&
+		    FWINV(info->daddr != (*dap & info->dmsk), EBT_ARP_DST_IP))
+			return false;
+		if (info->bitmask & EBT_ARP_GRAT &&
+		    FWINV(*dap != *sap, EBT_ARP_GRAT))	/* GRAT: both addresses identical */
+			return false;
+	}
+
+	if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) {
+		const unsigned char *mp;
+		unsigned char _mac[ETH_ALEN];
+		uint8_t verdict, i;
+
+		if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER))
+			return false;	/* MAC tests need Ethernet-sized hw addresses */
+		if (info->bitmask & EBT_ARP_SRC_MAC) {
+			mp = skb_header_pointer(skb, sizeof(struct arphdr),
+						sizeof(_mac), &_mac);
+			if (mp == NULL)
+				return false;
+			verdict = 0;
+			for (i = 0; i < 6; i++)	/* masked byte-wise compare; non-zero = mismatch */
+				verdict |= (mp[i] ^ info->smaddr[i]) &
+				       info->smmsk[i];
+			if (FWINV(verdict != 0, EBT_ARP_SRC_MAC))
+				return false;
+		}
+
+		if (info->bitmask & EBT_ARP_DST_MAC) {
+			mp = skb_header_pointer(skb, sizeof(struct arphdr) +
+						ah->ar_hln + ah->ar_pln,
+						sizeof(_mac), &_mac);
+			if (mp == NULL)
+				return false;
+			verdict = 0;
+			for (i = 0; i < 6; i++)	/* masked byte-wise compare; non-zero = mismatch */
+				verdict |= (mp[i] ^ info->dmaddr[i]) &
+					info->dmmsk[i];
+			if (FWINV(verdict != 0, EBT_ARP_DST_MAC))
+				return false;
+		}
+	}
+
+	return true;	/* every requested test passed */
+}
+
+/* Rule check: ethproto ARP/RARP, EBT_IPROTO clear, no bits outside the mask. */
+static int ebt_arp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_arp_info *info = par->matchinfo;
+	const struct ebt_entry *e = par->entryinfo;
+
+	if (e->invflags & EBT_IPROTO)
+		return -EINVAL;
+	if (e->ethproto != htons(ETH_P_ARP) && e->ethproto != htons(ETH_P_RARP))
+		return -EINVAL;
+
+	return ((info->bitmask | info->invflags) & ~EBT_ARP_MASK) ? -EINVAL : 0;
+}
+
+static struct xt_match ebt_arp_mt_reg __read_mostly = {
+	.name		= "arp",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_arp_mt,		/* evaluates one skb */
+	.checkentry	= ebt_arp_mt_check,	/* validates ethproto and flag bits */
+	.matchsize	= sizeof(struct ebt_arp_info),
+	.me		= THIS_MODULE,
+};
+
+static int __init ebt_arp_init(void)
+{
+	return xt_register_match(&ebt_arp_mt_reg);	/* result is the module init status */
+}
+
+static void __exit ebt_arp_fini(void)
+{
+	xt_unregister_match(&ebt_arp_mt_reg);	/* undoes the registration made at init */
+}
+
+module_init(ebt_arp_init);
+module_exit(ebt_arp_fini);
+MODULE_DESCRIPTION("Ebtables: ARP protocol packet match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
new file mode 100644
index 00000000..070cf134
--- /dev/null
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -0,0 +1,98 @@
+/*
+ *  ebt_arpreply
+ *
+ *	Authors:
+ *	Grzegorz Borowiak <grzes@gnu.univ.gda.pl>
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  August, 2003
+ *
+ */
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_arpreply.h>
+
+/*
+ * "arpreply" target: answer an IPv4-over-Ethernet ARP request with the MAC
+ * address configured in the rule.
+ *
+ * Packets that are not such a request are passed on with EBT_CONTINUE.  If
+ * the ARP header or any needed payload field cannot be read from the skb,
+ * the packet is dropped.  Otherwise an ARP reply is sent out of the input
+ * device and the rule's configured verdict is returned.
+ */
+static unsigned int
+ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_arpreply_info *info = par->targinfo;
+	/*
+	 * Payload layout behind the fixed ARP header:
+	 * sender MAC, sender IP, target MAC, target IP.
+	 */
+	const unsigned int sha_off = sizeof(struct arphdr);
+	const unsigned int sip_off = sizeof(struct arphdr) + ETH_ALEN;
+	const unsigned int tip_off = sizeof(struct arphdr) + 2 * ETH_ALEN +
+				     sizeof(__be32);
+	unsigned char sha_buf[ETH_ALEN];
+	const unsigned char *sha;
+	struct arphdr hdr_buf;
+	const struct arphdr *hdr;
+	__be32 sip_buf, tip_buf;
+	const __be32 *sip, *tip;
+
+	hdr = skb_header_pointer(skb, 0, sizeof(hdr_buf), &hdr_buf);
+	if (!hdr)
+		return EBT_DROP;
+
+	/* Anything but an Ethernet/IPv4 ARP request is left untouched. */
+	if (hdr->ar_op != htons(ARPOP_REQUEST))
+		return EBT_CONTINUE;
+	if (hdr->ar_hln != ETH_ALEN)
+		return EBT_CONTINUE;
+	if (hdr->ar_pro != htons(ETH_P_IP))
+		return EBT_CONTINUE;
+	if (hdr->ar_pln != 4)
+		return EBT_CONTINUE;
+
+	sha = skb_header_pointer(skb, sha_off, ETH_ALEN, sha_buf);
+	if (!sha)
+		return EBT_DROP;
+
+	sip = skb_header_pointer(skb, sip_off, sizeof(sip_buf), &sip_buf);
+	if (!sip)
+		return EBT_DROP;
+
+	tip = skb_header_pointer(skb, tip_off, sizeof(tip_buf), &tip_buf);
+	if (!tip)
+		return EBT_DROP;
+
+	/*
+	 * Reply to the requester: its IP/MAC become the destination, the
+	 * requested IP and the rule's MAC become the source.
+	 */
+	arp_send(ARPOP_REPLY, ETH_P_ARP, *sip, (struct net_device *)par->in,
+		 *tip, sha, info->mac, sha);
+
+	return info->target;
+}
+
+/*
+ * Validate an "arpreply" target at rule insertion.
+ *
+ * Rejects EBT_RETURN as the verdict when the BASE_CHAIN test holds, and
+ * requires the owning rule to select ETH_P_ARP without inverting the
+ * protocol test.  Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ebt_entry *entry = par->entryinfo;
+	const struct ebt_arpreply_info *reply = par->targinfo;
+
+	if (BASE_CHAIN && reply->target == EBT_RETURN)
+		return -EINVAL;
+	if (entry->ethproto != htons(ETH_P_ARP))
+		return -EINVAL;
+	if (entry->invflags & EBT_IPROTO)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * x_tables descriptor for the bridge-family "arpreply" target.  It is
+ * restricted to the "nat" table; the hook mask carries the NF_BR_PRE_ROUTING
+ * bit plus the extra NF_BR_NUMHOOKS bit.
+ */
+static struct xt_target ebt_arpreply_tg_reg __read_mostly = {
+	.name		= "arpreply",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.table		= "nat",
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING),
+	.target		= ebt_arpreply_tg,
+	.checkentry	= ebt_arpreply_tg_check,
+	.targetsize	= sizeof(struct ebt_arpreply_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the target; returns xt_register_target()'s result. */
+static int __init ebt_arpreply_init(void)
+{
+	return xt_register_target(&ebt_arpreply_tg_reg);
+}
+
+/* Module unload: drop the target registration made in ebt_arpreply_init(). */
+static void __exit ebt_arpreply_fini(void)
+{
+	xt_unregister_target(&ebt_arpreply_tg_reg);
+}
+
+module_init(ebt_arpreply_init);
+module_exit(ebt_arpreply_fini);
+MODULE_DESCRIPTION("Ebtables: ARP reply target");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
new file mode 100644
index 00000000..c59f7bfa
--- /dev/null
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -0,0 +1,74 @@
+/*
+ *  ebt_dnat
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  June, 2002
+ *
+ */
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
+
+/*
+ * "dnat" target: overwrite the destination MAC of the frame with the address
+ * stored in the rule and return the rule's verdict.  The frame is dropped if
+ * the skb cannot be made writable.
+ */
+static unsigned int
+ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_nat_info *nat = par->targinfo;
+
+	if (skb_make_writable(skb, 0)) {
+		memcpy(eth_hdr(skb)->h_dest, nat->mac, ETH_ALEN);
+		return nat->target;
+	}
+
+	return EBT_DROP;
+}
+
+/*
+ * Validate a "dnat" target at rule insertion.
+ *
+ * The rule must live either in the "nat" table on hooks limited to
+ * PRE_ROUTING/LOCAL_OUT, or in the "broute" table on the BROUTING hook only.
+ * EBT_RETURN is refused when the BASE_CHAIN test holds, and the verdict must
+ * pass the INVALID_TARGET test.  Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_dnat_tg_check(const struct xt_tgchk_param *par)
+{
+	/* Must stay named "info": the INVALID_TARGET macro refers to it. */
+	const struct ebt_nat_info *info = par->targinfo;
+	const unsigned int nat_hooks = (1 << NF_BR_PRE_ROUTING) |
+				       (1 << NF_BR_LOCAL_OUT);
+	const unsigned int broute_hooks = 1 << NF_BR_BROUTING;
+	unsigned int hook_mask;
+	bool nat_ok, broute_ok;
+
+	if (BASE_CHAIN && info->target == EBT_RETURN)
+		return -EINVAL;
+
+	/* Ignore the extra NF_BR_NUMHOOKS bit when checking real hooks. */
+	hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+
+	nat_ok = strcmp(par->table, "nat") == 0 &&
+		 !(hook_mask & ~nat_hooks);
+	broute_ok = strcmp(par->table, "broute") == 0 &&
+		    !(hook_mask & ~broute_hooks);
+	if (!nat_ok && !broute_ok)
+		return -EINVAL;
+
+	if (INVALID_TARGET)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * x_tables descriptor for the bridge-family "dnat" target.  No table is
+ * fixed here; ebt_dnat_tg_check() enforces the allowed table/hook pairs.
+ */
+static struct xt_target ebt_dnat_tg_reg __read_mostly = {
+	.name		= "dnat",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+			  (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_BROUTING),
+	.target		= ebt_dnat_tg,
+	.checkentry	= ebt_dnat_tg_check,
+	.targetsize	= sizeof(struct ebt_nat_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the target; returns xt_register_target()'s result. */
+static int __init ebt_dnat_init(void)
+{
+	return xt_register_target(&ebt_dnat_tg_reg);
+}
+
+/* Module unload: drop the target registration made in ebt_dnat_init(). */
+static void __exit ebt_dnat_fini(void)
+{
+	xt_unregister_target(&ebt_dnat_tg_reg);
+}
+
+module_init(ebt_dnat_init);
+module_exit(ebt_dnat_fini);
+MODULE_DESCRIPTION("Ebtables: Destination MAC address translation");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
new file mode 100644
index 00000000..23bca62d
--- /dev/null
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -0,0 +1,130 @@
+/*
+ *  ebt_ip
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  April, 2002
+ *
+ *  Changes:
+ *    added ip-sport and ip-dport
+ *    Innominate Security Technologies AG <mhopf@innominate.com>
+ *    September, 2002
+ */
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip.h>
+
+/*
+ * First four bytes read at the transport-header offset: source and
+ * destination port in network byte order.
+ */
+struct tcpudphdr {
+	__be16 src;
+	__be16 dst;
+};
+
+/*
+ * "ip" match: compare an IPv4 packet against the rule's TOS, source and
+ * destination address, protocol and port ranges.
+ *
+ * Each test is applied only if its bit is set in info->bitmask and may be
+ * inverted through info->invflags (via FWINV).  Protocol and port tests run
+ * only when EBT_IP_PROTO is set.  Packets whose IP header or port words
+ * cannot be read, or that have a non-zero fragment offset while ports are
+ * requested, do not match.
+ */
+static bool
+ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	/* Must stay named "info": the FWINV macro refers to it. */
+	const struct ebt_ip_info *info = par->matchinfo;
+	struct tcpudphdr port_buf;
+	const struct tcpudphdr *ports;
+	struct iphdr hdr_buf;
+	const struct iphdr *iph;
+
+	iph = skb_header_pointer(skb, 0, sizeof(hdr_buf), &hdr_buf);
+	if (!iph)
+		return false;
+
+	if ((info->bitmask & EBT_IP_TOS) &&
+	    FWINV(info->tos != iph->tos, EBT_IP_TOS))
+		return false;
+	if ((info->bitmask & EBT_IP_SOURCE) &&
+	    FWINV((iph->saddr & info->smsk) != info->saddr, EBT_IP_SOURCE))
+		return false;
+	if ((info->bitmask & EBT_IP_DEST) &&
+	    FWINV((iph->daddr & info->dmsk) != info->daddr, EBT_IP_DEST))
+		return false;
+
+	/* Without a protocol test there is nothing further to compare. */
+	if (!(info->bitmask & EBT_IP_PROTO))
+		return true;
+	if (FWINV(info->protocol != iph->protocol, EBT_IP_PROTO))
+		return false;
+	if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)))
+		return true;
+
+	/* A non-zero fragment offset never matches a port rule. */
+	if (ntohs(iph->frag_off) & IP_OFFSET)
+		return false;
+
+	ports = skb_header_pointer(skb, iph->ihl * 4,
+				   sizeof(port_buf), &port_buf);
+	if (!ports)
+		return false;
+
+	if (info->bitmask & EBT_IP_DPORT) {
+		u32 dport = ntohs(ports->dst);
+
+		if (FWINV(dport < info->dport[0] || dport > info->dport[1],
+			  EBT_IP_DPORT))
+			return false;
+	}
+	if (info->bitmask & EBT_IP_SPORT) {
+		u32 sport = ntohs(ports->src);
+
+		if (FWINV(sport < info->sport[0] || sport > info->sport[1],
+			  EBT_IP_SPORT))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Validate an "ip" match at rule insertion.
+ *
+ * The owning rule must select ETH_P_IP without inversion, bitmask and
+ * invflags must stay within EBT_IP_MASK, port matching requires a
+ * non-inverted protocol that is one of TCP/UDP/UDPLITE/SCTP/DCCP, and each
+ * requested port range must be ordered (low <= high).
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_ip_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_entry *entry = par->entryinfo;
+	const struct ebt_ip_info *ipinfo = par->matchinfo;
+
+	if (entry->ethproto != htons(ETH_P_IP))
+		return -EINVAL;
+	if (entry->invflags & EBT_IPROTO)
+		return -EINVAL;
+	if ((ipinfo->bitmask & ~EBT_IP_MASK) ||
+	    (ipinfo->invflags & ~EBT_IP_MASK))
+		return -EINVAL;
+
+	if (ipinfo->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) {
+		if (ipinfo->invflags & EBT_IP_PROTO)
+			return -EINVAL;
+
+		switch (ipinfo->protocol) {
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+		case IPPROTO_SCTP:
+		case IPPROTO_DCCP:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if ((ipinfo->bitmask & EBT_IP_DPORT) &&
+	    ipinfo->dport[0] > ipinfo->dport[1])
+		return -EINVAL;
+	if ((ipinfo->bitmask & EBT_IP_SPORT) &&
+	    ipinfo->sport[0] > ipinfo->sport[1])
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * x_tables descriptor for the bridge-family "ip" match: ebt_ip_mt() tests
+ * packets, ebt_ip_mt_check() validates rules, and each rule carries one
+ * struct ebt_ip_info.
+ */
+static struct xt_match ebt_ip_mt_reg __read_mostly = {
+	.name		= "ip",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_ip_mt,
+	.checkentry	= ebt_ip_mt_check,
+	.matchsize	= sizeof(struct ebt_ip_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match; returns xt_register_match()'s result. */
+static int __init ebt_ip_init(void)
+{
+	return xt_register_match(&ebt_ip_mt_reg);
+}
+
+/* Module unload: drop the match registration made in ebt_ip_init(). */
+static void __exit ebt_ip_fini(void)
+{
+	xt_unregister_match(&ebt_ip_mt_reg);
+}
+
+module_init(ebt_ip_init);
+module_exit(ebt_ip_fini);
+MODULE_DESCRIPTION("Ebtables: IPv4 protocol packet match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
new file mode 100644
index 00000000..2ed0056a
--- /dev/null
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -0,0 +1,155 @@
+/*
+ *  ebt_ip6
+ *
+ *	Authors:
+ *	Manohar Castelino <manohar.r.castelino@intel.com>
+ *	Kuo-Lang Tseng <kuo-lang.tseng@intel.com>
+ *	Jan Engelhardt <jengelh@medozas.de>
+ *
+ * Summary:
+ * This is just a modification of the IPv4 code written by
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * with the changes required to support IPv6
+ *
+ *  Jan, 2008
+ */
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <net/dsfield.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ip6.h>
+
+/*
+ * Overlay for the first bytes read at the upper-layer header offset:
+ * either a pair of network-order ports or an ICMPv6 type/code pair.
+ */
+union pkthdr {
+	struct {
+		__be16 src;
+		__be16 dst;
+	} tcpudphdr;
+	struct {
+		u8 type;
+		u8 code;
+	} icmphdr;
+};
+
+/*
+ * "ip6" match: compare an IPv6 packet against the rule's traffic class,
+ * source and destination address, upper-layer protocol, port ranges and
+ * ICMPv6 type/code ranges.
+ *
+ * Every test is applied only when its bit is set in info->bitmask and may be
+ * inverted through info->invflags (via FWINV).  Protocol, port and ICMPv6
+ * tests run only when EBT_IP6_PROTO is set; the protocol compared is the one
+ * found after skipping extension headers.  A packet whose headers cannot be
+ * read does not match.
+ */
+static bool
+ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	/* Must stay named "info": the FWINV macro refers to it. */
+	const struct ebt_ip6_info *info = par->matchinfo;
+	const struct ipv6hdr *ih6;
+	struct ipv6hdr _ip6h;
+	const union pkthdr *pptr;
+	union pkthdr _pkthdr;
+
+	ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
+	if (ih6 == NULL)
+		return false;
+	if ((info->bitmask & EBT_IP6_TCLASS) &&
+	    FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS))
+		return false;
+	/*
+	 * Only look at the addresses when the rule asked for them, so that
+	 * stale address/mask/invert data in an unused field cannot influence
+	 * the result (same gating as the other tests here and in ebt_ip).
+	 */
+	if ((info->bitmask & EBT_IP6_SOURCE) &&
+	    FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk,
+				       &info->saddr), EBT_IP6_SOURCE))
+		return false;
+	if ((info->bitmask & EBT_IP6_DEST) &&
+	    FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk,
+				       &info->daddr), EBT_IP6_DEST))
+		return false;
+	if (info->bitmask & EBT_IP6_PROTO) {
+		uint8_t nexthdr = ih6->nexthdr;
+		int offset_ph;
+
+		/* Walk past extension headers to the upper-layer header. */
+		offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr);
+		if (offset_ph < 0)
+			return false;
+		if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
+			return false;
+		if (!(info->bitmask & (EBT_IP6_DPORT |
+				       EBT_IP6_SPORT | EBT_IP6_ICMP6)))
+			return true;
+
+		/* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
+		pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr),
+					  &_pkthdr);
+		if (pptr == NULL)
+			return false;
+		if (info->bitmask & EBT_IP6_DPORT) {
+			u16 dst = ntohs(pptr->tcpudphdr.dst);
+
+			if (FWINV(dst < info->dport[0] ||
+				  dst > info->dport[1], EBT_IP6_DPORT))
+				return false;
+		}
+		if (info->bitmask & EBT_IP6_SPORT) {
+			u16 src = ntohs(pptr->tcpudphdr.src);
+
+			if (FWINV(src < info->sport[0] ||
+				  src > info->sport[1], EBT_IP6_SPORT))
+				return false;
+		}
+		if ((info->bitmask & EBT_IP6_ICMP6) &&
+		    FWINV(pptr->icmphdr.type < info->icmpv6_type[0] ||
+			  pptr->icmphdr.type > info->icmpv6_type[1] ||
+			  pptr->icmphdr.code < info->icmpv6_code[0] ||
+			  pptr->icmphdr.code > info->icmpv6_code[1],
+			  EBT_IP6_ICMP6))
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Validate an "ip6" match at rule insertion.
+ *
+ * The owning rule must select ETH_P_IPV6 without inversion, bitmask and
+ * invflags must stay within EBT_IP6_MASK, port matching requires a
+ * non-inverted protocol out of TCP/UDP/UDPLITE/SCTP/DCCP, ICMPv6 matching
+ * requires a non-inverted IPPROTO_ICMPV6, and every requested range must be
+ * ordered (low <= high).  Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_ip6_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_entry *entry = par->entryinfo;
+	struct ebt_ip6_info *ip6info = par->matchinfo;
+
+	if (entry->ethproto != htons(ETH_P_IPV6))
+		return -EINVAL;
+	if (entry->invflags & EBT_IPROTO)
+		return -EINVAL;
+	if ((ip6info->bitmask & ~EBT_IP6_MASK) ||
+	    (ip6info->invflags & ~EBT_IP6_MASK))
+		return -EINVAL;
+
+	if (ip6info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) {
+		if (ip6info->invflags & EBT_IP6_PROTO)
+			return -EINVAL;
+
+		switch (ip6info->protocol) {
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+		case IPPROTO_SCTP:
+		case IPPROTO_DCCP:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if ((ip6info->bitmask & EBT_IP6_DPORT) &&
+	    ip6info->dport[0] > ip6info->dport[1])
+		return -EINVAL;
+	if ((ip6info->bitmask & EBT_IP6_SPORT) &&
+	    ip6info->sport[0] > ip6info->sport[1])
+		return -EINVAL;
+
+	if (ip6info->bitmask & EBT_IP6_ICMP6) {
+		if (ip6info->invflags & EBT_IP6_PROTO)
+			return -EINVAL;
+		if (ip6info->protocol != IPPROTO_ICMPV6)
+			return -EINVAL;
+		if (ip6info->icmpv6_type[0] > ip6info->icmpv6_type[1])
+			return -EINVAL;
+		if (ip6info->icmpv6_code[0] > ip6info->icmpv6_code[1])
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * x_tables descriptor for the bridge-family "ip6" match: ebt_ip6_mt() tests
+ * packets, ebt_ip6_mt_check() validates rules, and each rule carries one
+ * struct ebt_ip6_info.
+ */
+static struct xt_match ebt_ip6_mt_reg __read_mostly = {
+	.name		= "ip6",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_ip6_mt,
+	.checkentry	= ebt_ip6_mt_check,
+	.matchsize	= sizeof(struct ebt_ip6_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match; returns xt_register_match()'s result. */
+static int __init ebt_ip6_init(void)
+{
+	return xt_register_match(&ebt_ip6_mt_reg);
+}
+
+/* Module unload: drop the match registration made in ebt_ip6_init(). */
+static void __exit ebt_ip6_fini(void)
+{
+	xt_unregister_match(&ebt_ip6_mt_reg);
+}
+
+module_init(ebt_ip6_init);
+module_exit(ebt_ip6_fini);
+MODULE_DESCRIPTION("Ebtables: IPv6 protocol packet match");
+MODULE_AUTHOR("Kuo-Lang Tseng <kuo-lang.tseng@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
new file mode 100644
index 00000000..517e78be
--- /dev/null
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -0,0 +1,127 @@
+/*
+ *  ebt_limit
+ *
+ *	Authors:
+ *	Tom Marshall <tommy@home.tig-grr.com>
+ *
+ *	Mostly copied from netfilter's ipt_limit.c, see that file for
+ *	more explanation
+ *
+ *  September, 2003
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_limit.h>
+
+/* Single lock serialising every credit update done by ebt_limit_mt(). */
+static DEFINE_SPINLOCK(limit_lock);
+
+/*
+ * Largest per-jiffy credit increment such that one day's worth of jiffies
+ * (HZ*60*60*24) times the increment still fits in 32 bits.
+ */
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/*
+ * _POW2_BELOWn(x) ORs x with right-shifted copies of itself so that every
+ * bit below the highest set bit (within n bits) becomes set.
+ * POW2_BELOW32(x) then shifts that right by one and adds one, which yields
+ * the largest power of two not exceeding x (for x >= 1).
+ */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+/* Credits earned per elapsed jiffy: MAX_CPJ rounded down to a power of two. */
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/*
+ * "limit" match: credit-based rate limiter.
+ *
+ * Under limit_lock, credits are added for the jiffies elapsed since the last
+ * call (info->prev is updated to now), capped at info->credit_cap.  If at
+ * least info->cost credits are available they are spent and the packet
+ * matches; otherwise it does not match and no credits are spent.
+ */
+static bool
+ebt_limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ebt_limit_info *state = (void *)par->matchinfo;
+	unsigned long now = jiffies;
+	bool allowed;
+
+	spin_lock_bh(&limit_lock);
+
+	state->credit += (now - xchg(&state->prev, now)) * CREDITS_PER_JIFFY;
+	if (state->credit > state->credit_cap)
+		state->credit = state->credit_cap;
+
+	allowed = state->credit >= state->cost;
+	if (allowed)
+		state->credit -= state->cost;	/* not limited: pay for it */
+
+	spin_unlock_bh(&limit_lock);
+	return allowed;
+}
+
+/*
+ * Convert a user-supplied rate value into credits, i.e.
+ * user * HZ * CREDITS_PER_JIFFY / EBT_LIMIT_SCALE.
+ *
+ * Precision saver: multiply before dividing when the product fits in
+ * 32 bits; otherwise divide first (losing the remainder) to avoid overflow.
+ */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / EBT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE;
+}
+
+/*
+ * Validate a "limit" match at rule insertion and initialise its runtime
+ * state.
+ *
+ * A zero burst, or an avg*burst product whose credit value is smaller than
+ * the credit value of avg alone (overflow), is rejected with -EINVAL.
+ * Otherwise the rule starts with a full bucket: credit and credit_cap are
+ * the credits for avg*burst, cost is the credits for avg, and prev is the
+ * current jiffies value.
+ */
+static int ebt_limit_mt_check(const struct xt_mtchk_param *par)
+{
+	struct ebt_limit_info *state = par->matchinfo;
+	const u_int32_t per_packet = user2credits(state->avg);
+	const u_int32_t bucket = user2credits(state->avg * state->burst);
+
+	/* Check for overflow. */
+	if (state->burst == 0 || bucket < per_packet) {
+		pr_info("overflow, try lower: %u/%u\n",
+			state->avg, state->burst);
+		return -EINVAL;
+	}
+
+	/* User avg in seconds * EBT_LIMIT_SCALE: convert to jiffies * 128. */
+	state->prev = jiffies;
+	state->credit = bucket;
+	state->credit_cap = bucket;
+	state->cost = per_packet;
+	return 0;
+}
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * Layout of the match data as seen by compat userspace; it is only used
+ * below to report the compat size of the match.
+ *
+ * no conversion function needed --
+ * only avg/burst have meaningful values in userspace.
+ */
+struct ebt_compat_limit_info {
+	compat_uint_t avg, burst;
+	compat_ulong_t prev;
+	compat_uint_t credit, credit_cap, cost;
+};
+#endif
+
+/*
+ * x_tables descriptor for the bridge-family "limit" match.  With
+ * CONFIG_COMPAT a separate compat size is advertised, but no conversion
+ * callbacks are installed.
+ */
+static struct xt_match ebt_limit_mt_reg __read_mostly = {
+	.name		= "limit",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_limit_mt,
+	.checkentry	= ebt_limit_mt_check,
+	.matchsize	= sizeof(struct ebt_limit_info),
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct ebt_compat_limit_info),
+#endif
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match; returns xt_register_match()'s result. */
+static int __init ebt_limit_init(void)
+{
+	return xt_register_match(&ebt_limit_mt_reg);
+}
+
+/* Module unload: drop the match registration made in ebt_limit_init(). */
+static void __exit ebt_limit_fini(void)
+{
+	xt_unregister_match(&ebt_limit_mt_reg);
+}
+
+module_init(ebt_limit_init);
+module_exit(ebt_limit_fini);
+MODULE_DESCRIPTION("Ebtables: Rate-limit match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
new file mode 100644
index 00000000..6e5a8bb9
--- /dev/null
+++ b/net/bridge/netfilter/ebt_log.c
@@ -0,0 +1,228 @@
+/*
+ *  ebt_log
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *	Harald Welte <laforge@netfilter.org>
+ *
+ *  April, 2002
+ *
+ */
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/spinlock.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
+
+static DEFINE_SPINLOCK(ebt_log_lock);
+
+/*
+ * Validate a "log" target at rule insertion: the option bits must stay
+ * within EBT_LOG_MASK and the log level must be below 8.  On success the
+ * prefix buffer is forcibly NUL-terminated and 0 is returned; otherwise
+ * -EINVAL.
+ */
+static int ebt_log_tg_check(const struct xt_tgchk_param *par)
+{
+	struct ebt_log_info *cfg = par->targinfo;
+
+	if ((cfg->bitmask & ~EBT_LOG_MASK) || cfg->loglevel >= 8)
+		return -EINVAL;
+
+	/* The prefix comes from userspace; make sure it is a string. */
+	cfg->prefix[EBT_LOG_PREFIX_SIZE - 1] = '\0';
+	return 0;
+}
+
+/*
+ * First four bytes read at the transport-header offset: source and
+ * destination port in network byte order.
+ */
+struct tcpudphdr
+{
+	__be16 src;
+	__be16 dst;
+};
+
+/*
+ * ARP payload following struct arphdr for Ethernet hardware addresses and
+ * 4-byte protocol addresses: sender MAC/IP, then target MAC/IP.
+ */
+struct arppayload
+{
+	unsigned char mac_src[ETH_ALEN];
+	unsigned char ip_src[4];
+	unsigned char mac_dst[ETH_ALEN];
+	unsigned char ip_dst[4];
+};
+
+/*
+ * Append " SPT=<src> DPT=<dst>" to the current log line when @protocol is
+ * TCP, UDP, UDP-Lite, SCTP or DCCP.  The ports are read at @offset in @skb;
+ * if they cannot be read, " INCOMPLETE TCP/UDP header" is printed instead.
+ * Other protocols print nothing.
+ */
+static void
+print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
+{
+	struct tcpudphdr buf;
+	const struct tcpudphdr *hdr;
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+	case IPPROTO_SCTP:
+	case IPPROTO_DCCP:
+		break;
+	default:
+		return;
+	}
+
+	hdr = skb_header_pointer(skb, offset, sizeof(buf), &buf);
+	if (!hdr) {
+		printk(" INCOMPLETE TCP/UDP header");
+		return;
+	}
+
+	printk(" SPT=%u DPT=%u", ntohs(hdr->src), ntohs(hdr->dst));
+}
+
+/*
+ * Print one log line describing @skb via printk.
+ *
+ * The line starts with the log level taken from @loginfo, @prefix, the
+ * input/output device names (empty when NULL) and the Ethernet source,
+ * destination and protocol.  Depending on the selected flag bits and the
+ * Ethernet protocol, IPv4, IPv6 or ARP details are appended.  The whole line
+ * is emitted under ebt_log_lock so concurrent callers do not interleave.
+ * @pf and @hooknum are not used by this function.
+ */
+static void
+ebt_log_packet(u_int8_t pf, unsigned int hooknum,
+   const struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, const struct nf_loginfo *loginfo,
+   const char *prefix)
+{
+	unsigned int bitmask;
+
+	spin_lock_bh(&ebt_log_lock);
+	printk("<%c>%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
+	       '0' + loginfo->u.log.level, prefix,
+	       in ? in->name : "", out ? out->name : "",
+	       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+	       ntohs(eth_hdr(skb)->h_proto));
+
+	/* Use the rule's flags for LOG-type requests, NF_LOG_MASK otherwise. */
+	if (loginfo->type == NF_LOG_TYPE_LOG)
+		bitmask = loginfo->u.log.logflags;
+	else
+		bitmask = NF_LOG_MASK;
+
+	/* IPv4: addresses, TOS, protocol and (if applicable) ports. */
+	if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
+	   htons(ETH_P_IP)){
+		const struct iphdr *ih;
+		struct iphdr _iph;
+
+		ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+		if (ih == NULL) {
+			printk(" INCOMPLETE IP header");
+			goto out;
+		}
+		printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+		       &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
+		print_ports(skb, ih->protocol, ih->ihl*4);
+		goto out;
+	}
+
+#if defined(CONFIG_BRIDGE_EBT_IP6) || defined(CONFIG_BRIDGE_EBT_IP6_MODULE)
+	/* IPv6: addresses, priority, next header and (if applicable) ports. */
+	if ((bitmask & EBT_LOG_IP6) && eth_hdr(skb)->h_proto ==
+	   htons(ETH_P_IPV6)) {
+		const struct ipv6hdr *ih;
+		struct ipv6hdr _iph;
+		uint8_t nexthdr;
+		int offset_ph;
+
+		ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+		if (ih == NULL) {
+			printk(" INCOMPLETE IPv6 header");
+			goto out;
+		}
+		printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
+		       &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
+		nexthdr = ih->nexthdr;
+		/* Ports are looked up after any extension headers. */
+		offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr);
+		if (offset_ph == -1)
+			goto out;
+		print_ports(skb, nexthdr, offset_ph);
+		goto out;
+	}
+#endif
+
+	/* ARP/RARP: header fields, plus the payload for Ethernet/IPv4. */
+	if ((bitmask & EBT_LOG_ARP) &&
+	    ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
+	     (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
+		const struct arphdr *ah;
+		struct arphdr _arph;
+
+		ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+		if (ah == NULL) {
+			printk(" INCOMPLETE ARP header");
+			goto out;
+		}
+		printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+		       ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
+		       ntohs(ah->ar_op));
+
+		/* If it's for Ethernet and the lengths are OK,
+		 * then log the ARP payload */
+		if (ah->ar_hrd == htons(1) &&
+		    ah->ar_hln == ETH_ALEN &&
+		    ah->ar_pln == sizeof(__be32)) {
+			const struct arppayload *ap;
+			struct arppayload _arpp;
+
+			ap = skb_header_pointer(skb, sizeof(_arph),
+						sizeof(_arpp), &_arpp);
+			if (ap == NULL) {
+				printk(" INCOMPLETE ARP payload");
+				goto out;
+			}
+			printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
+					ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+		}
+	}
+out:
+	/* Every path ends the line and releases the lock here. */
+	printk("\n");
+	spin_unlock_bh(&ebt_log_lock);
+
+}
+
+/*
+ * "log" target: log the packet and continue rule traversal.
+ *
+ * The rule's level and flag bits are packed into an nf_loginfo.  With
+ * EBT_LOG_NFLOG set the packet is handed to nf_log_packet(); otherwise it is
+ * printed directly by ebt_log_packet().  Always returns EBT_CONTINUE.
+ */
+static unsigned int
+ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_log_info *cfg = par->targinfo;
+	struct nf_loginfo li;
+
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.logflags = cfg->bitmask;
+	li.u.log.level = cfg->loglevel;
+
+	if (!(cfg->bitmask & EBT_LOG_NFLOG)) {
+		ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
+			       par->out, &li, cfg->prefix);
+		return EBT_CONTINUE;
+	}
+
+	nf_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in,
+		      par->out, &li, "%s", cfg->prefix);
+	return EBT_CONTINUE;
+}
+
+/* x_tables descriptor for the bridge-family "log" target. */
+static struct xt_target ebt_log_tg_reg __read_mostly = {
+	.name		= "log",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.target		= ebt_log_tg,
+	.checkentry	= ebt_log_tg_check,
+	.targetsize	= sizeof(struct ebt_log_info),
+	.me		= THIS_MODULE,
+};
+
+/* nf_log backend descriptor that routes log requests to ebt_log_packet(). */
+static struct nf_logger ebt_log_logger __read_mostly = {
+	.name 		= "ebt_log",
+	.logfn		= &ebt_log_packet,
+	.me		= THIS_MODULE,
+};
+
+/*
+ * Module load: register the "log" target and then the nf_log backend.
+ *
+ * If the backend registration fails, the target registration is rolled back
+ * so the module does not stay half-initialised.  Returns 0 on success or the
+ * negative error from whichever registration failed.
+ */
+static int __init ebt_log_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&ebt_log_tg_reg);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_log_register(NFPROTO_BRIDGE, &ebt_log_logger);
+	if (ret < 0) {
+		xt_unregister_target(&ebt_log_tg_reg);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Module unload: unregister the nf_log backend first, then the target
+ * (reverse of the order used in ebt_log_init()).
+ */
+static void __exit ebt_log_fini(void)
+{
+	nf_log_unregister(&ebt_log_logger);
+	xt_unregister_target(&ebt_log_tg_reg);
+}
+
+module_init(ebt_log_init);
+module_exit(ebt_log_fini);
+MODULE_DESCRIPTION("Ebtables: Packet logging to syslog");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
new file mode 100644
index 00000000..66697cbd
--- /dev/null
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -0,0 +1,110 @@
+/*
+ *  ebt_mark
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  July, 2002
+ *
+ */
+
+/* The mark target can be used in any chain,
+ * I believe adding a mangle table just for marking is total overkill.
+ * Marking a frame doesn't really change anything in the frame anyway.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_mark_t.h>
+
+/*
+ * "mark" target: modify skb->mark and return the rule's verdict.
+ *
+ * info->target packs two things: the bits covered by EBT_VERDICT_BITS hold
+ * the verdict, the remaining bits select the mark operation (set, or, and;
+ * anything else is treated as xor, which ebt_mark_tg_check() guarantees is
+ * MARK_XOR_VALUE).  The verdict is recovered by setting all non-verdict
+ * bits again.
+ */
+static unsigned int
+ebt_mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_mark_t_info *info = par->targinfo;
+	/* Same split as in ebt_mark_tg_check(); no hard-coded bit width. */
+	int action = info->target & ~EBT_VERDICT_BITS;
+
+	if (action == MARK_SET_VALUE)
+		skb->mark = info->mark;
+	else if (action == MARK_OR_VALUE)
+		skb->mark |= info->mark;
+	else if (action == MARK_AND_VALUE)
+		skb->mark &= info->mark;
+	else
+		skb->mark ^= info->mark;
+
+	return info->target | ~EBT_VERDICT_BITS;
+}
+
+/*
+ * Validate a "mark" target at rule insertion.
+ *
+ * The verdict part of info->target must be a standard verdict (negative,
+ * not below -NUM_STANDARD_TARGETS) and must not be EBT_RETURN when the
+ * BASE_CHAIN test holds; the action part must be one of the four
+ * MARK_*_VALUE operations.  Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_mark_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ebt_mark_t_info *cfg = par->targinfo;
+	const int verdict = cfg->target | ~EBT_VERDICT_BITS;
+	const int action = cfg->target & ~EBT_VERDICT_BITS;
+
+	if (BASE_CHAIN && verdict == EBT_RETURN)
+		return -EINVAL;
+	if (verdict >= 0 || verdict < -NUM_STANDARD_TARGETS)
+		return -EINVAL;
+
+	if (action == MARK_SET_VALUE || action == MARK_OR_VALUE ||
+	    action == MARK_AND_VALUE || action == MARK_XOR_VALUE)
+		return 0;
+
+	return -EINVAL;
+}
+#ifdef CONFIG_COMPAT
+/* Compat-userspace layout of the "mark" target data (compat-sized mark). */
+struct compat_ebt_mark_t_info {
+	compat_ulong_t mark;
+	compat_uint_t target;
+};
+
+/*
+ * Expand a compat-layout "mark" target record at @src into the native
+ * struct ebt_mark_t_info at @dst, field by field.
+ */
+static void mark_tg_compat_from_user(void *dst, const void *src)
+{
+	const struct compat_ebt_mark_t_info *in = src;
+	struct ebt_mark_t_info *out = dst;
+
+	out->target = in->target;
+	out->mark = in->mark;
+}
+
+/*
+ * Write the native "mark" target record at @src to the compat-layout user
+ * buffer @dst, field by field.  Returns 0, or -EFAULT as soon as one of the
+ * user-space stores fails.
+ */
+static int mark_tg_compat_to_user(void __user *dst, const void *src)
+{
+	const struct ebt_mark_t_info *in = src;
+	struct compat_ebt_mark_t_info __user *out = dst;
+
+	if (put_user(in->mark, &out->mark))
+		return -EFAULT;
+	if (put_user(in->target, &out->target))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+/*
+ * x_tables descriptor for the bridge-family "mark" target.  With
+ * CONFIG_COMPAT the compat size and both conversion callbacks are installed.
+ */
+static struct xt_target ebt_mark_tg_reg __read_mostly = {
+	.name		= "mark",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.target		= ebt_mark_tg,
+	.checkentry	= ebt_mark_tg_check,
+	.targetsize	= sizeof(struct ebt_mark_t_info),
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct compat_ebt_mark_t_info),
+	.compat_from_user = mark_tg_compat_from_user,
+	.compat_to_user	= mark_tg_compat_to_user,
+#endif
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the target; returns xt_register_target()'s result. */
+static int __init ebt_mark_init(void)
+{
+	return xt_register_target(&ebt_mark_tg_reg);
+}
+
+/* Module unload: drop the target registration made in ebt_mark_init(). */
+static void __exit ebt_mark_fini(void)
+{
+	xt_unregister_target(&ebt_mark_tg_reg);
+}
+
+module_init(ebt_mark_init);
+module_exit(ebt_mark_fini);
+MODULE_DESCRIPTION("Ebtables: Packet mark modification");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
new file mode 100644
index 00000000..d98baefc
--- /dev/null
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -0,0 +1,98 @@
+/*
+ *  ebt_mark_m
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  July, 2002
+ *
+ */
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_mark_m.h>
+
+/*
+ * "mark_m" match: test skb->mark against the rule.
+ *
+ * With EBT_MARK_OR the packet hits when any masked bit is set; otherwise
+ * the masked mark must equal info->mark.  The hit is then XORed with
+ * info->invert.
+ */
+static bool
+ebt_mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_mark_m_info *cfg = par->matchinfo;
+	bool hit;
+
+	if (cfg->bitmask & EBT_MARK_OR)
+		hit = (skb->mark & cfg->mask) != 0;
+	else
+		hit = (skb->mark & cfg->mask) == cfg->mark;
+
+	return hit ^ cfg->invert;
+}
+
+/*
+ * Validate a "mark_m" match at rule insertion: the bitmask must stay within
+ * EBT_MARK_MASK, must not be empty, and must not request both the OR and
+ * the AND mode.  Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_mark_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_mark_m_info *cfg = par->matchinfo;
+	const bool want_or = cfg->bitmask & EBT_MARK_OR;
+	const bool want_and = cfg->bitmask & EBT_MARK_AND;
+
+	if (cfg->bitmask & ~EBT_MARK_MASK)
+		return -EINVAL;
+	if (want_or && want_and)
+		return -EINVAL;
+	if (cfg->bitmask == 0)
+		return -EINVAL;
+	return 0;
+}
+
+
+#ifdef CONFIG_COMPAT
+/* Compat-userspace layout of the "mark_m" match data (compat-sized mark/mask). */
+struct compat_ebt_mark_m_info {
+	compat_ulong_t mark, mask;
+	uint8_t invert, bitmask;
+};
+
+/*
+ * Expand a compat-layout "mark_m" match record at @src into the native
+ * struct ebt_mark_m_info at @dst, field by field.
+ */
+static void mark_mt_compat_from_user(void *dst, const void *src)
+{
+	const struct compat_ebt_mark_m_info *in = src;
+	struct ebt_mark_m_info *out = dst;
+
+	out->bitmask = in->bitmask;
+	out->invert = in->invert;
+	out->mask = in->mask;
+	out->mark = in->mark;
+}
+
+/*
+ * Write the native "mark_m" match record at @src to the compat-layout user
+ * buffer @dst, field by field.  Returns 0, or -EFAULT as soon as one of the
+ * user-space stores fails.
+ */
+static int mark_mt_compat_to_user(void __user *dst, const void *src)
+{
+	const struct ebt_mark_m_info *in = src;
+	struct compat_ebt_mark_m_info __user *out = dst;
+
+	if (put_user(in->mark, &out->mark))
+		return -EFAULT;
+	if (put_user(in->mask, &out->mask))
+		return -EFAULT;
+	if (put_user(in->invert, &out->invert))
+		return -EFAULT;
+	if (put_user(in->bitmask, &out->bitmask))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+/*
+ * x_tables descriptor for the bridge-family "mark_m" match.  With
+ * CONFIG_COMPAT the compat size and both conversion callbacks are installed.
+ */
+static struct xt_match ebt_mark_mt_reg __read_mostly = {
+	.name		= "mark_m",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_mark_mt,
+	.checkentry	= ebt_mark_mt_check,
+	.matchsize	= sizeof(struct ebt_mark_m_info),
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct compat_ebt_mark_m_info),
+	.compat_from_user = mark_mt_compat_from_user,
+	.compat_to_user	= mark_mt_compat_to_user,
+#endif
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match; returns xt_register_match()'s result. */
+static int __init ebt_mark_m_init(void)
+{
+	return xt_register_match(&ebt_mark_mt_reg);
+}
+
+/* Module unload: drop the match registration made in ebt_mark_m_init(). */
+static void __exit ebt_mark_m_fini(void)
+{
+	xt_unregister_match(&ebt_mark_mt_reg);
+}
+
+module_init(ebt_mark_m_init);
+module_exit(ebt_mark_m_fini);
+MODULE_DESCRIPTION("Ebtables: Packet mark match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
new file mode 100644
index 00000000..5be68bbc
--- /dev/null
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -0,0 +1,72 @@
+/*
+ * ebt_nflog
+ *
+ *	Author:
+ *	Peter Warasin <peter@endian.com>
+ *
+ *  February, 2008
+ *
+ * Based on:
+ *  xt_NFLOG.c, (C) 2006 by Patrick McHardy <kaber@trash.net>
+ *  ebt_ulog.c, (C) 2004 by Bart De Schuymer <bdschuym@pandora.be>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nflog.h>
+#include <net/netfilter/nf_log.h>
+
+/*
+ * "nflog" target: hand the packet to nf_log_packet() as a ULOG-type request
+ * carrying the rule's copy length, group and queue threshold, prefixed with
+ * the rule's prefix string.  Always returns EBT_CONTINUE.
+ */
+static unsigned int
+ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_nflog_info *cfg = par->targinfo;
+	struct nf_loginfo li;
+
+	li.type = NF_LOG_TYPE_ULOG;
+	li.u.ulog.group = cfg->group;
+	li.u.ulog.qthreshold = cfg->threshold;
+	li.u.ulog.copy_len = cfg->len;
+
+	nf_log_packet(PF_BRIDGE, par->hooknum, skb, par->in, par->out,
+		      &li, "%s", cfg->prefix);
+
+	return EBT_CONTINUE;
+}
+
+/*
+ * Validate an "nflog" target at rule insertion: flags outside
+ * EBT_NFLOG_MASK are rejected with -EINVAL.  On success the prefix buffer
+ * is forcibly NUL-terminated and 0 is returned.
+ */
+static int ebt_nflog_tg_check(const struct xt_tgchk_param *par)
+{
+	struct ebt_nflog_info *cfg = par->targinfo;
+	const bool bad_flags = cfg->flags & ~EBT_NFLOG_MASK;
+
+	if (bad_flags)
+		return -EINVAL;
+
+	/* The prefix comes from userspace; make sure it is a string. */
+	cfg->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0';
+	return 0;
+}
+
+/* x_tables descriptor for the bridge-family "nflog" target. */
+static struct xt_target ebt_nflog_tg_reg __read_mostly = {
+	.name       = "nflog",
+	.revision   = 0,
+	.family     = NFPROTO_BRIDGE,
+	.target     = ebt_nflog_tg,
+	.checkentry = ebt_nflog_tg_check,
+	.targetsize = sizeof(struct ebt_nflog_info),
+	.me         = THIS_MODULE,
+};
+
+/* Module load: register the target; returns xt_register_target()'s result. */
+static int __init ebt_nflog_init(void)
+{
+	return xt_register_target(&ebt_nflog_tg_reg);
+}
+
+/* Module unload: drop the target registration made in ebt_nflog_init(). */
+static void __exit ebt_nflog_fini(void)
+{
+	xt_unregister_target(&ebt_nflog_tg_reg);
+}
+
+module_init(ebt_nflog_init);
+module_exit(ebt_nflog_fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Peter Warasin <peter@endian.com>");
+MODULE_DESCRIPTION("ebtables NFLOG netfilter logging module");
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
new file mode 100644
index 00000000..496a5651
--- /dev/null
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -0,0 +1,56 @@
+/*
+ *  ebt_pkttype
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  April, 2003
+ *
+ */
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_pkttype.h>
+
+/*
+ * "pkttype" match: hit when skb->pkt_type equals the rule's pkt_type; the
+ * result is XORed with the rule's invert flag.
+ */
+static bool
+ebt_pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_pkttype_info *cfg = par->matchinfo;
+	const bool same_type = skb->pkt_type == cfg->pkt_type;
+
+	return same_type ^ cfg->invert;
+}
+
+/*
+ * Validate a "pkttype" match at rule insertion: invert must be exactly 0 or
+ * 1 (it is XORed into the match result); any pkt_type value is accepted.
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_pkttype_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_pkttype_info *cfg = par->matchinfo;
+
+	if (!(cfg->invert == 0 || cfg->invert == 1))
+		return -EINVAL;
+
+	/* Allow any pkt_type value */
+	return 0;
+}
+
+/* x_tables descriptor for the bridge-family "pkttype" match. */
+static struct xt_match ebt_pkttype_mt_reg __read_mostly = {
+	.name		= "pkttype",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_pkttype_mt,
+	.checkentry	= ebt_pkttype_mt_check,
+	.matchsize	= sizeof(struct ebt_pkttype_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match; returns xt_register_match()'s result. */
+static int __init ebt_pkttype_init(void)
+{
+	return xt_register_match(&ebt_pkttype_mt_reg);
+}
+
+/* Module unload: drop the match registration made in ebt_pkttype_init(). */
+static void __exit ebt_pkttype_fini(void)
+{
+	xt_unregister_match(&ebt_pkttype_mt_reg);
+}
+
+module_init(ebt_pkttype_init);
+module_exit(ebt_pkttype_fini);
+MODULE_DESCRIPTION("Ebtables: Link layer packet type match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
new file mode 100644
index 00000000..46624bb6
--- /dev/null
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -0,0 +1,80 @@
+/*
+ *  ebt_redirect
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  April, 2002
+ *
+ */
+#include <linux/module.h>
+#include <net/sock.h>
+#include "../br_private.h"
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_redirect.h>
+
+/*
+ * Target callback: rewrite the destination MAC so the frame is addressed to
+ * the local machine and mark it PACKET_HOST.
+ *
+ * On the BROUTING hook the address of the input device itself is used; on
+ * any other hook the address of the bridge device the input port belongs to.
+ *
+ * Returns EBT_DROP if the skb cannot be made writable, otherwise the verdict
+ * configured in the rule.
+ */
+static unsigned int
+ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_redirect_info *info = par->targinfo;
+	const unsigned char *local_mac;
+
+	if (!skb_make_writable(skb, 0))
+		return EBT_DROP;
+
+	if (par->hooknum == NF_BR_BROUTING)
+		local_mac = par->in->dev_addr;
+	else
+		/* rcu_read_lock()ed by nf_hook_slow */
+		local_mac = br_port_get_rcu(par->in)->br->dev->dev_addr;
+
+	memcpy(eth_hdr(skb)->h_dest, local_mac, ETH_ALEN);
+	skb->pkt_type = PACKET_HOST;
+	return info->target;
+}
+
+/*
+ * Rule validation for the redirect target.
+ *
+ * The rule is accepted only when
+ *  - it is not a RETURN verdict placed in a base chain,
+ *  - it lives in table "nat" using no hook other than PRE_ROUTING, or in
+ *    table "broute" using no hook other than BROUTING, and
+ *  - the configured verdict passes INVALID_TARGET.
+ *
+ * Note: BASE_CHAIN and INVALID_TARGET take no arguments and rely on the
+ * locals being named "par" and "info".
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_redirect_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ebt_redirect_info *info = par->targinfo;
+	unsigned int hooks;
+	bool nat_ok, broute_ok;
+
+	if (BASE_CHAIN && info->target == EBT_RETURN)
+		return -EINVAL;
+
+	/* ignore the extra NF_BR_NUMHOOKS bit when looking at the hooks */
+	hooks = par->hook_mask & ~(1 << NF_BR_NUMHOOKS);
+
+	nat_ok = strcmp(par->table, "nat") == 0 &&
+		 !(hooks & ~(1 << NF_BR_PRE_ROUTING));
+	broute_ok = strcmp(par->table, "broute") == 0 &&
+		    !(hooks & ~(1 << NF_BR_BROUTING));
+	if (!nat_ok && !broute_ok)
+		return -EINVAL;
+
+	if (INVALID_TARGET)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * x_tables registration record for the bridge-family "redirect" target.
+ * The hook mask allows PRE_ROUTING and BROUTING plus the NF_BR_NUMHOOKS
+ * bit, which the checkentry callback masks off before comparing hooks.
+ */
+static struct xt_target ebt_redirect_tg_reg __read_mostly = {
+	.name		= "redirect",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) |
+			  (1 << NF_BR_BROUTING),
+	.target		= ebt_redirect_tg,
+	.checkentry	= ebt_redirect_tg_check,
+	.targetsize	= sizeof(struct ebt_redirect_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module init hook: register the target; returns xt_register_target()'s result. */
+static int __init ebt_redirect_init(void)
+{
+	return xt_register_target(&ebt_redirect_tg_reg);
+}
+
+/* Module exit hook: undo the registration done in ebt_redirect_init(). */
+static void __exit ebt_redirect_fini(void)
+{
+	xt_unregister_target(&ebt_redirect_tg_reg);
+}
+
+module_init(ebt_redirect_init);
+module_exit(ebt_redirect_fini);
+MODULE_DESCRIPTION("Ebtables: Packet redirection to localhost");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
new file mode 100644
index 00000000..f8f0bd1a
--- /dev/null
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -0,0 +1,87 @@
+/*
+ *  ebt_snat
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  June, 2002
+ *
+ */
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_nat.h>
+
+/*
+ * Target callback: replace the Ethernet source address with the MAC stored
+ * in the rule.
+ *
+ * For ARP frames, and only while NAT_ARP_BIT is clear in the rule's target
+ * word, the ETH_ALEN bytes that directly follow the ARP header are
+ * overwritten with the same MAC, provided the ARP hardware address length
+ * is ETH_ALEN.
+ *
+ * Returns EBT_DROP when the skb cannot be made writable or the ARP data
+ * cannot be read/written, otherwise the rule's verdict with all
+ * non-verdict bits set.
+ */
+static unsigned int
+ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ebt_nat_info *nat = par->targinfo;
+	const struct arphdr *arp;
+	struct arphdr arp_buf;
+
+	if (!skb_make_writable(skb, 0))
+		return EBT_DROP;
+
+	memcpy(eth_hdr(skb)->h_source, nat->mac, ETH_ALEN);
+
+	/* nothing more to do unless this is ARP and ARP rewriting is on */
+	if ((nat->target & NAT_ARP_BIT) ||
+	    eth_hdr(skb)->h_proto != htons(ETH_P_ARP))
+		return nat->target | ~EBT_VERDICT_BITS;
+
+	arp = skb_header_pointer(skb, 0, sizeof(arp_buf), &arp_buf);
+	if (arp == NULL)
+		return EBT_DROP;
+
+	if (arp->ar_hln == ETH_ALEN &&
+	    skb_store_bits(skb, sizeof(arp_buf), nat->mac, ETH_ALEN))
+		return EBT_DROP;
+
+	return nat->target | ~EBT_VERDICT_BITS;
+}
+
+/*
+ * Rule validation for the snat target.
+ *
+ * The target word carries a verdict plus option bits.  The verdict part
+ * (with all non-verdict bits forced to 1) must be a standard target and
+ * must not be RETURN in a base chain; of the option bits only NAT_ARP_BIT
+ * may be cleared.
+ *
+ * Note: BASE_CHAIN takes no arguments and relies on the parameter "par".
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_snat_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ebt_nat_info *info = par->targinfo;
+	int verdict = info->target | ~EBT_VERDICT_BITS;
+	int options = info->target | EBT_VERDICT_BITS;
+
+	if (BASE_CHAIN && verdict == EBT_RETURN)
+		return -EINVAL;
+	if (verdict < -NUM_STANDARD_TARGETS || verdict >= 0)
+		return -EINVAL;
+	if ((options & ~NAT_ARP_BIT) != ~NAT_ARP_BIT)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * x_tables registration record for the bridge-family "snat" target,
+ * restricted to table "nat" and to the POST_ROUTING hook (plus the
+ * NF_BR_NUMHOOKS bit).
+ */
+static struct xt_target ebt_snat_tg_reg __read_mostly = {
+	.name		= "snat",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.table		= "nat",
+	.hooks		= (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_POST_ROUTING),
+	.target		= ebt_snat_tg,
+	.checkentry	= ebt_snat_tg_check,
+	.targetsize	= sizeof(struct ebt_nat_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module init hook: register the target; returns xt_register_target()'s result. */
+static int __init ebt_snat_init(void)
+{
+	return xt_register_target(&ebt_snat_tg_reg);
+}
+
+/* Module exit hook: undo the registration done in ebt_snat_init(). */
+static void __exit ebt_snat_fini(void)
+{
+	xt_unregister_target(&ebt_snat_tg_reg);
+}
+
+module_init(ebt_snat_init);
+module_exit(ebt_snat_fini);
+MODULE_DESCRIPTION("Ebtables: Source MAC address translation");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
new file mode 100644
index 00000000..5b33a2e6
--- /dev/null
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -0,0 +1,197 @@
+/*
+ *  ebt_stp
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *	Stephen Hemminger <shemminger@osdl.org>
+ *
+ *  July, 2003
+ */
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_stp.h>
+
+#define BPDU_TYPE_CONFIG 0
+#define BPDU_TYPE_TCN 0x80
+
+/*
+ * Leading bytes of an STP frame as read from offset 0 of the skb data.
+ * ebt_stp_mt() compares the whole structure against
+ * {0x42, 0x42, 0x03, 0x00, 0x00, 0x00} and also evaluates the type field.
+ * NOTE(review): 802.1D BPDUs are usually described with a two-byte protocol
+ * identifier - confirm this one-byte pid layout against the wire format.
+ */
+struct stp_header {
+	uint8_t dsap;
+	uint8_t ssap;
+	uint8_t ctrl;
+	uint8_t pid;
+	uint8_t vers;
+	uint8_t type;
+};
+
+/*
+ * Configuration BPDU body, read from the skb directly after struct
+ * stp_header.  Multi-byte fields are kept as byte arrays and decoded with
+ * NR16()/NR32(); for root and sender the first two bytes are the priority
+ * and the remaining six the address (see ebt_filter_config()).
+ */
+struct stp_config_pdu {
+	uint8_t flags;
+	uint8_t root[8];
+	uint8_t root_cost[4];
+	uint8_t sender[8];
+	uint8_t port[2];
+	uint8_t msg_age[2];
+	uint8_t max_age[2];
+	uint8_t hello_time[2];
+	uint8_t forward_delay[2];
+};
+
+/*
+ * Assemble a big-endian 16/32-bit value from a byte array.  The argument is
+ * parenthesised so that any pointer expression may be passed, and NR32
+ * widens each byte to uint32_t before shifting so that a top byte >= 0x80
+ * is never shifted into the sign bit of a promoted int.
+ */
+#define NR16(p) (((p)[0] << 8) | (p)[1])
+#define NR32(p) (((uint32_t)(p)[0] << 24) | ((uint32_t)(p)[1] << 16) | \
+		 ((uint32_t)(p)[2] << 8) | (uint32_t)(p)[3])
+
+/* True when @val falls outside the inclusive range [@lo, @hi]. */
+static inline bool stp_outside(uint32_t val, uint32_t lo, uint32_t hi)
+{
+	return val < lo || val > hi;
+}
+
+/*
+ * True when the 6-byte address at @pkt differs from @addr in any bit that
+ * is set in @mask.  @addr and @mask are read strictly as unsigned bytes so
+ * the result stays byte-wise even if the rule stores them in plain
+ * (possibly signed) char arrays, where sign extension of bytes >= 0x80
+ * would otherwise leak into the comparison.
+ */
+static bool stp_addr_mismatch(const uint8_t *pkt, const void *addr,
+			      const void *mask)
+{
+	const uint8_t *want = addr;
+	const uint8_t *msk = mask;
+	uint8_t diff = 0;
+	int i;
+
+	for (i = 0; i < 6; i++)
+		diff |= (pkt[i] ^ want[i]) & msk[i];
+	return diff != 0;
+}
+
+/*
+ * Check a configuration BPDU against every field test enabled in the rule.
+ *
+ * @info: rule data; info->bitmask selects the tests, info->invflags (through
+ *        FWINV, which relies on the name "info") inverts individual ones
+ * @stpc: configuration BPDU body taken from the frame
+ *
+ * Numeric fields are decoded big-endian and tested against the rule's
+ * inclusive [lower, upper] range; root and sender addresses (bytes 2..7 of
+ * the respective identifier) are compared under the rule's mask.
+ *
+ * Returns true when all enabled tests pass, false on the first failure.
+ */
+static bool ebt_filter_config(const struct ebt_stp_info *info,
+   const struct stp_config_pdu *stpc)
+{
+	const struct ebt_stp_config_info *c = &info->config;
+
+	if ((info->bitmask & EBT_STP_FLAGS) &&
+	    FWINV(c->flags != stpc->flags, EBT_STP_FLAGS))
+		return false;
+	if ((info->bitmask & EBT_STP_ROOTPRIO) &&
+	    FWINV(stp_outside(NR16(stpc->root), c->root_priol,
+			      c->root_priou), EBT_STP_ROOTPRIO))
+		return false;
+	if ((info->bitmask & EBT_STP_ROOTADDR) &&
+	    FWINV(stp_addr_mismatch(stpc->root + 2, c->root_addr,
+				    c->root_addrmsk), EBT_STP_ROOTADDR))
+		return false;
+	if ((info->bitmask & EBT_STP_ROOTCOST) &&
+	    FWINV(stp_outside(NR32(stpc->root_cost), c->root_costl,
+			      c->root_costu), EBT_STP_ROOTCOST))
+		return false;
+	if ((info->bitmask & EBT_STP_SENDERPRIO) &&
+	    FWINV(stp_outside(NR16(stpc->sender), c->sender_priol,
+			      c->sender_priou), EBT_STP_SENDERPRIO))
+		return false;
+	if ((info->bitmask & EBT_STP_SENDERADDR) &&
+	    FWINV(stp_addr_mismatch(stpc->sender + 2, c->sender_addr,
+				    c->sender_addrmsk), EBT_STP_SENDERADDR))
+		return false;
+	if ((info->bitmask & EBT_STP_PORT) &&
+	    FWINV(stp_outside(NR16(stpc->port), c->portl, c->portu),
+		  EBT_STP_PORT))
+		return false;
+	if ((info->bitmask & EBT_STP_MSGAGE) &&
+	    FWINV(stp_outside(NR16(stpc->msg_age), c->msg_agel,
+			      c->msg_ageu), EBT_STP_MSGAGE))
+		return false;
+	if ((info->bitmask & EBT_STP_MAXAGE) &&
+	    FWINV(stp_outside(NR16(stpc->max_age), c->max_agel,
+			      c->max_ageu), EBT_STP_MAXAGE))
+		return false;
+	if ((info->bitmask & EBT_STP_HELLOTIME) &&
+	    FWINV(stp_outside(NR16(stpc->hello_time), c->hello_timel,
+			      c->hello_timeu), EBT_STP_HELLOTIME))
+		return false;
+	if ((info->bitmask & EBT_STP_FWDD) &&
+	    FWINV(stp_outside(NR16(stpc->forward_delay), c->forward_delayl,
+			      c->forward_delayu), EBT_STP_FWDD))
+		return false;
+	return true;
+}
+
+/*
+ * Match callback for STP frames.
+ *
+ * The frame must start with the six bytes 42 42 03 00 00 00.  If the rule
+ * tests the BPDU type it is compared (with optional inversion through
+ * FWINV, which relies on the local name "info").  For configuration BPDUs
+ * with at least one configuration test enabled, the body following the
+ * header is handed to ebt_filter_config().
+ *
+ * Returns false when the needed bytes cannot be read from the skb or any
+ * test fails, true otherwise.
+ */
+static bool
+ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_stp_info *info = par->matchinfo;
+	const uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00};
+	struct stp_header hdr_buf;
+	struct stp_config_pdu cfg_buf;
+	const struct stp_header *hdr;
+	const struct stp_config_pdu *cfg;
+
+	hdr = skb_header_pointer(skb, 0, sizeof(hdr_buf), &hdr_buf);
+	if (hdr == NULL)
+		return false;
+
+	/* The stp code only considers these */
+	if (memcmp(hdr, header, sizeof(header)))
+		return false;
+
+	if (info->bitmask & EBT_STP_TYPE &&
+	    FWINV(info->type != hdr->type, EBT_STP_TYPE))
+		return false;
+
+	/* only configuration BPDUs carry fields for the remaining tests */
+	if (hdr->type != BPDU_TYPE_CONFIG ||
+	    !(info->bitmask & EBT_STP_CONFIG_MASK))
+		return true;
+
+	cfg = skb_header_pointer(skb, sizeof(hdr_buf), sizeof(cfg_buf),
+				 &cfg_buf);
+	if (cfg == NULL)
+		return false;
+	return ebt_filter_config(info, cfg);
+}
+
+/*
+ * Rule validation for the stp match.
+ *
+ * bitmask and invflags may only contain EBT_STP_MASK bits and at least one
+ * test must be enabled.  In addition the enclosing ebtables entry must
+ * match the destination MAC 01:80:c2:00:00:00 with a full mask, so that
+ * the match only ever sees STP frames.
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ebt_stp_info *info = par->matchinfo;
+	const struct ebt_entry *entry = par->entryinfo;
+	const uint8_t stp_group_mac[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
+	const uint8_t full_mask[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+	if (info->bitmask & ~EBT_STP_MASK)
+		return -EINVAL;
+	if (info->invflags & ~EBT_STP_MASK)
+		return -EINVAL;
+	if (!(info->bitmask & EBT_STP_MASK))
+		return -EINVAL;
+
+	/* Make sure the match only receives stp frames */
+	if (!(entry->bitmask & EBT_DESTMAC))
+		return -EINVAL;
+	if (compare_ether_addr(entry->destmac, stp_group_mac) ||
+	    compare_ether_addr(entry->destmsk, full_mask))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * x_tables registration record for the bridge-family "stp" match; the rule
+ * data is a struct ebt_stp_info.
+ */
+static struct xt_match ebt_stp_mt_reg __read_mostly = {
+	.name		= "stp",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_stp_mt,
+	.checkentry	= ebt_stp_mt_check,
+	.matchsize	= sizeof(struct ebt_stp_info),
+	.me		= THIS_MODULE,
+};
+
+/* Module init hook: register the match; returns xt_register_match()'s result. */
+static int __init ebt_stp_init(void)
+{
+	return xt_register_match(&ebt_stp_mt_reg);
+}
+
+/* Module exit hook: undo the registration done in ebt_stp_init(). */
+static void __exit ebt_stp_fini(void)
+{
+	xt_unregister_match(&ebt_stp_mt_reg);
+}
+
+module_init(ebt_stp_init);
+module_exit(ebt_stp_fini);
+MODULE_DESCRIPTION("Ebtables: Spanning Tree Protocol packet match");
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
new file mode 100644
index 00000000..26377e96
--- /dev/null
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -0,0 +1,342 @@
+/*
+ * netfilter module for userspace bridged Ethernet frames logging daemons
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *	Harald Welte <laforge@netfilter.org>
+ *
+ *  November, 2004
+ *
+ * Based on ipt_ULOG.c, which is
+ * (C) 2000-2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This module accepts two parameters:
+ *
+ * nlbufsiz:
+ *   The parameter specifies how big the buffer for each netlink multicast
+ * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
+ * get accumulated in the kernel until they are sent to userspace. It is
+ * NOT possible to allocate more than 128kB, and it is strongly discouraged,
+ * because atomically allocating 128kB inside the network rx softirq is not
+ * reliable. Please also keep in mind that this buffer size is allocated for
+ * each nlgroup you are using, so the total kernel memory usage increases
+ * by that factor.
+ *
+ * flushtimeout:
+ *   Specify, after how many hundredths of a second the queue should be
+ *   flushed even if it is not full yet.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/netlink.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_ulog.h>
+#include <net/netfilter/nf_log.h>
+#include <net/sock.h>
+#include "../br_private.h"
+
+/*
+ * Preferred size of each per-group netlink buffer.  ebt_ulog_packet() also
+ * refuses to log a frame whose message would exceed this size, and module
+ * init rejects values of 128kB and above.
+ */
+static unsigned int nlbufsiz = NLMSG_GOODSIZE;
+module_param(nlbufsiz, uint, 0600);
+MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) "
+			   "(defaults to 4096)");
+
+/*
+ * Delay, in hundredths of a second, after which a partially filled buffer
+ * is flushed by the timer (armed as jiffies + flushtimeout * HZ / 100).
+ */
+static unsigned int flushtimeout = 10;
+module_param(flushtimeout, uint, 0600);
+MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second) "
+			       "(defaults to 10)");
+
+/*
+ * Per-netlink-group queue state: the skb that messages are accumulated in,
+ * bookkeeping for the multipart message, the flush timer and the lock that
+ * protects all of it.
+ */
+typedef struct {
+	unsigned int qlen;		/* number of nlmsgs' in the skb */
+	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
+	struct sk_buff *skb;		/* the pre-allocated skb */
+	struct timer_list timer;	/* the timer function */
+	spinlock_t lock;		/* the per-queue lock */
+} ebt_ulog_buff_t;
+
+/* one queue per netlink group, indexed by the rule's nlgroup */
+static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
+/* kernel-side netlink socket the queues are broadcast on */
+static struct sock *ebtulognl;
+
+/*
+ * Broadcast the queued messages of one netlink group to userspace and
+ * reset the queue.  A pending flush timer is cancelled first.  Callers in
+ * this file hold the queue's lock.
+ */
+static void ulog_send(unsigned int nlgroup)
+{
+	ebt_ulog_buff_t *queue = &ulog_buffers[nlgroup];
+	struct sk_buff *pending;
+
+	/* del_timer() is a no-op when the timer is not pending */
+	del_timer(&queue->timer);
+
+	pending = queue->skb;
+	if (pending == NULL)
+		return;
+
+	/* last nlmsg needs NLMSG_DONE */
+	if (queue->qlen > 1)
+		queue->lastnlh->nlmsg_type = NLMSG_DONE;
+
+	/* netlink group numbers are the queue index plus one */
+	NETLINK_CB(pending).dst_group = nlgroup + 1;
+	netlink_broadcast(ebtulognl, pending, 0, nlgroup + 1, GFP_ATOMIC);
+
+	queue->skb = NULL;
+	queue->qlen = 0;
+}
+
+/*
+ * Flush-timer handler: @data is the queue index.  Sends whatever has been
+ * queued for that group, under the queue's lock.
+ */
+static void ulog_timer(unsigned long data)
+{
+	ebt_ulog_buff_t *queue = &ulog_buffers[data];
+
+	spin_lock_bh(&queue->lock);
+	if (queue->skb)
+		ulog_send(data);
+	spin_unlock_bh(&queue->lock);
+}
+
+/*
+ * Allocate a queue skb with GFP_ATOMIC.
+ *
+ * The preferred size is the larger of @size and nlbufsiz.  If that fails
+ * and the preferred size was bigger than @size, a second attempt is made
+ * with exactly @size bytes, enough for the current message only.
+ *
+ * Returns the skb, or NULL when no allocation succeeded.
+ */
+static struct sk_buff *ulog_alloc_skb(unsigned int size)
+{
+	unsigned int want = max(size, nlbufsiz);
+	struct sk_buff *skb = alloc_skb(want, GFP_ATOMIC);
+
+	if (skb)
+		return skb;
+
+	pr_debug("cannot alloc whole buffer of size %ub!\n", want);
+	if (want <= size)
+		return NULL;
+
+	/* try to allocate only as much as we need for current packet */
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		pr_debug("cannot even allocate buffer of size %ub\n", size);
+
+	return skb;
+}
+
+/*
+ * Queue one frame on the netlink buffer of the rule's nlgroup.
+ *
+ * @hooknr:   bridge hook the frame was seen on, reported in the message
+ * @skb:      frame to log; copying starts ETH_HLEN bytes before skb->data
+ * @in, @out: input/output devices, either may be NULL
+ * @uloginfo: nlgroup, copy range, queue threshold and prefix to use
+ * @prefix:   not used here; the prefix is taken from @uloginfo
+ *
+ * The message is appended to the per-group skb, which is sent at once when
+ * the queue threshold is reached and otherwise by the flush timer.  Frames
+ * whose message would not fit into nlbufsiz bytes are not logged, and
+ * allocation failures drop the message silently.
+ */
+static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
+   const struct net_device *in, const struct net_device *out,
+   const struct ebt_ulog_info *uloginfo, const char *prefix)
+{
+	ebt_ulog_packet_msg_t *pm;
+	size_t size, copy_len;
+	struct nlmsghdr *nlh;
+	unsigned int group = uloginfo->nlgroup;
+	ebt_ulog_buff_t *ub = &ulog_buffers[group];
+	spinlock_t *lock = &ub->lock;
+	ktime_t kt;
+
+	/* cprange == 0 means "whole frame"; never copy more than there is */
+	if ((uloginfo->cprange == 0) ||
+	    (uloginfo->cprange > skb->len + ETH_HLEN))
+		copy_len = skb->len + ETH_HLEN;
+	else
+		copy_len = uloginfo->cprange;
+
+	size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+	if (size > nlbufsiz) {
+		pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz);
+		return;
+	}
+
+	spin_lock_bh(lock);
+
+	/* flush a queue that has no room left; ulog_send() clears ub->skb */
+	if (ub->skb && size > skb_tailroom(ub->skb))
+		ulog_send(group);
+
+	if (!ub->skb) {
+		ub->skb = ulog_alloc_skb(size);
+		if (!ub->skb)
+			goto unlock;
+	}
+
+	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, 0,
+			size - NLMSG_ALIGN(sizeof(*nlh)));
+	ub->qlen++;
+
+	pm = NLMSG_DATA(nlh);
+	/*
+	 * Start from an all-zero message: the strings below only fill their
+	 * buffers up to the terminating NUL, and whatever follows would
+	 * otherwise reach userspace as uninitialised kernel memory.
+	 */
+	memset(pm, 0, sizeof(*pm));
+
+	/* Fill in the ulog data */
+	pm->version = EBT_ULOG_VERSION;
+	kt = ktime_get_real();
+	pm->stamp = ktime_to_timeval(kt);
+	if (ub->qlen == 1)
+		ub->skb->tstamp = kt;
+	pm->data_len = copy_len;
+	pm->mark = skb->mark;
+	pm->hook = hooknr;
+	/* prefix is an array member, so there is no NULL case to handle */
+	strcpy(pm->prefix, uloginfo->prefix);
+
+	if (in) {
+		strcpy(pm->physindev, in->name);
+		/* If in isn't a bridge, then physindev==indev */
+		if (br_port_exists(in))
+			/* rcu_read_lock()ed by nf_hook_slow */
+			strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name);
+		else
+			strcpy(pm->indev, in->name);
+	}
+
+	if (out) {
+		/* If out exists, then out is a bridge port */
+		strcpy(pm->physoutdev, out->name);
+		/* rcu_read_lock()ed by nf_hook_slow */
+		strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name);
+	}
+
+	if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0)
+		BUG();
+
+	/* every message but the last one of a batch is flagged multipart */
+	if (ub->qlen > 1)
+		ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
+
+	ub->lastnlh = nlh;
+
+	if (ub->qlen >= uloginfo->qthreshold)
+		ulog_send(group);
+	else if (!timer_pending(&ub->timer)) {
+		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
+		add_timer(&ub->timer);
+	}
+
+unlock:
+	spin_unlock_bh(lock);
+
+	return;
+
+	/* error exit of NLMSG_PUT */
+nlmsg_failure:
+	pr_debug("error during NLMSG_PUT. This should "
+		 "not happen, please report to author.\n");
+	goto unlock;
+}
+
+/*
+ * nf_log entry point (registered through ebt_ulog_logger).
+ *
+ * Builds a temporary struct ebt_ulog_info - from @li when it describes a
+ * ULOG request, from the EBT_ULOG_DEFAULT_* values otherwise - and passes
+ * the frame on to ebt_ulog_packet().
+ */
+static void ebt_log_packet(u_int8_t pf, unsigned int hooknum,
+   const struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, const struct nf_loginfo *li,
+   const char *prefix)
+{
+	struct ebt_ulog_info loginfo;
+
+	if (li && li->type == NF_LOG_TYPE_ULOG) {
+		loginfo.nlgroup = li->u.ulog.group;
+		loginfo.cprange = li->u.ulog.copy_len;
+		loginfo.qthreshold = li->u.ulog.qthreshold;
+		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+	} else {
+		/* no usable ULOG parameters: fall back to the defaults */
+		loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
+		loginfo.cprange = 0;
+		loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
+		loginfo.prefix[0] = '\0';
+	}
+
+	ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+/*
+ * Target callback: log the frame with the rule's own parameters and let
+ * rule traversal continue (always returns EBT_CONTINUE).
+ */
+static unsigned int
+ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	ebt_ulog_packet(par->hooknum, skb, par->in, par->out,
+	                par->targinfo, NULL);
+	return EBT_CONTINUE;
+}
+
+/*
+ * Rule validation for the ulog target.
+ *
+ * nlgroup is used as an index into ulog_buffers[], so it is bounded by the
+ * size of that array rather than by a literal.  The prefix is forcibly
+ * NUL-terminated and an oversized queue threshold is clamped to
+ * EBT_ULOG_MAX_QLEN; both adjustments are written back into the rule.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range nlgroup.
+ */
+static int ebt_ulog_tg_check(const struct xt_tgchk_param *par)
+{
+	struct ebt_ulog_info *uloginfo = par->targinfo;
+
+	if (uloginfo->nlgroup >= EBT_ULOG_MAXNLGROUPS)
+		return -EINVAL;
+
+	uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0';
+
+	if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN)
+		uloginfo->qthreshold = EBT_ULOG_MAX_QLEN;
+
+	return 0;
+}
+
+/*
+ * x_tables registration record for the bridge-family "ulog" target; the
+ * rule data is a struct ebt_ulog_info.
+ */
+static struct xt_target ebt_ulog_tg_reg __read_mostly = {
+	.name		= "ulog",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.target		= ebt_ulog_tg,
+	.checkentry	= ebt_ulog_tg_check,
+	.targetsize	= sizeof(struct ebt_ulog_info),
+	.me		= THIS_MODULE,
+};
+
+/* nf_log backend record; registered for NFPROTO_BRIDGE in ebt_ulog_init() */
+static struct nf_logger ebt_ulog_logger __read_mostly = {
+	.name		= "ebt_ulog",
+	.logfn		= &ebt_log_packet,
+	.me		= THIS_MODULE,
+};
+
+/*
+ * Module init hook.
+ *
+ * Rejects an nlbufsiz of 128kB or more, prepares the timer and lock of
+ * every queue, creates the kernel netlink socket and registers the target.
+ * The netlink socket is released again if target registration fails.  The
+ * nf_log backend is registered last; its result is not checked.
+ *
+ * Returns 0 on success, -EINVAL for a bad nlbufsiz, -ENOMEM when the
+ * socket cannot be created, or the error from xt_register_target().
+ */
+static int __init ebt_ulog_init(void)
+{
+	int err;
+	int grp;
+
+	if (nlbufsiz >= 128*1024) {
+		pr_warning("Netlink buffer has to be <= 128kB,"
+			   " please try a smaller nlbufsiz parameter.\n");
+		return -EINVAL;
+	}
+
+	/* initialize ulog_buffers */
+	for (grp = 0; grp < EBT_ULOG_MAXNLGROUPS; grp++) {
+		ebt_ulog_buff_t *queue = &ulog_buffers[grp];
+
+		setup_timer(&queue->timer, ulog_timer, grp);
+		spin_lock_init(&queue->lock);
+	}
+
+	ebtulognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
+					  EBT_ULOG_MAXNLGROUPS, NULL, NULL,
+					  THIS_MODULE);
+	if (!ebtulognl)
+		return -ENOMEM;
+
+	err = xt_register_target(&ebt_ulog_tg_reg);
+	if (err != 0) {
+		netlink_kernel_release(ebtulognl);
+		return err;
+	}
+
+	nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
+	return 0;
+}
+
+/*
+ * Module exit hook.
+ *
+ * Unregisters the logger and the target, then for every queue stops the
+ * flush timer and frees a still-queued skb, and finally releases the
+ * netlink socket.
+ */
+static void __exit ebt_ulog_fini(void)
+{
+	ebt_ulog_buff_t *ub;
+	int i;
+
+	nf_log_unregister(&ebt_ulog_logger);
+	xt_unregister_target(&ebt_ulog_tg_reg);
+	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
+		ub = &ulog_buffers[i];
+		/*
+		 * del_timer_sync() also waits for a handler that is already
+		 * running on another CPU, so ulog_timer() cannot still be
+		 * executing when the module goes away.  It is called before
+		 * taking the lock because the handler takes the same lock.
+		 */
+		del_timer_sync(&ub->timer);
+		spin_lock_bh(&ub->lock);
+		if (ub->skb) {
+			kfree_skb(ub->skb);
+			ub->skb = NULL;
+		}
+		spin_unlock_bh(&ub->lock);
+	}
+	netlink_kernel_release(ebtulognl);
+}
+
+module_init(ebt_ulog_init);
+module_exit(ebt_ulog_fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("Ebtables: Packet logging to netlink using ULOG");
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
new file mode 100644
index 00000000..eae67bf0
--- /dev/null
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -0,0 +1,181 @@
+/*
+ * Description: EBTables 802.1Q match extension kernelspace module.
+ * Authors: Nick Fedchik <nick@fedchik.org.ua>
+ *          Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/netfilter_bridge/ebt_vlan.h>
+
+#define MODULE_VERS "0.6"
+
+MODULE_AUTHOR("Nick Fedchik <nick@fedchik.org.ua>");
+MODULE_DESCRIPTION("Ebtables: 802.1Q VLAN tag match");
+MODULE_LICENSE("GPL");
+
+/*
+ * Helpers for the functions below; both rely on a local named "info".
+ * GET_BITMASK tests a bit of the rule's bitmask.  EXIT_ON_MISMATCH compares
+ * the local variable _MATCH_ with the rule field of the same name, applies
+ * the inversion flag _MASK_ and returns false from the calling function on
+ * a mismatch.  The expansions are fully parenthesised and the statement
+ * macro is wrapped in do/while(0) so both are safe in any context.
+ */
+#define GET_BITMASK(_BIT_MASK_) (info->bitmask & (_BIT_MASK_))
+#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) do { if (!((info->_MATCH_ == (_MATCH_)) ^ !!(info->invflags & (_MASK_)))) return false; } while (0)
+
+/*
+ * Match callback for 802.1Q tagged frames.
+ *
+ * The TCI and the encapsulated protocol come from the skb's tag metadata
+ * when a tag is present there, otherwise from a struct vlan_hdr read at
+ * offset 0 of the skb data.  VLAN id, priority and encapsulated protocol
+ * are then compared against the rule, each only if enabled in the rule's
+ * bitmask and each subject to its inversion flag.
+ *
+ * The locals info, id, prio and encap must keep these names: the
+ * GET_BITMASK/EXIT_ON_MISMATCH macros refer to them.
+ *
+ * Returns false when the header cannot be read or a test fails.
+ */
+static bool
+ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ebt_vlan_info *info = par->matchinfo;
+	unsigned short tci;	/* whole TCI, host byte order */
+	unsigned short id;	/* VLAN ID part of the TCI */
+	unsigned char prio;	/* user_priority part of the TCI */
+	__be16 encap;		/* encapsulated Type/Length field */
+
+	if (!vlan_tx_tag_present(skb)) {
+		struct vlan_hdr hdr_buf;
+		const struct vlan_hdr *vh;
+
+		vh = skb_header_pointer(skb, 0, sizeof(hdr_buf), &hdr_buf);
+		if (vh == NULL)
+			return false;
+
+		tci = ntohs(vh->h_vlan_TCI);
+		encap = vh->h_vlan_encapsulated_proto;
+	} else {
+		tci = vlan_tx_tag_get(skb);
+		encap = skb->protocol;
+	}
+
+	/* TCI layout: 3 bits user_priority, 1 bit CFI (ignored), then VID */
+	prio = (tci >> 13) & 0x7;
+	id = tci & VLAN_VID_MASK;
+
+	if (GET_BITMASK(EBT_VLAN_ID))
+		EXIT_ON_MISMATCH(id, EBT_VLAN_ID);
+
+	if (GET_BITMASK(EBT_VLAN_PRIO))
+		EXIT_ON_MISMATCH(prio, EBT_VLAN_PRIO);
+
+	if (GET_BITMASK(EBT_VLAN_ENCAP))
+		EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP);
+
+	return true;
+}
+
+/*
+ * Rule validation for the vlan match.
+ *
+ * Requirements:
+ *  - the enclosing entry must select protocol ETH_P_8021Q;
+ *  - bitmask and invflags may only contain EBT_VLAN_MASK bits;
+ *  - a non-zero VLAN id must not exceed VLAN_N_VID; with such an id the
+ *    priority test is switched off (the rule's bitmask is modified);
+ *  - a tested priority must be in 0-7;
+ *  - a tested encapsulated protocol must not be below ETH_ZLEN.
+ *
+ * NOTE(review): the id test accepts a value equal to VLAN_N_VID itself -
+ * confirm whether that upper bound is intended.
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
+{
+	struct ebt_vlan_info *info = par->matchinfo;
+	const struct ebt_entry *entry = par->entryinfo;
+
+	/* the rule must be restricted to 802.1Q frames */
+	if (entry->ethproto != htons(ETH_P_8021Q)) {
+		pr_debug("passed entry proto %2.4X is not 802.1Q (8100)\n",
+			 ntohs(entry->ethproto));
+		return -EINVAL;
+	}
+
+	/* no unknown test bits */
+	if (info->bitmask & ~EBT_VLAN_MASK) {
+		pr_debug("bitmask %2X is out of mask (%2X)\n",
+			 info->bitmask, EBT_VLAN_MASK);
+		return -EINVAL;
+	}
+
+	/* no unknown inversion bits */
+	if (info->invflags & ~EBT_VLAN_MASK) {
+		pr_debug("inversion flags %2X is out of mask (%2X)\n",
+			 info->invflags, EBT_VLAN_MASK);
+		return -EINVAL;
+	}
+
+	/* id == 0 is the null VLAN ID; only non-zero ids are range checked */
+	if ((GET_BITMASK(EBT_VLAN_ID)) && info->id != 0) {
+		if (info->id > VLAN_N_VID) {
+			pr_debug("id %d is out of range (1-4096)\n",
+				 info->id);
+			return -EINVAL;
+		}
+		/* with a real VLAN id the priority is not tested */
+		info->bitmask &= ~EBT_VLAN_PRIO;
+	}
+
+	if ((GET_BITMASK(EBT_VLAN_PRIO)) && (unsigned char) info->prio > 7) {
+		pr_debug("prio %d is out of range (0-7)\n",
+			 info->prio);
+		return -EINVAL;
+	}
+
+	if ((GET_BITMASK(EBT_VLAN_ENCAP)) &&
+	    (unsigned short) ntohs(info->encap) < ETH_ZLEN) {
+		pr_debug("encap frame length %d is less than minimal\n",
+			 ntohs(info->encap));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * x_tables registration record for the bridge-family "vlan" match; the
+ * rule data is a struct ebt_vlan_info.
+ */
+static struct xt_match ebt_vlan_mt_reg __read_mostly = {
+	.name		= "vlan",
+	.revision	= 0,
+	.family		= NFPROTO_BRIDGE,
+	.match		= ebt_vlan_mt,
+	.checkentry	= ebt_vlan_mt_check,
+	.matchsize	= sizeof(struct ebt_vlan_info),
+	.me		= THIS_MODULE,
+};
+
+/*
+ * Module init hook: emit a debug banner with the module version and
+ * register the match; returns xt_register_match()'s result.
+ */
+static int __init ebt_vlan_init(void)
+{
+	pr_debug("ebtables 802.1Q extension module v" MODULE_VERS "\n");
+	return xt_register_match(&ebt_vlan_mt_reg);
+}
+
+/* Module exit hook: undo the registration done in ebt_vlan_init(). */
+static void __exit ebt_vlan_fini(void)
+{
+	xt_unregister_match(&ebt_vlan_mt_reg);
+}
+
+module_init(ebt_vlan_init);
+module_exit(ebt_vlan_fini);
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
new file mode 100644
index 00000000..1bcaf36a
--- /dev/null
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -0,0 +1,104 @@
+/*
+ *  ebtable_broute
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  April, 2002
+ *
+ *  This table lets you choose between routing and bridging for frames
+ *  entering on a bridge enslaved nic. This table is traversed before any
+ *  other ebtables table. See net/bridge/br_input.c.
+ */
+
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/module.h>
+#include <linux/if_bridge.h>
+
+/*
+ * The single built-in chain of the broute table.
+ *
+ * EBT_ACCEPT means the frame will be bridged
+ * EBT_DROP means the frame will be routed
+ * The default policy therefore bridges everything.
+ */
+static struct ebt_entries initial_chain = {
+	.name		= "BROUTING",
+	.policy		= EBT_ACCEPT,
+};
+
+/*
+ * Initial (empty) content of the "broute" table: only the BROUTING hook is
+ * valid and it points at initial_chain, which is also the whole entries
+ * blob.
+ */
+static struct ebt_replace_kernel initial_table =
+{
+	.name		= "broute",
+	.valid_hooks	= 1 << NF_BR_BROUTING,
+	.entries_size	= sizeof(struct ebt_entries),
+	.hook_entry	= {
+		[NF_BR_BROUTING]	= &initial_chain,
+	},
+	.entries	= (char *)&initial_chain,
+};
+
+/*
+ * Table check callback: a replacement table may use no hook other than
+ * BROUTING.  @info is not inspected.
+ *
+ * Returns 0 when acceptable, -EINVAL otherwise.
+ */
+static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
+{
+	unsigned int foreign_hooks = valid_hooks & ~(1 << NF_BR_BROUTING);
+
+	return foreign_hooks ? -EINVAL : 0;
+}
+
+/* Template handed to ebt_register_table() for every network namespace. */
+static const struct ebt_table broute_table =
+{
+	.name		= "broute",
+	.table		= &initial_table,
+	.valid_hooks	= 1 << NF_BR_BROUTING,
+	.check		= check,
+	.me		= THIS_MODULE,
+};
+
+/*
+ * Hook installed as br_should_route_hook: run the frame through the broute
+ * table of the receiving device's network namespace.
+ *
+ * Returns 1 when the table's verdict is NF_DROP (route the frame) and 0
+ * for any other verdict (bridge it).
+ */
+static int ebt_broute(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	int verdict = ebt_do_table(NF_BR_BROUTING, skb, dev, NULL,
+				   dev_net(dev)->xt.broute_table);
+
+	return verdict == NF_DROP ? 1 : 0;
+}
+
+/*
+ * Per-namespace init: register the broute table and remember it in
+ * net->xt.broute_table.
+ *
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int __net_init broute_net_init(struct net *net)
+{
+	net->xt.broute_table = ebt_register_table(net, &broute_table);
+
+	return IS_ERR(net->xt.broute_table) ?
+	       PTR_ERR(net->xt.broute_table) : 0;
+}
+
+/* Per-namespace exit: unregister the table stored by broute_net_init(). */
+static void __net_exit broute_net_exit(struct net *net)
+{
+	ebt_unregister_table(net, net->xt.broute_table);
+}
+
+/* per-network-namespace setup/teardown callbacks for the broute table */
+static struct pernet_operations broute_net_ops = {
+	.init = broute_net_init,
+	.exit = broute_net_exit,
+};
+
+/*
+ * Module init hook: register the per-namespace operations and, only if
+ * that worked, publish ebt_broute() as the bridge's should-route hook.
+ *
+ * Returns 0 on success or the negative error from
+ * register_pernet_subsys().
+ */
+static int __init ebtable_broute_init(void)
+{
+	int err = register_pernet_subsys(&broute_net_ops);
+
+	if (err >= 0) {
+		/* see br_input.c */
+		rcu_assign_pointer(br_should_route_hook,
+				   (br_should_route_hook_t *)ebt_broute);
+		err = 0;
+	}
+	return err;
+}
+
+/*
+ * Module exit hook.  The hook pointer is cleared first and
+ * synchronize_net() is called before the per-namespace tables are
+ * unregistered.
+ */
+static void __exit ebtable_broute_fini(void)
+{
+	rcu_assign_pointer(br_should_route_hook, NULL);
+	synchronize_net();
+	unregister_pernet_subsys(&broute_net_ops);
+}
+
+module_init(ebtable_broute_init);
+module_exit(ebtable_broute_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
new file mode 100644
index 00000000..42e6bd09
--- /dev/null
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -0,0 +1,139 @@
+/*
+ *  ebtable_filter
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  April, 2002
+ *
+ */
+
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/module.h>
+
+#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
+   (1 << NF_BR_LOCAL_OUT))
+
+/*
+ * Built-in chains of the filter table, all with policy ACCEPT.  The order
+ * matters: initial_table.hook_entry refers to them by index.
+ */
+static struct ebt_entries initial_chains[] =
+{
+	{
+		.name	= "INPUT",
+		.policy	= EBT_ACCEPT,
+	},
+	{
+		.name	= "FORWARD",
+		.policy	= EBT_ACCEPT,
+	},
+	{
+		.name	= "OUTPUT",
+		.policy	= EBT_ACCEPT,
+	},
+};
+
+/*
+ * Initial (empty) content of the "filter" table: maps the LOCAL_IN,
+ * FORWARD and LOCAL_OUT hooks to the three chains in initial_chains.
+ */
+static struct ebt_replace_kernel initial_table =
+{
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.entries_size	= 3 * sizeof(struct ebt_entries),
+	.hook_entry	= {
+		[NF_BR_LOCAL_IN]	= &initial_chains[0],
+		[NF_BR_FORWARD]		= &initial_chains[1],
+		[NF_BR_LOCAL_OUT]	= &initial_chains[2],
+	},
+	.entries	= (char *)initial_chains,
+};
+
+/*
+ * Table check callback: a replacement table may only use hooks contained
+ * in FILTER_VALID_HOOKS.  @info is not inspected.
+ *
+ * Returns 0 when acceptable, -EINVAL otherwise.
+ */
+static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
+{
+	unsigned int foreign_hooks = valid_hooks & ~FILTER_VALID_HOOKS;
+
+	return foreign_hooks ? -EINVAL : 0;
+}
+
+/* Template handed to ebt_register_table() for every network namespace. */
+static const struct ebt_table frame_filter =
+{
+	.name		= "filter",
+	.table		= &initial_table,
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.check		= check,
+	.me		= THIS_MODULE,
+};
+
+/*
+ * Netfilter hook for LOCAL_IN and FORWARD: traverse the filter table of the
+ * input device's network namespace and return its verdict.
+ */
+static unsigned int
+ebt_in_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, int (*okfn)(struct sk_buff *))
+{
+	return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_filter);
+}
+
+/*
+ * Netfilter hook for LOCAL_OUT: same as ebt_in_hook() but the network
+ * namespace is taken from the output device.
+ */
+static unsigned int
+ebt_out_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, int (*okfn)(struct sk_buff *))
+{
+	return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_filter);
+}
+
+/*
+ * Hook registrations for the filter table: LOCAL_IN and FORWARD use the
+ * input-device variant, LOCAL_OUT the output-device variant.
+ */
+static struct nf_hook_ops ebt_ops_filter[] __read_mostly = {
+	{
+		.hook		= ebt_in_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_LOCAL_IN,
+		.priority	= NF_BR_PRI_FILTER_BRIDGED,
+	},
+	{
+		.hook		= ebt_in_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_FORWARD,
+		.priority	= NF_BR_PRI_FILTER_BRIDGED,
+	},
+	{
+		.hook		= ebt_out_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_LOCAL_OUT,
+		.priority	= NF_BR_PRI_FILTER_OTHER,
+	},
+};
+
+/*
+ * Per-namespace init: register the filter table and remember it in
+ * net->xt.frame_filter.
+ *
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int __net_init frame_filter_net_init(struct net *net)
+{
+	net->xt.frame_filter = ebt_register_table(net, &frame_filter);
+
+	return IS_ERR(net->xt.frame_filter) ?
+	       PTR_ERR(net->xt.frame_filter) : 0;
+}
+
+/* Per-namespace exit: unregister the table stored by frame_filter_net_init(). */
+static void __net_exit frame_filter_net_exit(struct net *net)
+{
+	ebt_unregister_table(net, net->xt.frame_filter);
+}
+
+/* per-network-namespace setup/teardown callbacks for the filter table */
+static struct pernet_operations frame_filter_net_ops = {
+	.init = frame_filter_net_init,
+	.exit = frame_filter_net_exit,
+};
+
+/*
+ * Module init hook: register the per-namespace operations, then the
+ * netfilter hooks.  If hook registration fails the per-namespace
+ * registration is rolled back.
+ *
+ * Returns the (negative) error of the failing step, otherwise the result
+ * of nf_register_hooks().
+ */
+static int __init ebtable_filter_init(void)
+{
+	int err = register_pernet_subsys(&frame_filter_net_ops);
+
+	if (err < 0)
+		return err;
+
+	err = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
+	if (err >= 0)
+		return err;
+
+	unregister_pernet_subsys(&frame_filter_net_ops);
+	return err;
+}
+
+/*
+ * Module exit hook: tear down in reverse order of ebtable_filter_init() -
+ * hooks first, then the per-namespace tables.
+ */
+static void __exit ebtable_filter_fini(void)
+{
+	nf_unregister_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
+	unregister_pernet_subsys(&frame_filter_net_ops);
+}
+
+module_init(ebtable_filter_init);
+module_exit(ebtable_filter_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
new file mode 100644
index 00000000..6dc2f878
--- /dev/null
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -0,0 +1,139 @@
+/*
+ *  ebtable_nat
+ *
+ *	Authors:
+ *	Bart De Schuymer <bdschuym@pandora.be>
+ *
+ *  April, 2002
+ *
+ */
+
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/module.h>
+
+/* bitmask of the bridge hook points the nat table may attach to */
+#define NAT_VALID_HOOKS				\
+	((1 << NF_BR_PRE_ROUTING) |		\
+	 (1 << NF_BR_LOCAL_OUT) |		\
+	 (1 << NF_BR_POST_ROUTING))
+
+/*
+ * The three built-in chains of the initial nat table.  All of them start
+ * with policy EBT_ACCEPT; the remaining fields are zero-initialised.
+ * initial_table.hook_entry[] refers to these slots by index.
+ */
+static struct ebt_entries initial_chains[] =
+{
+	[0] = {
+		.policy	= EBT_ACCEPT,
+		.name	= "PREROUTING",
+	},
+	[1] = {
+		.policy	= EBT_ACCEPT,
+		.name	= "OUTPUT",
+	},
+	[2] = {
+		.policy	= EBT_ACCEPT,
+		.name	= "POSTROUTING",
+	}
+};
+
+/*
+ * Initial contents of the nat table: nothing but the three chain headers
+ * of initial_chains[], each wired to its hook point.
+ */
+static struct ebt_replace_kernel initial_table =
+{
+	.name		= "nat",
+	.valid_hooks	= NAT_VALID_HOOKS,
+	.entries	= (char *)initial_chains,
+	/* initial_chains[] holds exactly three struct ebt_entries */
+	.entries_size	= sizeof(initial_chains),
+	.hook_entry	= {
+		[NF_BR_PRE_ROUTING]	= &initial_chains[0],
+		[NF_BR_LOCAL_OUT]	= &initial_chains[1],
+		[NF_BR_POST_ROUTING]	= &initial_chains[2],
+	},
+};
+
+/*
+ * Table-specific validation callback: reject a ruleset whose @valid_hooks
+ * contains any bit outside NAT_VALID_HOOKS.  @info is not inspected.
+ * Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
+{
+	const unsigned int unsupported = valid_hooks & ~NAT_VALID_HOOKS;
+
+	return unsupported ? -EINVAL : 0;
+}
+
+/* template describing the "nat" table; registered once per namespace */
+static struct ebt_table frame_nat =
+{
+	.name		= "nat",
+	.me		= THIS_MODULE,
+	.valid_hooks	= NAT_VALID_HOOKS,
+	.table		= &initial_table,
+	.check		= check,
+};
+
+/*
+ * Hook function that evaluates the nat table of the network namespace
+ * owning the incoming device @in.  @okfn is not used.
+ */
+static unsigned int
+ebt_nat_in(unsigned int hook, struct sk_buff *skb, const struct net_device *in
+   , const struct net_device *out, int (*okfn)(struct sk_buff *))
+{
+	struct ebt_table *nat = dev_net(in)->xt.frame_nat;
+
+	return ebt_do_table(hook, skb, in, out, nat);
+}
+
+/*
+ * Hook function that evaluates the nat table of the network namespace
+ * owning the outgoing device @out.  @okfn is not used.
+ */
+static unsigned int
+ebt_nat_out(unsigned int hook, struct sk_buff *skb, const struct net_device *in
+   , const struct net_device *out, int (*okfn)(struct sk_buff *))
+{
+	struct ebt_table *nat = dev_net(out)->xt.frame_nat;
+
+	return ebt_do_table(hook, skb, in, out, nat);
+}
+
+/*
+ * Netfilter hook registrations for the nat table.  NF_BR_LOCAL_OUT and
+ * NF_BR_POST_ROUTING resolve the table through the out device,
+ * NF_BR_PRE_ROUTING through the in device.
+ */
+static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
+	{
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_LOCAL_OUT,
+		.priority	= NF_BR_PRI_NAT_DST_OTHER,
+		.hook		= ebt_nat_out,
+		.owner		= THIS_MODULE,
+	},
+	{
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_POST_ROUTING,
+		.priority	= NF_BR_PRI_NAT_SRC,
+		.hook		= ebt_nat_out,
+		.owner		= THIS_MODULE,
+	},
+	{
+		.pf		= NFPROTO_BRIDGE,
+		.hooknum	= NF_BR_PRE_ROUTING,
+		.priority	= NF_BR_PRI_NAT_DST_BRIDGED,
+		.hook		= ebt_nat_in,
+		.owner		= THIS_MODULE,
+	},
+};
+
+/*
+ * Per-namespace setup: register the nat table for @net and keep the result
+ * in net->xt.frame_nat.  Returns 0, or the error encoded in the pointer
+ * returned by ebt_register_table().
+ */
+static int __net_init frame_nat_net_init(struct net *net)
+{
+	struct ebt_table *tbl = ebt_register_table(net, &frame_nat);
+
+	/* the slot is written even on failure, exactly as returned */
+	net->xt.frame_nat = tbl;
+	return IS_ERR(tbl) ? PTR_ERR(tbl) : 0;
+}
+
+/* Per-namespace teardown: unregister the nat table stored for @net. */
+static void __net_exit frame_nat_net_exit(struct net *net)
+{
+	struct ebt_table *tbl = net->xt.frame_nat;
+
+	ebt_unregister_table(net, tbl);
+}
+
+/* per-network-namespace init/exit callbacks for the nat table */
+static struct pernet_operations frame_nat_net_ops = {
+	.exit = frame_nat_net_exit,
+	.init = frame_nat_net_init,
+};
+
+/*
+ * Module init: register the per-namespace operations first, then the
+ * netfilter hooks.  If the hooks cannot be registered the per-namespace
+ * registration is rolled back.  Returns 0 or a negative error code.
+ */
+static int __init ebtable_nat_init(void)
+{
+	int err = register_pernet_subsys(&frame_nat_net_ops);
+
+	if (err < 0)
+		return err;
+
+	err = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
+	if (err >= 0)
+		return err;
+
+	/* hook registration failed: undo the first step */
+	unregister_pernet_subsys(&frame_nat_net_ops);
+	return err;
+}
+
+/*
+ * Module exit: tear down in the reverse order of ebtable_nat_init(),
+ * i.e. remove the hooks before the per-namespace tables go away.
+ */
+static void __exit ebtable_nat_fini(void)
+{
+	const unsigned int nhooks = ARRAY_SIZE(ebt_ops_nat);
+
+	nf_unregister_hooks(ebt_ops_nat, nhooks);
+	unregister_pernet_subsys(&frame_nat_net_ops);
+}
+
+/* module entry/exit points and licence declaration */
+module_init(ebtable_nat_init);
+module_exit(ebtable_nat_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
new file mode 100644
index 00000000..2b5ca1a0
--- /dev/null
+++ b/net/bridge/netfilter/ebtables.c
@@ -0,0 +1,2416 @@
+/*
+ *  ebtables
+ *
+ *  Author:
+ *  Bart De Schuymer		<bdschuym@pandora.be>
+ *
+ *  ebtables.c,v 2.0, July, 2002
+ *
+ *  This code is strongly inspired by the iptables code which is
+ *  Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <net/sock.h>
+/* needed for logical [in,out]-dev filtering */
+#include "../br_private.h"
+
+/*
+ * Print a diagnostic through printk(), prefixed with a request to report
+ * the problem to the author.  No log level is given and no newline is
+ * appended: callers supply the trailing "\n" in @format.
+ * The commented-out empty definition below compiles the messages out.
+ */
+#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
+					 "report to author: "format, ## args)
+/* #define BUGPRINT(format, args...) */
+
+/*
+ * Each cpu has its own set of counters, so there is no need for write_lock in
+ * the softirq
+ * For reading or updating the counters, the user context needs to
+ * get a write_lock
+ */
+
+/*
+ * The size of each set of counters is altered to get cache alignment.
+ *
+ * SMP_ALIGN(x)            - round x up to a multiple of SMP_CACHE_BYTES
+ *                           (the mask trick assumes a power of two)
+ * COUNTER_OFFSET(n)       - byte size of one cpu's block of n counters
+ * COUNTER_BASE(c, n, cpu) - first counter of cpu's block inside area c
+ *
+ * Every macro argument is parenthesised so that an expression such as
+ * "a + b" can be passed without changing the meaning of the expansion.
+ */
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define COUNTER_OFFSET(n) (SMP_ALIGN((n) * sizeof(struct ebt_counter)))
+#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)(c)) + \
+   COUNTER_OFFSET(n) * (cpu)))
+
+
+
+/*
+ * Mutex handed to the list lookup helpers below; a successful lookup
+ * returns with it held.  do_replace_finish() relies on it to read and
+ * replace a table's private data from user context.
+ */
+static DEFINE_MUTEX(ebt_mutex);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Convert a standard-target verdict from the compat (32 bit) layout at @src
+ * into a native int stored at @dst.  Non-negative verdicts are adjusted by
+ * xt_compat_calc_jump(); negative ones are copied unchanged.
+ */
+static void ebt_standard_compat_from_user(void *dst, const void *src)
+{
+	const compat_int_t *compat_verdict = src;
+	int verdict = *compat_verdict;
+
+	if (verdict >= 0)
+		verdict += xt_compat_calc_jump(NFPROTO_BRIDGE, verdict);
+	memcpy(dst, &verdict, sizeof(verdict));
+}
+
+/*
+ * Convert the native standard-target verdict at @src to the compat layout
+ * and copy it to the user buffer @dst.  Non-negative verdicts are adjusted
+ * by xt_compat_calc_jump() first.  Returns 0, or -EFAULT if the copy fails.
+ */
+static int ebt_standard_compat_to_user(void __user *dst, const void *src)
+{
+	const int *native_verdict = src;
+	compat_int_t cv = *native_verdict;
+
+	if (cv >= 0)
+		cv -= xt_compat_calc_jump(NFPROTO_BRIDGE, cv);
+	if (copy_to_user(dst, &cv, sizeof(cv)))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+
+/*
+ * The built-in "standard" target.  It deliberately has no ->target
+ * callback: ebt_do_table() recognises the missing callback and reads the
+ * verdict straight from the rule's struct ebt_standard_target.
+ */
+static struct xt_target ebt_standard_target = {
+	.family     = NFPROTO_BRIDGE,
+	.name       = "standard",
+	.revision   = 0,
+	.targetsize = sizeof(int),
+#ifdef CONFIG_COMPAT
+	.compat_to_user =  ebt_standard_compat_to_user,
+	.compat_from_user = ebt_standard_compat_from_user,
+	.compatsize = sizeof(compat_int_t),
+#endif
+};
+
+/*
+ * Run one watcher on @skb.  @par is updated to describe the watcher before
+ * its ->target callback is invoked.  The callback's return value is
+ * discarded, so this always returns 0.
+ */
+static inline int
+ebt_do_watcher(const struct ebt_entry_watcher *w, struct sk_buff *skb,
+	       struct xt_action_param *par)
+{
+	const struct xt_target *watcher = w->u.watcher;
+
+	par->targinfo = w->data;
+	par->target   = watcher;
+	/* watchers don't give a verdict */
+	watcher->target(skb, par);
+	return 0;
+}
+
+/*
+ * Run one match extension on @skb.  @par is updated to describe the match
+ * before its ->match callback is invoked.  Returns EBT_MATCH when the
+ * callback reports a hit and EBT_NOMATCH otherwise.
+ */
+static inline int
+ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
+	     struct xt_action_param *par)
+{
+	const struct xt_match *match = m->u.match;
+
+	par->matchinfo = m->data;
+	par->match     = match;
+	if (match->match(skb, par))
+		return EBT_MATCH;
+	return EBT_NOMATCH;
+}
+
+/*
+ * Compare the device name pattern @entry with the name of @device.
+ * A byte with value 1 inside @entry is a wildcard: everything before it
+ * must equal the device name, the rest is ignored.
+ *
+ * Returns 0 when the pattern is empty or matches, 1 when there is no
+ * device to compare against or the names differ.
+ */
+static inline int
+ebt_dev_check(const char *entry, const struct net_device *device)
+{
+	const char *devname;
+	int pos;
+
+	/* an empty pattern accepts anything, even a missing device */
+	if (*entry == '\0')
+		return 0;
+	if (!device)
+		return 1;
+
+	devname = device->name;
+	for (pos = 0; entry[pos] != '\0' && entry[pos] != 1; pos++) {
+		if (entry[pos] != devname[pos])
+			break;
+	}
+	/* 1 is the wildcard token: the prefix matched, that is enough */
+	if (entry[pos] == 1)
+		return 0;
+	return devname[pos] != entry[pos];
+}
+
+/*
+ * XOR a test result with the rule's inversion flag @invflg.  Expands to a
+ * reference to a variable named "e" (the rule) in the surrounding scope.
+ */
+#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg))
+/*
+ * process standard matches
+ *
+ * Checks the fixed part of rule @e against the frame: ethernet protocol,
+ * in/out device names, logical (bridge) in/out device names and the masked
+ * source/destination MAC addresses.  Each test honours its inversion flag.
+ *
+ * Returns 0 when every test passes (the rule applies so far) and 1 as soon
+ * as one test fails.
+ */
+static inline int
+ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
+                const struct net_device *in, const struct net_device *out)
+{
+	const struct ethhdr *h = eth_hdr(skb);
+	const struct net_bridge_port *p;
+	__be16 ethproto;
+	int verdict, i;
+
+	/* when a vlan tag is reported for the skb, compare against 802.1Q
+	   instead of the protocol field of the ethernet header */
+	if (vlan_tx_tag_present(skb))
+		ethproto = htons(ETH_P_8021Q);
+	else
+		ethproto = h->h_proto;
+
+	if (e->bitmask & EBT_802_3) {
+		/* EBT_802_3 rules want a protocol/length field below 1536 */
+		if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO))
+			return 1;
+	} else if (!(e->bitmask & EBT_NOPROTO) &&
+	   FWINV2(e->ethproto != ethproto, EBT_IPROTO))
+		return 1;
+
+	if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
+		return 1;
+	if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
+		return 1;
+	/* rcu_read_lock()ed by nf_hook_slow */
+	/* the logical device tests are skipped when the device is absent or
+	   br_port_get_rcu() returns NULL for it */
+	if (in && (p = br_port_get_rcu(in)) != NULL &&
+	    FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN))
+		return 1;
+	if (out && (p = br_port_get_rcu(out)) != NULL &&
+	    FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT))
+		return 1;
+
+	if (e->bitmask & EBT_SOURCEMAC) {
+		/* verdict collects every masked bit that differs */
+		verdict = 0;
+		for (i = 0; i < 6; i++)
+			verdict |= (h->h_source[i] ^ e->sourcemac[i]) &
+			   e->sourcemsk[i];
+		if (FWINV2(verdict != 0, EBT_ISOURCE) )
+			return 1;
+	}
+	if (e->bitmask & EBT_DESTMAC) {
+		verdict = 0;
+		for (i = 0; i < 6; i++)
+			verdict |= (h->h_dest[i] ^ e->destmac[i]) &
+			   e->destmsk[i];
+		if (FWINV2(verdict != 0, EBT_IDEST) )
+			return 1;
+	}
+	return 0;
+}
+
+/* Return the element that starts entry->next_offset bytes after @entry. */
+static inline __pure
+struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
+{
+	char *here = (char *)entry;
+
+	return (struct ebt_entry *)(here + entry->next_offset);
+}
+
+/*
+ * Do some firewalling
+ *
+ * Walk the chain of @table attached to @hook and decide the fate of @skb.
+ * @in and @out are the devices passed on to the matches and targets.
+ *
+ * The traversal runs under read_lock_bh(&table->lock), which is dropped on
+ * every exit.  Counters and the chain stack used for jumps into user
+ * defined chains (udc) are taken from the current cpu's slot.
+ *
+ * Returns NF_ACCEPT or NF_DROP.
+ */
+unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
+   const struct net_device *in, const struct net_device *out,
+   struct ebt_table *table)
+{
+	int i, nentries;
+	struct ebt_entry *point;
+	struct ebt_counter *counter_base, *cb_base;
+	const struct ebt_entry_target *t;
+	int verdict, sp = 0;
+	struct ebt_chainstack *cs;
+	struct ebt_entries *chaininfo;
+	const char *base;
+	const struct ebt_table_info *private;
+	struct xt_action_param acpar;
+
+	acpar.family  = NFPROTO_BRIDGE;
+	acpar.in      = in;
+	acpar.out     = out;
+	acpar.hotdrop = false;
+	acpar.hooknum = hook;
+
+	read_lock_bh(&table->lock);
+	private = table->private;
+	/* this cpu's block of counters */
+	cb_base = COUNTER_BASE(private->counters, private->nentries,
+	   smp_processor_id());
+	/* no chainstack is allocated when the table has no udc */
+	if (private->chainstack)
+		cs = private->chainstack[smp_processor_id()];
+	else
+		cs = NULL;
+	chaininfo = private->hook_entry[hook];
+	nentries = private->hook_entry[hook]->nentries;
+	point = (struct ebt_entry *)(private->hook_entry[hook]->data);
+	counter_base = cb_base + private->hook_entry[hook]->counter_offset;
+	/* base for chain jumps */
+	base = private->entries;
+	i = 0;
+	/* i counts the rules visited in the current chain, point is the rule */
+	while (i < nentries) {
+		if (ebt_basic_match(point, skb, in, out))
+			goto letscontinue;
+
+		if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
+			goto letscontinue;
+		/* a match extension may request an immediate drop */
+		if (acpar.hotdrop) {
+			read_unlock_bh(&table->lock);
+			return NF_DROP;
+		}
+
+		/* increase counter */
+		(*(counter_base + i)).pcnt++;
+		(*(counter_base + i)).bcnt += skb->len;
+
+		/* these should only watch: not modify, nor tell us
+		   what to do with the packet */
+		EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
+
+		t = (struct ebt_entry_target *)
+		   (((char *)point) + point->target_offset);
+		/* standard target */
+		if (!t->u.target->target)
+			verdict = ((struct ebt_standard_target *)t)->verdict;
+		else {
+			acpar.target   = t->u.target;
+			acpar.targinfo = t->data;
+			verdict = t->u.target->target(skb, &acpar);
+		}
+		if (verdict == EBT_ACCEPT) {
+			read_unlock_bh(&table->lock);
+			return NF_ACCEPT;
+		}
+		if (verdict == EBT_DROP) {
+			read_unlock_bh(&table->lock);
+			return NF_DROP;
+		}
+		if (verdict == EBT_RETURN) {
+			/* also entered from below the loop when a chain with
+			   policy EBT_RETURN runs out of rules */
+letsreturn:
+#ifdef CONFIG_NETFILTER_DEBUG
+			if (sp == 0) {
+				BUGPRINT("RETURN on base chain");
+				/* act like this is EBT_CONTINUE */
+				goto letscontinue;
+			}
+#endif
+			/* pop the position saved when the jump was made */
+			sp--;
+			/* put all the local variables right */
+			i = cs[sp].n;
+			chaininfo = cs[sp].chaininfo;
+			nentries = chaininfo->nentries;
+			point = cs[sp].e;
+			counter_base = cb_base +
+			   chaininfo->counter_offset;
+			continue;
+		}
+		if (verdict == EBT_CONTINUE)
+			goto letscontinue;
+#ifdef CONFIG_NETFILTER_DEBUG
+		if (verdict < 0) {
+			BUGPRINT("bogus standard verdict\n");
+			read_unlock_bh(&table->lock);
+			return NF_DROP;
+		}
+#endif
+		/* jump to a udc */
+		/* remember where to resume: the rule after this one */
+		cs[sp].n = i + 1;
+		cs[sp].chaininfo = chaininfo;
+		cs[sp].e = ebt_next_entry(point);
+		i = 0;
+		/* the verdict is the byte offset of the udc from base */
+		chaininfo = (struct ebt_entries *) (base + verdict);
+#ifdef CONFIG_NETFILTER_DEBUG
+		if (chaininfo->distinguisher) {
+			BUGPRINT("jump to non-chain\n");
+			read_unlock_bh(&table->lock);
+			return NF_DROP;
+		}
+#endif
+		nentries = chaininfo->nentries;
+		point = (struct ebt_entry *)chaininfo->data;
+		counter_base = cb_base + chaininfo->counter_offset;
+		sp++;
+		continue;
+letscontinue:
+		point = ebt_next_entry(point);
+		i++;
+	}
+
+	/* end of the chain reached without a verdict: apply its policy */
+	/* I actually like this :) */
+	if (chaininfo->policy == EBT_RETURN)
+		goto letsreturn;
+	if (chaininfo->policy == EBT_ACCEPT) {
+		read_unlock_bh(&table->lock);
+		return NF_ACCEPT;
+	}
+	read_unlock_bh(&table->lock);
+	return NF_DROP;
+}
+
+/*
+ * Look up @name in the list @head.  Every list element is expected to
+ * begin with a struct list_head directly followed by its name.
+ *
+ * On success the element is returned and @mutex stays locked.  On failure
+ * NULL is returned with @mutex unlocked and *@error set to the result of
+ * mutex_lock_interruptible() or to -ENOENT when nothing matched.
+ */
+static inline void *
+find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
+   struct mutex *mutex)
+{
+	struct {
+		struct list_head list;
+		char name[EBT_FUNCTION_MAXNAMELEN];
+	} *pos;
+
+	*error = mutex_lock_interruptible(mutex);
+	if (*error != 0)
+		return NULL;
+
+	list_for_each_entry(pos, head, list)
+		if (!strcmp(pos->name, name))
+			return pos;
+
+	/* not found: give the lock back before reporting the miss */
+	mutex_unlock(mutex);
+	*error = -ENOENT;
+	return NULL;
+}
+
+/*
+ * Like find_inlist_lock_noload(), but wrapped in try_then_request_module()
+ * with the module name "<prefix><name>".
+ * NOTE(review): presumably the macro requests that module and repeats the
+ * lookup when the first attempt returns NULL - confirm in <linux/kmod.h>.
+ * The lookup is passed as an expression, so it must not be hoisted into a
+ * local variable.
+ */
+static void *
+find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
+   int *error, struct mutex *mutex)
+{
+	return try_then_request_module(
+			find_inlist_lock_noload(head, name, error, mutex),
+			"%s%s", prefix, name);
+}
+
+/*
+ * Find the bridge table called @name in @net, using "ebtable_" as module
+ * name prefix.  Locking and error reporting are those of find_inlist_lock().
+ */
+static inline struct ebt_table *
+find_table_lock(struct net *net, const char *name, int *error,
+		struct mutex *mutex)
+{
+	struct list_head *tables = &net->xt.tables[NFPROTO_BRIDGE];
+
+	return find_inlist_lock(tables, name, "ebtable_", error, mutex);
+}
+
+/*
+ * Validate one match of the rule in par->entryinfo and bind it to its
+ * extension: the match must fit in front of the rule's watchers, the
+ * extension named m->u.name must exist, and xt_check_match() must accept it.
+ * On success m->u.match points to the extension and *@cnt is incremented.
+ * Returns 0 or a negative error code; on error the extension's module
+ * reference is dropped again.
+ */
+static inline int
+ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
+		unsigned int *cnt)
+{
+	const struct ebt_entry *entry = par->entryinfo;
+	const char *matches_end = (const char *)entry + entry->watchers_offset;
+	size_t room = matches_end - (const char *)m;
+	struct xt_match *ext;
+	int err;
+
+	/* header first, then payload; checked separately to avoid wrap-around */
+	if (room < sizeof(*m))
+		return -EINVAL;
+	if (room - sizeof(*m) < m->match_size)
+		return -EINVAL;
+
+	ext = xt_request_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+	if (IS_ERR(ext))
+		return PTR_ERR(ext);
+	m->u.match = ext;
+
+	par->matchinfo = m->data;
+	par->match     = ext;
+	err = xt_check_match(par, m->match_size,
+	      entry->ethproto, entry->invflags & EBT_IPROTO);
+	if (err < 0) {
+		module_put(ext->me);
+		return err;
+	}
+
+	++*cnt;
+	return 0;
+}
+
+/*
+ * Validate one watcher of the rule in par->entryinfo and bind it to its
+ * extension: the watcher must fit in front of the rule's target, the
+ * extension named w->u.name must exist, and xt_check_target() must accept
+ * it.  On success w->u.watcher points to the extension and *@cnt is
+ * incremented.  Returns 0 or a negative error code; on error the
+ * extension's module reference is dropped again.
+ */
+static inline int
+ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
+		  unsigned int *cnt)
+{
+	const struct ebt_entry *entry = par->entryinfo;
+	const char *watchers_end = (const char *)entry + entry->target_offset;
+	size_t room = watchers_end - (const char *)w;
+	struct xt_target *ext;
+	int err;
+
+	/* header first, then payload; checked separately to avoid wrap-around */
+	if (room < sizeof(*w))
+		return -EINVAL;
+	if (room - sizeof(*w) < w->watcher_size)
+		return -EINVAL;
+
+	ext = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0);
+	if (IS_ERR(ext))
+		return PTR_ERR(ext);
+	w->u.watcher = ext;
+
+	par->targinfo = w->data;
+	par->target   = ext;
+	err = xt_check_target(par, w->watcher_size,
+	      entry->ethproto, entry->invflags & EBT_IPROTO);
+	if (err < 0) {
+		module_put(ext->me);
+		return err;
+	}
+
+	++*cnt;
+	return 0;
+}
+
+/*
+ * First structural pass over the blob in newinfo->entries (repl->entries_size
+ * bytes).  Copies entries_size and nentries from @repl into @newinfo, steps
+ * through the blob element by element and fills newinfo->hook_entry[] with
+ * the kernel address of every chain header that userspace designated as the
+ * start of a valid hook.
+ *
+ * Returns 0 when the elements exactly cover the blob and every hook in
+ * repl->valid_hooks got a chain, -EINVAL otherwise.
+ */
+static int ebt_verify_pointers(const struct ebt_replace *repl,
+			       struct ebt_table_info *newinfo)
+{
+	unsigned int limit = repl->entries_size;
+	unsigned int valid_hooks = repl->valid_hooks;
+	unsigned int offset = 0;
+	int i;
+
+	for (i = 0; i < NF_BR_NUMHOOKS; i++)
+		newinfo->hook_entry[i] = NULL;
+
+	newinfo->entries_size = repl->entries_size;
+	newinfo->nentries = repl->nentries;
+
+	while (offset < limit) {
+		size_t left = limit - offset;
+		struct ebt_entry *e = (void *)newinfo->entries + offset;
+
+		/* e->bitmask is read below, make sure it is inside the blob */
+		if (left < sizeof(unsigned int))
+			break;
+
+		/* does a valid hook point at this offset?  The comparison is
+		   done on the userspace addresses supplied in repl */
+		for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+			if ((valid_hooks & (1 << i)) == 0)
+				continue;
+			if ((char __user *)repl->hook_entry[i] ==
+			     repl->entries + offset)
+				break;
+		}
+
+		/* a hook start or a cleared EBT_ENTRY_OR_ENTRIES bit means this
+		   element is a chain header (struct ebt_entries) */
+		if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) {
+			if (e->bitmask != 0) {
+				/* we make userspace set this right,
+				   so there is no misunderstanding */
+				BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set "
+					 "in distinguisher\n");
+				return -EINVAL;
+			}
+			if (i != NF_BR_NUMHOOKS)
+				newinfo->hook_entry[i] = (struct ebt_entries *)e;
+			if (left < sizeof(struct ebt_entries))
+				break;
+			offset += sizeof(struct ebt_entries);
+		} else {
+			/* a rule: it states its own size in next_offset */
+			if (left < sizeof(struct ebt_entry))
+				break;
+			if (left < e->next_offset)
+				break;
+			if (e->next_offset < sizeof(struct ebt_entry))
+				return -EINVAL;
+			offset += e->next_offset;
+		}
+	}
+	/* every early break above leaves offset short of limit */
+	if (offset != limit) {
+		BUGPRINT("entries_size too small\n");
+		return -EINVAL;
+	}
+
+	/* check if all valid hooks have a chain */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		if (!newinfo->hook_entry[i] &&
+		   (valid_hooks & (1 << i))) {
+			BUGPRINT("Valid hook without chain\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/*
+ * this one is very careful, as it is the first function
+ * to parse the userspace data
+ *
+ * Called for every element of the table.  For a chain header it verifies
+ * that the previous chain held as many rules as announced, that the policy
+ * is allowed and that counter_offset equals the number of rules seen so
+ * far; for a rule it verifies the ordering of the offsets and the minimum
+ * target size.
+ *
+ * @n:        in/out, number of rules announced by the current chain
+ * @cnt:      in/out, number of rules seen in the current chain
+ * @totalcnt: in/out, number of rules seen in the whole table
+ * @udc_cnt:  in/out, number of user defined chains seen
+ *
+ * Returns 0 or -EINVAL.
+ */
+static inline int
+ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
+   const struct ebt_table_info *newinfo,
+   unsigned int *n, unsigned int *cnt,
+   unsigned int *totalcnt, unsigned int *udc_cnt)
+{
+	/* only meaningful once e is known to be a chain header */
+	const struct ebt_entries *chain = (const struct ebt_entries *)e;
+	int i;
+
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		if ((void *)e == (void *)newinfo->hook_entry[i])
+			break;
+	}
+	/* beginning of a new chain
+	   if i == NF_BR_NUMHOOKS it must be a user defined chain */
+	if (i != NF_BR_NUMHOOKS || !e->bitmask) {
+		/* this checks if the previous chain has as many entries
+		   as it said it has */
+		if (*n != *cnt) {
+			BUGPRINT("nentries does not equal the nr of entries "
+				 "in the chain\n");
+			return -EINVAL;
+		}
+		if (chain->policy != EBT_DROP &&
+		   chain->policy != EBT_ACCEPT) {
+			/* only RETURN from udc */
+			if (i != NF_BR_NUMHOOKS ||
+			   chain->policy != EBT_RETURN) {
+				BUGPRINT("bad policy\n");
+				return -EINVAL;
+			}
+		}
+		if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */
+			(*udc_cnt)++;
+		if (chain->counter_offset != *totalcnt) {
+			/* newline-terminated like every other message here */
+			BUGPRINT("counter_offset != totalcnt\n");
+			return -EINVAL;
+		}
+		*n = chain->nentries;
+		*cnt = 0;
+		return 0;
+	}
+	/* a plain old entry, heh */
+	if (sizeof(struct ebt_entry) > e->watchers_offset ||
+	   e->watchers_offset > e->target_offset ||
+	   e->target_offset >= e->next_offset) {
+		BUGPRINT("entry offsets not in right order\n");
+		return -EINVAL;
+	}
+	/* this is not checked anywhere else */
+	if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) {
+		BUGPRINT("target size too small\n");
+		return -EINVAL;
+	}
+	(*cnt)++;
+	(*totalcnt)++;
+	return 0;
+}
+
+/*
+ * Bookkeeping for one user defined chain (udc) while a new table is being
+ * checked; filled by ebt_get_udc_positions() and check_chainloops().
+ */
+struct ebt_cl_stack
+{
+	/* chaininfo: the chain header; n and e: resume position in the
+	   calling chain while this chain is being walked (n == 0 when it
+	   is not on the current path) */
+	struct ebt_chainstack cs;
+	/* index of the udc this chain was entered from, -1 for a base chain */
+	int from;
+	/* one bit per hook whose base chain can reach this chain */
+	unsigned int hookmask;
+};
+
+/*
+ * we need these positions to check that the jumps to a different part of the
+ * entries is a jump to the beginning of a new chain.
+ *
+ * Iterator callback: when @e is the header of a user defined chain, record
+ * it in udc[*n] and advance *n.  Rules and base chains are skipped.
+ * Always returns 0.
+ */
+static inline int
+ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo,
+   unsigned int *n, struct ebt_cl_stack *udc)
+{
+	struct ebt_entries *chain = (struct ebt_entries *)e;
+	struct ebt_cl_stack *slot;
+	int hook;
+
+	/* we're only interested in chain starts */
+	if (e->bitmask)
+		return 0;
+
+	/* only care about udc: base chains are listed in hook_entry[] */
+	for (hook = 0; hook < NF_BR_NUMHOOKS; hook++)
+		if (newinfo->hook_entry[hook] == chain)
+			return 0;
+
+	slot = &udc[*n];
+	slot->cs.chaininfo = chain;
+	/* these initialisations are depended on later in check_chainloops() */
+	slot->cs.n = 0;
+	slot->hookmask = 0;
+
+	++*n;
+	return 0;
+}
+
+/*
+ * Release one match: call the extension's ->destroy callback if it has one
+ * and drop the module reference.
+ *
+ * @i, when not NULL, is a budget of matches still to be released.  It is
+ * decremented on every call; once it was already 0 nothing is released and
+ * 1 is returned.  Otherwise returns 0.
+ */
+static inline int
+ebt_cleanup_match(struct ebt_entry_match *m, struct net *net, unsigned int *i)
+{
+	struct xt_mtdtor_param par;
+	struct xt_match *ext;
+
+	if (i != NULL) {
+		unsigned int budget = (*i)--;
+
+		if (budget == 0)
+			return 1;
+	}
+
+	ext = m->u.match;
+	par.family    = NFPROTO_BRIDGE;
+	par.net       = net;
+	par.match     = ext;
+	par.matchinfo = m->data;
+	if (ext->destroy)
+		ext->destroy(&par);
+	module_put(ext->me);
+	return 0;
+}
+
+/*
+ * Release one watcher: call the extension's ->destroy callback if it has
+ * one and drop the module reference.
+ *
+ * @i, when not NULL, is a budget of watchers still to be released.  It is
+ * decremented on every call; once it was already 0 nothing is released and
+ * 1 is returned.  Otherwise returns 0.
+ */
+static inline int
+ebt_cleanup_watcher(struct ebt_entry_watcher *w, struct net *net, unsigned int *i)
+{
+	struct xt_tgdtor_param par;
+	struct xt_target *ext;
+
+	if (i != NULL) {
+		unsigned int budget = (*i)--;
+
+		if (budget == 0)
+			return 1;
+	}
+
+	ext = w->u.watcher;
+	par.family   = NFPROTO_BRIDGE;
+	par.net      = net;
+	par.target   = ext;
+	par.targinfo = w->data;
+	if (ext->destroy)
+		ext->destroy(&par);
+	module_put(ext->me);
+	return 0;
+}
+
+/*
+ * Release everything one rule holds: all of its watchers, all of its
+ * matches and finally its target.  Chain headers (bitmask == 0) are
+ * skipped.
+ *
+ * @cnt, when not NULL, is a budget of rules still to be released.  It is
+ * decremented for every rule; once it was already 0 nothing is released
+ * and 1 is returned.  Otherwise returns 0.
+ */
+static inline int
+ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
+{
+	struct ebt_entry_target *tgt;
+	struct xt_tgdtor_param par;
+
+	if (!e->bitmask)
+		return 0;
+	if (cnt != NULL) {
+		unsigned int budget = (*cnt)--;
+
+		/* we're done */
+		if (budget == 0)
+			return 1;
+	}
+
+	EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
+	EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
+
+	tgt = (struct ebt_entry_target *)((char *)e + e->target_offset);
+	par.family   = NFPROTO_BRIDGE;
+	par.net      = net;
+	par.target   = tgt->u.target;
+	par.targinfo = tgt->data;
+	if (par.target->destroy)
+		par.target->destroy(&par);
+	module_put(par.target->me);
+	return 0;
+}
+
+/*
+ * Iterator callback that fully validates one rule and binds its matches,
+ * watchers and target to their extensions.  Chain headers are skipped.
+ *
+ * @newinfo: table being checked, used to find the chain the rule is in
+ * @name:    table name handed to the extensions' check functions
+ * @cnt:     incremented for every rule that passed, so that the caller
+ *           knows how many rules need cleaning up after a failure
+ * @cl_s:    the user defined chains with their hook masks
+ * @udc_cnt: number of elements in @cl_s
+ *
+ * Returns 0 or a negative error code.  On error the matches and watchers
+ * of this rule that were already bound are released again.
+ */
+static inline int
+ebt_check_entry(struct ebt_entry *e, struct net *net,
+   const struct ebt_table_info *newinfo,
+   const char *name, unsigned int *cnt,
+   struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
+{
+	struct ebt_entry_target *t;
+	struct xt_target *target;
+	unsigned int i, j, hook = 0, hookmask = 0;
+	size_t gap;
+	int ret;
+	struct xt_mtchk_param mtpar;
+	struct xt_tgchk_param tgpar;
+
+	/* don't mess with the struct ebt_entries */
+	if (e->bitmask == 0)
+		return 0;
+
+	if (e->bitmask & ~EBT_F_MASK) {
+		BUGPRINT("Unknown flag for bitmask\n");
+		return -EINVAL;
+	}
+	if (e->invflags & ~EBT_INV_MASK) {
+		BUGPRINT("Unknown flag for inv bitmask\n");
+		return -EINVAL;
+	}
+	if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) {
+		BUGPRINT("NOPROTO & 802_3 not allowed\n");
+		return -EINVAL;
+	}
+	/* what hook do we belong to? */
+	/* hook ends up as the last base chain that starts before e */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		if (!newinfo->hook_entry[i])
+			continue;
+		if ((char *)newinfo->hook_entry[i] < (char *)e)
+			hook = i;
+		else
+			break;
+	}
+	/* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on
+	   a base chain */
+	if (i < NF_BR_NUMHOOKS)
+		hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS);
+	else {
+		/* e lies behind every base chain: find the first udc that
+		   starts after e, the one before it (if any) contains e */
+		for (i = 0; i < udc_cnt; i++)
+			if ((char *)(cl_s[i].cs.chaininfo) > (char *)e)
+				break;
+		if (i == 0)
+			hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS);
+		else
+			hookmask = cl_s[i - 1].hookmask;
+	}
+	/* from here on i counts the matches that were bound successfully */
+	i = 0;
+
+	mtpar.net	= tgpar.net       = net;
+	mtpar.table     = tgpar.table     = name;
+	mtpar.entryinfo = tgpar.entryinfo = e;
+	mtpar.hook_mask = tgpar.hook_mask = hookmask;
+	mtpar.family    = tgpar.family    = NFPROTO_BRIDGE;
+	ret = EBT_MATCH_ITERATE(e, ebt_check_match, &mtpar, &i);
+	if (ret != 0)
+		goto cleanup_matches;
+	/* j counts the watchers that were bound successfully */
+	j = 0;
+	ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
+	if (ret != 0)
+		goto cleanup_watchers;
+	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	/* bytes available for the target inside this rule */
+	gap = e->next_offset - e->target_offset;
+
+	target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
+	if (IS_ERR(target)) {
+		ret = PTR_ERR(target);
+		goto cleanup_watchers;
+	}
+
+	t->u.target = target;
+	if (t->u.target == &ebt_standard_target) {
+		/* NOTE(review): the condition rejects a gap that is too
+		   small, although the message says "too big" */
+		if (gap < sizeof(struct ebt_standard_target)) {
+			BUGPRINT("Standard target size too big\n");
+			ret = -EFAULT;
+			goto cleanup_watchers;
+		}
+		if (((struct ebt_standard_target *)t)->verdict <
+		   -NUM_STANDARD_TARGETS) {
+			BUGPRINT("Invalid standard target\n");
+			ret = -EFAULT;
+			goto cleanup_watchers;
+		}
+	} else if (t->target_size > gap - sizeof(struct ebt_entry_target)) {
+		module_put(t->u.target->me);
+		ret = -EFAULT;
+		goto cleanup_watchers;
+	}
+
+	tgpar.target   = target;
+	tgpar.targinfo = t->data;
+	ret = xt_check_target(&tgpar, t->target_size,
+	      e->ethproto, e->invflags & EBT_IPROTO);
+	if (ret < 0) {
+		module_put(target->me);
+		goto cleanup_watchers;
+	}
+	(*cnt)++;
+	return 0;
+	/* release only the first j watchers / first i matches */
+cleanup_watchers:
+	EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, &j);
+cleanup_matches:
+	EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, &i);
+	return ret;
+}
+
+/*
+ * checks for loops and sets the hook mask for udc
+ * the hook mask for udc tells us from which base chains the udc can be
+ * accessed. This mask is a parameter to the check() functions of the extensions
+ *
+ * Walks the base chain @chain of hook @hooknr and follows every jump made
+ * by a standard target, using @cl_s (@udc_cnt elements) as an explicit
+ * stack in place of recursion.  @base is the start of the entries blob;
+ * jump verdicts are byte offsets from it.
+ *
+ * Returns 0 on success and -1 when a jump does not lead to the start of a
+ * udc, when a loop is found or when a standard target does not fit in
+ * its rule.
+ */
+static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s,
+   unsigned int udc_cnt, unsigned int hooknr, char *base)
+{
+	/* chain_nr: index of the udc being walked, -1 while in the base chain */
+	int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict;
+	const struct ebt_entry *e = (struct ebt_entry *)chain->data;
+	const struct ebt_entry_target *t;
+
+	while (pos < nentries || chain_nr != -1) {
+		/* end of udc, go back one 'recursion' step */
+		if (pos == nentries) {
+			/* put back values of the time when this chain was called */
+			e = cl_s[chain_nr].cs.e;
+			if (cl_s[chain_nr].from != -1)
+				nentries =
+				cl_s[cl_s[chain_nr].from].cs.chaininfo->nentries;
+			else
+				nentries = chain->nentries;
+			pos = cl_s[chain_nr].cs.n;
+			/* make sure we won't see a loop that isn't one */
+			cl_s[chain_nr].cs.n = 0;
+			chain_nr = cl_s[chain_nr].from;
+			/* the caller may be at its end too: unwind further */
+			if (pos == nentries)
+				continue;
+		}
+		t = (struct ebt_entry_target *)
+		   (((char *)e) + e->target_offset);
+		/* targets are still identified by name at this stage; only
+		   the standard target can jump */
+		if (strcmp(t->u.name, EBT_STANDARD_TARGET))
+			goto letscontinue;
+		if (e->target_offset + sizeof(struct ebt_standard_target) >
+		   e->next_offset) {
+			BUGPRINT("Standard target size too big\n");
+			return -1;
+		}
+		verdict = ((struct ebt_standard_target *)t)->verdict;
+		if (verdict >= 0) { /* jump to another chain */
+			struct ebt_entries *hlp2 =
+			   (struct ebt_entries *)(base + verdict);
+			for (i = 0; i < udc_cnt; i++)
+				if (hlp2 == cl_s[i].cs.chaininfo)
+					break;
+			/* bad destination or loop */
+			if (i == udc_cnt) {
+				BUGPRINT("bad destination\n");
+				return -1;
+			}
+			/* a non-zero n means the chain is on the current path */
+			if (cl_s[i].cs.n) {
+				BUGPRINT("loop\n");
+				return -1;
+			}
+			/* already walked for this hook, no need to do it again */
+			if (cl_s[i].hookmask & (1 << hooknr))
+				goto letscontinue;
+			/* this can't be 0, so the loop test is correct */
+			cl_s[i].cs.n = pos + 1;
+			pos = 0;
+			cl_s[i].cs.e = ebt_next_entry(e);
+			e = (struct ebt_entry *)(hlp2->data);
+			nentries = hlp2->nentries;
+			cl_s[i].from = chain_nr;
+			chain_nr = i;
+			/* this udc is accessible from the base chain for hooknr */
+			cl_s[i].hookmask |= (1 << hooknr);
+			continue;
+		}
+letscontinue:
+		e = ebt_next_entry(e);
+		pos++;
+	}
+	return 0;
+}
+
+/*
+ * do the parsing of the table/chains/entries/matches/watchers/targets, heh
+ *
+ * Validates the ruleset in @newinfo (whose hook_entry[] must already be
+ * filled in) for the table called @name and binds every rule to its
+ * extensions.  Steps: check the order of the base chains, check sizes and
+ * counts of all elements, allocate the per-cpu chain stacks when user
+ * defined chains exist, check for loops, then check every rule.
+ *
+ * Returns 0 or a negative error code.  newinfo->chainstack may be left
+ * allocated on failure; the caller is expected to free it.
+ */
+static int translate_table(struct net *net, const char *name,
+			   struct ebt_table_info *newinfo)
+{
+	unsigned int i, j, k, udc_cnt;
+	int ret;
+	struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */
+
+	/* find the first hook that has a chain */
+	i = 0;
+	while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i])
+		i++;
+	if (i == NF_BR_NUMHOOKS) {
+		BUGPRINT("No valid hooks specified\n");
+		return -EINVAL;
+	}
+	if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) {
+		BUGPRINT("Chains don't start at beginning\n");
+		return -EINVAL;
+	}
+	/* make sure chains are ordered after each other in same order
+	   as their corresponding hooks */
+	for (j = i + 1; j < NF_BR_NUMHOOKS; j++) {
+		if (!newinfo->hook_entry[j])
+			continue;
+		if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) {
+			BUGPRINT("Hook order must be followed\n");
+			return -EINVAL;
+		}
+		i = j;
+	}
+
+	/* do some early checkings and initialize some things */
+	i = 0; /* holds the expected nr. of entries for the chain */
+	j = 0; /* holds the up to now counted entries for the chain */
+	k = 0; /* holds the total nr. of entries, should equal
+		  newinfo->nentries afterwards */
+	udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */
+	ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+	   ebt_check_entry_size_and_hooks, newinfo,
+	   &i, &j, &k, &udc_cnt);
+
+	if (ret != 0)
+		return ret;
+
+	/* the callback only compares counts when the next chain starts, so
+	   the last chain is checked here */
+	if (i != j) {
+		BUGPRINT("nentries does not equal the nr of entries in the "
+			 "(last) chain\n");
+		return -EINVAL;
+	}
+	if (k != newinfo->nentries) {
+		BUGPRINT("Total nentries is wrong\n");
+		return -EINVAL;
+	}
+
+	/* get the location of the udc, put them in an array
+	   while we're at it, allocate the chainstack */
+	if (udc_cnt) {
+		/* this will get free'd in do_replace()/ebt_register_table()
+		   if an error occurs */
+		newinfo->chainstack =
+			vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack)));
+		if (!newinfo->chainstack)
+			return -ENOMEM;
+		/* one stack of udc_cnt slots per possible cpu */
+		for_each_possible_cpu(i) {
+			newinfo->chainstack[i] =
+			  vmalloc(udc_cnt * sizeof(*(newinfo->chainstack[0])));
+			if (!newinfo->chainstack[i]) {
+				/* undo the stacks allocated so far */
+				while (i)
+					vfree(newinfo->chainstack[--i]);
+				vfree(newinfo->chainstack);
+				newinfo->chainstack = NULL;
+				return -ENOMEM;
+			}
+		}
+
+		cl_s = vmalloc(udc_cnt * sizeof(*cl_s));
+		if (!cl_s)
+			return -ENOMEM;
+		i = 0; /* the i'th udc */
+		EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+		   ebt_get_udc_positions, newinfo, &i, cl_s);
+		/* sanity check */
+		if (i != udc_cnt) {
+			BUGPRINT("i != udc_cnt\n");
+			vfree(cl_s);
+			return -EFAULT;
+		}
+	}
+
+	/* Check for loops */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++)
+		if (newinfo->hook_entry[i])
+			if (check_chainloops(newinfo->hook_entry[i],
+			   cl_s, udc_cnt, i, newinfo->entries)) {
+				vfree(cl_s);
+				return -EINVAL;
+			}
+
+	/* we now know the following (along with E=mc²):
+	   - the nr of entries in each chain is right
+	   - the size of the allocated space is right
+	   - all valid hooks have a corresponding chain
+	   - there are no loops
+	   - wrong data can still be on the level of a single entry
+	   - could be there are jumps to places that are not the
+	     beginning of a chain. This can only occur in chains that
+	     are not accessible from any base chains, so we don't care. */
+
+	/* used to know what we need to clean up if something goes wrong */
+	i = 0;
+	ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+	   ebt_check_entry, net, newinfo, name, &i, cl_s, udc_cnt);
+	if (ret != 0) {
+		/* release the i rules that were completely set up */
+		EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+				  ebt_cleanup_entry, net, &i);
+	}
+	vfree(cl_s);
+	return ret;
+}
+
+/*
+ * called under write_lock
+ *
+ * Sum the per-cpu counter blocks in @oldcounters into @counters, which
+ * must have room for @nentries counters.
+ */
+static void get_counters(const struct ebt_counter *oldcounters,
+   struct ebt_counter *counters, unsigned int nentries)
+{
+	unsigned int idx;
+	int cpu;
+
+	/* start from a copy of the first block (cpu 0) ... */
+	memcpy(counters, oldcounters,
+	       sizeof(struct ebt_counter) * nentries);
+
+	/* ... and add the block of every other possible cpu on top */
+	for_each_possible_cpu(cpu) {
+		const struct ebt_counter *percpu;
+
+		if (cpu == 0)
+			continue;
+		percpu = COUNTER_BASE(oldcounters, nentries, cpu);
+		for (idx = 0; idx < nentries; idx++) {
+			counters[idx].pcnt += percpu[idx].pcnt;
+			counters[idx].bcnt += percpu[idx].bcnt;
+		}
+	}
+}
+
+/*
+ * Second half of a table replacement: validate the ruleset in @newinfo,
+ * install it as the private data of the table named repl->name and release
+ * the ruleset it replaces.  When repl->num_counters is set, a snapshot of
+ * the old counters is copied to repl->counters.
+ *
+ * Returns 0 once the new ruleset is installed, a negative error code
+ * otherwise.  On error only the chain stacks of @newinfo are freed here;
+ * @newinfo itself and newinfo->entries stay with the caller.
+ *
+ * A non-zero return therefore means "@newinfo was not installed".  That is
+ * why a failing copy of the counters to userspace, which happens after the
+ * swap, is logged but not returned: reporting it would lead the caller to
+ * dispose of a ruleset that is already live.
+ */
+static int do_replace_finish(struct net *net, struct ebt_replace *repl,
+			      struct ebt_table_info *newinfo)
+{
+	int ret, i;
+	struct ebt_counter *counterstmp = NULL;
+	/* used to be able to unlock earlier */
+	struct ebt_table_info *table;
+	struct ebt_table *t;
+
+	/* the user wants counters back
+	   the check on the size is done later, when we have the lock */
+	if (repl->num_counters) {
+		unsigned long size = repl->num_counters * sizeof(*counterstmp);
+		counterstmp = vmalloc(size);
+		if (!counterstmp)
+			return -ENOMEM;
+	}
+
+	newinfo->chainstack = NULL;
+	ret = ebt_verify_pointers(repl, newinfo);
+	if (ret != 0)
+		goto free_counterstmp;
+
+	ret = translate_table(net, repl->name, newinfo);
+
+	if (ret != 0)
+		goto free_counterstmp;
+
+	/* on success this returns with ebt_mutex held */
+	t = find_table_lock(net, repl->name, &ret, &ebt_mutex);
+	if (!t) {
+		ret = -ENOENT;
+		goto free_iterate;
+	}
+
+	/* the table doesn't like it */
+	if (t->check && (ret = t->check(newinfo, repl->valid_hooks)))
+		goto free_unlock;
+
+	if (repl->num_counters && repl->num_counters != t->private->nentries) {
+		BUGPRINT("Wrong nr. of counters requested\n");
+		ret = -EINVAL;
+		goto free_unlock;
+	}
+
+	/* we have the mutex lock, so no danger in reading this pointer */
+	table = t->private;
+	/* make sure the table can only be rmmod'ed if it contains no rules */
+	if (!table->nentries && newinfo->nentries && !try_module_get(t->me)) {
+		ret = -ENOENT;
+		goto free_unlock;
+	} else if (table->nentries && !newinfo->nentries)
+		module_put(t->me);
+	/* we need an atomic snapshot of the counters */
+	write_lock_bh(&t->lock);
+	if (repl->num_counters)
+		get_counters(t->private->counters, counterstmp,
+		   t->private->nentries);
+
+	t->private = newinfo;
+	write_unlock_bh(&t->lock);
+	mutex_unlock(&ebt_mutex);
+	/* so, a user can change the chains while having messed up her counter
+	   allocation. Only reason why this is done is because this way the lock
+	   is held only once, while this doesn't bring the kernel into a
+	   dangerous state. */
+	if (repl->num_counters &&
+	   copy_to_user(repl->counters, counterstmp,
+	   repl->num_counters * sizeof(struct ebt_counter))) {
+		/* can't fail any more: the new table is already in place */
+		if (net_ratelimit())
+			pr_warn("counters copy to user failed while replacing table\n");
+	}
+
+	/* decrease module count and free resources */
+	EBT_ENTRY_ITERATE(table->entries, table->entries_size,
+			  ebt_cleanup_entry, net, NULL);
+
+	vfree(table->entries);
+	if (table->chainstack) {
+		for_each_possible_cpu(i)
+			vfree(table->chainstack[i]);
+		vfree(table->chainstack);
+	}
+	vfree(table);
+
+	vfree(counterstmp);
+	return 0;
+
+free_unlock:
+	mutex_unlock(&ebt_mutex);
+free_iterate:
+	/* drop the extension references taken by translate_table() */
+	EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
+			  ebt_cleanup_entry, net, NULL);
+free_counterstmp:
+	vfree(counterstmp);
+	/* can be initialized in translate_table() */
+	if (newinfo->chainstack) {
+		for_each_possible_cpu(i)
+			vfree(newinfo->chainstack[i]);
+		vfree(newinfo->chainstack);
+	}
+	return ret;
+}
+
+/*
+ * Replace a table with the ruleset described by the struct ebt_replace
+ * at @user; @len must cover that header plus the entries blob.
+ *
+ * Returns 0 on success or a negative errno.  The new table info is freed
+ * here whenever do_replace_finish() does not return 0.
+ */
+static int do_replace(struct net *net, const void __user *user,
+		      unsigned int len)
+{
+	struct ebt_replace tmp;
+	struct ebt_table_info *newinfo;
+	int countersize, ret;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	if (len != sizeof(tmp) + tmp.entries_size) {
+		BUGPRINT("Wrong len argument\n");
+		return -EINVAL;
+	}
+
+	if (!tmp.entries_size) {
+		BUGPRINT("Entries_size never zero\n");
+		return -EINVAL;
+	}
+
+	/* refuse counts whose allocation sizes would overflow */
+	if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
+			NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
+		return -ENOMEM;
+	if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
+		return -ENOMEM;
+
+	/* the name is used as a C string from here on */
+	tmp.name[sizeof(tmp.name) - 1] = 0;
+
+	countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
+	newinfo = vmalloc(sizeof(*newinfo) + countersize);
+	if (newinfo == NULL)
+		return -ENOMEM;
+
+	if (countersize)
+		memset(newinfo->counters, 0, countersize);
+
+	ret = -ENOMEM;
+	newinfo->entries = vmalloc(tmp.entries_size);
+	if (newinfo->entries == NULL)
+		goto free_newinfo;
+
+	ret = -EFAULT;
+	if (copy_from_user(newinfo->entries, tmp.entries, tmp.entries_size)) {
+		BUGPRINT("Couldn't copy entries from userspace\n");
+		goto free_entries;
+	}
+
+	ret = do_replace_finish(net, &tmp, newinfo);
+	if (!ret)
+		return 0;
+free_entries:
+	vfree(newinfo->entries);
+free_newinfo:
+	vfree(newinfo);
+	return ret;
+}
+
+/*
+ * Register a copy of @input_table in @net.
+ *
+ * The template ruleset in input_table->table is duplicated, translated and
+ * installed as the private table info of a kmemdup()ed struct ebt_table,
+ * which is then added to the bridge table list of @net.
+ *
+ * Returns the new table, or an ERR_PTR(): -EINVAL for bad template data or
+ * a failing ->check(), -ENOMEM, -EEXIST when the name is already in the
+ * list, -ENOENT when the owning module cannot be pinned, or the error
+ * reported by translate_table() / mutex_lock_interruptible().
+ */
+struct ebt_table *
+ebt_register_table(struct net *net, const struct ebt_table *input_table)
+{
+	struct ebt_table_info *newinfo;
+	struct ebt_table *t, *table;
+	struct ebt_replace_kernel *repl;
+	int ret, i, countersize;
+	void *p;
+
+	if (input_table == NULL || (repl = input_table->table) == NULL ||
+	    repl->entries == NULL || repl->entries_size == 0 ||
+	    repl->counters != NULL || input_table->private != NULL) {
+		BUGPRINT("Bad table data for ebt_register_table!!!\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Don't add one table to multiple lists. */
+	table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL);
+	if (!table) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids;
+	newinfo = vmalloc(sizeof(*newinfo) + countersize);
+	ret = -ENOMEM;
+	if (!newinfo)
+		goto free_table;
+
+	p = vmalloc(repl->entries_size);
+	if (!p)
+		goto free_newinfo;
+
+	memcpy(p, repl->entries, repl->entries_size);
+	newinfo->entries = p;
+
+	newinfo->entries_size = repl->entries_size;
+	newinfo->nentries = repl->nentries;
+
+	if (countersize)
+		memset(newinfo->counters, 0, countersize);
+
+	/* fill in newinfo and parse the entries */
+	newinfo->chainstack = NULL;
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		if ((repl->valid_hooks & (1 << i)) == 0)
+			newinfo->hook_entry[i] = NULL;
+		else
+			/* same offset as in the template, but inside our copy */
+			newinfo->hook_entry[i] = p +
+				((char *)repl->hook_entry[i] - repl->entries);
+	}
+	ret = translate_table(net, repl->name, newinfo);
+	if (ret != 0) {
+		BUGPRINT("Translate_table failed\n");
+		goto free_chainstack;
+	}
+
+	if (table->check && table->check(newinfo, table->valid_hooks)) {
+		BUGPRINT("The table doesn't like its own initial data, lol\n");
+		/* unwind like every other failure, do not leak the copies */
+		ret = -EINVAL;
+		goto free_chainstack;
+	}
+
+	table->private = newinfo;
+	rwlock_init(&table->lock);
+	ret = mutex_lock_interruptible(&ebt_mutex);
+	if (ret != 0)
+		goto free_chainstack;
+
+	list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
+		if (strcmp(t->name, table->name) == 0) {
+			ret = -EEXIST;
+			BUGPRINT("Table name already exists\n");
+			goto free_unlock;
+		}
+	}
+
+	/* Hold a reference count if the chains aren't empty */
+	if (newinfo->nentries && !try_module_get(table->me)) {
+		ret = -ENOENT;
+		goto free_unlock;
+	}
+	list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
+	mutex_unlock(&ebt_mutex);
+	return table;
+free_unlock:
+	mutex_unlock(&ebt_mutex);
+free_chainstack:
+	if (newinfo->chainstack) {
+		for_each_possible_cpu(i)
+			vfree(newinfo->chainstack[i]);
+		vfree(newinfo->chainstack);
+	}
+	vfree(newinfo->entries);
+free_newinfo:
+	vfree(newinfo);
+free_table:
+	kfree(table);
+out:
+	return ERR_PTR(ret);
+}
+
+/*
+ * Unlink @table from the table list (under ebt_mutex) and free the table
+ * together with its current table info.
+ */
+void ebt_unregister_table(struct net *net, struct ebt_table *table)
+{
+	struct ebt_table_info *info;
+	int cpu;
+
+	if (table == NULL) {
+		BUGPRINT("Request to unregister NULL table!!!\n");
+		return;
+	}
+
+	mutex_lock(&ebt_mutex);
+	list_del(&table->list);
+	mutex_unlock(&ebt_mutex);
+
+	info = table->private;
+	EBT_ENTRY_ITERATE(info->entries, info->entries_size,
+			  ebt_cleanup_entry, net, NULL);
+	/* drop the module reference that is held while rules exist */
+	if (info->nentries)
+		module_put(table->me);
+	vfree(info->entries);
+	if (info->chainstack) {
+		for_each_possible_cpu(cpu)
+			vfree(info->chainstack[cpu]);
+		vfree(info->chainstack);
+	}
+	vfree(info);
+	kfree(table);
+}
+
+/*
+ * Add the @num_counters counters found at @counters (userspace) to the
+ * counters of table @name.  @num_counters must equal the number of entries
+ * of the table.  @user and @len are not used here.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int do_update_counters(struct net *net, const char *name,
+				struct ebt_counter __user *counters,
+				unsigned int num_counters,
+				const void __user *user, unsigned int len)
+{
+	struct ebt_counter *tmp;
+	struct ebt_table *t;
+	int i, ret;
+
+	if (!num_counters)
+		return -EINVAL;
+
+	tmp = vmalloc(num_counters * sizeof(*tmp));
+	if (tmp == NULL)
+		return -ENOMEM;
+
+	/* on failure find_table_lock() has filled in ret */
+	t = find_table_lock(net, name, &ret, &ebt_mutex);
+	if (t == NULL)
+		goto free_tmp;
+
+	if (num_counters != t->private->nentries) {
+		BUGPRINT("Wrong nr of counters\n");
+		ret = -EINVAL;
+	} else if (copy_from_user(tmp, counters,
+				  num_counters * sizeof(*counters))) {
+		ret = -EFAULT;
+	} else {
+		/* we want an atomic add of the counters */
+		write_lock_bh(&t->lock);
+
+		/* everything is added to the counters of the first cpu */
+		for (i = 0; i < num_counters; i++) {
+			struct ebt_counter *dst = &t->private->counters[i];
+
+			dst->pcnt += tmp[i].pcnt;
+			dst->bcnt += tmp[i].bcnt;
+		}
+
+		write_unlock_bh(&t->lock);
+		ret = 0;
+	}
+
+	mutex_unlock(&ebt_mutex);
+free_tmp:
+	vfree(tmp);
+	return ret;
+}
+
+/*
+ * Read the struct ebt_replace header at @user, check that @len matches the
+ * header plus hlp.num_counters counters, and hand the request on to
+ * do_update_counters().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int update_counters(struct net *net, const void __user *user,
+			    unsigned int len)
+{
+	struct ebt_replace hlp;
+
+	if (copy_from_user(&hlp, user, sizeof(hlp)))
+		return -EFAULT;
+
+	if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
+		return -EINVAL;
+
+	/* the table lookup treats the name as a C string; userspace need not
+	   have terminated it */
+	hlp.name[sizeof(hlp.name) - 1] = '\0';
+
+	return do_update_counters(net, hlp.name, hlp.counters,
+				hlp.num_counters, user, len);
+}
+
+/*
+ * Write the name of the match behind @m to the spot in the userspace copy
+ * (@ubase) that corresponds to @m's position in the kernel blob (@base).
+ */
+static inline int ebt_make_matchname(const struct ebt_entry_match *m,
+    const char *base, char __user *ubase)
+{
+	char __user *dst = ubase + ((char *)m - base);
+
+	return copy_to_user(dst, m->u.match->name, EBT_FUNCTION_MAXNAMELEN) ?
+		-EFAULT : 0;
+}
+
+/*
+ * Write the name of the watcher behind @w to the spot in the userspace
+ * copy (@ubase) that corresponds to @w's position in the kernel blob
+ * (@base).
+ */
+static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
+    const char *base, char __user *ubase)
+{
+	char __user *dst = ubase + ((char *)w - base);
+
+	return copy_to_user(dst, w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN) ?
+		-EFAULT : 0;
+}
+
+/*
+ * Put the match, watcher and target names of entry @e into the userspace
+ * copy of the entries.  Entries with a zero bitmask are left alone.
+ *
+ * Returns 0, the first error of the match/watcher iteration, or -EFAULT.
+ */
+static inline int
+ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
+{
+	const struct ebt_entry_target *target;
+	char __user *utarget;
+	int err;
+
+	if (!e->bitmask)
+		return 0;
+
+	err = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase);
+	if (err)
+		return err;
+
+	err = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase);
+	if (err)
+		return err;
+
+	/* the target sits at target_offset, in both copies of the entry */
+	target = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	utarget = ubase + (((char *)e + e->target_offset) - base);
+	if (copy_to_user(utarget, target->u.target->name,
+			 EBT_FUNCTION_MAXNAMELEN))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Copy a summed snapshot of @oldcounters to @user.
+ *
+ * Nothing is done when @num_counters is 0; otherwise it has to equal
+ * @nentries.  The snapshot is taken with t->lock write-held.
+ *
+ * Returns 0, -EINVAL, -ENOMEM or -EFAULT.
+ */
+static int copy_counters_to_user(struct ebt_table *t,
+				  const struct ebt_counter *oldcounters,
+				  void __user *user, unsigned int num_counters,
+				  unsigned int nentries)
+{
+	struct ebt_counter *snapshot;
+	int err;
+
+	/* userspace might not need the counters */
+	if (!num_counters)
+		return 0;
+
+	if (num_counters != nentries) {
+		BUGPRINT("Num_counters wrong\n");
+		return -EINVAL;
+	}
+
+	snapshot = vmalloc(nentries * sizeof(*snapshot));
+	if (snapshot == NULL)
+		return -ENOMEM;
+
+	write_lock_bh(&t->lock);
+	get_counters(oldcounters, snapshot, nentries);
+	write_unlock_bh(&t->lock);
+
+	err = copy_to_user(user, snapshot,
+			   nentries * sizeof(struct ebt_counter)) ? -EFAULT : 0;
+	vfree(snapshot);
+	return err;
+}
+
+/*
+ * Copy counters (optional) and entries of table @t to userspace.
+ * Called with ebt_mutex locked.
+ *
+ * For EBT_SO_GET_ENTRIES the data comes from t->private, for any other
+ * @cmd from the template in t->table.  The struct ebt_replace at @user
+ * must describe exactly the sizes of that data.
+ */
+static int copy_everything_to_user(struct ebt_table *t, void __user *user,
+    const int *len, int cmd)
+{
+	const struct ebt_counter *oldcounters;
+	unsigned int entries_size, nentries;
+	struct ebt_replace tmp;
+	char *entries;
+	int ret;
+
+	if (cmd != EBT_SO_GET_ENTRIES) {
+		entries_size = t->table->entries_size;
+		nentries = t->table->nentries;
+		entries = t->table->entries;
+		oldcounters = t->table->counters;
+	} else {
+		entries_size = t->private->entries_size;
+		nentries = t->private->nentries;
+		entries = t->private->entries;
+		oldcounters = t->private->counters;
+	}
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	/* header + entries, plus the counters when they are requested */
+	if (*len != sizeof(struct ebt_replace) + entries_size +
+	   (tmp.num_counters? nentries * sizeof(struct ebt_counter): 0))
+		return -EINVAL;
+
+	if (tmp.nentries != nentries) {
+		BUGPRINT("Nentries wrong\n");
+		return -EINVAL;
+	}
+
+	if (tmp.entries_size != entries_size) {
+		BUGPRINT("Wrong size\n");
+		return -EINVAL;
+	}
+
+	ret = copy_counters_to_user(t, oldcounters, tmp.counters,
+					tmp.num_counters, nentries);
+	if (ret != 0)
+		return ret;
+
+	if (copy_to_user(tmp.entries, entries, entries_size)) {
+		BUGPRINT("Couldn't copy entries to userspace\n");
+		return -EFAULT;
+	}
+
+	/* set the match/watcher/target names right */
+	return EBT_ENTRY_ITERATE(entries, entries_size,
+	   ebt_make_names, entries, tmp.entries);
+}
+
+/*
+ * Dispatch the EBT_SO_SET_* commands.  Requires CAP_NET_ADMIN; unknown
+ * commands yield -EINVAL.
+ */
+static int do_ebt_set_ctl(struct sock *sk,
+	int cmd, void __user *user, unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == EBT_SO_SET_ENTRIES)
+		return do_replace(sock_net(sk), user, len);
+
+	if (cmd == EBT_SO_SET_COUNTERS)
+		return update_counters(sock_net(sk), user, len);
+
+	return -EINVAL;
+}
+
+/*
+ * Handle the EBT_SO_GET_* commands.  Requires CAP_NET_ADMIN.
+ *
+ * The struct ebt_replace at @user names the table.  The INFO commands
+ * fill in nentries, entries_size and valid_hooks and copy the structure
+ * back; the ENTRIES commands are handled by copy_everything_to_user().
+ * ebt_mutex is taken by find_table_lock() and dropped on every path.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	int ret;
+	struct ebt_replace tmp;
+	struct ebt_table *t;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	/* the table lookup treats the name as a C string; userspace need not
+	   have terminated it */
+	tmp.name[sizeof(tmp.name) - 1] = '\0';
+
+	t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex);
+	if (!t)
+		return ret;
+
+	switch(cmd) {
+	case EBT_SO_GET_INFO:
+	case EBT_SO_GET_INIT_INFO:
+		if (*len != sizeof(struct ebt_replace)){
+			ret = -EINVAL;
+			mutex_unlock(&ebt_mutex);
+			break;
+		}
+		if (cmd == EBT_SO_GET_INFO) {
+			/* the table as it is now */
+			tmp.nentries = t->private->nentries;
+			tmp.entries_size = t->private->entries_size;
+			tmp.valid_hooks = t->valid_hooks;
+		} else {
+			/* the template kept in t->table */
+			tmp.nentries = t->table->nentries;
+			tmp.entries_size = t->table->entries_size;
+			tmp.valid_hooks = t->table->valid_hooks;
+		}
+		mutex_unlock(&ebt_mutex);
+		if (copy_to_user(user, &tmp, *len) != 0){
+			BUGPRINT("c2u Didn't work\n");
+			ret = -EFAULT;
+			break;
+		}
+		ret = 0;
+		break;
+
+	case EBT_SO_GET_ENTRIES:
+	case EBT_SO_GET_INIT_ENTRIES:
+		ret = copy_everything_to_user(t, user, len, cmd);
+		mutex_unlock(&ebt_mutex);
+		break;
+
+	default:
+		mutex_unlock(&ebt_mutex);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/* 32 bit-userspace compatibility definitions. */
+/*
+ * struct ebt_replace as laid out by 32-bit userspace: pointers are
+ * compat_uptr_t and get converted with compat_ptr().  The members before
+ * hook_entry are copied 1:1 into a struct ebt_replace by
+ * compat_copy_ebt_replace_from_user().
+ */
+struct compat_ebt_replace {
+	/* name of the table */
+	char name[EBT_TABLE_MAXNAMELEN];
+	compat_uint_t valid_hooks;
+	/* number of entries */
+	compat_uint_t nentries;
+	/* size in bytes of the blob ->entries points to */
+	compat_uint_t entries_size;
+	/* start of the chains */
+	compat_uptr_t hook_entry[NF_BR_NUMHOOKS];
+	/* nr of counters userspace expects back */
+	compat_uint_t num_counters;
+	/* where the kernel will put the old counters. */
+	compat_uptr_t counters;
+	/* userspace address of the entries blob */
+	compat_uptr_t entries;
+};
+
+/*
+ * 32-bit userspace view of a match, watcher or target header.
+ * struct ebt_entry_match, _target and _watcher have same layout, so one
+ * structure serves all three.
+ */
+struct compat_ebt_entry_mwt {
+	union {
+		/* name of the extension, as exchanged with userspace */
+		char name[EBT_FUNCTION_MAXNAMELEN];
+		compat_uptr_t ptr;
+	} u;
+	/* size in bytes of ->data */
+	compat_uint_t match_size;
+	/* extension specific payload */
+	compat_uint_t data[0];
+};
+
+/*
+ * Size difference between the aligned native match header and the aligned
+ * compat header, i.e. the possible padding between match_size and ->data.
+ * The build fails if the native header were the smaller one.
+ */
+static int ebt_compat_entry_padsize(void)
+{
+	int pad;
+
+	BUILD_BUG_ON(XT_ALIGN(sizeof(struct ebt_entry_match)) <
+			COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt)));
+
+	pad = XT_ALIGN(sizeof(struct ebt_entry_match)) -
+		COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt));
+	return pad;
+}
+
+/*
+ * Number of bytes the data of @match grows by when converted from the
+ * compat to the native layout; @userlen is the size userspace supplied.
+ */
+static int ebt_compat_match_offset(const struct xt_match *match,
+				   unsigned int userlen)
+{
+	if (likely(match->matchsize != -1))
+		return xt_compat_match_offset(match);
+
+	/*
+	 * ebt_among is the special case: its kernel .matchsize is set to -1
+	 * at registration time, and at runtime an EBT_ALIGN()ed value is
+	 * expected.
+	 * Example: userspace sends 4500, ebt_among.c wants 4504.
+	 */
+	return XT_ALIGN(userlen) - COMPAT_XT_ALIGN(userlen);
+}
+
+/*
+ * Write match @m in compat layout to *@dstptr.
+ *
+ * *@dstptr is moved past the data written; *@size is reduced by the number
+ * of bytes the compat form is shorter than the native one.
+ *
+ * Returns 0 or -EFAULT.
+ */
+static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
+				unsigned int *size)
+{
+	const struct xt_match *match = m->u.match;
+	struct compat_ebt_entry_mwt __user *umatch = *dstptr;
+	int shrink = ebt_compat_match_offset(match, m->match_size);
+	compat_uint_t usize = m->match_size - shrink;
+
+	BUG_ON(shrink >= m->match_size);
+
+	if (copy_to_user(umatch->u.name, match->name, strlen(match->name) + 1))
+		return -EFAULT;
+	if (put_user(usize, &umatch->match_size))
+		return -EFAULT;
+
+	/* let the extension convert its data if it knows how to */
+	if (match->compat_to_user) {
+		if (match->compat_to_user(umatch->data, m->data))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(umatch->data, m->data, usize))
+			return -EFAULT;
+	}
+
+	*size -= ebt_compat_entry_padsize() + shrink;
+	*dstptr = umatch->data;
+	*dstptr += usize;
+	return 0;
+}
+
+/*
+ * Write target @t in compat layout to *@dstptr.
+ *
+ * *@dstptr is moved past the data written; *@size is reduced by the number
+ * of bytes the compat form is shorter than the native one.
+ *
+ * Returns 0 or -EFAULT.
+ */
+static int compat_target_to_user(struct ebt_entry_target *t,
+				 void __user **dstptr,
+				 unsigned int *size)
+{
+	const struct xt_target *target = t->u.target;
+	struct compat_ebt_entry_mwt __user *utarget = *dstptr;
+	int shrink = xt_compat_target_offset(target);
+	compat_uint_t usize = t->target_size - shrink;
+
+	BUG_ON(shrink >= t->target_size);
+
+	if (copy_to_user(utarget->u.name, target->name,
+			 strlen(target->name) + 1))
+		return -EFAULT;
+	if (put_user(usize, &utarget->match_size))
+		return -EFAULT;
+
+	/* let the extension convert its data if it knows how to */
+	if (target->compat_to_user) {
+		if (target->compat_to_user(utarget->data, t->data))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(utarget->data, t->data, usize))
+			return -EFAULT;
+	}
+
+	*size -= ebt_compat_entry_padsize() + shrink;
+	*dstptr = utarget->data;
+	*dstptr += usize;
+	return 0;
+}
+
+/* A watcher is written out exactly like a target. */
+static int compat_watcher_to_user(struct ebt_entry_watcher *w,
+				  void __user **dstptr,
+				  unsigned int *size)
+{
+	struct ebt_entry_target *as_target = (struct ebt_entry_target *)w;
+
+	return compat_target_to_user(as_target, dstptr, size);
+}
+
+/*
+ * Write one entry of the kernel blob to userspace in compat layout.
+ *
+ * @e:      entry (or, when e->bitmask is 0, a struct ebt_entries header)
+ * @dstptr: userspace write position, advanced past what was written
+ * @size:   space left in the userspace buffer, reduced as data is written
+ *
+ * Returns 0, -EINVAL when the remaining space is too small for the header,
+ * or -EFAULT.
+ */
+static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
+				unsigned int *size)
+{
+	struct ebt_entry_target *t;
+	struct ebt_entry __user *ce;
+	u32 watchers_offset, target_offset, next_offset;
+	compat_uint_t origsize;
+	int ret;
+
+	/* chain headers are copied without any conversion */
+	if (e->bitmask == 0) {
+		if (*size < sizeof(struct ebt_entries))
+			return -EINVAL;
+		if (copy_to_user(*dstptr, e, sizeof(struct ebt_entries)))
+			return -EFAULT;
+
+		*dstptr += sizeof(struct ebt_entries);
+		*size -= sizeof(struct ebt_entries);
+		return 0;
+	}
+
+	if (*size < sizeof(*ce))
+		return -EINVAL;
+
+	ce = (struct ebt_entry __user *)*dstptr;
+	if (copy_to_user(ce, e, sizeof(*ce)))
+		return -EFAULT;
+
+	/* origsize - *size is how much the entry has shrunk so far; every
+	   offset below is the kernel offset minus that amount */
+	origsize = *size;
+	*dstptr += sizeof(*ce);
+
+	ret = EBT_MATCH_ITERATE(e, compat_match_to_user, dstptr, size);
+	if (ret)
+		return ret;
+	watchers_offset = e->watchers_offset - (origsize - *size);
+
+	ret = EBT_WATCHER_ITERATE(e, compat_watcher_to_user, dstptr, size);
+	if (ret)
+		return ret;
+	target_offset = e->target_offset - (origsize - *size);
+
+	t = (struct ebt_entry_target *) ((char *) e + e->target_offset);
+
+	ret = compat_target_to_user(t, dstptr, size);
+	if (ret)
+		return ret;
+	next_offset = e->next_offset - (origsize - *size);
+
+	/* overwrite the offsets copied with the header above */
+	if (put_user(watchers_offset, &ce->watchers_offset) ||
+	    put_user(target_offset, &ce->target_offset) ||
+	    put_user(next_offset, &ce->next_offset))
+		return -EFAULT;
+
+	*size -= sizeof(*ce);
+	return 0;
+}
+
+/*
+ * Add to *@off what match @m loses when written in compat layout: the
+ * data size difference plus the header padding.  Always returns 0.
+ */
+static int compat_calc_match(struct ebt_entry_match *m, int *off)
+{
+	int delta = ebt_compat_match_offset(m->u.match, m->match_size);
+
+	*off += delta + ebt_compat_entry_padsize();
+	return 0;
+}
+
+/*
+ * Add to *@off what watcher @w loses when written in compat layout: the
+ * data size difference plus the header padding.  Always returns 0.
+ */
+static int compat_calc_watcher(struct ebt_entry_watcher *w, int *off)
+{
+	int delta = xt_compat_target_offset(w->u.watcher);
+
+	*off += delta + ebt_compat_entry_padsize();
+	return 0;
+}
+
+/*
+ * Work out how much entry @e shrinks in compat layout and account for it.
+ *
+ * @e:       entry inside the kernel blob; chain headers (bitmask 0) are
+ *           skipped
+ * @info:    table info the blob belongs to (hook_entry[] is consulted)
+ * @base:    start of the blob
+ * @newinfo: compat header being prepared; entries_size and hook_entry[]
+ *           are reduced here
+ *
+ * The delta is also recorded with xt_compat_add_offset(), keyed by the
+ * offset of @e from @base.  Returns 0 or the error of that call.
+ */
+static int compat_calc_entry(const struct ebt_entry *e,
+			     const struct ebt_table_info *info,
+			     const void *base,
+			     struct compat_ebt_replace *newinfo)
+{
+	const struct ebt_entry_target *t;
+	unsigned int entry_offset;
+	int off, ret, i;
+
+	if (e->bitmask == 0)
+		return 0;
+
+	off = 0;
+	entry_offset = (void *)e - base;
+
+	/* sum up the deltas of all matches, watchers and the target */
+	EBT_MATCH_ITERATE(e, compat_calc_match, &off);
+	EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
+
+	t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+
+	off += xt_compat_target_offset(t->u.target);
+	off += ebt_compat_entry_padsize();
+
+	newinfo->entries_size -= off;
+
+	ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, off);
+	if (ret)
+		return ret;
+
+	/* NOTE(review): "base - hookptr" is a pointer difference cast back
+	   to a pointer; presumably the intent is to adjust the hooks that
+	   start after this entry - confirm before relying on it */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		const void *hookptr = info->hook_entry[i];
+		if (info->hook_entry[i] &&
+		    (e < (struct ebt_entry *)(base - hookptr))) {
+			newinfo->hook_entry[i] -= off;
+			pr_debug("0x%08X -> 0x%08X\n",
+					newinfo->hook_entry[i] + off,
+					newinfo->hook_entry[i]);
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ * Prepare @newinfo for handing the table described by @info to a compat
+ * task: start from the native entries_size, reset the compat offset table
+ * and let compat_calc_entry() account for every entry.
+ *
+ * Returns 0 or the first error reported for an entry.
+ */
+static int compat_table_info(const struct ebt_table_info *info,
+			     struct compat_ebt_replace *newinfo)
+{
+	const void *blob = info->entries;
+	unsigned int blob_size = info->entries_size;
+
+	newinfo->entries_size = blob_size;
+	xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries);
+
+	return EBT_ENTRY_ITERATE(blob, blob_size, compat_calc_entry, info,
+				 blob, newinfo);
+}
+
+/*
+ * Compat counterpart of copy_everything_to_user(): copy counters
+ * (optional) and entries of table @t to a 32-bit task.
+ *
+ * For EBT_SO_GET_ENTRIES the data comes from t->private, for any other
+ * @cmd from the template in t->table.  *@len has to match the compat
+ * header, the entries size computed by compat_table_info() and, when
+ * requested, the counters.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int compat_copy_everything_to_user(struct ebt_table *t,
+					  void __user *user, int *len, int cmd)
+{
+	struct compat_ebt_replace repl, tmp;
+	struct ebt_counter *oldcounters;
+	struct ebt_table_info tinfo;
+	int ret;
+	void __user *pos;
+
+	memset(&tinfo, 0, sizeof(tinfo));
+
+	if (cmd == EBT_SO_GET_ENTRIES) {
+		tinfo.entries_size = t->private->entries_size;
+		tinfo.nentries = t->private->nentries;
+		tinfo.entries = t->private->entries;
+		oldcounters = t->private->counters;
+	} else {
+		tinfo.entries_size = t->table->entries_size;
+		tinfo.nentries = t->table->nentries;
+		tinfo.entries = t->table->entries;
+		oldcounters = t->table->counters;
+	}
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	if (tmp.nentries != tinfo.nentries ||
+	   (tmp.num_counters && tmp.num_counters != tinfo.nentries))
+		return -EINVAL;
+
+	/* repl is a scratch copy; only its entries_size is used below */
+	memcpy(&repl, &tmp, sizeof(repl));
+	if (cmd == EBT_SO_GET_ENTRIES)
+		ret = compat_table_info(t->private, &repl);
+	else
+		ret = compat_table_info(&tinfo, &repl);
+	if (ret)
+		return ret;
+
+	if (*len != sizeof(tmp) + repl.entries_size +
+	   (tmp.num_counters? tinfo.nentries * sizeof(struct ebt_counter): 0)) {
+		pr_err("wrong size: *len %d, entries_size %u, replsz %d\n",
+				*len, tinfo.entries_size, repl.entries_size);
+		return -EINVAL;
+	}
+
+	/* userspace might not need the counters */
+	ret = copy_counters_to_user(t, oldcounters, compat_ptr(tmp.counters),
+					tmp.num_counters, tinfo.nentries);
+	if (ret)
+		return ret;
+
+	/* tmp.entries_size (as supplied by userspace) is the space budget
+	   the per-entry copy works against */
+	pos = compat_ptr(tmp.entries);
+	return EBT_ENTRY_ITERATE(tinfo.entries, tinfo.entries_size,
+			compat_copy_entry_to_user, &pos, &tmp.entries_size);
+}
+
+/*
+ * Progress of translating a compat entries blob into the native layout.
+ * While buf_kern_start is NULL nothing is written and the helpers only
+ * count how much room the translated blob needs.
+ */
+struct ebt_entries_buf_state {
+	char *buf_kern_start;	/* kernel buffer to copy (translated) data to */
+	u32 buf_kern_len;	/* total size of kernel buffer */
+	u32 buf_kern_offset;	/* amount of data copied so far */
+	u32 buf_user_offset;	/* read position in userspace buffer */
+};
+
+/*
+ * Advance the kernel offset of @state by @sz bytes.
+ * Returns -EINVAL when the offset wrapped around, 0 otherwise.
+ */
+static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz)
+{
+	state->buf_kern_offset += sz;
+
+	/* after a wrap-around the sum is smaller than what was added */
+	if (state->buf_kern_offset < sz)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Append @sz bytes from @data to the kernel buffer, or only count them
+ * when @state has no buffer.  Both the user and the kernel offset advance.
+ */
+static int ebt_buf_add(struct ebt_entries_buf_state *state,
+		       void *data, unsigned int sz)
+{
+	if (state->buf_kern_start != NULL) {
+		BUG_ON(state->buf_kern_offset + sz > state->buf_kern_len);
+
+		memcpy(state->buf_kern_start + state->buf_kern_offset,
+		       data, sz);
+	}
+
+	state->buf_user_offset += sz;
+	return ebt_buf_count(state, sz);
+}
+
+/*
+ * Append @sz zero bytes of kernel-side padding, or only count them when
+ * @state has no buffer.
+ */
+static int ebt_buf_add_pad(struct ebt_entries_buf_state *state, unsigned int sz)
+{
+	char *kbuf = state->buf_kern_start;
+
+	if (kbuf != NULL) {
+		BUG_ON(state->buf_kern_offset > state->buf_kern_len);
+
+		if (sz != 0)
+			memset(kbuf + state->buf_kern_offset, 0, sz);
+	}
+
+	/* padding exists on the kernel side only: ->buf_user_offset stays */
+	return ebt_buf_count(state, sz);
+}
+
+/*
+ * Kind of extension being translated.  The order is significant:
+ * size_entry_mwt() passes its section index 0, 1, 2 as this type.
+ */
+enum compat_mwt {
+	EBT_COMPAT_MATCH,
+	EBT_COMPAT_WATCHER,
+	EBT_COMPAT_TARGET,
+};
+
+/*
+ * Translate the data of one compat match, watcher or target.
+ *
+ * @mwt:        compat header followed by the extension data
+ * @compat_mwt: which kind of extension @mwt is
+ * @state:      translation state; data is only written when it has a
+ *              kernel buffer, the offsets are advanced either way
+ * @base:       not used here
+ *
+ * The extension is looked up by name (its module is requested as
+ * "ebt_<name>" if needed) and the module reference is dropped again before
+ * returning.
+ *
+ * Returns the size of the data in the native layout (match_size plus the
+ * compat offset of the extension) or a negative errno.
+ *
+ * NOTE(review): the caller bounds mwt->match_size by the space left in the
+ * entry, but nothing visible here compares it with the size the extension
+ * expects - confirm compat_from_user() cannot run past either buffer.
+ */
+static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
+				enum compat_mwt compat_mwt,
+				struct ebt_entries_buf_state *state,
+				const unsigned char *base)
+{
+	char name[EBT_FUNCTION_MAXNAMELEN];
+	struct xt_match *match;
+	struct xt_target *wt;
+	void *dst = NULL;
+	int off, pad = 0;
+	unsigned int size_kern, match_size = mwt->match_size;
+
+	strlcpy(name, mwt->u.name, sizeof(name));
+
+	/* dst stays NULL while we are only counting */
+	if (state->buf_kern_start)
+		dst = state->buf_kern_start + state->buf_kern_offset;
+
+	switch (compat_mwt) {
+	case EBT_COMPAT_MATCH:
+		match = try_then_request_module(xt_find_match(NFPROTO_BRIDGE,
+						name, 0), "ebt_%s", name);
+		if (match == NULL)
+			return -ENOENT;
+		if (IS_ERR(match))
+			return PTR_ERR(match);
+
+		off = ebt_compat_match_offset(match, match_size);
+		if (dst) {
+			if (match->compat_from_user)
+				match->compat_from_user(dst, mwt->data);
+			else
+				memcpy(dst, mwt->data, match_size);
+		}
+
+		size_kern = match->matchsize;
+		/* matchsize -1: the size supplied by userspace is used */
+		if (unlikely(size_kern == -1))
+			size_kern = match_size;
+		module_put(match->me);
+		break;
+	case EBT_COMPAT_WATCHER: /* fallthrough */
+	case EBT_COMPAT_TARGET:
+		wt = try_then_request_module(xt_find_target(NFPROTO_BRIDGE,
+						name, 0), "ebt_%s", name);
+		if (wt == NULL)
+			return -ENOENT;
+		if (IS_ERR(wt))
+			return PTR_ERR(wt);
+		off = xt_compat_target_offset(wt);
+
+		if (dst) {
+			if (wt->compat_from_user)
+				wt->compat_from_user(dst, mwt->data);
+			else
+				memcpy(dst, mwt->data, match_size);
+		}
+
+		size_kern = wt->targetsize;
+		module_put(wt->me);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	state->buf_kern_offset += match_size + off;
+	state->buf_user_offset += match_size;
+	pad = XT_ALIGN(size_kern) - size_kern;
+
+	/* zero the alignment padding that follows the native data */
+	if (pad > 0 && dst) {
+		BUG_ON(state->buf_kern_len <= pad);
+		BUG_ON(state->buf_kern_offset - (match_size + off) + size_kern > state->buf_kern_len - pad);
+		memset(dst + size_kern, 0, pad);
+	}
+	return off + match_size;
+}
+
+/*
+ * Translate (or, without a kernel buffer, just size) a run of compat
+ * matches, watchers or a target.
+ *
+ * @match32:   first compat header of the run
+ * @size_left: number of bytes the run occupies in the compat blob
+ * @type:      kind of extension the run consists of
+ * @state:     translation state
+ * @base:      passed through to compat_mtw_from_user()
+ *
+ * return size of all matches, watchers or target, including necessary
+ * alignment and padding - more precisely the number of bytes the run
+ * grows by in the native layout - or a negative errno.
+ */
+static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
+			unsigned int size_left, enum compat_mwt type,
+			struct ebt_entries_buf_state *state, const void *base)
+{
+	int growth = 0;
+	char *buf;
+
+	if (size_left == 0)
+		return 0;
+
+	buf = (char *) match32;
+
+	while (size_left >= sizeof(*match32)) {
+		struct ebt_entry_match *match_kern;
+		int ret;
+
+		/* where the native header is about to be written; stays
+		   NULL while we are only counting */
+		match_kern = (struct ebt_entry_match *) state->buf_kern_start;
+		if (match_kern) {
+			char *tmp;
+			tmp = state->buf_kern_start + state->buf_kern_offset;
+			match_kern = (struct ebt_entry_match *) tmp;
+		}
+		ret = ebt_buf_add(state, buf, sizeof(*match32));
+		if (ret < 0)
+			return ret;
+		size_left -= sizeof(*match32);
+
+		/* add padding before match->data (if any) */
+		ret = ebt_buf_add_pad(state, ebt_compat_entry_padsize());
+		if (ret < 0)
+			return ret;
+
+		/* the data must fit into what is left of the run */
+		if (match32->match_size > size_left)
+			return -EINVAL;
+
+		size_left -= match32->match_size;
+
+		ret = compat_mtw_from_user(match32, type, state, base);
+		if (ret < 0)
+			return ret;
+
+		BUG_ON(ret < match32->match_size);
+		growth += ret - match32->match_size;
+		growth += ebt_compat_entry_padsize();
+
+		buf += sizeof(*match32);
+		buf += match32->match_size;
+
+		/* store the native data size in the translated header */
+		if (match_kern)
+			match_kern->match_size = ret;
+
+		WARN_ON(type == EBT_COMPAT_TARGET && size_left);
+		match32 = (struct compat_ebt_entry_mwt *) buf;
+	}
+
+	return growth;
+}
+
+/*
+ * Translate (or just size) one element of the compat entries blob.
+ * called for all ebt_entry structures.
+ *
+ * @entry: element in the compat blob; a zero bitmask marks a struct
+ *         ebt_entries chain header, which is taken over unchanged
+ * @base:  start of the compat blob
+ * @total: bytes of the compat blob not consumed yet, reduced here
+ * @state: translation state
+ *
+ * While @state has no kernel buffer, the growth of the entry is recorded
+ * with xt_compat_add_offset().  Returns 0 or a negative errno.
+ */
+static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
+			  unsigned int *total,
+			  struct ebt_entries_buf_state *state)
+{
+	unsigned int i, j, startoff, new_offset = 0;
+	/* stores match/watchers/targets & offset of next struct ebt_entry: */
+	unsigned int offsets[4];
+	unsigned int *offsets_update = NULL;
+	int ret;
+	char *buf_start;
+
+	if (*total < sizeof(struct ebt_entries))
+		return -EINVAL;
+
+	if (!entry->bitmask) {
+		*total -= sizeof(struct ebt_entries);
+		return ebt_buf_add(state, entry, sizeof(struct ebt_entries));
+	}
+	if (*total < sizeof(*entry) || entry->next_offset < sizeof(*entry))
+		return -EINVAL;
+
+	startoff = state->buf_user_offset;
+	/* pull in most part of ebt_entry, it does not need to be changed. */
+	ret = ebt_buf_add(state, entry,
+			offsetof(struct ebt_entry, watchers_offset));
+	if (ret < 0)
+		return ret;
+
+	offsets[0] = sizeof(struct ebt_entry); /* matches come first */
+	memcpy(&offsets[1], &entry->watchers_offset,
+			sizeof(offsets) - sizeof(offsets[0]));
+
+	/* remember where the three offsets land in the kernel buffer so
+	   that they can be corrected in the loop below */
+	if (state->buf_kern_start) {
+		buf_start = state->buf_kern_start + state->buf_kern_offset;
+		offsets_update = (unsigned int *) buf_start;
+	}
+	ret = ebt_buf_add(state, &offsets[1],
+			sizeof(offsets) - sizeof(offsets[0]));
+	if (ret < 0)
+		return ret;
+	buf_start = (char *) entry;
+	/*
+	 * 0: matches offset, always follows ebt_entry.
+	 * 1: watchers offset, from ebt_entry structure
+	 * 2: target offset, from ebt_entry structure
+	 * 3: next ebt_entry offset, from ebt_entry structure
+	 *
+	 * offsets are relative to beginning of struct ebt_entry (i.e., 0).
+	 */
+	for (i = 0, j = 1 ; j < 4 ; j++, i++) {
+		struct compat_ebt_entry_mwt *match32;
+		unsigned int size;
+		char *buf = buf_start;
+
+		buf = buf_start + offsets[i];
+		if (offsets[i] > offsets[j])
+			return -EINVAL;
+
+		/* section i spans offsets[i]..offsets[j]; i doubles as the
+		   enum compat_mwt type of the section */
+		match32 = (struct compat_ebt_entry_mwt *) buf;
+		size = offsets[j] - offsets[i];
+		ret = ebt_size_mwt(match32, size, i, state, base);
+		if (ret < 0)
+			return ret;
+		new_offset += ret;
+		if (offsets_update && new_offset) {
+			pr_debug("change offset %d to %d\n",
+				offsets_update[i], offsets[j] + new_offset);
+			offsets_update[i] = offsets[j] + new_offset;
+		}
+	}
+
+	/* counting pass: record how much this entry grows */
+	if (state->buf_kern_start == NULL) {
+		unsigned int offset = buf_start - (char *) base;
+
+		ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* number of compat bytes this entry consumed */
+	startoff = state->buf_user_offset - startoff;
+
+	BUG_ON(*total < startoff);
+	*total -= startoff;
+	return 0;
+}
+
+/*
+ * Run size_entry_mwt() over the whole compat entries blob.
+ *
+ * repl->entries_size is the size of the ebt_entry blob as 32-bit userspace
+ * laid it out; the blob might need more memory on a 64 bit kernel.  The
+ * first task is therefore to find out how much memory is needed: with a
+ * @state that has no kernel buffer this call only counts.
+ *
+ * Called before validation is performed.
+ *
+ * Returns the kernel-side size reached (state->buf_kern_offset) or a
+ * negative errno.
+ */
+static int compat_copy_entries(unsigned char *data, unsigned int size_user,
+				struct ebt_entries_buf_state *state)
+{
+	unsigned int left = size_user;
+	int err;
+
+	err = EBT_ENTRY_ITERATE(data, size_user, size_entry_mwt, data,
+				&left, state);
+	if (err < 0)
+		return err;
+
+	/* the entries are expected to consume the blob completely */
+	WARN_ON(left);
+	return state->buf_kern_offset;
+}
+
+
+/*
+ * Read a struct compat_ebt_replace from @user and convert it into the
+ * native struct ebt_replace @repl.
+ *
+ * @len has to be the size of the compat header plus its entries_size.
+ * The same size limits as in do_replace() are applied and, as there, the
+ * table name is NUL-terminated before anybody uses it as a string.
+ *
+ * Returns 0, -EINVAL, -EFAULT or -ENOMEM.
+ */
+static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
+					    void __user *user, unsigned int len)
+{
+	struct compat_ebt_replace tmp;
+	int i;
+
+	if (len < sizeof(tmp))
+		return -EINVAL;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	if (len != sizeof(tmp) + tmp.entries_size)
+		return -EINVAL;
+
+	if (tmp.entries_size == 0)
+		return -EINVAL;
+
+	/* overflow check */
+	if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
+			NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
+		return -ENOMEM;
+	if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
+		return -ENOMEM;
+
+	/* everything in front of hook_entry has the same layout */
+	memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry));
+
+	/* userspace need not have terminated the name */
+	repl->name[sizeof(repl->name) - 1] = '\0';
+
+	/* starting with hook_entry, 32 vs. 64 bit structures are different */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++)
+		repl->hook_entry[i] = compat_ptr(tmp.hook_entry[i]);
+
+	repl->num_counters = tmp.num_counters;
+	repl->counters = compat_ptr(tmp.counters);
+	repl->entries = compat_ptr(tmp.entries);
+	return 0;
+}
+
+/*
+ * Compat version of do_replace(): replace a table with a ruleset supplied
+ * in 32-bit layout.
+ *
+ * The compat blob is copied in, walked once to find out how large its
+ * native form is, then walked a second time to translate it into a buffer
+ * of that size.  The hook entry pointers are shifted accordingly and the
+ * result is handed to do_replace_finish().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int compat_do_replace(struct net *net, void __user *user,
+			     unsigned int len)
+{
+	int ret, i, countersize, size64;
+	struct ebt_table_info *newinfo;
+	struct ebt_replace tmp;
+	struct ebt_entries_buf_state state;
+	void *entries_tmp;
+
+	ret = compat_copy_ebt_replace_from_user(&tmp, user, len);
+	if (ret) {
+		/* try real handler in case userland supplied needed padding */
+		if (ret == -EINVAL && do_replace(net, user, len) == 0)
+			ret = 0;
+		return ret;
+	}
+
+	countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
+	newinfo = vmalloc(sizeof(*newinfo) + countersize);
+	if (!newinfo)
+		return -ENOMEM;
+
+	if (countersize)
+		memset(newinfo->counters, 0, countersize);
+
+	memset(&state, 0, sizeof(state));
+
+	/* for now ->entries holds the compat blob as userspace sent it */
+	newinfo->entries = vmalloc(tmp.entries_size);
+	if (!newinfo->entries) {
+		ret = -ENOMEM;
+		goto free_newinfo;
+	}
+	if (copy_from_user(
+	   newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
+		ret = -EFAULT;
+		goto free_entries;
+	}
+
+	entries_tmp = newinfo->entries;
+
+	xt_compat_lock(NFPROTO_BRIDGE);
+
+	/* first pass: state has no kernel buffer, so this only counts */
+	xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+	ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
+	if (ret < 0)
+		goto out_unlock;
+
+	pr_debug("tmp.entries_size %d, kern off %d, user off %d delta %d\n",
+		tmp.entries_size, state.buf_kern_offset, state.buf_user_offset,
+		xt_compat_calc_jump(NFPROTO_BRIDGE, tmp.entries_size));
+
+	size64 = ret;
+	newinfo->entries = vmalloc(size64);
+	if (!newinfo->entries) {
+		/* ->entries is NULL now, so free_entries is harmless */
+		vfree(entries_tmp);
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/* second pass: translate into the native buffer */
+	memset(&state, 0, sizeof(state));
+	state.buf_kern_start = newinfo->entries;
+	state.buf_kern_len = size64;
+
+	ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
+	BUG_ON(ret < 0);	/* parses same data again */
+
+	vfree(entries_tmp);
+	tmp.entries_size = size64;
+
+	/* move every hook entry by the growth of the entries before it */
+	for (i = 0; i < NF_BR_NUMHOOKS; i++) {
+		char __user *usrptr;
+		if (tmp.hook_entry[i]) {
+			unsigned int delta;
+			usrptr = (char __user *) tmp.hook_entry[i];
+			delta = usrptr - tmp.entries;
+			usrptr += xt_compat_calc_jump(NFPROTO_BRIDGE, delta);
+			tmp.hook_entry[i] = (struct ebt_entries __user *)usrptr;
+		}
+	}
+
+	xt_compat_flush_offsets(NFPROTO_BRIDGE);
+	xt_compat_unlock(NFPROTO_BRIDGE);
+
+	ret = do_replace_finish(net, &tmp, newinfo);
+	if (ret == 0)
+		return ret;
+free_entries:
+	vfree(newinfo->entries);
+free_newinfo:
+	vfree(newinfo);
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(NFPROTO_BRIDGE);
+	xt_compat_unlock(NFPROTO_BRIDGE);
+	goto free_entries;
+}
+
+/*
+ * Handle EBT_SO_SET_COUNTERS for a 32-bit caller.
+ *
+ * The compat path is taken only when @len is exactly the compat header
+ * plus num_counters counters; any other length is passed to the native
+ * update_counters() handler.
+ *
+ * Returns -EFAULT if the header cannot be read, otherwise the result of
+ * the handler that was chosen.
+ */
+static int compat_update_counters(struct net *net, void __user *user,
+				  unsigned int len)
+{
+	struct compat_ebt_replace repl;
+
+	if (copy_from_user(&repl, user, sizeof(repl)) != 0)
+		return -EFAULT;
+
+	if (len == sizeof(repl) +
+		   repl.num_counters * sizeof(struct ebt_counter))
+		return do_update_counters(net, repl.name,
+					  compat_ptr(repl.counters),
+					  repl.num_counters, user, len);
+
+	/* try real handler in case userland supplied needed padding */
+	return update_counters(net, user, len);
+}
+
+/*
+ * Compat setsockopt dispatcher for ebtables.
+ *
+ * Requires CAP_NET_ADMIN (-EPERM otherwise). Unknown commands yield
+ * -EINVAL; known commands return whatever their handler returns.
+ */
+static int compat_do_ebt_set_ctl(struct sock *sk,
+		int cmd, void __user *user, unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == EBT_SO_SET_ENTRIES)
+		return compat_do_replace(sock_net(sk), user, len);
+
+	if (cmd == EBT_SO_SET_COUNTERS)
+		return compat_update_counters(sock_net(sk), user, len);
+
+	return -EINVAL;
+}
+
+static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
+		void __user *user, int *len)
+{
+	int ret;
+	struct compat_ebt_replace tmp;
+	struct ebt_table *t;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* try real handler in case userland supplied needed padding */
+	if ((cmd == EBT_SO_GET_INFO ||
+	     cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(tmp))
+			return do_ebt_get_ctl(sk, cmd, user, len);
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex);
+	if (!t)
+		return ret;
+
+	xt_compat_lock(NFPROTO_BRIDGE);
+	switch (cmd) {
+	case EBT_SO_GET_INFO:
+		tmp.nentries = t->private->nentries;
+		ret = compat_table_info(t->private, &tmp);
+		if (ret)
+			goto out;
+		tmp.valid_hooks = t->valid_hooks;
+
+		if (copy_to_user(user, &tmp, *len) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = 0;
+		break;
+	case EBT_SO_GET_INIT_INFO:
+		tmp.nentries = t->table->nentries;
+		tmp.entries_size = t->table->entries_size;
+		tmp.valid_hooks = t->table->valid_hooks;
+
+		if (copy_to_user(user, &tmp, *len) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = 0;
+		break;
+	case EBT_SO_GET_ENTRIES:
+	case EBT_SO_GET_INIT_ENTRIES:
+		/*
+		 * try real handler first in case of userland-side padding.
+		 * in case we are dealing with an 'ordinary' 32 bit binary
+		 * without 64bit compatibility padding, this will fail right
+		 * after copy_from_user when the *len argument is validated.
+		 *
+		 * the compat_ variant needs to do one pass over the kernel
+		 * data set to adjust for size differences before it the check.
+		 */
+		if (copy_everything_to_user(t, user, len, cmd) == 0)
+			ret = 0;
+		else
+			ret = compat_copy_everything_to_user(t, user, len, cmd);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+ out:
+	xt_compat_flush_offsets(NFPROTO_BRIDGE);
+	xt_compat_unlock(NFPROTO_BRIDGE);
+	mutex_unlock(&ebt_mutex);
+	return ret;
+}
+#endif
+
+/*
+ * sockopt registration for ebtables: native set/get handlers, plus the
+ * compat variants when CONFIG_COMPAT is enabled.
+ */
+static struct nf_sockopt_ops ebt_sockopts = {
+	.pf		= PF_INET,
+	.owner		= THIS_MODULE,
+
+	.set_optmin	= EBT_BASE_CTL,
+	.set_optmax	= EBT_SO_SET_MAX + 1,
+	.set		= do_ebt_set_ctl,
+
+	.get_optmin	= EBT_BASE_CTL,
+	.get_optmax	= EBT_SO_GET_MAX + 1,
+	.get		= do_ebt_get_ctl,
+
+#ifdef CONFIG_COMPAT
+	.compat_set	= compat_do_ebt_set_ctl,
+	.compat_get	= compat_do_ebt_get_ctl,
+#endif
+};
+
+/*
+ * Module init: register the standard target, then the sockopt interface.
+ * If the second step fails the first one is undone.
+ */
+static int __init ebtables_init(void)
+{
+	int err;
+
+	err = xt_register_target(&ebt_standard_target);
+	if (err < 0)
+		return err;
+
+	err = nf_register_sockopt(&ebt_sockopts);
+	if (err < 0)
+		goto err_unregister_target;
+
+	printk(KERN_INFO "Ebtables v2.0 registered\n");
+	return 0;
+
+err_unregister_target:
+	xt_unregister_target(&ebt_standard_target);
+	return err;
+}
+
+/*
+ * Module exit: undo ebtables_init() in reverse order (sockopt interface
+ * first, then the standard target).
+ */
+static void __exit ebtables_fini(void)
+{
+	nf_unregister_sockopt(&ebt_sockopts);
+	xt_unregister_target(&ebt_standard_target);
+	printk(KERN_INFO "Ebtables v2.0 unregistered\n");
+}
+
+/* Symbols exported to other modules, and module entry/exit registration. */
+EXPORT_SYMBOL(ebt_register_table);
+EXPORT_SYMBOL(ebt_unregister_table);
+EXPORT_SYMBOL(ebt_do_table);
+module_init(ebtables_init);
+module_exit(ebtables_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/caif/Kconfig b/net/caif/Kconfig
new file mode 100644
index 00000000..529750da
--- /dev/null
+++ b/net/caif/Kconfig
@@ -0,0 +1,42 @@
+#
+# CAIF net configurations
+#
+
+menuconfig CAIF
+	tristate "CAIF support"
+	select CRC_CCITT
+	default n
+	---help---
+	The "Communication CPU to Application CPU Interface" (CAIF) is a packet
+	based connection-oriented MUX protocol developed by ST-Ericsson for use
+	with its modems. It is accessed from user space as sockets (PF_CAIF).
+
+	Say Y (or M) here if you build for a phone product (e.g. Android or
+	MeeGo) that uses CAIF as transport. If unsure, say N.
+
+	If you choose to build it as a module, then CAIF_NETDEV also needs to
+	be built as a module. You will also need to say yes to any CAIF
+	physical devices that your platform requires.
+
+	See Documentation/networking/caif for a further explanation on how to
+	use and configure CAIF.
+
+config  CAIF_DEBUG
+	bool "Enable Debug"
+	depends on CAIF
+	default n
+	--- help ---
+	Enable the inclusion of debug code in the CAIF stack.
+	Be aware that doing this will impact performance.
+	If unsure say N.
+
+config CAIF_NETDEV
+	tristate "CAIF GPRS Network device"
+	depends on CAIF
+	default CAIF
+	---help---
+	Say Y if you will be using a CAIF based GPRS network device.
+	This can be either built-in or a loadable module.
+	If you choose to build it as a built-in, then the main CAIF device
+	must also be built-in.
+	If unsure, say Y.
diff --git a/net/caif/Makefile b/net/caif/Makefile
new file mode 100644
index 00000000..ebcd4e7e
--- /dev/null
+++ b/net/caif/Makefile
@@ -0,0 +1,14 @@
+ccflags-$(CONFIG_CAIF_DEBUG)     :=      -DDEBUG
+
+caif-y := caif_dev.o \
+	cfcnfg.o cfmuxl.o cfctrl.o  \
+	cffrml.o cfveil.o cfdbgl.o\
+	cfserl.o cfdgml.o  \
+	cfrfml.o cfvidl.o cfutill.o \
+	cfsrvl.o cfpkt_skbuff.o
+
+obj-$(CONFIG_CAIF) += caif.o
+obj-$(CONFIG_CAIF_NETDEV) += chnl_net.o
+obj-$(CONFIG_CAIF) += caif_socket.o
+
+export-y := caif.o
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
new file mode 100644
index 00000000..5ba4366a
--- /dev/null
+++ b/net/caif/caif_dev.c
@@ -0,0 +1,433 @@
+/*
+ * CAIF Interface registration.
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ *
+ * Borrowed heavily from file: pn_dev.c. Thanks to
+ *  Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *  and Sakari Ailus <sakari.ailus@nokia.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <net/netns/generic.h>
+#include <net/net_namespace.h>
+#include <net/pkt_sched.h>
+#include <net/caif/caif_device.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+
+MODULE_LICENSE("GPL");
+
+/*
+ * Used for local tracking of the CAIF net devices.
+ * One entry is allocated per net device by caif_device_alloc().
+ */
+struct caif_device_entry {
+	struct cflayer layer;		/* recovered via container_of() in transmit() */
+	struct list_head list;		/* link in caif_device_entry_list.list */
+	struct net_device *netdev;	/* device; held with dev_hold() at alloc */
+	int __percpu *pcpu_refcnt;	/* per-CPU count, see caifd_hold()/caifd_put() */
+};
+
+/* List head for the caif_device_entry objects of one namespace. */
+struct caif_device_entry_list {
+	struct list_head list;
+	/* Protects against simultaneous deletes in the list */
+	struct mutex lock;
+};
+
+/* Per network namespace CAIF state, obtained with net_generic(). */
+struct caif_net {
+	struct cfcnfg *cfg;			/* created in caif_init_net() */
+	struct caif_device_entry_list caifdevs;	/* CAIF devices of this namespace */
+};
+
+/* Key passed to net_generic() to find struct caif_net; set via caif_net_ops.id */
+static int caif_net_id;
+
+/* Return the CAIF configuration object of namespace @net (must not be NULL). */
+struct cfcnfg *get_cfcnfg(struct net *net)
+{
+	struct caif_net *ns_state;
+
+	BUG_ON(!net);
+
+	ns_state = net_generic(net, caif_net_id);
+	return ns_state->cfg;
+}
+EXPORT_SYMBOL(get_cfcnfg);
+
+/* Return the CAIF device list of namespace @net (must not be NULL). */
+static struct caif_device_entry_list *caif_device_list(struct net *net)
+{
+	struct caif_net *ns_state;
+
+	BUG_ON(!net);
+
+	ns_state = net_generic(net, caif_net_id);
+	return &ns_state->caifdevs;
+}
+
+/* Drop one reference on @e by decrementing this CPU's counter. */
+static void caifd_put(struct caif_device_entry *e)
+{
+	irqsafe_cpu_dec(*e->pcpu_refcnt);
+}
+
+/* Take one reference on @e by incrementing this CPU's counter. */
+static void caifd_hold(struct caif_device_entry *e)
+{
+	irqsafe_cpu_inc(*e->pcpu_refcnt);
+}
+
+/* Sum the per-CPU reference counters of @e over all possible CPUs. */
+static int caifd_refcnt_read(struct caif_device_entry *e)
+{
+	int cpu;
+	int total = 0;
+
+	for_each_possible_cpu(cpu) {
+		int *slot = per_cpu_ptr(e->pcpu_refcnt, cpu);
+
+		total += *slot;
+	}
+
+	return total;
+}
+
+/*
+ * Allocate a new CAIF device entry for @dev.
+ *
+ * On success the entry is zero-initialised apart from its per-CPU
+ * reference counter and netdev pointer, and a reference on @dev has been
+ * taken with dev_hold(). Returns NULL if either allocation fails; in that
+ * case nothing is leaked and no reference on @dev is held.
+ */
+static struct caif_device_entry *caif_device_alloc(struct net_device *dev)
+{
+	struct caif_device_entry *caifd;
+
+	caifd = kzalloc(sizeof(*caifd), GFP_ATOMIC);
+	if (!caifd)
+		return NULL;
+
+	caifd->pcpu_refcnt = alloc_percpu(int);
+	if (!caifd->pcpu_refcnt) {
+		/* caifd_hold()/caifd_put() would dereference a NULL counter */
+		kfree(caifd);
+		return NULL;
+	}
+
+	caifd->netdev = dev;
+	dev_hold(dev);
+	return caifd;
+}
+
+/*
+ * Find the CAIF device entry that wraps @dev in @dev's namespace.
+ * Returns NULL when there is none. The list is walked with the RCU
+ * list iterator.
+ */
+static struct caif_device_entry *caif_get(struct net_device *dev)
+{
+	struct caif_device_entry_list *devlist;
+	struct caif_device_entry *entry;
+	struct caif_device_entry *match = NULL;
+
+	devlist = caif_device_list(dev_net(dev));
+
+	list_for_each_entry_rcu(entry, &devlist->list, list) {
+		if (entry->netdev != dev)
+			continue;
+		match = entry;
+		break;
+	}
+
+	return match;
+}
+
+/*
+ * Layer transmit hook: hand @pkt to the net device behind @layer.
+ * A positive result from dev_queue_xmit() is reported as -EIO; zero and
+ * negative results are passed through.
+ */
+static int transmit(struct cflayer *layer, struct cfpkt *pkt)
+{
+	struct caif_device_entry *entry;
+	struct sk_buff *skb;
+	int rc;
+
+	entry = container_of(layer, struct caif_device_entry, layer);
+
+	skb = cfpkt_tonative(pkt);
+	skb->dev = entry->netdev;
+
+	rc = dev_queue_xmit(skb);
+	return (rc > 0) ? -EIO : rc;
+}
+
+/*
+ * Packet-type handler: push a received skb into the CAIF stack.
+ *
+ * The skb is dropped (NET_RX_DROP) when the device is not known to CAIF,
+ * has no upper layer with a receive hook, or is not operationally up.
+ * Otherwise the packet is passed upwards and 0 is returned.
+ */
+static int receive(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pkttype, struct net_device *orig_dev)
+{
+	struct caif_device_entry *caifd;
+	struct cfpkt *pkt;
+	int rc;
+
+	pkt = cfpkt_fromnative(CAIF_DIR_IN, skb);
+
+	rcu_read_lock();
+	caifd = caif_get(dev);
+
+	if (!caifd)
+		goto drop;
+	if (!caifd->layer.up || !caifd->layer.up->receive)
+		goto drop;
+	if (!netif_oper_up(caifd->netdev))
+		goto drop;
+
+	/* Hold reference to netdevice while using CAIF stack */
+	caifd_hold(caifd);
+	rcu_read_unlock();
+
+	rc = caifd->layer.up->receive(caifd->layer.up, pkt);
+
+	/* For -EILSEQ the packet is not freed, so do it now */
+	if (rc == -EILSEQ)
+		cfpkt_destroy(pkt);
+
+	/* Release reference to stack upwards */
+	caifd_put(caifd);
+	return 0;
+
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/* Packet handler for ETH_P_CAIF frames; added in caif_device_init(). */
+static struct packet_type caif_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_CAIF),
+	.func = receive,
+};
+
+/*
+ * Flow-control callback installed in caif_dev_common.flowctrl: forward a
+ * flow on/off indication from @dev to the layer above, if there is one
+ * with a ctrlcmd hook.
+ */
+static void dev_flowctrl(struct net_device *dev, int on)
+{
+	struct caif_device_entry *caifd;
+
+	rcu_read_lock();
+	caifd = caif_get(dev);
+
+	if (caifd && caifd->layer.up && caifd->layer.up->ctrlcmd) {
+		/* keep the entry referenced while calling into the stack */
+		caifd_hold(caifd);
+		rcu_read_unlock();
+
+		if (on)
+			caifd->layer.up->ctrlcmd(caifd->layer.up,
+					_CAIF_CTRLCMD_PHYIF_FLOW_ON_IND,
+					caifd->layer.id);
+		else
+			caifd->layer.up->ctrlcmd(caifd->layer.up,
+					_CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND,
+					caifd->layer.id);
+
+		caifd_put(caifd);
+		return;
+	}
+
+	rcu_read_unlock();
+}
+
+/*
+ * Netdevice notifier: keep the CAIF device list and CAIF configuration of
+ * the device's namespace in step with net-device events.
+ *
+ * Devices whose type is not ARPHRD_CAIF, and namespaces without a cfcnfg,
+ * are ignored. Returns 0, except for NETDEV_DOWN on a device that has no
+ * entry or no upper layer with a ctrlcmd hook, where -EINVAL is returned.
+ */
+static int caif_device_notify(struct notifier_block *me, unsigned long what,
+			      void *arg)
+{
+	struct net_device *dev = arg;
+	struct caif_device_entry *caifd = NULL;
+	struct caif_dev_common *caifdev;
+	enum cfcnfg_phy_preference pref;
+	enum cfcnfg_phy_type phy_type;
+	struct cfcnfg *cfg;
+	struct caif_device_entry_list *caifdevs;
+
+	if (dev->type != ARPHRD_CAIF)
+		return 0;
+
+	cfg = get_cfcnfg(dev_net(dev));
+	if (cfg == NULL)
+		return 0;
+
+	caifdevs = caif_device_list(dev_net(dev));
+
+	switch (what) {
+	case NETDEV_REGISTER:
+		/* create the entry and attach it to the CAIF stack as a phy */
+		caifd = caif_device_alloc(dev);
+		if (!caifd)
+			return 0;
+
+		caifdev = netdev_priv(dev);
+		caifdev->flowctrl = dev_flowctrl;
+
+		caifd->layer.transmit = transmit;
+
+		if (caifdev->use_frag)
+			phy_type = CFPHYTYPE_FRAG;
+		else
+			phy_type = CFPHYTYPE_CAIF;
+
+		/* anything but an explicit low-latency request maps to high bw */
+		switch (caifdev->link_select) {
+		case CAIF_LINK_HIGH_BANDW:
+			pref = CFPHYPREF_HIGH_BW;
+			break;
+		case CAIF_LINK_LOW_LATENCY:
+			pref = CFPHYPREF_LOW_LAT;
+			break;
+		default:
+			pref = CFPHYPREF_HIGH_BW;
+			break;
+		}
+		strncpy(caifd->layer.name, dev->name,
+			sizeof(caifd->layer.name) - 1);
+		caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0;
+
+		mutex_lock(&caifdevs->lock);
+		list_add_rcu(&caifd->list, &caifdevs->list);
+
+		cfcnfg_add_phy_layer(cfg,
+				     phy_type,
+				     dev,
+				     &caifd->layer,
+				     pref,
+				     caifdev->use_fcs,
+				     caifdev->use_stx);
+		mutex_unlock(&caifdevs->lock);
+		break;
+
+	case NETDEV_UP:
+		rcu_read_lock();
+
+		caifd = caif_get(dev);
+		if (caifd == NULL) {
+			rcu_read_unlock();
+			break;
+		}
+
+		cfcnfg_set_phy_state(cfg, &caifd->layer, true);
+		rcu_read_unlock();
+
+		break;
+
+	case NETDEV_DOWN:
+		rcu_read_lock();
+
+		caifd = caif_get(dev);
+		if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) {
+			rcu_read_unlock();
+			return -EINVAL;
+		}
+
+		cfcnfg_set_phy_state(cfg, &caifd->layer, false);
+		/* reference keeps the entry usable after leaving the RCU section */
+		caifd_hold(caifd);
+		rcu_read_unlock();
+
+		caifd->layer.up->ctrlcmd(caifd->layer.up,
+					 _CAIF_CTRLCMD_PHYIF_DOWN_IND,
+					 caifd->layer.id);
+		caifd_put(caifd);
+		break;
+
+	case NETDEV_UNREGISTER:
+		mutex_lock(&caifdevs->lock);
+
+		caifd = caif_get(dev);
+		if (caifd == NULL) {
+			mutex_unlock(&caifdevs->lock);
+			break;
+		}
+		list_del_rcu(&caifd->list);
+
+		/*
+		 * NETDEV_UNREGISTER is called repeatedly until all reference
+		 * counts for the net-device are released. If references to
+		 * caifd are taken, simply ignore NETDEV_UNREGISTER and wait
+		 * for the next call to NETDEV_UNREGISTER.
+		 *
+		 * If any packets are in flight down the CAIF Stack,
+		 * cfcnfg_del_phy_layer will return nonzero.
+		 * If no packets are in flight, the CAIF Stack associated
+		 * with the net-device un-registering is freed.
+		 */
+
+		if (caifd_refcnt_read(caifd) != 0 ||
+			cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) {
+
+			pr_info("Wait for device inuse\n");
+			/* Re-enroll the device if the CAIF Stack is still in use */
+			list_add_rcu(&caifd->list, &caifdevs->list);
+			mutex_unlock(&caifdevs->lock);
+			break;
+		}
+
+		/* wait for RCU readers before releasing the entry */
+		synchronize_rcu();
+		dev_put(caifd->netdev);
+		free_percpu(caifd->pcpu_refcnt);
+		kfree(caifd);
+
+		mutex_unlock(&caifdevs->lock);
+		break;
+	}
+	return 0;
+}
+
+/* Notifier block registered in caif_device_init(). */
+static struct notifier_block caif_device_notifier = {
+	.notifier_call = caif_device_notify,
+	.priority = 0,
+};
+
+/*
+ * Per-namespace setup: initialise the (empty) CAIF device list and its
+ * mutex, and create the namespace's CAIF configuration object.
+ * Returns 0, or -ENOMEM if cfcnfg_create() fails.
+ */
+static int caif_init_net(struct net *net)
+{
+	struct caif_net *caifn;
+
+	caifn = net_generic(net, caif_net_id);
+
+	INIT_LIST_HEAD(&caifn->caifdevs.list);
+	mutex_init(&caifn->caifdevs.lock);
+
+	caifn->cfg = cfcnfg_create();
+	if (caifn->cfg)
+		return 0;
+
+	pr_warn("can't create cfcnfg\n");
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace teardown: take every CAIF device entry of @net off the
+ * list, detach it from the CAIF stack, free it, and finally remove the
+ * namespace's CAIF configuration.
+ *
+ * Runs with the RTNL lock and the device-list mutex held; both are
+ * released on every exit path, including the one where the namespace has
+ * no configuration object.
+ */
+static void caif_exit_net(struct net *net)
+{
+	struct caif_device_entry *caifd, *tmp;
+	struct caif_device_entry_list *caifdevs =
+	    caif_device_list(net);
+	struct cfcnfg *cfg;
+
+	rtnl_lock();
+	mutex_lock(&caifdevs->lock);
+
+	cfg = get_cfcnfg(net);
+	if (cfg == NULL)
+		goto out_unlock;
+
+	list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) {
+		int i = 0;
+		list_del_rcu(&caifd->list);
+		cfcnfg_set_phy_state(cfg, &caifd->layer, false);
+
+		/*
+		 * Give users of the device up to 10 * 250 ms to let go; the
+		 * entry is released afterwards regardless of the outcome.
+		 */
+		while (i < 10 &&
+			(caifd_refcnt_read(caifd) != 0 ||
+			cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) {
+
+			pr_info("Wait for device inuse\n");
+			msleep(250);
+			i++;
+		}
+		synchronize_rcu();
+		dev_put(caifd->netdev);
+		free_percpu(caifd->pcpu_refcnt);
+		kfree(caifd);
+	}
+	cfcnfg_remove(cfg);
+
+out_unlock:
+	mutex_unlock(&caifdevs->lock);
+	rtnl_unlock();
+}
+
+/*
+ * Per-namespace hooks; .id and .size describe the struct caif_net that
+ * net_generic(net, caif_net_id) returns.
+ */
+static struct pernet_operations caif_net_ops = {
+	.init = caif_init_net,
+	.exit = caif_exit_net,
+	.id   = &caif_net_id,
+	.size = sizeof(struct caif_net),
+};
+
+/*
+ * Module init: register the per-namespace operations, then the netdevice
+ * notifier and the CAIF packet handler. Only the first step can make
+ * this function fail.
+ */
+static int __init caif_device_init(void)
+{
+	int err = register_pernet_subsys(&caif_net_ops);
+
+	if (err != 0)
+		return err;
+
+	register_netdevice_notifier(&caif_device_notifier);
+	dev_add_pack(&caif_packet_type);
+	return 0;
+}
+
+/* Module exit: unregister everything caif_device_init() registered. */
+static void __exit caif_device_exit(void)
+{
+	unregister_pernet_subsys(&caif_net_ops);
+	unregister_netdevice_notifier(&caif_device_notifier);
+	dev_remove_pack(&caif_packet_type);
+}
+
+/* Module entry and exit points. */
+module_init(caif_device_init);
+module_exit(caif_device_exit);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
new file mode 100644
index 00000000..a9862808
--- /dev/null
+++ b/net/caif/caif_socket.c
@@ -0,0 +1,1220 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/caif/caif_socket.h>
+#include <linux/atomic.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/caif_dev.h>
+#include <net/caif/cfpkt.h>
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(AF_CAIF);
+
+/*
+ * CAIF state re-uses the TCP socket state values.
+ * The caif_states value stored in sk_state reflects the state as reported
+ * by the CAIF stack, while sk_socket->state is the state of the socket.
+ */
+enum caif_states {
+	CAIF_CONNECTED		= TCP_ESTABLISHED,
+	CAIF_CONNECTING	= TCP_SYN_SENT,
+	CAIF_DISCONNECTED	= TCP_CLOSE
+};
+
+/* Bit numbers used with the bit operations on caifsock.flow_state */
+#define TX_FLOW_ON_BIT	1
+#define RX_FLOW_ON_BIT	2
+
+static struct dentry *debugfsdir;
+
+/*
+ * Event counters, only present with CONFIG_DEBUG_FS. Without it the
+ * dbfs_atomic_inc()/dbfs_atomic_dec() macros expand to the constant 0, so
+ * their argument is not evaluated.
+ */
+#ifdef CONFIG_DEBUG_FS
+struct debug_fs_counter {
+	atomic_t caif_nr_socks;
+	atomic_t caif_sock_create;
+	atomic_t num_connect_req;
+	atomic_t num_connect_resp;
+	atomic_t num_connect_fail_resp;
+	atomic_t num_disconnect;
+	atomic_t num_remote_shutdown_ind;
+	atomic_t num_tx_flow_off_ind;
+	atomic_t num_tx_flow_on_ind;
+	atomic_t num_rx_flow_off;
+	atomic_t num_rx_flow_on;
+};
+static struct debug_fs_counter cnt;
+#define	dbfs_atomic_inc(v) atomic_inc_return(v)
+#define	dbfs_atomic_dec(v) atomic_dec_return(v)
+#else
+#define	dbfs_atomic_inc(v) 0
+#define	dbfs_atomic_dec(v) 0
+#endif
+
+/*
+ * CAIF socket: a struct sock with the CAIF layer and flow-control state
+ * attached. Code in this file converts between the sock, the layer and
+ * the caifsock with container_of().
+ */
+struct caifsock {
+	struct sock sk; /* must be first member */
+	struct cflayer layer;
+	char name[CAIF_LAYER_NAME_SZ]; /* Used for debugging */
+	/*
+	 * TX_FLOW_ON_BIT / RX_FLOW_ON_BIT, accessed through a (void *) cast.
+	 * NOTE(review): the bit operations work on unsigned long while this
+	 * field is u32 - confirm this is safe on 64-bit big-endian.
+	 */
+	u32 flow_state;
+	struct caif_connect_request conn_req;
+	struct mutex readlock;	/* taken by caif_read_lock() */
+	struct dentry *debugfs_socket_dir;
+	int headroom, tailroom, maxframe;	/* used to size outgoing skbs */
+};
+
+/* Non-zero when the RX flow-on bit is set in @cf_sk's flow state. */
+static int rx_flow_is_on(struct caifsock *cf_sk)
+{
+	void *state = &cf_sk->flow_state;
+
+	return test_bit(RX_FLOW_ON_BIT, state);
+}
+
+/* Non-zero when the TX flow-on bit is set in @cf_sk's flow state. */
+static int tx_flow_is_on(struct caifsock *cf_sk)
+{
+	void *state = &cf_sk->flow_state;
+
+	return test_bit(TX_FLOW_ON_BIT, state);
+}
+
+/* Clear the RX flow-on bit in @cf_sk's flow state. */
+static void set_rx_flow_off(struct caifsock *cf_sk)
+{
+	void *state = &cf_sk->flow_state;
+
+	clear_bit(RX_FLOW_ON_BIT, state);
+}
+
+/* Set the RX flow-on bit in @cf_sk's flow state. */
+static void set_rx_flow_on(struct caifsock *cf_sk)
+{
+	void *state = &cf_sk->flow_state;
+
+	set_bit(RX_FLOW_ON_BIT, state);
+}
+
+/* Clear the TX flow-on bit in @cf_sk's flow state. */
+static void set_tx_flow_off(struct caifsock *cf_sk)
+{
+	void *state = &cf_sk->flow_state;
+
+	clear_bit(TX_FLOW_ON_BIT, state);
+}
+
+/* Set the TX flow-on bit in @cf_sk's flow state. */
+static void set_tx_flow_on(struct caifsock *cf_sk)
+{
+	void *state = &cf_sk->flow_state;
+
+	set_bit(TX_FLOW_ON_BIT, state);
+}
+
+/* Take the read mutex of the CAIF socket that embeds @sk. */
+static void caif_read_lock(struct sock *sk)
+{
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	mutex_lock(&cf_sk->readlock);
+}
+
+/* Release the read mutex of the CAIF socket that embeds @sk. */
+static void caif_read_unlock(struct sock *sk)
+{
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	mutex_unlock(&cf_sk->readlock);
+}
+
+/* Return the receive low-water mark for @cf_sk. */
+static int sk_rcvbuf_lowwater(struct caifsock *cf_sk)
+{
+	/* A quarter of the full buffer is used as the low-water mark */
+	return cf_sk->sk.sk_rcvbuf / 4;
+}
+
+/*
+ * Send modem command @mode to the layer below the socket, if there is a
+ * lower layer and it provides a modemcmd hook.
+ */
+static void caif_flow_ctrl(struct sock *sk, int mode)
+{
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	if (!cf_sk->layer.dn || !cf_sk->layer.dn->modemcmd)
+		return;
+
+	cf_sk->layer.dn->modemcmd(cf_sk->layer.dn, mode);
+}
+
+/*
+ * Queue @skb on @sk's receive queue.
+ *
+ * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are
+ * not dropped when the receive buffer is full; CAIF sends flow off
+ * instead.
+ *
+ * Returns 0, or the error from sk_filter(). On the sk_filter() error path
+ * the skb is not freed here.
+ */
+static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int err;
+	int skb_len;
+	unsigned long flags;
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	/* receive buffer would be exceeded: request flow off once */
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+		(unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
+		if (net_ratelimit())
+			pr_debug("sending flow OFF (queue len = %d %d)\n",
+					atomic_read(&cf_sk->sk.sk_rmem_alloc),
+					sk_rcvbuf_lowwater(cf_sk));
+		set_rx_flow_off(cf_sk);
+		dbfs_atomic_inc(&cnt.num_rx_flow_off);
+		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
+	}
+
+	err = sk_filter(sk, skb);
+	if (err)
+		return err;
+	if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
+		set_rx_flow_off(cf_sk);
+		if (net_ratelimit())
+			pr_debug("sending flow OFF due to rmem_schedule\n");
+		dbfs_atomic_inc(&cnt.num_rx_flow_off);
+		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
+	}
+	skb->dev = NULL;
+	skb_set_owner_r(skb, sk);
+	/* Cache the SKB length before we tack it onto the receive
+	 * queue. Once it is added it no longer belongs to us and
+	 * may be freed by other threads of control pulling packets
+	 * from the queue.
+	 */
+	skb_len = skb->len;
+	spin_lock_irqsave(&list->lock, flags);
+	if (!sock_flag(sk, SOCK_DEAD))
+		__skb_queue_tail(list, skb);
+	spin_unlock_irqrestore(&list->lock, flags);
+
+	/*
+	 * NOTE(review): SOCK_DEAD is sampled again here, outside the queue
+	 * lock; confirm the flag cannot change between the two tests.
+	 */
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb_len);
+	else
+		kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Packet receive callback, called from the CAIF stack.
+ * Packets arriving while the socket is not connected are freed; all
+ * others are queued on the socket. Always returns 0.
+ */
+static int caif_sktrecv_cb(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+	struct sk_buff *skb = cfpkt_tonative(pkt);
+
+	if (likely(cf_sk->sk.sk_state == CAIF_CONNECTED))
+		caif_queue_rcv_skb(&cf_sk->sk, skb);
+	else
+		kfree_skb(skb);
+
+	return 0;
+}
+
+/* Reference-count hook: take a reference on the socket that owns @layr. */
+static void cfsk_hold(struct cflayer *layr)
+{
+	struct caifsock *owner;
+
+	owner = container_of(layr, struct caifsock, layer);
+	sock_hold(&owner->sk);
+}
+
+/* Reference-count hook: drop a reference on the socket that owns @layr. */
+static void cfsk_put(struct cflayer *layr)
+{
+	struct caifsock *owner;
+
+	owner = container_of(layr, struct caifsock, layer);
+	sock_put(&owner->sk);
+}
+
+/*
+ * Control callback, called from the CAIF stack.
+ *
+ * Translates the control command @flow into socket state: flow-control
+ * bits, sk_state, sk_err and sk_shutdown, and then wakes the socket via
+ * sk_state_change() or sk_error_report(). @phyid is not used here.
+ */
+static void caif_ctrl_cb(struct cflayer *layr,
+				enum caif_ctrlcmd flow,
+				int phyid)
+{
+	struct caifsock *cf_sk = container_of(layr, struct caifsock, layer);
+	switch (flow) {
+	case CAIF_CTRLCMD_FLOW_ON_IND:
+		/* OK from modem to start sending again */
+		dbfs_atomic_inc(&cnt.num_tx_flow_on_ind);
+		set_tx_flow_on(cf_sk);
+		cf_sk->sk.sk_state_change(&cf_sk->sk);
+		break;
+
+	case CAIF_CTRLCMD_FLOW_OFF_IND:
+		/* Modem asks us to shut up */
+		dbfs_atomic_inc(&cnt.num_tx_flow_off_ind);
+		set_tx_flow_off(cf_sk);
+		cf_sk->sk.sk_state_change(&cf_sk->sk);
+		break;
+
+	case CAIF_CTRLCMD_INIT_RSP:
+		/* We're now connected */
+		caif_client_register_refcnt(&cf_sk->layer,
+						cfsk_hold, cfsk_put);
+		dbfs_atomic_inc(&cnt.num_connect_resp);
+		cf_sk->sk.sk_state = CAIF_CONNECTED;
+		set_tx_flow_on(cf_sk);
+		cf_sk->sk.sk_state_change(&cf_sk->sk);
+		break;
+
+	case CAIF_CTRLCMD_DEINIT_RSP:
+		/* We're now disconnected */
+		cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+		cf_sk->sk.sk_state_change(&cf_sk->sk);
+		break;
+
+	case CAIF_CTRLCMD_INIT_FAIL_RSP:
+		/* Connect request failed */
+		dbfs_atomic_inc(&cnt.num_connect_fail_resp);
+		cf_sk->sk.sk_err = ECONNREFUSED;
+		cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+		cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
+		/*
+		 * Socket "standards" seem to require POLLOUT to
+		 * be set at connect failure.
+		 */
+		set_tx_flow_on(cf_sk);
+		cf_sk->sk.sk_state_change(&cf_sk->sk);
+		break;
+
+	case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+		/* Modem has closed this connection, or device is down. */
+		dbfs_atomic_inc(&cnt.num_remote_shutdown_ind);
+		cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
+		cf_sk->sk.sk_err = ECONNRESET;
+		set_rx_flow_on(cf_sk);
+		cf_sk->sk.sk_error_report(&cf_sk->sk);
+		break;
+
+	default:
+		pr_debug("Unexpected flow command %d\n", flow);
+	}
+}
+
+/*
+ * Re-enable RX flow once the receive queue has drained: if RX flow is
+ * currently off and the allocated receive memory is at or below the
+ * low-water mark, mark flow on and send a flow-on request downwards.
+ */
+static void caif_check_flow_release(struct sock *sk)
+{
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	if (rx_flow_is_on(cf_sk))
+		return;
+
+	if (atomic_read(&sk->sk_rmem_alloc) > sk_rcvbuf_lowwater(cf_sk))
+		return;
+
+	dbfs_atomic_inc(&cnt.num_rx_flow_on);
+	set_rx_flow_on(cf_sk);
+	caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_ON_REQ);
+}
+
+/*
+ * Receive one packet on a SEQPACKET CAIF socket.
+ *
+ * Copied from unix_dgram_recvmsg, but removed credit checks,
+ * changed locking, address handling and added MSG_TRUNC.
+ *
+ * At most @len bytes are copied; if the packet is longer, MSG_TRUNC is set
+ * in m->msg_flags. No source address is returned, so msg_namelen is set to
+ * zero. Returns the number of bytes copied (or the full packet length if
+ * the caller passed MSG_TRUNC in @flags), -EOPNOTSUPP for MSG_OOB, or
+ * another negative errno.
+ */
+static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *m, size_t len, int flags)
+
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int ret;
+	int copylen;
+
+	/*
+	 * The request flags arrive in @flags (as tested in
+	 * caif_stream_recvmsg()); m->msg_flags is where flags such as
+	 * MSG_TRUNC are reported back.
+	 */
+	ret = -EOPNOTSUPP;
+	if (flags & MSG_OOB)
+		goto read_error;
+
+	/* no address is filled in; do not leave a stale length behind */
+	m->msg_namelen = 0;
+
+	skb = skb_recv_datagram(sk, flags, 0, &ret);
+	if (!skb)
+		goto read_error;
+	copylen = skb->len;
+	if (len < copylen) {
+		m->msg_flags |= MSG_TRUNC;
+		copylen = len;
+	}
+
+	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen);
+	if (ret)
+		goto out_free;
+
+	ret = (flags & MSG_TRUNC) ? skb->len : copylen;
+out_free:
+	skb_free_datagram(sk, skb);
+	caif_check_flow_release(sk);
+	return ret;
+
+read_error:
+	return ret;
+}
+
+
+/*
+ * Sleep until there is something for a stream reader to act on.
+ *
+ * Copied from unix_stream_wait_data, identical except for lock call.
+ *
+ * Waits, with the socket lock dropped while sleeping, until the receive
+ * queue is non-empty, an error is pending, the socket is no longer
+ * connected, is dead or shut down for receive, a signal is pending, or
+ * @timeo has run out. Returns the remaining timeout.
+ */
+static long caif_stream_data_wait(struct sock *sk, long timeo)
+{
+	DEFINE_WAIT(wait);
+	lock_sock(sk);
+
+	for (;;) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+			sk->sk_err ||
+			sk->sk_state != CAIF_CONNECTED ||
+			sock_flag(sk, SOCK_DEAD) ||
+			(sk->sk_shutdown & RCV_SHUTDOWN) ||
+			signal_pending(current) ||
+			!timeo)
+			break;
+
+		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	}
+
+	finish_wait(sk_sleep(sk), &wait);
+	release_sock(sk);
+	return timeo;
+}
+
+
+/*
+ * Receive data on a STREAM CAIF socket.
+ *
+ * Copied from unix_stream_recvmsg, but removed credit checks,
+ * changed locking calls, changed address handling.
+ *
+ * Dequeues skbs and copies their data to @msg until @size bytes were
+ * copied, the queue runs dry with the low-water target met, or an error
+ * or timeout occurs. Returns the number of bytes copied if non-zero,
+ * otherwise a negative errno.
+ */
+static int caif_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *msg, size_t size,
+				int flags)
+{
+	struct sock *sk = sock->sk;
+	int copied = 0;
+	int target;
+	int err = 0;
+	long timeo;
+
+	err = -EOPNOTSUPP;
+	if (flags&MSG_OOB)
+		goto out;
+
+	msg->msg_namelen = 0;
+
+	/*
+	 * Lock the socket to prevent queue disordering
+	 * while sleeps in memcpy_tomsg
+	 */
+	err = -EAGAIN;
+	if (sk->sk_state == CAIF_CONNECTING)
+		goto out;
+
+	caif_read_lock(sk);
+	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
+	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
+
+	do {
+		int chunk;
+		struct sk_buff *skb;
+
+		lock_sock(sk);
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		caif_check_flow_release(sk);
+
+		if (skb == NULL) {
+			if (copied >= target)
+				goto unlock;
+			/*
+			 *	POSIX 1003.1g mandates this order.
+			 */
+			err = sock_error(sk);
+			if (err)
+				goto unlock;
+			err = -ECONNRESET;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				goto unlock;
+
+			err = -EPIPE;
+			if (sk->sk_state != CAIF_CONNECTED)
+				goto unlock;
+			if (sock_flag(sk, SOCK_DEAD))
+				goto unlock;
+
+			release_sock(sk);
+
+			err = -EAGAIN;
+			if (!timeo)
+				break;
+
+			/* the read lock is not held while waiting for data */
+			caif_read_unlock(sk);
+
+			timeo = caif_stream_data_wait(sk, timeo);
+
+			if (signal_pending(current)) {
+				err = sock_intr_errno(timeo);
+				/* read lock already dropped above */
+				goto out;
+			}
+			caif_read_lock(sk);
+			continue;
+unlock:
+			release_sock(sk);
+			break;
+		}
+		release_sock(sk);
+		chunk = min_t(unsigned int, skb->len, size);
+		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+			/* copy failed: requeue the skb untouched */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			if (copied == 0)
+				copied = -EFAULT;
+			break;
+		}
+		copied += chunk;
+		size -= chunk;
+
+		/* Mark read part of skb as used */
+		if (!(flags & MSG_PEEK)) {
+			skb_pull(skb, chunk);
+
+			/* put the skb back if we didn't use it up. */
+			if (skb->len) {
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				break;
+			}
+			kfree_skb(skb);
+
+		} else {
+			/*
+			 * It is questionable, see note in unix_dgram_recvmsg.
+			 */
+			/* put message back and return */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			break;
+		}
+	} while (size);
+	caif_read_unlock(sk);
+
+out:
+	return copied ? : err;
+}
+
+/*
+ * Wait until TX flow is on and, if @wait_writeable is set, the socket is
+ * writeable.
+ *
+ * Copied from sock.c:sock_wait_for_wmem, but changed to wait for
+ * CAIF flow-on and sock_writeable.
+ *
+ * *@err is 0 when the condition was met; otherwise it holds the reason
+ * the wait ended: -ETIMEDOUT, -ERESTARTSYS, -ECONNRESET, -sk_err or
+ * -EPIPE. Returns the remaining timeout.
+ */
+static long caif_wait_for_flow_on(struct caifsock *cf_sk,
+				int wait_writeable, long timeo, int *err)
+{
+	struct sock *sk = &cf_sk->sk;
+	DEFINE_WAIT(wait);
+	for (;;) {
+		/* each check sets *err first, so a break reports its reason */
+		*err = 0;
+		if (tx_flow_is_on(cf_sk) &&
+			(!wait_writeable || sock_writeable(&cf_sk->sk)))
+			break;
+		*err = -ETIMEDOUT;
+		if (!timeo)
+			break;
+		*err = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		*err = -ECONNRESET;
+		if (sk->sk_shutdown & SHUTDOWN_MASK)
+			break;
+		*err = -sk->sk_err;
+		if (sk->sk_err)
+			break;
+		*err = -EPIPE;
+		if (cf_sk->sk.sk_state != CAIF_CONNECTED)
+			break;
+		timeo = schedule_timeout(timeo);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return timeo;
+}
+
+/*
+ * Transmit a SKB. The device may temporarily request re-transmission
+ * by returning EAGAIN.
+ *
+ * Clears the caif_payload_info area in skb->cb and passes the packet to
+ * the transmit hook of the layer below. @noblock and @timeo are not used.
+ *
+ * Returns -EINVAL when no lower layer is attached, otherwise the result
+ * of the lower layer's transmit hook.
+ *
+ * NOTE(review): on the -EINVAL path the skb is not freed here.
+ * caif_seqpkt_sendmsg() treats a failed transmit as "skb is already freed"
+ * while caif_stream_sendmsg() frees the skb itself - confirm who owns the
+ * skb on each error path.
+ */
+static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
+			int noblock, long timeo)
+{
+	struct cfpkt *pkt;
+
+	pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb);
+	memset(skb->cb, 0, sizeof(struct caif_payload_info));
+
+	if (cf_sk->layer.dn == NULL)
+		return -EINVAL;
+
+	return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt);
+}
+
+/*
+ * Send one packet on a SEQPACKET CAIF socket.
+ *
+ * Copied from af_unix:unix_dgram_sendmsg, and adapted to CAIF.
+ *
+ * Rejects MSG_OOB and destination addresses (-EOPNOTSUPP) and a NULL
+ * first iovec base (-EINVAL), waits for TX flow-on and a writeable
+ * socket, then copies @len bytes into a new skb with the socket's
+ * headroom and tailroom reserved and hands it to transmit_skb().
+ *
+ * Returns @len on success or a negative errno.
+ */
+static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock,
+			struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+	int buffer_size;
+	int ret = 0;
+	struct sk_buff *skb = NULL;
+	int noblock;
+	long timeo;
+	caif_assert(cf_sk);
+	ret = sock_error(sk);
+	if (ret)
+		goto err;
+
+	ret = -EOPNOTSUPP;
+	if (msg->msg_flags&MSG_OOB)
+		goto err;
+
+	ret = -EOPNOTSUPP;
+	if (msg->msg_namelen)
+		goto err;
+
+	ret = -EINVAL;
+	if (unlikely(msg->msg_iov->iov_base == NULL))
+		goto err;
+	noblock = msg->msg_flags & MSG_DONTWAIT;
+
+	timeo = sock_sndtimeo(sk, noblock);
+	timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk),
+				1, timeo, &ret);
+
+	if (ret)
+		goto err;
+	ret = -EPIPE;
+	if (cf_sk->sk.sk_state != CAIF_CONNECTED ||
+		sock_flag(sk, SOCK_DEAD) ||
+		(sk->sk_shutdown & RCV_SHUTDOWN))
+		goto err;
+
+	/* Error if trying to write more than maximum frame size. */
+	ret = -EMSGSIZE;
+	if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM)
+		goto err;
+
+	buffer_size = len + cf_sk->headroom + cf_sk->tailroom;
+
+	ret = -ENOMEM;
+	skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret);
+
+	if (!skb || skb_tailroom(skb) < buffer_size)
+		goto err;
+
+	skb_reserve(skb, cf_sk->headroom);
+
+	ret = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+
+	if (ret)
+		goto err;
+	ret = transmit_skb(skb, cf_sk, noblock, timeo);
+	if (ret < 0)
+		/* skb is already freed */
+		return ret;
+
+	return len;
+err:
+	/* skb is still NULL on the early paths; kfree_skb() is passed NULL there */
+	kfree_skb(skb);
+	return ret;
+}
+
+/*
+ * Copied from unix_stream_sendmsg and adapted to CAIF:
+ * Changed removed permission handling and added waiting for flow on
+ * and other minor adaptations.
+ */
+static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
+				struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+	int err, size;
+	struct sk_buff *skb;
+	int sent = 0;
+	long timeo;
+
+	err = -EOPNOTSUPP;
+	if (unlikely(msg->msg_flags&MSG_OOB))
+		goto out_err;
+
+	if (unlikely(msg->msg_namelen))
+		goto out_err;
+
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	timeo = caif_wait_for_flow_on(cf_sk, 1, timeo, &err);
+
+	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+		goto pipe_err;
+
+	while (sent < len) {
+
+		size = len-sent;
+
+		if (size > cf_sk->maxframe)
+			size = cf_sk->maxframe;
+
+		/* If size is more than half of sndbuf, chop up message */
+		if (size > ((sk->sk_sndbuf >> 1) - 64))
+			size = (sk->sk_sndbuf >> 1) - 64;
+
+		if (size > SKB_MAX_ALLOC)
+			size = SKB_MAX_ALLOC;
+
+		skb = sock_alloc_send_skb(sk,
+					size + cf_sk->headroom +
+					cf_sk->tailroom,
+					msg->msg_flags&MSG_DONTWAIT,
+					&err);
+		if (skb == NULL)
+			goto out_err;
+
+		skb_reserve(skb, cf_sk->headroom);
+		/*
+		 *	If you pass two values to the sock_alloc_send_skb
+		 *	it tries to grab the large buffer with GFP_NOFS
+		 *	(which can fail easily), and if it fails grab the
+		 *	fallback size buffer which is under a page and will
+		 *	succeed. [Alan]
+		 */
+		size = min_t(int, size, skb_tailroom(skb));
+
+		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+		if (err) {
+			kfree_skb(skb);
+			goto out_err;
+		}
+		err = transmit_skb(skb, cf_sk,
+				msg->msg_flags&MSG_DONTWAIT, timeo);
+		if (err < 0) {
+			kfree_skb(skb);
+			goto pipe_err;
+		}
+		sent += size;
+	}
+
+	return sent;
+
+pipe_err:
+	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+		send_sig(SIGPIPE, current, 0);
+	err = -EPIPE;
+out_err:
+	return sent ? : err;
+}
+
+static int setsockopt(struct socket *sock,
+			int lvl, int opt, char __user *ov, unsigned int ol)
+{
+	struct sock *sk = sock->sk;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+	int linksel;
+
+	/* Options may only be changed while the socket is unconnected */
+	if (cf_sk->sk.sk_socket->state != SS_UNCONNECTED)
+		return -ENOPROTOOPT;
+
+	switch (opt) {
+	case CAIFSO_LINK_SELECT:
+		/* The length is validated before the option level */
+		if (ol < sizeof(linksel))
+			return -EINVAL;
+		if (lvl != SOL_CAIF)
+			return -ENOPROTOOPT;
+		if (copy_from_user(&linksel, ov, sizeof(linksel)))
+			return -EINVAL;
+
+		lock_sock(sk);
+		cf_sk->conn_req.link_selector = linksel;
+		release_sock(sk);
+		return 0;
+
+	case CAIFSO_REQ_PARAM:
+		if (lvl != SOL_CAIF)
+			return -ENOPROTOOPT;
+		if (cf_sk->sk.sk_protocol != CAIFPROTO_UTIL)
+			return -ENOPROTOOPT;
+
+		/* Size check and copy are done with the socket locked */
+		lock_sock(sk);
+		if (ol > sizeof(cf_sk->conn_req.param.data) ||
+			copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+		cf_sk->conn_req.param.size = ol;
+		release_sock(sk);
+		return 0;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ * caif_connect() - Connect a CAIF Socket
+ * Copied and modified af_irda.c:irda_connect().
+ *
+ * Note : by consulting "errno", the user space caller may learn the cause
+ * of the failure. Most of them are visible in the function, others may come
+ * from subroutines called and are listed here :
+ *  o -EAFNOSUPPORT: bad socket family or type.
+ *  o -ESOCKTNOSUPPORT: bad socket type or protocol
+ *  o -EINVAL: bad socket address, or CAIF link type
+ *  o -ECONNREFUSED: remote end refused the connection.
+ *  o -EINPROGRESS: connect request sent but timed out (or non-blocking)
+ *  o -EISCONN: already connected.
+ *  o -ETIMEDOUT: Connection timed out (send timeout)
+ *  o -ENODEV: No link layer to send request
+ *  o -ECONNRESET: Received Shutdown indication or lost link layer
+ *  o -ENOMEM: Out of memory
+ *
+ *  State Strategy:
+ *  o sk_state: holds the CAIF_* protocol state, it's updated by
+ *	caif_ctrl_cb.
+ *  o sock->state: holds the SS_* socket state and is updated by connect and
+ *	disconnect.
+ */
+static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+	long timeo;
+	int err;
+	int ifindex, headroom, tailroom;
+	unsigned int mtu;
+	struct net_device *dev;
+
+	lock_sock(sk);
+
+	/* sa_family may only be read if the caller supplied that much */
+	err = -EINVAL;
+	if (addr_len < (int)sizeof(uaddr->sa_family))
+		goto out;
+
+	err = -EAFNOSUPPORT;
+	if (uaddr->sa_family != AF_CAIF)
+		goto out;
+
+	switch (sock->state) {
+	case SS_UNCONNECTED:
+		/* Normal case, a fresh connect */
+		caif_assert(sk->sk_state == CAIF_DISCONNECTED);
+		break;
+	case SS_CONNECTING:
+		switch (sk->sk_state) {
+		case CAIF_CONNECTED:
+			sock->state = SS_CONNECTED;
+			err = -EISCONN;
+			goto out;
+		case CAIF_DISCONNECTED:
+			/* Reconnect allowed */
+			break;
+		case CAIF_CONNECTING:
+			err = -EALREADY;
+			if (flags & O_NONBLOCK)
+				goto out;
+			goto wait_connect;
+		}
+		break;
+	case SS_CONNECTED:
+		caif_assert(sk->sk_state == CAIF_CONNECTED ||
+				sk->sk_state == CAIF_DISCONNECTED);
+		if (sk->sk_shutdown & SHUTDOWN_MASK) {
+			/* Allow re-connect after SHUTDOWN_IND */
+			caif_disconnect_client(sock_net(sk), &cf_sk->layer);
+			caif_free_client(&cf_sk->layer);
+			break;
+		}
+		/* No reconnect on a seqpacket socket */
+		err = -EISCONN;
+		goto out;
+	case SS_DISCONNECTING:
+	case SS_FREE:
+		caif_assert(1); /*Should never happen */
+		break;
+	}
+	sk->sk_state = CAIF_DISCONNECTED;
+	sock->state = SS_UNCONNECTED;
+	sk_stream_kill_queues(&cf_sk->sk);
+
+	err = -EINVAL;
+	if (addr_len != sizeof(struct sockaddr_caif))
+		goto out;
+
+	memcpy(&cf_sk->conn_req.sockaddr, uaddr, sizeof(struct sockaddr_caif));
+
+	/* Move to connecting socket, start sending Connect Requests */
+	sock->state = SS_CONNECTING;
+	sk->sk_state = CAIF_CONNECTING;
+
+	/* Clamp the socket priority into [CAIF_PRIO_MIN, CAIF_PRIO_MAX] */
+	if (cf_sk->sk.sk_priority > CAIF_PRIO_MAX)
+		cf_sk->conn_req.priority = CAIF_PRIO_MAX;
+	else if (cf_sk->sk.sk_priority < CAIF_PRIO_MIN)
+		cf_sk->conn_req.priority = CAIF_PRIO_MIN;
+	else
+		cf_sk->conn_req.priority = cf_sk->sk.sk_priority;
+
+	/*ifindex = id of the interface.*/
+	cf_sk->conn_req.ifindex = cf_sk->sk.sk_bound_dev_if;
+
+	dbfs_atomic_inc(&cnt.num_connect_req);
+	cf_sk->layer.receive = caif_sktrecv_cb;
+
+	err = caif_connect_client(sock_net(sk), &cf_sk->conn_req,
+				&cf_sk->layer, &ifindex, &headroom, &tailroom);
+	if (err < 0) {
+		cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
+		cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+		goto out;
+	}
+
+	err = -ENODEV;
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+	if (!dev) {
+		rcu_read_unlock();
+		goto out;
+	}
+	cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
+	mtu = dev->mtu;	/* dev must not be touched after the unlock below */
+	rcu_read_unlock();
+
+	cf_sk->tailroom = tailroom;
+	cf_sk->maxframe = mtu - (headroom + tailroom);
+	if (cf_sk->maxframe < 1) {
+		pr_warn("CAIF Interface MTU too small (%u)\n", mtu);
+		err = -ENODEV;
+		goto out;
+	}
+
+	err = -EINPROGRESS;
+wait_connect:
+	if (sk->sk_state != CAIF_CONNECTED && (flags & O_NONBLOCK))
+		goto out;
+
+	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+	release_sock(sk);
+	err = -ERESTARTSYS;
+	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+			sk->sk_state != CAIF_CONNECTING, timeo);
+	lock_sock(sk);
+	if (timeo < 0)
+		goto out; /* -ERESTARTSYS */
+
+	err = -ETIMEDOUT;
+	if (timeo == 0 && sk->sk_state != CAIF_CONNECTED)
+		goto out;
+	if (sk->sk_state != CAIF_CONNECTED) {
+		sock->state = SS_UNCONNECTED;
+		err = sock_error(sk);
+		if (!err)
+			err = -ECONNREFUSED;
+		goto out;
+	}
+	sock->state = SS_CONNECTED;
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * caif_release() - Disconnect a CAIF Socket
+ * Copied and modified af_irda.c:irda_release().
+ */
+static int caif_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	if (!sk)
+		return 0;
+
+	set_tx_flow_off(cf_sk);	/* caif_poll() stops reporting writable */
+
+	/*
+	 * Mark the socket dead under the receive-queue lock so that no
+	 * packet is queued after this point: caif_queue_rcv_skb checks
+	 * SOCK_DEAD while holding the same lock.
+	 */
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	sock_set_flag(sk, SOCK_DEAD);
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+	sock->sk = NULL;	/* detach the sock from the struct socket */
+
+	dbfs_atomic_inc(&cnt.num_disconnect);
+
+	WARN_ON(IS_ERR(cf_sk->debugfs_socket_dir));
+	if (cf_sk->debugfs_socket_dir != NULL)
+		debugfs_remove_recursive(cf_sk->debugfs_socket_dir);
+
+	lock_sock(&(cf_sk->sk));
+	sk->sk_state = CAIF_DISCONNECTED;
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	caif_disconnect_client(sock_net(sk), &cf_sk->layer);
+	cf_sk->sk.sk_socket->state = SS_DISCONNECTING;
+	wake_up_interruptible_poll(sk_sleep(sk), POLLERR|POLLHUP);
+
+	sock_orphan(sk);
+	sk_stream_kill_queues(&cf_sk->sk);
+	release_sock(sk);
+	sock_put(sk);
+	return 0;
+}
+
+/* Copied from af_unix.c:unix_poll(), added CAIF tx_flow handling */
+static unsigned int caif_poll(struct file *file,
+				struct socket *sock, poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int mask;
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	/* exceptional events: pending error, full or receive-side shutdown */
+	if (sk->sk_err)
+		mask |= POLLERR;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		mask |= POLLHUP;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLRDHUP;
+
+	/* readable: data is queued, or the receive side is shut down */
+	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		(sk->sk_shutdown & RCV_SHUTDOWN))
+		mask |= POLLIN | POLLRDNORM;
+
+	/*
+	 * writable: only when the socket has send-buffer space and the
+	 * CAIF transmit flow is on (the tx_flow addition over unix_poll).
+	 */
+	if (sock_writeable(sk) && tx_flow_is_on(cf_sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+}
+
+static const struct proto_ops caif_seqpacket_ops = {	/* SOCK_SEQPACKET */
+	.family = PF_CAIF,
+	.owner = THIS_MODULE,
+	.release = caif_release,
+	.bind = sock_no_bind,
+	.connect = caif_connect,
+	.socketpair = sock_no_socketpair,
+	.accept = sock_no_accept,
+	.getname = sock_no_getname,
+	.poll = caif_poll,
+	.ioctl = sock_no_ioctl,
+	.listen = sock_no_listen,
+	.shutdown = sock_no_shutdown,
+	.setsockopt = setsockopt,
+	.getsockopt = sock_no_getsockopt,
+	.sendmsg = caif_seqpkt_sendmsg,	/* differs from the stream ops */
+	.recvmsg = caif_seqpkt_recvmsg,	/* differs from the stream ops */
+	.mmap = sock_no_mmap,
+	.sendpage = sock_no_sendpage,
+};
+
+static const struct proto_ops caif_stream_ops = {	/* SOCK_STREAM */
+	.family = PF_CAIF,
+	.owner = THIS_MODULE,
+	.release = caif_release,
+	.bind = sock_no_bind,
+	.connect = caif_connect,
+	.socketpair = sock_no_socketpair,
+	.accept = sock_no_accept,
+	.getname = sock_no_getname,
+	.poll = caif_poll,
+	.ioctl = sock_no_ioctl,
+	.listen = sock_no_listen,
+	.shutdown = sock_no_shutdown,
+	.setsockopt = setsockopt,
+	.getsockopt = sock_no_getsockopt,
+	.sendmsg = caif_stream_sendmsg,	/* differs from the seqpacket ops */
+	.recvmsg = caif_stream_recvmsg,	/* differs from the seqpacket ops */
+	.mmap = sock_no_mmap,
+	.sendpage = sock_no_sendpage,
+};
+
+/* sk_destruct hook (installed by caif_create): final socket teardown. */
+static void caif_sock_destructor(struct sock *sk)
+{
+	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+	caif_assert(!atomic_read(&sk->sk_wmem_alloc));
+	caif_assert(sk_unhashed(sk));
+	caif_assert(!sk->sk_socket);
+	if (!sock_flag(sk, SOCK_DEAD)) {	/* caif_release() sets SOCK_DEAD */
+		pr_debug("Attempt to release alive CAIF socket: %p\n", sk);
+		return;
+	}
+	sk_stream_kill_queues(&cf_sk->sk);
+	dbfs_atomic_dec(&cnt.caif_nr_socks);	/* pairs with caif_create() */
+	caif_free_client(&cf_sk->layer);
+}
+
+static int caif_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
+{
+	int num;
+	struct sock *sk = NULL;
+	struct caifsock *cf_sk = NULL;
+	static struct proto prot = {.name = "PF_CAIF",
+		.owner = THIS_MODULE,
+		.obj_size = sizeof(struct caifsock),
+	};
+
+	if (!capable(CAP_SYS_ADMIN) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+	/*
+	 * The sock->type specifies the socket type to use.
+	 * The CAIF socket is a packet stream in the sense
+	 * that it is packet based. CAIF trusts the reliability
+	 * of the link, no resending is implemented.
+	 */
+	if (sock->type == SOCK_SEQPACKET)
+		sock->ops = &caif_seqpacket_ops;
+	else if (sock->type == SOCK_STREAM)
+		sock->ops = &caif_stream_ops;
+	else
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol < 0 || protocol >= CAIFPROTO_MAX)
+		return -EPROTONOSUPPORT;
+	/*
+	 * Allocate the sock. sock->state is set to SS_UNCONNECTED further
+	 * down; per the original note it is not really used by net/core or
+	 * socket.c, but it must not be left uninitialized.
+	 */
+	sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot);
+	if (!sk)
+		return -ENOMEM;
+
+	cf_sk = container_of(sk, struct caifsock, sk);
+
+	/* Store the protocol */
+	sk->sk_protocol = (unsigned char) protocol;
+
+	/*
+	 * Lock in order to try to stop someone from opening the socket
+	 * too early.
+	 */
+	lock_sock(&(cf_sk->sk));
+
+	/* Initialize the nonzero default sock structure data. */
+	sock_init_data(sock, sk);
+	sk->sk_destruct = caif_sock_destructor;
+
+	mutex_init(&cf_sk->readlock); /* single task reading lock */
+	cf_sk->layer.ctrlcmd = caif_ctrl_cb;
+	cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
+	cf_sk->sk.sk_state = CAIF_DISCONNECTED;
+
+	set_tx_flow_off(cf_sk);
+	set_rx_flow_on(cf_sk);
+
+	/* Set default options on configuration */
+	cf_sk->sk.sk_priority = CAIF_PRIO_NORMAL;
+	cf_sk->conn_req.link_selector = CAIF_LINK_LOW_LATENCY;
+	cf_sk->conn_req.protocol = protocol;
+	/* Increase the number of sockets created. */
+	dbfs_atomic_inc(&cnt.caif_nr_socks);
+	num = dbfs_atomic_inc(&cnt.caif_sock_create);
+#ifdef CONFIG_DEBUG_FS
+	if (!IS_ERR(debugfsdir)) {
+
+		/* Per-socket debugfs directory, named "cfsk<num>". */
+		snprintf(cf_sk->name, sizeof(cf_sk->name), "cfsk%d", num);
+
+		cf_sk->debugfs_socket_dir =
+			debugfs_create_dir(cf_sk->name, debugfsdir);
+
+		debugfs_create_u32("sk_state", S_IRUSR | S_IWUSR,
+				cf_sk->debugfs_socket_dir,
+				(u32 *) &cf_sk->sk.sk_state);
+		debugfs_create_u32("flow_state", S_IRUSR | S_IWUSR,
+				cf_sk->debugfs_socket_dir, &cf_sk->flow_state);
+		debugfs_create_u32("sk_rmem_alloc", S_IRUSR | S_IWUSR,
+				cf_sk->debugfs_socket_dir,
+				(u32 *) &cf_sk->sk.sk_rmem_alloc);
+		debugfs_create_u32("sk_wmem_alloc", S_IRUSR | S_IWUSR,
+				cf_sk->debugfs_socket_dir,
+				(u32 *) &cf_sk->sk.sk_wmem_alloc);
+		debugfs_create_u32("identity", S_IRUSR | S_IWUSR,
+				cf_sk->debugfs_socket_dir,
+				(u32 *) &cf_sk->layer.id);
+	}
+#endif
+	release_sock(&cf_sk->sk);
+	return 0;
+}
+
+
+static struct net_proto_family caif_family_ops = {
+	.family = PF_CAIF,
+	.create = caif_create,	/* handed to sock_register() in af_caif_init() */
+	.owner = THIS_MODULE,
+};
+
+static int af_caif_init(void)
+{
+	/*
+	 * Register the PF_CAIF family and hand any failure to the caller.
+	 */
+	return sock_register(&caif_family_ops);
+}
+
+static int __init caif_sktinit_module(void)
+{
+	int err;
+
+	/* Create the debugfs counters, then register the PF_CAIF family */
+#ifdef CONFIG_DEBUG_FS
+	debugfsdir = debugfs_create_dir("caif_sk", NULL);
+	if (!IS_ERR(debugfsdir)) {
+		debugfs_create_u32("num_sockets", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.caif_nr_socks);
+		debugfs_create_u32("num_create", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.caif_sock_create);
+		debugfs_create_u32("num_connect_req", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.num_connect_req);
+		debugfs_create_u32("num_connect_resp", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.num_connect_resp);
+		debugfs_create_u32("num_connect_fail_resp", S_IRUSR | S_IWUSR,
+				debugfsdir,
+				(u32 *) &cnt.num_connect_fail_resp);
+		debugfs_create_u32("num_disconnect", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.num_disconnect);
+		debugfs_create_u32("num_remote_shutdown_ind",
+				S_IRUSR | S_IWUSR, debugfsdir,
+				(u32 *) &cnt.num_remote_shutdown_ind);
+		debugfs_create_u32("num_tx_flow_off_ind", S_IRUSR | S_IWUSR,
+				debugfsdir,
+				(u32 *) &cnt.num_tx_flow_off_ind);
+		debugfs_create_u32("num_tx_flow_on_ind", S_IRUSR | S_IWUSR,
+				debugfsdir,
+				(u32 *) &cnt.num_tx_flow_on_ind);
+		debugfs_create_u32("num_rx_flow_off", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.num_rx_flow_off);
+		debugfs_create_u32("num_rx_flow_on", S_IRUSR | S_IWUSR,
+				debugfsdir, (u32 *) &cnt.num_rx_flow_on);
+	}
+#endif
+	err = af_caif_init();
+	/* Do not leave the debugfs tree behind if registration failed */
+	if (err && debugfsdir != NULL)
+		debugfs_remove_recursive(debugfsdir);
+	return err;
+}
+
+static void __exit caif_sktexit_module(void)
+{
+	sock_unregister(PF_CAIF);	/* undo sock_register() from init */
+	if (debugfsdir != NULL)
+		debugfs_remove_recursive(debugfsdir);	/* drop "caif_sk" tree */
+}
+module_init(caif_sktinit_module);
+module_exit(caif_sktexit_module);
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
new file mode 100644
index 00000000..bca32d7c
--- /dev/null
+++ b/net/caif/cfcnfg.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfcnfg.h>
+#include <net/caif/cfctrl.h>
+#include <net/caif/cfmuxl.h>
+#include <net/caif/cffrml.h>
+#include <net/caif/cfserl.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/caif_dev.h>
+
+#define container_obj(layr) container_of(layr, struct cfcnfg, layer)
+
+/* Information about CAIF physical interfaces held by Config Module in order
+ * to manage physical interfaces
+ */
+struct cfcnfg_phyinfo {
+	struct list_head node;	/* entry in cfcnfg->phys */
+	bool up;		/* toggled by cfcnfg_set_phy_state() */
+
+	/* Pointer to the layer below the MUX (framing layer) */
+	struct cflayer *frm_layer;
+	/* Pointer to the lowest actual physical layer */
+	struct cflayer *phy_layer;
+	/* Unique identifier of the physical interface */
+	unsigned int id;
+	/* Preference of the physical interface */
+	enum cfcnfg_phy_preference pref;
+
+	/* Information about the physical device */
+	struct dev_info dev_info;
+
+	/* Interface index */
+	int ifindex;
+
+	/* Use Start of frame extension */
+	bool use_stx;
+
+	/* Use frame checksum (the fcs flag given to cffrml_create()) */
+	bool use_fcs;
+};
+
+struct cfcnfg {
+	struct cflayer layer;
+	struct cflayer *ctrl;	/* control layer from cfctrl_create() */
+	struct cflayer *mux;	/* mux layer from cfmuxl_create() */
+	struct list_head phys;	/* list of struct cfcnfg_phyinfo (RCU) */
+	struct mutex lock;	/* held while adding/deleting phy layers */
+};
+
+static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id,
+			     enum cfctrl_srv serv, u8 phyid,
+			     struct cflayer *adapt_layer);
+static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id);
+static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
+			     struct cflayer *adapt_layer);
+static void cfctrl_resp_func(void);
+static void cfctrl_enum_resp(void);
+
+struct cfcnfg *cfcnfg_create(void)
+{
+	struct cfcnfg *this;
+	struct cfctrl_rsp *resp;
+
+	might_sleep();
+
+	/* Allocate the config object; zeroed, so mux/ctrl start as NULL */
+	this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
+	if (!this) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	this->mux = cfmuxl_create();
+	if (!this->mux)
+		goto out_of_mem;
+	this->ctrl = cfctrl_create();
+	if (!this->ctrl)
+		goto out_of_mem;
+	/* Install response callbacks; most are the no-op cfctrl_resp_func */
+	resp = cfctrl_get_respfuncs(this->ctrl);
+	resp->enum_rsp = cfctrl_enum_resp;
+	resp->linkerror_ind = cfctrl_resp_func;
+	resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp;
+	resp->sleep_rsp = cfctrl_resp_func;
+	resp->wake_rsp = cfctrl_resp_func;
+	resp->restart_rsp = cfctrl_resp_func;
+	resp->radioset_rsp = cfctrl_resp_func;
+	resp->linksetup_rsp = cfcnfg_linkup_rsp;
+	resp->reject_rsp = cfcnfg_reject_rsp;
+	INIT_LIST_HEAD(&this->phys);
+
+	cfmuxl_set_uplayer(this->mux, this->ctrl, 0);	/* ctrl on channel 0 */
+	layer_set_dn(this->ctrl, this->mux);
+	layer_set_up(this->ctrl, this);
+	mutex_init(&this->lock);
+
+	return this;
+out_of_mem:
+	pr_warn("Out of memory\n");
+
+	synchronize_rcu();
+
+	kfree(this->mux);	/* either pointer may still be NULL here */
+	kfree(this->ctrl);
+	kfree(this);
+	return NULL;
+}
+
+void cfcnfg_remove(struct cfcnfg *cfg)
+{
+	might_sleep();
+	if (!cfg)
+		return;
+	/* Let RCU readers finish before the layers are freed */
+	synchronize_rcu();
+	kfree(cfg->mux);
+	cfctrl_remove(cfg->ctrl);
+	kfree(cfg);
+}
+
+static void cfctrl_resp_func(void)
+{	/* Intentionally empty: shared no-op handler set in cfcnfg_create() */
+}
+
+static struct cfcnfg_phyinfo *
+cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid)
+{
+	struct cfcnfg_phyinfo *entry;
+
+	list_for_each_entry_rcu(entry, &cnfg->phys, node)
+		if (entry->id == phyid)
+			return entry;	/* first entry carrying this id */
+	return NULL;	/* no phy is registered under phyid */
+}
+
+static void cfctrl_enum_resp(void)
+{	/* Intentionally empty: installed as enum_rsp in cfcnfg_create() */
+}
+
+static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg,
+				  enum cfcnfg_phy_preference phy_pref)
+{
+	struct cfcnfg_phyinfo *cand;
+
+	/* First pass: up, wanted preference, and has a framing layer */
+	list_for_each_entry_rcu(cand, &cnfg->phys, node) {
+		if (!cand->up || cand->pref != phy_pref)
+			continue;
+		if (cand->frm_layer != NULL)
+			return &cand->dev_info;
+	}
+
+	/* Second pass: settle for any interface that is up */
+	list_for_each_entry_rcu(cand, &cnfg->phys, node)
+		if (cand->up)
+			return &cand->dev_info;
+
+	return NULL;
+}
+
+static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi)
+{
+	struct cfcnfg_phyinfo *entry;
+
+	list_for_each_entry_rcu(entry, &cnfg->phys, node)
+		if (entry->up && entry->ifindex == ifi)
+			return entry->id;
+	return -ENODEV;	/* no interface that is up has this ifindex */
+}
+
+int caif_disconnect_client(struct net *net, struct cflayer *adap_layer)
+{
+	u8 channel_id;
+	struct cfcnfg *cfg = get_cfcnfg(net);
+
+	caif_assert(adap_layer != NULL);
+	cfctrl_cancel_req(cfg->ctrl, adap_layer);
+	channel_id = adap_layer->id;
+	if (channel_id != 0) {	/* id 0 means no channel was ever set up */
+		struct cflayer *servl;
+		servl = cfmuxl_remove_uplayer(cfg->mux, channel_id);
+		if (servl != NULL)
+			layer_set_up(servl, NULL);
+	} else
+		pr_debug("nothing to disconnect\n");
+	cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer);
+
+	/* Do RCU sync before initiating cleanup */
+	synchronize_rcu();
+	if (adap_layer->ctrlcmd != NULL)	/* tell the client we are done */
+		adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0);
+	return 0;
+
+}
+EXPORT_SYMBOL(caif_disconnect_client);
+
+static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id)
+{	/* Intentionally empty: link-destroy responses need no action here */
+}
+
+static const int protohead[CFCTRL_SRV_MASK] = {	/* indexed by linktype */
+	[CFCTRL_SRV_VEI] = 4,
+	[CFCTRL_SRV_DATAGRAM] = 7,
+	[CFCTRL_SRV_UTIL] = 4,
+	[CFCTRL_SRV_RFM] = 3,
+	[CFCTRL_SRV_DBG] = 3,
+};	/* used as the base of *proto_head in caif_connect_client() */
+
+
+static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
+				   struct caif_connect_request *s,
+				   struct cfctrl_link_param *l)
+{
+	struct dev_info *dev_info;
+	enum cfcnfg_phy_preference pref;
+	int res;
+
+	memset(l, 0, sizeof(*l));
+	/* In caif protocol low value is high priority */
+	l->priority = CAIF_PRIO_MAX - s->priority + 1;
+
+	if (s->ifindex != 0) {	/* explicit interface requested */
+		res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex);
+		if (res < 0)
+			return res;
+		l->phyid = res;
+	} else {	/* otherwise pick a phy from the link selector */
+		switch (s->link_selector) {
+		case CAIF_LINK_HIGH_BANDW:
+			pref = CFPHYPREF_HIGH_BW;
+			break;
+		case CAIF_LINK_LOW_LATENCY:
+			pref = CFPHYPREF_LOW_LAT;
+			break;
+		default:
+			return -EINVAL;
+		}
+		dev_info = cfcnfg_get_phyid(cnfg, pref);
+		if (dev_info == NULL)
+			return -ENODEV;
+		l->phyid = dev_info->id;
+	}
+	switch (s->protocol) {	/* per-protocol link parameters */
+	case CAIFPROTO_AT:
+		l->linktype = CFCTRL_SRV_VEI;
+		l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3;
+		l->chtype = s->sockaddr.u.at.type & 0x3;
+		break;
+	case CAIFPROTO_DATAGRAM:
+		l->linktype = CFCTRL_SRV_DATAGRAM;
+		l->chtype = 0x00;
+		l->u.datagram.connid = s->sockaddr.u.dgm.connection_id;
+		break;
+	case CAIFPROTO_DATAGRAM_LOOP:
+		l->linktype = CFCTRL_SRV_DATAGRAM;
+		l->chtype = 0x03;
+		l->endpoint = 0x00;
+		l->u.datagram.connid = s->sockaddr.u.dgm.connection_id;
+		break;
+	case CAIFPROTO_RFM:
+		l->linktype = CFCTRL_SRV_RFM;
+		l->u.datagram.connid = s->sockaddr.u.rfm.connection_id;
+		strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
+			sizeof(l->u.rfm.volume)-1);
+		l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0;
+		break;
+	case CAIFPROTO_UTIL:
+		l->linktype = CFCTRL_SRV_UTIL;
+		l->endpoint = 0x00;
+		l->chtype = 0x00;
+		strncpy(l->u.utility.name, s->sockaddr.u.util.service,
+			sizeof(l->u.utility.name)-1);
+		l->u.utility.name[sizeof(l->u.utility.name)-1] = 0;
+		caif_assert(sizeof(l->u.utility.name) > 10);
+		l->u.utility.paramlen = s->param.size;
+		if (l->u.utility.paramlen > sizeof(l->u.utility.params))
+			l->u.utility.paramlen = sizeof(l->u.utility.params);
+
+		memcpy(l->u.utility.params, s->param.data,
+		       l->u.utility.paramlen);
+
+		break;
+	case CAIFPROTO_DEBUG:
+		l->linktype = CFCTRL_SRV_DBG;
+		l->endpoint = s->sockaddr.u.dbg.service;
+		l->chtype = s->sockaddr.u.dbg.type;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int caif_connect_client(struct net *net, struct caif_connect_request *conn_req,
+			struct cflayer *adap_layer, int *ifindex,
+				int *proto_head,
+				int *proto_tail)
+{
+	struct cflayer *frml;
+	struct cfcnfg_phyinfo *phy;
+	int err;
+	struct cfctrl_link_param param;
+	struct cfcnfg *cfg = get_cfcnfg(net);
+
+	rcu_read_lock();	/* protects the phy list lookups below */
+	err = caif_connect_req_to_link_param(cfg, conn_req, &param);
+	if (err)
+		goto unlock;
+
+	phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid);
+	if (!phy) {
+		err = -ENODEV;
+		goto unlock;
+	}
+	err = -EINVAL;
+
+	if (adap_layer == NULL) {
+		pr_err("adap_layer is zero\n");
+		goto unlock;
+	}
+	if (adap_layer->receive == NULL) {
+		pr_err("adap_layer->receive is NULL\n");
+		goto unlock;
+	}
+	if (adap_layer->ctrlcmd == NULL) {
+		pr_err("adap_layer->ctrlcmd == NULL\n");
+		goto unlock;
+	}
+
+	err = -ENODEV;
+	frml = phy->frm_layer;
+	if (frml == NULL) {
+		pr_err("Specified PHY type does not exist!\n");
+		goto unlock;
+	}
+	caif_assert(param.phyid == phy->id);
+	caif_assert(phy->frm_layer->id ==
+		     param.phyid);
+	caif_assert(phy->phy_layer->id ==
+		     param.phyid);
+
+	*ifindex = phy->ifindex;	/* outputs for the caller */
+	*proto_tail = 2;
+	*proto_head =
+
+	protohead[param.linktype] + (phy->use_stx ? 1 : 0);
+
+	rcu_read_unlock();
+
+	/* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */
+	cfctrl_enum_req(cfg->ctrl, param.phyid);
+	return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer);
+
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+EXPORT_SYMBOL(caif_connect_client);
+
+static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id,
+			     struct cflayer *adapt_layer)
+{
+	if (adapt_layer == NULL || adapt_layer->ctrlcmd == NULL)
+		return;	/* nobody to notify about the rejected setup */
+	adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0);
+}
+
+static void
+cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
+		  u8 phyid, struct cflayer *adapt_layer)
+{
+	struct cfcnfg *cnfg = container_obj(layer);
+	struct cflayer *servicel = NULL;
+	struct cfcnfg_phyinfo *phyinfo;
+	struct net_device *netdev;
+
+	if (channel_id == 0) {	/* invalid id: report setup failure */
+		pr_warn("received channel_id zero\n");
+		if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL)
+			adapt_layer->ctrlcmd(adapt_layer,
+						CAIF_CTRLCMD_INIT_FAIL_RSP, 0);
+		return;
+	}
+
+	rcu_read_lock();
+
+	if (adapt_layer == NULL) {
+		pr_debug("link setup response but no client exist, "
+				"send linkdown back\n");
+		cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
+		goto unlock;
+	}
+
+	caif_assert(cnfg != NULL);
+	caif_assert(phyid != 0);
+
+	phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
+	if (phyinfo == NULL) {
+		pr_err("ERROR: Link Layer Device disappeared "
+				"while connecting\n");
+		goto unlock;
+	}
+
+	caif_assert(phyinfo != NULL);
+	caif_assert(phyinfo->id == phyid);
+	caif_assert(phyinfo->phy_layer != NULL);
+	caif_assert(phyinfo->phy_layer->id == phyid);
+
+	adapt_layer->id = channel_id;
+
+	switch (serv) {	/* create the service layer for this channel type */
+	case CFCTRL_SRV_VEI:
+		servicel = cfvei_create(channel_id, &phyinfo->dev_info);
+		break;
+	case CFCTRL_SRV_DATAGRAM:
+		servicel = cfdgml_create(channel_id,
+					&phyinfo->dev_info);
+		break;
+	case CFCTRL_SRV_RFM:
+		netdev = phyinfo->dev_info.dev;
+		servicel = cfrfml_create(channel_id, &phyinfo->dev_info,
+						netdev->mtu);
+		break;
+	case CFCTRL_SRV_UTIL:
+		servicel = cfutill_create(channel_id, &phyinfo->dev_info);
+		break;
+	case CFCTRL_SRV_VIDEO:
+		servicel = cfvidl_create(channel_id, &phyinfo->dev_info);
+		break;
+	case CFCTRL_SRV_DBG:
+		servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
+		break;
+	default:
+		pr_err("Protocol error. Link setup response "
+				"- unknown channel type\n");
+		goto unlock;
+	}
+	if (!servicel) {
+		pr_warn("Out of memory\n");
+		goto unlock;
+	}
+	layer_set_dn(servicel, cnfg->mux);	/* mux <-> service <-> client */
+	cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id);
+	layer_set_up(servicel, adapt_layer);
+	layer_set_dn(adapt_layer, servicel);
+
+	rcu_read_unlock();
+
+	servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0);
+	return;
+unlock:
+	rcu_read_unlock();
+}
+
+void cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
+			  struct net_device *dev, struct cflayer *phy_layer,
+			  enum cfcnfg_phy_preference pref, bool fcs, bool stx)
+{
+	struct cflayer *frml;
+	struct cflayer *phy_driver = NULL;
+	struct cfcnfg_phyinfo *phyinfo = NULL;
+	int i;
+	u8 phyid;
+
+	/* Register phy_layer under a free phy id; a no-op on any failure */
+	mutex_lock(&cnfg->lock);
+	/* CAIF protocol allow maximum 6 link-layers */
+	for (i = 0; i < 7; i++) {
+		phyid = (dev->ifindex + i) & 0x7;
+		if (phyid == 0)
+			continue;
+		if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL)
+			goto got_phyid;
+	}
+	pr_warn("Too many CAIF Link Layers (max 6)\n");
+	goto out;
+
+got_phyid:
+	phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC);
+	if (!phyinfo) {
+		pr_warn("Out of memory\n");
+		goto out;
+	}
+	switch (phy_type) {
+	case CFPHYTYPE_FRAG:
+		phy_driver = cfserl_create(CFPHYTYPE_FRAG, phyid, stx);
+		if (!phy_driver) {
+			pr_warn("Out of memory\n");
+			goto out_free;
+		}
+		break;
+	case CFPHYTYPE_CAIF:
+		break;
+	default:
+		goto out_free;
+	}
+	phy_layer->id = phyid;
+	phyinfo->pref = pref;
+	phyinfo->id = phyid;
+	phyinfo->dev_info.id = phyid;
+	phyinfo->dev_info.dev = dev;
+	phyinfo->phy_layer = phy_layer;
+	phyinfo->ifindex = dev->ifindex;
+	phyinfo->use_stx = stx;
+	phyinfo->use_fcs = fcs;
+
+	frml = cffrml_create(phyid, fcs);
+	if (!frml) {
+		pr_warn("Out of memory\n");
+		goto out_free;
+	}
+	phyinfo->frm_layer = frml;
+	layer_set_up(frml, cnfg->mux);
+	if (phy_driver != NULL) {
+		phy_driver->id = phyid;
+		layer_set_dn(frml, phy_driver);
+		layer_set_up(phy_driver, frml);
+		layer_set_dn(phy_driver, phy_layer);
+		layer_set_up(phy_layer, phy_driver);
+	} else {
+		layer_set_dn(frml, phy_layer);
+		layer_set_up(phy_layer, frml);
+	}
+	list_add_rcu(&phyinfo->node, &cnfg->phys);
+	goto out;
+out_free:
+	/* Nothing was published yet: drop the partial allocations */
+	kfree(phy_driver);
+	kfree(phyinfo);
+out:
+	mutex_unlock(&cnfg->lock);
+}
+EXPORT_SYMBOL(cfcnfg_add_phy_layer);
+
+int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer,
+		bool up)
+{
+	struct cfcnfg_phyinfo *phyinfo;
+
+	rcu_read_lock();
+	phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id);
+	if (phyinfo == NULL) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	if (phyinfo->up == up) {	/* no state change: nothing to do */
+		rcu_read_unlock();
+		return 0;
+	}
+	phyinfo->up = up;
+
+	if (up) {	/* hold the framing layer while it is attached to mux */
+		cffrml_hold(phyinfo->frm_layer);
+		cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer,
+					phy_layer->id);
+	} else {	/* detach from mux, then drop the reference taken above */
+		cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id);
+		cffrml_put(phyinfo->frm_layer);
+	}
+
+	rcu_read_unlock();
+	return 0;
+}
+EXPORT_SYMBOL(cfcnfg_set_phy_state);
+
+int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer)
+{
+	struct cflayer *frml, *frml_dn;
+	u16 phyid;
+	struct cfcnfg_phyinfo *phyinfo;
+
+	might_sleep();
+
+	mutex_lock(&cnfg->lock);
+
+	phyid = phy_layer->id;
+	phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid);
+
+	if (phyinfo == NULL) {	/* unknown phy: treated as already removed */
+		mutex_unlock(&cnfg->lock);
+		return 0;
+	}
+	caif_assert(phyid == phyinfo->id);
+	caif_assert(phy_layer == phyinfo->phy_layer);
+	caif_assert(phy_layer->id == phyid);
+	caif_assert(phyinfo->frm_layer->id == phyid);
+
+	list_del_rcu(&phyinfo->node);	/* unpublish, then wait for readers */
+	synchronize_rcu();
+
+	/* Still referenced: put the entry back and let the caller retry */
+	if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) {
+		pr_info("Wait for device inuse\n");
+		list_add_rcu(&phyinfo->node, &cnfg->phys);
+		mutex_unlock(&cnfg->lock);
+		return -EAGAIN;
+	}
+
+	frml = phyinfo->frm_layer;
+	frml_dn = frml->dn;
+	cffrml_set_uplayer(frml, NULL);
+	cffrml_set_dnlayer(frml, NULL);
+	if (phy_layer != frml_dn) {	/* a driver layer sits in between */
+		layer_set_up(frml_dn, NULL);
+		layer_set_dn(frml_dn, NULL);
+	}
+	layer_set_up(phy_layer, NULL);
+
+	if (phyinfo->phy_layer != frml_dn)
+		kfree(frml_dn);	/* intermediate layer; phy_layer is not freed */
+
+	cffrml_free(frml);
+	kfree(phyinfo);
+	mutex_unlock(&cnfg->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(cfcnfg_del_phy_layer);
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
new file mode 100644
index 00000000..e22671be
--- /dev/null
+++ b/net/caif/cfctrl.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfctrl.h>
+
+#define container_obj(layr) container_of(layr, struct cfctrl, serv.layer)
+#define UTILITY_NAME_LENGTH 16
+#define CFPKT_CTRL_PKT_LEN 20
+
+/*
+ * handle_loop() is called by cfctrl_recv() for frames that carry neither
+ * the response nor the error bit. With CAIF_NO_LOOP defined it is a stub
+ * that always fails; otherwise the implementation is further down.
+ */
+#ifdef CAIF_NO_LOOP
+static int handle_loop(struct cfctrl *ctrl,
+			      int cmd, struct cfpkt *pkt){
+	return -1;
+}
+#else
+static int handle_loop(struct cfctrl *ctrl,
+		int cmd, struct cfpkt *pkt);
+#endif
+static int cfctrl_recv(struct cflayer *layr, struct cfpkt *pkt);
+static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+			   int phyid);
+
+
+/*
+ * Allocate and initialise a CAIF control layer.
+ *
+ * The layer starts with channel id 0, device id 0xff, both sequence
+ * counters at 1 and an empty pending-request list.
+ *
+ * Returns the embedded struct cflayer, or NULL when allocation fails.
+ */
+struct cflayer *cfctrl_create(void)
+{
+	struct dev_info dev_info;
+	struct cfctrl *ctrl;
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_ATOMIC);
+	if (ctrl == NULL) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	dev_info.id = 0xff;
+	cfsrvl_init(&ctrl->serv, 0, &dev_info, false);
+
+	atomic_set(&ctrl->req_seq_no, 1);
+	atomic_set(&ctrl->rsp_seq_no, 1);
+
+	ctrl->serv.layer.receive = cfctrl_recv;
+	sprintf(ctrl->serv.layer.name, "ctrl");
+	ctrl->serv.layer.ctrlcmd = cfctrl_ctrlcmd;
+#ifndef CAIF_NO_LOOP
+	spin_lock_init(&ctrl->loop_linkid_lock);
+	ctrl->loop_linkid = 1;
+#endif
+	spin_lock_init(&ctrl->info_list_lock);
+	INIT_LIST_HEAD(&ctrl->list);
+
+	return &ctrl->serv.layer;
+}
+
+/*
+ * Destroy a control layer: free every pending request while holding
+ * info_list_lock, then free the layer itself.
+ */
+void cfctrl_remove(struct cflayer *layer)
+{
+	struct cfctrl *ctrl = container_obj(layer);
+	struct cfctrl_request_info *req;
+
+	spin_lock_bh(&ctrl->info_list_lock);
+	while (!list_empty(&ctrl->list)) {
+		req = list_first_entry(&ctrl->list,
+				       struct cfctrl_request_info, list);
+		list_del(&req->list);
+		kfree(req);
+	}
+	spin_unlock_bh(&ctrl->info_list_lock);
+
+	kfree(layer);
+}
+
+/*
+ * Compare two link-setup parameter sets.
+ *
+ * The common header fields must all match; after that the comparison
+ * depends on the link type. DECM and unknown link types never compare
+ * equal.
+ */
+static bool param_eq(const struct cfctrl_link_param *p1,
+			const struct cfctrl_link_param *p2)
+{
+	if (p1->linktype != p2->linktype ||
+	    p1->priority != p2->priority ||
+	    p1->phyid != p2->phyid ||
+	    p1->endpoint != p2->endpoint ||
+	    p1->chtype != p2->chtype)
+		return false;
+
+	switch (p1->linktype) {
+	case CFCTRL_SRV_VEI:
+	case CFCTRL_SRV_DBG:
+		/* No type-specific parameters to compare */
+		return true;
+	case CFCTRL_SRV_DATAGRAM:
+		return p1->u.datagram.connid == p2->u.datagram.connid;
+	case CFCTRL_SRV_VIDEO:
+		return p1->u.video.connid == p2->u.video.connid;
+	case CFCTRL_SRV_RFM:
+		if (p1->u.rfm.connid != p2->u.rfm.connid)
+			return false;
+		return strcmp(p1->u.rfm.volume, p2->u.rfm.volume) == 0;
+	case CFCTRL_SRV_UTIL:
+		if (p1->u.utility.fifosize_kb != p2->u.utility.fifosize_kb)
+			return false;
+		if (p1->u.utility.fifosize_bufs !=
+		    p2->u.utility.fifosize_bufs)
+			return false;
+		if (strcmp(p1->u.utility.name, p2->u.utility.name) != 0)
+			return false;
+		if (p1->u.utility.paramlen != p2->u.utility.paramlen)
+			return false;
+		return memcmp(p1->u.utility.params, p2->u.utility.params,
+			      p1->u.utility.paramlen) == 0;
+	case CFCTRL_SRV_DECM:
+	default:
+		return false;
+	}
+}
+
+/*
+ * Two requests match when their commands are equal and, for link setup,
+ * their parameters are equal; for any other command the channel ids
+ * must be equal.
+ */
+static bool cfctrl_req_eq(const struct cfctrl_request_info *r1,
+			  const struct cfctrl_request_info *r2)
+{
+	if (r1->cmd != r2->cmd)
+		return false;
+
+	return r1->cmd == CFCTRL_CMD_LINK_SETUP ?
+		param_eq(&r1->param, &r2->param) :
+		r1->channel_id == r2->channel_id;
+}
+
+/*
+ * Queue a request at the tail of the pending list and stamp it with the
+ * next request sequence number, all under info_list_lock.
+ */
+static void cfctrl_insert_req(struct cfctrl *ctrl,
+			      struct cfctrl_request_info *req)
+{
+	spin_lock_bh(&ctrl->info_list_lock);
+
+	list_add_tail(&req->list, &ctrl->list);
+	atomic_inc(&ctrl->req_seq_no);
+	req->sequence_no = atomic_read(&ctrl->req_seq_no);
+
+	spin_unlock_bh(&ctrl->info_list_lock);
+}
+
+/*
+ * Find the pending request matching @req, unlink it and return it.
+ *
+ * Records the matched sequence number in rsp_seq_no and warns when the
+ * match is not the oldest pending request. Returns NULL when nothing
+ * matches. This function takes no lock itself; cfctrl_recv() calls it
+ * with info_list_lock held.
+ */
+static struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
+						struct cfctrl_request_info *req)
+{
+	struct cfctrl_request_info *cur, *next, *oldest;
+
+	oldest = list_first_entry(&ctrl->list, struct cfctrl_request_info,
+				  list);
+
+	list_for_each_entry_safe(cur, next, &ctrl->list, list) {
+		if (!cfctrl_req_eq(req, cur))
+			continue;
+
+		if (cur != oldest)
+			pr_warn("Requests are not received in order\n");
+
+		atomic_set(&ctrl->rsp_seq_no, cur->sequence_no);
+		list_del(&cur->list);
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Return the response callback table embedded in the control layer. */
+struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer)
+{
+	return &container_obj(layer)->res;
+}
+
+/*
+ * Fill in the payload info of a control packet: the control layer's
+ * channel id and device info, with a header length of zero.
+ */
+static void init_info(struct caif_payload_info *info, struct cfctrl *cfctrl)
+{
+	info->dev_info = &cfctrl->serv.dev_info;
+	info->channel_id = cfctrl->serv.layer.id;
+	info->hdr_len = 0;
+}
+
+/*
+ * Send an enumeration request for physical link @physlinkid.
+ *
+ * The control layer's device id is set to @physlinkid so the request
+ * goes out on that link. Nothing is sent when there is no layer below
+ * or the packet cannot be allocated.
+ */
+void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
+{
+	struct cfctrl *cfctrl = container_obj(layer);
+	struct cflayer *dn = cfctrl->serv.layer.dn;
+	struct cfpkt *pkt;
+
+	/* Check for a lower layer first so no packet is leaked without one */
+	if (!dn) {
+		pr_debug("not able to send enum request\n");
+		return;
+	}
+	pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+	if (!pkt) {
+		pr_warn("Out of memory\n");
+		return;
+	}
+	caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
+	init_info(cfpkt_info(pkt), cfctrl);
+	cfpkt_info(pkt)->dev_info->id = physlinkid;
+	cfctrl->serv.dev_info.id = physlinkid;
+	cfpkt_addbdy(pkt, CFCTRL_CMD_ENUM);
+	cfpkt_addbdy(pkt, physlinkid);
+	dn->transmit(dn, pkt);
+}
+
+/*
+ * Build and send a link-setup request described by @param on behalf of
+ * @user_layer, and queue it so the response can be matched later.
+ *
+ * Returns 0 on success, -ENODEV when there is no lower layer or the
+ * transmit fails, -EALREADY when @user_layer already had a request
+ * pending (that request is cancelled), -EINVAL for an unknown link type
+ * and -ENOMEM on allocation failure.
+ */
+int cfctrl_linkup_request(struct cflayer *layer,
+			   struct cfctrl_link_param *param,
+			   struct cflayer *user_layer)
+{
+	struct cfctrl *cfctrl = container_obj(layer);
+	u32 tmp32;
+	u16 tmp16;
+	u8 tmp8;
+	struct cfctrl_request_info *req;
+	int ret;
+	char utility_name[UTILITY_NAME_LENGTH];
+	struct cfpkt *pkt;
+	struct cflayer *dn = cfctrl->serv.layer.dn;
+
+	if (!dn) {
+		pr_debug("not able to send linkup request\n");
+		return -ENODEV;
+	}
+
+	if (cfctrl_cancel_req(layer, user_layer) > 0) {
+		/* Slight Paranoia, check if already connecting */
+		pr_err("Duplicate connect request for same client\n");
+		WARN_ON(1);
+		return -EALREADY;
+	}
+
+	pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+	if (!pkt) {
+		pr_warn("Out of memory\n");
+		return -ENOMEM;
+	}
+	cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
+	cfpkt_addbdy(pkt, (param->chtype << 4) | param->linktype);
+	cfpkt_addbdy(pkt, (param->priority << 3) | param->phyid);
+	cfpkt_addbdy(pkt, param->endpoint & 0x03);
+
+	switch (param->linktype) {
+	case CFCTRL_SRV_VEI:
+		break;
+	case CFCTRL_SRV_VIDEO:
+		cfpkt_addbdy(pkt, (u8) param->u.video.connid);
+		break;
+	case CFCTRL_SRV_DBG:
+		break;
+	case CFCTRL_SRV_DATAGRAM:
+		tmp32 = cpu_to_le32(param->u.datagram.connid);
+		cfpkt_add_body(pkt, &tmp32, 4);
+		break;
+	case CFCTRL_SRV_RFM:
+		/* Construct a frame, convert DatagramConnectionID to network
+		 * format long and copy it out...
+		 */
+		tmp32 = cpu_to_le32(param->u.rfm.connid);
+		cfpkt_add_body(pkt, &tmp32, 4);
+		/* Add volume name, including zero termination... */
+		cfpkt_add_body(pkt, param->u.rfm.volume,
+			       strlen(param->u.rfm.volume) + 1);
+		break;
+	case CFCTRL_SRV_UTIL:
+		tmp16 = cpu_to_le16(param->u.utility.fifosize_kb);
+		cfpkt_add_body(pkt, &tmp16, 2);
+		tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
+		cfpkt_add_body(pkt, &tmp16, 2);
+		/* Fixed-width, zero-padded and always terminated name field */
+		memset(utility_name, 0, sizeof(utility_name));
+		strncpy(utility_name, param->u.utility.name,
+			UTILITY_NAME_LENGTH - 1);
+		cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
+		tmp8 = param->u.utility.paramlen;
+		cfpkt_add_body(pkt, &tmp8, 1);
+		cfpkt_add_body(pkt, param->u.utility.params,
+			       param->u.utility.paramlen);
+		break;
+	default:
+		pr_warn("Request setup of bad link type = %d\n",
+			param->linktype);
+		/* The packet was never handed to a lower layer; free it */
+		cfpkt_destroy(pkt);
+		return -EINVAL;
+	}
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req) {
+		pr_warn("Out of memory\n");
+		/* The packet was never handed to a lower layer; free it */
+		cfpkt_destroy(pkt);
+		return -ENOMEM;
+	}
+	req->client_layer = user_layer;
+	req->cmd = CFCTRL_CMD_LINK_SETUP;
+	req->param = *param;
+	cfctrl_insert_req(cfctrl, req);
+	init_info(cfpkt_info(pkt), cfctrl);
+	/*
+	 * NOTE:Always send linkup and linkdown request on the same
+	 *	device as the payload. Otherwise old queued up payload
+	 *	might arrive with the newly allocated channel ID.
+	 */
+	cfpkt_info(pkt)->dev_info->id = param->phyid;
+	ret =
+	    dn->transmit(dn, pkt);
+	if (ret < 0) {
+		int count;
+
+		/* Drop the request queued above; no response will arrive */
+		count = cfctrl_cancel_req(&cfctrl->serv.layer,
+						user_layer);
+		if (count != 1)
+			pr_err("Could not remove request (%d)", count);
+
+		/*
+		 * A failed transmit is always reported to the caller. The
+		 * original code did the same, but hid it behind a
+		 * misleadingly indented return after an unbraced if.
+		 */
+		return -ENODEV;
+	}
+	return 0;
+}
+
+/*
+ * Send a link-destroy request for @channelid.
+ *
+ * Returns the lower layer's transmit result, -ENODEV when there is no
+ * lower layer, or -ENOMEM when the packet cannot be allocated. In
+ * loopback builds the channel's loop_linkused slot is cleared after the
+ * transmit.
+ */
+int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
+				struct cflayer *client)
+{
+	int ret;
+	struct cfctrl *cfctrl = container_obj(layer);
+	struct cflayer *dn = cfctrl->serv.layer.dn;
+	struct cfpkt *pkt;
+
+	/* Check for a lower layer first so no packet is leaked without one */
+	if (!dn) {
+		pr_debug("not able to send link-down request\n");
+		return -ENODEV;
+	}
+
+	pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
+	if (!pkt) {
+		pr_warn("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
+	cfpkt_addbdy(pkt, channelid);
+	init_info(cfpkt_info(pkt), cfctrl);
+	ret =
+	    dn->transmit(dn, pkt);
+#ifndef CAIF_NO_LOOP
+	cfctrl->loop_linkused[channelid] = 0;
+#endif
+	return ret;
+}
+
+/*
+ * Drop every pending request issued on behalf of @adap_layer.
+ *
+ * Returns the number of requests that were removed and freed.
+ */
+int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
+{
+	struct cfctrl *ctrl = container_obj(layr);
+	struct cfctrl_request_info *cur, *next;
+	int removed = 0;
+
+	spin_lock_bh(&ctrl->info_list_lock);
+	list_for_each_entry_safe(cur, next, &ctrl->list, list) {
+		if (cur->client_layer != adap_layer)
+			continue;
+
+		list_del(&cur->list);
+		kfree(cur);
+		removed++;
+	}
+	spin_unlock_bh(&ctrl->info_list_lock);
+
+	return removed;
+}
+
+/*
+ * Receive handler of the control layer.
+ *
+ * Parses one control frame, matches link-setup responses against the
+ * pending request list and dispatches to the callbacks in cfctrl->res.
+ * Frames with neither the response nor the error bit set are first run
+ * through handle_loop(). The packet is always destroyed before
+ * returning.
+ *
+ * Returns 0 when the frame was handled, -1 for an unrecognised command
+ * or an invalid link type.
+ */
+static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
+{
+	u8 cmdrsp;
+	u8 cmd;
+	int ret = -1;
+	u16 tmp16;
+	u8 len;
+	u8 param[255];
+	/* Initialised so a failed extraction never hands out stack garbage */
+	u8 linkid = 0;
+	struct cfctrl *cfctrl = container_obj(layer);
+	struct cfctrl_request_info rsp, *req;
+
+
+	cfpkt_extr_head(pkt, &cmdrsp, 1);
+	cmd = cmdrsp & CFCTRL_CMD_MASK;
+	if (cmd != CFCTRL_CMD_LINK_ERR
+	    && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp)
+		&& CFCTRL_ERR_BIT != (CFCTRL_ERR_BIT & cmdrsp)) {
+		if (handle_loop(cfctrl, cmd, pkt) != 0)
+			cmdrsp |= CFCTRL_ERR_BIT;
+	}
+
+	switch (cmd) {
+	case CFCTRL_CMD_LINK_SETUP:
+		{
+			enum cfctrl_srv serv;
+			enum cfctrl_srv servtype;
+			u8 endpoint;
+			u8 physlinkid;
+			u8 prio;
+			u8 tmp;
+			u32 tmp32;
+			u8 *cp;
+			u8 *vol_end;
+			int i;
+			struct cfctrl_link_param linkparam;
+			memset(&linkparam, 0, sizeof(linkparam));
+
+			cfpkt_extr_head(pkt, &tmp, 1);
+
+			serv = tmp & CFCTRL_SRV_MASK;
+			linkparam.linktype = serv;
+
+			servtype = tmp >> 4;
+			linkparam.chtype = servtype;
+
+			cfpkt_extr_head(pkt, &tmp, 1);
+			physlinkid = tmp & 0x07;
+			prio = tmp >> 3;
+
+			linkparam.priority = prio;
+			linkparam.phyid = physlinkid;
+			cfpkt_extr_head(pkt, &endpoint, 1);
+			linkparam.endpoint = endpoint & 0x03;
+
+			switch (serv) {
+			case CFCTRL_SRV_VEI:
+			case CFCTRL_SRV_DBG:
+				if (CFCTRL_ERR_BIT & cmdrsp)
+					break;
+				/* Link ID */
+				cfpkt_extr_head(pkt, &linkid, 1);
+				break;
+			case CFCTRL_SRV_VIDEO:
+				cfpkt_extr_head(pkt, &tmp, 1);
+				linkparam.u.video.connid = tmp;
+				if (CFCTRL_ERR_BIT & cmdrsp)
+					break;
+				/* Link ID */
+				cfpkt_extr_head(pkt, &linkid, 1);
+				break;
+
+			case CFCTRL_SRV_DATAGRAM:
+				cfpkt_extr_head(pkt, &tmp32, 4);
+				linkparam.u.datagram.connid =
+				    le32_to_cpu(tmp32);
+				if (CFCTRL_ERR_BIT & cmdrsp)
+					break;
+				/* Link ID */
+				cfpkt_extr_head(pkt, &linkid, 1);
+				break;
+			case CFCTRL_SRV_RFM:
+				/* Construct a frame, convert
+				 * DatagramConnectionID
+				 * to network format long and copy it out...
+				 */
+				cfpkt_extr_head(pkt, &tmp32, 4);
+				linkparam.u.rfm.connid =
+				  le32_to_cpu(tmp32);
+				cp = (u8 *) linkparam.u.rfm.volume;
+				/*
+				 * The name comes from the peer: consume it up
+				 * to its terminator but never store past the
+				 * end of the volume buffer.
+				 */
+				vol_end = cp +
+				    sizeof(linkparam.u.rfm.volume) - 1;
+				for (cfpkt_extr_head(pkt, &tmp, 1);
+				     cfpkt_more(pkt) && tmp != '\0';
+				     cfpkt_extr_head(pkt, &tmp, 1))
+					if (cp < vol_end)
+						*cp++ = tmp;
+				*cp = '\0';
+
+				if (CFCTRL_ERR_BIT & cmdrsp)
+					break;
+				/* Link ID */
+				cfpkt_extr_head(pkt, &linkid, 1);
+
+				break;
+			case CFCTRL_SRV_UTIL:
+				/* Construct a frame, convert
+				 * DatagramConnectionID
+				 * to network format long and copy it out...
+				 */
+				/* Fifosize KB */
+				cfpkt_extr_head(pkt, &tmp16, 2);
+				linkparam.u.utility.fifosize_kb =
+				    le16_to_cpu(tmp16);
+				/* Fifosize bufs */
+				cfpkt_extr_head(pkt, &tmp16, 2);
+				linkparam.u.utility.fifosize_bufs =
+				    le16_to_cpu(tmp16);
+				/* name */
+				cp = (u8 *) linkparam.u.utility.name;
+				caif_assert(sizeof(linkparam.u.utility.name)
+					     >= UTILITY_NAME_LENGTH);
+				for (i = 0;
+				     i < UTILITY_NAME_LENGTH
+				     && cfpkt_more(pkt); i++) {
+					cfpkt_extr_head(pkt, &tmp, 1);
+					*cp++ = tmp;
+				}
+				/* Length */
+				cfpkt_extr_head(pkt, &len, 1);
+				linkparam.u.utility.paramlen = len;
+				/* Param Data */
+				cp = linkparam.u.utility.params;
+				while (cfpkt_more(pkt) && len--) {
+					cfpkt_extr_head(pkt, &tmp, 1);
+					*cp++ = tmp;
+				}
+				if (CFCTRL_ERR_BIT & cmdrsp)
+					break;
+				/* Link ID */
+				cfpkt_extr_head(pkt, &linkid, 1);
+				/* Length */
+				cfpkt_extr_head(pkt, &len, 1);
+				/* Param Data */
+				cfpkt_extr_head(pkt, &param, len);
+				break;
+			default:
+				pr_warn("Request setup, invalid type (%d)\n",
+					serv);
+				goto error;
+			}
+
+			/* Match the parsed response against a pending request */
+			rsp.cmd = cmd;
+			rsp.param = linkparam;
+			spin_lock_bh(&cfctrl->info_list_lock);
+			req = cfctrl_remove_req(cfctrl, &rsp);
+
+			if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
+				cfpkt_erroneous(pkt)) {
+				pr_err("Invalid O/E bit or parse error "
+						"on CAIF control channel\n");
+				cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
+						       0,
+						       req ? req->client_layer
+						       : NULL);
+			} else {
+				cfctrl->res.linksetup_rsp(cfctrl->serv.
+							  layer.up, linkid,
+							  serv, physlinkid,
+							  req ? req->
+							  client_layer : NULL);
+			}
+
+			if (req != NULL)
+				kfree(req);
+
+			spin_unlock_bh(&cfctrl->info_list_lock);
+		}
+		break;
+	case CFCTRL_CMD_LINK_DESTROY:
+		cfpkt_extr_head(pkt, &linkid, 1);
+		cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
+		break;
+	case CFCTRL_CMD_LINK_ERR:
+		pr_err("Frame Error Indication received\n");
+		cfctrl->res.linkerror_ind();
+		break;
+	case CFCTRL_CMD_ENUM:
+		cfctrl->res.enum_rsp();
+		break;
+	case CFCTRL_CMD_SLEEP:
+		cfctrl->res.sleep_rsp();
+		break;
+	case CFCTRL_CMD_WAKE:
+		cfctrl->res.wake_rsp();
+		break;
+	case CFCTRL_CMD_LINK_RECONF:
+		cfctrl->res.restart_rsp();
+		break;
+	case CFCTRL_CMD_RADIO_SET:
+		cfctrl->res.radioset_rsp();
+		break;
+	default:
+		pr_err("Unrecognized Control Frame\n");
+		goto error;
+		break;
+	}
+	ret = 0;
+error:
+	cfpkt_destroy(pkt);
+	return ret;
+}
+
+/*
+ * Control-command handler of the control layer.
+ *
+ * Flow-off indications only produce a debug message when requests are
+ * pending. On a phy-down indication every pending request for @phyid is
+ * removed and its client is told that initialisation failed. All other
+ * commands are ignored.
+ */
+static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+			int phyid)
+{
+	struct cfctrl *cfctrl = container_obj(layr);
+	struct cfctrl_request_info *cur, *next;
+
+	switch (ctrl) {
+	case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
+	case CAIF_CTRLCMD_FLOW_OFF_IND:
+		spin_lock_bh(&cfctrl->info_list_lock);
+		if (!list_empty(&cfctrl->list))
+			pr_debug("Received flow off in control layer\n");
+		spin_unlock_bh(&cfctrl->info_list_lock);
+		break;
+	case _CAIF_CTRLCMD_PHYIF_DOWN_IND:
+		/* Find all connect request and report failure */
+		spin_lock_bh(&cfctrl->info_list_lock);
+		list_for_each_entry_safe(cur, next, &cfctrl->list, list) {
+			if (cur->param.phyid != phyid)
+				continue;
+
+			list_del(&cur->list);
+			cur->client_layer->ctrlcmd(cur->client_layer,
+					CAIF_CTRLCMD_INIT_FAIL_RSP,
+					phyid);
+			kfree(cur);
+		}
+		spin_unlock_bh(&cfctrl->info_list_lock);
+		break;
+	default:
+		break;
+	}
+}
+
+#ifndef CAIF_NO_LOOP
+/*
+ * Loopback handling of control frames.
+ *
+ * For a link setup a free link id is picked and appended to the packet
+ * as a trailer; for a link destroy the id at the head of the packet is
+ * marked free again. Other commands are left untouched.
+ *
+ * Returns 0 on success and -1 when no free link id is available.
+ */
+static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
+{
+	/* Search state is function-static, so it is shared by all callers */
+	static int last_linkid;
+	static int dec;
+	u8 linkid, linktype, tmp;
+	switch (cmd) {
+	case CFCTRL_CMD_LINK_SETUP:
+		spin_lock_bh(&ctrl->loop_linkid_lock);
+		/* Search upwards from the last id until that runs dry ... */
+		if (!dec) {
+			for (linkid = last_linkid + 1; linkid < 254; linkid++)
+				if (!ctrl->loop_linkused[linkid])
+					goto found;
+		}
+		/* ... then switch to searching downwards */
+		dec = 1;
+		for (linkid = last_linkid - 1; linkid > 1; linkid--)
+			if (!ctrl->loop_linkused[linkid])
+				goto found;
+		spin_unlock_bh(&ctrl->loop_linkid_lock);
+		return -1;
+found:
+		/* Resume the upward search once ids below 10 are reached */
+		if (linkid < 10)
+			dec = 0;
+
+		if (!ctrl->loop_linkused[linkid])
+			ctrl->loop_linkused[linkid] = 1;
+
+		last_linkid = linkid;
+
+		cfpkt_add_trail(pkt, &linkid, 1);
+		spin_unlock_bh(&ctrl->loop_linkid_lock);
+		/*
+		 * NOTE(review): the peeked byte is compared unmasked, while
+		 * cfctrl_linkup_request() packs (chtype << 4) | linktype into
+		 * this position - confirm whether masking with
+		 * CFCTRL_SRV_MASK is intended.
+		 */
+		cfpkt_peek_head(pkt, &linktype, 1);
+		if (linktype ==  CFCTRL_SRV_UTIL) {
+			/* Utility links get two extra 0x01 trailer bytes */
+			tmp = 0x01;
+			cfpkt_add_trail(pkt, &tmp, 1);
+			cfpkt_add_trail(pkt, &tmp, 1);
+		}
+		break;
+
+	case CFCTRL_CMD_LINK_DESTROY:
+		spin_lock_bh(&ctrl->loop_linkid_lock);
+		cfpkt_peek_head(pkt, &linkid, 1);
+		ctrl->loop_linkused[linkid] = 0;
+		spin_unlock_bh(&ctrl->loop_linkid_lock);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+#endif
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
new file mode 100644
index 00000000..11a2af4c
--- /dev/null
+++ b/net/caif/cfdbgl.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+/*
+ * Create a debug service layer for @channel_id on the device described
+ * by @dev_info. Returns the embedded layer, or NULL on allocation
+ * failure.
+ */
+struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
+{
+	struct cfsrvl *srv;
+
+	srv = kzalloc(sizeof(*srv), GFP_ATOMIC);
+	if (srv == NULL) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	caif_assert(offsetof(struct cfsrvl, layer) == 0);
+
+	cfsrvl_init(srv, channel_id, dev_info, false);
+	srv->layer.transmit = cfdbgl_transmit;
+	srv->layer.receive = cfdbgl_receive;
+	snprintf(srv->layer.name, CAIF_LAYER_NAME_SZ - 1, "dbg%d", channel_id);
+
+	return &srv->layer;
+}
+
+/* Pass a received packet unchanged to the layer above. */
+static int cfdbgl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cflayer *up = layr->up;
+
+	return up->receive(up, pkt);
+}
+
+/*
+ * Transmit on the debug channel: when the service is ready, tag the
+ * packet with this channel's routing info and pass it down. Otherwise
+ * return the error code reported by cfsrvl_ready().
+ */
+static int cfdbgl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfsrvl *srv = container_obj(layr);
+	struct caif_payload_info *info;
+	int err;
+
+	if (!cfsrvl_ready(srv, &err))
+		return err;
+
+	/* Add info for MUX-layer to route the packet out */
+	info = cfpkt_info(pkt);
+	info->dev_info = &srv->dev_info;
+	info->channel_id = srv->layer.id;
+
+	return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
new file mode 100644
index 00000000..0382dec8
--- /dev/null
+++ b/net/caif/cfdgml.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+#define DGM_CMD_BIT  0x80
+#define DGM_FLOW_OFF 0x81
+#define DGM_FLOW_ON  0x80
+#define DGM_MTU 1500
+
+static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+/*
+ * Create a datagram service layer for @channel_id on the device
+ * described by @dev_info. Returns the embedded layer, or NULL on
+ * allocation failure.
+ */
+struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
+{
+	struct cfsrvl *srv;
+
+	srv = kzalloc(sizeof(*srv), GFP_ATOMIC);
+	if (srv == NULL) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	caif_assert(offsetof(struct cfsrvl, layer) == 0);
+
+	cfsrvl_init(srv, channel_id, dev_info, true);
+	srv->layer.transmit = cfdgml_transmit;
+	srv->layer.receive = cfdgml_receive;
+	snprintf(srv->layer.name, CAIF_LAYER_NAME_SZ - 1, "dgm%d", channel_id);
+	srv->layer.name[CAIF_LAYER_NAME_SZ - 1] = '\0';
+
+	return &srv->layer;
+}
+
+/*
+ * Receive handler of the datagram layer.
+ *
+ * Data frames (command bit clear) have their 3-byte datagram header
+ * stripped and are passed up. Flow-on/off commands become ctrlcmd
+ * indications and the packet is consumed.
+ *
+ * Returns the upper layer's result for data, 0 for flow commands and
+ * -EPROTO for short packets or unknown commands.
+ */
+static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	u8 hdr[3];
+	u8 cmd = -1;
+
+	caif_assert(layr->up != NULL);
+	caif_assert(layr->receive != NULL);
+	caif_assert(layr->ctrlcmd != NULL);
+
+	if (cfpkt_extr_head(pkt, &cmd, 1) < 0)
+		goto bad_pkt;
+
+	if (!(cmd & DGM_CMD_BIT)) {
+		if (cfpkt_extr_head(pkt, hdr, 3) < 0)
+			goto bad_pkt;
+		return layr->up->receive(layr->up, pkt);
+	}
+
+	switch (cmd) {
+	case DGM_FLOW_OFF:	/* FLOW OFF */
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+		break;
+	case DGM_FLOW_ON:	/* FLOW ON */
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+		break;
+	default:
+		cfpkt_destroy(pkt);
+		pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
+		return -EPROTO;
+	}
+
+	/* Flow commands carry no payload for the upper layer */
+	cfpkt_destroy(pkt);
+	return 0;
+
+bad_pkt:
+	pr_err("Packet is erroneous!\n");
+	cfpkt_destroy(pkt);
+	return -EPROTO;
+}
+
+/*
+ * Transmit a datagram: prepend the 4-byte datagram header (type byte
+ * 0x08 followed by three zero bytes), fill in the routing info and pass
+ * the packet down.
+ *
+ * Returns the error from cfsrvl_ready() when the service is not ready,
+ * -EMSGSIZE when the payload exceeds DGM_MTU, otherwise the lower
+ * layer's transmit result.
+ */
+static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfsrvl *srv = container_obj(layr);
+	struct caif_payload_info *info;
+	u32 zero = 0;
+	u8 packet_type = 0x08; /* B9 set - UNCLASSIFIED */
+	int err;
+
+	if (!cfsrvl_ready(srv, &err))
+		return err;
+
+	/* STE Modem cannot handle more than 1500 bytes datagrams */
+	if (cfpkt_getlen(pkt) > DGM_MTU)
+		return -EMSGSIZE;
+
+	cfpkt_add_head(pkt, &zero, 3);
+	cfpkt_add_head(pkt, &packet_type, 1);
+
+	/* Add info for MUX-layer to route the packet out. */
+	info = cfpkt_info(pkt);
+	info->channel_id = srv->layer.id;
+	info->dev_info = &srv->dev_info;
+	/* To optimize alignment, we add up the size of CAIF header
+	 * before payload.
+	 */
+	info->hdr_len = 4;
+
+	return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
new file mode 100644
index 00000000..04204b20
--- /dev/null
+++ b/net/caif/cffrml.c
@@ -0,0 +1,199 @@
+/*
+ * CAIF Framing Layer.
+ *
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/crc-ccitt.h>
+#include <linux/netdevice.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cffrml.h>
+
+#define container_obj(layr) container_of(layr, struct cffrml, layer)
+
+/*
+ * Framing layer instance. The generic layer must stay the first member:
+ * cffrml_create() asserts that its offset is zero and returns the
+ * object cast to struct cflayer *.
+ */
+struct cffrml {
+	struct cflayer layer;
+	bool dofcs;		/* !< FCS active */
+	/* Per-CPU use count; summed over all CPUs by cffrml_refcnt_read() */
+	int __percpu		*pcpu_refcnt;
+};
+
+static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+				int phyid);
+
+static u32 cffrml_rcv_error;
+static u32 cffrml_rcv_checsum_error;
+/*
+ * Create a framing layer for physical interface @phyid. @use_fcs
+ * selects whether a frame checksum is generated and verified.
+ *
+ * Returns the new layer, or NULL when either allocation fails.
+ */
+struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
+{
+	struct cffrml *frml;
+
+	frml = kmalloc(sizeof(*frml), GFP_ATOMIC);
+	if (frml == NULL) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+
+	frml->pcpu_refcnt = alloc_percpu(int);
+	if (!frml->pcpu_refcnt) {
+		kfree(frml);
+		return NULL;
+	}
+
+	caif_assert(offsetof(struct cffrml, layer) == 0);
+
+	/* Only the embedded layer is cleared; pcpu_refcnt must survive */
+	memset(&frml->layer, 0, sizeof(frml->layer));
+	frml->dofcs = use_fcs;
+	frml->layer.id = phyid;
+	frml->layer.transmit = cffrml_transmit;
+	frml->layer.receive = cffrml_receive;
+	frml->layer.ctrlcmd = cffrml_ctrlcmd;
+	snprintf(frml->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid);
+
+	return (struct cflayer *) frml;
+}
+
+/* Release a framing layer together with its per-CPU reference counter. */
+void cffrml_free(struct cflayer *layer)
+{
+	free_percpu(container_obj(layer)->pcpu_refcnt);
+	kfree(layer);
+}
+
+/* Set the layer above @this; @up may be NULL to detach it. */
+void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up)
+{
+	this->up = up;
+}
+
+/* Set the layer below @this; @dn may be NULL to detach it. */
+void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn)
+{
+	this->dn = dn;
+}
+
+/*
+ * Checksum callback handed to cfpkt_iterate(): folds @len bytes at @buf
+ * into the running CRC-CCITT value @chks.
+ *
+ * FIXME: FCS should be moved to glue in order to use OS-Specific
+ * solutions
+ */
+static u16 cffrml_checksum(u16 chks, void *buf, u16 len)
+{
+	u16 crc = crc_ccitt(chks, buf, len);
+
+	return crc;
+}
+
+/*
+ * Receive handler of the framing layer.
+ *
+ * Strips the 2-byte little-endian length header, trims the packet to
+ * that length, verifies and strips the FCS when enabled, and passes the
+ * packet up.
+ *
+ * Returns the upper layer's result, -EPROTO on a length or parse error,
+ * -EILSEQ on a checksum mismatch, -EINVAL when no upper layer is set.
+ * The packet is destroyed on every error path except the checksum
+ * mismatch, where the FCS is put back and the packet is left intact.
+ */
+static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	u16 tmp;
+	u16 len;
+	u16 hdrchks;
+	u16 pktchks;
+	struct cffrml *this;
+	this = container_obj(layr);
+
+	cfpkt_extr_head(pkt, &tmp, 2);
+	len = le16_to_cpu(tmp);
+
+	/* Subtract for FCS on length if FCS is not used. */
+	if (!this->dofcs)
+		len -= 2;
+
+	if (cfpkt_setlen(pkt, len) < 0) {
+		++cffrml_rcv_error;
+		pr_err("Framing length error (%d)\n", len);
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+	/*
+	 * Don't do extract if FCS is false, rather do setlen - then we don't
+	 * get a cache-miss.
+	 */
+	if (this->dofcs) {
+		cfpkt_extr_trail(pkt, &tmp, 2);
+		hdrchks = le16_to_cpu(tmp);
+		pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff);
+		if (pktchks != hdrchks) {
+			/* Restore the trailer; the packet is not freed here */
+			cfpkt_add_trail(pkt, &tmp, 2);
+			++cffrml_rcv_error;
+			++cffrml_rcv_checsum_error;
+			pr_info("Frame checksum error (0x%x != 0x%x)\n",
+				hdrchks, pktchks);
+			return -EILSEQ;
+		}
+	}
+	if (cfpkt_erroneous(pkt)) {
+		++cffrml_rcv_error;
+		pr_err("Packet is erroneous!\n");
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+
+	if (layr->up == NULL) {
+		pr_err("Layr up is missing!\n");
+		cfpkt_destroy(pkt);
+		return -EINVAL;
+	}
+
+	return layr->up->receive(layr->up, pkt);
+}
+
+/*
+ * Transmit handler of the framing layer.
+ *
+ * Appends the FCS (or two padding bytes when FCS is disabled), then
+ * prepends the little-endian frame length and passes the packet down.
+ *
+ * Returns the lower layer's result, -EPROTO when the packet is marked
+ * erroneous, or -ENODEV when no lower layer is set; the packet is
+ * destroyed in both error cases.
+ */
+static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	/*
+	 * Must be exactly 16 bits wide: two bytes are copied from its
+	 * address, and with a wider type those would be the zero high bytes
+	 * on a big-endian host.
+	 */
+	__le16 tmp;
+	u16 chks;
+	u16 len;
+	struct cffrml *this = container_obj(layr);
+	if (this->dofcs) {
+		chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff);
+		tmp = cpu_to_le16(chks);
+		cfpkt_add_trail(pkt, &tmp, 2);
+	} else {
+		cfpkt_pad_trail(pkt, 2);
+	}
+	len = cfpkt_getlen(pkt);
+	tmp = cpu_to_le16(len);
+	cfpkt_add_head(pkt, &tmp, 2);
+	cfpkt_info(pkt)->hdr_len += 2;
+	if (cfpkt_erroneous(pkt)) {
+		pr_err("Packet is erroneous!\n");
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+
+	if (layr->dn == NULL) {
+		cfpkt_destroy(pkt);
+		return -ENODEV;
+
+	}
+	return layr->dn->transmit(layr->dn, pkt);
+}
+
+/*
+ * Forward a control command to the layer above, if it has a handler.
+ * The phy id passed up is this layer's own id, not the @phyid argument.
+ */
+static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+					int phyid)
+{
+	struct cflayer *up = layr->up;
+
+	if (up == NULL || up->ctrlcmd == NULL)
+		return;
+
+	up->ctrlcmd(up, ctrl, layr->id);
+}
+
+/* Drop one reference on the framing layer; a NULL layer is ignored. */
+void cffrml_put(struct cflayer *layr)
+{
+	struct cffrml *frml = container_obj(layr);
+
+	if (layr == NULL || frml->pcpu_refcnt == NULL)
+		return;
+
+	irqsafe_cpu_dec(*frml->pcpu_refcnt);
+}
+
+/* Take one reference on the framing layer; a NULL layer is ignored. */
+void cffrml_hold(struct cflayer *layr)
+{
+	struct cffrml *frml = container_obj(layr);
+
+	if (layr == NULL || frml->pcpu_refcnt == NULL)
+		return;
+
+	irqsafe_cpu_inc(*frml->pcpu_refcnt);
+}
+
+/* Return the framing layer's reference count summed over all CPUs. */
+int cffrml_refcnt_read(struct cflayer *layr)
+{
+	struct cffrml *frml = container_obj(layr);
+	int cpu;
+	int total = 0;
+
+	for_each_possible_cpu(cpu)
+		total += *per_cpu_ptr(frml->pcpu_refcnt, cpu);
+
+	return total;
+}
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
new file mode 100644
index 00000000..c23979e7
--- /dev/null
+++ b/net/caif/cfmuxl.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfmuxl.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cffrml.h>
+
+#define container_obj(layr) container_of(layr, struct cfmuxl, layer)
+
+#define CAIF_CTRL_CHANNEL 0
+#define UP_CACHE_SIZE 8
+#define DN_CACHE_SIZE 8
+
+/*
+ * Multiplexer layer: routes received packets to service layers by link
+ * id and transmitted packets to framing layers by phy id.
+ */
+struct cfmuxl {
+	struct cflayer layer;
+	/* Service (upward) layers, looked up by link id */
+	struct list_head srvl_list;
+	/* Framing (downward) layers, looked up by phy id */
+	struct list_head frml_list;
+	/* Lookup caches for the two lists, indexed by id modulo cache size */
+	struct cflayer *up_cache[UP_CACHE_SIZE];
+	struct cflayer *dn_cache[DN_CACHE_SIZE];
+	/*
+	 * Set when inserting or removing downwards layers.
+	 */
+	spinlock_t transmit_lock;
+
+	/*
+	 * Set when inserting or removing upwards layers.
+	 */
+	spinlock_t receive_lock;
+
+};
+
+static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+				int phyid);
+static struct cflayer *get_up(struct cfmuxl *muxl, u16 id);
+
+/*
+ * Allocate and initialise a multiplexer layer with empty layer lists.
+ * Returns the embedded layer, or NULL on allocation failure.
+ */
+struct cflayer *cfmuxl_create(void)
+{
+	struct cfmuxl *mux;
+
+	mux = kzalloc(sizeof(*mux), GFP_ATOMIC);
+	if (mux == NULL)
+		return NULL;
+
+	mux->layer.transmit = cfmuxl_transmit;
+	mux->layer.receive = cfmuxl_receive;
+	mux->layer.ctrlcmd = cfmuxl_ctrlcmd;
+	snprintf(mux->layer.name, CAIF_LAYER_NAME_SZ, "mux");
+
+	INIT_LIST_HEAD(&mux->srvl_list);
+	INIT_LIST_HEAD(&mux->frml_list);
+	spin_lock_init(&mux->transmit_lock);
+	spin_lock_init(&mux->receive_lock);
+
+	return &mux->layer;
+}
+
+/*
+ * Register the downward layer @dn with the mux. @phyid is not used
+ * here; lookups go by the layer's own id. Always returns 0.
+ */
+int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid)
+{
+	struct cfmuxl *mux = (struct cfmuxl *) layr;
+
+	spin_lock_bh(&mux->transmit_lock);
+	list_add_rcu(&dn->node, &mux->frml_list);
+	spin_unlock_bh(&mux->transmit_lock);
+
+	return 0;
+}
+
+/* Return the first layer on @list whose id equals @id, or NULL. */
+static struct cflayer *get_from_id(struct list_head *list, u16 id)
+{
+	struct cflayer *cur;
+
+	list_for_each_entry_rcu(cur, list, node)
+		if (cur->id == id)
+			return cur;
+
+	return NULL;
+}
+
+/*
+ * Register the upward layer @up with the mux, first unlinking any layer
+ * already registered under @linkid. Always returns 0.
+ */
+int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid)
+{
+	struct cfmuxl *mux = container_obj(layr);
+	struct cflayer *stale;
+
+	spin_lock_bh(&mux->receive_lock);
+
+	/* Two entries with same id is wrong, so remove old layer from mux */
+	stale = get_from_id(&mux->srvl_list, linkid);
+	if (stale)
+		list_del_rcu(&stale->node);
+	list_add_rcu(&up->node, &mux->srvl_list);
+
+	spin_unlock_bh(&mux->receive_lock);
+	return 0;
+}
+
+/*
+ * Unregister the downward layer with id @phyid.
+ *
+ * The matching cache slot is cleared whether or not a layer is found.
+ * Returns the removed layer, or NULL when none is registered.
+ */
+struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid)
+{
+	struct cfmuxl *mux = container_obj(layr);
+	int slot = phyid % DN_CACHE_SIZE;
+	struct cflayer *found;
+
+	spin_lock_bh(&mux->transmit_lock);
+	rcu_assign_pointer(mux->dn_cache[slot], NULL);
+	found = get_from_id(&mux->frml_list, phyid);
+	if (found != NULL) {
+		list_del_rcu(&found->node);
+		caif_assert(found != NULL);
+	}
+	spin_unlock_bh(&mux->transmit_lock);
+
+	return found;
+}
+
+/*
+ * Look up the upward layer for link @id: use the cached entry when it
+ * matches, otherwise search the list under receive_lock and store the
+ * result (possibly NULL) in the cache slot.
+ */
+static struct cflayer *get_up(struct cfmuxl *muxl, u16 id)
+{
+	int slot = id % UP_CACHE_SIZE;
+	struct cflayer *up = rcu_dereference(muxl->up_cache[slot]);
+
+	if (up != NULL && up->id == id)
+		return up;
+
+	spin_lock_bh(&muxl->receive_lock);
+	up = get_from_id(&muxl->srvl_list, id);
+	rcu_assign_pointer(muxl->up_cache[slot], up);
+	spin_unlock_bh(&muxl->receive_lock);
+
+	return up;
+}
+
+/*
+ * Look up the downward layer for @dev_info->id: use the cached entry
+ * when it matches, otherwise search the list under transmit_lock and
+ * store the result (possibly NULL) in the cache slot.
+ */
+static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info)
+{
+	int slot = dev_info->id % DN_CACHE_SIZE;
+	struct cflayer *dn = rcu_dereference(muxl->dn_cache[slot]);
+
+	if (dn != NULL && dn->id == dev_info->id)
+		return dn;
+
+	spin_lock_bh(&muxl->transmit_lock);
+	dn = get_from_id(&muxl->frml_list, dev_info->id);
+	rcu_assign_pointer(muxl->dn_cache[slot], dn);
+	spin_unlock_bh(&muxl->transmit_lock);
+
+	return dn;
+}
+
+/*
+ * Unregister the upward layer with link id @id and clear its cache
+ * slot. Id 0 (the control layer) is refused.
+ *
+ * Returns the removed layer, or NULL when @id is 0 or not registered.
+ */
+struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id)
+{
+	struct cfmuxl *mux = container_obj(layr);
+	int slot = id % UP_CACHE_SIZE;
+	struct cflayer *found;
+
+	if (id == 0) {
+		pr_warn("Trying to remove control layer\n");
+		return NULL;
+	}
+
+	spin_lock_bh(&mux->receive_lock);
+	found = get_from_id(&mux->srvl_list, id);
+	if (found != NULL) {
+		rcu_assign_pointer(mux->up_cache[slot], NULL);
+		list_del_rcu(&found->node);
+	}
+	spin_unlock_bh(&mux->receive_lock);
+
+	return found;
+}
+
+/*
+ * Receive handler of the mux: strip the 1-byte link id and hand the
+ * packet to the service layer registered for it.
+ *
+ * Returns the service layer's result, -EPROTO when the id cannot be
+ * extracted, and 0 when no layer is registered for the id (the packet
+ * is dropped).
+ */
+static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfmuxl *mux = container_obj(layr);
+	struct cflayer *up;
+	u8 id;
+	int rc;
+
+	if (cfpkt_extr_head(pkt, &id, 1) < 0) {
+		pr_err("erroneous Caif Packet\n");
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+
+	rcu_read_lock();
+	up = get_up(mux, id);
+	if (up != NULL) {
+		/*
+		 * We can't hold rcu_lock during receive, so take a ref
+		 * count instead
+		 */
+		cfsrvl_get(up);
+		rcu_read_unlock();
+
+		rc = up->receive(up, pkt);
+
+		cfsrvl_put(up);
+		return rc;
+	}
+
+	pr_debug("Received data on unknown link ID = %d (0x%x)"
+		" up == NULL", id, id);
+	cfpkt_destroy(pkt);
+	/*
+	 * Don't return ERROR, since modem misbehaves and sends out
+	 * flow on before linksetup response.
+	 */
+	rcu_read_unlock();
+	return /* CFGLU_EPROT; */ 0;
+}
+
+/*
+ * Transmit handler of the mux: prepend the 1-byte channel id and hand
+ * the packet to the framing layer selected by the packet's device info.
+ *
+ * Returns the framing layer's result, or -ENOTCONN (packet destroyed)
+ * when no framing layer is registered for the device.
+ */
+static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfmuxl *mux = container_obj(layr);
+	struct caif_payload_info *info = cfpkt_info(pkt);
+	struct cflayer *dn;
+	u8 linkid;
+	int rc;
+
+	BUG_ON(!info);
+
+	rcu_read_lock();
+
+	dn = get_dn(mux, info->dev_info);
+	if (!dn) {
+		pr_debug("Send data on unknown phy ID = %d (0x%x)\n",
+			info->dev_info->id, info->dev_info->id);
+		rcu_read_unlock();
+		cfpkt_destroy(pkt);
+		return -ENOTCONN;
+	}
+
+	linkid = info->channel_id;
+	info->hdr_len += 1;
+	cfpkt_add_head(pkt, &linkid, 1);
+
+	/* We can't hold rcu_lock during transmit, so take a ref count instead */
+	cffrml_hold(dn);
+	rcu_read_unlock();
+
+	rc = dn->transmit(dn, pkt);
+
+	cffrml_put(dn);
+	return rc;
+}
+
+/*
+ * Control-command handler of the mux: pass @ctrl to every service layer
+ * that matches @phyid and has a ctrlcmd handler.
+ *
+ * On a phy-down or remote-shutdown indication each matching layer
+ * except link id 0 is also unlinked from the mux, and its cache slot is
+ * cleared, before the command is delivered to it.
+ */
+static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+				int phyid)
+{
+	struct cfmuxl *muxl = container_obj(layr);
+	struct cflayer *layer;
+	int idx;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(layer, &muxl->srvl_list, node) {
+
+		if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) {
+
+			if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND ||
+				ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) &&
+					layer->id != 0) {
+
+				/* Unlink under receive_lock while iterating */
+				idx = layer->id % UP_CACHE_SIZE;
+				spin_lock_bh(&muxl->receive_lock);
+				rcu_assign_pointer(muxl->up_cache[idx], NULL);
+				list_del_rcu(&layer->node);
+				spin_unlock_bh(&muxl->receive_lock);
+			}
+			/* NOTE: ctrlcmd is not allowed to block */
+			layer->ctrlcmd(layer, ctrl, phyid);
+		}
+	}
+	rcu_read_unlock();
+}
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
new file mode 100644
index 00000000..75d4bfae
--- /dev/null
+++ b/net/caif/cfpkt_skbuff.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/hardirq.h>
+#include <net/caif/cfpkt.h>
+
+#define PKT_PREFIX  48
+#define PKT_POSTFIX 2
+#define PKT_LEN_WHEN_EXTENDING 128
+/*
+ * Mark @pkt as erroneous, reset the tail pointer of its SKB and log @errmsg.
+ * @errmsg is passed to pr_warn() as the format string; with the pr_fmt()
+ * defined above it has to be a string literal.
+ * @pkt is parenthesized so that any pointer expression can be passed.
+ */
+#define PKT_ERROR(pkt, errmsg)		   \
+do {					   \
+	cfpkt_priv(pkt)->erronous = true;  \
+	skb_reset_tail_pointer(&(pkt)->skb); \
+	pr_warn(errmsg);		   \
+} while (0)
+
+/*
+ * Packet queue: an SKB list head plus an element counter.
+ * NOTE(review): nothing in this file references this struct - confirm
+ * whether it is still needed.
+ */
+struct cfpktq {
+	struct sk_buff_head head;
+	atomic_t count;
+	/* Lock protects count updates */
+	spinlock_t lock;
+};
+
+/*
+ * net/caif/ is generic and does not understand SKB, so a CAIF packet is
+ * simply a wrapper around struct sk_buff. The SKB is the only member, so
+ * the helpers below convert between the two types with plain casts.
+ */
+struct cfpkt {
+	struct sk_buff skb;
+};
+
+/*
+ * Private data inside SKB.
+ * Accessed through cfpkt_priv(), which maps it onto the SKB control
+ * buffer (skb.cb).
+ */
+struct cfpkt_priv_data {
+	struct dev_info dev_info;
+	/* Set by PKT_ERROR(); checked by the packet helpers before use */
+	bool erronous;
+};
+
+/* Return the CAIF private data stored in the control buffer of @pkt's SKB. */
+static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt)
+{
+	void *cb = pkt->skb.cb;
+
+	return cb;
+}
+
+/* Tell whether @pkt has been flagged as erroneous. */
+static inline bool is_erronous(struct cfpkt *pkt)
+{
+	struct cfpkt_priv_data *priv = cfpkt_priv(pkt);
+
+	return priv->erronous;
+}
+
+/* Return the SKB embedded in the CAIF packet @pkt. */
+static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt)
+{
+	struct sk_buff *skb = &pkt->skb;
+
+	return skb;
+}
+
+/* Reinterpret the SKB pointer @skb as a CAIF packet pointer. */
+static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb)
+{
+	void *raw = skb;
+
+	return raw;
+}
+
+
+/*
+ * Convert a native packet (an SKB pointer passed as void *) to a CAIF
+ * packet and clear its error flag. @dir is not used.
+ */
+struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt)
+{
+	struct sk_buff *skb = nativepkt;
+	struct cfpkt *pkt = skb_to_pkt(skb);
+
+	cfpkt_priv(pkt)->erronous = false;
+
+	return pkt;
+}
+EXPORT_SYMBOL(cfpkt_fromnative);
+
+/* Return the CAIF packet @pkt as an untyped native packet pointer. */
+void *cfpkt_tonative(struct cfpkt *pkt)
+{
+	void *native = pkt;
+
+	return native;
+}
+EXPORT_SYMBOL(cfpkt_tonative);
+
+/*
+ * Allocate a packet with room for @len + @pfx bytes and reserve @pfx bytes
+ * of headroom. GFP_ATOMIC is used in interrupt context, GFP_KERNEL
+ * otherwise. Returns NULL when the allocation fails.
+ */
+static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx)
+{
+	gfp_t gfp = likely(in_interrupt()) ? GFP_ATOMIC : GFP_KERNEL;
+	struct sk_buff *skb = alloc_skb(len + pfx, gfp);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_reserve(skb, pfx);
+
+	return skb_to_pkt(skb);
+}
+
+/*
+ * Allocate a packet for @len bytes of payload, adding PKT_POSTFIX bytes of
+ * extra room and PKT_PREFIX bytes of headroom.
+ */
+inline struct cfpkt *cfpkt_create(u16 len)
+{
+	u16 total = len + PKT_POSTFIX;
+
+	return cfpkt_create_pfx(total, PKT_PREFIX);
+}
+
+/* Release @pkt by freeing its underlying SKB. */
+void cfpkt_destroy(struct cfpkt *pkt)
+{
+	kfree_skb(pkt_to_skb(pkt));
+}
+
+
+/* Return true while @pkt still holds at least one byte of data. */
+inline bool cfpkt_more(struct cfpkt *pkt)
+{
+	return pkt_to_skb(pkt)->len != 0;
+}
+
+
+/*
+ * Copy the first @len bytes of @pkt into @data without consuming them.
+ *
+ * When the bytes are all in the linear part of the SKB they are copied
+ * directly. Otherwise they are extracted with cfpkt_extr_head() and pushed
+ * back with cfpkt_add_head().
+ *
+ * Returns 0 on success and a negative error code on failure, on both
+ * paths, so that callers testing for "< 0" see failures of the slow path.
+ */
+int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+	int err;
+
+	if (skb_headlen(skb) >= len) {
+		memcpy(data, skb->data, len);
+		return 0;
+	}
+
+	err = cfpkt_extr_head(pkt, data, len);
+	if (err < 0)
+		return err;
+
+	return cfpkt_add_head(pkt, data, len);
+}
+
+/*
+ * Remove the first @len bytes from @pkt and copy them into @data.
+ *
+ * Returns 0 on success. Returns -EPROTO when the packet is already flagged
+ * erroneous, when @len exceeds the packet length, or when the SKB has to be
+ * linearized and that fails; in the latter two cases the packet is flagged
+ * erroneous.
+ */
+int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+	u8 *start;
+
+	if (unlikely(is_erronous(pkt)))
+		return -EPROTO;
+
+	if (unlikely(len > skb->len)) {
+		PKT_ERROR(pkt, "read beyond end of packet\n");
+		return -EPROTO;
+	}
+
+	/* Linearize only when the request reaches beyond the linear part */
+	if (unlikely(len > skb_headlen(skb)) &&
+	    unlikely(skb_linearize(skb) != 0)) {
+		PKT_ERROR(pkt, "linearize failed\n");
+		return -EPROTO;
+	}
+
+	/* Remember where the data starts, then advance past it */
+	start = skb->data;
+	skb_pull(skb, len);
+	memcpy(data, start, len);
+
+	return 0;
+}
+
+/*
+ * Remove the last @len bytes from @pkt and copy them into @dta.
+ *
+ * The SKB is always linearized first. Returns 0 on success and -EPROTO when
+ * the packet is flagged erroneous, linearizing fails, or @len reaches past
+ * the start of the data; the latter two flag the packet as erroneous.
+ */
+int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+	u8 *end;
+
+	if (unlikely(is_erronous(pkt)))
+		return -EPROTO;
+
+	if (unlikely(skb_linearize(skb) != 0)) {
+		PKT_ERROR(pkt, "linearize failed\n");
+		return -EPROTO;
+	}
+
+	end = skb_tail_pointer(skb);
+	if (unlikely(skb->data + len > end)) {
+		PKT_ERROR(pkt, "read beyond end of packet\n");
+		return -EPROTO;
+	}
+
+	/* Shorten the packet, then copy out the bytes that were cut off */
+	skb_trim(skb, skb->len - len);
+	memcpy(dta, end - len, len);
+
+	return 0;
+}
+
+
+/*
+ * Grow @pkt by @len bytes at the tail without writing any data there.
+ * Returns the result of cfpkt_add_body().
+ */
+int cfpkt_pad_trail(struct cfpkt *pkt, u16 len)
+{
+	const void *no_data = NULL;
+
+	return cfpkt_add_body(pkt, no_data, len);
+}
+
+
+/*
+ * Append @len bytes to the tail of @pkt. When @data is non-NULL the bytes
+ * are copied from it; otherwise the new space is left unwritten.
+ *
+ * Returns 0 on success and -EPROTO when the packet is flagged erroneous or
+ * when making the SKB writable fails (which also flags the packet).
+ */
+int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+	struct sk_buff *tailskb = skb;
+	u16 extra = 0;
+	u8 *dst;
+
+	if (unlikely(is_erronous(pkt)))
+		return -EPROTO;
+
+	/* Ask for at least PKT_LEN_WHEN_EXTENDING bytes when tailroom is short */
+	if (unlikely(skb_tailroom(skb) < len))
+		extra = likely(len < PKT_LEN_WHEN_EXTENDING) ?
+			PKT_LEN_WHEN_EXTENDING : len;
+
+	/* The SKB must be changed before writing if it lacks room or is shared */
+	if (unlikely(extra > 0 || skb_cloned(skb) || skb_shared(skb))) {
+		if (unlikely(skb_cow_data(skb, extra, &tailskb) < 0)) {
+			PKT_ERROR(pkt, "cow failed\n");
+			return -EPROTO;
+		}
+
+		/*
+		 * A non-linear SKB gets the data added to its last SKB, so
+		 * the lengths of the top SKB are adjusted here.
+		 */
+		if (tailskb != skb) {
+			pr_warn("Packet is non-linear\n");
+			skb->len += len;
+			skb->data_len += len;
+		}
+	}
+
+	dst = skb_put(tailskb, len);
+	if (likely(data))
+		memcpy(dst, data, len);
+
+	return 0;
+}
+
+/* Append the single byte @data to the tail of @pkt. */
+inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data)
+{
+	u8 byte = data;
+
+	return cfpkt_add_body(pkt, &byte, 1);
+}
+
+/*
+ * Prepend @len bytes from @data2 to @pkt.
+ *
+ * Returns 0 on success, -EPROTO when the packet is flagged erroneous or has
+ * less than @len bytes of headroom, or the negative result of
+ * skb_cow_data() when the SKB cannot be made writable. The last two cases
+ * flag the packet as erroneous.
+ */
+int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+	struct sk_buff *trailer;
+	int err;
+
+	if (unlikely(is_erronous(pkt)))
+		return -EPROTO;
+
+	if (unlikely(skb_headroom(skb) < len)) {
+		PKT_ERROR(pkt, "no headroom\n");
+		return -EPROTO;
+	}
+
+	/* Make sure data is writable */
+	err = skb_cow_data(skb, 0, &trailer);
+	if (unlikely(err < 0)) {
+		PKT_ERROR(pkt, "cow failed\n");
+		return err;
+	}
+
+	memcpy(skb_push(skb, len), data2, len);
+
+	return 0;
+}
+
+
+/* Append @len bytes from @data to the tail of @pkt; see cfpkt_add_body(). */
+inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len)
+{
+	int ret = cfpkt_add_body(pkt, data, len);
+
+	return ret;
+}
+
+
+/* Return the length of @pkt's SKB, converted to u16. */
+inline u16 cfpkt_getlen(struct cfpkt *pkt)
+{
+	return pkt_to_skb(pkt)->len;
+}
+
+
+/*
+ * Linearize @pkt and run @iter_func over its data, passing @data as the
+ * first argument and the packet length as the last.
+ *
+ * Returns whatever @iter_func returns. On failure -EPROTO is returned;
+ * since the return type is u16 the caller sees it converted to u16.
+ */
+inline u16 cfpkt_iterate(struct cfpkt *pkt,
+			    u16 (*iter_func)(u16, void *, u16),
+			    u16 data)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+
+	if (unlikely(is_erronous(pkt)))
+		return -EPROTO;
+
+	/*
+	 * Don't care about the performance hit of linearizing,
+	 * Checksum should not be used on high-speed interfaces anyway.
+	 */
+	if (unlikely(skb_linearize(skb) != 0)) {
+		PKT_ERROR(pkt, "linearize failed\n");
+		return -EPROTO;
+	}
+
+	return iter_func(data, skb->data, cfpkt_getlen(pkt));
+}
+
+
+/*
+ * Set the length of @pkt to @len, trimming or expanding it as needed.
+ *
+ * Returns -EPROTO when the packet is already flagged erroneous, otherwise
+ * the resulting packet length. When expanding fails the packet is flagged
+ * erroneous.
+ */
+int cfpkt_setlen(struct cfpkt *pkt, u16 len)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+
+	if (unlikely(is_erronous(pkt)))
+		return -EPROTO;
+
+	if (likely(len <= skb->len)) {
+		if (unlikely(skb->data_len))
+			___pskb_trim(skb, len);
+		else
+			skb_trim(skb, len);
+
+		return cfpkt_getlen(pkt);
+	}
+
+	/*
+	 * Need to expand SKB. cfpkt_pad_trail() returns 0 on success and a
+	 * negative value on failure, so only a negative result is an error.
+	 */
+	if (unlikely(cfpkt_pad_trail(pkt, len - skb->len) < 0))
+		PKT_ERROR(pkt, "skb_pad_trail failed\n");
+
+	return cfpkt_getlen(pkt);
+}
+
+/*
+ * Append the linear data of @addpkt to @dstpkt and destroy @addpkt.
+ *
+ * When @dstpkt lacks tail space for max(@expectlen, length of @addpkt), a
+ * larger duplicate is allocated, the data of @dstpkt is copied into it and
+ * @dstpkt is destroyed.
+ *
+ * Returns the packet holding the combined data. Returns @dstpkt unchanged
+ * when either packet is flagged erroneous, and NULL when the duplicate
+ * cannot be allocated; in both cases neither input packet is freed.
+ */
+struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
+			     struct cfpkt *addpkt,
+			     u16 expectlen)
+{
+	struct sk_buff *dst = pkt_to_skb(dstpkt);
+	struct sk_buff *add = pkt_to_skb(addpkt);
+	u16 addlen = skb_headlen(add);
+	u16 tailneed;
+
+	if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt)))
+		return dstpkt;
+
+	tailneed = expectlen > addlen ? expectlen : addlen;
+
+	if (dst->tail + tailneed > dst->end) {
+		/* Create a duplicate of 'dst' with more tail space */
+		u16 dstlen = skb_headlen(dst);
+		u16 createlen = dstlen + tailneed;
+		struct cfpkt *newpkt;
+		struct sk_buff *grown;
+
+		newpkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX);
+		if (newpkt == NULL)
+			return NULL;
+
+		grown = pkt_to_skb(newpkt);
+		skb_set_tail_pointer(grown, dstlen);
+		grown->len = dstlen;
+		memcpy(grown->data, dst->data, dstlen);
+		cfpkt_destroy(dstpkt);
+		dst = grown;
+	}
+
+	memcpy(skb_tail_pointer(dst), add->data, skb_headlen(add));
+	cfpkt_destroy(addpkt);
+	dst->tail += addlen;
+	dst->len += addlen;
+
+	return skb_to_pkt(dst);
+}
+
+/*
+ * Split @pkt at offset @pos.
+ *
+ * @pkt keeps the first @pos bytes; the bytes from @pos up to the tail
+ * pointer are copied into a newly allocated packet, which is returned.
+ *
+ * Returns NULL when @pkt is flagged erroneous, when @pos lies beyond the
+ * tail pointer (the packet is then flagged erroneous), or when the new
+ * packet cannot be allocated. @pkt is not freed in any of these cases.
+ */
+struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
+{
+	struct sk_buff *skb = pkt_to_skb(pkt);
+	struct sk_buff *skb2;
+	struct cfpkt *tmppkt;
+	u8 *split;
+	u16 len2nd;
+
+	if (unlikely(is_erronous(pkt)))
+		return NULL;
+
+	if (skb->data + pos > skb_tail_pointer(skb)) {
+		PKT_ERROR(pkt, "trying to split beyond end of packet\n");
+		return NULL;
+	}
+
+	/* Only compute the split point once it is known to be in range */
+	split = skb->data + pos;
+	len2nd = skb_tail_pointer(skb) - split;
+
+	/* Create a new packet for the second part of the data */
+	tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX,
+				  PKT_PREFIX);
+	if (tmppkt == NULL)
+		return NULL;
+	skb2 = pkt_to_skb(tmppkt);
+
+	/* Reduce the length of the original packet */
+	skb_set_tail_pointer(skb, pos);
+	skb->len = pos;
+
+	memcpy(skb2->data, split, len2nd);
+	skb2->tail += len2nd;
+	skb2->len += len2nd;
+	return skb_to_pkt(skb2);
+}
+
+bool cfpkt_erroneous(struct cfpkt *pkt)
+{
+	return cfpkt_priv(pkt)->erronous;
+}
+
+/* Return the SKB control buffer of @pkt viewed as CAIF payload info. */
+struct caif_payload_info *cfpkt_info(struct cfpkt *pkt)
+{
+	void *cb = pkt_to_skb(pkt)->cb;
+
+	return cb;
+}
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
new file mode 100644
index 00000000..0deabb44
--- /dev/null
+++ b/net/caif/cfrfml.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) container_of(layr, struct cfrfml, serv.layer)
+#define RFM_SEGMENTATION_BIT 0x01
+#define RFM_HEAD_SIZE 7
+
+static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+/* State of one RFM service layer instance. */
+struct cfrfml {
+	/* Embedded service layer; container_obj() maps serv.layer back here */
+	struct cfsrvl serv;
+	/* Segments received so far of a segmented frame, NULL when none */
+	struct cfpkt *incomplete_frm;
+	/* Segment size used on transmit, computed from the MTU at creation */
+	int fragment_size;
+	/* Header of the initial segment; later segments must match it */
+	u8  seghead[6];
+	/* Little-endian 16-bit value read from seghead+4 of the initial segment */
+	u16 pdu_size;
+	/* Protects serialized processing of packets */
+	spinlock_t sync;
+};
+
+/*
+ * Release an RFM layer: drop a partially reassembled frame, if any, and
+ * free the memory of the embedded service layer.
+ */
+static void cfrfml_release(struct cflayer *layer)
+{
+	struct cfrfml *rfml = container_obj(layer);
+
+	if (rfml->incomplete_frm)
+		cfpkt_destroy(rfml->incomplete_frm);
+
+	kfree(&rfml->serv);
+}
+
+/*
+ * Allocate and initialize an RFM service layer for @channel_id.
+ *
+ * The fragment size is (@mtu_size - RFM_HEAD_SIZE - 6) rounded down to a
+ * multiple of 16. Returns the embedded layer, or NULL when out of memory.
+ */
+struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
+					int mtu_size)
+{
+	struct cfrfml *rfml = kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
+
+	if (!rfml) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+
+	cfsrvl_init(&rfml->serv, channel_id, dev_info, false);
+	rfml->serv.release = cfrfml_release;
+	rfml->serv.layer.receive = cfrfml_receive;
+	rfml->serv.layer.transmit = cfrfml_transmit;
+
+	/* Round down to closest multiple of 16 */
+	rfml->fragment_size = ((mtu_size - RFM_HEAD_SIZE - 6) / 16) * 16;
+
+	spin_lock_init(&rfml->sync);
+	snprintf(rfml->serv.layer.name, CAIF_LAYER_NAME_SZ,
+		"rfm%d", channel_id);
+
+	return &rfml->serv.layer;
+}
+
+/*
+ * Handle a follow-up segment: extract its 6-byte header into @seghead,
+ * check it against the header stored from the initial segment and append
+ * the rest of @pkt to the frame being reassembled.
+ *
+ * Returns the combined packet with *@err set to 0. Returns NULL with
+ * *@err set to -EPROTO on a header error, or to -ENOMEM when
+ * cfpkt_append() returns NULL.
+ */
+static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead,
+			struct cfpkt *pkt, int *err)
+{
+	struct cfpkt *joined;
+
+	/* The header must be present and identical to the stored one */
+	if (cfpkt_extr_head(pkt, seghead, 6) < 0 ||
+	    memcmp(seghead, rfml->seghead, 6) != 0) {
+		*err = -EPROTO;
+		return NULL;
+	}
+
+	joined = cfpkt_append(rfml->incomplete_frm, pkt,
+			rfml->pdu_size + RFM_HEAD_SIZE);
+
+	/* If cfpkt_append fails, the input packets are not freed */
+	*err = joined ? 0 : -ENOMEM;
+
+	return joined;
+}
+
+/*
+ * Receive a packet on an RFM channel, reassembling segmented frames.
+ *
+ * The first byte carries the segmentation bit. Segmented packets are
+ * collected in rfml->incomplete_frm and 0 is returned. An unsegmented
+ * packet either completes a pending frame or is a frame of its own, and is
+ * passed to the layer above; the result of that call is returned.
+ *
+ * On any non-zero result the packets still held are destroyed and a
+ * CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND is sent upwards. Protocol errors are
+ * reported as -EPROTO.
+ */
+static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	u8 tmp;
+	bool segmented;
+	int err;
+	u8 seghead[6];
+	struct cfrfml *rfml;
+	struct cfpkt *tmppkt = NULL;
+
+	caif_assert(layr->up != NULL);
+	caif_assert(layr->receive != NULL);
+	rfml = container_obj(layr);
+	spin_lock(&rfml->sync);
+
+	err = -EPROTO;
+	if (cfpkt_extr_head(pkt, &tmp, 1) < 0)
+		goto out;
+	segmented = tmp & RFM_SEGMENTATION_BIT;
+
+	if (segmented) {
+		if (rfml->incomplete_frm == NULL) {
+			/* Initial Segment */
+			if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0)
+				goto out;
+
+			rfml->pdu_size = get_unaligned_le16(rfml->seghead+4);
+
+			if (cfpkt_erroneous(pkt))
+				goto out;
+			rfml->incomplete_frm = pkt;
+			pkt = NULL;
+		} else {
+			/* n-th but not last segment */
+			tmppkt = rfm_append(rfml, seghead, pkt, &err);
+			if (tmppkt == NULL)
+				goto out;
+
+			/*
+			 * NOTE(review): err is 0 here, so an erroneous
+			 * tmppkt leaves without cleanup - confirm intent.
+			 */
+			if (cfpkt_erroneous(tmppkt))
+				goto out;
+
+			rfml->incomplete_frm = tmppkt;
+		}
+		err = 0;
+		goto out;
+	}
+
+	if (rfml->incomplete_frm) {
+
+		/* Last Segment */
+		tmppkt = rfm_append(rfml, seghead, pkt, &err);
+		if (tmppkt == NULL)
+			goto out;
+
+		if (cfpkt_erroneous(tmppkt))
+			goto out;
+
+		rfml->incomplete_frm = NULL;
+		pkt = tmppkt;
+		tmppkt = NULL;
+
+		/* Verify that length is correct */
+		err = -EPROTO;
+		if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1)
+			goto out;
+	}
+
+	err = rfml->serv.layer.up->receive(rfml->serv.layer.up, pkt);
+
+out:
+
+	if (err != 0) {
+		if (tmppkt)
+			cfpkt_destroy(tmppkt);
+		if (pkt)
+			cfpkt_destroy(pkt);
+		if (rfml->incomplete_frm)
+			cfpkt_destroy(rfml->incomplete_frm);
+		rfml->incomplete_frm = NULL;
+
+		pr_info("Connection error %d triggered on RFM link\n", err);
+
+		/* Trigger connection error upon failure.*/
+		layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+					rfml->serv.dev_info.id);
+	}
+	spin_unlock(&rfml->sync);
+	return err;
+}
+
+
+/*
+ * Fill in the routing info of one segment and pass it to the layer below.
+ * Returns the result of the downward transmit call.
+ */
+static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt)
+{
+	/*
+	 * cfrfml_transmit() hands over non-final segments of exactly
+	 * fragment_size bytes, and a final segment of at most
+	 * fragment_size + RFM_HEAD_SIZE bytes plus the one-byte segment
+	 * flag, so that is the upper bound to check.
+	 */
+	caif_assert(cfpkt_getlen(pkt) <=
+		    rfml->fragment_size + RFM_HEAD_SIZE + 1);
+
+	/* Add info for MUX-layer to route the packet out. */
+	cfpkt_info(pkt)->channel_id = rfml->serv.layer.id;
+
+	/*
+	 * To optimize alignment, we add up the size of CAIF header before
+	 * payload.
+	 */
+	cfpkt_info(pkt)->hdr_len = RFM_HEAD_SIZE;
+	cfpkt_info(pkt)->dev_info = &rfml->serv.dev_info;
+
+	return rfml->serv.layer.dn->transmit(rfml->serv.layer.dn, pkt);
+}
+
+/*
+ * Transmit a packet on an RFM channel, segmenting it when it is longer
+ * than fragment_size + RFM_HEAD_SIZE.
+ *
+ * Every segment is prefixed with a one-byte segmentation flag (1 for all
+ * but the last segment, 0 for the last). Segments after the first also get
+ * the first 6 bytes of the original packet prepended again.
+ *
+ * Returns the cfsrvl_ready() error when the service is not ready, -EPROTO
+ * for packets of at most RFM_HEAD_SIZE-1 bytes, otherwise the outcome of
+ * the transmission. On failure a CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND is sent
+ * upwards; when the failure occurs after the original packet has been
+ * handed downwards, 0 is returned instead (see the comment at the end).
+ */
+static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	int err;
+	u8 seg;
+	u8 head[6];
+	struct cfpkt *rearpkt = NULL;
+	struct cfpkt *frontpkt = pkt;
+	struct cfrfml *rfml = container_obj(layr);
+
+	caif_assert(layr->dn != NULL);
+	caif_assert(layr->dn->transmit != NULL);
+
+	if (!cfsrvl_ready(&rfml->serv, &err))
+		return err;
+
+	err = -EPROTO;
+	if (cfpkt_getlen(pkt) <= RFM_HEAD_SIZE-1)
+		goto out;
+
+	/* Save the header only when the packet is going to be segmented */
+	err = 0;
+	if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE)
+		err = cfpkt_peek_head(pkt, head, 6);
+
+	if (err < 0)
+		goto out;
+
+	while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) {
+
+		/* More segments follow this one */
+		seg = 1;
+		err = -EPROTO;
+
+		if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+			goto out;
+		/*
+		 * On OOM error cfpkt_split returns NULL.
+		 *
+		 * NOTE: Segmented pdu is not correctly aligned.
+		 * This has negative performance impact.
+		 */
+
+		rearpkt = cfpkt_split(frontpkt, rfml->fragment_size);
+		if (rearpkt == NULL)
+			goto out;
+
+		err = cfrfml_transmit_segment(rfml, frontpkt);
+
+		if (err != 0)
+			goto out;
+		/* Continue with the remainder as the new front packet */
+		frontpkt = rearpkt;
+		rearpkt = NULL;
+
+		err = -ENOMEM;
+		if (frontpkt == NULL)
+			goto out;
+		err = -EPROTO;
+		/* Each following segment repeats the saved header */
+		if (cfpkt_add_head(frontpkt, head, 6) < 0)
+			goto out;
+
+	}
+
+	/* Last (or only) segment */
+	seg = 0;
+	err = -EPROTO;
+
+	if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+		goto out;
+
+	err = cfrfml_transmit_segment(rfml, frontpkt);
+
+	/* The packet has been handed downwards; do not touch it below */
+	frontpkt = NULL;
+out:
+
+	if (err != 0) {
+		pr_info("Connection error %d triggered on RFM link\n", err);
+		/* Trigger connection error upon failure.*/
+
+		layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+					rfml->serv.dev_info.id);
+
+		if (rearpkt)
+			cfpkt_destroy(rearpkt);
+
+		if (frontpkt && frontpkt != pkt) {
+
+			cfpkt_destroy(frontpkt);
+			/*
+			 * Socket layer will free the original packet,
+			 * but this packet may already be sent and
+			 * freed. So we have to return 0 in this case
+			 * to avoid socket layer to re-free this packet.
+			 * The return of shutdown indication will
+			 * cause connection to be invalidated anyhow.
+			 */
+			err = 0;
+		}
+	}
+
+	return err;
+}
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
new file mode 100644
index 00000000..2715c84c
--- /dev/null
+++ b/net/caif/cfserl.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/cfserl.h>
+
+#define container_obj(layr) ((struct cfserl *) layr)
+
+#define CFSERL_STX 0x02
+#define SERIAL_MINIUM_PACKET_SIZE 4
+#define SERIAL_MAX_FRAMESIZE 4096
+/* State of one serial framing layer instance. */
+struct cfserl {
+	/* Must be the first member: container_obj() is a plain cast */
+	struct cflayer layer;
+	/* Received data that does not yet form a complete frame, or NULL */
+	struct cfpkt *incomplete_frm;
+	/* Protects parallel processing of incoming packets */
+	spinlock_t sync;
+	/* True when frames are delimited by a leading CFSERL_STX byte */
+	bool usestx;
+};
+
+static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+				int phyid);
+
+/*
+ * Allocate and initialize a serial framing layer.
+ *
+ * @type is stored as the layer type and @use_stx selects STX framing.
+ * @instance is not used. Returns the new layer, or NULL when out of memory.
+ */
+struct cflayer *cfserl_create(int type, int instance, bool use_stx)
+{
+	/* kzalloc() hands back zeroed memory, so no separate memset is needed */
+	struct cfserl *this = kzalloc(sizeof(*this), GFP_ATOMIC);
+
+	if (!this) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	caif_assert(offsetof(struct cfserl, layer) == 0);
+	this->layer.receive = cfserl_receive;
+	this->layer.transmit = cfserl_transmit;
+	this->layer.ctrlcmd = cfserl_ctrlcmd;
+	this->layer.type = type;
+	this->usestx = use_stx;
+	spin_lock_init(&this->sync);
+	snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "ser1");
+	return &this->layer;
+}
+
+/*
+ * Receive serial data and deliver complete frames to the layer above.
+ *
+ * @newpkt is appended to any data left over from earlier calls. The loop
+ * then repeatedly locates a frame (optionally introduced by CFSERL_STX),
+ * reads its 16-bit length field, and either stores the data as incomplete
+ * or splits off one frame and passes it upwards. The sync lock is dropped
+ * around the upward receive call.
+ *
+ * Returns 0 when the data was consumed or stored, -ENOMEM when appending
+ * fails, and -EPROTO when no frame start or no valid length is found.
+ */
+static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
+{
+	struct cfserl *layr = container_obj(l);
+	u16 pkt_len;
+	struct cfpkt *pkt = NULL;
+	struct cfpkt *tail_pkt = NULL;
+	u8 tmp8;
+	u16 tmp;
+	u8 stx = CFSERL_STX;
+	int ret;
+	u16 expectlen = 0;
+
+	caif_assert(newpkt != NULL);
+	spin_lock(&layr->sync);
+
+	/* Continue from data kept by a previous call, if any */
+	if (layr->incomplete_frm != NULL) {
+		layr->incomplete_frm =
+		    cfpkt_append(layr->incomplete_frm, newpkt, expectlen);
+		pkt = layr->incomplete_frm;
+		if (pkt == NULL) {
+			spin_unlock(&layr->sync);
+			return -ENOMEM;
+		}
+	} else {
+		pkt = newpkt;
+	}
+	layr->incomplete_frm = NULL;
+
+	do {
+		/* Search for STX at start of pkt if STX is used */
+		if (layr->usestx) {
+			cfpkt_extr_head(pkt, &tmp8, 1);
+			if (tmp8 != CFSERL_STX) {
+				while (cfpkt_more(pkt)
+				       && tmp8 != CFSERL_STX) {
+					cfpkt_extr_head(pkt, &tmp8, 1);
+				}
+				if (!cfpkt_more(pkt)) {
+					cfpkt_destroy(pkt);
+					layr->incomplete_frm = NULL;
+					spin_unlock(&layr->sync);
+					return -EPROTO;
+				}
+			}
+		}
+
+		pkt_len = cfpkt_getlen(pkt);
+
+		/*
+		 *  pkt_len is the accumulated length of the packet data
+		 *  we have received so far.
+		 *  Exit if frame doesn't hold length.
+		 */
+
+		if (pkt_len < 2) {
+			/* Put the consumed STX back before storing the data */
+			if (layr->usestx)
+				cfpkt_add_head(pkt, &stx, 1);
+			layr->incomplete_frm = pkt;
+			spin_unlock(&layr->sync);
+			return 0;
+		}
+
+		/*
+		 *  Find length of frame.
+		 *  expectlen is the length we need for a full frame.
+		 */
+		cfpkt_peek_head(pkt, &tmp, 2);
+		expectlen = le16_to_cpu(tmp) + 2;
+		/*
+		 * Frame error handling
+		 */
+		if (expectlen < SERIAL_MINIUM_PACKET_SIZE
+		    || expectlen > SERIAL_MAX_FRAMESIZE) {
+			if (!layr->usestx) {
+				if (pkt != NULL)
+					cfpkt_destroy(pkt);
+				layr->incomplete_frm = NULL;
+				expectlen = 0;
+				spin_unlock(&layr->sync);
+				return -EPROTO;
+			}
+			/* With STX: go around again and look for the next STX */
+			continue;
+		}
+
+		if (pkt_len < expectlen) {
+			/* Too little received data */
+			if (layr->usestx)
+				cfpkt_add_head(pkt, &stx, 1);
+			layr->incomplete_frm = pkt;
+			spin_unlock(&layr->sync);
+			return 0;
+		}
+
+		/*
+		 * Enough data for at least one frame.
+		 * Split the frame, if too long
+		 */
+		if (pkt_len > expectlen)
+			tail_pkt = cfpkt_split(pkt, expectlen);
+		else
+			tail_pkt = NULL;
+
+		/* Send the first part of packet upwards.*/
+		spin_unlock(&layr->sync);
+		ret = layr->layer.up->receive(layr->layer.up, pkt);
+		spin_lock(&layr->sync);
+		if (ret == -EILSEQ) {
+			if (layr->usestx) {
+				if (tail_pkt != NULL)
+					pkt = cfpkt_append(pkt, tail_pkt, 0);
+				/* Start search for next STX if frame failed */
+				continue;
+			} else {
+				cfpkt_destroy(pkt);
+				pkt = NULL;
+			}
+		}
+
+		/* Carry on with whatever followed the delivered frame */
+		pkt = tail_pkt;
+
+	} while (pkt != NULL);
+
+	spin_unlock(&layr->sync);
+	return 0;
+}
+
+/*
+ * Transmit a frame: prepend CFSERL_STX when STX framing is enabled, then
+ * hand the packet to the layer below and return its result.
+ */
+static int cfserl_transmit(struct cflayer *layer, struct cfpkt *newpkt)
+{
+	struct cfserl *serl = container_obj(layer);
+
+	if (serl->usestx) {
+		u8 stx = CFSERL_STX;
+
+		cfpkt_add_head(newpkt, &stx, 1);
+	}
+
+	return layer->dn->transmit(layer->dn, newpkt);
+}
+
+/* Pass a control command unchanged to the layer above. */
+static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+				int phyid)
+{
+	struct cflayer *up = layr->up;
+
+	up->ctrlcmd(up, ctrl, phyid);
+}
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
new file mode 100644
index 00000000..535a1e72
--- /dev/null
+++ b/net/caif/cfsrvl.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+/* Size in bytes of a service control packet (the command byte only) */
+#define SRVL_CTRL_PKT_SIZE 1
+/* Command bytes sent in service control packets */
+#define SRVL_FLOW_OFF 0x81
+#define SRVL_FLOW_ON  0x80
+#define SRVL_SET_PIN  0x82
+
+#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
+
+/*
+ * Handle a control command arriving at a service layer.
+ *
+ * Tracks the open state and the modem/phy flow state of the service and
+ * forwards commands to the layer above. Flow indications are forwarded
+ * only while the other flow source (phy or modem) is on, and phy flow
+ * indications only when @phyid is the service's own device id. Nothing is
+ * done when there is no layer above or it has no ctrlcmd handler.
+ */
+static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
+				int phyid)
+{
+	struct cfsrvl *service = container_obj(layr);
+	struct cflayer *up = layr->up;
+
+	if (!up || !up->ctrlcmd)
+		return;
+
+	switch (ctrl) {
+	case CAIF_CTRLCMD_INIT_RSP:
+		service->open = true;
+		up->ctrlcmd(up, ctrl, phyid);
+		break;
+	case CAIF_CTRLCMD_DEINIT_RSP:
+	case CAIF_CTRLCMD_INIT_FAIL_RSP:
+		service->open = false;
+		up->ctrlcmd(up, ctrl, phyid);
+		break;
+	case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND:
+		if (phyid == service->dev_info.id) {
+			if (service->modem_flow_on)
+				up->ctrlcmd(up, CAIF_CTRLCMD_FLOW_OFF_IND,
+					    phyid);
+			service->phy_flow_on = false;
+		}
+		break;
+	case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND:
+		if (phyid == service->dev_info.id) {
+			if (service->modem_flow_on)
+				up->ctrlcmd(up, CAIF_CTRLCMD_FLOW_ON_IND,
+					    phyid);
+			service->phy_flow_on = true;
+		}
+		break;
+	case CAIF_CTRLCMD_FLOW_OFF_IND:
+		if (service->phy_flow_on)
+			up->ctrlcmd(up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid);
+		service->modem_flow_on = false;
+		break;
+	case CAIF_CTRLCMD_FLOW_ON_IND:
+		if (service->phy_flow_on)
+			up->ctrlcmd(up, CAIF_CTRLCMD_FLOW_ON_IND, phyid);
+		service->modem_flow_on = true;
+		break;
+	case _CAIF_CTRLCMD_PHYIF_DOWN_IND:
+		/* In case interface is down, let's fake a remote shutdown */
+		up->ctrlcmd(up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid);
+		break;
+	case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+		up->ctrlcmd(up, ctrl, phyid);
+		break;
+	default:
+		pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
+		/* Unknown command: forward it as is and mark phy flow as on */
+		up->ctrlcmd(up, ctrl, phyid);
+		service->phy_flow_on = true;
+		break;
+	}
+}
+
+/*
+ * Send a flow-control request to the modem.
+ *
+ * For FLOW_ON/FLOW_OFF requests a one-byte control packet is built and
+ * transmitted downwards; the result of the transmit call is returned.
+ * Returns 0 without sending anything when the service does not support
+ * flow control, -ENOMEM when no packet can be allocated, -EPROTO when the
+ * command byte cannot be added, and -EINVAL for any other request.
+ */
+static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
+{
+	struct cfsrvl *service = container_obj(layr);
+	struct caif_payload_info *info;
+	struct cfpkt *pkt;
+	u8 flow_cmd;
+
+	caif_assert(layr != NULL);
+	caif_assert(layr->dn != NULL);
+	caif_assert(layr->dn->transmit != NULL);
+
+	if (!service->supports_flowctrl)
+		return 0;
+
+	/* Map the request onto the command byte to send */
+	switch (ctrl) {
+	case CAIF_MODEMCMD_FLOW_ON_REQ:
+		flow_cmd = SRVL_FLOW_ON;
+		break;
+	case CAIF_MODEMCMD_FLOW_OFF_REQ:
+		flow_cmd = SRVL_FLOW_OFF;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
+	if (!pkt) {
+		pr_warn("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	if (cfpkt_add_head(pkt, &flow_cmd, 1) < 0) {
+		pr_err("Packet is erroneous!\n");
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+
+	info = cfpkt_info(pkt);
+	info->channel_id = service->layer.id;
+	info->hdr_len = 1;
+	info->dev_info = &service->dev_info;
+
+	return layr->dn->transmit(layr->dn, pkt);
+}
+
+/* Default release handler: free the service that embeds @layer. */
+static void cfsrvl_release(struct cflayer *layer)
+{
+	kfree(container_obj(layer));
+}
+
+/*
+ * Initialize the common part of a service layer.
+ *
+ * The service starts closed with both modem and phy flow on. @dev_info is
+ * copied into the service, and the default ctrlcmd, modemcmd and release
+ * handlers are installed.
+ */
+void cfsrvl_init(struct cfsrvl *service,
+			u8 channel_id,
+			struct dev_info *dev_info,
+			bool supports_flowctrl
+			)
+{
+	caif_assert(offsetof(struct cfsrvl, layer) == 0);
+
+	/* Layer identity and handlers */
+	service->layer.id = channel_id;
+	service->layer.ctrlcmd = cfservl_ctrlcmd;
+	service->layer.modemcmd = cfservl_modemcmd;
+	service->release = cfsrvl_release;
+
+	/* Initial connection and flow state */
+	service->open = false;
+	service->modem_flow_on = true;
+	service->phy_flow_on = true;
+
+	service->dev_info = *dev_info;
+	service->supports_flowctrl = supports_flowctrl;
+}
+
+/*
+ * Tell whether the service can transmit right now.
+ *
+ * Returns true when the service is open and both modem and phy flow are
+ * on; *@err is left untouched in that case. Otherwise returns false with
+ * *@err set to -ENOTCONN (not open) or -EAGAIN (flow is off).
+ */
+bool cfsrvl_ready(struct cfsrvl *service, int *err)
+{
+	if (!service->open) {
+		*err = -ENOTCONN;
+		return false;
+	}
+
+	if (service->modem_flow_on && service->phy_flow_on)
+		return true;
+
+	caif_assert(!(service->modem_flow_on && service->phy_flow_on));
+	*err = -EAGAIN;
+
+	return false;
+}
+
+/* Return the device id of the service that embeds @layer. */
+u8 cfsrvl_getphyid(struct cflayer *layer)
+{
+	return container_obj(layer)->dev_info.id;
+}
+
+/* Return true when the service embedding @layer uses device id @phyid. */
+bool cfsrvl_phyid_match(struct cflayer *layer, int phyid)
+{
+	return container_obj(layer)->dev_info.id == phyid;
+}
+
+/*
+ * Release the service layer below @adap_layer through the service's
+ * release handler. Does nothing when @adap_layer or its downward layer is
+ * NULL.
+ */
+void caif_free_client(struct cflayer *adap_layer)
+{
+	struct cfsrvl *servl;
+
+	if (!adap_layer || !adap_layer->dn)
+		return;
+
+	servl = container_obj(adap_layer->dn);
+	servl->release(&servl->layer);
+}
+EXPORT_SYMBOL(caif_free_client);
+
+/*
+ * Register the @hold and @put reference counting callbacks on the service
+ * layer below @adapt_layer.
+ *
+ * Warns and returns without registering anything when @adapt_layer or its
+ * downward layer is NULL.
+ */
+void caif_client_register_refcnt(struct cflayer *adapt_layer,
+					void (*hold)(struct cflayer *lyr),
+					void (*put)(struct cflayer *lyr))
+{
+	struct cfsrvl *service;
+
+	/* Validate before dereferencing adapt_layer->dn */
+	if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL))
+		return;
+
+	service = container_of(adapt_layer->dn, struct cfsrvl, layer);
+	service->hold = hold;
+	service->put = put;
+}
+EXPORT_SYMBOL(caif_client_register_refcnt);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
new file mode 100644
index 00000000..98e027db
--- /dev/null
+++ b/net/caif/cfutill.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+#define UTIL_PAYLOAD  0x00
+#define UTIL_CMD_BIT  0x80
+#define UTIL_REMOTE_SHUTDOWN 0x82
+#define UTIL_FLOW_OFF 0x81
+#define UTIL_FLOW_ON  0x80
+
+static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+/*
+ * Allocate and initialize a utility service layer for @channel_id with
+ * flow control support. Returns the new layer, or NULL when out of memory.
+ */
+struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
+{
+	/* kzalloc() hands back zeroed memory, so no separate memset is needed */
+	struct cfsrvl *util = kzalloc(sizeof(*util), GFP_ATOMIC);
+
+	if (!util) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	caif_assert(offsetof(struct cfsrvl, layer) == 0);
+	cfsrvl_init(util, channel_id, dev_info, true);
+	util->layer.receive = cfutill_receive;
+	util->layer.transmit = cfutill_transmit;
+	snprintf(util->layer.name, CAIF_LAYER_NAME_SZ - 1, "util1");
+	return &util->layer;
+}
+
+/*
+ * Receive a packet on a utility channel.
+ *
+ * The first byte selects the action: payload is passed to the layer above
+ * (its result is returned), flow on/off and remote shutdown are turned
+ * into control commands on this layer and return 0. An unreadable or
+ * unknown command byte returns -EPROTO. Except for payload, the packet is
+ * always destroyed here.
+ */
+static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfsrvl *service = container_obj(layr);
+	u8 cmd = -1;
+
+	caif_assert(layr != NULL);
+	caif_assert(layr->up != NULL);
+	caif_assert(layr->up->receive != NULL);
+	caif_assert(layr->up->ctrlcmd != NULL);
+
+	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+		pr_err("Packet is erroneous!\n");
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+
+	/* Payload keeps its packet; ownership moves to the layer above */
+	if (cmd == UTIL_PAYLOAD)
+		return layr->up->receive(layr->up, pkt);
+
+	switch (cmd) {
+	case UTIL_FLOW_OFF:
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+		break;
+	case UTIL_FLOW_ON:
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+		break;
+	case UTIL_REMOTE_SHUTDOWN:	/* Remote Shutdown Request */
+		pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
+		service->open = false;
+		break;
+	default:
+		cfpkt_destroy(pkt);
+		pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
+		return -EPROTO;
+	}
+
+	cfpkt_destroy(pkt);
+
+	return 0;
+}
+
+/*
+ * Transmit a payload packet on a utility channel.
+ *
+ * Returns the cfsrvl_ready() error when the service is not ready.
+ * Otherwise a zero byte is prepended, the routing info is filled in and
+ * the result of the downward transmit call is returned.
+ */
+static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfsrvl *service = container_obj(layr);
+	struct caif_payload_info *info;
+	u8 zero = 0;
+	int ret;
+
+	caif_assert(layr != NULL);
+	caif_assert(layr->dn != NULL);
+	caif_assert(layr->dn->transmit != NULL);
+
+	if (!cfsrvl_ready(service, &ret))
+		return ret;
+
+	cfpkt_add_head(pkt, &zero, 1);
+
+	/*
+	 * Add info for MUX-layer to route the packet out.
+	 * To optimize alignment, we add up the size of CAIF header before
+	 * payload.
+	 */
+	info = cfpkt_info(pkt);
+	info->dev_info = &service->dev_info;
+	info->hdr_len = 1;
+	info->channel_id = service->layer.id;
+
+	return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
new file mode 100644
index 00000000..3ec83fbc
--- /dev/null
+++ b/net/caif/cfveil.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define VEI_PAYLOAD  0x00
+#define VEI_CMD_BIT  0x80
+#define VEI_FLOW_OFF 0x81
+#define VEI_FLOW_ON  0x80
+#define VEI_SET_PIN  0x82
+
+#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
+
+static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+/*
+ * Allocate and initialise a VEI service layer for @channel_id on the
+ * device described by @dev_info.
+ *
+ * Returns the cflayer embedded in the new cfsrvl, or NULL when the
+ * allocation fails.
+ */
+struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
+{
+	/* kzalloc() returns zeroed memory, so no separate memset() is needed. */
+	struct cfsrvl *vei = kzalloc(sizeof(*vei), GFP_ATOMIC);
+
+	if (!vei) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	/* container_obj() users rely on the layer being the first member. */
+	caif_assert(offsetof(struct cfsrvl, layer) == 0);
+	cfsrvl_init(vei, channel_id, dev_info, true);
+	vei->layer.receive = cfvei_receive;
+	vei->layer.transmit = cfvei_transmit;
+	snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ - 1, "vei%d", channel_id);
+	return &vei->layer;
+}
+
+/*
+ * Receive handler of the VEI service layer.
+ *
+ * Strips the one-byte command header.  Payload is handed to the layer
+ * above; flow on/off commands are reported via layr->ctrlcmd(); the
+ * SET_PIN command is accepted and ignored.  Every packet that is not
+ * passed upwards is destroyed here.  Returns the upper layer's result for
+ * payload, 0 for a handled command and -EPROTO for a short packet or an
+ * unknown command.
+ */
+static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	u8 cmd;
+	int ret = 0;
+
+	caif_assert(layr->up != NULL);
+	caif_assert(layr->receive != NULL);
+	caif_assert(layr->ctrlcmd != NULL);
+
+	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
+		pr_err("Packet is erroneous!\n");
+		ret = -EPROTO;
+		goto drop;
+	}
+
+	switch (cmd) {
+	case VEI_PAYLOAD:
+		return layr->up->receive(layr->up, pkt);
+	case VEI_FLOW_OFF:
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_OFF_IND, 0);
+		break;
+	case VEI_FLOW_ON:
+		layr->ctrlcmd(layr, CAIF_CTRLCMD_FLOW_ON_IND, 0);
+		break;
+	case VEI_SET_PIN:	/* SET RS232 PIN */
+		break;
+	default:
+		pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
+		ret = -EPROTO;
+		break;
+	}
+drop:
+	cfpkt_destroy(pkt);
+	return ret;
+}
+
+/*
+ * Transmit handler of the VEI service layer: prepends a single zero byte,
+ * fills in the routing info used by the layers below and passes the
+ * packet down.  If the service is not ready, or the header cannot be
+ * added, the packet is destroyed here and an error is returned.
+ */
+static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfsrvl *service = container_obj(layr);
+	struct caif_payload_info *info;
+	u8 hdr = 0;
+	int ret;
+
+	if (!cfsrvl_ready(service, &ret)) {
+		cfpkt_destroy(pkt);
+		return ret;
+	}
+
+	caif_assert(layr->dn != NULL);
+	caif_assert(layr->dn->transmit != NULL);
+
+	if (cfpkt_add_head(pkt, &hdr, 1) < 0) {
+		pr_err("Packet is erroneous!\n");
+		cfpkt_destroy(pkt);
+		return -EPROTO;
+	}
+
+	/* Add info for MUX-layer to route the packet out. */
+	info = cfpkt_info(pkt);
+	info->dev_info = &service->dev_info;
+	info->hdr_len = 1;
+	info->channel_id = service->layer.id;
+
+	return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
new file mode 100644
index 00000000..b2f5989a
--- /dev/null
+++ b/net/caif/cfvidl.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfsrvl.h>
+#include <net/caif/cfpkt.h>
+
+#define container_obj(layr) ((struct cfsrvl *) layr)
+
+static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt);
+static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt);
+
+/*
+ * Allocate and initialise a video service layer for @channel_id on the
+ * device described by @dev_info.
+ *
+ * Returns the cflayer embedded in the new cfsrvl, or NULL when the
+ * allocation fails.
+ */
+struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
+{
+	/* kzalloc() returns zeroed memory, so no separate memset() is needed. */
+	struct cfsrvl *vid = kzalloc(sizeof(*vid), GFP_ATOMIC);
+
+	if (!vid) {
+		pr_warn("Out of memory\n");
+		return NULL;
+	}
+	/* container_obj() casts the layer pointer, so it must come first. */
+	caif_assert(offsetof(struct cfsrvl, layer) == 0);
+
+	cfsrvl_init(vid, channel_id, dev_info, false);
+	vid->layer.receive = cfvidl_receive;
+	vid->layer.transmit = cfvidl_transmit;
+	snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ - 1, "vid1");
+	return &vid->layer;
+}
+
+/*
+ * Receive handler of the video service layer: removes the four-byte
+ * video header and hands the rest to the layer above.  A packet too
+ * short to hold the header is destroyed and -EPROTO is returned.
+ */
+static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
+{
+	u32 videoheader;
+
+	if (cfpkt_extr_head(pkt, &videoheader, 4) >= 0)
+		return layr->up->receive(layr->up, pkt);
+
+	pr_err("Packet is erroneous!\n");
+	cfpkt_destroy(pkt);
+	return -EPROTO;
+}
+
+/*
+ * Transmit handler of the video service layer: prepends a zeroed
+ * four-byte video header, fills in the routing info used by the layers
+ * below and passes the packet down.  When cfsrvl_ready() reports that the
+ * service cannot send, its error code is returned and the packet is left
+ * untouched.
+ */
+static int cfvidl_transmit(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct cfsrvl *service = container_obj(layr);
+	struct caif_payload_info *info;
+	u32 videoheader = 0;
+	int ret;
+
+	if (!cfsrvl_ready(service, &ret))
+		return ret;
+
+	cfpkt_add_head(pkt, &videoheader, 4);
+
+	/* Add info for MUX-layer to route the packet out */
+	info = cfpkt_info(pkt);
+	info->dev_info = &service->dev_info;
+	info->channel_id = service->layer.id;
+
+	return layr->dn->transmit(layr->dn, pkt);
+}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
new file mode 100644
index 00000000..adbb4244
--- /dev/null
+++ b/net/caif/chnl_net.c
@@ -0,0 +1,545 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2010
+ * Authors:	Sjur Brendeland/sjur.brandeland@stericsson.com
+ *		Daniel Martensson / Daniel.Martensson@stericsson.com
+ * License terms: GNU General Public License (GPL) version 2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/sched.h>
+#include <linux/sockios.h>
+#include <linux/caif/if_caif.h>
+#include <net/rtnetlink.h>
+#include <net/caif/caif_layer.h>
+#include <net/caif/cfpkt.h>
+#include <net/caif/caif_dev.h>
+
+/* GPRS PDP connection has MTU to 1500 */
+#define GPRS_PDP_MTU 1500
+/* 5 sec. connect timeout */
+#define CONNECT_TIMEOUT (5 * HZ)
+#define CAIF_NET_DEFAULT_QUEUE_LEN 500
+
+/* This list is protected by the rtnl lock. */
+static LIST_HEAD(chnl_net_list);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("caif");
+
+enum caif_states {
+	CAIF_CONNECTED		= 1,	/* set on CAIF_CTRLCMD_INIT_RSP */
+	CAIF_CONNECTING,		/* set by chnl_net_open() before connecting */
+	CAIF_DISCONNECTED,		/* set on deinit, init failure, stop, open error */
+	CAIF_SHUTDOWN			/* set on CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND */
+};
+
+struct chnl_net {
+	struct cflayer chnl;	/* callbacks map this back via container_of() */
+	struct net_device_stats stats;
+	struct caif_connect_request conn_req;	/* passed to caif_connect_client() */
+	struct list_head list_field;	/* link in chnl_net_list */
+	struct net_device *netdev;	/* set in ipcaif_net_setup() */
+	char name[256];		/* copy of dev->name, taken in chnl_net_init() */
+	wait_queue_head_t netmgmt_wq;	/* chnl_net_open() sleeps here */
+	/* Flow status to remember and control the transmission. */
+	bool flowenabled;
+	enum caif_states state;	/* connection state, see enum caif_states */
+};
+
+/*
+ * Unlink @delete_node from chnl_net_list, but only after confirming that
+ * it really is on that list; a node that is not found triggers a
+ * warning instead.  The rtnl lock must be held.
+ */
+static void robust_list_del(struct list_head *delete_node)
+{
+	struct list_head *pos;
+
+	ASSERT_RTNL();
+	/* The walk stops right after the unlink, so plain iteration is safe. */
+	list_for_each(pos, &chnl_net_list) {
+		if (pos != delete_node)
+			continue;
+		list_del(pos);
+		return;
+	}
+	WARN_ON(1);
+}
+
+/*
+ * Receive callback of the CAIF channel: hand an incoming packet to the
+ * network stack as an IPv4 or IPv6 frame received on priv->netdev.
+ *
+ * Returns 0 once the skb has been queued with netif_rx()/netif_rx_ni(),
+ * or -EINVAL when the first byte cannot be read or carries an IP version
+ * other than 4 or 6.  The layers below return this function's result
+ * without touching the packet again, so a rejected skb is freed here
+ * rather than leaked.
+ */
+static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
+{
+	struct sk_buff *skb;
+	struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
+	int pktlen;
+	const u8 *ip_version;
+	u8 buf;
+
+	if (!priv)
+		return -EINVAL;
+
+	skb = (struct sk_buff *) cfpkt_tonative(pkt);
+
+	/* Remember the length now; the skb is gone after netif_rx(). */
+	pktlen = skb->len;
+
+	/* Pass some minimum information and
+	 * send the packet to the net stack.
+	 */
+	skb->dev = priv->netdev;
+
+	/* check the version of IP */
+	ip_version = skb_header_pointer(skb, 0, 1, &buf);
+	if (!ip_version) {
+		kfree_skb(skb);
+		priv->netdev->stats.rx_errors++;
+		return -EINVAL;
+	}
+
+	switch (*ip_version >> 4) {
+	case 4:
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case 6:
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	default:
+		kfree_skb(skb);
+		priv->netdev->stats.rx_errors++;
+		return -EINVAL;
+	}
+
+	/* If we change the header in loop mode, the checksum is corrupted. */
+	if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	else
+		skb->ip_summed = CHECKSUM_NONE;
+
+	if (in_interrupt())
+		netif_rx(skb);
+	else
+		netif_rx_ni(skb);
+
+	/* Update statistics. */
+	priv->netdev->stats.rx_packets++;
+	priv->netdev->stats.rx_bytes += pktlen;
+
+	return 0;
+}
+
+/*
+ * Unregister the net device behind @dev, if it has one.  Always returns
+ * 0.  The rtnl lock must be held.
+ */
+static int delete_device(struct chnl_net *dev)
+{
+	struct net_device *netdev = dev->netdev;
+
+	ASSERT_RTNL();
+	if (netdev != NULL)
+		unregister_netdevice(netdev);
+	return 0;
+}
+
+/*
+ * Work handler: under the rtnl lock, close every device on chnl_net_list
+ * whose state is CAIF_SHUTDOWN.
+ */
+static void close_work(struct work_struct *work)
+{
+	struct chnl_net *dev;
+	struct chnl_net *next;
+
+	rtnl_lock();
+	list_for_each_entry_safe(dev, next, &chnl_net_list, list_field) {
+		if (dev->state != CAIF_SHUTDOWN)
+			continue;
+		dev_close(dev->netdev);
+	}
+	rtnl_unlock();
+}
+static DECLARE_WORK(close_worker, close_work);
+
+/* Take a reference on the net device that owns the layer @lyr. */
+static void chnl_hold(struct cflayer *lyr)
+{
+	struct chnl_net *owner;
+
+	owner = container_of(lyr, struct chnl_net, chnl);
+	dev_hold(owner->netdev);
+}
+
+/* Drop a reference on the net device that owns the layer @lyr. */
+static void chnl_put(struct cflayer *lyr)
+{
+	struct chnl_net *owner;
+
+	owner = container_of(lyr, struct chnl_net, chnl);
+	dev_put(owner->netdev);
+}
+
+/*
+ * Control callback of the CAIF channel.
+ *
+ * Records connection state and flow status in the chnl_net instance and
+ * stops or wakes the netdev TX queue to match.  Connect results wake up
+ * sleepers on netmgmt_wq; a remote shutdown disables TX and schedules
+ * close_worker.  Commands not listed below are ignored.
+ */
+static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
+				int phyid)
+{
+	struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
+
+	pr_debug("NET flowctrl func called flow: %s\n",
+		flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
+		flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
+		flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
+		flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" :
+		flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" :
+		flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ?
+		 "REMOTE_SHUTDOWN" : "UKNOWN CTRL COMMAND");
+
+	switch (flow) {
+	/* Connection set-up and tear-down. */
+	case CAIF_CTRLCMD_INIT_RSP:
+		caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put);
+		priv->state = CAIF_CONNECTED;
+		priv->flowenabled = true;
+		netif_wake_queue(priv->netdev);
+		wake_up_interruptible(&priv->netmgmt_wq);
+		break;
+	case CAIF_CTRLCMD_INIT_FAIL_RSP:
+		priv->state = CAIF_DISCONNECTED;
+		wake_up_interruptible(&priv->netmgmt_wq);
+		break;
+	case CAIF_CTRLCMD_DEINIT_RSP:
+		priv->state = CAIF_DISCONNECTED;
+		break;
+	case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND:
+		priv->state = CAIF_SHUTDOWN;
+		netif_tx_disable(priv->netdev);
+		schedule_work(&close_worker);
+		break;
+
+	/* Flow control. */
+	case CAIF_CTRLCMD_FLOW_ON_IND:
+		priv->flowenabled = true;
+		netif_wake_queue(priv->netdev);
+		break;
+	case CAIF_CTRLCMD_FLOW_OFF_IND:
+		priv->flowenabled = false;
+		netif_stop_queue(priv->netdev);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * ndo_start_xmit handler: send @skb down the CAIF stack.
+ *
+ * Returns NETDEV_TX_OK when the lower layer accepted the packet,
+ * NETDEV_TX_BUSY when flow is off or the lower layer answered -EAGAIN
+ * (the skb stays with the caller), -ENOSPC when the skb exceeds the MTU
+ * (the skb is freed here), or the lower layer's error code otherwise.
+ */
+static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct chnl_net *priv = netdev_priv(dev);
+	struct cfpkt *pkt;
+	int len;
+	int result;
+
+	if (skb->len > priv->netdev->mtu) {
+		pr_warn("Size of skb exceeded MTU\n");
+		/*
+		 * A negative return tells the core that the skb has been
+		 * consumed, so it has to be freed here or it is leaked.
+		 */
+		kfree_skb(skb);
+		dev->stats.tx_errors++;
+		return -ENOSPC;
+	}
+
+	if (!priv->flowenabled) {
+		pr_debug("dropping packets flow off\n");
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Loop mode sends the packet back, so swap source and destination. */
+	if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP)
+		swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+
+	/* Store original SKB length. */
+	len = skb->len;
+
+	pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb);
+
+	/* Send the packet down the stack. */
+	result = priv->chnl.dn->transmit(priv->chnl.dn, pkt);
+	if (result) {
+		/*
+		 * NOTE(review): who owns the packet after a failed transmit
+		 * depends on the layer below, which is not visible here;
+		 * confirm before freeing anything on this path.
+		 */
+		if (result == -EAGAIN)
+			result = NETDEV_TX_BUSY;
+		return result;
+	}
+
+	/* Update statistics. */
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += len;
+
+	return NETDEV_TX_OK;
+}
+
+static int chnl_net_open(struct net_device *dev)
+{	/* ndo_open: connect the CAIF channel and wait for the outcome */
+	struct chnl_net *priv = NULL;
+	int result = -1;
+	int llifindex, headroom, tailroom, mtu;
+	struct net_device *lldev;
+	ASSERT_RTNL();
+	priv = netdev_priv(dev);
+	if (!priv) {
+		pr_debug("chnl_net_open: no priv\n");
+		return -ENODEV;
+	}
+
+	if (priv->state != CAIF_CONNECTING) {	/* skipped if a connect is pending */
+		priv->state = CAIF_CONNECTING;
+		result = caif_connect_client(dev_net(dev), &priv->conn_req,
+						&priv->chnl, &llifindex,
+						&headroom, &tailroom);
+		if (result != 0) {
+				pr_debug("err: "
+					 "Unable to register and open device,"
+					 " Err:%d\n",
+					 result);
+				goto error;
+		}
+
+		lldev = dev_get_by_index(dev_net(dev), llifindex);	/* takes a ref */
+
+		if (lldev == NULL) {
+			pr_debug("no interface?\n");
+			result = -ENODEV;
+			goto error;
+		}
+
+		dev->needed_tailroom = tailroom + lldev->needed_tailroom;
+		dev->hard_header_len = headroom + lldev->hard_header_len +
+			lldev->needed_tailroom;
+
+		/*
+		 * MTU, head-room etc. are not known before we have a
+		 * CAIF link layer device available. MTU calculation may
+		 * override initial RTNL configuration.
+		 * MTU is the minimum of the current mtu, the link layer mtu
+		 * minus CAIF head and tail, and the PDP GPRS contexts max MTU.
+		 */
+		mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom));
+		mtu = min_t(int, GPRS_PDP_MTU, mtu);
+		dev_set_mtu(dev, mtu);
+		dev_put(lldev);	/* drops the dev_get_by_index() reference */
+
+		if (mtu < 100) {
+			pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
+			result = -ENODEV;
+			goto error;
+		}
+	}
+
+	rtnl_unlock();  /* Release RTNL lock during connect wait */
+
+	result = wait_event_interruptible_timeout(priv->netmgmt_wq,
+						priv->state != CAIF_CONNECTING,
+						CONNECT_TIMEOUT);
+
+	rtnl_lock();	/* re-taken: every path below returns with it held */
+
+	if (result == -ERESTARTSYS) {
+		pr_debug("wait_event_interruptible woken by a signal\n");
+		result = -ERESTARTSYS;
+		goto error;
+	}
+
+	if (result == 0) {	/* timed out while still CAIF_CONNECTING */
+		pr_debug("connect timeout\n");
+		caif_disconnect_client(dev_net(dev), &priv->chnl);
+		priv->state = CAIF_DISCONNECTED;
+		pr_debug("state disconnected\n");
+		result = -ETIMEDOUT;
+		goto error;	/* NOTE(review): error path disconnects again */
+	}
+
+	if (priv->state != CAIF_CONNECTED) {
+		pr_debug("connect failed\n");
+		result = -ECONNREFUSED;
+		goto error;
+	}
+	pr_debug("CAIF Netdevice connected\n");
+	return 0;
+
+error:	/* common failure exit: disconnect and mark the device disconnected */
+	caif_disconnect_client(dev_net(dev), &priv->chnl);
+	priv->state = CAIF_DISCONNECTED;
+	pr_debug("state disconnected\n");
+	return result;
+
+}
+
+/*
+ * ndo_stop handler: mark the device disconnected and disconnect the CAIF
+ * channel.  Always returns 0.  The rtnl lock must be held.
+ */
+static int chnl_net_stop(struct net_device *dev)
+{
+	struct chnl_net *priv = netdev_priv(dev);
+
+	ASSERT_RTNL();
+
+	priv->state = CAIF_DISCONNECTED;
+	caif_disconnect_client(dev_net(dev), &priv->chnl);
+
+	return 0;
+}
+
+/*
+ * ndo_init handler: remember the device name in the private data.
+ * Always returns 0.  The rtnl lock must be held.
+ */
+static int chnl_net_init(struct net_device *dev)
+{
+	struct chnl_net *priv;
+	ASSERT_RTNL();
+	priv = netdev_priv(dev);
+	/* strlcpy() always NUL-terminates; strncpy() does not guarantee it. */
+	strlcpy(priv->name, dev->name, sizeof(priv->name));
+	return 0;
+}
+
+/*
+ * ndo_uninit handler: take the device off chnl_net_list.  The rtnl lock
+ * must be held.
+ */
+static void chnl_net_uninit(struct net_device *dev)
+{
+	struct chnl_net *priv = netdev_priv(dev);
+
+	ASSERT_RTNL();
+	robust_list_del(&priv->list_field);
+}
+
+/* Net device callbacks of the CAIF IP interface. */
+static const struct net_device_ops netdev_ops = {
+	.ndo_init	= chnl_net_init,
+	.ndo_uninit	= chnl_net_uninit,
+	.ndo_open	= chnl_net_open,
+	.ndo_stop	= chnl_net_stop,
+	.ndo_start_xmit	= chnl_net_start_xmit,
+};
+
+/*
+ * Device destructor: release the CAIF client state of the channel
+ * layer, then free the net device itself.
+ */
+static void chnl_net_destructor(struct net_device *dev)
+{
+	struct chnl_net *priv;
+
+	priv = netdev_priv(dev);
+	caif_free_client(&priv->chnl);
+
+	free_netdev(dev);
+}
+
+/*
+ * rtnl_link setup callback: install the device callbacks and defaults,
+ * and initialise the private chnl_net data with a default datagram
+ * connect request and flow disabled.
+ */
+static void ipcaif_net_setup(struct net_device *dev)
+{
+	struct chnl_net *priv = netdev_priv(dev);
+
+	/* Net device defaults. */
+	dev->netdev_ops = &netdev_ops;
+	dev->destructor = chnl_net_destructor;
+	dev->flags |= IFF_NOARP;
+	dev->flags |= IFF_POINTOPOINT;
+	dev->mtu = GPRS_PDP_MTU;
+	dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN;
+
+	/* CAIF layer callbacks and back-pointer. */
+	priv->chnl.receive = chnl_recv_cb;
+	priv->chnl.ctrlcmd = chnl_flowctrl_cb;
+	priv->netdev = dev;
+
+	/* Default connect request. */
+	priv->conn_req.protocol = CAIFPROTO_DATAGRAM;
+	priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW;
+	priv->conn_req.priority = CAIF_PRIO_LOW;
+	/* Insert illegal value */
+	priv->conn_req.sockaddr.u.dgm.connection_id = 0;
+
+	priv->flowenabled = false;
+	init_waitqueue_head(&priv->netmgmt_wq);
+}
+
+
+/*
+ * rtnl_link fill_info callback: report the connection id (under both the
+ * IPv4 and the IPv6 attribute) and the loopback flag.  Returns 0, or
+ * -EMSGSIZE when an attribute does not fit into @skb.
+ */
+static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct chnl_net *priv = netdev_priv(dev);
+	u8 loop = (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP);
+
+	/* The NLA_PUT_* macros jump to nla_put_failure on lack of room. */
+	NLA_PUT_U32(skb, IFLA_CAIF_IPV4_CONNID,
+		    priv->conn_req.sockaddr.u.dgm.connection_id);
+	NLA_PUT_U32(skb, IFLA_CAIF_IPV6_CONNID,
+		    priv->conn_req.sockaddr.u.dgm.connection_id);
+	NLA_PUT_U8(skb, IFLA_CAIF_LOOPBACK, loop);
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/*
+ * Copy the netlink attributes present in @data into @conn_req.  Both
+ * connection-id attributes write the same field, so the IPv6 one wins
+ * when both are given.  Without @data only a warning is logged.
+ */
+static void caif_netlink_parms(struct nlattr *data[],
+				struct caif_connect_request *conn_req)
+{
+	struct nlattr *attr;
+
+	if (!data) {
+		pr_warn("no params data found\n");
+		return;
+	}
+
+	attr = data[IFLA_CAIF_IPV4_CONNID];
+	if (attr)
+		conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(attr);
+
+	attr = data[IFLA_CAIF_IPV6_CONNID];
+	if (attr)
+		conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(attr);
+
+	attr = data[IFLA_CAIF_LOOPBACK];
+	if (attr)
+		conn_req->protocol = nla_get_u8(attr) ?
+			CAIFPROTO_DATAGRAM_LOOP : CAIFPROTO_DATAGRAM;
+}
+
+/*
+ * rtnl_link newlink callback: apply the netlink parameters, register
+ * the net device and, on success, add it to chnl_net_list.  A
+ * connection id of zero is replaced by the interface index.  Returns
+ * the result of register_netdevice().  The rtnl lock must be held.
+ */
+static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[])
+{
+	struct chnl_net *caifdev = netdev_priv(dev);
+	int ret;
+
+	ASSERT_RTNL();
+
+	caif_netlink_parms(data, &caifdev->conn_req);
+	dev_net_set(caifdev->netdev, src_net);
+
+	ret = register_netdevice(dev);
+	if (ret == 0)
+		list_add(&caifdev->list_field, &chnl_net_list);
+	else
+		pr_warn("device rtml registration failed\n");
+
+	/* Take ifindex as connection-id if null */
+	if (caifdev->conn_req.sockaddr.u.dgm.connection_id == 0)
+		caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex;
+
+	return ret;
+}
+
+/*
+ * rtnl_link changelink callback: apply the netlink parameters to the
+ * connect request and signal a device state change.  Always returns 0.
+ * The rtnl lock must be held.
+ */
+static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[],
+				struct nlattr *data[])
+{
+	struct chnl_net *caifdev = netdev_priv(dev);
+
+	ASSERT_RTNL();
+
+	caif_netlink_parms(data, &caifdev->conn_req);
+	netdev_state_change(dev);
+
+	return 0;
+}
+
+/*
+ * rtnl_link get_size callback: room needed for the attributes written
+ * by ipcaif_fill_info().
+ */
+static size_t ipcaif_get_size(const struct net_device *dev)
+{
+	size_t size = 0;
+
+	size += nla_total_size(4);	/* IFLA_CAIF_IPV4_CONNID */
+	size += nla_total_size(4);	/* IFLA_CAIF_IPV6_CONNID */
+	size += nla_total_size(2);	/* IFLA_CAIF_LOOPBACK */
+
+	return size;
+}
+
+/* Expected netlink attribute types of the "caif" link kind. */
+static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = {
+	[IFLA_CAIF_LOOPBACK]	= { .type = NLA_U8 },
+	[IFLA_CAIF_IPV4_CONNID]	= { .type = NLA_U32 },
+	[IFLA_CAIF_IPV6_CONNID]	= { .type = NLA_U32 },
+};
+
+
+/* rtnl_link operations of the "caif" link kind. */
+static struct rtnl_link_ops ipcaif_link_ops __read_mostly = {
+	.kind		= "caif",
+	.priv_size	= sizeof(struct chnl_net),
+	.setup		= ipcaif_net_setup,
+	.newlink	= ipcaif_newlink,
+	.changelink	= ipcaif_changelink,
+	.maxtype	= IFLA_CAIF_MAX,
+	.policy		= ipcaif_policy,
+	.get_size	= ipcaif_get_size,
+	.fill_info	= ipcaif_fill_info,
+};
+
+/* Module init: register the "caif" rtnl link kind. */
+static int __init chnl_init_module(void)
+{
+	int ret = rtnl_link_register(&ipcaif_link_ops);
+
+	return ret;
+}
+
+/*
+ * Module exit: unregister the "caif" rtnl link kind, then, under the
+ * rtnl lock, unlink and delete every device still on chnl_net_list.
+ */
+static void __exit chnl_exit_module(void)
+{
+	struct chnl_net *dev;
+	struct chnl_net *next;
+
+	rtnl_link_unregister(&ipcaif_link_ops);
+
+	rtnl_lock();
+	list_for_each_entry_safe(dev, next, &chnl_net_list, list_field) {
+		list_del(&dev->list_field);
+		delete_device(dev);
+	}
+	rtnl_unlock();
+}
+
+module_init(chnl_init_module);
+module_exit(chnl_exit_module);
diff --git a/net/can/Kconfig b/net/can/Kconfig
new file mode 100644
index 00000000..89395b2c
--- /dev/null
+++ b/net/can/Kconfig
@@ -0,0 +1,44 @@
+#
+# Controller Area Network (CAN) network layer core configuration
+#
+
+menuconfig CAN
+	depends on NET
+	tristate "CAN bus subsystem support"
+	---help---
+	  Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
+	  communications protocol which was developed by Bosch in
+	  1991, mainly for automotive, but now widely used in marine
+	  (NMEA2000), industrial, and medical applications.
+	  More information on the CAN network protocol family PF_CAN
+	  is contained in <Documentation/networking/can.txt>.
+
+	  If you want CAN support you should say Y here and also to the
+	  specific driver for your controller(s) below.
+
+config CAN_RAW
+	tristate "Raw CAN Protocol (raw access with CAN-ID filtering)"
+	depends on CAN
+	default N
+	---help---
+	  The raw CAN protocol option offers access to the CAN bus via
+	  the BSD socket API. You probably want to use the raw socket in
+	  most cases where no higher level protocol is being used. The raw
+	  socket has several filter options e.g. ID masking / error frames.
+	  To receive/send raw CAN messages, use AF_CAN with protocol CAN_RAW.
+
+config CAN_BCM
+	tristate "Broadcast Manager CAN Protocol (with content filtering)"
+	depends on CAN
+	default N
+	---help---
+	  The Broadcast Manager offers content filtering, timeout monitoring,
+	  sending of RTR frames, and cyclic CAN messages without permanent user
+	  interaction. The BCM can be 'programmed' via the BSD socket API and
+	  informs you on demand e.g. only on content updates / timeouts.
+	  You probably want to use the bcm socket in most cases where cyclic
+	  CAN messages are used on the bus (e.g. in automotive environments).
+	  To use the Broadcast Manager, use AF_CAN with protocol CAN_BCM.
+
+
+source "drivers/net/can/Kconfig"
diff --git a/net/can/Makefile b/net/can/Makefile
new file mode 100644
index 00000000..2d3894b3
--- /dev/null
+++ b/net/can/Makefile
@@ -0,0 +1,12 @@
+#
+#  Makefile for the Linux Controller Area Network core.
+#
+
+obj-$(CONFIG_CAN)	+= can.o
+can-y			:= af_can.o proc.o
+
+obj-$(CONFIG_CAN_RAW)	+= can-raw.o
+can-raw-y		:= raw.o
+
+obj-$(CONFIG_CAN_BCM)	+= can-bcm.o
+can-bcm-y		:= bcm.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
new file mode 100644
index 00000000..094fc533
--- /dev/null
+++ b/net/can/af_can.c
@@ -0,0 +1,888 @@
+/*
+ * af_can.c - Protocol family CAN core module
+ *            (used by different CAN protocol modules)
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Send feedback to <socketcan-users@lists.berlios.de>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "af_can.h"
+
+static __initdata const char banner[] = KERN_INFO
+	"can: controller area network core (" CAN_VERSION_STRING ")\n";
+
+MODULE_DESCRIPTION("Controller Area Network PF_CAN core");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, "
+	      "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+
+MODULE_ALIAS_NETPROTO(PF_CAN);
+
+static int stats_timer __read_mostly = 1;
+module_param(stats_timer, int, S_IRUGO);
+MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)");
+
+/* receive filters subscribed for 'all' CAN devices */
+struct dev_rcv_lists can_rx_alldev_list;
+static DEFINE_SPINLOCK(can_rcvlists_lock);
+
+static struct kmem_cache *rcv_cache __read_mostly;
+
+/* table of registered CAN protocols */
+static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly;
+static DEFINE_MUTEX(proto_tab_lock);
+
+struct timer_list can_stattimer;   /* timer for statistics update */
+struct s_stats    can_stats;       /* packet statistics */
+struct s_pstats   can_pstats;      /* receive list statistics */
+
+/*
+ * af_can socket functions
+ */
+
+/*
+ * Generic ioctl handler for CAN sockets.  Only SIOCGSTAMP is handled
+ * here; every other command yields -ENOIOCTLCMD.
+ */
+int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+
+	if (cmd == SIOCGSTAMP)
+		return sock_get_timestamp(sk, (struct timeval __user *)arg);
+
+	return -ENOIOCTLCMD;
+}
+EXPORT_SYMBOL(can_ioctl);
+
+static void can_sock_destruct(struct sock *sk)
+{	/* installed as sk->sk_destruct by can_create() */
+	skb_queue_purge(&sk->sk_receive_queue);	/* drop frames still queued */
+}
+
+/*
+ * Look up the protocol registered in slot @protocol of proto_tab and pin
+ * its owning module.  Returns NULL when the slot is empty or the module
+ * reference cannot be taken.  The caller has to range-check @protocol
+ * and release the result with can_put_proto().
+ */
+static const struct can_proto *can_get_proto(int protocol)
+{
+	const struct can_proto *cp;
+
+	rcu_read_lock();
+	cp = rcu_dereference(proto_tab[protocol]);
+	if (cp != NULL) {
+		if (!try_module_get(cp->prot->owner))
+			cp = NULL;
+	}
+	rcu_read_unlock();
+
+	return cp;
+}
+
+/* Drop the module reference taken by can_get_proto(). */
+static inline void can_put_proto(const struct can_proto *cp)
+{
+	struct module *owner = cp->prot->owner;
+
+	module_put(owner);
+}
+
+/*
+ * Create a PF_CAN socket for @protocol.
+ *
+ * Looks the protocol up in proto_tab, trying to load its module if it is
+ * not registered yet, then allocates and initialises the sock.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range protocol number,
+ * -EAFNOSUPPORT outside the initial network namespace,
+ * -EPROTONOSUPPORT when no such protocol is available, -EPROTOTYPE when
+ * the socket type does not match the protocol, -ENOMEM when the sock
+ * cannot be allocated, or the error from the protocol's init hook.
+ */
+static int can_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+	const struct can_proto *cp;
+	int err = 0;
+
+	sock->state = SS_UNCONNECTED;
+
+	if (protocol < 0 || protocol >= CAN_NPROTO)
+		return -EINVAL;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	cp = can_get_proto(protocol);
+
+#ifdef CONFIG_MODULES
+	if (!cp) {
+		/*
+		 * Try to load the protocol module if the kernel is modular.
+		 * The status lives in its own variable: kept in 'err', a
+		 * non-zero value would survive a successful second lookup
+		 * and tear down the socket of a protocol without init hook.
+		 */
+		int modret = request_module("can-proto-%d", protocol);
+
+		/*
+		 * In case of error we only print a message but don't
+		 * return the error code immediately.  Below we will
+		 * return -EPROTONOSUPPORT
+		 */
+		if (modret && printk_ratelimit())
+			printk(KERN_ERR "can: request_module "
+			       "(can-proto-%d) failed.\n", protocol);
+
+		cp = can_get_proto(protocol);
+	}
+#endif
+
+	/* check for available protocol and correct usage */
+
+	if (!cp)
+		return -EPROTONOSUPPORT;
+
+	if (cp->type != sock->type) {
+		err = -EPROTOTYPE;
+		goto errout;
+	}
+
+	sock->ops = cp->ops;
+
+	sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot);
+	if (!sk) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	sock_init_data(sock, sk);
+	sk->sk_destruct = can_sock_destruct;
+
+	if (sk->sk_prot->init)
+		err = sk->sk_prot->init(sk);
+
+	if (err) {
+		/* release sk on errors */
+		sock_orphan(sk);
+		sock_put(sk);
+	}
+
+ errout:
+	/* The reference from can_get_proto() is dropped on every path. */
+	can_put_proto(cp);
+	return err;
+}
+
+/*
+ * af_can tx path
+ */
+
+/**
+ * can_send - transmit a CAN frame (optional with local loopback)
+ * @skb: pointer to socket buffer with CAN frame in data section
+ * @loop: loopback for listeners on local CAN sockets (recommended default!)
+ *
+ * Due to the loopback this routine must not be called from hardirq context.
+ *
+ * Return:
+ *  0 on success
+ *  -ENETDOWN when the selected interface is down
+ *  -ENOBUFS on full driver queue (see net_xmit_errno())
+ *  -ENOMEM when local loopback failed at calling skb_clone()
+ *  -EPERM when trying to send on a non-CAN interface
+ *  -EINVAL when the skb->data does not contain a valid CAN frame
+ */
+int can_send(struct sk_buff *skb, int loop)
+{
+	struct can_frame *cf = (struct can_frame *)skb->data;
+	struct sk_buff *newskb = NULL;
+	int err;
+
+	/* reject malformed frames and unusable interfaces */
+	if (skb->len != sizeof(struct can_frame) || cf->can_dlc > 8) {
+		err = -EINVAL;
+		goto inval_skb;
+	}
+
+	if (skb->dev->type != ARPHRD_CAN) {
+		err = -EPERM;
+		goto inval_skb;
+	}
+
+	if (!(skb->dev->flags & IFF_UP)) {
+		err = -ENETDOWN;
+		goto inval_skb;
+	}
+
+	skb->protocol = htons(ETH_P_CAN);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+
+	if (!loop) {
+		/* indication for the CAN driver: no loopback required */
+		skb->pkt_type = PACKET_HOST;
+	} else {
+		/* local loopback of sent CAN frames */
+
+		/* indication for the CAN driver: do loopback */
+		skb->pkt_type = PACKET_LOOPBACK;
+
+		/*
+		 * The reference to the originating sock may be required
+		 * by the receiving socket to check whether the frame is
+		 * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS
+		 * Therefore we have to ensure that skb->sk remains the
+		 * reference to the originating sock by restoring skb->sk
+		 * after each skb_clone() or skb_orphan() usage.
+		 */
+
+		if (!(skb->dev->flags & IFF_ECHO)) {
+			/*
+			 * If the interface is not capable to do loopback
+			 * itself, we do it here.
+			 */
+			newskb = skb_clone(skb, GFP_ATOMIC);
+			if (!newskb) {
+				err = -ENOMEM;
+				goto inval_skb;
+			}
+
+			newskb->sk = skb->sk;
+			newskb->ip_summed = CHECKSUM_UNNECESSARY;
+			newskb->pkt_type = PACKET_BROADCAST;
+		}
+	}
+
+	/* send to netdevice */
+	err = dev_queue_xmit(skb);
+	if (err > 0)
+		err = net_xmit_errno(err);
+
+	if (err) {
+		/* the echo copy is not delivered when the send failed */
+		kfree_skb(newskb);
+		return err;
+	}
+
+	if (newskb)
+		netif_rx_ni(newskb);
+
+	/* update statistics */
+	can_stats.tx_frames++;
+	can_stats.tx_frames_delta++;
+
+	return 0;
+
+inval_skb:
+	/* common exit for frames dropped before being queued */
+	kfree_skb(skb);
+	return err;
+}
+EXPORT_SYMBOL(can_send);
+
+/*
+ * af_can rx path
+ */
+
+/*
+ * Return the receive lists of @dev (kept in dev->ml_priv), or the
+ * 'all devices' lists when @dev is NULL.
+ */
+static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev)
+{
+	return dev ? (struct dev_rcv_lists *)dev->ml_priv :
+		     &can_rx_alldev_list;
+}
+
+/**
+ * find_rcv_list - determine optimal filterlist inside device filter struct
+ * @can_id: pointer to CAN identifier of a given can_filter
+ * @mask: pointer to CAN mask of a given can_filter
+ * @d: pointer to the device filter struct
+ *
+ * Description:
+ *  Returns the optimal filterlist to reduce the filter handling in the
+ *  receive path. This function is called by service functions that need
+ *  to register or unregister a can_filter in the filter lists.
+ *
+ *  A filter matches in general, when
+ *
+ *          <received_can_id> & mask == can_id & mask
+ *
+ *  so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe
+ *  relevant bits for the filter.
+ *
+ *  The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ *  filter for error frames (CAN_ERR_FLAG bit set in mask). For error frames
+ *  there is a special filterlist and a special rx path filter handling.
+ *
+ * Return:
+ *  Pointer to optimal filterlist for the given can_id/mask pair.
+ *  Consistency-checked mask.
+ *  Reduced can_id to have a preprocessed filter compare value.
+ */
+static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
+					struct dev_rcv_lists *d)
+{
+	canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */
+
+	/* filter for error frames in extra filterlist */
+	if (*mask & CAN_ERR_FLAG) {
+		/* clear CAN_ERR_FLAG in filter entry */
+		*mask &= CAN_ERR_MASK;
+		return &d->rx[RX_ERR];	/* note: *can_id is left unreduced here */
+	}
+
+	/* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */
+
+#define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG)
+
+	/* ensure valid values in can_mask for 'SFF only' frame filtering */
+	if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG))
+		*mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS);
+
+	/* reduce condition testing at receive time */
+	*can_id &= *mask;	/* must follow the mask fix-up above */
+
+	/* inverse can_id/can_mask filter */
+	if (inv)
+		return &d->rx[RX_INV];
+
+	/* mask == 0 => no condition testing at receive time */
+	if (!(*mask))
+		return &d->rx[RX_ALL];
+
+	/* extra filterlists for the subscription of a single non-RTR can_id */
+	if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) &&
+	    !(*can_id & CAN_RTR_FLAG)) {
+
+		if (*can_id & CAN_EFF_FLAG) {
+			if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) {
+				/* RFC: a future use-case for hash-tables? */
+				return &d->rx[RX_EFF];
+			}
+		} else {
+			if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS))
+				return &d->rx_sff[*can_id];	/* one list per SFF id */
+		}
+	}
+
+	/* default: filter via can_id/can_mask */
+	return &d->rx[RX_FIL];
+}
+
+/**
+ * can_rx_register - subscribe CAN frames from a specific interface
+ * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
+ * @can_id: CAN identifier (see description)
+ * @mask: CAN mask (see description)
+ * @func: callback function on filter match
+ * @data: returned parameter for callback function
+ * @ident: string for calling module identification
+ *
+ * Description:
+ *  Creates a receiver entry that invokes @func with the received sk_buff
+ *  and @data whenever a frame passes the receive filter, i.e. when
+ *
+ *          <received_can_id> & mask == can_id & mask
+ *
+ *  The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ *  filter for error frames (CAN_ERR_FLAG bit set in mask).
+ *
+ *  The sk_buff handed to the callback is guaranteed to be valid only while
+ *  the callback is running. The callback must *not* free it. If the
+ *  sk_buff is needed after the callback returned, it has to be cloned
+ *  inside the callback with skb_clone().
+ *
+ * Return:
+ *  0 on success
+ *  -ENOMEM on missing cache mem to create subscription entry
+ *  -ENODEV unknown device
+ */
+int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask,
+		    void (*func)(struct sk_buff *, void *), void *data,
+		    char *ident)
+{
+	struct dev_rcv_lists *lists;
+	struct hlist_head *head;
+	struct receiver *rcv;
+	int ret = -ENODEV;
+
+	/* a given device has to be a CAN device */
+	if (dev && dev->type != ARPHRD_CAN)
+		return -ENODEV;
+
+	/* get the entry before taking the spinlock */
+	rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
+	if (!rcv)
+		return -ENOMEM;
+
+	spin_lock(&can_rcvlists_lock);
+
+	lists = find_dev_rcv_lists(dev);
+	if (!lists) {
+		/* no receive lists for this device: drop the new entry */
+		kmem_cache_free(rcv_cache, rcv);
+		goto unlock;
+	}
+
+	/* may adjust can_id and mask, see find_rcv_list() */
+	head = find_rcv_list(&can_id, &mask, lists);
+
+	/* new receiver: (dev,canid,mask) -> (func,data) */
+	rcv->can_id  = can_id;
+	rcv->mask    = mask;
+	rcv->matches = 0;
+	rcv->func    = func;
+	rcv->data    = data;
+	rcv->ident   = ident;
+
+	hlist_add_head_rcu(&rcv->list, head);
+	lists->entries++;
+
+	/* keep track of the current and the maximum number of entries */
+	can_pstats.rcv_entries++;
+	if (can_pstats.rcv_entries_max < can_pstats.rcv_entries)
+		can_pstats.rcv_entries_max = can_pstats.rcv_entries;
+
+	ret = 0;
+
+unlock:
+	spin_unlock(&can_rcvlists_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(can_rx_register);
+
+/*
+ * can_rx_delete_receiver - rcu callback for single receiver entry removal
+ *
+ * Gives the receiver that embeds the rcu_head back to rcv_cache.
+ */
+static void can_rx_delete_receiver(struct rcu_head *rp)
+{
+	kmem_cache_free(rcv_cache, container_of(rp, struct receiver, rcu));
+}
+
+/**
+ * can_rx_unregister - unsubscribe CAN frames from a specific interface
+ * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list)
+ * @can_id: CAN identifier
+ * @mask: CAN mask
+ * @func: callback function on filter match
+ * @data: returned parameter for callback function
+ *
+ * Description:
+ *  Removes subscription entry depending on given (subscription) values.
+ *  The entry is looked up by the preprocessed can_id/mask pair together
+ *  with @func and @data, unlinked under can_rcvlists_lock and freed via
+ *  call_rcu(). If no matching receive list or entry is found, an error
+ *  is logged and nothing is removed.
+ */
+void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask,
+		       void (*func)(struct sk_buff *, void *), void *data)
+{
+	struct receiver *r = NULL;
+	struct hlist_head *rl;
+	struct hlist_node *next;
+	struct dev_rcv_lists *d;
+
+	if (dev && dev->type != ARPHRD_CAN)
+		return;
+
+	spin_lock(&can_rcvlists_lock);
+
+	d = find_dev_rcv_lists(dev);
+	if (!d) {
+		printk(KERN_ERR "BUG: receive list not found for "
+		       "dev %s, id %03X, mask %03X\n",
+		       DNAME(dev), can_id, mask);
+		goto out;
+	}
+
+	rl = find_rcv_list(&can_id, &mask, d);
+
+	/*
+	 * Search the receiver list for the item to delete.  This should
+	 * exist, since no receiver may be unregistered that hasn't
+	 * been registered before.  can_id and mask have been preprocessed
+	 * by find_rcv_list() above, exactly like at registration time.
+	 */
+
+	hlist_for_each_entry_rcu(r, next, rl, list) {
+		if (r->can_id == can_id && r->mask == mask &&
+		    r->func == func && r->data == data)
+			break;
+	}
+
+	/*
+	 * Check for bugs in CAN protocol implementations:
+	 * If no matching list item was found, the list cursor variable next
+	 * will be NULL, while r will point to the last item of the list.
+	 * r is reset to NULL in that case so nothing is freed at 'out'.
+	 */
+
+	if (!next) {
+		printk(KERN_ERR "BUG: receive list entry not found for "
+		       "dev %s, id %03X, mask %03X\n",
+		       DNAME(dev), can_id, mask);
+		r = NULL;
+		goto out;
+	}
+
+	hlist_del_rcu(&r->list);
+	d->entries--;
+
+	if (can_pstats.rcv_entries > 0)
+		can_pstats.rcv_entries--;
+
+	/*
+	 * remove device structure requested by NETDEV_UNREGISTER: the
+	 * notifier sets remove_on_zero_entries when entries were still
+	 * present, so the last unregister frees the lists here
+	 */
+	if (d->remove_on_zero_entries && !d->entries) {
+		kfree(d);
+		dev->ml_priv = NULL;
+	}
+
+ out:
+	spin_unlock(&can_rcvlists_lock);
+
+	/* schedule the receiver item for deletion (r is NULL on any error) */
+	if (r)
+		call_rcu(&r->rcu, can_rx_delete_receiver);
+}
+EXPORT_SYMBOL(can_rx_unregister);
+
+/* hand the skb to the callback of one receiver and count the delivery */
+static inline void deliver(struct sk_buff *skb, struct receiver *r)
+{
+	void (*callback)(struct sk_buff *, void *) = r->func;
+
+	callback(skb, r->data);
+	r->matches += 1;
+}
+
+/*
+ * can_rcv_filter - deliver one skb to all matching receivers of @d
+ *
+ * Walks the filter lists of @d in a fixed order and calls deliver() for
+ * every receiver whose filter matches the can_id of the frame in @skb.
+ * Returns the number of deliveries.
+ */
+static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb)
+{
+	struct can_frame *frame = (struct can_frame *)skb->data;
+	canid_t id = frame->can_id;
+	struct hlist_node *pos;
+	struct receiver *rcv;
+	int hits = 0;
+
+	/* nothing registered at all */
+	if (!d->entries)
+		return 0;
+
+	/* error frames are matched against the error filter list only */
+	if (id & CAN_ERR_FLAG) {
+		hlist_for_each_entry_rcu(rcv, pos, &d->rx[RX_ERR], list) {
+			if (!(id & rcv->mask))
+				continue;
+			deliver(skb, rcv);
+			hits++;
+		}
+		return hits;
+	}
+
+	/* receivers without any filter condition */
+	hlist_for_each_entry_rcu(rcv, pos, &d->rx[RX_ALL], list) {
+		deliver(skb, rcv);
+		hits++;
+	}
+
+	/* plain can_id/mask filters */
+	hlist_for_each_entry_rcu(rcv, pos, &d->rx[RX_FIL], list) {
+		if ((id & rcv->mask) != rcv->can_id)
+			continue;
+		deliver(skb, rcv);
+		hits++;
+	}
+
+	/* inverted can_id/mask filters */
+	hlist_for_each_entry_rcu(rcv, pos, &d->rx[RX_INV], list) {
+		if ((id & rcv->mask) == rcv->can_id)
+			continue;
+		deliver(skb, rcv);
+		hits++;
+	}
+
+	/* the single can_id lists below only hold non-RTR subscriptions */
+	if (id & CAN_RTR_FLAG)
+		return hits;
+
+	if (!(id & CAN_EFF_FLAG)) {
+		/* SFF: one list per identifier, no further compare needed */
+		id &= CAN_SFF_MASK;
+		hlist_for_each_entry_rcu(rcv, pos, &d->rx_sff[id], list) {
+			deliver(skb, rcv);
+			hits++;
+		}
+		return hits;
+	}
+
+	/* EFF: one common list, compare the complete can_id */
+	hlist_for_each_entry_rcu(rcv, pos, &d->rx[RX_EFF], list) {
+		if (rcv->can_id != id)
+			continue;
+		deliver(skb, rcv);
+		hits++;
+	}
+
+	return hits;
+}
+
+/*
+ * can_rcv - packet_type receive handler for ETH_P_CAN frames
+ *
+ * Drops frames from foreign network namespaces and non conform skbs,
+ * otherwise runs the skb through the 'all devices' filter lists and the
+ * filter lists of the receiving device. The skb is always released here.
+ */
+static int can_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct can_frame *frame = (struct can_frame *)skb->data;
+	struct dev_rcv_lists *dev_lists;
+	int hits;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	if (WARN_ONCE(dev->type != ARPHRD_CAN ||
+		      skb->len != sizeof(struct can_frame) ||
+		      frame->can_dlc > 8,
+		      "PF_CAN: dropped non conform skbuf: "
+		      "dev type %d, len %d, can_dlc %d\n",
+		      dev->type, skb->len, frame->can_dlc))
+		goto drop;
+
+	/* account the received frame */
+	can_stats.rx_frames++;
+	can_stats.rx_frames_delta++;
+
+	rcu_read_lock();
+
+	/* receivers subscribed on 'all' CAN devices come first ... */
+	hits = can_rcv_filter(&can_rx_alldev_list, skb);
+
+	/* ... followed by the receivers of this specific device */
+	dev_lists = find_dev_rcv_lists(dev);
+	if (dev_lists)
+		hits += can_rcv_filter(dev_lists, skb);
+
+	rcu_read_unlock();
+
+	/* release the skbuff allocated by the netdevice driver */
+	consume_skb(skb);
+
+	/* a frame counts once, regardless of the number of receivers */
+	if (hits > 0) {
+		can_stats.matches++;
+		can_stats.matches_delta++;
+	}
+
+	return NET_RX_SUCCESS;
+
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * af_can protocol functions
+ */
+
+/**
+ * can_proto_register - register CAN transport protocol
+ * @cp: pointer to CAN protocol structure
+ *
+ * Registers cp->prot with proto_register() and publishes @cp in the
+ * protocol table slot cp->protocol. The proto registration is undone
+ * again when the slot is already taken.
+ *
+ * Return:
+ *  0 on success
+ *  -EINVAL invalid (out of range) protocol number
+ *  -EBUSY  protocol already in use
+ *  -ENOBUFS if proto_register() fails
+ */
+int can_proto_register(const struct can_proto *cp)
+{
+	const int idx = cp->protocol;
+	int ret;
+
+	if (idx < 0 || idx >= CAN_NPROTO) {
+		printk(KERN_ERR "can: protocol number %d out of range\n",
+		       idx);
+		return -EINVAL;
+	}
+
+	ret = proto_register(cp->prot, 0);
+	if (ret < 0)
+		return ret;
+
+	mutex_lock(&proto_tab_lock);
+
+	if (!proto_tab[idx]) {
+		/* free slot: make the protocol visible to readers */
+		rcu_assign_pointer(proto_tab[idx], cp);
+	} else {
+		printk(KERN_ERR "can: protocol %d already registered\n",
+		       idx);
+		ret = -EBUSY;
+	}
+
+	mutex_unlock(&proto_tab_lock);
+
+	/* roll back proto_register() if the slot was not available */
+	if (ret < 0)
+		proto_unregister(cp->prot);
+
+	return ret;
+}
+EXPORT_SYMBOL(can_proto_register);
+
+/**
+ * can_proto_unregister - unregister CAN transport protocol
+ * @cp: pointer to CAN protocol structure
+ *
+ * Clears the protocol table slot of @cp, waits for an RCU grace period
+ * and then unregisters cp->prot. It is a BUG if the slot does not hold
+ * @cp.
+ */
+void can_proto_unregister(const struct can_proto *cp)
+{
+	const int idx = cp->protocol;
+
+	mutex_lock(&proto_tab_lock);
+	BUG_ON(proto_tab[idx] != cp);
+	rcu_assign_pointer(proto_tab[idx], NULL);
+	mutex_unlock(&proto_tab_lock);
+
+	/* wait for an RCU grace period before dropping the proto */
+	synchronize_rcu();
+
+	proto_unregister(cp->prot);
+}
+EXPORT_SYMBOL(can_proto_unregister);
+
+/*
+ * af_can notifier to create/remove CAN netdevice specific structs
+ *
+ * NETDEV_REGISTER attaches a zeroed dev_rcv_lists to dev->ml_priv,
+ * NETDEV_UNREGISTER frees it again or, while receivers are still
+ * registered, marks it for removal on the last can_rx_unregister().
+ * Always returns NOTIFY_DONE.
+ */
+static int can_notifier(struct notifier_block *nb, unsigned long msg,
+			void *data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	struct dev_rcv_lists *lists;
+
+	/* only CAN devices of the initial namespace are of interest */
+	if (!net_eq(dev_net(dev), &init_net) || dev->type != ARPHRD_CAN)
+		return NOTIFY_DONE;
+
+	if (msg == NETDEV_REGISTER) {
+
+		/* create new dev_rcv_lists for this device */
+		lists = kzalloc(sizeof(*lists), GFP_KERNEL);
+		if (!lists) {
+			printk(KERN_ERR
+			       "can: allocation of receive list failed\n");
+			return NOTIFY_DONE;
+		}
+		BUG_ON(dev->ml_priv);
+		dev->ml_priv = lists;
+
+	} else if (msg == NETDEV_UNREGISTER) {
+
+		spin_lock(&can_rcvlists_lock);
+
+		lists = dev->ml_priv;
+		if (!lists) {
+			printk(KERN_ERR "can: notifier: receive list not "
+			       "found for dev %s\n", dev->name);
+		} else if (lists->entries) {
+			/* still in use: last unregister does the kfree() */
+			lists->remove_on_zero_entries = 1;
+		} else {
+			kfree(lists);
+			dev->ml_priv = NULL;
+		}
+
+		spin_unlock(&can_rcvlists_lock);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * af_can module init/exit functions
+ *
+ * The three objects below are registered by can_init() and removed
+ * again by can_exit().
+ */
+
+static struct packet_type can_packet __read_mostly = {
+	.type = cpu_to_be16(ETH_P_CAN),
+	.dev  = NULL,
+	.func = can_rcv,	/* receive handler, see can_rcv() */
+};
+
+static const struct net_proto_family can_family_ops = {
+	.family = PF_CAN,
+	.create = can_create,
+	.owner  = THIS_MODULE,
+};
+
+/* notifier block for netdevice event, handled in can_notifier() */
+static struct notifier_block can_netdev_notifier __read_mostly = {
+	.notifier_call = can_notifier,
+};
+
+/*
+ * can_init - module init: set up the receiver cache, the optional
+ * statistics timer, procfs and register the PF_CAN infrastructure
+ *
+ * Returns 0 on success, -ENOMEM if the receiver cache cannot be created
+ * or the error code of sock_register() / register_netdevice_notifier().
+ * On failure everything that was set up before is torn down again.
+ */
+static __init int can_init(void)
+{
+	int err;
+
+	printk(banner);
+
+	memset(&can_rx_alldev_list, 0, sizeof(can_rx_alldev_list));
+
+	rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver),
+				      0, 0, NULL);
+	if (!rcv_cache)
+		return -ENOMEM;
+
+	if (stats_timer) {
+		/* the statistics are updated every second (timer triggered) */
+		setup_timer(&can_stattimer, can_stat_update, 0);
+		mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
+	} else
+		can_stattimer.function = NULL;
+
+	can_init_proc();
+
+	/* protocol register */
+	err = sock_register(&can_family_ops);
+	if (err)
+		goto out_sock;
+
+	err = register_netdevice_notifier(&can_netdev_notifier);
+	if (err)
+		goto out_notifier;
+
+	dev_add_pack(&can_packet);
+
+	return 0;
+
+out_notifier:
+	sock_unregister(PF_CAN);
+out_sock:
+	/* undo the setup steps above, same order as in can_exit() */
+	if (stats_timer)
+		del_timer_sync(&can_stattimer);
+
+	can_remove_proc();
+	kmem_cache_destroy(rcv_cache);
+
+	return err;
+}
+
+/*
+ * can_exit - module exit: stop the statistics timer, remove procfs
+ * entries, unregister the PF_CAN infrastructure and free all per device
+ * receive lists as well as the receiver cache
+ */
+static __exit void can_exit(void)
+{
+	struct net_device *dev;
+
+	/*
+	 * del_timer_sync() also waits for a timer handler that is running
+	 * on another CPU, so the handler cannot run during module unload
+	 */
+	if (stats_timer)
+		del_timer_sync(&can_stattimer);
+
+	can_remove_proc();
+
+	/* protocol unregister */
+	dev_remove_pack(&can_packet);
+	unregister_netdevice_notifier(&can_netdev_notifier);
+	sock_unregister(PF_CAN);
+
+	/* remove created dev_rcv_lists from still registered CAN devices */
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		if (dev->type == ARPHRD_CAN && dev->ml_priv) {
+
+			struct dev_rcv_lists *d = dev->ml_priv;
+
+			/* all receivers have to be unregistered by now */
+			BUG_ON(d->entries);
+			kfree(d);
+			dev->ml_priv = NULL;
+		}
+	}
+	rcu_read_unlock();
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+	kmem_cache_destroy(rcv_cache);
+}
+
+module_init(can_init);
+module_exit(can_exit);
diff --git a/net/can/af_can.h b/net/can/af_can.h
new file mode 100644
index 00000000..34253b84
--- /dev/null
+++ b/net/can/af_can.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Send feedback to <socketcan-users@lists.berlios.de>
+ *
+ */
+
+#ifndef AF_CAN_H
+#define AF_CAN_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/can.h>
+
+/* af_can rx dispatcher structures */
+
+struct receiver {
+	struct hlist_node list;	/* link in one filter list of dev_rcv_lists */
+	struct rcu_head rcu;	/* used to free the entry via call_rcu() */
+	canid_t can_id;		/* filter value as stored at registration */
+	canid_t mask;		/* filter mask as stored at registration */
+	unsigned long matches;	/* number of deliveries to this receiver */
+	void (*func)(struct sk_buff *, void *); /* callback on filter match */
+	void *data;		/* passed as second argument to func */
+	char *ident;		/* identification string of the subscriber */
+};
+
+enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_EFF, RX_MAX }; /* rx[] indices */
+
+/*
+ * per device receive filters linked at dev->ml_priv
+ *
+ * rx[] holds one list per filter type (see the enum above), rx_sff[]
+ * one list per single non-RTR SFF can_id (indexed by that can_id).
+ */
+struct dev_rcv_lists {
+	struct hlist_head rx[RX_MAX];
+	struct hlist_head rx_sff[0x800];
+	int remove_on_zero_entries; /* free struct with the last receiver */
+	int entries;		    /* number of receivers in all lists */
+};
+
+/* statistic structures */
+
+/*
+ * can be reset e.g. by can_init_stats()
+ *
+ * NOTE(review): only the rx_frames/matches counters and their deltas are
+ * written in the af_can.c receive path; the tx counters and the rate and
+ * ratio fields are presumably maintained elsewhere (af_can.c send path /
+ * proc.c) - confirm there.
+ */
+struct s_stats {
+	unsigned long jiffies_init;
+
+	unsigned long rx_frames;	/* frames accepted by can_rcv() */
+	unsigned long tx_frames;
+	unsigned long matches;		/* rx frames with >= 1 delivery */
+
+	unsigned long total_rx_rate;
+	unsigned long total_tx_rate;
+	unsigned long total_rx_match_ratio;
+
+	unsigned long current_rx_rate;
+	unsigned long current_tx_rate;
+	unsigned long current_rx_match_ratio;
+
+	unsigned long max_rx_rate;
+	unsigned long max_tx_rate;
+	unsigned long max_rx_match_ratio;
+
+	unsigned long rx_frames_delta;	/* incremented along with rx_frames */
+	unsigned long tx_frames_delta;
+	unsigned long matches_delta;	/* incremented along with matches */
+};
+
+/*
+ * persistent statistics
+ *
+ * NOTE(review): stats_reset and user_reset are not touched in the part
+ * of af_can.c that registers receivers - presumably reset counters
+ * maintained in proc.c, confirm there.
+ */
+struct s_pstats {
+	unsigned long stats_reset;
+	unsigned long user_reset;
+	unsigned long rcv_entries;	/* currently registered receivers */
+	unsigned long rcv_entries_max;	/* highest rcv_entries seen so far */
+};
+
+/* function prototypes for the CAN networklayer procfs (proc.c) */
+extern void can_init_proc(void);
+extern void can_remove_proc(void);
+extern void can_stat_update(unsigned long data);
+
+/*
+ * structures and variables from af_can.c needed in proc.c for reading
+ *
+ * NOTE(review): the receive path in af_can.c works on can_rx_alldev_list
+ * and dev->ml_priv; check whether can_rx_dev_list still has a definition
+ * and users.
+ */
+extern struct timer_list can_stattimer;    /* timer for statistics update */
+extern struct s_stats    can_stats;        /* packet statistics */
+extern struct s_pstats   can_pstats;       /* receive list statistics */
+extern struct hlist_head can_rx_dev_list;  /* rx dispatcher structures */
+
+#endif /* AF_CAN_H */
diff --git a/net/can/bcm.c b/net/can/bcm.c
new file mode 100644
index 00000000..c6cc66f7
--- /dev/null
+++ b/net/can/bcm.c
@@ -0,0 +1,1632 @@
+/*
+ * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Send feedback to <socketcan-users@lists.berlios.de>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/bcm.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+/*
+ * To send multiple CAN frame content within TX_SETUP or to filter
+ * CAN messages with multiplex index within RX_SETUP, the number of
+ * different filters is limited to 256 due to the one byte index value.
+ */
+#define MAX_NFRAMES 256
+
+/* use of last_frames[index].can_dlc */
+#define RX_RECV    0x40 /* received data for this element */
+#define RX_THR     0x80 /* element not been sent due to throttle feature */
+#define BCM_CAN_DLC_MASK 0x0F /* clean private flags in can_dlc by masking */
+
+/*
+ * get best masking value for can_rx_register() for a given single can_id:
+ * the full EFF or SFF identifier mask plus the EFF and RTR flag bits.
+ * The argument is parenthesized so that expressions can be passed safely.
+ */
+#define REGMASK(id) (((id) & CAN_EFF_FLAG) ? \
+		     (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
+		     (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
+
+#define CAN_BCM_VERSION CAN_VERSION
+static __initdata const char banner[] = KERN_INFO
+	"can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n";
+
+MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
+MODULE_ALIAS("can-proto-2");
+
+/* easy access to can_frame payload: read the data bytes as one u64 */
+static inline u64 GET_U64(const struct can_frame *cp)
+{
+	const u64 *payload = (const u64 *)cp->data;
+
+	return *payload;
+}
+
+struct bcm_op {
+	struct list_head list;	/* link in bcm_sock rx_ops or tx_ops */
+	int ifindex;		/* interface index, 0 is shown as "any" */
+	canid_t can_id;		/* reported in every bcm_msg_head */
+	u32 flags;		/* e.g. RX_CHECK_DLC, TX_COUNTEVT, ... */
+	unsigned long frames_abs, frames_filtered; /* statistics counters */
+	struct timeval ival1, ival2; /* intervals as reported to the user */
+	struct hrtimer timer, thrtimer;	/* thrtimer: rx throttle timer */
+	struct tasklet_struct tsklet, thrtsklet; /* run the timeout work */
+	ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; /* rx_stamp is
+				 * restored into skb->tstamp; kt_ival1/2 are
+				 * the ktime values used for the hrtimers;
+				 * kt_lastmsg: last throttled RX_CHANGED */
+	int rx_ifindex;		/* reported as can_ifindex to the user */
+	u32 count;		/* remaining transmissions using kt_ival1 */
+	u32 nframes;		/* number of elements in frames[] */
+	u32 currframe;		/* tx: index of the next frame to send */
+	struct can_frame *frames; /* tx: content to send, rx: bit masks */
+	struct can_frame *last_frames; /* rx: last received content */
+	struct can_frame sframe; /* presumably storage for a single frame
+				  * instead of an allocation - TODO confirm */
+	struct can_frame last_sframe; /* presumably same for last_frames */
+	struct sock *sk;	/* socket that receives the notifications */
+	struct net_device *rx_reg_dev; /* NOTE(review): not used in the
+				  * code shown here - confirm meaning */
+};
+
+static struct proc_dir_entry *proc_dir;
+
+struct bcm_sock {
+	struct sock sk;		/* first member: bcm_sk() casts from it */
+	int bound;
+	int ifindex;		/* shown as "bound" interface in procfs */
+	struct notifier_block notifier;
+	struct list_head rx_ops; /* list of rx bcm_op entries */
+	struct list_head tx_ops; /* list of tx bcm_op entries */
+	unsigned long dropped_usr_msgs; /* messages that failed to queue */
+	struct proc_dir_entry *bcm_proc_read;
+	char procname [32]; /* inode number in decimal with \0 */
+};
+
+/* get the bcm_sock that starts with the given struct sock */
+static inline struct bcm_sock *bcm_sk(const struct sock *sk)
+{
+	struct bcm_sock *bo = (struct bcm_sock *)sk;
+
+	return bo;
+}
+
+#define CFSIZ sizeof(struct can_frame)
+#define OPSIZ sizeof(struct bcm_op)
+#define MHSIZ sizeof(struct bcm_msg_head)
+
+/*
+ * procfs functions
+ */
+/*
+ * bcm_proc_getifname - get a printable interface name for an ifindex
+ *
+ * Returns the constant string "any" for ifindex 0. Otherwise the device
+ * name (or "???" if no such device exists) is copied to @result, which
+ * is returned.
+ */
+static char *bcm_proc_getifname(char *result, int ifindex)
+{
+	struct net_device *netdev;
+
+	if (ifindex == 0)
+		return "any";
+
+	/* the device pointer is only valid inside the RCU read section */
+	rcu_read_lock();
+	netdev = dev_get_by_index_rcu(&init_net, ifindex);
+	strcpy(result, netdev ? netdev->name : "???");
+	rcu_read_unlock();
+
+	return result;
+}
+
+/*
+ * bcm_proc_show - seq_file show function for one BCM socket
+ *
+ * Prints a header line for the socket followed by one line per active
+ * rx op and one line per tx op.
+ */
+static int bcm_proc_show(struct seq_file *m, void *v)
+{
+	struct sock *sk = (struct sock *)m->private;
+	struct bcm_sock *bo = bcm_sk(sk);
+	char ifname[IFNAMSIZ];
+	unsigned long reduction;
+	const char *prefix;
+	char dlc_mark;
+	struct bcm_op *op;
+
+	/* socket header line */
+	seq_printf(m, ">>> socket %pK", sk->sk_socket);
+	seq_printf(m, " / sk %pK", sk);
+	seq_printf(m, " / bo %pK", bo);
+	seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs);
+	seq_printf(m, " / bound %s", bcm_proc_getifname(ifname, bo->ifindex));
+	seq_printf(m, " <<<\n");
+
+	list_for_each_entry(op, &bo->rx_ops, list) {
+
+		/* print only active entries & prevent division by zero */
+		if (op->frames_abs == 0)
+			continue;
+
+		seq_printf(m, "rx_op: %03X %-5s ",
+				op->can_id, bcm_proc_getifname(ifname, op->ifindex));
+
+		if (op->flags & RX_CHECK_DLC)
+			dlc_mark = 'd';
+		else
+			dlc_mark = ' ';
+		seq_printf(m, "[%u]%c ", op->nframes, dlc_mark);
+
+		if (op->kt_ival1.tv64)
+			seq_printf(m, "timeo=%lld ",
+					(long long)ktime_to_us(op->kt_ival1));
+
+		if (op->kt_ival2.tv64)
+			seq_printf(m, "thr=%lld ",
+					(long long)ktime_to_us(op->kt_ival2));
+
+		seq_printf(m, "# recv %ld (%ld) => reduction: ",
+				op->frames_filtered, op->frames_abs);
+
+		/* share of received frames not passed on, in percent */
+		reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;
+
+		if (reduction == 100)
+			prefix = "near ";
+		else
+			prefix = "";
+		seq_printf(m, "%s%ld%%\n", prefix, reduction);
+	}
+
+	list_for_each_entry(op, &bo->tx_ops, list) {
+
+		seq_printf(m, "tx_op: %03X %s [%u] ",
+				op->can_id,
+				bcm_proc_getifname(ifname, op->ifindex),
+				op->nframes);
+
+		if (op->kt_ival1.tv64)
+			seq_printf(m, "t1=%lld ",
+					(long long)ktime_to_us(op->kt_ival1));
+
+		if (op->kt_ival2.tv64)
+			seq_printf(m, "t2=%lld ",
+					(long long)ktime_to_us(op->kt_ival2));
+
+		seq_printf(m, "# sent %ld\n", op->frames_abs);
+	}
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+/* open the procfs entry: its private data is passed to bcm_proc_show() */
+static int bcm_proc_open(struct inode *inode, struct file *file)
+{
+	void *private = PDE(inode)->data;
+
+	return single_open(file, bcm_proc_show, private);
+}
+
+static const struct file_operations bcm_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= bcm_proc_open, /* single_open() with bcm_proc_show */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*
+ * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface
+ *              of the given bcm tx op
+ *
+ * Nothing is sent (and no counter is touched) when the op has no target
+ * device, the device does not exist or no skb can be allocated.
+ */
+static void bcm_can_tx(struct bcm_op *op)
+{
+	struct can_frame *frame = &op->frames[op->currframe];
+	struct net_device *netdev;
+	struct sk_buff *skb;
+
+	/* no target device? => exit */
+	if (!op->ifindex)
+		return;
+
+	netdev = dev_get_by_index(&init_net, op->ifindex);
+	if (!netdev) {
+		/* RFC: should this bcm_op remove itself here? */
+		return;
+	}
+
+	skb = alloc_skb(CFSIZ, gfp_any());
+	if (skb) {
+		memcpy(skb_put(skb, CFSIZ), frame, CFSIZ);
+
+		/* send with loopback */
+		skb->dev = netdev;
+		skb->sk = op->sk;
+		can_send(skb, 1);
+
+		/* update statistics */
+		op->currframe++;
+		op->frames_abs++;
+
+		/* wrap around after the last frame */
+		if (op->currframe >= op->nframes)
+			op->currframe = 0;
+	}
+
+	/* drop the reference taken by dev_get_by_index() */
+	dev_put(netdev);
+}
+
+/*
+ * bcm_send_to_user - send a BCM message to the userspace
+ *                    (consisting of bcm_msg_head + x CAN frames)
+ *
+ * The message is silently dropped if no skb can be allocated; a failure
+ * to queue it on the socket is counted in dropped_usr_msgs.
+ */
+static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
+			     struct can_frame *frames, int has_timestamp)
+{
+	unsigned int datalen = head->nframes * CFSIZ;
+	struct sock *sk = op->sk;
+	struct sockaddr_can *addr;
+	struct can_frame *first;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
+	if (!skb)
+		return;
+
+	memcpy(skb_put(skb, sizeof(*head)), head, sizeof(*head));
+
+	if (head->nframes) {
+		/* the can_frames follow directly behind the head */
+		first = (struct can_frame *)skb_put(skb, datalen);
+		memcpy(first, frames, datalen);
+
+		/*
+		 * the BCM uses the can_dlc-element of the can_frame
+		 * structure for internal purposes. This is only
+		 * relevant for updates that are generated by the
+		 * BCM, where nframes is 1
+		 */
+		if (head->nframes == 1)
+			first->can_dlc &= BCM_CAN_DLC_MASK;
+	}
+
+	/* restore rx timestamp */
+	if (has_timestamp)
+		skb->tstamp = op->rx_stamp;
+
+	/*
+	 *  Put the datagram to the queue so that bcm_recvmsg() can
+	 *  get it from there.  We need to pass the interface index to
+	 *  bcm_recvmsg().  We pass a whole struct sockaddr_can in skb->cb
+	 *  containing the interface index.
+	 */
+
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can));
+	addr = (struct sockaddr_can *)skb->cb;
+	memset(addr, 0, sizeof(*addr));
+	addr->can_family  = AF_CAN;
+	addr->can_ifindex = op->rx_ifindex;
+
+	if (sock_queue_rcv_skb(sk, skb) >= 0)
+		return;
+
+	/* queueing failed: free the skb and account the lost message */
+	kfree_skb(skb);
+	/* don't care about overflows in this statistic */
+	bcm_sk(sk)->dropped_usr_msgs++;
+}
+
+/*
+ * bcm_tx_start_timer - (re)arm the tx timer of the op
+ *
+ * kt_ival1 is used as long as count is not zero, kt_ival2 otherwise.
+ * The timer is not started if the selected interval is zero.
+ */
+static void bcm_tx_start_timer(struct bcm_op *op)
+{
+	ktime_t interval;
+
+	if (op->kt_ival1.tv64 && op->count)
+		interval = op->kt_ival1;
+	else if (op->kt_ival2.tv64)
+		interval = op->kt_ival2;
+	else
+		return;
+
+	hrtimer_start(&op->timer,
+		      ktime_add(ktime_get(), interval),
+		      HRTIMER_MODE_ABS);
+}
+
+/*
+ * bcm_tx_timeout_tsklet - tasklet part of the cyclic CAN frame transmission
+ *
+ * Sends the next frame, generates a TX_EXPIRED notification when the
+ * counter for kt_ival1 reaches zero (and TX_COUNTEVT is set) and rearms
+ * the tx timer.
+ */
+static void bcm_tx_timeout_tsklet(unsigned long data)
+{
+	struct bcm_op *op = (struct bcm_op *)data;
+	struct bcm_msg_head msg_head;
+
+	if (op->kt_ival1.tv64 && (op->count > 0)) {
+
+		op->count--;
+		if (!op->count && (op->flags & TX_COUNTEVT)) {
+
+			/*
+			 * create notification to user: the head is copied
+			 * to userspace as a whole, so clear it first to
+			 * not leak stack content via padding bytes
+			 */
+			memset(&msg_head, 0, sizeof(msg_head));
+			msg_head.opcode  = TX_EXPIRED;
+			msg_head.flags   = op->flags;
+			msg_head.count   = op->count;
+			msg_head.ival1   = op->ival1;
+			msg_head.ival2   = op->ival2;
+			msg_head.can_id  = op->can_id;
+			msg_head.nframes = 0;
+
+			bcm_send_to_user(op, &msg_head, NULL, 0);
+		}
+		bcm_can_tx(op);
+
+	} else if (op->kt_ival2.tv64)
+		bcm_can_tx(op);
+
+	bcm_tx_start_timer(op);
+}
+
+/*
+ * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
+ *
+ * The work is deferred to the tx tasklet of the op. The timer is not
+ * restarted from here.
+ */
+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
+{
+	struct bcm_op *tx_op = container_of(hrtimer, struct bcm_op, timer);
+
+	tasklet_schedule(&tx_op->tsklet);
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * bcm_rx_changed - create a RX_CHANGED notification due to changed content
+ *
+ * Updates the filter statistics, clears the throttle mark of @data and
+ * sends @data together with a RX_CHANGED head to the user.
+ */
+static void bcm_rx_changed(struct bcm_op *op, struct can_frame *data)
+{
+	struct bcm_msg_head head;
+
+	/* update statistics */
+	op->frames_filtered++;
+
+	/* prevent statistics overflow */
+	if (op->frames_filtered > ULONG_MAX/100)
+		op->frames_filtered = op->frames_abs = 0;
+
+	/* this element is not throttled anymore */
+	data->can_dlc &= (BCM_CAN_DLC_MASK|RX_RECV);
+
+	/*
+	 * the head is copied to userspace as a whole, so clear it first
+	 * to not leak stack content via padding bytes
+	 */
+	memset(&head, 0, sizeof(head));
+	head.opcode  = RX_CHANGED;
+	head.flags   = op->flags;
+	head.count   = op->count;
+	head.ival1   = op->ival1;
+	head.ival2   = op->ival2;
+	head.can_id  = op->can_id;
+	head.nframes = 1;
+
+	bcm_send_to_user(op, &head, data, 1);
+}
+
+/*
+ * bcm_rx_update_and_send - process a detected relevant receive content change
+ *                          1. update the last received data
+ *                          2. send a notification to the user (if possible)
+ *
+ * With throttling enabled (kt_ival2 set) the notification is held back
+ * while the throttle timer runs or when the last notification was sent
+ * less than kt_ival2 ago.
+ */
+static void bcm_rx_update_and_send(struct bcm_op *op,
+				   struct can_frame *lastdata,
+				   const struct can_frame *rxdata)
+{
+	memcpy(lastdata, rxdata, CFSIZ);
+
+	/* mark as used and throttled by default */
+	lastdata->can_dlc |= (RX_RECV|RX_THR);
+
+	/* no throttling configured: tell the user immediately */
+	if (!op->kt_ival2.tv64) {
+		bcm_rx_changed(op, lastdata);
+		return;
+	}
+
+	/* throttle timer already running: the data is sent when it fires */
+	if (hrtimer_active(&op->thrtimer))
+		return;
+
+	/*
+	 * not the first reception and still inside a potential throttle
+	 * period: do not send the saved data - only start throttle timer
+	 */
+	if (op->kt_lastmsg.tv64 &&
+	    ktime_us_delta(ktime_get(), op->kt_lastmsg) <
+	    ktime_to_us(op->kt_ival2)) {
+		hrtimer_start(&op->thrtimer,
+			      ktime_add(op->kt_lastmsg, op->kt_ival2),
+			      HRTIMER_MODE_ABS);
+		return;
+	}
+
+	/* first reception or big enough gap: send and remember the time */
+	bcm_rx_changed(op, lastdata);
+	op->kt_lastmsg = ktime_get();
+}
+
+/*
+ * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
+ *                       received data stored in op->last_frames[]
+ *
+ * op->frames[index] provides the bit mask of relevant payload bits. An
+ * update is sent on first reception, on a change in the relevant bits
+ * or (with RX_CHECK_DLC) on a changed dlc.
+ */
+static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
+				const struct can_frame *rxdata)
+{
+	struct can_frame *last = &op->last_frames[index];
+	const struct can_frame *relevant = &op->frames[index];
+	int changed;
+
+	/*
+	 * no one uses the MSBs of can_dlc for comparison,
+	 * so we use it here to detect the first time of reception
+	 */
+	if (!(last->can_dlc & RX_RECV))
+		changed = 1;
+	/* do a real check in can_frame data section */
+	else if ((GET_U64(relevant) & GET_U64(rxdata)) !=
+		 (GET_U64(relevant) & GET_U64(last)))
+		changed = 1;
+	/* do a real check in can_frame dlc, if requested */
+	else
+		changed = (op->flags & RX_CHECK_DLC) &&
+			  rxdata->can_dlc != (last->can_dlc &
+					      BCM_CAN_DLC_MASK);
+
+	if (changed)
+		bcm_rx_update_and_send(op, last, rxdata);
+}
+
+/*
+ * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception
+ *
+ * Does nothing with RX_NO_AUTOTIMER set or without a timeout (kt_ival1).
+ */
+static void bcm_rx_starttimer(struct bcm_op *op)
+{
+	if (!(op->flags & RX_NO_AUTOTIMER) && op->kt_ival1.tv64)
+		hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
+}
+
+/*
+ * bcm_rx_timeout_tsklet - tasklet part of the rx timeout handling:
+ *                         send a RX_TIMEOUT notification to the user
+ */
+static void bcm_rx_timeout_tsklet(unsigned long data)
+{
+	struct bcm_op *op = (struct bcm_op *)data;
+	struct bcm_msg_head msg_head;
+
+	/*
+	 * create notification to user: the head is copied to userspace
+	 * as a whole, so clear it first to not leak stack content via
+	 * padding bytes
+	 */
+	memset(&msg_head, 0, sizeof(msg_head));
+	msg_head.opcode  = RX_TIMEOUT;
+	msg_head.flags   = op->flags;
+	msg_head.count   = op->count;
+	msg_head.ival1   = op->ival1;
+	msg_head.ival2   = op->ival2;
+	msg_head.can_id  = op->can_id;
+	msg_head.nframes = 0;
+
+	bcm_send_to_user(op, &msg_head, NULL, 0);
+}
+
+/*
+ * bcm_rx_timeout_handler - the (cyclic) CAN frame reception has timed out
+ */
+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
+{
+	struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
+
+	/* the tasklet notifies the user; schedule before NET_RX_SOFTIRQ */
+	tasklet_hi_schedule(&op->tsklet);
+
+	/* no restart here, bcm_rx_handler() re-arms on the next frame */
+
+	/* if user wants to be informed when cyclic CAN messages come back */
+	if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
+		/* wipe the stored frames (incl. RX_RECV) => 'nothing received' */
+		memset(op->last_frames, 0, op->nframes * CFSIZ);
+	}
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * bcm_rx_do_flush - per-frame helper of bcm_rx_thr_flush: 1 if RX_THR is set
+ */
+static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
+				  unsigned int index)
+{
+	/* nothing can be pending without storage or without the RX_THR mark */
+	if (!op->last_frames || !(op->last_frames[index].can_dlc & RX_THR))
+		return 0;
+	if (update)
+		bcm_rx_changed(op, &op->last_frames[index]);
+	return 1;
+}
+
+/*
+ * bcm_rx_thr_flush - count (and optionally deliver) throttled frame updates
+ *
+ * update == 0 : just check for pending throttled data      (any irq context)
+ * update == 1 : check and send throttled data to userspace (soft_irq context)
+ */
+static int bcm_rx_thr_flush(struct bcm_op *op, int update)
+{
+	unsigned int idx;
+	int pending = 0;
+
+	/* RX_FILTER_ID and the simple filter only ever use slot 0 */
+	if (op->nframes <= 1)
+		return bcm_rx_do_flush(op, update, 0);
+
+	/*
+	 * MUX filter: slot 0 is skipped here, the walk covers
+	 * slots 1 .. nframes - 1.
+	 */
+	for (idx = 1; idx < op->nframes; idx++)
+		pending += bcm_rx_do_flush(op, update, idx);
+
+	return pending;
+}
+
+static void bcm_rx_thr_tsklet(unsigned long data)
+{
+	struct bcm_op *throttled = (struct bcm_op *)data;
+
+	/* update == 1: deliver the held-back content changes to the user */
+	bcm_rx_thr_flush(throttled, 1);
+}
+
+/*
+ * bcm_rx_thr_handler - hrtimer callback run when the throttle interval is
+ *                      over: has the held-back data delivered by a tasklet
+ */
+static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
+{
+	struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
+
+	/* the actual delivery to userspace is left to the tasklet */
+	tasklet_schedule(&op->thrtsklet);
+	if (!bcm_rx_thr_flush(op, 0)) {
+		/* nothing was held back: rearm throttle handling and stop */
+		op->kt_lastmsg = ktime_set(0, 0);
+		return HRTIMER_NORESTART;
+	}
+	/* throttled data is pending: run for another kt_ival2 period */
+	hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
+	return HRTIMER_RESTART;
+}
+
+/*
+ * bcm_rx_handler - handle a CAN frame reception
+ */
+static void bcm_rx_handler(struct sk_buff *skb, void *data)
+{
+	struct bcm_op *op = (struct bcm_op *)data;
+	const struct can_frame *rxframe = (struct can_frame *)skb->data;
+	unsigned int i;
+
+	/*
+	 * Check the can_id before touching the timer: a frame that is
+	 * dropped here must not leave the timeout monitoring cancelled.
+	 */
+	if (op->can_id != rxframe->can_id)
+		return;
+
+	/* disable timeout */
+	hrtimer_cancel(&op->timer);
+
+	/* save rx timestamp and the originator for recvfrom() */
+	op->rx_stamp = skb->tstamp;
+	op->rx_ifindex = skb->dev->ifindex;
+	/* update statistics */
+	op->frames_abs++;
+
+	if (op->flags & RX_RTR_FRAME) {
+		/* send reply for RTR-request (placed in op->frames[0]) */
+		bcm_can_tx(op);
+		return;
+	}
+
+	if (op->flags & RX_FILTER_ID) {
+		/* the easiest case */
+		bcm_rx_update_and_send(op, &op->last_frames[0], rxframe);
+		goto rx_starttimer;
+	}
+
+	if (op->nframes == 1) {
+		/* simple compare with index 0 */
+		bcm_rx_cmp_to_index(op, 0, rxframe);
+		goto rx_starttimer;
+	}
+
+	if (op->nframes > 1) {
+		/*
+		 * multiplex compare: find the first multiplex mask that
+		 * fits. Remark: The MUX-mask is stored in index 0
+		 */
+		for (i = 1; i < op->nframes; i++) {
+			if ((GET_U64(&op->frames[0]) & GET_U64(rxframe)) ==
+			    (GET_U64(&op->frames[0]) &
+			     GET_U64(&op->frames[i]))) {
+				bcm_rx_cmp_to_index(op, i, rxframe);
+				break;
+			}
+		}
+	}
+
+rx_starttimer:
+	bcm_rx_starttimer(op);
+}
+
+/*
+ * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
+ */
+static struct bcm_op *bcm_find_op(struct list_head *ops, canid_t can_id,
+				  int ifindex)
+{
+	struct bcm_op *cur;
+
+	/* first entry matching both can_id and ifindex wins, else NULL */
+	list_for_each_entry(cur, ops, list)
+		if (cur->can_id == can_id && cur->ifindex == ifindex)
+			return cur;
+
+	return NULL;
+}
+
+static void bcm_remove_op(struct bcm_op *op)
+{
+	hrtimer_cancel(&op->timer);
+	hrtimer_cancel(&op->thrtimer);
+	/* tasklets are only killed if they were set up (func != NULL) */
+	if (op->tsklet.func)
+		tasklet_kill(&op->tsklet);
+
+	if (op->thrtsklet.func)
+		tasklet_kill(&op->thrtsklet);
+	/* frame arrays are freed unless they live inside the op itself */
+	if ((op->frames) && (op->frames != &op->sframe))
+		kfree(op->frames);
+
+	if ((op->last_frames) && (op->last_frames != &op->last_sframe))
+		kfree(op->last_frames);
+
+	kfree(op);
+}
+
+static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
+{
+	/* refuse to unregister from a device the op is not subscribed on */
+	if (op->rx_reg_dev != dev) {
+		printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device "
+		       "mismatch %p %p\n", op->rx_reg_dev, dev);
+		return;
+	}
+	can_rx_unregister(dev, op->can_id, REGMASK(op->can_id),
+			  bcm_rx_handler, op);
+	op->rx_reg_dev = NULL;	/* mark as removed subscription */
+}
+
+/*
+ * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops)
+ * @ops:     list of rx ops to search
+ * @can_id:  CAN identifier the op was set up with
+ * @ifindex: interface index the op was set up with (0: 'any' CAN device)
+ *
+ * Only the first op matching both can_id and ifindex is removed, so the
+ * result is either 0 (no such op) or 1.
+ */
+static int bcm_delete_rx_op(struct list_head *ops, canid_t can_id, int ifindex)
+{
+	struct bcm_op *victim = bcm_find_op(ops, can_id, ifindex);
+
+	if (!victim)
+		return 0; /* not found */
+
+	/* drop the subscription that feeds bcm_rx_handler() with frames */
+	if (!victim->ifindex) {
+		/* the op was registered without a specific device */
+		can_rx_unregister(NULL, victim->can_id,
+				  REGMASK(victim->can_id),
+				  bcm_rx_handler, victim);
+	} else if (victim->rx_reg_dev) {
+		struct net_device *dev;
+
+		/*
+		 * Only remove subscriptions that had not been removed
+		 * due to NETDEV_UNREGISTER in bcm_notifier(). When the
+		 * device cannot be looked up anymore, nothing is
+		 * unregistered here.
+		 */
+		dev = dev_get_by_index(&init_net, victim->ifindex);
+		if (dev) {
+			bcm_rx_unreg(dev, victim);
+			dev_put(dev);
+		}
+	}
+
+	/* unlink the op before it is torn down and freed */
+	list_del(&victim->list);
+	bcm_remove_op(victim);
+
+	return 1; /* done */
+}
+
+/*
+ * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
+ */
+static int bcm_delete_tx_op(struct list_head *ops, canid_t can_id, int ifindex)
+{
+	struct bcm_op *victim = bcm_find_op(ops, can_id, ifindex);
+
+	/* at most the first op matching can_id and ifindex is removed */
+	if (!victim)
+		return 0; /* not found */
+
+	/* take it off the list, then stop its timers and free it */
+	list_del(&victim->list);
+	bcm_remove_op(victim);
+
+	return 1; /* done */
+}
+
+/*
+ * bcm_read_op - report the current settings of a bcm_op (for bcm_sendmsg)
+ * Returns MHSIZ, or -EINVAL if no op matches msg_head->can_id and ifindex.
+ */
+static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
+		       int ifindex)
+{
+	struct bcm_op *found;
+
+	found = bcm_find_op(ops, msg_head->can_id, ifindex);
+	if (!found)
+		return -EINVAL;
+
+	/* msg_head is reused as reply: fill in what the op holds right now */
+	msg_head->nframes = found->nframes;
+	msg_head->ival2   = found->ival2;
+	msg_head->ival1   = found->ival1;
+	msg_head->count   = found->count;
+	msg_head->flags   = found->flags;
+	bcm_send_to_user(found, msg_head, found->frames, 0);
+	return MHSIZ;
+}
+
+/*
+ * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg)
+ *
+ * Returns nframes * CFSIZ + MHSIZ on success or a negative error code.
+ */
+static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
+			int ifindex, struct sock *sk)
+{
+	struct bcm_sock *bo = bcm_sk(sk);
+	struct bcm_op *op;
+	unsigned int i;
+	int err;
+
+	/* we need a real device to send frames */
+	if (!ifindex)
+		return -ENODEV;
+
+	/* check nframes boundaries - we need at least one can_frame */
+	if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
+		return -EINVAL;
+
+	/* check the given can_id */
+	op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex);
+
+	if (op) {
+		/* update existing BCM operation */
+
+		/*
+		 * Do we need more space for the can_frames than currently
+		 * allocated? -> This is a _really_ unusual use-case and
+		 * therefore (complexity / locking) it is not supported.
+		 */
+		if (msg_head->nframes > op->nframes)
+			return -E2BIG;
+
+		/* update can_frames content */
+		for (i = 0; i < msg_head->nframes; i++) {
+			err = memcpy_fromiovec((u8 *)&op->frames[i],
+					       msg->msg_iov, CFSIZ);
+			/* can_dlc is only meaningful if the copy succeeded */
+			if (!err && op->frames[i].can_dlc > 8)
+				err = -EINVAL;
+			if (err < 0)
+				return err;
+
+			if (msg_head->flags & TX_CP_CAN_ID) {
+				/* copy can_id into frame */
+				op->frames[i].can_id = msg_head->can_id;
+			}
+		}
+
+	} else {
+		/* insert new BCM operation for the given can_id */
+
+		op = kzalloc(OPSIZ, GFP_KERNEL);
+		if (!op)
+			return -ENOMEM;
+
+		op->can_id    = msg_head->can_id;
+
+		/* create array for can_frames and copy the data */
+		if (msg_head->nframes > 1) {
+			op->frames = kmalloc(msg_head->nframes * CFSIZ,
+					     GFP_KERNEL);
+			if (!op->frames) {
+				kfree(op);
+				return -ENOMEM;
+			}
+		} else
+			op->frames = &op->sframe;
+
+		for (i = 0; i < msg_head->nframes; i++) {
+			err = memcpy_fromiovec((u8 *)&op->frames[i],
+					       msg->msg_iov, CFSIZ);
+			/* can_dlc is only meaningful if the copy succeeded */
+			if (!err && op->frames[i].can_dlc > 8)
+				err = -EINVAL;
+			if (err < 0) {
+				if (op->frames != &op->sframe)
+					kfree(op->frames);
+				kfree(op);
+				return err;
+			}
+
+			if (msg_head->flags & TX_CP_CAN_ID) {
+				/* copy can_id into frame */
+				op->frames[i].can_id = msg_head->can_id;
+			}
+		}
+
+		/* tx_ops never compare with previous received messages */
+		op->last_frames = NULL;
+
+		/* bcm_can_tx / bcm_tx_timeout_handler needs this */
+		op->sk = sk;
+		op->ifindex = ifindex;
+
+		/* initialize uninitialized (kzalloc) structure */
+		hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		op->timer.function = bcm_tx_timeout_handler;
+
+		/* initialize tasklet for tx countevent notification */
+		tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
+			     (unsigned long) op);
+
+		/* currently unused in tx_ops */
+		hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+		/* add this bcm_op to the list of the tx_ops */
+		list_add(&op->list, &bo->tx_ops);
+
+	} /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */
+
+	if (op->nframes != msg_head->nframes) {
+		op->nframes   = msg_head->nframes;
+		/* start multiple frame transmission with index 0 */
+		op->currframe = 0;
+	}
+
+	/* check flags */
+
+	op->flags = msg_head->flags;
+
+	if (op->flags & TX_RESET_MULTI_IDX) {
+		/* start multiple frame transmission with index 0 */
+		op->currframe = 0;
+	}
+
+	if (op->flags & SETTIMER) {
+		/* set timer values */
+		op->count = msg_head->count;
+		op->ival1 = msg_head->ival1;
+		op->ival2 = msg_head->ival2;
+		op->kt_ival1 = timeval_to_ktime(msg_head->ival1);
+		op->kt_ival2 = timeval_to_ktime(msg_head->ival2);
+
+		/* disable an active timer due to zero values? */
+		if (!op->kt_ival1.tv64 && !op->kt_ival2.tv64)
+			hrtimer_cancel(&op->timer);
+	}
+
+	if (op->flags & STARTTIMER) {
+		hrtimer_cancel(&op->timer);
+		/* spec: send can_frame when starting timer */
+		op->flags |= TX_ANNOUNCE;
+	}
+
+	if (op->flags & TX_ANNOUNCE) {
+		bcm_can_tx(op);
+		if (op->count)
+			op->count--;
+	}
+
+	if (op->flags & STARTTIMER)
+		bcm_tx_start_timer(op);
+
+	return msg_head->nframes * CFSIZ + MHSIZ;
+}
+
+/*
+ * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg)
+ */
+static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
+			int ifindex, struct sock *sk)
+{
+	struct bcm_sock *bo = bcm_sk(sk);
+	struct bcm_op *op;
+	int do_rx_register;
+	int err = 0;
+
+	if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) {
+		/* be robust against wrong usage ... */
+		msg_head->flags |= RX_FILTER_ID;
+		/* ignore trailing garbage */
+		msg_head->nframes = 0;
+	}
+
+	/* the first element contains the mux-mask => MAX_NFRAMES + 1  */
+	if (msg_head->nframes > MAX_NFRAMES + 1)
+		return -EINVAL;
+	/* RTR-mode needs exactly one frame and a can_id with CAN_RTR_FLAG */
+	if ((msg_head->flags & RX_RTR_FRAME) &&
+	    ((msg_head->nframes != 1) ||
+	     (!(msg_head->can_id & CAN_RTR_FLAG))))
+		return -EINVAL;
+
+	/* is there already an rx op for this can_id / ifindex? */
+	op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex);
+	if (op) {
+		/* update existing BCM operation */
+
+		/*
+		 * Do we need more space for the can_frames than currently
+		 * allocated? -> This is a _really_ unusual use-case and
+		 * therefore (complexity / locking) it is not supported.
+		 */
+		if (msg_head->nframes > op->nframes)
+			return -E2BIG;
+
+		if (msg_head->nframes) {
+			/* update can_frames content */
+			err = memcpy_fromiovec((u8 *)op->frames,
+					       msg->msg_iov,
+					       msg_head->nframes * CFSIZ);
+			if (err < 0)
+				return err;
+
+			/* clear last_frames to indicate 'nothing received' */
+			memset(op->last_frames, 0, msg_head->nframes * CFSIZ);
+		}
+		/* nframes can only shrink or stay, see the E2BIG check above */
+		op->nframes = msg_head->nframes;
+
+		/* Only an update -> do not call can_rx_register() */
+		do_rx_register = 0;
+
+	} else {
+		/* insert new BCM operation for the given can_id */
+		op = kzalloc(OPSIZ, GFP_KERNEL);
+		if (!op)
+			return -ENOMEM;
+
+		op->can_id    = msg_head->can_id;
+		op->nframes   = msg_head->nframes;
+
+		if (msg_head->nframes > 1) {
+			/* create array for can_frames and copy the data */
+			op->frames = kmalloc(msg_head->nframes * CFSIZ,
+					     GFP_KERNEL);
+			if (!op->frames) {
+				kfree(op);
+				return -ENOMEM;
+			}
+
+			/* create and init array for received can_frames */
+			op->last_frames = kzalloc(msg_head->nframes * CFSIZ,
+						  GFP_KERNEL);
+			if (!op->last_frames) {
+				kfree(op->frames);
+				kfree(op);
+				return -ENOMEM;
+			}
+
+		} else {
+			op->frames = &op->sframe;
+			op->last_frames = &op->last_sframe;
+		}
+
+		if (msg_head->nframes) {
+			err = memcpy_fromiovec((u8 *)op->frames, msg->msg_iov,
+					       msg_head->nframes * CFSIZ);
+			if (err < 0) {
+				if (op->frames != &op->sframe)
+					kfree(op->frames);
+				if (op->last_frames != &op->last_sframe)
+					kfree(op->last_frames);
+				kfree(op);
+				return err;
+			}
+		}
+
+		/* bcm_can_tx / bcm_tx_timeout_handler needs this */
+		op->sk = sk;
+		op->ifindex = ifindex;
+
+		/* initialize uninitialized (kzalloc) structure */
+		hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		op->timer.function = bcm_rx_timeout_handler;
+
+		/* initialize tasklet for rx timeout notification */
+		tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
+			     (unsigned long) op);
+
+		hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		op->thrtimer.function = bcm_rx_thr_handler;
+
+		/* initialize tasklet for rx throttle handling */
+		tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
+			     (unsigned long) op);
+
+		/* add this bcm_op to the list of the rx_ops */
+		list_add(&op->list, &bo->rx_ops);
+
+		/* call can_rx_register() */
+		do_rx_register = 1;
+
+	} /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */
+
+	/* check flags */
+	op->flags = msg_head->flags;
+
+	if (op->flags & RX_RTR_FRAME) {
+
+		/* no timers in RTR-mode */
+		hrtimer_cancel(&op->thrtimer);
+		hrtimer_cancel(&op->timer);
+
+		/*
+		 * funny feature in RX(!)_SETUP only for RTR-mode:
+		 * copy can_id into frame BUT without RTR-flag to
+		 * prevent a full-load-loopback-test ... ;-]
+		 */
+		if ((op->flags & TX_CP_CAN_ID) ||
+		    (op->frames[0].can_id == op->can_id))
+			op->frames[0].can_id = op->can_id & ~CAN_RTR_FLAG;
+
+	} else {
+		if (op->flags & SETTIMER) {
+
+			/* set timer value */
+			op->ival1 = msg_head->ival1;
+			op->ival2 = msg_head->ival2;
+			op->kt_ival1 = timeval_to_ktime(msg_head->ival1);
+			op->kt_ival2 = timeval_to_ktime(msg_head->ival2);
+
+			/* disable an active timer due to zero value? */
+			if (!op->kt_ival1.tv64)
+				hrtimer_cancel(&op->timer);
+
+			/*
+			 * In any case cancel the throttle timer, flush
+			 * potentially blocked msgs and reset throttle handling
+			 */
+			op->kt_lastmsg = ktime_set(0, 0);
+			hrtimer_cancel(&op->thrtimer);
+			bcm_rx_thr_flush(op, 1);
+		}
+
+		if ((op->flags & STARTTIMER) && op->kt_ival1.tv64)
+			hrtimer_start(&op->timer, op->kt_ival1,
+				      HRTIMER_MODE_REL);
+	}
+
+	/* now we can register for can_ids, if we added a new bcm_op */
+	if (do_rx_register) {
+		if (ifindex) {
+			struct net_device *dev;
+
+			dev = dev_get_by_index(&init_net, ifindex);
+			if (dev) {
+				err = can_rx_register(dev, op->can_id,
+						      REGMASK(op->can_id),
+						      bcm_rx_handler, op,
+						      "bcm");
+				/* remembered for the check in bcm_rx_unreg() */
+				op->rx_reg_dev = dev;
+				dev_put(dev);
+			}
+
+		} else
+			err = can_rx_register(NULL, op->can_id,
+					      REGMASK(op->can_id),
+					      bcm_rx_handler, op, "bcm");
+		if (err) {
+			/* this bcm rx op is broken -> remove it */
+			list_del(&op->list);
+			bcm_remove_op(op);
+			return err;
+		}
+	}
+	/* success: the message head plus nframes can_frames were taken */
+	return msg_head->nframes * CFSIZ + MHSIZ;
+}
+
+/*
+ * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg)
+ */
+static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
+{
+	struct net_device *netdev;
+	struct sk_buff *pkt;
+	int ret;
+
+	/* we need a real device to send frames */
+	if (!ifindex)
+		return -ENODEV;
+
+	pkt = alloc_skb(CFSIZ, GFP_KERNEL);
+	if (!pkt)
+		return -ENOMEM;
+
+	/* the can_frame is copied from the iovec straight into the skb */
+	ret = memcpy_fromiovec(skb_put(pkt, CFSIZ), msg->msg_iov, CFSIZ);
+	if (ret < 0)
+		goto drop;
+
+	netdev = dev_get_by_index(&init_net, ifindex);
+	if (!netdev) {
+		ret = -ENODEV;
+		goto drop;
+	}
+
+	pkt->dev = netdev;
+	pkt->sk  = sk;
+	ret = can_send(pkt, 1); /* send with loopback */
+	dev_put(netdev);
+
+	if (ret)
+		return ret;
+	return CFSIZ + MHSIZ;
+drop:
+	kfree_skb(pkt);
+	return ret;
+}
+
+/*
+ * bcm_sendmsg - process BCM commands (opcodes) from the userspace
+ */
+static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct bcm_sock *bo = bcm_sk(sk);
+	int ifindex = bo->ifindex; /* default ifindex for this bcm_op */
+	struct bcm_msg_head msg_head;
+	int ret; /* read bytes or error codes as return value */
+
+	if (!bo->bound)
+		return -ENOTCONN;
+
+	/* a message is one head followed by a whole number of can_frames */
+	if (size < MHSIZ || (size - MHSIZ) % CFSIZ)
+		return -EINVAL;
+
+	/* check for alternative ifindex for this bcm_op */
+
+	if (!ifindex && msg->msg_name) {
+		/* no bound device as default => check msg_name */
+		struct sockaddr_can *addr =
+			(struct sockaddr_can *)msg->msg_name;
+
+		if (msg->msg_namelen < sizeof(*addr))
+			return -EINVAL;
+
+		if (addr->can_family != AF_CAN)
+			return -EINVAL;
+
+		/* ifindex from sendto() */
+		ifindex = addr->can_ifindex;
+
+		if (ifindex) {
+			struct net_device *dev;
+
+			dev = dev_get_by_index(&init_net, ifindex);
+			if (!dev)
+				return -ENODEV;
+
+			if (dev->type != ARPHRD_CAN) {
+				dev_put(dev);
+				return -ENODEV;
+			}
+			/* only the type check was needed, no reference is kept */
+			dev_put(dev);
+		}
+	}
+
+	/* read message head information */
+
+	ret = memcpy_fromiovec((u8 *)&msg_head, msg->msg_iov, MHSIZ);
+	if (ret < 0)
+		return ret;
+	/* every opcode below is processed under the socket lock */
+	lock_sock(sk);
+
+	switch (msg_head.opcode) {
+
+	case TX_SETUP:
+		ret = bcm_tx_setup(&msg_head, msg, ifindex, sk);
+		break;
+
+	case RX_SETUP:
+		ret = bcm_rx_setup(&msg_head, msg, ifindex, sk);
+		break;
+
+	case TX_DELETE:
+		if (bcm_delete_tx_op(&bo->tx_ops, msg_head.can_id, ifindex))
+			ret = MHSIZ;
+		else
+			ret = -EINVAL;
+		break;
+
+	case RX_DELETE:
+		if (bcm_delete_rx_op(&bo->rx_ops, msg_head.can_id, ifindex))
+			ret = MHSIZ;
+		else
+			ret = -EINVAL;
+		break;
+
+	case TX_READ:
+		/* reuse msg_head for the reply to TX_READ */
+		msg_head.opcode  = TX_STATUS;
+		ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex);
+		break;
+
+	case RX_READ:
+		/* reuse msg_head for the reply to RX_READ */
+		msg_head.opcode  = RX_STATUS;
+		ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex);
+		break;
+
+	case TX_SEND:
+		/* we need exactly one can_frame behind the msg head */
+		if ((msg_head.nframes != 1) || (size != CFSIZ + MHSIZ))
+			ret = -EINVAL;
+		else
+			ret = bcm_tx_send(msg, ifindex, sk);
+		break;
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	release_sock(sk);
+
+	return ret;
+}
+
+/*
+ * notification handler for netdevice status changes
+ */
+static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
+			void *data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier);
+	struct sock *sk = &bo->sk;
+	struct bcm_op *op;
+	int notify_enodev = 0;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+	/* events of devices that are not of type ARPHRD_CAN are ignored */
+	if (dev->type != ARPHRD_CAN)
+		return NOTIFY_DONE;
+
+	switch (msg) {
+
+	case NETDEV_UNREGISTER:
+		lock_sock(sk);
+
+		/* remove device specific receive entries */
+		list_for_each_entry(op, &bo->rx_ops, list)
+			if (op->rx_reg_dev == dev)
+				bcm_rx_unreg(dev, op);
+
+		/* remove device reference, if this is our bound device */
+		if (bo->bound && bo->ifindex == dev->ifindex) {
+			bo->bound   = 0;
+			bo->ifindex = 0;
+			notify_enodev = 1;
+		}
+
+		release_sock(sk);
+		/* the error is reported after the socket lock was dropped */
+		if (notify_enodev) {
+			sk->sk_err = ENODEV;
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_error_report(sk);
+		}
+		break;
+
+	case NETDEV_DOWN:
+		if (bo->bound && bo->ifindex == dev->ifindex) {
+			sk->sk_err = ENETDOWN;
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_error_report(sk);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * bcm_init - per-socket defaults, used as .init hook of bcm_proto
+ */
+static int bcm_init(struct sock *sk)
+{
+	struct bcm_sock *bcm = bcm_sk(sk);
+
+	/* both op lists start out empty */
+	INIT_LIST_HEAD(&bcm->rx_ops);
+	INIT_LIST_HEAD(&bcm->tx_ops);
+
+	/* not connected, nothing dropped, no procfs entry yet */
+	bcm->bcm_proc_read    = NULL;
+	bcm->dropped_usr_msgs = 0;
+	bcm->ifindex          = 0;
+	bcm->bound            = 0;
+
+	/* from now on bcm_notifier() is told about netdevice events */
+	bcm->notifier.notifier_call = bcm_notifier;
+	register_netdevice_notifier(&bcm->notifier);
+	return 0;
+}
+
+/*
+ * standard socket functions
+ */
+static int bcm_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct bcm_sock *bo;
+	struct bcm_op *op, *next;
+
+	if (sk == NULL)
+		return 0;
+
+	bo = bcm_sk(sk);
+
+	/* remove bcm_ops, timer, rx_unregister(), etc. */
+	/* the netdevice notifier set up in bcm_init() goes first */
+	unregister_netdevice_notifier(&bo->notifier);
+
+	lock_sock(sk);
+	/* tx ops have no rx subscription, bcm_remove_op() is sufficient */
+	list_for_each_entry_safe(op, next, &bo->tx_ops, list)
+		bcm_remove_op(op);
+
+	list_for_each_entry_safe(op, next, &bo->rx_ops, list) {
+		/*
+		 * Don't care if we're bound or not (due to netdev problems)
+		 * can_rx_unregister() is always a safe thing to do here.
+		 */
+		if (op->ifindex) {
+			/*
+			 * Only remove subscriptions that had not
+			 * been removed due to NETDEV_UNREGISTER
+			 * in bcm_notifier()
+			 */
+			if (op->rx_reg_dev) {
+				struct net_device *dev;
+
+				dev = dev_get_by_index(&init_net, op->ifindex);
+				if (dev) {
+					bcm_rx_unreg(dev, op);
+					dev_put(dev);
+				}
+			}
+		} else
+			can_rx_unregister(NULL, op->can_id,
+					  REGMASK(op->can_id),
+					  bcm_rx_handler, op);
+
+		bcm_remove_op(op);
+	}
+
+	/* remove procfs entry */
+	if (proc_dir && bo->bcm_proc_read)
+		remove_proc_entry(bo->procname, proc_dir);
+
+	/* remove device reference */
+	if (bo->bound) {
+		bo->bound   = 0;
+		bo->ifindex = 0;
+	}
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
+		       int flags)
+{
+	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+	struct sock *sk = sock->sk;
+	struct bcm_sock *bo = bcm_sk(sk);
+	int ret = 0;
+
+	if (len < sizeof(*addr))
+		return -EINVAL;
+
+	/* 'bound' is tested and set under the socket lock (racing connects) */
+	lock_sock(sk);
+	if (bo->bound) {
+		ret = -EISCONN;
+		goto out;
+	}
+	/* bind a device to this socket */
+	if (addr->can_ifindex) {
+		struct net_device *dev;
+
+		dev = dev_get_by_index(&init_net, addr->can_ifindex);
+		if (!dev || dev->type != ARPHRD_CAN) {
+			if (dev)
+				dev_put(dev);
+			ret = -ENODEV;
+			goto out;
+		}
+		bo->ifindex = dev->ifindex;
+		dev_put(dev);
+	} else {
+		/* no interface reference for ifindex = 0 ('any' CAN device) */
+		bo->ifindex = 0;
+	}
+	bo->bound = 1;
+	if (proc_dir) {
+		/* the socket inode number serves as unique filename */
+		sprintf(bo->procname, "%lu", sock_i_ino(sk));
+		bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
+						     proc_dir,
+						     &bcm_proc_fops, sk);
+	}
+out:
+	release_sock(sk);
+	return ret;
+}
+
+static int bcm_recvmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int noblock = flags & MSG_DONTWAIT;
+	struct sk_buff *skb;
+	int error = 0;
+	int ret;
+
+	/* MSG_DONTWAIT is passed on as separate 'noblock' argument */
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, noblock, &error);
+	if (!skb)
+		return error;
+
+	/* never copy more than the queued message holds */
+	if (size > skb->len)
+		size = skb->len;
+
+	ret = memcpy_toiovec(msg->msg_iov, skb->data, size);
+	if (ret < 0)
+		goto out_free;
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	/* the address kept in the skb control buffer goes to msg_name */
+	if (msg->msg_name) {
+		msg->msg_namelen = sizeof(struct sockaddr_can);
+		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+	}
+	ret = size;
+
+out_free:
+	skb_free_datagram(sk, skb);
+	return ret;
+}
+
+static const struct proto_ops bcm_ops = {
+	.family        = PF_CAN,
+	.connect       = bcm_connect,
+	.release       = bcm_release,
+	.sendmsg       = bcm_sendmsg,
+	.recvmsg       = bcm_recvmsg,
+	.poll          = datagram_poll,
+	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
+	.bind          = sock_no_bind,	/* from here on: sock_no_*() stubs */
+	.socketpair    = sock_no_socketpair,
+	.accept        = sock_no_accept,
+	.getname       = sock_no_getname,
+	.listen        = sock_no_listen,
+	.shutdown      = sock_no_shutdown,
+	.setsockopt    = sock_no_setsockopt,
+	.getsockopt    = sock_no_getsockopt,
+	.mmap          = sock_no_mmap,
+	.sendpage      = sock_no_sendpage,
+};
+
+static struct proto bcm_proto __read_mostly = {
+	.owner      = THIS_MODULE,
+	.name       = "CAN_BCM",
+	.init       = bcm_init,	/* per-socket setup, see bcm_init() */
+	.obj_size   = sizeof(struct bcm_sock),
+};
+
+static const struct can_proto bcm_can_proto = {
+	.protocol   = CAN_BCM,
+	.type       = SOCK_DGRAM,
+	.prot       = &bcm_proto,	/* struct proto with the init hook */
+	.ops        = &bcm_ops,		/* socket operations table */
+};
+
+static int __init bcm_module_init(void)
+{
+	int err;
+
+	printk(banner);
+
+	err = can_proto_register(&bcm_can_proto);
+	if (err >= 0) {
+		/* create /proc/net/can-bcm directory; a failure is tolerated */
+		proc_dir = proc_mkdir("can-bcm", init_net.proc_net);
+		return 0;
+	}
+
+	printk(KERN_ERR "can: registration of bcm protocol failed\n");
+	return err;
+}
+
+static void __exit bcm_module_exit(void)
+{
+	can_proto_unregister(&bcm_can_proto);
+	if (!proc_dir)
+		return;	/* no /proc/net/can-bcm directory was created */
+	proc_net_remove(&init_net, "can-bcm");
+}
+
+module_init(bcm_module_init);	/* protocol registration + procfs dir */
+module_exit(bcm_module_exit);	/* and their removal again */
diff --git a/net/can/proc.c b/net/can/proc.c
new file mode 100644
index 00000000..0016f733
--- /dev/null
+++ b/net/can/proc.c
@@ -0,0 +1,540 @@
+/*
+ * proc.c - procfs support for Protocol family CAN core module
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Send feedback to <socketcan-users@lists.berlios.de>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/if_arp.h>
+#include <linux/can/core.h>
+
+#include "af_can.h"
+
+/*
+ * proc filenames for the PF_CAN core
+ */
+
+#define CAN_PROC_VERSION     "version"
+#define CAN_PROC_STATS       "stats"
+#define CAN_PROC_RESET_STATS "reset_stats"
+#define CAN_PROC_RCVLIST_ALL "rcvlist_all"
+#define CAN_PROC_RCVLIST_FIL "rcvlist_fil"
+#define CAN_PROC_RCVLIST_INV "rcvlist_inv"
+#define CAN_PROC_RCVLIST_SFF "rcvlist_sff"
+#define CAN_PROC_RCVLIST_EFF "rcvlist_eff"
+#define CAN_PROC_RCVLIST_ERR "rcvlist_err"
+
+static struct proc_dir_entry *can_dir;
+static struct proc_dir_entry *pde_version;
+static struct proc_dir_entry *pde_stats;
+static struct proc_dir_entry *pde_reset_stats;
+static struct proc_dir_entry *pde_rcvlist_all;
+static struct proc_dir_entry *pde_rcvlist_fil;
+static struct proc_dir_entry *pde_rcvlist_inv;
+static struct proc_dir_entry *pde_rcvlist_sff;
+static struct proc_dir_entry *pde_rcvlist_eff;
+static struct proc_dir_entry *pde_rcvlist_err;
+
+static int user_reset;
+
+static const char rx_list_name[][8] = {
+	[RX_ERR] = "rx_err",
+	[RX_ALL] = "rx_all",
+	[RX_FIL] = "rx_fil",
+	[RX_INV] = "rx_inv",
+	[RX_EFF] = "rx_eff",
+};
+
+/* receive filters subscribed for 'all' CAN devices */
+extern struct dev_rcv_lists can_rx_alldev_list;
+
+/*
+ * af_can statistics stuff
+ */
+
+static void can_init_stats(void)
+{
+	/*
+	 * This memset function is called from a timer context (when
+	 * can_stattimer is active which is the default) OR in a process
+	 * context (reading the proc_fs when can_stattimer is disabled).
+	 */
+	memset(&can_stats, 0, sizeof(can_stats));
+	can_stats.jiffies_init = jiffies;
+
+	can_pstats.stats_reset++;
+
+	if (user_reset) {
+		user_reset = 0;
+		can_pstats.user_reset++;
+	}
+}
+
+/*
+ * Return 'count' events per second over the jiffies span oldjif..newjif,
+ * 0 for an empty span, or 99999999 if count * HZ would overflow.
+ */
+static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif,
+			       unsigned long count)
+{
+	if (oldjif == newjif)
+		return 0;
+
+	/* see can_stat_update() - this should NEVER happen! */
+	if (count > (ULONG_MAX / HZ)) {
+		printk(KERN_ERR "can: calc_rate: count exceeded! %lu\n",
+		       count);
+		return 99999999;
+	}
+
+	return (count * HZ) / (newjif - oldjif);
+}
+
+void can_stat_update(unsigned long data)
+{
+	unsigned long j = jiffies; /* snapshot */
+
+	/* restart counting in timer context on user request */
+	if (user_reset)
+		can_init_stats();
+
+	/* restart counting on jiffies overflow */
+	if (j < can_stats.jiffies_init)
+		can_init_stats();
+
+	/* prevent overflow in calc_rate() */
+	if (can_stats.rx_frames > (ULONG_MAX / HZ))
+		can_init_stats();
+
+	/* prevent overflow in calc_rate() */
+	if (can_stats.tx_frames > (ULONG_MAX / HZ))
+		can_init_stats();
+
+	/* matches overflow - very improbable */
+	if (can_stats.matches > (ULONG_MAX / 100))
+		can_init_stats();
+
+	/* calc total values */
+	if (can_stats.rx_frames)
+		can_stats.total_rx_match_ratio = (can_stats.matches * 100) /
+			can_stats.rx_frames;
+
+	can_stats.total_tx_rate = calc_rate(can_stats.jiffies_init, j,
+					    can_stats.tx_frames);
+	can_stats.total_rx_rate = calc_rate(can_stats.jiffies_init, j,
+					    can_stats.rx_frames);
+
+	/* calc current values */
+	if (can_stats.rx_frames_delta)
+		can_stats.current_rx_match_ratio =
+			(can_stats.matches_delta * 100) /
+			can_stats.rx_frames_delta;
+
+	can_stats.current_tx_rate = calc_rate(0, HZ, can_stats.tx_frames_delta);
+	can_stats.current_rx_rate = calc_rate(0, HZ, can_stats.rx_frames_delta);
+
+	/* check / update maximum values */
+	if (can_stats.max_tx_rate < can_stats.current_tx_rate)
+		can_stats.max_tx_rate = can_stats.current_tx_rate;
+
+	if (can_stats.max_rx_rate < can_stats.current_rx_rate)
+		can_stats.max_rx_rate = can_stats.current_rx_rate;
+
+	if (can_stats.max_rx_match_ratio < can_stats.current_rx_match_ratio)
+		can_stats.max_rx_match_ratio = can_stats.current_rx_match_ratio;
+
+	/* clear values for 'current rate' calculation */
+	can_stats.tx_frames_delta = 0;
+	can_stats.rx_frames_delta = 0;
+	can_stats.matches_delta   = 0;
+
+	/* restart timer (one second) */
+	mod_timer(&can_stattimer, round_jiffies(jiffies + HZ));
+}
+
+/*
+ * proc read functions
+ */
+
+static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list,
+			      struct net_device *dev)
+{
+	struct receiver *r;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_rcu(r, n, rx_list, list) {
+		char *fmt = (r->can_id & CAN_EFF_FLAG)?
+			"   %-5s  %08x  %08x  %pK  %pK  %8ld  %s\n" :
+			"   %-5s     %03x    %08x  %pK  %pK  %8ld  %s\n";
+
+		seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask,
+				r->func, r->data, r->matches, r->ident);
+	}
+}
+
+static void can_print_recv_banner(struct seq_file *m)
+{
+	/*
+	 *                  can1.  00000000  00000000  00000000
+	 *                 .......          0  tp20
+	 */
+	seq_puts(m, "  device   can_id   can_mask  function"
+			"  userdata   matches  ident\n");
+}
+
+static int can_stats_proc_show(struct seq_file *m, void *v)
+{
+	seq_putc(m, '\n');
+	seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats.tx_frames);
+	seq_printf(m, " %8ld received frames (RXF)\n", can_stats.rx_frames);
+	seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats.matches);
+
+	seq_putc(m, '\n');
+
+	if (can_stattimer.function == can_stat_update) {
+		seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
+				can_stats.total_rx_match_ratio);
+
+		seq_printf(m, " %8ld frames/s total tx rate (TXR)\n",
+				can_stats.total_tx_rate);
+		seq_printf(m, " %8ld frames/s total rx rate (RXR)\n",
+				can_stats.total_rx_rate);
+
+		seq_putc(m, '\n');
+
+		seq_printf(m, " %8ld %% current match ratio (CRXMR)\n",
+				can_stats.current_rx_match_ratio);
+
+		seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n",
+				can_stats.current_tx_rate);
+		seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n",
+				can_stats.current_rx_rate);
+
+		seq_putc(m, '\n');
+
+		seq_printf(m, " %8ld %% max match ratio (MRXMR)\n",
+				can_stats.max_rx_match_ratio);
+
+		seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n",
+				can_stats.max_tx_rate);
+		seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n",
+				can_stats.max_rx_rate);
+
+		seq_putc(m, '\n');
+	}
+
+	seq_printf(m, " %8ld current receive list entries (CRCV)\n",
+			can_pstats.rcv_entries);
+	seq_printf(m, " %8ld maximum receive list entries (MRCV)\n",
+			can_pstats.rcv_entries_max);
+
+	if (can_pstats.stats_reset)
+		seq_printf(m, "\n %8ld statistic resets (STR)\n",
+				can_pstats.stats_reset);
+
+	if (can_pstats.user_reset)
+		seq_printf(m, " %8ld user statistic resets (USTR)\n",
+				can_pstats.user_reset);
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int can_stats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, can_stats_proc_show, NULL);
+}
+
+static const struct file_operations can_stats_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= can_stats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Request a stats reset; the stats timer performs it when it is active. */
+static int can_reset_stats_proc_show(struct seq_file *m, void *v)
+{
+	user_reset = 1;
+
+	if (can_stattimer.function != can_stat_update) {
+		/* no stats timer: reset right here in process context */
+		if (can_stats.jiffies_init != jiffies)
+			can_init_stats();
+		seq_printf(m, "Performed statistic reset #%ld.\n",
+				can_pstats.stats_reset);
+		return 0;
+	}
+	seq_printf(m, "Scheduled statistic reset #%ld.\n",
+			can_pstats.stats_reset + 1);
+	return 0;
+}
+
+static int can_reset_stats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, can_reset_stats_proc_show, NULL);
+}
+
+static const struct file_operations can_reset_stats_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= can_reset_stats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int can_version_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%s\n", CAN_VERSION_STRING);
+	return 0;
+}
+
+static int can_version_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, can_version_proc_show, NULL);
+}
+
+static const struct file_operations can_version_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= can_version_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
+					     struct net_device *dev,
+					     struct dev_rcv_lists *d)
+{
+	if (!hlist_empty(&d->rx[idx])) {
+		can_print_recv_banner(m);
+		can_print_rcvlist(m, &d->rx[idx], dev);
+	} else
+		seq_printf(m, "  (%s: no entry)\n", DNAME(dev));
+
+}
+
+static int can_rcvlist_proc_show(struct seq_file *m, void *v)
+{
+	/* double cast to prevent GCC warning */
+	int idx = (int)(long)m->private;
+	struct net_device *dev;
+	struct dev_rcv_lists *d;
+
+	seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]);
+
+	rcu_read_lock();
+
+	/* receive list for 'all' CAN devices (dev == NULL) */
+	d = &can_rx_alldev_list;
+	can_rcvlist_proc_show_one(m, idx, NULL, d);
+
+	/* receive list for registered CAN devices */
+	for_each_netdev_rcu(&init_net, dev) {
+		if (dev->type == ARPHRD_CAN && dev->ml_priv)
+			can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv);
+	}
+
+	rcu_read_unlock();
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int can_rcvlist_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, can_rcvlist_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations can_rcvlist_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= can_rcvlist_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Print all non-empty SFF receive lists of one device, or a placeholder. */
+static inline void can_rcvlist_sff_proc_show_one(struct seq_file *m,
+						 struct net_device *dev,
+						 struct dev_rcv_lists *d)
+{
+	int i;
+	int found = 0;
+
+	/* check whether at least one list is non-empty */
+	for (i = 0; i < 0x800 && !found; i++)
+		found = !hlist_empty(&d->rx_sff[i]);
+
+	if (!found) {
+		seq_printf(m, "  (%s: no entry)\n", DNAME(dev));
+		return;
+	}
+
+	can_print_recv_banner(m);
+	for (i = 0; i < 0x800; i++) {
+		if (!hlist_empty(&d->rx_sff[i]))
+			can_print_rcvlist(m, &d->rx_sff[i], dev);
+	}
+}
+
+static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
+{
+	struct net_device *dev;
+	struct dev_rcv_lists *d;
+
+	/* RX_SFF */
+	seq_puts(m, "\nreceive list 'rx_sff':\n");
+
+	rcu_read_lock();
+
+	/* sff receive list for 'all' CAN devices (dev == NULL) */
+	d = &can_rx_alldev_list;
+	can_rcvlist_sff_proc_show_one(m, NULL, d);
+
+	/* sff receive list for registered CAN devices */
+	for_each_netdev_rcu(&init_net, dev) {
+		if (dev->type == ARPHRD_CAN && dev->ml_priv)
+			can_rcvlist_sff_proc_show_one(m, dev, dev->ml_priv);
+	}
+
+	rcu_read_unlock();
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, can_rcvlist_sff_proc_show, NULL);
+}
+
+static const struct file_operations can_rcvlist_sff_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= can_rcvlist_sff_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/*
+ * proc utility functions
+ */
+
+static void can_remove_proc_readentry(const char *name)
+{
+	if (can_dir)
+		remove_proc_entry(name, can_dir);
+}
+
+/*
+ * can_init_proc - create main CAN proc directory and procfs entries
+ */
+void can_init_proc(void)
+{
+	/* create /proc/net/can directory */
+	can_dir = proc_mkdir("can", init_net.proc_net);
+
+	if (!can_dir) {
+		printk(KERN_INFO "can: failed to create /proc/net/can . "
+		       "CONFIG_PROC_FS missing?\n");
+		return;
+	}
+
+	/* own procfs entries from the AF_CAN core */
+	pde_version     = proc_create(CAN_PROC_VERSION, 0644, can_dir,
+				      &can_version_proc_fops);
+	pde_stats       = proc_create(CAN_PROC_STATS, 0644, can_dir,
+				      &can_stats_proc_fops);
+	pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, can_dir,
+				      &can_reset_stats_proc_fops);
+	pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, can_dir,
+					   &can_rcvlist_proc_fops, (void *)RX_ERR);
+	pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, can_dir,
+					   &can_rcvlist_proc_fops, (void *)RX_ALL);
+	pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, can_dir,
+					   &can_rcvlist_proc_fops, (void *)RX_FIL);
+	pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, can_dir,
+					   &can_rcvlist_proc_fops, (void *)RX_INV);
+	pde_rcvlist_eff = proc_create_data(CAN_PROC_RCVLIST_EFF, 0644, can_dir,
+					   &can_rcvlist_proc_fops, (void *)RX_EFF);
+	pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, can_dir,
+				      &can_rcvlist_sff_proc_fops);
+}
+
+/*
+ * can_remove_proc - remove procfs entries and main CAN proc directory
+ */
+void can_remove_proc(void)
+{
+	if (pde_version)
+		can_remove_proc_readentry(CAN_PROC_VERSION);
+
+	if (pde_stats)
+		can_remove_proc_readentry(CAN_PROC_STATS);
+
+	if (pde_reset_stats)
+		can_remove_proc_readentry(CAN_PROC_RESET_STATS);
+
+	if (pde_rcvlist_err)
+		can_remove_proc_readentry(CAN_PROC_RCVLIST_ERR);
+
+	if (pde_rcvlist_all)
+		can_remove_proc_readentry(CAN_PROC_RCVLIST_ALL);
+
+	if (pde_rcvlist_fil)
+		can_remove_proc_readentry(CAN_PROC_RCVLIST_FIL);
+
+	if (pde_rcvlist_inv)
+		can_remove_proc_readentry(CAN_PROC_RCVLIST_INV);
+
+	if (pde_rcvlist_eff)
+		can_remove_proc_readentry(CAN_PROC_RCVLIST_EFF);
+
+	if (pde_rcvlist_sff)
+		can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF);
+
+	if (can_dir)
+		proc_net_remove(&init_net, "can");
+}
diff --git a/net/can/raw.c b/net/can/raw.c
new file mode 100644
index 00000000..dea99a6e
--- /dev/null
+++ b/net/can/raw.c
@@ -0,0 +1,803 @@
+/*
+ * raw.c - Raw sockets for protocol family CAN
+ *
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Send feedback to <socketcan-users@lists.berlios.de>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/raw.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+#define CAN_RAW_VERSION CAN_VERSION
+static __initconst const char banner[] =	/* const: .init.rodata */
+	KERN_INFO "can: raw protocol (rev " CAN_RAW_VERSION ")\n";
+
+MODULE_DESCRIPTION("PF_CAN raw protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>");
+MODULE_ALIAS("can-proto-1");
+
+#define MASK_ALL 0
+
+/*
+ * A raw socket has a list of can_filters attached to it, each receiving
+ * the CAN frames matching that filter.  If the filter list is empty,
+ * no CAN frames will be received by the socket.  The default after
+ * opening the socket, is to have one filter which receives all frames.
+ * The filter list is allocated dynamically with the exception of the
+ * list containing only one item.  This common case is optimized by
+ * storing the single filter in dfilter, to avoid using dynamic memory.
+ */
+
+struct raw_sock {
+	struct sock sk;            /* must be first: raw_sk() casts */
+	int bound;                 /* 1 while bound via raw_bind() */
+	int ifindex;               /* bound interface, 0 = all CAN devices */
+	struct notifier_block notifier; /* calls raw_notifier() */
+	int loopback;              /* passed to can_send() on transmit */
+	int recv_own_msgs;         /* 0: drop frames this socket sent */
+	int count;                 /* number of active filters */
+	struct can_filter dfilter; /* default/single filter */
+	struct can_filter *filter; /* pointer to filter(s) */
+	can_err_mask_t err_mask;   /* see raw_enable_errfilter() */
+};
+
+/*
+ * Return pointer to store the extra msg flags for raw_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *raw_flags(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) +
+					 sizeof(unsigned int)));
+
+	/* return pointer after struct sockaddr_can */
+	return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
+static inline struct raw_sock *raw_sk(const struct sock *sk)
+{
+	return (struct raw_sock *)sk;
+}
+
+static void raw_rcv(struct sk_buff *oskb, void *data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct raw_sock *ro = raw_sk(sk);
+	struct sockaddr_can *addr;
+	struct sk_buff *skb;
+	unsigned int *pflags;
+
+	/* check the received tx sock reference */
+	if (!ro->recv_own_msgs && oskb->sk == sk)
+		return;
+
+	/* clone the given skb to be able to enqueue it into the rcv queue */
+	skb = skb_clone(oskb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/*
+	 *  Put the datagram to the queue so that raw_recvmsg() can
+	 *  get it from there.  We need to pass the interface index to
+	 *  raw_recvmsg().  We pass a whole struct sockaddr_can in skb->cb
+	 *  containing the interface index.
+	 */
+
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can));
+	addr = (struct sockaddr_can *)skb->cb;
+	memset(addr, 0, sizeof(*addr));
+	addr->can_family  = AF_CAN;
+	addr->can_ifindex = skb->dev->ifindex;
+
+	/* add CAN specific message flags for raw_recvmsg() */
+	pflags = raw_flags(skb);
+	*pflags = 0;
+	if (oskb->sk)
+		*pflags |= MSG_DONTROUTE;
+	if (oskb->sk == sk)
+		*pflags |= MSG_CONFIRM;
+
+	if (sock_queue_rcv_skb(sk, skb) < 0)
+		kfree_skb(skb);
+}
+
+/* Register 'count' filters for sk on dev; undo them all if one fails. */
+static int raw_enable_filters(struct net_device *dev, struct sock *sk,
+			      struct can_filter *filter, int count)
+{
+	int n, err;
+
+	for (n = 0; n < count; n++) {
+		err = can_rx_register(dev, filter[n].can_id,
+				      filter[n].can_mask,
+				      raw_rcv, sk, "raw");
+		if (!err)
+			continue;
+
+		/* roll back the registrations made before the failure */
+		while (--n >= 0)
+			can_rx_unregister(dev, filter[n].can_id,
+					  filter[n].can_mask,
+					  raw_rcv, sk);
+		return err;
+	}
+	return 0;
+}
+
+static int raw_enable_errfilter(struct net_device *dev, struct sock *sk,
+				can_err_mask_t err_mask)
+{
+	int err = 0;
+
+	if (err_mask)
+		err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG,
+				      raw_rcv, sk, "raw");
+
+	return err;
+}
+
+static void raw_disable_filters(struct net_device *dev, struct sock *sk,
+			      struct can_filter *filter, int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		can_rx_unregister(dev, filter[i].can_id, filter[i].can_mask,
+				  raw_rcv, sk);
+}
+
+/* Unregister sk's error filter on dev; nothing to do for an empty mask. */
+static inline void raw_disable_errfilter(struct net_device *dev,
+					 struct sock *sk,
+					 can_err_mask_t err_mask)
+{
+	if (err_mask)
+		can_rx_unregister(dev, 0, err_mask | CAN_ERR_FLAG,
+				  raw_rcv, sk);
+}
+
+static inline void raw_disable_allfilters(struct net_device *dev,
+					  struct sock *sk)
+{
+	struct raw_sock *ro = raw_sk(sk);
+
+	raw_disable_filters(dev, sk, ro->filter, ro->count);
+	raw_disable_errfilter(dev, sk, ro->err_mask);
+}
+
+static int raw_enable_allfilters(struct net_device *dev, struct sock *sk)
+{
+	struct raw_sock *ro = raw_sk(sk);
+	int err;
+
+	err = raw_enable_filters(dev, sk, ro->filter, ro->count);
+	if (!err) {
+		err = raw_enable_errfilter(dev, sk, ro->err_mask);
+		if (err)
+			raw_disable_filters(dev, sk, ro->filter, ro->count);
+	}
+
+	return err;
+}
+
+static int raw_notifier(struct notifier_block *nb,
+			unsigned long msg, void *data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	struct raw_sock *ro = container_of(nb, struct raw_sock, notifier);
+	struct sock *sk = &ro->sk;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	if (dev->type != ARPHRD_CAN)
+		return NOTIFY_DONE;
+
+	if (ro->ifindex != dev->ifindex)
+		return NOTIFY_DONE;
+
+	switch (msg) {
+
+	case NETDEV_UNREGISTER:
+		lock_sock(sk);
+		/* remove current filters & unregister */
+		if (ro->bound)
+			raw_disable_allfilters(dev, sk);
+
+		if (ro->count > 1)
+			kfree(ro->filter);
+
+		ro->ifindex = 0;
+		ro->bound   = 0;
+		ro->count   = 0;
+		release_sock(sk);
+
+		sk->sk_err = ENODEV;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_error_report(sk);
+		break;
+
+	case NETDEV_DOWN:
+		sk->sk_err = ENETDOWN;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_error_report(sk);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int raw_init(struct sock *sk)
+{
+	struct raw_sock *ro = raw_sk(sk);
+
+	ro->bound            = 0;
+	ro->ifindex          = 0;
+
+	/* set default filter to single entry dfilter */
+	ro->dfilter.can_id   = 0;
+	ro->dfilter.can_mask = MASK_ALL;
+	ro->filter           = &ro->dfilter;
+	ro->count            = 1;
+
+	/* set default loopback behaviour */
+	ro->loopback         = 1;
+	ro->recv_own_msgs    = 0;
+
+	/* set notifier */
+	ro->notifier.notifier_call = raw_notifier;
+
+	register_netdevice_notifier(&ro->notifier);
+
+	return 0;
+}
+
+static int raw_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct raw_sock *ro;
+
+	if (!sk)
+		return 0;
+
+	ro = raw_sk(sk);
+
+	unregister_netdevice_notifier(&ro->notifier);
+
+	lock_sock(sk);
+
+	/* remove current filters & unregister */
+	if (ro->bound) {
+		if (ro->ifindex) {
+			struct net_device *dev;
+
+			dev = dev_get_by_index(&init_net, ro->ifindex);
+			if (dev) {
+				raw_disable_allfilters(dev, sk);
+				dev_put(dev);
+			}
+		} else
+			raw_disable_allfilters(NULL, sk);
+	}
+
+	if (ro->count > 1)
+		kfree(ro->filter);
+
+	ro->ifindex = 0;
+	ro->bound   = 0;
+	ro->count   = 0;
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+{
+	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+	struct sock *sk = sock->sk;
+	struct raw_sock *ro = raw_sk(sk);
+	int ifindex;
+	int err = 0;
+	int notify_enetdown = 0;
+
+	if (len < sizeof(*addr))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (ro->bound && addr->can_ifindex == ro->ifindex)
+		goto out;
+
+	if (addr->can_ifindex) {
+		struct net_device *dev;
+
+		dev = dev_get_by_index(&init_net, addr->can_ifindex);
+		if (!dev) {
+			err = -ENODEV;
+			goto out;
+		}
+		if (dev->type != ARPHRD_CAN) {
+			dev_put(dev);
+			err = -ENODEV;
+			goto out;
+		}
+		if (!(dev->flags & IFF_UP))
+			notify_enetdown = 1;
+
+		ifindex = dev->ifindex;
+
+		/* filters set by default/setsockopt */
+		err = raw_enable_allfilters(dev, sk);
+		dev_put(dev);
+	} else {
+		ifindex = 0;
+
+		/* filters set by default/setsockopt */
+		err = raw_enable_allfilters(NULL, sk);
+	}
+
+	if (!err) {
+		if (ro->bound) {
+			/* unregister old filters */
+			if (ro->ifindex) {
+				struct net_device *dev;
+
+				dev = dev_get_by_index(&init_net, ro->ifindex);
+				if (dev) {
+					raw_disable_allfilters(dev, sk);
+					dev_put(dev);
+				}
+			} else
+				raw_disable_allfilters(NULL, sk);
+		}
+		ro->ifindex = ifindex;
+		ro->bound = 1;
+	}
+
+ out:
+	release_sock(sk);
+
+	if (notify_enetdown) {
+		sk->sk_err = ENETDOWN;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_error_report(sk);
+	}
+
+	return err;
+}
+
+/* Fill uaddr with the socket's AF_CAN address; no peer name support. */
+static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
+		       int *len, int peer)
+{
+	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+	struct raw_sock *ro = raw_sk(sock->sk);
+
+	if (peer)
+		return -EOPNOTSUPP;
+
+	/* zero the whole address before filling the two used fields */
+	memset(addr, 0, sizeof(*addr));
+	addr->can_family  = AF_CAN;
+	addr->can_ifindex = ro->ifindex;
+	*len = sizeof(*addr);
+
+	return 0;
+}
+
+/*
+ * Set a SOL_CAN_RAW socket option.  For filters and error masks the new
+ * registration is made before the old one is removed, so a failed
+ * registration leaves the old one in place.
+ */
+static int raw_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct raw_sock *ro = raw_sk(sk);
+	struct can_filter *filter = NULL;  /* dyn. alloc'ed filters */
+	struct can_filter sfilter;         /* single filter */
+	struct net_device *dev = NULL;
+	can_err_mask_t err_mask = 0;
+	int count = 0;
+	int err = 0;
+
+	if (level != SOL_CAN_RAW)
+		return -EINVAL;
+
+	switch (optname) {
+	case CAN_RAW_FILTER:
+		if (optlen % sizeof(struct can_filter) != 0)
+			return -EINVAL;
+
+		count = optlen / sizeof(struct can_filter);
+		if (count > 1) {
+			/* filter does not fit into dfilter => alloc space */
+			filter = memdup_user(optval, optlen);
+			if (IS_ERR(filter))
+				return PTR_ERR(filter);
+		} else if (count == 1) {
+			if (copy_from_user(&sfilter, optval, sizeof(sfilter)))
+				return -EFAULT;
+		}
+
+		lock_sock(sk);
+		if (ro->bound && ro->ifindex) {
+			dev = dev_get_by_index(&init_net, ro->ifindex);
+			if (!dev) {
+				/* device gone: NULL dev means 'all' lists */
+				if (count > 1)
+					kfree(filter);
+				err = -ENODEV;
+				goto out_fil;
+			}
+		}
+
+		if (ro->bound) {
+			/* (try to) register the new filters */
+			if (count == 1)
+				err = raw_enable_filters(dev, sk, &sfilter, 1);
+			else
+				err = raw_enable_filters(dev, sk, filter,
+							 count);
+			if (err) {
+				if (count > 1)
+					kfree(filter);
+				goto out_fil;
+			}
+			/* remove old filter registrations */
+			raw_disable_filters(dev, sk, ro->filter, ro->count);
+		}
+
+		/* remove old filter space */
+		if (ro->count > 1)
+			kfree(ro->filter);
+
+		/* link new filters to the socket */
+		if (count == 1) {
+			/* copy filter data for single filter */
+			ro->dfilter = sfilter;
+			filter = &ro->dfilter;
+		}
+		ro->filter = filter;
+		ro->count  = count;
+ out_fil:
+		if (dev)
+			dev_put(dev);
+		release_sock(sk);
+		break;
+
+	case CAN_RAW_ERR_FILTER:
+		if (optlen != sizeof(err_mask))
+			return -EINVAL;
+		if (copy_from_user(&err_mask, optval, optlen))
+			return -EFAULT;
+
+		err_mask &= CAN_ERR_MASK;
+
+		lock_sock(sk);
+		if (ro->bound && ro->ifindex) {
+			dev = dev_get_by_index(&init_net, ro->ifindex);
+			if (!dev) {
+				/* bound device is gone, see CAN_RAW_FILTER */
+				err = -ENODEV;
+				goto out_err;
+			}
+		}
+
+		if (ro->bound) {
+			/* (try to) register the new err_mask */
+			err = raw_enable_errfilter(dev, sk, err_mask);
+			if (err)
+				goto out_err;
+			/* remove old err_mask registration */
+			raw_disable_errfilter(dev, sk, ro->err_mask);
+		}
+
+		/* link new err_mask to the socket */
+		ro->err_mask = err_mask;
+ out_err:
+		if (dev)
+			dev_put(dev);
+		release_sock(sk);
+		break;
+
+	case CAN_RAW_LOOPBACK:
+		if (optlen != sizeof(ro->loopback))
+			return -EINVAL;
+		if (copy_from_user(&ro->loopback, optval, optlen))
+			return -EFAULT;
+		break;
+
+	case CAN_RAW_RECV_OWN_MSGS:
+		if (optlen != sizeof(ro->recv_own_msgs))
+			return -EINVAL;
+		if (copy_from_user(&ro->recv_own_msgs, optval, optlen))
+			return -EFAULT;
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+	return err;
+}
+
+static int raw_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct raw_sock *ro = raw_sk(sk);
+	int len;
+	void *val;
+	int err = 0;
+
+	if (level != SOL_CAN_RAW)
+		return -EINVAL;
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+
+	case CAN_RAW_FILTER:
+		lock_sock(sk);
+		if (ro->count > 0) {
+			int fsize = ro->count * sizeof(struct can_filter);
+			if (len > fsize)
+				len = fsize;
+			if (copy_to_user(optval, ro->filter, len))
+				err = -EFAULT;
+		} else
+			len = 0;
+		release_sock(sk);
+
+		if (!err)
+			err = put_user(len, optlen);
+		return err;
+
+	case CAN_RAW_ERR_FILTER:
+		if (len > sizeof(can_err_mask_t))
+			len = sizeof(can_err_mask_t);
+		val = &ro->err_mask;
+		break;
+
+	case CAN_RAW_LOOPBACK:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = &ro->loopback;
+		break;
+
+	case CAN_RAW_RECV_OWN_MSGS:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = &ro->recv_own_msgs;
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, val, len))
+		return -EFAULT;
+	return 0;
+}
+
+static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct raw_sock *ro = raw_sk(sk);
+	struct sk_buff *skb;
+	struct net_device *dev;
+	int ifindex;
+	int err;
+
+	if (msg->msg_name) {
+		struct sockaddr_can *addr =
+			(struct sockaddr_can *)msg->msg_name;
+
+		if (msg->msg_namelen < sizeof(*addr))
+			return -EINVAL;
+
+		if (addr->can_family != AF_CAN)
+			return -EINVAL;
+
+		ifindex = addr->can_ifindex;
+	} else
+		ifindex = ro->ifindex;
+
+	if (size != sizeof(struct can_frame))
+		return -EINVAL;
+
+	dev = dev_get_by_index(&init_net, ifindex);
+	if (!dev)
+		return -ENXIO;
+
+	skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT,
+				  &err);
+	if (!skb)
+		goto put_dev;
+
+	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+	if (err < 0)
+		goto free_skb;
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+	if (err < 0)
+		goto free_skb;
+
+	/* to be able to check the received tx sock reference in raw_rcv() */
+	skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
+
+	skb->dev = dev;
+	skb->sk  = sk;
+
+	err = can_send(skb, ro->loopback);
+
+	dev_put(dev);
+
+	if (err)
+		goto send_failed;
+
+	return size;
+
+free_skb:
+	kfree_skb(skb);
+put_dev:
+	dev_put(dev);
+send_failed:
+	return err;
+}
+
+static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int err = 0;
+	int noblock;
+
+	noblock =  flags & MSG_DONTWAIT;
+	flags   &= ~MSG_DONTWAIT;
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		return err;
+
+	if (size < skb->len)
+		msg->msg_flags |= MSG_TRUNC;
+	else
+		size = skb->len;
+
+	err = memcpy_toiovec(msg->msg_iov, skb->data, size);
+	if (err < 0) {
+		skb_free_datagram(sk, skb);
+		return err;
+	}
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	if (msg->msg_name) {
+		msg->msg_namelen = sizeof(struct sockaddr_can);
+		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+	}
+
+	/* assign the flags that have been recorded in raw_rcv() */
+	msg->msg_flags |= *(raw_flags(skb));
+
+	skb_free_datagram(sk, skb);
+
+	return size;
+}
+
+static const struct proto_ops raw_ops = {
+	.family        = PF_CAN,
+	.release       = raw_release,
+	.bind          = raw_bind,
+	.connect       = sock_no_connect,
+	.socketpair    = sock_no_socketpair,
+	.accept        = sock_no_accept,
+	.getname       = raw_getname,
+	.poll          = datagram_poll,
+	.ioctl         = can_ioctl,	/* use can_ioctl() from af_can.c */
+	.listen        = sock_no_listen,
+	.shutdown      = sock_no_shutdown,
+	.setsockopt    = raw_setsockopt,
+	.getsockopt    = raw_getsockopt,
+	.sendmsg       = raw_sendmsg,
+	.recvmsg       = raw_recvmsg,
+	.mmap          = sock_no_mmap,
+	.sendpage      = sock_no_sendpage,
+};
+
+static struct proto raw_proto __read_mostly = {
+	.name       = "CAN_RAW",
+	.owner      = THIS_MODULE,
+	.obj_size   = sizeof(struct raw_sock),
+	.init       = raw_init,
+};
+
+static const struct can_proto raw_can_proto = {
+	.type       = SOCK_RAW,
+	.protocol   = CAN_RAW,
+	.ops        = &raw_ops,
+	.prot       = &raw_proto,
+};
+
+/* Module init: print the banner and register the CAN_RAW protocol. */
+static __init int raw_module_init(void)
+{
+	int rc;
+
+	printk(banner);
+
+	rc = can_proto_register(&raw_can_proto);
+	if (rc < 0)
+		printk(KERN_ERR "can: registration of raw protocol failed\n");
+	return rc;
+}
+
+static __exit void raw_module_exit(void)
+{
+	can_proto_unregister(&raw_can_proto);
+}
+
+module_init(raw_module_init);
+module_exit(raw_module_exit);
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 00000000..be683f2d
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,29 @@
+config CEPH_LIB
+        tristate "Ceph core library (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	select KEYS
+	default n
+	help
+	  Choose Y or M here to include cephlib, which provides the
+	  functionality common to both the Ceph filesystem and the
+	  rados block device (rbd).
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+	bool "Include file:line in ceph debug output"
+	depends on CEPH_LIB
+	default n
+	help
+	  If you say Y here, debug output will include a filename and
+	  line to aid debugging.  This increases kernel size and slows
+	  execution slightly when debug call sites are enabled (e.g.,
+	  via CONFIG_DYNAMIC_DEBUG).
+
+	  If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 00000000..e87ef435
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Ceph core library (libceph).
+#
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+	mon_client.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+	debugfs.o \
+	auth.o auth_none.o \
+	crypto.o armor.o \
+	auth_x.o \
+	ceph_fs.o ceph_strings.o ceph_hash.o \
+	pagevec.o
+
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 00000000..1fc1ee11
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,105 @@
+
+#include <linux/errno.h>
+
+/*
+ * Forward declarations.  Both routines process the bytes in [src, end)
+ * and write their output to dst; the caller sizes dst.  ceph_armor()
+ * returns the number of characters written, ceph_unarmor() the number
+ * of bytes decoded or -EINVAL on malformed input.
+ */
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ *
+ * pem_key maps a 6-bit value (0..63) to its base64 character.
+ */
+
+static const char *pem_key =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/*
+ * Return the base64 character for the 6-bit value c.  c is used as an
+ * index into pem_key without a range check, so it must be in 0..63.
+ */
+static int encode_bits(int c)
+{
+	return pem_key[c];
+}
+
+/*
+ * Map one base64 character back to its 6-bit value.
+ *
+ * The padding character '=' decodes to 0 so that callers only have to
+ * test for a negative result.  Any character outside the alphabet
+ * yields -EINVAL.
+ */
+static int decode_bits(char c)
+{
+	switch (c) {
+	case '+':
+		return 62;
+	case '/':
+		return 63;
+	case '=':
+		return 0; /* padding: any non-negative value will do */
+	default:
+		break;
+	}
+
+	if ('A' <= c && c <= 'Z')
+		return c - 'A';
+	if ('a' <= c && c <= 'z')
+		return 26 + (c - 'a');
+	if ('0' <= c && c <= '9')
+		return 52 + (c - '0');
+
+	return -EINVAL;
+}
+
+/*
+ * base64-encode the bytes in [src, end) into dst.
+ *
+ * Input is consumed three bytes at a time and emitted as four
+ * characters; a short final group is padded with '='.  A '\n' is
+ * inserted after every 64 output characters.  dst is not
+ * NUL-terminated and must be large enough for the result.
+ *
+ * Returns the number of characters written to dst.
+ */
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+	char *out = dst;
+	int col = 0;
+
+	while (src < end) {
+		unsigned char a, b, c;
+		int have_b, have_c;
+
+		/* gather up to three input bytes; missing ones read as 0 */
+		a = *src++;
+		have_b = src < end;
+		b = have_b ? *src++ : 0;
+		have_c = have_b && src < end;
+		c = have_c ? *src++ : 0;
+
+		*out++ = encode_bits(a >> 2);
+		*out++ = encode_bits(((a & 3) << 4) | (b >> 4));
+		if (have_b)
+			*out++ = encode_bits(((b & 15) << 2) | (c >> 6));
+		else
+			*out++ = '=';
+		if (have_c)
+			*out++ = encode_bits(c & 63);
+		else
+			*out++ = '=';
+
+		col += 4;
+		if (col == 64) {
+			col = 0;
+			*out++ = '\n';
+		}
+	}
+	return out - dst;
+}
+
+/*
+ * Decode the base64 text in [src, end) into dst.
+ *
+ * Newlines between four-character groups are skipped.  Decoding stops
+ * at the first group that carries '=' padding in its third or fourth
+ * position.  dst must be large enough for the result.
+ *
+ * Returns the number of bytes written to dst, or -EINVAL if a group is
+ * truncated or contains a character outside the base64 alphabet.
+ */
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+	char *out = dst;
+
+	while (src < end) {
+		int v0, v1, v2, v3;
+
+		if (*src == '\n') {
+			src++;
+			continue;
+		}
+		if (end - src < 4)
+			return -EINVAL;
+
+		v0 = decode_bits(src[0]);
+		v1 = decode_bits(src[1]);
+		v2 = decode_bits(src[2]);
+		v3 = decode_bits(src[3]);
+		if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0)
+			return -EINVAL;
+
+		*out++ = (v0 << 2) | (v1 >> 4);
+		if (src[2] == '=')
+			break;		/* one byte in the final group */
+		*out++ = ((v1 & 15) << 4) | (v2 >> 2);
+		if (src[3] == '=')
+			break;		/* two bytes in the final group */
+		*out++ = ((v2 & 3) << 6) | v3;
+		src += 4;
+	}
+	return out - dst;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 00000000..b4bf4ac0
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * get protocol handler
+ *
+ * supported_protocols is the list of auth protocols that
+ * ceph_auth_build_hello() advertises to the monitor.
+ */
+static u32 supported_protocols[] = {
+	CEPH_AUTH_NONE,
+	CEPH_AUTH_CEPHX
+};
+
+/*
+ * Instantiate the handler for @protocol on @ac.  Returns the handler's
+ * init result, or -ENOENT for a protocol we do not implement.
+ */
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+	if (protocol == CEPH_AUTH_NONE)
+		return ceph_auth_none_init(ac);
+	if (protocol == CEPH_AUTH_CEPHX)
+		return ceph_x_init(ac);
+
+	return -ENOENT;
+}
+
+/*
+ * setup, teardown.
+ */
+/*
+ * Allocate an auth client in the "negotiating" state.
+ *
+ * @name: entity name; CEPH_AUTH_NAME_DEFAULT is used when NULL.  The
+ *        pointer is stored, not copied.
+ * @key:  secret key; stored as a pointer, not copied.
+ *
+ * Returns the new client or ERR_PTR(-ENOMEM).
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)
+{
+	struct ceph_auth_client *client;
+
+	dout("auth_init name '%s'\n", name);
+
+	client = kzalloc(sizeof(*client), GFP_NOFS);
+	if (!client)
+		return ERR_PTR(-ENOMEM);
+
+	client->negotiating = true;
+	client->name = name ? name : CEPH_AUTH_NAME_DEFAULT;
+	dout("auth_init name %s\n", client->name);
+	client->key = key;
+	return client;
+}
+
+/*
+ * Tear down an auth client: let the protocol handler (if one was set
+ * up) release its private state, then free the client itself.
+ */
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+	dout("auth_destroy %p\n", ac);
+	if (ac->ops)
+		ac->ops->destroy(ac);
+	kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ *
+ * The handler's reset op is only invoked if a handler exists and we
+ * were not already negotiating; either way we return to the
+ * negotiating state.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+	dout("auth_reset %p\n", ac);
+	if (ac->ops && !ac->negotiating)
+		ac->ops->reset(ac);
+	ac->negotiating = true;
+}
+
+/*
+ * Encode an entity name at *p as: 32-bit entity type (always
+ * CEPH_ENTITY_TYPE_CLIENT), 32-bit length, then the name bytes without
+ * a terminating NUL.  *p is advanced past the encoded data.
+ *
+ * Returns 0, or -ERANGE if the encoding would run past @end.
+ */
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+	int name_len = strlen(name);
+	void *encoded_end = *p + 2*sizeof(u32) + name_len;
+
+	if (encoded_end > end)
+		return -ERANGE;
+
+	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_32(p, name_len);
+	ceph_encode_copy(p, name, name_len);
+	return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ *
+ * Written into @buf: a ceph_mon_request_header, a 32-bit protocol
+ * (0, none chosen yet), a 32-bit payload length, then the payload:
+ * a version byte (1), the count and list of supported protocols, our
+ * entity name and our global_id.
+ *
+ * Returns the number of bytes written, or a negative errno (-ERANGE
+ * when @buf is too small).
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+	struct ceph_mon_request_header *monhdr = buf;
+	void *p = monhdr + 1, *end = buf + len, *lenp;
+	int i, num;
+	int ret;
+
+	dout("auth_build_hello\n");
+
+	/*
+	 * The header, the protocol word and the length slot are written
+	 * unconditionally below, so make sure they fit before touching buf.
+	 */
+	if (len < sizeof(*monhdr) + 2 * sizeof(u32))
+		goto bad;
+
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+	/* payload length is back-filled once the payload is complete */
+	lenp = p;
+	p += sizeof(u32);
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	ceph_encode_8(&p, 1);
+	num = ARRAY_SIZE(supported_protocols);
+	ceph_encode_32(&p, num);
+	ceph_decode_need(&p, end, num * sizeof(u32), bad);
+	for (i = 0; i < num; i++)
+		ceph_encode_32(&p, supported_protocols[i]);
+
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+	ceph_decode_need(&p, end, sizeof(u64), bad);
+	ceph_encode_64(&p, ac->global_id);
+
+	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+	return p - buf;
+
+bad:
+	return -ERANGE;
+}
+
+/*
+ * Build an auth request for the already-selected protocol.
+ *
+ * Written into @msg_buf: a ceph_mon_request_header, the 32-bit protocol
+ * id, a 32-bit payload length, then the payload produced by the
+ * handler's build_request op.
+ *
+ * Returns the total number of bytes written, -ERANGE if @msg_buf cannot
+ * even hold the fixed-size prefix, or the handler's negative errno.
+ */
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
+				   void *msg_buf, size_t msg_len)
+{
+	struct ceph_mon_request_header *monhdr = msg_buf;
+	void *p = monhdr + 1;
+	void *end = msg_buf + msg_len;
+	int ret;
+
+	/* header + protocol + length are written before the handler runs */
+	if (msg_len < sizeof(*monhdr) + 2 * sizeof(u32))
+		return -ERANGE;
+
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, ac->protocol);
+
+	/* the handler writes past the length slot; the length is filled in after */
+	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+	if (ret < 0) {
+		pr_err("error %d building auth method %s request\n", ret,
+		       ac->ops->name);
+		return ret;
+	}
+	dout(" built request %d bytes\n", ret);
+	ceph_encode_32(&p, ret);
+	return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ *
+ * The reply in [buf, buf + len) holds: protocol, result, global_id,
+ * a length-prefixed payload for the protocol handler and a
+ * length-prefixed result string.  While negotiating, the handler for
+ * the protocol chosen by the monitor is set up first.
+ *
+ * Returns 0 when the handler is satisfied, the length of a follow-up
+ * request built into @reply_buf when the handler asks for another
+ * round (-EAGAIN), or a negative errno.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+			   void *buf, size_t len,
+			   void *reply_buf, size_t reply_len)
+{
+	void *p = buf;
+	void *end = buf + len;
+	int protocol;
+	s32 result;
+	u64 global_id;
+	void *payload, *payload_end;
+	int payload_len;
+	char *result_msg;
+	int result_msg_len;
+	int ret = -EINVAL;
+
+	dout("handle_auth_reply %p %p\n", p, end);
+	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+	protocol = ceph_decode_32(&p);
+	result = ceph_decode_32(&p);
+	global_id = ceph_decode_64(&p);
+	payload_len = ceph_decode_32(&p);
+	/*
+	 * Both lengths come straight off the wire and are held in signed
+	 * ints: reject negative values and anything past the buffer
+	 * before moving p.
+	 */
+	if (payload_len < 0 || payload_len > end - p)
+		goto bad;
+	payload = p;
+	p += payload_len;
+	ceph_decode_need(&p, end, sizeof(u32), bad);
+	result_msg_len = ceph_decode_32(&p);
+	if (result_msg_len < 0 || result_msg_len > end - p)
+		goto bad;
+	result_msg = p;
+	p += result_msg_len;
+	if (p != end)
+		goto bad;
+
+	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+	     result_msg, global_id, payload_len);
+
+	payload_end = payload + payload_len;
+
+	if (global_id && ac->global_id != global_id) {
+		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+		ac->global_id = global_id;
+	}
+
+	if (ac->negotiating) {
+		/* server does not support our protocols? */
+		if (!protocol && result < 0) {
+			ret = result;
+			goto out;
+		}
+		/* set up (new) protocol handler? */
+		if (ac->protocol && ac->protocol != protocol) {
+			ac->ops->destroy(ac);
+			ac->protocol = 0;
+			ac->ops = NULL;
+		}
+		if (ac->protocol != protocol) {
+			ret = ceph_auth_init_protocol(ac, protocol);
+			if (ret) {
+				pr_err("error %d on auth protocol %d init\n",
+				       ret, protocol);
+				goto out;
+			}
+		}
+
+		ac->negotiating = false;
+	}
+
+	/*
+	 * A reply naming protocol 0 with a non-negative result leaves us
+	 * without a handler; fail instead of dereferencing NULL ops.
+	 */
+	if (!ac->ops) {
+		pr_err("auth reply for protocol %d but no handler is set up\n",
+		       protocol);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+	if (ret == -EAGAIN) {
+		return ceph_build_auth_request(ac, reply_buf, reply_len);
+	} else if (ret) {
+		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+		return ret;
+	}
+	return 0;
+
+bad:
+	pr_err("failed to decode auth msg\n");
+out:
+	return ret;
+}
+
+/*
+ * Build whatever auth message is due next into @msg_buf: a hello if no
+ * protocol has been selected yet, otherwise a request if the handler
+ * says it should authenticate.  Returns the message length, 0 if there
+ * is nothing to send, or a negative errno.
+ */
+int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len)
+{
+	if (!ac->protocol)
+		return ceph_auth_build_hello(ac, msg_buf, msg_len);
+	/* a selected protocol without a handler is a programming error */
+	BUG_ON(!ac->ops);
+	if (ac->ops->should_authenticate(ac))
+		return ceph_build_auth_request(ac, msg_buf, msg_len);
+	return 0;
+}
+
+/*
+ * Ask the protocol handler whether we are authenticated.  Without a
+ * handler the answer is always 0.
+ */
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+	return ac->ops ? ac->ops->is_authenticated(ac) : 0;
+}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 00000000..214c2bb4
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+/*
+ * Return the handler to its initial state: authentication has to start
+ * over and the cached authorizer must be rebuilt on next use.
+ */
+static void reset(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *ai = ac->private;
+
+	ai->built_authorizer = false;
+	ai->starting = true;
+}
+
+/* Free the handler's private state and clear the stale pointer. */
+static void destroy(struct ceph_auth_client *ac)
+{
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+/* Authenticated as soon as the first reply has been seen (!starting). */
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return !xi->starting;
+}
+
+/* A request is only wanted while we are still in the starting state. */
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return xi->starting;
+}
+
+/*
+ * the generic auth code decodes the global_id, and we carry no actual
+ * authentication state, so nothing happens here beyond leaving the
+ * starting state.  The monitor's result is passed straight through;
+ * buf/end are not examined.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+			void *buf, void *end)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = false;
+	return result;
+}
+
+/*
+ * Hand out the 'authorizer' for the null protocol: a version byte, our
+ * entity name and our global_id.  The encoding is the same for every
+ * peer, so it is built once into the copy embedded in our private
+ * state and reused until reset() invalidates it.
+ *
+ * Returns 0 and fills in the out parameters, or a negative errno if
+ * the encoding does not fit into au->buf.
+ */
+static int ceph_auth_none_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_auth_none_info *ai = ac->private;
+	struct ceph_none_authorizer *au = &ai->au;
+
+	if (!ai->built_authorizer) {
+		void *p = au->buf;
+		void *end = p + sizeof(au->buf);
+		int ret;
+
+		ceph_encode_8(&p, 1);
+		/* keep 8 bytes in reserve for the global_id that follows */
+		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+		if (ret < 0)
+			return ret;
+		ceph_decode_need(&p, end, sizeof(u64), e_range);
+		ceph_encode_64(&p, ac->global_id);
+		au->buf_len = p - (void *)au->buf;
+		ai->built_authorizer = true;
+		dout("built authorizer len %d\n", au->buf_len);
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf;
+	*len = au->buf_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+
+e_range:
+	return -ERANGE;
+}
+
+/*
+ * The authorizer handed out by ceph_auth_none_create_authorizer() is
+ * embedded in the handler's private state, so there is nothing to free.
+ */
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	/* nothing to do */
+}
+
+/*
+ * The null protocol has nothing to put into an auth request.  The op
+ * must still exist: the generic code calls ops->build_request()
+ * unconditionally whenever should_authenticate() returns true, which
+ * is the case for this handler while it is in the starting state.
+ *
+ * Returns the payload length, which is always 0.
+ */
+static int ceph_auth_none_build_request(struct ceph_auth_client *ac,
+					void *buf, void *end)
+{
+	return 0;
+}
+
+/* Handler table for the null ("none") auth protocol. */
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.name = "none",
+	.reset = reset,
+	.destroy = destroy,
+	.is_authenticated = is_authenticated,
+	.should_authenticate = should_authenticate,
+	.build_request = ceph_auth_none_build_request,
+	.handle_reply = handle_reply,
+	.create_authorizer = ceph_auth_none_create_authorizer,
+	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+/*
+ * Install the null auth protocol on @ac: allocate its private state and
+ * point the client at ceph_auth_none_ops.  Returns 0 or -ENOMEM.
+ */
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *ai;
+
+	dout("ceph_auth_none_init %p\n", ac);
+
+	ai = kzalloc(sizeof(*ai), GFP_NOFS);
+	if (!ai)
+		return -ENOMEM;
+
+	ai->built_authorizer = false;
+	ai->starting = true;
+
+	ac->ops = &ceph_auth_none_ops;
+	ac->private = ai;
+	ac->protocol = CEPH_AUTH_NONE;
+	return 0;
+}
+
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 00000000..ed7d088b
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+/*
+ * The single authorizer used by the null protocol.
+ *
+ * reply_buf is deliberately a zero-length array: auth_none.c reports
+ * sizeof(au->reply_buf), i.e. 0, as the reply length.
+ */
+struct ceph_none_authorizer {
+	char buf[128];		/* encoded authorizer */
+	int buf_len;		/* number of valid bytes in buf */
+	char reply_buf[0];
+};
+
+/* Private state of the null auth protocol (ac->private). */
+struct ceph_auth_none_info {
+	bool starting;		/* true until the first reply is handled */
+	bool built_authorizer;	/* au.buf holds a valid encoding */
+	struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 00000000..1587dc60
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,690 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+#define TEMP_TICKET_BUF_LEN	256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+/*
+ * True when we hold a ticket for every service in ac->want_keys.
+ * ceph_x_validate_tickets() is run first so that expired tickets are
+ * dropped from have_keys before the comparison.
+ */
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+/*
+ * True when at least one wanted ticket is missing or due for renewal,
+ * as computed by ceph_x_validate_tickets().
+ */
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return need != 0;
+}
+
+/*
+ * Upper bound on the output of ceph_x_encrypt() for @ilen bytes of
+ * input: the encrypt header, the input, 16 bytes of slack and the
+ * 32-bit length prefix.
+ * NOTE(review): the 16 presumably covers cipher block padding - confirm
+ * against the crypto code.
+ */
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
+/*
+ * Encrypt @ibuf, prefixed by a ceph_x_encrypt_header, into @obuf.
+ *
+ * Output layout: 32-bit ciphertext length followed by the ciphertext.
+ * Returns the total number of bytes placed in @obuf, or the non-zero
+ * result of ceph_encrypt2() on failure.
+ *
+ * NOTE(review): olen is assumed to be at least sizeof(u32); a smaller
+ * value would make the size_t subtraction below wrap.
+ */
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+			  void *ibuf, int ilen, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head = {
+		.struct_v = 1,
+		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+	};
+	size_t len = olen - sizeof(u32);
+	int ret;
+
+	/* ciphertext goes after the length slot, which is filled in last */
+	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+			    &head, sizeof(head), ibuf, ilen);
+	if (ret)
+		return ret;
+	ceph_encode_32(&obuf, len);
+	return len + sizeof(u32);
+}
+
+/*
+ * Decrypt one length-prefixed encrypted blob found at *p.
+ *
+ * The plaintext (without the ceph_x_encrypt_header, which is verified
+ * and dropped) is placed in @obuf, which holds @olen bytes.  On success
+ * *p is advanced past the ciphertext.
+ *
+ * Returns the plaintext length, -EINVAL if the blob does not fit in
+ * [*p, end), -EPERM if the decrypted header is not the expected one,
+ * or the error from ceph_decrypt2().
+ */
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+			  void **p, void *end, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head;
+	size_t head_len = sizeof(head);
+	int len, ret;
+
+	/* the length word itself must be inside the buffer */
+	if (*p > end || end - *p < sizeof(u32))
+		return -EINVAL;
+	len = ceph_decode_32(p);
+	/* len is signed and comes off the wire: reject negative/oversized */
+	if (len < 0 || len > end - *p)
+		return -EINVAL;
+
+	dout("ceph_x_decrypt len %d\n", len);
+	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+			    *p, len);
+	if (ret)
+		return ret;
+	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+		return -EPERM;
+	*p += len;
+	return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ *
+ * Handlers live in the xi->ticket_handlers rbtree, keyed by service.
+ * A missing handler is allocated zeroed and inserted, so the lookup
+ * itself can fail with ERR_PTR(-ENOMEM).
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+	struct ceph_x_ticket_handler *th;
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+	/* walk down, remembering the link where a new node would go */
+	while (*p) {
+		parent = *p;
+		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+		if (service < th->service)
+			p = &(*p)->rb_left;
+		else if (service > th->service)
+			p = &(*p)->rb_right;
+		else
+			return th;
+	}
+
+	/* add it */
+	th = kzalloc(sizeof(*th), GFP_NOFS);
+	if (!th)
+		return ERR_PTR(-ENOMEM);
+	th->service = service;
+	rb_link_node(&th->node, parent, p);
+	rb_insert_color(&th->node, &xi->ticket_handlers);
+	return th;
+}
+
+/*
+ * Unlink @th from the handler tree and release everything it owns: the
+ * session key, the ticket blob reference (if any) and the handler.
+ */
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+				  struct ceph_x_ticket_handler *th)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("remove_ticket_handler %p %d\n", th, th->service);
+	rb_erase(&th->node, &xi->ticket_handlers);
+	ceph_crypto_key_destroy(&th->session_key);
+	if (th->ticket_blob)
+		ceph_buffer_put(th->ticket_blob);
+	kfree(th);
+}
+
+/*
+ * Process the tickets carried in a reply.
+ *
+ * [buf, end) holds a version byte, a ticket count and, per ticket: the
+ * service type, a version byte, a blob encrypted with @secret (new
+ * session key + validity) and the ticket blob for the service, which
+ * is either plain or encrypted with the handler's previous session key.
+ * A ticket handler is only updated once its whole record has been
+ * decoded successfully.
+ *
+ * All lengths are taken from the wire, so each one is checked against
+ * the buffer it indexes before use; the two bounce buffers hold
+ * TEMP_TICKET_BUF_LEN bytes each.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+				    struct ceph_crypto_key *secret,
+				    void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	/* function scope so the error path below can release it */
+	struct ceph_crypto_key new_session_key;
+	int num;
+	void *p = buf;
+	int ret;
+	char *dbuf;
+	char *ticket_buf;
+	u8 reply_struct_v;
+
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!dbuf)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!ticket_buf)
+		goto out_dbuf;
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
+		goto bad;
+	num = ceph_decode_32(&p);
+	dout("%d tickets\n", num);
+	while (num--) {
+		int type;
+		u8 tkt_struct_v, blob_struct_v;
+		struct ceph_x_ticket_handler *th;
+		void *dp, *dend;
+		int dlen;
+		char is_enc;
+		struct timespec validity;
+		struct ceph_crypto_key old_key;
+		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
+
+		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+		type = ceph_decode_32(&p);
+		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		th = get_ticket_handler(ac, type);
+		if (IS_ERR(th)) {
+			ret = PTR_ERR(th);
+			goto out;
+		}
+
+		/* blob for me */
+		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+				      TEMP_TICKET_BUF_LEN);
+		if (dlen <= 0) {
+			/* an empty blob is malformed, not a success */
+			ret = dlen ? dlen : -EINVAL;
+			goto out;
+		}
+		dout(" decrypted %d bytes\n", dlen);
+		dend = dbuf + dlen;
+		dp = dbuf;
+
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		/* shallow copy: the old key may be needed to decrypt the ticket */
+		memcpy(&old_key, &th->session_key, sizeof(old_key));
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+		if (ret)
+			goto out;
+
+		/*
+		 * From here until it is handed to th, new_session_key is
+		 * ours to release: errors go through bad_key/out_key.
+		 */
+		ceph_decode_need(&dp, dend, sizeof(new_validity), bad_key);
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
+
+		/* ticket blob for service */
+		ceph_decode_8_safe(&p, end, is_enc, bad_key);
+		tp = ticket_buf;
+		if (is_enc) {
+			/* encrypted */
+			dout(" encrypted ticket\n");
+			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+					      TEMP_TICKET_BUF_LEN);
+			if (dlen < 0) {
+				ret = dlen;
+				goto out_key;
+			}
+			/*
+			 * The plaintext starts with its own length word,
+			 * which must not claim more than was decrypted.
+			 */
+			tpend = tp + dlen;
+			ceph_decode_32_safe(&tp, tpend, dlen, bad_key);
+			if (dlen < 0 || dlen > tpend - tp)
+				goto bad_key;
+		} else {
+			/* unencrypted */
+			ceph_decode_32_safe(&p, end, dlen, bad_key);
+			/* must fit the fixed-size bounce buffer */
+			if (dlen < 0 || dlen > TEMP_TICKET_BUF_LEN)
+				goto bad_key;
+			ceph_decode_need(&p, end, dlen, bad_key);
+			ceph_decode_copy(&p, ticket_buf, dlen);
+		}
+		tpend = tp + dlen;
+		dout(" ticket blob is %d bytes\n", dlen);
+		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad_key);
+		blob_struct_v = ceph_decode_8(&tp);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+		if (ret)
+			goto out_key;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
+		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+		     type, ceph_entity_type_name(type), th->secret_id,
+		     (int)th->ticket_blob->vec.iov_len);
+		xi->have_keys |= th->service;
+	}
+
+	ret = 0;
+out:
+	kfree(ticket_buf);
+out_dbuf:
+	kfree(dbuf);
+	return ret;
+
+bad_key:
+	ret = -EINVAL;
+out_key:
+	/* decoded but never installed: release it */
+	ceph_crypto_key_destroy(&new_session_key);
+	goto out;
+
+bad:
+	ret = -EINVAL;
+	goto out;
+}
+
+/*
+ * (Re)build the authorizer @au for the service of ticket handler @th.
+ *
+ * The authorizer buffer holds a ceph_x_authorize_a (global id, service
+ * id and the ticket blob) followed by a ceph_x_authorize_b carrying a
+ * fresh random nonce, encrypted with the ticket's session key.  An
+ * existing buffer is reused when its allocation is large enough.
+ *
+ * Returns 0, -ENOMEM, or the error from ceph_x_encrypt(); on an
+ * encryption error the buffer is released and au->buf set to NULL.
+ */
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+				   struct ceph_x_ticket_handler *th,
+				   struct ceph_x_authorizer *au)
+{
+	int maxlen;
+	struct ceph_x_authorize_a *msg_a;
+	struct ceph_x_authorize_b msg_b;
+	void *p, *end;
+	int ret;
+	int ticket_blob_len =
+		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+	dout("build_authorizer for %s %p\n",
+	     ceph_entity_type_name(th->service), au);
+
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout("  need len %d\n", maxlen);
+	/* drop a buffer that is too small; a new one is allocated below */
+	if (au->buf && au->buf->alloc_len < maxlen) {
+		ceph_buffer_put(au->buf);
+		au->buf = NULL;
+	}
+	if (!au->buf) {
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+		if (!au->buf)
+			return -ENOMEM;
+	}
+	au->service = th->service;
+
+	/* part a: plain header plus ticket blob */
+	msg_a = au->buf->vec.iov_base;
+	msg_a->struct_v = 1;
+	msg_a->global_id = cpu_to_le64(ac->global_id);
+	msg_a->service_id = cpu_to_le32(th->service);
+	msg_a->ticket_blob.struct_v = 1;
+	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+	if (ticket_blob_len) {
+		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+		       th->ticket_blob->vec.iov_len);
+	}
+	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+	     le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+	p = msg_a + 1;
+	p += ticket_blob_len;
+	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+	/* part b: nonce, encrypted with the session key */
+	get_random_bytes(&au->nonce, sizeof(au->nonce));
+	msg_b.struct_v = 1;
+	msg_b.nonce = cpu_to_le64(au->nonce);
+	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+			     p, end - p);
+	if (ret < 0)
+		goto out_buf;
+	p += ret;
+	/* shrink the buffer's length to what was actually written */
+	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+	dout(" built authorizer nonce %llx len %d\n", au->nonce,
+	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
+	return 0;
+
+out_buf:
+	ceph_buffer_put(au->buf);
+	au->buf = NULL;
+	return ret;
+}
+
+/*
+ * Encode the ticket held by @th at *p: version byte (1), secret id,
+ * 32-bit blob length and the blob itself.  A handler without a blob is
+ * encoded with length 0.
+ *
+ * Returns 0, or -ERANGE if the encoding would run past @end.
+ */
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+				void **p, void *end)
+{
+	struct ceph_buffer *blob = th->ticket_blob;
+	const char *data = NULL;
+	u32 data_len = 0;
+
+	if (blob) {
+		data = blob->vec.iov_base;
+		data_len = blob->vec.iov_len;
+	}
+
+	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, th->secret_id);
+
+	ceph_encode_32_safe(p, end, data_len, bad);
+	if (blob) {
+		ceph_encode_copy_safe(p, end, data, data_len, bad);
+	}
+
+	return 0;
+bad:
+	return -ERANGE;
+}
+
+/*
+ * Work out which service tickets have to be (re)requested.
+ *
+ * *pneed starts as the wanted services we hold no key for.  Each
+ * wanted service we do hold a key for is then checked against the
+ * clock: past renew_after it is added to *pneed, past expires it is
+ * also removed from xi->have_keys.  A handler that cannot be looked up
+ * counts as needed.
+ */
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+	int want = ac->want_keys;
+	struct ceph_x_info *xi = ac->private;
+	int service;
+
+	*pneed = ac->want_keys & ~(xi->have_keys);
+
+	/* services are single bits; visit each bit up to the highest wanted */
+	for (service = 1; service <= want; service <<= 1) {
+		struct ceph_x_ticket_handler *th;
+
+		if (!(ac->want_keys & service))
+			continue;
+
+		/* already known to be missing */
+		if (*pneed & service)
+			continue;
+
+		th = get_ticket_handler(ac, service);
+
+		if (IS_ERR(th)) {
+			*pneed |= service;
+			continue;
+		}
+
+		if (get_seconds() >= th->renew_after)
+			*pneed |= service;
+		if (get_seconds() >= th->expires)
+			xi->have_keys &= ~service;
+	}
+}
+
+
+/*
+ * Build the cephx payload of an auth request into [buf, end).
+ *
+ * If the AUTH ticket itself is needed, a CEPHX_GET_AUTH_SESSION_KEY
+ * request is built: a client challenge, a key derived from the
+ * encrypted (server, client) challenge pair, and the old ticket.
+ * Otherwise, if any service ticket is needed, a
+ * CEPHX_GET_PRINCIPAL_SESSION_KEY request is built: an authorizer for
+ * the auth service followed by the mask of wanted keys.
+ *
+ * Returns the payload length, 0 if nothing is needed, or a negative
+ * errno (-ERANGE when the payload does not fit).
+ */
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+				void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+	struct ceph_x_request_header *head = buf;
+	int ret;
+	struct ceph_x_ticket_handler *th =
+		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	ceph_x_validate_tickets(ac, &need);
+
+	dout("build_request want %x have %x need %x\n",
+	     ac->want_keys, xi->have_keys, need);
+
+	if (need & CEPH_ENTITY_TYPE_AUTH) {
+		struct ceph_x_authenticate *auth = (void *)(head + 1);
+		void *p = auth + 1;
+		struct ceph_x_challenge_blob tmp;
+		char tmp_enc[40];
+		u64 *u;
+
+		if (p > end)
+			return -ERANGE;
+
+		dout(" get_auth_session_key\n");
+		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+		/* encrypt and hash */
+		get_random_bytes(&auth->client_challenge, sizeof(u64));
+		tmp.client_challenge = auth->client_challenge;
+		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+				     tmp_enc, sizeof(tmp_enc));
+		if (ret < 0)
+			return ret;
+
+		auth->struct_v = 1;
+		auth->key = 0;
+		/* fold the encrypted blob into 64 bits, whole words only */
+		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+			auth->key ^= *(__le64 *)u;
+		dout(" server_challenge %llx client_challenge %llx key %llx\n",
+		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
+		     le64_to_cpu(auth->key));
+
+		/* now encode the old ticket if exists */
+		ret = ceph_x_encode_ticket(th, &p, end);
+		if (ret < 0)
+			return ret;
+
+		return p - buf;
+	}
+
+	if (need) {
+		void *p = head + 1;
+		struct ceph_x_service_ticket_request *req;
+		size_t au_len;
+
+		if (p > end)
+			return -ERANGE;
+		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+		if (ret)
+			return ret;
+
+		/* authorizer and trailing request must both fit (p <= end here) */
+		au_len = xi->auth_authorizer.buf->vec.iov_len;
+		if (end - p < au_len + sizeof(*req))
+			return -ERANGE;
+		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+				 au_len);
+
+		/* NOTE(review): req->struct_v is never written here - confirm */
+		req = p;
+		req->keys = cpu_to_le32(need);
+		p += sizeof(*req);
+		return p - buf;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle the cephx payload [buf, end) of an auth reply.
+ *
+ * While starting, the payload is the server challenge; it is recorded
+ * and -EAGAIN asks the caller to send the real request.  Afterwards
+ * the payload is a reply header followed by tickets, decrypted with
+ * our secret (auth session key reply) or with the auth ticket's
+ * session key (service ticket reply).
+ *
+ * Returns 0 once have_keys equals want_keys, -EAGAIN if another round
+ * is required, or a negative errno.
+ */
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+			       void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct ceph_x_reply_header *head = buf;
+	struct ceph_x_ticket_handler *th;
+	int len = end - buf;
+	int op;
+	int ret;
+
+	if (result)
+		return result;  /* XXX hmm? */
+
+	if (xi->starting) {
+		/* it's a hello */
+		struct ceph_x_server_challenge *sc = buf;
+
+		if (len != sizeof(*sc))
+			return -EINVAL;
+		xi->server_challenge = le64_to_cpu(sc->server_challenge);
+		dout("handle_reply got server challenge %llx\n",
+		     xi->server_challenge);
+		xi->starting = false;
+		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+		return -EAGAIN;
+	}
+
+	/* the reply header is read below, so it has to be there */
+	if (len < (int)sizeof(*head))
+		return -EINVAL;
+
+	op = le16_to_cpu(head->op);
+	result = le32_to_cpu(head->result);
+	dout("handle_reply op %d result %d\n", op, result);
+	switch (op) {
+	case CEPHX_GET_AUTH_SESSION_KEY:
+		/* verify auth key */
+		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+					       buf + sizeof(*head), end);
+		break;
+
+	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+		if (IS_ERR(th))
+			return PTR_ERR(th);
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+					       buf + sizeof(*head), end);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (ret)
+		return ret;
+	if (ac->want_keys == xi->have_keys)
+		return 0;
+	return -EAGAIN;
+}
+
+/*
+ * Allocate and build an authorizer for @peer_type from that service's
+ * ticket handler.  On success the out parameters describe the
+ * authorizer, its encoded buffer and its reply buffer.
+ *
+ * Returns 0 or a negative errno; nothing is left allocated on failure.
+ */
+static int ceph_x_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_x_ticket_handler *handler;
+	struct ceph_x_authorizer *authorizer;
+	int err;
+
+	handler = get_ticket_handler(ac, peer_type);
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
+
+	authorizer = kzalloc(sizeof(*authorizer), GFP_NOFS);
+	if (!authorizer)
+		return -ENOMEM;
+
+	err = ceph_x_build_authorizer(ac, handler, authorizer);
+	if (!err) {
+		*a = (struct ceph_authorizer *)authorizer;
+		*buf = authorizer->buf->vec.iov_base;
+		*len = authorizer->buf->vec.iov_len;
+		*reply_buf = authorizer->reply_buf;
+		*reply_len = sizeof(authorizer->reply_buf);
+		return 0;
+	}
+
+	kfree(authorizer);
+	return err;
+}
+
+/*
+ * Check the peer's reply to authorizer @a: decrypt au->reply_buf with
+ * the service's session key and verify that it carries our nonce + 1.
+ * @len is not used.
+ *
+ * Returns 0 if the reply checks out, -EPERM if its size or nonce is
+ * wrong, or the error from the ticket lookup / decryption.
+ */
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+					  struct ceph_authorizer *a, size_t len)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+	struct ceph_x_authorize_reply reply;
+	struct ceph_x_ticket_handler *th;
+	void *p = au->reply_buf;
+	void *end = p + sizeof(au->reply_buf);
+	int ret;
+
+	th = get_ticket_handler(ac, au->service);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+	if (ret < 0)
+		return ret;
+	if (ret != sizeof(reply))
+		return -EPERM;
+
+	ret = (au->nonce + 1 == le64_to_cpu(reply.nonce_plus_one)) ? 0 : -EPERM;
+	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+	return ret;
+}
+
+/*
+ * Release an authorizer obtained from ceph_x_create_authorizer(): drop
+ * the reference on its buffer and free the authorizer.
+ */
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_buffer_put(au->buf);
+	kfree(au);
+}
+
+
+/*
+ * Restart the handshake: the next reply is treated as a server
+ * challenge again.  Tickets already held are left untouched.
+ */
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("reset\n");
+	xi->starting = true;
+	xi->server_challenge = 0;
+}
+
+/*
+ * Release all cephx state hanging off @ac: our copy of the secret,
+ * every ticket handler, the auth authorizer's buffer and finally the
+ * private structure itself.
+ */
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *n;
+
+	dout("ceph_x_destroy %p\n", ac);
+	ceph_crypto_key_destroy(&xi->secret);
+
+	/* removing a handler rebalances the tree, so restart from the head */
+	for (n = rb_first(&xi->ticket_handlers); n != NULL;
+	     n = rb_first(&xi->ticket_handlers))
+		remove_ticket_handler(ac,
+			rb_entry(n, struct ceph_x_ticket_handler, node));
+
+	if (xi->auth_authorizer.buf)
+		ceph_buffer_put(xi->auth_authorizer.buf);
+
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+/*
+ * Throw away the ticket handler for @peer_type.  Note that the lookup
+ * creates a handler if none exists, which is then removed right away.
+ * xi->have_keys is not modified here.
+ */
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+				   int peer_type)
+{
+	struct ceph_x_ticket_handler *th;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (!IS_ERR(th))
+		remove_ticket_handler(ac, th);
+}
+
+
+/* Handler table for the cephx ("x") auth protocol. */
+static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
+	.is_authenticated = ceph_x_is_authenticated,
+	.should_authenticate = ceph_x_should_authenticate,
+	.build_request = ceph_x_build_request,
+	.handle_reply = ceph_x_handle_reply,
+	.create_authorizer = ceph_x_create_authorizer,
+	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+	.destroy_authorizer = ceph_x_destroy_authorizer,
+	.invalidate_authorizer = ceph_x_invalidate_authorizer,
+	.reset =  ceph_x_reset,
+	.destroy = ceph_x_destroy,
+};
+
+
+/*
+ * Install the cephx protocol on @ac.  Requires ac->key, which is cloned
+ * into the handler's private state.
+ *
+ * Returns 0, -ENOMEM, -EINVAL if no key was configured, or the error
+ * from cloning the key.
+ */
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi;
+	int err;
+
+	dout("ceph_x_init %p\n", ac);
+
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		return -ENOMEM;
+
+	if (!ac->key) {
+		pr_err("no secret set (for auth_x protocol)\n");
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	err = ceph_crypto_key_clone(&xi->secret, ac->key);
+	if (err < 0) {
+		pr_err("cannot clone key: %d\n", err);
+		goto err_free;
+	}
+
+	xi->ticket_handlers = RB_ROOT;
+	xi->starting = true;
+
+	ac->ops = &ceph_x_ops;
+	ac->private = xi;
+	ac->protocol = CEPH_AUTH_CEPHX;
+	return 0;
+
+err_free:
+	kfree(xi);
+	return err;
+}
+
+
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 00000000..e02da7a5
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+	struct rb_node node;		/* in ceph_x_info.ticket_handlers */
+	unsigned service;		/* tree key */
+
+	struct ceph_crypto_key session_key;
+	struct ceph_timespec validity;	/* as received with the ticket */
+
+	u64 secret_id;
+	struct ceph_buffer *ticket_blob;	/* NULL until a ticket arrives */
+
+	/* in seconds, compared against get_seconds() */
+	unsigned long renew_after, expires;
+};
+
+
+/*
+ * An authorizer presented to a service, plus the state needed to check
+ * the service's reply.
+ */
+struct ceph_x_authorizer {
+	struct ceph_buffer *buf;	/* encoded authorizer */
+	unsigned service;		/* service the ticket is for */
+	u64 nonce;			/* reply must carry nonce + 1 */
+	char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+/* Private state of the cephx protocol handler (ac->private). */
+struct ceph_x_info {
+	struct ceph_crypto_key secret;	/* clone of ac->key */
+
+	bool starting;			/* next reply is the server challenge */
+	u64 server_challenge;
+
+	unsigned have_keys;		/* bitmask of services we hold tickets for */
+	struct rb_root ticket_handlers;	/* ceph_x_ticket_handler, by service */
+
+	/* authorizer sent to the auth service when requesting tickets */
+	struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 00000000..671d3057
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+
+/*
+ * Packed, little-endian header that precedes a variable-length ticket
+ * blob.  blob[] is a flexible array member; blob_len is presumably its
+ * length in bytes (confirm against the encoder/decoder).
+ */
+struct ceph_x_ticket_blob {
+	__u8 struct_v;
+	__le64 secret_id;
+	__le32 blob_len;
+	char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+
+/* Packed request header: a single little-endian 16-bit op code. */
+struct ceph_x_request_header {
+	__le16 op;
+} __attribute__ ((packed));
+
+/* Packed reply header: little-endian op code followed by a result field. */
+struct ceph_x_reply_header {
+	__le16 op;
+	__le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/*
+ * initial hello (no reply header): a struct version byte followed by a
+ * little-endian 64-bit server challenge, packed
+ */
+struct ceph_x_server_challenge {
+	__u8 struct_v;
+	__le64 server_challenge;
+} __attribute__ ((packed));
+
+/*
+ * Packed, little-endian fixed part of an authenticate message; per the
+ * trailing comment a ticket blob follows it.
+ */
+struct ceph_x_authenticate {
+	__u8 struct_v;
+	__le64 client_challenge;
+	__le64 key;
+	/* ticket blob */
+} __attribute__ ((packed));
+
+/*
+ * Packed service ticket request.
+ * NOTE(review): "keys" is presumably a mask of the services whose
+ * tickets are wanted -- confirm against the request builder.
+ */
+struct ceph_x_service_ticket_request {
+	__u8 struct_v;
+	__le32 keys;
+} __attribute__ ((packed));
+
+/* Packed pair of little-endian 64-bit challenges: server's, then client's. */
+struct ceph_x_challenge_blob {
+	__le64 server_challenge;
+	__le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ *
+ * Piece "a" below is packed and little-endian; because it ends in a
+ * struct ceph_x_ticket_blob (itself ending in a flexible array), the
+ * blob bytes follow it directly.
+ */
+struct ceph_x_authorize_a {
+	__u8 struct_v;
+	__le64 global_id;
+	__le32 service_id;
+	struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+/* Packed authorizer piece carrying a little-endian 64-bit nonce. */
+struct ceph_x_authorize_b {
+	__u8 struct_v;
+	__le64 nonce;
+} __attribute__ ((packed));
+
+/*
+ * Packed authorizer reply; going by the field name it carries the
+ * request nonce plus one (confirm against the reply verification code).
+ */
+struct ceph_x_authorize_reply {
+	__u8 struct_v;
+	__le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encryption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+/*
+ * Packed header with a version byte and a little-endian 64-bit magic.
+ * NOTE(review): the magic is presumably CEPHX_ENC_MAGIC, checked after
+ * decryption -- confirm against the encrypt/decrypt helpers.
+ */
+struct ceph_x_encrypt_header {
+	__u8 struct_v;
+	__le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 00000000..bf3e6a13
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+
+/*
+ * Allocate a reference-counted buffer with @len bytes of payload.
+ *
+ * The payload is taken from kmalloc() when possible and from __vmalloc()
+ * otherwise; is_vmalloc records which, so that ceph_buffer_release() can
+ * free it with the matching routine.
+ *
+ * Returns the new buffer (kref initialised, iov_len and alloc_len set to
+ * @len), or NULL if either allocation fails.
+ */
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+	struct ceph_buffer *b;
+	void *data;
+	bool from_vmalloc = false;
+
+	b = kmalloc(sizeof(*b), gfp);
+	if (!b)
+		return NULL;
+
+	/* __GFP_NOWARN: a failure here is expected and handled below */
+	data = kmalloc(len, gfp | __GFP_NOWARN);
+	if (!data) {
+		data = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL);
+		if (!data) {
+			kfree(b);
+			return NULL;
+		}
+		from_vmalloc = true;
+	}
+
+	b->vec.iov_base = data;
+	b->vec.iov_len = len;
+	b->alloc_len = len;
+	b->is_vmalloc = from_vmalloc;
+	kref_init(&b->kref);
+	dout("buffer_new %p\n", b);
+	return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+/*
+ * kref release callback for a ceph_buffer: free the payload with the
+ * routine matching how it was allocated, then free the buffer itself.
+ */
+void ceph_buffer_release(struct kref *kref)
+{
+	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+	void *data = b->vec.iov_base;
+
+	dout("buffer_release %p\n", b);
+	if (data) {
+		if (!b->is_vmalloc)
+			kfree(data);
+		else
+			vfree(data);
+	}
+	kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+/*
+ * Decode a length-prefixed blob at *@p into a newly allocated ceph_buffer.
+ *
+ * Reads a 32-bit length, then copies that many bytes into a buffer
+ * obtained from ceph_buffer_new() and stores it in *@b.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or
+ * -EINVAL when a ceph_decode_need() bounds check against @end jumps to
+ * the "bad" label.
+ */
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+	size_t len;
+
+	ceph_decode_need(p, end, sizeof(u32), bad);
+	len = ceph_decode_32(p);
+	dout("decode_buffer len %d\n", (int)len);
+	/* make sure the payload is really there before allocating for it */
+	ceph_decode_need(p, end, len, bad);
+	*b = ceph_buffer_new(len, GFP_NOFS);
+	if (!*b)
+		return -ENOMEM;
+	ceph_decode_copy(p, (*b)->vec.iov_base, len);
+	return 0;
+bad:
+	return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 00000000..132963ab
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,622 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/key.h>
+#include <keys/ceph-type.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include "crypto.h"
+
+
+
+/*
+ * Return a pointer to the final path component of the @len-byte string
+ * @s, i.e. to the byte just after the last '/' (/foo/bar/baz -> baz).
+ * If there is no '/', @s itself is returned.
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *p;
+
+	/* walk backwards from the end until the previous byte is a '/' */
+	for (p = s + len; p != s; p--) {
+		if (p[-1] == '/')
+			break;
+	}
+	return p;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+/*
+ * Map a CEPH_MSG_* message type code to a short human-readable name.
+ * Codes without an entry below yield "unknown".
+ */
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+	default: return "unknown";
+	}
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ *
+ * The first call records @fsid on @client and sets up the client's
+ * debugfs entries; later calls compare @fsid with the recorded value.
+ *
+ * Returns 0 if the fsid was learned or matches, -1 on a mismatch.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			/* terminate the line so the next log message starts cleanly */
+			pr_err("bad fsid, had %pU got %pU\n",
+			       &client->fsid, fsid);
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+/*
+ * strcmp() that tolerates NULL arguments: two NULLs compare equal, a
+ * lone non-NULL @s1 yields -1 and a lone non-NULL @s2 yields 1.
+ */
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (s1 && s2)
+		return strcmp(s1, s2);
+	if (s1)
+		return -1;
+	return s2 ? 1 : 0;
+}
+
+/*
+ * Compare a freshly parsed option set against the options of an existing
+ * client.
+ *
+ * Returns 0 when everything in struct ceph_options that precedes mon_addr
+ * is bytewise equal, the names and keys match, and at least one monitor
+ * address in @new_opt is present in @client's monmap.  Any other outcome
+ * returns non-zero; the sign is not a consistent ordering and should only
+ * be read as "different".
+ */
+int ceph_compare_options(struct ceph_options *new_opt,
+			 struct ceph_client *client)
+{
+	struct ceph_options *opt1 = new_opt;
+	struct ceph_options *opt2 = client->options;
+	int ofs = offsetof(struct ceph_options, mon_addr);
+	int i;
+	int ret;
+
+	/* plain-data members laid out before mon_addr are compared in bulk */
+	ret = memcmp(opt1, opt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->name, opt2->name);
+	if (ret)
+		return ret;
+
+	if (opt1->key && !opt2->key)
+		return -1;
+	if (!opt1->key && opt2->key)
+		return 1;
+	if (opt1->key && opt2->key) {
+		if (opt1->key->type != opt2->key->type)
+			return -1;
+		if (opt1->key->created.tv_sec != opt2->key->created.tv_sec)
+			return -1;
+		if (opt1->key->created.tv_nsec != opt2->key->created.tv_nsec)
+			return -1;
+		if (opt1->key->len != opt2->key->len)
+			return -1;
+		if (opt1->key->key && !opt2->key->key)
+			return -1;
+		if (!opt1->key->key && opt2->key->key)
+			return 1;
+		if (opt1->key->key && opt2->key->key) {
+			/* lengths were checked equal above */
+			ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len);
+			if (ret)
+				return ret;
+		}
+	}
+
+	/* any matching mon ip implies a match */
+	for (i = 0; i < opt1->num_mon; i++) {
+		if (ceph_monmap_contains(client->monc.monmap,
+				 &opt1->mon_addr[i]))
+			return 0;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+/*
+ * Parse a textual fsid into @fsid.
+ *
+ * @str is consumed as pairs of hex digits, one pair per byte; punctuation
+ * found where a pair would start (e.g. '-' or ':') is skipped.  Parsing
+ * stops after 16 bytes or at the first character that fits neither rule.
+ *
+ * Returns 0 if all 16 bytes were parsed, -EINVAL otherwise (in which case
+ * @fsid may have been partially written).
+ */
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+	int i = 0;
+	char tmp[3];
+	int err = -EINVAL;
+	unsigned int d;		/* %x stores through an unsigned int pointer */
+
+	dout("parse_fsid '%s'\n", str);
+	tmp[2] = 0;
+	while (*str && i < 16) {
+		if (ispunct(*str)) {
+			str++;
+			continue;
+		}
+		/* isxdigit('\0') is false, so str[1] is never read past the NUL */
+		if (!isxdigit(str[0]) || !isxdigit(str[1]))
+			break;
+		tmp[0] = str[0];
+		tmp[1] = str[1];
+		if (sscanf(tmp, "%x", &d) < 1)
+			break;
+		fsid->fsid[i] = d & 0xff;
+		i++;
+		str += 2;
+	}
+
+	if (i == 16)
+		err = 0;
+	dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
+	return err;
+}
+
+/*
+ * ceph options
+ *
+ * The order matters: ceph_parse_options() treats every token below
+ * Opt_last_int as taking an integer argument and every token between
+ * Opt_last_int and Opt_last_string as taking a string argument.  The two
+ * Opt_last_* entries are range markers only and have no opt_tokens entry.
+ */
+enum {
+	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
+	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
+	Opt_last_int,
+	/* int args above */
+	Opt_fsid,
+	Opt_name,
+	Opt_secret,
+	Opt_key,
+	Opt_ip,
+	Opt_last_string,
+	/* string args above */
+	Opt_noshare,
+	Opt_nocrc,
+};
+
+/*
+ * Pattern table handed to match_token() by ceph_parse_options(); it maps
+ * each option spelling to its Opt_* token.  The {-1, NULL} entry ends the
+ * table.
+ */
+static match_table_t opt_tokens = {
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	/* int args above */
+	{Opt_fsid, "fsid=%s"},
+	{Opt_name, "name=%s"},
+	{Opt_secret, "secret=%s"},
+	{Opt_key, "key=%s"},
+	{Opt_ip, "ip=%s"},
+	/* string args above */
+	{Opt_noshare, "noshare"},
+	{Opt_nocrc, "nocrc"},
+	{-1, NULL}
+};
+
+/*
+ * Free an option set produced by ceph_parse_options(), including its
+ * name, its key (and key material) and its monitor address array.
+ */
+void ceph_destroy_options(struct ceph_options *opt)
+{
+	dout("destroy_options %p\n", opt);
+	kfree(opt->name);
+	if (opt->key) {
+		ceph_crypto_key_destroy(opt->key);
+		kfree(opt->key);
+	}
+	/* allocated with kcalloc() in ceph_parse_options(); was leaked before */
+	kfree(opt->mon_addr);
+	kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+/*
+ * get secret from key store
+ *
+ * Looks up the ceph-type key called @name and clones its payload into
+ * @dst.  Returns 0 on success, -EPERM if the key cannot be obtained (the
+ * underlying reason is only logged), or the error from
+ * ceph_crypto_key_clone().
+ */
+static int get_secret(struct ceph_crypto_key *dst, const char *name)
+{
+	struct key *ukey;
+	int ret;
+
+	ukey = request_key(&key_type_ceph, name, NULL);
+	if (!ukey || IS_ERR(ukey)) {
+		/* request_key errors don't map nicely to mount(2)
+		   errors; don't even try, but still printk */
+		int key_err = PTR_ERR(ukey);
+
+		switch (key_err) {
+		case -ENOKEY:
+			pr_warning("ceph: Mount failed due to key not found: %s\n", name);
+			break;
+		case -EKEYEXPIRED:
+			pr_warning("ceph: Mount failed due to expired key: %s\n", name);
+			break;
+		case -EKEYREVOKED:
+			pr_warning("ceph: Mount failed due to revoked key: %s\n", name);
+			break;
+		default:
+			pr_warning("ceph: Mount failed due to unknown key error"
+			       " %d: %s\n", key_err, name);
+		}
+		return -EPERM;
+	}
+
+	/* the reference on ukey is dropped whether or not the clone worked */
+	ret = ceph_crypto_key_clone(dst, ukey->payload.data);
+	key_put(ukey);
+	return ret;
+}
+
+/*
+ * Give @opt a fresh, zeroed key, releasing any key left behind by an
+ * earlier secret=/key= option.  Returns 0 or -ENOMEM.
+ */
+static int ceph_options_new_key(struct ceph_options *opt)
+{
+	if (opt->key) {
+		ceph_crypto_key_destroy(opt->key);
+		kfree(opt->key);
+	}
+	opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+	return opt->key ? 0 : -ENOMEM;
+}
+
+/*
+ * Build a ceph_options from a monitor address list and a mount option
+ * string.
+ *
+ * @popt: receives the new option set on success
+ * @options: comma separated option string; modified in place by strsep()
+ * @dev_name, @dev_name_end: monitor list, ip1[:port1][,ip2[:port2]...]
+ * @parse_extra_token: optional callback offered every option this table
+ *	does not know; a negative return from it aborts parsing
+ * @private: opaque cookie passed to @parse_extra_token
+ *
+ * Returns 0 on success or a negative error; on error nothing is stored in
+ * @popt and the partially built option set is destroyed.  An integer
+ * option whose argument does not parse is logged and skipped rather than
+ * treated as fatal.
+ */
+int ceph_parse_options(struct ceph_options **popt, char *options,
+		       const char *dev_name, const char *dev_name_end,
+		       int (*parse_extra_token)(char *c, void *private),
+		       void *private)
+{
+	struct ceph_options *opt;
+	const char *c;
+	int err = -ENOMEM;
+	substring_t argstr[MAX_OPT_ARGS];
+
+	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+	if (!opt)
+		return err;
+	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+				GFP_KERNEL);
+	if (!opt->mon_addr)
+		goto out;
+
+	dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+	     dev_name);
+
+	/* start with defaults */
+	opt->flags = CEPH_OPT_DEFAULT;
+	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+	/* get mon ip(s) */
+	/* ip1[:port1][,ip2[:port2]...] */
+	err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+			     CEPH_MAX_MON, &opt->num_mon);
+	if (err < 0)
+		goto out;
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		err = -EINVAL;
+		token = match_token((char *)c, opt_tokens, argstr);
+		if (token < 0) {
+			if (!parse_extra_token) {
+				/*
+				 * Nobody to offer the option to.  Reject it
+				 * here: falling through would run match_int()
+				 * on stale argstr[] and end in BUG_ON() below.
+				 */
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			/* extra? */
+			err = parse_extra_token((char *)c, private);
+			if (err < 0) {
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			continue;
+		}
+		if (token < Opt_last_int) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
+		}
+		switch (token) {
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &opt->my_addr,
+					     1, NULL);
+			if (err < 0)
+				goto out;
+			opt->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_fsid:
+			err = parse_fsid(argstr[0].from, &opt->fsid);
+			if (err == 0)
+				opt->flags |= CEPH_OPT_FSID;
+			break;
+		case Opt_name:
+			/* a repeated name= replaces, not leaks, the old one */
+			kfree(opt->name);
+			opt->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			if (!opt->name) {
+				err = -ENOMEM;
+				goto out;
+			}
+			break;
+		case Opt_secret:
+			err = ceph_options_new_key(opt);
+			if (err < 0)
+				goto out;
+			err = ceph_crypto_key_unarmor(opt->key, argstr[0].from);
+			if (err < 0)
+				goto out;
+			break;
+		case Opt_key:
+			err = ceph_options_new_key(opt);
+			if (err < 0)
+				goto out;
+			err = get_secret(opt->key, argstr[0].from);
+			if (err < 0)
+				goto out;
+			break;
+
+			/* misc */
+		case Opt_osdtimeout:
+			opt->osd_timeout = intval;
+			break;
+		case Opt_osdkeepalivetimeout:
+			opt->osd_keepalive_timeout = intval;
+			break;
+		case Opt_osd_idle_ttl:
+			opt->osd_idle_ttl = intval;
+			break;
+		case Opt_mount_timeout:
+			opt->mount_timeout = intval;
+			break;
+
+		case Opt_noshare:
+			opt->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_nocrc:
+			opt->flags |= CEPH_OPT_NOCRC;
+			break;
+
+		default:
+			BUG_ON(token);
+		}
+	}
+
+	/* success */
+	*popt = opt;
+	return 0;
+
+out:
+	ceph_destroy_options(opt);
+	return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+/*
+ * Return the global_id held by the client's monitor-client auth state.
+ */
+u64 ceph_client_id(struct ceph_client *client)
+{
+	u64 id = client->monc.auth->global_id;
+
+	return id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ *
+ * The new client stores @opt and @private and has its monitor and osd
+ * clients initialised; the messenger is left unset (NULL).
+ *
+ * Returns the client, or an ERR_PTR() carrying -ENOMEM or the error from
+ * ceph_monc_init()/ceph_osdc_init().
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+	struct ceph_client *client;
+	int err;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return ERR_PTR(-ENOMEM);
+
+	client->options = opt;
+	client->private = private;
+	client->msgr = NULL;
+	client->extra_mon_dispatch = NULL;
+	client->auth_err = 0;
+
+	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+
+	mutex_init(&client->mount_mutex);
+	init_waitqueue_head(&client->auth_wq);
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto err_free;
+
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0) {
+		/* undo the monitor client before dropping the client */
+		ceph_monc_stop(&client->monc);
+		goto err_free;
+	}
+
+	return client;
+
+err_free:
+	kfree(client);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+/*
+ * Tear down a client made by ceph_create_client(): stop the osd and
+ * monitor clients, remove its debugfs entries, destroy the messenger if
+ * one was created, then free the client's options and the client itself.
+ * The order of the steps below is deliberate (see the inline comment).
+ */
+void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_osdc_stop(&client->osdc);
+
+	/*
+	 * make sure osd connections close out before destroying the
+	 * auth module, which is needed to free those connections'
+	 * ceph_authorizers.
+	 */
+	ceph_msgr_flush();
+
+	ceph_monc_stop(&client->monc);
+
+	ceph_debugfs_client_cleanup(client);
+
+	/* msgr stays NULL until __ceph_open_session() creates it */
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+
+	ceph_destroy_options(client->options);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we hold both a monmap and an osdmap, each with a non-zero
+ * epoch (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+	if (!client->monc.monmap || !client->monc.monmap->epoch)
+		return 0;
+
+	return client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster.
+ *
+ * Creates the messenger if this client does not have one yet, opens the
+ * monitor session, then waits until both a monmap and an osdmap have
+ * been received.
+ *
+ * @started is the jiffies value the mount_timeout option is measured
+ * from.
+ *
+ * Returns 0 on success; the error from ceph_messenger_create() or
+ * ceph_monc_open_session(); -EIO once the mount timeout has expired;
+ * -EINTR/-ERESTARTSYS if the wait is interrupted; or the client's
+ * auth_err if authentication failed.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->options->mount_timeout * HZ;
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->options->my_addr;
+		client->msgr = ceph_messenger_create(myaddr,
+					client->supported_features,
+					client->required_features);
+		if (IS_ERR(client->msgr)) {
+			/*
+			 * Fetch the error before clearing the pointer;
+			 * PTR_ERR(NULL) is 0 and would report success.
+			 */
+			err = PTR_ERR(client->msgr);
+			client->msgr = NULL;
+			return err;
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* open session, and wait for mon and osd maps */
+	err = ceph_monc_open_session(&client->monc);
+	if (err < 0)
+		return err;
+
+	while (!have_mon_and_osd_map(client)) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			return err;
+
+		/* wait */
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->auth_wq,
+			have_mon_and_osd_map(client) || (client->auth_err < 0),
+			timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			return err;
+		if (client->auth_err < 0)
+			return client->auth_err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+/*
+ * Open a session while holding the client's mount_mutex.  The start time
+ * used for the mount timeout is sampled before the mutex is taken.
+ * Returns whatever __ceph_open_session() returns.
+ */
+int ceph_open_session(struct ceph_client *client)
+{
+	unsigned long started = jiffies;  /* note the start time */
+	int err;
+
+	dout("open_session start\n");
+
+	mutex_lock(&client->mount_mutex);
+	err = __ceph_open_session(client, started);
+	mutex_unlock(&client->mount_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+/*
+ * Module init: bring up debugfs, crypto and the messenger in that order,
+ * unwinding what was already set up if a later step fails.
+ */
+static int __init init_ceph_lib(void)
+{
+	int err;
+
+	err = ceph_debugfs_init();
+	if (err < 0)
+		return err;
+
+	err = ceph_crypto_init();
+	if (err < 0)
+		goto fail_debugfs;
+
+	err = ceph_msgr_init();
+	if (err < 0)
+		goto fail_crypto;
+
+	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+	return 0;
+
+fail_crypto:
+	ceph_crypto_shutdown();
+fail_debugfs:
+	ceph_debugfs_cleanup();
+	return err;
+}
+
+/*
+ * Module exit: shut the subsystems down in the reverse of the order
+ * init_ceph_lib() started them.
+ */
+static void __exit exit_ceph_lib(void)
+{
+	dout("exit_ceph_lib\n");
+	ceph_msgr_exit();
+	ceph_crypto_shutdown();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 00000000..41466ccb
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,78 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ *
+ * Valid means: stripe unit and object size are non-zero multiples of
+ * CEPH_MIN_STRIPE_UNIT, object size is a multiple of the stripe unit,
+ * and the stripe count is non-zero.
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+	const __u32 align_mask = CEPH_MIN_STRIPE_UNIT - 1;
+	__u32 stripe_unit = le32_to_cpu(layout->fl_stripe_unit);
+	__u32 stripe_count = le32_to_cpu(layout->fl_stripe_count);
+	__u32 object_size = le32_to_cpu(layout->fl_object_size);
+
+	/* none of the three may be zero (this also guards the % below) */
+	if (stripe_unit == 0 || object_size == 0 || stripe_count == 0)
+		return 0;
+	/* stripe unit and object size must both be 64k increments */
+	if ((stripe_unit | object_size) & align_mask)
+		return 0;
+	/* object size must be a multiple of stripe unit */
+	if (object_size < stripe_unit || object_size % stripe_unit)
+		return 0;
+	return 1;
+}
+
+
+/*
+ * Translate open(2)-style @flags into a CEPH_FILE_MODE_* value.
+ *
+ * When O_DIRECTORY is defined and set in @flags the result is
+ * CEPH_FILE_MODE_PIN; otherwise the access mode selects RD, WR or RDWR,
+ * with CEPH_FILE_MODE_LAZY or-ed in when O_LAZY is defined and set.
+ */
+int ceph_flags_to_mode(int flags)
+{
+	int accmode = flags & O_ACCMODE;
+	int mode;
+
+#ifdef O_DIRECTORY  /* fixme */
+	if ((flags & O_DIRECTORY) == O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;
+#endif
+
+	if (accmode == O_WRONLY) {
+		mode = CEPH_FILE_MODE_WR;
+	} else if (accmode == O_RDONLY) {
+		mode = CEPH_FILE_MODE_RD;
+	} else if (accmode == O_RDWR || accmode == O_ACCMODE) {
+		/* O_ACCMODE: this is what the VFS does */
+		mode = CEPH_FILE_MODE_RDWR;
+	}
+#ifdef O_LAZY
+	if (flags & O_LAZY)
+		mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+	return mode;
+}
+EXPORT_SYMBOL(ceph_flags_to_mode);
+
+/*
+ * Return the capability bits implied by the CEPH_FILE_MODE_* bits in
+ * @mode.  CEPH_CAP_PIN is always included.
+ */
+int ceph_caps_for_mode(int mode)
+{
+	const int rd_caps = CEPH_CAP_FILE_SHARED |
+			    CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	const int wr_caps = CEPH_CAP_FILE_EXCL |
+			    CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			    CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			    CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	int caps = CEPH_CAP_PIN;
+
+	if (mode & CEPH_FILE_MODE_RD)
+		caps |= rd_caps;
+	if (mode & CEPH_FILE_MODE_WR)
+		caps |= wr_caps;
+	if (mode & CEPH_FILE_MODE_LAZY)
+		caps |= CEPH_CAP_FILE_LAZYIO;
+
+	return caps;
+}
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 00000000..0a1b53bc
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,121 @@
+
+#include <linux/ceph/types.h>
+#include <linux/module.h>
+
+/*
+ * Robert Jenkins' hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+/*
+ * mix(a, b, c): scramble three variables in place.  Each argument is
+ * expanded many times and assigned to, so pass only plain lvalues
+ * (the caller below uses __u32 locals).
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+/*
+ * Hash the @length bytes at @str with the Jenkins mixing function above.
+ *
+ * Input is consumed 12 bytes at a time, each group of four bytes being
+ * assembled least-significant byte first; the remaining 0-11 bytes and
+ * the total length are folded in before a final mix.  Returns the final
+ * value of c.
+ */
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last 11 bytes */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);
+
+	return c;
+}
+
+/*
+ * linux dcache hash
+ *
+ * Folds each of the @length bytes of @str, taken as an unsigned char,
+ * into the running value as (hash + (c << 4) + (c >> 4)) * 11.
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+	unsigned long hash = 0;
+	unsigned i;
+
+	for (i = 0; i < length; i++) {
+		unsigned char c = str[i];
+
+		hash = (hash + (c << 4) + (c >> 4)) * 11;
+	}
+	return hash;
+}
+
+
+/*
+ * Hash the @len bytes at @s with the function selected by @type
+ * (CEPH_STR_HASH_*).  An unrecognised @type yields (unsigned)-1.
+ */
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	if (type == CEPH_STR_HASH_LINUX)
+		return ceph_str_hash_linux(s, len);
+	if (type == CEPH_STR_HASH_RJENKINS)
+		return ceph_str_hash_rjenkins(s, len);
+	return -1;
+}
+EXPORT_SYMBOL(ceph_str_hash);
+
+/*
+ * Name of the string hash selected by @type (CEPH_STR_HASH_*), or
+ * "unknown" for any other value.
+ */
+const char *ceph_str_hash_name(int type)
+{
+	if (type == CEPH_STR_HASH_LINUX)
+		return "linux";
+	if (type == CEPH_STR_HASH_RJENKINS)
+		return "rjenkins";
+	return "unknown";
+}
+EXPORT_SYMBOL(ceph_str_hash_name);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 00000000..3fbda04d
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * Map a CEPH_ENTITY_TYPE_* value to a short name; anything else yields
+ * "unknown".
+ */
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+
+/*
+ * Map a CEPH_OSD_OP_* code to a short name; codes without an entry
+ * below yield "???".
+ */
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ: return "read";
+	case CEPH_OSD_OP_STAT: return "stat";
+
+	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+	case CEPH_OSD_OP_WRITE: return "write";
+	case CEPH_OSD_OP_DELETE: return "delete";
+	case CEPH_OSD_OP_TRUNCATE: return "truncate";
+	case CEPH_OSD_OP_ZERO: return "zero";
+	case CEPH_OSD_OP_WRITEFULL: return "writefull";
+	case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+	case CEPH_OSD_OP_APPEND: return "append";
+	case CEPH_OSD_OP_STARTSYNC: return "startsync";
+	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+	case CEPH_OSD_OP_TMAPUP: return "tmapup";
+	case CEPH_OSD_OP_TMAPGET: return "tmapget";
+	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+	case CEPH_OSD_OP_GETXATTR: return "getxattr";
+	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+	case CEPH_OSD_OP_SETXATTR: return "setxattr";
+	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+	case CEPH_OSD_OP_PULL: return "pull";
+	case CEPH_OSD_OP_PUSH: return "push";
+	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+	case CEPH_OSD_OP_SCRUB: return "scrub";
+
+	case CEPH_OSD_OP_WRLOCK: return "wrlock";
+	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+	case CEPH_OSD_OP_RDLOCK: return "rdlock";
+	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+	case CEPH_OSD_OP_UPLOCK: return "uplock";
+	case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+	case CEPH_OSD_OP_CALL: return "call";
+
+	case CEPH_OSD_OP_PGLS: return "pgls";
+	}
+	return "???";
+}
+
+
+/*
+ * Map a POOL_OP_* code to a short description; codes without an entry
+ * below yield "???".
+ */
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 00000000..d6ebb13a
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include <linux/crush/crush.h>
+
+/*
+ * Map a CRUSH_BUCKET_* algorithm id to its name; anything else yields
+ * "unknown".
+ */
+const char *crush_bucket_alg_name(int alg)
+{
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM: return "uniform";
+	case CRUSH_BUCKET_LIST: return "list";
+	case CRUSH_BUCKET_TREE: return "tree";
+	case CRUSH_BUCKET_STRAW: return "straw";
+	default: return "unknown";
+	}
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ *
+ * Returns the weight stored for position @p.  Returns 0 when @p is
+ * negative or not below @b->size, when the bucket algorithm is not one
+ * of the four known ones, or for an even @p in a tree bucket.
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+	/* reject a negative index explicitly, whatever the type of size */
+	if (p < 0 || p >= b->size)
+		return 0;
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		/* uniform buckets keep a single weight shared by all items */
+		return ((struct crush_bucket_uniform *)b)->item_weight;
+	case CRUSH_BUCKET_LIST:
+		return ((struct crush_bucket_list *)b)->item_weights[p];
+	case CRUSH_BUCKET_TREE:
+		if (p & 1)
+			return ((struct crush_bucket_tree *)b)->node_weights[p];
+		return 0;
+	case CRUSH_BUCKET_STRAW:
+		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	}
+	return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ *
+ * For every item of every bucket, records the id of the containing
+ * bucket: non-negative items index @map->device_parents directly, while
+ * negative items index @map->bucket_parents at -1 - item.
+ * BUG()s if an item is outside [-max_buckets, max_devices).
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+	int i, b, c;
+
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == NULL)
+			continue;
+		for (i = 0; i < map->buckets[b]->size; i++) {
+			c = map->buckets[b]->items[i];
+			BUG_ON(c >= map->max_devices ||
+			       c < -map->max_buckets);
+			if (c >= 0)
+				map->device_parents[c] = map->buckets[b]->id;
+			else
+				map->bucket_parents[-1-c] = map->buckets[b]->id;
+		}
+	}
+}
+
+/* Free a uniform bucket: the common item and perm arrays, then @b. */
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+	kfree(b->h.items);
+	kfree(b->h.perm);
+	kfree(b);
+}
+
+/*
+ * Free a list bucket: the common item and perm arrays, the per-item and
+ * summed weight arrays, then @b.
+ */
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+	kfree(b->h.items);
+	kfree(b->h.perm);
+	kfree(b->sum_weights);
+	kfree(b->item_weights);
+	kfree(b);
+}
+
+/*
+ * Free a tree bucket: the common item and perm arrays (freed here like
+ * in the uniform, list and straw destructors; they were previously
+ * leaked), the node weight array, then @b.
+ */
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b->node_weights);
+	kfree(b);
+}
+
+/*
+ * Free a straw bucket: the common item and perm arrays, the per-item
+ * weight and straw arrays, then @b.
+ */
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+	kfree(b->h.items);
+	kfree(b->h.perm);
+	kfree(b->item_weights);
+	kfree(b->straws);
+	kfree(b);
+}
+
+/*
+ * Free @b with the destructor matching its algorithm.  A bucket whose
+ * algorithm is none of the four known ones is left untouched.
+ */
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	if (b->alg == CRUSH_BUCKET_UNIFORM)
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+	else if (b->alg == CRUSH_BUCKET_LIST)
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+	else if (b->alg == CRUSH_BUCKET_TREE)
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+	else if (b->alg == CRUSH_BUCKET_STRAW)
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ *
+ * Frees every bucket and rule the map holds, the bucket and rule
+ * arrays, both parent vectors, and finally @map itself.
+ */
+void crush_destroy(struct crush_map *map)
+{
+	int i;
+
+	/* buckets */
+	if (map->buckets) {
+		for (i = 0; i < map->max_buckets; i++) {
+			if (map->buckets[i] != NULL)
+				crush_destroy_bucket(map->buckets[i]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* rules */
+	if (map->rules) {
+		for (i = 0; i < map->max_rules; i++) {
+			kfree(map->rules[i]);
+		}
+		kfree(map->rules);
+	}
+
+	kfree(map->bucket_parents);
+	kfree(map->device_parents);
+	kfree(map);
+}
+
+
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 00000000..5bb63e37
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ *
+ * All three arguments are assigned to, and none of them is parenthesized
+ * in the expansion, so each must be a plain variable name (the functions
+ * in this file pass their own parameters and locals).
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+/* starting value that every rjenkins1 variant XORs with its inputs */
+#define crush_hash_seed 1315423911
+
+/*
+ * rjenkins1 hash of one 32-bit value.  The first mixing round works on a
+ * copy of @a so that the second round still sees the caller's value.
+ */
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 a_copy = a;
+	__u32 h = crush_hash_seed ^ a;
+
+	crush_hashmix(a_copy, k1, h);
+	crush_hashmix(k2, a, h);
+	return h;
+}
+
+/*
+ * rjenkins1 hash of two 32-bit values.  Each mixing round updates its
+ * arguments in place, so later rounds see the already-mixed @a and @b.
+ */
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 h = crush_hash_seed ^ a ^ b;
+
+	crush_hashmix(a, b, h);
+	crush_hashmix(k1, a, h);
+	crush_hashmix(b, k2, h);
+	return h;
+}
+
+/*
+ * rjenkins1 hash of three 32-bit values.  Each mixing round updates its
+ * arguments in place, so the order of the rounds is significant.
+ */
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 h = crush_hash_seed ^ a ^ b ^ c;
+
+	crush_hashmix(a, b, h);
+	crush_hashmix(c, k1, h);
+	crush_hashmix(k2, a, h);
+	crush_hashmix(b, k1, h);
+	crush_hashmix(k2, c, h);
+	return h;
+}
+
+/*
+ * rjenkins1 hash of four 32-bit values.  Each mixing round updates its
+ * arguments in place, so the order of the rounds is significant.
+ */
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 h = crush_hash_seed ^ a ^ b ^ c ^ d;
+
+	crush_hashmix(a, b, h);
+	crush_hashmix(c, d, h);
+	crush_hashmix(a, k1, h);
+	crush_hashmix(k2, b, h);
+	crush_hashmix(c, k1, h);
+	crush_hashmix(k2, d, h);
+	return h;
+}
+
+/*
+ * rjenkins1 hash of five 32-bit values.  Each mixing round updates its
+ * arguments in place, so the order of the rounds is significant.
+ */
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+				      __u32 e)
+{
+	__u32 k1 = 231232, k2 = 1232;
+	__u32 h = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+
+	crush_hashmix(a, b, h);
+	crush_hashmix(c, d, h);
+	crush_hashmix(e, k1, h);
+	crush_hashmix(k2, a, h);
+	crush_hashmix(b, k1, h);
+	crush_hashmix(k2, c, h);
+	crush_hashmix(d, k1, h);
+	crush_hashmix(k2, e, h);
+	return h;
+}
+
+
+/*
+ * Hash one value with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32(int type, __u32 a)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1(a);
+	return 0;
+}
+
+/*
+ * Hash two values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_2(a, b);
+	return 0;
+}
+
+/*
+ * Hash three values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_3(a, b, c);
+	return 0;
+}
+
+/*
+ * Hash four values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+	return 0;
+}
+
+/*
+ * Hash five values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+	return 0;
+}
+
+/*
+ * Return a printable name for hash algorithm @type, or "unknown" when
+ * @type is not a recognized algorithm.
+ */
+const char *crush_hash_name(int type)
+{
+	if (type == CRUSH_HASH_RJENKINS1)
+		return "rjenkins1";
+	return "unknown";
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 00000000..42599e31
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ *
+ * Returns the index of the first populated rule slot whose mask matches
+ * @ruleset and @type and whose [min_size, max_size] range contains
+ * @size, or -1 if there is none.
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+	int i;
+
+	for (i = 0; i < map->max_rules; i++) {
+		struct crush_rule *rule = map->rules[i];
+
+		if (!rule)
+			continue;
+		if (rule->mask.ruleset != ruleset || rule->mask.type != type)
+			continue;
+		if (rule->mask.min_size > size || rule->mask.max_size < size)
+			continue;
+		return i;
+	}
+	return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ *
+ * The permutation is cached in the bucket itself (perm_x, perm_n and
+ * perm[]), so this function modifies @bucket.  bucket->size is used as
+ * a modulus and therefore must be non-zero.
+ *
+ * Returns the item found at position (r % size) of the permutation
+ * for input @x.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+			      int x, int r)
+{
+	unsigned pr = r % bucket->size;
+	unsigned i, s;
+
+	/* start a new permutation if @x has changed */
+	if (bucket->perm_x != x || bucket->perm_n == 0) {
+		dprintk("bucket %d new x=%d\n", bucket->id, x);
+		bucket->perm_x = x;
+
+		/* optimize common r=0 case */
+		if (pr == 0) {
+			/* only slot 0 is computed; the rest is filled in lazily */
+			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+				bucket->size;
+			bucket->perm[0] = s;
+			bucket->perm_n = 0xffff;   /* magic value, see below */
+			goto out;
+		}
+
+		/* start from the identity permutation, nothing fixed yet */
+		for (i = 0; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm_n = 0;
+	} else if (bucket->perm_n == 0xffff) {
+		/* clean up after the r=0 case above */
+		for (i = 1; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		/* slot 0 took perm[0]; park the displaced 0 in that index */
+		bucket->perm[bucket->perm[0]] = 0;
+		bucket->perm_n = 1;
+	}
+
+	/* calculate permutation up to pr */
+	for (i = 0; i < bucket->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+	while (bucket->perm_n <= pr) {
+		unsigned p = bucket->perm_n;
+		/* no point in swapping the final entry */
+		if (p < bucket->size - 1) {
+			/* pick one of the not-yet-fixed slots p..size-1 */
+			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+				(bucket->size - p);
+			if (i) {
+				unsigned t = bucket->perm[p + i];
+				bucket->perm[p + i] = bucket->perm[p];
+				bucket->perm[p] = t;
+			}
+			dprintk(" perm_choose swap %d with %d\n", p, p+i);
+		}
+		bucket->perm_n++;
+	}
+	for (i = 0; i < bucket->size; i++)
+		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+
+	s = bucket->perm[pr];
+out:
+	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+		bucket->size, x, r, pr, s);
+	return bucket->items[s];
+}
+
+/*
+ * uniform: choose purely from the random permutation of the bucket's
+ * items; this just forwards the common header to bucket_perm_choose().
+ */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+				 int x, int r)
+{
+	return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/*
+ * list: walk the items from the last index down to the first.  For each
+ * one, a 16-bit hash draw is scaled by the sum_weights entry at that
+ * index, and the item is taken when the scaled value is below its own
+ * item_weights entry.  Running off the front of the list is a BUG.
+ */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	int pos;
+
+	for (pos = bucket->h.size-1; pos >= 0; pos--) {
+		__u64 draw = crush_hash32_4(bucket->h.hash,x,
+					    bucket->h.items[pos],
+					    r, bucket->h.id);
+
+		draw &= 0xffff;
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			pos, x, r, bucket->h.items[pos],
+			bucket->item_weights[pos],
+			bucket->sum_weights[pos], draw);
+		/* 64-bit product, then drop the 16 fractional bits */
+		draw *= bucket->sum_weights[pos];
+		draw >>= 16;
+		if (draw < bucket->item_weights[pos])
+			return bucket->h.items[pos];
+	}
+
+	BUG_ON(1);
+	return 0;
+}
+
+
+/* (binary) tree */
+
+/*
+ * Number of trailing zero bits in @n.  Does not terminate for n == 0,
+ * so callers must pass a non-zero node number.
+ */
+static int height(int n)
+{
+	int h;
+
+	for (h = 0; (n & 1) == 0; n = n >> 1)
+		h++;
+	return h;
+}
+
+/*
+ * Node number of the left child of @x: half of @x's lowest set bit is
+ * subtracted.  Only meaningful for even @x (height of at least 1).
+ */
+static int left(int x)
+{
+	return x - (1 << (height(x) - 1));
+}
+
+/*
+ * Node number of the right child of @x: half of @x's lowest set bit is
+ * added.  Only meaningful for even @x (height of at least 1).
+ */
+static int right(int x)
+{
+	return x + (1 << (height(x) - 1));
+}
+
+/* non-zero when @x is odd; bucket_tree_choose() stops descending there */
+static int terminal(int x)
+{
+	return x & 1;
+}
+
+/*
+ * tree: descend from the root node (num_nodes >> 1) until an odd node
+ * number is reached.  At each inner node a hash draw is scaled into
+ * [0, weight of that node) and compared with the weight of the left
+ * child to decide which way to go.  The item returned is the one at
+ * index (node >> 1).
+ */
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	int node = bucket->num_nodes >> 1;	/* start at root */
+
+	while (!terminal(node)) {
+		__u32 w = bucket->node_weights[node];
+		int lchild = left(node);
+		__u64 point;
+
+		/* pick point in [0, w) */
+		point = (__u64)crush_hash32_4(bucket->h.hash, x, node, r,
+					      bucket->h.id) * (__u64)w;
+		point >>= 32;
+
+		/* descend to the left or right? */
+		if (point < bucket->node_weights[lchild])
+			node = lchild;
+		else
+			node = right(node);
+	}
+
+	return bucket->h.items[node >> 1];
+}
+
+
+/*
+ * straw: every item draws a 16-bit hash value that is multiplied by the
+ * item's straws[] entry; the item with the largest product wins, the
+ * earliest index winning ties.
+ */
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	int best = 0;
+	__u64 best_draw = 0;
+	int i;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		__u64 draw = crush_hash32_3(bucket->h.hash, x,
+					    bucket->h.items[i], r);
+
+		draw &= 0xffff;
+		draw *= bucket->straws[i];	/* 64-bit product */
+		if (i != 0 && draw <= best_draw)
+			continue;
+		best = i;
+		best_draw = draw;
+	}
+	return bucket->h.items[best];
+}
+
+/*
+ * Pick an item from bucket @in for input @x and replica position @r,
+ * using the choose method that matches the bucket's algorithm.  An
+ * unknown algorithm is a BUG.
+ */
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+
+	if (in->alg == CRUSH_BUCKET_UNIFORM)
+		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+					     x, r);
+	if (in->alg == CRUSH_BUCKET_LIST)
+		return bucket_list_choose((struct crush_bucket_list *)in,
+					  x, r);
+	if (in->alg == CRUSH_BUCKET_TREE)
+		return bucket_tree_choose((struct crush_bucket_tree *)in,
+					  x, r);
+	if (in->alg == CRUSH_BUCKET_STRAW)
+		return bucket_straw_choose((struct crush_bucket_straw *)in,
+					   x, r);
+
+	BUG_ON(1);
+	return in->items[0];
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ *
+ * A weight of 0x10000 or more is always in and a weight of 0 is always
+ * out.  Anything in between is in only when a 16-bit hash of (@x, @item)
+ * falls below the weight.  @map is not used.
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+	__u32 w = weight[item];
+
+	if (w >= 0x10000)
+		return 0;
+	if (w == 0)
+		return 1;
+	return (crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) >= w;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choosing an item from
+ * @weight: per-device weight vector, consulted through is_out()
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ *
+ * Returns the new output position, i.e. @outpos plus the number of items
+ * stored; replicas that could not be placed are skipped, so the result
+ * may be smaller than @numrep.
+ */
+static int crush_choose(struct crush_map *map,
+			struct crush_bucket *bucket,
+			__u32 *weight,
+			int x, int numrep, int type,
+			int *out, int outpos,
+			int firstn, int recurse_to_leaf,
+			int *out2)
+{
+	int rep;
+	int ftotal, flocal;	/* failures for this replica / this descent */
+	int retry_descent, retry_bucket, skip_rep;
+	struct crush_bucket *in = bucket;
+	int r;
+	int i;
+	int item = 0;
+	int itemtype;
+	int collide, reject;
+	const int orig_tries = 5; /* attempts before we fall back to search */
+
+	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep);
+
+	for (rep = outpos; rep < numrep; rep++) {
+		/* keep trying until we get a non-out, non-colliding item */
+		ftotal = 0;
+		skip_rep = 0;
+		do {
+			retry_descent = 0;
+			in = bucket;               /* initial bucket */
+
+			/* choose through intervening buckets */
+			flocal = 0;
+			do {
+				collide = 0;
+				retry_bucket = 0;
+				r = rep;
+				if (in->alg == CRUSH_BUCKET_UNIFORM) {
+					/* be careful */
+					if (firstn || numrep >= in->size)
+						/* r' = r + f_total */
+						r += ftotal;
+					else if (in->size % numrep == 0)
+						/* r'=r+(n+1)*f_local */
+						r += (numrep+1) *
+							(flocal+ftotal);
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				} else {
+					if (firstn)
+						/* r' = r + f_total */
+						r += ftotal;
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				}
+
+				/* bucket choose */
+				if (in->size == 0) {
+					/* nothing to pick from: count it as a rejection */
+					reject = 1;
+					goto reject;
+				}
+				/* after enough local failures, use the permutation instead */
+				if (flocal >= (in->size>>1) &&
+				    flocal > orig_tries)
+					item = bucket_perm_choose(in, x, r);
+				else
+					item = crush_bucket_choose(in, x, r);
+				BUG_ON(item >= map->max_devices);
+
+				/* desired type? */
+				/* negative items are buckets, stored at index -1-item */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					BUG_ON(item >= 0 ||
+					       (-1-item) >= map->max_buckets);
+					in = map->buckets[-1-item];
+					retry_bucket = 1;
+					continue;
+				}
+
+				/* collision? */
+				for (i = 0; i < outpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+
+				reject = 0;
+				if (recurse_to_leaf) {
+					if (item < 0) {
+						/* pick one device (type 0) below this bucket into out2 */
+						if (crush_choose(map,
+							 map->buckets[-1-item],
+							 weight,
+							 x, outpos+1, 0,
+							 out2, outpos,
+							 firstn, 0,
+							 NULL) <= outpos)
+							/* didn't get leaf */
+							reject = 1;
+					} else {
+						/* we already have a leaf! */
+						out2[outpos] = item;
+					}
+				}
+
+				if (!reject) {
+					/* out? */
+					if (itemtype == 0)
+						reject = is_out(map, weight,
+								item, x);
+					else
+						reject = 0;
+				}
+
+reject:
+				if (reject || collide) {
+					ftotal++;
+					flocal++;
+
+					if (collide && flocal < 3)
+						/* retry locally a few times */
+						retry_bucket = 1;
+					else if (flocal < in->size + orig_tries)
+						/* exhaustive bucket search */
+						retry_bucket = 1;
+					else if (ftotal < 20)
+						/* then retry descent */
+						retry_descent = 1;
+					else
+						/* else give up */
+						skip_rep = 1;
+					dprintk("  reject %d  collide %d  "
+						"ftotal %d  flocal %d\n",
+						reject, collide, ftotal,
+						flocal);
+				}
+			} while (retry_bucket);
+		} while (retry_descent);
+
+		if (skip_rep) {
+			dprintk("skip rep\n");
+			continue;
+		}
+
+		dprintk("CHOOSE got %d\n", item);
+		out[outpos] = item;
+		outpos++;
+	}
+
+	dprintk("CHOOSE returns %d\n", outpos);
+	return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ * @weight: per-device weight vector, consulted through is_out()
+ *
+ * Returns the number of entries written to @result, or -1 if @force
+ * names a device that is out of range or has no parent.
+ *
+ * NOTE(review): map->rules[@ruleno] is dereferenced without a NULL
+ * check, and nothing here bounds the writes into force_context[] or
+ * the CRUSH_MAX_SET-sized work arrays -- presumably callers and the
+ * map decoder guarantee this; confirm.
+ */
+int crush_do_rule(struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  int force, __u32 *weight)
+{
+	int result_len;
+	int force_context[CRUSH_MAX_DEPTH];
+	int force_pos = -1;
+	int a[CRUSH_MAX_SET];
+	int b[CRUSH_MAX_SET];
+	int c[CRUSH_MAX_SET];
+	int recurse_to_leaf;
+	int *w;		/* working set: input of the current step */
+	int wsize = 0;
+	int *o;		/* output set of the current step */
+	int osize;
+	int *tmp;
+	struct crush_rule *rule;
+	int step;
+	int i, j;
+	int numrep;
+	int firstn;
+	int rc = -1;
+
+	BUG_ON(ruleno >= map->max_rules);
+
+	rule = map->rules[ruleno];
+	result_len = 0;
+	w = a;
+	o = b;
+
+	/*
+	 * determine hierarchical context of force, if any.  note
+	 * that this may or may not correspond to the specific types
+	 * referenced by the crush rule.
+	 */
+	if (force >= 0) {
+		if (force >= map->max_devices ||
+		    map->device_parents[force] == 0) {
+			/*dprintk("CRUSH: forcefed device dne\n");*/
+			rc = -1;  /* force fed device dne */
+			goto out;
+		}
+		/* a forced device that is out is silently ignored */
+		if (!is_out(map, weight, force, x)) {
+			/* record the device and each ancestor until a parent of 0 */
+			while (1) {
+				force_context[++force_pos] = force;
+				if (force >= 0)
+					force = map->device_parents[force];
+				else
+					force = map->bucket_parents[-1-force];
+				if (force == 0)
+					break;
+			}
+		}
+	}
+
+	for (step = 0; step < rule->len; step++) {
+		firstn = 0;
+		switch (rule->steps[step].op) {
+		case CRUSH_RULE_TAKE:
+			/* the working set becomes the single item named by arg1 */
+			w[0] = rule->steps[step].arg1;
+			if (force_pos >= 0) {
+				BUG_ON(force_context[force_pos] != w[0]);
+				force_pos--;
+			}
+			wsize = 1;
+			break;
+
+		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+		case CRUSH_RULE_CHOOSE_FIRSTN:
+			firstn = 1;
+			/* fall through */
+		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+		case CRUSH_RULE_CHOOSE_INDEP:
+			BUG_ON(wsize == 0);
+
+			recurse_to_leaf =
+				rule->steps[step].op ==
+				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+				rule->steps[step].op ==
+				CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+			/* reset output */
+			osize = 0;
+
+			for (i = 0; i < wsize; i++) {
+				/*
+				 * see CRUSH_N, CRUSH_N_MINUS macros.
+				 * basically, numrep <= 0 means relative to
+				 * the provided result_max
+				 */
+				numrep = rule->steps[step].arg1;
+				if (numrep <= 0) {
+					numrep += result_max;
+					if (numrep <= 0)
+						continue;
+				}
+				j = 0;
+				if (osize == 0 && force_pos >= 0) {
+					/* skip any intermediate types */
+					while (force_pos &&
+					       force_context[force_pos] < 0 &&
+					       rule->steps[step].arg2 !=
+					       map->buckets[-1 -
+					       force_context[force_pos]]->type)
+						force_pos--;
+					/* seed slot 0 with the forced item; j makes crush_choose start at 1 */
+					o[osize] = force_context[force_pos];
+					if (recurse_to_leaf)
+						c[osize] = force_context[0];
+					j++;
+					force_pos--;
+				}
+				osize += crush_choose(map,
+						      map->buckets[-1-w[i]],
+						      weight,
+						      x, numrep,
+						      rule->steps[step].arg2,
+						      o+osize, j,
+						      firstn,
+						      recurse_to_leaf, c+osize);
+			}
+
+			if (recurse_to_leaf)
+				/* copy final _leaf_ values to output set */
+				memcpy(o, c, osize*sizeof(*o));
+
+			/* swap o and w arrays */
+			tmp = o;
+			o = w;
+			w = tmp;
+			wsize = osize;
+			break;
+
+
+		case CRUSH_RULE_EMIT:
+			/* append the working set to the result, up to result_max */
+			for (i = 0; i < wsize && result_len < result_max; i++) {
+				result[result_len] = w[i];
+				result_len++;
+			}
+			wsize = 0;
+			break;
+
+		default:
+			BUG_ON(1);
+		}
+	}
+	rc = result_len;
+
+out:
+	return rc;
+}
+
+
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 00000000..5a8009c9
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,485 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+#include <linux/key-type.h>
+
+#include <keys/ceph-type.h>
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+/*
+ * Make @dst a deep copy of @src: every field is copied and the key
+ * bytes are duplicated into a fresh GFP_NOFS allocation.
+ *
+ * Returns 0 on success or -ENOMEM, in which case dst->key is NULL and
+ * the other fields of @dst have already been overwritten.
+ */
+int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
+			  const struct ceph_crypto_key *src)
+{
+	void *copy;
+
+	memcpy(dst, src, sizeof(*dst));
+
+	copy = kmalloc(src->len, GFP_NOFS);
+	dst->key = copy;
+	if (copy == NULL)
+		return -ENOMEM;
+
+	memcpy(copy, src->key, src->len);
+	return 0;
+}
+
+/*
+ * Serialize @key at *@p as: 16-bit type, creation stamp, 16-bit length,
+ * key bytes.  *@p is advanced past the data written.
+ *
+ * Returns 0, or -ERANGE without writing anything when the encoding
+ * would run past @end.
+ */
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	void *tail = *p + sizeof(u16) + sizeof(key->created) +
+		     sizeof(u16) + key->len;
+
+	if (tail > end)
+		return -ERANGE;
+
+	ceph_encode_16(p, key->type);
+	ceph_encode_copy(p, &key->created, sizeof(key->created));
+	ceph_encode_16(p, key->len);
+	ceph_encode_copy(p, key->key, key->len);
+	return 0;
+}
+
+/*
+ * Parse a key from *@p in the layout written by ceph_crypto_key_encode():
+ * 16-bit type, creation stamp, 16-bit length, key bytes.  The key bytes
+ * are copied into a new GFP_NOFS allocation stored in key->key.
+ *
+ * Returns 0 on success, -ENOMEM if that allocation fails, or -EINVAL if
+ * the buffer ends before the header or the key bytes.
+ */
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	/* fixed-size part: type, created, len */
+	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+	key->type = ceph_decode_16(p);
+	ceph_decode_copy(p, &key->created, sizeof(key->created));
+	key->len = ceph_decode_16(p);
+	/* make sure the announced key bytes are present before allocating */
+	ceph_decode_need(p, end, key->len, bad);
+	key->key = kmalloc(key->len, GFP_NOFS);
+	if (!key->key)
+		return -ENOMEM;
+	ceph_decode_copy(p, key->key, key->len);
+	return 0;
+
+bad:
+	dout("failed to decode crypto key\n");
+	return -EINVAL;
+}
+
+/*
+ * Decode the armored, NUL-terminated string @inkey into @key.
+ *
+ * The text is unarmored into a temporary buffer of strlen * 3 / 4 bytes,
+ * which is then parsed with ceph_crypto_key_decode().  Returns 0 on
+ * success, -ENOMEM if the temporary buffer cannot be allocated, or the
+ * error reported by ceph_unarmor() or ceph_crypto_key_decode().
+ */
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+	int inlen = strlen(inkey);
+	void *buf, *cursor;
+	int declen;
+	int ret;
+
+	dout("crypto_key_unarmor %s\n", inkey);
+	buf = kmalloc(inlen * 3 / 4, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	declen = ceph_unarmor(buf, inkey, inkey+inlen);
+	if (declen < 0) {
+		ret = declen;
+		goto out;
+	}
+
+	cursor = buf;
+	ret = ceph_crypto_key_decode(key, &cursor, cursor + declen);
+	if (!ret)
+		dout("crypto_key_unarmor key %p type %d len %d\n", key,
+		     key->type, key->len);
+out:
+	kfree(buf);
+	return ret;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+/*
+ * Allocate a "cbc(aes)" block cipher handle.  The result is checked with
+ * IS_ERR() and released with crypto_free_blkcipher() by the callers.
+ */
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+/*
+ * Encrypt @src_len bytes at @src with AES-CBC into @dst.
+ *
+ * The plaintext is followed by 1..16 padding bytes, each holding the pad
+ * length, so the output is always a whole number of 16-byte blocks and a
+ * full extra block is added when @src_len is already block aligned.
+ * The IV is taken from aes_iv.
+ *
+ * @key, @key_len: AES key material
+ * @dst:     output buffer; must have room for @src_len plus the padding
+ * @dst_len: set to the ciphertext length; the value passed in is not read
+ * @src, @src_len: plaintext
+ *
+ * Returns 0 on success or a negative errno when the cipher cannot be
+ * allocated, the key is rejected, or the encryption itself fails.
+ */
+static int ceph_aes_encrypt(const void *key, int key_len,
+			    void *dst, size_t *dst_len,
+			    const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[2], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - (src_len & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	/* despite the name, every pad byte carries the pad length */
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src_len + zero_padding;
+
+	/* never encrypt with a key the cipher refused */
+	ret = crypto_blkcipher_setkey(tfm, key, key_len);
+	if (ret < 0) {
+		pr_err("ceph_aes_crypt setkey failed %d\n", ret);
+		goto out_tfm;
+	}
+
+	sg_init_table(sg_in, 2);
+	sg_set_buf(&sg_in[0], src, src_len);
+	sg_set_buf(&sg_in[1], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src_len + zero_padding);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt failed %d\n", ret);
+
+out_tfm:
+	crypto_free_blkcipher(tfm);
+	/* report the failure instead of claiming success */
+	return ret;
+}
+
+/*
+ * Encrypt the concatenation of two source buffers with AES-CBC into
+ * @dst.
+ *
+ * @src1 and @src2 are treated as one plaintext, followed by 1..16
+ * padding bytes that each hold the pad length, so the output is always a
+ * whole number of 16-byte blocks.  The IV is taken from aes_iv.
+ *
+ * @key, @key_len: AES key material
+ * @dst:     output buffer; must have room for both sources plus padding
+ * @dst_len: set to the ciphertext length; the value passed in is not read
+ * @src1, @src1_len: first part of the plaintext
+ * @src2, @src2_len: second part of the plaintext
+ *
+ * Returns 0 on success or a negative errno when the cipher cannot be
+ * allocated, the key is rejected, or the encryption itself fails.
+ */
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
+			     size_t *dst_len,
+			     const void *src1, size_t src1_len,
+			     const void *src2, size_t src2_len)
+{
+	struct scatterlist sg_in[3], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	/* despite the name, every pad byte carries the pad length */
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src1_len + src2_len + zero_padding;
+
+	/* never encrypt with a key the cipher refused */
+	ret = crypto_blkcipher_setkey(tfm, key, key_len);
+	if (ret < 0) {
+		pr_err("ceph_aes_crypt2 setkey failed %d\n", ret);
+		goto out_tfm;
+	}
+
+	sg_init_table(sg_in, 3);
+	sg_set_buf(&sg_in[0], src1, src1_len);
+	sg_set_buf(&sg_in[1], src2, src2_len);
+	sg_set_buf(&sg_in[2], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src1_len + src2_len + zero_padding);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt2 failed %d\n", ret);
+
+out_tfm:
+	crypto_free_blkcipher(tfm);
+	/* report the failure instead of claiming success */
+	return ret;
+}
+
+/*
+ * Decrypt @src_len bytes of AES-CBC ciphertext at @src into @dst and
+ * strip the trailing padding (1..16 bytes, each holding the pad length).
+ *
+ * The cipher writes into @dst followed by a 16-byte scratch block, so up
+ * to *@dst_len + 16 bytes of ciphertext can be handled; the scratch
+ * block exists to absorb the padding.
+ *
+ * @key, @key_len: AES key material
+ * @dst:     output buffer
+ * @dst_len: in: capacity of @dst; out: plaintext length without padding
+ * @src, @src_len: ciphertext
+ *
+ * Returns 0 on success; -ERANGE if the input is empty, too large for
+ * @dst plus the scratch block, or yields more plaintext than @dst can
+ * hold; -EPERM on bad padding; or a negative errno from the cipher.
+ */
+static int ceph_aes_decrypt(const void *key, int key_len,
+			    void *dst, size_t *dst_len,
+			    const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[2];
+	struct crypto_blkcipher *tfm;
+	struct blkcipher_desc desc = { .flags = 0 };
+	size_t dst_cap = *dst_len;
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	/*
+	 * An empty input has no padding byte to look at, and anything
+	 * longer than dst + pad would overrun the output scatterlist.
+	 */
+	if (src_len == 0 || src_len > dst_cap + sizeof(pad))
+		return -ERANGE;
+
+	tfm = ceph_crypto_alloc_cipher();
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+	desc.tfm = tfm;
+
+	/* never decrypt with a key the cipher refused */
+	ret = crypto_blkcipher_setkey(tfm, key, key_len);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt setkey failed %d\n", ret);
+		crypto_free_blkcipher(tfm);
+		return ret;
+	}
+
+	sg_init_table(sg_in, 1);
+	sg_init_table(sg_out, 2);
+	sg_set_buf(sg_in, src, src_len);
+	sg_set_buf(&sg_out[0], dst, dst_cap);
+	sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	/* the final plaintext byte sits in dst or in the scratch block */
+	if (src_len <= dst_cap)
+		last_byte = ((unsigned char *)dst)[src_len - 1];
+	else
+		last_byte = (unsigned char)pad[src_len - dst_cap - 1];
+
+	/* valid padding is 1..16 and never longer than the data */
+	if (last_byte < 1 || last_byte > 16 || src_len < (size_t)last_byte) {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+
+	/* plaintext that spilled into the scratch block is lost to the caller */
+	if (src_len - last_byte > dst_cap)
+		return -ERANGE;
+
+	*dst_len = src_len - last_byte;
+	return 0;
+}
+
+/*
+ * Decrypt @src_len bytes of AES-CBC ciphertext at @src, splitting the
+ * plaintext across two destination buffers, and strip the trailing
+ * padding (1..16 bytes, each holding the pad length).
+ *
+ * The cipher writes into @dst1, then @dst2, then a 16-byte scratch
+ * block that exists to absorb the padding.
+ *
+ * @key, @key_len: AES key material
+ * @dst1, @dst2: output buffers, filled in that order
+ * @dst1_len, @dst2_len: in: capacity of each buffer; out: number of
+ *           plaintext bytes stored in each (padding excluded)
+ * @src, @src_len: ciphertext
+ *
+ * Returns 0 on success; -ERANGE if the input is empty, too large for
+ * both buffers plus the scratch block, or yields more plaintext than
+ * the two buffers can hold; -EPERM on bad padding; or a negative errno
+ * from the cipher.
+ */
+static int ceph_aes_decrypt2(const void *key, int key_len,
+			     void *dst1, size_t *dst1_len,
+			     void *dst2, size_t *dst2_len,
+			     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[3];
+	struct crypto_blkcipher *tfm;
+	struct blkcipher_desc desc = { .flags = 0 };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	/*
+	 * An empty input has no padding byte to look at, and anything
+	 * longer than dst1 + dst2 + pad would overrun the output
+	 * scatterlist.
+	 */
+	if (src_len == 0 || src_len > *dst1_len + *dst2_len + sizeof(pad))
+		return -ERANGE;
+
+	tfm = ceph_crypto_alloc_cipher();
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+	desc.tfm = tfm;
+
+	/* never decrypt with a key the cipher refused */
+	ret = crypto_blkcipher_setkey(tfm, key, key_len);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt2 setkey failed %d\n", ret);
+		crypto_free_blkcipher(tfm);
+		return ret;
+	}
+
+	sg_init_table(sg_in, 1);
+	sg_set_buf(sg_in, src, src_len);
+	sg_init_table(sg_out, 3);
+	sg_set_buf(&sg_out[0], dst1, *dst1_len);
+	sg_set_buf(&sg_out[1], dst2, *dst2_len);
+	sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt2 failed %d\n", ret);
+		return ret;
+	}
+
+	/* the final plaintext byte sits in dst1, dst2 or the scratch block */
+	if (src_len <= *dst1_len)
+		last_byte = ((unsigned char *)dst1)[src_len - 1];
+	else if (src_len <= *dst1_len + *dst2_len)
+		last_byte = ((unsigned char *)dst2)[src_len - *dst1_len - 1];
+	else
+		last_byte = (unsigned char)
+			pad[src_len - *dst1_len - *dst2_len - 1];
+
+	/* valid padding is 1..16 and never longer than the data */
+	if (last_byte < 1 || last_byte > 16 || src_len < (size_t)last_byte) {
+		pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+	src_len -= last_byte;
+
+	/* plaintext that spilled into the scratch block is lost to the caller */
+	if (src_len > *dst1_len + *dst2_len)
+		return -ERANGE;
+
+	if (src_len < *dst1_len) {
+		*dst1_len = src_len;
+		*dst2_len = 0;
+	} else {
+		*dst2_len = src_len - *dst1_len;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Decrypt @src into @dst according to the type of @secret.
+ *
+ * CEPH_CRYPTO_NONE is a plain copy (-ERANGE if *@dst_len is too small),
+ * CEPH_CRYPTO_AES goes through ceph_aes_decrypt(), and any other type
+ * gives -EINVAL.  On success *@dst_len holds the output length.
+ */
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	if (secret->type == CEPH_CRYPTO_AES)
+		return ceph_aes_decrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	if (secret->type != CEPH_CRYPTO_NONE)
+		return -EINVAL;
+
+	/* no cipher: copy through */
+	if (*dst_len < src_len)
+		return -ERANGE;
+	memcpy(dst, src, src_len);
+	*dst_len = src_len;
+	return 0;
+}
+
+/*
+ * Decrypt @src into two destination buffers according to the type of
+ * @secret.
+ *
+ * CEPH_CRYPTO_NONE copies as much as fits into @dst1 and the remainder
+ * into @dst2 (-ERANGE if the two together are too small); *@dst2_len is
+ * only updated when something is left over for @dst2.  CEPH_CRYPTO_AES
+ * goes through ceph_aes_decrypt2(), and any other type gives -EINVAL.
+ */
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len)
+{
+	size_t chunk;
+
+	if (secret->type == CEPH_CRYPTO_AES)
+		return ceph_aes_decrypt2(secret->key, secret->len,
+					 dst1, dst1_len, dst2, dst2_len,
+					 src, src_len);
+
+	if (secret->type != CEPH_CRYPTO_NONE)
+		return -EINVAL;
+
+	/* no cipher: split the copy across the two buffers */
+	if (*dst1_len + *dst2_len < src_len)
+		return -ERANGE;
+
+	chunk = min(*dst1_len, src_len);
+	memcpy(dst1, src, chunk);
+	*dst1_len = chunk;
+	src += chunk;
+	src_len -= chunk;
+
+	if (!src_len)
+		return 0;
+
+	chunk = min(*dst2_len, src_len);
+	memcpy(dst2, src, chunk);
+	*dst2_len = chunk;
+	return 0;
+}
+
+/*
+ * Encrypt @src into @dst according to the type of @secret.
+ *
+ * CEPH_CRYPTO_NONE is a plain copy (-ERANGE if *@dst_len is too small),
+ * CEPH_CRYPTO_AES goes through ceph_aes_encrypt(), and any other type
+ * gives -EINVAL.  On success *@dst_len holds the output length.
+ */
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	if (secret->type == CEPH_CRYPTO_AES)
+		return ceph_aes_encrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	if (secret->type != CEPH_CRYPTO_NONE)
+		return -EINVAL;
+
+	/* no cipher: copy through */
+	if (*dst_len < src_len)
+		return -ERANGE;
+	memcpy(dst, src, src_len);
+	*dst_len = src_len;
+	return 0;
+}
+
+/*
+ * Encrypt the concatenation of @src1 and @src2 into @dst according to
+ * the type of @secret.
+ *
+ * CEPH_CRYPTO_NONE copies both sources back to back (-ERANGE if
+ * *@dst_len is too small), CEPH_CRYPTO_AES goes through
+ * ceph_aes_encrypt2(), and any other type gives -EINVAL.
+ */
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		  const void *src1, size_t src1_len,
+		  const void *src2, size_t src2_len)
+{
+	if (secret->type == CEPH_CRYPTO_AES)
+		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+					 src1, src1_len, src2, src2_len);
+
+	if (secret->type != CEPH_CRYPTO_NONE)
+		return -EINVAL;
+
+	/* no cipher: concatenate the two sources */
+	if (*dst_len < src1_len + src2_len)
+		return -ERANGE;
+	memcpy(dst, src1, src1_len);
+	memcpy(dst + src1_len, src2, src2_len);
+	*dst_len = src1_len + src2_len;
+	return 0;
+}
+
+/*
+ * Instantiate a "ceph" key from @datalen bytes of encoded key at @data.
+ *
+ * The payload must be 1..32767 bytes.  It is decoded into a freshly
+ * allocated ceph_crypto_key, which becomes key->payload.data.
+ *
+ * Returns 0 on success, -EINVAL for a missing or badly sized payload,
+ * -ENOMEM if the key structure cannot be allocated, or the error from
+ * key_payload_reserve() or ceph_crypto_key_decode().
+ */
+int ceph_key_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	struct ceph_crypto_key *ckey;
+	void *cursor;
+	int ret;
+
+	if (!data || datalen <= 0 || datalen > 32767)
+		return -EINVAL;
+
+	ret = key_payload_reserve(key, datalen);
+	if (ret < 0)
+		return ret;
+
+	ckey = kmalloc(sizeof(*ckey), GFP_KERNEL);
+	if (!ckey)
+		return -ENOMEM;
+
+	/* TODO ceph_crypto_key_decode should really take const input */
+	cursor = (void*)data;
+	ret = ceph_crypto_key_decode(ckey, &cursor, (char*)data+datalen);
+	if (ret < 0) {
+		kfree(ckey);
+		return ret;
+	}
+
+	key->payload.data = ckey;
+	return 0;
+}
+
+/* match callback: non-zero when the key's description equals @description */
+int ceph_key_match(const struct key *key, const void *description)
+{
+	return strcmp(key->description, description) == 0;
+}
+
+/*
+ * destroy callback: release everything ceph_key_instantiate() attached
+ * to the key.
+ *
+ * The payload is a kmalloc'ed ceph_crypto_key whose key bytes are a
+ * second allocation, so both have to be freed.  A key whose payload was
+ * never set (NULL) is left alone.
+ */
+void ceph_key_destroy(struct key *key) {
+	struct ceph_crypto_key *ckey = key->payload.data;
+
+	if (!ckey)
+		return;
+
+	ceph_crypto_key_destroy(ckey);	/* frees ckey->key only */
+	kfree(ckey);
+}
+
+/*
+ * The "ceph" key type: ties the instantiate, match and destroy callbacks
+ * in this file to the name "ceph".  Registered by ceph_crypto_init() and
+ * unregistered by ceph_crypto_shutdown().
+ */
+struct key_type key_type_ceph = {
+	.name		= "ceph",
+	.instantiate	= ceph_key_instantiate,
+	.match		= ceph_key_match,
+	.destroy	= ceph_key_destroy,
+};
+
+/* register the "ceph" key type; returns register_key_type()'s result */
+int ceph_crypto_init(void) {
+	return register_key_type(&key_type_ceph);
+}
+
+/* undo ceph_crypto_init(): unregister the "ceph" key type */
+void ceph_crypto_shutdown(void) {
+	unregister_key_type(&key_type_ceph);
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 00000000..1919d155
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,52 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * cryptographic secret
+ *
+ * @key points to @len bytes in a separately allocated buffer, which
+ * ceph_crypto_key_destroy() frees.
+ */
+struct ceph_crypto_key {
+	int type;			/* encoded as a 16-bit value */
+	struct ceph_timespec created;	/* encoded and decoded verbatim */
+	int len;			/* number of bytes at @key */
+	void *key;			/* kmalloc'ed key bytes */
+};
+
+/*
+ * Free the key bytes owned by @key.  The structure itself is not freed;
+ * that is left to whoever allocated it.  A NULL @key is accepted and
+ * ignored so that callers holding an unset key pointer do not crash.
+ */
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+	if (key)
+		kfree(key->key);
+}
+
+extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
+				 const struct ceph_crypto_key *src);
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+			 void *dst, size_t *dst_len,
+			 const void *src1, size_t src1_len,
+			 const void *src2, size_t src2_len);
+extern int ceph_crypto_init(void);
+extern void ceph_crypto_shutdown(void);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const char *src, const char *end);
+extern int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 00000000..27d4ea31
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+/*
+ * debugfs "monmap" file: print the monmap epoch, then one line per
+ * monitor with its entity name and address.  Emits nothing when the
+ * client has no monmap.
+ */
+static int monmap_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	int mon;
+
+	if (!client->monc.monmap)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+
+	mon = 0;
+	while (mon < client->monc.monmap->num_mon) {
+		struct ceph_entity_inst *inst;
+
+		inst = &client->monc.monmap->mon_inst[mon];
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   ceph_pr_addr(&inst->addr.in_addr));
+		mon++;
+	}
+	return 0;
+}
+
+/*
+ * debugfs "osdmap" file: print the osdmap epoch and flags, one line per
+ * pg pool, then one line per osd slot (address, weight, state).
+ * Emits nothing when the client has no osdmap.
+ */
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+	struct rb_node *n;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pool =
+			rb_entry(n, struct ceph_pg_pool_info, node);
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];	/* scratch buffer for the state string */
+
+		/* weight is scaled so that 1 << 16 prints as 100% */
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, ceph_pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+/*
+ * debugfs "monc" file: report which maps the mon client has or wants,
+ * followed by one line per entry in its generic request tree.
+ * The whole dump runs under monc->mutex.
+ */
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		/* statfs is the only request type reported by name */
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * debugfs "osdc" file: one line per osd request in osdc->requests,
+ * showing tid, target osd (-1 if none), pgid as pool.ps, object name,
+ * the reassert version if set, and the name of every op.
+ * The walk runs under osdc->request_mutex.
+ */
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
+
+		/* the request head sits at the start of the front buffer */
+		head = req->r_request->front.iov_base;
+		/* the op array starts right after the head */
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		/* the object name is read from just past the last op */
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+/*
+ * Create the top-level "ceph" debugfs directory.
+ * Returns 0 on success, -ENOMEM if the directory could not be created.
+ */
+int ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	return ceph_debugfs_dir ? 0 : -ENOMEM;
+}
+
+/* Remove the top-level directory created by ceph_debugfs_init(). */
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+/*
+ * Create the per-client debugfs directory, named
+ * "<fsid>.client<global_id>", under the top-level ceph directory, and
+ * populate it with the "monc", "osdc", "monmap" and "osdmap" files
+ * (all mode 0600, all with @client as private data).
+ *
+ * Returns 0 on success.  If any entry cannot be created, everything
+ * created so far is removed via ceph_debugfs_client_cleanup() and
+ * -ENOMEM is returned.
+ */
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = -ENOMEM;
+	char name[80];
+
+	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+		 client->monc.auth->global_id);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	return 0;
+
+out:
+	/* assumes entries not yet created are still NULL -- TODO confirm */
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+/*
+ * Remove the per-client debugfs files, then the directory itself.
+ *
+ * Each pointer is cleared once its entry is gone, which makes the
+ * function idempotent: ceph_debugfs_client_init() calls it on its
+ * failure path, and a later cleanup call must not hand the same
+ * (already removed) dentries to debugfs_remove() a second time.
+ * debugfs_remove() ignores NULL, so entries that were never created or
+ * were already removed are skipped.
+ */
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_osdmap);
+	client->debugfs_osdmap = NULL;
+	debugfs_remove(client->debugfs_monmap);
+	client->debugfs_monmap = NULL;
+	debugfs_remove(client->osdc.debugfs_file);
+	client->osdc.debugfs_file = NULL;
+	debugfs_remove(client->monc.debugfs_file);
+	client->monc.debugfs_file = NULL;
+	debugfs_remove(client->debugfs_dir);
+	client->debugfs_dir = NULL;
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+/*
+ * No-op stand-ins used when CONFIG_DEBUG_FS is not set; the init
+ * functions simply report success.
+ */
+int ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 00000000..78b55f49
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2489 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system.  The messenger provides ordered and reliable
+ * delivery.  We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error).  Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+/*
+ * Format @ss as "a.b.c.d:port" (AF_INET) or "[v6addr]:port" (AF_INET6)
+ * and return the resulting string.
+ *
+ * The string lives in one of MAX_ADDR_STR static buffers handed out
+ * round-robin, so it is only valid until that slot comes around again.
+ * addr_str_lock protects the slot counter only, not the buffer
+ * contents.
+ */
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+{
+	int i;
+	char *s;
+	struct sockaddr_in *in4 = (void *)ss;
+	struct sockaddr_in6 *in6 = (void *)ss;
+
+	/* claim the next slot, wrapping at MAX_ADDR_STR */
+	spin_lock(&addr_str_lock);
+	i = last_addr_str++;
+	if (last_addr_str == MAX_ADDR_STR)
+		last_addr_str = 0;
+	spin_unlock(&addr_str_lock);
+	s = addr_str[i];
+
+	switch (ss->ss_family) {
+	case AF_INET:
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+			 (unsigned int)ntohs(in4->sin_port));
+		break;
+
+	case AF_INET6:
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+			 (unsigned int)ntohs(in6->sin6_port));
+		break;
+
+	default:
+		snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)",
+			 (int)ss->ss_family);
+	}
+
+	return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+/*
+ * Refresh msgr->my_enc_addr: copy our instance address into it and run
+ * ceph_encode_addr() on the copy.  prepare_write_banner() sends this
+ * copy to the peer.
+ */
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+	ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+/*
+ * Create the messenger workqueue.
+ * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
+ */
+int ceph_msgr_init(void)
+{
+	ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
+	if (ceph_msgr_wq)
+		return 0;
+
+	pr_err("msgr_init failed to create workqueue\n");
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(ceph_msgr_init);
+
+/* Destroy the workqueue created by ceph_msgr_init(). */
+void ceph_msgr_exit(void)
+{
+	destroy_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_exit);
+
+/* Flush the messenger workqueue (flush_workqueue() on ceph_msgr_wq). */
+void ceph_msgr_flush(void)
+{
+	flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+
+/*
+ * socket callback functions
+ */
+
+/*
+ * data available on socket, or listen socket received a connect
+ *
+ * Installed as sk_data_ready by set_sock_callbacks().  Queues work for
+ * the connection unless the socket is in TCP_CLOSE_WAIT.
+ */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+	if (sk->sk_state != TCP_CLOSE_WAIT) {
+		dout("ceph_data_ready on %p state = %lu, queueing work\n",
+		     con, con->state);
+		queue_con(con);
+	}
+}
+
+/*
+ * socket has buffer space for writing
+ *
+ * Installed as sk_write_space by set_sock_callbacks().
+ */
+static void ceph_write_space(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	/* only queue to workqueue if there is data we want to write. */
+	if (test_bit(WRITE_PENDING, &con->state)) {
+		dout("ceph_write_space %p queueing write work\n", con);
+		queue_con(con);
+	} else {
+		dout("ceph_write_space %p nothing to write\n", con);
+	}
+
+	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
+	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/*
+ * socket's state has changed
+ *
+ * Installed as sk_state_change by set_sock_callbacks().  Ignored once
+ * the connection is CLOSED.  On TCP_CLOSE/TCP_CLOSE_WAIT the first
+ * caller to set SOCK_CLOSED records an error message and queues work;
+ * on TCP_ESTABLISHED work is queued unconditionally.
+ */
+static void ceph_state_change(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	     con, con->state, sk->sk_state);
+
+	if (test_bit(CLOSED, &con->state))
+		return;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		dout("ceph_state_change TCP_CLOSE\n");
+		/* fall through: handled exactly like TCP_CLOSE_WAIT */
+	case TCP_CLOSE_WAIT:
+		dout("ceph_state_change TCP_CLOSE_WAIT\n");
+		/* only the caller that flips SOCK_CLOSED reports the error */
+		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+			if (test_bit(CONNECTING, &con->state))
+				con->error_msg = "connection failed";
+			else
+				con->error_msg = "socket closed";
+			queue_con(con);
+		}
+		break;
+	case TCP_ESTABLISHED:
+		dout("ceph_state_change TCP_ESTABLISHED\n");
+		queue_con(con);
+		break;
+	}
+}
+
+/*
+ * set up socket callbacks
+ *
+ * Stores @con in sk_user_data (the callbacks read it back from there)
+ * and points the data_ready, write_space and state_change hooks at the
+ * ceph handlers.
+ */
+static void set_sock_callbacks(struct socket *sock,
+			       struct ceph_connection *con)
+{
+	struct sock *sk = sock->sk;
+	sk->sk_user_data = (void *)con;
+	sk->sk_data_ready = ceph_data_ready;
+	sk->sk_write_space = ceph_write_space;
+	sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ *
+ * Creates a TCP socket for con->peer_addr, stores it in con->sock,
+ * installs the ceph callbacks and starts a non-blocking connect.
+ * -EINPROGRESS from connect is treated as success.
+ *
+ * Returns the socket, or an ERR_PTR() on failure.  If connect fails the
+ * socket is released, con->sock is reset to NULL and con->error_msg is
+ * set.  con->sock must be NULL on entry (BUG otherwise).
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+	struct socket *sock;
+	int ret;
+
+	BUG_ON(con->sock);
+	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+			       IPPROTO_TCP, &sock);
+	if (ret)
+		return ERR_PTR(ret);
+	con->sock = sock;
+	sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+	set_sock_callbacks(sock, con);
+
+	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+
+	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+				 O_NONBLOCK);
+	if (ret == -EINPROGRESS) {
+		/* expected for a non-blocking connect; not an error */
+		dout("connect %s EINPROGRESS sk_state = %u\n",
+		     ceph_pr_addr(&con->peer_addr.in_addr),
+		     sock->sk->sk_state);
+		ret = 0;
+	}
+	if (ret < 0) {
+		pr_err("connect %s error %d\n",
+		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
+		sock_release(sock);
+		con->sock = NULL;
+		con->error_msg = "connect error";
+	}
+
+	if (ret < 0)
+		return ERR_PTR(ret);
+	return sock;
+}
+
+/*
+ * Non-blocking read of up to @len bytes from @sock into @buf.
+ * Returns kernel_recvmsg()'s result, except that -EAGAIN is reported
+ * as 0.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	struct kvec iov = { .iov_base = buf, .iov_len = len };
+	int nread;
+
+	nread = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+	return (nread == -EAGAIN) ? 0 : nread;
+}
+
+/*
+ * Non-blocking write of @len bytes described by the @kvlen entries of
+ * @iov.  @more is true if the caller will be sending more data shortly.
+ * Returns kernel_sendmsg()'s result, except that -EAGAIN is reported
+ * as 0.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+		     size_t kvlen, size_t len, int more)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int sent;
+
+	/* MSG_EOR is superfluous, but what the hell */
+	msg.msg_flags |= more ? MSG_MORE : MSG_EOR;
+
+	sent = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+	return (sent == -EAGAIN) ? 0 : sent;
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ *
+ * Returns 0 if there was no socket, otherwise the result of the
+ * socket's shutdown op.  con->sock is NULL afterwards.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+	int rc;
+
+	dout("con_close_socket on %p sock %p\n", con, con->sock);
+	if (!con->sock)
+		return 0;
+	/*
+	 * While SOCK_CLOSED is set, ceph_state_change() will not record
+	 * an error or queue work for the close we are causing here.
+	 */
+	set_bit(SOCK_CLOSED, &con->state);
+	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+	sock_release(con->sock);
+	con->sock = NULL;
+	clear_bit(SOCK_CLOSED, &con->state);
+	return rc;
+}
+
+/*
+ * Reset a connection.  Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+
+/* Unlink @msg from the list it is on and drop one reference to it. */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+	list_del_init(&msg->list_head);
+	ceph_msg_put(msg);
+}
+/* Empty @head, unlinking each message and dropping a reference to it. */
+static void ceph_msg_remove_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+							list_head);
+		ceph_msg_remove(msg);
+	}
+}
+
+/*
+ * Drop every queued and sent-but-unacked outgoing message, release the
+ * in-progress incoming and outgoing messages (if any), and zero
+ * connect_seq, out_seq, in_seq and in_seq_acked.
+ */
+static void reset_connection(struct ceph_connection *con)
+{
+	/* reset connection, out_queue, msg_ and connect_seq */
+	/* discard existing out_queue and msg_seq */
+	ceph_msg_remove_list(&con->out_queue);
+	ceph_msg_remove_list(&con->out_sent);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	con->connect_seq = 0;
+	con->out_seq = 0;
+	if (con->out_msg) {
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
+	con->in_seq = 0;
+	con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down.  drop any open connections.
+ *
+ * Sets CLOSED and clears the STANDBY, LOSSYTX, KEEPALIVE_PENDING and
+ * WRITE_PENDING bits, then, under con->mutex, resets the connection
+ * state and cancels pending delayed work.  Work is queued again at the
+ * end -- presumably so con_work() sees CLOSED and finishes the close;
+ * confirm in con_work().
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+	dout("con_close %p peer %s\n", con,
+	     ceph_pr_addr(&con->peer_addr.in_addr));
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
+	clear_bit(KEEPALIVE_PENDING, &con->state);
+	clear_bit(WRITE_PENDING, &con->state);
+	mutex_lock(&con->mutex);
+	reset_connection(con);
+	con->peer_global_seq = 0;
+	cancel_delayed_work(&con->work);
+	mutex_unlock(&con->mutex);
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ *
+ * Sets OPENING, clears CLOSED, copies @addr into con->peer_addr, resets
+ * the reconnect backoff and queues work for the connection.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+	set_bit(OPENING, &con->state);
+	clear_bit(CLOSED, &con->state);
+	memcpy(&con->peer_addr, addr, sizeof(*addr));
+	con->delay = 0;      /* reset backoff memory */
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ *
+ * Judged by connect_seq being non-zero; note that reset_connection()
+ * sets connect_seq back to 0.
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ */
+/*
+ * Take a reference on @con.  Returns @con, or NULL if the reference
+ * count had already dropped to zero.
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+	dout("con_get %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+	return atomic_inc_not_zero(&con->nref) ? con : NULL;
+}
+
+/*
+ * Drop a reference on @con.  When the last reference goes away the
+ * connection is released with kfree(); its socket must already be
+ * closed by then (BUG otherwise).
+ */
+void ceph_con_put(struct ceph_connection *con)
+{
+	dout("con_put %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+	BUG_ON(atomic_read(&con->nref) == 0);
+	if (atomic_dec_and_test(&con->nref)) {
+		BUG_ON(con->sock);
+		kfree(con);
+	}
+}
+
+/*
+ * initialize a new connection.
+ *
+ * Zeroes *con, sets the reference count to 1, ties it to @msgr and
+ * initializes its mutex, message lists and delayed work (con_work).
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+	dout("con_init %p\n", con);
+	memset(con, 0, sizeof(*con));
+	atomic_set(&con->nref, 1);
+	con->msgr = msgr;
+	mutex_init(&con->mutex);
+	INIT_LIST_HEAD(&con->out_queue);
+	INIT_LIST_HEAD(&con->out_sent);
+	INIT_DELAYED_WORK(&con->work, con_work);
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+
+/*
+ * We maintain a global counter to order connection attempts.  Get
+ * a unique seq greater than @gt.
+ *
+ * The counter is first raised to @gt if it is below it, then
+ * incremented; the incremented value is returned.  Serialized by
+ * msgr->global_seq_lock.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+	u32 ret;
+
+	spin_lock(&msgr->global_seq_lock);
+	if (msgr->global_seq < gt)
+		msgr->global_seq = gt;
+	ret = ++msgr->global_seq;
+	spin_unlock(&msgr->global_seq_lock);
+	return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid.. we just add on to the end.
+ *
+ * @v is the out_kvec[] slot that receives the footer.  The kvec count
+ * and byte total are bumped accordingly and the message is marked done.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+	struct ceph_msg *m = con->out_msg;
+
+	dout("prepare_write_message_footer %p\n", con);
+	con->out_kvec_is_msg = true;
+	con->out_kvec[v].iov_base = &m->footer;
+	con->out_kvec[v].iov_len = sizeof(m->footer);
+	con->out_kvec_bytes += sizeof(m->footer);
+	con->out_kvec_left++;
+	con->out_more = m->more_to_follow;
+	con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ *
+ * Takes the first message on con->out_queue, makes it con->out_msg and
+ * fills out_kvec[] with: an optional ack (if we have unacked incoming
+ * messages), the message tag, header, front and (if present) middle.
+ * Header, front and middle crcs are computed here; the data crc starts
+ * at 0.  If there is no data payload the footer is queued as well,
+ * otherwise the page iterator is reset so the data can be sent next.
+ * Sets WRITE_PENDING.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	int v = 0;
+
+	con->out_kvec_bytes = 0;
+	con->out_kvec_is_msg = true;
+	con->out_msg_done = false;
+
+	/* Sneak an ack in there first?  If we can get it into the same
+	 * TCP packet that's a good thing. */
+	if (con->in_seq > con->in_seq_acked) {
+		con->in_seq_acked = con->in_seq;
+		con->out_kvec[v].iov_base = &tag_ack;
+		con->out_kvec[v++].iov_len = 1;
+		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con->out_kvec[v].iov_base = &con->out_temp_ack;
+		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	}
+
+	m = list_first_entry(&con->out_queue,
+		       struct ceph_msg, list_head);
+	con->out_msg = m;
+	if (test_bit(LOSSYTX, &con->state)) {
+		/* lossy: the message is not kept for a possible resend */
+		list_del_init(&m->list_head);
+	} else {
+		/* put message on sent list */
+		ceph_msg_get(m);
+		list_move_tail(&m->list_head, &con->out_sent);
+	}
+
+	/*
+	 * only assign outgoing seq # if we haven't sent this message
+	 * yet.  if it is requeued, resend with its original seq.
+	 */
+	if (m->needs_out_seq) {
+		m->hdr.seq = cpu_to_le64(++con->out_seq);
+		m->needs_out_seq = false;
+	}
+
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+	     m, con->out_seq, le16_to_cpu(m->hdr.type),
+	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+	     le32_to_cpu(m->hdr.data_len),
+	     m->nr_pages);
+	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+	/* tag + hdr + front + middle */
+	con->out_kvec[v].iov_base = &tag_msg;
+	con->out_kvec[v++].iov_len = 1;
+	con->out_kvec[v].iov_base = &m->hdr;
+	con->out_kvec[v++].iov_len = sizeof(m->hdr);
+	con->out_kvec[v++] = m->front;
+	if (m->middle)
+		con->out_kvec[v++] = m->middle->vec;
+	con->out_kvec_left = v;
+	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+		(m->middle ? m->middle->vec.iov_len : 0);
+	con->out_kvec_cur = con->out_kvec;
+
+	/* fill in crc (except data pages), footer */
+	con->out_msg->hdr.crc =
+		cpu_to_le32(crc32c(0, (void *)&m->hdr,
+				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
+	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+	con->out_msg->footer.front_crc =
+		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+	if (m->middle)
+		con->out_msg->footer.middle_crc =
+			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+					   m->middle->vec.iov_len));
+	else
+		con->out_msg->footer.middle_crc = 0;
+	con->out_msg->footer.data_crc = 0;
+	/* the second value is the middle crc; label it as such */
+	dout("prepare_write_message front_crc %u middle_crc %u\n",
+	     le32_to_cpu(con->out_msg->footer.front_crc),
+	     le32_to_cpu(con->out_msg->footer.middle_crc));
+
+	/* is there a data payload? */
+	if (le32_to_cpu(m->hdr.data_len) > 0) {
+		/* initialize page iterator */
+		con->out_msg_pos.page = 0;
+		if (m->pages)
+			con->out_msg_pos.page_pos = m->page_alignment;
+		else
+			con->out_msg_pos.page_pos = 0;
+		con->out_msg_pos.data_pos = 0;
+		con->out_msg_pos.did_page_crc = 0;
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
+		/* no, queue up footer too and be done */
+		prepare_write_message_footer(con, v);
+	}
+
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ *
+ * Queues the ack tag byte followed by the little-endian in_seq, records
+ * that in_seq is now acked, and sets WRITE_PENDING.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+	dout("prepare_write_ack %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con->out_kvec[0].iov_base = &tag_ack;
+	con->out_kvec[0].iov_len = 1;
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con->out_kvec[1].iov_base = &con->out_temp_ack;
+	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 1;  /* more will follow.. eventually.. */
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ *
+ * Queues the single keepalive tag byte and sets WRITE_PENDING.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+	dout("prepare_write_keepalive %p\n", con);
+	con->out_kvec[0].iov_base = &tag_keepalive;
+	con->out_kvec[0].iov_len = 1;
+	con->out_kvec_left = 1;
+	con->out_kvec_bytes = 1;
+	con->out_kvec_cur = con->out_kvec;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+/*
+ * Ask the connection owner for an authorizer (if it provides a
+ * get_authorizer op) and append it to the pending connect message.
+ *
+ * Called with con->mutex held; the mutex is dropped around the
+ * callback.  Returns -EAGAIN if the connection was closed or reopened
+ * meanwhile, 0 otherwise.
+ */
+static int prepare_connect_authorizer(struct ceph_connection *con)
+{
+	void *auth_buf;
+	int auth_len = 0;
+	int auth_protocol = 0;
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->get_authorizer)
+		con->ops->get_authorizer(con, &auth_buf, &auth_len,
+					 &auth_protocol, &con->auth_reply_buf,
+					 &con->auth_reply_buf_len,
+					 con->auth_retry);
+	mutex_lock(&con->mutex);
+
+	/* the state may have changed while the mutex was released */
+	if (test_bit(CLOSED, &con->state) ||
+	    test_bit(OPENING, &con->state))
+		return -EAGAIN;
+
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+	/* auth_buf is only read when a non-zero length was reported */
+	if (auth_len) {
+		con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+		con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+		con->out_kvec_left++;
+		con->out_kvec_bytes += auth_len;
+	}
+	return 0;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ *
+ * Queues CEPH_BANNER (without its terminating NUL) followed by our
+ * encoded address, and sets WRITE_PENDING.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+				 struct ceph_connection *con)
+{
+	int len = strlen(CEPH_BANNER);
+
+	con->out_kvec[0].iov_base = CEPH_BANNER;
+	con->out_kvec[0].iov_len = len;
+	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Fill in con->out_connect and queue it for sending, followed by the
+ * authorizer (see prepare_connect_authorizer()).
+ *
+ * If @after_banner is non-zero the connect message is appended to what
+ * is already queued in out_kvec[]; otherwise out_kvec[] is restarted
+ * from empty.  The protocol version is chosen from the peer's entity
+ * type; an unexpected type is a BUG.
+ *
+ * Returns the result of prepare_connect_authorizer().
+ */
+static int prepare_write_connect(struct ceph_messenger *msgr,
+				 struct ceph_connection *con,
+				 int after_banner)
+{
+	unsigned global_seq = get_global_seq(con->msgr, 0);
+	int proto;
+
+	switch (con->peer_name.type) {
+	case CEPH_ENTITY_TYPE_MON:
+		proto = CEPH_MONC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_OSD:
+		proto = CEPH_OSDC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_MDS:
+		proto = CEPH_MDSC_PROTOCOL;
+		break;
+	default:
+		BUG();
+	}
+
+	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+	     con->connect_seq, global_seq, proto);
+
+	con->out_connect.features = cpu_to_le64(msgr->supported_features);
+	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq = cpu_to_le32(global_seq);
+	con->out_connect.protocol_version = cpu_to_le32(proto);
+	con->out_connect.flags = 0;
+
+	if (!after_banner) {
+		con->out_kvec_left = 0;
+		con->out_kvec_bytes = 0;
+	}
+	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left++;
+	con->out_kvec_bytes += sizeof(con->out_connect);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+
+	return prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ *
+ * On a short write, out_kvec_cur/out_kvec_left and the first remaining
+ * kvec are advanced past the bytes already sent, so the next call
+ * resumes where this one stopped.
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+	while (con->out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+				       con->out_kvec_left, con->out_kvec_bytes,
+				       con->out_more);
+		if (ret <= 0)
+			goto out;
+		con->out_kvec_bytes -= ret;
+		if (con->out_kvec_bytes == 0)
+			break;            /* done */
+		/* consume the sent bytes from the front of the kvec array */
+		while (ret > 0) {
+			if (ret >= con->out_kvec_cur->iov_len) {
+				/* this kvec went out completely */
+				ret -= con->out_kvec_cur->iov_len;
+				con->out_kvec_cur++;
+				con->out_kvec_left--;
+			} else {
+				/* partially sent: trim its head */
+				con->out_kvec_cur->iov_len -= ret;
+				con->out_kvec_cur->iov_base += ret;
+				ret = 0;
+				break;
+			}
+		}
+	}
+	con->out_kvec_left = 0;
+	con->out_kvec_is_msg = false;
+	ret = 1;
+out:
+	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	return ret;  /* done! */
+}
+
+#ifdef CONFIG_BLOCK
+/*
+ * Point the (*iter, *seg) cursor at the first segment of @bio, i.e.
+ * bio->bi_idx.  A NULL @bio yields a NULL iterator and segment 0.
+ */
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+/*
+ * Advance the (*bio_iter, *seg) cursor by one segment, moving on to the
+ * next bio in the bi_next chain when the current one is exhausted.
+ * Does nothing if the iterator is already NULL.
+ */
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ *
+ * The payload comes from, in order of precedence: the trail pagelist
+ * (once the trail part of the data is reached), msg->pages,
+ * msg->pagelist, msg->bio (CONFIG_BLOCK only) or, failing all of
+ * those, the messenger's zero page.
+ *
+ * The data crc is accumulated chunk by chunk unless the messenger has
+ * "nocrc" set, in which case the footer is flagged
+ * CEPH_MSG_FOOTER_NOCRC instead.
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+	size_t len;
+	/* "nocrc" is an opt-out: compute the data crc unless it is set */
+	bool do_datacrc = !con->msgr->nocrc;
+	int ret;
+	int total_max_write;
+	int in_trail = 0;
+	size_t trail_len = (msg->trail ? msg->trail->length : 0);
+
+	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+	     con->out_msg_pos.page_pos);
+
+#ifdef CONFIG_BLOCK
+	if (msg->bio && !msg->bio_iter)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+	while (data_len > con->out_msg_pos.data_pos) {
+		struct page *page = NULL;
+		void *kaddr = NULL;
+		int max_write = PAGE_SIZE;
+		int page_shift = 0;
+
+		total_max_write = data_len - trail_len -
+			con->out_msg_pos.data_pos;
+
+		/*
+		 * if we are calculating the data crc (the default), we need
+		 * to map the page.  if our pages[] has been revoked, use the
+		 * zero page.
+		 */
+
+		/* have we reached the trail part of the data? */
+		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+			in_trail = 1;
+
+			total_max_write = data_len - con->out_msg_pos.data_pos;
+
+			page = list_first_entry(&msg->trail->head,
+						struct page, lru);
+			if (do_datacrc)
+				kaddr = kmap(page);
+			max_write = PAGE_SIZE;
+		} else if (msg->pages) {
+			page = msg->pages[con->out_msg_pos.page];
+			if (do_datacrc)
+				kaddr = kmap(page);
+		} else if (msg->pagelist) {
+			page = list_first_entry(&msg->pagelist->head,
+						struct page, lru);
+			if (do_datacrc)
+				kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+		} else if (msg->bio) {
+			struct bio_vec *bv;
+
+			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+			page = bv->bv_page;
+			page_shift = bv->bv_offset;
+			if (do_datacrc)
+				kaddr = kmap(page) + page_shift;
+			max_write = bv->bv_len;
+#endif
+		} else {
+			page = con->msgr->zero_page;
+			if (do_datacrc)
+				kaddr = page_address(con->msgr->zero_page);
+		}
+		len = min_t(int, max_write - con->out_msg_pos.page_pos,
+			    total_max_write);
+
+		/*
+		 * crc each chunk once, even if sending it takes several
+		 * calls; did_page_crc is cleared when the chunk completes.
+		 */
+		if (do_datacrc && !con->out_msg_pos.did_page_crc) {
+			void *base = kaddr + con->out_msg_pos.page_pos;
+			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+			BUG_ON(kaddr == NULL);
+			con->out_msg->footer.data_crc =
+				cpu_to_le32(crc32c(tmpcrc, base, len));
+			con->out_msg_pos.did_page_crc = 1;
+		}
+		ret = kernel_sendpage(con->sock, page,
+				      con->out_msg_pos.page_pos + page_shift,
+				      len,
+				      MSG_DONTWAIT | MSG_NOSIGNAL |
+				      MSG_MORE);
+
+		/* the zero page was not kmap()ed, so it is not unmapped */
+		if (do_datacrc &&
+		    (msg->pages || msg->pagelist || msg->bio || in_trail))
+			kunmap(page);
+
+		if (ret == -EAGAIN)
+			ret = 0;
+		if (ret <= 0)
+			goto out;
+
+		con->out_msg_pos.data_pos += ret;
+		con->out_msg_pos.page_pos += ret;
+		if (ret == len) {
+			/* chunk fully sent: step to the next page/segment */
+			con->out_msg_pos.page_pos = 0;
+			con->out_msg_pos.page++;
+			con->out_msg_pos.did_page_crc = 0;
+			if (in_trail)
+				list_move_tail(&page->lru,
+					       &msg->trail->head);
+			else if (msg->pagelist)
+				list_move_tail(&page->lru,
+					       &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+			else if (msg->bio)
+				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
+		}
+	}
+
+	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+	/* prepare and queue up footer, too */
+	if (!do_datacrc)
+		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_left = 0;
+	con->out_kvec_cur = con->out_kvec;
+	prepare_write_message_footer(con, 0);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * write some zeros
+ *
+ * Sends con->out_skip bytes taken from the messenger's zero page, at
+ * most PAGE_CACHE_SIZE per send.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	int ret;
+
+	while (con->out_skip > 0) {
+		struct kvec iov = {
+			.iov_base = page_address(con->msgr->zero_page),
+			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+		};
+
+		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+		if (ret <= 0)
+			goto out;
+		con->out_skip -= ret;
+	}
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ *
+ * Each of these helpers restarts the partial-read cursor
+ * (con->in_base_pos) at 0.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	dout("prepare_read_banner %p\n", con);
+	con->in_base_pos = 0;
+}
+
+/* Restart the partial-read cursor before reading the connect reply. */
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	dout("prepare_read_connect %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_ack %p\n", con);
+	con->in_base_pos = 0; /* read_partial_ack() starts at offset 0 */
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	dout("prepare_read_tag %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_READY; /* try_read() reads a tag byte next */
+}
+
+/*
+ * Prepare to read a message: reset the read offset and the running crcs.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg != NULL); /* no earlier message may be pending */
+	con->in_base_pos = 0;
+	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+	return 0; /* the only value this function returns */
+}
+
+
+static int read_partial(struct ceph_connection *con,
+			int *to, int size, void *object)
+{
+	*to += size;	/* offset at which this object ends */
+	while (con->in_base_pos < *to) {
+		int missing = *to - con->in_base_pos;
+		int got = ceph_tcp_recvmsg(con->sock,
+					   object + (size - missing), missing);
+		if (got <= 0)
+			return got;
+		con->in_base_pos += got;
+	}
+	return 1;
+}
+
+
+/*
+ * Read all or part of the peer's banner and the two addresses after it
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+	int rc, end = 0;
+
+	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+	/* the banner string, without a terminating NUL */
+	rc = read_partial(con, &end, strlen(CEPH_BANNER), con->in_banner);
+	if (rc <= 0)
+		return rc;
+
+	/* the address the peer reports for itself */
+	rc = read_partial(con, &end, sizeof(con->actual_peer_addr),
+			  &con->actual_peer_addr);
+	if (rc <= 0)
+		return rc;
+
+	/* our own address as the peer sees it */
+	return read_partial(con, &end, sizeof(con->peer_addr_for_me),
+			    &con->peer_addr_for_me);
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int ret, to = 0, auth_len;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+	if (ret <= 0)
+		return ret;
+	/* authorizer_len comes off the wire: bound it by our reply buffer */
+	auth_len = le32_to_cpu(con->in_reply.authorizer_len);
+	if (auth_len < 0 || auth_len > con->auth_reply_buf_len) {
+		con->error_msg = "protocol error, authorizer reply too big";
+		return -EINVAL;
+	}
+	ret = read_partial(con, &to, auth_len, con->auth_reply_buf);
+	if (ret > 0)
+		dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+		     con, (int)con->in_reply.tag,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     le32_to_cpu(con->in_reply.global_seq));
+	return ret;
+}
+
+/*
+ * Check that the peer opened the conversation with the expected banner.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (!memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER)))
+		return 0;	/* banner matches */
+
+	pr_err("connect to %s got bad banner\n",
+	       ceph_pr_addr(&con->peer_addr.in_addr));
+	con->error_msg = "protocol error, bad banner";
+	return -1;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+	if (ss->ss_family == AF_INET)
+		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+
+	if (ss->ss_family == AF_INET6) {
+		struct in6_addr *a = &((struct sockaddr_in6 *)ss)->sin6_addr;
+
+		return a->s6_addr32[0] == 0 && a->s6_addr32[1] == 0 &&
+		       a->s6_addr32[2] == 0 && a->s6_addr32[3] == 0;
+	}
+	/* any other address family is never treated as blank */
+	return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+	__be16 port = 0;	/* families other than IPv4/IPv6 report 0 */
+
+	if (ss->ss_family == AF_INET)
+		port = ((struct sockaddr_in *)ss)->sin_port;
+	else if (ss->ss_family == AF_INET6)
+		port = ((struct sockaddr_in6 *)ss)->sin6_port;
+	return ntohs(port);
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+	/* stored in network byte order; other families are left untouched */
+	if (ss->ss_family == AF_INET) {
+		struct sockaddr_in *v4 = (struct sockaddr_in *)ss;
+
+		v4->sin_port = htons(p);
+	} else if (ss->ss_family == AF_INET6) {
+		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+	}
+}
+
+/*
+ * Parse a comma-separated ip[:port] list (IPv6 may be bracketed) from
+ * [c, end) into addr[].  The default monitor port is used when no port
+ * is given.  On success returns 0 and stores the number of addresses in
+ * *count (if non-NULL); returns -EINVAL on malformed input, on a port
+ * outside 1..65535, or when the list does not fit in max_count entries.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+		   struct ceph_entity_addr *addr,
+		   int max_count, int *count)
+{
+	int i;
+	const char *p = c;
+
+	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+	for (i = 0; i < max_count; i++) {
+		const char *ipend;
+		struct sockaddr_storage *ss = &addr[i].in_addr;
+		struct sockaddr_in *in4 = (void *)ss;
+		struct sockaddr_in6 *in6 = (void *)ss;
+		int port = CEPH_MON_PORT;
+		char delim = ',';
+
+		if (p < end && *p == '[') {
+			delim = ']';
+			p++;
+		}
+
+		memset(ss, 0, sizeof(*ss));
+		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+			     delim, &ipend))
+			ss->ss_family = AF_INET;
+		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+				  delim, &ipend))
+			ss->ss_family = AF_INET6;
+		else
+			goto bad;
+		p = ipend;
+
+		if (delim == ']') {
+			if (p >= end || *p != ']') {
+				dout("missing matching ']'\n");
+				goto bad;
+			}
+			p++;
+		}
+
+		/* explicit port?  otherwise keep the default monitor port */
+		if (p < end && *p == ':') {
+			port = 0;
+			for (p++; p < end && *p >= '0' && *p <= '9'; p++) {
+				port = (port * 10) + (*p - '0');
+				/* stop before the accumulator can overflow */
+				if (port > 65535)
+					goto bad;
+			}
+			if (port == 0)
+				goto bad;
+		}
+		addr_set_port(ss, port);
+		dout("parse_ips got %s\n", ceph_pr_addr(ss));
+		if (p == end)
+			break;
+		if (*p != ',')
+			goto bad;
+		p++;
+	}
+
+	/* i == max_count means a ',' followed the last usable slot */
+	if (p != end || i == max_count)
+		goto bad;
+
+	if (count)
+		*count = i + 1;
+	return 0;
+
+bad:
+	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_parse_ips);
+
+static int process_banner(struct ceph_connection *con)
+{
+	dout("process_banner on %p\n", con);
+
+	if (verify_hello(con) < 0)
+		return -1; /* verify_hello() has set con->error_msg */
+
+	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_addr(&con->peer_addr_for_me);
+
+	/*
+	 * Make sure the other end is who we wanted.  The other end may not
+	 * yet know its ip address, so a blank address is accepted as long
+	 * as the nonce matches the one we expected.
+	 */
+	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+		   sizeof(con->peer_addr)) != 0 &&
+	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+		pr_warning("wrong peer, want %s/%d, got %s/%d\n",
+			   ceph_pr_addr(&con->peer_addr.in_addr),
+			   (int)le32_to_cpu(con->peer_addr.nonce),
+			   ceph_pr_addr(&con->actual_peer_addr.in_addr),
+			   (int)le32_to_cpu(con->actual_peer_addr.nonce));
+		con->error_msg = "wrong peer at address";
+		return -1;
+	}
+
+	/*
+	 * Our own address still blank?  Adopt what the peer saw, keep our port.
+	 */
+	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+		int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+		memcpy(&con->msgr->inst.addr.in_addr,
+		       &con->peer_addr_for_me.in_addr,
+		       sizeof(con->peer_addr_for_me.in_addr));
+		addr_set_port(&con->msgr->inst.addr.in_addr, port);
+		encode_my_addr(con->msgr);
+		dout("process_banner learned my addr is %s\n",
+		     ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+	}
+
+	set_bit(NEGOTIATING, &con->state); /* try_read() skips the banner now */
+	prepare_read_connect(con);
+	return 0;
+}
+
+static void fail_protocol(struct ceph_connection *con)
+{
+	reset_connection(con);
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+	mutex_unlock(&con->mutex); /* bad_proto() is called without con->mutex */
+	if (con->ops->bad_proto)
+		con->ops->bad_proto(con);
+	mutex_lock(&con->mutex);
+}
+
+/*
+ * Act on the connect reply the peer sent (con->in_reply).  Returns 0 to
+ * carry on, -EAGAIN after a racing close/reopen, other negatives on failure.
+ */
+static int process_connect(struct ceph_connection *con)
+{
+	u64 sup_feat = con->msgr->supported_features;
+	u64 req_feat = con->msgr->required_features;
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
+	int ret;
+
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
+		       le32_to_cpu(con->out_connect.protocol_version),
+		       le32_to_cpu(con->in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->auth_retry);
+		if (con->auth_retry == 2) {
+			con->error_msg = "connect authorization failure";
+			return -1;
+		}
+		con->auth_retry = 1;
+		ret = prepare_write_connect(con->msgr, con, 0);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESSION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->in_reply.connect_seq));
+		pr_err("%s%lld %s connection reset\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr));
+		reset_connection(con);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		/* Tell ceph about it. */
+		mutex_unlock(&con->mutex);
+		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
+		if (test_bit(CLOSED, &con->state) ||
+		    test_bit(OPENING, &con->state))
+			return -EAGAIN;
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with the value the peer put in its reply (in_reply).
+		 */
+		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+		     le32_to_cpu(con->out_connect.connect_seq),
+		     le32_to_cpu(con->in_reply.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with one derived from the peer's reply (in_reply).
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.global_seq));
+		get_global_seq(con->msgr,
+			       le32_to_cpu(con->in_reply.global_seq));
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       ceph_pr_addr(&con->peer_addr.in_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			fail_protocol(con);
+			return -1;
+		}
+		clear_bit(CONNECTING, &con->state);
+		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+		con->connect_seq++;
+		con->peer_features = server_feat;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     con->connect_seq);
+		WARN_ON(con->connect_seq !=
+			le32_to_cpu(con->in_reply.connect_seq));
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			set_bit(LOSSYTX, &con->state);
+		prepare_read_tag(con);
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		pr_err("process_connect got WAIT as client\n");
+		con->error_msg = "protocol error, got WAIT as client";
+		return -1;
+
+	default:
+		pr_err("connect protocol error, will retry\n");
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * Read (part of) the acked sequence number into con->in_temp_ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	int end = 0;	/* running end offset maintained by read_partial() */
+
+	return read_partial(con, &end, sizeof(con->in_temp_ack),
+			    &con->in_temp_ack);
+}
+
+
+/*
+ * Drop every sent message whose sequence number the peer has acknowledged.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	u64 acked = le64_to_cpu(con->in_temp_ack);
+
+	while (!list_empty(&con->out_sent)) {
+		struct ceph_msg *msg = list_first_entry(&con->out_sent,
+						struct ceph_msg, list_head);
+		u64 seq = le64_to_cpu(msg->hdr.seq);
+
+		if (seq > acked)
+			break;	/* first message not covered by this ack */
+		dout("got ack for seq %llu type %d at %p\n", seq,
+		     le16_to_cpu(msg->hdr.type), msg);
+		ceph_msg_remove(msg);
+	}
+
+	prepare_read_tag(con);	/* go back to waiting for a tag */
+}
+
+
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+					struct kvec *section,
+					unsigned int sec_len, u32 *crc)
+{
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		int want = sec_len - section->iov_len;
+		char *dst;
+		int got;
+
+		BUG_ON(section->iov_base == NULL);
+		dst = (char *)section->iov_base + section->iov_len;
+		got = ceph_tcp_recvmsg(con->sock, dst, want);
+		if (got <= 0)
+			return got;
+		section->iov_len += got;
+		/* checksum the section once its final byte has arrived */
+		if (section->iov_len == sec_len)
+			*crc = crc32c(0, section->iov_base, sec_len);
+	}
+	return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+				      struct page **pages,
+				      unsigned data_len, int datacrc)
+{
+	struct page *page;
+	void *dst;
+	int want, got;
+
+	/* (page) data */
+	BUG_ON(pages == NULL);
+	want = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+	page = pages[con->in_msg_pos.page];
+	dst = kmap(page) + con->in_msg_pos.page_pos;
+	got = ceph_tcp_recvmsg(con->sock, dst, want);
+	if (got > 0 && datacrc)
+		con->in_data_crc = crc32c(con->in_data_crc, dst, got);
+	kunmap(page);
+	if (got <= 0)
+		return got;
+
+	con->in_msg_pos.data_pos += got;
+	con->in_msg_pos.page_pos += got;
+	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+		/* page is full: continue at the start of the next one */
+		con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.page++;
+	}
+
+	return got;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+				    struct bio **bio_iter, int *bio_seg,
+				    unsigned data_len, int datacrc)
+{
+	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+	void *dst;
+	int want, got;
+
+	if (IS_ERR(bv))
+		return PTR_ERR(bv);
+
+	/* limited by what the message still needs and the segment can hold */
+	want = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+	dst = kmap(bv->bv_page) + bv->bv_offset + con->in_msg_pos.page_pos;
+	got = ceph_tcp_recvmsg(con->sock, dst, want);
+	/* fold the newly received bytes into the running data crc */
+	if (got > 0 && datacrc)
+		con->in_data_crc = crc32c(con->in_data_crc, dst, got);
+	kunmap(bv->bv_page);
+	if (got <= 0)
+		return got;
+
+	con->in_msg_pos.data_pos += got;
+	con->in_msg_pos.page_pos += got;
+	if (con->in_msg_pos.page_pos == bv->bv_len) {
+		/* segment filled: move on to the next one */
+		con->in_msg_pos.page_pos = 0;
+		iter_bio_next(bio_iter, bio_seg);
+	}
+
+	return got;
+}
+#endif
+
+/*
+ * Read (part of) a message.  Returns 1 once the complete message has been
+ * read and its crcs verified, 0 if the message is to be skipped or a socket
+ * read returned 0, and a negative errno on failure.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	int ret, to, left;
+	unsigned front_len, middle_len, data_len;
+	int datacrc = !con->msgr->nocrc;  /* on unless "nocrc" is set */
+	int skip;
+	u64 seq;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	while (con->in_base_pos < sizeof(con->in_hdr)) {
+		left = sizeof(con->in_hdr) - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock,
+				       (char *)&con->in_hdr + con->in_base_pos,
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+		if (con->in_base_pos == sizeof(con->in_hdr)) {
+			u32 crc = crc32c(0, (void *)&con->in_hdr,
+				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+			if (crc != le32_to_cpu(con->in_hdr.crc)) {
+				pr_err("read_partial_message bad hdr "
+				       " crc %u != expected %u\n",
+				       crc, le32_to_cpu(con->in_hdr.crc));
+				return -EBADMSG;
+			}
+		}
+	}
+	front_len = le32_to_cpu(con->in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+
+	/* verify seq# */
+	seq = le64_to_cpu(con->in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr.in_addr),
+			seq, con->in_seq + 1);
+		con->in_base_pos = -front_len - middle_len - data_len -
+			sizeof(m->footer);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		return 0;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADMSG;
+	}
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		     con->in_hdr.front_len, con->in_hdr.data_len);
+		skip = 0;
+		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+		if (skip) {
+			/* skip this message */
+			dout("alloc_msg said skip message\n");
+			BUG_ON(con->in_msg);
+			con->in_base_pos = -front_len - middle_len - data_len -
+				sizeof(m->footer);
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
+			return 0;
+		}
+		if (!con->in_msg) {
+			con->error_msg =
+				"error allocating memory for incoming message";
+			return -ENOMEM;
+		}
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
+
+		con->in_msg_pos.page = 0;
+		if (m->pages)
+			con->in_msg_pos.page_pos = m->page_alignment;
+		else
+			con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.data_pos = 0;
+	}
+
+	/* front */
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
+
+	/* middle */
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec,
+						   middle_len,
+						   &con->in_middle_crc);
+		if (ret <= 0)
+			return ret;
+	}
+#ifdef CONFIG_BLOCK
+	if (m->bio && !m->bio_iter)
+		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
+
+	/* (page) data */
+	while (con->in_msg_pos.data_pos < data_len) {
+		if (m->pages) {
+			ret = read_partial_message_pages(con, m->pages,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#ifdef CONFIG_BLOCK
+		} else if (m->bio) {
+			ret = read_partial_message_bio(con,
+						 &m->bio_iter, &m->bio_seg,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#endif
+		} else {
+			BUG_ON(1);
+		}
+	}
+
+	/* footer */
+	to = sizeof(m->hdr) + sizeof(m->footer);
+	while (con->in_base_pos < to) {
+		left = to - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+				       (con->in_base_pos - sizeof(m->hdr)),
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok?  (footer crcs are little-endian on the wire) */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n", m,
+		       con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
+		return -EBADMSG;
+	}
+	if (datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+/*
+ * Hand a completely read message to the dispatch callback.  This happens in
+ * the worker thread.  The callback should be careful not to do anything that
+ * waits on other incoming messages or it may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+	struct ceph_msg *msg;
+
+	msg = con->in_msg;
+	con->in_msg = NULL; /* the message is passed to dispatch() below */
+
+	/* if first message, set peer_name */
+	if (con->peer_name.type == 0)
+		con->peer_name = msg->hdr.src;
+
+	con->in_seq++;
+	mutex_unlock(&con->mutex); /* dispatch() is called without con->mutex */
+
+	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+	     msg, le64_to_cpu(msg->hdr.seq),
+	     ENTITY_NAME(msg->hdr.src),
+	     le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.data_len),
+	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+	con->ops->dispatch(con, msg);
+
+	mutex_lock(&con->mutex);
+	prepare_read_tag(con);
+}
+
+
+/*
+ * Write what is queued to the socket; called from con_work().  Returns 0
+ * when nothing is left to write or a helper reported 0, negative on error.
+ */
+static int try_write(struct ceph_connection *con)
+{
+	struct ceph_messenger *msgr = con->msgr;
+	int ret = 1;
+
+	dout("try_write start %p state %lu nref %d\n", con, con->state,
+	     atomic_read(&con->nref));
+
+more:
+	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+	/* no socket yet: queue banner + connect request, then connect */
+	if (con->sock == NULL) {
+		prepare_write_banner(msgr, con);
+		prepare_write_connect(msgr, con, 1);
+		prepare_read_banner(con);
+		set_bit(CONNECTING, &con->state);
+		clear_bit(NEGOTIATING, &con->state);
+
+		BUG_ON(con->in_msg);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %lu\n",
+		     con, con->state);
+		con->sock = ceph_tcp_connect(con);
+		if (IS_ERR(con->sock)) {
+			con->sock = NULL;
+			con->error_msg = "connect error";
+			ret = -1;
+			goto out;
+		}
+	}
+
+more_kvec:
+	/* pending zero padding first, then any queued kvec data */
+	if (con->out_skip) {
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto out;
+	}
+	if (con->out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto out;
+	}
+
+	/* msg pages? */
+	if (con->out_msg) {
+		if (con->out_msg_done) {
+			ceph_msg_put(con->out_msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
+		ret = write_partial_msg_pages(con);
+		if (ret == 1)
+			goto more_kvec;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto out;
+		if (ret < 0) {
+			dout("try_write write_partial_msg_pages err %d\n",
+			     ret);
+			goto out;
+		}
+	}
+
+do_next:
+	if (!test_bit(CONNECTING, &con->state)) {
+		/* is anything else pending? */
+		if (!list_empty(&con->out_queue)) {
+			prepare_write_message(con);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {
+			prepare_write_ack(con);
+			goto more;
+		}
+		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	clear_bit(WRITE_PENDING, &con->state);
+	dout("try_write nothing else to write.\n");
+	ret = 0;
+out:
+	dout("try_write done on %p ret %d\n", con, ret);
+	return ret;
+}
+
+
+
+/*
+ * Read and process whatever the socket has for us; called from con_work().
+ */
+static int try_read(struct ceph_connection *con)
+{
+	int ret = -1;
+
+	if (!con->sock)
+		return 0;
+
+	if (test_bit(STANDBY, &con->state))
+		return 0;
+
+	dout("try_read start on %p\n", con);
+
+more:
+	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+	     con->in_base_pos);
+
+	/*
+	 * process_connect and process_message drop and re-take
+	 * con->mutex.  make sure we handle a racing close or reopen.
+	 */
+	if (test_bit(CLOSED, &con->state) ||
+	    test_bit(OPENING, &con->state)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (test_bit(CONNECTING, &con->state)) {
+		if (!test_bit(NEGOTIATING, &con->state)) {
+			dout("try_read connecting\n");
+			ret = read_partial_banner(con);
+			if (ret <= 0)
+				goto out;
+			ret = process_banner(con);
+			if (ret < 0)
+				goto out;
+		}
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto out;
+		ret = process_connect(con);
+		if (ret < 0)
+			goto out;
+		goto more;
+	}
+
+	if (con->in_base_pos < 0) {
+		/*
+		 * skipping + discarding content.
+		 *
+		 * FIXME: there must be a better way to do this!
+		 */
+		static char buf[1024]; /* scratch: only ever written into */
+		int skip = min(1024, -con->in_base_pos);
+		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+		if (ret <= 0)
+			goto out;
+		con->in_base_pos += ret;
+		if (con->in_base_pos)
+			goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * read the one-byte tag that says what comes next
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		if (ret <= 0)
+			goto out;
+		dout("try_read got tag %d\n", (int)con->in_tag);
+		switch (con->in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			set_bit(CLOSED, &con->state);   /* fixme */
+			goto out;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc";
+				ret = -EIO;
+				break;
+			case -EIO:
+				con->error_msg = "io error";
+				break;
+			}
+			goto out;
+		}
+		if (con->in_tag == CEPH_MSGR_TAG_READY)
+			goto more;
+		process_message(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto out;
+		process_ack(con);
+		goto more;
+	}
+
+out:
+	dout("try_read done on %p ret %d\n", con, ret);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+
+/*
+ * Queue work for a connection unless it is DEAD.  A reference on @con is
+ * taken for the queued work and dropped again if it was already queued.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+	if (test_bit(DEAD, &con->state)) {
+		dout("queue_con %p ignoring: DEAD\n", con);
+		return;
+	}
+
+	if (!con->ops->get(con)) {
+		dout("queue_con %p ref count 0\n", con);
+		return;
+	}
+
+	if (queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
+		dout("queue_con %p\n", con);
+		return;
+	}
+
+	dout("queue_con %p - already queued\n", con);
+	con->ops->put(con);
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+	struct ceph_connection *con = container_of(work, struct ceph_connection,
+						   work.work);
+	int ret;
+
+	mutex_lock(&con->mutex);
+restart:
+	if (test_and_clear_bit(BACKOFF, &con->state)) {
+		dout("con_work %p backing off\n", con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay))) {
+			dout("con_work %p backoff %lu\n", con, con->delay);
+			mutex_unlock(&con->mutex);
+			return; /* no put: presumably the requeued work owns it */
+		} else {
+			con->ops->put(con); /* NOTE(review): put again at done */
+			dout("con_work %p FAILED to back off %lu\n", con,
+			     con->delay);
+		}
+	}
+
+	if (test_bit(STANDBY, &con->state)) {
+		dout("con_work %p STANDBY\n", con);
+		goto done;
+	}
+	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+		dout("con_work CLOSED\n");
+		con_close_socket(con);
+		goto done;
+	}
+	if (test_and_clear_bit(OPENING, &con->state)) {
+		/* reopen w/ new peer */
+		dout("con_work OPENING\n");
+		con_close_socket(con);
+	}
+
+	if (test_and_clear_bit(SOCK_CLOSED, &con->state))
+		goto fault;
+
+	ret = try_read(con);
+	if (ret == -EAGAIN) /* state changed while con->mutex was dropped */
+		goto restart;
+	if (ret < 0)
+		goto fault;
+
+	ret = try_write(con);
+	if (ret == -EAGAIN)
+		goto restart;
+	if (ret < 0)
+		goto fault;
+
+done:
+	mutex_unlock(&con->mutex);
+done_unlocked:
+	con->ops->put(con);
+	return;
+
+fault:
+	mutex_unlock(&con->mutex); /* ceph_fault() takes con->mutex itself */
+	ceph_fault(con);     /* error/fault path */
+	goto done_unlocked;
+}
+
+
+/*
+ * Generic error/fault handler.  Unless the connection is LOSSYTX or CLOSED,
+ * close the socket, requeue unacked messages and retry with backoff or idle.
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	dout("fault %p state %lu to peer %s\n",
+	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+	if (test_bit(LOSSYTX, &con->state)) {
+		dout("fault on LOSSYTX channel\n");
+		goto out; /* no reconnect here; only the callbacks below run */
+	}
+
+	mutex_lock(&con->mutex);
+	if (test_bit(CLOSED, &con->state))
+		goto out_unlock;
+
+	con_close_socket(con);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	/* Requeue anything that hasn't been acked */
+	list_splice_init(&con->out_sent, &con->out_queue);
+
+	/* If there are no messages queued or keepalive pending, place
+	 * the connection in a STANDBY state */
+	if (list_empty(&con->out_queue) &&
+	    !test_bit(KEEPALIVE_PENDING, &con->state)) {
+		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+		clear_bit(WRITE_PENDING, &con->state);
+		set_bit(STANDBY, &con->state);
+	} else {
+		/* retry after a delay. */
+		if (con->delay == 0)
+			con->delay = BASE_DELAY_INTERVAL;
+		else if (con->delay < MAX_DELAY_INTERVAL)
+			con->delay *= 2;
+		con->ops->get(con); /* ref for the delayed work queued below */
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay))) {
+			dout("fault queued %p delay %lu\n", con, con->delay);
+		} else {
+			con->ops->put(con);
+			dout("fault failed to queue %p delay %lu, backoff\n",
+			     con, con->delay);
+			/*
+			 * In many cases we see a socket state change
+			 * while con_work is running and end up
+			 * queuing (non-delayed) work, such that we
+			 * can't backoff with a delay.  Set a flag so
+			 * that when con_work restarts we schedule the
+			 * delay then.
+			 */
+			set_bit(BACKOFF, &con->state);
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&con->mutex);
+out:
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+	 */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
+
+
+/*
+ * Create a messenger instance; returns ERR_PTR(-ENOMEM) if allocation fails
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+					     u32 supported_features,
+					     u32 required_features)
+{
+	struct ceph_messenger *msgr;
+
+	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+	if (msgr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	msgr->supported_features = supported_features;
+	msgr->required_features = required_features;
+
+	spin_lock_init(&msgr->global_seq_lock);
+
+	/* the zero page is needed if a request is "canceled" while the message
+	 * is being written over the socket */
+	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
+	if (!msgr->zero_page) {
+		kfree(msgr);
+		return ERR_PTR(-ENOMEM);
+	}
+	kmap(msgr->zero_page); /* unmapped in ceph_messenger_destroy() */
+
+	if (myaddr) /* a NULL myaddr leaves the zeroed address from kzalloc */
+		msgr->inst.addr = *myaddr;
+
+	/* select a random nonce */
+	msgr->inst.addr.type = 0;
+	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+	encode_my_addr(msgr);
+
+	dout("messenger_create %p\n", msgr);
+	return msgr;
+}
+EXPORT_SYMBOL(ceph_messenger_create);
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+	dout("destroy %p\n", msgr);
+	kunmap(msgr->zero_page); /* pairs with kmap() in ceph_messenger_create() */
+	__free_page(msgr->zero_page);
+	kfree(msgr);
+	dout("destroyed messenger %p\n", msgr); /* pointer is only printed */
+}
+EXPORT_SYMBOL(ceph_messenger_destroy);
+
+static void clear_standby(struct ceph_connection *con)
+{
+	/* come back from STANDBY?  if so, bump connect_seq under con->mutex */
+	if (test_and_clear_bit(STANDBY, &con->state)) {
+		mutex_lock(&con->mutex);
+		dout("clear_standby %p and ++connect_seq\n", con);
+		con->connect_seq++;
+		WARN_ON(test_bit(WRITE_PENDING, &con->state));
+		WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state));
+		mutex_unlock(&con->mutex);
+	}
+}
+
+/*
+ * Queue up an outgoing message; on a CLOSED connection it is dropped instead.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	if (test_bit(CLOSED, &con->state)) {
+		dout("con_send %p closed, dropping %p\n", con, msg);
+		ceph_msg_put(msg);
+		return;
+	}
+
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst.name;
+
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+	msg->needs_out_seq = true;
+
+	/* queue */
+	mutex_lock(&con->mutex);
+	BUG_ON(!list_empty(&msg->list_head)); /* must not be on a list yet */
+	list_add_tail(&msg->list_head, &con->out_queue);
+	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
+	     le32_to_cpu(msg->hdr.data_len));
+	mutex_unlock(&con->mutex);
+
+	/* if there wasn't anything waiting to send before, queue
+	 * new work */
+	clear_standby(con);
+	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send.
+ *
+ * If the message is still linked on a list it is unlinked and the
+ * reference that went with the queueing is dropped.  If it is the message
+ * currently being sent (con->out_msg), it is detached from the
+ * connection, any already-prepared kvec bytes that belong to it are
+ * marked to be skipped, and that reference is dropped as well.
+ *
+ * In both cases hdr.seq is reset *before* the reference is dropped, so
+ * the message is never written to after a put that might have released
+ * it.
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (!list_empty(&msg->list_head)) {
+		dout("con_revoke %p msg %p - was on queue\n", con, msg);
+		list_del_init(&msg->list_head);
+		msg->hdr.seq = 0;
+		ceph_msg_put(msg);
+	}
+	if (con->out_msg == msg) {
+		dout("con_revoke %p msg %p - was sending\n", con, msg);
+		con->out_msg = NULL;
+		if (con->out_kvec_is_msg) {
+			/* skip what is left of this message's kvec */
+			con->out_skip = con->out_kvec_bytes;
+			con->out_kvec_is_msg = false;
+		}
+		msg->hdr.seq = 0;
+		ceph_msg_put(msg);
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may currently be reading data into.
+ *
+ * When @msg is the connection's in-progress incoming message, the read
+ * position is moved back by the full size of that message (header, front,
+ * middle, data and footer, as announced in con->in_hdr), the message
+ * reference is dropped and the connection goes back to waiting for the
+ * next tag.  Otherwise this is a no-op.
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (!con->in_msg || con->in_msg != msg) {
+		dout("con_revoke_pages %p msg %p pages %p no-op\n",
+		     con, con->in_msg, msg);
+	} else {
+		/* total on-wire size of the message being abandoned */
+		size_t skip = sizeof(struct ceph_msg_header) +
+			le32_to_cpu(con->in_hdr.front_len) +
+			le32_to_cpu(con->in_hdr.middle_len) +
+			le32_to_cpu(con->in_hdr.data_len) +
+			sizeof(struct ceph_msg_footer);
+
+		/* skip rest of message */
+		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+		con->in_base_pos = con->in_base_pos - skip;
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Request that a keepalive byte be sent so a dead tcp connection is
+ * noticed.  Work is only queued if neither a keepalive nor a write was
+ * already pending.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+	dout("con_keepalive %p\n", con);
+	clear_standby(con);
+
+	/* a keepalive is already requested */
+	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state))
+		return;
+	/* write work is already pending and will pick the keepalive up */
+	if (test_and_set_bit(WRITE_PENDING, &con->state))
+		return;
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ *
+ * @type: message type, stored little-endian in the header
+ * @front_len: size in bytes of the front buffer to allocate (may be 0)
+ * @flags: allocation flags used for both the message and its front
+ *
+ * Returns the new message, or NULL if the message or its front buffer
+ * could not be allocated.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
+{
+	struct ceph_msg *m;
+
+	/* not zeroed: every field that is used is initialized by hand below */
+	m = kmalloc(sizeof(*m), flags);
+	if (m == NULL)
+		goto out;
+	kref_init(&m->kref);
+	INIT_LIST_HEAD(&m->list_head);
+
+	m->hdr.tid = 0;
+	m->hdr.type = cpu_to_le16(type);
+	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->hdr.version = 0;
+	m->hdr.front_len = cpu_to_le32(front_len);
+	m->hdr.middle_len = 0;
+	m->hdr.data_len = 0;
+	m->hdr.data_off = 0;
+	m->hdr.reserved = 0;
+	m->footer.front_crc = 0;
+	m->footer.middle_crc = 0;
+	m->footer.data_crc = 0;
+	m->footer.flags = 0;
+	m->front_max = front_len;
+	m->front_is_vmalloc = false;
+	m->more_to_follow = false;
+	m->pool = NULL;
+
+	/* middle */
+	m->middle = NULL;
+
+	/* data */
+	m->nr_pages = 0;
+	m->page_alignment = 0;
+	m->pages = NULL;
+	m->pagelist = NULL;
+	m->bio = NULL;
+	m->bio_iter = NULL;
+	m->bio_seg = 0;
+	m->trail = NULL;
+
+	/* front: vmalloc for anything larger than a page, kmalloc otherwise */
+	if (front_len) {
+		if (front_len > PAGE_CACHE_SIZE) {
+			m->front.iov_base = __vmalloc(front_len, flags,
+						      PAGE_KERNEL);
+			m->front_is_vmalloc = true;
+		} else {
+			m->front.iov_base = kmalloc(front_len, flags);
+		}
+		if (m->front.iov_base == NULL) {
+			pr_err("msg_new can't allocate %d bytes\n",
+			     front_len);
+			goto out2;
+		}
+	} else {
+		m->front.iov_base = NULL;
+	}
+	m->front.iov_len = front_len;
+
+	dout("ceph_msg_new %p front %d\n", m, front_len);
+	return m;
+
+out2:
+	/* drop the initial ref taken by kref_init() above */
+	ceph_msg_put(m);
+out:
+	pr_err("msg_new can't create type %d front %d\n", type, front_len);
+	return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate the "middle" section of a message when the header announces
+ * one and alloc_msg did not already provide it.  Doing this separately
+ * lets us read a small fixed-size per-type header into the front first
+ * and then fail gracefully (i.e., report the error to the caller based
+ * on info in the front) when the middle is too large.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int middle_len = le32_to_cpu(msg->hdr.middle_len);
+	int type = le16_to_cpu(msg->hdr.type);
+
+	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+	     ceph_msg_type_name(type), middle_len);
+	/* callers must only get here for a non-empty, not yet allocated middle */
+	BUG_ON(!middle_len);
+	BUG_ON(msg->middle);
+
+	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+	return msg->middle ? 0 : -ENOMEM;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ *
+ * If the connection provides an alloc_msg op it is asked first, with
+ * con->mutex dropped around the call.  When there is no such op, a fresh
+ * message is allocated from the type and front length found in @hdr.
+ * The incoming header (con->in_hdr) is then copied into the message and
+ * a middle buffer is allocated if the header announces one and the
+ * message does not have one yet.
+ *
+ * Returns the message, or NULL when the message should be skipped
+ * (*skip set by the op) or an allocation failed.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip)
+{
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg = NULL;
+	int ret;
+
+	if (con->ops->alloc_msg) {
+		/* the op is called without con->mutex held */
+		mutex_unlock(&con->mutex);
+		msg = con->ops->alloc_msg(con, hdr, skip);
+		mutex_lock(&con->mutex);
+		/*
+		 * NOTE(review): if the op returned a message *and* set
+		 * *skip, that message is not put here -- confirm no op
+		 * does that.
+		 */
+		if (!msg || *skip)
+			return NULL;
+	}
+	if (!msg) {
+		*skip = 0;
+		msg = ceph_msg_new(type, front_len, GFP_NOFS);
+		if (!msg) {
+			pr_err("unable to allocate msg type %d len %d\n",
+			       type, front_len);
+			return NULL;
+		}
+		msg->page_alignment = le16_to_cpu(hdr->data_off);
+	}
+	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+	if (middle_len && !msg->middle) {
+		ret = ceph_alloc_middle(con, msg);
+		if (ret < 0) {
+			ceph_msg_put(msg);
+			return NULL;
+		}
+	}
+
+	return msg;
+}
+
+
+/*
+ * Release a message that was allocated by the generic allocator: free
+ * the front buffer with the deallocator matching how it was obtained
+ * (vmalloc vs. kmalloc), then free the message itself.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+	void *front = m->front.iov_base;
+
+	dout("msg_kfree %p\n", m);
+	if (!m->front_is_vmalloc)
+		kfree(front);
+	else
+		vfree(front);
+	kfree(m);
+}
+
+/*
+ * Drop a msg ref.  Destroy as needed.
+ *
+ * kref release callback for struct ceph_msg: releases the middle buffer
+ * and the pagelist, clears the page/trail pointers, and then either
+ * hands the message to its msgpool or frees it.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+	dout("ceph_msg_put last one on %p\n", m);
+	/* a message being destroyed should no longer be linked on any list */
+	WARN_ON(!list_empty(&m->list_head));
+
+	/* drop middle, data, if any */
+	if (m->middle) {
+		ceph_buffer_put(m->middle);
+		m->middle = NULL;
+	}
+	/* the page vector is only forgotten here, not freed */
+	m->nr_pages = 0;
+	m->pages = NULL;
+
+	if (m->pagelist) {
+		ceph_pagelist_release(m->pagelist);
+		kfree(m->pagelist);
+		m->pagelist = NULL;
+	}
+
+	/* likewise, the trail is only forgotten, not released */
+	m->trail = NULL;
+
+	if (m->pool)
+		ceph_msgpool_put(m->pool, m);
+	else
+		ceph_msg_kfree(m);
+}
+EXPORT_SYMBOL(ceph_msg_last_put);
+
+/*
+ * Debugging aid: hex-dump a message's header, front, middle (if present)
+ * and footer at KERN_DEBUG level.  The data pages are not dumped.
+ */
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+		 msg->front_max, msg->nr_pages);
+	print_hex_dump(KERN_DEBUG, "header: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->hdr, sizeof(msg->hdr), true);
+	print_hex_dump(KERN_DEBUG, " front: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       msg->front.iov_base, msg->front.iov_len, true);
+	if (msg->middle)
+		print_hex_dump(KERN_DEBUG, "middle: ",
+			       DUMP_PREFIX_OFFSET, 16, 1,
+			       msg->middle->vec.iov_base,
+			       msg->middle->vec.iov_len, true);
+	print_hex_dump(KERN_DEBUG, "footer: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 00000000..cbe31fa4
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ *
+ * @p: start of the encoded blob
+ * @end: one past the last byte of the blob
+ *
+ * The blob is a 32-bit length, a 16-bit version, the fsid, the epoch, the
+ * number of monitors and then that many monitor instances.
+ *
+ * Returns a newly allocated monmap (to be released with kfree()),
+ * ERR_PTR(-EINVAL) if the blob is truncated or names too many monitors,
+ * or ERR_PTR(-ENOMEM) if the map cannot be allocated.
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+	struct ceph_monmap *m = NULL;
+	int i, err = -EINVAL;
+	struct ceph_fsid fsid;
+	u32 epoch, num_mon;
+	u16 version;
+	u32 len;
+
+	ceph_decode_32_safe(&p, end, len, bad);
+	ceph_decode_need(&p, end, len, bad);
+
+	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+	ceph_decode_16_safe(&p, end, version, bad);
+
+	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(&p);
+
+	num_mon = ceph_decode_32(&p);
+	/*
+	 * Bound the untrusted count before it takes part in any size
+	 * arithmetic: num_mon * sizeof(...) could otherwise wrap around
+	 * on 32-bit and make the length check below meaningless.
+	 */
+	if (num_mon >= CEPH_MAX_MON)
+		goto bad;
+	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+	m->fsid = fsid;
+	m->epoch = epoch;
+	m->num_mon = num_mon;
+	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+	/* fix up each copied address in place */
+	for (i = 0; i < num_mon; i++)
+		ceph_decode_addr(&m->mon_inst[i].addr);
+
+	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+	     m->num_mon);
+	for (i = 0; i < m->num_mon; i++)
+		dout("monmap_decode  mon%d is %s\n", i,
+		     ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+	return m;
+
+bad:
+	dout("monmap_decode failed with %d\n", err);
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+/*
+ * Check whether *addr appears in the monmap.  Addresses are compared
+ * bytewise over the whole struct.  Returns 1 if found, 0 otherwise.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+	int i;
+
+	for (i = 0; i < m->num_mon; i++) {
+		struct ceph_entity_addr *cand = &m->mon_inst[i].addr;
+
+		if (!memcmp(addr, cand, sizeof(*addr)))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Send the auth request that has already been built in the front of
+ * monc->m_auth.  @len is the number of front bytes in use.
+ *
+ * Any earlier submission of the same message is revoked first, and an
+ * extra reference is taken before handing the message to the connection
+ * so that monc keeps its own.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+	struct ceph_msg *auth = monc->m_auth;
+
+	monc->pending_auth = 1;
+	auth->front.iov_len = len;
+	auth->hdr.front_len = cpu_to_le32(len);
+	ceph_con_revoke(monc->con, auth);
+	ceph_msg_get(auth);  /* keep our ref */
+	ceph_con_send(monc->con, auth);
+}
+
+/*
+ * Close the monitor session, if there is a connection: revoke the auth
+ * message, close the connection, forget the current monitor and reset
+ * the authentication state.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+	if (!monc->con)
+		return;
+
+	dout("__close_session closing mon%d\n", monc->cur_mon);
+	ceph_con_revoke(monc->con, monc->m_auth);
+	ceph_con_close(monc->con);
+	monc->cur_mon = -1;
+	monc->pending_auth = 0;
+	ceph_auth_reset(monc->auth);
+}
+
+/*
+ * Open a session with a (new) monitor.
+ *
+ * If no monitor is currently selected (cur_mon < 0), one is picked at
+ * random from the monmap, the subscription state is reset, the
+ * connection is opened and the authentication handshake is started.
+ * If a monitor is already selected this is a no-op.
+ *
+ * Returns 0 on success, or the negative error from building the auth
+ * hello message; in that case nothing is sent.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+	char r;
+	int ret;
+
+	if (monc->cur_mon < 0) {
+		get_random_bytes(&r, 1);
+		monc->cur_mon = r % monc->monmap->num_mon;
+		dout("open_session num=%d r=%d -> mon%d\n",
+		     monc->monmap->num_mon, r, monc->cur_mon);
+		monc->sub_sent = 0;
+		monc->sub_renew_after = jiffies;  /* i.e., expired */
+		/* collapse "requested" (2) back to "wanted" (1) */
+		monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+		dout("open_session mon%d opening\n", monc->cur_mon);
+		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_open(monc->con,
+			      &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+		/* initiate authentication handshake */
+		ret = ceph_auth_build_hello(monc->auth,
+					    monc->m_auth->front.iov_base,
+					    monc->m_auth->front_max);
+		if (ret < 0) {
+			/* never use a negative errno as a message length */
+			pr_err("error %d building auth hello\n", ret);
+			return ret;
+		}
+		__send_prepared_auth_request(monc, ret);
+	} else {
+		dout("open_session mon%d already open\n", monc->cur_mon);
+	}
+	return 0;
+}
+
+/*
+ * Return true once jiffies has reached monc->sub_renew_after, i.e. the
+ * subscription is due for renewal.
+ */
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+	return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * (Re)arm the delayed work timer.  The shorter 10 second interval is
+ * used while no monitor is selected or the subscription has expired;
+ * otherwise the work runs after 20 seconds.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+	bool hurry = monc->cur_mon < 0 || __sub_expired(monc);
+	unsigned delay = hurry ? 10 * HZ : 20 * HZ;
+
+	dout("__schedule_delayed after %u\n", delay);
+	schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ *
+ * A request is only built and sent when the subscription has expired
+ * with no request outstanding, or when a new osdmap is wanted but has
+ * not been requested yet (want_next_osdmap == 1).  The request always
+ * subscribes to "monmap"; "osdmap" and "mdsmap" entries are added when
+ * wanted.  The message is encoded into the preallocated
+ * monc->m_subscribe.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+	     (unsigned)monc->sub_sent, __sub_expired(monc),
+	     monc->want_next_osdmap);
+	if ((__sub_expired(monc) && !monc->sub_sent) ||
+	    monc->want_next_osdmap == 1) {
+		struct ceph_msg *msg = monc->m_subscribe;
+		struct ceph_mon_subscribe_item *i;
+		void *p, *end;
+		int num;
+
+		p = msg->front.iov_base;
+		end = p + msg->front_max;
+
+		/* item count: monmap is always present, the others optional */
+		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+		ceph_encode_32(&p, num);
+
+		if (monc->want_next_osdmap) {
+			dout("__send_subscribe to 'osdmap' %u\n",
+			     (unsigned)monc->have_osdmap);
+			ceph_encode_string(&p, end, "osdmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_osdmap);
+			i->onetime = 1;
+			p += sizeof(*i);
+			monc->want_next_osdmap = 2;  /* requested */
+		}
+		if (monc->want_mdsmap) {
+			dout("__send_subscribe to 'mdsmap' %u+\n",
+			     (unsigned)monc->have_mdsmap);
+			ceph_encode_string(&p, end, "mdsmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_mdsmap);
+			i->onetime = 0;
+			p += sizeof(*i);
+		}
+		ceph_encode_string(&p, end, "monmap", 6);
+		i = p;
+		i->have = 0;
+		i->onetime = 0;
+		p += sizeof(*i);
+
+		/* front length is however much was actually encoded */
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		/* pull back any earlier submission before re-sending */
+		ceph_con_revoke(monc->con, msg);
+		ceph_con_send(monc->con, ceph_msg_get(msg));
+
+		monc->sub_sent = jiffies | 1;  /* never 0 */
+	}
+}
+
+/*
+ * Handle a subscribe ack from the monitor.
+ *
+ * Ends the hunting state if it was set, and schedules the next renewal
+ * for half the acked duration after the time the request was sent.  A
+ * message with a too-short front is reported and dumped.
+ */
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	unsigned seconds;
+	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	seconds = le32_to_cpu(h->duration);
+
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		pr_info("mon%d %s session established\n",
+			monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+		monc->hunting = false;
+	}
+	dout("handle_subscribe_ack after %d seconds\n", seconds);
+	/* renew at half the granted duration, counted from the send time */
+	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+	monc->sub_sent = 0;
+	mutex_unlock(&monc->mutex);
+	return;
+bad:
+	pr_err("got corrupt subscribe-ack msg\n");
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ *
+ * Record @got as the mdsmap version we hold, under monc->mutex.
+ * Always returns 0.
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_mdsmap = got;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+
+/*
+ * Record @got as the osdmap version we hold and clear the "want next
+ * osdmap" state, under monc->mutex.  Always returns 0.
+ */
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_osdmap = got;
+	monc->want_next_osdmap = 0;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ *
+ * want_next_osdmap is moved from 0 (not wanted) to 1 (wanted); a
+ * subscribe request is sent unless one has already been issued (2).
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+	/* NOTE(review): have_osdmap is read for the debug print before the mutex is taken */
+	dout("request_next_osdmap have %u\n", monc->have_osdmap);
+	mutex_lock(&monc->mutex);
+	if (!monc->want_next_osdmap)
+		monc->want_next_osdmap = 1;
+	if (monc->want_next_osdmap < 2)
+		__send_subscribe(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Open the monitor session.
+ *
+ * Allocates and initializes the monitor connection on first use, then
+ * opens a session and arms the delayed work under monc->mutex.
+ *
+ * Returns 0, or -ENOMEM if the connection cannot be allocated.
+ * NOTE(review): the return value of __open_session() is not propagated.
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+	if (!monc->con) {
+		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+		if (!monc->con)
+			return -ENOMEM;
+		ceph_con_init(monc->client->msgr, monc->con);
+		monc->con->private = monc;
+		monc->con->ops = &mon_con_ops;
+	}
+
+	mutex_lock(&monc->mutex);
+	__open_session(monc);
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+/*
+ * Handle a monmap message from the monitor.
+ *
+ * The new map is decoded from the message front and its fsid is checked
+ * against the client's.  On success it replaces the current monmap and
+ * the old one is freed; on any failure the current map is left in place.
+ * Waiters on client->auth_wq are woken in every case.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old;
+	void *p, *end;
+
+	mutex_lock(&monc->mutex);
+
+	/* sample the current map under the mutex, not before taking it */
+	old = monc->monmap;
+
+	dout("handle_monmap\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	monmap = ceph_monmap_decode(p, end);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		goto out;
+	}
+
+	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+		kfree(monmap);
+		goto out;
+	}
+
+	client->monc.monmap = monmap;
+	kfree(old);
+
+out:
+	mutex_unlock(&monc->mutex);
+	wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (e.g., statfs, poolop)
+ *
+ * Find the pending generic request with the given tid in the rbtree.
+ * Returns the request, or NULL if there is none.
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+	struct ceph_mon_client *monc, u64 tid)
+{
+	struct rb_node *n = monc->generic_request_tree.rb_node;
+
+	while (n) {
+		struct ceph_mon_generic_request *req =
+			rb_entry(n, struct ceph_mon_generic_request, node);
+
+		if (tid == req->tid)
+			return req;
+		/* smaller tids live to the left, larger to the right */
+		n = (tid < req->tid) ? n->rb_left : n->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Link a new generic request into the rbtree, keyed by tid.  Inserting a
+ * tid that is already present is a bug.
+ */
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
+{
+	struct rb_node **link = &monc->generic_request_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct ceph_mon_generic_request *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ceph_mon_generic_request, node);
+		/* tids must be unique within the tree */
+		BUG_ON(new->tid == cur->tid);
+		link = (new->tid < cur->tid) ? &parent->rb_left
+					     : &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+/*
+ * kref release callback for a generic request: drop the references on
+ * the reply and request messages (when set) and free the request.
+ */
+static void release_generic_request(struct kref *kref)
+{
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+
+	kfree(req);
+}
+
+/* Drop a reference on a generic request; the last put releases it. */
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_put(&req->kref, release_generic_request);
+}
+
+/* Take an additional reference on a generic request. */
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_get(&req->kref);
+}
+
+/*
+ * Find the preallocated reply message for an incoming generic reply,
+ * matched by the tid in @hdr.
+ *
+ * Returns a new reference on the request's reply message, or NULL with
+ * *skip set to 1 when no request with that tid is pending.
+ */
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_mon_generic_request *req;
+	struct ceph_msg *m = NULL;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		/*
+		 * There is no need to track which connection is reading
+		 * into this reply: only one connection is ever open at a
+		 * time.
+		 */
+		m = ceph_msg_get(req->reply);
+	} else {
+		dout("get_generic_reply %lld dne\n", tid);
+		*skip = 1;
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
+}
+
+/*
+ * Submit a generic request and wait (interruptibly) for its completion.
+ *
+ * The request is given the next tid, registered in the rbtree and sent;
+ * after the wait it is unregistered again whether or not it completed.
+ *
+ * Returns the wait error if the wait was interrupted, otherwise
+ * req->result.
+ */
+static int do_generic_request(struct ceph_mon_client *monc,
+			      struct ceph_mon_generic_request *req)
+{
+	int err;
+
+	/* register request */
+	mutex_lock(&monc->mutex);
+	req->tid = ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
+	/* the extra ref taken here goes with the submission */
+	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	mutex_unlock(&monc->mutex);
+
+	err = wait_for_completion_interruptible(&req->completion);
+
+	/* unregister, even when the wait was interrupted */
+	mutex_lock(&monc->mutex);
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
+	mutex_unlock(&monc->mutex);
+
+	if (!err)
+		err = req->result;
+	return err;
+}
+
+/*
+ * statfs
+ *
+ * Handle a statfs reply: look up the pending request by tid, copy the
+ * returned statistics into the caller's buffer and complete the request.
+ * A reply whose front is not exactly a statfs reply is reported and
+ * dumped.
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len != sizeof(*reply))
+		goto bad;
+	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		*(struct ceph_statfs *)req->buf = reply->st;
+		req->result = 0;
+		/* hold a ref so req stays valid after the mutex is dropped */
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete_all(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ *
+ * @monc: monitor client to send the request through
+ * @buf: filled in with the statistics from the reply
+ *
+ * Returns 0 on success, -ENOMEM if the request or its messages cannot
+ * be allocated, or the error from do_generic_request().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs *h;
+	int err;
+
+	/* zeroed, so request/reply are NULL if we bail out early */
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = sizeof(*buf);
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+
+	err = do_generic_request(monc, req);
+
+out:
+	/* drop our ref; the release puts whichever messages were allocated */
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+/*
+ * pool ops
+ *
+ * Copy the payload of a pool-op reply into the caller's buffer.  The
+ * source must be a 32-bit little-endian length followed by exactly
+ * dst_len bytes, and the encoded length must equal dst_len.
+ *
+ * Returns 0 on success or -EINVAL if the sizes do not line up.
+ */
+static int get_poolop_reply_buf(const char *src, size_t src_len,
+				char *dst, size_t dst_len)
+{
+	const char *payload = src + sizeof(u32);
+	u32 encoded_len;
+
+	/* total size must be the length prefix plus the expected payload */
+	if (src_len != sizeof(u32) + dst_len)
+		return -EINVAL;
+
+	encoded_len = le32_to_cpu(*(u32 *)src);
+	if (encoded_len != dst_len)
+		return -EINVAL;
+
+	memcpy(dst, payload, dst_len);
+	return 0;
+}
+
+/*
+ * Handle a pool-op reply: look up the pending request by tid, copy the
+ * reply payload into the caller's buffer when one was supplied, record
+ * the reply code as the result and complete the request.  A malformed
+ * reply is reported and dumped, and the request is not completed.
+ */
+static void handle_poolop_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len < sizeof(*reply))
+		goto bad;
+	dout("handle_poolop_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		/* the payload, if expected, follows the fixed reply struct */
+		if (req->buf_len &&
+		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
+				     msg->front.iov_len - sizeof(*reply),
+				     req->buf, req->buf_len) < 0) {
+			mutex_unlock(&monc->mutex);
+			goto bad;
+		}
+		req->result = le32_to_cpu(reply->reply_code);
+		/* hold a ref so req stays valid after the mutex is dropped */
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous pool op.
+ *
+ * @monc: monitor client to send the request through
+ * @op: pool operation code
+ * @pool: pool id the operation applies to
+ * @snapid: snapshot id argument for the operation
+ * @buf: optional buffer that receives the reply payload
+ * @len: size of @buf; 0 means no payload is expected
+ *
+ * Returns -ENOMEM if the request or its messages cannot be allocated,
+ * otherwise the result of do_generic_request().
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+			u32 pool, u64 snapid,
+			char *buf, int len)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop *h;
+	int err;
+
+	/* zeroed, so request/reply are NULL if we bail out early */
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = len;
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	req->request->hdr.version = cpu_to_le16(2);
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+	h->pool = cpu_to_le32(pool);
+	h->op = cpu_to_le32(op);
+	h->auid = 0;
+	h->snapid = cpu_to_le64(snapid);
+	h->name_len = 0;
+
+	err = do_generic_request(monc, req);
+
+out:
+	/* drop our ref; the release puts whichever messages were allocated */
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+
+/*
+ * Synchronously issue a "create unmanaged snap" pool op for @pool.  The
+ * reply payload is copied into *snapid.  Returns the result of
+ * ceph_monc_do_poolop().
+ */
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 *snapid)
+{
+	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+				   pool, 0, (char *)snapid, sizeof(*snapid));
+
+}
+EXPORT_SYMBOL(ceph_monc_create_snapid);
+
+/*
+ * Synchronously issue a "delete unmanaged snap" pool op for snapshot
+ * @snapid of @pool.  No reply payload is expected.  Returns the result
+ * of ceph_monc_do_poolop().
+ */
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 snapid)
+{
+	/* must be the DELETE op; sending CREATE here would make a new snap */
+	return ceph_monc_do_poolop(monc,  POOL_OP_DELETE_UNMANAGED_SNAP,
+				   pool, snapid, NULL, 0);
+
+}
+
+/*
+ * Resend every pending generic request: walk the rbtree in order, pull
+ * back any earlier submission of each request message and send it again
+ * with a fresh reference.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+	struct rb_node *p = rb_first(&monc->generic_request_tree);
+
+	while (p) {
+		struct ceph_mon_generic_request *req =
+			rb_entry(p, struct ceph_mon_generic_request, node);
+
+		ceph_con_revoke(monc->con, req->request);
+		ceph_con_send(monc->con, ceph_msg_get(req->request));
+		p = rb_next(p);
+	}
+}
+
+/*
+ * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM).  And keep the monitor connection alive.
+ *
+ * The work always re-arms itself before returning.
+ */
+static void delayed_work(struct work_struct *work)
+{
+	struct ceph_mon_client *monc =
+		container_of(work, struct ceph_mon_client, delayed_work.work);
+
+	dout("monc delayed_work\n");
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		/* no session yet: drop this monitor and try another */
+		__close_session(monc);
+		__open_session(monc);  /* continue hunting */
+	} else {
+		ceph_con_keepalive(monc->con);
+
+		__validate_auth(monc);
+
+		/* only (re)subscribe once we are authenticated */
+		if (monc->auth->ops->is_authenticated(monc->auth))
+			__send_subscribe(monc);
+	}
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Build the temporary monmap used at startup from the monitor addresses
+ * given in the client options (i.e. the IPs provided by mount(2)).
+ * Monitors are numbered in option order and their nonces are cleared.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+	struct ceph_options *opt = monc->client->options;
+	struct ceph_entity_addr *mon_addr = opt->mon_addr;
+	int num_mon = opt->num_mon;
+	struct ceph_monmap *map;
+	int i;
+
+	/* build initial monmap */
+	map = kzalloc(sizeof(*map) + num_mon*sizeof(map->mon_inst[0]),
+		      GFP_KERNEL);
+	monc->monmap = map;
+	if (!map)
+		return -ENOMEM;
+
+	for (i = 0; i < num_mon; i++) {
+		struct ceph_entity_inst *inst = &map->mon_inst[i];
+
+		inst->addr = mon_addr[i];
+		inst->addr.nonce = 0;
+		inst->name.type = CEPH_ENTITY_TYPE_MON;
+		inst->name.num = cpu_to_le64(i);
+	}
+	map->num_mon = num_mon;
+	monc->have_fsid = false;
+	return 0;
+}
+
+/*
+ * Initialize a monitor client.
+ *
+ * Builds the initial monmap from the client options, sets up the
+ * authentication client and preallocates the subscribe, subscribe-ack,
+ * auth and auth-reply messages.  No connection is created here.
+ *
+ * Returns 0 on success or a negative error.  On failure everything that
+ * was set up so far (messages, auth client, monmap) is released again.
+ */
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+	int err = 0;
+
+	dout("init\n");
+	memset(monc, 0, sizeof(*monc));
+	monc->client = cl;
+	monc->monmap = NULL;
+	mutex_init(&monc->mutex);
+
+	err = build_initial_monmap(monc);
+	if (err)
+		goto out;
+
+	monc->con = NULL;
+
+	/* authentication */
+	monc->auth = ceph_auth_init(cl->options->name,
+				    cl->options->key);
+	if (IS_ERR(monc->auth)) {
+		/* don't leak the monmap built above */
+		err = PTR_ERR(monc->auth);
+		goto out_monmap;
+	}
+	monc->auth->want_keys =
+		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+	/* msgs */
+	err = -ENOMEM;
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS);
+	if (!monc->m_subscribe_ack)
+		goto out_auth;
+
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+	if (!monc->m_auth_reply)
+		goto out_subscribe;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
+	monc->pending_auth = 0;
+	if (!monc->m_auth)
+		goto out_auth_reply;
+
+	/* start out with no monitor selected and hunting for one */
+	monc->cur_mon = -1;
+	monc->hunting = true;
+	monc->sub_renew_after = jiffies;
+	monc->sub_sent = 0;
+
+	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
+	monc->last_tid = 0;
+
+	monc->have_mdsmap = 0;
+	monc->have_osdmap = 0;
+	monc->want_next_osdmap = 1;
+	return 0;
+
+	/* unwind in reverse order of setup */
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
+out_auth:
+	ceph_auth_destroy(monc->auth);
+out_monmap:
+	kfree(monc->monmap);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+/*
+ * Shut down the monitor client: stop the delayed work, close the
+ * session, release the connection, and free the auth client, the
+ * preallocated messages and the monmap.
+ */
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+	dout("stop\n");
+	/* make sure the delayed work is neither pending nor running */
+	cancel_delayed_work_sync(&monc->delayed_work);
+
+	mutex_lock(&monc->mutex);
+	__close_session(monc);
+	if (monc->con) {
+		/* connection callbacks check ->private and bail out on NULL */
+		monc->con->private = NULL;
+		monc->con->ops->put(monc->con);
+		monc->con = NULL;
+	}
+	mutex_unlock(&monc->mutex);
+
+	ceph_auth_destroy(monc->auth);
+
+	ceph_msg_put(monc->m_auth);
+	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
+	ceph_msg_put(monc->m_subscribe_ack);
+
+	kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+/*
+ * Handle an auth reply from the monitor.
+ *
+ * The reply is passed to the auth layer, which may build a follow-up
+ * request into monc->m_auth.  A negative result is recorded as the
+ * client's auth error and waiters are woken; a positive result is the
+ * length of a follow-up request, which is sent; and on the transition to
+ * authenticated the messenger's entity name is set from the global id
+ * and subscriptions and pending generic requests are (re)sent.
+ */
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	int ret;
+	int was_auth = 0;
+
+	mutex_lock(&monc->mutex);
+	/* remember whether we were already authenticated before this reply */
+	if (monc->auth->ops)
+		was_auth = monc->auth->ops->is_authenticated(monc->auth);
+	monc->pending_auth = 0;
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_max);
+	if (ret < 0) {
+		monc->client->auth_err = ret;
+		wake_up_all(&monc->client->auth_wq);
+	} else if (ret > 0) {
+		__send_prepared_auth_request(monc, ret);
+	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+		dout("authenticated, starting session\n");
+
+		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr->inst.name.num =
+					cpu_to_le64(monc->auth->global_id);
+
+		__send_subscribe(monc);
+		__resend_generic_request(monc);
+	}
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Build and send an auth request if one is needed and none is already
+ * pending.
+ *
+ * Returns 0 if a request was sent, one is already pending or none is
+ * needed, or a negative error from ceph_build_auth().
+ */
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	if (monc->pending_auth)
+		return 0;
+
+	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+			      monc->m_auth->front_max);
+	if (ret <= 0)
+		return ret; /* either an error, or no need to authenticate */
+	/* a positive ret is the length of the request that was built */
+	__send_prepared_auth_request(monc, ret);
+	return 0;
+}
+
+/*
+ * Locked wrapper around __validate_auth(): takes monc->mutex around the
+ * call and returns its result.
+ */
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = __validate_auth(monc);
+	mutex_unlock(&monc->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * handle incoming message
+ *
+ * Connection dispatch callback: route the message to its handler by
+ * type.  Types not handled here are offered to the client's
+ * extra_mon_dispatch hook, if any.  The message reference is dropped
+ * once the handler returns.
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	/*
+	 * private is cleared when the monitor client is stopped.
+	 * NOTE(review): this early return does not put msg -- confirm
+	 * whether that is intended.
+	 */
+	if (!monc)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_AUTH_REPLY:
+		handle_auth_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		handle_subscribe_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_STATFS_REPLY:
+		handle_statfs_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_POOLOP_REPLY:
+		handle_poolop_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_MAP:
+		ceph_monc_handle_map(monc, msg);
+		break;
+
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(&monc->client->osdc, msg);
+		break;
+
+	default:
+		/* can the chained handler handle it? */
+		if (monc->client->extra_mon_dispatch &&
+		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+			break;
+			
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ *
+ * Connection alloc_msg callback.  Subscribe acks and auth replies are
+ * read into the monitor client's preallocated messages, generic replies
+ * into the reply message of the matching pending request, and map
+ * messages into a freshly allocated message sized from the header.
+ *
+ * Returns the message to read into, or NULL with *skip set to 1 when
+ * the message should be skipped.
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+				      struct ceph_msg_header *hdr,
+				      int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m = NULL;
+
+	*skip = 0;
+
+	switch (type) {
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		m = ceph_msg_get(monc->m_subscribe_ack);
+		break;
+	case CEPH_MSG_POOLOP_REPLY:
+	case CEPH_MSG_STATFS_REPLY:
+		return get_generic_reply(con, hdr, skip);
+	case CEPH_MSG_AUTH_REPLY:
+		m = ceph_msg_get(monc->m_auth_reply);
+		break;
+	case CEPH_MSG_MON_MAP:
+	case CEPH_MSG_MDS_MAP:
+	case CEPH_MSG_OSD_MAP:
+		m = ceph_msg_new(type, front_len, GFP_NOFS);
+		break;
+	}
+
+	/*
+	 * NOTE(review): this is also reached when ceph_msg_new() fails
+	 * for a known map type, so the "unknown type" message can be
+	 * misleading.
+	 */
+	if (!m) {
+		pr_info("alloc_msg unknown type %d\n", type);
+		*skip = 1;
+	}
+	return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ *
+ * Connection fault callback.  The current session is closed; if we were
+ * not already hunting, hunting starts and a new session is opened right
+ * away, otherwise the delayed work is re-armed so the retry happens
+ * after a pause.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+	struct ceph_mon_client *monc = con->private;
+
+	if (!monc)
+		return;
+
+	dout("mon_fault\n");
+	mutex_lock(&monc->mutex);
+	/* re-check under the mutex: private is cleared on stop */
+	if (!con->private)
+		goto out;
+
+	if (monc->con && !monc->hunting)
+		pr_info("mon%d %s session lost, "
+			"hunting for new mon\n", monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+
+	__close_session(monc);
+	if (!monc->hunting) {
+		/* start hunting */
+		monc->hunting = true;
+		__open_session(monc);
+	} else {
+		/* already hunting, let's wait a bit */
+		__schedule_delayed(monc);
+	}
+out:
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Connection callbacks for the monitor connection; installed on
+ * monc->con by ceph_monc_open_session().
+ */
+static const struct ceph_connection_operations mon_con_ops = {
+	.get = ceph_con_get,
+	.put = ceph_con_put,
+	.dispatch = dispatch,
+	.fault = mon_fault,
+	.alloc_msg = mon_alloc_msg,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 00000000..d5f2d97a
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/msgpool.h>
+
+/*
+ * mempool element constructor: allocate one message whose front
+ * buffer is pool->front_len bytes.  @arg is the ceph_msgpool.
+ */
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+	struct ceph_msgpool *pool = arg;
+	void *msg = ceph_msg_new(0, pool->front_len, gfp_mask);
+
+	if (msg)
+		return msg;
+
+	pr_err("msgpool %s alloc failed\n", pool->name);
+	return NULL;
+}
+
+/* mempool element destructor: drop the pool's reference on @element */
+static void free_fn(void *element, void *arg)
+{
+	struct ceph_msg *msg = element;
+
+	ceph_msg_put(msg);
+}
+
+/*
+ * Set up a pool of @size preallocated messages, each with a front
+ * buffer of @front_len bytes.  @name is only used in log messages.
+ * @blocking is not used by this function.
+ *
+ * Returns 0 on success or -ENOMEM if the mempool could not be created.
+ */
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int size, bool blocking, const char *name)
+{
+	/*
+	 * alloc_fn() reads both front_len and name, and it is passed to
+	 * mempool_create() below, so both must be valid before that call.
+	 */
+	pool->front_len = front_len;
+	pool->name = name;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	return 0;
+}
+
+/* tear down the mempool backing @pool */
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	mempool_t *backing = pool->pool;
+
+	mempool_destroy(backing);
+}
+
+/*
+ * Get a message with room for @front_len bytes of front data.
+ *
+ * Requests that fit come from the mempool.  A request larger than the
+ * pool's front_len is logged, triggers a WARN, and is served by a
+ * fresh allocation instead.
+ */
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
+{
+	if (front_len <= pool->front_len)
+		return mempool_alloc(pool->pool, GFP_NOFS);
+
+	pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+	       pool->name, front_len, pool->front_len);
+	WARN_ON(1);
+
+	/* try to alloc a fresh message */
+	return ceph_msg_new(0, front_len, GFP_NOFS);
+}
+
+/*
+ * Hand a message back to @pool.
+ *
+ * The front length is restored to the pool's size and the refcount is
+ * re-armed so the message is ready for the next ceph_msgpool_get().
+ * The element must then be returned with mempool_free(); without it
+ * the message is leaked and the pool's reserve is never refilled.
+ */
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+	kref_init(&msg->kref);  /* retake single ref */
+	mempool_free(msg, pool->pool);
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 00000000..7330c275
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,2162 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+#define OSD_OP_FRONT_LEN	4096
+#define OSD_OPREPLY_FRONT_LEN	512
+
+static const struct ceph_connection_operations osd_con_ops;
+
+static void send_queued(struct ceph_osd_client *osdc);
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+static void __register_request(struct ceph_osd_client *osdc,
+			       struct ceph_osd_request *req);
+static void __unregister_linger_request(struct ceph_osd_client *osdc,
+					struct ceph_osd_request *req);
+static int __send_request(struct ceph_osd_client *osdc,
+			  struct ceph_osd_request *req);
+
+/* return 1 for ops that need a trailing pagelist (see r_trail), else 0 */
+static int op_needs_trail(int op)
+{
+	return op == CEPH_OSD_OP_GETXATTR ||
+	       op == CEPH_OSD_OP_SETXATTR ||
+	       op == CEPH_OSD_OP_CMPXATTR ||
+	       op == CEPH_OSD_OP_CALL ||
+	       op == CEPH_OSD_OP_NOTIFY;
+}
+
+/* return 1 for ops that carry an object extent (read and write), else 0 */
+static int op_has_extent(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Map the file extent @off~*plen onto an object and record the result
+ * in @req and @op.
+ *
+ * The snapid is stored in the request head.  *plen and *bno are
+ * filled in by ceph_calc_file_object_mapping(); a shortened *plen is
+ * reported via dout.  The page count and page alignment of the
+ * request are derived from @off and the resulting *plen.
+ */
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
+	u64 wanted = *plen;
+	u64 obj_off, obj_len;	/* extent in object */
+
+	head->snapid = cpu_to_le64(snapid);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, plen, bno,
+				      &obj_off, &obj_len);
+	if (wanted > *plen)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     wanted - *plen, off, *plen);
+
+	if (op_has_extent(op->op)) {
+		op->extent.offset = obj_off;
+		op->extent.length = obj_len;
+	}
+
+	req->r_num_pages = calc_pages_for(off, *plen);
+	req->r_page_alignment = off & ~PAGE_MASK;
+
+	/* only writes carry the data as payload */
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, obj_off, obj_len, req->r_num_pages);
+
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * Work out which object the file extent @off~*plen lands in and fill
+ * the request accordingly.  All of the extent work is done by
+ * ceph_calc_raw_layout(), which also updates *plen.
+ *
+ * The object name "<ino>.<block number>" (both hex) is written into
+ * req->r_oid, and its length into req->r_oid_len.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+			struct ceph_vino vino,
+			struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	u64 objno;
+
+	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+			     plen, &objno, req, op);
+
+	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
+		 vino.ino, objno);
+	req->r_oid_len = strlen(req->r_oid);
+}
+
+/*
+ * requests
+ */
+/*
+ * kref release callback for an osd request: drop everything the
+ * request holds and free the request itself.
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+	struct ceph_osd_request *req = container_of(kref,
+						    struct ceph_osd_request,
+						    r_kref);
+
+	/* drop our references on the request and reply messages */
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
+	/*
+	 * a connection is still reading into the reply; take the message
+	 * back from it.
+	 * NOTE(review): r_reply was already put above -- presumably the
+	 * connection holds its own reference while filling; confirm.
+	 */
+	if (req->r_con_filling_msg) {
+		dout("release_request revoking pages %p from con %p\n",
+		     req->r_pages, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg,
+				      req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+	}
+	/* the page vector is released only when the request owns it */
+	if (req->r_own_pages)
+		ceph_release_page_vector(req->r_pages,
+					 req->r_num_pages);
+#ifdef CONFIG_BLOCK
+	if (req->r_bio)
+		bio_put(req->r_bio);
+#endif
+	ceph_put_snap_context(req->r_snapc);
+	if (req->r_trail) {
+		ceph_pagelist_release(req->r_trail);
+		kfree(req->r_trail);
+	}
+	/* free the request the same way it was allocated */
+	if (req->r_mempool)
+		mempool_free(req, req->r_osdc->req_mempool);
+	else
+		kfree(req);
+}
+EXPORT_SYMBOL(ceph_osdc_release_request);
+
+/*
+ * Count the ops in @ops; the array ends at the first entry whose op
+ * is zero.  If @needs_trail is non-NULL it is set to 1 when any op
+ * requires trailing data, and to 0 otherwise.
+ */
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+	int count;
+
+	if (needs_trail)
+		*needs_trail = 0;
+
+	for (count = 0; ops[count].op; count++) {
+		if (needs_trail && op_needs_trail(ops[count].op))
+			*needs_trail = 1;
+	}
+
+	return count;
+}
+
+/*
+ * Allocate an osd request together with its request and reply
+ * messages.
+ *
+ * @ops is a zero-terminated op array; it determines the size of the
+ * request message and whether a trailing pagelist is allocated.
+ * With @use_mempool the request and both messages come from the osd
+ * client's pools, otherwise they are allocated with @gfp_flags.
+ * @pages is stored in the request; @bio (if any, CONFIG_BLOCK only)
+ * is stored with an extra reference.
+ *
+ * Returns the new request, or NULL if any allocation failed.
+ */
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
+					       struct ceph_snap_context *snapc,
+					       struct ceph_osd_req_op *ops,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages,
+					       struct bio *bio)
+{
+	struct ceph_osd_request *req;
+	struct ceph_msg *msg;
+	int needs_trail;
+	int num_op = get_num_ops(ops, &needs_trail);
+	size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+	msg_size += num_op*sizeof(struct ceph_osd_op);
+
+	if (use_mempool) {
+		req = mempool_alloc(osdc->req_mempool, gfp_flags);
+		/* only clear the request once we know we got one */
+		if (req)
+			memset(req, 0, sizeof(*req));
+	} else {
+		req = kzalloc(sizeof(*req), gfp_flags);
+	}
+	if (req == NULL)
+		return NULL;
+
+	req->r_osdc = osdc;
+	req->r_mempool = use_mempool;
+
+	kref_init(&req->r_kref);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+	INIT_LIST_HEAD(&req->r_linger_item);
+	INIT_LIST_HEAD(&req->r_linger_osd);
+	req->r_flags = flags;
+
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+	/* create reply message */
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+				   OSD_OPREPLY_FRONT_LEN, gfp_flags);
+	if (!msg) {
+		/* dropping the only ref frees whatever is attached so far */
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+	req->r_reply = msg;
+
+	/* allocate space for the trailing data */
+	if (needs_trail) {
+		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+		if (!req->r_trail) {
+			ceph_osdc_put_request(req);
+			return NULL;
+		}
+		ceph_pagelist_init(req->r_trail);
+	}
+	/* create request message; allow space for oid */
+	msg_size += 40;
+	if (snapc)
+		msg_size += sizeof(u64) * snapc->num_snaps;
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+
+	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+	memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+	req->r_request = msg;
+	req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+	if (bio) {
+		req->r_bio = bio;
+		bio_get(req->r_bio);
+	}
+#endif
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+/*
+ * Encode one in-memory op (@src, host byte order) into its wire form
+ * (@dst, little-endian).  Ops that carry extra data append it to the
+ * request's trailing pagelist, which must already exist.
+ */
+static void osd_req_encode_op(struct ceph_osd_request *req,
+			      struct ceph_osd_op *dst,
+			      struct ceph_osd_req_op *src)
+{
+	dst->op = cpu_to_le16(src->op);
+
+	/*
+	 * Dispatch on the host-order opcode: dst->op has already been
+	 * converted to little-endian and would not match the case labels
+	 * on a big-endian machine.
+	 */
+	switch (src->op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		dst->extent.offset =
+			cpu_to_le64(src->extent.offset);
+		dst->extent.length =
+			cpu_to_le64(src->extent.length);
+		dst->extent.truncate_size =
+			cpu_to_le64(src->extent.truncate_size);
+		dst->extent.truncate_seq =
+			cpu_to_le32(src->extent.truncate_seq);
+		break;
+
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+		BUG_ON(!req->r_trail);
+
+		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+		dst->xattr.cmp_op = src->xattr.cmp_op;
+		dst->xattr.cmp_mode = src->xattr.cmp_mode;
+		/* trail: name followed by value */
+		ceph_pagelist_append(req->r_trail, src->xattr.name,
+				     src->xattr.name_len);
+		ceph_pagelist_append(req->r_trail, src->xattr.val,
+				     src->xattr.value_len);
+		break;
+	case CEPH_OSD_OP_CALL:
+		BUG_ON(!req->r_trail);
+
+		dst->cls.class_len = src->cls.class_len;
+		dst->cls.method_len = src->cls.method_len;
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+		/* trail: class name, method name, input data */
+		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+				     src->cls.class_len);
+		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+				     src->cls.method_len);
+		ceph_pagelist_append(req->r_trail, src->cls.indata,
+				     src->cls.indata_len);
+		break;
+	case CEPH_OSD_OP_ROLLBACK:
+		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+		break;
+	case CEPH_OSD_OP_STARTSYNC:
+		break;
+	case CEPH_OSD_OP_NOTIFY:
+		{
+			__le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
+			__le32 timeout = cpu_to_le32(src->watch.timeout);
+
+			BUG_ON(!req->r_trail);
+
+			ceph_pagelist_append(req->r_trail,
+						&prot_ver, sizeof(prot_ver));
+			ceph_pagelist_append(req->r_trail,
+						&timeout, sizeof(timeout));
+		}
+		/* fall through - notify also encodes the watch fields */
+	case CEPH_OSD_OP_NOTIFY_ACK:
+	case CEPH_OSD_OP_WATCH:
+		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
+		dst->watch.ver = cpu_to_le64(src->watch.ver);
+		dst->watch.flag = src->watch.flag;
+		break;
+	default:
+		pr_err("unrecognized osd opcode %d\n", src->op);
+		WARN_ON(1);
+		break;
+	}
+	dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * build new request AND message
+ *
+ * Encodes the request head, the ops, the object name and the snapshot
+ * ids into the front of req->r_request, in this order in memory:
+ *
+ *	head | num_op ops | oid | snaps
+ *
+ * and finally trims the message front to the bytes actually used.
+ * @src_ops is a zero-terminated op array.  @mtime is only encoded for
+ * writes.
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+			     u64 off, u64 *plen,
+			     struct ceph_osd_req_op *src_ops,
+			     struct ceph_snap_context *snapc,
+			     struct timespec *mtime,
+			     const char *oid,
+			     int oid_len)
+{
+	struct ceph_msg *msg = req->r_request;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_req_op *src_op;
+	struct ceph_osd_op *op;
+	void *p;
+	int num_op = get_num_ops(src_ops, NULL);
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int flags = req->r_flags;
+	u64 data_len = 0;
+	int i;
+
+	/* op array directly follows the head; p points past the ops */
+	head = msg->front.iov_base;
+	op = (void *)(head + 1);
+	p = (void *)(op + num_op);
+
+	req->r_snapc = ceph_get_snap_context(snapc);
+
+	head->client_inc = cpu_to_le32(1); /* always, for now. */
+	head->flags = cpu_to_le32(flags);
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(&head->mtime, mtime);
+	head->num_ops = cpu_to_le16(num_op);
+
+
+	/* fill in oid */
+	head->object_len = cpu_to_le32(oid_len);
+	memcpy(p, oid, oid_len);
+	p += oid_len;
+
+	/* encode each op; this may also append to req->r_trail */
+	src_op = src_ops;
+	while (src_op->op) {
+		osd_req_encode_op(req, op, src_op);
+		src_op++;
+		op++;
+	}
+
+	/* trailing data counts towards the message's data length */
+	if (req->r_trail)
+		data_len += req->r_trail->length;
+
+	if (snapc) {
+		head->snap_seq = cpu_to_le64(snapc->seq);
+		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+		for (i = 0; i < snapc->num_snaps; i++) {
+			put_unaligned_le64(snapc->snaps[i], p);
+			p += sizeof(u64);
+		}
+	}
+
+	/* writes send *plen bytes of data plus any trail; reads only the trail */
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+	} else if (data_len) {
+		req->r_request->hdr.data_off = 0;
+		req->r_request->hdr.data_len = cpu_to_le32(data_len);
+	}
+
+	req->r_request->page_alignment = req->r_page_alignment;
+
+	/* we must not have written past the front buffer */
+	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	/* shrink the front to what was actually encoded */
+	msg_size = p - msg->front.iov_base;
+	msg->front.iov_len = msg_size;
+	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return;
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * Allocate and fully build a request for a single file extent.
+ *
+ * The extent @off~*plen is mapped onto an object by calc_layout(),
+ * which updates *plen.  @truncate_seq and @truncate_size are passed
+ * along in the op's extent.
+ *
+ * If @do_sync is set, a 'startsync' op is appended after the main op.
+ *
+ * @page_align overrides the page alignment that calc_layout derived
+ * from the file offset.  Returns NULL if allocation fails.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply,
+					       int page_align)
+{
+	struct ceph_osd_request *req;
+	struct ceph_osd_req_op ops[3];
+
+	/* op 0: the requested operation */
+	ops[0].op = opcode;
+	ops[0].payload_len = 0;
+	ops[0].extent.truncate_seq = truncate_seq;
+	ops[0].extent.truncate_size = truncate_size;
+
+	/* optional startsync, then the zero op that ends the array */
+	if (!do_sync) {
+		ops[1].op = 0;
+	} else {
+		ops[1].op = CEPH_OSD_OP_STARTSYNC;
+		ops[1].payload_len = 0;
+		ops[2].op = 0;
+	}
+
+	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, use_mempool,
+				      GFP_NOFS, NULL, NULL);
+	if (!req)
+		return NULL;
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req, ops);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	/*
+	 * the caller's alignment may differ from the natural (file)
+	 * alignment that calc_layout filled in for us
+	 */
+	req->r_num_pages = calc_pages_for(page_align, *plen);
+	req->r_page_alignment = page_align;
+
+	ceph_osdc_build_request(req, off, plen, ops, snapc, mtime,
+				req->r_oid, req->r_oid_len);
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+/*
+ * Link @new into osdc->requests, the rbtree of osd requests ordered
+ * by ->r_tid.  A tid that is already in the tree is a BUG.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *new)
+{
+	struct rb_node **link = &osdc->requests.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct ceph_osd_request *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ceph_osd_request, r_node);
+		if (new->r_tid == cur->r_tid)
+			BUG();
+		else if (new->r_tid < cur->r_tid)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&new->r_node, parent, link);
+	rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+/* find the request with exactly this @tid, or return NULL */
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+						 u64 tid)
+{
+	struct rb_node *node = osdc->requests.rb_node;
+
+	while (node) {
+		struct ceph_osd_request *cur =
+			rb_entry(node, struct ceph_osd_request, r_node);
+
+		if (tid == cur->r_tid)
+			return cur;
+		node = tid < cur->r_tid ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Find the request with the smallest r_tid that is >= @tid, or return
+ * NULL if every request in the tree has a smaller tid.
+ *
+ * The best candidate seen so far is remembered on every left turn.
+ * Without that, a descent that goes left and then runs off the right
+ * edge of that subtree would return NULL even though an ancestor
+ * satisfies r_tid >= tid.
+ */
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+		    u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct ceph_osd_request *best = NULL;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid) {
+			/* candidate; look for a smaller one on the left */
+			best = req;
+			n = n->rb_left;
+		} else if (tid > req->r_tid) {
+			n = n->rb_right;
+		} else {
+			return req;
+		}
+	}
+	return best;
+}
+
+/*
+ * Resubmit requests pending on the given osd.
+ *
+ * The osd is reset first; if __reset_osd() returns -EAGAIN nothing is
+ * requeued.  Otherwise every request on the osd is moved to the
+ * unsent list, and every lingering request is re-registered and
+ * queued as well.  Nothing is sent from here.
+ */
+static void __kick_osd_requests(struct ceph_osd_client *osdc,
+				struct ceph_osd *osd)
+{
+	struct ceph_osd_request *req, *nreq;
+	int err;
+
+	dout("__kick_osd_requests osd%d\n", osd->o_osd);
+	/*
+	 * NOTE(review): __reset_osd() calls __remove_osd() when both
+	 * request lists are empty; the loops below still read osd after
+	 * that -- presumably another reference keeps it alive; confirm.
+	 */
+	err = __reset_osd(osdc, osd);
+	if (err == -EAGAIN)
+		return;
+
+	/* queue regular requests for resend; non-linger ones get RETRY */
+	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
+		list_move(&req->r_req_lru_item, &osdc->req_unsent);
+		dout("requeued %p tid %llu osd%d\n", req, req->r_tid,
+		     osd->o_osd);
+		if (!req->r_linger)
+			req->r_flags |= CEPH_OSD_FLAG_RETRY;
+	}
+
+	/* _safe variant: each entry is unlinked from the list in the body */
+	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
+				 r_linger_osd) {
+		/*
+		 * reregister request prior to unregistering linger so
+		 * that r_osd is preserved.
+		 */
+		BUG_ON(!list_empty(&req->r_req_lru_item));
+		__register_request(osdc, req);
+		list_add(&req->r_req_lru_item, &osdc->req_unsent);
+		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+		__unregister_linger_request(osdc, req);
+		dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
+		     osd->o_osd);
+	}
+}
+
+/* __kick_osd_requests() with request_mutex taken around the call */
+static void kick_osd_requests(struct ceph_osd_client *osdc,
+			      struct ceph_osd *kickosd)
+{
+	struct mutex *lock = &osdc->request_mutex;
+
+	mutex_lock(lock);
+	__kick_osd_requests(osdc, kickosd);
+	mutex_unlock(lock);
+}
+
+/*
+ * Connection fault callback for an osd session: requeue everything
+ * that was pending on that osd and send the queue again.  Does
+ * nothing if the connection no longer points at an osd.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+
+	if (osd == NULL)
+		return;
+
+	dout("osd_reset osd%d\n", osd->o_osd);
+	osdc = osd->o_osdc;
+
+	/* map_sem is held for read across the requeue and the resend */
+	down_read(&osdc->map_sem);
+	kick_osd_requests(osdc, osd);
+	send_queued(osdc);
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ *
+ * Allocate and initialise an osd session with one reference and an
+ * initialised (not yet opened) connection.  Returns NULL if the
+ * allocation fails.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd *osd = kzalloc(sizeof(*osd), GFP_NOFS);
+
+	if (osd == NULL)
+		return NULL;
+
+	osd->o_osdc = osdc;
+	osd->o_incarnation = 1;
+	atomic_set(&osd->o_ref, 1);
+
+	INIT_LIST_HEAD(&osd->o_requests);
+	INIT_LIST_HEAD(&osd->o_linger_requests);
+	INIT_LIST_HEAD(&osd->o_osd_lru);
+	INIT_LIST_HEAD(&osd->o_keepalive_item);
+
+	/* connection fields are filled in after ceph_con_init() */
+	ceph_con_init(osdc->client->msgr, &osd->o_con);
+	osd->o_con.private = osd;
+	osd->o_con.ops = &osd_con_ops;
+	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+	return osd;
+}
+
+/*
+ * Take a reference on @osd unless its refcount has already dropped to
+ * zero, in which case NULL is returned.
+ */
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+	if (!atomic_inc_not_zero(&osd->o_ref)) {
+		dout("get_osd %p FAIL\n", osd);
+		return NULL;
+	}
+
+	dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+	     atomic_read(&osd->o_ref));
+	return osd;
+}
+
+/*
+ * Drop a reference on @osd.  The final put destroys the authorizer
+ * (if one is set) and frees the osd.
+ */
+static void put_osd(struct ceph_osd *osd)
+{
+	struct ceph_auth_client *ac;
+
+	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+	     atomic_read(&osd->o_ref) - 1);
+
+	if (!atomic_dec_and_test(&osd->o_ref))
+		return;
+
+	ac = osd->o_osdc->client->monc.auth;
+	if (osd->o_authorizer)
+		ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+	kfree(osd);
+}
+
+/*
+ * remove an osd from our map
+ *
+ * The osd must have no regular requests left.  It is unlinked from
+ * the osd rbtree and the lru list, its connection is closed, and one
+ * reference is dropped.
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("__remove_osd %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_requests));
+	rb_erase(&osd->o_node, &osdc->osds);
+	list_del_init(&osd->o_osd_lru);
+	ceph_con_close(&osd->o_con);
+	/* may free osd if this was the last reference */
+	put_osd(osd);
+}
+
+/*
+ * Put an osd at the tail of the lru list and stamp it with an expiry
+ * time osd_idle_ttl seconds from now.  The osd must not already be on
+ * the list.
+ */
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+			      struct ceph_osd *osd)
+{
+	unsigned long idle_ttl = osdc->client->options->osd_idle_ttl * HZ;
+
+	dout("__move_osd_to_lru %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_osd_lru));
+
+	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	osd->lru_ttl = jiffies + idle_ttl;
+}
+
+/* take an osd off the lru list; a no-op if it is not on the list */
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+	dout("__remove_osd_from_lru %p\n", osd);
+	if (list_empty(&osd->o_osd_lru))
+		return;
+	list_del_init(&osd->o_osd_lru);
+}
+
+/*
+ * Walk the osd lru list from the head and remove osds whose ttl has
+ * passed, stopping at the first one that has not expired.  With
+ * @remove_all set, every osd on the list is removed regardless of
+ * its ttl.
+ */
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+	struct ceph_osd *cur, *next;
+
+	dout("__remove_old_osds %p\n", osdc);
+	mutex_lock(&osdc->request_mutex);
+	list_for_each_entry_safe(cur, next, &osdc->osd_lru, o_osd_lru) {
+		if (remove_all || !time_before(jiffies, cur->lru_ttl)) {
+			__remove_osd(osdc, cur);
+			continue;
+		}
+		break;
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ *
+ * Three cases:
+ *  - the osd has no requests at all: remove it;
+ *  - its address in the osdmap is unchanged and the connection was
+ *    never opened: leave the connection alone and return -EAGAIN;
+ *  - otherwise: close and reopen the connection at the address in the
+ *    current osdmap and bump the incarnation.
+ *
+ * Returns 0 except in the -EAGAIN case.
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	struct ceph_osd_request *req;
+	int ret = 0;
+
+	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+	if (list_empty(&osd->o_requests) &&
+	    list_empty(&osd->o_linger_requests)) {
+		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benefit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
+	} else {
+		ceph_con_close(&osd->o_con);
+		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		/* __map_request() compares r_sent against this */
+		osd->o_incarnation++;
+	}
+	return ret;
+}
+
+/*
+ * Link @new into osdc->osds, the rbtree of osd sessions ordered by
+ * osd id.  An id that is already in the tree is a BUG.
+ */
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+	struct rb_node **link = &osdc->osds.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct ceph_osd *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ceph_osd, o_node);
+		if (new->o_osd == cur->o_osd)
+			BUG();
+		else if (new->o_osd < cur->o_osd)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&new->o_node, parent, link);
+	rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+/* find the osd session with id @o, or return NULL */
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+	struct rb_node *node = osdc->osds.rb_node;
+
+	while (node) {
+		struct ceph_osd *cur = rb_entry(node, struct ceph_osd, o_node);
+
+		if (o == cur->o_osd)
+			return cur;
+		node = o < cur->o_osd ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/* schedule the timeout work osd_keepalive_timeout seconds from now */
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+	unsigned long delay =
+		osdc->client->options->osd_keepalive_timeout * HZ;
+
+	schedule_delayed_work(&osdc->timeout_work, delay);
+}
+
+/* cancel the pending timeout work, if any */
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+	struct delayed_work *timeout = &osdc->timeout_work;
+
+	cancel_delayed_work(timeout);
+}
+
+/*
+ * Give the request the next tid, insert it into the request tree and
+ * take a reference for the tree.  The timeout work is scheduled when
+ * this is the only registered request.
+ */
+static void __register_request(struct ceph_osd_client *osdc,
+			       struct ceph_osd_request *req)
+{
+	u64 tid = ++osdc->last_tid;
+
+	req->r_tid = tid;
+	req->r_request->hdr.tid = cpu_to_le64(tid);
+	INIT_LIST_HEAD(&req->r_req_lru_item);
+
+	dout("__register_request %p tid %lld\n", req, req->r_tid);
+	__insert_request(osdc, req);
+	ceph_osdc_get_request(req);
+
+	if (++osdc->num_requests == 1) {
+		dout(" first request, scheduling timeout\n");
+		__schedule_osd_timeout(osdc);
+	}
+}
+
+/* __register_request() with request_mutex taken around the call */
+static void register_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
+{
+	struct mutex *lock = &osdc->request_mutex;
+
+	mutex_lock(lock);
+	__register_request(osdc, req);
+	mutex_unlock(lock);
+}
+
+/*
+ * Remove a request from the request tree and from its osd, and drop
+ * the reference taken in __register_request().  The timeout work is
+ * cancelled once no requests remain.
+ *
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+				 struct ceph_osd_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &osdc->requests);
+	osdc->num_requests--;
+
+	if (req->r_osd) {
+		/* make sure the original request isn't in flight. */
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+		list_del_init(&req->r_osd_item);
+		if (list_empty(&req->r_osd->o_requests) &&
+		    list_empty(&req->r_osd->o_linger_requests)) {
+			dout("moving osd to %p lru\n", req->r_osd);
+			__move_osd_to_lru(osdc, req->r_osd);
+		}
+		/* keep r_osd while the request is still lingering */
+		if (list_empty(&req->r_linger_item))
+			req->r_osd = NULL;
+	}
+
+	/*
+	 * Unlink from the lru/unsent list before dropping the reference:
+	 * the put may free req, after which it must not be touched.
+	 */
+	list_del_init(&req->r_req_lru_item);
+	ceph_osdc_put_request(req);
+
+	if (osdc->num_requests == 0) {
+		dout(" no requests, canceling timeout\n");
+		__cancel_osd_timeout(osdc);
+	}
+}
+
+/*
+ * Revoke a request message that was already handed to an osd
+ * connection and mark the request as not sent.  Does nothing for a
+ * request that was never sent or has no osd.
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+	if (!req->r_sent || !req->r_osd)
+		return;
+
+	ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+	req->r_sent = 0;
+}
+
+/*
+ * Add a request to the client-wide linger list and to the linger list
+ * of its osd.  req->r_osd must be set.
+ */
+static void __register_linger_request(struct ceph_osd_client *osdc,
+				    struct ceph_osd_request *req)
+{
+	struct ceph_osd *osd = req->r_osd;
+
+	dout("__register_linger_request %p\n", req);
+	list_add_tail(&req->r_linger_item, &osdc->req_linger);
+	list_add_tail(&req->r_linger_osd, &osd->o_linger_requests);
+}
+
+/*
+ * Take a request off both linger lists.  If that leaves its osd with
+ * no requests of either kind, the osd is moved to the lru.  r_osd is
+ * cleared unless the request is still on the osd's regular list.
+ * Nothing happens when the request has no osd.
+ */
+static void __unregister_linger_request(struct ceph_osd_client *osdc,
+					struct ceph_osd_request *req)
+{
+	struct ceph_osd *osd;
+
+	dout("__unregister_linger_request %p\n", req);
+	osd = req->r_osd;
+	if (!osd)
+		return;
+
+	list_del_init(&req->r_linger_item);
+	list_del_init(&req->r_linger_osd);
+
+	if (list_empty(&osd->o_requests) &&
+	    list_empty(&osd->o_linger_requests)) {
+		dout("moving osd to %p lru\n", req->r_osd);
+		__move_osd_to_lru(osdc, osd);
+	}
+	if (list_empty(&req->r_osd_item))
+		req->r_osd = NULL;
+}
+
+/*
+ * Stop a request from lingering and drop the reference taken by
+ * ceph_osdc_set_request_linger().  A request that is not marked as
+ * lingering is left untouched.
+ */
+void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
+					 struct ceph_osd_request *req)
+{
+	struct mutex *lock = &osdc->request_mutex;
+
+	mutex_lock(lock);
+	if (!req->r_linger)
+		goto unlock;
+
+	__unregister_linger_request(osdc, req);
+	ceph_osdc_put_request(req);
+unlock:
+	mutex_unlock(lock);
+}
+EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
+
+/*
+ * Mark a request as lingering and take a reference for it.  Calling
+ * this again on a request that is already lingering does nothing.
+ */
+void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
+				  struct ceph_osd_request *req)
+{
+	if (req->r_linger)
+		return;
+
+	dout("set_request_linger %p\n", req);
+	req->r_linger = 1;
+
+	/*
+	 * this reference is released by unregister_linger_request,
+	 * which the caller is now responsible for calling
+	 */
+	ceph_osdc_get_request(req);
+}
+EXPORT_SYMBOL(ceph_osdc_set_request_linger);
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately.  If there is
+ * no up osd, set r_osd to NULL.  Move the request to the appropriate list
+ * (unsent, homeless) or leave on in-flight lru.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_request(struct ceph_osd_client *osdc,
+			 struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_pg pgid;
+	int acting[CEPH_PG_MAX_SIZE];
+	int o = -1, num = 0;	/* target osd id (-1: none), acting set size */
+	int err;
+
+	dout("map_request %p tid %lld\n", req, req->r_tid);
+	/* object name -> pg; the layout is written into the request head */
+	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+				      &req->r_file_layout, osdc->osdmap);
+	if (err) {
+		list_move(&req->r_req_lru_item, &osdc->req_notarget);
+		return err;
+	}
+	pgid = reqhead->layout.ol_pgid;
+	req->r_pgid = pgid;
+
+	/* pg -> acting set; the first entry is the osd we talk to */
+	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+	if (err > 0) {
+		o = acting[0];
+		num = err;
+	}
+
+	/*
+	 * Nothing to do if the request is already on the right osd, was
+	 * sent on its current incarnation and the acting set is the
+	 * same -- or if there was no osd before and there is none now.
+	 */
+	if ((req->r_osd && req->r_osd->o_osd == o &&
+	     req->r_sent >= req->r_osd->o_incarnation &&
+	     req->r_num_pg_osds == num &&
+	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+	    (req->r_osd == NULL && o == -1))
+		return 0;  /* no change */
+
+	dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n",
+	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+	     req->r_osd ? req->r_osd->o_osd : -1);
+
+	/* record full pg acting set */
+	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+	req->r_num_pg_osds = num;
+
+	/* detach from the old osd, revoking the message if it was sent */
+	if (req->r_osd) {
+		__cancel_request(req);
+		list_del_init(&req->r_osd_item);
+		req->r_osd = NULL;
+	}
+
+	/* find the session for the new osd, creating one if needed */
+	req->r_osd = __lookup_osd(osdc, o);
+	if (!req->r_osd && o >= 0) {
+		err = -ENOMEM;
+		req->r_osd = create_osd(osdc);
+		if (!req->r_osd) {
+			list_move(&req->r_req_lru_item, &osdc->req_notarget);
+			goto out;
+		}
+
+		dout("map_request osd %p is osd%d\n", req->r_osd, o);
+		req->r_osd->o_osd = o;
+		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+		__insert_osd(osdc, req->r_osd);
+
+		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+	}
+
+	/* with a target the request is queued as unsent, else as notarget */
+	if (req->r_osd) {
+		__remove_osd_from_lru(req->r_osd);
+		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+		list_move(&req->r_req_lru_item, &osdc->req_unsent);
+	} else {
+		list_move(&req->r_req_lru_item, &osdc->req_notarget);
+	}
+	err = 1;   /* osd or pg changed */
+
+out:
+	return err;
+}
+
+/*
+ * Stamp the request head with the current osdmap epoch, flags and
+ * reassert version, move the request to the tail of the in-flight lru
+ * and hand the message to the osd's connection.  Always returns 0.
+ *
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+			  struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
+	struct ceph_osd *osd = req->r_osd;
+
+	dout("send_request %p tid %llu to osd%d flags %d\n",
+	     req, req->r_tid, osd->o_osd, req->r_flags);
+
+	head->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+	head->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
+	head->reassert_version = req->r_reassert_version;
+
+	req->r_stamp = jiffies;
+	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+	ceph_msg_get(req->r_request); /* send consumes a ref */
+	ceph_con_send(&osd->o_con, req->r_request);
+
+	/* remember which incarnation of the osd session this went out on */
+	req->r_sent = osd->o_incarnation;
+	return 0;
+}
+
+/*
+ * Send every request on the unsent queue (req_unsent), holding
+ * request_mutex for the whole walk.
+ */
+static void send_queued(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *cur, *next;
+
+	dout("send_queued\n");
+	mutex_lock(&osdc->request_mutex);
+	/* sending moves the entry off this list, hence the _safe walk */
+	list_for_each_entry_safe(cur, next, &osdc->req_unsent, r_req_lru_item)
+		__send_request(osdc, cur);
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds.  When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected.  Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_osd_request *req, *last_req = NULL;
+	struct ceph_osd *osd;
+	/* both options are scaled by HZ; a zero timeout skips the reset pass */
+	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
+	unsigned long keepalive =
+		osdc->client->options->osd_keepalive_timeout * HZ;
+	unsigned long last_stamp = 0;
+	struct list_head slow_osds;
+	dout("timeout\n");
+	down_read(&osdc->map_sem);
+
+	ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+	mutex_lock(&osdc->request_mutex);
+
+	/*
+	 * reset osds that appear to be _really_ unresponsive.  this
+	 * is a failsafe measure.. we really shouldn't be getting to
+	 * this point if the system is working properly.  the monitors
+	 * should mark the osd as failed and we should find out about
+	 * it from an updated osd map.
+	 */
+	while (timeout && !list_empty(&osdc->req_lru)) {
+		/* the loop stops at the first request that has not timed out */
+		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+				 r_req_lru_item);
+
+		if (time_before(jiffies, req->r_stamp + timeout))
+			break;
+
+		/*
+		 * seeing the same request with the same stamp twice means
+		 * the kick below made no progress and we would spin forever
+		 */
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
+		last_req = req;
+		last_stamp = req->r_stamp;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+			   req->r_tid, osd->o_osd);
+		__kick_osd_requests(osdc, osd);
+	}
+
+	/*
+	 * ping osds that are a bit slow.  this ensures that if there
+	 * is a break in the TCP connection we will notice, and reopen
+	 * a connection with that osd (from the fault callback).
+	 */
+	INIT_LIST_HEAD(&slow_osds);
+	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+		if (time_before(jiffies, req->r_stamp + keepalive))
+			break;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		dout(" tid %llu is slow, will send keepalive on osd%d\n",
+		     req->r_tid, osd->o_osd);
+		/* list_move_tail: an osd with several slow requests is listed once */
+		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+	}
+	/* drain the local list, sending one keepalive per slow osd */
+	while (!list_empty(&slow_osds)) {
+		osd = list_entry(slow_osds.next, struct ceph_osd,
+				 o_keepalive_item);
+		list_del_init(&osd->o_keepalive_item);
+		ceph_con_keepalive(&osd->o_con);
+	}
+
+	__schedule_osd_timeout(osdc);
+	mutex_unlock(&osdc->request_mutex);
+	/* send_queued() takes request_mutex itself, so it runs after the unlock */
+	send_queued(osdc);
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * Periodic work item: call remove_old_osds(osdc, 0) with map_sem held
+ * for read, then re-arm itself.  The re-arm delay is the osd_idle_ttl
+ * option converted to jiffies and shifted right by two.
+ */
+static void handle_osds_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc;
+	unsigned long rearm;
+
+	osdc = container_of(work, struct ceph_osd_client,
+			    osds_timeout_work.work);
+	rearm = (osdc->client->options->osd_idle_ttl * HZ) >> 2;
+
+	dout("osds timeout\n");
+
+	down_read(&osdc->map_sem);
+	remove_old_osds(osdc, 0);
+	up_read(&osdc->map_sem);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+			      round_jiffies_relative(rearm));
+}
+
+/*
+ * Mark @req as "safe": invoke the optional r_safe_callback (with a NULL
+ * message argument) and then wake everything blocked on
+ * r_safe_completion, e.g. ceph_osdc_sync().
+ */
+static void complete_request(struct ceph_osd_request *req)
+{
+	if (req->r_safe_callback)
+		req->r_safe_callback(req, NULL);
+	complete_all(&req->r_safe_completion);  /* fsync waiter */
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * The front of @msg must be exactly one ceph_osd_reply_head followed by
+ * object_len bytes and num_ops ceph_osd_op entries; anything else is
+ * logged as corrupt and dropped.  A reply whose tid matches no
+ * registered request is ignored.  request_mutex is held for the lookup
+ * and bookkeeping and released before any callback/completion runs.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+			 struct ceph_connection *con)
+{
+	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int numops, object_len, flags;
+	s32 result;
+
+	tid = le64_to_cpu(msg->hdr.tid);
+	/* rhead must not be dereferenced before this length check */
+	if (msg->front.iov_len < sizeof(*rhead))
+		goto bad;
+	numops = le32_to_cpu(rhead->num_ops);
+	object_len = le32_to_cpu(rhead->object_len);
+	result = le32_to_cpu(rhead->result);
+	if (msg->front.iov_len != sizeof(*rhead) + object_len +
+	    numops * sizeof(struct ceph_osd_op))
+		goto bad;
+	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+	/* lookup */
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (req == NULL) {
+		dout("handle_reply tid %llu dne\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		return;
+	}
+	/* hold our own reference until "done"; req may be unregistered below */
+	ceph_osdc_get_request(req);
+	flags = le32_to_cpu(rhead->flags);
+
+	/*
+	 * if this connection filled our message, drop our reference now, to
+	 * avoid a (safe but slower) revoke later.
+	 */
+	if (req->r_con_filling_msg == con && req->r_reply == msg) {
+		dout(" dropping con_filling_msg ref %p\n", con);
+		req->r_con_filling_msg = NULL;
+		ceph_con_put(con);
+	}
+
+	if (!req->r_got_reply) {
+		unsigned bytes;
+
+		/* first reply for this request: record the outcome */
+		req->r_result = le32_to_cpu(rhead->result);
+		bytes = le32_to_cpu(msg->hdr.data_len);
+		dout("handle_reply result %d bytes %d\n", req->r_result,
+		     bytes);
+		/* a zero result is replaced by the reply's data length */
+		if (req->r_result == 0)
+			req->r_result = bytes;
+
+		/* in case this is a write and we need to replay, */
+		req->r_reassert_version = rhead->reassert_version;
+
+		req->r_got_reply = 1;
+	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+		/* already have a reply and this one is not ONDISK: ignore */
+		dout("handle_reply tid %llu dup ack\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		goto done;
+	}
+
+	dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
+		__register_linger_request(osdc, req);
+
+	/* either this is a read, or we got the safe response */
+	if (result < 0 ||
+	    (flags & CEPH_OSD_FLAG_ONDISK) ||
+	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+		__unregister_request(osdc, req);
+
+	mutex_unlock(&osdc->request_mutex);
+
+	/* notify outside request_mutex: callback if set, else the completion */
+	if (req->r_callback)
+		req->r_callback(req, msg);
+	else
+		complete_all(&req->r_completion);
+
+	if (flags & CEPH_OSD_FLAG_ONDISK)
+		complete_request(req);
+
+done:
+	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
+	ceph_osdc_put_request(req);
+	return;
+
+bad:
+	/* reached only before the lookup: no lock held, no reference taken */
+	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+	       (int)sizeof(*rhead));
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Walk every osd session in osdc->osds and __reset_osd() those that the
+ * current osdmap no longer reports as up, or whose address in the map
+ * differs from the peer address of our connection.
+ */
+static void reset_changed_osds(struct ceph_osd_client *osdc)
+{
+	struct rb_node *node = rb_first(&osdc->osds);
+
+	while (node) {
+		struct ceph_osd *osd = rb_entry(node, struct ceph_osd, o_node);
+
+		/* advance first; the node is not touched after a reset */
+		node = rb_next(node);
+
+		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd)) {
+			__reset_osd(osdc, osd);
+			continue;
+		}
+		if (memcmp(&osd->o_con.peer_addr,
+			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
+			   sizeof(struct ceph_entity_addr)) != 0)
+			__reset_osd(osdc, osd);
+	}
+}
+
+/*
+ * Requeue requests whose mapping to an OSD has changed.  If requests map to
+ * no osd, request a new map.
+ *
+ * request_mutex is taken (and released) inside this function, so the
+ * caller must NOT hold it.  The caller in this file,
+ * ceph_osdc_handle_map(), invokes it with map_sem held for write.
+ */
+static void kick_requests(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req, *nreq;
+	struct rb_node *p;
+	int needmap = 0;
+	int err;
+
+	dout("kick_requests\n");
+	mutex_lock(&osdc->request_mutex);
+	/* pass 1: re-map every registered request */
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+		err = __map_request(osdc, req);
+		if (err < 0)
+			continue;  /* error */
+		if (req->r_osd == NULL) {
+			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
+			needmap++;  /* request a newer map */
+		} else if (err > 0) {
+			/* positive return: treated here as "target changed" */
+			dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
+			     req->r_osd ? req->r_osd->o_osd : -1);
+			if (!req->r_linger)
+				req->r_flags |= CEPH_OSD_FLAG_RETRY;
+		}
+	}
+
+	/* pass 2: lingering requests; _safe since entries leave the list */
+	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
+				 r_linger_item) {
+		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
+
+		err = __map_request(osdc, req);
+		if (err == 0)
+			continue;  /* no change and no osd was specified */
+		if (err < 0)
+			continue;  /* hrm! */
+		if (req->r_osd == NULL) {
+			dout("tid %llu maps to no valid osd\n", req->r_tid);
+			needmap++;  /* request a newer map */
+			continue;
+		}
+
+		dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
+		     req->r_osd ? req->r_osd->o_osd : -1);
+		/* move it from the linger list back to the active requests */
+		__unregister_linger_request(osdc, req);
+		__register_request(osdc, req);
+	}
+	mutex_unlock(&osdc->request_mutex);
+
+	if (needmap) {
+		dout("%d requests for down osds, need new map\n", needmap);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	}
+}
+
+
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster.  Kick requests
+ * off to different OSDs as needed.
+ *
+ * The fsid is decoded and checked before map_sem is taken; a decode
+ * failure at that stage must therefore bail out without touching the
+ * semaphore ("bad_unlocked").  Every later failure happens with map_sem
+ * held for write and goes through "bad", which releases it.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	void *p, *end, *next;
+	u32 nr_maps, maplen;
+	u32 epoch;
+	struct ceph_osdmap *newmap = NULL, *oldmap;
+	int err;
+	struct ceph_fsid fsid;
+
+	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	/* verify fsid (map_sem is not held yet) */
+	ceph_decode_need(&p, end, sizeof(fsid), bad_unlocked);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_check_fsid(osdc->client, &fsid) < 0)
+		return;
+
+	down_write(&osdc->map_sem);
+
+	/* incremental maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d inc maps\n", nr_maps);
+	while (nr_maps > 0) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		next = p + maplen;
+		/* only an incremental that directly follows our epoch applies */
+		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+			dout("applying incremental map %u len %d\n",
+			     epoch, maplen);
+			newmap = osdmap_apply_incremental(&p, next,
+							  osdc->osdmap,
+							  osdc->client->msgr);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			if (newmap != osdc->osdmap) {
+				ceph_osdmap_destroy(osdc->osdmap);
+				osdc->osdmap = newmap;
+			}
+			kick_requests(osdc);
+			reset_changed_osds(osdc);
+		} else {
+			dout("ignoring incremental map %u len %d\n",
+			     epoch, maplen);
+		}
+		p = next;
+		nr_maps--;
+	}
+	/* if any incremental was applied, the full maps are not consulted */
+	if (newmap)
+		goto done;
+
+	/* full maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d full maps\n", nr_maps);
+	while (nr_maps) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		if (nr_maps > 1) {
+			dout("skipping non-latest full map %u len %d\n",
+			     epoch, maplen);
+		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+			dout("skipping full map %u len %d, "
+			     "older than our %u\n", epoch, maplen,
+			     osdc->osdmap->epoch);
+		} else {
+			dout("taking full map %u len %d\n", epoch, maplen);
+			newmap = osdmap_decode(&p, p+maplen);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			oldmap = osdc->osdmap;
+			osdc->osdmap = newmap;
+			if (oldmap)
+				ceph_osdmap_destroy(oldmap);
+			kick_requests(osdc);
+		}
+		p += maplen;
+		nr_maps--;
+	}
+
+done:
+	/* writers are finished; keep the map stable for the readers below */
+	downgrade_write(&osdc->map_sem);
+	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+
+	/*
+	 * subscribe to subsequent osdmap updates if full to ensure
+	 * we find out when we are no longer full and stop returning
+	 * ENOSPC.
+	 */
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+	send_queued(osdc);
+	up_read(&osdc->map_sem);
+	wake_up_all(&osdc->client->auth_wq);
+	return;
+
+bad:
+	/* failure after down_write(): release the semaphore we hold */
+	up_write(&osdc->map_sem);
+bad_unlocked:
+	pr_err("osdc handle_map corrupt msg\n");
+	ceph_msg_dump(msg);
+	return;
+}
+
+/*
+ * watch/notify callback event infrastructure
+ *
+ * These callbacks are used both for watch and notify operations.
+ */
+/* kref release callback: the last reference is gone, free the event. */
+static void __release_event(struct kref *kref)
+{
+	struct ceph_osd_event *ev;
+
+	ev = container_of(kref, struct ceph_osd_event, kref);
+	dout("__release_event %p\n", ev);
+	kfree(ev);
+}
+
+/* Take an additional reference on @event. */
+static void get_event(struct ceph_osd_event *event)
+{
+	kref_get(&event->kref);
+}
+
+/*
+ * Drop one reference on @event.  When the count reaches zero,
+ * __release_event() frees it.
+ */
+void ceph_osdc_put_event(struct ceph_osd_event *event)
+{
+	kref_put(&event->kref, __release_event);
+}
+EXPORT_SYMBOL(ceph_osdc_put_event);
+
+/*
+ * Link @new into osdc->event_tree, which is ordered by cookie.  Finding
+ * an existing event with the same cookie hits BUG().  The caller in this
+ * file holds event_lock.
+ */
+static void __insert_event(struct ceph_osd_client *osdc,
+			     struct ceph_osd_event *new)
+{
+	struct rb_node **link = &osdc->event_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct ceph_osd_event *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ceph_osd_event, node);
+
+		if (new->cookie == cur->cookie)
+			BUG();	/* cookies are expected to be unique */
+		else if (new->cookie < cur->cookie)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &osdc->event_tree);
+}
+
+/*
+ * Look up the event registered under @cookie in osdc->event_tree.
+ * Returns the event, or NULL when there is none.  No reference is taken.
+ */
+static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
+					        u64 cookie)
+{
+	struct rb_node *n = osdc->event_tree.rb_node;
+
+	while (n) {
+		struct ceph_osd_event *ev =
+			rb_entry(n, struct ceph_osd_event, node);
+
+		if (cookie == ev->cookie)
+			return ev;
+		n = (cookie < ev->cookie) ? n->rb_left : n->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Unlink @event from its client's event_tree and drop the reference
+ * that was taken for the tree, unless the event has already been
+ * removed.
+ *
+ * The node is marked empty with RB_CLEAR_NODE() after rb_erase();
+ * without that the RB_EMPTY_NODE() test can never be true and a second
+ * removal (e.g. a one-shot event removed by handle_watch_notify() and
+ * then cancelled by its owner) would erase a stale node and drop a
+ * reference it does not own.
+ */
+static void __remove_event(struct ceph_osd_event *event)
+{
+	struct ceph_osd_client *osdc = event->osdc;
+
+	if (!RB_EMPTY_NODE(&event->node)) {
+		dout("__remove_event removed %p\n", event);
+		rb_erase(&event->node, &osdc->event_tree);
+		RB_CLEAR_NODE(&event->node);
+		ceph_osdc_put_event(event);
+	} else {
+		dout("__remove_event didn't remove %p\n", event);
+	}
+}
+
+/*
+ * Allocate a watch/notify event, assign it the next cookie and insert it
+ * into osdc->event_tree.
+ *
+ * @event_cb, @one_shot and @data are stored in the event as-is.  On
+ * success 0 is returned and *@pevent points at the new event, which
+ * starts with two references (one dropped by __remove_event(), one owned
+ * by the caller).  Returns -ENOMEM if the allocation fails.
+ */
+int ceph_osdc_create_event(struct ceph_osd_client *osdc,
+			   void (*event_cb)(u64, u64, u8, void *),
+			   int one_shot, void *data,
+			   struct ceph_osd_event **pevent)
+{
+	struct ceph_osd_event *ev = kmalloc(sizeof(*ev), GFP_NOIO);
+
+	if (ev == NULL)
+		return -ENOMEM;
+
+	dout("create_event %p\n", ev);
+
+	ev->cb = event_cb;
+	ev->data = data;
+	ev->one_shot = one_shot;
+	ev->osdc = osdc;
+
+	INIT_LIST_HEAD(&ev->osd_node);
+	init_completion(&ev->completion);
+	kref_init(&ev->kref);	/* reference held on behalf of the tree */
+	kref_get(&ev->kref);	/* reference handed to the caller */
+
+	/* cookie assignment and insertion happen under event_lock */
+	spin_lock(&osdc->event_lock);
+	osdc->event_count++;
+	ev->cookie = osdc->event_count;
+	__insert_event(osdc, ev);
+	spin_unlock(&osdc->event_lock);
+
+	*pevent = ev;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_osdc_create_event);
+
+/*
+ * Cancel @event: remove it from the event tree under event_lock, then
+ * drop one more reference (the caller's).  @event must not be used by
+ * the caller afterwards.
+ */
+void ceph_osdc_cancel_event(struct ceph_osd_event *event)
+{
+	struct ceph_osd_client *osdc = event->osdc;
+
+	dout("cancel_event %p\n", event);
+	spin_lock(&osdc->event_lock);
+	__remove_event(event);
+	spin_unlock(&osdc->event_lock);
+	ceph_osdc_put_event(event); /* caller's */
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_event);
+
+
+/*
+ * Work function queued by handle_watch_notify(): run the event callback
+ * with the values captured in the work item, signal the event's
+ * completion, then drop the event reference and free the work item.
+ */
+static void do_event_work(struct work_struct *work)
+{
+	struct ceph_osd_event_work *ew =
+		container_of(work, struct ceph_osd_event_work, work);
+	struct ceph_osd_event *ev = ew->event;
+
+	dout("do_event_work completing %p\n", ev);
+	ev->cb(ew->ver, ew->notify_id, ew->opcode, ev->data);
+	complete(&ev->completion);
+	dout("do_event_work completed %p\n", ev);
+
+	ceph_osdc_put_event(ev);
+	kfree(ew);
+}
+
+
+/*
+ * Process osd watch notifications
+ *
+ * Decodes the notification, looks up the event registered under its
+ * cookie and, if found, queues a work item on notify_wq that runs the
+ * event callback.  A reference on the event is taken for the work item;
+ * one-shot events are removed from the tree right away.  If the work
+ * item cannot be allocated or queued, the event is completed and the
+ * reference dropped here instead.
+ */
+void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	void *p, *end;
+	u8 proto_ver;
+	u64 cookie, ver, notify_id;
+	u8 opcode;
+	struct ceph_osd_event *event;
+	struct ceph_osd_event_work *event_work;
+
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	/* proto_ver is decoded only to advance past it */
+	ceph_decode_8_safe(&p, end, proto_ver, bad);
+	ceph_decode_8_safe(&p, end, opcode, bad);
+	ceph_decode_64_safe(&p, end, cookie, bad);
+	ceph_decode_64_safe(&p, end, ver, bad);
+	ceph_decode_64_safe(&p, end, notify_id, bad);
+
+	spin_lock(&osdc->event_lock);
+	event = __find_event(osdc, cookie);
+	if (event) {
+		/* reference for the work item (or for done_err below) */
+		get_event(event);
+		if (event->one_shot)
+			__remove_event(event);
+	}
+	spin_unlock(&osdc->event_lock);
+	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
+	     cookie, ver, event);
+	if (event) {
+		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
+		if (!event_work) {
+			dout("ERROR: could not allocate event_work\n");
+			goto done_err;
+		}
+		INIT_WORK(&event_work->work, do_event_work);
+		event_work->event = event;
+		event_work->ver = ver;
+		event_work->notify_id = notify_id;
+		event_work->opcode = opcode;
+		if (!queue_work(osdc->notify_wq, &event_work->work)) {
+			dout("WARNING: failed to queue notify event work\n");
+			/* not queued, so do_event_work() will never free it */
+			kfree(event_work);
+			goto done_err;
+		}
+	}
+
+	return;
+
+done_err:
+	complete(&event->completion);
+	ceph_osdc_put_event(event);
+	return;
+
+bad:
+	pr_err("osdc handle_watch_notify corrupt msg\n");
+	return;
+}
+
+/*
+ * Wait (interruptibly) up to @timeout seconds for @event's completion,
+ * then drop one reference on @event regardless of the outcome.
+ *
+ * A positive result from the wait is reported as 0; zero and negative
+ * results are returned unchanged.
+ */
+int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout)
+{
+	int ret;
+
+	dout("wait_event %p\n", event);
+
+	ret = wait_for_completion_interruptible_timeout(&event->completion,
+							timeout * HZ);
+	ceph_osdc_put_event(event);
+
+	ret = (ret > 0) ? 0 : ret;
+	/* only the pointer value is printed; event is not dereferenced */
+	dout("wait_event %p returns %d\n", event, ret);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_event);
+
+/*
+ * Register request, send initial attempt.
+ *
+ * Returns 0 on success.  With @nofail set, mapping and send failures are
+ * swallowed (0 is returned) and the request stays registered so it can
+ * be retried later.  Without @nofail, a mapping or send failure is
+ * returned to the caller and the request is unregistered again, so the
+ * caller is left holding an unregistered request in both cases.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			    struct ceph_osd_request *req,
+			    bool nofail)
+{
+	int rc = 0;
+
+	/* hand the data buffers over to the outgoing message */
+	req->r_request->pages = req->r_pages;
+	req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+	req->r_request->bio = req->r_bio;
+#endif
+	req->r_request->trail = req->r_trail;
+
+	register_request(osdc, req);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	/*
+	 * a racing kick_requests() may have sent the message for us
+	 * while we dropped request_mutex above, so only send now if
+	 * the request still hasn't been touched yet.
+	 */
+	if (req->r_sent == 0) {
+		rc = __map_request(osdc, req);
+		if (rc < 0) {
+			if (nofail) {
+				dout("osdc_start_request failed map, "
+				     " will retry %lld\n", req->r_tid);
+				rc = 0;
+			} else {
+				/* mirror the send-failure path below */
+				__unregister_request(osdc, req);
+			}
+			goto out_unlock;
+		}
+		if (req->r_osd == NULL) {
+			dout("send_request %p no up osds in pg\n", req);
+			ceph_monc_request_next_osdmap(&osdc->client->monc);
+		} else {
+			rc = __send_request(osdc, req);
+			if (rc) {
+				if (nofail) {
+					dout("osdc_start_request failed send, "
+					     " will retry %lld\n", req->r_tid);
+					rc = 0;
+				} else {
+					__unregister_request(osdc, req);
+				}
+			}
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&osdc->request_mutex);
+	up_read(&osdc->map_sem);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * wait for a request to complete
+ *
+ * Blocks interruptibly on r_completion.  On normal completion the
+ * request's r_result is returned.  If the wait is interrupted, the
+ * request is cancelled and unregistered under request_mutex, its "safe"
+ * completion is signalled, and the negative wait result is returned.
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+			   struct ceph_osd_request *req)
+{
+	int rc = wait_for_completion_interruptible(&req->r_completion);
+
+	if (rc >= 0) {
+		dout("wait_request tid %llu result %d\n", req->r_tid,
+		     req->r_result);
+		return req->r_result;
+	}
+
+	/* interrupted: tear the request down ourselves */
+	mutex_lock(&osdc->request_mutex);
+	__cancel_request(req);
+	__unregister_request(osdc, req);
+	mutex_unlock(&osdc->request_mutex);
+
+	complete_request(req);
+	dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ *
+ * Only requests carrying CEPH_OSD_FLAG_WRITE are waited for, and only
+ * those whose tid does not exceed osdc->last_tid as sampled on entry, so
+ * requests submitted while we wait do not extend the sync.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req;
+	u64 last_tid, next_tid = 0;
+
+	mutex_lock(&osdc->request_mutex);
+	/* snapshot the upper bound once */
+	last_tid = osdc->last_tid;
+	while (1) {
+		/* next registered request with tid >= next_tid, if any */
+		req = __lookup_request_ge(osdc, next_tid);
+		if (!req)
+			break;
+		if (req->r_tid > last_tid)
+			break;
+
+		next_tid = req->r_tid + 1;
+		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+			continue;
+
+		/* pin req so it stays valid while request_mutex is dropped */
+		ceph_osdc_get_request(req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("sync waiting on tid %llu (last is %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		mutex_lock(&osdc->request_mutex);
+		ceph_osdc_put_request(req);
+	}
+	mutex_unlock(&osdc->request_mutex);
+	dout("sync done (thru tid %llu)\n", last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * init, shutdown
+ */
+/*
+ * Initialise @osdc for @client: reset all bookkeeping, create the
+ * request mempool, the two message pools and the watch/notify
+ * workqueue, and finally arm the periodic osds_timeout_work.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was set up here is torn down again and no work is left scheduled.
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+	int err;
+
+	dout("init\n");
+	osdc->client = client;
+	osdc->osdmap = NULL;
+	init_rwsem(&osdc->map_sem);
+	init_completion(&osdc->map_waiters);
+	osdc->last_requested_map = 0;
+	mutex_init(&osdc->request_mutex);
+	osdc->last_tid = 0;
+	osdc->osds = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->osd_lru);
+	osdc->requests = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->req_lru);
+	INIT_LIST_HEAD(&osdc->req_unsent);
+	INIT_LIST_HEAD(&osdc->req_notarget);
+	INIT_LIST_HEAD(&osdc->req_linger);
+	osdc->num_requests = 0;
+	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+	spin_lock_init(&osdc->event_lock);
+	osdc->event_tree = RB_ROOT;
+	osdc->event_count = 0;
+
+	err = -ENOMEM;
+	osdc->req_mempool = mempool_create_kmalloc_pool(10,
+					sizeof(struct ceph_osd_request));
+	if (!osdc->req_mempool)
+		goto out;
+
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+				"osd_op");
+	if (err < 0)
+		goto out_mempool;
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+				OSD_OPREPLY_FRONT_LEN, 10, true,
+				"osd_op_reply");
+	if (err < 0)
+		goto out_msgpool;
+
+	/* create_singlethread_workqueue() reports failure as NULL */
+	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
+	if (!osdc->notify_wq) {
+		err = -ENOMEM;
+		goto out_msgpool_reply;
+	}
+
+	/*
+	 * Arm the periodic work only once nothing can fail any more, so
+	 * an error return never leaves work pending against @osdc.
+	 */
+	schedule_delayed_work(&osdc->osds_timeout_work,
+	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+	return 0;
+
+out_msgpool_reply:
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+out_msgpool:
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+	mempool_destroy(osdc->req_mempool);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_osdc_init);
+
+/*
+ * Tear down what ceph_osdc_init() set up: the notify workqueue, both
+ * delayed works, the current osdmap (if any), all osd sessions, and the
+ * request/message pools.
+ */
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+	flush_workqueue(osdc->notify_wq);
+	destroy_workqueue(osdc->notify_wq);
+	/* _sync: wait for a handler that is currently running to finish */
+	cancel_delayed_work_sync(&osdc->timeout_work);
+	cancel_delayed_work_sync(&osdc->osds_timeout_work);
+	if (osdc->osdmap) {
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = NULL;
+	}
+	/* second argument 1 (vs 0 in the periodic handler): presumably "remove all" */
+	remove_old_osds(osdc, 1);
+	WARN_ON(!RB_EMPTY_ROOT(&osdc->osds));
+	mempool_destroy(osdc->req_mempool);
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+EXPORT_SYMBOL(ceph_osdc_stop);
+
+/*
+ * Read some contiguous pages.  If we cross a stripe boundary, shorten
+ * *plen.  Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			u32 truncate_seq, u64 truncate_size,
+			struct page **pages, int num_pages, int page_align)
+{
+	struct ceph_osd_request *req;
+	int ret;
+
+	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+	     vino.snap, off, *plen);
+
+	/* *plen is passed by reference and may come back changed */
+	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+				    NULL, 0, truncate_seq, truncate_size, NULL,
+				    false, 1, page_align);
+	if (req == NULL)
+		return -ENOMEM;
+
+	/* it may be a short read due to an object boundary */
+	req->r_pages = pages;
+
+	dout("readpages  final extent is %llu~%llu (%d pages align %d)\n",
+	     off, *plen, req->r_num_pages, page_align);
+
+	/* synchronous: submit, then block until the reply arrives */
+	ret = ceph_osdc_start_request(osdc, req, false);
+	if (ret == 0)
+		ret = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	dout("readpages result %d\n", ret);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_readpages);
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+			 struct ceph_file_layout *layout,
+			 struct ceph_snap_context *snapc,
+			 u64 off, u64 len,
+			 u32 truncate_seq, u64 truncate_size,
+			 struct timespec *mtime,
+			 struct page **pages, int num_pages,
+			 int flags, int do_sync, bool nofail)
+{
+	struct ceph_osd_request *req;
+	int page_align = off & ~PAGE_MASK;	/* offset within the page */
+	int ret;
+
+	/* writes are only accepted for the NOSNAP (head) snapshot id */
+	BUG_ON(vino.snap != CEPH_NOSNAP);
+
+	/* len is passed by reference and may come back changed */
+	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+				    CEPH_OSD_OP_WRITE,
+				    flags | CEPH_OSD_FLAG_ONDISK |
+					    CEPH_OSD_FLAG_WRITE,
+				    snapc, do_sync,
+				    truncate_seq, truncate_size, mtime,
+				    nofail, 1, page_align);
+	if (req == NULL)
+		return -ENOMEM;
+
+	/* it may be a short write due to an object boundary */
+	req->r_pages = pages;
+	dout("writepages %llu~%llu (%d pages)\n", off, len,
+	     req->r_num_pages);
+
+	ret = ceph_osdc_start_request(osdc, req, nofail);
+	if (ret == 0)
+		ret = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+
+	/* a zero result is reported as the number of bytes submitted */
+	if (ret == 0)
+		ret = len;
+	dout("writepages result %d\n", ret);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_writepages);
+
+/*
+ * handle incoming message
+ *
+ * Routes @msg by type to the osdmap, op-reply or watch/notify handler.
+ * Unknown types are logged.  If the connection has no osd attached the
+ * message is dropped unprocessed.  The message reference is released in
+ * every case.
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_osd *osd = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (osd) {
+		struct ceph_osd_client *osdc = osd->o_osdc;
+
+		if (type == CEPH_MSG_OSD_MAP)
+			ceph_osdc_handle_map(osdc, msg);
+		else if (type == CEPH_MSG_OSD_OPREPLY)
+			handle_reply(osdc, msg, con);
+		else if (type == CEPH_MSG_WATCH_NOTIFY)
+			handle_watch_notify(osdc, msg);
+		else
+			pr_err("received unknown message type %d %s\n", type,
+			       ceph_msg_type_name(type));
+	}
+
+	ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
+ *
+ * Returns a referenced message to receive into, with *skip = 0.
+ * Returns NULL with *skip = 1 when the tid is unknown or the reply
+ * carries more data than the request has pages for.  Returns NULL with
+ * *skip left untouched when a larger front buffer could not be
+ * allocated.  request_mutex is held for the duration.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct ceph_msg *m;
+	struct ceph_osd_request *req;
+	int front = le32_to_cpu(hdr->front_len);
+	int data_len = le32_to_cpu(hdr->data_len);
+	u64 tid;
+
+	tid = le64_to_cpu(hdr->tid);
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (!req) {
+		*skip = 1;
+		m = NULL;
+		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+			osd->o_osd);
+		goto out;
+	}
+
+	/* another connection was still filling r_reply: take it back */
+	if (req->r_con_filling_msg) {
+		dout("get_reply revoking msg %p from old con %p\n",
+		     req->r_reply, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
+	}
+
+	/* preallocated front too small: swap in a bigger reply message */
+	if (front > req->r_reply->front.iov_len) {
+		pr_warning("get_reply front %d > preallocated %d\n",
+			   front, (int)req->r_reply->front.iov_len);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+		if (!m)
+			goto out;
+		ceph_msg_put(req->r_reply);
+		req->r_reply = m;
+	}
+	/* extra reference for the returned pointer */
+	m = ceph_msg_get(req->r_reply);
+
+	if (data_len > 0) {
+		int want = calc_pages_for(req->r_page_alignment, data_len);
+
+		if (unlikely(req->r_num_pages < want)) {
+			pr_warning("tid %lld reply has %d bytes %d pages, we"
+				   " had only %d pages ready\n", tid, data_len,
+				   want, req->r_num_pages);
+			*skip = 1;
+			ceph_msg_put(m);
+			m = NULL;
+			goto out;
+		}
+		/* point the reply's data section at the request's buffers */
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
+		m->page_alignment = req->r_page_alignment;
+#ifdef CONFIG_BLOCK
+		m->bio = req->r_bio;
+#endif
+	}
+	*skip = 0;
+	/* remember who is filling r_reply; dropped in handle_reply() or above */
+	req->r_con_filling_msg = ceph_con_get(con);
+	dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+	mutex_unlock(&osdc->request_mutex);
+	return m;
+
+}
+
+/*
+ * Messenger callback: provide a message to receive the incoming message
+ * described by @hdr.  Op replies are received into the message set up
+ * by get_reply(); osdmap and watch/notify messages get a freshly
+ * allocated message sized for the front.  Any other type is logged and
+ * skipped (*skip = 1, NULL returned).
+ */
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
+
+	if (type == CEPH_MSG_OSD_OPREPLY)
+		return get_reply(con, hdr, skip);
+
+	if (type == CEPH_MSG_OSD_MAP || type == CEPH_MSG_WATCH_NOTIFY)
+		return ceph_msg_new(type, front, GFP_NOFS);
+
+	pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+		osd->o_osd);
+	*skip = 1;
+	return NULL;
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+/*
+ * Connection "get" op: take a reference on the osd that owns @con.
+ * Returns @con if get_osd() succeeded, NULL otherwise.
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *owner = con->private;
+
+	return get_osd(owner) ? con : NULL;
+}
+
+/* Connection "put" op: drop a reference on the osd that owns @con. */
+static void put_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+/*
+ * Connection op: hand the messenger the authorizer for this osd,
+ * creating it on first use.  With @force_new set, an existing
+ * authorizer is destroyed and a new one created.
+ *
+ * On success returns 0 and fills in the protocol, the authorizer buffer
+ * and the reply buffer (with their lengths).  If creating the
+ * authorizer fails, its error is returned and the outputs are left
+ * untouched.
+ */
+static int get_authorizer(struct ceph_connection *con,
+			  void **buf, int *len, int *proto,
+			  void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+
+	/* a forced refresh discards whatever authorizer we had */
+	if (o->o_authorizer && force_new) {
+		ac->ops->destroy_authorizer(ac, o->o_authorizer);
+		o->o_authorizer = NULL;
+	}
+
+	if (!o->o_authorizer) {
+		int err;
+
+		err = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
+						 &o->o_authorizer,
+						 &o->o_authorizer_buf,
+						 &o->o_authorizer_buf_len,
+						 &o->o_authorizer_reply_buf,
+						 &o->o_authorizer_reply_buf_len);
+		if (err)
+			return err;
+	}
+
+	*proto = ac->protocol;
+	*buf = o->o_authorizer_buf;
+	*len = o->o_authorizer_buf_len;
+	*reply_buf = o->o_authorizer_reply_buf;
+	*reply_len = o->o_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+/*
+ * Connection op: pass the authorizer reply of length @len to the auth
+ * client's verify_authorizer_reply op, together with this osd's
+ * authorizer, and return its result.
+ */
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+/*
+ * Connection op: invalidate the OSD authorizer through the auth client
+ * (when it implements the optional invalidate_authorizer op), then
+ * return the result of ceph_monc_validate_auth() on the monitor client.
+ */
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	/* the op is optional */
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+	return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+/*
+ * Connection operations table for osd connections, wiring the callbacks
+ * defined in this file (plus osd_reset as the fault handler).
+ */
+static const struct ceph_connection_operations osd_con_ops = {
+	.get = get_osd_con,
+	.put = put_osd_con,
+	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
+	.alloc_msg = alloc_msg,
+	.fault = osd_reset,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 00000000..e97c3588
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1135 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+/*
+ * Describe the CEPH_OSD_EXISTS / CEPH_OSD_UP bits of @state in @str,
+ * writing at most @len bytes, and return @str.
+ *
+ * The result is "exists", "up" or "exists, up" according to the bits
+ * set, "doesn't exist" when @state is zero, and an empty string when
+ * only other bits are set.  When @len is zero @str is returned untouched.
+ */
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+	const char *text;
+
+	if (!len)
+		return str;
+
+	/*
+	 * Select one complete literal instead of appending with
+	 * snprintf(str, len, "%s...", str, ...): using the destination
+	 * buffer as one of the source arguments is undefined behaviour.
+	 */
+	if (!state)
+		text = "doesn't exist";
+	else if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+		text = "exists, up";
+	else if (state & CEPH_OSD_EXISTS)
+		text = "exists";
+	else if (state & CEPH_OSD_UP)
+		text = "up";
+	else
+		text = "";
+
+	snprintf(str, len, "%s", text);
+	return str;
+}
+
+/* maps */
+
+/*
+ * Count the bits needed to represent @t, i.e. the 1-based position of
+ * its highest set bit; returns 0 for @t == 0.
+ */
+static int calc_bits_of(unsigned t)
+{
+	int bits;
+
+	for (bits = 0; t != 0; t >>= 1)
+		bits++;
+	return bits;
+}
+
+/*
+ * Derive the four *_num_mask fields from the pool's on-wire counts.
+ * Each mask is 2^n - 1, where n is the number of bits needed to
+ * represent (count - 1).
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+	u32 pg = le32_to_cpu(pi->v.pg_num);
+	u32 pgp = le32_to_cpu(pi->v.pgp_num);
+	u32 lpg = le32_to_cpu(pi->v.lpg_num);
+	u32 lpgp = le32_to_cpu(pi->v.lpgp_num);
+
+	pi->pg_num_mask = (1 << calc_bits_of(pg - 1)) - 1;
+	pi->pgp_num_mask = (1 << calc_bits_of(pgp - 1)) - 1;
+	pi->lpg_num_mask = (1 << calc_bits_of(lpg - 1)) - 1;
+	pi->lpgp_num_mask = (1 << calc_bits_of(lpgp - 1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+/*
+ * Decode the uniform-bucket specific part: a single item weight.
+ * The ceph_decode_need() check asks for (1 + size) u32s although only
+ * one is consumed here.  Returns 0, or -EINVAL if that check fails.
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+				       struct crush_bucket_uniform *b)
+{
+	int ret = -EINVAL;
+
+	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+	b->item_weight = ceph_decode_32(p);
+	ret = 0;
+bad:
+	return ret;
+}
+
+/*
+ * Decode the list-bucket specific part: h.size interleaved
+ * (item weight, sum weight) u32 pairs, stored into two freshly
+ * allocated arrays hanging off @b.
+ *
+ * Returns 0, -ENOMEM if an array cannot be allocated, or -EINVAL if the
+ * ceph_decode_need() check fails.  Arrays already attached to @b are not
+ * freed here on failure.
+ */
+static int crush_decode_list_bucket(void **p, void *end,
+				    struct crush_bucket_list *b)
+{
+	int idx;
+
+	dout("crush_decode_list_bucket %p to %p\n", *p, end);
+
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (!b->item_weights)
+		return -ENOMEM;
+
+	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (!b->sum_weights)
+		return -ENOMEM;
+
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (idx = 0; idx < b->h.size; idx++) {
+		b->item_weights[idx] = ceph_decode_32(p);
+		b->sum_weights[idx] = ceph_decode_32(p);
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * Decode the tree-bucket specific part: a u32 node count followed by
+ * that many u32 node weights, stored in a freshly allocated array.
+ *
+ * Returns 0, -ENOMEM if the array cannot be allocated, or -EINVAL if a
+ * decode bounds check fails.
+ */
+static int crush_decode_tree_bucket(void **p, void *end,
+				    struct crush_bucket_tree *b)
+{
+	int idx;
+
+	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+	ceph_decode_32_safe(p, end, b->num_nodes, bad);
+
+	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+	if (!b->node_weights)
+		return -ENOMEM;
+
+	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+	for (idx = 0; idx < b->num_nodes; idx++)
+		b->node_weights[idx] = ceph_decode_32(p);
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * Decode the straw-bucket specific part: h.size interleaved
+ * (item weight, straw) u32 pairs, stored into two freshly allocated
+ * arrays hanging off @b.
+ *
+ * Returns 0, -ENOMEM if an array cannot be allocated, or -EINVAL if the
+ * ceph_decode_need() check fails.  Arrays already attached to @b are not
+ * freed here on failure.
+ */
+static int crush_decode_straw_bucket(void **p, void *end,
+				     struct crush_bucket_straw *b)
+{
+	int idx;
+
+	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (!b->item_weights)
+		return -ENOMEM;
+
+	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (!b->straws)
+		return -ENOMEM;
+
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (idx = 0; idx < b->h.size; idx++) {
+		b->item_weights[idx] = ceph_decode_32(p);
+		b->straws[idx] = ceph_decode_32(p);
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * Decode a crush map from the buffer [pbyval, end).
+ *
+ * Consumes: magic, max_buckets, max_rules, max_devices, then max_buckets
+ * bucket slots (a zero alg marks an empty slot) and max_rules rule slots
+ * (a zero flag marks an empty slot).  Anything after the rules is
+ * ignored.
+ *
+ * Returns the newly allocated map, or an ERR_PTR(): -EINVAL for a
+ * truncated or malformed encoding, -ENOMEM on allocation failure.  On
+ * failure the partially built map is released with crush_destroy().
+ */
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+	struct crush_map *c;
+	int err = -EINVAL;
+	int i, j;
+	void **p = &pbyval;
+	void *start = pbyval;
+	u32 magic;
+
+	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	c = kzalloc(sizeof(*c), GFP_NOFS);
+	if (c == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_need(p, end, 4*sizeof(u32), bad);
+	magic = ceph_decode_32(p);
+	if (magic != CRUSH_MAGIC) {
+		pr_err("crush_decode magic %x != current %x\n",
+		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
+		goto bad;
+	}
+	c->max_buckets = ceph_decode_32(p);
+	c->max_rules = ceph_decode_32(p);
+	c->max_devices = ceph_decode_32(p);
+
+	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+	if (c->device_parents == NULL)
+		goto badmem;
+	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+	if (c->bucket_parents == NULL)
+		goto badmem;
+
+	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+	if (c->buckets == NULL)
+		goto badmem;
+	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+	if (c->rules == NULL)
+		goto badmem;
+
+	/* buckets */
+	for (i = 0; i < c->max_buckets; i++) {
+		int size = 0;
+		u32 alg;
+		struct crush_bucket *b;
+
+		/*
+		 * A successful per-algorithm decode at the end of the
+		 * previous iteration leaves err == 0.  Reset it so that a
+		 * later "goto bad" can never return ERR_PTR(0), i.e. NULL,
+		 * which callers testing IS_ERR() would take for success.
+		 */
+		err = -EINVAL;
+
+		ceph_decode_32_safe(p, end, alg, bad);
+		if (alg == 0) {
+			c->buckets[i] = NULL;
+			continue;
+		}
+		dout("crush_decode bucket %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		switch (alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			size = sizeof(struct crush_bucket_uniform);
+			break;
+		case CRUSH_BUCKET_LIST:
+			size = sizeof(struct crush_bucket_list);
+			break;
+		case CRUSH_BUCKET_TREE:
+			size = sizeof(struct crush_bucket_tree);
+			break;
+		case CRUSH_BUCKET_STRAW:
+			size = sizeof(struct crush_bucket_straw);
+			break;
+		default:
+			err = -EINVAL;
+			goto bad;
+		}
+		BUG_ON(size == 0);
+		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+		if (b == NULL)
+			goto badmem;
+
+		ceph_decode_need(p, end, 4*sizeof(u32), bad);
+		b->id = ceph_decode_32(p);
+		b->type = ceph_decode_16(p);
+		b->alg = ceph_decode_8(p);
+		b->hash = ceph_decode_8(p);
+		b->weight = ceph_decode_32(p);
+		b->size = ceph_decode_32(p);
+
+		/*
+		 * *b was sized from the leading alg word but the decoder
+		 * below is chosen by the header's alg byte; if the two
+		 * disagree the decoder would write past the allocation.
+		 */
+		if (b->alg != alg)
+			goto bad;
+
+		dout("crush_decode bucket size %d off %x %p to %p\n",
+		     b->size, (int)(*p-start), *p, end);
+
+		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+		if (b->items == NULL)
+			goto badmem;
+		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+		if (b->perm == NULL)
+			goto badmem;
+		b->perm_n = 0;
+
+		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+		for (j = 0; j < b->size; j++)
+			b->items[j] = ceph_decode_32(p);
+
+		switch (b->alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			err = crush_decode_uniform_bucket(p, end,
+				  (struct crush_bucket_uniform *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_LIST:
+			err = crush_decode_list_bucket(p, end,
+			       (struct crush_bucket_list *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_TREE:
+			err = crush_decode_tree_bucket(p, end,
+				(struct crush_bucket_tree *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_STRAW:
+			err = crush_decode_straw_bucket(p, end,
+				(struct crush_bucket_straw *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		}
+	}
+
+	/* rules */
+	dout("rule vec is %p\n", c->rules);
+	for (i = 0; i < c->max_rules; i++) {
+		u32 yes;
+		struct crush_rule *r;
+
+		/* see the bucket loop: never reach "bad" with err == 0 */
+		err = -EINVAL;
+
+		ceph_decode_32_safe(p, end, yes, bad);
+		if (!yes) {
+			dout("crush_decode NO rule %d off %x %p to %p\n",
+			     i, (int)(*p-start), *p, end);
+			c->rules[i] = NULL;
+			continue;
+		}
+
+		dout("crush_decode rule %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		/* len */
+		ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+		/* the allocation below also includes sizeof(*r) */
+		if (yes > (ULONG_MAX - sizeof(*r))
+			  / sizeof(struct crush_rule_step))
+			goto bad;
+#endif
+		r = c->rules[i] = kmalloc(sizeof(*r) +
+					  yes*sizeof(struct crush_rule_step),
+					  GFP_NOFS);
+		if (r == NULL)
+			goto badmem;
+		dout(" rule %d is at %p\n", i, r);
+		r->len = yes;
+		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+		for (j = 0; j < r->len; j++) {
+			r->steps[j].op = ceph_decode_32(p);
+			r->steps[j].arg1 = ceph_decode_32(p);
+			r->steps[j].arg2 = ceph_decode_32(p);
+		}
+	}
+
+	/* ignore trailing name maps. */
+
+	dout("crush_decode success\n");
+	return c;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	dout("crush_decode fail %d\n", err);
+	crush_destroy(c);
+	return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+/*
+ * Three-way compare of two pgids by their raw 64-bit representation.
+ * Returns -1, 0 or 1 when @l is below, equal to or above @r.
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+	u64 lhs = *(u64 *)&l;
+	u64 rhs = *(u64 *)&r;
+
+	if (lhs == rhs)
+		return 0;
+	return lhs < rhs ? -1 : 1;
+}
+
+/*
+ * Link @new into the pg_mapping rbtree at @root, ordered by pgid_cmp().
+ * Returns 0, or -EEXIST (tree unchanged) if the pgid is already present.
+ */
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+			       struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct ceph_pg_mapping *cur;
+		int cmp;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ceph_pg_mapping, node);
+		cmp = pgid_cmp(new->pgid, cur->pgid);
+		if (cmp == 0)
+			return -EEXIST;
+		link = cmp < 0 ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+/*
+ * Find the pg_mapping for @pgid in the rbtree at @root.
+ * Returns the entry, or NULL if there is none.
+ */
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+						   struct ceph_pg pgid)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node != NULL) {
+		struct ceph_pg_mapping *cur =
+			rb_entry(node, struct ceph_pg_mapping, node);
+		int cmp = pgid_cmp(pgid, cur->pgid);
+
+		if (cmp == 0)
+			return cur;
+		node = cmp < 0 ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+/*
+ * Link @new into the pool rbtree at @root, ordered by pool id.
+ * Returns 0, or -EEXIST (tree unchanged) if the id is already present.
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct ceph_pg_pool_info *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ceph_pg_pool_info, node);
+		if (new->id == cur->id)
+			return -EEXIST;
+		if (new->id < cur->id)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+/*
+ * Find the pool with the given @id in the rbtree at @root.
+ * Returns the pool info, or NULL if there is none.
+ */
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node != NULL) {
+		struct ceph_pg_pool_info *cur =
+			rb_entry(node, struct ceph_pg_pool_info, node);
+
+		if (id < cur->id)
+			node = node->rb_left;
+		else if (id > cur->id)
+			node = node->rb_right;
+		else
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Walk the pool tree in order and return the id of the first pool whose
+ * name equals @name.  Pools without a name are skipped.  Returns -ENOENT
+ * if no pool matches.
+ */
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+	struct rb_node *node = rb_first(&map->pg_pools);
+
+	while (node) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(node, struct ceph_pg_pool_info, node);
+
+		if (pi->name && !strcmp(pi->name, name))
+			return pi->id;
+		node = rb_next(node);
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+/*
+ * Unlink @pi from the pool rbtree at @root and free it together with
+ * its name string.
+ */
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+	rb_erase(&pi->node, root);
+
+	kfree(pi->name);
+	kfree(pi);
+}
+
+/*
+ * Decode one pool description into @pi and recompute its pg masks.
+ *
+ * The fixed-size header is copied into pi->v; the per-snapshot records
+ * and the removed-snap intervals that follow are skipped, not stored.
+ * The caller must already have verified that sizeof(pi->v) bytes are
+ * available at *p.
+ *
+ * Returns 0, or -EINVAL if the variable-length part runs past @end.
+ */
+static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+	unsigned n, m;
+
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+
+	/* num_snaps * snap_info_t */
+	n = le32_to_cpu(pi->v.num_snaps);
+	while (n--) {
+		/* fixed part of the record plus the u32 name length */
+		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
+				 sizeof(struct ceph_timespec) + sizeof(u32),
+				 bad);
+		*p += sizeof(u64) +       /* key */
+			1 + sizeof(u64) + /* u8, snapid */
+			sizeof(struct ceph_timespec);
+		m = ceph_decode_32(p);    /* snap name */
+		/* the name length comes off the wire: bound it */
+		if (m > (size_t)(end - *p))
+			goto bad;
+		*p += m;
+	}
+
+	/* removed snap intervals: pairs of u64; bound the count as well */
+	n = le32_to_cpu(pi->v.num_removed_snap_intervals);
+	if (n > (size_t)(end - *p) / (sizeof(u64) * 2))
+		goto bad;
+	*p += n * sizeof(u64) * 2;
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * Decode a list of (pool id, name length, name bytes) records and attach
+ * a NUL-terminated copy of each name to the matching pool in @map,
+ * replacing any previous name.  Records for pools not present in the map
+ * are skipped.  A failed name allocation leaves that pool unnamed and is
+ * not reported as an error.
+ *
+ * Returns 0, or -EINVAL if the encoding runs past @end.
+ */
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+	struct ceph_pg_pool_info *pi;
+	u32 num, len, pool;
+
+	ceph_decode_32_safe(p, end, num, bad);
+	dout(" %d pool names\n", num);
+	while (num--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_32_safe(p, end, len, bad);
+		dout("  pool %d len %d\n", pool, len);
+		/*
+		 * len comes off the wire: make sure the name really lies
+		 * inside the buffer before copying from or skipping over it.
+		 */
+		ceph_decode_need(p, end, len, bad);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			kfree(pi->name);
+			pi->name = kmalloc(len + 1, GFP_NOFS);
+			if (pi->name) {
+				memcpy(pi->name, *p, len);
+				pi->name[len] = '\0';
+				dout("  name is %s\n", pi->name);
+			}
+		}
+		*p += len;
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+/*
+ * Free @map and everything it owns: the crush map (if any), every
+ * pg_temp mapping, every pool and the per-osd arrays.
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	struct rb_node *node;
+
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+
+	/* drain the pg_temp tree */
+	while ((node = rb_first(&map->pg_temp)) != NULL) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(node, struct ceph_pg_mapping, node);
+
+		rb_erase(node, &map->pg_temp);
+		kfree(pg);
+	}
+
+	/* drain the pool tree */
+	while ((node = rb_first(&map->pg_pools)) != NULL)
+		__remove_pg_pool(&map->pg_pools,
+				 rb_entry(node, struct ceph_pg_pool_info,
+					  node));
+
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ *
+ * Allocates zeroed state/addr/weight arrays of @max entries, carries over
+ * the existing entries that still fit, frees the old arrays and updates
+ * map->max_osd.  Returns 0, or -ENOMEM (map untouched) if an allocation
+ * fails.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		/*
+		 * When the map shrinks, only the first @max entries fit in
+		 * the new arrays; copying map->max_osd entries regardless
+		 * would write past the end of the new allocations.
+		 */
+		size_t keep = min_t(size_t, map->max_osd, max);
+
+		memcpy(state, map->osd_state, keep*sizeof(*state));
+		memcpy(addr, map->osd_addr, keep*sizeof(*addr));
+		memcpy(weight, map->osd_weight, keep*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
+/*
+ * decode a full map.
+ *
+ * Decodes, in order: version, fsid/epoch/timestamps, the pools (and, for
+ * version >= 5, their names), pool_max, flags, max_osd with the per-osd
+ * state/weight/address arrays, the pg_temp mappings and the crush map.
+ * Whatever follows the crush map is ignored.
+ *
+ * Returns the newly allocated map with *p advanced to @end, or an
+ * ERR_PTR() (-EINVAL for a malformed encoding, -ENOMEM on allocation
+ * failure, or an error from a helper) after destroying the partial map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+	struct ceph_osdmap *map;
+	u16 version;
+	u32 len, max, i;
+	u8 ev;
+	int err = -EINVAL;
+	void *start = *p;
+	struct ceph_pg_pool_info *pi;
+
+	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (map == NULL)
+		return ERR_PTR(-ENOMEM);
+	map->pg_temp = RB_ROOT;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_VERSION) {
+		pr_warning("got unknown v %d > %d of osdmap\n", version,
+			   CEPH_OSDMAP_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+	map->epoch = ceph_decode_32(p);
+	ceph_decode_copy(p, &map->created, sizeof(map->created));
+	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+	ceph_decode_32_safe(p, end, max, bad);
+	while (max--) {
+		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		pi = kzalloc(sizeof(*pi), GFP_NOFS);
+		if (!pi) {
+			err = -ENOMEM;
+			goto bad;
+		}
+		pi->id = ceph_decode_32(p);
+		ev = ceph_decode_8(p); /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			kfree(pi);
+			goto bad;
+		}
+		err = __decode_pool(p, end, pi);
+		if (err < 0) {
+			kfree(pi);
+			goto bad;
+		}
+		/* a repeated pool id is dropped rather than leaked */
+		if (__insert_pg_pool(&map->pg_pools, pi) < 0)
+			kfree(pi);
+		/*
+		 * err is 0 here; put the default back so that no later
+		 * "goto bad" returns ERR_PTR(0), i.e. NULL.
+		 */
+		err = -EINVAL;
+	}
+
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+	ceph_decode_32_safe(p, end, map->flags, bad);
+
+	/* max_osd is read with a bounds check like its neighbours */
+	ceph_decode_32_safe(p, end, max, bad);
+
+	/* (re)alloc osd arrays */
+	err = osdmap_set_max_osd(map, max);
+	if (err < 0)
+		goto bad;
+	dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+	/* osds */
+	err = -EINVAL;
+	ceph_decode_need(p, end, 3*sizeof(u32) +
+			 map->max_osd*(1 + sizeof(*map->osd_weight) +
+				       sizeof(*map->osd_addr)), bad);
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+	*p += 4; /* skip length field (should match max) */
+	for (i = 0; i < map->max_osd; i++)
+		map->osd_weight[i] = ceph_decode_32(p);
+
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+	for (i = 0; i < map->max_osd; i++)
+		ceph_decode_addr(&map->osd_addr[i]);
+
+	/* pg_temp */
+	ceph_decode_32_safe(p, end, len, bad);
+	for (i = 0; i < len; i++) {
+		u32 n, j;
+		struct ceph_pg pgid;
+		struct ceph_pg_mapping *pg;
+
+		/* the previous iteration's insert left err == 0 */
+		err = -EINVAL;
+		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		n = ceph_decode_32(p);
+		/* keep the size computations below from wrapping */
+		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+			goto bad;
+		ceph_decode_need(p, end, n * sizeof(u32), bad);
+		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+		if (!pg) {
+			err = -ENOMEM;
+			goto bad;
+		}
+		pg->pgid = pgid;
+		pg->len = n;
+		for (j = 0; j < n; j++)
+			pg->osds[j] = ceph_decode_32(p);
+
+		err = __insert_pg_mapping(pg, &map->pg_temp);
+		if (err) {
+			/* not linked into the tree, so free it here */
+			kfree(pg);
+			goto bad;
+		}
+		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
+	}
+
+	/* crush */
+	err = -EINVAL;
+	ceph_decode_32_safe(p, end, len, bad);
+	dout("osdmap_decode crush len %d from off 0x%x\n", len,
+	     (int)(*p - start));
+	ceph_decode_need(p, end, len, bad);
+	map->crush = crush_decode(*p, end);
+	*p += len;
+	/* never accept a map without a crush map (err is -EINVAL here) */
+	if (!map->crush)
+		goto bad;
+	if (IS_ERR(map->crush)) {
+		err = PTR_ERR(map->crush);
+		map->crush = NULL;
+		goto bad;
+	}
+
+	/* ignore the rest of the map */
+	*p = end;
+
+	dout("osdmap_decode done %p %p\n", *p, end);
+	return map;
+
+bad:
+	dout("osdmap_decode fail\n");
+	ceph_osdmap_destroy(map);
+	return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ *
+ * If the update embeds a full map, that map is decoded and returned
+ * instead (a new allocation, or an ERR_PTR()).  Otherwise @map is
+ * modified in place - flags, pool_max, max_osd, crush map, pools, osd
+ * state/weights/addresses and pg_temp mappings - and @map itself is
+ * returned with *p advanced to @end.
+ *
+ * On a malformed update an ERR_PTR() is returned; @map may already have
+ * been partially updated at that point.  @msgr is not used here.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					     struct ceph_osdmap *map,
+					     struct ceph_messenger *msgr)
+{
+	struct crush_map *newcrush = NULL;
+	struct ceph_fsid fsid;
+	u32 epoch = 0;
+	struct ceph_timespec modified;
+	u32 len, pool;
+	__s32 new_pool_max, new_flags, max;
+	void *start = *p;
+	int err = -EINVAL;
+	u16 version;
+	struct rb_node *rbp;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_INC_VERSION) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+			   CEPH_OSDMAP_INC_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+			 bad);
+	ceph_decode_copy(p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(p);
+	BUG_ON(epoch != map->epoch+1);
+	ceph_decode_copy(p, &modified, sizeof(modified));
+	new_pool_max = ceph_decode_32(p);
+	new_flags = ceph_decode_32(p);
+
+	/* full map? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental full map len %d, %p to %p\n",
+		     len, *p, end);
+		return osdmap_decode(p, min(*p+len, end));
+	}
+
+	/* new crush? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental new crush map len %d, %p to %p\n",
+		     len, *p, end);
+		newcrush = crush_decode(*p, min(*p+len, end));
+		if (IS_ERR(newcrush))
+			return ERR_CAST(newcrush);
+		/* a crush blob was announced; do not silently drop it */
+		if (!newcrush)
+			goto bad;
+		*p += len;
+	}
+
+	/* new flags? */
+	if (new_flags >= 0)
+		map->flags = new_flags;
+	if (new_pool_max >= 0)
+		map->pool_max = new_pool_max;
+
+	ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+	/* new max? */
+	max = ceph_decode_32(p);
+	if (max >= 0) {
+		err = osdmap_set_max_osd(map, max);
+		if (err < 0)
+			goto bad;
+		/*
+		 * err is 0 here; put the default back so that no later
+		 * "goto bad" returns ERR_PTR(0), i.e. NULL.
+		 */
+		err = -EINVAL;
+	}
+
+	map->epoch++;
+	map->modified = modified;
+	if (newcrush) {
+		if (map->crush)
+			crush_destroy(map->crush);
+		map->crush = newcrush;
+		newcrush = NULL;
+	}
+
+	/* new_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		__u8 ev;
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+		ev = ceph_decode_8(p);  /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			goto bad;
+		}
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (!pi) {
+			pi = kzalloc(sizeof(*pi), GFP_NOFS);
+			if (!pi) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pi->id = pool;
+			__insert_pg_pool(&map->pg_pools, pi);
+		}
+		err = __decode_pool(p, end, pi);
+		if (err < 0)
+			goto bad;
+		err = -EINVAL;	/* see above: never fail with err == 0 */
+	}
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	/* old_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi)
+			__remove_pg_pool(&map->pg_pools, pi);
+	}
+
+	/* new_up */
+	err = -EINVAL;
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		struct ceph_entity_addr addr;
+		ceph_decode_32_safe(p, end, osd, bad);
+		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+		ceph_decode_addr(&addr);
+		pr_info("osd%d up\n", osd);
+		BUG_ON(osd >= map->max_osd);
+		map->osd_state[osd] |= CEPH_OSD_UP;
+		map->osd_addr[osd] = addr;
+	}
+
+	/* new_state */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		u8 xorstate;
+		/* bounds-check the osd id together with the state byte */
+		ceph_decode_need(p, end, sizeof(u32) + sizeof(u8), bad);
+		osd = ceph_decode_32(p);
+		xorstate = ceph_decode_8(p);  /* clean flag */
+		if (xorstate == 0)
+			xorstate = CEPH_OSD_UP;
+		if (xorstate & CEPH_OSD_UP)
+			pr_info("osd%d down\n", osd);
+		if (osd < map->max_osd)
+			map->osd_state[osd] ^= xorstate;
+	}
+
+	/* new_weight */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd, off;
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		osd = ceph_decode_32(p);
+		off = ceph_decode_32(p);
+		pr_info("osd%d weight 0x%x %s\n", osd, off,
+		     off == CEPH_OSD_IN ? "(in)" :
+		     (off == CEPH_OSD_OUT ? "(out)" : ""));
+		if (osd < map->max_osd)
+			map->osd_weight[osd] = off;
+	}
+
+	/* new_pg_temp */
+	rbp = rb_first(&map->pg_temp);
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_mapping *pg;
+		u32 j;
+		struct ceph_pg pgid;
+		u32 pglen;
+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		pglen = ceph_decode_32(p);
+
+		/* remove any? */
+		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+						node)->pgid, pgid) <= 0) {
+			struct ceph_pg_mapping *cur =
+				rb_entry(rbp, struct ceph_pg_mapping, node);
+
+			rbp = rb_next(rbp);
+			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+			rb_erase(&cur->node, &map->pg_temp);
+			kfree(cur);
+		}
+
+		if (pglen) {
+			/* insert */
+			/* keep the size computations below from wrapping */
+			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+				goto bad;
+			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+			if (!pg) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pg->pgid = pgid;
+			pg->len = pglen;
+			for (j = 0; j < pglen; j++)
+				pg->osds[j] = ceph_decode_32(p);
+			err = __insert_pg_mapping(pg, &map->pg_temp);
+			if (err) {
+				kfree(pg);
+				goto bad;
+			}
+			err = -EINVAL;	/* never fail later with err == 0 */
+			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+			     pglen);
+		}
+	}
+	while (rbp) {
+		struct ceph_pg_mapping *cur =
+			rb_entry(rbp, struct ceph_pg_mapping, node);
+
+		rbp = rb_next(rbp);
+		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+		rb_erase(&cur->node, &map->pg_temp);
+		kfree(cur);
+	}
+
+	/* ignore the rest */
+	*p = end;
+	return map;
+
+bad:
+	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+	       epoch, (int)(*p - start), *p, start, end);
+	print_hex_dump(KERN_DEBUG, "osdmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
+	if (newcrush)
+		crush_destroy(newcrush);
+	return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * Map a file extent (@off, *@plen) onto an object under @layout.
+ *
+ * Outputs the object number in *@ono and the extent within that object
+ * in *@oxoff / *@oxlen.  The extent never crosses a stripe unit: *@oxlen
+ * (and *@plen, which is overwritten with it) is capped at the bytes left
+ * in the stripe unit containing @off.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+				   u64 off, u64 *plen,
+				   u64 *ono,
+				   u64 *oxoff, u64 *oxlen)
+{
+	u32 osize = le32_to_cpu(layout->fl_object_size);
+	u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	u32 su_per_object;
+	u32 bl, stripeno, stripepos, objsetno;
+	u64 quot, su_offset;
+
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	     osize, su);
+	su_per_object = osize / su;
+	dout("osize %u / su %u = su_per_object %u\n", osize, su,
+	     su_per_object);
+
+	BUG_ON((su & ~PAGE_MASK) != 0);
+
+	/* one 64-bit division yields both off / su and off % su */
+	quot = off;
+	su_offset = do_div(quot, su);
+	bl = quot;
+	dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+	/* locate the stripe unit within its stripe and object set */
+	stripeno = bl / sc;
+	stripepos = bl % sc;
+	objsetno = stripeno / su_per_object;
+
+	*ono = objsetno * sc + stripepos;
+	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+	/* offset in the object: whole stripe units below us, plus su_offset */
+	*oxoff = su_offset + (stripeno % su_per_object) * su;
+
+	/* stop at the end of this stripe unit or of the request */
+	*oxlen = min_t(u64, *plen, su - su_offset);
+	*plen = *oxlen;
+
+	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ *
+ * The placement seed is the pool's object hash of @oid, offset by the
+ * preferred osd when the layout names one (fl_pg_preferred >= 0).
+ * Fills @ol and returns 0, or -EIO if the layout's pool is not in
+ * @osdmap.
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+			    const char *oid,
+			    struct ceph_file_layout *fl,
+			    struct ceph_osdmap *osdmap)
+{
+	struct ceph_pg pgid;
+	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg_pool_info *pool;
+	unsigned ps;
+
+	BUG_ON(!osdmap);
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return -EIO;
+	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+	/*
+	 * The pool's (l)pg_num and mask used to be loaded here into
+	 * locals that nothing read; those dead stores are gone.
+	 */
+	if (preferred >= 0)
+		ps += preferred;
+
+	pgid.ps = cpu_to_le16(ps);
+	pgid.preferred = cpu_to_le16(preferred);
+	pgid.pool = fl->fl_pg_pool;
+	if (preferred >= 0)
+		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+		     (int)preferred);
+	else
+		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+	ol->ol_pgid = pgid;
+	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_calc_object_layout);
+
+/*
+ * Compute the raw osd vector for @pgid.
+ *
+ * An explicit pg_temp mapping wins: its own osd array is returned and
+ * *@num is set to its length.  Otherwise the pool's crush rule is run
+ * into the caller's @osds buffer (at most min(pool size, *@num) entries)
+ * and *@num is set to crush_do_rule()'s return value.
+ *
+ * Returns the osd array, or NULL if the pool or a matching crush rule
+ * cannot be found.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *osds, int *num)
+{
+	struct ceph_pg_mapping *temp;
+	struct ceph_pg_pool_info *pool;
+	unsigned poolid, ps, pps;
+	u32 pgp_num;
+	int pgp_mask;
+	int preferred;
+	int ruleno;
+
+	/* pg_temp? */
+	temp = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	if (temp != NULL) {
+		*num = temp->len;
+		return temp->osds;
+	}
+
+	/* crush */
+	poolid = le32_to_cpu(pgid.pool);
+	ps = le16_to_cpu(pgid.ps);
+	preferred = (s16)le16_to_cpu(pgid.preferred);
+
+	/* don't forcefeed bad device ids to crush */
+	if (preferred >= osdmap->max_osd ||
+	    preferred >= osdmap->crush->max_devices)
+		preferred = -1;
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (pool == NULL)
+		return NULL;
+
+	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+				 pool->v.type, pool->v.size);
+	if (ruleno < 0) {
+		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
+		       poolid, pool->v.crush_ruleset, pool->v.type,
+		       pool->v.size);
+		return NULL;
+	}
+
+	/* localized pgs use the lpgp_* parameters, the rest pgp_* */
+	if (preferred >= 0) {
+		pgp_num = le32_to_cpu(pool->v.lpgp_num);
+		pgp_mask = pool->lpgp_num_mask;
+	} else {
+		pgp_num = le32_to_cpu(pool->v.pgp_num);
+		pgp_mask = pool->pgp_num_mask;
+	}
+	pps = ceph_stable_mod(ps, pgp_num, pgp_mask);
+	pps += poolid;
+
+	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+			     min_t(int, pool->v.size, *num),
+			     preferred, osdmap->osd_weight);
+	return osds;
+}
+
+/*
+ * Fill @acting with the osds of @pgid's raw set that are currently up,
+ * preserving their order.  Returns how many were stored, or -1 if the
+ * raw set cannot be computed.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *acting)
+{
+	int raw[CEPH_PG_MAX_SIZE];
+	int *osds;
+	int nr = CEPH_PG_MAX_SIZE;
+	int nacting = 0;
+	int i;
+
+	osds = calc_pg_raw(osdmap, pgid, raw, &nr);
+	if (osds == NULL)
+		return -1;
+
+	/* keep only the up osds; the first of them is the primary */
+	for (i = 0; i < nr; i++) {
+		if (!ceph_osd_is_up(osdmap, osds[i]))
+			continue;
+		acting[nacting++] = osds[i];
+	}
+	return nacting;
+}
+
+/*
+ * Return the primary osd for @pgid - the first osd of its raw set that
+ * is up - or -1 if there is none or the raw set cannot be computed.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+	int raw[CEPH_PG_MAX_SIZE];
+	int *osds;
+	int nr = CEPH_PG_MAX_SIZE;
+	int i;
+
+	osds = calc_pg_raw(osdmap, pgid, raw, &nr);
+	if (osds == NULL)
+		return -1;
+
+	for (i = 0; i < nr; i++) {
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			return osds[i];
+	}
+	return -1;
+}
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 00000000..13cb409a
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+/*
+ * If the last page of the list is currently kmap()ed, unmap it and
+ * clear pl->mapped_tail.
+ */
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+	struct page *tail;
+
+	if (!pl->mapped_tail)
+		return;
+
+	tail = list_entry(pl->head.prev, struct page, lru);
+	kunmap(tail);
+	pl->mapped_tail = NULL;
+}
+
+/*
+ * Unmap the tail, free every page on the list and then the reserved
+ * pages.  Always returns 0.
+ */
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+	struct page *page;
+
+	ceph_pagelist_unmap_tail(pl);
+
+	while (!list_empty(&pl->head)) {
+		page = list_first_entry(&pl->head, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+
+	ceph_pagelist_free_reserve(pl);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+/*
+ * Append one page to the list, taken from the reserve when it has pages
+ * and freshly allocated otherwise, and make it the mapped tail.
+ * Returns 0, or -ENOMEM if no page could be obtained.
+ */
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+	struct page *page;
+
+	if (pl->num_pages_free) {
+		page = list_first_entry(&pl->free_list, struct page, lru);
+		list_del(&page->lru);
+		--pl->num_pages_free;
+	} else {
+		page = __page_cache_alloc(GFP_NOFS);
+	}
+	if (!page)
+		return -ENOMEM;
+
+	pl->room += PAGE_SIZE;
+	/* the old tail must be unmapped before the new page becomes tail */
+	ceph_pagelist_unmap_tail(pl);
+	list_add_tail(&page->lru, &pl->head);
+	pl->mapped_tail = kmap(page);
+	return 0;
+}
+
+/*
+ * Append @len bytes from @buf to the pagelist, adding pages as the
+ * current tail fills up.  Returns 0, or the error from adding a page;
+ * in that case the bytes copied so far stay in the list.
+ */
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+	/* fill the tail and grow until the remainder fits in one go */
+	while (pl->room < len) {
+		size_t chunk = pl->room;
+		int err;
+
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		       buf, chunk);
+		pl->length += chunk;
+		pl->room -= chunk;
+		buf += chunk;
+		len -= chunk;
+
+		err = ceph_pagelist_addpage(pl);
+		if (err)
+			return err;
+	}
+
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	pl->length += len;
+	pl->room -= len;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/**
+ * Allocate enough pages for a pagelist to append the given amount
+ * of data without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+	size_t pages_needed;
+
+	/* the current tail already has room for it */
+	if (space <= pl->room)
+		return 0;
+
+	/* bytes beyond the tail's room, rounded up to whole pages */
+	pages_needed = (space - pl->room + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	while (pages_needed > pl->num_pages_free) {
+		struct page *page = __page_cache_alloc(GFP_NOFS);
+
+		if (!page)
+			return -ENOMEM;
+		list_add_tail(&page->lru, &pl->free_list);
+		++pl->num_pages_free;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/**
+ * Free any pages that have been preallocated.
+ * The free-page counter must end up at zero.  Always returns 0.
+ */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+	struct page *page;
+
+	while (!list_empty(&pl->free_list)) {
+		page = list_first_entry(&pl->free_list, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+		--pl->num_pages_free;
+	}
+	BUG_ON(pl->num_pages_free);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/**
+ * Create a truncation point: remember the pagelist, its current last
+ * list entry and the room left, for a later ceph_pagelist_truncate().
+ */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+			      struct ceph_pagelist_cursor *c)
+{
+	c->room = pl->room;
+	c->page_lru = pl->head.prev;
+	c->pl = pl;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/**
+ * Truncate a pagelist to the given point. Move extra pages to reserve.
+ * The remaining last page, if any, becomes the mapped tail again.
+ * Returns: 0 on success,
+ *          -EINVAL if the pagelist doesn't match the trunc point pagelist
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+			   struct ceph_pagelist_cursor *c)
+{
+	struct list_head *last;
+
+	if (c->pl != pl)
+		return -EINVAL;
+
+	ceph_pagelist_unmap_tail(pl);
+
+	/* peel pages off the end until the cursor's entry is last again */
+	while ((last = pl->head.prev) != c->page_lru) {
+		struct page *extra = list_entry(last, struct page, lru);
+
+		list_del(&extra->lru);                /* remove from pagelist */
+		list_add_tail(&extra->lru, &pl->free_list); /* add to reserve */
+		++pl->num_pages_free;
+	}
+	pl->room = c->room;
+
+	if (!list_empty(&pl->head)) {
+		struct page *tail = list_entry(pl->head.prev, struct page, lru);
+
+		pl->mapped_tail = kmap(tail);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 00000000..cd9c21df
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,233 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/* Build a vector of user pages: pin num_pages pages starting at data via
+ * get_user_pages(), for writing when write_page is set.  Returns the vector,
+ * or an ERR_PTR() after releasing whatever had already been pinned. */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+					  int num_pages, bool write_page)
+{
+	struct page **pages;
+	int got = 0;	/* pages pinned so far */
+	int rc = 0;
+
+	pages = kcalloc(num_pages, sizeof(*pages), GFP_NOFS); /* overflow-checked */
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	down_read(&current->mm->mmap_sem);
+	while (got < num_pages) {
+		/* resume right after the pages that are already pinned */
+		rc = get_user_pages(current, current->mm,
+		    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
+		    num_pages - got, write_page, 0, pages + got, NULL);
+		if (rc < 0)
+			break;
+		BUG_ON(rc == 0);	/* no progress would loop forever */
+		got += rc;
+	}
+	up_read(&current->mm->mmap_sem);
+	if (rc >= 0)
+		return pages;
+
+	/* error: unpin the first 'got' pages (not dirtied) and free the vector */
+	ceph_put_page_vector(pages, got, false);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++) {
+		if (dirty)	/* optionally mark the page dirty before the put */
+			set_page_dirty_lock(pages[i]);
+		put_page(pages[i]);	/* drop one reference on each page */
+	}
+	kfree(pages);	/* the vector itself is freed as well */
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		__free_pages(pages[i], 0);	/* order 0: one page at a time */
+	kfree(pages);	/* then the array that held the page pointers */
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector of num_pages new pages; returns it or ERR_PTR(-ENOMEM)
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kcalloc(num_pages, sizeof(*pages), flags); /* overflow-checked */
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = __page_cache_alloc(flags);
+		if (pages[i] == NULL) {
+			/* undo: free the i pages obtained so far plus the vector */
+			ceph_release_page_vector(pages, i);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/* Copy len bytes of user data into a page vector.  Only the in-page part of
+ * off is used, as the start offset within pages[0].  Returns len, or -EFAULT
+ * when a copy_from_user() call makes no progress at all. */
+int ceph_copy_user_to_page_vector(struct page **pages,
+					 const char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;	/* byte offset inside the current page */
+	int left = len;	/* NOTE(review): size_t narrowed to int here */
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);	/* stay within this page */
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)	/* nothing was copied */
+			return -EFAULT;
+		data += l - bad;	/* advance by the bytes actually copied */
+		left -= l - bad;
+		po += l - bad;
+		if (po == PAGE_CACHE_SIZE) {	/* page filled: move to the next one */
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;	/* offset inside the current page */
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);	/* chunk ends at page end */
+		memcpy(page_address(pages[i]) + po, data, l);	/* kernel buffer -> page */
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {	/* page filled: continue in the next */
+			po = 0;
+			i++;
+		}
+	}
+	return len;	/* always the full length; memcpy cannot fail */
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;	/* offset inside the current page */
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);	/* chunk ends at page end */
+		memcpy(data, page_address(pages[i]) + po, l);	/* page -> kernel buffer */
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {	/* page consumed: continue in the next */
+			po = 0;
+			i++;
+		}
+	}
+	return len;	/* always the full length; memcpy cannot fail */
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/* Copy len bytes from a page vector out to the user buffer at data, starting
+ * at the in-page part of off within pages[0].  Returns len, or -EFAULT when a
+ * copy_to_user() call makes no progress. */
+int ceph_copy_page_vector_to_user(struct page **pages,
+					 char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;	/* byte offset inside the current page */
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;	/* advance by what was really copied */
+		if (po == PAGE_CACHE_SIZE) {
+			/* step to the next page only once this one is drained */
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+	int i = off >> PAGE_CACHE_SHIFT;	/* index of the page holding off */
+
+	off &= ~PAGE_CACHE_MASK;	/* from here on: offset within that page */
+
+	dout("zero_page_vector_page %u~%u\n", off, len);
+
+	/* leading partial page? */
+	if (off) {
+		int end = min((int)PAGE_CACHE_SIZE, off + len);	/* clamp to page end */
+		dout("zeroing %d %p head from %d\n", i, pages[i],
+		     (int)off);
+		zero_user_segment(pages[i], off, end);
+		len -= (end - off);
+		i++;
+	}
+	while (len >= PAGE_CACHE_SIZE) {	/* whole pages in the middle */
+		dout("zeroing %d %p len=%d\n", i, pages[i], len);
+		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		i++;
+	}
+	/* trailing partial page? */
+	if (len) {
+		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+		zero_user_segment(pages[i], 0, len);
+	}
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/compat.c b/net/compat.c
new file mode 100644
index 00000000..c578d938
--- /dev/null
+++ b/net/compat.c
@@ -0,0 +1,867 @@
+/*
+ * 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c.
+ *
+ * Copyright (C) 2000		VA Linux Co
+ * Copyright (C) 2000		Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 		Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 	Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 		David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000		Hewlett-Packard Co.
+ * Copyright (C) 2000		David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001	Andi Kleen, SuSE Labs
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/icmpv6.h>
+#include <linux/socket.h>
+#include <linux/syscalls.h>
+#include <linux/filter.h>
+#include <linux/compat.h>
+#include <linux/security.h>
+
+#include <net/scm.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <asm/uaccess.h>
+#include <net/compat.h>
+
+static inline int iov_from_user_compat_to_kern(struct iovec *kiov,
+					  struct compat_iovec __user *uiov32,
+					  int niov)
+{
+	int tot_len = 0;	/* running byte total, also the return value */
+
+	while (niov > 0) {
+		compat_uptr_t buf;
+		compat_size_t len;
+
+		if (get_user(len, &uiov32->iov_len) ||
+		    get_user(buf, &uiov32->iov_base))
+			return -EFAULT;
+		/* clamp this segment so the total never exceeds INT_MAX */
+		if (len > INT_MAX - tot_len)
+			len = INT_MAX - tot_len;
+
+		tot_len += len;
+		kiov->iov_base = compat_ptr(buf);	/* widen the 32-bit user pointer */
+		kiov->iov_len = (__kernel_size_t) len;
+		uiov32++;
+		kiov++;
+		niov--;
+	}
+	return tot_len;
+}
+
+int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg)
+{
+	compat_uptr_t tmp1, tmp2, tmp3;	/* the three 32-bit pointer fields */
+
+	if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) ||
+	    __get_user(tmp1, &umsg->msg_name) ||
+	    __get_user(kmsg->msg_namelen, &umsg->msg_namelen) ||
+	    __get_user(tmp2, &umsg->msg_iov) ||
+	    __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) ||
+	    __get_user(tmp3, &umsg->msg_control) ||
+	    __get_user(kmsg->msg_controllen, &umsg->msg_controllen) ||
+	    __get_user(kmsg->msg_flags, &umsg->msg_flags))
+		return -EFAULT;
+	kmsg->msg_name = compat_ptr(tmp1);	/* pointers are widened only after all reads succeeded */
+	kmsg->msg_iov = compat_ptr(tmp2);
+	kmsg->msg_control = compat_ptr(tmp3);
+	return 0;
+}
+
+/* Args are named so it is easy to tell whose space the pointers are in.  Returns total iovec bytes or a negative errno. */
+int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
+		   struct sockaddr *kern_address, int mode)
+{
+	int tot_len;
+
+	if (kern_msg->msg_namelen) {
+		if (mode == VERIFY_READ) {	/* only then is the address copied in */
+			int err = move_addr_to_kernel(kern_msg->msg_name,
+						      kern_msg->msg_namelen,
+						      kern_address);
+			if (err < 0)
+				return err;
+		}
+		kern_msg->msg_name = kern_address;	/* now points at the kernel copy/buffer */
+	} else
+		kern_msg->msg_name = NULL;
+
+	tot_len = iov_from_user_compat_to_kern(kern_iov,
+					  (struct compat_iovec __user *)kern_msg->msg_iov,
+					  kern_msg->msg_iovlen);
+	if (tot_len >= 0)	/* on error msg_iov is left pointing at user memory */
+		kern_msg->msg_iov = kern_iov;
+
+	return tot_len;
+}
+
+/* Bleech... compat cmsg layout helpers; everything is aligned to sizeof(s32). */
+#define CMSG_COMPAT_ALIGN(len)	ALIGN((len), sizeof(s32))
+/* DATA: payload address; SPACE: header + padded payload; LEN: header + raw payload */
+#define CMSG_COMPAT_DATA(cmsg)				\
+	((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))))
+#define CMSG_COMPAT_SPACE(len)				\
+	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len))
+#define CMSG_COMPAT_LEN(len)				\
+	(CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len))
+/* first header, or NULL when msg_controllen cannot hold even one header */
+#define CMSG_COMPAT_FIRSTHDR(msg)			\
+	(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ?	\
+	 (struct compat_cmsghdr __user *)((msg)->msg_control) :		\
+	 (struct compat_cmsghdr __user *)NULL)
+/* true when ucmlen covers a header and fits in what remains of the control buffer */
+#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \
+	((ucmlen) >= sizeof(struct compat_cmsghdr) && \
+	 (ucmlen) <= (unsigned long) \
+	 ((mhdr)->msg_controllen - \
+	  ((char *)(ucmsg) - (char *)(mhdr)->msg_control)))
+
+static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg,
+		struct compat_cmsghdr __user *cmsg, int cmsg_len)
+{
+	char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len);	/* skip this cmsg, padded */
+	if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) >
+			msg->msg_controllen)	/* no byte left at ptr: end of buffer */
+		return NULL;
+	return (struct compat_cmsghdr __user *)ptr;
+}
+
+/* There is a lot of hair here because the alignment rules (and
+ * thus placement) of cmsg headers and length are different for
+ * 32-bit apps.  -DaveM.  Pass 1 sizes the native copy, pass 2 fills it;
+ * returns 0 with kmsg->msg_control pointing at the kernel copy, else -errno. */
+int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
+			       unsigned char *stackbuf, int stackbuf_size)
+{
+	struct compat_cmsghdr __user *ucmsg;
+	struct cmsghdr *kcmsg, *kcmsg_base;
+	compat_size_t ucmlen;
+	__kernel_size_t kcmlen, tmp;
+	int err = -EFAULT;
+
+	kcmlen = 0;
+	kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;	/* default: caller's buffer */
+	ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
+	while (ucmsg != NULL) {
+		if (get_user(ucmlen, &ucmsg->cmsg_len))
+			return -EFAULT;
+
+		/* Catch bogons. */
+		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+			return -EINVAL;
+
+		/* swap the compat header size for the native one, then pad */
+		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
+		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		tmp = CMSG_ALIGN(tmp);
+		kcmlen += tmp;
+		ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
+	}
+	if (kcmlen == 0)
+		return -EINVAL;
+
+	/* The kcmlen holds the 64-bit version of the control length.
+	 * It may not be modified as we do not stick it into the kmsg
+	 * until we have successfully copied over all of the data
+	 * from the user.
+	 */
+	if (kcmlen > stackbuf_size)	/* too big for stackbuf: allocate instead */
+		kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
+	if (kcmsg == NULL)
+		return -ENOBUFS;
+
+	/* Now copy them over neatly. */
+	memset(kcmsg, 0, kcmlen);
+	ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
+	while (ucmsg != NULL) {
+		if (__get_user(ucmlen, &ucmsg->cmsg_len))
+			goto Efault;
+		if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+			goto Einval;
+		tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
+		       CMSG_ALIGN(sizeof(struct cmsghdr)));
+		/* lengths are re-read from user space; never write past the pass-1 size */
+		if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
+			goto Einval;
+		kcmsg->cmsg_len = tmp;
+		tmp = CMSG_ALIGN(tmp);
+		if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
+		    __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
+		    copy_from_user(CMSG_DATA(kcmsg),
+				   CMSG_COMPAT_DATA(ucmsg),
+				   (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+			goto Efault;
+
+		/* Advance. */
+		kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
+		ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
+	}
+
+	/* Ok, looks like we made it.  Hook it up and return success. */
+	kmsg->msg_control = kcmsg_base;
+	kmsg->msg_controllen = kcmlen;
+	return 0;
+
+Einval:
+	err = -EINVAL;
+Efault:
+	if (kcmsg_base != (struct cmsghdr *)stackbuf)	/* free only what was allocated here */
+		sock_kfree_s(sk, kcmsg_base, kcmlen);
+	return err;
+}
+
+int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
+{
+	struct compat_timeval ctv;
+	struct compat_timespec cts[3];
+	struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+	struct compat_cmsghdr cmhdr;
+	int cmlen;
+
+	if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) {
+		kmsg->msg_flags |= MSG_CTRUNC;
+		return 0; /* XXX: return error? check spec. */
+	}
+
+	if (level == SOL_SOCKET && type == SCM_TIMESTAMP) {
+		struct timeval *tv = (struct timeval *)data;
+		ctv.tv_sec = tv->tv_sec;
+		ctv.tv_usec = tv->tv_usec;
+		data = &ctv;	/* send the compat-sized copy instead */
+		len = sizeof(ctv);
+	}
+	if (level == SOL_SOCKET &&
+	    (type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) {
+		int count = type == SCM_TIMESTAMPNS ? 1 : 3;	/* timespecs in the payload */
+		int i;
+		struct timespec *ts = (struct timespec *)data;
+		for (i = 0; i < count; i++) {
+			cts[i].tv_sec = ts[i].tv_sec;
+			cts[i].tv_nsec = ts[i].tv_nsec;
+		}
+		data = &cts;
+		len = sizeof(cts[0]) * count;
+	}
+
+	cmlen = CMSG_COMPAT_LEN(len);
+	if (kmsg->msg_controllen < cmlen) {	/* does not fit: truncate and flag it */
+		kmsg->msg_flags |= MSG_CTRUNC;
+		cmlen = kmsg->msg_controllen;
+	}
+	cmhdr.cmsg_level = level;
+	cmhdr.cmsg_type = type;
+	cmhdr.cmsg_len = cmlen;
+
+	if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
+		return -EFAULT;
+	if (copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr)))
+		return -EFAULT;
+	cmlen = CMSG_COMPAT_SPACE(len);	/* padded size: how far to advance */
+	if (kmsg->msg_controllen < cmlen)
+		cmlen = kmsg->msg_controllen;
+	kmsg->msg_control += cmlen;	/* consume the space just written */
+	kmsg->msg_controllen -= cmlen;
+	return 0;
+}
+
+void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
+{
+	struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+	/* fds that fit behind the header; 0 when not even the header fits */
+	int fdmax = kmsg->msg_controllen > sizeof(struct compat_cmsghdr) ?
+		(kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int) : 0;
+	int fdnum = scm->fp->count;
+	struct file **fp = scm->fp->fp;
+	int __user *cmfptr;
+	int err = 0, i;
+
+	if (fdnum < fdmax)
+		fdmax = fdnum;
+
+	for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) {
+		int new_fd;
+		err = security_file_receive(fp[i]);
+		if (err)
+			break;
+		err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags
+					  ? O_CLOEXEC : 0);
+		if (err < 0)
+			break;
+		new_fd = err;
+		err = put_user(new_fd, cmfptr);
+		if (err) {
+			put_unused_fd(new_fd);	/* number never reached user space */
+			break;
+		}
+		/* Bump the usage count and install the file. */
+		get_file(fp[i]);
+		fd_install(new_fd, fp[i]);
+	}
+
+	if (i > 0) {
+		/* at least one fd went out: write the SCM_RIGHTS header for them */
+		int cmlen = CMSG_COMPAT_LEN(i * sizeof(int));
+		err = put_user(SOL_SOCKET, &cm->cmsg_level);
+		if (!err)
+			err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+		if (!err)
+			err = put_user(cmlen, &cm->cmsg_len);
+		if (!err) {
+			cmlen = CMSG_COMPAT_SPACE(i * sizeof(int));
+			kmsg->msg_control += cmlen;
+			kmsg->msg_controllen -= cmlen;
+		}
+	}
+	if (i < fdnum)	/* some descriptors were not delivered */
+		kmsg->msg_flags |= MSG_CTRUNC;
+	/* Every file that fit had its usage count bumped above, so the
+	 * list itself can simply be freed. */
+	__scm_destroy(scm);
+}
+
+/*
+ * A struct sock_filter is architecture independent; only this wrapper differs.
+ */
+struct compat_sock_fprog {
+	u16		len;		/* copied unchanged into sock_fprog.len */
+	compat_uptr_t	filter;		/* struct sock_filter * */
+};
+
+static int do_set_attach_filter(struct socket *sock, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
+	struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
+	compat_uptr_t ptr;
+	u16 len;
+
+	/* rebuild the native sock_fprog in the compat_alloc_user_space() area */
+	if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) ||
+	    !access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) ||
+	    __get_user(len, &fprog32->len) ||
+	    __get_user(ptr, &fprog32->filter) ||
+	    __put_user(len, &kfprog->len) ||
+	    __put_user(compat_ptr(ptr), &kfprog->filter))
+		return -EFAULT;
+	/* the caller's optlen is not consulted; the native size is passed on */
+	return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
+			      sizeof(struct sock_fprog));
+}
+
+static int do_set_sock_timeout(struct socket *sock, int level,
+		int optname, char __user *optval, unsigned int optlen)
+{
+	struct compat_timeval __user *up = (struct compat_timeval __user *)optval;
+	struct timeval ktime;	/* native copy handed to sock_setsockopt() */
+	mm_segment_t old_fs;
+	int err;
+
+	if (optlen < sizeof(*up))
+		return -EINVAL;
+	if (!access_ok(VERIFY_READ, up, sizeof(*up)) ||
+	    __get_user(ktime.tv_sec, &up->tv_sec) ||
+	    __get_user(ktime.tv_usec, &up->tv_usec))
+		return -EFAULT;
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);	/* &ktime is a kernel address, not a user pointer */
+	err = sock_setsockopt(sock, level, optname, (char *)&ktime, sizeof(ktime));
+	set_fs(old_fs);
+
+	return err;
+}
+
+static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	/* timeouts carry a compat_timeval, the filter a 32-bit sock_fprog */
+	if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)
+		return do_set_sock_timeout(sock, level, optname, optval, optlen);
+	if (optname != SO_ATTACH_FILTER)
+		return sock_setsockopt(sock, level, optname, optval, optlen);
+
+	return do_set_attach_filter(sock, level, optname, optval, optlen);
+}
+
+asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	int err;
+	struct socket *sock = sockfd_lookup(fd, &err);	/* err is set when this returns NULL */
+
+	if (sock) {
+		err = security_socket_setsockopt(sock, level, optname);
+		if (err) {
+			sockfd_put(sock);
+			return err;
+		}
+		/* SOL_SOCKET is handled here; otherwise prefer the protocol's compat hook */
+		if (level == SOL_SOCKET)
+			err = compat_sock_setsockopt(sock, level,
+					optname, optval, optlen);
+		else if (sock->ops->compat_setsockopt)
+			err = sock->ops->compat_setsockopt(sock, level,
+					optname, optval, optlen);
+		else
+			err = sock->ops->setsockopt(sock, level,
+					optname, optval, optlen);
+		sockfd_put(sock);	/* balance sockfd_lookup() */
+	}
+	return err;
+}
+
+static int do_get_sock_timeout(struct socket *sock, int level, int optname,
+		char __user *optval, int __user *optlen)
+{
+	struct compat_timeval __user *up;
+	struct timeval ktime;	/* native result, converted below */
+	mm_segment_t old_fs;
+	int len, err;
+
+	up = (struct compat_timeval __user *) optval;
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < sizeof(*up))	/* NOTE(review): int vs size_t, so this compares unsigned */
+		return -EINVAL;
+	len = sizeof(ktime);
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);	/* &ktime and &len are kernel addresses */
+	err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len);
+	set_fs(old_fs);
+
+	if (!err) {
+		/* report the compat size and write the two fields individually */
+		if (put_user(sizeof(*up), optlen) ||
+		    !access_ok(VERIFY_WRITE, up, sizeof(*up)) ||
+		    __put_user(ktime.tv_sec, &up->tv_sec) ||
+		    __put_user(ktime.tv_usec, &up->tv_usec))
+			err = -EFAULT;
+	}
+	return err;
+}
+
+static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	if (optname != SO_RCVTIMEO && optname != SO_SNDTIMEO)	/* no timeval payload */
+		return sock_getsockopt(sock, level, optname, optval, optlen);
+	return do_get_sock_timeout(sock, level, optname, optval, optlen);
+}
+
+int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+{
+	struct compat_timeval __user *ctv =
+			(struct compat_timeval __user *) userstamp;	/* really a compat_timeval */
+	int err = -ENOENT;
+	struct timeval tv;
+
+	if (!sock_flag(sk, SOCK_TIMESTAMP))
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+	tv = ktime_to_timeval(sk->sk_stamp);
+	if (tv.tv_sec == -1)	/* reported as -ENOENT */
+		return err;
+	if (tv.tv_sec == 0) {	/* zero stamp: record and report the current time */
+		sk->sk_stamp = ktime_get_real();
+		tv = ktime_to_timeval(sk->sk_stamp);
+	}
+	err = 0;
+	if (put_user(tv.tv_sec, &ctv->tv_sec) ||
+			put_user(tv.tv_usec, &ctv->tv_usec))
+		err = -EFAULT;
+	return err;
+}
+EXPORT_SYMBOL(compat_sock_get_timestamp);
+
+int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+	struct compat_timespec __user *ctv =
+			(struct compat_timespec __user *) userstamp;	/* really a compat_timespec */
+	int err = -ENOENT;
+	struct timespec ts;
+
+	if (!sock_flag(sk, SOCK_TIMESTAMP))
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+	ts = ktime_to_timespec(sk->sk_stamp);
+	if (ts.tv_sec == -1)	/* reported as -ENOENT */
+		return err;
+	if (ts.tv_sec == 0) {	/* zero stamp: record and report the current time */
+		sk->sk_stamp = ktime_get_real();
+		ts = ktime_to_timespec(sk->sk_stamp);
+	}
+	err = 0;
+	if (put_user(ts.tv_sec, &ctv->tv_sec) ||
+			put_user(ts.tv_nsec, &ctv->tv_nsec))
+		err = -EFAULT;
+	return err;
+}
+EXPORT_SYMBOL(compat_sock_get_timestampns);
+
+asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	int err;
+	struct socket *sock = sockfd_lookup(fd, &err);	/* err is set when this returns NULL */
+
+	if (sock) {
+		err = security_socket_getsockopt(sock, level, optname);
+		if (err) {
+			sockfd_put(sock);
+			return err;
+		}
+		/* SOL_SOCKET is handled here; otherwise prefer the protocol's compat hook */
+		if (level == SOL_SOCKET)
+			err = compat_sock_getsockopt(sock, level,
+					optname, optval, optlen);
+		else if (sock->ops->compat_getsockopt)
+			err = sock->ops->compat_getsockopt(sock, level,
+					optname, optval, optlen);
+		else
+			err = sock->ops->getsockopt(sock, level,
+					optname, optval, optlen);
+		sockfd_put(sock);	/* balance sockfd_lookup() */
+	}
+	return err;
+}
+
+struct compat_group_req {	/* compat layout read for MCAST_JOIN_GROUP / MCAST_LEAVE_GROUP */
+	__u32				 gr_interface;
+	struct __kernel_sockaddr_storage gr_group
+		__attribute__ ((aligned(4)));	/* 4-byte aligned, so no padding after the __u32 */
+} __packed;
+
+struct compat_group_source_req {	/* compat layout read for the MCAST_*_SOURCE* options */
+	__u32				 gsr_interface;
+	struct __kernel_sockaddr_storage gsr_group
+		__attribute__ ((aligned(4)));	/* 4-byte aligned, so no padding after the __u32 */
+	struct __kernel_sockaddr_storage gsr_source
+		__attribute__ ((aligned(4)));
+} __packed;
+
+struct compat_group_filter {	/* compat layout used for MCAST_MSFILTER */
+	__u32				 gf_interface;
+	struct __kernel_sockaddr_storage gf_group
+		__attribute__ ((aligned(4)));
+	__u32				 gf_fmode;
+	__u32				 gf_numsrc;	/* number of gf_slist entries in use */
+	struct __kernel_sockaddr_storage gf_slist[1]
+		__attribute__ ((aligned(4)));
+} __packed;
+/* size of a compat_group_filter without its one built-in gf_slist entry */
+#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \
+			sizeof(struct __kernel_sockaddr_storage))
+
+
+int compat_mc_setsockopt(struct sock *sock, int level, int optname,
+	char __user *optval, unsigned int optlen,
+	int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int))
+{
+	char __user	*koptval = optval;	/* passed through untouched by default */
+	int		koptlen = optlen;
+
+	switch (optname) {
+	case MCAST_JOIN_GROUP:
+	case MCAST_LEAVE_GROUP:
+	{
+		struct compat_group_req __user *gr32 = (void *)optval;
+		struct group_req __user *kgr =
+			compat_alloc_user_space(sizeof(struct group_req));
+		u32 interface;
+
+		/* rebuild a native group_req in the compat_alloc_user_space() area */
+		if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) ||
+		    !access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) ||
+		    __get_user(interface, &gr32->gr_interface) ||
+		    __put_user(interface, &kgr->gr_interface) ||
+		    copy_in_user(&kgr->gr_group, &gr32->gr_group,
+				sizeof(kgr->gr_group)))
+			return -EFAULT;
+		koptval = (char __user *)kgr;
+		koptlen = sizeof(struct group_req);
+		break;
+	}
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+	{
+		struct compat_group_source_req __user *gsr32 = (void *)optval;
+		struct group_source_req __user *kgsr = compat_alloc_user_space(
+			sizeof(struct group_source_req));
+		u32 interface;
+
+		/* same idea, with both the group and the source address copied */
+		if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) ||
+		    !access_ok(VERIFY_WRITE, kgsr,
+			sizeof(struct group_source_req)) ||
+		    __get_user(interface, &gsr32->gsr_interface) ||
+		    __put_user(interface, &kgsr->gsr_interface) ||
+		    copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group,
+				sizeof(kgsr->gsr_group)) ||
+		    copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source,
+				sizeof(kgsr->gsr_source)))
+			return -EFAULT;
+		koptval = (char __user *)kgsr;
+		koptlen = sizeof(struct group_source_req);
+		break;
+	}
+	case MCAST_MSFILTER:
+	{
+		struct compat_group_filter __user *gf32 = (void *)optval;
+		struct group_filter __user *kgf;
+		u32 interface, fmode, numsrc;
+
+		if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
+		    __get_user(interface, &gf32->gf_interface) ||
+		    __get_user(fmode, &gf32->gf_fmode) ||
+		    __get_user(numsrc, &gf32->gf_numsrc))
+			return -EFAULT;
+		/* grow the length by the size difference between the two layouts */
+		koptlen = optlen + sizeof(struct group_filter) -
+				sizeof(struct compat_group_filter);
+		if (koptlen < GROUP_FILTER_SIZE(numsrc))	/* too short for numsrc sources */
+			return -EINVAL;
+		kgf = compat_alloc_user_space(koptlen);
+		if (!access_ok(VERIFY_WRITE, kgf, koptlen) ||
+		    __put_user(interface, &kgf->gf_interface) ||
+		    __put_user(fmode, &kgf->gf_fmode) ||
+		    __put_user(numsrc, &kgf->gf_numsrc) ||
+		    copy_in_user(&kgf->gf_group, &gf32->gf_group,
+				sizeof(kgf->gf_group)) ||
+		    (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist,
+				numsrc * sizeof(kgf->gf_slist[0]))))
+			return -EFAULT;
+		koptval = (char __user *)kgf;
+		break;
+	}
+
+	default:
+		break;
+	}
+	return setsockopt(sock, level, optname, koptval, koptlen);	/* the caller-supplied native handler */
+}
+EXPORT_SYMBOL(compat_mc_setsockopt);
+
+int compat_mc_getsockopt(struct sock *sock, int level, int optname,
+	char __user *optval, int __user *optlen,
+	int (*getsockopt)(struct sock *, int, int, char __user *, int __user *))
+{
+	struct compat_group_filter __user *gf32 = (void *)optval;
+	struct group_filter __user *kgf;
+	int __user	*koptlen;
+	u32 interface, fmode, numsrc;
+	int klen, ulen, err;
+
+	if (optname != MCAST_MSFILTER)	/* the only option translated here */
+		return getsockopt(sock, level, optname, optval, optlen);
+
+	koptlen = compat_alloc_user_space(sizeof(*koptlen));
+	if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) ||
+	    __get_user(ulen, optlen))
+		return -EFAULT;
+
+	/* adjust len for pad */
+	klen = ulen + sizeof(*kgf) - sizeof(*gf32);
+
+	if (klen < GROUP_FILTER_SIZE(0))
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) ||
+	    __put_user(klen, koptlen))
+		return -EFAULT;
+
+	/* have to allow space for previous compat_alloc_user_space, too */
+	kgf = compat_alloc_user_space(klen+sizeof(*optlen));
+
+	/* seed the native filter with the caller's request fields */
+	if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
+	    __get_user(interface, &gf32->gf_interface) ||
+	    __get_user(fmode, &gf32->gf_fmode) ||
+	    __get_user(numsrc, &gf32->gf_numsrc) ||
+	    __put_user(interface, &kgf->gf_interface) ||
+	    __put_user(fmode, &kgf->gf_fmode) ||
+	    __put_user(numsrc, &kgf->gf_numsrc) ||
+	    copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group)))
+		return -EFAULT;
+
+	err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen);
+	if (err)
+		return err;
+
+	if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) ||
+	    __get_user(klen, koptlen))
+		return -EFAULT;
+
+	ulen = klen - (sizeof(*kgf)-sizeof(*gf32));	/* back to the compat length */
+
+	if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) ||
+	    __put_user(ulen, optlen))
+		return -EFAULT;
+
+	/* copy the answer's scalar fields back into the compat structure */
+	if (!access_ok(VERIFY_READ, kgf, klen) ||
+	    !access_ok(VERIFY_WRITE, gf32, ulen) ||
+	    __get_user(interface, &kgf->gf_interface) ||
+	    __get_user(fmode, &kgf->gf_fmode) ||
+	    __get_user(numsrc, &kgf->gf_numsrc) ||
+	    __put_user(interface, &gf32->gf_interface) ||
+	    __put_user(fmode, &gf32->gf_fmode) ||
+	    __put_user(numsrc, &gf32->gf_numsrc))
+		return -EFAULT;
+	if (numsrc) {
+		int copylen;
+
+		klen -= GROUP_FILTER_SIZE(0);	/* bytes available for source entries */
+		copylen = numsrc * sizeof(gf32->gf_slist[0]);
+		if (copylen > klen)	/* never copy more than the native call returned */
+			copylen = klen;
+		if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen))
+			return -EFAULT;
+	}
+	return err;
+}
+EXPORT_SYMBOL(compat_mc_getsockopt);
+
+
+/* Argument list sizes for compat_sys_socketcall: bytes to copy, indexed by call number */
+#define AL(x) ((x) * sizeof(u32))
+static unsigned char nas[21] = {
+	AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+	AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+	AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+	AL(4), AL(5), AL(4)
+};
+#undef AL
+
+asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
+{
+	return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); /* native entry point, MSG_CMSG_COMPAT added */
+}
+
+asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags)
+{
+	return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+			      flags | MSG_CMSG_COMPAT);	/* native helper, MSG_CMSG_COMPAT added */
+}
+
+asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags)
+{
+	return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); /* native entry point, MSG_CMSG_COMPAT added */
+}
+
+asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned flags)
+{
+	return sys_recv(fd, buf, len, flags | MSG_CMSG_COMPAT);	/* native entry point, MSG_CMSG_COMPAT added */
+}
+
+asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len,
+				    unsigned flags, struct sockaddr __user *addr,
+				    int __user *addrlen)
+{
+	return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen); /* native entry point, MSG_CMSG_COMPAT added */
+}
+
+asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
+				    unsigned vlen, unsigned int flags,
+				    struct compat_timespec __user *timeout)
+{
+	int datagrams;
+	struct timespec ktspec;	/* native copy of *timeout */
+
+	if (timeout == NULL)	/* no timeout: nothing to convert either way */
+		return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+				      flags | MSG_CMSG_COMPAT, NULL);
+
+	if (get_compat_timespec(&ktspec, timeout))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
+				   flags | MSG_CMSG_COMPAT, &ktspec);
+	/* ktspec is written back to user space only when datagrams were received */
+	if (datagrams > 0 && put_compat_timespec(&ktspec, timeout))
+		datagrams = -EFAULT;
+
+	return datagrams;
+}
+
+asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
+{
+	int ret;
+	u32 a[6];	/* at most six 32-bit arguments, see nas[] */
+	u32 a0, a1;
+
+	if (call < SYS_SOCKET || call > SYS_SENDMMSG)
+		return -EINVAL;
+	if (copy_from_user(a, args, nas[call]))	/* nas[call] = argument bytes for this call */
+		return -EFAULT;
+	a0 = a[0];
+	a1 = a[1];
+
+	/* user pointers among the arguments are widened with compat_ptr() */
+	switch (call) {
+	case SYS_SOCKET:
+		ret = sys_socket(a0, a1, a[2]);
+		break;
+	case SYS_BIND:
+		ret = sys_bind(a0, compat_ptr(a1), a[2]);
+		break;
+	case SYS_CONNECT:
+		ret = sys_connect(a0, compat_ptr(a1), a[2]);
+		break;
+	case SYS_LISTEN:
+		ret = sys_listen(a0, a1);
+		break;
+	case SYS_ACCEPT:
+		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);	/* accept4() with flags 0 */
+		break;
+	case SYS_GETSOCKNAME:
+		ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
+		break;
+	case SYS_GETPEERNAME:
+		ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
+		break;
+	case SYS_SOCKETPAIR:
+		ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
+		break;
+	case SYS_SEND:
+		ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
+		break;
+	case SYS_SENDTO:
+		ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]);
+		break;
+	case SYS_RECV:
+		ret = compat_sys_recv(a0, compat_ptr(a1), a[2], a[3]);
+		break;
+	case SYS_RECVFROM:
+		ret = compat_sys_recvfrom(a0, compat_ptr(a1), a[2], a[3],
+					  compat_ptr(a[4]), compat_ptr(a[5]));
+		break;
+	case SYS_SHUTDOWN:
+		ret = sys_shutdown(a0, a1);
+		break;
+	case SYS_SETSOCKOPT:
+		ret = compat_sys_setsockopt(a0, a1, a[2],
+				compat_ptr(a[3]), a[4]);
+		break;
+	case SYS_GETSOCKOPT:
+		ret = compat_sys_getsockopt(a0, a1, a[2],
+				compat_ptr(a[3]), compat_ptr(a[4]));
+		break;
+	case SYS_SENDMSG:
+		ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
+		break;
+	case SYS_SENDMMSG:
+		ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]);
+		break;
+	case SYS_RECVMSG:
+		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
+		break;
+	case SYS_RECVMMSG:
+		ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
+					  compat_ptr(a[4]));
+		break;
+	case SYS_ACCEPT4:
+		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
diff --git a/net/core/Makefile b/net/core/Makefile
new file mode 100644
index 00000000..0d357b1c
--- /dev/null
+++ b/net/core/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for the Linux networking core.
+# obj-y objects are always built; obj-$(CONFIG_*) ones only when that option is set.
+
+obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
+	 gen_stats.o gen_estimator.o net_namespace.o secure_seq.o
+
+obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
+
+obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+			neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+
+obj-$(CONFIG_XFRM) += flow.o
+obj-y += net-sysfs.o
+obj-$(CONFIG_NET_PKTGEN) += pktgen.o
+obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NET_DMA) += user_dma.o
+obj-$(CONFIG_FIB_RULES) += fib_rules.o
+obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
new file mode 100644
index 00000000..18ac112e
--- /dev/null
+++ b/net/core/datagram.c
@@ -0,0 +1,775 @@
+/*
+ *	SUCS NET3:
+ *
+ *	Generic datagram handling routines. These are generic for all
+ *	protocols. Possibly a generic IP version on top of these would
+ *	make sense. Not tonight however 8-).
+ *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
+ *	NetROM layer all have identical poll code and mostly
+ *	identical recvmsg() code. So we share it here. The poll was
+ *	shared before but buried in udp.c so I moved it.
+ *
+ *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
+ *						     udp.c code)
+ *
+ *	Fixes:
+ *		Alan Cox	:	NULL return from skb_peek_copy()
+ *					understood
+ *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
+ *					skb_peek_copy stuff.
+ *		Alan Cox	:	Added support for SOCK_SEQPACKET.
+ *					IPX can no longer use the SO_TYPE hack
+ *					but AX.25 now works right, and SPX is
+ *					feasible.
+ *		Alan Cox	:	Fixed write poll of non IP protocol
+ *					crash.
+ *		Florian  La Roche:	Changed for my new skbuff handling.
+ *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
+ *		Linus Torvalds	:	BSD semantic fixes.
+ *		Alan Cox	:	Datagram iovec handling
+ *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
+ *		Alan Cox	:	POSIXisms
+ *		Pete Wyckoff    :       Unconnected accept() fix.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <trace/events/skb.h>
+
+/*
+ *	Non-zero if @sk is connection oriented (SOCK_SEQPACKET or SOCK_STREAM).
+ */
+static inline int connection_based(struct sock *sk)
+{
+	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
+}
+
+static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
+				  void *key)
+{
+	unsigned long bits = (unsigned long)key;	/* key carries poll bits */
+
+	/* Skip the wakeup when the key names events but neither POLLIN
+	 * nor POLLERR is among them; a zero key is always passed on.
+	 */
+	if (bits && !(bits & (POLLIN | POLLERR)))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+/* Sleep until a packet may have arrived on @sk. Returns 0 when the caller
+ * should retry the dequeue, non-zero (with *err set) when it should give up.
+ */
+static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
+{
+	int error;
+	DEFINE_WAIT_FUNC(wait, receiver_wake_function);
+
+	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+	/* Socket errors? */
+	error = sock_error(sk);
+	if (error)
+		goto out_err;
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		goto out;	/* data already queued: return 0, no sleep */
+
+	/* Socket shut down? */
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		goto out_noerr;
+
+	/* Sequenced packets can come disconnected.
+	 * If so we report the problem
+	 */
+	error = -ENOTCONN;
+	if (connection_based(sk) &&
+	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
+		goto out_err;
+
+	/* handle signals */
+	if (signal_pending(current))
+		goto interrupted;
+
+	error = 0;
+	*timeo_p = schedule_timeout(*timeo_p);	/* sleep; keep returned timeout */
+out:
+	finish_wait(sk_sleep(sk), &wait);
+	return error;
+interrupted:
+	error = sock_intr_errno(*timeo_p);
+out_err:
+	*err = error;
+	goto out;
+out_noerr:
+	*err = 0;	/* receive side shut down: not reported as an error */
+	error = 1;	/* but still non-zero, so the caller stops waiting */
+	goto out;
+}
+
+/**
+ *	__skb_recv_datagram - Receive a datagram skbuff
+ *	@sk: socket
+ *	@flags: MSG_ flags
+ *	@peeked: returns non-zero if this packet has been seen before
+ *	@err: error code, written only when no skb is returned
+ *
+ *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
+ *	and possible races. This replaces identical code in packet, raw and
+ *	udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
+ *	the long standing peek and read race for datagram sockets. If you
+ *	alter this routine remember it must be re-entrant.
+ *
+ *	Returns the skb, or NULL with *@err set.  The socket itself is not
+ *	locked here (see the note below); the caller releases a returned skb,
+ *	usually by calling skb_free_datagram().
+ *
+ *	* It does not lock socket since today. This function is
+ *	* free of race conditions. This measure should/can improve
+ *	* significantly datagram socket latencies at high loads,
+ *	* when data copying to user space takes lots of time.
+ *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ *	*  8) Great win.)
+ *	*			                    --ANK (980729)
+ *
+ *	The order of the tests when we find no data waiting are specified
+ *	quite explicitly by POSIX 1003.1g, don't change them without having
+ *	the standard around please.
+ */
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
+				    int *peeked, int *err)
+{
+	struct sk_buff *skb;
+	long timeo;
+	/*
+	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
+	 */
+	int error = sock_error(sk);
+
+	if (error)
+		goto no_packet;
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	do {
+		/* Again only user level code calls this function, so nothing
+		 * interrupt level will suddenly eat the receive_queue.
+		 *
+		 * Look at current nfs client by the way...
+		 * However, this function was correct in any case. 8)
+		 */
+		unsigned long cpu_flags;
+
+		spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb) {
+			*peeked = skb->peeked;	/* set by an earlier MSG_PEEK */
+			if (flags & MSG_PEEK) {
+				skb->peeked = 1;
+				atomic_inc(&skb->users); /* skb stays queued */
+			} else
+				__skb_unlink(skb, &sk->sk_receive_queue);
+		}
+		spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+
+		if (skb)
+			return skb;
+
+		/* User doesn't want to wait */
+		error = -EAGAIN;
+		if (!timeo)
+			goto no_packet;
+
+	} while (!wait_for_packet(sk, err, &timeo));
+
+	return NULL;	/* *err was already set by wait_for_packet() */
+
+no_packet:
+	*err = error;
+	return NULL;
+}
+EXPORT_SYMBOL(__skb_recv_datagram);
+
+struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
+				  int noblock, int *err)
+{
+	int peeked;	/* needed by __skb_recv_datagram(); value is discarded */
+
+	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				   &peeked, err);	/* noblock -> MSG_DONTWAIT */
+}
+EXPORT_SYMBOL(skb_recv_datagram);
+
+void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
+{
+	consume_skb(skb);		/* release the received skb */
+	sk_mem_reclaim_partial(sk);	/* NOTE(review): helper not shown here */
+}
+EXPORT_SYMBOL(skb_free_datagram);
+
+void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
+{
+	bool slow;	/* lock_sock_fast() result, handed to unlock_sock_fast() */
+
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();	/* sole reference: no atomic decrement needed */
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;		/* other references remain: do not free */
+
+	slow = lock_sock_fast(sk);
+	skb_orphan(skb);
+	sk_mem_reclaim_partial(sk);
+	unlock_sock_fast(sk, slow);
+
+	/* skb is now orphaned, can be freed outside of locked section */
+	trace_kfree_skb(skb, skb_free_datagram_locked);
+	__kfree_skb(skb);
+}
+EXPORT_SYMBOL(skb_free_datagram_locked);
+
+/**
+ *	skb_kill_datagram - Free a datagram skbuff forcibly
+ *	@sk: socket
+ *	@skb: datagram skbuff
+ *	@flags: MSG_ flags
+ *
+ *	This function frees a datagram skbuff that was received by
+ *	skb_recv_datagram.  The flags argument must match the one
+ *	used for skb_recv_datagram.
+ *
+ *	If the MSG_PEEK flag is set, and the packet is still on the
+ *	receive queue of the socket, it will be taken off the queue
+ *	before it is freed.
+ *
+ *	This function currently only disables BH when acquiring the
+ *	sk_receive_queue lock.  Therefore it must not be used in a
+ *	context where that lock is acquired in an IRQ context.
+ *
+ *	Returns 0 if the packet was removed by us, otherwise -ENOENT.
+ */
+
+int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+	int err = 0;
+
+	if (flags & MSG_PEEK) {
+		err = -ENOENT;	/* kept unless skb is still the queue head */
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			atomic_dec(&skb->users);	/* drop the peek reference */
+			err = 0;
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+
+	kfree_skb(skb);
+	atomic_inc(&sk->sk_drops);	/* count the packet as dropped */
+	sk_mem_reclaim_partial(sk);
+
+	return err;
+}
+EXPORT_SYMBOL(skb_kill_datagram);
+
+/**
+ *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: io vector to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ *
+ *	Returns 0 or -EFAULT.  Note: the iovec is modified during the copy.
+ */
+int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
+			    struct iovec *to, int len)
+{
+	int start = skb_headlen(skb);	/* where the current piece starts */
+	int i, copy = start - offset;	/* bytes left in the linear head */
+	struct sk_buff *frag_iter;
+
+	trace_skb_copy_datagram_iovec(skb, len);
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (memcpy_toiovec(to, skb->data + offset, copy))
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix: frags[i] covers skb offsets [start, end). */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			int err;
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			err = memcpy_toiovec(to, vaddr + frag->page_offset +
+					     offset - start, copy);
+			kunmap(page);
+			if (err)
+				goto fault;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {	/* then recurse into each such skb */
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_iovec(frag_iter,
+						    offset - start,
+						    to, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:	/* a copy failed, or the skb ran out before @len bytes were copied */
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iovec);
+
+/**
+ *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: io vector to copy to
+ *	@to_offset: offset in the io vector to start copying to
+ *	@len: amount of data to copy from buffer to iovec
+ *
+ *	Returns 0 or -EFAULT (also when fewer than @len bytes are available).
+ *	Note: the iovec is not modified during the copy.
+ */
+int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
+				  const struct iovec *to, int to_offset,
+				  int len)
+{
+	int start = skb_headlen(skb);	/* where the current piece starts */
+	int i, copy = start - offset;	/* bytes left in the linear head */
+	struct sk_buff *frag_iter;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+		to_offset += copy;	/* iovec is const: track position here */
+	}
+
+	/* Copy paged appendix: frags[i] covers skb offsets [start, end). */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			int err;
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
+						offset - start, to_offset, copy);
+			kunmap(page);
+			if (err)
+				goto fault;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+			to_offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {	/* then recurse into each such skb */
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_const_iovec(frag_iter,
+							  offset - start,
+							  to, to_offset,
+							  copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+			to_offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
+
+/**
+ *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ *	@skb: buffer to copy into
+ *	@offset: offset in the buffer to start copying to
+ *	@from: io vector to copy from
+ *	@from_offset: offset in the io vector to start copying from
+ *	@len: amount of data to copy to buffer from iovec
+ *
+ *	Returns 0 or -EFAULT (also when @skb has less than @len bytes of room).
+ *	Note: the iovec is not modified during the copy.
+ */
+int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
+				 const struct iovec *from, int from_offset,
+				 int len)
+{
+	int start = skb_headlen(skb);	/* where the current piece starts */
+	int i, copy = start - offset;	/* room left in the linear head */
+	struct sk_buff *frag_iter;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
+					copy))
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+		from_offset += copy;	/* iovec is const: track position here */
+	}
+
+	/* Copy paged appendix: frags[i] covers skb offsets [start, end). */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			int err;
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			err = memcpy_fromiovecend(vaddr + frag->page_offset +
+						  offset - start,
+						  from, from_offset, copy);
+			kunmap(page);
+			if (err)
+				goto fault;
+
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+			from_offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {	/* then recurse into each such skb */
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_from_iovec(frag_iter,
+							 offset - start,
+							 from,
+							 from_offset,
+							 copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+			from_offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+				      u8 __user *to, int len,
+				      __wsum *csump)	/* in/out: running checksum */
+{
+	int start = skb_headlen(skb);	/* where the current piece starts */
+	int i, copy = start - offset;	/* bytes left in the linear head */
+	struct sk_buff *frag_iter;
+	int pos = 0;	/* bytes copied so far, passed to csum_block_add() */
+
+	/* Copy header. */
+	if (copy > 0) {
+		int err = 0;
+		if (copy > len)
+			copy = len;
+		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
+					       *csump, &err);
+		if (err)
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+		to += copy;
+		pos = copy;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {	/* page frags */
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			__wsum csum2;	/* checksum of this piece alone */
+			int err = 0;
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			csum2 = csum_and_copy_to_user(vaddr +
+							frag->page_offset +
+							offset - start,
+						      to, copy, 0, &err);
+			kunmap(page);
+			if (err)
+				goto fault;
+			*csump = csum_block_add(*csump, csum2, pos);
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+			to += copy;
+			pos += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {	/* then recurse into each such skb */
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			__wsum csum2 = 0;	/* recursion starts from zero */
+			if (copy > len)
+				copy = len;
+			if (skb_copy_and_csum_datagram(frag_iter,
+						       offset - start,
+						       to, copy,
+						       &csum2))
+				goto fault;
+			*csump = csum_block_add(*csump, csum2, pos);
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+			to += copy;
+			pos += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:	/* a copy failed, or the skb ran out before @len bytes were copied */
+	return -EFAULT;
+}
+
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
+{
+	__sum16 sum;	/* folded checksum over the first @len bytes */
+
+	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+	if (likely(!sum)) {	/* zero: mark the skb as needing no more checks */
+		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+			netdev_rx_csum_fault(skb->dev);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return sum;	/* non-zero is treated as a checksum failure by callers */
+}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
+
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+	return __skb_checksum_complete_head(skb, skb->len);	/* whole skb */
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
+/**
+ *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
+ *	@skb: skbuff
+ *	@hlen: leading bytes (headers) that are checksummed but not copied
+ *	@iov: io vector
+ *
+ *	Caller _must_ check that skb will fit to this iovec.
+ *
+ *	Returns: 0       - success.
+ *		 -EINVAL - checksum failure.
+ *		 -EFAULT - fault during copy. Beware, in this case iovec
+ *			   can be modified!
+ */
+int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
+				     int hlen, struct iovec *iov)
+{
+	__wsum csum;
+	int chunk = skb->len - hlen;	/* bytes to hand to the iovec */
+
+	if (!chunk)
+		return 0;
+
+	/* Skip filled elements.
+	 * Pretty silly, look at memcpy_toiovec, though 8)
+	 */
+	while (!iov->iov_len)
+		iov++;
+
+	if (iov->iov_len < chunk) {	/* needs several elements: verify, then copy */
+		if (__skb_checksum_complete(skb))
+			goto csum_error;
+		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
+			goto fault;
+	} else {	/* fits in one element: checksum while copying */
+		csum = csum_partial(skb->data, hlen, skb->csum);
+		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
+					       chunk, &csum))
+			goto fault;
+		if (csum_fold(csum))
+			goto csum_error;
+		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+			netdev_rx_csum_fault(skb->dev);
+		iov->iov_len -= chunk;	/* advance the element by hand */
+		iov->iov_base += chunk;
+	}
+	return 0;
+csum_error:
+	return -EINVAL;
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
+
+/**
+ * 	datagram_poll - generic datagram poll
+ *	@file: file struct
+ *	@sock: socket
+ *	@wait: poll table
+ *
+ *	Datagram poll: Again totally generic. This also handles sequenced
+ *	packet sockets providing the socket receive queue is only ever
+ *	holding data ready to receive.  Returns the POLL* event mask.
+ *
+ *	Note: when you _don't_ use this routine for this protocol,
+ *	and you use a different write policy from sock_writeable()
+ *	then please supply your own write_space callback.
+ */
+unsigned int datagram_poll(struct file *file, struct socket *sock,
+			   poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int mask;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	/* exceptional events? */
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		mask |= POLLHUP;
+
+	/* readable? */
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	/* Connection-based need to check for termination and startup */
+	if (connection_based(sk)) {
+		if (sk->sk_state == TCP_CLOSE)
+			mask |= POLLHUP;
+		/* connection hasn't started yet? */
+		if (sk->sk_state == TCP_SYN_SENT)
+			return mask;	/* writability is not reported yet */
+	}
+
+	/* writable? */
+	if (sock_writeable(sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+	else	/* record on the socket that no write space was found */
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	return mask;
+}
+EXPORT_SYMBOL(datagram_poll);
diff --git a/net/core/dev.c b/net/core/dev.c
new file mode 100644
index 00000000..a71eafc3
--- /dev/null
+++ b/net/core/dev.c
@@ -0,0 +1,6522 @@
+/*
+ * 	NET3	Protocol independent device support routines.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Derived from the non IP parts of dev.c 1.0.19
+ * 		Authors:	Ross Biro
+ *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *				Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ *	Additional Authors:
+ *		Florian la Roche <rzsfl@rz.uni-sb.de>
+ *		Alan Cox <gw4pts@gw4pts.ampr.org>
+ *		David Hinds <dahinds@users.sourceforge.net>
+ *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *		Adam Sulmicki <adam@cfar.umd.edu>
+ *              Pekka Riikonen <priikone@poesidon.pspt.fi>
+ *
+ *	Changes:
+ *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
+ *              			to 2 if register_netdev gets called
+ *              			before net_dev_init & also removed a
+ *              			few lines of code in the process.
+ *		Alan Cox	:	device private ioctl copies fields back.
+ *		Alan Cox	:	Transmit queue code does relevant
+ *					stunts to keep the queue safe.
+ *		Alan Cox	:	Fixed double lock.
+ *		Alan Cox	:	Fixed promisc NULL pointer trap
+ *		????????	:	Support the full private ioctl range
+ *		Alan Cox	:	Moved ioctl permission check into
+ *					drivers
+ *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
+ *		Alan Cox	:	100 backlog just doesn't cut it when
+ *					you start doing multicast video 8)
+ *		Alan Cox	:	Rewrote net_bh and list manager.
+ *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
+ *		Alan Cox	:	Took out transmit every packet pass
+ *					Saved a few bytes in the ioctl handler
+ *		Alan Cox	:	Network driver sets packet type before
+ *					calling netif_rx. Saves a function
+ *					call a packet.
+ *		Alan Cox	:	Hashed net_bh()
+ *		Richard Kooijman:	Timestamp fixes.
+ *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
+ *		Alan Cox	:	Device lock protection.
+ *		Alan Cox	: 	Fixed nasty side effect of device close
+ *					changes.
+ *		Rudi Cilibrasi	:	Pass the right thing to
+ *					set_mac_address()
+ *		Dave Miller	:	32bit quantity for the device lock to
+ *					make it work out on a Sparc.
+ *		Bjorn Ekwall	:	Added KERNELD hack.
+ *		Alan Cox	:	Cleaned up the backlog initialise.
+ *		Craig Metz	:	SIOCGIFCONF fix if space for under
+ *					1 device.
+ *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
+ *					is no device open function.
+ *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
+ *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
+ *		Cyrus Durgin	:	Cleaned for KMOD
+ *		Adam Sulmicki   :	Bug Fix : Network Device Unload
+ *					A network device unload needs to purge
+ *					the backlog queue.
+ *	Paul Rusty Russell	:	SIOCSIFNAME
+ *              Pekka Riikonen  :	Netdev boot-time settings code
+ *              Andrew Morton   :       Make unregister_netdevice wait
+ *              			indefinitely on dev->refcnt
+ * 		J Hadi Salim	:	- Backlog queue sampling
+ *				        - netif_rx() feedback
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/notifier.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <net/dst.h>
+#include <net/pkt_sched.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/netpoll.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+#include <net/wext.h>
+#include <net/iw_handler.h>
+#include <asm/current.h>
+#include <linux/audit.h>
+#include <linux/dmaengine.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
+#include <linux/pci.h>
+#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
+
+#include "net-sysfs.h"
+
+/* Instead of increasing this, you should create a hash table. */
+#define MAX_GRO_SKBS 8
+
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
+
+/*
+ *	The list of packet types we will receive (as opposed to discard)
+ *	and the routines to invoke.
+ *
+ *	Why 16. Because with 16 the only overlap we get on a hash of the
+ *	low nibble of the protocol value is RARP/SNAP/X.25.
+ *
+ *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
+ *             sure which should go first, but I bet it won't make much
+ *             difference if we are running VLANs.  The good news is that
+ *             this protocol won't be in the list unless compiled in, so
+ *             the average user (w/out VLANs) will not be adversely affected.
+ *             --BLG
+ *
+ *		0800	IP
+ *		8100    802.1Q VLAN
+ *		0001	802.3
+ *		0002	AX.25
+ *		0004	802.2
+ *		8035	RARP
+ *		0005	SNAP
+ *		0805	X.25
+ *		0806	ARP
+ *		8137	IPX
+ *		0009	Localtalk
+ *		86DD	IPv6
+ */
+
+#define PTYPE_HASH_SIZE	(16)
+#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
+
+static DEFINE_SPINLOCK(ptype_lock);
+static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+static struct list_head ptype_all __read_mostly;	/* Taps */
+
+/*
+ * The @dev_base_head list is protected by @dev_base_lock and the rtnl
+ * semaphore.
+ *
+ * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
+ *
+ * Writers must hold the rtnl semaphore while they loop through the
+ * dev_base_head list, and hold dev_base_lock for writing when they do the
+ * actual updates.  This allows pure readers to access the list even
+ * while a writer is preparing to update it.
+ *
+ * To put it another way, dev_base_lock is held for writing only to
+ * protect against pure readers; the rtnl semaphore provides the
+ * protection against other writers.
+ *
+ * See, for example usages, register_netdevice() and
+ * unregister_netdevice(), which must be called with the rtnl
+ * semaphore held.
+ */
+DEFINE_RWLOCK(dev_base_lock);
+EXPORT_SYMBOL(dev_base_lock);
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); /* <= IFNAMSIZ bytes */
+	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; /* bucket in @net */
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; /* masked ifindex picks the bucket */
+}
+
+static inline void rps_lock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS	/* compiles to nothing when RPS is not configured */
+	spin_lock(&sd->input_pkt_queue.lock);
+#endif /* CONFIG_RPS */
+}
+
+static inline void rps_unlock(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS	/* compiles to nothing when RPS is not configured */
+	spin_unlock(&sd->input_pkt_queue.lock);
+#endif /* CONFIG_RPS */
+}
+
+/* Device list insertion: link @dev into its namespace's list and hashes. */
+static int list_netdevice(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+
+	ASSERT_RTNL();
+
+	write_lock_bh(&dev_base_lock);
+	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
+	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_index_hash(net, dev->ifindex));
+	write_unlock_bh(&dev_base_lock);
+	return 0;	/* no failure path */
+}
+
+/* Device list removal: unlink dev from the device list and both hashes.
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
+static void unlist_netdevice(struct net_device *dev)
+{
+	ASSERT_RTNL();
+
+	/* Unlink dev from the device chain */
+	write_lock_bh(&dev_base_lock);
+	list_del_rcu(&dev->dev_list);
+	hlist_del_rcu(&dev->name_hlist);
+	hlist_del_rcu(&dev->index_hlist);
+	write_unlock_bh(&dev_base_lock);
+}
+
+/*
+ *	Our notifier list
+ */
+
+static RAW_NOTIFIER_HEAD(netdev_chain);
+
+/*
+ *	Device drivers call our routines to queue packets here. We empty the
+ *	queue in the local softnet handler.
+ */
+
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+EXPORT_PER_CPU_SYMBOL(softnet_data);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] =
+	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
+	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
+	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
+	 ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] =
+	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
+	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
+	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
+	 "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+	int i;	/* index into netdev_lock_type[] */
+
+	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+		if (netdev_lock_type[i] == dev_type)
+			return i;
+	/* the last key (the ARPHRD_NONE slot) is used by default */
+	return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+						 unsigned short dev_type)
+{
+	int i;
+
+	i = netdev_lock_pos(dev_type);	/* key/name slot for this device type */
+	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+				   netdev_lock_name[i]);
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+	int i;
+
+	i = netdev_lock_pos(dev->type);	/* key/name slot for this device type */
+	lockdep_set_class_and_name(&dev->addr_list_lock,
+				   &netdev_addr_lock_key[i],
+				   netdev_lock_name[i]);	/* same names as the xmit locks */
+}
+#else	/* !CONFIG_LOCKDEP: the class setters are empty stubs */
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+						 unsigned short dev_type)
+{
+}
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
+#endif /* CONFIG_LOCKDEP */
+
+/*******************************************************************************
+
+		Protocol management and registration routines
+
+*******************************************************************************/
+
+/*
+ *	Add a protocol ID to the list. Now that the input handler is
+ *	smarter we can dispense with all the messy stuff that used to be
+ *	here.
+ *
+ *	BEWARE!!! Protocol handlers, mangling input packets,
+ *	MUST BE last in hash buckets and checking protocol handlers
+ *	MUST start from promiscuous ptype_all chain in net_bh.
+ *	It is true now, do not change it.
+ *	Explanation follows: if protocol handler, mangling packet, will
+ *	be the first on list, it is not able to sense, that packet
+ *	is cloned and should be copied-on-write, so that it will
+ *	change it and subsequent readers will get broken packet.
+ *							--ANK (980803)
+ */
+
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+	/* ETH_P_ALL handlers share one list; all others hash by protocol. */
+	if (pt->type != htons(ETH_P_ALL))
+		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+	return &ptype_all;
+}
+
+/**
+ *	dev_add_pack - add packet handler
+ *	@pt: packet type declaration
+ *
+ *	Add a protocol handler to the networking stack. The passed &packet_type
+ *	is linked into kernel lists and may not be freed until it has been
+ *	removed from the kernel lists.
+ *
+ *	This call does not sleep, so it cannot guarantee that every CPU
+ *	that is in the middle of receiving a packet will see the new
+ *	packet type (until the next received packet).
+ */
+
+void dev_add_pack(struct packet_type *pt)
+{
+	struct list_head *head = ptype_head(pt);	/* ptype_all or a ptype_base bucket */
+
+	spin_lock(&ptype_lock);		/* list updates are done under ptype_lock */
+	list_add_rcu(&pt->list, head);
+	spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(dev_add_pack);
+
+/**
+ *	__dev_remove_pack	 - remove packet handler
+ *	@pt: packet type declaration
+ *
+ *	Remove a protocol handler that was previously added to the kernel
+ *	protocol handlers by dev_add_pack(). The passed &packet_type is
+ *	unlinked from the kernel lists; if it is not found on its list a
+ *	warning is logged and nothing is changed.
+ *
+ *	The packet type might still be in use by receivers
+ *	and must not be freed until after all the CPUs have gone
+ *	through a quiescent state.
+ */
+void __dev_remove_pack(struct packet_type *pt)
+{
+	struct list_head *head = ptype_head(pt);
+	struct packet_type *pt1;
+
+	spin_lock(&ptype_lock);
+
+	list_for_each_entry(pt1, head, list) {	/* confirm pt really is on its list */
+		if (pt == pt1) {
+			list_del_rcu(&pt->list);
+			goto out;
+		}
+	}
+	/* Falling out of the loop means pt was not found on this list. */
+	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
+out:
+	spin_unlock(&ptype_lock);
+}
+EXPORT_SYMBOL(__dev_remove_pack);
+
+/**
+ *	dev_remove_pack	 - remove packet handler
+ *	@pt: packet type declaration
+ *
+ *	Remove a protocol handler that was previously added to the kernel
+ *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
+ *	from the kernel lists and can be freed or reused once this function
+ *	returns.
+ *
+ *	This call sleeps to guarantee that no CPU is looking at the packet
+ *	type after return.
+ */
+void dev_remove_pack(struct packet_type *pt)
+{
+	__dev_remove_pack(pt);	/* unlink only; no waiting */
+
+	synchronize_net();	/* then wait so that no receiver can still be using pt */
+}
+EXPORT_SYMBOL(dev_remove_pack);
+
+/******************************************************************************
+
+		      Device Boot-time Settings Routines
+
+*******************************************************************************/
+
+/* Boot time configuration table */
+static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+
+/**
+ *	netdev_boot_setup_add	- add new setup entry
+ *	@name: name of the device
+ *	@map: configured settings for the device
+ *
+ *	Adds new setup entry to the dev_boot_setup list.  The function
+ *	returns 0 on error and 1 on success.  This is a generic routine to
+ *	all netdevices.
+ */
+static int netdev_boot_setup_add(char *name, struct ifmap *map)
+{
+	struct netdev_boot_setup *slot = dev_boot_setup;
+	struct netdev_boot_setup *end = slot + NETDEV_BOOT_SETUP_MAX;
+
+	/* Claim the first slot whose name is empty or starts with a blank. */
+	for (; slot < end; slot++) {
+		if (slot->name[0] != '\0' && slot->name[0] != ' ')
+			continue;
+		memset(slot->name, 0, sizeof(slot->name));
+		strlcpy(slot->name, name, IFNAMSIZ);
+		memcpy(&slot->map, map, sizeof(slot->map));
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ *	netdev_boot_setup_check	- check boot time settings
+ *	@dev: the netdevice
+ *
+ *	Look up @dev by name in the boot-time setup table. On a match the
+ *	stored irq, base_addr, mem_start and mem_end are copied into @dev
+ *	for use by later device probing.
+ *	Returns 1 if settings were found and applied, 0 otherwise.
+ */
+int netdev_boot_setup_check(struct net_device *dev)
+{
+	const struct netdev_boot_setup *entry = dev_boot_setup;
+	int left = NETDEV_BOOT_SETUP_MAX;
+
+	for (; left > 0; left--, entry++) {
+		if (entry->name[0] == '\0' || entry->name[0] == ' ' ||
+		    strcmp(dev->name, entry->name))
+			continue;	/* unused slot or a different name */
+		dev->irq = entry->map.irq;
+		dev->base_addr = entry->map.base_addr;
+		dev->mem_start = entry->map.mem_start;
+		dev->mem_end = entry->map.mem_end;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(netdev_boot_setup_check);
+
+
+/**
+ *	netdev_boot_base	- get address from boot time settings
+ *	@prefix: prefix for network device
+ *	@unit: id for network device
+ *
+ *	Build the name "<prefix><unit>", truncated to fit IFNAMSIZ, and look
+ *	it up. Returns 1 if a device of that name is already registered in
+ *	init_net, the configured base address if a boot-time entry matches,
+ *	and 0 if no settings are found.
+ */
+unsigned long netdev_boot_base(const char *prefix, int unit)
+{
+	const struct netdev_boot_setup *s = dev_boot_setup;
+	char name[IFNAMSIZ];
+	int i;
+
+	snprintf(name, sizeof(name), "%s%d", prefix, unit);	/* bounded by name[] */
+
+	/*
+	 * If device already registered then return base of 1
+	 * to indicate not to probe for this interface
+	 */
+	if (__dev_get_by_name(&init_net, name))
+		return 1;
+
+	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
+		if (!strcmp(name, s[i].name))
+			return s[i].map.base_addr;
+	return 0;
+}
+
+/*
+ * Parse one "netdev=" boot parameter and save its settings for a netdevice.
+ */
+int __init netdev_boot_setup(char *str)
+{
+	int ints[5];
+	struct ifmap map;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	if (!str || !*str)
+		return 0;	/* no text left to serve as the device name */
+
+	/* Save settings; ints[0] gates how many of ints[1..4] are used */
+	memset(&map, 0, sizeof(map));
+	if (ints[0] > 0)
+		map.irq = ints[1];
+	if (ints[0] > 1)
+		map.base_addr = ints[2];
+	if (ints[0] > 2)
+		map.mem_start = ints[3];
+	if (ints[0] > 3)
+		map.mem_end = ints[4];
+
+	/* Add new entry to the list */
+	return netdev_boot_setup_add(str, &map);
+}
+
+__setup("netdev=", netdev_boot_setup);
+
+/*******************************************************************************
+
+			    Device Interface Subroutines
+
+*******************************************************************************/
+
+/**
+ *	__dev_get_by_name	- find a device by its name
+ *	@net: the applicable net namespace
+ *	@name: name to find
+ *
+ *	Find an interface by name. Must be called under RTNL semaphore
+ *	or @dev_base_lock. If the name is found a pointer to the device
+ *	is returned. If the name is not found then %NULL is returned. The
+ *	reference counters are not incremented so the caller must be
+ *	careful with locks.
+ */
+
+struct net_device *__dev_get_by_name(struct net *net, const char *name)
+{
+	struct hlist_head *bucket = dev_name_hash(net, name);
+	struct net_device *cur;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(cur, pos, bucket, name_hlist) {
+		if (strncmp(cur->name, name, IFNAMSIZ) == 0)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_name);
+
+/**
+ *	dev_get_by_name_rcu	- find a device by its name
+ *	@net: the applicable net namespace
+ *	@name: name to find
+ *
+ *	Find an interface by name.
+ *	If the name is found a pointer to the device is returned.
+ * 	If the name is not found then %NULL is returned.
+ *	The reference counters are not incremented so the caller must be
+ *	careful with locks. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
+{
+	struct hlist_head *bucket = dev_name_hash(net, name);
+	struct net_device *cur;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry_rcu(cur, pos, bucket, name_hlist) {
+		if (strncmp(cur->name, name, IFNAMSIZ) == 0)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_name_rcu);
+
+/**
+ *	dev_get_by_name		- find a device by its name
+ *	@net: the applicable net namespace
+ *	@name: name to find
+ *
+ *	Find an interface by name. This can be called from any
+ *	context and does its own locking. The returned handle has
+ *	the usage count incremented and the caller must use dev_put() to
+ *	release it when it is no longer needed. %NULL is returned if no
+ *	matching device is found.
+ */
+
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+	struct net_device *found;
+
+	rcu_read_lock();
+	found = dev_get_by_name_rcu(net, name);
+	if (found != NULL)
+		dev_hold(found);	/* pin it before leaving the RCU section */
+	rcu_read_unlock();
+	return found;
+}
+EXPORT_SYMBOL(dev_get_by_name);
+
+/**
+ *	__dev_get_by_index - find a device by its ifindex
+ *	@net: the applicable net namespace
+ *	@ifindex: index of device
+ *
+ *	Search for an interface by index. Returns %NULL if the device
+ *	is not found or a pointer to the device. The device has not
+ *	had its reference counter increased so the caller must be careful
+ *	about locking. The caller must hold either the RTNL semaphore
+ *	or @dev_base_lock.
+ */
+
+struct net_device *__dev_get_by_index(struct net *net, int ifindex)
+{
+	struct hlist_head *bucket = dev_index_hash(net, ifindex);
+	struct net_device *cur;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(cur, pos, bucket, index_hlist) {
+		if (cur->ifindex == ifindex)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(__dev_get_by_index);
+
+/**
+ *	dev_get_by_index_rcu - find a device by its ifindex
+ *	@net: the applicable net namespace
+ *	@ifindex: index of device
+ *
+ *	Search for an interface by index. Returns %NULL if the device
+ *	is not found or a pointer to the device. The device has not
+ *	had its reference counter increased so the caller must be careful
+ *	about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+	struct hlist_head *bucket = dev_index_hash(net, ifindex);
+	struct net_device *cur;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry_rcu(cur, pos, bucket, index_hlist) {
+		if (cur->ifindex == ifindex)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
+
+/**
+ *	dev_get_by_index - find a device by its ifindex
+ *	@net: the applicable net namespace
+ *	@ifindex: index of device
+ *
+ *	Search for an interface by index. Returns NULL if the device
+ *	is not found or a pointer to the device. The device returned has
+ *	had a reference added and the pointer is safe until the user calls
+ *	dev_put to indicate they have finished with it.
+ */
+
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+	struct net_device *found;
+
+	rcu_read_lock();
+	found = dev_get_by_index_rcu(net, ifindex);
+	if (found != NULL)
+		dev_hold(found);	/* pin it before leaving the RCU section */
+	rcu_read_unlock();
+	return found;
+}
+EXPORT_SYMBOL(dev_get_by_index);
+
+/**
+ *	dev_getbyhwaddr_rcu - find a device by its hardware address
+ *	@net: the applicable net namespace
+ *	@type: media type of device
+ *	@ha: hardware address
+ *
+ *	Search for an interface by MAC address. Returns NULL if the device
+ *	is not found or a pointer to the device.
+ *	The caller must hold RCU or RTNL.
+ *	The returned device has not had its ref count increased
+ *	and the caller must therefore be careful about locking
+ *
+ */
+
+struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
+				       const char *ha)
+{
+	struct net_device *cur;
+
+	for_each_netdev_rcu(net, cur) {
+		if (cur->type == type &&
+		    memcmp(cur->dev_addr, ha, cur->addr_len) == 0)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+
+struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+	struct net_device *cur;
+
+	ASSERT_RTNL();
+	for_each_netdev(net, cur) {
+		if (cur->type == type)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(__dev_getfirstbyhwtype);
+
+struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
+{
+	struct net_device *dev, *ret = NULL;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev)
+		if (dev->type == type) {
+			dev_hold(dev);	/* reference taken for the caller */
+			ret = dev;
+			break;
+		}
+	rcu_read_unlock();
+	return ret;	/* NULL when no device has this type */
+}
+EXPORT_SYMBOL(dev_getfirstbyhwtype);
+
+/**
+ *	dev_get_by_flags_rcu - find any device with given flags
+ *	@net: the applicable net namespace
+ *	@if_flags: IFF_* values
+ *	@mask: bitmask of bits in if_flags to check
+ *
+ *	Search for any interface with the given flags. Returns NULL if a device
+ *	is not found or a pointer to the device. Must be called inside
+ *	rcu_read_lock(), and result refcount is unchanged.
+ */
+
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
+				    unsigned short mask)
+{
+	struct net_device *cur;
+
+	for_each_netdev_rcu(net, cur) {
+		/* Bits that differ between the device flags and if_flags. */
+		unsigned int diff = cur->flags ^ if_flags;
+
+		if ((diff & mask) == 0)
+			return cur;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_flags_rcu);
+
+/**
+ *	dev_valid_name - check if name is okay for network device
+ *	@name: name string
+ *
+ *	Network device names need to be valid file names to
+ *	allow sysfs to work.  We also disallow any kind of
+ *	whitespace.  Returns 1 for an acceptable name, 0 otherwise.
+ */
+int dev_valid_name(const char *name)
+{
+	const char *p;
+
+	if (name[0] == '\0' || strlen(name) >= IFNAMSIZ)
+		return 0;
+	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return 0;
+
+	/* Reject path separators and any whitespace character. */
+	for (p = name; *p != '\0'; p++) {
+		if (*p == '/' || isspace(*p))
+			return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(dev_valid_name);
+
+/**
+ *	__dev_alloc_name - allocate a name for a device
+ *	@net: network namespace to allocate the device name in
+ *	@name: name format string
+ *	@buf:  scratch buffer and result name string
+ *
+ *	Passed a format string - eg "lt%d" it will try and find a suitable
+ *	id. It scans list of devices to build up a free map, then chooses
+ *	the first empty slot. The caller must hold the dev_base or rtnl lock
+ *	while allocating the name and adding the device in order to avoid
+ *	duplicates.
+ *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ *	Returns the number of the unit assigned or a negative errno code.
+ */
+
+static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+{
+	int i = 0;
+	const char *p;
+	const int max_netdevices = 8*PAGE_SIZE;	/* bits in the one-page map below */
+	unsigned long *inuse;
+	struct net_device *d;
+
+	p = strnchr(name, IFNAMSIZ-1, '%');
+	if (p) {
+		/*
+		 * Verify the string as this thing may have come from
+		 * the user.  There must be exactly one "%d" and no other "%"
+		 * characters.
+		 */
+		if (p[1] != 'd' || strchr(p + 2, '%'))
+			return -EINVAL;
+
+		/* Use one page as a bit array of possible slots */
+		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
+		if (!inuse)
+			return -ENOMEM;
+
+		for_each_netdev(net, d) {
+			if (!sscanf(d->name, name, &i))
+				continue;	/* name does not fit the pattern */
+			if (i < 0 || i >= max_netdevices)
+				continue;	/* unit would not fit in the map */
+
+			/*  avoid cases where sscanf is not exact inverse of printf */
+			snprintf(buf, IFNAMSIZ, name, i);
+			if (!strncmp(buf, d->name, IFNAMSIZ))
+				set_bit(i, inuse);
+		}
+
+		i = find_first_zero_bit(inuse, max_netdevices);
+		free_page((unsigned long) inuse);
+	}
+
+	if (buf != name)
+		snprintf(buf, IFNAMSIZ, name, i);	/* a name without '%' is simply copied */
+	if (!__dev_get_by_name(net, buf))
+		return i;
+
+	/* It is possible to run out of possible slots
+	 * when the name is long and there isn't enough space left
+	 * for the digits, or if all bits are used.
+	 */
+	return -ENFILE;
+}
+
+/**
+ *	dev_alloc_name - allocate a name for a device
+ *	@dev: device
+ *	@name: name format string
+ *
+ *	Passed a format string - eg "lt%d" it will try and find a suitable
+ *	id. It scans list of devices to build up a free map, then chooses
+ *	the first empty slot. The caller must hold the dev_base or rtnl lock
+ *	while allocating the name and adding the device in order to avoid
+ *	duplicates.
+ *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
+ *	Returns the number of the unit assigned or a negative errno code.
+ */
+
+int dev_alloc_name(struct net_device *dev, const char *name)
+{
+	char candidate[IFNAMSIZ];
+	int unit;
+
+	BUG_ON(!dev_net(dev));
+	unit = __dev_alloc_name(dev_net(dev), name, candidate);
+	if (unit < 0)
+		return unit;
+	/* Commit the generated name only once allocation succeeded. */
+	strlcpy(dev->name, candidate, IFNAMSIZ);
+	return unit;
+}
+EXPORT_SYMBOL(dev_alloc_name);
+
+static int dev_get_valid_name(struct net_device *dev, const char *name)
+{
+	struct net *net;
+
+	BUG_ON(!dev_net(dev));
+	net = dev_net(dev);
+
+	if (!dev_valid_name(name))
+		return -EINVAL;
+	/* A '%' marks a format string: let dev_alloc_name() pick the unit. */
+	if (strchr(name, '%'))
+		return dev_alloc_name(dev, name);
+	if (__dev_get_by_name(net, name))
+		return -EEXIST;
+	/* Callers may pass dev->name itself; skip the self-copy then. */
+	if (dev->name != name)
+		strlcpy(dev->name, name, IFNAMSIZ);
+	return 0;
+}
+
+/**
+ *	dev_change_name - change name of a device
+ *	@dev: device
+ *	@newname: name (or format string) must be at least IFNAMSIZ
+ *
+ *	Change name of a device. A format string such as "eth%d" may be
+ *	passed for wildcarding. Fails with -EBUSY while the device is up.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+	char oldname[IFNAMSIZ];
+	int err = 0;
+	int ret;
+	struct net *net;
+
+	ASSERT_RTNL();
+	BUG_ON(!dev_net(dev));
+
+	net = dev_net(dev);
+	if (dev->flags & IFF_UP)
+		return -EBUSY;
+
+	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+		return 0;	/* already has that name */
+
+	memcpy(oldname, dev->name, IFNAMSIZ);	/* kept for the rollback path */
+
+	err = dev_get_valid_name(dev, newname);	/* on success dev->name holds the new name */
+	if (err < 0)
+		return err;
+
+rollback:
+	ret = device_rename(&dev->dev, dev->name);
+	if (ret) {
+		memcpy(dev->name, oldname, IFNAMSIZ);
+		return ret;
+	}
+
+	write_lock_bh(&dev_base_lock);
+	hlist_del_rcu(&dev->name_hlist);
+	write_unlock_bh(&dev_base_lock);
+
+	synchronize_rcu();
+	/* Re-insert the device into the hash bucket of its current name. */
+	write_lock_bh(&dev_base_lock);
+	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+	write_unlock_bh(&dev_base_lock);
+
+	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
+	ret = notifier_to_errno(ret);
+
+	if (ret) {
+		/* err >= 0 after dev_alloc_name() or stores the first errno */
+		if (err >= 0) {
+			err = ret;	/* first failure: undo the rename once */
+			memcpy(dev->name, oldname, IFNAMSIZ);
+			goto rollback;
+		} else {
+			printk(KERN_ERR
+			       "%s: name change rollback failed: %d.\n",
+			       dev->name, ret);
+		}
+	}
+
+	return err;
+}
+
+/**
+ *	dev_set_alias - change ifalias of a device
+ *	@dev: device
+ *	@alias: new alias text; at most @len bytes of it are stored
+ *	@len: alias length, must be below IFALIASZ; 0 removes the alias
+ *
+ *	Returns @len on success, -EINVAL or -ENOMEM on failure (alias kept).
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+	char *new_ifalias;
+
+	ASSERT_RTNL();
+
+	if (len >= IFALIASZ)
+		return -EINVAL;
+
+	if (!len) {
+		kfree(dev->ifalias);	/* kfree(NULL) is a no-op */
+		dev->ifalias = NULL;
+		return 0;
+	}
+
+	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
+	if (!new_ifalias)
+		return -ENOMEM;	/* the previous alias is left untouched */
+	dev->ifalias = new_ifalias;
+	strlcpy(dev->ifalias, alias, len+1);
+	return len;
+}
+
+
+/**
+ *	netdev_features_change - device changes features
+ *	@dev: device to cause notification
+ *
+ *	Called to indicate a device has changed features.
+ */
+void netdev_features_change(struct net_device *dev)
+{
+	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);	/* return value is not checked */
+}
+EXPORT_SYMBOL(netdev_features_change);
+
+/**
+ *	netdev_state_change - device changes state
+ *	@dev: device to cause notification
+ *
+ *	Called to indicate a device has changed state. This function calls
+ *	the notifier chains for netdev_chain and sends a NEWLINK message
+ *	to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+	if (!(dev->flags & IFF_UP))
+		return;	/* a device that is down reports nothing */
+	call_netdevice_notifiers(NETDEV_CHANGE, dev);
+	rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+}
+EXPORT_SYMBOL(netdev_state_change);
+
+int netdev_bonding_change(struct net_device *dev, unsigned long event)
+{
+	return call_netdevice_notifiers(event, dev);	/* @event is forwarded as is; the chain result is returned */
+}
+EXPORT_SYMBOL(netdev_bonding_change);
+
+/**
+ *	dev_load 	- load a network module
+ *	@net: the applicable net namespace
+ *	@name: name of interface
+ *
+ *	If a network interface is not present and the process has suitable
+ *	privileges this function loads the module. If module loading is not
+ *	available in this kernel then it becomes a nop.
+ */
+
+void dev_load(struct net *net, const char *name)
+{
+	struct net_device *dev;
+	int no_module;
+
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(net, name);	/* presence check only */
+	rcu_read_unlock();
+
+	no_module = !dev;
+	if (no_module && capable(CAP_NET_ADMIN))
+		no_module = request_module("netdev-%s", name);	/* a 0 result skips the fallback below */
+	if (no_module && capable(CAP_SYS_MODULE)) {
+		if (!request_module("%s", name))	/* fallback: load by the bare interface name */
+			pr_err("Loading kernel module for a network device "
+"with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
+"instead\n", name);
+	}
+}
+EXPORT_SYMBOL(dev_load);
+
+static int __dev_open(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		return ret;	/* a NETDEV_PRE_UP notifier reported an error */
+
+	set_bit(__LINK_STATE_START, &dev->state);
+
+	if (ops->ndo_validate_addr)	/* ret is 0 here, so both hooks are optional */
+		ret = ops->ndo_validate_addr(dev);
+
+	if (!ret && ops->ndo_open)
+		ret = ops->ndo_open(dev);
+
+	if (ret)
+		clear_bit(__LINK_STATE_START, &dev->state);	/* undo the bit set above */
+	else {
+		dev->flags |= IFF_UP;
+		net_dmaengine_get();
+		dev_set_rx_mode(dev);
+		dev_activate(dev);
+	}
+
+	return ret;
+}
+
+/**
+ *	dev_open	- prepare an interface for use.
+ *	@dev:	device to open
+ *
+ *	Takes a device from down to up state. The device's private open
+ *	function is invoked and then the multicast lists are loaded. Finally
+ *	the device is moved into the up state and a %NETDEV_UP message is
+ *	sent to the netdev notifier chain.
+ *
+ *	Calling this function on an active interface is a nop. On a failure
+ *	a negative errno code is returned.
+ */
+int dev_open(struct net_device *dev)
+{
+	int ret;
+
+	if (dev->flags & IFF_UP)
+		return 0;	/* already up: nothing to do */
+
+	ret = __dev_open(dev);
+	if (ret < 0)
+		return ret;
+
+	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);	/* sent only after a successful open */
+	call_netdevice_notifiers(NETDEV_UP, dev);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+static int __dev_close_many(struct list_head *head)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+	might_sleep();
+
+	list_for_each_entry(dev, head, unreg_list) {	/* pass 1: notify and clear the START bit */
+		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
+
+		clear_bit(__LINK_STATE_START, &dev->state);
+
+		/* Synchronize to scheduled poll. We cannot touch poll list, it
+		 * can be even on different cpu. So just clear netif_running().
+		 *
+		 * dev->stop() will invoke napi_disable() on all of its
+		 * napi_struct instances on this device.
+		 */
+		smp_mb__after_clear_bit(); /* Commit netif_running(). */
+	}
+
+	dev_deactivate_many(head);
+
+	list_for_each_entry(dev, head, unreg_list) {	/* pass 2: stop each device */
+		const struct net_device_ops *ops = dev->netdev_ops;
+
+		/*
+		 *	Call the device specific close. This cannot fail.
+		 *	Only if device is UP
+		 *
+		 *	We allow it to be called even after a DETACH hot-plug
+		 *	event.
+		 */
+		if (ops->ndo_stop)
+			ops->ndo_stop(dev);
+
+		dev->flags &= ~IFF_UP;
+		net_dmaengine_put();
+	}
+
+	return 0;	/* always 0 */
+}
+
+static int __dev_close(struct net_device *dev)
+{
+	int retval;
+	LIST_HEAD(single);	/* on-stack list holding just this device */
+
+	list_add(&dev->unreg_list, &single);
+	retval = __dev_close_many(&single);
+	list_del(&single);	/* unlink the on-stack head before it goes out of scope */
+	return retval;
+}
+
+static int dev_close_many(struct list_head *head)
+{
+	struct net_device *dev, *tmp;
+	LIST_HEAD(tmp_list);	/* parks devices that are already down */
+
+	list_for_each_entry_safe(dev, tmp, head, unreg_list)
+		if (!(dev->flags & IFF_UP))
+			list_move(&dev->unreg_list, &tmp_list);
+
+	__dev_close_many(head);	/* head now lists only devices that were up */
+
+	list_for_each_entry(dev, head, unreg_list) {
+		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
+		call_netdevice_notifiers(NETDEV_DOWN, dev);
+	}
+
+	/* rollback_registered_many needs the complete original list */
+	list_splice(&tmp_list, head);
+	return 0;
+}
+
+/**
+ *	dev_close - shutdown an interface.
+ *	@dev: device to shutdown
+ *
+ *	This function moves an active device into down state. A
+ *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ *	chain.
+ */
+int dev_close(struct net_device *dev)
+{
+	if (dev->flags & IFF_UP) {	/* closing a device that is down is a no-op */
+		LIST_HEAD(single);
+
+		list_add(&dev->unreg_list, &single);
+		dev_close_many(&single);
+		list_del(&single);
+	}
+	return 0;	/* always 0 */
+}
+EXPORT_SYMBOL(dev_close);
+
+
+/**
+ *	dev_disable_lro - disable Large Receive Offload on a device
+ *	@dev: device
+ *
+ *	Disable Large Receive Offload (LRO) on a net device.  Must be
+ *	called under RTNL.  This is needed if received packets may be
+ *	forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+	u32 flags;
+
+	/*
+	 * If we're trying to disable lro on a vlan device
+	 * use the underlying physical device instead
+	 */
+	if (is_vlan_dev(dev))
+		dev = vlan_dev_real_dev(dev);
+
+	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
+		flags = dev->ethtool_ops->get_flags(dev);
+	else
+		flags = ethtool_op_get_flags(dev);	/* fallback when the driver has no get_flags hook */
+
+	if (!(flags & ETH_FLAG_LRO))
+		return;	/* LRO is not enabled */
+
+	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
+	if (unlikely(dev->features & NETIF_F_LRO))	/* feature bit still set after the request */
+		netdev_WARN(dev, "failed to disable LRO!\n");
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+
+static int dev_boot_phase = 1;	/* while set, register_netdevice_notifier() replays no events */
+
+/**
+ *	register_netdevice_notifier - register a network notifier block
+ *	@nb: notifier
+ *
+ *	Register a notifier to be called when network device events occur.
+ *	The notifier passed is linked into the kernel structures and must
+ *	not be reused until it has been unregistered. A negative errno code
+ *	is returned on a failure.
+ *
+ * 	When registered all registration and up events are replayed
+ *	to the new notifier to allow device to have a race free
+ *	view of the network device list.
+ */
+
+int register_netdevice_notifier(struct notifier_block *nb)
+{
+	struct net_device *dev;
+	struct net_device *last;
+	struct net *net;
+	int err;
+
+	rtnl_lock();
+	err = raw_notifier_chain_register(&netdev_chain, nb);
+	if (err)
+		goto unlock;
+	if (dev_boot_phase)
+		goto unlock;	/* no replay while dev_boot_phase is set */
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
+			err = notifier_to_errno(err);
+			if (err)
+				goto rollback;
+
+			if (!(dev->flags & IFF_UP))
+				continue;
+
+			nb->notifier_call(nb, NETDEV_UP, dev);
+		}
+	}
+
+unlock:
+	rtnl_unlock();
+	return err;
+
+rollback:
+	last = dev;	/* the device whose NETDEV_REGISTER failed */
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			if (dev == last)
+				goto outroll;	/* end the whole walk, not just this namespace */
+			if (dev->flags & IFF_UP) {
+				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
+				nb->notifier_call(nb, NETDEV_DOWN, dev);
+			}
+			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
+		}
+	}
+
+outroll:
+	raw_notifier_chain_unregister(&netdev_chain, nb);
+	goto unlock;
+}
+EXPORT_SYMBOL(register_netdevice_notifier);
+
+/**
+ *	unregister_netdevice_notifier - unregister a network notifier block
+ *	@nb: notifier
+ *
+ *	Unregister a notifier previously registered by
+ *	register_netdevice_notifier(). The notifier is unlinked from the
+ *	kernel structures and may then be reused. A negative errno code
+ *	is returned on a failure.
+ *
+ *	After unregistering, unregister and down device events are synthesized
+ *	for all devices on the device list to the removed notifier to remove
+ *	the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier(struct notifier_block *nb)
+{
+	struct net_device *dev;
+	struct net *net;
+	int err;
+
+	rtnl_lock();
+	err = raw_notifier_chain_unregister(&netdev_chain, nb);
+	if (err)
+		goto unlock;	/* nothing is replayed when unregistering failed */
+
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			if (dev->flags & IFF_UP) {	/* devices that are up first see the down events */
+				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
+				nb->notifier_call(nb, NETDEV_DOWN, dev);
+			}
+			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
+			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
+		}
+	}
+unlock:
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier);
+
+/**
+ *	call_netdevice_notifiers - call all network notifier blocks
+ *      @val: value passed unmodified to notifier function
+ *      @dev: net_device pointer passed unmodified to notifier function
+ *
+ *	Call all network notifier blocks.  Parameters and return value
+ *	are as for raw_notifier_call_chain().
+ */
+
+int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
+{
+	ASSERT_RTNL();	/* asserts that the caller holds RTNL */
+	return raw_notifier_call_chain(&netdev_chain, val, dev);
+}
+EXPORT_SYMBOL(call_netdevice_notifiers);
+
+/* When > 0 there are consumers of rx skb time stamps */
+static atomic_t netstamp_needed = ATOMIC_INIT(0);
+
+void net_enable_timestamp(void)
+{
+	atomic_inc(&netstamp_needed);	/* register one more timestamp consumer */
+}
+EXPORT_SYMBOL(net_enable_timestamp);
+
+void net_disable_timestamp(void)
+{
+	atomic_dec(&netstamp_needed);	/* drop one consumer; pairs with net_enable_timestamp() */
+}
+EXPORT_SYMBOL(net_disable_timestamp);
+
+static inline void net_timestamp_set(struct sk_buff *skb)
+{
+	if (!atomic_read(&netstamp_needed))
+		skb->tstamp.tv64 = 0;	/* no consumers: leave it unstamped */
+	else
+		__net_timestamp(skb);
+}
+
+static inline void net_timestamp_check(struct sk_buff *skb)
+{
+	if (skb->tstamp.tv64 == 0 && atomic_read(&netstamp_needed) != 0)
+		__net_timestamp(skb);	/* stamp only packets not yet stamped */
+}
+
+static inline bool is_skb_forwardable(struct net_device *dev,
+				      struct sk_buff *skb)
+{
+	unsigned int limit;
+
+	/* Nothing can be forwarded to a device that is down. */
+	if ((dev->flags & IFF_UP) == 0)
+		return false;
+
+	limit = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+
+	/*
+	 * A frame within the limit always fits. A longer skb is accepted
+	 * only when it is GSO: the length is not checked then, because
+	 * the packet could be forwarded without being segmented before.
+	 */
+	if (skb->len <= limit)
+		return true;
+	return skb_is_gso(skb) ? true : false;
+}
+
+/**
+ * dev_forward_skb - loopback an skb to another netif
+ *
+ * @dev: destination network device
+ * @skb: buffer to forward
+ *
+ * return values:
+ *	NET_RX_SUCCESS	(no congestion)
+ *	NET_RX_DROP     (packet was dropped, but freed)
+ *
+ * dev_forward_skb can be used for injecting an skb from the
+ * start_xmit function of one device into the receive queue
+ * of another device.
+ *
+ * The receiving device may be in another namespace, so
+ * we have to clear all information in the skb that could
+ * impact namespace isolation.
+ */
+int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+	skb_orphan(skb);
+	nf_reset(skb);
+
+	if (unlikely(!is_skb_forwardable(dev, skb))) {
+		atomic_long_inc(&dev->rx_dropped);	/* drop is charged to the destination device */
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+	skb->dev = dev;
+	skb_dst_drop(skb);
+	skb->tstamp.tv64 = 0;
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, dev);
+	skb->mark = 0;
+	secpath_reset(skb);
+	nf_reset(skb);	/* NOTE(review): nf_reset() already ran above; looks redundant - confirm */
+	return netif_rx(skb);
+}
+EXPORT_SYMBOL_GPL(dev_forward_skb);
+
+static inline int deliver_skb(struct sk_buff *skb,
+			      struct packet_type *pt_prev,
+			      struct net_device *orig_dev)
+{
+	atomic_inc(&skb->users);	/* extra reference for this handler */
+	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+/*
+ *	Support routine. Sends outgoing frames to any network
+ *	taps currently in use.
+ */
+
+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct packet_type *ptype;
+	struct sk_buff *skb2 = NULL;	/* one clone shared by all taps */
+	struct packet_type *pt_prev = NULL;	/* tap whose delivery is still pending */
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+		/* Never send packets back to the socket
+		 * they originated from - MvS (miquels@drinkel.ow.org)
+		 */
+		if ((ptype->dev == dev || !ptype->dev) &&
+		    (ptype->af_packet_priv == NULL ||
+		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
+			if (pt_prev) {
+				deliver_skb(skb2, pt_prev, skb->dev);
+				pt_prev = ptype;
+				continue;
+			}
+
+			skb2 = skb_clone(skb, GFP_ATOMIC);	/* first matching tap: clone once */
+			if (!skb2)
+				break;
+
+			net_timestamp_set(skb2);
+
+			/* skb->nh should be correctly
+			   set by sender, so that the second statement is
+			   just protection against buggy protocols.
+			 */
+			skb_reset_mac_header(skb2);
+
+			if (skb_network_header(skb2) < skb2->data ||
+			    skb2->network_header > skb2->tail) {
+				if (net_ratelimit())
+					printk(KERN_CRIT "protocol %04x is "
+					       "buggy, dev %s\n",
+					       ntohs(skb2->protocol),
+					       dev->name);
+				skb_reset_network_header(skb2);
+			}
+
+			skb2->transport_header = skb2->network_header;
+			skb2->pkt_type = PACKET_OUTGOING;
+			pt_prev = ptype;
+		}
+	}
+	if (pt_prev)
+		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);	/* last tap gets skb2 with no extra reference */
+	rcu_read_unlock();
+}
+
+/* netif_setup_tc - revalidate tc mappings after real_num_tx_queues changes
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * A change of real_num_tx_queues may leave a traffic class pointing at
+ * queues that no longer exist.  Every priority whose class range
+ * (offset + count) now exceeds @txq is remapped to TC0.  If TC0 itself
+ * is out of range nothing sensible is left, so priority mapping is
+ * disabled altogether by clearing num_tc.  It is expected that drivers
+ * fix up the mapping, if they can, before calling
+ * netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+	struct netdev_tc_txq *tc0 = &dev->tc_to_txq[0];
+	int prio;
+
+	/* TC0 no longer fits: give up on TC mapping entirely */
+	if (tc0->offset + tc0->count > txq) {
+		pr_warning("Number of in use tx queues changed "
+			   "invalidating tc mappings. Priority "
+			   "traffic classification disabled!\n");
+		dev->num_tc = 0;
+		return;
+	}
+
+	/* Fold every priority with an out-of-range class back onto TC0 */
+	for (prio = 1; prio <= TC_BITMASK; prio++) {
+		int tc_idx = netdev_get_prio_tc_map(dev, prio);
+		struct netdev_tc_txq *range = &dev->tc_to_txq[tc_idx];
+
+		if (range->offset + range->count <= txq)
+			continue;
+
+		pr_warning("Number of in use tx queues "
+			   "changed. Priority %i to tc "
+			   "mapping %i is no longer valid "
+			   "setting map to 0\n",
+			   prio, tc_idx);
+		netdev_set_prio_tc_map(dev, prio, 0);
+	}
+}
+
+/*
+ * Helper to change real_num_tx_queues.  skbs mapped to a queue index
+ * greater than the new real_num_tx_queues would be stale, so when the
+ * count shrinks the qdiscs above the new limit are reset.
+ *
+ * Returns 0 on success, -EINVAL when @txq is zero or exceeds
+ * dev->num_tx_queues, or the error from netdev_queue_update_kobjects().
+ */
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+{
+	bool needs_update;
+	int err;
+
+	if (txq < 1 || txq > dev->num_tx_queues)
+		return -EINVAL;
+
+	needs_update = dev->reg_state == NETREG_REGISTERED ||
+		       dev->reg_state == NETREG_UNREGISTERING;
+
+	if (needs_update) {
+		ASSERT_RTNL();
+
+		err = netdev_queue_update_kobjects(dev,
+						   dev->real_num_tx_queues,
+						   txq);
+		if (err)
+			return err;
+
+		/* tc mappings may reference queues that just went away */
+		if (dev->num_tc)
+			netif_setup_tc(dev, txq);
+
+		/* shrinking: flush the qdiscs beyond the new limit */
+		if (txq < dev->real_num_tx_queues)
+			qdisc_reset_all_tx_gt(dev, txq);
+	}
+
+	dev->real_num_tx_queues = txq;
+
+	return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_tx_queues);
+
+#ifdef CONFIG_RPS
+/**
+ *	netif_set_real_num_rx_queues - set actual number of RX queues used
+ *	@dev: Network device
+ *	@rxq: Actual number of RX queues
+ *
+ *	Must be called with the rtnl_lock held, or before the net device
+ *	is registered.  Returns 0 on success or a negative error code:
+ *	-EINVAL if @rxq is zero or larger than dev->num_rx_queues, or the
+ *	error from net_rx_queue_update_kobjects() for a registered device.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+	if (rxq < 1 || rxq > dev->num_rx_queues)
+		return -EINVAL;
+
+	if (dev->reg_state == NETREG_REGISTERED) {
+		int err;
+
+		ASSERT_RTNL();
+
+		err = net_rx_queue_update_kobjects(dev,
+						   dev->real_num_rx_queues,
+						   rxq);
+		if (err)
+			return err;
+	}
+
+	dev->real_num_rx_queues = rxq;
+
+	return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
+/*
+ * Append @q to the tail of this CPU's softnet_data output queue and raise
+ * NET_TX_SOFTIRQ.  The list manipulation is done with local interrupts
+ * disabled.
+ */
+static inline void __netif_reschedule(struct Qdisc *q)
+{
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sd = &__get_cpu_var(softnet_data);
+	/* q becomes the new last element: link it in, then advance the tail */
+	q->next_sched = NULL;
+	*sd->output_queue_tailp = q;
+	sd->output_queue_tailp = &q->next_sched;
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+/*
+ * Queue @q for the TX softirq unless its __QDISC_STATE_SCHED bit was
+ * already set, in which case nothing is done.
+ */
+void __netif_schedule(struct Qdisc *q)
+{
+	if (test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+		return;
+
+	__netif_reschedule(q);
+}
+EXPORT_SYMBOL(__netif_schedule);
+
+/*
+ * Drop one reference on @skb.  When it was the last one the buffer is not
+ * freed here; it is pushed onto this CPU's completion_queue and
+ * NET_TX_SOFTIRQ is raised.
+ */
+void dev_kfree_skb_irq(struct sk_buff *skb)
+{
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	if (!atomic_dec_and_test(&skb->users))
+		return;
+
+	local_irq_save(flags);
+	sd = &__get_cpu_var(softnet_data);
+	/* push onto the head of the per-cpu completion list */
+	skb->next = sd->completion_queue;
+	sd->completion_queue = skb;
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dev_kfree_skb_irq);
+
+/*
+ * Free @skb from any context: uses dev_kfree_skb_irq() when running in
+ * hard interrupt context or with interrupts disabled, and
+ * dev_kfree_skb() otherwise.
+ */
+void dev_kfree_skb_any(struct sk_buff *skb)
+{
+	if (!in_irq() && !irqs_disabled()) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	dev_kfree_skb_irq(skb);
+}
+EXPORT_SYMBOL(dev_kfree_skb_any);
+
+
+/**
+ * netif_device_detach - mark device as removed
+ * @dev: network device
+ *
+ * Clears the "present" link-state bit.  If the bit was previously set and
+ * the interface is running, all of its TX queues are stopped.
+ */
+void netif_device_detach(struct net_device *dev)
+{
+	/* already detached: nothing to do */
+	if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
+		return;
+
+	if (!netif_running(dev))
+		return;
+
+	netif_tx_stop_all_queues(dev);
+}
+EXPORT_SYMBOL(netif_device_detach);
+
+/**
+ * netif_device_attach - mark device as attached
+ * @dev: network device
+ *
+ * Sets the "present" link-state bit.  If the bit was previously clear and
+ * the interface is running, all TX queues are woken and the transmit
+ * watchdog is started again.
+ */
+void netif_device_attach(struct net_device *dev)
+{
+	/* already attached: nothing to do */
+	if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
+		return;
+
+	if (!netif_running(dev))
+		return;
+
+	netif_tx_wake_all_queues(dev);
+	__netdev_watchdog_up(dev);
+}
+EXPORT_SYMBOL(netif_device_attach);
+
+/*
+ * Invalidate hardware checksum when packet is to be mangled, and
+ * complete checksum manually on outgoing path.
+ *
+ * Returns 0 on success, or the error from pskb_expand_head() when the
+ * header had to be unshared and that failed; in the failure case
+ * skb->ip_summed is left untouched.  On every other path ip_summed ends
+ * up as CHECKSUM_NONE.
+ */
+int skb_checksum_help(struct sk_buff *skb)
+{
+	__wsum csum;
+	int ret = 0, offset;
+
+	/* Nothing to compute for CHECKSUM_COMPLETE, just drop the state */
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		goto out_set_summed;
+
+	if (unlikely(skb_shinfo(skb)->gso_size)) {
+		/* Let GSO fix up the checksum. */
+		goto out_set_summed;
+	}
+
+	/* Checksum everything from the checksum start to the end of the skb;
+	 * the start must lie inside the linear area.
+	 */
+	offset = skb_checksum_start_offset(skb);
+	BUG_ON(offset >= skb_headlen(skb));
+	csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+	/* Location of the 16-bit checksum field; must also be linear */
+	offset += skb->csum_offset;
+	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
+
+	/* Get a private copy of the header if we may not write to it */
+	if (skb_cloned(skb) &&
+	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
+		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+		if (ret)
+			goto out;
+	}
+
+	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+out_set_summed:
+	skb->ip_summed = CHECKSUM_NONE;
+out:
+	return ret;
+}
+EXPORT_SYMBOL(skb_checksum_help);
+
+/**
+ *	skb_gso_segment - Perform segmentation on skb.
+ *	@skb: buffer to segment
+ *	@features: features for the output path (see dev->features)
+ *
+ *	This function segments the given skb and returns a list of segments.
+ *
+ *	It may return NULL if the skb requires no segmentation.  This is
+ *	only possible when GSO is used for verifying header integrity.
+ *
+ *	On failure an ERR_PTR() is returned: -EINVAL when a VLAN header
+ *	cannot be pulled, -EPROTONOSUPPORT when no registered packet type
+ *	provides gso_segment for the protocol, or the error produced by
+ *	pskb_expand_head(), gso_send_check() or the gso_segment callback.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+	struct packet_type *ptype;
+	__be16 type = skb->protocol;
+	int vlan_depth = ETH_HLEN;
+	int err;
+
+	/* Skip over any stacked 802.1Q headers to find the real protocol */
+	while (type == htons(ETH_P_8021Q)) {
+		struct vlan_hdr *vh;
+
+		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
+			return ERR_PTR(-EINVAL);
+
+		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+		type = vh->h_vlan_encapsulated_proto;
+		vlan_depth += VLAN_HLEN;
+	}
+
+	/* Record the MAC header length and advance data past it */
+	skb_reset_mac_header(skb);
+	skb->mac_len = skb->network_header - skb->mac_header;
+	__skb_pull(skb, skb->mac_len);
+
+	/* Unexpected checksum state: warn with driver details, then make
+	 * sure the header is private before it is handed on.
+	 */
+	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+		struct net_device *dev = skb->dev;
+		struct ethtool_drvinfo info = {};
+
+		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
+			dev->ethtool_ops->get_drvinfo(dev, &info);
+
+		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
+		     info.driver, dev ? dev->features : 0L,
+		     skb->sk ? skb->sk->sk_route_caps : 0L,
+		     skb->len, skb->data_len, skb->ip_summed);
+
+		if (skb_header_cloned(skb) &&
+		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+			return ERR_PTR(err);
+	}
+
+	/* Use the first device-independent handler for this protocol that
+	 * implements gso_segment.
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype,
+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
+		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
+			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+				err = ptype->gso_send_check(skb);
+				segs = ERR_PTR(err);
+				/* err == 0 leaves segs NULL: if the device can
+				 * take the skb as is, nothing is segmented.
+				 */
+				if (err || skb_gso_ok(skb, features))
+					break;
+				__skb_push(skb, (skb->data -
+						 skb_network_header(skb)));
+			}
+			segs = ptype->gso_segment(skb, features);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	/* Restore data to the start of the MAC header */
+	__skb_push(skb, skb->data - skb_mac_header(skb));
+
+	return segs;
+}
+EXPORT_SYMBOL(skb_gso_segment);
+
+/*
+ * Report a hardware receive checksum error: logs a rate-limited message
+ * naming the device (or "<unknown>" when @dev is NULL) followed by a
+ * stack dump.
+ */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+	if (!net_ratelimit())
+		return;
+
+	printk(KERN_ERR "%s: hw csum failure.\n",
+		dev ? dev->name : "<unknown>");
+	dump_stack();
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
+/* Actually, we should eliminate this check as soon as we know, that:
+ * 1. IOMMU is present and allows to map all the memory.
+ * 2. No high memory really exists on this machine.
+ *
+ * Returns 1 when @skb has a page fragment that @dev must not be given
+ * for DMA, 0 otherwise.  Without CONFIG_HIGHMEM this is always 0.
+ */
+
+static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_HIGHMEM
+	int i;
+	/* Device without NETIF_F_HIGHDMA: any highmem fragment is illegal */
+	if (!(dev->features & NETIF_F_HIGHDMA)) {
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
+				return 1;
+	}
+
+	/* Check every fragment page against the parent device's DMA mask;
+	 * a device with a parent but no dma_mask fails the check.
+	 */
+	if (PCI_DMA_BUS_IS_PHYS) {
+		struct device *pdev = dev->dev.parent;
+
+		if (!pdev)
+			return 0;
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
+			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
+				return 1;
+		}
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Private data kept in skb->cb while an skb carries a list of GSO
+ * segments: the skb's original destructor, saved by dev_gso_segment()
+ * and invoked again by dev_gso_skb_destructor().
+ */
+struct dev_gso_cb {
+	void (*destructor)(struct sk_buff *skb);
+};
+
+/* View the control buffer of @skb as a struct dev_gso_cb */
+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+
+/*
+ * Destructor installed by dev_gso_segment().  Frees every segment still
+ * chained on skb->next, then calls the original destructor saved in the
+ * control buffer, if there is one.
+ *
+ * The do/while loop dereferences skb->next before testing it, so the
+ * chain must be non-empty on entry.
+ */
+static void dev_gso_skb_destructor(struct sk_buff *skb)
+{
+	struct dev_gso_cb *cb;
+
+	do {
+		struct sk_buff *nskb = skb->next;
+
+		/* unlink the first segment and free it */
+		skb->next = nskb->next;
+		nskb->next = NULL;
+		kfree_skb(nskb);
+	} while (skb->next);
+
+	cb = DEV_GSO_CB(skb);
+	if (cb->destructor)
+		cb->destructor(skb);
+}
+
+/**
+ *	dev_gso_segment - Perform emulated hardware segmentation on skb.
+ *	@skb: buffer to segment
+ *	@features: device features as applicable to this skb
+ *
+ *	Segments @skb with skb_gso_segment() and hangs the resulting list
+ *	off skb->next.  The skb's destructor is saved in its control buffer
+ *	and replaced by dev_gso_skb_destructor.
+ *
+ *	Returns 0 on success (including the case where no segments were
+ *	produced) or a negative error code from skb_gso_segment().
+ */
+static int dev_gso_segment(struct sk_buff *skb, int features)
+{
+	struct sk_buff *seg_list = skb_gso_segment(skb, features);
+
+	if (IS_ERR(seg_list))
+		return PTR_ERR(seg_list);
+
+	/* NULL means header integrity was verified only: nothing to chain */
+	if (seg_list) {
+		skb->next = seg_list;
+		DEV_GSO_CB(skb)->destructor = skb->destructor;
+		skb->destructor = dev_gso_skb_destructor;
+	}
+
+	return 0;
+}
+
+/*
+ * Orphan the skb early, just before the device transmits it.
+ * This is skipped when the skb has no socket or when any tx_flags are
+ * set (tx timestamp requested, or the sk reference is needed at driver
+ * level for other reasons, e.g. see net/can/raw.c).
+ */
+static inline void skb_orphan_try(struct sk_buff *skb)
+{
+	struct sock *owner = skb->sk;
+
+	if (!owner || skb_shinfo(skb)->tx_flags)
+		return;
+
+	/* skb_tx_hash() will no longer be able to reach the socket,
+	 * so preserve its hash in skb->rxhash unless one is set already.
+	 */
+	if (!skb->rxhash)
+		skb->rxhash = owner->sk_hash;
+
+	skb_orphan(skb);
+}
+
+/*
+ * Tell whether a device with @features can checksum packets of
+ * @protocol: true with NETIF_F_GEN_CSUM for any protocol, otherwise only
+ * for IPv4, IPv6 or FCoE when the matching feature bit is set.
+ */
+static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+{
+	if (features & NETIF_F_GEN_CSUM)
+		return true;
+
+	if ((features & NETIF_F_V4_CSUM) && protocol == htons(ETH_P_IP))
+		return true;
+
+	if ((features & NETIF_F_V6_CSUM) && protocol == htons(ETH_P_IPV6))
+		return true;
+
+	if ((features & NETIF_F_FCOE_CRC) && protocol == htons(ETH_P_FCOE))
+		return true;
+
+	return false;
+}
+
+/*
+ * Trim @features to what is usable for @skb: when @protocol cannot be
+ * checksummed, all checksum features and scatter-gather are removed;
+ * otherwise scatter-gather alone is removed if illegal_highdma() objects
+ * to the skb's fragments.
+ */
+static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
+{
+	if (!can_checksum_protocol(features, protocol))
+		return features & ~(NETIF_F_ALL_CSUM | NETIF_F_SG);
+
+	if (illegal_highdma(skb->dev, skb))
+		return features & ~NETIF_F_SG;
+
+	return features;
+}
+
+/*
+ * Compute the device features that apply to @skb.
+ *
+ * Untagged, non-802.1Q packets get the device features as they are.
+ * Packets with a VLAN tag (in the payload or in the skb's tx tag) are
+ * limited to vlan_features plus NETIF_F_HW_VLAN_TX, and when the
+ * encapsulated protocol is 802.1Q again only a small fixed subset is
+ * kept.  The result is always passed through harmonize_features().
+ */
+u32 netif_skb_features(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	__be16 protocol = skb->protocol;
+	u32 features = dev->features;
+
+	if (protocol == htons(ETH_P_8021Q)) {
+		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+
+		/* continue with the protocol carried inside the tag */
+		protocol = veh->h_vlan_encapsulated_proto;
+	} else if (!vlan_tx_tag_present(skb)) {
+		return harmonize_features(skb, protocol, features);
+	}
+
+	features &= (dev->vlan_features | NETIF_F_HW_VLAN_TX);
+
+	/* 802.1Q inside 802.1Q: restrict further */
+	if (protocol == htons(ETH_P_8021Q))
+		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
+				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
+
+	return harmonize_features(skb, protocol, features);
+}
+EXPORT_SYMBOL(netif_skb_features);
+
+/*
+ * Returns true (1) for a non-linear skb if either:
+ *	1. it has a frag_list and the features lack NETIF_F_FRAGLIST, or
+ *	2. it has page fragments and the features lack NETIF_F_SG.
+ * Returns 0 otherwise.
+ */
+static inline int skb_needs_linearize(struct sk_buff *skb,
+				      int features)
+{
+	if (!skb_is_nonlinear(skb))
+		return 0;
+
+	if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
+		return 1;
+
+	return skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG);
+}
+
+/*
+ * Hand @skb to the driver's ndo_start_xmit() on queue @txq.
+ *
+ * An skb without a segment list (skb->next == NULL) is first shown to the
+ * packet taps, orphaned if possible, given a software VLAN tag when the
+ * device cannot insert it, and then either GSO-segmented or linearized
+ * and checksummed in software as the computed features require.
+ *
+ * An skb with a segment list (freshly segmented, or left over from an
+ * earlier partial transmit) has its segments sent one by one.  If the
+ * driver returns a busy code, the unsent segment is put back on the list
+ * and that code is returned.
+ *
+ * Returns the driver's return code.  When preparation fails and the skb
+ * is dropped here, rc still holds its initial NETDEV_TX_OK.
+ */
+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+			struct netdev_queue *txq)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int rc = NETDEV_TX_OK;
+	unsigned int skb_len;
+
+	if (likely(!skb->next)) {
+		u32 features;
+
+		/*
+		 * If device doesn't need skb->dst, release it right now while
+		 * its hot in this cpu cache
+		 */
+		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+			skb_dst_drop(skb);
+
+		/* Give registered taps a copy of the outgoing frame */
+		if (!list_empty(&ptype_all))
+			dev_queue_xmit_nit(skb, dev);
+
+		skb_orphan_try(skb);
+
+		features = netif_skb_features(skb);
+
+		/* Device cannot insert the VLAN tag: put it in the payload */
+		if (vlan_tx_tag_present(skb) &&
+		    !(features & NETIF_F_HW_VLAN_TX)) {
+			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+			if (unlikely(!skb))
+				goto out;
+
+			skb->vlan_tci = 0;
+		}
+
+		if (netif_needs_gso(skb, features)) {
+			if (unlikely(dev_gso_segment(skb, features)))
+				goto out_kfree_skb;
+			/* Segments were produced: send them via the loop */
+			if (skb->next)
+				goto gso;
+		} else {
+			if (skb_needs_linearize(skb, features) &&
+			    __skb_linearize(skb))
+				goto out_kfree_skb;
+
+			/* If packet is not checksummed and device does not
+			 * support checksumming for this protocol, complete
+			 * checksumming here.
+			 */
+			if (skb->ip_summed == CHECKSUM_PARTIAL) {
+				skb_set_transport_header(skb,
+					skb_checksum_start_offset(skb));
+				if (!(features & NETIF_F_ALL_CSUM) &&
+				     skb_checksum_help(skb))
+					goto out_kfree_skb;
+			}
+		}
+
+		/* Remember the length first; it is read again only for the
+		 * tracepoint after the driver has been given the skb.
+		 */
+		skb_len = skb->len;
+		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc, dev, skb_len);
+		if (rc == NETDEV_TX_OK)
+			txq_trans_update(txq);
+		return rc;
+	}
+
+gso:
+	do {
+		/* Detach the first pending segment from the list */
+		struct sk_buff *nskb = skb->next;
+
+		skb->next = nskb->next;
+		nskb->next = NULL;
+
+		/*
+		 * If device doesn't need nskb->dst, release it right now while
+		 * its hot in this cpu cache
+		 */
+		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+			skb_dst_drop(nskb);
+
+		skb_len = nskb->len;
+		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc, dev, skb_len);
+		if (unlikely(rc != NETDEV_TX_OK)) {
+			/* Codes outside NETDEV_TX_MASK: drop what is left */
+			if (rc & ~NETDEV_TX_MASK)
+				goto out_kfree_gso_skb;
+			/* Otherwise re-queue the segment for a later retry */
+			nskb->next = skb->next;
+			skb->next = nskb;
+			return rc;
+		}
+		txq_trans_update(txq);
+		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
+			return NETDEV_TX_BUSY;
+	} while (skb->next);
+
+out_kfree_gso_skb:
+	/* All segments gone: restore the original destructor before freeing */
+	if (likely(skb->next == NULL))
+		skb->destructor = DEV_GSO_CB(skb)->destructor;
+out_kfree_skb:
+	kfree_skb(skb);
+out:
+	return rc;
+}
+
+/* Seed passed to the jhash calls in this file; it is not assigned in the
+ * code visible here (presumably initialised elsewhere - verify).
+ */
+static u32 hashrnd __read_mostly;
+
+/*
+ * Returns a Tx queue index for the given packet, using @num_tx_queues as
+ * the distribution range.
+ *
+ * A recorded rx queue is reused directly (reduced into range by repeated
+ * subtraction).  Otherwise a hash of the socket hash, or of the protocol
+ * xor rxhash, is scaled onto the queue range, which is narrowed to the
+ * packet's traffic class when the device has num_tc set.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+		  unsigned int num_tx_queues)
+{
+	u16 range_start = 0;
+	u16 range_len = num_tx_queues;
+	u32 hash;
+
+	if (skb_rx_queue_recorded(skb)) {
+		hash = skb_get_rx_queue(skb);
+		while (unlikely(hash >= num_tx_queues))
+			hash -= num_tx_queues;
+		return hash;
+	}
+
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+		range_start = dev->tc_to_txq[tc].offset;
+		range_len = dev->tc_to_txq[tc].count;
+	}
+
+	hash = (skb->sk && skb->sk->sk_hash) ?
+		skb->sk->sk_hash :
+		((__force u16) skb->protocol ^ skb->rxhash);
+	hash = jhash_1word(hash, hashrnd);
+
+	/* scale the 32-bit hash onto [0, range_len) and shift into place */
+	return (u16) (((u64) hash * range_len) >> 32) + range_start;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
+/*
+ * Validate a driver-selected TX queue index.  An index below
+ * real_num_tx_queues is returned unchanged; anything else yields queue 0
+ * after a rate-limited warning.
+ */
+static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
+{
+	if (likely(queue_index < dev->real_num_tx_queues))
+		return queue_index;
+
+	if (net_ratelimit()) {
+		pr_warning("%s selects TX queue %d, but "
+			"real number of TX queues is %d\n",
+			dev->name, queue_index, dev->real_num_tx_queues);
+	}
+
+	return 0;
+}
+
+/*
+ * Pick a TX queue from the device's XPS map for the current CPU.
+ *
+ * Returns the queue index, or -1 when CONFIG_XPS is off, the device or
+ * this CPU has no map, or the mapped queue is not below
+ * real_num_tx_queues.
+ */
+static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+	struct xps_dev_maps *dev_maps;
+	struct xps_map *map;
+	int queue_index = -1;
+
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_maps);
+	if (dev_maps) {
+		map = rcu_dereference(
+		    dev_maps->cpu_map[raw_smp_processor_id()]);
+		if (map) {
+			if (map->len == 1)
+				queue_index = map->queues[0];
+			else {
+				/* Several candidates: spread by flow hash,
+				 * computed the same way as __skb_tx_hash().
+				 */
+				u32 hash;
+				if (skb->sk && skb->sk->sk_hash)
+					hash = skb->sk->sk_hash;
+				else
+					hash = (__force u16) skb->protocol ^
+					    skb->rxhash;
+				hash = jhash_1word(hash, hashrnd);
+				queue_index = map->queues[
+				    ((u64)hash * map->len) >> 32];
+			}
+			/* Ignore a mapping to a queue that is not in use */
+			if (unlikely(queue_index >= dev->real_num_tx_queues))
+				queue_index = -1;
+		}
+	}
+	rcu_read_unlock();
+
+	return queue_index;
+#else
+	return -1;
+#endif
+}
+
+/*
+ * Choose the TX queue for @skb on @dev, store the choice in the skb's
+ * queue mapping and return the matching netdev_queue.
+ *
+ * Order of preference: the only queue of a single-queue device; the
+ * driver's ndo_select_queue() (capped by dev_cap_txqueue()); the queue
+ * cached in the socket; XPS; and finally skb_tx_hash().
+ */
+static struct netdev_queue *dev_pick_tx(struct net_device *dev,
+					struct sk_buff *skb)
+{
+	int queue_index;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (dev->real_num_tx_queues == 1)
+		queue_index = 0;
+	else if (ops->ndo_select_queue) {
+		queue_index = ops->ndo_select_queue(dev, skb);
+		queue_index = dev_cap_txqueue(dev, queue_index);
+	} else {
+		struct sock *sk = skb->sk;
+		queue_index = sk_tx_queue_get(sk);
+
+		/* Recompute when nothing valid is cached, or when the skb
+		 * is flagged ooo_okay.
+		 */
+		if (queue_index < 0 || skb->ooo_okay ||
+		    queue_index >= dev->real_num_tx_queues) {
+			int old_index = queue_index;
+
+			queue_index = get_xps_queue(dev, skb);
+			if (queue_index < 0)
+				queue_index = skb_tx_hash(dev, skb);
+
+			/* Cache the new queue in the socket, but only if the
+			 * skb uses the socket's cached dst.
+			 */
+			if (queue_index != old_index && sk) {
+				struct dst_entry *dst =
+				    rcu_dereference_check(sk->sk_dst_cache, 1);
+
+				if (dst && skb_dst(skb) == dst)
+					sk_tx_queue_set(sk, queue_index);
+			}
+		}
+	}
+
+	skb_set_queue_mapping(skb, queue_index);
+	return netdev_get_tx_queue(dev, queue_index);
+}
+
+/*
+ * Send @skb through qdisc @q of queue @txq.
+ *
+ * Under the qdisc root lock the skb is either dropped (qdisc
+ * deactivated), transmitted directly (empty TCQ_F_CAN_BYPASS qdisc that
+ * is not running), or enqueued, after which the qdisc is run if nobody
+ * else is running it.
+ *
+ * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the qdisc's enqueue result
+ * masked with NET_XMIT_MASK.
+ */
+static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+				 struct net_device *dev,
+				 struct netdev_queue *txq)
+{
+	spinlock_t *root_lock = qdisc_lock(q);
+	bool contended;
+	int rc;
+
+	qdisc_skb_cb(skb)->pkt_len = skb->len;
+	qdisc_calculate_pkt_len(skb, q);
+	/*
+	 * Heuristic to force contended enqueues to serialize on a
+	 * separate lock before trying to get qdisc main lock.
+	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
+	 * and dequeue packets faster.
+	 */
+	contended = qdisc_is_running(q);
+	if (unlikely(contended))
+		spin_lock(&q->busylock);
+
+	spin_lock(root_lock);
+	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+		kfree_skb(skb);
+		rc = NET_XMIT_DROP;
+	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+		   qdisc_run_begin(q)) {
+		/*
+		 * This is a work-conserving queue; there are no old skbs
+		 * waiting to be sent out; and the qdisc is not running -
+		 * xmit the skb directly.
+		 */
+		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+			skb_dst_force(skb);
+
+		qdisc_bstats_update(q, skb);
+
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+			/* Release busylock before the longer qdisc run */
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
+			__qdisc_run(q);
+		} else
+			qdisc_run_end(q);
+
+		rc = NET_XMIT_SUCCESS;
+	} else {
+		skb_dst_force(skb);
+		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
+		if (qdisc_run_begin(q)) {
+			/* Release busylock before the longer qdisc run */
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
+			__qdisc_run(q);
+		}
+	}
+	spin_unlock(root_lock);
+	/* Still holding busylock if it was not dropped above */
+	if (unlikely(contended))
+		spin_unlock(&q->busylock);
+	return rc;
+}
+
+/*
+ * Per-cpu nesting depth of dev_hard_start_xmit() calls made from
+ * dev_queue_xmit() for queueless devices; transmission is refused once
+ * the depth exceeds RECURSION_LIMIT.
+ */
+static DEFINE_PER_CPU(int, xmit_recursion);
+#define RECURSION_LIMIT 10
+
+/**
+ *	dev_queue_xmit - transmit a buffer
+ *	@skb: buffer to transmit
+ *
+ *	Queue a buffer for transmission to a network device. The caller must
+ *	have set the device and priority and built the buffer before calling
+ *	this function. The function can be called from an interrupt.
+ *
+ *	A negative errno code is returned on a failure. A success does not
+ *	guarantee the frame will be transmitted as it may be dropped due
+ *	to congestion or traffic shaping.
+ *
+ * -----------------------------------------------------------------------------------
+ *      I notice this method can also return errors from the queue disciplines,
+ *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
+ *      be positive.
+ *
+ *      Regardless of the return value, the skb is consumed, so it is currently
+ *      difficult to retry a send to this method.  (You can bump the ref count
+ *      before sending to hold a reference for retry if you are careful.)
+ *
+ *      When calling this method, interrupts MUST be enabled.  This is because
+ *      the BH enable code must have IRQs enabled so that it will not deadlock.
+ *          --BLG
+ */
+int dev_queue_xmit(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct netdev_queue *txq;
+	struct Qdisc *q;
+	int rc = -ENOMEM;
+
+	/* Disable soft irqs for various locks below. Also
+	 * stops preemption for RCU.
+	 */
+	rcu_read_lock_bh();
+
+	txq = dev_pick_tx(dev, skb);
+	q = rcu_dereference_bh(txq->qdisc);
+
+#ifdef CONFIG_NET_CLS_ACT
+	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+#endif
+	trace_net_dev_queue(skb);
+	/* Normal case: the queue has a real qdisc with an enqueue method */
+	if (q->enqueue) {
+		rc = __dev_xmit_skb(skb, q, dev, txq);
+		goto out;
+	}
+
+	/* The device has no queue. Common case for software devices:
+	   loopback, all the sorts of tunnels...
+
+	   Really, it is unlikely that netif_tx_lock protection is necessary
+	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
+	   counters.)
+	   However, it is possible, that they rely on protection
+	   made by us here.
+
+	   Check this and shot the lock. It is not prone from deadlocks.
+	   Either shot noqueue qdisc, it is even simpler 8)
+	 */
+	if (dev->flags & IFF_UP) {
+		int cpu = smp_processor_id(); /* ok because BHs are off */
+
+		/* This CPU already owning the tx lock means we recursed */
+		if (txq->xmit_lock_owner != cpu) {
+
+			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+				goto recursion_alert;
+
+			HARD_TX_LOCK(dev, txq, cpu);
+
+			if (!netif_tx_queue_stopped(txq)) {
+				__this_cpu_inc(xmit_recursion);
+				rc = dev_hard_start_xmit(skb, dev, txq);
+				__this_cpu_dec(xmit_recursion);
+				if (dev_xmit_complete(rc)) {
+					HARD_TX_UNLOCK(dev, txq);
+					goto out;
+				}
+			}
+			HARD_TX_UNLOCK(dev, txq);
+			if (net_ratelimit())
+				printk(KERN_CRIT "Virtual device %s asks to "
+				       "queue packet!\n", dev->name);
+		} else {
+			/* Recursion is detected! It is possible,
+			 * unfortunately
+			 */
+recursion_alert:
+			if (net_ratelimit())
+				printk(KERN_CRIT "Dead loop on virtual device "
+				       "%s, fix it urgently!\n", dev->name);
+		}
+	}
+
+	/* Device down, queue stopped, or recursion: drop the packet */
+	rc = -ENETDOWN;
+	rcu_read_unlock_bh();
+
+	kfree_skb(skb);
+	return rc;
+out:
+	rcu_read_unlock_bh();
+	return rc;
+}
+EXPORT_SYMBOL(dev_queue_xmit);
+
+
+/*=======================================================================
+			Receiver routines
+  =======================================================================*/
+
+/* Limit on a per-cpu input_pkt_queue length, see enqueue_to_backlog() */
+int netdev_max_backlog __read_mostly = 1000;
+/* When non-zero, netif_rx() calls net_timestamp_check() before queueing */
+int netdev_tstamp_prequeue __read_mostly = 1;
+/* Not referenced in the code visible here (presumably the softirq poll
+ * budget - verify against its user).
+ */
+int netdev_budget __read_mostly = 300;
+int weight_p __read_mostly = 64;            /* old backlog weight */
+
+/*
+ * Add @napi to the tail of @sd's poll list and raise NET_RX_SOFTIRQ.
+ * Called with irq disabled.
+ */
+static inline void ____napi_schedule(struct softnet_data *sd,
+				     struct napi_struct *napi)
+{
+	list_add_tail(&napi->poll_list, &sd->poll_list);
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
+/*
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Returns a non-zero hash number on success
+ * and 0 on failure.
+ *
+ * Only IPv4 and IPv6 are handled; any other protocol, or a header that
+ * cannot be pulled, gives 0.  Addresses and ports are put in a fixed
+ * order first so that both directions of a flow hash to the same value.
+ */
+__u32 __skb_get_rxhash(struct sk_buff *skb)
+{
+	int nhoff, hash = 0, poff;
+	const struct ipv6hdr *ip6;
+	const struct iphdr *ip;
+	u8 ip_proto;
+	u32 addr1, addr2, ihl;
+	union {
+		u32 v32;
+		u16 v16[2];
+	} ports;
+
+	nhoff = skb_network_offset(skb);
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
+			goto done;
+
+		ip = (const struct iphdr *) (skb->data + nhoff);
+		/* Fragments are hashed on addresses only (no ports) */
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		else
+			ip_proto = ip->protocol;
+		addr1 = (__force u32) ip->saddr;
+		addr2 = (__force u32) ip->daddr;
+		/* NOTE(review): ihl comes from the packet and is not range
+		 * checked here; only the pskb_may_pull() below bounds it.
+		 */
+		ihl = ip->ihl;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
+			goto done;
+
+		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
+		/* nexthdr is used as is; extension headers are not walked */
+		ip_proto = ip6->nexthdr;
+		/* Only the last 32 bits of each address enter the hash */
+		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
+		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
+		ihl = (40 >> 2);
+		break;
+	default:
+		goto done;
+	}
+
+	/* Fetch the port pair if this protocol has one and it is readable */
+	ports.v32 = 0;
+	poff = proto_ports_offset(ip_proto);
+	if (poff >= 0) {
+		nhoff += ihl * 4 + poff;
+		if (pskb_may_pull(skb, nhoff + 4)) {
+			ports.v32 = * (__force u32 *) (skb->data + nhoff);
+			if (ports.v16[1] < ports.v16[0])
+				swap(ports.v16[0], ports.v16[1]);
+		}
+	}
+
+	/* get a consistent hash (same value on both flow directions) */
+	if (addr2 < addr1)
+		swap(addr1, addr2);
+
+	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+	/* 0 is reserved for "no hash" */
+	if (!hash)
+		hash = 1;
+
+done:
+	return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share.  It is
+ * RCU-protected; get_rps_cpu() reads it to find the CPU wanted for a flow.
+ */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * Point flow entry @rflow at @next_cpu and, unless @next_cpu is
+ * RPS_NO_CPU, record that CPU's current input_queue_head in last_qtail.
+ *
+ * With CONFIG_RFS_ACCEL the driver may also be asked, through
+ * ndo_rx_flow_steer(), to steer the flow to the hardware queue matching
+ * @next_cpu; on success the flow entry of that queue's table is updated
+ * and returned instead of @rflow.
+ *
+ * Returns the flow entry that now describes the flow.
+ */
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+	    struct rps_dev_flow *rflow, u16 next_cpu)
+{
+	u16 tcpu;
+
+	tcpu = rflow->cpu = next_cpu;
+	if (tcpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+		struct netdev_rx_queue *rxqueue;
+		struct rps_dev_flow_table *flow_table;
+		struct rps_dev_flow *old_rflow;
+		u32 flow_id;
+		u16 rxq_index;
+		int rc;
+
+		/* Should we steer this flow to a different hardware queue? */
+		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
+		    !(dev->features & NETIF_F_NTUPLE))
+			goto out;
+		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+		/* Already arriving on the right queue */
+		if (rxq_index == skb_get_rx_queue(skb))
+			goto out;
+
+		rxqueue = dev->_rx + rxq_index;
+		flow_table = rcu_dereference(rxqueue->rps_flow_table);
+		if (!flow_table)
+			goto out;
+		flow_id = skb->rxhash & flow_table->mask;
+		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+							rxq_index, flow_id);
+		if (rc < 0)
+			goto out;
+		/* Steering accepted: switch to the new queue's flow entry
+		 * and remember the filter id returned by the driver.
+		 */
+		old_rflow = rflow;
+		rflow = &flow_table->flows[flow_id];
+		rflow->cpu = next_cpu;
+		rflow->filter = rc;
+		if (old_rflow->filter == rflow->filter)
+			old_rflow->filter = RPS_NO_FILTER;
+	out:
+#endif
+		rflow->last_qtail =
+			per_cpu(softnet_data, tcpu).input_queue_head;
+	}
+
+	return rflow;
+}
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ *
+ * Returns the chosen CPU, or -1 when no steering applies (no map or flow
+ * table, no usable hash, invalid rx queue, or the candidate CPU is
+ * offline).  *@rflowp is updated only when the CPU was chosen through
+ * the flow table (RFS).
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
+	int cpu = -1;
+	u16 tcpu;
+
+	/* Find the rx queue the packet arrived on (queue 0 if unrecorded) */
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->real_num_rx_queues)) {
+			WARN_ONCE(dev->real_num_rx_queues > 1,
+				  "%s received packet on queue %u, but number "
+				  "of RX queues is %u\n",
+				  dev->name, index, dev->real_num_rx_queues);
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	map = rcu_dereference(rxqueue->rps_map);
+	if (map) {
+		/* Single-CPU map without a flow table: no hash needed */
+		if (map->len == 1 &&
+		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
+			tcpu = map->cpus[0];
+			if (cpu_online(tcpu))
+				cpu = tcpu;
+			goto done;
+		}
+	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+		goto done;
+	}
+
+	skb_reset_network_header(skb);
+	if (!skb_get_rxhash(skb))
+		goto done;
+
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	sock_flow_table = rcu_dereference(rps_sock_flow_table);
+	if (flow_table && sock_flow_table) {
+		u16 next_cpu;
+		struct rps_dev_flow *rflow;
+
+		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+		tcpu = rflow->cpu;
+
+		next_cpu = sock_flow_table->ents[skb->rxhash &
+		    sock_flow_table->mask];
+
+		/*
+		 * If the desired CPU (where last recvmsg was done) is
+		 * different from current CPU (one in the rx-queue flow
+		 * table entry), switch if one of the following holds:
+		 *   - Current CPU is unset (equal to RPS_NO_CPU).
+		 *   - Current CPU is offline.
+		 *   - The current CPU's queue tail has advanced beyond the
+		 *     last packet that was enqueued using this table entry.
+		 *     This guarantees that all previous packets for the flow
+		 *     have been dequeued, thus preserving in order delivery.
+		 */
+		if (unlikely(tcpu != next_cpu) &&
+		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+		      rflow->last_qtail)) >= 0)) {
+			/*
+			 * The flow now belongs to next_cpu, so this very
+			 * packet must be steered there too.  Keeping the old
+			 * tcpu would send it to the stale CPU (or past RFS
+			 * altogether when it was RPS_NO_CPU) and reorder the
+			 * flow.
+			 */
+			tcpu = next_cpu;
+			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+		}
+
+		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+			*rflowp = rflow;
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
+	/* Fall back to plain RPS: scale the hash onto the CPU map */
+	if (map) {
+		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+
+		if (cpu_online(tcpu)) {
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
+done:
+	return cpu;
+}
+
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ *
+ * A filter is kept (%false) only while the flow entry still carries
+ * @filter_id, has a CPU assigned, and that CPU's input_queue_head has
+ * advanced by fewer than 10 * flow_table->mask since the entry's
+ * last_qtail was recorded.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+			 u32 flow_id, u16 filter_id)
+{
+	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_dev_flow *rflow;
+	bool expire = true;
+	int cpu;
+
+	rcu_read_lock();
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	if (flow_table && flow_id <= flow_table->mask) {
+		rflow = &flow_table->flows[flow_id];
+		/* Read the cpu once; it may change concurrently */
+		cpu = ACCESS_ONCE(rflow->cpu);
+		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+			   rflow->last_qtail) <
+		     (int)(10 * flow_table->mask)))
+			expire = false;
+	}
+	rcu_read_unlock();
+	return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
+/*
+ * Called from hardirq (IPI) context.
+ * @data is the softnet_data whose backlog NAPI gets scheduled; its
+ * received_rps counter is incremented as well.
+ */
+static void rps_trigger_softirq(void *data)
+{
+	struct softnet_data *sd = data;
+
+	____napi_schedule(sd, &sd->backlog);
+	sd->received_rps++;
+}
+
+#endif /* CONFIG_RPS */
+
+/*
+ * Test whether @sd belongs to another cpu.
+ * If so, link it into this cpu's rps_ipi_list, raise NET_RX_SOFTIRQ and
+ * return 1.  Otherwise (or without CONFIG_RPS) return 0.
+ */
+static int rps_ipi_queued(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);
+
+	if (sd == local_sd)
+		return 0;
+
+	/* push the remote softnet_data onto our pending-IPI list */
+	sd->rps_ipi_next = local_sd->rps_ipi_list;
+	local_sd->rps_ipi_list = sd;
+
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	return 1;
+#else
+	return 0;
+#endif /* CONFIG_RPS */
+}
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ *
+ * Returns NET_RX_SUCCESS when the skb was queued.  If the queue is
+ * already longer than netdev_max_backlog the skb is freed, the drop
+ * counters are incremented and NET_RX_DROP is returned.  On success the
+ * new queue tail is stored through @qtail by
+ * input_queue_tail_incr_save().
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+			      unsigned int *qtail)
+{
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	sd = &per_cpu(softnet_data, cpu);
+
+	local_irq_save(flags);
+
+	rps_lock(sd);
+	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+		/* Non-empty queue: the backlog is already scheduled */
+		if (skb_queue_len(&sd->input_pkt_queue)) {
+enqueue:
+			__skb_queue_tail(&sd->input_pkt_queue, skb);
+			input_queue_tail_incr_save(sd, qtail);
+			rps_unlock(sd);
+			local_irq_restore(flags);
+			return NET_RX_SUCCESS;
+		}
+
+		/* Schedule NAPI for backlog device
+		 * We can use non atomic operation since we own the queue lock
+		 */
+		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
+			/* A remote cpu is kicked by IPI instead */
+			if (!rps_ipi_queued(sd))
+				____napi_schedule(sd, &sd->backlog);
+		}
+		goto enqueue;
+	}
+
+	/* Queue full: account the drop and free the packet */
+	sd->dropped++;
+	rps_unlock(sd);
+
+	local_irq_restore(flags);
+
+	atomic_long_inc(&skb->dev->rx_dropped);
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/**
+ *	netif_rx	-	post buffer to the network code
+ *	@skb: buffer to post
+ *
+ *	This function receives a packet from a device driver and queues it for
+ *	the upper (protocol) levels to process.  It always succeeds. The buffer
+ *	may be dropped during processing for congestion control or by the
+ *	protocol layers.
+ *
+ *	With CONFIG_RPS the backlog of the CPU picked by get_rps_cpu() is
+ *	used, falling back to the current CPU when it returns a negative
+ *	value; without CONFIG_RPS the current CPU's backlog is always used.
+ *
+ *	return values:
+ *	NET_RX_SUCCESS	(no congestion)
+ *	NET_RX_DROP     (packet was dropped, or netpoll_rx() claimed it)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+	int ret;
+
+	/* if netpoll wants it, pretend we never saw it */
+	if (netpoll_rx(skb))
+		return NET_RX_DROP;
+
+	if (netdev_tstamp_prequeue)
+		net_timestamp_check(skb);
+
+	trace_netif_rx(skb);
+#ifdef CONFIG_RPS
+	{
+		struct rps_dev_flow voidflow, *rflow = &voidflow;
+		int cpu;
+
+		preempt_disable();
+		rcu_read_lock();
+		/* a negative answer from get_rps_cpu() means: use this CPU */
+		cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		if (cpu < 0)
+			cpu = smp_processor_id();
+
+		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+		rcu_read_unlock();
+		preempt_enable();
+	}
+#else
+	{
+		unsigned int qtail;	/* scratch; not read back here */
+		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+		put_cpu();
+	}
+#endif
+	return ret;
+}
+EXPORT_SYMBOL(netif_rx);
+
+/*
+ * netif_rx_ni - netif_rx() followed by a softirq run if one is pending
+ * @skb: buffer to post
+ *
+ * Calls netif_rx() with preemption disabled; if a softirq is pending on
+ * this CPU afterwards, do_softirq() is run before preemption is enabled
+ * again.  Returns the value netif_rx() returned.
+ */
+int netif_rx_ni(struct sk_buff *skb)
+{
+	int status;
+
+	preempt_disable();
+
+	status = netif_rx(skb);
+	if (local_softirq_pending())
+		do_softirq();
+
+	preempt_enable();
+	return status;
+}
+EXPORT_SYMBOL(netif_rx_ni);
+
+static void net_tx_action(struct softirq_action *h)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	/*
+	 * Works on this CPU's softnet_data in two passes: free the skbs
+	 * chained on completion_queue, then service the qdiscs chained on
+	 * output_queue.  @h is not used.
+	 */
+	if (sd->completion_queue) {
+		struct sk_buff *clist;
+
+		local_irq_disable();	/* detach the whole list with IRQs off */
+		clist = sd->completion_queue;
+		sd->completion_queue = NULL;
+		local_irq_enable();
+
+		while (clist) {
+			struct sk_buff *skb = clist;
+			clist = clist->next;
+
+			WARN_ON(atomic_read(&skb->users));	/* warn if users is non-zero */
+			trace_kfree_skb(skb, net_tx_action);
+			__kfree_skb(skb);
+		}
+	}
+
+	if (sd->output_queue) {
+		struct Qdisc *head;
+
+		local_irq_disable();	/* detach the list and reset the tail pointer */
+		head = sd->output_queue;
+		sd->output_queue = NULL;
+		sd->output_queue_tailp = &sd->output_queue;
+		local_irq_enable();
+
+		while (head) {
+			struct Qdisc *q = head;
+			spinlock_t *root_lock;
+
+			head = head->next_sched;
+			/*
+			 * Lock taken: clear SCHED and run the qdisc.
+			 * Lock busy: reschedule the qdisc, unless it has been
+			 * deactivated, in which case only SCHED is cleared.
+			 */
+			root_lock = qdisc_lock(q);
+			if (spin_trylock(root_lock)) {
+				smp_mb__before_clear_bit();
+				clear_bit(__QDISC_STATE_SCHED,
+					  &q->state);
+				qdisc_run(q);
+				spin_unlock(root_lock);
+			} else {
+				if (!test_bit(__QDISC_STATE_DEACTIVATED,
+					      &q->state)) {
+					__netif_reschedule(q);
+				} else {
+					smp_mb__before_clear_bit();
+					clear_bit(__QDISC_STATE_SCHED,
+						  &q->state);
+				}
+			}
+		}
+	}
+}
+
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
+/*
+ * This hook is defined here for ATM LANE.  It is only compiled when both
+ * the bridge and ATM LANE are configured (built in or as modules); the
+ * pointer is exported GPL-only and is not assigned in this part of the file.
+ */
+int (*br_fdb_test_addr_hook)(struct net_device *dev,
+			     unsigned char *addr) __read_mostly;
+EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
+#endif
+
+#ifdef CONFIG_NET_CLS_ACT
+/*
+ * TODO: Maybe we should just force sch_ingress to be compiled in when
+ * CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless instructions
+ * (a compare and 2 stores extra right now) if we don't have it on but
+ * have CONFIG_NET_CLS_ACT.
+ * NOTE: This doesn't stop any functionality; if you don't have the
+ * ingress scheduler, you just can't add policies on ingress.
+ *
+ * ing_filter() bumps the redirect TTL stored in skb->tc_verd, marks the
+ * skb as being at ingress and, unless the queue only has noop_qdisc,
+ * enqueues it on that qdisc under the qdisc lock.  Returns TC_ACT_SHOT
+ * when the TTL already exceeded MAX_RED_LOOP, TC_ACT_OK when nothing was
+ * enqueued, otherwise the result of qdisc_enqueue_root().
+ */
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
+{
+	struct net_device *dev = skb->dev;
+	u32 hops = G_TC_RTTL(skb->tc_verd);
+	struct Qdisc *ingress;
+	spinlock_t *lock;
+	int verdict = TC_ACT_OK;
+
+	if (unlikely(hops > MAX_RED_LOOP)) {
+		if (net_ratelimit())
+			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
+			       skb->skb_iif, dev->ifindex);
+		return TC_ACT_SHOT;
+	}
+	hops++;
+
+	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, hops);
+	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
+
+	ingress = rxq->qdisc;
+	if (ingress == &noop_qdisc)
+		return verdict;
+
+	lock = qdisc_lock(ingress);
+	spin_lock(lock);
+	if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &ingress->state)))
+		verdict = qdisc_enqueue_root(skb, ingress);
+	spin_unlock(lock);
+
+	return verdict;
+}
+
+/*
+ * handle_ing - pass @skb through the ingress qdisc of skb->dev, if any
+ *
+ * When the device has an ingress queue whose qdisc is not noop_qdisc,
+ * the handler pending in *@pt_prev is delivered first (its result goes
+ * to *@ret) and ing_filter() is run.  A TC_ACT_SHOT or TC_ACT_STOLEN
+ * verdict frees the skb and yields NULL; in every other case tc_verd is
+ * cleared and the skb is returned.
+ */
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+					 struct packet_type **pt_prev,
+					 int *ret, struct net_device *orig_dev)
+{
+	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+	int verdict;
+
+	if (rxq && rxq->qdisc != &noop_qdisc) {
+		if (*pt_prev) {
+			*ret = deliver_skb(skb, *pt_prev, orig_dev);
+			*pt_prev = NULL;
+		}
+
+		verdict = ing_filter(skb, rxq);
+		if (verdict == TC_ACT_SHOT || verdict == TC_ACT_STOLEN) {
+			kfree_skb(skb);
+			return NULL;
+		}
+	}
+
+	skb->tc_verd = 0;
+	return skb;
+}
+#endif
+
+/**
+ *	netdev_rx_handler_register - register receive handler
+ *	@dev: device to register a handler for
+ *	@rx_handler: receive handler to register
+ *	@rx_handler_data: data pointer that is used by rx handler
+ *
+ *	Register a receive handler for a device. This handler will then be
+ *	called from __netif_receive_skb. A negative errno code is returned
+ *	on a failure: -EBUSY if the device already has a handler.  Returns
+ *	0 on success.
+ *
+ *	The caller must hold the rtnl_mutex.
+ *
+ *	For a general description of rx_handler, see enum rx_handler_result.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+			       rx_handler_func_t *rx_handler,
+			       void *rx_handler_data)
+{
+	ASSERT_RTNL();
+
+	if (dev->rx_handler)
+		return -EBUSY;
+	/* the data pointer is stored first, the handler itself last */
+	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+	rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ *	netdev_rx_handler_unregister - unregister receive handler
+ *	@dev: device to unregister a handler from
+ *
+ *	Unregister a receive handler from a device.
+ *
+ *	The caller must hold the rtnl_mutex.  May sleep: an RCU grace period
+ *	is waited for between clearing the handler and clearing its data.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+	ASSERT_RTNL();
+	rcu_assign_pointer(dev->rx_handler, NULL);
+	/*
+	 * A reader that saw a non-NULL rx_handler inside an rcu_read_lock()
+	 * section must not find rx_handler_data already cleared, so wait
+	 * for all such readers to finish before clearing the data pointer.
+	 */
+	synchronize_net();
+	rcu_assign_pointer(dev->rx_handler_data, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+	struct packet_type *ptype, *pt_prev;
+	rx_handler_func_t *rx_handler;
+	struct net_device *orig_dev;
+	struct net_device *null_or_dev;
+	bool deliver_exact = false;
+	int ret = NET_RX_DROP;
+	__be16 type;
+	/*
+	 * Receive path for one skb, run under rcu_read_lock(): the skb is
+	 * offered in turn to the ptype_all handlers, the ingress qdisc
+	 * (CONFIG_NET_CLS_ACT), the device's rx_handler, VLAN processing
+	 * and finally the ptype_base handlers matching skb->protocol.
+	 * Delivery is deferred by one entry through pt_prev: a handler is
+	 * only called once the next match (or the end of the path) is
+	 * reached.  ret starts as NET_RX_DROP and is overwritten by each
+	 * delivery that is performed.
+	 */
+	if (!netdev_tstamp_prequeue)
+		net_timestamp_check(skb);
+
+	trace_netif_receive_skb(skb);
+
+	/* if we've gotten here through NAPI, check netpoll */
+	if (netpoll_receive_skb(skb))
+		return NET_RX_DROP;
+
+	if (!skb->skb_iif)
+		skb->skb_iif = skb->dev->ifindex;
+	orig_dev = skb->dev;
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_len(skb);
+
+	pt_prev = NULL;
+
+	rcu_read_lock();
+
+another_round:
+
+	__this_cpu_inc(softnet_data.processed);
+
+	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
+		skb = vlan_untag(skb);
+		if (unlikely(!skb))
+			goto out;
+	}
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (skb->tc_verd & TC_NCLS) {
+		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
+		goto ncls;
+	}
+#endif
+	/* handlers on ptype_all bound to no device or to skb->dev */
+	list_for_each_entry_rcu(ptype, &ptype_all, list) {
+		if (!ptype->dev || ptype->dev == skb->dev) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
+	}
+
+#ifdef CONFIG_NET_CLS_ACT
+	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+	if (!skb)
+		goto out;
+ncls:
+#endif
+
+	rx_handler = rcu_dereference(skb->dev->rx_handler);
+	if (rx_handler) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		switch (rx_handler(&skb)) {
+		case RX_HANDLER_CONSUMED:
+			goto out;
+		case RX_HANDLER_ANOTHER:
+			goto another_round;
+		case RX_HANDLER_EXACT:
+			deliver_exact = true;	/* fall through */
+		case RX_HANDLER_PASS:
+			break;
+		default:
+			BUG();
+		}
+	}
+	/*
+	 * Tagged skb: if vlan_do_receive() reports true the skb is run
+	 * through this function again; if it left skb NULL we are done.
+	 */
+	if (vlan_tx_tag_present(skb)) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		if (vlan_do_receive(&skb)) {
+			ret = __netif_receive_skb(skb);
+			goto out;
+		} else if (unlikely(!skb))
+			goto out;
+	}
+
+	/* deliver only exact match when indicated */
+	null_or_dev = deliver_exact ? skb->dev : NULL;
+
+	type = skb->protocol;
+	list_for_each_entry_rcu(ptype,
+			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
+		if (ptype->type == type &&
+		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
+		     ptype->dev == orig_dev)) {
+			if (pt_prev)
+				ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = ptype;
+		}
+	}
+
+	if (pt_prev) {
+		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+	} else {
+		atomic_long_inc(&skb->dev->rx_dropped);
+		kfree_skb(skb);
+		/* Jamal, now you will not be able to escape explaining
+		 * to me how you were going to use this. :-)
+		 */
+		ret = NET_RX_DROP;
+	}
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+/**
+ *	netif_receive_skb - process receive buffer from network
+ *	@skb: buffer to process
+ *
+ *	netif_receive_skb() is the main receive data processing function.
+ *	It always succeeds. The buffer may be dropped during processing
+ *	for congestion control or by the protocol layers.
+ *
+ *	With CONFIG_RPS, a buffer for which get_rps_cpu() names a CPU is
+ *	queued on that CPU's backlog instead of being processed here.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+	if (netdev_tstamp_prequeue)
+		net_timestamp_check(skb);
+
+	if (skb_defer_rx_timestamp(skb))
+		return NET_RX_SUCCESS;
+
+#ifdef CONFIG_RPS
+	{
+		struct rps_dev_flow voidflow, *rflow = &voidflow;
+		int target_cpu, status;
+
+		rcu_read_lock();
+		target_cpu = get_rps_cpu(skb->dev, skb, &rflow);
+		if (target_cpu < 0) {
+			/* no CPU selected: process the buffer right here */
+			rcu_read_unlock();
+			return __netif_receive_skb(skb);
+		}
+
+		status = enqueue_to_backlog(skb, target_cpu,
+					    &rflow->last_qtail);
+		rcu_read_unlock();
+		return status;
+	}
+#else
+	return __netif_receive_skb(skb);
+#endif
+}
+EXPORT_SYMBOL(netif_receive_skb);
+
+/*
+ * Network device is going away: drop every packet of device @arg that is
+ * still sitting in this CPU's input_pkt_queue or process_queue.
+ * Called with irqs disabled.
+ */
+static void flush_backlog(void *arg)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct net_device *dying = arg;
+	struct sk_buff *pkt, *next;
+
+	rps_lock(sd);
+	skb_queue_walk_safe(&sd->input_pkt_queue, pkt, next) {
+		if (pkt->dev != dying)
+			continue;
+		__skb_unlink(pkt, &sd->input_pkt_queue);
+		kfree_skb(pkt);
+		input_queue_head_incr(sd);
+	}
+	rps_unlock(sd);
+
+	/* the process queue is walked without the rps lock */
+	skb_queue_walk_safe(&sd->process_queue, pkt, next) {
+		if (pkt->dev != dying)
+			continue;
+		__skb_unlink(pkt, &sd->process_queue);
+		kfree_skb(pkt);
+		input_queue_head_incr(sd);
+	}
+}
+
+/*
+ * napi_gro_complete - finish GRO for @skb and pass it up the stack
+ *
+ * An skb whose GRO count is 1 just has gso_size cleared.  Otherwise the
+ * gro_complete hook of the device-less packet type matching
+ * skb->protocol is run; if it fails, or no such type exists, the skb is
+ * freed and NET_RX_SUCCESS returned.  In all other cases the result of
+ * netif_receive_skb() is returned.
+ */
+static int napi_gro_complete(struct sk_buff *skb)
+{
+	__be16 type = skb->protocol;
+	struct list_head *bucket = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	struct packet_type *ptype;
+	int err = -ENOENT;
+
+	if (NAPI_GRO_CB(skb)->count == 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		return netif_receive_skb(skb);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, bucket, list) {
+		if (ptype->type == type && !ptype->dev && ptype->gro_complete) {
+			err = ptype->gro_complete(skb);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!err)
+		return netif_receive_skb(skb);
+
+	/* warn when the loop ended without finding a handler at all */
+	WARN_ON(&ptype->list == bucket);
+	kfree_skb(skb);
+	return NET_RX_SUCCESS;
+}
+
+/*
+ * napi_gro_flush - complete every skb held on @napi's GRO list
+ *
+ * Each held skb is unlinked and handed to napi_gro_complete(); the list
+ * and its counter are reset afterwards.
+ */
+inline void napi_gro_flush(struct napi_struct *napi)
+{
+	struct sk_buff *held = napi->gro_list;
+
+	while (held) {
+		struct sk_buff *following = held->next;
+
+		held->next = NULL;
+		napi_gro_complete(held);
+		held = following;
+	}
+
+	napi->gro_count = 0;
+	napi->gro_list = NULL;
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+	struct sk_buff **pp = NULL;
+	struct packet_type *ptype;
+	__be16 type = skb->protocol;
+	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	int same_flow;
+	int mac_len;
+	enum gro_result ret;
+	/*
+	 * Offer @skb to the gro_receive hook of the device-less packet type
+	 * matching skb->protocol.  Result:
+	 *   GRO_NORMAL      - GRO not attempted or refused (see the gotos to
+	 *                     "normal" below)
+	 *   GRO_MERGED or
+	 *   GRO_MERGED_FREE - the hook flagged @skb as same_flow
+	 *   GRO_HELD        - @skb was put on napi->gro_list
+	 */
+	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
+		goto normal;
+
+	if (skb_is_gso(skb) || skb_has_frag_list(skb))
+		goto normal;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, head, list) {
+		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
+			continue;
+
+		skb_set_network_header(skb, skb_gro_offset(skb));
+		mac_len = skb->network_header - skb->mac_header;
+		skb->mac_len = mac_len;
+		NAPI_GRO_CB(skb)->same_flow = 0;
+		NAPI_GRO_CB(skb)->flush = 0;
+		NAPI_GRO_CB(skb)->free = 0;
+
+		pp = ptype->gro_receive(&napi->gro_list, skb);
+		break;
+	}
+	rcu_read_unlock();
+	/* the walk reached the list head again: no usable handler */
+	if (&ptype->list == head)
+		goto normal;
+
+	same_flow = NAPI_GRO_CB(skb)->same_flow;
+	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+	/* a non-NULL pp names a held skb to unlink and complete now */
+	if (pp) {
+		struct sk_buff *nskb = *pp;
+
+		*pp = nskb->next;
+		nskb->next = NULL;
+		napi_gro_complete(nskb);
+		napi->gro_count--;
+	}
+
+	if (same_flow)
+		goto ok;
+
+	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
+		goto normal;
+	/* hold the skb: link it at the front of the GRO list */
+	napi->gro_count++;
+	NAPI_GRO_CB(skb)->count = 1;
+	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+	skb->next = napi->gro_list;
+	napi->gro_list = skb;
+	ret = GRO_HELD;
+
+pull:
+	if (skb_headlen(skb) < skb_gro_offset(skb)) {
+		int grow = skb_gro_offset(skb) - skb_headlen(skb);
+
+		BUG_ON(skb->end - skb->tail < grow);
+		/* copy the missing header bytes from frag0 to the tail */
+		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+		skb->tail += grow;
+		skb->data_len -= grow;
+
+		skb_shinfo(skb)->frags[0].page_offset += grow;
+		skb_shinfo(skb)->frags[0].size -= grow;
+		/* first fragment emptied: release it and shift the others down */
+		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
+			put_page(skb_shinfo(skb)->frags[0].page);
+			memmove(skb_shinfo(skb)->frags,
+				skb_shinfo(skb)->frags + 1,
+				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
+		}
+	}
+
+ok:
+	return ret;
+
+normal:
+	ret = GRO_NORMAL;
+	goto pull;
+}
+EXPORT_SYMBOL(dev_gro_receive);
+
+/*
+ * __napi_gro_receive - pre-classify the held skbs, then run GRO
+ *
+ * For every skb on napi->gro_list, same_flow is set when device,
+ * vlan_tci and MAC header all equal those of @skb, and flush is cleared.
+ * The MAC headers are compared with compare_ether_header() when
+ * hard_header_len is ETH_HLEN, with memcmp() otherwise.
+ */
+static inline gro_result_t
+__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb->dev->hard_header_len;
+	struct sk_buff *held;
+
+	for (held = napi->gro_list; held; held = held->next) {
+		unsigned long mismatch;
+
+		mismatch = (unsigned long)held->dev ^ (unsigned long)skb->dev;
+		mismatch |= held->vlan_tci ^ skb->vlan_tci;
+		if (hdr_len == ETH_HLEN) {
+			mismatch |= compare_ether_header(skb_mac_header(held),
+							 skb_gro_mac_header(skb));
+		} else if (!mismatch) {
+			mismatch = memcmp(skb_mac_header(held),
+					  skb_gro_mac_header(skb),
+					  hdr_len);
+		}
+		NAPI_GRO_CB(held)->same_flow = !mismatch;
+		NAPI_GRO_CB(held)->flush = 0;
+	}
+
+	return dev_gro_receive(napi, skb);
+}
+
+/*
+ * napi_skb_finish - act on the GRO verdict @ret for @skb
+ *
+ * GRO_NORMAL: pass the skb to netif_receive_skb(); a non-zero result
+ * turns the verdict into GRO_DROP.
+ * GRO_DROP, GRO_MERGED_FREE: free the skb.
+ * GRO_HELD, GRO_MERGED: nothing to do.
+ * Returns the (possibly updated) verdict.
+ */
+gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+{
+	if (ret == GRO_NORMAL) {
+		if (netif_receive_skb(skb))
+			ret = GRO_DROP;
+	} else if (ret == GRO_DROP || ret == GRO_MERGED_FREE) {
+		kfree_skb(skb);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(napi_skb_finish);
+
+/*
+ * skb_gro_reset_offset - reset the GRO parsing state of @skb
+ *
+ * data_offset, frag0 and frag0_len are cleared.  If the MAC header sits
+ * exactly at the tail of the linear area and the first page fragment is
+ * not in high memory, frag0/frag0_len are pointed at that fragment.
+ */
+void skb_gro_reset_offset(struct sk_buff *skb)
+{
+	const skb_frag_t *first = &skb_shinfo(skb)->frags[0];
+
+	NAPI_GRO_CB(skb)->data_offset = 0;
+	NAPI_GRO_CB(skb)->frag0 = NULL;
+	NAPI_GRO_CB(skb)->frag0_len = 0;
+
+	if (skb->mac_header != skb->tail || PageHighMem(first->page))
+		return;
+
+	NAPI_GRO_CB(skb)->frag0 = page_address(first->page) +
+				  first->page_offset;
+	NAPI_GRO_CB(skb)->frag0_len = first->size;
+}
+EXPORT_SYMBOL(skb_gro_reset_offset);
+
+/*
+ * napi_gro_receive - run @skb through GRO and act on the verdict
+ *
+ * Resets the GRO offset state, classifies the skb with
+ * __napi_gro_receive() and returns what napi_skb_finish() makes of it.
+ */
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+	gro_result_t verdict;
+
+	skb_gro_reset_offset(skb);
+	verdict = __napi_gro_receive(napi, skb);
+
+	return napi_skb_finish(verdict, skb);
+}
+EXPORT_SYMBOL(napi_gro_receive);
+
+/*
+ * napi_reuse_skb - recycle @skb as the frags skb cached in @napi
+ *
+ * The linear data is pulled off, the headroom is put back, the per-packet
+ * fields touched here are reset and the skb is stored in napi->skb.
+ */
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+	int reserve;
+
+	__skb_pull(skb, skb_headlen(skb));
+
+	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
+	reserve = NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb);
+	skb_reserve(skb, reserve);
+
+	skb->dev = napi->dev;
+	skb->skb_iif = 0;
+	skb->vlan_tci = 0;
+
+	napi->skb = skb;
+}
+
+/*
+ * napi_get_frags - return the frags skb cached in @napi
+ *
+ * When none is cached, one with GRO_MAX_HEAD bytes is allocated through
+ * netdev_alloc_skb_ip_align() and cached.  Returns NULL if that
+ * allocation fails.
+ */
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+	struct sk_buff *fresh;
+
+	if (napi->skb)
+		return napi->skb;
+
+	fresh = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+	if (fresh)
+		napi->skb = fresh;
+
+	return fresh;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+/*
+ * napi_frags_finish - act on the GRO verdict @ret for a frags skb
+ *
+ * GRO_NORMAL, GRO_HELD: set skb->protocol via eth_type_trans(); a held
+ * skb then gets skb_gro_pull(skb, -ETH_HLEN), a normal one goes to
+ * netif_receive_skb() (non-zero result => GRO_DROP).
+ * GRO_DROP, GRO_MERGED_FREE: the skb is recycled into @napi.
+ * GRO_MERGED: nothing to do.
+ * Returns the (possibly updated) verdict.
+ */
+gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
+			       gro_result_t ret)
+{
+	if (ret == GRO_NORMAL || ret == GRO_HELD) {
+		skb->protocol = eth_type_trans(skb, skb->dev);
+
+		if (ret == GRO_HELD)
+			skb_gro_pull(skb, -ETH_HLEN);
+		else if (netif_receive_skb(skb))
+			ret = GRO_DROP;
+	} else if (ret == GRO_DROP || ret == GRO_MERGED_FREE) {
+		napi_reuse_skb(napi, skb);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(napi_frags_finish);
+
+/*
+ * napi_frags_skb - take the frags skb out of @napi and parse its
+ * Ethernet header for GRO
+ *
+ * Returns the skb with skb->protocol set from the header, or NULL when
+ * the header could not be obtained; in that case the skb is recycled
+ * back into @napi.
+ */
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+	struct sk_buff *skb = napi->skb;
+	struct ethhdr *eth;
+	unsigned int offset;
+	unsigned int needed;
+
+	napi->skb = NULL;
+
+	skb_reset_mac_header(skb);
+	skb_gro_reset_offset(skb);
+
+	offset = skb_gro_offset(skb);
+	needed = offset + sizeof(*eth);
+
+	eth = skb_gro_header_fast(skb, offset);
+	if (skb_gro_header_hard(skb, needed)) {
+		eth = skb_gro_header_slow(skb, needed, offset);
+		if (unlikely(!eth)) {
+			napi_reuse_skb(napi, skb);
+			return NULL;
+		}
+	}
+
+	skb_gro_pull(skb, sizeof(*eth));
+
+	/*
+	 * This works because the only protocols we care about don't require
+	 * special handling.  We'll fix it up properly at the end.
+	 */
+	skb->protocol = eth->h_proto;
+
+	return skb;
+}
+EXPORT_SYMBOL(napi_frags_skb);
+
+/*
+ * napi_gro_frags - run the frags skb cached in @napi through GRO
+ *
+ * Returns GRO_DROP when napi_frags_skb() yields no skb, otherwise the
+ * verdict produced by napi_frags_finish().
+ */
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+	struct sk_buff *skb = napi_frags_skb(napi);
+	gro_result_t verdict;
+
+	if (!skb)
+		return GRO_DROP;
+
+	verdict = __napi_gro_receive(napi, skb);
+	return napi_frags_finish(napi, skb, verdict);
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/*
+ * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
+ * Note: called with local irq disabled, but exits with local irq enabled.
+ */
+static void net_rps_action_and_irq_enable(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	struct softnet_data *remote = sd->rps_ipi_list;
+
+	if (!remote) {
+		local_irq_enable();
+		return;
+	}
+
+	/* take the whole list private before interrupts come back on */
+	sd->rps_ipi_list = NULL;
+	local_irq_enable();
+
+	/* Send pending IPI's to kick RPS processing on remote cpus. */
+	while (remote) {
+		struct softnet_data *following = remote->rps_ipi_next;
+
+		if (cpu_online(remote->cpu))
+			__smp_call_function_single(remote->cpu,
+						   &remote->csd, 0);
+		remote = following;
+	}
+#else
+	local_irq_enable();
+#endif
+}
+
+static int process_backlog(struct napi_struct *napi, int quota)
+{
+	int work = 0;
+	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
+	/*
+	 * Poll routine of the per-CPU backlog NAPI: hands up to @quota skbs
+	 * from process_queue to __netif_receive_skb(), refilling that queue
+	 * from input_pkt_queue.  Returns the number of skbs handled.
+	 */
+#ifdef CONFIG_RPS
+	/* Check if we have pending ipi, its better to send them now,
+	 * not waiting net_rx_action() end.
+	 */
+	if (sd->rps_ipi_list) {
+		local_irq_disable();
+		net_rps_action_and_irq_enable(sd);
+	}
+#endif
+	napi->weight = weight_p;
+	local_irq_disable();
+	while (work < quota) {
+		struct sk_buff *skb;
+		unsigned int qlen;
+
+		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			local_irq_enable();	/* IRQs are on while the skb is processed */
+			__netif_receive_skb(skb);
+			local_irq_disable();
+			input_queue_head_incr(sd);
+			if (++work >= quota) {
+				local_irq_enable();
+				return work;
+			}
+		}
+		/* process_queue is empty: refill it from input_pkt_queue */
+		rps_lock(sd);
+		qlen = skb_queue_len(&sd->input_pkt_queue);
+		if (qlen)
+			skb_queue_splice_tail_init(&sd->input_pkt_queue,
+						   &sd->process_queue);
+
+		if (qlen < quota - work) {
+			/*
+			 * Inline a custom version of __napi_complete().
+			 * only current cpu owns and manipulates this napi,
+			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
+			 * we can use a plain write instead of clear_bit(),
+			 * and we dont need an smp_mb() memory barrier.
+			 */
+			list_del(&napi->poll_list);
+			napi->state = 0;
+			/* shrink quota so the outer loop stops after these skbs */
+			quota = work + qlen;
+		}
+		rps_unlock(sd);
+	}
+	local_irq_enable();
+
+	return work;
+}
+
+/**
+ * __napi_schedule - schedule for receive
+ * @n: entry to schedule
+ *
+ * The entry's receive function will be scheduled to run.  The entry is
+ * handed to ____napi_schedule() for this CPU's softnet_data with local
+ * interrupts disabled.
+ */
+void __napi_schedule(struct napi_struct *n)
+{
+	struct softnet_data *sd;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sd = &__get_cpu_var(softnet_data);
+	____napi_schedule(sd, n);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__napi_schedule);
+
+void __napi_complete(struct napi_struct *n)
+{
+	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));	/* must be scheduled */
+	BUG_ON(n->gro_list);	/* GRO list must already be empty */
+	/* unlink from the poll list, then clear SCHED behind a barrier */
+	list_del(&n->poll_list);
+	smp_mb__before_clear_bit();
+	clear_bit(NAPI_STATE_SCHED, &n->state);
+}
+EXPORT_SYMBOL(__napi_complete);
+
+void napi_complete(struct napi_struct *n)
+{
+	unsigned long flags;
+
+	/*
+	 * napi_complete - flush GRO and take @n off the poll list
+	 *
+	 * don't let napi dequeue from the cpu poll list
+	 * just in case it's running on a different cpu:
+	 * with NAPI_STATE_NPSVC set this call does nothing.
+	 */
+	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+		return;
+	/* GRO is flushed first; __napi_complete() then runs with IRQs off */
+	napi_gro_flush(n);
+	local_irq_save(flags);
+	__napi_complete(n);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(napi_complete);
+
+/*
+ * netif_napi_add - initialise @napi and attach it to @dev
+ * @dev: owning network device
+ * @napi: NAPI context to set up
+ * @poll: poll callback
+ * @weight: initial value of napi->weight
+ *
+ * The context is linked on dev->napi_list and is left with
+ * NAPI_STATE_SCHED set.
+ */
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+		    int (*poll)(struct napi_struct *, int), int weight)
+{
+	INIT_LIST_HEAD(&napi->poll_list);
+
+	/* no GRO state and no cached frags skb yet */
+	napi->gro_list = NULL;
+	napi->gro_count = 0;
+	napi->skb = NULL;
+
+	napi->weight = weight;
+	napi->poll = poll;
+
+	list_add(&napi->dev_list, &dev->napi_list);
+	napi->dev = dev;
+#ifdef CONFIG_NETPOLL
+	spin_lock_init(&napi->poll_lock);
+	napi->poll_owner = -1;
+#endif
+	set_bit(NAPI_STATE_SCHED, &napi->state);
+}
+EXPORT_SYMBOL(netif_napi_add);
+
+/*
+ * netif_napi_del - detach @napi from its device
+ *
+ * Unlinks the context from the device's NAPI list, releases the cached
+ * frags skb through napi_free_frags() and frees every skb still held on
+ * the GRO list.
+ */
+void netif_napi_del(struct napi_struct *napi)
+{
+	struct sk_buff *held;
+
+	list_del_init(&napi->dev_list);
+	napi_free_frags(napi);
+
+	held = napi->gro_list;
+	while (held) {
+		struct sk_buff *following = held->next;
+
+		held->next = NULL;
+		kfree_skb(held);
+		held = following;
+	}
+
+	napi->gro_list = NULL;
+	napi->gro_count = 0;
+}
+EXPORT_SYMBOL(netif_napi_del);
+
+static void net_rx_action(struct softirq_action *h)
+{
+	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	unsigned long time_limit = jiffies + 2;
+	int budget = netdev_budget;
+	void *have;
+	/*
+	 * Polls the NAPI instances on this CPU's poll_list until the list is
+	 * empty, the packet budget is used up or the time limit has passed.
+	 * In the last two cases time_squeeze is bumped and NET_RX_SOFTIRQ
+	 * is raised again.  @h is not used.
+	 */
+	local_irq_disable();
+
+	while (!list_empty(&sd->poll_list)) {
+		struct napi_struct *n;
+		int work, weight;
+
+		/* If softirq window is exhausted then punt.
+		 * Allow this to run for 2 jiffies, which will allow
+		 * an average latency of 1.5/HZ.
+		 */
+		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
+			goto softnet_break;
+
+		local_irq_enable();
+
+		/* Even though interrupts have been re-enabled, this
+		 * access is safe because interrupts can only add new
+		 * entries to the tail of this list, and only ->poll()
+		 * calls can remove this head entry from the list.
+		 */
+		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
+
+		have = netpoll_poll_lock(n);
+
+		weight = n->weight;
+
+		/* This NAPI_STATE_SCHED test is for avoiding a race
+		 * with netpoll's poll_napi().  Only the entity which
+		 * obtains the lock and sees NAPI_STATE_SCHED set will
+		 * actually make the ->poll() call.  Therefore we avoid
+		 * accidentally calling ->poll() when NAPI is not scheduled.
+		 */
+		work = 0;
+		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+			work = n->poll(n, weight);
+			trace_napi_poll(n);
+		}
+
+		WARN_ON_ONCE(work > weight);
+
+		budget -= work;
+
+		local_irq_disable();
+
+		/* Drivers must not modify the NAPI state if they
+		 * consume the entire weight.  In such cases this code
+		 * still "owns" the NAPI instance and therefore can
+		 * move the instance around on the list at-will.
+		 */
+		if (unlikely(work == weight)) {
+			if (unlikely(napi_disable_pending(n))) {
+				local_irq_enable();
+				napi_complete(n);
+				local_irq_disable();
+			} else
+				list_move_tail(&n->poll_list, &sd->poll_list);
+		}
+
+		netpoll_poll_unlock(have);
+	}
+out:
+	net_rps_action_and_irq_enable(sd);	/* also re-enables local IRQs */
+
+#ifdef CONFIG_NET_DMA
+	/*
+	 * There may not be any more sk_buffs coming right now, so push
+	 * any pending DMA copies to hardware
+	 */
+	dma_issue_pending_all();
+#endif
+
+	return;
+
+softnet_break:
+	sd->time_squeeze++;
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	goto out;
+}
+
+static gifconf_func_t *gifconf_list[NPROTO];	/* SIOCGIFCONF handlers, indexed by address family */
+
+/**
+ *	register_gifconf	-	register a SIOCGIF handler
+ *	@family: Address family
+ *	@gifconf: Function handler
+ *
+ *	Register protocol dependent address dumping routines. The handler
+ *	that is passed must not be freed or reused until it has been replaced
+ *	by another handler.
+ *
+ *	Returns 0 on success, -EINVAL if @family is not below NPROTO.
+ */
+int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
+{
+	if (family < NPROTO) {
+		gifconf_list[family] = gifconf;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(register_gifconf);
+
+
+/*
+ *	Map an interface index to its name (SIOCGIFNAME)
+ *
+ *	We need this ioctl for efficient implementation of the
+ *	if_indextoname() function required by the IPv6 API.  Without
+ *	it, we would have to search all the interfaces to find a
+ *	match.  --pb
+ *
+ *	Returns 0 on success, -EFAULT if the user block cannot be copied
+ *	in or out, -ENODEV if no device has the requested index.
+ */
+static int dev_ifname(struct net *net, struct ifreq __user *arg)
+{
+	struct net_device *dev;
+	struct ifreq ifr;
+
+	/* Fetch the caller's info block. */
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		return -EFAULT;
+
+	/* the device is only looked at inside the RCU read section */
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
+	if (dev)
+		strcpy(ifr.ifr_name, dev->name);
+	rcu_read_unlock();
+
+	if (!dev)
+		return -ENODEV;
+
+	return copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+}
+
+/*
+ *	Perform a SIOCGIFCONF call. This structure will change
+ *	size eventually, and there is nothing I can do about it.
+ *	Thus we will need a 'compatibility mode'.
+ *
+ *	Every registered gifconf handler is called for every device of
+ *	@net; the sum of what they report is written back in ifc_len.
+ *	Returns 0 on success and -EFAULT on a copy failure or when a
+ *	handler reports a negative value.
+ */
+static int dev_ifconf(struct net *net, char __user *arg)
+{
+	struct net_device *dev;
+	struct ifconf ifc;
+	char __user *buf;
+	int room;
+	int used = 0;
+	int family;
+
+	/* Fetch the caller's info block. */
+	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
+		return -EFAULT;
+
+	buf = ifc.ifc_buf;
+	room = ifc.ifc_len;
+
+	/* Loop over the interfaces, and write an info block for each. */
+	for_each_netdev(net, dev) {
+		for (family = 0; family < NPROTO; family++) {
+			gifconf_func_t *handler = gifconf_list[family];
+			int done;
+
+			if (!handler)
+				continue;
+
+			/* a NULL user buffer is passed on as (NULL, 0) */
+			if (buf)
+				done = handler(dev, buf + used, room - used);
+			else
+				done = handler(dev, NULL, 0);
+			if (done < 0)
+				return -EFAULT;
+			used += done;
+		}
+	}
+
+	/* All done.  Write the updated control block back to the caller. */
+	ifc.ifc_len = used;
+
+	/* Both BSD and Solaris return 0 here, so we do too. */
+	if (copy_to_user(arg, &ifc, sizeof(struct ifconf)))
+		return -EFAULT;
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ *	This is invoked by the /proc filesystem handler to display a device
+ *	in detail.  Takes the RCU read lock and returns SEQ_START_TOKEN for
+ *	position 0, the *pos'th device of the namespace otherwise, or NULL
+ *	when there are fewer devices than that.
+ */
+void *dev_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct net *net = seq_file_net(seq);
+	struct net_device *dev;
+	loff_t index = 1;
+
+	rcu_read_lock();
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for_each_netdev_rcu(net, dev) {
+		if (index++ == *pos)
+			return dev;
+	}
+
+	return NULL;
+}
+
+/*
+ * Advance the device iterator: after the start token comes the first
+ * device of the namespace, after a device its successor.  *pos is
+ * incremented in either case.
+ */
+void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net_device *following;
+
+	following = (v == SEQ_START_TOKEN) ?
+			first_net_device_rcu(seq_file_net(seq)) :
+			next_net_device_rcu(v);
+
+	++*pos;
+	return following;
+}
+
+void dev_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();	/* pairs with rcu_read_lock() in dev_seq_start() */
+}
+
+/*
+ * Print one line of statistics for @dev: the name followed by eight
+ * receive and eight transmit columns taken from dev_get_stats().  The
+ * rx "drop" and "frame" columns and the tx "carrier" column are sums of
+ * several counters.
+ */
+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
+{
+	struct rtnl_link_stats64 storage;
+	const struct rtnl_link_stats64 *s = dev_get_stats(dev, &storage);
+	unsigned long long rx_drop, rx_frame, tx_carrier;
+
+	rx_drop = s->rx_dropped + s->rx_missed_errors;
+	rx_frame = s->rx_length_errors + s->rx_over_errors +
+		   s->rx_crc_errors + s->rx_frame_errors;
+	tx_carrier = s->tx_carrier_errors + s->tx_aborted_errors +
+		     s->tx_window_errors + s->tx_heartbeat_errors;
+
+	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+		   dev->name, s->rx_bytes, s->rx_packets,
+		   s->rx_errors, rx_drop, s->rx_fifo_errors, rx_frame,
+		   s->rx_compressed, s->multicast,
+		   s->tx_bytes, s->tx_packets,
+		   s->tx_errors, s->tx_dropped,
+		   s->tx_fifo_errors, s->collisions,
+		   tx_carrier, s->tx_compressed);
+}
+
+/*
+ *	Called from the PROCfs module. This now uses the new arbitrary sized
+ *	/proc/net interface to create /proc/net/dev
+ *
+ *	The start token produces the two header lines, any other element
+ *	one statistics line.  Always returns 0.
+ */
+static int dev_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN) {
+		dev_seq_printf_stats(seq, v);
+		return 0;
+	}
+
+	seq_puts(seq, "Inter-|   Receive                            "
+		      "                    |  Transmit\n"
+		      " face |bytes    packets errs drop fifo frame "
+		      "compressed multicast|bytes    packets errs "
+		      "drop fifo colls carrier compressed\n");
+	return 0;
+}
+
+/*
+ * Starting at CPU number *pos, advance *pos to the first online CPU and
+ * return its softnet_data; return NULL (with *pos at nr_cpu_ids or
+ * beyond) when there is none.
+ */
+static struct softnet_data *softnet_get_online(loff_t *pos)
+{
+	for (; *pos < nr_cpu_ids; ++*pos) {
+		if (cpu_online(*pos))
+			return &per_cpu(softnet_data, *pos);
+	}
+
+	return NULL;
+}
+
+/* seq_file start: softnet_data of the first online CPU at or after *pos */
+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct softnet_data *first = softnet_get_online(pos);
+
+	return first;
+}
+
+/* seq_file next: step past the current CPU, then find the next online one */
+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return softnet_get_online(pos);
+}
+
+static void softnet_seq_stop(struct seq_file *seq, void *v)
+{	/* nothing to undo: softnet_seq_start() only calls softnet_get_online() */
+}
+
+static int softnet_seq_show(struct seq_file *seq, void *v)
+{
+	struct softnet_data *sd = v;
+	/*
+	 * Ten hex columns: processed, dropped, time_squeeze, five zero
+	 * placeholders, cpu_collision, received_rps.  Always returns 0.
+	 */
+	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+		   sd->processed, sd->dropped, sd->time_squeeze, 0,
+		   0, 0, 0, 0, /* was fastroute */
+		   sd->cpu_collision, sd->received_rps);
+	return 0;
+}
+
+static const struct seq_operations dev_seq_ops = {	/* device iterator used by dev_seq_open() */
+	.start = dev_seq_start,
+	.next  = dev_seq_next,
+	.stop  = dev_seq_stop,
+	.show  = dev_seq_show,
+};
+
+/*
+ * open() handler of dev_seq_fops: a per-namespace seq_file driven by
+ * dev_seq_ops, with a struct seq_net_private as private data.
+ */
+static int dev_seq_open(struct inode *inode, struct file *file)
+{
+	const int priv_size = sizeof(struct seq_net_private);
+
+	return seq_open_net(inode, file, &dev_seq_ops, priv_size);
+}
+
+static const struct file_operations dev_seq_fops = {	/* registered as "dev" by dev_proc_net_init() */
+	.owner	 = THIS_MODULE,
+	.open    = dev_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+static const struct seq_operations softnet_seq_ops = {	/* one element per online CPU */
+	.start = softnet_seq_start,
+	.next  = softnet_seq_next,
+	.stop  = softnet_seq_stop,
+	.show  = softnet_seq_show,
+};
+
+static int softnet_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &softnet_seq_ops);	/* plain seq_open(): no per-namespace private data */
+}
+
+static const struct file_operations softnet_seq_fops = {	/* registered as "softnet_stat" by dev_proc_net_init() */
+	.owner	 = THIS_MODULE,
+	.open    = softnet_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Return the packet type at position @pos, counting the ptype_all list
+ * first and then the ptype_base hash buckets in index order, or NULL if
+ * @pos is past the last entry.
+ */
+static void *ptype_get_idx(loff_t pos)
+{
+	struct packet_type *entry = NULL;
+	loff_t seen = 0;
+	int bucket;
+
+	list_for_each_entry_rcu(entry, &ptype_all, list) {
+		if (seen == pos)
+			return entry;
+		seen++;
+	}
+
+	for (bucket = 0; bucket < PTYPE_HASH_SIZE; bucket++) {
+		list_for_each_entry_rcu(entry, &ptype_base[bucket], list) {
+			if (seen == pos)
+				return entry;
+			seen++;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file start: take the RCU read lock, then return the start token
+ * for position 0 or the packet type at index *pos - 1.
+ */
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return ptype_get_idx(*pos - 1);
+}
+
+/*
+ * seq_file next: return the packet type following @v.  Entries of type
+ * ETH_P_ALL are followed along ptype_all and then into the first hash
+ * bucket; other entries continue in their own bucket and then in the
+ * following non-empty buckets.  NULL once all buckets are exhausted.
+ */
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct packet_type *cur;
+	struct list_head *link;
+	int bucket;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ptype_get_idx(0);
+
+	cur = v;
+	link = cur->list.next;
+	if (cur->type == htons(ETH_P_ALL)) {
+		if (link != &ptype_all)
+			return list_entry(link, struct packet_type, list);
+		/* end of ptype_all: continue with the first hash bucket */
+		bucket = 0;
+		link = ptype_base[0].next;
+	} else {
+		bucket = ntohs(cur->type) & PTYPE_HASH_MASK;
+	}
+
+	/* skip over bucket heads until a real entry shows up */
+	while (link == &ptype_base[bucket]) {
+		if (++bucket >= PTYPE_HASH_SIZE)
+			return NULL;
+		link = ptype_base[bucket].next;
+	}
+
+	return list_entry(link, struct packet_type, list);
+}
+
+/* seq_file ->stop: drops the RCU read lock taken in ptype_seq_start(). */
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show: print the header for the start token, otherwise one
+ * line per handler. Handlers bound to a device in a different namespace
+ * than the one this seq_file belongs to are silently skipped.
+ * Always returns 0.
+ */
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+	struct packet_type *pt = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Type Device      Function\n");
+		return 0;
+	}
+
+	if (pt->dev != NULL && dev_net(pt->dev) != seq_file_net(seq))
+		return 0;
+
+	if (pt->type != htons(ETH_P_ALL))
+		seq_printf(seq, "%04x", ntohs(pt->type));
+	else
+		seq_puts(seq, "ALL ");
+
+	seq_printf(seq, " %-8s %pF\n",
+		   pt->dev ? pt->dev->name : "", pt->func);
+	return 0;
+}
+
+/* seq_file iterator callbacks wired up by ptype_seq_open(). */
+static const struct seq_operations ptype_seq_ops = {
+	.show  = ptype_seq_show,
+	.start = ptype_seq_start,
+	.next  = ptype_seq_next,
+	.stop  = ptype_seq_stop,
+};
+
+/* open() handler: attaches ptype_seq_ops through the net-aware seq_file open. */
+static int ptype_seq_open(struct inode *inode, struct file *file)
+{
+	const int priv_size = sizeof(struct seq_net_private);
+
+	return seq_open_net(inode, file, &ptype_seq_ops, priv_size);
+}
+
+/* File operations registered for the "ptype" proc entry. */
+static const struct file_operations ptype_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ptype_seq_open,
+	.release = seq_release_net,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+};
+
+
+/*
+ * Per-namespace init: create the "dev", "softnet_stat" and "ptype" proc
+ * entries and run wext_proc_init(). On any failure the entries created so
+ * far are removed again and -ENOMEM is returned; 0 on success.
+ */
+static int __net_init dev_proc_net_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
+		return -ENOMEM;
+	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
+		goto undo_dev;
+	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
+		goto undo_softnet;
+	if (wext_proc_init(net))
+		goto undo_ptype;
+
+	return 0;
+
+undo_ptype:
+	proc_net_remove(net, "ptype");
+undo_softnet:
+	proc_net_remove(net, "softnet_stat");
+undo_dev:
+	proc_net_remove(net, "dev");
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace exit: undo dev_proc_net_init() in reverse order, i.e. the
+ * wext proc state first, then the "ptype", "softnet_stat" and "dev" entries.
+ */
+static void __net_exit dev_proc_net_exit(struct net *net)
+{
+	wext_proc_exit(net);
+
+	proc_net_remove(net, "ptype");
+	proc_net_remove(net, "softnet_stat");
+	proc_net_remove(net, "dev");
+}
+
+/* Per-namespace hooks registered by dev_proc_init(). */
+static struct pernet_operations __net_initdata dev_proc_ops = {
+	.exit = dev_proc_net_exit,
+	.init = dev_proc_net_init,
+};
+
+/* Register dev_proc_ops as a pernet subsystem; returns that call's result. */
+static int __init dev_proc_init(void)
+{
+	int rc = register_pernet_subsys(&dev_proc_ops);
+
+	return rc;
+}
+#else
+#define dev_proc_init() 0
+#endif	/* CONFIG_PROC_FS */
+
+
+/**
+ *	netdev_set_master	-	set up master pointer
+ *	@slave: slave device
+ *	@master: new master device, or %NULL to detach
+ *
+ *	Point @slave at a new master. A non-%NULL @master is refused with
+ *	-EBUSY while the slave still has one; otherwise a reference is taken
+ *	on the new master and the reference on the previous one (if any) is
+ *	dropped. The caller must hold the RTNL semaphore. Returns zero on
+ *	success or a negative errno code.
+ */
+int netdev_set_master(struct net_device *slave, struct net_device *master)
+{
+	struct net_device *prev = slave->master;
+
+	ASSERT_RTNL();
+
+	if (master && prev)
+		return -EBUSY;
+	if (master)
+		dev_hold(master);
+
+	slave->master = master;
+
+	/* Only reachable with a previous master when detaching. */
+	if (prev)
+		dev_put(prev);
+	return 0;
+}
+EXPORT_SYMBOL(netdev_set_master);
+
+/**
+ *	netdev_set_bond_master	-	set up bonding master/slave pair
+ *	@slave: slave device
+ *	@master: new master device, or %NULL to break the bonding
+ *
+ *	Same as netdev_set_master(), and additionally sets or clears
+ *	%IFF_SLAVE on @slave to match and sends %RTM_NEWLINK to the routing
+ *	socket. The caller must hold the RTNL semaphore. Returns zero on
+ *	success or the negative errno code from netdev_set_master().
+ */
+int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
+{
+	int rc;
+
+	ASSERT_RTNL();
+
+	rc = netdev_set_master(slave, master);
+	if (rc)
+		return rc;
+
+	if (!master)
+		slave->flags &= ~IFF_SLAVE;
+	else
+		slave->flags |= IFF_SLAVE;
+
+	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
+	return 0;
+}
+EXPORT_SYMBOL(netdev_set_bond_master);
+
+/*
+ * Tell the driver which rx-related flag bits changed, but only while the
+ * device is up and only if it implements ndo_change_rx_flags.
+ */
+static void dev_change_rx_flags(struct net_device *dev, int flags)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!(dev->flags & IFF_UP))
+		return;
+	if (ops->ndo_change_rx_flags)
+		ops->ndo_change_rx_flags(dev, flags);
+}
+
+/*
+ * Adjust the promiscuity reference count of @dev by @inc and keep
+ * IFF_PROMISC in dev->flags in sync with it (set while the count is
+ * non-zero). When the flag actually flips, the transition is logged,
+ * audited (if auditing is enabled) and passed to the driver through
+ * dev_change_rx_flags(). The caller must hold the RTNL semaphore.
+ *
+ * Returns 0 on success, or -EOVERFLOW when a non-negative @inc leaves the
+ * counter at zero (wrap-around); the counter is then restored.
+ */
+static int __dev_set_promiscuity(struct net_device *dev, int inc)
+{
+	/*
+	 * Snapshot at full int width: a 16-bit copy would drop any flag bit
+	 * above bit 15 and make the "did the flags change" test below
+	 * spuriously true whenever such a bit is set in dev->flags.
+	 */
+	unsigned int old_flags = dev->flags;
+	uid_t uid;
+	gid_t gid;
+
+	ASSERT_RTNL();
+
+	dev->flags |= IFF_PROMISC;
+	dev->promiscuity += inc;
+	if (dev->promiscuity == 0) {
+		/*
+		 * Avoid overflow.
+		 * If inc causes overflow, untouch promisc and return error.
+		 */
+		if (inc < 0)
+			dev->flags &= ~IFF_PROMISC;
+		else {
+			dev->promiscuity -= inc;
+			printk(KERN_WARNING "%s: promiscuity touches roof, "
+				"set promiscuity failed, promiscuity feature "
+				"of device might be broken.\n", dev->name);
+			return -EOVERFLOW;
+		}
+	}
+	if (dev->flags != old_flags) {
+		printk(KERN_INFO "device %s %s promiscuous mode\n",
+		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
+							       "left");
+		if (audit_enabled) {
+			current_uid_gid(&uid, &gid);
+			audit_log(current->audit_context, GFP_ATOMIC,
+				AUDIT_ANOM_PROMISCUOUS,
+				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
+				dev->name, (dev->flags & IFF_PROMISC),
+				(old_flags & IFF_PROMISC),
+				audit_get_loginuid(current),
+				uid, gid,
+				audit_get_sessionid(current));
+		}
+
+		dev_change_rx_flags(dev, IFF_PROMISC);
+	}
+	return 0;
+}
+
+/**
+ *	dev_set_promiscuity	- update promiscuity count on a device
+ *	@dev: device
+ *	@inc: modifier
+ *
+ *	Add or remove promiscuity from a device. While the count in the device
+ *	remains above zero the interface remains promiscuous. Once it hits zero
+ *	the device reverts back to normal filtering operation. A negative inc
+ *	value is used to drop promiscuity on the device.
+ *	When the flags actually changed, the rx mode is re-programmed through
+ *	dev_set_rx_mode().
+ *	Return 0 if successful or a negative errno code on error.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+	/*
+	 * Full-width snapshot: a 16-bit copy would lose flag bits above
+	 * bit 15 and make the comparison below report a change on every call
+	 * for devices that have such a bit set.
+	 */
+	unsigned int old_flags = dev->flags;
+	int err;
+
+	err = __dev_set_promiscuity(dev, inc);
+	if (err < 0)
+		return err;
+	if (dev->flags != old_flags)
+		dev_set_rx_mode(dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+/**
+ *	dev_set_allmulti	- update allmulti count on a device
+ *	@dev: device
+ *	@inc: modifier
+ *
+ *	Add or remove reception of all multicast frames to a device. While the
+ *	count in the device remains above zero the interface keeps receiving
+ *	all multicast frames. Once it hits zero the device reverts back to
+ *	normal filtering operation. A negative @inc value is used to drop the
+ *	counter when releasing a resource needing all multicasts.
+ *	The caller must hold the RTNL semaphore.
+ *	Return 0 if successful or a negative errno code on error.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+	/*
+	 * Full-width snapshot: a 16-bit copy would lose flag bits above
+	 * bit 15 and make the change test below spuriously true whenever
+	 * such a bit is set in dev->flags.
+	 */
+	unsigned int old_flags = dev->flags;
+
+	ASSERT_RTNL();
+
+	dev->flags |= IFF_ALLMULTI;
+	dev->allmulti += inc;
+	if (dev->allmulti == 0) {
+		/*
+		 * Avoid overflow.
+		 * If inc causes overflow, untouch allmulti and return error.
+		 */
+		if (inc < 0)
+			dev->flags &= ~IFF_ALLMULTI;
+		else {
+			dev->allmulti -= inc;
+			printk(KERN_WARNING "%s: allmulti touches roof, "
+				"set allmulti failed, allmulti feature of "
+				"device might be broken.\n", dev->name);
+			return -EOVERFLOW;
+		}
+	}
+	/* Only bother the driver when IFF_ALLMULTI really flipped. */
+	if (dev->flags ^ old_flags) {
+		dev_change_rx_flags(dev, IFF_ALLMULTI);
+		dev_set_rx_mode(dev);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/*
+ *	Push the unicast and multicast address lists down to the device and
+ *	configure RX filtering. Nothing happens while the device is down or
+ *	not present. Drivers with ndo_set_rx_mode handle everything
+ *	themselves; for the others the device is kept promiscuous for as
+ *	long as secondary unicast addresses exist, and the multicast list is
+ *	handed to ndo_set_multicast_list if implemented.
+ */
+void __dev_set_rx_mode(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	bool have_uc;
+
+	/* dev_open will call this function so the list will stay sane. */
+	if (!(dev->flags&IFF_UP) || !netif_device_present(dev))
+		return;
+
+	if (ops->ndo_set_rx_mode) {
+		ops->ndo_set_rx_mode(dev);
+		return;
+	}
+
+	/* Unicast addresses changes may only happen under the rtnl,
+	 * therefore calling __dev_set_promiscuity here is safe.
+	 */
+	have_uc = !netdev_uc_empty(dev);
+	if (have_uc && !dev->uc_promisc) {
+		__dev_set_promiscuity(dev, 1);
+		dev->uc_promisc = 1;
+	} else if (!have_uc && dev->uc_promisc) {
+		__dev_set_promiscuity(dev, -1);
+		dev->uc_promisc = 0;
+	}
+
+	if (ops->ndo_set_multicast_list)
+		ops->ndo_set_multicast_list(dev);
+}
+
+/*
+ *	Locked wrapper: runs __dev_set_rx_mode() between
+ *	netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
+ */
+void dev_set_rx_mode(struct net_device *dev)
+{
+	netif_addr_lock_bh(dev);
+	__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+}
+
+/**
+ *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
+ *	@dev: device
+ *	@cmd: memory area for ethtool_ops::get_settings() result
+ *
+ *	@cmd is zeroed and its ethtool_cmd::cmd field set to ETHTOOL_GSET
+ *	before the driver hook is invoked.
+ *
+ *	Returns whatever ethtool_ops::get_settings() returns, or -EOPNOTSUPP
+ *	when the device has no ethtool_ops or no get_settings() operation
+ *	(in which case @cmd is left untouched).
+ */
+int dev_ethtool_get_settings(struct net_device *dev,
+			     struct ethtool_cmd *cmd)
+{
+	const struct ethtool_ops *eops = dev->ethtool_ops;
+
+	if (!eops || !eops->get_settings)
+		return -EOPNOTSUPP;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->cmd = ETHTOOL_GSET;
+	return eops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(dev_ethtool_get_settings);
+
+/**
+ *	dev_get_flags - get flags reported to userspace
+ *	@dev: device
+ *
+ *	Build the flag word exported through APIs to userspace:
+ *	%IFF_PROMISC and %IFF_ALLMULTI are taken from dev->gflags rather
+ *	than dev->flags, and %IFF_RUNNING, %IFF_LOWER_UP and %IFF_DORMANT
+ *	are derived from the current operational state of a running device.
+ */
+unsigned dev_get_flags(const struct net_device *dev)
+{
+	const unsigned from_gflags = IFF_PROMISC | IFF_ALLMULTI;
+	const unsigned derived = IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT;
+	unsigned flags;
+
+	flags = dev->flags & ~(from_gflags | derived);
+	flags |= dev->gflags & from_gflags;
+
+	if (!netif_running(dev))
+		return flags;
+
+	if (netif_oper_up(dev))
+		flags |= IFF_RUNNING;
+	if (netif_carrier_ok(dev))
+		flags |= IFF_LOWER_UP;
+	if (netif_dormant(dev))
+		flags |= IFF_DORMANT;
+
+	return flags;
+}
+EXPORT_SYMBOL(dev_get_flags);
+
+/*
+ *	Apply a userspace-format flag word to @dev without sending any
+ *	notification. The caller must hold the RTNL semaphore. Returns 0, or
+ *	the error from opening/closing the device when IFF_UP was toggled.
+ */
+int __dev_change_flags(struct net_device *dev, unsigned int flags)
+{
+	/* Bits copied verbatim from the request. */
+	const unsigned int settable = IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
+				      IFF_DYNAMIC | IFF_MULTICAST |
+				      IFF_PORTSEL | IFF_AUTOMEDIA;
+	/* Bits kept from the device; they are handled separately below. */
+	const unsigned int kept = IFF_UP | IFF_VOLATILE | IFF_PROMISC |
+				  IFF_ALLMULTI;
+	int prev_flags = dev->flags;
+	int ret = 0;
+
+	ASSERT_RTNL();
+
+	dev->flags = (flags & settable) | (dev->flags & kept);
+
+	/* Load in the correct multicast list now the flags have changed. */
+	if ((prev_flags ^ flags) & IFF_MULTICAST)
+		dev_change_rx_flags(dev, IFF_MULTICAST);
+
+	dev_set_rx_mode(dev);
+
+	/*
+	 *	IFF_UP is never copied blindly: a requested change is turned
+	 *	into an actual open or close of the device.
+	 */
+	if ((prev_flags ^ flags) & IFF_UP) {
+		if (prev_flags & IFF_UP)
+			ret = __dev_close(dev);
+		else
+			ret = __dev_open(dev);
+
+		if (!ret)
+			dev_set_rx_mode(dev);
+	}
+
+	if ((flags ^ dev->gflags) & IFF_PROMISC) {
+		int step = (flags & IFF_PROMISC) ? 1 : -1;
+
+		dev->gflags ^= IFF_PROMISC;
+		dev_set_promiscuity(dev, step);
+	}
+
+	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
+	   is important. Some (broken) drivers set IFF_PROMISC, when
+	   IFF_ALLMULTI is requested not asking us and not reporting.
+	 */
+	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
+		int step = (flags & IFF_ALLMULTI) ? 1 : -1;
+
+		dev->gflags ^= IFF_ALLMULTI;
+		dev_set_allmulti(dev, step);
+	}
+
+	return ret;
+}
+
+/*
+ *	Raise the netdevice notifications implied by the difference between
+ *	@old_flags and the current dev->flags: NETDEV_UP/NETDEV_DOWN when
+ *	IFF_UP flipped, and NETDEV_CHANGE when the device is up and any bit
+ *	other than IFF_UP, IFF_PROMISC, IFF_ALLMULTI or the volatile ones
+ *	changed.
+ */
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
+{
+	const unsigned int quiet = IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
+				   IFF_VOLATILE;
+	unsigned int changes = dev->flags ^ old_flags;
+
+	if (changes & IFF_UP)
+		call_netdevice_notifiers((dev->flags & IFF_UP) ?
+					 NETDEV_UP : NETDEV_DOWN, dev);
+
+	if (!(dev->flags & IFF_UP))
+		return;
+
+	if (changes & ~quiet)
+		call_netdevice_notifiers(NETDEV_CHANGE, dev);
+}
+
+/**
+ *	dev_change_flags - change device settings
+ *	@dev: device
+ *	@flags: device state flags
+ *
+ *	Change settings on device based state flags. The flags are
+ *	in the userspace exported format. On success an %RTM_NEWLINK message
+ *	is sent if dev->flags changed, and the notifier chain is informed.
+ *	Returns the result of __dev_change_flags(); negative values abort
+ *	before any notification is sent.
+ */
+int dev_change_flags(struct net_device *dev, unsigned flags)
+{
+	int prev = dev->flags;
+	int rc = __dev_change_flags(dev, flags);
+	int delta;
+
+	if (rc < 0)
+		return rc;
+
+	delta = prev ^ dev->flags;
+	if (delta)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, delta);
+
+	__dev_notify_flags(dev, prev);
+	return rc;
+}
+EXPORT_SYMBOL(dev_change_flags);
+
+/**
+ *	dev_set_mtu - Change maximum transfer unit
+ *	@dev: device
+ *	@new_mtu: new transfer unit
+ *
+ *	Change the maximum transfer size of the network device. Setting the
+ *	current value again is a no-op. Returns 0 on success, -EINVAL for a
+ *	negative @new_mtu, -ENODEV when the device is not present, or the
+ *	error from the driver's ndo_change_mtu. %NETDEV_CHANGEMTU is raised
+ *	on success if the device is up.
+ */
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int err = 0;
+
+	if (new_mtu == dev->mtu)
+		return 0;
+
+	/*	MTU must be non-negative.	 */
+	if (new_mtu < 0)
+		return -EINVAL;
+
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	if (!ops->ndo_change_mtu)
+		dev->mtu = new_mtu;
+	else
+		err = ops->ndo_change_mtu(dev, new_mtu);
+
+	if (err)
+		return err;
+
+	if (dev->flags & IFF_UP)
+		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_set_mtu);
+
+/**
+ *	dev_set_group - Change group this device belongs to
+ *	@dev: device
+ *	@new_group: group this device should belong to
+ *
+ *	Stores @new_group in dev->group. No validation is performed and no
+ *	notification is sent from here.
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+	dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
+ *	dev_set_mac_address - Change Media Access Control Address
+ *	@dev: device
+ *	@sa: new address
+ *
+ *	Change the hardware (MAC) address of the device through the
+ *	driver's ndo_set_mac_address and raise %NETDEV_CHANGEADDR on success.
+ *	Returns -EOPNOTSUPP when the driver lacks the operation, -EINVAL when
+ *	@sa's family does not match the device type, -ENODEV when the device
+ *	is not present, or the driver's result otherwise.
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int err;
+
+	if (!ops->ndo_set_mac_address)
+		return -EOPNOTSUPP;
+	if (sa->sa_family != dev->type)
+		return -EINVAL;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	err = ops->ndo_set_mac_address(dev, sa);
+	if (err)
+		return err;
+
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
+
+/*
+ *	Perform the read-only SIOCGIFxxx calls, inside rcu_read_lock().
+ *	The device is looked up by ifr->ifr_name and the answer is stored
+ *	into @ifr. Returns 0 on success, -ENODEV for an unknown device,
+ *	-EINVAL for SIOCGIFSLAVE and -ENOTTY (with a warning) for any
+ *	command dev_ioctl() should not have routed here.
+ */
+static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
+	size_t hwlen;
+
+	if (!dev)
+		return -ENODEV;
+
+	switch (cmd) {
+	case SIOCGIFFLAGS:	/* Get interface flags */
+		ifr->ifr_flags = (short) dev_get_flags(dev);
+		break;
+
+	case SIOCGIFMETRIC:	/* Get the metric on the interface
+				   (currently unused) */
+		ifr->ifr_metric = 0;
+		break;
+
+	case SIOCGIFMTU:	/* Get the MTU of a device */
+		ifr->ifr_mtu = dev->mtu;
+		break;
+
+	case SIOCGIFHWADDR:
+		if (dev->addr_len) {
+			/* Copy at most what fits into sa_data. */
+			hwlen = min(sizeof ifr->ifr_hwaddr.sa_data,
+				    (size_t) dev->addr_len);
+			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, hwlen);
+		} else {
+			memset(ifr->ifr_hwaddr.sa_data, 0,
+			       sizeof ifr->ifr_hwaddr.sa_data);
+		}
+		ifr->ifr_hwaddr.sa_family = dev->type;
+		break;
+
+	case SIOCGIFSLAVE:
+		return -EINVAL;
+
+	case SIOCGIFMAP:
+		ifr->ifr_map.mem_start = dev->mem_start;
+		ifr->ifr_map.mem_end   = dev->mem_end;
+		ifr->ifr_map.base_addr = dev->base_addr;
+		ifr->ifr_map.irq       = dev->irq;
+		ifr->ifr_map.dma       = dev->dma;
+		ifr->ifr_map.port      = dev->if_port;
+		break;
+
+	case SIOCGIFINDEX:
+		ifr->ifr_ifindex = dev->ifindex;
+		break;
+
+	case SIOCGIFTXQLEN:
+		ifr->ifr_qlen = dev->tx_queue_len;
+		break;
+
+	default:
+		/* dev_ioctl() should ensure this case
+		 * is never reached
+		 */
+		WARN_ON(1);
+		return -ENOTTY;
+	}
+	return 0;
+}
+
+/*
+ *	Perform the state-changing SIOCxIFxxx calls, inside rtnl_lock().
+ *	The device is looked up by ifr->ifr_name (-ENODEV if unknown).
+ *	Commands not handled here are either forwarded to the driver's
+ *	ndo_do_ioctl (private, bonding, MII, bridge, timestamping and WAN
+ *	requests) or rejected with -EINVAL.
+ */
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+{
+	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+	const struct net_device_ops *ops;
+	bool forward;
+
+	if (!dev)
+		return -ENODEV;
+
+	ops = dev->netdev_ops;
+
+	switch (cmd) {
+	case SIOCSIFFLAGS:	/* Set interface flags */
+		return dev_change_flags(dev, ifr->ifr_flags);
+
+	case SIOCSIFMETRIC:	/* Set the metric on the interface
+				   (currently unused) */
+		return -EOPNOTSUPP;
+
+	case SIOCSIFMTU:	/* Set the MTU of a device */
+		return dev_set_mtu(dev, ifr->ifr_mtu);
+
+	case SIOCSIFHWADDR:
+		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+
+	case SIOCSIFHWBROADCAST:
+		if (ifr->ifr_hwaddr.sa_family != dev->type)
+			return -EINVAL;
+		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
+		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+		return 0;
+
+	case SIOCSIFMAP:
+		if (!ops->ndo_set_config)
+			return -EOPNOTSUPP;
+		if (!netif_device_present(dev))
+			return -ENODEV;
+		return ops->ndo_set_config(dev, &ifr->ifr_map);
+
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		/* Needs a driver able to program a multicast filter. */
+		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
+			return -EINVAL;
+		if (!netif_device_present(dev))
+			return -ENODEV;
+		if (cmd == SIOCADDMULTI)
+			return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+
+	case SIOCSIFTXQLEN:
+		if (ifr->ifr_qlen < 0)
+			return -EINVAL;
+		dev->tx_queue_len = ifr->ifr_qlen;
+		return 0;
+
+	case SIOCSIFNAME:
+		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
+		return dev_change_name(dev, ifr->ifr_newname);
+
+	default:
+		break;
+	}
+
+	/*
+	 *	Unknown or private ioctl
+	 */
+	forward = (cmd >= SIOCDEVPRIVATE &&
+		   cmd <= SIOCDEVPRIVATE + 15) ||
+		  cmd == SIOCBONDENSLAVE ||
+		  cmd == SIOCBONDRELEASE ||
+		  cmd == SIOCBONDSETHWADDR ||
+		  cmd == SIOCBONDSLAVEINFOQUERY ||
+		  cmd == SIOCBONDINFOQUERY ||
+		  cmd == SIOCBONDCHANGEACTIVE ||
+		  cmd == SIOCGMIIPHY ||
+		  cmd == SIOCGMIIREG ||
+		  cmd == SIOCSMIIREG ||
+		  cmd == SIOCBRADDIF ||
+		  cmd == SIOCBRDELIF ||
+		  cmd == SIOCSHWTSTAMP ||
+		  cmd == SIOCWANDEV;
+	if (!forward)
+		return -EINVAL;
+
+	if (!ops->ndo_do_ioctl)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+	return ops->ndo_do_ioctl(dev, ifr, cmd);
+}
+
+/*
+ *	This function handles all "interface"-type I/O control requests. The
+ *	actual 'doing' part of this is dev_ifsioc (called under rtnl_lock)
+ *	and dev_ifsioc_locked (called under rcu_read_lock) above.
+ */
+
+/**
+ *	dev_ioctl	-	network device ioctl
+ *	@net: the applicable net namespace
+ *	@cmd: command to issue
+ *	@arg: pointer to a struct ifreq in user space
+ *
+ *	Issue ioctl functions to devices. This is normally called by the
+ *	user space syscall interfaces but can sometimes be useful for
+ *	other purposes. The return value is the return from the syscall if
+ *	positive or a negative errno code on error. -EFAULT is returned when
+ *	the request cannot be copied from or back to user space, -EPERM when
+ *	a privileged command is issued without CAP_NET_ADMIN, and -ENOTTY
+ *	for commands that are not recognised here.
+ */
+
+int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct ifreq ifr;
+	int ret;
+	char *colon;
+
+	/* One special case: SIOCGIFCONF takes ifconf argument
+	   and requires shared lock, because it sleeps writing
+	   to user space. It is dispatched here, under rtnl_lock(),
+	   before any ifreq is copied in. SIOCGIFNAME likewise gets
+	   the raw user pointer.
+	 */
+
+	if (cmd == SIOCGIFCONF) {
+		rtnl_lock();
+		ret = dev_ifconf(net, (char __user *) arg);
+		rtnl_unlock();
+		return ret;
+	}
+	if (cmd == SIOCGIFNAME)
+		return dev_ifname(net, (struct ifreq __user *)arg);
+
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		return -EFAULT;
+
+	ifr.ifr_name[IFNAMSIZ-1] = 0;
+
+	colon = strchr(ifr.ifr_name, ':');
+	if (colon)
+		*colon = 0;
+
+	/*
+	 *	See which interface the caller is talking about.
+	 *	At this point the name is NUL-terminated and cut at the
+	 *	first ':'; the branches that copy the ifreq back to user
+	 *	space restore the ':' first (the default branch does not).
+	 */
+
+	switch (cmd) {
+	/*
+	 *	These ioctl calls:
+	 *	- can be done by all.
+	 *	- atomic and do not require locking.
+	 *	- return a value
+	 */
+	case SIOCGIFFLAGS:
+	case SIOCGIFMETRIC:
+	case SIOCGIFMTU:
+	case SIOCGIFHWADDR:
+	case SIOCGIFSLAVE:
+	case SIOCGIFMAP:
+	case SIOCGIFINDEX:
+	case SIOCGIFTXQLEN:
+		dev_load(net, ifr.ifr_name);
+		rcu_read_lock();
+		ret = dev_ifsioc_locked(net, &ifr, cmd);
+		rcu_read_unlock();
+		if (!ret) {
+			if (colon)
+				*colon = ':';
+			if (copy_to_user(arg, &ifr,
+					 sizeof(struct ifreq)))
+				ret = -EFAULT;
+		}
+		return ret;
+
+	case SIOCETHTOOL:
+		dev_load(net, ifr.ifr_name);
+		rtnl_lock();
+		ret = dev_ethtool(net, &ifr);
+		rtnl_unlock();
+		if (!ret) {
+			if (colon)
+				*colon = ':';
+			if (copy_to_user(arg, &ifr,
+					 sizeof(struct ifreq)))
+				ret = -EFAULT;
+		}
+		return ret;
+
+	/*
+	 *	These ioctl calls:
+	 *	- require superuser power.
+	 *	- require strict serialization.
+	 *	- return a value
+	 */
+	case SIOCGMIIPHY:
+	case SIOCGMIIREG:
+	case SIOCSIFNAME:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		dev_load(net, ifr.ifr_name);
+		rtnl_lock();
+		ret = dev_ifsioc(net, &ifr, cmd);
+		rtnl_unlock();
+		if (!ret) {
+			if (colon)
+				*colon = ':';
+			if (copy_to_user(arg, &ifr,
+					 sizeof(struct ifreq)))
+				ret = -EFAULT;
+		}
+		return ret;
+
+	/*
+	 *	These ioctl calls:
+	 *	- require superuser power.
+	 *	- require strict serialization.
+	 *	- do not return a value
+	 */
+	case SIOCSIFFLAGS:
+	case SIOCSIFMETRIC:
+	case SIOCSIFMTU:
+	case SIOCSIFMAP:
+	case SIOCSIFHWADDR:
+	case SIOCSIFSLAVE:
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+	case SIOCSIFHWBROADCAST:
+	case SIOCSIFTXQLEN:
+	case SIOCSMIIREG:
+	case SIOCBONDENSLAVE:
+	case SIOCBONDRELEASE:
+	case SIOCBONDSETHWADDR:
+	case SIOCBONDCHANGEACTIVE:
+	case SIOCBRADDIF:
+	case SIOCBRDELIF:
+	case SIOCSHWTSTAMP:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		/* fall through: the two query commands below skip the
+		 * capability check but share the rtnl-locked dispatch */
+	case SIOCBONDSLAVEINFOQUERY:
+	case SIOCBONDINFOQUERY:
+		dev_load(net, ifr.ifr_name);
+		rtnl_lock();
+		ret = dev_ifsioc(net, &ifr, cmd);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCGIFMEM:
+		/* Get the per device memory space. We can add this but
+		 * currently do not support it */
+	case SIOCSIFMEM:
+		/* Set the per device memory buffer space.
+		 * Not applicable in our case */
+	case SIOCSIFLINK:
+		return -ENOTTY;
+
+	/*
+	 *	Unknown or private ioctl.
+	 */
+	default:
+		if (cmd == SIOCWANDEV ||
+		    (cmd >= SIOCDEVPRIVATE &&
+		     cmd <= SIOCDEVPRIVATE + 15)) {
+			dev_load(net, ifr.ifr_name);
+			rtnl_lock();
+			ret = dev_ifsioc(net, &ifr, cmd);
+			rtnl_unlock();
+			if (!ret && copy_to_user(arg, &ifr,
+						 sizeof(struct ifreq)))
+				ret = -EFAULT;
+			return ret;
+		}
+		/* Take care of Wireless Extensions */
+		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
+			return wext_handle_ioctl(net, &ifr, cmd, arg);
+		return -ENOTTY;
+	}
+}
+
+
+/**
+ *	dev_new_index	-	allocate an ifindex
+ *	@net: the applicable net namespace
+ *
+ *	Returns a suitable unique value for a new device interface
+ *	number: a file-wide counter is advanced (wrapping back to 1 before
+ *	it could become non-positive) until a value is found that no device
+ *	in @net uses. The caller must hold the rtnl semaphore or the
+ *	dev_base_lock to be sure it remains unique.
+ */
+static int dev_new_index(struct net *net)
+{
+	static int ifindex;
+
+	do {
+		if (++ifindex <= 0)
+			ifindex = 1;
+	} while (__dev_get_by_index(net, ifindex));
+
+	return ifindex;
+}
+
+/* Delayed registration/unregistration */
+static LIST_HEAD(net_todo_list);
+
+/* Queue @dev at the tail of net_todo_list (linked through dev->todo_list). */
+static void net_set_todo(struct net_device *dev)
+{
+	list_add_tail(&dev->todo_list, &net_todo_list);
+}
+
+/*
+ * Tear down every device linked on @head through unreg_list. The caller
+ * must hold the RTNL semaphore and the boot phase must be over.
+ *
+ * Devices that were never registered are warned about and dropped from
+ * the list. The remaining ones are closed, unlinked from the device
+ * chain, have their qdiscs shut down, are announced via
+ * NETDEV_UNREGISTER (and RTM_DELLINK where applicable), get their
+ * address lists flushed and their kobjects removed. A single
+ * NETDEV_UNREGISTER_BATCH notification follows, and finally one
+ * reference is dropped on each device.
+ */
+static void rollback_registered_many(struct list_head *head)
+{
+	struct net_device *dev, *tmp;
+
+	BUG_ON(dev_boot_phase);
+	ASSERT_RTNL();
+
+	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+		/* Some devices call without registering
+		 * for initialization unwind. Remove those
+		 * devices and proceed with the remaining.
+		 */
+		if (dev->reg_state == NETREG_UNINITIALIZED) {
+			pr_debug("unregister_netdevice: device %s/%p never "
+				 "was registered\n", dev->name, dev);
+
+			WARN_ON(1);
+			list_del(&dev->unreg_list);
+			continue;
+		}
+		dev->dismantle = true;
+		BUG_ON(dev->reg_state != NETREG_REGISTERED);
+	}
+
+	/* If device is running, close it first. */
+	dev_close_many(head);
+
+	list_for_each_entry(dev, head, unreg_list) {
+		/* And unlink it from device chain. */
+		unlist_netdevice(dev);
+
+		dev->reg_state = NETREG_UNREGISTERING;
+	}
+
+	synchronize_net();
+
+	list_for_each_entry(dev, head, unreg_list) {
+		/* Shutdown queueing discipline. */
+		dev_shutdown(dev);
+
+
+		/* Notify protocols, that we are about to destroy
+		   this device. They should clean all the things.
+		*/
+		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+		if (!dev->rtnl_link_ops ||
+		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
+
+		/*
+		 *	Flush the unicast and multicast chains
+		 */
+		dev_uc_flush(dev);
+		dev_mc_flush(dev);
+
+		if (dev->netdev_ops->ndo_uninit)
+			dev->netdev_ops->ndo_uninit(dev);
+
+		/* Notifier chain MUST detach us from master device. */
+		WARN_ON(dev->master);
+
+		/* Remove entries from kobject tree */
+		netdev_unregister_kobject(dev);
+	}
+
+	/* Process any work delayed until the end of the batch.
+	 * The first loop may have removed every entry (all devices were
+	 * unregistered-but-never-registered); list_first_entry() on an
+	 * empty list would hand the notifiers a pointer computed from the
+	 * list head itself, so only notify when a device is left.
+	 */
+	if (!list_empty(head)) {
+		dev = list_first_entry(head, struct net_device, unreg_list);
+		call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+	}
+
+	rcu_barrier();
+
+	list_for_each_entry(dev, head, unreg_list)
+		dev_put(dev);
+}
+
+/*
+ * Single-device convenience wrapper: links @dev onto a temporary on-stack
+ * list, runs rollback_registered_many() on it and unlinks the list head
+ * again afterwards.
+ */
+static void rollback_registered(struct net_device *dev)
+{
+	LIST_HEAD(single);
+
+	list_add(&dev->unreg_list, &single);
+	rollback_registered_many(&single);
+	list_del(&single);
+}
+
+/*
+ * netdev_fix_features - remove inconsistent bits from a feature mask
+ * @dev: device, used only for log messages
+ * @features: requested feature mask
+ *
+ * Resolves conflicting checksum settings and drops every offload whose
+ * prerequisite (checksumming, scatter/gather, TSO) is absent from
+ * @features. Returns the cleaned-up mask.
+ */
+u32 netdev_fix_features(struct net_device *dev, u32 features)
+{
+	const u32 proto_csum = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+	/* Fix illegal checksum combinations */
+	if ((features & NETIF_F_HW_CSUM) && (features & proto_csum)) {
+		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
+		features &= ~proto_csum;
+	}
+
+	if ((features & NETIF_F_NO_CSUM) &&
+	    (features & (NETIF_F_HW_CSUM | proto_csum))) {
+		netdev_warn(dev, "mixed no checksumming and other settings.\n");
+		features &= ~(proto_csum | NETIF_F_HW_CSUM);
+	}
+
+	/* Fix illegal SG+CSUM combinations. */
+	if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) {
+		netdev_dbg(dev,
+			"Dropping NETIF_F_SG since no checksum feature.\n");
+		features &= ~NETIF_F_SG;
+	}
+
+	/* TSO requires that SG is present as well. */
+	if (!(features & NETIF_F_SG) && (features & NETIF_F_ALL_TSO)) {
+		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
+		features &= ~NETIF_F_ALL_TSO;
+	}
+
+	/* TSO ECN requires that TSO is present as well. */
+	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
+		features &= ~NETIF_F_TSO_ECN;
+
+	/* Software GSO depends on SG. */
+	if (!(features & NETIF_F_SG) && (features & NETIF_F_GSO)) {
+		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
+		features &= ~NETIF_F_GSO;
+	}
+
+	/* UFO needs SG and checksumming */
+	if (features & NETIF_F_UFO) {
+		/* maybe split UFO into V4 and V6? */
+		if (!(features & NETIF_F_GEN_CSUM) &&
+		    (features & proto_csum) != proto_csum) {
+			netdev_dbg(dev,
+				"Dropping NETIF_F_UFO since no checksum offload features.\n");
+			features &= ~NETIF_F_UFO;
+		}
+
+		if (!(features & NETIF_F_SG)) {
+			netdev_dbg(dev,
+				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
+			features &= ~NETIF_F_UFO;
+		}
+	}
+
+	return features;
+}
+EXPORT_SYMBOL(netdev_fix_features);
+
+/*
+ * Recompute the feature set of @dev from its wanted features, letting the
+ * driver (ndo_fix_features) and then netdev_fix_features() veto bits, and
+ * hand the result to ndo_set_features if it differs from dev->features.
+ * The caller must hold the RTNL semaphore.
+ *
+ * Returns 0 when nothing changed, -1 when the driver rejected the new set,
+ * and 1 otherwise. dev->features is only updated here when the driver
+ * hook returned exactly 0 (or is absent).
+ */
+int __netdev_update_features(struct net_device *dev)
+{
+	u32 wanted;
+	int rc = 0;
+
+	ASSERT_RTNL();
+
+	wanted = netdev_get_wanted_features(dev);
+
+	if (dev->netdev_ops->ndo_fix_features)
+		wanted = dev->netdev_ops->ndo_fix_features(dev, wanted);
+
+	/* driver might be less strict about feature dependencies */
+	wanted = netdev_fix_features(dev, wanted);
+
+	if (wanted == dev->features)
+		return 0;
+
+	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
+		dev->features, wanted);
+
+	if (dev->netdev_ops->ndo_set_features)
+		rc = dev->netdev_ops->ndo_set_features(dev, wanted);
+
+	if (unlikely(rc < 0)) {
+		netdev_err(dev,
+			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
+			rc, wanted, dev->features);
+		return -1;
+	}
+
+	if (rc == 0)
+		dev->features = wanted;
+
+	return 1;
+}
+
+/**
+ *	netdev_update_features - recalculate device features
+ *	@dev: the device to check
+ *
+ *	Recalculate dev->features set and send notifications unless
+ *	__netdev_update_features() reports that nothing changed. Should be
+ *	called after driver or hardware dependent conditions might have
+ *	changed that influence the features.
+ */
+void netdev_update_features(struct net_device *dev)
+{
+	if (!__netdev_update_features(dev))
+		return;
+
+	netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_update_features);
+
+/**
+ *	netdev_change_features - recalculate device features
+ *	@dev: the device to check
+ *
+ *	Recalculate dev->features set and send notifications even
+ *	if they have not changed. Should be called instead of
+ *	netdev_update_features() if also dev->vlan_features might
+ *	have changed to allow the changes to be propagated to stacked
+ *	VLAN devices. The result of __netdev_update_features() is
+ *	deliberately not looked at here.
+ */
+void netdev_change_features(struct net_device *dev)
+{
+	__netdev_update_features(dev);
+	netdev_features_change(dev);
+}
+EXPORT_SYMBOL(netdev_change_features);
+
+/**
+ *	netif_stacked_transfer_operstate -	transfer operstate
+ *	@rootdev: the root or lower level device to transfer state from
+ *	@dev: the device to transfer operstate to
+ *
+ *	Mirror the dormant and carrier state of @rootdev onto @dev. The
+ *	carrier is only switched when the two devices disagree. This is
+ *	normally called when a stacking relationship exists between the
+ *	root device and the device (a leaf device).
+ */
+void netif_stacked_transfer_operstate(const struct net_device *rootdev,
+					struct net_device *dev)
+{
+	if (rootdev->operstate != IF_OPER_DORMANT)
+		netif_dormant_off(dev);
+	else
+		netif_dormant_on(dev);
+
+	if (netif_carrier_ok(rootdev)) {
+		if (!netif_carrier_ok(dev))
+			netif_carrier_on(dev);
+		return;
+	}
+
+	if (netif_carrier_ok(dev))
+		netif_carrier_off(dev);
+}
+EXPORT_SYMBOL(netif_stacked_transfer_operstate);
+
+#ifdef CONFIG_RPS
+/*
+ * Allocate the zeroed dev->_rx array with dev->num_rx_queues entries and
+ * point each entry back at @dev. Returns 0, or -ENOMEM if the allocation
+ * fails. A queue count of zero is a bug.
+ */
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+	unsigned int count = dev->num_rx_queues;
+	struct netdev_rx_queue *rx, *q;
+
+	BUG_ON(count < 1);
+
+	rx = kcalloc(count, sizeof(*rx), GFP_KERNEL);
+	if (!rx) {
+		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
+		return -ENOMEM;
+	}
+	dev->_rx = rx;
+
+	for (q = rx; q < rx + count; q++)
+		q->dev = dev;
+	return 0;
+}
+#endif
+
+/*
+ * Per-queue initialiser passed to netdev_for_each_tx_queue(): binds the
+ * queue to @dev, sets up its xmit lock (with a lockdep class chosen by
+ * device type), marks the lock as unowned and the NUMA node as unset.
+ */
+static void netdev_init_one_queue(struct net_device *dev,
+				  struct netdev_queue *queue, void *_unused)
+{
+	queue->dev = dev;
+	queue->xmit_lock_owner = -1;
+
+	/* Initialize queue lock */
+	spin_lock_init(&queue->_xmit_lock);
+	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+
+	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
+}
+
+/*
+ * Allocate the zeroed dev->_tx array with dev->num_tx_queues entries,
+ * initialise every queue through netdev_init_one_queue() and set up
+ * dev->tx_global_lock. Returns 0, or -ENOMEM if the allocation fails.
+ * A queue count of zero is a bug.
+ */
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+	unsigned int count = dev->num_tx_queues;
+	struct netdev_queue *queues;
+
+	BUG_ON(count < 1);
+
+	queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
+	if (queues == NULL) {
+		pr_err("netdev: Unable to allocate %u tx queues.\n",
+		       count);
+		return -ENOMEM;
+	}
+
+	dev->_tx = queues;
+	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+	spin_lock_init(&dev->tx_global_lock);
+
+	return 0;
+}
+
+/**
+ *	register_netdevice	- register a network device
+ *	@dev: device to register
+ *
+ *	Take a completed network device structure and add it to the kernel
+ *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ *	chain. 0 is returned on success. A negative errno code is returned
+ *	on a failure to set up the device, or if the name is a duplicate.
+ *
+ *	Callers must hold the rtnl semaphore. You may want
+ *	register_netdev() instead of this.
+ *
+ *	BUGS:
+ *	The locking appears insufficient to guarantee two parallel registers
+ *	will not get the same name.
+ */
+
+int register_netdevice(struct net_device *dev)
+{
+	int ret;
+	struct net *net = dev_net(dev);
+
+	BUG_ON(dev_boot_phase);
+	ASSERT_RTNL();
+
+	might_sleep();
+
+	/* When net_device's are persistent, this will be fatal. */
+	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+	BUG_ON(!net);
+
+	spin_lock_init(&dev->addr_list_lock);
+	netdev_set_addr_lockdep_class(dev);
+
+	/*
+	 * -1 means "not chosen": it is replaced by the device's own ifindex
+	 * further down unless something assigned a real value in between.
+	 */
+	dev->iflink = -1;
+
+	ret = dev_get_valid_name(dev, dev->name);
+	if (ret < 0)
+		goto out;
+
+	/* Init, if this function is available */
+	if (dev->netdev_ops->ndo_init) {
+		ret = dev->netdev_ops->ndo_init(dev);
+		if (ret) {
+			/* Only negative errno values are handed back */
+			if (ret > 0)
+				ret = -EIO;
+			goto out;
+		}
+	}
+
+	dev->ifindex = dev_new_index(net);
+	if (dev->iflink == -1)
+		dev->iflink = dev->ifindex;
+
+	/* Transfer changeable features to wanted_features and enable
+	 * software offloads (GSO and GRO).
+	 */
+	dev->hw_features |= NETIF_F_SOFT_FEATURES;
+	dev->features |= NETIF_F_SOFT_FEATURES;
+	dev->wanted_features = dev->features & dev->hw_features;
+
+	/* Turn on no cache copy if HW is doing checksum */
+	dev->hw_features |= NETIF_F_NOCACHE_COPY;
+	if ((dev->features & NETIF_F_ALL_CSUM) &&
+	    !(dev->features & NETIF_F_NO_CSUM)) {
+		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
+		dev->features |= NETIF_F_NOCACHE_COPY;
+	}
+
+	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
+	 * vlan_dev_init() will do the dev->features check, so these features
+	 * are enabled only if supported by underlying device.
+	 */
+	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
+
+	/* From here on a failure must also undo ndo_init (err_uninit) */
+	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		goto err_uninit;
+
+	ret = netdev_register_kobject(dev);
+	if (ret)
+		goto err_uninit;
+	dev->reg_state = NETREG_REGISTERED;
+
+	__netdev_update_features(dev);
+
+	/*
+	 *	Default initial state at registry is that the
+	 *	device is present.
+	 */
+
+	set_bit(__LINK_STATE_PRESENT, &dev->state);
+
+	dev_init_scheduler(dev);
+	dev_hold(dev);
+	list_netdevice(dev);
+
+	/* Notify protocols, that a new device appeared. */
+	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+	ret = notifier_to_errno(ret);
+	if (ret) {
+		/*
+		 * A notifier vetoed the device: undo the registration and
+		 * return the error.  Do not fall through to the RTM_NEWLINK
+		 * below, which would announce to userspace a device that
+		 * has just been torn down again.
+		 */
+		rollback_registered(dev);
+		dev->reg_state = NETREG_UNREGISTERED;
+		goto out;
+	}
+	/*
+	 *	Prevent userspace races by waiting until the network
+	 *	device is fully setup before sending notifications.
+	 */
+	if (!dev->rtnl_link_ops ||
+	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+
+out:
+	return ret;
+
+err_uninit:
+	/* Mirror of the ndo_init call above */
+	if (dev->netdev_ops->ndo_uninit)
+		dev->netdev_ops->ndo_uninit(dev);
+	goto out;
+}
+EXPORT_SYMBOL(register_netdevice);
+
+/**
+ *	init_dummy_netdev	- init a dummy network device for NAPI
+ *	@dev: device to init
+ *
+ *	This takes a network device structure and initialize the minimum
+ *	amount of fields so it can be used to schedule NAPI polls without
+ *	registering a full blown interface. This is to be used by drivers
+ *	that need to tie several hardware interfaces to a single NAPI
+ *	poll scheduler due to HW limitations.
+ */
+int init_dummy_netdev(struct net_device *dev)
+{
+	/* Clear everything. Note we don't initialize spinlocks
+	 * as they aren't supposed to be taken by any of the
+	 * NAPI code and this dummy netdev is supposed to be
+	 * only ever used for NAPI polls
+	 */
+	memset(dev, 0, sizeof(struct net_device));
+
+	/* make sure we BUG if trying to hit standard
+	 * register/unregister code path
+	 */
+	dev->reg_state = NETREG_DUMMY;
+
+	/* NAPI wants this */
+	INIT_LIST_HEAD(&dev->napi_list);
+
+	/* a dummy interface is started by default */
+	set_bit(__LINK_STATE_PRESENT, &dev->state);
+	set_bit(__LINK_STATE_START, &dev->state);
+
+	/* Note: We don't allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' don't need to change
+	 * its refcount.
+	 */
+
+	/* Nothing above can fail, so this always reports success */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(init_dummy_netdev);
+
+
+/**
+ *	register_netdev	- register a network device
+ *	@dev: device to register
+ *
+ *	Take a completed network device structure and add it to the kernel
+ *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
+ *	chain. 0 is returned on success. A negative errno code is returned
+ *	on a failure to set up the device, or if the name is a duplicate.
+ *
+ *	This is a wrapper around register_netdevice that takes the rtnl semaphore
+ *	and expands the device name if you passed a format string to
+ *	alloc_netdev.
+ */
+int register_netdev(struct net_device *dev)
+{
+	int ret;
+
+	/* register_netdevice() asserts that the rtnl lock is held */
+	rtnl_lock();
+	ret = register_netdevice(dev);
+	rtnl_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(register_netdev);
+
+/* Sum the per-CPU reference counters of @dev over every possible CPU. */
+int netdev_refcnt_read(const struct net_device *dev)
+{
+	int cpu;
+	int total = 0;
+
+	for_each_possible_cpu(cpu)
+		total += *per_cpu_ptr(dev->pcpu_refcnt, cpu);
+
+	return total;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
+/*
+ * netdev_wait_allrefs - wait until all references are gone.
+ *
+ * This is called when unregistering network devices.
+ *
+ * Any protocol or device that holds a reference should register
+ * for netdevice notification, and cleanup and put back the
+ * reference if they receive an UNREGISTER event.
+ * We can get stuck here if buggy protocols don't correctly
+ * call dev_put.
+ */
+static void netdev_wait_allrefs(struct net_device *dev)
+{
+	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
+
+	linkwatch_forget_dev(dev);
+
+	rebroadcast_time = warning_time = jiffies;
+	refcnt = netdev_refcnt_read(dev);
+
+	/* Re-read the summed refcount every 250ms until it reaches zero */
+	while (refcnt != 0) {
+		/* At most once a second, remind reference holders to let go */
+		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
+			rtnl_lock();
+
+			/* Rebroadcast unregister notification */
+			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
+			 * should have already handled it the first time */
+
+			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+				     &dev->state)) {
+				/* We must not have linkwatch events
+				 * pending on unregister. If this
+				 * happens, we simply run the queue
+				 * unscheduled, resulting in a noop
+				 * for this device.
+				 */
+				linkwatch_run_queue();
+			}
+
+			/* Plain unlock: presumably avoids re-entering
+			 * netdev_run_todo(), which rtnl_unlock() invokes --
+			 * TODO confirm.
+			 */
+			__rtnl_unlock();
+
+			rebroadcast_time = jiffies;
+		}
+
+		msleep(250);
+
+		refcnt = netdev_refcnt_read(dev);
+
+		/* Complain loudly at most once every ten seconds */
+		if (time_after(jiffies, warning_time + 10 * HZ)) {
+			printk(KERN_EMERG "unregister_netdevice: "
+			       "waiting for %s to become free. Usage "
+			       "count = %d\n",
+			       dev->name, refcnt);
+			warning_time = jiffies;
+		}
+	}
+}
+
+/* The sequence is:
+ *
+ *	rtnl_lock();
+ *	...
+ *	register_netdevice(x1);
+ *	register_netdevice(x2);
+ *	...
+ *	unregister_netdevice(y1);
+ *	unregister_netdevice(y2);
+ *      ...
+ *	rtnl_unlock();
+ *	free_netdev(y1);
+ *	free_netdev(y2);
+ *
+ * We are invoked by rtnl_unlock().
+ * This allows us to deal with problems:
+ * 1) We can delete sysfs objects which invoke hotplug
+ *    without deadlocking with linkwatch via keventd.
+ * 2) Since we run with the RTNL semaphore not held, we can sleep
+ *    safely in order to wait for the netdev refcnt to drop to zero.
+ *
+ * We must not return until all unregister events added during
+ * the interval the lock was held have been completed.
+ */
+void netdev_run_todo(void)
+{
+	struct list_head list;
+
+	/* Snapshot list, allow later requests */
+	list_replace_init(&net_todo_list, &list);
+
+	/* Everything below runs without the rtnl lock and may sleep */
+	__rtnl_unlock();
+
+	while (!list_empty(&list)) {
+		struct net_device *dev
+			= list_first_entry(&list, struct net_device, todo_list);
+		list_del(&dev->todo_list);
+
+		/* Only devices in the UNREGISTERING state belong here;
+		 * anything else is reported and left alone.
+		 */
+		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
+			printk(KERN_ERR "network todo '%s' but state %d\n",
+			       dev->name, dev->reg_state);
+			dump_stack();
+			continue;
+		}
+
+		dev->reg_state = NETREG_UNREGISTERED;
+
+		/* Run flush_backlog for this device on every CPU */
+		on_each_cpu(flush_backlog, dev, 1);
+
+		/* Sleeps until the device's reference count reads zero */
+		netdev_wait_allrefs(dev);
+
+		/* paranoia */
+		BUG_ON(netdev_refcnt_read(dev));
+		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
+		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
+		WARN_ON(dev->dn_ptr);
+
+		/* Optional per-device destructor hook */
+		if (dev->destructor)
+			dev->destructor(dev);
+
+		/* Free network device */
+		kobject_put(&dev->dev.kobj);
+	}
+}
+
+/*
+ * Copy a struct net_device_stats into a struct rtnl_link_stats64.  Both
+ * hold the same fields in the same order; only the field type differs
+ * (unsigned long versus u64), so the conversion is done positionally.
+ */
+static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+				    const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+	/* Same field width on both sides: the layouts coincide */
+	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+	memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+	/* Narrower longs: widen one counter at a time */
+	const unsigned long *in = (const unsigned long *)netdev_stats;
+	u64 *out = (u64 *)stats64;
+	size_t nfields = sizeof(*stats64) / sizeof(u64);
+	size_t k;
+
+	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+		     sizeof(*stats64) / sizeof(u64));
+	for (k = 0; k < nfields; k++)
+		out[k] = in[k];
+#endif
+}
+
+/**
+ *	dev_get_stats	- get network device statistics
+ *	@dev: device to get statistics from
+ *	@storage: place to store stats
+ *
+ *	Get network statistics from device. Return @storage.
+ *	The device driver may provide its own method by setting
+ *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ *	otherwise the internal statistics structure is used.
+ */
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+					struct rtnl_link_stats64 *storage)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_get_stats64) {
+		/* The 64-bit hook fills @storage itself; start from zero */
+		memset(storage, 0, sizeof(*storage));
+		ops->ndo_get_stats64(dev, storage);
+	} else {
+		const struct net_device_stats *legacy;
+
+		/* Legacy hook if present, else the device's built-in stats */
+		if (ops->ndo_get_stats)
+			legacy = ops->ndo_get_stats(dev);
+		else
+			legacy = &dev->stats;
+		netdev_stats_to_stats64(storage, legacy);
+	}
+
+	/* The device-level rx_dropped counter is always added on top */
+	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+	return storage;
+}
+EXPORT_SYMBOL(dev_get_stats);
+
+/*
+ * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT enabled a
+ * missing queue is allocated, initialised with the noop qdisc and
+ * attached to the device; NULL is returned if that allocation fails.
+ * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
+ * returned unchanged.
+ */
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
+{
+	struct netdev_queue *queue = dev_ingress_queue(dev);
+
+#ifdef CONFIG_NET_CLS_ACT
+	/* Already created earlier: nothing to do */
+	if (queue)
+		return queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	/* Attach only once the queue is fully initialised */
+	rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+	return queue;
+}
+
+/**
+ *	alloc_netdev_mqs - allocate network device
+ *	@sizeof_priv:	size of private data to allocate space for
+ *	@name:		device name format string
+ *	@setup:		callback to initialize device
+ *	@txqs:		the number of TX subqueues to allocate
+ *	@rxqs:		the number of RX subqueues to allocate
+ *
+ *	Allocates a struct net_device with private data area for driver use
+ *	and performs basic initialization.  Also allocates subqueue structs
+ *	for each queue on the device.
+ *
+ *	Returns the new device, or NULL if @txqs is zero, if @rxqs is zero
+ *	(only checked with CONFIG_RPS), or if any allocation fails.
+ *	@name must be shorter than the device name buffer, otherwise this
+ *	BUGs.
+ */
+struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+		void (*setup)(struct net_device *),
+		unsigned int txqs, unsigned int rxqs)
+{
+	struct net_device *dev;
+	size_t alloc_size;
+	struct net_device *p;
+
+	BUG_ON(strlen(name) >= sizeof(dev->name));
+
+	if (txqs < 1) {
+		pr_err("alloc_netdev: Unable to allocate device "
+		       "with zero queues.\n");
+		return NULL;
+	}
+
+#ifdef CONFIG_RPS
+	if (rxqs < 1) {
+		pr_err("alloc_netdev: Unable to allocate device "
+		       "with zero RX queues.\n");
+		return NULL;
+	}
+#endif
+
+	alloc_size = sizeof(struct net_device);
+	if (sizeof_priv) {
+		/* ensure 32-byte alignment of private area */
+		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
+		alloc_size += sizeof_priv;
+	}
+	/* ensure 32-byte alignment of whole construct */
+	alloc_size += NETDEV_ALIGN - 1;
+
+	p = kzalloc(alloc_size, GFP_KERNEL);
+	if (!p) {
+		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
+		return NULL;
+	}
+
+	/* dev is the aligned view of the raw allocation p; the offset is
+	 * remembered in dev->padded so free_netdev() can recover p.
+	 */
+	dev = PTR_ALIGN(p, NETDEV_ALIGN);
+	dev->padded = (char *)dev - (char *)p;
+
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
+		goto free_p;
+
+	if (dev_addr_init(dev))
+		goto free_pcpu;
+
+	dev_mc_init(dev);
+	dev_uc_init(dev);
+
+	/* New devices start out in the initial network namespace */
+	dev_net_set(dev, &init_net);
+
+	dev->gso_max_size = GSO_MAX_SIZE;
+
+	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
+	dev->ethtool_ntuple_list.count = 0;
+	INIT_LIST_HEAD(&dev->napi_list);
+	INIT_LIST_HEAD(&dev->unreg_list);
+	INIT_LIST_HEAD(&dev->link_watch_list);
+	dev->priv_flags = IFF_XMIT_DST_RELEASE;
+	/* Caller's setup hook runs before the queue counts are stored */
+	setup(dev);
+
+	dev->num_tx_queues = txqs;
+	dev->real_num_tx_queues = txqs;
+	if (netif_alloc_netdev_queues(dev))
+		goto free_all;
+
+#ifdef CONFIG_RPS
+	dev->num_rx_queues = rxqs;
+	dev->real_num_rx_queues = rxqs;
+	if (netif_alloc_rx_queues(dev))
+		goto free_all;
+#endif
+
+	strcpy(dev->name, name);
+	dev->group = INIT_NETDEV_GROUP;
+	return dev;
+
+free_all:
+	/* Past setup(): let free_netdev() undo everything done so far */
+	free_netdev(dev);
+	return NULL;
+
+free_pcpu:
+	/* Only reached before any queue array was allocated, so _tx and
+	 * _rx are still NULL from kzalloc() and the kfree()s are no-ops.
+	 */
+	free_percpu(dev->pcpu_refcnt);
+	kfree(dev->_tx);
+#ifdef CONFIG_RPS
+	kfree(dev->_rx);
+#endif
+
+free_p:
+	/* Free the raw allocation, not the aligned dev pointer */
+	kfree(p);
+	return NULL;
+}
+EXPORT_SYMBOL(alloc_netdev_mqs);
+
+/**
+ *	free_netdev - free network device
+ *	@dev: device
+ *
+ *	This function does the last stage of destroying an allocated device
+ * 	interface. The reference to the device object is released.
+ *	If this is the last reference then it will be freed.
+ */
+void free_netdev(struct net_device *dev)
+{
+	struct napi_struct *p, *n;
+
+	release_net(dev_net(dev));
+
+	/* Queue arrays allocated in alloc_netdev_mqs() */
+	kfree(dev->_tx);
+#ifdef CONFIG_RPS
+	kfree(dev->_rx);
+#endif
+
+	/* Ingress queue, if dev_ingress_queue_create() ever made one */
+	kfree(rcu_dereference_raw(dev->ingress_queue));
+
+	/* Flush device addresses */
+	dev_addr_flush(dev);
+
+	/* Clear ethtool n-tuple list */
+	ethtool_ntuple_flush(dev);
+
+	/* _safe variant: each entry is deleted while walking the list */
+	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+		netif_napi_del(p);
+
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
+	/*  Compatibility with error handling in drivers */
+	if (dev->reg_state == NETREG_UNINITIALIZED) {
+		/* Never registered: free the raw allocation directly,
+		 * undoing the alignment offset stored in dev->padded.
+		 */
+		kfree((char *)dev - dev->padded);
+		return;
+	}
+
+	/* Any state other than UNREGISTERED at this point is a bug */
+	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
+	dev->reg_state = NETREG_RELEASED;
+
+	/* will free via device release */
+	put_device(&dev->dev);
+}
+EXPORT_SYMBOL(free_netdev);
+
+/**
+ *	synchronize_net -  Synchronize with packet receive processing
+ *
+ *	Wait for packets currently being received to be done.
+ *	Does not block later packets from starting.
+ */
+void synchronize_net(void)
+{
+	might_sleep();
+
+	/* Without the rtnl lock held, an ordinary grace period is used */
+	if (!rtnl_is_locked()) {
+		synchronize_rcu();
+		return;
+	}
+
+	/* rtnl lock held: use the expedited variant */
+	synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL(synchronize_net);
+
+/**
+ *	unregister_netdevice_queue - remove device from the kernel
+ *	@dev: device
+ *	@head: list
+ *
+ *	This function shuts down a device interface and removes it
+ *	from the kernel tables.
+ *	If head not NULL, device is queued to be unregistered later.
+ *
+ *	Callers must hold the rtnl semaphore.  You may want
+ *	unregister_netdev() instead of this.
+ */
+
+void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
+{
+	ASSERT_RTNL();
+
+	/* Batched mode: just queue the device on the caller's list */
+	if (head != NULL) {
+		list_move_tail(&dev->unreg_list, head);
+		return;
+	}
+
+	/* Immediate mode: tear down now ... */
+	rollback_registered(dev);
+	/* ... and finish processing unregister after unlock */
+	net_set_todo(dev);
+}
+EXPORT_SYMBOL(unregister_netdevice_queue);
+
+/**
+ *	unregister_netdevice_many - unregister many devices
+ *	@head: list of devices
+ */
+void unregister_netdevice_many(struct list_head *head)
+{
+	struct net_device *ndev;
+
+	/* An empty batch is a no-op */
+	if (list_empty(head))
+		return;
+
+	rollback_registered_many(head);
+
+	/* Queue every device of the batch for the post-unlock work */
+	list_for_each_entry(ndev, head, unreg_list)
+		net_set_todo(ndev);
+}
+}
+EXPORT_SYMBOL(unregister_netdevice_many);
+
+/**
+ *	unregister_netdev - remove device from the kernel
+ *	@dev: device
+ *
+ *	This function shuts down a device interface and removes it
+ *	from the kernel tables.
+ *
+ *	This is just a wrapper for unregister_netdevice that takes
+ *	the rtnl semaphore.  In general you want to use this and not
+ *	unregister_netdevice.
+ */
+void unregister_netdev(struct net_device *dev)
+{
+	rtnl_lock();
+	unregister_netdevice(dev);
+	/* The queued unregister work is finished from rtnl_unlock(),
+	 * see netdev_run_todo().
+	 */
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(unregister_netdev);
+
+/**
+ *	dev_change_net_namespace - move device to a different network namespace
+ *	@dev: device
+ *	@net: network namespace
+ *	@pat: If not NULL name pattern to try if the current device name
+ *	      is already taken in the destination network namespace.
+ *
+ *	This function shuts down a device interface and moves it
+ *	to a new network namespace. On success 0 is returned, on
+ *	a failure a negative errno code is returned.
+ *
+ *	Callers must hold the rtnl semaphore.
+ */
+
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+	int err;
+
+	ASSERT_RTNL();
+
+	/* Don't allow namespace local devices to be moved. */
+	err = -EINVAL;
+	if (dev->features & NETIF_F_NETNS_LOCAL)
+		goto out;
+
+	/* Ensure the device has been registered */
+	err = -EINVAL;
+	if (dev->reg_state != NETREG_REGISTERED)
+		goto out;
+
+	/* Get out if there is nothing to do */
+	err = 0;
+	if (net_eq(dev_net(dev), net))
+		goto out;
+
+	/* Pick the destination device name, and ensure
+	 * we can use it in the destination network namespace.
+	 */
+	err = -EEXIST;
+	if (__dev_get_by_name(net, dev->name)) {
+		/* We get here if we can't use the current device name */
+		if (!pat)
+			goto out;
+		/* Any failure to derive a name from @pat reports -EEXIST */
+		if (dev_get_valid_name(dev, pat) < 0)
+			goto out;
+	}
+
+	/*
+	 * And now a mini version of register_netdevice unregister_netdevice.
+	 * No exit path is taken from here on until the end of the function.
+	 */
+
+	/* If device is running close it first. */
+	dev_close(dev);
+
+	/* And unlink it from device chain */
+	/* NOTE(review): this -ENODEV is never returned; err is overwritten
+	 * by device_rename() below before "out" can be reached.
+	 */
+	err = -ENODEV;
+	unlist_netdevice(dev);
+
+	synchronize_net();
+
+	/* Shutdown queueing discipline. */
+	dev_shutdown(dev);
+
+	/* Notify protocols, that we are about to destroy
+	   this device. They should clean all the things.
+
+	   Note that dev->reg_state stays at NETREG_REGISTERED.
+	   This is wanted because this way 8021q and macvlan know
+	   the device is just moving and can keep their slaves up.
+	*/
+	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
+
+	/*
+	 *	Flush the unicast and multicast chains
+	 */
+	dev_uc_flush(dev);
+	dev_mc_flush(dev);
+
+	/* Actually switch the network namespace */
+	dev_net_set(dev, net);
+
+	/* If there is an ifindex conflict assign a new one */
+	if (__dev_get_by_index(net, dev->ifindex)) {
+		/* Keep iflink following ifindex if it did so before */
+		int iflink = (dev->iflink == dev->ifindex);
+		dev->ifindex = dev_new_index(net);
+		if (iflink)
+			dev->iflink = dev->ifindex;
+	}
+
+	/* Fixup kobjects */
+	/* A rename failure is only warned about, the move carries on */
+	err = device_rename(&dev->dev, dev->name);
+	WARN_ON(err);
+
+	/* Add the device back in the hashes */
+	list_netdevice(dev);
+
+	/* Notify protocols, that a new device appeared. */
+	call_netdevice_notifiers(NETDEV_REGISTER, dev);
+
+	/*
+	 *	Prevent userspace races by waiting until the network
+	 *	device is fully setup before sending notifications.
+	 */
+	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+
+	synchronize_net();
+	err = 0;
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+/*
+ * CPU hotplug notifier.  When a CPU has died (CPU_DEAD or
+ * CPU_DEAD_FROZEN), move the work still queued in its softnet_data over
+ * to the CPU running this callback.  All other actions are ignored.
+ * @ocpu carries the number of the affected CPU.  Always returns
+ * NOTIFY_OK.
+ */
+static int dev_cpu_callback(struct notifier_block *nfb,
+			    unsigned long action,
+			    void *ocpu)
+{
+	struct sk_buff **list_skb;
+	struct sk_buff *skb;
+	unsigned int cpu, oldcpu = (unsigned long)ocpu;
+	struct softnet_data *sd, *oldsd;
+
+	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+		return NOTIFY_OK;
+
+	/* The list splicing below is done with local interrupts off */
+	local_irq_disable();
+	cpu = smp_processor_id();
+	sd = &per_cpu(softnet_data, cpu);
+	oldsd = &per_cpu(softnet_data, oldcpu);
+
+	/* Find end of our completion_queue. */
+	list_skb = &sd->completion_queue;
+	while (*list_skb)
+		list_skb = &(*list_skb)->next;
+	/* Append completion queue from offline CPU. */
+	*list_skb = oldsd->completion_queue;
+	oldsd->completion_queue = NULL;
+
+	/* Append output queue from offline CPU. */
+	if (oldsd->output_queue) {
+		*sd->output_queue_tailp = oldsd->output_queue;
+		sd->output_queue_tailp = oldsd->output_queue_tailp;
+		/* Leave the offline CPU with an empty, self-tailed queue */
+		oldsd->output_queue = NULL;
+		oldsd->output_queue_tailp = &oldsd->output_queue;
+	}
+	/* Append NAPI poll list from offline CPU. */
+	if (!list_empty(&oldsd->poll_list)) {
+		list_splice_init(&oldsd->poll_list, &sd->poll_list);
+		raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	}
+
+	/* Raised unconditionally -- presumably so the queues taken over
+	 * above get processed on this CPU.
+	 */
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_enable();
+
+	/* Re-inject packets still queued on the offline CPU:
+	 * process_queue first, then input_pkt_queue.
+	 */
+	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+		netif_rx(skb);
+		input_queue_head_incr(oldsd);
+	}
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+		netif_rx(skb);
+		input_queue_head_incr(oldsd);
+	}
+
+	return NOTIFY_OK;
+}
+
+
+/**
+ *	netdev_increment_features - increment feature set by one
+ *	@all: current feature set
+ *	@one: new feature set
+ *	@mask: mask feature set
+ *
+ *	Computes a new feature set after adding a device with feature set
+ *	@one to the master device with current feature set @all.  Will not
+ *	enable anything that is off in @mask. Returns the new feature set.
+ */
+u32 netdev_increment_features(u32 all, u32 one, u32 mask)
+{
+	/* A mask that allows generic checksumming allows every checksum
+	 * flavour; VLAN_CHALLENGED is always allowed through.
+	 */
+	if (mask & NETIF_F_GEN_CSUM)
+		mask |= NETIF_F_ALL_CSUM;
+	mask |= NETIF_F_VLAN_CHALLENGED;
+
+	/* ONE_FOR_ALL and checksum bits: gained if @one has them and
+	 * @mask permits them.
+	 */
+	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+	/* ALL_FOR_ALL bits: lost as soon as @one lacks them */
+	all &= one | ~NETIF_F_ALL_FOR_ALL;
+
+	/* If device needs checksumming, downgrade to it. */
+	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
+		all &= ~NETIF_F_NO_CSUM;
+
+	/* If one device supports hw checksumming, set for all. */
+	if (all & NETIF_F_GEN_CSUM)
+		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
+
+	return all;
+}
+EXPORT_SYMBOL(netdev_increment_features);
+
+/*
+ * Allocate a table of NETDEV_HASHENTRIES empty hlist heads.
+ * Returns NULL if the allocation fails.
+ */
+static struct hlist_head *netdev_create_hash(void)
+{
+	struct hlist_head *table;
+	int bucket;
+
+	table = kmalloc(sizeof(*table) * NETDEV_HASHENTRIES, GFP_KERNEL);
+	if (table == NULL)
+		return NULL;
+
+	for (bucket = 0; bucket < NETDEV_HASHENTRIES; bucket++)
+		INIT_HLIST_HEAD(&table[bucket]);
+
+	return table;
+}
+
+/*
+ * Set up the per network namespace device state: the device list head
+ * and the two lookup hash tables (by name and by index).  Returns 0 on
+ * success or -ENOMEM, in which case nothing stays allocated.
+ */
+static int __net_init netdev_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net->dev_base_head);
+
+	net->dev_name_head = netdev_create_hash();
+	if (!net->dev_name_head)
+		return -ENOMEM;
+
+	net->dev_index_head = netdev_create_hash();
+	if (!net->dev_index_head) {
+		/* Undo the first table before reporting failure */
+		kfree(net->dev_name_head);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ *	netdev_drivername - network driver for the device
+ *	@dev: network device
+ *
+ *	Determine network driver for device.
+ */
+const char *netdev_drivername(const struct net_device *dev)
+{
+	const char *none = "";
+	const struct device *parent = dev->dev.parent;
+	const struct device_driver *drv;
+
+	/* No parent device means no driver to name */
+	if (parent == NULL)
+		return none;
+
+	drv = parent->driver;
+	if (drv == NULL || drv->name == NULL)
+		return none;
+
+	return drv->name;
+}
+
+/*
+ * Common backend of the netdev_printk() family.  Emits @vaf at @level,
+ * prefixed with the device name; goes through dev_printk() on the parent
+ * device when there is one.  A NULL @dev is tolerated and labelled as
+ * such.  Returns whatever the underlying print call returned.
+ */
+static int __netdev_printk(const char *level, const struct net_device *dev,
+			   struct va_format *vaf)
+{
+	if (!dev)
+		return printk("%s(NULL net_device): %pV", level, vaf);
+
+	if (dev->dev.parent)
+		return dev_printk(level, dev->dev.parent, "%s: %pV",
+				  netdev_name(dev), vaf);
+
+	return printk("%s%s: %pV", level, netdev_name(dev), vaf);
+}
+
+/*
+ * printk() variant for network devices: prints @format and its arguments
+ * at @level through __netdev_printk(), which adds the device prefix.
+ * Returns the value of the underlying print call.
+ */
+int netdev_printk(const char *level, const struct net_device *dev,
+		  const char *format, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int r;
+
+	va_start(args, format);
+
+	/* Bundle format and arguments so they can be printed via %pV */
+	vaf.fmt = format;
+	vaf.va = &args;
+
+	r = __netdev_printk(level, dev, &vaf);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(netdev_printk);
+
+/*
+ * Generate and export one fixed-level wrapper around __netdev_printk():
+ * func(dev, fmt, ...) prints at @level.  The body is the same as
+ * netdev_printk() with the level baked in.  Instantiated below once per
+ * kernel log level from KERN_EMERG to KERN_INFO.
+ */
+#define define_netdev_printk_level(func, level)			\
+int func(const struct net_device *dev, const char *fmt, ...)	\
+{								\
+	int r;							\
+	struct va_format vaf;					\
+	va_list args;						\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	r = __netdev_printk(level, dev, &vaf);			\
+	va_end(args);						\
+								\
+	return r;						\
+}								\
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
+/*
+ * Per network namespace teardown: free the name and index hash tables
+ * that netdev_init() allocated.
+ */
+static void __net_exit netdev_exit(struct net *net)
+{
+	kfree(net->dev_name_head);
+	kfree(net->dev_index_head);
+}
+
+/*
+ * Per-namespace hooks for the device list and hash tables; registered
+ * from net_dev_init() with register_pernet_subsys().
+ */
+static struct pernet_operations __net_initdata netdev_net_ops = {
+	.init = netdev_init,
+	.exit = netdev_exit,
+};
+
+/*
+ * Namespace exit hook: move every device that is allowed to migrate out
+ * of the dying namespace @net and back into init_net.  A failed move is
+ * treated as fatal (BUG).
+ */
+static void __net_exit default_device_exit(struct net *net)
+{
+	struct net_device *dev, *aux;
+	/*
+	 * Push all migratable network devices back to the
+	 * initial network namespace
+	 */
+	rtnl_lock();
+	/* _safe walk: moving a device takes it off this namespace's list */
+	for_each_netdev_safe(net, dev, aux) {
+		int err;
+		char fb_name[IFNAMSIZ];
+
+		/* Ignore unmoveable devices (i.e. loopback) */
+		if (dev->features & NETIF_F_NETNS_LOCAL)
+			continue;
+
+		/* Leave virtual devices for the generic cleanup */
+		if (dev->rtnl_link_ops)
+			continue;
+
+		/* Push remaining network devices to init_net */
+		/* "dev<ifindex>" is the fallback name pattern, used only if
+		 * the current name is already taken in init_net.
+		 */
+		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+		err = dev_change_net_namespace(dev, &init_net, fb_name);
+		if (err) {
+			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
+				__func__, dev->name, err);
+			BUG();
+		}
+	}
+	rtnl_unlock();
+}
+
+/*
+ * Batched namespace exit hook: unregister every device still present in
+ * any of the namespaces on @net_list, in a single batch.
+ */
+static void __net_exit default_device_exit_batch(struct list_head *net_list)
+{
+	/* At exit all network devices must be removed from a network
+	 * namespace.  Do this in the reverse order of registration.
+	 * Do this across as many network namespaces as possible to
+	 * improve batching efficiency.
+	 */
+	struct net_device *dev;
+	struct net *net;
+	LIST_HEAD(dev_kill_list);
+
+	rtnl_lock();
+	list_for_each_entry(net, net_list, exit_list) {
+		for_each_netdev_reverse(net, dev) {
+			/* Devices with rtnl link ops are removed through
+			 * their own dellink hook; both paths only queue the
+			 * device on dev_kill_list.
+			 */
+			if (dev->rtnl_link_ops)
+				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
+			else
+				unregister_netdevice_queue(dev, &dev_kill_list);
+		}
+	}
+	/* Tear down everything that was queued, in one go */
+	unregister_netdevice_many(&dev_kill_list);
+	/* Unhook the on-stack list head before it goes out of scope */
+	list_del(&dev_kill_list);
+	rtnl_unlock();
+}
+
+/*
+ * Exit-only per-namespace hooks that evacuate or unregister the devices
+ * of a dying namespace; registered from net_dev_init() with
+ * register_pernet_device(), after the loopback device ops.
+ */
+static struct pernet_operations __net_initdata default_device_ops = {
+	.exit = default_device_exit,
+	.exit_batch = default_device_exit_batch,
+};
+
+/*
+ *	Initialize the DEV module. At boot time this walks the device list and
+ *	unhooks any devices that fail to initialise (normally hardware not
+ *	present) and leaves us with a valid list of present and active devices.
+ *
+ */
+
+/*
+ *       This is called single threaded during boot, so no need
+ *       to take the rtnl semaphore.
+ *
+ *       Returns 0 on success.  Every failing step reports -ENOMEM,
+ *       whatever error the step itself returned, and steps that already
+ *       succeeded are not undone.
+ */
+static int __init net_dev_init(void)
+{
+	int i, rc = -ENOMEM;
+
+	BUG_ON(!dev_boot_phase);
+
+	if (dev_proc_init())
+		goto out;
+
+	if (netdev_kobject_init())
+		goto out;
+
+	/* Packet type handler lists start out empty */
+	INIT_LIST_HEAD(&ptype_all);
+	for (i = 0; i < PTYPE_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&ptype_base[i]);
+
+	if (register_pernet_subsys(&netdev_net_ops))
+		goto out;
+
+	/*
+	 *	Initialise the packet receive queues.
+	 */
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+
+		memset(sd, 0, sizeof(*sd));
+		skb_queue_head_init(&sd->input_pkt_queue);
+		skb_queue_head_init(&sd->process_queue);
+		sd->completion_queue = NULL;
+		INIT_LIST_HEAD(&sd->poll_list);
+		/* Empty output queue: the tail pointer refers to the head */
+		sd->output_queue = NULL;
+		sd->output_queue_tailp = &sd->output_queue;
+#ifdef CONFIG_RPS
+		sd->csd.func = rps_trigger_softirq;
+		sd->csd.info = sd;
+		sd->csd.flags = 0;
+		sd->cpu = i;
+#endif
+
+		/* Per-CPU backlog NAPI instance, polled by process_backlog */
+		sd->backlog.poll = process_backlog;
+		sd->backlog.weight = weight_p;
+		sd->backlog.gro_list = NULL;
+		sd->backlog.gro_count = 0;
+	}
+
+	/* From here on register_netdevice() no longer BUGs on boot phase */
+	dev_boot_phase = 0;
+
+	/* The loopback device is special if any other network devices
+	 * is present in a network namespace the loopback device must
+	 * be present. Since we now dynamically allocate and free the
+	 * loopback device ensure this invariant is maintained by
+	 * keeping the loopback device as the first device on the
+	 * list of network devices.  Ensuring the loopback devices
+	 * is the first device that appears and the last network device
+	 * that disappears.
+	 */
+	if (register_pernet_device(&loopback_net_ops))
+		goto out;
+
+	if (register_pernet_device(&default_device_ops))
+		goto out;
+
+	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
+	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
+
+	/* dev_cpu_callback takes over the queues of CPUs that go offline */
+	hotcpu_notifier(dev_cpu_callback, 0);
+	dst_init();
+	dev_mcast_init();
+	rc = 0;
+out:
+	return rc;
+}
+
+subsys_initcall(net_dev_init);
+
+/*
+ * Fill hashrnd with random bytes.  Hooked up below as a
+ * late_initcall_sync; always returns 0.
+ */
+static int __init initialize_hashrnd(void)
+{
+	get_random_bytes(&hashrnd, sizeof(hashrnd));
+	return 0;
+}
+
+late_initcall_sync(initialize_hashrnd);
+
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
new file mode 100644
index 00000000..e2e66939
--- /dev/null
+++ b/net/core/dev_addr_lists.c
@@ -0,0 +1,733 @@
+/*
+ * net/core/dev_addr_lists.c - Functions for handling net device lists
+ * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
+ *
+ * This file contains functions for working with unicast, multicast and device
+ * addresses lists.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+
+/*
+ * General list handling functions
+ */
+
+/*
+ * Add @addr (@addr_len bytes) of type @addr_type to @list.
+ *
+ * An existing entry with the same address bytes and type is reused: its
+ * reference count is incremented.  With @global set, an entry that is
+ * already flagged global_use is left untouched; otherwise the flag is set
+ * before the reference is taken.  If no entry matches, a new one is
+ * allocated with GFP_ATOMIC and appended to the list.
+ *
+ * Returns 0 on success, -EINVAL if @addr_len exceeds MAX_ADDR_LEN and
+ * -ENOMEM if the allocation fails.
+ */
+static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
+			    unsigned char *addr, int addr_len,
+			    unsigned char addr_type, bool global)
+{
+	struct netdev_hw_addr *entry;
+	int size = sizeof(*entry);
+
+	if (addr_len > MAX_ADDR_LEN)
+		return -EINVAL;
+
+	list_for_each_entry(entry, &list->list, list) {
+		if (memcmp(entry->addr, addr, addr_len) ||
+		    entry->type != addr_type)
+			continue;
+
+		/* a repeated global add of the same address changes nothing */
+		if (global && entry->global_use)
+			return 0;
+		if (global)
+			entry->global_use = true;
+		entry->refcount++;
+		return 0;
+	}
+
+	/* never allocate less than one cache line per entry */
+	if (size < L1_CACHE_BYTES)
+		size = L1_CACHE_BYTES;
+	entry = kmalloc(size, GFP_ATOMIC);
+	if (!entry)
+		return -ENOMEM;
+
+	memcpy(entry->addr, addr, addr_len);
+	entry->type = addr_type;
+	entry->refcount = 1;
+	entry->global_use = global;
+	entry->synced = false;
+	list_add_tail_rcu(&entry->list, &list->list);
+	list->count++;
+
+	return 0;
+}
+
+/* Non-global variant of __hw_addr_add_ex(): same arguments, global = false. */
+static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	return __hw_addr_add_ex(list, addr, addr_len, addr_type, false);
+}
+
+/*
+ * Drop one reference to the first entry of @list whose address matches
+ * @addr over @addr_len bytes and whose type equals @addr_type (an
+ * @addr_type of 0 matches any type).
+ *
+ * With @global set, the entry must currently be flagged global_use; the
+ * flag is cleared before the reference is dropped.  When the reference
+ * count reaches zero the entry is unlinked and freed after an RCU grace
+ * period.
+ *
+ * Returns 0 on success, -ENOENT if nothing matched or if @global was
+ * requested for an entry that is not flagged global_use.
+ */
+static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
+			    unsigned char *addr, int addr_len,
+			    unsigned char addr_type, bool global)
+{
+	struct netdev_hw_addr *entry;
+
+	list_for_each_entry(entry, &list->list, list) {
+		if (memcmp(entry->addr, addr, addr_len))
+			continue;
+		if (entry->type != addr_type && addr_type)
+			continue;
+
+		if (global) {
+			if (!entry->global_use)
+				return -ENOENT;
+			entry->global_use = false;
+		}
+
+		if (--entry->refcount == 0) {
+			list_del_rcu(&entry->list);
+			kfree_rcu(entry, rcu_head);
+			list->count--;
+		}
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/* Non-global variant of __hw_addr_del_ex(): same arguments, global = false. */
+static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
+			 int addr_len, unsigned char addr_type)
+{
+	return __hw_addr_del_ex(list, addr, addr_len, addr_type, false);
+}
+
+/*
+ * Add every address of @from_list to @to_list.
+ *
+ * Each address is added with type @addr_type, or with the type of the
+ * source entry when @addr_type is 0.  If one addition fails, the entries
+ * added so far by this call are deleted again and the error is returned;
+ * otherwise 0 is returned.
+ */
+int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
+			   struct netdev_hw_addr_list *from_list,
+			   int addr_len, unsigned char addr_type)
+{
+	struct netdev_hw_addr *src, *undo;
+	int err;
+
+	list_for_each_entry(src, &from_list->list, list) {
+		unsigned char type = addr_type ? addr_type : src->type;
+
+		err = __hw_addr_add(to_list, src->addr, addr_len, type);
+		if (err)
+			goto rollback;
+	}
+	return 0;
+
+rollback:
+	/* walk the source again, stopping at the entry that failed */
+	list_for_each_entry(undo, &from_list->list, list) {
+		unsigned char type;
+
+		if (undo == src)
+			break;
+		type = addr_type ? addr_type : undo->type;
+		__hw_addr_del(to_list, undo->addr, addr_len, type);
+	}
+	return err;
+}
+EXPORT_SYMBOL(__hw_addr_add_multiple);
+
+/*
+ * For every address in @from_list, drop one reference to it in @to_list.
+ * The type used for matching is @addr_type, or the source entry's own type
+ * when @addr_type is 0.  Addresses missing from @to_list are ignored.
+ */
+void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
+			    struct netdev_hw_addr_list *from_list,
+			    int addr_len, unsigned char addr_type)
+{
+	struct netdev_hw_addr *src;
+
+	list_for_each_entry(src, &from_list->list, list) {
+		unsigned char type = addr_type ? addr_type : src->type;
+
+		__hw_addr_del(to_list, src->addr, addr_len, type);
+	}
+}
+EXPORT_SYMBOL(__hw_addr_del_multiple);
+
+/*
+ * Bring @to_list up to date with @from_list.
+ *
+ * Entries of @from_list that are not yet marked synced are added to
+ * @to_list; on success they are marked synced and gain one extra reference
+ * in @from_list.  Entries that are already synced and whose reference count
+ * is down to 1 are removed from both lists.
+ *
+ * Stops at the first failed addition and returns its error; entries
+ * handled before that point are not rolled back.  Returns 0 otherwise.
+ */
+int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
+		   struct netdev_hw_addr_list *from_list,
+		   int addr_len)
+{
+	struct netdev_hw_addr *cur, *next;
+	int err = 0;
+
+	list_for_each_entry_safe(cur, next, &from_list->list, list) {
+		if (cur->synced) {
+			if (cur->refcount != 1)
+				continue;
+			__hw_addr_del(to_list, cur->addr, addr_len, cur->type);
+			__hw_addr_del(from_list, cur->addr, addr_len,
+				      cur->type);
+			continue;
+		}
+
+		err = __hw_addr_add(to_list, cur->addr, addr_len, cur->type);
+		if (err)
+			break;
+		cur->synced = true;
+		cur->refcount++;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(__hw_addr_sync);
+
+/*
+ * Undo __hw_addr_sync(): for every entry of @from_list that is marked
+ * synced, drop one reference to it in @to_list, clear the synced mark and
+ * drop one reference to it in @from_list.
+ */
+void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
+		      struct netdev_hw_addr_list *from_list,
+		      int addr_len)
+{
+	struct netdev_hw_addr *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &from_list->list, list) {
+		if (!cur->synced)
+			continue;
+
+		__hw_addr_del(to_list, cur->addr, addr_len, cur->type);
+		cur->synced = false;
+		__hw_addr_del(from_list, cur->addr, addr_len, cur->type);
+	}
+}
+EXPORT_SYMBOL(__hw_addr_unsync);
+
+/*
+ * Remove every entry from @list regardless of its reference count.  Each
+ * entry is unlinked and freed after an RCU grace period; the list count is
+ * then reset to 0.
+ */
+void __hw_addr_flush(struct netdev_hw_addr_list *list)
+{
+	struct netdev_hw_addr *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &list->list, list) {
+		list_del_rcu(&cur->list);
+		kfree_rcu(cur, rcu_head);
+	}
+
+	list->count = 0;
+}
+EXPORT_SYMBOL(__hw_addr_flush);
+
+/* Set @list up as an empty address list: empty list head, count of 0. */
+void __hw_addr_init(struct netdev_hw_addr_list *list)
+{
+	INIT_LIST_HEAD(&list->list);
+	list->count = 0;
+}
+EXPORT_SYMBOL(__hw_addr_init);
+
+/*
+ * Device addresses handling functions
+ */
+
+/**
+ *	dev_addr_flush - Flush device address list
+ *	@dev: device
+ *
+ *	Flush device address list and reset ->dev_addr to NULL, since the
+ *	entry it pointed into is freed by the flush.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+void dev_addr_flush(struct net_device *dev)
+{
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_flush(&dev->dev_addrs);
+	dev->dev_addr = NULL;
+}
+EXPORT_SYMBOL(dev_addr_flush);
+
+/**
+ *	dev_addr_init - Init device address list
+ *	@dev: device
+ *
+ *	Initialise the device address list and populate it with a single
+ *	all-zero address of MAX_ADDR_LEN bytes and type NETDEV_HW_ADDR_T_LAN.
+ *	On success ->dev_addr is pointed at the address bytes of that entry.
+ *
+ *	Returns 0 on success or the error from adding the entry.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_init(struct net_device *dev)
+{
+	unsigned char zero_addr[MAX_ADDR_LEN];
+	struct netdev_hw_addr *first;
+	int rc;
+
+	/* rtnl_mutex must be held here */
+
+	__hw_addr_init(&dev->dev_addrs);
+	memset(zero_addr, 0, sizeof(zero_addr));
+	rc = __hw_addr_add(&dev->dev_addrs, zero_addr, sizeof(zero_addr),
+			   NETDEV_HW_ADDR_T_LAN);
+	if (rc)
+		return rc;
+
+	/* the entry just created is the only, hence the first, list member */
+	first = list_first_entry(&dev->dev_addrs.list,
+				 struct netdev_hw_addr, list);
+	dev->dev_addr = first->addr;
+
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_init);
+
+/**
+ *	dev_addr_add - Add a device address
+ *	@dev: device
+ *	@addr: address to add
+ *	@addr_type: address type
+ *
+ *	Add @addr to the device address list, or take another reference to
+ *	it if it is already present.  On success the NETDEV_CHANGEADDR
+ *	notifier chain is called.  Returns 0 or a negative error.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int rc;
+
+	ASSERT_RTNL();
+
+	rc = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
+	if (rc)
+		return rc;
+
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ *	dev_addr_del - Release a device address.
+ *	@dev: device
+ *	@addr: address to delete
+ *	@addr_type: address type
+ *
+ *	Release reference to a device address and remove it from the device
+ *	if the reference count drops to zero.  On success the
+ *	NETDEV_CHANGEADDR notifier chain is called.
+ *
+ *	Returns 0 on success, or -ENOENT if the address is not on the list
+ *	or if the request would remove the entry that ->dev_addr points to.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+		 unsigned char addr_type)
+{
+	int err;
+	struct netdev_hw_addr *ha;
+
+	ASSERT_RTNL();
+
+	/*
+	 * We can not remove the first address from the list because
+	 * dev->dev_addr points to that.  Only refuse when the caller is
+	 * really asking for that entry: the address bytes must match and
+	 * the type must match the way __hw_addr_del() matches it (a type
+	 * of 0 matches anything).  Requests for any other address must go
+	 * through even while the first entry holds a single reference.
+	 * The pointer comparison stays first so that nothing is read
+	 * through @ha unless it is the entry backing dev->dev_addr.
+	 */
+	ha = list_first_entry(&dev->dev_addrs.list,
+			      struct netdev_hw_addr, list);
+	if (ha->addr == dev->dev_addr && ha->refcount == 1 &&
+	    !memcmp(ha->addr, addr, dev->addr_len) &&
+	    (ha->type == addr_type || !addr_type))
+		return -ENOENT;
+
+	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
+			    addr_type);
+	if (!err)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ *	dev_addr_add_multiple - Add device addresses from another device
+ *	@to_dev: device to which addresses will be added
+ *	@from_dev: device from which addresses will be added
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Add the device addresses of one device to another.  Both devices
+ *	must use the same address length, otherwise -EINVAL is returned.
+ *	On success the NETDEV_CHANGEADDR notifier chain is called for
+ *	@to_dev.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	int rc;
+
+	ASSERT_RTNL();
+
+	if (to_dev->addr_len != from_dev->addr_len)
+		return -EINVAL;
+
+	rc = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
+				    to_dev->addr_len, addr_type);
+	if (rc)
+		return rc;
+
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ *	dev_addr_del_multiple - Delete device addresses by another device
+ *	@to_dev: device where the addresses will be deleted
+ *	@from_dev: device supplying the addresses to be deleted
+ *	@addr_type: address type - 0 means type will be used from from_dev
+ *
+ *	Drop one reference in @to_dev for every address listed in
+ *	@from_dev, then call the NETDEV_CHANGEADDR notifier chain for
+ *	@to_dev.  Returns -EINVAL if the address lengths of the two devices
+ *	differ, 0 otherwise.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+			  struct net_device *from_dev,
+			  unsigned char addr_type)
+{
+	ASSERT_RTNL();
+
+	if (to_dev->addr_len != from_dev->addr_len)
+		return -EINVAL;
+
+	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
+			       to_dev->addr_len, addr_type);
+	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+
+	return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/*
+ * Unicast list handling functions
+ */
+
+/**
+ *	dev_uc_add - Add a secondary unicast address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a secondary unicast address to the device, or take another
+ *	reference to it if it is already listed.  The receive mode is
+ *	refreshed on success.  Runs under the device's address lock.
+ */
+int dev_uc_add(struct net_device *dev, unsigned char *addr)
+{
+	int rc;
+
+	netif_addr_lock_bh(dev);
+	rc = __hw_addr_add(&dev->uc, addr, dev->addr_len,
+			   NETDEV_HW_ADDR_T_UNICAST);
+	if (rc == 0)
+		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+
+	return rc;
+}
+EXPORT_SYMBOL(dev_uc_add);
+
+/**
+ *	dev_uc_del - Release secondary unicast address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Drop one reference to a secondary unicast address; the address is
+ *	removed from the device once no references remain.  The receive
+ *	mode is refreshed on success.  Runs under the device's address lock.
+ */
+int dev_uc_del(struct net_device *dev, unsigned char *addr)
+{
+	int rc;
+
+	netif_addr_lock_bh(dev);
+	rc = __hw_addr_del(&dev->uc, addr, dev->addr_len,
+			   NETDEV_HW_ADDR_T_UNICAST);
+	if (rc == 0)
+		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+
+	return rc;
+}
+EXPORT_SYMBOL(dev_uc_del);
+
+/**
+ *	dev_uc_sync - Synchronize device's unicast list to another device
+ *	@to: destination device
+ *	@from: source device
+ *
+ *	Add newly added addresses to the destination device and release
+ *	addresses that have no users left.  Only @to is locked here, so
+ *	the caller must already hold the lock protecting @from's list.
+ *	NOTE(review): this comment used to name netif_tx_lock_bh, but the
+ *	lists in this file are guarded by netif_addr_lock*(); presumably
+ *	the address lock is what is meant - confirm against the callers.
+ *
+ *	Returns -EINVAL if the two devices use different address lengths,
+ *	otherwise the result of the list synchronisation.
+ *
+ *	This function is intended to be called from the dev->set_rx_mode
+ *	function of layered software devices.
+ */
+int dev_uc_sync(struct net_device *to, struct net_device *from)
+{
+	int rc;
+
+	if (from->addr_len != to->addr_len)
+		return -EINVAL;
+
+	netif_addr_lock_bh(to);
+	rc = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
+	if (rc == 0)
+		__dev_set_rx_mode(to);
+	netif_addr_unlock_bh(to);
+
+	return rc;
+}
+EXPORT_SYMBOL(dev_uc_sync);
+
+/**
+ *	dev_uc_unsync - Remove synchronized addresses from the destination device
+ *	@to: destination device
+ *	@from: source device
+ *
+ *	Remove all addresses that were added to the destination device by
+ *	dev_uc_sync() and refresh the destination's receive mode.  Nothing
+ *	is done if the two devices use different address lengths.  The
+ *	address locks are taken here: @from first, then @to.
+ *
+ *	This function is intended to be called from the dev->stop function
+ *	of layered software devices.
+ */
+void dev_uc_unsync(struct net_device *to, struct net_device *from)
+{
+	if (to->addr_len == from->addr_len) {
+		netif_addr_lock_bh(from);
+		netif_addr_lock(to);
+
+		__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
+		__dev_set_rx_mode(to);
+
+		netif_addr_unlock(to);
+		netif_addr_unlock_bh(from);
+	}
+}
+EXPORT_SYMBOL(dev_uc_unsync);
+
+/**
+ *	dev_uc_flush - Flush unicast addresses
+ *	@dev: device
+ *
+ *	Remove every entry from the device's secondary unicast list while
+ *	holding the device's address lock.
+ */
+void dev_uc_flush(struct net_device *dev)
+{
+	netif_addr_lock_bh(dev);
+	__hw_addr_flush(&dev->uc);
+	netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_uc_flush);
+
+/**
+ *	dev_uc_init - Init unicast address list
+ *	@dev: device
+ *
+ *	Init unicast address list as an empty list.
+ */
+void dev_uc_init(struct net_device *dev)
+{
+	__hw_addr_init(&dev->uc);
+}
+EXPORT_SYMBOL(dev_uc_init);
+
+/*
+ * Multicast list handling functions
+ */
+
+/*
+ * Common body of dev_mc_add() and dev_mc_add_global(): add @addr to the
+ * multicast list of @dev under the address lock, passing @global through to
+ * __hw_addr_add_ex(), and refresh the receive mode on success.
+ */
+static int __dev_mc_add(struct net_device *dev, unsigned char *addr,
+			bool global)
+{
+	int rc;
+
+	netif_addr_lock_bh(dev);
+	rc = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+			      NETDEV_HW_ADDR_T_MULTICAST, global);
+	if (rc == 0)
+		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+
+	return rc;
+}
+/**
+ *	dev_mc_add - Add a multicast address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a multicast address to the device or increase
+ *	the reference count if it already exists.
+ *	Returns 0 on success or a negative error.
+ */
+int dev_mc_add(struct net_device *dev, unsigned char *addr)
+{
+	return __dev_mc_add(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_add);
+
+/**
+ *	dev_mc_add_global - Add a global multicast address
+ *	@dev: device
+ *	@addr: address to add
+ *
+ *	Add a global multicast address to the device.  If the address is
+ *	already listed and already marked as globally used, nothing changes
+ *	and 0 is returned; otherwise it is marked and a reference is taken.
+ */
+int dev_mc_add_global(struct net_device *dev, unsigned char *addr)
+{
+	return __dev_mc_add(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_add_global);
+
+/*
+ * Common body of dev_mc_del() and dev_mc_del_global(): release @addr from
+ * the multicast list of @dev under the address lock, passing @global
+ * through to __hw_addr_del_ex(), and refresh the receive mode on success.
+ */
+static int __dev_mc_del(struct net_device *dev, unsigned char *addr,
+			bool global)
+{
+	int rc;
+
+	netif_addr_lock_bh(dev);
+	rc = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
+			      NETDEV_HW_ADDR_T_MULTICAST, global);
+	if (rc == 0)
+		__dev_set_rx_mode(dev);
+	netif_addr_unlock_bh(dev);
+
+	return rc;
+}
+
+/**
+ *	dev_mc_del - Delete a multicast address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Release reference to a multicast address and remove it
+ *	from the device if the reference count drops to zero.
+ *	Returns 0 on success or a negative error.
+ */
+int dev_mc_del(struct net_device *dev, unsigned char *addr)
+{
+	return __dev_mc_del(dev, addr, false);
+}
+EXPORT_SYMBOL(dev_mc_del);
+
+/**
+ *	dev_mc_del_global - Delete a global multicast address.
+ *	@dev: device
+ *	@addr: address to delete
+ *
+ *	Clear the global-use mark of a multicast address, release the
+ *	reference it held and remove the address from the device if the
+ *	reference count drops to zero.  Fails if the address is not
+ *	currently marked as globally used.
+ */
+int dev_mc_del_global(struct net_device *dev, unsigned char *addr)
+{
+	return __dev_mc_del(dev, addr, true);
+}
+EXPORT_SYMBOL(dev_mc_del_global);
+
+/**
+ *	dev_mc_sync - Synchronize device's multicast list to another device
+ *	@to: destination device
+ *	@from: source device
+ *
+ *	Add newly added addresses to the destination device and release
+ *	addresses that have no users left.  Only @to is locked here, so
+ *	the caller must already hold the lock protecting @from's list.
+ *	NOTE(review): this comment used to name netif_tx_lock_bh, but the
+ *	lists in this file are guarded by netif_addr_lock*(); presumably
+ *	the address lock is what is meant - confirm against the callers.
+ *
+ *	Returns -EINVAL if the two devices use different address lengths,
+ *	otherwise the result of the list synchronisation.
+ *
+ *	This function is intended to be called from the dev->set_multicast_list
+ *	or dev->set_rx_mode function of layered software devices.
+ */
+int dev_mc_sync(struct net_device *to, struct net_device *from)
+{
+	int rc;
+
+	if (from->addr_len != to->addr_len)
+		return -EINVAL;
+
+	netif_addr_lock_bh(to);
+	rc = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
+	if (rc == 0)
+		__dev_set_rx_mode(to);
+	netif_addr_unlock_bh(to);
+
+	return rc;
+}
+EXPORT_SYMBOL(dev_mc_sync);
+
+/**
+ *	dev_mc_unsync - Remove synchronized addresses from the destination device
+ *	@to: destination device
+ *	@from: source device
+ *
+ *	Remove all addresses that were added to the destination device by
+ *	dev_mc_sync() and refresh the destination's receive mode.  Nothing
+ *	is done if the two devices use different address lengths.  The
+ *	address locks are taken here: @from first, then @to.
+ *
+ *	This function is intended to be called from the dev->stop function
+ *	of layered software devices.
+ */
+void dev_mc_unsync(struct net_device *to, struct net_device *from)
+{
+	if (to->addr_len == from->addr_len) {
+		netif_addr_lock_bh(from);
+		netif_addr_lock(to);
+
+		__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
+		__dev_set_rx_mode(to);
+
+		netif_addr_unlock(to);
+		netif_addr_unlock_bh(from);
+	}
+}
+EXPORT_SYMBOL(dev_mc_unsync);
+
+/**
+ *	dev_mc_flush - Flush multicast addresses
+ *	@dev: device
+ *
+ *	Remove every entry from the device's multicast list while holding
+ *	the device's address lock.
+ */
+void dev_mc_flush(struct net_device *dev)
+{
+	netif_addr_lock_bh(dev);
+	__hw_addr_flush(&dev->mc);
+	netif_addr_unlock_bh(dev);
+}
+EXPORT_SYMBOL(dev_mc_flush);
+
+/**
+ *	dev_mc_init - Init multicast address list
+ *	@dev: device
+ *
+ *	Init multicast address list as an empty list.
+ */
+void dev_mc_init(struct net_device *dev)
+{
+	__hw_addr_init(&dev->mc);
+}
+EXPORT_SYMBOL(dev_mc_init);
+
+#ifdef CONFIG_PROC_FS
+#include <linux/seq_file.h>
+
+/*
+ * seq_file ->show callback: print one line per multicast address of the
+ * device passed in @v.  Each line holds the interface index, the interface
+ * name, the reference count, the global_use flag and the address as hex
+ * bytes.  Nothing is printed for the SEQ_START_TOKEN record.  The address
+ * lock of the device is held while its list is walked.
+ */
+static int dev_mc_seq_show(struct seq_file *seq, void *v)
+{
+	struct net_device *dev = v;
+	struct netdev_hw_addr *ha;
+	int pos;
+
+	if (v == SEQ_START_TOKEN)
+		return 0;
+
+	netif_addr_lock_bh(dev);
+	netdev_for_each_mc_addr(ha, dev) {
+		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
+			   dev->name, ha->refcount, ha->global_use);
+
+		for (pos = 0; pos < dev->addr_len; pos++)
+			seq_printf(seq, "%02x", ha->addr[pos]);
+
+		seq_putc(seq, '\n');
+	}
+	netif_addr_unlock_bh(dev);
+
+	return 0;
+}
+
+/*
+ * seq_file iterator for the multicast listing: the start/next/stop
+ * callbacks are the dev_seq_* helpers defined outside this file, only
+ * ->show is specific to this listing.
+ */
+static const struct seq_operations dev_mc_seq_ops = {
+	.start = dev_seq_start,
+	.next  = dev_seq_next,
+	.stop  = dev_seq_stop,
+	.show  = dev_mc_seq_show,
+};
+
+/*
+ * ->open handler: set up a namespace-aware seq_file using dev_mc_seq_ops
+ * with a struct seq_net_private as private data.
+ */
+static int dev_mc_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &dev_mc_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+/* File operations of the "dev_mcast" proc entry created in dev_mc_net_init(). */
+static const struct file_operations dev_mc_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = dev_mc_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+#endif
+
+/*
+ * Per-namespace init: create the "dev_mcast" proc entry for @net.
+ * Returns 0 on success, -ENOMEM if the entry could not be created.
+ */
+static int __net_init dev_mc_net_init(struct net *net)
+{
+	if (proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops))
+		return 0;
+
+	return -ENOMEM;
+}
+
+/* Per-namespace exit: remove the "dev_mcast" proc entry of @net. */
+static void __net_exit dev_mc_net_exit(struct net *net)
+{
+	proc_net_remove(net, "dev_mcast");
+}
+
+/* Per-namespace hooks registered by dev_mcast_init(). */
+static struct pernet_operations __net_initdata dev_mc_net_ops = {
+	.init = dev_mc_net_init,
+	.exit = dev_mc_net_exit,
+};
+
+/*
+ * Register the per-namespace operations that create and remove the
+ * "dev_mcast" proc entry.
+ * NOTE(review): the return value of register_pernet_subsys() is ignored
+ * here, so a registration failure goes unnoticed - confirm that is intended.
+ */
+void __init dev_mcast_init(void)
+{
+	register_pernet_subsys(&dev_mc_net_ops);
+}
+
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 00000000..7f36b38e
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,385 @@
+/*
+ * Monitoring code for network dropped packet alerts
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <net/genetlink.h>
+#include <net/netevent.h>
+
+#include <trace/events/skb.h>
+#include <trace/events/napi.h>
+
+#include <asm/unaligned.h>
+
+#define TRACE_ON 1
+#define TRACE_OFF 0
+
+static void send_dm_alert(struct work_struct *unused);
+
+
+/*
+ * Global tracing state: trace_state records whether the drop tracepoints
+ * are currently hooked (TRACE_ON) or not (TRACE_OFF).  trace_state_lock is
+ * taken around changes to trace_state and around updates of hw_stats_list.
+ */
+static int trace_state = TRACE_OFF;
+static DEFINE_SPINLOCK(trace_state_lock);
+
+/* Per-CPU state of the drop monitor. */
+struct per_cpu_dm_data {
+	/* work item that runs send_dm_alert() */
+	struct work_struct dm_alert_work;
+	/* alert message currently being filled in by trace_drop_common() */
+	struct sk_buff *skb;
+	/* hits that may still be recorded in @skb; 0 means discard */
+	atomic_t dm_hit_count;
+	/* delays scheduling of @dm_alert_work, see sched_send_work() */
+	struct timer_list send_timer;
+};
+
+/* Per-device bookkeeping used by trace_napi_poll_hit(). */
+struct dm_hw_stat_delta {
+	/* monitored device; set to NULL once the device is unregistered */
+	struct net_device *dev;
+	/* jiffies value stored when the entry was created or last reported */
+	unsigned long last_rx;
+	/* link in hw_stats_list */
+	struct list_head list;
+	/* used by kfree_rcu() when the entry is released */
+	struct rcu_head rcu;
+	/* stats.rx_dropped of @dev at the time of the last report */
+	unsigned long last_drop_val;
+};
+
+/* Generic netlink family "NET_DM" used for commands and alert messages. */
+static struct genl_family net_drop_monitor_family = {
+	.id             = GENL_ID_GENERATE,
+	.hdrsize        = 0,
+	.name           = "NET_DM",
+	.version        = 2,
+	.maxattr        = NET_DM_CMD_MAX,
+};
+
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+
+/* number of hits that fit into one alert message (per-CPU hit budget) */
+static int dm_hit_limit = 64;
+/* delay, in seconds, between the first recorded hit and sending the alert */
+static int dm_delay = 1;
+/* minimum number of jiffies between two reports for the same device */
+static unsigned long dm_hw_check_delta = 2*HZ;
+/* list of struct dm_hw_stat_delta, one per registered device */
+static LIST_HEAD(hw_stats_list);
+
+/*
+ * Install a fresh, empty alert message in @data.
+ *
+ * The new skb is sized for one struct net_dm_alert_msg plus dm_hit_limit
+ * drop points.  It receives a NET_DM_CMD_ALERT generic netlink header and
+ * an NLA_UNSPEC attribute holding a zeroed alert message, and the hit
+ * budget is reset to dm_hit_limit.
+ *
+ * If the skb cannot be allocated, the hit budget is forced to zero and
+ * data->skb is set to NULL.  trace_drop_common() tests the budget before
+ * it looks at data->skb, so it discards hits instead of dereferencing a
+ * NULL skb; recording resumes after a later successful reset.
+ *
+ * The allocation uses GFP_KERNEL.
+ */
+static void reset_per_cpu_data(struct per_cpu_dm_data *data)
+{
+	size_t al;
+	struct net_dm_alert_msg *msg;
+	struct nlattr *nla;
+	struct sk_buff *skb;
+
+	al = sizeof(struct net_dm_alert_msg);
+	al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+	al += sizeof(struct nlattr);
+
+	skb = genlmsg_new(al, GFP_KERNEL);
+	if (!skb) {
+		/* close the hit budget first, then drop the stale pointer */
+		atomic_set(&data->dm_hit_count, 0);
+		data->skb = NULL;
+		return;
+	}
+
+	data->skb = skb;
+	genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
+			0, NET_DM_CMD_ALERT);
+	nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg));
+	msg = nla_data(nla);
+	/*
+	 * NOTE(review): al also counts the attribute header, so this clears
+	 * sizeof(struct nlattr) bytes more than the message and its drop
+	 * points need - confirm the skb tailroom always covers that.
+	 */
+	memset(msg, 0, al);
+	atomic_set(&data->dm_hit_count, dm_hit_limit);
+}
+
+/*
+ * Work handler: hand the alert message collected on this CPU to the
+ * NET_DM_GRP_ALERT multicast group and start collecting into a new one.
+ */
+static void send_dm_alert(struct work_struct *unused)
+{
+	struct sk_buff *skb;
+	struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+
+	/*
+	 * Grab the skb we're about to send
+	 */
+	skb = data->skb;
+
+	/*
+	 * Replace it with a new one
+	 */
+	reset_per_cpu_data(data);
+
+	/*
+	 * Ship it!  There is nothing to ship if an earlier reset failed to
+	 * allocate a message buffer.
+	 */
+	if (skb)
+		genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
+
+}
+
+/*
+ * Timer callback that defers the sending of an alert so that drops
+ * arriving during the hysteresis period end up in the same message.
+ * It only schedules this CPU's alert work item.  It runs from the timer
+ * interrupt, so preemption does not need to be disabled around the
+ * per-CPU access.
+ */
+static void sched_send_work(unsigned long unused)
+{
+	schedule_work(&__get_cpu_var(dm_cpu_data).dm_alert_work);
+}
+
+/*
+ * Record one drop at @location in this CPU's pending alert message.
+ *
+ * A hit is only recorded while the per-CPU hit budget is non-zero.  If
+ * @location is already listed in the message, its counter is bumped.
+ * Otherwise a new drop point is appended and, unless the send timer is
+ * already pending, the timer is armed to fire dm_delay seconds from now on
+ * the current CPU.  @skb is not used.
+ */
+static void trace_drop_common(struct sk_buff *skb, void *location)
+{
+	struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
+	struct net_dm_alert_msg *msg;
+	struct nlmsghdr *nlh;
+	struct nlattr *nla;
+	int idx;
+
+	/* budget already at zero: discard this hit */
+	if (!atomic_add_unless(&data->dm_hit_count, -1, 0))
+		return;
+
+	nlh = (struct nlmsghdr *)data->skb->data;
+	nla = genlmsg_data(nlmsg_data(nlh));
+	msg = nla_data(nla);
+
+	for (idx = 0; idx < msg->entries; idx++) {
+		if (memcmp(&location, msg->points[idx].pc, sizeof(void *)))
+			continue;
+
+		msg->points[idx].count++;
+		return;
+	}
+
+	/* location not seen yet: grow the attribute by one drop point */
+	__nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
+	memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
+	msg->points[msg->entries].count = 1;
+	msg->entries++;
+
+	if (timer_pending(&data->send_timer))
+		return;
+
+	data->send_timer.expires = jiffies + dm_delay * HZ;
+	add_timer_on(&data->send_timer, smp_processor_id());
+}
+
+/* kfree_skb tracepoint probe: record a drop at @location. */
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
+{
+	trace_drop_common(skb, location);
+}
+
+/*
+ * napi_poll tracepoint probe: record a drop (with a NULL location) when
+ * the rx_dropped counter of the polled device has changed since the last
+ * report for that device.  At most one report is made per
+ * dm_hw_check_delta jiffies and device.
+ */
+static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
+{
+	struct dm_hw_stat_delta *stat;
+
+	/* napi structures without an associated device are not checked */
+	if (!napi->dev)
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(stat, &hw_stats_list, list) {
+		/* 1) must be the device we received on */
+		if (stat->dev != napi->dev)
+			continue;
+		/* 2) must be past the last_rx delta */
+		if (!time_after(jiffies, stat->last_rx + dm_hw_check_delta))
+			continue;
+		/* 3) the rx_dropped count must have changed */
+		if (napi->dev->stats.rx_dropped == stat->last_drop_val)
+			continue;
+
+		trace_drop_common(NULL, NULL);
+		stat->last_drop_val = napi->dev->stats.rx_dropped;
+		stat->last_rx = jiffies;
+		break;
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Switch drop monitoring on (TRACE_ON) or off (TRACE_OFF).
+ *
+ * Returns 0 when the state was changed, -EAGAIN when @state is already the
+ * current state, and -EINPROGRESS when @state is unknown or when any of
+ * the tracepoint (un)registrations returned non-zero.  trace_state is only
+ * updated on success.
+ *
+ * NOTE(review): the tracepoint (un)registration calls and
+ * tracepoint_synchronize_unregister() run with trace_state_lock, a
+ * spinlock, held - confirm that none of them can sleep.
+ */
+static int set_all_monitor_traces(int state)
+{
+	int rc = 0;
+	struct dm_hw_stat_delta *new_stat = NULL;
+	struct dm_hw_stat_delta *temp;
+
+	spin_lock(&trace_state_lock);
+
+	if (state == trace_state) {
+		rc = -EAGAIN;
+		goto out_unlock;
+	}
+
+	switch (state) {
+	case TRACE_ON:
+		/* results are OR-ed together: only zero/non-zero matters below */
+		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+		rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
+		break;
+	case TRACE_OFF:
+		rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
+		rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
+
+		tracepoint_synchronize_unregister();
+
+		/*
+		 * Clean the device list: entries whose device went away while
+		 * tracing was on were only marked (dev == NULL) by
+		 * dropmon_net_event() and are released here.
+		 */
+		list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+			if (new_stat->dev == NULL) {
+				list_del_rcu(&new_stat->list);
+				kfree_rcu(new_stat, rcu);
+			}
+		}
+		break;
+	default:
+		rc = 1;
+		break;
+	}
+
+	if (!rc)
+		trace_state = state;
+	else
+		rc = -EINPROGRESS;
+
+out_unlock:
+	spin_unlock(&trace_state_lock);
+
+	return rc;
+}
+
+
+/* NET_DM_CMD_CONFIG handler: not implemented, always returns -ENOTSUPP. */
+static int net_dm_cmd_config(struct sk_buff *skb,
+			struct genl_info *info)
+{
+	return -ENOTSUPP;
+}
+
+/*
+ * Handler for NET_DM_CMD_START and NET_DM_CMD_STOP: switch tracing on or
+ * off and return the result of set_all_monitor_traces().  Any other
+ * command yields -ENOTSUPP.
+ */
+static int net_dm_cmd_trace(struct sk_buff *skb,
+			struct genl_info *info)
+{
+	if (info->genlhdr->cmd == NET_DM_CMD_START)
+		return set_all_monitor_traces(TRACE_ON);
+
+	if (info->genlhdr->cmd == NET_DM_CMD_STOP)
+		return set_all_monitor_traces(TRACE_OFF);
+
+	return -ENOTSUPP;
+}
+
+/*
+ * Netdevice notifier keeping hw_stats_list in step with the set of
+ * registered devices.
+ *
+ * NETDEV_REGISTER adds a new entry for the device; an allocation failure
+ * is silently ignored.  NETDEV_UNREGISTER clears the dev pointer of the
+ * matching entry and, if tracing is currently off, also unlinks and frees
+ * that entry; with tracing on, the entry stays on the list with a NULL dev
+ * pointer.  Always returns NOTIFY_DONE.
+ */
+static int dropmon_net_event(struct notifier_block *ev_block,
+			unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct dm_hw_stat_delta *stat = NULL;
+	struct dm_hw_stat_delta *next;
+
+	if (event == NETDEV_REGISTER) {
+		stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+		if (!stat)
+			return NOTIFY_DONE;
+
+		stat->dev = dev;
+		stat->last_rx = jiffies;
+
+		spin_lock(&trace_state_lock);
+		list_add_rcu(&stat->list, &hw_stats_list);
+		spin_unlock(&trace_state_lock);
+	} else if (event == NETDEV_UNREGISTER) {
+		spin_lock(&trace_state_lock);
+		list_for_each_entry_safe(stat, next, &hw_stats_list, list) {
+			if (stat->dev != dev)
+				continue;
+
+			stat->dev = NULL;
+			if (trace_state != TRACE_OFF)
+				continue;
+
+			list_del_rcu(&stat->list);
+			kfree_rcu(stat, rcu);
+			break;
+		}
+		spin_unlock(&trace_state_lock);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Generic netlink commands of the NET_DM family: CONFIG has its own handler,
+ * START and STOP share net_dm_cmd_trace().
+ */
+static struct genl_ops dropmon_ops[] = {
+	{
+		.cmd = NET_DM_CMD_CONFIG,
+		.doit = net_dm_cmd_config,
+	},
+	{
+		.cmd = NET_DM_CMD_START,
+		.doit = net_dm_cmd_trace,
+	},
+	{
+		.cmd = NET_DM_CMD_STOP,
+		.doit = net_dm_cmd_trace,
+	},
+};
+
+/* Netdevice notifier registered in init_net_drop_monitor(). */
+static struct notifier_block dropmon_net_notifier = {
+	.notifier_call = dropmon_net_event
+};
+
+/*
+ * Late initcall that sets up the drop monitor: register the generic
+ * netlink family and its commands, register the netdevice notifier, then
+ * prepare the alert message, work item and timer of every present CPU.
+ *
+ * Returns 0 on success, -ENOSPC if a pointer does not fit into the 8 byte
+ * program counter field, or the error of the failing registration.  The
+ * netlink family is unregistered again if the notifier cannot be
+ * registered.
+ */
+static int __init init_net_drop_monitor(void)
+{
+	struct per_cpu_dm_data *data;
+	int cpu, rc;
+
+	printk(KERN_INFO "Initializing network drop monitor service\n");
+
+	if (sizeof(void *) > 8) {
+		printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n");
+		return -ENOSPC;
+	}
+
+	rc = genl_register_family_with_ops(&net_drop_monitor_family,
+					   dropmon_ops,
+					   ARRAY_SIZE(dropmon_ops));
+	if (rc) {
+		printk(KERN_ERR "Could not create drop monitor netlink family\n");
+		return rc;
+	}
+
+	rc = register_netdevice_notifier(&dropmon_net_notifier);
+	if (rc < 0) {
+		printk(KERN_CRIT "Failed to register netdevice notifier\n");
+		genl_unregister_family(&net_drop_monitor_family);
+		return rc;
+	}
+
+	for_each_present_cpu(cpu) {
+		data = &per_cpu(dm_cpu_data, cpu);
+		reset_per_cpu_data(data);
+		INIT_WORK(&data->dm_alert_work, send_dm_alert);
+		init_timer(&data->send_timer);
+		data->send_timer.data = cpu;
+		data->send_timer.function = sched_send_work;
+	}
+
+	return 0;
+}
+
+late_initcall(init_net_drop_monitor);
diff --git a/net/core/dst.c b/net/core/dst.c
new file mode 100644
index 00000000..8246d47a
--- /dev/null
+++ b/net/core/dst.c
@@ -0,0 +1,426 @@
+/*
+ * net/core/dst.c	Protocol independent destination cache.
+ *
+ * Authors:		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/sched.h>
+#include <linux/prefetch.h>
+
+#include <net/dst.h>
+
+/*
+ * Theory of operations:
+ * 1) We use a list, protected by a spinlock, to add
+ *    new entries from both BH and non-BH context.
+ * 2) To keep the time the spinlock is held short, long-lived
+ *    entries are stored on a second list, which is handled by
+ *    the garbage collection task fired by a workqueue.
+ * 3) This list is guarded by a mutex,
+ *    so that the gc_task and dst_dev_event() can be synchronized.
+ */
+
+/*
+ * We want to keep lock & list close together
+ * to dirty as few cache lines as possible in __dst_free().
+ * As this is not a very strong hint, we don't force an alignment on SMP.
+ */
+static struct {
+	spinlock_t		lock;		/* taken around accesses to the fields below */
+	struct dst_entry	*list;		/* entries queued by __dst_free(), chained via ->next */
+	unsigned long		timer_inc;	/* step added to timer_expires when gc backs off */
+	unsigned long		timer_expires;	/* delay used when (re)scheduling dst_gc_work */
+} dst_garbage = {
+	.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
+	.timer_inc = DST_GC_MAX,
+};
+/* Forward declarations: both are referenced before they are defined. */
+static void dst_gc_task(struct work_struct *work);
+static void ___dst_free(struct dst_entry *dst);
+
+/* Delayed work item whose handler is dst_gc_task(). */
+static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
+
+static DEFINE_MUTEX(dst_gc_mutex);
+/*
+ * long lived entries are maintained in this list, guarded by dst_gc_mutex
+ */
+static struct dst_entry         *dst_busy_list;
+
+/*
+ * Garbage collection pass, the handler of dst_gc_work.  Runs with
+ * dst_gc_mutex held from start to finish.
+ *
+ * Scans dst_busy_list and then, repeatedly, whatever has been queued on
+ * dst_garbage.list in the meantime.  Entries whose refcount is still
+ * non-zero are chained behind the on-stack 'head' and become the new busy
+ * list; the others are passed to dst_destroy().  When the busy list ends
+ * up non-empty the work is scheduled again, with a delay that grows while
+ * at most a tenth of the delayed entries got freed and is reset to
+ * DST_GC_MIN otherwise.
+ */
+static void dst_gc_task(struct work_struct *work)
+{
+	int    delayed = 0;
+	int    work_performed = 0;
+	unsigned long expires = ~0L;
+	struct dst_entry *dst, *next, head;
+	struct dst_entry *last = &head;
+
+	mutex_lock(&dst_gc_mutex);
+	next = dst_busy_list;
+
+loop:
+	while ((dst = next) != NULL) {
+		next = dst->next;
+		prefetch(&next->next);
+		cond_resched();
+		if (likely(atomic_read(&dst->__refcnt))) {
+			/* still in use: keep it on the rebuilt busy list */
+			last->next = dst;
+			last = dst;
+			delayed++;
+			continue;
+		}
+		work_performed++;
+
+		dst = dst_destroy(dst);
+		if (dst) {
+			/* NOHASH and still referenced. Unless it is already
+			 * on gc list, invalidate it and add to gc list.
+			 *
+			 * Note: this is temporary. Actually, NOHASH dst's
+			 * must be obsoleted when parent is obsoleted.
+			 * But we do not have state "obsoleted, but
+			 * referenced by parent", so it is right.
+			 */
+			if (dst->obsolete > 1)
+				continue;
+
+			___dst_free(dst);
+			/* push it in front of the entries still to be scanned */
+			dst->next = next;
+			next = dst;
+		}
+	}
+
+	/* pick up entries queued by __dst_free() while we were scanning */
+	spin_lock_bh(&dst_garbage.lock);
+	next = dst_garbage.list;
+	if (next) {
+		dst_garbage.list = NULL;
+		spin_unlock_bh(&dst_garbage.lock);
+		goto loop;
+	}
+	/* terminate the rebuilt list and publish it as the busy list */
+	last->next = NULL;
+	dst_busy_list = head.next;
+	if (!dst_busy_list)
+		dst_garbage.timer_inc = DST_GC_MAX;
+	else {
+		/*
+		 * if we freed less than 1/10 of delayed entries,
+		 * we can sleep longer.
+		 */
+		if (work_performed <= delayed/10) {
+			dst_garbage.timer_expires += dst_garbage.timer_inc;
+			if (dst_garbage.timer_expires > DST_GC_MAX)
+				dst_garbage.timer_expires = DST_GC_MAX;
+			dst_garbage.timer_inc += DST_GC_INC;
+		} else {
+			dst_garbage.timer_inc = DST_GC_INC;
+			dst_garbage.timer_expires = DST_GC_MIN;
+		}
+		expires = dst_garbage.timer_expires;
+		/*
+		 * if the next desired timer is more than 4 seconds in the
+		 * future then round the timer to whole seconds
+		 */
+		if (expires > 4*HZ)
+			expires = round_jiffies_relative(expires);
+		schedule_delayed_work(&dst_gc_work, expires);
+	}
+
+	spin_unlock_bh(&dst_garbage.lock);
+	mutex_unlock(&dst_gc_mutex);
+}
+
+/*
+ * Packet handler that simply frees @skb.  Installed as the initial
+ * input/output handler by dst_alloc() and again when a dst is disabled.
+ * Always returns 0.
+ */
+int dst_discard(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(dst_discard);
+
+/*
+ * Shared default metrics table.  It has no initializer, so every entry is
+ * zero.  dst_alloc() installs it as the initial metrics of a new dst.
+ */
+const u32 dst_default_metrics[RTAX_MAX];
+
+/*
+ * Allocate a dst entry from ops->kmem_cachep (GFP_ATOMIC) and set every
+ * field to its default.
+ *
+ * @ops:              protocol ops; its gc hook is run first when the entry
+ *                    count exceeds ops->gc_thresh
+ * @dev:              device to bind to (a reference is taken), may be NULL
+ * @initial_ref:      starting value of the refcount
+ * @initial_obsolete: starting value of ->obsolete
+ * @flags:            DST_* flags; DST_NOCOUNT keeps the entry out of the
+ *                    per-ops entry counter
+ *
+ * Returns the new entry, or NULL if ops->gc() returned non-zero or the
+ * allocation failed.
+ */
+void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
+		int initial_ref, int initial_obsolete, int flags)
+{
+	struct dst_entry *entry;
+
+	/* over the threshold: let the protocol's gc run before allocating */
+	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh &&
+	    ops->gc(ops))
+		return NULL;
+
+	entry = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+	if (entry == NULL)
+		return NULL;
+
+	entry->child = NULL;
+	entry->dev = dev;
+	if (dev != NULL)
+		dev_hold(dev);
+	entry->ops = ops;
+	dst_init_metrics(entry, dst_default_metrics, true);
+	entry->expires = 0UL;
+	entry->path = entry;
+	RCU_INIT_POINTER(entry->_neighbour, NULL);
+	entry->hh = NULL;
+#ifdef CONFIG_XFRM
+	entry->xfrm = NULL;
+#endif
+	/* until the protocol installs real handlers, drop everything */
+	entry->input = dst_discard;
+	entry->output = dst_discard;
+	entry->error = 0;
+	entry->obsolete = initial_obsolete;
+	entry->header_len = 0;
+	entry->trailer_len = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	entry->tclassid = 0;
+#endif
+	atomic_set(&entry->__refcnt, initial_ref);
+	entry->__use = 0;
+	entry->lastuse = jiffies;
+	entry->flags = flags;
+	entry->next = NULL;
+
+	if (!(flags & DST_NOCOUNT))
+		dst_entries_add(ops, 1);
+
+	return entry;
+}
+EXPORT_SYMBOL(dst_alloc);
+
+/*
+ * Mark @dst obsolete (obsolete = 2).  When the entry has no device, or its
+ * device is not IFF_UP, input and output are also switched to
+ * dst_discard().  The dev == NULL case is required when a protocol module
+ * is unloaded.
+ */
+static void ___dst_free(struct dst_entry *dst)
+{
+	const struct net_device *dev = dst->dev;
+
+	if (!dev || !(dev->flags & IFF_UP)) {
+		dst->output = dst_discard;
+		dst->input = dst_discard;
+	}
+
+	dst->obsolete = 2;
+}
+
+/*
+ * Queue @dst for garbage collection.
+ *
+ * Under dst_garbage.lock the entry is marked obsolete via ___dst_free()
+ * and pushed onto the head of dst_garbage.list.  If the gc had backed off
+ * (timer_inc above DST_GC_INC) the interval is reset to its minimum and
+ * dst_gc_work is rescheduled with the DST_GC_MIN delay.
+ */
+void __dst_free(struct dst_entry *dst)
+{
+	spin_lock_bh(&dst_garbage.lock);
+	___dst_free(dst);
+	dst->next = dst_garbage.list;
+	dst_garbage.list = dst;
+	if (dst_garbage.timer_inc > DST_GC_INC) {
+		dst_garbage.timer_inc = DST_GC_INC;
+		dst_garbage.timer_expires = DST_GC_MIN;
+		/* drop the pending (longer) delay before re-arming */
+		cancel_delayed_work(&dst_gc_work);
+		schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
+	}
+	spin_unlock_bh(&dst_garbage.lock);
+}
+EXPORT_SYMBOL(__dst_free);
+
+/*
+ * Tear down @dst and, iteratively, its chain of children.
+ *
+ * For each entry: the hh cache and neighbour references are dropped, the
+ * per-ops entry counter is decremented unless DST_NOCOUNT is set,
+ * ops->destroy() is called if present, the device reference is released
+ * and the object goes back to the kmem cache.  The child's refcount is
+ * then decremented; a DST_NOHASH child that reached zero is destroyed in
+ * the next iteration.
+ *
+ * Returns a DST_NOHASH child that is still referenced, for the caller to
+ * dispose of, or NULL otherwise.
+ */
+struct dst_entry *dst_destroy(struct dst_entry * dst)
+{
+	struct dst_entry *child;
+	struct neighbour *neigh;
+	struct hh_cache *hh;
+
+	smp_rmb();
+
+again:
+	neigh = rcu_dereference_protected(dst->_neighbour, 1);
+	hh = dst->hh;
+	/* remember the child now: dst itself is freed further down */
+	child = dst->child;
+
+	dst->hh = NULL;
+	if (hh)
+		hh_cache_put(hh);
+
+	if (neigh) {
+		RCU_INIT_POINTER(dst->_neighbour, NULL);
+		neigh_release(neigh);
+	}
+
+	if (!(dst->flags & DST_NOCOUNT))
+		dst_entries_add(dst->ops, -1);
+
+	if (dst->ops->destroy)
+		dst->ops->destroy(dst);
+	if (dst->dev)
+		dev_put(dst->dev);
+	kmem_cache_free(dst->ops->kmem_cachep, dst);
+
+	dst = child;
+	if (dst) {
+		int nohash = dst->flags & DST_NOHASH;
+
+		if (atomic_dec_and_test(&dst->__refcnt)) {
+			/* We were real parent of this dst, so kill child. */
+			if (nohash)
+				goto again;
+		} else {
+			/* Child is still referenced, return it for freeing. */
+			if (nohash)
+				return dst;
+			/* Child is still in his hash table */
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(dst_destroy);
+
+/*
+ * Drop one reference on @dst; a NULL @dst is ignored.
+ *
+ * Warns if the refcount goes negative.  When the last reference of a
+ * DST_NOCACHE entry is dropped the entry is destroyed on the spot, and a
+ * child handed back by dst_destroy() is queued with __dst_free().
+ */
+void dst_release(struct dst_entry *dst)
+{
+	int remaining;
+
+	if (!dst)
+		return;
+
+	remaining = atomic_dec_return(&dst->__refcnt);
+	WARN_ON(remaining < 0);
+
+	/* cached entries, and entries still in use, are left alone */
+	if (likely(!(dst->flags & DST_NOCACHE)) || remaining)
+		return;
+
+	dst = dst_destroy(dst);
+	if (dst)
+		__dst_free(dst);
+}
+EXPORT_SYMBOL(dst_release);
+
+/*
+ * Give @dst a private, writable copy of the metrics referenced by @old.
+ *
+ * A new RTAX_MAX-sized array is allocated (GFP_ATOMIC), filled from the
+ * old metrics and installed with cmpxchg().  If another update won the
+ * race the copy is freed and the currently installed metrics are used
+ * instead.
+ *
+ * Returns the writable metrics array, or NULL if the allocation failed or
+ * the metrics installed by the competing update are read-only.
+ */
+u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+	unsigned long seen, fresh;
+	u32 *copy;
+
+	copy = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
+	if (!copy)
+		return NULL;
+
+	memcpy(copy, __DST_METRICS_PTR(old), sizeof(u32) * RTAX_MAX);
+
+	fresh = (unsigned long) copy;
+	seen = cmpxchg(&dst->_metrics, old, fresh);
+	if (seen == old)
+		return copy;
+
+	/* lost the race: discard our copy, fall back to what is installed */
+	kfree(copy);
+	if (seen & DST_METRICS_READ_ONLY)
+		return NULL;
+
+	return __DST_METRICS_PTR(seen);
+}
+EXPORT_SYMBOL(dst_cow_metrics_generic);
+
+/*
+ * Swap the private metrics @old of @dst for the shared read-only default
+ * table and free @old, but only if @old was still installed.
+ *
+ * Caller asserts that dst_metrics_read_only(dst) is false.
+ */
+void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
+{
+	unsigned long defaults, seen;
+
+	defaults = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
+	seen = cmpxchg(&dst->_metrics, old, defaults);
+	if (seen != old)
+		return;
+
+	kfree(__DST_METRICS_PTR(old));
+}
+EXPORT_SYMBOL(__dst_destroy_metrics_generic);
+
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Attaches @dst to @skb on the assumption that no reference was taken
+ * on @dst; skb_dst_drop() should then not dst_release() it.  Must be
+ * called under rcu_read_lock() or rcu_read_lock_bh(), otherwise a
+ * warning is emitted.
+ */
+void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+	if (likely(!(dst->flags & DST_NOCACHE))) {
+		skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+		return;
+	}
+
+	/* If dst not in cache, we must take a reference, because
+	 * dst_release() will destroy dst as soon as its refcount becomes zero
+	 */
+	dst_hold(dst);
+	skb_dst_set(skb, dst);
+}
+EXPORT_SYMBOL(skb_dst_set_noref);
+
+/* Dirty hack. We did it in 2.2 (in __dst_free),
+ * we have _very_ good reasons not to repeat
+ * this mistake in 2.3, but we have no choice
+ * now. _It_ _is_ _explicit_ _deliberate_
+ * _race_ _condition_.
+ *
+ * Commented and originally written by Alexey.
+ */
+/*
+ * Detach @dst from @dev.  The protocol's ifdown hook, if any, runs first;
+ * nothing more happens unless @dst is bound to @dev.  On a plain down
+ * event (@unregister == 0) input and output are switched to
+ * dst_discard().  On unregister the dst, and its neighbour entry if that
+ * points at @dev too, are re-pointed at the namespace's loopback device,
+ * moving one device reference each time.
+ */
+static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+		       int unregister)
+{
+	if (dst->ops->ifdown)
+		dst->ops->ifdown(dst, dev, unregister);
+
+	if (dev != dst->dev)
+		return;
+
+	if (!unregister) {
+		dst->input = dst->output = dst_discard;
+	} else {
+		struct neighbour *neigh;
+
+		/* hold the loopback device before letting go of @dev */
+		dst->dev = dev_net(dst->dev)->loopback_dev;
+		dev_hold(dst->dev);
+		dev_put(dev);
+		rcu_read_lock();
+		neigh = dst_get_neighbour(dst);
+		if (neigh && neigh->dev == dev) {
+			neigh->dev = dst->dev;
+			dev_hold(dst->dev);
+			dev_put(dev);
+		}
+		rcu_read_unlock();
+	}
+}
+
+/*
+ * Netdevice notifier callback.
+ *
+ * On NETDEV_UNREGISTER and NETDEV_DOWN, with dst_gc_mutex held, every
+ * entry of the busy list is passed to dst_ifdown().  The pending garbage
+ * list is then taken over, appended to the busy list and processed the
+ * same way.  Other events are ignored.  Always returns NOTIFY_DONE.
+ */
+static int dst_dev_event(struct notifier_block *this, unsigned long event,
+			 void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct dst_entry *dst, *last = NULL;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+	case NETDEV_DOWN:
+		mutex_lock(&dst_gc_mutex);
+		/* 'last' ends up pointing at the tail of the busy list */
+		for (dst = dst_busy_list; dst; dst = dst->next) {
+			last = dst;
+			dst_ifdown(dst, dev, event != NETDEV_DOWN);
+		}
+
+		/* steal the garbage list; the spinlock is held only briefly */
+		spin_lock_bh(&dst_garbage.lock);
+		dst = dst_garbage.list;
+		dst_garbage.list = NULL;
+		spin_unlock_bh(&dst_garbage.lock);
+
+		if (last)
+			last->next = dst;
+		else
+			dst_busy_list = dst;
+		for (; dst; dst = dst->next)
+			dst_ifdown(dst, dev, event != NETDEV_DOWN);
+		mutex_unlock(&dst_gc_mutex);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that routes netdevice events to dst_dev_event(). */
+static struct notifier_block dst_dev_notifier = {
+	.notifier_call	= dst_dev_event,
+	.priority = -10, /* must be called after other network notifiers */
+};
+
+/*
+ * Init-time hook: registers dst_dev_notifier.  The return value of
+ * register_netdevice_notifier() is not checked.
+ */
+void __init dst_init(void)
+{
+	register_netdevice_notifier(&dst_dev_notifier);
+}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
new file mode 100644
index 00000000..4fb77049
--- /dev/null
+++ b/net/core/ethtool.c
@@ -0,0 +1,2166 @@
+/*
+ * net/core/ethtool.c - Ethtool ioctl handler
+ * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
+ *
+ * This file is where we call all the ethtool_ops commands to get
+ * the information ethtool needs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/bitops.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+
+/*
+ * Some useful ethtool_ops methods that're device independent.
+ * If we find that all drivers want to do the same thing here,
+ * we can turn these into dev_() function calls.
+ */
+
+/* Generic get_link op: returns 1 when netif_carrier_ok() holds, else 0. */
+u32 ethtool_op_get_link(struct net_device *dev)
+{
+	if (netif_carrier_ok(dev))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_get_link);
+
+/* Returns 1 if any NETIF_F_ALL_CSUM bit is set in dev->features, else 0. */
+u32 ethtool_op_get_tx_csum(struct net_device *dev)
+{
+	return !!(dev->features & NETIF_F_ALL_CSUM);
+}
+EXPORT_SYMBOL(ethtool_op_get_tx_csum);
+
+/*
+ * Sets (data != 0) or clears (data == 0) NETIF_F_IP_CSUM in
+ * dev->features.  Always returns 0.
+ */
+int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_IP_CSUM;
+	else
+		dev->features |= NETIF_F_IP_CSUM;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_tx_csum);
+
+/*
+ * Sets (data != 0) or clears (data == 0) NETIF_F_HW_CSUM in
+ * dev->features.  Always returns 0.
+ */
+int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_HW_CSUM;
+	else
+		dev->features |= NETIF_F_HW_CSUM;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
+
+/*
+ * Sets (data != 0) or clears (data == 0) both NETIF_F_IP_CSUM and
+ * NETIF_F_IPV6_CSUM in dev->features.  Always returns 0.
+ */
+int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+	else
+		dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
+
+/* Returns 1 if NETIF_F_SG is set in dev->features, else 0. */
+u32 ethtool_op_get_sg(struct net_device *dev)
+{
+	return !!(dev->features & NETIF_F_SG);
+}
+EXPORT_SYMBOL(ethtool_op_get_sg);
+
+/*
+ * Sets (data != 0) or clears (data == 0) NETIF_F_SG in dev->features.
+ * Always returns 0.
+ */
+int ethtool_op_set_sg(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_SG;
+	else
+		dev->features |= NETIF_F_SG;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_sg);
+
+/* Returns 1 if NETIF_F_TSO is set in dev->features, else 0. */
+u32 ethtool_op_get_tso(struct net_device *dev)
+{
+	return !!(dev->features & NETIF_F_TSO);
+}
+EXPORT_SYMBOL(ethtool_op_get_tso);
+
+/*
+ * Sets (data != 0) or clears (data == 0) NETIF_F_TSO in dev->features.
+ * Always returns 0.
+ */
+int ethtool_op_set_tso(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_TSO;
+	else
+		dev->features |= NETIF_F_TSO;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_tso);
+
+/* Returns 1 if NETIF_F_UFO is set in dev->features, else 0. */
+u32 ethtool_op_get_ufo(struct net_device *dev)
+{
+	return !!(dev->features & NETIF_F_UFO);
+}
+EXPORT_SYMBOL(ethtool_op_get_ufo);
+
+/*
+ * Sets (data != 0) or clears (data == 0) NETIF_F_UFO in dev->features.
+ * Always returns 0.
+ */
+int ethtool_op_set_ufo(struct net_device *dev, u32 data)
+{
+	if (!data)
+		dev->features &= ~NETIF_F_UFO;
+	else
+		dev->features |= NETIF_F_UFO;
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_ufo);
+
+/* The ETH_FLAG_xxx bits listed here have the same values as their
+ * associated NETIF_F_xxx bits in include/linux/netdevice.h, which is
+ * why the helpers below can mask them straight out of dev->features.
+ */
+static const u32 flags_dup_features =
+	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
+	 ETH_FLAG_RXHASH);
+
+/* Returns the bits of dev->features that are covered by flags_dup_features. */
+u32 ethtool_op_get_flags(struct net_device *dev)
+{
+	u32 flags;
+
+	/* in the future, this function will probably contain additional
+	 * handling for flags which are not so easily handled
+	 * by a simple masking operation
+	 */
+	flags = dev->features & flags_dup_features;
+
+	return flags;
+}
+EXPORT_SYMBOL(ethtool_op_get_flags);
+
+/* Check whether the flag set coded in "data" is acceptable for the device.
+ * "supported" names the flags the device can toggle.  Every flag outside
+ * "supported" must keep the state it currently has in dev->features;
+ * returns true (invalid) if "data" differs in any such flag.
+ */
+bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported)
+{
+	/* "data" can contain only flags_dup_features bits,
+	 * see __ethtool_set_flags */
+	u32 cur_flags = dev->features & flags_dup_features;
+	u32 fixed = ~supported;
+
+	return (cur_flags & fixed) != (data & fixed);
+}
+EXPORT_SYMBOL(ethtool_invalid_flags);
+
+/*
+ * Generic set_flags helper.  Rejects the request with -EINVAL when
+ * ethtool_invalid_flags() objects; otherwise replaces the
+ * flags_dup_features bits of dev->features with those of @data and
+ * returns 0.
+ */
+int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
+{
+	u32 kept;
+
+	if (ethtool_invalid_flags(dev, data, supported))
+		return -EINVAL;
+
+	kept = dev->features & ~flags_dup_features;
+	dev->features = kept | (data & flags_dup_features);
+
+	return 0;
+}
+EXPORT_SYMBOL(ethtool_op_set_flags);
+
+/*
+ * Unlink and free every container on the device's ethtool n-tuple list,
+ * then reset the list's entry count to zero.
+ */
+void ethtool_ntuple_flush(struct net_device *dev)
+{
+	struct ethtool_rx_ntuple_flow_spec_container *entry, *following;
+
+	list_for_each_entry_safe(entry, following,
+				 &dev->ethtool_ntuple_list.list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	dev->ethtool_ntuple_list.count = 0;
+}
+EXPORT_SYMBOL(ethtool_ntuple_flush);
+
+/* Handlers for each ethtool command */
+
+/* Number of feature blocks exchanged by the get/set features handlers. */
+#define ETHTOOL_DEV_FEATURE_WORDS	1
+
+/*
+ * Fold the state of legacy per-feature ethtool ops into @features[0]:
+ * RX checksumming is reported as active when the driver's get_rx_csum()
+ * says so, and every feature group the driver has a legacy setter for is
+ * marked available.  Does nothing for devices without ethtool_ops.
+ */
+static void ethtool_get_features_compat(struct net_device *dev,
+	struct ethtool_get_features_block *features)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops)
+		return;
+
+	/* getting RX checksum */
+	if (ops->get_rx_csum && ops->get_rx_csum(dev))
+		features[0].active |= NETIF_F_RXCSUM;
+
+	/* mark legacy-changeable features */
+	if (ops->set_sg)
+		features[0].available |= NETIF_F_SG;
+	if (ops->set_tx_csum)
+		features[0].available |= NETIF_F_ALL_CSUM;
+	if (ops->set_tso)
+		features[0].available |= NETIF_F_ALL_TSO;
+	if (ops->set_rx_csum)
+		features[0].available |= NETIF_F_RXCSUM;
+	if (ops->set_flags)
+		features[0].available |= flags_dup_features;
+}
+
+/*
+ * Apply one feature group through a legacy boolean setter.
+ *
+ * Returns 0 without side effects when @legacy_set is NULL or none of the
+ * @mask bits are marked valid.  Otherwise the @mask bits are removed
+ * from features[0].valid, @legacy_set is called with 1 if any requested
+ * bit of @mask is set (0 otherwise), a failure is logged, and 1 is
+ * returned.
+ */
+static int ethtool_set_feature_compat(struct net_device *dev,
+	int (*legacy_set)(struct net_device *, u32),
+	struct ethtool_set_features_block *features, u32 mask)
+{
+	u32 enable;
+
+	if (!legacy_set || !(features[0].valid & mask))
+		return 0;
+
+	features[0].valid &= ~mask;
+	enable = (features[0].requested & mask) ? 1 : 0;
+
+	if (legacy_set(dev, enable) < 0)
+		netdev_info(dev,
+			"Legacy feature change (%s) failed for 0x%08x\n",
+			enable ? "set" : "clear", mask);
+
+	return 1;
+}
+
+/*
+ * Apply the @mask flag bits through a legacy set_flags() style setter.
+ *
+ * Returns 0 without side effects when @legacy_set is NULL or none of the
+ * @mask bits are marked valid.  Otherwise the setter receives the @mask
+ * portion of "current features with the valid bits replaced by the
+ * requested ones", the @mask bits are removed from features[0].valid, a
+ * failure is logged, and 1 is returned.
+ */
+static int ethtool_set_flags_compat(struct net_device *dev,
+	int (*legacy_set)(struct net_device *, u32),
+	struct ethtool_set_features_block *features, u32 mask)
+{
+	u32 merged;
+
+	if (!legacy_set || !(features[0].valid & mask))
+		return 0;
+
+	/* must be computed before the handled bits are stripped from valid */
+	merged = (dev->features & ~features[0].valid) | features[0].requested;
+
+	features[0].valid &= ~mask;
+
+	if (legacy_set(dev, merged & mask) < 0)
+		netdev_info(dev, "Legacy flags change failed\n");
+
+	return 1;
+}
+
+/*
+ * Route the requested feature changes through whatever legacy setters
+ * the driver provides (sg, tx csum, tso, rx csum, flags -- in that
+ * order).  Returns non-zero if at least one legacy setter was used and
+ * 0 otherwise, including for devices without ethtool_ops.
+ */
+static int ethtool_set_features_compat(struct net_device *dev,
+	struct ethtool_set_features_block *features)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int handled = 0;
+
+	if (!ops)
+		return 0;
+
+	handled |= ethtool_set_feature_compat(dev, ops->set_sg,
+		features, NETIF_F_SG);
+	handled |= ethtool_set_feature_compat(dev, ops->set_tx_csum,
+		features, NETIF_F_ALL_CSUM);
+	handled |= ethtool_set_feature_compat(dev, ops->set_tso,
+		features, NETIF_F_ALL_TSO);
+	handled |= ethtool_set_feature_compat(dev, ops->set_rx_csum,
+		features, NETIF_F_RXCSUM);
+	handled |= ethtool_set_flags_compat(dev, ops->set_flags,
+		features, flags_dup_features);
+
+	return handled;
+}
+
+/*
+ * ETHTOOL_GFEATURES handler.
+ *
+ * Builds the feature block from the device's hw_features,
+ * wanted_features and features, merges in legacy-op state, then writes
+ * the header followed by as many blocks as user space asked for in its
+ * "size" field, capped at ETHTOOL_DEV_FEATURE_WORDS.
+ *
+ * Returns 0, or -EFAULT if user memory cannot be accessed.
+ */
+static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_gfeatures hdr = {
+		.cmd = ETHTOOL_GFEATURES,
+		.size = ETHTOOL_DEV_FEATURE_WORDS,
+	};
+	struct ethtool_get_features_block words[ETHTOOL_DEV_FEATURE_WORDS] = {
+		{
+			.available = dev->hw_features,
+			.requested = dev->wanted_features,
+			.active = dev->features,
+			.never_changed = NETIF_F_NEVER_CHANGE,
+		},
+	};
+	u32 __user *user_size =
+		useraddr + offsetof(struct ethtool_gfeatures, size);
+	u32 nwords;
+
+	ethtool_get_features_compat(dev, words);
+
+	if (get_user(nwords, user_size))
+		return -EFAULT;
+
+	/* never hand out more blocks than we have */
+	if (nwords > ETHTOOL_DEV_FEATURE_WORDS)
+		nwords = ETHTOOL_DEV_FEATURE_WORDS;
+
+	if (copy_to_user(useraddr, &hdr, sizeof(hdr)))
+		return -EFAULT;
+	if (copy_to_user(useraddr + sizeof(hdr), words,
+			 nwords * sizeof(*words)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * ETHTOOL_SFEATURES handler.
+ *
+ * Reads the header and one feature block from user space, lets legacy
+ * setters consume the bits they cover, applies the remaining valid bits
+ * to dev->wanted_features and asks the core to recompute the features.
+ *
+ * Returns -EFAULT on a failed user copy, -EINVAL for a wrong block count
+ * or for valid bits outside NETIF_F_ETHTOOL_BITS; otherwise a bitmask of
+ * ETHTOOL_F_COMPAT (a legacy setter was used), ETHTOOL_F_UNSUPPORTED
+ * (some valid bits are not in hw_features and were dropped) and
+ * ETHTOOL_F_WISH (wanted and active state differ for a valid bit).
+ */
+static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_sfeatures cmd;
+	struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
+	int ret = 0;
+
+	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+		return -EFAULT;
+	useraddr += sizeof(cmd);
+
+	if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS)
+		return -EINVAL;
+
+	if (copy_from_user(features, useraddr, sizeof(features)))
+		return -EFAULT;
+
+	if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
+		return -EINVAL;
+
+	/* legacy setters clear the bits they handled from features[0].valid */
+	if (ethtool_set_features_compat(dev, features))
+		ret |= ETHTOOL_F_COMPAT;
+
+	if (features[0].valid & ~dev->hw_features) {
+		features[0].valid &= dev->hw_features;
+		ret |= ETHTOOL_F_UNSUPPORTED;
+	}
+
+	dev->wanted_features &= ~features[0].valid;
+	dev->wanted_features |= features[0].valid & features[0].requested;
+	__netdev_update_features(dev);
+
+	if ((dev->wanted_features ^ dev->features) & features[0].valid)
+		ret |= ETHTOOL_F_WISH;
+
+	return ret;
+}
+
+/*
+ * Names returned for the ETH_SS_FEATURES string set: 32 fixed-width
+ * (ETH_GSTRING_LEN) slots per feature word.  The comment beside each
+ * entry names the NETIF_F_* flag it describes; presumably entry i
+ * corresponds to feature bit i.  Empty strings are slots without a name.
+ */
+static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
+	/* NETIF_F_SG */              "tx-scatter-gather",
+	/* NETIF_F_IP_CSUM */         "tx-checksum-ipv4",
+	/* NETIF_F_NO_CSUM */         "tx-checksum-unneeded",
+	/* NETIF_F_HW_CSUM */         "tx-checksum-ip-generic",
+	/* NETIF_F_IPV6_CSUM */       "tx-checksum-ipv6",
+	/* NETIF_F_HIGHDMA */         "highdma",
+	/* NETIF_F_FRAGLIST */        "tx-scatter-gather-fraglist",
+	/* NETIF_F_HW_VLAN_TX */      "tx-vlan-hw-insert",
+
+	/* NETIF_F_HW_VLAN_RX */      "rx-vlan-hw-parse",
+	/* NETIF_F_HW_VLAN_FILTER */  "rx-vlan-filter",
+	/* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
+	/* NETIF_F_GSO */             "tx-generic-segmentation",
+	/* NETIF_F_LLTX */            "tx-lockless",
+	/* NETIF_F_NETNS_LOCAL */     "netns-local",
+	/* NETIF_F_GRO */             "rx-gro",
+	/* NETIF_F_LRO */             "rx-lro",
+
+	/* NETIF_F_TSO */             "tx-tcp-segmentation",
+	/* NETIF_F_UFO */             "tx-udp-fragmentation",
+	/* NETIF_F_GSO_ROBUST */      "tx-gso-robust",
+	/* NETIF_F_TSO_ECN */         "tx-tcp-ecn-segmentation",
+	/* NETIF_F_TSO6 */            "tx-tcp6-segmentation",
+	/* NETIF_F_FSO */             "tx-fcoe-segmentation",
+	"",
+	"",
+
+	/* NETIF_F_FCOE_CRC */        "tx-checksum-fcoe-crc",
+	/* NETIF_F_SCTP_CSUM */       "tx-checksum-sctp",
+	/* NETIF_F_FCOE_MTU */        "fcoe-mtu",
+	/* NETIF_F_NTUPLE */          "rx-ntuple-filter",
+	/* NETIF_F_RXHASH */          "rx-hashing",
+	/* NETIF_F_RXCSUM */          "rx-checksum",
+	/* NETIF_F_NOCACHE_COPY */    "tx-nocache-copy",
+	/* NETIF_F_LOOPBACK */        "loopback",
+};
+
+/*
+ * Number of strings in string set @sset.  ETH_SS_FEATURES is answered
+ * from netdev_features_strings; any other set is forwarded to the
+ * driver, which must implement both get_sset_count and get_strings,
+ * otherwise -EOPNOTSUPP is returned.
+ */
+static int __ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (sset == ETH_SS_FEATURES)
+		return ARRAY_SIZE(netdev_features_strings);
+
+	if (!ops || !ops->get_sset_count || !ops->get_strings)
+		return -EOPNOTSUPP;
+
+	return ops->get_sset_count(dev, sset);
+}
+
+/*
+ * Fill @data with the strings of @stringset: the built-in feature names
+ * for ETH_SS_FEATURES, the driver's get_strings() output otherwise.
+ */
+static void __ethtool_get_strings(struct net_device *dev,
+	u32 stringset, u8 *data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (stringset != ETH_SS_FEATURES) {
+		/* ops->get_strings is valid because checked earlier */
+		ops->get_strings(dev, stringset, data);
+		return;
+	}
+
+	memcpy(data, netdev_features_strings,
+		sizeof(netdev_features_strings));
+}
+
+/*
+ * Map a legacy per-feature ethtool command (the get or the set variant)
+ * to the NETIF_F_* bits it controls.  Any other command is a caller bug
+ * and hits BUG().
+ */
+static u32 ethtool_get_feature_mask(u32 eth_cmd)
+{
+	/* feature masks of legacy discrete ethtool ops */
+
+	switch (eth_cmd) {
+	case ETHTOOL_GTXCSUM:
+	case ETHTOOL_STXCSUM:
+		return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+	case ETHTOOL_GRXCSUM:
+	case ETHTOOL_SRXCSUM:
+		return NETIF_F_RXCSUM;
+	case ETHTOOL_GSG:
+	case ETHTOOL_SSG:
+		return NETIF_F_SG;
+	case ETHTOOL_GTSO:
+	case ETHTOOL_STSO:
+		return NETIF_F_ALL_TSO;
+	case ETHTOOL_GUFO:
+	case ETHTOOL_SUFO:
+		return NETIF_F_UFO;
+	case ETHTOOL_GGSO:
+	case ETHTOOL_SGSO:
+		return NETIF_F_GSO;
+	case ETHTOOL_GGRO:
+	case ETHTOOL_SGRO:
+		return NETIF_F_GRO;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Look up the driver's legacy discrete "get" op for one of the
+ * ETHTOOL_G* offload queries.
+ *
+ * @dev:    device whose ethtool_ops are consulted
+ * @ethcmd: the ETHTOOL_G* command being served
+ *
+ * Returns the matching get_*() callback, or NULL when the device has no
+ * ethtool_ops, the driver does not implement the op, or the command has
+ * no legacy getter.  The result is returned as void *; the caller
+ * assigns it to a u32 (*)(struct net_device *).
+ */
+static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops)
+		return NULL;
+
+	/*
+	 * This helper serves queries, so every case is keyed on the G(et)
+	 * command.  Matching the S(et) commands for sg/tso/ufo meant the
+	 * legacy getters of those three were never found.
+	 */
+	switch (ethcmd) {
+	case ETHTOOL_GTXCSUM:
+		return ops->get_tx_csum;
+	case ETHTOOL_GRXCSUM:
+		return ops->get_rx_csum;
+	case ETHTOOL_GSG:
+		return ops->get_sg;
+	case ETHTOOL_GTSO:
+		return ops->get_tso;
+	case ETHTOOL_GUFO:
+		return ops->get_ufo;
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * Stand-in getter used for ETHTOOL_GRXCSUM when the driver has no
+ * get_rx_csum op: reports 1 if any NETIF_F_ALL_CSUM bit is set in
+ * dev->features, else 0 (kept for bug compatibility).
+ */
+static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
+{
+	return (dev->features & NETIF_F_ALL_CSUM) != 0;
+}
+
+/*
+ * Handler for the legacy single-feature "get" commands.
+ *
+ * The answer defaults to whether any bit of the command's feature mask
+ * is set in dev->features.  If none of those bits is in hw_features the
+ * driver's discrete getter (when there is one) supplies the value
+ * instead.  Returns 0, or -EFAULT if the result cannot be copied out.
+ */
+static int ethtool_get_one_feature(struct net_device *dev,
+	char __user *useraddr, u32 ethcmd)
+{
+	u32 mask = ethtool_get_feature_mask(ethcmd);
+	struct ethtool_value edata = {
+		.cmd = ethcmd,
+		.data = !!(dev->features & mask),
+	};
+
+	/* compatibility with discrete get_ ops */
+	if (!(dev->hw_features & mask)) {
+		u32 (*legacy_get)(struct net_device *);
+
+		legacy_get = __ethtool_get_one_feature_actor(dev, ethcmd);
+
+		/* bug compatibility with old get_rx_csum */
+		if (!legacy_get && ethcmd == ETHTOOL_GRXCSUM)
+			legacy_get = __ethtool_get_rx_csum_oldbug;
+
+		if (legacy_get)
+			edata.data = legacy_get(dev);
+	}
+
+	return copy_to_user(useraddr, &edata, sizeof(edata)) ? -EFAULT : 0;
+}
+
+/*
+ * Forward declarations of the legacy setters that
+ * ethtool_set_one_feature() falls back to.
+ */
+static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_sg(struct net_device *dev, u32 data);
+static int __ethtool_set_tso(struct net_device *dev, u32 data);
+static int __ethtool_set_ufo(struct net_device *dev, u32 data);
+
+/*
+ * Handler for the legacy single-feature "set" commands.
+ *
+ * When any bit of the command's feature mask is in dev->hw_features,
+ * those bits are set or cleared in wanted_features according to the
+ * user's value and the features are recomputed.  Otherwise the request
+ * is passed to the matching legacy setter.
+ *
+ * Returns 0 or the legacy setter's result, -EFAULT on a failed user
+ * copy, -EOPNOTSUPP for a command that has no legacy setter.
+ */
+static int ethtool_set_one_feature(struct net_device *dev,
+	void __user *useraddr, u32 ethcmd)
+{
+	struct ethtool_value edata;
+	u32 mask;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)))
+		return -EFAULT;
+
+	mask = ethtool_get_feature_mask(ethcmd);
+	/* keep only the bits this device declares in hw_features */
+	mask &= dev->hw_features;
+	if (mask) {
+		if (edata.data)
+			dev->wanted_features |= mask;
+		else
+			dev->wanted_features &= ~mask;
+
+		__netdev_update_features(dev);
+		return 0;
+	}
+
+	/* Driver is not converted to ndo_fix_features or does not
+	 * support changing this offload. In the latter case it won't
+	 * have corresponding ethtool_ops field set.
+	 *
+	 * Following part is to be removed after all drivers advertise
+	 * their changeable features in netdev->hw_features and stop
+	 * using discrete offload setting ops.
+	 */
+
+	switch (ethcmd) {
+	case ETHTOOL_STXCSUM:
+		return __ethtool_set_tx_csum(dev, edata.data);
+	case ETHTOOL_SRXCSUM:
+		return __ethtool_set_rx_csum(dev, edata.data);
+	case ETHTOOL_SSG:
+		return __ethtool_set_sg(dev, edata.data);
+	case ETHTOOL_STSO:
+		return __ethtool_set_tso(dev, edata.data);
+	case ETHTOOL_SUFO:
+		return __ethtool_set_ufo(dev, edata.data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/*
+ * Change the flags_dup_features bits of a device.
+ *
+ * @dev:  device to modify
+ * @data: desired flag state; bits outside flags_dup_features are
+ *        rejected
+ *
+ * Drivers with a legacy set_flags() op get the request passed straight
+ * through.  Otherwise only bits present in dev->hw_features may change;
+ * the change is recorded in wanted_features and the features are
+ * recomputed.
+ *
+ * Returns 0 or the driver's set_flags() result; -EINVAL for unknown bits
+ * or when only part of the requested change is possible; -EOPNOTSUPP
+ * when none of the requested changes is possible.
+ */
+int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	u32 changed;
+
+	if (data & ~flags_dup_features)
+		return -EINVAL;
+
+	/*
+	 * legacy set_flags() op
+	 *
+	 * This function is not static, so it can be reached from outside
+	 * this file; tolerate a device without ethtool_ops like the other
+	 * helpers here do instead of dereferencing a NULL pointer.
+	 */
+	if (ops && ops->set_flags) {
+		if (unlikely(dev->hw_features & flags_dup_features))
+			netdev_warn(dev,
+				"driver BUG: mixed hw_features and set_flags()\n");
+		return ops->set_flags(dev, data);
+	}
+
+	/* allow changing only bits set in hw_features */
+	changed = (data ^ dev->features) & flags_dup_features;
+	if (changed & ~dev->hw_features)
+		return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
+
+	dev->wanted_features =
+		(dev->wanted_features & ~changed) | (data & dev->hw_features);
+
+	__netdev_update_features(dev);
+
+	return 0;
+}
+
+/*
+ * ETHTOOL_GSET handler: asks the driver for its settings and copies the
+ * result to user space.  Returns 0, -EOPNOTSUPP if the driver has no
+ * get_settings op, the driver's negative error, or -EFAULT.
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
+	int err;
+
+	if (!dev->ethtool_ops->get_settings)
+		return -EOPNOTSUPP;
+
+	err = dev->ethtool_ops->get_settings(dev, &cmd);
+	if (err < 0)
+		return err;
+
+	return copy_to_user(useraddr, &cmd, sizeof(cmd)) ? -EFAULT : 0;
+}
+
+/*
+ * Settings "set" handler: copies a struct ethtool_cmd from user space
+ * and hands it to the driver.  Returns the driver's result,
+ * -EOPNOTSUPP if there is no set_settings op, or -EFAULT.
+ */
+static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_cmd cmd;
+
+	if (!ops->set_settings)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+		return -EFAULT;
+
+	return ops->set_settings(dev, &cmd);
+}
+
+/*
+ * ETHTOOL_GDRVINFO handler.
+ *
+ * The info block is zeroed and then filled by the driver's get_drvinfo
+ * op or, failing that, from the parent device's name and driver name.
+ * String-set counts and register/EEPROM dump lengths are added when the
+ * driver provides the corresponding ops.
+ *
+ * Returns 0, -EOPNOTSUPP when neither source of driver information is
+ * available, or -EFAULT if the result cannot be copied out.
+ */
+static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
+						  void __user *useraddr)
+{
+	struct ethtool_drvinfo info;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	memset(&info, 0, sizeof(info));
+	info.cmd = ETHTOOL_GDRVINFO;
+	if (ops && ops->get_drvinfo) {
+		ops->get_drvinfo(dev, &info);
+	} else if (dev->dev.parent && dev->dev.parent->driver) {
+		strlcpy(info.bus_info, dev_name(dev->dev.parent),
+			sizeof(info.bus_info));
+		strlcpy(info.driver, dev->dev.parent->driver->name,
+			sizeof(info.driver));
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * this method of obtaining string set info is deprecated;
+	 * Use ETHTOOL_GSSET_INFO instead.
+	 */
+	if (ops && ops->get_sset_count) {
+		int rc;
+
+		/* a negative count leaves the zeroed field untouched */
+		rc = ops->get_sset_count(dev, ETH_SS_TEST);
+		if (rc >= 0)
+			info.testinfo_len = rc;
+		rc = ops->get_sset_count(dev, ETH_SS_STATS);
+		if (rc >= 0)
+			info.n_stats = rc;
+		rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+		if (rc >= 0)
+			info.n_priv_flags = rc;
+	}
+	if (ops && ops->get_regs_len)
+		info.regdump_len = ops->get_regs_len(dev);
+	if (ops && ops->get_eeprom_len)
+		info.eedump_len = ops->get_eeprom_len(dev);
+
+	if (copy_to_user(useraddr, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * ETHTOOL_GSSET_INFO handler.
+ *
+ * For every string set selected in the user's sset_mask the string
+ * count is looked up.  Sets whose lookup succeeds are reported back in
+ * sset_mask, and their counts are written, in ascending set order, to
+ * the data area that follows the header.
+ *
+ * Returns 0 (also when the requested mask is empty, in which case
+ * nothing is written), -ENOMEM, or -EFAULT.
+ */
+static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
+						    void __user *useraddr)
+{
+	struct ethtool_sset_info info;
+	u64 sset_mask;
+	int i, idx = 0, n_bits = 0, ret, rc;
+	u32 *info_buf = NULL;
+
+	if (copy_from_user(&info, useraddr, sizeof(info)))
+		return -EFAULT;
+
+	/* store copy of mask, because we zero struct later on */
+	sset_mask = info.sset_mask;
+	if (!sset_mask)
+		return 0;
+
+	/* calculate size of return buffer */
+	n_bits = hweight64(sset_mask);
+
+	memset(&info, 0, sizeof(info));
+	info.cmd = ETHTOOL_GSSET_INFO;
+
+	info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER);
+	if (!info_buf)
+		return -ENOMEM;
+
+	/*
+	 * fill return buffer based on input bitmask and successful
+	 * get_sset_count return
+	 */
+	for (i = 0; i < 64; i++) {
+		if (!(sset_mask & (1ULL << i)))
+			continue;
+
+		rc = __ethtool_get_sset_count(dev, i);
+		if (rc >= 0) {
+			info.sset_mask |= (1ULL << i);
+			info_buf[idx++] = rc;
+		}
+	}
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &info, sizeof(info)))
+		goto out;
+
+	/* only the idx counts that were actually gathered are copied out */
+	useraddr += offsetof(struct ethtool_sset_info, data);
+	if (copy_to_user(useraddr, info_buf, idx * sizeof(u32)))
+		goto out;
+
+	ret = 0;
+
+out:
+	kfree(info_buf);
+	return ret;
+}
+
+/*
+ * Handler for the rxnfc "set" commands: copies the request from user
+ * space and passes it to the driver's set_rxnfc op.  Returns the
+ * driver's result, -EOPNOTSUPP if the op is missing, or -EFAULT.
+ */
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+						u32 cmd, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxnfc info;
+	size_t info_size;
+
+	if (!ops->set_rxnfc)
+		return -EOPNOTSUPP;
+
+	/* struct ethtool_rxnfc was originally defined for
+	 * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+	 * members.  User-space might still be using that
+	 * definition. */
+	info_size = (cmd == ETHTOOL_SRXFH) ?
+		(offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)) :
+		sizeof(info);
+
+	if (copy_from_user(&info, useraddr, info_size))
+		return -EFAULT;
+
+	return ops->set_rxnfc(dev, &info);
+}
+
+/*
+ * Handler for the rxnfc "get" commands.
+ *
+ * Copies the request in, allocates a zeroed rule-location buffer for
+ * ETHTOOL_GRXCLSRLALL requests with a non-zero rule_cnt, calls the
+ * driver's get_rxnfc op and copies the updated request (plus the rule
+ * locations, if any) back to user space.
+ *
+ * Returns 0, -EOPNOTSUPP if the op is missing, -ENOMEM (also when
+ * rule_cnt is too large to allocate), the driver's negative error, or
+ * -EFAULT.
+ */
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+						u32 cmd, void __user *useraddr)
+{
+	struct ethtool_rxnfc info;
+	size_t info_size = sizeof(info);
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int ret;
+	void *rule_buf = NULL;
+
+	if (!ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	/* struct ethtool_rxnfc was originally defined for
+	 * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
+	 * members.  User-space might still be using that
+	 * definition. */
+	if (cmd == ETHTOOL_GRXFH)
+		info_size = (offsetof(struct ethtool_rxnfc, data) +
+			     sizeof(info.data));
+
+	if (copy_from_user(&info, useraddr, info_size))
+		return -EFAULT;
+
+	if (info.cmd == ETHTOOL_GRXCLSRLALL) {
+		if (info.rule_cnt > 0) {
+			/* an oversized count skips the allocation and fails below */
+			if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
+				rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
+						   GFP_USER);
+			if (!rule_buf)
+				return -ENOMEM;
+		}
+	}
+
+	ret = ops->get_rxnfc(dev, &info, rule_buf);
+	if (ret < 0)
+		goto err_out;
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &info, info_size))
+		goto err_out;
+
+	if (rule_buf) {
+		/* NOTE(review): uses info.rule_cnt as left by the driver;
+		 * assumes the driver never raises it above the allocated
+		 * count -- confirm against the drivers.
+		 */
+		useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+		if (copy_to_user(useraddr, rule_buf,
+				 info.rule_cnt * sizeof(u32)))
+			goto err_out;
+	}
+	ret = 0;
+
+err_out:
+	kfree(rule_buf);
+
+	return ret;
+}
+
+/*
+ * ETHTOOL_GRXFHINDIR handler.
+ *
+ * Reads the table size the user has room for, allocates a zeroed
+ * request of that size, lets the driver fill it in and copies the whole
+ * structure back to user space.
+ *
+ * Returns 0, -EOPNOTSUPP if the driver has no get_rxfh_indir op,
+ * -ENOMEM (also for a table size too large to allocate), the driver's
+ * error, or -EFAULT.
+ */
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+						     void __user *useraddr)
+{
+	struct ethtool_rxfh_indir *indir;
+	u32 table_size;
+	size_t full_size;
+	int ret;
+
+	if (!dev->ethtool_ops->get_rxfh_indir)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&table_size,
+			   useraddr + offsetof(struct ethtool_rxfh_indir, size),
+			   sizeof(table_size)))
+		return -EFAULT;
+
+	/* bound the size so that full_size below cannot overflow */
+	if (table_size >
+	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
+		return -ENOMEM;
+	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
+	indir = kzalloc(full_size, GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	indir->cmd = ETHTOOL_GRXFHINDIR;
+	indir->size = table_size;
+	ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
+	if (ret)
+		goto out;
+
+	if (copy_to_user(useraddr, indir, full_size))
+		ret = -EFAULT;
+
+out:
+	kfree(indir);
+	return ret;
+}
+
+/*
+ * ETHTOOL_SRXFHINDIR: program the driver's RX flow hash indirection
+ * table from user space.
+ *
+ * The number of entries is read from the size member of the user's
+ * struct ethtool_rxfh_indir, a kernel buffer of matching size is
+ * allocated and the whole structure is copied in and handed to
+ * ->set_rxfh_indir().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support,
+ * -EFAULT on a failed user copy, or -ENOMEM if the table is too large or
+ * cannot be allocated.
+ */
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+						     void __user *useraddr)
+{
+	struct ethtool_rxfh_indir *indir;
+	u32 table_size;
+	size_t full_size;
+	int ret;
+
+	if (!dev->ethtool_ops->set_rxfh_indir)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&table_size,
+			   useraddr + offsetof(struct ethtool_rxfh_indir, size),
+			   sizeof(table_size)))
+		return -EFAULT;
+
+	/* Reject sizes whose total would exceed the kmalloc limit. */
+	if (table_size >
+	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
+		return -ENOMEM;
+	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
+	indir = kmalloc(full_size, GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	if (copy_from_user(indir, useraddr, full_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* The size member has now been fetched from user memory a second
+	 * time and may have been changed in between.  Restore the value
+	 * the buffer was sized for so the driver can never be told the
+	 * table is larger than the allocation. */
+	indir->size = table_size;
+
+	ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
+
+out:
+	kfree(indir);
+	return ret;
+}
+
+/*
+ * Store a copy of @spec in the pre-allocated container @fsc and append it
+ * to @list.
+ *
+ * Once the list holds ETHTOOL_MAX_NTUPLE_LIST_ENTRY entries the filter is
+ * not recorded and @fsc is freed instead.
+ */
+static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
+			struct ethtool_rx_ntuple_flow_spec *spec,
+			struct ethtool_rx_ntuple_flow_spec_container *fsc)
+{
+	struct ethtool_rx_ntuple_flow_spec *dst;
+
+	/* The cache is bounded: drop the container when it is full. */
+	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) {
+		kfree(fsc);
+		return;
+	}
+
+	/* Duplicate every field of the filter into the container. */
+	dst = &fsc->fs;
+	dst->flow_type = spec->flow_type;
+	memcpy(&dst->h_u, &spec->h_u, sizeof(spec->h_u));
+	memcpy(&dst->m_u, &spec->m_u, sizeof(spec->m_u));
+	dst->vlan_tag = spec->vlan_tag;
+	dst->vlan_tag_mask = spec->vlan_tag_mask;
+	dst->data = spec->data;
+	dst->data_mask = spec->data_mask;
+	dst->action = spec->action;
+
+	/* Link it at the tail and account for it. */
+	list_add_tail_rcu(&fsc->list, &list->list);
+	list->count++;
+}
+
+/*
+ * ethtool does not (or did not) set masks for flow parameters that were
+ * left unspecified.  A value/mask pair that is entirely zero is therefore
+ * treated as a mask with all bits set.  Doing the fix-up here keeps it
+ * out of the drivers.
+ *
+ * Only TCP, UDP and SCTP over IPv4 flow specs are adjusted; every other
+ * flow type is left untouched.
+ */
+static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
+{
+	struct ethtool_tcpip4_spec *val;
+	struct ethtool_tcpip4_spec *msk;
+
+	switch (fs->flow_type) {
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+	case SCTP_V4_FLOW:
+		break;
+	default:
+		return;
+	}
+
+	val = &fs->h_u.tcp_ip4_spec;
+	msk = &fs->m_u.tcp_ip4_spec;
+
+	if (!val->ip4src && !msk->ip4src)
+		msk->ip4src = htonl(0xffffffff);
+	if (!val->ip4dst && !msk->ip4dst)
+		msk->ip4dst = htonl(0xffffffff);
+	if (!val->psrc && !msk->psrc)
+		msk->psrc = htons(0xffff);
+	if (!val->pdst && !msk->pdst)
+		msk->pdst = htons(0xffff);
+	if (!val->tos && !msk->tos)
+		msk->tos = 0xff;
+	if (!fs->vlan_tag && !fs->vlan_tag_mask)
+		fs->vlan_tag_mask = 0xffff;
+	if (!fs->data && !fs->data_mask)
+		fs->data_mask = 0xffffffffffffffffULL;
+}
+
+/*
+ * ETHTOOL_SRXNTUPLE: install an n-tuple RX filter through the driver.
+ *
+ * Requires ->set_rx_ntuple() and NETIF_F_NTUPLE set in dev->features.
+ * If the driver has no ->get_rx_ntuple() of its own, a successfully
+ * installed filter is also cached in dev->ethtool_ntuple_list.
+ *
+ * Returns 0 on success, -EOPNOTSUPP, -EINVAL, -EFAULT, -ENOMEM, or the
+ * driver's non-zero return value.
+ */
+static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
+						    void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rx_ntuple_flow_spec_container *cache = NULL;
+	struct ethtool_rx_ntuple ntuple;
+	bool cache_filter;
+	int err;
+
+	if (!ops->set_rx_ntuple)
+		return -EOPNOTSUPP;
+
+	if (!(dev->features & NETIF_F_NTUPLE))
+		return -EINVAL;
+
+	if (copy_from_user(&ntuple, useraddr, sizeof(ntuple)))
+		return -EFAULT;
+
+	rx_ntuple_fix_masks(&ntuple.fs);
+
+	/*
+	 * The filter is only cached when the driver cannot report its
+	 * filters itself.  The container is allocated up front so that a
+	 * filter already accepted by the driver never fails to be cached
+	 * for lack of memory.
+	 */
+	cache_filter = !ops->get_rx_ntuple;
+	if (cache_filter) {
+		cache = kmalloc(sizeof(*cache), GFP_ATOMIC);
+		if (!cache)
+			return -ENOMEM;
+	}
+
+	err = ops->set_rx_ntuple(dev, &ntuple);
+	if (err) {
+		kfree(cache);
+		return err;
+	}
+
+	if (cache_filter)
+		__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &ntuple.fs,
+				       cache);
+
+	return 0;
+}
+
+/*
+ * Write one formatted string into the next ETH_GSTRING_LEN-byte slot of
+ * the string table built by ethtool_get_rx_ntuple().
+ *
+ * @p:           next free slot
+ * @end:         one past the last byte of the table
+ * @num_strings: running count of slots written; incremented on success
+ * @fmt:         printf-style format, followed by its arguments
+ *
+ * The output is truncated to fit the slot.  If no complete slot is left
+ * before @end nothing is written.  Returns the position of the next slot.
+ */
+static char *rx_ntuple_put_string(char *p, const char *end, int *num_strings,
+				  const char *fmt, ...)
+{
+	va_list args;
+
+	if (end - p < ETH_GSTRING_LEN)
+		return p;
+
+	va_start(args, fmt);
+	vsnprintf(p, ETH_GSTRING_LEN, fmt, args);
+	va_end(args);
+
+	(*num_strings)++;
+	return p + ETH_GSTRING_LEN;
+}
+
+/*
+ * ETHTOOL_GRXNTUPLE: report the installed n-tuple filters as a table of
+ * ETH_GSTRING_LEN-byte strings placed after a struct ethtool_gstrings.
+ *
+ * The table is sized from ->get_sset_count() for the string set named by
+ * user space.  If the driver provides ->get_rx_ntuple() it fills the
+ * table itself; otherwise the filters cached in dev->ethtool_ntuple_list
+ * are formatted here.  Formatting never writes past the allocated table
+ * and every string is truncated to its slot, so a string set that is too
+ * small yields a shortened listing instead of a buffer overflow.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without ->get_sset_count(), -EFAULT
+ * on a failed user copy, -ENOMEM, or a negative value from the driver.
+ */
+static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_gstrings gstrings;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rx_ntuple_flow_spec_container *fsc;
+	const char *flow_name;
+	const char *end;
+	u8 *data;
+	char *p;
+	int ret, i, num_strings = 0;
+
+	if (!ops->get_sset_count)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+		return -EFAULT;
+
+	ret = ops->get_sset_count(dev, gstrings.string_set);
+	if (ret < 0)
+		return ret;
+
+	gstrings.len = ret;
+
+	data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	if (ops->get_rx_ntuple) {
+		/* driver-specific filter grab */
+		ret = ops->get_rx_ntuple(dev, gstrings.string_set, data);
+		if (ret < 0)
+			goto out;
+		/* NOTE(review): num_strings is still 0 on this path, so no
+		 * strings are copied out, exactly as before.  How a driver
+		 * is meant to report its string count is not visible
+		 * here - confirm before changing. */
+		goto copy;
+	}
+
+	/* default ethtool filter grab */
+	i = 0;
+	p = (char *)data;
+	end = p + gstrings.len * ETH_GSTRING_LEN;
+	list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) {
+		p = rx_ntuple_put_string(p, end, &num_strings,
+					 "Filter %d:\n", i);
+
+		switch (fsc->fs.flow_type) {
+		case TCP_V4_FLOW:
+			flow_name = "TCP";
+			break;
+		case UDP_V4_FLOW:
+			flow_name = "UDP";
+			break;
+		case SCTP_V4_FLOW:
+			flow_name = "SCTP";
+			break;
+		case AH_ESP_V4_FLOW:
+			flow_name = "AH ESP";
+			break;
+		case ESP_V4_FLOW:
+			flow_name = "ESP";
+			break;
+		case IP_USER_FLOW:
+			flow_name = "Raw IP";
+			break;
+		case IPV4_FLOW:
+			flow_name = "IPv4";
+			break;
+		default:
+			flow_name = "Unknown";
+			break;
+		}
+		p = rx_ntuple_put_string(p, end, &num_strings,
+					 "\tFlow Type: %s\n", flow_name);
+
+		/* now the rest of the filters */
+		switch (fsc->fs.flow_type) {
+		case TCP_V4_FLOW:
+		case UDP_V4_FLOW:
+		case SCTP_V4_FLOW:
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP addr: 0x%x\n",
+				fsc->fs.h_u.tcp_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP mask: 0x%x\n",
+				fsc->fs.m_u.tcp_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP addr: 0x%x\n",
+				fsc->fs.h_u.tcp_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP mask: 0x%x\n",
+				fsc->fs.m_u.tcp_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc Port: %d, mask: 0x%x\n",
+				fsc->fs.h_u.tcp_ip4_spec.psrc,
+				fsc->fs.m_u.tcp_ip4_spec.psrc);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest Port: %d, mask: 0x%x\n",
+				fsc->fs.h_u.tcp_ip4_spec.pdst,
+				fsc->fs.m_u.tcp_ip4_spec.pdst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tTOS: %d, mask: 0x%x\n",
+				fsc->fs.h_u.tcp_ip4_spec.tos,
+				fsc->fs.m_u.tcp_ip4_spec.tos);
+			break;
+		case AH_ESP_V4_FLOW:
+		case ESP_V4_FLOW:
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP addr: 0x%x\n",
+				fsc->fs.h_u.ah_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP mask: 0x%x\n",
+				fsc->fs.m_u.ah_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP addr: 0x%x\n",
+				fsc->fs.h_u.ah_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP mask: 0x%x\n",
+				fsc->fs.m_u.ah_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSPI: %d, mask: 0x%x\n",
+				fsc->fs.h_u.ah_ip4_spec.spi,
+				fsc->fs.m_u.ah_ip4_spec.spi);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tTOS: %d, mask: 0x%x\n",
+				fsc->fs.h_u.ah_ip4_spec.tos,
+				fsc->fs.m_u.ah_ip4_spec.tos);
+			break;
+		case IP_USER_FLOW:
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP addr: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP mask: 0x%x\n",
+				fsc->fs.m_u.usr_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP addr: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP mask: 0x%x\n",
+				fsc->fs.m_u.usr_ip4_spec.ip4dst);
+			break;
+		case IPV4_FLOW:
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP addr: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tSrc IP mask: 0x%x\n",
+				fsc->fs.m_u.usr_ip4_spec.ip4src);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP addr: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tDest IP mask: 0x%x\n",
+				fsc->fs.m_u.usr_ip4_spec.ip4dst);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tL4 bytes: 0x%x, mask: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.l4_4_bytes,
+				fsc->fs.m_u.usr_ip4_spec.l4_4_bytes);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tTOS: %d, mask: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.tos,
+				fsc->fs.m_u.usr_ip4_spec.tos);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tIP Version: %d, mask: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.ip_ver,
+				fsc->fs.m_u.usr_ip4_spec.ip_ver);
+			p = rx_ntuple_put_string(p, end, &num_strings,
+				"\tProtocol: %d, mask: 0x%x\n",
+				fsc->fs.h_u.usr_ip4_spec.proto,
+				fsc->fs.m_u.usr_ip4_spec.proto);
+			break;
+		default:
+			/* Unknown flow type: only the two header strings
+			 * are reported for this filter. */
+			goto unknown_filter;
+		}
+		p = rx_ntuple_put_string(p, end, &num_strings,
+					 "\tVLAN: %d, mask: 0x%x\n",
+					 fsc->fs.vlan_tag,
+					 fsc->fs.vlan_tag_mask);
+		p = rx_ntuple_put_string(p, end, &num_strings,
+					 "\tUser-defined: 0x%Lx\n",
+					 fsc->fs.data);
+		p = rx_ntuple_put_string(p, end, &num_strings,
+					 "\tUser-defined mask: 0x%Lx\n",
+					 fsc->fs.data_mask);
+		if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP)
+			p = rx_ntuple_put_string(p, end, &num_strings,
+						 "\tAction: Drop\n");
+		else
+			p = rx_ntuple_put_string(p, end, &num_strings,
+						 "\tAction: Direct to queue %d\n",
+						 fsc->fs.action);
+unknown_filter:
+		i++;
+	}
+copy:
+	/* indicate to userspace how many strings we actually have */
+	gstrings.len = num_strings;
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+		goto out;
+	useraddr += sizeof(gstrings);
+	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+		goto out;
+	ret = 0;
+
+out:
+	kfree(data);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GREGS: dump the device registers to user space.
+ *
+ * The driver fills a zeroed buffer of ->get_regs_len() bytes.  The
+ * number of bytes copied out is the smaller of that length and the len
+ * requested in the user's struct ethtool_regs.  That number is fixed
+ * before the driver runs, so a driver that rewrites regs.len cannot make
+ * the copy overrun either the user's buffer or the kernel's.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on a
+ * failed user copy, or -ENOMEM.
+ */
+static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_regs regs;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void *regbuf;
+	int reglen, ret;
+	u32 copy_len;
+
+	if (!ops->get_regs || !ops->get_regs_len)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&regs, useraddr, sizeof(regs)))
+		return -EFAULT;
+
+	reglen = ops->get_regs_len(dev);
+	if (regs.len > reglen)
+		regs.len = reglen;
+
+	/* Remember the validated length; ->get_regs() is free to modify
+	 * regs.len. */
+	copy_len = regs.len;
+
+	regbuf = vzalloc(reglen);
+	if (reglen && !regbuf)
+		return -ENOMEM;
+
+	ops->get_regs(dev, &regs, regbuf);
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &regs, sizeof(regs)))
+		goto out;
+	useraddr += offsetof(struct ethtool_regs, data);
+	if (regbuf && copy_to_user(useraddr, regbuf, copy_len))
+		goto out;
+	ret = 0;
+
+ out:
+	vfree(regbuf);
+	return ret;
+}
+
+/*
+ * ETHTOOL_RESET: pass the flags in the user's struct ethtool_value to
+ * ->reset() and copy the structure back afterwards.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on a
+ * failed user copy, or the driver's non-zero return value.
+ */
+static int ethtool_reset(struct net_device *dev, char __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_value req;
+	int err;
+
+	if (!ops->reset)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&req, useraddr, sizeof(req)))
+		return -EFAULT;
+
+	err = ops->reset(dev, &req.data);
+	if (err)
+		return err;
+
+	return copy_to_user(useraddr, &req, sizeof(req)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_GWOL: let ->get_wol() fill a struct ethtool_wolinfo and copy
+ * it to user space.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, or -EFAULT.
+ */
+static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_wolinfo wolinfo = { .cmd = ETHTOOL_GWOL };
+
+	if (!ops->get_wol)
+		return -EOPNOTSUPP;
+
+	ops->get_wol(dev, &wolinfo);
+
+	return copy_to_user(useraddr, &wolinfo, sizeof(wolinfo)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SWOL: copy a struct ethtool_wolinfo from user space and hand
+ * it to ->set_wol().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support, or
+ * -EFAULT.
+ */
+static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_wolinfo wolinfo;
+
+	if (!ops->set_wol)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&wolinfo, useraddr, sizeof(wolinfo)))
+		return -EFAULT;
+
+	return ops->set_wol(dev, &wolinfo);
+}
+
+/*
+ * ETHTOOL_NWAY_RST: forward to ->nway_reset().
+ *
+ * Returns the driver's result, or -EOPNOTSUPP without driver support.
+ */
+static int ethtool_nway_reset(struct net_device *dev)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	return ops->nway_reset ? ops->nway_reset(dev) : -EOPNOTSUPP;
+}
+
+/*
+ * ETHTOOL_GLINK: report link state to user space.
+ *
+ * The reported value is 1 only if the interface is running and
+ * ->get_link() returns non-zero; the driver is not queried while the
+ * interface is not running.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, or -EFAULT.
+ */
+static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_value link = { .cmd = ETHTOOL_GLINK };
+
+	if (!ops->get_link)
+		return -EOPNOTSUPP;
+
+	if (netif_running(dev) && ops->get_link(dev))
+		link.data = 1;
+
+	return copy_to_user(useraddr, &link, sizeof(link)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_GEEPROM: read a range of the device EEPROM into user space.
+ *
+ * The user's struct ethtool_eeprom gives offset and len; the data is
+ * written directly behind that structure.  The range is read in chunks
+ * of at most PAGE_SIZE bytes through a single bounce buffer.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on
+ * a failed user copy, -EINVAL for an empty, wrapping or out-of-range
+ * request, -ENOMEM, or the driver's non-zero return value.
+ */
+static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_eeprom eeprom;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void __user *userbuf = useraddr + sizeof(eeprom);
+	u32 bytes_remaining;
+	u8 *data;
+	int ret = 0;
+
+	if (!ops->get_eeprom || !ops->get_eeprom_len)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+		return -EFAULT;
+
+	/* Check for wrap and zero */
+	if (eeprom.offset + eeprom.len <= eeprom.offset)
+		return -EINVAL;
+
+	/* Check for exceeding total eeprom len */
+	if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+		return -EINVAL;
+
+	data = kmalloc(PAGE_SIZE, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes_remaining = eeprom.len;
+	while (bytes_remaining > 0) {
+		/* eeprom.len and eeprom.offset describe the current chunk
+		 * while the loop is running. */
+		eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+		ret = ops->get_eeprom(dev, &eeprom, data);
+		if (ret)
+			break;
+		if (copy_to_user(userbuf, data, eeprom.len)) {
+			ret = -EFAULT;
+			break;
+		}
+		userbuf += eeprom.len;
+		eeprom.offset += eeprom.len;
+		bytes_remaining -= eeprom.len;
+	}
+
+	/* Report the number of bytes actually delivered and restore the
+	 * offset the transfer started from, even after a partial read. */
+	eeprom.len = userbuf - (useraddr + sizeof(eeprom));
+	eeprom.offset -= eeprom.len;
+	if (copy_to_user(useraddr, &eeprom, sizeof(eeprom)))
+		ret = -EFAULT;
+
+	kfree(data);
+	return ret;
+}
+
+/*
+ * ETHTOOL_SEEPROM: write a range of the device EEPROM from user space.
+ *
+ * The user's struct ethtool_eeprom gives offset and len; the data to
+ * write follows directly behind that structure.  The range is written
+ * in chunks of at most PAGE_SIZE bytes through a single bounce buffer.
+ * Nothing is copied back to user space.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on
+ * a failed user copy, -EINVAL for an empty, wrapping or out-of-range
+ * request, -ENOMEM, or the driver's non-zero return value.
+ */
+static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_eeprom eeprom;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void __user *userbuf = useraddr + sizeof(eeprom);
+	u32 bytes_remaining;
+	u8 *data;
+	int ret = 0;
+
+	if (!ops->set_eeprom || !ops->get_eeprom_len)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&eeprom, useraddr, sizeof(eeprom)))
+		return -EFAULT;
+
+	/* Check for wrap and zero */
+	if (eeprom.offset + eeprom.len <= eeprom.offset)
+		return -EINVAL;
+
+	/* Check for exceeding total eeprom len */
+	if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
+		return -EINVAL;
+
+	data = kmalloc(PAGE_SIZE, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	bytes_remaining = eeprom.len;
+	while (bytes_remaining > 0) {
+		/* eeprom.len and eeprom.offset describe the current chunk
+		 * while the loop is running. */
+		eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE);
+
+		if (copy_from_user(data, userbuf, eeprom.len)) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = ops->set_eeprom(dev, &eeprom, data);
+		if (ret)
+			break;
+		userbuf += eeprom.len;
+		eeprom.offset += eeprom.len;
+		bytes_remaining -= eeprom.len;
+	}
+
+	kfree(data);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GCOALESCE: let ->get_coalesce() fill a struct ethtool_coalesce
+ * and copy it to user space.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, or -EFAULT.
+ */
+static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
+						   void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_coalesce coal = { .cmd = ETHTOOL_GCOALESCE };
+
+	if (!ops->get_coalesce)
+		return -EOPNOTSUPP;
+
+	ops->get_coalesce(dev, &coal);
+
+	return copy_to_user(useraddr, &coal, sizeof(coal)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SCOALESCE: copy a struct ethtool_coalesce from user space and
+ * hand it to ->set_coalesce().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support, or
+ * -EFAULT.
+ */
+static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
+						   void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_coalesce coal;
+
+	if (!ops->set_coalesce)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&coal, useraddr, sizeof(coal)))
+		return -EFAULT;
+
+	return ops->set_coalesce(dev, &coal);
+}
+
+/*
+ * ETHTOOL_GRINGPARAM: let ->get_ringparam() fill a struct
+ * ethtool_ringparam and copy it to user space.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, or -EFAULT.
+ */
+static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_ringparam ring = { .cmd = ETHTOOL_GRINGPARAM };
+
+	if (!ops->get_ringparam)
+		return -EOPNOTSUPP;
+
+	ops->get_ringparam(dev, &ring);
+
+	return copy_to_user(useraddr, &ring, sizeof(ring)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SRINGPARAM: copy a struct ethtool_ringparam from user space
+ * and hand it to ->set_ringparam().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support, or
+ * -EFAULT.
+ */
+static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_ringparam ring;
+
+	if (!ops->set_ringparam)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&ring, useraddr, sizeof(ring)))
+		return -EFAULT;
+
+	return ops->set_ringparam(dev, &ring);
+}
+
+/*
+ * ETHTOOL_GCHANNELS: let ->get_channels() fill a struct ethtool_channels
+ * and copy it to user space.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, or -EFAULT.
+ */
+static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
+						   void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_channels chans = { .cmd = ETHTOOL_GCHANNELS };
+
+	if (!ops->get_channels)
+		return -EOPNOTSUPP;
+
+	ops->get_channels(dev, &chans);
+
+	return copy_to_user(useraddr, &chans, sizeof(chans)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SCHANNELS: copy a struct ethtool_channels from user space and
+ * hand it to ->set_channels().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support, or
+ * -EFAULT.
+ */
+static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
+						   void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_channels chans;
+
+	if (!ops->set_channels)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&chans, useraddr, sizeof(chans)))
+		return -EFAULT;
+
+	return ops->set_channels(dev, &chans);
+}
+
+/*
+ * ETHTOOL_GPAUSEPARAM: let ->get_pauseparam() fill a struct
+ * ethtool_pauseparam and copy it to user space.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, or -EFAULT.
+ */
+static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_pauseparam pause = { .cmd = ETHTOOL_GPAUSEPARAM };
+
+	if (!ops->get_pauseparam)
+		return -EOPNOTSUPP;
+
+	ops->get_pauseparam(dev, &pause);
+
+	return copy_to_user(useraddr, &pause, sizeof(pause)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_SPAUSEPARAM: copy a struct ethtool_pauseparam from user space
+ * and hand it to ->set_pauseparam().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support, or
+ * -EFAULT.
+ */
+static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_pauseparam pause;
+
+	if (!ops->set_pauseparam)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&pause, useraddr, sizeof(pause)))
+		return -EFAULT;
+
+	return ops->set_pauseparam(dev, &pause);
+}
+
+/*
+ * Enable or disable scatter-gather through ->set_sg().
+ *
+ * Enabling requires one of the NETIF_F_ALL_CSUM features.  Disabling
+ * first turns off TSO and UFO through the driver, when it provides the
+ * corresponding hooks.
+ *
+ * Returns the driver's result, -EOPNOTSUPP without ->set_sg(), -EINVAL
+ * if the checksum prerequisite is missing, or the error from
+ * ->set_tso() / ->set_ufo().
+ */
+static int __ethtool_set_sg(struct net_device *dev, u32 data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int err;
+
+	if (!ops->set_sg)
+		return -EOPNOTSUPP;
+
+	if (data) {
+		if (!(dev->features & NETIF_F_ALL_CSUM))
+			return -EINVAL;
+	} else {
+		if (ops->set_tso) {
+			err = ops->set_tso(dev, 0);
+			if (err)
+				return err;
+		}
+
+		if (ops->set_ufo) {
+			err = ops->set_ufo(dev, 0);
+			if (err)
+				return err;
+		}
+	}
+
+	return ops->set_sg(dev, data);
+}
+
+/*
+ * Enable or disable TX checksumming through ->set_tx_csum().
+ *
+ * Disabling first turns off scatter-gather via __ethtool_set_sg() when
+ * the driver provides ->set_sg().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without ->set_tx_csum(), or
+ * the error from __ethtool_set_sg().
+ */
+static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops->set_tx_csum)
+		return -EOPNOTSUPP;
+
+	if (!data && ops->set_sg) {
+		int err = __ethtool_set_sg(dev, 0);
+
+		if (err)
+			return err;
+	}
+
+	return ops->set_tx_csum(dev, data);
+}
+
+/*
+ * Enable or disable RX checksumming through ->set_rx_csum().
+ *
+ * Disabling also clears NETIF_F_GRO in dev->features before the driver
+ * is called.
+ *
+ * Returns the driver's result, or -EOPNOTSUPP without ->set_rx_csum().
+ */
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops->set_rx_csum)
+		return -EOPNOTSUPP;
+
+	if (data == 0)
+		dev->features &= ~NETIF_F_GRO;
+
+	return ops->set_rx_csum(dev, data);
+}
+
+/*
+ * Enable or disable TSO through ->set_tso().  Enabling requires
+ * NETIF_F_SG to be set in dev->features.
+ *
+ * Returns the driver's result, -EOPNOTSUPP without ->set_tso(), or
+ * -EINVAL if scatter-gather is off.
+ */
+static int __ethtool_set_tso(struct net_device *dev, u32 data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops->set_tso)
+		return -EOPNOTSUPP;
+
+	if (data) {
+		if (!(dev->features & NETIF_F_SG))
+			return -EINVAL;
+	}
+
+	return ops->set_tso(dev, data);
+}
+
+/*
+ * Enable or disable UFO through ->set_ufo().
+ *
+ * Enabling requires NETIF_F_SG and, in addition, either NETIF_F_GEN_CSUM
+ * or both NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM.
+ *
+ * Returns the driver's result, -EOPNOTSUPP without ->set_ufo(), or
+ * -EINVAL if a prerequisite feature is missing.
+ */
+static int __ethtool_set_ufo(struct net_device *dev, u32 data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops->set_ufo)
+		return -EOPNOTSUPP;
+
+	if (data) {
+		const u32 v4_v6_csum = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+		if (!(dev->features & NETIF_F_SG))
+			return -EINVAL;
+
+		if (!(dev->features & NETIF_F_GEN_CSUM) &&
+		    (dev->features & v4_v6_csum) != v4_v6_csum)
+			return -EINVAL;
+	}
+
+	return ops->set_ufo(dev, data);
+}
+
+/*
+ * ETHTOOL_TEST: run the driver's self test and return the results.
+ *
+ * The number of u64 results comes from ->get_sset_count(ETH_SS_TEST).
+ * The results are copied to user space directly behind the struct
+ * ethtool_test header.  The result array is zero-initialised so that
+ * entries the driver leaves untouched do not expose stale kernel memory.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on a
+ * failed user copy, -ENOMEM, or the negative count returned by the
+ * driver.
+ */
+static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_test test;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	u64 *data;
+	int ret, test_len;
+
+	if (!ops->self_test || !ops->get_sset_count)
+		return -EOPNOTSUPP;
+
+	test_len = ops->get_sset_count(dev, ETH_SS_TEST);
+	if (test_len < 0)
+		return test_len;
+	WARN_ON(test_len == 0);
+
+	if (copy_from_user(&test, useraddr, sizeof(test)))
+		return -EFAULT;
+
+	test.len = test_len;
+	/* kcalloc() zeroes the array and checks the size multiplication. */
+	data = kcalloc(test_len, sizeof(u64), GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	ops->self_test(dev, &test, data);
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &test, sizeof(test)))
+		goto out;
+	useraddr += sizeof(test);
+	if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
+		goto out;
+	ret = 0;
+
+ out:
+	kfree(data);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GSTRINGS: return the strings of one string set.
+ *
+ * The string count is obtained from __ethtool_get_sset_count() for the
+ * set named in the user's struct ethtool_gstrings.  The strings, each
+ * ETH_GSTRING_LEN bytes, are copied to user space directly behind that
+ * header.  The table is zero-initialised so that bytes not written by
+ * __ethtool_get_strings() do not expose stale kernel memory.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -ENOMEM, or the
+ * negative value returned by __ethtool_get_sset_count().
+ */
+static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_gstrings gstrings;
+	u8 *data;
+	int ret;
+
+	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
+		return -EFAULT;
+
+	ret = __ethtool_get_sset_count(dev, gstrings.string_set);
+	if (ret < 0)
+		return ret;
+
+	gstrings.len = ret;
+
+	/* kcalloc() zeroes the table and checks the size multiplication. */
+	data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	__ethtool_get_strings(dev, gstrings.string_set, data);
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
+		goto out;
+	useraddr += sizeof(gstrings);
+	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+		goto out;
+	ret = 0;
+
+out:
+	kfree(data);
+	return ret;
+}
+
+/*
+ * ETHTOOL_PHYS_ID: physically identify the device through
+ * ->set_phys_id().
+ *
+ * The data member of the user's struct ethtool_value is the duration in
+ * seconds; 0 means "until interrupted by a signal".  The RTNL lock is
+ * released while waiting and re-taken before returning.
+ *
+ * Returns -EOPNOTSUPP without driver support, -EBUSY if another
+ * identify operation is in progress, -EFAULT on a failed user copy, a
+ * negative value from the initial ETHTOOL_ID_ACTIVE call, or otherwise
+ * the last value stored in rc.
+ */
+static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_value id;
+	/* A single flag shared by every caller of this function, so only
+	 * one identify operation can run at a time regardless of device. */
+	static bool busy;
+	int rc;
+
+	if (!dev->ethtool_ops->set_phys_id)
+		return -EOPNOTSUPP;
+
+	if (busy)
+		return -EBUSY;
+
+	if (copy_from_user(&id, useraddr, sizeof(id)))
+		return -EFAULT;
+
+	rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
+	if (rc < 0)
+		return rc;
+
+	/* Drop the RTNL lock while waiting, but prevent reentry or
+	 * removal of the device.
+	 */
+	busy = true;
+	dev_hold(dev);
+	rtnl_unlock();
+
+	if (rc == 0) {
+		/* Driver will handle this itself */
+		schedule_timeout_interruptible(
+			id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
+	} else {
+		/* Driver expects to be called at twice the frequency in rc */
+		int n = rc * 2, i, interval = HZ / n;
+
+		/* Count down seconds */
+		do {
+			/* Count down iterations per second */
+			i = n;
+			do {
+				/* The driver callback itself runs under
+				 * the RTNL lock; odd values of i select
+				 * ETHTOOL_ID_OFF, even ones ETHTOOL_ID_ON. */
+				rtnl_lock();
+				rc = dev->ethtool_ops->set_phys_id(dev,
+				    (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+				rtnl_unlock();
+				if (rc)
+					break;
+				schedule_timeout_interruptible(interval);
+			} while (!signal_pending(current) && --i != 0);
+		} while (!signal_pending(current) &&
+			 (id.data == 0 || --id.data != 0));
+	}
+
+	/* Re-take the RTNL lock before touching shared state again. */
+	rtnl_lock();
+	dev_put(dev);
+	busy = false;
+
+	/* The result of switching identification off is ignored. */
+	(void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
+	return rc;
+}
+
+/*
+ * ETHTOOL_GSTATS: return the driver's statistics counters.
+ *
+ * The number of u64 counters comes from ->get_sset_count(ETH_SS_STATS).
+ * The counters are copied to user space directly behind the struct
+ * ethtool_stats header.  The counter array is zero-initialised so that
+ * entries the driver leaves untouched do not expose stale kernel memory.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on a
+ * failed user copy, -ENOMEM, or the negative count returned by the
+ * driver.
+ */
+static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_stats stats;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	u64 *data;
+	int ret, n_stats;
+
+	if (!ops->get_ethtool_stats || !ops->get_sset_count)
+		return -EOPNOTSUPP;
+
+	n_stats = ops->get_sset_count(dev, ETH_SS_STATS);
+	if (n_stats < 0)
+		return n_stats;
+	WARN_ON(n_stats == 0);
+
+	if (copy_from_user(&stats, useraddr, sizeof(stats)))
+		return -EFAULT;
+
+	stats.n_stats = n_stats;
+	/* kcalloc() zeroes the array and checks the size multiplication. */
+	data = kcalloc(n_stats, sizeof(u64), GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	ops->get_ethtool_stats(dev, &stats, data);
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &stats, sizeof(stats)))
+		goto out;
+	useraddr += sizeof(stats);
+	if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+		goto out;
+	ret = 0;
+
+ out:
+	kfree(data);
+	return ret;
+}
+
+/*
+ * ETHTOOL_GPERMADDR: copy dev->perm_addr to user space.
+ *
+ * The size member of the user's struct ethtool_perm_addr must be at
+ * least dev->addr_len.  It is rewritten to dev->addr_len, and the
+ * address bytes are stored directly behind the header.
+ *
+ * Returns 0, -EFAULT on a failed user copy, or -ETOOSMALL if the user's
+ * buffer is too short.
+ */
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_perm_addr hdr;
+
+	if (copy_from_user(&hdr, useraddr, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.size < dev->addr_len)
+		return -ETOOSMALL;
+	hdr.size = dev->addr_len;
+
+	/* Header first, then the address right behind it. */
+	if (copy_to_user(useraddr, &hdr, sizeof(hdr)) ||
+	    copy_to_user(useraddr + sizeof(hdr), dev->perm_addr, hdr.size))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Generic "get one u32" helper: call @actor and return its result to
+ * user space in a struct ethtool_value tagged with @cmd.
+ *
+ * Returns 0, -EOPNOTSUPP if @actor is NULL, or -EFAULT.
+ */
+static int ethtool_get_value(struct net_device *dev, char __user *useraddr,
+			     u32 cmd, u32 (*actor)(struct net_device *))
+{
+	struct ethtool_value result = { .cmd = cmd };
+
+	if (!actor)
+		return -EOPNOTSUPP;
+
+	result.data = actor(dev);
+
+	return copy_to_user(useraddr, &result, sizeof(result)) ? -EFAULT : 0;
+}
+
+/*
+ * Generic "set one u32" helper for driver hooks that return nothing:
+ * read a struct ethtool_value from user space and pass its data member
+ * to @actor.
+ *
+ * Returns 0, -EOPNOTSUPP if @actor is NULL, or -EFAULT.
+ */
+static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr,
+			     void (*actor)(struct net_device *, u32))
+{
+	struct ethtool_value req;
+
+	if (!actor)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&req, useraddr, sizeof(req)))
+		return -EFAULT;
+
+	actor(dev, req.data);
+
+	return 0;
+}
+
+/*
+ * Generic "set one u32" helper: read a struct ethtool_value from user
+ * space and pass its data member to @actor.
+ *
+ * Returns @actor's result, -EOPNOTSUPP if @actor is NULL, or -EFAULT.
+ */
+static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
+			     int (*actor)(struct net_device *, u32))
+{
+	struct ethtool_value req;
+
+	if (!actor)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&req, useraddr, sizeof(req)))
+		return -EFAULT;
+
+	return actor(dev, req.data);
+}
+
+/*
+ * ETHTOOL_FLASHDEV: copy a struct ethtool_flash from user space and
+ * hand it to ->flash_device().
+ *
+ * The file name in efl.data comes from user space and is forcibly
+ * NUL-terminated before the driver sees it.
+ *
+ * Returns the driver's result, -EFAULT on a failed user copy, or
+ * -EOPNOTSUPP without driver support.
+ */
+static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
+						   char __user *useraddr)
+{
+	struct ethtool_flash efl;
+
+	if (copy_from_user(&efl, useraddr, sizeof(efl)))
+		return -EFAULT;
+
+	if (!dev->ethtool_ops->flash_device)
+		return -EOPNOTSUPP;
+
+	/* Do not trust user space to have terminated the string. */
+	efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+
+	return dev->ethtool_ops->flash_device(dev, &efl);
+}
+
+/*
+ * ETHTOOL_SET_DUMP: copy a struct ethtool_dump from user space and hand
+ * it to ->set_dump().
+ *
+ * Returns the driver's result, -EOPNOTSUPP without driver support, or
+ * -EFAULT.
+ */
+static int ethtool_set_dump(struct net_device *dev,
+			void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_dump req;
+
+	if (!ops->set_dump)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&req, useraddr, sizeof(req)))
+		return -EFAULT;
+
+	return ops->set_dump(dev, &req);
+}
+
+/*
+ * ETHTOOL_GET_DUMP_FLAG: copy a struct ethtool_dump from user space,
+ * let ->get_dump_flag() update it and copy it back.
+ *
+ * Returns 0, -EOPNOTSUPP without driver support, -EFAULT on a failed
+ * user copy, or the driver's non-zero return value.
+ */
+static int ethtool_get_dump_flag(struct net_device *dev,
+				void __user *useraddr)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_dump info;
+	int err;
+
+	if (!ops->get_dump_flag)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&info, useraddr, sizeof(info)))
+		return -EFAULT;
+
+	err = ops->get_dump_flag(dev, &info);
+	if (err)
+		return err;
+
+	return copy_to_user(useraddr, &info, sizeof(info)) ? -EFAULT : 0;
+}
+
+/*
+ * ETHTOOL_GET_DUMP_DATA: fetch the device dump from the driver.
+ *
+ * The dump length is queried with ->get_dump_flag().  The number of
+ * bytes returned is the smaller of that length and the len in the
+ * user's struct ethtool_dump.  That value is what the driver is told
+ * and what is reported back in the header.  The data is copied to the
+ * data member of the user's structure.
+ *
+ * Returns 0 on success, -EOPNOTSUPP without driver support, -EFAULT on
+ * a failed user copy or a zero-length request, -ENOMEM, or the driver's
+ * non-zero return value.
+ */
+static int ethtool_get_dump_data(struct net_device *dev,
+				void __user *useraddr)
+{
+	int ret;
+	__u32 len;
+	struct ethtool_dump dump, tmp;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void *data = NULL;
+
+	if (!dev->ethtool_ops->get_dump_data ||
+		!dev->ethtool_ops->get_dump_flag)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&dump, useraddr, sizeof(dump)))
+		return -EFAULT;
+
+	memset(&tmp, 0, sizeof(tmp));
+	tmp.cmd = ETHTOOL_GET_DUMP_FLAG;
+	ret = ops->get_dump_flag(dev, &tmp);
+	if (ret)
+		return ret;
+
+	len = (tmp.len > dump.len) ? dump.len : tmp.len;
+	if (!len)
+		return -EFAULT;
+
+	/* Never let the driver believe more space is available than the
+	 * dump size it announced through ->get_dump_flag(). */
+	dump.len = len;
+
+	/* Always allocate the full dump size so the driver does not have
+	 * to deal with partial dumps. */
+	data = vzalloc(tmp.len);
+	if (!data)
+		return -ENOMEM;
+	ret = ops->get_dump_data(dev, &dump, data);
+	if (ret)
+		goto out;
+
+	/* The driver may have rewritten dump.len; tell user space how many
+	 * bytes it actually receives. */
+	dump.len = len;
+
+	if (copy_to_user(useraddr, &dump, sizeof(dump))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	useraddr += offsetof(struct ethtool_dump, data);
+	if (copy_to_user(useraddr, data, len))
+		ret = -EFAULT;
+out:
+	vfree(data);
+	return ret;
+}
+
+/*
+ * The main entry point in this file.  Called from net/core/dev.c
+ *
+ * Looks up the device named by ifr->ifr_name in @net, reads the 32-bit
+ * ethtool command from the start of ifr->ifr_data and dispatches to the
+ * handler for that command.
+ *
+ * The commands listed in the first switch may be issued by anyone; all
+ * others require CAP_NET_ADMIN.  If the driver provides ->begin() and
+ * ->complete() they are called around the handler, and
+ * netdev_features_change() is called when dev->features differs from
+ * its value before the handler ran.
+ *
+ * Returns the handler's result, -ENODEV if the device does not exist or
+ * is not present, -EFAULT if the command cannot be read, -EOPNOTSUPP for
+ * unknown commands or a device without ethtool_ops, -EPERM for a
+ * privileged command without the capability, or the negative value
+ * returned by ->begin().
+ */
+
+int dev_ethtool(struct net *net, struct ifreq *ifr)
+{
+	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+	void __user *useraddr = ifr->ifr_data;
+	u32 ethcmd;
+	int rc;
+	u32 old_features;
+
+	if (!dev || !netif_device_present(dev))
+		return -ENODEV;
+
+	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+		return -EFAULT;
+
+	if (!dev->ethtool_ops) {
+		/* ETHTOOL_GDRVINFO does not require any driver support.
+		 * It is also unprivileged and does not change anything,
+		 * so we can take a shortcut to it. */
+		if (ethcmd == ETHTOOL_GDRVINFO)
+			return ethtool_get_drvinfo(dev, useraddr);
+		else
+			return -EOPNOTSUPP;
+	}
+
+	/* Allow some commands to be done by anyone */
+	switch (ethcmd) {
+	case ETHTOOL_GSET:
+	case ETHTOOL_GDRVINFO:
+	case ETHTOOL_GMSGLVL:
+	case ETHTOOL_GCOALESCE:
+	case ETHTOOL_GRINGPARAM:
+	case ETHTOOL_GPAUSEPARAM:
+	case ETHTOOL_GRXCSUM:
+	case ETHTOOL_GTXCSUM:
+	case ETHTOOL_GSG:
+	case ETHTOOL_GSTRINGS:
+	case ETHTOOL_GTSO:
+	case ETHTOOL_GPERMADDR:
+	case ETHTOOL_GUFO:
+	case ETHTOOL_GGSO:
+	case ETHTOOL_GGRO:
+	case ETHTOOL_GFLAGS:
+	case ETHTOOL_GPFLAGS:
+	case ETHTOOL_GRXFH:
+	case ETHTOOL_GRXRINGS:
+	case ETHTOOL_GRXCLSRLCNT:
+	case ETHTOOL_GRXCLSRULE:
+	case ETHTOOL_GRXCLSRLALL:
+	case ETHTOOL_GFEATURES:
+		break;
+	default:
+		/* Everything else is privileged. */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+	}
+
+	/* A failing ->begin() aborts the request; ->complete() is not
+	 * called in that case. */
+	if (dev->ethtool_ops->begin) {
+		rc = dev->ethtool_ops->begin(dev);
+		if (rc  < 0)
+			return rc;
+	}
+	/* Snapshot the feature flags to detect changes made by the handler. */
+	old_features = dev->features;
+
+	switch (ethcmd) {
+	case ETHTOOL_GSET:
+		rc = ethtool_get_settings(dev, useraddr);
+		break;
+	case ETHTOOL_SSET:
+		rc = ethtool_set_settings(dev, useraddr);
+		break;
+	case ETHTOOL_GDRVINFO:
+		rc = ethtool_get_drvinfo(dev, useraddr);
+		break;
+	case ETHTOOL_GREGS:
+		rc = ethtool_get_regs(dev, useraddr);
+		break;
+	case ETHTOOL_GWOL:
+		rc = ethtool_get_wol(dev, useraddr);
+		break;
+	case ETHTOOL_SWOL:
+		rc = ethtool_set_wol(dev, useraddr);
+		break;
+	case ETHTOOL_GMSGLVL:
+		rc = ethtool_get_value(dev, useraddr, ethcmd,
+				       dev->ethtool_ops->get_msglevel);
+		break;
+	case ETHTOOL_SMSGLVL:
+		rc = ethtool_set_value_void(dev, useraddr,
+				       dev->ethtool_ops->set_msglevel);
+		break;
+	case ETHTOOL_NWAY_RST:
+		rc = ethtool_nway_reset(dev);
+		break;
+	case ETHTOOL_GLINK:
+		rc = ethtool_get_link(dev, useraddr);
+		break;
+	case ETHTOOL_GEEPROM:
+		rc = ethtool_get_eeprom(dev, useraddr);
+		break;
+	case ETHTOOL_SEEPROM:
+		rc = ethtool_set_eeprom(dev, useraddr);
+		break;
+	case ETHTOOL_GCOALESCE:
+		rc = ethtool_get_coalesce(dev, useraddr);
+		break;
+	case ETHTOOL_SCOALESCE:
+		rc = ethtool_set_coalesce(dev, useraddr);
+		break;
+	case ETHTOOL_GRINGPARAM:
+		rc = ethtool_get_ringparam(dev, useraddr);
+		break;
+	case ETHTOOL_SRINGPARAM:
+		rc = ethtool_set_ringparam(dev, useraddr);
+		break;
+	case ETHTOOL_GPAUSEPARAM:
+		rc = ethtool_get_pauseparam(dev, useraddr);
+		break;
+	case ETHTOOL_SPAUSEPARAM:
+		rc = ethtool_set_pauseparam(dev, useraddr);
+		break;
+	case ETHTOOL_TEST:
+		rc = ethtool_self_test(dev, useraddr);
+		break;
+	case ETHTOOL_GSTRINGS:
+		rc = ethtool_get_strings(dev, useraddr);
+		break;
+	case ETHTOOL_PHYS_ID:
+		rc = ethtool_phys_id(dev, useraddr);
+		break;
+	case ETHTOOL_GSTATS:
+		rc = ethtool_get_stats(dev, useraddr);
+		break;
+	case ETHTOOL_GPERMADDR:
+		rc = ethtool_get_perm_addr(dev, useraddr);
+		break;
+	case ETHTOOL_GFLAGS:
+		/* Fall back to the generic helper when the driver has no
+		 * ->get_flags() of its own. */
+		rc = ethtool_get_value(dev, useraddr, ethcmd,
+				       (dev->ethtool_ops->get_flags ?
+					dev->ethtool_ops->get_flags :
+					ethtool_op_get_flags));
+		break;
+	case ETHTOOL_SFLAGS:
+		rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
+		break;
+	case ETHTOOL_GPFLAGS:
+		rc = ethtool_get_value(dev, useraddr, ethcmd,
+				       dev->ethtool_ops->get_priv_flags);
+		break;
+	case ETHTOOL_SPFLAGS:
+		rc = ethtool_set_value(dev, useraddr,
+				       dev->ethtool_ops->set_priv_flags);
+		break;
+	case ETHTOOL_GRXFH:
+	case ETHTOOL_GRXRINGS:
+	case ETHTOOL_GRXCLSRLCNT:
+	case ETHTOOL_GRXCLSRULE:
+	case ETHTOOL_GRXCLSRLALL:
+		rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
+		break;
+	case ETHTOOL_SRXFH:
+	case ETHTOOL_SRXCLSRLDEL:
+	case ETHTOOL_SRXCLSRLINS:
+		rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
+		break;
+	case ETHTOOL_FLASHDEV:
+		rc = ethtool_flash_device(dev, useraddr);
+		break;
+	case ETHTOOL_RESET:
+		rc = ethtool_reset(dev, useraddr);
+		break;
+	case ETHTOOL_SRXNTUPLE:
+		rc = ethtool_set_rx_ntuple(dev, useraddr);
+		break;
+	case ETHTOOL_GRXNTUPLE:
+		rc = ethtool_get_rx_ntuple(dev, useraddr);
+		break;
+	case ETHTOOL_GSSET_INFO:
+		rc = ethtool_get_sset_info(dev, useraddr);
+		break;
+	case ETHTOOL_GRXFHINDIR:
+		rc = ethtool_get_rxfh_indir(dev, useraddr);
+		break;
+	case ETHTOOL_SRXFHINDIR:
+		rc = ethtool_set_rxfh_indir(dev, useraddr);
+		break;
+	case ETHTOOL_GFEATURES:
+		rc = ethtool_get_features(dev, useraddr);
+		break;
+	case ETHTOOL_SFEATURES:
+		rc = ethtool_set_features(dev, useraddr);
+		break;
+	case ETHTOOL_GTXCSUM:
+	case ETHTOOL_GRXCSUM:
+	case ETHTOOL_GSG:
+	case ETHTOOL_GTSO:
+	case ETHTOOL_GUFO:
+	case ETHTOOL_GGSO:
+	case ETHTOOL_GGRO:
+		rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
+		break;
+	case ETHTOOL_STXCSUM:
+	case ETHTOOL_SRXCSUM:
+	case ETHTOOL_SSG:
+	case ETHTOOL_STSO:
+	case ETHTOOL_SUFO:
+	case ETHTOOL_SGSO:
+	case ETHTOOL_SGRO:
+		rc = ethtool_set_one_feature(dev, useraddr, ethcmd);
+		break;
+	case ETHTOOL_GCHANNELS:
+		rc = ethtool_get_channels(dev, useraddr);
+		break;
+	case ETHTOOL_SCHANNELS:
+		rc = ethtool_set_channels(dev, useraddr);
+		break;
+	case ETHTOOL_SET_DUMP:
+		rc = ethtool_set_dump(dev, useraddr);
+		break;
+	case ETHTOOL_GET_DUMP_FLAG:
+		rc = ethtool_get_dump_flag(dev, useraddr);
+		break;
+	case ETHTOOL_GET_DUMP_DATA:
+		rc = ethtool_get_dump_data(dev, useraddr);
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+	}
+
+	/* ->complete() runs whatever the handler returned. */
+	if (dev->ethtool_ops->complete)
+		dev->ethtool_ops->complete(dev);
+
+	if (old_features != dev->features)
+		netdev_features_change(dev);
+
+	return rc;
+}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
new file mode 100644
index 00000000..f39ef5c6
--- /dev/null
+++ b/net/core/fib_rules.c
@@ -0,0 +1,766 @@
+/*
+ * net/core/fib_rules.c		Generic Routing Rules
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation, version 2.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/fib_rules.h>
+
+/*
+ * Allocate a zeroed rule of ops->rule_size bytes, initialise it as a
+ * FR_ACT_TO_TBL rule with the given preference, table and flags, and
+ * append it to ops->rules_list.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+int fib_default_rule_add(struct fib_rules_ops *ops,
+			 u32 pref, u32 table, u32 flags)
+{
+	struct fib_rule *rule = kzalloc(ops->rule_size, GFP_KERNEL);
+
+	if (!rule)
+		return -ENOMEM;
+
+	rule->fr_net = hold_net(ops->fro_net);
+	rule->action = FR_ACT_TO_TBL;
+	rule->flags = flags;
+	rule->table = table;
+	rule->pref = pref;
+	atomic_set(&rule->refcnt, 1);
+
+	/* No locking needed: the list is unreachable by anyone else
+	 * at the moment this function is called. */
+	list_add_tail(&rule->list, &ops->rules_list);
+	return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
+/*
+ * Pick a default preference for a new rule: one less than the preference
+ * of the second rule on ops->rules_list.  Returns 0 when the list has
+ * fewer than two rules or when that second rule has preference 0.
+ */
+u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+{
+	struct list_head *head = &ops->rules_list;
+	struct list_head *second;
+	struct fib_rule *rule;
+
+	if (list_empty(head))
+		return 0;
+
+	second = head->next->next;
+	if (second == head)
+		return 0;
+
+	rule = list_entry(second, struct fib_rule, list);
+	if (!rule->pref)
+		return 0;
+
+	return rule->pref - 1;
+}
+EXPORT_SYMBOL(fib_default_rule_pref);
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+			       struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+			       u32 pid);
+
+/*
+ * Find the rules ops registered in @net for @family and take a reference
+ * on its owning module.  Only the first entry with a matching family is
+ * considered; if try_module_get() fails on it, NULL is returned.
+ * A non-NULL result must be released with rules_ops_put().
+ */
+static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+{
+	struct fib_rules_ops *cur, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, &net->rules_ops, list) {
+		if (cur->family != family)
+			continue;
+
+		if (try_module_get(cur->owner))
+			found = cur;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Drop the module reference taken by lookup_rules_ops(); NULL is a no-op. */
+static void rules_ops_put(struct fib_rules_ops *ops)
+{
+	if (ops == NULL)
+		return;
+
+	module_put(ops->owner);
+}
+
+/* Invoke the optional ops->flush_cache() hook, if one is provided. */
+static void flush_route_cache(struct fib_rules_ops *ops)
+{
+	if (!ops->flush_cache)
+		return;
+
+	ops->flush_cache(ops);
+}
+
+/*
+ * Validate @ops and link it into the rules_ops list of ops->fro_net.
+ *
+ * Returns -EINVAL if rule_size is smaller than struct fib_rule or a
+ * mandatory callback is missing, -EEXIST if ops for the same family are
+ * already registered in that namespace, and 0 on success.
+ */
+static int __fib_rules_register(struct fib_rules_ops *ops)
+{
+	struct net *net = ops->fro_net;
+	struct fib_rules_ops *cur;
+	int err = 0;
+
+	if (ops->rule_size < sizeof(struct fib_rule))
+		return -EINVAL;
+
+	if (!ops->match || !ops->configure || !ops->compare ||
+	    !ops->fill || !ops->action)
+		return -EINVAL;
+
+	spin_lock(&net->rules_mod_lock);
+	list_for_each_entry(cur, &net->rules_ops, list) {
+		if (cur->family == ops->family) {
+			err = -EEXIST;
+			break;
+		}
+	}
+
+	if (!err) {
+		hold_net(net);
+		list_add_tail_rcu(&ops->list, &net->rules_ops);
+	}
+	spin_unlock(&net->rules_mod_lock);
+
+	return err;
+}
+
+/*
+ * Duplicate the template @tmpl, bind the copy to @net and register it.
+ *
+ * Returns the newly allocated ops on success, or an ERR_PTR() carrying
+ * -ENOMEM or the error reported by __fib_rules_register().
+ */
+struct fib_rules_ops *
+fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
+{
+	struct fib_rules_ops *copy;
+	int rc;
+
+	copy = kmemdup(tmpl, sizeof(*copy), GFP_KERNEL);
+	if (!copy)
+		return ERR_PTR(-ENOMEM);
+
+	copy->fro_net = net;
+	INIT_LIST_HEAD(&copy->rules_list);
+
+	rc = __fib_rules_register(copy);
+	if (!rc)
+		return copy;
+
+	kfree(copy);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_GPL(fib_rules_register);
+
+/*
+ * Unlink every rule still on ops->rules_list and drop one reference on
+ * each of them with fib_rule_put().
+ */
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+{
+	struct fib_rule *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &ops->rules_list, list) {
+		list_del_rcu(&cur->list);
+		fib_rule_put(cur);
+	}
+}
+
+/*
+ * RCU callback queued by fib_rules_unregister(): release the namespace
+ * reference recorded in ops->fro_net and free the ops structure.
+ */
+static void fib_rules_put_rcu(struct rcu_head *head)
+{
+	struct fib_rules_ops *ops;
+
+	ops = container_of(head, struct fib_rules_ops, rcu);
+	release_net(ops->fro_net);
+	kfree(ops);
+}
+
+/*
+ * Remove @ops from its namespace's rules_ops list and release all rules
+ * still attached to it.  The ops structure itself is freed from an RCU
+ * callback (fib_rules_put_rcu), so callers must not touch @ops after
+ * this function returns.
+ */
+void fib_rules_unregister(struct fib_rules_ops *ops)
+{
+	struct net *net = ops->fro_net;
+
+	/* Unlink and empty the ops under the per-namespace modification lock. */
+	spin_lock(&net->rules_mod_lock);
+	list_del_rcu(&ops->list);
+	fib_rules_cleanup_ops(ops);
+	spin_unlock(&net->rules_mod_lock);
+
+	/* Defer release_net() and kfree() until after an RCU grace period. */
+	call_rcu(&ops->rcu, fib_rules_put_rcu);
+}
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
+
+/*
+ * Decide whether @rule applies to flow @fl.
+ *
+ * The generic selectors (input interface, output interface, masked mark)
+ * are checked first; only if they all pass is the family specific
+ * ops->match() consulted.  The outcome is logically negated when the rule
+ * carries FIB_RULE_INVERT.
+ */
+static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
+			  struct flowi *fl, int flags)
+{
+	int matched;
+
+	if ((rule->iifindex && rule->iifindex != fl->flowi_iif) ||
+	    (rule->oifindex && rule->oifindex != fl->flowi_oif) ||
+	    ((rule->mark ^ fl->flowi_mark) & rule->mark_mask))
+		matched = 0;
+	else
+		matched = ops->match(rule, fl, flags);
+
+	if (rule->flags & FIB_RULE_INVERT)
+		return !matched;
+
+	return matched;
+}
+
+/*
+ * fib_rules_lookup - find and execute the first applicable rule
+ * @ops:   rule set to walk
+ * @fl:    flow being looked up
+ * @flags: passed through to ops->match() and ops->action()
+ * @arg:   lookup argument; arg->rule is set to the rule that decided
+ *
+ * Walks ops->rules_list under rcu_read_lock().  For each matching rule
+ * the action is run; the first action result other than -EAGAIN ends the
+ * walk and is returned.  Unless FIB_LOOKUP_NOREF is set in arg->flags a
+ * reference is taken on the deciding rule.  Returns -ESRCH when no rule
+ * produced a result or when the reference could not be taken.
+ */
+int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
+		     int flags, struct fib_lookup_arg *arg)
+{
+	struct fib_rule *rule;
+	int err;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+jumped:
+		/* Re-entered via "goto jumped" with rule set to a goto target. */
+		if (!fib_rule_match(rule, ops, fl, flags))
+			continue;
+
+		if (rule->action == FR_ACT_GOTO) {
+			struct fib_rule *target;
+
+			target = rcu_dereference(rule->ctarget);
+			if (target == NULL) {
+				/* Unresolved goto: treat it like a NOP. */
+				continue;
+			} else {
+				/* Continue the walk from the target rule. */
+				rule = target;
+				goto jumped;
+			}
+		} else if (rule->action == FR_ACT_NOP)
+			continue;
+		else
+			err = ops->action(rule, fl, flags, arg);
+
+		if (err != -EAGAIN) {
+			if ((arg->flags & FIB_LOOKUP_NOREF) ||
+			    likely(atomic_inc_not_zero(&rule->refcnt))) {
+				arg->rule = rule;
+				goto out;
+			}
+			/* refcnt already dropped to zero: give up with -ESRCH */
+			break;
+		}
+	}
+
+	err = -ESRCH;
+out:
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(fib_rules_lookup);
+
+/*
+ * Sanity check the source/destination selectors of a rule message.
+ *
+ * When a prefix length is given in the header, the matching address
+ * attribute must be present, must be exactly ops->addr_size bytes long,
+ * and the prefix length must not exceed the address width in bits.
+ * Returns 0 if the message is acceptable, -EINVAL otherwise.
+ */
+static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
+			    struct fib_rules_ops *ops)
+{
+	if (frh->src_len &&
+	    (!tb[FRA_SRC] ||
+	     frh->src_len > (ops->addr_size * 8) ||
+	     nla_len(tb[FRA_SRC]) != ops->addr_size))
+		return -EINVAL;
+
+	if (frh->dst_len &&
+	    (!tb[FRA_DST] ||
+	     frh->dst_len > (ops->addr_size * 8) ||
+	     nla_len(tb[FRA_DST]) != ops->addr_size))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * RTM_NEWRULE handler: parse the netlink request, build a new rule and
+ * insert it into the rule list of the addressed family, ordered by
+ * preference.
+ *
+ * Returns 0 on success.  Errors visible here: -EINVAL (short message,
+ * inconsistent goto attributes, backward goto), -EAFNOSUPPORT (no ops for
+ * the family), -ENOMEM, or whatever nlmsg_parse(), validate_rulemsg() or
+ * ops->configure() report.
+ */
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rules_ops *ops = NULL;
+	struct fib_rule *rule, *r, *last = NULL;
+	struct nlattr *tb[FRA_MAX+1];
+	int err = -EINVAL, unresolved = 0;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+		goto errout;
+
+	/* Takes a module reference; released by rules_ops_put() on all paths. */
+	ops = lookup_rules_ops(net, frh->family);
+	if (ops == NULL) {
+		err = -EAFNOSUPPORT;
+		goto errout;
+	}
+
+	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+	if (err < 0)
+		goto errout;
+
+	err = validate_rulemsg(frh, tb, ops);
+	if (err < 0)
+		goto errout;
+
+	rule = kzalloc(ops->rule_size, GFP_KERNEL);
+	if (rule == NULL) {
+		err = -ENOMEM;
+		goto errout;
+	}
+	rule->fr_net = hold_net(net);
+
+	if (tb[FRA_PRIORITY])
+		rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+
+	if (tb[FRA_IIFNAME]) {
+		struct net_device *dev;
+
+		/* -1 marks the interface as not (yet) present, see attach_rules() */
+		rule->iifindex = -1;
+		nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, rule->iifname);
+		if (dev)
+			rule->iifindex = dev->ifindex;
+	}
+
+	if (tb[FRA_OIFNAME]) {
+		struct net_device *dev;
+
+		/* same detached convention as for the input interface */
+		rule->oifindex = -1;
+		nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, rule->oifname);
+		if (dev)
+			rule->oifindex = dev->ifindex;
+	}
+
+	if (tb[FRA_FWMARK]) {
+		rule->mark = nla_get_u32(tb[FRA_FWMARK]);
+		if (rule->mark)
+			/* compatibility: if the mark value is non-zero all bits
+			 * are compared unless a mask is explicitly specified.
+			 */
+			rule->mark_mask = 0xFFFFFFFF;
+	}
+
+	if (tb[FRA_FWMASK])
+		rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+
+	rule->action = frh->action;
+	rule->flags = frh->flags;
+	rule->table = frh_get_table(frh, tb);
+
+	/* No explicit priority: let the family pick one if it can. */
+	if (!tb[FRA_PRIORITY] && ops->default_pref)
+		rule->pref = ops->default_pref(ops);
+
+	err = -EINVAL;
+	if (tb[FRA_GOTO]) {
+		/* FRA_GOTO is only meaningful together with FR_ACT_GOTO. */
+		if (rule->action != FR_ACT_GOTO)
+			goto errout_free;
+
+		rule->target = nla_get_u32(tb[FRA_GOTO]);
+		/* Backward jumps are prohibited to avoid endless loops */
+		if (rule->target <= rule->pref)
+			goto errout_free;
+
+		/* Resolve the target now if a rule with that pref exists. */
+		list_for_each_entry(r, &ops->rules_list, list) {
+			if (r->pref == rule->target) {
+				RCU_INIT_POINTER(rule->ctarget, r);
+				break;
+			}
+		}
+
+		if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+			unresolved = 1;
+	} else if (rule->action == FR_ACT_GOTO)
+		/* a goto rule without a target is rejected */
+		goto errout_free;
+
+	err = ops->configure(rule, skb, frh, tb);
+	if (err < 0)
+		goto errout_free;
+
+	/* Find the last rule whose pref is not greater than the new one. */
+	list_for_each_entry(r, &ops->rules_list, list) {
+		if (r->pref > rule->pref)
+			break;
+		last = r;
+	}
+
+	fib_rule_get(rule);
+
+	if (last)
+		list_add_rcu(&rule->list, &last->list);
+	else
+		list_add_rcu(&rule->list, &ops->rules_list);
+
+	if (ops->unresolved_rules) {
+		/*
+		 * There are unresolved goto rules in the list, check if
+		 * any of them are pointing to this new rule.
+		 */
+		list_for_each_entry(r, &ops->rules_list, list) {
+			if (r->action == FR_ACT_GOTO &&
+			    r->target == rule->pref &&
+			    rtnl_dereference(r->ctarget) == NULL) {
+				rcu_assign_pointer(r->ctarget, rule);
+				if (--ops->unresolved_rules == 0)
+					break;
+			}
+		}
+	}
+
+	if (rule->action == FR_ACT_GOTO)
+		ops->nr_goto_rules++;
+
+	/* Counted after the fix-up loop above so it cannot resolve itself. */
+	if (unresolved)
+		ops->unresolved_rules++;
+
+	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
+	flush_route_cache(ops);
+	rules_ops_put(ops);
+	return 0;
+
+errout_free:
+	release_net(rule->fr_net);
+	kfree(rule);
+errout:
+	rules_ops_put(ops);
+	return err;
+}
+
+/*
+ * RTM_DELRULE handler: delete the first rule that matches every selector
+ * supplied in the request.
+ *
+ * A selector that is absent (zero header field or missing attribute) acts
+ * as a wildcard.  Rules flagged FIB_RULE_PERMANENT cannot be removed.
+ *
+ * Returns 0 on success, -EINVAL for a short message, -EAFNOSUPPORT when
+ * no ops exist for the family, -EPERM for a permanent rule, -ENOENT when
+ * nothing matched, or the error from nlmsg_parse()/validate_rulemsg().
+ */
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rules_ops *ops = NULL;
+	struct fib_rule *rule, *tmp;
+	struct nlattr *tb[FRA_MAX+1];
+	int err = -EINVAL;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+		goto errout;
+
+	/* Takes a module reference; released by rules_ops_put() on all paths. */
+	ops = lookup_rules_ops(net, frh->family);
+	if (ops == NULL) {
+		err = -EAFNOSUPPORT;
+		goto errout;
+	}
+
+	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+	if (err < 0)
+		goto errout;
+
+	err = validate_rulemsg(frh, tb, ops);
+	if (err < 0)
+		goto errout;
+
+	list_for_each_entry(rule, &ops->rules_list, list) {
+		if (frh->action && (frh->action != rule->action))
+			continue;
+
+		if (frh->table && (frh_get_table(frh, tb) != rule->table))
+			continue;
+
+		if (tb[FRA_PRIORITY] &&
+		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
+			continue;
+
+		if (tb[FRA_IIFNAME] &&
+		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
+			continue;
+
+		if (tb[FRA_OIFNAME] &&
+		    nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
+			continue;
+
+		if (tb[FRA_FWMARK] &&
+		    (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
+			continue;
+
+		if (tb[FRA_FWMASK] &&
+		    (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
+			continue;
+
+		if (!ops->compare(rule, frh, tb))
+			continue;
+
+		if (rule->flags & FIB_RULE_PERMANENT) {
+			err = -EPERM;
+			goto errout;
+		}
+
+		list_del_rcu(&rule->list);
+
+		if (rule->action == FR_ACT_GOTO) {
+			ops->nr_goto_rules--;
+			/*
+			 * A goto rule without a resolved target was counted
+			 * in unresolved_rules (when it was added or when its
+			 * target was deleted); undo that accounting now that
+			 * the rule itself goes away.
+			 */
+			if (rtnl_dereference(rule->ctarget) == NULL)
+				ops->unresolved_rules--;
+		}
+
+		/*
+		 * Check if this rule is a target to any of them. If so,
+		 * disable them. As this operation is eventually very
+		 * expensive, it is only performed if goto rules have
+		 * actually been added.
+		 */
+		if (ops->nr_goto_rules > 0) {
+			list_for_each_entry(tmp, &ops->rules_list, list) {
+				if (rtnl_dereference(tmp->ctarget) == rule) {
+					rcu_assign_pointer(tmp->ctarget, NULL);
+					ops->unresolved_rules++;
+				}
+			}
+		}
+
+		notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+				   NETLINK_CB(skb).pid);
+		fib_rule_put(rule);
+		flush_route_cache(ops);
+		rules_ops_put(ops);
+		return 0;
+	}
+
+	err = -ENOENT;
+errout:
+	rules_ops_put(ops);
+	return err;
+}
+
+/*
+ * Upper bound on the netlink payload needed to describe @rule: the rule
+ * header, the generic attributes, plus whatever the family reports via
+ * the optional ops->nlmsg_payload() hook.
+ */
+static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+					 struct fib_rule *rule)
+{
+	size_t size = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
+
+	size += nla_total_size(IFNAMSIZ);	/* FRA_IIFNAME */
+	size += nla_total_size(IFNAMSIZ);	/* FRA_OIFNAME */
+	size += nla_total_size(4);		/* FRA_PRIORITY */
+	size += nla_total_size(4);		/* FRA_TABLE */
+	size += nla_total_size(4);		/* FRA_FWMARK */
+	size += nla_total_size(4);		/* FRA_FWMASK */
+
+	if (ops->nlmsg_payload)
+		size += ops->nlmsg_payload(rule);
+
+	return size;
+}
+
+/*
+ * Serialise @rule into @skb as a netlink message of the given @type.
+ *
+ * Fills the fib_rule_hdr, the generic attributes and then lets the family
+ * add its own through ops->fill().  Returns the value of nlmsg_end() on
+ * success, or -EMSGSIZE if the header could not be placed or the
+ * nla_put_failure path was taken (the partial message is cancelled).
+ *
+ * NOTE(review): the NLA_PUT_* macros are defined outside this file; they
+ * are expected to jump to nla_put_failure when the skb runs out of room.
+ */
+static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
+			    u32 pid, u32 seq, int type, int flags,
+			    struct fib_rules_ops *ops)
+{
+	struct nlmsghdr *nlh;
+	struct fib_rule_hdr *frh;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	frh = nlmsg_data(nlh);
+	frh->family = ops->family;
+	/* The table id goes both into the header and into FRA_TABLE. */
+	frh->table = rule->table;
+	NLA_PUT_U32(skb, FRA_TABLE, rule->table);
+	frh->res1 = 0;
+	frh->res2 = 0;
+	frh->action = rule->action;
+	frh->flags = rule->flags;
+
+	/* Tell user space about goto rules whose target does not exist. */
+	if (rule->action == FR_ACT_GOTO &&
+	    rcu_dereference_raw(rule->ctarget) == NULL)
+		frh->flags |= FIB_RULE_UNRESOLVED;
+
+	if (rule->iifname[0]) {
+		NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname);
+
+		/* ifindex -1 means the named device is currently absent */
+		if (rule->iifindex == -1)
+			frh->flags |= FIB_RULE_IIF_DETACHED;
+	}
+
+	if (rule->oifname[0]) {
+		NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname);
+
+		if (rule->oifindex == -1)
+			frh->flags |= FIB_RULE_OIF_DETACHED;
+	}
+
+	/* Zero-valued optional attributes are simply omitted. */
+	if (rule->pref)
+		NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
+
+	if (rule->mark)
+		NLA_PUT_U32(skb, FRA_FWMARK, rule->mark);
+
+	if (rule->mark_mask || rule->mark)
+		NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask);
+
+	if (rule->target)
+		NLA_PUT_U32(skb, FRA_GOTO, rule->target);
+
+	if (ops->fill(rule, skb, frh) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump the rules of one family into @skb, resuming at the index stored in
+ * cb->args[1] and storing the index reached back there.  Dumping stops as
+ * soon as a rule no longer fits.  The module reference the caller took on
+ * @ops is dropped here.  Always returns skb->len.
+ */
+static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
+		      struct fib_rules_ops *ops)
+{
+	struct fib_rule *rule;
+	int idx = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+		/* entries below the resume point are only counted */
+		if (!(idx < cb->args[1]) &&
+		    fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
+				     cb->nlh->nlmsg_seq, RTM_NEWRULE,
+				     NLM_F_MULTI, ops) < 0)
+			break;
+
+		idx++;
+	}
+	rcu_read_unlock();
+
+	cb->args[1] = idx;
+	rules_ops_put(ops);
+
+	return skb->len;
+}
+
+/*
+ * RTM_GETRULE dump callback.
+ *
+ * For a specific family only that family's rules are dumped (or
+ * -EAFNOSUPPORT is returned when it has no ops).  For AF_UNSPEC every
+ * registered family is walked; cb->args[0] remembers the family index and
+ * cb->args[1] the rule index within it across invocations.
+ */
+static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_rules_ops *ops;
+	int family = rtnl_msg_family(cb->nlh);
+	int idx = 0;
+
+	if (family != AF_UNSPEC) {
+		/* Protocol specific dump request */
+		ops = lookup_rules_ops(net, family);
+		if (!ops)
+			return -EAFNOSUPPORT;
+
+		return dump_rules(skb, cb, ops);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &net->rules_ops, list) {
+		/* skip families already dumped or whose module is going away */
+		if (!(idx < cb->args[0]) && try_module_get(ops->owner)) {
+			if (dump_rules(skb, cb, ops) < 0)
+				break;
+
+			cb->args[1] = 0;
+		}
+		idx++;
+	}
+	rcu_read_unlock();
+
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+/*
+ * Broadcast a rule change (@event) to the listeners of ops->nlgroup.
+ * If the message cannot be allocated or filled, the error is recorded on
+ * the group with rtnl_set_sk_err() instead.
+ */
+static void notify_rule_change(int event, struct fib_rule *rule,
+			       struct fib_rules_ops *ops, struct nlmsghdr *nlh,
+			       u32 pid)
+{
+	struct net *net = ops->fro_net;
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
+	if (!skb) {
+		err = -ENOBUFS;
+		goto report;
+	}
+
+	err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
+	if (!(err < 0)) {
+		rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+report:
+	rtnl_set_sk_err(net, ops->nlgroup, err);
+}
+
+/*
+ * Bind detached rules to @dev: any rule whose input or output interface
+ * index is -1 and whose stored interface name equals dev->name gets the
+ * device's ifindex.
+ */
+static void attach_rules(struct list_head *rules, struct net_device *dev)
+{
+	struct fib_rule *cur;
+
+	list_for_each_entry(cur, rules, list) {
+		if (cur->iifindex == -1 && !strcmp(dev->name, cur->iifname))
+			cur->iifindex = dev->ifindex;
+
+		if (cur->oifindex == -1 && !strcmp(dev->name, cur->oifname))
+			cur->oifindex = dev->ifindex;
+	}
+}
+
+/*
+ * Detach rules from @dev: every input/output interface index that refers
+ * to the device is reset to -1.
+ */
+static void detach_rules(struct list_head *rules, struct net_device *dev)
+{
+	struct fib_rule *cur;
+
+	list_for_each_entry(cur, rules, list) {
+		if (dev->ifindex == cur->iifindex)
+			cur->iifindex = -1;
+
+		if (dev->ifindex == cur->oifindex)
+			cur->oifindex = -1;
+	}
+}
+
+
+/*
+ * Netdevice notifier: attach rules to a device when it registers and
+ * detach them when it unregisters, for every rule family of the device's
+ * namespace.  Other events are ignored.  Always returns NOTIFY_DONE.
+ */
+static int fib_rules_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+	struct fib_rules_ops *ops;
+
+	ASSERT_RTNL();
+
+	if (event == NETDEV_REGISTER) {
+		list_for_each_entry(ops, &net->rules_ops, list)
+			attach_rules(&ops->rules_list, dev);
+	} else if (event == NETDEV_UNREGISTER) {
+		list_for_each_entry(ops, &net->rules_ops, list)
+			detach_rules(&ops->rules_list, dev);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier block; registered from fib_rules_init(). */
+static struct notifier_block fib_rules_notifier = {
+	.notifier_call = fib_rules_event,
+};
+
+/*
+ * Per-namespace initialisation: set up the lock and the (empty) list that
+ * hold the registered rule families.  Cannot fail.
+ */
+static int __net_init fib_rules_net_init(struct net *net)
+{
+	spin_lock_init(&net->rules_mod_lock);
+	INIT_LIST_HEAD(&net->rules_ops);
+
+	return 0;
+}
+
+/* Per-namespace hooks; only an init hook is provided, no exit hook. */
+static struct pernet_operations fib_rules_net_ops = {
+	.init = fib_rules_net_init,
+};
+
+/*
+ * Subsystem init: register the rtnetlink handlers for rule messages, the
+ * per-namespace setup and the netdevice notifier.  On failure everything
+ * registered so far is undone and the error is returned.
+ */
+static int __init fib_rules_init(void)
+{
+	int err;
+	/* NEWRULE/DELRULE are "doit" handlers, GETRULE is dump-only. */
+	rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule);
+
+	err = register_pernet_subsys(&fib_rules_net_ops);
+	if (err < 0)
+		goto fail;
+
+	err = register_netdevice_notifier(&fib_rules_notifier);
+	if (err < 0)
+		goto fail_unregister;
+
+	return 0;
+
+	/* Unwind in reverse order of registration. */
+fail_unregister:
+	unregister_pernet_subsys(&fib_rules_net_ops);
+fail:
+	rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
+	rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
+	rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+	return err;
+}
+
+subsys_initcall(fib_rules_init);
diff --git a/net/core/filter.c b/net/core/filter.c
new file mode 100644
index 00000000..36f975fa
--- /dev/null
+++ b/net/core/filter.c
@@ -0,0 +1,654 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Author:
+ *     Jay Schulist <jschlst@samba.org>
+ *
+ * Based on the design of:
+ *     - The Berkeley Packet Filter
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <linux/gfp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/filter.h>
+#include <linux/reciprocal_div.h>
+#include <linux/ratelimit.h>
+
+/*
+ * Slow path for negative offsets: translate @k relative to the network
+ * header (k >= SKF_NET_OFF) or the link-layer header (k >= SKF_LL_OFF)
+ * and return a pointer to @size bytes there, provided they lie between
+ * skb->head and the tail pointer.  Returns NULL otherwise.
+ */
+static void *__load_pointer(const struct sk_buff *skb, int k, unsigned int size)
+{
+	u8 *ptr;
+
+	if (k >= SKF_NET_OFF)
+		ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+	else if (k >= SKF_LL_OFF)
+		ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+	else
+		return NULL;
+
+	if (ptr < skb->head || ptr + size > skb_tail_pointer(skb))
+		return NULL;
+
+	return ptr;
+}
+
+/*
+ * Return a pointer to @size bytes of packet data at offset @k.
+ * Non-negative offsets go through skb_header_pointer() (which may copy
+ * into @buffer); negative offsets are the special header-relative ones
+ * handled by __load_pointer().
+ */
+static inline void *load_pointer(const struct sk_buff *skb, int k,
+				 unsigned int size, void *buffer)
+{
+	if (k < 0)
+		return __load_pointer(skb, k, size);
+
+	return skb_header_pointer(skb, k, size, buffer);
+}
+
+/**
+ *	sk_filter - run a packet through a socket filter
+ *	@sk: sock associated with &sk_buff
+ *	@skb: buffer to filter
+ *
+ * Socket level wrapper around the filter program.  After the security
+ * hook has accepted the packet, the filter attached to @sk (if any) is
+ * run and the skb is trimmed to the length the filter returned; a
+ * returned length of 0 means the packet is to be dropped.
+ *
+ * Returns 0 if the packet should be accepted, -EPERM if the filter
+ * rejected it, or the error from security_sock_rcv_skb()/pskb_trim().
+ */
+int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_filter *filter;
+	int err = security_sock_rcv_skb(sk, skb);
+
+	if (err)
+		return err;
+
+	rcu_read_lock();
+	filter = rcu_dereference(sk->sk_filter);
+	if (filter) {
+		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
+
+		if (pkt_len)
+			err = pskb_trim(skb, pkt_len);
+		else
+			err = -EPERM;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(sk_filter);
+
+/**
+ *	sk_run_filter - run a filter on a socket
+ *	@skb: buffer to run the filter on
+ *	@fentry: filter to apply
+ *
+ * Decode and apply filter instructions to the skb->data.
+ * Return length to keep, 0 for none. @skb is the data we are
+ * filtering, @fentry is the array of filter instructions, already
+ * translated to the internal BPF_S_* opcodes by sk_chk_filter().
+ * Because all jumps are guaranteed to be before last instruction,
+ * and last instruction guaranteed to be a RET, we dont need to check
+ * flen. (We used to pass to this function the length of filter)
+ */
+unsigned int sk_run_filter(const struct sk_buff *skb,
+			   const struct sock_filter *fentry)
+{
+	void *ptr;
+	u32 A = 0;			/* Accumulator */
+	u32 X = 0;			/* Index Register */
+	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
+	u32 tmp;
+	int k;
+
+	/*
+	 * Process array of filter instructions.
+	 */
+	for (;; fentry++) {
+#if defined(CONFIG_X86_32)
+#define	K (fentry->k)
+#else
+		const u32 K = fentry->k;
+#endif
+
+		switch (fentry->code) {
+		case BPF_S_ALU_ADD_X:
+			A += X;
+			continue;
+		case BPF_S_ALU_ADD_K:
+			A += K;
+			continue;
+		case BPF_S_ALU_SUB_X:
+			A -= X;
+			continue;
+		case BPF_S_ALU_SUB_K:
+			A -= K;
+			continue;
+		case BPF_S_ALU_MUL_X:
+			A *= X;
+			continue;
+		case BPF_S_ALU_MUL_K:
+			A *= K;
+			continue;
+		case BPF_S_ALU_DIV_X:
+			if (X == 0)
+				return 0;
+			A /= X;
+			continue;
+		case BPF_S_ALU_DIV_K:
+			/* sk_chk_filter() replaced k by reciprocal_value(k) */
+			A = reciprocal_divide(A, K);
+			continue;
+		case BPF_S_ALU_AND_X:
+			A &= X;
+			continue;
+		case BPF_S_ALU_AND_K:
+			A &= K;
+			continue;
+		case BPF_S_ALU_OR_X:
+			A |= X;
+			continue;
+		case BPF_S_ALU_OR_K:
+			A |= K;
+			continue;
+		case BPF_S_ALU_LSH_X:
+			A <<= X;
+			continue;
+		case BPF_S_ALU_LSH_K:
+			A <<= K;
+			continue;
+		case BPF_S_ALU_RSH_X:
+			A >>= X;
+			continue;
+		case BPF_S_ALU_RSH_K:
+			A >>= K;
+			continue;
+		case BPF_S_ALU_NEG:
+			A = -A;
+			continue;
+		case BPF_S_JMP_JA:
+			fentry += K;
+			continue;
+		case BPF_S_JMP_JGT_K:
+			fentry += (A > K) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JGE_K:
+			fentry += (A >= K) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JEQ_K:
+			fentry += (A == K) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JSET_K:
+			fentry += (A & K) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JGT_X:
+			fentry += (A > X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JGE_X:
+			fentry += (A >= X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JEQ_X:
+			fentry += (A == X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_JMP_JSET_X:
+			fentry += (A & X) ? fentry->jt : fentry->jf;
+			continue;
+		case BPF_S_LD_W_ABS:
+			k = K;
+load_w:
+			ptr = load_pointer(skb, k, 4, &tmp);
+			if (ptr != NULL) {
+				A = get_unaligned_be32(ptr);
+				continue;
+			}
+			return 0;
+		case BPF_S_LD_H_ABS:
+			k = K;
+load_h:
+			ptr = load_pointer(skb, k, 2, &tmp);
+			if (ptr != NULL) {
+				A = get_unaligned_be16(ptr);
+				continue;
+			}
+			return 0;
+		case BPF_S_LD_B_ABS:
+			k = K;
+load_b:
+			ptr = load_pointer(skb, k, 1, &tmp);
+			if (ptr != NULL) {
+				A = *(u8 *)ptr;
+				continue;
+			}
+			return 0;
+		case BPF_S_LD_W_LEN:
+			A = skb->len;
+			continue;
+		case BPF_S_LDX_W_LEN:
+			X = skb->len;
+			continue;
+		case BPF_S_LD_W_IND:
+			k = X + K;
+			goto load_w;
+		case BPF_S_LD_H_IND:
+			k = X + K;
+			goto load_h;
+		case BPF_S_LD_B_IND:
+			k = X + K;
+			goto load_b;
+		case BPF_S_LDX_B_MSH:
+			ptr = load_pointer(skb, K, 1, &tmp);
+			if (ptr != NULL) {
+				X = (*(u8 *)ptr & 0xf) << 2;
+				continue;
+			}
+			return 0;
+		case BPF_S_LD_IMM:
+			A = K;
+			continue;
+		case BPF_S_LDX_IMM:
+			X = K;
+			continue;
+		case BPF_S_LD_MEM:
+			A = mem[K];
+			continue;
+		case BPF_S_LDX_MEM:
+			X = mem[K];
+			continue;
+		case BPF_S_MISC_TAX:
+			X = A;
+			continue;
+		case BPF_S_MISC_TXA:
+			A = X;
+			continue;
+		case BPF_S_RET_K:
+			return K;
+		case BPF_S_RET_A:
+			return A;
+		case BPF_S_ST:
+			mem[K] = A;
+			continue;
+		case BPF_S_STX:
+			mem[K] = X;
+			continue;
+		case BPF_S_ANC_PROTOCOL:
+			A = ntohs(skb->protocol);
+			continue;
+		case BPF_S_ANC_PKTTYPE:
+			A = skb->pkt_type;
+			continue;
+		case BPF_S_ANC_IFINDEX:
+			if (!skb->dev)
+				return 0;
+			A = skb->dev->ifindex;
+			continue;
+		case BPF_S_ANC_MARK:
+			A = skb->mark;
+			continue;
+		case BPF_S_ANC_QUEUE:
+			A = skb->queue_mapping;
+			continue;
+		case BPF_S_ANC_HATYPE:
+			if (!skb->dev)
+				return 0;
+			A = skb->dev->type;
+			continue;
+		case BPF_S_ANC_RXHASH:
+			A = skb->rxhash;
+			continue;
+		case BPF_S_ANC_CPU:
+			A = raw_smp_processor_id();
+			continue;
+		case BPF_S_ANC_NLATTR: {
+			struct nlattr *nla;
+
+			if (skb_is_nonlinear(skb))
+				return 0;
+			/*
+			 * Without this check the unsigned subtraction below
+			 * wraps for packets shorter than an attribute header
+			 * and the bound on A becomes meaningless.
+			 */
+			if (skb->len < sizeof(struct nlattr))
+				return 0;
+			if (A > skb->len - sizeof(struct nlattr))
+				return 0;
+
+			nla = nla_find((struct nlattr *)&skb->data[A],
+				       skb->len - A, X);
+			if (nla)
+				A = (void *)nla - (void *)skb->data;
+			else
+				A = 0;
+			continue;
+		}
+		case BPF_S_ANC_NLATTR_NEST: {
+			struct nlattr *nla;
+
+			if (skb_is_nonlinear(skb))
+				return 0;
+			/* see BPF_S_ANC_NLATTR: guard against wrap-around */
+			if (skb->len < sizeof(struct nlattr))
+				return 0;
+			if (A > skb->len - sizeof(struct nlattr))
+				return 0;
+
+			nla = (struct nlattr *)&skb->data[A];
+			/* the attribute must fit in the bytes left after A */
+			if (nla->nla_len > skb->len - A)
+				return 0;
+
+			nla = nla_find_nested(nla, X);
+			if (nla)
+				A = (void *)nla - (void *)skb->data;
+			else
+				A = 0;
+			continue;
+		}
+		default:
+			WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
+				       fentry->code, fentry->jt,
+				       fentry->jf, fentry->k);
+			return 0;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(sk_run_filter);
+
+/*
+ * Security :
+ * A BPF program is able to use 16 cells of memory to store intermediate
+ * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter())
+ * As we dont want to clear mem[] array for each packet going through
+ * sk_run_filter(), we check that filter loaded by user never try to read
+ * a cell if not previously written, and we check all branches to be sure
+ * a malicious user doesn't try to abuse us.
+ *
+ * Works on the already translated (BPF_S_*) opcodes.  masks[pc] holds the
+ * set of cells known to be written on every path that jumps to pc;
+ * memvalid is the set valid on the path currently being followed.
+ *
+ * Returns 0 if every load is preceded by a store on all paths, -EINVAL
+ * if not, or -ENOMEM if the temporary mask array cannot be allocated.
+ */
+static int check_load_and_stores(struct sock_filter *filter, int flen)
+{
+	u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */
+	int pc, ret = 0;
+
+	BUILD_BUG_ON(BPF_MEMWORDS > 16);
+	masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		return -ENOMEM;
+	/* start from "all cells valid"; jumps narrow this down with &= */
+	memset(masks, 0xff, flen * sizeof(*masks));
+
+	for (pc = 0; pc < flen; pc++) {
+		/* merge in what the jumps targeting this instruction know */
+		memvalid &= masks[pc];
+
+		switch (filter[pc].code) {
+		case BPF_S_ST:
+		case BPF_S_STX:
+			memvalid |= (1 << filter[pc].k);
+			break;
+		case BPF_S_LD_MEM:
+		case BPF_S_LDX_MEM:
+			if (!(memvalid & (1 << filter[pc].k))) {
+				ret = -EINVAL;
+				goto error;
+			}
+			break;
+		case BPF_S_JMP_JA:
+			/* a jump must set masks on target */
+			masks[pc + 1 + filter[pc].k] &= memvalid;
+			/* pc + 1 is only reachable through other jumps' masks */
+			memvalid = ~0;
+			break;
+		case BPF_S_JMP_JEQ_K:
+		case BPF_S_JMP_JEQ_X:
+		case BPF_S_JMP_JGE_K:
+		case BPF_S_JMP_JGE_X:
+		case BPF_S_JMP_JGT_K:
+		case BPF_S_JMP_JGT_X:
+		case BPF_S_JMP_JSET_X:
+		case BPF_S_JMP_JSET_K:
+			/* a jump must set masks on targets */
+			masks[pc + 1 + filter[pc].jt] &= memvalid;
+			masks[pc + 1 + filter[pc].jf] &= memvalid;
+			memvalid = ~0;
+			break;
+		}
+	}
+error:
+	kfree(masks);
+	return ret;
+}
+
+/**
+ *	sk_chk_filter - verify socket filter code
+ *	@filter: filter to verify
+ *	@flen: length of filter
+ *
+ * Check the user's filter code. If we let some ugly
+ * filter code slip through kaboom! The filter must contain
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
+ *
+ * All jumps are forward as they are not signed.
+ *
+ * The filter is modified in place: each opcode is replaced by its
+ * internal BPF_S_* value, absolute loads at the SKF_AD_OFF ancillary
+ * offsets become BPF_S_ANC_* opcodes, and the constant of a divide by K
+ * is replaced by its reciprocal_value().
+ *
+ * Returns 0 if the rule set is legal, -EINVAL if not, or -ENOMEM if
+ * check_load_and_stores() could not allocate its work area.
+ */
+int sk_chk_filter(struct sock_filter *filter, int flen)
+{
+	/*
+	 * Valid instructions are initialized to non-0.
+	 * Invalid instructions are initialized to 0.
+	 */
+	static const u8 codes[] = {
+		[BPF_ALU|BPF_ADD|BPF_K]  = BPF_S_ALU_ADD_K,
+		[BPF_ALU|BPF_ADD|BPF_X]  = BPF_S_ALU_ADD_X,
+		[BPF_ALU|BPF_SUB|BPF_K]  = BPF_S_ALU_SUB_K,
+		[BPF_ALU|BPF_SUB|BPF_X]  = BPF_S_ALU_SUB_X,
+		[BPF_ALU|BPF_MUL|BPF_K]  = BPF_S_ALU_MUL_K,
+		[BPF_ALU|BPF_MUL|BPF_X]  = BPF_S_ALU_MUL_X,
+		[BPF_ALU|BPF_DIV|BPF_X]  = BPF_S_ALU_DIV_X,
+		[BPF_ALU|BPF_AND|BPF_K]  = BPF_S_ALU_AND_K,
+		[BPF_ALU|BPF_AND|BPF_X]  = BPF_S_ALU_AND_X,
+		[BPF_ALU|BPF_OR|BPF_K]   = BPF_S_ALU_OR_K,
+		[BPF_ALU|BPF_OR|BPF_X]   = BPF_S_ALU_OR_X,
+		[BPF_ALU|BPF_LSH|BPF_K]  = BPF_S_ALU_LSH_K,
+		[BPF_ALU|BPF_LSH|BPF_X]  = BPF_S_ALU_LSH_X,
+		[BPF_ALU|BPF_RSH|BPF_K]  = BPF_S_ALU_RSH_K,
+		[BPF_ALU|BPF_RSH|BPF_X]  = BPF_S_ALU_RSH_X,
+		[BPF_ALU|BPF_NEG]        = BPF_S_ALU_NEG,
+		[BPF_LD|BPF_W|BPF_ABS]   = BPF_S_LD_W_ABS,
+		[BPF_LD|BPF_H|BPF_ABS]   = BPF_S_LD_H_ABS,
+		[BPF_LD|BPF_B|BPF_ABS]   = BPF_S_LD_B_ABS,
+		[BPF_LD|BPF_W|BPF_LEN]   = BPF_S_LD_W_LEN,
+		[BPF_LD|BPF_W|BPF_IND]   = BPF_S_LD_W_IND,
+		[BPF_LD|BPF_H|BPF_IND]   = BPF_S_LD_H_IND,
+		[BPF_LD|BPF_B|BPF_IND]   = BPF_S_LD_B_IND,
+		[BPF_LD|BPF_IMM]         = BPF_S_LD_IMM,
+		[BPF_LDX|BPF_W|BPF_LEN]  = BPF_S_LDX_W_LEN,
+		[BPF_LDX|BPF_B|BPF_MSH]  = BPF_S_LDX_B_MSH,
+		[BPF_LDX|BPF_IMM]        = BPF_S_LDX_IMM,
+		[BPF_MISC|BPF_TAX]       = BPF_S_MISC_TAX,
+		[BPF_MISC|BPF_TXA]       = BPF_S_MISC_TXA,
+		[BPF_RET|BPF_K]          = BPF_S_RET_K,
+		[BPF_RET|BPF_A]          = BPF_S_RET_A,
+		[BPF_ALU|BPF_DIV|BPF_K]  = BPF_S_ALU_DIV_K,
+		[BPF_LD|BPF_MEM]         = BPF_S_LD_MEM,
+		[BPF_LDX|BPF_MEM]        = BPF_S_LDX_MEM,
+		[BPF_ST]                 = BPF_S_ST,
+		[BPF_STX]                = BPF_S_STX,
+		[BPF_JMP|BPF_JA]         = BPF_S_JMP_JA,
+		[BPF_JMP|BPF_JEQ|BPF_K]  = BPF_S_JMP_JEQ_K,
+		[BPF_JMP|BPF_JEQ|BPF_X]  = BPF_S_JMP_JEQ_X,
+		[BPF_JMP|BPF_JGE|BPF_K]  = BPF_S_JMP_JGE_K,
+		[BPF_JMP|BPF_JGE|BPF_X]  = BPF_S_JMP_JGE_X,
+		[BPF_JMP|BPF_JGT|BPF_K]  = BPF_S_JMP_JGT_K,
+		[BPF_JMP|BPF_JGT|BPF_X]  = BPF_S_JMP_JGT_X,
+		[BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K,
+		[BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X,
+	};
+	int pc;
+
+	if (flen == 0 || flen > BPF_MAXINSNS)
+		return -EINVAL;
+
+	/* check the filter code now */
+	for (pc = 0; pc < flen; pc++) {
+		struct sock_filter *ftest = &filter[pc];
+		u16 code = ftest->code;
+
+		/* translate the user opcode; 0 in the table means "invalid" */
+		if (code >= ARRAY_SIZE(codes))
+			return -EINVAL;
+		code = codes[code];
+		if (!code)
+			return -EINVAL;
+		/* Some instructions need special checks */
+		switch (code) {
+		case BPF_S_ALU_DIV_K:
+			/* check for division by zero */
+			if (ftest->k == 0)
+				return -EINVAL;
+			/* sk_run_filter() divides with reciprocal_divide() */
+			ftest->k = reciprocal_value(ftest->k);
+			break;
+		case BPF_S_LD_MEM:
+		case BPF_S_LDX_MEM:
+		case BPF_S_ST:
+		case BPF_S_STX:
+			/* check for invalid memory addresses */
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			break;
+		case BPF_S_JMP_JA:
+			/*
+			 * Note, the large ftest->k might cause loops.
+			 * Compare this with conditional jumps below,
+			 * where offsets are limited. --ANK (981016)
+			 */
+			if (ftest->k >= (unsigned)(flen-pc-1))
+				return -EINVAL;
+			break;
+		case BPF_S_JMP_JEQ_K:
+		case BPF_S_JMP_JEQ_X:
+		case BPF_S_JMP_JGE_K:
+		case BPF_S_JMP_JGE_X:
+		case BPF_S_JMP_JGT_K:
+		case BPF_S_JMP_JGT_X:
+		case BPF_S_JMP_JSET_X:
+		case BPF_S_JMP_JSET_K:
+			/* for conditionals both must be safe */
+			if (pc + ftest->jt + 1 >= flen ||
+			    pc + ftest->jf + 1 >= flen)
+				return -EINVAL;
+			break;
+		case BPF_S_LD_W_ABS:
+		case BPF_S_LD_H_ABS:
+		case BPF_S_LD_B_ABS:
+			/*
+			 * An absolute load from one of the ancillary offsets
+			 * is rewritten into the matching BPF_S_ANC_* opcode;
+			 * any other offset stays a plain load.
+			 */
+#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE:	\
+				code = BPF_S_ANC_##CODE;	\
+				break
+			switch (ftest->k) {
+			ANCILLARY(PROTOCOL);
+			ANCILLARY(PKTTYPE);
+			ANCILLARY(IFINDEX);
+			ANCILLARY(NLATTR);
+			ANCILLARY(NLATTR_NEST);
+			ANCILLARY(MARK);
+			ANCILLARY(QUEUE);
+			ANCILLARY(HATYPE);
+			ANCILLARY(RXHASH);
+			ANCILLARY(CPU);
+			}
+		}
+		/* store the internal opcode back into the program */
+		ftest->code = code;
+	}
+
+	/* last instruction must be a RET code */
+	switch (filter[flen - 1].code) {
+	case BPF_S_RET_K:
+	case BPF_S_RET_A:
+		return check_load_and_stores(filter, flen);
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(sk_chk_filter);
+
+/**
+ *	sk_filter_release_rcu - free a socket filter from an RCU callback
+ *	@rcu: the rcu_head embedded in the sk_filter being released
+ *
+ * Releases the JIT image of the filter (if any) and then the filter
+ * itself.
+ */
+void sk_filter_release_rcu(struct rcu_head *rcu)
+{
+	struct sk_filter *filter;
+
+	filter = container_of(rcu, struct sk_filter, rcu);
+	bpf_jit_free(filter);
+	kfree(filter);
+}
+EXPORT_SYMBOL(sk_filter_release_rcu);
+
+/**
+ *	sk_attach_filter - attach a socket filter
+ *	@fprog: the filter program
+ *	@sk: the socket to use
+ *
+ * Copy the user supplied program into socket-charged memory, verify it
+ * with sk_chk_filter(), hand it to the JIT and publish it as the socket's
+ * filter.  A previously attached filter is uncharged afterwards.
+ *
+ * Returns 0 on success, -EINVAL for a missing or illegal program,
+ * -ENOMEM if memory is short and -EFAULT if the program cannot be copied
+ * from user space.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+	struct sk_filter *fp, *old_fp;
+	unsigned int fsize;
+	int err;
+
+	/* Make sure new filter is there and in the right amounts. */
+	if (!fprog->filter)
+		return -EINVAL;
+
+	fsize = sizeof(struct sock_filter) * fprog->len;
+
+	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+	if (fp == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
+		return -EFAULT;
+	}
+
+	fp->bpf_func = sk_run_filter;
+	fp->len = fprog->len;
+	atomic_set(&fp->refcnt, 1);
+
+	err = sk_chk_filter(fp->insns, fp->len);
+	if (err) {
+		sk_filter_uncharge(sk, fp);
+		return err;
+	}
+
+	bpf_jit_compile(fp);
+
+	/* swap in the new program, then let go of the old one */
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   sock_owned_by_user(sk));
+	rcu_assign_pointer(sk->sk_filter, fp);
+	if (old_fp)
+		sk_filter_uncharge(sk, old_fp);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_attach_filter);
+
+/*
+ * Remove the filter currently attached to @sk, if any.
+ * Returns 0 when a filter was detached, -ENOENT when none was attached.
+ */
+int sk_detach_filter(struct sock *sk)
+{
+	struct sk_filter *filter;
+
+	filter = rcu_dereference_protected(sk->sk_filter,
+					   sock_owned_by_user(sk));
+	if (!filter)
+		return -ENOENT;
+
+	rcu_assign_pointer(sk->sk_filter, NULL);
+	sk_filter_uncharge(sk, filter);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sk_detach_filter);
diff --git a/net/core/flow.c b/net/core/flow.c
new file mode 100644
index 00000000..a6bda2a5
--- /dev/null
+++ b/net/core/flow.c
@@ -0,0 +1,437 @@
+/* flow.c: Generic flow cache.
+ *
+ * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/completion.h>
+#include <linux/percpu.h>
+#include <linux/bitops.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/mutex.h>
+#include <net/flow.h>
+#include <asm/atomic.h>
+#include <linux/security.h>
+
+/* One cached flow resolution, kept in a per-cpu hash table. */
+struct flow_cache_entry {
+	/* hlist while hashed; reused as gc_list once unhashed and queued for freeing */
+	union {
+		struct hlist_node	hlist;
+		struct list_head	gc_list;
+	} u;
+	u16				family;
+	u8				dir;
+	u32				genid;	/* flow_cache_genid value at last resolve */
+	struct flowi			key;
+	struct flow_cache_object	*object;	/* may be NULL */
+};
+
+/* Per-cpu part of the cache: a private hash table and its bookkeeping. */
+struct flow_cache_percpu {
+	struct hlist_head		*hash_table;	/* NULL until flow_cache_cpu_prepare() */
+	int				hash_count;	/* entries currently hashed */
+	u32				hash_rnd;	/* seed passed to jhash2() */
+	int				hash_rnd_recalc;	/* non-zero: pick a new hash_rnd */
+	struct tasklet_struct		flush_tasklet;
+};
+
+/* State shared by all cpus during one flow_cache_flush() run. */
+struct flow_flush_info {
+	struct flow_cache		*cache;
+	atomic_t			cpuleft;	/* cpus that have not finished flushing */
+	struct completion		completion;	/* completed when cpuleft hits zero */
+};
+
+/* Cache descriptor; the hash tables themselves live in the percpu area. */
+struct flow_cache {
+	u32				hash_shift;	/* log2 of buckets per cpu table */
+	struct flow_cache_percpu __percpu *percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;	/* entry target used when shrinking */
+	int				high_watermark;	/* per-cpu count that triggers a shrink */
+	struct timer_list		rnd_timer;	/* periodically requests new hash seeds */
+};
+
+atomic_t flow_cache_genid = ATOMIC_INIT(0);
+EXPORT_SYMBOL(flow_cache_genid);
+static struct flow_cache flow_cache_global;
+static struct kmem_cache *flow_cachep __read_mostly;
+
+static DEFINE_SPINLOCK(flow_cache_gc_lock);
+static LIST_HEAD(flow_cache_gc_list);
+
+#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
+
+/*
+ * Timer handler: flag every possible cpu so that it picks a new hash seed,
+ * then re-arm the timer FLOW_HASH_RND_PERIOD jiffies from now.
+ */
+static void flow_cache_new_hashrnd(unsigned long arg)
+{
+	struct flow_cache *cache = (void *) arg;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct flow_cache_percpu *pcpu;
+
+		pcpu = per_cpu_ptr(cache->percpu, cpu);
+		pcpu->hash_rnd_recalc = 1;
+	}
+
+	cache->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&cache->rnd_timer);
+}
+
+/*
+ * Return 1 when @fle carries the current flow_cache_genid and its object,
+ * if it has one, passes its own ->check(); return 0 otherwise.
+ */
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	struct flow_cache_object *obj;
+
+	if (fle->genid != atomic_read(&flow_cache_genid))
+		return 0;
+
+	obj = fle->object;
+	if (obj == NULL)
+		return 1;
+
+	return obj->ops->check(obj) ? 1 : 0;
+}
+
+/* Hand the entry's object (if any) to its ->delete() and free the entry. */
+static void flow_entry_kill(struct flow_cache_entry *fle)
+{
+	struct flow_cache_object *obj = fle->object;
+
+	if (obj != NULL)
+		obj->ops->delete(obj);
+
+	kmem_cache_free(flow_cachep, fle);
+}
+
+/* Work handler: detach the global garbage list and kill every entry on it. */
+static void flow_cache_gc_task(struct work_struct *work)
+{
+	struct flow_cache_entry *victim, *next;
+	LIST_HEAD(doomed);
+
+	/* Grab the whole list under the lock; the freeing happens outside it. */
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail_init(&flow_cache_gc_list, &doomed);
+	spin_unlock_bh(&flow_cache_gc_lock);
+
+	list_for_each_entry_safe(victim, next, &doomed, u.gc_list)
+		flow_entry_kill(victim);
+}
+static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
+
+/*
+ * Account for @deleted entries removed from @fcp and pass @gc_list to the
+ * garbage-collection work item.  Does nothing when @deleted is zero.
+ */
+static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
+				     int deleted, struct list_head *gc_list)
+{
+	if (!deleted)
+		return;
+
+	fcp->hash_count -= deleted;
+
+	spin_lock_bh(&flow_cache_gc_lock);
+	list_splice_tail(gc_list, &flow_cache_gc_list);
+	spin_unlock_bh(&flow_cache_gc_lock);
+
+	schedule_work(&flow_cache_gc_work);
+}
+
+/*
+ * Walk every bucket of @fcp, keeping at most @shrink_to valid entries per
+ * bucket.  Everything else is unhashed and queued for deferred freeing.
+ */
+static void __flow_cache_shrink(struct flow_cache *fc,
+				struct flow_cache_percpu *fcp,
+				int shrink_to)
+{
+	struct flow_cache_entry *fle;
+	struct hlist_node *pos, *next;
+	LIST_HEAD(gc_list);
+	int bucket;
+	int deleted = 0;
+
+	for (bucket = 0; bucket < flow_cache_hash_size(fc); bucket++) {
+		int kept = 0;
+
+		hlist_for_each_entry_safe(fle, pos, next,
+					  &fcp->hash_table[bucket], u.hlist) {
+			if (kept < shrink_to && flow_entry_valid(fle)) {
+				kept++;
+				continue;
+			}
+
+			/* u.hlist and u.gc_list share storage: unhash first. */
+			hlist_del(&fle->u.hlist);
+			list_add_tail(&fle->u.gc_list, &gc_list);
+			deleted++;
+		}
+	}
+
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+}
+
+/* Shrink @fcp so each bucket keeps at most low_watermark / table-size entries. */
+static void flow_cache_shrink(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
+{
+	__flow_cache_shrink(fc, fcp,
+			    fc->low_watermark / flow_cache_hash_size(fc));
+}
+
+/*
+ * Give @fcp a fresh random hash seed and clear the pending-recalc flag.
+ * Entries hashed with the old seed are all dropped (shrink to zero).
+ */
+static void flow_new_hash_rnd(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
+{
+	get_random_bytes(&fcp->hash_rnd, sizeof(fcp->hash_rnd));
+	fcp->hash_rnd_recalc = 0;
+
+	__flow_cache_shrink(fc, fcp, 0);
+}
+
+/*
+ * Hash the first @keysize flow_compare_t units of @key with the per-cpu
+ * seed and reduce the result to a bucket index of @fc's table.
+ */
+static u32 flow_hash_code(struct flow_cache *fc,
+			  struct flow_cache_percpu *fcp,
+			  const struct flowi *key,
+			  size_t keysize)
+{
+	/* jhash2() takes its length in u32 words. */
+	const u32 words = keysize * sizeof(flow_compare_t) / sizeof(u32);
+	const u32 mask = flow_cache_hash_size(fc) - 1;
+
+	return jhash2((const u32 *) key, words, fcp->hash_rnd) & mask;
+}
+
+/* memcmp would work here, but it cannot rely on the alignment of the keys
+ * the way a flow_compare_t-sized comparison can.
+ *
+ * Returns 0 when the first @keysize units match, 1 otherwise.  At least
+ * one unit is always compared.
+ */
+static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
+			    size_t keysize)
+{
+	const flow_compare_t *a = (const flow_compare_t *) key1;
+	const flow_compare_t *b = (const flow_compare_t *) key2;
+	size_t i = 0;
+
+	do {
+		if (a[i] != b[i])
+			return 1;
+	} while (++i < keysize);
+
+	return 0;
+}
+
+/*
+ * Look up @key in this cpu's flow cache and return the cached object, or
+ * call @resolver to (re)build it when the entry is missing or stale.
+ *
+ * Bottom halves are disabled from entry until ret_object.  The return value
+ * is whatever the cached ->get() or @resolver produced, so it may be NULL
+ * or an ERR_PTR value.
+ */
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
+		  flow_resolve_t resolver, void *ctx)
+{
+	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache_percpu *fcp;
+	struct flow_cache_entry *fle, *tfle;
+	struct hlist_node *entry;
+	struct flow_cache_object *flo;
+	size_t keysize;
+	unsigned int hash;
+
+	local_bh_disable();
+	fcp = this_cpu_ptr(fc->percpu);
+
+	fle = NULL;
+	flo = NULL;
+
+	/* A key size of zero means this family cannot be cached. */
+	keysize = flow_key_size(family);
+	if (!keysize)
+		goto nocache;
+
+	/* Packet really early in init?  Making flow_cache_init a
+	 * pre-smp initcall would solve this.  --RR */
+	if (!fcp->hash_table)
+		goto nocache;
+
+	/* A re-seed was requested: this also empties the table. */
+	if (fcp->hash_rnd_recalc)
+		flow_new_hash_rnd(fc, fcp);
+
+	hash = flow_hash_code(fc, fcp, key, keysize);
+	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
+		if (tfle->family == family &&
+		    tfle->dir == dir &&
+		    flow_key_compare(key, &tfle->key, keysize) == 0) {
+			fle = tfle;
+			break;
+		}
+	}
+
+	if (unlikely(!fle)) {
+		/* Miss: make room if needed, then insert an empty entry. */
+		if (fcp->hash_count > fc->high_watermark)
+			flow_cache_shrink(fc, fcp);
+
+		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
+		if (fle) {
+			fle->family = family;
+			fle->dir = dir;
+			memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
+			fle->object = NULL;
+			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
+			fcp->hash_count++;
+		}
+	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
+		/* Hit with current genid: return the object unless ->get() fails. */
+		flo = fle->object;
+		if (!flo)
+			goto ret_object;
+		flo = flo->ops->get(flo);
+		if (flo)
+			goto ret_object;
+	} else if (fle->object) {
+		/* Stale entry: drop its object before resolving again. */
+	        flo = fle->object;
+	        flo->ops->delete(flo);
+	        fle->object = NULL;
+	}
+
+	/*
+	 * Reached with fle == NULL (no caching possible or allocation failed)
+	 * or with an entry that needs to be (re)resolved.  The entry's current
+	 * object, if any, is handed to the resolver.
+	 */
+nocache:
+	flo = NULL;
+	if (fle) {
+		flo = fle->object;
+		fle->object = NULL;
+	}
+	flo = resolver(net, key, family, dir, flo, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (!IS_ERR(flo))
+			fle->object = flo;
+		else
+			/* Make genid differ from the one just read. */
+			fle->genid--;
+	} else {
+		/* No entry to store the object in; flo is still returned. */
+		if (flo && !IS_ERR(flo))
+			flo->ops->delete(flo);
+	}
+ret_object:
+	local_bh_enable();
+	return flo;
+}
+EXPORT_SYMBOL(flow_cache_lookup);
+
+/*
+ * Remove every invalid entry from this cpu's table and queue it for
+ * freeing, then report completion through the shared flow_flush_info
+ * passed in @data.
+ */
+static void flow_cache_flush_tasklet(unsigned long data)
+{
+	struct flow_flush_info *info = (void *)data;
+	struct flow_cache *fc = info->cache;
+	struct flow_cache_percpu *fcp = this_cpu_ptr(fc->percpu);
+	struct flow_cache_entry *fle;
+	struct hlist_node *pos, *next;
+	LIST_HEAD(gc_list);
+	int bucket;
+	int deleted = 0;
+
+	for (bucket = 0; bucket < flow_cache_hash_size(fc); bucket++) {
+		hlist_for_each_entry_safe(fle, pos, next,
+					  &fcp->hash_table[bucket], u.hlist) {
+			if (!flow_entry_valid(fle)) {
+				deleted++;
+				hlist_del(&fle->u.hlist);
+				list_add_tail(&fle->u.gc_list, &gc_list);
+			}
+		}
+	}
+
+	flow_cache_queue_garbage(fcp, deleted, &gc_list);
+
+	/* The last cpu to finish wakes up the waiter in flow_cache_flush(). */
+	if (atomic_dec_and_test(&info->cpuleft))
+		complete(&info->completion);
+}
+
+/*
+ * Cross-cpu call target: point the executing cpu's flush tasklet at @data
+ * (a struct flow_flush_info) and schedule it.
+ */
+static void flow_cache_flush_per_cpu(void *data)
+{
+	struct flow_flush_info *info = data;
+	struct flow_cache_percpu *fcp;
+
+	fcp = per_cpu_ptr(info->cache->percpu, smp_processor_id());
+	fcp->flush_tasklet.data = (unsigned long)info;
+	tasklet_schedule(&fcp->flush_tasklet);
+}
+
+/*
+ * Drop all invalid entries from the global flow cache on every online cpu
+ * and wait until each cpu has finished.  Takes a mutex and waits for a
+ * completion; concurrent callers are serialised by flow_flush_sem.
+ */
+void flow_cache_flush(void)
+{
+	struct flow_flush_info info;
+	static DEFINE_MUTEX(flow_flush_sem);
+
+	/* Don't want cpus going down or up during this. */
+	get_online_cpus();
+	mutex_lock(&flow_flush_sem);
+	info.cache = &flow_cache_global;
+	/* One decrement is expected from each online cpu, this one included. */
+	atomic_set(&info.cpuleft, num_online_cpus());
+	init_completion(&info.completion);
+
+	/*
+	 * Schedule the flush tasklet through a cross-cpu call and run the
+	 * same flush directly for the local cpu, with bottom halves disabled.
+	 */
+	local_bh_disable();
+	smp_call_function(flow_cache_flush_per_cpu, &info, 0);
+	flow_cache_flush_tasklet((unsigned long)&info);
+	local_bh_enable();
+
+	/* info lives on this stack frame: do not return before all are done. */
+	wait_for_completion(&info.completion);
+	mutex_unlock(&flow_flush_sem);
+	put_online_cpus();
+}
+
+/*
+ * Allocate and initialise the hash table of @cpu unless it already has
+ * one.  Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
+{
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+	size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
+
+	/* Already prepared earlier: nothing to do. */
+	if (fcp->hash_table)
+		return 0;
+
+	fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+	if (!fcp->hash_table) {
+		pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
+		return -ENOMEM;
+	}
+
+	/* Request a seed on first use; the table starts out empty. */
+	fcp->hash_rnd_recalc = 1;
+	fcp->hash_count = 0;
+	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+
+	return 0;
+}
+
+/*
+ * CPU hotplug notifier: prepare the per-cpu table before a cpu comes up
+ * and empty it once the cpu is dead.  Other actions are ignored.
+ */
+static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
+			  unsigned long action,
+			  void *hcpu)
+{
+	struct flow_cache *fc;
+	int cpu = (unsigned long) hcpu;
+	int err;
+
+	fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		err = flow_cache_cpu_prepare(fc, cpu);
+		if (err)
+			return notifier_from_errno(err);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		__flow_cache_shrink(fc, per_cpu_ptr(fc->percpu, cpu), 0);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Set up @fc: 2^10 buckets per cpu, watermarks of 2x and 4x that number,
+ * hash tables for the cpus currently online, the cpu hotplug notifier and
+ * the periodic re-seed timer.
+ *
+ * Returns 0 on success or -ENOMEM.
+ *
+ * NOTE(review): when flow_cache_cpu_prepare() fails, fc->percpu and any
+ * tables already allocated are not released here; the notifier and timer
+ * are not set up in that case.
+ */
+static int __init flow_cache_init(struct flow_cache *fc)
+{
+	int i;
+
+	fc->hash_shift = 10;
+	fc->low_watermark = 2 * flow_cache_hash_size(fc);
+	fc->high_watermark = 4 * flow_cache_hash_size(fc);
+
+	fc->percpu = alloc_percpu(struct flow_cache_percpu);
+	if (!fc->percpu)
+		return -ENOMEM;
+
+	for_each_online_cpu(i) {
+		if (flow_cache_cpu_prepare(fc, i))
+			return -ENOMEM;
+	}
+	/* Cpus that come online later are prepared by the notifier. */
+	fc->hotcpu_notifier = (struct notifier_block){
+		.notifier_call = flow_cache_cpu,
+	};
+	register_hotcpu_notifier(&fc->hotcpu_notifier);
+
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+		    (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
+
+	return 0;
+}
+
+/*
+ * Initcall: create the slab for flow_cache_entry objects (created with
+ * SLAB_PANIC, so the result is not checked) and set up the global cache.
+ */
+static int __init flow_cache_init_global(void)
+{
+	flow_cachep = kmem_cache_create("flow_cache",
+					sizeof(struct flow_cache_entry),
+					0, SLAB_PANIC, NULL);
+
+	return flow_cache_init(&flow_cache_global);
+}
+
+module_init(flow_cache_init_global);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
new file mode 100644
index 00000000..43b03dd7
--- /dev/null
+++ b/net/core/gen_estimator.c
@@ -0,0 +1,322 @@
+/*
+ * net/sched/gen_estimator.c	Simple rate estimator.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *              Jamal Hadi Salim - moved it to net/core and reshuffled
+ *              names to make it usable in general net subsystem.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/gen_stats.h>
+
+/*
+   This code is NOT intended to be used for statistics collection,
+   its purpose is to provide a base for statistical multiplexing
+   for controlled load service.
+   If you need only statistics, run a user level daemon which
+   periodically reads byte counters.
+
+   Unfortunately, rate estimation is not a very easy task.
+   F.e. I did not find a simple way to estimate the current peak rate
+   and even failed to formulate the problem 8)8)
+
+   So I preferred not to build an estimator into the scheduler,
+   but run this task separately.
+   Ideally, it should be kernel thread(s), but for now it runs
+   from timers, which puts apparent top bounds on the number of rated
+   flows, has minimal overhead on small, but is enough
+   to handle controlled load service, sets of aggregates.
+
+   We measure rate over A=(1<<interval) seconds and evaluate EWMA:
+
+   avrate = avrate*(1-W) + rate*W
+
+   where W is chosen as negative power of 2: W = 2^(-ewma_log)
+
+   The resulting time constant is:
+
+   T = A/(-ln(1-W))
+
+
+   NOTES.
+
+   * avbps is scaled by 2^5, avpps is scaled by 2^10.
+   * both values are reported as 32 bit unsigned values. bps can
+     overflow for fast links : max speed being 34360Mbit/sec
+   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
+     for HZ=100 and HZ=1024 8)), maximal interval
+     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
+     are too expensive, longer ones can be implemented
+     at user level painlessly.
+ */
+
+#define EST_MAX_INTERVAL	5
+
+/* One rate estimator, linked on a per-interval list and in est_root. */
+struct gen_estimator
+{
+	struct list_head	list;		/* on elist[idx].list */
+	struct gnet_stats_basic_packed	*bstats;	/* source counters; NULL once killed */
+	struct gnet_stats_rate_est	*rate_est;	/* where the estimate is stored */
+	spinlock_t		*stats_lock;	/* taken around each sample */
+	int			ewma_log;	/* W = 2^(-ewma_log) */
+	u64			last_bytes;	/* byte counter at previous sample */
+	u64			avbps;		/* average, scaled by 2^5 */
+	u32			last_packets;	/* packet counter at previous sample */
+	u32			avpps;		/* average, scaled by 2^10 */
+	struct rcu_head		e_rcu;		/* for kfree_rcu() */
+	struct rb_node		node;		/* in est_root, ordered by bstats */
+};
+
+/* All estimators sharing one sampling interval, plus the timer driving them. */
+struct gen_estimator_head
+{
+	struct timer_list	timer;
+	struct list_head	list;
+};
+
+static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
+
+/* Protects against NULL dereference */
+static DEFINE_RWLOCK(est_lock);
+
+/* Protects against soft lockup during large deletion */
+static struct rb_root est_root = RB_ROOT;
+static DEFINE_SPINLOCK(est_tree_lock);
+
+/*
+ * Timer handler for interval index @arg: take one sample from every
+ * estimator on elist[arg], update its EWMA byte and packet rates, and
+ * re-arm the timer while the list is not empty.
+ */
+static void est_timer(unsigned long arg)
+{
+	int idx = (int)arg;
+	struct gen_estimator *e;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &elist[idx].list, list) {
+		u64 nbytes;
+		u64 brate;
+		u32 npackets;
+		u32 rate;
+
+		spin_lock(e->stats_lock);
+		read_lock(&est_lock);
+		/* gen_kill_estimator() clears bstats under the write lock. */
+		if (e->bstats == NULL)
+			goto skip;
+
+		nbytes = e->bstats->bytes;
+		npackets = e->bstats->packets;
+		/*
+		 * The period is (HZ/4) << idx, i.e. 2^(idx-2) seconds, and
+		 * avbps is scaled by 2^5: shift the delta by 5 + 2 - idx.
+		 */
+		brate = (nbytes - e->last_bytes)<<(7 - idx);
+		e->last_bytes = nbytes;
+		/* avbps = avbps*(1-W) + brate*W with W = 2^(-ewma_log) */
+		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
+		e->rate_est->bps = (e->avbps+0xF)>>5;
+
+		/* Same computation for packets, with a 2^10 scale. */
+		rate = (npackets - e->last_packets)<<(12 - idx);
+		e->last_packets = npackets;
+		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
+		e->rate_est->pps = (e->avpps+0x1FF)>>10;
+skip:
+		read_unlock(&est_lock);
+		spin_unlock(e->stats_lock);
+	}
+
+	if (!list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+	rcu_read_unlock();
+}
+
+/*
+ * Insert @est into est_root, ordered by the address of its bstats.
+ * Entries with an equal key are placed in the left subtree.
+ */
+static void gen_add_node(struct gen_estimator *est)
+{
+	struct rb_node **link = &est_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link != NULL) {
+		struct gen_estimator *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct gen_estimator, node);
+
+		link = (est->bstats > cur->bstats) ? &parent->rb_right
+						   : &parent->rb_left;
+	}
+
+	rb_link_node(&est->node, parent, link);
+	rb_insert_color(&est->node, &est_root);
+}
+
+/*
+ * Search est_root for an estimator whose bstats and rate_est both match.
+ * Returns the estimator, or NULL when there is none.
+ */
+static
+struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
+				    const struct gnet_stats_rate_est *rate_est)
+{
+	struct rb_node *pos = est_root.rb_node;
+
+	while (pos != NULL) {
+		struct gen_estimator *cur;
+
+		cur = rb_entry(pos, struct gen_estimator, node);
+
+		if (bstats > cur->bstats) {
+			pos = pos->rb_right;
+			continue;
+		}
+
+		if (bstats == cur->bstats && rate_est == cur->rate_est)
+			return cur;
+
+		/* Smaller key, or same key with another rate_est: go left. */
+		pos = pos->rb_left;
+	}
+
+	return NULL;
+}
+
+/**
+ * gen_new_estimator - create a new rate estimator
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ * @stats_lock: statistics lock
+ * @opt: rate estimator configuration TLV
+ *
+ * Creates a new rate estimator with &bstats as source and &rate_est
+ * as destination. A new timer with the interval specified in the
+ * configuration TLV is created. Upon each interval, the latest statistics
+ * will be read from &bstats and the estimated rate will be stored in
+ * &rate_est with the statistics lock grabbed during this period.
+ *
+ * Returns 0 on success or a negative error code (-EINVAL for a short TLV
+ * or an interval outside -2..3, -ENOBUFS when allocation fails).
+ *
+ */
+int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
+		      struct gnet_stats_rate_est *rate_est,
+		      spinlock_t *stats_lock,
+		      struct nlattr *opt)
+{
+	struct gen_estimator *est;
+	struct gnet_estimator *parm = nla_data(opt);
+	int idx;
+
+	if (nla_len(opt) < sizeof(*parm))
+		return -EINVAL;
+
+	/* Only intervals -2..3 map onto the elist[] slots 0..5. */
+	if (parm->interval < -2 || parm->interval > 3)
+		return -EINVAL;
+
+	est = kzalloc(sizeof(*est), GFP_KERNEL);
+	if (est == NULL)
+		return -ENOBUFS;
+
+	idx = parm->interval + 2;
+	est->bstats = bstats;
+	est->rate_est = rate_est;
+	est->stats_lock = stats_lock;
+	est->ewma_log = parm->ewma_log;
+	est->last_bytes = bstats->bytes;
+	/*
+	 * Widen before scaling: avbps is a 64-bit value scaled by 2^5, so
+	 * shifting in the narrower type of rate_est->bps would drop the top
+	 * bits of a large initial rate.
+	 */
+	est->avbps = (u64)rate_est->bps << 5;
+	est->last_packets = bstats->packets;
+	est->avpps = rate_est->pps<<10;
+
+	spin_lock_bh(&est_tree_lock);
+	/* First estimator ever for this interval: set up list and timer. */
+	if (!elist[idx].timer.function) {
+		INIT_LIST_HEAD(&elist[idx].list);
+		setup_timer(&elist[idx].timer, est_timer, idx);
+	}
+
+	/* est_timer() stops re-arming on an empty list, so start it again. */
+	if (list_empty(&elist[idx].list))
+		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
+
+	list_add_rcu(&est->list, &elist[idx].list);
+	gen_add_node(est);
+	spin_unlock_bh(&est_tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(gen_new_estimator);
+
+/**
+ * gen_kill_estimator - remove a rate estimator
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ *
+ * Removes every rate estimator registered for &bstats and &rate_est.
+ *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
+ */
+void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
+			struct gnet_stats_rate_est *rate_est)
+{
+	struct gen_estimator *e;
+
+	spin_lock_bh(&est_tree_lock);
+	/* Loop: more than one estimator may match the same pair. */
+	while ((e = gen_find_node(bstats, rate_est))) {
+		rb_erase(&e->node, &est_root);
+
+		/*
+		 * est_timer() checks bstats under the read lock and skips
+		 * the estimator once it is NULL.
+		 */
+		write_lock(&est_lock);
+		e->bstats = NULL;
+		write_unlock(&est_lock);
+
+		list_del_rcu(&e->list);
+		kfree_rcu(e, e_rcu);
+	}
+	spin_unlock_bh(&est_tree_lock);
+}
+EXPORT_SYMBOL(gen_kill_estimator);
+
+/**
+ * gen_replace_estimator - replace rate estimator configuration
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ * @stats_lock: statistics lock
+ * @opt: rate estimator configuration TLV
+ *
+ * Replaces the configuration of a rate estimator by calling
+ * gen_kill_estimator() and gen_new_estimator().
+ *
+ * The old estimator is removed before the new one is created, so if
+ * gen_new_estimator() fails no estimator is left for this pair.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
+			  struct gnet_stats_rate_est *rate_est,
+			  spinlock_t *stats_lock, struct nlattr *opt)
+{
+	gen_kill_estimator(bstats, rate_est);
+	return gen_new_estimator(bstats, rate_est, stats_lock, opt);
+}
+EXPORT_SYMBOL(gen_replace_estimator);
+
+/**
+ * gen_estimator_active - test if estimator is currently in use
+ * @bstats: basic statistics
+ * @rate_est: rate estimator statistics
+ *
+ * Looks the pair up in the estimator tree under est_tree_lock.
+ *
+ * Returns true if estimator is active, and false if not.
+ */
+bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
+			  const struct gnet_stats_rate_est *rate_est)
+{
+	struct gen_estimator *found;
+
+	ASSERT_RTNL();
+
+	spin_lock_bh(&est_tree_lock);
+	found = gen_find_node(bstats, rate_est);
+	spin_unlock_bh(&est_tree_lock);
+
+	return found != NULL;
+}
+EXPORT_SYMBOL(gen_estimator_active);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
new file mode 100644
index 00000000..0452eb27
--- /dev/null
+++ b/net/core/gen_stats.c
@@ -0,0 +1,250 @@
+/*
+ * net/core/gen_stats.c
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * Authors:  Thomas Graf <tgraf@suug.ch>
+ *           Jamal Hadi Salim
+ *           Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * See Documentation/networking/gen_stats.txt
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/gen_stats.h>
+#include <net/netlink.h>
+#include <net/gen_stats.h>
+
+
+/*
+ * Append one attribute of @type holding @size bytes from @buf to the dump's
+ * skb.  Returns 0 on success.  On failure the statistics lock held by the
+ * dump is released and -1 is returned.
+ */
+static inline int
+gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
+{
+	/* NLA_PUT presumably jumps to nla_put_failure when there is no room. */
+	NLA_PUT(d->skb, type, size, buf);
+	return 0;
+
+nla_put_failure:
+	spin_unlock_bh(d->lock);
+	return -1;
+}
+
+/**
+ * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
+ * @xstats_type: TLV type for backward compatibility xstats TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use as a container for all
+ * other statistic TLVs.
+ *
+ * The dumping handle is marked to be in backward compatibility mode telling
+ * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ * In the -1 case the statistics lock has already been released.
+ */
+int
+gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
+	int xstats_type, spinlock_t *lock, struct gnet_dump *d)
+	__acquires(lock)
+{
+	memset(d, 0, sizeof(*d));
+
+	spin_lock_bh(lock);
+	d->lock = lock;
+	/* A zero @type means no top level TLV: d->tail stays NULL. */
+	if (type)
+		d->tail = (struct nlattr *)skb_tail_pointer(skb);
+	d->skb = skb;
+	d->compat_tc_stats = tc_stats_type;
+	d->compat_xstats = xstats_type;
+
+	/* Emit the container header; its length is fixed up when finishing. */
+	if (d->tail)
+		return gnet_stats_copy(d, type, NULL, 0);
+
+	return 0;
+}
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
+
+/**
+ * gnet_stats_start_copy - start dumping procedure
+ * @skb: socket buffer to put statistics TLVs into
+ * @type: TLV type for top level statistic TLV
+ * @lock: statistics lock
+ * @d: dumping handle
+ *
+ * Initializes the dumping handle, grabs the statistic lock and appends
+ * an empty TLV header to the socket buffer for use as a container for all
+ * other statistic TLVs.
+ *
+ * Same as gnet_stats_start_copy_compat() with both compatibility TLV
+ * types set to 0.
+ *
+ * Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
+	struct gnet_dump *d)
+{
+	return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
+}
+EXPORT_SYMBOL(gnet_stats_start_copy);
+
+/**
+ * gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @d: dumping handle
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy(). In compatibility mode the values are also
+ * stored in the handle's local struct tc_stats.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
+{
+	struct gnet_stats_basic sb;
+
+	if (d->compat_tc_stats) {
+		d->tc_stats.bytes = b->bytes;
+		d->tc_stats.packets = b->packets;
+	}
+
+	if (!d->tail)
+		return 0;
+
+	/* Copy into a zeroed unpacked struct before putting it on the wire. */
+	memset(&sb, 0, sizeof(sb));
+	sb.bytes = b->bytes;
+	sb.packets = b->packets;
+
+	return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic);
+
+/**
+ * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
+ * @d: dumping handle
+ * @b: basic statistics
+ * @r: rate estimator statistics
+ *
+ * Appends the rate estimator statistics to the top level TLV created by
+ * gnet_stats_start_copy(). Nothing is copied when @b is given and no
+ * estimator is active for the pair.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_rate_est(struct gnet_dump *d,
+			 const struct gnet_stats_basic_packed *b,
+			 struct gnet_stats_rate_est *r)
+{
+	if (b != NULL && !gen_estimator_active(b, r))
+		return 0;
+
+	if (d->compat_tc_stats) {
+		d->tc_stats.bps = r->bps;
+		d->tc_stats.pps = r->pps;
+	}
+
+	if (!d->tail)
+		return 0;
+
+	return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r));
+}
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
+
+/**
+ * gnet_stats_copy_queue - copy queue statistics into statistics TLV
+ * @d: dumping handle
+ * @q: queue statistics
+ *
+ * Appends the queue statistics to the top level TLV created by
+ * gnet_stats_start_copy(). In compatibility mode the values are also
+ * stored in the handle's local struct tc_stats.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
+{
+	if (d->compat_tc_stats) {
+		d->tc_stats.drops = q->drops;
+		d->tc_stats.qlen = q->qlen;
+		d->tc_stats.backlog = q->backlog;
+		d->tc_stats.overlimits = q->overlimits;
+	}
+
+	if (!d->tail)
+		return 0;
+
+	return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q));
+}
+EXPORT_SYMBOL(gnet_stats_copy_queue);
+
+/**
+ * gnet_stats_copy_app - copy application specific statistics into statistics TLV
+ * @d: dumping handle
+ * @st: application specific statistics data
+ * @len: length of data
+ *
+ * Appends the application specific statistics to the top level TLV created by
+ * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
+ * handle is in backward compatibility mode.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
+{
+	/* Only the pointer is kept; it is read by gnet_stats_finish_copy(). */
+	if (d->compat_xstats) {
+		d->xstats = st;
+		d->xstats_len = len;
+	}
+
+	if (d->tail)
+		return gnet_stats_copy(d, TCA_STATS_APP, st, len);
+
+	return 0;
+}
+EXPORT_SYMBOL(gnet_stats_copy_app);
+
+/**
+ * gnet_stats_finish_copy - finish dumping procedure
+ * @d: dumping handle
+ *
+ * Corrects the length of the top level TLV to include all TLVs added
+ * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
+ * if gnet_stats_start_copy_compat() was used and releases the statistics
+ * lock.
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_finish_copy(struct gnet_dump *d)
+{
+	/* Close the container: its length spans everything appended so far. */
+	if (d->tail)
+		d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
+
+	if (d->compat_tc_stats &&
+	    gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
+			    sizeof(d->tc_stats)) < 0)
+		return -1;
+
+	if (d->compat_xstats && d->xstats &&
+	    gnet_stats_copy(d, d->compat_xstats, d->xstats,
+			    d->xstats_len) < 0)
+		return -1;
+
+	spin_unlock_bh(d->lock);
+	return 0;
+}
+EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/iovec.c b/net/core/iovec.c
new file mode 100644
index 00000000..c40f27e7
--- /dev/null
+++ b/net/core/iovec.c
@@ -0,0 +1,264 @@
+/*
+ *	iovec manipulation routines.
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *		Andrew Lunn	:	Errors in iovec copying.
+ *		Pedro Roque	:	Added memcpy_fromiovecend and
+ *					csum_..._fromiovecend.
+ *		Andi Kleen	:	fixed error handling for 2.1
+ *		Alexey Kuznetsov:	2.1 optimisations
+ *		Andi Kleen	:	Fix csum*fromiovecend for IPv6.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <net/checksum.h>
+#include <net/sock.h>
+
+/*
+ *	Verify iovec. The caller must ensure that the iovec is big enough
+ *	to hold the message iovec.
+ *
+ *	Save time not doing access_ok. copy_*_user will make this work
+ *	in any case.
+ */
+
+/*
+ * Returns the total length of the iovec, capped at INT_MAX, or a negative
+ * errno.  On return @m->msg_name and @m->msg_iov refer to @address and @iov.
+ */
+int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
+{
+	int size, ct, err;
+
+	if (m->msg_namelen) {
+		/* The address is only copied in for VERIFY_READ. */
+		if (mode == VERIFY_READ) {
+			void __user *namep;
+			namep = (void __user __force *) m->msg_name;
+			err = move_addr_to_kernel(namep, m->msg_namelen,
+						  address);
+			if (err < 0)
+				return err;
+		}
+		m->msg_name = address;
+	} else {
+		m->msg_name = NULL;
+	}
+
+	/* Copy the iovec array itself; the buffers it describes are not touched. */
+	size = m->msg_iovlen * sizeof(struct iovec);
+	if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
+		return -EFAULT;
+
+	m->msg_iov = iov;
+	err = 0;
+
+	/* From here on err accumulates the total length. */
+	for (ct = 0; ct < m->msg_iovlen; ct++) {
+		size_t len = iov[ct].iov_len;
+
+		/* Clamp so the total never exceeds INT_MAX; shorten the entry to match. */
+		if (len > INT_MAX - err) {
+			len = INT_MAX - err;
+			iov[ct].iov_len = len;
+		}
+		err += len;
+	}
+
+	return err;
+}
+
+/*
+ *	Copy @len bytes of kernel data into the user buffers described by
+ *	the iovec. Returns 0 on success, -EFAULT on error.
+ *
+ *	Note: this modifies the original iovec; consumed entries have
+ *	their base advanced and their length reduced.
+ */
+
+int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+	while (len > 0) {
+		int chunk;
+
+		/* Nothing left in this entry: move on to the next one. */
+		if (!iov->iov_len) {
+			iov++;
+			continue;
+		}
+
+		chunk = min_t(unsigned int, iov->iov_len, len);
+		if (copy_to_user(iov->iov_base, kdata, chunk))
+			return -EFAULT;
+
+		kdata += chunk;
+		len -= chunk;
+		iov->iov_len -= chunk;
+		iov->iov_base += chunk;
+		iov++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_toiovec);
+
+/*
+ *	Copy kernel to iovec, starting @offset bytes into the area the
+ *	iovec describes. The iovec itself is not modified.
+ *	Returns -EFAULT on error.
+ */
+
+int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
+		      int offset, int len)
+{
+	int copy;
+	for (; len > 0; ++iov) {
+		/* Skip over the finished iovecs */
+		if (unlikely(offset >= iov->iov_len)) {
+			offset -= iov->iov_len;
+			continue;
+		}
+		copy = min_t(unsigned int, iov->iov_len - offset, len);
+		if (copy_to_user(iov->iov_base + offset, kdata, copy))
+			return -EFAULT;
+		/* Only the first entry written to is entered part-way. */
+		offset = 0;
+		kdata += copy;
+		len -= copy;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_toiovecend);
+
+/*
+ *	Copy @len bytes from the user buffers described by the iovec into
+ *	kernel memory. Returns 0 on success, -EFAULT on error.
+ *
+ *	Note: this modifies the original iovec; consumed entries have
+ *	their base advanced and their length reduced.
+ */
+
+int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+	while (len > 0) {
+		int chunk;
+
+		/* Nothing left in this entry: move on to the next one. */
+		if (!iov->iov_len) {
+			iov++;
+			continue;
+		}
+
+		chunk = min_t(unsigned int, len, iov->iov_len);
+		if (copy_from_user(kdata, iov->iov_base, chunk))
+			return -EFAULT;
+
+		len -= chunk;
+		kdata += chunk;
+		iov->iov_base += chunk;
+		iov->iov_len -= chunk;
+		iov++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_fromiovec);
+
+/*
+ *	Copy iovec to kernel, starting @offset bytes into the area the
+ *	iovec describes. The iovec itself is not modified.
+ *	Returns -EFAULT on error.
+ */
+
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+			int offset, int len)
+{
+	/* Skip over the finished iovecs */
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		iov++;
+	}
+
+	while (len > 0) {
+		u8 __user *base = iov->iov_base + offset;
+		int copy = min_t(unsigned int, len, iov->iov_len - offset);
+
+		/* Only the first entry read from is entered part-way. */
+		offset = 0;
+		if (copy_from_user(kdata, base, copy))
+			return -EFAULT;
+		len -= copy;
+		kdata += copy;
+		iov++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_fromiovecend);
+
+/*
+ *	And now for the all-in-one: copy and checksum from a user iovec
+ *	directly to a datagram
+ *	Calls to csum_partial but the last must be in 32 bit chunks
+ *
+ *	ip_build_xmit must ensure that when fragmenting only the last
+ *	call to this function will be unaligned also.
+ */
+/*
+ * Copy @len bytes from the iovec, starting at @offset, into @kdata while
+ * folding them into *@csump.  Returns 0 on success or an error code
+ * (-EFAULT for a failed user copy); *@csump is only updated on success.
+ */
+int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
+				 int offset, unsigned int len, __wsum *csump)
+{
+	__wsum csum = *csump;
+	/*
+	 * partial_cnt: bytes (0-3) already copied to kdata from the tail of
+	 * the previous entry but not yet checksummed, because they did not
+	 * fill a whole 4-byte chunk.
+	 */
+	int partial_cnt = 0, err = 0;
+
+	/* Skip over the finished iovecs */
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		iov++;
+	}
+
+	while (len > 0) {
+		u8 __user *base = iov->iov_base + offset;
+		int copy = min_t(unsigned int, len, iov->iov_len - offset);
+
+		offset = 0;
+
+		/* There is a remnant from previous iov. */
+		if (partial_cnt) {
+			/* Bytes still needed to complete the 4-byte chunk. */
+			int par_len = 4 - partial_cnt;
+
+			/* iov component is too short ... */
+			if (par_len > copy) {
+				if (copy_from_user(kdata, base, copy))
+					goto out_fault;
+				kdata += copy;
+				base += copy;
+				partial_cnt += copy;
+				len -= copy;
+				iov++;
+				if (len)
+					continue;
+				/* End of data: checksum the short remnant as is. */
+				*csump = csum_partial(kdata - partial_cnt,
+							 partial_cnt, csum);
+				goto out;
+			}
+			/* Complete the chunk and checksum all 4 bytes of it. */
+			if (copy_from_user(kdata, base, par_len))
+				goto out_fault;
+			csum = csum_partial(kdata - partial_cnt, 4, csum);
+			kdata += par_len;
+			base  += par_len;
+			copy  -= par_len;
+			len   -= par_len;
+			partial_cnt = 0;
+		}
+
+		/*
+		 * More data follows in later entries: keep the checksummed
+		 * part a multiple of 4 and copy the 1-3 trailing bytes
+		 * without checksumming them yet.
+		 */
+		if (len > copy) {
+			partial_cnt = copy % 4;
+			if (partial_cnt) {
+				copy -= partial_cnt;
+				if (copy_from_user(kdata + copy, base + copy,
+						partial_cnt))
+					goto out_fault;
+			}
+		}
+
+		if (copy) {
+			csum = csum_and_copy_from_user(base, kdata, copy,
+							csum, &err);
+			if (err)
+				goto out;
+		}
+		/* kdata ends up just past the remnant, if there is one. */
+		len   -= copy + partial_cnt;
+		kdata += copy + partial_cnt;
+		iov++;
+	}
+	*csump = csum;
+out:
+	return err;
+
+out_fault:
+	err = -EFAULT;
+	goto out;
+}
+EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h
new file mode 100644
index 00000000..283c2b99
--- /dev/null
+++ b/net/core/kmap_skb.h
@@ -0,0 +1,19 @@
+#include <linux/highmem.h>
+
+/*
+ * Map the page backing an skb fragment with kmap_atomic(), using the
+ * KM_SKB_DATA_SOFTIRQ slot, and return the mapped address.
+ *
+ * With CONFIG_HIGHMEM, calling this from hard-IRQ context is a BUG and
+ * bottom halves are disabled before mapping; kunmap_skb_frag() re-enables
+ * them.
+ */
+static inline void *kmap_skb_frag(const skb_frag_t *frag)
+{
+#ifdef CONFIG_HIGHMEM
+	BUG_ON(in_irq());
+
+	local_bh_disable();
+#endif
+	return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
+}
+
+/*
+ * Undo kmap_skb_frag(): release the KM_SKB_DATA_SOFTIRQ atomic mapping at
+ * vaddr and, with CONFIG_HIGHMEM, re-enable bottom halves.
+ */
+static inline void kunmap_skb_frag(void *vaddr)
+{
+	kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ);
+#ifdef CONFIG_HIGHMEM
+	local_bh_enable();
+#endif
+}
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
new file mode 100644
index 00000000..357bd4ee
--- /dev/null
+++ b/net/core/link_watch.c
@@ -0,0 +1,246 @@
+/*
+ * Linux network device link state notification
+ *
+ * Author:
+ *     Stefan Rompf <sux@loplof.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <asm/types.h>
+
+
+/* Bit numbers used in linkwatch_flags. */
+enum lw_bits {
+	/*
+	 * Set by linkwatch_schedule_work() when an urgent event is pending,
+	 * cleared again by __linkwatch_run_queue().
+	 */
+	LW_URGENT = 0,
+};
+
+static unsigned long linkwatch_flags;
+/* jiffies value before which only urgent events are processed. */
+static unsigned long linkwatch_nextevent;
+
+static void linkwatch_event(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
+
+/* Devices with a pending link event; protected by lweventlist_lock. */
+static LIST_HEAD(lweventlist);
+static DEFINE_SPINLOCK(lweventlist_lock);
+
+/*
+ * Derive the operational state from the carrier and dormant flags alone.
+ * Without carrier the device is IF_OPER_LOWERLAYERDOWN when ifindex and
+ * iflink differ, IF_OPER_DOWN otherwise.
+ */
+static unsigned char default_operstate(const struct net_device *dev)
+{
+	if (netif_carrier_ok(dev))
+		return netif_dormant(dev) ? IF_OPER_DORMANT : IF_OPER_UP;
+
+	if (dev->ifindex == dev->iflink)
+		return IF_OPER_DOWN;
+
+	return IF_OPER_LOWERLAYERDOWN;
+}
+
+
+/*
+ * Bring dev->operstate in line with default_operstate().  Nothing is
+ * written when the state is unchanged.  In IF_LINK_MODE_DORMANT a
+ * would-be IF_OPER_UP is recorded as IF_OPER_DORMANT instead.
+ * The store happens under dev_base_lock.
+ */
+static void rfc2863_policy(struct net_device *dev)
+{
+	unsigned char state = default_operstate(dev);
+
+	if (state == dev->operstate)
+		return;
+
+	write_lock_bh(&dev_base_lock);
+
+	if (dev->link_mode == IF_LINK_MODE_DORMANT && state == IF_OPER_UP)
+		state = IF_OPER_DORMANT;
+
+	dev->operstate = state;
+
+	write_unlock_bh(&dev_base_lock);
+}
+
+
+/*
+ * An event is urgent when the device is running, has carrier, and
+ * qdisc_tx_changing() reports true for it.
+ */
+static bool linkwatch_urgent_event(struct net_device *dev)
+{
+	return netif_running(dev) && netif_carrier_ok(dev) &&
+		qdisc_tx_changing(dev);
+}
+
+
+/*
+ * Append dev to lweventlist unless it is already queued.  A device
+ * reference is taken for the time it sits on the list.
+ */
+static void linkwatch_add_event(struct net_device *dev)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&lweventlist_lock, irqflags);
+	if (!list_empty(&dev->link_watch_list))
+		goto unlock;
+
+	list_add_tail(&dev->link_watch_list, &lweventlist);
+	dev_hold(dev);
+unlock:
+	spin_unlock_irqrestore(&lweventlist_lock, irqflags);
+}
+
+
+static void linkwatch_schedule_work(int urgent)
+{
+	/*
+	 * Queue linkwatch_work.  Non-urgent requests are delayed until
+	 * linkwatch_nextevent; urgent ones run without delay and set
+	 * LW_URGENT.  While LW_URGENT is already set nothing is done.
+	 */
+	unsigned long delay = linkwatch_nextevent - jiffies;
+
+	if (test_bit(LW_URGENT, &linkwatch_flags))
+		return;
+
+	/* Minimise down-time: drop delay for up event. */
+	if (urgent) {
+		if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
+			return;
+		delay = 0;
+	}
+
+	/* If we wrap around we'll delay it by at most HZ. */
+	if (delay > HZ)
+		delay = 0;
+
+	/*
+	 * This is true if we've scheduled it immediately or if we don't
+	 * need an immediate execution and it's already pending.
+	 */
+	if (schedule_delayed_work(&linkwatch_work, delay) == !delay)
+		return;
+
+	/* Don't bother if there is nothing urgent. */
+	if (!test_bit(LW_URGENT, &linkwatch_flags))
+		return;
+
+	/* It's already running which is good enough. */
+	if (!__cancel_delayed_work(&linkwatch_work))
+		return;
+
+	/* Otherwise we reschedule it again for immediate execution. */
+	schedule_delayed_work(&linkwatch_work, 0);
+}
+
+
+/*
+ * Process the pending link event of one device that has already been taken
+ * off lweventlist: clear the pending bit, update the operstate and, if the
+ * interface is up, activate or deactivate it according to carrier and send
+ * a state-change notification.  Ends with dev_put(); the matching dev_hold()
+ * is presumably the one taken in linkwatch_add_event().
+ */
+static void linkwatch_do_dev(struct net_device *dev)
+{
+	/*
+	 * Make sure the above read is complete since it can be
+	 * rewritten as soon as we clear the bit below.
+	 * NOTE(review): "the above read" is not in this function; it
+	 * presumably refers to the caller's list handling -- confirm.
+	 */
+	smp_mb__before_clear_bit();
+
+	/* We are about to handle this device,
+	 * so new events can be accepted
+	 */
+	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
+
+	rfc2863_policy(dev);
+	if (dev->flags & IFF_UP) {
+		if (netif_carrier_ok(dev))
+			dev_activate(dev);
+		else
+			dev_deactivate(dev);
+
+		netdev_state_change(dev);
+	}
+	dev_put(dev);
+}
+
+/*
+ * Drain lweventlist.  When urgent_only is non-zero, only devices for which
+ * linkwatch_urgent_event() is true are handled; the others are put back on
+ * lweventlist and a non-urgent run is scheduled for them at the end.
+ */
+static void __linkwatch_run_queue(int urgent_only)
+{
+	struct net_device *dev;
+	LIST_HEAD(wrk);
+
+	/*
+	 * Limit the number of linkwatch events to one
+	 * per second so that a runaway driver does not
+	 * cause a storm of messages on the netlink
+	 * socket.  This limit does not apply to up events
+	 * while the device qdisc is down.
+	 */
+	if (!urgent_only)
+		linkwatch_nextevent = jiffies + HZ;
+	/* Limit wrap-around effect on delay. */
+	else if (time_after(linkwatch_nextevent, jiffies + HZ))
+		linkwatch_nextevent = jiffies;
+
+	clear_bit(LW_URGENT, &linkwatch_flags);
+
+	spin_lock_irq(&lweventlist_lock);
+	/* Work on a private copy so new events can queue up meanwhile. */
+	list_splice_init(&lweventlist, &wrk);
+
+	while (!list_empty(&wrk)) {
+
+		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
+		list_del_init(&dev->link_watch_list);
+
+		if (urgent_only && !linkwatch_urgent_event(dev)) {
+			list_add_tail(&dev->link_watch_list, &lweventlist);
+			continue;
+		}
+		/* The list lock is dropped while the device is handled. */
+		spin_unlock_irq(&lweventlist_lock);
+		linkwatch_do_dev(dev);
+		spin_lock_irq(&lweventlist_lock);
+	}
+
+	/* Deferred or newly queued devices remain: schedule another run. */
+	if (!list_empty(&lweventlist))
+		linkwatch_schedule_work(0);
+	spin_unlock_irq(&lweventlist_lock);
+}
+
+/*
+ * If dev still has a queued link event, take it off the list and process
+ * the event right away; otherwise do nothing.
+ */
+void linkwatch_forget_dev(struct net_device *dev)
+{
+	unsigned long irqflags;
+	int was_queued;
+
+	spin_lock_irqsave(&lweventlist_lock, irqflags);
+	was_queued = !list_empty(&dev->link_watch_list);
+	if (was_queued)
+		list_del_init(&dev->link_watch_list);
+	spin_unlock_irqrestore(&lweventlist_lock, irqflags);
+
+	if (!was_queued)
+		return;
+
+	linkwatch_do_dev(dev);
+}
+
+
+/*
+ * Process every queued link event, urgent or not.
+ * Must be called with the rtnl semaphore held.
+ */
+void linkwatch_run_queue(void)
+{
+	__linkwatch_run_queue(0);
+}
+
+
+/*
+ * Delayed-work handler.  Runs the queue under the rtnl lock; if the
+ * rate-limit deadline (linkwatch_nextevent) has not passed yet, only
+ * urgent events are processed.
+ */
+static void linkwatch_event(struct work_struct *dummy)
+{
+	int urgent_only;
+
+	rtnl_lock();
+	urgent_only = time_after(linkwatch_nextevent, jiffies);
+	__linkwatch_run_queue(urgent_only);
+	rtnl_unlock();
+}
+
+
+/*
+ * Report a link state change on dev.  The device is queued unless an event
+ * is already pending for it.  If one is pending, the work is only
+ * (re)scheduled when the new event is urgent.
+ */
+void linkwatch_fire_event(struct net_device *dev)
+{
+	bool urgent = linkwatch_urgent_event(dev);
+
+	if (test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
+		/* Already pending: only an urgent event needs more work. */
+		if (!urgent)
+			return;
+	} else {
+		linkwatch_add_event(dev);
+	}
+
+	linkwatch_schedule_work(urgent);
+}
+EXPORT_SYMBOL(linkwatch_fire_event);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
new file mode 100644
index 00000000..96bb0a33
--- /dev/null
+++ b/net/core/neighbour.c
@@ -0,0 +1,2934 @@
+/*
+ *	Generic address resolution entity
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *	Vitaly E. Lavrov	releasing NULL neighbor in neigh_add.
+ *	Harald Welte		Add neighbour cache statistics like rtstat
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/times.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/log2.h>
+
+/*
+ * Debug verbosity: NEIGH_PRINTK1 prints when NEIGH_DEBUG >= 1,
+ * NEIGH_PRINTK2 when NEIGH_DEBUG >= 2; otherwise they expand to nothing.
+ */
+#define NEIGH_DEBUG 1
+
+#define NEIGH_PRINTK(x...) printk(x)
+#define NEIGH_NOPRINTK(x...) do { ; } while(0)
+/* Both levels default to silent; the #if blocks below enable them. */
+#define NEIGH_PRINTK1 NEIGH_NOPRINTK
+#define NEIGH_PRINTK2 NEIGH_NOPRINTK
+
+#if NEIGH_DEBUG >= 1
+#undef NEIGH_PRINTK1
+#define NEIGH_PRINTK1 NEIGH_PRINTK
+#endif
+#if NEIGH_DEBUG >= 2
+#undef NEIGH_PRINTK2
+#define NEIGH_PRINTK2 NEIGH_PRINTK
+#endif
+
+/* Mask applied by pneigh_hash(); proxy bucket indices run 0..PNEIGH_HASHMASK. */
+#define PNEIGH_HASHMASK		0xF
+
+/* Forward declarations for helpers defined later in this file. */
+static void neigh_timer_handler(unsigned long arg);
+static void __neigh_notify(struct neighbour *n, int type, int flags);
+static void neigh_update_notify(struct neighbour *neigh);
+static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
+
+/* Head of the list of neighbour tables. */
+static struct neigh_table *neigh_tables;
+#ifdef CONFIG_PROC_FS
+static const struct file_operations neigh_stat_seq_fops;
+#endif
+
+/*
+   Neighbour hash table buckets are protected with rwlock tbl->lock.
+
+   - All the scans/updates to hash buckets MUST be made under this lock.
+   - NOTHING clever should be made under this lock: no callbacks
+     to protocol backends, no attempts to send something to network.
+     It will result in deadlocks, if backend/driver wants to use neighbour
+     cache.
+   - If the entry requires some non-trivial actions, increase
+     its reference count and release table lock.
+
+   Neighbour entries are protected:
+   - with reference count.
+   - with rwlock neigh->lock
+
+   Reference count prevents destruction.
+
+   neigh->lock mainly serializes ll address data and its validity state.
+   However, the same lock is used to protect another entry fields:
+    - timer
+    - resolution queue
+
+   Again, nothing clever shall be made under neigh->lock,
+   the most complicated procedure, which we allow is dev->hard_header.
+   It is supposed, that dev->hard_header is simplistic and does
+   not make callbacks to neighbour tables.
+
+   The last lock is neigh_tbl_lock. It is a pure SMP lock protecting
+   the list of neighbour tables. This list is used only in process context.
+ */
+
+/* Protects the list of neighbour tables (see the locking notes above). */
+static DEFINE_RWLOCK(neigh_tbl_lock);
+
+/*
+ * Output handler that drops every packet: frees skb and returns -ENETDOWN.
+ * Installed on fresh entries and on entries that are being torn down.
+ */
+static int neigh_blackhole(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return -ENETDOWN;
+}
+
+/*
+ * Final steps for an entry that has been unlinked from the hash table:
+ * run the optional parms->neigh_cleanup hook, send an RTM_DELNEIGH
+ * notification and drop one reference.
+ */
+static void neigh_cleanup_and_release(struct neighbour *neigh)
+{
+	if (neigh->parms->neigh_cleanup)
+		neigh->parms->neigh_cleanup(neigh);
+
+	__neigh_notify(neigh, RTM_DELNEIGH, 0);
+	neigh_release(neigh);
+}
+
+/*
+ * It is random distribution in the interval (1/2)*base...(3/2)*base.
+ * It corresponds to default IPv6 settings and is not overridable,
+ * because it is really reasonable choice.
+ */
+
+unsigned long neigh_rand_reach_time(unsigned long base)
+{
+	/* A zero base would divide by zero below; it maps to zero. */
+	if (!base)
+		return 0;
+
+	/* base/2 plus a random offset in [0, base). */
+	return (base >> 1) + (net_random() % base);
+}
+EXPORT_SYMBOL(neigh_rand_reach_time);
+
+
+/*
+ * Synchronous garbage collection: walk every hash bucket under tbl->lock
+ * and remove each entry that nobody else references and that is not
+ * NUD_PERMANENT.  Records the run time in tbl->last_flush.
+ *
+ * Returns 1 if at least one entry was removed, 0 otherwise.
+ */
+static int neigh_forced_gc(struct neigh_table *tbl)
+{
+	int shrunk = 0;
+	int i;
+	struct neigh_hash_table *nht;
+
+	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
+
+	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	for (i = 0; i <= nht->hash_mask; i++) {
+		struct neighbour *n;
+		/* np points at the link that currently references n. */
+		struct neighbour __rcu **np;
+
+		np = &nht->hash_buckets[i];
+		while ((n = rcu_dereference_protected(*np,
+					lockdep_is_held(&tbl->lock))) != NULL) {
+			/* Neighbour record may be discarded if:
+			 * - nobody refers to it.
+			 * - it is not permanent
+			 */
+			write_lock(&n->lock);
+			if (atomic_read(&n->refcnt) == 1 &&
+			    !(n->nud_state & NUD_PERMANENT)) {
+				/* Unlink n; np now refers to n's successor. */
+				rcu_assign_pointer(*np,
+					rcu_dereference_protected(n->next,
+						  lockdep_is_held(&tbl->lock)));
+				n->dead = 1;
+				shrunk	= 1;
+				write_unlock(&n->lock);
+				neigh_cleanup_and_release(n);
+				continue;
+			}
+			write_unlock(&n->lock);
+			np = &n->next;
+		}
+	}
+
+	tbl->last_flush = jiffies;
+
+	write_unlock_bh(&tbl->lock);
+
+	return shrunk;
+}
+
+/*
+ * Arm the entry's timer to fire at 'when'.  A reference is taken on behalf
+ * of the timer.  Finding the timer already pending is reported as a bug
+ * with a stack dump.
+ */
+static void neigh_add_timer(struct neighbour *n, unsigned long when)
+{
+	neigh_hold(n);
+	if (likely(!mod_timer(&n->timer, when)))
+		return;
+
+	printk("NEIGH: BUG, double timer add, state is %x\n",
+	       n->nud_state);
+	dump_stack();
+}
+
+/*
+ * Cancel the entry's timer if its state is one of the NUD_IN_TIMER states
+ * and the timer is still pending; the reference held for the timer is then
+ * dropped.  Returns 1 if a pending timer was cancelled, 0 otherwise.
+ */
+static int neigh_del_timer(struct neighbour *n)
+{
+	if (!(n->nud_state & NUD_IN_TIMER))
+		return 0;
+
+	if (!del_timer(&n->timer))
+		return 0;
+
+	neigh_release(n);
+	return 1;
+}
+
+/*
+ * Empty a proxy queue: for every queued skb drop the reference on its
+ * device and free the skb.
+ */
+static void pneigh_queue_purge(struct sk_buff_head *list)
+{
+	for (;;) {
+		struct sk_buff *skb = skb_dequeue(list);
+
+		if (skb == NULL)
+			break;
+
+		dev_put(skb->dev);
+		kfree_skb(skb);
+	}
+}
+
+/*
+ * Unlink every entry that belongs to dev (or every entry at all when dev is
+ * NULL) from the hash table and release it.  Entries still referenced
+ * elsewhere are moved to a harmless state first.
+ *
+ * The callers in this file hold tbl->lock for writing.
+ */
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+{
+	int i;
+	struct neigh_hash_table *nht;
+
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+
+	for (i = 0; i <= nht->hash_mask; i++) {
+		struct neighbour *n;
+		struct neighbour __rcu **np = &nht->hash_buckets[i];
+
+		while ((n = rcu_dereference_protected(*np,
+					lockdep_is_held(&tbl->lock))) != NULL) {
+			/* Entry of another device: keep it and move on. */
+			if (dev && n->dev != dev) {
+				np = &n->next;
+				continue;
+			}
+			/* Unlink n from the bucket chain. */
+			rcu_assign_pointer(*np,
+				   rcu_dereference_protected(n->next,
+						lockdep_is_held(&tbl->lock)));
+			write_lock(&n->lock);
+			neigh_del_timer(n);
+			n->dead = 1;
+
+			if (atomic_read(&n->refcnt) != 1) {
+				/* The most unpleasant situation.
+				   We must destroy neighbour entry,
+				   but someone still uses it.
+
+				   The destroy will be delayed until
+				   the last user releases us, but
+				   we must kill timers etc. and move
+				   it to safe state.
+				 */
+				skb_queue_purge(&n->arp_queue);
+				n->output = neigh_blackhole;
+				if (n->nud_state & NUD_VALID)
+					n->nud_state = NUD_NOARP;
+				else
+					n->nud_state = NUD_NONE;
+				NEIGH_PRINTK2("neigh %p is stray.\n", n);
+			}
+			write_unlock(&n->lock);
+			neigh_cleanup_and_release(n);
+		}
+	}
+}
+
+/*
+ * Flush all entries of dev from tbl, taking tbl->lock for writing around
+ * neigh_flush_dev().
+ */
+void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
+{
+	write_lock_bh(&tbl->lock);
+	neigh_flush_dev(tbl, dev);
+	write_unlock_bh(&tbl->lock);
+}
+EXPORT_SYMBOL(neigh_changeaddr);
+
+/*
+ * Device is going down: flush its neighbour and proxy entries under
+ * tbl->lock, then stop the proxy timer and purge the proxy queue.
+ * Always returns 0.
+ */
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+	write_lock_bh(&tbl->lock);
+	neigh_flush_dev(tbl, dev);
+	pneigh_ifdown(tbl, dev);
+	write_unlock_bh(&tbl->lock);
+
+	/* Done outside the table lock. */
+	del_timer_sync(&tbl->proxy_timer);
+	pneigh_queue_purge(&tbl->proxy_queue);
+	return 0;
+}
+EXPORT_SYMBOL(neigh_ifdown);
+
+/*
+ * Allocate and initialise a neighbour entry for tbl.
+ *
+ * tbl->entries is incremented up front.  If the previous count had reached
+ * gc_thresh3, or had reached gc_thresh2 with the last flush more than
+ * 5 seconds ago, a forced GC runs first.  The allocation is refused when
+ * that GC freed nothing and the count is still at or above gc_thresh3.
+ *
+ * The new entry has refcnt 1, state NUD_NONE, output neigh_blackhole and
+ * dead = 1 (neigh_create() clears dead when it inserts the entry).
+ * Returns NULL on failure, with the entries increment undone.
+ */
+static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+{
+	struct neighbour *n = NULL;
+	unsigned long now = jiffies;
+	int entries;
+
+	/* entries is the count before this allocation. */
+	entries = atomic_inc_return(&tbl->entries) - 1;
+	if (entries >= tbl->gc_thresh3 ||
+	    (entries >= tbl->gc_thresh2 &&
+	     time_after(now, tbl->last_flush + 5 * HZ))) {
+		if (!neigh_forced_gc(tbl) &&
+		    entries >= tbl->gc_thresh3)
+			goto out_entries;
+	}
+
+	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+	if (!n)
+		goto out_entries;
+
+	skb_queue_head_init(&n->arp_queue);
+	rwlock_init(&n->lock);
+	seqlock_init(&n->ha_lock);
+	n->updated	  = n->used = now;
+	n->nud_state	  = NUD_NONE;
+	n->output	  = neigh_blackhole;
+	n->parms	  = neigh_parms_clone(&tbl->parms);
+	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
+
+	NEIGH_CACHE_STAT_INC(tbl, allocs);
+	n->tbl		  = tbl;
+	atomic_set(&n->refcnt, 1);
+	n->dead		  = 1;
+out:
+	return n;
+
+out_entries:
+	atomic_dec(&tbl->entries);
+	goto out;
+}
+
+/*
+ * Allocate a hash table with 'entries' zeroed buckets and a fresh random
+ * hash seed.  Bucket arrays up to PAGE_SIZE come from kzalloc(), larger
+ * ones from the page allocator.  Returns NULL if either allocation fails.
+ */
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
+{
+	struct neigh_hash_table *nht;
+	struct neighbour __rcu **tab;
+	size_t bytes = entries * sizeof(struct neighbour *);
+
+	nht = kmalloc(sizeof(*nht), GFP_ATOMIC);
+	if (!nht)
+		return NULL;
+
+	if (bytes > PAGE_SIZE)
+		tab = (struct neighbour __rcu **)
+		      __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+				       get_order(bytes));
+	else
+		tab = kzalloc(bytes, GFP_ATOMIC);
+	if (!tab)
+		goto err_free;
+
+	nht->hash_buckets = tab;
+	nht->hash_mask = entries - 1;
+	get_random_bytes(&nht->hash_rnd, sizeof(nht->hash_rnd));
+	return nht;
+
+err_free:
+	kfree(nht);
+	return NULL;
+}
+
+/*
+ * RCU callback that frees a retired hash table.  The bucket array is
+ * released with the allocator that neigh_hash_alloc() used for its size.
+ */
+static void neigh_hash_free_rcu(struct rcu_head *head)
+{
+	struct neigh_hash_table *old;
+	struct neighbour __rcu **tab;
+	size_t bytes;
+
+	old = container_of(head, struct neigh_hash_table, rcu);
+	bytes = (old->hash_mask + 1) * sizeof(struct neighbour *);
+	tab = old->hash_buckets;
+
+	if (bytes > PAGE_SIZE)
+		free_pages((unsigned long)tab, get_order(bytes));
+	else
+		kfree(tab);
+
+	kfree(old);
+}
+
+/*
+ * Replace tbl's hash table with one of new_entries buckets (must be a power
+ * of two) and rehash every entry into it with the new table's random seed.
+ * The old table is freed after an RCU grace period.
+ *
+ * Returns the table now in use: the new one, or the old one unchanged if
+ * the allocation failed.
+ */
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+						unsigned long new_entries)
+{
+	unsigned int i, hash;
+	struct neigh_hash_table *new_nht, *old_nht;
+
+	NEIGH_CACHE_STAT_INC(tbl, hash_grows);
+
+	BUG_ON(!is_power_of_2(new_entries));
+	old_nht = rcu_dereference_protected(tbl->nht,
+					    lockdep_is_held(&tbl->lock));
+	new_nht = neigh_hash_alloc(new_entries);
+	if (!new_nht)
+		return old_nht;
+
+	for (i = 0; i <= old_nht->hash_mask; i++) {
+		struct neighbour *n, *next;
+
+		for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
+						   lockdep_is_held(&tbl->lock));
+		     n != NULL;
+		     n = next) {
+			hash = tbl->hash(n->primary_key, n->dev,
+					 new_nht->hash_rnd);
+
+			hash &= new_nht->hash_mask;
+			/* Save the old successor before n->next is rewritten. */
+			next = rcu_dereference_protected(n->next,
+						lockdep_is_held(&tbl->lock));
+
+			/* Push n onto the head of its new bucket. */
+			rcu_assign_pointer(n->next,
+					   rcu_dereference_protected(
+						new_nht->hash_buckets[hash],
+						lockdep_is_held(&tbl->lock)));
+			rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+		}
+	}
+
+	rcu_assign_pointer(tbl->nht, new_nht);
+	call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+	return new_nht;
+}
+
+/*
+ * Find the entry for (pkey, dev) in tbl.  The walk runs under
+ * rcu_read_lock_bh() without taking tbl->lock.
+ *
+ * Returns the entry with its reference count raised, or NULL if there is no
+ * match or the matching entry's count had already dropped to zero.
+ */
+struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
+			       struct net_device *dev)
+{
+	struct neighbour *n;
+	int key_len = tbl->key_len;
+	u32 hash_val;
+	struct neigh_hash_table *nht;
+
+	NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+
+	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+	     n != NULL;
+	     n = rcu_dereference_bh(n->next)) {
+		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
+			/* Entry is on its way out: treat as not found. */
+			if (!atomic_inc_not_zero(&n->refcnt))
+				n = NULL;
+			/* Counted as a hit even if the reference failed. */
+			NEIGH_CACHE_STAT_INC(tbl, hits);
+			break;
+		}
+	}
+
+	rcu_read_unlock_bh();
+	return n;
+}
+EXPORT_SYMBOL(neigh_lookup);
+
+/*
+ * Like neigh_lookup(), but matches on the key and the network namespace of
+ * the entry's device rather than on a specific device.  The bucket is
+ * computed with a NULL device passed to tbl->hash().
+ *
+ * Returns the entry with its reference count raised, or NULL.
+ */
+struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
+				     const void *pkey)
+{
+	struct neighbour *n;
+	int key_len = tbl->key_len;
+	u32 hash_val;
+	struct neigh_hash_table *nht;
+
+	NEIGH_CACHE_STAT_INC(tbl, lookups);
+
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
+
+	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+	     n != NULL;
+	     n = rcu_dereference_bh(n->next)) {
+		if (!memcmp(n->primary_key, pkey, key_len) &&
+		    net_eq(dev_net(n->dev), net)) {
+			/* Entry is on its way out: treat as not found. */
+			if (!atomic_inc_not_zero(&n->refcnt))
+				n = NULL;
+			NEIGH_CACHE_STAT_INC(tbl, hits);
+			break;
+		}
+	}
+
+	rcu_read_unlock_bh();
+	return n;
+}
+EXPORT_SYMBOL(neigh_lookup_nodev);
+
+/*
+ * Create the entry for (pkey, dev) and insert it into tbl.
+ *
+ * Returns, with a reference held for the caller:
+ *   - the new entry, or
+ *   - an already existing entry for the same key and device, in which
+ *     case the freshly allocated one is released again.
+ * On failure returns ERR_PTR(): -ENOBUFS if allocation failed, the error
+ * from the protocol constructor or the device neigh_setup hook, or -EINVAL
+ * if the entry's parms are marked dead.
+ */
+struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
+			       struct net_device *dev)
+{
+	u32 hash_val;
+	int key_len = tbl->key_len;
+	int error;
+	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+	struct neigh_hash_table *nht;
+
+	if (!n) {
+		rc = ERR_PTR(-ENOBUFS);
+		goto out;
+	}
+
+	memcpy(n->primary_key, pkey, key_len);
+	n->dev = dev;
+	dev_hold(dev);
+
+	/* Protocol specific setup. */
+	if (tbl->constructor &&	(error = tbl->constructor(n)) < 0) {
+		rc = ERR_PTR(error);
+		goto out_neigh_release;
+	}
+
+	/* Device specific setup. */
+	if (n->parms->neigh_setup &&
+	    (error = n->parms->neigh_setup(n)) < 0) {
+		rc = ERR_PTR(error);
+		goto out_neigh_release;
+	}
+
+	/* Start with a confirmation time two base_reachable_time in the past. */
+	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
+
+	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+
+	/* Double the table once there are more entries than buckets. */
+	if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
+		nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
+
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+
+	if (n->parms->dead) {
+		rc = ERR_PTR(-EINVAL);
+		goto out_tbl_unlock;
+	}
+
+	/* Someone may have inserted the same key in the meantime. */
+	for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
+					    lockdep_is_held(&tbl->lock));
+	     n1 != NULL;
+	     n1 = rcu_dereference_protected(n1->next,
+			lockdep_is_held(&tbl->lock))) {
+		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
+			neigh_hold(n1);
+			rc = n1;
+			goto out_tbl_unlock;
+		}
+	}
+
+	n->dead = 0;
+	/* Extra reference for the caller; the first one belongs to the table. */
+	neigh_hold(n);
+	rcu_assign_pointer(n->next,
+			   rcu_dereference_protected(nht->hash_buckets[hash_val],
+						     lockdep_is_held(&tbl->lock)));
+	rcu_assign_pointer(nht->hash_buckets[hash_val], n);
+	write_unlock_bh(&tbl->lock);
+	NEIGH_PRINTK2("neigh %p is created.\n", n);
+	rc = n;
+out:
+	return rc;
+out_tbl_unlock:
+	write_unlock_bh(&tbl->lock);
+out_neigh_release:
+	neigh_release(n);
+	goto out;
+}
+EXPORT_SYMBOL(neigh_create);
+
+/*
+ * Hash a proxy key to a bucket index in 0..PNEIGH_HASHMASK by folding the
+ * last four bytes of the key.
+ */
+static u32 pneigh_hash(const void *pkey, int key_len)
+{
+	u32 h = *(u32 *)(pkey + key_len - 4);
+
+	h ^= (h >> 16);
+	h ^= (h >> 8);
+	h ^= (h >> 4);
+
+	return h & PNEIGH_HASHMASK;
+}
+
+/*
+ * Walk a proxy bucket chain starting at n and return the first entry whose
+ * key and namespace match and whose device is either dev or unset.
+ * Returns NULL if the chain holds no such entry.
+ */
+static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
+					      struct net *net,
+					      const void *pkey,
+					      int key_len,
+					      struct net_device *dev)
+{
+	for (; n; n = n->next) {
+		if (memcmp(n->key, pkey, key_len))
+			continue;
+		if (!net_eq(pneigh_net(n), net))
+			continue;
+		if (n->dev == dev || !n->dev)
+			return n;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up a proxy entry without taking tbl->lock here.
+ * Returns the matching entry or NULL.
+ */
+struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
+		struct net *net, const void *pkey, struct net_device *dev)
+{
+	int key_len = tbl->key_len;
+	u32 hash_val = pneigh_hash(pkey, key_len);
+
+	return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+				 net, pkey, key_len, dev);
+}
+EXPORT_SYMBOL_GPL(__pneigh_lookup);
+
+/*
+ * Look up a proxy entry under tbl->lock and, if creat is non-zero and none
+ * exists, create one.  Creation asserts that RTNL is held and allocates
+ * with GFP_KERNEL.
+ *
+ * Returns the entry, or NULL if it was not found (creat == 0), the
+ * allocation failed, or tbl->pconstructor rejected the new entry.
+ */
+struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
+				    struct net *net, const void *pkey,
+				    struct net_device *dev, int creat)
+{
+	struct pneigh_entry *n;
+	int key_len = tbl->key_len;
+	u32 hash_val = pneigh_hash(pkey, key_len);
+
+	read_lock_bh(&tbl->lock);
+	n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+			      net, pkey, key_len, dev);
+	read_unlock_bh(&tbl->lock);
+
+	if (n || !creat)
+		goto out;
+
+	ASSERT_RTNL();
+
+	/* The key is stored in the bytes following the structure. */
+	n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
+	if (!n)
+		goto out;
+
+	write_pnet(&n->net, hold_net(net));
+	memcpy(n->key, pkey, key_len);
+	n->dev = dev;
+	if (dev)
+		dev_hold(dev);
+
+	/* Constructor failed: undo the references taken above. */
+	if (tbl->pconstructor && tbl->pconstructor(n)) {
+		if (dev)
+			dev_put(dev);
+		release_net(net);
+		kfree(n);
+		n = NULL;
+		goto out;
+	}
+
+	/* Insert at the head of the bucket chain. */
+	write_lock_bh(&tbl->lock);
+	n->next = tbl->phash_buckets[hash_val];
+	tbl->phash_buckets[hash_val] = n;
+	write_unlock_bh(&tbl->lock);
+out:
+	return n;
+}
+EXPORT_SYMBOL(pneigh_lookup);
+
+
+/*
+ * Remove the proxy entry that matches key, device and namespace exactly.
+ * The entry is unlinked under tbl->lock and torn down after the lock has
+ * been dropped.  Returns 0 on success, -ENOENT if no entry matched.
+ */
+int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
+		  struct net_device *dev)
+{
+	struct pneigh_entry *entry, **link;
+	int key_len = tbl->key_len;
+	u32 bucket = pneigh_hash(pkey, key_len);
+
+	write_lock_bh(&tbl->lock);
+	link = &tbl->phash_buckets[bucket];
+	while ((entry = *link) != NULL) {
+		if (memcmp(entry->key, pkey, key_len) || entry->dev != dev ||
+		    !net_eq(pneigh_net(entry), net)) {
+			link = &entry->next;
+			continue;
+		}
+
+		*link = entry->next;
+		write_unlock_bh(&tbl->lock);
+
+		if (tbl->pdestructor)
+			tbl->pdestructor(entry);
+		if (entry->dev)
+			dev_put(entry->dev);
+		release_net(pneigh_net(entry));
+		kfree(entry);
+		return 0;
+	}
+	write_unlock_bh(&tbl->lock);
+
+	return -ENOENT;
+}
+
+/*
+ * Remove every proxy entry bound to dev, or every proxy entry when dev is
+ * NULL.  Always returns -ENOENT, even when entries were removed.
+ */
+static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+	struct pneigh_entry *entry, **link;
+	u32 bucket;
+
+	for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
+		link = &tbl->phash_buckets[bucket];
+		while ((entry = *link) != NULL) {
+			/* Entry of another device: keep it. */
+			if (dev && entry->dev != dev) {
+				link = &entry->next;
+				continue;
+			}
+
+			*link = entry->next;
+			if (tbl->pdestructor)
+				tbl->pdestructor(entry);
+			if (entry->dev)
+				dev_put(entry->dev);
+			release_net(pneigh_net(entry));
+			kfree(entry);
+		}
+	}
+	return -ENOENT;
+}
+
+static void neigh_parms_destroy(struct neigh_parms *parms);
+
+/* Drop one reference on parms and destroy it when that was the last one. */
+static inline void neigh_parms_put(struct neigh_parms *parms)
+{
+	if (!atomic_dec_and_test(&parms->refcnt))
+		return;
+
+	neigh_parms_destroy(parms);
+}
+
+/* RCU callback: return the neighbour to its table's slab cache. */
+static void neigh_destroy_rcu(struct rcu_head *head)
+{
+	struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+	kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+}
+/*
+ *	Tear down a neighbour entry.  The neighbour must already be out of
+ *	the table (neigh->dead set); a live entry is reported with a warning
+ *	and a stack dump and is left untouched.
+ *
+ *	Detaches the cached hardware headers, purges the resolution queue,
+ *	drops the device and parms references, decrements the table's entry
+ *	count and frees the structure after an RCU grace period.
+ */
+void neigh_destroy(struct neighbour *neigh)
+{
+	struct hh_cache *hh;
+
+	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
+
+	if (!neigh->dead) {
+		printk(KERN_WARNING
+		       "Destroying alive neighbour %p\n", neigh);
+		dump_stack();
+		return;
+	}
+
+	/* No timer should still be pending at this point. */
+	if (neigh_del_timer(neigh))
+		printk(KERN_WARNING "Impossible event.\n");
+
+	/* Detach every cached header and redirect its output to the blackhole. */
+	while ((hh = neigh->hh) != NULL) {
+		neigh->hh = hh->hh_next;
+		hh->hh_next = NULL;
+
+		write_seqlock_bh(&hh->hh_lock);
+		hh->hh_output = neigh_blackhole;
+		write_sequnlock_bh(&hh->hh_lock);
+		hh_cache_put(hh);
+	}
+
+	skb_queue_purge(&neigh->arp_queue);
+
+	dev_put(neigh->dev);
+	neigh_parms_put(neigh->parms);
+
+	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
+
+	atomic_dec(&neigh->tbl->entries);
+	call_rcu(&neigh->rcu, neigh_destroy_rcu);
+}
+EXPORT_SYMBOL(neigh_destroy);
+
+/* The neighbour's state is in doubt: disable the fast path by pointing
+   the entry and all of its cached headers at ops->output.
+
+   Called with neigh write-locked.
+ */
+static void neigh_suspect(struct neighbour *neigh)
+{
+	struct hh_cache *hh = neigh->hh;
+
+	NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+
+	neigh->output = neigh->ops->output;
+
+	while (hh) {
+		hh->hh_output = neigh->ops->output;
+		hh = hh->hh_next;
+	}
+}
+
+/* The neighbour's state is good: enable the fast path by pointing the
+   entry at ops->connected_output and its cached headers at ops->hh_output.
+
+   Called with neigh write-locked.
+ */
+static void neigh_connect(struct neighbour *neigh)
+{
+	struct hh_cache *hh = neigh->hh;
+
+	NEIGH_PRINTK2("neigh %p is connected.\n", neigh);
+
+	neigh->output = neigh->ops->connected_output;
+
+	while (hh) {
+		hh->hh_output = neigh->ops->hh_output;
+		hh = hh->hh_next;
+	}
+}
+
+/*
+ * Periodic garbage collector, run from tbl->gc_work.
+ *
+ * Every 300 seconds it re-randomises reachable_time for all parms of the
+ * table.  It then scans all buckets and removes unreferenced entries that
+ * are NUD_FAILED or have not been used for gc_staletime; entries that are
+ * permanent or have a timer running are skipped.  Finally it re-arms itself
+ * to run again after base_reachable_time / 2.
+ */
+static void neigh_periodic_work(struct work_struct *work)
+{
+	struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+	struct neighbour *n;
+	struct neighbour __rcu **np;
+	unsigned int i;
+	struct neigh_hash_table *nht;
+
+	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
+
+	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+
+	/*
+	 *	periodically recompute ReachableTime from random function
+	 */
+
+	if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
+		struct neigh_parms *p;
+		tbl->last_rand = jiffies;
+		for (p = &tbl->parms; p; p = p->next)
+			p->reachable_time =
+				neigh_rand_reach_time(p->base_reachable_time);
+	}
+
+	for (i = 0 ; i <= nht->hash_mask; i++) {
+		np = &nht->hash_buckets[i];
+
+		while ((n = rcu_dereference_protected(*np,
+				lockdep_is_held(&tbl->lock))) != NULL) {
+			unsigned int state;
+
+			write_lock(&n->lock);
+
+			state = n->nud_state;
+			/* Permanent or timer-driven entries are left alone. */
+			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+				write_unlock(&n->lock);
+				goto next_elt;
+			}
+
+			/* A confirmation counts as use. */
+			if (time_before(n->used, n->confirmed))
+				n->used = n->confirmed;
+
+			if (atomic_read(&n->refcnt) == 1 &&
+			    (state == NUD_FAILED ||
+			     time_after(jiffies, n->used + n->parms->gc_staletime))) {
+				/* Unlink and release; np already points at the successor. */
+				*np = n->next;
+				n->dead = 1;
+				write_unlock(&n->lock);
+				neigh_cleanup_and_release(n);
+				continue;
+			}
+			write_unlock(&n->lock);
+
+next_elt:
+			np = &n->next;
+		}
+		/*
+		 * It's fine to release lock here, even if hash table
+		 * grows while we are preempted.
+		 */
+		write_unlock_bh(&tbl->lock);
+		cond_resched();
+		write_lock_bh(&tbl->lock);
+		/* Reload: the table may have been replaced while unlocked. */
+		nht = rcu_dereference_protected(tbl->nht,
+						lockdep_is_held(&tbl->lock));
+	}
+	/* Cycle through all hash buckets every base_reachable_time/2 ticks.
+	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
+	 * base_reachable_time.
+	 */
+	schedule_delayed_work(&tbl->gc_work,
+			      tbl->parms.base_reachable_time >> 1);
+	write_unlock_bh(&tbl->lock);
+}
+
+/*
+ * Number of probes allowed before resolution is declared failed: only the
+ * unicast budget in NUD_PROBE, otherwise the sum of the unicast,
+ * application and multicast budgets.
+ */
+static __inline__ int neigh_max_probes(struct neighbour *n)
+{
+	struct neigh_parms *parms = n->parms;
+
+	if (n->nud_state & NUD_PROBE)
+		return parms->ucast_probes;
+
+	return parms->ucast_probes + parms->app_probes + parms->mcast_probes;
+}
+
+/*
+ * Resolution failed: report an error for each queued packet through
+ * ops->error_report and then purge whatever is left in the queue.
+ *
+ * Per the __releases/__acquires annotations neigh->lock is held on entry
+ * and on return; it is dropped around each error_report call.
+ */
+static void neigh_invalidate(struct neighbour *neigh)
+	__releases(neigh->lock)
+	__acquires(neigh->lock)
+{
+	struct sk_buff *skb;
+
+	NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
+	NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
+	neigh->updated = jiffies;
+
+	/* It is very thin place. report_unreachable is very complicated
+	   routine. Particularly, it can hit the same neighbour entry!
+
+	   So that, we try to be accurate and avoid dead loop. --ANK
+	 */
+	/* The state is re-checked each round: it may change while unlocked. */
+	while (neigh->nud_state == NUD_FAILED &&
+	       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+		write_unlock(&neigh->lock);
+		neigh->ops->error_report(neigh, skb);
+		write_lock(&neigh->lock);
+	}
+	skb_queue_purge(&neigh->arp_queue);
+}
+
+/*
+ * Called when a timer expires for a neighbour entry.
+ *
+ * Advances the NUD state machine (REACHABLE -> DELAY or STALE, DELAY ->
+ * REACHABLE or PROBE), fails the entry once the probe budget is used up,
+ * re-arms the timer while the state still needs one and sends a
+ * solicitation in the INCOMPLETE and PROBE states.  Ends by dropping one
+ * reference (neigh_add_timer() takes one when arming the timer).
+ */
+
+static void neigh_timer_handler(unsigned long arg)
+{
+	unsigned long now, next;
+	struct neighbour *neigh = (struct neighbour *)arg;
+	unsigned state;
+	int notify = 0;
+
+	write_lock(&neigh->lock);
+
+	state = neigh->nud_state;
+	now = jiffies;
+	next = now + HZ;
+
+	/* The state no longer wants a timer: nothing to do. */
+	if (!(state & NUD_IN_TIMER)) {
+#ifndef CONFIG_SMP
+		printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
+#endif
+		goto out;
+	}
+
+	if (state & NUD_REACHABLE) {
+		if (time_before_eq(now,
+				   neigh->confirmed + neigh->parms->reachable_time)) {
+			/* Confirmed recently enough: just wait longer. */
+			NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
+			next = neigh->confirmed + neigh->parms->reachable_time;
+		} else if (time_before_eq(now,
+					  neigh->used + neigh->parms->delay_probe_time)) {
+			/* Not confirmed but recently used: go to DELAY. */
+			NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+			neigh->nud_state = NUD_DELAY;
+			neigh->updated = jiffies;
+			neigh_suspect(neigh);
+			next = now + neigh->parms->delay_probe_time;
+		} else {
+			/* Neither confirmed nor used: go to STALE (no timer). */
+			NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
+			neigh->nud_state = NUD_STALE;
+			neigh->updated = jiffies;
+			neigh_suspect(neigh);
+			notify = 1;
+		}
+	} else if (state & NUD_DELAY) {
+		if (time_before_eq(now,
+				   neigh->confirmed + neigh->parms->delay_probe_time)) {
+			/* A confirmation arrived during the delay. */
+			NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh);
+			neigh->nud_state = NUD_REACHABLE;
+			neigh->updated = jiffies;
+			neigh_connect(neigh);
+			notify = 1;
+			next = neigh->confirmed + neigh->parms->reachable_time;
+		} else {
+			/* Still unconfirmed: start probing from zero. */
+			NEIGH_PRINTK2("neigh %p is probed.\n", neigh);
+			neigh->nud_state = NUD_PROBE;
+			neigh->updated = jiffies;
+			atomic_set(&neigh->probes, 0);
+			next = now + neigh->parms->retrans_time;
+		}
+	} else {
+		/* NUD_PROBE|NUD_INCOMPLETE */
+		next = now + neigh->parms->retrans_time;
+	}
+
+	/* Probe budget exhausted: the entry fails. */
+	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
+	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
+		neigh->nud_state = NUD_FAILED;
+		notify = 1;
+		neigh_invalidate(neigh);
+	}
+
+	if (neigh->nud_state & NUD_IN_TIMER) {
+		/* Never re-arm sooner than HZ/2 from now. */
+		if (time_before(next, jiffies + HZ/2))
+			next = jiffies + HZ/2;
+		/* The timer was not pending: take a reference for it. */
+		if (!mod_timer(&neigh->timer, next))
+			neigh_hold(neigh);
+	}
+	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
+		struct sk_buff *skb = skb_peek(&neigh->arp_queue);
+		/* keep skb alive even if arp_queue overflows */
+		if (skb)
+			skb = skb_copy(skb, GFP_ATOMIC);
+		/* Solicit with the lock released. */
+		write_unlock(&neigh->lock);
+		neigh->ops->solicit(neigh, skb);
+		atomic_inc(&neigh->probes);
+		kfree_skb(skb);
+	} else {
+out:
+		write_unlock(&neigh->lock);
+	}
+
+	if (notify)
+		neigh_update_notify(neigh);
+
+	neigh_release(neigh);
+}
+
+/*
+ * Slow path taken when a packet is to be sent through neigh: start or
+ * continue address resolution as the current state requires.
+ *
+ * Returns 0 when the state is CONNECTED, DELAY or PROBE, or when a STALE
+ * entry was moved to DELAY here.  Returns 1 when the entry is INCOMPLETE
+ * (skb, if any, was queued on arp_queue) or when resolution cannot be
+ * attempted at all and skb was freed.
+ */
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+{
+	int rc;
+	unsigned long now;
+
+	write_lock_bh(&neigh->lock);
+
+	rc = 0;
+	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
+		goto out_unlock_bh;
+
+	now = jiffies;
+
+	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
+		if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
+			/*
+			 * Start resolution.  The counter starts at
+			 * ucast_probes, so only the multicast and
+			 * application budgets remain.
+			 */
+			atomic_set(&neigh->probes, neigh->parms->ucast_probes);
+			neigh->nud_state     = NUD_INCOMPLETE;
+			neigh->updated = jiffies;
+			neigh_add_timer(neigh, now + 1);
+		} else {
+			/* No probes configured: fail immediately, drop skb. */
+			neigh->nud_state = NUD_FAILED;
+			neigh->updated = jiffies;
+			write_unlock_bh(&neigh->lock);
+
+			kfree_skb(skb);
+			return 1;
+		}
+	} else if (neigh->nud_state & NUD_STALE) {
+		NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
+		neigh->nud_state = NUD_DELAY;
+		neigh->updated = jiffies;
+		neigh_add_timer(neigh,
+				jiffies + neigh->parms->delay_probe_time);
+	}
+
+	if (neigh->nud_state == NUD_INCOMPLETE) {
+		if (skb) {
+			/* Queue full: discard the oldest packet to make room. */
+			if (skb_queue_len(&neigh->arp_queue) >=
+			    neigh->parms->queue_len) {
+				struct sk_buff *buff;
+				buff = __skb_dequeue(&neigh->arp_queue);
+				kfree_skb(buff);
+				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
+			}
+			skb_dst_force(skb);
+			__skb_queue_tail(&neigh->arp_queue, skb);
+		}
+		rc = 1;
+	}
+out_unlock_bh:
+	write_unlock_bh(&neigh->lock);
+	return rc;
+}
+EXPORT_SYMBOL(__neigh_event_send);
+/* Refresh every cached hardware header of @neigh with the current neigh->ha. */
+static void neigh_update_hhs(const struct neighbour *neigh)
+{
+	struct hh_cache *hh;
+	void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
+		= NULL;
+
+	if (neigh->dev->header_ops)
+		update = neigh->dev->header_ops->cache_update;
+
+	if (update) {	/* devices without a cache_update hook are left alone */
+		for (hh = neigh->hh; hh; hh = hh->hh_next) {
+			write_seqlock_bh(&hh->hh_lock);
+			update(hh, neigh->dev, neigh->ha);
+			write_sequnlock_bh(&hh->hh_lock);
+		}
+	}
+}
+
+
+
+/* Generic update routine.
+   -- lladdr is the new link-layer address, or NULL if none is supplied.
+   -- new    is the new NUD state.
+   -- flags
+	NEIGH_UPDATE_F_OVERRIDE allows overriding an existing lladdr
+				if it is different.
+	NEIGH_UPDATE_F_WEAK_OVERRIDE makes an existing "connected"
+				lladdr suspect instead of overriding it
+				if it is different.
+				It also allows retaining the current state
+				if lladdr is unchanged.
+	NEIGH_UPDATE_F_ADMIN	means that the change is administrative.
+
+	NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows overriding the existing
+				NTF_ROUTER flag.
+	NEIGH_UPDATE_F_ISROUTER	indicates that the neighbour is known to be
+				a router.
+   Returns 0, or -EPERM / -EINVAL when the request is rejected.
+   Caller MUST hold reference count on the entry.
+ */
+
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+		 u32 flags)
+{
+	u8 old;
+	int err;
+	int notify = 0;
+	struct net_device *dev;
+	int update_isrouter = 0;
+
+	write_lock_bh(&neigh->lock);
+
+	dev    = neigh->dev;
+	old    = neigh->nud_state;
+	err    = -EPERM;
+
+	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+	    (old & (NUD_NOARP | NUD_PERMANENT)))
+		goto out;	/* only admin changes may touch NOARP/PERMANENT */
+
+	if (!(new & NUD_VALID)) {
+		neigh_del_timer(neigh);
+		if (old & NUD_CONNECTED)
+			neigh_suspect(neigh);
+		neigh->nud_state = new;
+		err = 0;
+		notify = old & NUD_VALID;
+		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
+		    (new & NUD_FAILED)) {
+			neigh_invalidate(neigh);
+			notify = 1;
+		}
+		goto out;
+	}
+
+	/* Compare new lladdr with cached one */
+	if (!dev->addr_len) {
+		/* First case: device needs no address. */
+		lladdr = neigh->ha;
+	} else if (lladdr) {
+		/* The second case: if something is already cached
+		   and a new address is proposed:
+		   - compare new & old
+		   - if they are different, check override flag
+		 */
+		if ((old & NUD_VALID) &&
+		    !memcmp(lladdr, neigh->ha, dev->addr_len))
+			lladdr = neigh->ha;	/* pointer equality now means "unchanged" */
+	} else {
+		/* No address is supplied; if we know something,
+		   use it, otherwise discard the request.
+		 */
+		err = -EINVAL;
+		if (!(old & NUD_VALID))
+			goto out;
+		lladdr = neigh->ha;
+	}
+
+	if (new & NUD_CONNECTED)
+		neigh->confirmed = jiffies;
+	neigh->updated = jiffies;
+
+	/* If entry was valid and address is not changed,
+	   do not change entry state, if new one is STALE.
+	 */
+	err = 0;
+	update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
+	if (old & NUD_VALID) {
+		if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
+			update_isrouter = 0;
+			if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
+			    (old & NUD_CONNECTED)) {
+				lladdr = neigh->ha;
+				new = NUD_STALE;
+			} else
+				goto out;	/* different lladdr ignored; err stays 0 */
+		} else {
+			if (lladdr == neigh->ha && new == NUD_STALE &&
+			    ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
+			     (old & NUD_CONNECTED))
+			    )
+				new = old;
+		}
+	}
+
+	if (new != old) {
+		neigh_del_timer(neigh);
+		if (new & NUD_IN_TIMER)
+			neigh_add_timer(neigh, (jiffies +
+						((new & NUD_REACHABLE) ?
+						 neigh->parms->reachable_time :
+						 0)));
+		neigh->nud_state = new;
+	}
+
+	if (lladdr != neigh->ha) {
+		write_seqlock(&neigh->ha_lock);	/* readers retry via read_seqretry() */
+		memcpy(&neigh->ha, lladdr, dev->addr_len);
+		write_sequnlock(&neigh->ha_lock);
+		neigh_update_hhs(neigh);
+		if (!(new & NUD_CONNECTED))
+			neigh->confirmed = jiffies -
+				      (neigh->parms->base_reachable_time << 1);
+		notify = 1;
+	}
+	if (new == old)
+		goto out;
+	if (new & NUD_CONNECTED)
+		neigh_connect(neigh);
+	else
+		neigh_suspect(neigh);
+	if (!(old & NUD_VALID)) {
+		struct sk_buff *skb;
+
+		/* Again: avoid dead loop if something went wrong */
+
+		while (neigh->nud_state & NUD_VALID &&
+		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+			struct dst_entry *dst = skb_dst(skb);
+			struct neighbour *n2, *n1 = neigh;
+			write_unlock_bh(&neigh->lock);	/* output() runs unlocked */
+
+			rcu_read_lock();
+			/* On shaper/eql skb->dst->neighbour != neigh :( */
+			if (dst && (n2 = dst_get_neighbour(dst)) != NULL)
+				n1 = n2;
+			n1->output(skb);
+			rcu_read_unlock();
+
+			write_lock_bh(&neigh->lock);
+		}
+		skb_queue_purge(&neigh->arp_queue);	/* drop whatever is still queued */
+	}
+out:
+	if (update_isrouter) {
+		neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
+			(neigh->flags | NTF_ROUTER) :
+			(neigh->flags & ~NTF_ROUTER);
+	}
+	write_unlock_bh(&neigh->lock);
+
+	if (notify)
+		neigh_update_notify(neigh);
+
+	return err;
+}
+EXPORT_SYMBOL(neigh_update);
+/* Look up the entry for @saddr on @dev and update it with @lladdr as NUD_STALE. */
+struct neighbour *neigh_event_ns(struct neigh_table *tbl,
+				 u8 *lladdr, void *saddr,
+				 struct net_device *dev)
+{
+	int creat = lladdr || !dev->addr_len;	/* passed as __neigh_lookup()'s last argument */
+	struct neighbour *n = __neigh_lookup(tbl, saddr, dev, creat);
+	if (!n)
+		return NULL;
+	neigh_update(n, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE);
+	return n;
+}
+EXPORT_SYMBOL(neigh_event_ns);
+/* Find an hh_cache of @n for @protocol and install it in dst->hh if that is unset. */
+static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
+				   __be16 protocol)
+{
+	struct hh_cache *hh;
+
+	smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
+	for (hh = n->hh; hh; hh = hh->hh_next) {
+		if (hh->hh_type == protocol) {
+			atomic_inc(&hh->hh_refcnt);	/* reference for dst->hh */
+			if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+				hh_cache_put(hh);	/* dst->hh already set: undo */
+			return true;
+		}
+	}
+	return false;
+}
+/* Create the hh_cache of @n for @protocol (unless one exists) and hook it to @dst. */
+/* Takes n->lock for writing itself, so the caller must not hold n->lock. */
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
+			  __be16 protocol)
+{
+	struct hh_cache	*hh;
+	struct net_device *dev = dst->dev;
+
+	if (likely(neigh_hh_lookup(n, dst, protocol)))
+		return;
+
+	/* slow path */
+	hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
+	if (!hh)
+		return;
+
+	seqlock_init(&hh->hh_lock);
+	hh->hh_type = protocol;
+	atomic_set(&hh->hh_refcnt, 2);	/* presumably one for n->hh, one for dst->hh */
+
+	if (dev->header_ops->cache(n, hh)) {
+		kfree(hh);	/* not published yet, plain kfree is enough */
+		return;
+	}
+
+	write_lock_bh(&n->lock);
+
+	/* must check if another thread already did the insert */
+	if (neigh_hh_lookup(n, dst, protocol)) {
+		kfree(hh);
+		goto end;
+	}
+
+	if (n->nud_state & NUD_CONNECTED)
+		hh->hh_output = n->ops->hh_output;
+	else
+		hh->hh_output = n->ops->output;
+
+	hh->hh_next = n->hh;
+	smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
+	n->hh	    = hh;
+
+	if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+		hh_cache_put(hh);	/* dst->hh was set meanwhile: drop its ref */
+end:
+	write_unlock_bh(&n->lock);
+}
+
+/* This function can be used in contexts where only the old dev_queue_xmit
+ * worked, e.g. if you want to override the normal output path (eql, shaper)
+ * but resolution has not been done yet.
+ */
+
+int neigh_compat_output(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+
+	__skb_pull(skb, skb_network_offset(skb));
+
+	if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL,
+			    skb->len) < 0 &&
+	    dev->header_ops->rebuild(skb))
+		return 0;	/* NOTE(review): skb presumably handed to rebuild() - confirm */
+
+	return dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL(neigh_compat_output);
+
+/* Slow and careful: resolve the neighbour if needed, then build the header
+ * under ha_lock and transmit.  Frees the skb and returns -EINVAL on error. */
+int neigh_resolve_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct neighbour *neigh = dst ? dst_get_neighbour(dst) : NULL;
+	int rc = 0;
+
+	if (!dst || !neigh)	/* checked before anything is dereferenced */
+		goto discard;
+
+	__skb_pull(skb, skb_network_offset(skb));
+
+	if (!neigh_event_send(neigh, skb)) {
+		int err;
+		struct net_device *dev = neigh->dev;
+		unsigned int seq;
+
+		if (dev->header_ops->cache &&
+		    !dst->hh &&
+		    !(dst->flags & DST_NOCACHE))
+			neigh_hh_init(neigh, dst, dst->ops->protocol);
+
+		do {	/* rebuild the header if neigh->ha changed underneath us */
+			seq = read_seqbegin(&neigh->ha_lock);
+			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+					      neigh->ha, NULL, skb->len);
+		} while (read_seqretry(&neigh->ha_lock, seq));
+
+		if (err >= 0)
+			rc = neigh->ops->queue_xmit(skb);
+		else
+			goto out_kfree_skb;
+	}
+out:
+	return rc;	/* still 0 when neigh_event_send() kept the skb */
+discard:
+	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n",
+		      dst, neigh);
+out_kfree_skb:
+	rc = -EINVAL;
+	kfree_skb(skb);
+	goto out;
+}
+EXPORT_SYMBOL(neigh_resolve_output);
+
+/* As fast as possible without hh cache: build the header and transmit. */
+
+int neigh_connected_output(struct sk_buff *skb)
+{
+	int err;
+	struct dst_entry *dst = skb_dst(skb);
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	struct net_device *dev = neigh->dev;
+	unsigned int seq;
+
+	__skb_pull(skb, skb_network_offset(skb));
+
+	do {	/* redo the header if neigh->ha was rewritten meanwhile */
+		seq = read_seqbegin(&neigh->ha_lock);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+	} while (read_seqretry(&neigh->ha_lock, seq));
+
+	if (err >= 0)
+		err = neigh->ops->queue_xmit(skb);
+	else {
+		err = -EINVAL;
+		kfree_skb(skb);	/* header construction failed: drop */
+	}
+	return err;
+}
+EXPORT_SYMBOL(neigh_connected_output);
+/* proxy_timer callback: replay or drop every due skb on proxy_queue, then re-arm. */
+static void neigh_proxy_process(unsigned long arg)
+{
+	struct neigh_table *tbl = (struct neigh_table *)arg;
+	long sched_next = 0;	/* smallest remaining delay seen; 0 means none */
+	unsigned long now = jiffies;
+	struct sk_buff *skb, *n;
+
+	spin_lock(&tbl->proxy_queue.lock);
+
+	skb_queue_walk_safe(&tbl->proxy_queue, skb, n) {
+		long tdif = NEIGH_CB(skb)->sched_next - now;
+
+		if (tdif <= 0) {	/* this skb is due */
+			struct net_device *dev = skb->dev;
+
+			__skb_unlink(skb, &tbl->proxy_queue);
+			if (tbl->proxy_redo && netif_running(dev)) {
+				rcu_read_lock();
+				tbl->proxy_redo(skb);
+				rcu_read_unlock();
+			} else {
+				kfree_skb(skb);
+			}
+
+			dev_put(dev);	/* pairs with dev_hold() in pneigh_enqueue() */
+		} else if (!sched_next || tdif < sched_next)
+			sched_next = tdif;
+	}
+	del_timer(&tbl->proxy_timer);
+	if (sched_next)
+		mod_timer(&tbl->proxy_timer, jiffies + sched_next);
+	spin_unlock(&tbl->proxy_queue.lock);
+}
+/* Queue @skb on tbl->proxy_queue, due after a random delay below p->proxy_delay. */
+void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
+		    struct sk_buff *skb)
+{
+	unsigned long now = jiffies;	/* NOTE(review): proxy_delay == 0 divides by zero below */
+	unsigned long sched_next = now + (net_random() % p->proxy_delay);
+
+	if (tbl->proxy_queue.qlen > p->proxy_qlen) {
+		kfree_skb(skb);	/* queue is over its limit: drop */
+		return;
+	}
+
+	NEIGH_CB(skb)->sched_next = sched_next;
+	NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
+
+	spin_lock(&tbl->proxy_queue.lock);
+	if (del_timer(&tbl->proxy_timer)) {	/* keep an earlier pending expiry */
+		if (time_before(tbl->proxy_timer.expires, sched_next))
+			sched_next = tbl->proxy_timer.expires;
+	}
+	skb_dst_drop(skb);
+	dev_hold(skb->dev);	/* released by dev_put() in neigh_proxy_process() */
+	__skb_queue_tail(&tbl->proxy_queue, skb);
+	mod_timer(&tbl->proxy_timer, sched_next);
+	spin_unlock(&tbl->proxy_queue.lock);
+}
+EXPORT_SYMBOL(pneigh_enqueue);
+/* Return tbl's parms for @ifindex in @net, the device-less parms for 0, else NULL. */
+static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
+						      struct net *net, int ifindex)
+{
+	struct neigh_parms *cur = &tbl->parms;
+
+	while (cur) {
+		if (cur->dev ? (cur->dev->ifindex == ifindex &&
+				net_eq(neigh_parms_net(cur), net)) : !ifindex)
+			break;
+		cur = cur->next;
+	}
+	return cur;
+}
+/* Clone the table's device-less parms for @dev and link the copy into tbl's list. */
+struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
+				      struct neigh_table *tbl)
+{
+	struct neigh_parms *p, *ref;
+	struct net *net = dev_net(dev);
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	ref = lookup_neigh_parms(tbl, net, 0);	/* ifindex 0: entry without a device */
+	if (!ref)
+		return NULL;
+
+	p = kmemdup(ref, sizeof(*p), GFP_KERNEL);
+	if (p) {
+		p->tbl		  = tbl;
+		atomic_set(&p->refcnt, 1);
+		p->reachable_time =
+				neigh_rand_reach_time(p->base_reachable_time);
+
+		if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
+			kfree(p);	/* the driver hook rejected these parms */
+			return NULL;
+		}
+
+		dev_hold(dev);
+		p->dev = dev;
+		write_pnet(&p->net, hold_net(net));
+		p->sysctl_table = NULL;
+		write_lock_bh(&tbl->lock);
+		p->next		= tbl->parms.next;
+		tbl->parms.next = p;
+		write_unlock_bh(&tbl->lock);
+	}
+	return p;
+}
+EXPORT_SYMBOL(neigh_parms_alloc);
+/* RCU callback queued by neigh_parms_release(): drop a reference on the parms. */
+static void neigh_rcu_free_parms(struct rcu_head *head)
+{
+	struct neigh_parms *parms =
+		container_of(head, struct neigh_parms, rcu_head);
+
+	neigh_parms_put(parms);
+}
+/* Unlink @parms from @tbl, mark it dead and drop a reference via call_rcu(). */
+void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
+{
+	struct neigh_parms **p;
+
+	if (!parms || parms == &tbl->parms)	/* the table's own parms are never released */
+		return;
+	write_lock_bh(&tbl->lock);
+	for (p = &tbl->parms.next; *p; p = &(*p)->next) {
+		if (*p == parms) {
+			*p = parms->next;
+			parms->dead = 1;
+			write_unlock_bh(&tbl->lock);
+			if (parms->dev)
+				dev_put(parms->dev);	/* pairs with dev_hold() in neigh_parms_alloc() */
+			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
+			return;
+		}
+	}
+	write_unlock_bh(&tbl->lock);
+	NEIGH_PRINTK1("neigh_parms_release: not found\n");
+}
+EXPORT_SYMBOL(neigh_parms_release);
+/* Free @parms and drop the netns reference taken in neigh_parms_alloc(). */
+static void neigh_parms_destroy(struct neigh_parms *parms)
+{
+	release_net(neigh_parms_net(parms));
+	kfree(parms);
+}
+
+static struct lock_class_key neigh_table_proxy_queue_class;	/* class passed for proxy_queue */
+/* Set up @tbl (stats, hashes, GC work, proxy timer) without adding it to neigh_tables. */
+void neigh_table_init_no_netlink(struct neigh_table *tbl)
+{
+	unsigned long now = jiffies;
+	unsigned long phsize;
+
+	write_pnet(&tbl->parms.net, &init_net);
+	atomic_set(&tbl->parms.refcnt, 1);
+	tbl->parms.reachable_time =
+			  neigh_rand_reach_time(tbl->parms.base_reachable_time);
+
+	if (!tbl->kmem_cachep)
+		tbl->kmem_cachep =
+			kmem_cache_create(tbl->id, tbl->entry_size, 0,
+					  SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					  NULL);
+	tbl->stats = alloc_percpu(struct neigh_statistics);
+	if (!tbl->stats)
+		panic("cannot create neighbour cache statistics");
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat,
+			      &neigh_stat_seq_fops, tbl))
+		panic("cannot create neighbour proc dir entry");
+#endif
+
+	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8));
+
+	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
+	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
+
+	if (!tbl->nht || !tbl->phash_buckets)	/* any allocation failure here is fatal */
+		panic("cannot allocate neighbour cache hashes");
+
+	rwlock_init(&tbl->lock);
+	INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
+	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
+	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
+	skb_queue_head_init_class(&tbl->proxy_queue,
+			&neigh_table_proxy_queue_class);
+
+	tbl->last_flush = now;
+	tbl->last_rand	= now + tbl->parms.reachable_time * 20;
+}
+EXPORT_SYMBOL(neigh_table_init_no_netlink);
+/* Initialise @tbl and push it onto the global neigh_tables list. */
+void neigh_table_init(struct neigh_table *tbl)
+{
+	struct neigh_table *tmp;
+
+	neigh_table_init_no_netlink(tbl);
+	write_lock(&neigh_tbl_lock);
+	for (tmp = neigh_tables; tmp; tmp = tmp->next) {
+		if (tmp->family == tbl->family)
+			break;
+	}
+	tbl->next	= neigh_tables;	/* linked in even if the family is a duplicate */
+	neigh_tables	= tbl;
+	write_unlock(&neigh_tbl_lock);
+
+	if (unlikely(tmp)) {	/* a table of this family was already on the list */
+		printk(KERN_ERR "NEIGH: Registering multiple tables for "
+		       "family %d\n", tbl->family);
+		dump_stack();
+	}
+}
+EXPORT_SYMBOL(neigh_table_init);
+/* Tear down @tbl: stop its work and timer, flush it, unlink it, free its resources. */
+int neigh_table_clear(struct neigh_table *tbl)
+{
+	struct neigh_table **tp;
+
+	/* It is not clean... Fix it to unload IPv6 module safely */
+	cancel_delayed_work_sync(&tbl->gc_work);
+	del_timer_sync(&tbl->proxy_timer);
+	pneigh_queue_purge(&tbl->proxy_queue);
+	neigh_ifdown(tbl, NULL);
+	if (atomic_read(&tbl->entries))	/* entries survived the flush above */
+		printk(KERN_CRIT "neighbour leakage\n");
+	write_lock(&neigh_tbl_lock);
+	for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
+		if (*tp == tbl) {
+			*tp = tbl->next;
+			break;
+		}
+	}
+	write_unlock(&neigh_tbl_lock);
+
+	call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+		 neigh_hash_free_rcu);
+	tbl->nht = NULL;
+
+	kfree(tbl->phash_buckets);
+	tbl->phash_buckets = NULL;
+
+	remove_proc_entry(tbl->id, init_net.proc_net_stat);
+
+	free_percpu(tbl->stats);
+	tbl->stats = NULL;
+
+	kmem_cache_destroy(tbl->kmem_cachep);
+	tbl->kmem_cachep = NULL;
+
+	return 0;	/* no failure path */
+}
+EXPORT_SYMBOL(neigh_table_clear);
+/* Netlink handler: delete a proxy entry, or force a neighbour entry to NUD_FAILED. */
+static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ndmsg *ndm;
+	struct nlattr *dst_attr;
+	struct neigh_table *tbl;
+	struct net_device *dev = NULL;
+	int err = -EINVAL;
+
+	ASSERT_RTNL();
+	if (nlmsg_len(nlh) < sizeof(*ndm))
+		goto out;
+
+	dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
+	if (dst_attr == NULL)
+		goto out;
+
+	ndm = nlmsg_data(nlh);
+	if (ndm->ndm_ifindex) {
+		dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+		if (dev == NULL) {
+			err = -ENODEV;
+			goto out;
+		}
+	}
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+		struct neighbour *neigh;
+
+		if (tbl->family != ndm->ndm_family)
+			continue;
+		read_unlock(&neigh_tbl_lock);	/* first matching family wins; every path below exits */
+
+		if (nla_len(dst_attr) < tbl->key_len)
+			goto out;
+
+		if (ndm->ndm_flags & NTF_PROXY) {
+			err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
+			goto out;
+		}
+
+		if (dev == NULL)	/* non-proxy entries need a device */
+			goto out;
+
+		neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+		if (neigh == NULL) {
+			err = -ENOENT;
+			goto out;
+		}
+
+		err = neigh_update(neigh, NULL, NUD_FAILED,
+				   NEIGH_UPDATE_F_OVERRIDE |
+				   NEIGH_UPDATE_F_ADMIN);
+		neigh_release(neigh);
+		goto out;
+	}
+	read_unlock(&neigh_tbl_lock);
+	err = -EAFNOSUPPORT;	/* no table registered for ndm_family */
+
+out:
+	return err;
+}
+/* Netlink handler: create or update a neighbour or proxy entry named by NDA_DST. */
+static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ndmsg *ndm;
+	struct nlattr *tb[NDA_MAX+1];
+	struct neigh_table *tbl;
+	struct net_device *dev = NULL;
+	int err;
+
+	ASSERT_RTNL();
+	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
+	if (err < 0)
+		goto out;
+
+	err = -EINVAL;
+	if (tb[NDA_DST] == NULL)
+		goto out;
+
+	ndm = nlmsg_data(nlh);
+	if (ndm->ndm_ifindex) {
+		dev = __dev_get_by_index(net, ndm->ndm_ifindex);
+		if (dev == NULL) {
+			err = -ENODEV;
+			goto out;
+		}
+
+		if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+			goto out;	/* lladdr shorter than the device needs: -EINVAL */
+	}
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+		int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+		struct neighbour *neigh;
+		void *dst, *lladdr;
+
+		if (tbl->family != ndm->ndm_family)
+			continue;
+		read_unlock(&neigh_tbl_lock);	/* first matching family wins; every path below exits */
+
+		if (nla_len(tb[NDA_DST]) < tbl->key_len)
+			goto out;
+		dst = nla_data(tb[NDA_DST]);
+		lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
+
+		if (ndm->ndm_flags & NTF_PROXY) {
+			struct pneigh_entry *pn;
+
+			err = -ENOBUFS;
+			pn = pneigh_lookup(tbl, net, dst, dev, 1);
+			if (pn) {
+				pn->flags = ndm->ndm_flags;
+				err = 0;
+			}
+			goto out;
+		}
+
+		if (dev == NULL)	/* non-proxy entries need a device */
+			goto out;
+
+		neigh = neigh_lookup(tbl, dst, dev);
+		if (neigh == NULL) {
+			if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+				err = -ENOENT;
+				goto out;
+			}
+
+			neigh = __neigh_lookup_errno(tbl, dst, dev);
+			if (IS_ERR(neigh)) {
+				err = PTR_ERR(neigh);
+				goto out;
+			}
+		} else {
+			if (nlh->nlmsg_flags & NLM_F_EXCL) {
+				err = -EEXIST;
+				neigh_release(neigh);
+				goto out;
+			}
+
+			if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
+				flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+		}
+
+		if (ndm->ndm_flags & NTF_USE) {
+			neigh_event_send(neigh, NULL);	/* no neigh_update() on this path */
+			err = 0;
+		} else
+			err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
+		neigh_release(neigh);
+		goto out;
+	}
+
+	read_unlock(&neigh_tbl_lock);
+	err = -EAFNOSUPPORT;	/* no table registered for ndm_family */
+out:
+	return err;
+}
+/* Append a nested NDTA_PARMS attribute describing @parms to @skb. */
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NDTA_PARMS);
+	if (nest == NULL)
+		return -ENOBUFS;
+
+	if (parms->dev)	/* the ifindex is only reported for per-device parms */
+		NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
+
+	NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
+	NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
+	NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
+	NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
+	NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
+	NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
+		      parms->base_reachable_time);
+	NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
+	NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
+	NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
+	NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
+	NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
+	NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:	/* only reachable from inside the NLA_PUT* macros */
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+/* Fill a full table message for @tbl: thresholds, config, summed stats, own parms. */
+static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
+			      u32 pid, u32 seq, int type, int flags)
+{
+	struct nlmsghdr *nlh;
+	struct ndtmsg *ndtmsg;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndtmsg = nlmsg_data(nlh);
+
+	read_lock_bh(&tbl->lock);	/* held while the table fields are read */
+	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1   = 0;
+	ndtmsg->ndtm_pad2   = 0;
+
+	NLA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+	NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
+	NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
+	NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
+	NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
+
+	{
+		unsigned long now = jiffies;
+		unsigned int flush_delta = now - tbl->last_flush;
+		unsigned int rand_delta = now - tbl->last_rand;
+		struct neigh_hash_table *nht;
+		struct ndt_config ndc = {
+			.ndtc_key_len		= tbl->key_len,
+			.ndtc_entry_size	= tbl->entry_size,
+			.ndtc_entries		= atomic_read(&tbl->entries),
+			.ndtc_last_flush	= jiffies_to_msecs(flush_delta),
+			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
+			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
+		};
+
+		rcu_read_lock_bh();	/* tbl->nht is an RCU-managed pointer */
+		nht = rcu_dereference_bh(tbl->nht);
+		ndc.ndtc_hash_rnd = nht->hash_rnd;
+		ndc.ndtc_hash_mask = nht->hash_mask;
+		rcu_read_unlock_bh();
+
+		NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+	}
+
+	{
+		int cpu;
+		struct ndt_stats ndst;
+
+		memset(&ndst, 0, sizeof(ndst));
+
+		for_each_possible_cpu(cpu) {	/* sum the per-cpu counters */
+			struct neigh_statistics	*st;
+
+			st = per_cpu_ptr(tbl->stats, cpu);
+			ndst.ndts_allocs		+= st->allocs;
+			ndst.ndts_destroys		+= st->destroys;
+			ndst.ndts_hash_grows		+= st->hash_grows;
+			ndst.ndts_res_failed		+= st->res_failed;
+			ndst.ndts_lookups		+= st->lookups;
+			ndst.ndts_hits			+= st->hits;
+			ndst.ndts_rcv_probes_mcast	+= st->rcv_probes_mcast;
+			ndst.ndts_rcv_probes_ucast	+= st->rcv_probes_ucast;
+			ndst.ndts_periodic_gc_runs	+= st->periodic_gc_runs;
+			ndst.ndts_forced_gc_runs	+= st->forced_gc_runs;
+		}
+
+		NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+	}
+
+	BUG_ON(tbl->parms.dev);	/* the table's own parms must have no device */
+	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+		goto nla_put_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:	/* also the target of the NLA_PUT* macros above */
+	read_unlock_bh(&tbl->lock);
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+/* Fill a table message carrying only the table name and the contents of @parms. */
+static int neightbl_fill_param_info(struct sk_buff *skb,
+				    struct neigh_table *tbl,
+				    struct neigh_parms *parms,
+				    u32 pid, u32 seq, int type,
+				    unsigned int flags)
+{
+	struct ndtmsg *ndtmsg;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndtmsg = nlmsg_data(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1   = 0;
+	ndtmsg->ndtm_pad2   = 0;
+
+	if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 ||
+	    neightbl_fill_parms(skb, parms) < 0)
+		goto errout;
+
+	read_unlock_bh(&tbl->lock);
+	return nlmsg_end(skb, nlh);
+errout:
+	read_unlock_bh(&tbl->lock);
+	nlmsg_cancel(skb, nlh);	/* remove the partially built message */
+	return -EMSGSIZE;
+}
+/* Policy for the top-level NDTA_* attributes parsed by neightbl_set(). */
+static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
+	[NDTA_NAME]		= { .type = NLA_STRING },
+	[NDTA_THRESH1]		= { .type = NLA_U32 },
+	[NDTA_THRESH2]		= { .type = NLA_U32 },
+	[NDTA_THRESH3]		= { .type = NLA_U32 },
+	[NDTA_GC_INTERVAL]	= { .type = NLA_U64 },
+	[NDTA_PARMS]		= { .type = NLA_NESTED },
+};
+/* Policy for the NDTPA_* attributes nested inside NDTA_PARMS. */
+static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
+	[NDTPA_IFINDEX]			= { .type = NLA_U32 },
+	[NDTPA_QUEUE_LEN]		= { .type = NLA_U32 },
+	[NDTPA_PROXY_QLEN]		= { .type = NLA_U32 },
+	[NDTPA_APP_PROBES]		= { .type = NLA_U32 },
+	[NDTPA_UCAST_PROBES]		= { .type = NLA_U32 },
+	[NDTPA_MCAST_PROBES]		= { .type = NLA_U32 },
+	[NDTPA_BASE_REACHABLE_TIME]	= { .type = NLA_U64 },
+	[NDTPA_GC_STALETIME]		= { .type = NLA_U64 },
+	[NDTPA_DELAY_PROBE_TIME]	= { .type = NLA_U64 },
+	[NDTPA_RETRANS_TIME]		= { .type = NLA_U64 },
+	[NDTPA_ANYCAST_DELAY]		= { .type = NLA_U64 },
+	[NDTPA_PROXY_DELAY]		= { .type = NLA_U64 },
+	[NDTPA_LOCKTIME]		= { .type = NLA_U64 },
+};
+/* Netlink handler: update thresholds, GC interval and/or parms of the named table. */
+static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct neigh_table *tbl;
+	struct ndtmsg *ndtmsg;
+	struct nlattr *tb[NDTA_MAX+1];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+			  nl_neightbl_policy);
+	if (err < 0)
+		goto errout;
+
+	if (tb[NDTA_NAME] == NULL) {
+		err = -EINVAL;
+		goto errout;
+	}
+
+	ndtmsg = nlmsg_data(nlh);
+	read_lock(&neigh_tbl_lock);	/* held until the update is finished */
+	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+			continue;
+
+		if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0)
+			break;
+	}
+
+	if (tbl == NULL) {
+		err = -ENOENT;
+		goto errout_locked;
+	}
+
+	/*
+	 * We acquire tbl->lock to be nice to the periodic timers and
+	 * make sure they always see a consistent set of values.
+	 */
+	write_lock_bh(&tbl->lock);
+
+	if (tb[NDTA_PARMS]) {
+		struct nlattr *tbp[NDTPA_MAX+1];
+		struct neigh_parms *p;
+		int i, ifindex = 0;
+
+		err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
+				       nl_ntbl_parm_policy);
+		if (err < 0)
+			goto errout_tbl_lock;
+
+		if (tbp[NDTPA_IFINDEX])
+			ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]);
+
+		p = lookup_neigh_parms(tbl, net, ifindex);	/* ifindex 0: device-less parms */
+		if (p == NULL) {
+			err = -ENOENT;
+			goto errout_tbl_lock;
+		}
+
+		for (i = 1; i <= NDTPA_MAX; i++) {
+			if (tbp[i] == NULL)
+				continue;
+
+			switch (i) {	/* attributes without a case are ignored */
+			case NDTPA_QUEUE_LEN:
+				p->queue_len = nla_get_u32(tbp[i]);
+				break;
+			case NDTPA_PROXY_QLEN:
+				p->proxy_qlen = nla_get_u32(tbp[i]);
+				break;
+			case NDTPA_APP_PROBES:
+				p->app_probes = nla_get_u32(tbp[i]);
+				break;
+			case NDTPA_UCAST_PROBES:
+				p->ucast_probes = nla_get_u32(tbp[i]);
+				break;
+			case NDTPA_MCAST_PROBES:
+				p->mcast_probes = nla_get_u32(tbp[i]);
+				break;
+			case NDTPA_BASE_REACHABLE_TIME:
+				p->base_reachable_time = nla_get_msecs(tbp[i]);
+				break;
+			case NDTPA_GC_STALETIME:
+				p->gc_staletime = nla_get_msecs(tbp[i]);
+				break;
+			case NDTPA_DELAY_PROBE_TIME:
+				p->delay_probe_time = nla_get_msecs(tbp[i]);
+				break;
+			case NDTPA_RETRANS_TIME:
+				p->retrans_time = nla_get_msecs(tbp[i]);
+				break;
+			case NDTPA_ANYCAST_DELAY:
+				p->anycast_delay = nla_get_msecs(tbp[i]);
+				break;
+			case NDTPA_PROXY_DELAY:
+				p->proxy_delay = nla_get_msecs(tbp[i]);
+				break;
+			case NDTPA_LOCKTIME:
+				p->locktime = nla_get_msecs(tbp[i]);
+				break;
+			}
+		}
+	}
+
+	if (tb[NDTA_THRESH1])
+		tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+
+	if (tb[NDTA_THRESH2])
+		tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+
+	if (tb[NDTA_THRESH3])
+		tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+
+	if (tb[NDTA_GC_INTERVAL])
+		tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+
+	err = 0;
+
+errout_tbl_lock:
+	write_unlock_bh(&tbl->lock);
+errout_locked:
+	read_unlock(&neigh_tbl_lock);
+errout:
+	return err;
+}
+/* Netlink dump: emit each table, then its per-device parms; position kept in cb->args. */
+static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int family, tidx, nidx = 0;
+	int tbl_skip = cb->args[0];	/* tables already handled by earlier calls */
+	int neigh_skip = cb->args[1];	/* parms already handled in the resumed table */
+	struct neigh_table *tbl;
+
+	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+		struct neigh_parms *p;
+
+		if (tidx < tbl_skip || (family && tbl->family != family))
+			continue;
+
+		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
+				       cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+				       NLM_F_MULTI) <= 0)
+			break;
+
+		for (nidx = 0, p = tbl->parms.next; p; p = p->next) {
+			if (!net_eq(neigh_parms_net(p), net))
+				continue;	/* parms of other namespaces are not counted */
+
+			if (nidx < neigh_skip)
+				goto next;
+
+			if (neightbl_fill_param_info(skb, tbl, p,
+						     NETLINK_CB(cb->skb).pid,
+						     cb->nlh->nlmsg_seq,
+						     RTM_NEWNEIGHTBL,
+						     NLM_F_MULTI) <= 0)
+				goto out;
+		next:
+			nidx++;
+		}
+
+		neigh_skip = 0;	/* the skip count applies to the resumed table only */
+	}
+out:
+	read_unlock(&neigh_tbl_lock);
+	cb->args[0] = tidx;
+	cb->args[1] = nidx;
+
+	return skb->len;
+}
+/* Fill one message of @type describing @neigh into @skb; -EMSGSIZE if it does not fit. */
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+			   u32 pid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family	 = neigh->ops->family;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags	 = neigh->flags;
+	ndm->ndm_type	 = neigh->type;
+	ndm->ndm_ifindex = neigh->dev->ifindex;
+
+	NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
+
+	read_lock_bh(&neigh->lock);	/* state, lladdr and timestamps read under the lock */
+	ndm->ndm_state	 = neigh->nud_state;
+	if (neigh->nud_state & NUD_VALID) {
+		char haddr[MAX_ADDR_LEN];
+
+		neigh_ha_snapshot(haddr, neigh, neigh->dev);
+		if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+			read_unlock_bh(&neigh->lock);	/* the failure label does not unlock */
+			goto nla_put_failure;
+		}
+	}
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - neigh->used);
+	ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
+	ci.ndm_updated	 = jiffies_to_clock_t(now - neigh->updated);
+	ci.ndm_refcnt	 = atomic_read(&neigh->refcnt) - 1;
+	read_unlock_bh(&neigh->lock);
+
+	NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
+	NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+/* Report a changed @neigh to netevent notifiers and via __neigh_notify(RTM_NEWNEIGH). */
+static void neigh_update_notify(struct neighbour *neigh)
+{
+	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+	__neigh_notify(neigh, RTM_NEWNEIGH, 0);
+}
+/* Dump the entries of @tbl visible in the requester's netns, resuming at cb->args[1..2]. */
+static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
+			    struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct neighbour *n;
+	int rc, h, s_h = cb->args[1];	/* s_h: hash bucket to resume at */
+	int idx, s_idx = idx = cb->args[2];	/* s_idx: index to resume at in that bucket */
+	struct neigh_hash_table *nht;
+
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+
+	for (h = 0; h <= nht->hash_mask; h++) {
+		if (h < s_h)
+			continue;
+		if (h > s_h)
+			s_idx = 0;	/* later buckets are dumped from their start */
+		for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+		     n != NULL;
+		     n = rcu_dereference_bh(n->next)) {
+			if (!net_eq(dev_net(n->dev), net))
+				continue;	/* foreign-netns entries are not counted */
+			if (idx < s_idx)
+				goto next;
+			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) <= 0) {
+				rc = -1;
+				goto out;
+			}
+next:
+			idx++;
+		}
+	}
+	rc = skb->len;
+out:
+	rcu_read_unlock_bh();
+	cb->args[1] = h;
+	cb->args[2] = idx;
+	return rc;
+}
+/* Netlink dump: walk all tables (optionally one family), resuming at cb->args[0]. */
+static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct neigh_table *tbl;
+	int t, family, s_t;
+
+	read_lock(&neigh_tbl_lock);
+	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+	s_t = cb->args[0];
+
+	for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+		if (t < s_t || (family && tbl->family != family))
+			continue;
+		if (t > s_t)	/* new table: clear the per-table resume state */
+			memset(&cb->args[1], 0, sizeof(cb->args) -
+						sizeof(cb->args[0]));
+		if (neigh_dump_table(tbl, skb, cb) < 0)
+			break;
+	}
+	read_unlock(&neigh_tbl_lock);
+
+	cb->args[0] = t;
+	return skb->len;
+}
+
+/*
+ * Invoke cb(entry, cookie) for every entry in every hash chain of @tbl.
+ * Runs inside an RCU-bh read section with tbl->lock held for reading.
+ */
+void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
+{
+	struct neigh_hash_table *nht;
+	struct neighbour *entry;
+	int bucket;
+
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+
+	read_lock(&tbl->lock); /* avoid resizes */
+	bucket = 0;
+	while (bucket <= nht->hash_mask) {
+		entry = rcu_dereference_bh(nht->hash_buckets[bucket]);
+		while (entry != NULL) {
+			cb(entry, cookie);
+			entry = rcu_dereference_bh(entry->next);
+		}
+		bucket++;
+	}
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
+}
+EXPORT_SYMBOL(neigh_for_each);
+
+/* Call cb() on every entry of tbl; entries for which it returns non-zero are unlinked, marked dead and released.  The tbl->lock must be held as a writer and BH disabled. */
+void __neigh_for_each_release(struct neigh_table *tbl,
+			      int (*cb)(struct neighbour *))
+{
+	int chain;
+	struct neigh_hash_table *nht;
+
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	for (chain = 0; chain <= nht->hash_mask; chain++) {
+		struct neighbour *n;
+		struct neighbour __rcu **np;
+
+		np = &nht->hash_buckets[chain];	/* np: the link that currently points at n */
+		while ((n = rcu_dereference_protected(*np,
+					lockdep_is_held(&tbl->lock))) != NULL) {
+			int release;
+
+			write_lock(&n->lock);
+			release = cb(n);	/* cb runs with n->lock write-held */
+			if (release) {
+				rcu_assign_pointer(*np,
+					rcu_dereference_protected(n->next,
+						lockdep_is_held(&tbl->lock)));
+				n->dead = 1;	/* n was unlinked just above; np stays put and now points at n's successor */
+			} else
+				np = &n->next;	/* n is kept: advance to its next link */
+			write_unlock(&n->lock);
+			if (release)
+				neigh_cleanup_and_release(n);	/* called only after n->lock has been dropped */
+		}
+	}
+}
+EXPORT_SYMBOL(__neigh_for_each_release);
+
+#ifdef CONFIG_PROC_FS
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+	struct neigh_seq_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct neigh_hash_table *nht = state->nht;
+	struct neighbour *n = NULL;
+	int bucket = state->bucket;	/* NOTE(review): this initial value is overwritten by the for loop below */
+
+	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;	/* the iterator is now on neighbour entries, not proxy entries */
+	for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
+		n = rcu_dereference_bh(nht->hash_buckets[bucket]);
+
+		while (n) {
+			if (!net_eq(dev_net(n->dev), net))
+				goto next;	/* entry belongs to another namespace */
+			if (state->neigh_sub_iter) {
+				loff_t fakep = 0;
+				void *v;
+
+				v = state->neigh_sub_iter(state, n, &fakep);
+				if (!v)
+					goto next;	/* sub-iterator returned nothing for this entry */
+			}
+			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+				break;	/* no NOARP filtering requested: take n */
+			if (n->nud_state & ~NUD_NOARP)
+				break;	/* n has a state bit other than NUD_NOARP: take n */
+next:
+			n = rcu_dereference_bh(n->next);
+		}
+
+		if (n)
+			break;
+	}
+	state->bucket = bucket;	/* bucket of the returned entry, or hash_mask + 1 when nothing matched */
+
+	return n;
+}
+
+static struct neighbour *neigh_get_next(struct seq_file *seq,
+					struct neighbour *n,
+					loff_t *pos)
+{
+	struct neigh_seq_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct neigh_hash_table *nht = state->nht;
+
+	if (state->neigh_sub_iter) {
+		void *v = state->neigh_sub_iter(state, n, pos);
+		if (v)
+			return n;	/* the sub-iterator still has something for the current entry */
+	}
+	n = rcu_dereference_bh(n->next);
+
+	while (1) {
+		while (n) {
+			if (!net_eq(dev_net(n->dev), net))
+				goto next;	/* entry belongs to another namespace */
+			if (state->neigh_sub_iter) {
+				void *v = state->neigh_sub_iter(state, n, pos);
+				if (v)
+					return n;	/* returned without the *pos decrement at the bottom */
+				goto next;
+			}
+			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+				break;	/* no NOARP filtering requested: take n */
+
+			if (n->nud_state & ~NUD_NOARP)
+				break;	/* n has a state bit other than NUD_NOARP: take n */
+next:
+			n = rcu_dereference_bh(n->next);
+		}
+
+		if (n)
+			break;
+
+		if (++state->bucket > nht->hash_mask)
+			break;	/* ran past the last bucket: n is NULL here */
+
+		n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
+	}
+
+	if (n && pos)
+		--(*pos);	/* one fewer entry left to skip for neigh_get_idx() */
+	return n;
+}
+
+/*
+ * Return the neighbour entry at 1-based position *pos, or NULL when the
+ * table holds fewer matching entries.  *pos is counted down as entries are
+ * consumed, so on failure it holds the number of positions still missing.
+ */
+static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct neighbour *cur = neigh_get_first(seq);
+
+	if (cur == NULL)
+		return NULL;
+
+	--(*pos);
+	while (*pos != 0 && cur != NULL)
+		cur = neigh_get_next(seq, cur, pos);
+
+	if (*pos != 0)
+		return NULL;
+	return cur;
+}
+
+/*
+ * Position the /proc iterator on the first proxy entry that belongs to the
+ * reader's namespace.  Sets NEIGH_SEQ_IS_PNEIGH and records the bucket
+ * reached in state->bucket.  Returns NULL when there is no such entry.
+ */
+static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
+{
+	struct neigh_seq_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct neigh_table *tbl = state->tbl;
+	struct pneigh_entry *entry = NULL;
+	int slot;
+
+	state->flags |= NEIGH_SEQ_IS_PNEIGH;
+	for (slot = 0; slot <= PNEIGH_HASHMASK; slot++) {
+		for (entry = tbl->phash_buckets[slot]; entry; entry = entry->next)
+			if (net_eq(pneigh_net(entry), net))
+				break;
+		if (entry)
+			break;
+	}
+	state->bucket = slot;
+
+	return entry;
+}
+
+/*
+ * Advance the /proc iterator to the proxy entry that follows @pn.
+ *
+ * Entries that belong to another network namespace are skipped, both in the
+ * remainder of the current chain and at the head of each later bucket;
+ * state->bucket is moved along as buckets are exhausted.  When @pos is
+ * non-NULL it is decremented for the entry that is returned.  Returns NULL
+ * once the last bucket has been passed.
+ */
+static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
+					    struct pneigh_entry *pn,
+					    loff_t *pos)
+{
+	struct neigh_seq_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct neigh_table *tbl = state->tbl;
+
+	/*
+	 * The rest of the current chain needs the same namespace filter that
+	 * is applied below to every later bucket; without it an entry of a
+	 * foreign namespace could be handed back to the reader.
+	 */
+	do {
+		pn = pn->next;
+	} while (pn && !net_eq(pneigh_net(pn), net));
+
+	while (!pn) {
+		if (++state->bucket > PNEIGH_HASHMASK)
+			break;
+		pn = tbl->phash_buckets[state->bucket];
+		while (pn && !net_eq(pneigh_net(pn), net))
+			pn = pn->next;
+		if (pn)
+			break;
+	}
+
+	if (pn && pos)
+		--(*pos);
+
+	return pn;
+}
+
+/*
+ * Return the proxy entry at 1-based position *pos, or NULL when there are
+ * not that many.  *pos is counted down as entries are consumed.
+ */
+static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct pneigh_entry *cur = pneigh_get_first(seq);
+
+	if (cur == NULL)
+		return NULL;
+
+	--(*pos);
+	while (*pos != 0 && cur != NULL)
+		cur = pneigh_get_next(seq, cur, pos);
+
+	if (*pos != 0)
+		return NULL;
+	return cur;
+}
+
+/*
+ * Resolve position *pos to an entry: neighbour entries come first, and the
+ * positions left over after them index the proxy entries unless the walk
+ * was started with NEIGH_SEQ_NEIGH_ONLY.  *pos itself is not modified.
+ */
+static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
+{
+	struct neigh_seq_state *state = seq->private;
+	loff_t remaining = *pos;
+	void *found;
+
+	found = neigh_get_idx(seq, &remaining);
+	if (found)
+		return found;
+
+	if (state->flags & NEIGH_SEQ_NEIGH_ONLY)
+		return NULL;
+
+	return pneigh_get_idx(seq, &remaining);
+}
+
+/*
+ * Begin a seq_file walk over @tbl: reset the iterator state kept in
+ * seq->private, enter the RCU-bh read side and pick up the current hash
+ * table.  Position 0 yields SEQ_START_TOKEN; any other position is resolved
+ * by neigh_get_idx_any().
+ */
+void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+	__acquires(rcu_bh)
+{
+	struct neigh_seq_state *iter = seq->private;
+
+	iter->tbl = tbl;
+	iter->bucket = 0;
+	iter->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
+
+	rcu_read_lock_bh();
+	iter->nht = rcu_dereference_bh(tbl->nht);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return neigh_get_idx_any(seq, pos);
+}
+EXPORT_SYMBOL(neigh_seq_start);
+
+/*
+ * seq_file ->next: step from @v to the following entry and bump *pos.
+ * After the start token comes the first neighbour entry; when the
+ * neighbour entries run out the walk continues with the proxy entries,
+ * unless NEIGH_SEQ_NEIGH_ONLY is set.
+ */
+void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct neigh_seq_state *state;
+	void *rc;
+
+	if (v == SEQ_START_TOKEN) {
+		rc = neigh_get_first(seq);
+	} else {
+		state = seq->private;
+		if (state->flags & NEIGH_SEQ_IS_PNEIGH) {
+			BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY);
+			rc = pneigh_get_next(seq, v, NULL);
+		} else {
+			rc = neigh_get_next(seq, v, NULL);
+			if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY))
+				rc = pneigh_get_first(seq);
+		}
+	}
+
+	++(*pos);
+	return rc;
+}
+EXPORT_SYMBOL(neigh_seq_next);
+
+void neigh_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu_bh)
+{
+	rcu_read_unlock_bh();	/* pairs with the rcu_read_lock_bh() taken in neigh_seq_start() */
+}
+EXPORT_SYMBOL(neigh_seq_stop);
+
+/* statistics via seq_file */
+
+/*
+ * seq_file ->start for the statistics file.  Position 0 is the header
+ * token; position N > 0 maps to the statistics of the first possible CPU
+ * numbered N-1 or higher, and *pos is moved to that CPU's number plus one.
+ */
+static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct neigh_table *tbl = seq->private;
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	cpu = *pos-1;
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu+1;
+			return per_cpu_ptr(tbl->stats, cpu);
+		}
+		++cpu;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file ->next for the statistics file: return the statistics of the
+ * first possible CPU numbered *pos or higher and set *pos to that CPU's
+ * number plus one; NULL when no such CPU remains.
+ */
+static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct neigh_table *tbl = seq->private;
+	int cpu = *pos;
+
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu+1;
+			return per_cpu_ptr(tbl->stats, cpu);
+		}
+		++cpu;
+	}
+	return NULL;
+}
+
+static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
+{
+	/* nothing to undo: the start/next callbacks above take no lock */
+}
+
+/*
+ * seq_file ->show for the statistics file: the start token prints the
+ * column header; any other @v is a neigh_statistics record and is printed
+ * as one line of hexadecimal counters, preceded by the table's entry count.
+ */
+static int neigh_stat_seq_show(struct seq_file *seq, void *v)
+{
+	struct neigh_table *tbl;
+	struct neigh_statistics *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  allocs destroys hash_grows  lookups hits  res_failed  rcv_probes_mcast rcv_probes_ucast  periodic_gc_runs forced_gc_runs unresolved_discards\n");
+		return 0;
+	}
+
+	tbl = seq->private;
+	st = v;
+	seq_printf(seq,
+		   "%08x  %08lx %08lx %08lx  %08lx %08lx  %08lx  "
+		   "%08lx %08lx  %08lx %08lx %08lx\n",
+		   atomic_read(&tbl->entries),
+		   st->allocs, st->destroys, st->hash_grows,
+		   st->lookups, st->hits,
+		   st->res_failed,
+		   st->rcv_probes_mcast, st->rcv_probes_ucast,
+		   st->periodic_gc_runs, st->forced_gc_runs,
+		   st->unres_discards);
+
+	return 0;
+}
+
+static const struct seq_operations neigh_stat_seq_ops = {	/* iterator over the per-CPU statistics records */
+	.start	= neigh_stat_seq_start,
+	.next	= neigh_stat_seq_next,
+	.stop	= neigh_stat_seq_stop,
+	.show	= neigh_stat_seq_show,
+};
+
+/*
+ * open() for the statistics file: set up the seq_file iterator and store
+ * PDE(inode)->data in seq->private, where the seq callbacks read it back as
+ * the neigh_table.  Returns 0 on success or the error from seq_open().
+ */
+static int neigh_stat_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *sf;
+	int ret;
+
+	ret = seq_open(file, &neigh_stat_seq_ops);
+	if (ret)
+		return ret;
+
+	sf = file->private_data;
+	sf->private = PDE(inode)->data;
+	return 0;
+}
+
+static const struct file_operations neigh_stat_seq_fops = {	/* file operations of the statistics file */
+	.owner	 = THIS_MODULE,
+	.open 	 = neigh_stat_seq_open,	/* installs neigh_stat_seq_ops */
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Upper bound on the payload of one neighbour netlink message: the ndmsg
+ * header plus every attribute a message may carry, with both address
+ * attributes sized for MAX_ADDR_LEN.
+ */
+static inline size_t neigh_nlmsg_size(void)
+{
+	size_t len = NLMSG_ALIGN(sizeof(struct ndmsg));
+
+	len += nla_total_size(MAX_ADDR_LEN);	/* NDA_DST */
+	len += nla_total_size(MAX_ADDR_LEN);	/* NDA_LLADDR */
+	len += nla_total_size(sizeof(struct nda_cacheinfo));
+	len += nla_total_size(4);		/* NDA_PROBES */
+
+	return len;
+}
+
+/*
+ * Build a netlink message of @type/@flags describing @n and send it to the
+ * RTNLGRP_NEIGH group of the entry's namespace.  If the message cannot be
+ * allocated or filled, the error is recorded on the group with
+ * rtnl_set_sk_err() instead.
+ */
+static void __neigh_notify(struct neighbour *n, int type, int flags)
+{
+	struct net *net = dev_net(n->dev);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL) {
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, -ENOBUFS);
+		return;
+	}
+
+	err = neigh_fill_info(skb, n, 0, 0, type, flags);
+	if (err >= 0) {
+		rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+#ifdef CONFIG_ARPD
+void neigh_app_ns(struct neighbour *n)
+{
+	__neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST);	/* sends an RTM_GETNEIGH request for n to RTNLGRP_NEIGH listeners */
+}
+EXPORT_SYMBOL(neigh_app_ns);
+#endif /* CONFIG_ARPD */
+
+#ifdef CONFIG_SYSCTL
+
+#define NEIGH_VARS_MAX 19	/* the 18 entries below plus the empty terminator */
+
+static struct neigh_sysctl_table {
+	struct ctl_table_header *sysctl_header;
+	struct ctl_table neigh_vars[NEIGH_VARS_MAX];
+	char *dev_name;
+} neigh_sysctl_template __read_mostly = {	/* copied per neigh_parms by neigh_sysctl_register(), which fills .data by index */
+	.neigh_vars = {
+		{	/* [0] */
+			.procname	= "mcast_solicit",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [1] */
+			.procname	= "ucast_solicit",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [2] */
+			.procname	= "app_solicit",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [3] */
+			.procname	= "retrans_time",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_userhz_jiffies,
+		},
+		{	/* [4] */
+			.procname	= "base_reachable_time",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+		{	/* [5] */
+			.procname	= "delay_first_probe_time",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+		{	/* [6] */
+			.procname	= "gc_stale_time",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+		{	/* [7] */
+			.procname	= "unres_qlen",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [8] */
+			.procname	= "proxy_qlen",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [9] */
+			.procname	= "anycast_delay",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_userhz_jiffies,
+		},
+		{	/* [10] */
+			.procname	= "proxy_delay",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_userhz_jiffies,
+		},
+		{	/* [11] */
+			.procname	= "locktime",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_userhz_jiffies,
+		},
+		{	/* [12] same variable as [3], handled by the ms/jiffies handler */
+			.procname	= "retrans_time_ms",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_ms_jiffies,
+		},
+		{	/* [13] same variable as [4], handled by the ms/jiffies handler */
+			.procname	= "base_reachable_time_ms",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_ms_jiffies,
+		},
+		{	/* [14] this and the following entries are only kept when no device is given */
+			.procname	= "gc_interval",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+		{	/* [15] */
+			.procname	= "gc_thresh1",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [16] */
+			.procname	= "gc_thresh2",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{	/* [17] */
+			.procname	= "gc_thresh3",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{},	/* [18] terminator */
+	},
+};
+
+int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
+			  char *p_name, proc_handler *handler)
+{
+	struct neigh_sysctl_table *t;
+	const char *dev_name_source = NULL;
+
+#define NEIGH_CTL_PATH_ROOT	0
+#define NEIGH_CTL_PATH_PROTO	1
+#define NEIGH_CTL_PATH_NEIGH	2
+#define NEIGH_CTL_PATH_DEV	3
+
+	struct ctl_path neigh_path[] = {	/* "proto" and "default" are placeholders, replaced further down */
+		{ .procname = "net",	 },
+		{ .procname = "proto",	 },
+		{ .procname = "neigh",	 },
+		{ .procname = "default", },
+		{ },
+	};
+
+	t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);	/* private copy of the template for p */
+	if (!t)
+		goto err;
+
+	t->neigh_vars[0].data  = &p->mcast_probes;	/* indices follow the entry order in neigh_sysctl_template */
+	t->neigh_vars[1].data  = &p->ucast_probes;
+	t->neigh_vars[2].data  = &p->app_probes;
+	t->neigh_vars[3].data  = &p->retrans_time;
+	t->neigh_vars[4].data  = &p->base_reachable_time;
+	t->neigh_vars[5].data  = &p->delay_probe_time;
+	t->neigh_vars[6].data  = &p->gc_staletime;
+	t->neigh_vars[7].data  = &p->queue_len;
+	t->neigh_vars[8].data  = &p->proxy_qlen;
+	t->neigh_vars[9].data  = &p->anycast_delay;
+	t->neigh_vars[10].data = &p->proxy_delay;
+	t->neigh_vars[11].data = &p->locktime;
+	t->neigh_vars[12].data  = &p->retrans_time;	/* "retrans_time_ms": same field as [3] */
+	t->neigh_vars[13].data  = &p->base_reachable_time;	/* "base_reachable_time_ms": same field as [4] */
+
+	if (dev) {
+		dev_name_source = dev->name;
+		/* Terminate the table early */
+		memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));	/* drops gc_interval and gc_thresh1..3 for per-device tables */
+	} else {
+		dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;	/* i.e. "default" */
+		t->neigh_vars[14].data = (int *)(p + 1);	/* NOTE(review): takes four ints located directly after *p; presumably the owning table's gc settings, confirm the layout */
+		t->neigh_vars[15].data = (int *)(p + 1) + 1;
+		t->neigh_vars[16].data = (int *)(p + 1) + 2;
+		t->neigh_vars[17].data = (int *)(p + 1) + 3;
+	}
+
+
+	if (handler) {	/* caller-supplied handler replaces the default one for the four time entries */
+		/* RetransTime */
+		t->neigh_vars[3].proc_handler = handler;
+		t->neigh_vars[3].extra1 = dev;
+		/* ReachableTime */
+		t->neigh_vars[4].proc_handler = handler;
+		t->neigh_vars[4].extra1 = dev;
+		/* RetransTime (in milliseconds)*/
+		t->neigh_vars[12].proc_handler = handler;
+		t->neigh_vars[12].extra1 = dev;
+		/* ReachableTime (in milliseconds) */
+		t->neigh_vars[13].proc_handler = handler;
+		t->neigh_vars[13].extra1 = dev;
+	}
+
+	t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);	/* owned by t; freed on unregister or on the error path */
+	if (!t->dev_name)
+		goto free;
+
+	neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name;
+	neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name;
+
+	t->sysctl_header =
+		register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars);
+	if (!t->sysctl_header)
+		goto free_procname;
+
+	p->sysctl_table = t;	/* lets neigh_sysctl_unregister() find and free t */
+	return 0;
+
+free_procname:
+	kfree(t->dev_name);
+free:
+	kfree(t);
+err:
+	return -ENOBUFS;	/* every failure path reports -ENOBUFS */
+}
+EXPORT_SYMBOL(neigh_sysctl_register);
+
+/*
+ * Undo neigh_sysctl_register() for @p: detach the table from @p,
+ * unregister it and free the table together with its duplicated name.
+ * Does nothing when @p has no table attached.
+ */
+void neigh_sysctl_unregister(struct neigh_parms *p)
+{
+	struct neigh_sysctl_table *table = p->sysctl_table;
+
+	if (table == NULL)
+		return;
+
+	p->sysctl_table = NULL;
+	unregister_sysctl_table(table->sysctl_header);
+	kfree(table->dev_name);
+	kfree(table);
+}
+EXPORT_SYMBOL(neigh_sysctl_unregister);
+
+#endif	/* CONFIG_SYSCTL */
+
+static int __init neigh_init(void)
+{
+	rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL);	/* third argument only: no dump callback */
+	rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info);	/* fourth argument only: dump callback */
+
+	rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info);
+	rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL);
+
+	return 0;
+}
+
+subsys_initcall(neigh_init);	/* registers neigh_init() as a subsys-level initcall */
+
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
new file mode 100644
index 00000000..33d2a1fb
--- /dev/null
+++ b/net/core/net-sysfs.c
@@ -0,0 +1,1339 @@
+/*
+ * net-sysfs.c - network device class and attributes
+ *
+ * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/wireless.h>
+#include <linux/vmalloc.h>
+#include <net/wext.h>
+
+#include "net-sysfs.h"
+
+#ifdef CONFIG_SYSFS
+static const char fmt_hex[] = "%#x\n";	/* printf formats shared by the show helpers below */
+static const char fmt_long_hex[] = "%#lx\n";
+static const char fmt_dec[] = "%d\n";
+static const char fmt_udec[] = "%u\n";
+static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
+
+static inline int dev_isalive(const struct net_device *dev)
+{
+	return dev->reg_state <= NETREG_REGISTERED;	/* non-zero while reg_state has not moved past NETREG_REGISTERED */
+}
+
+/*
+ * Common body of the simple show callbacks: with dev_base_lock held for
+ * reading (same locking rules as GIF* ioctl's), run @format on the net
+ * device if it is still alive.  Returns what @format returned, or -EINVAL
+ * for a device that is no longer alive.
+ */
+static ssize_t netdev_show(const struct device *dev,
+			   struct device_attribute *attr, char *buf,
+			   ssize_t (*format)(const struct net_device *, char *))
+{
+	struct net_device *ndev = to_net_dev(dev);
+	ssize_t written;
+
+	read_lock(&dev_base_lock);
+	if (dev_isalive(ndev))
+		written = (*format)(ndev, buf);
+	else
+		written = -EINVAL;
+	read_unlock(&dev_base_lock);
+
+	return written;
+}
+
+/* generate format_<field>(), which prints net-><field> with format_string, and show_<field>(), which runs it through netdev_show() */
+#define NETDEVICE_SHOW(field, format_string)				\
+static ssize_t format_##field(const struct net_device *net, char *buf)	\
+{									\
+	return sprintf(buf, format_string, net->field);			\
+}									\
+static ssize_t show_##field(struct device *dev,				\
+			    struct device_attribute *attr, char *buf)	\
+{									\
+	return netdev_show(dev, attr, buf, format_##field);		\
+}
+
+
+/*
+ * Common body of the numeric store callbacks (same locking and permission
+ * rules as SIF* ioctl's): requires CAP_NET_ADMIN, parses an unsigned long
+ * from @buf and, under RTNL, hands it to @set if the device is alive.
+ *
+ * Returns @len when @set succeeds, the error from @set otherwise, -EPERM
+ * without the capability, and -EINVAL when no number could be parsed or the
+ * device is no longer alive.  The syscall is restarted if RTNL is busy.
+ */
+static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t len,
+			    int (*set)(struct net_device *, unsigned long))
+{
+	struct net_device *net = to_net_dev(dev);
+	unsigned long value;
+	char *end;
+	int ret = -EINVAL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	value = simple_strtoul(buf, &end, 0);
+	if (end == buf)
+		return ret;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (dev_isalive(net)) {
+		ret = (*set)(net, value);
+		if (ret == 0)
+			ret = len;
+	}
+	rtnl_unlock();
+
+	return ret;
+}
+
+NETDEVICE_SHOW(dev_id, fmt_hex);	/* each line defines format_<field>() and show_<field>() */
+NETDEVICE_SHOW(addr_assign_type, fmt_dec);
+NETDEVICE_SHOW(addr_len, fmt_dec);
+NETDEVICE_SHOW(iflink, fmt_dec);
+NETDEVICE_SHOW(ifindex, fmt_dec);
+NETDEVICE_SHOW(features, fmt_hex);
+NETDEVICE_SHOW(type, fmt_dec);
+NETDEVICE_SHOW(link_mode, fmt_dec);
+
+/*
+ * Print the device's hardware address (dev_addr, addr_len bytes) with
+ * dev_base_lock held for reading, i.e. the same locking rules as the
+ * GIFHWADDR ioctl.  Returns -EINVAL for a device that is no longer alive.
+ */
+static ssize_t show_address(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+	ssize_t written;
+
+	read_lock(&dev_base_lock);
+	if (dev_isalive(ndev))
+		written = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
+	else
+		written = -EINVAL;
+	read_unlock(&dev_base_lock);
+
+	return written;
+}
+
+/*
+ * Print the device's broadcast address (addr_len bytes); -EINVAL for a
+ * device that is no longer alive.
+ */
+static ssize_t show_broadcast(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct net_device *ndev = to_net_dev(dev);
+
+	if (!dev_isalive(ndev))
+		return -EINVAL;
+
+	return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+}
+
+/*
+ * Print 1 or 0 according to netif_carrier_ok(); -EINVAL while the device
+ * is not running.
+ */
+static ssize_t show_carrier(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	if (!netif_running(netdev))
+		return -EINVAL;
+
+	return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+}
+
+/*
+ * Print the speed reported by the device's ethtool settings.  Runs under
+ * RTNL (the syscall is restarted if the lock is busy); -EINVAL when the
+ * device is not running or the settings cannot be read.
+ */
+static ssize_t show_speed(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct ethtool_cmd cmd;
+	int ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (netif_running(netdev) && !dev_ethtool_get_settings(netdev, &cmd))
+		ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
+
+	rtnl_unlock();
+	return ret;
+}
+
+/*
+ * Print "full" or "half" from the duplex field of the device's ethtool
+ * settings.  Runs under RTNL (the syscall is restarted if the lock is
+ * busy); -EINVAL when the device is not running or the settings cannot be
+ * read.
+ */
+static ssize_t show_duplex(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	struct ethtool_cmd cmd;
+	int ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (netif_running(netdev) && !dev_ethtool_get_settings(netdev, &cmd)) {
+		const char *mode = cmd.duplex ? "full" : "half";
+
+		ret = sprintf(buf, "%s\n", mode);
+	}
+
+	rtnl_unlock();
+	return ret;
+}
+
+/*
+ * Print 1 or 0 according to netif_dormant(); -EINVAL while the device is
+ * not running.
+ */
+static ssize_t show_dormant(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct net_device *netdev = to_net_dev(dev);
+
+	if (!netif_running(netdev))
+		return -EINVAL;
+
+	return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+}
+
+static const char *const operstates[] = {	/* indexed by the operstate value in show_operstate() */
+	"unknown",
+	"notpresent", /* currently unused */
+	"down",
+	"lowerlayerdown",
+	"testing", /* currently unused */
+	"dormant",
+	"up"
+};
+
+/*
+ * Print the name of the device's operational state.  A device that is not
+ * running is reported as IF_OPER_DOWN regardless of its stored state.  The
+ * state is sampled with dev_base_lock held for reading.
+ */
+static ssize_t show_operstate(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	const struct net_device *netdev = to_net_dev(dev);
+	unsigned char state;
+
+	read_lock(&dev_base_lock);
+	if (netif_running(netdev))
+		state = netdev->operstate;
+	else
+		state = IF_OPER_DOWN;
+	read_unlock(&dev_base_lock);
+
+	if (state < ARRAY_SIZE(operstates))
+		return sprintf(buf, "%s\n", operstates[state]);
+
+	return -EINVAL; /* should not happen */
+}
+
+/* read-write attributes */
+NETDEVICE_SHOW(mtu, fmt_dec);
+
+static int change_mtu(struct net_device *net, unsigned long new_mtu)
+{
+	return dev_set_mtu(net, (int) new_mtu);	/* NOTE(review): the parsed unsigned long is narrowed to int here */
+}
+
+static ssize_t store_mtu(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_mtu);	/* parsing, capability check and RTNL locking are done by netdev_store() */
+}
+
+NETDEVICE_SHOW(flags, fmt_hex);
+
+static int change_flags(struct net_device *net, unsigned long new_flags)
+{
+	return dev_change_flags(net, (unsigned) new_flags);	/* NOTE(review): the parsed unsigned long is narrowed to unsigned here */
+}
+
+static ssize_t store_flags(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_flags);	/* parsing, capability check and RTNL locking are done by netdev_store() */
+}
+
+NETDEVICE_SHOW(tx_queue_len, fmt_ulong);
+
+static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
+{
+	net->tx_queue_len = new_len;	/* stored as given; no range check is applied here */
+	return 0;
+}
+
+static ssize_t store_tx_queue_len(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_tx_queue_len);	/* parsing, capability check and RTNL locking are done by netdev_store() */
+}
+
+/*
+ * Set the interface alias from @buf, without a trailing newline if one is
+ * present.  Requires CAP_NET_ADMIN and runs under RTNL (the syscall is
+ * restarted if the lock is busy).  Returns @len on success or the negative
+ * error from dev_set_alias().
+ */
+static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t len)
+{
+	struct net_device *netdev = to_net_dev(dev);
+	size_t alias_len = len;
+	ssize_t err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* ignore trailing newline */
+	if (len >  0 && buf[len - 1] == '\n')
+		alias_len = len - 1;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+	err = dev_set_alias(netdev, buf, alias_len);
+	rtnl_unlock();
+
+	if (err < 0)
+		return err;
+	return len;
+}
+
+/*
+ * Print the interface alias followed by a newline, or nothing (return 0)
+ * when no alias is set.  Runs under RTNL; the syscall is restarted if the
+ * lock is busy.
+ */
+static ssize_t show_ifalias(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	const struct net_device *netdev = to_net_dev(dev);
+	ssize_t written = 0;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (netdev->ifalias != NULL)
+		written = sprintf(buf, "%s\n", netdev->ifalias);
+
+	rtnl_unlock();
+	return written;
+}
+
+NETDEVICE_SHOW(group, fmt_dec);
+
+static int change_group(struct net_device *net, unsigned long new_group)
+{
+	dev_set_group(net, (int) new_group);	/* NOTE(review): the parsed unsigned long is narrowed to int here */
+	return 0;
+}
+
+static ssize_t store_group(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_group);	/* parsing, capability check and RTNL locking are done by netdev_store() */
+}
+
+static struct device_attribute net_class_attributes[] = {	/* columns: name, mode, show, store; the empty entry ends the list */
+	__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
+	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
+	__ATTR(dev_id, S_IRUGO, show_dev_id, NULL),
+	__ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias),
+	__ATTR(iflink, S_IRUGO, show_iflink, NULL),
+	__ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
+	__ATTR(features, S_IRUGO, show_features, NULL),
+	__ATTR(type, S_IRUGO, show_type, NULL),
+	__ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
+	__ATTR(address, S_IRUGO, show_address, NULL),
+	__ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
+	__ATTR(carrier, S_IRUGO, show_carrier, NULL),
+	__ATTR(speed, S_IRUGO, show_speed, NULL),
+	__ATTR(duplex, S_IRUGO, show_duplex, NULL),
+	__ATTR(dormant, S_IRUGO, show_dormant, NULL),
+	__ATTR(operstate, S_IRUGO, show_operstate, NULL),
+	__ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu),
+	__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
+	__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
+	       store_tx_queue_len),
+	__ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group),	/* the file is named netdev_group; the helpers use "group" */
+	{}
+};
+
+/*
+ * Show a given attribute in the statistics group.
+ *
+ * @offset is the byte offset of a u64 counter inside struct
+ * rtnl_link_stats64.  With dev_base_lock held for reading, the counters are
+ * fetched with dev_get_stats() and the selected one is printed in decimal.
+ * Returns the number of bytes written, or -EINVAL for a device that is no
+ * longer alive.
+ */
+static ssize_t netstat_show(const struct device *d,
+			    struct device_attribute *attr, char *buf,
+			    unsigned long offset)
+{
+	struct net_device *dev = to_net_dev(d);
+	ssize_t ret = -EINVAL;
+
+	/* The whole u64 must lie inside the structure, and be aligned. */
+	WARN_ON(offset + sizeof(u64) > sizeof(struct rtnl_link_stats64) ||
+			offset % sizeof(u64) != 0);
+
+	read_lock(&dev_base_lock);
+	if (dev_isalive(dev)) {
+		struct rtnl_link_stats64 temp;
+		const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+		ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
+	}
+	read_unlock(&dev_base_lock);
+	return ret;
+}
+
+/* generate show_<name>(), which prints the rtnl_link_stats64 member <name> via netstat_show(), and a read-only (S_IRUGO) dev_attr_<name> */
+#define NETSTAT_ENTRY(name)						\
+static ssize_t show_##name(struct device *d,				\
+			   struct device_attribute *attr, char *buf) 	\
+{									\
+	return netstat_show(d, attr, buf,				\
+			    offsetof(struct rtnl_link_stats64, name));	\
+}									\
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+NETSTAT_ENTRY(rx_packets);	/* one show_<name>() and dev_attr_<name> per rtnl_link_stats64 member */
+NETSTAT_ENTRY(tx_packets);
+NETSTAT_ENTRY(rx_bytes);
+NETSTAT_ENTRY(tx_bytes);
+NETSTAT_ENTRY(rx_errors);
+NETSTAT_ENTRY(tx_errors);
+NETSTAT_ENTRY(rx_dropped);
+NETSTAT_ENTRY(tx_dropped);
+NETSTAT_ENTRY(multicast);
+NETSTAT_ENTRY(collisions);
+NETSTAT_ENTRY(rx_length_errors);
+NETSTAT_ENTRY(rx_over_errors);
+NETSTAT_ENTRY(rx_crc_errors);
+NETSTAT_ENTRY(rx_frame_errors);
+NETSTAT_ENTRY(rx_fifo_errors);
+NETSTAT_ENTRY(rx_missed_errors);
+NETSTAT_ENTRY(tx_aborted_errors);
+NETSTAT_ENTRY(tx_carrier_errors);
+NETSTAT_ENTRY(tx_fifo_errors);
+NETSTAT_ENTRY(tx_heartbeat_errors);
+NETSTAT_ENTRY(tx_window_errors);
+NETSTAT_ENTRY(rx_compressed);
+NETSTAT_ENTRY(tx_compressed);
+
+static struct attribute *netstat_attrs[] = {	/* members of netstat_group; NULL-terminated */
+	&dev_attr_rx_packets.attr,
+	&dev_attr_tx_packets.attr,
+	&dev_attr_rx_bytes.attr,
+	&dev_attr_tx_bytes.attr,
+	&dev_attr_rx_errors.attr,
+	&dev_attr_tx_errors.attr,
+	&dev_attr_rx_dropped.attr,
+	&dev_attr_tx_dropped.attr,
+	&dev_attr_multicast.attr,
+	&dev_attr_collisions.attr,
+	&dev_attr_rx_length_errors.attr,
+	&dev_attr_rx_over_errors.attr,
+	&dev_attr_rx_crc_errors.attr,
+	&dev_attr_rx_frame_errors.attr,
+	&dev_attr_rx_fifo_errors.attr,
+	&dev_attr_rx_missed_errors.attr,
+	&dev_attr_tx_aborted_errors.attr,
+	&dev_attr_tx_carrier_errors.attr,
+	&dev_attr_tx_fifo_errors.attr,
+	&dev_attr_tx_heartbeat_errors.attr,
+	&dev_attr_tx_window_errors.attr,
+	&dev_attr_rx_compressed.attr,
+	&dev_attr_tx_compressed.attr,
+	NULL
+};
+
+
+static struct attribute_group netstat_group = {	/* groups the counters above under the name "statistics" */
+	.name  = "statistics",
+	.attrs  = netstat_attrs,
+};
+
+#ifdef CONFIG_WIRELESS_EXT_SYSFS
+/*
+ * Helper that does all the locking etc. for wireless stats: under RTNL,
+ * fetch the wireless statistics of a device that is still alive and pass
+ * them to @format.  Returns what @format returned, or -EINVAL when the
+ * device is gone or has no statistics.  The syscall is restarted if RTNL
+ * is busy.
+ */
+static ssize_t wireless_show(struct device *d, char *buf,
+			     ssize_t (*format)(const struct iw_statistics *,
+					       char *))
+{
+	struct net_device *dev = to_net_dev(d);
+	const struct iw_statistics *iw = NULL;
+	ssize_t ret = -EINVAL;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (dev_isalive(dev))
+		iw = get_wireless_stats(dev);
+	if (iw)
+		ret = (*format)(iw, buf);
+
+	rtnl_unlock();
+
+	return ret;
+}
+
+/* show function template for wireless fields: generates format_iw_<name>(), show_iw_<name>() (via wireless_show()) and a read-only dev_attr_<name> */
+#define WIRELESS_SHOW(name, field, format_string)			\
+static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \
+{									\
+	return sprintf(buf, format_string, iw->field);			\
+}									\
+static ssize_t show_iw_##name(struct device *d,				\
+			      struct device_attribute *attr, char *buf)	\
+{									\
+	return wireless_show(d, buf, format_iw_##name);			\
+}									\
+static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL)
+
+WIRELESS_SHOW(status, status, fmt_hex);	/* arguments: attribute name, iw_statistics member, printf format */
+WIRELESS_SHOW(link, qual.qual, fmt_dec);
+WIRELESS_SHOW(level, qual.level, fmt_dec);
+WIRELESS_SHOW(noise, qual.noise, fmt_dec);
+WIRELESS_SHOW(nwid, discard.nwid, fmt_dec);
+WIRELESS_SHOW(crypt, discard.code, fmt_dec);	/* attribute "crypt" reads discard.code */
+WIRELESS_SHOW(fragment, discard.fragment, fmt_dec);
+WIRELESS_SHOW(misc, discard.misc, fmt_dec);
+WIRELESS_SHOW(retries, discard.retries, fmt_dec);
+WIRELESS_SHOW(beacon, miss.beacon, fmt_dec);
+
+static struct attribute *wireless_attrs[] = {	/* members of wireless_group; NULL-terminated */
+	&dev_attr_status.attr,
+	&dev_attr_link.attr,
+	&dev_attr_level.attr,
+	&dev_attr_noise.attr,
+	&dev_attr_nwid.attr,
+	&dev_attr_crypt.attr,
+	&dev_attr_fragment.attr,
+	&dev_attr_retries.attr,
+	&dev_attr_misc.attr,
+	&dev_attr_beacon.attr,
+	NULL
+};
+
+static struct attribute_group wireless_group = {	/* groups the attributes above under the name "wireless" */
+	.name = "wireless",
+	.attrs = wireless_attrs,
+};
+#endif
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_RPS
+/*
+ * RX queue sysfs structures and functions.
+ */
+struct rx_queue_attribute {	/* attribute whose callbacks receive the netdev_rx_queue directly */
+	struct attribute attr;
+	ssize_t (*show)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, char *buf);
+	ssize_t (*store)(struct netdev_rx_queue *queue,
+	    struct rx_queue_attribute *attr, const char *buf, size_t len);
+};
+#define to_rx_queue_attr(_attr) container_of(_attr,		\
+    struct rx_queue_attribute, attr)
+
+#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)	/* embedded kobject -> enclosing rx queue */
+
+/*
+ * sysfs ->show for RX queue kobjects: recover the rx_queue_attribute and
+ * the queue from @attr and @kobj and forward to the attribute's own show
+ * callback; -EIO when the attribute has none.
+ */
+static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
+				  char *buf)
+{
+	struct rx_queue_attribute *rx_attr = to_rx_queue_attr(attr);
+	struct netdev_rx_queue *rxq = to_rx_queue(kobj);
+
+	if (rx_attr->show == NULL)
+		return -EIO;
+
+	return rx_attr->show(rxq, rx_attr, buf);
+}
+
+/*
+ * sysfs ->store for RX queue kobjects: recover the rx_queue_attribute and
+ * the queue from @attr and @kobj and forward to the attribute's own store
+ * callback; -EIO when the attribute has none.
+ */
+static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
+				   const char *buf, size_t count)
+{
+	struct rx_queue_attribute *rx_attr = to_rx_queue_attr(attr);
+	struct netdev_rx_queue *rxq = to_rx_queue(kobj);
+
+	if (rx_attr->store == NULL)
+		return -EIO;
+
+	return rx_attr->store(rxq, rx_attr, buf, count);
+}
+
+static const struct sysfs_ops rx_queue_sysfs_ops = {	/* dispatches to the per-attribute show/store callbacks */
+	.show = rx_queue_attr_show,
+	.store = rx_queue_attr_store,
+};
+
+/*
+ * Print the CPUs listed in the queue's RPS map as a CPU mask followed by a
+ * newline; a queue without a map prints an empty mask.  Returns the number
+ * of bytes written, -ENOMEM when the temporary mask cannot be allocated,
+ * or -EINVAL when fewer than 3 bytes of the page remain after the mask.
+ */
+static ssize_t show_rps_map(struct netdev_rx_queue *queue,
+			    struct rx_queue_attribute *attribute, char *buf)
+{
+	struct rps_map *map;
+	cpumask_var_t mask;
+	size_t len;
+	int i;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rcu_read_lock();
+	map = rcu_dereference(queue->rps_map);
+	if (map) {
+		for (i = 0; i < map->len; i++)
+			cpumask_set_cpu(map->cpus[i], mask);
+	}
+	len = cpumask_scnprintf(buf, PAGE_SIZE, mask);
+	rcu_read_unlock();
+
+	free_cpumask_var(mask);
+
+	if (PAGE_SIZE - len < 3)
+		return -EINVAL;
+
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+		      struct rx_queue_attribute *attribute,
+		      const char *buf, size_t len)
+{
+	struct rps_map *old_map, *map;
+	cpumask_var_t mask;
+	int err, cpu, i;
+	static DEFINE_SPINLOCK(rps_map_lock);	/* one lock shared by all queues; serialises map replacement */
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);	/* the written text is parsed as a CPU bitmap */
+	if (err) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	map = kzalloc(max_t(unsigned,
+	    RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
+	    GFP_KERNEL);	/* sized for every CPU in the mask, never less than L1_CACHE_BYTES */
+	if (!map) {
+		free_cpumask_var(mask);
+		return -ENOMEM;
+	}
+
+	i = 0;
+	for_each_cpu_and(cpu, mask, cpu_online_mask)
+		map->cpus[i++] = cpu;	/* only CPUs that are also online are recorded */
+
+	if (i)
+		map->len = i;
+	else {
+		kfree(map);
+		map = NULL;	/* no usable CPU: a NULL map is installed instead */
+	}
+
+	spin_lock(&rps_map_lock);
+	old_map = rcu_dereference_protected(queue->rps_map,
+					    lockdep_is_held(&rps_map_lock));
+	rcu_assign_pointer(queue->rps_map, map);	/* publish the new map to RCU readers */
+	spin_unlock(&rps_map_lock);
+
+	if (old_map)
+		kfree_rcu(old_map, rcu);	/* the previous map is freed via kfree_rcu() */
+
+	free_cpumask_var(mask);
+	return len;
+}
+
+/*
+ * sysfs show handler for rps_flow_cnt: prints the number of entries in the
+ * queue's flow table (mask + 1), or 0 when no table is installed.
+ */
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+					   struct rx_queue_attribute *attr,
+					   char *buf)
+{
+	struct rps_dev_flow_table *table;
+	unsigned int entries;
+
+	rcu_read_lock();
+	table = rcu_dereference(queue->rps_flow_table);
+	entries = table ? table->mask + 1 : 0;
+	rcu_read_unlock();
+
+	return sprintf(buf, "%u\n", entries);
+}
+
+/* Work item body: vfree() the flow table that embeds this work_struct. */
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+	vfree(container_of(work, struct rps_dev_flow_table, free_work));
+}
+
+/*
+ * RCU callback for a retired flow table.  The table is not freed here;
+ * the vfree() is handed to a work item instead (presumably because
+ * vfree() must not be called from RCU callback context).
+ */
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+	struct rps_dev_flow_table *table;
+
+	table = container_of(rcu, struct rps_dev_flow_table, rcu);
+
+	INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+	schedule_work(&table->free_work);
+}
+
+/*
+ * sysfs store handler for rps_flow_cnt: resize (or, for 0, remove) the
+ * queue's RPS flow table.
+ *
+ * The requested count is rounded up to a power of two.  Every entry of a
+ * new table starts out as RPS_NO_CPU.  The previous table, if any, is
+ * released after an RCU grace period.
+ *
+ * Requires CAP_NET_ADMIN.  Returns len on success, -EPERM, -EINVAL for
+ * unparsable or too-large input, or -ENOMEM.
+ */
+static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+				     struct rx_queue_attribute *attr,
+				     const char *buf, size_t len)
+{
+	unsigned long count;	/* full width of simple_strtoul(): no truncation */
+	char *endp;
+	struct rps_dev_flow_table *table, *old_table;
+	static DEFINE_SPINLOCK(rps_dev_flow_lock);
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	count = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+
+	if (count) {
+		unsigned long i;
+
+		if (count > 1<<30) {
+			/* Enforce a limit to prevent overflow */
+			return -EINVAL;
+		}
+		count = roundup_pow_of_two(count);
+
+		/*
+		 * The limit above only protects the rounding.  The byte size
+		 * handed to vmalloc() can still wrap (e.g. on 32-bit), which
+		 * would yield a short table that the loop below overruns.
+		 */
+		if (count > (ULONG_MAX - sizeof(*table)) /
+			    sizeof(table->flows[0]))
+			return -EINVAL;
+
+		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+		if (!table)
+			return -ENOMEM;
+
+		table->mask = count - 1;
+		for (i = 0; i < count; i++)
+			table->flows[i].cpu = RPS_NO_CPU;
+	} else
+		table = NULL;
+
+	/* Publish the new table; readers dereference it under RCU. */
+	spin_lock(&rps_dev_flow_lock);
+	old_table = rcu_dereference_protected(queue->rps_flow_table,
+					      lockdep_is_held(&rps_dev_flow_lock));
+	rcu_assign_pointer(queue->rps_flow_table, table);
+	spin_unlock(&rps_dev_flow_lock);
+
+	if (old_table)
+		call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+	return len;
+}
+
+/* "rps_cpus": world-readable, owner-writable CPU mask for the queue. */
+static struct rx_queue_attribute rps_cpus_attribute =
+	__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+
+/* "rps_flow_cnt": world-readable, owner-writable flow table size. */
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+	__ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+	    show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+
+/* NULL-terminated attribute list referenced by rx_queue_ktype. */
+static struct attribute *rx_queue_default_attrs[] = {
+	&rps_cpus_attribute.attr,
+	&rps_dev_flow_table_cnt_attribute.attr,
+	NULL
+};
+
+/*
+ * kobject release callback for an RX queue: detach and free (after an RCU
+ * grace period) any RPS map and flow table still attached, zero the
+ * kobject, and drop one reference on the owning net_device.
+ */
+static void rx_queue_release(struct kobject *kobj)
+{
+	struct netdev_rx_queue *queue = to_rx_queue(kobj);
+	struct rps_dev_flow_table *table;
+	struct rps_map *cpu_map;
+
+	cpu_map = rcu_dereference_raw(queue->rps_map);
+	table = rcu_dereference_raw(queue->rps_flow_table);
+
+	if (cpu_map) {
+		RCU_INIT_POINTER(queue->rps_map, NULL);
+		kfree_rcu(cpu_map, rcu);
+	}
+
+	if (table) {
+		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
+		call_rcu(&table->rcu, rps_dev_flow_table_release);
+	}
+
+	/* Zero the embedded kobject before giving the device back. */
+	memset(kobj, 0, sizeof(*kobj));
+	dev_put(queue->dev);
+}
+
+/* kobject type shared by all rx-<n> queue kobjects. */
+static struct kobj_type rx_queue_ktype = {
+	.sysfs_ops = &rx_queue_sysfs_ops,
+	.release = rx_queue_release,
+	.default_attrs = rx_queue_default_attrs,
+};
+
+/*
+ * Create and announce the "rx-<index>" kobject for one RX queue of @net.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().  On
+ * failure the kobject has already been put, which runs rx_queue_release().
+ */
+static int rx_queue_add_kobject(struct net_device *net, int index)
+{
+	struct netdev_rx_queue *queue = net->_rx + index;
+	struct kobject *kobj = &queue->kobj;
+	int error = 0;
+
+	/*
+	 * Take the device reference up front.  rx_queue_release() drops it
+	 * unconditionally, and the kobject_put() on the failure path below
+	 * invokes that release; taking the reference only on success would
+	 * unbalance the device refcount.
+	 */
+	dev_hold(queue->dev);
+
+	kobj->kset = net->queues_kset;
+	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
+	    "rx-%u", index);
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
+
+	kobject_uevent(kobj, KOBJ_ADD);
+
+	return error;
+}
+#endif /* CONFIG_RPS */
+
+/*
+ * Grow or shrink the set of RX queue kobjects of @net from @old_num to
+ * @new_num.  If adding a queue fails, the queues added by this call are
+ * removed again and the error is returned.  Without CONFIG_RPS this is a
+ * no-op that returns 0.
+ */
+int
+net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+{
+#ifdef CONFIG_RPS
+	int idx = old_num;
+	int error = 0;
+
+	/* Add queues [old_num, new_num). */
+	while (idx < new_num) {
+		error = rx_queue_add_kobject(net, idx);
+		if (error) {
+			/* Roll back to the original count below. */
+			new_num = old_num;
+			break;
+		}
+		idx++;
+	}
+
+	/* Drop every queue at or above the (possibly reset) target. */
+	for (idx--; idx >= new_num; idx--)
+		kobject_put(&net->_rx[idx].kobj);
+
+	return error;
+#else
+	return 0;
+#endif
+}
+
+#ifdef CONFIG_XPS
+/*
+ * netdev_queue sysfs structures and functions.
+ */
+/*
+ * Per-TX-queue sysfs attribute: a generic struct attribute plus typed
+ * show/store callbacks that are handed the owning netdev_queue.
+ */
+struct netdev_queue_attribute {
+	struct attribute attr;
+	/* Format the value into buf; may be NULL, in which case reads get -EIO. */
+	ssize_t (*show)(struct netdev_queue *queue,
+	    struct netdev_queue_attribute *attr, char *buf);
+	/* Parse len bytes from buf; may be NULL, in which case writes get -EIO. */
+	ssize_t (*store)(struct netdev_queue *queue,
+	    struct netdev_queue_attribute *attr, const char *buf, size_t len);
+};
+/* Recover the netdev_queue_attribute that embeds the given struct attribute. */
+#define to_netdev_queue_attr(_attr) container_of(_attr,		\
+    struct netdev_queue_attribute, attr)
+
+/* Recover the netdev_queue that embeds the given kobject. */
+#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)
+
+/*
+ * Generic sysfs ->show() for TX queue attributes: forward to the typed
+ * show handler of the attribute, or return -EIO if it has none.
+ */
+static ssize_t netdev_queue_attr_show(struct kobject *kobj,
+				      struct attribute *attr, char *buf)
+{
+	struct netdev_queue_attribute *txq_attr = to_netdev_queue_attr(attr);
+
+	if (txq_attr->show)
+		return txq_attr->show(to_netdev_queue(kobj), txq_attr, buf);
+
+	return -EIO;
+}
+
+/*
+ * Generic sysfs ->store() for TX queue attributes: forward to the typed
+ * store handler of the attribute, or return -EIO if it has none.
+ */
+static ssize_t netdev_queue_attr_store(struct kobject *kobj,
+				       struct attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct netdev_queue_attribute *txq_attr = to_netdev_queue_attr(attr);
+
+	if (txq_attr->store)
+		return txq_attr->store(to_netdev_queue(kobj), txq_attr,
+				       buf, count);
+
+	return -EIO;
+}
+
+/* sysfs operations for TX queue kobjects; both entries dispatch by attribute. */
+static const struct sysfs_ops netdev_queue_sysfs_ops = {
+	.show = netdev_queue_attr_show,
+	.store = netdev_queue_attr_store,
+};
+
+/*
+ * Return the position of @queue within its device's _tx[] array.
+ * BUGs if the queue is not one of the device's TX queues.
+ */
+static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
+{
+	struct net_device *dev = queue->dev;
+	int idx = 0;
+
+	while (idx < dev->num_tx_queues && queue != &dev->_tx[idx])
+		idx++;
+
+	BUG_ON(idx >= dev->num_tx_queues);
+
+	return idx;
+}
+
+
+/*
+ * sysfs show handler for xps_cpus: prints, as a cpumask followed by a
+ * newline, every possible CPU whose XPS map contains this queue's index.
+ *
+ * Returns the number of bytes written, -ENOMEM if the temporary cpumask
+ * cannot be allocated, or -EINVAL if fewer than three bytes of the page
+ * remain after the mask has been formatted.
+ */
+static ssize_t show_xps_map(struct netdev_queue *queue,
+			    struct netdev_queue_attribute *attribute, char *buf)
+{
+	struct net_device *dev = queue->dev;
+	struct xps_dev_maps *dev_maps;
+	unsigned long index;
+	cpumask_var_t mask;
+	size_t len = 0;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	index = get_netdev_queue_index(queue);
+
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_maps);
+	if (dev_maps) {
+		for_each_possible_cpu(cpu) {
+			struct xps_map *map =
+			    rcu_dereference(dev_maps->cpu_map[cpu]);
+			int pos;
+
+			if (!map)
+				continue;
+
+			/* Mark the CPU on the first match for this queue. */
+			for (pos = 0; pos < map->len; pos++) {
+				if (map->queues[pos] != index)
+					continue;
+				cpumask_set_cpu(cpu, mask);
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
+	free_cpumask_var(mask);
+
+	if (PAGE_SIZE - len < 3)
+		return -EINVAL;
+
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+/* Held by the update-side code in this file that modifies dev->xps_maps. */
+static DEFINE_MUTEX(xps_map_mutex);
+/* Dereference an RCU-protected XPS pointer while holding xps_map_mutex. */
+#define xmap_dereference(P)		\
+	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+/*
+ * sysfs store handler for xps_cpus: parse a CPU bitmap from buf and make
+ * this TX queue a member of the XPS map of exactly those requested CPUs
+ * that are online.
+ *
+ * A fresh xps_dev_maps table is built under xps_map_mutex.  For each
+ * possible CPU it either reuses the current per-CPU map (possibly edited
+ * in place), points to a newly allocated larger map, or is NULL when the
+ * CPU no longer has any queue.  The new table then replaces dev->xps_maps
+ * and superseded objects are freed after an RCU grace period.
+ *
+ * Requires CAP_NET_ADMIN.  Returns len on success, -EPERM, -ENOMEM, or
+ * the error reported by bitmap_parse().
+ */
+static ssize_t store_xps_map(struct netdev_queue *queue,
+		      struct netdev_queue_attribute *attribute,
+		      const char *buf, size_t len)
+{
+	struct net_device *dev = queue->dev;
+	cpumask_var_t mask;
+	int err, i, cpu, pos, map_len, alloc_len, need_set;
+	unsigned long index;
+	struct xps_map *map, *new_map;
+	struct xps_dev_maps *dev_maps, *new_dev_maps;
+	int nonempty = 0;
+	int numa_node = -2;	/* -2: no CPU seen yet, -1: CPUs span nodes */
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	index = get_netdev_queue_index(queue);
+
+	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+	if (err) {
+		free_cpumask_var(mask);
+		return err;
+	}
+
+	new_dev_maps = kzalloc(max_t(unsigned,
+	    XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL);
+	if (!new_dev_maps) {
+		free_cpumask_var(mask);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&xps_map_mutex);
+
+	dev_maps = xmap_dereference(dev->xps_maps);
+
+	for_each_possible_cpu(cpu) {
+		map = dev_maps ?
+			xmap_dereference(dev_maps->cpu_map[cpu]) : NULL;
+		new_map = map;
+		if (map) {
+			/* pos == map_len afterwards means "queue not present" */
+			for (pos = 0; pos < map->len; pos++)
+				if (map->queues[pos] == index)
+					break;
+			map_len = map->len;
+			alloc_len = map->alloc_len;
+		} else
+			pos = map_len = alloc_len = 0;
+
+		need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
+#ifdef CONFIG_NUMA
+		if (need_set) {
+			if (numa_node == -2)
+				numa_node = cpu_to_node(cpu);
+			else if (numa_node != cpu_to_node(cpu))
+				numa_node = -1;
+		}
+#endif
+		if (need_set && pos >= map_len) {
+			/* Need to add queue to this CPU's map */
+			if (map_len >= alloc_len) {
+				/* Map is full (or absent): allocate a bigger copy */
+				alloc_len = alloc_len ?
+				    2 * alloc_len : XPS_MIN_MAP_ALLOC;
+				new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len),
+						       GFP_KERNEL,
+						       cpu_to_node(cpu));
+				if (!new_map)
+					goto error;
+				new_map->alloc_len = alloc_len;
+				for (i = 0; i < map_len; i++)
+					new_map->queues[i] = map->queues[i];
+				new_map->len = map_len;
+			}
+			new_map->queues[new_map->len++] = index;
+		} else if (!need_set && pos < map_len) {
+			/* Need to remove queue from this CPU's map */
+			if (map_len > 1)
+				new_map->queues[pos] =
+				    new_map->queues[--new_map->len];
+			else
+				new_map = NULL;
+		}
+		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map);
+	}
+
+	/* Cleanup old maps */
+	for_each_possible_cpu(cpu) {
+		map = dev_maps ?
+			xmap_dereference(dev_maps->cpu_map[cpu]) : NULL;
+		if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map)
+			kfree_rcu(map, rcu);
+		if (new_dev_maps->cpu_map[cpu])
+			nonempty = 1;
+	}
+
+	if (nonempty)
+		rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+	else {
+		kfree(new_dev_maps);
+		rcu_assign_pointer(dev->xps_maps, NULL);
+	}
+
+	if (dev_maps)
+		kfree_rcu(dev_maps, rcu);
+
+	netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node :
+					    NUMA_NO_NODE);
+
+	mutex_unlock(&xps_map_mutex);
+
+	free_cpumask_var(mask);
+	return len;
+
+error:
+	/*
+	 * Allocation failed part-way.  Free only the per-CPU maps allocated
+	 * by this call: an entry of new_dev_maps that equals the entry of
+	 * the current table is still published through dev->xps_maps and
+	 * must stay alive.  This runs under the mutex because it reads
+	 * dev_maps.  In-place edits already made to current maps are not
+	 * rolled back.
+	 */
+	for_each_possible_cpu(i) {
+		new_map = rcu_dereference_protected(
+				new_dev_maps->cpu_map[i],
+				1);
+		map = dev_maps ?
+			xmap_dereference(dev_maps->cpu_map[i]) : NULL;
+		if (new_map != map)
+			kfree(new_map);
+	}
+
+	mutex_unlock(&xps_map_mutex);
+
+	kfree(new_dev_maps);
+	free_cpumask_var(mask);
+	return -ENOMEM;
+}
+
+/* "xps_cpus": world-readable, owner-writable CPU mask for the TX queue. */
+static struct netdev_queue_attribute xps_cpus_attribute =
+    __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+
+/* NULL-terminated attribute list referenced by netdev_queue_ktype. */
+static struct attribute *netdev_queue_default_attrs[] = {
+	&xps_cpus_attribute.attr,
+	NULL
+};
+
+/*
+ * kobject release callback for a TX queue.  Removes the queue's index from
+ * every per-CPU XPS map of the device, frees maps (and the whole table)
+ * that become empty, then zeroes the kobject and drops one reference on
+ * the owning net_device.
+ */
+static void netdev_queue_release(struct kobject *kobj)
+{
+	struct netdev_queue *queue = to_netdev_queue(kobj);
+	struct net_device *dev = queue->dev;
+	struct xps_dev_maps *dev_maps;
+	struct xps_map *map;
+	unsigned long index;
+	int i, pos, nonempty = 0;
+
+	index = get_netdev_queue_index(queue);
+
+	mutex_lock(&xps_map_mutex);
+	dev_maps = xmap_dereference(dev->xps_maps);
+
+	if (dev_maps) {
+		for_each_possible_cpu(i) {
+			map = xmap_dereference(dev_maps->cpu_map[i]);
+			if (!map)
+				continue;
+
+			/* Look for this queue in the CPU's map. */
+			for (pos = 0; pos < map->len; pos++)
+				if (map->queues[pos] == index)
+					break;
+
+			if (pos < map->len) {
+				if (map->len > 1)
+					/* Overwrite the slot with the last entry. */
+					map->queues[pos] =
+					    map->queues[--map->len];
+				else {
+					/* Last entry: retire the whole map. */
+					RCU_INIT_POINTER(dev_maps->cpu_map[i],
+					    NULL);
+					kfree_rcu(map, rcu);
+					map = NULL;
+				}
+			}
+			if (map)
+				nonempty = 1;
+		}
+
+		/* No CPU has a map left: retire the device-level table too. */
+		if (!nonempty) {
+			RCU_INIT_POINTER(dev->xps_maps, NULL);
+			kfree_rcu(dev_maps, rcu);
+		}
+	}
+
+	mutex_unlock(&xps_map_mutex);
+
+	memset(kobj, 0, sizeof(*kobj));
+	dev_put(queue->dev);
+}
+
+/* kobject type shared by all tx-<n> queue kobjects. */
+static struct kobj_type netdev_queue_ktype = {
+	.sysfs_ops = &netdev_queue_sysfs_ops,
+	.release = netdev_queue_release,
+	.default_attrs = netdev_queue_default_attrs,
+};
+
+/*
+ * Create and announce the "tx-<index>" kobject for one TX queue of @net.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().  On
+ * failure the kobject has already been put, which runs
+ * netdev_queue_release().
+ */
+static int netdev_queue_add_kobject(struct net_device *net, int index)
+{
+	struct netdev_queue *queue = net->_tx + index;
+	struct kobject *kobj = &queue->kobj;
+	int error = 0;
+
+	/*
+	 * Take the device reference up front.  netdev_queue_release() drops
+	 * it unconditionally, and the kobject_put() on the failure path
+	 * below invokes that release; taking the reference only on success
+	 * would unbalance the device refcount.
+	 */
+	dev_hold(queue->dev);
+
+	kobj->kset = net->queues_kset;
+	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
+	    "tx-%u", index);
+	if (error) {
+		kobject_put(kobj);
+		return error;
+	}
+
+	kobject_uevent(kobj, KOBJ_ADD);
+
+	return error;
+}
+#endif /* CONFIG_XPS */
+
+/*
+ * Grow or shrink the set of TX queue kobjects of @net from @old_num to
+ * @new_num.  If adding a queue fails, the queues added by this call are
+ * removed again and the error is returned.  Without CONFIG_XPS this is a
+ * no-op that returns 0.
+ */
+int
+netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+{
+#ifdef CONFIG_XPS
+	int idx = old_num;
+	int error = 0;
+
+	/* Add queues [old_num, new_num). */
+	while (idx < new_num) {
+		error = netdev_queue_add_kobject(net, idx);
+		if (error) {
+			/* Roll back to the original count below. */
+			new_num = old_num;
+			break;
+		}
+		idx++;
+	}
+
+	/* Drop every queue at or above the (possibly reset) target. */
+	for (idx--; idx >= new_num; idx--)
+		kobject_put(&net->_tx[idx].kobj);
+
+	return error;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Create the "queues" kset under the device and populate it with one
+ * kobject per real RX queue (CONFIG_RPS) and per real TX queue.
+ *
+ * Returns 0 on success.  On failure everything created here, including
+ * the kset itself, is torn down again and the error is returned.
+ */
+static int register_queue_kobjects(struct net_device *net)
+{
+	int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
+
+#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+	net->queues_kset = kset_create_and_add("queues",
+	    NULL, &net->dev.kobj);
+	if (!net->queues_kset)
+		return -ENOMEM;
+#endif
+
+#ifdef CONFIG_RPS
+	real_rx = net->real_num_rx_queues;
+#endif
+	real_tx = net->real_num_tx_queues;
+
+	error = net_rx_queue_update_kobjects(net, 0, real_rx);
+	if (error)
+		goto error;
+	rxq = real_rx;
+
+	error = netdev_queue_update_kobjects(net, 0, real_tx);
+	if (error)
+		goto error;
+	txq = real_tx;
+
+	return 0;
+
+error:
+	/* txq/rxq hold how many queues were fully registered so far. */
+	netdev_queue_update_kobjects(net, txq, 0);
+	net_rx_queue_update_kobjects(net, rxq, 0);
+#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+	/* The caller never reaches remove_queue_kobjects() on this path. */
+	kset_unregister(net->queues_kset);
+#endif
+	return error;
+}
+
+/*
+ * Remove all RX and TX queue kobjects of @net and, when queue sysfs
+ * support is compiled in, unregister the "queues" kset.
+ */
+static void remove_queue_kobjects(struct net_device *net)
+{
+	int rxq_cnt = 0;
+	int txq_cnt = net->real_num_tx_queues;
+
+#ifdef CONFIG_RPS
+	rxq_cnt = net->real_num_rx_queues;
+#endif
+
+	net_rx_queue_update_kobjects(net, rxq_cnt, 0);
+	netdev_queue_update_kobjects(net, txq_cnt, 0);
+#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+	kset_unregister(net->queues_kset);
+#endif
+}
+
+/*
+ * Return the calling task's network namespace.  With CONFIG_NET_NS a
+ * passive reference is taken on it; net_drop_ns() is the function in
+ * this patch that drops such a reference.
+ */
+static void *net_grab_current_ns(void)
+{
+	struct net *cur_ns = current->nsproxy->net_ns;
+
+#ifdef CONFIG_NET_NS
+	if (cur_ns != NULL)
+		atomic_inc(&cur_ns->passive);
+#endif
+	return cur_ns;
+}
+
+/* Namespace-type callback: the initial network namespace is init_net. */
+static const void *net_initial_ns(void)
+{
+	return &init_net;
+}
+
+/* Namespace-type callback: the network namespace the socket belongs to. */
+static const void *net_netlink_ns(struct sock *sk)
+{
+	return sock_net(sk);
+}
+
+/*
+ * Network-namespace callbacks for namespace-aware kobjects.  Used as the
+ * ns_type of net_class and registered in netdev_kobject_init().
+ */
+struct kobj_ns_type_operations net_ns_type_operations = {
+	.type = KOBJ_NS_TYPE_NET,
+	.grab_current_ns = net_grab_current_ns,
+	.netlink_ns = net_netlink_ns,
+	.initial_ns = net_initial_ns,
+	.drop_ns = net_drop_ns,
+};
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
+
+#ifdef CONFIG_HOTPLUG
+/*
+ * Class uevent callback: add INTERFACE=<name> and IFINDEX=<ifindex> to
+ * the event environment.  Returns 0 or the first add_uevent_var() error.
+ */
+static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
+{
+	struct net_device *dev = to_net_dev(d);
+	int err;
+
+	/* pass interface to uevent. */
+	err = add_uevent_var(env, "INTERFACE=%s", dev->name);
+	if (err)
+		return err;
+
+	/* pass ifindex to uevent.
+	 * ifindex is useful as it won't change (interface name may change)
+	 * and is what RtNetlink uses natively. */
+	return add_uevent_var(env, "IFINDEX=%d", dev->ifindex);
+}
+#endif
+
+/*
+ *	netdev_release -- class release callback for a dead device.
+ *	Called when the last reference to the device kobject is gone.
+ *	Frees the interface alias and the net_device allocation itself;
+ *	the device must already be in state NETREG_RELEASED.
+ */
+static void netdev_release(struct device *d)
+{
+	struct net_device *dev = to_net_dev(d);
+	char *alloc_start;
+
+	BUG_ON(dev->reg_state != NETREG_RELEASED);
+
+	/* The allocation begins dev->padded bytes before the net_device. */
+	alloc_start = (char *)dev - dev->padded;
+
+	kfree(dev->ifalias);
+	kfree(alloc_start);
+}
+
+/* Class ->namespace callback: the network namespace of the net_device. */
+static const void *net_namespace(struct device *d)
+{
+	return dev_net(container_of(d, struct net_device, dev));
+}
+
+/* The "net" device class; the optional hooks depend on kernel config. */
+static struct class net_class = {
+	.name = "net",
+	.dev_release = netdev_release,
+#ifdef CONFIG_SYSFS
+	.dev_attrs = net_class_attributes,
+#endif /* CONFIG_SYSFS */
+#ifdef CONFIG_HOTPLUG
+	.dev_uevent = netdev_uevent,
+#endif
+	/* Devices of this class are tagged by network namespace. */
+	.ns_type = &net_ns_type_operations,
+	.namespace = net_namespace,
+};
+
+/* Remove the device's sysfs entries.  An extra kobject reference is
+ * taken first so the kobject stays around until after all netdev
+ * references are gone.
+ */
+void netdev_unregister_kobject(struct net_device *net)
+{
+	struct device *dev = &net->dev;
+
+	kobject_get(&dev->kobj);
+
+	remove_queue_kobjects(net);
+
+	device_del(dev);
+}
+
+/*
+ * Create the sysfs entries for a network device: initialise the embedded
+ * struct device, fill in the attribute group list, add the device and
+ * register its queue kobjects.  Returns 0 or a negative errno; when the
+ * queue kobjects cannot be registered the device is deleted again.
+ */
+int netdev_register_kobject(struct net_device *net)
+{
+	struct device *dev = &net->dev;
+	const struct attribute_group **groups = net->sysfs_groups;
+	int error;
+
+	device_initialize(dev);
+	dev->class = &net_class;
+	dev->platform_data = net;
+	dev->groups = groups;
+
+	dev_set_name(dev, "%s", net->name);
+
+#ifdef CONFIG_SYSFS
+	/* Slot 0 may already hold a device specific group; keep it. */
+	if (*groups)
+		groups++;
+
+	*groups++ = &netstat_group;
+#ifdef CONFIG_WIRELESS_EXT_SYSFS
+	if (net->ieee80211_ptr)
+		*groups++ = &wireless_group;
+#ifdef CONFIG_WIRELESS_EXT
+	else if (net->wireless_handlers)
+		*groups++ = &wireless_group;
+#endif
+#endif
+#endif /* CONFIG_SYSFS */
+
+	error = device_add(dev);
+	if (error)
+		return error;
+
+	error = register_queue_kobjects(net);
+	if (error)
+		device_del(dev);
+
+	return error;
+}
+
+/* Add a class-level attribute file to the "net" class. */
+int netdev_class_create_file(struct class_attribute *class_attr)
+{
+	return class_create_file(&net_class, class_attr);
+}
+EXPORT_SYMBOL(netdev_class_create_file);
+
+/* Remove a class-level attribute file from the "net" class. */
+void netdev_class_remove_file(struct class_attribute *class_attr)
+{
+	class_remove_file(&net_class, class_attr);
+}
+EXPORT_SYMBOL(netdev_class_remove_file);
+
+/*
+ * Register the network namespace kobject type and the "net" class.
+ * Returns the result of class_register(); the return value of
+ * kobj_ns_type_register() is not checked.
+ */
+int netdev_kobject_init(void)
+{
+	kobj_ns_type_register(&net_ns_type_operations);
+	return class_register(&net_class);
+}
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
new file mode 100644
index 00000000..bd7751ec
--- /dev/null
+++ b/net/core/net-sysfs.h
@@ -0,0 +1,11 @@
+#ifndef __NET_SYSFS_H__
+#define __NET_SYSFS_H__
+
+/* Register the net namespace kobject type and the "net" device class. */
+int netdev_kobject_init(void);
+/* Create / remove the sysfs representation of a network device. */
+int netdev_register_kobject(struct net_device *);
+void netdev_unregister_kobject(struct net_device *);
+/* Resize the set of rx-<n> / tx-<n> queue kobjects from old_num to new_num. */
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+int netdev_queue_update_kobjects(struct net_device *net,
+				 int old_num, int new_num);
+
+#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 00000000..7f1bb2ab
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,34 @@
+/*
+ * consolidates trace point definitions
+ *
+ * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/netlink.h>
+#include <linux/net_dropmon.h>
+#include <linux/slab.h>
+
+#include <asm/unaligned.h>
+#include <asm/bitops.h>
+
+/*
+ * NOTE(review): presumably defining CREATE_TRACE_POINTS before including
+ * the trace event headers makes them emit the tracepoint definitions in
+ * this translation unit -- confirm against the tracepoint documentation.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
+#include <trace/events/net.h>
+#include <trace/events/napi.h>
+
+/* Tracepoints exported (GPL-only) by this file. */
+EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
new file mode 100644
index 00000000..2772ed11
--- /dev/null
+++ b/net/core/net_namespace.c
@@ -0,0 +1,638 @@
+#include <linux/workqueue.h>
+#include <linux/rtnetlink.h>
+#include <linux/cache.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/rculist.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+ *	Our network namespace constructor/destructor lists
+ */
+
+/* All registered pernet_operations; subsystems are inserted before devices. */
+static LIST_HEAD(pernet_list);
+/* Insertion point for subsystems: the first device entry, or the list head. */
+static struct list_head *first_device = &pernet_list;
+/* Held around pernet (un)registration and namespace setup/cleanup. */
+static DEFINE_MUTEX(net_mutex);
+
+/* List of network namespaces; entries are added with list_add_tail_rcu(). */
+LIST_HEAD(net_namespace_list);
+EXPORT_SYMBOL_GPL(net_namespace_list);
+
+/* The initial network namespace. */
+struct net init_net;
+EXPORT_SYMBOL(init_net);
+
+#define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
+
+/* Number of ptr[] slots given to newly allocated net_generic arrays. */
+static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+
+/*
+ * Allocate a zeroed net_generic array with max_gen_ptrs pointer slots.
+ * Returns NULL if the allocation fails.
+ */
+static struct net_generic *net_alloc_generic(void)
+{
+	size_t bytes = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+	struct net_generic *ng = kzalloc(bytes, GFP_KERNEL);
+
+	if (!ng)
+		return NULL;
+
+	ng->len = max_gen_ptrs;
+	return ng;
+}
+
+/*
+ * Store @data in slot @id (1-based) of @net's generic pointer array,
+ * replacing the array with a larger copy when @id does not fit.
+ * Must be called with net_mutex held.  Returns 0 or -ENOMEM.
+ */
+static int net_assign_generic(struct net *net, int id, void *data)
+{
+	struct net_generic *cur, *grown;
+
+	BUG_ON(!mutex_is_locked(&net_mutex));
+	BUG_ON(id == 0);
+
+	cur = rcu_dereference_protected(net->gen,
+					lockdep_is_held(&net_mutex));
+	if (cur->len < id) {
+		grown = net_alloc_generic();
+		if (grown == NULL)
+			return -ENOMEM;
+
+		/*
+		 * Some synchronisation notes:
+		 *
+		 * Readers walk the net->gen array inside an rcu read
+		 * section, and once set a net->gen->ptr[x] pointer never
+		 * changes (see rules in netns/generic.h).
+		 *
+		 * So it is enough to duplicate the array, publish the
+		 * copy, and kfree the old one after a grace period.
+		 */
+		memcpy(&grown->ptr, &cur->ptr, cur->len * sizeof(void*));
+
+		rcu_assign_pointer(net->gen, grown);
+		kfree_rcu(cur, rcu);
+		cur = grown;
+	}
+
+	cur->ptr[id - 1] = data;
+	return 0;
+}
+
+/*
+ * Initialise one pernet_operations for one namespace: allocate and attach
+ * its per-namespace data when it declares an id and a size, then call its
+ * init hook.  Returns 0 or a negative errno; the data is freed on failure.
+ */
+static int ops_init(const struct pernet_operations *ops, struct net *net)
+{
+	void *data = NULL;
+	int err;
+
+	if (ops->id && ops->size) {
+		data = kzalloc(ops->size, GFP_KERNEL);
+		if (!data)
+			return -ENOMEM;
+
+		err = net_assign_generic(net, *ops->id, data);
+		if (err) {
+			kfree(data);
+			return err;
+		}
+	}
+
+	err = ops->init ? ops->init(net) : 0;
+	if (err)
+		kfree(data);
+
+	return err;
+}
+
+/* Free the per-namespace data of @ops in @net, if @ops has any. */
+static void ops_free(const struct pernet_operations *ops, struct net *net)
+{
+	if (!ops->id || !ops->size)
+		return;
+
+	kfree(net_generic(net, *ops->id));
+}
+
+/*
+ * Run the exit hooks of @ops for the namespaces on @net_exit_list: first
+ * ->exit once per namespace, then ->exit_batch once for the whole list.
+ */
+static void ops_exit_list(const struct pernet_operations *ops,
+			  struct list_head *net_exit_list)
+{
+	struct net *dying;
+
+	if (ops->exit) {
+		list_for_each_entry(dying, net_exit_list, exit_list)
+			ops->exit(dying);
+	}
+
+	if (ops->exit_batch)
+		ops->exit_batch(net_exit_list);
+}
+
+/* Free the per-namespace data of @ops for every namespace on the list. */
+static void ops_free_list(const struct pernet_operations *ops,
+			  struct list_head *net_exit_list)
+{
+	struct net *dying;
+
+	if (!ops->size || !ops->id)
+		return;
+
+	list_for_each_entry(dying, net_exit_list, exit_list)
+		ops_free(ops, dying);
+}
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ * If one of them fails, the operations initialised before it are exited
+ * and freed in reverse order and the error is returned.
+ */
+static __net_init int setup_net(struct net *net)
+{
+	/* Must be called with net_mutex held */
+	const struct pernet_operations *ops, *failed_ops;
+	int error = 0;
+	LIST_HEAD(net_exit_list);
+
+	atomic_set(&net->count, 1);
+	atomic_set(&net->passive, 1);
+
+#ifdef NETNS_REFCNT_DEBUG
+	atomic_set(&net->use_count, 0);
+#endif
+
+	list_for_each_entry(ops, &pernet_list, list) {
+		error = ops_init(ops, net);
+		if (error < 0)
+			goto out_undo;
+	}
+	return error;
+
+out_undo:
+	/* Walk through the list backwards calling the exit functions
+	 * for the pernet modules whose init functions did not fail.
+	 */
+	list_add(&net->exit_list, &net_exit_list);
+	failed_ops = ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_exit_list(ops, &net_exit_list);
+
+	/* Second backwards pass: release the per-namespace data. */
+	ops = failed_ops;
+	list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+		ops_free_list(ops, &net_exit_list);
+
+	rcu_barrier();
+	return error;
+}
+
+
+#ifdef CONFIG_NET_NS
+/* Slab cache for struct net; created in net_ns_init(). */
+static struct kmem_cache *net_cachep;
+/* Workqueue on which namespace cleanup work is queued by __put_net(). */
+static struct workqueue_struct *netns_wq;
+
+/*
+ * Allocate a zeroed struct net together with its generic pointer array.
+ * Returns NULL if either allocation fails.
+ */
+static struct net *net_alloc(void)
+{
+	struct net_generic *ng = net_alloc_generic();
+	struct net *net;
+
+	if (!ng)
+		return NULL;
+
+	net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
+	if (!net) {
+		kfree(ng);
+		return NULL;
+	}
+
+	rcu_assign_pointer(net->gen, ng);
+	return net;
+}
+
+/*
+ * Free a struct net and its generic pointer array.  With
+ * NETNS_REFCNT_DEBUG, a namespace whose use_count is non-zero is reported
+ * and left allocated instead.
+ */
+static void net_free(struct net *net)
+{
+#ifdef NETNS_REFCNT_DEBUG
+	if (unlikely(atomic_read(&net->use_count) != 0)) {
+		printk(KERN_EMERG "network namespace not free! Usage: %d\n",
+			atomic_read(&net->use_count));
+		return;
+	}
+#endif
+	kfree(net->gen);
+	kmem_cache_free(net_cachep, net);
+}
+
+/*
+ * Drop one passive reference on the namespace @p (NULL is allowed) and
+ * free the namespace when that was the last one.
+ */
+void net_drop_ns(void *p)
+{
+	struct net *ns = p;
+
+	if (!ns)
+		return;
+
+	if (atomic_dec_and_test(&ns->passive))
+		net_free(ns);
+}
+
+/*
+ * Return the network namespace for a new task: a fresh one when
+ * CLONE_NEWNET is set in @flags, otherwise @old_net with an extra
+ * reference.  Returns an ERR_PTR() on failure.
+ */
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+	struct net *new_net;
+	int rv;
+
+	if (!(flags & CLONE_NEWNET))
+		return get_net(old_net);
+
+	new_net = net_alloc();
+	if (!new_net)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&net_mutex);
+	rv = setup_net(new_net);
+	if (rv == 0) {
+		/* Make the namespace visible to list walkers. */
+		rtnl_lock();
+		list_add_tail_rcu(&new_net->list, &net_namespace_list);
+		rtnl_unlock();
+	}
+	mutex_unlock(&net_mutex);
+
+	if (rv >= 0)
+		return new_net;
+
+	net_drop_ns(new_net);
+	return ERR_PTR(rv);
+}
+
+/* Namespaces queued by __put_net() and waiting for cleanup_net(). */
+static DEFINE_SPINLOCK(cleanup_list_lock);
+static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
+
+/*
+ * Work function that destroys every namespace currently queued on
+ * cleanup_list: unlink them from net_namespace_list, wait for RCU readers,
+ * run all pernet exit methods and free the per-namespace data in reverse
+ * registration order, then drop the final passive reference on each.
+ */
+static void cleanup_net(struct work_struct *work)
+{
+	const struct pernet_operations *ops;
+	struct net *net, *tmp;
+	LIST_HEAD(net_kill_list);
+	LIST_HEAD(net_exit_list);
+
+	/* Atomically snapshot the list of namespaces to cleanup */
+	spin_lock_irq(&cleanup_list_lock);
+	list_replace_init(&cleanup_list, &net_kill_list);
+	spin_unlock_irq(&cleanup_list_lock);
+
+	mutex_lock(&net_mutex);
+
+	/* Don't let anyone else find us. */
+	rtnl_lock();
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+		list_del_rcu(&net->list);
+		list_add_tail(&net->exit_list, &net_exit_list);
+	}
+	rtnl_unlock();
+
+	/*
+	 * Another CPU might be rcu-iterating the list, wait for it.
+	 * This needs to be before calling the exit() notifiers, so
+	 * the rcu_barrier() below isn't sufficient alone.
+	 */
+	synchronize_rcu();
+
+	/* Run all of the network namespace exit methods */
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_exit_list(ops, &net_exit_list);
+
+	/* Free the net generic variables */
+	list_for_each_entry_reverse(ops, &pernet_list, list)
+		ops_free_list(ops, &net_exit_list);
+
+	mutex_unlock(&net_mutex);
+
+	/* Ensure there are no outstanding rcu callbacks using this
+	 * network namespace.
+	 */
+	rcu_barrier();
+
+	/* Finally it is safe to free my network namespace structure */
+	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
+		list_del_init(&net->exit_list);
+		net_drop_ns(net);
+	}
+}
+/* Single work item; __put_net() queues it on netns_wq. */
+static DECLARE_WORK(net_cleanup_work, cleanup_net);
+
+/*
+ * Queue @net for destruction: add it to cleanup_list and queue the
+ * cleanup work, so the teardown itself runs in process context.
+ */
+void __put_net(struct net *net)
+{
+	/* Cleanup the network namespace in process context */
+	unsigned long flags;
+
+	spin_lock_irqsave(&cleanup_list_lock, flags);
+	list_add(&net->cleanup_list, &cleanup_list);
+	spin_unlock_irqrestore(&cleanup_list_lock, flags);
+
+	queue_work(netns_wq, &net_cleanup_work);
+}
+EXPORT_SYMBOL_GPL(__put_net);
+
+/*
+ * Resolve a namespace file descriptor to its network namespace and take a
+ * reference on it.  Returns an ERR_PTR() if the descriptor cannot be
+ * resolved, or -EINVAL if it refers to some other kind of namespace.
+ */
+struct net *get_net_ns_by_fd(int fd)
+{
+	struct file *file = proc_ns_fget(fd);
+	struct proc_inode *ei;
+	struct net *net;
+
+	if (IS_ERR(file))
+		return ERR_CAST(file);
+
+	ei = PROC_I(file->f_dentry->d_inode);
+	if (ei->ns_ops != &netns_operations)
+		net = ERR_PTR(-EINVAL);
+	else
+		net = get_net(ei->ns);
+
+	fput(file);
+	return net;
+}
+
+#else
+/*
+ * Variant without CONFIG_NET_NS: a new namespace cannot be created, so
+ * CLONE_NEWNET is rejected and otherwise @old_net is returned as is.
+ */
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+	if (!(flags & CLONE_NEWNET))
+		return old_net;
+
+	return ERR_PTR(-EINVAL);
+}
+
+/* Variant without CONFIG_NET_NS: lookup by descriptor always fails. */
+struct net *get_net_ns_by_fd(int fd)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif
+
+/*
+ * Look up the network namespace of the task with the given pid and take a
+ * reference on it.  Returns ERR_PTR(-ESRCH) if there is no such task or
+ * the task has no nsproxy.
+ */
+struct net *get_net_ns_by_pid(pid_t pid)
+{
+	struct net *net = ERR_PTR(-ESRCH);
+	struct nsproxy *nsproxy;
+	struct task_struct *tsk;
+
+	/* Lookup the network namespace */
+	rcu_read_lock();
+	tsk = find_task_by_vpid(pid);
+	nsproxy = tsk ? task_nsproxy(tsk) : NULL;
+	if (nsproxy)
+		net = get_net(nsproxy->net_ns);
+	rcu_read_unlock();
+
+	return net;
+}
+EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
+
+/*
+ * Initcall that sets up namespace infrastructure: the struct net cache and
+ * cleanup workqueue (CONFIG_NET_NS only), then the initial namespace
+ * init_net, which is set up and added to net_namespace_list.  Failures
+ * here panic.
+ */
+static int __init net_ns_init(void)
+{
+	struct net_generic *ng;
+
+#ifdef CONFIG_NET_NS
+	net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
+					SMP_CACHE_BYTES,
+					SLAB_PANIC, NULL);
+
+	/* Create workqueue for cleanup */
+	netns_wq = create_singlethread_workqueue("netns");
+	if (!netns_wq)
+		panic("Could not create netns workq");
+#endif
+
+	ng = net_alloc_generic();
+	if (!ng)
+		panic("Could not allocate generic netns");
+
+	rcu_assign_pointer(init_net.gen, ng);
+
+	mutex_lock(&net_mutex);
+	if (setup_net(&init_net))
+		panic("Could not setup the initial network namespace");
+
+	rtnl_lock();
+	list_add_tail_rcu(&init_net.list, &net_namespace_list);
+	rtnl_unlock();
+
+	mutex_unlock(&net_mutex);
+
+	return 0;
+}
+
+pure_initcall(net_ns_init);
+
+#ifdef CONFIG_NET_NS
+/*
+ * Link @ops into @list and initialise it for every existing namespace.
+ * If one initialisation fails, @ops is unlinked again and the namespaces
+ * already initialised are exited and freed.
+ */
+static int __register_pernet_operations(struct list_head *list,
+					struct pernet_operations *ops)
+{
+	struct net *net;
+	int error;
+	LIST_HEAD(net_exit_list);
+
+	list_add_tail(&ops->list, list);
+
+	/* Nothing to run per namespace: no init hook and no private data. */
+	if (!ops->init && !(ops->id && ops->size))
+		return 0;
+
+	for_each_net(net) {
+		error = ops_init(ops, net);
+		if (error) {
+			/* If I have an error cleanup all namespaces I initialized */
+			list_del(&ops->list);
+			ops_exit_list(ops, &net_exit_list);
+			ops_free_list(ops, &net_exit_list);
+			return error;
+		}
+		list_add_tail(&net->exit_list, &net_exit_list);
+	}
+
+	return 0;
+}
+
+/*
+ * Unlink @ops and run its exit hooks, then free its per-namespace data,
+ * for every existing namespace.
+ */
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+	struct net *net;
+	LIST_HEAD(net_exit_list);
+
+	list_del(&ops->list);
+	/* Collect all namespaces so the exit hooks see them as one batch. */
+	for_each_net(net)
+		list_add_tail(&net->exit_list, &net_exit_list);
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
+}
+
+#else
+
+/*
+ * Variant without CONFIG_NET_NS: only init_net exists, so @ops is simply
+ * initialised for it; @list is not used.
+ */
+static int __register_pernet_operations(struct list_head *list,
+					struct pernet_operations *ops)
+{
+	return ops_init(ops, &init_net);
+}
+
+/* Variant without CONFIG_NET_NS: exit and free @ops for init_net only. */
+static void __unregister_pernet_operations(struct pernet_operations *ops)
+{
+	LIST_HEAD(net_exit_list);
+	list_add(&init_net.exit_list, &net_exit_list);
+	ops_exit_list(ops, &net_exit_list);
+	ops_free_list(ops, &net_exit_list);
+}
+
+#endif /* CONFIG_NET_NS */
+
+/* Allocator for the ids stored through pernet_operations->id (from 1 up). */
+static DEFINE_IDA(net_generic_ids);
+
+/*
+ * Register @ops on @list.  When @ops wants per-namespace data (ops->id is
+ * set) an id >= 1 is allocated first and max_gen_ptrs is raised to cover
+ * it.  If the registration itself fails the id is released again.
+ *
+ * Returns 0 or a negative errno.
+ */
+static int register_pernet_operations(struct list_head *list,
+				      struct pernet_operations *ops)
+{
+	int error;
+
+	if (ops->id) {
+again:
+		error = ida_get_new_above(&net_generic_ids, 1, ops->id);
+		if (error < 0) {
+			if (error == -EAGAIN) {
+				/*
+				 * The IDA needs more memory.  If refilling
+				 * fails, give up rather than retrying in a
+				 * loop that cannot make progress.
+				 */
+				if (!ida_pre_get(&net_generic_ids, GFP_KERNEL))
+					return -ENOMEM;
+				goto again;
+			}
+			return error;
+		}
+		max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
+	}
+	error = __register_pernet_operations(list, ops);
+	if (error) {
+		/* Wait for pending RCU callbacks before releasing the id. */
+		rcu_barrier();
+		if (ops->id)
+			ida_remove(&net_generic_ids, *ops->id);
+	}
+
+	return error;
+}
+
+/*
+ * Unregister @ops from all namespaces, wait for pending RCU callbacks,
+ * then release the id that was allocated for it, if any.
+ */
+static void unregister_pernet_operations(struct pernet_operations *ops)
+{
+	__unregister_pernet_operations(ops);
+	rcu_barrier();
+
+	if (!ops->id)
+		return;
+
+	ida_remove(&net_generic_ids, *ops->id);
+}
+
+/**
+ *      register_pernet_subsys - register a network namespace subsystem
+ *	@ops:  pernet operations structure for the subsystem
+ *
+ *	Register a subsystem which has init and exit functions
+ *	that are called when network namespaces are created and
+ *	destroyed respectively.
+ *
+ *	When registered all network namespace init functions are
+ *	called for every existing network namespace.  Allowing kernel
+ *	modules to have a race free view of the set of network namespaces.
+ *
+ *	When a new network namespace is created all of the init
+ *	methods are called in the order in which they were registered.
+ *
+ *	When a network namespace is destroyed all of the exit methods
+ *	are called in the reverse of the order with which they were
+ *	registered.
+ */
+int register_pernet_subsys(struct pernet_operations *ops)
+{
+	int ret;
+
+	/* Subsystems are inserted in front of the first device entry. */
+	mutex_lock(&net_mutex);
+	ret = register_pernet_operations(first_device, ops);
+	mutex_unlock(&net_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_pernet_subsys);
+
+/**
+ *      unregister_pernet_subsys - unregister a network namespace subsystem
+ *	@ops: pernet operations structure to manipulate
+ *
+ *	Remove the pernet operations structure from the list to be
+ *	used when network namespaces are created or destroyed.  In
+ *	addition run the exit method for all existing network
+ *	namespaces.
+ */
+void unregister_pernet_subsys(struct pernet_operations *ops)
+{
+	/* net_mutex covers the pernet list and the exit calls made below. */
+	mutex_lock(&net_mutex);
+	unregister_pernet_operations(ops);
+	mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
+
+/**
+ *      register_pernet_device - register a network namespace device
+ *	@ops:  pernet operations structure for the subsystem
+ *
+ *	Register a device which has init and exit functions
+ *	that are called when network namespaces are created and
+ *	destroyed respectively.
+ *
+ *	When registered all network namespace init functions are
+ *	called for every existing network namespace.  Allowing kernel
+ *	modules to have a race free view of the set of network namespaces.
+ *
+ *	When a new network namespace is created all of the init
+ *	methods are called in the order in which they were registered.
+ *
+ *	When a network namespace is destroyed all of the exit methods
+ *	are called in the reverse of the order with which they were
+ *	registered.
+ */
+int register_pernet_device(struct pernet_operations *ops)
+{
+	int ret;
+
+	mutex_lock(&net_mutex);
+	ret = register_pernet_operations(&pernet_list, ops);
+	/* Remember the first device entry as the subsystem insertion point. */
+	if (ret == 0 && first_device == &pernet_list)
+		first_device = &ops->list;
+	mutex_unlock(&net_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_pernet_device);
+
+/**
+ *      unregister_pernet_device - unregister a network namespace netdevice
+ *	@ops: pernet operations structure to manipulate
+ *
+ *	Remove the pernet operations structure from the list to be
+ *	used when network namespaces are created or destroyed.  In
+ *	addition run the exit method for all existing network
+ *	namespaces.
+ */
+void unregister_pernet_device(struct pernet_operations *ops)
+{
+	mutex_lock(&net_mutex);
+	/* If this was the first device entry, the next entry takes its place. */
+	if (&ops->list == first_device)
+		first_device = first_device->next;
+	unregister_pernet_operations(ops);
+	mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+/*
+ * proc_ns ->get(): return @task's network namespace with a reference
+ * held, or NULL if the task has no nsproxy.
+ */
+static void *netns_get(struct task_struct *task)
+{
+	struct nsproxy *nsproxy;
+	struct net *net;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(task);
+	net = nsproxy ? get_net(nsproxy->net_ns) : NULL;
+	rcu_read_unlock();
+
+	return net;
+}
+
+/* proc_ns ->put(): release a namespace reference via put_net(). */
+static void netns_put(void *ns)
+{
+	put_net(ns);
+}
+
+/*
+ * proc_ns ->install(): make @ns the network namespace of @nsproxy,
+ * dropping the reference on the previous one.  Always returns 0.
+ */
+static int netns_install(struct nsproxy *nsproxy, void *ns)
+{
+	put_net(nsproxy->net_ns);
+	nsproxy->net_ns = get_net(ns);
+	return 0;
+}
+
+/* proc namespace operations for the "net" namespace (CLONE_NEWNET). */
+const struct proc_ns_operations netns_operations = {
+	.name		= "net",
+	.type		= CLONE_NEWNET,
+	.get		= netns_get,
+	.put		= netns_put,
+	.install	= netns_install,
+};
+#endif
diff --git a/net/core/netevent.c b/net/core/netevent.c
new file mode 100644
index 00000000..865f0ceb
--- /dev/null
+++ b/net/core/netevent.c
@@ -0,0 +1,69 @@
+/*
+ *	Network event notifiers
+ *
+ *	Authors:
+ *      Tom Tucker             <tom@opengridcomputing.com>
+ *      Steve Wise             <swise@opengridcomputing.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+#include <net/netevent.h>
+
+static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);	/* chain used by all three helpers below */
+
+/**
+ *	register_netevent_notifier - register a netevent notifier block
+ *	@nb: notifier
+ *
+ *	Register a notifier to be called when a netevent occurs.
+ *	The notifier passed is linked into the kernel structures and must
+ *	not be reused until it has been unregistered. A negative errno code
+ *	is returned on a failure.
+ */
+int register_netevent_notifier(struct notifier_block *nb)
+{
+	/*
+	 * The result of the chain primitive is handed back unchanged.
+	 */
+	return atomic_notifier_chain_register(&netevent_notif_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
+
+/**
+ *	unregister_netevent_notifier - unregister a netevent notifier block
+ *	@nb: notifier
+ *
+ *	Unregister a notifier previously registered by
+ *	register_netevent_notifier(). The notifier is unlinked from the
+ *	kernel structures and may then be reused. A negative errno code
+ *	is returned on a failure.
+ */
+
+int unregister_netevent_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
+
+/**
+ *	call_netevent_notifiers - call all netevent notifier blocks
+ *      @val: value passed unmodified to notifier function
+ *      @v:   pointer passed unmodified to notifier function
+ *
+ *	Call all netevent notifier blocks.  Parameters and return value
+ *	are as for atomic_notifier_call_chain().
+ */
+
+int call_netevent_notifiers(unsigned long val, void *v)
+{
+	return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
+}
+EXPORT_SYMBOL_GPL(call_netevent_notifiers);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
new file mode 100644
index 00000000..05db410f
--- /dev/null
+++ b/net/core/netpoll.c
@@ -0,0 +1,954 @@
+/*
+ * Common framework for low-level network console, dump, and debugger code
+ *
+ * Sep 8 2003  Matt Mackall <mpm@selenic.com>
+ *
+ * based on the netconsole code from:
+ *
+ * Copyright (C) 2001  Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002  Red Hat, Inc.
+ */
+
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/string.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/inet.h>
+#include <linux/interrupt.h>
+#include <linux/netpoll.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/unaligned.h>
+#include <trace/events/napi.h>
+
+/*
+ * We maintain a small pool of fully-sized skbs, to make sure the
+ * message gets out even in extreme OOM situations.
+ */
+
+#define MAX_UDP_CHUNK 1460	/* payload bytes budgeted for one pooled skb */
+#define MAX_SKBS 32		/* refill_skbs() fills skb_pool up to this many */
+
+static struct sk_buff_head skb_pool;	/* reserve skbs, used when alloc_skb() fails */
+
+static atomic_t trapped;	/* while non-zero __netpoll_rx() returns 1 for every skb */
+
+#define USEC_PER_POLL	50	/* udelay() between transmit attempts */
+#define NETPOLL_RX_ENABLED  1	/* rx_flags: at least one rx_hook is registered */
+#define NETPOLL_RX_DROP     2	/* rx_flags: set around napi->poll() in poll_one_napi() */
+
+#define MAX_SKB_SIZE \
+		(MAX_UDP_CHUNK + sizeof(struct udphdr) + \
+				sizeof(struct iphdr) + sizeof(struct ethhdr))
+
+static void zap_completion_queue(void);
+static void arp_reply(struct sk_buff *skb);
+
+static unsigned int carrier_timeout = 4;	/* seconds netpoll_setup() waits for carrier */
+module_param(carrier_timeout, uint, 0644);
+
+static void queue_process(struct work_struct *work)
+{
+	struct netpoll_info *npinfo =
+		container_of(work, struct netpoll_info, tx_work.work);
+	struct sk_buff *skb;
+	unsigned long flags;
+	/* Delayed-work handler for tx_work: transmit every skb parked on txq. */
+	while ((skb = skb_dequeue(&npinfo->txq))) {
+		struct net_device *dev = skb->dev;
+		const struct net_device_ops *ops = dev->netdev_ops;
+		struct netdev_queue *txq;
+
+		if (!netif_device_present(dev) || !netif_running(dev)) {
+			__kfree_skb(skb);	/* device absent or down: drop */
+			continue;
+		}
+
+		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+		local_irq_save(flags);
+		__netif_tx_lock(txq, smp_processor_id());
+		if (netif_tx_queue_frozen_or_stopped(txq) ||
+		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
+			skb_queue_head(&npinfo->txq, skb);	/* back to the front: keeps order */
+			__netif_tx_unlock(txq);
+			local_irq_restore(flags);
+
+			schedule_delayed_work(&npinfo->tx_work, HZ/10);	/* retry in HZ/10 jiffies */
+			return;
+		}
+		__netif_tx_unlock(txq);
+		local_irq_restore(flags);
+	}
+}
+
+static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh,
+			    unsigned short ulen, __be32 saddr, __be32 daddr)
+{
+	__wsum psum;
+	/* Returns 0 when no check is needed or the precomputed sum already folds to 0. */
+	if (uh->check == 0 || skb_csum_unnecessary(skb))
+		return 0;
+
+	psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);	/* pseudo-header sum */
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !csum_fold(csum_add(psum, skb->csum)))
+		return 0;
+
+	skb->csum = psum;	/* seed for the full check below */
+
+	return __skb_checksum_complete(skb);	/* otherwise this result decides */
+}
+
+/*
+ * Check whether delayed processing was scheduled for our NIC. If so,
+ * we attempt to grab the poll lock and use ->poll() to pump the card.
+ * If this fails, either we've recursed in ->poll() or it's already
+ * running on another CPU.
+ *
+ * Note: we don't mask interrupts with this lock because we're using
+ * trylock here and interrupts are already disabled in the softirq
+ * case. Further, we test the poll_owner to avoid recursion on UP
+ * systems where the lock doesn't exist.
+ *
+ * In cases where there is bi-directional communications, reading only
+ * one message at a time can lead to packets being dropped by the
+ * network adapter, forcing superfluous retries and possibly timeouts.
+ * Thus, we set our budget to greater than 1.
+ */
+static int poll_one_napi(struct netpoll_info *npinfo,
+			 struct napi_struct *napi, int budget)
+{
+	int work;
+
+	/* net_rx_action's ->poll() invocations and ours are
+	 * synchronized by this test which is only made while
+	 * holding the napi->poll_lock.
+	 */
+	if (!test_bit(NAPI_STATE_SCHED, &napi->state))
+		return budget;	/* nothing scheduled: budget is untouched */
+
+	npinfo->rx_flags |= NETPOLL_RX_DROP;
+	atomic_inc(&trapped);	/* while trapped, __netpoll_rx() returns 1 for every skb */
+	set_bit(NAPI_STATE_NPSVC, &napi->state);
+
+	work = napi->poll(napi, budget);
+	trace_napi_poll(napi);
+
+	clear_bit(NAPI_STATE_NPSVC, &napi->state);
+	atomic_dec(&trapped);
+	npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+
+	return budget - work;	/* what is left for the caller's next NAPI context */
+}
+
+static void poll_napi(struct net_device *dev)
+{
+	struct napi_struct *napi;
+	int budget = 16;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list) {
+		/* skip contexts owned by this CPU or whose poll_lock is taken */
+		if (napi->poll_owner == smp_processor_id() ||
+		    !spin_trylock(&napi->poll_lock))
+			continue;
+		budget = poll_one_napi(dev->npinfo, napi, budget);
+		spin_unlock(&napi->poll_lock);
+		if (!budget)
+			break;
+	}
+}
+
+static void service_arp_queue(struct netpoll_info *npi)
+{
+	struct sk_buff *pending;
+
+	if (!npi)
+		return;
+	while ((pending = skb_dequeue(&npi->arp_tx)))	/* drain arp_tx */
+		arp_reply(pending);
+}
+
+void netpoll_poll_dev(struct net_device *dev)
+{
+	const struct net_device_ops *ops;
+	/* Poll @dev by hand: driver hook, NAPI contexts, then queued ARP work. */
+	if (!dev || !netif_running(dev))
+		return;
+
+	ops = dev->netdev_ops;
+	if (!ops->ndo_poll_controller)
+		return;
+
+	/* Process pending work on NIC */
+	ops->ndo_poll_controller(dev);
+
+	poll_napi(dev);
+
+	if (dev->flags & IFF_SLAVE) {
+		if (dev->npinfo) {
+			struct net_device *bond_dev = dev->master;
+			struct sk_buff *skb;
+			while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) {	/* move ARP skbs to the master */
+				skb->dev = bond_dev;
+				skb_queue_tail(&bond_dev->npinfo->arp_tx, skb);	/* NOTE(review): master npinfo not NULL-checked */
+			}
+		}
+	}
+
+	service_arp_queue(dev->npinfo);
+
+	zap_completion_queue();
+}
+EXPORT_SYMBOL(netpoll_poll_dev);
+
+void netpoll_poll(struct netpoll *np)
+{
+	netpoll_poll_dev(np->dev);	/* wrapper: poll the device bound to @np */
+}
+EXPORT_SYMBOL(netpoll_poll);
+
+static void refill_skbs(void)
+{
+	unsigned long irqflags;
+	struct sk_buff *fresh;
+
+	/*
+	 * Top the reserve pool back up to MAX_SKBS entries; an allocation
+	 * failure simply ends the refill early.
+	 */
+	spin_lock_irqsave(&skb_pool.lock, irqflags);
+	while (skb_pool.qlen < MAX_SKBS &&
+	       (fresh = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC)) != NULL)
+		__skb_queue_tail(&skb_pool, fresh);
+	spin_unlock_irqrestore(&skb_pool.lock, irqflags);
+}
+
+static void zap_completion_queue(void)
+{
+	unsigned long flags;
+	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	/* Empty this CPU's softnet completion_queue, freeing what can be freed here. */
+	if (sd->completion_queue) {
+		struct sk_buff *clist;
+
+		local_irq_save(flags);	/* detach the whole list with IRQs off */
+		clist = sd->completion_queue;
+		sd->completion_queue = NULL;
+		local_irq_restore(flags);
+
+		while (clist != NULL) {
+			struct sk_buff *skb = clist;
+			clist = clist->next;
+			if (skb->destructor) {
+				atomic_inc(&skb->users);
+				dev_kfree_skb_any(skb); /* put this one back */
+			} else {
+				__kfree_skb(skb);	/* no destructor: free it right here */
+			}
+		}
+	}
+
+	put_cpu_var(softnet_data);	/* pairs with get_cpu_var() above */
+}
+
+static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
+{
+	struct sk_buff *skb;
+	int attempt = 0;
+
+	zap_completion_queue();
+	refill_skbs();
+
+	/* Try a fresh allocation first, then the reserve pool; after a double
+	 * miss poll the device and retry, giving up on the tenth miss. */
+	for (;;) {
+		skb = alloc_skb(len, GFP_ATOMIC);
+		if (!skb)
+			skb = skb_dequeue(&skb_pool);
+		if (skb)
+			break;
+		if (++attempt >= 10)
+			return NULL;
+		netpoll_poll(np);
+	}
+
+	atomic_set(&skb->users, 1);
+	skb_reserve(skb, reserve);
+	return skb;
+}
+
+static int netpoll_owner_active(struct net_device *dev)
+{
+	struct napi_struct *napi;
+	int owned = 0;
+
+	list_for_each_entry(napi, &dev->napi_list, dev_list)
+		if ((owned = (napi->poll_owner == smp_processor_id())) != 0)
+			break;	/* this CPU is the poll owner of one of dev's contexts */
+	return owned;
+}
+
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+			     struct net_device *dev)
+{
+	int status = NETDEV_TX_BUSY;
+	unsigned long tries;
+	const struct net_device_ops *ops = dev->netdev_ops;
+	/* It is up to the caller to keep npinfo alive. */
+	struct netpoll_info *npinfo = np->dev->npinfo;
+	/* Send @skb on @dev now if possible, otherwise defer it to queue_process(). */
+	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
+		__kfree_skb(skb);	/* no netpoll state or unusable device: drop */
+		return;
+	}
+
+	/* don't get messages out of order, and no recursion */
+	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
+		struct netdev_queue *txq;
+		unsigned long flags;
+
+		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+		local_irq_save(flags);
+		/* try until next clock tick */
+		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
+		     tries > 0; --tries) {
+			if (__netif_tx_trylock(txq)) {
+				if (!netif_tx_queue_stopped(txq)) {
+					status = ops->ndo_start_xmit(skb, dev);
+					if (status == NETDEV_TX_OK)
+						txq_trans_update(txq);
+				}
+				__netif_tx_unlock(txq);
+
+				if (status == NETDEV_TX_OK)
+					break;
+
+			}
+
+			/* tickle the device, maybe there is some cleanup to do */
+			netpoll_poll(np);
+
+			udelay(USEC_PER_POLL);
+		}
+
+		WARN_ONCE(!irqs_disabled(),
+			"netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n",
+			dev->name, ops->ndo_start_xmit);
+
+		local_irq_restore(flags);
+	}
+
+	if (status != NETDEV_TX_OK) {
+		skb_queue_tail(&npinfo->txq, skb);	/* park it for the tx_work handler */
+		schedule_delayed_work(&npinfo->tx_work,0);
+	}
+}
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
+
+void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+{
+	int total_len, eth_len, ip_len, udp_len;
+	struct sk_buff *skb;
+	struct udphdr *udph;
+	struct iphdr *iph;
+	struct ethhdr *eth;
+	/* Wrap @len bytes of @msg in UDP, IPv4 and Ethernet headers and send via @np. */
+	udp_len = len + sizeof(*udph);
+	ip_len = eth_len = udp_len + sizeof(*iph);
+	total_len = eth_len + ETH_HLEN + NET_IP_ALIGN;
+
+	skb = find_skb(np, total_len, total_len - len);	/* reserve room in front for the headers */
+	if (!skb)
+		return;	/* no buffer: the message is silently dropped */
+
+	skb_copy_to_linear_data(skb, msg, len);
+	skb->len += len;
+
+	skb_push(skb, sizeof(*udph));
+	skb_reset_transport_header(skb);
+	udph = udp_hdr(skb);
+	udph->source = htons(np->local_port);
+	udph->dest = htons(np->remote_port);
+	udph->len = htons(udp_len);
+	udph->check = 0;
+	udph->check = csum_tcpudp_magic(np->local_ip,
+					np->remote_ip,
+					udp_len, IPPROTO_UDP,
+					csum_partial(udph, udp_len, 0));
+	if (udph->check == 0)
+		udph->check = CSUM_MANGLED_0;	/* a computed 0 is sent as CSUM_MANGLED_0 */
+
+	skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+
+	/* iph->version = 4; iph->ihl = 5; */
+	put_unaligned(0x45, (unsigned char *)iph);
+	iph->tos      = 0;
+	put_unaligned(htons(ip_len), &(iph->tot_len));
+	iph->id       = 0;
+	iph->frag_off = 0;
+	iph->ttl      = 64;
+	iph->protocol = IPPROTO_UDP;
+	iph->check    = 0;
+	put_unaligned(np->local_ip, &(iph->saddr));
+	put_unaligned(np->remote_ip, &(iph->daddr));
+	iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+	eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+	skb->protocol = eth->h_proto = htons(ETH_P_IP);
+	memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN);
+	memcpy(eth->h_dest, np->remote_mac, ETH_ALEN);
+
+	skb->dev = np->dev;
+
+	netpoll_send_skb(np, skb);
+}
+EXPORT_SYMBOL(netpoll_send_udp);
+
+static void arp_reply(struct sk_buff *skb)
+{
+	struct netpoll_info *npinfo = skb->dev->npinfo;
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
+	__be32 sip, tip;
+	unsigned char *sha;
+	struct sk_buff *send_skb;
+	struct netpoll *np, *tmp;
+	unsigned long flags;
+	int hits = 0;
+	/* Answer the ARP request in @skb on behalf of a netpoll client owning the target IP. */
+	if (list_empty(&npinfo->rx_np))
+		return;
+
+	/* Before checking the packet, we do some early
+	   inspection whether this is interesting at all */
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
+		if (np->dev == skb->dev)
+			hits++;
+	}
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
+	/* No netpoll struct is using this dev */
+	if (!hits)
+		return;
+
+	/* No arp on this interface */
+	if (skb->dev->flags & IFF_NOARP)
+		return;
+
+	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
+		return;
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	arp = arp_hdr(skb);
+
+	if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+	     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+	    arp->ar_pro != htons(ETH_P_IP) ||
+	    arp->ar_op != htons(ARPOP_REQUEST))
+		return;
+
+	arp_ptr = (unsigned char *)(arp+1);
+	/* save the location of the src hw addr */
+	sha = arp_ptr;
+	arp_ptr += skb->dev->addr_len;
+	memcpy(&sip, arp_ptr, 4);	/* sender IP of the request */
+	arp_ptr += 4;
+	/* If we actually cared about dst hw addr,
+	   it would get copied here */
+	arp_ptr += skb->dev->addr_len;
+	memcpy(&tip, arp_ptr, 4);	/* target IP being asked for */
+
+	/* Should we ignore arp? */
+	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+		return;
+
+	size = arp_hdr_len(skb->dev);
+
+	spin_lock_irqsave(&npinfo->rx_lock, flags);	/* held while the reply is built and sent */
+	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
+		if (tip != np->local_ip)
+			continue;
+
+		send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
+				    LL_RESERVED_SPACE(np->dev));
+		if (!send_skb)
+			continue;
+
+		skb_reset_network_header(send_skb);
+		arp = (struct arphdr *) skb_put(send_skb, size);
+		send_skb->dev = skb->dev;
+		send_skb->protocol = htons(ETH_P_ARP);
+
+		/* Fill the device header for the ARP frame */
+		if (dev_hard_header(send_skb, skb->dev, ptype,
+				    sha, np->dev->dev_addr,
+				    send_skb->len) < 0) {
+			kfree_skb(send_skb);
+			continue;
+		}
+
+		/*
+		 * Fill out the arp protocol part.
+		 *
+		 * we only support ethernet device type,
+		 * which (according to RFC 1390) should
+		 * always equal 1 (Ethernet).
+		 */
+
+		arp->ar_hrd = htons(np->dev->type);
+		arp->ar_pro = htons(ETH_P_IP);
+		arp->ar_hln = np->dev->addr_len;
+		arp->ar_pln = 4;
+		arp->ar_op = htons(type);
+
+		arp_ptr = (unsigned char *)(arp + 1);
+		memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len);	/* sender hw: our address */
+		arp_ptr += np->dev->addr_len;
+		memcpy(arp_ptr, &tip, 4);	/* sender IP: the address that was asked for */
+		arp_ptr += 4;
+		memcpy(arp_ptr, sha, np->dev->addr_len);	/* target hw: the requester */
+		arp_ptr += np->dev->addr_len;
+		memcpy(arp_ptr, &sip, 4);	/* target IP: the requester */
+
+		netpoll_send_skb(np, send_skb);
+
+		/* If there are several rx_hooks for the same address,
+		   we're fine by sending a single reply */
+		break;
+	}
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+}
+
+int __netpoll_rx(struct sk_buff *skb)
+{
+	int proto, len, ulen;
+	int hits = 0;
+	const struct iphdr *iph;
+	struct udphdr *uh;
+	struct netpoll_info *npinfo = skb->dev->npinfo;
+	struct netpoll *np, *tmp;
+
+	/* Returns 1 when the skb was consumed here (queued for an ARP reply,
+	 * delivered to at least one rx_hook, or dropped while netpoll is
+	 * trapped) and 0 when it was not consumed. */
+	if (list_empty(&npinfo->rx_np) || skb->dev->type != ARPHRD_ETHER)
+		goto out;
+
+	/* check if netpoll clients need ARP */
+	if (skb->protocol == htons(ETH_P_ARP) &&
+	    atomic_read(&trapped)) {
+		skb_queue_tail(&npinfo->arp_tx, skb);
+		return 1;
+	}
+
+	proto = ntohs(eth_hdr(skb)->h_proto);
+	if (proto != ETH_P_IP || skb->pkt_type == PACKET_OTHERHOST ||
+	    skb_shared(skb))
+		goto out;
+
+	/* pskb_may_pull() can reallocate the head: reload iph after each call */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+	iph = (struct iphdr *)skb->data;
+	if (iph->ihl < 5 || iph->version != 4)
+		goto out;
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto out;
+	iph = (struct iphdr *)skb->data;
+	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+		goto out;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < iph->ihl*4)
+		goto out;
+
+	/*
+	 * Our transport medium may have padded the buffer out.
+	 * Now we trim to the true length of the frame.
+	 */
+	if (pskb_trim_rcsum(skb, len))
+		goto out;
+
+	iph = (struct iphdr *)skb->data;	/* trimming may move the data too */
+	if (iph->protocol != IPPROTO_UDP)
+		goto out;
+
+	len -= iph->ihl*4;
+	uh = (struct udphdr *)(((char *)iph) + iph->ihl*4);
+	ulen = ntohs(uh->len);
+
+	if (ulen != len)
+		goto out;
+	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
+		goto out;
+
+	list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) {
+		if (np->local_ip && np->local_ip != iph->daddr)
+			continue;
+		if (np->remote_ip && np->remote_ip != iph->saddr)
+			continue;
+		if (np->local_port && np->local_port != ntohs(uh->dest))
+			continue;
+
+		np->rx_hook(np, ntohs(uh->source),
+			       (char *)(uh+1),
+			       ulen - sizeof(struct udphdr));
+		hits++;
+	}
+
+	if (!hits)
+		goto out;
+
+	kfree_skb(skb);
+	return 1;
+
+out:
+	if (atomic_read(&trapped)) {
+		kfree_skb(skb);
+		return 1;
+	}
+
+	return 0;
+}
+
+void netpoll_print_options(struct netpoll *np)
+{
+	printk(KERN_INFO "%s: local port %d\n",
+			 np->name, np->local_port);	/* every line is prefixed with np->name */
+	printk(KERN_INFO "%s: local IP %pI4\n",
+			 np->name, &np->local_ip);
+	printk(KERN_INFO "%s: interface '%s'\n",
+			 np->name, np->dev_name);
+	printk(KERN_INFO "%s: remote port %d\n",
+			 np->name, np->remote_port);
+	printk(KERN_INFO "%s: remote IP %pI4\n",
+			 np->name, &np->remote_ip);
+	printk(KERN_INFO "%s: remote ethernet address %pM\n",
+	                 np->name, np->remote_mac);
+}
+EXPORT_SYMBOL(netpoll_print_options);
+
+int netpoll_parse_options(struct netpoll *np, char *opt)
+{
+	char *cur=opt, *delim;
+	/* Syntax: [src-port]@[src-ip]/[dev],[dst-port]@<dst-ip>/[dst-mac].
+	 * @opt is cut up in place (each delimiter is overwritten with NUL).
+	 * Returns 0 on success, -1 on a parse failure (which is logged). */
+	if (*cur != '@') {
+		if ((delim = strchr(cur, '@')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		np->local_port = simple_strtol(cur, NULL, 10);
+		cur = delim;
+	}
+	cur++;
+
+	if (*cur != '/') {
+		if ((delim = strchr(cur, '/')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		np->local_ip = in_aton(cur);
+		cur = delim;
+	}
+	cur++;
+
+	if (*cur != ',') {
+		/* parse out dev name */
+		if ((delim = strchr(cur, ',')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		strlcpy(np->dev_name, cur, sizeof(np->dev_name));
+		cur = delim;
+	}
+	cur++;
+
+	if (*cur != '@') {
+		/* dst port */
+		if ((delim = strchr(cur, '@')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		if (*cur == ' ' || *cur == '\t')
+			printk(KERN_INFO "%s: warning: whitespace"
+					" is not allowed\n", np->name);
+		np->remote_port = simple_strtol(cur, NULL, 10);
+		cur = delim;
+	}
+	cur++;
+
+	/* dst ip */
+	if ((delim = strchr(cur, '/')) == NULL)
+		goto parse_failed;
+	*delim = 0;
+	np->remote_ip = in_aton(cur);
+	cur = delim + 1;
+
+	if (*cur != 0) {
+		/* MAC address */
+		if (!mac_pton(cur, np->remote_mac))
+			goto parse_failed;
+	}
+	netpoll_print_options(np);
+	return 0;
+
+ parse_failed:
+	printk(KERN_INFO "%s: couldn't parse config at '%s'!\n",
+	       np->name, cur);
+	return -1;
+}
+EXPORT_SYMBOL(netpoll_parse_options);
+
+int __netpoll_setup(struct netpoll *np)
+{
+	struct net_device *ndev = np->dev;
+	struct netpoll_info *npinfo;
+	const struct net_device_ops *ops;
+	unsigned long flags;
+	int err;
+	/* Attach @np to np->dev, creating the device's netpoll_info on first use. */
+	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+	    !ndev->netdev_ops->ndo_poll_controller) {
+		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
+		       np->name, np->dev_name);
+		err = -ENOTSUPP;
+		goto out;
+	}
+
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);	/* first netpoll on this device */
+		if (!npinfo) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		npinfo->rx_flags = 0;
+		INIT_LIST_HEAD(&npinfo->rx_np);
+
+		spin_lock_init(&npinfo->rx_lock);
+		skb_queue_head_init(&npinfo->arp_tx);
+		skb_queue_head_init(&npinfo->txq);
+		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+		atomic_set(&npinfo->refcnt, 1);
+
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_setup) {
+			err = ops->ndo_netpoll_setup(ndev, npinfo);
+			if (err)
+				goto free_npinfo;
+		}
+	} else {
+		npinfo = ndev->npinfo;	/* share the existing state */
+		atomic_inc(&npinfo->refcnt);
+	}
+
+	npinfo->netpoll = np;
+
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		list_add_tail(&np->rx, &npinfo->rx_np);	/* __netpoll_rx() walks this list */
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	/* last thing to do is link it to the net device structure */
+	rcu_assign_pointer(ndev->npinfo, npinfo);
+
+	return 0;
+
+free_npinfo:
+	kfree(npinfo);	/* only reached for an npinfo allocated above */
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
+
+int netpoll_setup(struct netpoll *np)
+{
+	struct net_device *ndev = NULL;
+	struct in_device *in_dev;
+	int err;
+	/* Look up np->dev_name in init_net, bring it up if needed, and attach @np. */
+	if (np->dev_name)	/* NOTE(review): always true if dev_name is an array member — confirm */
+		ndev = dev_get_by_name(&init_net, np->dev_name);
+	if (!ndev) {
+		printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
+		       np->name, np->dev_name);
+		return -ENODEV;
+	}
+
+	if (ndev->master) {
+		printk(KERN_ERR "%s: %s is a slave device, aborting.\n",
+		       np->name, np->dev_name);
+		err = -EBUSY;
+		goto put;
+	}
+
+	if (!netif_running(ndev)) {
+		unsigned long atmost, atleast;
+
+		printk(KERN_INFO "%s: device %s not up yet, forcing it\n",
+		       np->name, np->dev_name);
+
+		rtnl_lock();
+		err = dev_open(ndev);
+		rtnl_unlock();
+
+		if (err) {
+			printk(KERN_ERR "%s: failed to open %s\n",
+			       np->name, ndev->name);
+			goto put;
+		}
+
+		atleast = jiffies + HZ/10;
+		atmost = jiffies + carrier_timeout * HZ;
+		while (!netif_carrier_ok(ndev)) {
+			if (time_after(jiffies, atmost)) {
+				printk(KERN_NOTICE
+				       "%s: timeout waiting for carrier\n",
+				       np->name);
+				break;	/* setup continues without carrier */
+			}
+			msleep(1);
+		}
+
+		/* If carrier appears to come up instantly, we don't
+		 * trust it and pause so that we don't pump all our
+		 * queued console messages into the bitbucket.
+		 */
+
+		if (time_before(jiffies, atleast)) {
+			printk(KERN_NOTICE "%s: carrier detect appears"
+			       " untrustworthy, waiting 4 seconds\n",
+			       np->name);
+			msleep(4000);
+		}
+	}
+
+	if (!np->local_ip) {
+		rcu_read_lock();
+		in_dev = __in_dev_get_rcu(ndev);
+
+		if (!in_dev || !in_dev->ifa_list) {
+			rcu_read_unlock();
+			printk(KERN_ERR "%s: no IP address for %s, aborting\n",
+			       np->name, np->dev_name);
+			err = -EDESTADDRREQ;
+			goto put;
+		}
+
+		np->local_ip = in_dev->ifa_list->ifa_local;	/* first address on the list */
+		rcu_read_unlock();
+		printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip);
+	}
+
+	np->dev = ndev;	/* NOTE(review): stays set if __netpoll_setup() fails below — confirm callers cope */
+
+	/* fill up the skb queue */
+	refill_skbs();
+
+	rtnl_lock();
+	err = __netpoll_setup(np);
+	rtnl_unlock();
+
+	if (err)
+		goto put;
+
+	return 0;	/* the reference from dev_get_by_name() is kept in np->dev */
+
+put:
+	dev_put(ndev);
+	return err;
+}
+EXPORT_SYMBOL(netpoll_setup);
+
+static int __init netpoll_init(void)
+{
+	skb_queue_head_init(&skb_pool);	/* set up the reserve pool that refill_skbs() fills */
+	return 0;
+}
+core_initcall(netpoll_init);
+
+void __netpoll_cleanup(struct netpoll *np)
+{
+	struct netpoll_info *npinfo;
+	unsigned long flags;
+	/* Detach @np from its device; the last user also tears down and frees npinfo. */
+	npinfo = np->dev->npinfo;
+	if (!npinfo)
+		return;
+
+	if (!list_empty(&npinfo->rx_np)) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		list_del(&np->rx);
+		if (list_empty(&npinfo->rx_np))
+			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;	/* no rx_hook users left */
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	if (atomic_dec_and_test(&npinfo->refcnt)) {
+		const struct net_device_ops *ops;
+
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_cleanup)
+			ops->ndo_netpoll_cleanup(np->dev);
+
+		rcu_assign_pointer(np->dev->npinfo, NULL);	/* unpublish before freeing */
+
+		/* avoid racing with NAPI reading npinfo */
+		synchronize_rcu_bh();
+
+		skb_queue_purge(&npinfo->arp_tx);
+		skb_queue_purge(&npinfo->txq);
+		cancel_delayed_work_sync(&npinfo->tx_work);
+
+		/* clean after last, unfinished work */
+		__skb_queue_purge(&npinfo->txq);
+		kfree(npinfo);
+	}
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
+
+void netpoll_cleanup(struct netpoll *np)
+{
+	/* nothing to undo unless a device is attached */
+	if (np->dev) {
+		rtnl_lock();
+		__netpoll_cleanup(np);
+		rtnl_unlock();
+		/* give the device reference back and mark @np as detached */
+		dev_put(np->dev);
+		np->dev = NULL;
+	}
+}
+EXPORT_SYMBOL(netpoll_cleanup);
+
+int netpoll_trap(void)
+{
+	return atomic_read(&trapped);	/* current trap count; non-zero means trapping */
+}
+EXPORT_SYMBOL(netpoll_trap);
+
+void netpoll_set_trap(int trap)
+{
+	if (!trap)
+		atomic_dec(&trapped);	/* zero: give one count back */
+	else
+		atomic_inc(&trapped);	/* non-zero: take one count */
+}
+EXPORT_SYMBOL(netpoll_set_trap);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
new file mode 100644
index 00000000..c0e0f767
--- /dev/null
+++ b/net/core/pktgen.c
@@ -0,0 +1,3796 @@
+/*
+ * Authors:
+ * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
+ *                             Uppsala University and
+ *                             Swedish University of Agricultural Sciences
+ *
+ * Alexey Kuznetsov  <kuznet@ms2.inr.ac.ru>
+ * Ben Greear <greearb@candelatech.com>
+ * Jens Låås <jens.laas@data.slu.se>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *
+ * A tool for loading the network with preconfigured packets.
+ * The tool is implemented as a linux module.  Parameters are output
+ * device, delay (to hard_xmit), number of packets, and whether
+ * to use multiple SKBs or just the same one.
+ * pktgen uses the installed interface's output routine.
+ *
+ * Additional hacking by:
+ *
+ * Jens.Laas@data.slu.se
+ * Improved by ANK. 010120.
+ * Improved by ANK even more. 010212.
+ * MAC address typo fixed. 010417 --ro
+ * Integrated.  020301 --DaveM
+ * Added multiskb option 020301 --DaveM
+ * Scaling of results. 020417--sigurdur@linpro.no
+ * Significant re-work of the module:
+ *   *  Convert to threaded model to more efficiently be able to transmit
+ *       and receive on multiple interfaces at once.
+ *   *  Converted many counters to __u64 to allow longer runs.
+ *   *  Allow configuration of ranges, like min/max IP address, MACs,
+ *       and UDP-ports, for both source and destination, and can
+ *       set to use a random distribution or sequentially walk the range.
+ *   *  Can now change most values after starting.
+ *   *  Place 12-byte packet in UDP payload with magic number,
+ *       sequence number, and timestamp.
+ *   *  Add receiver code that detects dropped pkts, re-ordered pkts, and
+ *       latencies (with micro-second) precision.
+ *   *  Add IOCTL interface to easily get counters & configuration.
+ *   --Ben Greear <greearb@candelatech.com>
+ *
+ * Renamed multiskb to clone_skb and cleaned up sending core for two distinct
+ * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0
+ * as a "fastpath" with a configurable number of clones after alloc's.
+ * clone_skb=0 means all packets are allocated this also means ranges time
+ * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100
+ * clones.
+ *
+ * Also moved to /proc/net/pktgen/
+ * --ro
+ *
+ * Sept 10:  Fixed threading/locking.  Lots of bone-headed and more clever
+ *    mistakes.  Also merged in DaveM's patch in the -pre6 patch.
+ * --Ben Greear <greearb@candelatech.com>
+ *
+ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
+ *
+ *
+ * 021124 Finished major redesign and rewrite for new functionality.
+ * See Documentation/networking/pktgen.txt for how to use this.
+ *
+ * The new operation:
+ * For each CPU one thread/process is created at start. This process checks
+ * for running devices in the if_list and sends packets until count is 0. The
+ * thread also checks thread->control, which is used for inter-process
+ * communication: the controlling process "posts" operations to the threads
+ * this way. The if_lock should be possible to remove when add/rem_device is
+ * merged into this too.
+ *
+ * By design there should only be *one* "controlling" process. In practice
+ * multiple concurrent write accesses give unpredictable results. Each "write"
+ * to /proc produces a result code that should be read back by the "writer".
+ * For practical use this should be no problem.
+ *
+ * Note when adding devices to a specific CPU it is a good idea to also assign
+ * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU.
+ * --ro
+ *
+ * Fix refcount off by one if first packet fails, potential null deref,
+ * memleak 030710- KJP
+ *
+ * First "ranges" functionality for ipv6 030726 --ro
+ *
+ * Included flow support. 030802 ANK.
+ *
+ * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org>
+ *
+ * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419
+ * ia64 compilation fix from  Aron Griffis <aron@hp.com> 040604
+ *
+ * New xmit() return, do_div and misc clean up by Stephen Hemminger
+ * <shemminger@osdl.org> 040923
+ *
+ * Randy Dunlap fixed u64 printk compiler warning
+ *
+ * Remove FCS from BW calculation.  Lennert Buytenhek <buytenh@wantstofly.org>
+ * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213
+ *
+ * Corrections from Nikolai Malykh (nmalykh@bilim.com)
+ * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230
+ *
+ * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com>
+ * 050103
+ *
+ * MPLS support by Steven Whitehouse <steve@chygwyn.com>
+ *
+ * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com>
+ *
+ * Fixed src_mac command to set source mac of packet to value specified in
+ * command by Adit Ranadive <adit.262@gmail.com>
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sys.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/unistd.h>
+#include <linux/string.h>
+#include <linux/ptrace.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/hrtimer.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/wait.h>
+#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <net/net_namespace.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#ifdef CONFIG_XFRM
+#include <net/xfrm.h>
+#endif
+#include <asm/byteorder.h>
+#include <linux/rcupdate.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/uaccess.h>
+#include <asm/dma.h>
+#include <asm/div64.h>		/* do_div */
+
+#define VERSION	"2.74"
+#define IP_NAME_SZ 32
+#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
+#define MPLS_STACK_BOTTOM htonl(0x00000100)
+
+#define func_enter() pr_debug("entering %s\n", __func__);
+
+/* Device flag bits */
+#define F_IPSRC_RND   (1<<0)	/* IP-Src Random  */
+#define F_IPDST_RND   (1<<1)	/* IP-Dst Random  */
+#define F_UDPSRC_RND  (1<<2)	/* UDP-Src Random */
+#define F_UDPDST_RND  (1<<3)	/* UDP-Dst Random */
+#define F_MACSRC_RND  (1<<4)	/* MAC-Src Random */
+#define F_MACDST_RND  (1<<5)	/* MAC-Dst Random */
+#define F_TXSIZE_RND  (1<<6)	/* Transmit size is random */
+#define F_IPV6        (1<<7)	/* Interface in IPV6 Mode */
+#define F_MPLS_RND    (1<<8)	/* Random MPLS labels */
+#define F_VID_RND     (1<<9)	/* Random VLAN ID */
+#define F_SVID_RND    (1<<10)	/* Random SVLAN ID */
+#define F_FLOW_SEQ    (1<<11)	/* Sequential flows */
+#define F_IPSEC_ON    (1<<12)	/* ipsec on for flows */
+#define F_QUEUE_MAP_RND (1<<13)	/* queue map Random */
+#define F_QUEUE_MAP_CPU (1<<14)	/* queue map mirrors smp_processor_id() */
+#define F_NODE          (1<<15)	/* Node memory alloc*/
+
+/* Thread control flag bits */
+#define T_STOP        (1<<0)	/* Stop run */
+#define T_RUN         (1<<1)	/* Start run */
+#define T_REMDEVALL   (1<<2)	/* Remove all devs */
+#define T_REMDEV      (1<<3)	/* Remove one dev */
+
+/*
+ * If lock -- can be removed after some work.
+ *
+ * if_lock(t)/if_unlock(t) take and release the if_lock spinlock of the
+ * thread pointed to by t.  Beware: each expansion already ends in ';' and
+ * 't' is not parenthesised inside the macro, so pass a plain identifier and
+ * do not use these as the sole body of an unbraced if/else.
+ */
+#define   if_lock(t)           spin_lock(&(t->if_lock));
+#define   if_unlock(t)           spin_unlock(&(t->if_lock));
+
+/* Used to help with determining the pkts on receive */
+#define PKTGEN_MAGIC 0xbe9be955
+#define PG_PROC_DIR "pktgen"
+#define PGCTRL	    "pgctrl"
+static struct proc_dir_entry *pg_proc_dir;
+
+#define MAX_CFLOWS  65536
+
+#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
+#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+
+/*
+ * State kept for one flow; struct pktgen_dev points at these through its
+ * 'flows' member.
+ */
+struct flow_state {
+	__be32 cur_daddr;	/* destination address (network byte order) */
+	int count;		/* presumably packets sent on this flow so far
+				 * -- confirm against the users of this field */
+#ifdef CONFIG_XFRM
+	struct xfrm_state *x;	/* xfrm (IPsec) state attached to the flow */
+#endif
+	__u32 flags;		/* flow flag bits, see F_INIT */
+};
+
+/* flow flag bits */
+#define F_INIT   (1<<0)		/* flow has been initialized */
+
+/*
+ * Configuration, runtime state and statistics for one pktgen device.
+ *
+ * Each instance belongs to one pktgen_thread (pg_thread) and is chained into
+ * that thread's device list through 'list'.  The "min"/"max" members describe
+ * ranges to iterate over or pick from; the "cur_" members hold the values
+ * currently in use.
+ */
+struct pktgen_dev {
+	/*
+	 * Try to keep frequent/infrequent used vars. separated.
+	 */
+	struct proc_dir_entry *entry;	/* proc file */
+	struct pktgen_thread *pg_thread;/* the owner */
+	struct list_head list;		/* chaining in the thread's run-queue */
+
+	int running;		/* if false, the test will stop */
+
+	/* If min != max, then we will either do a linear iteration, or
+	 * we will do a random selection from within the range.
+	 */
+	__u32 flags;		/* device flag bits (F_IPSRC_RND, ...) */
+	int removal_mark;	/* non-zero => the device is marked for
+				 * removal by worker thread */
+
+	int min_pkt_size;	/* = ETH_ZLEN; */
+	int max_pkt_size;	/* = ETH_ZLEN; */
+	int pkt_overhead;	/* overhead for MPLS, VLANs, IPSEC etc */
+	int nfrags;
+	struct page *page;
+	u64 delay;		/* nano-seconds */
+
+	__u64 count;		/* Default No packets to send */
+	__u64 sofar;		/* How many pkts we've sent so far */
+	__u64 tx_bytes;		/* How many bytes we've transmitted */
+	__u64 errors;		/* Errors when trying to transmit, */
+
+	/* runtime counters relating to clone_skb */
+
+	__u64 allocated_skbs;
+	__u32 clone_count;
+	int last_ok;		/* Was last skb sent?
+				 * Or a failed transmit of some sort?
+				 * This will keep sequence numbers in order
+				 */
+	ktime_t next_tx;
+	ktime_t started_at;
+	ktime_t stopped_at;
+	u64	idle_acc;	/* nano-seconds */
+
+	__u32 seq_num;
+
+	int clone_skb;		/*
+				 * Use multiple SKBs during packet gen.
+				 * If this number is greater than 1, then
+				 * that many copies of the same packet will be
+				 * sent before a new packet is allocated.
+				 * If you want to send 1024 identical packets
+				 * before creating a new packet,
+				 * set clone_skb to 1024.
+				 */
+
+	char dst_min[IP_NAME_SZ];	/* IP, ie 1.2.3.4 */
+	char dst_max[IP_NAME_SZ];	/* IP, ie 1.2.3.4 */
+	char src_min[IP_NAME_SZ];	/* IP, ie 1.2.3.4 */
+	char src_max[IP_NAME_SZ];	/* IP, ie 1.2.3.4 */
+
+	struct in6_addr in6_saddr;
+	struct in6_addr in6_daddr;
+	struct in6_addr cur_in6_daddr;
+	struct in6_addr cur_in6_saddr;
+	/* For ranges */
+	struct in6_addr min_in6_daddr;
+	struct in6_addr max_in6_daddr;
+	struct in6_addr min_in6_saddr;
+	struct in6_addr max_in6_saddr;
+
+	/* If we're doing ranges, random or incremental, then this
+	 * defines the min/max for those ranges.
+	 */
+	__be32 saddr_min;	/* inclusive, source IP address */
+	__be32 saddr_max;	/* exclusive, source IP address */
+	__be32 daddr_min;	/* inclusive, dest IP address */
+	__be32 daddr_max;	/* exclusive, dest IP address */
+
+	__u16 udp_src_min;	/* inclusive, source UDP port */
+	__u16 udp_src_max;	/* exclusive, source UDP port */
+	__u16 udp_dst_min;	/* inclusive, dest UDP port */
+	__u16 udp_dst_max;	/* exclusive, dest UDP port */
+
+	/* DSCP + ECN */
+	__u8 tos;            /* six MSB of (former) IPv4 TOS
+				are for dscp codepoint */
+	__u8 traffic_class;  /* ditto for the (former) Traffic Class in IPv6
+				(see RFC 3260, sec. 4) */
+
+	/* MPLS */
+	unsigned nr_labels;	/* Depth of stack, 0 = no MPLS */
+	__be32 labels[MAX_MPLS_LABELS];
+
+	/* VLAN/SVLAN (802.1Q/Q-in-Q) */
+	__u8  vlan_p;
+	__u8  vlan_cfi;
+	__u16 vlan_id;  /* 0xffff means no vlan tag */
+
+	__u8  svlan_p;
+	__u8  svlan_cfi;
+	__u16 svlan_id; /* 0xffff means no svlan tag */
+
+	__u32 src_mac_count;	/* How many MACs to iterate through */
+	__u32 dst_mac_count;	/* How many MACs to iterate through */
+
+	unsigned char dst_mac[ETH_ALEN];
+	unsigned char src_mac[ETH_ALEN];
+
+	__u32 cur_dst_mac_offset;
+	__u32 cur_src_mac_offset;
+	__be32 cur_saddr;
+	__be32 cur_daddr;
+	__u16 ip_id;
+	__u16 cur_udp_dst;
+	__u16 cur_udp_src;
+	__u16 cur_queue_map;
+	__u32 cur_pkt_size;
+	__u32 last_pkt_size;
+
+	__u8 hh[14];
+	/* = {
+	   0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+
+	   We fill in SRC address later
+	   0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	   0x08, 0x00
+	   };
+	 */
+	__u16 pad;		/* pad out the hh struct to an even 16 bytes */
+
+	struct sk_buff *skb;	/* skb we are to transmit next, used for when we
+				 * are transmitting the same one multiple times
+				 */
+	struct net_device *odev; /* The out-going device.
+				  * Note that the device should have its
+				  * pg_info pointer pointing back to this
+				  * device.
+				  * Set when the user specifies the out-going
+				  * device name (not when the inject is
+				  * started as it used to do.)
+				  */
+	char odevname[32];	/* name of the out-going device */
+	struct flow_state *flows;
+	unsigned cflows;	/* Concurrent flows (config) */
+	unsigned lflow;		/* Flow length  (config) */
+	unsigned nflows;	/* accumulated flows (stats) */
+	unsigned curfl;		/* current sequenced flow (state)*/
+
+	u16 queue_map_min;
+	u16 queue_map_max;
+	__u32 skb_priority;	/* skb priority field */
+	int node;               /* Memory node */
+
+#ifdef CONFIG_XFRM
+	__u8	ipsmode;		/* IPSEC mode (config) */
+	__u8	ipsproto;		/* IPSEC type (config) */
+#endif
+	char result[512];	/* text of the last command result, shown as
+				 * "Result:" in the device's proc file */
+};
+
+/*
+ * pktgen's own header: a magic number, a sequence number and a
+ * seconds/microseconds timestamp.  All four fields are big-endian 32-bit
+ * values.
+ */
+struct pktgen_hdr {
+	__be32 pgh_magic;	/* presumably PKTGEN_MAGIC -- confirm at the
+				 * place the header is filled in */
+	__be32 seq_num;
+	__be32 tv_sec;
+	__be32 tv_usec;
+};
+
+static bool pktgen_exiting __read_mostly;
+
+/*
+ * State of one pktgen worker thread.  Per the description at the top of this
+ * file, one such thread is created per CPU; it services the devices on its
+ * if_list and reacts to operations "posted" into 'control'.
+ */
+struct pktgen_thread {
+	spinlock_t if_lock;		/* for list of devices */
+	struct list_head if_list;	/* All device here */
+	struct list_head th_list;	/* list linkage of this thread */
+	struct task_struct *tsk;	/* task backing this thread */
+	char result[512];		/* text of the last result */
+
+	/* Field for thread to receive "posted" events terminate,
+	   stop ifs etc. */
+
+	u32 control;			/* thread control flag bits (T_STOP, ...) */
+	int cpu;
+
+	wait_queue_head_t queue;
+	struct completion start_done;
+};
+
+#define REMOVE 1
+#define FIND   0
+
+/* Sample the clock through ktime_get_ts() and hand it back as a ktime_t. */
+static inline ktime_t ktime_now(void)
+{
+	struct timespec now;
+
+	ktime_get_ts(&now);
+	return timespec_to_ktime(now);
+}
+
+/*
+ * Return non-zero when cmp1 is strictly earlier than cmp2.
+ * This works even if 32 bit because of careful byte order choice.
+ */
+static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2)
+{
+	return cmp2.tv64 > cmp1.tv64;
+}
+
+static const char version[] =
+	"Packet Generator for packet performance testing. "
+	"Version: " VERSION "\n";
+
+static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+					  const char *ifname, bool exact);
+static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
+static void pktgen_run_all_threads(void);
+static void pktgen_reset_all_threads(void);
+static void pktgen_stop_all_threads_ifs(void);
+
+static void pktgen_stop(struct pktgen_thread *t);
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+
+static unsigned int scan_ip6(const char *s, char ip[16]);
+
+/* Module parameters, defaults. */
+static int pg_count_d __read_mostly = 1000;
+static int pg_delay_d __read_mostly;
+static int pg_clone_skb_d  __read_mostly;
+static int debug  __read_mostly;
+
+static DEFINE_MUTEX(pktgen_thread_lock);
+static LIST_HEAD(pktgen_threads);
+
+static struct notifier_block pktgen_notifier_block = {
+	.notifier_call = pktgen_device_event,
+};
+
+/*
+ * /proc handling functions
+ *
+ */
+
+/* seq_file show callback of the control file: emit the version banner only. */
+static int pgctrl_show(struct seq_file *seq, void *v)
+{
+	seq_puts(seq, version);
+
+	return 0;
+}
+
+/*
+ * pgctrl_write - handle a command written to the pktgen control file.
+ * @file:  unused
+ * @buf:   user buffer holding the command text
+ * @count: number of bytes in @buf
+ * @ppos:  unused
+ *
+ * Recognised commands are "stop", "start" and "reset"; anything else is
+ * logged as unknown but still consumed.  At most sizeof(data) bytes are
+ * looked at, and the last byte copied is overwritten with the string
+ * terminator (normally this strips the trailing newline).
+ *
+ * Returns the number of bytes consumed, -EPERM without CAP_NET_ADMIN,
+ * -EINVAL for an empty write and -EFAULT if @buf cannot be read.
+ */
+static ssize_t pgctrl_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int err = 0;
+	char data[128];
+
+	if (!capable(CAP_NET_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	/*
+	 * With count == 0 the terminator below would be stored at
+	 * data[count - 1], i.e. outside the buffer, and data[] would then
+	 * be compared while still uninitialised.
+	 */
+	if (count == 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (count > sizeof(data))
+		count = sizeof(data);
+
+	if (copy_from_user(data, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+	data[count - 1] = 0;	/* Strip last byte (newline) and make string */
+
+	if (!strcmp(data, "stop"))
+		pktgen_stop_all_threads_ifs();
+
+	else if (!strcmp(data, "start"))
+		pktgen_run_all_threads();
+
+	else if (!strcmp(data, "reset"))
+		pktgen_reset_all_threads();
+
+	else
+		pr_warning("Unknown command: %s\n", data);
+
+	err = count;
+
+out:
+	return err;
+}
+
+/*
+ * Open handler of the control file: set up a single-record seq_file that is
+ * rendered by pgctrl_show(), passing along the proc entry's private data.
+ */
+static int pgctrl_open(struct inode *inode, struct file *file)
+{
+	void *priv = PDE(inode)->data;
+
+	return single_open(file, pgctrl_show, priv);
+}
+
+/*
+ * File operations of the pktgen control file: reading goes through the
+ * seq_file helpers (content produced by pgctrl_show() via pgctrl_open()),
+ * writing is handled by pgctrl_write().
+ */
+static const struct file_operations pktgen_fops = {
+	.owner   = THIS_MODULE,
+	.open    = pgctrl_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.write   = pgctrl_write,
+	.release = single_release,
+};
+
+/*
+ * pktgen_if_show - seq_file show callback for one pktgen device.
+ *
+ * Prints the pktgen_dev found in seq->private in three parts: the configured
+ * parameters ("Params:"), the current counters and state ("Current:") and
+ * the text of the last command result ("Result:").  Optional settings
+ * (skb_priority, MPLS labels, VLAN/SVLAN tags, tos, traffic_class, node) are
+ * only printed when set.  Always returns 0.
+ */
+static int pktgen_if_show(struct seq_file *seq, void *v)
+{
+	const struct pktgen_dev *pkt_dev = seq->private;
+	ktime_t stopped;
+	u64 idle;
+
+	seq_printf(seq,
+		   "Params: count %llu  min_pkt_size: %u  max_pkt_size: %u\n",
+		   (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
+		   pkt_dev->max_pkt_size);
+
+	seq_printf(seq,
+		   "     frags: %d  delay: %llu  clone_skb: %d  ifname: %s\n",
+		   pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
+		   pkt_dev->clone_skb, pkt_dev->odevname);
+
+	seq_printf(seq, "     flows: %u flowlen: %u\n", pkt_dev->cflows,
+		   pkt_dev->lflow);
+
+	seq_printf(seq,
+		   "     queue_map_min: %u  queue_map_max: %u\n",
+		   pkt_dev->queue_map_min,
+		   pkt_dev->queue_map_max);
+
+	if (pkt_dev->skb_priority)
+		seq_printf(seq, "     skb_priority: %u\n",
+			   pkt_dev->skb_priority);
+
+	/* addresses: binary in6_addr values for IPv6, the text form otherwise */
+	if (pkt_dev->flags & F_IPV6) {
+		seq_printf(seq,
+			   "     saddr: %pI6c  min_saddr: %pI6c  max_saddr: %pI6c\n"
+			   "     daddr: %pI6c  min_daddr: %pI6c  max_daddr: %pI6c\n",
+			   &pkt_dev->in6_saddr,
+			   &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr,
+			   &pkt_dev->in6_daddr,
+			   &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr);
+	} else {
+		seq_printf(seq,
+			   "     dst_min: %s  dst_max: %s\n",
+			   pkt_dev->dst_min, pkt_dev->dst_max);
+		seq_printf(seq,
+			   "        src_min: %s  src_max: %s\n",
+			   pkt_dev->src_min, pkt_dev->src_max);
+	}
+
+	seq_puts(seq, "     src_mac: ");
+
+	/* an all-zero src_mac means "not configured": show the device's own */
+	seq_printf(seq, "%pM ",
+		   is_zero_ether_addr(pkt_dev->src_mac) ?
+			     pkt_dev->odev->dev_addr : pkt_dev->src_mac);
+
+	seq_printf(seq, "dst_mac: ");
+	seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
+
+	seq_printf(seq,
+		   "     udp_src_min: %d  udp_src_max: %d"
+		   "  udp_dst_min: %d  udp_dst_max: %d\n",
+		   pkt_dev->udp_src_min, pkt_dev->udp_src_max,
+		   pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
+
+	seq_printf(seq,
+		   "     src_mac_count: %d  dst_mac_count: %d\n",
+		   pkt_dev->src_mac_count, pkt_dev->dst_mac_count);
+
+	/* MPLS label stack as comma separated hex words, newline after last */
+	if (pkt_dev->nr_labels) {
+		unsigned i;
+		seq_printf(seq, "     mpls: ");
+		for (i = 0; i < pkt_dev->nr_labels; i++)
+			seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]),
+				   i == pkt_dev->nr_labels-1 ? "\n" : ", ");
+	}
+
+	/* 0xffff marks "no tag" for both vlan_id and svlan_id */
+	if (pkt_dev->vlan_id != 0xffff)
+		seq_printf(seq, "     vlan_id: %u  vlan_p: %u  vlan_cfi: %u\n",
+			   pkt_dev->vlan_id, pkt_dev->vlan_p,
+			   pkt_dev->vlan_cfi);
+
+	/*
+	 * NOTE(review): the labels say "vlan_p"/"vlan_cfi" although the values
+	 * printed are svlan_p/svlan_cfi; left as is because the text is
+	 * user-visible output.
+	 */
+	if (pkt_dev->svlan_id != 0xffff)
+		seq_printf(seq, "     svlan_id: %u  vlan_p: %u  vlan_cfi: %u\n",
+			   pkt_dev->svlan_id, pkt_dev->svlan_p,
+			   pkt_dev->svlan_cfi);
+
+	if (pkt_dev->tos)
+		seq_printf(seq, "     tos: 0x%02x\n", pkt_dev->tos);
+
+	if (pkt_dev->traffic_class)
+		seq_printf(seq, "     traffic_class: 0x%02x\n", pkt_dev->traffic_class);
+
+	if (pkt_dev->node >= 0)
+		seq_printf(seq, "     node: %d\n", pkt_dev->node);
+
+	/* one word per flag bit that is set, all on a single line */
+	seq_printf(seq, "     Flags: ");
+
+	if (pkt_dev->flags & F_IPV6)
+		seq_printf(seq, "IPV6  ");
+
+	if (pkt_dev->flags & F_IPSRC_RND)
+		seq_printf(seq, "IPSRC_RND  ");
+
+	if (pkt_dev->flags & F_IPDST_RND)
+		seq_printf(seq, "IPDST_RND  ");
+
+	if (pkt_dev->flags & F_TXSIZE_RND)
+		seq_printf(seq, "TXSIZE_RND  ");
+
+	if (pkt_dev->flags & F_UDPSRC_RND)
+		seq_printf(seq, "UDPSRC_RND  ");
+
+	if (pkt_dev->flags & F_UDPDST_RND)
+		seq_printf(seq, "UDPDST_RND  ");
+
+	if (pkt_dev->flags & F_MPLS_RND)
+		seq_printf(seq,  "MPLS_RND  ");
+
+	if (pkt_dev->flags & F_QUEUE_MAP_RND)
+		seq_printf(seq,  "QUEUE_MAP_RND  ");
+
+	if (pkt_dev->flags & F_QUEUE_MAP_CPU)
+		seq_printf(seq,  "QUEUE_MAP_CPU  ");
+
+	/* the flow mode is only reported when flows are configured at all */
+	if (pkt_dev->cflows) {
+		if (pkt_dev->flags & F_FLOW_SEQ)
+			seq_printf(seq,  "FLOW_SEQ  "); /*in sequence flows*/
+		else
+			seq_printf(seq,  "FLOW_RND  ");
+	}
+
+#ifdef CONFIG_XFRM
+	if (pkt_dev->flags & F_IPSEC_ON)
+		seq_printf(seq,  "IPSEC  ");
+#endif
+
+	if (pkt_dev->flags & F_MACSRC_RND)
+		seq_printf(seq, "MACSRC_RND  ");
+
+	if (pkt_dev->flags & F_MACDST_RND)
+		seq_printf(seq, "MACDST_RND  ");
+
+	if (pkt_dev->flags & F_VID_RND)
+		seq_printf(seq, "VID_RND  ");
+
+	if (pkt_dev->flags & F_SVID_RND)
+		seq_printf(seq, "SVID_RND  ");
+
+	if (pkt_dev->flags & F_NODE)
+		seq_printf(seq, "NODE_ALLOC  ");
+
+	seq_puts(seq, "\n");
+
+	/* not really stopped, more like last-running-at */
+	stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at;
+	/* idle_acc is kept in nanoseconds; report it in microseconds */
+	idle = pkt_dev->idle_acc;
+	do_div(idle, NSEC_PER_USEC);
+
+	seq_printf(seq,
+		   "Current:\n     pkts-sofar: %llu  errors: %llu\n",
+		   (unsigned long long)pkt_dev->sofar,
+		   (unsigned long long)pkt_dev->errors);
+
+	seq_printf(seq,
+		   "     started: %lluus  stopped: %lluus idle: %lluus\n",
+		   (unsigned long long) ktime_to_us(pkt_dev->started_at),
+		   (unsigned long long) ktime_to_us(stopped),
+		   (unsigned long long) idle);
+
+	seq_printf(seq,
+		   "     seq_num: %d  cur_dst_mac_offset: %d  cur_src_mac_offset: %d\n",
+		   pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset,
+		   pkt_dev->cur_src_mac_offset);
+
+	/* the IPv4 addresses are printed as the raw __be32 value in hex */
+	if (pkt_dev->flags & F_IPV6) {
+		seq_printf(seq, "     cur_saddr: %pI6c  cur_daddr: %pI6c\n",
+				&pkt_dev->cur_in6_saddr,
+				&pkt_dev->cur_in6_daddr);
+	} else
+		seq_printf(seq, "     cur_saddr: 0x%x  cur_daddr: 0x%x\n",
+			   pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+
+	seq_printf(seq, "     cur_udp_dst: %d  cur_udp_src: %d\n",
+		   pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
+
+	seq_printf(seq, "     cur_queue_map: %u\n", pkt_dev->cur_queue_map);
+
+	seq_printf(seq, "     flows: %u\n", pkt_dev->nflows);
+
+	/* an empty result buffer is reported as "Idle" */
+	if (pkt_dev->result[0])
+		seq_printf(seq, "Result: %s\n", pkt_dev->result);
+	else
+		seq_printf(seq, "Result: Idle\n");
+
+	return 0;
+}
+
+
+/*
+ * hex32_arg - parse up to @maxlen hexadecimal digits from user space.
+ * @user_buffer: user pointer to the first character to parse
+ * @maxlen:      maximum number of characters to consume
+ * @num:         receives the parsed value; set to 0 when no digit is found
+ *
+ * Parsing stops at the first character that is not a hex digit, which is
+ * left unconsumed.  There is no overflow check: with more than 8 digits the
+ * most significant ones are shifted out of the 32-bit result.
+ *
+ * Returns the number of digits consumed (possibly 0) or -EFAULT if the user
+ * buffer cannot be read.
+ */
+static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
+		     __u32 *num)
+{
+	int i = 0;
+	*num = 0;
+
+	for (; i < maxlen; i++) {
+		int value;
+		char c;
+
+		if (get_user(c, &user_buffer[i]))
+			return -EFAULT;
+		value = hex_to_bin(c);
+		if (value < 0)
+			break;
+		/*
+		 * Make room for the new nibble only once it is known to be
+		 * valid; shifting up front would leave a value shorter than
+		 * maxlen digits multiplied by 16.
+		 */
+		*num = (*num << 4) | value;
+	}
+	return i;
+}
+
+/*
+ * Count the run of separator characters (double quote, newline, carriage
+ * return, tab, space, '=') at the start of @user_buffer, looking at no more
+ * than @maxlen characters.
+ *
+ * Returns the length of that run, or -EFAULT if the buffer cannot be read.
+ */
+static int count_trail_chars(const char __user * user_buffer,
+			     unsigned int maxlen)
+{
+	int pos = 0;
+
+	while (pos < maxlen) {
+		char ch;
+
+		if (get_user(ch, &user_buffer[pos]))
+			return -EFAULT;
+
+		if (ch != '\"' && ch != '\n' && ch != '\r' &&
+		    ch != '\t' && ch != ' ' && ch != '=')
+			break;
+
+		pos++;
+	}
+
+	return pos;
+}
+
+/*
+ * Parse up to @maxlen decimal digits from @user_buffer into *@num.
+ *
+ * Parsing stops at the first non-digit; *@num is 0 when there is no digit at
+ * all.  Overflow of *@num is not detected.
+ *
+ * Returns the number of digits consumed.  A read fault is reported as
+ * -EFAULT converted to the unsigned return type; the callers in this file
+ * store the result in an int and test it for < 0.
+ */
+static unsigned long num_arg(const char __user * user_buffer,
+			     unsigned long maxlen, unsigned long *num)
+{
+	int pos = 0;
+
+	*num = 0;
+	while (pos < maxlen) {
+		char ch;
+
+		if (get_user(ch, &user_buffer[pos]))
+			return -EFAULT;
+
+		if (ch < '0' || ch > '9')
+			break;
+
+		*num = *num * 10 + (ch - '0');
+		pos++;
+	}
+
+	return pos;
+}
+
+/*
+ * Length of the token at the start of @user_buffer: the number of characters
+ * before the first double quote, newline, carriage return, tab or space,
+ * capped at @maxlen.
+ *
+ * Returns that length, or -EFAULT if the buffer cannot be read.
+ */
+static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+{
+	int pos = 0;
+
+	while (pos < maxlen) {
+		char ch;
+
+		if (get_user(ch, &user_buffer[pos]))
+			return -EFAULT;
+
+		if (ch == '\"' || ch == '\n' || ch == '\r' ||
+		    ch == '\t' || ch == ' ')
+			break;
+
+		pos++;
+	}
+
+	return pos;
+}
+
+/*
+ * get_labels - parse a comma separated list of hexadecimal MPLS labels.
+ * @buffer:  user pointer to the first label
+ * @pkt_dev: device whose labels[] / nr_labels are filled in
+ *
+ * Each label is read with hex32_arg() (at most 8 digits) and stored in
+ * network byte order.  A label with the MPLS_STACK_BOTTOM bit set switches
+ * F_MPLS_RND on for the device.  nr_labels is reset to 0 up front and only
+ * updated once the whole list has been parsed, so any early return leaves
+ * the device without labels.
+ *
+ * Note that the character following each label is read from @buffer even
+ * for the last label, so the input must extend at least one character past
+ * the final label.
+ *
+ * Returns the number of characters consumed (including the one character
+ * after the last label), the non-positive result of hex32_arg() when a
+ * label cannot be parsed, -EFAULT on a read fault, or -E2BIG when more than
+ * MAX_MPLS_LABELS labels are given.
+ */
+static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+{
+	unsigned n = 0;
+	char c;
+	ssize_t i = 0;
+	int len;
+
+	pkt_dev->nr_labels = 0;
+	do {
+		__u32 tmp;
+
+		/*
+		 * labels[] has room for MAX_MPLS_LABELS entries.  Check
+		 * before parsing the next label rather than after storing
+		 * one, so that a list of exactly MAX_MPLS_LABELS is accepted.
+		 */
+		if (n >= MAX_MPLS_LABELS)
+			return -E2BIG;
+
+		len = hex32_arg(&buffer[i], 8, &tmp);
+		if (len <= 0)
+			return len;
+		pkt_dev->labels[n] = htonl(tmp);
+		if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
+			pkt_dev->flags |= F_MPLS_RND;
+		i += len;
+		if (get_user(c, &buffer[i]))
+			return -EFAULT;
+		i++;
+		n++;
+	} while (c == ',');
+
+	pkt_dev->nr_labels = n;
+	return i;
+}
+
+static ssize_t pktgen_if_write(struct file *file,
+			       const char __user * user_buffer, size_t count,
+			       loff_t * offset)
+{
+	struct seq_file *seq = file->private_data;
+	struct pktgen_dev *pkt_dev = seq->private;
+	int i, max, len;
+	char name[16], valstr[32];
+	unsigned long value = 0;
+	char *pg_result = NULL;
+	int tmp = 0;
+	char buf[128];
+
+	pg_result = &(pkt_dev->result[0]);
+
+	if (count < 1) {
+		pr_warning("wrong command format\n");
+		return -EINVAL;
+	}
+
+	max = count;
+	tmp = count_trail_chars(user_buffer, max);
+	if (tmp < 0) {
+		pr_warning("illegal format\n");
+		return tmp;
+	}
+	i = tmp;
+
+	/* Read variable name */
+
+	len = strn_len(&user_buffer[i], sizeof(name) - 1);
+	if (len < 0)
+		return len;
+
+	memset(name, 0, sizeof(name));
+	if (copy_from_user(name, &user_buffer[i], len))
+		return -EFAULT;
+	i += len;
+
+	max = count - i;
+	len = count_trail_chars(&user_buffer[i], max);
+	if (len < 0)
+		return len;
+
+	i += len;
+
+	if (debug) {
+		size_t copy = min_t(size_t, count, 1023);
+		char tb[copy + 1];
+		if (copy_from_user(tb, user_buffer, copy))
+			return -EFAULT;
+		tb[copy] = 0;
+		printk(KERN_DEBUG "pktgen: %s,%lu  buffer -:%s:-\n", name,
+		       (unsigned long)count, tb);
+	}
+
+	if (!strcmp(name, "min_pkt_size")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value < 14 + 20 + 8)
+			value = 14 + 20 + 8;
+		if (value != pkt_dev->min_pkt_size) {
+			pkt_dev->min_pkt_size = value;
+			pkt_dev->cur_pkt_size = value;
+		}
+		sprintf(pg_result, "OK: min_pkt_size=%u",
+			pkt_dev->min_pkt_size);
+		return count;
+	}
+
+	if (!strcmp(name, "max_pkt_size")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value < 14 + 20 + 8)
+			value = 14 + 20 + 8;
+		if (value != pkt_dev->max_pkt_size) {
+			pkt_dev->max_pkt_size = value;
+			pkt_dev->cur_pkt_size = value;
+		}
+		sprintf(pg_result, "OK: max_pkt_size=%u",
+			pkt_dev->max_pkt_size);
+		return count;
+	}
+
+	/* Shortcut for min = max */
+
+	if (!strcmp(name, "pkt_size")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value < 14 + 20 + 8)
+			value = 14 + 20 + 8;
+		if (value != pkt_dev->min_pkt_size) {
+			pkt_dev->min_pkt_size = value;
+			pkt_dev->max_pkt_size = value;
+			pkt_dev->cur_pkt_size = value;
+		}
+		sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+		return count;
+	}
+
+	if (!strcmp(name, "debug")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		debug = value;
+		sprintf(pg_result, "OK: debug=%u", debug);
+		return count;
+	}
+
+	if (!strcmp(name, "frags")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		pkt_dev->nfrags = value;
+		sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+		return count;
+	}
+	if (!strcmp(name, "delay")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value == 0x7FFFFFFF)
+			pkt_dev->delay = ULLONG_MAX;
+		else
+			pkt_dev->delay = (u64)value;
+
+		sprintf(pg_result, "OK: delay=%llu",
+			(unsigned long long) pkt_dev->delay);
+		return count;
+	}
+	if (!strcmp(name, "rate")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (!value)
+			return len;
+		pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
+		if (debug)
+			pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+		sprintf(pg_result, "OK: rate=%lu", value);
+		return count;
+	}
+	if (!strcmp(name, "ratep")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (!value)
+			return len;
+		pkt_dev->delay = NSEC_PER_SEC/value;
+		if (debug)
+			pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+		sprintf(pg_result, "OK: rate=%lu", value);
+		return count;
+	}
+	if (!strcmp(name, "udp_src_min")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value != pkt_dev->udp_src_min) {
+			pkt_dev->udp_src_min = value;
+			pkt_dev->cur_udp_src = value;
+		}
+		sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min);
+		return count;
+	}
+	if (!strcmp(name, "udp_dst_min")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value != pkt_dev->udp_dst_min) {
+			pkt_dev->udp_dst_min = value;
+			pkt_dev->cur_udp_dst = value;
+		}
+		sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min);
+		return count;
+	}
+	if (!strcmp(name, "udp_src_max")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value != pkt_dev->udp_src_max) {
+			pkt_dev->udp_src_max = value;
+			pkt_dev->cur_udp_src = value;
+		}
+		sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max);
+		return count;
+	}
+	if (!strcmp(name, "udp_dst_max")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value != pkt_dev->udp_dst_max) {
+			pkt_dev->udp_dst_max = value;
+			pkt_dev->cur_udp_dst = value;
+		}
+		sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max);
+		return count;
+	}
+	if (!strcmp(name, "clone_skb")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+		if ((value > 0) &&
+		    (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+			return -ENOTSUPP;
+		i += len;
+		pkt_dev->clone_skb = value;
+
+		sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
+		return count;
+	}
+	if (!strcmp(name, "count")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		pkt_dev->count = value;
+		sprintf(pg_result, "OK: count=%llu",
+			(unsigned long long)pkt_dev->count);
+		return count;
+	}
+	if (!strcmp(name, "src_mac_count")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (pkt_dev->src_mac_count != value) {
+			pkt_dev->src_mac_count = value;
+			pkt_dev->cur_src_mac_offset = 0;
+		}
+		sprintf(pg_result, "OK: src_mac_count=%d",
+			pkt_dev->src_mac_count);
+		return count;
+	}
+	if (!strcmp(name, "dst_mac_count")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (pkt_dev->dst_mac_count != value) {
+			pkt_dev->dst_mac_count = value;
+			pkt_dev->cur_dst_mac_offset = 0;
+		}
+		sprintf(pg_result, "OK: dst_mac_count=%d",
+			pkt_dev->dst_mac_count);
+		return count;
+	}
+	if (!strcmp(name, "node")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+
+		if (node_possible(value)) {
+			pkt_dev->node = value;
+			sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+			if (pkt_dev->page) {
+				put_page(pkt_dev->page);
+				pkt_dev->page = NULL;
+			}
+		}
+		else
+			sprintf(pg_result, "ERROR: node not possible");
+		return count;
+	}
+	if (!strcmp(name, "flag")) {
+		char f[32];
+		memset(f, 0, 32);
+		len = strn_len(&user_buffer[i], sizeof(f) - 1);
+		if (len < 0)
+			return len;
+
+		if (copy_from_user(f, &user_buffer[i], len))
+			return -EFAULT;
+		i += len;
+		if (strcmp(f, "IPSRC_RND") == 0)
+			pkt_dev->flags |= F_IPSRC_RND;
+
+		else if (strcmp(f, "!IPSRC_RND") == 0)
+			pkt_dev->flags &= ~F_IPSRC_RND;
+
+		else if (strcmp(f, "TXSIZE_RND") == 0)
+			pkt_dev->flags |= F_TXSIZE_RND;
+
+		else if (strcmp(f, "!TXSIZE_RND") == 0)
+			pkt_dev->flags &= ~F_TXSIZE_RND;
+
+		else if (strcmp(f, "IPDST_RND") == 0)
+			pkt_dev->flags |= F_IPDST_RND;
+
+		else if (strcmp(f, "!IPDST_RND") == 0)
+			pkt_dev->flags &= ~F_IPDST_RND;
+
+		else if (strcmp(f, "UDPSRC_RND") == 0)
+			pkt_dev->flags |= F_UDPSRC_RND;
+
+		else if (strcmp(f, "!UDPSRC_RND") == 0)
+			pkt_dev->flags &= ~F_UDPSRC_RND;
+
+		else if (strcmp(f, "UDPDST_RND") == 0)
+			pkt_dev->flags |= F_UDPDST_RND;
+
+		else if (strcmp(f, "!UDPDST_RND") == 0)
+			pkt_dev->flags &= ~F_UDPDST_RND;
+
+		else if (strcmp(f, "MACSRC_RND") == 0)
+			pkt_dev->flags |= F_MACSRC_RND;
+
+		else if (strcmp(f, "!MACSRC_RND") == 0)
+			pkt_dev->flags &= ~F_MACSRC_RND;
+
+		else if (strcmp(f, "MACDST_RND") == 0)
+			pkt_dev->flags |= F_MACDST_RND;
+
+		else if (strcmp(f, "!MACDST_RND") == 0)
+			pkt_dev->flags &= ~F_MACDST_RND;
+
+		else if (strcmp(f, "MPLS_RND") == 0)
+			pkt_dev->flags |= F_MPLS_RND;
+
+		else if (strcmp(f, "!MPLS_RND") == 0)
+			pkt_dev->flags &= ~F_MPLS_RND;
+
+		else if (strcmp(f, "VID_RND") == 0)
+			pkt_dev->flags |= F_VID_RND;
+
+		else if (strcmp(f, "!VID_RND") == 0)
+			pkt_dev->flags &= ~F_VID_RND;
+
+		else if (strcmp(f, "SVID_RND") == 0)
+			pkt_dev->flags |= F_SVID_RND;
+
+		else if (strcmp(f, "!SVID_RND") == 0)
+			pkt_dev->flags &= ~F_SVID_RND;
+
+		else if (strcmp(f, "FLOW_SEQ") == 0)
+			pkt_dev->flags |= F_FLOW_SEQ;
+
+		else if (strcmp(f, "QUEUE_MAP_RND") == 0)
+			pkt_dev->flags |= F_QUEUE_MAP_RND;
+
+		else if (strcmp(f, "!QUEUE_MAP_RND") == 0)
+			pkt_dev->flags &= ~F_QUEUE_MAP_RND;
+
+		else if (strcmp(f, "QUEUE_MAP_CPU") == 0)
+			pkt_dev->flags |= F_QUEUE_MAP_CPU;
+
+		else if (strcmp(f, "!QUEUE_MAP_CPU") == 0)
+			pkt_dev->flags &= ~F_QUEUE_MAP_CPU;
+#ifdef CONFIG_XFRM
+		else if (strcmp(f, "IPSEC") == 0)
+			pkt_dev->flags |= F_IPSEC_ON;
+#endif
+
+		else if (strcmp(f, "!IPV6") == 0)
+			pkt_dev->flags &= ~F_IPV6;
+
+		else if (strcmp(f, "NODE_ALLOC") == 0)
+			pkt_dev->flags |= F_NODE;
+
+		else if (strcmp(f, "!NODE_ALLOC") == 0)
+			pkt_dev->flags &= ~F_NODE;
+
+		else {
+			sprintf(pg_result,
+				"Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
+				f,
+				"IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
+				"MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n");
+			return count;
+		}
+		sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+		return count;
+	}
+	if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
+		len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+		if (len < 0)
+			return len;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+		if (strcmp(buf, pkt_dev->dst_min) != 0) {
+			memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
+			strncpy(pkt_dev->dst_min, buf, len);
+			pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+			pkt_dev->cur_daddr = pkt_dev->daddr_min;
+		}
+		if (debug)
+			printk(KERN_DEBUG "pktgen: dst_min set to: %s\n",
+			       pkt_dev->dst_min);
+		i += len;
+		sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
+		return count;
+	}
+	if (!strcmp(name, "dst_max")) {
+		len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+		if (len < 0)
+			return len;
+
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+
+		buf[len] = 0;
+		if (strcmp(buf, pkt_dev->dst_max) != 0) {
+			memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
+			strncpy(pkt_dev->dst_max, buf, len);
+			pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+			pkt_dev->cur_daddr = pkt_dev->daddr_max;
+		}
+		if (debug)
+			printk(KERN_DEBUG "pktgen: dst_max set to: %s\n",
+			       pkt_dev->dst_max);
+		i += len;
+		sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
+		return count;
+	}
+	if (!strcmp(name, "dst6")) {
+		len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+		if (len < 0)
+			return len;
+
+		pkt_dev->flags |= F_IPV6;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+
+		scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
+
+		ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+
+		if (debug)
+			printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf);
+
+		i += len;
+		sprintf(pg_result, "OK: dst6=%s", buf);
+		return count;
+	}
+	if (!strcmp(name, "dst6_min")) {
+		len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+		if (len < 0)
+			return len;
+
+		pkt_dev->flags |= F_IPV6;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+
+		scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
+
+		ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
+			       &pkt_dev->min_in6_daddr);
+		if (debug)
+			printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf);
+
+		i += len;
+		sprintf(pg_result, "OK: dst6_min=%s", buf);
+		return count;
+	}
+	if (!strcmp(name, "dst6_max")) {
+		len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+		if (len < 0)
+			return len;
+
+		pkt_dev->flags |= F_IPV6;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+
+		scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
+
+		if (debug)
+			printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf);
+
+		i += len;
+		sprintf(pg_result, "OK: dst6_max=%s", buf);
+		return count;
+	}
+	if (!strcmp(name, "src6")) {
+		len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+		if (len < 0)
+			return len;
+
+		pkt_dev->flags |= F_IPV6;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+
+		scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
+
+		ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+
+		if (debug)
+			printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf);
+
+		i += len;
+		sprintf(pg_result, "OK: src6=%s", buf);
+		return count;
+	}
+	if (!strcmp(name, "src_min")) {
+		len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+		if (len < 0)
+			return len;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+		if (strcmp(buf, pkt_dev->src_min) != 0) {
+			memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
+			strncpy(pkt_dev->src_min, buf, len);
+			pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+			pkt_dev->cur_saddr = pkt_dev->saddr_min;
+		}
+		if (debug)
+			printk(KERN_DEBUG "pktgen: src_min set to: %s\n",
+			       pkt_dev->src_min);
+		i += len;
+		sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
+		return count;
+	}
+	if (!strcmp(name, "src_max")) {
+		len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+		if (len < 0)
+			return len;
+
+		if (copy_from_user(buf, &user_buffer[i], len))
+			return -EFAULT;
+		buf[len] = 0;
+		if (strcmp(buf, pkt_dev->src_max) != 0) {
+			memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
+			strncpy(pkt_dev->src_max, buf, len);
+			pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+			pkt_dev->cur_saddr = pkt_dev->saddr_max;
+		}
+		if (debug)
+			printk(KERN_DEBUG "pktgen: src_max set to: %s\n",
+			       pkt_dev->src_max);
+		i += len;
+		sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
+		return count;
+	}
+	if (!strcmp(name, "dst_mac")) {
+		len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+		if (len < 0)
+			return len;
+
+		memset(valstr, 0, sizeof(valstr));
+		if (copy_from_user(valstr, &user_buffer[i], len))
+			return -EFAULT;
+
+		if (!mac_pton(valstr, pkt_dev->dst_mac))
+			return -EINVAL;
+		/* Set up Dest MAC */
+		memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN);
+
+		sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac);
+		return count;
+	}
+	if (!strcmp(name, "src_mac")) {
+		len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+		if (len < 0)
+			return len;
+
+		memset(valstr, 0, sizeof(valstr));
+		if (copy_from_user(valstr, &user_buffer[i], len))
+			return -EFAULT;
+
+		if (!mac_pton(valstr, pkt_dev->src_mac))
+			return -EINVAL;
+		/* Set up Src MAC */
+		memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN);
+
+		sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac);
+		return count;
+	}
+
+	if (!strcmp(name, "clear_counters")) {
+		pktgen_clear_counters(pkt_dev);
+		sprintf(pg_result, "OK: Clearing counters.\n");
+		return count;
+	}
+
+	if (!strcmp(name, "flows")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value > MAX_CFLOWS)
+			value = MAX_CFLOWS;
+
+		pkt_dev->cflows = value;
+		sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows);
+		return count;
+	}
+
+	if (!strcmp(name, "flowlen")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		pkt_dev->lflow = value;
+		sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
+		return count;
+	}
+
+	if (!strcmp(name, "queue_map_min")) {
+		len = num_arg(&user_buffer[i], 5, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		pkt_dev->queue_map_min = value;
+		sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
+		return count;
+	}
+
+	if (!strcmp(name, "queue_map_max")) {
+		len = num_arg(&user_buffer[i], 5, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		pkt_dev->queue_map_max = value;
+		sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
+		return count;
+	}
+
+	if (!strcmp(name, "mpls")) {
+		unsigned n, cnt;
+
+		len = get_labels(&user_buffer[i], pkt_dev);
+		if (len < 0)
+			return len;
+		i += len;
+		cnt = sprintf(pg_result, "OK: mpls=");
+		for (n = 0; n < pkt_dev->nr_labels; n++)
+			cnt += sprintf(pg_result + cnt,
+				       "%08x%s", ntohl(pkt_dev->labels[n]),
+				       n == pkt_dev->nr_labels-1 ? "" : ",");
+
+		if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) {
+			pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+			pkt_dev->svlan_id = 0xffff;
+
+			if (debug)
+				printk(KERN_DEBUG "pktgen: VLAN/SVLAN auto turned off\n");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "vlan_id")) {
+		len = num_arg(&user_buffer[i], 4, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (value <= 4095) {
+			pkt_dev->vlan_id = value;  /* turn on VLAN */
+
+			if (debug)
+				printk(KERN_DEBUG "pktgen: VLAN turned on\n");
+
+			if (debug && pkt_dev->nr_labels)
+				printk(KERN_DEBUG "pktgen: MPLS auto turned off\n");
+
+			pkt_dev->nr_labels = 0;    /* turn off MPLS */
+			sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id);
+		} else {
+			pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+			pkt_dev->svlan_id = 0xffff;
+
+			if (debug)
+				printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "vlan_p")) {
+		len = num_arg(&user_buffer[i], 1, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
+			pkt_dev->vlan_p = value;
+			sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
+		} else {
+			sprintf(pg_result, "ERROR: vlan_p must be 0-7");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "vlan_cfi")) {
+		len = num_arg(&user_buffer[i], 1, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
+			pkt_dev->vlan_cfi = value;
+			sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
+		} else {
+			sprintf(pg_result, "ERROR: vlan_cfi must be 0-1");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "svlan_id")) {
+		len = num_arg(&user_buffer[i], 4, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
+			pkt_dev->svlan_id = value;  /* turn on SVLAN */
+
+			if (debug)
+				printk(KERN_DEBUG "pktgen: SVLAN turned on\n");
+
+			if (debug && pkt_dev->nr_labels)
+				printk(KERN_DEBUG "pktgen: MPLS auto turned off\n");
+
+			pkt_dev->nr_labels = 0;    /* turn off MPLS */
+			sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id);
+		} else {
+			pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */
+			pkt_dev->svlan_id = 0xffff;
+
+			if (debug)
+				printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "svlan_p")) {
+		len = num_arg(&user_buffer[i], 1, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
+			pkt_dev->svlan_p = value;
+			sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
+		} else {
+			sprintf(pg_result, "ERROR: svlan_p must be 0-7");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "svlan_cfi")) {
+		len = num_arg(&user_buffer[i], 1, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
+			pkt_dev->svlan_cfi = value;
+			sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
+		} else {
+			sprintf(pg_result, "ERROR: svlan_cfi must be 0-1");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "tos")) {
+		__u32 tmp_value = 0;
+		len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (len == 2) {
+			pkt_dev->tos = tmp_value;
+			sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
+		} else {
+			sprintf(pg_result, "ERROR: tos must be 00-ff");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "traffic_class")) {
+		__u32 tmp_value = 0;
+		len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (len == 2) {
+			pkt_dev->traffic_class = tmp_value;
+			sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
+		} else {
+			sprintf(pg_result, "ERROR: traffic_class must be 00-ff");
+		}
+		return count;
+	}
+
+	if (!strcmp(name, "skb_priority")) {
+		len = num_arg(&user_buffer[i], 9, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		pkt_dev->skb_priority = value;
+		sprintf(pg_result, "OK: skb_priority=%i",
+			pkt_dev->skb_priority);
+		return count;
+	}
+
+	sprintf(pkt_dev->result, "No such parameter \"%s\"", name);
+	return -EINVAL;
+}
+
+/* open() handler for a pktgen interface proc file: hands the private data
+ * stored in the proc entry to the seq_file layer, with pktgen_if_show() as
+ * the single "show" routine.
+ */
+static int pktgen_if_open(struct inode *inode, struct file *file)
+{
+	void *priv = PDE(inode)->data;
+
+	return single_open(file, pktgen_if_show, priv);
+}
+
+/* File operations of a pktgen interface proc file: reading goes through
+ * the seq_file helpers, writing is handled by pktgen_if_write().
+ */
+static const struct file_operations pktgen_if_fops = {
+	.owner   = THIS_MODULE,
+	/* lifetime */
+	.open    = pktgen_if_open,
+	.release = single_release,
+	/* data path */
+	.read    = seq_read,
+	.write   = pktgen_if_write,
+	.llseek  = seq_lseek,
+};
+
+/* seq_file "show" routine of a pktgen thread proc file.
+ *
+ * Prints the names of the thread's running devices, then of its stopped
+ * devices, then the thread's result string ("NA" when it is empty).  The
+ * device list is walked between if_lock() and if_unlock().
+ */
+static int pktgen_thread_show(struct seq_file *seq, void *v)
+{
+	struct pktgen_thread *t = seq->private;
+	const struct pktgen_dev *dev;
+
+	BUG_ON(!t);
+
+	seq_printf(seq, "Running: ");
+
+	if_lock(t);
+
+	list_for_each_entry(dev, &t->if_list, list) {
+		if (!dev->running)
+			continue;
+		seq_printf(seq, "%s ", dev->odevname);
+	}
+
+	seq_printf(seq, "\nStopped: ");
+
+	list_for_each_entry(dev, &t->if_list, list) {
+		if (dev->running)
+			continue;
+		seq_printf(seq, "%s ", dev->odevname);
+	}
+
+	if (!t->result[0])
+		seq_printf(seq, "\nResult: NA\n");
+	else
+		seq_printf(seq, "\nResult: %s\n", t->result);
+
+	if_unlock(t);
+
+	return 0;
+}
+
+/* write() handler of a pktgen thread proc file.
+ *
+ * The buffer holds a command name, optionally followed by one argument.
+ * Commands handled here:
+ *   add_device <name>   - add a device to this thread
+ *   rem_device_all      - set T_REMDEVALL in the thread's control word
+ *   max_before_softirq  - obsolete, accepted without effect
+ *
+ * A status string is stored in t->result (printed by pktgen_thread_show()).
+ * Returns @count on success or a negative errno on failure; unknown
+ * commands yield -EINVAL.
+ */
+static ssize_t pktgen_thread_write(struct file *file,
+				   const char __user * user_buffer,
+				   size_t count, loff_t * offset)
+{
+	struct seq_file *seq = file->private_data;
+	struct pktgen_thread *t = seq->private;
+	int i, max, len, ret;
+	char name[40];
+	char *pg_result;
+
+	if (count < 1)
+		return -EINVAL;
+
+	max = count;
+	len = count_trail_chars(user_buffer, max);
+	if (len < 0)
+		return len;
+
+	i = len;
+
+	/* Read variable name */
+
+	len = strn_len(&user_buffer[i], sizeof(name) - 1);
+	if (len < 0)
+		return len;
+
+	/* name is zeroed first so the copied bytes are always terminated */
+	memset(name, 0, sizeof(name));
+	if (copy_from_user(name, &user_buffer[i], len))
+		return -EFAULT;
+	i += len;
+
+	max = count - i;
+	len = count_trail_chars(&user_buffer[i], max);
+	if (len < 0)
+		return len;
+
+	i += len;
+
+	if (debug)
+		printk(KERN_DEBUG "pktgen: t=%s, count=%lu\n",
+		       name, (unsigned long)count);
+
+	if (!t) {
+		pr_err("ERROR: No thread\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pg_result = &(t->result[0]);
+
+	if (!strcmp(name, "add_device")) {
+		char f[32];
+		memset(f, 0, 32);
+		len = strn_len(&user_buffer[i], sizeof(f) - 1);
+		if (len < 0) {
+			ret = len;
+			goto out;
+		}
+		if (copy_from_user(f, &user_buffer[i], len))
+			return -EFAULT;
+		i += len;
+		mutex_lock(&pktgen_thread_lock);
+		ret = pktgen_add_device(t, f);
+		mutex_unlock(&pktgen_thread_lock);
+		/* Report a failed add instead of unconditionally claiming
+		 * success.  NOTE(review): relies on pktgen_add_device()
+		 * (defined elsewhere in this file) returning 0 on success
+		 * and a negative errno on failure - confirm.
+		 */
+		if (!ret) {
+			ret = count;
+			sprintf(pg_result, "OK: add_device=%s", f);
+		} else {
+			sprintf(pg_result, "ERROR: can not add device %s", f);
+		}
+		goto out;
+	}
+
+	if (!strcmp(name, "rem_device_all")) {
+		mutex_lock(&pktgen_thread_lock);
+		t->control |= T_REMDEVALL;
+		mutex_unlock(&pktgen_thread_lock);
+		schedule_timeout_interruptible(msecs_to_jiffies(125));	/* Propagate thread->control  */
+		ret = count;
+		sprintf(pg_result, "OK: rem_device_all");
+		goto out;
+	}
+
+	if (!strcmp(name, "max_before_softirq")) {
+		sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use");
+		ret = count;
+		goto out;
+	}
+
+	ret = -EINVAL;
+out:
+	return ret;
+}
+
+/* open() handler for a pktgen thread proc file: hands the private data
+ * stored in the proc entry to the seq_file layer, with
+ * pktgen_thread_show() as the single "show" routine.
+ */
+static int pktgen_thread_open(struct inode *inode, struct file *file)
+{
+	void *priv = PDE(inode)->data;
+
+	return single_open(file, pktgen_thread_show, priv);
+}
+
+/* File operations of a pktgen thread proc file: reading goes through the
+ * seq_file helpers, writing is handled by pktgen_thread_write().
+ */
+static const struct file_operations pktgen_thread_fops = {
+	.owner   = THIS_MODULE,
+	/* lifetime */
+	.open    = pktgen_thread_open,
+	.release = single_release,
+	/* data path */
+	.read    = seq_read,
+	.write   = pktgen_thread_write,
+	.llseek  = seq_lseek,
+};
+
+/* Look through every pktgen thread for a device matching @ifname and stop
+ * at the first hit.  When @remove is non-zero the hit is flagged with
+ * removal_mark and T_REMDEV is set in its thread's control word, both under
+ * if_lock().  An exact match is requested from pktgen_find_dev() only when
+ * @remove equals FIND.  Returns the matching device, or NULL.
+ */
+static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove)
+{
+	const bool exact = (remove == FIND);
+	struct pktgen_dev *found = NULL;
+	struct pktgen_thread *t;
+
+	list_for_each_entry(t, &pktgen_threads, th_list) {
+		found = pktgen_find_dev(t, ifname, exact);
+		if (!found)
+			continue;
+
+		if (remove) {
+			if_lock(t);
+			found->removal_mark = 1;
+			t->control |= T_REMDEV;
+			if_unlock(t);
+		}
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * Mark the device(s) matching @ifname for removal and wait for them to go
+ * away.
+ *
+ * Each round marks one match; the function is done once no match is left.
+ * Between rounds pktgen_thread_lock is dropped and the caller sleeps
+ * msec_per_try milliseconds.  After max_tries rounds an error is logged and
+ * the wait is abandoned.
+ */
+static void pktgen_mark_device(const char *ifname)
+{
+	const int max_tries = 10, msec_per_try = 125;
+	int tries = 0;
+
+	mutex_lock(&pktgen_thread_lock);
+	pr_debug("%s: marking %s for removal\n", __func__, ifname);
+
+	for (;;) {
+		if (!__pktgen_NN_threads(ifname, REMOVE))
+			break;	/* success */
+
+		mutex_unlock(&pktgen_thread_lock);
+		pr_debug("%s: waiting for %s to disappear....\n",
+			 __func__, ifname);
+		schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
+		mutex_lock(&pktgen_thread_lock);
+
+		tries++;
+		if (tries < max_tries)
+			continue;
+
+		pr_err("%s: timed out after waiting %d msec for device %s to be removed\n",
+		       __func__, msec_per_try * tries, ifname);
+		break;
+	}
+
+	mutex_unlock(&pktgen_thread_lock);
+}
+
+/* Move the proc entry of the pktgen device bound to @dev to the
+ * net_device's current name: the old entry is removed and a new one is
+ * created under dev->name.  Within each thread the walk stops at the first
+ * device whose odev is @dev.
+ *
+ * If creating the new entry fails, an error is logged and pkt_dev->entry is
+ * left NULL.
+ */
+static void pktgen_change_name(struct net_device *dev)
+{
+	struct pktgen_thread *t;
+
+	list_for_each_entry(t, &pktgen_threads, th_list) {
+		struct pktgen_dev *pkt_dev;
+
+		list_for_each_entry(pkt_dev, &t->if_list, list) {
+			if (pkt_dev->odev != dev)
+				continue;
+
+			/* entry is NULL when an earlier rename failed to
+			 * re-create it; there is nothing to remove then and
+			 * dereferencing it would oops.
+			 */
+			if (pkt_dev->entry)
+				remove_proc_entry(pkt_dev->entry->name,
+						  pg_proc_dir);
+
+			pkt_dev->entry = proc_create_data(dev->name, 0600,
+							  pg_proc_dir,
+							  &pktgen_if_fops,
+							  pkt_dev);
+			if (!pkt_dev->entry)
+				pr_err("can't move proc entry for '%s'\n",
+				       dev->name);
+			break;
+		}
+	}
+}
+
+/* Netdevice notifier callback.
+ *
+ * Events for devices outside init_net, and all events while pktgen_exiting
+ * is set, are ignored.  NETDEV_CHANGENAME moves the device's proc entry,
+ * NETDEV_UNREGISTER marks the device for removal.  Always returns
+ * NOTIFY_DONE.
+ */
+static int pktgen_device_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+	if (pktgen_exiting)
+		return NOTIFY_DONE;
+
+	/* It is OK that we do not hold the group lock right now,
+	 * as we run under the RTNL lock.
+	 */
+
+	if (event == NETDEV_CHANGENAME)
+		pktgen_change_name(dev);
+	else if (event == NETDEV_UNREGISTER)
+		pktgen_mark_device(dev->name);
+
+	return NOTIFY_DONE;
+}
+
+/* Look up the net_device in init_net that @ifname refers to.
+ *
+ * Only the part of @ifname before the first '@' is used as the device
+ * name, truncated to at most IFNAMSIZ characters.  Returns whatever
+ * dev_get_by_name() returns for that name.  @pkt_dev is not used.
+ */
+static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,
+						 const char *ifname)
+{
+	char b[IFNAMSIZ+5];
+	int i;
+
+	/* Stop at the string terminator as well as at '@', so that a short
+	 * name without '@' is never read past its NUL byte.
+	 */
+	for (i = 0; i < IFNAMSIZ; i++) {
+		if (ifname[i] == '\0' || ifname[i] == '@')
+			break;
+
+		b[i] = ifname[i];
+	}
+	b[i] = 0;
+
+	return dev_get_by_name(&init_net, b);
+}
+
+
+/* Associate pktgen_dev with a device.
+ *
+ * A previously held device reference is dropped first.  The new device
+ * must be of type ARPHRD_ETHER and be running; otherwise the reference
+ * taken by the lookup is released again.  Returns 0 on success, -ENODEV if
+ * the lookup fails, -EINVAL for a non-ethernet device and -ENETDOWN for a
+ * device that is down.
+ */
+static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)
+{
+	struct net_device *odev;
+
+	/* Clean old setups */
+	if (pkt_dev->odev) {
+		dev_put(pkt_dev->odev);
+		pkt_dev->odev = NULL;
+	}
+
+	odev = pktgen_dev_get_by_name(pkt_dev, ifname);
+	if (!odev) {
+		pr_err("no such netdevice: \"%s\"\n", ifname);
+		return -ENODEV;
+	}
+
+	if (odev->type != ARPHRD_ETHER) {
+		pr_err("not an ethernet device: \"%s\"\n", ifname);
+		dev_put(odev);
+		return -EINVAL;
+	}
+
+	if (!netif_running(odev)) {
+		pr_err("device is down: \"%s\"\n", ifname);
+		dev_put(odev);
+		return -ENETDOWN;
+	}
+
+	pkt_dev->odev = odev;
+	return 0;
+}
+
+/* Read pkt_dev from the interface and set up internal pktgen_dev
+ * structure to have the right information to create/send packets
+ *
+ * Steps, in order:
+ *  - clamp queue_map_min/queue_map_max to the device's tx queue range;
+ *  - fill the source MAC in hh[] from the device when src_mac is all
+ *    zero, and copy dst_mac into hh[];
+ *  - start with the minimum packet size;
+ *  - for IPv4, derive saddr/daddr min/max from the configured strings (or
+ *    from the first address on the device when src_min is empty);
+ *  - reset the "current" MAC offsets, addresses, UDP ports and nflows.
+ *
+ * Does nothing but log an error when pkt_dev->odev is NULL.
+ */
+static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
+{
+	int ntxq;
+	int last_txq;
+
+	if (!pkt_dev->odev) {
+		pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n");
+		sprintf(pkt_dev->result,
+			"ERROR: pkt_dev->odev == NULL in setup_inject.\n");
+		return;
+	}
+
+	/* make sure that we don't pick a non-existing transmit queue */
+	ntxq = pkt_dev->odev->real_num_tx_queues;
+	/* Highest valid zero-based queue index.  Treat a queue count of 0
+	 * like 1 so the clamp below never stores ntxq - 1 == -1; this is the
+	 * same upper bound the warnings print.
+	 */
+	last_txq = (ntxq ?: 1) - 1;
+
+	if (ntxq <= pkt_dev->queue_map_min) {
+		pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+			   pkt_dev->queue_map_min, last_txq, ntxq,
+			   pkt_dev->odevname);
+		pkt_dev->queue_map_min = last_txq;
+	}
+	if (pkt_dev->queue_map_max >= ntxq) {
+		pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+			   pkt_dev->queue_map_max, last_txq, ntxq,
+			   pkt_dev->odevname);
+		pkt_dev->queue_map_max = last_txq;
+	}
+
+	/* Default to the interface's mac if not explicitly set. */
+
+	if (is_zero_ether_addr(pkt_dev->src_mac))
+		memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN);
+
+	/* Set up Dest MAC */
+	memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
+
+	/* Set up pkt size */
+	pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+
+	if (pkt_dev->flags & F_IPV6) {
+		/*
+		 * Skip this automatic address setting until locks or functions
+		 * gets exported
+		 */
+
+#ifdef NOTNOW
+		int i, set = 0, err = 1;
+		struct inet6_dev *idev;
+
+		for (i = 0; i < IN6_ADDR_HSIZE; i++)
+			if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
+				set = 1;
+				break;
+			}
+
+		if (!set) {
+
+			/*
+			 * Use linklevel address if unconfigured.
+			 *
+			 * use ipv6_get_lladdr if/when it's get exported
+			 */
+
+			rcu_read_lock();
+			idev = __in6_dev_get(pkt_dev->odev);
+			if (idev) {
+				struct inet6_ifaddr *ifp;
+
+				read_lock_bh(&idev->lock);
+				for (ifp = idev->addr_list; ifp;
+				     ifp = ifp->if_next) {
+					if (ifp->scope == IFA_LINK &&
+					    !(ifp->flags & IFA_F_TENTATIVE)) {
+						ipv6_addr_copy(&pkt_dev->
+							       cur_in6_saddr,
+							       &ifp->addr);
+						err = 0;
+						break;
+					}
+				}
+				read_unlock_bh(&idev->lock);
+			}
+			rcu_read_unlock();
+			if (err)
+				pr_err("ERROR: IPv6 link address not available\n");
+		}
+#endif
+	} else {
+		pkt_dev->saddr_min = 0;
+		pkt_dev->saddr_max = 0;
+		if (strlen(pkt_dev->src_min) == 0) {
+
+			/* No source configured: use the first address on
+			 * the device's address list, if there is one.
+			 */
+			struct in_device *in_dev;
+
+			rcu_read_lock();
+			in_dev = __in_dev_get_rcu(pkt_dev->odev);
+			if (in_dev) {
+				if (in_dev->ifa_list) {
+					pkt_dev->saddr_min =
+					    in_dev->ifa_list->ifa_address;
+					pkt_dev->saddr_max = pkt_dev->saddr_min;
+				}
+			}
+			rcu_read_unlock();
+		} else {
+			pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
+			pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
+		}
+
+		pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
+		pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
+	}
+	/* Initialize current values. */
+	pkt_dev->cur_dst_mac_offset = 0;
+	pkt_dev->cur_src_mac_offset = 0;
+	pkt_dev->cur_saddr = pkt_dev->saddr_min;
+	pkt_dev->cur_daddr = pkt_dev->daddr_min;
+	pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+	pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+	pkt_dev->nflows = 0;
+}
+
+
+/* Wait until the absolute CLOCK_MONOTONIC time @spin_until, then set the
+ * next transmit time to @spin_until + pkt_dev->delay.
+ *
+ * If the deadline has already passed nothing is waited for.  Waits shorter
+ * than 100000 ns are busy-waited with ndelay(); longer ones sleep on an
+ * on-stack hrtimer in TASK_INTERRUPTIBLE state.  The sleep loop ends when
+ * t.task has been cleared, the device is no longer running, or a signal is
+ * pending.  The time spent waiting is added to pkt_dev->idle_acc.
+ */
+static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
+{
+	ktime_t start_time, end_time;
+	s64 remaining;
+	struct hrtimer_sleeper t;
+
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_set_expires(&t.timer, spin_until);
+
+	remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
+	if (remaining <= 0)
+		goto out;	/* deadline already passed */
+
+	start_time = ktime_now();
+	if (remaining < 100000)
+		ndelay(remaining);	/* really small just spin */
+	else {
+		/* see do_nanosleep */
+		hrtimer_init_sleeper(&t, current);
+		do {
+			set_current_state(TASK_INTERRUPTIBLE);
+			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+			if (!hrtimer_active(&t.timer))
+				t.task = NULL;
+
+			if (likely(t.task))
+				schedule();
+
+			hrtimer_cancel(&t.timer);
+		} while (t.task && pkt_dev->running && !signal_pending(current));
+		__set_current_state(TASK_RUNNING);
+	}
+	end_time = ktime_now();
+
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
+out:
+	pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
+	/* Pairs with hrtimer_init_on_stack() above on every exit path. */
+	destroy_hrtimer_on_stack(&t.timer);
+}
+
+/* Recompute pkt_dev->pkt_overhead from scratch: one u32 per configured
+ * label plus the VLAN and SVLAN tag sizes reported by VLAN_TAG_SIZE() and
+ * SVLAN_TAG_SIZE().
+ */
+static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev)
+{
+	pkt_dev->pkt_overhead = pkt_dev->nr_labels * sizeof(u32)
+				+ (VLAN_TAG_SIZE(pkt_dev))
+				+ (SVLAN_TAG_SIZE(pkt_dev));
+}
+
+/* Returns 1 when flow number @flow has F_INIT set in its flags, else 0. */
+static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow)
+{
+	if (pkt_dev->flows[flow].flags & F_INIT)
+		return 1;
+
+	return 0;
+}
+
+/* Choose the flow to use for the next packet and return its index
+ * (pkt_dev->curfl).
+ *
+ * Without F_FLOW_SEQ a flow is picked at random among cflows; it is reset
+ * (count and flags cleared) once its count exceeds lflow.  With F_FLOW_SEQ
+ * the current flow is kept until its count reaches lflow; it is then reset
+ * and curfl advances, wrapping to 0 at cflows.
+ */
+static inline int f_pick(struct pktgen_dev *pkt_dev)
+{
+	int idx = pkt_dev->curfl;
+
+	if (!(pkt_dev->flags & F_FLOW_SEQ)) {
+		idx = random32() % pkt_dev->cflows;
+		pkt_dev->curfl = idx;
+
+		if (pkt_dev->flows[idx].count > pkt_dev->lflow) {
+			pkt_dev->flows[idx].count = 0;
+			pkt_dev->flows[idx].flags = 0;
+		}
+		return pkt_dev->curfl;
+	}
+
+	if (pkt_dev->flows[idx].count >= pkt_dev->lflow) {
+		/* reset time */
+		pkt_dev->flows[idx].count = 0;
+		pkt_dev->flows[idx].flags = 0;
+		pkt_dev->curfl += 1;
+		if (pkt_dev->curfl >= pkt_dev->cflows)
+			pkt_dev->curfl = 0; /*reset */
+	}
+
+	return pkt_dev->curfl;
+}
+
+
+#ifdef CONFIG_XFRM
+/* If there was already an IPSEC SA, we keep it as is, else
+ * we go look for it ...
+*/
+#define DUMMY_MARK 0
+/* Attach an xfrm state to flow @flow unless it already has one.  The state
+ * is looked up in init_net for the current destination/source address pair
+ * (AF_INET) with the device's ipsmode and ipsproto.  When one is found,
+ * the packet overhead is recomputed and extended by the state's header
+ * length.
+ */
+static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
+{
+	struct xfrm_state *sa;
+
+	/* fast path: the flow already carries a state, keep it */
+	if (pkt_dev->flows[flow].x)
+		return;
+
+	/*slow path: we dont already have xfrm_state*/
+	sa = xfrm_stateonly_find(&init_net, DUMMY_MARK,
+				 (xfrm_address_t *)&pkt_dev->cur_daddr,
+				 (xfrm_address_t *)&pkt_dev->cur_saddr,
+				 AF_INET,
+				 pkt_dev->ipsmode,
+				 pkt_dev->ipsproto, 0);
+	if (!sa)
+		return;
+
+	pkt_dev->flows[flow].x = sa;
+	set_pkt_overhead(pkt_dev);
+	pkt_dev->pkt_overhead += sa->props.header_len;
+}
+#endif
+/* Select the tx queue for the next packet.
+ *
+ * With F_QUEUE_MAP_CPU the current CPU id is used.  Otherwise, when
+ * queue_map_min <= queue_map_max, the queue is either drawn at random from
+ * [queue_map_min, queue_map_max] (F_QUEUE_MAP_RND) or advanced by one,
+ * wrapping back to queue_map_min.  The result is always reduced modulo the
+ * device's real_num_tx_queues.
+ */
+static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
+{
+	if (pkt_dev->flags & F_QUEUE_MAP_CPU) {
+		pkt_dev->cur_queue_map = smp_processor_id();
+	} else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
+		__u16 next;
+
+		if (!(pkt_dev->flags & F_QUEUE_MAP_RND)) {
+			next = pkt_dev->cur_queue_map + 1;
+			if (next > pkt_dev->queue_map_max)
+				next = pkt_dev->queue_map_min;
+		} else {
+			next = random32() %
+				(pkt_dev->queue_map_max -
+				 pkt_dev->queue_map_min + 1)
+				+ pkt_dev->queue_map_min;
+		}
+		pkt_dev->cur_queue_map = next;
+	}
+
+	pkt_dev->cur_queue_map %= pkt_dev->odev->real_num_tx_queues;
+}
+
+/* Increment/randomize headers according to flags and current values
+ * for IP src/dest, UDP src/dst port, MAC-Addr src/dst
+ *
+ * Also updates MPLS labels, VLAN/SVLAN ids, the current packet size and
+ * the tx queue, and bumps the per-flow packet counter.  Each field is
+ * either drawn at random (when its F_*_RND flag is set) or stepped by one
+ * and wrapped inside its configured range.
+ */
+static void mod_cur_headers(struct pktgen_dev *pkt_dev)
+{
+	__u32 imn;
+	__u32 imx;
+	int flow = 0;
+
+	/* flow stays 0 when no flows are configured */
+	if (pkt_dev->cflows)
+		flow = f_pick(pkt_dev);
+
+	/*  Deal with source MAC */
+	if (pkt_dev->src_mac_count > 1) {
+		__u32 mc;
+		__u32 tmp;
+
+		if (pkt_dev->flags & F_MACSRC_RND)
+			mc = random32() % pkt_dev->src_mac_count;
+		else {
+			mc = pkt_dev->cur_src_mac_offset++;
+			if (pkt_dev->cur_src_mac_offset >=
+			    pkt_dev->src_mac_count)
+				pkt_dev->cur_src_mac_offset = 0;
+		}
+
+		/* Add the offset mc to the base address, lowest byte
+		 * first, carrying (tmp >> 8) into the next byte.  Only
+		 * hh[11] down to hh[7] are rewritten; hh[6] is untouched.
+		 */
+		tmp = pkt_dev->src_mac[5] + (mc & 0xFF);
+		pkt_dev->hh[11] = tmp;
+		tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+		pkt_dev->hh[10] = tmp;
+		tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+		pkt_dev->hh[9] = tmp;
+		tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+		pkt_dev->hh[8] = tmp;
+		tmp = (pkt_dev->src_mac[1] + (tmp >> 8));
+		pkt_dev->hh[7] = tmp;
+	}
+
+	/*  Deal with Destination MAC */
+	if (pkt_dev->dst_mac_count > 1) {
+		__u32 mc;
+		__u32 tmp;
+
+		if (pkt_dev->flags & F_MACDST_RND)
+			mc = random32() % pkt_dev->dst_mac_count;
+
+		else {
+			mc = pkt_dev->cur_dst_mac_offset++;
+			if (pkt_dev->cur_dst_mac_offset >=
+			    pkt_dev->dst_mac_count) {
+				pkt_dev->cur_dst_mac_offset = 0;
+			}
+		}
+
+		/* Same byte-wise add with carry as for the source MAC;
+		 * hh[5] down to hh[1] are rewritten, hh[0] is untouched.
+		 */
+		tmp = pkt_dev->dst_mac[5] + (mc & 0xFF);
+		pkt_dev->hh[5] = tmp;
+		tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8));
+		pkt_dev->hh[4] = tmp;
+		tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8));
+		pkt_dev->hh[3] = tmp;
+		tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8));
+		pkt_dev->hh[2] = tmp;
+		tmp = (pkt_dev->dst_mac[1] + (tmp >> 8));
+		pkt_dev->hh[1] = tmp;
+	}
+
+	/* Only labels that have MPLS_STACK_BOTTOM set are randomized; the
+	 * bit is kept and the bits under htonl(0x000fffff) are replaced.
+	 */
+	if (pkt_dev->flags & F_MPLS_RND) {
+		unsigned i;
+		for (i = 0; i < pkt_dev->nr_labels; i++)
+			if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
+				pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
+					     ((__force __be32)random32() &
+						      htonl(0x000fffff));
+	}
+
+	/* 0xffff means the tag is switched off; random ids are 12 bits wide */
+	if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
+		pkt_dev->vlan_id = random32() & (4096-1);
+	}
+
+	if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
+		pkt_dev->svlan_id = random32() & (4096 - 1);
+	}
+
+	/* UDP source port: random values fall in [min, max); the sequential
+	 * counter wraps back to min as soon as it reaches max.
+	 */
+	if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
+		if (pkt_dev->flags & F_UDPSRC_RND)
+			pkt_dev->cur_udp_src = random32() %
+				(pkt_dev->udp_src_max - pkt_dev->udp_src_min)
+				+ pkt_dev->udp_src_min;
+
+		else {
+			pkt_dev->cur_udp_src++;
+			if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max)
+				pkt_dev->cur_udp_src = pkt_dev->udp_src_min;
+		}
+	}
+
+	/* UDP destination port: same scheme as the source port */
+	if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
+		if (pkt_dev->flags & F_UDPDST_RND) {
+			pkt_dev->cur_udp_dst = random32() %
+				(pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
+				+ pkt_dev->udp_dst_min;
+		} else {
+			pkt_dev->cur_udp_dst++;
+			if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
+				pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min;
+		}
+	}
+
+	if (!(pkt_dev->flags & F_IPV6)) {
+
+		/* IPv4 source: addresses are compared in host byte order.
+		 * Random values fall in [imn, imx); the sequential value
+		 * wraps to imn after exceeding imx.
+		 */
+		imn = ntohl(pkt_dev->saddr_min);
+		imx = ntohl(pkt_dev->saddr_max);
+		if (imn < imx) {
+			__u32 t;
+			if (pkt_dev->flags & F_IPSRC_RND)
+				t = random32() % (imx - imn) + imn;
+			else {
+				t = ntohl(pkt_dev->cur_saddr);
+				t++;
+				if (t > imx)
+					t = imn;
+
+			}
+			pkt_dev->cur_saddr = htonl(t);
+		}
+
+		/* A flow that was already initialised keeps the destination
+		 * recorded for it; otherwise a new destination is chosen
+		 * and, when flows are in use, stored in the flow.
+		 */
+		if (pkt_dev->cflows && f_seen(pkt_dev, flow)) {
+			pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr;
+		} else {
+			imn = ntohl(pkt_dev->daddr_min);
+			imx = ntohl(pkt_dev->daddr_max);
+			if (imn < imx) {
+				__u32 t;
+				__be32 s;
+				if (pkt_dev->flags & F_IPDST_RND) {
+
+					t = random32() % (imx - imn) + imn;
+					s = htonl(t);
+
+					/* redraw until the address is none
+					 * of the special kinds tested here
+					 */
+					while (ipv4_is_loopback(s) ||
+					       ipv4_is_multicast(s) ||
+					       ipv4_is_lbcast(s) ||
+					       ipv4_is_zeronet(s) ||
+					       ipv4_is_local_multicast(s)) {
+						t = random32() % (imx - imn) + imn;
+						s = htonl(t);
+					}
+					pkt_dev->cur_daddr = s;
+				} else {
+					t = ntohl(pkt_dev->cur_daddr);
+					t++;
+					if (t > imx) {
+						t = imn;
+					}
+					pkt_dev->cur_daddr = htonl(t);
+				}
+			}
+			if (pkt_dev->cflows) {
+				pkt_dev->flows[flow].flags |= F_INIT;
+				pkt_dev->flows[flow].cur_daddr =
+				    pkt_dev->cur_daddr;
+#ifdef CONFIG_XFRM
+				if (pkt_dev->flags & F_IPSEC_ON)
+					get_ipsec_sa(pkt_dev, flow);
+#endif
+				pkt_dev->nflows++;
+			}
+		}
+	} else {		/* IPV6 * */
+
+		/* An all-zero min_in6_daddr leaves cur_in6_daddr unchanged
+		 * (the if-branch is an empty statement).
+		 */
+		if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 &&
+		    pkt_dev->min_in6_daddr.s6_addr32[1] == 0 &&
+		    pkt_dev->min_in6_daddr.s6_addr32[2] == 0 &&
+		    pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ;
+		else {
+			int i;
+
+			/* Only random destinations yet */
+
+			/* per 32-bit word: (random | min) & max */
+			for (i = 0; i < 4; i++) {
+				pkt_dev->cur_in6_daddr.s6_addr32[i] =
+				    (((__force __be32)random32() |
+				      pkt_dev->min_in6_daddr.s6_addr32[i]) &
+				     pkt_dev->max_in6_daddr.s6_addr32[i]);
+			}
+		}
+	}
+
+	/* Packet size: random values fall in [min, max); the sequential
+	 * value wraps to min after exceeding max.
+	 */
+	if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
+		__u32 t;
+		if (pkt_dev->flags & F_TXSIZE_RND) {
+			t = random32() %
+				(pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
+				+ pkt_dev->min_pkt_size;
+		} else {
+			t = pkt_dev->cur_pkt_size + 1;
+			if (t > pkt_dev->max_pkt_size)
+				t = pkt_dev->min_pkt_size;
+		}
+		pkt_dev->cur_pkt_size = t;
+	}
+
+	set_cur_queue_map(pkt_dev);
+
+	/* counted against flow 0 when no flows are configured */
+	pkt_dev->flows[flow].count++;
+}
+
+
+#ifdef CONFIG_XFRM
+static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
+{
+	struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; /* SA of current flow */
+	int err = 0;
+
+	if (!x)
+		return 0;	/* no SA on this flow: skb is left untouched */
+	/* XXX: we don't support tunnel mode for now until
+	 * we resolve the dst issue */
+	if (x->props.mode != XFRM_MODE_TRANSPORT)
+		return 0;
+
+	spin_lock(&x->lock);	/* held across both output steps and the counters */
+
+	err = x->outer_mode->output(x, skb);
+	if (err)
+		goto error;
+	err = x->type->output(x, skb);
+	if (err)
+		goto error;
+
+	x->curlft.bytes += skb->len;	/* counted only when both steps succeed */
+	x->curlft.packets++;
+error:
+	spin_unlock(&x->lock);
+	return err;	/* 0 on success or skip, else the failing step's error */
+}
+
+static void free_SAs(struct pktgen_dev *pkt_dev)
+{
+	struct flow_state *fl;
+	int idx;
+
+	/* drop the reference on every SA still cached in a flow slot */
+	for (idx = 0; idx < pkt_dev->cflows; idx++) {
+		fl = &pkt_dev->flows[idx];
+		if (!fl->x)
+			continue;
+		xfrm_state_put(fl->x);
+		fl->x = NULL;
+	}
+}
+
+static int process_ipsec(struct pktgen_dev *pkt_dev,
+			      struct sk_buff *skb, __be16 protocol)
+{
+	if (pkt_dev->flags & F_IPSEC_ON) {
+		struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
+		int nhead = 0;
+		if (x) {
+			int ret;
+			__u8 *eth;
+			nhead = x->props.header_len - skb_headroom(skb);
+			if (nhead > 0) {	/* SA header needs more headroom */
+				ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
+				if (ret < 0) {
+					pr_err("Error expanding ipsec packet %d\n",
+					       ret);
+					goto err;
+				}
+			}
+
+			/* ipsec is not expecting ll header */
+			skb_pull(skb, ETH_HLEN);
+			ret = pktgen_output_ipsec(skb, pkt_dev);
+			if (ret) {
+				pr_err("Error creating ipsec packet %d\n", ret);
+				goto err;
+			}
+			/* restore ll; protocol is already big-endian (__be16) */
+			eth = (__u8 *) skb_push(skb, ETH_HLEN);
+			memcpy(eth, pkt_dev->hh, 12);
+			*(__be16 *) &eth[12] = protocol;
+		}
+	}
+	return 1;	/* skb is still valid and owned by the caller */
+err:
+	kfree_skb(skb);
+	return 0;	/* skb has been freed; caller must not touch it */
+}
+#endif
+
+static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
+{
+	__be32 *slot = mpls;
+	unsigned idx = 0;
+
+	while (idx < pkt_dev->nr_labels)
+		*slot++ = pkt_dev->labels[idx++] & ~MPLS_STACK_BOTTOM;
+	slot[-1] |= MPLS_STACK_BOTTOM;	/* only the last label carries the flag */
+}
+
+static inline __be16 build_tci(unsigned int id, unsigned int cfi,
+			       unsigned int prio)
+{
+	return htons((prio << 13) | (cfi << 12) | id);	/* prio from bit 13, cfi at bit 12 */
+}
+
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+				int datalen)
+{
+	struct timeval timestamp;
+	struct pktgen_hdr *pgh;
+
+	pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+	datalen -= sizeof(*pgh);
+	/* rest of the payload: linear zeroes, or page fragments if nfrags > 0 */
+	if (pkt_dev->nfrags <= 0) {
+		memset(skb_put(skb, datalen), 0, datalen);
+	} else {
+		int frags = pkt_dev->nfrags;
+		int i, len;
+		int frag_len;
+
+		/* bytes beyond frags full pages stay linear and zero-filled */
+		if (frags > MAX_SKB_FRAGS)
+			frags = MAX_SKB_FRAGS;
+		len = datalen - frags * PAGE_SIZE;
+		if (len > 0) {
+			memset(skb_put(skb, len), 0, len);
+			datalen = frags * PAGE_SIZE;
+		}
+
+		i = 0;
+		frag_len = (datalen/frags) < PAGE_SIZE ?
+			   (datalen/frags) : PAGE_SIZE;
+		while (datalen > 0) {
+			if (unlikely(!pkt_dev->page)) {	/* allocated once, then cached */
+				int node = numa_node_id();
+
+				if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+					node = pkt_dev->node;
+				pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+				if (!pkt_dev->page)
+					break;	/* remaining datalen is not added */
+			}
+			skb_shinfo(skb)->frags[i].page = pkt_dev->page;
+			get_page(pkt_dev->page);	/* same page backs every frag */
+			skb_shinfo(skb)->frags[i].page_offset = 0;
+			/* last fragment takes the rest of the data (at most a page) */
+			if (i == (frags - 1))
+				skb_shinfo(skb)->frags[i].size =
+				    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+			else
+				skb_shinfo(skb)->frags[i].size = frag_len;
+			datalen -= skb_shinfo(skb)->frags[i].size;
+			skb->len += skb_shinfo(skb)->frags[i].size;
+			skb->data_len += skb_shinfo(skb)->frags[i].size;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+	}
+
+	/* Stamp the magic, the sequence number and the current time,
+	 * all converted to network byte order
+	 */
+	pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+	pgh->seq_num = htonl(pkt_dev->seq_num);
+
+	do_gettimeofday(&timestamp);
+	pgh->tv_sec = htonl(timestamp.tv_sec);
+	pgh->tv_usec = htonl(timestamp.tv_usec);
+}
+
+static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
+					struct pktgen_dev *pkt_dev)
+{
+	struct sk_buff *skb = NULL;
+	__u8 *eth;
+	struct udphdr *udph;
+	int datalen, iplen;
+	struct iphdr *iph;
+	__be16 protocol = htons(ETH_P_IP);
+	__be32 *mpls;
+	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */
+	__be16 *vlan_encapsulated_proto = NULL;  /* packet type ID field (or len) for VLAN tag */
+	__be16 *svlan_tci = NULL;                /* Encapsulates priority and SVLAN ID */
+	__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+	u16 queue_map;
+
+	if (pkt_dev->nr_labels)
+		protocol = htons(ETH_P_MPLS_UC);
+
+	if (pkt_dev->vlan_id != 0xffff)	/* 0xffff means "no VLAN tag" */
+		protocol = htons(ETH_P_8021Q);
+
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	mod_cur_headers(pkt_dev);
+	queue_map = pkt_dev->cur_queue_map;
+
+	/* headroom: link-layer header length rounded up to 16 bytes */
+	datalen = (odev->hard_header_len + 16) & ~0xf;
+	if (pkt_dev->flags & F_NODE) {
+		int node;
+
+		if (pkt_dev->node >= 0)
+			node = pkt_dev->node;
+		else
+			node = numa_node_id();
+
+		skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64
+				  + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node);
+		if (likely(skb)) {
+			skb_reserve(skb, NET_SKB_PAD);
+			skb->dev = odev;
+		}
+	} else {
+		skb = __netdev_alloc_skb(odev, pkt_dev->cur_pkt_size + 64
+					 + datalen + pkt_dev->pkt_overhead,
+					 GFP_NOWAIT);
+	}
+
+	if (!skb) {
+		sprintf(pkt_dev->result, "No memory");
+		return NULL;
+	}
+	prefetchw(skb->data);
+
+	skb_reserve(skb, datalen);
+
+	/*  Reserve for ethernet and IP header  */
+	eth = (__u8 *) skb_push(skb, 14);
+	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+	if (pkt_dev->nr_labels)
+		mpls_push(mpls, pkt_dev);
+
+	if (pkt_dev->vlan_id != 0xffff) {
+		if (pkt_dev->svlan_id != 0xffff) {
+			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_tci = build_tci(pkt_dev->svlan_id,
+					       pkt_dev->svlan_cfi,
+					       pkt_dev->svlan_p);
+			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_encapsulated_proto = htons(ETH_P_8021Q);
+		}
+		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_tci = build_tci(pkt_dev->vlan_id,
+				      pkt_dev->vlan_cfi,
+				      pkt_dev->vlan_p);
+		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_encapsulated_proto = htons(ETH_P_IP);
+	}
+
+	skb->network_header = skb->tail;
+	skb->transport_header = skb->network_header + sizeof(struct iphdr);
+	skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr));
+	skb_set_queue_mapping(skb, queue_map);
+	skb->priority = pkt_dev->skb_priority;
+
+	iph = ip_hdr(skb);
+	udph = udp_hdr(skb);
+
+	memcpy(eth, pkt_dev->hh, 12);
+	*(__be16 *) &eth[12] = protocol;
+
+	/* Eth + IPh + UDPh + mpls; signed, so test < 0 before the sizeof clamp */
+	datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
+		  pkt_dev->pkt_overhead;
+	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr))
+		datalen = sizeof(struct pktgen_hdr);
+
+	udph->source = htons(pkt_dev->cur_udp_src);
+	udph->dest = htons(pkt_dev->cur_udp_dst);
+	udph->len = htons(datalen + 8);	/* DATA + udphdr */
+	udph->check = 0;	/* No checksum */
+
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->ttl = 32;
+	iph->tos = pkt_dev->tos;
+	iph->protocol = IPPROTO_UDP;	/* UDP */
+	iph->saddr = pkt_dev->cur_saddr;
+	iph->daddr = pkt_dev->cur_daddr;
+	iph->id = htons(pkt_dev->ip_id);
+	pkt_dev->ip_id++;
+	iph->frag_off = 0;
+	iplen = 20 + 8 + datalen;
+	iph->tot_len = htons(iplen);
+	iph->check = 0;	/* must be zero while the checksum is computed */
+	iph->check = ip_fast_csum((void *)iph, iph->ihl);
+	skb->protocol = protocol;
+	skb->mac_header = (skb->network_header - ETH_HLEN -
+			   pkt_dev->pkt_overhead);
+	skb->dev = odev;
+	skb->pkt_type = PACKET_HOST;
+	pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+#ifdef CONFIG_XFRM
+	if (!process_ipsec(pkt_dev, skb, protocol))
+		return NULL;	/* process_ipsec() already freed the skb */
+#endif
+
+	return skb;
+}
+
+/*
+ * scan_ip6, fmt_ip taken from dietlibc-0.21
+ * Author Felix von Leitner <felix-dietlibc@fefe.de>
+ *
+ * Slightly modified for kernel.
+ * Should be candidate for net/ipv4/utils.c
+ * --ro
+ */
+
+static unsigned int scan_ip6(const char *s, char ip[16])
+{
+	unsigned int i;
+	unsigned int len = 0;	/* characters consumed so far; the return value */
+	unsigned long u;
+	char suffix[16];	/* groups found after "::", right-aligned at the end */
+	unsigned int prefixlen = 0;
+	unsigned int suffixlen = 0;
+	__be32 tmp;
+	char *pos;
+
+	for (i = 0; i < 16; i++)
+		ip[i] = 0;
+
+	for (;;) {
+		if (*s == ':') {
+			len++;
+			if (s[1] == ':') {	/* Found "::", skip to part 2 */
+				s += 2;
+				len++;
+				break;
+			}
+			s++;
+		}
+
+		u = simple_strtoul(s, &pos, 16);
+		i = pos - s;
+		if (!i)
+			return 0;	/* no hex digits where a group was expected */
+		if (prefixlen == 12 && s[i] == '.') {
+
+			/* the last 4 bytes may be written as IPv4 address */
+
+			tmp = in_aton(s);
+			memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp));
+			return i + len;
+		}
+		ip[prefixlen++] = (u >> 8);
+		ip[prefixlen++] = (u & 255);
+		s += i;
+		len += i;
+		if (prefixlen == 16)
+			return len;
+	}
+
+/* part 2, after "::" */
+	for (;;) {
+		if (*s == ':') {
+			if (suffixlen == 0)
+				break;
+			s++;
+			len++;
+		} else if (suffixlen != 0)
+			break;
+		/* unsigned parse, as in part 1: a leading '-' is not a group */
+		u = simple_strtoul(s, &pos, 16);
+		i = pos - s;
+		if (!i) {
+			if (*s)
+				len--;
+			break;
+		}
+		if (suffixlen + prefixlen <= 12 && s[i] == '.') {
+			tmp = in_aton(s);
+			memcpy((struct in_addr *)(suffix + suffixlen), &tmp,
+			       sizeof(tmp));
+			suffixlen += 4;
+			len += strlen(s);
+			break;
+		}
+		suffix[suffixlen++] = (u >> 8);
+		suffix[suffixlen++] = (u & 255);
+		s += i;
+		len += i;
+		if (prefixlen + suffixlen == 16)
+			break;
+	}
+	for (i = 0; i < suffixlen; i++)
+		ip[16 - suffixlen + i] = suffix[i];
+	return len;
+}
+
+static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
+					struct pktgen_dev *pkt_dev)
+{
+	struct sk_buff *skb = NULL;
+	__u8 *eth;
+	struct udphdr *udph;
+	int datalen;
+	struct ipv6hdr *iph;
+	__be16 protocol = htons(ETH_P_IPV6);
+	__be32 *mpls;
+	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */
+	__be16 *vlan_encapsulated_proto = NULL;  /* packet type ID field (or len) for VLAN tag */
+	__be16 *svlan_tci = NULL;                /* Encapsulates priority and SVLAN ID */
+	__be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */
+	u16 queue_map;
+
+	if (pkt_dev->nr_labels)
+		protocol = htons(ETH_P_MPLS_UC);
+
+	if (pkt_dev->vlan_id != 0xffff)	/* 0xffff means "no VLAN tag" */
+		protocol = htons(ETH_P_8021Q);
+
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	mod_cur_headers(pkt_dev);
+	queue_map = pkt_dev->cur_queue_map;
+
+	skb = __netdev_alloc_skb(odev,
+				 pkt_dev->cur_pkt_size + 64
+				 + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT);
+	if (!skb) {
+		sprintf(pkt_dev->result, "No memory");
+		return NULL;
+	}
+	prefetchw(skb->data);
+
+	skb_reserve(skb, 16);
+
+	/*  Reserve for ethernet and IP header  */
+	eth = (__u8 *) skb_push(skb, 14);
+	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+	if (pkt_dev->nr_labels)
+		mpls_push(mpls, pkt_dev);
+
+	if (pkt_dev->vlan_id != 0xffff) {
+		if (pkt_dev->svlan_id != 0xffff) {
+			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_tci = build_tci(pkt_dev->svlan_id,
+					       pkt_dev->svlan_cfi,
+					       pkt_dev->svlan_p);
+			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+			*svlan_encapsulated_proto = htons(ETH_P_8021Q);
+		}
+		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_tci = build_tci(pkt_dev->vlan_id,
+				      pkt_dev->vlan_cfi,
+				      pkt_dev->vlan_p);
+		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+		*vlan_encapsulated_proto = htons(ETH_P_IPV6);
+	}
+
+	skb->network_header = skb->tail;
+	skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
+	skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr));
+	skb_set_queue_mapping(skb, queue_map);
+	skb->priority = pkt_dev->skb_priority;
+	iph = ipv6_hdr(skb);
+	udph = udp_hdr(skb);
+
+	memcpy(eth, pkt_dev->hh, 12);
+	*(__be16 *) &eth[12] = protocol;
+
+	/* Eth + IPh + UDPh + mpls */
+	datalen = pkt_dev->cur_pkt_size - 14 -
+		  sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
+		  pkt_dev->pkt_overhead;
+	/* datalen is signed: test < 0 first, sizeof would promote it */
+	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
+		datalen = sizeof(struct pktgen_hdr);
+		if (net_ratelimit())
+			pr_info("increased datalen to %d\n", datalen);
+	}
+
+	udph->source = htons(pkt_dev->cur_udp_src);
+	udph->dest = htons(pkt_dev->cur_udp_dst);
+	udph->len = htons(datalen + sizeof(struct udphdr));
+	udph->check = 0;	/* No checksum */
+
+	*(__be32 *) iph = htonl(0x60000000);	/* Version + flow */
+
+	if (pkt_dev->traffic_class) {
+		/* Version + traffic class + flow (0) */
+		*(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20));
+	}
+
+	iph->hop_limit = 32;
+
+	iph->payload_len = htons(sizeof(struct udphdr) + datalen);
+	iph->nexthdr = IPPROTO_UDP;
+
+	ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
+	ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+
+	skb->mac_header = (skb->network_header - ETH_HLEN -
+			   pkt_dev->pkt_overhead);
+	skb->protocol = protocol;
+	skb->dev = odev;
+	skb->pkt_type = PACKET_HOST;
+
+	pktgen_finalize_skb(pkt_dev, skb, datalen);
+
+	return skb;
+}
+
+static struct sk_buff *fill_packet(struct net_device *odev,
+				   struct pktgen_dev *pkt_dev)
+{
+	/* F_IPV6 selects the IPv6 builder; everything else is built as IPv4 */
+	if (!(pkt_dev->flags & F_IPV6))
+		return fill_packet_ipv4(odev, pkt_dev);
+	return fill_packet_ipv6(odev, pkt_dev);
+}
+
+static void pktgen_clear_counters(struct pktgen_dev *pkt_dev)
+{
+	pkt_dev->sofar = 0;
+	pkt_dev->tx_bytes = 0;
+	pkt_dev->errors = 0;
+	pkt_dev->idle_acc = 0;
+	pkt_dev->seq_num = 1;	/* sequence numbers restart at 1, not 0 */
+}
+
+/* Set up structure for sending pkts, clear counters */
+
+static void pktgen_run(struct pktgen_thread *t)
+{
+	struct pktgen_dev *pkt_dev;
+	int started = 0;	/* number of devices that could be started */
+
+	func_enter();
+
+	if_lock(t);
+	list_for_each_entry(pkt_dev, &t->if_list, list) {
+
+		/*
+		 * setup odev and create initial packet.
+		 */
+		pktgen_setup_inject(pkt_dev);
+
+		if (pkt_dev->odev) {
+			pktgen_clear_counters(pkt_dev);
+			pkt_dev->running = 1;	/* Cranke yeself! */
+			pkt_dev->skb = NULL;
+			pkt_dev->started_at =
+				pkt_dev->next_tx = ktime_now();
+
+			set_pkt_overhead(pkt_dev);
+
+			strcpy(pkt_dev->result, "Starting");
+			started++;
+		} else	/* no output device bound: report it in the result text */
+			strcpy(pkt_dev->result, "Error starting");
+	}
+	if_unlock(t);
+	if (started)	/* a pending stop request is dropped once anything started */
+		t->control &= ~(T_STOP);
+}
+
+static void pktgen_stop_all_threads_ifs(void)
+{
+	struct pktgen_thread *thr;
+
+	func_enter();
+
+	/* Only raise T_STOP here; each worker thread notices the bit in
+	 * its main loop and stops its own devices. */
+	mutex_lock(&pktgen_thread_lock);
+	list_for_each_entry(thr, &pktgen_threads, th_list)
+		thr->control |= T_STOP;
+	mutex_unlock(&pktgen_thread_lock);
+}
+
+static int thread_is_running(const struct pktgen_thread *t)
+{
+	const struct pktgen_dev *pkt_dev;
+	/* takes no lock itself; pktgen_wait_thread_run() calls it under if_lock */
+	list_for_each_entry(pkt_dev, &t->if_list, list)
+		if (pkt_dev->running)
+			return 1;	/* at least one device is still active */
+	return 0;	/* no device on this thread is running */
+}
+
+static int pktgen_wait_thread_run(struct pktgen_thread *t)
+{
+	/*
+	 * Poll every 100 ms until no device of this thread is running.
+	 * The list is only inspected under if_lock, which is dropped
+	 * while sleeping.  Returns 1 once idle, 0 if a signal interrupted
+	 * the wait.
+	 */
+	if_lock(t);
+	while (thread_is_running(t)) {
+		if_unlock(t);
+		msleep_interruptible(100);
+		if (signal_pending(current))
+			return 0;	/* the lock is not held at this point */
+		if_lock(t);
+	}
+	if_unlock(t);
+	return 1;
+}
+
+static int pktgen_wait_all_threads_run(void)
+{
+	struct pktgen_thread *thr;
+	int done = 1;
+
+	mutex_lock(&pktgen_thread_lock);
+	/* wait for the threads one by one; give up at the first signal */
+	list_for_each_entry(thr, &pktgen_threads, th_list) {
+		done = pktgen_wait_thread_run(thr);
+		if (!done)
+			break;
+	}
+	if (!done) {
+		/* interrupted: ask every thread to stop its devices */
+		list_for_each_entry(thr, &pktgen_threads, th_list)
+			thr->control |= T_STOP;
+	}
+	mutex_unlock(&pktgen_thread_lock);
+	return done;
+}
+
+static void pktgen_run_all_threads(void)
+{
+	struct pktgen_thread *t;
+
+	func_enter();
+
+	mutex_lock(&pktgen_thread_lock);
+	/* request a start on every thread; the workers act on T_RUN themselves */
+	list_for_each_entry(t, &pktgen_threads, th_list)
+		t->control |= (T_RUN);
+
+	mutex_unlock(&pktgen_thread_lock);
+
+	/* Sleep up to 125 ms so the workers can pick up thread->control */
+	schedule_timeout_interruptible(msecs_to_jiffies(125));
+	/* then block until no device is running or a signal arrives */
+	pktgen_wait_all_threads_run();
+}
+
+static void pktgen_reset_all_threads(void)
+{
+	struct pktgen_thread *t;
+
+	func_enter();
+
+	mutex_lock(&pktgen_thread_lock);
+	/* ask every thread to remove all of its devices (T_REMDEVALL) */
+	list_for_each_entry(t, &pktgen_threads, th_list)
+		t->control |= (T_REMDEVALL);
+
+	mutex_unlock(&pktgen_thread_lock);
+
+	/* Sleep up to 125 ms so the workers can pick up thread->control */
+	schedule_timeout_interruptible(msecs_to_jiffies(125));
+	/* then block until no device is running or a signal arrives */
+	pktgen_wait_all_threads_run();
+}
+
+static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
+{
+	__u64 bps, mbps, pps;
+	char *p = pkt_dev->result;	/* write cursor into the result text */
+	ktime_t elapsed = ktime_sub(pkt_dev->stopped_at,
+				    pkt_dev->started_at);
+	ktime_t idle = ns_to_ktime(pkt_dev->idle_acc);
+
+	p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n",
+		     (unsigned long long)ktime_to_us(elapsed),
+		     (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)),
+		     (unsigned long long)ktime_to_us(idle),
+		     (unsigned long long)pkt_dev->sofar,
+		     pkt_dev->cur_pkt_size, nr_frags);
+	/* NOTE(review): an elapsed time of zero would divide by zero here */
+	pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
+			ktime_to_ns(elapsed));
+	/* bit rate is derived from the current packet size, not from tx_bytes */
+	bps = pps * 8 * pkt_dev->cur_pkt_size;
+
+	mbps = bps;
+	do_div(mbps, 1000000);
+	p += sprintf(p, "  %llupps %lluMb/sec (%llubps) errors: %llu",
+		     (unsigned long long)pps,
+		     (unsigned long long)mbps,
+		     (unsigned long long)bps,
+		     (unsigned long long)pkt_dev->errors);
+}
+
+/* Set stopped-at timer, remove from running list, do counters & statistics */
+static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
+{
+	int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
+	/* nr_frags is sampled up front because the skb is freed below */
+	if (!pkt_dev->running) {
+		pr_warning("interface: %s is already stopped\n",
+			   pkt_dev->odevname);
+		return -EINVAL;
+	}
+
+	kfree_skb(pkt_dev->skb);
+	pkt_dev->skb = NULL;
+	pkt_dev->stopped_at = ktime_now();
+	pkt_dev->running = 0;
+
+	show_results(pkt_dev, nr_frags);	/* formats pkt_dev->result */
+
+	return 0;
+}
+
+static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
+{
+	struct pktgen_dev *cand, *earliest = NULL;
+
+	/* Among the running devices pick the one with the earliest next_tx;
+	 * on a tie the device found first on the list wins. */
+	if_lock(t);
+	list_for_each_entry(cand, &t->if_list, list) {
+		if (!cand->running)
+			continue;
+		if (!earliest || ktime_lt(cand->next_tx, earliest->next_tx))
+			earliest = cand;
+	}
+	if_unlock(t);
+
+	return earliest;	/* NULL when nothing is running */
+}
+
+static void pktgen_stop(struct pktgen_thread *t)
+{
+	struct pktgen_dev *dev;
+
+	func_enter();
+
+	if_lock(t);
+
+	/* pktgen_stop_device() rejects idle devices; its result is ignored */
+	list_for_each_entry(dev, &t->if_list, list)
+		pktgen_stop_device(dev);
+
+	if_unlock(t);
+}
+
+/*
+ * one of our devices needs to be removed - find it
+ * and remove it
+ */
+static void pktgen_rem_one_if(struct pktgen_thread *t)
+{
+	struct list_head *q, *n;
+	struct pktgen_dev *cur;
+
+	func_enter();
+
+	if_lock(t);
+	/* the _safe walk is needed because the matching entry gets unlinked */
+	list_for_each_safe(q, n, &t->if_list) {
+		cur = list_entry(q, struct pktgen_dev, list);
+
+		if (!cur->removal_mark)
+			continue;
+
+		kfree_skb(cur->skb);
+		cur->skb = NULL;
+
+		pktgen_remove_device(t, cur);
+
+		break;	/* only the first marked device is removed per call */
+	}
+
+	if_unlock(t);
+}
+
+static void pktgen_rem_all_ifs(struct pktgen_thread *t)
+{
+	struct list_head *q, *n;
+	struct pktgen_dev *cur;
+
+	func_enter();
+
+	/* Remove all devices, free mem */
+
+	if_lock(t);
+	/* the _safe walk is needed because every entry gets unlinked and freed */
+	list_for_each_safe(q, n, &t->if_list) {
+		cur = list_entry(q, struct pktgen_dev, list);
+
+		kfree_skb(cur->skb);
+		cur->skb = NULL;
+
+		pktgen_remove_device(t, cur);
+	}
+
+	if_unlock(t);
+}
+
+static void pktgen_rem_thread(struct pktgen_thread *t)
+{
+	/* Remove the thread's procfs entry, which is named after the task */
+
+	remove_proc_entry(t->tsk->comm, pg_proc_dir);
+
+}
+
+static void pktgen_resched(struct pktgen_dev *pkt_dev)
+{
+	ktime_t before = ktime_now();
+	schedule();	/* the time until we run again is booked as idle */
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), before));
+}
+
+static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
+{
+	ktime_t idle_start = ktime_now();
+	/* spin until ours is the only reference left on the skb */
+	while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+		if (signal_pending(current))
+			break;
+		/* NOTE(review): pktgen_resched() also adds to idle_acc itself */
+		if (need_resched())
+			pktgen_resched(pkt_dev);
+		else
+			cpu_relax();
+	}
+	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start));
+}
+
+static void pktgen_xmit(struct pktgen_dev *pkt_dev)
+{
+	struct net_device *odev = pkt_dev->odev;
+	netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *)
+		= odev->netdev_ops->ndo_start_xmit;	/* driver is called directly */
+	struct netdev_queue *txq;
+	u16 queue_map;
+	int ret;
+
+	/* If device is offline, then don't send */
+	if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
+		pktgen_stop_device(pkt_dev);
+		return;
+	}
+
+	/* This is max DELAY, this has special meaning of
+	 * "never transmit"
+	 */
+	if (unlikely(pkt_dev->delay == ULLONG_MAX)) {
+		pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX);
+		return;
+	}
+
+	/* If no skb or clone count exhausted then get new one */
+	if (!pkt_dev->skb || (pkt_dev->last_ok &&
+			      ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+		/* build a new pkt */
+		kfree_skb(pkt_dev->skb);
+
+		pkt_dev->skb = fill_packet(odev, pkt_dev);
+		if (pkt_dev->skb == NULL) {
+			pr_err("ERROR: couldn't allocate skb in fill_packet\n");
+			schedule();
+			pkt_dev->clone_count--;	/* back out increment, OOM */
+			return;
+		}
+		pkt_dev->last_pkt_size = pkt_dev->skb->len;
+		pkt_dev->allocated_skbs++;
+		pkt_dev->clone_count = 0;	/* reset counter */
+	}
+
+	if (pkt_dev->delay && pkt_dev->last_ok)	/* pace only after a good send */
+		spin(pkt_dev, pkt_dev->next_tx);
+
+	queue_map = skb_get_queue_mapping(pkt_dev->skb);
+	txq = netdev_get_tx_queue(odev, queue_map);
+
+	__netif_tx_lock_bh(txq);
+
+	if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) {
+		ret = NETDEV_TX_BUSY;
+		pkt_dev->last_ok = 0;
+		goto unlock;
+	}
+	atomic_inc(&(pkt_dev->skb->users));	/* extra ref, dropped below on retry */
+	ret = (*xmit)(pkt_dev->skb, odev);
+
+	switch (ret) {
+	case NETDEV_TX_OK:
+		txq_trans_update(txq);
+		pkt_dev->last_ok = 1;
+		pkt_dev->sofar++;
+		pkt_dev->seq_num++;
+		pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
+		break;
+	case NET_XMIT_DROP:
+	case NET_XMIT_CN:
+	case NET_XMIT_POLICED:
+		/* skb has been consumed */
+		pkt_dev->errors++;
+		break;
+	default: /* Drivers are not supposed to return other values! */
+		if (net_ratelimit())
+			pr_info("%s xmit error: %d\n", pkt_dev->odevname, ret);
+		pkt_dev->errors++;
+		/* fallthru */
+	case NETDEV_TX_LOCKED:
+	case NETDEV_TX_BUSY:
+		/* Retry it next time */
+		atomic_dec(&(pkt_dev->skb->users));
+		pkt_dev->last_ok = 0;
+	}
+unlock:
+	__netif_tx_unlock_bh(txq);
+
+	/* If pkt_dev->count is zero, then run forever */
+	if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
+		pktgen_wait_for_skb(pkt_dev);
+
+		/* Done with this */
+		pktgen_stop_device(pkt_dev);
+	}
+}
+
+/*
+ * Main loop of the thread goes here
+ */
+
+static int pktgen_thread_worker(void *arg)
+{
+	DEFINE_WAIT(wait);
+	struct pktgen_thread *t = arg;
+	struct pktgen_dev *pkt_dev = NULL;
+	int cpu = t->cpu;
+
+	BUG_ON(smp_processor_id() != cpu);
+
+	init_waitqueue_head(&t->queue);
+	complete(&t->start_done);	/* pktgen_create_thread() waits on this */
+
+	pr_debug("starting pktgen/%d:  pid=%d\n", cpu, task_pid_nr(current));
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		pkt_dev = next_to_run(t);
+		/* nothing to send and no request pending: sleep up to HZ/10 */
+		if (unlikely(!pkt_dev && t->control == 0)) {
+			if (pktgen_exiting)
+				break;
+			wait_event_interruptible_timeout(t->queue,
+							 t->control != 0,
+							 HZ/10);
+			try_to_freeze();
+			continue;
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		if (likely(pkt_dev)) {
+			pktgen_xmit(pkt_dev);
+
+			if (need_resched())
+				pktgen_resched(pkt_dev);
+			else
+				cpu_relax();
+		}
+		/* control bits are served in this fixed order on every pass */
+		if (t->control & T_STOP) {
+			pktgen_stop(t);
+			t->control &= ~(T_STOP);
+		}
+
+		if (t->control & T_RUN) {
+			pktgen_run(t);
+			t->control &= ~(T_RUN);
+		}
+
+		if (t->control & T_REMDEVALL) {
+			pktgen_rem_all_ifs(t);
+			t->control &= ~(T_REMDEVALL);
+		}
+
+		if (t->control & T_REMDEV) {
+			pktgen_rem_one_if(t);
+			t->control &= ~(T_REMDEV);
+		}
+
+		try_to_freeze();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	/* shutdown: stop the devices, free them, then drop the proc entry */
+	pr_debug("%s stopping all device\n", t->tsk->comm);
+	pktgen_stop(t);
+
+	pr_debug("%s removing all device\n", t->tsk->comm);
+	pktgen_rem_all_ifs(t);
+
+	pr_debug("%s removing thread\n", t->tsk->comm);
+	pktgen_rem_thread(t);
+
+	/* Wait for kthread_stop */
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
+					  const char *ifname, bool exact)
+{
+	struct pktgen_dev *p, *pkt_dev = NULL;
+	size_t len = strlen(ifname);
+
+	if_lock(t);
+	list_for_each_entry(p, &t->if_list, list)
+		if (strncmp(p->odevname, ifname, len) == 0) {
+			if (p->odevname[len]) {	/* odevname is longer than ifname */
+				if (exact || p->odevname[len] != '@')
+					continue;	/* accept only an "ifname@..." entry */
+			}
+			pkt_dev = p;
+			break;
+		}
+
+	if_unlock(t);
+	pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
+	return pkt_dev;	/* NULL when no entry on this thread matches */
+}
+
+/*
+ * Adds a dev at front of if_list.
+ */
+
+static int add_dev_to_thread(struct pktgen_thread *t,
+			     struct pktgen_dev *pkt_dev)
+{
+	int rv = -EBUSY;
+
+	if_lock(t);
+
+	/* a device may sit on one thread's list only */
+	if (!pkt_dev->pg_thread) {
+		/* list_add() links it right behind the list head */
+		list_add(&pkt_dev->list, &t->if_list);
+		pkt_dev->pg_thread = t;
+		pkt_dev->running = 0;
+		rv = 0;
+	} else {
+		pr_err("ERROR: already assigned to a thread\n");
+	}
+
+	if_unlock(t);
+	return rv;
+}
+
+/* Called under thread lock */
+
+static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
+{
+	struct pktgen_dev *pkt_dev;
+	int err;
+	int node = cpu_to_node(t->cpu);	/* allocate near the thread's CPU */
+
+	/* We don't allow a device to be on several threads */
+
+	pkt_dev = __pktgen_NN_threads(ifname, FIND);
+	if (pkt_dev) {
+		pr_err("ERROR: interface already used\n");
+		return -EBUSY;
+	}
+
+	pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node);
+	if (!pkt_dev)
+		return -ENOMEM;
+
+	strcpy(pkt_dev->odevname, ifname);
+	pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state),
+				      node);
+	if (pkt_dev->flows == NULL) {
+		kfree(pkt_dev);
+		return -ENOMEM;
+	}
+	/* defaults; delay and count come from the module parameters */
+	pkt_dev->removal_mark = 0;
+	pkt_dev->min_pkt_size = ETH_ZLEN;
+	pkt_dev->max_pkt_size = ETH_ZLEN;
+	pkt_dev->nfrags = 0;
+	pkt_dev->delay = pg_delay_d;
+	pkt_dev->count = pg_count_d;
+	pkt_dev->sofar = 0;
+	pkt_dev->udp_src_min = 9;	/* sink port */
+	pkt_dev->udp_src_max = 9;
+	pkt_dev->udp_dst_min = 9;
+	pkt_dev->udp_dst_max = 9;
+	pkt_dev->vlan_p = 0;
+	pkt_dev->vlan_cfi = 0;
+	pkt_dev->vlan_id = 0xffff;	/* 0xffff: no VLAN tag is emitted */
+	pkt_dev->svlan_p = 0;
+	pkt_dev->svlan_cfi = 0;
+	pkt_dev->svlan_id = 0xffff;	/* 0xffff: no SVLAN tag is emitted */
+	pkt_dev->node = -1;
+
+	err = pktgen_setup_dev(pkt_dev, ifname);
+	if (err)
+		goto out1;
+	if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+		pkt_dev->clone_skb = pg_clone_skb_d;
+
+	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir,
+					  &pktgen_if_fops, pkt_dev);
+	if (!pkt_dev->entry) {
+		pr_err("cannot create %s/%s procfs entry\n",
+		       PG_PROC_DIR, ifname);
+		err = -EINVAL;
+		goto out2;
+	}
+#ifdef CONFIG_XFRM
+	pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
+	pkt_dev->ipsproto = IPPROTO_ESP;
+#endif
+
+	return add_dev_to_thread(t, pkt_dev);
+out2:
+	dev_put(pkt_dev->odev);
+out1:
+#ifdef CONFIG_XFRM
+	free_SAs(pkt_dev);
+#endif
+	vfree(pkt_dev->flows);
+	kfree(pkt_dev);
+	return err;
+}
+
+static int __init pktgen_create_thread(int cpu)
+{
+	struct pktgen_thread *t;
+	struct proc_dir_entry *pe;
+	struct task_struct *p;
+
+	t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
+			 cpu_to_node(cpu));
+	if (!t) {
+		pr_err("ERROR: out of memory, can't create new thread\n");
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&t->if_lock);
+	t->cpu = cpu;
+
+	INIT_LIST_HEAD(&t->if_list);
+	/* on the global list before the kthread exists; undone on failure */
+	list_add_tail(&t->th_list, &pktgen_threads);
+	init_completion(&t->start_done);
+
+	p = kthread_create_on_node(pktgen_thread_worker,
+				   t,
+				   cpu_to_node(cpu),
+				   "kpktgend_%d", cpu);
+	if (IS_ERR(p)) {
+		pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+		list_del(&t->th_list);
+		kfree(t);
+		return PTR_ERR(p);
+	}
+	kthread_bind(p, cpu);
+	t->tsk = p;
+	/* the proc entry is named after the task ("kpktgend_<cpu>") */
+	pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir,
+			      &pktgen_thread_fops, t);
+	if (!pe) {
+		pr_err("cannot create %s/%s procfs entry\n",
+		       PG_PROC_DIR, t->tsk->comm);
+		kthread_stop(p);
+		list_del(&t->th_list);
+		kfree(t);
+		return -EINVAL;
+	}
+
+	wake_up_process(p);
+	wait_for_completion(&t->start_done);	/* completed by the worker */
+
+	return 0;
+}
+
+/*
+ * Removes a device from the thread if_list.
+ */
+static void _rem_dev_from_if_list(struct pktgen_thread *t,
+				  struct pktgen_dev *pkt_dev)
+{
+	struct pktgen_dev *cur, *tmp;
+
+	/* Unlink pkt_dev if it is on this thread's list; the walk goes on
+	 * to the end of the list either way. */
+	list_for_each_entry_safe(cur, tmp, &t->if_list, list) {
+		if (cur == pkt_dev)
+			list_del(&cur->list);
+	}
+}
+
+static int pktgen_remove_device(struct pktgen_thread *t,
+				struct pktgen_dev *pkt_dev)
+{
+	/* Stops, unlinks and frees pkt_dev; it must not be used afterwards. */
+	pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
+
+	if (pkt_dev->running) {
+		pr_warning("WARNING: trying to remove a running interface, stopping it now\n");
+		pktgen_stop_device(pkt_dev);
+	}
+
+	/* Dis-associate from the interface */
+
+	if (pkt_dev->odev) {
+		dev_put(pkt_dev->odev);
+		pkt_dev->odev = NULL;
+	}
+
+	/* And update the thread if_list */
+
+	_rem_dev_from_if_list(t, pkt_dev);
+
+	if (pkt_dev->entry)
+		remove_proc_entry(pkt_dev->entry->name, pg_proc_dir);
+
+#ifdef CONFIG_XFRM
+	free_SAs(pkt_dev);
+#endif
+	vfree(pkt_dev->flows);
+	if (pkt_dev->page)	/* page cached by pktgen_finalize_skb(), if any */
+		put_page(pkt_dev->page);
+	kfree(pkt_dev);
+	return 0;	/* always 0 */
+}
+
+static int __init pg_init(void)
+{
+	int cpu;
+	struct proc_dir_entry *pe;
+	int ret = 0;
+
+	pr_info("%s", version);
+
+	pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net);
+	if (!pg_proc_dir)
+		return -ENODEV;
+
+	pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops);
+	if (pe == NULL) {
+		pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL);
+		ret = -EINVAL;
+		goto remove_dir;
+	}
+	/* NOTE(review): the notifier registration result is not checked */
+	register_netdevice_notifier(&pktgen_notifier_block);
+
+	for_each_online_cpu(cpu) {
+		int err;
+
+		err = pktgen_create_thread(cpu);
+		if (err)	/* a failure on one CPU is only logged */
+			pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n",
+				   cpu, err);
+	}
+
+	if (list_empty(&pktgen_threads)) {	/* not a single thread came up */
+		pr_err("ERROR: Initialization failed for all threads\n");
+		ret = -ENODEV;
+		goto unregister;
+	}
+
+	return 0;
+
+ unregister:
+	unregister_netdevice_notifier(&pktgen_notifier_block);
+	remove_proc_entry(PGCTRL, pg_proc_dir);
+ remove_dir:
+	proc_net_remove(&init_net, PG_PROC_DIR);
+	return ret;
+}
+
+static void __exit pg_cleanup(void)
+{
+	struct pktgen_thread *t;
+	struct list_head *q, *n;
+	LIST_HEAD(list);	/* private list the threads are moved onto */
+
+	/* Stop all interfaces & threads */
+	pktgen_exiting = true;
+
+	mutex_lock(&pktgen_thread_lock);
+	list_splice_init(&pktgen_threads, &list);
+	mutex_unlock(&pktgen_thread_lock);
+	/* kthread_stop() is called without pktgen_thread_lock held */
+	list_for_each_safe(q, n, &list) {
+		t = list_entry(q, struct pktgen_thread, th_list);
+		list_del(&t->th_list);
+		kthread_stop(t->tsk);
+		kfree(t);
+	}
+
+	/* Un-register us from receiving netdevice events */
+	unregister_netdevice_notifier(&pktgen_notifier_block);
+
+	/* Clean up proc file system */
+	remove_proc_entry(PGCTRL, pg_proc_dir);
+	proc_net_remove(&init_net, PG_PROC_DIR);
+}
+
+module_init(pg_init);
+module_exit(pg_cleanup);
+/* Module metadata and the load-time defaults copied into each new device. */
+MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>");
+MODULE_DESCRIPTION("Packet Generator tool");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VERSION);
+module_param(pg_count_d, int, 0);
+MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject");
+module_param(pg_delay_d, int, 0);
+MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)");
+module_param(pg_clone_skb_d, int, 0);
+MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet");
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Enable debugging of pktgen module");
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 00000000..182236b2
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,131 @@
+/*
+ * NET		Generic infrastructure for Network protocols.
+ *
+ * Authors:	Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * 		From code originally in include/net/tcp.h
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <net/request_sock.h>
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80bytes on a 32bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show, that
+ * it is absolutely not enough even at 100conn/sec. 256 cures most
+ * of problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Note: Don't forget somaxconn, which may limit the backlog too.
+ */
+int sysctl_max_syn_backlog = 256;
+EXPORT_SYMBOL(sysctl_max_syn_backlog);
+
+int reqsk_queue_alloc(struct request_sock_queue *queue,
+		      unsigned int nr_table_entries)
+{
+	struct listen_sock *lopt;
+	size_t lopt_size;
+
+	/* Cap at the sysctl, floor at 8, then round (n + 1) up to 2^k */
+	nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+	nr_table_entries = max_t(u32, nr_table_entries, 8);
+	nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+	lopt_size = sizeof(struct listen_sock) +
+		    nr_table_entries * sizeof(struct request_sock *);
+	lopt = (lopt_size > PAGE_SIZE) ? vzalloc(lopt_size) :
+					 kzalloc(lopt_size, GFP_KERNEL);
+	if (!lopt)
+		return -ENOMEM;
+
+	/* max_qlen_log: smallest exponent >= 3 with 2^exp >= table size */
+	lopt->max_qlen_log = 3;
+	while ((1 << lopt->max_qlen_log) < nr_table_entries)
+		lopt->max_qlen_log++;
+
+	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
+	lopt->nr_table_entries = nr_table_entries;
+	rwlock_init(&queue->syn_wait_lock);
+	queue->rskq_accept_head = NULL;
+
+	write_lock_bh(&queue->syn_wait_lock);
+	queue->listen_opt = lopt;
+	write_unlock_bh(&queue->syn_wait_lock);
+	return 0;
+}
+
+void __reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+	struct listen_sock *lopt;
+	size_t table_bytes;
+
+	/*
+	 * Error recovery path only: no locking is needed here and
+	 * listen_opt is known to be non-NULL.
+	 */
+	lopt = queue->listen_opt;
+	table_bytes = lopt->nr_table_entries *
+		      sizeof(struct request_sock *);
+
+	/* Free with whichever allocator the total size selects */
+	if (sizeof(struct listen_sock) + table_bytes <= PAGE_SIZE)
+		kfree(lopt);
+	else
+		vfree(lopt);
+}
+
+static inline struct listen_sock *reqsk_queue_yank_listen_sk(
+		struct request_sock_queue *queue)
+{
+	struct listen_sock *detached;
+
+	/* Swap listen_opt for NULL under the write lock, keep the old one */
+	write_lock_bh(&queue->syn_wait_lock);
+	detached = queue->listen_opt;
+	queue->listen_opt = NULL;
+	write_unlock_bh(&queue->syn_wait_lock);
+	return detached;
+}
+
+void reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+	/* Detach listen_opt so that it is private to this function */
+	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+	size_t lopt_size = sizeof(struct listen_sock) +
+		lopt->nr_table_entries * sizeof(struct request_sock *);
+	struct request_sock *req;
+	unsigned int bucket;
+
+	if (lopt->qlen != 0) {
+		for (bucket = 0; bucket < lopt->nr_table_entries; bucket++) {
+			/* Pop from the head of the chain until it is empty */
+			for (req = lopt->syn_table[bucket]; req != NULL;
+			     req = lopt->syn_table[bucket]) {
+				lopt->syn_table[bucket] = req->dl_next;
+				lopt->qlen--;
+				reqsk_free(req);
+			}
+		}
+	}
+
+	WARN_ON(lopt->qlen != 0);
+	if (lopt_size <= PAGE_SIZE)
+		kfree(lopt);
+	else
+		vfree(lopt);
+}
+
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
new file mode 100644
index 00000000..abd936d8
--- /dev/null
+++ b/net/core/rtnetlink.c
@@ -0,0 +1,2030 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Routing netlink socket interface: protocol independent part.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *	Vitaly E. Lavrov		RTA_OK arithmetics was wrong.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/if_addr.h>
+#include <linux/pci.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/fib_rules.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+
+struct rtnl_link {
+	rtnl_doit_func		doit;	/* called for each request message */
+	rtnl_dumpit_func	dumpit;	/* called for each NLM_F_DUMP request */
+};
+
+static DEFINE_MUTEX(rtnl_mutex);
+
+void rtnl_lock(void)	/* acquire the global rtnl_mutex */
+{
+	mutex_lock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock);
+
+void __rtnl_unlock(void)	/* release rtnl_mutex and do nothing else */
+{
+	mutex_unlock(&rtnl_mutex);
+}
+
+void rtnl_unlock(void)
+{
+	/* netdev_run_todo() is relied on to release rtnl_mutex for us. */
+	netdev_run_todo();
+}
+EXPORT_SYMBOL(rtnl_unlock);
+
+int rtnl_trylock(void)	/* returns the mutex_trylock() result */
+{
+	return mutex_trylock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_trylock);
+
+int rtnl_is_locked(void)	/* returns the mutex_is_locked() result */
+{
+	return mutex_is_locked(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_is_locked);
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rtnl_is_held(void)	/* lockdep view of rtnl_mutex; built only with PROVE_LOCKING */
+{
+	return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+
+static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+
+static inline int rtm_msgindex(int msgtype)
+{
+	int msgindex = msgtype - RTM_BASE;	/* handler tables are indexed from RTM_BASE */
+
+	/*
+	 * msgindex < 0 implies someone tried to register a netlink
+	 * control code. msgindex >= RTM_NR_MSGTYPES may indicate that
+	 * the message type has not been added to linux/rtnetlink.h.
+	 */
+	BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES);
+
+	return msgindex;
+}
+
+static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex)
+{
+	struct rtnl_link *handlers = NULL;
+
+	/* Families beyond RTNL_FAMILY_MAX have no table of their own */
+	if (protocol <= RTNL_FAMILY_MAX)
+		handlers = rtnl_msg_handlers[protocol];
+
+	/* No family table, or no doit in it: use the PF_UNSPEC fallback */
+	if (!handlers || !handlers[msgindex].doit)
+		handlers = rtnl_msg_handlers[PF_UNSPEC];
+
+	return handlers ? handlers[msgindex].doit : NULL;
+}
+
+static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex)
+{
+	struct rtnl_link *handlers = NULL;
+
+	/* Families beyond RTNL_FAMILY_MAX have no table of their own */
+	if (protocol <= RTNL_FAMILY_MAX)
+		handlers = rtnl_msg_handlers[protocol];
+
+	/* No family table, or no dumpit in it: use the PF_UNSPEC fallback */
+	if (!handlers || !handlers[msgindex].dumpit)
+		handlers = rtnl_msg_handlers[PF_UNSPEC];
+
+	return handlers ? handlers[msgindex].dumpit : NULL;
+}
+
+/**
+ * __rtnl_register - Register a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ * @doit: Function pointer called for each request message
+ * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int __rtnl_register(int protocol, int msgtype,
+		    rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+{
+	struct rtnl_link *handlers;
+	int idx;
+
+	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+	idx = rtm_msgindex(msgtype);
+
+	handlers = rtnl_msg_handlers[protocol];
+	if (!handlers) {
+		/* First registration for this family: create its table */
+		handlers = kcalloc(RTM_NR_MSGTYPES, sizeof(*handlers),
+				   GFP_KERNEL);
+		if (!handlers)
+			return -ENOBUFS;
+		rtnl_msg_handlers[protocol] = handlers;
+	}
+
+	/* A NULL callback leaves any previously registered one in place */
+	if (doit)
+		handlers[idx].doit = doit;
+	if (dumpit)
+		handlers[idx].dumpit = dumpit;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_register);
+
+/**
+ * rtnl_register - Register a rtnetlink message type, panicking on failure
+ *
+ * All arguments are passed unchanged to __rtnl_register(); panic() is
+ * called if that returns a negative value. Failure is very unlikely: it
+ * can only come from lack of memory when allocating the table that
+ * stores all message handlers for a protocol. Meant for use in init
+ * functions where lack of memory implies no sense in continuing.
+ */
+void rtnl_register(int protocol, int msgtype,
+		   rtnl_doit_func doit, rtnl_dumpit_func dumpit)
+{
+	if (__rtnl_register(protocol, msgtype, doit, dumpit) < 0)
+		panic("Unable to register rtnetlink message handler, "
+		      "protocol = %d, message type = %d\n",
+		      protocol, msgtype);
+}
+EXPORT_SYMBOL_GPL(rtnl_register);
+
+/**
+ * rtnl_unregister - Unregister a rtnetlink message type
+ * @protocol: Protocol family or PF_UNSPEC
+ * @msgtype: rtnetlink message type
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int rtnl_unregister(int protocol, int msgtype)
+{
+	struct rtnl_link *handlers;
+	int idx;
+
+	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+	idx = rtm_msgindex(msgtype);
+	handlers = rtnl_msg_handlers[protocol];
+	if (!handlers)
+		return -ENOENT;	/* nothing was ever registered for this family */
+
+	handlers[idx].doit = NULL;
+	handlers[idx].dumpit = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister);
+
+/**
+ * rtnl_unregister_all - Unregister all rtnetlink message types of a protocol
+ * @protocol: Protocol family or PF_UNSPEC
+ *
+ * Drops every handler of the family at once by freeing its handler table
+ * and clearing the table pointer; a family with no table is left as is.
+ */
+void rtnl_unregister_all(int protocol)
+{
+	BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
+
+	kfree(rtnl_msg_handlers[protocol]);
+	rtnl_msg_handlers[protocol] = NULL;
+}
+EXPORT_SYMBOL_GPL(rtnl_unregister_all);
+
+static LIST_HEAD(link_ops);
+
+/**
+ * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * The caller must hold the rtnl_mutex. This function should be used
+ * by drivers that create devices during module initialization. It
+ * must be called before registering the devices.
+ *
+ * Returns 0; this implementation has no failure path.
+ */
+int __rtnl_link_register(struct rtnl_link_ops *ops)
+{
+	if (!ops->dellink)	/* no driver hook: fall back to the generic one */
+		ops->dellink = unregister_netdevice_queue;
+
+	list_add_tail(&ops->list, &link_ops);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_register);
+
+/**
+ * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
+ * @ops: struct rtnl_link_ops * to register
+ *
+ * Returns what __rtnl_link_register() returns, called under rtnl_lock().
+ */
+int rtnl_link_register(struct rtnl_link_ops *ops)
+{
+	int err;
+
+	rtnl_lock();
+	err = __rtnl_link_register(ops);
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtnl_link_register);
+
+static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
+{
+	struct net_device *dev;
+	LIST_HEAD(list_kill);	/* devices collected by the dellink() calls */
+
+	for_each_netdev(net, dev) {
+		if (dev->rtnl_link_ops == ops)	/* only devices created through @ops */
+			ops->dellink(dev, &list_kill);
+	}
+	unregister_netdevice_many(&list_kill);	/* one batched unregister per namespace */
+}
+
+/**
+ * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void __rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+	struct net *net;
+
+	for_each_net(net) {	/* remove the ops' devices in every namespace first */
+		__rtnl_kill_links(net, ops);
+	}
+	list_del(&ops->list);	/* then drop the ops from the link_ops list */
+}
+EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
+
+/**
+ * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
+ * @ops: struct rtnl_link_ops * to unregister
+ */
+void rtnl_link_unregister(struct rtnl_link_ops *ops)
+{
+	rtnl_lock();	/* locked wrapper around __rtnl_link_unregister() */
+	__rtnl_link_unregister(ops);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_link_unregister);
+
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+	const struct rtnl_link_ops *candidate;
+
+	/* Linear search of the registered link_ops list by kind string */
+	list_for_each_entry(candidate, &link_ops, list)
+		if (strcmp(candidate->kind, kind) == 0)
+			return candidate;
+	return NULL;
+}
+
+static size_t rtnl_link_get_size(const struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+	size_t total;
+
+	if (ops == NULL)
+		return 0;
+
+	total = nla_total_size(sizeof(struct nlattr));	/* IFLA_LINKINFO */
+	total += nla_total_size(strlen(ops->kind) + 1);	/* IFLA_INFO_KIND */
+	if (ops->get_size) {
+		/* IFLA_INFO_DATA nest header plus the nested data */
+		total += nla_total_size(sizeof(struct nlattr));
+		total += ops->get_size(dev);
+	}
+
+	/* IFLA_INFO_XSTATS */
+	if (ops->get_xstats_size)
+		total += nla_total_size(ops->get_xstats_size(dev));
+
+	return total;
+}
+
+static LIST_HEAD(rtnl_af_ops);
+
+static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+{
+	const struct rtnl_af_ops *candidate;
+
+	/* Linear search of the registered af ops for a matching family */
+	list_for_each_entry(candidate, &rtnl_af_ops, list)
+		if (candidate->family == family)
+			return candidate;
+
+	return NULL;
+}
+
+/**
+ * __rtnl_af_register - Register rtnl_af_ops with rtnetlink.
+ * @ops: struct rtnl_af_ops * to register
+ *
+ * The caller must hold the rtnl_mutex.
+ *
+ * Returns 0; this implementation has no failure path.
+ */
+int __rtnl_af_register(struct rtnl_af_ops *ops)
+{
+	list_add_tail(&ops->list, &rtnl_af_ops);	/* append to the global af ops list */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__rtnl_af_register);
+
+/**
+ * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
+ * @ops: struct rtnl_af_ops * to register
+ *
+ * Returns what __rtnl_af_register() returns, called under rtnl_lock().
+ */
+int rtnl_af_register(struct rtnl_af_ops *ops)
+{
+	int err;
+
+	rtnl_lock();
+	err = __rtnl_af_register(ops);
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtnl_af_register);
+
+/**
+ * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+void __rtnl_af_unregister(struct rtnl_af_ops *ops)
+{
+	list_del(&ops->list);	/* unlink from the global af ops list */
+}
+EXPORT_SYMBOL_GPL(__rtnl_af_unregister);
+
+/**
+ * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
+ * @ops: struct rtnl_af_ops * to unregister
+ */
+void rtnl_af_unregister(struct rtnl_af_ops *ops)
+{
+	rtnl_lock();	/* locked wrapper around __rtnl_af_unregister() */
+	__rtnl_af_unregister(ops);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(rtnl_af_unregister);
+
+static size_t rtnl_link_get_af_size(const struct net_device *dev)
+{
+	/* Start with the IFLA_AF_SPEC nest header */
+	size_t total = nla_total_size(sizeof(struct nlattr));
+	struct rtnl_af_ops *af;
+
+	/* Families without a size callback contribute nothing */
+	list_for_each_entry(af, &rtnl_af_ops, list) {
+		if (!af->get_link_af_size)
+			continue;
+
+		/* One AF_* nest header plus that family's nested data */
+		total += nla_total_size(sizeof(struct nlattr));
+		total += af->get_link_af_size(dev);
+	}
+
+	return total;
+}
+
+static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
+	struct nlattr *linkinfo, *data;
+	int err = -EMSGSIZE;
+
+	/* Emit IFLA_LINKINFO { KIND, xstats, DATA }; returns 0 or a negative error */
+	linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+	if (linkinfo == NULL)
+		goto out;
+	if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
+		goto err_cancel_link;
+	if (ops->fill_xstats) {
+		err = ops->fill_xstats(skb, dev);
+		if (err < 0)
+			goto err_cancel_link;
+	}
+	if (ops->fill_info) {
+		err = -EMSGSIZE;	/* fill_xstats() may have left err >= 0 */
+		data = nla_nest_start(skb, IFLA_INFO_DATA);
+		if (data == NULL)
+			goto err_cancel_link;
+		err = ops->fill_info(skb, dev);
+		if (err < 0)
+			goto err_cancel_data;
+		nla_nest_end(skb, data);
+	}
+	nla_nest_end(skb, linkinfo);
+	return 0;
+
+err_cancel_data:
+	nla_nest_cancel(skb, data);
+err_cancel_link:
+	nla_nest_cancel(skb, linkinfo);
+out:
+	return err;
+}
+
+static const int rtm_min[RTM_NR_FAMILIES] =	/* per message family: NLMSG_LENGTH of its fixed header struct */
+{
+	[RTM_FAM(RTM_NEWLINK)]      = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+	[RTM_FAM(RTM_NEWADDR)]      = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
+	[RTM_FAM(RTM_NEWROUTE)]     = NLMSG_LENGTH(sizeof(struct rtmsg)),
+	[RTM_FAM(RTM_NEWRULE)]      = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
+	[RTM_FAM(RTM_NEWQDISC)]     = NLMSG_LENGTH(sizeof(struct tcmsg)),
+	[RTM_FAM(RTM_NEWTCLASS)]    = NLMSG_LENGTH(sizeof(struct tcmsg)),
+	[RTM_FAM(RTM_NEWTFILTER)]   = NLMSG_LENGTH(sizeof(struct tcmsg)),
+	[RTM_FAM(RTM_NEWACTION)]    = NLMSG_LENGTH(sizeof(struct tcamsg)),
+	[RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+	[RTM_FAM(RTM_GETANYCAST)]   = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+};
+
+static const int rta_max[RTM_NR_FAMILIES] =	/* per message family: its *_MAX attribute constant */
+{
+	[RTM_FAM(RTM_NEWLINK)]      = IFLA_MAX,
+	[RTM_FAM(RTM_NEWADDR)]      = IFA_MAX,
+	[RTM_FAM(RTM_NEWROUTE)]     = RTA_MAX,
+	[RTM_FAM(RTM_NEWRULE)]      = FRA_MAX,
+	[RTM_FAM(RTM_NEWQDISC)]     = TCA_MAX,
+	[RTM_FAM(RTM_NEWTCLASS)]    = TCA_MAX,
+	[RTM_FAM(RTM_NEWTFILTER)]   = TCA_MAX,
+	[RTM_FAM(RTM_NEWACTION)]    = TCAA_MAX,
+};
+
+void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
+{
+	struct rtattr *rta;
+	int size = RTA_LENGTH(attrlen);	/* unaligned length, stored in rta_len */
+
+	rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size));	/* append the aligned size to the skb */
+	rta->rta_type = attrtype;
+	rta->rta_len = size;
+	memcpy(RTA_DATA(rta), data, attrlen);
+	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);	/* zero the alignment padding */
+}
+EXPORT_SYMBOL(__rta_fill);
+
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
+{
+	struct sock *rtnl = net->rtnl;
+
+	NETLINK_CB(skb).dst_group = group;
+	/* An echo takes its own reference: the skb is then sent twice */
+	if (echo)
+		atomic_inc(&skb->users);
+	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+
+	/* Only the echo unicast contributes to the return value */
+	return echo ? netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT) : 0;
+}
+
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{
+	/* Send through the namespace's rtnetlink socket and hand back */
+	/* whatever nlmsg_unicast() returns. */
+	return nlmsg_unicast(net->rtnl, skb, pid);
+}
+EXPORT_SYMBOL(rtnl_unicast);
+
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
+		 struct nlmsghdr *nlh, gfp_t flags)
+{
+	struct sock *rtnl = net->rtnl;
+	/* Without a request header the report flag stays 0 */
+	int report = nlh ? nlmsg_report(nlh) : 0;
+
+	/* pid, group and the allocation flags are handed through */
+	/* to nlmsg_notify() untouched. */
+	nlmsg_notify(rtnl, skb, pid, group, report, flags);
+}
+EXPORT_SYMBOL(rtnl_notify);
+
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+	/* Forward the error for @group to netlink_set_err() on the */
+	/* namespace's rtnetlink socket; the pid argument is fixed at 0. */
+	netlink_set_err(net->rtnl, 0, group, error);
+}
+EXPORT_SYMBOL(rtnl_set_sk_err);
+
+int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
+{
+	struct nlattr *mx;
+	int i, valid = 0;	/* valid counts the non-zero metrics emitted */
+
+	mx = nla_nest_start(skb, RTA_METRICS);
+	if (mx == NULL)
+		return -ENOBUFS;
+
+	for (i = 0; i < RTAX_MAX; i++) {
+		if (metrics[i]) {
+			valid++;
+			NLA_PUT_U32(skb, i+1, metrics[i]);	/* type is index + 1; failure exits via nla_put_failure */
+		}
+	}
+
+	if (!valid) {	/* all metrics zero: drop the empty nest and report success */
+		nla_nest_cancel(skb, mx);
+		return 0;
+	}
+
+	return nla_nest_end(skb, mx);
+
+nla_put_failure:
+	nla_nest_cancel(skb, mx);
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(rtnetlink_put_metrics);
+
+int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
+		       u32 ts, u32 tsage, long expires, u32 error)
+{
+	struct rta_cacheinfo ci = {	/* members not named here start out as zero */
+		.rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse),
+		.rta_used = dst->__use,
+		.rta_clntref = atomic_read(&(dst->__refcnt)),
+		.rta_error = error,
+		.rta_id =  id,
+		.rta_ts = ts,
+		.rta_tsage = tsage,
+	};
+
+	if (expires)	/* rta_expires keeps its zero initialiser when expires is 0 */
+		ci.rta_expires = jiffies_to_clock_t(expires);
+
+	return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+}
+EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+
+static void set_operstate(struct net_device *dev, unsigned char transition)
+{
+	unsigned char cur = dev->operstate;
+	unsigned char next = cur;
+
+	if (transition == IF_OPER_UP) {
+		/* Only leave DORMANT/UNKNOWN, and only if not dormant */
+		if ((cur == IF_OPER_DORMANT || cur == IF_OPER_UNKNOWN) &&
+		    !netif_dormant(dev))
+			next = IF_OPER_UP;
+	} else if (transition == IF_OPER_DORMANT) {
+		/* Only UP/UNKNOWN may become DORMANT */
+		if (cur == IF_OPER_UP || cur == IF_OPER_UNKNOWN)
+			next = IF_OPER_DORMANT;
+	}
+
+	/* Any other transition value leaves the state untouched */
+	if (dev->operstate == next)
+		return;
+
+	/* Publish the new state under dev_base_lock, then notify */
+	write_lock_bh(&dev_base_lock);
+	dev->operstate = next;
+	write_unlock_bh(&dev_base_lock);
+	netdev_state_change(dev);
+}
+
+static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
+					   const struct ifinfomsg *ifm)
+{
+	unsigned int mask = ifm->ifi_change;
+
+	/* bugwards compatibility: ifi_change == 0 is treated as ~0 */
+	if (!mask)
+		return ifm->ifi_flags;
+
+	/* Take the masked bits from the message, the rest from the device */
+	return (ifm->ifi_flags & mask) | (dev->flags & ~mask);
+}
+
+static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
+				 const struct rtnl_link_stats64 *b)
+{
+	a->rx_packets = b->rx_packets;	/* member-by-member copy from the stats64 layout */
+	a->tx_packets = b->tx_packets;
+	a->rx_bytes = b->rx_bytes;
+	a->tx_bytes = b->tx_bytes;
+	a->rx_errors = b->rx_errors;
+	a->tx_errors = b->tx_errors;
+	a->rx_dropped = b->rx_dropped;
+	a->tx_dropped = b->tx_dropped;
+
+	a->multicast = b->multicast;
+	a->collisions = b->collisions;
+
+	a->rx_length_errors = b->rx_length_errors;	/* receive error breakdown */
+	a->rx_over_errors = b->rx_over_errors;
+	a->rx_crc_errors = b->rx_crc_errors;
+	a->rx_frame_errors = b->rx_frame_errors;
+	a->rx_fifo_errors = b->rx_fifo_errors;
+	a->rx_missed_errors = b->rx_missed_errors;
+
+	a->tx_aborted_errors = b->tx_aborted_errors;	/* transmit error breakdown */
+	a->tx_carrier_errors = b->tx_carrier_errors;
+	a->tx_fifo_errors = b->tx_fifo_errors;
+	a->tx_heartbeat_errors = b->tx_heartbeat_errors;
+	a->tx_window_errors = b->tx_window_errors;
+
+	a->rx_compressed = b->rx_compressed;
+	a->tx_compressed = b->tx_compressed;
+}
+
+static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
+{
+	memcpy(v, b, sizeof(*b));	/* v is untyped: the caller passes attribute payload space */
+}
+
+/* All VF info */
+static inline int rtnl_vfinfo_size(const struct net_device *dev)
+{
+	size_t size;
+	int num_vfs;
+
+	if (!dev->dev.parent || !dev_is_pci(dev->dev.parent))
+		return 0;
+	num_vfs = dev_num_vf(dev->dev.parent);
+	size = nla_total_size(sizeof(struct nlattr));
+	size += nla_total_size(num_vfs * sizeof(struct nlattr));
+	size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) +
+			   nla_total_size(sizeof(struct ifla_vf_vlan)) +
+			   nla_total_size(sizeof(struct ifla_vf_tx_rate)));
+	return size;
+}
+
+static size_t rtnl_port_size(const struct net_device *dev)
+{
+	size_t port_size = nla_total_size(4)		/* PORT_VF */
+		+ nla_total_size(PORT_PROFILE_MAX)	/* PORT_PROFILE */
+		+ nla_total_size(sizeof(struct ifla_port_vsi))
+							/* PORT_VSI_TYPE */
+		+ nla_total_size(PORT_UUID_MAX)		/* PORT_INSTANCE_UUID */
+		+ nla_total_size(PORT_UUID_MAX)		/* PORT_HOST_UUID */
+		+ nla_total_size(1)			/* PORT_VDP_REQUEST */
+		+ nla_total_size(2);			/* PORT_VDP_RESPONSE */
+	size_t vf_ports_size = nla_total_size(sizeof(struct nlattr));	/* IFLA_VF_PORTS nest header */
+	size_t vf_port_size = nla_total_size(sizeof(struct nlattr))	/* one per-VF nest */
+		+ port_size;
+	size_t port_self_size = nla_total_size(sizeof(struct nlattr))	/* IFLA_PORT_SELF nest */
+		+ port_size;
+
+	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)	/* same guard as rtnl_port_fill() */
+		return 0;
+	if (dev_num_vf(dev->dev.parent))
+		return port_self_size + vf_ports_size +
+			vf_port_size * dev_num_vf(dev->dev.parent);
+	else
+		return port_self_size;
+}
+
+static noinline size_t if_nlmsg_size(const struct net_device *dev)
+{
+	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+	       + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+	       + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+	       + nla_total_size(sizeof(struct rtnl_link_ifmap)) /* IFLA_MAP */
+	       + nla_total_size(sizeof(struct rtnl_link_stats)) /* IFLA_STATS */
+	       + nla_total_size(sizeof(struct rtnl_link_stats64)) /* IFLA_STATS64 */
+	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+	       + nla_total_size(4) /* IFLA_TXQLEN */
+	       + nla_total_size(4) /* IFLA_WEIGHT */
+	       + nla_total_size(4) /* IFLA_MTU */
+	       + nla_total_size(4) /* IFLA_LINK */
+	       + nla_total_size(4) /* IFLA_MASTER */
+	       + nla_total_size(1) /* IFLA_OPERSTATE */
+	       + nla_total_size(1) /* IFLA_LINKMODE */
+	       + nla_total_size(4) /* IFLA_NUM_VF */
+	       + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
+	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+	       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
+}
+
+static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	struct nlattr *vf_ports;
+	struct nlattr *vf_port;
+	int vf;
+	int err;
+
+	vf_ports = nla_nest_start(skb, IFLA_VF_PORTS);	/* outer nest holding one IFLA_VF_PORT per VF */
+	if (!vf_ports)
+		return -EMSGSIZE;
+
+	for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
+		vf_port = nla_nest_start(skb, IFLA_VF_PORT);
+		if (!vf_port)
+			goto nla_put_failure;
+		NLA_PUT_U32(skb, IFLA_PORT_VF, vf);	/* failure exits via nla_put_failure */
+		err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
+		if (err == -EMSGSIZE)
+			goto nla_put_failure;
+		if (err) {	/* any other driver error: skip just this VF */
+			nla_nest_cancel(skb, vf_port);
+			continue;
+		}
+		nla_nest_end(skb, vf_port);
+	}
+
+	nla_nest_end(skb, vf_ports);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, vf_ports);	/* cancelling the outer nest drops everything added above */
+	return -EMSGSIZE;
+}
+
+static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	struct nlattr *nest;
+	int rc;
+
+	nest = nla_nest_start(skb, IFLA_PORT_SELF);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	rc = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
+	if (rc == 0) {
+		nla_nest_end(skb, nest);
+		return 0;
+	}
+
+	/* Driver error: drop the nest; only -EMSGSIZE is passed up */
+	nla_nest_cancel(skb, nest);
+	return (rc == -EMSGSIZE) ? rc : 0;
+}
+
+static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	int err;
+
+	/* Nothing to report without the driver hook or a parent device */
+	if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent)
+		return 0;
+
+	/* IFLA_PORT_SELF always comes first */
+	err = rtnl_port_self_fill(skb, dev);
+	if (err)
+		return err;
+
+	/* The per-VF list is only emitted when the parent reports VFs */
+	if (!dev_num_vf(dev->dev.parent))
+		return 0;
+
+	return rtnl_vf_ports_fill(skb, dev);
+}
+
+static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+			    int type, u32 pid, u32 seq, u32 change,
+			    unsigned int flags)
+{
+	struct ifinfomsg *ifm;
+	struct nlmsghdr *nlh;
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *stats;
+	struct nlattr *attr, *af_spec;
+	struct rtnl_af_ops *af_ops;
+
+	ASSERT_RTNL();
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+	/* One link message: ifinfomsg header first, then the IFLA_* attributes */
+	ifm = nlmsg_data(nlh);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->__ifi_pad = 0;
+	ifm->ifi_type = dev->type;
+	ifm->ifi_index = dev->ifindex;
+	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_change = change;
+
+	NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);	/* NLA_PUT*() exit via nla_put_failure */
+	NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len);
+	NLA_PUT_U8(skb, IFLA_OPERSTATE,
+		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
+	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
+	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U32(skb, IFLA_GROUP, dev->group);
+
+	if (dev->ifindex != dev->iflink)
+		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+
+	if (dev->master)
+		NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex);
+
+	if (dev->qdisc)
+		NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id);
+
+	if (dev->ifalias)
+		NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias);
+
+	if (1) {
+		struct rtnl_link_ifmap map = {
+			.mem_start   = dev->mem_start,
+			.mem_end     = dev->mem_end,
+			.base_addr   = dev->base_addr,
+			.irq         = dev->irq,
+			.dma         = dev->dma,
+			.port        = dev->if_port,
+		};
+		NLA_PUT(skb, IFLA_MAP, sizeof(map), &map);
+	}
+
+	if (dev->addr_len) {
+		NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+		NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+	}
+
+	attr = nla_reserve(skb, IFLA_STATS,
+			sizeof(struct rtnl_link_stats));
+	if (attr == NULL)
+		goto nla_put_failure;
+
+	stats = dev_get_stats(dev, &temp);
+	copy_rtnl_link_stats(nla_data(attr), stats);
+
+	attr = nla_reserve(skb, IFLA_STATS64,
+			sizeof(struct rtnl_link_stats64));
+	if (attr == NULL)
+		goto nla_put_failure;
+	copy_rtnl_link_stats64(nla_data(attr), stats);
+
+	if (dev->dev.parent)
+		NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
+
+	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
+		int i;
+		struct nlattr *vfinfo, *vf;
+		int num_vfs = dev_num_vf(dev->dev.parent);
+
+		vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+		if (!vfinfo)
+			goto nla_put_failure;
+		for (i = 0; i < num_vfs; i++) {
+			struct ifla_vf_info ivi;
+			struct ifla_vf_mac vf_mac;
+			struct ifla_vf_vlan vf_vlan;
+			struct ifla_vf_tx_rate vf_tx_rate;
+			/* Drivers may fill ivi only partly: never copy stale stack bytes out */
+			memset(&ivi, 0, sizeof(ivi));
+			if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
+				break;
+			vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
+			memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+			vf_vlan.vlan = ivi.vlan;
+			vf_vlan.qos = ivi.qos;
+			vf_tx_rate.rate = ivi.tx_rate;
+			vf = nla_nest_start(skb, IFLA_VF_INFO);
+			if (!vf) {
+				nla_nest_cancel(skb, vfinfo);
+				goto nla_put_failure;
+			}
+			NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
+			NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
+			NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
+			nla_nest_end(skb, vf);
+		}
+		nla_nest_end(skb, vfinfo);
+	}
+
+	if (rtnl_port_fill(skb, dev))
+		goto nla_put_failure;
+
+	if (dev->rtnl_link_ops) {
+		if (rtnl_link_fill(skb, dev) < 0)
+			goto nla_put_failure;
+	}
+
+	if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
+		goto nla_put_failure;
+
+	list_for_each_entry(af_ops, &rtnl_af_ops, list) {
+		if (af_ops->fill_link_af) {
+			struct nlattr *af;
+			int err;
+
+			if (!(af = nla_nest_start(skb, af_ops->family)))
+				goto nla_put_failure;
+
+			err = af_ops->fill_link_af(skb, dev);
+
+			/*
+			 * Caller may return ENODATA to indicate that there
+			 * was no data to be dumped. This is not an error, it
+			 * means we should trim the attribute header and
+			 * continue; a cancelled nest must not be ended.
+			 */
+			if (err == -ENODATA)
+				nla_nest_cancel(skb, af);
+			else if (err < 0)
+				goto nla_put_failure;
+			else
+				nla_nest_end(skb, af);
+		}
+	}
+
+	nla_nest_end(skb, af_spec);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;
+	int idx = 0, s_idx;
+	struct net_device *dev;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	s_h = cb->args[0];	/* resume point: hash bucket ... */
+	s_idx = cb->args[1];	/* ... and position inside that bucket */
+
+	rcu_read_lock();
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {	/* s_idx only applies to the first bucket */
+		idx = 0;
+		head = &net->dev_index_head[h];
+		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+			if (idx < s_idx)	/* already dumped in an earlier call */
+				goto cont;
+			if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
+					     NETLINK_CB(cb->skb).pid,
+					     cb->nlh->nlmsg_seq, 0,
+					     NLM_F_MULTI) <= 0)
+				goto out;	/* stop here; idx is left pointing at this device */
+cont:
+			idx++;
+		}
+	}
+out:
+	rcu_read_unlock();
+	cb->args[1] = idx;	/* save the position for the next call */
+	cb->args[0] = h;
+
+	return skb->len;
+}
+
+const struct nla_policy ifla_policy[IFLA_MAX+1] = {	/* exported; IFLA_* types not listed keep a zeroed entry */
+	[IFLA_IFNAME]		= { .type = NLA_STRING, .len = IFNAMSIZ-1 },
+	[IFLA_ADDRESS]		= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+	[IFLA_BROADCAST]	= { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+	[IFLA_MAP]		= { .len = sizeof(struct rtnl_link_ifmap) },
+	[IFLA_MTU]		= { .type = NLA_U32 },
+	[IFLA_LINK]		= { .type = NLA_U32 },
+	[IFLA_MASTER]		= { .type = NLA_U32 },
+	[IFLA_TXQLEN]		= { .type = NLA_U32 },
+	[IFLA_WEIGHT]		= { .type = NLA_U32 },
+	[IFLA_OPERSTATE]	= { .type = NLA_U8 },
+	[IFLA_LINKMODE]		= { .type = NLA_U8 },
+	[IFLA_LINKINFO]		= { .type = NLA_NESTED },
+	[IFLA_NET_NS_PID]	= { .type = NLA_U32 },
+	[IFLA_NET_NS_FD]	= { .type = NLA_U32 },
+	[IFLA_IFALIAS]	        = { .type = NLA_STRING, .len = IFALIASZ-1 },
+	[IFLA_VFINFO_LIST]	= {. type = NLA_NESTED },
+	[IFLA_VF_PORTS]		= { .type = NLA_NESTED },
+	[IFLA_PORT_SELF]	= { .type = NLA_NESTED },
+	[IFLA_AF_SPEC]		= { .type = NLA_NESTED },
+};
+EXPORT_SYMBOL(ifla_policy);
+
+/* Policy for the attributes nested inside IFLA_LINKINFO (see rtnl_newlink()). */
+static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+	[IFLA_INFO_KIND] = { .type = NLA_STRING },
+	[IFLA_INFO_DATA] = { .type = NLA_NESTED },
+};
+
+/*
+ * Policy for the children of IFLA_VFINFO_LIST: each IFLA_VF_INFO is itself
+ * a nested container.
+ * NOTE(review): not referenced by the code visible in this part of the
+ * file - confirm whether it is used elsewhere.
+ */
+static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
+	[IFLA_VF_INFO] = { .type = NLA_NESTED },
+};
+
+/*
+ * Policy for the per-VF attributes carried inside IFLA_VF_INFO.
+ * NOTE(review): do_setvfinfo() walks these attributes by hand and does not
+ * pass this table to a parser - confirm whether it is used elsewhere.
+ */
+static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
+	[IFLA_VF_MAC] = {
+		.type = NLA_BINARY,
+		.len = sizeof(struct ifla_vf_mac),
+	},
+	[IFLA_VF_VLAN] = {
+		.type = NLA_BINARY,
+		.len = sizeof(struct ifla_vf_vlan),
+	},
+	[IFLA_VF_TX_RATE] = {
+		.type = NLA_BINARY,
+		.len = sizeof(struct ifla_vf_tx_rate),
+	},
+};
+
+/*
+ * Policy for IFLA_PORT_* attributes.  do_setlink() applies it to every
+ * IFLA_VF_PORT entry of IFLA_VF_PORTS and to IFLA_PORT_SELF.
+ */
+static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
+	[IFLA_PORT_VF] = { .type = NLA_U32 },
+	[IFLA_PORT_PROFILE] = {
+		.type = NLA_STRING,
+		.len = PORT_PROFILE_MAX,
+	},
+	[IFLA_PORT_VSI_TYPE] = {
+		.type = NLA_BINARY,
+		.len = sizeof(struct ifla_port_vsi),
+	},
+	[IFLA_PORT_INSTANCE_UUID] = {
+		.type = NLA_BINARY,
+		.len = PORT_UUID_MAX,
+	},
+	[IFLA_PORT_HOST_UUID] = {
+		.type = NLA_STRING,
+		.len = PORT_UUID_MAX,
+	},
+	[IFLA_PORT_REQUEST] = { .type = NLA_U8 },
+	[IFLA_PORT_RESPONSE] = { .type = NLA_U16 },
+};
+
+/*
+ * Work out which network namespace a link message refers to.
+ *
+ * IFLA_NET_NS_PID takes precedence over IFLA_NET_NS_FD; when neither is
+ * present a reference on @src_net is taken with get_net().  The result of
+ * the chosen lookup helper is returned unchanged; callers in this file
+ * test it with IS_ERR() and release it with put_net().
+ */
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+	if (tb[IFLA_NET_NS_PID])
+		return get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
+
+	if (tb[IFLA_NET_NS_FD])
+		return get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
+
+	return get_net(src_net);
+}
+EXPORT_SYMBOL(rtnl_link_get_net);
+
+/*
+ * Sanity-check a parsed link message before any of it is applied.
+ *
+ * @dev may be NULL (rtnl_newlink() calls this before it knows whether the
+ * device exists); the address length checks are skipped in that case.
+ *
+ * Returns 0 when the message is acceptable, -EINVAL for a hardware or
+ * broadcast address shorter than dev->addr_len, -EAFNOSUPPORT for an
+ * IFLA_AF_SPEC entry with no registered rtnl_af_ops, -EOPNOTSUPP when the
+ * family has no set_link_af handler, or the negative value returned by the
+ * family's validate_link_af hook.
+ */
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+{
+	struct nlattr *af;
+	int remaining;
+
+	if (dev) {
+		if (tb[IFLA_ADDRESS] &&
+		    nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+			return -EINVAL;
+
+		if (tb[IFLA_BROADCAST] &&
+		    nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+			return -EINVAL;
+	}
+
+	if (!tb[IFLA_AF_SPEC])
+		return 0;
+
+	nla_for_each_nested(af, tb[IFLA_AF_SPEC], remaining) {
+		const struct rtnl_af_ops *af_ops = rtnl_af_lookup(nla_type(af));
+		int err;
+
+		if (!af_ops)
+			return -EAFNOSUPPORT;
+		if (!af_ops->set_link_af)
+			return -EOPNOTSUPP;
+		if (!af_ops->validate_link_af)
+			continue;
+
+		err = af_ops->validate_link_af(dev, af);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply the attributes of one IFLA_VF_INFO container to @dev.
+ *
+ * Each child attribute is dispatched to the matching ndo_set_vf_* driver
+ * hook.  Processing stops at the first attribute that yields a non-zero
+ * result, and that result is returned.
+ *
+ * The payload length of every attribute is checked against the structure
+ * it is about to be interpreted as: the attributes are walked by hand here,
+ * so nothing else guarantees that a (possibly truncated) user-supplied
+ * attribute is large enough to be read.
+ *
+ * Returns 0 on success, -EINVAL for an empty container, an unknown
+ * attribute type or a too-short payload, -EOPNOTSUPP when the driver lacks
+ * the needed hook, or whatever the driver hook returned.
+ */
+static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
+{
+	int rem, err = -EINVAL;
+	struct nlattr *vf;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	nla_for_each_nested(vf, attr, rem) {
+		switch (nla_type(vf)) {
+		case IFLA_VF_MAC: {
+			struct ifla_vf_mac *ivm = nla_data(vf);
+
+			if (nla_len(vf) < (int)sizeof(*ivm)) {
+				err = -EINVAL;
+				break;
+			}
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_mac)
+				err = ops->ndo_set_vf_mac(dev, ivm->vf,
+							  ivm->mac);
+			break;
+		}
+		case IFLA_VF_VLAN: {
+			struct ifla_vf_vlan *ivv = nla_data(vf);
+
+			if (nla_len(vf) < (int)sizeof(*ivv)) {
+				err = -EINVAL;
+				break;
+			}
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_vlan)
+				err = ops->ndo_set_vf_vlan(dev, ivv->vf,
+							   ivv->vlan,
+							   ivv->qos);
+			break;
+		}
+		case IFLA_VF_TX_RATE: {
+			struct ifla_vf_tx_rate *ivt = nla_data(vf);
+
+			if (nla_len(vf) < (int)sizeof(*ivt)) {
+				err = -EINVAL;
+				break;
+			}
+			err = -EOPNOTSUPP;
+			if (ops->ndo_set_vf_tx_rate)
+				err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
+							      ivt->rate);
+			break;
+		}
+		default:
+			err = -EINVAL;
+			break;
+		}
+		/* The breaks above only leave the switch; stop the walk here. */
+		if (err)
+			break;
+	}
+	return err;
+}
+
+/*
+ * Make the device with index @ifindex the master of @dev.
+ *
+ * If @dev already has a master it is first detached through that master's
+ * ndo_del_slave hook, unless the current master already has the requested
+ * index, in which case nothing is done.  An @ifindex of 0 only detaches.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the needed ndo_del_slave or
+ * ndo_add_slave hook is missing, -EINVAL when no device with @ifindex
+ * exists in @dev's namespace, or the non-zero value returned by a hook.
+ */
+static int do_set_master(struct net_device *dev, int ifindex)
+{
+	struct net_device *old_master = dev->master;
+	struct net_device *new_master;
+	int err;
+
+	if (old_master) {
+		if (old_master->ifindex == ifindex)
+			return 0;
+		if (!old_master->netdev_ops->ndo_del_slave)
+			return -EOPNOTSUPP;
+
+		err = old_master->netdev_ops->ndo_del_slave(old_master, dev);
+		if (err)
+			return err;
+	}
+
+	if (!ifindex)
+		return 0;
+
+	new_master = __dev_get_by_index(dev_net(dev), ifindex);
+	if (!new_master)
+		return -EINVAL;
+	if (!new_master->netdev_ops->ndo_add_slave)
+		return -EOPNOTSUPP;
+
+	return new_master->netdev_ops->ndo_add_slave(new_master, dev);
+}
+
+/*
+ * Apply the attributes of a parsed link message to an existing device.
+ *
+ * @dev:      device to modify
+ * @ifm:      message header; its flags are applied when ifi_flags or
+ *            ifi_change is non-zero, and ifi_index > 0 together with a
+ *            non-empty @ifname requests a rename
+ * @tb:       attribute table indexed by IFLA_*
+ * @ifname:   requested name; also handed to dev_change_net_namespace().
+ *            rtnl_group_changelink() passes NULL here; it is only
+ *            dereferenced when ifi_index > 0, which that caller's path
+ *            excludes.
+ * @modified: non-zero when the caller has already changed the device; only
+ *            used to decide whether the "inconsistent configuration"
+ *            warning is printed on failure
+ *
+ * Attributes are applied one after another and nothing is rolled back: on
+ * failure the changes made so far stay in place.  A NETDEV_CHANGEADDR
+ * notification is sent whenever the hardware or broadcast address was
+ * updated, even if a later step failed.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+		      struct nlattr **tb, char *ifname, int modified)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int send_addr_notify = 0;
+	int err;
+
+	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) {
+		struct net *net = rtnl_link_get_net(dev_net(dev), tb);
+		if (IS_ERR(net)) {
+			err = PTR_ERR(net);
+			goto errout;
+		}
+		err = dev_change_net_namespace(dev, net, ifname);
+		put_net(net);
+		if (err)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_MAP]) {
+		struct rtnl_link_ifmap *u_map;
+		struct ifmap k_map;
+
+		if (!ops->ndo_set_config) {
+			err = -EOPNOTSUPP;
+			goto errout;
+		}
+
+		if (!netif_device_present(dev)) {
+			err = -ENODEV;
+			goto errout;
+		}
+
+		/* Convert the netlink layout into the struct the driver hook takes. */
+		u_map = nla_data(tb[IFLA_MAP]);
+		k_map.mem_start = (unsigned long) u_map->mem_start;
+		k_map.mem_end = (unsigned long) u_map->mem_end;
+		k_map.base_addr = (unsigned short) u_map->base_addr;
+		k_map.irq = (unsigned char) u_map->irq;
+		k_map.dma = (unsigned char) u_map->dma;
+		k_map.port = (unsigned char) u_map->port;
+
+		err = ops->ndo_set_config(dev, &k_map);
+		if (err < 0)
+			goto errout;
+
+		modified = 1;
+	}
+
+	if (tb[IFLA_ADDRESS]) {
+		struct sockaddr *sa;
+		int len;
+
+		if (!ops->ndo_set_mac_address) {
+			err = -EOPNOTSUPP;
+			goto errout;
+		}
+
+		if (!netif_device_present(dev)) {
+			err = -ENODEV;
+			goto errout;
+		}
+
+		/* Build a sockaddr just large enough for this device's address. */
+		len = sizeof(sa_family_t) + dev->addr_len;
+		sa = kmalloc(len, GFP_KERNEL);
+		if (!sa) {
+			err = -ENOMEM;
+			goto errout;
+		}
+		sa->sa_family = dev->type;
+		memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
+		       dev->addr_len);
+		err = ops->ndo_set_mac_address(dev, sa);
+		kfree(sa);
+		if (err)
+			goto errout;
+		send_addr_notify = 1;
+		modified = 1;
+	}
+
+	if (tb[IFLA_MTU]) {
+		err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_GROUP]) {
+		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+		modified = 1;
+	}
+
+	/*
+	 * Interface selected by interface index but interface
+	 * name provided implies that a name change has been
+	 * requested.
+	 */
+	if (ifm->ifi_index > 0 && ifname[0]) {
+		err = dev_change_name(dev, ifname);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_IFALIAS]) {
+		err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+				    nla_len(tb[IFLA_IFALIAS]));
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_BROADCAST]) {
+		nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
+		send_addr_notify = 1;
+	}
+
+	if (ifm->ifi_flags || ifm->ifi_change) {
+		err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+		if (err < 0)
+			goto errout;
+	}
+
+	if (tb[IFLA_MASTER]) {
+		err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
+		if (err)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_TXQLEN])
+		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+
+	if (tb[IFLA_OPERSTATE])
+		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+
+	if (tb[IFLA_LINKMODE]) {
+		write_lock_bh(&dev_base_lock);
+		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+		write_unlock_bh(&dev_base_lock);
+	}
+
+	if (tb[IFLA_VFINFO_LIST]) {
+		struct nlattr *attr;
+		int rem;
+		nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
+			if (nla_type(attr) != IFLA_VF_INFO) {
+				err = -EINVAL;
+				goto errout;
+			}
+			err = do_setvfinfo(dev, attr);
+			if (err < 0)
+				goto errout;
+			modified = 1;
+		}
+	}
+	/*
+	 * err may still be unset, or hold a non-negative value from an
+	 * earlier step, when none of the sections below runs.
+	 */
+	err = 0;
+
+	if (tb[IFLA_VF_PORTS]) {
+		struct nlattr *port[IFLA_PORT_MAX+1];
+		struct nlattr *attr;
+		int vf;
+		int rem;
+
+		err = -EOPNOTSUPP;
+		if (!ops->ndo_set_vf_port)
+			goto errout;
+
+		nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
+			/* Entries of any other type are silently skipped. */
+			if (nla_type(attr) != IFLA_VF_PORT)
+				continue;
+			err = nla_parse_nested(port, IFLA_PORT_MAX,
+				attr, ifla_port_policy);
+			if (err < 0)
+				goto errout;
+			if (!port[IFLA_PORT_VF]) {
+				err = -EOPNOTSUPP;
+				goto errout;
+			}
+			vf = nla_get_u32(port[IFLA_PORT_VF]);
+			err = ops->ndo_set_vf_port(dev, vf, port);
+			if (err < 0)
+				goto errout;
+			modified = 1;
+		}
+	}
+	/* Clears the -EOPNOTSUPP preset if the loop above applied nothing. */
+	err = 0;
+
+	if (tb[IFLA_PORT_SELF]) {
+		struct nlattr *port[IFLA_PORT_MAX+1];
+
+		err = nla_parse_nested(port, IFLA_PORT_MAX,
+			tb[IFLA_PORT_SELF], ifla_port_policy);
+		if (err < 0)
+			goto errout;
+
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_port)
+			err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+
+	if (tb[IFLA_AF_SPEC]) {
+		struct nlattr *af;
+		int rem;
+
+		nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
+			const struct rtnl_af_ops *af_ops;
+
+			/*
+			 * validate_linkmsg() rejects unknown families and
+			 * families without set_link_af; the callers visible
+			 * in this file run it before getting here.
+			 */
+			if (!(af_ops = rtnl_af_lookup(nla_type(af))))
+				BUG();
+
+			err = af_ops->set_link_af(dev, af);
+			if (err < 0)
+				goto errout;
+
+			modified = 1;
+		}
+	}
+	err = 0;
+
+errout:
+	if (err < 0 && modified && net_ratelimit())
+		printk(KERN_WARNING "A link change request failed with "
+		       "some changes committed already. Interface %s may "
+		       "have been left with an inconsistent configuration, "
+		       "please check.\n", dev->name);
+
+	if (send_addr_notify)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	return err;
+}
+
+/*
+ * RTM_SETLINK handler: change an existing device.
+ *
+ * The device is looked up by ifi_index when that is positive, otherwise by
+ * IFLA_IFNAME.  Returns -EINVAL when neither selector is given, -ENODEV
+ * when the device does not exist, a parse or validation error, or the
+ * result of do_setlink().
+ */
+static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFLA_MAX+1];
+	char ifname[IFNAMSIZ];
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	/* An empty name tells do_setlink() that no rename was requested. */
+	ifname[0] = '\0';
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0)
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	else if (tb[IFLA_IFNAME])
+		dev = __dev_get_by_name(net, ifname);
+	else
+		return -EINVAL;
+
+	if (!dev)
+		return -ENODEV;
+
+	err = validate_linkmsg(dev, tb);
+	if (err < 0)
+		return err;
+
+	return do_setlink(dev, ifm, tb, ifname, 0);
+}
+
+/*
+ * RTM_DELLINK handler: destroy a device through its rtnl_link_ops.
+ *
+ * The device is selected by ifi_index when positive, otherwise by
+ * IFLA_IFNAME.  Returns -EINVAL without a selector, -ENODEV when the
+ * device is not found, -EOPNOTSUPP when the device has no rtnl_link_ops,
+ * and 0 once the device has been queued and unregistered.
+ */
+static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFLA_MAX+1];
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	char ifname[IFNAMSIZ];
+	LIST_HEAD(list_kill);
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0) {
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	} else if (tb[IFLA_IFNAME]) {
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, ifname);
+	} else {
+		return -EINVAL;
+	}
+
+	if (!dev)
+		return -ENODEV;
+	if (!dev->rtnl_link_ops)
+		return -EOPNOTSUPP;
+
+	/* The link type queues the device(s) to remove on list_kill. */
+	dev->rtnl_link_ops->dellink(dev, &list_kill);
+	unregister_netdevice_many(&list_kill);
+	list_del(&list_kill);
+	return 0;
+}
+
+/*
+ * Finish setting up a newly created link.
+ *
+ * Applies the flags requested in @ifm (if @ifm is non-NULL and carries
+ * flags or a change mask), marks the device RTNL_LINK_INITIALIZED, sends
+ * an RTM_NEWLINK notification and reports the flag change relative to the
+ * flags the device had on entry.
+ *
+ * Returns 0, or the negative value from __dev_change_flags().
+ */
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+{
+	unsigned int flags_before = dev->flags;
+
+	if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
+		int rc = __dev_change_flags(dev,
+					    rtnl_dev_combine_flags(dev, ifm));
+
+		if (rc < 0)
+			return rc;
+	}
+
+	dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
+	__dev_notify_flags(dev, flags_before);
+
+	return 0;
+}
+EXPORT_SYMBOL(rtnl_configure_link);
+
+/*
+ * Allocate a net_device for link type @ops and pre-populate it from @tb.
+ *
+ * @src_net is passed to the optional ops->get_tx_queues hook; @net is the
+ * namespace the new device is assigned to.  The device is returned in the
+ * RTNL_LINK_INITIALIZING state and is not registered here.
+ *
+ * Returns the device, ERR_PTR(-ENOMEM) when allocation fails, or an
+ * ERR_PTR carrying the non-zero value from get_tx_queues.
+ */
+struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
+	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
+{
+	unsigned int real_num_queues = 1;
+	unsigned int num_queues = 1;
+	struct net_device *dev;
+
+	if (ops->get_tx_queues) {
+		int err = ops->get_tx_queues(src_net, tb, &num_queues,
+					     &real_num_queues);
+
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev_net_set(dev, net);
+	dev->rtnl_link_ops = ops;
+	dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+	dev->real_num_tx_queues = real_num_queues;
+
+	/* Copy over whichever optional attributes the request carried. */
+	if (tb[IFLA_MTU])
+		dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+	if (tb[IFLA_ADDRESS])
+		memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
+		       nla_len(tb[IFLA_ADDRESS]));
+	if (tb[IFLA_BROADCAST])
+		memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
+		       nla_len(tb[IFLA_BROADCAST]));
+	if (tb[IFLA_TXQLEN])
+		dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
+	if (tb[IFLA_OPERSTATE])
+		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
+	if (tb[IFLA_LINKMODE])
+		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+	if (tb[IFLA_GROUP])
+		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+
+	return dev;
+}
+EXPORT_SYMBOL(rtnl_create_link);
+
+/*
+ * Apply a link change to every device in @net whose group equals @group.
+ *
+ * do_setlink() is called with a NULL name and modified == 0.  The walk
+ * stops at the first device for which it fails and that error is returned;
+ * devices handled before it keep their changes.  Returns 0 otherwise,
+ * including when no device matched.
+ */
+static int rtnl_group_changelink(struct net *net, int group,
+		struct ifinfomsg *ifm,
+		struct nlattr **tb)
+{
+	struct net_device *dev;
+
+	for_each_netdev(net, dev) {
+		int err;
+
+		if (dev->group != group)
+			continue;
+
+		err = do_setlink(dev, ifm, tb, NULL, 0);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * RTM_NEWLINK handler: modify an existing device or create a new one.
+ *
+ * After parsing and validating the message:
+ *  - if the device (found by ifi_index, else by name) exists, NLM_F_EXCL
+ *    yields -EEXIST and NLM_F_REPLACE -EOPNOTSUPP; otherwise kind-specific
+ *    data goes to ops->changelink and the rest to do_setlink();
+ *  - if it does not exist and NLM_F_CREATE is not set, a request with
+ *    ifi_index == 0 and IFLA_GROUP is applied to the whole group, anything
+ *    else fails with -ENODEV;
+ *  - otherwise a new device of the requested kind is created, registered
+ *    (through ops->newlink when provided) and configured.
+ *
+ * With CONFIG_MODULES, an unknown kind triggers a module request with the
+ * rtnl lock dropped, after which the whole message is parsed again from
+ * the replay label.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	const struct rtnl_link_ops *ops;
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	char kind[MODULE_NAME_LEN];
+	char ifname[IFNAMSIZ];
+	struct nlattr *tb[IFLA_MAX+1];
+	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+	int err;
+
+#ifdef CONFIG_MODULES
+replay:
+#endif
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_IFNAME])
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		ifname[0] = '\0';
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0)
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	else {
+		if (ifname[0])
+			dev = __dev_get_by_name(net, ifname);
+		else
+			dev = NULL;
+	}
+
+	/* dev may be NULL here; validate_linkmsg() copes with that. */
+	err = validate_linkmsg(dev, tb);
+	if (err < 0)
+		return err;
+
+	if (tb[IFLA_LINKINFO]) {
+		err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
+				       tb[IFLA_LINKINFO], ifla_info_policy);
+		if (err < 0)
+			return err;
+	} else
+		memset(linkinfo, 0, sizeof(linkinfo));
+
+	if (linkinfo[IFLA_INFO_KIND]) {
+		nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+		ops = rtnl_link_ops_get(kind);
+	} else {
+		kind[0] = '\0';
+		ops = NULL;
+	}
+
+	/*
+	 * NOTE(review): the "if (1)" presumably only opens a scope so that
+	 * attr[] can be sized from ops->maxtype once ops is known.
+	 */
+	if (1) {
+		struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL;
+		struct net *dest_net;
+
+		if (ops) {
+			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+				err = nla_parse_nested(attr, ops->maxtype,
+						       linkinfo[IFLA_INFO_DATA],
+						       ops->policy);
+				if (err < 0)
+					return err;
+				data = attr;
+			}
+			if (ops->validate) {
+				err = ops->validate(tb, data);
+				if (err < 0)
+					return err;
+			}
+		}
+
+		/* Existing device: this is a change request. */
+		if (dev) {
+			int modified = 0;
+
+			if (nlh->nlmsg_flags & NLM_F_EXCL)
+				return -EEXIST;
+			if (nlh->nlmsg_flags & NLM_F_REPLACE)
+				return -EOPNOTSUPP;
+
+			if (linkinfo[IFLA_INFO_DATA]) {
+				/* Kind data must match the device's own link type. */
+				if (!ops || ops != dev->rtnl_link_ops ||
+				    !ops->changelink)
+					return -EOPNOTSUPP;
+
+				err = ops->changelink(dev, tb, data);
+				if (err < 0)
+					return err;
+				modified = 1;
+			}
+
+			return do_setlink(dev, ifm, tb, ifname, modified);
+		}
+
+		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+			if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+				return rtnl_group_changelink(net,
+						nla_get_u32(tb[IFLA_GROUP]),
+						ifm, tb);
+			return -ENODEV;
+		}
+
+		/* Creation: these selectors/attributes are not accepted. */
+		if (ifm->ifi_index)
+			return -EOPNOTSUPP;
+		if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO])
+			return -EOPNOTSUPP;
+
+		if (!ops) {
+#ifdef CONFIG_MODULES
+			if (kind[0]) {
+				/* The rtnl lock is released while the module is requested. */
+				__rtnl_unlock();
+				request_module("rtnl-link-%s", kind);
+				rtnl_lock();
+				ops = rtnl_link_ops_get(kind);
+				if (ops)
+					goto replay;
+			}
+#endif
+			return -EOPNOTSUPP;
+		}
+
+		/* No name given: use "<kind>%d" as a name template. */
+		if (!ifname[0])
+			snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+
+		dest_net = rtnl_link_get_net(net, tb);
+		if (IS_ERR(dest_net))
+			return PTR_ERR(dest_net);
+
+		dev = rtnl_create_link(net, dest_net, ifname, ops, tb);
+
+		if (IS_ERR(dev))
+			err = PTR_ERR(dev);
+		else if (ops->newlink)
+			err = ops->newlink(net, dev, tb, data);
+		else
+			err = register_netdevice(dev);
+
+		/* Registration failed on a device we did allocate: free it. */
+		if (err < 0 && !IS_ERR(dev))
+			free_netdev(dev);
+		if (err < 0)
+			goto out;
+
+		err = rtnl_configure_link(dev, ifm);
+		if (err < 0)
+			unregister_netdevice(dev);
+out:
+		/* Drop the reference obtained from rtnl_link_get_net(). */
+		put_net(dest_net);
+		return err;
+	}
+}
+
+/*
+ * RTM_GETLINK (non-dump) handler: describe a single device.
+ *
+ * The device is selected by ifi_index when positive, otherwise by
+ * IFLA_IFNAME.  The reply is an RTM_NEWLINK message unicast to the
+ * requester.
+ *
+ * Returns -EINVAL without a selector, -ENODEV for an unknown device,
+ * -ENOBUFS when the reply cannot be allocated, the error from
+ * rtnl_fill_ifinfo(), or the result of rtnl_unicast().
+ */
+static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFLA_MAX+1];
+	struct net_device *dev = NULL;
+	struct ifinfomsg *ifm;
+	char ifname[IFNAMSIZ];
+	struct sk_buff *reply;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0) {
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	} else if (tb[IFLA_IFNAME]) {
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, ifname);
+	} else {
+		return -EINVAL;
+	}
+
+	if (!dev)
+		return -ENODEV;
+
+	reply = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+	if (!reply)
+		return -ENOBUFS;
+
+	err = rtnl_fill_ifinfo(reply, dev, RTM_NEWLINK, NETLINK_CB(skb).pid,
+			       nlh->nlmsg_seq, 0, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in if_nlmsg_size */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(reply);
+		return err;
+	}
+
+	return rtnl_unicast(reply, net, NETLINK_CB(skb).pid);
+}
+
+/*
+ * Dump callback that fans a request out to the dumpit handler of every
+ * registered family except PF_PACKET.
+ *
+ * cb->family records the family to resume from (0 is treated as 1).
+ * cb->args[] is cleared whenever the walk moves on to a family after the
+ * resume point.  The walk stops at the first handler that returns
+ * non-zero.  Returns skb->len.
+ */
+static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int type = cb->nlh->nlmsg_type - RTM_BASE;
+	int resume = cb->family;
+	int fam;
+
+	if (resume == 0)
+		resume = 1;
+
+	for (fam = 1; fam <= RTNL_FAMILY_MAX; fam++) {
+		if (fam < resume || fam == PF_PACKET)
+			continue;
+		if (rtnl_msg_handlers[fam] == NULL ||
+		    rtnl_msg_handlers[fam][type].dumpit == NULL)
+			continue;
+
+		/* A later family starts with fresh per-dump state. */
+		if (fam > resume)
+			memset(&cb->args[0], 0, sizeof(cb->args));
+
+		if (rtnl_msg_handlers[fam][type].dumpit(skb, cb))
+			break;
+	}
+	cb->family = fam;
+
+	return skb->len;
+}
+
+/*
+ * Send a link message of the given @type about @dev to the RTNLGRP_LINK
+ * multicast group.  @change is passed through to rtnl_fill_ifinfo().
+ *
+ * If the message cannot be allocated or filled in, the error is recorded
+ * for the group's listeners with rtnl_set_sk_err() instead.
+ */
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
+{
+	struct net *net = dev_net(dev);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL);
+	if (!skb) {
+		err = -ENOBUFS;
+		goto report;
+	}
+
+	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0);
+	if (err >= 0) {
+		rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in if_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+report:
+	/* err is always negative on this path. */
+	rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+}
+
+/*
+ * Protected by the RTNL semaphore.
+ * rta_buf is the attribute pointer table that rtnetlink_rcv_msg() fills in
+ * and hands to doit handlers; rtattr_max is its number of entries.  Both
+ * are set up in rtnetlink_init().
+ */
+static struct rtattr **rta_buf;
+static int rtattr_max;
+
+/*
+ * Process one rtnetlink message.
+ *
+ * The message type, relative to RTM_BASE, is split into a size-table index
+ * (type >> 2) and a kind (type & 3).  Every kind other than 2 requires
+ * CAP_NET_ADMIN.  A kind-2 message with an NLM_F_DUMP flag bit set starts
+ * a dump; anything else has its attributes indexed into rta_buf and is
+ * passed to the family's doit handler.
+ *
+ * Returns 0 for a message too short to carry a struct rtgenmsg, -EPERM,
+ * -EOPNOTSUPP when the type is out of range or no handler is registered,
+ * -EINVAL for a short message or an out-of-range attribute type, or the
+ * handler's result.
+ */
+
+static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(skb->sk);
+	rtnl_doit_func doit;
+	int sz_idx, kind;
+	int min_len;
+	int family;
+	int type;
+	int err;
+
+	type = nlh->nlmsg_type;
+	if (type > RTM_MAX)
+		return -EOPNOTSUPP;
+
+	type -= RTM_BASE;
+
+	/* All the messages must have at least 1 byte length */
+	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg)))
+		return 0;
+
+	family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family;
+	sz_idx = type>>2;
+	kind = type&3;
+
+	if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+		struct sock *rtnl;
+		rtnl_dumpit_func dumpit;
+
+		dumpit = rtnl_get_dumpit(family, type);
+		if (dumpit == NULL)
+			return -EOPNOTSUPP;
+
+		/* The rtnl lock is dropped while the dump is started. */
+		__rtnl_unlock();
+		rtnl = net->rtnl;
+		err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
+		rtnl_lock();
+		return err;
+	}
+
+	memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *)));
+
+	min_len = rtm_min[sz_idx];
+	if (nlh->nlmsg_len < min_len)
+		return -EINVAL;
+
+	if (nlh->nlmsg_len > min_len) {
+		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+		struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len);
+
+		/* Index each attribute by type; slot 0 holds type 1. */
+		while (RTA_OK(attr, attrlen)) {
+			unsigned flavor = attr->rta_type;
+			if (flavor) {
+				if (flavor > rta_max[sz_idx])
+					return -EINVAL;
+				rta_buf[flavor-1] = attr;
+			}
+			attr = RTA_NEXT(attr, attrlen);
+		}
+	}
+
+	doit = rtnl_get_doit(family, type);
+	if (doit == NULL)
+		return -EOPNOTSUPP;
+
+	return doit(skb, nlh, (void *)&rta_buf[0]);
+}
+
+/*
+ * Input callback of the rtnetlink kernel socket (see rtnetlink_net_init()).
+ * Every message in @skb is handled by rtnetlink_rcv_msg() with the rtnl
+ * lock held.
+ */
+static void rtnetlink_rcv(struct sk_buff *skb)
+{
+	rtnl_lock();
+	netlink_rcv_skb(skb, &rtnetlink_rcv_msg);
+	rtnl_unlock();
+}
+
+/*
+ * Netdevice notifier: announce device events to RTNLGRP_LINK listeners.
+ *
+ * The events listed below produce no message from here; every other event
+ * results in an RTM_NEWLINK message with a zero change mask.  Always
+ * returns NOTIFY_DONE.
+ */
+static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	switch (event) {
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+	case NETDEV_PRE_UP:
+	case NETDEV_POST_INIT:
+	case NETDEV_REGISTER:
+	case NETDEV_CHANGE:
+	case NETDEV_PRE_TYPE_CHANGE:
+	case NETDEV_GOING_DOWN:
+	case NETDEV_UNREGISTER:
+	case NETDEV_UNREGISTER_BATCH:
+	case NETDEV_RELEASE:
+	case NETDEV_JOIN:
+		return NOTIFY_DONE;
+	}
+
+	rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
+	return NOTIFY_DONE;
+}
+
+/* Notifier registered in rtnetlink_init(); routes events to rtnetlink_event(). */
+static struct notifier_block rtnetlink_dev_notifier = {
+	.notifier_call	= rtnetlink_event,
+};
+
+
+/*
+ * Per-namespace setup: create the NETLINK_ROUTE kernel socket and store it
+ * in net->rtnl.  Returns 0, or -ENOMEM when the socket cannot be created
+ * (net->rtnl is left untouched in that case).
+ */
+static int __net_init rtnetlink_net_init(struct net *net)
+{
+	struct sock *sk = netlink_kernel_create(net, NETLINK_ROUTE,
+						RTNLGRP_MAX, rtnetlink_rcv,
+						&rtnl_mutex, THIS_MODULE);
+
+	if (sk == NULL)
+		return -ENOMEM;
+
+	net->rtnl = sk;
+	return 0;
+}
+
+/* Per-namespace teardown: release the rtnetlink socket and clear net->rtnl. */
+static void __net_exit rtnetlink_net_exit(struct net *net)
+{
+	netlink_kernel_release(net->rtnl);
+	net->rtnl = NULL;
+}
+
+/* Per-namespace init/exit hooks, registered in rtnetlink_init(). */
+static struct pernet_operations rtnetlink_net_ops = {
+	.init = rtnetlink_net_init,
+	.exit = rtnetlink_net_exit,
+};
+
+/*
+ * Boot-time initialisation of rtnetlink.
+ *
+ * Sizes and allocates rta_buf, registers the per-namespace socket
+ * operations and the netdevice notifier, and installs the PF_UNSPEC
+ * handlers for link messages and the generic address/route dumps.
+ * Panics if the buffer or the per-namespace operations cannot be set up.
+ */
+void __init rtnetlink_init(void)
+{
+	int i;
+
+	/* rta_buf must be able to hold the largest per-type attribute table. */
+	rtattr_max = 0;
+	for (i = 0; i < ARRAY_SIZE(rta_max); i++) {
+		if (rta_max[i] > rtattr_max)
+			rtattr_max = rta_max[i];
+	}
+
+	rta_buf = kmalloc(rtattr_max * sizeof(*rta_buf), GFP_KERNEL);
+	if (rta_buf == NULL)
+		panic("rtnetlink_init: cannot allocate rta_buf\n");
+
+	if (register_pernet_subsys(&rtnetlink_net_ops) != 0)
+		panic("rtnetlink_init: cannot initialize rtnetlink\n");
+
+	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
+	register_netdevice_notifier(&rtnetlink_dev_notifier);
+
+	/* Link messages. */
+	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo);
+	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL);
+	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL);
+
+	/* Family-agnostic dumps are fanned out to every family. */
+	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all);
+	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all);
+}
+
diff --git a/net/core/scm.c b/net/core/scm.c
new file mode 100644
index 00000000..811b53fb
--- /dev/null
+++ b/net/core/scm.c
@@ -0,0 +1,344 @@
+/* scm.c - Socket level control messages processing.
+ *
+ * Author:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *              Alignment and value checking mods by Craig Metz
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/compat.h>
+#include <net/scm.h>
+
+
+/*
+ *	Only allow a user to send credentials that they could set with
+ *	setu(g)id.
+ *
+ *	The pid must be the caller's own thread group id unless the caller
+ *	has CAP_SYS_ADMIN; the uid must be one of the caller's real,
+ *	effective or saved uids unless it has CAP_SETUID; likewise for the
+ *	gid and CAP_SETGID.  Returns 0 if allowed, -EPERM otherwise.
+ */
+
+static __inline__ int scm_check_creds(struct ucred *creds)
+{
+	const struct cred *cred = current_cred();
+
+	if (creds->pid != task_tgid_vnr(current) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (creds->uid != cred->uid && creds->uid != cred->euid &&
+	    creds->uid != cred->suid && !capable(CAP_SETUID))
+		return -EPERM;
+
+	if (creds->gid != cred->gid && creds->gid != cred->egid &&
+	    creds->gid != cred->sgid && !capable(CAP_SETGID))
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * Take references on the descriptors carried in an SCM_RIGHTS control
+ * message and append the files to *@fplp, allocating the list on first
+ * use.
+ *
+ * Returns the number of descriptors added (0 for an empty message),
+ * -EINVAL when the message or the accumulated list would exceed the
+ * limit, -ENOMEM when the list cannot be allocated, or -EBADF for an
+ * invalid descriptor.  Files added before a failure stay on the list; the
+ * caller visible in this file (__scm_send()) cleans up with
+ * scm_destroy().
+ */
+static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
+{
+	int *fds = (int *)CMSG_DATA(cmsg);
+	struct scm_fp_list *list = *fplp;
+	int nfds;
+	int n;
+
+	nfds = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) /
+		sizeof(int);
+	if (nfds <= 0)
+		return 0;
+	if (nfds > SCM_MAX_FD)
+		return -EINVAL;
+
+	if (list == NULL) {
+		list = kmalloc(sizeof(*list), GFP_KERNEL);
+		if (list == NULL)
+			return -ENOMEM;
+		*fplp = list;
+		list->count = 0;
+		list->max = SCM_MAX_FD;
+	}
+
+	if (list->count + nfds > list->max)
+		return -EINVAL;
+
+	/* Verify each descriptor and take a reference on its file. */
+	for (n = 0; n < nfds; n++) {
+		int fd = fds[n];
+		struct file *file;
+
+		if (fd < 0)
+			return -EBADF;
+		file = fget_raw(fd);
+		if (file == NULL)
+			return -EBADF;
+
+		list->fp[list->count] = file;
+		list->count++;
+	}
+
+	return nfds;
+}
+
+/*
+ * Drop the file references held by @scm and free its descriptor list.
+ *
+ * If the current task already has a work list installed
+ * (current->scm_work_list), the descriptor list is only queued on it.
+ * Otherwise a local work list is installed and drained: each queued list
+ * has its files released with fput() and is then freed.
+ * NOTE(review): presumably this lets a nested __scm_destroy() triggered
+ * by fput() be handled by the outer loop instead of recursing - confirm.
+ */
+void __scm_destroy(struct scm_cookie *scm)
+{
+	struct scm_fp_list *fpl = scm->fp;
+	int i;
+
+	if (fpl) {
+		scm->fp = NULL;
+		if (current->scm_work_list) {
+			list_add_tail(&fpl->list, current->scm_work_list);
+		} else {
+			LIST_HEAD(work_list);
+
+			current->scm_work_list = &work_list;
+
+			list_add(&fpl->list, &work_list);
+			/* Lists queued while we drain end up on work_list too. */
+			while (!list_empty(&work_list)) {
+				fpl = list_first_entry(&work_list, struct scm_fp_list, list);
+
+				list_del(&fpl->list);
+				for (i=fpl->count-1; i>=0; i--)
+					fput(fpl->fp[i]);
+				kfree(fpl);
+			}
+
+			current->scm_work_list = NULL;
+		}
+	}
+}
+EXPORT_SYMBOL(__scm_destroy);
+
+/*
+ * Parse the control messages of @msg into the cookie @p.
+ *
+ * Control messages whose level is not SOL_SOCKET are skipped.
+ * SCM_RIGHTS is only accepted on PF_UNIX sockets and adds file references
+ * to p->fp.  SCM_CREDENTIALS must carry exactly one struct ucred, which
+ * is checked by scm_check_creds() and may replace p->pid and p->cred.
+ * Any other SOL_SOCKET type is an error.
+ *
+ * Returns 0 on success.  On failure the cookie is released with
+ * scm_destroy() and a negative error code is returned.
+ */
+int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
+{
+	struct cmsghdr *cmsg;
+	int err;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
+	{
+		err = -EINVAL;
+
+		/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
+		/* The first check was omitted in <= 2.2.5. The reasoning was
+		   that parser checks cmsg_len in any case, so that
+		   additional check would be work duplication.
+		   But if cmsg_level is not SOL_SOCKET, we do not check
+		   for too short ancillary data object at all! Oops.
+		   OK, let's add it...
+		 */
+		if (!CMSG_OK(msg, cmsg))
+			goto error;
+
+		if (cmsg->cmsg_level != SOL_SOCKET)
+			continue;
+
+		switch (cmsg->cmsg_type)
+		{
+		case SCM_RIGHTS:
+			if (!sock->ops || sock->ops->family != PF_UNIX)
+				goto error;
+			err=scm_fp_copy(cmsg, &p->fp);
+			if (err<0)
+				goto error;
+			break;
+		case SCM_CREDENTIALS:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
+				goto error;
+			memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+			err = scm_check_creds(&p->creds);
+			if (err)
+				goto error;
+
+			/* Swap the pid reference if a different pid was supplied. */
+			if (pid_vnr(p->pid) != p->creds.pid) {
+				struct pid *pid;
+				err = -ESRCH;
+				pid = find_get_pid(p->creds.pid);
+				if (!pid)
+					goto error;
+				put_pid(p->pid);
+				p->pid = pid;
+			}
+
+			/* Likewise build new credentials if uid/gid differ. */
+			if ((p->cred->euid != p->creds.uid) ||
+				(p->cred->egid != p->creds.gid)) {
+				struct cred *cred;
+				err = -ENOMEM;
+				cred = prepare_creds();
+				if (!cred)
+					goto error;
+
+				cred->uid = cred->euid = p->creds.uid;
+				cred->gid = cred->egid = p->creds.gid;
+				put_cred(p->cred);
+				p->cred = cred;
+			}
+			break;
+		default:
+			goto error;
+		}
+	}
+
+	/* Do not keep an allocated but empty descriptor list around. */
+	if (p->fp && !p->fp->count)
+	{
+		kfree(p->fp);
+		p->fp = NULL;
+	}
+	return 0;
+
+error:
+	scm_destroy(p);
+	return err;
+}
+EXPORT_SYMBOL(__scm_send);
+
+/*
+ * Append one control message to the user control buffer of @msg.
+ *
+ * @level/@type fill the cmsghdr; @data/@len are the payload.  When the
+ * buffer is missing or too small for a header, MSG_CTRUNC is set and
+ * nothing is written.  When it is too small for the whole message, the
+ * message is truncated to fit and MSG_CTRUNC is set.  On success
+ * msg_control and msg_controllen are advanced past the message.
+ *
+ * Compat callers (MSG_CMSG_COMPAT) are handed to put_cmsg_compat().
+ *
+ * Returns 0 on success (including the truncated cases) or -EFAULT when
+ * copying to user space fails.
+ */
+int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
+{
+	struct cmsghdr __user *ucm
+		= (__force struct cmsghdr __user *)msg->msg_control;
+	struct cmsghdr hdr;
+	int cmlen = CMSG_LEN(len);
+	int advance;
+
+	if (msg->msg_flags & MSG_CMSG_COMPAT)
+		return put_cmsg_compat(msg, level, type, len, data);
+
+	if (ucm == NULL || msg->msg_controllen < sizeof(*ucm)) {
+		msg->msg_flags |= MSG_CTRUNC;
+		return 0; /* XXX: return error? check spec. */
+	}
+
+	if (msg->msg_controllen < cmlen) {
+		msg->msg_flags |= MSG_CTRUNC;
+		cmlen = msg->msg_controllen;
+	}
+
+	hdr.cmsg_level = level;
+	hdr.cmsg_type = type;
+	hdr.cmsg_len = cmlen;
+
+	if (copy_to_user(ucm, &hdr, sizeof hdr))
+		return -EFAULT;
+	if (copy_to_user(CMSG_DATA(ucm), data, cmlen - sizeof(struct cmsghdr)))
+		return -EFAULT;
+
+	/* Consume the aligned size, but never more than what is left. */
+	advance = CMSG_SPACE(len);
+	if (msg->msg_controllen < advance)
+		advance = msg->msg_controllen;
+	msg->msg_control += advance;
+	msg->msg_controllen -= advance;
+
+	return 0;
+}
+EXPORT_SYMBOL(put_cmsg);
+
+/*
+ * Install the files carried by @scm into the receiving process and
+ * describe them in an SCM_RIGHTS control message in @msg's user buffer.
+ *
+ * As many files as fit in the remaining control space are installed.  For
+ * each one a new descriptor is allocated (close-on-exec when
+ * MSG_CMSG_CLOEXEC is set) and its number is written to user space.
+ * Processing stops at the first failure.  MSG_CTRUNC is set when not
+ * every file could be delivered.  The descriptor list of @scm is always
+ * released at the end.
+ *
+ * Compat callers (MSG_CMSG_COMPAT) are handed to scm_detach_fds_compat().
+ */
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+	struct cmsghdr __user *cm
+		= (__force struct cmsghdr __user*)msg->msg_control;
+
+	int fdmax = 0;
+	int fdnum = scm->fp->count;
+	struct file **fp = scm->fp->fp;
+	int __user *cmfptr;
+	int err = 0, i;
+
+	if (MSG_CMSG_COMPAT & msg->msg_flags) {
+		scm_detach_fds_compat(msg, scm);
+		return;
+	}
+
+	/* How many descriptors fit after the header in the space left? */
+	if (msg->msg_controllen > sizeof(struct cmsghdr))
+		fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
+			 / sizeof(int));
+
+	if (fdnum < fdmax)
+		fdmax = fdnum;
+
+	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
+	     i++, cmfptr++)
+	{
+		int new_fd;
+		err = security_file_receive(fp[i]);
+		if (err)
+			break;
+		err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
+					  ? O_CLOEXEC : 0);
+		if (err < 0)
+			break;
+		new_fd = err;
+		err = put_user(new_fd, cmfptr);
+		if (err) {
+			/* The number never reached user space: give the fd back. */
+			put_unused_fd(new_fd);
+			break;
+		}
+		/* Bump the usage count and install the file. */
+		get_file(fp[i]);
+		fd_install(new_fd, fp[i]);
+	}
+
+	/* Write the header only if at least one descriptor was installed. */
+	if (i > 0)
+	{
+		int cmlen = CMSG_LEN(i*sizeof(int));
+		err = put_user(SOL_SOCKET, &cm->cmsg_level);
+		if (!err)
+			err = put_user(SCM_RIGHTS, &cm->cmsg_type);
+		if (!err)
+			err = put_user(cmlen, &cm->cmsg_len);
+		if (!err) {
+			cmlen = CMSG_SPACE(i*sizeof(int));
+			msg->msg_control += cmlen;
+			msg->msg_controllen -= cmlen;
+		}
+	}
+	if (i < fdnum || (fdnum && fdmax <= 0))
+		msg->msg_flags |= MSG_CTRUNC;
+
+	/*
+	 * All of the files that fit in the message have had their
+	 * usage counts incremented, so we just free the list.
+	 */
+	__scm_destroy(scm);
+}
+EXPORT_SYMBOL(scm_detach_fds);
+
+/*
+ * Duplicate a descriptor list.
+ *
+ * Only the header and the fpl->count entries in use are copied; an extra
+ * reference is taken on every file and the copy's max is set to its
+ * count.  Returns the copy, or NULL when @fpl is NULL or the allocation
+ * fails.
+ */
+struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
+{
+	struct scm_fp_list *copy;
+	int i;
+
+	if (fpl == NULL)
+		return NULL;
+
+	copy = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
+		       GFP_KERNEL);
+	if (copy == NULL)
+		return NULL;
+
+	for (i = 0; i < fpl->count; i++)
+		get_file(fpl->fp[i]);
+	copy->max = copy->count;
+
+	return copy;
+}
+EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
new file mode 100644
index 00000000..45329d7c
--- /dev/null
+++ b/net/core/secure_seq.c
@@ -0,0 +1,184 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cryptohash.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/random.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+
+#include <net/secure_seq.h>
+
+static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
+
+/*
+ * Fill the whole net_secret[] array with random bytes.  Always returns 0.
+ */
+static int __init net_secret_init(void)
+{
+	get_random_bytes(net_secret, sizeof(net_secret));
+	return 0;
+}
+late_initcall(net_secret_init);
+
+/*
+ * Add a real-time clock component to @seq.
+ *
+ *	As close as possible to RFC 793, which
+ *	suggests using a 250 kHz clock.
+ *	Further reading shows this assumes 2 Mb/s networks.
+ *	For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
+ *	For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
+ *	we also need to limit the resolution so that the u32 seq
+ *	overlaps less than one time per MSL (2 minutes).
+ *	Choosing a clock of 64 ns period is OK. (period of 274 s)
+ */
+static u32 seq_scale(u32 seq)
+{
+	s64 now_ns = ktime_to_ns(ktime_get_real());
+
+	/* Shifting right by 6 converts nanoseconds into 64 ns ticks. */
+	return seq + (now_ns >> 6);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Derive a TCP sequence number for an IPv6 four-tuple.
+ *
+ * @saddr and @daddr are each read as four 32-bit words.  The source address
+ * seeds the MD5 state; the destination address and both ports are added
+ * into the first five words of a copy of net_secret, which is then used as
+ * the MD5 message block.  The first digest word is passed through
+ * seq_scale() before being returned.
+ */
+__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+				   __be16 sport, __be16 dport)
+{
+	u32 secret[MD5_MESSAGE_BYTES / 4];
+	u32 hash[MD5_DIGEST_WORDS];
+	u32 i;
+
+	memcpy(hash, saddr, 16);
+	for (i = 0; i < 4; i++)
+		secret[i] = net_secret[i] + (__force u32)daddr[i];
+	/*
+	 * Widen the ports to u32 before shifting: a u16 operand is promoted
+	 * to int, and shifting a set bit into the sign position of an int
+	 * is undefined behaviour.  The resulting 32-bit value is unchanged.
+	 */
+	secret[4] = net_secret[4] +
+		(((__force u32)sport << 16) + (__force u32)dport);
+	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+		secret[i] = net_secret[i];
+
+	md5_transform(hash, secret);
+
+	return seq_scale(hash[0]);
+}
+EXPORT_SYMBOL(secure_tcpv6_sequence_number);
+
+/*
+ * Hash an IPv6 source address, destination address and destination port
+ * together with net_secret and return the first word of the MD5 state.
+ * @saddr and @daddr are each read as four 32-bit words.
+ */
+u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+			       __be16 dport)
+{
+	u32 block[MD5_MESSAGE_BYTES / 4];
+	u32 digest[MD5_DIGEST_WORDS];
+	u32 w;
+
+	/* Start from net_secret, then perturb its first five words. */
+	memcpy(block, net_secret, sizeof(block));
+	for (w = 0; w < 4; w++)
+		block[w] += (__force u32) daddr[w];
+	block[4] += (__force u32)dport;
+
+	memcpy(digest, saddr, 16);
+	md5_transform(digest, block);
+
+	return digest[0];
+}
+#endif
+
+#ifdef CONFIG_INET
+/*
+ * Hash an IPv4 destination address with net_secret.  The MD5 state is
+ * seeded with @daddr followed by net_secret[13..15]; the first word of the
+ * transformed state is returned.
+ */
+__u32 secure_ip_id(__be32 daddr)
+{
+	u32 digest[MD5_DIGEST_WORDS];
+
+	digest[0] = (__force __u32) daddr;
+	memcpy(&digest[1], &net_secret[13], 3 * sizeof(digest[0]));
+
+	md5_transform(digest, net_secret);
+	return digest[0];
+}
+
+/*
+ * Hash a 16-byte IPv6 destination address with net_secret and return the
+ * first word of the transformed MD5 state.
+ */
+__u32 secure_ipv6_id(const __be32 daddr[4])
+{
+	__u32 digest[4];
+
+	memcpy(digest, daddr, sizeof(digest));
+	md5_transform(digest, net_secret);
+
+	return digest[0];
+}
+
+/*
+ * Derive a TCP sequence number for an IPv4 four-tuple.
+ *
+ * The MD5 state is seeded with the two addresses, the two ports packed
+ * into one word, and net_secret[15]; net_secret is the message block.
+ * The first digest word is passed through seq_scale() before being
+ * returned.
+ */
+__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+				 __be16 sport, __be16 dport)
+{
+	u32 hash[MD5_DIGEST_WORDS];
+
+	hash[0] = (__force u32)saddr;
+	hash[1] = (__force u32)daddr;
+	/*
+	 * Widen to u32 before shifting: a u16 operand is promoted to int,
+	 * and shifting a set bit into the sign position of an int is
+	 * undefined behaviour.  The resulting 32-bit value is unchanged.
+	 */
+	hash[2] = ((__force u32)sport << 16) + (__force u32)dport;
+	hash[3] = net_secret[15];
+
+	md5_transform(hash, net_secret);
+
+	return seq_scale(hash[0]);
+}
+
+/*
+ * Hash an IPv4 source address, destination address and destination port
+ * with net_secret and return the first word of the transformed MD5 state.
+ */
+u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+{
+	u32 digest[MD5_DIGEST_WORDS];
+
+	digest[3] = net_secret[15];
+	digest[2] = (__force u32)dport ^ net_secret[14];
+	digest[1] = (__force u32)daddr;
+	digest[0] = (__force u32)saddr;
+
+	md5_transform(digest, net_secret);
+	return digest[0];
+}
+EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
+#endif
+
+#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
+/*
+ * Derive a 48-bit DCCP sequence number for an IPv4 four-tuple.
+ *
+ * The first two digest words are combined into a 64-bit value, the
+ * current real time in nanoseconds is added, and the result is masked to
+ * its low 48 bits.
+ */
+u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
+				__be16 sport, __be16 dport)
+{
+	u32 hash[MD5_DIGEST_WORDS];
+	u64 seq;
+
+	hash[0] = (__force u32)saddr;
+	hash[1] = (__force u32)daddr;
+	/*
+	 * Widen to u32 before shifting: a u16 operand is promoted to int,
+	 * and shifting a set bit into the sign position of an int is
+	 * undefined behaviour.  The resulting 32-bit value is unchanged.
+	 */
+	hash[2] = ((__force u32)sport << 16) + (__force u32)dport;
+	hash[3] = net_secret[15];
+
+	md5_transform(hash, net_secret);
+
+	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq += ktime_to_ns(ktime_get_real());
+	seq &= (1ull << 48) - 1;
+
+	return seq;
+}
+EXPORT_SYMBOL(secure_dccp_sequence_number);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Derive a 48-bit DCCP sequence number for an IPv6 four-tuple.
+ *
+ * @saddr and @daddr are each read as four 32-bit words.  The source
+ * address seeds the MD5 state; the destination address and both ports are
+ * added into the first five words of a copy of net_secret used as the
+ * message block.  The first two digest words are combined into a 64-bit
+ * value, the current real time in nanoseconds is added, and the result is
+ * masked to its low 48 bits.
+ */
+u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+				  __be16 sport, __be16 dport)
+{
+	u32 secret[MD5_MESSAGE_BYTES / 4];
+	u32 hash[MD5_DIGEST_WORDS];
+	u64 seq;
+	u32 i;
+
+	memcpy(hash, saddr, 16);
+	for (i = 0; i < 4; i++)
+		secret[i] = net_secret[i] + (__force u32)daddr[i];
+	/*
+	 * Widen the ports to u32 before shifting: a u16 operand is promoted
+	 * to int, and shifting a set bit into the sign position of an int
+	 * is undefined behaviour.  The resulting 32-bit value is unchanged.
+	 */
+	secret[4] = net_secret[4] +
+		(((__force u32)sport << 16) + (__force u32)dport);
+	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
+		secret[i] = net_secret[i];
+
+	md5_transform(hash, secret);
+
+	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq += ktime_to_ns(ktime_get_real());
+	seq &= (1ull << 48) - 1;
+
+	return seq;
+}
+EXPORT_SYMBOL(secure_dccpv6_sequence_number);
+#endif
+#endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
new file mode 100644
index 00000000..4821df84
--- /dev/null
+++ b/net/core/skbuff.c
@@ -0,0 +1,3084 @@
+/*
+ *	Routines having to do with the 'struct sk_buff' memory handlers.
+ *
+ *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *			Florian La Roche <rzsfl@rz.uni-sb.de>
+ *
+ *	Fixes:
+ *		Alan Cox	:	Fixed the worst of the load
+ *					balancer bugs.
+ *		Dave Platt	:	Interrupt stacking fix.
+ *	Richard Kooijman	:	Timestamp fixes.
+ *		Alan Cox	:	Changed buffer format.
+ *		Alan Cox	:	destructor hook for AF_UNIX etc.
+ *		Linus Torvalds	:	Better skb_clone.
+ *		Alan Cox	:	Added skb_copy.
+ *		Alan Cox	:	Added all the changed routines Linus
+ *					only put in the headers
+ *		Ray VanTassle	:	Fixed --skb->lock in free
+ *		Alan Cox	:	skb_copy copy arp field
+ *		Andi Kleen	:	slabified it.
+ *		Robert Olsson	:	Removed skb_head_pool
+ *
+ *	NOTE:
+ *		The __skb_ routines should be called with interrupts
+ *	disabled, or you better be *real* sure that the operation is atomic
+ *	with respect to whatever list is being frobbed (e.g. via lock_sock()
+ *	or via disabling bottom half handlers, etc).
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *	The functions in this file will not compile correctly with gcc 2.4.x
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_NET_CLS_ACT
+#include <net/pkt_sched.h>
+#endif
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/cache.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <linux/errqueue.h>
+#include <linux/prefetch.h>
+
+#include <net/protocol.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <trace/events/skb.h>
+
+#include "kmap_skb.h"
+
+static struct kmem_cache *skbuff_head_cache __read_mostly;
+static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+
+/* Pipe-buffer release hook: calls put_page() on buf->page; @pipe is unused. */
+static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
+				  struct pipe_buffer *buf)
+{
+	put_page(buf->page);
+}
+
+/* Pipe-buffer get hook: calls get_page() on buf->page; @pipe is unused. */
+static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
+				struct pipe_buffer *buf)
+{
+	get_page(buf->page);
+}
+
+/*
+ * Pipe-buffer steal hook: unconditionally returns 1 without touching the
+ * buffer.
+ * NOTE(review): presumably a non-zero return tells the pipe code the page
+ * cannot be stolen -- confirm against struct pipe_buf_operations.
+ */
+static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+
+/*
+ * Pipe buffer operations for a socket.
+ *
+ * map, unmap and confirm use the generic pipe helpers; release, steal and
+ * get use the sock_pipe_buf_*() functions defined above.
+ */
+static const struct pipe_buf_operations sock_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = sock_pipe_buf_release,
+	.steal = sock_pipe_buf_steal,
+	.get = sock_pipe_buf_get,
+};
+
+/*
+ *	Keep out-of-line to prevent kernel bloat.
+ *	__builtin_return_address is not used because it is not always
+ *	reliable.
+ */
+
+/**
+ *	skb_over_panic	- 	report a tail overrun and BUG()
+ *	@skb: offending buffer
+ *	@sz: number of bytes the caller tried to add
+ *	@here: code address to print in the report
+ *
+ *	Out of line support code for skb_put(). Not user callable.
+ *	Prints the buffer geometry and device name at KERN_EMERG level,
+ *	then calls BUG().
+ */
+static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
+{
+	const char *devname = skb->dev ? skb->dev->name : "<NULL>";
+	unsigned long tail = (unsigned long)skb->tail;
+	unsigned long end = (unsigned long)skb->end;
+
+	printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
+			  "data:%p tail:%#lx end:%#lx dev:%s\n",
+	       here, skb->len, sz, skb->head, skb->data, tail, end, devname);
+	BUG();
+}
+
+/**
+ *	skb_under_panic	- 	report a headroom underrun and BUG()
+ *	@skb: offending buffer
+ *	@sz: number of bytes the caller tried to add
+ *	@here: code address to print in the report
+ *
+ *	Out of line support code for skb_push(). Not user callable.
+ *	Prints the buffer geometry and device name at KERN_EMERG level,
+ *	then calls BUG().
+ */
+
+static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
+{
+	const char *devname = skb->dev ? skb->dev->name : "<NULL>";
+	unsigned long tail = (unsigned long)skb->tail;
+	unsigned long end = (unsigned long)skb->end;
+
+	printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
+			  "data:%p tail:%#lx end:%#lx dev:%s\n",
+	       here, skb->len, sz, skb->head, skb->data, tail, end, devname);
+	BUG();
+}
+
+/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
+ *	'private' fields and also do memory statistics to find all the
+ *	[BEEP] leaks.
+ *
+ */
+
+/**
+ *	__alloc_skb	-	allocate a network buffer
+ *	@size: size to allocate
+ *	@gfp_mask: allocation mask
+ *	@fclone: allocate from fclone cache instead of head cache
+ *		and allocate a cloned (child) skb
+ *	@node: numa node to allocate memory on
+ *
+ *	Allocate a new &sk_buff. The returned buffer has no headroom and a
+ *	tail room of @size bytes rounded up by SKB_DATA_ALIGN(). The object
+ *	has a reference count of one.
+ *	The return is the buffer. On a failure the return is %NULL.
+ *
+ *	Two allocations are made: the &sk_buff head from a slab cache and
+ *	the data area (followed by its &skb_shared_info) from kmalloc. If
+ *	the second one fails the head is returned to its cache.
+ *
+ *	Buffers may only be allocated from interrupts using a @gfp_mask of
+ *	%GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+			    int fclone, int node)
+{
+	struct kmem_cache *cache;
+	struct skb_shared_info *shinfo;
+	struct sk_buff *skb;
+	u8 *data;
+
+	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+
+	/* Get the HEAD */
+	/* __GFP_DMA is masked off for the head; the data area below gets
+	 * the caller's mask unchanged.
+	 */
+	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
+	if (!skb)
+		goto out;
+	prefetchw(skb);
+
+	size = SKB_DATA_ALIGN(size);
+	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
+			gfp_mask, node);
+	if (!data)
+		goto nodata;
+	prefetchw(data + size);
+
+	/*
+	 * Only clear those fields we need to clear, not those that we will
+	 * actually initialise below. Hence, don't put any more fields after
+	 * the tail pointer in struct sk_buff!
+	 */
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	/* truesize counts the aligned data area plus the sk_buff itself. */
+	skb->truesize = size + sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + size;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->mac_header = ~0U;
+#endif
+
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+	kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+	if (fclone) {
+		/*
+		 * The fclone object is addressed as three consecutive
+		 * pieces: this skb, a child skb right after it, and an
+		 * atomic_t reference count right after the child.
+		 */
+		struct sk_buff *child = skb + 1;
+		atomic_t *fclone_ref = (atomic_t *) (child + 1);
+
+		kmemcheck_annotate_bitfield(child, flags1);
+		kmemcheck_annotate_bitfield(child, flags2);
+		skb->fclone = SKB_FCLONE_ORIG;
+		atomic_set(fclone_ref, 1);
+
+		child->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
+out:
+	return skb;
+nodata:
+	/* Data allocation failed: give the head back and report NULL. */
+	kmem_cache_free(cache, skb);
+	skb = NULL;
+	goto out;
+}
+EXPORT_SYMBOL(__alloc_skb);
+
+/**
+ *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	@dev: network device to receive on
+ *	@length: length to allocate
+ *	@gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ *	Allocate a new &sk_buff with a usage count of one and bind it to
+ *	@dev. %NET_SKB_PAD bytes are allocated on top of @length and
+ *	reserved as headroom; callers should still reserve whatever headroom
+ *	they need themselves without counting on this built in space, which
+ *	exists for optimisations.
+ *
+ *	%NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+		unsigned int length, gfp_t gfp_mask)
+{
+	struct sk_buff *skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0,
+					  NUMA_NO_NODE);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_reserve(skb, NET_SKB_PAD);
+	skb->dev = dev;
+	return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb);
+
+/*
+ * Describe @size bytes of @page starting at offset @off as fragment @i of
+ * @skb (via skb_fill_page_desc()) and add @size to the skb's len, data_len
+ * and truesize.
+ */
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+		int size)
+{
+	skb_fill_page_desc(skb, i, page, off, size);
+
+	skb->truesize += size;
+	skb->data_len += size;
+	skb->len += size;
+}
+EXPORT_SYMBOL(skb_add_rx_frag);
+
+/**
+ *	dev_alloc_skb - allocate an skbuff for receiving
+ *	@length: length to allocate
+ *
+ *	Allocate a new &sk_buff and assign it a usage count of one. The
+ *	buffer has unspecified headroom built in. Users should allocate
+ *	the headroom they think they need without accounting for the
+ *	built in space. The built in space is used for optimisations.
+ *
+ *	%NULL is returned if there is no free memory. Although this function
+ *	allocates memory it can be called from an interrupt, because the
+ *	allocation is made with %GFP_ATOMIC.
+ */
+struct sk_buff *dev_alloc_skb(unsigned int length)
+{
+	/*
+	 * There is more code here than it seems:
+	 * __dev_alloc_skb is an inline
+	 */
+	return __dev_alloc_skb(length, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(dev_alloc_skb);
+
+/*
+ * Detach the list headed at *@listp (leaving *@listp NULL) and call
+ * kfree_skb() on every buffer in it.  The first element is dereferenced
+ * unconditionally, so the list must not be empty on entry.
+ */
+static void skb_drop_list(struct sk_buff **listp)
+{
+	struct sk_buff *cur = *listp;
+	struct sk_buff *next;
+
+	*listp = NULL;
+
+	do {
+		next = cur->next;
+		kfree_skb(cur);
+		cur = next;
+	} while (cur);
+}
+
+/*
+ * Drop the whole frag_list of @skb via skb_drop_list().  That helper
+ * dereferences the first element unconditionally, so the frag list must
+ * be non-empty when this is called.
+ */
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+	skb_drop_list(&skb_shinfo(skb)->frag_list);
+}
+
+/* Call skb_get() on every buffer visited by skb_walk_frags() for @skb. */
+static void skb_clone_fraglist(struct sk_buff *skb)
+{
+	struct sk_buff *list;
+
+	skb_walk_frags(skb, list)
+		skb_get(list);
+}
+
+/*
+ * Drop this skb's reference on its data area and free the area when that
+ * was the last one.
+ *
+ * A skb that is not marked cloned frees its data unconditionally.  A
+ * cloned skb first subtracts its share from dataref (a larger amount when
+ * ->nohdr is set) and frees only if the counter reaches zero.  Freeing
+ * puts every page fragment, drops the frag list if there is one, and
+ * kfree()s the head.
+ */
+static void skb_release_data(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	int i;
+
+	if (skb->cloned) {
+		int share = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
+
+		if (atomic_sub_return(share, &shinfo->dataref))
+			return;
+	}
+
+	for (i = 0; i < shinfo->nr_frags; i++)
+		put_page(shinfo->frags[i].page);
+
+	if (skb_has_frag_list(skb))
+		skb_drop_fraglist(skb);
+
+	kfree(skb->head);
+}
+
+/*
+ *	Free an skbuff by memory without cleaning the state.
+ *
+ *	Where the memory goes depends on skb->fclone. For the two fclone
+ *	states the pointer arithmetic mirrors __alloc_skb(), which treats
+ *	the child skb as sitting directly after the original and the shared
+ *	atomic_t reference count as sitting directly after the child.
+ */
+static void kfree_skbmem(struct sk_buff *skb)
+{
+	struct sk_buff *other;
+	atomic_t *fclone_ref;
+
+	switch (skb->fclone) {
+	case SKB_FCLONE_UNAVAILABLE:
+		/* Freed straight back to the head cache. */
+		kmem_cache_free(skbuff_head_cache, skb);
+		break;
+
+	case SKB_FCLONE_ORIG:
+		/* skb is the original: the count lies past the child. */
+		fclone_ref = (atomic_t *) (skb + 2);
+		if (atomic_dec_and_test(fclone_ref))
+			kmem_cache_free(skbuff_fclone_cache, skb);
+		break;
+
+	case SKB_FCLONE_CLONE:
+		/* skb is the child: the count follows it and the
+		 * original precedes it.
+		 */
+		fclone_ref = (atomic_t *) (skb + 1);
+		other = skb - 1;
+
+		/* The clone portion is available for
+		 * fast-cloning again.
+		 */
+		skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+		/* The object is freed by the address of the original. */
+		if (atomic_dec_and_test(fclone_ref))
+			kmem_cache_free(skbuff_fclone_cache, other);
+		break;
+	}
+}
+
+/*
+ * Release everything the sk_buff head refers to, leaving the data area
+ * alone: the dst entry, the xfrm secpath, the destructor callback (run
+ * here, with a warning if called in hard-irq context), conntrack and
+ * bridge-netfilter state.  Which of these exist depends on the kernel
+ * configuration.  The traffic-control fields are reset to zero at the end.
+ */
+static void skb_release_head_state(struct sk_buff *skb)
+{
+	skb_dst_drop(skb);
+#ifdef CONFIG_XFRM
+	secpath_put(skb->sp);
+#endif
+	if (skb->destructor) {
+		WARN_ON(in_irq());
+		skb->destructor(skb);
+	}
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	nf_conntrack_put(skb->nfct);
+#endif
+#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
+	nf_conntrack_put_reasm(skb->nfct_reasm);
+#endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+	nf_bridge_put(skb->nf_bridge);
+#endif
+/* XXX: IS this still necessary? - JHS */
+#ifdef CONFIG_NET_SCHED
+	skb->tc_index = 0;
+#ifdef CONFIG_NET_CLS_ACT
+	skb->tc_verd = 0;
+#endif
+#endif
+}
+
+/*
+ * Free everything but the sk_buff shell: first the head state
+ * (skb_release_head_state()), then the reference on the data area
+ * (skb_release_data()).
+ */
+static void skb_release_all(struct sk_buff *skb)
+{
+	skb_release_head_state(skb);
+	skb_release_data(skb);
+}
+
+/**
+ *	__kfree_skb - private function
+ *	@skb: buffer
+ *
+ *	Free an sk_buff. Release anything attached to the buffer.
+ *	Clean the state. This is an internal helper function. Users should
+ *	always call kfree_skb
+ *
+ *	No reference count is checked here: the buffer's attachments and
+ *	data are released and its memory is handed to kfree_skbmem()
+ *	unconditionally.
+ */
+
+void __kfree_skb(struct sk_buff *skb)
+{
+	skb_release_all(skb);
+	kfree_skbmem(skb);
+}
+EXPORT_SYMBOL(__kfree_skb);
+
+/**
+ *	kfree_skb - free an sk_buff
+ *	@skb: buffer to free; %NULL is accepted and ignored
+ *
+ *	Drop a reference to the buffer and free it if the usage count has
+ *	hit zero.
+ */
+void kfree_skb(struct sk_buff *skb)
+{
+	if (unlikely(!skb))
+		return;
+	/*
+	 * users == 1: the atomic decrement is skipped and the buffer is
+	 * freed directly.  Otherwise one reference is dropped and we stop
+	 * here unless it was the last one.
+	 */
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;
+	trace_kfree_skb(skb, __builtin_return_address(0));
+	__kfree_skb(skb);
+}
+EXPORT_SYMBOL(kfree_skb);
+
+/**
+ *	consume_skb - free an skbuff
+ *	@skb: buffer to free; %NULL is accepted and ignored
+ *
+ *	Drop a ref to the buffer and free it if the usage count has hit zero.
+ *	Functions identically to kfree_skb, except for the tracepoint that
+ *	is fired: kfree_skb assumes that the frame is being dropped after a
+ *	failure and records that, whereas this function reports the buffer
+ *	through trace_consume_skb().
+ */
+void consume_skb(struct sk_buff *skb)
+{
+	if (unlikely(!skb))
+		return;
+	/*
+	 * users == 1: the atomic decrement is skipped and the buffer is
+	 * freed directly.  Otherwise one reference is dropped and we stop
+	 * here unless it was the last one.
+	 */
+	if (likely(atomic_read(&skb->users) == 1))
+		smp_rmb();
+	else if (likely(!atomic_dec_and_test(&skb->users)))
+		return;
+	trace_consume_skb(skb);
+	__kfree_skb(skb);
+}
+EXPORT_SYMBOL(consume_skb);
+
+/**
+ *	skb_recycle_check - check if skb can be reused for receive
+ *	@skb: buffer
+ *	@skb_size: minimum receive buffer size
+ *
+ *	Checks that the skb passed in is not shared or cloned, and
+ *	that it is linear and its head portion at least as large as
+ *	skb_size so that it can be recycled as a receive buffer.
+ *	If these conditions are met, this function does any necessary
+ *	reference count dropping and cleans up the skbuff as if it
+ *	just came from __alloc_skb().
+ *
+ *	Returns true if the skb was reset for reuse. Returns false, with
+ *	the skb left untouched, if any check fails or if interrupts are
+ *	disabled.
+ */
+bool skb_recycle_check(struct sk_buff *skb, int skb_size)
+{
+	struct skb_shared_info *shinfo;
+
+	if (irqs_disabled())
+		return false;
+
+	/* Only linear skbs that are not part of an fclone pair qualify. */
+	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
+		return false;
+
+	/* The head must hold the requested size plus NET_SKB_PAD headroom. */
+	skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
+	if (skb_end_pointer(skb) - skb->head < skb_size)
+		return false;
+
+	if (skb_shared(skb) || skb_cloned(skb))
+		return false;
+
+	/* All checks passed: from here on the skb is modified. */
+	skb_release_head_state(skb);
+
+	/* Reset the shared info the same way __alloc_skb() sets it up. */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+
+	/* Clear every field before 'tail', then leave NET_SKB_PAD headroom. */
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->data = skb->head + NET_SKB_PAD;
+	skb_reset_tail_pointer(skb);
+
+	return true;
+}
+EXPORT_SYMBOL(skb_recycle_check);
+
+/*
+ * Copy the per-packet metadata of @old into @new.
+ *
+ * Most fields are plain assignments.  The dst entry, xfrm secpath,
+ * netfilter state, queue mapping and security mark go through their own
+ * helpers (skb_dst_copy(), secpath_get(), __nf_copy(),
+ * skb_copy_queue_mapping(), skb_copy_secmark()).  Several fields only
+ * exist, and are only copied, under the matching kernel configuration.
+ * Buffer geometry (head/data/tail/end, len) is not touched here.
+ */
+static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+	new->tstamp		= old->tstamp;
+	new->dev		= old->dev;
+	new->transport_header	= old->transport_header;
+	new->network_header	= old->network_header;
+	new->mac_header		= old->mac_header;
+	skb_dst_copy(new, old);
+	new->rxhash		= old->rxhash;
+#ifdef CONFIG_XFRM
+	new->sp			= secpath_get(old->sp);
+#endif
+	memcpy(new->cb, old->cb, sizeof(old->cb));
+	new->csum		= old->csum;
+	new->local_df		= old->local_df;
+	new->pkt_type		= old->pkt_type;
+	new->ip_summed		= old->ip_summed;
+	skb_copy_queue_mapping(new, old);
+	new->priority		= old->priority;
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+	new->ipvs_property	= old->ipvs_property;
+#endif
+	new->protocol		= old->protocol;
+	new->mark		= old->mark;
+	new->skb_iif		= old->skb_iif;
+	__nf_copy(new, old);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	new->nf_trace		= old->nf_trace;
+#endif
+#ifdef CONFIG_NET_SCHED
+	new->tc_index		= old->tc_index;
+#ifdef CONFIG_NET_CLS_ACT
+	new->tc_verd		= old->tc_verd;
+#endif
+#endif
+	new->vlan_tci		= old->vlan_tci;
+
+	skb_copy_secmark(new, old);
+}
+
+/*
+ * Turn @n into a clone of @skb: both end up pointing at the same data
+ * area, whose dataref is bumped, and both are marked cloned.  @n is left
+ * unlinked, without owning socket or destructor, and with a user count
+ * of one.
+ *
+ * You should not add any new code to this function.  Add it to
+ * __copy_skb_header above instead.
+ */
+static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+	n->prev = NULL;
+	n->next = NULL;
+	n->sk = NULL;
+	__copy_skb_header(n, skb);
+
+	n->len = skb->len;
+	n->data_len = skb->data_len;
+	n->mac_len = skb->mac_len;
+	if (skb->nohdr)
+		n->hdr_len = skb_headroom(skb);
+	else
+		n->hdr_len = skb->hdr_len;
+	n->cloned = 1;
+	n->nohdr = 0;
+	n->destructor = NULL;
+	n->tail = skb->tail;
+	n->end = skb->end;
+	n->head = skb->head;
+	n->data = skb->data;
+	n->truesize = skb->truesize;
+	atomic_set(&n->users, 1);
+
+	/* The data area now has one more holder. */
+	atomic_inc(&(skb_shinfo(skb)->dataref));
+	skb->cloned = 1;
+
+	return n;
+}
+
+/**
+ *	skb_morph	-	morph one skb into another
+ *	@dst: the skb to receive the contents
+ *	@src: the skb to supply the contents
+ *
+ *	This is identical to skb_clone except that the target skb is
+ *	supplied by the user. Whatever @dst currently holds (head state
+ *	and data reference) is released first.
+ *
+ *	The target skb is returned upon exit.
+ */
+struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
+{
+	skb_release_all(dst);
+	return __skb_clone(dst, src);
+}
+EXPORT_SYMBOL_GPL(skb_morph);
+
+/**
+ *	skb_clone	-	duplicate an sk_buff
+ *	@skb: buffer to clone
+ *	@gfp_mask: allocation priority
+ *
+ *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
+ *	copies share the same packet data but not structure. The new
+ *	buffer has a reference count of 1. If the allocation fails the
+ *	function returns %NULL otherwise the new buffer is returned.
+ *
+ *	When @skb is the original of an fclone pair whose companion slot
+ *	is free, that slot is used and nothing is allocated; otherwise a
+ *	fresh head is taken from the head cache.
+ *
+ *	If this function is called from an interrupt gfp_mask() must be
+ *	%GFP_ATOMIC.
+ */
+
+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	struct sk_buff *n = skb + 1;
+
+	if (skb->fclone != SKB_FCLONE_ORIG ||
+	    n->fclone != SKB_FCLONE_UNAVAILABLE) {
+		/* No companion slot to use: allocate a standalone head. */
+		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+		if (!n)
+			return NULL;
+
+		kmemcheck_annotate_bitfield(n, flags1);
+		kmemcheck_annotate_bitfield(n, flags2);
+		n->fclone = SKB_FCLONE_UNAVAILABLE;
+	} else {
+		/* Claim the companion slot; its count follows the slot. */
+		atomic_t *pair_ref = (atomic_t *) (n + 1);
+
+		n->fclone = SKB_FCLONE_CLONE;
+		atomic_inc(pair_ref);
+	}
+
+	return __skb_clone(n, skb);
+}
+EXPORT_SYMBOL(skb_clone);
+
+/*
+ * Copy the header metadata of @old into @new for the case where @new has
+ * its own data area: __copy_skb_header() does the bulk of the work, then
+ * the GSO fields of the shared info are copied as well.
+ *
+ * When NET_SKBUFF_DATA_USES_OFFSET is not defined, the transport, network
+ * and (if set) mac header fields are additionally shifted by the distance
+ * between the two skbs' data pointers, measured before the copy.
+ */
+static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+{
+#ifndef NET_SKBUFF_DATA_USES_OFFSET
+	/*
+	 *	Shift between the two data areas in bytes
+	 */
+	unsigned long offset = new->data - old->data;
+#endif
+
+	__copy_skb_header(new, old);
+
+#ifndef NET_SKBUFF_DATA_USES_OFFSET
+	/* {transport,network,mac}_header are relative to skb->head */
+	new->transport_header += offset;
+	new->network_header   += offset;
+	if (skb_mac_header_was_set(new))
+		new->mac_header	      += offset;
+#endif
+	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
+	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
+	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
+}
+
+/**
+ *	skb_copy	-	create private copy of an sk_buff
+ *	@skb: buffer to copy
+ *	@gfp_mask: allocation priority
+ *
+ *	Make a copy of both an &sk_buff and its data. This is used when the
+ *	caller wishes to modify the data and needs a private copy of the
+ *	data to alter. Returns %NULL on failure or the pointer to the buffer
+ *	on success. The returned buffer has a reference count of 1.
+ *
+ *	As by-product this function converts non-linear &sk_buff to linear
+ *	one, so that &sk_buff becomes completely private and caller is allowed
+ *	to modify all the data of returned buffer. This means that this
+ *	function is not recommended for use in circumstances when only
+ *	header is going to be modified. Use pskb_copy() instead.
+ */
+
+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
+{
+	int hroom = skb_headroom(skb);
+	unsigned int bufsz;
+	struct sk_buff *copy;
+
+	/* Room for the whole old head area plus all non-linear data. */
+	bufsz = (skb_end_pointer(skb) - skb->head) + skb->data_len;
+	copy = alloc_skb(bufsz, gfp_mask);
+	if (copy == NULL)
+		return NULL;
+
+	/* Same headroom as the source, then room for every data byte. */
+	skb_reserve(copy, hroom);
+	skb_put(copy, skb->len);
+
+	/* Copy headroom and data in one go, starting at the old head. */
+	if (skb_copy_bits(skb, -hroom, copy->head, hroom + skb->len))
+		BUG();
+
+	copy_skb_header(copy, skb);
+	return copy;
+}
+EXPORT_SYMBOL(skb_copy);
+
+/**
+ *	pskb_copy	-	create copy of an sk_buff with private head.
+ *	@skb: buffer to copy
+ *	@gfp_mask: allocation priority
+ *
+ *	Make a copy of both an &sk_buff and part of its data, located
+ *	in header. Fragmented data remain shared. This is used when
+ *	the caller wishes to modify only header of &sk_buff and needs
+ *	private copy of the header to alter. Returns %NULL on failure
+ *	or the pointer to the buffer on success.
+ *	The returned buffer has a reference count of 1.
+ */
+
+struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	unsigned int head_sz = skb_end_pointer(skb) - skb->head;
+	struct skb_shared_info *from, *to;
+	struct sk_buff *n;
+
+	n = alloc_skb(head_sz, gfp_mask);
+	if (!n)
+		return NULL;
+
+	/* Mirror the source's headroom and linear length, then copy. */
+	skb_reserve(n, skb_headroom(skb));
+	skb_put(n, skb_headlen(skb));
+	skb_copy_from_linear_data(skb, n->data, n->len);
+
+	/* Account for the paged data that is about to be shared. */
+	n->truesize += skb->data_len;
+	n->data_len  = skb->data_len;
+	n->len	     = skb->len;
+
+	from = skb_shinfo(skb);
+	to = skb_shinfo(n);
+
+	if (from->nr_frags) {
+		int i;
+
+		/* Share each page fragment, taking a page reference. */
+		for (i = 0; i < from->nr_frags; i++) {
+			to->frags[i] = from->frags[i];
+			get_page(to->frags[i].page);
+		}
+		to->nr_frags = i;
+	}
+
+	if (skb_has_frag_list(skb)) {
+		to->frag_list = from->frag_list;
+		skb_clone_fraglist(n);
+	}
+
+	copy_skb_header(n, skb);
+	return n;
+}
+EXPORT_SYMBOL(pskb_copy);
+
+/**
+ *	pskb_expand_head - reallocate header of &sk_buff
+ *	@skb: buffer to reallocate
+ *	@nhead: room to add at head
+ *	@ntail: room to add at tail
+ *	@gfp_mask: allocation priority
+ *
+ *	Expands (or creates identical copy, if &nhead and &ntail are zero)
+ *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have
+ *	reference count of 1. Returns zero on success or -ENOMEM if the
+ *	new header could not be allocated. In the last case, &sk_buff is
+ *	not changed.
+ *
+ *	A negative @nhead or a shared &sk_buff triggers a BUG.
+ *
+ *	All the pointers pointing into skb header may change and must be
+ *	reloaded after call to this function.
+ */
+
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+		     gfp_t gfp_mask)
+{
+	int i;
+	u8 *data;
+	/* New linear size: the old head..end span plus the requested room. */
+	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
+	long off;
+	bool fastpath;
+
+	BUG_ON(nhead < 0);
+
+	if (skb_shared(skb))
+		BUG();
+
+	size = SKB_DATA_ALIGN(size);
+
+	/* Check if we can avoid taking references on fragments if we own
+	 * the last reference on skb->head. (see skb_release_data())
+	 */
+	if (!skb->cloned)
+		fastpath = true;
+	else {
+		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
+
+		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
+	}
+
+	/*
+	 * In-place path: we are the only holder and the existing
+	 * allocation (as reported by ksize()) already has room for the
+	 * enlarged layout, so shuffle the contents inside it. The shared
+	 * info is moved before the data -- presumably because shifting the
+	 * data up by nhead could otherwise run over its old location.
+	 */
+	if (fastpath &&
+	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
+		memmove(skb->head + size, skb_shinfo(skb),
+			offsetof(struct skb_shared_info,
+				 frags[skb_shinfo(skb)->nr_frags]));
+		memmove(skb->head + nhead, skb->head,
+			skb_tail_pointer(skb) - skb->head);
+		off = nhead;
+		goto adjust_others;
+	}
+
+	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	if (!data)
+		goto nodata;
+
+	/* Copy only real data... and, alas, header. This should be
+	 * optimized for the cases when header is void.
+	 */
+	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
+
+	/* Shared info is copied up to the last fragment slot in use. */
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb),
+	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
+
+	if (fastpath) {
+		/* Sole holder: the old buffer can simply go away. */
+		kfree(skb->head);
+	} else {
+		/*
+		 * The old data area has other holders. Take our own
+		 * references on the page fragments and the frag list for
+		 * the new copy, then drop our reference on the old area.
+		 */
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			get_page(skb_shinfo(skb)->frags[i].page);
+
+		if (skb_has_frag_list(skb))
+			skb_clone_fraglist(skb);
+
+		skb_release_data(skb);
+	}
+	/* Distance every pointer into the old buffer has to move by. */
+	off = (data + nhead) - skb->head;
+
+	skb->head     = data;
+adjust_others:
+	skb->data    += off;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	/* Offsets are relative to head, so they only shift by nhead. */
+	skb->end      = size;
+	off           = nhead;
+#else
+	skb->end      = skb->head + size;
+#endif
+	/* {transport,network,mac}_header and tail are relative to skb->head */
+	skb->tail	      += off;
+	skb->transport_header += off;
+	skb->network_header   += off;
+	if (skb_mac_header_was_set(skb))
+		skb->mac_header += off;
+	/* Only adjust this if it actually is csum_start rather than csum */
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb->csum_start += nhead;
+	/* The skb now holds the only reference on its data area. */
+	skb->cloned   = 0;
+	skb->hdr_len  = 0;
+	skb->nohdr    = 0;
+	atomic_set(&skb_shinfo(skb)->dataref, 1);
+	return 0;
+
+nodata:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(pskb_expand_head);
+
+/*
+ * Make private copy of skb with writable head and some headroom.
+ *
+ * If @skb already has at least @headroom bytes of headroom a pskb_copy()
+ * is returned.  Otherwise @skb is cloned and the clone's head is expanded
+ * by the missing amount, rounded up by SKB_DATA_ALIGN().  All allocations
+ * use GFP_ATOMIC.  Returns NULL on allocation failure.
+ */
+
+struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
+{
+	int delta = headroom - skb_headroom(skb);
+	struct sk_buff *skb2;
+
+	if (delta <= 0)
+		return pskb_copy(skb, GFP_ATOMIC);
+
+	skb2 = skb_clone(skb, GFP_ATOMIC);
+	if (!skb2)
+		return NULL;
+
+	if (pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+		kfree_skb(skb2);
+		return NULL;
+	}
+
+	return skb2;
+}
+EXPORT_SYMBOL(skb_realloc_headroom);
+
+/**
+ *	skb_copy_expand	-	copy and expand sk_buff
+ *	@skb: buffer to copy
+ *	@newheadroom: new free bytes at head
+ *	@newtailroom: new free bytes at tail
+ *	@gfp_mask: allocation priority
+ *
+ *	Make a copy of both an &sk_buff and its data and while doing so
+ *	allocate additional space.
+ *
+ *	This is used when the caller wishes to modify the data and needs a
+ *	private copy of the data to alter as well as more space for new fields.
+ *	Returns %NULL on failure or the pointer to the buffer
+ *	on success. The returned buffer has a reference count of 1.
+ *
+ *	You must pass %GFP_ATOMIC as the allocation priority if this function
+ *	is called from an interrupt.
+ */
+struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
+				int newheadroom, int newtailroom,
+				gfp_t gfp_mask)
+{
+	int oldheadroom = skb_headroom(skb);
+	int keep, skip, shift;
+	struct sk_buff *n;
+
+	/*
+	 *	Allocate the copy buffer
+	 */
+	n = alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask);
+	if (!n)
+		return NULL;
+
+	skb_reserve(n, newheadroom);
+
+	/* Set the tail pointer and length */
+	skb_put(n, skb->len);
+
+	/*
+	 * Carry over as much of the old headroom as fits in the new one;
+	 * when the new headroom is larger, the copy starts that much
+	 * further into the new buffer.
+	 */
+	if (newheadroom <= oldheadroom) {
+		keep = newheadroom;
+		skip = 0;
+	} else {
+		keep = oldheadroom;
+		skip = newheadroom - oldheadroom;
+	}
+
+	/* Copy the linear header and data. */
+	if (skb_copy_bits(skb, -keep, n->head + skip, skb->len + keep))
+		BUG();
+
+	copy_skb_header(n, skb);
+
+	/* Header positions move by the change in headroom. */
+	shift = newheadroom - oldheadroom;
+	if (n->ip_summed == CHECKSUM_PARTIAL)
+		n->csum_start += shift;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	n->transport_header += shift;
+	n->network_header   += shift;
+	if (skb_mac_header_was_set(skb))
+		n->mac_header += shift;
+#endif
+
+	return n;
+}
+EXPORT_SYMBOL(skb_copy_expand);
+
+/**
+ *	skb_pad			-	zero pad the tail of an skb
+ *	@skb: buffer to pad
+ *	@pad: space to pad
+ *
+ *	Ensure that a buffer is followed by a padding area that is zero
+ *	filled. Used by network drivers which may DMA or transfer data
+ *	beyond the buffer end onto the wire.
+ *
+ *	Only the bytes after the data are zeroed; the skb's length fields
+ *	are not changed here.
+ *
+ *	Returns 0 on success. May return error in out of memory cases.
+ *	The skb is freed on error.
+ */
+
+int skb_pad(struct sk_buff *skb, int pad)
+{
+	int err;
+	int ntail;
+
+	/* If the skbuff is non linear tailroom is always zero.. */
+	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
+		memset(skb->data+skb->len, 0, pad);
+		return 0;
+	}
+
+	/* Extra tail room needed: paged data plus padding, less what the
+	 * head already has free between tail and end.
+	 */
+	ntail = skb->data_len + pad - (skb->end - skb->tail);
+	if (likely(skb_cloned(skb) || ntail > 0)) {
+		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
+		if (unlikely(err))
+			goto free_skb;
+	}
+
+	/* FIXME: The use of this function with non-linear skb's really needs
+	 * to be audited.
+	 */
+	err = skb_linearize(skb);
+	if (unlikely(err))
+		goto free_skb;
+
+	memset(skb->data + skb->len, 0, pad);
+	return 0;
+
+free_skb:
+	kfree_skb(skb);
+	return err;
+}
+EXPORT_SYMBOL(skb_pad);
+
+/**
+ *	skb_put - add data to a buffer
+ *	@skb: buffer to use
+ *	@len: amount of data to add
+ *
+ *	This function extends the used data area of the buffer. If this would
+ *	exceed the total buffer size the kernel will panic. A pointer to the
+ *	first byte of the extra data is returned.
+ */
+unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
+{
+	/* Remember the old tail: that is where the new data starts. */
+	unsigned char *tmp = skb_tail_pointer(skb);
+	SKB_LINEAR_ASSERT(skb);
+	skb->tail += len;
+	skb->len  += len;
+	/* The overrun check runs after tail and len have been advanced. */
+	if (unlikely(skb->tail > skb->end))
+		skb_over_panic(skb, len, __builtin_return_address(0));
+	return tmp;
+}
+EXPORT_SYMBOL(skb_put);
+
+/**
+ *	skb_push - add data to the start of a buffer
+ *	@skb: buffer to use
+ *	@len: amount of data to add
+ *
+ *	This function extends the used data area of the buffer at the buffer
+ *	start. If this would exceed the total buffer headroom the kernel will
+ *	panic. A pointer to the first byte of the extra data is returned.
+ */
+unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+{
+	skb->len  += len;
+	skb->data -= len;
+
+	/* Moving data in front of the buffer head is fatal. */
+	if (unlikely(skb->data < skb->head))
+		skb_under_panic(skb, len, __builtin_return_address(0));
+
+	/* The new start of data is also the start of the added bytes. */
+	return skb->data;
+}
+EXPORT_SYMBOL(skb_push);
+
+/**
+ *	skb_pull - remove data from the start of a buffer
+ *	@skb: buffer to use
+ *	@len: amount of data to remove
+ *
+ *	This function removes data from the start of a buffer, returning
+ *	the memory to the headroom. A pointer to the next data in the buffer
+ *	is returned. Once the data has been pulled future pushes will overwrite
+ *	the old data.
+ */
+unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+{
+	/* Out-of-line, exported wrapper around the inline implementation. */
+	unsigned char *data = skb_pull_inline(skb, len);
+
+	return data;
+}
+EXPORT_SYMBOL(skb_pull);
+
+/**
+ *	skb_trim - remove end from a buffer
+ *	@skb: buffer to alter
+ *	@len: new length
+ *
+ *	Cut the length of a buffer down by removing data from the tail. If
+ *	the buffer is already under the length specified it is not modified.
+ *	The skb must be linear.
+ */
+void skb_trim(struct sk_buff *skb, unsigned int len)
+{
+	/* Already at or below the requested length: leave it untouched. */
+	if (skb->len <= len)
+		return;
+
+	__skb_trim(skb, len);
+}
+EXPORT_SYMBOL(skb_trim);
+
+/* Trims skb to length len. It can change skb pointers.
+ */
+
+/*
+ * Walks the three storage areas of @skb in order (linear head, page frags,
+ * frag_list) until the one containing byte @len is found, truncates that
+ * area and releases everything after it.
+ *
+ * Returns 0 on success, the pskb_expand_head()/pskb_trim() error code, or
+ * -ENOMEM if a shared frag_list member could not be cloned. On the error
+ * returns skb->len and skb->data_len have not been updated.
+ */
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
+{
+	struct sk_buff **fragp;
+	struct sk_buff *frag;
+	/* Running byte offset of the area currently being examined. */
+	int offset = skb_headlen(skb);
+	int nfrags = skb_shinfo(skb)->nr_frags;
+	int i;
+	int err;
+
+	/* A cloned skb gets its own head before anything is modified. */
+	if (skb_cloned(skb) &&
+	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
+		return err;
+
+	i = 0;
+	/* New length ends inside the linear head: drop every page frag. */
+	if (offset >= len)
+		goto drop_pages;
+
+	for (; i < nfrags; i++) {
+		int end = offset + skb_shinfo(skb)->frags[i].size;
+
+		/* This frag is kept whole; move on to the next one. */
+		if (end < len) {
+			offset = end;
+			continue;
+		}
+
+		/* Frag i holds the new end: shorten it, keep it (i++). */
+		skb_shinfo(skb)->frags[i++].size = len - offset;
+
+		/* Also entered directly from above with i == 0. */
+drop_pages:
+		skb_shinfo(skb)->nr_frags = i;
+
+		/* Release the pages of every frag past the new end. */
+		for (; i < nfrags; i++)
+			put_page(skb_shinfo(skb)->frags[i].page);
+
+		if (skb_has_frag_list(skb))
+			skb_drop_fraglist(skb);
+		goto done;
+	}
+
+	/* All page frags are kept; the new end lies in the frag_list. */
+	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
+	     fragp = &frag->next) {
+		int end = offset + frag->len;
+
+		/* Replace a shared list member by a clone linked in its place. */
+		if (skb_shared(frag)) {
+			struct sk_buff *nfrag;
+
+			nfrag = skb_clone(frag, GFP_ATOMIC);
+			if (unlikely(!nfrag))
+				return -ENOMEM;
+
+			nfrag->next = frag->next;
+			kfree_skb(frag);
+			frag = nfrag;
+			*fragp = frag;
+		}
+
+		if (end < len) {
+			offset = end;
+			continue;
+		}
+
+		/* Trim the member that contains the new end, if needed. */
+		if (end > len &&
+		    unlikely((err = pskb_trim(frag, len - offset))))
+			return err;
+
+		/* Everything chained after it goes away. */
+		if (frag->next)
+			skb_drop_list(&frag->next);
+		break;
+	}
+
+done:
+	if (len > skb_headlen(skb)) {
+		/* Paged data remains: shrink data_len by the bytes removed. */
+		skb->data_len -= skb->len - len;
+		skb->len       = len;
+	} else {
+		/* Only linear data remains: pull the tail pointer back. */
+		skb->len       = len;
+		skb->data_len  = 0;
+		skb_set_tail_pointer(skb, len);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(___pskb_trim);
+
+/**
+ *	__pskb_pull_tail - advance tail of skb header
+ *	@skb: buffer to reallocate
+ *	@delta: number of bytes to advance tail
+ *
+ *	The function makes sense only on a fragmented &sk_buff,
+ *	it expands header moving its tail forward and copying necessary
+ *	data from fragmented part.
+ *
+ *	&sk_buff MUST have reference count of 1.
+ *
+ *	Returns %NULL (and &sk_buff does not change) if pull failed
+ *	or value of new tail of skb in the case of success.
+ *
+ *	All the pointers pointing into skb header may change and must be
+ *	reloaded after call to this function.
+ */
+
+/* Moves tail of skb head forward, copying data from fragmented part,
+ * when it is necessary.
+ * 1. It may fail due to malloc failure.
+ * 2. It may change skb pointers.
+ *
+ * It is pretty complicated. Luckily, it is called only in exceptional cases.
+ */
+unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
+{
+	/* If skb has not enough free space at tail, get new one
+	 * plus 128 bytes for future expansions. If we have enough
+	 * room at tail, reallocate without expansion only if skb is cloned.
+	 */
+	int i, k, eat = (skb->tail + delta) - skb->end;
+
+	if (eat > 0 || skb_cloned(skb)) {
+		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
+				     GFP_ATOMIC))
+			return NULL;
+	}
+
+	/* Copy the first @delta non-linear bytes into the tailroom. skb->tail
+	 * and skb->data_len are only advanced at the very end, so a NULL
+	 * return below leaves the visible layout of the skb untouched.
+	 */
+	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
+		BUG();
+
+	/* Optimization: no fragments, no reasons to preestimate
+	 * size of pulled pages. Superb.
+	 */
+	if (!skb_has_frag_list(skb))
+		goto pull_pages;
+
+	/* Estimate size of pulled pages. */
+	eat = delta;
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		if (skb_shinfo(skb)->frags[i].size >= eat)
+			goto pull_pages;
+		eat -= skb_shinfo(skb)->frags[i].size;
+	}
+
+	/* If we need update frag list, we are in troubles.
+	 * Certainly, it possible to add an offset to skb data,
+	 * but taking into account that pulling is expected to
+	 * be very rare operation, it is worth to fight against
+	 * further bloating skb head and crucify ourselves here instead.
+	 * Pure masochism, indeed. 8)8)
+	 */
+	/* Here eat is the number of bytes that must come out of frag_list. */
+	if (eat) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+		/* Private copy of a shared member that is only partly eaten. */
+		struct sk_buff *clone = NULL;
+		/* First original list member that survives the pull. */
+		struct sk_buff *insp = NULL;
+
+		do {
+			BUG_ON(!list);
+
+			if (list->len <= eat) {
+				/* Eaten as whole. */
+				eat -= list->len;
+				list = list->next;
+				insp = list;
+			} else {
+				/* Eaten partially. */
+
+				if (skb_shared(list)) {
+					/* Sucks! We need to fork list. :-( */
+					clone = skb_clone(list, GFP_ATOMIC);
+					if (!clone)
+						return NULL;
+					insp = list->next;
+					list = clone;
+				} else {
+					/* This may be pulled without
+					 * problems. */
+					insp = list;
+				}
+				if (!pskb_pull(list, eat)) {
+					/* clone may be NULL here. */
+					kfree_skb(clone);
+					return NULL;
+				}
+				break;
+			}
+		} while (eat);
+
+		/* Free pulled out fragments. */
+		while ((list = skb_shinfo(skb)->frag_list) != insp) {
+			skb_shinfo(skb)->frag_list = list->next;
+			kfree_skb(list);
+		}
+		/* And insert new clone at head. */
+		if (clone) {
+			clone->next = list;
+			skb_shinfo(skb)->frag_list = clone;
+		}
+	}
+	/* Success! Now we may commit changes to skb data. */
+
+pull_pages:
+	/* Compact the frag array: i reads, k writes. Frags consumed whole
+	 * are released; the first surviving one is advanced by what is left
+	 * of eat.
+	 */
+	eat = delta;
+	k = 0;
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		if (skb_shinfo(skb)->frags[i].size <= eat) {
+			put_page(skb_shinfo(skb)->frags[i].page);
+			eat -= skb_shinfo(skb)->frags[i].size;
+		} else {
+			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+			if (eat) {
+				skb_shinfo(skb)->frags[k].page_offset += eat;
+				skb_shinfo(skb)->frags[k].size -= eat;
+				eat = 0;
+			}
+			k++;
+		}
+	}
+	skb_shinfo(skb)->nr_frags = k;
+
+	/* The copied bytes now count as linear data. */
+	skb->tail     += delta;
+	skb->data_len -= delta;
+
+	return skb_tail_pointer(skb);
+}
+EXPORT_SYMBOL(__pskb_pull_tail);
+
+/* Copy some data bits from skb to kernel buffer. */
+
+int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
+{
+	struct sk_buff *child;
+	int seg_start = skb_headlen(skb);
+	int seg_end;
+	int chunk;
+	int idx;
+
+	/* The requested range must not run past the end of the skb. */
+	if (offset > (int)skb->len - len)
+		return -EFAULT;
+
+	/* Portion of the range that lives in the linear head. */
+	chunk = seg_start - offset;
+	if (chunk > 0) {
+		if (chunk > len)
+			chunk = len;
+		skb_copy_from_linear_data_offset(skb, offset, to, chunk);
+		len -= chunk;
+		if (len == 0)
+			return 0;
+		offset += chunk;
+		to     += chunk;
+	}
+
+	/* Page frags; [seg_start, seg_end) is the range of the current one. */
+	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
+		u8 *vaddr;
+
+		WARN_ON(seg_start > offset + len);
+
+		seg_end = seg_start + frag->size;
+		chunk = seg_end - offset;
+		if (chunk > 0) {
+			if (chunk > len)
+				chunk = len;
+
+			vaddr = kmap_skb_frag(frag);
+			memcpy(to,
+			       vaddr + frag->page_offset + offset - seg_start,
+			       chunk);
+			kunmap_skb_frag(vaddr);
+
+			len -= chunk;
+			if (len == 0)
+				return 0;
+			offset += chunk;
+			to     += chunk;
+		}
+		seg_start = seg_end;
+	}
+
+	/* Chained skbs: recurse with an offset relative to each member. */
+	skb_walk_frags(skb, child) {
+		WARN_ON(seg_start > offset + len);
+
+		seg_end = seg_start + child->len;
+		chunk = seg_end - offset;
+		if (chunk > 0) {
+			if (chunk > len)
+				chunk = len;
+			if (skb_copy_bits(child, offset - seg_start, to, chunk))
+				return -EFAULT;
+			len -= chunk;
+			if (len == 0)
+				return 0;
+			offset += chunk;
+			to     += chunk;
+		}
+		seg_start = seg_end;
+	}
+
+	/* Anything still outstanding could not be found in the skb. */
+	return len ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(skb_copy_bits);
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+	/* Drop the reference taken on slot @i when it was filled. */
+	struct page *unused = spd->pages[i];
+
+	put_page(unused);
+}
+
+/*
+ * Copy *len bytes found at *offset inside @page into the socket's
+ * sk_sndmsg_page scratch page. On return *offset is the position of the
+ * copy inside the returned page and *len may have been reduced to what
+ * fitted. Returns the scratch page with an extra reference, or NULL if a
+ * fresh page could not be allocated.
+ */
+static inline struct page *linear_to_page(struct page *page, unsigned int *len,
+					  unsigned int *offset,
+					  struct sk_buff *skb, struct sock *sk)
+{
+	struct page *dst = sk->sk_sndmsg_page;
+	unsigned int dst_off;
+
+	if (dst) {
+		unsigned int room;
+
+		dst_off = sk->sk_sndmsg_off;
+		room = PAGE_SIZE - dst_off;
+
+		/* Keep using the page unless the leftover is tiny (< 64)
+		 * and also too small for this request.
+		 */
+		if (room >= 64 || room >= *len) {
+			*len = min_t(unsigned int, *len, room);
+			goto copy;
+		}
+
+		put_page(dst);
+	}
+
+	/* Start a fresh scratch page. */
+	dst = alloc_pages(sk->sk_allocation, 0);
+	sk->sk_sndmsg_page = dst;
+	if (!dst)
+		return NULL;
+
+	/* hold one ref to this page until it's full */
+	sk->sk_sndmsg_off = 0;
+	dst_off = 0;
+
+copy:
+	memcpy(page_address(dst) + dst_off, page_address(page) + *offset, *len);
+	sk->sk_sndmsg_off += *len;
+	*offset = dst_off;
+	get_page(dst);
+
+	return dst;
+}
+
+/*
+ * Fill page/offset/length into spd, if it can hold more pages.
+ */
+static inline int spd_fill_page(struct splice_pipe_desc *spd,
+				struct pipe_inode_info *pipe, struct page *page,
+				unsigned int *len, unsigned int offset,
+				struct sk_buff *skb, int linear,
+				struct sock *sk)
+{
+	struct partial_page *part;
+
+	/* No free slot left in the descriptor. */
+	if (unlikely(spd->nr_pages == pipe->buffers))
+		return 1;
+
+	if (!linear) {
+		/* Page frags are referenced in place. */
+		get_page(page);
+	} else {
+		/* Linear data is copied into a socket-owned page first;
+		 * this may adjust *len and offset.
+		 */
+		page = linear_to_page(page, len, &offset, skb, sk);
+		if (!page)
+			return 1;
+	}
+
+	part = &spd->partial[spd->nr_pages];
+	part->len = *len;
+	part->offset = offset;
+	spd->pages[spd->nr_pages] = page;
+	spd->nr_pages++;
+
+	return 0;
+}
+
+/*
+ * Advance a (page, offset, remaining length) cursor by @off bytes,
+ * stepping to a later page when the offset crosses a page boundary.
+ */
+static inline void __segment_seek(struct page **page, unsigned int *poff,
+				  unsigned int *plen, unsigned int off)
+{
+	unsigned long pages_crossed;
+
+	*plen -= off;
+	*poff += off;
+
+	pages_crossed = *poff / PAGE_SIZE;
+	if (pages_crossed)
+		*page = nth_page(*page, pages_crossed);
+
+	*poff %= PAGE_SIZE;
+}
+
+/*
+ * Feed one contiguous segment (@page, @poff, @plen) into @spd, honouring
+ * the bytes still to skip (*off) and the bytes still wanted (*len).
+ * Returns 1 when the caller should stop (nothing left to splice or the
+ * descriptor could not take another page), 0 to carry on.
+ */
+static inline int __splice_segment(struct page *page, unsigned int poff,
+				   unsigned int plen, unsigned int *off,
+				   unsigned int *len, struct sk_buff *skb,
+				   struct splice_pipe_desc *spd, int linear,
+				   struct sock *sk,
+				   struct pipe_inode_info *pipe)
+{
+	unsigned int chunk;
+
+	if (*len == 0)
+		return 1;
+
+	/* The whole segment lies before the requested start. */
+	if (plen <= *off) {
+		*off -= plen;
+		return 0;
+	}
+
+	/* Skip the leading part that was already processed. */
+	if (*off != 0) {
+		__segment_seek(&page, &poff, &plen, *off);
+		*off = 0;
+	}
+
+	for (;;) {
+		chunk = min(*len, plen);
+
+		/* the linear region may spread across several pages  */
+		chunk = min_t(unsigned int, chunk, PAGE_SIZE - poff);
+
+		/* spd_fill_page() may shrink chunk to what it really took. */
+		if (spd_fill_page(spd, pipe, page, &chunk, poff, skb, linear, sk))
+			return 1;
+
+		__segment_seek(&page, &poff, &plen, chunk);
+		*len -= chunk;
+
+		if (*len == 0 || plen == 0)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Map linear and fragment data from the skb to spd. It reports failure if the
+ * pipe is full or if we already spliced the requested length.
+ */
+static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+			     unsigned int *offset, unsigned int *len,
+			     struct splice_pipe_desc *spd, struct sock *sk)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct page *head_page = virt_to_page(skb->data);
+	unsigned int head_off = (unsigned long) skb->data & (PAGE_SIZE - 1);
+	int idx;
+
+	/* Linear part first; it is flagged so that it gets copied. */
+	if (__splice_segment(head_page, head_off, skb_headlen(skb),
+			     offset, len, skb, spd, 1, sk, pipe))
+		return 1;
+
+	/* Then each page fragment, referenced in place. */
+	for (idx = 0; idx < shinfo->nr_frags; idx++) {
+		const skb_frag_t *frag = &shinfo->frags[idx];
+
+		if (__splice_segment(frag->page, frag->page_offset,
+				     frag->size, offset, len, skb, spd, 0,
+				     sk, pipe))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Map data from the skb to a pipe. Should handle both the linear part,
+ * the fragments, and the frag list. It does NOT handle frag lists within
+ * the frag list, if such a thing exists. We'd probably need to recurse to
+ * handle that cleanly.
+ */
+/*
+ * @skb:    buffer to read from; skb->sk is the socket whose lock is
+ *          dropped around splice_to_pipe()
+ * @offset: number of leading bytes of @skb to skip
+ * @pipe:   destination pipe
+ * @tlen:   maximum number of bytes to map
+ * @flags:  stored in the splice descriptor's .flags
+ *
+ * Returns -ENOMEM if splice_grow_spd() fails, 0 if no page was mapped,
+ * otherwise whatever splice_to_pipe() returns.
+ */
+int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+		    struct pipe_inode_info *pipe, unsigned int tlen,
+		    unsigned int flags)
+{
+	/* Default-sized on-stack arrays backing the descriptor. */
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &sock_pipe_buf_ops,
+		.spd_release = sock_spd_release,
+	};
+	struct sk_buff *frag_iter;
+	struct sock *sk = skb->sk;
+	int ret = 0;
+
+	/* Paired with splice_shrink_spd() at the end of the function. */
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	/*
+	 * __skb_splice_bits() only fails if the output has no room left,
+	 * so no point in going over the frag_list for the error case.
+	 */
+	if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
+		goto done;
+	else if (!tlen)
+		goto done;
+
+	/*
+	 * now see if we have a frag_list to map
+	 */
+	skb_walk_frags(skb, frag_iter) {
+		if (!tlen)
+			break;
+		if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
+			break;
+	}
+
+done:
+	if (spd.nr_pages) {
+		/*
+		 * Drop the socket lock, otherwise we have reverse
+		 * locking dependencies between sk_lock and i_mutex
+		 * here as compared to sendfile(). We enter here
+		 * with the socket lock held, and splice_to_pipe() will
+		 * grab the pipe inode lock. For sendfile() emulation,
+		 * we call into ->sendpage() with the i_mutex lock held
+		 * and networking will grab the socket lock.
+		 */
+		release_sock(sk);
+		ret = splice_to_pipe(pipe, &spd);
+		lock_sock(sk);
+	}
+
+	splice_shrink_spd(pipe, &spd);
+	return ret;
+}
+
+/**
+ *	skb_store_bits - store bits from kernel buffer to skb
+ *	@skb: destination buffer
+ *	@offset: offset in destination
+ *	@from: source buffer
+ *	@len: number of bytes to copy
+ *
+ *	Copy the specified number of bytes from the source buffer to the
+ *	destination skb.  This function handles all the messy bits of
+ *	traversing fragment lists and such.
+ */
+
+int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
+{
+	struct sk_buff *child;
+	int seg_start = skb_headlen(skb);
+	int seg_end;
+	int chunk;
+	int idx;
+
+	/* The target range must lie entirely inside the skb. */
+	if (offset > (int)skb->len - len)
+		return -EFAULT;
+
+	/* Portion of the range that lands in the linear head. */
+	chunk = seg_start - offset;
+	if (chunk > 0) {
+		if (chunk > len)
+			chunk = len;
+		skb_copy_to_linear_data_offset(skb, offset, from, chunk);
+		len -= chunk;
+		if (len == 0)
+			return 0;
+		offset += chunk;
+		from += chunk;
+	}
+
+	/* Page frags; [seg_start, seg_end) is the range of the current one. */
+	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
+		u8 *vaddr;
+
+		WARN_ON(seg_start > offset + len);
+
+		seg_end = seg_start + frag->size;
+		chunk = seg_end - offset;
+		if (chunk > 0) {
+			if (chunk > len)
+				chunk = len;
+
+			vaddr = kmap_skb_frag(frag);
+			memcpy(vaddr + frag->page_offset + offset - seg_start,
+			       from, chunk);
+			kunmap_skb_frag(vaddr);
+
+			len -= chunk;
+			if (len == 0)
+				return 0;
+			offset += chunk;
+			from += chunk;
+		}
+		seg_start = seg_end;
+	}
+
+	/* Chained skbs: recurse with an offset relative to each member. */
+	skb_walk_frags(skb, child) {
+		WARN_ON(seg_start > offset + len);
+
+		seg_end = seg_start + child->len;
+		chunk = seg_end - offset;
+		if (chunk > 0) {
+			if (chunk > len)
+				chunk = len;
+			if (skb_store_bits(child, offset - seg_start,
+					   from, chunk))
+				return -EFAULT;
+			len -= chunk;
+			if (len == 0)
+				return 0;
+			offset += chunk;
+			from += chunk;
+		}
+		seg_start = seg_end;
+	}
+
+	/* Bytes left over had no place to go. */
+	return len ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(skb_store_bits);
+
+/* Checksum skb data. */
+
+/*
+ * Checksum @len bytes of @skb starting at @offset, continuing from the
+ * running value @csum, and return the updated checksum. The range is
+ * walked across the linear head, the page frags and the frag_list in
+ * that order. There is no bounds check up front: asking for more data
+ * than the skb holds ends in BUG_ON(len).
+ */
+__wsum skb_checksum(const struct sk_buff *skb, int offset,
+			  int len, __wsum csum)
+{
+	/* Offset at which the area currently being examined begins. */
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+	/* Bytes checksummed so far; handed to csum_block_add() as the
+	 * position of each newly added block.
+	 */
+	int pos = 0;
+
+	/* Checksum header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		csum = csum_partial(skb->data + offset, copy, csum);
+		if ((len -= copy) == 0)
+			return csum;
+		offset += copy;
+		pos	= copy;
+	}
+
+	/* Page frags: each is mapped, summed on its own from 0 (csum2)
+	 * and then merged into the running checksum.
+	 */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			__wsum csum2;
+			u8 *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap_skb_frag(frag);
+			csum2 = csum_partial(vaddr + frag->page_offset +
+					     offset - start, copy, 0);
+			kunmap_skb_frag(vaddr);
+			csum = csum_block_add(csum, csum2, pos);
+			if (!(len -= copy))
+				return csum;
+			offset += copy;
+			pos    += copy;
+		}
+		start = end;
+	}
+
+	/* Chained skbs: recurse with an offset relative to each member. */
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			__wsum csum2;
+			if (copy > len)
+				copy = len;
+			csum2 = skb_checksum(frag_iter, offset - start,
+					     copy, 0);
+			csum = csum_block_add(csum, csum2, pos);
+			if ((len -= copy) == 0)
+				return csum;
+			offset += copy;
+			pos    += copy;
+		}
+		start = end;
+	}
+	/* The caller asked for more bytes than the skb contains. */
+	BUG_ON(len);
+
+	return csum;
+}
+EXPORT_SYMBOL(skb_checksum);
+
+/* Both of above in one bottle. */
+
+/*
+ * Copy @len bytes of @skb starting at @offset into @to while checksumming
+ * them, continuing from the running value @csum; returns the updated
+ * checksum. The range is walked across the linear head, the page frags
+ * and the frag_list in that order. There is no bounds check up front:
+ * asking for more data than the skb holds ends in BUG_ON(len).
+ */
+__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
+				    u8 *to, int len, __wsum csum)
+{
+	/* Offset at which the area currently being examined begins. */
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+	/* Bytes processed so far; handed to csum_block_add() as the
+	 * position of each newly added block.
+	 */
+	int pos = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		csum = csum_partial_copy_nocheck(skb->data + offset, to,
+						 copy, csum);
+		if ((len -= copy) == 0)
+			return csum;
+		offset += copy;
+		to     += copy;
+		pos	= copy;
+	}
+
+	/* Page frags: each is mapped, copied and summed on its own from 0
+	 * (csum2), then merged into the running checksum.
+	 */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			__wsum csum2;
+			u8 *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap_skb_frag(frag);
+			csum2 = csum_partial_copy_nocheck(vaddr +
+							  frag->page_offset +
+							  offset - start, to,
+							  copy, 0);
+			kunmap_skb_frag(vaddr);
+			csum = csum_block_add(csum, csum2, pos);
+			if (!(len -= copy))
+				return csum;
+			offset += copy;
+			to     += copy;
+			pos    += copy;
+		}
+		start = end;
+	}
+
+	/* Chained skbs: recurse with an offset relative to each member. */
+	skb_walk_frags(skb, frag_iter) {
+		__wsum csum2;
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			csum2 = skb_copy_and_csum_bits(frag_iter,
+						       offset - start,
+						       to, copy, 0);
+			csum = csum_block_add(csum, csum2, pos);
+			if ((len -= copy) == 0)
+				return csum;
+			offset += copy;
+			to     += copy;
+			pos    += copy;
+		}
+		start = end;
+	}
+	/* The caller asked for more bytes than the skb contains. */
+	BUG_ON(len);
+	return csum;
+}
+EXPORT_SYMBOL(skb_copy_and_csum_bits);
+
+/*
+ * Copy the whole of @skb into the flat buffer @to. For a CHECKSUM_PARTIAL
+ * skb the bytes from the checksum start onwards are summed while being
+ * copied and the folded result is written at csum_offset in the copy.
+ */
+void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
+{
+	__wsum sum = 0;
+	long sum_start;
+
+	/* Without a pending checksum, "start" is the end of the head. */
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		sum_start = skb_headlen(skb);
+	else
+		sum_start = skb_checksum_start_offset(skb);
+
+	/* The checksum must begin inside the linear head. */
+	BUG_ON(sum_start > skb_headlen(skb));
+
+	/* Bytes in front of the checksum start are copied verbatim. */
+	skb_copy_from_linear_data(skb, to, sum_start);
+
+	if (sum_start != skb->len)
+		sum = skb_copy_and_csum_bits(skb, sum_start, to + sum_start,
+					     skb->len - sum_start, 0);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		long field_off = sum_start + skb->csum_offset;
+		__sum16 *field = (__sum16 *)(to + field_off);
+
+		*field = csum_fold(sum);
+	}
+}
+EXPORT_SYMBOL(skb_copy_and_csum_dev);
+
+/**
+ *	skb_dequeue - remove from the head of the queue
+ *	@list: list to dequeue from
+ *
+ *	Remove the head of the list. The list lock is taken so the function
+ *	may be used safely with other locking list functions. The head item is
+ *	returned or %NULL if the list is empty.
+ */
+
+struct sk_buff *skb_dequeue(struct sk_buff_head *list)
+{
+	struct sk_buff *head;
+	unsigned long irqflags;
+
+	/* Locked wrapper around __skb_dequeue(). */
+	spin_lock_irqsave(&list->lock, irqflags);
+	head = __skb_dequeue(list);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+
+	return head;
+}
+EXPORT_SYMBOL(skb_dequeue);
+
+/**
+ *	skb_dequeue_tail - remove from the tail of the queue
+ *	@list: list to dequeue from
+ *
+ *	Remove the tail of the list. The list lock is taken so the function
+ *	may be used safely with other locking list functions. The tail item is
+ *	returned or %NULL if the list is empty.
+ */
+struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
+{
+	struct sk_buff *last;
+	unsigned long irqflags;
+
+	/* Locked wrapper around __skb_dequeue_tail(). */
+	spin_lock_irqsave(&list->lock, irqflags);
+	last = __skb_dequeue_tail(list);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+
+	return last;
+}
+EXPORT_SYMBOL(skb_dequeue_tail);
+
+/**
+ *	skb_queue_purge - empty a list
+ *	@list: list to empty
+ *
+ *	Delete all buffers on an &sk_buff list. Each buffer is removed from
+ *	the list and one reference dropped. This function takes the list
+ *	lock and is atomic with respect to other list locking functions.
+ */
+void skb_queue_purge(struct sk_buff_head *list)
+{
+	struct sk_buff *victim;
+
+	/* Each skb_dequeue() call takes and releases the list lock. */
+	for (victim = skb_dequeue(list); victim != NULL;
+	     victim = skb_dequeue(list))
+		kfree_skb(victim);
+}
+EXPORT_SYMBOL(skb_queue_purge);
+
+/**
+ *	skb_queue_head - queue a buffer at the list head
+ *	@list: list to use
+ *	@newsk: buffer to queue
+ *
+ *	Queue a buffer at the start of the list. This function takes the
+ *	list lock and can be used safely with other locking &sk_buff
+ *	functions.
+ *
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+	unsigned long irqflags;
+
+	/* Locked wrapper around __skb_queue_head(). */
+	spin_lock_irqsave(&list->lock, irqflags);
+	__skb_queue_head(list, newsk);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+}
+EXPORT_SYMBOL(skb_queue_head);
+
+/**
+ *	skb_queue_tail - queue a buffer at the list tail
+ *	@list: list to use
+ *	@newsk: buffer to queue
+ *
+ *	Queue a buffer at the tail of the list. This function takes the
+ *	list lock and can be used safely with other locking &sk_buff
+ *	functions.
+ *
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
+{
+	unsigned long irqflags;
+
+	/* Locked wrapper around __skb_queue_tail(). */
+	spin_lock_irqsave(&list->lock, irqflags);
+	__skb_queue_tail(list, newsk);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+}
+EXPORT_SYMBOL(skb_queue_tail);
+
+/**
+ *	skb_unlink	-	remove a buffer from a list
+ *	@skb: buffer to remove
+ *	@list: list to use
+ *
+ *	Remove a packet from a list. The list locks are taken and this
+ *	function is atomic with respect to other list locked calls
+ *
+ *	You must know what list the SKB is on.
+ */
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
+{
+	unsigned long irqflags;
+
+	/* Locked wrapper around __skb_unlink(). */
+	spin_lock_irqsave(&list->lock, irqflags);
+	__skb_unlink(skb, list);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+}
+EXPORT_SYMBOL(skb_unlink);
+
+/**
+ *	skb_append	-	append a buffer
+ *	@old: buffer to insert after
+ *	@newsk: buffer to insert
+ *	@list: list to use
+ *
+ *	Place a packet after a given packet in a list. The list locks are taken
+ *	and this function is atomic with respect to other list locked calls.
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+	unsigned long irqflags;
+
+	/* Locked wrapper around __skb_queue_after(). */
+	spin_lock_irqsave(&list->lock, irqflags);
+	__skb_queue_after(list, old, newsk);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+}
+EXPORT_SYMBOL(skb_append);
+
+/**
+ *	skb_insert	-	insert a buffer
+ *	@old: buffer to insert before
+ *	@newsk: buffer to insert
+ *	@list: list to use
+ *
+ *	Place a packet before a given packet in a list. The list locks are
+ * 	taken and this function is atomic with respect to other list locked
+ *	calls.
+ *
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
+{
+	struct sk_buff *before;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&list->lock, irqflags);
+	/* old->prev is only read once the list lock is held. */
+	before = old->prev;
+	__skb_insert(newsk, before, old, list);
+	spin_unlock_irqrestore(&list->lock, irqflags);
+}
+EXPORT_SYMBOL(skb_insert);
+
+/*
+ * Split point @len lies inside the linear head (@pos is the head length):
+ * the head bytes past @len are copied into @skb1 and every page frag is
+ * handed over to @skb1 unchanged.
+ */
+static inline void skb_split_inside_header(struct sk_buff *skb,
+					   struct sk_buff* skb1,
+					   const u32 len, const int pos)
+{
+	struct skb_shared_info *src = skb_shinfo(skb);
+	struct skb_shared_info *dst = skb_shinfo(skb1);
+	unsigned char *dst_head;
+	int idx;
+
+	/* Tail of the linear head becomes the linear head of skb1. */
+	dst_head = skb_put(skb1, pos - len);
+	skb_copy_from_linear_data_offset(skb, len, dst_head, pos - len);
+
+	/* And move data appendix as is. */
+	for (idx = 0; idx < src->nr_frags; idx++)
+		dst->frags[idx] = src->frags[idx];
+	dst->nr_frags = src->nr_frags;
+	src->nr_frags = 0;
+
+	/* All paged bytes now belong to skb1. */
+	skb1->data_len = skb->data_len;
+	skb1->len     += skb1->data_len;
+
+	/* skb keeps only the first len linear bytes. */
+	skb->data_len = 0;
+	skb->len      = len;
+	skb_set_tail_pointer(skb, len);
+}
+
+/*
+ * Split point @len lies at or beyond the end of the linear head (@pos is
+ * the head length on entry and is then used as the running offset of the
+ * frag being examined). Frags entirely before @len stay in @skb, frags
+ * entirely after move to @skb1, and a frag that straddles @len is shared
+ * between both.
+ */
+static inline void skb_split_no_header(struct sk_buff *skb,
+				       struct sk_buff* skb1,
+				       const u32 len, int pos)
+{
+	/* i indexes skb's original frags, k the frags written to skb1. */
+	int i, k = 0;
+	const int nfrags = skb_shinfo(skb)->nr_frags;
+
+	/* skb's frag count is rebuilt below as frags are classified. */
+	skb_shinfo(skb)->nr_frags = 0;
+	skb1->len		  = skb1->data_len = skb->len - len;
+	skb->len		  = len;
+	skb->data_len		  = len - pos;
+
+	for (i = 0; i < nfrags; i++) {
+		int size = skb_shinfo(skb)->frags[i].size;
+
+		if (pos + size > len) {
+			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
+
+			if (pos < len) {
+				/* Split frag.
+				 * There are two possible approaches here:
+				 * 1. Move the whole frag to the second
+				 *    part, if it is possible. E.g.
+				 *    this approach is mandatory for TUX,
+				 *    where splitting is expensive.
+				 * 2. Split it exactly at len. This is what
+				 *    is done here.
+				 */
+				/* The page is now referenced by both skbs.
+				 * This branch is only reachable for the
+				 * first frag copied to skb1 (k == 0), hence
+				 * the fixed index 0 below.
+				 */
+				get_page(skb_shinfo(skb)->frags[i].page);
+				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+				skb_shinfo(skb1)->frags[0].size -= len - pos;
+				skb_shinfo(skb)->frags[i].size	= len - pos;
+				skb_shinfo(skb)->nr_frags++;
+			}
+			k++;
+		} else
+			skb_shinfo(skb)->nr_frags++;
+		pos += size;
+	}
+	skb_shinfo(skb1)->nr_frags = k;
+}
+
+/**
+ * skb_split - Split fragmented skb to two parts at length len.
+ * @skb: the buffer to split
+ * @skb1: the buffer to receive the second part
+ * @len: new length for skb
+ */
+void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
+{
+	const int headlen = skb_headlen(skb);
+
+	if (len >= headlen) {
+		/* Second chunk has no header, nothing to copy. */
+		skb_split_no_header(skb, skb1, len, headlen);
+		return;
+	}
+
+	/* Split line is inside header. */
+	skb_split_inside_header(skb, skb1, len, headlen);
+}
+EXPORT_SYMBOL(skb_split);
+
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * Caller cannot keep skb_shinfo related pointers past calling here!
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+	/* An unshared skb is ready as it is. */
+	if (!skb_cloned(skb))
+		return 0;
+
+	/* Give the skb a private head; report 1 if that failed. */
+	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC) != 0;
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from skb to tgt. Returns number of bytes shifted.
+ * It's up to caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+	/* from: next frag index to take from skb
+	 * to:   next free frag slot in tgt
+	 * merge: index of the tgt frag that skb's first frag is coalesced
+	 *        into, or -1 when no coalescing takes place
+	 * todo: bytes still to be shifted
+	 */
+	int from, to, merge, todo;
+	struct skb_frag_struct *fragfrom, *fragto;
+
+	BUG_ON(shiftlen > skb->len);
+	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */
+
+	todo = shiftlen;
+	from = 0;
+	to = skb_shinfo(tgt)->nr_frags;
+	fragfrom = &skb_shinfo(skb)->frags[from];
+
+	/* Actual merge is delayed until the point when we know we can
+	 * commit all, so that we don't have to undo partial changes
+	 */
+	if (!to ||
+	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+		merge = -1;
+	} else {
+		merge = to - 1;
+
+		todo -= fragfrom->size;
+		if (todo < 0) {
+			/* The whole shift fits inside the coalesced frag:
+			 * only the boundary between the two frags moves.
+			 */
+			if (skb_prepare_for_shift(skb) ||
+			    skb_prepare_for_shift(tgt))
+				return 0;
+
+			/* All previous frag pointers might be stale! */
+			fragfrom = &skb_shinfo(skb)->frags[from];
+			fragto = &skb_shinfo(tgt)->frags[merge];
+
+			fragto->size += shiftlen;
+			fragfrom->size -= shiftlen;
+			fragfrom->page_offset += shiftlen;
+
+			goto onlymerged;
+		}
+
+		/* skb's first frag is consumed entirely by the merge. */
+		from++;
+	}
+
+	/* Skip full, not-fitting skb to avoid expensive operations */
+	if ((shiftlen == skb->len) &&
+	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+		return 0;
+
+	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+		return 0;
+
+	/* Stage frags into tgt's free slots. tgt->nr_frags is not updated
+	 * until the commit below, so returning 0 from inside the loop
+	 * leaves tgt's visible frag count as it was.
+	 */
+	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+		if (to == MAX_SKB_FRAGS)
+			return 0;
+
+		fragfrom = &skb_shinfo(skb)->frags[from];
+		fragto = &skb_shinfo(tgt)->frags[to];
+
+		if (todo >= fragfrom->size) {
+			/* Whole frag moves; the struct is copied as is. */
+			*fragto = *fragfrom;
+			todo -= fragfrom->size;
+			from++;
+			to++;
+
+		} else {
+			/* Frag is split: both skbs end up referencing the
+			 * page, so take an extra reference.
+			 */
+			get_page(fragfrom->page);
+			fragto->page = fragfrom->page;
+			fragto->page_offset = fragfrom->page_offset;
+			fragto->size = todo;
+
+			fragfrom->page_offset += todo;
+			fragfrom->size -= todo;
+			todo = 0;
+
+			to++;
+			break;
+		}
+	}
+
+	/* Ready to "commit" this state change to tgt */
+	skb_shinfo(tgt)->nr_frags = to;
+
+	if (merge >= 0) {
+		/* Perform the delayed merge: tgt's frag absorbs skb's first
+		 * frag and skb's reference on the page is dropped.
+		 */
+		fragfrom = &skb_shinfo(skb)->frags[0];
+		fragto = &skb_shinfo(tgt)->frags[merge];
+
+		fragto->size += fragfrom->size;
+		put_page(fragfrom->page);
+	}
+
+	/* Reposition in the original skb */
+	to = 0;
+	while (from < skb_shinfo(skb)->nr_frags)
+		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+	skb_shinfo(skb)->nr_frags = to;
+
+	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+	/* Most likely the tgt won't ever need its checksum anymore, skb on
+	 * the other hand might need it if it needs to be resent
+	 */
+	tgt->ip_summed = CHECKSUM_PARTIAL;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	/* Yak, is it really working this way? Some helper please? */
+	skb->len -= shiftlen;
+	skb->data_len -= shiftlen;
+	skb->truesize -= shiftlen;
+	tgt->len += shiftlen;
+	tgt->data_len += shiftlen;
+	tgt->truesize += shiftlen;
+
+	return shiftlen;
+}
+
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable
+ *
+ * Initializes the specified state variable. Must be called before
+ * invoking skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+			  unsigned int to, struct skb_seq_state *st)
+{
+	/* Window of the skb that will be read. */
+	st->lower_offset = from;
+	st->upper_offset = to;
+
+	/* Reading starts at the root skb itself. */
+	st->root_skb = skb;
+	st->cur_skb = skb;
+
+	/* Nothing stepped over yet, no frag mapped. */
+	st->stepped_offset = 0;
+	st->frag_idx = 0;
+	st->frag_data = NULL;
+}
+EXPORT_SYMBOL(skb_prepare_seq_read);
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at @consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to @data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. @consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Note 1: The size of each block of data returned can be arbitrary,
+ *       this limitation is the cost for zerocopy sequential
+ *       reads of potentially non linear data.
+ *
+ * Note 2: Fragment lists within fragments are not implemented
+ *       at the moment, state->root_skb could be replaced with
+ *       a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+			  struct skb_seq_state *st)
+{
+	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+	skb_frag_t *frag;
+
+	if (unlikely(abs_offset >= st->upper_offset))
+		return 0;	/* upper offset reached */
+
+next_skb:
+	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
+
+	if (abs_offset < block_limit && !st->frag_data) {
+		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
+		return block_limit - abs_offset;
+	}
+
+	if (st->frag_idx == 0 && !st->frag_data)
+		st->stepped_offset += skb_headlen(st->cur_skb);
+
+	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+		block_limit = frag->size + st->stepped_offset;
+
+		if (abs_offset < block_limit) {
+			if (!st->frag_data)
+				st->frag_data = kmap_skb_frag(frag);
+
+			*data = (u8 *) st->frag_data + frag->page_offset +
+				(abs_offset - st->stepped_offset);
+
+			return block_limit - abs_offset;
+		}
+
+		if (st->frag_data) {
+			kunmap_skb_frag(st->frag_data);
+			st->frag_data = NULL;
+		}
+
+		st->frag_idx++;	/* fragment ends before abs_offset */
+		st->stepped_offset += frag->size;
+	}
+
+	if (st->frag_data) {
+		kunmap_skb_frag(st->frag_data);
+		st->frag_data = NULL;
+	}
+
+	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
+		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+		st->frag_idx = 0;
+		goto next_skb;
+	} else if (st->cur_skb->next) {
+		st->cur_skb = st->cur_skb->next;
+		st->frag_idx = 0;
+		goto next_skb;
+	}
+
+	return 0;	/* ran out of skbs to walk */
+}
+EXPORT_SYMBOL(skb_seq_read);
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Must be called if skb_seq_read() was not called until it returned 0;
+ * unmaps the fragment that skb_seq_read() left mapped, if any.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+	if (st->frag_data)
+		kunmap_skb_frag(st->frag_data);
+}
+EXPORT_SYMBOL(skb_abort_seq_read);
+
+#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))
+
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+					  struct ts_config *conf,
+					  struct ts_state *state)
+{	/* get_next_block hook set by skb_find_text(); conf is unused */
+	return skb_seq_read(offset, text, TS_SKB_CB(state));
+}
+
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{	/* finish hook set by skb_find_text(); conf is unused */
+	skb_abort_seq_read(TS_SKB_CB(state));
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration
+ * @state: uninitialized textsearch state variable
+ *
+ * Finds a pattern in the skb data according to the specified textsearch
+ * configuration; @config's get_next_block and finish hooks are overwritten.
+ * Use textsearch_next() to retrieve subsequent occurrences. Returns the offset
+ * of the first occurrence, or UINT_MAX if none was found within @to - @from.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+			   unsigned int to, struct ts_config *config,
+			   struct ts_state *state)
+{
+	unsigned int ret;
+
+	config->get_next_block = skb_ts_get_next_block;
+	config->finish = skb_ts_finish;
+
+	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
+
+	ret = textsearch_find(config, state);
+	return (ret <= to - from ? ret : UINT_MAX);
+}
+EXPORT_SYMBOL(skb_find_text);
+
+/**
+ * skb_append_datato_frags - append the user data to a skb
+ * @sk: sock structure
+ * @skb: skb structure to be appended with user data.
+ * @getfrag: callback function to be used for getting the user data
+ * @from: pointer to user message iov
+ * @length: length of the iov message
+ *
+ * Description: Appends the user data to the page fragments of the skb, one
+ * new page per fragment. Returns 0, -ENOMEM (page allocation), or -EFAULT.
+ */
+int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
+			int (*getfrag)(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			void *from, int length)
+{
+	int frg_cnt = 0;
+	skb_frag_t *frag = NULL;
+	struct page *page = NULL;
+	int copy, left;
+	int offset = 0;
+	int ret;
+
+	do {
+		/* Return error if we don't have space for new frag */
+		frg_cnt = skb_shinfo(skb)->nr_frags;
+		if (frg_cnt >= MAX_SKB_FRAGS)
+			return -EFAULT;
+
+		/* allocate a new page for next frag */
+		page = alloc_pages(sk->sk_allocation, 0);
+
+		/* If alloc_page fails just return failure and caller will
+		 * free previous allocated pages by doing kfree_skb()
+		 */
+		if (page == NULL)
+			return -ENOMEM;
+
+		/* initialize the next frag */
+		skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
+		skb->truesize += PAGE_SIZE;
+		atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+
+		/* get the new initialized frag */
+		frg_cnt = skb_shinfo(skb)->nr_frags;
+		frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
+
+		/* copy the user data to page */
+		left = PAGE_SIZE - frag->page_offset;
+		copy = (length > left)? left : length;
+
+		ret = getfrag(from, (page_address(frag->page) +
+			    frag->page_offset + frag->size),
+			    offset, copy, 0, skb);
+		if (ret < 0)
+			return -EFAULT;	/* getfrag's own code is dropped */
+
+		/* copy was successful so update the size parameters */
+		frag->size += copy;
+		skb->len += copy;
+		skb->data_len += copy;
+		offset += copy;
+		length -= copy;
+
+	} while (length > 0);
+
+	return 0;
+}
+EXPORT_SYMBOL(skb_append_datato_frags);
+
+/**
+ *	skb_pull_rcsum - pull skb and update receive checksum
+ *	@skb: buffer to update
+ *	@len: length of data pulled
+ *
+ *	This function performs an skb_pull on the packet and updates the
+ *	CHECKSUM_COMPLETE checksum, returning the new skb->data.  Use it on
+ *	receive path processing instead of skb_pull unless you know that the
+ *	checksum difference is zero (e.g., a valid IP header) or you are
+ *	setting ip_summed to CHECKSUM_NONE.  BUGs if @len exceeds skb->len.
+ */
+unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+{
+	BUG_ON(len > skb->len);
+	skb->len -= len;
+	BUG_ON(skb->len < skb->data_len);	/* pull exceeds linear data */
+	skb_postpull_rcsum(skb, skb->data, len);
+	return skb->data += len;
+}
+EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+
+/**
+ *	skb_segment - Perform protocol segmentation on skb.
+ *	@skb: buffer to segment
+ *	@features: features for the output path (see dev->features)
+ *
+ *	This function splits the given skb into segments carrying at most
+ *	skb_shinfo(skb)->gso_size payload bytes each.  It returns a pointer
+ *	to the first in a list of new skbs, or ERR_PTR(err) on error.
+ */
+struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = NULL;
+	struct sk_buff *tail = NULL;
+	struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
+	unsigned int mss = skb_shinfo(skb)->gso_size;
+	unsigned int doffset = skb->data - skb_mac_header(skb);
+	unsigned int offset = doffset;
+	unsigned int headroom;
+	unsigned int len;
+	int sg = !!(features & NETIF_F_SG);
+	int nfrags = skb_shinfo(skb)->nr_frags;
+	int err = -ENOMEM;
+	int i = 0;
+	int pos;
+
+	__skb_push(skb, doffset);	/* skb->data now at the MAC header */
+	headroom = skb_headroom(skb);
+	pos = skb_headlen(skb);
+
+	do {
+		struct sk_buff *nskb;
+		skb_frag_t *frag;
+		int hsize;
+		int size;
+
+		len = skb->len - offset;
+		if (len > mss)
+			len = mss;
+
+		hsize = skb_headlen(skb) - offset;
+		if (hsize < 0)
+			hsize = 0;
+		if (hsize > len || !sg)
+			hsize = len;
+
+		if (!hsize && i >= nfrags) {	/* only frag_list data left */
+			BUG_ON(fskb->len != len);
+
+			pos += len;
+			nskb = skb_clone(fskb, GFP_ATOMIC);
+			fskb = fskb->next;
+
+			if (unlikely(!nskb))
+				goto err;
+
+			hsize = skb_end_pointer(nskb) - nskb->head;
+			if (skb_cow_head(nskb, doffset + headroom)) {
+				kfree_skb(nskb);
+				goto err;
+			}
+
+			nskb->truesize += skb_end_pointer(nskb) - nskb->head -
+					  hsize;
+			skb_release_head_state(nskb);
+			__skb_push(nskb, doffset);
+		} else {
+			nskb = alloc_skb(hsize + doffset + headroom,
+					 GFP_ATOMIC);
+
+			if (unlikely(!nskb))
+				goto err;
+
+			skb_reserve(nskb, headroom);
+			__skb_put(nskb, doffset);
+		}
+
+		if (segs)
+			tail->next = nskb;
+		else
+			segs = nskb;
+		tail = nskb;
+
+		__copy_skb_header(nskb, skb);
+		nskb->mac_len = skb->mac_len;
+
+		/* nskb and skb might have different headroom */
+		if (nskb->ip_summed == CHECKSUM_PARTIAL)
+			nskb->csum_start += skb_headroom(nskb) - headroom;
+
+		skb_reset_mac_header(nskb);
+		skb_set_network_header(nskb, skb->mac_len);
+		nskb->transport_header = (nskb->network_header +
+					  skb_network_header_len(skb));
+		skb_copy_from_linear_data(skb, nskb->data, doffset);
+
+		if (fskb != skb_shinfo(skb)->frag_list)
+			continue;
+
+		if (!sg) {
+			nskb->ip_summed = CHECKSUM_NONE;
+			nskb->csum = skb_copy_and_csum_bits(skb, offset,
+							    skb_put(nskb, len),
+							    len, 0);
+			continue;
+		}
+
+		frag = skb_shinfo(nskb)->frags;
+
+		skb_copy_from_linear_data_offset(skb, offset,
+						 skb_put(nskb, hsize), hsize);
+
+		while (pos < offset + len && i < nfrags) {
+			*frag = skb_shinfo(skb)->frags[i];
+			get_page(frag->page);
+			size = frag->size;
+
+			if (pos < offset) {
+				frag->page_offset += offset - pos;
+				frag->size -= offset - pos;
+			}
+
+			skb_shinfo(nskb)->nr_frags++;
+
+			if (pos + size <= offset + len) {
+				i++;
+				pos += size;
+			} else {
+				frag->size -= pos + size - (offset + len);
+				goto skip_fraglist;
+			}
+
+			frag++;
+		}
+
+		if (pos < offset + len) {
+			struct sk_buff *fskb2 = fskb;
+
+			BUG_ON(pos + fskb->len != offset + len);
+
+			pos += fskb->len;
+			fskb = fskb->next;
+
+			if (fskb2->next) {
+				fskb2 = skb_clone(fskb2, GFP_ATOMIC);
+				if (!fskb2)
+					goto err;
+			} else
+				skb_get(fskb2);
+
+			SKB_FRAG_ASSERT(nskb);
+			skb_shinfo(nskb)->frag_list = fskb2;
+		}
+
+skip_fraglist:
+		nskb->data_len = len - hsize;
+		nskb->len += nskb->data_len;
+		nskb->truesize += nskb->data_len;
+	} while ((offset += len) < skb->len);
+
+	return segs;
+
+err:
+	while ((skb = segs)) {	/* free every segment built so far */
+		segs = skb->next;
+		kfree_skb(skb);
+	}
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(skb_segment);
+
+int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{	/* merges skb's payload into *head; returns 0 or a negative errno */
+	struct sk_buff *p = *head;
+	struct sk_buff *nskb;
+	struct skb_shared_info *skbinfo = skb_shinfo(skb);
+	struct skb_shared_info *pinfo = skb_shinfo(p);
+	unsigned int headroom;
+	unsigned int len = skb_gro_len(skb);
+	unsigned int offset = skb_gro_offset(skb);
+	unsigned int headlen = skb_headlen(skb);
+
+	if (p->len + len >= 65536)
+		return -E2BIG;	/* merged length would reach 64KiB */
+
+	if (pinfo->frag_list)
+		goto merge;
+	else if (headlen <= offset) {
+		skb_frag_t *frag;
+		skb_frag_t *frag2;
+		int i = skbinfo->nr_frags;
+		int nr_frags = pinfo->nr_frags + i;
+
+		offset -= headlen;
+
+		if (nr_frags > MAX_SKB_FRAGS)
+			return -E2BIG;
+
+		pinfo->nr_frags = nr_frags;
+		skbinfo->nr_frags = 0;
+
+		frag = pinfo->frags + nr_frags;
+		frag2 = skbinfo->frags + i;
+		do {
+			*--frag = *--frag2;
+		} while (--i);
+
+		frag->page_offset += offset;
+		frag->size -= offset;
+
+		skb->truesize -= skb->data_len;
+		skb->len -= skb->data_len;
+		skb->data_len = 0;
+
+		NAPI_GRO_CB(skb)->free = 1;	/* frags now owned by p */
+		goto done;
+	} else if (skb_gro_len(p) != pinfo->gso_size)
+		return -E2BIG;
+
+	headroom = skb_headroom(p);
+	nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
+	if (unlikely(!nskb))
+		return -ENOMEM;
+
+	__copy_skb_header(nskb, p);
+	nskb->mac_len = p->mac_len;
+
+	skb_reserve(nskb, headroom);
+	__skb_put(nskb, skb_gro_offset(p));
+
+	skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
+	skb_set_network_header(nskb, skb_network_offset(p));
+	skb_set_transport_header(nskb, skb_transport_offset(p));
+
+	__skb_pull(p, skb_gro_offset(p));
+	memcpy(skb_mac_header(nskb), skb_mac_header(p),
+	       p->data - skb_mac_header(p));
+
+	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
+	skb_shinfo(nskb)->frag_list = p;
+	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
+	pinfo->gso_size = 0;
+	skb_header_release(p);
+	nskb->prev = p;
+
+	nskb->data_len += p->len;
+	nskb->truesize += p->len;
+	nskb->len += p->len;
+
+	*head = nskb;	/* nskb takes p's place in the list */
+	nskb->next = p->next;
+	p->next = NULL;
+
+	p = nskb;
+
+merge:
+	if (offset > headlen) {
+		unsigned int eat = offset - headlen;
+
+		skbinfo->frags[0].page_offset += eat;
+		skbinfo->frags[0].size -= eat;
+		skb->data_len -= eat;
+		skb->len -= eat;
+		offset = headlen;
+	}
+
+	__skb_pull(skb, offset);
+
+	p->prev->next = skb;
+	p->prev = skb;
+	skb_header_release(skb);
+
+done:
+	NAPI_GRO_CB(p)->count++;
+	p->data_len += len;
+	p->truesize += len;
+	p->len += len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_gro_receive);
+
+void __init skb_init(void)
+{	/* create the skbuff_head_cache and skbuff_fclone_cache slab caches */
+	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+					      sizeof(struct sk_buff),
+					      0,
+					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					      NULL);
+	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+						(2*sizeof(struct sk_buff)) +
+						sizeof(atomic_t),
+						0,
+						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+						NULL);
+}
+
+/**
+ *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ *	@skb: Socket buffer containing the buffers to be mapped
+ *	@sg: The scatter-gather list to map into
+ *	@offset: The offset into the buffer's contents to start mapping
+ *	@len: Length of buffer space to be mapped
+ *
+ *	Fill the specified scatter-gather list with mappings/pointers into a
+ *	region of the skb's buffer space; returns the number of entries used.
+ */
+static int
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+	int elt = 0;
+
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		sg_set_buf(sg, skb->data + offset, copy);
+		elt++;
+		if ((len -= copy) == 0)
+			return elt;
+		offset += copy;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+			if (copy > len)
+				copy = len;
+			sg_set_page(&sg[elt], frag->page, copy,
+					frag->page_offset+offset-start);
+			elt++;
+			if (!(len -= copy))
+				return elt;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+					      copy);
+			if ((len -= copy) == 0)
+				return elt;
+			offset += copy;
+		}
+		start = end;
+	}
+	BUG_ON(len);	/* region extends past the skb's data */
+	return elt;
+}
+
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{	/* maps via __skb_to_sgvec(), then terminates the list; returns nsg */
+	int nsg = __skb_to_sgvec(skb, sg, offset, len);
+
+	if (nsg > 0)	/* nothing mapped means there is no sg[nsg - 1] */
+		sg_mark_end(&sg[nsg - 1]);
+	return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
+/**
+ *	skb_cow_data - Check that a socket buffer's data buffers are writable
+ *	@skb: The socket buffer to check.
+ *	@tailbits: Amount of trailing space to be added
+ *	@trailer: Returned pointer to the skb where the @tailbits space begins
+ *
+ *	Make sure that the data buffers attached to a socket buffer are
+ *	writable. If they are not, private copies are made of the data buffers
+ *	and the socket buffer is set to use these instead.
+ *
+ *	If @tailbits is given, make sure that there is space to write @tailbits
+ *	bytes of data beyond current end of socket buffer.  @trailer will be
+ *	set to point to the skb in which this space begins.
+ *
+ *	Returns the number of scatterlist elements required to completely
+ *	map the COW'd and extended socket buffer, or -ENOMEM on failure.
+ */
+int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
+{
+	int copyflag;
+	int elt;
+	struct sk_buff *skb1, **skb_p;
+
+	/* If skb is cloned or its head is paged, reallocate
+	 * head pulling out all the pages (pages are considered not writable
+	 * at the moment even if they are anonymous).
+	 */
+	if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
+	    __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+		return -ENOMEM;
+
+	/* Easy case. Most of packets will go this way. */
+	if (!skb_has_frag_list(skb)) {
+		/* A little trouble: not enough space for the trailer.
+		 * This should not happen, when stack is tuned to generate
+		 * good frames. OK, on miss we reallocate and reserve even more
+		 * space, 128 bytes is fair. */
+
+		if (skb_tailroom(skb) < tailbits &&
+		    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
+			return -ENOMEM;
+
+		/* Voila! */
+		*trailer = skb;
+		return 1;
+	}
+
+	/* Hard case: walk the frag_list, copying fragments where needed */
+
+	elt = 1;
+	skb_p = &skb_shinfo(skb)->frag_list;
+	copyflag = 0;
+
+	while ((skb1 = *skb_p) != NULL) {
+		int ntail = 0;
+
+		/* The fragment is partially pulled by someone,
+		 * this can happen on input. Copy it and everything
+		 * after it. */
+
+		if (skb_shared(skb1))
+			copyflag = 1;
+
+		/* If the skb is the last, worry about trailer. */
+
+		if (skb1->next == NULL && tailbits) {
+			if (skb_shinfo(skb1)->nr_frags ||
+			    skb_has_frag_list(skb1) ||
+			    skb_tailroom(skb1) < tailbits)
+				ntail = tailbits + 128;
+		}
+
+		if (copyflag ||
+		    skb_cloned(skb1) ||
+		    ntail ||
+		    skb_shinfo(skb1)->nr_frags ||
+		    skb_has_frag_list(skb1)) {
+			struct sk_buff *skb2;
+
+			/* skb1 cannot be used as is: replace it with a copy */
+			if (ntail == 0)
+				skb2 = skb_copy(skb1, GFP_ATOMIC);
+			else
+				skb2 = skb_copy_expand(skb1,
+						       skb_headroom(skb1),
+						       ntail,
+						       GFP_ATOMIC);
+			if (unlikely(skb2 == NULL))
+				return -ENOMEM;
+
+			if (skb1->sk)
+				skb_set_owner_w(skb2, skb1->sk);
+
+			/* Looking around. Are we still alive?
+			 * OK, link new skb, drop old one */
+
+			skb2->next = skb1->next;
+			*skb_p = skb2;
+			kfree_skb(skb1);
+			skb1 = skb2;
+		}
+		elt++;
+		*trailer = skb1;
+		skb_p = &skb1->next;
+	}
+
+	return elt;
+}
+EXPORT_SYMBOL_GPL(skb_cow_data);
+
+static void sock_rmem_free(struct sk_buff *skb)
+{	/* skb destructor: takes skb->truesize back off sk->sk_rmem_alloc */
+	struct sock *sk = skb->sk;
+
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+/* Queue skb on sk->sk_error_queue; returns 0, or -ENOMEM if over sk_rcvbuf.
+ * Note: We don't mem charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int len = skb->len;
+
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = sock_rmem_free;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+	/* before exiting rcu section, make sure dst is refcounted */
+	skb_dst_force(skb);
+
+	skb_queue_tail(&sk->sk_error_queue, skb);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, len);
+	return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+		struct skb_shared_hwtstamps *hwtstamps)
+{	/* queues a timestamped clone of orig_skb on its socket's error queue */
+	struct sock *sk = orig_skb->sk;
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb;
+	int err;
+
+	if (!sk)
+		return;
+
+	skb = skb_clone(orig_skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	if (hwtstamps) {
+		*skb_hwtstamps(skb) =
+			*hwtstamps;
+	} else {
+		/*
+		 * no hardware time stamps available,
+		 * so keep the shared tx_flags and only
+		 * store software time stamp
+		 */
+		skb->tstamp = ktime_get_real();
+	}
+
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = ENOMSG;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+
+	err = sock_queue_err_skb(sk, skb);
+
+	if (err)
+		kfree_skb(skb);	/* clone was not queued; drop it */
+}
+EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+
+
+/**
+ * skb_partial_csum_set - set up and verify partial csum values for packet
+ * @skb: the skb to set
+ * @start: the number of bytes after skb->data to start checksumming.
+ * @off: the offset from start to place the checksum.
+ *
+ * For untrusted partially-checksummed packets, we need to make sure the values
+ * for skb->csum_start and skb->csum_offset are valid so we don't oops: the
+ * 2-byte checksum field at @start + @off must lie inside the linear head.
+ * This function checks and sets those values and skb->ip_summed: if this
+ * returns false you should drop the packet.
+ */
+bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
+{	/* compare start + off + 2, as headlen - 2 wraps for headlen < 2 */
+	if (unlikely(start > skb_headlen(skb)) ||
+	    unlikely((u32)start + off + 2 > skb_headlen(skb))) {
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "bad partial csum: csum=%u/%u len=%u\n",
+			       start, off, skb_headlen(skb));
+		return false;
+	}
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_headroom(skb) + start;
+	skb->csum_offset = off;
+	return true;
+}
+EXPORT_SYMBOL_GPL(skb_partial_csum_set);
+
+void __skb_warn_lro_forwarding(const struct sk_buff *skb)
+{	/* rate-limited warning that names the device in skb->dev */
+	if (net_ratelimit())
+		pr_warning("%s: received packets cannot be forwarded"
+			   " while LRO is enabled\n", skb->dev->name);
+}
+EXPORT_SYMBOL(__skb_warn_lro_forwarding);
diff --git a/net/core/sock.c b/net/core/sock.c
new file mode 100644
index 00000000..b4bb59a9
--- /dev/null
+++ b/net/core/sock.c
@@ -0,0 +1,2579 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic socket support routines. Memory allocators, socket lock/release
+ *		handler for protocols to use and generic option handler.
+ *
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Fixes:
+ *		Alan Cox	: 	Numerous verify_area() problems
+ *		Alan Cox	:	Connecting on a connecting socket
+ *					now returns an error for tcp.
+ *		Alan Cox	:	sock->protocol is set correctly.
+ *					and is not sometimes left as 0.
+ *		Alan Cox	:	connect handles icmp errors on a
+ *					connect properly. Unfortunately there
+ *					is a restart syscall nasty there. I
+ *					can't match BSD without hacking the C
+ *					library. Ideas urgently sought!
+ *		Alan Cox	:	Disallow bind() to addresses that are
+ *					not ours - especially broadcast ones!!
+ *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
+ *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
+ *					instead they leave that for the DESTROY timer.
+ *		Alan Cox	:	Clean up error flag in accept
+ *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
+ *					was buggy. Put a remove_sock() in the handler
+ *					for memory when we hit 0. Also altered the timer
+ *					code. The ACK stuff can wait and needs major
+ *					TCP layer surgery.
+ *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
+ *					and fixed timer/inet_bh race.
+ *		Alan Cox	:	Added zapped flag for TCP
+ *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
+ *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
+ *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
+ *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
+ *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
+ *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
+ *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
+ *	Pauline Middelink	:	identd support
+ *		Alan Cox	:	Fixed connect() taking signals I think.
+ *		Alan Cox	:	SO_LINGER supported
+ *		Alan Cox	:	Error reporting fixes
+ *		Anonymous	:	inet_create tidied up (sk->reuse setting)
+ *		Alan Cox	:	inet sockets don't set sk->type!
+ *		Alan Cox	:	Split socket option code
+ *		Alan Cox	:	Callbacks
+ *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
+ *		Alex		:	Removed restriction on inet fioctl
+ *		Alan Cox	:	Splitting INET from NET core
+ *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
+ *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
+ *		Alan Cox	:	Split IP from generic code
+ *		Alan Cox	:	New kfree_skbmem()
+ *		Alan Cox	:	Make SO_DEBUG superuser only.
+ *		Alan Cox	:	Allow anyone to clear SO_DEBUG
+ *					(compatibility fix)
+ *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
+ *		Alan Cox	:	Allocator for a socket is settable.
+ *		Alan Cox	:	SO_ERROR includes soft errors.
+ *		Alan Cox	:	Allow NULL arguments on some SO_ opts
+ *		Alan Cox	: 	Generic socket allocation to make hooks
+ *					easier (suggested by Craig Metz).
+ *		Michael Pall	:	SO_ERROR returns positive errno again
+ *              Steve Whitehouse:       Added default destructor to free
+ *                                      protocol private data.
+ *              Steve Whitehouse:       Added various other default routines
+ *                                      common to several socket families.
+ *              Chris Evans     :       Call suser() check last on F_SETOWN
+ *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
+ *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
+ *		Andi Kleen	:	Fix write_space callback
+ *		Chris Evans	:	Security fixes - signedness again
+ *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
+ *
+ * To Fix:
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/poll.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/user_namespace.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/netdevice.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
+#include <net/sock.h>
+#include <linux/net_tstamp.h>
+#include <net/xfrm.h>
+#include <linux/ipsec.h>
+#include <net/cls_cgroup.h>
+
+#include <linux/filter.h>
+
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+#endif
+
+/*
+ * Each address family might have different locking rules, so we have
+ * one sk_lock key and one slock key per address family:
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+
+/*
+ * Make lock validator output more readable. (We pre-construct these
+ * strings at build time, so that runtime initialization of socket
+ * locks is fast.) Each table is indexed by address family number:
+ */
+static const char *const af_family_key_strings[AF_MAX+1] = {
+  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
+  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
+  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
+  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
+  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
+  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
+  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
+  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
+  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
+  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
+  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
+  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
+  "sk_lock-AF_MAX"
+};
+static const char *const af_family_slock_key_strings[AF_MAX+1] = {
+  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
+  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
+  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
+  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
+  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
+  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
+  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
+  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
+  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
+  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
+  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
+  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
+  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
+  "slock-AF_MAX"
+};
+static const char *const af_family_clock_key_strings[AF_MAX+1] = {
+  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
+  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
+  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
+  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
+  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
+  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
+  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
+  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
+  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
+  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
+  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
+  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
+  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
+  "clock-AF_MAX"
+};
+
+/*
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+
+/* Take into consideration the size of the struct sk_buff overhead in the
+ * determination of these values, since that is non-constant across
+ * platforms.  This makes socket queueing behavior and performance
+ * not depend upon such differences.
+ */
+#define _SK_MEM_PACKETS		256
+#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)	/* per packet */
+#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+
+/* Run time adjustable parameters; all four start at the maxima above. */
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+
+/* Maximal space eaten by iovec or ancillary data plus some space */
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+EXPORT_SYMBOL(sysctl_optmem_max);
+
+#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+int net_cls_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_cls_subsys_id);
+#endif /* CONFIG_CGROUPS && !CONFIG_NET_CLS_CGROUP */
+
+static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+{	/* converts the user's struct timeval into HZ ticks stored in *timeo_p */
+	struct timeval tv;
+
+	if (optlen < sizeof(tv))
+		return -EINVAL;
+	if (copy_from_user(&tv, optval, sizeof(tv)))
+		return -EFAULT;
+	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+		return -EDOM;	/* tv_usec out of range */
+
+	if (tv.tv_sec < 0) {
+		static int warned __read_mostly;
+
+		*timeo_p = 0;	/* negative timeout is stored as zero */
+		if (warned < 10 && net_ratelimit()) {
+			warned++;
+			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
+			       "tries to set negative timeout\n",
+				current->comm, task_pid_nr(current));
+		}
+		return 0;
+	}
+	*timeo_p = MAX_SCHEDULE_TIMEOUT;	/* kept for 0/0 or huge tv */
+	if (tv.tv_sec == 0 && tv.tv_usec == 0)
+		return 0;
+	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
+		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+	return 0;
+}
+
+static void sock_warn_obsolete_bsdism(const char *name)
+{	/* warns at most 5 times, and not twice in a row for the same comm */
+	static int warned;
+	static char warncomm[TASK_COMM_LEN];
+	if (strcmp(warncomm, current->comm) && warned < 5) {
+		strcpy(warncomm,  current->comm);
+		printk(KERN_WARNING "process `%s' is using obsolete "
+		       "%s SO_BSDCOMPAT\n", warncomm, name);
+		warned++;
+	}
+}
+
+/*
+ * Clear timestamp flag @flag on @sk if it is set.  When neither
+ * SOCK_TIMESTAMP nor SOCK_TIMESTAMPING_RX_SOFTWARE remains set
+ * afterwards, net_disable_timestamp() is called as well.
+ */
+static void sock_disable_timestamp(struct sock *sk, int flag)
+{
+	if (!sock_flag(sk, flag))
+		return;
+
+	sock_reset_flag(sk, flag);
+
+	/* Another timestamp flag on this socket is still set. */
+	if (sock_flag(sk, SOCK_TIMESTAMP) ||
+	    sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+		return;
+
+	net_disable_timestamp();
+}
+
+
+/**
+ *	sock_queue_rcv_skb - charge an skb to a socket and queue it for reading
+ *	@sk: receiving socket
+ *	@skb: buffer to append to sk->sk_receive_queue
+ *
+ *	Returns 0 once the skb is queued, -ENOMEM when the receive buffer
+ *	limit would be reached, the sk_filter() error when the filter
+ *	rejects the packet, or -ENOBUFS when sk_rmem_schedule() fails.
+ *	sk->sk_drops is incremented on the -ENOMEM and -ENOBUFS paths only.
+ *	The skb is not freed by this function on failure.
+ */
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int err;
+	int skb_len;
+	unsigned long flags;
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+
+	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
+	   number of warnings when compiling with -W --ANK
+	 */
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf) {
+		atomic_inc(&sk->sk_drops);
+		return -ENOMEM;
+	}
+
+	err = sk_filter(sk, skb);
+	if (err)
+		return err;
+
+	if (!sk_rmem_schedule(sk, skb->truesize)) {
+		atomic_inc(&sk->sk_drops);
+		return -ENOBUFS;
+	}
+
+	skb->dev = NULL;
+	skb_set_owner_r(skb, sk);
+
+	/* Cache the SKB length before we tack it onto the receive
+	 * queue.  Once it is added it no longer belongs to us and
+	 * may be freed by other threads of control pulling packets
+	 * from the queue.
+	 */
+	skb_len = skb->len;
+
+	/* we escape from rcu protected region, make sure we dont leak
+	 * a norefcounted dst
+	 */
+	skb_dst_force(skb);
+
+	/* The drop counter is sampled under the queue lock, together with
+	 * the enqueue itself.
+	 */
+	spin_lock_irqsave(&list->lock, flags);
+	skb->dropcount = atomic_read(&sk->sk_drops);
+	__skb_queue_tail(list, skb);
+	spin_unlock_irqrestore(&list->lock, flags);
+
+	/* Notify the reader unless the socket is already marked dead. */
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb_len);
+	return 0;
+}
+EXPORT_SYMBOL(sock_queue_rcv_skb);
+
+/**
+ *	sk_receive_skb - filter an skb and hand it to the socket's backlog handler
+ *	@sk: destination socket; sock_put() is called on it on every path
+ *	@skb: buffer being delivered
+ *	@nested: non-zero to take the socket spinlock with bh_lock_sock_nested()
+ *
+ *	If the socket is not owned by a user context the skb is processed
+ *	immediately through sk_backlog_rcv(); otherwise it is appended to
+ *	the backlog.  An skb that is filtered out, that finds the receive
+ *	queues full, or that cannot be added to the backlog is freed with
+ *	kfree_skb().
+ *
+ *	Returns the sk_backlog_rcv() result when processed directly and
+ *	NET_RX_SUCCESS otherwise, including on the discard paths.
+ */
+int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+{
+	int rc = NET_RX_SUCCESS;
+
+	if (sk_filter(sk, skb))
+		goto discard_and_relse;
+
+	skb->dev = NULL;
+
+	if (sk_rcvqueues_full(sk, skb)) {
+		atomic_inc(&sk->sk_drops);
+		goto discard_and_relse;
+	}
+	if (nested)
+		bh_lock_sock_nested(sk);
+	else
+		bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		/*
+		 * trylock + unlock semantics:
+		 */
+		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+		rc = sk_backlog_rcv(sk, skb);
+
+		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+	} else if (sk_add_backlog(sk, skb)) {
+		/* Backlog refused the skb: unlock before taking the drop path. */
+		bh_unlock_sock(sk);
+		atomic_inc(&sk->sk_drops);
+		goto discard_and_relse;
+	}
+
+	bh_unlock_sock(sk);
+out:
+	sock_put(sk);
+	return rc;
+discard_and_relse:
+	kfree_skb(skb);
+	goto out;
+}
+EXPORT_SYMBOL(sk_receive_skb);
+
+/* Exported out-of-line wrapper around sk_tx_queue_clear() for @sk. */
+void sk_reset_txq(struct sock *sk)
+{
+	sk_tx_queue_clear(sk);
+}
+EXPORT_SYMBOL(sk_reset_txq);
+
+/*
+ * Return the socket's cached route as obtained by __sk_dst_get(), or
+ * NULL if there is none or it is no longer valid.
+ *
+ * A route marked obsolete is revalidated through dst->ops->check()
+ * with @cookie.  If that fails the tx queue mapping is cleared, the
+ * cache pointer is reset, the route is released and NULL is returned.
+ */
+struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	/* Nothing cached, or the entry is not flagged obsolete. */
+	if (!dst || !dst->obsolete)
+		return dst;
+
+	/* Obsolete, but the protocol still accepts it. */
+	if (dst->ops->check(dst, cookie) != NULL)
+		return dst;
+
+	sk_tx_queue_clear(sk);
+	rcu_assign_pointer(sk->sk_dst_cache, NULL);
+	dst_release(dst);
+	return NULL;
+}
+EXPORT_SYMBOL(__sk_dst_check);
+
+/*
+ * Return the route obtained from sk_dst_get(), or NULL if there is none
+ * or it is no longer valid.
+ *
+ * A route marked obsolete is revalidated through dst->ops->check()
+ * with @cookie.  If that fails the socket's cache is reset with
+ * sk_dst_reset(), the route obtained here is released and NULL is
+ * returned.
+ */
+struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+
+	/* Nothing cached, or the entry is not flagged obsolete. */
+	if (!dst || !dst->obsolete)
+		return dst;
+
+	/* Obsolete, but the protocol still accepts it. */
+	if (dst->ops->check(dst, cookie) != NULL)
+		return dst;
+
+	sk_dst_reset(sk);
+	dst_release(dst);
+	return NULL;
+}
+EXPORT_SYMBOL(sk_dst_check);
+
+/*
+ * Handle SO_BINDTODEVICE: bind @sk to the interface named by the user
+ * buffer @optval of length @optlen, or unbind it when the name is empty.
+ *
+ * Returns 0 on success, or:
+ *   -ENOPROTOOPT  when built without CONFIG_NETDEVICES
+ *   -EPERM        when the caller lacks CAP_NET_RAW
+ *   -EINVAL       when optlen is negative
+ *   -EFAULT       when the name cannot be copied from user space
+ *   -ENODEV       when no device with that name exists in the socket's
+ *                 network namespace
+ */
+static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+{
+	int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+	struct net *net = sock_net(sk);
+	char devname[IFNAMSIZ];
+	int index;
+
+	/* Sorry... */
+	ret = -EPERM;
+	if (!capable(CAP_NET_RAW))
+		goto out;
+
+	ret = -EINVAL;
+	if (optlen < 0)
+		goto out;
+
+	/* Bind this socket to a particular device like "eth0",
+	 * as specified in the passed interface name. If the
+	 * name is "" or the option length is zero the socket
+	 * is not bound.
+	 */
+	/* At most IFNAMSIZ - 1 bytes are copied into a zeroed buffer, so
+	 * devname is always NUL-terminated.
+	 */
+	if (optlen > IFNAMSIZ - 1)
+		optlen = IFNAMSIZ - 1;
+	memset(devname, 0, sizeof(devname));
+
+	ret = -EFAULT;
+	if (copy_from_user(devname, optval, optlen))
+		goto out;
+
+	/* Index 0 is what gets stored for an empty name (not bound). */
+	index = 0;
+	if (devname[0] != '\0') {
+		struct net_device *dev;
+
+		/* Only the ifindex is read inside the RCU section; dev is
+		 * merely tested for NULL afterwards, never dereferenced.
+		 */
+		rcu_read_lock();
+		dev = dev_get_by_name_rcu(net, devname);
+		if (dev)
+			index = dev->ifindex;
+		rcu_read_unlock();
+		ret = -ENODEV;
+		if (!dev)
+			goto out;
+	}
+
+	/* Changing the bound device also drops the cached route. */
+	lock_sock(sk);
+	sk->sk_bound_dev_if = index;
+	sk_dst_reset(sk);
+	release_sock(sk);
+
+	ret = 0;
+
+out:
+#endif
+
+	return ret;
+}
+
+/* Set socket flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+	if (!valbool) {
+		sock_reset_flag(sk, bit);
+		return;
+	}
+	sock_set_flag(sk, bit);
+}
+
+/*
+ *	This is meant for all protocols to use and covers goings on
+ *	at the socket level. Everything here is generic.
+ *
+ *	@sock:    socket being configured
+ *	@level:   option level (not examined in this function)
+ *	@optname: SO_* option to set
+ *	@optval:  user-space pointer to the option value
+ *	@optlen:  size of the buffer at @optval
+ *
+ *	Returns 0 on success or a negative errno.  Unknown options and the
+ *	read-only ones (SO_TYPE, SO_PROTOCOL, SO_DOMAIN, SO_ERROR) yield
+ *	-ENOPROTOOPT.
+ */
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int val;
+	int valbool;
+	struct linger ling;
+	int ret = 0;
+
+	/*
+	 *	Options without arguments
+	 */
+
+	/* SO_BINDTODEVICE takes a string and does its own locking, so it
+	 * is dispatched before the generic integer argument is read.
+	 */
+	if (optname == SO_BINDTODEVICE)
+		return sock_bindtodevice(sk, optval, optlen);
+
+	/* Every remaining option supplies at least an int.  Options with a
+	 * larger argument (SO_LINGER, the timeouts, SO_ATTACH_FILTER)
+	 * read optval again with their own type below.
+	 */
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	valbool = val ? 1 : 0;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case SO_DEBUG:
+		/* Enabling needs CAP_NET_ADMIN; disabling does not. */
+		if (val && !capable(CAP_NET_ADMIN))
+			ret = -EACCES;
+		else
+			sock_valbool_flag(sk, SOCK_DBG, valbool);
+		break;
+	case SO_REUSEADDR:
+		sk->sk_reuse = valbool;
+		break;
+	case SO_TYPE:
+	case SO_PROTOCOL:
+	case SO_DOMAIN:
+	case SO_ERROR:
+		ret = -ENOPROTOOPT;
+		break;
+	case SO_DONTROUTE:
+		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+		break;
+	case SO_BROADCAST:
+		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+		break;
+	case SO_SNDBUF:
+		/* Don't error on this BSD doesn't and if you think
+		   about it this is right. Otherwise apps have to
+		   play 'guess the biggest size' games. RCVBUF/SNDBUF
+		   are treated in BSD as hints */
+
+		if (val > sysctl_wmem_max)
+			val = sysctl_wmem_max;
+set_sndbuf:
+		/* Twice the requested value is stored, with SOCK_MIN_SNDBUF
+		 * as the floor; SOCK_SNDBUF_LOCK records the explicit request.
+		 */
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+		if ((val * 2) < SOCK_MIN_SNDBUF)
+			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
+		else
+			sk->sk_sndbuf = val * 2;
+
+		/*
+		 *	Wake up sending tasks if we
+		 *	upped the value.
+		 */
+		sk->sk_write_space(sk);
+		break;
+
+	case SO_SNDBUFFORCE:
+		/* Privileged variant: jumps past the sysctl_wmem_max clamp. */
+		if (!capable(CAP_NET_ADMIN)) {
+			ret = -EPERM;
+			break;
+		}
+		goto set_sndbuf;
+
+	case SO_RCVBUF:
+		/* Don't error on this BSD doesn't and if you think
+		   about it this is right. Otherwise apps have to
+		   play 'guess the biggest size' games. RCVBUF/SNDBUF
+		   are treated in BSD as hints */
+
+		if (val > sysctl_rmem_max)
+			val = sysctl_rmem_max;
+set_rcvbuf:
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+		/*
+		 * We double it on the way in to account for
+		 * "struct sk_buff" etc. overhead.   Applications
+		 * assume that the SO_RCVBUF setting they make will
+		 * allow that much actual data to be received on that
+		 * socket.
+		 *
+		 * Applications are unaware that "struct sk_buff" and
+		 * other overheads allocate from the receive buffer
+		 * during socket buffer allocation.
+		 *
+		 * And after considering the possible alternatives,
+		 * returning the value we actually used in getsockopt
+		 * is the most desirable behavior.
+		 */
+		if ((val * 2) < SOCK_MIN_RCVBUF)
+			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
+		else
+			sk->sk_rcvbuf = val * 2;
+		break;
+
+	case SO_RCVBUFFORCE:
+		/* Privileged variant: jumps past the sysctl_rmem_max clamp. */
+		if (!capable(CAP_NET_ADMIN)) {
+			ret = -EPERM;
+			break;
+		}
+		goto set_rcvbuf;
+
+	case SO_KEEPALIVE:
+#ifdef CONFIG_INET
+		if (sk->sk_protocol == IPPROTO_TCP)
+			tcp_set_keepalive(sk, valbool);
+#endif
+		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+		break;
+
+	case SO_OOBINLINE:
+		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+		break;
+
+	case SO_NO_CHECK:
+		sk->sk_no_check = valbool;
+		break;
+
+	case SO_PRIORITY:
+		/* Values 0..6 are open to everyone; anything else needs
+		 * CAP_NET_ADMIN.
+		 */
+		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+			sk->sk_priority = val;
+		else
+			ret = -EPERM;
+		break;
+
+	case SO_LINGER:
+		if (optlen < sizeof(ling)) {
+			ret = -EINVAL;	/* 1003.1g */
+			break;
+		}
+		if (copy_from_user(&ling, optval, sizeof(ling))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (!ling.l_onoff)
+			sock_reset_flag(sk, SOCK_LINGER);
+		else {
+			/* On 32-bit builds a linger time too large to scale
+			 * by HZ is stored as MAX_SCHEDULE_TIMEOUT instead.
+			 */
+#if (BITS_PER_LONG == 32)
+			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+			else
+#endif
+				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+			sock_set_flag(sk, SOCK_LINGER);
+		}
+		break;
+
+	case SO_BSDCOMPAT:
+		/* Accepted but ignored, apart from the warning. */
+		sock_warn_obsolete_bsdism("setsockopt");
+		break;
+
+	case SO_PASSCRED:
+		if (valbool)
+			set_bit(SOCK_PASSCRED, &sock->flags);
+		else
+			clear_bit(SOCK_PASSCRED, &sock->flags);
+		break;
+
+	case SO_TIMESTAMP:
+	case SO_TIMESTAMPNS:
+		/* SOCK_RCVTSTAMP marks timestamps as requested;
+		 * SOCK_RCVTSTAMPNS is set only for SO_TIMESTAMPNS.
+		 */
+		if (valbool)  {
+			if (optname == SO_TIMESTAMP)
+				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+			else
+				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+			sock_set_flag(sk, SOCK_RCVTSTAMP);
+			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+		} else {
+			sock_reset_flag(sk, SOCK_RCVTSTAMP);
+			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+		}
+		break;
+
+	case SO_TIMESTAMPING:
+		/* Reject bits outside the mask, then mirror each SOF_* bit
+		 * into the matching socket flag.
+		 */
+		if (val & ~SOF_TIMESTAMPING_MASK) {
+			ret = -EINVAL;
+			break;
+		}
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
+				  val & SOF_TIMESTAMPING_TX_HARDWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
+				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
+				  val & SOF_TIMESTAMPING_RX_HARDWARE);
+		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+			sock_enable_timestamp(sk,
+					      SOCK_TIMESTAMPING_RX_SOFTWARE);
+		else
+			sock_disable_timestamp(sk,
+					       SOCK_TIMESTAMPING_RX_SOFTWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
+				  val & SOF_TIMESTAMPING_SOFTWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
+				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
+				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
+		break;
+
+	case SO_RCVLOWAT:
+		/* Negative becomes INT_MAX; zero is stored as 1. */
+		if (val < 0)
+			val = INT_MAX;
+		sk->sk_rcvlowat = val ? : 1;
+		break;
+
+	case SO_RCVTIMEO:
+		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+		break;
+
+	case SO_SNDTIMEO:
+		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+		break;
+
+	case SO_ATTACH_FILTER:
+		/* optlen must match struct sock_fprog exactly. */
+		ret = -EINVAL;
+		if (optlen == sizeof(struct sock_fprog)) {
+			struct sock_fprog fprog;
+
+			ret = -EFAULT;
+			if (copy_from_user(&fprog, optval, sizeof(fprog)))
+				break;
+
+			ret = sk_attach_filter(&fprog, sk);
+		}
+		break;
+
+	case SO_DETACH_FILTER:
+		ret = sk_detach_filter(sk);
+		break;
+
+	case SO_PASSSEC:
+		if (valbool)
+			set_bit(SOCK_PASSSEC, &sock->flags);
+		else
+			clear_bit(SOCK_PASSSEC, &sock->flags);
+		break;
+	case SO_MARK:
+		if (!capable(CAP_NET_ADMIN))
+			ret = -EPERM;
+		else
+			sk->sk_mark = val;
+		break;
+
+		/* We implement the SO_SNDLOWAT etc to
+		   not be settable (1003.1g 5.3) */
+	case SO_RXQ_OVFL:
+		if (valbool)
+			sock_set_flag(sk, SOCK_RXQ_OVFL);
+		else
+			sock_reset_flag(sk, SOCK_RXQ_OVFL);
+		break;
+	default:
+		ret = -ENOPROTOOPT;
+		break;
+	}
+	release_sock(sk);
+	return ret;
+}
+EXPORT_SYMBOL(sock_setsockopt);
+
+
+/*
+ * Fill @ucred from @pid and @cred.
+ *
+ * The pid is translated with pid_vnr().  uid and gid are set to -1
+ * when @cred is NULL; otherwise they are the effective ids of @cred
+ * mapped into the current task's user namespace.
+ */
+void cred_to_ucred(struct pid *pid, const struct cred *cred,
+		   struct ucred *ucred)
+{
+	struct user_namespace *current_ns;
+
+	ucred->pid = pid_vnr(pid);
+	ucred->uid = ucred->gid = -1;
+	if (!cred)
+		return;
+
+	current_ns = current_user_ns();
+	ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
+	ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
+}
+EXPORT_SYMBOL_GPL(cred_to_ucred);
+
+/*
+ *	Read a generic socket-level option.
+ *
+ *	@sock:    socket being queried
+ *	@level:   option level (not examined in this function)
+ *	@optname: SO_* option to read
+ *	@optval:  user-space buffer receiving the value
+ *	@optlen:  user-space in/out length: buffer size on entry, number
+ *		  of bytes written on return
+ *
+ *	Most options are staged in the zeroed union v and copied out at the
+ *	end, truncated to the smaller of the caller's length and the option
+ *	size lv.  SO_PEERCRED and SO_PEERNAME copy their own data and jump
+ *	straight to the length update; SO_PEERSEC is delegated entirely.
+ *
+ *	Returns 0 on success, -EFAULT on a failed user access, -EINVAL for
+ *	a negative length, -ENOPROTOOPT for an unknown option, or whatever
+ *	the delegated helper returns.
+ */
+int sock_getsockopt(struct socket *sock, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+
+	union {
+		int val;
+		struct linger ling;
+		struct timeval tm;
+	} v;
+
+	/* lv is the size of the value held in v; int unless a case says so. */
+	int lv = sizeof(int);
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	/* Clear the whole union so no stale stack bytes can be copied out. */
+	memset(&v, 0, sizeof(v));
+
+	switch (optname) {
+	case SO_DEBUG:
+		v.val = sock_flag(sk, SOCK_DBG);
+		break;
+
+	case SO_DONTROUTE:
+		v.val = sock_flag(sk, SOCK_LOCALROUTE);
+		break;
+
+	case SO_BROADCAST:
+		v.val = !!sock_flag(sk, SOCK_BROADCAST);
+		break;
+
+	case SO_SNDBUF:
+		v.val = sk->sk_sndbuf;
+		break;
+
+	case SO_RCVBUF:
+		v.val = sk->sk_rcvbuf;
+		break;
+
+	case SO_REUSEADDR:
+		v.val = sk->sk_reuse;
+		break;
+
+	case SO_KEEPALIVE:
+		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
+		break;
+
+	case SO_TYPE:
+		v.val = sk->sk_type;
+		break;
+
+	case SO_PROTOCOL:
+		v.val = sk->sk_protocol;
+		break;
+
+	case SO_DOMAIN:
+		v.val = sk->sk_family;
+		break;
+
+	case SO_ERROR:
+		/* Report the pending error; if there is none, fetch and
+		 * clear the soft error instead.
+		 */
+		v.val = -sock_error(sk);
+		if (v.val == 0)
+			v.val = xchg(&sk->sk_err_soft, 0);
+		break;
+
+	case SO_OOBINLINE:
+		v.val = !!sock_flag(sk, SOCK_URGINLINE);
+		break;
+
+	case SO_NO_CHECK:
+		v.val = sk->sk_no_check;
+		break;
+
+	case SO_PRIORITY:
+		v.val = sk->sk_priority;
+		break;
+
+	case SO_LINGER:
+		lv		= sizeof(v.ling);
+		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
+		v.ling.l_linger	= sk->sk_lingertime / HZ;
+		break;
+
+	case SO_BSDCOMPAT:
+		/* Warn only; v.val stays 0 from the memset above. */
+		sock_warn_obsolete_bsdism("getsockopt");
+		break;
+
+	case SO_TIMESTAMP:
+		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+				!sock_flag(sk, SOCK_RCVTSTAMPNS);
+		break;
+
+	case SO_TIMESTAMPNS:
+		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+		break;
+
+	case SO_TIMESTAMPING:
+		/* Rebuild the SOF_* bitmask from the individual socket flags. */
+		v.val = 0;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
+			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
+			v.val |= SOF_TIMESTAMPING_SOFTWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+		break;
+
+	case SO_RCVTIMEO:
+		/* MAX_SCHEDULE_TIMEOUT is reported as a zero timeval. */
+		lv = sizeof(struct timeval);
+		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+			v.tm.tv_sec = 0;
+			v.tm.tv_usec = 0;
+		} else {
+			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+		}
+		break;
+
+	case SO_SNDTIMEO:
+		/* MAX_SCHEDULE_TIMEOUT is reported as a zero timeval. */
+		lv = sizeof(struct timeval);
+		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+			v.tm.tv_sec = 0;
+			v.tm.tv_usec = 0;
+		} else {
+			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+		}
+		break;
+
+	case SO_RCVLOWAT:
+		v.val = sk->sk_rcvlowat;
+		break;
+
+	case SO_SNDLOWAT:
+		/* Always reported as 1. */
+		v.val = 1;
+		break;
+
+	case SO_PASSCRED:
+		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
+		break;
+
+	case SO_PEERCRED:
+	{
+		/* Copied out directly, truncated to the caller's length. */
+		struct ucred peercred;
+		if (len > sizeof(peercred))
+			len = sizeof(peercred);
+		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+		if (copy_to_user(optval, &peercred, len))
+			return -EFAULT;
+		goto lenout;
+	}
+
+	case SO_PEERNAME:
+	{
+		char address[128];
+
+		/* getname() stores the address length in lv; a request for
+		 * more bytes than that is rejected rather than truncated.
+		 */
+		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+			return -ENOTCONN;
+		if (lv < len)
+			return -EINVAL;
+		if (copy_to_user(optval, address, len))
+			return -EFAULT;
+		goto lenout;
+	}
+
+	/* Dubious BSD thing... Probably nobody even uses it, but
+	 * the UNIX standard wants it for whatever reason... -DaveM
+	 */
+	case SO_ACCEPTCONN:
+		v.val = sk->sk_state == TCP_LISTEN;
+		break;
+
+	case SO_PASSSEC:
+		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
+		break;
+
+	case SO_PEERSEC:
+		return security_socket_getpeersec_stream(sock, optval, optlen, len);
+
+	case SO_MARK:
+		v.val = sk->sk_mark;
+		break;
+
+	case SO_RXQ_OVFL:
+		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	/* Never copy more than the option's own size. */
+	if (len > lv)
+		len = lv;
+	if (copy_to_user(optval, &v, len))
+		return -EFAULT;
+lenout:
+	if (put_user(len, optlen))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Set up sk->sk_lock for @sk.
+ *
+ * The lock is registered with the lock validator using class keys and
+ * names selected by the socket's address family (sk->sk_family).
+ */
+static inline void sock_lock_init(struct sock *sk)
+{
+	const int family = sk->sk_family;
+
+	sock_lock_init_class_and_name(sk,
+			af_family_slock_key_strings[family],
+			&af_family_slock_keys[family],
+			af_family_key_strings[family],
+			&af_family_keys[family]);
+}
+
+/*
+ * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
+ * even temporarily, because of RCU lookups. sk_node should also be left as is.
+ * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
+ *
+ * The copy is done in two pieces: everything before sk_dontcopy_begin,
+ * then everything from sk_dontcopy_end up to the protocol's obj_size.
+ * With CONFIG_SECURITY_NETWORK, nsk's own sk_security pointer is saved
+ * across the copy, restored, and then security_sk_clone() is called.
+ */
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+	void *sptr = nsk->sk_security;
+#endif
+	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
+
+	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+#ifdef CONFIG_SECURITY_NETWORK
+	nsk->sk_security = sptr;
+	security_sk_clone(osk, nsk);
+#endif
+}
+
+/*
+ * Zero a sock object of @size bytes while leaving sk_node.next alone.
+ *
+ * Caches created with SLAB_DESTROY_BY_RCU must keep the .next pointer
+ * of nulls nodes unmodified, so the bytes before sk_node.next and the
+ * bytes from sk_node.pprev onwards are cleared separately.
+ */
+static inline void sk_prot_clear_nulls(struct sock *sk, int size)
+{
+	const size_t next_off = offsetof(struct sock, sk_node.next);
+	const size_t pprev_off = offsetof(struct sock, sk_node.pprev);
+
+	/* Leading part, if sk_node.next is not the very first member. */
+	if (next_off != 0)
+		memset(sk, 0, next_off);
+
+	/* Everything from sk_node.pprev to the end of the object. */
+	memset(&sk->sk_node.pprev, 0, size - pprev_off);
+}
+
+/*
+ * Zero a sock object of @size bytes, skipping one pointer-sized word at
+ * each of two offsets: skc_node.next and skc_portaddr_node.next.
+ *
+ * The two offsets are ordered first so the object can be cleared in up
+ * to three runs: before the lower one, between the two, and after the
+ * higher one.
+ */
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+	unsigned long nulls1, nulls2;
+
+	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+	/* Make nulls1 the lower offset. */
+	if (nulls1 > nulls2)
+		swap(nulls1, nulls2);
+
+	if (nulls1 != 0)
+		memset((char *)sk, 0, nulls1);
+	memset((char *)sk + nulls1 + sizeof(void *), 0,
+	       nulls2 - nulls1 - sizeof(void *));
+	memset((char *)sk + nulls2 + sizeof(void *), 0,
+	       size - nulls2 - sizeof(void *));
+}
+EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
+
+/*
+ * Allocate the memory for a sock of protocol @prot.
+ *
+ * @prot:     protocol supplying the slab cache (or, lacking one, the
+ *            object size for kmalloc)
+ * @priority: allocation flags; __GFP_ZERO asks for a cleared object
+ * @family:   passed on to security_sk_alloc()
+ *
+ * On success the object has its security state allocated, holds a
+ * reference on prot->owner and has its tx queue mapping cleared.
+ * Returns NULL if the allocation, the security hook or the module
+ * reference fails; nothing stays allocated in that case.
+ */
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+		int family)
+{
+	struct sock *sk;
+	struct kmem_cache *slab;
+
+	slab = prot->slab;
+	if (slab != NULL) {
+		/* __GFP_ZERO is stripped for the slab allocation; zeroing is
+		 * done by hand below, through prot->clear_sk when provided.
+		 */
+		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
+		if (!sk)
+			return sk;
+		if (priority & __GFP_ZERO) {
+			if (prot->clear_sk)
+				prot->clear_sk(sk, prot->obj_size);
+			else
+				sk_prot_clear_nulls(sk, prot->obj_size);
+		}
+	} else
+		sk = kmalloc(prot->obj_size, priority);
+
+	if (sk != NULL) {
+		kmemcheck_annotate_bitfield(sk, flags);
+
+		if (security_sk_alloc(sk, family, priority))
+			goto out_free;
+
+		if (!try_module_get(prot->owner))
+			goto out_free_sec;
+		sk_tx_queue_clear(sk);
+	}
+
+	return sk;
+
+out_free_sec:
+	security_sk_free(sk);
+out_free:
+	/* Free with whichever allocator the object came from. */
+	if (slab != NULL)
+		kmem_cache_free(slab, sk);
+	else
+		kfree(sk);
+	return NULL;
+}
+
+/*
+ * Release the memory of @sk, which was allocated for protocol @prot.
+ *
+ * The security state is freed first, then the object goes back to the
+ * protocol's slab cache (or to kfree() when there is no cache), and
+ * finally the reference on the protocol's owning module is dropped.
+ * Both prot fields are read before anything is freed.
+ */
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+	struct kmem_cache *slab = prot->slab;
+	struct module *owner = prot->owner;
+
+	security_sk_free(sk);
+
+	if (slab == NULL)
+		kfree(sk);
+	else
+		kmem_cache_free(slab, sk);
+
+	module_put(owner);
+}
+
+#ifdef CONFIG_CGROUPS
+/*
+ * Copy the current task's class id, as returned by task_cls_classid(),
+ * into sk->sk_classid.  A class id of zero, or one equal to the stored
+ * value, leaves the socket untouched.
+ */
+void sock_update_classid(struct sock *sk)
+{
+	u32 classid;
+
+	rcu_read_lock();  /* doing current task, which cannot vanish. */
+	classid = task_cls_classid(current);
+	rcu_read_unlock();
+
+	if (!classid || classid == sk->sk_classid)
+		return;
+
+	sk->sk_classid = classid;
+}
+EXPORT_SYMBOL(sock_update_classid);
+#endif
+
+/**
+ *	sk_alloc - All socket objects are allocated here
+ *	@net: the applicable net namespace
+ *	@family: protocol family
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *	@prot: struct proto associated with this new sock instance
+ *
+ *	Returns a zeroed sock with its lock initialised, a reference taken
+ *	on @net and sk_wmem_alloc set to 1, or NULL when allocation fails.
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+		      struct proto *prot)
+{
+	struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+
+	if (!sk)
+		return NULL;
+
+	sk->sk_family = family;
+	/*
+	 * Both pointers start out as @prot; the comment in the struct
+	 * sock definition explains why sk_prot_creator is kept as well.
+	 */
+	sk->sk_prot_creator = prot;
+	sk->sk_prot = prot;
+	sock_lock_init(sk);
+	sock_net_set(sk, get_net(net));
+	atomic_set(&sk->sk_wmem_alloc, 1);
+
+	sock_update_classid(sk);
+
+	return sk;
+}
+EXPORT_SYMBOL(sk_alloc);
+
+/*
+ * Tear down @sk and return its memory.
+ *
+ * Runs the socket's destructor if one is set, uncharges and detaches
+ * any attached filter, disables its timestamp flags, reports leftover
+ * option memory, drops the peer credential, peer pid and net namespace
+ * references, and finally frees the object through sk_prot_creator.
+ */
+static void __sk_free(struct sock *sk)
+{
+	struct sk_filter *filter;
+
+	if (sk->sk_destruct)
+		sk->sk_destruct(sk);
+
+	filter = rcu_dereference_check(sk->sk_filter,
+				       atomic_read(&sk->sk_wmem_alloc) == 0);
+	if (filter) {
+		sk_filter_uncharge(sk, filter);
+		rcu_assign_pointer(sk->sk_filter, NULL);
+	}
+
+	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
+	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+
+	/* Option memory still accounted at this point was never released. */
+	if (atomic_read(&sk->sk_omem_alloc))
+		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
+		       __func__, atomic_read(&sk->sk_omem_alloc));
+
+	if (sk->sk_peer_cred)
+		put_cred(sk->sk_peer_cred);
+	put_pid(sk->sk_peer_pid);
+	put_net(sock_net(sk));
+	/* Freed via sk_prot_creator rather than sk_prot. */
+	sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+/*
+ * Drop the sk_wmem_alloc count that a socket is given at allocation.
+ *
+ * If the count reaches zero the socket is destroyed right away.  A
+ * non-zero result means packets are still sitting in some tx queue,
+ * and sock_wfree() will call __sk_free() once they are released.
+ */
+void sk_free(struct sock *sk)
+{
+	if (!atomic_dec_and_test(&sk->sk_wmem_alloc))
+		return;
+
+	__sk_free(sk);
+}
+EXPORT_SYMBOL(sk_free);
+
+/*
+ * Release a kernel socket.
+ *
+ * The last sock_put() is expected to drop a reference on sk->sk_net,
+ * but that reference has already been dropped in sk_change_net, and
+ * taking a reference on a namespace that is being stopped is not an
+ * option.  So the socket is held to keep it alive while it is removed
+ * from the hash, its namespace is switched to init_net, and it is then
+ * destroyed in that context.
+ *
+ * Does nothing when @sk is NULL or has no struct socket attached.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+	if (sk == NULL)
+		return;
+	if (sk->sk_socket == NULL)
+		return;
+
+	sock_hold(sk);
+	sock_release(sk->sk_socket);
+
+	/* Swap the namespace for init_net, with a real reference. */
+	release_net(sock_net(sk));
+	sock_net_set(sk, get_net(&init_net));
+
+	sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
+/**
+ *	sk_clone - create a new sock as a copy of an existing one
+ *	@sk: socket to copy
+ *	@priority: allocation flags for the new object
+ *
+ *	The new sock starts as a raw copy of @sk (see sock_copy()); the
+ *	per-instance state - locks, queues, memory accounting, backlog,
+ *	cached route, socket pointer and wait queue - is then reset.
+ *
+ *	Returns the new sock with sk_refcnt set to 2, or NULL when the
+ *	allocation or xfrm_sk_clone_policy() fails.  bh_lock_sock() is
+ *	taken on the new sock and, on the success path, not released
+ *	inside this function.
+ */
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+{
+	struct sock *newsk;
+
+	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+	if (newsk != NULL) {
+		struct sk_filter *filter;
+
+		sock_copy(newsk, sk);
+
+		/* SANITY */
+		get_net(sock_net(newsk));
+		sk_node_init(&newsk->sk_node);
+		sock_lock_init(newsk);
+		bh_lock_sock(newsk);
+		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
+		newsk->sk_backlog.len = 0;
+
+		atomic_set(&newsk->sk_rmem_alloc, 0);
+		/*
+		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
+		 */
+		atomic_set(&newsk->sk_wmem_alloc, 1);
+		atomic_set(&newsk->sk_omem_alloc, 0);
+		skb_queue_head_init(&newsk->sk_receive_queue);
+		skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+		skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
+
+		spin_lock_init(&newsk->sk_dst_lock);
+		rwlock_init(&newsk->sk_callback_lock);
+		lockdep_set_class_and_name(&newsk->sk_callback_lock,
+				af_callback_keys + newsk->sk_family,
+				af_family_clock_key_strings[newsk->sk_family]);
+
+		newsk->sk_dst_cache	= NULL;
+		newsk->sk_wmem_queued	= 0;
+		newsk->sk_forward_alloc = 0;
+		newsk->sk_send_head	= NULL;
+		/* The clone keeps the parent's user locks except the port binding lock. */
+		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+		sock_reset_flag(newsk, SOCK_DONE);
+		skb_queue_head_init(&newsk->sk_error_queue);
+
+		/* The filter pointer was copied from the parent; charge it
+		 * to the new sock as well.
+		 */
+		filter = rcu_dereference_protected(newsk->sk_filter, 1);
+		if (filter != NULL)
+			sk_filter_charge(newsk, filter);
+
+		if (unlikely(xfrm_sk_clone_policy(newsk))) {
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			bh_unlock_sock(newsk);
+			sk_free(newsk);
+			newsk = NULL;
+			goto out;
+		}
+
+		newsk->sk_err	   = 0;
+		newsk->sk_priority = 0;
+		/*
+		 * Before updating sk_refcnt, we must commit prior changes to memory
+		 * (Documentation/RCU/rculist_nulls.txt for details)
+		 */
+		smp_wmb();
+		atomic_set(&newsk->sk_refcnt, 2);
+
+		/*
+		 * Increment the counter in the same struct proto as the master
+		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+		 * is the same as sk->sk_prot->socks, as this field was copied
+		 * with memcpy).
+		 *
+		 * This _changes_ the previous behaviour, where
+		 * tcp_create_openreq_child always was incrementing the
+		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+		 * to be taken into account in all callers. -acme
+		 */
+		sk_refcnt_debug_inc(newsk);
+		sk_set_socket(newsk, NULL);
+		newsk->sk_wq = NULL;
+
+		if (newsk->sk_prot->sockets_allocated)
+			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+
+		/* Timestamp flags were copied with the rest of the parent;
+		 * net_enable_timestamp() is called once for the clone if
+		 * either is set.
+		 */
+		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
+		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+			net_enable_timestamp();
+	}
+out:
+	return newsk;
+}
+EXPORT_SYMBOL_GPL(sk_clone);
+
+/*
+ * Install @dst as the socket's route and derive sk_route_caps from the
+ * features of the route's device.
+ *
+ * NETIF_F_GSO implies the software GSO set, and anything listed in
+ * sk_route_nocaps is masked out.  If GSO is still possible afterwards,
+ * a route with a non-zero header_len loses all GSO capability bits;
+ * otherwise scatter-gather and hardware checksum are added and the
+ * device's gso_max_size is recorded.
+ */
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+	__sk_dst_set(sk, dst);
+
+	sk->sk_route_caps = dst->dev->features;
+	if (sk->sk_route_caps & NETIF_F_GSO)
+		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+	sk->sk_route_caps &= ~sk->sk_route_nocaps;
+
+	if (!sk_can_gso(sk))
+		return;
+
+	if (dst->header_len) {
+		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		return;
+	}
+
+	sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+	sk->sk_gso_max_size = dst->dev->gso_max_size;
+}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
+
+/*
+ * Boot-time adjustment of the socket buffer sysctls to the amount of
+ * memory in the machine:
+ *  - totalram_pages >= 131072: raise both maxima to 131071;
+ *  - totalram_pages <= 4096: cap maxima and defaults at 32767;
+ *  - anything in between keeps the compile-time values.
+ */
+void __init sk_init(void)
+{
+	if (totalram_pages >= 131072) {
+		sysctl_wmem_max = 131071;
+		sysctl_rmem_max = 131071;
+		return;
+	}
+
+	if (totalram_pages > 4096)
+		return;
+
+	sysctl_wmem_max = 32767;
+	sysctl_rmem_max = 32767;
+	sysctl_wmem_default = 32767;
+	sysctl_rmem_default = 32767;
+}
+
+/*
+ *	Simple resource managers for sockets.
+ */
+
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ *
+ * Gives skb->truesize back to the owning socket's sk_wmem_alloc.
+ * Unless the socket has SOCK_USE_WRITE_QUEUE set, sk_write_space() is
+ * called as well.  If the count drops to zero here, the socket itself
+ * is destroyed with __sk_free().
+ */
+void sock_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	unsigned int len = skb->truesize;
+
+	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+		/*
+		 * Keep a reference on sk_wmem_alloc, this will be released
+		 * after sk_write_space() call
+		 */
+		atomic_sub(len - 1, &sk->sk_wmem_alloc);
+		sk->sk_write_space(sk);
+		/* Only the one unit held back above is left to subtract. */
+		len = 1;
+	}
+	/*
+	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
+	 * could not do because of in-flight packets
+	 */
+	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+		__sk_free(sk);
+}
+EXPORT_SYMBOL(sock_wfree);
+
+/*
+ * Read buffer destructor automatically called from kfree_skb.
+ *
+ * Subtracts the buffer's truesize from the owning socket's
+ * sk_rmem_alloc and uncharges the same amount with sk_mem_uncharge().
+ */
+void sock_rfree(struct sk_buff *skb)
+{
+	unsigned int truesize = skb->truesize;
+	struct sock *owner = skb->sk;
+
+	atomic_sub(truesize, &owner->sk_rmem_alloc);
+	sk_mem_uncharge(owner, truesize);
+}
+EXPORT_SYMBOL(sock_rfree);
+
+
+/*
+ * Return the i_uid of the inode behind sk->sk_socket, or 0 when the
+ * sock has no struct socket attached.  sk_callback_lock is read-locked
+ * around the lookup.
+ */
+int sock_i_uid(struct sock *sk)
+{
+	int uid = 0;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_socket)
+		uid = SOCK_INODE(sk->sk_socket)->i_uid;
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	return uid;
+}
+EXPORT_SYMBOL(sock_i_uid);
+
+/*
+ * Return the i_ino of the inode behind sk->sk_socket, or 0 when the
+ * sock has no struct socket attached.  sk_callback_lock is read-locked
+ * around the lookup.
+ */
+unsigned long sock_i_ino(struct sock *sk)
+{
+	unsigned long ino = 0;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_socket)
+		ino = SOCK_INODE(sk->sk_socket)->i_ino;
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	return ino;
+}
+EXPORT_SYMBOL(sock_i_ino);
+
+/*
+ * Allocate a skb from the socket's send buffer.
+ *
+ * The allocation is attempted when @force is non-zero or when
+ * sk_wmem_alloc is still below sk_sndbuf.  On success the skb is
+ * charged to @sk as a write buffer.  Returns NULL when the send buffer
+ * is full or alloc_skb() fails.
+ */
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+			     gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	if (!force && atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
+		return NULL;
+
+	skb = alloc_skb(size, priority);
+	if (skb)
+		skb_set_owner_w(skb, sk);
+	return skb;
+}
+EXPORT_SYMBOL(sock_wmalloc);
+
+/*
+ * Allocate a skb from the socket's receive buffer.
+ *
+ * The allocation is attempted when @force is non-zero or when
+ * sk_rmem_alloc is still below sk_rcvbuf.  On success the skb is
+ * charged to @sk as a read buffer.  Returns NULL when the receive
+ * buffer is full or alloc_skb() fails.
+ */
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+			     gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	if (!force && atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return NULL;
+
+	skb = alloc_skb(size, priority);
+	if (skb)
+		skb_set_owner_r(skb, sk);
+	return skb;
+}
+
+/*
+ * Allocate a memory block from the socket's option memory buffer.
+ *
+ * The request is refused (NULL) when @size alone exceeds
+ * sysctl_optmem_max, or when adding it to sk_omem_alloc would reach
+ * that limit.  Otherwise @size is charged to sk_omem_alloc and the
+ * block is allocated; the charge is undone if kmalloc() fails.
+ */
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
+{
+	void *mem;
+
+	if ((unsigned)size > sysctl_optmem_max)
+		return NULL;
+	if (atomic_read(&sk->sk_omem_alloc) + size >= sysctl_optmem_max)
+		return NULL;
+
+	/* First do the add, to avoid the race if kmalloc
+	 * might sleep.
+	 */
+	atomic_add(size, &sk->sk_omem_alloc);
+	mem = kmalloc(size, priority);
+	if (!mem)
+		atomic_sub(size, &sk->sk_omem_alloc);
+	return mem;
+}
+EXPORT_SYMBOL(sock_kmalloc);
+
+/*
+ * Free an option memory block.
+ *
+ * @mem is released with kfree() and @size is subtracted from the
+ * socket's sk_omem_alloc; @size is taken on trust from the caller.
+ */
+void sock_kfree_s(struct sock *sk, void *mem, int size)
+{
+	kfree(mem);
+	atomic_sub(size, &sk->sk_omem_alloc);
+}
+EXPORT_SYMBOL(sock_kfree_s);
+
+/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
+   I think, these locks should be removed for datagram sockets.
+
+   Sleeps interruptibly on the socket's wait queue until send buffer
+   space is available (sk_wmem_alloc < sk_sndbuf), the send side is
+   shut down, an error is pending, a signal arrives or the timeout
+   runs out.  Returns the remaining timeout.
+ */
+static long sock_wait_for_wmem(struct sock *sk, long timeo)
+{
+	DEFINE_WAIT(wait);
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+	for (;;) {
+		if (!timeo)
+			break;
+		if (signal_pending(current))
+			break;
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		/* The wake-up conditions are tested only after the task is
+		 * on the wait queue.
+		 */
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+			break;
+		if (sk->sk_shutdown & SEND_SHUTDOWN)
+			break;
+		if (sk->sk_err)
+			break;
+		timeo = schedule_timeout(timeo);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return timeo;
+}
+
+
+/*
+ *	Generic send/receive buffer handlers
+ */
+
+/*
+ * Allocate an skb for sending on @sk, waiting for send buffer space if
+ * necessary.
+ *
+ * @header_len: size of the linear part, passed to alloc_skb()
+ * @data_len:   number of bytes to provide as page fragments
+ * @noblock:    passed to sock_sndtimeo() to obtain the wait timeout
+ * @errcode:    receives the negative errno when NULL is returned
+ *
+ * Returns the skb, charged to @sk as a write buffer, or NULL with
+ * *errcode set to: -EMSGSIZE when @data_len needs more than
+ * MAX_SKB_FRAGS pages, a pending socket error, -EPIPE after a send
+ * shutdown, -ENOBUFS when memory allocation fails, -EAGAIN when the
+ * timeout is exhausted, or the sock_intr_errno() value when a signal
+ * is pending.
+ */
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+				     unsigned long data_len, int noblock,
+				     int *errcode)
+{
+	struct sk_buff *skb;
+	gfp_t gfp_mask;
+	long timeo;
+	int err;
+	/* Number of pages needed for data_len, rounded up. */
+	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+
+	err = -EMSGSIZE;
+	if (npages > MAX_SKB_FRAGS)
+		goto failure;
+
+	gfp_mask = sk->sk_allocation;
+	if (gfp_mask & __GFP_WAIT)
+		gfp_mask |= __GFP_REPEAT;
+
+	timeo = sock_sndtimeo(sk, noblock);
+	while (1) {
+		err = sock_error(sk);
+		if (err != 0)
+			goto failure;
+
+		err = -EPIPE;
+		if (sk->sk_shutdown & SEND_SHUTDOWN)
+			goto failure;
+
+		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+			skb = alloc_skb(header_len, gfp_mask);
+			if (skb) {
+				int i;
+
+				/* No pages, we're done... */
+				if (!data_len)
+					break;
+
+				skb->truesize += data_len;
+				skb_shinfo(skb)->nr_frags = npages;
+				for (i = 0; i < npages; i++) {
+					struct page *page;
+					skb_frag_t *frag;
+
+					page = alloc_pages(sk->sk_allocation, 0);
+					if (!page) {
+						/* Only i fragments were
+						 * filled in; record that
+						 * before freeing the skb.
+						 */
+						err = -ENOBUFS;
+						skb_shinfo(skb)->nr_frags = i;
+						kfree_skb(skb);
+						goto failure;
+					}
+
+					/* Every fragment is a full page
+					 * except possibly the last one.
+					 */
+					frag = &skb_shinfo(skb)->frags[i];
+					frag->page = page;
+					frag->page_offset = 0;
+					frag->size = (data_len >= PAGE_SIZE ?
+						      PAGE_SIZE :
+						      data_len);
+					data_len -= PAGE_SIZE;
+				}
+
+				/* Full success... */
+				break;
+			}
+			err = -ENOBUFS;
+			goto failure;
+		}
+		/* Send buffer is full: flag it, then wait or give up. */
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		err = -EAGAIN;
+		if (!timeo)
+			goto failure;
+		if (signal_pending(current))
+			goto interrupted;
+		timeo = sock_wait_for_wmem(sk, timeo);
+	}
+
+	skb_set_owner_w(skb, sk);
+	return skb;
+
+interrupted:
+	err = sock_intr_errno(timeo);
+failure:
+	*errcode = err;
+	return NULL;
+}
+EXPORT_SYMBOL(sock_alloc_send_pskb);
+
+/*
+ * Allocate a send skb with a linear area of @size bytes and no page
+ * fragments: sock_alloc_send_pskb() called with a data_len of 0.
+ * @noblock and @errcode are passed through unchanged.
+ */
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+				    int noblock, int *errcode)
+{
+	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+}
+EXPORT_SYMBOL(sock_alloc_send_skb);
+
+/*
+ * Sleep until the socket is no longer owned by a user context.
+ *
+ * Per the __releases/__acquires annotations, sk_lock.slock is held on
+ * entry and on return; it is dropped around each schedule() and
+ * re-taken before ownership is re-checked.  The wait is exclusive and
+ * uninterruptible.
+ */
+static void __lock_sock(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+	__acquires(&sk->sk_lock.slock)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+		spin_unlock_bh(&sk->sk_lock.slock);
+		schedule();
+		spin_lock_bh(&sk->sk_lock.slock);
+		if (!sock_owned_by_user(sk))
+			break;
+	}
+	finish_wait(&sk->sk_lock.wq, &wait);
+}
+
+/*
+ * Drain sk->sk_backlog by passing every queued skb to sk_backlog_rcv().
+ *
+ * The caller guarantees the backlog is non-empty (the inner loop
+ * dereferences the head unconditionally).  The socket spinlock is held on
+ * entry and on return; it is released with bh_unlock_sock() while a
+ * detached chain of skbs is being processed.
+ */
+static void __release_sock(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+	__acquires(&sk->sk_lock.slock)
+{
+	struct sk_buff *skb = sk->sk_backlog.head;
+
+	do {
+		/* Detach the current chain so new arrivals start a fresh one. */
+		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+		bh_unlock_sock(sk);
+
+		do {
+			struct sk_buff *next = skb->next;
+
+			WARN_ON_ONCE(skb_dst_is_noref(skb));
+			skb->next = NULL;
+			sk_backlog_rcv(sk, skb);
+
+			/*
+			 * We are in process context here with softirqs
+			 * disabled, use cond_resched_softirq() to preempt.
+			 * This is safe to do because we've taken the backlog
+			 * queue private:
+			 */
+			cond_resched_softirq();
+
+			skb = next;
+		} while (skb != NULL);
+
+		bh_lock_sock(sk);
+		/* Anything queued while we were unlocked is handled next. */
+	} while ((skb = sk->sk_backlog.head) != NULL);
+
+	/*
+	 * Doing the zeroing here guarantees that we cannot loop forever
+	 * while a wild producer attempts to flood us.
+	 */
+	sk->sk_backlog.len = 0;
+}
+
+/**
+ * sk_wait_data - wait for data to arrive at sk_receive_queue
+ * @sk:    sock to wait on
+ * @timeo: for how long
+ *
+ * Now socket state including sk->sk_err is changed only under lock,
+ * hence we may omit checks after joining wait queue.
+ * We check receive queue before schedule() only as optimization;
+ * it is very likely that release_sock() added new data.
+ *
+ * SOCK_ASYNC_WAITDATA is set on the socket for the duration of the wait.
+ * Returns whatever sk_wait_event() yields for the condition "receive
+ * queue is not empty" (presumably non-zero when data is queued -- confirm
+ * against the sk_wait_event() definition).
+ */
+int sk_wait_data(struct sock *sk, long *timeo)
+{
+	int rc;
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	finish_wait(sk_sleep(sk), &wait);
+	return rc;
+}
+EXPORT_SYMBOL(sk_wait_data);
+
+/**
+ *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *	@sk: socket
+ *	@size: memory size to allocate
+ *	@kind: allocation type
+ *
+ *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ *	rmem allocation. This function assumes that protocols which have
+ *	memory_pressure use sk_wmem_queued as write buffer accounting.
+ *
+ *	The charge is applied up front and rolled back if it is refused.
+ *	Returns 1 if the allocation is accepted (the charge stays in place)
+ *	and 0 if it is suppressed (the charge has been undone).
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+	struct proto *prot = sk->sk_prot;
+	int amt = sk_mem_pages(size);
+	long allocated;
+
+	/* Charge optimistically: per-socket in bytes, per-protocol in amt units. */
+	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+	allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+	/* Under limit. */
+	if (allocated <= prot->sysctl_mem[0]) {
+		if (prot->memory_pressure && *prot->memory_pressure)
+			*prot->memory_pressure = 0;
+		return 1;
+	}
+
+	/* Under pressure. */
+	if (allocated > prot->sysctl_mem[1])
+		if (prot->enter_memory_pressure)
+			prot->enter_memory_pressure(sk);
+
+	/* Over hard limit. */
+	if (allocated > prot->sysctl_mem[2])
+		goto suppress_allocation;
+
+	/* guarantee minimum buffer size under pressure */
+	if (kind == SK_MEM_RECV) {
+		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+			return 1;
+	} else { /* SK_MEM_SEND */
+		if (sk->sk_type == SOCK_STREAM) {
+			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+				return 1;
+		} else if (atomic_read(&sk->sk_wmem_alloc) <
+			   prot->sysctl_wmem[0])
+				return 1;
+	}
+
+	if (prot->memory_pressure) {
+		int alloc;
+
+		/* The protocol tracks pressure but is not under it: accept. */
+		if (!*prot->memory_pressure)
+			return 1;
+		/*
+		 * Accept if this socket's page usage, multiplied by the
+		 * number of allocated sockets, stays below the hard limit.
+		 */
+		alloc = percpu_counter_read_positive(prot->sockets_allocated);
+		if (prot->sysctl_mem[2] > alloc *
+		    sk_mem_pages(sk->sk_wmem_queued +
+				 atomic_read(&sk->sk_rmem_alloc) +
+				 sk->sk_forward_alloc))
+			return 1;
+	}
+
+suppress_allocation:
+
+	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+		sk_stream_moderate_sndbuf(sk);
+
+		/* Fail only if socket is _under_ its sndbuf.
+		 * In this case we cannot block, so that we have to fail.
+		 */
+		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+			return 1;
+	}
+
+	/* Alas. Undo changes. */
+	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+	atomic_long_sub(amt, prot->memory_allocated);
+	return 0;
+}
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ *	__sk_mem_reclaim - reclaim memory_allocated
+ *	@sk: socket
+ *
+ *	Gives the whole-quantum part of sk_forward_alloc
+ *	(sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT) back to the protocol's
+ *	memory_allocated counter and keeps only the bits below
+ *	SK_MEM_QUANTUM in sk_forward_alloc.  The protocol's memory_pressure
+ *	flag, if set, is cleared once memory_allocated is below sysctl_mem[0].
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+
+	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+		   prot->memory_allocated);
+	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+	if (prot->memory_pressure && *prot->memory_pressure &&
+	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+		*prot->memory_pressure = 0;
+}
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
+/*
+ * Set of default routines for initialising struct proto_ops when
+ * the protocol does not support a particular function. In certain
+ * cases where it makes no sense for a protocol to have a "do nothing"
+ * function, some default processing is provided.
+ */
+
+/* bind() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_bind);
+
+/* connect() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+		    int len, int flags)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_connect);
+
+/* socketpair() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_socketpair);
+
+/* accept() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_accept);
+
+/* getname() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
+		    int *len, int peer)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_getname);
+
+/* poll() placeholder: returns an empty event mask (0). */
+unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
+{
+	return 0;
+}
+EXPORT_SYMBOL(sock_no_poll);
+
+/* ioctl() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_ioctl);
+
+/* listen() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_listen(struct socket *sock, int backlog)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_listen);
+
+/* shutdown() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_shutdown(struct socket *sock, int how)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_shutdown);
+
+/* setsockopt() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_setsockopt(struct socket *sock, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_setsockopt);
+
+/* getsockopt() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_getsockopt(struct socket *sock, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_getsockopt);
+
+/* sendmsg() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+		    size_t len)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_sendmsg);
+
+/* recvmsg() placeholder: always fails with -EOPNOTSUPP. */
+int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+		    size_t len, int flags)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(sock_no_recvmsg);
+
+/* mmap() placeholder: always fails, with -ENODEV rather than -EOPNOTSUPP. */
+int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
+{
+	/* Mirror missing mmap method error code */
+	return -ENODEV;
+}
+EXPORT_SYMBOL(sock_no_mmap);
+
+/*
+ * Fallback sendpage: map the page, describe @size bytes starting at @offset
+ * with a single kvec and push them through kernel_sendmsg() using @flags as
+ * the message flags.  Returns the result of kernel_sendmsg().
+ */
+ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+	struct msghdr hdr = { .msg_flags = flags };
+	char *mapped = kmap(page);
+	struct kvec vec = {
+		.iov_base = mapped + offset,
+		.iov_len = size,
+	};
+	ssize_t sent;
+
+	sent = kernel_sendmsg(sock, &hdr, &vec, 1, size);
+	/* The mapping is only needed for the duration of the send. */
+	kunmap(page);
+
+	return sent;
+}
+EXPORT_SYMBOL(sock_no_sendpage);
+
+/*
+ *	Default Socket Callbacks
+ */
+
+/*
+ * Default sk_state_change callback (installed by sock_init_data()): wake all
+ * interruptible sleepers on the socket's wait queue, if there are any.
+ * sk->sk_wq is read under rcu_read_lock().
+ */
+static void sock_def_wakeup(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_all(&wq->wait);
+	rcu_read_unlock();
+}
+
+/*
+ * Default sk_error_report callback (installed by sock_init_data()): wake
+ * sleepers with POLLERR and send the SOCK_WAKE_IO / POLL_ERR async
+ * notification.
+ */
+static void sock_def_error_report(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_poll(&wq->wait, POLLERR);
+	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+	rcu_read_unlock();
+}
+
+/*
+ * Default sk_data_ready callback (installed by sock_init_data()): wake
+ * sleepers waiting for readability and send the SOCK_WAKE_WAITD / POLL_IN
+ * async notification.  @len is not used here.
+ */
+static void sock_def_readable(struct sock *sk, int len)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+						POLLRDNORM | POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+/*
+ * Default sk_write_space callback (installed by sock_init_data()): once at
+ * most half of sk_sndbuf is in use, wake sleepers waiting for writability
+ * and, if sock_writeable() agrees, send the SOCK_WAKE_SPACE / POLL_OUT
+ * async notification.
+ */
+static void sock_def_write_space(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+
+	/* Do not wake up a writer until he can make "significant"
+	 * progress.  --DaveM
+	 */
+	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
+						POLLWRNORM | POLLWRBAND);
+
+		/* Should agree with poll, otherwise some programs break */
+		if (sock_writeable(sk))
+			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	}
+
+	rcu_read_unlock();
+}
+
+/*
+ * Default sk_destruct callback (installed by sock_init_data()): release the
+ * protocol-private data hanging off sk_protinfo.
+ */
+static void sock_def_destruct(struct sock *sk)
+{
+	kfree(sk->sk_protinfo);
+}
+
+/*
+ * Deliver SIGURG to the owner of the file backing @sk, if there is one, and
+ * follow up with the SOCK_WAKE_URG / POLL_PRI async notification when
+ * send_sigurg() returns non-zero.
+ */
+void sk_send_sigurg(struct sock *sk)
+{
+	/* Nothing to signal without a socket that is backed by a file. */
+	if (!sk->sk_socket || !sk->sk_socket->file)
+		return;
+
+	if (!send_sigurg(&sk->sk_socket->file->f_owner))
+		return;
+
+	sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
+}
+EXPORT_SYMBOL(sk_send_sigurg);
+
+/*
+ * (Re)arm @timer to fire at @expires.  A reference on @sk is taken when
+ * mod_timer() returns 0 (i.e. the timer was not already pending -- see the
+ * timer API), so each pending timer holds one socket reference.
+ */
+void sk_reset_timer(struct sock *sk, struct timer_list* timer,
+		    unsigned long expires)
+{
+	if (!mod_timer(timer, expires))
+		sock_hold(sk);
+}
+EXPORT_SYMBOL(sk_reset_timer);
+
+/*
+ * Cancel @timer.  If it was pending and del_timer() reports that it was
+ * deactivated, drop a socket reference with __sock_put() (the counterpart
+ * of the sock_hold() in sk_reset_timer()).
+ */
+void sk_stop_timer(struct sock *sk, struct timer_list* timer)
+{
+	if (timer_pending(timer) && del_timer(timer))
+		__sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer);
+
+/*
+ * Initialise the generic part of a freshly allocated sock: queues, timer,
+ * buffer sizes, state, default callbacks, timeouts and reference count.
+ * @sock may be NULL, in which case the sock gets no wait queue and sk_type
+ * is left untouched.
+ */
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+	skb_queue_head_init(&sk->sk_receive_queue);
+	skb_queue_head_init(&sk->sk_write_queue);
+	skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+	skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
+
+	sk->sk_send_head	=	NULL;
+
+	init_timer(&sk->sk_timer);
+
+	sk->sk_allocation	=	GFP_KERNEL;
+	sk->sk_rcvbuf		=	sysctl_rmem_default;
+	sk->sk_sndbuf		=	sysctl_wmem_default;
+	sk->sk_state		=	TCP_CLOSE;
+	sk_set_socket(sk, sock);
+
+	sock_set_flag(sk, SOCK_ZAPPED);
+
+	/* Cross-link sock and socket and share the socket's wait queue. */
+	if (sock) {
+		sk->sk_type	=	sock->type;
+		sk->sk_wq	=	sock->wq;
+		sock->sk	=	sk;
+	} else
+		sk->sk_wq	=	NULL;
+
+	spin_lock_init(&sk->sk_dst_lock);
+	rwlock_init(&sk->sk_callback_lock);
+	/* Give sk_callback_lock a lockdep class keyed by address family. */
+	lockdep_set_class_and_name(&sk->sk_callback_lock,
+			af_callback_keys + sk->sk_family,
+			af_family_clock_key_strings[sk->sk_family]);
+
+	/* Default callbacks; protocols may override them afterwards. */
+	sk->sk_state_change	=	sock_def_wakeup;
+	sk->sk_data_ready	=	sock_def_readable;
+	sk->sk_write_space	=	sock_def_write_space;
+	sk->sk_error_report	=	sock_def_error_report;
+	sk->sk_destruct		=	sock_def_destruct;
+
+	sk->sk_sndmsg_page	=	NULL;
+	sk->sk_sndmsg_off	=	0;
+
+	sk->sk_peer_pid 	=	NULL;
+	sk->sk_peer_cred	=	NULL;
+	sk->sk_write_pending	=	0;
+	sk->sk_rcvlowat		=	1;
+	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
+	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
+
+	/* tv_sec == -1 is what sock_get_timestamp() reports as -ENOENT. */
+	sk->sk_stamp = ktime_set(-1L, 0);
+
+	/*
+	 * Before updating sk_refcnt, we must commit prior changes to memory
+	 * (Documentation/RCU/rculist_nulls.txt for details)
+	 */
+	smp_wmb();
+	atomic_set(&sk->sk_refcnt, 1);
+	atomic_set(&sk->sk_drops, 0);
+}
+EXPORT_SYMBOL(sock_init_data);
+
+/*
+ * Take ownership of the socket on behalf of a user (process) context.
+ * May sleep: if the socket is already owned, __lock_sock() waits for the
+ * owner to release it.  @subclass is passed to lockdep's mutex_acquire().
+ * On return sk_lock.owned is 1, the spinlock is released and BHs are
+ * enabled again.
+ */
+void lock_sock_nested(struct sock *sk, int subclass)
+{
+	might_sleep();
+	spin_lock_bh(&sk->sk_lock.slock);
+	if (sk->sk_lock.owned)
+		__lock_sock(sk);
+	sk->sk_lock.owned = 1;
+	/* Plain unlock: BHs are re-enabled separately at the end. */
+	spin_unlock(&sk->sk_lock.slock);
+	/*
+	 * The sk_lock has mutex_lock() semantics here:
+	 */
+	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+	local_bh_enable();
+}
+EXPORT_SYMBOL(lock_sock_nested);
+
+/*
+ * Give up user ownership of the socket: process any backlog that piled up
+ * while it was owned, clear sk_lock.owned and wake tasks waiting on
+ * sk_lock.wq for the lock.
+ */
+void release_sock(struct sock *sk)
+{
+	/*
+	 * The sk_lock has mutex_unlock() semantics:
+	 */
+	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+
+	spin_lock_bh(&sk->sk_lock.slock);
+	/* A non-NULL tail means packets were queued on the backlog. */
+	if (sk->sk_backlog.tail)
+		__release_sock(sk);
+	sk->sk_lock.owned = 0;
+	if (waitqueue_active(&sk->sk_lock.wq))
+		wake_up(&sk->sk_lock.wq);
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+EXPORT_SYMBOL(release_sock);
+
+/**
+ * lock_sock_fast - fast version of lock_sock
+ * @sk: socket
+ *
+ * This version should be used for very small sections, where the process
+ * won't block.
+ * Returns false if the fast path is taken:
+ *   sk_lock.slock locked, owned = 0, BH disabled
+ * Returns true if the slow path is taken:
+ *   sk_lock.slock unlocked, owned = 1, BH enabled
+ */
+bool lock_sock_fast(struct sock *sk)
+{
+	might_sleep();
+	spin_lock_bh(&sk->sk_lock.slock);
+
+	if (!sk->sk_lock.owned)
+		/*
+		 * Note : We must disable BH
+		 */
+		return false;
+
+	/* Owned by someone else: fall back to the sleeping slow path. */
+	__lock_sock(sk);
+	sk->sk_lock.owned = 1;
+	spin_unlock(&sk->sk_lock.slock);
+	/*
+	 * The sk_lock has mutex_lock() semantics here:
+	 */
+	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
+	local_bh_enable();
+	return true;
+}
+EXPORT_SYMBOL(lock_sock_fast);
+
+/*
+ * Copy the socket's last timestamp to user space as a struct timeval.
+ * Enables SOCK_TIMESTAMP on first use.  Returns -ENOENT while the stamp
+ * still has tv_sec == -1, -EFAULT if the copy fails, 0 otherwise.  A stamp
+ * with tv_sec == 0 is replaced by the current real time first.
+ */
+int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+{
+	struct timeval stamp;
+
+	if (!sock_flag(sk, SOCK_TIMESTAMP))
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+
+	stamp = ktime_to_timeval(sk->sk_stamp);
+	if (stamp.tv_sec == -1)
+		return -ENOENT;
+
+	if (stamp.tv_sec == 0) {
+		sk->sk_stamp = ktime_get_real();
+		stamp = ktime_to_timeval(sk->sk_stamp);
+	}
+
+	if (copy_to_user(userstamp, &stamp, sizeof(stamp)))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(sock_get_timestamp);
+
+/*
+ * Copy the socket's last timestamp to user space as a struct timespec.
+ * Enables SOCK_TIMESTAMP on first use.  Returns -ENOENT while the stamp
+ * still has tv_sec == -1, -EFAULT if the copy fails, 0 otherwise.  A stamp
+ * with tv_sec == 0 is replaced by the current real time first.
+ */
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+	struct timespec stamp;
+
+	if (!sock_flag(sk, SOCK_TIMESTAMP))
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+
+	stamp = ktime_to_timespec(sk->sk_stamp);
+	if (stamp.tv_sec == -1)
+		return -ENOENT;
+
+	if (stamp.tv_sec == 0) {
+		sk->sk_stamp = ktime_get_real();
+		stamp = ktime_to_timespec(sk->sk_stamp);
+	}
+
+	if (copy_to_user(userstamp, &stamp, sizeof(stamp)))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
+
+/*
+ * Set the timestamping-related socket flag @flag (SOCK_TIMESTAMP or
+ * SOCK_TIMESTAMPING_RX_SOFTWARE) if it is not set yet, and call
+ * net_enable_timestamp() only when the other of the two flags is not
+ * already set.
+ */
+void sock_enable_timestamp(struct sock *sk, int flag)
+{
+	if (!sock_flag(sk, flag)) {
+		sock_set_flag(sk, flag);
+		/*
+		 * we just set one of the two flags which require net
+		 * time stamping, but time stamping might have been on
+		 * already because of the other one
+		 */
+		if (!sock_flag(sk,
+				flag == SOCK_TIMESTAMP ?
+				SOCK_TIMESTAMPING_RX_SOFTWARE :
+				SOCK_TIMESTAMP))
+			net_enable_timestamp();
+	}
+}
+
+/*
+ *	Get a socket option on a socket.
+ *
+ *	FIX: POSIX 1003.1g is very ambiguous here. It states that
+ *	asynchronous errors should be reported by getsockopt. We assume
+ *	this means if you specify SO_ERROR (otherwise what's the point of it).
+ *
+ *	This is a plain dispatcher: the request is handed unchanged to the
+ *	protocol's ->getsockopt() and its result is returned.
+ */
+int sock_common_getsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+
+	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(sock_common_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of sock_common_getsockopt(): use the protocol's
+ * ->compat_getsockopt() when it provides one, otherwise fall back to the
+ * regular ->getsockopt().
+ */
+int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
+				  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk->sk_prot->compat_getsockopt != NULL)
+		return sk->sk_prot->compat_getsockopt(sk, level, optname,
+						      optval, optlen);
+	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
+/*
+ * Generic recvmsg: dispatch to the protocol's ->recvmsg().  MSG_DONTWAIT is
+ * split out of @flags and passed as a separate argument, with the remaining
+ * flags passed without it.  When the call does not fail (result >= 0),
+ * msg->msg_namelen is set to the address length reported by the protocol.
+ */
+int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int addr_len = 0;
+	int err;
+
+	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+				   flags & ~MSG_DONTWAIT, &addr_len);
+	if (err >= 0)
+		msg->msg_namelen = addr_len;
+	return err;
+}
+EXPORT_SYMBOL(sock_common_recvmsg);
+
+/*
+ *	Set socket options on an inet socket.
+ *
+ *	This is a plain dispatcher: the request is handed unchanged to the
+ *	protocol's ->setsockopt() and its result is returned.
+ */
+int sock_common_setsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+
+	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(sock_common_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of sock_common_setsockopt(): use the protocol's
+ * ->compat_setsockopt() when it provides one, otherwise fall back to the
+ * regular ->setsockopt().
+ */
+int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
+				  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk->sk_prot->compat_setsockopt != NULL)
+		return sk->sk_prot->compat_setsockopt(sk, level, optname,
+						      optval, optlen);
+	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
+
+/*
+ * Common release path for a sock: run the protocol's optional ->destroy(),
+ * unhash the sock, orphan it, free its xfrm policy and drop a reference
+ * with sock_put().
+ */
+void sk_common_release(struct sock *sk)
+{
+	if (sk->sk_prot->destroy)
+		sk->sk_prot->destroy(sk);
+
+	/*
+	 * Observation: when sk_common_release is called, processes have
+	 * no access to socket. But net still has.
+	 * Step one, detach it from networking:
+	 *
+	 * A. Remove from hash tables.
+	 */
+
+	sk->sk_prot->unhash(sk);
+
+	/*
+	 * In this point socket cannot receive new packets, but it is possible
+	 * that some packets are in flight because some CPU runs receiver and
+	 * did hash table lookup before we unhashed socket. They will achieve
+	 * receive queue and will be purged by socket destructor.
+	 *
+	 * Also we still have packets pending on receive queue and probably,
+	 * our own packets waiting in device queues. sock_destroy will drain
+	 * receive queue, but transmitted packets will delay socket destruction
+	 * until the last reference will be released.
+	 */
+
+	sock_orphan(sk);
+
+	xfrm_sk_free_policy(sk);
+
+	sk_refcnt_debug_release(sk);
+	sock_put(sk);
+}
+EXPORT_SYMBOL(sk_common_release);
+
+/*
+ * List of registered protocols (linked through proto->node), protected by
+ * proto_list_lock: taken for writing on (un)registration and for reading
+ * by the protocols seq_file iterator.
+ */
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+#ifdef CONFIG_PROC_FS
+/* Number of per-protocol "sockets in use" counter slots. */
+#define PROTO_INUSE_NR	64	/* should be enough for the first time */
+/* One counter per slot; instantiated per cpu, indexed by proto->inuse_idx. */
+struct prot_inuse {
+	int val[PROTO_INUSE_NR];
+};
+
+/* Bitmap of the counter slots currently handed out to protocols. */
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+/*
+ * Add @val to this cpu's in-use counter of @prot in namespace @net.
+ * Uses __this_cpu_add(), so the caller is presumably expected to prevent
+ * migration/preemption -- confirm against callers.
+ */
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+/*
+ * Sum the per-cpu in-use counters of @prot in namespace @net over all
+ * possible cpus.  A negative sum is reported as 0.
+ */
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+	int slot = prot->inuse_idx;
+	int total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += per_cpu_ptr(net->core.inuse, cpu)->val[slot];
+
+	if (total < 0)
+		return 0;
+	return total;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+/* Per-namespace init: allocate the per-cpu in-use counters. */
+static int __net_init sock_inuse_init_net(struct net *net)
+{
+	net->core.inuse = alloc_percpu(struct prot_inuse);
+	return net->core.inuse ? 0 : -ENOMEM;
+}
+
+/* Per-namespace exit: free the per-cpu in-use counters. */
+static void __net_exit sock_inuse_exit_net(struct net *net)
+{
+	free_percpu(net->core.inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+	.init = sock_inuse_init_net,
+	.exit = sock_inuse_exit_net,
+};
+
+/*
+ * Register the per-namespace counter setup at core_initcall time; failure
+ * to register is treated as fatal.
+ */
+static __init int net_inuse_init(void)
+{
+	if (register_pernet_subsys(&net_inuse_ops))
+		panic("Cannot initialize net inuse counters");
+
+	return 0;
+}
+
+core_initcall(net_inuse_init);
+#else
+/* Without network namespaces a single global per-cpu counter set is used. */
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+/* Add @val to this cpu's in-use counter of @prot; @net is ignored here. */
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+/*
+ * Sum the global per-cpu in-use counters of @prot over all possible cpus;
+ * @net is ignored here.  A negative sum is reported as 0.
+ */
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+	int slot = prot->inuse_idx;
+	int total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += per_cpu(prot_inuse, cpu).val[slot];
+
+	if (total < 0)
+		return 0;
+	return total;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+/*
+ * Give @prot the first free counter slot in proto_inuse_idx.
+ *
+ * The last slot (PROTO_INUSE_NR - 1) is never marked as used: when it is
+ * the first free one, an error is logged and the bit is left clear, so
+ * every further protocol ends up with that same slot index.
+ */
+static void assign_proto_idx(struct proto *prot)
+{
+	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+		return;
+	}
+
+	set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+/*
+ * Return @prot's counter slot to the bitmap.  The last slot is skipped
+ * because assign_proto_idx() never sets its bit.
+ */
+static void release_proto_idx(struct proto *prot)
+{
+	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+		clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+/* Without CONFIG_PROC_FS there are no in-use counters: nothing to assign. */
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+/* Without CONFIG_PROC_FS there are no in-use counters: nothing to release. */
+static inline void release_proto_idx(struct proto *prot)
+{
+}
+#endif
+
+/**
+ * proto_register - register a protocol with the socket layer
+ * @prot: protocol descriptor to register
+ * @alloc_slab: non-zero to create the slab cache for @prot's socks and,
+ *	when @prot->rsk_prot / @prot->twsk_prot are set, the caches for its
+ *	request and timewait socks
+ *
+ * On success @prot is added to proto_list and given an in-use counter slot.
+ * On failure everything created here is torn down again, and every cache
+ * and cache-name pointer that was released is reset to NULL so that no
+ * stale pointer is left behind in @prot or its sub-descriptors.
+ *
+ * Returns 0 on success or -ENOBUFS if any allocation fails.
+ */
+int proto_register(struct proto *prot, int alloc_slab)
+{
+	if (alloc_slab) {
+		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
+					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					NULL);
+
+		if (prot->slab == NULL) {
+			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
+			       prot->name);
+			goto out;
+		}
+
+		if (prot->rsk_prot != NULL) {
+			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
+			if (prot->rsk_prot->slab_name == NULL)
+				goto out_free_sock_slab;
+
+			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
+								 prot->rsk_prot->obj_size, 0,
+								 SLAB_HWCACHE_ALIGN, NULL);
+
+			if (prot->rsk_prot->slab == NULL) {
+				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+				       prot->name);
+				goto out_free_request_sock_slab_name;
+			}
+		}
+
+		if (prot->twsk_prot != NULL) {
+			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
+
+			if (prot->twsk_prot->twsk_slab_name == NULL)
+				goto out_free_request_sock_slab;
+
+			prot->twsk_prot->twsk_slab =
+				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
+						  prot->twsk_prot->twsk_obj_size,
+						  0,
+						  SLAB_HWCACHE_ALIGN |
+							prot->slab_flags,
+						  NULL);
+			if (prot->twsk_prot->twsk_slab == NULL)
+				goto out_free_timewait_sock_slab_name;
+		}
+	}
+
+	write_lock(&proto_list_lock);
+	list_add(&prot->node, &proto_list);
+	assign_proto_idx(prot);
+	write_unlock(&proto_list_lock);
+	return 0;
+
+	/*
+	 * Error unwinding, in reverse order of construction.  The labels fall
+	 * through into each other; the rsk_prot checks are needed because the
+	 * timewait failure paths are also reached when rsk_prot is NULL.
+	 */
+out_free_timewait_sock_slab_name:
+	kfree(prot->twsk_prot->twsk_slab_name);
+	prot->twsk_prot->twsk_slab_name = NULL;
+out_free_request_sock_slab:
+	if (prot->rsk_prot && prot->rsk_prot->slab) {
+		kmem_cache_destroy(prot->rsk_prot->slab);
+		prot->rsk_prot->slab = NULL;
+	}
+out_free_request_sock_slab_name:
+	if (prot->rsk_prot) {
+		kfree(prot->rsk_prot->slab_name);
+		prot->rsk_prot->slab_name = NULL;
+	}
+out_free_sock_slab:
+	kmem_cache_destroy(prot->slab);
+	prot->slab = NULL;
+out:
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(proto_register);
+
+/**
+ * proto_unregister - undo proto_register()
+ * @prot: protocol descriptor to unregister
+ *
+ * Releases @prot's in-use counter slot, unlinks it from proto_list and
+ * destroys whichever slab caches exist for it.  Each released cache pointer
+ * and cache-name pointer is reset to NULL so that the descriptor holds no
+ * stale pointers afterwards.
+ */
+void proto_unregister(struct proto *prot)
+{
+	write_lock(&proto_list_lock);
+	release_proto_idx(prot);
+	list_del(&prot->node);
+	write_unlock(&proto_list_lock);
+
+	if (prot->slab != NULL) {
+		kmem_cache_destroy(prot->slab);
+		prot->slab = NULL;
+	}
+
+	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
+		kmem_cache_destroy(prot->rsk_prot->slab);
+		kfree(prot->rsk_prot->slab_name);
+		prot->rsk_prot->slab_name = NULL;
+		prot->rsk_prot->slab = NULL;
+	}
+
+	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
+		kfree(prot->twsk_prot->twsk_slab_name);
+		prot->twsk_prot->twsk_slab_name = NULL;
+		prot->twsk_prot->twsk_slab = NULL;
+	}
+}
+EXPORT_SYMBOL(proto_unregister);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file iterator over proto_list.  The list is read-locked in ->start()
+ * and unlocked in ->stop().  seq_list_start_head() makes the list head
+ * itself the first element, which proto_seq_show() uses for the header row.
+ */
+static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(proto_list_lock)
+{
+	read_lock(&proto_list_lock);
+	return seq_list_start_head(&proto_list, *pos);
+}
+
+/* Advance the iterator to the next entry of proto_list. */
+static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &proto_list, pos);
+}
+
+/* End of an iteration pass: drop the read lock taken in ->start(). */
+static void proto_seq_stop(struct seq_file *seq, void *v)
+	__releases(proto_list_lock)
+{
+	read_unlock(&proto_list_lock);
+}
+
+/* Flag character for one protocol method: 'y' if @method is set, else 'n'. */
+static char proto_method_implemented(const void *method)
+{
+	if (method)
+		return 'y';
+	return 'n';
+}
+
+/*
+ * Emit one row for @proto: name, object size, sockets in use (for the
+ * seq_file's namespace), memory_allocated (-1 if the protocol has no such
+ * counter), memory pressure ("yes"/"no", or "NI" if the protocol has no
+ * pressure flag), max_header, whether a slab cache exists, the owning
+ * module, and one y/n flag per protocol method in the order of the header
+ * row printed by proto_seq_show().
+ */
+static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
+{
+	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
+			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+		   proto->name,
+		   proto->obj_size,
+		   sock_prot_inuse_get(seq_file_net(seq), proto),
+		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
+		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+		   proto->max_header,
+		   proto->slab == NULL ? "no" : "yes",
+		   module_name(proto->owner),
+		   proto_method_implemented(proto->close),
+		   proto_method_implemented(proto->connect),
+		   proto_method_implemented(proto->disconnect),
+		   proto_method_implemented(proto->accept),
+		   proto_method_implemented(proto->ioctl),
+		   proto_method_implemented(proto->init),
+		   proto_method_implemented(proto->destroy),
+		   proto_method_implemented(proto->shutdown),
+		   proto_method_implemented(proto->setsockopt),
+		   proto_method_implemented(proto->getsockopt),
+		   proto_method_implemented(proto->sendmsg),
+		   proto_method_implemented(proto->recvmsg),
+		   proto_method_implemented(proto->sendpage),
+		   proto_method_implemented(proto->bind),
+		   proto_method_implemented(proto->backlog_rcv),
+		   proto_method_implemented(proto->hash),
+		   proto_method_implemented(proto->unhash),
+		   proto_method_implemented(proto->get_port),
+		   proto_method_implemented(proto->enter_memory_pressure));
+}
+
+/*
+ * seq_file ->show(): the list head stands for the header row; every other
+ * element is a struct proto linked through its node member.  Always
+ * returns 0.
+ */
+static int proto_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == &proto_list)
+		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
+			   "protocol",
+			   "size",
+			   "sockets",
+			   "memory",
+			   "press",
+			   "maxhdr",
+			   "slab",
+			   "module",
+			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+	else
+		proto_seq_printf(seq, list_entry(v, struct proto, node));
+	return 0;
+}
+
+/* Iterator callbacks for the "protocols" seq_file. */
+static const struct seq_operations proto_seq_ops = {
+	.start  = proto_seq_start,
+	.next   = proto_seq_next,
+	.stop   = proto_seq_stop,
+	.show   = proto_seq_show,
+};
+
+/* Open the namespace-aware seq_file using proto_seq_ops. */
+static int proto_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &proto_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+/* File operations for the "protocols" proc entry. */
+static const struct file_operations proto_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= proto_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+/*
+ * Per-namespace init: create the read-only "protocols" proc entry.
+ * Returns 0 on success, -ENOMEM if the entry cannot be created.
+ */
+static __net_init int proto_init_net(struct net *net)
+{
+	return proc_net_fops_create(net, "protocols", S_IRUGO,
+				    &proto_seq_fops) ? 0 : -ENOMEM;
+}
+
+/* Per-namespace exit: remove the "protocols" proc entry again. */
+static __net_exit void proto_exit_net(struct net *net)
+{
+	proc_net_remove(net, "protocols");
+}
+
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+	.init = proto_init_net,
+	.exit = proto_exit_net,
+};
+
+/* Register the per-namespace proc setup at subsys_initcall time. */
+static int __init proto_init(void)
+{
+	return register_pernet_subsys(&proto_net_ops);
+}
+
+subsys_initcall(proto_init);
+
+#endif /* PROC_FS */
diff --git a/net/core/stream.c b/net/core/stream.c
new file mode 100644
index 00000000..f5df85dc
--- /dev/null
+++ b/net/core/stream.c
@@ -0,0 +1,208 @@
+/*
+ *     SUCS NET3:
+ *
+ *     Generic stream handling routines. These are generic for most
+ *     protocols. Even IP. Tonight 8-).
+ *     This is used because TCP, LLC (others too) layer all have mostly
+ *     identical sendmsg() and recvmsg() code.
+ *     So we (will) share it here.
+ *
+ *     Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *                     (from old tcp.c code)
+ *                     Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/signal.h>
+#include <linux/tcp.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+
+/**
+ * sk_stream_write_space - stream socket write_space callback.
+ * @sk: socket
+ *
+ * Does nothing unless sk_stream_wspace() has reached sk_stream_min_wspace()
+ * and the sock is still attached to a struct socket.  In that case
+ * SOCK_NOSPACE is cleared, sleepers waiting for writability are woken and,
+ * if there are fasync listeners and the send side is not shut down, the
+ * SOCK_WAKE_SPACE / POLL_OUT async notification is sent.
+ */
+void sk_stream_write_space(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	struct socket_wq *wq;
+
+	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_poll(&wq->wait, POLLOUT |
+						POLLWRNORM | POLLWRBAND);
+		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL(sk_stream_write_space);
+
+/**
+ * sk_stream_wait_connect - Wait for a socket to get into the connected state
+ * @sk: sock to wait on
+ * @timeo_p: for how long to wait
+ *
+ * Must be called with the socket locked.
+ *
+ * Returns 0 once the socket has no pending error and is in the ESTABLISHED
+ * or CLOSE_WAIT state.  Otherwise returns the pending socket error, -EPIPE
+ * if the socket is in neither SYN_SENT nor SYN_RECV, -EAGAIN if *@timeo_p
+ * is zero, or sock_intr_errno(*@timeo_p) if a signal is pending.
+ */
+int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
+{
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	int done;
+
+	do {
+		int err = sock_error(sk);
+		if (err)
+			return err;
+		/* Only a connection attempt in progress is worth waiting for. */
+		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
+			return -EPIPE;
+		if (!*timeo_p)
+			return -EAGAIN;
+		if (signal_pending(tsk))
+			return sock_intr_errno(*timeo_p);
+
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		/* sk_write_pending is raised only for the duration of the wait. */
+		sk->sk_write_pending++;
+		done = sk_wait_event(sk, timeo_p,
+				     !sk->sk_err &&
+				     !((1 << sk->sk_state) &
+				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
+		finish_wait(sk_sleep(sk), &wait);
+		sk->sk_write_pending--;
+	} while (!done);
+	return 0;
+}
+EXPORT_SYMBOL(sk_stream_wait_connect);
+
+/**
+ * sk_stream_closing - Return non-zero if we still have things to send in our buffers.
+ * @sk: socket to verify
+ *
+ * "Still sending" is decided purely by the state: the result is non-zero
+ * (the matching TCPF_* bit, not necessarily 1) when the socket is in
+ * FIN_WAIT1, CLOSING or LAST_ACK, and 0 otherwise.
+ */
+static inline int sk_stream_closing(struct sock *sk)
+{
+	return (1 << sk->sk_state) &
+	       (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
+}
+
+/**
+ * sk_stream_wait_close - wait for a closing stream socket to finish sending
+ * @sk: socket being closed
+ * @timeout: how long to wait; 0 means do not wait at all
+ *
+ * Sleeps interruptibly until sk_stream_closing() becomes false, a signal is
+ * pending, or @timeout runs out.
+ */
+void sk_stream_wait_close(struct sock *sk, long timeout)
+{
+	if (timeout) {
+		DEFINE_WAIT(wait);
+
+		do {
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_INTERRUPTIBLE);
+			if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
+				break;
+		} while (!signal_pending(current) && timeout);
+
+		finish_wait(sk_sleep(sk), &wait);
+	}
+}
+EXPORT_SYMBOL(sk_stream_wait_close);
+
+/**
+ * sk_stream_wait_memory - Wait for more memory for a socket
+ * @sk: socket to wait for memory
+ * @timeo_p: for how long
+ *
+ * Returns 0 once sk_stream_memory_free() holds (and the initial back-off,
+ * if any, is over), -EPIPE on a pending socket error or send shutdown,
+ * -EAGAIN if *@timeo_p is zero, or sock_intr_errno(*@timeo_p) if a signal
+ * is pending.  *@timeo_p is updated after each wait.
+ */
+int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+{
+	int err = 0;
+	long vm_wait = 0;
+	long current_timeo = *timeo_p;
+	DEFINE_WAIT(wait);
+
+	/*
+	 * If the socket already has free send memory on entry, wait for a
+	 * short random period (2 .. HZ/5 + 1 jiffies) instead of the caller's
+	 * timeout.
+	 */
+	if (sk_stream_memory_free(sk))
+		current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
+
+	while (1) {
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+			goto do_error;
+		if (!*timeo_p)
+			goto do_nonblock;
+		if (signal_pending(current))
+			goto do_interrupted;
+		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		if (sk_stream_memory_free(sk) && !vm_wait)
+			break;
+
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		sk_wait_event(sk, &current_timeo, sk->sk_err ||
+						  (sk->sk_shutdown & SEND_SHUTDOWN) ||
+						  (sk_stream_memory_free(sk) &&
+						  !vm_wait));
+		sk->sk_write_pending--;
+
+		/*
+		 * After the back-off wait: charge the time spent against the
+		 * caller's timeout (assuming sk_wait_event() leaves the
+		 * unexpired part in current_timeo -- confirm) and continue
+		 * with what is left of it.
+		 */
+		if (vm_wait) {
+			vm_wait -= current_timeo;
+			current_timeo = *timeo_p;
+			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
+			    (current_timeo -= vm_wait) < 0)
+				current_timeo = 0;
+			vm_wait = 0;
+		}
+		*timeo_p = current_timeo;
+	}
+out:
+	finish_wait(sk_sleep(sk), &wait);
+	return err;
+
+do_error:
+	err = -EPIPE;
+	goto out;
+do_nonblock:
+	err = -EAGAIN;
+	goto out;
+do_interrupted:
+	err = sock_intr_errno(*timeo_p);
+	goto out;
+}
+EXPORT_SYMBOL(sk_stream_wait_memory);
+
+/*
+ * Post-process an error from a stream send path.  For -EPIPE a pending
+ * socket error, if any, takes precedence.  If the result is still -EPIPE
+ * and the caller did not pass MSG_NOSIGNAL, SIGPIPE is sent to the current
+ * task.  Returns the (possibly replaced) error code.
+ */
+int sk_stream_error(struct sock *sk, int flags, int err)
+{
+	if (err == -EPIPE) {
+		int pending = sock_error(sk);
+
+		if (pending)
+			err = pending;
+	}
+
+	if (err != -EPIPE || (flags & MSG_NOSIGNAL))
+		return err;
+
+	send_sig(SIGPIPE, current, 0);
+	return err;
+}
+EXPORT_SYMBOL(sk_stream_error);
+
+/*
+ * Final queue teardown for a stream socket: purge the receive and error
+ * queues, reclaim forward-allocated memory, and warn if the write queue is
+ * not empty or any send memory is still accounted to the socket.
+ */
+void sk_stream_kill_queues(struct sock *sk)
+{
+	/* First the read buffer. */
+	__skb_queue_purge(&sk->sk_receive_queue);
+
+	/* Next, the error queue. */
+	__skb_queue_purge(&sk->sk_error_queue);
+
+	/* Next, the write queue. */
+	WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+
+	/* Account for returned memory. */
+	sk_mem_reclaim(sk);
+
+	WARN_ON(sk->sk_wmem_queued);
+	WARN_ON(sk->sk_forward_alloc);
+
+	/* It is _impossible_ for the backlog to contain anything
+	 * when we get here.  All user references to this socket
+	 * have gone away, only the net layer can still touch it.
+	 */
+}
+EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
new file mode 100644
index 00000000..77a65f03
--- /dev/null
+++ b/net/core/sysctl_net_core.c
@@ -0,0 +1,259 @@
+/* -*- linux-c -*-
+ * sysctl_net_core.c: sysctl interface to net core subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/core directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+
+#ifdef CONFIG_RPS
+/* Handler for "rps_sock_flow_entries": read or resize rps_sock_flow_table. */
+static int rps_sock_flow_sysctl(ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	unsigned int orig_size, size;
+	int ret, i;
+	ctl_table tmp = {		/* scratch entry so proc_dointvec() works on 'size' */
+		.data = &size,
+		.maxlen = sizeof(size),
+		.mode = table->mode
+	};
+	struct rps_sock_flow_table *orig_sock_table, *sock_table;
+	static DEFINE_MUTEX(sock_flow_mutex);	/* serialises every call of this handler */
+
+	mutex_lock(&sock_flow_mutex);
+
+	orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+					lockdep_is_held(&sock_flow_mutex));
+	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+	if (write && !ret) {	/* a rejected write must not reset the live table */
+		if (size) {
+			if (size > 1<<30) {
+				/* Enforce limit to prevent overflow */
+				mutex_unlock(&sock_flow_mutex);
+				return -EINVAL;
+			}
+			size = roundup_pow_of_two(size);	/* 'mask = size - 1' needs 2^n */
+			if (size != orig_size) {
+				sock_table =
+				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+				if (!sock_table) {
+					mutex_unlock(&sock_flow_mutex);
+					return -ENOMEM;
+				}
+
+				sock_table->mask = size - 1;
+			} else
+				sock_table = orig_sock_table;	/* same size: reuse, reset below */
+
+			for (i = 0; i < size; i++)
+				sock_table->ents[i] = RPS_NO_CPU;
+		} else
+			sock_table = NULL;	/* size 0: drop the table */
+
+		if (sock_table != orig_sock_table) {
+			rcu_assign_pointer(rps_sock_flow_table, sock_table);
+			synchronize_rcu();	/* old table is freed only after this returns */
+			vfree(orig_sock_table);
+		}
+	}
+
+	mutex_unlock(&sock_flow_mutex);
+	return ret;
+}
+#endif /* CONFIG_RPS */
+
+static struct ctl_table net_core_table[] = {	/* registered read-only-table style in sysctl_core_init() */
+#ifdef CONFIG_NET
+	{
+		.procname	= "wmem_max",
+		.data		= &sysctl_wmem_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "rmem_max",
+		.data		= &sysctl_rmem_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "wmem_default",
+		.data		= &sysctl_wmem_default,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "rmem_default",
+		.data		= &sysctl_rmem_default,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "dev_weight",
+		.data		= &weight_p,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "netdev_max_backlog",
+		.data		= &netdev_max_backlog,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#ifdef CONFIG_BPF_JIT
+	{
+		.procname	= "bpf_jit_enable",
+		.data		= &bpf_jit_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
+	{
+		.procname	= "netdev_tstamp_prequeue",
+		.data		= &netdev_tstamp_prequeue,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "message_cost",
+		.data		= &net_ratelimit_state.interval,	/* interval of the shared net rate limiter */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,	/* the only entry using the jiffies handler */
+	},
+	{
+		.procname	= "message_burst",
+		.data		= &net_ratelimit_state.burst,	/* burst of the same rate limiter */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "optmem_max",
+		.data		= &sysctl_optmem_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#ifdef CONFIG_RPS
+	{
+		.procname	= "rps_sock_flow_entries",	/* no .data: the handler uses its own temporary */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= rps_sock_flow_sysctl
+	},
+#endif
+#endif /* CONFIG_NET */
+	{
+		.procname	= "netdev_budget",
+		.data		= &netdev_budget,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "warnings",
+		.data		= &net_msg_warn,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }	/* empty terminating entry */
+};
+
+static struct ctl_table netns_core_table[] = {	/* used as-is for init_net, duplicated for other namespaces */
+	{
+		.procname	= "somaxconn",
+		.data		= &init_net.core.sysctl_somaxconn,	/* copies get tbl[0].data re-pointed at their own net */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }	/* empty terminating entry */
+};
+
+__net_initdata struct ctl_path net_core_path[] = {	/* path components "net" then "core" */
+	{ .procname = "net", },
+	{ .procname = "core", },
+	{ },	/* empty terminating entry */
+};
+
+/*
+ * Per-namespace setup: default somaxconn, then register the net/core table.
+ * init_net registers the static template directly; every other namespace
+ * registers a private copy whose .data points at its own somaxconn.
+ */
+static __net_init int sysctl_core_net_init(struct net *net)
+{
+	struct ctl_table *tbl = netns_core_table;
+
+	net->core.sysctl_somaxconn = SOMAXCONN;
+
+	if (!net_eq(net, &init_net)) {
+		tbl = kmemdup(netns_core_table, sizeof(netns_core_table),
+			      GFP_KERNEL);
+		if (!tbl)
+			return -ENOMEM;
+		tbl[0].data = &net->core.sysctl_somaxconn;
+	}
+
+	net->core.sysctl_hdr = register_net_sysctl_table(net, net_core_path,
+							 tbl);
+	if (net->core.sysctl_hdr)
+		return 0;
+
+	if (tbl != netns_core_table)	/* only the duplicate is ours to free */
+		kfree(tbl);
+	return -ENOMEM;
+}
+
+static __net_exit void sysctl_core_net_exit(struct net *net)
+{
+	struct ctl_table_header *hdr = net->core.sysctl_hdr;
+	struct ctl_table *dup = hdr->ctl_table_arg;	/* fetched before hdr is unregistered */
+
+	unregister_net_sysctl_table(hdr);
+	BUG_ON(dup == netns_core_table);	/* the static template must never reach kfree() */
+	kfree(dup);
+}
+
+static __net_initdata struct pernet_operations sysctl_core_ops = {	/* passed to register_pernet_subsys() */
+	.init = sysctl_core_net_init,
+	.exit = sysctl_core_net_exit,
+};
+
+static __init int sysctl_core_init(void)
+{
+	static struct ctl_table empty[1];	/* zero-filled, i.e. a table with no entries */
+
+	register_sysctl_paths(net_core_path, empty);	/* net/core path with nothing in it; result is not checked */
+	register_net_sysctl_rotable(net_core_path, net_core_table);	/* result is not checked either */
+	return register_pernet_subsys(&sysctl_core_ops);	/* only this result reaches the caller */
+}
+
+fs_initcall(sysctl_core_init);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 00000000..97d036a6
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,136 @@
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+
+static struct sock_filter ptp_filter[] = {	/* run by classify(), checked in skb_timestamping_init() */
+	PTP_FILTER	/* macro not defined in this file; presumably from <linux/ptp_classify.h> */
+};
+
+static unsigned int classify(const struct sk_buff *skb)
+{
+	const struct net_device *ndev = skb->dev;
+
+	/* No device, no PHY or no PHY driver: report PTP_CLASS_NONE unfiltered. */
+	if (unlikely(!ndev || !ndev->phydev || !ndev->phydev->drv))
+		return PTP_CLASS_NONE;
+	return sk_run_filter(skb, ptp_filter);
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+	struct phy_device *phydev;
+	struct sk_buff *clone;
+	struct sock *sk = skb->sk;
+	unsigned int type;
+
+	if (!sk)	/* no socket to attach a clone to */
+		return;
+
+	type = classify(skb);
+
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+	case PTP_CLASS_V1_IPV6:
+	case PTP_CLASS_V2_IPV4:
+	case PTP_CLASS_V2_IPV6:
+	case PTP_CLASS_V2_L2:
+	case PTP_CLASS_V2_VLAN:
+		phydev = skb->dev->phydev;	/* classify() checked dev, phydev and drv before filtering */
+		if (likely(phydev->drv->txtstamp)) {
+			if (!atomic_inc_not_zero(&sk->sk_refcnt))	/* take a socket ref unless it is already 0 */
+				return;
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (!clone) {
+				sock_put(sk);	/* undo the ref taken above */
+				return;
+			}
+			clone->sk = sk;	/* the ref goes with the clone; skb_complete_tx_timestamp() drops it */
+			phydev->drv->txtstamp(phydev, clone, type);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+			       struct skb_shared_hwtstamps *hwtstamps)
+{
+	struct sock *sk = skb->sk;
+	struct sock_exterr_skb *serr;
+	int err;
+
+	if (!hwtstamps) {	/* no timestamp supplied: drop the socket ref and the skb */
+		sock_put(sk);
+		kfree_skb(skb);
+		return;
+	}
+
+	*skb_hwtstamps(skb) = *hwtstamps;	/* copy the timestamps into the skb */
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = ENOMSG;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+	skb->sk = NULL;	/* detach; the socket is still reachable through the local 'sk' */
+	err = sock_queue_err_skb(sk, skb);
+	sock_put(sk);	/* drop the ref that came in via skb->sk */
+	if (err)
+		kfree_skb(skb);	/* queueing failed, so free the skb here */
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+	struct phy_device *phydev;
+	unsigned int type;
+
+	if (skb_headroom(skb) < ETH_HLEN)	/* not enough headroom for the push below */
+		return false;
+	__skb_push(skb, ETH_HLEN);	/* classify with ETH_HLEN more bytes in front of the data */
+
+	type = classify(skb);
+
+	__skb_pull(skb, ETH_HLEN);	/* undo the push */
+
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+	case PTP_CLASS_V1_IPV6:
+	case PTP_CLASS_V2_IPV4:
+	case PTP_CLASS_V2_IPV6:
+	case PTP_CLASS_V2_L2:
+	case PTP_CLASS_V2_VLAN:
+		phydev = skb->dev->phydev;	/* classify() checked dev, phydev and drv before filtering */
+		if (likely(phydev->drv->rxtstamp))
+			return phydev->drv->rxtstamp(phydev, skb, type);	/* the driver's answer is returned as-is */
+		break;
+	default:
+		break;
+	}
+
+	return false;	/* not a PTP class handled above, or no rxtstamp hook */
+}
+
+void __init skb_timestamping_init(void)
+{
+	BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));	/* BUG if the check returns non-zero */
+}
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
new file mode 100644
index 00000000..25d717eb
--- /dev/null
+++ b/net/core/user_dma.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/socket.h>
+#include <net/tcp.h>
+#include <net/netdma.h>
+
+#define NET_DMA_DEFAULT_COPYBREAK 4096	/* initial value of sysctl_tcp_dma_copybreak */
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+EXPORT_SYMBOL(sysctl_tcp_dma_copybreak);
+
+/**
+ *	dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
+ *	@chan: DMA channel passed on to the dma_memcpy_*_to_iovec() helpers
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: io vector to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ *	@pinned_list: locked iovec buffer data
+ *	Note: the iovec is modified during the copy.
+ */
+int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
+			struct sk_buff *skb, int offset, struct iovec *to,
+			size_t len, struct dma_pinned_list *pinned_list)
+{
+	int start = skb_headlen(skb);	/* running end offset of the data covered so far */
+	int i, copy = start - offset;	/* bytes of the linear part at/after offset */
+	struct sk_buff *frag_iter;
+	dma_cookie_t cookie = 0;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		cookie = dma_memcpy_to_iovec(chan, to, pinned_list,
+					    skb->data + offset, copy);
+		if (cookie < 0)
+			goto fault;
+		len -= copy;
+		if (len == 0)
+			goto end;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		copy = end - offset;	/* > 0 only if this fragment reaches past offset */
+		if (copy > 0) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+
+			cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page,
+					frag->page_offset + offset - start, copy);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		copy = end - offset;
+		if (copy > 0) {
+			if (copy > len)
+				copy = len;
+			cookie = dma_skb_copy_datagram_iovec(chan, frag_iter,	/* recurse into the chained skb */
+							     offset - start,
+							     to, copy,
+							     pinned_list);
+			if (cookie < 0)
+				goto fault;
+			len -= copy;
+			if (len == 0)
+				goto end;
+			offset += copy;
+		}
+		start = end;
+	}
+
+end:
+	if (!len) {
+		skb->dma_cookie = cookie;	/* last cookie obtained is stored on the skb ... */
+		return cookie;			/* ... and returned on success */
+	}
+
+fault:
+	return -EFAULT;	/* a helper returned < 0, or the data ran out before len bytes were copied */
+}
diff --git a/net/core/utils.c b/net/core/utils.c
new file mode 100644
index 00000000..386e263f
--- /dev/null
+++ b/net/core/utils.c
@@ -0,0 +1,323 @@
+/*
+ *	Generic address resolution entity
+ *
+ *	Authors:
+ *	net_random Alan Cox
+ *	net_ratelimit Andi Kleen
+ *	in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
+ *
+ *	Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/inet.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/ratelimit.h>
+
+#include <net/sock.h>
+#include <net/net_ratelimit.h>
+
+#include <asm/byteorder.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+int net_msg_warn __read_mostly = 1;	/* exposed as the "warnings" sysctl in sysctl_net_core.c */
+EXPORT_SYMBOL(net_msg_warn);
+
+DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);	/* presumably interval 5*HZ, burst 10 -- confirm macro argument order */
+/*
+ * All net warning printk()s should be guarded by this function.
+ */
+int net_ratelimit(void)
+{
+	return __ratelimit(&net_ratelimit_state);	/* result of __ratelimit() on the shared state, passed through */
+}
+EXPORT_SYMBOL(net_ratelimit);
+
+/*
+ * Convert an ASCII dotted-decimal string to a binary IPv4 address in
+ * network byte order.
+ *
+ * Up to four fields are read, each terminated by '.', a newline or the
+ * end of the string.  No validation is performed: characters are not
+ * checked to be digits and field values are not range-checked, so the
+ * result is only meaningful for well-formed input.
+ *
+ * This is outside of net/ipv4/ because various code that uses IP addresses
+ * is otherwise not dependent on the TCP/IP stack.
+ */
+
+__be32 in_aton(const char *str)
+{
+	unsigned long addr = 0;
+	unsigned int octet;
+	int field;
+
+	for (field = 0; field < 4; field++) {
+		addr <<= 8;
+		if (*str == '\0')
+			continue;	/* input exhausted: this field stays 0 */
+		octet = 0;
+		while (*str != '\0' && *str != '.' && *str != '\n')
+			octet = octet * 10 + (*str++ - '0');
+		addr |= octet;
+		if (*str != '\0')
+			str++;		/* step over the '.' or newline */
+	}
+	return htonl(addr);
+}
+EXPORT_SYMBOL(in_aton);
+
+#define IN6PTON_XDIGIT		0x00010000	/* hex digit */
+#define IN6PTON_DIGIT		0x00020000	/* decimal digit (set in addition to XDIGIT) */
+#define IN6PTON_COLON_MASK	0x00700000	/* all three colon bits below */
+#define IN6PTON_COLON_1		0x00100000	/* single : requested */
+#define IN6PTON_COLON_2		0x00200000	/* second : requested */
+#define IN6PTON_COLON_1_2	0x00400000	/* :: requested */
+#define IN6PTON_DOT		0x00800000	/* . */
+#define IN6PTON_DELIM		0x10000000	/* caller's delimiter or end of string */
+#define IN6PTON_NULL		0x20000000	/* first/tail */
+#define IN6PTON_UNKNOWN		0x40000000	/* none of the above */
+
+static inline int xdigit2bin(char c, int delim)	/* classify one char into IN6PTON_* bits */
+{
+	int val;
+
+	if (c == delim || c == '\0')
+		return IN6PTON_DELIM;
+	if (c == ':')
+		return IN6PTON_COLON_MASK;	/* matches whichever colon bit the caller's state allows */
+	if (c == '.')
+		return IN6PTON_DOT;
+
+	val = hex_to_bin(c);
+	if (val >= 0)	/* digit value in the low bits, class flags above them */
+		return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
+	if (delim == -1)	/* delim -1: any other character ends the address */
+		return IN6PTON_DELIM;
+	return IN6PTON_UNKNOWN;
+}
+
+int in4_pton(const char *src, int srclen,	/* srclen < 0 means "use strlen(src)" */
+	     u8 *dst,
+	     int delim, const char **end)
+{
+	const char *s;
+	u8 *d;
+	u8 dbuf[4];	/* octets are staged here; dst is written only on success */
+	int ret = 0;
+	int i;		/* number of completed octets */
+	int w = 0;	/* current octet in the low bits, plus xdigit2bin() flag bits */
+
+	if (srclen < 0)
+		srclen = strlen(src);
+	s = src;
+	d = dbuf;
+	i = 0;
+	while(1) {
+		int c;
+		c = xdigit2bin(srclen > 0 ? *s : '\0', delim);	/* past srclen counts as end of string */
+		if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
+			goto out;
+		}
+		if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+			if (w == 0)	/* no digit seen: even a '0' digit leaves flag bits set in w */
+				goto out;
+			*d++ = w & 0xff;	/* low byte is the octet value */
+			w = 0;
+			i++;
+			if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+				if (i != 4)	/* a terminator is only valid after the fourth octet */
+					goto out;
+				break;
+			}
+			goto cont;
+		}
+		w = (w * 10) + c;
+		if ((w & 0xffff) > 255) {	/* mask off the flag bits before the range check */
+			goto out;
+		}
+cont:
+		if (i >= 4)	/* a '.' after the fourth octet is an error */
+			goto out;
+		s++;
+		srclen--;
+	}
+	ret = 1;
+	memcpy(dst, dbuf, sizeof(dbuf));
+out:
+	if (end)
+		*end = s;	/* position where parsing stopped, on success or failure */
+	return ret;	/* 1 on success, 0 on failure */
+}
+EXPORT_SYMBOL(in4_pton);
+
+int in6_pton(const char *src, int srclen,	/* srclen < 0 means "use strlen(src)" */
+	     u8 *dst,
+	     int delim, const char **end)
+{
+	const char *s, *tok = NULL;	/* tok: first char after the most recent ':' */
+	u8 *d, *dc = NULL;		/* dc: position in dbuf where "::" was seen, if any */
+	u8 dbuf[16];
+	int ret = 0;
+	int i;
+	int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;	/* token classes accepted next */
+	int w = 0;	/* 16-bit group being accumulated */
+
+	memset(dbuf, 0, sizeof(dbuf));
+
+	s = src;
+	d = dbuf;
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	while (1) {
+		int c;
+
+		c = xdigit2bin(srclen > 0 ? *s : '\0', delim);	/* past srclen counts as end of string */
+		if (!(c & state))	/* this token class is not acceptable here */
+			goto out;
+		if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+			/* process one 16-bit word */
+			if (!(state & IN6PTON_NULL)) {
+				*d++ = (w >> 8) & 0xff;
+				*d++ = w & 0xff;
+			}
+			w = 0;
+			if (c & IN6PTON_DELIM) {
+				/* We've processed last word */
+				break;
+			}
+			/*
+			 * COLON_1 => XDIGIT
+			 * COLON_2 => XDIGIT|DELIM
+			 * COLON_1_2 => COLON_2
+			 */
+			switch (state & IN6PTON_COLON_MASK) {
+			case IN6PTON_COLON_2:
+				dc = d;	/* remember where the "::" gap sits */
+				state = IN6PTON_XDIGIT | IN6PTON_DELIM;
+				if (dc - dbuf >= sizeof(dbuf))
+					state |= IN6PTON_NULL;
+				break;
+			case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
+				state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
+				break;
+			case IN6PTON_COLON_1:
+				state = IN6PTON_XDIGIT;
+				break;
+			case IN6PTON_COLON_1_2:
+				state = IN6PTON_COLON_2;
+				break;
+			default:
+				state = 0;
+			}
+			tok = s + 1;
+			goto cont;
+		}
+
+		if (c & IN6PTON_DOT) {	/* re-parse the current group as a dotted IPv4 tail */
+			ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
+			if (ret > 0) {
+				d += 4;
+				break;
+			}
+			goto out;
+		}
+
+		w = (w << 4) | (0xff & c);
+		state = IN6PTON_COLON_1 | IN6PTON_DELIM;
+		if (!(w & 0xf000)) {	/* fewer than four hex digits so far */
+			state |= IN6PTON_XDIGIT;
+		}
+		if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
+			state |= IN6PTON_COLON_1_2;
+			state &= ~IN6PTON_DELIM;
+		}
+		if (d + 2 >= dbuf + sizeof(dbuf)) {
+			state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
+		}
+cont:
+		if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
+		    d + 4 == dbuf + sizeof(dbuf)) {
+			state |= IN6PTON_DOT;	/* '.' accepted only while 4 bytes still fit */
+		}
+		if (d >= dbuf + sizeof(dbuf)) {
+			state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
+		}
+		s++;
+		srclen--;
+	}
+
+	i = 15; d--;	/* dst is filled from its last byte backwards */
+
+	if (dc) {
+		while(d >= dc)	/* bytes parsed after the "::" */
+			dst[i--] = *d--;
+		while(i >= dc - dbuf)	/* zero fill for the "::" gap */
+			dst[i--] = 0;
+		while(i >= 0)	/* bytes parsed before the "::" */
+			dst[i--] = *d--;
+	} else
+		memcpy(dst, dbuf, sizeof(dbuf));
+
+	ret = 1;
+out:
+	if (end)
+		*end = s;	/* position where parsing stopped, on success or failure */
+	return ret;	/* 1 on success, 0 on failure */
+}
+EXPORT_SYMBOL(in6_pton);
+
+void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
+			      __be32 from, __be32 to, int pseudohdr)
+{
+	__be32 diff[] = { ~from, to };	/* folded into the sums below: complement of old value, then new value */
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		*sum = csum_fold(csum_partial(diff, sizeof(diff),
+				~csum_unfold(*sum)));
+		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+			skb->csum = ~csum_partial(diff, sizeof(diff),	/* adjust skb->csum by the same diff */
+						~skb->csum);
+	} else if (pseudohdr)	/* CHECKSUM_PARTIAL: *sum is touched only when pseudohdr is set */
+		*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
+				csum_unfold(*sum)));
+}
+EXPORT_SYMBOL(inet_proto_csum_replace4);
+
+int mac_pton(const char *s, u8 *mac)
+{
+	static const char xdigits[] = "0123456789abcdefABCDEF";
+	int n;
+
+	/* Need at least the 17 characters of "XX:XX:XX:XX:XX:XX". */
+	if (strlen(s) < 3 * ETH_ALEN - 1)
+		return 0;
+
+	/* Validate everything first so @mac is untouched on malformed input. */
+	for (n = 0; n < ETH_ALEN; n++) {
+		const char *p = s + n * 3;
+
+		if (!strchr(xdigits, p[0]) || !strchr(xdigits, p[1]))
+			return 0;
+		if (n != ETH_ALEN - 1 && p[2] != ':')
+			return 0;
+	}
+	for (n = 0; n < ETH_ALEN; n++)
+		mac[n] = (hex_to_bin(s[n * 3]) << 4) | hex_to_bin(s[n * 3 + 1]);
+	return 1;
+}
+EXPORT_SYMBOL(mac_pton);
diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig
new file mode 100644
index 00000000..4066d59c
--- /dev/null
+++ b/net/dcb/Kconfig
@@ -0,0 +1,22 @@
+config DCB
+	bool "Data Center Bridging support"
+	default n
+	---help---
+	  This enables support for configuring Data Center Bridging (DCB)
+	  features on DCB capable Ethernet adapters via rtnetlink.  Say 'Y'
+	  if you have a DCB capable Ethernet adapter which supports this
+	  interface and you are connected to a DCB capable switch.
+
+	  DCB is a collection of Ethernet enhancements which allow DCB capable
+	  NICs and switches to support network traffic with differing
+	  requirements (highly reliable, no drops vs. best effort vs. low
+	  latency) to co-exist on Ethernet.
+
+	  DCB features include:
+	    Enhanced Transmission Selection (aka Priority Grouping) - provides a
+	      framework for assigning bandwidth guarantees to traffic classes.
+	    Priority-based Flow Control (PFC) - a MAC control pause frame which
+	      works at the granularity of the 802.1p priority instead of the
+	      link (802.3x).
+
+	  If unsure, say N.
diff --git a/net/dcb/Makefile b/net/dcb/Makefile
new file mode 100644
index 00000000..c1282c9e
--- /dev/null
+++ b/net/dcb/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DCB) += dcbnl.o dcbevent.o
diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c
new file mode 100644
index 00000000..665a8802
--- /dev/null
+++ b/net/dcb/dcbevent.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+
+static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain);	/* the single chain all three helpers below operate on */
+
+int register_dcbevent_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&dcbevent_notif_chain, nb);	/* result passed through unchanged */
+}
+EXPORT_SYMBOL(register_dcbevent_notifier);
+
+int unregister_dcbevent_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&dcbevent_notif_chain, nb);	/* result passed through unchanged */
+}
+EXPORT_SYMBOL(unregister_dcbevent_notifier);
+
+int call_dcbevent_notifiers(unsigned long val, void *v)
+{
+	return atomic_notifier_call_chain(&dcbevent_notif_chain, val, v);	/* val and v are handed to the chain as given */
+}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
new file mode 100644
index 00000000..3609eaca
--- /dev/null
+++ b/net/dcb/dcbnl.c
@@ -0,0 +1,1835 @@
+/*
+ * Copyright (c) 2008-2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Lucy Liu <lucy.liu@intel.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include <linux/dcbnl.h>
+#include <net/dcbevent.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+
+/**
+ * Data Center Bridging (DCB) is a collection of Ethernet enhancements
+ * intended to allow network traffic with differing requirements
+ * (highly reliable, no drops vs. best effort vs. low latency) to operate
+ * and co-exist on Ethernet.  Current DCB features are:
+ *
+ * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a
+ *   framework for assigning bandwidth guarantees to traffic classes.
+ *
+ * Priority-based Flow Control (PFC) - provides a flow control mechanism which
+ *   can work independently for each 802.1p priority.
+ *
+ * Congestion Notification - provides a mechanism for end-to-end congestion
+ *   control for protocols which do not have built-in congestion management.
+ *
+ * More information about the emerging standards for these Ethernet features
+ * can be found at: http://www.ieee802.org/1/pages/dcbridges.html
+ *
+ * This file implements an rtnetlink interface to allow configuration of DCB
+ * features for capable devices.
+ */
+
+/* Module metadata. */
+MODULE_AUTHOR("Lucy Liu, <lucy.liu@intel.com>");
+MODULE_DESCRIPTION("Data Center Bridging netlink interface");
+MODULE_LICENSE("GPL");
+
+/**************** DCB attribute policies *************************************/
+
+/*
+ * Validation policy for the top-level DCB netlink attributes (DCB_ATTR_*).
+ *
+ * NOTE(review): there is no entry for DCB_ATTR_NUMTCS although
+ * dcbnl_getnumtcs()/dcbnl_setnumtcs() parse that attribute as a nest -
+ * confirm whether the omission is intentional.
+ */
+static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
+	[DCB_ATTR_IFNAME]      = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
+	[DCB_ATTR_STATE]       = {.type = NLA_U8},
+	[DCB_ATTR_PFC_CFG]     = {.type = NLA_NESTED},
+	[DCB_ATTR_PG_CFG]      = {.type = NLA_NESTED},
+	[DCB_ATTR_SET_ALL]     = {.type = NLA_U8},
+	[DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
+	[DCB_ATTR_CAP]         = {.type = NLA_NESTED},
+	[DCB_ATTR_PFC_STATE]   = {.type = NLA_U8},
+	[DCB_ATTR_BCN]         = {.type = NLA_NESTED},
+	[DCB_ATTR_APP]         = {.type = NLA_NESTED},
+	[DCB_ATTR_IEEE]	       = {.type = NLA_NESTED},
+	[DCB_ATTR_DCBX]        = {.type = NLA_U8},
+	[DCB_ATTR_FEATCFG]     = {.type = NLA_NESTED},
+};
+
+/*
+ * DCB priority flow control to User Priority nested attributes:
+ * one u8 per user priority 0..7 plus an "all priorities" flag.
+ */
+static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = {
+	[DCB_PFC_UP_ATTR_0]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_1]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_2]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_3]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_4]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_5]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_6]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_7]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG},
+};
+
+/*
+ * DCB priority grouping nested attributes: a nest per traffic class
+ * (TC_0..TC_7, TC_ALL) and a u8 per bandwidth group (BW_ID_0..BW_ID_7),
+ * with BW_ID_ALL as a flag.
+ */
+static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = {
+	[DCB_PG_ATTR_TC_0]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_1]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_2]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_3]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_4]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_5]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_6]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_7]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_ALL]    = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_BW_ID_0]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_1]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_2]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_3]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_4]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_5]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_6]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_7]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG},
+};
+
+/*
+ * DCB traffic class nested attributes: the per-TC parameters found inside
+ * each DCB_PG_ATTR_TC_* nest (each a u8), plus an "all parameters" flag.
+ */
+static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = {
+	[DCB_TC_ATTR_PARAM_PGID]            = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_UP_MAPPING]      = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_STRICT_PRIO]     = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_BW_PCT]          = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_ALL]             = {.type = NLA_FLAG},
+};
+
+/*
+ * DCB capabilities nested attributes: one u8 per capability plus an
+ * "all capabilities" flag.
+ */
+static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = {
+	[DCB_CAP_ATTR_ALL]     = {.type = NLA_FLAG},
+	[DCB_CAP_ATTR_PG]      = {.type = NLA_U8},
+	[DCB_CAP_ATTR_PFC]     = {.type = NLA_U8},
+	[DCB_CAP_ATTR_UP2TC]   = {.type = NLA_U8},
+	[DCB_CAP_ATTR_PG_TCS]  = {.type = NLA_U8},
+	[DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8},
+	[DCB_CAP_ATTR_GSP]     = {.type = NLA_U8},
+	[DCB_CAP_ATTR_BCN]     = {.type = NLA_U8},
+	[DCB_CAP_ATTR_DCBX]    = {.type = NLA_U8},
+};
+
+/* DCB number of traffic classes nested attributes. */
+static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = {
+	[DCB_NUMTCS_ATTR_ALL]     = {.type = NLA_FLAG},
+	[DCB_NUMTCS_ATTR_PG]      = {.type = NLA_U8},
+	[DCB_NUMTCS_ATTR_PFC]     = {.type = NLA_U8},
+};
+
+/*
+ * DCB BCN nested attributes: a u8 per RP_0..RP_7 entry, u32 values for
+ * the remaining BCN parameters, and two "all" flags (RP_ALL and ALL).
+ */
+static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = {
+	[DCB_BCN_ATTR_RP_0]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_1]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_2]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_3]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_4]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_5]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_6]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_7]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_ALL]       = {.type = NLA_FLAG},
+	[DCB_BCN_ATTR_BCNA_0]       = {.type = NLA_U32},
+	[DCB_BCN_ATTR_BCNA_1]       = {.type = NLA_U32},
+	[DCB_BCN_ATTR_ALPHA]        = {.type = NLA_U32},
+	[DCB_BCN_ATTR_BETA]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_GD]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_GI]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_TMAX]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_TD]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RMIN]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_W]            = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RD]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RU]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_WRTT]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RI]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_C]            = {.type = NLA_U32},
+	[DCB_BCN_ATTR_ALL]          = {.type = NLA_FLAG},
+};
+
+/*
+ * DCB APP nested attributes: id type (u8), id (u16) and the
+ * priority (u8) associated with that application id.
+ */
+static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = {
+	[DCB_APP_ATTR_IDTYPE]       = {.type = NLA_U8},
+	[DCB_APP_ATTR_ID]           = {.type = NLA_U16},
+	[DCB_APP_ATTR_PRIORITY]     = {.type = NLA_U8},
+};
+
+/*
+ * IEEE 802.1Qaz nested attributes.  ETS and PFC carry a whole
+ * struct ieee_ets / struct ieee_pfc as payload (.len is set to the
+ * structure size); the application table is a nest of its own.
+ */
+static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
+	[DCB_ATTR_IEEE_ETS]	    = {.len = sizeof(struct ieee_ets)},
+	[DCB_ATTR_IEEE_PFC]	    = {.len = sizeof(struct ieee_pfc)},
+	[DCB_ATTR_IEEE_APP_TABLE]   = {.type = NLA_NESTED},
+};
+
+/*
+ * Entries of the IEEE application table: each DCB_ATTR_IEEE_APP carries
+ * a struct dcb_app as payload (.len is set to the structure size).
+ */
+static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = {
+	[DCB_ATTR_IEEE_APP]	    = {.len = sizeof(struct dcb_app)},
+};
+
+/* DCB feature configuration (DCB_ATTR_FEATCFG) nested attributes. */
+static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = {
+	[DCB_FEATCFG_ATTR_ALL]      = {.type = NLA_FLAG},
+	[DCB_FEATCFG_ATTR_PG]       = {.type = NLA_U8},
+	[DCB_FEATCFG_ATTR_PFC]      = {.type = NLA_U8},
+	[DCB_FEATCFG_ATTR_APP]      = {.type = NLA_U8},
+};
+
+/*
+ * NOTE(review): neither object is referenced in this part of the file.
+ * Presumably dcb_app_list is the application priority table behind
+ * dcb_getapp()/dcb_setapp() and dcb_lock guards it - confirm against
+ * their definitions.
+ */
+static LIST_HEAD(dcb_app_list);
+static DEFINE_SPINLOCK(dcb_lock);
+
+/*
+ * Standard netlink reply: build a message of type @event with a dcbmsg
+ * header for @cmd and a single u8 attribute @attr = @value, then unicast
+ * it to @pid.
+ *
+ * Returns 0 on success, -EINVAL if the skb cannot be allocated or sent,
+ * or the nla_put_u8() error if the attribute does not fit.
+ */
+static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
+		       u32 seq, u16 flags)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *hdr;
+	struct dcbmsg *msg;
+	int rc = -EINVAL;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb == NULL)
+		return rc;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	hdr = NLMSG_NEW(skb, pid, seq, event, sizeof(*msg), flags);
+
+	msg = NLMSG_DATA(hdr);
+	msg->dcb_family = AF_UNSPEC;
+	msg->cmd = cmd;
+	msg->dcb_pad = 0;
+
+	rc = nla_put_u8(skb, attr, value);
+	if (rc != 0)
+		goto nlmsg_failure;
+
+	/* finalise nlmsg_len before handing the message over */
+	nlmsg_end(skb, hdr);
+
+	if (rtnl_unicast(skb, &init_net, pid) != 0)
+		return -EINVAL;
+
+	return 0;
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return rc;
+}
+
+/*
+ * DCB_CMD_GSTATE handler: query the driver's getstate op and send the
+ * result back as a DCB_ATTR_STATE byte.
+ *
+ * Returns -EINVAL when the driver has no getstate op, otherwise the
+ * result of dcbnl_reply().
+ */
+static int dcbnl_getstate(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	u8 state;
+
+	/* the request itself is not required to carry DCB_ATTR_STATE */
+	if (netdev->dcbnl_ops->getstate == NULL)
+		return -EINVAL;
+
+	state = netdev->dcbnl_ops->getstate(netdev);
+
+	return dcbnl_reply(state, RTM_GETDCB, DCB_CMD_GSTATE, DCB_ATTR_STATE,
+			   pid, seq, flags);
+}
+
+/*
+ * DCB_CMD_PFC_GCFG handler: report the PFC setting of user priorities.
+ *
+ * The request carries a DCB_ATTR_PFC_CFG nest naming the priorities of
+ * interest (DCB_PFC_UP_ATTR_0..7), or DCB_PFC_UP_ATTR_ALL for all of
+ * them.  The reply mirrors that nest with one u8 per reported priority,
+ * filled in by the driver's getpfccfg op.
+ *
+ * Returns 0 on success and -EINVAL on any failure.
+ */
+static int dcbnl_getpfccfg(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret = -EINVAL;
+	int i;
+	int getall = 0;
+
+	if (!tb[DCB_ATTR_PFC_CFG] || !netdev->dcbnl_ops->getpfccfg)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
+	                       tb[DCB_ATTR_PFC_CFG],
+	                       dcbnl_pfc_up_nest);
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_PFC_GCFG;
+	/* clear the padding, as dcbnl_reply() does, so that no stale
+	 * buffer contents are copied to user space
+	 */
+	dcb->dcb_pad = 0;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_PFC_CFG);
+	if (!nest)
+		goto err;
+
+	if (data[DCB_PFC_UP_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0,
+		                             &value);
+		ret = nla_put_u8(dcbnl_skb, i, value);
+
+		if (ret) {
+			nla_nest_cancel(dcbnl_skb, nest);
+			goto err;
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/* the skb is not freed here when the send fails */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err_out;
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree_skb(dcbnl_skb);
+err_out:
+	return -EINVAL;
+}
+
+/*
+ * DCB_CMD_GPERM_HWADDR handler: fetch the permanent hardware address
+ * from the driver's getpermhwaddr op and return it as a
+ * DCB_ATTR_PERM_HWADDR attribute of MAX_ADDR_LEN bytes.
+ *
+ * Returns 0 on success and -EINVAL on any failure.
+ */
+static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlattr **tb,
+                                u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	/*
+	 * Zeroed up front: the whole buffer is copied to user space, so
+	 * any bytes the driver leaves untouched must not hold stale stack
+	 * contents.
+	 */
+	u8 perm_addr[MAX_ADDR_LEN] = { 0 };
+	int ret = -EINVAL;
+
+	if (!netdev->dcbnl_ops->getpermhwaddr)
+		return ret;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GPERM_HWADDR;
+	/* clear the padding, as dcbnl_reply() does */
+	dcb->dcb_pad = 0;
+
+	netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr);
+
+	/* do not send a reply that silently lacks the address */
+	ret = nla_put(dcbnl_skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr),
+	              perm_addr);
+	if (ret)
+		goto nlmsg_failure;
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/* the skb is not freed here when the send fails */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err_out;
+
+	return 0;
+
+nlmsg_failure:
+	kfree_skb(dcbnl_skb);
+err_out:
+	return -EINVAL;
+}
+
+/*
+ * DCB_CMD_GCAP handler: report device DCB capabilities.
+ *
+ * The request carries a DCB_ATTR_CAP nest naming the capabilities of
+ * interest, or DCB_CAP_ATTR_ALL for every one of them.  Each is queried
+ * through the driver's getcap op; capabilities for which the op returns
+ * non-zero are left out of the reply.
+ *
+ * Returns 0 on success and -EINVAL on any failure.
+ */
+static int dcbnl_getcap(struct net_device *netdev, struct nlattr **tb,
+                        u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret = -EINVAL;
+	int i;
+	int getall = 0;
+
+	if (!tb[DCB_ATTR_CAP] || !netdev->dcbnl_ops->getcap)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP],
+	                       dcbnl_cap_nest);
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GCAP;
+	/* clear the padding, as dcbnl_reply() does, so that no stale
+	 * buffer contents are copied to user space
+	 */
+	dcb->dcb_pad = 0;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_CAP);
+	if (!nest)
+		goto err;
+
+	if (data[DCB_CAP_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) {
+			ret = nla_put_u8(dcbnl_skb, i, value);
+
+			if (ret) {
+				nla_nest_cancel(dcbnl_skb, nest);
+				goto err;
+			}
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/* the skb is not freed here when the send fails */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err_out;
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree_skb(dcbnl_skb);
+err_out:
+	return -EINVAL;
+}
+
+/*
+ * DCB_CMD_GNUMTCS handler: report the number of traffic classes.
+ *
+ * The request carries a DCB_ATTR_NUMTCS nest naming the features of
+ * interest, or DCB_NUMTCS_ATTR_ALL for every one.  Each is queried
+ * through the driver's getnumtcs op; a failing op aborts the whole
+ * request and its return value is handed back to the caller.
+ *
+ * Returns 0 on success, -EINVAL on a netlink failure, or the non-zero
+ * value returned by the driver op.
+ */
+static int dcbnl_getnumtcs(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret = -EINVAL;
+	int i;
+	int getall = 0;
+
+	if (!tb[DCB_ATTR_NUMTCS] || !netdev->dcbnl_ops->getnumtcs)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
+	                       dcbnl_numtcs_nest);
+	if (ret) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	/*
+	 * NLMSG_NEW jumps to nlmsg_failure when the header does not fit.
+	 * NOTE(review): ret is 0 at this point, so that path would return
+	 * 0 - confirm whether it can trigger with NLMSG_DEFAULT_SIZE.
+	 */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GNUMTCS;
+	/* clear the padding, as dcbnl_reply() does, so that no stale
+	 * buffer contents are copied to user space
+	 */
+	dcb->dcb_pad = 0;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_NUMTCS);
+	if (!nest) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (data[DCB_NUMTCS_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value);
+		if (!ret) {
+			ret = nla_put_u8(dcbnl_skb, i, value);
+
+			if (ret) {
+				nla_nest_cancel(dcbnl_skb, nest);
+				ret = -EINVAL;
+				goto err;
+			}
+		} else {
+			/* driver error: propagate its value unchanged */
+			goto err;
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/* the skb is not freed here when the send fails */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree_skb(dcbnl_skb);
+err_out:
+	return ret;
+}
+
+/*
+ * DCB_CMD_SNUMTCS handler: hand every DCB_NUMTCS_ATTR_* value present in
+ * the DCB_ATTR_NUMTCS nest to the driver's setnumtcs op, stopping at the
+ * first op that fails.  The reply carries 0 when all ops succeeded and 1
+ * otherwise.
+ *
+ * Returns -EINVAL for a missing attribute/op or an unparsable nest,
+ * otherwise the result of dcbnl_reply().
+ */
+static int dcbnl_setnumtcs(struct net_device *netdev, struct nlattr **tb,
+			   u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *attrs[DCB_NUMTCS_ATTR_MAX + 1];
+	int status = 0;
+	int id;
+
+	if (!tb[DCB_ATTR_NUMTCS] || !netdev->dcbnl_ops->setnumtcs)
+		return -EINVAL;
+
+	if (nla_parse_nested(attrs, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
+			     dcbnl_numtcs_nest))
+		return -EINVAL;
+
+	for (id = DCB_NUMTCS_ATTR_ALL + 1; id <= DCB_NUMTCS_ATTR_MAX; id++) {
+		u8 count;
+
+		if (!attrs[id])
+			continue;
+
+		count = nla_get_u8(attrs[id]);
+		status = netdev->dcbnl_ops->setnumtcs(netdev, id, count);
+		if (status)
+			break;
+	}
+
+	/* collapse the driver result to a 0/1 flag for the reply */
+	return dcbnl_reply(!!status, RTM_SETDCB, DCB_CMD_SNUMTCS,
+			   DCB_ATTR_NUMTCS, pid, seq, flags);
+}
+
+/*
+ * DCB_CMD_PFC_GSTATE handler: query the driver's getpfcstate op and send
+ * the result back as a DCB_ATTR_PFC_STATE byte.
+ *
+ * Returns -EINVAL when the driver has no getpfcstate op, otherwise the
+ * result of dcbnl_reply().
+ */
+static int dcbnl_getpfcstate(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags)
+{
+	u8 state;
+
+	if (netdev->dcbnl_ops->getpfcstate == NULL)
+		return -EINVAL;
+
+	state = netdev->dcbnl_ops->getpfcstate(netdev);
+
+	return dcbnl_reply(state, RTM_GETDCB, DCB_CMD_PFC_GSTATE,
+			   DCB_ATTR_PFC_STATE, pid, seq, flags);
+}
+
+/*
+ * DCB_CMD_PFC_SSTATE handler: pass the DCB_ATTR_PFC_STATE byte from the
+ * request to the driver's setpfcstate op and acknowledge with a 0 value.
+ *
+ * Returns -EINVAL when the attribute or the op is missing, otherwise the
+ * result of dcbnl_reply().
+ */
+static int dcbnl_setpfcstate(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags)
+{
+	u8 requested;
+
+	if (!tb[DCB_ATTR_PFC_STATE] || !netdev->dcbnl_ops->setpfcstate)
+		return -EINVAL;
+
+	requested = nla_get_u8(tb[DCB_ATTR_PFC_STATE]);
+	netdev->dcbnl_ops->setpfcstate(netdev, requested);
+
+	/* the op's result is not examined here; the reply always carries 0 */
+	return dcbnl_reply(0, RTM_SETDCB, DCB_CMD_PFC_SSTATE,
+			   DCB_ATTR_PFC_STATE, pid, seq, flags);
+}
+
+/*
+ * DCB_CMD_GAPP handler: look up the priority for an application id.
+ *
+ * The request's DCB_ATTR_APP nest must carry DCB_APP_ATTR_IDTYPE (either
+ * DCB_APP_IDTYPE_ETHTYPE or DCB_APP_IDTYPE_PORTNUM) and DCB_APP_ATTR_ID.
+ * The priority comes from the driver's getapp op when it exists and from
+ * dcb_getapp() otherwise.  The reply repeats id type and id and adds
+ * DCB_APP_ATTR_PRIORITY.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int dcbnl_getapp(struct net_device *netdev, struct nlattr **tb,
+                        u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *app_nest;
+	struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
+	u16 id;
+	u8 up, idtype;
+	int ret = -EINVAL;
+
+	if (!tb[DCB_ATTR_APP])
+		goto out;
+
+	ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
+	                       dcbnl_app_nest);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	/* all must be non-null */
+	if ((!app_tb[DCB_APP_ATTR_IDTYPE]) ||
+	    (!app_tb[DCB_APP_ATTR_ID]))
+		goto out;
+
+	/* either by eth type or by socket number */
+	idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
+	if ((idtype != DCB_APP_IDTYPE_ETHTYPE) &&
+	    (idtype != DCB_APP_IDTYPE_PORTNUM))
+		goto out;
+
+	id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
+
+	if (netdev->dcbnl_ops->getapp) {
+		up = netdev->dcbnl_ops->getapp(netdev, idtype, id);
+	} else {
+		struct dcb_app app = {
+					.selector = idtype,
+					.protocol = id,
+				     };
+		up = dcb_getapp(netdev, &app);
+	}
+
+	/* send this back */
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto out;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GAPP;
+	/* clear the padding, as dcbnl_reply() does, so that no stale
+	 * buffer contents are copied to user space
+	 */
+	dcb->dcb_pad = 0;
+
+	/* nothing to cancel yet if the nest could not even be opened */
+	app_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_APP);
+	if (!app_nest)
+		goto nlmsg_failure;
+
+	ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_IDTYPE, idtype);
+	if (ret)
+		goto out_cancel;
+
+	ret = nla_put_u16(dcbnl_skb, DCB_APP_ATTR_ID, id);
+	if (ret)
+		goto out_cancel;
+
+	ret = nla_put_u8(dcbnl_skb, DCB_APP_ATTR_PRIORITY, up);
+	if (ret)
+		goto out_cancel;
+
+	nla_nest_end(dcbnl_skb, app_nest);
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/*
+	 * rtnl_unicast() takes ownership of the skb whether or not the
+	 * send succeeds, so it must not be freed again on failure.
+	 */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	goto out;
+
+out_cancel:
+	nla_nest_cancel(dcbnl_skb, app_nest);
+nlmsg_failure:
+	kfree_skb(dcbnl_skb);
+out:
+	return ret;
+}
+
+/*
+ * DCB_CMD_SAPP handler: store the priority for an application id.
+ *
+ * The request's DCB_ATTR_APP nest must carry id type (ETHTYPE or
+ * PORTNUM), id and priority.  The driver's setapp op is used when it
+ * exists, dcb_setapp() otherwise; the outcome is sent back through
+ * dcbnl_reply().
+ *
+ * Returns a negative error for a malformed request, otherwise the result
+ * of dcbnl_reply().
+ */
+static int dcbnl_setapp(struct net_device *netdev, struct nlattr **tb,
+			u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *attrs[DCB_APP_ATTR_MAX + 1];
+	u8 selector, prio;
+	u16 proto;
+	int status;
+	int rc;
+
+	if (!tb[DCB_ATTR_APP])
+		return -EINVAL;
+
+	rc = nla_parse_nested(attrs, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
+			      dcbnl_app_nest);
+	if (rc)
+		return rc;
+
+	/* a set needs all three fields */
+	if (!attrs[DCB_APP_ATTR_IDTYPE] || !attrs[DCB_APP_ATTR_ID] ||
+	    !attrs[DCB_APP_ATTR_PRIORITY])
+		return -EINVAL;
+
+	/* ids are keyed either by eth type or by socket number */
+	selector = nla_get_u8(attrs[DCB_APP_ATTR_IDTYPE]);
+	if (!(selector == DCB_APP_IDTYPE_ETHTYPE ||
+	      selector == DCB_APP_IDTYPE_PORTNUM))
+		return -EINVAL;
+
+	proto = nla_get_u16(attrs[DCB_APP_ATTR_ID]);
+	prio = nla_get_u8(attrs[DCB_APP_ATTR_PRIORITY]);
+
+	if (netdev->dcbnl_ops->setapp) {
+		status = netdev->dcbnl_ops->setapp(netdev, selector, proto,
+						   prio);
+	} else {
+		struct dcb_app entry = {
+			.selector = selector,
+			.protocol = proto,
+			.priority = prio,
+		};
+
+		status = dcb_setapp(netdev, &entry);
+	}
+
+	return dcbnl_reply(status, RTM_SETDCB, DCB_CMD_SAPP, DCB_ATTR_APP,
+			   pid, seq, flags);
+}
+
+/*
+ * Common worker for DCB_CMD_PGTX_GCFG (@dir == 0) and DCB_CMD_PGRX_GCFG
+ * (@dir != 0): report the priority group configuration.
+ *
+ * The request's DCB_ATTR_PG_CFG nest selects traffic classes
+ * (DCB_PG_ATTR_TC_0..7 or TC_ALL), each with a nest selecting which
+ * parameters to report, and bandwidth groups (DCB_PG_ATTR_BW_ID_0..7 or
+ * BW_ID_ALL).  Values come from the driver's getpgtccfg{tx,rx} and
+ * getpgbwgcfg{tx,rx} ops; anything the driver does not fill in is
+ * reported as DCB_ATTR_VALUE_UNDEFINED.
+ *
+ * Returns 0 on success and -EINVAL on any failure.
+ */
+static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags, int dir)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *pg_nest, *param_nest, *data;
+	struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+	struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+	u8 prio, pgid, tc_pct, up_map;
+	int ret  = -EINVAL;
+	int getall = 0;
+	int i;
+
+	/* all four ops are required, whichever direction is asked for */
+	if (!tb[DCB_ATTR_PG_CFG] ||
+	    !netdev->dcbnl_ops->getpgtccfgtx ||
+	    !netdev->dcbnl_ops->getpgtccfgrx ||
+	    !netdev->dcbnl_ops->getpgbwgcfgtx ||
+	    !netdev->dcbnl_ops->getpgbwgcfgrx)
+		return ret;
+
+	ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX,
+	                       tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest);
+
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = (dir) ? DCB_CMD_PGRX_GCFG : DCB_CMD_PGTX_GCFG;
+	/* clear the padding, as dcbnl_reply() does, so that no stale
+	 * buffer contents are copied to user space
+	 */
+	dcb->dcb_pad = 0;
+
+	pg_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_PG_CFG);
+	if (!pg_nest)
+		goto err;
+
+	if (pg_tb[DCB_PG_ATTR_TC_ALL])
+		getall = 1;
+
+	for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+		if (!getall && !pg_tb[i])
+			continue;
+
+		/* with TC_ALL its parameter selection applies to every TC */
+		if (pg_tb[DCB_PG_ATTR_TC_ALL])
+			data = pg_tb[DCB_PG_ATTR_TC_ALL];
+		else
+			data = pg_tb[i];
+		ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX,
+				       data, dcbnl_tc_param_nest);
+		if (ret)
+			goto err_pg;
+
+		param_nest = nla_nest_start(dcbnl_skb, i);
+		if (!param_nest)
+			goto err_pg;
+
+		pgid = DCB_ATTR_VALUE_UNDEFINED;
+		prio = DCB_ATTR_VALUE_UNDEFINED;
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+		up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (dir) {
+			/* Rx */
+			netdev->dcbnl_ops->getpgtccfgrx(netdev,
+						i - DCB_PG_ATTR_TC_0, &prio,
+						&pgid, &tc_pct, &up_map);
+		} else {
+			/* Tx */
+			netdev->dcbnl_ops->getpgtccfgtx(netdev,
+						i - DCB_PG_ATTR_TC_0, &prio,
+						&pgid, &tc_pct, &up_map);
+		}
+
+		if (param_tb[DCB_TC_ATTR_PARAM_PGID] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb,
+			                 DCB_TC_ATTR_PARAM_PGID, pgid);
+			if (ret)
+				goto err_param;
+		}
+		if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb,
+			                 DCB_TC_ATTR_PARAM_UP_MAPPING, up_map);
+			if (ret)
+				goto err_param;
+		}
+		if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb,
+			                 DCB_TC_ATTR_PARAM_STRICT_PRIO, prio);
+			if (ret)
+				goto err_param;
+		}
+		if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb, DCB_TC_ATTR_PARAM_BW_PCT,
+			                 tc_pct);
+			if (ret)
+				goto err_param;
+		}
+		nla_nest_end(dcbnl_skb, param_nest);
+	}
+
+	if (pg_tb[DCB_PG_ATTR_BW_ID_ALL])
+		getall = 1;
+	else
+		getall = 0;
+
+	for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+		if (!getall && !pg_tb[i])
+			continue;
+
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (dir) {
+			/* Rx */
+			netdev->dcbnl_ops->getpgbwgcfgrx(netdev,
+					i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+		} else {
+			/* Tx */
+			netdev->dcbnl_ops->getpgbwgcfgtx(netdev,
+					i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+		}
+		ret = nla_put_u8(dcbnl_skb, i, tc_pct);
+
+		if (ret)
+			goto err_pg;
+	}
+
+	nla_nest_end(dcbnl_skb, pg_nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/* the skb is not freed here when the send fails */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err_out;
+
+	return 0;
+
+err_param:
+	nla_nest_cancel(dcbnl_skb, param_nest);
+err_pg:
+	nla_nest_cancel(dcbnl_skb, pg_nest);
+nlmsg_failure:
+err:
+	kfree_skb(dcbnl_skb);
+err_out:
+	ret  = -EINVAL;
+	return ret;
+}
+
+/* Priority group query for the Tx direction (dir == 0 in the worker). */
+static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags)
+{
+	const int dir = 0;
+
+	return __dcbnl_pg_getcfg(netdev, tb, pid, seq, flags, dir);
+}
+
+/* Priority group query for the Rx direction (dir == 1 in the worker). */
+static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags)
+{
+	const int dir = 1;
+
+	return __dcbnl_pg_getcfg(netdev, tb, pid, seq, flags, dir);
+}
+
+/*
+ * DCB_CMD_SSTATE handler: pass the DCB_ATTR_STATE byte from the request
+ * to the driver's setstate op and send the op's return value back.
+ *
+ * Returns -EINVAL when the attribute or the op is missing, otherwise the
+ * result of dcbnl_reply().
+ */
+static int dcbnl_setstate(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	u8 requested;
+	u8 outcome;
+
+	if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->setstate)
+		return -EINVAL;
+
+	requested = nla_get_u8(tb[DCB_ATTR_STATE]);
+	outcome = netdev->dcbnl_ops->setstate(netdev, requested);
+
+	return dcbnl_reply(outcome, RTM_SETDCB, DCB_CMD_SSTATE,
+			   DCB_ATTR_STATE, pid, seq, flags);
+}
+
+/*
+ * DCB_CMD_PFC_SCFG handler: apply per-priority PFC settings.
+ *
+ * Every DCB_PFC_UP_ATTR_0..7 attribute present in the DCB_ATTR_PFC_CFG
+ * nest is handed to the driver's setpfccfg op together with its
+ * zero-based priority; a reply carrying 0 is sent afterwards.
+ *
+ * Returns -EINVAL when the attribute or the op is missing, the
+ * nla_parse_nested() error for a malformed nest, otherwise the result of
+ * dcbnl_reply().
+ */
+static int dcbnl_setpfccfg(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1];
+	int i;
+	int ret = -EINVAL;
+	u8 value;
+
+	if (!tb[DCB_ATTR_PFC_CFG] || !netdev->dcbnl_ops->setpfccfg)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
+	                       tb[DCB_ATTR_PFC_CFG],
+	                       dcbnl_pfc_up_nest);
+	if (ret)
+		goto err;
+
+	for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+		if (data[i] == NULL)
+			continue;
+		value = nla_get_u8(data[i]);
+		/*
+		 * Derive the priority from the table index rather than from
+		 * the raw nla_type field: the latter may carry NLA_F_* flag
+		 * bits set by user space, which would hand the driver an
+		 * out-of-range priority.
+		 */
+		netdev->dcbnl_ops->setpfccfg(netdev,
+			i - DCB_PFC_UP_ATTR_0, value);
+	}
+
+	ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_PFC_SCFG, DCB_ATTR_PFC_CFG,
+	                  pid, seq, flags);
+err:
+	return ret;
+}
+
+/*
+ * DCB_CMD_SET_ALL handler: invoke the driver's setall op and send its
+ * return value back as a DCB_ATTR_SET_ALL byte.
+ *
+ * Returns -EINVAL when the attribute or the op is missing, otherwise the
+ * result of dcbnl_reply().
+ */
+static int dcbnl_setall(struct net_device *netdev, struct nlattr **tb,
+			u32 pid, u32 seq, u16 flags)
+{
+	u8 outcome;
+
+	if (!tb[DCB_ATTR_SET_ALL] || !netdev->dcbnl_ops->setall)
+		return -EINVAL;
+
+	outcome = netdev->dcbnl_ops->setall(netdev);
+
+	return dcbnl_reply(outcome, RTM_SETDCB, DCB_CMD_SET_ALL,
+			   DCB_ATTR_SET_ALL, pid, seq, flags);
+}
+
+/*
+ * Common worker for DCB_CMD_PGTX_SCFG (@dir == 0) and DCB_CMD_PGRX_SCFG
+ * (@dir != 0): apply a priority group configuration.
+ *
+ * For every DCB_PG_ATTR_TC_0..7 nest present, the contained parameters
+ * are passed to the driver's setpgtccfg{tx,rx} op; parameters that are
+ * absent are passed as DCB_ATTR_VALUE_UNDEFINED.  Every
+ * DCB_PG_ATTR_BW_ID_0..7 value present goes to setpgbwgcfg{tx,rx}.
+ * A reply carrying 0 is sent afterwards.
+ *
+ * Returns -EINVAL when the attribute or any of the four ops is missing,
+ * the nla_parse_nested() error for a malformed nest, otherwise the
+ * result of dcbnl_reply().
+ */
+static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags, int dir)
+{
+	struct nlattr *groups[DCB_PG_ATTR_MAX + 1];
+	struct nlattr *params[DCB_TC_ATTR_PARAM_MAX + 1];
+	int attr;
+	int rc;
+
+	if (!tb[DCB_ATTR_PG_CFG] ||
+	    !netdev->dcbnl_ops->setpgtccfgtx ||
+	    !netdev->dcbnl_ops->setpgtccfgrx ||
+	    !netdev->dcbnl_ops->setpgbwgcfgtx ||
+	    !netdev->dcbnl_ops->setpgbwgcfgrx)
+		return -EINVAL;
+
+	rc = nla_parse_nested(groups, DCB_PG_ATTR_MAX,
+			      tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest);
+	if (rc)
+		return rc;
+
+	/* per traffic class parameters */
+	for (attr = DCB_PG_ATTR_TC_0; attr <= DCB_PG_ATTR_TC_7; attr++) {
+		u8 strict = DCB_ATTR_VALUE_UNDEFINED;
+		u8 group = DCB_ATTR_VALUE_UNDEFINED;
+		u8 bw_pct = DCB_ATTR_VALUE_UNDEFINED;
+		u8 up_mapping = DCB_ATTR_VALUE_UNDEFINED;
+		int tc = attr - DCB_PG_ATTR_TC_0;
+
+		if (!groups[attr])
+			continue;
+
+		rc = nla_parse_nested(params, DCB_TC_ATTR_PARAM_MAX,
+				      groups[attr], dcbnl_tc_param_nest);
+		if (rc)
+			return rc;
+
+		if (params[DCB_TC_ATTR_PARAM_STRICT_PRIO])
+			strict = nla_get_u8(params[DCB_TC_ATTR_PARAM_STRICT_PRIO]);
+		if (params[DCB_TC_ATTR_PARAM_PGID])
+			group = nla_get_u8(params[DCB_TC_ATTR_PARAM_PGID]);
+		if (params[DCB_TC_ATTR_PARAM_BW_PCT])
+			bw_pct = nla_get_u8(params[DCB_TC_ATTR_PARAM_BW_PCT]);
+		if (params[DCB_TC_ATTR_PARAM_UP_MAPPING])
+			up_mapping = nla_get_u8(params[DCB_TC_ATTR_PARAM_UP_MAPPING]);
+
+		/* dir: Tx = 0, Rx = 1 */
+		if (!dir)
+			netdev->dcbnl_ops->setpgtccfgtx(netdev, tc, strict,
+							group, bw_pct,
+							up_mapping);
+		else
+			netdev->dcbnl_ops->setpgtccfgrx(netdev, tc, strict,
+							group, bw_pct,
+							up_mapping);
+	}
+
+	/* per bandwidth group percentages */
+	for (attr = DCB_PG_ATTR_BW_ID_0; attr <= DCB_PG_ATTR_BW_ID_7; attr++) {
+		int bwg = attr - DCB_PG_ATTR_BW_ID_0;
+		u8 bw_pct;
+
+		if (!groups[attr])
+			continue;
+
+		bw_pct = nla_get_u8(groups[attr]);
+
+		/* dir: Tx = 0, Rx = 1 */
+		if (!dir)
+			netdev->dcbnl_ops->setpgbwgcfgtx(netdev, bwg, bw_pct);
+		else
+			netdev->dcbnl_ops->setpgbwgcfgrx(netdev, bwg, bw_pct);
+	}
+
+	return dcbnl_reply(0, RTM_SETDCB,
+			   dir ? DCB_CMD_PGRX_SCFG : DCB_CMD_PGTX_SCFG,
+			   DCB_ATTR_PG_CFG, pid, seq, flags);
+}
+
+/* Priority group update for the Tx direction (dir == 0 in the worker). */
+static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags)
+{
+	const int dir = 0;
+
+	return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, dir);
+}
+
+/* Priority group update for the Rx direction (dir == 1 in the worker). */
+static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlattr **tb,
+			     u32 pid, u32 seq, u16 flags)
+{
+	const int dir = 1;
+
+	return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, dir);
+}
+
+/*
+ * DCB_CMD_BCN_GCFG handler: report the BCN configuration.
+ *
+ * The request's DCB_ATTR_BCN nest names the attributes of interest, or
+ * DCB_BCN_ATTR_ALL for all of them.  DCB_BCN_ATTR_RP_0..7 are read with
+ * the driver's getbcnrp op (u8 each); the attributes from
+ * DCB_BCN_ATTR_BCNA_0 up to and including DCB_BCN_ATTR_RI are read with
+ * getbcncfg (u32 each), which receives the attribute number itself.
+ *
+ * Returns 0 on success and -EINVAL on any failure.
+ */
+static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlattr **tb,
+                            u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *bcn_nest;
+	struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1];
+	u8 value_byte;
+	u32 value_integer;
+	int ret  = -EINVAL;
+	bool getall = false;
+	int i;
+
+	if (!tb[DCB_ATTR_BCN] || !netdev->dcbnl_ops->getbcnrp ||
+	    !netdev->dcbnl_ops->getbcncfg)
+		return ret;
+
+	ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX,
+	                       tb[DCB_ATTR_BCN], dcbnl_bcn_nest);
+
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	/* NLMSG_NEW jumps to nlmsg_failure when the header does not fit */
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_BCN_GCFG;
+	/* clear the padding, as dcbnl_reply() does, so that no stale
+	 * buffer contents are copied to user space
+	 */
+	dcb->dcb_pad = 0;
+
+	bcn_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_BCN);
+	if (!bcn_nest)
+		goto err;
+
+	if (bcn_tb[DCB_BCN_ATTR_ALL])
+		getall = true;
+
+	for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+		if (!getall && !bcn_tb[i])
+			continue;
+
+		netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0,
+		                            &value_byte);
+		ret = nla_put_u8(dcbnl_skb, i, value_byte);
+		if (ret)
+			goto err_bcn;
+	}
+
+	for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
+		if (!getall && !bcn_tb[i])
+			continue;
+
+		netdev->dcbnl_ops->getbcncfg(netdev, i,
+		                             &value_integer);
+		ret = nla_put_u32(dcbnl_skb, i, value_integer);
+		if (ret)
+			goto err_bcn;
+	}
+
+	nla_nest_end(dcbnl_skb, bcn_nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	/* the skb is not freed here when the send fails */
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err_out;
+
+	return 0;
+
+err_bcn:
+	nla_nest_cancel(dcbnl_skb, bcn_nest);
+nlmsg_failure:
+err:
+	kfree_skb(dcbnl_skb);
+err_out:
+	ret  = -EINVAL;
+	return ret;
+}
+
+/*
+ * DCB_CMD_BCN_SCFG handler: apply a BCN configuration.
+ *
+ * Every DCB_BCN_ATTR_RP_0..7 attribute present in the DCB_ATTR_BCN nest
+ * is handed to the driver's setbcnrp op with its zero-based index; the
+ * attributes from DCB_BCN_ATTR_BCNA_0 up to and including
+ * DCB_BCN_ATTR_RI go to setbcncfg together with the attribute number.
+ * A reply carrying 0 is sent afterwards.
+ *
+ * Returns -EINVAL when the attribute or an op is missing, the
+ * nla_parse_nested() error for a malformed nest, otherwise the result of
+ * dcbnl_reply().
+ */
+static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlattr **tb,
+                            u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *data[DCB_BCN_ATTR_MAX + 1];
+	int i;
+	int ret = -EINVAL;
+	u8 value_byte;
+	u32 value_int;
+
+	if (!tb[DCB_ATTR_BCN] || !netdev->dcbnl_ops->setbcncfg ||
+	    !netdev->dcbnl_ops->setbcnrp)
+		return ret;
+
+	/*
+	 * The policy must be the BCN one: it is indexed up to
+	 * DCB_BCN_ATTR_MAX, which the PFC policy array does not cover, and
+	 * it is what guarantees the u8/u32 payload sizes read below.
+	 */
+	ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX,
+	                       tb[DCB_ATTR_BCN],
+	                       dcbnl_bcn_nest);
+	if (ret)
+		goto err;
+
+	for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+		if (data[i] == NULL)
+			continue;
+		value_byte = nla_get_u8(data[i]);
+		/*
+		 * Use the table index, not the raw nla_type field, which may
+		 * carry NLA_F_* flag bits set by user space.
+		 */
+		netdev->dcbnl_ops->setbcnrp(netdev,
+			i - DCB_BCN_ATTR_RP_0, value_byte);
+	}
+
+	for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
+		if (data[i] == NULL)
+			continue;
+		value_int = nla_get_u32(data[i]);
+		netdev->dcbnl_ops->setbcncfg(netdev,
+	                                     i, value_int);
+	}
+
+	ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_BCN_SCFG, DCB_ATTR_BCN,
+	                  pid, seq, flags);
+err:
+	return ret;
+}
+
+/* Handle IEEE 802.1Qaz SET commands. If any requested operation cannot
+ * be completed the entire msg is aborted and an error value is returned.
+ * No attempt is made to reconcile the case where only part of the
+ * cmd can be completed.
+ *
+ * ETS and PFC settings are applied only when the driver provides the
+ * matching op.  Application table entries use the driver's ieee_setapp
+ * op when present and dcb_setapp() otherwise.  A reply carrying the
+ * result is sent through dcbnl_reply() on every path.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+	int err = -EOPNOTSUPP;
+
+	if (!ops)
+		goto err;
+
+	/* the nest is dereferenced by the parser, so it has to be there */
+	if (!tb[DCB_ATTR_IEEE]) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
+			       tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
+	if (err)
+		goto err;
+
+	if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
+		struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
+		err = ops->ieee_setets(netdev, ets);
+		if (err)
+			goto err;
+	}
+
+	if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
+		struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
+		err = ops->ieee_setpfc(netdev, pfc);
+		if (err)
+			goto err;
+	}
+
+	if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+		struct nlattr *attr;
+		int rem;
+
+		nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+			struct dcb_app *app_data;
+			if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+				continue;
+			/*
+			 * The entries are walked by hand without a policy,
+			 * so make sure the payload really holds a whole
+			 * struct dcb_app before it is read.
+			 */
+			if (nla_len(attr) < (int)sizeof(struct dcb_app)) {
+				err = -EINVAL;
+				goto err;
+			}
+			app_data = nla_data(attr);
+			if (ops->ieee_setapp)
+				err = ops->ieee_setapp(netdev, app_data);
+			else
+				err = dcb_setapp(netdev, app_data);
+			if (err)
+				goto err;
+		}
+	}
+
+err:
+	dcbnl_reply(err, RTM_SETDCB, DCB_CMD_IEEE_SET, DCB_ATTR_IEEE,
+		    pid, seq, flags);
+	return err;
+}
+
+/* Append the driver's peer application info and table to @skb as one nest. */
+static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
+				int app_nested_type, int app_info_type,
+				int app_entry_type)
+{
+	struct dcb_peer_app_info info;
+	struct dcb_app *table = NULL;
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	u16 app_count;
+	int err;
+
+	/*
+	 * Retrieve the peer app configuration from the driver. If a driver
+	 * handler fails, nothing is added to @skb and 0 is still returned.
+	 */
+	err = ops->peer_getappinfo(netdev, &info, &app_count);
+	if (!err && app_count) {
+		table = kmalloc(sizeof(struct dcb_app) * app_count, GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		err = ops->peer_getapptable(netdev, table);
+	}
+
+	if (!err) {
+		u16 i;
+		struct nlattr *app;
+
+		/*
+		 * Build the message; from here on the only possible failure
+		 * is due to the skb size.
+		 */
+		err = -EMSGSIZE;
+
+		app = nla_nest_start(skb, app_nested_type);
+		if (!app)
+			goto nla_put_failure;
+
+		if (app_info_type)
+			NLA_PUT(skb, app_info_type, sizeof(info), &info);
+
+		for (i = 0; i < app_count; i++)
+			NLA_PUT(skb, app_entry_type, sizeof(struct dcb_app),
+				&table[i]);
+
+		nla_nest_end(skb, app);
+	}
+	err = 0;
+
+nla_put_failure:
+	kfree(table);
+	return err;
+}
+
+/* Handle IEEE 802.1Qaz GET commands; -EMSGSIZE if the reply can't be built. */
+static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *ieee, *app;
+	struct dcb_app_type *itr;
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	int err;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+	/* NLMSG_NEW and NLA_PUT* jump to the failure labels when skb is full */
+	nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_IEEE_GET;
+
+	NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name);
+
+	ieee = nla_nest_start(skb, DCB_ATTR_IEEE);
+	if (!ieee)
+		goto nla_put_failure;
+
+	if (ops->ieee_getets) {
+		struct ieee_ets ets;
+		err = ops->ieee_getets(netdev, &ets);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets);
+	}
+
+	if (ops->ieee_getpfc) {
+		struct ieee_pfc pfc;
+		err = ops->ieee_getpfc(netdev, &pfc);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc);
+	}
+
+	app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
+	if (!app)
+		goto nla_put_failure;
+	/* dcb_app_list is not per-device: emit only entries named after netdev */
+	spin_lock(&dcb_lock);
+	list_for_each_entry(itr, &dcb_app_list, list) {
+		if (strncmp(itr->name, netdev->name, IFNAMSIZ) == 0) {
+			err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app),
+					 &itr->app);
+			if (err) {
+				spin_unlock(&dcb_lock);
+				goto nla_put_failure;
+			}
+		}
+	}
+	spin_unlock(&dcb_lock);
+	nla_nest_end(skb, app);
+
+	/* get peer info if available */
+	if (ops->ieee_peer_getets) {
+		struct ieee_ets ets;
+		err = ops->ieee_peer_getets(netdev, &ets);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets);
+	}
+
+	if (ops->ieee_peer_getpfc) {
+		struct ieee_pfc pfc;
+		err = ops->ieee_peer_getpfc(netdev, &pfc);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc);
+	}
+
+	if (ops->peer_getappinfo && ops->peer_getapptable) {
+		err = dcbnl_build_peer_app(netdev, skb,
+					   DCB_ATTR_IEEE_PEER_APP,
+					   DCB_ATTR_IEEE_APP_UNSPEC,
+					   DCB_ATTR_IEEE_APP);
+		if (err)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, ieee);
+	nlmsg_end(skb, nlh);
+
+	return rtnl_unicast(skb, &init_net, pid);
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+	kfree_skb(skb);
+	return -EMSGSIZE;
+}
+
+/* DCBX configuration */
+static int dcbnl_getdcbx(struct net_device *netdev, struct nlattr **tb,
+			 u32 pid, u32 seq, u16 flags)
+{
+	int ret;
+
+	if (!netdev->dcbnl_ops->getdcbx)
+		return -EOPNOTSUPP;
+
+	ret = dcbnl_reply(netdev->dcbnl_ops->getdcbx(netdev), RTM_GETDCB,
+			  DCB_CMD_GDCBX, DCB_ATTR_DCBX, pid, seq, flags);
+
+	return ret;
+}
+
+static int dcbnl_setdcbx(struct net_device *netdev, struct nlattr **tb,
+			 u32 pid, u32 seq, u16 flags)
+{
+	int ret;
+	u8 value;
+
+	if (!netdev->dcbnl_ops->setdcbx)
+		return -EOPNOTSUPP;
+
+	if (!tb[DCB_ATTR_DCBX])
+		return -EINVAL;
+
+	value = nla_get_u8(tb[DCB_ATTR_DCBX]);
+
+	ret = dcbnl_reply(netdev->dcbnl_ops->setdcbx(netdev, value),
+			  RTM_SETDCB, DCB_CMD_SDCBX, DCB_ATTR_DCBX,
+			  pid, seq, flags);
+
+	return ret;
+}
+
+static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlattr **tb,
+			    u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret, i;
+	int getall = 0;
+
+	if (!netdev->dcbnl_ops->getfeatcfg)
+		return -EOPNOTSUPP;
+
+	if (!tb[DCB_ATTR_FEATCFG])
+		return -EINVAL;
+
+	ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG],
+			       dcbnl_featcfg_nest);
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb) {
+		ret = -ENOBUFS;
+		goto err_out;
+	}
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GFEATCFG;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_FEATCFG);
+	if (!nest) {
+		ret = -EMSGSIZE;
+		goto nla_put_failure;
+	}
+
+	if (data[DCB_FEATCFG_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value);
+		if (!ret)
+			ret = nla_put_u8(dcbnl_skb, i, value);
+
+		if (ret) {
+			nla_nest_cancel(dcbnl_skb, nest);
+			goto nla_put_failure;
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	return rtnl_unicast(dcbnl_skb, &init_net, pid);
+nla_put_failure:
+	nlmsg_cancel(dcbnl_skb, nlh);
+nlmsg_failure:
+	kfree_skb(dcbnl_skb);
+err_out:
+	return ret;
+}
+
+/* Pass each u8 nested in DCB_ATTR_FEATCFG to the driver and reply with status */
+static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlattr **tb,
+			    u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1];
+	int ret, i;
+	u8 value;
+
+	if (!netdev->dcbnl_ops->setfeatcfg)
+		return -EOPNOTSUPP;	/* same errno as dcbnl_getfeatcfg() */
+
+	if (!tb[DCB_ATTR_FEATCFG])
+		return -EINVAL;
+
+	ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG],
+			       dcbnl_featcfg_nest);
+	if (ret)
+		goto err;
+
+	for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
+		if (data[i] == NULL)
+			continue;
+
+		value = nla_get_u8(data[i]);
+
+		ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value);
+
+		if (ret)
+			goto err;
+	}
+err:
+	dcbnl_reply(ret, RTM_SETDCB, DCB_CMD_SFEATCFG, DCB_ATTR_FEATCFG,
+		    pid, seq, flags);
+
+	return ret;
+}
+
+/* Handle CEE DCBX GET commands; -EMSGSIZE if the reply can't be built. */
+static int dcbnl_cee_get(struct net_device *netdev, struct nlattr **tb,
+			 u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *cee;
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	int err;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+	/* NLMSG_NEW and NLA_PUT* jump to the failure labels when skb is full */
+	nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_CEE_GET;
+
+	NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name);
+
+	cee = nla_nest_start(skb, DCB_ATTR_CEE);
+	if (!cee)
+		goto nla_put_failure;
+
+	/* get peer info if available */
+	if (ops->cee_peer_getpg) {
+		struct cee_pg pg;
+		err = ops->cee_peer_getpg(netdev, &pg);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg);
+	}
+
+	if (ops->cee_peer_getpfc) {
+		struct cee_pfc pfc;
+		err = ops->cee_peer_getpfc(netdev, &pfc);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc);
+	}
+
+	if (ops->peer_getappinfo && ops->peer_getapptable) {
+		err = dcbnl_build_peer_app(netdev, skb,
+					   DCB_ATTR_CEE_PEER_APP_TABLE,
+					   DCB_ATTR_CEE_PEER_APP_INFO,
+					   DCB_ATTR_CEE_PEER_APP);
+		if (err)
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, cee);
+	nlmsg_end(skb, nlh);
+
+	return rtnl_unicast(skb, &init_net, pid);
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+	kfree_skb(skb);
+	return -EMSGSIZE;
+}
+
+static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *netdev;
+	struct dcbmsg  *dcb = (struct dcbmsg *)NLMSG_DATA(nlh);
+	struct nlattr *tb[DCB_ATTR_MAX + 1];
+	u32 pid = skb ? NETLINK_CB(skb).pid : 0;
+	int ret = -EINVAL;
+
+	if (!net_eq(net, &init_net))
+		return -EINVAL;
+
+	ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
+			  dcbnl_rtnl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[DCB_ATTR_IFNAME])
+		return -EINVAL;
+
+	netdev = dev_get_by_name(&init_net, nla_data(tb[DCB_ATTR_IFNAME]));
+	if (!netdev)
+		return -EINVAL;
+
+	if (!netdev->dcbnl_ops)
+		goto errout;
+
+	switch (dcb->cmd) {
+	case DCB_CMD_GSTATE:
+		ret = dcbnl_getstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                     nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_GCFG:
+		ret = dcbnl_getpfccfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GPERM_HWADDR:
+		ret = dcbnl_getperm_hwaddr(netdev, tb, pid, nlh->nlmsg_seq,
+		                           nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGTX_GCFG:
+		ret = dcbnl_pgtx_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGRX_GCFG:
+		ret = dcbnl_pgrx_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_BCN_GCFG:
+		ret = dcbnl_bcn_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                       nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SSTATE:
+		ret = dcbnl_setstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                     nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_SCFG:
+		ret = dcbnl_setpfccfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+
+	case DCB_CMD_SET_ALL:
+		ret = dcbnl_setall(netdev, tb, pid, nlh->nlmsg_seq,
+		                   nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGTX_SCFG:
+		ret = dcbnl_pgtx_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGRX_SCFG:
+		ret = dcbnl_pgrx_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GCAP:
+		ret = dcbnl_getcap(netdev, tb, pid, nlh->nlmsg_seq,
+		                   nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GNUMTCS:
+		ret = dcbnl_getnumtcs(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SNUMTCS:
+		ret = dcbnl_setnumtcs(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_GSTATE:
+		ret = dcbnl_getpfcstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_SSTATE:
+		ret = dcbnl_setpfcstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_BCN_SCFG:
+		ret = dcbnl_bcn_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                       nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GAPP:
+		ret = dcbnl_getapp(netdev, tb, pid, nlh->nlmsg_seq,
+		                   nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SAPP:
+		ret = dcbnl_setapp(netdev, tb, pid, nlh->nlmsg_seq,
+		                   nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_IEEE_SET:
+		ret = dcbnl_ieee_set(netdev, tb, pid, nlh->nlmsg_seq,
+				 nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_IEEE_GET:
+		ret = dcbnl_ieee_get(netdev, tb, pid, nlh->nlmsg_seq,
+				 nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GDCBX:
+		ret = dcbnl_getdcbx(netdev, tb, pid, nlh->nlmsg_seq,
+				    nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SDCBX:
+		ret = dcbnl_setdcbx(netdev, tb, pid, nlh->nlmsg_seq,
+				    nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GFEATCFG:
+		ret = dcbnl_getfeatcfg(netdev, tb, pid, nlh->nlmsg_seq,
+				       nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SFEATCFG:
+		ret = dcbnl_setfeatcfg(netdev, tb, pid, nlh->nlmsg_seq,
+				       nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_CEE_GET:
+		ret = dcbnl_cee_get(netdev, tb, pid, nlh->nlmsg_seq,
+				    nlh->nlmsg_flags);
+		goto out;
+	default:
+		goto errout;
+	}
+errout:
+	ret = -EINVAL;
+out:
+	dev_put(netdev);
+	return ret;
+}
+
+/**
+ * dcb_getapp - retrieve the DCBX application user priority
+ *
+ * On success returns a non-zero 802.1p user priority bitmap
+ * otherwise returns 0 as the invalid user priority bitmap to
+ * indicate an error.
+ */
+u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
+{
+	struct dcb_app_type *itr;
+	u8 prio = 0;
+
+	spin_lock(&dcb_lock);
+	list_for_each_entry(itr, &dcb_app_list, list) {
+		if (itr->app.selector == app->selector &&
+		    itr->app.protocol == app->protocol &&
+		    (strncmp(itr->name, dev->name, IFNAMSIZ) == 0)) {
+			prio = itr->app.priority;
+			break;
+		}
+	}
+	spin_unlock(&dcb_lock);
+
+	return prio;
+}
+EXPORT_SYMBOL(dcb_getapp);
+
+/**
+ * dcb_setapp - add dcb application data to app list
+ *
+ * Priority 0 is the default priority; setting an application's priority
+ * to zero removes it from the app list.
+ */
+u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
+{
+	struct dcb_app_type *itr;
+	struct dcb_app_type event;
+
+	memcpy(&event.name, dev->name, sizeof(event.name));
+	memcpy(&event.app, new, sizeof(event.app));
+
+	spin_lock(&dcb_lock);
+	/* Search for existing match and replace */
+	list_for_each_entry(itr, &dcb_app_list, list) {
+		if (itr->app.selector == new->selector &&
+		    itr->app.protocol == new->protocol &&
+		    (strncmp(itr->name, dev->name, IFNAMSIZ) == 0)) {
+			if (new->priority)
+				itr->app.priority = new->priority;
+			else {
+				list_del(&itr->list);
+				kfree(itr);
+			}
+			goto out;
+		}
+	}
+	/* App type does not exist add new application type */
+	if (new->priority) {
+		struct dcb_app_type *entry;
+		entry = kmalloc(sizeof(struct dcb_app_type), GFP_ATOMIC);
+		if (!entry) {
+			spin_unlock(&dcb_lock);
+			return -ENOMEM;
+		}
+
+		memcpy(&entry->app, new, sizeof(*new));
+		strncpy(entry->name, dev->name, IFNAMSIZ);
+		list_add(&entry->list, &dcb_app_list);
+	}
+out:
+	spin_unlock(&dcb_lock);
+	call_dcbevent_notifiers(DCB_APP_EVENT, &event);
+	return 0;
+}
+EXPORT_SYMBOL(dcb_setapp);
+
+static void dcb_flushapp(void)
+{
+	struct dcb_app_type *app;
+	struct dcb_app_type *tmp;
+
+	spin_lock(&dcb_lock);
+	list_for_each_entry_safe(app, tmp, &dcb_app_list, list) {
+		list_del(&app->list);
+		kfree(app);
+	}
+	spin_unlock(&dcb_lock);
+}
+
+static int __init dcbnl_init(void)
+{
+	INIT_LIST_HEAD(&dcb_app_list);
+
+	rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL);
+	rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL);
+
+	return 0;
+}
+module_init(dcbnl_init);
+
+static void __exit dcbnl_exit(void)
+{
+	rtnl_unregister(PF_UNSPEC, RTM_GETDCB);
+	rtnl_unregister(PF_UNSPEC, RTM_SETDCB);
+	dcb_flushapp();
+}
+module_exit(dcbnl_exit);
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
new file mode 100644
index 00000000..b75968a0
--- /dev/null
+++ b/net/dccp/Kconfig
@@ -0,0 +1,62 @@
+menuconfig IP_DCCP
+	tristate "The DCCP Protocol (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	---help---
+	  Datagram Congestion Control Protocol (RFC 4340)
+
+	  From http://www.ietf.org/rfc/rfc4340.txt:
+
+	  The Datagram Congestion Control Protocol (DCCP) is a transport
+	  protocol that implements bidirectional, unicast connections of
+	  congestion-controlled, unreliable datagrams. It should be suitable
+	  for use by applications such as streaming media, Internet telephony,
+	  and on-line games.
+
+	  To compile this protocol support as a module, choose M here: the
+	  module will be called dccp.
+
+	  If in doubt, say N.
+
+if IP_DCCP
+
+config INET_DCCP_DIAG
+	depends on INET_DIAG
+	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
+	def_tristate m
+
+source "net/dccp/ccids/Kconfig"
+
+menu "DCCP Kernel Hacking"
+	depends on DEBUG_KERNEL=y
+
+config IP_DCCP_DEBUG
+	bool "DCCP debug messages"
+	---help---
+	  Only use this if you're hacking DCCP.
+
+	  When compiling DCCP as a module, this debugging output can be toggled
+	  by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
+
+	  Just say N.
+
+config NET_DCCPPROBE
+	tristate "DCCP connection probing"
+	depends on PROC_FS && KPROBES
+	---help---
+	This module allows for capturing the changes to DCCP connection
+	state in response to incoming packets. It is used for debugging
+	DCCP congestion avoidance modules. If you don't understand
+	what was just said, you don't need it: say N.
+
+	Documentation on how to use DCCP connection probing can be found
+	at:
+	
+	  http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
+
+	To compile this code as a module, choose M here: the
+	module will be called dccp_probe.
+
+
+endmenu
+
+endif # IP_DCCP
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
new file mode 100644
index 00000000..5c8362b0
--- /dev/null
+++ b/net/dccp/Makefile
@@ -0,0 +1,28 @@
+obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
+
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+	  qpolicy.o
+#
+# CCID algorithms to be used by dccp.ko
+#
+# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
+dccp-y += ccids/ccid2.o ackvec.o
+dccp-$(CONFIG_IP_DCCP_CCID3)	+= ccids/ccid3.o
+dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o		\
+				   ccids/lib/tfrc_equation.o	\
+				   ccids/lib/packet_history.o	\
+				   ccids/lib/loss_interval.o
+
+dccp_ipv4-y := ipv4.o
+
+# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
+obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
+dccp_ipv6-y := ipv6.o
+
+obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
+
+dccp-$(CONFIG_SYSCTL) += sysctl.o
+
+dccp_diag-y := diag.o
+dccp_probe-y := probe.o
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
new file mode 100644
index 00000000..25b7a8d1
--- /dev/null
+++ b/net/dccp/ackvec.c
@@ -0,0 +1,408 @@
+/*
+ *  net/dccp/ackvec.c
+ *
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ *      This program is free software; you can redistribute it and/or modify it
+ *      under the terms of the GNU General Public License as published by the
+ *      Free Software Foundation; version 2 of the License;
+ */
+#include "dccp.h"
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+static struct kmem_cache *dccp_ackvec_slab;
+static struct kmem_cache *dccp_ackvec_record_slab;
+
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
+{
+	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
+
+	if (av != NULL) {
+		av->av_buf_head	= av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
+		INIT_LIST_HEAD(&av->av_records);
+	}
+	return av;
+}
+
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
+{
+	struct dccp_ackvec_record *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+		kmem_cache_free(dccp_ackvec_record_slab, cur);
+	INIT_LIST_HEAD(&av->av_records);
+}
+
+void dccp_ackvec_free(struct dccp_ackvec *av)
+{
+	if (likely(av != NULL)) {
+		dccp_ackvec_purge_records(av);
+		kmem_cache_free(dccp_ackvec_slab, av);
+	}
+}
+
+/**
+ * dccp_ackvec_update_records  -  Record information about sent Ack Vectors
+ * @av:		Ack Vector records to update
+ * @seqno:	Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum:	The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
+{
+	struct dccp_ackvec_record *avr;
+
+	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+	if (avr == NULL)
+		return -ENOBUFS;
+
+	avr->avr_ack_seqno  = seqno;
+	avr->avr_ack_ptr    = av->av_buf_head;
+	avr->avr_ack_ackno  = av->av_buf_ackno;
+	avr->avr_ack_nonce  = nonce_sum;
+	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
+	/*
+	 * When the buffer overflows, we keep no more than one record. This is
+	 * the simplest way of disambiguating sender-Acks dating from before the
+	 * overflow from sender-Acks which refer to after the overflow; a simple
+	 * solution is preferable here since we are handling an exception.
+	 */
+	if (av->av_overflow)
+		dccp_ackvec_purge_records(av);
+	/*
+	 * Since GSS is incremented for each packet, the list is automatically
+	 * arranged in descending order of @ack_seqno.
+	 */
+	list_add(&avr->avr_node, &av->av_records);
+
+	dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
+		      (unsigned long long)avr->avr_ack_seqno,
+		      (unsigned long long)avr->avr_ack_ackno,
+		      avr->avr_ack_runlen);
+	return 0;
+}
+
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+						     const u64 ackno)
+{
+	struct dccp_ackvec_record *avr;
+	/*
+	 * Exploit that records are inserted in descending order of sequence
+	 * number, start with the oldest record first. If @ackno is `before'
+	 * the earliest ack_seqno, the packet is too old to be considered.
+	 */
+	list_for_each_entry_reverse(avr, av_list, avr_node) {
+		if (avr->avr_ack_seqno == ackno)
+			return avr;
+		if (before48(ackno, avr->avr_ack_seqno))
+			break;
+	}
+	return NULL;
+}
+
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
+{
+	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
+}
+
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
+{
+	return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
+}
+
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
+{
+	if (unlikely(av->av_overflow))
+		return DCCPAV_MAX_ACKVEC_LEN;
+	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
+}
+
+/**
+ * dccp_ackvec_update_old  -  Update previous state as per RFC 4340, 11.4.1
+ * @av:		non-empty buffer to update
+ * @distance:   negative or zero distance of @seqno from buf_ackno downward
+ * @seqno:	the (old) sequence number whose record is to be updated
+ * @state:	state in which packet carrying @seqno was received
+ */
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+				   u64 seqno, enum dccp_ackvec_states state)
+{
+	u16 ptr = av->av_buf_head;
+
+	BUG_ON(distance > 0);
+	if (unlikely(dccp_ackvec_is_empty(av)))
+		return;
+
+	do {
+		u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
+
+		if (distance + runlen >= 0) {
+			/*
+			 * Only update the state if packet has not been received
+			 * yet. This is OK as per the second table in RFC 4340,
+			 * 11.4.1; i.e. here we are using the following table:
+			 *                     RECEIVED
+			 *                      0   1   3
+			 *              S     +---+---+---+
+			 *              T   0 | 0 | 0 | 0 |
+			 *              O     +---+---+---+
+			 *              R   1 | 1 | 1 | 1 |
+			 *              E     +---+---+---+
+			 *              D   3 | 0 | 1 | 3 |
+			 *                    +---+---+---+
+			 * The "Not Received" state was set by reserve_seats().
+			 */
+			if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+				av->av_buf[ptr] = state;
+			else
+				dccp_pr_debug("Not changing %llu state to %u\n",
+					      (unsigned long long)seqno, state);
+			break;
+		}
+
+		distance += runlen + 1;
+		ptr	  = __ackvec_idx_add(ptr, 1);
+
+	} while (ptr != av->av_buf_tail);
+}
+
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+	    len	  = DCCPAV_MAX_ACKVEC_LEN - start;
+
+	/* check for buffer wrap-around */
+	if (num > len) {
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+		start = 0;
+		num  -= len;
+	}
+	if (num)
+		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
+}
+
+/**
+ * dccp_ackvec_add_new  -  Record one or more new entries in Ack Vector buffer
+ * @av:		 container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno:	 sequence number of the first packet in @num_packets
+ * @state:	 state in which packet carrying @seqno was received
+ */
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+				u64 seqno, enum dccp_ackvec_states state)
+{
+	u32 num_cells = num_packets;
+
+	if (num_packets > DCCPAV_BURST_THRESH) {
+		u32 lost_packets = num_packets - 1;
+
+		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
+		/*
+		 * We received 1 packet and have a loss of size "num_packets-1"
+		 * which we squeeze into num_cells-1 rather than reserving an
+		 * entire byte for each lost packet.
+		 * The reason is that the vector grows in O(burst_length); when
+		 * it grows too large there is no room left for the payload.
+		 * This is a trade-off: if a few packets out of the burst show
+		 * up later, their state will not be changed; it is simply too
+		 * costly to reshuffle/reallocate/copy the buffer each time.
+		 * Should such problems persist, we will need to switch to a
+		 * different underlying data structure.
+		 */
+		for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+			u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
+
+			av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+			av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+			lost_packets -= len;
+		}
+	}
+
+	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+		av->av_overflow = true;
+	}
+
+	av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+	if (av->av_overflow)
+		av->av_buf_tail = av->av_buf_head;
+
+	av->av_buf[av->av_buf_head] = state;
+	av->av_buf_ackno	    = seqno;
+
+	if (num_packets > 1)
+		dccp_ackvec_reserve_seats(av, num_packets - 1);
+}
+
+/**
+ * dccp_ackvec_input  -  Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
+{
+	u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	enum dccp_ackvec_states state = DCCPAV_RECEIVED;
+
+	if (dccp_ackvec_is_empty(av)) {
+		dccp_ackvec_add_new(av, 1, seqno, state);
+		av->av_tail_ackno = seqno;
+
+	} else {
+		s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+		u8 *current_head = av->av_buf + av->av_buf_head;
+
+		if (num_packets == 1 &&
+		    dccp_ackvec_state(current_head) == state &&
+		    dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
+
+			*current_head   += 1;
+			av->av_buf_ackno = seqno;
+
+		} else if (num_packets > 0) {
+			dccp_ackvec_add_new(av, num_packets, seqno, state);
+		} else {
+			dccp_ackvec_update_old(av, num_packets, seqno, state);
+		}
+	}
+}
+
+/**
+ * dccp_ackvec_clear_state  -  Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
+{
+	struct dccp_ackvec_record *avr, *next;
+	u8 runlen_now, eff_runlen;
+	s64 delta;
+
+	avr = dccp_ackvec_lookup(&av->av_records, ackno);
+	if (avr == NULL)
+		return;
+	/*
+	 * Deal with outdated acknowledgments: this arises when e.g. there are
+	 * several old records and the acks from the peer come in slowly. In
+	 * that case we may still have records that pre-date tail_ackno.
+	 */
+	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+	if (delta < 0)
+		goto free_records;
+	/*
+	 * Deal with overlapping Ack Vectors: don't subtract more than the
+	 * number of packets between tail_ackno and ack_ackno.
+	 */
+	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
+
+	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
+	/*
+	 * The run length of Ack Vector cells does not decrease over time. If
+	 * the run length is the same as at the time the Ack Vector was sent, we
+	 * free the ack_ptr cell. That cell can however not be freed if the run
+	 * length has increased: in this case we need to move the tail pointer
+	 * backwards (towards higher indices), to its next-oldest neighbour.
+	 */
+	if (runlen_now > eff_runlen) {
+
+		av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+		av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+
+		/* This move may not have cleared the overflow flag. */
+		if (av->av_overflow)
+			av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+	} else {
+		av->av_buf_tail	= avr->avr_ack_ptr;
+		/*
+		 * We have made sure that avr points to a valid cell within the
+		 * buffer. This cell is either older than head, or equals head
+		 * (empty buffer): in both cases we no longer have any overflow.
+		 */
+		av->av_overflow	= 0;
+	}
+
+	/*
+	 * The peer has acknowledged up to and including ack_ackno. Hence the
+	 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+	 */
+	av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+		list_del(&avr->avr_node);
+		kmem_cache_free(dccp_ackvec_record_slab, avr);
+	}
+}
+
+/*
+ *	Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
+{
+	struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+	if (new == NULL)
+		return -ENOBUFS;
+	new->vec   = vec;
+	new->len   = len;
+	new->nonce = nonce;
+
+	list_add_tail(&new->node, head);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+	struct dccp_ackvec_parsed *cur, *next;
+
+	list_for_each_entry_safe(cur, next, parsed_chunks, node)
+		kfree(cur);
+	INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
+
+int __init dccp_ackvec_init(void)
+{
+	dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
+					     sizeof(struct dccp_ackvec), 0,
+					     SLAB_HWCACHE_ALIGN, NULL);
+	if (dccp_ackvec_slab == NULL)
+		goto out_err;
+
+	dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+					     sizeof(struct dccp_ackvec_record),
+					     0, SLAB_HWCACHE_ALIGN, NULL);
+	if (dccp_ackvec_record_slab == NULL)
+		goto out_destroy_slab;
+
+	return 0;
+
+out_destroy_slab:
+	kmem_cache_destroy(dccp_ackvec_slab);
+	dccp_ackvec_slab = NULL;
+out_err:
+	DCCP_CRIT("Unable to create Ack Vector slab cache");
+	return -ENOBUFS;
+}
+
+void dccp_ackvec_exit(void)
+{
+	if (dccp_ackvec_slab != NULL) {
+		kmem_cache_destroy(dccp_ackvec_slab);
+		dccp_ackvec_slab = NULL;
+	}
+	if (dccp_ackvec_record_slab != NULL) {
+		kmem_cache_destroy(dccp_ackvec_record_slab);
+		dccp_ackvec_record_slab = NULL;
+	}
+}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
new file mode 100644
index 00000000..e2ab0627
--- /dev/null
+++ b/net/dccp/ackvec.h
@@ -0,0 +1,136 @@
+#ifndef _ACKVEC_H
+#define _ACKVEC_H
+/*
+ *  net/dccp/ackvec.h
+ *
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
+ */
+#define DCCPAV_NUM_ACKVECS	2
+#define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
+
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN	16
+
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH	(DCCPAV_MAX_ACKVEC_LEN / 8)
+
+enum dccp_ackvec_states {
+	DCCPAV_RECEIVED =	0x00,
+	DCCPAV_ECN_MARKED =	0x40,
+	DCCPAV_RESERVED =	0x80,
+	DCCPAV_NOT_RECEIVED =	0xC0
+};
+#define DCCPAV_MAX_RUNLEN	0x3F
+
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+	return *cell & DCCPAV_MAX_RUNLEN;
+}
+
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+	return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/** struct dccp_ackvec - Ack Vector main data structure
+ *
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
+ *
+ * @av_buf:	   circular buffer storage area
+ * @av_buf_head:   head index; begin of live portion in @av_buf
+ * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest  seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
+ *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
+ * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
+ */
+struct dccp_ackvec {
+	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
+	u16			av_buf_head;
+	u16			av_buf_tail;
+	u64			av_buf_ackno:48;
+	u64			av_tail_ackno:48;
+	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
+	u8			av_overflow:1;
+	struct list_head	av_records;
+};
+
+/** struct dccp_ackvec_record - Records information about sent Ack Vectors
+ *
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
+ *
+ * @avr_node:	    the list node in @av_records
+ * @avr_ack_seqno:  sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno:  the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr:    pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce:  the sum of @av_buf_nonce's at the time this record was sent
+ *
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
+ */
+struct dccp_ackvec_record {
+	struct list_head avr_node;
+	u64		 avr_ack_seqno:48;
+	u64		 avr_ack_ackno:48;
+	u16		 avr_ack_ptr;
+	u8		 avr_ack_runlen;
+	u8		 avr_ack_nonce:1;
+};
+
+extern int dccp_ackvec_init(void);
+extern void dccp_ackvec_exit(void);
+
+extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
+extern void dccp_ackvec_free(struct dccp_ackvec *av);
+
+extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
+extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
+extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
+
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
+{
+	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
+}
+
+/**
+ * struct dccp_ackvec_parsed  -  Record offsets of Ack Vectors in skb
+ * @vec:	start of vector (offset into skb)
+ * @len:	length of @vec
+ * @nonce:	whether @vec had an ECN nonce of 0 or 1
+ * @node:	FIFO - arranged in descending order of ack_ackno
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+	u8		 *vec,
+			 len,
+			 nonce:1;
+	struct list_head node;
+};
+
+extern int dccp_ackvec_parsed_add(struct list_head *head,
+				  u8 *vec, u8 len, u8 nonce);
+extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
+#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
new file mode 100644
index 00000000..36479ca6
--- /dev/null
+++ b/net/dccp/ccid.c
@@ -0,0 +1,222 @@
+/*
+ *  net/dccp/ccid.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  CCID infrastructure
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+
+#include "ccid.h"
+#include "ccids/lib/tfrc.h"
+
+static struct ccid_operations *ccids[] = {
+	&ccid2_ops,
+#ifdef CONFIG_IP_DCCP_CCID3
+	&ccid3_ops,
+#endif
+};
+
+static struct ccid_operations *ccid_by_number(const u8 id)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ccids); i++)
+		if (ccids[i]->ccid_id == id)
+			return ccids[i];
+	return NULL;
+}
+
+/* check that up to @array_len members in @ccid_array are supported */
+bool ccid_support_check(u8 const *ccid_array, u8 array_len)
+{
+	while (array_len > 0)
+		if (ccid_by_number(ccid_array[--array_len]) == NULL)
+			return false;
+	return true;
+}
+
+/**
+ * ccid_get_builtin_ccids  -  Populate a list of built-in CCIDs
+ * @ccid_array: set to a newly kmalloc'ed array holding the built-in CCID ids
+ * @array_len: set to the number of entries written to @ccid_array
+ * Returns 0, or -ENOBUFS if allocation fails. Caller must kfree() the array.
+ */
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
+{
+	*ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
+	if (*ccid_array == NULL)
+		return -ENOBUFS;
+
+	for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
+		(*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
+	return 0;
+}
+
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+				  char __user *optval, int __user *optlen)
+{
+	u8 *ccid_array, array_len;
+	int err = 0;
+
+	if (ccid_get_builtin_ccids(&ccid_array, &array_len))
+		return -ENOBUFS;
+
+	if (put_user(array_len, optlen))
+		err = -EFAULT;
+	else if (len > 0 && copy_to_user(optval, ccid_array,
+					 len > array_len ? array_len : len))
+		err = -EFAULT;
+
+	kfree(ccid_array);
+	return err;
+}
+
+static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
+{
+	struct kmem_cache *slab;
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
+	va_end(args);
+
+	slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
+				 SLAB_HWCACHE_ALIGN, NULL);
+	return slab;
+}
+
+static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
+{
+	if (slab != NULL)
+		kmem_cache_destroy(slab);
+}
+
+static int ccid_activate(struct ccid_operations *ccid_ops)
+{
+	int err = -ENOBUFS;
+
+	ccid_ops->ccid_hc_rx_slab =
+			ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
+					       ccid_ops->ccid_hc_rx_slab_name,
+					       "ccid%u_hc_rx_sock",
+					       ccid_ops->ccid_id);
+	if (ccid_ops->ccid_hc_rx_slab == NULL)
+		goto out;
+
+	ccid_ops->ccid_hc_tx_slab =
+			ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
+					       ccid_ops->ccid_hc_tx_slab_name,
+					       "ccid%u_hc_tx_sock",
+					       ccid_ops->ccid_id);
+	if (ccid_ops->ccid_hc_tx_slab == NULL)
+		goto out_free_rx_slab;
+
+	pr_info("CCID: Activated CCID %d (%s)\n",
+		ccid_ops->ccid_id, ccid_ops->ccid_name);
+	err = 0;
+out:
+	return err;
+out_free_rx_slab:
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+	ccid_ops->ccid_hc_rx_slab = NULL;
+	goto out;
+}
+
+static void ccid_deactivate(struct ccid_operations *ccid_ops)
+{
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
+	ccid_ops->ccid_hc_tx_slab = NULL;
+	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+	ccid_ops->ccid_hc_rx_slab = NULL;
+
+	pr_info("CCID: Deactivated CCID %d (%s)\n",
+		ccid_ops->ccid_id, ccid_ops->ccid_name);
+}
+
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
+{
+	struct ccid_operations *ccid_ops = ccid_by_number(id);
+	struct ccid *ccid = NULL;
+
+	if (ccid_ops == NULL)
+		goto out;
+
+	ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
+				     ccid_ops->ccid_hc_tx_slab, gfp_any());
+	if (ccid == NULL)
+		goto out;
+	ccid->ccid_ops = ccid_ops;
+	if (rx) {
+		memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
+		if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
+		    ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
+			goto out_free_ccid;
+	} else {
+		memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
+		if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
+		    ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
+			goto out_free_ccid;
+	}
+out:
+	return ccid;
+out_free_ccid:
+	kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
+			ccid_ops->ccid_hc_tx_slab, ccid);
+	ccid = NULL;
+	goto out;
+}
+
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid != NULL) {
+		if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
+			ccid->ccid_ops->ccid_hc_rx_exit(sk);
+		kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
+	}
+}
+
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid != NULL) {
+		if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
+			ccid->ccid_ops->ccid_hc_tx_exit(sk);
+		kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
+	}
+}
+
+int __init ccid_initialize_builtins(void)
+{
+	int i, err = tfrc_lib_init();
+
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(ccids); i++) {
+		err = ccid_activate(ccids[i]);
+		if (err)
+			goto unwind_registrations;
+	}
+	return 0;
+
+unwind_registrations:
+	while(--i >= 0)
+		ccid_deactivate(ccids[i]);
+	tfrc_lib_exit();
+	return err;
+}
+
+void ccid_cleanup_builtins(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ccids); i++)
+		ccid_deactivate(ccids[i]);
+	tfrc_lib_exit();
+}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
new file mode 100644
index 00000000..75c3582a
--- /dev/null
+++ b/net/dccp/ccid.h
@@ -0,0 +1,265 @@
+#ifndef _CCID_H
+#define _CCID_H
+/*
+ *  net/dccp/ccid.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  CCID infrastructure
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <linux/compiler.h>
+#include <linux/dccp.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+/* maximum value for a CCID (RFC 4340, 19.5) */
+#define CCID_MAX		255
+#define CCID_SLAB_NAME_LENGTH	32
+
+struct tcp_info;
+
+/**
+ *  struct ccid_operations  -  Interface to Congestion-Control Infrastructure
+ *
+ *  @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
+ *  @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
+ *  @ccid_name: alphabetical identifier string for @ccid_id
+ *  @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
+ *  @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
+ *
+ *  @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
+ *  @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
+ *  @ccid_hc_rx_packet_recv: implements the HC-receiver side
+ *  @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
+ *  @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
+ *  @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
+ *  @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
+ *  @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
+ *  @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
+ *  @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
+ */
+struct ccid_operations {
+	unsigned char		ccid_id;
+	__u32			ccid_ccmps;
+	const char		*ccid_name;
+	struct kmem_cache	*ccid_hc_rx_slab,
+				*ccid_hc_tx_slab;
+	char			ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
+	char			ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
+	__u32			ccid_hc_rx_obj_size,
+				ccid_hc_tx_obj_size;
+	/* Interface Routines */
+	int		(*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
+	int		(*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
+	void		(*ccid_hc_rx_exit)(struct sock *sk);
+	void		(*ccid_hc_tx_exit)(struct sock *sk);
+	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
+						  struct sk_buff *skb);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
+	int		(*ccid_hc_rx_insert_options)(struct sock *sk,
+						     struct sk_buff *skb);
+	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
+						  struct sk_buff *skb);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
+	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
+						  struct sk_buff *skb);
+	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
+						  unsigned int len);
+	void		(*ccid_hc_rx_get_info)(struct sock *sk,
+					       struct tcp_info *info);
+	void		(*ccid_hc_tx_get_info)(struct sock *sk,
+					       struct tcp_info *info);
+	int		(*ccid_hc_rx_getsockopt)(struct sock *sk,
+						 const int optname, int len,
+						 u32 __user *optval,
+						 int __user *optlen);
+	int		(*ccid_hc_tx_getsockopt)(struct sock *sk,
+						 const int optname, int len,
+						 u32 __user *optval,
+						 int __user *optlen);
+};
+
+extern struct ccid_operations ccid2_ops;
+#ifdef CONFIG_IP_DCCP_CCID3
+extern struct ccid_operations ccid3_ops;
+#endif
+
+extern int  ccid_initialize_builtins(void);
+extern void ccid_cleanup_builtins(void);
+
+struct ccid {
+	struct ccid_operations *ccid_ops;	/* operations of this CCID */
+	char		       ccid_priv[];	/* CCID-private data follows */
+};
+
+static inline void *ccid_priv(const struct ccid *ccid)
+{
+	return (void *)ccid->ccid_priv;
+}
+
+extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+extern int  ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+					  char __user *, int __user *);
+
+extern struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
+
+static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
+{
+	struct ccid *ccid = dp->dccps_hc_rx_ccid;
+
+	if (ccid == NULL || ccid->ccid_ops == NULL)
+		return -1;
+	return ccid->ccid_ops->ccid_id;
+}
+
+static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
+{
+	struct ccid *ccid = dp->dccps_hc_tx_ccid;
+
+	if (ccid == NULL || ccid->ccid_ops == NULL)
+		return -1;
+	return ccid->ccid_ops->ccid_id;
+}
+
+extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
+extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing    (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
+	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs  */
+	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode */
+	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode */
+	CCID_PACKET_ERR =		 0xF0000,  /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+	if (return_code < 0)
+		return CCID_PACKET_ERR;
+	if (return_code == 0)
+		return CCID_PACKET_SEND_AT_ONCE;
+	if (return_code <= CCID_PACKET_DELAY_MAX)
+		return CCID_PACKET_DELAY;
+	return return_code;
+}
+
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+					 struct sk_buff *skb)
+{
+	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
+		return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+	return CCID_PACKET_SEND_AT_ONCE;
+}
+
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+					  unsigned int len)
+{
+	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
+		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
+}
+
+static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
+					  struct sk_buff *skb)
+{
+	if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
+		ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
+}
+
+static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
+					  struct sk_buff *skb)
+{
+	if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
+		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
+}
+
+/**
+ * ccid_hc_tx_parse_options  -  Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
+static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
+					   u8 pkt, u8 opt, u8 *val, u8 len)
+{
+	if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
+}
+
+/**
+ * ccid_hc_rx_parse_options  -  Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
+static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
+					   u8 pkt, u8 opt, u8 *val, u8 len)
+{
+	if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
+}
+
+static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+					    struct sk_buff *skb)
+{
+	if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
+		return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
+	return 0;
+}
+
+static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
+				       struct tcp_info *info)
+{
+	if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
+		ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
+}
+
+static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
+				       struct tcp_info *info)
+{
+	if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
+		ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
+}
+
+static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
+					const int optname, int len,
+					u32 __user *optval, int __user *optlen)
+{
+	int rc = -ENOPROTOOPT;
+	if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
+		rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
+						 optval, optlen);
+	return rc;
+}
+
+static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
+					const int optname, int len,
+					u32 __user *optval, int __user *optlen)
+{
+	int rc = -ENOPROTOOPT;
+	if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
+		rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
+						 optval, optlen);
+	return rc;
+}
+#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
new file mode 100644
index 00000000..0581143c
--- /dev/null
+++ b/net/dccp/ccids/Kconfig
@@ -0,0 +1,55 @@
+menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+
+config IP_DCCP_CCID2_DEBUG
+	bool "CCID-2 debugging messages"
+	---help---
+	  Enable CCID-2 specific debugging messages.
+
+	  The debugging output can additionally be toggled by setting the
+	  ccid2_debug parameter to 0 or 1.
+
+	  If in doubt, say N.
+
+config IP_DCCP_CCID3
+	bool "CCID-3 (TCP-Friendly) (EXPERIMENTAL)"
+	def_bool y if (IP_DCCP = y || IP_DCCP = m)
+	---help---
+	  CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+	  rate-controlled congestion control mechanism.  TFRC is designed to
+	  be reasonably fair when competing for bandwidth with TCP-like flows,
+	  where a flow is "reasonably fair" if its sending rate is generally
+	  within a factor of two of the sending rate of a TCP flow under the
+	  same conditions.  However, TFRC has a much lower variation of
+	  throughput over time compared with TCP, which makes CCID-3 more
+	  suitable than CCID-2 for applications such as streaming media where a
+	  relatively smooth sending rate is of importance.
+
+	  CCID-3 is further described in RFC 4342,
+	  http://www.ietf.org/rfc/rfc4342.txt
+
+	  The TFRC congestion control algorithms were initially described in
+	  RFC 3448, which has since been obsoleted by RFC 5348.
+
+	  This text was extracted from RFC 4340 (sec. 10.2),
+	  http://www.ietf.org/rfc/rfc4340.txt
+
+	  If in doubt, say N.
+
+config IP_DCCP_CCID3_DEBUG
+	bool "CCID-3 debugging messages"
+	depends on IP_DCCP_CCID3
+	---help---
+	  Enable CCID-3 specific debugging messages.
+
+	  The debugging output can additionally be toggled by setting the
+	  ccid3_debug parameter to 0 or 1.
+
+	  If in doubt, say N.
+
+config IP_DCCP_TFRC_LIB
+	def_bool y if IP_DCCP_CCID3
+
+config IP_DCCP_TFRC_DEBUG
+	def_bool y if IP_DCCP_CCID3_DEBUG
+endmenu
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
new file mode 100644
index 00000000..fadecd20
--- /dev/null
+++ b/net/dccp/ccids/ccid2.c
@@ -0,0 +1,671 @@
+/*
+ *  Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *  Changes to meet Linux coding standards, and DCCP infrastructure fixes.
+ *
+ *  Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This implementation should follow RFC 4341
+ */
+#include <linux/slab.h>
+#include "../feat.h"
+#include "ccid2.h"
+
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+static int ccid2_debug;
+#define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
+#else
+#define ccid2_pr_debug(format, a...)
+#endif
+
+static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
+{
+	struct ccid2_seq *seqp;
+	int i;
+
+	/* check if we have space to preserve the pointer to the buffer */
+	if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
+			       sizeof(struct ccid2_seq *)))
+		return -ENOMEM;
+
+	/* allocate buffer and initialize linked list */
+	seqp = kmalloc(CCID2_SEQBUF_LEN * sizeof(struct ccid2_seq), gfp_any());
+	if (seqp == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
+		seqp[i].ccid2s_next = &seqp[i + 1];
+		seqp[i + 1].ccid2s_prev = &seqp[i];
+	}
+	seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
+	seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+
+	/* This is the first allocation.  Initiate the head and tail.  */
+	if (hc->tx_seqbufc == 0)
+		hc->tx_seqh = hc->tx_seqt = seqp;
+	else {
+		/* link the existing list with the one we just created */
+		hc->tx_seqh->ccid2s_next = seqp;
+		seqp->ccid2s_prev = hc->tx_seqh;
+
+		hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
+	}
+
+	/* store the original pointer to the buffer so we can free it */
+	hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
+	hc->tx_seqbufc++;
+
+	return 0;
+}
+
+static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
+{
+	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+		return CCID_PACKET_WILL_DEQUEUE_LATER;
+	return CCID_PACKET_SEND_AT_ONCE;
+}
+
+static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
+
+	/*
+	 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
+	 * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
+	 * acceptable since this causes starvation/deadlock whenever cwnd < 2.
+	 * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled).
+	 */
+	if (val == 0 || val > max_ratio) {
+		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
+		val = max_ratio;
+	}
+	if (val > DCCPF_ACK_RATIO_MAX)
+		val = DCCPF_ACK_RATIO_MAX;
+
+	if (val == dp->dccps_l_ack_ratio)
+		return;
+
+	ccid2_pr_debug("changing local ack ratio to %u\n", val);
+	dp->dccps_l_ack_ratio = val;
+}
+
+static void ccid2_hc_tx_rto_expire(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
+		goto out;
+	}
+
+	ccid2_pr_debug("RTO_EXPIRE\n");
+
+	/* back-off timer */
+	hc->tx_rto <<= 1;
+	if (hc->tx_rto > DCCP_RTO_MAX)
+		hc->tx_rto = DCCP_RTO_MAX;
+
+	/* adjust pipe, cwnd etc */
+	hc->tx_ssthresh = hc->tx_cwnd / 2;
+	if (hc->tx_ssthresh < 2)
+		hc->tx_ssthresh = 2;
+	hc->tx_cwnd	= 1;
+	hc->tx_pipe	= 0;
+
+	/* clear state about stuff we sent */
+	hc->tx_seqt = hc->tx_seqh;
+	hc->tx_packets_acked = 0;
+
+	/* clear ack ratio state. */
+	hc->tx_rpseq    = 0;
+	hc->tx_rpdupack = -1;
+	ccid2_change_l_ack_ratio(sk, 1);
+
+	/* if we were blocked before, we may now send cwnd=1 packet */
+	if (sender_was_blocked)
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	/* restart backed-off timer */
+	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	struct ccid2_seq *next;
+
+	hc->tx_pipe++;
+
+	hc->tx_seqh->ccid2s_seq   = dp->dccps_gss;
+	hc->tx_seqh->ccid2s_acked = 0;
+	hc->tx_seqh->ccid2s_sent  = ccid2_time_stamp;
+
+	next = hc->tx_seqh->ccid2s_next;
+	/* check if we need to alloc more space */
+	if (next == hc->tx_seqt) {
+		if (ccid2_hc_tx_alloc_seq(hc)) {
+			DCCP_CRIT("packet history - out of memory!");
+			/* FIXME: find a more graceful way to bail out */
+			return;
+		}
+		next = hc->tx_seqh->ccid2s_next;
+		BUG_ON(next == hc->tx_seqt);
+	}
+	hc->tx_seqh = next;
+
+	ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
+
+	/*
+	 * FIXME: The code below is broken and the variables have been removed
+	 * from the socket struct. The `ackloss' variable was always set to 0,
+	 * and with arsent there are several problems:
+	 *  (i) it doesn't just count the number of Acks, but all sent packets;
+	 *  (ii) it is expressed in # of packets, not # of windows, so the
+	 *  comparison below uses the wrong formula: Appendix A of RFC 4341
+	 *  comes up with the number K = cwnd / (R^2 - R) of consecutive windows
+	 *  of data with no lost or marked Ack packets. If arsent were the # of
+	 *  consecutive Acks received without loss, then Ack Ratio needs to be
+	 *  decreased by 1 when
+	 *	      arsent >=  K * cwnd / R  =  cwnd^2 / (R^3 - R^2)
+	 *  where cwnd / R is the number of Acks received per window of data
+	 *  (cf. RFC 4341, App. A). The problems are that
+	 *  - arsent counts other packets as well;
+	 *  - the comparison uses a formula different from RFC 4341;
+	 *  - computing a cubic/quadratic equation each time is too complicated.
+	 *  Hence a different algorithm is needed.
+	 */
+#if 0
+	/* Ack Ratio.  Need to maintain a concept of how many windows we sent */
+	hc->tx_arsent++;
+	/* We had an ack loss in this window... */
+	if (hc->tx_ackloss) {
+		if (hc->tx_arsent >= hc->tx_cwnd) {
+			hc->tx_arsent  = 0;
+			hc->tx_ackloss = 0;
+		}
+	} else {
+		/* No acks lost up to now... */
+		/* decrease ack ratio if enough packets were sent */
+		if (dp->dccps_l_ack_ratio > 1) {
+			/* XXX don't calculate denominator each time */
+			int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
+				    dp->dccps_l_ack_ratio;
+
+			denom = hc->tx_cwnd * hc->tx_cwnd / denom;
+
+			if (hc->tx_arsent >= denom) {
+				ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
+				hc->tx_arsent = 0;
+			}
+		} else {
+			/* we can't increase ack ratio further [1] */
+			hc->tx_arsent = 0; /* or maybe set it to cwnd*/
+		}
+	}
+#endif
+
+	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+	do {
+		struct ccid2_seq *seqp = hc->tx_seqt;
+
+		while (seqp != hc->tx_seqh) {
+			ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
+				       (unsigned long long)seqp->ccid2s_seq,
+				       seqp->ccid2s_acked, seqp->ccid2s_sent);
+			seqp = seqp->ccid2s_next;
+		}
+	} while (0);
+	ccid2_pr_debug("=========\n");
+#endif
+}
+
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * @sk:   socket whose CCID-2 TX state (tx_srtt, tx_mdev, tx_mdev_max,
+ *        tx_rttvar, tx_rtt_seq, tx_rto) is updated
+ * @mrtt: new RTT sample; a value of 0 is replaced by 1 before use
+ *
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ *   which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ *
+ * The resulting tx_rto is capped at DCCP_RTO_MAX.
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	/* Never use 0: tx_srtt == 0 doubles as the "no sample yet" marker below */
+	long m = mrtt ? : 1;
+
+	if (hc->tx_srtt == 0) {
+		/* First measurement m */
+		hc->tx_srtt = m << 3;
+		hc->tx_mdev = m << 1;
+
+		/* Start RTTVAR no lower than the value returned by tcp_rto_min() */
+		hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+		hc->tx_rttvar   = hc->tx_mdev_max;
+
+		/* Remember current GSS; RTTVAR decays only once GAR passes it */
+		hc->tx_rtt_seq  = dccp_sk(sk)->dccps_gss;
+	} else {
+		/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+		m -= (hc->tx_srtt >> 3);
+		hc->tx_srtt += m;
+
+		/* Similarly, update scaled mdev with regard to |m| */
+		if (m < 0) {
+			m = -m;
+			m -= (hc->tx_mdev >> 2);
+			/*
+			 * This neutralises RTO increase when RTT < SRTT - mdev
+			 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+			 * in Linux TCP", USENIX 2002, pp. 49-62).
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (hc->tx_mdev >> 2);
+		}
+		hc->tx_mdev += m;
+
+		/* Track the per-flight maximum of mdev; RTTVAR follows it upwards */
+		if (hc->tx_mdev > hc->tx_mdev_max) {
+			hc->tx_mdev_max = hc->tx_mdev;
+			if (hc->tx_mdev_max > hc->tx_rttvar)
+				hc->tx_rttvar = hc->tx_mdev_max;
+		}
+
+		/*
+		 * Decay RTTVAR at most once per flight, exploiting that
+		 *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
+		 *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
+		 * GAR is a useful bound for FlightSize = pipe.
+		 * AWL is probably too low here, as it over-estimates pipe.
+		 */
+		if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+			/* Move RTTVAR a quarter of the way down towards mdev_max */
+			if (hc->tx_mdev_max < hc->tx_rttvar)
+				hc->tx_rttvar -= (hc->tx_rttvar -
+						  hc->tx_mdev_max) >> 2;
+			/* Start a new flight: new marker, reset the running maximum */
+			hc->tx_rtt_seq  = dccp_sk(sk)->dccps_gss;
+			hc->tx_mdev_max = tcp_rto_min(sk);
+		}
+	}
+
+	/*
+	 * Set RTO from SRTT and RTTVAR
+	 * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+	 * This agrees with RFC 4341, 5:
+	 *	"Because DCCP does not retransmit data, DCCP does not require
+	 *	 TCP's recommended minimum timeout of one second".
+	 */
+	hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
+
+	/* Enforce the upper bound on the retransmission timeout */
+	if (hc->tx_rto > DCCP_RTO_MAX)
+		hc->tx_rto = DCCP_RTO_MAX;
+}
+
+/**
+ * ccid2_new_ack - Grow cwnd for a newly acknowledged packet and sample the RTT
+ * @sk:      socket owning the CCID-2 TX state
+ * @seqp:    sequence-list entry of the packet that has just been acknowledged
+ * @maxincr: number of cwnd increments still permitted in slow-start for the
+ *           Ack being processed; decremented on each slow-start increment
+ */
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+			  unsigned int *maxincr)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+	if (hc->tx_cwnd < hc->tx_ssthresh) {
+		/*
+		 * Slow-start: grow cwnd by one for every second acked packet.
+		 * The counter is shared with congestion avoidance and can
+		 * already be >= 2 when cwnd falls back below ssthresh. Testing
+		 * for equality would then never match until the u32 counter
+		 * wraps, stalling cwnd growth; hence test with >=.
+		 */
+		if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
+			hc->tx_cwnd += 1;
+			*maxincr    -= 1;
+			hc->tx_packets_acked = 0;
+		}
+	} else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
+		/* Congestion avoidance: one increment per cwnd acked packets */
+		hc->tx_cwnd += 1;
+		hc->tx_packets_acked = 0;
+	}
+	/*
+	 * FIXME: RTT is sampled several times per acknowledgment (for each
+	 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+	 * This causes the RTT to be over-estimated, since the older entries
+	 * in the Ack Vector have earlier sending times.
+	 * The cleanest solution is to not use the ccid2s_sent field at all
+	 * and instead use DCCP timestamps: requires changes in other places.
+	 */
+	ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
+}
+
+/*
+ * React to a congestion indication for the packet described by @seqp:
+ * halve cwnd (never below 1), set ssthresh to the new cwnd (never below 2)
+ * and shrink the local Ack Ratio if it exceeds the new cwnd.
+ * Packets sent before the last recorded congestion event are ignored.
+ */
+static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	const s32 sent_after_cong = (s32)(seqp->ccid2s_sent - hc->tx_last_cong);
+	u32 new_cwnd;
+
+	if (sent_after_cong < 0) {
+		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
+		return;
+	}
+
+	hc->tx_last_cong = ccid2_time_stamp;
+
+	new_cwnd = hc->tx_cwnd / 2;
+	if (new_cwnd == 0)
+		new_cwnd = 1U;
+
+	hc->tx_cwnd     = new_cwnd;
+	hc->tx_ssthresh = max(new_cwnd, 2U);
+
+	/* An Ack Ratio larger than cwnd would provoke spurious timeouts */
+	if (dccp_sk(sk)->dccps_l_ack_ratio > new_cwnd)
+		ccid2_change_l_ack_ratio(sk, new_cwnd);
+}
+
+/*
+ * Option hook of the CCID-2 sender: Ack Vector options (nonce 0 or 1) are
+ * queued on tx_av_chunks via dccp_ackvec_parsed_add(), whose result is
+ * returned; every other option is accepted with return value 0.
+ */
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+	if (option != DCCPO_ACK_VECTOR_0 && option != DCCPO_ACK_VECTOR_1)
+		return 0;
+
+	return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+				      option - DCCPO_ACK_VECTOR_0);
+}
+
+/**
+ * ccid2_hc_tx_packet_recv - Process the feedback carried by a received packet
+ * @sk:  socket the packet arrived on
+ * @skb: the received packet
+ *
+ * The function works in stages:
+ *  1) reverse path: follow the peer's sequence numbers and double the local
+ *     Ack Ratio once NUMDUPACK later-than-expected packets were counted;
+ *  2) forward path: walk the Ack Vectors queued on tx_av_chunks and mark the
+ *     matching entries of the TX sequence list as acknowledged, calling
+ *     ccid2_new_ack() or, for ECN-marked cells, ccid2_congestion_event();
+ *  3) if NUMDUPACK acknowledged entries are found at or below tx_high_ack,
+ *     treat older unacknowledged entries as lost;
+ *  4) trim acknowledged entries off the tail, stop or re-arm the RTO timer,
+ *     and schedule the xmit tasklet if the sender is no longer cwnd-limited.
+ *
+ * NOTE(review): the early return for packets without an Ack skips
+ * dccp_ackvec_parsed_cleanup(); presumably no Ack Vector options are queued
+ * for such packets - confirm against the option parser.
+ */
+static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	/* Sample the cwnd-limited state before any of the updates below */
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+	struct dccp_ackvec_parsed *avp;
+	u64 ackno, seqno;
+	struct ccid2_seq *seqp;
+	int done = 0;
+	unsigned int maxincr = 0;
+
+	/* check reverse path congestion */
+	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+
+	/* XXX this whole "algorithm" is broken.  Need to fix it to keep track
+	 * of the seqnos of the dupacks so that rpseq and rpdupack are correct
+	 * -sorbo.
+	 */
+	/* need to bootstrap */
+	if (hc->tx_rpdupack == -1) {
+		hc->tx_rpdupack = 0;
+		hc->tx_rpseq    = seqno;
+	} else {
+		/* check if packet is consecutive */
+		if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
+			hc->tx_rpseq = seqno;
+		/* it's a later packet */
+		else if (after48(seqno, hc->tx_rpseq)) {
+			hc->tx_rpdupack++;
+
+			/* check if we got enough dupacks */
+			if (hc->tx_rpdupack >= NUMDUPACK) {
+				hc->tx_rpdupack = -1; /* XXX lame */
+				hc->tx_rpseq    = 0;
+
+				ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
+			}
+		}
+	}
+
+	/* check forward path congestion */
+	if (dccp_packet_without_ack(skb))
+		return;
+
+	/* still didn't send out new data packets */
+	if (hc->tx_seqh == hc->tx_seqt)
+		goto done;
+
+	/* Keep track of the highest Ack number received so far */
+	ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+	if (after48(ackno, hc->tx_high_ack))
+		hc->tx_high_ack = ackno;
+
+	/*
+	 * Starting at the tail, advance to the first entry whose seqno is not
+	 * before ackno; stop at the entry preceding the head at the latest.
+	 */
+	seqp = hc->tx_seqt;
+	while (before48(seqp->ccid2s_seq, ackno)) {
+		seqp = seqp->ccid2s_next;
+		if (seqp == hc->tx_seqh) {
+			seqp = hc->tx_seqh->ccid2s_prev;
+			break;
+		}
+	}
+
+	/*
+	 * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
+	 * packets per acknowledgement. Rounding up avoids that cwnd is not
+	 * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
+	 */
+	if (hc->tx_cwnd < hc->tx_ssthresh)
+		maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
+
+	/* go through all ack vectors */
+	list_for_each_entry(avp, &hc->tx_av_chunks, node) {
+		/* go through this ack vector */
+		/* (this consumes the chunk: avp->len and avp->vec are modified) */
+		for (; avp->len--; avp->vec++) {
+			/* lowest seqno covered by the run length of this cell */
+			u64 ackno_end_rl = SUB48(ackno,
+						 dccp_ackvec_runlen(avp->vec));
+
+			ccid2_pr_debug("ackvec %llu |%u,%u|\n",
+				       (unsigned long long)ackno,
+				       dccp_ackvec_state(avp->vec) >> 6,
+				       dccp_ackvec_runlen(avp->vec));
+			/* if the seqno we are analyzing is larger than the
+			 * current ackno, then move towards the tail of our
+			 * seqnos.
+			 */
+			while (after48(seqp->ccid2s_seq, ackno)) {
+				if (seqp == hc->tx_seqt) {
+					done = 1;
+					break;
+				}
+				seqp = seqp->ccid2s_prev;
+			}
+			if (done)
+				break;
+
+			/* check all seqnos in the range of the vector
+			 * run length
+			 */
+			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
+				const u8 state = dccp_ackvec_state(avp->vec);
+
+				/* new packet received or marked */
+				if (state != DCCPAV_NOT_RECEIVED &&
+				    !seqp->ccid2s_acked) {
+					if (state == DCCPAV_ECN_MARKED)
+						ccid2_congestion_event(sk,
+								       seqp);
+					else
+						ccid2_new_ack(sk, seqp,
+							      &maxincr);
+
+					seqp->ccid2s_acked = 1;
+					ccid2_pr_debug("Got ack for %llu\n",
+						       (unsigned long long)seqp->ccid2s_seq);
+					hc->tx_pipe--;
+				}
+				if (seqp == hc->tx_seqt) {
+					done = 1;
+					break;
+				}
+				seqp = seqp->ccid2s_prev;
+			}
+			if (done)
+				break;
+
+			/* the next cell starts right below the current run */
+			ackno = SUB48(ackno_end_rl, 1);
+		}
+		if (done)
+			break;
+	}
+
+	/* The state about what is acked should be correct now
+	 * Check for NUMDUPACK
+	 */
+	seqp = hc->tx_seqt;
+	while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
+		seqp = seqp->ccid2s_next;
+		if (seqp == hc->tx_seqh) {
+			seqp = hc->tx_seqh->ccid2s_prev;
+			break;
+		}
+	}
+	/* From here on 'done' counts acked entries while walking to the tail */
+	done = 0;
+	while (1) {
+		if (seqp->ccid2s_acked) {
+			done++;
+			if (done == NUMDUPACK)
+				break;
+		}
+		if (seqp == hc->tx_seqt)
+			break;
+		seqp = seqp->ccid2s_prev;
+	}
+
+	/* If there are at least 3 acknowledgements, anything unacknowledged
+	 * below the last sequence number is considered lost
+	 */
+	if (done == NUMDUPACK) {
+		struct ccid2_seq *last_acked = seqp;
+
+		/* check for lost packets */
+		while (1) {
+			if (!seqp->ccid2s_acked) {
+				ccid2_pr_debug("Packet lost: %llu\n",
+					       (unsigned long long)seqp->ccid2s_seq);
+				/* XXX need to traverse from tail -> head in
+				 * order to detect multiple congestion events in
+				 * one ack vector.
+				 */
+				ccid2_congestion_event(sk, seqp);
+				hc->tx_pipe--;
+			}
+			if (seqp == hc->tx_seqt)
+				break;
+			seqp = seqp->ccid2s_prev;
+		}
+
+		/* everything older than last_acked is accounted for now */
+		hc->tx_seqt = last_acked;
+	}
+
+	/* trim acked packets in tail */
+	while (hc->tx_seqt != hc->tx_seqh) {
+		if (!hc->tx_seqt->ccid2s_acked)
+			break;
+
+		hc->tx_seqt = hc->tx_seqt->ccid2s_next;
+	}
+
+	/* restart RTO timer if not all outstanding data has been acked */
+	if (hc->tx_pipe == 0)
+		sk_stop_timer(sk, &hc->tx_rtotimer);
+	else
+		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+done:
+	/* check if incoming Acks allow pending packets to be sent */
+	if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	/* release the Ack Vector chunks queued for this packet */
+	dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
+}
+
+/*
+ * Map the sender MSS onto the RFC 3390 initial window expressed in packets
+ * (numbers as given in RFC 5681, 3.1):
+ *   smss <= 1095          -> 4 packets
+ *   1095 < smss <= 2190   -> 3 packets
+ *   smss > 2190           -> 2 packets
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+	if (smss <= 1095)
+		return 4;
+	if (smss <= 2190)
+		return 3;
+	return 2;
+}
+
+/*
+ * Set up the CCID-2 sender state of @ccid for socket @sk.
+ * Returns 0 on success, -ENOMEM if the sequence buffer cannot be allocated
+ * (cwnd, ssthresh and the Ack Ratio have already been initialised by then).
+ */
+static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
+	u32 ratio_limit;
+
+	/* RFC 4341, 5: ssthresh starts at an arbitrarily high (max) value */
+	hc->tx_ssthresh = ~0U;
+
+	/* Larger initial window as per RFC 4341, section 5 */
+	hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+
+	/* Ack Ratio must be non-zero and at most ceil(cwnd / 2) */
+	ratio_limit = DIV_ROUND_UP(hc->tx_cwnd, 2);
+	if (!dp->dccps_l_ack_ratio || dp->dccps_l_ack_ratio > ratio_limit)
+		dp->dccps_l_ack_ratio = ratio_limit;
+
+	/* XXX init ~ to window size... */
+	if (ccid2_hc_tx_alloc_seq(hc) != 0)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&hc->tx_av_chunks);
+	setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
+		    (unsigned long)sk);
+	hc->tx_last_cong = ccid2_time_stamp;
+	hc->tx_rpdupack  = -1;
+	hc->tx_rto       = DCCP_TIMEOUT_INIT;
+
+	return 0;
+}
+
+/*
+ * Tear down the CCID-2 sender state: stop the RTO timer and free every
+ * allocated sequence buffer.
+ */
+static void ccid2_hc_tx_exit(struct sock *sk)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	int idx = 0;
+
+	sk_stop_timer(sk, &hc->tx_rtotimer);
+
+	while (idx < hc->tx_seqbufc) {
+		kfree(hc->tx_seqbuf[idx]);
+		idx++;
+	}
+	hc->tx_seqbufc = 0;
+}
+
+/*
+ * Receiver side: count incoming Data/DataAck packets and send an Ack each
+ * time the count reaches the remote Ack Ratio. Other packet types are
+ * ignored.
+ */
+static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
+
+	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA &&
+	    DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK)
+		return;
+
+	hc->rx_data++;
+	if (hc->rx_data < dp->dccps_r_ack_ratio)
+		return;
+
+	dccp_send_ack(sk);
+	hc->rx_data = 0;
+}
+
+/*
+ * CCID-2 operations table: identifies the CCID (DCCPC_CCID2, "TCP-like"),
+ * gives the sizes of the per-socket TX/RX private areas and wires up the
+ * sender and receiver callbacks implemented in this file.
+ */
+struct ccid_operations ccid2_ops = {
+	.ccid_id		  = DCCPC_CCID2,
+	.ccid_name		  = "TCP-like",
+	.ccid_hc_tx_obj_size	  = sizeof(struct ccid2_hc_tx_sock),
+	.ccid_hc_tx_init	  = ccid2_hc_tx_init,
+	.ccid_hc_tx_exit	  = ccid2_hc_tx_exit,
+	.ccid_hc_tx_send_packet	  = ccid2_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	  = ccid2_hc_tx_packet_sent,
+	.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+	.ccid_hc_tx_packet_recv	  = ccid2_hc_tx_packet_recv,
+	.ccid_hc_rx_obj_size	  = sizeof(struct ccid2_hc_rx_sock),
+	.ccid_hc_rx_packet_recv	  = ccid2_hc_rx_packet_recv,
+};
+
+/* Debug builds only: expose ccid2_debug as a module parameter (mode 0644) */
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+module_param(ccid2_debug, bool, 0644);
+MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
+#endif
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
new file mode 100644
index 00000000..e9985daf
--- /dev/null
+++ b/net/dccp/ccids/ccid2.h
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID2_H_
+#define _DCCP_CCID2_H_
+
+#include <linux/timer.h>
+#include <linux/types.h>
+#include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible:
+ * ccid2_time_stamp is a plain alias for TCP's tcp_time_stamp.
+ */
+#define ccid2_time_stamp	tcp_time_stamp
+
+/*
+ * NUMDUPACK parameter from RFC 4341, p. 6
+ * (threshold used by the CCID-2 sender's loss and reverse-path checks)
+ */
+#define NUMDUPACK	3
+
+/**
+ * struct ccid2_seq - one entry of the CCID-2 sender's sequence list
+ * @ccid2s_seq:   sequence number this entry is matched against incoming Acks
+ * @ccid2s_sent:  send timestamp (ccid2_time_stamp units), used as RTT base
+ * @ccid2s_acked: non-zero once the packet has been acknowledged
+ * @ccid2s_prev:  neighbour towards the tail (older entries)
+ * @ccid2s_next:  neighbour towards the head (newer entries)
+ */
+struct ccid2_seq {
+	u64			ccid2s_seq;
+	u32			ccid2s_sent;
+	int			ccid2s_acked;
+	struct ccid2_seq	*ccid2s_prev;
+	struct ccid2_seq	*ccid2s_next;
+};
+
+/*
+ * Sequence buffer dimensions. CCID2_SEQBUF_MAX is the number of slots in
+ * ccid2_hc_tx_sock::tx_seqbuf; CCID2_SEQBUF_LEN is presumably the number of
+ * struct ccid2_seq entries per allocated buffer - confirm in ccid2.c.
+ */
+#define CCID2_SEQBUF_LEN 1024
+#define CCID2_SEQBUF_MAX 128
+
+/**
+ * struct ccid2_hc_tx_sock - CCID2 TX half connection
+ * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @tx_packets_acked:	     Ack counter for deriving cwnd growth (RFC 3465)
+ * @tx_seqbuf:		     allocated sequence buffers (freed on exit)
+ * @tx_seqbufc:		     number of buffers currently held in @tx_seqbuf
+ * @tx_seqh:		     head of the sequence list
+ * @tx_seqt:		     tail of the sequence list; equals @tx_seqh when
+ *			     no sent packets are being tracked
+ * @tx_srtt:		     smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev:		     smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max:	     maximum of @mdev during one flight
+ * @tx_rttvar:		     moving average/maximum of @mdev_max
+ * @tx_rto:		     RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq:		     to decay RTTVAR at most once per flight
+ * @tx_rtotimer:	     retransmission timeout timer
+ * @tx_rpseq:		     last consecutive seqno
+ * @tx_rpdupack:	     dupacks since rpseq
+ * @tx_last_cong:	     timestamp of the last congestion event
+ * @tx_high_ack:	     highest Ack number received so far
+ * @tx_av_chunks:	     list of Ack Vectors received on current skb
+ */
+struct ccid2_hc_tx_sock {
+	u32			tx_cwnd;
+	u32			tx_ssthresh;
+	u32			tx_pipe;
+	u32			tx_packets_acked;
+	struct ccid2_seq	*tx_seqbuf[CCID2_SEQBUF_MAX];
+	int			tx_seqbufc;
+	struct ccid2_seq	*tx_seqh;
+	struct ccid2_seq	*tx_seqt;
+
+	/* RTT measurement: variables/principles are the same as in TCP */
+	u32			tx_srtt,
+				tx_mdev,
+				tx_mdev_max,
+				tx_rttvar,
+				tx_rto;
+	u64			tx_rtt_seq:48;
+	struct timer_list	tx_rtotimer;
+
+	u64			tx_rpseq;
+	int			tx_rpdupack;
+	u32			tx_last_cong;
+	u64			tx_high_ack;
+	struct list_head	tx_av_chunks;
+};
+
+/* True when the number of packets in flight has reached (or passed) cwnd. */
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
+{
+	return !(hc->tx_pipe < hc->tx_cwnd);
+}
+
+/**
+ * struct ccid2_hc_rx_sock - CCID2 RX half connection
+ * @rx_data: data packets received since the last Ack was sent
+ */
+struct ccid2_hc_rx_sock {
+	int	rx_data;
+};
+
+/* Return the CCID-2 private area of the socket's TX half-connection CCID. */
+static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return ccid_priv(dp->dccps_hc_tx_ccid);
+}
+
+/* Return the CCID-2 private area of the socket's RX half-connection CCID. */
+static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return ccid_priv(dp->dccps_hc_rx_ccid);
+}
+#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
new file mode 100644
index 00000000..3d604e13
--- /dev/null
+++ b/net/dccp/ccids/ccid3.c
@@ -0,0 +1,866 @@
+/*
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include "../dccp.h"
+#include "ccid3.h"
+
+#include <linux/string.h>
+#include <asm/unaligned.h>
+
+/*
+ * ccid3_pr_debug(): with CONFIG_IP_DCCP_CCID3_DEBUG the message is passed to
+ * DCCP_PR_DEBUG() together with the ccid3_debug switch; without it the macro
+ * expands to nothing, so its arguments are never evaluated.
+ */
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static int ccid3_debug;
+#define ccid3_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid3_debug, format, ##a)
+#else
+#define ccid3_pr_debug(format, a...)
+#endif
+
+/*
+ *	Transmitter Half-Connection Routines
+ */
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+/* Map a sender state onto its printable name (debug output only). */
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+	static const char *const tx_names[] = {
+		[TFRC_SSTATE_FBACK]    = "FBACK",
+		[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+		[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
+	};
+	const char *label = tx_names[state];
+
+	return label;
+}
+#endif
+
+/*
+ * Switch the CCID-3 sender to @state, logging the transition in debug
+ * builds and warning if the state does not actually change.
+ */
+static void ccid3_hc_tx_set_state(struct sock *sk,
+				  enum ccid3_hc_tx_states state)
+{
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	enum ccid3_hc_tx_states prev = hc->tx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_tx_state_name(prev),
+		       ccid3_tx_state_name(state));
+	WARN_ON(prev == state);
+
+	hc->tx_state = state;
+}
+
+/*
+ * Initial sending rate X_init, computed in the manner of RFC 3390:
+ *
+ *	X_init  =  min(4 * s, max(2 * s, 4380 bytes)) / RTT
+ *
+ * RFC 3390 is phrased in terms of MSS; RFC 4342 refers to RFC 3390, and
+ * rfc3448bis (rev-02) clarifies how RFC 3390 applies to the formula above.
+ * Like the other rates in this file, the result is scaled by 2^6.
+ */
+static inline u64 rfc3390_initial_rate(struct sock *sk)
+{
+	const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	const __u32 two_s  = 2 * hc->tx_s;
+	const __u32 four_s = 4 * hc->tx_s;
+	__u32 w_init;
+
+	/* max(2 * s, 4380) first, then cap the result at 4 * s */
+	w_init = max_t(__u32, 4380U, two_s);
+	w_init = min_t(__u32, w_init, four_s);
+
+	return scaled_div(w_init << 6, hc->tx_rtt);
+}
+
+/**
+ * ccid3_update_send_interval  -  Calculate new t_ipi = s / X_inst
+ * @hc: CCID-3 sender state; reads tx_s and tx_x, writes tx_t_ipi
+ *
+ * X_inst has a granularity of 64 * bytes/second, hence s is scaled by 2^6
+ * before the division.
+ */
+static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
+{
+	const u64 scaled_s = ((u64)hc->tx_s) << 6;
+
+	hc->tx_t_ipi = scaled_div32(scaled_s, hc->tx_x);
+
+	ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+		       hc->tx_s, (unsigned)(hc->tx_x >> 6));
+}
+
+/*
+ * Number of whole RTTs between tx_t_last_win_count and @now
+ * (microsecond difference divided by the current RTT estimate).
+ */
+static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
+{
+	u32 elapsed_us = ktime_us_delta(now, hc->tx_t_last_win_count);
+	u32 rtts = elapsed_us / hc->tx_rtt;
+
+	return rtts;
+}
+
+/**
+ * ccid3_hc_tx_update_x  -  Update allowed sending rate X
+ * @sk:    socket whose CCID-3 sender state is updated
+ * @stamp: most recent time if available - can be left NULL.
+ * This function tracks draft rfc3448bis, check there for latest details.
+ *
+ * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
+ *       fine-grained resolution of sending rates. This requires scaling by 2^6
+ *       throughout the code. Only X_calc is unscaled (in bytes/second).
+ *
+ * If X changes, the inter-packet interval t_ipi is recomputed as well.
+ */
+static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
+{
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	/* default upper bound for X: twice the reported receive rate */
+	__u64 min_rate = 2 * hc->tx_x_recv;
+	const __u64 old_x = hc->tx_x;
+	/* without a caller-supplied timestamp, read the clock here */
+	ktime_t now = stamp ? *stamp : ktime_get_real();
+
+	/*
+	 * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
+	 * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
+	 * a sender is idle if it has not sent anything over a 2-RTT-period.
+	 * For consistency with X and X_recv, min_rate is also scaled by 2^6.
+	 */
+	if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
+		min_rate = rfc3390_initial_rate(sk);
+		min_rate = max(min_rate, 2 * hc->tx_x_recv);
+	}
+
+	if (hc->tx_p > 0) {
+
+		/* Loss reported: X = max(min(X_calc, min_rate), s / t_mbi) */
+		hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
+		hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+
+	} else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
+
+		/*
+		 * No loss and at least one RTT since tx_t_ld:
+		 * X = max(min(2 * X, min_rate), s / RTT); tx_t_ld is refreshed.
+		 */
+		hc->tx_x = min(2 * hc->tx_x, min_rate);
+		hc->tx_x = max(hc->tx_x,
+			       scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
+		hc->tx_t_ld = now;
+	}
+
+	if (hc->tx_x != old_x) {
+		ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
+			       "X_recv=%u\n", (unsigned)(old_x >> 6),
+			       (unsigned)(hc->tx_x >> 6), hc->tx_x_calc,
+			       (unsigned)(hc->tx_x_recv >> 6));
+
+		ccid3_update_send_interval(hc);
+	}
+}
+
+/*
+ *	Maintain the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1)
+ *	@len: DCCP packet payload size in bytes
+ *	The send interval is recomputed only when `s' actually changes.
+ */
+static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
+{
+	const u16 prev_s = hc->tx_s;
+
+	hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
+
+	if (hc->tx_s == prev_s)
+		return;
+
+	ccid3_update_send_interval(hc);
+}
+
+/*
+ *	Advance the Window Counter following the algorithm of [RFC 4342, 8.1]:
+ *	add the number of elapsed quarter-RTTs (at most 5) modulo 16.
+ *	As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
+ */
+static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
+						ktime_t now)
+{
+	u32 elapsed_us = ktime_us_delta(now, hc->tx_t_last_win_count);
+	u32 quarter_rtts = (4 * elapsed_us) / hc->tx_rtt;
+
+	/* Less than a quarter RTT elapsed: nothing to update yet */
+	if (quarter_rtts == 0)
+		return;
+
+	hc->tx_t_last_win_count = now;
+	hc->tx_last_win_count   = (hc->tx_last_win_count +
+				   min(quarter_rtts, 5U)) & 0xF; /* mod 16 */
+}
+
+/*
+ * Expiry handler of the nofeedback timer; @data carries the socket pointer.
+ *
+ * If the socket is currently owned by user context, the timer is simply
+ * re-armed with the initial t_nfb value below (USEC_PER_SEC / 5 microseconds).
+ * Outside the OPEN/PARTOPEN states the handler does nothing and does not
+ * re-arm. Otherwise the sending rate is reduced and the timer restarted.
+ *
+ * The handler ends with sock_put(); presumably this drops the reference taken
+ * when the timer was armed via sk_reset_timer() - confirm.
+ */
+static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	/* retry interval in microseconds, used if the socket is busy */
+	unsigned long t_nfb = USEC_PER_SEC / 5;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		/* XXX: set some sensible MIB */
+		goto restart_timer;
+	}
+
+	ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
+		       ccid3_tx_state_name(hc->tx_state));
+
+	/* Ignore and do not restart after leaving the established state */
+	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+		goto out;
+
+	/* Reset feedback state to "no feedback received" */
+	if (hc->tx_state == TFRC_SSTATE_FBACK)
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+
+	/*
+	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+	 * RTO is 0 if and only if no feedback has been received yet.
+	 */
+	if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
+
+		/* halve send rate directly */
+		hc->tx_x = max(hc->tx_x / 2,
+			       (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+		ccid3_update_send_interval(hc);
+	} else {
+		/*
+		 *  Modify the cached value of X_recv
+		 *
+		 *  If (X_calc > 2 * X_recv)
+		 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
+		 *  Else
+		 *    X_recv = X_calc / 4;
+		 *
+		 *  Note that X_recv is scaled by 2^6 while X_calc is not
+		 */
+		BUG_ON(hc->tx_p && !hc->tx_x_calc);
+
+		/* x_recv >> 5 is 2 * X_recv in unscaled bytes/second */
+		if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
+			hc->tx_x_recv =
+				max(hc->tx_x_recv / 2,
+				    (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
+		else {
+			/* X_calc / 4, scaled by 2^6, equals X_calc << 4 */
+			hc->tx_x_recv = hc->tx_x_calc;
+			hc->tx_x_recv <<= 4;
+		}
+		ccid3_hc_tx_update_x(sk, NULL);
+	}
+	ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
+			(unsigned long long)hc->tx_x);
+
+	/*
+	 * Set new timeout for the nofeedback timer.
+	 * See comments in packet_recv() regarding the value of t_RTO.
+	 */
+	if (unlikely(hc->tx_t_rto == 0))	/* no feedback received yet */
+		t_nfb = TFRC_INITIAL_TIMEOUT;
+	else
+		t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
+
+restart_timer:
+	sk_reset_timer(sk, &hc->tx_no_feedback_timer,
+			   jiffies + usecs_to_jiffies(t_nfb));
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/**
+ * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
+ * @sk:  socket the packet is to be sent on
+ * @skb: next packet candidate to send on @sk
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
+ *
+ * Return: -EBADMSG for a zero-length packet, the delay in milliseconds if the
+ * nominal send time is still at least TFRC_T_DELTA microseconds away, or
+ * CCID_PACKET_SEND_AT_ONCE if the packet may be sent now.
+ */
+static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	ktime_t now = ktime_get_real();
+	s64 delay;
+
+	/*
+	 * This function is called only for Data and DataAck packets. Sending
+	 * zero-sized Data(Ack)s is theoretically possible, but for congestion
+	 * control this case is pathological - ignore it.
+	 */
+	if (unlikely(skb->len == 0))
+		return -EBADMSG;
+
+	if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
+		/* First packet: arm the nofeedback timer and seed the state */
+		sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
+			       usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
+		hc->tx_last_win_count	= 0;
+		hc->tx_t_last_win_count = now;
+
+		/* Set t_0 for initial packet */
+		hc->tx_t_nom = now;
+
+		/* the first packet's length is the initial mean packet size */
+		hc->tx_s = skb->len;
+
+		/*
+		 * Use initial RTT sample when available: recommended by erratum
+		 * to RFC 4342. This implements the initialisation procedure of
+		 * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
+		 */
+		if (dp->dccps_syn_rtt) {
+			ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
+			hc->tx_rtt  = dp->dccps_syn_rtt;
+			hc->tx_x    = rfc3390_initial_rate(sk);
+			hc->tx_t_ld = now;
+		} else {
+			/*
+			 * Sender does not have RTT sample:
+			 * - set fallback RTT (RFC 4340, 3.4) since a RTT value
+			 *   is needed in several parts (e.g.  window counter);
+			 * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
+			 */
+			hc->tx_rtt = DCCP_FALLBACK_RTT;
+			hc->tx_x   = hc->tx_s;
+			hc->tx_x <<= 6;
+		}
+		ccid3_update_send_interval(hc);
+
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+
+	} else {
+		/* microseconds until the nominal send time (negative if past) */
+		delay = ktime_us_delta(hc->tx_t_nom, now);
+		ccid3_pr_debug("delay=%ld\n", (long)delay);
+		/*
+		 *	Scheduling of packet transmissions (RFC 5348, 8.3)
+		 *
+		 * if (t_now > t_nom - delta)
+		 *       // send the packet now
+		 * else
+		 *       // send the packet in (t_nom - t_now) milliseconds.
+		 */
+		if (delay >= TFRC_T_DELTA)
+			return (u32)delay / USEC_PER_MSEC;
+
+		ccid3_hc_tx_update_win_count(hc, now);
+	}
+
+	/* prepare to send now (add options etc.) */
+	dp->dccps_hc_tx_insert_options = 1;
+	DCCP_SKB_CB(skb)->dccpd_ccval  = hc->tx_last_win_count;
+
+	/* set the nominal send time for the next following packet */
+	hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
+	return CCID_PACKET_SEND_AT_ONCE;
+}
+
+/*
+ * Bookkeeping after a packet of @len bytes has been sent: update the mean
+ * packet size and record the current GSS in the TX history.
+ */
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+{
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	int err;
+
+	ccid3_hc_tx_update_s(hc, len);
+
+	err = tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss);
+	if (err != 0)
+		DCCP_CRIT("packet history - out of memory!");
+}
+
+/*
+ * Sender-side handling of an incoming Ack or DataAck: take an RTT sample from
+ * the acknowledged TX history entry, update the allowed sending rate X,
+ * wake up writers, and restart the nofeedback timer with
+ * max(t_RTO, 2 * t_ipi). Other packet types and Acks that match no history
+ * entry are ignored.
+ */
+static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	struct tfrc_tx_hist_entry *acked;
+	ktime_t now;
+	unsigned long t_nfb;
+	u32 r_sample;
+
+	/* we are only interested in ACKs */
+	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
+	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
+		return;
+	/*
+	 * Locate the acknowledged packet in the TX history.
+	 *
+	 * Returning "entry not found" here can for instance happen when
+	 *  - the host has not sent out anything (e.g. a passive server),
+	 *  - the Ack is outdated (packet with higher Ack number was received),
+	 *  - it is a bogus Ack (for a packet not sent on this connection).
+	 */
+	acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+	if (acked == NULL)
+		return;
+	/* For the sake of RTT sampling, ignore/remove all older entries */
+	tfrc_tx_hist_purge(&acked->next);
+
+	/* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+	now	  = ktime_get_real();
+	r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
+	hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
+
+	/*
+	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
+	 */
+	if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+
+		/* t_rto is still 0 until the first feedback has been handled */
+		if (hc->tx_t_rto == 0) {
+			/*
+			 * Initial feedback packet: Larger Initial Windows (4.2)
+			 */
+			hc->tx_x    = rfc3390_initial_rate(sk);
+			hc->tx_t_ld = now;
+
+			ccid3_update_send_interval(hc);
+
+			goto done_computing_x;
+		} else if (hc->tx_p == 0) {
+			/*
+			 * First feedback after nofeedback timer expiry (4.3)
+			 */
+			goto done_computing_x;
+		}
+	}
+
+	/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
+	if (hc->tx_p > 0)
+		hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
+	ccid3_hc_tx_update_x(sk, &now);
+
+done_computing_x:
+	ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
+			       "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
+			       dccp_role(sk), sk, hc->tx_rtt, r_sample,
+			       hc->tx_s, hc->tx_p, hc->tx_x_calc,
+			       (unsigned)(hc->tx_x_recv >> 6),
+			       (unsigned)(hc->tx_x >> 6));
+
+	/* unschedule no feedback timer */
+	sk_stop_timer(sk, &hc->tx_no_feedback_timer);
+
+	/*
+	 * As we have calculated new ipi, delta, t_nom it is possible
+	 * that we now can send a packet, so wake up dccp_wait_for_ccid
+	 */
+	sk->sk_write_space(sk);
+
+	/*
+	 * Update timeout interval for the nofeedback timer. In order to control
+	 * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+	 * tunable RTAX_RTO_MIN value as the lower bound.
+	 */
+	hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+				  USEC_PER_SEC/HZ * tcp_rto_min(sk));
+	/*
+	 * Schedule no feedback timer to expire in
+	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
+	 */
+	t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
+
+	ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
+		       "expire in %lu jiffies (%luus)\n",
+		       dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
+
+	sk_reset_timer(sk, &hc->tx_no_feedback_timer,
+			   jiffies + usecs_to_jiffies(t_nfb));
+}
+
+/**
+ * ccid3_hc_tx_parse_options  -  Parse TFRC feedback options at the sender
+ * @sk:          socket the packet was received on
+ * @packet_type: DCCP type of the packet carrying the option
+ * @option:      option type
+ * @optval:      option payload (may be unaligned), in network byte order
+ * @optlen:      length of @optval in bytes
+ *
+ * Handles the Receive Rate and Loss Event Rate options; both are ignored on
+ * Data packets. All other options are accepted without action.
+ *
+ * Return: 0 on success, -EINVAL if a handled option is not 4 bytes long.
+ */
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
+{
+	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	/*
+	 * Holds the option value after conversion to host byte order, hence
+	 * a plain u32 rather than the endian-annotated __be32.
+	 */
+	u32 opt_val;
+
+	switch (option) {
+	case TFRC_OPT_RECEIVE_RATE:
+	case TFRC_OPT_LOSS_EVENT_RATE:
+		/* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+		if (packet_type == DCCP_PKT_DATA)
+			break;
+		if (unlikely(optlen != 4)) {
+			DCCP_WARN("%s(%p), invalid len %d for %u\n",
+				  dccp_role(sk), sk, optlen, option);
+			return -EINVAL;
+		}
+		opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+		if (option == TFRC_OPT_RECEIVE_RATE) {
+			/* Receive Rate is kept in units of 64 bytes/second */
+			hc->tx_x_recv = opt_val;
+			hc->tx_x_recv <<= 6;
+
+			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
+				       dccp_role(sk), sk, opt_val);
+		} else {
+			/* Update the fixpoint Loss Event Rate fraction */
+			hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+				       dccp_role(sk), sk, opt_val);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Initialise the CCID-3 sender state of @ccid for @sk: empty TX history,
+ * "nothing sent yet" state and a prepared (not yet armed) nofeedback timer.
+ * Always returns 0.
+ */
+static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+	struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
+
+	setup_timer(&hc->tx_no_feedback_timer,
+		    ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
+	hc->tx_hist  = NULL;
+	hc->tx_state = TFRC_SSTATE_NO_SENT;
+
+	return 0;
+}
+
+/* Tear down the CCID-3 sender: stop the nofeedback timer, purge TX history. */
+static void ccid3_hc_tx_exit(struct sock *sk)
+{
+	struct ccid3_hc_tx_sock *tx = ccid3_hc_tx_sk(sk);
+
+	sk_stop_timer(sk, &tx->tx_no_feedback_timer);
+
+	tfrc_tx_hist_purge(&tx->tx_hist);
+}
+
+/* Report the sender's RTT estimate and t_RTO through @info. */
+static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+
+	info->tcpi_rto = hc->tx_t_rto;
+	info->tcpi_rtt = hc->tx_rtt;
+}
+
+/**
+ * ccid3_hc_tx_getsockopt  -  CCID-3 sender specific socket options
+ * @sk:      socket being queried
+ * @optname: option name; only DCCP_SOCKOPT_CCID_TX_INFO is supported
+ * @len:     size of the user buffer at @optval
+ * @optval:  user buffer receiving a struct tfrc_tx_info
+ * @optlen:  user location receiving the number of bytes written
+ *
+ * Return: 0 on success, -EINVAL if the user buffer is too small,
+ * -ENOPROTOOPT for unknown options, -EFAULT if copying to user space fails.
+ */
+static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
+				  u32 __user *optval, int __user *optlen)
+{
+	const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	struct tfrc_tx_info tfrc;
+	const void *val;
+
+	switch (optname) {
+	case DCCP_SOCKOPT_CCID_TX_INFO:
+		if (len < sizeof(tfrc))
+			return -EINVAL;
+		/*
+		 * The whole structure is copied to user space below. Zero it
+		 * first so that any padding bytes the compiler inserts do not
+		 * leak uninitialised kernel stack contents.
+		 */
+		memset(&tfrc, 0, sizeof(tfrc));
+		tfrc.tfrctx_x	   = hc->tx_x;
+		tfrc.tfrctx_x_recv = hc->tx_x_recv;
+		tfrc.tfrctx_x_calc = hc->tx_x_calc;
+		tfrc.tfrctx_rtt	   = hc->tx_rtt;
+		tfrc.tfrctx_p	   = hc->tx_p;
+		tfrc.tfrctx_rto	   = hc->tx_t_rto;
+		tfrc.tfrctx_ipi	   = hc->tx_t_ipi;
+		len = sizeof(tfrc);
+		val = &tfrc;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *	Receiver Half-Connection Routines
+ */
+
+/*
+ * CCID3 feedback types, selecting how ccid3_hc_rx_send_feedback() prepares
+ * the feedback it sends.
+ */
+enum ccid3_fback_type {
+	CCID3_FBACK_NONE = 0,		/* no feedback is sent */
+	CCID3_FBACK_INITIAL,		/* first feedback: X_recv = 0, 1/p = ~0 */
+	CCID3_FBACK_PERIODIC,		/* X_recv recomputed from bytes received */
+	CCID3_FBACK_PARAM_CHANGE	/* new loss or p > p_prev */
+};
+
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+/* Map a receiver state onto its printable name (debug output only). */
+static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
+{
+	static const char *const rx_names[] = {
+		[TFRC_RSTATE_DATA]    = "DATA",
+		[TFRC_RSTATE_NO_DATA] = "NO_DATA",
+	};
+	const char *label = rx_names[state];
+
+	return label;
+}
+#endif
+
+/*
+ * Switch the CCID-3 receiver to @state, logging the transition in debug
+ * builds and warning if the state does not actually change.
+ */
+static void ccid3_hc_rx_set_state(struct sock *sk,
+				  enum ccid3_hc_rx_states state)
+{
+	struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+	enum ccid3_hc_rx_states prev = hc->rx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_rx_state_name(prev),
+		       ccid3_rx_state_name(state));
+	WARN_ON(prev == state);
+
+	hc->rx_state = state;
+}
+
+static void ccid3_hc_rx_send_feedback(struct sock *sk,
+				      const struct sk_buff *skb,
+				      enum ccid3_fback_type fbtype)
+{
+	struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+	ktime_t now = ktime_get_real();
+	s64 delta = 0;
+
+	switch (fbtype) {
+	case CCID3_FBACK_INITIAL:
+		hc->rx_x_recv = 0;
+		hc->rx_pinv   = ~0U;   /* see RFC 4342, 8.5 */
+		break;
+	case CCID3_FBACK_PARAM_CHANGE:
+		/*
+		 * When parameters change (new loss or p > p_prev), we do not
+		 * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
+		 * need to reuse the previous value of X_recv. However, when
+		 * X_recv was 0 (due to early loss), this would kill X down to
+		 * s/t_mbi (i.e. one packet in 64 seconds).
+		 * To avoid such drastic reduction, we approximate X_recv as
+		 * the number of bytes since last feedback.
+		 * This is a safe fallback, since X is bounded above by X_calc.
+		 */
+		if (hc->rx_x_recv > 0)
+			break;
+		/* fall through */
+	case CCID3_FBACK_PERIODIC:
+		delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
+		/* same usec or stepped wall clock: use the minimum interval */
+		if (delta <= 0)
+			delta = 1;
+		hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
+		break;
+	default:		/* e.g. CCID3_FBACK_NONE: nothing to send */
+		return;
+	}
+
+	ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
+		       hc->rx_x_recv, hc->rx_pinv);
+
+	hc->rx_tstamp_last_feedback = now;	/* new interval starts here */
+	hc->rx_last_counter	    = dccp_hdr(skb)->dccph_ccval;
+	hc->rx_bytes_recv	    = 0;
+
+	dp->dccps_hc_rx_insert_options = 1;	/* set before sending the Ack */
+	dccp_send_ack(sk);
+}
+
+static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	const struct ccid3_hc_rx_sock *rx = ccid3_hc_rx_sk(sk);
+	__be32 loss_opt, rate_opt;
+
+	/* options are only added in the OPEN and PARTOPEN states ... */
+	if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+		return 0;
+	/* ... and not when dccp_packet_without_ack() matches the packet */
+	if (dccp_packet_without_ack(skb))
+		return 0;
+
+	loss_opt = htonl(rx->rx_pinv);		/* 1/p in network byte order */
+	rate_opt = htonl(rx->rx_x_recv);	/* X_recv in network byte order */
+	if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
+			       &loss_opt, sizeof(loss_opt)))
+		return -1;
+	if (dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
+			       &rate_opt, sizeof(rate_opt)))
+		return -1;
+	return 0;
+}
+
+/**
+ * ccid3_first_li  -  Implements [RFC 5348, 6.3.1]
+ *
+ * Determine the length of the first loss interval via inverse lookup.
+ * Assume that X_recv can be computed by the throughput equation
+ *		    s
+ *	X_recv = --------
+ *		 R * fval
+ * Find some p such that f(p) = fval; return 1/p (scaled), or ~0U if p == 0.
+ */
+static u32 ccid3_first_li(struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+	u32 x_recv, p, delta;
+	u64 fval;
+
+	if (hc->rx_rtt == 0) {
+		DCCP_WARN("No RTT estimate available, using fallback RTT\n");
+		hc->rx_rtt = DCCP_FALLBACK_RTT;
+	}
+
+	delta  = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback));
+	delta  = max_t(u32, delta, 1);	/* never pass a zero divisor on */
+	x_recv = scaled_div32(hc->rx_bytes_recv, delta);
+	if (x_recv == 0) {		/* would also trigger divide-by-zero */
+		DCCP_WARN("X_recv==0\n");
+		if (hc->rx_x_recv == 0) {
+			DCCP_BUG("stored value of X_recv is zero");
+			return ~0U;
+		}
+		x_recv = hc->rx_x_recv;	/* fall back to the stored rate */
+	}
+
+	fval = scaled_div(hc->rx_s, hc->rx_rtt);
+	fval = scaled_div32(fval, x_recv);
+	p = tfrc_calc_x_reverse_lookup(fval);
+	ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
+		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
+
+	return p == 0 ? ~0U : scaled_div(1, p);
+}
+
+static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+	enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
+	const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
+	const bool is_data_packet = dccp_data_packet(skb);
+
+	if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
+		if (is_data_packet) {
+			const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+			do_feedback = CCID3_FBACK_INITIAL;
+			ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+			hc->rx_s = payload;	/* seeds the packet-size average */
+			/*
+			 * Not necessary to update rx_bytes_recv here,
+			 * since X_recv = 0 for the first feedback packet (cf.
+			 * RFC 3448, 6.3) -- gerrit
+			 */
+		}
+		goto update_records;	/* skips duplicate and loss checks */
+	}
+
+	if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
+		return; /* done receiving */
+
+	if (is_data_packet) {
+		const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+		/*
+		 * Update moving-average of s and the sum of received payload bytes
+		 */
+		hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
+		hc->rx_bytes_recv += payload;
+	}
+
+	/*
+	 * Perform loss detection and handle pending losses
+	 */
+	if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
+				skb, ndp, ccid3_first_li, sk)) {
+		do_feedback = CCID3_FBACK_PARAM_CHANGE;
+		goto done_receiving;	/* RX history is not touched again */
+	}
+
+	if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
+		return; /* done receiving */
+
+	/*
+	 * Handle data packets: RTT sampling and monitoring p
+	 */
+	if (unlikely(!is_data_packet))
+		goto update_records;
+
+	if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
+		const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
+		/*
+		 * Empty loss history: no loss so far, hence p stays 0.
+		 * Sample RTT values, since an RTT estimate is required for the
+		 * computation of p when the first loss occurs; RFC 3448, 6.3.1.
+		 */
+		if (sample != 0)
+			hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
+
+	} else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
+		/*
+		 * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
+		 * has decreased (resp. p has increased), send feedback now.
+		 */
+		do_feedback = CCID3_FBACK_PARAM_CHANGE;
+	}
+
+	/*
+	 * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
+	 */
+	if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
+		do_feedback = CCID3_FBACK_PERIODIC; /* overrides PARAM_CHANGE */
+
+update_records:
+	tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
+
+done_receiving:
+	if (do_feedback)	/* anything other than CCID3_FBACK_NONE */
+		ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
+}
+
+static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *rx = ccid_priv(ccid);
+
+	tfrc_lh_init(&rx->rx_li_hist);		/* zeroed loss database */
+	rx->rx_state = TFRC_RSTATE_NO_DATA;	/* initial receiver state */
+	return tfrc_rx_hist_alloc(&rx->rx_hist);
+}
+
+static void ccid3_hc_rx_exit(struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *rx = ccid3_hc_rx_sk(sk);
+
+	tfrc_rx_hist_purge(&rx->rx_hist);	/* packet history first ... */
+	tfrc_lh_cleanup(&rx->rx_li_hist);	/* ... then loss intervals */
+}
+
+static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
+{
+	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
+	info->tcpi_rcv_rtt  = ccid3_hc_rx_sk(sk)->rx_rtt;	/* RTT estimate */
+	info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;	/* RX state */
+}
+
+static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
+				  u32 __user *optval, int __user *optlen)
+{
+	const struct ccid3_hc_rx_sock *rx = ccid3_hc_rx_sk(sk);
+	struct tfrc_rx_info info;
+
+	/*
+	 * DCCP_SOCKOPT_CCID_RX_INFO is the only option served here; the
+	 * caller's buffer has to be large enough to take the whole record.
+	 */
+	if (optname != DCCP_SOCKOPT_CCID_RX_INFO)
+		return -ENOPROTOOPT;
+	if (len < sizeof(info))
+		return -EINVAL;
+
+	info.tfrcrx_x_recv = rx->rx_x_recv;
+	info.tfrcrx_rtt    = rx->rx_rtt;
+	info.tfrcrx_p      = tfrc_invert_loss_event_rate(rx->rx_pinv);
+	len = sizeof(info);
+
+	/* first the length written, then the record itself */
+	if (put_user(len, optlen) || copy_to_user(optval, &info, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+struct ccid_operations ccid3_ops = {
+	.ccid_id		   = DCCPC_CCID3,
+	.ccid_name		   = "TCP-Friendly Rate Control",
+	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
+	.ccid_hc_tx_init	   = ccid3_hc_tx_init,	/* sender hooks */
+	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
+	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
+	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
+	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
+	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
+	.ccid_hc_tx_getsockopt	   = ccid3_hc_tx_getsockopt,
+	.ccid_hc_rx_obj_size	   = sizeof(struct ccid3_hc_rx_sock),
+	.ccid_hc_rx_init	   = ccid3_hc_rx_init,	/* receiver hooks */
+	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
+	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
+	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
+	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
+	.ccid_hc_rx_getsockopt	   = ccid3_hc_rx_getsockopt,
+};
+
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+module_param(ccid3_debug, bool, 0644);	/* mode 0644: owner rw, others r */
+MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
+#endif
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
new file mode 100644
index 00000000..1a9933c2
--- /dev/null
+++ b/net/dccp/ccids/ccid3.h
@@ -0,0 +1,160 @@
+/*
+ *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *  or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID3_H_
+#define _DCCP_CCID3_H_
+
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/tfrc.h>
+#include "lib/tfrc.h"
+#include "../ccid.h"
+
+/* Two seconds (expressed in usecs) as per RFC 5348, 4.2 */
+#define TFRC_INITIAL_TIMEOUT	   (2 * USEC_PER_SEC)
+
+/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
+#define TFRC_T_MBI		   64
+
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta  =  t_gran / 2  =  %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA		   USEC_PER_MSEC	/* 1 msec, in usecs */
+#else
+# define TFRC_T_DELTA		   (USEC_PER_SEC / (2 * HZ)) /* half a tick */
+#endif
+
+enum ccid3_options {		/* option type codes used by CCID3 */
+	TFRC_OPT_LOSS_EVENT_RATE = 192,	/* receiver sends rx_pinv (1/p) */
+	TFRC_OPT_LOSS_INTERVALS	 = 193,
+	TFRC_OPT_RECEIVE_RATE	 = 194,	/* receiver sends rx_x_recv */
+};
+
+/* TFRC sender states; numbering starts at 1, so 0 is not a valid state */
+enum ccid3_hc_tx_states {
+	TFRC_SSTATE_NO_SENT = 1,
+	TFRC_SSTATE_NO_FBACK,
+	TFRC_SSTATE_FBACK,
+};
+
+/**
+ * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
+ * @tx_x:		  Current sending rate in 64 * bytes per second
+ * @tx_x_recv:		  Receive rate in 64 * bytes per second
+ * @tx_x_calc:		  Calculated rate in bytes per second
+ * @tx_rtt:		  Estimate of current round trip time in usecs
+ * @tx_p:		  Current loss event rate (0-1) scaled by 1000000
+ * @tx_s:		  Packet size in bytes
+ * @tx_t_rto:		  Nofeedback Timer setting in usecs
+ * @tx_t_ipi:		  Interpacket (send) interval (RFC 3448, 4.6) in usecs
+ * @tx_state:		  Sender state, one of %ccid3_hc_tx_states
+ * @tx_last_win_count:	  Last window counter sent
+ * @tx_t_last_win_count:  Timestamp of earliest packet
+ *			  with last_win_count value sent
+ * @tx_no_feedback_timer: Handle to no feedback timer
+ * @tx_t_ld:		  Time last doubled during slow start
+ * @tx_t_nom:		  Nominal send time of next packet
+ * @tx_hist:		  Packet history (list head; new entries are prepended)
+ */
+struct ccid3_hc_tx_sock {
+	u64				tx_x;
+	u64				tx_x_recv;
+	u32				tx_x_calc;
+	u32				tx_rtt;
+	u32				tx_p;
+	u32				tx_t_rto;
+	u32				tx_t_ipi;
+	u16				tx_s;
+	enum ccid3_hc_tx_states		tx_state:8;	/* kept in 8 bits */
+	u8				tx_last_win_count;
+	ktime_t				tx_t_last_win_count;
+	struct timer_list		tx_no_feedback_timer;
+	ktime_t				tx_t_ld;
+	ktime_t				tx_t_nom;
+	struct tfrc_tx_hist_entry	*tx_hist;
+};
+
+static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
+{
+	struct ccid3_hc_tx_sock *priv = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
+	BUG_ON(!priv);		/* the TX private area has to exist */
+	return priv;
+}
+
+/* TFRC receiver states; numbering starts at 1, so 0 is not a valid state */
+enum ccid3_hc_rx_states {
+	TFRC_RSTATE_NO_DATA = 1,	/* set by ccid3_hc_rx_init() */
+	TFRC_RSTATE_DATA,		/* entered on the first data packet */
+};
+
+/**
+ * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
+ * @rx_last_counter:	     Tracks window counter (RFC 4342, 8.1)
+ * @rx_state:		     Receiver state, one of %ccid3_hc_rx_states
+ * @rx_bytes_recv:	     Sum of DCCP payload bytes since the last feedback
+ * @rx_x_recv:		     Receiver estimate of send rate (RFC 3448, sec. 4.3)
+ * @rx_rtt:		     Receiver estimate of RTT
+ * @rx_tstamp_last_feedback: Time at which last feedback was sent
+ * @rx_hist:		     Packet history (loss detection + RTT sampling)
+ * @rx_li_hist:		     Loss Interval database
+ * @rx_s:		     Received packet size in bytes
+ * @rx_pinv:		     Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
+ */
+struct ccid3_hc_rx_sock {
+	u8				rx_last_counter:4;	/* 4-bit field */
+	enum ccid3_hc_rx_states		rx_state:8;
+	u32				rx_bytes_recv;
+	u32				rx_x_recv;
+	u32				rx_rtt;
+	ktime_t				rx_tstamp_last_feedback;
+	struct tfrc_rx_hist		rx_hist;
+	struct tfrc_loss_hist		rx_li_hist;
+	u16				rx_s;
+#define rx_pinv				rx_li_hist.i_mean /* alias, no own field */
+};
+
+static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
+{
+	struct ccid3_hc_rx_sock *priv = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
+	BUG_ON(!priv);		/* the RX private area has to exist */
+	return priv;
+}
+
+#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
new file mode 100644
index 00000000..497723c4
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+#include <net/sock.h>
+#include "tfrc.h"
+
+static struct kmem_cache  *tfrc_lh_slab  __read_mostly; /* tfrc_loss_interval */
+/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
+static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
+
+/* map a running counter onto a ring slot, filling from the top slot down */
+static inline u8 LIH_INDEX(const u8 ctr)
+{
+	return (LIH_SIZE - 1) - ctr % LIH_SIZE;
+}
+
+/* newest record, or NULL when empty (`counter' names the next free entry) */
+static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
+{
+	return lh->counter == 0 ? NULL : lh->ring[LIH_INDEX(lh->counter - 1)];
+}
+
+/* length of I_i (rfc3448bis notation) for 0 <= i <= k; I_0 is the newest */
+static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
+{
+	BUG_ON(lh->counter <= i);
+	return lh->ring[LIH_INDEX(lh->counter - 1 - i)]->li_length;
+}
+
+/*
+ *	On-demand allocation and de-allocation of entries
+ */
+static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
+{
+	if (!lh->ring[LIH_INDEX(lh->counter)])	/* slot not populated yet */
+		lh->ring[LIH_INDEX(lh->counter)] =
+			kmem_cache_alloc(tfrc_lh_slab, GFP_ATOMIC);
+	return lh->ring[LIH_INDEX(lh->counter)];  /* NULL if allocation failed */
+}
+
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
+{
+	if (!tfrc_lh_is_initialised(lh))	/* no interval was ever added */
+		return;
+	for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++)
+		if (lh->ring[LIH_INDEX(lh->counter)] != NULL) {
+			kmem_cache_free(tfrc_lh_slab,
+					lh->ring[LIH_INDEX(lh->counter)]);
+			lh->ring[LIH_INDEX(lh->counter)] = NULL;
+		}
+	lh->counter = 0;	/* empty again: do not claim to hold records */
+}
+
+static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
+{
+	u32 len, sum_cur = 0, sum_prev = 0, weight_sum = 0;
+	int idx, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
+
+	if (k <= 0)
+		return;
+
+	for (idx = 0; idx <= k; idx++) {
+		len = tfrc_lh_get_interval(lh, idx);
+
+		if (idx > 0)	/* I_1 .. I_k, weighted by w_0 .. w_(k-1) */
+			sum_prev += len * tfrc_lh_weights[idx - 1];
+		if (idx < k) {	/* I_0 .. I_(k-1), weighted by w_0 .. w_(k-1) */
+			sum_cur    += len * tfrc_lh_weights[idx];
+			weight_sum += tfrc_lh_weights[idx];
+		}
+	}
+
+	lh->i_mean = max(sum_cur, sum_prev) / weight_sum;
+}
+
+/**
+ * tfrc_lh_update_i_mean  -  Update the `open' loss interval I_0
+ * For recomputing p: returns `true' if p > p_prev  <=>  1/p < 1/p_prev
+ */
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
+{
+	struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
+	u32 old_i_mean = lh->i_mean;
+	s64 len;
+
+	if (cur == NULL)			/* not initialised */
+		return 0;
+
+	len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
+
+	if (len - (s64)cur->li_length <= 0)	/* duplicate or reordered */
+		return 0;
+
+	if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
+		/*
+		 * Implements RFC 4342, 10.2:
+		 * If a packet S (skb) exists whose seqno comes `after' the one
+		 * starting the current loss interval (cur) and if the modulo-16
+		 * distance from C(cur) to C(S) is greater than 4, consider all
+		 * subsequent packets as belonging to a new loss interval. This
+		 * test is necessary since CCVal may wrap between intervals.
+		 */
+		cur->li_is_closed = 1;
+
+	if (tfrc_lh_length(lh) == 1)		/* due to RFC 3448, 6.3.1 */
+		return 0;
+
+	cur->li_length = len;		/* I_0 now extends up to this packet */
+	tfrc_lh_calc_i_mean(lh);
+
+	return lh->i_mean < old_i_mean;	/* smaller I_mean <=> larger p */
+}
+
+/* Does `new_loss' start a new loss interval after `cur'? [RFC 4342, 10.2] */
+static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
+				     struct tfrc_rx_hist_entry *new_loss)
+{
+	return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) <= 0 ? 0 :
+	       (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
+}
+
+/**
+ * tfrc_lh_interval_add  -  Insert new record into the Loss Interval database
+ * @lh:		   Loss Interval database
+ * @rh:		   Receive history containing a fresh loss event
+ * @calc_first_li: Caller-dependent routine to compute length of first interval
+ * @sk:		   Used by @calc_first_li in caller-specific way (subtyping)
+ * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
+ */
+int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
+			 u32 (*calc_first_li)(struct sock *), struct sock *sk)
+{
+	struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
+
+	if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
+		return 0;	/* loss belongs to the current interval */
+
+	new = tfrc_lh_demand_next(lh);
+	if (unlikely(new == NULL)) {
+		DCCP_CRIT("Cannot allocate/add loss record.");
+		return 0;
+	}
+
+	new->li_seqno	  = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
+	new->li_ccval	  = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
+	new->li_is_closed = 0;
+
+	if (++lh->counter == 1)	/* very first interval: length is computed */
+		lh->i_mean = new->li_length = (*calc_first_li)(sk);
+	else {
+		cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
+		new->li_length = dccp_delta_seqno(new->li_seqno,
+				  tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
+		if (lh->counter > (2*LIH_SIZE))	/* same value mod LIH_SIZE */
+			lh->counter -= LIH_SIZE;
+
+		tfrc_lh_calc_i_mean(lh);
+	}
+	return 1;
+}
+
+int __init tfrc_li_init(void)
+{
+	tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
+					 sizeof(struct tfrc_loss_interval), 0,
+					 SLAB_HWCACHE_ALIGN, NULL);
+	return tfrc_lh_slab ? 0 : -ENOBUFS;	/* NULL: cache not created */
+}
+
+void tfrc_li_exit(void)
+{
+	if (tfrc_lh_slab == NULL)	/* nothing to destroy */
+		return;
+	kmem_cache_destroy(tfrc_lh_slab);
+	tfrc_lh_slab = NULL;
+}
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
new file mode 100644
index 00000000..d1d2f538
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -0,0 +1,73 @@
+#ifndef _DCCP_LI_HIST_
+#define _DCCP_LI_HIST_
+/*
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ */
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/*
+ * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
+ * NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
+ */
+#define NINTERVAL	8
+#define LIH_SIZE	(NINTERVAL + 1)	/* number of ring slots */
+
+/**
+ *  tfrc_loss_interval  -  Loss history record for TFRC-based protocols
+ *  @li_seqno:		Highest received seqno before the start of loss
+ *  @li_ccval:		The CCVal belonging to @li_seqno
+ *  @li_is_closed:	Whether @li_seqno is older than 1 RTT
+ *  @li_length:		Loss interval sequence length
+ */
+struct tfrc_loss_interval {
+	u64		 li_seqno:48,	/* 48 + 4 + 1 bits share one u64 */
+			 li_ccval:4,
+			 li_is_closed:1;
+	u32		 li_length;
+};
+
+/**
+ *  tfrc_loss_hist  -  Loss record database
+ *  @ring:	Circular queue managed in LIFO manner
+ *  @counter:	Current count of entries (can be more than %LIH_SIZE)
+ *  @i_mean:	Current Average Loss Interval [RFC 3448, 5.4]
+ */
+struct tfrc_loss_hist {
+	struct tfrc_loss_interval	*ring[LIH_SIZE]; /* NULL until used */
+	u8				counter;
+	u32				i_mean;	/* 0 after tfrc_lh_init() */
+};
+
+static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
+{
+	memset(lh, 0, sizeof(*lh));	/* empty ring, counter 0, i_mean 0 */
+}
+
+static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
+{
+	return lh->counter != 0;	/* at least one interval was added */
+}
+
+static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
+{
+	return lh->counter < LIH_SIZE ? lh->counter : LIH_SIZE; /* capped */
+}
+
+struct tfrc_rx_hist;
+
+extern int  tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+				 u32 (*first_li)(struct sock *), struct sock *);
+extern u8   tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
+
+#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
new file mode 100644
index 00000000..de8fe294
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -0,0 +1,448 @@
+/*
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *  or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "packet_history.h"
+#include "../../dccp.h"
+
+/*
+ * Transmitter History Routines
+ */
+static struct kmem_cache *tfrc_tx_hist_slab;	/* tfrc_tx_hist_entry cache */
+
+int __init tfrc_tx_packet_history_init(void)
+{
+	tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
+					      sizeof(struct tfrc_tx_hist_entry),
+					      0, SLAB_HWCACHE_ALIGN, NULL);
+	return tfrc_tx_hist_slab ? 0 : -ENOBUFS; /* NULL: cache not created */
+}
+
+void tfrc_tx_packet_history_exit(void)
+{
+	if (tfrc_tx_hist_slab == NULL)	/* nothing to destroy */
+		return;
+	kmem_cache_destroy(tfrc_tx_hist_slab);
+	tfrc_tx_hist_slab = NULL;
+}
+
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
+{
+	struct tfrc_tx_hist_entry *rec = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
+
+	if (!rec)
+		return -ENOBUFS;
+	rec->seqno = seqno;
+	rec->stamp = ktime_get_real();	/* time the record was created */
+	rec->next  = *headp;		/* prepend: newest entry comes first */
+	*headp	   = rec;
+	return 0;
+}
+
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
+{
+	struct tfrc_tx_hist_entry *cur, *next;
+
+	/* walk the list, remembering each successor before its */
+	/* predecessor is handed back to the slab cache */
+	for (cur = *headp; cur != NULL; cur = next) {
+		next = cur->next;
+		kmem_cache_free(tfrc_tx_hist_slab, cur);
+	}
+
+	*headp = NULL;
+}
+
+/*
+ *	Receiver History Routines
+ */
+static struct kmem_cache *tfrc_rx_hist_slab;	/* tfrc_rx_hist_entry cache */
+
+int __init tfrc_rx_packet_history_init(void)
+{
+	tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
+					      sizeof(struct tfrc_rx_hist_entry),
+					      0, SLAB_HWCACHE_ALIGN, NULL);
+	return tfrc_rx_hist_slab ? 0 : -ENOBUFS; /* NULL: cache not created */
+}
+
+void tfrc_rx_packet_history_exit(void)
+{
+	if (tfrc_rx_hist_slab == NULL)	/* nothing to destroy */
+		return;
+	kmem_cache_destroy(tfrc_rx_hist_slab);
+	tfrc_rx_hist_slab = NULL;
+}
+
+static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
+					       const struct sk_buff *skb,
+					       const u64 ndp)
+{
+	const struct dccp_hdr *hdr = dccp_hdr(skb);
+
+	entry->tfrchrx_tstamp = ktime_get_real();	/* time of recording */
+	entry->tfrchrx_seqno  = DCCP_SKB_CB(skb)->dccpd_seq;
+	entry->tfrchrx_ccval  = hdr->dccph_ccval;
+	entry->tfrchrx_type   = hdr->dccph_type;
+	entry->tfrchrx_ndp    = ndp;
+}
+
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
+			     const struct sk_buff *skb,
+			     const u64 ndp)
+{
+	/* the entry returned by tfrc_rx_hist_last_rcv() is overwritten */
+	/* with the details of this packet */
+	tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_last_rcv(h), skb, ndp);
+}
+
+/* 1 if skb is not newer than loss_prev or matches a buffered entry, else 0 */
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
+{
+	const u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	int slot;
+
+	if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seqno) <= 0)
+		return 1;
+
+	for (slot = h->loss_count; slot > 0; slot--)
+		if (tfrc_rx_hist_entry(h, slot)->tfrchrx_seqno == seqno)
+			return 1;
+
+	return 0;
+}
+
+static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
+{
+	struct tfrc_rx_hist_entry **pa = &h->ring[tfrc_rx_hist_index(h, a)],
+				  **pb = &h->ring[tfrc_rx_hist_index(h, b)];
+	struct tfrc_rx_hist_entry *saved = *pa;
+
+	*pa = *pb;		/* only the ring pointers are exchanged */
+	*pb = saved;
+}
+
+/*
+ * Private helper functions for loss detection.
+ *
+ * In the descriptions, `Si' refers to the sequence number of entry number i,
+ * whose NDP count is `Ni' (lower case is used for variables).
+ * Note: All __xxx_loss functions expect that a test against duplicates has been
+ *       performed already: the seqno of the skb must not be less than the seqno
+ *       of loss_prev; and it must not equal that of any valid history entry.
+ */
+static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
+{
+	const u64 prev_seq = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+		  this_seq = DCCP_SKB_CB(skb)->dccpd_seq;
+
+	if (dccp_loss_free(prev_seq, this_seq, n1))	/* no gap: done */
+		return;
+	h->loss_count = 1;	/* gap between S0 and S1: remember S1 */
+	tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
+}
+
+static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n2)
+{
+	u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+	    s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+	    s2 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+	if (likely(dccp_delta_seqno(s1, s2) > 0)) {	/* S1  <  S2 */
+		h->loss_count = 2;
+		tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
+		return;
+	}
+
+	/* S0  <  S2  <  S1 */
+	/* n2 keeps the caller's full u64 NDP count, as in __do_track_loss() */
+	if (dccp_loss_free(s0, s2, n2)) {
+		u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
+
+		if (dccp_loss_free(s2, s1, n1)) {
+			/* hole is filled: S0, S2, and S1 are consecutive */
+			h->loss_count = 0;
+			h->loss_start = tfrc_rx_hist_index(h, 1);
+		} else
+			/* gap between S2 and S1: just update loss_prev */
+			tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
+
+	} else {	/* gap between S0 and S2 */
+		/*
+		 * Reorder history to insert S2 between S0 and S1
+		 */
+		tfrc_rx_hist_swap(h, 0, 3);
+		h->loss_start = tfrc_rx_hist_index(h, 3);
+		tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
+		h->loss_count = 2;
+	}
+}
+
+/* return 1 if a new loss event has been identified; n3 is skb's NDP count */
+static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n3)
+{
+	u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+	    s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+	    s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+	    s3 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+	if (likely(dccp_delta_seqno(s2, s3) > 0)) {	/* S2  <  S3 */
+		h->loss_count = 3;
+		tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
+		return 1;
+	}
+
+	/* S3  <  S2 */
+
+	if (dccp_delta_seqno(s1, s3) > 0) {		/* S1  <  S3  <  S2 */
+		/*
+		 * Reorder history to insert S3 between S1 and S2
+		 */
+		tfrc_rx_hist_swap(h, 2, 3);
+		tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
+		h->loss_count = 3;
+		return 1;
+	}
+
+	/* S0  <  S3  <  S1 */
+
+	if (dccp_loss_free(s0, s3, n3)) {
+		u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
+
+		if (dccp_loss_free(s3, s1, n1)) {
+			/* hole between S0 and S1 filled by S3 */
+			u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
+
+			if (dccp_loss_free(s1, s2, n2)) {
+				/* entire hole filled by S0, S3, S1, S2 */
+				h->loss_start = tfrc_rx_hist_index(h, 2);
+				h->loss_count = 0;
+			} else {
+				/* gap remains between S1 and S2 */
+				h->loss_start = tfrc_rx_hist_index(h, 1);
+				h->loss_count = 1;
+			}
+
+		} else /* gap exists between S3 and S1, loss_count stays at 2 */
+			tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
+
+		return 0;
+	}
+
+	/*
+	 * The remaining case:  S0  <  S3  <  S1  <  S2;  gap between S0 and S3
+	 * Reorder history to insert S3 between S0 and S1.
+	 */
+	tfrc_rx_hist_swap(h, 0, 3);
+	h->loss_start = tfrc_rx_hist_index(h, 3);
+	tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
+	h->loss_count = 3;
+
+	return 1;
+}
+
+/* recycle RX history records to continue loss detection if necessary */
+static void __three_after_loss(struct tfrc_rx_hist *h)
+{
+	/*
+	 * At this stage we know already that there is a gap between S0 and S1
+	 * (since S0 was the highest sequence number received before detecting
+	 * the loss). To recycle the loss record, it is thus only necessary to
+	 * check for other possible gaps between S1/S2 and between S2/S3.
+	 */
+	u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+	    s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+	    s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
+	u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
+	    n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
+
+	if (dccp_loss_free(s1, s2, n2)) {
+
+		if (dccp_loss_free(s2, s3, n3)) {
+			/* no gap between S2 and S3: entire hole is filled */
+			h->loss_start = tfrc_rx_hist_index(h, 3);
+			h->loss_count = 0;
+		} else {
+			/* gap between S2 and S3 */
+			h->loss_start = tfrc_rx_hist_index(h, 2);
+			h->loss_count = 1;	/* S3 stays buffered */
+		}
+
+	} else {	/* gap between S1 and S2 */
+		h->loss_start = tfrc_rx_hist_index(h, 1);
+		h->loss_count = 2;		/* S2 and S3 stay buffered */
+	}
+}
+
+/**
+ *  tfrc_rx_handle_loss  -  Loss detection and further processing
+ *  @h:		    The non-empty RX history object
+ *  @lh:	    Loss Intervals database to update
+ *  @skb:	    Currently received packet
+ *  @ndp:	    The NDP count belonging to @skb
+ *  @calc_first_li: Caller-dependent computation of first loss interval in @lh
+ *  @sk:	    Used by @calc_first_li (see tfrc_lh_interval_add)
+ *  Dispatches on @h->loss_count and, once __two_after_loss() reports a loss,
+ *  updates the LI database and recycles the RX records. Returns the result of
+ *  tfrc_lh_interval_add() (1: caller should send feedback), 0 otherwise.
+ *  Since it also takes care of reordering during loss detection and updates the
+ *  records accordingly, the caller should not perform any more RX history
+ *  operations when loss_count is greater than 0 after calling this function.
+ */
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+			struct tfrc_loss_hist *lh,
+			struct sk_buff *skb, const u64 ndp,
+			u32 (*calc_first_li)(struct sock *), struct sock *sk)
+{
+	int is_new_loss = 0;
+
+	if (h->loss_count == 0) {
+		__do_track_loss(h, skb, ndp);
+	} else if (h->loss_count == 1) {
+		__one_after_loss(h, skb, ndp);
+	} else if (h->loss_count != 2) {	/* only 0, 1 and 2 are handled */
+		DCCP_BUG("invalid loss_count %d", h->loss_count);
+	} else if (__two_after_loss(h, skb, ndp)) {
+		/*
+		 * Update Loss Interval database and recycle RX records
+		 */
+		is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
+		__three_after_loss(h);	/* leaves loss_count at 0, 1 or 2 */
+	}
+	return is_new_loss;
+}
+
+/* allocate every ring slot of @h; on failure all slots obtained here are freed */
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
+{
+	int n = 0;
+
+	while (n <= TFRC_NDUPACK) {
+		h->ring[n] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
+		if (!h->ring[n])
+			goto out_free;
+		n++;
+	}
+	h->loss_count = h->loss_start = 0;
+	return 0;
+out_free:	/* ring[n] is NULL here: release the slots below it */
+	for (n--; n >= 0; n--) {
+		kmem_cache_free(tfrc_rx_hist_slab, h->ring[n]);
+		h->ring[n] = NULL;
+	}
+	return -ENOBUFS;
+}
+
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
+{
+	struct tfrc_rx_hist_entry **slot;
+
+	for (slot = h->ring; slot <= &h->ring[TFRC_NDUPACK]; slot++)
+		if (*slot) {	/* skip slots that hold no entry */
+			kmem_cache_free(tfrc_rx_hist_slab, *slot);
+			*slot = NULL;
+		}
+}
+
+/**
+ * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
+{
+	return h->ring[0];	/* the reference entry is kept in slot 0 */
+}
+
+/**
+ * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
+{
+	return h->ring[h->rtt_sample_prev];	/* rtt_sample_prev aliases loss_start */
+}
+
+/**
+ * tfrc_rx_hist_sample_rtt  -  Sample RTT from timestamp / CCVal
+ * Based on ideas presented in RFC 4342, 8.1. Returns the sample in usec (capped
+ * at DCCP_SANE_RTT_MAX), or 0 if none could be computed - callers must check.
+ */
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
+{
+	u32 sample = 0,
+	    delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
+			    tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+
+	if (delta_v < 1 || delta_v > 4) {	/* unsuitable CCVal delta */
+		if (h->rtt_sample_prev == 2) {	/* previous candidate stored */
+			sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+				       tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+			if (sample)	/* NOTE(review): 4 / sample is integer division */
+				sample = 4 / sample *	/* 4/3 == 1, and 0 for deltas > 4 */
+				         ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
+							tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
+			else    /*
+				 * FIXME: This condition is in principle not
+				 * possible but occurs when CCID is used for
+				 * two-way data traffic. I have tried to trace
+				 * it, but the cause does not seem to be here.
+				 */
+				DCCP_BUG("please report to dccp@vger.kernel.org"
+					 " => prev = %u, last = %u",
+					 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+					 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+		} else if (delta_v < 1) {	/* same CCVal as the reference entry */
+			h->rtt_sample_prev = 1;
+			goto keep_ref_for_next_time;
+		}
+
+	} else if (delta_v == 4) /* optimal match */
+		sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
+	else {			 /* suboptimal match */
+		h->rtt_sample_prev = 2;
+		goto keep_ref_for_next_time;
+	}
+
+	if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
+		DCCP_WARN("RTT sample %u too large, using max\n", sample);
+		sample = DCCP_SANE_RTT_MAX;
+	}
+
+	h->rtt_sample_prev = 0;	       /* use current entry as next reference */
+keep_ref_for_next_time:
+
+	return sample;
+}
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
new file mode 100644
index 00000000..7ee4a9d9
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -0,0 +1,158 @@
+/*
+ *  Packet RX/TX history data structures and routines for TFRC-based protocols.
+ *
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *  or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DCCP_PKT_HIST_
+#define _DCCP_PKT_HIST_
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "tfrc.h"
+
+/**
+ *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
+ *  @next:  next oldest entry (LIFO order)
+ *  @seqno: sequence number of this entry
+ *  @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+	struct tfrc_tx_hist_entry *next;
+	u64			  seqno;
+	ktime_t			  stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+	for (; head && head->seqno != seqno; head = head->next)
+		;	/* walk until a match or the end of the list (NULL) */
+	return head;
+}
+
+extern int  tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
+extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
+
+/* Subtraction a-b modulo-16, respects circular wrap-around */
+#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
+
+/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
+#define TFRC_NDUPACK 3
+
+/**
+ * tfrc_rx_hist_entry - Store information about a single received packet
+ * @tfrchrx_seqno:	DCCP packet sequence number
+ * @tfrchrx_ccval:	window counter value of packet (RFC 4342, 8.1)
+ * @tfrchrx_ndp:	the NDP count (if any) of the packet
+ * @tfrchrx_tstamp:	actual receive time of packet
+ */
+struct tfrc_rx_hist_entry {
+	u64		 tfrchrx_seqno:48,
+			 tfrchrx_ccval:4,
+			 tfrchrx_type:4;	/* NOTE(review): undocumented; presumably the DCCP packet type */
+	u64		 tfrchrx_ndp:48;
+	ktime_t		 tfrchrx_tstamp;
+};
+
+/**
+ * tfrc_rx_hist  -  RX history structure for TFRC-based protocols
+ * @ring:		Packet history for RTT sampling and loss detection
+ * @loss_count:		Number of entries in circular history
+ * @loss_start:		Movable index (for loss detection)
+ * @rtt_sample_prev:	Used during RTT sampling, points to candidate entry
+ */
+struct tfrc_rx_hist {
+	struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
+	u8			  loss_count:2,
+				  loss_start:2;
+#define rtt_sample_prev		  loss_start	/* alias: shares the loss_start bits */
+};
+
+/**
+ * tfrc_rx_hist_index - ring index of the n-th entry after loss_start
+ */
+static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
+{
+	return (h->loss_start + n) & TFRC_NDUPACK;	/* mask wraps within ring[] */
+}
+
+/**
+ * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
+{
+	return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
+}
+
+/**
+ * tfrc_rx_hist_entry - return the n-th history entry after loss_start
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
+{
+	return h->ring[tfrc_rx_hist_index(h, n)];
+}
+
+/**
+ * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
+ */
+static inline struct tfrc_rx_hist_entry *
+			tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
+{
+	return h->ring[h->loss_start];
+}
+
+/* indicate whether previously a packet was detected missing */
+static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
+{
+	return h->loss_count > 0;
+}
+
+extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
+				    const struct sk_buff *skb, const u64 ndp);
+
+extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
+
+struct tfrc_loss_hist;
+extern int  tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+				struct tfrc_loss_hist *lh,
+				struct sk_buff *skb, const u64 ndp,
+				u32 (*first_li)(struct sock *sk),
+				struct sock *sk);
+extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h,
+				   const struct sk_buff *skb);
+extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
+extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
+
+#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
new file mode 100644
index 00000000..49020298
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -0,0 +1,44 @@
+/*
+ * TFRC library initialisation
+ *
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ */
+#include "tfrc.h"
+
+#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
+int tfrc_debug;
+module_param(tfrc_debug, bool, 0644);
+MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
+#endif
+
+int __init tfrc_lib_init(void)
+{
+	int rc = tfrc_li_init();
+
+	if (rc)
+		goto out;
+
+	rc = tfrc_tx_packet_history_init();
+	if (rc)
+		goto out_free_loss_intervals;
+
+	rc = tfrc_rx_packet_history_init();
+	if (rc)
+		goto out_free_tx_history;
+	return 0;
+
+out_free_tx_history:
+	tfrc_tx_packet_history_exit();
+out_free_loss_intervals:
+	tfrc_li_exit();
+out:
+	return rc;
+}
+
+void tfrc_lib_exit(void)
+{
+	tfrc_rx_packet_history_exit();
+	tfrc_tx_packet_history_exit();
+	tfrc_li_exit();
+}
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
new file mode 100644
index 00000000..f8ee3f54
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -0,0 +1,77 @@
+#ifndef _TFRC_H_
+#define _TFRC_H_
+/*
+ *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
+ *  Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *  Copyright (c) 2005   Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *  Copyright (c) 2003   Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <linux/math64.h>
+#include "../../dccp.h"
+
+/* internal includes that this library exports: */
+#include "loss_interval.h"
+#include "packet_history.h"
+
+#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
+extern int tfrc_debug;
+#define tfrc_pr_debug(format, a...)	DCCP_PR_DEBUG(tfrc_debug, format, ##a)
+#else
+#define tfrc_pr_debug(format, a...)
+#endif
+
+/* integer-arithmetic divisions of type (a * 1000000)/b; a zero @b is a BUG */
+static inline u64 scaled_div(u64 a, u64 b)
+{
+	BUG_ON(b == 0);
+	return div64_u64(a * 1000000, b);	/* NOTE: a * 1000000 wraps for a > ~1.8e13 */
+}
+
+static inline u32 scaled_div32(u64 a, u64 b)
+{
+	u64 result = scaled_div(a, b);
+
+	if (result > UINT_MAX) {	/* saturate instead of truncating to u32 */
+		DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
+			  (unsigned long long)a, (unsigned long long)b);
+		return UINT_MAX;
+	}
+	return result;
+}
+
+/**
+ * tfrc_ewma  -  Exponentially weighted moving average
+ * @weight: Weight of the old @avg, in units of 1/10 (@newval gets 10 - @weight)
+ */
+static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
+{
+	return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;	/* avg == 0: newval */
+}
+
+extern u32  tfrc_calc_x(u16 s, u32 R, u32 p);
+extern u32  tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern u32  tfrc_invert_loss_event_rate(u32 loss_event_rate);
+
+extern int  tfrc_tx_packet_history_init(void);
+extern void tfrc_tx_packet_history_exit(void);
+extern int  tfrc_rx_packet_history_init(void);
+extern void tfrc_rx_packet_history_exit(void);
+
+extern int  tfrc_li_init(void);
+extern void tfrc_li_exit(void);
+
+#ifdef CONFIG_IP_DCCP_TFRC_LIB
+extern int  tfrc_lib_init(void);
+extern void tfrc_lib_exit(void);
+#else
+#define tfrc_lib_init() (0)
+#define tfrc_lib_exit()
+#endif
+#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
new file mode 100644
index 00000000..a052a437
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -0,0 +1,703 @@
+/*
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include "../../dccp.h"
+#include "tfrc.h"
+
+#define TFRC_CALC_X_ARRSIZE 500
+#define TFRC_CALC_X_SPLIT   50000	/* 0.05 * 1000000, details below */
+#define TFRC_SMALLEST_P	    (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
+
+/*
+  TFRC TCP Reno Throughput Equation Lookup Table for f(p)
+
+  The following two-column lookup table implements a part of the TCP throughput
+  equation from [RFC 3448, sec. 3.1]:
+
+				     s
+  X_calc  =  --------------------------------------------------------------
+	     R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
+
+  Where:
+	X      is the transmit rate in bytes/second
+	s      is the packet size in bytes
+	R      is the round trip time in seconds
+	p      is the loss event rate, between 0 and 1.0, of the number of loss
+		      events as a fraction of the number of packets transmitted
+	t_RTO  is the TCP retransmission timeout value in seconds
+	b      is the number of packets acknowledged by a single TCP ACK
+
+  We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
+
+				     s
+  X_calc  =  -------------------------------------------------------
+	     R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
+
+  which we can break down into:
+
+		      s
+	X_calc  =  ---------
+		    R * f(p)
+
+  where f(p) is given for 0 < p <= 1 by:
+
+	f(p)  =  sqrt(2*p/3) + 12 * sqrt(3*p/8) *  (p + 32*p^3)
+
+  Since this is kernel code, floating-point arithmetic is avoided in favour of
+  integer arithmetic. This means that nearly all fractional parameters are
+  scaled by 1000000:
+    * the parameters p and R
+    * the return result f(p)
+  The lookup table therefore actually tabulates the following function g(q):
+
+	g(q)  =  1000000 * f(q/1000000)
+
+  Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
+  granularity for the practically more relevant case of small values of p (up to
+  5%), the second column is used; the first one ranges up to 100%.  This split
+  corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
+  determines the smallest resolution possible with this lookup table:
+
+    TFRC_SMALLEST_P   =  TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
+
+  The entire table is generated by:
+    for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
+	lookup[i][0]  =  g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
+	lookup[i][1]  =  g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
+    }
+
+  With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
+    lookup[0][0]  =  g(1000000/(M+1))		= 1000000 * f(0.2%)
+    lookup[M][0]  =  g(1000000)			= 1000000 * f(100%)
+    lookup[0][1]  =  g(TFRC_SMALLEST_P)		= 1000000 * f(0.01%)
+    lookup[M][1]  =  g(TFRC_CALC_X_SPLIT)	= 1000000 * f(5%)
+
+  In summary, the two columns represent f(p) for the following ranges:
+    * The first column is for   0.002  <= p <= 1.0
+    * The second column is for  0.0001 <= p <= 0.05
+  Where the columns overlap, the second (finer-grained) is given preference,
+  i.e. the first column is used only for p >= 0.05.
+ */
+static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
+	{     37172,   8172 },
+	{     53499,  11567 },
+	{     66664,  14180 },
+	{     78298,  16388 },
+	{     89021,  18339 },
+	{     99147,  20108 },
+	{    108858,  21738 },
+	{    118273,  23260 },
+	{    127474,  24693 },
+	{    136520,  26052 },
+	{    145456,  27348 },
+	{    154316,  28589 },
+	{    163130,  29783 },
+	{    171919,  30935 },
+	{    180704,  32049 },
+	{    189502,  33130 },
+	{    198328,  34180 },
+	{    207194,  35202 },
+	{    216114,  36198 },
+	{    225097,  37172 },
+	{    234153,  38123 },
+	{    243294,  39055 },
+	{    252527,  39968 },
+	{    261861,  40864 },
+	{    271305,  41743 },
+	{    280866,  42607 },
+	{    290553,  43457 },
+	{    300372,  44293 },
+	{    310333,  45117 },
+	{    320441,  45929 },
+	{    330705,  46729 },
+	{    341131,  47518 },
+	{    351728,  48297 },
+	{    362501,  49066 },
+	{    373460,  49826 },
+	{    384609,  50577 },
+	{    395958,  51320 },
+	{    407513,  52054 },
+	{    419281,  52780 },
+	{    431270,  53499 },
+	{    443487,  54211 },
+	{    455940,  54916 },
+	{    468635,  55614 },
+	{    481581,  56306 },
+	{    494785,  56991 },
+	{    508254,  57671 },
+	{    521996,  58345 },
+	{    536019,  59014 },
+	{    550331,  59677 },
+	{    564939,  60335 },
+	{    579851,  60988 },
+	{    595075,  61636 },
+	{    610619,  62279 },
+	{    626491,  62918 },
+	{    642700,  63553 },
+	{    659253,  64183 },
+	{    676158,  64809 },
+	{    693424,  65431 },
+	{    711060,  66050 },
+	{    729073,  66664 },
+	{    747472,  67275 },
+	{    766266,  67882 },
+	{    785464,  68486 },
+	{    805073,  69087 },
+	{    825103,  69684 },
+	{    845562,  70278 },
+	{    866460,  70868 },
+	{    887805,  71456 },
+	{    909606,  72041 },
+	{    931873,  72623 },
+	{    954614,  73202 },
+	{    977839,  73778 },
+	{   1001557,  74352 },
+	{   1025777,  74923 },
+	{   1050508,  75492 },
+	{   1075761,  76058 },
+	{   1101544,  76621 },
+	{   1127867,  77183 },
+	{   1154739,  77741 },
+	{   1182172,  78298 },
+	{   1210173,  78852 },
+	{   1238753,  79405 },
+	{   1267922,  79955 },
+	{   1297689,  80503 },
+	{   1328066,  81049 },
+	{   1359060,  81593 },
+	{   1390684,  82135 },
+	{   1422947,  82675 },
+	{   1455859,  83213 },
+	{   1489430,  83750 },
+	{   1523671,  84284 },
+	{   1558593,  84817 },
+	{   1594205,  85348 },
+	{   1630518,  85878 },
+	{   1667543,  86406 },
+	{   1705290,  86932 },
+	{   1743770,  87457 },
+	{   1782994,  87980 },
+	{   1822973,  88501 },
+	{   1863717,  89021 },
+	{   1905237,  89540 },
+	{   1947545,  90057 },
+	{   1990650,  90573 },
+	{   2034566,  91087 },
+	{   2079301,  91600 },
+	{   2124869,  92111 },
+	{   2171279,  92622 },
+	{   2218543,  93131 },
+	{   2266673,  93639 },
+	{   2315680,  94145 },
+	{   2365575,  94650 },
+	{   2416371,  95154 },
+	{   2468077,  95657 },
+	{   2520707,  96159 },
+	{   2574271,  96660 },
+	{   2628782,  97159 },
+	{   2684250,  97658 },
+	{   2740689,  98155 },
+	{   2798110,  98651 },
+	{   2856524,  99147 },
+	{   2915944,  99641 },
+	{   2976382, 100134 },
+	{   3037850, 100626 },
+	{   3100360, 101117 },
+	{   3163924, 101608 },
+	{   3228554, 102097 },
+	{   3294263, 102586 },
+	{   3361063, 103073 },
+	{   3428966, 103560 },
+	{   3497984, 104045 },
+	{   3568131, 104530 },
+	{   3639419, 105014 },
+	{   3711860, 105498 },
+	{   3785467, 105980 },
+	{   3860253, 106462 },
+	{   3936229, 106942 },
+	{   4013410, 107422 },
+	{   4091808, 107902 },
+	{   4171435, 108380 },
+	{   4252306, 108858 },
+	{   4334431, 109335 },
+	{   4417825, 109811 },
+	{   4502501, 110287 },
+	{   4588472, 110762 },
+	{   4675750, 111236 },
+	{   4764349, 111709 },
+	{   4854283, 112182 },
+	{   4945564, 112654 },
+	{   5038206, 113126 },
+	{   5132223, 113597 },
+	{   5227627, 114067 },
+	{   5324432, 114537 },
+	{   5422652, 115006 },
+	{   5522299, 115474 },
+	{   5623389, 115942 },
+	{   5725934, 116409 },
+	{   5829948, 116876 },
+	{   5935446, 117342 },
+	{   6042439, 117808 },
+	{   6150943, 118273 },
+	{   6260972, 118738 },
+	{   6372538, 119202 },
+	{   6485657, 119665 },
+	{   6600342, 120128 },
+	{   6716607, 120591 },
+	{   6834467, 121053 },
+	{   6953935, 121514 },
+	{   7075025, 121976 },
+	{   7197752, 122436 },
+	{   7322131, 122896 },
+	{   7448175, 123356 },
+	{   7575898, 123815 },
+	{   7705316, 124274 },
+	{   7836442, 124733 },
+	{   7969291, 125191 },
+	{   8103877, 125648 },
+	{   8240216, 126105 },
+	{   8378321, 126562 },
+	{   8518208, 127018 },
+	{   8659890, 127474 },
+	{   8803384, 127930 },
+	{   8948702, 128385 },
+	{   9095861, 128840 },
+	{   9244875, 129294 },
+	{   9395760, 129748 },
+	{   9548529, 130202 },
+	{   9703198, 130655 },
+	{   9859782, 131108 },
+	{  10018296, 131561 },
+	{  10178755, 132014 },
+	{  10341174, 132466 },
+	{  10505569, 132917 },
+	{  10671954, 133369 },
+	{  10840345, 133820 },
+	{  11010757, 134271 },
+	{  11183206, 134721 },
+	{  11357706, 135171 },
+	{  11534274, 135621 },
+	{  11712924, 136071 },
+	{  11893673, 136520 },
+	{  12076536, 136969 },
+	{  12261527, 137418 },
+	{  12448664, 137867 },
+	{  12637961, 138315 },
+	{  12829435, 138763 },
+	{  13023101, 139211 },
+	{  13218974, 139658 },
+	{  13417071, 140106 },
+	{  13617407, 140553 },
+	{  13819999, 140999 },
+	{  14024862, 141446 },
+	{  14232012, 141892 },
+	{  14441465, 142339 },
+	{  14653238, 142785 },
+	{  14867346, 143230 },
+	{  15083805, 143676 },
+	{  15302632, 144121 },
+	{  15523842, 144566 },
+	{  15747453, 145011 },
+	{  15973479, 145456 },
+	{  16201939, 145900 },
+	{  16432847, 146345 },
+	{  16666221, 146789 },
+	{  16902076, 147233 },
+	{  17140429, 147677 },
+	{  17381297, 148121 },
+	{  17624696, 148564 },
+	{  17870643, 149007 },
+	{  18119154, 149451 },
+	{  18370247, 149894 },
+	{  18623936, 150336 },
+	{  18880241, 150779 },
+	{  19139176, 151222 },
+	{  19400759, 151664 },
+	{  19665007, 152107 },
+	{  19931936, 152549 },
+	{  20201564, 152991 },
+	{  20473907, 153433 },
+	{  20748982, 153875 },
+	{  21026807, 154316 },
+	{  21307399, 154758 },
+	{  21590773, 155199 },
+	{  21876949, 155641 },
+	{  22165941, 156082 },
+	{  22457769, 156523 },
+	{  22752449, 156964 },
+	{  23049999, 157405 },
+	{  23350435, 157846 },
+	{  23653774, 158287 },
+	{  23960036, 158727 },
+	{  24269236, 159168 },
+	{  24581392, 159608 },
+	{  24896521, 160049 },
+	{  25214642, 160489 },
+	{  25535772, 160929 },
+	{  25859927, 161370 },
+	{  26187127, 161810 },
+	{  26517388, 162250 },
+	{  26850728, 162690 },
+	{  27187165, 163130 },
+	{  27526716, 163569 },
+	{  27869400, 164009 },
+	{  28215234, 164449 },
+	{  28564236, 164889 },
+	{  28916423, 165328 },
+	{  29271815, 165768 },
+	{  29630428, 166208 },
+	{  29992281, 166647 },
+	{  30357392, 167087 },
+	{  30725779, 167526 },
+	{  31097459, 167965 },
+	{  31472452, 168405 },
+	{  31850774, 168844 },
+	{  32232445, 169283 },
+	{  32617482, 169723 },
+	{  33005904, 170162 },
+	{  33397730, 170601 },
+	{  33792976, 171041 },
+	{  34191663, 171480 },
+	{  34593807, 171919 },
+	{  34999428, 172358 },
+	{  35408544, 172797 },
+	{  35821174, 173237 },
+	{  36237335, 173676 },
+	{  36657047, 174115 },
+	{  37080329, 174554 },
+	{  37507197, 174993 },
+	{  37937673, 175433 },
+	{  38371773, 175872 },
+	{  38809517, 176311 },
+	{  39250924, 176750 },
+	{  39696012, 177190 },
+	{  40144800, 177629 },
+	{  40597308, 178068 },
+	{  41053553, 178507 },
+	{  41513554, 178947 },
+	{  41977332, 179386 },
+	{  42444904, 179825 },
+	{  42916290, 180265 },
+	{  43391509, 180704 },
+	{  43870579, 181144 },
+	{  44353520, 181583 },
+	{  44840352, 182023 },
+	{  45331092, 182462 },
+	{  45825761, 182902 },
+	{  46324378, 183342 },
+	{  46826961, 183781 },
+	{  47333531, 184221 },
+	{  47844106, 184661 },
+	{  48358706, 185101 },
+	{  48877350, 185541 },
+	{  49400058, 185981 },
+	{  49926849, 186421 },
+	{  50457743, 186861 },
+	{  50992759, 187301 },
+	{  51531916, 187741 },
+	{  52075235, 188181 },
+	{  52622735, 188622 },
+	{  53174435, 189062 },
+	{  53730355, 189502 },
+	{  54290515, 189943 },
+	{  54854935, 190383 },
+	{  55423634, 190824 },
+	{  55996633, 191265 },
+	{  56573950, 191706 },
+	{  57155606, 192146 },
+	{  57741621, 192587 },
+	{  58332014, 193028 },
+	{  58926806, 193470 },
+	{  59526017, 193911 },
+	{  60129666, 194352 },
+	{  60737774, 194793 },
+	{  61350361, 195235 },
+	{  61967446, 195677 },
+	{  62589050, 196118 },
+	{  63215194, 196560 },
+	{  63845897, 197002 },
+	{  64481179, 197444 },
+	{  65121061, 197886 },
+	{  65765563, 198328 },
+	{  66414705, 198770 },
+	{  67068508, 199213 },
+	{  67726992, 199655 },
+	{  68390177, 200098 },
+	{  69058085, 200540 },
+	{  69730735, 200983 },
+	{  70408147, 201426 },
+	{  71090343, 201869 },
+	{  71777343, 202312 },
+	{  72469168, 202755 },
+	{  73165837, 203199 },
+	{  73867373, 203642 },
+	{  74573795, 204086 },
+	{  75285124, 204529 },
+	{  76001380, 204973 },
+	{  76722586, 205417 },
+	{  77448761, 205861 },
+	{  78179926, 206306 },
+	{  78916102, 206750 },
+	{  79657310, 207194 },
+	{  80403571, 207639 },
+	{  81154906, 208084 },
+	{  81911335, 208529 },
+	{  82672880, 208974 },
+	{  83439562, 209419 },
+	{  84211402, 209864 },
+	{  84988421, 210309 },
+	{  85770640, 210755 },
+	{  86558080, 211201 },
+	{  87350762, 211647 },
+	{  88148708, 212093 },
+	{  88951938, 212539 },
+	{  89760475, 212985 },
+	{  90574339, 213432 },
+	{  91393551, 213878 },
+	{  92218133, 214325 },
+	{  93048107, 214772 },
+	{  93883493, 215219 },
+	{  94724314, 215666 },
+	{  95570590, 216114 },
+	{  96422343, 216561 },
+	{  97279594, 217009 },
+	{  98142366, 217457 },
+	{  99010679, 217905 },
+	{  99884556, 218353 },
+	{ 100764018, 218801 },
+	{ 101649086, 219250 },
+	{ 102539782, 219698 },
+	{ 103436128, 220147 },
+	{ 104338146, 220596 },
+	{ 105245857, 221046 },
+	{ 106159284, 221495 },
+	{ 107078448, 221945 },
+	{ 108003370, 222394 },
+	{ 108934074, 222844 },
+	{ 109870580, 223294 },
+	{ 110812910, 223745 },
+	{ 111761087, 224195 },
+	{ 112715133, 224646 },
+	{ 113675069, 225097 },
+	{ 114640918, 225548 },
+	{ 115612702, 225999 },
+	{ 116590442, 226450 },
+	{ 117574162, 226902 },
+	{ 118563882, 227353 },
+	{ 119559626, 227805 },
+	{ 120561415, 228258 },
+	{ 121569272, 228710 },
+	{ 122583219, 229162 },
+	{ 123603278, 229615 },
+	{ 124629471, 230068 },
+	{ 125661822, 230521 },
+	{ 126700352, 230974 },
+	{ 127745083, 231428 },
+	{ 128796039, 231882 },
+	{ 129853241, 232336 },
+	{ 130916713, 232790 },
+	{ 131986475, 233244 },
+	{ 133062553, 233699 },
+	{ 134144966, 234153 },
+	{ 135233739, 234608 },
+	{ 136328894, 235064 },
+	{ 137430453, 235519 },
+	{ 138538440, 235975 },
+	{ 139652876, 236430 },
+	{ 140773786, 236886 },
+	{ 141901190, 237343 },
+	{ 143035113, 237799 },
+	{ 144175576, 238256 },
+	{ 145322604, 238713 },
+	{ 146476218, 239170 },
+	{ 147636442, 239627 },
+	{ 148803298, 240085 },
+	{ 149976809, 240542 },
+	{ 151156999, 241000 },
+	{ 152343890, 241459 },
+	{ 153537506, 241917 },
+	{ 154737869, 242376 },
+	{ 155945002, 242835 },
+	{ 157158929, 243294 },
+	{ 158379673, 243753 },
+	{ 159607257, 244213 },
+	{ 160841704, 244673 },
+	{ 162083037, 245133 },
+	{ 163331279, 245593 },
+	{ 164586455, 246054 },
+	{ 165848586, 246514 },
+	{ 167117696, 246975 },
+	{ 168393810, 247437 },
+	{ 169676949, 247898 },
+	{ 170967138, 248360 },
+	{ 172264399, 248822 },
+	{ 173568757, 249284 },
+	{ 174880235, 249747 },
+	{ 176198856, 250209 },
+	{ 177524643, 250672 },
+	{ 178857621, 251136 },
+	{ 180197813, 251599 },
+	{ 181545242, 252063 },
+	{ 182899933, 252527 },
+	{ 184261908, 252991 },
+	{ 185631191, 253456 },
+	{ 187007807, 253920 },
+	{ 188391778, 254385 },
+	{ 189783129, 254851 },
+	{ 191181884, 255316 },
+	{ 192588065, 255782 },
+	{ 194001698, 256248 },
+	{ 195422805, 256714 },
+	{ 196851411, 257181 },
+	{ 198287540, 257648 },
+	{ 199731215, 258115 },
+	{ 201182461, 258582 },
+	{ 202641302, 259050 },
+	{ 204107760, 259518 },
+	{ 205581862, 259986 },
+	{ 207063630, 260454 },
+	{ 208553088, 260923 },
+	{ 210050262, 261392 },
+	{ 211555174, 261861 },
+	{ 213067849, 262331 },
+	{ 214588312, 262800 },
+	{ 216116586, 263270 },
+	{ 217652696, 263741 },
+	{ 219196666, 264211 },
+	{ 220748520, 264682 },
+	{ 222308282, 265153 },
+	{ 223875978, 265625 },
+	{ 225451630, 266097 },
+	{ 227035265, 266569 },
+	{ 228626905, 267041 },
+	{ 230226576, 267514 },
+	{ 231834302, 267986 },
+	{ 233450107, 268460 },
+	{ 235074016, 268933 },
+	{ 236706054, 269407 },
+	{ 238346244, 269881 },
+	{ 239994613, 270355 },
+	{ 241651183, 270830 },
+	{ 243315981, 271305 }
+};
+
+/* return smallest index i such that fval <= lookup[i][small] (last index if none) */
+static inline u32 tfrc_binsearch(u32 fval, u8 small)
+{
+	u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
+
+	while (low < high) {
+		try = (low + high) / 2;
+		if (fval <= tfrc_calc_x_lookup[try][small])
+			high = try;		/* try may still be the answer */
+		else
+			low  = try + 1;		/* answer lies above try */
+	}
+	return high;
+}
+
+/**
+ * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
+ * @s: packet size          in bytes
+ * @R: RTT                  scaled by 1000000   (i.e., microseconds)
+ * @p: loss ratio estimate  scaled by 1000000
+ * Returns X_calc           in bytes per second (not scaled).
+ */
+u32 tfrc_calc_x(u16 s, u32 R, u32 p)
+{
+	u16 index;
+	u32 f;
+	u64 result;
+
+	/* check against invalid parameters and divide-by-zero   */
+	BUG_ON(p >  1000000);		/* p must not exceed 100%   */
+	BUG_ON(p == 0);			/* f(0) = 0, divide by zero */
+	if (R == 0) {			/* possible  divide by zero */
+		DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
+		return ~0U;
+	}
+
+	if (p <= TFRC_CALC_X_SPLIT)		{     /* 0.0000 < p <= 0.05   */
+		if (p < TFRC_SMALLEST_P) {	      /* 0.0000 < p <  0.0001 */
+			DCCP_WARN("Value of p (%u) below resolution. "
+				  "Substituting %d\n", p, TFRC_SMALLEST_P);
+			index = 0;	/* row 0 tabulates TFRC_SMALLEST_P */
+		} else				      /* 0.0001 <= p <= 0.05  */
+			index =  p/TFRC_SMALLEST_P - 1;
+
+		f = tfrc_calc_x_lookup[index][1];
+
+	} else {				      /* 0.05   <  p <= 1.00  */
+		index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
+
+		f = tfrc_calc_x_lookup[index][0];
+	}
+
+	/*
+	 * Compute X = s/(R*f(p)) in bytes per second.
+	 * Since f(p) and R are both scaled by 1000000, we need to multiply by
+	 * 1000000^2. To avoid overflow, the result is computed in two stages.
+	 * This works under almost all reasonable operational conditions, for a
+	 * wide range of parameters. Yet, should some strange combination of
+	 * parameters result in overflow, the use of scaled_div32 will catch
+	 * this and return UINT_MAX - which is a logically adequate consequence.
+	 */
+	result = scaled_div(s, R);
+	return scaled_div32(result, f);
+}
+
+/**
+ *  tfrc_calc_x_reverse_lookup  -  try to find p given f(p)
+ *  @fvalue: function value to match, scaled by 1000000
+ *  Returns closest match for p, also scaled by 1000000
+ */
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
+{
+	const int last = TFRC_CALC_X_ARRSIZE - 1;
+	int scale = 1000000;		/* range covered by the coarse column */
+	u8 col = 0;
+
+	if (fvalue == 0)	/* f(p) = 0  whenever  p = 0 */
+		return 0;
+
+	/* values outside the table are clamped with a warning */
+	if (fvalue < tfrc_calc_x_lookup[0][1]) {
+		DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
+		return TFRC_SMALLEST_P;
+	}
+	if (fvalue > tfrc_calc_x_lookup[last][0]) {
+		DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
+		return 1000000;
+	}
+
+	/* prefer the fine-grained column wherever it covers fvalue */
+	if (fvalue <= tfrc_calc_x_lookup[last][1]) {
+		scale = TFRC_CALC_X_SPLIT;
+		col   = 1;
+	}
+	return (tfrc_binsearch(fvalue, col) + 1) * scale / TFRC_CALC_X_ARRSIZE;
+}
+
+/**
+ * tfrc_invert_loss_event_rate  -  Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+	if (loss_event_rate == UINT_MAX)		/* see RFC 4342, 8.5 */
+		return 0;				/* the only input mapped to p = 0 */
+	if (unlikely(loss_event_rate == 0))		/* map 1/0 into 100% */
+		return 1000000;
+	return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);	/* 10^6 / rate, clamped from below */
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
new file mode 100644
index 00000000..5fdb0722
--- /dev/null
+++ b/net/dccp/dccp.h
@@ -0,0 +1,506 @@
+#ifndef _DCCP_H
+#define _DCCP_H
+/*
+ *  net/dccp/dccp.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *  Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/ktime.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include "ackvec.h"
+
+/*
+ * 	DCCP - specific warning and debugging macros.
+ */
+#define DCCP_WARN(fmt, a...) LIMIT_NETDEBUG(KERN_WARNING "%s: " fmt,       \
+							__func__, ##a)
+#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
+					 __FILE__, __LINE__, __func__)
+#define DCCP_BUG(a...)       do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
+#define DCCP_BUG_ON(cond)    do { if (unlikely((cond) != 0))		   \
+				     DCCP_BUG("\"%s\" holds (exception!)", \
+					      __stringify(cond));          \
+			     } while (0)
+
+#define DCCP_PRINTK(enable, fmt, args...)	do { if (enable)	     \
+							printk(fmt, ##args); \
+						} while(0)
+#define DCCP_PR_DEBUG(enable, fmt, a...)	DCCP_PRINTK(enable, KERN_DEBUG \
+						  "%s: " fmt, __func__, ##a)
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern int dccp_debug;
+#define dccp_pr_debug(format, a...)	  DCCP_PR_DEBUG(dccp_debug, format, ##a)
+#define dccp_pr_debug_cat(format, a...)   DCCP_PRINTK(dccp_debug, format, ##a)
+#define dccp_debug(fmt, a...)		  dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
+#else
+#define dccp_pr_debug(format, a...)
+#define dccp_pr_debug_cat(format, a...)
+#define dccp_debug(format, a...)
+#endif
+
+extern struct inet_hashinfo dccp_hashinfo;
+
+extern struct percpu_counter dccp_orphan_count;
+
+extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+
+/*
+ *  Set safe upper bounds for header and option length. Since Data Offset is 8
+ *  bits (RFC 4340, sec. 5.1), the total header length can never be more than
+ *  4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
+ *    - DCCP-Response with ACK Subheader and 4 bytes of Service code      OR
+ *    - DCCP-Reset    with ACK Subheader and 4 bytes of Reset Code fields
+ *  Hence a safe upper bound for the maximum option length is 1020-28 = 992
+ */
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
+#define DCCP_MAX_PACKET_HDR 28
+#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
+#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
+
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD	 (32 * sizeof(uint32_t))
+
+#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
+				     * state, about 60 seconds */
+
+/* RFC 1122, 4.2.3.1 initial RTO value */
+#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+
+/*
+ * The maximum back-off value for retransmissions. This is needed for
+ *  - retransmitting client-Requests (sec. 8.1.1),
+ *  - retransmitting Close/CloseReq when closing (sec. 8.3),
+ *  - feature-negotiation retransmission (sec. 6.6.3),
+ *  - Acks in client-PARTOPEN state (sec. 8.1.5).
+ */
+#define DCCP_RTO_MAX ((unsigned)(64 * HZ))
+
+/*
+ * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
+ */
+#define DCCP_SANE_RTT_MIN	100
+#define DCCP_FALLBACK_RTT	(USEC_PER_SEC / 5)
+#define DCCP_SANE_RTT_MAX	(3 * USEC_PER_SEC)
+
+/* sysctl variables for DCCP */
+extern int  sysctl_dccp_request_retries;
+extern int  sysctl_dccp_retries1;
+extern int  sysctl_dccp_retries2;
+extern int  sysctl_dccp_tx_qlen;
+extern int  sysctl_dccp_sync_ratelimit;
+
+/*
+ *	48-bit sequence number arithmetic (signed and unsigned)
+ */
+#define INT48_MIN	  0x800000000000LL		/* 2^47	    */
+#define UINT48_MAX	  0xFFFFFFFFFFFFLL		/* 2^48 - 1 */
+#define COMPLEMENT48(x)	 (0x1000000000000LL - (x))	/* 2^48 - x */
+#define TO_SIGNED48(x)	 (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x)))
+#define TO_UNSIGNED48(x) (((x) >= 0)?	     (x) :  COMPLEMENT48(-(x)))
+#define ADD48(a, b)	 (((a) + (b)) & UINT48_MAX)
+#define SUB48(a, b)	 ADD48((a), COMPLEMENT48(b))
+
+static inline void dccp_set_seqno(u64 *seqno, u64 value)
+{
+	*seqno = UINT48_MAX & value;	/* keep only the low 48 bits */
+}
+
+static inline void dccp_inc_seqno(u64 *seqno)
+{
+	*seqno = (*seqno + 1) & UINT48_MAX;	/* increment modulo 2^48 */
+}
+
+/* signed mod-2^48 distance seqno2 - seqno1: > 0 if seqno1 < seqno2, else <= 0 */
+static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
+{
+	const u64 dist = SUB48(seqno2, seqno1);	/* unsigned, below 2^48 */
+
+	return dist < INT48_MIN ? dist : -COMPLEMENT48(dist);
+}
+
+/* is seq1 < seq2 (in circular 48-bit arithmetic) ? */
+static inline int before48(const u64 seq1, const u64 seq2)
+{
+	return (s64)((seq2 - seq1) << 16) > 0;	/* bit 47 becomes the sign */
+}
+
+/* is seq1 > seq2 ? */
+#define after48(seq1, seq2)	before48(seq2, seq1)
+
+/* is seq2 <= seq1 <= seq3 ? (offsets are taken relative to seq2, mod 2^48) */
+static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
+{
+	return ((seq3 - seq2) << 16) >= ((seq1 - seq2) << 16);
+}
+
+static inline u64 max48(const u64 seq1, const u64 seq2)
+{
+	return before48(seq2, seq1) ? seq1 : seq2;	/* seq1 only if after seq2 */
+}
+
+/**
+ * dccp_loss_count - Estimate how many data packets a burst loss has swallowed
+ * @s1:  last sequence number known before the 'hole'
+ * @s2:  first sequence number received after the 'hole'
+ * @ndp: NDP count carried by the packet with sequence number @s2
+ */
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
+{
+	s64 lost = dccp_delta_seqno(s1, s2);
+
+	WARN_ON(lost < 0);
+	/* seqnos strictly between @s1 and @s2, less the NDP count @ndp */
+	lost -= ndp + 1;
+	return lost < 0 ? 0 : lost;
+}
+
+/**
+ * dccp_loss_free - Check the data-loss condition of RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+	return !dccp_loss_count(s1, s2, ndp);	/* true if the count is zero */
+}
+
+enum {
+	DCCP_MIB_NUM = 0,
+	DCCP_MIB_ACTIVEOPENS,			/* ActiveOpens */
+	DCCP_MIB_ESTABRESETS,			/* EstabResets */
+	DCCP_MIB_CURRESTAB,			/* CurrEstab */
+	DCCP_MIB_OUTSEGS,			/* OutSegs */
+	DCCP_MIB_OUTRSTS,
+	DCCP_MIB_ABORTONTIMEOUT,
+	DCCP_MIB_TIMEOUTS,
+	DCCP_MIB_ABORTFAILED,
+	DCCP_MIB_PASSIVEOPENS,
+	DCCP_MIB_ATTEMPTFAILS,
+	DCCP_MIB_OUTDATAGRAMS,
+	DCCP_MIB_INERRS,
+	DCCP_MIB_OPTMANDATORYERROR,
+	DCCP_MIB_INVALIDOPT,
+	__DCCP_MIB_MAX
+};
+
+#define DCCP_MIB_MAX	__DCCP_MIB_MAX
+struct dccp_mib {
+	unsigned long	mibs[DCCP_MIB_MAX];
+};
+
+DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
+#define DCCP_INC_STATS(field)	    SNMP_INC_STATS(dccp_statistics, field)
+#define DCCP_INC_STATS_BH(field)    SNMP_INC_STATS_BH(dccp_statistics, field)
+#define DCCP_DEC_STATS(field)	    SNMP_DEC_STATS(dccp_statistics, field)
+
+/*
+ * 	Checksumming routines
+ */
+static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb)
+{
+	const struct dccp_hdr *hdr = dccp_hdr(skb);
+
+	/* CsCov = 0: whole packet, else (doff + CsCov - 1) 32-bit words */
+	return hdr->dccph_cscov == 0 ? skb->len :
+	       (hdr->dccph_doff + hdr->dccph_cscov - 1) * sizeof(u32);
+}
+
+static inline void dccp_csum_outgoing(struct sk_buff *skb)
+{
+	const unsigned int cov = dccp_csum_coverage(skb);
+	const bool whole = cov >= skb->len;	/* coverage spans all of skb */
+
+	if (whole)
+		dccp_hdr(skb)->dccph_cscov = 0;
+	skb->csum = skb_checksum(skb, 0, whole ? skb->len : cov, 0);
+}
+
+extern void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+
+extern int  dccp_retransmit_skb(struct sock *sk);
+
+extern void dccp_send_ack(struct sock *sk);
+extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+				struct request_sock *rsk);
+
+extern void dccp_send_sync(struct sock *sk, const u64 seq,
+			   const enum dccp_pkt_type pkt_type);
+
+/*
+ * TX Packet Dequeueing Interface
+ */
+extern void		dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+extern bool		dccp_qpolicy_full(struct sock *sk);
+extern void		dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+extern struct sk_buff	*dccp_qpolicy_top(struct sock *sk);
+extern struct sk_buff	*dccp_qpolicy_pop(struct sock *sk);
+extern bool		dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
+
+/*
+ * TX Packet Output and TX Timers
+ */
+extern void   dccp_write_xmit(struct sock *sk);
+extern void   dccp_write_space(struct sock *sk);
+extern void   dccp_flush_write_queue(struct sock *sk, long *time_budget);
+
+extern void dccp_init_xmit_timers(struct sock *sk);
+static inline void dccp_clear_xmit_timers(struct sock *sk)
+{
+	inet_csk_clear_xmit_timers(sk);	/* plain wrapper around inet_csk */
+}
+
+extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+
+extern const char *dccp_packet_name(const int type);
+
+extern void dccp_set_state(struct sock *sk, const int state);
+extern void dccp_done(struct sock *sk);
+
+extern int  dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+			    struct sk_buff const *skb);
+
+extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+
+extern struct sock *dccp_create_openreq_child(struct sock *sk,
+					      const struct request_sock *req,
+					      const struct sk_buff *skb);
+
+extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
+					      struct sk_buff *skb,
+					      struct request_sock *req,
+					      struct dst_entry *dst);
+extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+				   struct request_sock *req,
+				   struct request_sock **prev);
+
+extern int dccp_child_process(struct sock *parent, struct sock *child,
+			      struct sk_buff *skb);
+extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+				  struct dccp_hdr *dh, unsigned len);
+extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+				const struct dccp_hdr *dh, const unsigned len);
+
+extern int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
+extern void dccp_destroy_sock(struct sock *sk);
+
+extern void		dccp_close(struct sock *sk, long timeout);
+extern struct sk_buff	*dccp_make_response(struct sock *sk,
+					    struct dst_entry *dst,
+					    struct request_sock *req);
+
+extern int	   dccp_connect(struct sock *sk);
+extern int	   dccp_disconnect(struct sock *sk, int flags);
+extern int	   dccp_getsockopt(struct sock *sk, int level, int optname,
+				   char __user *optval, int __user *optlen);
+extern int	   dccp_setsockopt(struct sock *sk, int level, int optname,
+				   char __user *optval, unsigned int optlen);
+#ifdef CONFIG_COMPAT
+extern int	   compat_dccp_getsockopt(struct sock *sk,
+				int level, int optname,
+				char __user *optval, int __user *optlen);
+extern int	   compat_dccp_setsockopt(struct sock *sk,
+				int level, int optname,
+				char __user *optval, unsigned int optlen);
+#endif
+extern int	   dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern int	   dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+				struct msghdr *msg, size_t size);
+extern int	   dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+				struct msghdr *msg, size_t len, int nonblock,
+				int flags, int *addr_len);
+extern void	   dccp_shutdown(struct sock *sk, int how);
+extern int	   inet_dccp_listen(struct socket *sock, int backlog);
+extern unsigned int dccp_poll(struct file *file, struct socket *sock,
+			     poll_table *wait);
+extern int	   dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+				   int addr_len);
+
+extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk,
+					   struct sk_buff *skb);
+extern int	   dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
+extern void	   dccp_send_close(struct sock *sk, const int active);
+extern int	   dccp_invalid_packet(struct sk_buff *skb);
+extern u32	   dccp_sample_rtt(struct sock *sk, long delta);
+
+static inline int dccp_bad_service_code(const struct sock *sk,
+					const __be32 service)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	/* bad: neither the socket's service code nor in its service list */
+	return dp->dccps_service != service &&
+	       !dccp_list_has_service(dp->dccps_service_list, service);
+}
+
+/**
+ * dccp_skb_cb  -  DCCP per-packet control information
+ * @dccpd_type: one of %dccp_pkt_type (or unknown)
+ * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1
+ * @dccpd_reset_code: one of %dccp_reset_codes
+ * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code)
+ * @dccpd_opt_len: total length of all options (5.8) in the packet
+ * @dccpd_seq: sequence number
+ * @dccpd_ack_seq: acknowledgment number subheader field value
+ * This is used for transmission as well as for reception.
+ */
+struct dccp_skb_cb {
+	union {
+		struct inet_skb_parm	h4;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		struct inet6_skb_parm	h6;
+#endif
+	} header;
+	__u8  dccpd_type:4;
+	__u8  dccpd_ccval:4;
+	__u8  dccpd_reset_code,
+	      dccpd_reset_data[3];
+	__u16 dccpd_opt_len;
+	__u64 dccpd_seq;
+	__u64 dccpd_ack_seq;
+};
+
+#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
+
+/* RFC 4340, sec. 7.7 */
+static inline int dccp_non_data_packet(const struct sk_buff *skb)
+{
+	/* the packet types which are counted as non-data packets */
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_ACK:	case DCCP_PKT_CLOSE:
+	case DCCP_PKT_CLOSEREQ:	case DCCP_PKT_RESET:
+	case DCCP_PKT_SYNC:	case DCCP_PKT_SYNCACK:
+		return 1;
+	}
+	return 0;
+}
+
+/* RFC 4340, sec. 7.7 */
+static inline int dccp_data_packet(const struct sk_buff *skb)
+{
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_DATA:	case DCCP_PKT_DATAACK:
+	case DCCP_PKT_REQUEST:	case DCCP_PKT_RESPONSE:
+		return 1;
+	}
+	return 0;
+}
+
+static inline int dccp_packet_without_ack(const struct sk_buff *skb)
+{
+	const unsigned int pkt = DCCP_SKB_CB(skb)->dccpd_type;
+
+	return pkt == DCCP_PKT_REQUEST || pkt == DCCP_PKT_DATA;
+}
+
+#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2)
+
+static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
+{
+	/* the low 32 seqno bits go into the extension right behind @dh */
+	struct dccp_hdr_ext *dhx = (void *)dh + sizeof(*dh);
+	dh->dccph_seq2 = 0;
+	dh->dccph_seq = htons((gss >> 32) & 0xffff);	/* bits 47..32 */
+	dhx->dccph_seq_low = htonl(gss & 0xffffffff);	/* bits 31..0  */
+}
+
+static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
+				    const u64 gsr)
+{
+	dhack->dccph_reserved1   = 0;
+	dhack->dccph_ack_nr_high = htons(gsr >> 32);	/* bits 47..32 */
+	dhack->dccph_ack_nr_low  = htonl((u32)gsr);	/* bits 31..0  */
+}
+
+static inline void dccp_update_gsr(struct sock *sk, u64 seq)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (after48(seq, dp->dccps_gsr))
+		dp->dccps_gsr = seq;	/* GSR is only ever advanced here */
+	/* Sequence validity window depends on remote Sequence Window (7.5.1) */
+	dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+	/*
+	 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+	 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+	 * always > 32, so for the first W/W' packets in the lifetime of a
+	 * connection we always have to adjust SWL.
+	 * A second reason why we are doing this is that the window depends on
+	 * the feature-remote value of Sequence Window: nothing stops the peer
+	 * from updating this value while we are busy adjusting SWL for the
+	 * first W packets (we would have to count from scratch again then).
+	 * Therefore it is safer to always make sure that the Sequence Window
+	 * is not artificially extended by a peer who grows SWL downwards by
+	 * continually updating the feature-remote Sequence-Window.
+	 * If sequence numbers wrap it is bad luck. But that will take a while
+	 * (48 bit), and this measure prevents Sequence-number attacks.
+	 */
+	if (before48(dp->dccps_swl, dp->dccps_isr))
+		dp->dccps_swl = dp->dccps_isr;	/* clamp SWL to ISR */
+	dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
+}
+
+static inline void dccp_update_gss(struct sock *sk, u64 seq)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	dp->dccps_gss = seq;	/* stored as given, without ordering check */
+	/* Ack validity window depends on local Sequence Window value (7.5.1) */
+	dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+	/* Adjust AWL so that it is not below ISS - see comment above for SWL */
+	if (before48(dp->dccps_awl, dp->dccps_iss))
+		dp->dccps_awl = dp->dccps_iss;
+	dp->dccps_awh = dp->dccps_gss;	/* AWH is set equal to GSS */
+}
+
+static inline int dccp_ackvec_pending(const struct sock *sk)
+{
+	return !(dccp_sk(sk)->dccps_hc_rx_ackvec == NULL ||
+		 dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec));
+}
+
+static inline int dccp_ack_pending(const struct sock *sk)
+{
+	return inet_csk_ack_scheduled(sk) || dccp_ackvec_pending(sk);
+}
+
+extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
+extern int  dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
+extern int  dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
+				  struct sk_buff *skb);
+extern int  dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
+extern void dccp_feat_list_purge(struct list_head *fn_list);
+
+extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
+extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed);
+extern u32 dccp_timestamp(void);
+extern void dccp_timestamping_init(void);
+extern int dccp_insert_option(struct sk_buff *skb, unsigned char option,
+			      const void *value, unsigned char len);
+
+#ifdef CONFIG_SYSCTL
+extern int dccp_sysctl_init(void);
+extern void dccp_sysctl_exit(void);
+#else
+static inline int dccp_sysctl_init(void)
+{
+	return 0;
+}
+
+static inline void dccp_sysctl_exit(void)
+{
+}
+#endif
+
+#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
new file mode 100644
index 00000000..b21f261d
--- /dev/null
+++ b/net/dccp/diag.c
@@ -0,0 +1,74 @@
+/*
+ *  net/dccp/diag.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	/* fields which are not set below are reported as zero */
+	memset(info, 0, sizeof(struct tcp_info));
+	info->tcpi_state	= sk->sk_state;
+	info->tcpi_retransmits	= icsk->icsk_retransmits;
+	info->tcpi_probes	= icsk->icsk_probes_out;
+	info->tcpi_backoff	= icsk->icsk_backoff;
+	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
+
+	if (dp->dccps_hc_rx_ackvec)
+		info->tcpi_options |= TCPI_OPT_SACK;
+
+	/* pass @info on to whichever CCID half-connections are attached */
+	if (dp->dccps_hc_rx_ccid)
+		ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+	if (dp->dccps_hc_tx_ccid)
+		ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+}
+
+static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			       void *_info)
+{
+	r->idiag_wqueue = 0;	/* both queue counters are reported as 0 */
+	r->idiag_rqueue = 0;
+	if (_info)
+		dccp_get_info(sk, _info);
+}
+
+static const struct inet_diag_handler dccp_diag_handler = {
+	.idiag_hashinfo	 = &dccp_hashinfo,
+	.idiag_get_info	 = dccp_diag_get_info,
+	.idiag_type	 = DCCPDIAG_GETSOCK,
+	.idiag_info_size = sizeof(struct tcp_info),
+};
+
+static int __init dccp_diag_init(void)
+{
+	return inet_diag_register(&dccp_diag_handler);
+}
+
+static void __exit dccp_diag_fini(void)
+{
+	inet_diag_unregister(&dccp_diag_handler);
+}
+
+module_init(dccp_diag_init);
+module_exit(dccp_diag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP inet_diag handler");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, DCCPDIAG_GETSOCK);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
new file mode 100644
index 00000000..568def95
--- /dev/null
+++ b/net/dccp/feat.c
@@ -0,0 +1,1359 @@
+/*
+ *  net/dccp/feat.c
+ *
+ *  Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *
+ *  Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *  Rewrote from scratch, some bits from earlier code by
+ *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *
+ *  ASSUMPTIONS
+ *  -----------
+ *  o Feature negotiation is coordinated with connection setup (as in TCP), wild
+ *    changes of parameters of an established connection are not supported.
+ *  o All currently known SP features have 1-byte quantities. If in the future
+ *    extensions of RFCs 4340..42 define features with item lengths larger than
+ *    one byte, a feature-specific extension of the code will be required.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "ccid.h"
+#include "feat.h"
+
+/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
+unsigned long	sysctl_dccp_sequence_window __read_mostly = 100;
+int		sysctl_dccp_rx_ccid	    __read_mostly = 2,
+		sysctl_dccp_tx_ccid	    __read_mostly = 2;
+
+/*
+ * Feature activation handlers.
+ *
+ * These all use an u64 argument, to provide enough room for NN/SP features. At
+ * this stage the negotiated values have been checked to be within their range.
+ */
+static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid *fresh = ccid_new(ccid, sk, rx);
+
+	if (!fresh)
+		return -ENOMEM;
+	/* the old CCID is released only once its replacement exists */
+	if (!rx) {
+		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+		dp->dccps_hc_tx_ccid = fresh;
+		return 0;
+	}
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	dp->dccps_hc_rx_ccid = fresh;
+	return 0;
+}
+
+static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (!rx) {
+		/* local window: re-run the GSS update to refresh AWL */
+		dp->dccps_l_seq_win = seq_win;
+		dccp_update_gss(sk, dp->dccps_gss);
+	} else {
+		/* remote window: re-run the GSR update to refresh SWL/SWH */
+		dp->dccps_r_seq_win = seq_win;
+		dccp_update_gsr(sk, dp->dccps_gsr);
+	}
+	return 0;
+}
+
+static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
+{
+	if (!rx)	/* TX location: local Ack Ratio, otherwise the remote one */
+		dccp_sk(sk)->dccps_l_ack_ratio = ratio;
+	else
+		dccp_sk(sk)->dccps_r_ack_ratio = ratio;
+	return 0;
+}
+
+static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (!rx)		/* nothing is done for the TX location */
+		return 0;
+	if (!enable) {
+		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+		dp->dccps_hc_rx_ackvec = NULL;
+	} else if (dp->dccps_hc_rx_ackvec == NULL) {
+		dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
+		if (dp->dccps_hc_rx_ackvec == NULL)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
+{
+	if (!rx)	/* only acted upon for the TX location */
+		dccp_sk(sk)->dccps_send_ndp_count = !!enable;
+	return 0;
+}
+
+/*
+ * Minimum Checksum Coverage is an RX-located feature (9.2.1). With `rx' set,
+ * the sending peer has announced its partial coverage through a ChangeR()
+ * option. Otherwise we are the sender, and it is the receiver that announces
+ * its coverage through ChangeL() options. Such announcements are honoured by
+ * enabling the matching partial coverage, unless a value has already been set
+ * manually before. In that case only the warning below is issued; it means
+ * that all packets will be dropped.
+ */
+static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (rx) {
+		dp->dccps_pcrlen = cscov;
+		return 0;
+	}
+	if (dp->dccps_pcslen == 0)
+		dp->dccps_pcslen = cscov;
+	else if (cscov > dp->dccps_pcslen)
+		DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
+			  dp->dccps_pcslen, (u8)cscov);
+	return 0;
+}
+
+static const struct {
+	u8			feat_num;		/* DCCPF_xxx */
+	enum dccp_feat_type	rxtx;			/* RX or TX  */
+	enum dccp_feat_type	reconciliation;		/* SP or NN  */
+	u8			default_value;		/* as in 6.4 */
+	int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
+/*
+ *    Lookup table for location and type of features (from RFC 4340/4342)
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ *  | Feature                  | Location | Reconc. | Initial |  Section  |
+ *  |                          | RX | TX  | SP | NN |  Value  | Reference |
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ *  | DCCPF_CCID               |    |  X  | X  |    |   2     | 10        |
+ *  | DCCPF_SHORT_SEQNOS       |    |  X  | X  |    |   0     |  7.6.1    |
+ *  | DCCPF_SEQUENCE_WINDOW    |    |  X  |    | X  | 100     |  7.5.2    |
+ *  | DCCPF_ECN_INCAPABLE      | X  |     | X  |    |   0     | 12.1      |
+ *  | DCCPF_ACK_RATIO          |    |  X  |    | X  |   2     | 11.3      |
+ *  | DCCPF_SEND_ACK_VECTOR    | X  |     | X  |    |   0     | 11.5      |
+ *  | DCCPF_SEND_NDP_COUNT     |    |  X  | X  |    |   0     |  7.7.2    |
+ *  | DCCPF_MIN_CSUM_COVER     | X  |     | X  |    |   0     |  9.2.1    |
+ *  | DCCPF_DATA_CHECKSUM      | X  |     | X  |    |   0     |  9.3.1    |
+ *  | DCCPF_SEND_LEV_RATE      | X  |     | X  |    |   0     | 4342/8.4  |
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ */
+} dccp_feat_table[] = {
+	{ DCCPF_CCID,		 FEAT_AT_TX, FEAT_SP, 2,   dccp_hdlr_ccid     },
+	{ DCCPF_SHORT_SEQNOS,	 FEAT_AT_TX, FEAT_SP, 0,   NULL },
+	{ DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win  },
+	{ DCCPF_ECN_INCAPABLE,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
+	{ DCCPF_ACK_RATIO,	 FEAT_AT_TX, FEAT_NN, 2,   dccp_hdlr_ack_ratio},
+	{ DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0,   dccp_hdlr_ackvec   },
+	{ DCCPF_SEND_NDP_COUNT,  FEAT_AT_TX, FEAT_SP, 0,   dccp_hdlr_ndp      },
+	{ DCCPF_MIN_CSUM_COVER,  FEAT_AT_RX, FEAT_SP, 0,   dccp_hdlr_min_cscov},
+	{ DCCPF_DATA_CHECKSUM,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
+	{ DCCPF_SEND_LEV_RATE,	 FEAT_AT_RX, FEAT_SP, 0,   NULL },
+};
+#define DCCP_FEAT_SUPPORTED_MAX		ARRAY_SIZE(dccp_feat_table)
+
+/**
+ * dccp_feat_index  -  Map a feature number onto its dccp_feat_table position
+ * Returns the array index, or -1 if the feature is not understood.
+ */
+static int dccp_feat_index(u8 feat_num)
+{
+	/* The types from RFC 4340, 6.4 occupy the first 9 entries, in order */
+	if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
+		return feat_num - 1;
+
+	/*
+	 * Further features need a case here and a row in the above table.
+	 */
+	switch (feat_num) {
+	case DCCPF_SEND_LEV_RATE:
+		return DCCP_FEAT_SUPPORTED_MAX - 1;
+	default:
+		return -1;
+	}
+}
+
+static u8 dccp_feat_type(u8 feat_num)
+{
+	const int idx = dccp_feat_index(feat_num);
+
+	/* a feature without table entry has no known reconciliation type */
+	return idx < 0 ? FEAT_UNKNOWN :
+			 dccp_feat_table[idx].reconciliation;
+}
+
+static int dccp_feat_default_value(u8 feat_num)
+{
+	const int idx = dccp_feat_index(feat_num);
+	/*
+	 * Unknown features have no default value: a negative index at this
+	 * point means that something has gone badly wrong elsewhere.
+	 */
+	DCCP_BUG_ON(idx < 0);
+
+	return idx >= 0 ? dccp_feat_table[idx].default_value : 0;
+}
+
+/*
+ *	Debugging and verbose-printing section
+ */
+static const char *dccp_feat_fname(const u8 feat)
+{
+	static const char *const names[] = {
+		[DCCPF_RESERVED]	= "Reserved",
+		[DCCPF_CCID]		= "CCID",
+		[DCCPF_SHORT_SEQNOS]	= "Allow Short Seqnos",
+		[DCCPF_SEQUENCE_WINDOW]	= "Sequence Window",
+		[DCCPF_ECN_INCAPABLE]	= "ECN Incapable",
+		[DCCPF_ACK_RATIO]	= "Ack Ratio",
+		[DCCPF_SEND_ACK_VECTOR]	= "Send ACK Vector",
+		[DCCPF_SEND_NDP_COUNT]	= "Send NDP Count",
+		[DCCPF_MIN_CSUM_COVER]	= "Min. Csum Coverage",
+		[DCCPF_DATA_CHECKSUM]	= "Send Data Checksum",
+	};
+	/* numbers between the named features and the CCID-specific range */
+	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
+		return names[DCCPF_RESERVED];
+	if (feat == DCCPF_SEND_LEV_RATE)
+		return "Send Loss Event Rate";
+	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
+		return "CCID-specific";
+
+	return names[feat];
+}
+
+static const char *const dccp_feat_sname[] = {
+	"DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE",
+};
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_feat_oname(const u8 opt)
+{
+	switch (opt) {
+	case DCCPO_CONFIRM_R: return "Confirm_R";
+	case DCCPO_CHANGE_R:  return "Change_R";
+	case DCCPO_CONFIRM_L: return "Confirm_L";
+	case DCCPO_CHANGE_L:  return "Change_L";
+	default:	      return NULL;	/* not a feature option */
+	}
+}
+
+static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
+{
+	u8 n, type = dccp_feat_type(feat_num);
+
+	if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
+		dccp_pr_debug_cat("(NULL)");
+	else if (type == FEAT_NN)
+		dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
+	else if (type != FEAT_SP)
+		dccp_pr_debug_cat("unknown type %u", type);
+	else
+		for (n = 0; n < val->sp.len; n++)
+			dccp_pr_debug_cat("%s%u", n ? " " : "", val->sp.vec[n]);
+}
+
+static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
+{
+	dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
+
+	/* an NN feature is printed as one number decoded from @list */
+	if (dccp_feat_type(feat_num) == FEAT_NN)
+		fval.nn = dccp_decode_value_var(list, len);
+	dccp_feat_printval(feat_num, &fval);
+}
+
+static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
+{
+	dccp_debug("   * %s %s = ", entry->is_local ? "local" : "remote",
+				    dccp_feat_fname(entry->feat_num));
+	dccp_feat_printval(entry->feat_num, &entry->val);
+	dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
+			  entry->needs_confirm ? "(Confirm pending)" : "");
+}
+
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)	do {	      \
+	dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
+	dccp_feat_printvals(feat, val, len);	/* SP list or NN value */     \
+	dccp_pr_debug_cat(") %s\n", (mandatory) ? "!" : "");	} while (0)
+/* Dump all entries of @fn_list; expands to exactly one statement */
+#define dccp_feat_print_fnlist(fn_list)  do {		\
+	const struct dccp_feat_entry *___entry;		\
+							\
+	dccp_pr_debug("List Dump:\n");			\
+	list_for_each_entry(___entry, fn_list, node)	\
+		dccp_feat_print_entry(___entry);	\
+} while (0)
+#else	/* ! CONFIG_IP_DCCP_DEBUG */
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
+#define dccp_feat_print_fnlist(fn_list)
+#endif
+
+static int __dccp_feat_activate(struct sock *sk, const int idx,
+				const bool is_local, dccp_feat_val const *fval)
+{
+	bool rx;
+	u64 val;
+
+	if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
+		return -1;	/* no such row in dccp_feat_table */
+	if (dccp_feat_table[idx].activation_hdlr == NULL)
+		return 0;	/* feature has no handler: nothing to do */
+
+	if (fval == NULL) {
+		val = dccp_feat_table[idx].default_value;
+	} else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
+		if (fval->sp.vec == NULL) {
+			/*
+			 * This can happen when an empty Confirm is sent
+			 * for an SP (i.e. known) feature. In this case
+			 * we would be using the default anyway.
+			 */
+			DCCP_CRIT("Feature #%d undefined: using default", idx);
+			val = dccp_feat_table[idx].default_value;
+		} else {
+			val = fval->sp.vec[0];	/* first element of the list */
+		}
+	} else {
+		val = fval->nn;		/* not SP: use the scalar value */
+	}
+
+	/* Location is RX if this is a local-RX or remote-TX feature */
+	rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
+
+	dccp_debug("   -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
+		   dccp_feat_fname(dccp_feat_table[idx].feat_num),
+		   fval ? "" : "default ",  (unsigned long long)val);
+
+	return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
+}
+
+/* Is @feat_num one of the "Req'd" features of RFC 4340, 6.4 ? */
+static inline int dccp_feat_must_be_understood(u8 feat_num)
+{
+	return	feat_num == DCCPF_SEQUENCE_WINDOW ||
+		feat_num == DCCPF_SHORT_SEQNOS || feat_num == DCCPF_CCID;
+}
+
+/*
+ * Copy constructor: @fval must not already own allocated memory. An empty
+ * list (@len == 0) gets a NULL vector, so that no stale or shared pointer is
+ * left behind for the destructor. Returns 0, or -ENOBUFS if kmemdup() fails.
+ */
+static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
+{
+	fval->sp.vec = len > 0 ? kmemdup(val, len, gfp_any()) : NULL;
+	fval->sp.len = fval->sp.vec != NULL ? len : 0;
+
+	/* only a non-empty request can fail */
+	return (len > 0 && fval->sp.vec == NULL) ? -ENOBUFS : 0;
+}
+
+/*
+ * Release the contents of @val (not @val itself) and zero it.
+ * For SP features the value vector is freed; a NULL @val is ignored.
+ */
+static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
+{
+	if (likely(val != NULL)) {
+		if (dccp_feat_type(feat_num) == FEAT_SP)
+			kfree(val->sp.vec);
+		memset(val, 0, sizeof(*val));
+	}
+}
+
+/*
+ * Duplicate @original, giving SP entries their own copy of the value vector.
+ * Returns the new entry, or NULL if the feature type is unknown or an
+ * allocation failed.
+ */
+static struct dccp_feat_entry *
+	      dccp_feat_clone_entry(struct dccp_feat_entry const *original)
+{
+	struct dccp_feat_entry *copy;
+	u8 type = dccp_feat_type(original->feat_num);
+
+	if (type == FEAT_UNKNOWN)
+		return NULL;
+
+	copy = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
+	if (copy == NULL || type != FEAT_SP)
+		return copy;
+
+	/* the flat copy still aliases the original's vector: replace it */
+	if (dccp_feat_clone_sp_val(&copy->val, original->val.sp.vec,
+				   original->val.sp.len)) {
+		kfree(copy);
+		return NULL;
+	}
+	return copy;
+}
+
+/* Free @entry together with the value it owns; a NULL @entry is ignored. */
+static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
+{
+	if (entry == NULL)
+		return;
+
+	dccp_feat_val_destructor(entry->feat_num, &entry->val);
+	kfree(entry);
+}
+
+/*
+ * List management functions
+ *
+ * Feature negotiation lists rely on and maintain the following invariants:
+ * - each feat_num in the list is known, i.e. we know its type and default value
+ * - each feat_num/is_local combination is unique (old entries are overwritten)
+ * - SP values are always freshly allocated
+ * - list is sorted in increasing order of feature number (faster lookup)
+ */
+
+/*
+ * Find the entry for @feat_num/@is_local in @fn_list.
+ * The walk stops at the first entry with a larger feature number, relying on
+ * the list being sorted. Returns the entry, or NULL if there is none.
+ */
+static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
+						     u8 feat_num, bool is_local)
+{
+	struct dccp_feat_entry *cur;
+
+	list_for_each_entry(cur, fn_list, node) {
+		if (cur->feat_num > feat_num)
+			break;
+		if (cur->feat_num == feat_num && cur->is_local == is_local)
+			return cur;
+	}
+	return NULL;
+}
+
+/**
+ * dccp_feat_entry_new  -  Central list update routine (called by all others)
+ * @head:  list to add to
+ * @feat:  feature number
+ * @local: whether the local (1) or remote feature with number @feat is meant
+ * This is the only constructor and serves to ensure the above invariants.
+ * An existing @feat/@local entry is reused after its old value has been
+ * destroyed; otherwise a new entry is allocated and inserted in sorted
+ * position. Apart from feat_num and is_local, the fields of a newly allocated
+ * entry are not initialised here. Returns NULL if the allocation fails.
+ */
+static struct dccp_feat_entry *
+	      dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
+{
+	struct dccp_feat_entry *entry;
+
+	list_for_each_entry(entry, head, node) {
+		if (entry->feat_num > feat) {
+			/* insert in front of the first larger feature number */
+			head = &entry->node;
+			break;
+		}
+		if (entry->feat_num == feat && entry->is_local == local) {
+			dccp_feat_val_destructor(entry->feat_num, &entry->val);
+			return entry;
+		}
+	}
+
+	entry = kmalloc(sizeof(*entry), gfp_any());
+	if (entry == NULL)
+		return NULL;
+
+	entry->feat_num = feat;
+	entry->is_local = local;
+	list_add_tail(&entry->node, head);
+	return entry;
+}
+
+/**
+ * dccp_feat_push_change  -  Add/overwrite a Change option in the list
+ * @fn_list: feature-negotiation list to update
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat is meant
+ * @mandatory: whether to use Mandatory feature negotiation options
+ * @fval: pointer to NN/SP value to be inserted
+ * The contents of @fval are copied by structure assignment, so for SP values
+ * the list entry takes over the vector that @fval points to.
+ * Returns 0 on success, -ENOMEM if no list entry could be obtained.
+ */
+static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
+				 u8 mandatory, dccp_feat_val *fval)
+{
+	struct dccp_feat_entry *entry;
+
+	entry = dccp_feat_entry_new(fn_list, feat, local);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	/* identity of the feature */
+	entry->feat_num	       = feat;
+	entry->is_local	       = local;
+
+	/* a Change starts out unsent and unconfirmed */
+	entry->state	       = FEAT_INITIALISING;
+	entry->needs_confirm   = 0;
+	entry->empty_confirm   = 0;
+	entry->needs_mandatory = mandatory;
+
+	entry->val	       = *fval;
+
+	return 0;
+}
+
+/**
+ * dccp_feat_push_confirm  -  Add a Confirm entry to the FN list
+ * @fn_list: feature-negotiation list to add to
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat is being confirmed
+ * @fval: pointer to NN/SP value to be inserted or NULL
+ * A NULL @fval queues an empty Confirm. A non-NULL @fval is copied by
+ * structure assignment, so for SP values the entry takes over its vector.
+ * Returns 0 on success, a Reset code for further processing otherwise.
+ */
+static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
+				  dccp_feat_val *fval)
+{
+	struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+	if (new == NULL)
+		return DCCP_RESET_CODE_TOO_BUSY;
+
+	new->feat_num	     = feat;
+	new->is_local	     = local;
+	new->state	     = FEAT_STABLE;	/* transition in 6.6.2 */
+	new->needs_confirm   = 1;
+	new->empty_confirm   = (fval == NULL);
+	/*
+	 * Clear the whole union for empty Confirms. Assigning val.nn = 0 is
+	 * not sufficient: with 8-byte pointers sp.len lies beyond the u64 and
+	 * would keep whatever the (non-zeroing) allocation left there.
+	 */
+	if (fval != NULL)
+		new->val     = *fval;
+	else
+		memset(&new->val, 0, sizeof(new->val));
+	new->needs_mandatory = 0;
+
+	return 0;
+}
+
+/*
+ * Queue an empty Confirm for @feat/@local on @fn_list, i.e. a Confirm entry
+ * without a value. Returns what dccp_feat_push_confirm() returns.
+ */
+static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
+{
+	return dccp_feat_push_confirm(fn_list, feat, local, NULL);
+}
+
+/* Unlink @entry from the list it is on, then free it together with its value */
+static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
+{
+	list_del(&entry->node);
+	dccp_feat_entry_destructor(entry);
+}
+
+/*
+ * Free every entry of @fn_list and leave @fn_list as an empty list.
+ * Entries are not unlinked individually; the head is re-initialised once all
+ * of them have been destroyed.
+ */
+void dccp_feat_list_purge(struct list_head *fn_list)
+{
+	struct dccp_feat_entry *victim, *following;
+
+	list_for_each_entry_safe(victim, following, fn_list, node)
+		dccp_feat_entry_destructor(victim);
+
+	INIT_LIST_HEAD(fn_list);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
+
+/*
+ * Generate @to as full clone of @from - @to must not contain any nodes.
+ * Returns 0 on success. If any entry cannot be cloned, everything cloned so
+ * far is freed again, @to is left empty and -ENOMEM is returned.
+ */
+int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
+{
+	struct dccp_feat_entry *src, *copy;
+
+	INIT_LIST_HEAD(to);
+	list_for_each_entry(src, from, node) {
+		copy = dccp_feat_clone_entry(src);
+		if (copy == NULL) {
+			dccp_feat_list_purge(to);
+			return -ENOMEM;
+		}
+		list_add_tail(&copy->node, to);
+	}
+	return 0;
+}
+
+/**
+ * dccp_feat_valid_nn_length  -  Enforce length constraints on NN options
+ * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
+ * incoming options are accepted as long as their values are valid.
+ * Returns the option value length in bytes for @feat_num, or 0 if @feat_num
+ * is not one of the NN features handled here.
+ */
+static u8 dccp_feat_valid_nn_length(u8 feat_num)
+{
+	switch (feat_num) {
+	case DCCPF_ACK_RATIO:		/* RFC 4340, 11.3 and 6.6.8 */
+		return 2;
+	case DCCPF_SEQUENCE_WINDOW:	/* RFC 4340, 7.5.2 and 6.5  */
+		return 6;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Range-check @val for the NN feature @feat_num.
+ * Returns non-zero if @val is within the permitted range, 0 if it is not or
+ * if @feat_num is not an NN feature known here.
+ */
+static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
+{
+	if (feat_num == DCCPF_ACK_RATIO)
+		return val <= DCCPF_ACK_RATIO_MAX;
+
+	if (feat_num == DCCPF_SEQUENCE_WINDOW)
+		return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
+
+	return 0;	/* feature unknown - so we can't tell */
+}
+
+/*
+ * Check that SP values are within the ranges defined in RFC 4340.
+ * Returns non-zero if @val is acceptable for @feat_num, 0 if it is not or if
+ * @feat_num is not an SP feature known here.
+ */
+static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
+{
+	switch (feat_num) {
+	case DCCPF_CCID:
+		return val == DCCPC_CCID2 || val == DCCPC_CCID3;
+	case DCCPF_MIN_CSUM_COVER:
+		return val < 16;
+	/* Boolean features only take the values 0 and 1: */
+	case DCCPF_SHORT_SEQNOS:
+	case DCCPF_ECN_INCAPABLE:
+	case DCCPF_SEND_ACK_VECTOR:
+	case DCCPF_SEND_NDP_COUNT:
+	case DCCPF_DATA_CHECKSUM:
+	case DCCPF_SEND_LEV_RATE:
+		return val < 2;
+	default:
+		return 0;		/* feature unknown */
+	}
+}
+
+/*
+ * Validate an SP preference list: it must be non-NULL, non-empty and every
+ * element must be a valid value for @feat_num. Returns 1 if so, else 0.
+ */
+static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
+{
+	u8 i;
+
+	if (sp_list == NULL || sp_len < 1)
+		return 0;
+
+	for (i = 0; i < sp_len; i++)
+		if (!dccp_feat_is_valid_sp_val(feat_num, sp_list[i]))
+			return 0;
+	return 1;
+}
+
+/**
+ * dccp_feat_insert_opts  -  Generate FN options from current list state
+ * @dp: for client during handshake and general negotiation
+ * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
+ * @skb: next sk_buff to be sent to the peer
+ * The list of @dreq is used if @dreq is non-NULL, otherwise the list of @dp.
+ * Returns 0 on success, -1 if an option could not be inserted or a list entry
+ * has an unknown feature type.
+ */
+int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
+			  struct sk_buff *skb)
+{
+	struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+	struct dccp_feat_entry *pos, *next;
+	u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
+	bool rpt;
+
+	/* put entries into @skb in the order they appear in the list */
+	list_for_each_entry_safe_reverse(pos, next, fn, node) {
+		opt  = dccp_feat_genopt(pos);
+		type = dccp_feat_type(pos->feat_num);
+		rpt  = false;
+
+		if (pos->empty_confirm) {
+			/* empty Confirm: option without value */
+			len = 0;
+			ptr = NULL;
+		} else if (type == FEAT_SP) {
+			len = pos->val.sp.len;
+			ptr = pos->val.sp.vec;
+			rpt = pos->needs_confirm;
+		} else if (type == FEAT_NN) {
+			/* NN values are encoded into a local buffer first */
+			len = dccp_feat_valid_nn_length(pos->feat_num);
+			ptr = nn_in_nbo;
+			dccp_encode_value_var(pos->val.nn, ptr, len);
+		} else {
+			DCCP_BUG("unknown feature %u", pos->feat_num);
+			return -1;
+		}
+		dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
+
+		if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
+			return -1;
+		if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
+			return -1;
+
+		/*
+		 * Enter CHANGING after transmitting the Change option (6.6.2).
+		 */
+		if (pos->state == FEAT_INITIALISING)
+			pos->state = FEAT_CHANGING;
+	}
+	return 0;
+}
+
+/**
+ * __feat_register_nn  -  Register new NN value on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an NN feature from %dccp_feature_numbers
+ * @mandatory: use Mandatory option if 1
+ * @nn_val: value to register
+ * Note that NN features are local by definition (RFC 4340, 6.3.2).
+ * Returns -EINVAL if @feat is not an NN feature or @nn_val is out of range,
+ * 0 if @nn_val is the default (nothing is queued), otherwise the result of
+ * dccp_feat_push_change().
+ */
+static int __feat_register_nn(struct list_head *fn, u8 feat,
+			      u8 mandatory, u64 nn_val)
+{
+	dccp_feat_val fval = { .nn = nn_val };
+
+	if (dccp_feat_type(feat) != FEAT_NN)
+		return -EINVAL;
+	if (!dccp_feat_is_valid_nn_val(feat, nn_val))
+		return -EINVAL;
+
+	/* Don't bother with default values, they will be activated anyway. */
+	if (nn_val == (u64)dccp_feat_default_value(feat))
+		return 0;
+
+	return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
+}
+
+/**
+ * __feat_register_sp  -  Register new SP value/list on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an SP feature from %dccp_feature_numbers
+ * @is_local: whether the local (1) or the remote (0) @feat is meant
+ * @mandatory: use Mandatory option if 1
+ * @sp_val: SP value followed by optional preference list
+ * @sp_len: length of @sp_val in bytes
+ * @sp_val is duplicated; the caller keeps ownership of its buffer.
+ * Returns 0 on success, -EINVAL for a non-SP feature or an invalid list,
+ * -EOPNOTSUPP for unsupported CCIDs, -ENOMEM if an allocation failed.
+ */
+static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
+			      u8 mandatory, u8 const *sp_val, u8 sp_len)
+{
+	dccp_feat_val fval;
+
+	if (dccp_feat_type(feat) != FEAT_SP ||
+	    !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
+		return -EINVAL;
+
+	/* Avoid negotiating alien CCIDs by only advertising supported ones */
+	if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
+		return -EOPNOTSUPP;
+
+	if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
+		return -ENOMEM;
+
+	/*
+	 * The list entry only takes over fval.sp.vec when the push succeeds;
+	 * on failure the freshly cloned vector must be released here.
+	 */
+	if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) {
+		kfree(fval.sp.vec);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * dccp_feat_register_sp  -  Register requests to change SP feature values
+ * @sk: client or listening socket
+ * @feat: one of %dccp_feature_numbers
+ * @is_local: whether the local (1) or remote (0) @feat is meant
+ * @list: array of preferred values, in descending order of preference
+ * @len: length of @list in bytes
+ * The change is registered without the Mandatory option.
+ * Returns -EISCONN if @sk is not in state DCCP_CLOSED, -EINVAL if @feat is
+ * not an SP feature, otherwise the result of __feat_register_sp().
+ */
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+			  u8 const *list, u8 len)
+{
+	struct list_head *fn;
+
+	/* any changes must be registered before establishing the connection */
+	if (sk->sk_state != DCCP_CLOSED)
+		return -EISCONN;
+	if (dccp_feat_type(feat) != FEAT_SP)
+		return -EINVAL;
+
+	fn = &dccp_sk(sk)->dccps_featneg;
+	return __feat_register_sp(fn, feat, is_local, 0, list, len);
+}
+
+
+/*
+ *	Tracking features whose values depend on the choice of CCID
+ *
+ * This is designed with an extension in mind so that a list walk could be done
+ * before activating any features. However, since the existing framework has
+ * been found to work satisfactorily so far, automatic verification is left
+ * open. When adding new CCIDs, add a corresponding dependency table here.
+ */
+
+/*
+ * Return the dependency table for CCID number @ccid, or NULL if @ccid is
+ * neither CCID2 nor CCID3. Each per-CCID table has two rows selected by
+ * @is_local: row 0 for the remote (receiver-side) CCID, row 1 for the local
+ * (sender-side) CCID. Every row ends with an all-zero entry.
+ */
+static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
+{
+	static const struct ccid_dependency ccid2_dependencies[2][2] = {
+		/*
+		 * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
+		 * feature and Send Ack Vector is an RX feature, `is_local'
+		 * needs to be reversed.
+		 */
+		{	/* Dependencies of the receiver-side (remote) CCID2 */
+			{
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= true,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 }
+		},
+		{	/* Dependencies of the sender-side (local) CCID2 */
+			{
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= false,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 }
+		}
+	};
+	static const struct ccid_dependency ccid3_dependencies[2][5] = {
+		{	/*
+			 * Dependencies of the receiver-side CCID3
+			 */
+			{	/* locally disable Ack Vectors */
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= true,
+				.is_mandatory	= false,
+				.val		= 0
+			},
+			{	/* see below why Send Loss Event Rate is on */
+				.dependent_feat	= DCCPF_SEND_LEV_RATE,
+				.is_local	= true,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{	/* NDP Count is needed as per RFC 4342, 6.1.1 */
+				.dependent_feat	= DCCPF_SEND_NDP_COUNT,
+				.is_local	= false,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 },
+		},
+		{	/*
+			 * CCID3 at the TX side: we request that the HC-receiver
+			 * will not send Ack Vectors (they will be ignored, so
+			 * Mandatory is not set); we enable Send Loss Event Rate
+			 * (Mandatory since the implementation does not support
+			 * the Loss Intervals option of RFC 4342, 8.6).
+			 * The last two options are for peer's information only.
+			*/
+			{
+				.dependent_feat	= DCCPF_SEND_ACK_VECTOR,
+				.is_local	= false,
+				.is_mandatory	= false,
+				.val		= 0
+			},
+			{
+				.dependent_feat	= DCCPF_SEND_LEV_RATE,
+				.is_local	= false,
+				.is_mandatory	= true,
+				.val		= 1
+			},
+			{	/* this CCID does not support Ack Ratio */
+				.dependent_feat	= DCCPF_ACK_RATIO,
+				.is_local	= true,
+				.is_mandatory	= false,
+				.val		= 0
+			},
+			{	/* tell receiver we are sending NDP counts */
+				.dependent_feat	= DCCPF_SEND_NDP_COUNT,
+				.is_local	= true,
+				.is_mandatory	= false,
+				.val		= 1
+			},
+			{ 0, 0, 0, 0 }
+		}
+	};
+	/* @is_local (0 or 1) selects the row of the per-CCID table */
+	switch (ccid) {
+	case DCCPC_CCID2:
+		return ccid2_dependencies[is_local];
+	case DCCPC_CCID3:
+		return ccid3_dependencies[is_local];
+	default:
+		return NULL;
+	}
+}
+
+/**
+ * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
+ * @fn: feature-negotiation list to update
+ * @id: CCID number to track
+ * @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ * This function needs to be called after registering all other features.
+ * Returns 0 on success, 1 if there is no dependency table for @id, otherwise
+ * the error of the first registration that failed.
+ */
+static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
+{
+	const struct ccid_dependency *dep = dccp_feat_ccid_deps(id, is_local);
+	int rc = 0;
+
+	if (dep == NULL)
+		return 1;
+
+	/* walk the table up to its terminating entry or the first failure */
+	for (; rc == 0 && dep->dependent_feat != DCCPF_RESERVED; dep++) {
+		if (dccp_feat_type(dep->dependent_feat) == FEAT_SP)
+			rc = __feat_register_sp(fn, dep->dependent_feat,
+						    dep->is_local,
+						    dep->is_mandatory,
+						    &dep->val, 1);
+		else
+			rc = __feat_register_nn(fn, dep->dependent_feat,
+						    dep->is_mandatory,
+						    dep->val);
+	}
+	return rc;
+}
+
+/**
+ * dccp_feat_finalise_settings  -  Finalise settings before starting negotiation
+ * @dp: client or listening socket (settings will be inherited)
+ * This is called after all registrations (socket initialisation, sysctls, and
+ * sockopt calls), and before sending the first packet containing Change options
+ * (ie. client-Request or server-Response), to ensure internal consistency.
+ * Returns 0 on success, -1 if propagating a CCID's dependencies failed.
+ */
+int dccp_feat_finalise_settings(struct dccp_sock *dp)
+{
+	struct list_head *fn = &dp->dccps_featneg;
+	struct dccp_feat_entry *entry;
+	int ccids[2] = { -1, -1 };	/* indexed by is_local */
+	int side;
+
+	/*
+	 * Propagating CCIDs:
+	 * 1) not useful to propagate CCID settings if this host advertises more
+	 *    than one CCID: the choice of CCID  may still change - if this is
+	 *    the client, or if this is the server and the client sends
+	 *    singleton CCID values.
+	 * 2) since propagate_ccid changes the list, we defer changing the
+	 *    sorted list until after the traversal.
+	 */
+	list_for_each_entry(entry, fn, node) {
+		if (entry->feat_num != DCCPF_CCID || entry->val.sp.len != 1)
+			continue;
+		ccids[entry->is_local] = entry->val.sp.vec[0];
+	}
+
+	/* local (1) first, then remote (0) */
+	for (side = 1; side >= 0; side--) {
+		if (ccids[side] <= 0)
+			continue;
+		if (dccp_feat_propagate_ccid(fn, ccids[side], side))
+			return -1;
+	}
+	dccp_feat_print_fnlist(fn);
+	return 0;
+}
+
+/**
+ * dccp_feat_server_ccid_dependencies  -  Resolve CCID-dependent features
+ * @dreq: request socket whose feature-negotiation list is updated
+ * It is the server which resolves the dependencies once the CCID has been
+ * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
+ * Returns 0 on success, -1 if propagating the dependencies failed.
+ */
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
+{
+	struct list_head *fn = &dreq->dreq_featneg;
+	struct dccp_feat_entry *entry;
+	u8 is_local, ccid;
+
+	/* remote CCID (0) first, then local CCID (1) */
+	for (is_local = 0; is_local <= 1; is_local++) {
+		entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
+
+		if (entry == NULL || entry->empty_confirm)
+			ccid = dccp_feat_default_value(DCCPF_CCID);
+		else
+			ccid = entry->val.sp.vec[0];
+
+		if (dccp_feat_propagate_ccid(fn, ccid, is_local))
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Select the first entry in @servlist that also occurs in @clilist (6.3.1).
+ * Returns the matching value, or -1 if the two lists share no entry.
+ */
+static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
+{
+	u8 si, ci;
+
+	for (si = 0; si < slen; si++) {
+		const u8 candidate = servlist[si];
+
+		for (ci = 0; ci < clen; ci++)
+			if (clilist[ci] == candidate)
+				return candidate;
+	}
+	return -1;
+}
+
+/**
+ * dccp_feat_prefer  -  Move preferred entry to the start of array
+ * Reorder the @array_len elements in @array so that @preferred_value comes
+ * first. Returns >0 to indicate that @preferred_value does occur in @array.
+ * Each occurrence of @preferred_value is overwritten with the element that is
+ * at the front; a NULL @array yields 0.
+ */
+static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
+{
+	u8 pos, hits = 0;
+
+	if (array == NULL)
+		return 0;
+
+	for (pos = 0; pos < array_len; pos++) {
+		if (array[pos] != preferred_value)
+			continue;
+		array[pos] = array[0];
+		hits++;
+	}
+	if (hits)
+		array[0] = preferred_value;
+
+	return hits;
+}
+
+/**
+ * dccp_feat_reconcile  -  Reconcile SP preference lists
+ *  @fv: SP list to reconcile into
+ *  @arr: received SP preference list
+ *  @len: length of @arr in bytes
+ *  @is_server: whether this side is the server (and @fv is the server's list)
+ *  @reorder: whether to reorder the list in @fv after reconciling with @arr
+ * With @reorder set: when successful, > 0 is returned and the reconciled list
+ * is in @fv; a value of 0 means that negotiation failed (no shared entry).
+ * Without @reorder, the matching value is returned, or -1 if there is none.
+ * In both cases 0 is returned if @fv has no value vector or @arr is NULL.
+ */
+static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
+			       bool is_server, bool reorder)
+{
+	int match;
+
+	if (!fv->sp.vec || !arr) {
+		DCCP_CRIT("NULL feature value or array");
+		return 0;
+	}
+
+	/* the server's list always takes priority in the matching */
+	match = is_server
+		? dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len)
+		: dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
+
+	if (reorder) {
+		if (match < 0)
+			return 0;
+		/*
+		 * Reorder list: used for activating features and in
+		 * dccp_insert_fn_opt.
+		 */
+		return dccp_feat_prefer(match, fv->sp.vec, fv->sp.len);
+	}
+	return match;
+}
+
+/**
+ * dccp_feat_change_recv  -  Process incoming ChangeL/R options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether the Change was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is the server (1) or the client (0)
+ * Returns 0 on success, a Reset code otherwise.
+ */
+static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+				u8 feat, u8 *val, u8 len, const bool server)
+{
+	u8 defval, type = dccp_feat_type(feat);
+	const bool local = (opt == DCCPO_CHANGE_R);
+	struct dccp_feat_entry *entry;
+	dccp_feat_val fval;
+
+	if (len == 0 || type == FEAT_UNKNOWN)		/* 6.1 and 6.6.8 */
+		goto unknown_feature_or_value;
+
+	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+	/*
+	 *	Negotiation of NN features: Change R is invalid, so there is no
+	 *	simultaneous negotiation; hence we do not look up in the list.
+	 */
+	if (type == FEAT_NN) {
+		if (local || len > sizeof(fval.nn))
+			goto unknown_feature_or_value;
+
+		/* 6.3.2: "The feature remote MUST accept any valid value..." */
+		fval.nn = dccp_decode_value_var(val, len);
+		if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+			goto unknown_feature_or_value;
+
+		return dccp_feat_push_confirm(fn, feat, local, &fval);
+	}
+
+	/*
+	 *	Unidirectional/simultaneous negotiation of SP features (6.3.1)
+	 */
+	entry = dccp_feat_list_lookup(fn, feat, local);
+	if (entry == NULL) {
+		/*
+		 * No particular preferences have been registered. We deal with
+		 * this situation by assuming that all valid values are equally
+		 * acceptable, and apply the following checks:
+		 * - if the peer's list is a singleton, we accept a valid value;
+		 * - if we are the server, we first try to see if the peer (the
+		 *   client) advertises the default value. If yes, we use it,
+		 *   otherwise we accept the preferred value;
+		 * - else if we are the client, we use the first list element.
+		 */
+		if (dccp_feat_clone_sp_val(&fval, val, 1))
+			return DCCP_RESET_CODE_TOO_BUSY;
+
+		if (len > 1 && server) {
+			defval = dccp_feat_default_value(feat);
+			if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
+				fval.sp.vec[0] = defval;
+		} else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
+			kfree(fval.sp.vec);
+			goto unknown_feature_or_value;
+		}
+
+		/* Treat unsupported CCIDs like invalid values */
+		if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
+			kfree(fval.sp.vec);
+			goto not_valid_or_not_known;
+		}
+
+		/*
+		 * The list entry only takes over fval.sp.vec when the push
+		 * succeeds; on failure the cloned vector is released here.
+		 */
+		if (dccp_feat_push_confirm(fn, feat, local, &fval)) {
+			kfree(fval.sp.vec);
+			return DCCP_RESET_CODE_TOO_BUSY;
+		}
+
+		return 0;
+
+	} else if (entry->state == FEAT_UNSTABLE) {	/* 6.6.2 */
+		return 0;
+	}
+
+	if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
+		entry->empty_confirm = 0;
+	} else if (is_mandatory) {
+		return DCCP_RESET_CODE_MANDATORY_ERROR;
+	} else if (entry->state == FEAT_INITIALISING) {
+		/*
+		 * Failed simultaneous negotiation (server only): try to `save'
+		 * the connection by checking whether entry contains the default
+		 * value for @feat. If yes, send an empty Confirm to signal that
+		 * the received Change was not understood - which implies using
+		 * the default value.
+		 * If this also fails, we use Reset as the last resort.
+		 */
+		WARN_ON(!server);
+		defval = dccp_feat_default_value(feat);
+		if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
+			return DCCP_RESET_CODE_OPTION_ERROR;
+		entry->empty_confirm = 1;
+	}
+	entry->needs_confirm   = 1;
+	entry->needs_mandatory = 0;
+	entry->state	       = FEAT_STABLE;
+	return 0;
+
+unknown_feature_or_value:
+	/* without Mandatory, an unknown feature/value gets an empty Confirm */
+	if (!is_mandatory)
+		return dccp_push_empty_confirm(fn, feat, local);
+
+not_valid_or_not_known:
+	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+			    : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_confirm_recv  -  Process received Confirm options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is server (1) or client (0)
+ * Returns 0 if the Confirm was accepted or ignored, a Reset code otherwise.
+ */
+static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+				 u8 feat, u8 *val, u8 len, const bool server)
+{
+	u8 *plist, plen, type = dccp_feat_type(feat);
+	const bool local = (opt == DCCPO_CONFIRM_R);
+	struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
+
+	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+	if (entry == NULL) {	/* nothing queued: ignore or handle error */
+		if (is_mandatory && type == FEAT_UNKNOWN)
+			return DCCP_RESET_CODE_MANDATORY_ERROR;
+
+		if (!local && type == FEAT_NN)		/* 6.3.2 */
+			goto confirmation_failed;
+		return 0;
+	}
+
+	/* Confirms are only processed while a Change is outstanding */
+	if (entry->state != FEAT_CHANGING)		/* 6.6.2 */
+		return 0;
+
+	if (len == 0) {
+		if (dccp_feat_must_be_understood(feat))	/* 6.6.7 */
+			goto confirmation_failed;
+		/*
+		 * Empty Confirm during connection setup: this means reverting
+		 * to the `old' value, which in this case is the default. Since
+		 * we handle default values automatically when no other values
+		 * have been set, we revert to the old value by removing this
+		 * entry from the list.
+		 */
+		dccp_feat_list_pop(entry);
+		return 0;
+	}
+
+	if (type == FEAT_NN) {
+		if (len > sizeof(entry->val.nn))
+			goto confirmation_failed;
+
+		/* the peer must confirm exactly the value that was requested */
+		if (entry->val.nn == dccp_decode_value_var(val, len))
+			goto confirmation_succeeded;
+
+		DCCP_WARN("Bogus Confirm for non-existing value\n");
+		goto confirmation_failed;
+	}
+
+	/*
+	 * Parsing SP Confirms: the first element of @val is the preferred
+	 * SP value which the peer confirms, the remainder depends on @len.
+	 * Note that only the confirmed value need to be a valid SP value.
+	 */
+	if (!dccp_feat_is_valid_sp_val(feat, *val))
+		goto confirmation_failed;
+
+	if (len == 1) {		/* peer didn't supply a preference list */
+		plist = val;
+		plen  = len;
+	} else {		/* preferred value + preference list */
+		plist = val + 1;
+		plen  = len - 1;
+	}
+
+	/* Check whether the peer got the reconciliation right (6.6.8) */
+	if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
+		DCCP_WARN("Confirm selected the wrong value %u\n", *val);
+		return DCCP_RESET_CODE_OPTION_ERROR;
+	}
+	/* store the confirmed value at the front of the local list */
+	entry->val.sp.vec[0] = *val;
+
+confirmation_succeeded:
+	entry->state = FEAT_STABLE;
+	return 0;
+
+confirmation_failed:
+	DCCP_WARN("Confirmation failed\n");
+	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+			    : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_parse_options  -  Process Feature-Negotiation Options
+ * @sk: for general use and used by the client during connection setup
+ * @dreq: used by the server during connection setup
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: value contents of @opt
+ * @len: length of @val in bytes
+ * Options are only processed during connection setup (socket in state
+ * DCCP_LISTEN or DCCP_REQUESTING) and are ignored in all other states.
+ * Returns 0 on success, a Reset code for ending the connection otherwise.
+ */
+int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+			    u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+	bool server;
+
+	/*
+	 *	Negotiation during connection setup: a listening socket acts
+	 *	as the server, a requesting socket as the client.
+	 */
+	switch (sk->sk_state) {
+	case DCCP_LISTEN:
+		server = true;
+		break;
+	case DCCP_REQUESTING:
+		server = false;
+		break;
+	default:
+		return 0;	/* ignore FN options in all other states */
+	}
+
+	if (opt == DCCPO_CHANGE_L || opt == DCCPO_CHANGE_R)
+		return dccp_feat_change_recv(fn, mandatory, opt, feat,
+					     val, len, server);
+
+	if (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)
+		return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
+					      val, len, server);
+
+	return 0;	/* not a feature-negotiation option */
+}
+
+/**
+ * dccp_feat_init  -  Seed feature negotiation with host-specific defaults
+ * @sk: socket whose feature-negotiation list is seeded
+ * This initialises global defaults, depending on the value of the sysctls.
+ * These can later be overridden by registering changes via setsockopt calls.
+ * The last link in the chain is finalise_settings, to make sure that between
+ * here and the start of actual feature negotiation no inconsistencies enter.
+ *
+ * All features not appearing below use either defaults or are otherwise
+ * later adjusted through dccp_feat_finalise_settings().
+ * Returns 0 on success, a negative error code otherwise.
+ */
+int dccp_feat_init(struct sock *sk)
+{
+	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+	u8 on = 1, off = 0;
+	int rc;
+	struct {
+		u8 *val;
+		u8 len;
+	} tx, rx;
+
+	/* Non-negotiable (NN) features */
+	rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
+				    sysctl_dccp_sequence_window);
+	if (rc)
+		return rc;
+
+	/* Server-priority (SP) features */
+
+	/* Advertise that short seqnos are not supported (7.6.1) */
+	rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
+	if (rc)
+		return rc;
+
+	/* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
+	rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
+	if (rc)
+		return rc;
+
+	/*
+	 * We advertise the available list of CCIDs and reorder according to
+	 * preferences, to avoid failure resulting from negotiating different
+	 * singleton values (which always leads to failure).
+	 * These settings can still (later) be overridden via sockopts.
+	 */
+	if (ccid_get_builtin_ccids(&tx.val, &tx.len))
+		return -ENOBUFS;
+	if (ccid_get_builtin_ccids(&rx.val, &rx.len)) {
+		/* do not leak the TX list when only the RX lookup fails */
+		kfree(tx.val);
+		return -ENOBUFS;
+	}
+
+	/*
+	 * NOTE(review): if a sysctl CCID is not among the built-in ones, this
+	 * bails out with rc still 0, i.e. success without registering any
+	 * CCID - confirm that this is intended.
+	 */
+	if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
+	    !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
+		goto free_ccid_lists;
+
+	rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
+	if (rc)
+		goto free_ccid_lists;
+
+	rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
+
+free_ccid_lists:
+	/* the registered lists are copies, so the originals are always freed */
+	kfree(tx.val);
+	kfree(rx.val);
+	return rc;
+}
+
+/*
+ * Activate the negotiated feature values of @fn_list on @sk.
+ * Every feature in dccp_feat_table is activated for both the remote and the
+ * local side, using the list value where one exists and the default otherwise.
+ * Afterwards, entries that do not need a Confirm are removed from @fn_list.
+ * Returns 0 on success. On failure -1 is returned after the socket's RX/TX
+ * CCIDs and RX Ack Vector have been released and their pointers cleared.
+ */
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_feat_entry *cur, *next;
+	int idx;
+	/* per table index: [0] = remote value, [1] = local value, NULL = default */
+	dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
+		 [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
+	};
+
+	list_for_each_entry(cur, fn_list, node) {
+		/*
+		 * An empty Confirm means that either an unknown feature type
+		 * or an invalid value was present. In the first case there is
+		 * nothing to activate, in the other the default value is used.
+		 */
+		if (cur->empty_confirm)
+			continue;
+
+		idx = dccp_feat_index(cur->feat_num);
+		if (idx < 0) {
+			DCCP_BUG("Unknown feature %u", cur->feat_num);
+			goto activation_failed;
+		}
+		/* anything not yet STABLE has not finished negotiating */
+		if (cur->state != FEAT_STABLE) {
+			DCCP_CRIT("Negotiation of %s %s failed in state %s",
+				  cur->is_local ? "local" : "remote",
+				  dccp_feat_fname(cur->feat_num),
+				  dccp_feat_sname[cur->state]);
+			goto activation_failed;
+		}
+		fvals[idx][cur->is_local] = &cur->val;
+	}
+
+	/*
+	 * Activate in decreasing order of index, so that the CCIDs are always
+	 * activated as the last feature. This avoids the case where a CCID
+	 * relies on the initialisation of one or more features that it depends
+	 * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
+	 */
+	for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
+		if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
+		    __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
+			DCCP_CRIT("Could not activate %d", idx);
+			goto activation_failed;
+		}
+
+	/* Clean up Change options which have been confirmed already */
+	list_for_each_entry_safe(cur, next, fn_list, node)
+		if (!cur->needs_confirm)
+			dccp_feat_list_pop(cur);
+
+	dccp_pr_debug("Activation OK\n");
+	return 0;
+
+activation_failed:
+	/*
+	 * We clean up everything that may have been allocated, since
+	 * it is difficult to track at which stage negotiation failed.
+	 * This is ok, since all allocation functions below are robust
+	 * against NULL arguments.
+	 */
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+	dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+	dp->dccps_hc_rx_ackvec = NULL;
+	return -1;
+}
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
new file mode 100644
index 00000000..e56a4e5e
--- /dev/null
+++ b/net/dccp/feat.h
@@ -0,0 +1,136 @@
+#ifndef _DCCP_FEAT_H
+#define _DCCP_FEAT_H
+/*
+ *  net/dccp/feat.h
+ *
+ *  Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *  Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ *  Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include "dccp.h"
+
+/*
+ * Known limit values
+ */
+/* Ack Ratio takes 2-byte integer values (11.3) */
+#define DCCPF_ACK_RATIO_MAX	0xFFFF
+/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
+#define DCCPF_SEQ_WMIN		32
+#define DCCPF_SEQ_WMAX		0x3FFFFFFFFFFFull
+/* Maximum number of SP values that fit in a single (Confirm) option */
+#define DCCP_FEAT_MAX_SP_VALS	(DCCP_SINGLE_OPT_MAXLEN - 2)
+
+enum dccp_feat_type {
+	FEAT_AT_RX   = 1,	/* located at RX side of half-connection  */
+	FEAT_AT_TX   = 2,	/* located at TX side of half-connection  */
+	FEAT_SP      = 4,	/* server-priority reconciliation (6.3.1) */
+	FEAT_NN	     = 8,	/* non-negotiable reconciliation (6.3.2)  */
+	FEAT_UNKNOWN = 0xFF	/* not understood or invalid feature	  */
+};
+
+enum dccp_feat_state {
+	FEAT_DEFAULT = 0,	/* using default values from 6.4 */
+	FEAT_INITIALISING,	/* feature is being initialised  */
+	FEAT_CHANGING,		/* Change sent but not confirmed yet */
+	FEAT_UNSTABLE,		/* local modification in state CHANGING */
+	FEAT_STABLE		/* both ends (think they) agree */
+};
+
+/**
+ * dccp_feat_val  -  Container for SP or NN feature values
+ * @nn:     single NN value
+ * @sp.vec: single SP value plus optional preference list
+ * @sp.len: length of @sp.vec in bytes
+ */
+typedef union {
+	u64 nn;
+	struct {
+		u8	*vec;
+		u8	len;
+	}   sp;
+} dccp_feat_val;
+
+/**
+ * struct dccp_feat_entry  -  Data structure to perform feature negotiation
+ * @val: feature's current value (SP features may have preference list)
+ * @state: feature's current state
+ * @feat_num: one of %dccp_feature_numbers
+ * @needs_mandatory: whether Mandatory options should be sent
+ * @needs_confirm: whether to send a Confirm instead of a Change
+ * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
+ * @is_local: feature location (1) or feature-remote (0)
+ * @node: list pointers, entries arranged in FIFO order
+ */
+struct dccp_feat_entry {
+	dccp_feat_val           val;
+	enum dccp_feat_state    state:8;
+	u8                      feat_num;
+
+	bool			needs_mandatory,
+				needs_confirm,
+				empty_confirm,
+				is_local;
+
+	struct list_head	node;
+};
+
+/* Select the option type (Change or Confirm, L or R) to send for @entry. */
+static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
+{
+	if (entry->is_local)
+		return entry->needs_confirm ? DCCPO_CONFIRM_L : DCCPO_CHANGE_L;
+
+	return entry->needs_confirm ? DCCPO_CONFIRM_R : DCCPO_CHANGE_R;
+}
+
+/**
+ * struct ccid_dependency  -  Track changes resulting from choosing a CCID
+ * @dependent_feat: one of %dccp_feature_numbers
+ * @is_local: local (1) or remote (0) @dependent_feat
+ * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
+ * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
+ */
+struct ccid_dependency {
+	u8	dependent_feat;
+	bool	is_local:1,
+		is_mandatory:1;
+	u8	val;
+};
+
+/*
+ * Sysctls to seed defaults for feature negotiation
+ */
+extern unsigned long sysctl_dccp_sequence_window;
+extern int	     sysctl_dccp_rx_ccid;
+extern int	     sysctl_dccp_tx_ccid;
+
+extern int  dccp_feat_init(struct sock *sk);
+extern void dccp_feat_initialise_sysctls(void);
+extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+				  u8 const *list, u8 len);
+extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
+				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
+extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
+
+/*
+ * Encoding variable-length options and their maximum length.
+ *
+ * This affects NN options (SP options are all u8) and other variable-length
+ * options (see table 3 in RFC 4340). The limit is currently given by the
+ * Sequence Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option;
+ * all other options consume less than 6 bytes (timestamps are 4 bytes).
+ * When updating this constant (e.g. due to new internet drafts / RFCs), make
+ * sure that you also update all code which refers to it.
+ */
+#define DCCP_OPTVAL_MAXLEN	6
+
+extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
+extern u64  dccp_decode_value_var(const u8 *bf, const u8 len);
+
+extern int  dccp_insert_option_mandatory(struct sk_buff *skb);
+extern int  dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+			       u8 *val, u8 len, bool repeat_first);
+#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
new file mode 100644
index 00000000..4222e7a6
--- /dev/null
+++ b/net/dccp/input.c
@@ -0,0 +1,732 @@
+/*
+ *  net/dccp/input.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+/*
+ * Rate-limit for Syncs sent in reply to sequence-invalid packets (RFC 4340,
+ * 7.5.4). The value is an interval in jiffies (default HZ / 8): it is added
+ * to a jiffies timestamp in dccp_check_seqno().
+ */
+int sysctl_dccp_sync_ratelimit	__read_mostly = HZ / 8;
+
+/*
+ * Strip the DCCP header (dccph_doff * 4 bytes) from @skb, append the buffer
+ * to the receive queue of @sk, charge it to the socket and invoke the
+ * socket's sk_data_ready() callback.
+ */
+static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	const unsigned int hdr_len = dccp_hdr(skb)->dccph_doff * 4;
+
+	__skb_pull(skb, hdr_len);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_set_owner_r(skb, sk);
+
+	sk->sk_data_ready(sk, 0);
+}
+
+/*
+ * Queue a connection-terminating packet @skb for the application and mark
+ * the socket as done: on receiving Close/CloseReq, both the read and the
+ * write side are shut down. RFC 4340, 8.3 says that we MAY send further
+ * Data/DataAcks after receiving the closing segment, but there is no
+ * guarantee that such data will be processed at all.
+ */
+static void dccp_fin(struct sock *sk, struct sk_buff *skb)
+{
+	sock_set_flag(sk, SOCK_DONE);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	dccp_enqueue_skb(sk, skb);
+}
+
+/*
+ * Handle a received Close packet according to the current socket state.
+ *
+ * Returns 1 if @skb was queued on the receive queue (via dccp_fin()), in
+ * which case the caller must not free it, and 0 otherwise. States without a
+ * case label in the switch below leave the packet unprocessed.
+ */
+static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
+{
+	int queued = 0;
+
+	switch (sk->sk_state) {
+	/*
+	 * We ignore Close when received in one of the following states:
+	 *  - CLOSED		(may be a late or duplicate packet)
+	 *  - PASSIVE_CLOSEREQ	(the peer has sent a CloseReq earlier)
+	 *  - RESPOND		(already handled by dccp_check_req)
+	 */
+	case DCCP_CLOSING:
+		/*
+		 * Simultaneous-close: receiving a Close after sending one. This
+		 * can happen if both client and server perform active-close and
+		 * will result in an endless ping-pong of crossing and retrans-
+		 * mitted Close packets, which only terminates when one of the
+		 * nodes times out (min. 64 seconds). Quicker convergence can be
+		 * achieved when one of the nodes acts as tie-breaker.
+		 * This is ok as both ends are done with data transfer and each
+		 * end is just waiting for the other to acknowledge termination.
+		 */
+		if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
+			break;
+		/* fall through */
+	case DCCP_REQUESTING:
+	case DCCP_ACTIVE_CLOSEREQ:
+		dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+		dccp_done(sk);
+		break;
+	case DCCP_OPEN:
+	case DCCP_PARTOPEN:
+		/* Give waiting application a chance to read pending data */
+		queued = 1;
+		dccp_fin(sk, skb);
+		dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
+		/* fall through */
+	case DCCP_PASSIVE_CLOSE:
+		/*
+		 * Retransmitted Close: we have already enqueued the first one.
+		 */
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+	}
+	return queued;
+}
+
+/*
+ * Handle a received CloseReq packet.
+ *
+ * A socket whose role is not DCCP_ROLE_CLIENT answers with a Sync and does
+ * nothing else. Returns 1 if @skb was queued on the receive queue (via
+ * dccp_fin()), in which case the caller must not free it, and 0 otherwise.
+ */
+static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+{
+	int queued = 0;
+
+	/*
+	 *   Step 7: Check for unexpected packet types
+	 *      If (S.is_server and P.type == CloseReq)
+	 *	  Send Sync packet acknowledging P.seqno
+	 *	  Drop packet and return
+	 */
+	if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
+		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+		return queued;
+	}
+
+	/* Step 13: process relevant Client states < CLOSEREQ */
+	switch (sk->sk_state) {
+	case DCCP_REQUESTING:
+		dccp_send_close(sk, 0);
+		dccp_set_state(sk, DCCP_CLOSING);
+		break;
+	case DCCP_OPEN:
+	case DCCP_PARTOPEN:
+		/* Give waiting application a chance to read pending data */
+		queued = 1;
+		dccp_fin(sk, skb);
+		dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
+		/* fall through */
+	case DCCP_PASSIVE_CLOSEREQ:
+		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+	}
+	return queued;
+}
+
+/*
+ * Map a DCCP Reset code onto an errno value. Codes of DCCP_MAX_RESET_CODES
+ * and above yield 0, as do codes which have no entry in the table.
+ */
+static u16 dccp_reset_code_convert(const u8 code)
+{
+	/* static: the table is read-only, so build it once, not on each call */
+	static const u16 error_code[] = {
+	[DCCP_RESET_CODE_CLOSED]	     = 0,	/* normal termination */
+	[DCCP_RESET_CODE_UNSPECIFIED]	     = 0,	/* nothing known */
+	[DCCP_RESET_CODE_ABORTED]	     = ECONNRESET,
+
+	[DCCP_RESET_CODE_NO_CONNECTION]	     = ECONNREFUSED,
+	[DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED,
+	[DCCP_RESET_CODE_TOO_BUSY]	     = EUSERS,
+	[DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT,
+
+	[DCCP_RESET_CODE_PACKET_ERROR]	     = ENOMSG,
+	[DCCP_RESET_CODE_BAD_INIT_COOKIE]    = EBADR,
+	[DCCP_RESET_CODE_BAD_SERVICE_CODE]   = EBADRQC,
+	[DCCP_RESET_CODE_OPTION_ERROR]	     = EILSEQ,
+	[DCCP_RESET_CODE_MANDATORY_ERROR]    = EOPNOTSUPP,
+	};
+
+	return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code];
+}
+
+/*
+ * Process a received Reset: record the error derived from the Reset code on
+ * the socket, queue the packet so that a reader is released, notify async
+ * waiters if there is an error to report, and enter TIMEWAIT.
+ */
+static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
+{
+	const u8 reset_code = dccp_hdr_reset(skb)->dccph_reset_code;
+	const u16 err = dccp_reset_code_convert(reset_code);
+
+	sk->sk_err = err;
+
+	/* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */
+	dccp_fin(sk, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD) && err)
+		sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+
+	dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+}
+
+/*
+ * Feed @skb into the receiver-side Ack Vector of @sk, if there is one. When
+ * the packet carries an acknowledgment number, the Ack Vector state covered
+ * by that number is cleared before the packet itself is registered.
+ */
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_ackvec *ackvec = dccp_sk(sk)->dccps_hc_rx_ackvec;
+
+	if (ackvec != NULL) {
+		if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+			dccp_ackvec_clear_state(ackvec,
+						DCCP_SKB_CB(skb)->dccpd_ack_seq);
+		dccp_ackvec_input(ackvec, skb);
+	}
+}
+
+/* Hand @skb to the RX and TX CCIDs of @sk, subject to the shutdown state. */
+static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_sock *dsk = dccp_sk(sk);
+
+	/* The RX CCID gets no input once the read end has been shut down. */
+	if ((sk->sk_shutdown & RCV_SHUTDOWN) == 0)
+		ccid_hc_rx_packet_recv(dsk->dccps_hc_rx_ccid, sk, skb);
+
+	/*
+	 * SHUT_WR can only be honoured after the TX queue has been drained:
+	 * until then, received feedback is still needed as input to adjust
+	 * congestion control.
+	 */
+	if (!(sk->sk_shutdown & SEND_SHUTDOWN) || sk->sk_write_queue.qlen > 0)
+		ccid_hc_tx_packet_recv(dsk->dccps_hc_tx_ccid, sk, skb);
+}
+
+/*
+ * Validate the sequence and acknowledgment numbers of @skb (Steps 5 and 6 of
+ * the packet-processing pseudocode quoted below) and update GSR/GAR.
+ *
+ * Returns 0 if the packet is sequence-valid and -1 if it has to be dropped.
+ * In the Step 6 failure case a rate-limited Sync is sent before returning -1.
+ */
+static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	struct dccp_sock *dp = dccp_sk(sk);
+	u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq,
+			ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+
+	/*
+	 *   Step 5: Prepare sequence numbers for Sync
+	 *     If P.type == Sync or P.type == SyncAck,
+	 *	  If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
+	 *	     / * P is valid, so update sequence number variables
+	 *		 accordingly.  After this update, P will pass the tests
+	 *		 in Step 6.  A SyncAck is generated if necessary in
+	 *		 Step 15 * /
+	 *	     Update S.GSR, S.SWL, S.SWH
+	 *	  Otherwise,
+	 *	     Drop packet and return
+	 */
+	if (dh->dccph_type == DCCP_PKT_SYNC ||
+	    dh->dccph_type == DCCP_PKT_SYNCACK) {
+		if (between48(ackno, dp->dccps_awl, dp->dccps_awh) &&
+		    dccp_delta_seqno(dp->dccps_swl, seqno) >= 0)
+			dccp_update_gsr(sk, seqno);
+		else
+			return -1;
+	}
+
+	/*
+	 *   Step 6: Check sequence numbers
+	 *      Let LSWL = S.SWL and LAWL = S.AWL
+	 *      If P.type == CloseReq or P.type == Close or P.type == Reset,
+	 *	  LSWL := S.GSR + 1, LAWL := S.GAR
+	 *      If LSWL <= P.seqno <= S.SWH
+	 *	     and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
+	 *	  Update S.GSR, S.SWL, S.SWH
+	 *	  If P.type != Sync,
+	 *	     Update S.GAR
+	 */
+	lswl = dp->dccps_swl;
+	lawl = dp->dccps_awl;
+
+	if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
+	    dh->dccph_type == DCCP_PKT_CLOSE ||
+	    dh->dccph_type == DCCP_PKT_RESET) {
+		lswl = ADD48(dp->dccps_gsr, 1);
+		lawl = dp->dccps_gar;
+	}
+
+	if (between48(seqno, lswl, dp->dccps_swh) &&
+	    (ackno == DCCP_PKT_WITHOUT_ACK_SEQ ||
+	     between48(ackno, lawl, dp->dccps_awh))) {
+		dccp_update_gsr(sk, seqno);
+
+		if (dh->dccph_type != DCCP_PKT_SYNC &&
+		    ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+		    after48(ackno, dp->dccps_gar))
+			dp->dccps_gar = ackno;
+	} else {
+		unsigned long now = jiffies;
+		/*
+		 *   Step 6: Check sequence numbers
+		 *      Otherwise,
+		 *         If P.type == Reset,
+		 *            Send Sync packet acknowledging S.GSR
+		 *         Otherwise,
+		 *            Send Sync packet acknowledging P.seqno
+		 *      Drop packet and return
+		 *
+		 *   These Syncs are rate-limited as per RFC 4340, 7.5.4:
+		 *   at most one Sync per sysctl_dccp_sync_ratelimit jiffies.
+		 */
+		if (time_before(now, (dp->dccps_rate_last +
+				      sysctl_dccp_sync_ratelimit)))
+			return -1;
+
+		DCCP_WARN("Step 6 failed for %s packet, "
+			  "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
+			  "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
+			  "sending SYNC...\n",  dccp_packet_name(dh->dccph_type),
+			  (unsigned long long) lswl, (unsigned long long) seqno,
+			  (unsigned long long) dp->dccps_swh,
+			  (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist"
+							      : "exists",
+			  (unsigned long long) lawl, (unsigned long long) ackno,
+			  (unsigned long long) dp->dccps_awh);
+
+		dp->dccps_rate_last = now;
+
+		if (dh->dccph_type == DCCP_PKT_RESET)
+			seqno = dp->dccps_gsr;
+		dccp_send_sync(sk, seqno, DCCP_PKT_SYNC);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Dispatch @skb according to its packet type. Data and DataAck packets are
+ * queued for the application; all other packets are either handed to a
+ * type-specific handler or freed here.
+ *
+ * Always returns 0. Note that @dh and @len are not used in the body: the
+ * packet type is re-read from @skb.
+ */
+static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+				  const struct dccp_hdr *dh, const unsigned len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	switch (dccp_hdr(skb)->dccph_type) {
+	case DCCP_PKT_DATAACK:
+	case DCCP_PKT_DATA:
+		/*
+		 * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
+		 * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
+		 * - sk_receive_queue is full, use Code 2, "Receive Buffer"
+		 */
+		dccp_enqueue_skb(sk, skb);
+		return 0;
+	case DCCP_PKT_ACK:
+		goto discard;
+	case DCCP_PKT_RESET:
+		/*
+		 *  Step 9: Process Reset
+		 *	If P.type == Reset,
+		 *		Tear down connection
+		 *		S.state := TIMEWAIT
+		 *		Set TIMEWAIT timer
+		 *		Drop packet and return
+		 */
+		dccp_rcv_reset(sk, skb);
+		return 0;
+	case DCCP_PKT_CLOSEREQ:
+		if (dccp_rcv_closereq(sk, skb))
+			return 0;
+		goto discard;
+	case DCCP_PKT_CLOSE:
+		if (dccp_rcv_close(sk, skb))
+			return 0;
+		goto discard;
+	case DCCP_PKT_REQUEST:
+		/* Step 7
+		 *   or (S.is_server and P.type == Response)
+		 *   or (S.is_client and P.type == Request)
+		 *   or (S.state >= OPEN and P.type == Request
+		 *	and P.seqno >= S.OSR)
+		 *    or (S.state >= OPEN and P.type == Response
+		 *	and P.seqno >= S.OSR)
+		 *    or (S.state == RESPOND and P.type == Data),
+		 *  Send Sync packet acknowledging P.seqno
+		 *  Drop packet and return
+		 */
+		if (dp->dccps_role != DCCP_ROLE_LISTEN)
+			goto send_sync;
+		goto check_seq;
+	case DCCP_PKT_RESPONSE:
+		if (dp->dccps_role != DCCP_ROLE_CLIENT)
+			goto send_sync;
+check_seq:
+		if (dccp_delta_seqno(dp->dccps_osr,
+				     DCCP_SKB_CB(skb)->dccpd_seq) >= 0) {
+send_sync:
+			dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+				       DCCP_PKT_SYNC);
+		}
+		break;
+	case DCCP_PKT_SYNC:
+		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+			       DCCP_PKT_SYNCACK);
+		/*
+		 * From RFC 4340, sec. 5.7
+		 *
+		 * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
+		 * MAY have non-zero-length application data areas, whose
+		 * contents receivers MUST ignore.
+		 */
+		goto discard;
+	}
+
+	/*
+	 * Only reached by leaving the switch via "break" (Request/Response)
+	 * or for a packet type without a case label: count it as an error.
+	 */
+	DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Receive path for an established connection: validate the sequence
+ * numbers, parse the options, feed the Ack Vector and the CCIDs, and then
+ * dispatch on the packet type.
+ *
+ * Returns 1 if option parsing failed (@skb is not freed here); otherwise 0,
+ * or whatever __dccp_rcv_established() returns.
+ */
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			 const struct dccp_hdr *dh, const unsigned len)
+{
+	if (dccp_check_seqno(sk, skb)) {
+		/* sequence-invalid packet: drop it */
+		__kfree_skb(skb);
+		return 0;
+	}
+
+	if (dccp_parse_options(sk, NULL, skb))
+		return 1;
+
+	dccp_handle_ackvec_processing(sk, skb);
+	dccp_deliver_input_to_ccids(sk, skb);
+
+	return __dccp_rcv_established(sk, skb, dh, len);
+}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_established);
+
+/*
+ * Process a packet received while the socket is in the REQUESTING state.
+ * Only a Response with a valid acknowledgment number is accepted; any other
+ * packet is answered with Reset code "Packet Error".
+ *
+ * Return values (see the REQUESTING case in dccp_rcv_state_process()):
+ *   1 - processing failed; @skb is not freed here and a Reset is to be sent
+ *   0 - the Response was accepted and @skb has been freed here
+ *  -1 - the Response was accepted and an Ack was sent; the caller frees @skb
+ */
+static int dccp_rcv_request_sent_state_process(struct sock *sk,
+					       struct sk_buff *skb,
+					       const struct dccp_hdr *dh,
+					       const unsigned len)
+{
+	/*
+	 *  Step 4: Prepare sequence numbers in REQUEST
+	 *     If S.state == REQUEST,
+	 *	  If (P.type == Response or P.type == Reset)
+	 *		and S.AWL <= P.ackno <= S.AWH,
+	 *	     / * Set sequence number variables corresponding to the
+	 *		other endpoint, so P will pass the tests in Step 6 * /
+	 *	     Set S.GSR, S.ISR, S.SWL, S.SWH
+	 *	     / * Response processing continues in Step 10; Reset
+	 *		processing continues in Step 9 * /
+	 */
+	if (dh->dccph_type == DCCP_PKT_RESPONSE) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		struct dccp_sock *dp = dccp_sk(sk);
+		long tstamp = dccp_timestamp();
+
+		if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			       dp->dccps_awl, dp->dccps_awh)) {
+			dccp_pr_debug("invalid ackno: S.AWL=%llu, "
+				      "P.ackno=%llu, S.AWH=%llu\n",
+				      (unsigned long long)dp->dccps_awl,
+			   (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
+				      (unsigned long long)dp->dccps_awh);
+			goto out_invalid_packet;
+		}
+
+		/*
+		 * If option processing (Step 8) failed, return 1 here so that
+		 * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
+		 * the option type and is set in dccp_parse_options().
+		 */
+		if (dccp_parse_options(sk, NULL, skb))
+			return 1;
+
+		/* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+		if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
+			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
+			    dp->dccps_options_received.dccpor_timestamp_echo));
+
+		/* Stop the REQUEST timer */
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+		WARN_ON(sk->sk_send_head == NULL);
+		kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+
+		/*
+		 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+		 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+		 * is done as part of activating the feature values below, since
+		 * these settings depend on the local/remote Sequence Window
+		 * features, which were undefined or not confirmed until now.
+		 */
+		dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+
+		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+
+		/*
+		 *    Step 10: Process REQUEST state (second part)
+		 *       If S.state == REQUEST,
+		 *	  / * If we get here, P is a valid Response from the
+		 *	      server (see Step 4), and we should move to
+		 *	      PARTOPEN state. PARTOPEN means send an Ack,
+		 *	      don't send Data packets, retransmit Acks
+		 *	      periodically, and always include any Init Cookie
+		 *	      from the Response * /
+		 *	  S.state := PARTOPEN
+		 *	  Set PARTOPEN timer
+		 *	  Continue with S.state == PARTOPEN
+		 *	  / * Step 12 will send the Ack completing the
+		 *	      three-way handshake * /
+		 */
+		dccp_set_state(sk, DCCP_PARTOPEN);
+
+		/*
+		 * If feature negotiation was successful, activate features now;
+		 * an activation failure means that this host could not activate
+		 * one or more features (e.g. insufficient memory), which would
+		 * leave at least one feature in an undefined state.
+		 */
+		if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
+			goto unable_to_proceed;
+
+		/* Make sure socket is routed, for correct metrics. */
+		icsk->icsk_af_ops->rebuild_header(sk);
+
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+		}
+
+		if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
+		    icsk->icsk_accept_queue.rskq_defer_accept) {
+			/* Save one ACK. Data will be ready after
+			 * several ticks, if write_pending is set.
+			 *
+			 * It may be deleted, but with this feature tcpdumps
+			 * look so _wonderfully_ clever, that I was not able
+			 * to stand against the temptation 8)     --ANK
+			 */
+			/*
+			 * OK, in DCCP we can as well do a similar trick, its
+			 * even in the draft, but there is no need for us to
+			 * schedule an ack here, as dccp_sendmsg does this for
+			 * us, also stated in the draft. -acme
+			 */
+			__kfree_skb(skb);
+			return 0;
+		}
+		dccp_send_ack(sk);
+		return -1;
+	}
+
+out_invalid_packet:
+	/* dccp_v4_do_rcv will send a reset */
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+	return 1;
+
+unable_to_proceed:
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
+	/*
+	 * We mark this socket as no longer usable, so that the loop in
+	 * dccp_sendmsg() terminates and the application gets notified.
+	 */
+	dccp_set_state(sk, DCCP_CLOSED);
+	sk->sk_err = ECOMM;
+	return 1;
+}
+
+/*
+ * Process a packet received in the RESPOND or PARTOPEN state.
+ *
+ * An Ack or DataAck (or, unless the state is RESPOND, a Data packet) moves
+ * the socket to OPEN. Returns 1 if @skb was passed on to
+ * __dccp_rcv_established() and must not be freed by the caller, 0 otherwise.
+ */
+static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
+						   struct sk_buff *skb,
+						   const struct dccp_hdr *dh,
+						   const unsigned len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
+	int queued = 0;
+
+	switch (dh->dccph_type) {
+	case DCCP_PKT_RESET:
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+		break;
+	case DCCP_PKT_DATA:
+		if (sk->sk_state == DCCP_RESPOND)
+			break;
+		/* fall through */
+	case DCCP_PKT_DATAACK:
+	case DCCP_PKT_ACK:
+		/*
+		 * FIXME: we should be resetting the PARTOPEN (DELACK) timer
+		 * here but only if we haven't used the DELACK timer for
+		 * something else, like sending a delayed ack for a TIMESTAMP
+		 * echo, etc, for now we're not clearing it, sending an extra
+		 * ACK when there is nothing else to do in DELACK is not a big
+		 * deal after all.
+		 */
+
+		/* Stop the PARTOPEN timer */
+		if (sk->sk_state == DCCP_PARTOPEN)
+			inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+
+		/* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+		if (likely(sample)) {
+			long delta = dccp_timestamp() - sample;
+
+			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
+		}
+
+		dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+		dccp_set_state(sk, DCCP_OPEN);
+
+		if (dh->dccph_type == DCCP_PKT_DATAACK ||
+		    dh->dccph_type == DCCP_PKT_DATA) {
+			__dccp_rcv_established(sk, skb, dh, len);
+			queued = 1; /* packet was queued
+				       (by __dccp_rcv_established) */
+		}
+		break;
+	}
+
+	return queued;
+}
+
+/*
+ * State-dependent processing of a received packet.
+ *
+ * Returns 0 if @skb has been dealt with here (queued, handed over or freed).
+ * Returns 1 if processing failed: @skb is then not freed here and, as the
+ * comments below note, the caller (dccp_v4_do_rcv) will send a Reset, using
+ * dccpd_reset_code where it has been set.
+ */
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+			   struct dccp_hdr *dh, unsigned len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+	const int old_state = sk->sk_state;
+	int queued = 0;
+
+	/*
+	 *  Step 3: Process LISTEN state
+	 *
+	 *     If S.state == LISTEN,
+	 *	 If P.type == Request or P contains a valid Init Cookie option,
+	 *	      (* Must scan the packet's options to check for Init
+	 *		 Cookies.  Only Init Cookies are processed here,
+	 *		 however; other options are processed in Step 8.  This
+	 *		 scan need only be performed if the endpoint uses Init
+	 *		 Cookies *)
+	 *	      (* Generate a new socket and switch to that socket *)
+	 *	      Set S := new socket for this port pair
+	 *	      S.state = RESPOND
+	 *	      Choose S.ISS (initial seqno) or set from Init Cookies
+	 *	      Initialize S.GAR := S.ISS
+	 *	      Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
+	 *	      Cookies Continue with S.state == RESPOND
+	 *	      (* A Response packet will be generated in Step 11 *)
+	 *	 Otherwise,
+	 *	      Generate Reset(No Connection) unless P.type == Reset
+	 *	      Drop packet and return
+	 */
+	if (sk->sk_state == DCCP_LISTEN) {
+		if (dh->dccph_type == DCCP_PKT_REQUEST) {
+			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
+								    skb) < 0)
+				return 1;
+			/*
+			 * conn_request() may have taken its own reference
+			 * on @skb (IPv6 keeps it for IPV6_RECVPKTINFO), so
+			 * drop only our reference instead of unconditionally
+			 * freeing the buffer with __kfree_skb().
+			 */
+			consume_skb(skb);
+			return 0;
+		}
+		if (dh->dccph_type == DCCP_PKT_RESET)
+			goto discard;
+
+		/* Caller (dccp_v4_do_rcv) will send Reset */
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+		return 1;
+	} else if (sk->sk_state == DCCP_CLOSED) {
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+		return 1;
+	}
+
+	if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) {
+		if (dccp_check_seqno(sk, skb))
+			goto discard;
+
+		/*
+		 * Step 8: Process options and mark acknowledgeable
+		 */
+		if (dccp_parse_options(sk, NULL, skb))
+			return 1;
+
+		dccp_handle_ackvec_processing(sk, skb);
+		dccp_deliver_input_to_ccids(sk, skb);
+	}
+
+	/*
+	 *  Step 9: Process Reset
+	 *	If P.type == Reset,
+	 *		Tear down connection
+	 *		S.state := TIMEWAIT
+	 *		Set TIMEWAIT timer
+	 *		Drop packet and return
+	 */
+	if (dh->dccph_type == DCCP_PKT_RESET) {
+		dccp_rcv_reset(sk, skb);
+		return 0;
+		/*
+		 *   Step 7: Check for unexpected packet types
+		 *      If (S.is_server and P.type == Response)
+		 *	    or (S.is_client and P.type == Request)
+		 *	    or (S.state == RESPOND and P.type == Data),
+		 *	  Send Sync packet acknowledging P.seqno
+		 *	  Drop packet and return
+		 */
+	} else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+		    dh->dccph_type == DCCP_PKT_RESPONSE) ||
+		    (dp->dccps_role == DCCP_ROLE_CLIENT &&
+		     dh->dccph_type == DCCP_PKT_REQUEST) ||
+		    (sk->sk_state == DCCP_RESPOND &&
+		     dh->dccph_type == DCCP_PKT_DATA)) {
+		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+		goto discard;
+	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+		if (dccp_rcv_closereq(sk, skb))
+			return 0;
+		goto discard;
+	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+		if (dccp_rcv_close(sk, skb))
+			return 0;
+		goto discard;
+	}
+
+	switch (sk->sk_state) {
+	case DCCP_REQUESTING:
+		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
+		/* 0 or 1: skb already dealt with, or a Reset is to be sent */
+		if (queued >= 0)
+			return queued;
+
+		/* -1: Response accepted, skb is still ours to free */
+		__kfree_skb(skb);
+		return 0;
+
+	case DCCP_RESPOND:
+	case DCCP_PARTOPEN:
+		queued = dccp_rcv_respond_partopen_state_process(sk, skb,
+								 dh, len);
+		break;
+	}
+
+	if (dh->dccph_type == DCCP_PKT_ACK ||
+	    dh->dccph_type == DCCP_PKT_DATAACK) {
+		switch (old_state) {
+		case DCCP_PARTOPEN:
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+			break;
+		}
+	} else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
+		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
+		goto discard;
+	}
+
+	if (!queued) {
+discard:
+		__kfree_skb(skb);
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
+
+/**
+ *  dccp_sample_rtt  -  Validate and finalise computation of RTT sample
+ *  @sk:	socket whose received Elapsed Time option (if any) is applied
+ *  @delta:	number of microseconds between packet and acknowledgment
+ *
+ *  Subtracts ten times the received elapsed time from @delta and clamps the
+ *  result to the range DCCP_SANE_RTT_MIN..DCCP_SANE_RTT_MAX, warning when a
+ *  sample had to be clamped. The routine is kept generic to work in
+ *  different contexts. It should be called immediately when the ACK used
+ *  for the RTT sample arrives.
+ */
+u32 dccp_sample_rtt(struct sock *sk, long delta)
+{
+	/* dccpor_elapsed_time is either zeroed out or set and > 0 */
+	delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
+
+	if (likely(delta > 0 && delta <= DCCP_SANE_RTT_MAX))
+		return delta;
+
+	if (delta <= 0) {
+		DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
+		return DCCP_SANE_RTT_MIN;
+	}
+
+	DCCP_WARN("RTT sample %ld too large, using max\n", delta);
+	return DCCP_SANE_RTT_MAX;
+}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
new file mode 100644
index 00000000..332639b5
--- /dev/null
+++ b/net/dccp/ipv4.c
@@ -0,0 +1,1080 @@
+/*
+ *  net/dccp/ipv4.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+
+#include <net/icmp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+#include <net/timewait_sock.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+/*
+ * The per-net dccp.v4_ctl_sk socket is used for responding to
+ * Out-of-the-blue (OOTB) packets. It is created when the network
+ * namespace is initialised (see dccp_v4_init_net()).
+ */
+
+/**
+ *	dccp_v4_connect  -  start an active (client) open over IPv4
+ *	@sk:	   socket to connect; expected to be owned by the caller (the
+ *		   inet_opt lookup is protected by sock_owned_by_user())
+ *	@uaddr:	   destination, interpreted as a struct sockaddr_in
+ *	@addr_len: length of @uaddr in bytes
+ *
+ *	Routes to the destination, moves the socket to DCCP_REQUESTING, selects
+ *	a source port and hashes the socket, picks the initial sequence number
+ *	and finally calls dccp_connect().
+ *
+ *	Returns 0 on success. Fails with -EINVAL (address too short, or source
+ *	routing with a zero destination), -EAFNOSUPPORT (family is not
+ *	AF_INET), -ENETUNREACH (multicast/broadcast route), or the error
+ *	reported by the routing lookups, inet_hash_connect() or dccp_connect().
+ *	On any failure after the state change the socket is put back into
+ *	DCCP_CLOSED.
+ */
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+	__be16 orig_sport, orig_dport;
+	__be32 daddr, nexthop;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	int err;
+	struct ip_options_rcu *inet_opt;
+
+	dp->dccps_role = DCCP_ROLE_CLIENT;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	nexthop = daddr = usin->sin_addr.s_addr;
+
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	/* With a source route the first hop comes from the IP options */
+	if (inet_opt != NULL && inet_opt->opt.srr) {
+		if (daddr == 0)
+			return -EINVAL;
+		nexthop = inet_opt->opt.faddr;
+	}
+
+	orig_sport = inet->inet_sport;
+	orig_dport = usin->sin_port;
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+			      IPPROTO_DCCP,
+			      orig_sport, orig_dport, sk, true);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (inet_opt == NULL || !inet_opt->opt.srr)
+		daddr = fl4->daddr;
+
+	if (inet->inet_saddr == 0)
+		inet->inet_saddr = fl4->saddr;
+	inet->inet_rcv_saddr = inet->inet_saddr;
+
+	inet->inet_dport = usin->sin_port;
+	inet->inet_daddr = daddr;
+
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
+	if (inet_opt)
+		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+	/*
+	 * Socket identity is still unknown (sport may be zero).
+	 * However we set state to DCCP_REQUESTING and not releasing socket
+	 * lock select source port, enter ourselves into the hash tables and
+	 * complete initialization after this.
+	 */
+	dccp_set_state(sk, DCCP_REQUESTING);
+	err = inet_hash_connect(&dccp_death_row, sk);
+	if (err != 0)
+		goto failure;
+
+	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+			       inet->inet_sport, inet->inet_dport, sk);
+	if (IS_ERR(rt)) {
+		/*
+		 * err is still 0 from the successful inet_hash_connect();
+		 * without this the caller would be told the connect succeeded
+		 * although the socket is closed again below.
+		 */
+		err = PTR_ERR(rt);
+		rt = NULL;
+		goto failure;
+	}
+	/* OK, now commit destination to socket.  */
+	sk_setup_caps(sk, &rt->dst);
+
+	dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
+						    inet->inet_daddr,
+						    inet->inet_sport,
+						    inet->inet_dport);
+	inet->inet_id = dp->dccps_iss ^ jiffies;
+
+	err = dccp_connect(sk);
+	/* The route was handed to sk_setup_caps(); do not put it again */
+	rt = NULL;
+	if (err != 0)
+		goto failure;
+out:
+	return err;
+failure:
+	/*
+	 * This unhashes the socket and releases the local port, if necessary.
+	 */
+	dccp_set_state(sk, DCCP_CLOSED);
+	ip_rt_put(rt);
+	sk->sk_route_caps = 0;
+	inet->inet_dport = 0;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_connect);
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ *
+ * @sk:  socket the ICMP error was matched to
+ * @iph: IP header quoted in the ICMP error (not referenced in the body)
+ * @mtu: MTU supplied by the caller (dccp_v4_err() passes its info argument)
+ *
+ * Updates the cached route's PMTU and, if the socket's PMTU cookie is now
+ * larger than the route MTU (and PMTU discovery is not disabled on the
+ * socket), re-syncs the MSS and sends a DCCP-Sync.
+ */
+static inline void dccp_do_pmtu_discovery(struct sock *sk,
+					  const struct iphdr *iph,
+					  u32 mtu)
+{
+	struct dst_entry *dst;
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	/* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
+	 * sent out by Linux are always < 576bytes so they should go through
+	 * unfragmented).
+	 */
+	if (sk->sk_state == DCCP_LISTEN)
+		return;
+
+	/* We don't check in the dst entry if pmtu discovery is forbidden
+	 * on this route. We just assume that no packet-too-big packets
+	 * are sent back when pmtu discovery is not active.
+	 * There is a small race when the user changes this flag in the
+	 * route, but I think that's acceptable.
+	 */
+	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+		return;
+
+	dst->ops->update_pmtu(dst, mtu);
+
+	/* Something is about to be wrong... Remember soft error
+	 * for the case, if this connection will not able to recover.
+	 */
+	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+		sk->sk_err_soft = EMSGSIZE;
+
+	/* From here on work with the MTU the route reports after the update */
+	mtu = dst_mtu(dst);
+
+	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
+		dccp_sync_mss(sk, mtu);
+
+		/*
+		 * From RFC 4340, sec. 14.1:
+		 *
+		 *	DCCP-Sync packets are the best choice for upward
+		 *	probing, since DCCP-Sync probes do not risk application
+		 *	data loss.
+		 */
+		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+	} /* else let the usual retransmit timer handle it */
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some sort of error
+ * condition. skb->data points at the IP header quoted in the ICMP payload;
+ * the quoted DCCP header follows it at offset ihl * 4 and supplies the ports
+ * used to look up the affected socket.
+ *
+ * @skb:  the ICMP error packet
+ * @info: extra ICMP information; only used as the MTU for ICMP_FRAG_NEEDED
+ *
+ * The locking strategy used here is very "optimistic". When someone else
+ * accesses the socket the ICMP is just dropped and for some paths there is no
+ * check at all. A more general error queue to queue errors for later handling
+ * is probably better.
+ */
+static void dccp_v4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (struct iphdr *)skb->data;
+	const u8 offset = iph->ihl << 2;
+	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+	struct dccp_sock *dp;
+	struct inet_sock *inet;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct sock *sk;
+	__u64 seq;
+	int err;
+	struct net *net = dev_net(skb->dev);
+
+	/* The quoted packet must at least contain the basic DCCP header */
+	if (skb->len < offset + sizeof(*dh) ||
+	    skb->len < offset + __dccp_basic_hdr_len(dh)) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		return;
+	}
+
+	/*
+	 * The quoted packet was sent by us, so its destination is the remote
+	 * end and its source is the local end of the connection.
+	 */
+	sk = inet_lookup(net, &dccp_hashinfo,
+			iph->daddr, dh->dccph_dport,
+			iph->saddr, dh->dccph_sport, inet_iif(skb));
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		return;
+	}
+
+	/* Errors for timewait sockets are ignored; just release the socket */
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(sk));
+		return;
+	}
+
+	bh_lock_sock(sk);
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 */
+	/* Only counted here; the individual paths below re-check ownership */
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == DCCP_CLOSED)
+		goto out;
+
+	dp = dccp_sk(sk);
+	seq = dccp_hdr_seq(dh);
+	/*
+	 * Outside REQUESTING and LISTEN the quoted sequence number has to
+	 * pass between48() against dccps_awl and dccps_awh.
+	 */
+	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+	    !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	/* Translate the ICMP type/code into an errno value */
+	switch (type) {
+	case ICMP_SOURCE_QUENCH:
+		/* Just silently ignore these. */
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out;
+
+		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			if (!sock_owned_by_user(sk))
+				dccp_do_pmtu_discovery(sk, iph, info);
+			goto out;
+		}
+
+		err = icmp_err_convert[code].errno;
+		break;
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (sk->sk_state) {
+		struct request_sock *req , **prev;
+	case DCCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+		req = inet_csk_search_req(sk, &prev, dh->dccph_dport,
+					  iph->daddr, iph->saddr);
+		if (!req)
+			goto out;
+
+		/*
+		 * ICMPs are not backlogged, hence we cannot get an established
+		 * socket here.
+		 */
+		WARN_ON(req->sk);
+
+		/* The error must quote the sequence number of our Response */
+		if (seq != dccp_rsk(req)->dreq_iss) {
+			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+		/*
+		 * Still in RESPOND, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case DCCP_REQUESTING:
+	case DCCP_RESPOND:
+		/* Handshake not finished: a hard error ends the attempt */
+		if (!sock_owned_by_user(sk)) {
+			DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+			sk->sk_err = err;
+
+			sk->sk_error_report(sk);
+
+			dccp_done(sk);
+		} else
+			sk->sk_err_soft = err;
+		goto out;
+	}
+
+	/* If we've already connected we will keep trying
+	 * until we time out, or the user gives up.
+	 *
+	 * rfc1122 4.2.3.9 allows to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by pmtu discovery).
+	 *
+	 * Note, that in modern internet, where routing is unreliable
+	 * and in each dark corner broken firewalls sit, sending random
+	 * errors ordered by their masters even this two messages finally lose
+	 * their original sense (even Linux sends invalid PORT_UNREACHs)
+	 *
+	 * Now we are in compliance with RFCs.
+	 *							--ANK (980905)
+	 */
+
+	inet = inet_sk(sk);
+	if (!sock_owned_by_user(sk) && inet->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else /* Only an error on timeout */
+		sk->sk_err_soft = err;
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ * Fold the IPv4 pseudo-header (@src, @dst, skb->len, IPPROTO_DCCP) into the
+ * partial checksum held in skb->csum and return the resulting value.
+ */
+static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
+					  __be32 src, __be32 dst)
+{
+	const __wsum partial = skb->csum;
+
+	return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, partial);
+}
+
+/*
+ * Fill in the checksum field of an outgoing packet: dccp_csum_outgoing()
+ * prepares skb->csum, then the pseudo-header built from the socket's source
+ * and destination addresses is folded in.
+ */
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_hdr *hdr = dccp_hdr(skb);
+	const struct inet_sock *isk = inet_sk(sk);
+	__sum16 csum;
+
+	dccp_csum_outgoing(skb);
+	csum = dccp_v4_csum_finish(skb, isk->inet_saddr, isk->inet_daddr);
+	hdr->dccph_checksum = csum;
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_send_check);
+
+/*
+ * Derive an initial sequence number from a received packet. The packet's
+ * destination address/port are passed first and its source address/port
+ * second, i.e. seen from the receiving (local) side.
+ */
+static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+
+	return secure_dccp_sequence_number(iph->daddr, iph->saddr,
+					   dh->dccph_dport, dh->dccph_sport);
+}
+
+/*
+ * The three way handshake has completed - we got a valid ACK or DATAACK -
+ * now create the new socket.
+ *
+ * This is the equivalent of TCP's tcp_v4_syn_recv_sock
+ *
+ * @sk:  listening socket
+ * @skb: packet that completed the handshake
+ * @req: pending connection request the child is created from
+ * @dst: route for the child, or NULL to have one looked up here
+ *
+ * Returns the new child socket, or NULL on failure (LINUX_MIB_LISTENDROPS is
+ * incremented on every failure path).
+ */
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+				       struct request_sock *req,
+				       struct dst_entry *dst)
+{
+	struct inet_request_sock *ireq;
+	struct inet_sock *newinet;
+	struct sock *newsk;
+
+	if (sk_acceptq_is_full(sk))
+		goto exit_overflow;
+
+	newsk = dccp_create_openreq_child(sk, req, skb);
+	if (newsk == NULL)
+		goto exit_nonewsk;
+
+	/* Copy addressing from the request into the child */
+	newinet		   = inet_sk(newsk);
+	ireq		   = inet_rsk(req);
+	newinet->inet_daddr	= ireq->rmt_addr;
+	newinet->inet_rcv_saddr = ireq->loc_addr;
+	newinet->inet_saddr	= ireq->loc_addr;
+	/* The IP options move to the child; clear them in the request */
+	newinet->inet_opt	= ireq->opt;
+	ireq->opt	   = NULL;
+	newinet->mc_index  = inet_iif(skb);
+	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
+	newinet->inet_id   = jiffies;
+
+	if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
+		goto put_and_exit;
+
+	sk_setup_caps(newsk, dst);
+
+	dccp_sync_mss(newsk, dst_mtu(dst));
+
+	if (__inet_inherit_port(sk, newsk) < 0)
+		goto put_and_exit;
+	__inet_hash_nolisten(newsk, NULL);
+
+	return newsk;
+
+exit_overflow:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	/* No child exists on these two paths, so dst is released here */
+	dst_release(dst);
+exit:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	return NULL;
+put_and_exit:
+	/* A child was created: drop it with sock_put(); dst is not released here */
+	sock_put(newsk);
+	goto exit;
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
+
+/*
+ * Find the socket that should handle a packet received on listening socket
+ * @sk. Returns:
+ *   - the result of dccp_check_req() if a pending connection request matches,
+ *   - a matching established socket, locked with bh_lock_sock(),
+ *   - NULL if the match is a timewait socket (which is released here),
+ *   - @sk itself if nothing else matches.
+ */
+static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	struct request_sock **prev;
+	struct request_sock *req;
+	struct sock *est;
+
+	/* Find possible connection requests. */
+	req = inet_csk_search_req(sk, &prev, dh->dccph_sport,
+				  iph->saddr, iph->daddr);
+	if (req != NULL)
+		return dccp_check_req(sk, skb, req, prev);
+
+	est = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
+				      iph->saddr, dh->dccph_sport,
+				      iph->daddr, dh->dccph_dport,
+				      inet_iif(skb));
+	if (est == NULL)
+		return sk;
+
+	if (est->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(est));
+		return NULL;
+	}
+
+	bh_lock_sock(est);
+	return est;
+}
+
+/*
+ * Look up a route for replying to @skb: addresses and ports of the received
+ * packet are swapped to build the flow. Returns the route's dst entry, or
+ * NULL (after bumping IPSTATS_MIB_OUTNOROUTES) if no route was found.
+ */
+static struct dst_entry *dccp_v4_route_skb(struct net *net, struct sock *sk,
+					   struct sk_buff *skb)
+{
+	struct flowi4 fl4 = {
+		.flowi4_oif = skb_rtable(skb)->rt_iif,
+		.daddr = ip_hdr(skb)->saddr,
+		.saddr = ip_hdr(skb)->daddr,
+		.flowi4_tos = RT_CONN_FLAGS(sk),
+		.flowi4_proto = sk->sk_protocol,
+		.fl4_sport = dccp_hdr(skb)->dccph_dport,
+		.fl4_dport = dccp_hdr(skb)->dccph_sport,
+	};
+	struct rtable *route;
+
+	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+	route = ip_route_output_flow(net, &fl4, sk);
+	if (!IS_ERR(route))
+		return &route->dst;
+
+	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+	return NULL;
+}
+
+/*
+ * Build and transmit the Response for connection request @req.
+ * Returns -1 if no route or no packet could be obtained, otherwise the
+ * transmit result after net_xmit_eval(). @rv_unused is ignored.
+ */
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
+				 struct request_values *rv_unused)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct dst_entry *dst;
+	struct sk_buff *skb;
+	struct flowi4 fl4;
+	int err = -1;
+
+	dst = inet_csk_route_req(sk, &fl4, req);
+	if (dst == NULL)
+		goto out;
+
+	skb = dccp_make_response(sk, dst, req);
+	if (skb == NULL)
+		goto out;
+
+	dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb,
+							    ireq->loc_addr,
+							    ireq->rmt_addr);
+	err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+				    ireq->rmt_addr,
+				    ireq->opt);
+	err = net_xmit_eval(err);
+
+out:
+	dst_release(dst);
+	return err;
+}
+
+/*
+ * Send a Reset in reply to @rxskb using the per-net control socket.
+ * @sk is not referenced in the body (dccp_v4_rcv() may pass NULL here).
+ * Nothing is sent if @rxskb is itself a Reset, if it was not routed as
+ * RTN_LOCAL, or if no route or packet can be obtained.
+ */
+static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+{
+	int err;
+	const struct iphdr *rxiph;
+	struct sk_buff *skb;
+	struct dst_entry *dst;
+	struct net *net = dev_net(skb_dst(rxskb)->dev);
+	struct sock *ctl_sk = net->dccp.v4_ctl_sk;
+
+	/* Never send a reset in response to a reset. */
+	if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
+		return;
+
+	if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
+		return;
+
+	dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
+	if (dst == NULL)
+		return;
+
+	skb = dccp_ctl_make_reset(ctl_sk, rxskb);
+	if (skb == NULL)
+		goto out;
+
+	rxiph = ip_hdr(rxskb);
+	dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
+								 rxiph->daddr);
+	/* The skb gets its own reference; ours is dropped at out: */
+	skb_dst_set(skb, dst_clone(dst));
+
+	/* The reply goes from the received packet's destination to its source */
+	bh_lock_sock(ctl_sk);
+	err = ip_build_and_send_pkt(skb, ctl_sk,
+				    rxiph->daddr, rxiph->saddr, NULL);
+	bh_unlock_sock(ctl_sk);
+
+	if (net_xmit_eval(err) == 0) {
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+	}
+out:
+	 dst_release(dst);
+}
+
+/*
+ * Release what a DCCP/IPv4 request sock owns: its feature-negotiation list
+ * and the saved IP options.
+ */
+static void dccp_v4_reqsk_destructor(struct request_sock *req)
+{
+	struct dccp_request_sock *dreq = dccp_rsk(req);
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	dccp_feat_list_purge(&dreq->dreq_featneg);
+	kfree(ireq->opt);
+}
+
+/*
+ * Request-sock operations for DCCP over IPv4. Referenced from
+ * dccp_v4_prot.rsk_prot and passed to inet_reqsk_alloc() in
+ * dccp_v4_conn_request().
+ */
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
+	.family		= PF_INET,
+	.obj_size	= sizeof(struct dccp_request_sock),
+	.rtx_syn_ack	= dccp_v4_send_response,
+	.send_ack	= dccp_reqsk_send_ack,
+	.destructor	= dccp_v4_reqsk_destructor,
+	.send_reset	= dccp_v4_ctl_send_reset,
+};
+
+/*
+ * Handle an incoming connection request on listening socket @sk: allocate
+ * and initialise a request sock, send the Response and queue the request.
+ *
+ * Returns 0 if the request was queued or the packet was silently discarded
+ * (broadcast/multicast), -1 otherwise. Before returning -1 the reset code in
+ * the skb's control block is set to BAD_SERVICE_CODE or TOO_BUSY, and
+ * DCCP_MIB_ATTEMPTFAILS is incremented.
+ */
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq;
+	struct request_sock *req;
+	struct dccp_request_sock *dreq;
+	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+
+	/* Never answer to DCCP_PKT_REQUESTs sent to broadcast or multicast */
+	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return 0;	/* discard, don't send a reset here */
+
+	if (dccp_bad_service_code(sk, service)) {
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+		goto drop;
+	}
+	/*
+	 * TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	/* Every failure from here on is reported with this reset code */
+	dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+	if (inet_csk_reqsk_queue_is_full(sk))
+		goto drop;
+
+	/*
+	 * Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+		goto drop;
+
+	req = inet_reqsk_alloc(&dccp_request_sock_ops);
+	if (req == NULL)
+		goto drop;
+
+	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+		goto drop_and_free;
+
+	dreq = dccp_rsk(req);
+	if (dccp_parse_options(sk, dreq, skb))
+		goto drop_and_free;
+
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
+	/* Local/remote are seen from our side: the packet's daddr is local */
+	ireq = inet_rsk(req);
+	ireq->loc_addr = ip_hdr(skb)->daddr;
+	ireq->rmt_addr = ip_hdr(skb)->saddr;
+
+	/*
+	 * Step 3: Process LISTEN state
+	 *
+	 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+	 *
+	 * In fact we defer setting S.GSR, S.SWL, S.SWH to
+	 * dccp_create_openreq_child.
+	 */
+	dreq->dreq_isr	   = dcb->dccpd_seq;
+	dreq->dreq_iss	   = dccp_v4_init_sequence(skb);
+	dreq->dreq_service = service;
+
+	/* Only queue the request once the Response has been sent */
+	if (dccp_v4_send_response(sk, req, NULL))
+		goto drop_and_free;
+
+	inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+	return 0;
+
+drop_and_free:
+	reqsk_free(req);
+drop:
+	DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+	return -1;
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
+
+/*
+ * Process a packet for socket @sk; also installed as the backlog_rcv handler
+ * in dccp_v4_prot. Always returns 0.
+ *
+ * A non-zero result from the state processing functions leads to a Reset
+ * being sent and the skb being freed here. A NULL result from
+ * dccp_v4_hnd_req() frees the skb without sending a Reset.
+ */
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_hdr *dh = dccp_hdr(skb);
+
+	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+		if (dccp_rcv_established(sk, skb, dh, skb->len))
+			goto reset;
+		return 0;
+	}
+
+	/*
+	 *  Step 3: Process LISTEN state
+	 *	 If P.type == Request or P contains a valid Init Cookie option,
+	 *	      (* Must scan the packet's options to check for Init
+	 *		 Cookies.  Only Init Cookies are processed here,
+	 *		 however; other options are processed in Step 8.  This
+	 *		 scan need only be performed if the endpoint uses Init
+	 *		 Cookies *)
+	 *	      (* Generate a new socket and switch to that socket *)
+	 *	      Set S := new socket for this port pair
+	 *	      S.state = RESPOND
+	 *	      Choose S.ISS (initial seqno) or set from Init Cookies
+	 *	      Initialize S.GAR := S.ISS
+	 *	      Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+	 *	      Continue with S.state == RESPOND
+	 *	      (* A Response packet will be generated in Step 11 *)
+	 *	 Otherwise,
+	 *	      Generate Reset(No Connection) unless P.type == Reset
+	 *	      Drop packet and return
+	 *
+	 * NOTE: the check for the packet types is done in
+	 *	 dccp_rcv_state_process
+	 */
+	if (sk->sk_state == DCCP_LISTEN) {
+		struct sock *nsk = dccp_v4_hnd_req(sk, skb);
+
+		if (nsk == NULL)
+			goto discard;
+
+		/* A different socket was found: hand the packet to it */
+		if (nsk != sk) {
+			if (dccp_child_process(sk, nsk, skb))
+				goto reset;
+			return 0;
+		}
+	}
+
+	if (dccp_rcv_state_process(sk, skb, dh, skb->len))
+		goto reset;
+	return 0;
+
+reset:
+	dccp_v4_ctl_send_reset(sk, skb);
+discard:
+	kfree_skb(skb);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
+
+/**
+ *	dccp_invalid_packet  -  check for malformed packets
+ *	@skb: received packet; pskb_may_pull() is applied to it, which may
+ *	      reallocate the skb's head
+ *
+ *	Implements RFC 4340, 8.5:  Step 1: Check header basics
+ *	Packets that fail these checks are ignored and do not receive Resets.
+ *
+ *	Returns 1 if the packet is malformed and 0 otherwise. On a 0 return
+ *	skb->csum holds the checksum over the covered part of the packet; the
+ *	pseudo-header is added by the AF-dependent callers.
+ */
+int dccp_invalid_packet(struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh;
+	unsigned int cscov;
+	u8 dccph_doff;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return 1;
+
+	/* If the packet is shorter than 12 bytes, drop packet and return */
+	if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
+		DCCP_WARN("pskb_may_pull failed\n");
+		return 1;
+	}
+
+	dh = dccp_hdr(skb);
+
+	/* If P.type is not understood, drop packet and return */
+	if (dh->dccph_type >= DCCP_PKT_INVALID) {
+		DCCP_WARN("invalid packet type\n");
+		return 1;
+	}
+
+	/*
+	 * If P.Data Offset is too small for packet type, drop packet and return
+	 */
+	/* Cache the value: dh must not be used across the pull below */
+	dccph_doff = dh->dccph_doff;
+	if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+		DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff);
+		return 1;
+	}
+	/*
+	 * If P.Data Offset is too large for packet, drop packet and return
+	 */
+	if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
+		DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
+		return 1;
+	}
+	/*
+	 * pskb_may_pull() may have reallocated skb->head, which would leave
+	 * the old dh pointing into freed memory: reload it.
+	 */
+	dh = dccp_hdr(skb);
+
+	/*
+	 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
+	 * has short sequence numbers), drop packet and return
+	 */
+	if ((dh->dccph_type < DCCP_PKT_DATA    ||
+	    dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
+		DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
+			  dccp_packet_name(dh->dccph_type));
+		return 1;
+	}
+
+	/*
+	 * If P.CsCov is too large for the packet size, drop packet and return.
+	 * This must come _before_ checksumming (not as RFC 4340 suggests).
+	 */
+	cscov = dccp_csum_coverage(skb);
+	if (cscov > skb->len) {
+		DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
+			  dh->dccph_cscov, skb->len);
+		return 1;
+	}
+
+	/* If header checksum is incorrect, drop packet and return.
+	 * (This step is completed in the AF-dependent functions.) */
+	skb->csum = skb_checksum(skb, 0, cscov, 0);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_invalid_packet);
+
+/*
+ * this is called when real data arrives
+ *
+ * Installed as the receive handler in dccp_v4_protocol. Validates the
+ * packet, fills in the skb control block, looks up the owning socket and
+ * hands the packet to sk_receive_skb(). Packets without a usable socket are
+ * answered with a Reset (unless they are Resets themselves) and freed.
+ * Returns 0 on every path except the sk_receive_skb() one, whose result is
+ * returned as is.
+ */
+static int dccp_v4_rcv(struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh;
+	const struct iphdr *iph;
+	struct sock *sk;
+	int min_cov;
+
+	/* Step 1: Check header basics */
+
+	if (dccp_invalid_packet(skb))
+		goto discard_it;
+
+	iph = ip_hdr(skb);
+	/* Step 1: If header checksum is incorrect, drop packet and return */
+	if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) {
+		DCCP_WARN("dropped packet with invalid checksum\n");
+		goto discard_it;
+	}
+
+	dh = dccp_hdr(skb);
+
+	/* Cache sequence number and type in the skb control block */
+	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(dh);
+	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+	dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
+		      dccp_packet_name(dh->dccph_type),
+		      &iph->saddr, ntohs(dh->dccph_sport),
+		      &iph->daddr, ntohs(dh->dccph_dport),
+		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+	/* Packets without an ack number get a marker value instead */
+	if (dccp_packet_without_ack(skb)) {
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+		dccp_pr_debug_cat("\n");
+	} else {
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+		dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
+				  DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	}
+
+	/* Step 2:
+	 *	Look up flow ID in table and get corresponding socket */
+	sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+			       dh->dccph_sport, dh->dccph_dport);
+	/*
+	 * Step 2:
+	 *	If no socket ...
+	 */
+	if (sk == NULL) {
+		dccp_pr_debug("failed to look up flow ID in table and "
+			      "get corresponding socket\n");
+		goto no_dccp_socket;
+	}
+
+	/*
+	 * Step 2:
+	 *	... or S.state == TIMEWAIT,
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
+		inet_twsk_put(inet_twsk(sk));
+		goto no_dccp_socket;
+	}
+
+	/*
+	 * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
+	 *	o if MinCsCov = 0, only packets with CsCov = 0 are accepted
+	 *	o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
+	 */
+	min_cov = dccp_sk(sk)->dccps_pcrlen;
+	if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov))  {
+		dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
+			      dh->dccph_cscov, min_cov);
+		/* FIXME: "Such packets SHOULD be reported using Data Dropped
+		 *         options (Section 11.7) with Drop Code 0, Protocol
+		 *         Constraints."                                     */
+		goto discard_and_relse;
+	}
+
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_and_relse;
+	nf_reset(skb);
+
+	return sk_receive_skb(sk, skb, 1);
+
+no_dccp_socket:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+	/*
+	 * Step 2:
+	 *	If no socket ...
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (dh->dccph_type != DCCP_PKT_RESET) {
+		DCCP_SKB_CB(skb)->dccpd_reset_code =
+					DCCP_RESET_CODE_NO_CONNECTION;
+		/* sk is NULL or already released here; the callee does not use it */
+		dccp_v4_ctl_send_reset(sk, skb);
+	}
+
+discard_it:
+	kfree_skb(skb);
+	return 0;
+
+discard_and_relse:
+	sock_put(sk);
+	goto discard_it;
+}
+
+/*
+ * Address-family specific connection-sock operations for DCCP over IPv4;
+ * installed as icsk_af_ops by dccp_v4_init_sock().
+ */
+static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+	.queue_xmit	   = ip_queue_xmit,
+	.send_check	   = dccp_v4_send_check,
+	.rebuild_header	   = inet_sk_rebuild_header,
+	.conn_request	   = dccp_v4_conn_request,
+	.syn_recv_sock	   = dccp_v4_request_recv_sock,
+	.net_header_len	   = sizeof(struct iphdr),
+	.setsockopt	   = ip_setsockopt,
+	.getsockopt	   = ip_getsockopt,
+	.addr2sockaddr	   = inet_csk_addr2sockaddr,
+	.sockaddr_len	   = sizeof(struct sockaddr_in),
+	.bind_conflict	   = inet_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ip_setsockopt,
+	.compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+
+/*
+ * Per-socket initialisation for DCCP over IPv4. A function-local static flag
+ * records whether an earlier call has already succeeded; its current value is
+ * passed to dccp_init_sock(), so the first socket initialised is the only one
+ * that is passed 0. On success the IPv4 af_ops are installed on the socket.
+ * Returns the result of dccp_init_sock().
+ */
+static int dccp_v4_init_sock(struct sock *sk)
+{
+	static __u8 ctl_sock_ready;
+	const int err = dccp_init_sock(sk, ctl_sock_ready);
+
+	if (err != 0)
+		return err;
+
+	if (unlikely(!ctl_sock_ready))
+		ctl_sock_ready = 1;
+	inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
+
+	return 0;
+}
+
+/*
+ * Timewait-sock description referenced from dccp_v4_prot.twsk_prot; only
+ * the object size is set.
+ */
+static struct timewait_sock_ops dccp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct inet_timewait_sock),
+};
+
+/*
+ * Protocol description for DCCP over IPv4. Registered with proto_register()
+ * in dccp_v4_init() and referenced from dccp_v4_protosw.prot.
+ */
+static struct proto dccp_v4_prot = {
+	.name			= "DCCP",
+	.owner			= THIS_MODULE,
+	.close			= dccp_close,
+	.connect		= dccp_v4_connect,
+	.disconnect		= dccp_disconnect,
+	.ioctl			= dccp_ioctl,
+	.init			= dccp_v4_init_sock,
+	.setsockopt		= dccp_setsockopt,
+	.getsockopt		= dccp_getsockopt,
+	.sendmsg		= dccp_sendmsg,
+	.recvmsg		= dccp_recvmsg,
+	.backlog_rcv		= dccp_v4_do_rcv,
+	.hash			= inet_hash,
+	.unhash			= inet_unhash,
+	.accept			= inet_csk_accept,
+	.get_port		= inet_csk_get_port,
+	.shutdown		= dccp_shutdown,
+	.destroy		= dccp_destroy_sock,
+	.orphan_count		= &dccp_orphan_count,
+	.max_header		= MAX_DCCP_HEADER,
+	.obj_size		= sizeof(struct dccp_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
+	.rsk_prot		= &dccp_request_sock_ops,
+	.twsk_prot		= &dccp_timewait_sock_ops,
+	.h.hashinfo		= &dccp_hashinfo,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_dccp_setsockopt,
+	.compat_getsockopt	= compat_dccp_getsockopt,
+#endif
+};
+
+/*
+ * IPv4 protocol hooks for DCCP: receive and ICMP error handlers.
+ * Registered for IPPROTO_DCCP with inet_add_protocol() in dccp_v4_init().
+ */
+static const struct net_protocol dccp_v4_protocol = {
+	.handler	= dccp_v4_rcv,
+	.err_handler	= dccp_v4_err,
+	.no_policy	= 1,
+	.netns_ok	= 1,
+};
+
+/*
+ * Socket-layer operations for PF_INET DCCP sockets; referenced from
+ * dccp_v4_protosw.ops. socketpair, mmap and sendpage use the sock_no_*
+ * stubs.
+ */
+static const struct proto_ops inet_dccp_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,
+	.bind		   = inet_bind,
+	.connect	   = inet_stream_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = inet_accept,
+	.getname	   = inet_getname,
+	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
+	.poll		   = dccp_poll,
+	.ioctl		   = inet_ioctl,
+	/* FIXME: work on inet_listen to rename it to sock_common_listen */
+	.listen		   = inet_dccp_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/*
+ * Ties socket type SOCK_DCCP / protocol IPPROTO_DCCP to dccp_v4_prot and
+ * inet_dccp_ops. Registered with inet_register_protosw() in dccp_v4_init().
+ */
+static struct inet_protosw dccp_v4_protosw = {
+	.type		= SOCK_DCCP,
+	.protocol	= IPPROTO_DCCP,
+	.prot		= &dccp_v4_prot,
+	.ops		= &inet_dccp_ops,
+	.no_check	= 0,
+	.flags		= INET_PROTOSW_ICSK,
+};
+
+/*
+ * Per-namespace setup: create the control socket stored in
+ * net->dccp.v4_ctl_sk. Fails with -ESOCKTNOSUPPORT if the DCCP bind hash
+ * table has not been set up; otherwise returns the result of
+ * inet_ctl_sock_create().
+ */
+static int __net_init dccp_v4_init_net(struct net *net)
+{
+	struct sock **ctl_sk = &net->dccp.v4_ctl_sk;
+
+	if (!dccp_hashinfo.bhash)
+		return -ESOCKTNOSUPPORT;
+
+	return inet_ctl_sock_create(ctl_sk, PF_INET,
+				    SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+/* Per-namespace teardown: destroy the control socket of @net. */
+static void __net_exit dccp_v4_exit_net(struct net *net)
+{
+	struct sock *ctl_sk = net->dccp.v4_ctl_sk;
+
+	inet_ctl_sock_destroy(ctl_sk);
+}
+
+/*
+ * Per-network-namespace init/exit hooks; registered with
+ * register_pernet_subsys() in dccp_v4_init().
+ */
+static struct pernet_operations dccp_v4_ops = {
+	.init	= dccp_v4_init_net,
+	.exit	= dccp_v4_exit_net,
+};
+
+/*
+ * Module initialisation: register the proto, the IPv4 protocol handler, the
+ * protosw entry and the per-namespace operations, in that order. If a step
+ * fails, the steps already completed are undone in reverse order and the
+ * error is returned. Returns 0 on success.
+ */
+static int __init dccp_v4_init(void)
+{
+	int err;
+
+	err = proto_register(&dccp_v4_prot, 1);
+	if (err != 0)
+		return err;
+
+	err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+	if (err != 0)
+		goto undo_proto;
+
+	inet_register_protosw(&dccp_v4_protosw);
+
+	err = register_pernet_subsys(&dccp_v4_ops);
+	if (err)
+		goto undo_protosw;
+
+	return 0;
+
+undo_protosw:
+	inet_unregister_protosw(&dccp_v4_protosw);
+	inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+undo_proto:
+	proto_unregister(&dccp_v4_prot);
+	return err;
+}
+
+/*
+ * Module exit: undo the registrations made by dccp_v4_init() in reverse
+ * order.
+ */
+static void __exit dccp_v4_exit(void)
+{
+	unregister_pernet_subsys(&dccp_v4_ops);
+	inet_unregister_protosw(&dccp_v4_protosw);
+	inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+	proto_unregister(&dccp_v4_prot);
+}
+
+module_init(dccp_v4_init);
+module_exit(dccp_v4_exit);
+
+/*
+ * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP
+ * (33) values directly. Also cover the case where the protocol is not
+ * specified, i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 00000000..b74f7611
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1211 @@
+/*
+ *	DCCP over IPv6
+ *	Linux INET6 implementation
+ *
+ *	Based on net/dccp6/ipv6.c
+ *
+ *	Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/xfrm.h>
+
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+#include <net/secure_seq.h>
+
+#include "dccp.h"
+#include "ipv6.h"
+#include "feat.h"
+
+/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */
+
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; /* v4-mapped */
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; /* native v6 */
+
+static void dccp_v6_hash(struct sock *sk)
+{
+	if (sk->sk_state == DCCP_CLOSED)
+		return;		/* closed sockets are not hashed */
+	if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
+		inet_hash(sk);	/* v4-mapped socket: IPv4 hash function */
+	} else {
+		local_bh_disable();
+		__inet6_hash(sk, NULL);
+		local_bh_enable();
+	}
+}
+
+/* Fold the IPv6 pseudo-header into the DCCP checksum kept in skb->csum. */
+static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
+		const struct in6_addr *saddr, const struct in6_addr *daddr)
+{
+	return csum_ipv6_magic(saddr, daddr, skb->len,
+			       IPPROTO_DCCP, skb->csum);
+}
+
+static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
+{
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+
+	dccp_csum_outgoing(skb);	/* must run before the fold below */
+	dccp_hdr(skb)->dccph_checksum =
+		dccp_v6_csum_finish(skb, &np->saddr, &np->daddr);
+}
+
+static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+	return secure_dccpv6_sequence_number(ip6h->daddr.s6_addr32,
+			ip6h->saddr.s6_addr32, dccp_hdr(skb)->dccph_dport,
+			dccp_hdr(skb)->dccph_sport);
+}
+
+static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			u8 type, u8 code, int offset, __be32 info)
+{	/* ICMPv6 error handler (.err_handler of dccp_v6_protocol) */
+	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
+	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+	struct dccp_sock *dp;
+	struct ipv6_pinfo *np;
+	struct sock *sk;
+	int err;
+	__u64 seq;
+	struct net *net = dev_net(skb->dev);
+
+	/* NOTE(review): only skb->len is checked; linearity is not - confirm */
+	if (skb->len < offset + sizeof(*dh) ||
+	    skb->len < offset + __dccp_basic_hdr_len(dh)) {
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
+		return;
+	}
+	/* the quoted packet is one we sent: its daddr/dport is the peer */
+	sk = inet6_lookup(net, &dccp_hashinfo,
+			&hdr->daddr, dh->dccph_dport,
+			&hdr->saddr, dh->dccph_sport, inet6_iif(skb));
+
+	if (sk == NULL) {
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
+		return;
+	}
+
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(sk));
+		return;
+	}
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))	/* only counted; handling continues */
+		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == DCCP_CLOSED)
+		goto out;
+
+	dp = dccp_sk(sk);
+	seq = dccp_hdr_seq(dh);
+	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+	    !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	np = inet6_sk(sk);
+
+	if (type == ICMPV6_PKT_TOOBIG) {
+		struct dst_entry *dst = NULL;
+
+		if (sock_owned_by_user(sk))
+			goto out;
+		if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
+			goto out;
+
+		/* icmp should have updated the destination cache entry */
+		dst = __sk_dst_check(sk, np->dst_cookie);
+		if (dst == NULL) {
+			struct inet_sock *inet = inet_sk(sk);
+			struct flowi6 fl6;
+
+			/* BUGGG_FUTURE: Again, it is not clear how
+			 * to handle the rthdr case. Ignore this complexity
+			 * for now.
+			 */
+			memset(&fl6, 0, sizeof(fl6));
+			fl6.flowi6_proto = IPPROTO_DCCP;
+			ipv6_addr_copy(&fl6.daddr, &np->daddr);
+			ipv6_addr_copy(&fl6.saddr, &np->saddr);
+			fl6.flowi6_oif = sk->sk_bound_dev_if;
+			fl6.fl6_dport = inet->inet_dport;
+			fl6.fl6_sport = inet->inet_sport;
+			security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+			dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
+			if (IS_ERR(dst)) {
+				sk->sk_err_soft = -PTR_ERR(dst);
+				goto out;
+			}
+		} else
+			dst_hold(dst);	/* balanced by dst_release() below */
+
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+			dccp_sync_mss(sk, dst_mtu(dst));
+		} /* else let the usual retransmit timer handle it */
+		dst_release(dst);
+		goto out;
+	}
+
+	icmpv6_err_convert(type, code, &err);
+
+	/* Might be for a request_sock */
+	switch (sk->sk_state) {
+		struct request_sock *req, **prev;
+	case DCCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+
+		req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
+					   &hdr->daddr, &hdr->saddr,
+					   inet6_iif(skb));
+		if (req == NULL)
+			goto out;
+
+		/*
+		 * ICMPs are not backlogged, hence we cannot get an established
+		 * socket here.
+		 */
+		WARN_ON(req->sk != NULL);
+
+		if (seq != dccp_rsk(req)->dreq_iss) {
+			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case DCCP_REQUESTING:
+	case DCCP_RESPOND:  /* Cannot happen.
+			       It can, if SYNs are crossed. --ANK */
+		if (!sock_owned_by_user(sk)) {
+			DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+			sk->sk_err = err;
+			/*
+			 * Wake people up to see the error
+			 * (see connect in sock.c)
+			 */
+			sk->sk_error_report(sk);
+			dccp_done(sk);
+		} else
+			sk->sk_err_soft = err;
+		goto out;
+	}
+
+	if (!sock_owned_by_user(sk) && np->recverr) {
+		sk->sk_err = err;	/* hard error, reported right away */
+		sk->sk_error_report(sk);
+	} else
+		sk->sk_err_soft = err;
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);	/* presumably drops the ref taken by inet6_lookup() */
+}
+
+
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
+				 struct request_values *rv_unused)
+{	/* build and transmit the response for @req (.rtx_syn_ack) */
+	struct inet6_request_sock *ireq6 = inet6_rsk(req);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sk_buff *skb;
+	struct ipv6_txoptions *opt = NULL;
+	struct in6_addr *final_p, final;
+	struct flowi6 fl6;
+	int err = -1;	/* returned when no response skb could be built */
+	struct dst_entry *dst;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_DCCP;
+	ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+	ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
+	fl6.flowlabel = 0;	/* already zero after the memset() above */
+	fl6.flowi6_oif = ireq6->iif;
+	fl6.fl6_dport = inet_rsk(req)->rmt_port;
+	fl6.fl6_sport = inet_rsk(req)->loc_port;
+	security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+	opt = np->opt;
+
+	final_p = fl6_update_dst(&fl6, opt, &final);
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		dst = NULL;	/* so that dst_release() below gets NULL */
+		goto done;
+	}
+
+	skb = dccp_make_response(sk, dst, req);
+	if (skb != NULL) {
+		struct dccp_hdr *dh = dccp_hdr(skb);
+
+		dh->dccph_checksum = dccp_v6_csum_finish(skb,
+							 &ireq6->loc_addr,
+							 &ireq6->rmt_addr);
+		/* presumably fl6_update_dst() may have rewritten fl6.daddr */
+		ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+		err = ip6_xmit(sk, skb, &fl6, opt);
+		err = net_xmit_eval(err);
+	}
+
+done:	/* opt is always np->opt in this function, so nothing is freed */
+	if (opt != NULL && opt != np->opt)
+		sock_kfree_s(sk, opt, opt->tot_len);
+	dst_release(dst);
+	return err;
+}
+
+static void dccp_v6_reqsk_destructor(struct request_sock *req)
+{
+	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+	/* kfree_skb() ignores NULL, so pktopts needs no explicit check */
+	kfree_skb(inet6_rsk(req)->pktopts);
+}
+
+static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
+{	/* @sk is not used: the per-net control socket sends the Reset */
+	const struct ipv6hdr *rxip6h;
+	struct sk_buff *skb;
+	struct flowi6 fl6;
+	struct net *net = dev_net(skb_dst(rxskb)->dev);
+	struct sock *ctl_sk = net->dccp.v6_ctl_sk;
+	struct dst_entry *dst;
+
+	if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
+		return;		/* never answer a Reset with a Reset */
+
+	if (!ipv6_unicast_destination(rxskb))
+		return;
+
+	skb = dccp_ctl_make_reset(ctl_sk, rxskb);
+	if (skb == NULL)
+		return;
+
+	rxip6h = ipv6_hdr(rxskb);
+	dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
+							    &rxip6h->daddr);
+
+	memset(&fl6, 0, sizeof(fl6));
+	ipv6_addr_copy(&fl6.daddr, &rxip6h->saddr);	/* reply to sender */
+	ipv6_addr_copy(&fl6.saddr, &rxip6h->daddr);
+
+	fl6.flowi6_proto = IPPROTO_DCCP;
+	fl6.flowi6_oif = inet6_iif(rxskb);
+	fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
+	fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
+	security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
+
+	/* sk = NULL, but it is safe for now. RST socket required. */
+	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
+	if (!IS_ERR(dst)) {
+		skb_dst_set(skb, dst);
+		ip6_xmit(ctl_sk, skb, &fl6, NULL);
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+		return;
+	}
+
+	kfree_skb(skb);		/* route lookup failed: drop the unsent Reset */
+}
+
+static struct request_sock_ops dccp6_request_sock_ops = {
+	.family		= AF_INET6,
+	.obj_size	= sizeof(struct dccp6_request_sock),
+	.rtx_syn_ack	= dccp_v6_send_response,
+	.send_ack	= dccp_reqsk_send_ack,
+	.destructor	= dccp_v6_reqsk_destructor, /* purges featneg, pktopts */
+	.send_reset	= dccp_v6_ctl_send_reset,
+};	/* installed as dccp_v6_prot.rsk_prot */
+
+/* LISTEN-state demux: find a pending request or established child for @skb. */
+static struct sock *dccp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct request_sock **prev;
+	struct request_sock *req;
+	struct sock *child;
+
+	/* Find possible connection requests. */
+	req = inet6_csk_search_req(sk, &prev, dh->dccph_sport,
+				   &ip6h->saddr, &ip6h->daddr, inet6_iif(skb));
+	if (req != NULL)
+		return dccp_check_req(sk, skb, req, prev);
+
+	child = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
+					   &ip6h->saddr, dh->dccph_sport,
+					   &ip6h->daddr, ntohs(dh->dccph_dport),
+					   inet6_iif(skb));
+	if (child == NULL)
+		return sk;	/* no better match: the listener handles it */
+
+	if (child->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(child));
+		return NULL;
+	}
+
+	bh_lock_sock(child);	/* the caller receives the child locked */
+	return child;
+}
+
+static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{	/* handle a connection request on listener @sk; 0 = ok, -1 = drop */
+	struct request_sock *req;
+	struct dccp_request_sock *dreq;
+	struct inet6_request_sock *ireq6;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+
+	if (skb->protocol == htons(ETH_P_IP))	/* IPv4 packet on v6 socket */
+		return dccp_v4_conn_request(sk, skb);
+
+	if (!ipv6_unicast_destination(skb))
+		return 0;	/* discard, don't send a reset here */
+
+	if (dccp_bad_service_code(sk, service)) {
+		dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+		goto drop;
+	}
+	/*
+	 * There are no SYN attacks on IPv6, yet...
+	 */
+	dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+	if (inet_csk_reqsk_queue_is_full(sk))
+		goto drop;
+
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+		goto drop;
+
+	req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
+	if (req == NULL)
+		goto drop;
+
+	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+		goto drop_and_free;
+
+	dreq = dccp_rsk(req);
+	if (dccp_parse_options(sk, dreq, skb))
+		goto drop_and_free;
+
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
+	ireq6 = inet6_rsk(req);
+	ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
+	ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
+
+	if (ipv6_opt_accepted(sk, skb) ||
+	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+	    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+		atomic_inc(&skb->users);	/* req keeps its own skb ref */
+		ireq6->pktopts = skb;
+	}
+	ireq6->iif = sk->sk_bound_dev_if;
+
+	/* So that link locals have meaning */
+	if (!sk->sk_bound_dev_if &&
+	    ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq6->iif = inet6_iif(skb);
+
+	/*
+	 * Step 3: Process LISTEN state
+	 *
+	 *   Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+	 *
+	 *   In fact we defer setting S.GSR, S.SWL, S.SWH to
+	 *   dccp_create_openreq_child.
+	 */
+	dreq->dreq_isr	   = dcb->dccpd_seq;
+	dreq->dreq_iss	   = dccp_v6_init_sequence(skb);
+	dreq->dreq_service = service;
+
+	if (dccp_v6_send_response(sk, req, NULL))
+		goto drop_and_free;
+
+	inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+	return 0;
+
+drop_and_free:
+	reqsk_free(req);
+drop:
+	DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+	return -1;
+}
+
+/*
+ * Create the child socket for connection request @req of listener @sk;
+ * IPv4 packets on a v6 socket are delegated to the IPv4 code.
+ * Returns the new socket, or NULL on failure.
+ */
+static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+					      struct sk_buff *skb,
+					      struct request_sock *req,
+					      struct dst_entry *dst)
+{
+	struct inet6_request_sock *ireq6 = inet6_rsk(req);
+	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+	struct inet_sock *newinet;
+	struct dccp6_sock *newdp6;
+	struct sock *newsk;
+	struct ipv6_txoptions *opt;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		/* v6 mapped: the IPv4 code builds the child socket */
+		newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+		if (newsk == NULL)
+			return NULL;
+
+		newdp6 = (struct dccp6_sock *)newsk;
+		newinet = inet_sk(newsk);
+		newinet->pinet6 = &newdp6->inet6;
+		newnp = inet6_sk(newsk);
+
+		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
+		ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
+		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+
+		inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
+		newsk->sk_backlog_rcv = dccp_v4_do_rcv;
+		/* memcpy() copied the listener's list pointers: don't share */
+		newnp->ipv6_mc_list = NULL;
+		newnp->ipv6_ac_list = NULL;
+		newnp->ipv6_fl_list = NULL;
+		newnp->pktoptions  = NULL;
+		newnp->opt	   = NULL;
+		newnp->mcast_oif   = inet6_iif(skb);
+		newnp->mcast_hops  = ipv6_hdr(skb)->hop_limit;
+
+		/*
+		 * No need to charge this sock to the IPv6 refcnt debug socks
+		 * count here, dccp_create_openreq_child does it for us. -acme
+		 *
+		 * Until now the IPv4 code worked with the IPv6
+		 * icsk.icsk_af_ops; sync the MSS now that they are switched.
+		 */
+		dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
+
+		return newsk;
+	}
+
+	opt = np->opt;
+
+	if (sk_acceptq_is_full(sk))
+		goto out_overflow;
+
+	if (dst == NULL) {
+		struct in6_addr *final_p, final;
+		struct flowi6 fl6;
+
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_DCCP;
+		ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+		final_p = fl6_update_dst(&fl6, opt, &final);
+		ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
+		fl6.flowi6_oif = sk->sk_bound_dev_if;
+		fl6.fl6_dport = inet_rsk(req)->rmt_port;
+		fl6.fl6_sport = inet_rsk(req)->loc_port;
+		security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+		if (IS_ERR(dst))
+			goto out;
+	}
+
+	newsk = dccp_create_openreq_child(sk, req, skb);
+	if (newsk == NULL)
+		goto out_nonewsk;
+
+	/* IPv6 refcnt debug accounting: done by dccp_create_openreq_child */
+
+	__ip6_dst_store(newsk, dst, NULL, NULL);
+	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
+						      NETIF_F_TSO);
+	newdp6 = (struct dccp6_sock *)newsk;
+	newinet = inet_sk(newsk);
+	newinet->pinet6 = &newdp6->inet6;
+	newnp = inet6_sk(newsk);
+
+	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+	ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
+	ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
+	ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
+	newsk->sk_bound_dev_if = ireq6->iif;
+
+	/* Now IPv6 options... First: no IPv4 options. */
+	newinet->inet_opt = NULL;
+
+	/* Clone RX bits */
+	newnp->rxopt.all = np->rxopt.all;
+
+	/* memcpy() copied the listener's list pointers: don't share them */
+	newnp->ipv6_mc_list = NULL;
+	newnp->ipv6_ac_list = NULL;
+	newnp->ipv6_fl_list = NULL;
+
+	/* Clone pktoptions received with SYN */
+	newnp->pktoptions = NULL;
+	if (ireq6->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
+		kfree_skb(ireq6->pktopts);
+		ireq6->pktopts = NULL;
+		if (newnp->pktoptions)
+			skb_set_owner_r(newnp->pktoptions, newsk);
+	}
+	newnp->opt	  = NULL;
+	newnp->mcast_oif  = inet6_iif(skb);
+	newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+
+	/*
+	 * Clone native IPv6 options from listening socket (if any)
+	 *
+	 * Yes, keeping reference count would be much more clever, but we make
+	 * one more thing there: reattach optmem to newsk.
+	 */
+	if (opt != NULL) {
+		newnp->opt = ipv6_dup_options(newsk, opt);
+		if (opt != np->opt)
+			sock_kfree_s(sk, opt, opt->tot_len);
+	}
+
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
+	if (newnp->opt != NULL)
+		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+						     newnp->opt->opt_flen);
+
+	dccp_sync_mss(newsk, dst_mtu(dst));
+
+	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto out;
+	}
+	__inet6_hash(newsk, NULL);
+
+	return newsk;
+
+out_overflow:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+	dst_release(dst);
+out:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	if (opt != NULL && opt != np->opt)
+		sock_kfree_s(sk, opt, opt->tot_len);
+	return NULL;
+}
+
+/* The socket must have its spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sk_buff *opt_skb = NULL;
+
+	/* Imagine: socket is IPv6. IPv4 packet arrives,
+	   goes to IPv4 receive handler and backlogged.
+	   From backlog it always goes here. Kerboom...
+	   Fortunately, dccp_rcv_established and rcv_established
+	   handle them correctly, but it is not case with
+	   dccp_v6_hnd_req and dccp_v6_ctl_send_reset().   --ANK
+	 */
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return dccp_v4_do_rcv(sk, skb);
+
+	if (sk_filter(sk, skb))
+		goto discard;
+
+	/*
+	 * socket locking is here for SMP purposes as backlog rcv is currently
+	 * called with bh processing disabled.
+	 */
+
+	/* Do Stevens' IPV6_PKTOPTIONS.
+
+	   Yes, guys, it is the only place in our code, where we
+	   may make it not affecting IPv4.
+	   The rest of code is protocol independent,
+	   and I do not like idea to uglify IPv4.
+
+	   Actually, all the idea behind IPV6_PKTOPTIONS
+	   looks not very well thought. For now we latch
+	   options, received in the last packet, enqueued
+	   by tcp. Feel free to propose better solution.
+					       --ANK (980728)
+	 */
+	if (np->rxopt.all)
+	/*
+	 * FIXME: Add handling of IPV6_PKTOPTIONS skb. See the comments below
+	 *        (wrt ipv6_pktoptions) and net/ipv6/tcp_ipv6.c for an example.
+	 */
+		opt_skb = skb_clone(skb, GFP_ATOMIC);
+
+	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+		if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
+			goto reset;
+		if (opt_skb) {
+			/* XXX This is where we would goto ipv6_pktoptions. */
+			__kfree_skb(opt_skb);
+		}
+		return 0;
+	}
+
+	/*
+	 *  Step 3: Process LISTEN state
+	 *     If S.state == LISTEN,
+	 *	 If P.type == Request or P contains a valid Init Cookie option,
+	 *	      (* Must scan the packet's options to check for Init
+	 *		 Cookies.  Only Init Cookies are processed here,
+	 *		 however; other options are processed in Step 8.  This
+	 *		 scan need only be performed if the endpoint uses Init
+	 *		 Cookies *)
+	 *	      (* Generate a new socket and switch to that socket *)
+	 *	      Set S := new socket for this port pair
+	 *	      S.state = RESPOND
+	 *	      Choose S.ISS (initial seqno) or set from Init Cookies
+	 *	      Initialize S.GAR := S.ISS
+	 *	      Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+	 *	      Continue with S.state == RESPOND
+	 *	      (* A Response packet will be generated in Step 11 *)
+	 *	 Otherwise,
+	 *	      Generate Reset(No Connection) unless P.type == Reset
+	 *	      Drop packet and return
+	 *
+	 * NOTE: the check for the packet types is done in
+	 *	 dccp_rcv_state_process
+	 */
+	if (sk->sk_state == DCCP_LISTEN) {
+		struct sock *nsk = dccp_v6_hnd_req(sk, skb);
+
+		if (nsk == NULL)
+			goto discard;
+		/*
+		 * Queue it on the new socket if the new socket is active,
+		 * otherwise we just shortcircuit this and continue with
+		 * the new socket..
+		 */
+		if (nsk != sk) {
+			if (dccp_child_process(sk, nsk, skb))
+				goto reset;
+			if (opt_skb != NULL)
+				__kfree_skb(opt_skb);
+			return 0;
+		}
+	}
+
+	if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
+		goto reset;
+	if (opt_skb) {
+		/* XXX This is where we would goto ipv6_pktoptions. */
+		__kfree_skb(opt_skb);
+	}
+	return 0;
+
+reset:
+	dccp_v6_ctl_send_reset(sk, skb);
+discard:
+	if (opt_skb != NULL)
+		__kfree_skb(opt_skb);
+	kfree_skb(skb);
+	return 0;	/* every path of the IPv6 branch returns 0 */
+}
+
+static int dccp_v6_rcv(struct sk_buff *skb)
+{	/* receive entry point for DCCP/IPv6 (.handler of dccp_v6_protocol) */
+	const struct dccp_hdr *dh;
+	struct sock *sk;
+	int min_cov;
+
+	/* Step 1: Check header basics */
+
+	if (dccp_invalid_packet(skb))
+		goto discard_it;
+
+	/* Step 1: If header checksum is incorrect, drop packet and return. */
+	if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr,
+				     &ipv6_hdr(skb)->daddr)) {
+		DCCP_WARN("dropped packet with invalid checksum\n");
+		goto discard_it;
+	}
+
+	dh = dccp_hdr(skb);
+
+	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(dh);
+	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+	if (dccp_packet_without_ack(skb))
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+	else
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+
+	/* Step 2:
+	 *	Look up flow ID in table and get corresponding socket */
+	sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+			        dh->dccph_sport, dh->dccph_dport);
+	/*
+	 * Step 2:
+	 *	If no socket ...
+	 */
+	if (sk == NULL) {
+		dccp_pr_debug("failed to look up flow ID in table and "
+			      "get corresponding socket\n");
+		goto no_dccp_socket;
+	}
+
+	/*
+	 * Step 2:
+	 *	... or S.state == TIMEWAIT,
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
+		inet_twsk_put(inet_twsk(sk));
+		goto no_dccp_socket;
+	}
+
+	/*
+	 * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
+	 *	o if MinCsCov = 0, only packets with CsCov = 0 are accepted
+	 *	o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
+	 */
+	min_cov = dccp_sk(sk)->dccps_pcrlen;
+	if (dh->dccph_cscov  &&  (min_cov == 0 || dh->dccph_cscov < min_cov))  {
+		dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
+			      dh->dccph_cscov, min_cov);
+		/* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
+		goto discard_and_relse;
+	}
+
+	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_and_relse;
+
+	return sk_receive_skb(sk, skb, 1) ? -1 : 0;
+
+no_dccp_socket:
+	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+	/*
+	 * Step 2:
+	 *	If no socket ...
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (dh->dccph_type != DCCP_PKT_RESET) {
+		DCCP_SKB_CB(skb)->dccpd_reset_code =
+					DCCP_RESET_CODE_NO_CONNECTION;
+		dccp_v6_ctl_send_reset(sk, skb); /* sk is not used there */
+	}
+
+discard_it:
+	kfree_skb(skb);
+	return 0;
+
+discard_and_relse:
+	sock_put(sk);	/* presumably the ref from __inet6_lookup_skb() */
+	goto discard_it;
+}
+
+static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+			   int addr_len)
+{	/* .connect of dccp_v6_prot; returns 0 or a negative errno */
+	struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct in6_addr *saddr = NULL, *final_p, final;
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	int addr_type;
+	int err;
+
+	dp->dccps_role = DCCP_ROLE_CLIENT; /* set even if a check below fails */
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	if (usin->sin6_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	memset(&fl6, 0, sizeof(fl6));
+
+	if (np->sndflow) {
+		fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+		IP6_ECN_flow_init(fl6.flowlabel);
+		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
+			struct ip6_flowlabel *flowlabel;
+			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+			if (flowlabel == NULL)
+				return -EINVAL;
+			ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
+			fl6_sock_release(flowlabel);
+		}
+	}
+	/*
+	 * connect() to INADDR_ANY means loopback (BSD'ism).
+	 */
+	if (ipv6_addr_any(&usin->sin6_addr))
+		usin->sin6_addr.s6_addr[15] = 1;	/* :: becomes ::1 */
+
+	addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+	if (addr_type & IPV6_ADDR_MULTICAST)
+		return -ENETUNREACH;
+
+	if (addr_type & IPV6_ADDR_LINKLOCAL) {
+		if (addr_len >= sizeof(struct sockaddr_in6) &&
+		    usin->sin6_scope_id) {
+			/* If interface is set while binding, indices
+			 * must coincide.
+			 */
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != usin->sin6_scope_id)
+				return -EINVAL;
+
+			sk->sk_bound_dev_if = usin->sin6_scope_id;
+		}
+
+		/* Connect to link-local address requires an interface */
+		if (!sk->sk_bound_dev_if)
+			return -EINVAL;
+	}
+
+	ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
+	np->flow_label = fl6.flowlabel;
+
+	/*
+	 * DCCP over IPv4
+	 */
+	if (addr_type == IPV6_ADDR_MAPPED) {
+		u32 exthdrlen = icsk->icsk_ext_hdr_len;
+		struct sockaddr_in sin;
+
+		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+		if (__ipv6_only_sock(sk))
+			return -ENETUNREACH;
+
+		sin.sin_family = AF_INET;
+		sin.sin_port = usin->sin6_port;
+		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+		icsk->icsk_af_ops = &dccp_ipv6_mapped;
+		sk->sk_backlog_rcv = dccp_v4_do_rcv;
+
+		err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+		if (err) {	/* switch the socket back to native IPv6 ops */
+			icsk->icsk_ext_hdr_len = exthdrlen;
+			icsk->icsk_af_ops = &dccp_ipv6_af_ops;
+			sk->sk_backlog_rcv = dccp_v6_do_rcv;
+			goto failure;
+		}
+		ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+		ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &np->rcv_saddr);
+
+		return err;
+	}
+
+	if (!ipv6_addr_any(&np->rcv_saddr))
+		saddr = &np->rcv_saddr;	/* socket is bound to an address */
+
+	fl6.flowi6_proto = IPPROTO_DCCP;
+	ipv6_addr_copy(&fl6.daddr, &np->daddr);
+	ipv6_addr_copy(&fl6.saddr, saddr ? saddr : &np->saddr);
+	fl6.flowi6_oif = sk->sk_bound_dev_if;
+	fl6.fl6_dport = usin->sin6_port;
+	fl6.fl6_sport = inet->inet_sport;
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto failure;
+	}
+
+	if (saddr == NULL) {	/* unbound: take the address left in fl6 */
+		saddr = &fl6.saddr;
+		ipv6_addr_copy(&np->rcv_saddr, saddr);
+	}
+
+	/* set the source address */
+	ipv6_addr_copy(&np->saddr, saddr);
+	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+	__ip6_dst_store(sk, dst, NULL, NULL);
+
+	icsk->icsk_ext_hdr_len = 0;
+	if (np->opt != NULL)
+		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+					  np->opt->opt_nflen);
+
+	inet->inet_dport = usin->sin6_port;
+
+	dccp_set_state(sk, DCCP_REQUESTING);
+	err = inet6_hash_connect(&dccp_death_row, sk);
+	if (err)
+		goto late_failure;
+
+	dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
+						      np->daddr.s6_addr32,
+						      inet->inet_sport,
+						      inet->inet_dport);
+	err = dccp_connect(sk);
+	if (err)
+		goto late_failure;
+
+	return 0;
+
+late_failure:	/* state and dst were already set: undo both */
+	dccp_set_state(sk, DCCP_CLOSED);
+	__sk_dst_reset(sk);
+failure:
+	inet->inet_dport = 0;
+	sk->sk_route_caps = 0;
+	return err;
+}
+
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+	.queue_xmit	   = inet6_csk_xmit,
+	.send_check	   = dccp_v6_send_check,
+	.rebuild_header	   = inet6_sk_rebuild_header,
+	.conn_request	   = dccp_v6_conn_request,
+	.syn_recv_sock	   = dccp_v6_request_recv_sock,
+	.net_header_len	   = sizeof(struct ipv6hdr),
+	.setsockopt	   = ipv6_setsockopt,
+	.getsockopt	   = ipv6_getsockopt,
+	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
+	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+	.bind_conflict	   = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ipv6_setsockopt,
+	.compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};	/* native IPv6 ops, installed by dccp_v6_init_sock() */
+
+/*
+ *	DCCP over IPv4 via INET6 API (used for v4-mapped peers)
+ */
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+	.queue_xmit	   = ip_queue_xmit,
+	.send_check	   = dccp_v4_send_check,
+	.rebuild_header	   = inet_sk_rebuild_header,
+	.conn_request	   = dccp_v6_conn_request,
+	.syn_recv_sock	   = dccp_v6_request_recv_sock,
+	.net_header_len	   = sizeof(struct iphdr),
+	.setsockopt	   = ipv6_setsockopt,
+	.getsockopt	   = ipv6_getsockopt,
+	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
+	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ipv6_setsockopt,
+	.compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};	/* no .bind_conflict here, unlike dccp_ipv6_af_ops */
+
+/* sk_alloc() has already zeroed most fields, so only the DCCP-specific
+ * state and the address-family ops need to be set up here.
+ */
+static int dccp_v6_init_sock(struct sock *sk)
+{
+	static __u8 dccp_v6_ctl_sock_initialized;
+	const int rc = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized);
+
+	if (rc != 0)
+		return rc;
+
+	if (unlikely(!dccp_v6_ctl_sock_initialized))
+		dccp_v6_ctl_sock_initialized = 1; /* latched on 1st success */
+	inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+	return 0;
+}
+
+static void dccp_v6_destroy_sock(struct sock *sk)
+{	/* .destroy of dccp_v6_prot: DCCP teardown first, then the inet6 part */
+	dccp_destroy_sock(sk);
+	inet6_destroy_sock(sk);
+}
+
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct dccp6_timewait_sock),
+};	/* installed as dccp_v6_prot.twsk_prot */
+
+static struct proto dccp_v6_prot = {	/* registered by dccp_v6_init() */
+	.name		   = "DCCPv6",
+	.owner		   = THIS_MODULE,
+	.close		   = dccp_close,
+	.connect	   = dccp_v6_connect,
+	.disconnect	   = dccp_disconnect,
+	.ioctl		   = dccp_ioctl,
+	.init		   = dccp_v6_init_sock,
+	.setsockopt	   = dccp_setsockopt,
+	.getsockopt	   = dccp_getsockopt,
+	.sendmsg	   = dccp_sendmsg,
+	.recvmsg	   = dccp_recvmsg,
+	.backlog_rcv	   = dccp_v6_do_rcv,
+	.hash		   = dccp_v6_hash,	/* handles v4-mapped sockets */
+	.unhash		   = inet_unhash,
+	.accept		   = inet_csk_accept,
+	.get_port	   = inet_csk_get_port,
+	.shutdown	   = dccp_shutdown,
+	.destroy	   = dccp_v6_destroy_sock,
+	.orphan_count	   = &dccp_orphan_count,
+	.max_header	   = MAX_DCCP_HEADER,
+	.obj_size	   = sizeof(struct dccp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
+	.rsk_prot	   = &dccp6_request_sock_ops,
+	.twsk_prot	   = &dccp6_timewait_sock_ops,
+	.h.hashinfo	   = &dccp_hashinfo,	/* the table dccp_v6_rcv() uses */
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_dccp_setsockopt,
+	.compat_getsockopt = compat_dccp_getsockopt,
+#endif
+};
+
+static const struct inet6_protocol dccp_v6_protocol = {
+	.handler	= dccp_v6_rcv,	/* incoming DCCP packets */
+	.err_handler	= dccp_v6_err,	/* ICMPv6 errors */
+	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};	/* added for IPPROTO_DCCP by dccp_v6_init() */
+
+static const struct proto_ops inet6_dccp_ops = { /* .ops of dccp_v6_protosw */
+	.family		   = PF_INET6,
+	.owner		   = THIS_MODULE,
+	.release	   = inet6_release,
+	.bind		   = inet6_bind,
+	.connect	   = inet_stream_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = inet_accept,
+	.getname	   = inet6_getname,
+	.poll		   = dccp_poll,
+	.ioctl		   = inet6_ioctl,
+	.listen		   = inet_dccp_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw dccp_v6_protosw = {
+	.type		= SOCK_DCCP,
+	.protocol	= IPPROTO_DCCP,
+	.prot		= &dccp_v6_prot,	/* struct proto side */
+	.ops		= &inet6_dccp_ops,	/* struct proto_ops side */
+	.flags		= INET_PROTOSW_ICSK,
+};	/* registered by dccp_v6_init() */
+
+/* Per-netns init: create the IPv6 DCCP control socket net->dccp.v6_ctl_sk. */
+static int __net_init dccp_v6_init_net(struct net *net)
+{
+	if (!dccp_hashinfo.bhash)
+		return -ESOCKTNOSUPPORT;
+	return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6,
+				    SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v6_exit_net(struct net *net)
+{	/* per-netns exit: release the socket made by dccp_v6_init_net() */
+	inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
+}
+
+static struct pernet_operations dccp_v6_ops = {
+	.init   = dccp_v6_init_net,	/* creates net->dccp.v6_ctl_sk */
+	.exit   = dccp_v6_exit_net,	/* destroys net->dccp.v6_ctl_sk */
+};
+
+/* Module init: register the proto, the inet6 protocol handler, the protosw
+ * entry and the per-netns operations, unwinding on failure. */
+static int __init dccp_v6_init(void)
+{
+	int err = proto_register(&dccp_v6_prot, 1);
+
+	if (err)
+		return err;
+
+	err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+	if (err)
+		goto unregister_proto;
+
+	inet6_register_protosw(&dccp_v6_protosw);
+
+	err = register_pernet_subsys(&dccp_v6_ops);
+	if (!err)
+		return 0;
+
+	/* pernet registration failed: unwind the earlier registrations */
+	inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+	inet6_unregister_protosw(&dccp_v6_protosw);
+unregister_proto:
+	proto_unregister(&dccp_v6_prot);
+	return err;
+}
+
+static void __exit dccp_v6_exit(void)
+{	/* module exit: undo the registrations made by dccp_v6_init() */
+	unregister_pernet_subsys(&dccp_v6_ops);
+	inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+	inet6_unregister_protosw(&dccp_v6_protosw);
+	proto_unregister(&dccp_v6_prot);
+}
+
+module_init(dccp_v6_init);
+module_exit(dccp_v6_exit);
+
+/*
+ * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP
+ * (33) values directly. Also cover the case where the protocol is not
+ * specified, i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 00000000..6eef81fd
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,36 @@
+#ifndef _DCCP_IPV6_H
+#define _DCCP_IPV6_H
+/*
+ *  net/dccp/ipv6.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/ipv6.h>
+
+struct dccp6_sock {
+	struct dccp_sock  dccp;		/* embedded DCCP socket, first member */
+	/*
+	 * ipv6_pinfo has to be the last member of dccp6_sock,
+	 * see inet6_sk_generic.
+	 */
+	struct ipv6_pinfo inet6;
+};
+
+struct dccp6_request_sock {
+	struct dccp_request_sock  dccp;		/* DCCP request state, first member */
+	struct inet6_request_sock inet6;	/* IPv6 request state follows it */
+};
+
+struct dccp6_timewait_sock {
+	struct inet_timewait_sock   inet;	/* generic timewait state, first member */
+	struct inet6_timewait_sock  tw6;	/* IPv6 timewait state follows it */
+};
+
+#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
new file mode 100644
index 00000000..d7041a09
--- /dev/null
+++ b/net/dccp/minisocks.c
@@ -0,0 +1,278 @@
+/*
+ *  net/dccp/minisocks.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/inet_timewait_sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+struct inet_timewait_death_row dccp_death_row = {
+	.sysctl_max_tw_buckets = NR_FILE * 2,	/* cap checked by dccp_time_wait() */
+	.period		= DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+	.death_lock	= __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
+	.hashinfo	= &dccp_hashinfo,
+	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
+					    (unsigned long)&dccp_death_row),
+	.twkill_work	= __WORK_INITIALIZER(dccp_death_row.twkill_work,
+					     inet_twdr_twkill_work),
+/* Short-time timewait calendar: the twcal_* members initialised below */
+
+	.twcal_hand	= -1,
+	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+					    (unsigned long)&dccp_death_row),
+};
+
+EXPORT_SYMBOL_GPL(dccp_death_row);
+
+void dccp_time_wait(struct sock *sk, int state, int timeo)
+{
+	struct inet_timewait_sock *tw = NULL;
+
+	/* Only ask for a timewait bucket while below the configured limit. */
+	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
+		tw = inet_twsk_alloc(sk, state);
+
+	if (tw == NULL) {
+		/*
+		 * Limit reached or allocation failed: there is no timewait
+		 * phase for this socket, it is simply closed below.
+		 */
+		DCCP_WARN("time wait bucket table overflow\n");
+	} else {
+		const struct inet_connection_sock *csk = inet_csk(sk);
+		/* lower bound for the timeout: 4 * RTO - RTO / 2 */
+		const int rto = (csk->icsk_rto << 2) - (csk->icsk_rto >> 1);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (tw->tw_family == PF_INET6) {
+			const struct ipv6_pinfo *np = inet6_sk(sk);
+			struct inet6_timewait_sock *tw6;
+
+			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+			tw6 = inet6_twsk((struct sock *)tw);
+			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6only = np->ipv6only;
+		}
+#endif
+		/* Linkage updates. */
+		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+
+		/* Get the TIME_WAIT timeout firing. */
+		tw->tw_timeout = DCCP_TIMEWAIT_LEN;
+		if (state == DCCP_TIME_WAIT)
+			timeo = DCCP_TIMEWAIT_LEN;
+		else if (timeo < rto)
+			timeo = rto;
+		inet_twsk_schedule(tw, &dccp_death_row, timeo,
+				   DCCP_TIMEWAIT_LEN);
+		inet_twsk_put(tw);
+	}
+
+	dccp_done(sk);
+}
+
+struct sock *dccp_create_openreq_child(struct sock *sk,
+				       const struct request_sock *req,
+				       const struct sk_buff *skb)
+{
+	struct dccp_request_sock *dreq;
+	struct inet_connection_sock *newicsk;
+	struct dccp_sock *newdp;
+	struct sock *newsk;
+
+	/*
+	 * Step 3: Process LISTEN state
+	 *   (* Generate a new socket and switch to that socket *)
+	 *   Set S := new socket for this port pair
+	 */
+	newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+	if (newsk == NULL)
+		return NULL;
+
+	dreq	= dccp_rsk(req);
+	newicsk	= inet_csk(newsk);
+	newdp	= dccp_sk(newsk);
+	newdp->dccps_role	    = DCCP_ROLE_SERVER;
+	newdp->dccps_hc_rx_ackvec   = NULL;
+	newdp->dccps_service_list   = NULL;
+	newdp->dccps_service	    = dreq->dreq_service;
+	newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
+	newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
+	newicsk->icsk_rto	    = DCCP_TIMEOUT_INIT;
+	INIT_LIST_HEAD(&newdp->dccps_featneg);
+
+	/*
+	 * Step 3: Process LISTEN state
+	 *
+	 *    Choose S.ISS (initial seqno) or set from Init Cookies
+	 *    Initialize S.GAR := S.ISS
+	 *    Set S.ISR, S.GSR from packet (or Init Cookies)
+	 *
+	 * AWL/AWH and SWL/SWH are set during the feature activation below,
+	 * since they depend on the Sequence Window feature values (7.5.2).
+	 */
+	newdp->dccps_iss = dreq->dreq_iss;
+	newdp->dccps_gss = newdp->dccps_iss;
+	newdp->dccps_gar = newdp->dccps_iss;
+	newdp->dccps_isr = dreq->dreq_isr;
+	newdp->dccps_gsr = newdp->dccps_isr;
+
+	/* Activate features: initialise CCIDs, sequence windows etc. */
+	if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
+		/* Still a raw copy of the parent: no destructor, plain free */
+		newsk->sk_destruct = NULL;
+		sk_free(newsk);
+		return NULL;
+	}
+	dccp_init_xmit_timers(newsk);
+	DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
+
+/*
+ * Process an incoming packet for a RESPOND-state connection that is still
+ * represented by a request_sock; returns the new child socket, or NULL.
+ */
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+			    struct request_sock *req,
+			    struct request_sock **prev)
+{
+	struct sock *child = NULL;
+	struct dccp_request_sock *dreq = dccp_rsk(req);
+
+	/* Check for retransmitted REQUEST */
+	if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
+
+		if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_isr)) {
+			dccp_pr_debug("Retransmitted REQUEST\n");
+			dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+			/*
+			 * Send another RESPONSE packet
+			 * To protect against Request floods, increment retrans
+			 * counter (backoff, monitored by dccp_response_timer).
+			 */
+			req->retrans++;
+			req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+		}
+		/* Network Duplicate, discard packet */
+		return NULL;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+
+	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
+	    dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
+		goto drop;
+
+	/* Invalid ACK: it has to acknowledge the ISS stored in the request */
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dreq->dreq_iss) {
+		dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
+			      "dreq_iss=%llu\n",
+			      (unsigned long long)
+			      DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			      (unsigned long long) dreq->dreq_iss);
+		goto drop;
+	}
+
+	if (dccp_parse_options(sk, dreq, skb))
+		 goto drop;
+
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+	if (child == NULL)
+		goto listen_overflow;
+
+	inet_csk_reqsk_queue_unlink(sk, req, prev);
+	inet_csk_reqsk_queue_removed(sk, req);
+	inet_csk_reqsk_queue_add(sk, req, child);
+out:
+	return child;
+listen_overflow:	/* syn_recv_sock() returned no child */
+	dccp_pr_debug("listen_overflow!\n");
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+drop:			/* send a Reset unless the packet is one, drop the request */
+	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
+		req->rsk_ops->send_reset(sk, skb);
+
+	inet_csk_reqsk_queue_drop(sk, req, prev);
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_check_req);
+
+/*
+ *  Queue segment on the new socket if the new socket is active,
+ *  otherwise we just shortcircuit this and continue with
+ *  the new socket.
+ */
+int dccp_child_process(struct sock *parent, struct sock *child,
+		       struct sk_buff *skb)
+{
+	const int prev = child->sk_state;
+	int rc = 0;
+
+	if (sock_owned_by_user(child)) {
+		/* The child was looked up in the main socket hash table,
+		 * where the lock on the listening socket does not protect
+		 * it, so it may be owned by the user: use the backlog.
+		 */
+		__sk_add_backlog(child, skb);
+	} else {
+		rc = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
+					    skb->len);
+
+		/* The child left RESPOND: wake up the parent, send SIGIO */
+		if (prev == DCCP_RESPOND && child->sk_state != prev)
+			parent->sk_data_ready(parent, 0);
+	}
+
+	bh_unlock_sock(child);
+	sock_put(child);
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(dccp_child_process);
+
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+			 struct request_sock *rsk)
+{	/* no Ack is built here: any call is reported through DCCP_BUG() */
+	DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
+}
+
+EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
+
+int dccp_reqsk_init(struct request_sock *req,
+		    struct dccp_sock const *dp, struct sk_buff const *skb)
+{
+	struct dccp_request_sock *dreq = dccp_rsk(req);
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+
+	inet_rsk(req)->rmt_port = dh->dccph_sport;
+	inet_rsk(req)->loc_port = dh->dccph_dport;
+	inet_rsk(req)->acked = 0;
+	dreq->dreq_timestamp_echo = 0;
+	/* the request starts with a copy of the listener's feature list */
+	return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
new file mode 100644
index 00000000..4b2ab657
--- /dev/null
+++ b/net/dccp/options.c
@@ -0,0 +1,640 @@
+/*
+ *  net/dccp/options.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *  Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+u64 dccp_decode_value_var(const u8 *bf, const u8 len)
+{
+	u64 result = 0;
+
+	/* Big-endian decode of at most six bytes, leading byte first. */
+	if (len >= DCCP_OPTVAL_MAXLEN)
+		result |= (u64)*bf++ << 40;
+	if (len >= 5)
+		result |= (u64)*bf++ << 32;
+	if (len >= 4)
+		result |= (u64)*bf++ << 24;
+	if (len >= 3)
+		result |= (u64)*bf++ << 16;
+	if (len >= 2)
+		result |= (u64)*bf++ << 8;
+	if (len >= 1)
+		result |= *bf;
+	return result;
+}
+
+/**
+ * dccp_parse_options  -  Parse DCCP options present in @skb
+ * @sk: client|server|listening dccp socket (when @dreq != NULL)
+ * @dreq: request socket to use during connection setup, or NULL
+ * @skb: received packet; on failure its control block gets the reset code
+ * Returns 0 if parsing finished (or was abandoned, RFC 4340, 5.8), else -1.
+ */
+int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+		       struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+	unsigned char *opt_ptr = options;
+	const unsigned char *opt_end = (unsigned char *)dh +
+					(dh->dccph_doff * 4);
+	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
+	unsigned char opt, len;
+	unsigned char *uninitialized_var(value);
+	u32 elapsed_time;
+	__be32 opt_val;
+	int rc;
+	int mandatory = 0;
+
+	memset(opt_recv, 0, sizeof(*opt_recv));
+	opt = len = 0;
+	while (opt_ptr < opt_end) {	/* stops even if options > opt_end */
+		opt   = *opt_ptr++;
+		len   = 0;
+		value = NULL;
+		/* Check if this isn't a single byte option */
+		if (opt > DCCPO_MAX_RESERVED) {
+			if (opt_ptr == opt_end)
+				goto out_nonsensical_length;
+
+			len = *opt_ptr++;
+			if (len < 2)
+				goto out_nonsensical_length;
+			/*
+			 * Remove the type and len fields, leaving
+			 * just the value size
+			 */
+			len	-= 2;
+			value	= opt_ptr;
+			opt_ptr += len;
+
+			if (opt_ptr > opt_end)
+				goto out_nonsensical_length;
+		}
+
+		/*
+		 * CCID-specific options are ignored during connection setup, as
+		 * negotiation may still be in progress (see RFC 4340, 10.3).
+		 * The same applies to Ack Vectors, as these depend on the CCID.
+		 */
+		if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
+		    opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
+			goto ignore_option;
+
+		switch (opt) {
+		case DCCPO_PADDING:
+			break;
+		case DCCPO_MANDATORY:
+			if (mandatory)
+				goto out_invalid_option;
+			if (pkt_type != DCCP_PKT_DATA)
+				mandatory = 1;
+			break;
+		case DCCPO_NDP_COUNT:
+			if (len > 6)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
+			dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
+				      (unsigned long long)opt_recv->dccpor_ndp);
+			break;
+		case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
+			if (pkt_type == DCCP_PKT_DATA)      /* RFC 4340, 6 */
+				break;
+			if (len == 0)
+				goto out_invalid_option;
+			rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
+						    *value, value + 1, len - 1);
+			if (rc)
+				goto out_featneg_failed;
+			break;
+		case DCCPO_TIMESTAMP:
+			if (len != 4)
+				goto out_invalid_option;
+			/*
+			 * RFC 4340 13.1: "The precise time corresponding to
+			 * Timestamp Value zero is not specified". We use
+			 * zero to indicate absence of a meaningful timestamp.
+			 */
+			opt_val = get_unaligned((__be32 *)value);
+			if (unlikely(opt_val == 0)) {
+				DCCP_WARN("Timestamp with zero value\n");
+				break;
+			}
+
+			if (dreq != NULL) {
+				dreq->dreq_timestamp_echo = ntohl(opt_val);
+				dreq->dreq_timestamp_time = dccp_timestamp();
+			} else {
+				opt_recv->dccpor_timestamp =
+					dp->dccps_timestamp_echo = ntohl(opt_val);
+				dp->dccps_timestamp_time = dccp_timestamp();
+			}
+			dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
+				      dccp_role(sk), ntohl(opt_val),
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+			/* schedule an Ack in case this sender is quiescent */
+			inet_csk_schedule_ack(sk);
+			break;
+		case DCCPO_TIMESTAMP_ECHO:
+			if (len != 4 && len != 6 && len != 8)
+				goto out_invalid_option;
+
+			opt_val = get_unaligned((__be32 *)value);
+			opt_recv->dccpor_timestamp_echo = ntohl(opt_val);
+
+			dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
+				      "ackno=%llu", dccp_role(sk),
+				      opt_recv->dccpor_timestamp_echo,
+				      len + 2,
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+
+			value += 4;
+
+			if (len == 4) {		/* no elapsed time included */
+				dccp_pr_debug_cat("\n");
+				break;
+			}
+
+			if (len == 6) {		/* 2-byte elapsed time */
+				__be16 opt_val2 = get_unaligned((__be16 *)value);
+				elapsed_time = ntohs(opt_val2);
+			} else {		/* 4-byte elapsed time */
+				opt_val = get_unaligned((__be32 *)value);
+				elapsed_time = ntohl(opt_val);
+			}
+
+			dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time);
+
+			/* Give precedence to the biggest ELAPSED_TIME */
+			if (elapsed_time > opt_recv->dccpor_elapsed_time)
+				opt_recv->dccpor_elapsed_time = elapsed_time;
+			break;
+		case DCCPO_ELAPSED_TIME:
+			if (dccp_packet_without_ack(skb))   /* RFC 4340, 13.2 */
+				break;
+
+			if (len == 2) {
+				__be16 opt_val2 = get_unaligned((__be16 *)value);
+				elapsed_time = ntohs(opt_val2);
+			} else if (len == 4) {
+				opt_val = get_unaligned((__be32 *)value);
+				elapsed_time = ntohl(opt_val);
+			} else {
+				goto out_invalid_option;
+			}
+
+			if (elapsed_time > opt_recv->dccpor_elapsed_time)
+				opt_recv->dccpor_elapsed_time = elapsed_time;
+
+			dccp_pr_debug("%s rx opt: ELAPSED_TIME=%u\n",
+				      dccp_role(sk), elapsed_time);
+			break;
+		case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
+			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
+						     pkt_type, opt, value, len))
+				goto out_invalid_option;
+			break;
+		case DCCPO_ACK_VECTOR_0:
+		case DCCPO_ACK_VECTOR_1:
+			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
+				break;
+			/*
+			 * Ack vectors are processed by the TX CCID if it is
+			 * interested. The RX CCID need not parse Ack Vectors,
+			 * since it is only interested in clearing old state.
+			 * Fall through.
+			 */
+		case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
+			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
+						     pkt_type, opt, value, len))
+				goto out_invalid_option;
+			break;
+		default:
+			DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
+				  "implemented, ignoring", sk, opt, len);
+			break;
+		}
+ignore_option:
+		if (opt != DCCPO_MANDATORY)
+			mandatory = 0;
+	}
+
+	/* mandatory was the last byte in option list -> reset connection */
+	if (mandatory)
+		goto out_invalid_option;
+
+out_nonsensical_length:
+	/* RFC 4340, 5.8: ignore option and all remaining option space */
+	return 0;
+
+out_invalid_option:
+	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
+	rc = DCCP_RESET_CODE_OPTION_ERROR;
+out_featneg_failed:
+	DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
+	DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
+	DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
+	return -1;
+}
+
+EXPORT_SYMBOL_GPL(dccp_parse_options);
+
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
+{
+	if (len >= DCCP_OPTVAL_MAXLEN)
+		*to++ = (u8)(value >> 40);
+	if (len >= 5)
+		*to++ = (u8)(value >> 32);
+	if (len >= 4)
+		*to++ = (u8)(value >> 24);
+	if (len >= 3)
+		*to++ = (u8)(value >> 16);
+	if (len >= 2)
+		*to++ = (u8)(value >> 8);
+	if (len >= 1)
+		*to++ = (u8)value;	/* least significant byte goes last */
+}
+
+static inline u8 dccp_ndp_len(const u64 ndp)
+{
+	if (unlikely(ndp > 0xFF))	/* needs 2, 4 or 6 bytes */
+		return likely(ndp <= USHRT_MAX) ? 2 : (ndp > UINT_MAX ? 6 : 4);
+	return 1;
+}
+
+int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
+		       const void *value, const unsigned char len)
+{
+	unsigned char *to;
+
+	/* Type and length octets count too: the length byte holds len + 2. */
+	if (len > DCCP_SINGLE_OPT_MAXLEN ||
+	    DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
+		return -1;	/* length byte would wrap, or no space left */
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
+	to    = skb_push(skb, len + 2);
+	*to++ = option;
+	*to++ = len + 2;
+	memcpy(to, value, len);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option);
+
+static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	const u64 count = dp->dccps_ndp_count;	/* value before the update */
+	unsigned char *to;
+	int val_len, opt_len;
+
+	/* Non-data packets bump the counter, anything else restarts it. */
+	if (dccp_non_data_packet(skb))
+		++dp->dccps_ndp_count;
+	else
+		dp->dccps_ndp_count = 0;
+
+	if (count == 0)		/* nothing to report */
+		return 0;
+
+	val_len = dccp_ndp_len(count);
+	opt_len = val_len + 2;
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN)
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+	to = skb_push(skb, opt_len);
+	*to++ = DCCPO_NDP_COUNT;
+	*to++ = opt_len;
+	dccp_encode_value_var(count, to, val_len);
+	return 0;
+}
+
+static inline int dccp_elapsed_time_len(const u32 elapsed_time)
+{
+	return !elapsed_time ? 0 : (elapsed_time > 0xFFFF ? 4 : 2);
+}
+
+/* FIXME: This function is currently not used anywhere */
+int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
+{
+	const int val_len = dccp_elapsed_time_len(elapsed_time);
+	const int opt_len = val_len + 2;	/* plus type and length octets */
+	unsigned char *ptr;
+	__be16 be16;
+	__be32 be32;
+
+	if (val_len == 0)		/* a zero elapsed time is not sent */
+		return 0;
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN)
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+	ptr = skb_push(skb, opt_len);
+	ptr[0] = DCCPO_ELAPSED_TIME;
+	ptr[1] = opt_len;
+
+	/* Value in network byte order: 16 bits when it fits, else 32 bits. */
+	if (val_len != 2) {
+		be32 = htonl(elapsed_time);
+		memcpy(ptr + 2, &be32, 4);
+	} else {
+		be16 = htons((u16)elapsed_time);
+		memcpy(ptr + 2, &be16, 2);
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
+
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
+{
+	/* Overflow is intended: a 32-bit timer in units of 10 usec wraps
+	 * around every 11.9 hours. */
+	const __be32 ts = htonl(dccp_timestamp());
+
+	return dccp_insert_option(skb, DCCPO_TIMESTAMP, &ts, sizeof(ts));
+}
+
+static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
+					     struct dccp_request_sock *dreq,
+					     struct sk_buff *skb)
+{
+	__be32 echo_be;
+	unsigned char *ptr;
+	u32 held_for, held_len, opt_len;
+
+	/* Take the pending echo from @dp or @dreq and mark it as consumed. */
+	if (dreq == NULL) {
+		held_for = dccp_timestamp() - dp->dccps_timestamp_time;
+		echo_be  = htonl(dp->dccps_timestamp_echo);
+		dp->dccps_timestamp_echo = 0;
+	} else {
+		held_for = dccp_timestamp() - dreq->dreq_timestamp_time;
+		echo_be  = htonl(dreq->dreq_timestamp_echo);
+		dreq->dreq_timestamp_echo = 0;
+	}
+
+	/* type + length + 4-byte echo, then 0, 2 or 4 bytes of elapsed time */
+	held_len = dccp_elapsed_time_len(held_for);
+	opt_len  = 6 + held_len;
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN)
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+
+	ptr    = skb_push(skb, opt_len);
+	*ptr++ = DCCPO_TIMESTAMP_ECHO;
+	*ptr++ = opt_len;
+	memcpy(ptr, &echo_be, 4);
+	ptr += 4;
+
+	if (held_len == 4) {
+		const __be32 be32 = htonl(held_for);
+		memcpy(ptr, &be32, 4);
+	} else if (held_len == 2) {
+		const __be16 be16 = htons((u16)held_for);
+		memcpy(ptr, &be16, 2);
+	}
+
+	return 0;
+}
+
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+	const u16 buflen = dccp_ackvec_buflen(av);
+	/* Figure out how many options we need to represent the ackvec */
+	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+	u16 len = buflen + 2 * nr_opts;	/* data plus type/len per option */
+	u8 i, nonce = 0;
+	const unsigned char *tail, *from;
+	unsigned char *to;
+
+	if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+		DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+			  dccp_packet_name(dcb->dccpd_type));
+		return -1;
+	}
+	/*
+	 * Since Ack Vectors are variable-length, we can not always predict
+	 * their size. To catch exception cases where the space is running out
+	 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+	 */
+	if (len > DCCPAV_MIN_OPTLEN &&
+	    len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+		DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+			  "MPS=%u ==> reduce payload size?\n", len, skb->len,
+			  dcb->dccpd_opt_len, dp->dccps_mss_cache);
+		dp->dccps_sync_scheduled = 1;
+		return 0;
+	}
+	dcb->dccpd_opt_len += len;
+
+	to   = skb_push(skb, len);
+	len  = buflen;			/* from here on: bytes left to copy */
+	from = av->av_buf + av->av_buf_head;
+	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+
+	for (i = 0; i < nr_opts; ++i) {
+		int copylen = len;
+
+		if (len > DCCP_SINGLE_OPT_MAXLEN)
+			copylen = DCCP_SINGLE_OPT_MAXLEN;
+
+		/*
+		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+		 * its type; ack_nonce is the sum of all individual buf_nonce's.
+		 */
+		nonce ^= av->av_buf_nonce[i];
+
+		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+		*to++ = copylen + 2;
+
+		/* Chunk runs past the buffer end: copy the tail, then wrap */
+		if (from + copylen > tail) {
+			const u16 tailsize = tail - from;
+
+			memcpy(to, from, tailsize);
+			to	+= tailsize;
+			len	-= tailsize;
+			copylen	-= tailsize;
+			from	= av->av_buf;
+		}
+
+		memcpy(to, from, copylen);
+		from += copylen;
+		to   += copylen;
+		len  -= copylen;
+	}
+	/*
+	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+	 */
+	if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
+		return -ENOBUFS;
+	return 0;
+}
+
+/**
+ * dccp_insert_option_mandatory  -  Mandatory option (5.8.2)
+ * Note that since we are using skb_push, this function needs to be called
+ * _after_ inserting the option it is supposed to influence (stack order).
+ */
+int dccp_insert_option_mandatory(struct sk_buff *skb)
+{
+	/* The single option octet must still fit into the option space. */
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + 1 > DCCP_MAX_OPT_LEN)
+		return -1;
+	DCCP_SKB_CB(skb)->dccpd_opt_len += 1;
+	*skb_push(skb, 1) = DCCPO_MANDATORY;
+	return 0;
+}
+
+/**
+ * dccp_insert_fn_opt  -  Insert single Feature-Negotiation option into @skb
+ * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
+ * @feat: one out of %dccp_feature_numbers
+ * @val: NN value or SP array (preferred element first) to copy
+ * @len: true length of @val in bytes (excluding first element repetition)
+ * @repeat_first: whether to copy the first element of @val twice
+ * The last argument is used to construct Confirm options, where the preferred
+ * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
+ * lists are kept such that the preferred entry is always first, so we only need
+ * to copy twice, and avoid the overhead of cloning into a bigger array.
+ */
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+		       u8 *val, u8 len, bool repeat_first)
+{
+	u8 *ptr, opt_len;
+
+	/* take the `Feature' field and possible repetition into account */
+	if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
+		DCCP_WARN("length %u for feature %u too large\n", len, feat);
+		return -1;
+	}
+
+	if (unlikely(val == NULL || len == 0)) {
+		repeat_first = false;
+		len = 0;
+	}
+	opt_len = 3 + repeat_first + len;	/* type, length, feature + value */
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + opt_len > DCCP_MAX_OPT_LEN) {
+		DCCP_WARN("packet too small for feature %d option!\n", feat);
+		return -1;
+	}
+	DCCP_SKB_CB(skb)->dccpd_opt_len += opt_len;
+
+	ptr    = skb_push(skb, opt_len);
+	*ptr++ = type;
+	*ptr++ = opt_len;
+	*ptr++ = feat;
+	if (repeat_first)
+		*ptr++ = *val;
+	if (len != 0)
+		memcpy(ptr, val, len);
+	return 0;
+}
+
+/* The length of all options needs to be a multiple of 4 (5.8) */
+static void dccp_insert_option_padding(struct sk_buff *skb)
+{
+	const int excess = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
+	const int fill = 4 - excess;
+
+	if (excess == 0)	/* already a multiple of 4 */
+		return;
+	memset(skb_push(skb, fill), 0, fill);
+	DCCP_SKB_CB(skb)->dccpd_opt_len += fill;
+}
+
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+
+	dcb->dccpd_opt_len = 0;		/* options get prepended one by one */
+
+	if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
+		return -1;
+
+	if (dcb->dccpd_type != DCCP_PKT_DATA) {
+		/* Feature Negotiation */
+		if (dccp_feat_insert_opts(dp, NULL, skb))
+			return -1;
+
+		/*
+		 * A Request carries a timestamp (RTT sample from the
+		 * Request/Response exchange, used for TFRC initialisation);
+		 * other packet types carry a pending Ack Vector instead.
+		 */
+		if (dcb->dccpd_type == DCCP_PKT_REQUEST) {
+			if (dccp_insert_option_timestamp(skb))
+				return -1;
+		} else if (dccp_ackvec_pending(sk)) {
+			if (dccp_insert_option_ackvec(sk, skb))
+				return -1;
+		}
+	}
+
+	if (dp->dccps_hc_rx_insert_options) {
+		if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
+			return -1;
+		dp->dccps_hc_rx_insert_options = 0;
+	}
+
+	if (dp->dccps_timestamp_echo != 0 &&
+	    dccp_insert_option_timestamp_echo(dp, NULL, skb))
+		return -1;
+
+	dccp_insert_option_padding(skb);
+	return 0;
+}
+
+int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
+{
+	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+
+	/* Feature negotiation first, then a timestamp so that the
+	 * Response/Ack exchange yields an RTT sample (used by TFRC). */
+	if (dccp_feat_insert_opts(NULL, dreq, skb) ||
+	    dccp_insert_option_timestamp(skb))
+		return -1;
+
+	/* Echo the timestamp stored in the request, if one is pending. */
+	if (dreq->dreq_timestamp_echo &&
+	    dccp_insert_option_timestamp_echo(NULL, dreq, skb))
+		return -1;
+
+	dccp_insert_option_padding(skb);
+	return 0;
+}
diff --git a/net/dccp/output.c b/net/dccp/output.c
new file mode 100644
index 00000000..fab108e5
--- /dev/null
+++ b/net/dccp/output.c
@@ -0,0 +1,697 @@
+/*
+ *  net/dccp/output.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/inet_sock.h>
+#include <net/sock.h>
+
+#include "ackvec.h"
+#include "ccid.h"
+#include "dccp.h"
+
+static inline void dccp_event_ack_sent(struct sock *sk)
+{
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);	/* Ack already sent */
+}
+
+static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+	skb_set_owner_w(skb, sk);	/* set owner/destructor */
+	WARN_ON(sk->sk_send_head);	/* slot should be empty */
+	sk->sk_send_head = skb;		/* kept for retransmission */
+}
+
+/*
+ * Build the DCCP header in front of a headerless @skb and hand it to the
+ * AF-specific queue_xmit(). A NULL @skb (failed clone) yields -ENOBUFS, an
+ * option-insertion failure frees @skb and yields -EPROTO; otherwise the
+ * result of queue_xmit(), filtered through net_xmit_eval(), is returned.
+ */
+static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (likely(skb != NULL)) {
+		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		struct dccp_sock *dp = dccp_sk(sk);
+		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+		struct dccp_hdr *dh;
+		/* XXX For now we're using only 48 bits sequence numbers */
+		const u32 dccp_header_size = sizeof(*dh) +
+					     sizeof(struct dccp_hdr_ext) +
+					  dccp_packet_hdr_len(dcb->dccpd_type);
+		int err, set_ack = 1;
+		u64 ackno = dp->dccps_gsr;
+		/*
+		 * Increment GSS here already in case the option code needs it.
+		 * Update GSS for real only if option processing below succeeds.
+		 */
+		dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
+
+		switch (dcb->dccpd_type) {
+		case DCCP_PKT_DATA:
+			set_ack = 0;
+			/* fall through */
+		case DCCP_PKT_DATAACK:
+		case DCCP_PKT_RESET:
+			break;
+
+		case DCCP_PKT_REQUEST:
+			set_ack = 0;
+			/* Use ISS on the first (non-retransmitted) Request. */
+			if (icsk->icsk_retransmits == 0)
+				dcb->dccpd_seq = dp->dccps_iss;
+			/* fall through */
+
+		case DCCP_PKT_SYNC:
+		case DCCP_PKT_SYNCACK:
+			ackno = dcb->dccpd_ack_seq;
+			/* fall through */
+		default:
+			/*
+			 * Set owner/destructor: some skbs are allocated via
+			 * alloc_skb (e.g. when retransmission may happen).
+			 * Only Data, DataAck, and Reset packets should come
+			 * through here with skb->sk set.
+			 */
+			WARN_ON(skb->sk);
+			skb_set_owner_w(skb, sk);
+			break;
+		}
+
+		if (dccp_insert_options(sk, skb)) {
+			kfree_skb(skb);
+			return -EPROTO;
+		}
+
+
+		/* Build DCCP header and checksum it. */
+		dh = dccp_zeroed_hdr(skb, dccp_header_size);
+		dh->dccph_type	= dcb->dccpd_type;
+		dh->dccph_sport	= inet->inet_sport;
+		dh->dccph_dport	= inet->inet_dport;
+		dh->dccph_doff	= (dccp_header_size + dcb->dccpd_opt_len) / 4;
+		dh->dccph_ccval	= dcb->dccpd_ccval;
+		dh->dccph_cscov = dp->dccps_pcslen;
+		/* XXX For now we're using only 48 bits sequence numbers */
+		dh->dccph_x	= 1;
+
+		dccp_update_gss(sk, dcb->dccpd_seq);
+		dccp_hdr_set_seq(dh, dp->dccps_gss);
+		if (set_ack)
+			dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
+
+		switch (dcb->dccpd_type) {
+		case DCCP_PKT_REQUEST:
+			dccp_hdr_request(skb)->dccph_req_service =
+							dp->dccps_service;
+			/*
+			 * Limit Ack window to ISS <= P.ackno <= GSS, so that
+			 * only Responses to Requests we sent are considered.
+			 */
+			dp->dccps_awl = dp->dccps_iss;
+			break;
+		case DCCP_PKT_RESET:
+			dccp_hdr_reset(skb)->dccph_reset_code =
+							dcb->dccpd_reset_code;
+			break;
+		}
+
+		icsk->icsk_af_ops->send_check(sk, skb);
+
+		if (set_ack)
+			dccp_event_ack_sent(sk);
+
+		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+
+		err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
+		return net_xmit_eval(err);
+	}
+	return -ENOBUFS;
+}
+
+/**
+ * dccp_determine_ccmps  -  Look up the packet-size limit of the sender CCID
+ * Only the TX CCID matters for the CCMPS (RFC 4340, 14.): the RX CCID emits
+ * just feedback (Acks), which is small next to data traffic. Returns 0 for
+ * "no current CCMPS", i.e. when no TX CCID or no CCID ops are attached.
+ */
+static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
+{
+	const struct ccid *hc_tx = dp->dccps_hc_tx_ccid;
+
+	if (hc_tx != NULL && hc_tx->ccid_ops != NULL)
+		return hc_tx->ccid_ops->ccid_ccmps;
+	return 0;
+}
+
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 ccmps = dccp_determine_ccmps(dp);
+	u32 mps = pmtu;
+
+	/* A non-zero CCMPS caps the packet size below the path MTU */
+	if (ccmps != 0 && ccmps < pmtu)
+		mps = ccmps;
+
+	/* Subtract network header, extension headers and DCCP header size */
+	mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
+		sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
+
+	/*
+	 * Reserve room for the options that can ride on DCCP-Data packets
+	 * (RFC 4340, 5.8, table 3); bulkier ones such as a full Ack Vector
+	 * (up to 255 bytes) are better carried by a separate Ack:
+	 *   1 byte Slow Receiver (11.6), 6 bytes Timestamp (13.1),
+	 *   10 bytes Timestamp Echo (13.3), 8 bytes NDP count (7.7, if on),
+	 *   6 bytes Data Checksum (9.3), and %DCCPAV_MIN_OPTLEN bytes of
+	 *   Ack Vector (11.4, if enabled) - rounded up to a multiple of 4.
+	 */
+	mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+		       (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
+
+	/* Cache both the MTU this was derived from and the result */
+	dp->dccps_mss_cache = mps;
+	icsk->icsk_pmtu_cookie = pmtu;
+
+	return mps;
+}
+
+EXPORT_SYMBOL_GPL(dccp_sync_mss);
+
+void dccp_write_space(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();		/* sk->sk_wq is read via RCU */
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible(&wq->wait);
+	/* Async wakeup should agree with poll, otherwise some programs break */
+	if (sock_writeable(sk))
+		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+
+	rcu_read_unlock();
+}
+
+/**
+ * dccp_wait_for_ccid  -  Sleep in process context until the CCID may send
+ * @sk:    socket to wait for (its lock is dropped while sleeping)
+ * @delay: timeout in jiffies
+ * Returns the unused part of @delay, or -1 on pending signal or socket error.
+ */
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
+{
+	DEFINE_WAIT(wait);
+	long remaining;
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	sk->sk_write_pending++;
+	release_sock(sk);
+
+	remaining = schedule_timeout(delay);
+
+	lock_sock(sk);
+	sk->sk_write_pending--;
+	finish_wait(sk_sleep(sk), &wait);
+
+	if (signal_pending(current) || sk->sk_err)
+		return -1;
+	return remaining;
+}
+
+/**
+ * dccp_xmit_packet  -  Send data packet under control of CCID
+ * Pops the next skb via the queueing policy, sends it as Data or DataAck, and
+ * tells the TX CCID its length - even if the transmission itself failed.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+	int err, len;
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb = dccp_qpolicy_pop(sk);
+
+	if (unlikely(skb == NULL))
+		return;
+	len = skb->len;
+
+	if (sk->sk_state == DCCP_PARTOPEN) {
+		const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+		/*
+		 * See 8.1.5 - Handshake Completion.
+		 *
+		 * For robustness we resend Confirm options until the client has
+		 * entered OPEN. During the initial feature negotiation, the MPS
+		 * is smaller than usual, reduced by the Change/Confirm options.
+		 */
+		if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+			DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+			dccp_send_ack(sk);
+			dccp_feat_list_purge(&dp->dccps_featneg);
+		}
+
+		inet_csk_schedule_ack(sk);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					      inet_csk(sk)->icsk_rto,
+					      DCCP_RTO_MAX);
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+	} else if (dccp_ack_pending(sk)) {
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+	} else {
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
+	}
+
+	err = dccp_transmit_skb(sk, skb);
+	if (err)
+		dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+	/*
+	 * Register this one as sent even if an error occurred. To the remote
+	 * end a local packet drop is indistinguishable from network loss, i.e.
+	 * any local drop will eventually be reported via receiver feedback.
+	 */
+	ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+	/*
+	 * If the CCID needs to transfer additional header options out-of-band
+	 * (e.g. Ack Vectors or feature-negotiation options), it activates this
+	 * flag to schedule a Sync. The Sync will automatically incorporate all
+	 * currently pending header options, thus clearing the backlog.
+	 */
+	if (dp->dccps_sync_scheduled)
+		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+}
+
+/**
+ * dccp_flush_write_queue  -  Drain queue at end of connection
+ * @sk:          socket whose write queue is drained
+ * @time_budget: grace period in jiffies; reduced by the time spent waiting
+ * dccp_sendmsg queues packets without waiting for them to be sent, so the TX
+ * queue may be non-empty at connection end. Whatever is left is purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	long delay, rc;
+
+	while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+		switch (ccid_packet_dequeue_eval(rc)) {
+		case CCID_PACKET_WILL_DEQUEUE_LATER:
+			/*
+			 * If the CCID determines when to send, the next sending
+			 * time is unknown or the CCID may not even send again
+			 * (e.g. remote host crashes or lost Ack packets).
+			 */
+			DCCP_WARN("CCID did not manage to send all packets\n");
+			return;
+		case CCID_PACKET_DELAY:
+			delay = msecs_to_jiffies(rc);
+			if (delay > *time_budget)
+				return;
+			rc = dccp_wait_for_ccid(sk, delay);
+			if (rc < 0)
+				return;
+			*time_budget -= (delay - rc);
+			/* check again if we can send now */
+			break;
+		case CCID_PACKET_SEND_AT_ONCE:
+			dccp_xmit_packet(sk);
+			break;
+		case CCID_PACKET_ERR:
+			skb_dequeue(&sk->sk_write_queue);
+			kfree_skb(skb);
+			dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+		}
+	}
+}
+
+void dccp_write_xmit(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+	for (; skb != NULL; skb = dccp_qpolicy_top(sk)) {
+		int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+		switch (ccid_packet_dequeue_eval(rc)) {
+		case CCID_PACKET_SEND_AT_ONCE:
+			dccp_xmit_packet(sk);
+			break;
+		case CCID_PACKET_DELAY:	/* rc holds the delay in msecs */
+			sk_reset_timer(sk, &dp->dccps_xmit_timer,
+				       jiffies + msecs_to_jiffies(rc));
+			/* fall through */
+		case CCID_PACKET_WILL_DEQUEUE_LATER:
+			return;
+		case CCID_PACKET_ERR:	/* rc holds the error; drop the packet */
+			dccp_qpolicy_drop(sk, skb);
+			dccp_pr_debug("packet discarded due to err=%d\n", rc);
+		}
+	}
+}
+
+/**
+ * dccp_retransmit_skb  -  Retransmit Request, Close, or CloseReq packets
+ * DCCP knows four retransmittable packet types, three of them handled here:
+ * - Request  in client-REQUEST  state (sec. 8.1.1),
+ * - CloseReq in server-CLOSEREQ state (sec. 8.3),
+ * - Close    in   node-CLOSING  state (sec. 8.3),
+ * Acks in client-PARTOPEN state (sec. 8.1.5) are left to dccp_delack_timer().
+ * A clone of sk->sk_send_head, which must hold the original skb, is sent.
+ */
+int dccp_retransmit_skb(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	WARN_ON(sk->sk_send_head == NULL);
+	if (icsk->icsk_af_ops->rebuild_header(sk))
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	/* this count is used to distinguish original and retransmitted skb */
+	icsk->icsk_retransmits++;
+	return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
+}
+
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+				   struct request_sock *req)
+{
+	struct dccp_hdr *dh;
+	struct dccp_request_sock *dreq;
+	const u32 dccp_header_size = sizeof(struct dccp_hdr) +
+				     sizeof(struct dccp_hdr_ext) +
+				     sizeof(struct dccp_hdr_response);
+	struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
+					   GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+
+	skb_dst_set(skb, dst_clone(dst));
+
+	dreq = dccp_rsk(req);
+	if (inet_rsk(req)->acked)	/* increase ISS upon retransmission */
+		dccp_inc_seqno(&dreq->dreq_iss);
+	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
+	DCCP_SKB_CB(skb)->dccpd_seq  = dreq->dreq_iss;
+	/* Both steps below can fail; the skb is then freed and NULL returned. */
+	/* Resolve feature dependencies resulting from choice of CCID */
+	if (dccp_feat_server_ccid_dependencies(dreq))
+		goto response_failed;
+
+	if (dccp_insert_options_rsk(dreq, skb))
+		goto response_failed;
+
+	/* Build and checksum header */
+	dh = dccp_zeroed_hdr(skb, dccp_header_size);
+
+	dh->dccph_sport	= inet_rsk(req)->loc_port;
+	dh->dccph_dport	= inet_rsk(req)->rmt_port;
+	dh->dccph_doff	= (dccp_header_size +
+			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+	dh->dccph_type	= DCCP_PKT_RESPONSE;
+	dh->dccph_x	= 1;
+	dccp_hdr_set_seq(dh, dreq->dreq_iss);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr);
+	dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
+
+	dccp_csum_outgoing(skb);
+
+	/* We use `acked' to remember that a Response was already sent. */
+	inet_rsk(req)->acked = 1;
+	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+	return skb;
+response_failed:
+	kfree_skb(skb);
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(dccp_make_response);
+
+/* answer offending packet in @rcv_skb with Reset from control socket @sk */
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
+{
+	struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
+	const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+				       sizeof(struct dccp_hdr_ext) +
+				       sizeof(struct dccp_hdr_reset);
+	struct dccp_hdr_reset *dhr;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	skb_reserve(skb, sk->sk_prot->max_header);
+
+	/* The reply swaps the source and destination port of @rcv_skb. */
+	dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
+	dh->dccph_type	= DCCP_PKT_RESET;
+	dh->dccph_sport	= rxdh->dccph_dport;
+	dh->dccph_dport	= rxdh->dccph_sport;
+	dh->dccph_doff	= dccp_hdr_reset_len / 4;
+	dh->dccph_x	= 1;
+
+	dhr = dccp_hdr_reset(skb);
+	dhr->dccph_reset_code = dcb->dccpd_reset_code;
+
+	switch (dcb->dccpd_reset_code) {
+	case DCCP_RESET_CODE_PACKET_ERROR:
+		dhr->dccph_reset_data[0] = rxdh->dccph_type;
+		break;
+	case DCCP_RESET_CODE_OPTION_ERROR:	/* fall through */
+	case DCCP_RESET_CODE_MANDATORY_ERROR:
+		memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3);
+		break;
+	}
+	/*
+	 * From RFC 4340, 8.3.1:
+	 *   If P.ackno exists, set R.seqno := P.ackno + 1.
+	 *   Else set R.seqno := 0.
+	 */
+	if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1));
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq);
+
+	dccp_csum_outgoing(skb);
+	return skb;
+}
+
+EXPORT_SYMBOL_GPL(dccp_ctl_make_reset);
+
+/* Send a Reset carrying @code on an established socket (close or abort). */
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
+{
+	struct dccp_skb_cb *dcb;
+	struct sk_buff *skb;
+	int err;
+
+	/* FIXME: is a rebuild_header needed here, and what if it fails? */
+	err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
+	if (err)
+		return err;
+
+	skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC);
+	if (!skb)
+		return -ENOBUFS;
+
+	/* Leave headroom for the headers, then fill in the control block. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+	dcb = DCCP_SKB_CB(skb);
+	dcb->dccpd_type	      = DCCP_PKT_RESET;
+	dcb->dccpd_reset_code = code;
+
+	return dccp_transmit_skb(sk, skb);
+}
+
+/*
+ * AF-independent connect setup: send the Request and arm its retransmit timer.
+ */
+int dccp_connect(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	sk->sk_err = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+
+	dccp_sync_mss(sk, dst_mtu(dst));
+
+	/* do not connect if feature negotiation setup fails */
+	if (dccp_feat_finalise_settings(dccp_sk(sk)))
+		return -EPROTO;
+
+	/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+	dp->dccps_gar = dp->dccps_iss;
+
+	skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
+	if (unlikely(skb == NULL))
+		return -ENOBUFS;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+
+	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
+
+	dccp_skb_entail(sk, skb);
+	dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
+
+	/* Timer for repeating the REQUEST until an answer. */
+	icsk->icsk_retransmits = 0;
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  icsk->icsk_rto, DCCP_RTO_MAX);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_connect);
+
+void dccp_send_ack(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	/* If we have been reset, we may not send again. */
+	if (sk->sk_state == DCCP_CLOSED)
+		return;
+
+	skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
+	if (!skb) {
+		/* No memory: schedule the Ack and arm the DACK timer. */
+		inet_csk_schedule_ack(sk);
+		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  TCP_DELACK_MAX, DCCP_RTO_MAX);
+		return;
+	}
+
+	skb_reserve(skb, sk->sk_prot->max_header);	/* header room */
+	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
+	dccp_transmit_skb(sk, skb);
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_ack);
+
+#if 0
+/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
+void dccp_send_delayed_ack(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	/*
+	 * FIXME: tune this timer. elapsed time fixes the skew, so no problem
+	 * with using 2s, and active senders also piggyback the ACK into a
+	 * DATAACK packet, so this is really for quiescent senders.
+	 */
+	unsigned long timeout = jiffies + 2 * HZ;
+
+	/* Use the new timeout only if there wasn't an older, earlier one. */
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+		/* If delack timer was blocked or is about to expire,
+		 * send ACK now.
+		 *
+		 * FIXME: check the "about to expire" part
+		 */
+		if (icsk->icsk_ack.blocked) {
+			dccp_send_ack(sk);
+			return;
+		}
+
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;
+	}
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+#endif
+
+/* Send a @pkt_type packet (e.g. DCCP_PKT_SYNC) that acknowledges @ackno. */
+void dccp_send_sync(struct sock *sk, const u64 ackno,
+		    const enum dccp_pkt_type pkt_type)
+{
+	struct dccp_skb_cb *dcb;
+	struct sk_buff *skb;
+
+	/*
+	 * The skb never sits on the write queue; socket ownership is
+	 * assigned to it by dccp_transmit_skb().
+	 */
+	skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
+	if (!skb) {
+		/* FIXME: how to make sure the sync is sent? */
+		DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type));
+		return;
+	}
+
+	/* Leave headroom for the headers, then set type and ack number. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+	dcb = DCCP_SKB_CB(skb);
+	dcb->dccpd_type    = pkt_type;
+	dcb->dccpd_ack_seq = ackno;
+
+	/* A Sync scheduled for out-of-band data (long Ack Vector) is served */
+	dccp_sk(sk)->dccps_sync_scheduled = 0;
+
+	dccp_transmit_skb(sk, skb);
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_sync);
+
+/*
+ * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
+ * should not fail to queue the frame, yet an skb allocation failure makes it
+ * return silently. @active selects GFP_KERNEL and arms the retransmit timer.
+ */
+void dccp_send_close(struct sock *sk, const int active)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
+
+	skb = alloc_skb(sk->sk_prot->max_header, prio);
+	if (skb == NULL)
+		return;
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+	if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
+	else
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
+
+	if (active) {
+		dccp_skb_entail(sk, skb);
+		dccp_transmit_skb(sk, skb_clone(skb, prio));
+		/*
+		 * Retransmission timer for active-close: RFC 4340, 8.3 requires
+		 * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
+		 * state can be left. The initial timeout is 2 RTTs.
+		 * Since RTT measurement is done by the CCIDs, there is no easy
+		 * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
+		 * is too low (200ms); we use a high value to avoid unnecessary
+		 * retransmissions when the link RTT is > 0.2 seconds.
+		 * FIXME: Let main module sample RTTs and use that instead.
+		 */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
+	} else
+		dccp_transmit_skb(sk, skb);
+}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
new file mode 100644
index 00000000..33d0e629
--- /dev/null
+++ b/net/dccp/probe.c
@@ -0,0 +1,198 @@
+/*
+ * dccp_probe - Observe the DCCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * Modified for DCCP from Stephen Hemminger's code
+ * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/dccp.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/vmalloc.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+
+#include "dccp.h"
+#include "ccid.h"
+#include "ccids/ccid3.h"
+
+static int port;			/* port to match, 0 = all */
+
+static int bufsize = 64 * 1024;		/* log buffer size */
+
+static const char procname[] = "dccpprobe";
+
+static struct {
+	struct kfifo	  fifo;		/* log data awaiting a reader */
+	spinlock_t	  lock;		/* serialises fifo access */
+	wait_queue_head_t wait;		/* readers sleep here for data */
+	struct timespec	  tstart;	/* time of the last open() */
+} dccpw;
+
+/* Queue one log line, prefixed with the time since open(), for readers. */
+static void printl(const char *fmt, ...)
+{
+	va_list args;
+	int len;
+	struct timespec now;
+	char tbuf[256];
+
+	getnstimeofday(&now);
+	now = timespec_sub(now, dccpw.tstart);
+	/* Bounded write of the prefix; the message fills what is left. */
+	len = scnprintf(tbuf, sizeof(tbuf), "%lu.%06lu ",
+			(unsigned long) now.tv_sec,
+			(unsigned long) now.tv_nsec / NSEC_PER_USEC);
+	va_start(args, fmt);
+	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
+	va_end(args);
+
+	kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+	wake_up(&dccpw.wait);
+}
+
+/* jprobe handler for dccp_sendmsg(): log one line per matching send call. */
+static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+			 struct msghdr *msg, size_t size)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct ccid3_hc_tx_sock *hc = NULL;
+
+	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+		hc = ccid3_hc_tx_sk(sk);
+	/* @size is a size_t and must be printed with %zu, not %d. */
+	if (port == 0 || ntohs(inet->inet_dport) == port ||
+	    ntohs(inet->inet_sport) == port) {
+		if (hc)
+			printl("%pI4:%u %pI4:%u %zu %d %d %d %u %llu %llu %d\n",
+			       &inet->inet_saddr, ntohs(inet->inet_sport),
+			       &inet->inet_daddr, ntohs(inet->inet_dport), size,
+			       hc->tx_s, hc->tx_rtt, hc->tx_p,
+			       hc->tx_x_calc, hc->tx_x_recv >> 6,
+			       hc->tx_x >> 6, hc->tx_t_ipi);
+		else
+			printl("%pI4:%u %pI4:%u %zu\n",
+			       &inet->inet_saddr, ntohs(inet->inet_sport),
+			       &inet->inet_daddr, ntohs(inet->inet_dport),
+			       size);
+	}
+	jprobe_return();
+	return 0;
+}
+
+static struct jprobe dccp_send_probe = {
+	.kp	= {
+		.symbol_name = "dccp_sendmsg",	/* function being probed */
+	},
+	.entry	= jdccp_sendmsg,	/* handler run on its entry */
+};
+
+static int dccpprobe_open(struct inode *inode, struct file *file)
+{
+	kfifo_reset(&dccpw.fifo);	/* start with an empty log */
+	getnstimeofday(&dccpw.tstart);	/* printl() stamps relative to this */
+	return 0;
+}
+
+static ssize_t dccpprobe_read(struct file *file, char __user *buf,
+			      size_t len, loff_t *ppos)
+{
+	int error = 0, cnt = 0;
+	unsigned char *tbuf;
+
+	if (!buf)
+		return -EINVAL;
+
+	if (len == 0)
+		return 0;
+	/* Bounce buffer: the fifo is drained into it before the user copy. */
+	tbuf = vmalloc(len);
+	if (!tbuf)
+		return -ENOMEM;
+	/* Sleep (interruptibly) until the fifo holds some log data. */
+	error = wait_event_interruptible(dccpw.wait,
+					 kfifo_len(&dccpw.fifo) != 0);
+	if (error)
+		goto out_free;
+
+	cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
+
+out_free:
+	vfree(tbuf);
+	/* Error code if there was one, else the number of bytes read. */
+	return error ? error : cnt;
+}
+
+static const struct file_operations dccpprobe_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = dccpprobe_open,	/* resets log and start time */
+	.read    = dccpprobe_read,	/* blocks until data arrives */
+	.llseek  = noop_llseek,
+};
+
+static __init int dccpprobe_init(void)
+{
+	int ret = -ENOMEM;	/* returned if fifo or proc entry setup fails */
+
+	init_waitqueue_head(&dccpw.wait);
+	spin_lock_init(&dccpw.lock);
+	if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
+		return ret;
+	if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &dccpprobe_fops))
+		goto err0;
+	/* If the probe cannot be registered, load "dccp" and try once more. */
+	try_then_request_module((ret = register_jprobe(&dccp_send_probe)) == 0,
+				"dccp");
+	if (ret)
+		goto err1;
+
+	pr_info("DCCP watch registered (port=%d)\n", port);
+	return 0;
+err1:
+	proc_net_remove(&init_net, procname);
+err0:
+	kfifo_free(&dccpw.fifo);
+	return ret;
+}
+module_init(dccpprobe_init);
+
+static __exit void dccpprobe_exit(void)
+{
+	/* Reverse of init: stop the probe and readers, then free the fifo. */
+	unregister_jprobe(&dccp_send_probe);
+	proc_net_remove(&init_net, procname);
+	kfifo_free(&dccpw.fifo);
+}
+module_exit(dccpprobe_exit);
+
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);	/* perm 0: not exposed in sysfs */
+
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);	/* perm 0: not exposed in sysfs */
+
+MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
+MODULE_DESCRIPTION("DCCP snooper");
+MODULE_LICENSE("GPL");
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
new file mode 100644
index 00000000..152975d9
--- /dev/null
+++ b/net/dccp/proto.c
@@ -0,0 +1,1257 @@
+/*
+ *  net/dccp/proto.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/checksum.h>
+
+#include <net/inet_sock.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+
+#include <asm/ioctls.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/poll.h>
+
+#include "ccid.h"
+#include "dccp.h"
+#include "feat.h"
+
+/* DCCP SNMP counters; set up by dccp_mib_init(), freed by dccp_mib_exit(). */
+DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+
+EXPORT_SYMBOL_GPL(dccp_statistics);
+
+/* Per-CPU counter; initialised in dccp_init(), destroyed in dccp_fini(). */
+struct percpu_counter dccp_orphan_count;
+EXPORT_SYMBOL_GPL(dccp_orphan_count);
+
+/* ehash/bhash tables and the bind-bucket cache; populated in dccp_init(). */
+struct inet_hashinfo dccp_hashinfo;
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+/*
+ * the maximum queue length for tx in packets. 0 is no limit.
+ * Copied into each socket's dccps_tx_qlen by dccp_init_sock().
+ */
+int sysctl_dccp_tx_qlen __read_mostly = 5;
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+/*
+ * Translate a DCCP socket state into a printable name for debug messages.
+ * Any @state at or above DCCP_MAX_STATES yields "INVALID STATE!".
+ */
+static const char *dccp_state_name(const int state)
+{
+	static const char *const names[] = {
+	[DCCP_OPEN]		= "OPEN",
+	[DCCP_REQUESTING]	= "REQUESTING",
+	[DCCP_PARTOPEN]		= "PARTOPEN",
+	[DCCP_LISTEN]		= "LISTEN",
+	[DCCP_RESPOND]		= "RESPOND",
+	[DCCP_CLOSING]		= "CLOSING",
+	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
+	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
+	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
+	[DCCP_TIME_WAIT]	= "TIME_WAIT",
+	[DCCP_CLOSED]		= "CLOSED",
+	};
+
+	return state >= DCCP_MAX_STATES ? "INVALID STATE!" :
+					  names[state];
+}
+#endif
+
+/*
+ * Move @sk to @state and keep the CURRESTAB / ESTABRESETS MIB counters in
+ * step with the transition.
+ *
+ * Entering DCCP_CLOSED additionally unhashes the socket and, unless the
+ * port is pinned by SOCK_BINDPORT_LOCK, releases its bound port.
+ * Requesting the state the socket is already in triggers a WARN_ON.
+ */
+void dccp_set_state(struct sock *sk, const int state)
+{
+	const int oldstate = sk->sk_state;
+
+	dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
+		      dccp_state_name(oldstate), dccp_state_name(state));
+	WARN_ON(state == oldstate);
+
+	switch (state) {
+	case DCCP_OPEN:
+		if (oldstate != DCCP_OPEN)
+			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+		/* Client retransmits all Confirm options until entering OPEN */
+		if (oldstate == DCCP_PARTOPEN)
+			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
+		break;
+
+	case DCCP_CLOSED:
+		if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
+		    oldstate == DCCP_CLOSING)
+			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+		sk->sk_prot->unhash(sk);
+		if (inet_csk(sk)->icsk_bind_hash != NULL &&
+		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+			inet_put_port(sk);
+		/* fall through */
+	default:
+		/* Leaving OPEN for any other state: one fewer established. */
+		if (oldstate == DCCP_OPEN)
+			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
+	}
+
+	/* Change state AFTER socket is unhashed to avoid closed
+	 * socket sitting in hash tables.
+	 */
+	sk->sk_state = state;
+}
+
+EXPORT_SYMBOL_GPL(dccp_set_state);
+
+/*
+ * Complete a close that was initiated by the peer. Does nothing unless
+ * @sk is in one of the two PASSIVE_CLOSE* states.
+ */
+static void dccp_finish_passive_close(struct sock *sk)
+{
+	const int state = sk->sk_state;
+
+	if (state == DCCP_PASSIVE_CLOSE) {
+		/* Node (client or server) has received Close packet. */
+		dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+		dccp_set_state(sk, DCCP_CLOSED);
+	} else if (state == DCCP_PASSIVE_CLOSEREQ) {
+		/*
+		 * Client received CloseReq. We set the `active' flag so that
+		 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
+		 */
+		dccp_send_close(sk, 1);
+		dccp_set_state(sk, DCCP_CLOSING);
+	}
+}
+
+/*
+ * Final step of a connection: mark @sk closed, stop its transmit timers
+ * and shut down both directions. A socket already flagged SOCK_DEAD is
+ * destroyed; otherwise its state-change callback is invoked.
+ */
+void dccp_done(struct sock *sk)
+{
+	dccp_set_state(sk, DCCP_CLOSED);
+	dccp_clear_xmit_timers(sk);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		inet_csk_destroy_sock(sk);
+		return;
+	}
+	sk->sk_state_change(sk);
+}
+
+EXPORT_SYMBOL_GPL(dccp_done);
+
+/*
+ * Translate a DCCP packet type into a printable name.
+ * Any @type at or above DCCP_NR_PKT_TYPES yields "INVALID".
+ */
+const char *dccp_packet_name(const int type)
+{
+	static const char *const names[] = {
+		[DCCP_PKT_REQUEST]  = "REQUEST",
+		[DCCP_PKT_RESPONSE] = "RESPONSE",
+		[DCCP_PKT_DATA]	    = "DATA",
+		[DCCP_PKT_ACK]	    = "ACK",
+		[DCCP_PKT_DATAACK]  = "DATAACK",
+		[DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
+		[DCCP_PKT_CLOSE]    = "CLOSE",
+		[DCCP_PKT_RESET]    = "RESET",
+		[DCCP_PKT_SYNC]	    = "SYNC",
+		[DCCP_PKT_SYNCACK]  = "SYNCACK",
+	};
+
+	return type >= DCCP_NR_PKT_TYPES ? "INVALID" :
+					   names[type];
+}
+
+EXPORT_SYMBOL_GPL(dccp_packet_name);
+
+/*
+ * Initialise the DCCP-specific state of a new socket.
+ *
+ * When @ctl_sock_initialized is zero (the control-socket case, going by
+ * the comment this function has always carried) feature-negotiation
+ * setup is skipped and 0 is returned; otherwise the result of
+ * dccp_feat_init() is returned.
+ */
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	/* generic and connection-socket defaults */
+	sk->sk_state		= DCCP_CLOSED;
+	sk->sk_write_space	= dccp_write_space;
+	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
+	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
+	icsk->icsk_sync_mss	= dccp_sync_mss;
+
+	/* DCCP defaults */
+	dp->dccps_mss_cache	= 536;
+	dp->dccps_rate_last	= jiffies;
+	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
+	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
+	dp->dccps_r_ack_ratio	= 1;
+	dp->dccps_l_ack_ratio	= 1;
+	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
+
+	dccp_init_xmit_timers(sk);
+
+	INIT_LIST_HEAD(&dp->dccps_featneg);
+	/* control socket doesn't need feat nego */
+	if (unlikely(!ctl_sock_initialized))
+		return 0;
+	return dccp_feat_init(sk);
+}
+
+EXPORT_SYMBOL_GPL(dccp_init_sock);
+
+/*
+ * Release everything DCCP attached to @sk: the pending retransmit skb,
+ * the bound port, the service list, the RX ack vector, both CCID
+ * instances and any outstanding feature-negotiation entries.
+ */
+void dccp_destroy_sock(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *pending = sk->sk_send_head;
+
+	/*
+	 * DCCP doesn't use sk_write_queue, just sk_send_head
+	 * for retransmissions
+	 */
+	if (pending != NULL) {
+		kfree_skb(pending);
+		sk->sk_send_head = NULL;
+	}
+
+	/* Clean up a referenced DCCP bind bucket. */
+	if (inet_csk(sk)->icsk_bind_hash != NULL)
+		inet_put_port(sk);
+
+	kfree(dp->dccps_service_list);
+	dp->dccps_service_list = NULL;
+
+	if (dp->dccps_hc_rx_ackvec != NULL) {
+		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+		dp->dccps_hc_rx_ackvec = NULL;
+	}
+
+	/* Delete both half-connection CCIDs before forgetting either. */
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_tx_ccid = NULL;
+	dp->dccps_hc_rx_ccid = NULL;
+
+	/* clean up feature negotiation state */
+	dccp_feat_list_purge(&dp->dccps_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_destroy_sock);
+
+/*
+ * Put @sk into the listening role. Listening is refused with -EPROTO when
+ * the feature-negotiation settings cannot be finalised; otherwise the
+ * result of inet_csk_listen_start() is returned.
+ */
+static inline int dccp_listen_start(struct sock *sk, int backlog)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	dp->dccps_role = DCCP_ROLE_LISTEN;
+
+	/* do not start to listen if feature negotiation setup fails */
+	return dccp_feat_finalise_settings(dp) ? -EPROTO :
+	       inet_csk_listen_start(sk, backlog);
+}
+
+/*
+ * Returns 1 when a socket in @state must send a Reset on abort, and 0 for
+ * the CLOSED, LISTEN and REQUESTING states.
+ */
+static inline int dccp_need_reset(int state)
+{
+	switch (state) {
+	case DCCP_CLOSED:
+	case DCCP_LISTEN:
+	case DCCP_REQUESTING:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+/*
+ * Abort whatever @sk is doing and return it to an unconnected state.
+ * @flags is not used.
+ *
+ * A listening socket stops listening; a socket for which
+ * dccp_need_reset() holds sends a Reset with code "Aborted". Timers, the
+ * receive/write queues, the pending retransmit skb, the destination port
+ * and the cached route are all cleared. Always returns 0.
+ */
+int dccp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int err = 0;
+	const int old_state = sk->sk_state;
+
+	if (old_state != DCCP_CLOSED)
+		dccp_set_state(sk, DCCP_CLOSED);
+
+	/*
+	 * This corresponds to the ABORT function of RFC793, sec. 3.8
+	 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
+	 */
+	if (old_state == DCCP_LISTEN) {
+		inet_csk_listen_stop(sk);
+	} else if (dccp_need_reset(old_state)) {
+		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+		sk->sk_err = ECONNRESET;
+	} else if (old_state == DCCP_REQUESTING)
+		sk->sk_err = ECONNRESET;
+
+	dccp_clear_xmit_timers(sk);
+
+	__skb_queue_purge(&sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_write_queue);
+	if (sk->sk_send_head != NULL) {
+		__kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+	}
+
+	inet->inet_dport = 0;
+
+	/* Keep the source address only if the user bound it explicitly. */
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
+
+	sk->sk_shutdown = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+
+	icsk->icsk_backoff = 0;
+	inet_csk_delack_init(sk);
+	__sk_dst_reset(sk);
+
+	/* A local port without a bind bucket would be inconsistent. */
+	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+	sk->sk_error_report(sk);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(dccp_disconnect);
+
+/*
+ *	Wait for a DCCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ *
+ *	Returns the POLL* event mask for @sock; a listening socket is
+ *	delegated to inet_csk_listen_poll().
+ */
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+		       poll_table *wait)
+{
+	unsigned int mask;
+	struct sock *sk = sock->sk;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	if (sk->sk_state == DCCP_LISTEN)
+		return inet_csk_listen_poll(sk);
+
+	/* Socket is not locked. We are protected from async events
+	   by poll logic, and correct handling of state changes
+	   made by other threads is impossible in any case.
+	 */
+
+	mask = 0;
+	if (sk->sk_err)
+		mask = POLLERR;
+
+	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+		mask |= POLLHUP;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+
+	/* Connected? */
+	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+		/* Anything charged to the receive buffer counts as readable. */
+		if (atomic_read(&sk->sk_rmem_alloc) > 0)
+			mask |= POLLIN | POLLRDNORM;
+
+		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+				mask |= POLLOUT | POLLWRNORM;
+			} else {  /* send SIGIO later */
+				set_bit(SOCK_ASYNC_NOSPACE,
+					&sk->sk_socket->flags);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+				/* Race breaker. If space is freed after
+				 * wspace test but before the flags are set,
+				 * IO signal will be lost.
+				 */
+				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+					mask |= POLLOUT | POLLWRNORM;
+			}
+		}
+	}
+	return mask;
+}
+
+EXPORT_SYMBOL_GPL(dccp_poll);
+
+/*
+ * DCCP-level ioctl handler. Only SIOCINQ is implemented: it stores the
+ * length of the first packet in the receive queue (0 if the queue is
+ * empty) at the user address @arg.
+ *
+ * Returns -ENOTCONN on a listening socket, -ENOIOCTLCMD for any other
+ * @cmd, or the result of put_user().
+ */
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	int rc = -ENOTCONN;
+
+	lock_sock(sk);
+
+	if (sk->sk_state != DCCP_LISTEN) {
+		if (cmd == SIOCINQ) {
+			struct sk_buff *head = skb_peek(&sk->sk_receive_queue);
+			/*
+			 * We will only return the amount of this packet since
+			 * that is all that will be read.
+			 */
+			unsigned long amount = head != NULL ? head->len : 0;
+
+			rc = put_user(amount, (int __user *)arg);
+		} else {
+			rc = -ENOIOCTLCMD;
+		}
+	}
+
+	release_sock(sk);
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(dccp_ioctl);
+
+/*
+ * Handle DCCP_SOCKOPT_SERVICE: install @service as the socket's service
+ * code and, when @optlen covers more than one code, the remaining codes
+ * from @optval as its service list (replacing any previous list).
+ *
+ * Returns 0 on success, -EINVAL for an invalid @service or an over-long
+ * option, -ENOMEM on allocation failure, and -EFAULT if the list cannot be
+ * copied in or contains DCCP_SERVICE_INVALID_VALUE.
+ */
+static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
+				   char __user *optval, unsigned int optlen)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_service_list *sl = NULL;
+
+	if (service == DCCP_SERVICE_INVALID_VALUE)
+		return -EINVAL;
+	if (optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
+		return -EINVAL;
+
+	if (optlen > sizeof(service)) {
+		int bad;
+
+		sl = kmalloc(optlen, GFP_KERNEL);
+		if (sl == NULL)
+			return -ENOMEM;
+
+		/* The first u32 of the option is @service itself. */
+		sl->dccpsl_nr = optlen / sizeof(u32) - 1;
+		bad = copy_from_user(sl->dccpsl_list,
+				     optval + sizeof(service),
+				     optlen - sizeof(service)) ||
+		      dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE);
+		if (bad) {
+			kfree(sl);
+			return -EFAULT;
+		}
+	}
+
+	lock_sock(sk);
+	dp->dccps_service = service;
+
+	kfree(dp->dccps_service_list);
+
+	dp->dccps_service_list = sl;
+	release_sock(sk);
+	return 0;
+}
+
+/*
+ * Set the minimum checksum coverage for the sending (@rx false) or the
+ * receiving (@rx true) direction.
+ *
+ * @cscov must lie in 0..15; 0 is accepted without changing anything.
+ * On success the requested value is recorded in dccps_pcrlen (@rx) or
+ * dccps_pcslen (!@rx).
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range @cscov, -ENOBUFS if
+ * the temporary preference list cannot be allocated, or the error
+ * returned by dccp_feat_register_sp().
+ */
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
+{
+	u8 *list, len;
+	int i, rc;
+
+	if (cscov < 0 || cscov > 15)
+		return -EINVAL;
+	/*
+	 * Populate a list of permissible values, in the range cscov...15. This
+	 * is necessary since feature negotiation of single values only works if
+	 * both sides incidentally choose the same value. Since the list starts
+	 * lowest-value first, negotiation will pick the smallest shared value.
+	 */
+	if (cscov == 0)
+		return 0;
+	len = 16 - cscov;
+
+	list = kmalloc(len, GFP_KERNEL);
+	if (list == NULL)
+		return -ENOBUFS;
+
+	/*
+	 * Leave cscov itself untouched: it is stored below. Incrementing it
+	 * here would always leave it at 16, outside the validated range.
+	 */
+	for (i = 0; i < len; i++)
+		list[i] = cscov + i;
+
+	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
+
+	if (rc == 0) {
+		if (rx)
+			dccp_sk(sk)->dccps_pcrlen = cscov;
+		else
+			dccp_sk(sk)->dccps_pcslen = cscov;
+	}
+	kfree(list);
+	return rc;
+}
+
+/*
+ * Handle the DCCP_SOCKOPT_{,TX_,RX_}CCID options: register the CCID
+ * preference list from @optval for the TX half, the RX half, or both,
+ * depending on @type.
+ *
+ * Returns 0 on success, -EINVAL if @optlen is outside
+ * 1..DCCP_FEAT_MAX_SP_VALS, the memdup_user() error, or the error
+ * returned by dccp_feat_register_sp().
+ */
+static int dccp_setsockopt_ccid(struct sock *sk, int type,
+				char __user *optval, unsigned int optlen)
+{
+	const bool want_tx = type == DCCP_SOCKOPT_TX_CCID ||
+			     type == DCCP_SOCKOPT_CCID;
+	const bool want_rx = type == DCCP_SOCKOPT_RX_CCID ||
+			     type == DCCP_SOCKOPT_CCID;
+	int rc = 0;
+	u8 *prefs;
+
+	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+		return -EINVAL;
+
+	prefs = memdup_user(optval, optlen);
+	if (IS_ERR(prefs))
+		return PTR_ERR(prefs);
+
+	lock_sock(sk);
+	if (want_tx)
+		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, prefs, optlen);
+	/* Only touch the RX half if the TX half (when requested) worked. */
+	if (rc == 0 && want_rx)
+		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, prefs, optlen);
+	release_sock(sk);
+
+	kfree(prefs);
+	return rc;
+}
+
+/*
+ * Dispatch a SOL_DCCP setsockopt request. @level is not examined here.
+ *
+ * Deprecated options only log a warning and succeed; the CCID options
+ * and DCCP_SOCKOPT_SERVICE are handed to their own helpers. All remaining
+ * options take an int read from @optval and are applied under the socket
+ * lock.
+ *
+ * Returns 0 on success, -EINVAL if @optlen is smaller than an int or the
+ * value is out of range, -EFAULT if @optval cannot be read, -ENOPROTOOPT
+ * for an unknown @optname, or an option-specific error.
+ */
+static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
+		char __user *optval, unsigned int optlen)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	int val, err = 0;
+
+	/* Options that do not take a plain int argument. */
+	switch (optname) {
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+		return 0;
+	case DCCP_SOCKOPT_CHANGE_L:
+	case DCCP_SOCKOPT_CHANGE_R:
+		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+		return 0;
+	case DCCP_SOCKOPT_CCID:
+	case DCCP_SOCKOPT_RX_CCID:
+	case DCCP_SOCKOPT_TX_CCID:
+		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
+	}
+
+	if (optlen < (int)sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	/* Takes the socket lock itself, so dispatch before lock_sock(). */
+	if (optname == DCCP_SOCKOPT_SERVICE)
+		return dccp_setsockopt_service(sk, val, optval, optlen);
+
+	lock_sock(sk);
+	switch (optname) {
+	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+		if (dp->dccps_role != DCCP_ROLE_SERVER)
+			err = -EOPNOTSUPP;
+		else
+			dp->dccps_server_timewait = (val != 0);
+		break;
+	case DCCP_SOCKOPT_SEND_CSCOV:
+		err = dccp_setsockopt_cscov(sk, val, false);
+		break;
+	case DCCP_SOCKOPT_RECV_CSCOV:
+		err = dccp_setsockopt_cscov(sk, val, true);
+		break;
+	case DCCP_SOCKOPT_QPOLICY_ID:
+		/* The queueing policy may only be chosen on a closed socket. */
+		if (sk->sk_state != DCCP_CLOSED)
+			err = -EISCONN;
+		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+			err = -EINVAL;
+		else
+			dp->dccps_qpolicy = val;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+		if (val < 0)
+			err = -EINVAL;
+		else
+			dp->dccps_tx_qlen = val;
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * setsockopt entry point: SOL_DCCP requests are handled locally, every
+ * other level is forwarded to the address-family operations.
+ */
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_DCCP)
+		return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname,
+						     optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(dccp_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat setsockopt entry point: SOL_DCCP requests are handled locally,
+ * every other level goes to inet_csk_compat_setsockopt().
+ */
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+			   char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_DCCP)
+		return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk_compat_setsockopt(sk, level, optname,
+					  optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
+#endif
+
+/*
+ * Handle getsockopt(DCCP_SOCKOPT_SERVICE): write the socket's service
+ * code, followed by its service list if one is set, to @optval and the
+ * total size in bytes to @optlen.
+ *
+ * Returns 0 on success, -EINVAL if the user buffer (@len bytes) is too
+ * small, or -EFAULT if user memory cannot be written.
+ */
+static int dccp_getsockopt_service(struct sock *sk, int len,
+				   __be32 __user *optval,
+				   int __user *optlen)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	const struct dccp_service_list *sl;
+	int slen = 0, total_len = sizeof(u32);
+	int err;
+
+	lock_sock(sk);
+
+	sl = dp->dccps_service_list;
+	if (sl != NULL) {
+		slen = sl->dccpsl_nr * sizeof(u32);
+		total_len += slen;
+	}
+
+	if (total_len > len) {
+		err = -EINVAL;
+	} else if (put_user(total_len, optlen) ||
+		   put_user(dp->dccps_service, optval) ||
+		   (sl != NULL &&
+		    copy_to_user(optval + 1, sl->dccpsl_list, slen))) {
+		err = -EFAULT;
+	} else {
+		err = 0;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Dispatch a SOL_DCCP getsockopt request. @level is not examined here.
+ *
+ * Most options return a single int through @optval and set *@optlen to
+ * sizeof(int). DCCP_SOCKOPT_SERVICE and DCCP_SOCKOPT_AVAILABLE_CCIDS have
+ * their own helpers, and option names 128..191 / 192..255 are forwarded
+ * to the current RX / TX CCID respectively.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed,
+ * -EINVAL if the supplied length is smaller than an int, -ENOPROTOOPT for
+ * an unknown @optname or an unavailable CCID, or a helper's error.
+ */
+static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	struct dccp_sock *dp;
+	int val, len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < (int)sizeof(int))
+		return -EINVAL;
+
+	dp = dccp_sk(sk);
+
+	switch (optname) {
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		/* Deprecated: succeeds without writing anything back. */
+		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+		return 0;
+	case DCCP_SOCKOPT_SERVICE:
+		return dccp_getsockopt_service(sk, len,
+					       (__be32 __user *)optval, optlen);
+	case DCCP_SOCKOPT_GET_CUR_MPS:
+		val = dp->dccps_mss_cache;
+		break;
+	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
+	case DCCP_SOCKOPT_TX_CCID:
+		val = ccid_get_current_tx_ccid(dp);
+		if (val < 0)
+			return -ENOPROTOOPT;
+		break;
+	case DCCP_SOCKOPT_RX_CCID:
+		val = ccid_get_current_rx_ccid(dp);
+		if (val < 0)
+			return -ENOPROTOOPT;
+		break;
+	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+		val = dp->dccps_server_timewait;
+		break;
+	case DCCP_SOCKOPT_SEND_CSCOV:
+		val = dp->dccps_pcslen;
+		break;
+	case DCCP_SOCKOPT_RECV_CSCOV:
+		val = dp->dccps_pcrlen;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_ID:
+		val = dp->dccps_qpolicy;
+		break;
+	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+		val = dp->dccps_tx_qlen;
+		break;
+	case 128 ... 191:
+		/* option range passed through to the RX CCID */
+		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
+					     len, (u32 __user *)optval, optlen);
+	case 192 ... 255:
+		/* option range passed through to the TX CCID */
+		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
+					     len, (u32 __user *)optval, optlen);
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	/* Common exit for all options that produce a single int in val. */
+	len = sizeof(val);
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * getsockopt entry point: SOL_DCCP requests are handled locally, every
+ * other level is forwarded to the address-family operations.
+ */
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	if (level == SOL_DCCP)
+		return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname,
+						     optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(dccp_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat getsockopt entry point: SOL_DCCP requests are handled locally,
+ * every other level goes to inet_csk_compat_getsockopt().
+ */
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+			   char __user *optval, int __user *optlen)
+{
+	if (level == SOL_DCCP)
+		return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk_compat_getsockopt(sk, level, optname,
+					  optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
+#endif
+
+/*
+ * Walk the control messages attached to @msg and apply the SOL_DCCP ones
+ * to @skb. Messages for other levels are skipped.
+ *
+ * Returns 0 on success and -EINVAL for a malformed control message, a
+ * qpolicy parameter the socket's policy does not accept, a wrongly sized
+ * priority value, or an unknown SOL_DCCP message type.
+ */
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+	/*
+	 * Assign an (opaque) qpolicy priority value to skb->priority.
+	 *
+	 * We are overloading this skb field for use with the qpolicy subsystem.
+	 * The skb->priority is normally used for the SO_PRIORITY option, which
+	 * is initialised from sk_priority. Since the assignment of sk_priority
+	 * to skb->priority happens later (on layer 3), we overload this field
+	 * for use with queueing priorities as long as the skb is on layer 4.
+	 * The default priority value (if nothing is set) is 0.
+	 */
+	skb->priority = 0;
+
+	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_DCCP)
+			continue;
+
+		/* Types up to DCCP_SCM_QPOLICY_MAX are qpolicy parameters. */
+		if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+		    !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+			return -EINVAL;
+
+		switch (cmsg->cmsg_type) {
+		case DCCP_SCM_PRIORITY:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+				return -EINVAL;
+			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Queue one datagram of @len bytes from @msg for transmission on @sk.
+ * @iocb is not used.
+ *
+ * Returns @len on success, or a negative errno: -EMSGSIZE if @len exceeds
+ * the cached MSS, -EAGAIN if the queueing policy reports the TX queue as
+ * full, or the error from waiting for the connection, allocating the
+ * skb, copying the payload or parsing the control messages.
+ */
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	const int flags = msg->msg_flags;
+	const int noblock = flags & MSG_DONTWAIT;
+	struct sk_buff *skb;
+	int rc, size;
+	long timeo;
+
+	if (len > dp->dccps_mss_cache)
+		return -EMSGSIZE;
+
+	lock_sock(sk);
+
+	if (dccp_qpolicy_full(sk)) {
+		rc = -EAGAIN;
+		goto out_release;
+	}
+
+	timeo = sock_sndtimeo(sk, noblock);
+
+	/*
+	 * We have to use sk_stream_wait_connect here to set sk_write_pending,
+	 * so that the trick in dccp_rcv_request_sent_state_process works.
+	 */
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+		if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_release;
+
+	size = sk->sk_prot->max_header + len;
+	/* The socket lock is dropped around the skb allocation. */
+	release_sock(sk);
+	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+	lock_sock(sk);
+	if (skb == NULL)
+		goto out_release;
+
+	skb_reserve(skb, sk->sk_prot->max_header);
+	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (rc != 0)
+		goto out_discard;
+
+	rc = dccp_msghdr_parse(msg, skb);
+	if (rc != 0)
+		goto out_discard;
+
+	dccp_qpolicy_push(sk, skb);
+	/*
+	 * The xmit_timer is set if the TX CCID is rate-based and will expire
+	 * when congestion control permits to release further packets into the
+	 * network. Window-based CCIDs do not use this timer.
+	 */
+	if (!timer_pending(&dp->dccps_xmit_timer))
+		dccp_write_xmit(sk);
+out_release:
+	release_sock(sk);
+	/* rc is 0 on the success path, so @len is returned there. */
+	return rc ? : len;
+out_discard:
+	kfree_skb(skb);
+	goto out_release;
+}
+
+EXPORT_SYMBOL_GPL(dccp_sendmsg);
+
+/*
+ * Receive one datagram from @sk into @msg. @iocb and @addr_len are not
+ * used.
+ *
+ * Only the packet at the head of the receive queue is considered. Data
+ * and DataAck packets are copied out (truncated to @len, with MSG_TRUNC
+ * set in msg->msg_flags if the packet was longer); Close, CloseReq and
+ * Reset packets end the stream; any other packet type is discarded.
+ *
+ * Returns the number of bytes copied (or the full packet length when
+ * MSG_TRUNC is set in @flags), 0 at end of stream or after receive
+ * shutdown, or a negative errno such as -ENOTCONN, -EAGAIN or -EFAULT.
+ */
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len, int nonblock, int flags, int *addr_len)
+{
+	const struct dccp_hdr *dh;
+	long timeo;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == DCCP_LISTEN) {
+		len = -ENOTCONN;
+		goto out;
+	}
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	do {
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+		if (skb == NULL)
+			goto verify_sock_status;
+
+		dh = dccp_hdr(skb);
+
+		switch (dh->dccph_type) {
+		case DCCP_PKT_DATA:
+		case DCCP_PKT_DATAACK:
+			goto found_ok_skb;
+
+		case DCCP_PKT_CLOSE:
+		case DCCP_PKT_CLOSEREQ:
+			/* A peeking reader must not advance the close. */
+			if (!(flags & MSG_PEEK))
+				dccp_finish_passive_close(sk);
+			/* fall through */
+		case DCCP_PKT_RESET:
+			dccp_pr_debug("found fin (%s) ok!\n",
+				      dccp_packet_name(dh->dccph_type));
+			len = 0;
+			goto found_fin_ok;
+		default:
+			dccp_pr_debug("packet_type=%s\n",
+				      dccp_packet_name(dh->dccph_type));
+			sk_eat_skb(sk, skb, 0);
+		}
+verify_sock_status:
+		/* Nothing usable queued: decide whether to give up or wait. */
+		if (sock_flag(sk, SOCK_DONE)) {
+			len = 0;
+			break;
+		}
+
+		if (sk->sk_err) {
+			len = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			len = 0;
+			break;
+		}
+
+		if (sk->sk_state == DCCP_CLOSED) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				len = -ENOTCONN;
+				break;
+			}
+			len = 0;
+			break;
+		}
+
+		if (!timeo) {
+			len = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			len = sock_intr_errno(timeo);
+			break;
+		}
+
+		sk_wait_data(sk, &timeo);
+		continue;
+	found_ok_skb:
+		if (len > skb->len)
+			len = skb->len;
+		else if (len < skb->len)
+			msg->msg_flags |= MSG_TRUNC;
+
+		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+			/* Exception. Bailout! */
+			len = -EFAULT;
+			break;
+		}
+		/* With MSG_TRUNC requested, report the real packet length. */
+		if (flags & MSG_TRUNC)
+			len = skb->len;
+	found_fin_ok:
+		if (!(flags & MSG_PEEK))
+			sk_eat_skb(sk, skb, 0);
+		break;
+	} while (1);
+out:
+	release_sock(sk);
+	return len;
+}
+
+EXPORT_SYMBOL_GPL(dccp_recvmsg);
+
+/*
+ * listen() implementation for DCCP sockets.
+ *
+ * A closed socket is moved into the listening state; a socket that is
+ * already listening only gets its backlog limit updated.
+ *
+ * Returns 0 on success, -EINVAL if @sock is not an unconnected SOCK_DCCP
+ * socket in the CLOSED or LISTEN state, or the error returned by
+ * dccp_listen_start().
+ */
+int inet_dccp_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = -EINVAL;
+
+	lock_sock(sk);
+
+	if (sock->state == SS_UNCONNECTED && sock->type == SOCK_DCCP) {
+		const unsigned char old_state = sk->sk_state;
+
+		if ((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)) {
+			/* Really, if the socket is already in listen state
+			 * we can only allow the backlog to be adjusted.
+			 *
+			 * FIXME: here it probably should be
+			 * sk->sk_prot->listen_start, see tcp_listen_start
+			 */
+			if (old_state == DCCP_LISTEN)
+				err = 0;
+			else
+				err = dccp_listen_start(sk, backlog);
+
+			if (!err)
+				sk->sk_max_ack_backlog = backlog;
+		}
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(inet_dccp_listen);
+
+/*
+ * Start the locally initiated shutdown of @sk according to its state:
+ *  - PASSIVE_CLOSE / PASSIVE_CLOSEREQ: finish the peer-initiated close;
+ *  - PARTOPEN / OPEN: send a Close and move to ACTIVE_CLOSEREQ (server
+ *    without server_timewait) or CLOSING (everyone else);
+ *  - any other state: go straight to CLOSED.
+ */
+static void dccp_terminate_connection(struct sock *sk)
+{
+	u8 next_state = DCCP_CLOSED;
+
+	switch (sk->sk_state) {
+	case DCCP_PASSIVE_CLOSE:
+	case DCCP_PASSIVE_CLOSEREQ:
+		dccp_finish_passive_close(sk);
+		break;
+	case DCCP_PARTOPEN:
+		dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+		/* fall through */
+	case DCCP_OPEN:
+		dccp_send_close(sk, 1);
+
+		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
+		    !dccp_sk(sk)->dccps_server_timewait)
+			next_state = DCCP_ACTIVE_CLOSEREQ;
+		else
+			next_state = DCCP_CLOSING;
+		/* fall through */
+	default:
+		/* next_state is still DCCP_CLOSED unless set just above. */
+		dccp_set_state(sk, next_state);
+	}
+}
+
+/*
+ * close() implementation: shut @sk down in both directions, discard
+ * unread receive data, terminate or abort the connection, then orphan the
+ * socket. @timeout bounds the waits for the TX queue flush and for the
+ * close to complete.
+ *
+ * Unread data or a zero linger time causes an abortive close; otherwise
+ * the normal termination handshake is started. The socket is destroyed
+ * here only if it is already in DCCP_CLOSED when the BH lock is taken.
+ */
+void dccp_close(struct sock *sk, long timeout)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	u32 data_was_unread = 0;
+	int state;
+
+	lock_sock(sk);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sk->sk_state == DCCP_LISTEN) {
+		dccp_set_state(sk, DCCP_CLOSED);
+
+		/* Special case. */
+		inet_csk_listen_stop(sk);
+
+		goto adjudge_to_death;
+	}
+
+	sk_stop_timer(sk, &dp->dccps_xmit_timer);
+
+	/*
+	 * We need to flush the recv. buffs.  We do this only on the
+	 * descriptor close, not protocol-sourced closes, because the
+	 * reader process may not have drained the data yet!
+	 */
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		data_was_unread += skb->len;
+		__kfree_skb(skb);
+	}
+
+	if (data_was_unread) {
+		/* Unread data was tossed, send an appropriate Reset Code */
+		DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
+		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+		dccp_set_state(sk, DCCP_CLOSED);
+	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->sk_prot->disconnect(sk, 0);
+	} else if (sk->sk_state != DCCP_CLOSED) {
+		/*
+		 * Normal connection termination. May need to wait if there are
+		 * still packets in the TX queue that are delayed by the CCID.
+		 */
+		dccp_flush_write_queue(sk, &timeout);
+		dccp_terminate_connection(sk);
+	}
+
+	/*
+	 * Flush write queue. This may be necessary in several cases:
+	 * - we have been closed by the peer but still have application data;
+	 * - abortive termination (unread data or zero linger time),
+	 * - normal termination but queue could not be flushed within time limit
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+
+	sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+	/* Remember the state before the lock is dropped below. */
+	state = sk->sk_state;
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	/*
+	 * It is the last release_sock in its life. It will remove backlog.
+	 */
+	release_sock(sk);
+	/*
+	 * Now socket is owned by kernel and we acquire BH lock
+	 * to finish close. No need to check for user refs.
+	 */
+	local_bh_disable();
+	bh_lock_sock(sk);
+	WARN_ON(sock_owned_by_user(sk));
+
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+
+	/* Have we already been destroyed by a softirq or backlog? */
+	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
+		goto out;
+
+	if (sk->sk_state == DCCP_CLOSED)
+		inet_csk_destroy_sock(sk);
+
+	/* Otherwise, socket is reprieved until protocol close. */
+
+out:
+	bh_unlock_sock(sk);
+	local_bh_enable();
+	/* Drops the reference taken by sock_hold() above. */
+	sock_put(sk);
+}
+
+EXPORT_SYMBOL_GPL(dccp_close);
+
+/*
+ * shutdown() hook. Currently a stub: it only emits a debug message with
+ * the requested @how and leaves @sk untouched.
+ */
+void dccp_shutdown(struct sock *sk, int how)
+{
+	dccp_pr_debug("called shutdown(%x)\n", how);
+}
+
+EXPORT_SYMBOL_GPL(dccp_shutdown);
+
+/*
+ * Set up the dccp_statistics counters, sized and aligned for
+ * struct dccp_mib. Returns the result of snmp_mib_init().
+ */
+static inline int dccp_mib_init(void)
+{
+	return snmp_mib_init((void __percpu **)dccp_statistics,
+			     sizeof(struct dccp_mib),
+			     __alignof__(struct dccp_mib));
+}
+
+/* Release the dccp_statistics counters set up by dccp_mib_init(). */
+static inline void dccp_mib_exit(void)
+{
+	snmp_mib_free((void __percpu **)dccp_statistics);
+}
+
+/*
+ * Optional override for the size of the established hash table; when
+ * non-zero, dccp_init() derives its allocation goal from this value
+ * instead of from the amount of RAM.
+ */
+static int thash_entries;
+module_param(thash_entries, int, 0444);
+MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+/* Debug-message switch, exposed as a module parameter (mode 0644). */
+int dccp_debug;
+module_param(dccp_debug, bool, 0644);
+MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+
+EXPORT_SYMBOL_GPL(dccp_debug);
+#endif
+
+/*
+ * Module initialisation: set up the orphan counter, the bind-bucket
+ * cache, the established (ehash) and bind (bhash) hash tables, the MIB
+ * counters, ack vectors, sysctls and the built-in CCIDs.
+ *
+ * Returns 0 on success. On failure everything set up so far is undone in
+ * reverse order through the out_* labels and a negative errno is
+ * returned (-ENOBUFS for the cache and hash-table allocations).
+ */
+static int __init dccp_init(void)
+{
+	unsigned long goal;
+	int ehash_order, bhash_order, i;
+	int rc;
+
+	/* The per-packet control block must fit into skb->cb. */
+	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
+		     FIELD_SIZEOF(struct sk_buff, cb));
+	rc = percpu_counter_init(&dccp_orphan_count, 0);
+	if (rc)
+		goto out_fail;
+	rc = -ENOBUFS;
+	inet_hashinfo_init(&dccp_hashinfo);
+	dccp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("dccp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN, NULL);
+	if (!dccp_hashinfo.bind_bucket_cachep)
+		goto out_free_percpu;
+
+	/*
+	 * Size and allocate the main established and bind bucket
+	 * hash tables.
+	 *
+	 * The methodology is similar to that of the buffer cache.
+	 */
+	if (totalram_pages >= (128 * 1024))
+		goal = totalram_pages >> (21 - PAGE_SHIFT);
+	else
+		goal = totalram_pages >> (23 - PAGE_SHIFT);
+
+	if (thash_entries)
+		goal = (thash_entries *
+			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
+	/* Smallest page order whose page count reaches the goal. */
+	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
+		;
+	/* Try that order first and fall back to smaller ones on failure. */
+	do {
+		unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
+					sizeof(struct inet_ehash_bucket);
+
+		/* Round down to a power of two so that a mask can be used. */
+		while (hash_size & (hash_size - 1))
+			hash_size--;
+		dccp_hashinfo.ehash_mask = hash_size - 1;
+		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
+			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
+	} while (!dccp_hashinfo.ehash && --ehash_order > 0);
+
+	if (!dccp_hashinfo.ehash) {
+		DCCP_CRIT("Failed to allocate DCCP established hash table");
+		goto out_free_bind_bucket_cachep;
+	}
+
+	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
+		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
+	}
+
+	if (inet_ehash_locks_alloc(&dccp_hashinfo))
+			goto out_free_dccp_ehash;
+
+	bhash_order = ehash_order;
+
+	do {
+		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
+					sizeof(struct inet_bind_hashbucket);
+		/* Skip orders that would give more than 64k bind buckets. */
+		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
+		    bhash_order > 0)
+			continue;
+		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
+	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);
+
+	if (!dccp_hashinfo.bhash) {
+		DCCP_CRIT("Failed to allocate DCCP bind hash table");
+		goto out_free_dccp_locks;
+	}
+
+	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+	}
+
+	rc = dccp_mib_init();
+	if (rc)
+		goto out_free_dccp_bhash;
+
+	rc = dccp_ackvec_init();
+	if (rc)
+		goto out_free_dccp_mib;
+
+	rc = dccp_sysctl_init();
+	if (rc)
+		goto out_ackvec_exit;
+
+	rc = ccid_initialize_builtins();
+	if (rc)
+		goto out_sysctl_exit;
+
+	dccp_timestamping_init();
+
+	return 0;
+
+	/* Error unwinding: each label undoes the step set up just before it. */
+out_sysctl_exit:
+	dccp_sysctl_exit();
+out_ackvec_exit:
+	dccp_ackvec_exit();
+out_free_dccp_mib:
+	dccp_mib_exit();
+out_free_dccp_bhash:
+	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+out_free_dccp_locks:
+	inet_ehash_locks_free(&dccp_hashinfo);
+out_free_dccp_ehash:
+	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+out_free_bind_bucket_cachep:
+	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+out_free_percpu:
+	percpu_counter_destroy(&dccp_orphan_count);
+out_fail:
+	/* Leave no stale pointers behind in the exported hashinfo. */
+	dccp_hashinfo.bhash = NULL;
+	dccp_hashinfo.ehash = NULL;
+	dccp_hashinfo.bind_bucket_cachep = NULL;
+	return rc;
+}
+
+/*
+ * Module unload: release everything dccp_init() set up. The page orders
+ * passed to free_pages() are recomputed from the stored table sizes
+ * (bhash_size, ehash_mask + 1) rather than remembered from allocation.
+ */
+static void __exit dccp_fini(void)
+{
+	ccid_cleanup_builtins();
+	dccp_mib_exit();
+	free_pages((unsigned long)dccp_hashinfo.bhash,
+		   get_order(dccp_hashinfo.bhash_size *
+			     sizeof(struct inet_bind_hashbucket)));
+	free_pages((unsigned long)dccp_hashinfo.ehash,
+		   get_order((dccp_hashinfo.ehash_mask + 1) *
+			     sizeof(struct inet_ehash_bucket)));
+	inet_ehash_locks_free(&dccp_hashinfo);
+	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+	dccp_ackvec_exit();
+	dccp_sysctl_exit();
+	percpu_counter_destroy(&dccp_orphan_count);
+}
+
+module_init(dccp_init);
+module_exit(dccp_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 00000000..63c30bfa
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ *  net/dccp/qpolicy.c
+ *
+ *  Policy-based packet dequeueing interface for DCCP.
+ *
+ *  Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License v2
+ *  as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ *	Simple Dequeueing Policy:
+ *	If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+	skb_queue_tail(&sk->sk_write_queue, skb);	/* append at the tail */
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_tx_qlen &&	/* tx_qlen == 0: never full */
+	       sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+	return skb_peek(&sk->sk_write_queue);	/* queue head, left in place */
+}
+
+/*
+ *	Priority-based Dequeueing Policy:
+ *	If tx_qlen is different from 0 and the queue has reached its upper bound
+ *	of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+	struct sk_buff *cur, *top = NULL;
+
+	skb_queue_walk(&sk->sk_write_queue, cur)
+		if (!top || top->priority < cur->priority)
+			top = cur;	/* strictly higher: ties keep the first */
+	return top;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+	struct sk_buff *cur, *low = NULL;
+
+	skb_queue_walk(&sk->sk_write_queue, cur)
+		if (!low || low->priority > cur->priority)
+			low = cur;	/* strictly lower: ties keep the first */
+	return low;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+	if (qpolicy_simple_full(sk))	/* at the tx_qlen limit: evict one */
+		dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+	return false;			/* this policy never reports "full" */
+}
+
+/**
+ * struct dccp_qpolicy_operations  -  TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top:  peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+	void		(*push)	(struct sock *sk, struct sk_buff *skb);
+	bool		(*full) (struct sock *sk);
+	struct sk_buff*	(*top)  (struct sock *sk);
+	__be32		params;	/* parameter bits this policy accepts */
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+	[DCCPQ_POLICY_SIMPLE] = {	/* takes no per-packet parameters */
+		.push   = qpolicy_simple_push,
+		.full   = qpolicy_simple_full,
+		.top    = qpolicy_simple_top,
+		.params = 0,
+	},
+	[DCCPQ_POLICY_PRIO] = {		/* dequeues highest skb->priority first */
+		.push   = qpolicy_simple_push,
+		.full   = qpolicy_prio_full,
+		.top    = qpolicy_prio_best_skb,
+		.params = DCCP_SCM_PRIORITY,
+	},
+};
+
+/*
+ *	Externally visible interface: dispatches on the socket's dccps_qpolicy
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+	qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)	/* true: no more packets admitted */
+{
+	return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb == NULL)	/* nothing to drop */
+		return;
+	skb_unlink(skb, &sk->sk_write_queue);
+	kfree_skb(skb);
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)	/* peek, no dequeue */
+{
+	return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+	struct sk_buff *head = dccp_qpolicy_top(sk);
+
+	if (head == NULL)	/* the policy has nothing to hand out */
+		return NULL;
+
+	head->priority = 0;	/* wipe the field the policy used internally */
+	skb_unlink(head, &sk->sk_write_queue);
+	return head;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+	const __be32 allowed = qpol_table[dccp_sk(sk)->dccps_qpolicy].params;
+
+	/* valid iff exactly one bit is set and the policy accepts that bit */
+	return param && !(param & (param - 1)) && (allowed & param) == param;
+}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
new file mode 100644
index 00000000..42348824
--- /dev/null
+++ b/net/dccp/sysctl.c
@@ -0,0 +1,124 @@
+/*
+ *  net/dccp/sysctl.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License v2
+ *	as published by the Free Software Foundation.
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include "dccp.h"
+#include "feat.h"
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+/* Boundary values, referenced through .extra1/.extra2 of the entries below */
+static int		zero     = 0,
+			u8_max   = 0xFF;
+static unsigned long	seqw_min = DCCPF_SEQ_WMIN,
+			seqw_max = 0xFFFFFFFF;		/* maximum on 32 bit */
+
+static struct ctl_table dccp_default_table[] = {
+	{	/* unsigned long, bounded by seqw_min and seqw_max */
+		.procname	= "seq_window",
+		.data		= &sysctl_dccp_sequence_window,
+		.maxlen		= sizeof(sysctl_dccp_sequence_window),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+		.extra1		= &seqw_min,		/* RFC 4340, 7.5.2 */
+		.extra2		= &seqw_max,
+	},
+	{	/* int, bounded by 0 and 0xFF */
+		.procname	= "rx_ccid",
+		.data		= &sysctl_dccp_rx_ccid,
+		.maxlen		= sizeof(sysctl_dccp_rx_ccid),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,		/* RFC 4340, 10. */
+	},
+	{	/* int, bounded by 0 and 0xFF */
+		.procname	= "tx_ccid",
+		.data		= &sysctl_dccp_tx_ccid,
+		.maxlen		= sizeof(sysctl_dccp_tx_ccid),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,		/* RFC 4340, 10. */
+	},
+	{	/* int, bounded by 0 and 0xFF */
+		.procname	= "request_retries",
+		.data		= &sysctl_dccp_request_retries,
+		.maxlen		= sizeof(sysctl_dccp_request_retries),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,
+	},
+	{	/* int, bounded by 0 and 0xFF */
+		.procname	= "retries1",
+		.data		= &sysctl_dccp_retries1,
+		.maxlen		= sizeof(sysctl_dccp_retries1),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,
+	},
+	{	/* int, bounded by 0 and 0xFF */
+		.procname	= "retries2",
+		.data		= &sysctl_dccp_retries2,
+		.maxlen		= sizeof(sysctl_dccp_retries2),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &u8_max,
+	},
+	{	/* int with a lower bound of 0 and no upper bound set */
+		.procname	= "tx_qlen",
+		.data		= &sysctl_dccp_tx_qlen,
+		.maxlen		= sizeof(sysctl_dccp_tx_qlen),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{	/* no bounds; handler converts between ms and jiffies */
+		.procname	= "sync_ratelimit",
+		.data		= &sysctl_dccp_sync_ratelimit,
+		.maxlen		= sizeof(sysctl_dccp_sync_ratelimit),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+
+	{ }	/* empty entry terminates the table */
+};
+
+static struct ctl_path dccp_path[] = {	/* i.e. net/dccp/default */
+	{ .procname = "net", },
+	{ .procname = "dccp", },
+	{ .procname = "default", },
+	{ }	/* empty entry terminates the path */
+};
+
+static struct ctl_table_header *dccp_table_header;	/* NULL if unset */
+
+int __init dccp_sysctl_init(void)
+{
+	dccp_table_header = register_sysctl_paths(dccp_path,
+						  dccp_default_table);
+	/* A NULL header means the registration did not succeed. */
+	return dccp_table_header ? 0 : -ENOMEM;
+}
+
+void dccp_sysctl_exit(void)
+{
+	if (dccp_table_header == NULL)	/* never registered, or already gone */
+		return;
+	unregister_sysctl_table(dccp_table_header);
+	dccp_table_header = NULL;	/* makes a repeated call a no-op */
+}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
new file mode 100644
index 00000000..7587870b
--- /dev/null
+++ b/net/dccp/timer.c
@@ -0,0 +1,292 @@
+/*
+ *  net/dccp/timer.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include "dccp.h"
+
+/* sysctl limits on retransmission attempts; defaults are TCP's constants */
+int  sysctl_dccp_request_retries	__read_mostly = TCP_SYN_RETRIES;
+int  sysctl_dccp_retries1		__read_mostly = TCP_RETR1;
+int  sysctl_dccp_retries2		__read_mostly = TCP_RETR2;
+
+static void dccp_write_err(struct sock *sk)
+{
+	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;	/* soft error, if any */
+	sk->sk_error_report(sk);
+
+	dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+	dccp_done(sk);
+	DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
+}
+
+/* Write timeout: returns 1 if the connection was aborted, 0 otherwise. */
+static int dccp_write_timeout(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	int retry_until;
+
+	if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
+		if (icsk->icsk_retransmits != 0)
+			dst_negative_advice(sk);
+		retry_until = icsk->icsk_syn_retries ?
+			    : sysctl_dccp_request_retries; /* sysctl if unset */
+	} else {
+		if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
+			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
+			   black hole detection. :-(
+
+			   It is place to make it. It is not made. I do not want
+			   to make it. It is disgusting. It does not work in any
+			   case. Let me to cite the same draft, which requires for
+			   us to implement this:
+
+   "The one security concern raised by this memo is that ICMP black holes
+   are often caused by over-zealous security administrators who block
+   all ICMP messages.  It is vitally important that those who design and
+   deploy security systems understand the impact of strict filtering on
+   upper-layer protocols.  The safest web site in the world is worthless
+   if most TCP implementations cannot transfer data from it.  It would
+   be far nicer to have all of the black holes fixed rather than fixing
+   all of the TCP implementations."
+
+			   Golden words :-).
+		   */
+
+			dst_negative_advice(sk);
+		}
+
+		retry_until = sysctl_dccp_retries2;
+		/*
+		 * FIXME: see tcp_write_timeout and tcp_out_of_resources
+		 */
+	}
+
+	if (icsk->icsk_retransmits >= retry_until) {
+		/* Has it gone just too far? */
+		dccp_write_err(sk);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ *	The DCCP retransmit timer.
+ */
+static void dccp_retransmit_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/*
+	 * More than 4MSL (8 minutes) has passed, a RESET(aborted) was
+	 * sent, no need to retransmit, this sock is dead.
+	 */
+	if (dccp_write_timeout(sk))
+		return;
+
+	/*
+	 * We want to know the number of packets retransmitted, not the
+	 * total number of retransmissions of clones of original packets.
+	 */
+	if (icsk->icsk_retransmits == 0)
+		DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
+
+	if (dccp_retransmit_skb(sk) != 0) {
+		/*
+		 * Retransmission failed because of local congestion,
+		 * do not backoff.
+		 */
+		if (--icsk->icsk_retransmits == 0)
+			icsk->icsk_retransmits = 1;	/* never back to zero */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  min(icsk->icsk_rto,
+					      TCP_RESOURCE_PROBE_INTERVAL),
+					  DCCP_RTO_MAX);
+		return;
+	}
+
+	icsk->icsk_backoff++;
+	/* Double the RTO (capped at DCCP_RTO_MAX) and re-arm the timer. */
+	icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
+				  DCCP_RTO_MAX);
+	if (icsk->icsk_retransmits > sysctl_dccp_retries1)
+		__sk_dst_reset(sk);
+}
+
+static void dccp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int event = 0;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket is owned by a user context: retry in HZ/20 jiffies */
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+			       jiffies + (HZ / 20));
+		goto out;
+	}
+
+	if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
+		goto out;
+	/* Not yet due: re-arm the timer for the recorded expiry time. */
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+			       icsk->icsk_timeout);
+		goto out;
+	}
+	/* Consume the pending event before dispatching on it. */
+	event = icsk->icsk_pending;
+	icsk->icsk_pending = 0;
+
+	switch (event) {
+	case ICSK_TIME_RETRANS:
+		dccp_retransmit_timer(sk);
+		break;
+	}
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	Timer for listening sockets: prunes the pending request queue
+ */
+static void dccp_response_timer(struct sock *sk)
+{
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
+				   DCCP_RTO_MAX);
+}
+
+/*
+ * Keepalive timer callback; @data carries the socket pointer. Listening
+ * sockets get their request queue pruned; the socket is put on the way out.
+ */
+static void dccp_keepalive_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket is in use: try again in HZ/20 jiffies. */
+		inet_csk_reset_keepalive_timer(sk, HZ / 20);
+	} else if (sk->sk_state == DCCP_LISTEN) {
+		/* Only listening sockets have work to do here. */
+		dccp_response_timer(sk);
+	}
+
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
+static void dccp_delack_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket busy: flag it, count it, retry after TCP_DELACK_MIN. */
+		icsk->icsk_ack.blocked = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer,
+			       jiffies + TCP_DELACK_MIN);
+		goto out;
+	}
+	/* Closed socket or no delayed-ACK timer pending: nothing to do. */
+	if (sk->sk_state == DCCP_CLOSED ||
+	    !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+		goto out;
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_delack_timer,
+			       icsk->icsk_ack.timeout);
+		goto out;
+	}
+	/* Really expired: the timer is no longer pending. */
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!icsk->icsk_ack.pingpong) {
+			/* Delayed ACK missed: inflate ATO (capped at RTO). */
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
+						 icsk->icsk_rto);
+		} else {
+			/* Delayed ACK missed: leave pingpong mode and
+			 * deflate ATO.
+			 */
+			icsk->icsk_ack.pingpong = 0;
+			icsk->icsk_ack.ato = TCP_ATO_MIN;
+		}
+		dccp_send_ack(sk);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+	}
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/**
+ * dccp_write_xmitlet  -  Workhorse for CCID packet dequeueing interface
+ * @data: socket pointer; see %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		dccp_write_xmit(sk);
+	else	/* socket is owned by a user context: retry on the next jiffy */
+		sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
+	bh_unlock_sock(sk);
+}
+
+static void dccp_write_xmit_timer(unsigned long data)
+{
+	dccp_write_xmitlet(data);	/* same work as the tasklet path */
+	sock_put((struct sock *)data);	/* timer path also puts the socket */
+}
+
+void dccp_init_xmit_timers(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	/* Tasklet and xmit timer both end up in dccp_write_xmitlet(). */
+	tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+							     (unsigned long)sk);
+	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
+				  &dccp_keepalive_timer);
+}
+
+static ktime_t dccp_timestamp_seed;
+/**
+ * dccp_timestamp  -  10s of microseconds time source
+ * Returns the number of 10s of microseconds since loading DCCP. This is native
+ * DCCP time difference format (RFC 4340, sec. 13).
+ * Please note: the u32 result wraps around roughly every 11.9 hours.
+ */
+u32 dccp_timestamp(void)
+{
+	u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
+
+	do_div(delta, 10);	/* do_div() requires an unsigned 64-bit dividend */
+	return delta;
+}
+EXPORT_SYMBOL_GPL(dccp_timestamp);
+
+void __init dccp_timestamping_init(void)
+{
+	dccp_timestamp_seed = ktime_get_real();	/* base for dccp_timestamp() */
+}
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
new file mode 100644
index 00000000..7914fd61
--- /dev/null
+++ b/net/decnet/Kconfig
@@ -0,0 +1,43 @@
+#
+# DECnet configuration
+#
+config DECNET
+	tristate "DECnet Support"
+	---help---
+	  The DECnet networking protocol was used in many products made by
+	  Digital (now Compaq).  It provides reliable stream and sequenced
+	  packet communications over which run a variety of services similar
+	  to those which run over TCP/IP.
+
+	  To find some tools to use with the kernel layer support, please
+	  look at Patrick Caulfield's web site:
+	  <http://linux-decnet.sourceforge.net/>.
+
+	  More detailed documentation is available in
+	  <file:Documentation/networking/decnet.txt>.
+
+	  Be sure to say Y to "/proc file system support" and "Sysctl support"
+	  below when using DECnet, since you will need sysctl support to aid
+	  in configuration at run time.
+
+	  The DECnet code is also available as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want).
+	  The module is called decnet.
+
+config DECNET_ROUTER
+	bool "DECnet: router support (EXPERIMENTAL)"
+	depends on DECNET && EXPERIMENTAL
+	select FIB_RULES
+	---help---
+	  Add support for turning your DECnet Endnode into a level 1 or 2
+	  router.  This is an experimental, but functional option.  If you
+	  do say Y here, then make sure that you also say Y to "Kernel/User
+	  network link driver", "Routing messages" and "Network packet
+	  filtering".  The first two are required to allow configuration via
+	  rtnetlink (you will need Alexey Kuznetsov's iproute2 package
+	  from <ftp://ftp.tux.org/pub/net/ip-routing/>). The "Network packet
+	  filtering" option will be required for the forthcoming routing daemon
+	  to work.
+
+	  See <file:Documentation/networking/decnet.txt> for more information.
+
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
new file mode 100644
index 00000000..e44003af
--- /dev/null
+++ b/net/decnet/Makefile
@@ -0,0 +1,10 @@
+
+obj-$(CONFIG_DECNET) += decnet.o
+
+decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \
+	    dn_route.o dn_dev.o dn_neigh.o dn_timer.o
+decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
+decnet-y += sysctl_net_decnet.o
+
+obj-$(CONFIG_NETFILTER) += netfilter/
+
diff --git a/net/decnet/README b/net/decnet/README
new file mode 100644
index 00000000..60e7ec88
--- /dev/null
+++ b/net/decnet/README
@@ -0,0 +1,8 @@
+                       Linux DECnet Project
+                      ======================
+
+The documentation for this kernel subsystem is available in the
+Documentation/networking subdirectory of this distribution and also
+on line at http://www.chygwyn.com/DECnet/
+
+Steve Whitehouse <SteveW@ACM.org>
diff --git a/net/decnet/TODO b/net/decnet/TODO
new file mode 100644
index 00000000..ebb5ac69
--- /dev/null
+++ b/net/decnet/TODO
@@ -0,0 +1,41 @@
+Steve's quick list of things that need finishing off:
+[they are in no particular order and range from the trivial to the long winded]
+
+ o Proper timeouts on each neighbour (in routing mode) rather than
+   just the 60 second On-Ethernet cache value.
+
+ o Support for X.25 linklayer
+
+ o Support for DDCMP link layer
+
+ o The DDCMP device itself
+
+ o PPP support (rfc1762)
+
+ o Lots of testing with real applications
+
+ o Verify errors etc. against POSIX 1003.1g (draft)
+
+ o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) 
+   [maybe this should be done at socket level... the control data in the
+    send/recvmsg() calls should simply be a vector of set/getsockopt()
+    calls]
+
+ o check MSG_CTRUNC is set where it should be.
+
+ o Find all the commonality between DECnet and IPv4 routing code and extract 
+   it into a small library of routines. [probably a project for 2.7.xx]
+
+ o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
+   we have a half-way house scheme which seems to work reasonably well, but
+   the full scheme is still worth implementing, it's just not top of my list
+   right now.
+
+ o Add session control message flow control
+
+ o Add NSP message flow control
+
+ o DECnet sendpages() function
+
+ o AIO for DECnet
+
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
new file mode 100644
index 00000000..ea3b6ee2
--- /dev/null
+++ b/net/decnet/af_decnet.c
@@ -0,0 +1,2423 @@
+
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Socket Layer Interface
+ *
+ * Authors:     Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *              Patrick Caulfield <patrick@pandh.demon.co.uk>
+ *
+ * Changes:
+ *        Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's
+ *                          version of the code. Original copyright preserved
+ *                          below.
+ *        Steve Whitehouse: Some bug fixes, cleaning up some code to make it
+ *                          compatible with my routing layer.
+ *        Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick
+ *                          Caulfield.
+ *        Steve Whitehouse: Further bug fixes, checking module code still works
+ *                          with new routing layer.
+ *        Steve Whitehouse: Additional set/get_sockopt() calls.
+ *        Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new
+ *                          code.
+ *        Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like
+ *                          way. Didn't manage it entirely, but it's better.
+ *        Steve Whitehouse: ditto for sendmsg().
+ *        Steve Whitehouse: A selection of bug fixes to various things.
+ *        Steve Whitehouse: Added TIOCOUTQ ioctl.
+ *        Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username.
+ *        Steve Whitehouse: Fixes to connect() error returns.
+ *       Patrick Caulfield: Fixes to delayed acceptance logic.
+ *         David S. Miller: New socket locking
+ *        Steve Whitehouse: Socket list hashing/locking
+ *         Arnaldo C. Melo: use capable, not suser
+ *        Steve Whitehouse: Removed unused code. Fix to use sk->allocation
+ *                          when required.
+ *       Patrick Caulfield: /proc/net/decnet now has object name/number
+ *        Steve Whitehouse: Fixed local port allocation, hashed sk list
+ *          Matthew Wilcox: Fixes for dn_ioctl()
+ *        Steve Whitehouse: New connect/accept logic to allow timeouts and
+ *                          prepare for sendpage etc.
+ */
+
+
+/******************************************************************************
+    (c) 1995-1998 E.M. Serrat		emserrat@geocities.com
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+HISTORY:
+
+Version           Kernel     Date       Author/Comments
+-------           ------     ----       ---------------
+Version 0.0.1     2.0.30    01-dic-97	Eduardo Marcelo Serrat
+					(emserrat@geocities.com)
+
+					First Development of DECnet Socket La-
+					yer for Linux. Only supports outgoing
+					connections.
+
+Version 0.0.2	  2.1.105   20-jun-98   Patrick J. Caulfield
+					(patrick@pandh.demon.co.uk)
+
+					Port to new kernel development version.
+
+Version 0.0.3     2.1.106   25-jun-98   Eduardo Marcelo Serrat
+					(emserrat@geocities.com)
+					_
+					Added support for incoming connections
+					so we can start developing server apps
+					on Linux.
+					-
+					Module Support
+Version 0.0.4     2.1.109   21-jul-98   Eduardo Marcelo Serrat
+				       (emserrat@geocities.com)
+				       _
+					Added support for X11R6.4. Now we can
+					use DECnet transport for X on Linux!!!
+				       -
+Version 0.0.5    2.1.110   01-aug-98   Eduardo Marcelo Serrat
+				       (emserrat@geocities.com)
+				       Removed bugs on flow control
+				       Removed bugs on incoming accessdata
+				       order
+				       -
+Version 0.0.6    2.1.110   07-aug-98   Eduardo Marcelo Serrat
+				       dn_recvmsg fixes
+
+					Patrick J. Caulfield
+				       dn_bind fixes
+*******************************************************************************/
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/flow.h>
+#include <asm/system.h>
+#include <asm/ioctls.h>
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_nsp.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+
+struct dn_sock {
+	struct sock sk;		/* generic socket part */
+	struct dn_scp scp;	/* DECnet session control port state */
+};
+
+static void dn_keepalive(struct sock *sk);
+
+#define DN_SK_HASH_SHIFT 8
+#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT)	/* 256 buckets */
+#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
+
+
+static const struct proto_ops dn_proto_ops;
+static DEFINE_RWLOCK(dn_hash_lock);	/* guards dn_sk_hash and dn_wild_sk */
+static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
+static struct hlist_head dn_wild_sk;	/* only filled while it is empty */
+static atomic_long_t decnet_memory_allocated;
+
+static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
+static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
+
+static struct hlist_head *dn_find_list(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->addr.sdn_flags & SDF_WILD)	/* at most one wild sk */
+		return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL;
+
+	return &dn_sk_hash[le16_to_cpu(scp->addrloc) & DN_SK_HASH_MASK];
+}
+
+/*
+ * Valid ports are non-zero and unused: returns 0 if @port is valid, else -1.
+ */
+static int check_port(__le16 port)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	if (port == 0)
+		return -1;
+
+	sk_for_each(sk, node, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) {
+		struct dn_scp *scp = DN_SK(sk);
+		if (scp->addrloc == port)	/* already taken */
+			return -1;
+	}
+	return 0;
+}
+
+static unsigned short port_alloc(struct sock *sk)
+{
+	/* Last port handed out; the search resumes just after it. */
+	static unsigned short port = 0x2000;
+	const unsigned short start = port;
+
+	do {
+		if (check_port(cpu_to_le16(++port)) == 0) {
+			DN_SK(sk)->addrloc = cpu_to_le16(port);
+			return 1;
+		}
+	} while (port != start);
+
+	return 0;	/* wrapped all the way round: nothing free */
+}
+
+/*
+ * Hash @sk, allocating a local port first if it has none. Returns 0,
+ * -EUSERS (no free port) or -EADDRINUSE (no list available for it).
+ * Only ever called from user level, so no write_lock() version is needed.
+ */
+static int dn_hash_sock(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct hlist_head *list;
+	int rv = -EUSERS;
+
+	BUG_ON(sk_hashed(sk));
+
+	write_lock_bh(&dn_hash_lock);
+
+	if (!scp->addrloc && !port_alloc(sk))
+		goto out;
+
+	rv = -EADDRINUSE;
+	if ((list = dn_find_list(sk)) == NULL)
+		goto out;
+
+	sk_add_node(sk, list);
+	rv = 0;
+out:
+	write_unlock_bh(&dn_hash_lock);
+	return rv;
+}
+
+static void dn_unhash_sock(struct sock *sk)
+{
+	write_lock(&dn_hash_lock);	/* plain variant: no _bh handling */
+	sk_del_node_init(sk);
+	write_unlock(&dn_hash_lock);
+}
+
+static void dn_unhash_sock_bh(struct sock *sk)
+{
+	write_lock_bh(&dn_hash_lock);	/* _bh variant of dn_unhash_sock() */
+	sk_del_node_init(sk);
+	write_unlock_bh(&dn_hash_lock);
+}
+
+static struct hlist_head *listen_hash(struct sockaddr_dn *addr)
+{
+	int i;
+	unsigned hash = addr->sdn_objnum;	/* non-zero objnum is the hash */
+
+	if (hash == 0) {			/* otherwise hash the name */
+		hash = addr->sdn_objnamel;	/* NOTE(review): raw __le16 seed */
+		for(i = 0; i < le16_to_cpu(addr->sdn_objnamel); i++) {
+			hash ^= addr->sdn_objname[i];
+			hash ^= (hash << 3);
+		}
+	}
+
+	return &dn_sk_hash[hash & DN_SK_HASH_MASK];
+}
+
+/*
+ * Called to transform a socket from bound (i.e. with a local address)
+ * into a listening socket (doesn't need a local port number) and rehashes
+ * based upon the object name/number.
+ */
+static void dn_rehash_sock(struct sock *sk)
+{
+	struct hlist_head *list;
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->addr.sdn_flags & SDF_WILD)	/* wild sockets are left as is */
+		return;
+
+	write_lock_bh(&dn_hash_lock);
+	sk_del_node_init(sk);
+	DN_SK(sk)->addrloc = 0;			/* drop the local port number */
+	list = listen_hash(&DN_SK(sk)->addr);
+	sk_add_node(sk, list);
+	write_unlock_bh(&dn_hash_lock);
+}
+
+int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type)
+{
+	int len = 2;	/* type 0: format byte + object number */
+
+	*buf++ = type;	/* first byte is always the format type */
+
+	switch(type) {
+		case 0:	/* object number only */
+			*buf++ = sdn->sdn_objnum;
+			break;
+		case 1:	/* zero byte, name length, name */
+			*buf++ = 0;
+			*buf++ = le16_to_cpu(sdn->sdn_objnamel);
+			memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel));
+			len = 3 + le16_to_cpu(sdn->sdn_objnamel);
+			break;
+		case 2:	/* five zero bytes, name length, name */
+			memset(buf, 0, 5);
+			buf += 5;
+			*buf++ = le16_to_cpu(sdn->sdn_objnamel);
+			memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel));
+			len = 7 + le16_to_cpu(sdn->sdn_objnamel);
+			break;
+	}	/* any other type: only the format byte is stored, len stays 2 */
+
+	return len;	/* encoded length, as computed above */
+}
+
+/*
+ * On reception of usernames, we handle types 1 and 0 for destination
+ * addresses only. Types 2 and 4 are used for source addresses, but the
+ * UIC, GIC are ignored and they are both treated the same way. Type 3
+ * is never used as I've no idea what its purpose might be or what its
+ * format is.
+ */
+int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt)
+{
+	unsigned char type;
+	int size = len;		/* kept to compute the bytes consumed */
+	int namel = 12;		/* name length limit; 16 for format 1 */
+
+	sdn->sdn_objnum = 0;
+	sdn->sdn_objnamel = cpu_to_le16(0);
+	memset(sdn->sdn_objname, 0, DN_MAXOBJL);
+
+	if (len < 2)
+		return -1;
+
+	len -= 2;
+	*fmt = *data++;
+	type = *data++;		/* the object number when *fmt == 0 */
+
+	switch(*fmt) {
+		case 0:
+			sdn->sdn_objnum = type;
+			return 2;
+		case 1:
+			namel = 16;
+			break;
+		case 2:		/* skip 4 bytes that are not interpreted */
+			len  -= 4;
+			data += 4;
+			break;
+		case 4:		/* skip 8 bytes that are not interpreted */
+			len  -= 8;
+			data += 8;
+			break;
+		default:
+			return -1;
+	}
+
+	len -= 1;		/* the name-length byte read below */
+
+	if (len < 0)
+		return -1;
+
+	sdn->sdn_objnamel = cpu_to_le16(*data++);
+	len -= le16_to_cpu(sdn->sdn_objnamel);
+
+	if ((len < 0) || (le16_to_cpu(sdn->sdn_objnamel) > namel))
+		return -1;
+
+	memcpy(sdn->sdn_objname, data, le16_to_cpu(sdn->sdn_objnamel));
+
+	return size - len;	/* number of bytes consumed from @data */
+}
+
+struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr)
+{
+	struct hlist_head *list = listen_hash(addr);
+	struct hlist_node *node;
+	struct sock *sk;
+	/* Returns a TCP_LISTEN socket for @addr with a hold taken, or NULL. */
+	read_lock(&dn_hash_lock);
+	sk_for_each(sk, node, list) {
+		struct dn_scp *scp = DN_SK(sk);
+		if (sk->sk_state != TCP_LISTEN)
+			continue;
+		if (scp->addr.sdn_objnum) {	/* listener bound by number */
+			if (scp->addr.sdn_objnum != addr->sdn_objnum)
+				continue;
+		} else {			/* listener bound by name */
+			if (addr->sdn_objnum)
+				continue;
+			if (scp->addr.sdn_objnamel != addr->sdn_objnamel)
+				continue;
+			if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, le16_to_cpu(addr->sdn_objnamel)) != 0)
+				continue;
+		}
+		sock_hold(sk);
+		read_unlock(&dn_hash_lock);
+		return sk;
+	}
+	/* No match in the bucket: fall back to the wild socket, if listening. */
+	sk = sk_head(&dn_wild_sk);
+	if (sk) {
+		if (sk->sk_state == TCP_LISTEN)
+			sock_hold(sk);
+		else
+			sk = NULL;
+	}
+
+	read_unlock(&dn_hash_lock);
+	return sk;
+}
+
+struct sock *dn_find_by_skb(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct sock *sk;
+	struct hlist_node *node;
+	struct dn_scp *scp;
+	/* Returns the socket matching @skb's addressing with a hold, or NULL. */
+	read_lock(&dn_hash_lock);
+	sk_for_each(sk, node, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) {
+		scp = DN_SK(sk);
+		if (cb->src != dn_saddr2dn(&scp->peer))
+			continue;
+		if (cb->dst_port != scp->addrloc)
+			continue;
+		if (scp->addrrem && (cb->src_port != scp->addrrem))
+			continue;	/* remote port only checked once set */
+		sock_hold(sk);
+		goto found;
+	}
+	sk = NULL;
+found:
+	read_unlock(&dn_hash_lock);
+	return sk;
+}
+
+
+
+static void dn_destruct(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	skb_queue_purge(&scp->data_xmit_queue);	/* free skbs still queued */
+	skb_queue_purge(&scp->other_xmit_queue);
+	skb_queue_purge(&scp->other_receive_queue);
+	/* Release the cached dst; condition "1" makes the RCU check pass. */
+	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+}
+
+static int dn_memory_pressure;	/* flag published through dn_proto */
+
+static void dn_enter_memory_pressure(struct sock *sk)
+{
+	if (!dn_memory_pressure) {	/* store only on the 0 -> 1 change */
+		dn_memory_pressure = 1;
+	}
+}
+
+static struct proto dn_proto = {
+	.name			= "NSP",
+	.owner			= THIS_MODULE,
+	.enter_memory_pressure	= dn_enter_memory_pressure,
+	.memory_pressure	= &dn_memory_pressure,
+	.memory_allocated	= &decnet_memory_allocated,
+	.sysctl_mem		= sysctl_decnet_mem,
+	.sysctl_wmem		= sysctl_decnet_wmem,
+	.sysctl_rmem		= sysctl_decnet_rmem,
+	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
+	.obj_size		= sizeof(struct dn_sock), /* sock + dn_scp */
+};
+
+/*
+ * Allocate a DECnet socket and put its session control port into the
+ * initial (DN_O) state.
+ *
+ * @net:  network namespace passed through to sk_alloc()
+ * @sock: socket to attach, or NULL; when given, its ops are set to
+ *        dn_proto_ops
+ * @gfp:  allocation flags, also stored as sk->sk_allocation
+ *
+ * Returns the new sock with its slow timer started, or NULL if sk_alloc()
+ * fails.
+ */
+static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp)
+{
+	struct sock *sk;
+	struct dn_scp *scp;
+
+	sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto);
+	if (sk == NULL)
+		return NULL;
+
+	if (sock != NULL)
+		sock->ops = &dn_proto_ops;
+	sock_init_data(sock, sk);
+
+	/* Generic socket fields */
+	sk->sk_family      = PF_DECnet;
+	sk->sk_protocol    = 0;
+	sk->sk_no_check    = 1;
+	sk->sk_allocation  = gfp;
+	sk->sk_sndbuf      = sysctl_decnet_wmem[1];
+	sk->sk_rcvbuf      = sysctl_decnet_rmem[1];
+	sk->sk_destruct    = dn_destruct;
+	sk->sk_backlog_rcv = dn_nsp_backlog_rcv;
+
+	/* DECnet Session Control Port: starts out Open */
+	scp = DN_SK(sk);
+	scp->state = DN_O;
+
+	/* Next data / other-data segment numbers to transmit */
+	scp->numdat = 1;
+	scp->numoth = 1;
+
+	/* Last acks sent and highest acks received, per sub-channel */
+	scp->ackxmt_dat = 0;
+	scp->ackxmt_oth = 0;
+	scp->ackrcv_dat = 0;
+	scp->ackrcv_oth = 0;
+
+	/* Flow control state for both directions */
+	scp->flowrem_sw  = DN_SEND;
+	scp->flowloc_sw  = DN_SEND;
+	scp->flowrem_dat = 0;
+	scp->flowrem_oth = 1;
+	scp->flowloc_dat = 0;
+	scp->flowloc_oth = 1;
+
+	/* Services and info bytes; info_loc 0x03 is NSP version 4.1 */
+	scp->services_rem = 0;
+	scp->services_loc = 1 | NSP_FC_NONE;
+	scp->info_rem = 0;
+	scp->info_loc = 0x03;
+
+	/* Default segment size until the remote end tells us its own */
+	scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER;
+
+	scp->nonagle = 0;
+	scp->multi_ireq = 1;
+	scp->accept_mode = ACC_IMMED;
+
+	scp->addr.sdn_family = AF_DECnet;
+	scp->peer.sdn_family = AF_DECnet;
+
+	scp->accessdata.acc_accl = 5;
+	memcpy(scp->accessdata.acc_acc, "LINUX", 5);
+
+	/* Window and round trip estimator defaults */
+	scp->max_window   = NSP_MAX_WINDOW;
+	scp->snd_window   = NSP_MIN_WINDOW;
+	scp->nsp_srtt     = NSP_INITIAL_SRTT;
+	scp->nsp_rttvar   = NSP_INITIAL_RTTVAR;
+	scp->nsp_rxtshift = 0;
+
+	skb_queue_head_init(&scp->data_xmit_queue);
+	skb_queue_head_init(&scp->other_xmit_queue);
+	skb_queue_head_init(&scp->other_receive_queue);
+
+	/* Timer callbacks: no persist handler yet, keepalive every 10s */
+	scp->persist = 0;
+	scp->persist_fxn = NULL;
+	scp->keepalive = 10 * HZ;
+	scp->keepalive_fxn = dn_keepalive;
+
+	init_timer(&scp->delack_timer);
+	scp->delack_pending = 0;
+	scp->delack_fxn = dn_nsp_delayed_ack;
+
+	dn_start_slow_timer(sk);
+
+	return sk;
+}
+
+/*
+ * Keepalive timer callback (installed as scp->keepalive_fxn).
+ * FIXME: Should respond to SO_KEEPALIVE etc.
+ */
+static void dn_keepalive(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	/*
+	 * A non-empty other_data transmit queue means a frame is already
+	 * pending, so skip this round rather than send too many keepalives.
+	 */
+	if (!skb_queue_empty(&scp->other_xmit_queue))
+		return;
+
+	dn_nsp_send_link(sk, DN_NOCHANGE, 0);
+}
+
+
+/*
+ * Persist timer callback for sockets that are shutting down or have been
+ * destroyed.
+ *
+ * While the socket is in DI, DR or DN it retransmits the disconnect
+ * frame (DI/DC) and returns 0.  DI and DR advance to CN/DRC once
+ * nsp_rxtshift reaches the configured count; DN stops retransmitting at
+ * decnet_dn_count.  For any other state, or an exhausted DN, the persist
+ * interval becomes the time-wait period, and a socket with no struct
+ * socket attached that has been idle for that period is unhashed and
+ * released.
+ *
+ * Returns 1 when the socket was unhashed and its reference dropped,
+ * 0 otherwise.
+ */
+int dn_destroy_timer(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	scp->persist = dn_nsp_persist(sk);
+
+	switch (scp->state) {
+	case DN_DI:
+		dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
+		if (scp->nsp_rxtshift >= decnet_di_count)
+			scp->state = DN_CN;
+		return 0;
+
+	case DN_DR:
+		dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
+		if (scp->nsp_rxtshift >= decnet_dr_count)
+			scp->state = DN_DRC;
+		return 0;
+
+	case DN_DN:
+		if (scp->nsp_rxtshift >= decnet_dn_count)
+			break;
+		dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC);
+		return 0;
+
+	default:
+		break;
+	}
+
+	scp->persist = (HZ * decnet_time_wait);
+
+	/* Still attached to a struct socket: leave it alone */
+	if (sk->sk_socket)
+		return 0;
+
+	if ((jiffies - scp->stamp) < (HZ * decnet_time_wait))
+		return 0;
+
+	dn_unhash_sock(sk);
+	sock_put(sk);
+	return 1;
+}
+/*
+ * Start tearing down a socket according to its current NSP state.
+ *
+ * Resets the retransmit back-off, marks an attached struct socket as
+ * disconnecting (unless it was never connected) and sets sk_state to
+ * TCP_CLOSE.  Depending on scp->state it then either sends a disconnect
+ * frame and hands the socket to dn_destroy_timer() via the persist
+ * timer, or (DN_O and unknown states) stops the slow timer, unhashes the
+ * socket and drops a reference.
+ *
+ * The callers visible in this file (dn_release, dn_shutdown) hold the
+ * socket lock around this call.
+ */
+static void dn_destroy_sock(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	scp->nsp_rxtshift = 0; /* reset back off */
+
+	if (sk->sk_socket) {
+		if (sk->sk_socket->state != SS_UNCONNECTED)
+			sk->sk_socket->state = SS_DISCONNECTING;
+	}
+
+	sk->sk_state = TCP_CLOSE;
+
+	switch(scp->state) {
+		case DN_DN:
+			dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC,
+					 sk->sk_allocation);
+			scp->persist_fxn = dn_destroy_timer;
+			scp->persist = dn_nsp_persist(sk);
+			break;
+		case DN_CR:
+			scp->state = DN_DR;
+			goto disc_reject; /* jumps to the label inside the DI/DR arm below */
+		case DN_RUN:
+			scp->state = DN_DI;
+			/* fall through */
+		case DN_DI:
+		case DN_DR:
+disc_reject:
+			dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation);
+			/* fall through: arm the persist timer as for the states below */
+		case DN_NC:
+		case DN_NR:
+		case DN_RJ:
+		case DN_DIC:
+		case DN_CN:
+		case DN_DRC:
+		case DN_CI:
+		case DN_CD:
+			scp->persist_fxn = dn_destroy_timer;
+			scp->persist = dn_nsp_persist(sk);
+			break;
+		default:
+			printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n");
+			/* fall through: treat an unknown state like DN_O */
+		case DN_O:
+			dn_stop_slow_timer(sk);
+
+			dn_unhash_sock_bh(sk);
+			sock_put(sk);
+
+			break;
+	}
+}
+
+/*
+ * Format a 16 bit DECnet node address as "area.node".
+ *
+ * @addr: address in host byte order; the low 10 bits are the node number
+ *        and the remaining high bits the area
+ * @buf:  caller supplied buffer; must have room for the formatted text
+ *        and its terminating NUL (at most "63.1023")
+ *
+ * Returns @buf.
+ */
+char *dn_addr2asc(__u16 addr, char *buf)
+{
+	unsigned short node, area;
+
+	node = addr & 0x03ff;
+	area = addr >> 10;
+	/* both values are unsigned short, so use the unsigned conversion */
+	sprintf(buf, "%hu.%hu", area, node);
+
+	return buf;
+}
+
+
+
+/*
+ * Create a DECnet socket for the socket layer.
+ *
+ * Only the initial network namespace is supported (-EAFNOSUPPORT
+ * otherwise).  SOCK_SEQPACKET requires protocol DNPROTO_NSP
+ * (-EPROTONOSUPPORT otherwise), SOCK_STREAM accepts any protocol value,
+ * and every other socket type yields -ESOCKTNOSUPPORT.
+ *
+ * Returns 0 on success or -ENOBUFS when the sock cannot be allocated.
+ */
+static int dn_create(struct net *net, struct socket *sock, int protocol,
+		     int kern)
+{
+	struct sock *sk;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	if (sock->type == SOCK_SEQPACKET) {
+		if (protocol != DNPROTO_NSP)
+			return -EPROTONOSUPPORT;
+	} else if (sock->type != SOCK_STREAM) {
+		return -ESOCKTNOSUPPORT;
+	}
+
+	sk = dn_alloc_sock(net, sock, GFP_KERNEL);
+	if (sk == NULL)
+		return -ENOBUFS;
+
+	sk->sk_protocol = protocol;
+
+	return 0;
+}
+
+
+/*
+ * Release a socket: orphan the sock and start its teardown.
+ *
+ * An extra reference is held across dn_destroy_sock() because that
+ * function may itself drop a reference while the lock is still held.
+ * Always returns 0.
+ */
+static int
+dn_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk == NULL)
+		return 0;
+
+	sock_orphan(sk);
+	sock_hold(sk);
+
+	lock_sock(sk);
+	dn_destroy_sock(sk);
+	release_sock(sk);
+
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ * Bind a socket to a local DECnet address.
+ *
+ * Validation, in order: the address must be exactly a sockaddr_dn of
+ * family AF_DECnet, a node address length of 0 or 2, an object name no
+ * longer than DN_MAXOBJL and no flags other than SDF_WILD (-EINVAL).
+ * Binding to an object number or a wildcard requires
+ * CAP_NET_BIND_SERVICE (-EACCES).  A non-wild bind that names a node
+ * address must match a DECnet enabled device (-EADDRNOTAVAIL).
+ *
+ * The address is installed and the socket hashed only if it is still
+ * SOCK_ZAPPED; otherwise -EINVAL is returned.  A hashing failure is
+ * returned with SOCK_ZAPPED set again.
+ */
+static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr;
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	struct net_device *dev, *ldev;
+	u16 nodelen;
+	int wild;
+	int rv = -EINVAL;
+
+	if (addr_len != sizeof(struct sockaddr_dn) ||
+	    saddr->sdn_family != AF_DECnet)
+		return -EINVAL;
+
+	nodelen = le16_to_cpu(saddr->sdn_nodeaddrl);
+	if (nodelen != 0 && nodelen != 2)
+		return -EINVAL;
+
+	if (le16_to_cpu(saddr->sdn_objnamel) > DN_MAXOBJL)
+		return -EINVAL;
+
+	if (saddr->sdn_flags & ~SDF_WILD)
+		return -EINVAL;
+
+	wild = saddr->sdn_flags & SDF_WILD;
+
+	if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum || wild))
+		return -EACCES;
+
+	if (!wild && nodelen) {
+		/* The requested node address must belong to a local device */
+		ldev = NULL;
+		rcu_read_lock();
+		for_each_netdev_rcu(&init_net, dev) {
+			if (dev->dn_ptr &&
+			    dn_dev_islocal(dev, dn_saddr2dn(saddr))) {
+				ldev = dev;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!ldev)
+			return -EADDRNOTAVAIL;
+	}
+
+	lock_sock(sk);
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		memcpy(&scp->addr, saddr, addr_len);
+		sock_reset_flag(sk, SOCK_ZAPPED);
+
+		rv = dn_hash_sock(sk);
+		if (rv)
+			sock_set_flag(sk, SOCK_ZAPPED);
+	}
+	release_sock(sk);
+
+	return rv;
+}
+
+
+/*
+ * Bind a socket implicitly, using the default device's address.
+ *
+ * Clears the flags and object number of the local address, optionally
+ * derives the object name from the access data (compatibility code
+ * below), asks dn_dev_bind_default() for a node address and hashes the
+ * socket.
+ *
+ * Returns 0 on success or the error from dn_dev_bind_default() or
+ * dn_hash_sock().  On every failure the socket is marked SOCK_ZAPPED
+ * again, so that it is not mistaken for a bound socket and a later
+ * attempt binds it afresh.
+ */
+static int dn_auto_bind(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	int rv;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	scp->addr.sdn_flags  = 0;
+	scp->addr.sdn_objnum = 0;
+
+	/*
+	 * This stuff is to keep compatibility with Eduardo's
+	 * patch. I hope I can dispense with it shortly...
+	 */
+	if ((scp->accessdata.acc_accl != 0) &&
+		(scp->accessdata.acc_accl <= 12)) {
+
+		scp->addr.sdn_objnamel = cpu_to_le16(scp->accessdata.acc_accl);
+		memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, le16_to_cpu(scp->addr.sdn_objnamel));
+
+		/* The account field has been consumed: wipe all of it */
+		scp->accessdata.acc_accl = 0;
+		memset(scp->accessdata.acc_acc, 0,
+		       sizeof(scp->accessdata.acc_acc));
+	}
+	/* End of compatibility stuff */
+
+	scp->addr.sdn_add.a_len = cpu_to_le16(2);
+	rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr);
+	if (rv == 0)
+		rv = dn_hash_sock(sk);
+
+	/* Not bound after all, whichever step failed */
+	if (rv)
+		sock_set_flag(sk, SOCK_ZAPPED);
+
+	return rv;
+}
+/*
+ * Confirm an incoming connection on a socket in the DN_CR state.
+ *
+ * Moves the port to DN_CC, sends the connect confirm and then sleeps,
+ * dropping the socket lock while asleep, until the state becomes DN_RUN,
+ * a socket error is reported, a signal is pending or *timeo runs out.
+ * Callers in this file invoke it with the socket lock held.
+ *
+ * Returns 0 once the port is in DN_RUN, -EINVAL if the port was not in
+ * DN_CR, the socket error, the value of sock_intr_errno() on a signal,
+ * or -EAGAIN on timeout.  *timeo is updated with the time remaining.
+ */
+static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	if (scp->state != DN_CR)
+		return -EINVAL;
+
+	scp->state = DN_CC;
+	scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk));
+	dn_send_conn_conf(sk, allocation);
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	for(;;) {
+		release_sock(sk);
+		if (scp->state == DN_CC) /* only sleep while still waiting for the peer */
+			*timeo = schedule_timeout(*timeo);
+		lock_sock(sk);
+		err = 0;
+		if (scp->state == DN_RUN)
+			break;
+		err = sock_error(sk);
+		if (err)
+			break;
+		err = sock_intr_errno(*timeo); /* returned only if a signal is pending */
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN; /* returned only if the timeout has expired */
+		if (!*timeo)
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (err == 0) {
+		sk->sk_socket->state = SS_CONNECTED;
+	} else if (scp->state != DN_CC) { /* failed and no longer confirming */
+		sk->sk_socket->state = SS_UNCONNECTED;
+	}
+	return err;
+}
+/*
+ * Wait for a connection that is being set up to reach the DN_RUN state.
+ *
+ * Returns 0 immediately if the port is already in DN_RUN and -EALREADY
+ * if it is not and *timeo is zero.  Otherwise sleeps, dropping the
+ * socket lock while asleep, until the state becomes DN_RUN, a socket
+ * error is reported, a signal is pending or the timeout expires.
+ *
+ * Returns 0 on DN_RUN, the socket error, the value of sock_intr_errno()
+ * on a signal, or -ETIMEDOUT.  *timeo is updated with the time remaining.
+ */
+static int dn_wait_run(struct sock *sk, long *timeo)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	DEFINE_WAIT(wait);
+	int err = 0;
+
+	if (scp->state == DN_RUN)
+		goto out;
+
+	if (!*timeo)
+		return -EALREADY; /* note: returns without touching sk_socket->state */
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	for(;;) {
+		release_sock(sk);
+		if (scp->state == DN_CI || scp->state == DN_CC) /* still connecting */
+			*timeo = schedule_timeout(*timeo);
+		lock_sock(sk);
+		err = 0;
+		if (scp->state == DN_RUN)
+			break;
+		err = sock_error(sk);
+		if (err)
+			break;
+		err = sock_intr_errno(*timeo); /* returned only if a signal is pending */
+		if (signal_pending(current))
+			break;
+		err = -ETIMEDOUT; /* returned only if the timeout has expired */
+		if (!*timeo)
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+out:
+	if (err == 0) {
+		sk->sk_socket->state = SS_CONNECTED;
+	} else if (scp->state != DN_CI && scp->state != DN_CC) { /* attempt is over */
+		sk->sk_socket->state = SS_UNCONNECTED;
+	}
+	return err;
+}
+
+/*
+ * Core of connect(); called with the socket lock held (see dn_connect
+ * and dn_check_state).
+ *
+ * Already connected sockets get -EISCONN.  A socket that is mid-connect
+ * either completes (0), is reported refused (-ECONNREFUSED) or waits in
+ * dn_wait_run().  Otherwise the port must be in DN_O and @addr must be a
+ * non-wildcard AF_DECnet sockaddr_dn (-EINVAL).  An unbound socket is
+ * auto-bound, a route is looked up (-EHOSTUNREACH on failure) and the
+ * connect initiate is sent.
+ *
+ * With a zero *timeo the function returns -EINPROGRESS after sending the
+ * connect initiate; otherwise it returns the result of dn_wait_run().
+ */
+static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+	struct dn_scp *scp = DN_SK(sk);
+	struct flowidn fld;
+	int err;
+
+	if (sock->state == SS_CONNECTED)
+		return -EISCONN;
+
+	if (sock->state == SS_CONNECTING) {
+		if (scp->state == DN_RUN) {
+			sock->state = SS_CONNECTED;
+			return 0;
+		}
+		if (scp->state != DN_CI && scp->state != DN_CC) {
+			sock->state = SS_UNCONNECTED;
+			return -ECONNREFUSED;
+		}
+		return dn_wait_run(sk, timeo);
+	}
+
+	if (scp->state != DN_O)
+		return -EINVAL;
+	if (addr == NULL || addrlen != sizeof(struct sockaddr_dn))
+		return -EINVAL;
+	if (addr->sdn_family != AF_DECnet || (addr->sdn_flags & SDF_WILD))
+		return -EINVAL;
+
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		err = dn_auto_bind(sk->sk_socket);
+		if (err)
+			return err;
+	}
+
+	memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn));
+
+	/* Describe the flow and look up a route for it */
+	memset(&fld, 0, sizeof(fld));
+	fld.flowidn_oif = sk->sk_bound_dev_if;
+	fld.daddr = dn_saddr2dn(&scp->peer);
+	fld.saddr = dn_saddr2dn(&scp->addr);
+	dn_sk_ports_copy(&fld, scp);
+	fld.flowidn_proto = DNPROTO_NSP;
+	if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0)
+		return -EHOSTUNREACH;
+
+	sk->sk_route_caps = sk->sk_dst_cache->dev->features;
+	sock->state = SS_CONNECTING;
+	scp->state = DN_CI;
+	scp->segsize_loc = dst_metric_advmss(sk->sk_dst_cache);
+
+	dn_nsp_send_conninit(sk, NSP_CI);
+
+	if (!*timeo)
+		return -EINPROGRESS;
+
+	return dn_wait_run(sk, timeo);
+}
+
+/*
+ * connect() entry point: work out the send timeout (zero when
+ * O_NONBLOCK is set in @flags) and run __dn_connect() under the socket
+ * lock.  Returns whatever __dn_connect() returns.
+ */
+static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags)
+{
+	struct sock *sk = sock->sk;
+	long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+	int rc;
+
+	lock_sock(sk);
+	rc = __dn_connect(sk, (struct sockaddr_dn *)uaddr, addrlen, &timeo, 0);
+	release_sock(sk);
+
+	return rc;
+}
+
+/*
+ * Bring the port into DN_RUN if it can get there from its current
+ * state: confirm a pending accept (DN_CR), wait for a connect in
+ * progress (DN_CI, DN_CC) or start a connect (DN_O).
+ *
+ * Returns 0 when the port is running, the result of the helper used, or
+ * -EINVAL for every other state.
+ */
+static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	switch (scp->state) {
+	case DN_RUN:
+		return 0;
+	case DN_CR:
+		return dn_confirm_accept(sk, timeo, sk->sk_allocation);
+	case DN_CI:
+	case DN_CC:
+		return dn_wait_run(sk, timeo);
+	case DN_O:
+		return __dn_connect(sk, addr, addrlen, timeo, flags);
+	default:
+		return -EINVAL;
+	}
+}
+
+
+/*
+ * Copy the access control fields from the head of @skb into @acc.
+ *
+ * The data consists of three consecutive fields (user, password,
+ * account), each a one byte length followed by that many bytes.  The
+ * consumed bytes are pulled from the skb.  No bounds are checked here.
+ */
+static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc)
+{
+	const unsigned char *p = skb->data;
+
+	acc->acc_userl = *p++;
+	memcpy(acc->acc_user, p, acc->acc_userl);
+	p += acc->acc_userl;
+
+	acc->acc_passl = *p++;
+	memcpy(acc->acc_pass, p, acc->acc_passl);
+	p += acc->acc_passl;
+
+	acc->acc_accl = *p++;
+	memcpy(acc->acc_acc, p, acc->acc_accl);
+	p += acc->acc_accl;
+
+	/* three length bytes plus the three payloads */
+	skb_pull(skb, p - skb->data);
+}
+
+/*
+ * Copy the optional user data from the head of @skb into @opt and pull
+ * it from the skb.  The field is a one byte length followed by up to 16
+ * bytes; a larger length is treated as a bug.
+ */
+static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt)
+{
+	const unsigned char *src = skb->data;
+	u16 len = src[0];	/* the length is 8 bits on the wire */
+
+	BUG_ON(len > 16);	/* we've checked the contents earlier */
+
+	opt->opt_status = 0;
+	opt->opt_optl   = cpu_to_le16(len);
+	memcpy(opt->opt_data, src + 1, len);
+
+	skb_pull(skb, len + 1);
+}
+/*
+ * Wait for a connect request to arrive on a listening socket.
+ *
+ * Called with the socket lock held (see dn_accept); the lock is dropped
+ * while dequeueing and sleeping and re-taken before each check.
+ *
+ * Returns the dequeued skb, or an ERR_PTR carrying -EINVAL if the socket
+ * is no longer listening, the value of sock_intr_errno() if a signal is
+ * pending, or -EAGAIN when *timeo has run out.
+ */
+static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
+{
+	DEFINE_WAIT(wait);
+	struct sk_buff *skb = NULL;
+	int err = 0;
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	for(;;) {
+		release_sock(sk);
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (skb == NULL) {
+			*timeo = schedule_timeout(*timeo);
+			skb = skb_dequeue(&sk->sk_receive_queue); /* retry after waking */
+		}
+		lock_sock(sk);
+		if (skb != NULL)
+			break;
+		err = -EINVAL; /* returned only if the socket stopped listening */
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(*timeo); /* returned only if a signal is pending */
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN; /* returned only if the timeout has expired */
+		if (!*timeo)
+			break;
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+
+	return skb == NULL ? ERR_PTR(err) : skb;
+}
+/*
+ * accept() entry point.
+ *
+ * The listening socket must be in TCP_LISTEN with its port in DN_O
+ * (-EINVAL otherwise).  A queued connect request is dequeued, waiting
+ * for one if necessary, and a new sock is allocated for @newsock
+ * (-ENOBUFS on failure).  The new sock takes over the request's route
+ * and is filled in from the request's control block and payload: two
+ * names decoded by dn_username2sockaddr(), a menu/version byte, then
+ * optional access and user data.
+ *
+ * Finally the new sock is hashed and a connect ack is sent; in ACC_IMMED
+ * mode the connection is confirmed right away via dn_confirm_accept().
+ *
+ * Returns 0 on success, otherwise the error from waiting, hashing or
+ * confirming.
+ */
+static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk = sock->sk, *newsk;
+	struct sk_buff *skb = NULL;
+	struct dn_skb_cb *cb;
+	unsigned char menuver;
+	int err = 0;
+	unsigned char type;
+	long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	struct dst_entry *dst;
+
+	lock_sock(sk);
+
+	if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) {
+		release_sock(sk);
+		return -EINVAL;
+	}
+
+	skb = skb_dequeue(&sk->sk_receive_queue);
+	if (skb == NULL) {
+		skb = dn_wait_for_connect(sk, &timeo);
+		if (IS_ERR(skb)) {
+			release_sock(sk);
+			return PTR_ERR(skb);
+		}
+	}
+
+	cb = DN_SKB_CB(skb);
+	sk->sk_ack_backlog--;
+	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation);
+	if (newsk == NULL) {
+		release_sock(sk);
+		kfree_skb(skb);
+		return -ENOBUFS;
+	}
+	release_sock(sk);
+
+	dst = skb_dst(skb); /* hand the request's route over to the new sock */
+	sk_dst_set(newsk, dst);
+	skb_dst_set(skb, NULL);
+
+	DN_SK(newsk)->state        = DN_CR;
+	DN_SK(newsk)->addrrem      = cb->src_port;
+	DN_SK(newsk)->services_rem = cb->services;
+	DN_SK(newsk)->info_rem     = cb->info;
+	DN_SK(newsk)->segsize_rem  = cb->segsize;
+	DN_SK(newsk)->accept_mode  = DN_SK(sk)->accept_mode;
+
+	if (DN_SK(newsk)->segsize_rem < 230) /* enforce a floor of 230 */
+		DN_SK(newsk)->segsize_rem = 230;
+
+	if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
+		DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd;
+
+	newsk->sk_state  = TCP_LISTEN;
+	memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn));
+
+	/*
+	 * If we are listening on a wild socket, we don't want
+	 * the newly created socket on the wrong hash queue.
+	 */
+	DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD;
+
+	skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type)); /* first name: local addr */
+	skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type)); /* second name: peer */
+	*(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src;
+	*(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst;
+
+	menuver = *skb->data; /* flag byte: says which optional fields follow */
+	skb_pull(skb, 1);
+
+	if (menuver & DN_MENUVER_ACC)
+		dn_access_copy(skb, &(DN_SK(newsk)->accessdata));
+
+	if (menuver & DN_MENUVER_USR)
+		dn_user_copy(skb, &(DN_SK(newsk)->conndata_in));
+
+	if (menuver & DN_MENUVER_PRX)
+		DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY;
+
+	if (menuver & DN_MENUVER_UIC)
+		DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY;
+
+	kfree_skb(skb);
+
+	memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out),
+		sizeof(struct optdata_dn));
+	memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out),
+		sizeof(struct optdata_dn));
+
+	lock_sock(newsk);
+	err = dn_hash_sock(newsk);
+	if (err == 0) {
+		sock_reset_flag(newsk, SOCK_ZAPPED);
+		dn_send_conn_ack(newsk);
+
+		/*
+		 * Here we use sk->sk_allocation since although the conn conf is
+		 * for the newsk, the context is the old socket.
+		 */
+		if (DN_SK(newsk)->accept_mode == ACC_IMMED)
+			err = dn_confirm_accept(newsk, &timeo,
+						sk->sk_allocation);
+	}
+	release_sock(newsk);
+	return err;
+}
+
+
+/*
+ * getsockname()/getpeername() entry point.
+ *
+ * Copies the local address, or the peer address when @peer is non-zero,
+ * into @uaddr and stores the address length in *@uaddr_len.  A peer
+ * lookup fails with -ENOTCONN when the socket is neither connected nor
+ * connecting and is in immediate accept mode.
+ */
+static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer)
+{
+	struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr;
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	const struct sockaddr_dn *src = &scp->addr;
+	int rc = 0;
+
+	*uaddr_len = sizeof(struct sockaddr_dn);
+
+	lock_sock(sk);
+
+	if (peer) {
+		int linked = (sock->state == SS_CONNECTED ||
+			      sock->state == SS_CONNECTING);
+
+		if (!linked && scp->accept_mode == ACC_IMMED)
+			rc = -ENOTCONN;
+		else
+			src = &scp->peer;
+	}
+
+	if (rc == 0)
+		memcpy(sa, src, sizeof(struct sockaddr_dn));
+
+	release_sock(sk);
+
+	return rc;
+}
+
+
+/*
+ * poll() entry point: the generic datagram poll mask, plus POLLRDBAND
+ * whenever the other-data receive queue is non-empty.
+ */
+static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table  *wait)
+{
+	struct dn_scp *scp = DN_SK(sock->sk);
+	int events = datagram_poll(file, sock, wait);
+
+	if (skb_queue_empty(&scp->other_receive_queue))
+		return events;
+
+	events |= POLLRDBAND;
+	return events;
+}
+
+/*
+ * ioctl() entry point.
+ *
+ * SIOCGIFADDR/SIOCSIFADDR are passed to dn_dev_ioctl().
+ * SIOCATMARK returns (as the ioctl result) 1 or 0 depending on whether
+ *   other-data is queued, or -ENOTCONN unless the port is in DN_RUN.
+ * TIOCOUTQ stores the free send buffer space, floored at zero.
+ * TIOCINQ stores the length of the first queued other-data skb if there
+ *   is one, else the total length of the normal receive queue.
+ * Anything else yields -ENOIOCTLCMD.
+ */
+static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	struct sk_buff *skb;
+	long amount = 0;
+	int val;
+
+	switch (cmd) {
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+		return dn_dev_ioctl(cmd, (void __user *)arg);
+
+	case SIOCATMARK:
+		lock_sock(sk);
+		if (scp->state != DN_RUN)
+			val = -ENOTCONN;
+		else
+			val = !skb_queue_empty(&scp->other_receive_queue);
+		release_sock(sk);
+		return val;
+
+	case TIOCOUTQ:
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (amount < 0)
+			amount = 0;
+		return put_user(amount, (int __user *)arg);
+
+	case TIOCINQ:
+		lock_sock(sk);
+		skb = skb_peek(&scp->other_receive_queue);
+		if (skb) {
+			amount = skb->len;
+		} else {
+			skb_queue_walk(&sk->sk_receive_queue, skb)
+				amount += skb->len;
+		}
+		release_sock(sk);
+		return put_user(amount, (int __user *)arg);
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+/*
+ * listen() entry point.
+ *
+ * The socket must be bound (not SOCK_ZAPPED), its port must be in DN_O
+ * and it must not already be listening; otherwise -EINVAL is returned.
+ * On success the backlog is recorded, the socket enters TCP_LISTEN and
+ * is rehashed.
+ */
+static int dn_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = -EINVAL;
+
+	lock_sock(sk);
+
+	if (!sock_flag(sk, SOCK_ZAPPED) &&
+	    DN_SK(sk)->state == DN_O &&
+	    sk->sk_state != TCP_LISTEN) {
+		sk->sk_max_ack_backlog = backlog;
+		sk->sk_ack_backlog     = 0;
+		sk->sk_state           = TCP_LISTEN;
+		err = 0;
+		dn_rehash_sock(sk);
+	}
+
+	release_sock(sk);
+
+	return err;
+}
+
+
+/*
+ * shutdown() entry point.  Only a full shutdown (SHUTDOWN_MASK) is
+ * supported.
+ *
+ * Returns -ENOTCONN for an unconnected socket, 0 if it is already
+ * disconnecting, -EINVAL if the port is still in DN_O or @how is not
+ * SHUTDOWN_MASK, and 0 after starting the teardown.
+ */
+static int dn_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	int err;
+
+	lock_sock(sk);
+
+	if (sock->state == SS_UNCONNECTED) {
+		err = -ENOTCONN;
+	} else if (sock->state == SS_DISCONNECTING) {
+		err = 0;
+	} else if (scp->state == DN_O || how != SHUTDOWN_MASK) {
+		err = -EINVAL;
+	} else {
+		sk->sk_shutdown = how;
+		dn_destroy_sock(sk);
+		err = 0;
+	}
+
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * setsockopt() entry point: run __dn_setsockopt() with the socket lock
+ * held and return its result.
+ */
+static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	rc = __dn_setsockopt(sock, level, optname, optval, optlen, 0);
+	release_sock(sk);
+
+	return rc;
+}
+/*
+ * Set a DECnet socket option; dn_setsockopt() calls this with the socket
+ * lock held.
+ *
+ * The option value is copied from user space into a local union first:
+ * a non-zero @optlen with a NULL @optval or an @optlen larger than the
+ * union gives -EINVAL, a faulting copy gives -EFAULT.  Each option then
+ * checks @optlen against the exact size it expects before looking at the
+ * value.  @level and @flags are not used here.
+ *
+ * Returns 0 on success or a negative errno.  Unknown options go to
+ * nf_setsockopt() when CONFIG_NETFILTER is set and otherwise get
+ * -ENOPROTOOPT.
+ */
+static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags)
+{
+	struct	sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	long timeo;
+	union {
+		struct optdata_dn opt;
+		struct accessdata_dn acc;
+		int mode;
+		unsigned long win;
+		int val;
+		unsigned char services;
+		unsigned char info;
+	} u;
+	int err;
+
+	if (optlen && !optval)
+		return -EINVAL;
+
+	if (optlen > sizeof(u))
+		return -EINVAL;
+
+	if (copy_from_user(&u, optval, optlen)) /* only the first optlen bytes of u are valid */
+		return -EFAULT;
+
+	switch(optname) {
+		case DSO_CONDATA: /* optional data to send with the connect */
+			if (sock->state == SS_CONNECTED)
+				return -EISCONN;
+			if ((scp->state != DN_O) && (scp->state != DN_CR))
+				return -EINVAL;
+
+			if (optlen != sizeof(struct optdata_dn))
+				return -EINVAL;
+
+			if (le16_to_cpu(u.opt.opt_optl) > 16)
+				return -EINVAL;
+
+			memcpy(&scp->conndata_out, &u.opt, optlen);
+			break;
+
+		case DSO_DISDATA: /* optional data to send with the disconnect */
+			if (sock->state != SS_CONNECTED && scp->accept_mode == ACC_IMMED)
+				return -ENOTCONN;
+
+			if (optlen != sizeof(struct optdata_dn))
+				return -EINVAL;
+
+			if (le16_to_cpu(u.opt.opt_optl) > 16)
+				return -EINVAL;
+
+			memcpy(&scp->discdata_out, &u.opt, optlen);
+			break;
+
+		case DSO_CONACCESS: /* access control data */
+			if (sock->state == SS_CONNECTED)
+				return -EISCONN;
+			if (scp->state != DN_O)
+				return -EINVAL;
+
+			if (optlen != sizeof(struct accessdata_dn))
+				return -EINVAL;
+
+			if ((u.acc.acc_accl > DN_MAXACCL) ||
+					(u.acc.acc_passl > DN_MAXACCL) ||
+					(u.acc.acc_userl > DN_MAXACCL))
+				return -EINVAL;
+
+			memcpy(&scp->accessdata, &u.acc, optlen);
+			break;
+
+		case DSO_ACCEPTMODE: /* ACC_IMMED or ACC_DEFER */
+			if (sock->state == SS_CONNECTED)
+				return -EISCONN;
+			if (scp->state != DN_O)
+				return -EINVAL;
+
+			if (optlen != sizeof(int))
+				return -EINVAL;
+
+			if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER))
+				return -EINVAL;
+
+			scp->accept_mode = (unsigned char)u.mode;
+			break;
+
+		case DSO_CONACCEPT: /* confirm a pending connection; may sleep */
+
+			if (scp->state != DN_CR)
+				return -EINVAL;
+			timeo = sock_rcvtimeo(sk, 0);
+			err = dn_confirm_accept(sk, &timeo, sk->sk_allocation);
+			return err;
+
+		case DSO_CONREJECT: /* reject a pending connection */
+
+			if (scp->state != DN_CR)
+				return -EINVAL;
+
+			scp->state = DN_DR;
+			sk->sk_shutdown = SHUTDOWN_MASK;
+			dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation);
+			break;
+
+		default: /* without CONFIG_NETFILTER this falls into the -ENOPROTOOPT cases */
+#ifdef CONFIG_NETFILTER
+		return nf_setsockopt(sk, PF_DECnet, optname, optval, optlen);
+#endif
+		case DSO_LINKINFO:
+		case DSO_STREAM:
+		case DSO_SEQPACKET:
+			return -ENOPROTOOPT;
+
+		case DSO_MAXWINDOW:
+			if (optlen != sizeof(unsigned long))
+				return -EINVAL;
+			if (u.win > NSP_MAX_WINDOW) /* silently clamp to the maximum */
+				u.win = NSP_MAX_WINDOW;
+			if (u.win == 0)
+				return -EINVAL;
+			scp->max_window = u.win;
+			if (scp->snd_window > u.win) /* shrink the current window to fit */
+				scp->snd_window = u.win;
+			break;
+
+		case DSO_NODELAY: /* nonagle: 0 = off, 1 = nodelay, 2 = cork */
+			if (optlen != sizeof(int))
+				return -EINVAL;
+			if (scp->nonagle == 2) /* refused while corked */
+				return -EINVAL;
+			scp->nonagle = (u.val == 0) ? 0 : 1;
+			/* if (scp->nonagle == 1) { Push pending frames } */
+			break;
+
+		case DSO_CORK:
+			if (optlen != sizeof(int))
+				return -EINVAL;
+			if (scp->nonagle == 1) /* refused while nodelay is set */
+				return -EINVAL;
+			scp->nonagle = (u.val == 0) ? 0 : 2;
+			/* if (scp->nonagle == 0) { Push pending frames } */
+			break;
+
+		case DSO_SERVICES:
+			if (optlen != sizeof(unsigned char))
+				return -EINVAL;
+			if ((u.services & ~NSP_FC_MASK) != 0x01) /* bits outside the FC mask must equal 0x01 */
+				return -EINVAL;
+			if ((u.services & NSP_FC_MASK) == NSP_FC_MASK) /* all FC bits set is rejected */
+				return -EINVAL;
+			scp->services_loc = u.services;
+			break;
+
+		case DSO_INFO:
+			if (optlen != sizeof(unsigned char))
+				return -EINVAL;
+			if (u.info & 0xfc) /* only the low two bits may be set */
+				return -EINVAL;
+			scp->info_loc = u.info;
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * getsockopt() entry point: run __dn_getsockopt() with the socket lock
+ * held and return its result.
+ */
+static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	rc = __dn_getsockopt(sock, level, optname, optval, optlen, 0);
+	release_sock(sk);
+
+	return rc;
+}
+/*
+ * Read a DECnet socket option; dn_getsockopt() calls this with the
+ * socket lock held.
+ *
+ * The caller's buffer length is read from @optlen and clamped to the
+ * size of the selected option.  Each case points r_data at the value to
+ * return; the common tail copies r_len bytes to @optval and writes the
+ * length actually used back to @optlen.  @level and @flags are not used
+ * here.
+ *
+ * Returns 0 on success, -EFAULT on a faulting user access, and
+ * -ENOPROTOOPT for options that cannot be read.  Unknown options go to
+ * nf_getsockopt() when CONFIG_NETFILTER is set.
+ */
+static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags)
+{
+	struct	sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	struct linkinfo_dn link;
+	unsigned int r_len;
+	void *r_data = NULL;
+	unsigned int val;
+
+	if(get_user(r_len , optlen))
+		return -EFAULT;
+
+	switch(optname) {
+		case DSO_CONDATA: /* connect data received from the peer */
+			if (r_len > sizeof(struct optdata_dn))
+				r_len = sizeof(struct optdata_dn);
+			r_data = &scp->conndata_in;
+			break;
+
+		case DSO_DISDATA: /* disconnect data received from the peer */
+			if (r_len > sizeof(struct optdata_dn))
+				r_len = sizeof(struct optdata_dn);
+			r_data = &scp->discdata_in;
+			break;
+
+		case DSO_CONACCESS:
+			if (r_len > sizeof(struct accessdata_dn))
+				r_len = sizeof(struct accessdata_dn);
+			r_data = &scp->accessdata;
+			break;
+
+		case DSO_ACCEPTMODE: /* returned as a single byte */
+			if (r_len > sizeof(unsigned char))
+				r_len = sizeof(unsigned char);
+			r_data = &scp->accept_mode;
+			break;
+
+		case DSO_LINKINFO:
+			if (r_len > sizeof(struct linkinfo_dn))
+				r_len = sizeof(struct linkinfo_dn);
+
+			memset(&link, 0, sizeof(link)); /* zero the whole struct before it is copied out */
+
+			switch(sock->state) {
+				case SS_CONNECTING:
+					link.idn_linkstate = LL_CONNECTING;
+					break;
+				case SS_DISCONNECTING:
+					link.idn_linkstate = LL_DISCONNECTING;
+					break;
+				case SS_CONNECTED:
+					link.idn_linkstate = LL_RUNNING;
+					break;
+				default:
+					link.idn_linkstate = LL_INACTIVE;
+			}
+
+			link.idn_segsize = scp->segsize_rem;
+			r_data = &link;
+			break;
+
+		default: /* without CONFIG_NETFILTER this falls into the -ENOPROTOOPT cases */
+#ifdef CONFIG_NETFILTER
+		{
+			int ret, len;
+
+			if(get_user(len, optlen))
+				return -EFAULT;
+
+			ret = nf_getsockopt(sk, PF_DECnet, optname,
+							optval, &len);
+			if (ret >= 0)
+				ret = put_user(len, optlen);
+			return ret;
+		}
+#endif
+		case DSO_STREAM:
+		case DSO_SEQPACKET:
+		case DSO_CONACCEPT:
+		case DSO_CONREJECT:
+			return -ENOPROTOOPT;
+
+		case DSO_MAXWINDOW:
+			if (r_len > sizeof(unsigned long))
+				r_len = sizeof(unsigned long);
+			r_data = &scp->max_window;
+			break;
+
+		case DSO_NODELAY:
+			if (r_len > sizeof(int))
+				r_len = sizeof(int);
+			val = (scp->nonagle == 1); /* 1 when nodelay is set */
+			r_data = &val;
+			break;
+
+		case DSO_CORK:
+			if (r_len > sizeof(int))
+				r_len = sizeof(int);
+			val = (scp->nonagle == 2); /* 1 when corked */
+			r_data = &val;
+			break;
+
+		case DSO_SERVICES: /* reports the remote services byte, not the local one */
+			if (r_len > sizeof(unsigned char))
+				r_len = sizeof(unsigned char);
+			r_data = &scp->services_rem;
+			break;
+
+		case DSO_INFO: /* reports the remote info byte, not the local one */
+			if (r_len > sizeof(unsigned char))
+				r_len = sizeof(unsigned char);
+			r_data = &scp->info_rem;
+			break;
+	}
+
+	if (r_data) {
+		if (copy_to_user(optval, r_data, r_len))
+			return -EFAULT;
+		if (put_user(r_len, optlen))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Decide whether a read on queue @q can be satisfied now.
+ *
+ * With MSG_OOB any queued skb is enough.  Otherwise the queue is walked,
+ * accumulating lengths, and the read is ready as soon as either
+ *  - an skb with the 0x40 bit set in its NSP flags is reached and the
+ *    socket is SOCK_SEQPACKET or MSG_WAITALL was not requested, or
+ *  - the accumulated length reaches @target.
+ *
+ * Returns 1 when ready, 0 otherwise.
+ */
+static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target)
+{
+	struct sk_buff *skb;
+	int queued = 0;
+
+	if (flags & MSG_OOB)
+		return skb_queue_empty(q) ? 0 : 1;
+
+	skb_queue_walk(q, skb) {
+		queued += skb->len;
+
+		if (DN_SKB_CB(skb)->nsp_flags & 0x40) {
+			/*
+			 * SOCK_SEQPACKET always reads up to this skb;
+			 * SOCK_STREAM does too unless WAITALL is given
+			 */
+			if (sk->sk_type == SOCK_SEQPACKET ||
+			    !(flags & MSG_WAITALL))
+				return 1;
+		}
+
+		/* minimum data length for read exceeded */
+		if (queued >= target)
+			return 1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * recvmsg() entry point.
+ *
+ * Reads from the normal receive queue, or from the other-data queue when
+ * MSG_OOB is given.  The socket must be bound (-EADDRNOTAVAIL) and is
+ * brought into the running state through dn_check_state() first.  Unless
+ * MSG_DONTWAIT is set (-EWOULDBLOCK) the call sleeps until
+ * dn_data_ready() reports enough data; with MSG_WAITALL that means the
+ * full @size.
+ *
+ * Data is copied skb by skb.  Without MSG_PEEK the copied bytes are
+ * pulled, and skbs that become empty are freed.  Copying stops at an
+ * end-of-message skb (always for SOCK_SEQPACKET, and for SOCK_STREAM
+ * unless MSG_WAITALL), after one skb for MSG_OOB, or once the target
+ * length has been reached.
+ *
+ * Returns the number of bytes copied.  If copying to user space faults,
+ * the bytes already delivered are returned, or -EFAULT when there are
+ * none.  A zero result is replaced by the pending socket error, if any.
+ * On a non-negative result the peer address is stored in msg_name when
+ * the caller supplied one.
+ */
+static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
+	struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	struct sk_buff_head *queue = &sk->sk_receive_queue;
+	size_t target = size > 1 ? 1 : 0;
+	size_t copied = 0;
+	int rv = 0;
+	struct sk_buff *skb, *n;
+	struct dn_skb_cb *cb = NULL;
+	unsigned char eor = 0;
+	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		rv = -EADDRNOTAVAIL;
+		goto out;
+	}
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN) {
+		rv = 0;
+		goto out;
+	}
+
+	rv = dn_check_state(sk, NULL, 0, &timeo, flags);
+	if (rv)
+		goto out;
+
+	if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
+		rv = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (flags & MSG_OOB)
+		queue = &scp->other_receive_queue;
+
+	if (flags & MSG_WAITALL)
+		target = size;
+
+
+	/*
+	 * See if there is data ready to read, sleep if there isn't
+	 */
+	for(;;) {
+		DEFINE_WAIT(wait);
+
+		if (sk->sk_err)
+			goto out;
+
+		/*
+		 * Other-data is pending but the caller asked for normal
+		 * data: flag it, and the first time round return at once
+		 * so that the caller sees the flag.
+		 */
+		if (!skb_queue_empty(&scp->other_receive_queue)) {
+			if (!(flags & MSG_OOB)) {
+				msg->msg_flags |= MSG_OOB;
+				if (!scp->other_report) {
+					scp->other_report = 1;
+					goto out;
+				}
+			}
+		}
+
+		if (scp->state != DN_RUN)
+			goto out;
+
+		if (signal_pending(current)) {
+			rv = sock_intr_errno(timeo);
+			goto out;
+		}
+
+		if (dn_data_ready(sk, queue, flags, target))
+			break;
+
+		if (flags & MSG_DONTWAIT) {
+			rv = -EWOULDBLOCK;
+			goto out;
+		}
+
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
+		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		finish_wait(sk_sleep(sk), &wait);
+	}
+
+	skb_queue_walk_safe(queue, skb, n) {
+		unsigned int chunk = skb->len;
+		cb = DN_SKB_CB(skb);
+
+		/* never copy more than the caller's buffer can hold */
+		if ((chunk + copied) > size)
+			chunk = size - copied;
+
+		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+			rv = -EFAULT;
+			break;
+		}
+		copied += chunk;
+
+		if (!(flags & MSG_PEEK))
+			skb_pull(skb, chunk);
+
+		/* remember the end-of-message bit before the skb may go away */
+		eor = cb->nsp_flags & 0x40;
+
+		if (skb->len == 0) {
+			skb_unlink(skb, queue);
+			kfree_skb(skb);
+			/*
+			 * N.B. Don't refer to skb or cb after this point
+			 * in loop.
+			 */
+			if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) {
+				scp->flowloc_sw = DN_SEND;
+				dn_nsp_send_link(sk, DN_SEND, 0);
+			}
+		}
+
+		if (eor) {
+			if (sk->sk_type == SOCK_SEQPACKET)
+				break;
+			if (!(flags & MSG_WAITALL))
+				break;
+		}
+
+		if (flags & MSG_OOB)
+			break;
+
+		if (copied >= target)
+			break;
+	}
+
+	/*
+	 * Report what was delivered.  rv is only non-zero here after a
+	 * faulting copy; keep that -EFAULT unless some data did get
+	 * through, so a bad buffer is not reported as an empty read.
+	 */
+	if (copied > 0 || rv == 0)
+		rv = copied;
+
+
+	if (eor && (sk->sk_type == SOCK_SEQPACKET))
+		msg->msg_flags |= MSG_EOR;
+
+out:
+	if (rv == 0)
+		rv = (flags & MSG_PEEK) ? -sk->sk_err : sock_error(sk);
+
+	if ((rv >= 0) && msg->msg_name) {
+		memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn));
+		msg->msg_namelen = sizeof(struct sockaddr_dn);
+	}
+
+	release_sock(sk);
+
+	return rv;
+}
+
+
+/*
+ * Check whether another packet may be queued for transmission.
+ *
+ * Returns 1 (too long) when the queue already holds snd_window packets,
+ * or when the remote end uses flow control and the remote flow count for
+ * the relevant sub-channel (other-data for MSG_OOB, data otherwise) is
+ * zero.  Returns 0 otherwise.
+ */
+static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags)
+{
+	unsigned char fc = scp->services_rem & NSP_FC_MASK;
+
+	if (skb_queue_len(queue) >= scp->snd_window)
+		return 1;
+
+	if (fc == NSP_FC_NONE)
+		return 0;
+
+	if (flags & MSG_OOB)
+		return scp->flowrem_oth == 0;
+
+	return scp->flowrem_dat == 0;
+}
+
+/*
+ * The DECnet spec requires that the "routing layer" accepts packets which
+ * are at least 230 bytes in size. This excludes any headers which the NSP
+ * layer might add, so we always assume that we'll be using the maximal
+ * length header on data packets. The variation in length is due to the
+ * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't
+ * make much practical difference.
+ *
+ * @dev: outgoing device, or NULL if unknown
+ * @mtu: path MTU to derive the segment size from
+ *
+ * The link layer, routing header (21 bytes long, 6 bytes short) and
+ * maximal NSP data header are subtracted from @mtu.  Without a device a
+ * long routing header and a 16 byte MAC header are assumed.
+ *
+ * Returns the resulting segment size, never less than
+ * 230 - DN_MAX_NSP_DATA_HEADER.
+ */
+unsigned dn_mss_from_pmtu(struct net_device *dev, int mtu)
+{
+	unsigned mss = 230 - DN_MAX_NSP_DATA_HEADER;
+	if (dev) {
+		struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+		mtu -= LL_RESERVED_SPACE(dev);
+		if (dn_db->use_long)
+			mtu -= 21;
+		else
+			mtu -= 6;
+		mtu -= DN_MAX_NSP_DATA_HEADER;
+	} else {
+		/*
+		 * 21 = long header, 16 = guess at MAC header length
+		 */
+		mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16);
+	}
+	/*
+	 * mtu is signed and may have gone negative above. Compare it as a
+	 * signed value first, otherwise it would be converted to a huge
+	 * unsigned number and returned as the MSS.
+	 */
+	if (mtu > 0 && (unsigned)mtu > mss)
+		mss = mtu;
+	return mss;
+}
+
+/*
+ * Work out the largest segment that may be sent on @sk right now.
+ *
+ * MSG_OOB ("other data") messages are always limited to 16 bytes. For
+ * normal data the result is the smallest of the local segment size, the
+ * remote segment size and, when the socket has a cached route, the MSS
+ * derived from that route's MTU.
+ */
+static inline unsigned int dn_current_mss(struct sock *sk, int flags)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct dst_entry *route = __sk_dst_get(sk);
+	int limit;
+
+	if (flags & MSG_OOB)
+		return 16;
+
+	limit = min_t(int, scp->segsize_loc, scp->segsize_rem);
+
+	if (route)
+		limit = min_t(int, dn_mss_from_pmtu(route->dev, dst_mtu(route)), limit);
+
+	return limit;
+}
+
+/*
+ * Allocate a send buffer through sock_alloc_send_skb() and tag it as an
+ * outgoing DECnet routing packet. Returns NULL when the allocation
+ * fails; @errcode is passed straight through to sock_alloc_send_skb().
+ *
+ * N.B. We get the timeout wrong here, but then we always did get it
+ * wrong before and this is another step along the road to correcting
+ * it. It ought to get updated each time we pass through the routine,
+ * but in practice it probably doesn't matter too much for now.
+ */
+static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
+			      unsigned long datalen, int noblock,
+			      int *errcode)
+{
+	struct sk_buff *skb;
+
+	skb = sock_alloc_send_skb(sk, datalen, noblock, errcode);
+	if (!skb)
+		return NULL;
+
+	skb->protocol = htons(ETH_P_DNA_RT);
+	skb->pkt_type = PACKET_OUTGOING;
+	return skb;
+}
+
+/*
+ * dn_sendmsg - queue user data on a DECnet socket for transmission.
+ *
+ * The @size bytes described by msg->msg_iov are cut into segments of at
+ * most the current MSS. Each segment is copied into a freshly allocated
+ * skb, given its NSP flags and handed to dn_nsp_queue_xmit(). With MSG_OOB
+ * the data goes onto the "other data" queue and must fit in one segment
+ * (-EMSGSIZE otherwise).
+ *
+ * The socket lock is held throughout, except while sk_wait_event() waits
+ * for the transmit queue to drain.
+ *
+ * Returns the number of bytes queued if any were queued; otherwise the
+ * error code (0 or a negative errno). Failures detected before the main
+ * loop by dn_check_state() or the shutdown check are passed through
+ * sk_stream_error().
+ */
+static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
+		      struct msghdr *msg, size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	size_t mss;
+	struct sk_buff_head *queue = &scp->data_xmit_queue;
+	int flags = msg->msg_flags;
+	int err = 0;
+	size_t sent = 0;
+	int addr_len = msg->msg_namelen;
+	struct sockaddr_dn *addr = (struct sockaddr_dn *)msg->msg_name;
+	struct sk_buff *skb = NULL;
+	struct dn_skb_cb *cb;
+	size_t len;
+	unsigned char fctype;
+	long timeo;
+
+	if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
+		return -EOPNOTSUPP;
+
+	if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
+		return -EINVAL;
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	/*
+	 * The only difference between stream sockets and sequenced packet
+	 * sockets is that the stream sockets always behave as if MSG_EOR
+	 * has been set.
+	 */
+	if (sock->type == SOCK_STREAM) {
+		if (flags & MSG_EOR) {
+			err = -EINVAL;
+			goto out;
+		}
+		flags |= MSG_EOR;
+	}
+
+
+	err = dn_check_state(sk, addr, addr_len, &timeo, flags);
+	if (err)
+		goto out_err;
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		err = -EPIPE;
+		if (!(flags & MSG_NOSIGNAL))
+			send_sig(SIGPIPE, current, 0);
+		goto out_err;
+	}
+
+	if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
+		dst_negative_advice(sk);
+
+	fctype = scp->services_rem & NSP_FC_MASK;
+
+	/* dn_current_mss() already takes the remote segment size into account */
+	mss = dn_current_mss(sk, flags);
+
+	if (flags & MSG_OOB) {
+		queue = &scp->other_xmit_queue;
+		if (size > mss) {
+			err = -EMSGSIZE;
+			goto out;
+		}
+	}
+
+	scp->persist_fxn = dn_nsp_xmit_timeout;
+
+	while(sent < size) {
+		err = sock_error(sk);
+		if (err)
+			goto out;
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			goto out;
+		}
+
+		/*
+		 * Calculate size that we wish to send.
+		 */
+		len = size - sent;
+
+		if (len > mss)
+			len = mss;
+
+		/*
+		 * Wait for queue size to go down below the window
+		 * size.
+		 */
+		if (dn_queue_too_long(scp, queue, flags)) {
+			DEFINE_WAIT(wait);
+
+			if (flags & MSG_DONTWAIT) {
+				err = -EWOULDBLOCK;
+				goto out;
+			}
+
+			prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+			set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+			sk_wait_event(sk, &timeo,
+				      !dn_queue_too_long(scp, queue, flags));
+			clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+			finish_wait(sk_sleep(sk), &wait);
+			/* Re-run the error and signal checks before retrying. */
+			continue;
+		}
+
+		/*
+		 * Get a suitably sized skb.
+		 * 64 is a bit of a hack really, but it's larger than any
+		 * link-layer headers and has served us well as a good
+		 * guess as to their real length.
+		 */
+		skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
+					 flags & MSG_DONTWAIT, &err);
+
+		if (err)
+			break;
+
+		if (!skb)
+			continue;
+
+		cb = DN_SKB_CB(skb);
+
+		skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
+
+		if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		if (flags & MSG_OOB) {
+			cb->nsp_flags = 0x30;
+			if (fctype != NSP_FC_NONE)
+				scp->flowrem_oth--;
+		} else {
+			cb->nsp_flags = 0x00;
+			/* 0x20 is set on the first segment of a message */
+			if (scp->seg_total == 0)
+				cb->nsp_flags |= 0x20;
+
+			scp->seg_total += len;
+
+			/* 0x40 is set on the last segment when MSG_EOR applies */
+			if (((sent + len) == size) && (flags & MSG_EOR)) {
+				cb->nsp_flags |= 0x40;
+				scp->seg_total = 0;
+				if (fctype == NSP_FC_SCMC)
+					scp->flowrem_dat--;
+			}
+			if (fctype == NSP_FC_SRC)
+				scp->flowrem_dat--;
+		}
+
+		sent += len;
+		dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB);
+		/* The skb was handed to the transmit queue; don't free it below. */
+		skb = NULL;
+
+		scp->persist = dn_nsp_persist(sk);
+
+	}
+out:
+
+	/* Non-NULL only when the copy from user space failed. */
+	kfree_skb(skb);
+
+	release_sock(sk);
+
+	return sent ? sent : err;
+
+out_err:
+	err = sk_stream_error(sk, flags, err);
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Netdevice notifier callback: bring the DECnet side of a device up or
+ * down when the device itself goes up or down. Devices outside init_net
+ * and all other events are ignored. Always returns NOTIFY_DONE.
+ */
+static int dn_device_event(struct notifier_block *this, unsigned long event,
+			void *ptr)
+{
+	struct net_device *dev = (struct net_device *)ptr;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_UP)
+		dn_dev_up(dev);
+	else if (event == NETDEV_DOWN)
+		dn_dev_down(dev);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that routes netdevice events to dn_device_event(). */
+static struct notifier_block dn_dev_notifier = {
+	.notifier_call = dn_device_event,
+};
+
+extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
+
+/* Packet handler: frames of type ETH_P_DNA_RT are passed to dn_route_rcv(). */
+static struct packet_type dn_dix_packet_type __read_mostly = {
+	.type =		cpu_to_be16(ETH_P_DNA_RT),
+	.func =		dn_route_rcv,
+};
+
+#ifdef CONFIG_PROC_FS
+/* Per-open seq_file state for the socket listing in /proc. */
+struct dn_iter_state {
+	int bucket;	/* index into dn_sk_hash currently being walked */
+};
+
+/*
+ * Find the first socket in dn_sk_hash, scanning buckets from 0 upwards.
+ * On success the iterator state is left pointing at the bucket that holds
+ * the returned socket. Returns NULL if every bucket is empty.
+ */
+static struct sock *dn_socket_get_first(struct seq_file *seq)
+{
+	struct dn_iter_state *state = seq->private;
+	struct sock *sk = NULL;
+
+	state->bucket = 0;
+	while (state->bucket < DN_SK_HASH_SIZE) {
+		sk = sk_head(&dn_sk_hash[state->bucket]);
+		if (sk)
+			return sk;
+		++state->bucket;
+	}
+
+	return sk;
+}
+
+/*
+ * Step to the socket after @n: the next one in the same hash chain, or
+ * failing that the head of the next non-empty bucket. Returns NULL once
+ * the last bucket has been exhausted.
+ */
+static struct sock *dn_socket_get_next(struct seq_file *seq,
+				       struct sock *n)
+{
+	struct dn_iter_state *state = seq->private;
+
+	n = sk_next(n);
+	while (!n) {
+		if (++state->bucket >= DN_SK_HASH_SIZE)
+			break;
+		n = sk_head(&dn_sk_hash[state->bucket]);
+	}
+
+	return n;
+}
+
+/*
+ * Return the socket at zero-based position *pos in iteration order, or
+ * NULL if there are not that many sockets. *pos is decremented once for
+ * every socket stepped over.
+ */
+static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct sock *sk = dn_socket_get_first(seq);
+
+	while (sk && *pos) {
+		sk = dn_socket_get_next(seq, sk);
+		if (sk)
+			--*pos;
+	}
+
+	if (*pos)
+		return NULL;
+	return sk;
+}
+
+/*
+ * Locked wrapper around socket_get_idx(). dn_hash_lock is taken for
+ * reading; it stays held when a socket is returned and is dropped again
+ * before returning NULL.
+ */
+static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos)
+{
+	void *found;
+
+	read_lock_bh(&dn_hash_lock);
+	found = socket_get_idx(seq, &pos);
+	if (found)
+		return found;
+
+	read_unlock_bh(&dn_hash_lock);
+	return found;
+}
+
+/*
+ * seq_file ->start: position 0 is the header line (SEQ_START_TOKEN);
+ * position N > 0 is the socket at index N - 1.
+ */
+static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return dn_socket_get_idx(seq, *pos - 1);
+}
+
+/*
+ * seq_file ->next: advance from the header to the first socket, or from
+ * one socket to the next. When the walk runs off the end of the table
+ * from a socket, dn_hash_lock is released here. *pos is always advanced.
+ */
+static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	void *next;
+
+	if (v == SEQ_START_TOKEN) {
+		next = dn_socket_get_idx(seq, 0);
+	} else {
+		next = dn_socket_get_next(seq, v);
+		if (!next)
+			read_unlock_bh(&dn_hash_lock);
+	}
+
+	++*pos;
+	return next;
+}
+
+/*
+ * seq_file ->stop: drop dn_hash_lock if the iteration stopped on a real
+ * socket. For NULL or the header token there is nothing to release here.
+ */
+static void dn_socket_seq_stop(struct seq_file *seq, void *v)
+{
+	if (v == NULL || v == SEQ_START_TOKEN)
+		return;
+
+	read_unlock_bh(&dn_hash_lock);
+}
+
+#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126)
+
+/*
+ * Render the object part of a DECnet socket address as printable text.
+ *
+ * @dn:  address to describe
+ * @buf: output buffer; callers in this file pass DN_MAXOBJL + 3 bytes
+ *
+ * When the object name length is zero the object number is printed in
+ * decimal. Otherwise the name is copied with every byte outside the
+ * printable ASCII range replaced by '.', and the result is NUL terminated.
+ */
+static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf)
+{
+	unsigned int len = le16_to_cpu(dn->sdn_objnamel);
+	unsigned int i;
+
+	if (len == 0) {
+		sprintf(buf, "%d", dn->sdn_objnum);
+		return;
+	}
+
+	/*
+	 * Never trust the stored length beyond the size of the name field:
+	 * an oversized value would overrun both sdn_objname and @buf.
+	 */
+	if (len > DN_MAXOBJL)
+		len = DN_MAXOBJL;
+
+	for (i = 0; i < len; i++) {
+		buf[i] = dn->sdn_objname[i];
+		if (IS_NOT_PRINTABLE(buf[i]))
+			buf[i] = '.';
+	}
+	buf[i] = 0;
+}
+
+/*
+ * Map a socket state code to the fixed-width (four character) label used
+ * in the /proc listing. Unknown states yield "????".
+ */
+static char *dn_state2asc(unsigned char state)
+{
+	static const struct {
+		unsigned char state;
+		char *label;
+	} labels[] = {
+		{ DN_O,   "OPEN" },
+		{ DN_CR,  "  CR" },
+		{ DN_DR,  "  DR" },
+		{ DN_DRC, " DRC" },
+		{ DN_CC,  "  CC" },
+		{ DN_CI,  "  CI" },
+		{ DN_NR,  "  NR" },
+		{ DN_NC,  "  NC" },
+		{ DN_CD,  "  CD" },
+		{ DN_RJ,  "  RJ" },
+		{ DN_RUN, " RUN" },
+		{ DN_DI,  "  DI" },
+		{ DN_DIC, " DIC" },
+		{ DN_DN,  "  DN" },
+		{ DN_CL,  "  CL" },
+		{ DN_CN,  "  CN" },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(labels) / sizeof(labels[0]); i++) {
+		if (labels[i].state == state)
+			return labels[i].label;
+	}
+
+	return "????";
+}
+
+/*
+ * Emit one line of the /proc socket listing for @sk.
+ *
+ * The line has two parallel halves, local then remote. Each half prints
+ * the node address, the link address, two pairs of sequence/ack counters,
+ * the flow control switch and the printable object name. The line ends
+ * with the socket state and the accept mode ("IMMED" or "DEFER").
+ */
+static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	char buf1[DN_ASCBUF_LEN];
+	char buf2[DN_ASCBUF_LEN];
+	char local_object[DN_MAXOBJL+3];
+	char remote_object[DN_MAXOBJL+3];
+
+	dn_printable_object(&scp->addr, local_object);
+	dn_printable_object(&scp->peer, remote_object);
+
+	seq_printf(seq,
+		   "%6s/%04X %04d:%04d %04d:%04d %01d %-16s "
+		   "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n",
+		   dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->addr)), buf1),
+		   scp->addrloc,
+		   scp->numdat,
+		   scp->numoth,
+		   scp->ackxmt_dat,
+		   scp->ackxmt_oth,
+		   scp->flowloc_sw,
+		   local_object,
+		   dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->peer)), buf2),
+		   scp->addrrem,
+		   scp->numdat_rcv,
+		   scp->numoth_rcv,
+		   scp->ackrcv_dat,
+		   scp->ackrcv_oth,
+		   scp->flowrem_sw,
+		   remote_object,
+		   dn_state2asc(scp->state),
+		   ((scp->accept_mode == ACC_IMMED) ? "IMMED" : "DEFER"));
+}
+
+/*
+ * seq_file ->show: print the column header for the start token, or one
+ * formatted socket line for anything else. Always returns 0.
+ */
+static int dn_socket_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN) {
+		dn_socket_format_entry(seq, v);
+		return 0;
+	}
+
+	seq_puts(seq, "Local                                              Remote\n");
+	return 0;
+}
+
+/* seq_file iterator callbacks for the DECnet socket listing. */
+static const struct seq_operations dn_socket_seq_ops = {
+	.start	= dn_socket_seq_start,
+	.next	= dn_socket_seq_next,
+	.stop	= dn_socket_seq_stop,
+	.show	= dn_socket_seq_show,
+};
+
+/*
+ * open() handler for the proc file: set up a seq_file with a private
+ * struct dn_iter_state attached.
+ */
+static int dn_socket_seq_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	rc = seq_open_private(file, &dn_socket_seq_ops,
+			      sizeof(struct dn_iter_state));
+	return rc;
+}
+
+/*
+ * File operations for the proc entry. Release uses seq_release_private
+ * to pair with seq_open_private() in dn_socket_seq_open().
+ */
+static const struct file_operations dn_socket_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= dn_socket_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+#endif
+
+/* Address family registration: AF_DECnet sockets are created by dn_create(). */
+static const struct net_proto_family	dn_family_ops = {
+	.family =	AF_DECnet,
+	.create =	dn_create,
+	.owner	=	THIS_MODULE,
+};
+
+/*
+ * Socket-level operations for AF_DECnet. socketpair, mmap and sendpage
+ * are wired to the generic sock_no_*() handlers.
+ */
+static const struct proto_ops dn_proto_ops = {
+	.family =	AF_DECnet,
+	.owner =	THIS_MODULE,
+	.release =	dn_release,
+	.bind =		dn_bind,
+	.connect =	dn_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	dn_accept,
+	.getname =	dn_getname,
+	.poll =		dn_poll,
+	.ioctl =	dn_ioctl,
+	.listen =	dn_listen,
+	.shutdown =	dn_shutdown,
+	.setsockopt =	dn_setsockopt,
+	.getsockopt =	dn_getsockopt,
+	.sendmsg =	dn_sendmsg,
+	.recvmsg =	dn_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+void dn_register_sysctl(void);
+void dn_unregister_sysctl(void);
+
+MODULE_DESCRIPTION("The Linux DECnet Network Protocol");
+MODULE_AUTHOR("Linux DECnet Project Team");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_DECnet);
+
+static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n";
+
+/*
+ * Module initialisation: print the banner, register the protocol, bring
+ * up the neighbour, device, route and FIB layers, then register the
+ * socket family, packet handler, netdevice notifier, proc entry and
+ * sysctls.
+ *
+ * Returns the result of proto_register(); if that fails nothing else is
+ * set up. NOTE(review): the results of sock_register(),
+ * register_netdevice_notifier() and proc_net_fops_create() are not
+ * checked here.
+ */
+static int __init decnet_init(void)
+{
+	int rc;
+
+	printk(banner);
+
+	rc = proto_register(&dn_proto, 1);
+	if (rc != 0)
+		goto out;
+
+	dn_neigh_init();
+	dn_dev_init();
+	dn_route_init();
+	dn_fib_init();
+
+	sock_register(&dn_family_ops);
+	dev_add_pack(&dn_dix_packet_type);
+	register_netdevice_notifier(&dn_dev_notifier);
+
+	proc_net_fops_create(&init_net, "decnet", S_IRUGO, &dn_socket_seq_fops);
+	dn_register_sysctl();
+out:
+	return rc;
+
+}
+module_init(decnet_init);
+
+/*
+ * Prevent DECnet module unloading until it's fixed properly.
+ * Requires an audit of the code to check for memory leaks and
+ * initialisation problems etc.
+ */
+#if 0
+/*
+ * Module teardown (currently compiled out by the enclosing #if 0).
+ * Unregisters what decnet_init() registered, shuts down the sub-layers
+ * and waits for outstanding call_rcu_bh() callbacks.
+ */
+static void __exit decnet_exit(void)
+{
+	sock_unregister(AF_DECnet);
+	rtnl_unregister_all(PF_DECnet);
+	dev_remove_pack(&dn_dix_packet_type);
+
+	dn_unregister_sysctl();
+
+	unregister_netdevice_notifier(&dn_dev_notifier);
+
+	dn_route_cleanup();
+	dn_dev_cleanup();
+	dn_neigh_cleanup();
+	dn_fib_cleanup();
+
+	proc_net_remove(&init_net, "decnet");
+
+	proto_unregister(&dn_proto);
+
+	rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */
+}
+module_exit(decnet_exit);
+#endif
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
new file mode 100644
index 00000000..cf26ac74
--- /dev/null
+++ b/net/decnet/dn_dev.c
@@ -0,0 +1,1445 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Device Layer
+ *
+ * Authors:     Steve Whitehouse <SteveW@ACM.org>
+ *              Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ *          Steve Whitehouse : Devices now see incoming frames so they
+ *                             can mark on who it came from.
+ *          Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour
+ *                             can now have a device specific setup func.
+ *          Steve Whitehouse : Added /proc/sys/net/decnet/conf/<dev>/
+ *          Steve Whitehouse : Fixed bug which sometimes killed timer
+ *          Steve Whitehouse : Multiple ifaddr support
+ *          Steve Whitehouse : SIOCGIFCONF is now a compile time option
+ *          Steve Whitehouse : /proc/sys/net/decnet/conf/<sys>/forwarding
+ *          Steve Whitehouse : Removed timer1 - it's a user space issue now
+ *         Patrick Caulfield : Fixed router hello message format
+ *          Steve Whitehouse : Got rid of constant sizes for blksize for
+ *                             devices. All mtu based now.
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/netlink.h>
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+#include <net/dn_neigh.h>
+#include <net/dn_fib.h>
+
+#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn))
+
+static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00};
+static char dn_rt_all_rt_mcast[ETH_ALEN]  = {0xAB,0x00,0x00,0x03,0x00,0x00};
+static char dn_hiord[ETH_ALEN]            = {0xAA,0x00,0x04,0x00,0x00,0x00};
+static unsigned char dn_eco_version[3]    = {0x02,0x00,0x00};
+
+extern struct neigh_table dn_neigh_table;
+
+/*
+ * decnet_address is kept in network order.
+ */
+__le16 decnet_address = 0;
+
+static DEFINE_SPINLOCK(dndev_lock);
+static struct net_device *decnet_default_device;
+static BLOCKING_NOTIFIER_HEAD(dnaddr_chain);
+
+static struct dn_dev *dn_dev_create(struct net_device *dev, int *err);
+static void dn_dev_delete(struct net_device *dev);
+static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa);
+
+static int dn_eth_up(struct net_device *);
+static void dn_eth_down(struct net_device *);
+static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa);
+static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa);
+
+/*
+ * Default DECnet parameters for each supported device type, keyed by the
+ * ARPHRD_* hardware type. Each entry gives the mode (broadcast or unicast),
+ * the initial state, the t2 and t3 timer values and the routine used for
+ * timer3 hello messages. Only the ethernet entry supplies up/down hooks.
+ * The X.25 and PPP entries are compiled out.
+ */
+static struct dn_dev_parms dn_dev_list[] =  {
+{
+	.type =		ARPHRD_ETHER, /* Ethernet */
+	.mode =		DN_DEV_BCAST,
+	.state =	DN_DEV_S_RU,
+	.t2 =		1,
+	.t3 =		10,
+	.name =		"ethernet",
+	.up =		dn_eth_up,
+	.down = 	dn_eth_down,
+	.timer3 =	dn_send_brd_hello,
+},
+{
+	.type =		ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */
+	.mode =		DN_DEV_BCAST,
+	.state =	DN_DEV_S_RU,
+	.t2 =		1,
+	.t3 =		10,
+	.name =		"ipgre",
+	.timer3 =	dn_send_brd_hello,
+},
+#if 0
+{
+	.type =		ARPHRD_X25, /* Bog standard X.25 */
+	.mode =		DN_DEV_UCAST,
+	.state =	DN_DEV_S_DS,
+	.t2 =		1,
+	.t3 =		120,
+	.name =		"x25",
+	.timer3 =	dn_send_ptp_hello,
+},
+#endif
+#if 0
+{
+	.type =		ARPHRD_PPP, /* DECnet over PPP */
+	.mode =		DN_DEV_BCAST,
+	.state =	DN_DEV_S_RU,
+	.t2 =		1,
+	.t3 =		10,
+	.name =		"ppp",
+	.timer3 =	dn_send_brd_hello,
+},
+#endif
+{
+	.type =		ARPHRD_DDCMP, /* DECnet over DDCMP */
+	.mode =		DN_DEV_UCAST,
+	.state =	DN_DEV_S_DS,
+	.t2 =		1,
+	.t3 =		120,
+	.name =		"ddcmp",
+	.timer3 =	dn_send_ptp_hello,
+},
+{
+	.type =		ARPHRD_LOOPBACK, /* Loopback interface - always last */
+	.mode =		DN_DEV_BCAST,
+	.state =	DN_DEV_S_RU,
+	.t2 =		1,
+	.t3 =		10,
+	.name =		"loopback",
+	.timer3 =	dn_send_brd_hello,
+}
+};
+
+#define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list)
+
+#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x)
+
+#ifdef CONFIG_SYSCTL
+
+static int min_t2[] = { 1 };
+static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */
+static int min_t3[] = { 1 };
+static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */
+
+static int min_priority[1];
+static int max_priority[] = { 127 }; /* From DECnet spec */
+
+static int dn_forwarding_proc(ctl_table *, int,
+			void __user *, size_t *, loff_t *);
+/*
+ * Template for the per-device sysctl table. The .data fields hold byte
+ * offsets into struct dn_dev_parms rather than real pointers;
+ * dn_dev_sysctl_register() copies the template and turns each offset into
+ * an address inside the parms it is registering. The fifth entry is the
+ * zeroed terminator.
+ */
+static struct dn_dev_sysctl_table {
+	struct ctl_table_header *sysctl_header;
+	ctl_table dn_dev_vars[5];
+} dn_dev_sysctl = {
+	NULL,
+	{
+	{
+		.procname = "forwarding",
+		.data = (void *)DN_DEV_PARMS_OFFSET(forwarding),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = dn_forwarding_proc,
+	},
+	{
+		.procname = "priority",
+		.data = (void *)DN_DEV_PARMS_OFFSET(priority),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_priority,
+		.extra2 = &max_priority
+	},
+	{
+		.procname = "t2",
+		.data = (void *)DN_DEV_PARMS_OFFSET(t2),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_t2,
+		.extra2 = &max_t2
+	},
+	{
+		.procname = "t3",
+		.data = (void *)DN_DEV_PARMS_OFFSET(t3),
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_t3,
+		.extra2 = &max_t3
+	},
+	{0}
+	},
+};
+
+/*
+ * Register the sysctl directory net/decnet/conf/<name> for @parms.
+ *
+ * @dev:   device the parameters belong to, or NULL; when NULL the
+ *         directory is named after parms->name instead of dev->name
+ * @parms: parameter block whose fields the sysctl entries expose
+ *
+ * Failure is silent: if the table cannot be allocated or registered,
+ * parms->sysctl is not set by this function.
+ */
+static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
+{
+	struct dn_dev_sysctl_table *t;
+	int i;
+
+/* Index of the per-device component in dn_ctl_path below */
+#define DN_CTL_PATH_DEV	3
+
+	struct ctl_path dn_ctl_path[] = {
+		{ .procname = "net",  },
+		{ .procname = "decnet",  },
+		{ .procname = "conf",  },
+		{ /* to be set */ },
+		{ },
+	};
+
+	t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL);
+	if (t == NULL)
+		return;
+
+	/* Convert the template's field offsets into pointers into @parms */
+	for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) {
+		long offset = (long)t->dn_dev_vars[i].data;
+		t->dn_dev_vars[i].data = ((char *)parms) + offset;
+	}
+
+	if (dev) {
+		dn_ctl_path[DN_CTL_PATH_DEV].procname = dev->name;
+	} else {
+		dn_ctl_path[DN_CTL_PATH_DEV].procname = parms->name;
+	}
+
+	/* dn_forwarding_proc() reads the device back from extra1 */
+	t->dn_dev_vars[0].extra1 = (void *)dev;
+
+	t->sysctl_header = register_sysctl_paths(dn_ctl_path, t->dn_dev_vars);
+	if (t->sysctl_header == NULL)
+		kfree(t);
+	else
+		parms->sysctl = t;
+}
+
+/*
+ * Undo dn_dev_sysctl_register(): detach the table from @parms, unregister
+ * it and free it. Does nothing when no table is attached.
+ */
+static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
+{
+	struct dn_dev_sysctl_table *table;
+
+	if (!parms->sysctl)
+		return;
+
+	table = parms->sysctl;
+	parms->sysctl = NULL;
+	unregister_sysctl_table(table->sysctl_header);
+	kfree(table);
+}
+
+/*
+ * sysctl handler for the per-device "forwarding" entry.
+ *
+ * Without CONFIG_DECNET_ROUTER this always returns -EINVAL. Otherwise it
+ * returns -EINVAL when the table has no device in extra1. On a successful
+ * write the new value is clamped to the range 0..2, then the device's
+ * down() hook runs with the old value in place and the up() hook runs
+ * with the new one.
+ *
+ * NOTE(review): dev->dn_ptr is dereferenced without a NULL check.
+ */
+static int dn_forwarding_proc(ctl_table *table, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+#ifdef CONFIG_DECNET_ROUTER
+	struct net_device *dev = table->extra1;
+	struct dn_dev *dn_db;
+	int err;
+	int tmp, old;
+
+	if (table->extra1 == NULL)
+		return -EINVAL;
+
+	dn_db = rcu_dereference_raw(dev->dn_ptr);
+	old = dn_db->parms.forwarding;
+
+	err = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if ((err >= 0) && write) {
+		if (dn_db->parms.forwarding < 0)
+			dn_db->parms.forwarding = 0;
+		if (dn_db->parms.forwarding > 2)
+			dn_db->parms.forwarding = 2;
+		/*
+		 * What an ugly hack this is... it works, just. It
+		 * would be nice if sysctl/proc were just that little
+		 * bit more flexible so I don't have to write a special
+		 * routine, or suffer hacks like this - SJW
+		 */
+		tmp = dn_db->parms.forwarding;
+		dn_db->parms.forwarding = old;
+		if (dn_db->parms.down)
+			dn_db->parms.down(dev);
+		dn_db->parms.forwarding = tmp;
+		if (dn_db->parms.up)
+			dn_db->parms.up(dev);
+	}
+
+	return err;
+#else
+	return -EINVAL;
+#endif
+}
+
+#else /* CONFIG_SYSCTL */
+/* Without CONFIG_SYSCTL there is nothing to register or unregister. */
+static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
+{
+}
+static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
+{
+}
+
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Derive the 16-bit block size for a device from its MTU. The MTU is
+ * capped at 0xffff, and two bytes are subtracted for Ethernet, PPP,
+ * IP-GRE and loopback devices.
+ */
+static inline __u16 mtu2blksize(struct net_device *dev)
+{
+	u32 blksize = dev->mtu;
+
+	if (blksize > 0xffff)
+		blksize = 0xffff;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_PPP:
+	case ARPHRD_IPGRE:
+	case ARPHRD_LOOPBACK:
+		blksize -= 2;
+		break;
+	default:
+		break;
+	}
+
+	return (__u16)blksize;
+}
+
+/* Allocate a zero-filled interface address; returns NULL on failure. */
+static struct dn_ifaddr *dn_dev_alloc_ifa(void)
+{
+	return kzalloc(sizeof(struct dn_ifaddr), GFP_KERNEL);
+}
+
+/* Release an interface address through kfree_rcu() on its rcu member. */
+static void dn_dev_free_ifa(struct dn_ifaddr *ifa)
+{
+	kfree_rcu(ifa, rcu);
+}
+
+/*
+ * Unlink the address that *@ifap points at from its device's list.
+ *
+ * On Ethernet devices the multicast entry derived from the address is
+ * removed as well, unless the address matches the device's own hardware
+ * address. Listeners are told via an RTM_DELADDR message and the
+ * dnaddr_chain notifier. With @destroy set the address is freed, and the
+ * DECnet device data is deleted once its address list is empty.
+ *
+ * Must be called with RTNL held.
+ */
+static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy)
+{
+	struct dn_ifaddr *victim = rtnl_dereference(*ifap);
+	struct net_device *dev = dn_db->dev;
+	unsigned char mac_addr[6];
+
+	ASSERT_RTNL();
+
+	*ifap = victim->ifa_next;
+
+	if (dn_db->dev->type == ARPHRD_ETHER &&
+	    victim->ifa_local != dn_eth2dn(dev->dev_addr)) {
+		dn_dn2eth(mac_addr, victim->ifa_local);
+		dev_mc_del(dev, mac_addr);
+	}
+
+	dn_ifaddr_notify(RTM_DELADDR, victim);
+	blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, victim);
+
+	if (!destroy)
+		return;
+
+	dn_dev_free_ifa(victim);
+
+	if (dn_db->ifa_list == NULL)
+		dn_dev_delete(dn_db->dev);
+}
+
+/*
+ * Add @ifa at the head of the address list of @dn_db.
+ *
+ * Returns -EEXIST if the device already has an address with the same
+ * ifa_local, otherwise 0. On Ethernet devices a multicast entry for the
+ * address is added unless it matches the device's own hardware address.
+ * Listeners are told via an RTM_NEWADDR message and the dnaddr_chain
+ * notifier.
+ *
+ * Must be called with RTNL held.
+ */
+static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
+{
+	struct net_device *dev = dn_db->dev;
+	struct dn_ifaddr *cur;
+	unsigned char mac_addr[6];
+
+	ASSERT_RTNL();
+
+	/* Refuse duplicates */
+	cur = rtnl_dereference(dn_db->ifa_list);
+	while (cur != NULL) {
+		if (cur->ifa_local == ifa->ifa_local)
+			return -EEXIST;
+		cur = rtnl_dereference(cur->ifa_next);
+	}
+
+	if (dev->type == ARPHRD_ETHER &&
+	    ifa->ifa_local != dn_eth2dn(dev->dev_addr)) {
+		dn_dn2eth(mac_addr, ifa->ifa_local);
+		dev_mc_add(dev, mac_addr);
+	}
+
+	/* Fill in the next pointer before publishing the new head */
+	ifa->ifa_next = dn_db->ifa_list;
+	rcu_assign_pointer(dn_db->ifa_list, ifa);
+
+	dn_ifaddr_notify(RTM_NEWADDR, ifa);
+	blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
+
+	return 0;
+}
+
+/*
+ * Attach @ifa to @dev, creating the DECnet device data first if the
+ * device has none. Addresses on loopback devices get host scope.
+ *
+ * Returns 0 on success. If the device data cannot be created, the error
+ * from dn_dev_create() is returned and @ifa is left untouched. If the
+ * insert fails, @ifa is freed and the insert error is returned.
+ */
+static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+	struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+	int rv;
+
+	if (dn_db == NULL) {
+		dn_db = dn_dev_create(dev, &rv);
+		if (dn_db == NULL)
+			return rv;
+	}
+
+	ifa->ifa_dev = dn_db;
+	if (dev->flags & IFF_LOOPBACK)
+		ifa->ifa_scope = RT_SCOPE_HOST;
+
+	rv = dn_dev_insert_ifa(dn_db, ifa);
+	if (rv != 0)
+		dn_dev_free_ifa(ifa);
+
+	return rv;
+}
+
+
+/*
+ * Handle the SIOCGIFADDR and SIOCSIFADDR ioctls for DECnet.
+ *
+ * @cmd: ioctl command; anything other than the two above gives -EINVAL
+ * @arg: user pointer to an ifreq-like buffer of DN_IFREQ_SIZE bytes
+ *
+ * The interface is looked up by name in init_net under RTNL, and the
+ * address whose label equals the interface name is selected.
+ *
+ * SIOCGIFADDR copies that address back to user space, or returns
+ * -EADDRNOTAVAIL if there is none. SIOCSIFADDR needs CAP_NET_ADMIN and an
+ * AF_DECnet sockaddr; it creates the address if missing, otherwise it
+ * unlinks the existing one and re-inserts it with the new value.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int dn_dev_ioctl(unsigned int cmd, void __user *arg)
+{
+	char buffer[DN_IFREQ_SIZE];
+	struct ifreq *ifr = (struct ifreq *)buffer;
+	struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr;
+	struct dn_dev *dn_db;
+	struct net_device *dev;
+	struct dn_ifaddr *ifa = NULL;
+	struct dn_ifaddr __rcu **ifap = NULL;
+	int ret = 0;
+
+	if (copy_from_user(ifr, arg, DN_IFREQ_SIZE))
+		return -EFAULT;
+	/* Make sure the user-supplied name is NUL terminated */
+	ifr->ifr_name[IFNAMSIZ-1] = 0;
+
+	dev_load(&init_net, ifr->ifr_name);
+
+	switch(cmd) {
+		case SIOCGIFADDR:
+			break;
+		case SIOCSIFADDR:
+			if (!capable(CAP_NET_ADMIN))
+				return -EACCES;
+			if (sdn->sdn_family != AF_DECnet)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	rtnl_lock();
+
+	if ((dev = __dev_get_by_name(&init_net, ifr->ifr_name)) == NULL) {
+		ret = -ENODEV;
+		goto done;
+	}
+
+	/* Find the address labelled with the interface name, if any */
+	if ((dn_db = rtnl_dereference(dev->dn_ptr)) != NULL) {
+		for (ifap = &dn_db->ifa_list;
+		     (ifa = rtnl_dereference(*ifap)) != NULL;
+		     ifap = &ifa->ifa_next)
+			if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0)
+				break;
+	}
+
+	if (ifa == NULL && cmd != SIOCSIFADDR) {
+		ret = -EADDRNOTAVAIL;
+		goto done;
+	}
+
+	switch(cmd) {
+		case SIOCGIFADDR:
+			*((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local;
+			goto rarok;
+
+		case SIOCSIFADDR:
+			if (!ifa) {
+				if ((ifa = dn_dev_alloc_ifa()) == NULL) {
+					ret = -ENOBUFS;
+					break;
+				}
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+			} else {
+				/* Setting the same address again is a no-op */
+				if (ifa->ifa_local == dn_saddr2dn(sdn))
+					break;
+				/* Unlink without freeing; re-inserted below */
+				dn_dev_del_ifa(dn_db, ifap, 0);
+			}
+
+			ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn);
+
+			ret = dn_dev_set_ifa(dev, ifa);
+	}
+done:
+	rtnl_unlock();
+
+	return ret;
+rarok:
+	if (copy_to_user(arg, ifr, DN_IFREQ_SIZE))
+		ret = -EFAULT;
+	goto done;
+}
+
+/*
+ * Return the default DECnet device with a reference held, or NULL when
+ * none is set or the device has no DECnet data attached. The caller is
+ * responsible for the matching dev_put().
+ */
+struct net_device *dn_dev_get_default(void)
+{
+	struct net_device *dev;
+
+	spin_lock(&dndev_lock);
+	dev = decnet_default_device;
+	if (dev && !dev->dn_ptr)
+		dev = NULL;
+	if (dev)
+		dev_hold(dev);
+	spin_unlock(&dndev_lock);
+
+	return dev;
+}
+
+/*
+ * Make @dev the default DECnet device.
+ *
+ * Returns -ENODEV if @dev has no DECnet data, -EBUSY if a default is
+ * already set and @force is zero, and 0 on success. A default device
+ * that is replaced has dev_put() called on it. This function does not
+ * itself take a reference on @dev.
+ */
+int dn_dev_set_default(struct net_device *dev, int force)
+{
+	struct net_device *prev;
+
+	if (!dev->dn_ptr)
+		return -ENODEV;
+
+	spin_lock(&dndev_lock);
+	if (!force && decnet_default_device != NULL) {
+		spin_unlock(&dndev_lock);
+		return -EBUSY;
+	}
+	prev = decnet_default_device;
+	decnet_default_device = dev;
+	spin_unlock(&dndev_lock);
+
+	if (prev)
+		dev_put(prev);
+	return 0;
+}
+
+/*
+ * If @dev is the current default DECnet device, clear the default and
+ * call dev_put() on the device. Otherwise do nothing.
+ */
+static void dn_dev_check_default(struct net_device *dev)
+{
+	int was_default;
+
+	spin_lock(&dndev_lock);
+	was_default = (dev == decnet_default_device);
+	if (was_default)
+		decnet_default_device = NULL;
+	spin_unlock(&dndev_lock);
+
+	if (was_default && dev)
+		dev_put(dev);
+}
+
+/*
+ * Look up the DECnet device data for interface @ifindex in init_net.
+ * Returns NULL if the interface does not exist or has no DECnet data.
+ *
+ * Called with RTNL
+ */
+static struct dn_dev *dn_dev_by_index(int ifindex)
+{
+	struct net_device *dev = __dev_get_by_index(&init_net, ifindex);
+
+	if (!dev)
+		return NULL;
+
+	return rtnl_dereference(dev->dn_ptr);
+}
+
+/*
+ * Netlink attribute policy for DECnet address messages: addresses are
+ * 16-bit values and the label is a string of at most IFNAMSIZ - 1 bytes.
+ */
+static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = {
+	[IFA_ADDRESS]		= { .type = NLA_U16 },
+	[IFA_LOCAL]		= { .type = NLA_U16 },
+	[IFA_LABEL]		= { .type = NLA_STRING,
+				    .len = IFNAMSIZ - 1 },
+};
+
+/*
+ * Netlink handler that deletes a DECnet address.
+ *
+ * The first address on the interface that matches every supplied filter
+ * (IFA_LOCAL and/or IFA_LABEL) is removed and freed. With no filter
+ * attributes the first address on the interface is removed.
+ *
+ * Returns 0 on success, -EINVAL outside init_net, the nlmsg_parse()
+ * error on a malformed message, -ENODEV when the interface has no DECnet
+ * data and -EADDRNOTAVAIL when nothing matched.
+ */
+static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct dn_dev *dn_db;
+	struct ifaddrmsg *ifm;
+	struct dn_ifaddr *ifa;
+	struct dn_ifaddr __rcu **ifap;
+	int err;
+
+	if (!net_eq(net, &init_net))
+		return -EINVAL;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	dn_db = dn_dev_by_index(ifm->ifa_index);
+	if (dn_db == NULL)
+		return -ENODEV;
+
+	for (ifap = &dn_db->ifa_list;
+	     (ifa = rtnl_dereference(*ifap)) != NULL;
+	     ifap = &ifa->ifa_next) {
+		if (tb[IFA_LOCAL] &&
+		    nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2))
+			continue;
+
+		if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
+			continue;
+
+		dn_dev_del_ifa(dn_db, ifap, 1);
+		return 0;
+	}
+
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Netlink handler that adds a DECnet address to an interface.
+ *
+ * IFA_LOCAL is mandatory. IFA_ADDRESS defaults to IFA_LOCAL, and the
+ * label defaults to the device name. DECnet device data is created for
+ * the interface if it has none yet.
+ *
+ * Returns 0 on success, or a negative errno: -EINVAL outside init_net or
+ * without IFA_LOCAL, -ENODEV for an unknown interface, -ENOBUFS when the
+ * address cannot be allocated, or the error from nlmsg_parse(),
+ * dn_dev_create() or dn_dev_insert_ifa().
+ */
+static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct net_device *dev;
+	struct dn_dev *dn_db;
+	struct ifaddrmsg *ifm;
+	struct dn_ifaddr *ifa;
+	int err;
+
+	if (!net_eq(net, &init_net))
+		return -EINVAL;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[IFA_LOCAL] == NULL)
+		return -EINVAL;
+
+	ifm = nlmsg_data(nlh);
+	if ((dev = __dev_get_by_index(&init_net, ifm->ifa_index)) == NULL)
+		return -ENODEV;
+
+	if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) {
+		dn_db = dn_dev_create(dev, &err);
+		if (!dn_db)
+			return err;
+	}
+
+	/* NOTE(review): a dn_db created above is not torn down if this fails */
+	if ((ifa = dn_dev_alloc_ifa()) == NULL)
+		return -ENOBUFS;
+
+	if (tb[IFA_ADDRESS] == NULL)
+		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+	ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]);
+	ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]);
+	ifa->ifa_flags = ifm->ifa_flags;
+	ifa->ifa_scope = ifm->ifa_scope;
+	ifa->ifa_dev = dn_db;
+
+	if (tb[IFA_LABEL])
+		nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+	else
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+	err = dn_dev_insert_ifa(dn_db, ifa);
+	if (err)
+		dn_dev_free_ifa(ifa);
+
+	return err;
+}
+
+/*
+ * Upper bound on the size of the netlink message built by
+ * dn_nl_fill_ifaddr(): the ifaddrmsg header plus the three attributes.
+ */
+static inline size_t dn_ifaddr_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+	       + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+	       + nla_total_size(2) /* IFA_ADDRESS */
+	       + nla_total_size(2); /* IFA_LOCAL */
+}
+
+/*
+ * Append a netlink message describing @ifa to @skb.
+ *
+ * @pid, @seq, @event and @flags are passed to nlmsg_put() for the message
+ * header. The address, local address and label attributes are only added
+ * when they are non-zero / non-empty.
+ *
+ * Returns the result of nlmsg_end() on success, or -EMSGSIZE if the
+ * message does not fit; in that case the partial message is cancelled.
+ */
+static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
+			     u32 pid, u32 seq, int event, unsigned int flags)
+{
+	struct ifaddrmsg *ifm;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
+	ifm->ifa_family = AF_DECnet;
+	ifm->ifa_prefixlen = 16;
+	ifm->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT;
+	ifm->ifa_scope = ifa->ifa_scope;
+	ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+
+	/* The NLA_PUT_* macros jump to nla_put_failure when out of room */
+	if (ifa->ifa_address)
+		NLA_PUT_LE16(skb, IFA_ADDRESS, ifa->ifa_address);
+	if (ifa->ifa_local)
+		NLA_PUT_LE16(skb, IFA_LOCAL, ifa->ifa_local);
+	if (ifa->ifa_label[0])
+		NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Send an @event message about @ifa to the RTNLGRP_DECnet_IFADDR
+ * netlink group. If the message cannot be allocated or built, the error
+ * is recorded on the group's listeners with rtnl_set_sk_err() instead.
+ */
+static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
+{
+	struct sk_buff *skb;
+	int err;
+
+	skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL);
+	if (skb == NULL) {
+		err = -ENOBUFS;
+	} else {
+		err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+		if (err >= 0) {
+			rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR,
+				    NULL, GFP_KERNEL);
+			return;
+		}
+		/* -EMSGSIZE implies BUG in dn_ifaddr_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+	}
+
+	rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err);
+}
+
+/*
+ * Netlink dump callback listing every DECnet address in init_net.
+ *
+ * The dump is resumable: cb->args[0] holds the index of the device to
+ * continue from and cb->args[1] the index of the address within that
+ * device. Both are written back before returning so the next call picks
+ * up where the skb filled up.
+ *
+ * Returns skb->len, or 0 outside init_net.
+ */
+static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int idx, dn_idx = 0, skip_ndevs, skip_naddr;
+	struct net_device *dev;
+	struct dn_dev *dn_db;
+	struct dn_ifaddr *ifa;
+
+	if (!net_eq(net, &init_net))
+		return 0;
+
+	skip_ndevs = cb->args[0];
+	skip_naddr = cb->args[1];
+
+	idx = 0;
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		if (idx < skip_ndevs)
+			goto cont;
+		else if (idx > skip_ndevs) {
+			/* Only skip over addresses for first dev dumped
+			 * in this iteration (idx == skip_ndevs) */
+			skip_naddr = 0;
+		}
+
+		if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL)
+			goto cont;
+
+		for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa;
+		     ifa = rcu_dereference(ifa->ifa_next), dn_idx++) {
+			if (dn_idx < skip_naddr)
+				continue;
+
+			/* Stop as soon as the skb is full; resume here next time */
+			if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+					      cb->nlh->nlmsg_seq, RTM_NEWADDR,
+					      NLM_F_MULTI) < 0)
+				goto done;
+		}
+cont:
+		idx++;
+	}
+done:
+	rcu_read_unlock();
+	cb->args[0] = idx;
+	cb->args[1] = dn_idx;
+
+	return skb->len;
+}
+
+/*
+ * Copy the local address of the first entry on dev's DECnet address list
+ * into *addr.
+ *
+ * Returns 0 on success, or -ENODEV when the device has no DECnet state or
+ * an empty address list (in which case *addr is left untouched).
+ */
+static int dn_dev_get_first(struct net_device *dev, __le16 *addr)
+{
+	int rv = -ENODEV;
+	struct dn_dev *dn_db;
+
+	rcu_read_lock();
+	dn_db = rcu_dereference(dev->dn_ptr);
+	if (dn_db != NULL) {
+		struct dn_ifaddr *first = rcu_dereference(dn_db->ifa_list);
+
+		if (first != NULL) {
+			*addr = first->ifa_local;
+			rv = 0;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/*
+ * Find a default address to bind to.
+ *
+ * The initial VMS concepts don't really map onto the Linux ones here, and
+ * since an interface may carry several addresses, working out what "our
+ * address" really is gets slightly odd.  This just guesses a sensible
+ * default: the first address of the default device, falling back to the
+ * first address of the loopback device.  Eventually the routing code
+ * should take care of this.
+ *
+ * Returns 0 with *addr filled in, or the dn_dev_get_first() error obtained
+ * for the loopback device.
+ */
+int dn_dev_bind_default(__le16 *addr)
+{
+	struct net_device *dev = dn_dev_get_default();
+
+	for (;;) {
+		if (dev != NULL) {
+			int rv = dn_dev_get_first(dev, addr);
+
+			/*
+			 * Balances the dev_hold() below or, presumably, a
+			 * reference handed out by dn_dev_get_default().
+			 */
+			dev_put(dev);
+			if (rv == 0 || dev == init_net.loopback_dev)
+				return rv;
+		}
+
+		/* Last chance: retry with the loopback device */
+		dev = init_net.loopback_dev;
+		dev_hold(dev);
+	}
+}
+
+/*
+ * Build and send an endnode hello message for address ifa on dev.
+ *
+ * The message is handed to dn_rt_finish_output() with dn_rt_all_rt_mcast
+ * as destination and the Ethernet form of ifa->ifa_local as source.
+ * Allocation failure is silent: no hello goes out this time round.
+ */
+static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+	struct endnode_hello_message *msg;
+	struct sk_buff *skb = NULL;
+	__le16 *pktlen;
+	struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+	if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL)
+		return;
+
+	skb->dev = dev;
+
+	msg = (struct endnode_hello_message *)skb_put(skb,sizeof(*msg));
+
+	msg->msgflg  = 0x0D;
+	memcpy(msg->tiver, dn_eco_version, 3);
+	dn_dn2eth(msg->id, ifa->ifa_local);
+	msg->iinfo   = DN_RT_INFO_ENDN;
+	msg->blksize = cpu_to_le16(mtu2blksize(dev));
+	msg->area    = 0x00;
+	memset(msg->seed, 0, 8);
+	/* Default neighbor field; replaced below when a router is known */
+	memcpy(msg->neighbor, dn_hiord, ETH_ALEN);
+
+	if (dn_db->router) {
+		struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
+		dn_dn2eth(msg->neighbor, dn->addr);
+	}
+
+	msg->timer   = cpu_to_le16((unsigned short)dn_db->parms.t3);
+	msg->mpd     = 0x00;
+	msg->datalen = 0x02;
+	memset(msg->data, 0xAA, 2);
+
+	/* Prepend a 2-byte little-endian length covering the message body */
+	pktlen = (__le16 *)skb_push(skb,2);
+	*pktlen = cpu_to_le16(skb->len - 2);
+
+	skb_reset_network_header(skb);
+
+	dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id);
+}
+
+
+#define DRDELAY (5 * HZ)
+
+/*
+ * Decide whether this node should act as the router on the device.
+ *
+ * @dn:    the neighbour currently recorded as router; only dereferenced
+ *         once dn_db->router is known to be set
+ * @dn_db: DECnet state of the device
+ * @ifa:   the local address the hello is being sent for
+ *
+ * Returns 1 for "yes", 0 for "no".
+ */
+static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa)
+{
+	/* Too soon after the device came up: never claim the role */
+	if ((jiffies - dn_db->uptime) < DRDELAY)
+		return 0;
+
+	/* Nobody else is the router, so we are */
+	if (dn_db->router == NULL)
+		return 1;
+
+	/* The current router has a lower priority than ours */
+	if (dn->priority < dn_db->parms.priority)
+		return 1;
+
+	/* Anything other than a tie at this point means we lose */
+	if (dn->priority != dn_db->parms.priority)
+		return 0;
+
+	/* Equal priority: the higher node number wins */
+	return (le16_to_cpu(dn->addr) < le16_to_cpu(ifa->ifa_local)) ? 1 : 0;
+}
+
+/*
+ * Build and send a router hello message for address ifa on dev.
+ *
+ * The message is always sent to dn_rt_all_rt_mcast; when
+ * dn_am_i_a_router() says so, a copy is additionally sent to
+ * dn_rt_all_end_mcast.  Nothing is sent if the device block size cannot
+ * hold the fixed part plus one 7-byte list entry, or if allocation fails.
+ */
+static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+	int n;
+	struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+	struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
+	struct sk_buff *skb;
+	size_t size;
+	unsigned char *ptr;
+	unsigned char *i1, *i2;
+	__le16 *pktlen;
+	char *src;
+
+	if (mtu2blksize(dev) < (26 + 7))
+		return;
+
+	/* Number of 7-byte list entries that fit, capped at 32 */
+	n = mtu2blksize(dev) - 26;
+	n /= 7;
+
+	if (n > 32)
+		n = 32;
+
+	size = 2 + 26 + 7 * n;
+
+	if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL)
+		return;
+
+	skb->dev = dev;
+	ptr = skb_put(skb, size);
+
+	*ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH;
+	*ptr++ = 2; /* ECO */
+	*ptr++ = 0;
+	*ptr++ = 0;
+	dn_dn2eth(ptr, ifa->ifa_local);
+	/* Remember where our own address sits; it is the source for output */
+	src = ptr;
+	ptr += ETH_ALEN;
+	*ptr++ = dn_db->parms.forwarding == 1 ?
+			DN_RT_INFO_L1RT : DN_RT_INFO_L2RT;
+	*((__le16 *)ptr) = cpu_to_le16(mtu2blksize(dev));
+	ptr += 2;
+	*ptr++ = dn_db->parms.priority; /* Priority */
+	*ptr++ = 0; /* Area: Reserved */
+	*((__le16 *)ptr) = cpu_to_le16((unsigned short)dn_db->parms.t3);
+	ptr += 2;
+	*ptr++ = 0; /* MPD: Reserved */
+	/* i1 and i2 are length bytes, filled in once the list size is known */
+	i1 = ptr++;
+	memset(ptr, 0, 7); /* Name: Reserved */
+	ptr += 7;
+	i2 = ptr++;
+
+	/* presumably writes up to n entries at ptr and returns how many - confirm in dn_neigh_elist() */
+	n = dn_neigh_elist(dev, ptr, n);
+
+	*i2 = 7 * n;
+	*i1 = 8 + *i2;
+
+	/* Drop the part of the buffer the list did not use */
+	skb_trim(skb, (27 + *i2));
+
+	/* Prepend a 2-byte little-endian length covering the message body */
+	pktlen = (__le16 *)skb_push(skb, 2);
+	*pktlen = cpu_to_le16(skb->len - 2);
+
+	skb_reset_network_header(skb);
+
+	if (dn_am_i_a_router(dn, dn_db, ifa)) {
+		/* Best effort: a failed copy just skips the endnode multicast */
+		struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
+		if (skb2) {
+			dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src);
+		}
+	}
+
+	dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
+}
+
+/*
+ * Send the hello that matches the device's forwarding setting: a router
+ * hello when parms.forwarding is non-zero, an endnode hello otherwise.
+ */
+static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+	struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+	if (dn_db->parms.forwarding != 0) {
+		dn_send_router_hello(dev, ifa);
+		return;
+	}
+
+	dn_send_endnode_hello(dev, ifa);
+}
+
+/*
+ * Build and send a DN_RT_PKT_HELO message for address ifa on dev.
+ *
+ * Layout written: one type byte, the 2-byte local address, one byte
+ * holding the test-data length, then tdlen bytes of 0252 test data.
+ * Allocation failure is silent.
+ */
+static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa)
+{
+	int tdlen = 16;
+	int size = dev->hard_header_len + 2 + 4 + tdlen;
+	struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC);
+	unsigned char *ptr;
+	char src[ETH_ALEN];
+
+	if (!skb)
+		return;
+
+	skb->dev = dev;
+	skb_push(skb, dev->hard_header_len);
+	ptr = skb_put(skb, 2 + 4 + tdlen);
+
+	ptr[0] = DN_RT_PKT_HELO;
+	*((__le16 *)(ptr + 1)) = ifa->ifa_local;
+	ptr[3] = tdlen;
+	memset(ptr + 4, 0252, tdlen);
+
+	dn_dn2eth(src, ifa->ifa_local);
+	dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
+}
+
+/*
+ * Bring-up hook for a device: join the multicast group that matches the
+ * forwarding setting (all-routers when forwarding, all-endnodes otherwise)
+ * and set use_long.  Always returns 0.
+ */
+static int dn_eth_up(struct net_device *dev)
+{
+	struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+	if (dn_db->parms.forwarding != 0)
+		dev_mc_add(dev, dn_rt_all_rt_mcast);
+	else
+		dev_mc_add(dev, dn_rt_all_end_mcast);
+
+	dn_db->use_long = 1;
+	return 0;
+}
+
+/*
+ * Shut-down hook for a device: leave the multicast group selected by the
+ * forwarding setting (all-routers when forwarding, all-endnodes otherwise).
+ */
+static void dn_eth_down(struct net_device *dev)
+{
+	struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+
+	if (dn_db->parms.forwarding != 0)
+		dev_mc_del(dev, dn_rt_all_rt_mcast);
+	else
+		dev_mc_del(dev, dn_rt_all_end_mcast);
+}
+
+static void dn_dev_set_timer(struct net_device *dev);
+
+/*
+ * Per-device timer handler; arg is the struct net_device pointer.
+ *
+ * dn_db->t3 counts down in steps of parms.t2.  Once it is no larger than
+ * one step, the parms.timer3 hook (if any) is called for every address
+ * that is not flagged IFA_F_SECONDARY and t3 is reloaded from parms.t3.
+ * The timer is then re-armed through dn_dev_set_timer().
+ */
+static void dn_dev_timer_func(unsigned long arg)
+{
+	struct net_device *dev = (struct net_device *)arg;
+	struct dn_dev *dn_db;
+	struct dn_ifaddr *ifa;
+
+	rcu_read_lock();
+	dn_db = rcu_dereference(dev->dn_ptr);
+	if (dn_db->t3 <= dn_db->parms.t2) {
+		if (dn_db->parms.timer3) {
+			for (ifa = rcu_dereference(dn_db->ifa_list);
+			     ifa;
+			     ifa = rcu_dereference(ifa->ifa_next)) {
+				if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+					dn_db->parms.timer3(dev, ifa);
+			}
+		}
+		/* Start a new countdown */
+		dn_db->t3 = dn_db->parms.t3;
+	} else {
+		dn_db->t3 -= dn_db->parms.t2;
+	}
+	rcu_read_unlock();
+	dn_dev_set_timer(dev);
+}
+
+/*
+ * Arm the per-device timer to fire dn_dev_timer_func() parms.t2 seconds
+ * from now.  parms.t2 is first clamped so that it never exceeds parms.t3.
+ */
+static void dn_dev_set_timer(struct net_device *dev)
+{
+	struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
+	struct timer_list *t = &dn_db->timer;
+
+	if (dn_db->parms.t3 < dn_db->parms.t2)
+		dn_db->parms.t2 = dn_db->parms.t3;
+
+	t->expires = jiffies + (dn_db->parms.t2 * HZ);
+	t->function = dn_dev_timer_func;
+	t->data = (unsigned long)dev;
+
+	add_timer(t);
+}
+
+/*
+ * Allocate DECnet state for dev and attach it as dev->dn_ptr.
+ *
+ * The parameters are copied from the dn_dev_list entry whose type matches
+ * dev->type.  On success *err is 0 and the new dn_dev is returned with its
+ * timer already running.  On failure NULL is returned and *err is -ENODEV
+ * (no matching entry) or -ENOBUFS (every later failure).
+ */
+static struct dn_dev *dn_dev_create(struct net_device *dev, int *err)
+{
+	int i;
+	struct dn_dev_parms *p = dn_dev_list;
+	struct dn_dev *dn_db;
+
+	/* Find the parameter template for this device type */
+	for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) {
+		if (p->type == dev->type)
+			break;
+	}
+
+	*err = -ENODEV;
+	if (i == DN_DEV_LIST_SIZE)
+		return NULL;
+
+	*err = -ENOBUFS;
+	if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL)
+		return NULL;
+
+	memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms));
+
+	/* Published before the remaining fields below are initialised */
+	rcu_assign_pointer(dev->dn_ptr, dn_db);
+	dn_db->dev = dev;
+	init_timer(&dn_db->timer);
+
+	dn_db->uptime = jiffies;
+
+	dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table);
+	if (!dn_db->neigh_parms) {
+		rcu_assign_pointer(dev->dn_ptr, NULL);
+		kfree(dn_db);
+		return NULL;
+	}
+
+	/* Optional per-type bring-up hook; undo everything if it fails */
+	if (dn_db->parms.up) {
+		if (dn_db->parms.up(dev) < 0) {
+			neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
+			dev->dn_ptr = NULL;
+			kfree(dn_db);
+			return NULL;
+		}
+	}
+
+	dn_dev_sysctl_register(dev, &dn_db->parms);
+
+	dn_dev_set_timer(dev);
+
+	*err = 0;
+	return dn_db;
+}
+
+
+/*
+ * This processes a device up event. We only start up
+ * the loopback device & ethernet devices with correct
+ * MAC addresses automatically. Others must be started
+ * specifically.
+ *
+ * FIXME: How should we configure the loopback address ? If we could dispense
+ * with using decnet_address here and for autobind, it will be one less thing
+ * for users to worry about setting up.
+ *
+ * Every failure is silent: the function returns without adding an address.
+ */
+
+void dn_dev_up(struct net_device *dev)
+{
+	struct dn_ifaddr *ifa;
+	__le16 addr = decnet_address;
+	int maybe_default = 0;
+	struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+
+	if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
+		return;
+
+	/*
+	 * Need to ensure that loopback device has a dn_db attached to it
+	 * to allow creation of neighbours against it, even though it might
+	 * not have a local address of its own. Might as well do the same for
+	 * all autoconfigured interfaces.
+	 */
+	if (dn_db == NULL) {
+		int err;
+		dn_db = dn_dev_create(dev, &err);
+		if (dn_db == NULL)
+			return;
+	}
+
+	if (dev->type == ARPHRD_ETHER) {
+		/* Only MACs starting with the first 4 bytes of dn_hiord qualify */
+		if (memcmp(dev->dev_addr, dn_hiord, 4) != 0)
+			return;
+		addr = dn_eth2dn(dev->dev_addr);
+		maybe_default = 1;
+	}
+
+	/* No address to configure (e.g. loopback with decnet_address unset) */
+	if (addr == 0)
+		return;
+
+	if ((ifa = dn_dev_alloc_ifa()) == NULL)
+		return;
+
+	ifa->ifa_local = ifa->ifa_address = addr;
+	ifa->ifa_flags = 0;
+	ifa->ifa_scope = RT_SCOPE_UNIVERSE;
+	strcpy(ifa->ifa_label, dev->name);
+
+	dn_dev_set_ifa(dev, ifa);
+
+	/*
+	 * Automagically set the default device to the first automatically
+	 * configured ethernet card in the system.
+	 */
+	if (maybe_default) {
+		dev_hold(dev);
+		/* Non-zero return: drop the reference taken just above */
+		if (dn_dev_set_default(dev, 0))
+			dev_put(dev);
+	}
+}
+
+/*
+ * Tear down and free the DECnet state attached to dev, if there is any.
+ *
+ * Order: stop the timer, unregister sysctl, run the default-device check,
+ * flush neighbours, call the per-type down hook, detach dev->dn_ptr,
+ * release the neighbour parameters and the router/peer references, then
+ * free the structure.
+ */
+static void dn_dev_delete(struct net_device *dev)
+{
+	struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+
+	if (dn_db == NULL)
+		return;
+
+	/* Waits for a running timer handler before anything is torn down */
+	del_timer_sync(&dn_db->timer);
+	dn_dev_sysctl_unregister(&dn_db->parms);
+	dn_dev_check_default(dev);
+	neigh_ifdown(&dn_neigh_table, dev);
+
+	if (dn_db->parms.down)
+		dn_db->parms.down(dev);
+
+	dev->dn_ptr = NULL;
+
+	neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
+	/* Second flush, this time with dn_ptr already cleared */
+	neigh_ifdown(&dn_neigh_table, dev);
+
+	if (dn_db->router)
+		neigh_release(dn_db->router);
+	if (dn_db->peer)
+		neigh_release(dn_db->peer);
+
+	kfree(dn_db);
+}
+
+/*
+ * Take DECnet down on dev: remove and free every address on the device's
+ * list, then delete the device's DECnet state.  Does nothing for a device
+ * without DECnet state.
+ */
+void dn_dev_down(struct net_device *dev)
+{
+	struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
+
+	if (dn_db == NULL)
+		return;
+
+	for (;;) {
+		struct dn_ifaddr *head = rtnl_dereference(dn_db->ifa_list);
+
+		if (head == NULL)
+			break;
+
+		/* Unlink the list head, then free it */
+		dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0);
+		dn_dev_free_ifa(head);
+	}
+
+	dn_dev_delete(dev);
+}
+
+/* Empty stub: the skb is neither inspected nor freed here. */
+void dn_dev_init_pkt(struct sk_buff *skb)
+{
+}
+
+/* Empty stub: the skb is neither inspected nor freed here. */
+void dn_dev_veri_pkt(struct sk_buff *skb)
+{
+}
+
+/* Empty stub: the skb is neither inspected nor freed here. */
+void dn_dev_hello(struct sk_buff *skb)
+{
+}
+
+/*
+ * Take DECnet down on every device of init_net, under the RTNL lock.
+ */
+void dn_dev_devices_off(void)
+{
+	struct net_device *netdev;
+
+	rtnl_lock();
+	for_each_netdev(&init_net, netdev) {
+		dn_dev_down(netdev);
+	}
+	rtnl_unlock();
+}
+
+/*
+ * Bring DECnet up on every device of init_net that has IFF_UP set, under
+ * the RTNL lock.
+ */
+void dn_dev_devices_on(void)
+{
+	struct net_device *netdev;
+
+	rtnl_lock();
+	for_each_netdev(&init_net, netdev) {
+		if (!(netdev->flags & IFF_UP))
+			continue;
+		dn_dev_up(netdev);
+	}
+	rtnl_unlock();
+}
+
+/*
+ * Add nb to the dnaddr_chain blocking notifier chain.  Returns whatever
+ * blocking_notifier_chain_register() returns.
+ */
+int register_dnaddr_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&dnaddr_chain, nb);
+}
+
+/*
+ * Remove nb from the dnaddr_chain blocking notifier chain.  Returns
+ * whatever blocking_notifier_chain_unregister() returns.
+ */
+int unregister_dnaddr_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&dnaddr_chain, nb);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Returns 1 if dev has DECnet state attached (dn_ptr set), 0 otherwise. */
+static inline int is_dn_dev(struct net_device *dev)
+{
+	return dev->dn_ptr ? 1 : 0;
+}
+
+/*
+ * seq_file start: take the RCU read lock (it stays held on every return
+ * path and is dropped in dn_dev_seq_stop()) and locate the element for
+ * *pos.  Position 0 is the header token; position N >= 1 is the N-th
+ * device of init_net that has DECnet state.  Returns NULL past the end.
+ */
+static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct net_device *dev;
+	int i = 1;
+
+	rcu_read_lock();
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for_each_netdev_rcu(&init_net, dev) {
+		/* The counter only advances for DECnet devices */
+		if (is_dn_dev(dev) && i++ == *pos)
+			return dev;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file next: advance *pos and return the next device of init_net that
+ * has DECnet state, or NULL when the device list is exhausted.  When v is
+ * the header token the walk starts from the head of the device list.
+ */
+static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net_device *dev;
+
+	if (v == SEQ_START_TOKEN)
+		dev = net_device_entry(&init_net.dev_base_head);
+	else
+		dev = (struct net_device *)v;
+
+	++*pos;
+
+	for_each_netdev_continue_rcu(&init_net, dev) {
+		if (is_dn_dev(dev))
+			return dev;
+	}
+
+	return NULL;
+}
+
+/* seq_file stop: drop the RCU read lock taken in dn_dev_seq_start(). */
+static void dn_dev_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/*
+ * Map a DN_DEV_* mode value to a one-letter string: "B" for DN_DEV_BCAST,
+ * "U" for DN_DEV_UCAST, "M" for DN_DEV_MPOINT and "?" for anything else.
+ */
+static char *dn_type2asc(char type)
+{
+	if (type == DN_DEV_BCAST)
+		return "B";
+	if (type == DN_DEV_UCAST)
+		return "U";
+	if (type == DN_DEV_MPOINT)
+		return "M";
+
+	return "?";
+}
+
+/*
+ * seq_file show: print the column header for the start token, otherwise
+ * one line describing the DECnet state of the device in v.  Always
+ * returns 0.
+ */
+static int dn_dev_seq_show(struct seq_file *seq, void *v)
+{
+	struct net_device *dev;
+	struct dn_dev *dn_db;
+	char peer_buf[DN_ASCBUF_LEN];
+	char router_buf[DN_ASCBUF_LEN];
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Name     Flags T1   Timer1 T3   Timer3 BlkSize Pri State DevType    Router Peer\n");
+		return 0;
+	}
+
+	dev = v;
+	dn_db = rcu_dereference(dev->dn_ptr);
+
+	/* The T1/Timer1 columns are always printed as 0 */
+	seq_printf(seq, "%-8s %1s     %04u %04u   %04lu %04lu"
+			"   %04hu    %03d %02x    %-10s %-7s %-7s\n",
+			dev->name ? dev->name : "???",
+			dn_type2asc(dn_db->parms.mode),
+			0, 0,
+			dn_db->t3, dn_db->parms.t3,
+			mtu2blksize(dev),
+			dn_db->parms.priority,
+			dn_db->parms.state, dn_db->parms.name,
+			dn_db->router ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->router->primary_key), router_buf) : "",
+			dn_db->peer ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->peer->primary_key), peer_buf) : "");
+
+	return 0;
+}
+
+/* Iterator callbacks used by the seq_file opened in dn_dev_seq_open() */
+static const struct seq_operations dn_dev_seq_ops = {
+	.start	= dn_dev_seq_start,
+	.next	= dn_dev_seq_next,
+	.stop	= dn_dev_seq_stop,
+	.show	= dn_dev_seq_show,
+};
+
+/* open() handler: attach the dn_dev_seq_ops iterator to the file */
+static int dn_dev_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &dn_dev_seq_ops);
+}
+
+/* File operations registered for the "decnet_dev" proc entry by dn_dev_init() */
+static const struct file_operations dn_dev_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = dn_dev_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int addr[2];
+module_param_array(addr, int, NULL, 0444);
+MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node");
+
+/*
+ * Initialise the DECnet device layer.
+ *
+ * Validates the "addr" module parameter (area 0-63, node 0-1023) and
+ * stores it as decnet_address, brings DECnet up on the devices that are
+ * already up, registers the address netlink handlers and the "decnet_dev"
+ * proc file and, with CONFIG_SYSCTL, the sysctl entries for every
+ * dn_dev_list template.
+ *
+ * An out-of-range area or node is logged and aborts the initialisation
+ * before anything is registered.
+ */
+void __init dn_dev_init(void)
+{
+	/* Log lines must end in a newline so they are not merged with later output */
+	if (addr[0] > 63 || addr[0] < 0) {
+		printk(KERN_ERR "DECnet: Area must be between 0 and 63\n");
+		return;
+	}
+
+	if (addr[1] > 1023 || addr[1] < 0) {
+		printk(KERN_ERR "DECnet: Node must be between 0 and 1023\n");
+		return;
+	}
+
+	/* Area goes above the low 10 bits, node into the low 10 bits */
+	decnet_address = cpu_to_le16((addr[0] << 10) | addr[1]);
+
+	dn_dev_devices_on();
+
+	rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL);
+	rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL);
+	rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr);
+
+	proc_net_fops_create(&init_net, "decnet_dev", S_IRUGO, &dn_dev_seq_fops);
+
+#ifdef CONFIG_SYSCTL
+	{
+		int i;
+		for(i = 0; i < DN_DEV_LIST_SIZE; i++)
+			dn_dev_sysctl_register(NULL, &dn_dev_list[i]);
+	}
+#endif /* CONFIG_SYSCTL */
+}
+
+/*
+ * Undo dn_dev_init(): unregister the template sysctl entries (with
+ * CONFIG_SYSCTL), remove the "decnet_dev" proc file and take DECnet down
+ * on every device.
+ */
+void __exit dn_dev_cleanup(void)
+{
+#ifdef CONFIG_SYSCTL
+	{
+		int i;
+		for(i = 0; i < DN_DEV_LIST_SIZE; i++)
+			dn_dev_sysctl_unregister(&dn_dev_list[i]);
+	}
+#endif /* CONFIG_SYSCTL */
+
+	proc_net_remove(&init_net, "decnet_dev");
+
+	dn_dev_devices_off();
+}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
new file mode 100644
index 00000000..1c74ed36
--- /dev/null
+++ b/net/decnet/dn_fib.c
@@ -0,0 +1,770 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Routing Forwarding Information Base (Glue/Info List)
+ *
+ * Author:      Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ *              Alexey Kuznetsov : SMP locking changes
+ *              Steve Whitehouse : Rewrote it... Well to be more correct, I
+ *                                 copied most of it from the ipv4 fib code.
+ *              Steve Whitehouse : Updated it in style and fixed a few bugs
+ *                                 which were fixed in the ipv4 code since
+ *                                 this code was copied from it.
+ *
+ */
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_route.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+#include <net/dn_dev.h>
+
+/* Table id used by fib_magic() for RTN_UNICAST routes */
+#define RT_MIN_TABLE 1
+
+/*
+ * Walk dn_fib_info_list.  Opens a brace scope declaring "fi"; must be
+ * closed with endfor_fib_info().
+ */
+#define for_fib_info() { struct dn_fib_info *fi;\
+	for(fi = dn_fib_info_list; fi; fi = fi->fib_next)
+#define endfor_fib_info() }
+
+/*
+ * Walk the next hops of fi read-only.  Opens a brace scope declaring
+ * "nhsel" (index) and "nh" (const pointer); close with endfor_nexthops().
+ */
+#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
+	for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+/* Same walk as for_nexthops(), but "nh" is writable */
+#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\
+	for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+/* Closes the scope opened by for_nexthops() or change_nexthops() */
+#define endfor_nexthops(fi) }
+
+/* Taken around updates of fib_power / nh_power and of RTNH_F_DEAD on next hops */
+static DEFINE_SPINLOCK(dn_fib_multipath_lock);
+/* Head of the doubly linked list (fib_next/fib_prev) of all dn_fib_info */
+static struct dn_fib_info *dn_fib_info_list;
+/* Taken around linking into and unlinking from dn_fib_info_list */
+static DEFINE_SPINLOCK(dn_fib_info_lock);
+
+/*
+ * Per route type (RTN_*) properties: "error" is what
+ * dn_fib_semantic_match() returns for a route of that type (0 meaning the
+ * route is usable), "scope" is compared against the requested rtm_scope
+ * in dn_fib_create_info().
+ */
+static struct
+{
+	int error;
+	u8 scope;
+} dn_fib_props[RTN_MAX+1] = {
+	[RTN_UNSPEC] =      { .error = 0,       .scope = RT_SCOPE_NOWHERE },
+	[RTN_UNICAST] =     { .error = 0,       .scope = RT_SCOPE_UNIVERSE },
+	[RTN_LOCAL] =       { .error = 0,       .scope = RT_SCOPE_HOST },
+	[RTN_BROADCAST] =   { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+	[RTN_ANYCAST] =     { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+	[RTN_MULTICAST] =   { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+	[RTN_BLACKHOLE] =   { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE },
+	[RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE },
+	[RTN_PROHIBIT] =    { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE },
+	[RTN_THROW] =       { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE },
+	[RTN_NAT] =         { .error = 0,       .scope = RT_SCOPE_NOWHERE },
+	[RTN_XRESOLVE] =    { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
+};
+
+static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force);
+static int dn_fib_sync_up(struct net_device *dev);
+
+/*
+ * Free a dn_fib_info: drop the device reference held by each next hop and
+ * release the memory.  An info that is not marked fib_dead is not freed;
+ * a debug message is logged instead.
+ */
+void dn_fib_free_info(struct dn_fib_info *fi)
+{
+	if (fi->fib_dead == 0) {
+		printk(KERN_DEBUG "DECnet: BUG! Attempt to free alive dn_fib_info\n");
+		return;
+	}
+
+	change_nexthops(fi) {
+		if (nh->nh_dev)
+			dev_put(nh->nh_dev);
+		nh->nh_dev = NULL;
+	} endfor_nexthops(fi);
+	kfree(fi);
+}
+
+/*
+ * Drop one tree reference on fi.  When the count reaches zero the info is
+ * unlinked from dn_fib_info_list, marked dead and handed to
+ * dn_fib_info_put().  fi may be NULL, in which case only the lock is
+ * taken and released.
+ */
+void dn_fib_release_info(struct dn_fib_info *fi)
+{
+	spin_lock(&dn_fib_info_lock);
+	if (fi && --fi->fib_treeref == 0) {
+		/* Unlink from the doubly linked list, fixing the head if needed */
+		if (fi->fib_next)
+			fi->fib_next->fib_prev = fi->fib_prev;
+		if (fi->fib_prev)
+			fi->fib_prev->fib_next = fi->fib_next;
+		if (fi == dn_fib_info_list)
+			dn_fib_info_list = fi->fib_next;
+		fi->fib_dead = 1;
+		dn_fib_info_put(fi);
+	}
+	spin_unlock(&dn_fib_info_lock);
+}
+
+/*
+ * Compare the next hops of fi and ofi pairwise, in order.
+ *
+ * Returns 0 when every pair agrees on output interface, gateway, scope,
+ * weight and flags (RTNH_F_DEAD is ignored), -1 on the first difference.
+ * Walks fi->fib_nhs entries of both; the only caller in view,
+ * dn_fib_find_info(), compares infos with equal fib_nhs.
+ */
+static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi)
+{
+	const struct dn_fib_nh *onh = ofi->fib_nh;
+
+	for_nexthops(fi) {
+		if (nh->nh_oif != onh->nh_oif ||
+			nh->nh_gw != onh->nh_gw ||
+			nh->nh_scope != onh->nh_scope ||
+			nh->nh_weight != onh->nh_weight ||
+			((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+				return -1;
+		onh++;
+	} endfor_nexthops(fi);
+	return 0;
+}
+
+/*
+ * Look for an info on dn_fib_info_list that is equivalent to nfi: same
+ * number of next hops, protocol, preferred source, priority, metrics,
+ * flags (RTNH_F_DEAD ignored) and matching next hops.
+ *
+ * Returns the existing info, or NULL if there is none.  The list is
+ * walked without taking dn_fib_info_lock here.
+ */
+static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi)
+{
+	for_fib_info() {
+		if (fi->fib_nhs != nfi->fib_nhs)
+			continue;
+		if (nfi->fib_protocol == fi->fib_protocol &&
+			nfi->fib_prefsrc == fi->fib_prefsrc &&
+			nfi->fib_priority == fi->fib_priority &&
+			memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 &&
+			((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+			(nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0))
+				return fi;
+	} endfor_fib_info();
+	return NULL;
+}
+
+/*
+ * Scan attrlen bytes of route attributes starting at attr and return the
+ * 16-bit payload of the first attribute whose rta_type equals type.
+ * Returns 0 when no such attribute is found.
+ */
+__le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type)
+{
+	for (; RTA_OK(attr, attrlen); attr = RTA_NEXT(attr, attrlen)) {
+		if (attr->rta_type == type)
+			return *(__le16*)RTA_DATA(attr);
+	}
+
+	return 0;
+}
+
+/*
+ * Count the rtnexthop entries carried in the payload of rta.
+ *
+ * Returns the number of entries, or 0 when the payload is malformed: an
+ * entry claiming more bytes than remain, or an rtnh_len smaller than the
+ * rtnexthop header itself.  The caller treats 0 as invalid input.
+ */
+static int dn_fib_count_nhs(struct rtattr *rta)
+{
+	int nhs = 0;
+	struct rtnexthop *nhp = RTA_DATA(rta);
+	int nhlen = RTA_PAYLOAD(rta);
+
+	while(nhlen >= (int)sizeof(struct rtnexthop)) {
+		/*
+		 * An entry shorter than its own header is bogus.  With
+		 * rtnh_len == 0 neither nhlen nor nhp would advance and
+		 * the loop would never terminate.
+		 */
+		if (nhp->rtnh_len < sizeof(struct rtnexthop))
+			return 0;
+		if ((nhlen -= nhp->rtnh_len) < 0)
+			return 0;
+		nhs++;
+		nhp = RTNH_NEXT(nhp);
+	}
+
+	return nhs;
+}
+
+/*
+ * Fill the next hops of fi from the rtnexthop entries in the payload of
+ * rta.  fi->fib_nhs entries are consumed.
+ *
+ * Returns 0 on success, -EINVAL if the payload runs out early.
+ */
+static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+{
+	struct rtnexthop *nhp = RTA_DATA(rta);
+	int nhlen = RTA_PAYLOAD(rta);
+
+	change_nexthops(fi) {
+		/*
+		 * NOTE(review): attrlen is derived from the bytes left in the
+		 * whole payload, not from nhp->rtnh_len, so the gateway
+		 * lookup below is bounded by the end of the payload rather
+		 * than the end of this entry - confirm this is intended.
+		 */
+		int attrlen = nhlen - sizeof(struct rtnexthop);
+		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+			return -EINVAL;
+
+		/* Low byte of the flags comes from the entry, the rest from the route */
+		nh->nh_flags  = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
+		nh->nh_oif    = nhp->rtnh_ifindex;
+		nh->nh_weight = nhp->rtnh_hops + 1;
+
+		if (attrlen) {
+			nh->nh_gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+		}
+		nhp = RTNH_NEXT(nhp);
+	} endfor_nexthops(fi);
+
+	return 0;
+}
+
+
+/*
+ * Validate one next hop of a new route and resolve its device and scope.
+ *
+ * On success (return 0) nh->nh_dev is set with a reference held and
+ * nh->nh_scope is filled in.  Errors seen here: -EINVAL, -ENODEV,
+ * -ENETDOWN, or whatever dn_fib_lookup() returns.
+ */
+static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh)
+{
+	int err;
+
+	if (nh->nh_gw) {
+		struct flowidn fld;
+		struct dn_fib_res res;
+
+		/* On-link gateway: trust the given interface, no route lookup */
+		if (nh->nh_flags&RTNH_F_ONLINK) {
+			struct net_device *dev;
+
+			if (r->rtm_scope >= RT_SCOPE_LINK)
+				return -EINVAL;
+			if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST)
+				return -EINVAL;
+			if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
+				return -ENODEV;
+			if (!(dev->flags&IFF_UP))
+				return -ENETDOWN;
+			nh->nh_dev = dev;
+			dev_hold(dev);
+			nh->nh_scope = RT_SCOPE_LINK;
+			return 0;
+		}
+
+		/* Otherwise look the gateway up with a scope of at least RT_SCOPE_LINK */
+		memset(&fld, 0, sizeof(fld));
+		fld.daddr = nh->nh_gw;
+		fld.flowidn_oif = nh->nh_oif;
+		fld.flowidn_scope = r->rtm_scope + 1;
+
+		if (fld.flowidn_scope < RT_SCOPE_LINK)
+			fld.flowidn_scope = RT_SCOPE_LINK;
+
+		if ((err = dn_fib_lookup(&fld, &res)) != 0)
+			return err;
+
+		err = -EINVAL;
+		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+			goto out;
+		nh->nh_scope = res.scope;
+		nh->nh_oif = DN_FIB_RES_OIF(res);
+		nh->nh_dev = DN_FIB_RES_DEV(res);
+		if (nh->nh_dev == NULL)
+			goto out;
+		dev_hold(nh->nh_dev);
+		err = -ENETDOWN;
+		if (!(nh->nh_dev->flags & IFF_UP))
+			goto out;
+		err = 0;
+out:
+		/* The lookup result is released on every path past the lookup */
+		dn_fib_res_put(&res);
+		return err;
+	} else {
+		/* No gateway: the next hop is the interface itself */
+		struct net_device *dev;
+
+		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+			return -EINVAL;
+
+		dev = __dev_get_by_index(&init_net, nh->nh_oif);
+		if (dev == NULL || dev->dn_ptr == NULL)
+			return -ENODEV;
+		if (!(dev->flags&IFF_UP))
+			return -ENETDOWN;
+		nh->nh_dev = dev;
+		dev_hold(nh->nh_dev);
+		nh->nh_scope = RT_SCOPE_HOST;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Build the dn_fib_info described by a route request.
+ *
+ * @r:    route message header (type, scope, protocol, flags)
+ * @rta:  parsed route attributes
+ * @nlh:  netlink header of the request (not used in this function)
+ * @errp: receives the error code when NULL is returned
+ *
+ * If an equivalent info already exists, the new one is discarded and the
+ * existing one is returned with its tree reference count raised.
+ * Otherwise the new info is linked onto dn_fib_info_list and returned.
+ * On failure NULL is returned and *errp is set (-EINVAL, -ENOBUFS,
+ * -ENODEV or an error from next hop parsing/validation).
+ */
+struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta *rta, const struct nlmsghdr *nlh, int *errp)
+{
+	int err;
+	struct dn_fib_info *fi = NULL;
+	struct dn_fib_info *ofi;
+	int nhs = 1;
+
+	if (r->rtm_type > RTN_MAX)
+		goto err_inval;
+
+	if (dn_fib_props[r->rtm_type].scope > r->rtm_scope)
+		goto err_inval;
+
+	/* Multipath: the next hop count comes from the attribute, else it is 1 */
+	if (rta->rta_mp) {
+		nhs = dn_fib_count_nhs(rta->rta_mp);
+		if (nhs == 0)
+			goto err_inval;
+	}
+
+	/* The next hop array is allocated right behind the info itself */
+	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL);
+	err = -ENOBUFS;
+	if (fi == NULL)
+		goto failure;
+
+	fi->fib_protocol = r->rtm_protocol;
+	fi->fib_nhs = nhs;
+	fi->fib_flags = r->rtm_flags;
+	if (rta->rta_priority)
+		fi->fib_priority = *rta->rta_priority;
+	if (rta->rta_mx) {
+		int attrlen = RTA_PAYLOAD(rta->rta_mx);
+		struct rtattr *attr = RTA_DATA(rta->rta_mx);
+
+		/* Metric N is stored at index N-1; type 0 entries are ignored */
+		while(RTA_OK(attr, attrlen)) {
+			unsigned flavour = attr->rta_type;
+			if (flavour) {
+				if (flavour > RTAX_MAX)
+					goto err_inval;
+				fi->fib_metrics[flavour-1] = *(unsigned*)RTA_DATA(attr);
+			}
+			attr = RTA_NEXT(attr, attrlen);
+		}
+	}
+	if (rta->rta_prefsrc)
+		memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 2);
+
+	if (rta->rta_mp) {
+		if ((err = dn_fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+			goto failure;
+		/* Explicit oif/gateway must agree with the first next hop */
+		if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+			goto err_inval;
+		if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 2))
+			goto err_inval;
+	} else {
+		struct dn_fib_nh *nh = fi->fib_nh;
+		if (rta->rta_oif)
+			nh->nh_oif = *rta->rta_oif;
+		if (rta->rta_gw)
+			memcpy(&nh->nh_gw, rta->rta_gw, 2);
+		nh->nh_flags = r->rtm_flags;
+		nh->nh_weight = 1;
+	}
+
+	/* NAT routes need exactly one next hop, a gateway and no oif */
+	if (r->rtm_type == RTN_NAT) {
+		if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif)
+			goto err_inval;
+		memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 2);
+		goto link_it;
+	}
+
+	/* Route types that resolve to an error must not carry next hop data */
+	if (dn_fib_props[r->rtm_type].error) {
+		if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+			goto err_inval;
+		goto link_it;
+	}
+
+	if (r->rtm_scope > RT_SCOPE_HOST)
+		goto err_inval;
+
+	if (r->rtm_scope == RT_SCOPE_HOST) {
+		struct dn_fib_nh *nh = fi->fib_nh;
+
+		/* Local address is added */
+		if (nhs != 1 || nh->nh_gw)
+			goto err_inval;
+		nh->nh_scope = RT_SCOPE_NOWHERE;
+		nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
+		err = -ENODEV;
+		if (nh->nh_dev == NULL)
+			goto failure;
+	} else {
+		change_nexthops(fi) {
+			if ((err = dn_fib_check_nh(r, fi, nh)) != 0)
+				goto failure;
+		} endfor_nexthops(fi)
+	}
+
+	/*
+	 * A preferred source must be a local address, unless this is the
+	 * RTN_LOCAL route whose destination is that very address.
+	 */
+	if (fi->fib_prefsrc) {
+		if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
+		    memcmp(&fi->fib_prefsrc, rta->rta_dst, 2))
+			if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+				goto err_inval;
+	}
+
+link_it:
+	/* Reuse an equivalent existing info instead of linking a duplicate */
+	if ((ofi = dn_fib_find_info(fi)) != NULL) {
+		fi->fib_dead = 1;
+		dn_fib_free_info(fi);
+		ofi->fib_treeref++;
+		return ofi;
+	}
+
+	fi->fib_treeref++;
+	atomic_inc(&fi->fib_clntref);
+	spin_lock(&dn_fib_info_lock);
+	/* Insert at the head of the list */
+	fi->fib_next = dn_fib_info_list;
+	fi->fib_prev = NULL;
+	if (dn_fib_info_list)
+		dn_fib_info_list->fib_prev = fi;
+	dn_fib_info_list = fi;
+	spin_unlock(&dn_fib_info_lock);
+	return fi;
+
+err_inval:
+	err = -EINVAL;
+
+failure:
+	*errp = err;
+	if (fi) {
+		/* dn_fib_free_info() only frees infos marked dead */
+		fi->fib_dead = 1;
+		dn_fib_free_info(fi);
+	}
+
+	return NULL;
+}
+
+/*
+ * Check whether route info fi of the given type can serve flow fld and,
+ * if so, fill in res.
+ *
+ * Returns 0 on a match (res->fi set and fi->fib_clntref raised), 1 when
+ * the info is dead or has no usable next hop, the dn_fib_props error for
+ * route types that resolve to an error, or -EINVAL for an unexpected type.
+ */
+int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res)
+{
+	int err = dn_fib_props[type].error;
+
+	if (err == 0) {
+		if (fi->fib_flags & RTNH_F_DEAD)
+			return 1;
+
+		res->fi = fi;
+
+		switch(type) {
+			case RTN_NAT:
+				DN_FIB_RES_RESET(*res);
+				atomic_inc(&fi->fib_clntref);
+				return 0;
+			case RTN_UNICAST:
+			case RTN_LOCAL:
+				/* First live next hop that matches the requested oif, if one is set */
+				for_nexthops(fi) {
+					if (nh->nh_flags & RTNH_F_DEAD)
+						continue;
+					if (!fld->flowidn_oif ||
+					    fld->flowidn_oif == nh->nh_oif)
+						break;
+				}
+				/* Still inside the for_nexthops() scope: nhsel is visible here */
+				if (nhsel < fi->fib_nhs) {
+					res->nh_sel = nhsel;
+					atomic_inc(&fi->fib_clntref);
+					return 0;
+				}
+				endfor_nexthops(fi);
+				res->fi = NULL;
+				return 1;
+			default:
+				if (net_ratelimit())
+					 printk("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n", type);
+				res->fi = NULL;
+				return -EINVAL;
+		}
+	}
+	return err;
+}
+
+/*
+ * Choose one next hop of the multipath route in res->fi and store its
+ * index in res->nh_sel.
+ *
+ * Every live next hop carries a budget (nh_power) initialised from its
+ * weight, and fi->fib_power holds the sum.  Each selection uses jiffies
+ * modulo the remaining total as an offset into the budgets and spends one
+ * unit of the chosen next hop.  When the total is used up, the budgets
+ * are refilled.  If no live next hop exists, index 0 is selected.
+ *
+ * @fld is not used in this function.
+ */
+void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res)
+{
+	struct dn_fib_info *fi = res->fi;
+	int w;
+
+	spin_lock_bh(&dn_fib_multipath_lock);
+	if (fi->fib_power <= 0) {
+		int power = 0;
+		/* Refill the budget of every live next hop */
+		change_nexthops(fi) {
+			if (!(nh->nh_flags&RTNH_F_DEAD)) {
+				power += nh->nh_weight;
+				nh->nh_power = nh->nh_weight;
+			}
+		} endfor_nexthops(fi);
+		fi->fib_power = power;
+		/*
+		 * power == 0 means every next hop is dead.  Bail out here,
+		 * otherwise the modulo below would divide by zero.
+		 */
+		if (power <= 0) {
+			spin_unlock_bh(&dn_fib_multipath_lock);
+			res->nh_sel = 0;
+			return;
+		}
+	}
+
+	w = jiffies % fi->fib_power;
+
+	change_nexthops(fi) {
+		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+			if ((w -= nh->nh_power) <= 0) {
+				/* Spend one unit of this next hop's budget */
+				nh->nh_power--;
+				fi->fib_power--;
+				res->nh_sel = nhsel;
+				spin_unlock_bh(&dn_fib_multipath_lock);
+				return;
+			}
+		}
+	} endfor_nexthops(fi);
+	res->nh_sel = 0;
+	spin_unlock_bh(&dn_fib_multipath_lock);
+}
+
+
+/*
+ * Validate the parsed attribute table and convert it in place.
+ *
+ * Every present attribute must have a payload of exactly 2 bytes or of at
+ * least 4 bytes, otherwise -EINVAL is returned.  Except for
+ * RTA_MULTIPATH, RTA_METRICS and RTA_TABLE, each slot is then replaced by
+ * a pointer to the attribute's payload.  Returns 0 on success.
+ *
+ * @r is not used in this function.
+ */
+static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta)
+{
+	int i;
+
+	/* Attribute type i lives in slot i-1 */
+	for(i = 1; i <= RTA_MAX; i++) {
+		struct rtattr *attr = rta[i-1];
+		if (attr) {
+			if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2)
+				return -EINVAL;
+			if (i != RTA_MULTIPATH && i != RTA_METRICS &&
+			    i != RTA_TABLE)
+				rta[i-1] = (struct rtattr *)RTA_DATA(attr);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * RTM_DELROUTE handler: validate the attributes and forward the request
+ * to the delete method of the addressed routing table.
+ *
+ * Returns -EINVAL outside init_net or for bad attributes, -ESRCH when the
+ * table does not exist, otherwise the result of the table's delete method.
+ */
+static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct rtmsg *r = NLMSG_DATA(nlh);
+	struct dn_fib_table *tb;
+
+	if (!net_eq(sock_net(skb->sk), &init_net))
+		return -EINVAL;
+
+	if (dn_fib_check_attr(r, rta) != 0)
+		return -EINVAL;
+
+	/* Lookup only: a missing table is not created */
+	tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 0);
+	if (tb == NULL)
+		return -ESRCH;
+
+	return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
+}
+
+/*
+ * RTM_NEWROUTE handler: validate the attributes and forward the request
+ * to the insert method of the addressed routing table.
+ *
+ * Returns -EINVAL outside init_net or for bad attributes, -ENOBUFS when
+ * no table could be obtained, otherwise the result of the table's insert
+ * method.
+ */
+static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct rtmsg *r = NLMSG_DATA(nlh);
+	struct dn_fib_table *tb;
+
+	if (!net_eq(sock_net(skb->sk), &init_net))
+		return -EINVAL;
+
+	if (dn_fib_check_attr(r, rta) != 0)
+		return -EINVAL;
+
+	/* Second argument 1: presumably asks for the table to be created if missing */
+	tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 1);
+	if (tb == NULL)
+		return -ENOBUFS;
+
+	return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
+}
+
+/*
+ * Insert or delete a kernel-generated route for an interface address by
+ * building a synthetic route request and passing it to the table's
+ * insert/delete method.
+ *
+ * @cmd:     RTM_NEWROUTE to insert, anything else deletes
+ * @type:    route type; RTN_UNICAST goes to table RT_MIN_TABLE, every
+ *           other type to RT_TABLE_LOCAL
+ * @dst:     destination address
+ * @dst_len: destination prefix length
+ * @ifa:     address supplying the preferred source and output interface
+ *
+ * The result of the table operation is not checked.
+ */
+static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
+{
+	struct dn_fib_table *tb;
+	struct {
+		struct nlmsghdr nlh;
+		struct rtmsg rtm;
+	} req;
+	struct dn_kern_rta rta;
+
+	memset(&req.rtm, 0, sizeof(req.rtm));
+	memset(&rta, 0, sizeof(rta));
+
+	if (type == RTN_UNICAST)
+		tb = dn_fib_get_table(RT_MIN_TABLE, 1);
+	else
+		tb = dn_fib_get_table(RT_TABLE_LOCAL, 1);
+
+	if (tb == NULL)
+		return;
+
+	req.nlh.nlmsg_len = sizeof(req);
+	req.nlh.nlmsg_type = cmd;
+	req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
+	req.nlh.nlmsg_pid = 0;
+	req.nlh.nlmsg_seq = 0;
+
+	req.rtm.rtm_dst_len = dst_len;
+	req.rtm.rtm_table = tb->n;
+	req.rtm.rtm_protocol = RTPROT_KERNEL;
+	/* Local routes get host scope, everything else link scope */
+	req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
+	req.rtm.rtm_type = type;
+
+	/* These point at stack and ifa storage valid for the calls below */
+	rta.rta_dst = &dst;
+	rta.rta_prefsrc = &ifa->ifa_local;
+	rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
+
+	if (cmd == RTM_NEWROUTE)
+		tb->insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+	else
+		tb->delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+}
+
+/*
+ * Install the RTN_LOCAL route (prefix length 16) for a newly added
+ * interface address.
+ */
+static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa)
+{
+
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
+
+#if 0
+	if (!(dev->flags&IFF_UP))
+		return;
+	/* In the future, we will want to add default routes here */
+
+#endif
+}
+
+/*
+ * Handle removal of an interface address.
+ *
+ * If no device in init_net still carries the same local address, the
+ * RTN_LOCAL route for it is deleted.  If the address is then no longer
+ * classified as local, routes using it as preferred source are marked
+ * dead and the FIB is flushed when any were found.  Must be called with
+ * the RTNL lock held.
+ */
+static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa)
+{
+	int found_it = 0;
+	struct net_device *dev;
+	struct dn_dev *dn_db;
+	struct dn_ifaddr *ifa2;
+
+	ASSERT_RTNL();
+
+	/* Scan device list */
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		dn_db = rcu_dereference(dev->dn_ptr);
+		if (dn_db == NULL)
+			continue;
+		for (ifa2 = rcu_dereference(dn_db->ifa_list);
+		     ifa2 != NULL;
+		     ifa2 = rcu_dereference(ifa2->ifa_next)) {
+			if (ifa2->ifa_local == ifa->ifa_local) {
+				found_it = 1;
+				/* Leaves only the address loop; the device scan continues */
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	if (found_it == 0) {
+		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
+
+		if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+			if (dn_fib_sync_down(ifa->ifa_local, NULL, 0))
+				dn_fib_flush();
+		}
+	}
+}
+
+/*
+ * Stop routing through dev: mark the routes using it dead (flushing the
+ * FIB if any were affected), flush the route cache and take the device's
+ * neighbour entries down.  force is passed on to dn_fib_sync_down().
+ */
+static void dn_fib_disable_addr(struct net_device *dev, int force)
+{
+	if (dn_fib_sync_down(0, dev, force))
+		dn_fib_flush();
+	dn_rt_cache_flush(0);
+	neigh_ifdown(&dn_neigh_table, dev);
+}
+
+/*
+ * Address notifier callback; ptr is the struct dn_ifaddr concerned.
+ *
+ * NETDEV_UP:   add the local route, revive routes over the device and
+ *              flush the route cache.
+ * NETDEV_DOWN: remove the address from the FIB; if it was the last
+ *              address on its device, disable routing through the
+ *              device, otherwise just flush the route cache.
+ *
+ * Other events are ignored.  Always returns NOTIFY_DONE.
+ */
+static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr;
+
+	if (event == NETDEV_UP) {
+		dn_fib_add_ifaddr(ifa);
+		dn_fib_sync_up(ifa->ifa_dev->dev);
+		dn_rt_cache_flush(-1);
+	} else if (event == NETDEV_DOWN) {
+		dn_fib_del_ifaddr(ifa);
+		if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL)
+			dn_fib_disable_addr(ifa->ifa_dev->dev, 1);
+		else
+			dn_rt_cache_flush(-1);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Mark route infos dead after an address or device went away.
+ *
+ * @local: if non-zero, infos whose preferred source equals it are marked
+ *         dead
+ * @dev:   if non-NULL, next hops going through this device are marked
+ *         dead; an info dies once all of its next hops are dead
+ * @force: when set, next hops on dev are killed regardless of scope;
+ *         otherwise next hops with scope RT_SCOPE_NOWHERE are spared
+ *
+ * Returns the number of infos marked dead.
+ */
+static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
+{
+	int ret = 0;
+	int scope = RT_SCOPE_NOWHERE;
+
+	/* -1 is not expected to equal any nh_scope, so nothing is spared */
+	if (force)
+		scope = -1;
+
+	for_fib_info() {
+		/*
+		 * This makes no sense for DECnet.... we will almost
+		 * certainly have more than one local address the same
+		 * over all our interfaces. It needs thinking about
+		 * some more.
+		 */
+		if (local && fi->fib_prefsrc == local) {
+			fi->fib_flags |= RTNH_F_DEAD;
+			ret++;
+		} else if (dev && fi->fib_nhs) {
+			int dead = 0;
+
+			change_nexthops(fi) {
+				if (nh->nh_flags&RTNH_F_DEAD)
+					dead++;
+				else if (nh->nh_dev == dev &&
+						nh->nh_scope != scope) {
+					/* Return this next hop's unused budget to the total */
+					spin_lock_bh(&dn_fib_multipath_lock);
+					nh->nh_flags |= RTNH_F_DEAD;
+					fi->fib_power -= nh->nh_power;
+					nh->nh_power = 0;
+					spin_unlock_bh(&dn_fib_multipath_lock);
+					dead++;
+				}
+			} endfor_nexthops(fi)
+			if (dead == fi->fib_nhs) {
+				fi->fib_flags |= RTNH_F_DEAD;
+				ret++;
+			}
+		}
+	} endfor_fib_info();
+	return ret;
+}
+
+
+/*
+ * Revive dead next hops that go through dev after it came (back) up.
+ *
+ * A dead next hop is revived only if its device is dev, that device is
+ * up and has DECnet state attached.  An info with at least one live next
+ * hop gets RTNH_F_DEAD cleared.
+ *
+ * Returns the number of infos that have at least one live next hop, or 0
+ * straight away if dev is not up.
+ */
+static int dn_fib_sync_up(struct net_device *dev)
+{
+	int ret = 0;
+
+	if (!(dev->flags&IFF_UP))
+		return 0;
+
+	for_fib_info() {
+		int alive = 0;
+
+		change_nexthops(fi) {
+			/* Already live: just count it */
+			if (!(nh->nh_flags&RTNH_F_DEAD)) {
+				alive++;
+				continue;
+			}
+			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+				continue;
+			if (nh->nh_dev != dev || dev->dn_ptr == NULL)
+				continue;
+			alive++;
+			spin_lock_bh(&dn_fib_multipath_lock);
+			nh->nh_power = 0;
+			nh->nh_flags &= ~RTNH_F_DEAD;
+			spin_unlock_bh(&dn_fib_multipath_lock);
+		} endfor_nexthops(fi);
+
+		if (alive > 0) {
+			fi->fib_flags &= ~RTNH_F_DEAD;
+			ret++;
+		}
+	} endfor_fib_info();
+	return ret;
+}
+
+/* Notifier block that routes DECnet address events to dn_fib_dnaddr_event() */
+static struct notifier_block dn_fib_dnaddr_notifier = {
+	.notifier_call = dn_fib_dnaddr_event,
+};
+
+/*
+ * Tear down the FIB layer: clean up the tables and rules, then
+ * unregister the address notifier.
+ */
+void __exit dn_fib_cleanup(void)
+{
+	dn_fib_table_cleanup();
+	dn_fib_rules_cleanup();
+
+	unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier);
+}
+
+
+/*
+ * Set up the FIB layer: initialise the tables and rules, register the
+ * address notifier and install the RTM_NEWROUTE / RTM_DELROUTE netlink
+ * handlers for PF_DECnet.
+ */
+void __init dn_fib_init(void)
+{
+	dn_fib_table_init();
+	dn_fib_rules_init();
+
+	register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
+
+	rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL);
+	rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL);
+}
+
+
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
new file mode 100644
index 00000000..9810610d
--- /dev/null
+++ b/net/decnet/dn_neigh.c
@@ -0,0 +1,612 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Neighbour Functions (Adjacency Database and
+ *                                                        On-Ethernet Cache)
+ *
+ * Author:      Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ *     Steve Whitehouse     : Fixed router listing routine
+ *     Steve Whitehouse     : Added error_report functions
+ *     Steve Whitehouse     : Added default router detection
+ *     Steve Whitehouse     : Hop counts in outgoing messages
+ *     Steve Whitehouse     : Fixed src/dst in outgoing messages so
+ *                            forwarding now stands a good chance of
+ *                            working.
+ *     Steve Whitehouse     : Fixed neighbour states (for now anyway).
+ *     Steve Whitehouse     : Made error_report functions dummies. This
+ *                            is not the right place to return skbs.
+ *     Steve Whitehouse     : Convert to seq_file
+ *
+ */
+
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/string.h>
+#include <linux/netfilter_decnet.h>
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <asm/atomic.h>
+#include <net/net_namespace.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_neigh.h>
+#include <net/dn_route.h>
+
+/*
+ * Forward declarations: these are referenced by the neigh_ops tables and
+ * by dn_neigh_table below, ahead of their definitions.
+ */
+static int dn_neigh_construct(struct neighbour *);
+static void dn_long_error_report(struct neighbour *, struct sk_buff *);
+static void dn_short_error_report(struct neighbour *, struct sk_buff *);
+static int dn_long_output(struct sk_buff *);
+static int dn_short_output(struct sk_buff *);
+static int dn_phase3_output(struct sk_buff *);
+
+
+/*
+ * For talking to broadcast devices: Ethernet & PPP
+ *
+ * Neighbour operations using the long header format.  Selected by
+ * dn_neigh_construct() when the device's dn_dev has use_long set.
+ * Both output hooks go through dn_long_output().
+ */
+static const struct neigh_ops dn_long_ops = {
+	.family =		AF_DECnet,
+	.error_report =		dn_long_error_report,
+	.output =		dn_long_output,
+	.connected_output =	dn_long_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * For talking to pointopoint and multidrop devices: DDCMP and X.25
+ *
+ * Neighbour operations using the short header format.  Selected by
+ * dn_neigh_construct() when the device's dn_dev does not have use_long
+ * set.  Both output hooks go through dn_short_output().
+ */
+static const struct neigh_ops dn_short_ops = {
+	.family =		AF_DECnet,
+	.error_report =		dn_short_error_report,
+	.output =		dn_short_output,
+	.connected_output =	dn_short_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * For talking to DECnet phase III nodes
+ *
+ * Selected by dn_neigh_construct() for neighbours flagged DN_NDFLAG_P3,
+ * overriding the long/short choice made from the device.  Both output
+ * hooks go through dn_phase3_output().
+ */
+static const struct neigh_ops dn_phase3_ops = {
+	.family =		AF_DECnet,
+	.error_report =		dn_short_error_report, /* Can use short version here */
+	.output =		dn_phase3_output,
+	.connected_output =	dn_phase3_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit
+};
+
+/*
+ * Hash callback for dn_neigh_table.  The key is read as one 16-bit value
+ * and mixed with @hash_rnd; @dev does not contribute to the hash.
+ */
+static u32 dn_neigh_hash(const void *pkey,
+			 const struct net_device *dev,
+			 __u32 hash_rnd)
+{
+	const __u16 *addr = pkey;
+
+	return jhash_2words(*addr, 0, hash_rnd);
+}
+
+/*
+ * The DECnet neighbour table.  Entries are struct dn_neigh, keyed by a
+ * 16-bit little-endian value, hashed by dn_neigh_hash() and set up by
+ * dn_neigh_construct().  It is registered in dn_neigh_init() and torn
+ * down in dn_neigh_cleanup().
+ */
+struct neigh_table dn_neigh_table = {
+	.family =			PF_DECnet,
+	.entry_size =			sizeof(struct dn_neigh),
+	.key_len =			sizeof(__le16),
+	.hash =				dn_neigh_hash,
+	.constructor =			dn_neigh_construct,
+	.id =				"dn_neigh_cache",
+	/* Default parameters; time values are expressed in jiffies via HZ */
+	.parms ={
+		.tbl =			&dn_neigh_table,
+		.base_reachable_time =	30 * HZ,
+		.retrans_time =	1 * HZ,
+		.gc_staletime =	60 * HZ,
+		.reachable_time =		30 * HZ,
+		.delay_probe_time =	5 * HZ,
+		.queue_len =		3,
+		/* All probe counts and proxy/anycast settings are zero */
+		.ucast_probes =	0,
+		.app_probes =		0,
+		.mcast_probes =	0,
+		.anycast_delay =	0,
+		.proxy_delay =		0,
+		.proxy_qlen =		0,
+		.locktime =		1 * HZ,
+	},
+	.gc_interval =			30 * HZ,
+	.gc_thresh1 =			128,
+	.gc_thresh2 =			512,
+	.gc_thresh3 =			1024,
+};
+
+/*
+ * Constructor hook of dn_neigh_table.
+ *
+ * Replaces the entry's neighbour parameters with a clone of the ones
+ * held by the device's dn_dev, picks the neigh_ops matching the header
+ * format (long or short per device, phase III per neighbour flag), marks
+ * the entry NUD_NOARP and fills in the hardware address.
+ *
+ * Returns 0 on success, or -EINVAL when the device has no DECnet data,
+ * no neighbour parameters, or is of a hardware type this code cannot
+ * build an address for.
+ */
+static int dn_neigh_construct(struct neighbour *neigh)
+{
+	struct dn_neigh *dn = (struct dn_neigh *)neigh;
+	struct net_device *dev = neigh->dev;
+	struct neigh_parms *parms = NULL;
+	struct dn_dev *dn_db;
+
+	rcu_read_lock();
+	dn_db = rcu_dereference(dev->dn_ptr);
+	if (dn_db != NULL)
+		parms = dn_db->neigh_parms;
+	if (parms == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	/* Drop the parameters the entry came with and adopt the device's */
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
+
+	neigh->ops = dn_db->use_long ? &dn_long_ops : &dn_short_ops;
+	rcu_read_unlock();
+
+	/* A phase III neighbour overrides the per-device header choice */
+	if (dn->flags & DN_NDFLAG_P3)
+		neigh->ops = &dn_phase3_ops;
+
+	neigh->nud_state = NUD_NOARP;
+	neigh->output = neigh->ops->connected_output;
+
+	if (dev->type == ARPHRD_IPGRE || (dev->flags & IFF_POINTOPOINT)) {
+		memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+	} else if (dev->type == ARPHRD_ETHER || dev->type == ARPHRD_LOOPBACK) {
+		dn_dn2eth(neigh->ha, dn->addr);
+	} else {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "Trying to create neigh for hw %d\n",  dev->type);
+		return -EINVAL;
+	}
+
+	/*
+	 * Estimate the remote block size as the device mtu minus two.  That
+	 * holds for ethernet (and anything else carrying long format
+	 * headers), because there is an extra 16 bit length field which is
+	 * neither part of the ethernet header nor, according to the DECnet
+	 * specs, part of the DECnet routing header.
+	 *
+	 * Over-estimating is harmless: the NSP negotiation keeps us from
+	 * sending packets too large for the remote node, and the figure is
+	 * normally refreshed by a hello message anyway.
+	 */
+	dn->blksize = dev->mtu - 2;
+
+	return 0;
+}
+
+/*
+ * error_report hook of dn_long_ops.  Dummy implementation: logs that it
+ * was called and frees @skb; @neigh is not used.
+ */
+static void dn_long_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+	printk(KERN_DEBUG "dn_long_error_report: called\n");
+	kfree_skb(skb);
+}
+
+
+/*
+ * error_report hook of dn_short_ops and dn_phase3_ops.  Dummy
+ * implementation: logs that it was called and frees @skb; @neigh is not
+ * used.
+ */
+static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+	printk(KERN_DEBUG "dn_short_error_report: called\n");
+	kfree_skb(skb);
+}
+
+/*
+ * Final step of the output path, run as the continuation of the
+ * NF_DN_POST_ROUTING hook.  Builds the link level header (destination is
+ * the neighbour's hardware address, source is derived from the route's
+ * local source address) and queues the packet for transmission.
+ *
+ * Returns the queue_xmit() result, or -EINVAL (skb freed) when the link
+ * level header could not be built.
+ */
+static int dn_neigh_output_packet(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct dn_route *rt = (struct dn_route *)dst;
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	struct net_device *dev = neigh->dev;
+	char mac_addr[ETH_ALEN];
+	int hdr;
+
+	dn_dn2eth(mac_addr, rt->rt_local_src);
+	hdr = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha,
+			      mac_addr, skb->len);
+	if (hdr < 0) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "dn_neigh_output_packet: oops, can't send packet\n");
+
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return neigh->ops->queue_xmit(skb);
+}
+
+/*
+ * Output routine for neighbours that take the long routing header.
+ *
+ * Prepends a 16-bit little-endian length, one padding byte and a
+ * struct dn_long_packet filled in from the skb control block, then
+ * passes the packet through the NF_DN_POST_ROUTING hook with
+ * dn_neigh_output_packet() as the continuation.
+ *
+ * If the skb is short of headroom it is replaced by the result of
+ * skb_realloc_headroom() and the original is freed.
+ *
+ * Returns -ENOBUFS (skb freed) when that reallocation fails, otherwise
+ * the NF_HOOK() result.
+ */
+static int dn_long_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	struct net_device *dev = neigh->dev;
+	int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3;
+	unsigned char *data;
+	struct dn_long_packet *lp;
+	struct dn_skb_cb *cb;
+
+
+	if (skb_headroom(skb) < headroom) {
+		struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+		if (skb2 == NULL) {
+			if (net_ratelimit())
+				printk(KERN_CRIT "dn_long_output: no memory\n");
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+		kfree_skb(skb);
+		skb = skb2;
+		if (net_ratelimit())
+			printk(KERN_INFO "dn_long_output: Increasing headroom\n");
+	}
+
+	/*
+	 * Take the control block from the skb we are going to send.  It
+	 * must not be looked up before the headroom check: the skb may
+	 * have been replaced above and the old one freed, which would
+	 * leave a pointer derived from it dangling.
+	 */
+	cb = DN_SKB_CB(skb);
+
+	data = skb_push(skb, sizeof(struct dn_long_packet) + 3);
+	lp = (struct dn_long_packet *)(data+3);
+
+	/* The length field does not count its own two bytes */
+	*((__le16 *)data) = cpu_to_le16(skb->len - 2);
+	*(data + 2) = 1 | DN_RT_F_PF; /* Padding */
+
+	lp->msgflg   = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS));
+	lp->d_area   = lp->d_subarea = 0;
+	dn_dn2eth(lp->d_id, cb->dst);
+	lp->s_area   = lp->s_subarea = 0;
+	dn_dn2eth(lp->s_id, cb->src);
+	lp->nl2      = 0;
+	lp->visit_ct = cb->hops & 0x3f;
+	lp->s_class  = 0;
+	lp->pt       = 0;
+
+	skb_reset_network_header(skb);
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL,
+		       neigh->dev, dn_neigh_output_packet);
+}
+
+/*
+ * Output routine for neighbours that take the short routing header.
+ *
+ * Prepends a 16-bit little-endian length and a struct dn_short_packet
+ * filled in from the skb control block, then passes the packet through
+ * the NF_DN_POST_ROUTING hook with dn_neigh_output_packet() as the
+ * continuation.
+ *
+ * If the skb is short of headroom it is replaced by the result of
+ * skb_realloc_headroom() and the original is freed.
+ *
+ * Returns -ENOBUFS (skb freed) when that reallocation fails, otherwise
+ * the NF_HOOK() result.
+ */
+static int dn_short_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	struct net_device *dev = neigh->dev;
+	int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
+	struct dn_short_packet *sp;
+	unsigned char *data;
+	struct dn_skb_cb *cb;
+
+
+	if (skb_headroom(skb) < headroom) {
+		struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+		if (skb2 == NULL) {
+			if (net_ratelimit())
+				printk(KERN_CRIT "dn_short_output: no memory\n");
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+		kfree_skb(skb);
+		skb = skb2;
+		if (net_ratelimit())
+			printk(KERN_INFO "dn_short_output: Increasing headroom\n");
+	}
+
+	/*
+	 * Take the control block from the skb we are going to send.  It
+	 * must not be looked up before the headroom check: the skb may
+	 * have been replaced above and the old one freed, which would
+	 * leave a pointer derived from it dangling.
+	 */
+	cb = DN_SKB_CB(skb);
+
+	data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
+	/* The length field does not count its own two bytes */
+	*((__le16 *)data) = cpu_to_le16(skb->len - 2);
+	sp = (struct dn_short_packet *)(data+2);
+
+	sp->msgflg     = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
+	sp->dstnode    = cb->dst;
+	sp->srcnode    = cb->src;
+	sp->forward    = cb->hops & 0x3f;
+
+	skb_reset_network_header(skb);
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL,
+		       neigh->dev, dn_neigh_output_packet);
+}
+
+/*
+ * Phase 3 output is the same as short output, except that
+ * it clears the area bits before transmission.
+ *
+ * Only the low ten bits of the source and destination node addresses
+ * are kept.  If the skb is short of headroom it is replaced by the
+ * result of skb_realloc_headroom() and the original is freed.
+ *
+ * Returns -ENOBUFS (skb freed) when that reallocation fails, otherwise
+ * the NF_HOOK() result.
+ */
+static int dn_phase3_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	struct net_device *dev = neigh->dev;
+	int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
+	struct dn_short_packet *sp;
+	unsigned char *data;
+	struct dn_skb_cb *cb;
+
+	if (skb_headroom(skb) < headroom) {
+		struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
+		if (skb2 == NULL) {
+			if (net_ratelimit())
+				printk(KERN_CRIT "dn_phase3_output: no memory\n");
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+		kfree_skb(skb);
+		skb = skb2;
+		if (net_ratelimit())
+			printk(KERN_INFO "dn_phase3_output: Increasing headroom\n");
+	}
+
+	/*
+	 * Take the control block from the skb we are going to send.  It
+	 * must not be looked up before the headroom check: the skb may
+	 * have been replaced above and the old one freed, which would
+	 * leave a pointer derived from it dangling.
+	 */
+	cb = DN_SKB_CB(skb);
+
+	data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
+	/* The length field does not count its own two bytes */
+	*((__le16 *)data) = cpu_to_le16(skb->len - 2);
+	sp = (struct dn_short_packet *)(data + 2);
+
+	sp->msgflg   = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
+	sp->dstnode  = cb->dst & cpu_to_le16(0x03ff);
+	sp->srcnode  = cb->src & cpu_to_le16(0x03ff);
+	sp->forward  = cb->hops & 0x3f;
+
+	skb_reset_network_header(skb);
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING, skb, NULL,
+		       neigh->dev, dn_neigh_output_packet);
+}
+
+/*
+ * Unfortunately, the neighbour code uses the device in its hash
+ * function, so we don't get any advantage from it. This function
+ * basically does a neigh_lookup(), but without comparing the device
+ * field. This is required for the On-Ethernet cache
+ */
+
+/*
+ * Pointopoint link receives a hello message
+ *
+ * Currently a stub: the message is not interpreted, @skb is simply
+ * freed.
+ */
+void dn_neigh_pointopoint_hello(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+/*
+ * Ethernet router hello message received
+ *
+ * Looks up (creating if necessary) the neighbour entry for the sender
+ * and, unless the entry is permanent, refreshes its hardware address,
+ * block size, priority and router level flags from the message.  A
+ * sender in our own area also becomes the device's designated router if
+ * there is none yet or if its priority beats the current one.
+ *
+ * Always consumes @skb and returns 0.
+ *
+ * NOTE(review): the message is read straight from skb->data; any length
+ * validation is presumably done by the caller - confirm.
+ */
+int dn_neigh_router_hello(struct sk_buff *skb)
+{
+	struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data;
+	__le16 src = dn_eth2dn(msg->id);
+	struct neighbour *neigh;
+	struct dn_neigh *dn;
+	struct dn_dev *dn_db;
+
+	neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
+	if (neigh == NULL)
+		goto out;
+
+	dn = (struct dn_neigh *)neigh;
+
+	write_lock(&neigh->lock);
+
+	neigh->used = jiffies;
+	dn_db = rcu_dereference(neigh->dev->dn_ptr);
+
+	if (!(neigh->nud_state & NUD_PERMANENT)) {
+		neigh->updated = jiffies;
+
+		if (neigh->dev->type == ARPHRD_ETHER)
+			memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
+
+		dn->blksize  = le16_to_cpu(msg->blksize);
+		dn->priority = msg->priority;
+
+		dn->flags &= ~DN_NDFLAG_P3;
+
+		switch (msg->iinfo & DN_RT_INFO_TYPE) {
+		case DN_RT_INFO_L1RT:
+			dn->flags &= ~DN_NDFLAG_R2;
+			dn->flags |= DN_NDFLAG_R1;
+			break;
+		case DN_RT_INFO_L2RT:
+			dn->flags |= DN_NDFLAG_R2;
+			break;
+		}
+	}
+
+	/* Only use routers in our area (the bits above the low ten) */
+	if ((le16_to_cpu(src) >> 10) == (le16_to_cpu((decnet_address)) >> 10)) {
+		if (!dn_db->router)
+			dn_db->router = neigh_clone(neigh);
+		else if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
+			neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+	}
+
+	write_unlock(&neigh->lock);
+	neigh_release(neigh);
+out:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Endnode hello message received
+ *
+ * Looks up (creating if necessary) the neighbour entry for the sender
+ * and, unless the entry is permanent, refreshes its hardware address and
+ * block size, clears both router level flags and resets the priority
+ * to zero.
+ *
+ * Always consumes @skb and returns 0.
+ */
+int dn_neigh_endnode_hello(struct sk_buff *skb)
+{
+	struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data;
+	__le16 src = dn_eth2dn(msg->id);
+	struct neighbour *neigh;
+	struct dn_neigh *dn;
+
+	neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
+	if (neigh == NULL)
+		goto out;
+
+	dn = (struct dn_neigh *)neigh;
+
+	write_lock(&neigh->lock);
+
+	neigh->used = jiffies;
+
+	if ((neigh->nud_state & NUD_PERMANENT) == 0) {
+		neigh->updated = jiffies;
+
+		if (neigh->dev->type == ARPHRD_ETHER)
+			memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
+		dn->flags   &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2);
+		dn->blksize  = le16_to_cpu(msg->blksize);
+		dn->priority = 0;
+	}
+
+	write_unlock(&neigh->lock);
+	neigh_release(neigh);
+out:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Find the entry of a router list that a new router may replace.
+ *
+ * @base points at @max consecutive 7-byte entries, each a 6-byte id
+ * followed by one info byte.  The entry with the smallest info byte is
+ * located; it is returned only if that byte is below @priority.
+ *
+ * All comparisons are done on unsigned bytes.  Reading the info byte
+ * through plain char made the scan treat values with bit 7 set as
+ * negative wherever char is signed, while the final test was already
+ * unsigned, so the two steps could disagree.
+ *
+ * Returns a pointer to the start of the replaceable entry, or NULL when
+ * @max is not positive or no entry ranks below @priority.
+ */
+static char *dn_find_slot(char *base, int max, int priority)
+{
+	unsigned char *prio = (unsigned char *)base + 6; /* skip first id */
+	unsigned char *min = NULL;
+	int i;
+
+	/* Step from one info byte to the next */
+	for (i = 0; i < max; i++, prio += 7) {
+		if (!min || (*prio < *min))
+			min = prio;
+	}
+
+	if (!min)
+		return NULL;
+
+	return (*min < priority) ? (char *)(min - 6) : NULL;
+}
+
+/* State shared between dn_neigh_elist() and its per-neighbour callback. */
+struct elist_cb_state {
+	struct net_device *dev;	/* only neighbours on this device are listed */
+	unsigned char *ptr;	/* start of the output list */
+	unsigned char *rs;	/* where the next entry is written, or NULL */
+	int t, n;		/* entries used so far, and list capacity */
+};
+
+/*
+ * neigh_for_each() callback behind dn_neigh_elist().
+ *
+ * Skips neighbours on other devices and neighbours that are not flagged
+ * as level 1 or level 2 routers.  While the list still has room the
+ * router is appended; once it is full, dn_find_slot() decides whether
+ * an existing entry is to be overwritten.  An entry is the 6-byte id
+ * followed by one byte holding the priority, with 0x80 or-ed in when
+ * the neighbour is in a NUD_CONNECTED state.
+ */
+static void neigh_elist_cb(struct neighbour *neigh, void *_info)
+{
+	struct elist_cb_state *s = _info;
+	struct dn_neigh *dn = (struct dn_neigh *) neigh;
+	unsigned char info;
+
+	if (neigh->dev != s->dev)
+		return;
+
+	if ((dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2)) == 0)
+		return;
+
+	if (s->t != s->n)
+		s->t++;
+	else
+		s->rs = dn_find_slot(s->ptr, s->n, dn->priority);
+
+	if (s->rs == NULL)
+		return;
+
+	dn_dn2eth(s->rs, dn->addr);
+
+	info = (neigh->nud_state & NUD_CONNECTED) ? 0x80 : 0x0;
+	info |= dn->priority;
+	s->rs[6] = info;
+
+	/* Leave the cursor on the entry that follows */
+	s->rs += 7;
+}
+
+/*
+ * Fill @ptr with up to @n router entries for @dev by running
+ * neigh_elist_cb() over every entry of the DECnet neighbour table.
+ *
+ * Returns the number of entries in use.
+ */
+int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n)
+{
+	struct elist_cb_state state = {
+		.dev = dev,
+		.ptr = ptr,
+		.rs  = ptr,
+		.t   = 0,
+		.n   = n,
+	};
+
+	neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state);
+
+	return state.t;
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Print one neighbour as a line of the decnet_neigh proc file: address,
+ * the three flag columns, state, reference count, block size and device
+ * name.  The entry is read under its lock.
+ */
+static inline void dn_neigh_format_entry(struct seq_file *seq,
+					 struct neighbour *n)
+{
+	struct dn_neigh *dn = (struct dn_neigh *) n;
+	char buf[DN_ASCBUF_LEN];
+	const char *l1, *l2, *p3, *ifname;
+
+	read_lock(&n->lock);
+
+	/* One column per flag, "-" when the flag is clear */
+	l1 = (dn->flags&DN_NDFLAG_R1) ? "1" : "-";
+	l2 = (dn->flags&DN_NDFLAG_R2) ? "2" : "-";
+	p3 = (dn->flags&DN_NDFLAG_P3) ? "3" : "-";
+	ifname = (dn->n.dev) ? dn->n.dev->name : "?";
+
+	seq_printf(seq, "%-7s %s%s%s   %02x    %02d  %07ld %-8s\n",
+		   dn_addr2asc(le16_to_cpu(dn->addr), buf),
+		   l1, l2, p3,
+		   dn->n.nud_state,
+		   atomic_read(&dn->n.refcnt),
+		   dn->blksize,
+		   ifname);
+
+	read_unlock(&n->lock);
+}
+
+/*
+ * seq_file show callback: emits the column header for the start token
+ * and one formatted neighbour line for anything else.  Always returns 0.
+ */
+static int dn_neigh_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN)
+		dn_neigh_format_entry(seq, v);
+	else
+		seq_puts(seq, "Addr    Flags State Use Blksize Dev\n");
+
+	return 0;
+}
+
+/*
+ * seq_file start callback: delegates to the generic neighbour iterator
+ * on dn_neigh_table, asking for neighbour entries only.
+ */
+static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return neigh_seq_start(seq, pos, &dn_neigh_table,
+			       NEIGH_SEQ_NEIGH_ONLY);
+}
+
+/*
+ * Iterator for the decnet_neigh proc file: start and show are local,
+ * next and stop are the generic neighbour helpers.
+ */
+static const struct seq_operations dn_neigh_seq_ops = {
+	.start = dn_neigh_seq_start,
+	.next  = neigh_seq_next,
+	.stop  = neigh_seq_stop,
+	.show  = dn_neigh_seq_show,
+};
+
+/*
+ * open() handler of the decnet_neigh proc file: opens a seq_file using
+ * dn_neigh_seq_ops with a struct neigh_seq_state as private data.
+ */
+static int dn_neigh_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &dn_neigh_seq_ops,
+			    sizeof(struct neigh_seq_state));
+}
+
+/*
+ * File operations of the decnet_neigh proc file; everything except
+ * open() is a stock seq_file helper.
+ */
+static const struct file_operations dn_neigh_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= dn_neigh_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+#endif
+
+/*
+ * Module init hook: registers the DECnet neighbour table and creates
+ * the world-readable "decnet_neigh" proc entry in init_net.
+ *
+ * NOTE(review): dn_neigh_seq_fops is only defined under CONFIG_PROC_FS;
+ * this presumably relies on proc_net_fops_create() dropping its
+ * arguments when procfs is disabled - confirm.
+ */
+void __init dn_neigh_init(void)
+{
+	neigh_table_init(&dn_neigh_table);
+	proc_net_fops_create(&init_net, "decnet_neigh", S_IRUGO, &dn_neigh_seq_fops);
+}
+
+/*
+ * Module exit hook: removes the "decnet_neigh" proc entry first, then
+ * clears the DECnet neighbour table.
+ */
+void __exit dn_neigh_cleanup(void)
+{
+	proc_net_remove(&init_net, "decnet_neigh");
+	neigh_table_clear(&dn_neigh_table);
+}
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
new file mode 100644
index 00000000..b430549e
--- /dev/null
+++ b/net/decnet/dn_nsp_in.c
@@ -0,0 +1,912 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Network Services Protocol (Input)
+ *
+ * Author:      Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ *
+ *    Steve Whitehouse:  Split into dn_nsp_in.c and dn_nsp_out.c from
+ *                       original dn_nsp.c.
+ *    Steve Whitehouse:  Updated to work with my new routing architecture.
+ *    Steve Whitehouse:  Add changes from Eduardo Serrat's patches.
+ *    Steve Whitehouse:  Put all ack handling code in a common routine.
+ *    Steve Whitehouse:  Put other common bits into dn_nsp_rx()
+ *    Steve Whitehouse:  More checks on skb->len to catch bogus packets
+ *                       Fixed various race conditions and possible nasties.
+ *    Steve Whitehouse:  Now handles returned conninit frames.
+ *     David S. Miller:  New socket locking
+ *    Steve Whitehouse:  Fixed lockup when socket filtering was enabled.
+ *         Paul Koning:  Fix to push CC sockets into RUN when acks are
+ *                       received.
+ *    Steve Whitehouse:
+ *   Patrick Caulfield:  Checking conninits for correctness & sending of error
+ *                       responses.
+ *    Steve Whitehouse:  Added backlog congestion level return codes.
+ *   Patrick Caulfield:
+ *    Steve Whitehouse:  Added flow control support (outbound)
+ *    Steve Whitehouse:  Prepare for nonlinear skbs
+ */
+
+/******************************************************************************
+    (c) 1995-1998 E.M. Serrat		emserrat@geocities.com
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+*******************************************************************************/
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/termios.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_decnet.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/dn.h>
+#include <net/dn_nsp.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+extern int decnet_log_martians;
+
+/*
+ * Log a malformed ("martian") packet together with the addresses and
+ * ports stored in its control block.  Nothing is printed unless
+ * decnet_log_martians is set, and output is rate limited.
+ */
+static void dn_log_martian(struct sk_buff *skb, const char *msg)
+{
+	struct dn_skb_cb *cb;
+	char *devname;
+
+	if (!decnet_log_martians || !net_ratelimit())
+		return;
+
+	cb = DN_SKB_CB(skb);
+	devname = skb->dev ? skb->dev->name : "???";
+
+	printk(KERN_INFO "DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n",
+	       msg, devname, le16_to_cpu(cb->src), le16_to_cpu(cb->dst),
+	       le16_to_cpu(cb->src_port), le16_to_cpu(cb->dst_port));
+}
+
+/*
+ * Apply one acknowledgement word to the socket.
+ *
+ * The caller has already flipped the cross-subchannel bit for otherdata
+ * and linkservice messages, so bits 12-13 of @ack say directly what to
+ * update: 0 acks the data subchannel, 2 acks the other-data subchannel,
+ * 1 and 3 are NAKs and are ignored.  An ack that moves the relevant
+ * counter forward also prunes the matching transmit queue, and the
+ * socket is notified if that reports something worth waking up for.
+ */
+static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned short type = ((ack >> 12) & 0x0003);
+	int wakeup = 0;
+
+	if (type == 0) {
+		/* ACK - Data */
+		if (dn_after(ack, scp->ackrcv_dat)) {
+			scp->ackrcv_dat = ack & 0x0fff;
+			wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->data_xmit_queue, ack);
+		}
+	} else if (type == 2) {
+		/* ACK - OtherData */
+		if (dn_after(ack, scp->ackrcv_oth)) {
+			scp->ackrcv_oth = ack & 0x0fff;
+			wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->other_xmit_queue, ack);
+		}
+	}
+	/* Types 1 and 3 (NAK - Data / NAK - OtherData) need no action */
+
+	if (wakeup && !sock_flag(sk, SOCK_DEAD))
+		sk->sk_state_change(sk);
+}
+
+/*
+ * This function is a universal ack processor.
+ *
+ * A message may start with up to two 16-bit little-endian ack words,
+ * each recognised by its top bit.  Every ack word found is pulled off
+ * the skb; unless bit 0x4000 is set it is passed to dn_ack(), with the
+ * cross-subchannel bit (0x2000) flipped first when @oth is non-zero.
+ *
+ * Returns the number of bytes removed from the skb (0, 2 or 4).
+ */
+static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth)
+{
+	__le16 *ptr = (__le16 *)skb->data;
+	int len = 0;
+	int pass;
+
+	for (pass = 0; pass < 2; pass++) {
+		unsigned short ack;
+
+		if (skb->len < 2)
+			break;
+
+		ack = le16_to_cpu(*ptr);
+		if (!(ack & 0x8000))
+			continue;
+
+		skb_pull(skb, 2);
+		ptr++;
+		len += 2;
+
+		if (ack & 0x4000)
+			continue;
+
+		if (oth)
+			ack ^= 0x2000;
+		dn_ack(sk, skb, ack);
+	}
+
+	return len;
+}
+
+
+/**
+ * dn_check_idf - Check an image data field format is correct.
+ * @pptr: Pointer to pointer to image data
+ * @len: Pointer to length of image data
+ * @max: The maximum allowed length of the data in the image data field
+ * @follow_on: Check that this many bytes exist beyond the end of the image data
+ *
+ * An image data field is one length byte followed by that many bytes of
+ * data.  On success *@pptr is moved past the field and *@len is reduced
+ * by the bytes consumed.  If no byte is left for the length itself, the
+ * function fails without reading or changing anything.
+ *
+ * Returns: 0 if ok, -1 on error
+ */
+static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on)
+{
+	unsigned char *ptr = *pptr;
+	unsigned char flen;
+
+	/* The length byte must exist before it may be read */
+	if (*len < 1)
+		return -1;
+
+	flen = *ptr++;
+
+	(*len)--;
+	if (flen > max)
+		return -1;
+	if ((flen + follow_on) > *len)
+		return -1;
+
+	*len -= flen;
+	*pptr = ptr + flen;
+	return 0;
+}
+
+/*
+ * Table of reason codes to pass back to a node which sent us a badly
+ * formed message, plus text messages for the log. A zero entry in
+ * the reason field means "don't reply"; otherwise a disc init is sent with
+ * the specified reason code.
+ *
+ * The table is indexed by the step counter that dn_find_listener()
+ * advances before each check, so the order of the entries must match
+ * the order of the checks in that function.
+ */
+static struct {
+	unsigned short reason;
+	const char *text;
+} ci_err_table[] = {
+ { 0,             "CI: Truncated message" },
+ { NSP_REASON_ID, "CI: Destination username error" },
+ { NSP_REASON_ID, "CI: Destination username type" },
+ { NSP_REASON_US, "CI: Source username error" },
+ { 0,             "CI: Truncated at menuver" },
+ { 0,             "CI: Truncated before access or user data" },
+ { NSP_REASON_IO, "CI: Access data format error" },
+ { NSP_REASON_IO, "CI: User data format error" }
+};
+
+/*
+ * This function uses a slightly different lookup method
+ * to find its sockets, since it searches on object name/number
+ * rather than port numbers. Various tests are done to ensure that
+ * the incoming data is in the correct format before it is queued to
+ * a socket.
+ *
+ * On a format error the packet is logged as a martian, *@reason is set
+ * from ci_err_table[] (zero meaning "do not reply") and NULL is
+ * returned.  Otherwise the result of the listener lookup on the
+ * destination end username is returned.
+ */
+static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct nsp_conn_init_msg *msg;
+	struct sockaddr_dn dstaddr;
+	struct sockaddr_dn srcaddr;
+	unsigned char type = 0;
+	int dstlen;
+	int srclen;
+	unsigned char *ptr;
+	int len;
+	int err = 0;	/* index into ci_err_table[], bumped before each check */
+	unsigned char menuver;
+
+	memset(&dstaddr, 0, sizeof(struct sockaddr_dn));
+	memset(&srcaddr, 0, sizeof(struct sockaddr_dn));
+
+	/*
+	 * 1. Decode & remove message header
+	 *
+	 * The header must be known to be present in the linear data area
+	 * before any of its fields is read, and the pointer is taken only
+	 * afterwards because pskb_may_pull() may move the data.
+	 */
+	if (!pskb_may_pull(skb, sizeof(*msg)))
+		goto err_out;
+
+	msg = (struct nsp_conn_init_msg *)skb->data;
+
+	cb->src_port = msg->srcaddr;
+	cb->dst_port = msg->dstaddr;
+	cb->services = msg->services;
+	cb->info     = msg->info;
+	cb->segsize  = le16_to_cpu(msg->segsize);
+
+	skb_pull(skb, sizeof(*msg));
+
+	len = skb->len;
+	ptr = skb->data;
+
+	/*
+	 * 2. Check destination end username format
+	 */
+	dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type);
+	err++;
+	if (dstlen < 0)
+		goto err_out;
+
+	err++;
+	if (type > 1)
+		goto err_out;
+
+	len -= dstlen;
+	ptr += dstlen;
+
+	/*
+	 * 3. Check source end username format
+	 */
+	srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type);
+	err++;
+	if (srclen < 0)
+		goto err_out;
+
+	len -= srclen;
+	ptr += srclen;
+	err++;
+	if (len < 1)
+		goto err_out;
+
+	menuver = *ptr;
+	ptr++;
+	len--;
+
+	/*
+	 * 4. Check that optional data actually exists if menuver says it does
+	 */
+	err++;
+	if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1))
+		goto err_out;
+
+	/*
+	 * 5. Check optional access data format
+	 *
+	 * Three image fields; each check also demands one byte beyond the
+	 * field whenever another field has to follow it.
+	 */
+	err++;
+	if (menuver & DN_MENUVER_ACC) {
+		if (dn_check_idf(&ptr, &len, 39, 1))
+			goto err_out;
+		if (dn_check_idf(&ptr, &len, 39, 1))
+			goto err_out;
+		if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0))
+			goto err_out;
+	}
+
+	/*
+	 * 6. Check optional user data format
+	 */
+	err++;
+	if (menuver & DN_MENUVER_USR) {
+		if (dn_check_idf(&ptr, &len, 16, 0))
+			goto err_out;
+	}
+
+	/*
+	 * 7. Look up socket based on destination end username
+	 */
+	return dn_sklist_find_listener(&dstaddr);
+err_out:
+	dn_log_martian(skb, ci_err_table[err].text);
+	*reason = ci_err_table[err].reason;
+	return NULL;
+}
+
+
+/*
+ * Queue an incoming connect init on a listening socket.  The skb is
+ * dropped when the accept queue is full; otherwise it is appended to the
+ * receive queue, the backlog count is raised and the socket is told
+ * about the state change.
+ */
+static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb)
+{
+	if (!sk_acceptq_is_full(sk)) {
+		sk->sk_ack_backlog++;
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		sk->sk_state_change(sk);
+		return;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ * Handle a connect confirm message.
+ *
+ * The message carries services, info and a 16-bit segment size (four
+ * bytes in all), optionally followed by connect data laid out as one
+ * length byte plus up to 16 bytes.  When the socket is in the CI or CD
+ * state it moves to RUN, the remote parameters are recorded, a link
+ * service message is sent and the socket is woken up.
+ *
+ * The four header bytes are pulled before the optional data is parsed,
+ * in the same way dn_nsp_disc_init() pulls its reason field; otherwise
+ * the services byte would be taken for the connect data length.  The
+ * data is only copied when the length byte and all of the data lie
+ * inside the packet.
+ *
+ * Always consumes @skb.
+ */
+static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned char *ptr;
+
+	if (skb->len < 4)
+		goto out;
+
+	ptr = skb->data;
+	cb->services = *ptr++;
+	cb->info = *ptr++;
+	cb->segsize = le16_to_cpu(*(__le16 *)ptr);
+	skb_pull(skb, 4);
+
+	if ((scp->state == DN_CI) || (scp->state == DN_CD)) {
+		scp->persist = 0;
+		scp->addrrem = cb->src_port;
+		sk->sk_state = TCP_ESTABLISHED;
+		scp->state = DN_RUN;
+		scp->services_rem = cb->services;
+		scp->info_rem = cb->info;
+		scp->segsize_rem = cb->segsize;
+
+		if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
+			scp->max_window = decnet_no_fc_max_cwnd;
+
+		if (skb->len > 0) {
+			u16 dlen = *skb->data;
+			/* Length byte plus dlen data bytes must fit: dlen + 1 <= len */
+			if ((dlen <= 16) && (dlen < skb->len)) {
+				scp->conndata_in.opt_optl = cpu_to_le16(dlen);
+				skb_copy_from_linear_data_offset(skb, 1,
+					      scp->conndata_in.opt_data, dlen);
+			}
+		}
+		dn_nsp_send_link(sk, DN_NOCHANGE, 0);
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+	}
+
+out:
+	kfree_skb(skb);
+}
+
+/*
+ * Handle a connect ack: a socket in the CI state moves on to CD and has
+ * its persist value cleared.  Always consumes @skb.
+ */
+static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->state == DN_CI) {
+		scp->state = DN_CD;
+		scp->persist = 0;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ * Handle a disconnect init message.
+ *
+ * Records the 16-bit reason and any optional disconnect data (one
+ * length byte plus up to 16 bytes), closes the socket, advances the
+ * DECnet state machine, replies with a disconnect confirm when the
+ * remote port is known and arms the destroy timer.
+ *
+ * The optional data is only copied when the length byte and all of the
+ * data lie inside the packet.
+ *
+ * Always consumes @skb.
+ */
+static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	unsigned short reason;
+
+	if (skb->len < 2)
+		goto out;
+
+	reason = le16_to_cpu(*(__le16 *)skb->data);
+	skb_pull(skb, 2);
+
+	scp->discdata_in.opt_status = cpu_to_le16(reason);
+	scp->discdata_in.opt_optl   = 0;
+	memset(scp->discdata_in.opt_data, 0, 16);
+
+	if (skb->len > 0) {
+		u16 dlen = *skb->data;
+		/* Length byte plus dlen data bytes must fit: dlen + 1 <= len */
+		if ((dlen <= 16) && (dlen < skb->len)) {
+			scp->discdata_in.opt_optl = cpu_to_le16(dlen);
+			skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen);
+		}
+	}
+
+	scp->addrrem = cb->src_port;
+	sk->sk_state = TCP_CLOSE;
+
+	switch(scp->state) {
+		case DN_CI:
+		case DN_CD:
+			scp->state = DN_RJ;
+			sk->sk_err = ECONNREFUSED;
+			break;
+		case DN_RUN:
+			sk->sk_shutdown |= SHUTDOWN_MASK;
+			scp->state = DN_DN;
+			break;
+		case DN_DI:
+			scp->state = DN_DIC;
+			break;
+	}
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		if (sk->sk_socket->state != SS_UNCONNECTED)
+			sk->sk_socket->state = SS_DISCONNECTING;
+		sk->sk_state_change(sk);
+	}
+
+	/*
+	 * It appears that it is possible for remote machines to send disc
+	 * init messages with no port identifier if we are in the CI and
+	 * possibly also the CD state. Obviously we shouldn't reply with
+	 * a message if we don't know what the end point is.
+	 */
+	if (scp->addrrem) {
+		dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC);
+	}
+	scp->persist_fxn = dn_destroy_timer;
+	scp->persist = dn_nsp_persist(sk);
+
+out:
+	kfree_skb(skb);
+}
+
+/*
+ * disc_conf messages are also called no_resources or no_link
+ * messages depending upon the "reason" field.
+ *
+ * The message must consist of exactly the 16-bit reason.  The socket is
+ * closed, the DECnet state is advanced according to the current state
+ * (and, in the DR state, the reason) and the destroy timer is armed.
+ *
+ * Always consumes @skb.
+ */
+static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned short reason;
+
+	if (skb->len != 2)
+		goto out;
+
+	reason = le16_to_cpu(*(__le16 *)skb->data);
+
+	sk->sk_state = TCP_CLOSE;
+
+	switch (scp->state) {
+	case DN_CI:
+		scp->state = DN_NR;
+		break;
+	case DN_DR:
+		if (reason == NSP_REASON_DC)
+			scp->state = DN_DRC;
+		if (reason == NSP_REASON_NL)
+			scp->state = DN_CN;
+		break;
+	case DN_DI:
+		scp->state = DN_DIC;
+		break;
+	case DN_RUN:
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+		/* fall through - RUN ends up in CN just like CC */
+	case DN_CC:
+		scp->state = DN_CN;
+		break;
+	default:
+		break;
+	}
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		struct socket *sock = sk->sk_socket;
+
+		if (sock->state != SS_UNCONNECTED)
+			sock->state = SS_DISCONNECTING;
+		sk->sk_state_change(sk);
+	}
+
+	scp->persist_fxn = dn_destroy_timer;
+	scp->persist = dn_nsp_persist(sk);
+
+out:
+	kfree_skb(skb);
+}
+
+/*
+ * Handle a link service message: exactly four bytes holding a 16-bit
+ * segment number, a flags byte and a signed flow control value.
+ *
+ * If the segment is the next one expected on the other-data subchannel
+ * the flow control request is applied: a normal request adjusts the
+ * remote data count or switches outgoing data off/on, an interrupt
+ * request raises the remote other-data count.  An other-data ack is
+ * sent for every well formed message.
+ *
+ * Always consumes @skb.
+ */
+static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned char fctype = scp->services_rem & NSP_FC_MASK;
+	char *ptr = skb->data;
+	unsigned short segnum;
+	unsigned char lsflags;
+	signed char fcval;
+	int wake_up = 0;
+
+	if (skb->len != 4)
+		goto out;
+
+	segnum = le16_to_cpu(*(__le16 *)ptr);
+	lsflags = *(unsigned char *)(ptr + 2);
+	fcval = *(ptr + 3);
+
+	/*
+	 * Here we ignore erroneous packets which really
+	 * should cause a connection abort. It is not critical
+	 * for now though.
+	 */
+	if (lsflags & 0xf8)
+		goto out;
+
+	if (seq_next(scp->numoth_rcv, segnum)) {
+		seq_add(&scp->numoth_rcv, 1);
+
+		if (lsflags & 0x04) {
+			/* FCVAL INT: Interrupt Request */
+			if (fcval > 0) {
+				scp->flowrem_oth += fcval;
+				wake_up = 1;
+			}
+		} else {
+			/* FCVAL INT: Normal Request, decode FCVAL MOD */
+			switch (lsflags & 0x03) {
+			case 0x00: /* Request count */
+				if (fcval < 0) {
+					unsigned char p_fcval = -fcval;
+					if ((scp->flowrem_dat > p_fcval) &&
+					    (fctype == NSP_FC_SCMC)) {
+						scp->flowrem_dat -= p_fcval;
+					}
+				} else if (fcval > 0) {
+					scp->flowrem_dat += fcval;
+					wake_up = 1;
+				}
+				break;
+			case 0x01: /* Stop outgoing data */
+				scp->flowrem_sw = DN_DONTSEND;
+				break;
+			case 0x02: /* Ok to start again */
+				scp->flowrem_sw = DN_SEND;
+				dn_nsp_output(sk);
+				wake_up = 1;
+				break;
+			}
+		}
+
+		if (wake_up && !sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+	}
+
+	dn_nsp_send_oth_ack(sk);
+
+out:
+	kfree_skb(skb);
+}
+
+/*
+ * Copy of sock_queue_rcv_skb (from sock.h) without
+ * bh_lock_sock() (it is already held when this is called) which
+ * also allows data and other data to be queued to a socket.
+ *
+ * Returns 0 once @skb has been charged to @sk and appended to @queue,
+ * -ENOMEM when the receive buffer limit would be reached, or the error
+ * reported by the socket filter.  On failure the skb is left for the
+ * caller to free.  @sig is not used.
+ */
+static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue)
+{
+	int skb_len;
+	int err;
+
+	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
+	   number of warnings when compiling with -W --ANK
+	 */
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	err = sk_filter(sk, skb);
+	if (err)
+		return err;
+
+	/* Sample the length before the skb is handed over to the queue */
+	skb_len = skb->len;
+	skb_set_owner_r(skb, sk);
+	skb_queue_tail(queue, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb_len);
+
+	return 0;
+}
+
+/*
+ * Receive an "other data" segment. The leading two bytes are the
+ * little endian segment number, which is recorded in the control block
+ * and pulled off. If this is the next expected segment and
+ * dn_queue_skb() accepts it, numoth_rcv is advanced and other_report
+ * is cleared. An other-data ack is sent for everything except packets
+ * shorter than two bytes. The skb is freed unless it was queued.
+ */
+static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned short segnum;
+	int queued;
+
+	if (skb->len < 2) {
+		kfree_skb(skb);
+		return;
+	}
+
+	segnum = le16_to_cpu(*(__le16 *)skb->data);
+	cb->segnum = segnum;
+	skb_pull(skb, 2);
+
+	queued = seq_next(scp->numoth_rcv, segnum) &&
+		 dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0;
+	if (queued) {
+		seq_add(&scp->numoth_rcv, 1);
+		scp->other_report = 0;
+	}
+
+	dn_nsp_send_oth_ack(sk);
+
+	if (!queued)
+		kfree_skb(skb);
+}
+
+/*
+ * Receive a data segment. The leading two bytes are the little endian
+ * segment number, which is recorded in the control block and pulled
+ * off. The next expected segment is offered to dn_queue_skb() for
+ * sk_receive_queue and numdat_rcv is advanced if it was accepted.
+ * For an in-sequence segment, if local flow control is DN_SEND and
+ * dn_congested() reports congestion, flow control is switched to
+ * DN_DONTSEND and a link service message is sent. A data ack is sent
+ * for everything except packets shorter than two bytes. The skb is
+ * freed unless it was queued.
+ */
+static void dn_nsp_data(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	unsigned short segnum;
+	int queued = 0;
+
+	if (skb->len < 2) {
+		kfree_skb(skb);
+		return;
+	}
+
+	segnum = le16_to_cpu(*(__le16 *)skb->data);
+	cb->segnum = segnum;
+	skb_pull(skb, 2);
+
+	if (seq_next(scp->numdat_rcv, segnum)) {
+		queued = (dn_queue_skb(sk, skb, SIGIO,
+				       &sk->sk_receive_queue) == 0);
+		if (queued)
+			seq_add(&scp->numdat_rcv, 1);
+
+		if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) {
+			scp->flowloc_sw = DN_DONTSEND;
+			dn_nsp_send_link(sk, DN_DONTSEND, 0);
+		}
+	}
+
+	dn_nsp_send_data_ack(sk);
+
+	if (!queued)
+		kfree_skb(skb);
+}
+
+/*
+ * Deal with one of our own conninit messages coming back to us.
+ * A socket still in the DN_CI state is moved to the NO_COMMUNICATION
+ * state (DN_NC / TCP_CLOSE) and, unless the socket is dead, its
+ * state change callback is run. The skb is always freed.
+ */
+static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->state != DN_CI)
+		goto drop;
+
+	scp->state = DN_NC;
+	sk->sk_state = TCP_CLOSE;
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_state_change(sk);
+drop:
+	kfree_skb(skb);
+}
+
+/*
+ * Handle a packet for which no socket was found. For a control
+ * message that is a connect init, retransmitted connect init or
+ * connect confirm, and provided @reason is not NSP_REASON_OK, a
+ * disconnect message carrying @reason is returned to the sender.
+ * Returned packets are never answered. The skb is always freed.
+ *
+ * Returns NET_RX_SUCCESS when a disconnect reply was attempted and
+ * NET_RX_DROP otherwise.
+ */
+static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	int ret = NET_RX_DROP;
+
+	/* Must not reply to returned packets */
+	if (cb->rt_flags & DN_RT_F_RTS)
+		goto out;
+
+	if (reason == NSP_REASON_OK)
+		goto out;
+
+	/* Only control messages can be answered */
+	if ((cb->nsp_flags & 0x0c) != 0x08)
+		goto out;
+
+	switch (cb->nsp_flags & 0x70) {
+	case 0x10:
+	case 0x60: /* (Retransmitted) Connect Init */
+		dn_nsp_return_disc(skb, NSP_DISCINIT, reason);
+		ret = NET_RX_SUCCESS;
+		break;
+	case 0x20: /* Connect Confirm */
+		dn_nsp_return_disc(skb, NSP_DISCCONF, reason);
+		ret = NET_RX_SUCCESS;
+		break;
+	}
+
+out:
+	kfree_skb(skb);
+	return ret;
+}
+
+/*
+ * Receive one NSP packet (run from dn_nsp_rx() through NF_HOOK()).
+ *
+ * The message flags and port numbers are parsed into the skb's control
+ * block, the destination socket is looked up and the skb is handed to
+ * it with sk_receive_skb(). Packets with no matching socket go to
+ * dn_nsp_no_socket(); malformed or uninteresting ones are freed here
+ * and NET_RX_DROP is returned.
+ */
+static int dn_nsp_rx_packet(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct sock *sk = NULL;
+	unsigned char *ptr;
+	unsigned short reason = NSP_REASON_NL;
+
+	if (!pskb_may_pull(skb, 2))
+		goto free_out;
+
+	/*
+	 * pskb_may_pull() may reallocate the skb header, so ptr is only
+	 * derived from skb->data after each call, never carried across one.
+	 */
+	ptr = skb->data;
+
+	skb_reset_transport_header(skb);
+	cb->nsp_flags = *ptr;
+
+	if (decnet_debug_level & 2)
+		printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags);
+
+	if (cb->nsp_flags & 0x83)
+		goto free_out;
+
+	/*
+	 * Filter out conninits and useless packet types
+	 */
+	if ((cb->nsp_flags & 0x0c) == 0x08) {
+		switch (cb->nsp_flags & 0x70) {
+		case 0x00: /* NOP */
+		case 0x70: /* Reserved */
+		case 0x50: /* Reserved, Phase II node init */
+			goto free_out;
+		case 0x10:
+		case 0x60:
+			/* (Retransmitted) conninit: look for a listener */
+			if (unlikely(cb->rt_flags & DN_RT_F_RTS))
+				goto free_out;
+			sk = dn_find_listener(skb, &reason);
+			goto got_it;
+		}
+	}
+
+	if (!pskb_may_pull(skb, 3))
+		goto free_out;
+
+	/*
+	 * Grab the destination address.
+	 */
+	ptr = skb->data + 1;
+	cb->dst_port = *(__le16 *)ptr;
+	cb->src_port = 0;
+
+	/*
+	 * A connack is only three bytes long and has no source address.
+	 * Anything with at least five bytes has one, so grab it too and
+	 * strip the common header.
+	 */
+	if (pskb_may_pull(skb, 5)) {
+		ptr = skb->data + 3;
+		cb->src_port = *(__le16 *)ptr;
+		skb_pull(skb, 5);
+	}
+
+	/*
+	 * Returned packets...
+	 * Swap src & dst and look up in the normal way.
+	 */
+	if (unlikely(cb->rt_flags & DN_RT_F_RTS)) {
+		__le16 tmp = cb->dst_port;
+		cb->dst_port = cb->src_port;
+		cb->src_port = tmp;
+		tmp = cb->dst;
+		cb->dst = cb->src;
+		cb->src = tmp;
+	}
+
+	/*
+	 * Find the socket to which this skb is destined.
+	 */
+	sk = dn_find_by_skb(skb);
+got_it:
+	if (sk != NULL) {
+		struct dn_scp *scp = DN_SK(sk);
+
+		/* Reset backoff */
+		scp->nsp_rxtshift = 0;
+
+		/*
+		 * We linearize everything except data segments here.
+		 */
+		if (cb->nsp_flags & ~0x60) {
+			if (unlikely(skb_linearize(skb)))
+				goto free_out;
+		}
+
+		return sk_receive_skb(sk, skb, 0);
+	}
+
+	return dn_nsp_no_socket(skb, reason);
+
+free_out:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * NSP receive entry point: run the skb through the NF_DN_LOCAL_IN
+ * netfilter hook, passing dn_nsp_rx_packet() as the function that
+ * continues processing.
+ */
+int dn_nsp_rx(struct sk_buff *skb)
+{
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN,
+		       skb, skb->dev, NULL, dn_nsp_rx_packet);
+}
+
+/*
+ * This is the main receive routine for sockets. It is called
+ * from the above when the socket is not busy, and also from
+ * sock_release() when there is a backlog queued up.
+ *
+ * The skb is dispatched on its NSP message flags: returned packets,
+ * control messages, connacks, and finally data/ack messages. Every
+ * path either passes the skb to a handler or frees it here, and
+ * NET_RX_SUCCESS is always returned.
+ */
+int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct dn_scp *scp = DN_SK(sk);
+	int other;
+
+	/*
+	 * Returned packet: only a returned (retransmitted) conninit
+	 * is acted upon, anything else is dropped.
+	 */
+	if (cb->rt_flags & DN_RT_F_RTS) {
+		switch (cb->nsp_flags) {
+		case 0x18:
+		case 0x68:
+			dn_returned_conn_init(sk, skb);
+			break;
+		default:
+			kfree_skb(skb);
+		}
+		return NET_RX_SUCCESS;
+	}
+
+	/*
+	 * Control packet.
+	 */
+	if ((cb->nsp_flags & 0x0c) == 0x08) {
+		switch (cb->nsp_flags & 0x70) {
+		case 0x10:
+		case 0x60:
+			dn_nsp_conn_init(sk, skb);
+			break;
+		case 0x20:
+			dn_nsp_conn_conf(sk, skb);
+			break;
+		case 0x30:
+			dn_nsp_disc_init(sk, skb);
+			break;
+		case 0x40:
+			dn_nsp_disc_conf(sk, skb);
+			break;
+		}
+		return NET_RX_SUCCESS;
+	}
+
+	/*
+	 * Special for connacks, 'cos they don't have
+	 * ack data or ack otherdata info.
+	 */
+	if (cb->nsp_flags == 0x24) {
+		dn_nsp_conn_ack(sk, skb);
+		return NET_RX_SUCCESS;
+	}
+
+	/* both data and ack frames can kick a CC socket into RUN */
+	if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) {
+		scp->state = DN_RUN;
+		sk->sk_state = TCP_ESTABLISHED;
+		sk->sk_state_change(sk);
+	}
+
+	/*
+	 * Flags with none of the 0x1c bits set, and the value 0x04,
+	 * are not treated as "other" for ack processing.
+	 */
+	other = ((cb->nsp_flags & 0x1c) != 0) && (cb->nsp_flags != 0x04);
+
+	/*
+	 * Read out ack data here, this applies equally
+	 * to data, other data, link service and both
+	 * ack data and ack otherdata.
+	 */
+	dn_process_ack(sk, skb, other);
+
+	/*
+	 * If we've some sort of data here then call a
+	 * suitable routine for dealing with it, otherwise
+	 * the packet is an ack and can be discarded. Data
+	 * is also discarded unless the socket is in RUN.
+	 */
+	if (((cb->nsp_flags & 0x0c) != 0) || (scp->state != DN_RUN)) {
+		kfree_skb(skb);
+		return NET_RX_SUCCESS;
+	}
+
+	switch (cb->nsp_flags) {
+	case 0x10: /* LS */
+		dn_nsp_linkservice(sk, skb);
+		break;
+	case 0x30: /* OD */
+		dn_nsp_otherdata(sk, skb);
+		break;
+	default:
+		dn_nsp_data(sk, skb);
+	}
+
+	return NET_RX_SUCCESS;
+}
+
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
new file mode 100644
index 00000000..bd78836a
--- /dev/null
+++ b/net/decnet/dn_nsp_out.c
@@ -0,0 +1,720 @@
+
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Network Services Protocol (Output)
+ *
+ * Author:      Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ *
+ *    Steve Whitehouse:  Split into dn_nsp_in.c and dn_nsp_out.c from
+ *                       original dn_nsp.c.
+ *    Steve Whitehouse:  Updated to work with my new routing architecture.
+ *    Steve Whitehouse:  Added changes from Eduardo Serrat's patches.
+ *    Steve Whitehouse:  Now conninits have the "return" bit set.
+ *    Steve Whitehouse:  Fixes to check alloc'd skbs are non NULL!
+ *                       Moved output state machine into one function
+ *    Steve Whitehouse:  New output state machine
+ *         Paul Koning:  Connect Confirm message fix.
+ *      Eduardo Serrat:  Fix to stop dn_nsp_do_disc() sending malformed packets.
+ *    Steve Whitehouse:  dn_nsp_output() and friends needed a spring clean
+ *    Steve Whitehouse:  Moved dn_nsp_send() in here from route.h
+ */
+
+/******************************************************************************
+    (c) 1995-1998 E.M. Serrat		emserrat@geocities.com
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+*******************************************************************************/
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/termios.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/if_packet.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/dn.h>
+#include <net/dn_nsp.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+
+/*
+ * Multipliers applied to the base persist timeout by dn_nsp_persist(),
+ * indexed by the socket's nsp_rxtshift value.
+ */
+static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = {
+	1, 2, 4, 8, 16, 32, 64,
+	64, 64, 64, 64, 64, 64
+};
+
+/*
+ * Transmit an NSP packet on behalf of the socket in skb->sk.
+ *
+ * The socket's cached route is used when sk_dst_check() still returns
+ * one; otherwise a route is looked up with dn_route_output_sock() from
+ * the socket's local and peer addresses. With a route the skb is
+ * passed to dst_output(). Without one the socket error is set to
+ * EHOSTUNREACH, the state change callback is run (unless the socket
+ * is dead) and the skb is freed.
+ *
+ * In every case ownership of the skb passes to this function; callers
+ * must not touch it afterwards.
+ */
+static void dn_nsp_send(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct dn_scp *scp = DN_SK(sk);
+	struct dst_entry *dst;
+	struct flowidn fld;
+
+	skb_reset_transport_header(skb);
+	scp->stamp = jiffies;
+
+	dst = sk_dst_check(sk, 0);
+	if (dst) {
+try_again:
+		skb_dst_set(skb, dst);
+		dst_output(skb);
+		return;
+	}
+
+	/* No usable cached route: build a flow key and look one up */
+	memset(&fld, 0, sizeof(fld));
+	fld.flowidn_oif = sk->sk_bound_dev_if;
+	fld.saddr = dn_saddr2dn(&scp->addr);
+	fld.daddr = dn_saddr2dn(&scp->peer);
+	dn_sk_ports_copy(&fld, scp);
+	fld.flowidn_proto = DNPROTO_NSP;
+	if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) {
+		dst = sk_dst_get(sk);
+		sk->sk_route_caps = dst->dev->features;
+		goto try_again;
+	}
+
+	sk->sk_err = EHOSTUNREACH;
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_state_change(sk);
+
+	/*
+	 * Nothing else holds a reference to this skb (callers hand it
+	 * over and never free it), so it must be released here or it
+	 * is leaked every time the destination is unreachable.
+	 */
+	kfree_skb(skb);
+}
+
+
+/*
+ * Allocate an skb with room for @size bytes of payload plus 64 bytes
+ * of reserved headroom.
+ *
+ * If sk == NULL, then we assume that we are supposed to be making
+ * a routing layer skb. If sk != NULL, then we are supposed to be
+ * creating an skb for the NSP layer, and the skb is charged to the
+ * socket's write allocation.
+ *
+ * The eventual aim is for each socket to have a cached header size
+ * for its outgoing packets, and to set hdr from this when sk != NULL.
+ *
+ * Returns the new skb, or NULL if the allocation failed.
+ */
+struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri)
+{
+	const int hdr = 64;
+	struct sk_buff *skb = alloc_skb(size + hdr, pri);
+
+	if (skb == NULL)
+		return NULL;
+
+	skb->protocol = htons(ETH_P_DNA_RT);
+	skb->pkt_type = PACKET_OUTGOING;
+
+	if (sk != NULL)
+		skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, hdr);
+
+	return skb;
+}
+
+/*
+ * Calculate the persist timer from the smoothed round trip time and
+ * its variance, scaled by the nsp_backoff[] entry for the current
+ * retransmit shift. The result is kept between HZ and 600*HZ jiffies.
+ *
+ * As a side effect nsp_rxtshift is advanced (up to NSP_MAXRXTSHIFT),
+ * so each successive call backs off further.
+ */
+unsigned long dn_nsp_persist(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned long timeout;
+
+	timeout = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
+	timeout *= nsp_backoff[scp->nsp_rxtshift];
+
+	if (timeout < HZ)
+		timeout = HZ;
+	else if (timeout > (600*HZ))
+		timeout = (600*HZ);
+
+	if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT)
+		scp->nsp_rxtshift++;
+
+	/* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */
+
+	return timeout;
+}
+
+/*
+ * This is called each time we get an estimate for the rtt
+ * on the link. It folds the sample into the smoothed rtt
+ * (nsp_srtt, which is kept scaled up by eight) and into the
+ * smoothed variance (nsp_rttvar). Neither is allowed to drop
+ * below one.
+ */
+static void dn_nsp_rtt(struct sock *sk, long rtt)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	long srtt = (long)scp->nsp_srtt;
+	long rttvar = (long)scp->nsp_rttvar;
+	long delta;
+	long dev;
+
+	/*
+	 * If the jiffies clock flips over in the middle of timestamp
+	 * gathering this value might turn out negative, so we make sure
+	 * that it is always positive here.
+	 */
+	if (rtt < 0)
+		rtt = -rtt;
+
+	/*
+	 * Add new rtt to smoothed average
+	 */
+	delta = ((rtt << 3) - srtt);
+	srtt += (delta >> 3);
+	scp->nsp_srtt = (srtt >= 1) ? (unsigned long)srtt : 1;
+
+	/*
+	 * Add new rtt variance to smoothed variance
+	 */
+	dev = delta >> 1;
+	if (dev <= 0)
+		dev = -dev;
+	rttvar += ((dev - rttvar) >> 2);
+	scp->nsp_rttvar = (rttvar >= 1) ? (unsigned long)rttvar : 1;
+
+	/* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */
+}
+
+/**
+ * dn_nsp_clone_and_send - Send a data packet by cloning it
+ * @skb: The queued packet to clone and transmit
+ * @gfp: memory allocation flag used for the clone
+ *
+ * The original stays on its transmit queue; only the clone is handed
+ * to dn_nsp_send(). The original's transmit count and timestamp are
+ * updated when a clone could be made.
+ *
+ * Returns: The number of times the packet has been sent previously,
+ * or 0 if the clone could not be allocated.
+ */
+static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb,
+					     gfp_t gfp)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct sk_buff *clone = skb_clone(skb, gfp);
+	int sent_before;
+
+	if (clone == NULL)
+		return 0;
+
+	sent_before = cb->xmit_count++;
+	cb->stamp = jiffies;
+	clone->sk = skb->sk;
+	dn_nsp_send(clone);
+
+	return sent_before;
+}
+
+/**
+ * dn_nsp_output - Try and send something from socket queues
+ * @sk: The socket whose queues are to be investigated
+ *
+ * Send the packet at the head of the other data queue and, if that was
+ * not a retransmission and the remote end allows sending, the packet
+ * at the head of the data queue. Other data gets priority over data,
+ * and if we retransmit a packet we reduce the window by dividing it
+ * in two.
+ *
+ */
+void dn_nsp_output(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct sk_buff *skb;
+	unsigned resent = 0;
+
+	/*
+	 * First we check for otherdata/linkservice messages
+	 */
+	skb = skb_peek(&scp->other_xmit_queue);
+	if (skb != NULL)
+		resent = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
+
+	/*
+	 * If we may not send any data, we don't.
+	 * If we are still trying to get some other data down the
+	 * channel, we don't try and send any data.
+	 */
+	if (!resent && (scp->flowrem_sw == DN_SEND)) {
+		skb = skb_peek(&scp->data_xmit_queue);
+		if (skb != NULL)
+			resent = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
+	}
+
+	if (!resent)
+		return;
+
+	/*
+	 * If we've sent any frame more than once, we cut the
+	 * send window size in half. There is always a minimum
+	 * window size of one available.
+	 */
+	scp->snd_window >>= 1;
+	if (scp->snd_window < NSP_MIN_WINDOW)
+		scp->snd_window = NSP_MIN_WINDOW;
+}
+
+/*
+ * Persist timer callback for the transmit queues: try to send again
+ * and, while anything is still waiting on either transmit queue,
+ * compute a new persist timeout. Always returns 0.
+ */
+int dn_nsp_xmit_timeout(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	int idle;
+
+	dn_nsp_output(sk);
+
+	idle = skb_queue_empty(&scp->data_xmit_queue) &&
+	       skb_queue_empty(&scp->other_xmit_queue);
+	if (!idle)
+		scp->persist = dn_nsp_persist(sk);
+
+	return 0;
+}
+
+/*
+ * Push @len bytes of header onto the skb and fill in the first five:
+ * the message flags, the remote address and the local address.
+ * @len must be at least 5. Returns a pointer to the byte following
+ * the local address.
+ */
+static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len)
+{
+	unsigned char *hdr = skb_push(skb, len);
+
+	BUG_ON(len < 5);
+
+	hdr[0] = msgflag;
+	*((__le16 *)&hdr[1]) = scp->addrrem;
+	*((__le16 *)&hdr[3]) = scp->addrloc;
+
+	return (__le16 __force *)&hdr[5];
+}
+
+/*
+ * Build the common header followed by the two acknowledgement fields.
+ * The data and other-data receive sequence numbers (low 12 bits) are
+ * recorded in ackxmt_dat/ackxmt_oth as having been sent. For a data
+ * message the first field acks data and the second acks other data;
+ * when @other is set the two are swapped. The second field always has
+ * the "cross subchannel" bit set. @hlen must be at least 9.
+ *
+ * Returns a pointer just past the two acknowledgement fields.
+ */
+static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	unsigned short dat = scp->numdat_rcv & 0x0FFF;
+	unsigned short oth = scp->numoth_rcv & 0x0FFF;
+	unsigned short acknum;
+	unsigned short ackcrs;
+	__le16 *ptr;
+
+	BUG_ON(hlen < 9);
+
+	scp->ackxmt_dat = dat;
+	scp->ackxmt_oth = oth;
+
+	/* An "other data/ack" message acks other data first */
+	if (other) {
+		acknum = oth;
+		ackcrs = dat;
+	} else {
+		acknum = dat;
+		ackcrs = oth;
+	}
+
+	acknum |= 0x8000;
+	/* 0x2000 is the "cross subchannel" bit */
+	ackcrs |= 0x8000 | 0x2000;
+
+	ptr = (__le16 *)dn_mk_common_header(scp, skb, msgflag, hlen);
+
+	ptr[0] = cpu_to_le16(acknum);
+	ptr[1] = cpu_to_le16(ackcrs);
+
+	return ptr + 2;
+}
+
+/*
+ * Build the 11 byte header of a data or other-data message: the ack
+ * header followed by the segment number. The segment number is taken
+ * from the matching transmit sequence counter, which is then advanced,
+ * and is also recorded in the skb's control block.
+ *
+ * Returns a pointer just past the segment number.
+ */
+static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	__le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth);
+
+	if (likely(!oth)) {
+		cb->segnum = scp->numdat;
+		seq_add(&scp->numdat, 1);
+	} else {
+		cb->segnum = scp->numoth;
+		seq_add(&scp->numoth, 1);
+	}
+
+	*ptr = cpu_to_le16(cb->segnum);
+
+	return ptr + 1;
+}
+
+/*
+ * Prepend the NSP data header to @skb, append it to the data transmit
+ * queue (or the other data queue when @oth is set) and, if the remote
+ * end currently allows sending, transmit a clone of it straight away.
+ * @gfp is the allocation flag used for the clone.
+ */
+void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb,
+			gfp_t gfp, int oth)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
+	struct sk_buff_head *queue;
+
+	cb->xmit_count = 0;
+	dn_nsp_mk_data_header(sk, skb, oth);
+
+	/*
+	 * Slow start: If we have been idle for more than
+	 * one RTT, then reset window to min size.
+	 */
+	if ((jiffies - scp->stamp) > t)
+		scp->snd_window = NSP_MIN_WINDOW;
+
+	queue = oth ? &scp->other_xmit_queue : &scp->data_xmit_queue;
+	skb_queue_tail(queue, skb);
+
+	if (scp->flowrem_sw == DN_SEND)
+		dn_nsp_clone_and_send(skb, gfp);
+}
+
+
+/*
+ * Process an acknowledgement against a transmit queue.
+ *
+ * @sk:     the socket owning the queue
+ * @skb:    the received packet carrying the ack; the stamp in its
+ *          control block is the reference time for the rtt estimate
+ * @q:      the transmit queue to check
+ * @acknum: the acknowledged segment number
+ *
+ * Every queued packet whose segment number is before or equal to
+ * @acknum is unlinked and freed. Packets which had been sent exactly
+ * once open the send window by one and, when they are the segment
+ * actually named by the ack, feed the rtt estimator. If the last
+ * packet removed had been sent more than once, another transmission
+ * is attempted with dn_nsp_output().
+ *
+ * Returns 1 if anything was removed (so the sender should be woken),
+ * otherwise 0.
+ */
+int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct dn_scp *scp = DN_SK(sk);
+	struct sk_buff *pkt, *next;
+	unsigned long reftime = cb->stamp;
+	int try_retrans = 0;
+	int wakeup = 0;
+
+	skb_queue_walk_safe(q, pkt, next) {
+		struct dn_skb_cb *pcb = DN_SKB_CB(pkt);
+		unsigned long pkttime;
+		unsigned short xmit_count;
+		unsigned short segnum;
+
+		/* Not covered by this ack: leave it on the queue */
+		if (!dn_before_or_equal(pcb->segnum, acknum))
+			continue;
+
+		/* Remember to wake up the sending process */
+		wakeup = 1;
+
+		/* Keep various statistics before the packet goes away */
+		pkttime = pcb->stamp;
+		xmit_count = pcb->xmit_count;
+		segnum = pcb->segnum;
+
+		/* Remove and drop ack'ed packet */
+		skb_unlink(pkt, q);
+		kfree_skb(pkt);
+
+		/*
+		 * We don't expect to see acknowledgements for packets we
+		 * haven't sent yet.
+		 */
+		WARN_ON(xmit_count == 0);
+
+		/*
+		 * If the packet has only been sent once, we can use it
+		 * to calculate the RTT and also open the window a little
+		 * further.
+		 */
+		if (xmit_count == 1) {
+			if (dn_equal(segnum, acknum))
+				dn_nsp_rtt(sk, (long)(pkttime - reftime));
+
+			if (scp->snd_window < scp->max_window)
+				scp->snd_window++;
+		}
+
+		/*
+		 * Packet has been sent more than once. If this is the last
+		 * packet to be acknowledged then we want to send the next
+		 * packet in the send queue again (assumes the remote host does
+		 * go-back-N error control). Only the _last_ packet acked
+		 * decides this, so the flag is rewritten on every pass.
+		 */
+		try_retrans = (xmit_count > 1);
+	}
+
+	if (try_retrans)
+		dn_nsp_output(sk);
+
+	return wakeup;
+}
+
+/*
+ * Send a data acknowledgement (message flags 0x04) consisting of just
+ * the 9 byte ack header. Nothing is sent if the skb cannot be
+ * allocated.
+ */
+void dn_nsp_send_data_ack(struct sock *sk)
+{
+	struct sk_buff *skb = dn_alloc_skb(sk, 9, GFP_ATOMIC);
+
+	if (skb == NULL)
+		return;
+
+	/* Make room so that the header can be pushed on in front */
+	skb_reserve(skb, 9);
+	dn_mk_ack_header(sk, skb, 0x04, 9, 0);
+	dn_nsp_send(skb);
+}
+
+/*
+ * Send an other-data acknowledgement (message flags 0x14) consisting
+ * of just the 9 byte ack header. Nothing is sent if the skb cannot be
+ * allocated.
+ */
+void dn_nsp_send_oth_ack(struct sock *sk)
+{
+	struct sk_buff *skb = dn_alloc_skb(sk, 9, GFP_ATOMIC);
+
+	if (skb == NULL)
+		return;
+
+	/* Make room so that the header can be pushed on in front */
+	skb_reserve(skb, 9);
+	dn_mk_ack_header(sk, skb, 0x14, 9, 1);
+	dn_nsp_send(skb);
+}
+
+
+/*
+ * Send a connect acknowledgement: a three byte message made up of the
+ * message flags (0x24) and the remote address. Nothing is sent if the
+ * skb cannot be allocated.
+ */
+void dn_send_conn_ack (struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct nsp_conn_ack_msg *msg;
+	struct sk_buff *skb;
+
+	skb = dn_alloc_skb(sk, 3, sk->sk_allocation);
+	if (skb == NULL)
+		return;
+
+	msg = (struct nsp_conn_ack_msg *)skb_put(skb, 3);
+	msg->msgflg = 0x24;
+	msg->dstaddr = scp->addrrem;
+
+	dn_nsp_send(skb);
+}
+
+/*
+ * Send whichever acknowledgements are still outstanding, i.e. where
+ * the last ack number transmitted differs from the receive sequence
+ * number. Other data is checked first. Note that building either ack
+ * header records both ack numbers as transmitted.
+ */
+void dn_nsp_delayed_ack(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->numoth_rcv != scp->ackxmt_oth)
+		dn_nsp_send_oth_ack(sk);
+
+	if (scp->numdat_rcv != scp->ackxmt_dat)
+		dn_nsp_send_data_ack(sk);
+}
+
+/*
+ * Persist callback installed by dn_send_conn_conf(): send the connect
+ * confirm again for as long as the socket is still in the DN_CC state.
+ * Always returns 0.
+ */
+static int dn_nsp_retrans_conn_conf(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->state != DN_CC)
+		return 0;
+
+	dn_send_conn_conf(sk, GFP_ATOMIC);
+
+	return 0;
+}
+
+/*
+ * Send a connect confirm message (message flags 0x28): the connect
+ * header filled in from the socket, followed by a length byte and the
+ * outgoing connect data. On success the persist timeout is recomputed
+ * and dn_nsp_retrans_conn_conf() is installed as the persist callback.
+ * Nothing happens if the skb cannot be allocated with @gfp.
+ */
+void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct nsp_conn_init_msg *msg;
+	struct sk_buff *skb;
+	__u8 len = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
+
+	skb = dn_alloc_skb(sk, 50 + len, gfp);
+	if (skb == NULL)
+		return;
+
+	msg = (struct nsp_conn_init_msg *)skb_put(skb, sizeof(*msg));
+	msg->msgflg = 0x28;
+	msg->dstaddr = scp->addrrem;
+	msg->srcaddr = scp->addrloc;
+	msg->services = scp->services_loc;
+	msg->info = scp->info_loc;
+	msg->segsize = cpu_to_le16(scp->segsize_loc);
+
+	/* Optional connect data, preceded by its length */
+	*skb_put(skb, 1) = len;
+	if (len > 0)
+		memcpy(skb_put(skb, len), scp->conndata_out.opt_data, len);
+
+	dn_nsp_send(skb);
+
+	scp->persist = dn_nsp_persist(sk);
+	scp->persist_fxn = dn_nsp_retrans_conn_conf;
+}
+
+
+/*
+ * Build and send a disconnect message.
+ *
+ * The message is the flags byte, the remote and local addresses and
+ * the reason code (7 bytes), followed by @ddl bytes of disconnect data
+ * taken from @dd. For NSP_DISCINIT a length byte holding @ddl is
+ * inserted in front of the data. @sk may be NULL. Nothing is sent,
+ * and a rate limited debug message is logged, if @dst is NULL or
+ * @rem is zero.
+ */
+static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
+			unsigned short reason, gfp_t gfp,
+			struct dst_entry *dst,
+			int ddl, unsigned char *dd, __le16 rem, __le16 loc)
+{
+	struct sk_buff *skb;
+	unsigned char *msg;
+	int size = 7 + ddl;
+
+	if (msgflg == NSP_DISCINIT)
+		size++;
+
+	if ((dst == NULL) || (rem == 0)) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", le16_to_cpu(rem), dst);
+		return;
+	}
+
+	skb = dn_alloc_skb(sk, size, gfp);
+	if (skb == NULL)
+		return;
+
+	msg = skb_put(skb, size);
+	msg[0] = msgflg;
+	*(__le16 *)(msg + 1) = rem;
+	*(__le16 *)(msg + 3) = loc;
+	*(__le16 *)(msg + 5) = cpu_to_le16(reason);
+	msg += 7;
+
+	if (msgflg == NSP_DISCINIT)
+		*msg++ = ddl;
+
+	if (ddl)
+		memcpy(msg, dd, ddl);
+
+	/*
+	 * This doesn't go via the dn_nsp_send() function since we need
+	 * to be able to send disc packets out which have no socket
+	 * associations.
+	 */
+	skb_dst_set(skb, dst_clone(dst));
+	dst_output(skb);
+}
+
+
+/*
+ * Send a disconnect message for a socket over its cached route.
+ * Disconnect data from the socket is only included for NSP_DISCINIT.
+ * A @reason of zero means "use the status stored in the socket's
+ * outgoing disconnect data".
+ */
+void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg,
+			unsigned short reason, gfp_t gfp)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	int ddl;
+
+	if (msgflg == NSP_DISCINIT)
+		ddl = le16_to_cpu(scp->discdata_out.opt_optl);
+	else
+		ddl = 0;
+
+	if (reason == 0)
+		reason = le16_to_cpu(scp->discdata_out.opt_status);
+
+	dn_nsp_do_disc(sk, msgflg, reason, gfp, sk->sk_dst_cache, ddl,
+		scp->discdata_out.opt_data, scp->addrrem, scp->addrloc);
+}
+
+
+/*
+ * Answer a received packet with a disconnect message when there is no
+ * socket to send it from. The reply uses the received skb's route,
+ * carries no disconnect data, and takes its remote and local addresses
+ * from the source and destination ports of the received packet.
+ */
+void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg,
+			unsigned short reason)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+	dn_nsp_do_disc(NULL, msgflg, reason, GFP_ATOMIC, skb_dst(skb), 0,
+			NULL, cb->src_port, cb->dst_port);
+}
+
+
+/*
+ * Queue and send a link service message (message flags 0x10) whose
+ * two byte body is @lsflags followed by @fcval. It goes out on the
+ * other data subchannel, and the persist timer is set up to call
+ * dn_nsp_xmit_timeout(). Nothing happens if the skb cannot be
+ * allocated.
+ */
+void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct sk_buff *skb;
+	unsigned char *body;
+
+	skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	/* Leave room in front for the header added by dn_nsp_queue_xmit() */
+	skb_reserve(skb, DN_MAX_NSP_DATA_HEADER);
+	DN_SKB_CB(skb)->nsp_flags = 0x10;
+
+	body = skb_put(skb, 2);
+	body[0] = lsflags;
+	body[1] = fcval;
+
+	dn_nsp_queue_xmit(sk, skb, GFP_ATOMIC, 1);
+
+	scp->persist = dn_nsp_persist(sk);
+	scp->persist_fxn = dn_nsp_xmit_timeout;
+}
+
+/*
+ * Persist callback installed by dn_nsp_send_conninit(): send a
+ * retransmitted connect init (NSP_RCI) for as long as the socket is
+ * still in the DN_CI state. Always returns 0.
+ */
+static int dn_nsp_retrans_conninit(struct sock *sk)
+{
+	struct dn_scp *scp = DN_SK(sk);
+
+	if (scp->state != DN_CI)
+		return 0;
+
+	dn_nsp_send_conninit(sk, NSP_RCI);
+
+	return 0;
+}
+
+/*
+ * Append a length byte holding @len to the skb, followed by @len
+ * bytes copied from @data.
+ */
+static void dn_nsp_put_counted(struct sk_buff *skb, const void *data,
+			       unsigned char len)
+{
+	*skb_put(skb, 1) = len;
+	if (len > 0)
+		memcpy(skb_put(skb, len), data, len);
+}
+
+/*
+ * Build and send a connect init message with the given message flags
+ * (NSP_CI for the first attempt; anything else is allocated with
+ * GFP_ATOMIC, as used for the NSP_RCI retransmission).
+ *
+ * After the fixed connect header come the destination and source
+ * names, the menu version byte, and four length prefixed fields: the
+ * access control user, password and account, then the optional
+ * connect data. The persist timer is set up to retransmit through
+ * dn_nsp_retrans_conninit(), and the skb is flagged DN_RT_F_RQR
+ * before being sent.
+ */
+void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
+{
+	struct dn_scp *scp = DN_SK(sk);
+	struct nsp_conn_init_msg *msg;
+	struct dn_skb_cb *cb;
+	struct sk_buff *skb;
+	unsigned char menuver;
+	unsigned char type;
+	gfp_t allocation;
+
+	allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
+	skb = dn_alloc_skb(sk, 200, allocation);
+	if (!skb)
+		return;
+
+	cb  = DN_SKB_CB(skb);
+	msg = (struct nsp_conn_init_msg *)skb_put(skb,sizeof(*msg));
+
+	msg->msgflg	= msgflg;
+	msg->dstaddr	= 0x0000;		/* Remote Node will assign it*/
+
+	msg->srcaddr	= scp->addrloc;
+	msg->services	= scp->services_loc;	/* Requested flow control    */
+	msg->info	= scp->info_loc;	/* Version Number            */
+	msg->segsize	= cpu_to_le16(scp->segsize_loc);	/* Max segment size  */
+
+	/* Destination name format depends on whether an object number is set */
+	type = scp->peer.sdn_objnum ? 0 : 1;
+
+	skb_put(skb, dn_sockaddr2username(&scp->peer,
+					  skb_tail_pointer(skb), type));
+	skb_put(skb, dn_sockaddr2username(&scp->addr,
+					  skb_tail_pointer(skb), 2));
+
+	menuver = DN_MENUVER_ACC | DN_MENUVER_USR;
+	if (scp->peer.sdn_flags & SDF_PROXY)
+		menuver |= DN_MENUVER_PRX;
+	if (scp->peer.sdn_flags & SDF_UICPROXY)
+		menuver |= DN_MENUVER_UIC;
+
+	*skb_put(skb, 1) = menuver;	/* Menu Version		*/
+
+	/* Access control: user, password, account */
+	dn_nsp_put_counted(skb, scp->accessdata.acc_user,
+			   scp->accessdata.acc_userl);
+	dn_nsp_put_counted(skb, scp->accessdata.acc_pass,
+			   scp->accessdata.acc_passl);
+	dn_nsp_put_counted(skb, scp->accessdata.acc_acc,
+			   scp->accessdata.acc_accl);
+
+	/* Optional connect data */
+	dn_nsp_put_counted(skb, scp->conndata_out.opt_data,
+			   (__u8)le16_to_cpu(scp->conndata_out.opt_optl));
+
+	scp->persist = dn_nsp_persist(sk);
+	scp->persist_fxn = dn_nsp_retrans_conninit;
+
+	cb->rt_flags = DN_RT_F_RQR;
+
+	dn_nsp_send(skb);
+}
+
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
new file mode 100644
index 00000000..b91b6036
--- /dev/null
+++ b/net/decnet/dn_route.c
@@ -0,0 +1,1861 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Routing Functions (Endnode and Router)
+ *
+ * Authors:     Steve Whitehouse <SteveW@ACM.org>
+ *              Eduardo Marcelo Serrat <emserrat@geocities.com>
+ *
+ * Changes:
+ *              Steve Whitehouse : Fixes to allow "intra-ethernet" and
+ *                                 "return-to-sender" bits on outgoing
+ *                                 packets.
+ *		Steve Whitehouse : Timeouts for cached routes.
+ *              Steve Whitehouse : Use dst cache for input routes too.
+ *              Steve Whitehouse : Fixed error values in dn_send_skb.
+ *              Steve Whitehouse : Rework routing functions to better fit
+ *                                 DECnet routing design
+ *              Alexey Kuznetsov : New SMP locking
+ *              Steve Whitehouse : More SMP locking changes & dn_cache_dump()
+ *              Steve Whitehouse : Prerouting NF hook, now really is prerouting.
+ *				   Fixed possible skb leak in rtnetlink funcs.
+ *              Steve Whitehouse : Dave Miller's dynamic hash table sizing and
+ *                                 Alexey Kuznetsov's finer grained locking
+ *                                 from ipv4/route.c.
+ *              Steve Whitehouse : Routing is now starting to look like a
+ *                                 sensible set of code now, mainly due to
+ *                                 my copying the IPv4 routing code. The
+ *                                 hooks here are modified and will continue
+ *                                 to evolve for a while.
+ *              Steve Whitehouse : Real SMP at last :-) Also new netfilter
+ *                                 stuff. Look out raw sockets your days
+ *                                 are numbered!
+ *              Steve Whitehouse : Added return-to-sender functions. Added
+ *                                 backlog congestion level return codes.
+ *		Steve Whitehouse : Fixed bug where routes were set up with
+ *                                 no ref count on net devices.
+ *              Steve Whitehouse : RCU for the route cache
+ *              Steve Whitehouse : Preparations for the flow cache
+ *              Steve Whitehouse : Prepare for nonlinear skbs
+ */
+
+/******************************************************************************
+    (c) 1995-1998 E.M. Serrat		emserrat@geocities.com
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+*******************************************************************************/
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/inet.h>
+#include <linux/route.h>
+#include <linux/in_route.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/string.h>
+#include <linux/netfilter_decnet.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <asm/errno.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_nsp.h>
+#include <net/dn_route.h>
+#include <net/dn_neigh.h>
+#include <net/dn_fib.h>
+
+/*
+ * One bucket of the route cache hash table: the head of an RCU-protected
+ * chain of cached routes, plus the spinlock held by code in this file
+ * whenever it modifies that chain.
+ */
+struct dn_rt_hash_bucket
+{
+	struct dn_route __rcu *chain;
+	spinlock_t lock;
+};
+
+extern struct neigh_table dn_neigh_table;
+
+
+/*
+ * Ethernet address prefix of DECnet nodes. Only the first four bytes are
+ * compared against the addresses found in long format packets.
+ */
+static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00};
+
+/* Flush delay used by dn_rt_cache_flush() when a negative delay is passed */
+static const int dn_rt_min_delay = 2 * HZ;
+/* Added to "now" to form dn_rt_deadline when no deadline is pending */
+static const int dn_rt_max_delay = 10 * HZ;
+/* Expiry handed to dst_set_expires() after a path mtu update */
+static const int dn_rt_mtu_expires = 10 * 60 * HZ;
+
+/* Deadline (in jiffies) consulted by dn_rt_cache_flush(); 0 means unset */
+static unsigned long dn_rt_deadline;
+
+static int dn_dst_gc(struct dst_ops *ops);
+static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
+static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
+static unsigned int dn_dst_default_mtu(const struct dst_entry *dst);
+static void dn_dst_destroy(struct dst_entry *);
+static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
+static void dn_dst_link_failure(struct sk_buff *);
+static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
+static int dn_route_input(struct sk_buff *);
+static void dn_run_flush(unsigned long dummy);
+
+/* Route cache: dn_rt_hash_mask + 1 buckets, indexed by dn_hash() */
+static struct dn_rt_hash_bucket *dn_rt_hash_table;
+static unsigned dn_rt_hash_mask;
+
+/* Re-armed by dn_dst_check_expire() every decnet_dst_gc_interval seconds */
+static struct timer_list dn_route_timer;
+/* Runs dn_run_flush() when a delayed cache flush comes due */
+static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0);
+int decnet_dst_gc_interval = 2;
+
+/* dst layer callbacks for DECnet routes */
+static struct dst_ops dn_dst_ops = {
+	.family =		PF_DECnet,
+	.protocol =		cpu_to_be16(ETH_P_DNA_RT),
+	.gc_thresh =		128,
+	.gc =			dn_dst_gc,
+	.check =		dn_dst_check,
+	.default_advmss =	dn_dst_default_advmss,
+	.default_mtu =		dn_dst_default_mtu,
+	.cow_metrics =		dst_cow_metrics_generic,
+	.destroy =		dn_dst_destroy,
+	.negative_advice =	dn_dst_negative_advice,
+	.link_failure =		dn_dst_link_failure,
+	.update_pmtu =		dn_dst_update_pmtu,
+};
+
+/*
+ * dst_ops->destroy hook: hands the entry to the generic metrics
+ * destructor; no DECnet specific teardown is done here.
+ */
+static void dn_dst_destroy(struct dst_entry *dst)
+{
+	dst_destroy_metrics_generic(dst);
+}
+
+/*
+ * Fold a source/destination address pair into an index into
+ * dn_rt_hash_table. The result is always within 0..dn_rt_hash_mask.
+ */
+static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
+{
+	__u16 folded = (__u16 __force)(src ^ dst);
+
+	folded ^= folded >> 3;
+	folded ^= folded >> 5;
+	folded ^= folded >> 10;
+
+	return (unsigned)folded & dn_rt_hash_mask;
+}
+
+/*
+ * Hand the route's rcu_head to call_rcu_bh() so that dst_rcu_free() is
+ * run on it later. Does not touch the reference count.
+ */
+static inline void dnrt_free(struct dn_route *rt)
+{
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+/*
+ * Like dnrt_free(), but first drops one reference on the route with
+ * dst_release().
+ */
+static inline void dnrt_drop(struct dn_route *rt)
+{
+	dst_release(&rt->dst);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+/*
+ * Periodic route cache expiry. Walks the hash buckets and unlinks every
+ * route that has no references and has not been used for 120 seconds.
+ * The walk stops early once jiffies has moved on since it started, and
+ * dn_route_timer is then re-armed decnet_dst_gc_interval seconds ahead.
+ *
+ * @dummy: unused
+ */
+static void dn_dst_check_expire(unsigned long dummy)
+{
+	const unsigned long start = jiffies;
+	const unsigned long max_idle = 120 * HZ;
+	int idx;
+
+	for (idx = 0; idx <= dn_rt_hash_mask; idx++) {
+		struct dn_rt_hash_bucket *bucket = &dn_rt_hash_table[idx];
+		struct dn_route __rcu **link = &bucket->chain;
+		struct dn_route *entry;
+
+		spin_lock(&bucket->lock);
+		for (;;) {
+			entry = rcu_dereference_protected(*link,
+						lockdep_is_held(&bucket->lock));
+			if (entry == NULL)
+				break;
+
+			if (!atomic_read(&entry->dst.__refcnt) &&
+			    (start - entry->dst.lastuse) >= max_idle) {
+				/* Idle and unreferenced: unlink and free */
+				*link = entry->dst.dn_next;
+				entry->dst.dn_next = NULL;
+				dnrt_free(entry);
+			} else {
+				link = &entry->dst.dn_next;
+			}
+		}
+		spin_unlock(&bucket->lock);
+
+		/* Give up for this run as soon as a tick has elapsed */
+		if ((jiffies - start) > 0)
+			break;
+	}
+
+	mod_timer(&dn_route_timer, start + decnet_dst_gc_interval * HZ);
+}
+
+/*
+ * dst_ops->gc hook. For every hash bucket, drops at most one route that
+ * has no references and has been idle for at least 10 seconds.
+ *
+ * Always returns 0.
+ */
+static int dn_dst_gc(struct dst_ops *ops)
+{
+	const unsigned long start = jiffies;
+	const unsigned long max_idle = 10 * HZ;
+	int idx;
+
+	for (idx = 0; idx <= dn_rt_hash_mask; idx++) {
+		struct dn_rt_hash_bucket *bucket = &dn_rt_hash_table[idx];
+		struct dn_route __rcu **link;
+		struct dn_route *entry;
+
+		spin_lock_bh(&bucket->lock);
+		link = &bucket->chain;
+
+		for (;;) {
+			entry = rcu_dereference_protected(*link,
+						lockdep_is_held(&bucket->lock));
+			if (entry == NULL)
+				break;
+
+			if (!atomic_read(&entry->dst.__refcnt) &&
+			    (start - entry->dst.lastuse) >= max_idle) {
+				/* One victim per bucket is enough */
+				*link = entry->dst.dn_next;
+				entry->dst.dn_next = NULL;
+				dnrt_drop(entry);
+				break;
+			}
+			link = &entry->dst.dn_next;
+		}
+		spin_unlock_bh(&bucket->lock);
+	}
+
+	return 0;
+}
+
+/*
+ * dst_ops->update_pmtu hook.
+ *
+ * The decnet standards do not impose a particular minimum mtu; what they
+ * do require is that the routing layer accepts a datagram of at least
+ * 230 bytes. The minimum acceptable mtu is therefore 230 minus the routing
+ * header length. When there is no neighbour (or no DECnet device data)
+ * the worst case, a long header, is assumed.
+ *
+ * Both the mtu and the advertised mss (i.e. the segment size advertised to
+ * the other end) are lowered, unless the respective metric is locked.
+ */
+static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	struct dn_dev *dn_db = NULL;
+	u32 hdr_len = 21;
+	u32 min_mtu;
+
+	if (neigh)
+		dn_db = rcu_dereference_raw(neigh->dev->dn_ptr);
+	if (dn_db && dn_db->use_long == 0)
+		hdr_len = 6;
+	min_mtu = 230 - hdr_len;
+
+	/* Only ever shrink the mtu, and never below the minimum */
+	if (dst_metric(dst, RTAX_MTU) <= mtu || mtu < min_mtu)
+		return;
+
+	if (!dst_metric_locked(dst, RTAX_MTU)) {
+		dst_metric_set(dst, RTAX_MTU, mtu);
+		dst_set_expires(dst, dn_rt_mtu_expires);
+	}
+
+	if (!dst_metric_locked(dst, RTAX_ADVMSS)) {
+		u32 new_mss = mtu - DN_MAX_NSP_DATA_HEADER;
+		u32 cur_mss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+		if (!cur_mss || cur_mss > new_mss)
+			dst_metric_set(dst, RTAX_ADVMSS, new_mss);
+	}
+}
+
+/*
+ * dst_ops->check hook, used when a route has been marked obsolete
+ * (e.g. by a routing cache flush). Always returns NULL; both arguments
+ * are ignored.
+ */
+static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie)
+{
+	return NULL;
+}
+
+/*
+ * dst_ops->negative_advice hook: drops the caller's reference on the
+ * entry and returns NULL.
+ */
+static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst)
+{
+	dst_release(dst);
+	return NULL;
+}
+
+/* dst_ops->link_failure hook: intentionally does nothing. */
+static void dn_dst_link_failure(struct sk_buff *skb)
+{
+}
+
+/*
+ * Compare two flow keys. Returns 1 when destination, source, mark, scope,
+ * output interface and input interface all match, 0 otherwise.
+ */
+static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2)
+{
+	return fl1->daddr == fl2->daddr &&
+	       fl1->saddr == fl2->saddr &&
+	       fl1->flowidn_mark == fl2->flowidn_mark &&
+	       fl1->flowidn_scope == fl2->flowidn_scope &&
+	       fl1->flowidn_oif == fl2->flowidn_oif &&
+	       fl1->flowidn_iif == fl2->flowidn_iif;
+}
+
+/*
+ * Insert a route into the cache bucket @hash.
+ *
+ * If an entry with the same flow key is already cached, that entry is
+ * moved to the head of the chain, marked used and returned through @rp,
+ * and @rt is dropped. Otherwise @rt itself goes to the head of the chain
+ * and is returned through @rp.
+ *
+ * Always returns 0.
+ */
+static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp)
+{
+	struct dn_rt_hash_bucket *bucket = &dn_rt_hash_table[hash];
+	struct dn_route __rcu **link = &bucket->chain;
+	struct dn_route *cur;
+	unsigned long now = jiffies;
+
+	spin_lock_bh(&bucket->lock);
+	for (;;) {
+		cur = rcu_dereference_protected(*link,
+						lockdep_is_held(&bucket->lock));
+		if (cur == NULL)
+			break;
+
+		if (compare_keys(&cur->fld, &rt->fld)) {
+			/* Already cached: move the existing entry to the front */
+			*link = cur->dst.dn_next;
+			rcu_assign_pointer(cur->dst.dn_next, bucket->chain);
+			rcu_assign_pointer(bucket->chain, cur);
+
+			dst_use(&cur->dst, now);
+			spin_unlock_bh(&bucket->lock);
+
+			dnrt_drop(rt);
+			*rp = cur;
+			return 0;
+		}
+		link = &cur->dst.dn_next;
+	}
+
+	/* Not cached yet: the new route becomes the chain head */
+	rcu_assign_pointer(rt->dst.dn_next, bucket->chain);
+	rcu_assign_pointer(bucket->chain, rt);
+
+	dst_use(&rt->dst, now);
+	spin_unlock_bh(&bucket->lock);
+	*rp = rt;
+	return 0;
+}
+
+/*
+ * Empty the whole route cache: detach the chain of every bucket and pass
+ * each route on it to dst_free(). Also used as the handler of
+ * dn_rt_flush_timer.
+ *
+ * @dummy: unused
+ */
+static void dn_run_flush(unsigned long dummy)
+{
+	int i;
+	struct dn_route *rt, *next;
+
+	/*
+	 * The table has dn_rt_hash_mask + 1 buckets and dn_hash() can return
+	 * dn_rt_hash_mask itself, so the last bucket must be visited too.
+	 */
+	for (i = 0; i <= dn_rt_hash_mask; i++) {
+		spin_lock_bh(&dn_rt_hash_table[i].lock);
+
+		if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL)
+			goto nothing_to_declare;
+
+		for(; rt; rt = next) {
+			next = rcu_dereference_raw(rt->dst.dn_next);
+			RCU_INIT_POINTER(rt->dst.dn_next, NULL);
+			dst_free((struct dst_entry *)rt);
+		}
+
+nothing_to_declare:
+		spin_unlock_bh(&dn_rt_hash_table[i].lock);
+	}
+}
+
+/* Held while dn_rt_flush_timer and dn_rt_deadline are being updated */
+static DEFINE_SPINLOCK(dn_rt_flush_lock);
+
+/*
+ * Request a flush of the route cache.
+ *
+ * @delay: delay in jiffies before flushing; a negative value selects
+ *         dn_rt_min_delay, and a resulting delay of zero or less flushes
+ *         immediately.
+ *
+ * If a flush was already pending, the new delay is capped by the time
+ * left until dn_rt_deadline.
+ */
+void dn_rt_cache_flush(int delay)
+{
+	const unsigned long start = jiffies;
+	const int process_ctx = !in_interrupt();
+
+	if (delay < 0)
+		delay = dn_rt_min_delay;
+
+	spin_lock_bh(&dn_rt_flush_lock);
+
+	if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) {
+		long remaining = (long)(dn_rt_deadline - start);
+
+		if (process_ctx && remaining < dn_rt_max_delay - dn_rt_min_delay)
+			remaining = 0;
+
+		if (delay > remaining)
+			delay = remaining;
+	}
+
+	if (delay > 0) {
+		/* Defer: arm the timer, starting a new deadline if needed */
+		if (dn_rt_deadline == 0)
+			dn_rt_deadline = start + dn_rt_max_delay;
+
+		dn_rt_flush_timer.expires = start + delay;
+		add_timer(&dn_rt_flush_timer);
+		spin_unlock_bh(&dn_rt_flush_lock);
+		return;
+	}
+
+	spin_unlock_bh(&dn_rt_flush_lock);
+	dn_run_flush(0);
+}
+
+/**
+ * dn_return_short - Return a short packet to its sender
+ * @skb: The packet to return
+ *
+ * Restores the routing header, swaps the source and destination
+ * addresses, replaces the return-to-sender-request flag with the
+ * returning-to-sender flag and sends the packet back out.
+ *
+ * Returns NET_RX_DROP if the skb could not be unshared, otherwise
+ * NET_RX_SUCCESS.
+ */
+static int dn_return_short(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb;
+	unsigned char *hdr;
+	__le16 *dst_field;
+	__le16 *src_field;
+
+	/* Add back headers */
+	skb_push(skb, skb->data - skb_network_header(skb));
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return NET_RX_DROP;
+
+	cb = DN_SKB_CB(skb);
+
+	/* hdr[0] is the flags byte, just past the two byte packet length */
+	hdr = skb->data + 2;
+	hdr[0] = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
+
+	dst_field = (__le16 *)(hdr + 1);
+	src_field = (__le16 *)(hdr + 3);
+	hdr[5] = 0; /* Zero hop count */
+
+	swap(*src_field, *dst_field);
+
+	skb->pkt_type = PACKET_OUTGOING;
+	dn_rt_finish_output(skb, NULL, NULL);
+	return NET_RX_SUCCESS;
+}
+
+/**
+ * dn_return_long - Return a long packet to its sender
+ * @skb: The long format packet to return
+ *
+ * Restores the routing header, skips any padding, swaps the source and
+ * destination ethernet style addresses, replaces the
+ * return-to-sender-request flag with the returning-to-sender flag and
+ * sends the packet back out.
+ *
+ * Returns NET_RX_DROP if the skb could not be unshared, otherwise
+ * NET_RX_SUCCESS.
+ */
+static int dn_return_long(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb;
+	unsigned char *hdr;
+	unsigned char *dst_addr;
+	unsigned char *src_addr;
+	unsigned char saved[ETH_ALEN];
+
+	/* Add back all headers */
+	skb_push(skb, skb->data - skb_network_header(skb));
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return NET_RX_DROP;
+
+	cb = DN_SKB_CB(skb);
+
+	/* Ignore packet length and point to flags */
+	hdr = skb->data + 2;
+
+	/* Skip padding */
+	if (*hdr & DN_RT_F_PF) {
+		char padlen = (*hdr & ~DN_RT_F_PF);
+		hdr += padlen;
+	}
+
+	hdr[0] = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
+	dst_addr = hdr + 3;
+	src_addr = hdr + 11;
+	/*
+	 * NOTE(review): dn_route_rx_long() skips one more byte after the
+	 * source address before reading the visit count - confirm that this
+	 * is the intended offset.
+	 */
+	hdr[17] = 0; /* Zero hop count */
+
+	/* Swap source and destination */
+	memcpy(saved, src_addr, ETH_ALEN);
+	memcpy(src_addr, dst_addr, ETH_ALEN);
+	memcpy(dst_addr, saved, ETH_ALEN);
+
+	skb->pkt_type = PACKET_OUTGOING;
+	dn_rt_finish_output(skb, dst_addr, src_addr);
+	return NET_RX_SUCCESS;
+}
+
+/**
+ * dn_route_rx_packet - Try and find a route for an incoming packet
+ * @skb: The packet to find a route for
+ *
+ * When no route is found and the packet was addressed to this host with
+ * the return-to-sender-request flag set, the packet is sent back to its
+ * sender; otherwise it is freed.
+ *
+ * Returns: result of input function if route is found, error code otherwise
+ */
+static int dn_route_rx_packet(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	int err = dn_route_input(skb);
+
+	if (err == 0)
+		return dst_input(skb);
+
+	if (decnet_debug_level & 4) {
+		char *devname = skb->dev ? skb->dev->name : "???";
+
+		printk(KERN_DEBUG
+			"DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n",
+			(int)cb->rt_flags, devname, skb->len,
+			le16_to_cpu(cb->src), le16_to_cpu(cb->dst),
+			err, skb->pkt_type);
+	}
+
+	if (skb->pkt_type == PACKET_HOST && (cb->rt_flags & DN_RT_F_RQR)) {
+		int format = cb->rt_flags & DN_RT_PKT_MSK;
+
+		if (format == DN_RT_PKT_SHORT)
+			return dn_return_short(skb);
+		if (format == DN_RT_PKT_LONG)
+			return dn_return_long(skb);
+	}
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * Parse the long format routing header of a data packet (flags byte
+ * already pulled), record destination, source and visit count in the
+ * skb control block and pass the packet to the pre-routing hook.
+ *
+ * Packets that are too short, or whose addresses do not start with the
+ * DECnet ethernet prefix, are freed and NET_RX_DROP is returned.
+ */
+static int dn_route_rx_long(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	unsigned char *ptr;
+
+	if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */
+		goto drop_it;
+
+	/*
+	 * Take the header pointer only now: pskb_may_pull() may reallocate
+	 * the skb head, which would leave an earlier copy of skb->data
+	 * dangling.
+	 */
+	ptr = skb->data;
+
+	skb_pull(skb, 20);
+	skb_reset_transport_header(skb);
+
+	/* Destination info */
+	ptr += 2;
+	cb->dst = dn_eth2dn(ptr);
+	if (memcmp(ptr, dn_hiord_addr, 4) != 0)
+		goto drop_it;
+	ptr += 6;
+
+
+	/* Source info */
+	ptr += 2;
+	cb->src = dn_eth2dn(ptr);
+	if (memcmp(ptr, dn_hiord_addr, 4) != 0)
+		goto drop_it;
+	ptr += 6;
+	/* Other junk */
+	ptr++;
+	cb->hops = *ptr++; /* Visit Count */
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, skb, skb->dev, NULL,
+		       dn_route_rx_packet);
+
+drop_it:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+
+
+/*
+ * Parse the short format routing header of a data packet (flags byte
+ * already pulled), record destination, source and hop count in the skb
+ * control block and pass the packet to the pre-routing hook.
+ *
+ * Packets that are too short are freed and NET_RX_DROP is returned.
+ */
+static int dn_route_rx_short(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	unsigned char *ptr;
+
+	if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */
+		goto drop_it;
+
+	/*
+	 * Take the header pointer only now: pskb_may_pull() may reallocate
+	 * the skb head, which would leave an earlier copy of skb->data
+	 * dangling.
+	 */
+	ptr = skb->data;
+
+	skb_pull(skb, 5);
+	skb_reset_transport_header(skb);
+
+	cb->dst = *(__le16 *)ptr;
+	ptr += 2;
+	cb->src = *(__le16 *)ptr;
+	ptr += 2;
+	cb->hops = *ptr & 0x3f;
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING, skb, skb->dev, NULL,
+		       dn_route_rx_packet);
+
+drop_it:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * Final handler for the NF_DN_ROUTE hook (level 1 / level 2 routing
+ * messages): the packet is simply freed.
+ */
+static int dn_route_discard(struct sk_buff *skb)
+{
+	/*
+	 * I know we drop the packet here, but that's considered success in
+	 * this case
+	 */
+	kfree_skb(skb);
+	return NET_RX_SUCCESS;
+}
+
+/*
+ * Final handler for the NF_DN_HELLO hook on DN_RT_PKT_HELO packets:
+ * passes the packet to dn_dev_hello() and then to
+ * dn_neigh_pointopoint_hello(). Always returns NET_RX_SUCCESS.
+ */
+static int dn_route_ptp_hello(struct sk_buff *skb)
+{
+	dn_dev_hello(skb);
+	dn_neigh_pointopoint_hello(skb);
+	return NET_RX_SUCCESS;
+}
+
+/*
+ * Receive entry point for DECnet routing layer packets.
+ *
+ * Validates the length field, strips padding, rejects future protocol
+ * versions and then dispatches on the flags byte: control packets go to
+ * the device / neighbour handlers through the netfilter hooks, data
+ * packets to the long or short header parsers.
+ *
+ * @skb:      received packet; always consumed
+ * @dev:      device the packet arrived on
+ * @pt:       unused
+ * @orig_dev: unused
+ *
+ * Returns the result of the selected handler, or NET_RX_DROP when the
+ * packet is discarded here.
+ */
+int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dn_skb_cb *cb;
+	unsigned char flags = 0;
+	__u16 len;
+	struct dn_dev *dn = rcu_dereference(dev->dn_ptr);
+	unsigned char padlen = 0;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto dump_it;
+
+	if (dn == NULL)
+		goto dump_it;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		goto out;
+
+	if (!pskb_may_pull(skb, 3))
+		goto dump_it;
+
+	/*
+	 * Read the length field only after pskb_may_pull() has guaranteed
+	 * that it is present in the linear data area.
+	 */
+	len = le16_to_cpu(*(__le16 *)skb->data);
+
+	skb_pull(skb, 2);
+
+	if (len > skb->len)
+		goto dump_it;
+
+	skb_trim(skb, len);
+
+	flags = *skb->data;
+
+	cb = DN_SKB_CB(skb);
+	cb->stamp = jiffies;
+	cb->iif = dev->ifindex;
+
+	/*
+	 * If we have padding, remove it.
+	 */
+	if (flags & DN_RT_F_PF) {
+		padlen = flags & ~DN_RT_F_PF;
+		if (!pskb_may_pull(skb, padlen + 1))
+			goto dump_it;
+		skb_pull(skb, padlen);
+		flags = *skb->data;
+	}
+
+	skb_reset_network_header(skb);
+
+	/*
+	 * Weed out future version DECnet
+	 */
+	if (flags & DN_RT_F_VER)
+		goto dump_it;
+
+	cb->rt_flags = flags;
+
+	if (decnet_debug_level & 1)
+		printk(KERN_DEBUG
+			"dn_route_rcv: got 0x%02x from %s [%d %d %d]\n",
+			(int)flags, (dev) ? dev->name : "???", len, skb->len,
+			padlen);
+
+	if (flags & DN_RT_PKT_CNTL) {
+		if (unlikely(skb_linearize(skb)))
+			goto dump_it;
+
+		/* Init and verification messages are handled in any state */
+		switch(flags & DN_RT_CNTL_MSK) {
+			case DN_RT_PKT_INIT:
+				dn_dev_init_pkt(skb);
+				break;
+			case DN_RT_PKT_VERI:
+				dn_dev_veri_pkt(skb);
+				break;
+		}
+
+		if (dn->parms.state != DN_DEV_S_RU)
+			goto dump_it;
+
+		switch(flags & DN_RT_CNTL_MSK) {
+			case DN_RT_PKT_HELO:
+				return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
+					       skb, skb->dev, NULL,
+					       dn_route_ptp_hello);
+
+			case DN_RT_PKT_L1RT:
+			case DN_RT_PKT_L2RT:
+				return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE,
+					       skb, skb->dev, NULL,
+					       dn_route_discard);
+			case DN_RT_PKT_ERTH:
+				return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
+					       skb, skb->dev, NULL,
+					       dn_neigh_router_hello);
+
+			case DN_RT_PKT_EEDH:
+				return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
+					       skb, skb->dev, NULL,
+					       dn_neigh_endnode_hello);
+		}
+	} else {
+		if (dn->parms.state != DN_DEV_S_RU)
+			goto dump_it;
+
+		skb_pull(skb, 1); /* Pull flags */
+
+		switch(flags & DN_RT_PKT_MSK) {
+			case DN_RT_PKT_LONG:
+				return dn_route_rx_long(skb);
+			case DN_RT_PKT_SHORT:
+				return dn_route_rx_short(skb);
+		}
+	}
+
+	/* Anything not dispatched above is discarded */
+dump_it:
+	kfree_skb(skb);
+out:
+	return NET_RX_DROP;
+}
+
+/*
+ * dst output function for locally originated packets. Fills in the
+ * control block from the attached route and passes the packet through
+ * the NF_DN_LOCAL_OUT hook to the neighbour's output function.
+ *
+ * Returns the hook result, or -EINVAL (after freeing the packet) when
+ * the route has no neighbour.
+ */
+static int dn_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct dn_route *rt = (struct dn_route *)dst;
+	struct net_device *dev = dst->dev;
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct neighbour *neigh = dst_get_neighbour(dst);
+
+	if (neigh == NULL) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "dn_output: This should not happen\n");
+
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	skb->dev = dev;
+
+	cb->src = rt->rt_saddr;
+	cb->dst = rt->rt_daddr;
+
+	/*
+	 * Always set the Intra-Ethernet bit on all outgoing packets
+	 * originated on this node. Only valid flag from upper layers
+	 * is return-to-sender-requested. Set hop count to 0 too.
+	 */
+	cb->rt_flags = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_IE;
+	cb->hops = 0;
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT, skb, NULL, dev,
+		       neigh->output);
+}
+
+/*
+ * dst input function for packets that are to be forwarded. Makes sure
+ * there is headroom for the outgoing routing header, bumps the hop
+ * count and passes the packet through the NF_DN_FORWARD hook to the
+ * neighbour's output function.
+ *
+ * Packets not addressed to this host at link level, packets for which
+ * no headroom can be made, and packets whose hop count exceeds 30 are
+ * freed and NET_RX_DROP is returned.
+ */
+static int dn_forward(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct dst_entry *dst = skb_dst(skb);
+	struct dn_route *rt = (struct dn_route *)dst;
+	struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr);
+	struct neighbour *neigh = dst_get_neighbour(dst);
+	int headroom;
+#ifdef CONFIG_NETFILTER
+	struct net_device *dev = skb->dev;
+#endif
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
+	/* Ensure that we have enough space for headers */
+	headroom = LL_RESERVED_SPACE(rt->dst.dev);
+	if (dn_db->use_long)
+		headroom += 21;
+	else
+		headroom += 6;
+	if (skb_cow(skb, headroom))
+		goto drop;
+
+	/*
+	 * Hop count exceeded.
+	 */
+	cb->hops++;
+	if (cb->hops > 30)
+		goto drop;
+
+	skb->dev = rt->dst.dev;
+
+	/*
+	 * If packet goes out same interface it came in on, then set
+	 * the Intra-Ethernet bit. This has no effect for short
+	 * packets, so we don't need to test for them here.
+	 */
+	if (rt->rt_flags & RTCF_DOREDIRECT)
+		cb->rt_flags |= DN_RT_F_IE;
+	else
+		cb->rt_flags &= ~DN_RT_F_IE;
+
+	return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD, skb, dev, skb->dev,
+		       neigh->output);
+
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * Catch-all dst input function, installed on routes that are not
+ * expected to receive packets. It should never normally be called;
+ * when it is, it logs the packet's addresses (rate limited) and drops
+ * the packet.
+ */
+static int dn_rt_bug(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+
+	if (net_ratelimit())
+		printk(KERN_DEBUG "dn_rt_bug: skb from:%04x to:%04x\n",
+				le16_to_cpu(cb->src), le16_to_cpu(cb->dst));
+
+	kfree_skb(skb);
+
+	return NET_RX_DROP;
+}
+
+/*
+ * dst_ops->default_advmss hook: derives the advertised mss from the
+ * route's current mtu via dn_mss_from_pmtu().
+ */
+static unsigned int dn_dst_default_advmss(const struct dst_entry *dst)
+{
+	return dn_mss_from_pmtu(dst->dev, dst_mtu(dst));
+}
+
+/* dst_ops->default_mtu hook: the mtu of the route's output device. */
+static unsigned int dn_dst_default_mtu(const struct dst_entry *dst)
+{
+	return dst->dev->mtu;
+}
+
+/*
+ * Complete a freshly built route from a FIB lookup result: take gateway
+ * and metrics from the FIB info (if any), record the route type, attach
+ * a neighbour entry for the gateway when none is set yet, and clamp the
+ * mtu and advertised mss metrics to what the output device allows.
+ *
+ * Returns 0 on success or the error from the neighbour lookup.
+ */
+static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
+{
+	struct dst_entry *dst = &rt->dst;
+	struct net_device *dev = dst->dev;
+	struct dn_fib_info *fi = res->fi;
+	unsigned int cur_mss;
+
+	if (fi) {
+		if (DN_FIB_RES_GW(*res) &&
+		    DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+			rt->rt_gateway = DN_FIB_RES_GW(*res);
+		dst_init_metrics(dst, fi->fib_metrics, true);
+	}
+	rt->rt_type = res->type;
+
+	if (dev != NULL && dst_get_neighbour(dst) == NULL) {
+		struct neighbour *neigh;
+
+		neigh = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev);
+		if (IS_ERR(neigh))
+			return PTR_ERR(neigh);
+		dst_set_neighbour(dst, neigh);
+	}
+
+	/* The route mtu may not exceed the device mtu */
+	if (dst_metric(dst, RTAX_MTU) > dst->dev->mtu)
+		dst_metric_set(dst, RTAX_MTU, dst->dev->mtu);
+
+	/* An explicitly set mss may not exceed what the mtu permits */
+	cur_mss = dst_metric_raw(dst, RTAX_ADVMSS);
+	if (cur_mss) {
+		unsigned int limit = dn_mss_from_pmtu(dev, dst_mtu(dst));
+
+		if (cur_mss > limit)
+			dst_metric_set(dst, RTAX_ADVMSS, limit);
+	}
+	return 0;
+}
+
+/*
+ * Number of leading bits (counted from the most significant bit of the
+ * cpu-order value) that two addresses have in common: 16 for identical
+ * addresses, 0 when the top bits already differ.
+ */
+static inline int dn_match_addr(__le16 addr1, __le16 addr2)
+{
+	__u16 diff = le16_to_cpu(addr1) ^ le16_to_cpu(addr2);
+	int match;
+
+	for (match = 16; diff; diff >>= 1)
+		match--;
+
+	return match;
+}
+
+/*
+ * Pick a local source address on @dev.
+ *
+ * @dev:   device whose address list is searched
+ * @daddr: address the source should match as closely as possible; when
+ *         zero, the first address within @scope is used
+ * @scope: addresses with a scope value greater than this are skipped
+ *
+ * Returns the in-scope address sharing the longest prefix with @daddr
+ * (see dn_match_addr()). While no candidate shares even one leading bit,
+ * the most recently seen in-scope address is kept as a fallback. Returns
+ * 0 when the device has no DECnet data or no suitable address.
+ */
+static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope)
+{
+	__le16 saddr = 0;
+	struct dn_dev *dn_db;
+	struct dn_ifaddr *ifa;
+	int best_match = 0;
+	int ret;
+
+	rcu_read_lock();
+	dn_db = rcu_dereference(dev->dn_ptr);
+	if (dn_db == NULL)
+		goto out;
+	for (ifa = rcu_dereference(dn_db->ifa_list);
+	     ifa != NULL;
+	     ifa = rcu_dereference(ifa->ifa_next)) {
+		if (ifa->ifa_scope > scope)
+			continue;
+		if (!daddr) {
+			saddr = ifa->ifa_local;
+			break;
+		}
+		ret = dn_match_addr(daddr, ifa->ifa_local);
+		if (ret > best_match) {
+			saddr = ifa->ifa_local;
+			/* Remember the score so that worse matches lose */
+			best_match = ret;
+		}
+		if (best_match == 0)
+			saddr = ifa->ifa_local;
+	}
+out:
+	rcu_read_unlock();
+
+	return saddr;
+}
+
+/*
+ * Choose a source address for a FIB result: the address on the result's
+ * device that dnet_select_source() picks for the result's gateway and
+ * scope.
+ */
+static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res)
+{
+	return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope);
+}
+
+/*
+ * Map a destination address through a FIB result: the bits of @daddr
+ * outside the mask built from the result's prefix length are kept and
+ * OR-ed with the gateway address of the result's first next hop.
+ */
+static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res)
+{
+	__le16 mask = dnet_make_mask(res->prefixlen);
+	return (daddr&~mask)|res->fi->fib_nh->nh_gw;
+}
+
+/*
+ * Slow path of the output route lookup: build a new route for the flow
+ * @oldflp and insert it into the route cache.
+ *
+ * @pprt:     receives the resulting route on success
+ * @oldflp:   flow being routed (destination, source, mark, output
+ *            interface)
+ * @try_hard: when non-zero the FIB lookup and the neighbour cache are
+ *            bypassed and the endnode fallback algorithm is used directly
+ *
+ * The output device, source address and gateway are chosen, in order,
+ * from: an explicitly requested interface / source address, the FIB, the
+ * neighbour cache, and finally the default device and its router.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard)
+{
+	struct flowidn fld = {
+		.daddr = oldflp->daddr,
+		.saddr = oldflp->saddr,
+		.flowidn_scope = RT_SCOPE_UNIVERSE,
+		.flowidn_mark = oldflp->flowidn_mark,
+		.flowidn_iif = init_net.loopback_dev->ifindex,
+		.flowidn_oif = oldflp->flowidn_oif,
+	};
+	struct dn_route *rt = NULL;
+	struct net_device *dev_out = NULL, *dev;
+	struct neighbour *neigh = NULL;
+	unsigned hash;
+	unsigned flags = 0;
+	struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST };
+	int err;
+	int free_res = 0;
+	__le16 gateway = 0;
+
+	if (decnet_debug_level & 16)
+		printk(KERN_DEBUG
+		       "dn_route_output_slow: dst=%04x src=%04x mark=%d"
+		       " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr),
+		       le16_to_cpu(oldflp->saddr),
+		       oldflp->flowidn_mark, init_net.loopback_dev->ifindex,
+		       oldflp->flowidn_oif);
+
+	/* If we have an output interface, verify it's a DECnet device */
+	if (oldflp->flowidn_oif) {
+		dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif);
+		err = -ENODEV;
+		if (dev_out && dev_out->dn_ptr == NULL) {
+			dev_put(dev_out);
+			dev_out = NULL;
+		}
+		if (dev_out == NULL)
+			goto out;
+	}
+
+	/* If we have a source address, verify that it's a local address */
+	if (oldflp->saddr) {
+		err = -EADDRNOTAVAIL;
+
+		if (dev_out) {
+			if (dn_dev_islocal(dev_out, oldflp->saddr))
+				goto source_ok;
+			dev_put(dev_out);
+			goto out;
+		}
+		rcu_read_lock();
+		for_each_netdev_rcu(&init_net, dev) {
+			if (!dev->dn_ptr)
+				continue;
+			if (!dn_dev_islocal(dev, oldflp->saddr))
+				continue;
+			if ((dev->flags & IFF_LOOPBACK) &&
+			    oldflp->daddr &&
+			    !dn_dev_islocal(dev, oldflp->daddr))
+				continue;
+
+			dev_out = dev;
+			break;
+		}
+		rcu_read_unlock();
+		if (dev_out == NULL)
+			goto out;
+		dev_hold(dev_out);
+source_ok:
+		;
+	}
+
+	/* No destination? Assume it's local */
+	if (!fld.daddr) {
+		fld.daddr = fld.saddr;
+
+		err = -EADDRNOTAVAIL;
+		if (dev_out)
+			dev_put(dev_out);
+		dev_out = init_net.loopback_dev;
+		dev_hold(dev_out);
+		if (!fld.daddr) {
+			fld.daddr =
+			fld.saddr = dnet_select_source(dev_out, 0,
+						       RT_SCOPE_HOST);
+			if (!fld.daddr)
+				goto out;
+		}
+		fld.flowidn_oif = init_net.loopback_dev->ifindex;
+		res.type = RTN_LOCAL;
+		goto make_route;
+	}
+
+	if (decnet_debug_level & 16)
+		printk(KERN_DEBUG
+		       "dn_route_output_slow: initial checks complete."
+		       " dst=%04x src=%04x oif=%d try_hard=%d\n",
+		       le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr),
+		       fld.flowidn_oif, try_hard);
+
+	/*
+	 * N.B. If the kernel is compiled without router support then
+	 * dn_fib_lookup() will evaluate to non-zero so this if () block
+	 * will always be executed.
+	 */
+	err = -ESRCH;
+	if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) {
+		struct dn_dev *dn_db;
+		if (err != -ESRCH)
+			goto out;
+		/*
+		 * Here the fallback is basically the standard algorithm for
+		 * routing in endnodes which is described in the DECnet routing
+		 * docs
+		 *
+		 * If we are not trying hard, look in neighbour cache.
+		 * The result is tested to ensure that if a specific output
+		 * device/source address was requested, then we honour that
+		 * here
+		 */
+		if (!try_hard) {
+			neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr);
+			if (neigh) {
+				if ((oldflp->flowidn_oif &&
+				    (neigh->dev->ifindex != oldflp->flowidn_oif)) ||
+				    (oldflp->saddr &&
+				    (!dn_dev_islocal(neigh->dev,
+						     oldflp->saddr)))) {
+					neigh_release(neigh);
+					neigh = NULL;
+				} else {
+					if (dev_out)
+						dev_put(dev_out);
+					if (dn_dev_islocal(neigh->dev, fld.daddr)) {
+						dev_out = init_net.loopback_dev;
+						res.type = RTN_LOCAL;
+					} else {
+						dev_out = neigh->dev;
+					}
+					dev_hold(dev_out);
+					goto select_source;
+				}
+			}
+		}
+
+		/* Not there? Perhaps it's a local address */
+		if (dev_out == NULL)
+			dev_out = dn_dev_get_default();
+		err = -ENODEV;
+		if (dev_out == NULL)
+			goto out;
+		dn_db = rcu_dereference_raw(dev_out->dn_ptr);
+		/* The device may have lost its DECnet data in the meantime */
+		if (dn_db == NULL)
+			goto e_inval;
+		/* Possible improvement - check all devices for local addr */
+		if (dn_dev_islocal(dev_out, fld.daddr)) {
+			dev_put(dev_out);
+			dev_out = init_net.loopback_dev;
+			dev_hold(dev_out);
+			res.type = RTN_LOCAL;
+			goto select_source;
+		}
+		/* Not local either.... try sending it to the default router */
+		neigh = neigh_clone(dn_db->router);
+		BUG_ON(neigh && neigh->dev != dev_out);
+
+		/* Ok then, we assume it's directly connected and move on */
+select_source:
+		if (neigh)
+			gateway = ((struct dn_neigh *)neigh)->addr;
+		if (gateway == 0)
+			gateway = fld.daddr;
+		if (fld.saddr == 0) {
+			fld.saddr = dnet_select_source(dev_out, gateway,
+						       res.type == RTN_LOCAL ?
+						       RT_SCOPE_HOST :
+						       RT_SCOPE_LINK);
+			if (fld.saddr == 0 && res.type != RTN_LOCAL)
+				goto e_addr;
+		}
+		fld.flowidn_oif = dev_out->ifindex;
+		goto make_route;
+	}
+	/* The FIB lookup succeeded: res must be released on the way out */
+	free_res = 1;
+
+	if (res.type == RTN_NAT)
+		goto e_inval;
+
+	if (res.type == RTN_LOCAL) {
+		if (!fld.saddr)
+			fld.saddr = fld.daddr;
+		if (dev_out)
+			dev_put(dev_out);
+		dev_out = init_net.loopback_dev;
+		dev_hold(dev_out);
+		fld.flowidn_oif = dev_out->ifindex;
+		if (res.fi)
+			dn_fib_info_put(res.fi);
+		res.fi = NULL;
+		goto make_route;
+	}
+
+	if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
+		dn_fib_select_multipath(&fld, &res);
+
+	/*
+	 * We could add some logic to deal with default routes here and
+	 * get rid of some of the special casing above.
+	 */
+
+	if (!fld.saddr)
+		fld.saddr = DN_FIB_RES_PREFSRC(res);
+
+	if (dev_out)
+		dev_put(dev_out);
+	dev_out = DN_FIB_RES_DEV(res);
+	dev_hold(dev_out);
+	fld.flowidn_oif = dev_out->ifindex;
+	gateway = DN_FIB_RES_GW(res);
+
+make_route:
+	if (dev_out->flags & IFF_LOOPBACK)
+		flags |= RTCF_LOCAL;
+
+	rt = dst_alloc(&dn_dst_ops, dev_out, 1, 0, DST_HOST);
+	if (rt == NULL)
+		goto e_nobufs;
+
+	/* The cache key is the flow as requested, not as resolved */
+	memset(&rt->fld, 0, sizeof(rt->fld));
+	rt->fld.saddr        = oldflp->saddr;
+	rt->fld.daddr        = oldflp->daddr;
+	rt->fld.flowidn_oif  = oldflp->flowidn_oif;
+	rt->fld.flowidn_iif  = 0;
+	rt->fld.flowidn_mark = oldflp->flowidn_mark;
+
+	rt->rt_saddr      = fld.saddr;
+	rt->rt_daddr      = fld.daddr;
+	rt->rt_gateway    = gateway ? gateway : fld.daddr;
+	rt->rt_local_src  = fld.saddr;
+
+	rt->rt_dst_map    = fld.daddr;
+	rt->rt_src_map    = fld.saddr;
+
+	/* The route takes over our neighbour reference (if any) */
+	dst_set_neighbour(&rt->dst, neigh);
+	neigh = NULL;
+
+	rt->dst.lastuse = jiffies;
+	rt->dst.output  = dn_output;
+	rt->dst.input   = dn_rt_bug;
+	rt->rt_flags      = flags;
+	if (flags & RTCF_LOCAL)
+		rt->dst.input = dn_nsp_rx;
+
+	err = dn_rt_set_next_hop(rt, &res);
+	if (err)
+		goto e_neighbour;
+
+	hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+	dn_insert_route(rt, hash, (struct dn_route **)pprt);
+
+done:
+	if (neigh)
+		neigh_release(neigh);
+	if (free_res)
+		dn_fib_res_put(&res);
+	if (dev_out)
+		dev_put(dev_out);
+out:
+	return err;
+
+e_addr:
+	err = -EADDRNOTAVAIL;
+	goto done;
+e_inval:
+	err = -EINVAL;
+	goto done;
+e_nobufs:
+	err = -ENOBUFS;
+	goto done;
+e_neighbour:
+	dst_free(&rt->dst);
+	goto e_nobufs;
+}
+
+
+/*
+ * Output route lookup. Unless MSG_TRYHARD is set in @flags, the route
+ * cache is searched first for an output route whose destination, source,
+ * mark and output interface match @flp; on a miss (or with MSG_TRYHARD)
+ * the slow path builds a new route.
+ *
+ * N.B. The flags may be moved into the flowi at some future stage.
+ */
+static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags)
+{
+	unsigned hash = dn_hash(flp->saddr, flp->daddr);
+	struct dn_route *rt = NULL;
+
+	if (flags & MSG_TRYHARD)
+		return dn_route_output_slow(pprt, flp, flags);
+
+	rcu_read_lock_bh();
+	for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
+		rt = rcu_dereference_bh(rt->dst.dn_next)) {
+		if (flp->daddr != rt->fld.daddr)
+			continue;
+		if (flp->saddr != rt->fld.saddr)
+			continue;
+		if (flp->flowidn_mark != rt->fld.flowidn_mark)
+			continue;
+		if (!dn_is_output_route(rt))
+			continue;
+		if (rt->fld.flowidn_oif != flp->flowidn_oif)
+			continue;
+
+		/* Cache hit */
+		dst_use(&rt->dst, jiffies);
+		rcu_read_unlock_bh();
+		*pprt = &rt->dst;
+		return 0;
+	}
+	rcu_read_unlock_bh();
+
+	return dn_route_output_slow(pprt, flp, flags);
+}
+
+/*
+ * Resolve an output route for @flp and, when the flow names a protocol,
+ * pass the result through xfrm_lookup() (without a socket).
+ *
+ * Returns 0 with the route in *@pprt, or a negative errno.  When
+ * xfrm_lookup() fails, *@pprt is set to NULL.
+ */
+static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags)
+{
+	int err = __dn_route_output_key(pprt, flp, flags);
+
+	/* Lookup failed, or nothing for xfrm to look at. */
+	if (err || !flp->flowidn_proto)
+		return err;
+
+	*pprt = xfrm_lookup(&init_net, *pprt, flowidn_to_flowi(flp), NULL, 0);
+	if (!IS_ERR(*pprt))
+		return 0;
+
+	err = PTR_ERR(*pprt);
+	*pprt = NULL;
+	return err;
+}
+
+/*
+ * Socket flavour of the output route lookup.
+ *
+ * Only the MSG_TRYHARD bit of @flags reaches the route lookup itself.
+ * When the flow names a protocol the route is passed through
+ * xfrm_lookup() on behalf of @sk; unless MSG_DONTWAIT was given the flow
+ * is first marked FLOWI_FLAG_CAN_SLEEP.
+ *
+ * Returns 0 with the route in *@pprt, or a negative errno.  When
+ * xfrm_lookup() fails, *@pprt is set to NULL.
+ */
+int dn_route_output_sock(struct dst_entry **pprt, struct flowidn *fl, struct sock *sk, int flags)
+{
+	int err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD);
+
+	if (err || !fl->flowidn_proto)
+		return err;
+
+	if (!(flags & MSG_DONTWAIT))
+		fl->flowidn_flags |= FLOWI_FLAG_CAN_SLEEP;
+
+	*pprt = xfrm_lookup(&init_net, *pprt, flowidn_to_flowi(fl), sk, 0);
+	if (!IS_ERR(*pprt))
+		return 0;
+
+	err = PTR_ERR(*pprt);
+	*pprt = NULL;
+	return err;
+}
+
+/*
+ * Slow path of input routing: build a route for @skb and cache it.
+ *
+ * The source address is validated (non-zero, not one of our own), the FIB
+ * is consulted through dn_fib_lookup(), and a dn_route is allocated,
+ * filled in according to the result type, inserted into the route cache
+ * and attached to @skb.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL, -ENOTUNIQ, -ENOBUFS,
+ * or whatever dn_fib_lookup()/dn_rt_set_next_hop() reported).
+ *
+ * Reference handling: in_dev is held for the whole function, so every
+ * exit taken after dev_hold() must leave through "done", which drops the
+ * device, FIB result and neighbour references that are still owned here.
+ */
+static int dn_route_input_slow(struct sk_buff *skb)
+{
+	struct dn_route *rt = NULL;
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	struct net_device *in_dev = skb->dev;
+	struct net_device *out_dev = NULL;
+	struct dn_dev *dn_db;
+	struct neighbour *neigh = NULL;
+	unsigned hash;
+	int flags = 0;
+	__le16 gateway = 0;
+	__le16 local_src = 0;
+	struct flowidn fld = {
+		.daddr = cb->dst,
+		.saddr = cb->src,
+		.flowidn_scope = RT_SCOPE_UNIVERSE,
+		.flowidn_mark = skb->mark,
+		.flowidn_iif = skb->dev->ifindex,
+	};
+	struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE };
+	int err = -EINVAL;
+	int free_res = 0;
+
+	dev_hold(in_dev);
+
+	/*
+	 * The early failures below used to return without dev_put(in_dev),
+	 * leaking one device reference per rejected packet.  They now go
+	 * through "done"; neigh, free_res and out_dev are still unset at
+	 * this point, so only in_dev is released there.
+	 */
+	if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL)
+		goto done;
+
+	/* Zero source addresses are not allowed */
+	if (fld.saddr == 0)
+		goto done;
+
+	/*
+	 * In this case we've just received a packet from a source
+	 * outside ourselves pretending to come from us. We don't
+	 * allow it any further to prevent routing loops, spoofing and
+	 * other nasties. Loopback packets already have the dst attached
+	 * so this only affects packets which have originated elsewhere.
+	 */
+	err  = -ENOTUNIQ;
+	if (dn_dev_islocal(in_dev, cb->src))
+		goto done;
+
+	err = dn_fib_lookup(&fld, &res);
+	if (err) {
+		if (err != -ESRCH)
+			goto done;
+		/*
+		 * Is the destination us ?
+		 */
+		if (!dn_dev_islocal(in_dev, cb->dst))
+			goto e_inval;
+
+		res.type = RTN_LOCAL;
+	} else {
+		__le16 src_map = fld.saddr;
+		free_res = 1;
+
+		out_dev = DN_FIB_RES_DEV(res);
+		if (out_dev == NULL) {
+			if (net_ratelimit())
+				printk(KERN_CRIT "Bug in dn_route_input_slow() "
+						 "No output device\n");
+			goto e_inval;
+		}
+		dev_hold(out_dev);
+
+		if (res.r)
+			src_map = fld.saddr; /* no NAT support for now */
+
+		gateway = DN_FIB_RES_GW(res);
+		if (res.type == RTN_NAT) {
+			/* Re-run the lookup on the mapped destination. */
+			fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res);
+			dn_fib_res_put(&res);
+			free_res = 0;
+			if (dn_fib_lookup(&fld, &res))
+				goto e_inval;
+			free_res = 1;
+			if (res.type != RTN_UNICAST)
+				goto e_inval;
+			flags |= RTCF_DNAT;
+			gateway = fld.daddr;
+		}
+		fld.saddr = src_map;
+	}
+
+	switch(res.type) {
+	case RTN_UNICAST:
+		/*
+		 * Forwarding check here, we only check for forwarding
+		 * being turned off, if you want to only forward intra
+		 * area, its up to you to set the routing tables up
+		 * correctly.
+		 */
+		if (dn_db->parms.forwarding == 0)
+			goto e_inval;
+
+		if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
+			dn_fib_select_multipath(&fld, &res);
+
+		/*
+		 * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT
+		 * flag as a hint to set the intra-ethernet bit when
+		 * forwarding. If we've got NAT in operation, we don't do
+		 * this optimisation.
+		 */
+		if (out_dev == in_dev && !(flags & RTCF_NAT))
+			flags |= RTCF_DOREDIRECT;
+
+		local_src = DN_FIB_RES_PREFSRC(res);
+
+		/* fall through */
+	case RTN_BLACKHOLE:
+	case RTN_UNREACHABLE:
+		break;
+	case RTN_LOCAL:
+		/* Local delivery: the route is built in the reply direction. */
+		flags |= RTCF_LOCAL;
+		fld.saddr = cb->dst;
+		fld.daddr = cb->src;
+
+		/* Routing tables gave us a gateway */
+		if (gateway)
+			goto make_route;
+
+		/* Packet was intra-ethernet, so we know its on-link */
+		if (cb->rt_flags & DN_RT_F_IE) {
+			gateway = cb->src;
+			flags |= RTCF_DIRECTSRC;
+			goto make_route;
+		}
+
+		/* Use the default router if there is one */
+		neigh = neigh_clone(dn_db->router);
+		if (neigh) {
+			gateway = ((struct dn_neigh *)neigh)->addr;
+			goto make_route;
+		}
+
+		/* Close eyes and pray */
+		gateway = cb->src;
+		flags |= RTCF_DIRECTSRC;
+		goto make_route;
+	default:
+		goto e_inval;
+	}
+
+make_route:
+	rt = dst_alloc(&dn_dst_ops, out_dev, 0, 0, DST_HOST);
+	if (rt == NULL)
+		goto e_nobufs;
+
+	memset(&rt->fld, 0, sizeof(rt->fld));
+	rt->rt_saddr      = fld.saddr;
+	rt->rt_daddr      = fld.daddr;
+	rt->rt_gateway    = fld.daddr;
+	if (gateway)
+		rt->rt_gateway = gateway;
+	rt->rt_local_src  = local_src ? local_src : rt->rt_saddr;
+
+	rt->rt_dst_map    = fld.daddr;
+	rt->rt_src_map    = fld.saddr;
+
+	/* The cache key is the packet as received, not the (reversed) route. */
+	rt->fld.saddr        = cb->src;
+	rt->fld.daddr        = cb->dst;
+	rt->fld.flowidn_oif  = 0;
+	rt->fld.flowidn_iif  = in_dev->ifindex;
+	rt->fld.flowidn_mark = fld.flowidn_mark;
+
+	/*
+	 * Hand the neighbour reference over to the dst, exactly as
+	 * dn_route_output_slow() does, so that "done" does not drop a
+	 * reference the route still relies on.
+	 */
+	dst_set_neighbour(&rt->dst, neigh);
+	neigh = NULL;
+
+	rt->dst.lastuse = jiffies;
+	rt->dst.output = dn_rt_bug;
+	switch(res.type) {
+		case RTN_UNICAST:
+			rt->dst.input = dn_forward;
+			break;
+		case RTN_LOCAL:
+			rt->dst.output = dn_output;
+			rt->dst.input = dn_nsp_rx;
+			rt->dst.dev = in_dev;
+			flags |= RTCF_LOCAL;
+			break;
+		default:
+		case RTN_UNREACHABLE:
+		case RTN_BLACKHOLE:
+			rt->dst.input = dst_discard;
+	}
+	rt->rt_flags = flags;
+
+	err = dn_rt_set_next_hop(rt, &res);
+	if (err)
+		goto e_neighbour;
+
+	hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
+	dn_insert_route(rt, hash, &rt);
+	skb_dst_set(skb, &rt->dst);
+
+done:
+	if (neigh)
+		neigh_release(neigh);
+	if (free_res)
+		dn_fib_res_put(&res);
+	dev_put(in_dev);
+	if (out_dev)
+		dev_put(out_dev);
+	return err;
+
+e_inval:
+	err = -EINVAL;
+	goto done;
+
+e_nobufs:
+	err = -ENOBUFS;
+	goto done;
+
+e_neighbour:
+	/* err already holds the dn_rt_set_next_hop() failure code. */
+	dst_free(&rt->dst);
+	goto done;
+}
+
+/*
+ * Attach an input route to @skb.
+ *
+ * Returns 0 immediately if the skb already has a dst.  Otherwise the
+ * route cache bucket for (cb->src, cb->dst) is searched under
+ * rcu_read_lock() for an input entry (flowidn_oif == 0) with the same
+ * addresses, mark and input interface; a hit is attached to the skb and
+ * 0 is returned.  On a miss the result of dn_route_input_slow() is
+ * returned.
+ */
+static int dn_route_input(struct sk_buff *skb)
+{
+	struct dn_skb_cb *cb = DN_SKB_CB(skb);
+	unsigned hash = dn_hash(cb->src, cb->dst);
+	struct dn_route *rt;
+
+	if (skb_dst(skb))
+		return 0;
+
+	rcu_read_lock();
+	rt = rcu_dereference(dn_rt_hash_table[hash].chain);
+	for (; rt != NULL; rt = rcu_dereference(rt->dst.dn_next)) {
+		if (rt->fld.saddr != cb->src ||
+		    rt->fld.daddr != cb->dst ||
+		    rt->fld.flowidn_oif != 0 ||
+		    rt->fld.flowidn_mark != skb->mark ||
+		    rt->fld.flowidn_iif != cb->iif)
+			continue;
+
+		/* Cache hit. */
+		dst_use(&rt->dst, jiffies);
+		rcu_read_unlock();
+		skb_dst_set(skb, (struct dst_entry *)rt);
+		return 0;
+	}
+	rcu_read_unlock();
+
+	return dn_route_input_slow(skb);
+}
+
+/*
+ * Append an rtnetlink route message describing the route attached to
+ * @skb (taken from skb_dst(skb)) at the current tail of @skb.
+ *
+ * @pid, @seq, @event and @flags are passed to NLMSG_NEW() for the message
+ * header.  @nowait is not used by this function.
+ *
+ * Returns skb->len on success.  On failure the skb is trimmed back to
+ * where its tail was on entry and -1 is returned.
+ */
+static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+			   int event, int nowait, unsigned int flags)
+{
+	struct dn_route *rt = (struct dn_route *)skb_dst(skb);
+	struct rtmsg *r;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
+	long expires;
+
+	/*
+	 * NOTE(review): NLMSG_NEW() and the RTA_PUT*() macros presumably jump
+	 * to the nlmsg_failure / rtattr_failure labels below when the skb runs
+	 * out of room; no explicit goto references nlmsg_failure here.
+	 */
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
+	r = NLMSG_DATA(nlh);
+	r->rtm_family = AF_DECnet;
+	r->rtm_dst_len = 16;
+	r->rtm_src_len = 0;
+	r->rtm_tos = 0;
+	r->rtm_table = RT_TABLE_MAIN;
+	RTA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
+	r->rtm_type = rt->rt_type;
+	/* Keep only the upper bits of rt_flags and always report a cloned route. */
+	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+	r->rtm_scope = RT_SCOPE_UNIVERSE;
+	r->rtm_protocol = RTPROT_UNSPEC;
+	if (rt->rt_flags & RTCF_NOTIFY)
+		r->rtm_flags |= RTM_F_NOTIFY;
+	RTA_PUT(skb, RTA_DST, 2, &rt->rt_daddr);
+	if (rt->fld.saddr) {
+		r->rtm_src_len = 16;
+		RTA_PUT(skb, RTA_SRC, 2, &rt->fld.saddr);
+	}
+	if (rt->dst.dev)
+		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->dst.dev->ifindex);
+	/*
+	 * Note to self - change this if input routes reverse direction when
+	 * they deal only with inputs and not with replies like they do
+	 * currently.
+	 */
+	RTA_PUT(skb, RTA_PREFSRC, 2, &rt->rt_local_src);
+	/* The gateway is only reported when it differs from the destination. */
+	if (rt->rt_daddr != rt->rt_gateway)
+		RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway);
+	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+		goto rtattr_failure;
+	/* Relative expiry in jiffies; 0 when the dst has no expiry set. */
+	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires,
+			       rt->dst.error) < 0)
+		goto rtattr_failure;
+	if (dn_is_input_route(rt))
+		RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fld.flowidn_iif);
+
+	/* Final message length: everything added since entry. */
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	/* Undo the partially built message. */
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/*
+ * RTM_GETROUTE handler.  This is called by both endnodes and routers now.
+ *
+ * A flow is built from the RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF
+ * attributes in @arg (an array of struct rtattr pointers indexed by
+ * attribute type - 1).  With an input interface the flow is resolved via
+ * dn_route_input() on a scratch skb, otherwise via dn_route_output_key().
+ * The resulting route is formatted with dn_rt_fill_info() and unicast
+ * back to the requester.
+ *
+ * Returns the rtnl_unicast() result once a reply has been built.
+ * Otherwise returns -EINVAL (foreign namespace or truncated attribute),
+ * -ENOBUFS, -ENODEV, the routing error, -EMSGSIZE, or 0 when
+ * dn_rt_fill_info() returned 0.
+ */
+static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct rtattr **rta = arg;
+	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct dn_route *rt = NULL;
+	struct dn_skb_cb *cb;
+	int err;
+	struct sk_buff *skb;
+	struct flowidn fld;
+
+	if (!net_eq(net, &init_net))
+		return -EINVAL;
+
+	/*
+	 * The attributes are copied with fixed sizes below, so refuse any
+	 * that carry less payload than will be read from them.
+	 */
+	if ((rta[RTA_SRC-1] && RTA_PAYLOAD(rta[RTA_SRC-1]) < 2) ||
+	    (rta[RTA_DST-1] && RTA_PAYLOAD(rta[RTA_DST-1]) < 2) ||
+	    (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) < sizeof(int)) ||
+	    (rta[RTA_OIF-1] && RTA_PAYLOAD(rta[RTA_OIF-1]) < sizeof(int)))
+		return -EINVAL;
+
+	memset(&fld, 0, sizeof(fld));
+	fld.flowidn_proto = DNPROTO_NSP;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		return -ENOBUFS;
+	skb_reset_mac_header(skb);
+	cb = DN_SKB_CB(skb);
+
+	if (rta[RTA_SRC-1])
+		memcpy(&fld.saddr, RTA_DATA(rta[RTA_SRC-1]), 2);
+	if (rta[RTA_DST-1])
+		memcpy(&fld.daddr, RTA_DATA(rta[RTA_DST-1]), 2);
+	if (rta[RTA_IIF-1])
+		memcpy(&fld.flowidn_iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
+
+	if (fld.flowidn_iif) {
+		/* Input route: pretend the scratch skb arrived on that device. */
+		struct net_device *dev;
+		if ((dev = dev_get_by_index(&init_net, fld.flowidn_iif)) == NULL) {
+			kfree_skb(skb);
+			return -ENODEV;
+		}
+		if (!dev->dn_ptr) {
+			dev_put(dev);
+			kfree_skb(skb);
+			return -ENODEV;
+		}
+		skb->protocol = htons(ETH_P_DNA_RT);
+		skb->dev = dev;
+		cb->src = fld.saddr;
+		cb->dst = fld.daddr;
+		local_bh_disable();
+		err = dn_route_input(skb);
+		local_bh_enable();
+		memset(cb, 0, sizeof(struct dn_skb_cb));
+		rt = (struct dn_route *)skb_dst(skb);
+		/* A route that carries an error is reported as that error. */
+		if (!err && rt->dst.error)
+			err = rt->dst.error;
+	} else {
+		/* Output route lookup, optionally bound to RTA_OIF. */
+		int oif = 0;
+		if (rta[RTA_OIF - 1])
+			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
+		fld.flowidn_oif = oif;
+		err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0);
+	}
+
+	/* Drop the device reference taken by dev_get_by_index() above. */
+	if (skb->dev)
+		dev_put(skb->dev);
+	skb->dev = NULL;
+	if (err)
+		goto out_free;
+	skb_dst_set(skb, &rt->dst);
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
+		rt->rt_flags |= RTCF_NOTIFY;
+
+	err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
+
+	if (err == 0)
+		goto out_free;
+	if (err < 0) {
+		err = -EMSGSIZE;
+		goto out_free;
+	}
+
+	return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
+
+out_free:
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Dump the route cache into a netlink dump reply.
+ *
+ * For routers, this is called from dn_fib_dump, but for endnodes it is
+ * called directly from the rtnetlink dispatch table.
+ *
+ * cb->args[0] and cb->args[1] carry the resume point between calls: the
+ * hash bucket and the index of the entry within that bucket.
+ *
+ * Returns 0 for foreign network namespaces and for requests without
+ * RTM_F_CLONED, -EINVAL when the request is too short to hold a struct
+ * rtmsg, and skb->len otherwise.
+ */
+int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct dn_route *rt;
+	int h, s_h;
+	int idx, s_idx;
+
+	if (!net_eq(net, &init_net))
+		return 0;
+
+	if (NLMSG_PAYLOAD(cb->nlh, 0) < sizeof(struct rtmsg))
+		return -EINVAL;
+	/* Only requests that ask for cloned (cache) routes are answered. */
+	if (!(((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED))
+		return 0;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+	for(h = 0; h <= dn_rt_hash_mask; h++) {
+		if (h < s_h)
+			continue;
+		/* Past the resume bucket: start later buckets from their head. */
+		if (h > s_h)
+			s_idx = 0;
+		rcu_read_lock_bh();
+		for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
+			rt;
+			rt = rcu_dereference_bh(rt->dst.dn_next), idx++) {
+			if (idx < s_idx)
+				continue;
+			/* dn_rt_fill_info() reads the route from skb_dst(skb). */
+			skb_dst_set(skb, dst_clone(&rt->dst));
+			if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+					cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+					1, NLM_F_MULTI) <= 0) {
+				/* Stop here; h and idx below record where to resume. */
+				skb_dst_drop(skb);
+				rcu_read_unlock_bh();
+				goto done;
+			}
+			skb_dst_drop(skb);
+		}
+		rcu_read_unlock_bh();
+	}
+
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+	return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Per-open iterator state of the route cache seq_file. */
+struct dn_rt_cache_iter_state {
+	int bucket;	/* dn_rt_hash_table index currently being walked */
+};
+
+/*
+ * Find the first cache entry, scanning buckets from dn_rt_hash_mask down
+ * to 0 and recording the bucket in the iterator state.
+ *
+ * When an entry is returned, rcu_read_lock_bh() is still held; when NULL
+ * is returned, no lock is held.
+ */
+static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq)
+{
+	struct dn_rt_cache_iter_state *s = seq->private;
+	struct dn_route *rt = NULL;
+
+	s->bucket = dn_rt_hash_mask;
+	while (s->bucket >= 0) {
+		rcu_read_lock_bh();
+		rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
+		if (rt)
+			return rt;	/* lock intentionally kept */
+		rcu_read_unlock_bh();
+		--s->bucket;
+	}
+	return rt;
+}
+
+/*
+ * Step from @rt to the next cache entry, moving on to lower-numbered
+ * buckets when the current chain is exhausted.
+ *
+ * Called with rcu_read_lock_bh() held.  The lock is still held when an
+ * entry is returned and has been dropped when NULL is returned.
+ */
+static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt)
+{
+	struct dn_rt_cache_iter_state *s = seq->private;
+	struct dn_route *next = rcu_dereference_bh(rt->dst.dn_next);
+
+	for (;;) {
+		if (next)
+			return next;
+
+		/* End of this chain: release it and try the bucket below. */
+		rcu_read_unlock_bh();
+		if (--s->bucket < 0)
+			return NULL;
+		rcu_read_lock_bh();
+		next = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
+	}
+}
+
+/*
+ * seq_file ->start: return the cache entry at record index *@pos, or NULL
+ * when the cache holds fewer entries than that.
+ *
+ * The skip count is kept in a local variable.  *@pos belongs to the
+ * seq_file core, which uses it as the position for the following
+ * ->next calls; counting it down here (as the code used to do) made
+ * later reads of a long listing restart from the wrong record.
+ *
+ * Locking follows dn_rt_cache_get_first()/dn_rt_cache_get_next(): the
+ * RCU-bh read lock is held exactly when a non-NULL entry is returned.
+ */
+static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct dn_route *rt = dn_rt_cache_get_first(seq);
+	loff_t skip = *pos;
+
+	while (rt && skip) {
+		rt = dn_rt_cache_get_next(seq, rt);
+		if (rt)
+			--skip;
+	}
+	return rt;
+}
+
+/*
+ * seq_file ->next: bump the position and return the entry following @v
+ * (NULL at the end of the cache).
+ */
+static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return dn_rt_cache_get_next(seq, v);
+}
+
+/*
+ * seq_file ->stop: drop the RCU-bh read lock, but only if the iteration
+ * stopped on an entry (@v non-NULL).
+ */
+static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		return;
+	rcu_read_unlock_bh();
+}
+
+/*
+ * seq_file ->show: print one cache entry as
+ * "<device> <dest> <source> <refcnt> <use> <rtt metric>".
+ * A route without a device is shown with "*".  Always returns 0.
+ */
+static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+	char dst_asc[DN_ASCBUF_LEN], src_asc[DN_ASCBUF_LEN];
+	struct dn_route *rt = v;
+	const char *devname = "*";
+
+	if (rt->dst.dev)
+		devname = rt->dst.dev->name;
+
+	seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
+		   devname,
+		   dn_addr2asc(le16_to_cpu(rt->rt_daddr), dst_asc),
+		   dn_addr2asc(le16_to_cpu(rt->rt_saddr), src_asc),
+		   atomic_read(&rt->dst.__refcnt),
+		   rt->dst.__use,
+		   (int) dst_metric(&rt->dst, RTAX_RTT));
+	return 0;
+}
+
+/* Iterator callbacks used by the route cache seq_file. */
+static const struct seq_operations dn_rt_cache_seq_ops = {
+	.start	= dn_rt_cache_seq_start,
+	.next	= dn_rt_cache_seq_next,
+	.stop	= dn_rt_cache_seq_stop,
+	.show	= dn_rt_cache_seq_show,
+};
+
+/*
+ * ->open: set up a seq_file whose private data is a
+ * struct dn_rt_cache_iter_state.
+ */
+static int dn_rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+	int psize = sizeof(struct dn_rt_cache_iter_state);
+
+	return seq_open_private(file, &dn_rt_cache_seq_ops, psize);
+}
+
+/* File operations of the "decnet_cache" proc entry created in dn_route_init(). */
+static const struct file_operations dn_rt_cache_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = dn_rt_cache_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,	/* pairs with seq_open_private() */
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Boot-time initialisation of the DECnet route cache.
+ *
+ * Creates the dst slab cache, starts the garbage collection timer, sizes
+ * and allocates the route hash table, creates the "decnet_cache" proc
+ * entry and registers the RTM_GETROUTE handlers.  Panics if no hash table
+ * can be allocated at all.
+ */
+void __init dn_route_init(void)
+{
+	int i, goal, order;
+
+	dn_dst_ops.kmem_cachep =
+		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	dst_entries_init(&dn_dst_ops);
+	setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
+	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
+	add_timer(&dn_route_timer);
+
+	/* Scale the table with memory: derive the page goal from totalram_pages. */
+	goal = totalram_pages >> (26 - PAGE_SHIFT);
+
+	/* Smallest order whose page count reaches the goal. */
+	for(order = 0; (1UL << order) < goal; order++)
+		/* NOTHING */;
+
+	/*
+	 * Only want 1024 entries max, since the table is very, very unlikely
+	 * to be larger than that.  Orders that would give 2048 or more
+	 * buckets are reduced here; the power-of-two rounding below then
+	 * brings whatever is left down to at most 1024.
+	 */
+	while(order && ((((1UL << order) * PAGE_SIZE) /
+				sizeof(struct dn_rt_hash_bucket)) >= 2048))
+		order--;
+
+	/*
+	 * Try successively smaller allocations, down to and including a
+	 * single page (order 0).  The previous "--order > 0" test gave up
+	 * one step early and never retried order 0.
+	 */
+	do {
+		dn_rt_hash_mask = (1UL << order) * PAGE_SIZE /
+			sizeof(struct dn_rt_hash_bucket);
+		/* Round the bucket count down to a power of two. */
+		while(dn_rt_hash_mask & (dn_rt_hash_mask - 1))
+			dn_rt_hash_mask--;
+		dn_rt_hash_table = (struct dn_rt_hash_bucket *)
+			__get_free_pages(GFP_ATOMIC, order);
+	} while (dn_rt_hash_table == NULL && --order >= 0);
+
+	if (!dn_rt_hash_table)
+		panic("Failed to allocate DECnet route cache hash table\n");
+
+	printk(KERN_INFO
+		"DECnet: Routing cache hash table of %u buckets, %ldKbytes\n",
+		dn_rt_hash_mask,
+		(long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024);
+
+	/* From here on dn_rt_hash_mask is a mask (bucket count - 1). */
+	dn_rt_hash_mask--;
+	for(i = 0; i <= dn_rt_hash_mask; i++) {
+		spin_lock_init(&dn_rt_hash_table[i].lock);
+		dn_rt_hash_table[i].chain = NULL;
+	}
+
+	dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
+
+	proc_net_fops_create(&init_net, "decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops);
+
+#ifdef CONFIG_DECNET_ROUTER
+	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, dn_fib_dump);
+#else
+	rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute,
+		      dn_cache_dump);
+#endif
+}
+
+/*
+ * Tear down the route cache: stop the garbage collection timer, flush the
+ * cache, remove the "decnet_cache" proc entry and release the dst entry
+ * accounting.
+ */
+void __exit dn_route_cleanup(void)
+{
+	/*
+	 * Use the synchronous variant so that a handler still running on
+	 * another CPU has finished before the cache is flushed and the
+	 * code goes away; plain del_timer() only removes a pending timer.
+	 */
+	del_timer_sync(&dn_route_timer);
+	dn_run_flush(0);
+
+	proc_net_remove(&init_net, "decnet_cache");
+	dst_entries_destroy(&dn_dst_ops);
+}
+
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
new file mode 100644
index 00000000..f0efb0cc
--- /dev/null
+++ b/net/decnet/dn_rules.c
@@ -0,0 +1,254 @@
+
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Routing Forwarding Information Base (Rules)
+ *
+ * Author:      Steve Whitehouse <SteveW@ACM.org>
+ *              Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c
+ *
+ *
+ * Changes:
+ *              Steve Whitehouse <steve@chygwyn.com>
+ *              Updated for Thomas Graf's generic rules
+ *
+ */
+#include <linux/net.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+static struct fib_rules_ops *dn_fib_rules_ops;
+
+/*
+ * DECnet policy routing rule.  The generic struct fib_rule is the first
+ * member, which is what lets the callbacks below cast a struct fib_rule
+ * pointer to a struct dn_fib_rule pointer.
+ */
+struct dn_fib_rule
+{
+	struct fib_rule		common;
+	unsigned char		dst_len;	/* destination prefix length, from the rule header */
+	unsigned char		src_len;	/* source prefix length, from the rule header */
+	__le16			src;		/* source address to match */
+	__le16			srcmask;	/* dnet_make_mask(src_len) */
+	__le16			dst;		/* destination address to match */
+	__le16			dstmask;	/* dnet_make_mask(dst_len) */
+	__le16			srcmap;		/* not referenced in this file */
+	u8			flags;		/* not referenced in this file */
+};
+
+
+/*
+ * Run the flow @flp through the DECnet routing rules.
+ *
+ * The lookup result is written to @res, and res->r is set to the rule
+ * that fib_rules_lookup() left in the lookup argument.  Returns the
+ * fib_rules_lookup() result.
+ */
+int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res)
+{
+	struct fib_lookup_arg arg = { .result = res };
+	struct flowi *fl = flowidn_to_flowi(flp);
+	int err = fib_rules_lookup(dn_fib_rules_ops, fl, 0, &arg);
+
+	res->r = arg.rule;
+	return err;
+}
+
+/*
+ * fib_rules ->action callback: apply @rule to the flow @flp.
+ *
+ * FR_ACT_TO_TBL looks the flow up in the rule's table and stores the
+ * result in arg->result.  The other actions map to fixed errors:
+ * -ENETUNREACH (unreachable), -EACCES (prohibit), and -EINVAL (blackhole
+ * or anything unknown).
+ *
+ * Returns -EAGAIN when the table does not exist or its lookup returned a
+ * positive value, otherwise the table lookup result.  @flags is unused.
+ */
+static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
+{
+	struct flowidn *fld = &flp->u.dn;
+	struct dn_fib_table *tbl;
+	int err;
+
+	switch(rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tbl = dn_fib_get_table(rule->table, 0);
+	if (tbl == NULL)
+		return -EAGAIN;
+
+	err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result);
+	return err > 0 ? -EAGAIN : err;
+}
+
+/* Netlink attribute policy for DECnet rules: only the generic FRA_* entries. */
+static const struct nla_policy dn_fib_rule_policy[FRA_MAX+1] = {
+	FRA_GENERIC_POLICY,
+};
+
+/*
+ * fib_rules ->match callback: return 1 when both the source and the
+ * destination address of the flow equal the rule's addresses under the
+ * rule's masks, 0 otherwise.  @flags is unused.
+ */
+static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+	struct flowidn *fld = &fl->u.dn;
+	__le16 src_diff = (fld->saddr ^ r->src) & r->srcmask;
+	__le16 dst_diff = (fld->daddr ^ r->dst) & r->dstmask;
+
+	/* Any bit left after masking means that address does not match. */
+	return !src_diff && !dst_diff;
+}
+
+/*
+ * fib_rules ->configure callback: fill in the DECnet specific part of a
+ * new rule from the request header @frh and attributes @tb.
+ *
+ * A non-zero tos is rejected with -EINVAL.  A "to table" rule without a
+ * table gets one from dn_fib_empty_table(), or fails with -ENOBUFS when
+ * none is available.  Returns 0 on success.  @skb is unused.
+ */
+static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+				 struct fib_rule_hdr *frh,
+				 struct nlattr **tb)
+{
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+	if (frh->tos)
+		return -EINVAL;
+
+	if (rule->table == RT_TABLE_UNSPEC && rule->action == FR_ACT_TO_TBL) {
+		struct dn_fib_table *table = dn_fib_empty_table();
+
+		if (table == NULL)
+			return -ENOBUFS;
+		rule->table = table->n;
+	}
+
+	/* The address attributes are read only when a prefix length is given. */
+	if (frh->src_len)
+		r->src = nla_get_le16(tb[FRA_SRC]);
+	if (frh->dst_len)
+		r->dst = nla_get_le16(tb[FRA_DST]);
+
+	r->src_len = frh->src_len;
+	r->dst_len = frh->dst_len;
+	r->srcmask = dnet_make_mask(r->src_len);
+	r->dstmask = dnet_make_mask(r->dst_len);
+
+	return 0;
+}
+
+/*
+ * fib_rules ->compare callback: return 1 when @rule agrees with the
+ * request described by @frh and @tb, 0 otherwise.  A direction whose
+ * prefix length in @frh is zero is not compared at all.
+ */
+static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
+{
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+	/* Prefix lengths are checked first, then the addresses themselves. */
+	if ((frh->src_len && r->src_len != frh->src_len) ||
+	    (frh->dst_len && r->dst_len != frh->dst_len) ||
+	    (frh->src_len && r->src != nla_get_le16(tb[FRA_SRC])) ||
+	    (frh->dst_len && r->dst != nla_get_le16(tb[FRA_DST])))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Classify @addr by looking it up in the RT_TABLE_LOCAL table.
+ *
+ * Returns the type of the matching entry, or RTN_UNICAST when the table
+ * does not exist or the lookup does not succeed.
+ */
+unsigned dnet_addr_type(__le16 addr)
+{
+	struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0);
+	struct flowidn fld = { .daddr = addr };
+	struct dn_fib_res res;
+	unsigned type;
+
+	res.r = NULL;
+
+	if (!tb || tb->lookup(tb, &fld, &res))
+		return RTN_UNICAST;
+
+	type = res.type;
+	dn_fib_res_put(&res);
+	return type;
+}
+
+/*
+ * fib_rules ->fill callback: write the DECnet specific part of @rule into
+ * the header @frh and append FRA_DST / FRA_SRC to @skb for each direction
+ * that has a non-zero prefix length.
+ *
+ * Returns 0 on success, -ENOBUFS via the nla_put_failure label.
+ */
+static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			    struct fib_rule_hdr *frh)
+{
+	const struct dn_fib_rule *dr = (struct dn_fib_rule *)rule;
+
+	frh->tos = 0;
+	frh->src_len = dr->src_len;
+	frh->dst_len = dr->dst_len;
+
+	if (dr->dst_len)
+		NLA_PUT_LE16(skb, FRA_DST, dr->dst);
+	if (dr->src_len)
+		NLA_PUT_LE16(skb, FRA_SRC, dr->src);
+
+	return 0;
+
+nla_put_failure:
+	return -ENOBUFS;
+}
+
+/*
+ * fib_rules ->flush_cache callback: flush the DECnet route cache by
+ * calling dn_rt_cache_flush(-1).  @ops is unused.
+ */
+static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
+{
+	dn_rt_cache_flush(-1);
+}
+
+/*
+ * Template handed to fib_rules_register() in dn_fib_rules_init(); it
+ * wires the callbacks defined above into the generic rules framework.
+ */
+static const struct fib_rules_ops __net_initdata dn_fib_rules_ops_template = {
+	.family		= AF_DECnet,
+	.rule_size	= sizeof(struct dn_fib_rule),
+	.addr_size	= sizeof(u16),	/* DECnet addresses are 16 bits wide */
+	.action		= dn_fib_rule_action,
+	.match		= dn_fib_rule_match,
+	.configure	= dn_fib_rule_configure,
+	.compare	= dn_fib_rule_compare,
+	.fill		= dn_fib_rule_fill,
+	.default_pref	= fib_default_rule_pref,
+	.flush_cache	= dn_fib_rule_flush_cache,
+	.nlgroup	= RTNLGRP_DECnet_RULE,
+	.policy		= dn_fib_rule_policy,
+	.owner		= THIS_MODULE,
+	.fro_net	= &init_net,
+};
+
+/*
+ * Register the DECnet rules operations for init_net and install the
+ * default rule (preference 0x7fff, table RT_TABLE_MAIN).  Either step
+ * failing triggers BUG_ON().
+ */
+void __init dn_fib_rules_init(void)
+{
+	int err;
+
+	dn_fib_rules_ops = fib_rules_register(&dn_fib_rules_ops_template,
+					      &init_net);
+	BUG_ON(IS_ERR(dn_fib_rules_ops));
+
+	err = fib_default_rule_add(dn_fib_rules_ops, 0x7fff, RT_TABLE_MAIN, 0);
+	BUG_ON(err);
+}
+
+/*
+ * Unregister the DECnet rules operations, then call rcu_barrier().
+ * NOTE(review): the barrier presumably waits for RCU callbacks queued
+ * during unregistration - confirm against fib_rules_unregister().
+ */
+void __exit dn_fib_rules_cleanup(void)
+{
+	fib_rules_unregister(dn_fib_rules_ops);
+	rcu_barrier();
+}
+
+
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
new file mode 100644
index 00000000..bd0a52dd
--- /dev/null
+++ b/net/decnet/dn_table.c
@@ -0,0 +1,908 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Routing Forwarding Information Base (Routing Tables)
+ *
+ * Author:      Steve Whitehouse <SteveW@ACM.org>
+ *              Mostly copied from the IPv4 routing code
+ *
+ *
+ * Changes:
+ *
+ */
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/route.h> /* RTF_xxx */
+#include <net/neighbour.h>
+#include <net/netlink.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/fib_rules.h>
+#include <net/dn.h>
+#include <net/dn_route.h>
+#include <net/dn_fib.h>
+#include <net/dn_neigh.h>
+#include <net/dn_dev.h>
+
+/*
+ * One zone of a routing table: all routes that share the prefix length
+ * dz_order, kept in a hash table of sorted chains.
+ */
+struct dn_zone
+{
+	struct dn_zone		*dz_next;	/* next zone on the table's zone list */
+	struct dn_fib_node 	**dz_hash;	/* array of dz_divisor chain heads */
+	int			dz_nent;	/* not referenced in the code visible here */
+	int			dz_divisor;	/* number of hash buckets */
+	u32			dz_hashmask;	/* mask applied to the hash in dn_hash() */
+#define DZ_HASHMASK(dz)	((dz)->dz_hashmask)
+	int			dz_order;	/* prefix length of this zone */
+	__le16			dz_mask;	/* dnet_make_mask(dz_order) */
+#define DZ_MASK(dz)	((dz)->dz_mask)
+};
+
+/* Per-table data: the zones, both indexed by prefix length and as a list. */
+struct dn_hash
+{
+	struct dn_zone	*dh_zones[17];	/* one slot per prefix length 0..16 */
+	struct dn_zone	*dh_zone_list;	/* zones chained through dz_next */
+};
+
+/* Reset a key to zero. */
+#define dz_key_0(key)		((key).datum = 0)
+
+/*
+ * Iterate over the next hops of a dn_fib_info.  Opens a block that
+ * declares "nhsel" (index) and "nh" (current next hop); the block must be
+ * closed with endfor_nexthops().
+ */
+#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
+	for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define endfor_nexthops(fi) }
+
+#define DN_MAX_DIVISOR 1024
+/* Bits in a node's fn_state. */
+#define DN_S_ZOMBIE 1
+#define DN_S_ACCESSED 2
+
+/* Walk a chain through a pointer-to-link so the caller can splice at fp. */
+#define DN_FIB_SCAN(f, fp) \
+for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
+
+/* As DN_FIB_SCAN, but stop at the first node whose key differs from key. */
+#define DN_FIB_SCAN_KEY(f, fp, key) \
+for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
+
+#define RT_TABLE_MIN 1
+#define DN_FIB_TABLE_HASHSZ 256
+/* Hash of routing tables; dn_fib_tables_lock also guards the zone updates below. */
+static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ];
+static DEFINE_RWLOCK(dn_fib_tables_lock);
+
+/* Slab cache for struct dn_fib_node (see dn_free_node()). */
+static struct kmem_cache *dn_hash_kmem __read_mostly;
+static int dn_fib_hash_zombies;
+
+/*
+ * Hash a key into a bucket index for zone @dz.
+ *
+ * The key is converted to host order, reduced to its top dz_order bits,
+ * folded with two xor-shifts and masked with the zone's hash mask.
+ *
+ * The index is built through its datum member rather than by casting the
+ * address of a u16 to dn_fib_idx_t *, which relied on the two types
+ * having identical layout and broke C's aliasing rules.
+ */
+static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
+{
+	dn_fib_idx_t idx;
+	u16 h = le16_to_cpu(key.datum)>>(16 - dz->dz_order);
+
+	h ^= (h >> 10);
+	h ^= (h >> 6);
+	h &= DZ_HASHMASK(dz);
+
+	idx.datum = h;
+	return idx;
+}
+
+/* Build the lookup key for @dst in zone @dz: the address under the zone mask. */
+static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz)
+{
+	dn_fib_key_t key = { .datum = dst & DZ_MASK(dz) };
+
+	return key;
+}
+
+/* Return the address of the chain head that @key hashes to in zone @dz. */
+static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz)
+{
+	dn_fib_idx_t idx = dn_hash(key, dz);
+
+	return &dz->dz_hash[idx.datum];
+}
+
+/* Return the first node of the chain that @key hashes to in zone @dz. */
+static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz)
+{
+	dn_fib_idx_t idx = dn_hash(key, dz);
+
+	return dz->dz_hash[idx.datum];
+}
+
+/* Return non-zero when the two keys hold the same datum. */
+static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b)
+{
+	return a.datum == b.datum;
+}
+
+/*
+ * Return non-zero when key @a sorts before or equal to key @b; used to
+ * keep the hash chains ordered.
+ * NOTE(review): datum is filled from a __le16 in dz_key(), so this
+ * compares the stored values as they are, not host-order numbers -
+ * confirm that only a consistent ordering is required.
+ */
+static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b)
+{
+	return a.datum <= b.datum;
+}
+
+/*
+ * Move every node from the old hash table @old_ht (of @old_divisor
+ * buckets) into the zone's current hash table, keeping each new chain
+ * sorted by key.
+ */
+static inline void dn_rebuild_zone(struct dn_zone *dz,
+				   struct dn_fib_node **old_ht,
+				   int old_divisor)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < old_divisor; bucket++) {
+		struct dn_fib_node *node = old_ht[bucket];
+
+		while (node) {
+			/* Remember the successor: fn_next is rewritten below. */
+			struct dn_fib_node *following = node->fn_next;
+			struct dn_fib_node **slot = dn_chain_p(node->fn_key, dz);
+
+			/* Skip past every entry whose key is <= ours. */
+			while (*slot && dn_key_leq((*slot)->fn_key, node->fn_key))
+				slot = &(*slot)->fn_next;
+
+			node->fn_next = *slot;
+			*slot = node;
+			node = following;
+		}
+	}
+}
+
+/*
+ * Grow the hash table of zone @dz: 16 buckets become 256, 256 become
+ * 1024.  Any other current size is logged as a bug and also grown to
+ * 1024.  The zone is left unchanged if the new table cannot be
+ * allocated.
+ *
+ * The table switch and the re-insertion of all nodes happen under
+ * dn_fib_tables_lock held for writing.
+ */
+static void dn_rehash_zone(struct dn_zone *dz)
+{
+	int old_divisor = dz->dz_divisor;
+	struct dn_fib_node **new_ht, **old_ht;
+	int new_divisor;
+	u32 new_hashmask;
+
+	if (old_divisor == 16) {
+		new_divisor = 256;
+		new_hashmask = 0xFF;
+	} else {
+		if (old_divisor != 256)
+			printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n", old_divisor);
+		new_divisor = 1024;
+		new_hashmask = 0x3FF;
+	}
+
+	new_ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL);
+	if (new_ht == NULL)
+		return;
+
+	write_lock_bh(&dn_fib_tables_lock);
+	old_ht = dz->dz_hash;
+	dz->dz_hash = new_ht;
+	dz->dz_hashmask = new_hashmask;
+	dz->dz_divisor = new_divisor;
+	dn_rebuild_zone(dz, old_ht, old_divisor);
+	write_unlock_bh(&dn_fib_tables_lock);
+
+	kfree(old_ht);
+}
+
+/*
+ * Free a FIB node: release its dn_fib_info via dn_fib_release_info(),
+ * then return the node to the dn_hash_kmem slab cache.
+ */
+static void dn_free_node(struct dn_fib_node *f)
+{
+	dn_fib_release_info(DN_FIB_INFO(f));
+	kmem_cache_free(dn_hash_kmem, f);
+}
+
+
+/*
+ * Create the zone for prefix length @z in @table and link it in.
+ *
+ * Zone 0 gets a single bucket, every other zone starts with 16.  The new
+ * zone is inserted directly after the existing zone with the next longer
+ * prefix, or at the head of the zone list when there is none.
+ *
+ * Returns the new zone, or NULL if memory could not be allocated.
+ */
+static struct dn_zone *dn_new_zone(struct dn_hash *table, int z)
+{
+	struct dn_zone *dz;
+	int i;
+
+	dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL);
+	if (!dz)
+		return NULL;
+
+	dz->dz_divisor = z ? 16 : 1;
+	dz->dz_hashmask = z ? 0x0F : 0;
+
+	dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL);
+	if (!dz->dz_hash) {
+		kfree(dz);
+		return NULL;
+	}
+
+	dz->dz_order = z;
+	dz->dz_mask = dnet_make_mask(z);
+
+	/* Find the closest existing zone with a longer prefix, if any. */
+	i = z + 1;
+	while (i <= 16 && !table->dh_zones[i])
+		i++;
+
+	write_lock_bh(&dn_fib_tables_lock);
+	if (i <= 16) {
+		dz->dz_next = table->dh_zones[i]->dz_next;
+		table->dh_zones[i]->dz_next = dz;
+	} else {
+		dz->dz_next = table->dh_zone_list;
+		table->dh_zone_list = dz;
+	}
+	table->dh_zones[z] = dz;
+	write_unlock_bh(&dn_fib_tables_lock);
+
+	return dz;
+}
+
+
+/*
+ * Compare the next hop description of a netlink request (@rta) with the
+ * next hops of @fi.
+ *
+ * Returns 0 when they match, 1 when they differ, and -EINVAL when the
+ * multipath attribute is too short for the next hops it should describe.
+ * @r and @nlh are not used.
+ */
+static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern_rta *rta, struct dn_fib_info *fi)
+{
+	struct rtnexthop *nhp;
+	int nhlen;
+
+	if (rta->rta_priority && *rta->rta_priority != fi->fib_priority)
+		return 1;
+
+	/* Single next hop given directly: compare with the first fib_nh only. */
+	if (rta->rta_oif || rta->rta_gw) {
+		if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
+		    (!rta->rta_gw  || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 2) == 0))
+			return 0;
+		return 1;
+	}
+
+	/* No next hop information in the request: nothing left to compare. */
+	if (rta->rta_mp == NULL)
+		return 0;
+
+	nhp = RTA_DATA(rta->rta_mp);
+	nhlen = RTA_PAYLOAD(rta->rta_mp);
+
+	/* Walk the request's rtnexthop records in step with fi's next hops. */
+	for_nexthops(fi) {
+		int attrlen = nhlen - sizeof(struct rtnexthop);
+		__le16 gw;
+
+		/* Note: the second test also consumes this record from nhlen. */
+		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+			return -EINVAL;
+		if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+			return 1;
+		if (attrlen) {
+			gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+
+			if (gw && gw != nh->nh_gw)
+				return 1;
+		}
+		nhp = RTNH_NEXT(nhp);
+	} endfor_nexthops(fi);
+
+	return 0;
+}
+
+/*
+ * Upper bound on the payload needed by dn_fib_dump_info() for @fi: the
+ * rtmsg header, the RTA_TABLE, RTA_DST and RTA_PRIORITY attributes, a
+ * nested metrics attribute and, when there are next hops, a nested
+ * attribute with one rtnexthop plus gateway per next hop.
+ */
+static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
+{
+	size_t nhsize;
+	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+			 + nla_total_size(4) /* RTA_TABLE */
+			 + nla_total_size(2) /* RTA_DST */
+			 + nla_total_size(4); /* RTA_PRIORITY */
+
+	/* space for nested metrics */
+	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
+
+	if (!fi->fib_nhs)
+		return payload;
+
+	/*
+	 * Also handles the special case fib_nhs == 1.  Each nexthop is
+	 * packed in an attribute and may contain a gateway attribute.
+	 */
+	nhsize = nla_total_size(sizeof(struct rtnexthop));
+	nhsize += nla_total_size(4);
+
+	/* all nexthops are packed in a nested attribute */
+	payload += nla_total_size(fi->fib_nhs * nhsize);
+
+	return payload;
+}
+
+/*
+ * Append an rtnetlink message describing one FIB entry to @skb.
+ *
+ * @pid, @seq, @event and @flags go into the netlink header; @tb_id,
+ * @type, @scope, @dst/@dst_len and @fi describe the route.  @dst is only
+ * emitted when @dst_len is non-zero.
+ *
+ * Returns skb->len on success.  On failure the skb is trimmed back to
+ * where its tail was on entry and -EMSGSIZE is returned.
+ */
+static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+			u32 tb_id, u8 type, u8 scope, void *dst, int dst_len,
+			struct dn_fib_info *fi, unsigned int flags)
+{
+	struct rtmsg *rtm;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
+
+	/*
+	 * NOTE(review): NLMSG_NEW() and RTA_PUT*() presumably jump to the
+	 * nlmsg_failure / rtattr_failure labels when the skb is full; no
+	 * explicit goto references nlmsg_failure in this function.
+	 */
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
+	rtm = NLMSG_DATA(nlh);
+	rtm->rtm_family = AF_DECnet;
+	rtm->rtm_dst_len = dst_len;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = 0;
+	rtm->rtm_table = tb_id;
+	RTA_PUT_U32(skb, RTA_TABLE, tb_id);
+	rtm->rtm_flags = fi->fib_flags;
+	rtm->rtm_scope = scope;
+	rtm->rtm_type  = type;
+	if (rtm->rtm_dst_len)
+		RTA_PUT(skb, RTA_DST, 2, dst);
+	rtm->rtm_protocol = fi->fib_protocol;
+	if (fi->fib_priority)
+		RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
+	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+		goto rtattr_failure;
+	/* A single next hop is reported with plain gateway/oif attributes. */
+	if (fi->fib_nhs == 1) {
+		if (fi->fib_nh->nh_gw)
+			RTA_PUT(skb, RTA_GATEWAY, 2, &fi->fib_nh->nh_gw);
+		if (fi->fib_nh->nh_oif)
+			RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
+	}
+	/* Several next hops are packed into one RTA_MULTIPATH attribute. */
+	if (fi->fib_nhs > 1) {
+		struct rtnexthop *nhp;
+		struct rtattr *mp_head;
+		if (skb_tailroom(skb) <= RTA_SPACE(0))
+			goto rtattr_failure;
+		/* Reserve the attribute header; type and length are set after the loop. */
+		mp_head = (struct rtattr *)skb_put(skb, RTA_SPACE(0));
+
+		for_nexthops(fi) {
+			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+				goto rtattr_failure;
+			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+			nhp->rtnh_flags = nh->nh_flags & 0xFF;
+			nhp->rtnh_hops = nh->nh_weight - 1;
+			nhp->rtnh_ifindex = nh->nh_oif;
+			if (nh->nh_gw)
+				RTA_PUT(skb, RTA_GATEWAY, 2, &nh->nh_gw);
+			/* Record length covers the header plus any gateway attribute. */
+			nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp;
+		} endfor_nexthops(fi);
+		mp_head->rta_type = RTA_MULTIPATH;
+		mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
+	}
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+
+nlmsg_failure:
+rtattr_failure:
+	/* Undo the partially built message. */
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+
+/*
+ * Notify RTNLGRP_DECnet_ROUTE listeners about @event on FIB node @f
+ * (prefix length @z, table @tb_id).
+ *
+ * If the message cannot be allocated or built, the error is reported to
+ * the group with rtnl_set_sk_err() instead.
+ */
+static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
+			struct nlmsghdr *nlh, struct netlink_skb_parms *req)
+{
+	u32 pid = req ? req->pid : 0;
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL);
+	if (skb == NULL) {
+		err = -ENOBUFS;
+		goto report;
+	}
+
+	err = dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id,
+			       f->fn_type, f->fn_scope, &f->fn_key, z,
+			       DN_FIB_INFO(f), 0);
+	if (err >= 0) {
+		rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in dn_fib_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+report:
+	/* err is negative on both paths that reach this label. */
+	rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err);
+}
+
+/*
+ * Dump one hash chain starting at f into skb.  cb->args[4] holds the
+ * index of the first node still to be dumped; zombie nodes are skipped.
+ * Returns -1 (with the resume index saved) when a node could not be
+ * written, otherwise skb->len.
+ */
+static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
+				struct netlink_callback *cb,
+				struct dn_fib_table *tb,
+				struct dn_zone *dz,
+				struct dn_fib_node *f)
+{
+	int resume = cb->args[4];
+	int idx = 0;
+
+	while (f) {
+		if (idx >= resume && !(f->fn_state & DN_S_ZOMBIE)) {
+			if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+					cb->nlh->nlmsg_seq,
+					RTM_NEWROUTE,
+					tb->n,
+					(f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
+					f->fn_scope, &f->fn_key, dz->dz_order,
+					f->fn_info, NLM_F_MULTI) < 0) {
+				cb->args[4] = idx;
+				return -1;
+			}
+		}
+		idx++;
+		f = f->fn_next;
+	}
+
+	cb->args[4] = idx;
+	return skb->len;
+}
+
+/*
+ * Dump every hash bucket of zone dz.  cb->args[3] is the bucket to
+ * resume from; deeper resume state (args[4..]) is cleared as soon as
+ * we move past that bucket.  Returns -1 if the skb filled up.
+ */
+static __inline__ int dn_hash_dump_zone(struct sk_buff *skb,
+				struct netlink_callback *cb,
+				struct dn_fib_table *tb,
+				struct dn_zone *dz)
+{
+	int resume = cb->args[3];
+	int bucket;
+
+	for (bucket = 0; bucket < dz->dz_divisor; bucket++) {
+		if (bucket < resume)
+			continue;
+		if (bucket != resume)
+			memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0]));
+		if (!dz->dz_hash || !dz->dz_hash[bucket])
+			continue;
+		if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[bucket]) >= 0)
+			continue;
+
+		cb->args[3] = bucket;
+		return -1;
+	}
+
+	cb->args[3] = bucket;
+	return skb->len;
+}
+
+/*
+ * Dump all zones of table tb under the read side of dn_fib_tables_lock.
+ * cb->args[2] is the zone index to resume from.  Returns -1 if the skb
+ * filled up, otherwise skb->len.
+ */
+static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	struct dn_hash *table = (struct dn_hash *)tb->data;
+	struct dn_zone *dz;
+	int resume = cb->args[2];
+	int idx = 0;
+	int rc = -1;
+
+	read_lock(&dn_fib_tables_lock);
+	dz = table->dh_zone_list;
+	while (dz) {
+		if (idx >= resume) {
+			/* Past the resume zone: forget deeper resume state */
+			if (idx > resume)
+				memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
+
+			if (dn_hash_dump_zone(skb, cb, tb, dz) < 0)
+				goto unlock;
+		}
+		dz = dz->dz_next;
+		idx++;
+	}
+	rc = skb->len;
+unlock:
+	cb->args[2] = idx;
+	read_unlock(&dn_fib_tables_lock);
+
+	return rc;
+}
+
+/*
+ * Netlink dump callback for the DECnet FIB.
+ *
+ * Only init_net is served; other namespaces get an empty dump.  If the
+ * request carries a struct rtmsg with RTM_F_CLONED set, the dump is
+ * delegated to dn_cache_dump().
+ *
+ * Resume state lives in cb->args[]: args[0] is the table hash bucket,
+ * args[1] the index of the table within that bucket, and args[2..] are
+ * owned by the per-table dump callback.
+ *
+ * Returns skb->len (or whatever dn_cache_dump() returns).
+ */
+int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
+	struct dn_fib_table *tb;
+	struct hlist_node *node;
+	int dumped = 0;
+
+	if (!net_eq(net, &init_net))
+		return 0;
+
+	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+		((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+			return dn_cache_dump(skb, cb);
+
+	s_h = cb->args[0];
+	s_e = cb->args[1];
+
+	/*
+	 * s_e is only meaningful for the bucket we resume in (s_h), so it
+	 * is reset once that bucket has been walked; otherwise tables in
+	 * later buckets whose index is below s_e would be skipped.
+	 */
+	for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_e = 0) {
+		e = 0;
+		hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) {
+			if (e < s_e)
+				goto next;
+			/* A new table starts from scratch: clear its resume state */
+			if (dumped)
+				memset(&cb->args[2], 0, sizeof(cb->args) -
+						 2 * sizeof(cb->args[0]));
+			if (tb->dump(tb, skb, cb) < 0)
+				goto out;
+			dumped = 1;
+next:
+			e++;
+		}
+	}
+out:
+	cb->args[1] = e;
+	cb->args[0] = h;
+
+	return skb->len;
+}
+
+/*
+ * Insert (or replace) a route in table tb.
+ *
+ * r/rta describe the route, n is the originating netlink header (its
+ * NLM_F_EXCL/REPLACE/APPEND/CREATE flags select the behaviour) and req
+ * is handed on to the notification helper.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for a prefix length
+ * above 16 or a destination with bits outside the zone mask, -ENOBUFS
+ * on allocation failure, -EEXIST / -ENOENT according to the netlink
+ * flags, or the error reported by dn_fib_create_info().
+ */
+static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+	struct dn_hash *table = (struct dn_hash *)tb->data;
+	struct dn_fib_node *new_f, *f, **fp, **del_fp;
+	struct dn_zone *dz;
+	struct dn_fib_info *fi;
+	int z = r->rtm_dst_len;
+	int type = r->rtm_type;
+	dn_fib_key_t key;
+	int err;
+
+	if (z > 16)
+		return -EINVAL;
+
+	/* Find the zone for this prefix length, creating it on first use */
+	dz = table->dh_zones[z];
+	if (!dz && !(dz = dn_new_zone(table, z)))
+		return -ENOBUFS;
+
+	dz_key_0(key);
+	if (rta->rta_dst) {
+		__le16 dst;
+		memcpy(&dst, rta->rta_dst, 2);
+		if (dst & ~DZ_MASK(dz))
+			return -EINVAL;
+		key = dz_key(dst, dz);
+	}
+
+	/* From here on every failure must drop fi via the "out" label */
+	if ((fi = dn_fib_create_info(r, rta, n, &err)) == NULL)
+		return err;
+
+	/*
+	 * NOTE(review): the middle test only holds once dz_divisor exceeds
+	 * DN_MAX_DIVISOR, which looks inverted for a "grow the hash" check;
+	 * confirm against dn_rehash_zone() before changing.
+	 */
+	if (dz->dz_nent > (dz->dz_divisor << 2) &&
+			dz->dz_divisor > DN_MAX_DIVISOR &&
+			(z==16 || (1<<z) > dz->dz_divisor))
+		dn_rehash_zone(dz);
+
+	fp = dn_chain_p(key, dz);
+
+	/* Advance to the first node whose key is not below ours */
+	DN_FIB_SCAN(f, fp) {
+		if (dn_key_leq(key, f->fn_key))
+			break;
+	}
+
+	del_fp = NULL;
+
+	/* A zombie with the same key is replaced by the new node */
+	if (f && (f->fn_state & DN_S_ZOMBIE) &&
+			dn_key_eq(f->fn_key, key)) {
+		del_fp = fp;
+		fp = &f->fn_next;
+		f = *fp;
+		goto create;
+	}
+
+	/* Among equal keys, stop at the first entry with priority >= ours */
+	DN_FIB_SCAN_KEY(f, fp, key) {
+		if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority)
+			break;
+	}
+
+	if (f && dn_key_eq(f->fn_key, key) &&
+			fi->fib_priority == DN_FIB_INFO(f)->fib_priority) {
+		struct dn_fib_node **ins_fp;
+
+		err = -EEXIST;
+		if (n->nlmsg_flags & NLM_F_EXCL)
+			goto out;
+
+		if (n->nlmsg_flags & NLM_F_REPLACE) {
+			/* Link the new node after the old one, then unlink the old */
+			del_fp = fp;
+			fp = &f->fn_next;
+			f = *fp;
+			goto replace;
+		}
+
+		ins_fp = fp;
+		err = -EEXIST;
+
+		/* Refuse an exact duplicate (same type, scope and fib info) */
+		DN_FIB_SCAN_KEY(f, fp, key) {
+			if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority)
+				break;
+			if (f->fn_type == type &&
+			    f->fn_scope == r->rtm_scope &&
+			    DN_FIB_INFO(f) == fi)
+				goto out;
+		}
+
+		/* Without NLM_F_APPEND insert before the equal-priority run */
+		if (!(n->nlmsg_flags & NLM_F_APPEND)) {
+			fp = ins_fp;
+			f = *fp;
+		}
+	}
+
+create:
+	err = -ENOENT;
+	if (!(n->nlmsg_flags & NLM_F_CREATE))
+		goto out;
+
+replace:
+	err = -ENOBUFS;
+	new_f = kmem_cache_zalloc(dn_hash_kmem, GFP_KERNEL);
+	if (new_f == NULL)
+		goto out;
+
+	new_f->fn_key = key;
+	new_f->fn_type = type;
+	new_f->fn_scope = r->rtm_scope;
+	DN_FIB_INFO(new_f) = fi;
+
+	/* Only the pointer store that publishes the node is done under the lock */
+	new_f->fn_next = f;
+	write_lock_bh(&dn_fib_tables_lock);
+	*fp = new_f;
+	write_unlock_bh(&dn_fib_tables_lock);
+	dz->dz_nent++;
+
+	if (del_fp) {
+		f = *del_fp;
+		write_lock_bh(&dn_fib_tables_lock);
+		*del_fp = f->fn_next;
+		write_unlock_bh(&dn_fib_tables_lock);
+
+		/* Zombies were already announced as deleted */
+		if (!(f->fn_state & DN_S_ZOMBIE))
+			dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
+		if (f->fn_state & DN_S_ACCESSED)
+			dn_rt_cache_flush(-1);
+		dn_free_node(f);
+		dz->dz_nent--;
+	} else {
+		dn_rt_cache_flush(-1);
+	}
+
+	dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req);
+
+	return 0;
+out:
+	dn_fib_release_info(fi);
+	return err;
+}
+
+
+/*
+ * Delete the first route in table tb that matches r/rta.
+ *
+ * Returns 0 on success, -EINVAL for a prefix length above 16 or a
+ * destination with bits outside the zone mask, and -ESRCH when no
+ * matching live route exists.
+ */
+static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+	struct dn_hash *table = (struct dn_hash*)tb->data;
+	struct dn_fib_node **fp, **del_fp, *f;
+	int z = r->rtm_dst_len;
+	struct dn_zone *dz;
+	dn_fib_key_t key;
+	int matched;
+
+
+	if (z > 16)
+		return -EINVAL;
+
+	if ((dz = table->dh_zones[z]) == NULL)
+		return -ESRCH;
+
+	dz_key_0(key);
+	if (rta->rta_dst) {
+		__le16 dst;
+		memcpy(&dst, rta->rta_dst, 2);
+		if (dst & ~DZ_MASK(dz))
+			return -EINVAL;
+		key = dz_key(dst, dz);
+	}
+
+	fp = dn_chain_p(key, dz);
+
+	/* Find the first node with our key; a larger key means it is absent */
+	DN_FIB_SCAN(f, fp) {
+		if (dn_key_eq(f->fn_key, key))
+			break;
+		if (dn_key_leq(key, f->fn_key))
+			return -ESRCH;
+	}
+
+	/*
+	 * Walk all nodes sharing the key: count them and remember the
+	 * first one that matches the type/scope/protocol/nexthop filters.
+	 */
+	matched = 0;
+	del_fp = NULL;
+	DN_FIB_SCAN_KEY(f, fp, key) {
+		struct dn_fib_info *fi = DN_FIB_INFO(f);
+
+		if (f->fn_state & DN_S_ZOMBIE)
+			return -ESRCH;
+
+		matched++;
+
+		if (del_fp == NULL &&
+				(!r->rtm_type || f->fn_type == r->rtm_type) &&
+				(r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
+				(!r->rtm_protocol ||
+					fi->fib_protocol == r->rtm_protocol) &&
+				dn_fib_nh_match(r, n, rta, fi) == 0)
+			del_fp = fp;
+	}
+
+	if (del_fp) {
+		f = *del_fp;
+		dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
+
+		if (matched != 1) {
+			/* Other nodes share the key: unlink and free right away */
+			write_lock_bh(&dn_fib_tables_lock);
+			*del_fp = f->fn_next;
+			write_unlock_bh(&dn_fib_tables_lock);
+
+			if (f->fn_state & DN_S_ACCESSED)
+				dn_rt_cache_flush(-1);
+			dn_free_node(f);
+			dz->dz_nent--;
+		} else {
+			/*
+			 * Sole node for this key: leave it linked as a zombie;
+			 * a flush is forced once more than 128 have piled up.
+			 */
+			f->fn_state |= DN_S_ZOMBIE;
+			if (f->fn_state & DN_S_ACCESSED) {
+				f->fn_state &= ~DN_S_ACCESSED;
+				dn_rt_cache_flush(-1);
+			}
+			if (++dn_fib_hash_zombies > 128)
+				dn_fib_flush();
+		}
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+/*
+ * Unlink and free every node on the chain at *fp that is a zombie or
+ * whose fib info is flagged RTNH_F_DEAD.  Nodes without fib info are
+ * left alone.  Returns the number of nodes removed.
+ */
+static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table)
+{
+	struct dn_fib_node *node;
+	int removed = 0;
+
+	for (node = *fp; node != NULL; node = *fp) {
+		struct dn_fib_info *info = DN_FIB_INFO(node);
+		int stale = info && ((node->fn_state & DN_S_ZOMBIE) ||
+				     (info->fib_flags & RTNH_F_DEAD));
+
+		if (!stale) {
+			fp = &node->fn_next;
+			continue;
+		}
+
+		write_lock_bh(&dn_fib_tables_lock);
+		*fp = node->fn_next;
+		write_unlock_bh(&dn_fib_tables_lock);
+
+		dn_free_node(node);
+		removed++;
+	}
+
+	return removed;
+}
+
+/*
+ * Purge stale nodes from every zone of tb and reset the global zombie
+ * counter.  Returns the total number of nodes removed.
+ */
+static int dn_fib_table_flush(struct dn_fib_table *tb)
+{
+	struct dn_hash *table = (struct dn_hash *)tb->data;
+	struct dn_zone *dz = table->dh_zone_list;
+	int total = 0;
+
+	dn_fib_hash_zombies = 0;
+	while (dz) {
+		int bucket = dz->dz_divisor;
+		int zone_removed = 0;
+
+		/* Walk the buckets from the last one down to bucket 0 */
+		while (bucket-- > 0)
+			zone_removed += dn_flush_list(&dz->dz_hash[bucket], dz->dz_order, table);
+
+		dz->dz_nent -= zone_removed;
+		total += zone_removed;
+		dz = dz->dz_next;
+	}
+
+	return total;
+}
+
+/*
+ * Look up flp->daddr in table tb, walking the zones in dh_zone_list
+ * order under the read side of dn_fib_tables_lock.
+ *
+ * Returns 0 with res filled in on a match, a negative value passed
+ * through from dn_fib_semantic_match(), or 1 if nothing matched.
+ */
+static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res)
+{
+	int err;
+	struct dn_zone *dz;
+	struct dn_hash *t = (struct dn_hash *)tb->data;
+
+	read_lock(&dn_fib_tables_lock);
+	for(dz = t->dh_zone_list; dz; dz = dz->dz_next) {
+		struct dn_fib_node *f;
+		dn_fib_key_t k = dz_key(flp->daddr, dz);
+
+		for(f = dz_chain(k, dz); f; f = f->fn_next) {
+			/* Skip smaller keys; a larger key ends this chain's search */
+			if (!dn_key_eq(k, f->fn_key)) {
+				if (dn_key_leq(k, f->fn_key))
+					break;
+				else
+					continue;
+			}
+
+			/* Marked even for zombies and scope mismatches */
+			f->fn_state |= DN_S_ACCESSED;
+
+			if (f->fn_state&DN_S_ZOMBIE)
+				continue;
+
+			if (f->fn_scope < flp->flowidn_scope)
+				continue;
+
+			err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res);
+
+			if (err == 0) {
+				res->type = f->fn_type;
+				res->scope = f->fn_scope;
+				res->prefixlen = dz->dz_order;
+				goto out;
+			}
+			/* Positive results fall through and try the next node */
+			if (err < 0)
+				goto out;
+		}
+	}
+	err = 1;
+out:
+	read_unlock(&dn_fib_tables_lock);
+	return err;
+}
+
+
+/*
+ * Find routing table n, optionally creating it.
+ *
+ * @n:      table id; must lie within RT_TABLE_MIN..RT_TABLE_MAX
+ * @create: when non-zero, allocate and register the table if missing
+ *
+ * Returns the table, or NULL if n is out of range, the table does not
+ * exist and @create is 0, the call is made from interrupt context, or
+ * the allocation fails.
+ */
+struct dn_fib_table *dn_fib_get_table(u32 n, int create)
+{
+	struct dn_fib_table *t;
+	struct hlist_node *node;
+	unsigned int h;
+
+	if (n < RT_TABLE_MIN)
+		return NULL;
+
+	if (n > RT_TABLE_MAX)
+		return NULL;
+
+	h = n & (DN_FIB_TABLE_HASHSZ - 1);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(t, node, &dn_fib_table_hash[h], hlist) {
+		if (t->n == n) {
+			rcu_read_unlock();
+			return t;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!create)
+		return NULL;
+
+	/*
+	 * The allocation below uses GFP_KERNEL, so creation must always be
+	 * refused in interrupt context; only the log message is rate
+	 * limited, not the refusal itself.
+	 */
+	if (in_interrupt()) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "DECnet: BUG! Attempt to create routing table from interrupt\n");
+		return NULL;
+	}
+
+	/* Table header and its struct dn_hash share one zeroed allocation */
+	t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash),
+		    GFP_KERNEL);
+	if (t == NULL)
+		return NULL;
+
+	t->n = n;
+	t->insert = dn_fib_table_insert;
+	t->delete = dn_fib_table_delete;
+	t->lookup = dn_fib_table_lookup;
+	t->flush  = dn_fib_table_flush;
+	t->dump = dn_fib_table_dump;
+	hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]);
+
+	return t;
+}
+
+/*
+ * Create and return a table using the lowest table id that is not yet
+ * in use, or NULL if every id is taken or creation fails.
+ */
+struct dn_fib_table *dn_fib_empty_table(void)
+{
+	u32 id = RT_TABLE_MIN;
+
+	while (id <= RT_TABLE_MAX) {
+		if (!dn_fib_get_table(id, 0))
+			return dn_fib_get_table(id, 1);
+		id++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Run the flush method of every routing table and, if any of them
+ * removed something, flush the route cache as well.
+ */
+void dn_fib_flush(void)
+{
+	struct dn_fib_table *tb;
+	struct hlist_node *node;
+	unsigned int h = 0;
+	int flushed = 0;
+
+	while (h < DN_FIB_TABLE_HASHSZ) {
+		hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist)
+			flushed += tb->flush(tb);
+		h++;
+	}
+
+	if (!flushed)
+		return;
+
+	dn_rt_cache_flush(-1);
+}
+
+/*
+ * Create the slab cache behind dn_hash_kmem.
+ *
+ * NOTE(review): the cache is named and sized for struct dn_fib_info,
+ * yet dn_fib_table_insert() allocates struct dn_fib_node objects from
+ * it; confirm the size is intended.  The result of kmem_cache_create()
+ * is not checked here.
+ */
+void __init dn_fib_table_init(void)
+{
+	dn_hash_kmem = kmem_cache_create("dn_fib_info_cache",
+					sizeof(struct dn_fib_info),
+					0, SLAB_HWCACHE_ALIGN,
+					NULL);
+}
+
+/*
+ * Unlink and kfree() every routing table while holding the write side
+ * of dn_fib_tables_lock.
+ *
+ * NOTE(review): only the table structure itself is freed here; whether
+ * zones and nodes are released beforehand is not visible in this
+ * function -- confirm against the callers.
+ */
+void __exit dn_fib_table_cleanup(void)
+{
+	struct dn_fib_table *t;
+	struct hlist_node *node, *next;
+	unsigned int h;
+
+	write_lock(&dn_fib_tables_lock);
+	for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
+		/* _safe variant: the current entry is deleted while iterating */
+		hlist_for_each_entry_safe(t, node, next, &dn_fib_table_hash[h],
+					  hlist) {
+			hlist_del(&t->hlist);
+			kfree(t);
+		}
+	}
+	write_unlock(&dn_fib_tables_lock);
+}
diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c
new file mode 100644
index 00000000..09825711
--- /dev/null
+++ b/net/decnet/dn_timer.c
@@ -0,0 +1,109 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Socket Timer Functions
+ *
+ * Author:      Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ *       Steve Whitehouse      : Made keepalive timer part of the same
+ *                               timer idea.
+ *       Steve Whitehouse      : Added checks for sk->sock_readers
+ *       David S. Miller       : New socket locking
+ *       Steve Whitehouse      : Timer grabs socket ref.
+ */
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <asm/atomic.h>
+#include <net/flow.h>
+#include <net/dn.h>
+
+/*
+ * Slow timer is for everything else (n * 500mS)
+ */
+
+#define SLOW_INTERVAL (HZ/2)
+
+static void dn_slow_timer(unsigned long arg);
+
+/*
+ * Arm the socket's timer so that dn_slow_timer() runs for sk one
+ * SLOW_INTERVAL from now.
+ */
+void dn_start_slow_timer(struct sock *sk)
+{
+	struct timer_list *timer = &sk->sk_timer;
+
+	timer->data	= (unsigned long)sk;
+	timer->function	= dn_slow_timer;
+	timer->expires	= jiffies + SLOW_INTERVAL;
+
+	add_timer(timer);
+}
+
+/* Cancel the socket's slow timer via del_timer(). */
+void dn_stop_slow_timer(struct sock *sk)
+{
+	del_timer(&sk->sk_timer);
+}
+
+/*
+ * Timer callback for the per-socket slow timer; arg is the struct sock.
+ * Runs the persist and keepalive handlers and re-arms itself unless the
+ * persist handler asked for the timer to stay off.
+ */
+static void dn_slow_timer(unsigned long arg)
+{
+	struct sock *sk = (struct sock *)arg;
+	struct dn_scp *scp = DN_SK(sk);
+
+	sock_hold(sk);
+	bh_lock_sock(sk);
+
+	/* Socket is busy in process context: retry in HZ / 10 jiffies */
+	if (sock_owned_by_user(sk)) {
+		sk->sk_timer.expires = jiffies + HZ / 10;
+		add_timer(&sk->sk_timer);
+		goto out;
+	}
+
+	/*
+	 * The persist timer is the standard slow timer used for retransmits
+	 * in both connection establishment and disconnection as well as
+	 * in the RUN state. The different states are catered for by changing
+	 * the function pointer in the socket. Setting the timer to a value
+	 * of zero turns it off. We allow the persist_fxn to turn the
+	 * timer off in a permanent way by returning non-zero, so that
+	 * timer based routines may remove sockets. This is why we have a
+	 * sock_hold()/sock_put() around the timer to prevent the socket
+	 * going away in the middle.
+	 */
+	if (scp->persist && scp->persist_fxn) {
+		if (scp->persist <= SLOW_INTERVAL) {
+			scp->persist = 0;
+
+			/* Non-zero return: leave without re-arming the timer */
+			if (scp->persist_fxn(sk))
+				goto out;
+		} else {
+			scp->persist -= SLOW_INTERVAL;
+		}
+	}
+
+	/*
+	 * Check for keepalive timeout. After the other timer 'cos if
+	 * the previous timer caused a retransmit, we don't need to
+	 * do this. scp->stamp is the last time that we sent a packet.
+	 * The keepalive function sends a link service packet to the
+	 * other end. If it remains unacknowledged, the standard
+	 * socket timers will eventually shut the socket down. Each
+	 * time we do this, scp->stamp will be updated, thus
+	 * we won't try and send another until scp->keepalive has passed
+	 * since the last successful transmission.
+	 */
+	if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) {
+		if ((jiffies - scp->stamp) >= scp->keepalive)
+			scp->keepalive_fxn(sk);
+	}
+
+	sk->sk_timer.expires = jiffies + SLOW_INTERVAL;
+
+	add_timer(&sk->sk_timer);
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig
new file mode 100644
index 00000000..2f81de5e
--- /dev/null
+++ b/net/decnet/netfilter/Kconfig
@@ -0,0 +1,16 @@
+#
+# DECnet netfilter configuration
+#
+
+menu "DECnet: Netfilter Configuration"
+	depends on DECNET && NETFILTER && EXPERIMENTAL
+	depends on NETFILTER_ADVANCED
+
+config DECNET_NF_GRABULATOR
+	tristate "Routing message grabulator (for userland routing daemon)"
+	help
+	  Enable this module if you want to use the userland DECnet routing
+	  daemon. You will also need to enable routing support for DECnet
+	  unless you just want to monitor routing messages from other nodes.
+
+endmenu
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
new file mode 100644
index 00000000..255c1ae9
--- /dev/null
+++ b/net/decnet/netfilter/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for DECnet netfilter modules
+#
+
+obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
+
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
new file mode 100644
index 00000000..64a7f39e
--- /dev/null
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -0,0 +1,161 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet Routing Message Grabulator
+ *
+ *              (C) 2000 ChyGwyn Limited  -  http://www.chygwyn.com/
+ *              This code may be copied under the GPL v.2 or at your option
+ *              any later version.
+ *
+ * Author:      Steven Whitehouse <steve@chygwyn.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter_decnet.h>
+
+#include <net/sock.h>
+#include <net/flow.h>
+#include <net/dn.h>
+#include <net/dn_route.h>
+
+static struct sock *dnrmg = NULL;
+
+
+/*
+ * Wrap the routing packet rt_skb in a freshly allocated netlink
+ * message: an nlmsghdr, a struct nf_dn_rtmsg carrying the receiving
+ * device's ifindex, then a copy of the packet data.
+ *
+ * Returns the new skb, or NULL with *errp set to -ENOMEM on failure.
+ */
+static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp)
+{
+	struct sk_buff *skb = NULL;
+	size_t size;
+	sk_buff_data_t old_tail;
+	struct nlmsghdr *nlh;
+	unsigned char *ptr;
+	struct nf_dn_rtmsg *rtm;
+
+	size = NLMSG_SPACE(rt_skb->len);
+	size += NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg));
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+	old_tail = skb->tail;
+	/* NOTE(review): NLMSG_PUT presumably jumps to nlmsg_failure when out of room -- confirm */
+	nlh = NLMSG_PUT(skb, 0, 0, 0, size - sizeof(*nlh));
+	rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh);
+	rtm->nfdn_ifindex = rt_skb->dev->ifindex;
+	ptr = NFDN_RTMSG(rtm);
+	/* NOTE(review): copies rt_skb->len bytes from the linear area; assumes rt_skb is linear -- confirm */
+	skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len);
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+	if (skb)
+		kfree_skb(skb);
+	*errp = -ENOMEM;
+	if (net_ratelimit())
+		printk(KERN_ERR "dn_rtmsg: error creating netlink message\n");
+	return NULL;
+}
+
+/*
+ * Forward a level 1 or level 2 routing packet to the matching netlink
+ * multicast group.  Any other packet type is ignored, and a failure to
+ * build the message is silently dropped.
+ */
+static void dnrmg_send_peer(struct sk_buff *skb)
+{
+	unsigned char flags = *skb->data;
+	int pkt_type = flags & DN_RT_CNTL_MSK;
+	struct sk_buff *msg;
+	int status = 0;
+	int group;
+
+	if (pkt_type == DN_RT_PKT_L1RT)
+		group = DNRNG_NLGRP_L1;
+	else if (pkt_type == DN_RT_PKT_L2RT)
+		group = DNRNG_NLGRP_L2;
+	else
+		return;
+
+	msg = dnrmg_build_message(skb, &status);
+	if (!msg)
+		return;
+
+	NETLINK_CB(msg).dst_group = group;
+	netlink_broadcast(dnrmg, msg, 0, group, GFP_ATOMIC);
+}
+
+
+/*
+ * Netfilter hook: hand the packet to dnrmg_send_peer() and always let
+ * it continue (NF_ACCEPT).  The other hook arguments are unused.
+ */
+static unsigned int dnrmg_hook(unsigned int hook,
+			struct sk_buff *skb,
+			const struct net_device *in,
+			const struct net_device *out,
+			int (*okfn)(struct sk_buff *))
+{
+	dnrmg_send_peer(skb);
+	return NF_ACCEPT;
+}
+
+
+/* Ack the message with err and return; relies on skb and nlh being in scope. */
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/*
+ * Input handler for messages sent to the kernel socket by userspace.
+ * Malformed lengths are dropped without a reply, senders that fail the
+ * CAP_NET_ADMIN check get -EPERM, and everything else is answered with
+ * -EINVAL because no user-to-kernel messages are implemented.
+ */
+static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+
+	if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+		return;
+
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		RCV_SKB_FAIL(-EPERM);
+
+	/* Eventually we might send routing messages too */
+
+	RCV_SKB_FAIL(-EINVAL);
+}
+
+/* Hook registration: run dnrmg_hook() at the PF_DECnet NF_DN_ROUTE hook. */
+static struct nf_hook_ops dnrmg_ops __read_mostly = {
+	.hook		= dnrmg_hook,
+	.pf		= PF_DECnet,
+	.hooknum	= NF_DN_ROUTE,
+	.priority	= NF_DN_PRI_DNRTMSG,
+};
+
+/*
+ * Module init: create the NETLINK_DNRTMSG kernel socket, then register
+ * the netfilter hook.  If hook registration fails the socket is
+ * released again.
+ *
+ * Returns 0 on success, -ENOMEM if the socket cannot be created, or
+ * the error from nf_register_hook().
+ */
+static int __init dn_rtmsg_init(void)
+{
+	int rv = 0;
+
+	dnrmg = netlink_kernel_create(&init_net,
+				      NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
+				      dnrmg_receive_user_skb,
+				      NULL, THIS_MODULE);
+	if (dnrmg == NULL) {
+		/* Newline-terminated so the record is not merged with the next one */
+		printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket\n");
+		return -ENOMEM;
+	}
+
+	rv = nf_register_hook(&dnrmg_ops);
+	if (rv) {
+		netlink_kernel_release(dnrmg);
+	}
+
+	return rv;
+}
+
+/* Module exit: unregister the hook first, then release the netlink socket. */
+static void __exit dn_rtmsg_fini(void)
+{
+	nf_unregister_hook(&dnrmg_ops);
+	netlink_kernel_release(dnrmg);
+}
+
+
+MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
+MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
+
+module_init(dn_rtmsg_init);
+module_exit(dn_rtmsg_fini);
+
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
new file mode 100644
index 00000000..28f8b5e5
--- /dev/null
+++ b/net/decnet/sysctl_net_decnet.c
@@ -0,0 +1,377 @@
+/*
+ * DECnet       An implementation of the DECnet protocol suite for the LINUX
+ *              operating system.  DECnet is implemented using the  BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              DECnet sysctl support functions
+ *
+ * Author:      Steve Whitehouse <SteveW@ACM.org>
+ *
+ *
+ * Changes:
+ * Steve Whitehouse - C99 changes and default device handling
+ * Steve Whitehouse - Memory buffer settings, like the tcp ones
+ *
+ */
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <net/neighbour.h>
+#include <net/dst.h>
+#include <net/flow.h>
+
+#include <asm/uaccess.h>
+
+#include <net/dn.h>
+#include <net/dn_dev.h>
+#include <net/dn_route.h>
+
+
+/*
+ * Tunables and their default values.  When CONFIG_SYSCTL is set they
+ * are exposed through dn_table[] below.
+ */
+int decnet_debug_level;
+int decnet_time_wait = 30;
+int decnet_dn_count = 1;
+int decnet_di_count = 3;
+int decnet_dr_count = 3;
+int decnet_log_martians = 1;
+int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
+
+/* Reasonable defaults, I hope, based on tcp's defaults */
+long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
+int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
+
+#ifdef CONFIG_SYSCTL
+extern int decnet_dst_gc_interval;
+/* Lower/upper bounds wired up as .extra1/.extra2 of the dn_table[] entries */
+static int min_decnet_time_wait[] = { 5 };
+static int max_decnet_time_wait[] = { 600 };
+static int min_state_count[] = { 1 };
+static int max_state_count[] = { NSP_MAXRXTSHIFT };
+static int min_decnet_dst_gc_interval[] = { 1 };
+static int max_decnet_dst_gc_interval[] = { 60 };
+static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW };
+static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW };
+/* Backing store for the "node_name" sysctl (maxlen 7) */
+static char node_name[7] = "???";
+
+/* Handle returned by register_sysctl_paths(); needed to unregister */
+static struct ctl_table_header *dn_table_header = NULL;
+
+/*
+ * ctype.h :-)
+ *
+ * Minimal ASCII character classification.  Each macro evaluates its
+ * argument more than once, so do not pass expressions with side
+ * effects.  INVALID_END_CHAR() is true for any letter or digit.
+ */
+#define ISNUM(x) (((x) >= '0') && ((x) <= '9'))
+#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z'))
+#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z'))
+#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x))
+#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x))
+
+/*
+ * Truncate str in place at the first space, newline, carriage return
+ * or colon.  A string containing none of these is left untouched.
+ */
+static void strip_it(char *str)
+{
+	for (; *str != 0; str++) {
+		if (*str == ' ' || *str == '\n' ||
+		    *str == '\r' || *str == ':') {
+			*str = 0;
+			return;
+		}
+	}
+}
+
+/*
+ * Simple routine to parse an ascii DECnet address
+ * into a network order address.
+ */
+static int parse_addr(__le16 *addr, char *str)
+{
+	__u16 area, node;
+	int extra;
+
+	/* Skip anything in front of the first digit */
+	for (; *str != 0; str++) {
+		if (ISNUM(*str))
+			break;
+	}
+
+	if (*str == 0)
+		return -1;
+
+	/* Area: one or two digits */
+	area = (*str++ - '0');
+	if (ISNUM(*str))
+		area = area * 10 + (*str++ - '0');
+
+	if (*str++ != '.')
+		return -1;
+
+	if (!ISNUM(*str))
+		return -1;
+
+	/* Node: one mandatory digit plus up to three more */
+	node = *str++ - '0';
+	for (extra = 0; extra < 3 && ISNUM(*str); extra++)
+		node = node * 10 + (*str++ - '0');
+
+	if ((area > 63) || (node > 1023))
+		return -1;
+
+	/* The number must not run straight into a letter or another digit */
+	if (INVALID_END_CHAR(*str))
+		return -1;
+
+	*addr = cpu_to_le16((area << 10) | node);
+
+	return 0;
+}
+
+/*
+ * proc handler for the "node_address" sysctl.
+ *
+ * Write: parse an "area.node" string and install it as decnet_address,
+ * calling dn_dev_devices_off()/dn_dev_devices_on() around the change.
+ * Read: format decnet_address followed by a newline.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer cannot be copied,
+ * or -EINVAL if the address does not parse.
+ */
+static int dn_node_address_handler(ctl_table *table, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+	char addr[DN_ASCBUF_LEN];
+	size_t len;
+	__le16 dnaddr;
+
+	/* Nothing to do for empty requests or reads past the start */
+	if (!*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		/* Clamp so the terminating NUL always fits in addr[] */
+		len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
+
+		if (copy_from_user(addr, buffer, len))
+			return -EFAULT;
+
+		addr[len] = 0;
+		strip_it(addr);
+
+		if (parse_addr(&dnaddr, addr))
+			return -EINVAL;
+
+		dn_dev_devices_off();
+
+		decnet_address = dnaddr;
+
+		dn_dev_devices_on();
+
+		*ppos += len;
+
+		return 0;
+	}
+
+	dn_addr2asc(le16_to_cpu(decnet_address), addr);
+	len = strlen(addr);
+	/* Overwrites the NUL; from here on addr is not a C string */
+	addr[len++] = '\n';
+
+	if (len > *lenp) len = *lenp;
+
+	if (copy_to_user(buffer, addr, len))
+		return -EFAULT;
+
+	*lenp = len;
+	*ppos += len;
+
+	return 0;
+}
+
+/*
+ * proc handler for the "default_device" sysctl.
+ *
+ * Write: look up the named device in init_net and make it the default
+ * DECnet device.  Read: report the current default device name
+ * followed by a newline, or nothing if there is none.
+ *
+ * Returns 0 on success, -E2BIG if more than 16 bytes are written,
+ * -EFAULT on a failed user copy, or -ENODEV if the device is missing,
+ * has no dn_ptr, or cannot be set as default.
+ */
+static int dn_def_dev_handler(ctl_table *table, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+	size_t len;
+	struct net_device *dev;
+	char devname[17];
+
+	if (!*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		/* devname[] holds at most 16 bytes plus the NUL added below */
+		if (*lenp > 16)
+			return -E2BIG;
+
+		if (copy_from_user(devname, buffer, *lenp))
+			return -EFAULT;
+
+		devname[*lenp] = 0;
+		strip_it(devname);
+
+		dev = dev_get_by_name(&init_net, devname);
+		if (dev == NULL)
+			return -ENODEV;
+
+		if (dev->dn_ptr == NULL) {
+			dev_put(dev);
+			return -ENODEV;
+		}
+
+		/*
+		 * NOTE(review): there is no dev_put() on the success path;
+		 * presumably dn_dev_set_default() keeps the reference --
+		 * confirm.
+		 */
+		if (dn_dev_set_default(dev, 1)) {
+			dev_put(dev);
+			return -ENODEV;
+		}
+		*ppos += *lenp;
+
+		return 0;
+	}
+
+	dev = dn_dev_get_default();
+	if (dev == NULL) {
+		*lenp = 0;
+		return 0;
+	}
+
+	/* NOTE(review): assumes dev->name plus the newline fits in 17 bytes -- confirm */
+	strcpy(devname, dev->name);
+	dev_put(dev);
+	len = strlen(devname);
+	devname[len++] = '\n';
+
+	if (len > *lenp) len = *lenp;
+
+	if (copy_to_user(buffer, devname, len))
+		return -EFAULT;
+
+	*lenp = len;
+	*ppos += len;
+
+	return 0;
+}
+
+/*
+ * sysctl entries registered under the path given by dn_path[].  The
+ * table is terminated by an empty entry.
+ */
+static ctl_table dn_table[] = {
+	/* Entries with custom handlers or string data */
+	{
+		.procname = "node_address",
+		.maxlen = 7,
+		.mode = 0644,
+		.proc_handler = dn_node_address_handler,
+	},
+	{
+		.procname = "node_name",
+		.data = node_name,
+		.maxlen = 7,
+		.mode = 0644,
+		.proc_handler = proc_dostring,
+	},
+	{
+		.procname = "default_device",
+		.maxlen = 16,
+		.mode = 0644,
+		.proc_handler = dn_def_dev_handler,
+	},
+	/* Range-checked integers: extra1/extra2 point at the min/max values */
+	{
+		.procname = "time_wait",
+		.data = &decnet_time_wait,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_decnet_time_wait,
+		.extra2 = &max_decnet_time_wait
+	},
+	{
+		.procname = "dn_count",
+		.data = &decnet_dn_count,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_state_count,
+		.extra2 = &max_state_count
+	},
+	{
+		.procname = "di_count",
+		.data = &decnet_di_count,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_state_count,
+		.extra2 = &max_state_count
+	},
+	{
+		.procname = "dr_count",
+		.data = &decnet_dr_count,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_state_count,
+		.extra2 = &max_state_count
+	},
+	{
+		.procname = "dst_gc_interval",
+		.data = &decnet_dst_gc_interval,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_decnet_dst_gc_interval,
+		.extra2 = &max_decnet_dst_gc_interval
+	},
+	{
+		.procname = "no_fc_max_cwnd",
+		.data = &decnet_no_fc_max_cwnd,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &min_decnet_no_fc_max_cwnd,
+		.extra2 = &max_decnet_no_fc_max_cwnd
+	},
+	/* Three-element memory limit vectors (mem is long, rmem/wmem are int) */
+       {
+		.procname = "decnet_mem",
+		.data = &sysctl_decnet_mem,
+		.maxlen = sizeof(sysctl_decnet_mem),
+		.mode = 0644,
+		.proc_handler = proc_doulongvec_minmax
+	},
+	{
+		.procname = "decnet_rmem",
+		.data = &sysctl_decnet_rmem,
+		.maxlen = sizeof(sysctl_decnet_rmem),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
+	{
+		.procname = "decnet_wmem",
+		.data = &sysctl_decnet_wmem,
+		.maxlen = sizeof(sysctl_decnet_wmem),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
+	{
+		.procname = "debug",
+		.data = &decnet_debug_level,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
+	{ }
+};
+
+/* Path components "net" / "decnet" under which dn_table[] is registered */
+static struct ctl_path dn_path[] = {
+	{ .procname = "net", },
+	{ .procname = "decnet", },
+	{ }
+};
+
+/*
+ * Register dn_table[] at dn_path[] and keep the returned header in
+ * dn_table_header.  The result is not checked for failure here.
+ */
+void dn_register_sysctl(void)
+{
+	dn_table_header = register_sysctl_paths(dn_path, dn_table);
+}
+
+/* Unregister the table recorded in dn_table_header by dn_register_sysctl(). */
+void dn_unregister_sysctl(void)
+{
+	unregister_sysctl_table(dn_table_header);
+}
+
+#else  /* CONFIG_SYSCTL */
+/* Without CONFIG_SYSCTL both entry points are no-ops. */
+void dn_unregister_sysctl(void)
+{
+}
+void dn_register_sysctl(void)
+{
+}
+
+#endif
diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig
new file mode 100644
index 00000000..50d49f7e
--- /dev/null
+++ b/net/dns_resolver/Kconfig
@@ -0,0 +1,27 @@
+#
+# Configuration for DNS Resolver
+#
+config DNS_RESOLVER
+	tristate "DNS Resolver support"
+	depends on NET && KEYS
+	help
+	  Saying Y here will include support for the DNS Resolver key type
+	  which can be used to make upcalls to perform DNS lookups in
+	  userspace.
+
+	  DNS Resolver is used to query DNS servers for information, for
+	  example resolving a UNC hostname element to an IP address for CIFS or
+	  performing a DNS query for AFSDB records so that AFS can locate a
+	  cell's volume location database servers.
+
+	  DNS Resolver is used by the CIFS and AFS modules, and would support
+	  SMB2 later.  DNS Resolver is supported by the userspace upcall
+	  helper "/sbin/dns.resolver" via /etc/request-key.conf.
+
+	  See <file:Documentation/networking/dns_resolver.txt> for further
+	  information.
+
+	  To compile this as a module, choose M here: the module will be called
+	  dnsresolver.
+
+	  If unsure, say N.
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile
new file mode 100644
index 00000000..d5c13c2e
--- /dev/null
+++ b/net/dns_resolver/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux DNS Resolver.
+#
+
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o
+
+dns_resolver-y :=  dns_key.o dns_query.o
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
new file mode 100644
index 00000000..fa000d26
--- /dev/null
+++ b/net/dns_resolver/dns_key.c
@@ -0,0 +1,309 @@
+/* Key type used to cache DNS lookups made by the kernel
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/keyctl.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("DNS Resolver");
+MODULE_AUTHOR("Wang Lei");
+MODULE_LICENSE("GPL");
+
+unsigned dns_resolver_debug;
+module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
+
+const struct cred *dns_resolver_cache;
+
+#define	DNS_ERRORNO_OPTION	"dnserror"
+
+/*
+ * Instantiate a user defined key for dns_resolver.
+ *
+ * The data must be a NUL-terminated string, with the NUL counted in datalen.
+ * If the data contains '#' characters, then we take the clause after each
+ * one to be an option of the form 'key=value'.  The actual data of interest is
+ * the string leading up to the first '#', e.g. "ip1,ip2,...#foo=bar".
+ *
+ * Returns 0 on success, -EINVAL if the data or an option is malformed or the
+ * payload cannot be reserved, or -ENOMEM if the payload cannot be allocated.
+ * A valid "dnserror" option stores -value in type_data.x[0] and no payload.
+ */
+static int
+dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
+{
+	struct user_key_payload *upayload;
+	unsigned long derrno;
+	int ret;
+	size_t result_len = 0;
+	const char *data = _data, *end, *opt;
+
+	kenter("%%%d,%s,'%*.*s',%zu",
+	       key->serial, key->description,
+	       (int)datalen, (int)datalen, data, datalen);
+
+	if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+		return -EINVAL;
+	datalen--;
+
+	/* deal with any options embedded in the data */
+	end = data + datalen;
+	opt = memchr(data, '#', datalen);
+	if (!opt) {
+		/* no options: the entire data is the result */
+		kdebug("no options");
+		result_len = datalen;
+	} else {
+		const char *next_opt;
+
+		result_len = opt - data;
+		opt++;
+		kdebug("options: '%s'", opt);
+		do {
+			const char *eq;
+			int opt_len, opt_nlen, opt_vlen, tmp;
+
+			next_opt = memchr(opt, '#', end - opt) ?: end;
+			opt_len = next_opt - opt;
+			if (!opt_len) {
+				printk(KERN_WARNING
+				       "Empty option to dns_resolver key %d\n",
+				       key->serial);
+				return -EINVAL;
+			}
+			/* with no '=', the name ends at next_opt, not at end */
+			eq = memchr(opt, '=', opt_len) ?: next_opt;
+			opt_nlen = eq - opt;
+			eq++;
+			opt_vlen = next_opt - eq; /* will be -1 if no value */
+
+			tmp = opt_vlen >= 0 ? opt_vlen : 0;
+			kdebug("option '%*.*s' val '%*.*s'",
+			       opt_nlen, opt_nlen, opt, tmp, tmp, eq);
+
+			/* see if it's an error number representing a DNS error
+			 * that's to be recorded as the result in this key */
+			if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
+			    memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
+				kdebug("dns error number option");
+				if (opt_vlen <= 0)
+					goto bad_option_value;
+
+				ret = strict_strtoul(eq, 10, &derrno);
+				if (ret < 0)
+					goto bad_option_value;
+
+				if (derrno < 1 || derrno > 511)
+					goto bad_option_value;
+
+				kdebug("dns error no. = %lu", derrno);
+				key->type_data.x[0] = -derrno;
+				continue;
+			}
+
+		bad_option_value:
+			printk(KERN_WARNING
+			       "Option '%*.*s' to dns_resolver key %d:"
+			       " bad/missing value\n",
+			       opt_nlen, opt_nlen, opt, key->serial);
+			return -EINVAL;
+		} while (opt = next_opt + 1, opt < end);
+	}
+
+	/* don't cache the result if we're caching an error saying there's no
+	 * result */
+	if (key->type_data.x[0]) {
+		kleave(" = 0 [h_error %ld]", key->type_data.x[0]);
+		return 0;
+	}
+
+	kdebug("store result");
+	ret = key_payload_reserve(key, result_len);
+	if (ret < 0)
+		return -EINVAL;
+
+	upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL);
+	if (!upayload) {
+		kleave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	upayload->datalen = result_len;
+	memcpy(upayload->data, data, result_len);
+	upayload->data[result_len] = '\0';
+	rcu_assign_pointer(key->payload.data, upayload);
+
+	kleave(" = 0");
+	return 0;
+}
+
+/*
+ * The description is of the form "[<type>:]<domain_name>"
+ *
+ * The domain name may be a simple name or an absolute domain name (which
+ * should end with a period).  The domain name is case-independent.
+ */
+static int
+dns_resolver_match(const struct key *key, const void *description)
+{
+	const char *src = key->description;
+	const char *dsp = description;
+	int slen, dlen;
+	int ret = 0;
+
+	kenter("%s,%s", src, dsp);
+
+	if (src && dsp) {
+		if (strcasecmp(src, dsp) == 0) {
+			ret = 1;
+			goto out;
+		}
+		slen = strlen(src);
+		dlen = strlen(dsp);
+		if (slen <= 0 || dlen <= 0)
+			goto out;
+		/* ignore one trailing period on either name */
+		if (src[slen - 1] == '.')
+			slen--;
+		if (dsp[dlen - 1] == '.')
+			dlen--;
+		if (slen == dlen && strncasecmp(src, dsp, slen) == 0)
+			ret = 1;
+	}
+out:
+	kleave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Describe a DNS key: its description, then the cached error or data length
+ */
+static void dns_resolver_describe(const struct key *key, struct seq_file *m)
+{
+	int dns_err = key->type_data.x[0];
+
+	seq_puts(m, key->description);
+	if (!key_is_instantiated(key))
+		return;
+	if (dns_err)
+		seq_printf(m, ": %d", dns_err);
+	else
+		seq_printf(m, ": %u", key->datalen);
+}
+
+/*
+ * Read the DNS data
+ * - a cached DNS error, if any, is returned instead of the data
+ * - the key's semaphore is read-locked
+ */
+static long dns_resolver_read(const struct key *key,
+			      char __user *buffer, size_t buflen)
+{
+	long dns_err = key->type_data.x[0];
+
+	return dns_err ? dns_err : user_read(key, buffer, buflen);
+}
+
+struct key_type key_type_dns_resolver = {
+	.name		= "dns_resolver",
+	.instantiate	= dns_resolver_instantiate,
+	.match		= dns_resolver_match,
+	.revoke		= user_revoke,	/* user key type's handler is reused */
+	.destroy	= user_destroy,	/* user key type's handler is reused */
+	.describe	= dns_resolver_describe,
+	.read		= dns_resolver_read,
+};
+
+static int __init init_dns_resolver(void)
+{
+	struct key *ring;
+	struct cred *cred;
+	int err;
+
+	printk(KERN_NOTICE "Registering the %s key type\n",
+	       key_type_dns_resolver.name);
+
+	/* Build an override credential set that carries a special thread
+	 * keyring; DNS requests are cached in that keyring.
+	 *
+	 * Dedicated credentials prevent malicious redirections from being
+	 * installed with add_key().
+	 */
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	ring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+			 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			 KEY_USR_VIEW | KEY_USR_READ,
+			 KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(ring)) {
+		err = PTR_ERR(ring);
+		goto failed_put_cred;
+	}
+
+	/* instantiate the keyring, then register the key type; a failure of
+	 * either step releases the keyring and the credentials */
+	err = key_instantiate_and_link(ring, NULL, 0, NULL, NULL);
+	if (err >= 0)
+		err = register_key_type(&key_type_dns_resolver);
+	if (err < 0)
+		goto failed_put_key;
+
+	/* tell request_key() to cache the results it looks up in this
+	 * special keyring */
+	cred->thread_keyring = ring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	dns_resolver_cache = cred;
+
+	kdebug("DNS resolver keyring: %d\n", key_serial(ring));
+	return 0;
+
+failed_put_key:
+	key_put(ring);
+failed_put_cred:
+	put_cred(cred);
+	return err;
+}
+
+static void __exit exit_dns_resolver(void)
+{
+	key_revoke(dns_resolver_cache->thread_keyring);	/* the cache keyring */
+	unregister_key_type(&key_type_dns_resolver);
+	put_cred(dns_resolver_cache);	/* creds set up by init_dns_resolver() */
+	printk(KERN_NOTICE "Unregistered %s key type\n",
+	       key_type_dns_resolver.name);
+}
+
+module_init(init_dns_resolver)
+module_exit(exit_dns_resolver)
+/* MODULE_LICENSE("GPL") is already declared near the top of this file. */
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
new file mode 100644
index 00000000..c32be292
--- /dev/null
+++ b/net/dns_resolver/dns_query.c
@@ -0,0 +1,165 @@
+/* Upcall routine, designed to work as a key type and working through
+ * /sbin/request-key to contact userspace when handling DNS queries.
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   The upcall wrapper used to make an arbitrary DNS query.
+ *
+ *   This function requires the appropriate userspace tool dns.upcall to be
+ *   installed and something like the following lines should be added to the
+ *   /etc/request-key.conf file:
+ *
+ *	create dns_resolver * * /sbin/dns.upcall %k
+ *
+ *   For example to use this module to query AFSDB RR:
+ *
+ *	create dns_resolver afsdb:* * /sbin/dns.afsdb %k
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dns_resolver.h>
+#include <linux/err.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+
+#include "internal.h"
+
+/**
+ * dns_query - Query the DNS
+ * @type: Query type (or NULL for straight host->IP lookup)
+ * @name: Name to look up
+ * @namelen: Length of name (must be at least 3)
+ * @options: Request options (or NULL if no options)
+ * @_result: Where to place the returned data.
+ * @_expiry: Where to store the result expiry time (or NULL)
+ *
+ * The data will be returned in the pointer at *_result, and the caller is
+ * responsible for freeing it.
+ *
+ * The description should be of the form "[<query_type>:]<domain_name>", and
+ * the options need to be appropriate for the query type requested.  If no
+ * query_type is given, then the query is a straight hostname to IP address
+ * lookup.
+ *
+ * The DNS resolution lookup is performed by upcalling to userspace by way of
+ * requesting a key of type dns_resolver.
+ *
+ * Returns the size of the result on success, -ve error code otherwise.
+ */
+int dns_query(const char *type, const char *name, size_t namelen,
+	      const char *options, char **_result, time_t *_expiry)
+{
+	struct key *rkey;
+	struct user_key_payload *upayload;
+	const struct cred *saved_cred;
+	size_t typelen, desclen;
+	char *desc, *cp;
+	int ret, len;
+
+	kenter("%s,%*.*s,%zu,%s",
+	       type, (int)namelen, (int)namelen, name, namelen, options);
+
+	if (!name || namelen == 0 || !_result)
+		return -EINVAL;
+
+	/* construct the query key description as "[<type>:]<name>" */
+	typelen = 0;
+	desclen = 0;
+	if (type) {
+		typelen = strlen(type);
+		if (typelen < 1)
+			return -EINVAL;
+		desclen += typelen + 1;
+	}
+
+	/* a zero namelen was rejected above, so the length supplied by the
+	 * caller is always the one used */
+	if (namelen < 3)
+		return -EINVAL;
+	desclen += namelen + 1;
+
+	desc = kmalloc(desclen, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	cp = desc;
+	if (type) {
+		memcpy(cp, type, typelen);
+		cp += typelen;
+		*cp++ = ':';
+	}
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+
+	if (!options)
+		options = "";
+	kdebug("call request_key(,%s,%s)", desc, options);
+
+	/* make the upcall, using special credentials to prevent the use of
+	 * add_key() to preinstall malicious redirections
+	 */
+	saved_cred = override_creds(dns_resolver_cache);
+	rkey = request_key(&key_type_dns_resolver, desc, options);
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	down_read(&rkey->sem);
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto put;
+
+	/* If the DNS server gave an error, return that to the caller */
+	ret = rkey->type_data.x[0];
+	if (ret)
+		goto put;
+
+	upayload = rcu_dereference_protected(rkey->payload.data,
+					     lockdep_is_held(&rkey->sem));
+	len = upayload->datalen;
+
+	ret = -ENOMEM;
+	*_result = kmalloc(len + 1, GFP_KERNEL);
+	if (!*_result)
+		goto put;
+
+	memcpy(*_result, upayload->data, len + 1);
+	if (_expiry)
+		*_expiry = rkey->expiry;
+
+	ret = len;
+put:
+	up_read(&rkey->sem);
+	key_put(rkey);
+out:
+	kleave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(dns_query);
diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h
new file mode 100644
index 00000000..189ca9e9
--- /dev/null
+++ b/net/dns_resolver/internal.h
@@ -0,0 +1,44 @@
+/*
+ *   Copyright (c) 2010 Wang Lei
+ *   Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved.
+ *
+ *   Internal DNS Resolver stuff
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/*
+ * dns_key.c
+ */
+extern const struct cred *dns_resolver_cache;
+
+/*
+ * debug tracing - output is produced only while dns_resolver_debug is non-zero
+ */
+extern unsigned dns_resolver_debug;
+
+#define	kdebug(FMT, ...)				\
+do {							\
+	if (unlikely(dns_resolver_debug))		\
+		printk(KERN_DEBUG "[%-6.6s] "FMT"\n",	\
+		       current->comm, ##__VA_ARGS__);	\
+} while (0)
+
+#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
new file mode 100644
index 00000000..c53ded2a
--- /dev/null
+++ b/net/dsa/Kconfig
@@ -0,0 +1,60 @@
+menuconfig NET_DSA
+	bool "Distributed Switch Architecture support"
+	default n
+	depends on EXPERIMENTAL && NETDEVICES && !S390
+	select PHYLIB
+	---help---
+	  This allows you to use hardware switch chips that use
+	  the Distributed Switch Architecture.
+
+
+if NET_DSA
+
+# tagging formats
+config NET_DSA_TAG_DSA
+	bool
+	default n
+
+config NET_DSA_TAG_EDSA
+	bool
+	default n
+
+config NET_DSA_TAG_TRAILER
+	bool
+	default n
+
+
+# switch drivers
+config NET_DSA_MV88E6XXX
+	bool
+	default n
+
+config NET_DSA_MV88E6060
+	bool "Marvell 88E6060 ethernet switch chip support"
+	select NET_DSA_TAG_TRAILER
+	---help---
+	  This enables support for the Marvell 88E6060 ethernet switch
+	  chip.
+
+config NET_DSA_MV88E6XXX_NEED_PPU
+	bool
+	default n
+
+config NET_DSA_MV88E6131
+	bool "Marvell 88E6085/6095/6095F/6131 ethernet switch chip support"
+	select NET_DSA_MV88E6XXX
+	select NET_DSA_MV88E6XXX_NEED_PPU
+	select NET_DSA_TAG_DSA
+	---help---
+	  This enables support for the Marvell 88E6085/6095/6095F/6131
+	  ethernet switch chips.
+
+config NET_DSA_MV88E6123_61_65
+	bool "Marvell 88E6123/6161/6165 ethernet switch chip support"
+	select NET_DSA_MV88E6XXX
+	select NET_DSA_TAG_EDSA
+	---help---
+	  This enables support for the Marvell 88E6123/6161/6165
+	  ethernet switch chips.
+
+endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
new file mode 100644
index 00000000..2374faff
--- /dev/null
+++ b/net/dsa/Makefile
@@ -0,0 +1,13 @@
+# tagging formats
+obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
+obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+
+# switch drivers
+obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
+obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o
+obj-$(CONFIG_NET_DSA_MV88E6123_61_65) += mv88e6123_61_65.o
+obj-$(CONFIG_NET_DSA_MV88E6131) += mv88e6131.o
+
+# the core
+obj-$(CONFIG_NET_DSA) += dsa.o slave.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
new file mode 100644
index 00000000..3fb14b7c
--- /dev/null
+++ b/net/dsa/dsa.c
@@ -0,0 +1,434 @@
+/*
+ * net/dsa/dsa.c - Hardware switch handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <net/dsa.h>
+#include "dsa_priv.h"
+
+char dsa_driver_version[] = "0.1";
+
+
+/* switch driver registration ***********************************************/
+static DEFINE_MUTEX(dsa_switch_drivers_mutex);
+static LIST_HEAD(dsa_switch_drivers);
+
+void register_switch_driver(struct dsa_switch_driver *drv)
+{
+	mutex_lock(&dsa_switch_drivers_mutex);
+	list_add_tail(&drv->list, &dsa_switch_drivers);	/* append to the list */
+	mutex_unlock(&dsa_switch_drivers_mutex);
+}
+
+void unregister_switch_driver(struct dsa_switch_driver *drv)
+{
+	mutex_lock(&dsa_switch_drivers_mutex);
+	list_del_init(&drv->list);	/* unlink from dsa_switch_drivers */
+	mutex_unlock(&dsa_switch_drivers_mutex);
+}
+
+/*
+ * Find the registered switch driver that recognises the chip at sw_addr.
+ */
+static struct dsa_switch_driver *
+dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name)
+{
+	struct dsa_switch_driver *found = NULL;
+	struct dsa_switch_driver *drv;
+	char *name = NULL;
+
+	/*
+	 * Ask each driver in turn; the first probe hook that returns a
+	 * name claims the switch.  *_name is NULL when nobody does.
+	 */
+	mutex_lock(&dsa_switch_drivers_mutex);
+	list_for_each_entry(drv, &dsa_switch_drivers, list) {
+		name = drv->probe(bus, sw_addr);
+		if (name != NULL) {
+			found = drv;
+			break;
+		}
+	}
+	mutex_unlock(&dsa_switch_drivers_mutex);
+
+	*_name = name;
+
+	return found;
+}
+
+
+/* basic switch operations **************************************************/
+static struct dsa_switch *
+dsa_switch_setup(struct dsa_switch_tree *dst, int index,
+		 struct device *parent, struct mii_bus *bus)
+{
+	struct dsa_chip_data *pd = dst->pd->chip + index;
+	struct dsa_switch_driver *drv;
+	struct dsa_switch *ds;
+	char *model;
+	int err;
+	int i;
+
+	/*
+	 * Find a registered driver that recognises the chip.
+	 */
+	drv = dsa_switch_probe(bus, pd->sw_addr, &model);
+	if (drv == NULL) {
+		printk(KERN_ERR "%s[%d]: could not detect attached switch\n",
+		       dst->master_netdev->name, index);
+		return ERR_PTR(-EINVAL);
+	}
+	printk(KERN_INFO "%s[%d]: detected a %s switch\n",
+		dst->master_netdev->name, index, model);
+
+
+	/*
+	 * Allocate the switch state (plus driver-private space) and fill it in.
+	 */
+	ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL);
+	if (ds == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ds->master_mii_bus = bus;
+	ds->drv = drv;
+	ds->pd = pd;
+	ds->index = index;
+	ds->dst = dst;
+
+
+	/*
+	 * Sort the named ports into cpu, dsa and physical ports.
+	 */
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		char *port_name = pd->port_names[i];
+
+		if (port_name == NULL)
+			continue;
+
+		if (!strcmp(port_name, "dsa")) {
+			ds->dsa_port_mask |= 1 << i;
+		} else if (strcmp(port_name, "cpu")) {
+			ds->phys_port_mask |= 1 << i;
+		} else {
+			/* only one cpu port is allowed in the whole tree */
+			if (dst->cpu_switch != -1) {
+				printk(KERN_ERR "multiple cpu ports?!\n");
+				err = -EINVAL;
+				goto out;
+			}
+			dst->cpu_switch = index;
+			dst->cpu_port = i;
+		}
+	}
+
+
+	/*
+	 * The switch that the CPU is attached to decides the tagging
+	 * protocol of the whole tree: adopt this driver's format if
+	 * this is that switch.
+	 */
+	if (dst->cpu_switch == index)
+		dst->tag_protocol = drv->tag_protocol;
+
+
+	/*
+	 * Do basic register setup, then hand the master netdev's
+	 * address to the driver.  The first step that fails aborts
+	 * the setup.
+	 */
+	err = drv->setup(ds);
+	if (err >= 0)
+		err = drv->set_addr(ds, dst->master_netdev->dev_addr);
+	if (err < 0)
+		goto out;
+
+	ds->slave_mii_bus = mdiobus_alloc();
+	if (ds->slave_mii_bus == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+	dsa_slave_mii_bus_init(ds);
+
+	err = mdiobus_register(ds->slave_mii_bus);
+	if (err < 0)
+		goto out_free;
+
+
+	/*
+	 * Create a network device for every physical switch port.
+	 */
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		struct net_device *slave_dev;
+
+		if (!(ds->phys_port_mask & (1 << i)))
+			continue;
+
+		slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]);
+		if (slave_dev != NULL) {
+			ds->ports[i] = slave_dev;
+			continue;
+		}
+
+		/* not fatal: the port is simply left without a netdev */
+		printk(KERN_ERR "%s[%d]: can't create dsa "
+		       "slave device for port %d(%s)\n",
+		       dst->master_netdev->name, index, i, pd->port_names[i]);
+	}
+
+	return ds;
+
+out_free:
+	mdiobus_free(ds->slave_mii_bus);
+out:
+	kfree(ds);
+	return ERR_PTR(err);
+}
+
+static void dsa_switch_destroy(struct dsa_switch *ds)
+{	/* currently a no-op: nothing set up for ds is released here */
+}
+
+
+/* hooks for ethertype-less tagging formats *********************************/
+/*
+ * The original DSA tag format and some other tag formats have no
+ * ethertype, which means that we need to add a little hack to the
+ * networking receive path to make sure that received frames get
+ * the right ->protocol assigned to them when one of those tag
+ * formats is in use.
+ */
+bool dsa_uses_dsa_tags(void *dsa_ptr)
+{
+	const struct dsa_switch_tree *tree = dsa_ptr;
+
+	return tree->tag_protocol == htons(ETH_P_DSA);
+}
+
+bool dsa_uses_trailer_tags(void *dsa_ptr)
+{
+	const struct dsa_switch_tree *tree = dsa_ptr;
+
+	return tree->tag_protocol == htons(ETH_P_TRAILER);
+}
+
+
+/* link polling *************************************************************/
+static void dsa_link_poll_work(struct work_struct *ugly)
+{
+	struct dsa_switch_tree *dst =
+		container_of(ugly, struct dsa_switch_tree, link_poll_work);
+	int chip;
+
+	for (chip = 0; chip < dst->pd->nr_chips; chip++) {
+		struct dsa_switch *ds = dst->ds[chip];
+
+		if (ds == NULL || ds->drv->poll_link == NULL)
+			continue;
+		ds->drv->poll_link(ds);
+	}
+	/* re-arm the timer so that the poll is scheduled again */
+	mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ));
+}
+
+static void dsa_link_poll_timer(unsigned long _dst)
+{
+	struct dsa_switch_tree *tree = (struct dsa_switch_tree *)_dst;
+
+	schedule_work(&tree->link_poll_work);
+}
+
+
+/* platform driver init and cleanup *****************************************/
+static int dev_is_class(struct device *dev, void *class)
+{
+	const char *wanted = class;
+
+	/* 1 when dev has a class whose name equals the string, else 0 */
+	return dev->class != NULL && strcmp(dev->class->name, wanted) == 0;
+}
+
+static struct device *dev_find_class(struct device *parent, char *class)
+{
+	/* parent itself is not of that class: search its children instead */
+	if (!dev_is_class(parent, class))
+		return device_find_child(parent, class, dev_is_class);
+	/* parent matches: hand it back with a reference held */
+	get_device(parent);
+	return parent;
+}
+
+static struct mii_bus *dev_to_mii_bus(struct device *dev)
+{
+	struct device *d = dev_find_class(dev, "mdio_bus");
+	struct mii_bus *bus;
+
+	if (d == NULL)
+		return NULL;
+
+	/*
+	 * The device reference obtained from dev_find_class() is dropped
+	 * again here, so the returned bus is not pinned by this function.
+	 */
+	bus = to_mii_bus(d);
+	put_device(d);
+	return bus;
+}
+
+static struct net_device *dev_to_net_device(struct device *dev)
+{
+	struct device *d = dev_find_class(dev, "net");
+	struct net_device *nd;
+
+	if (d == NULL)
+		return NULL;
+
+	/*
+	 * Swap the device reference for a netdev reference: nd is
+	 * returned held via dev_hold() once the device has been put.
+	 */
+	nd = to_net_dev(d);
+	dev_hold(nd);
+	put_device(d);
+	return nd;
+}
+
+static int dsa_probe(struct platform_device *pdev)
+{
+	static int dsa_version_printed;
+	struct dsa_platform_data *pd = pdev->dev.platform_data;
+	struct dsa_switch_tree *dst;
+	struct net_device *master;
+	int chip;
+
+	if (!dsa_version_printed++)
+		printk(KERN_NOTICE "Distributed Switch Architecture "
+			"driver version %s\n", dsa_driver_version);
+
+	if (pd == NULL || pd->netdev == NULL)
+		return -EINVAL;
+
+	master = dev_to_net_device(pd->netdev);
+	if (master == NULL)
+		return -EINVAL;
+
+	if (master->dsa_ptr != NULL) {
+		dev_put(master);
+		return -EEXIST;
+	}
+
+	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+	if (dst == NULL) {
+		dev_put(master);
+		return -ENOMEM;
+	}
+
+	dst->cpu_port = -1;
+	dst->cpu_switch = -1;
+	dst->master_netdev = master;
+	dst->pd = pd;
+
+	platform_set_drvdata(pdev, dst);
+
+	for (chip = 0; chip < pd->nr_chips; chip++) {
+		struct dsa_switch *ds;
+		struct mii_bus *bus;
+
+		bus = dev_to_mii_bus(pd->chip[chip].mii_bus);
+		if (bus == NULL) {
+			printk(KERN_ERR "%s[%d]: no mii bus found for "
+				"dsa switch\n", master->name, chip);
+			continue;
+		}
+
+		ds = dsa_switch_setup(dst, chip, &pdev->dev, bus);
+		if (IS_ERR(ds)) {
+			printk(KERN_ERR "%s[%d]: couldn't create dsa switch "
+				"instance (error %ld)\n", master->name, chip,
+				PTR_ERR(ds));
+			continue;
+		}
+
+		if (ds->drv->poll_link != NULL)
+			dst->link_poll_needed = 1;
+		dst->ds[chip] = ds;
+	}
+
+	/*
+	 * Publish the tree on the master device.  For tagging formats
+	 * without an ethertype field this is what gets all packets from
+	 * this point on sent to the tag format's receive function.
+	 */
+	wmb();
+	master->dsa_ptr = (void *)dst;
+
+	if (dst->link_poll_needed) {
+		INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
+		init_timer(&dst->link_poll_timer);
+		dst->link_poll_timer.function = dsa_link_poll_timer;
+		dst->link_poll_timer.data = (unsigned long)dst;
+		dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
+		add_timer(&dst->link_poll_timer);
+	}
+
+	return 0;
+}
+
+static int dsa_remove(struct platform_device *pdev)
+{
+	struct dsa_switch_tree *tree = platform_get_drvdata(pdev);
+	int chip;
+
+	/* stop link polling: first the timer, then any queued work */
+	if (tree->link_poll_needed)
+		del_timer_sync(&tree->link_poll_timer);
+
+	flush_work_sync(&tree->link_poll_work);
+
+	for (chip = 0; chip < tree->pd->nr_chips; chip++) {
+		if (tree->ds[chip] == NULL)
+			continue;
+		dsa_switch_destroy(tree->ds[chip]);
+	}
+
+	return 0;
+}
+
+static void dsa_shutdown(struct platform_device *pdev)
+{	/* intentionally empty: no action is taken at shutdown */
+}
+
+static struct platform_driver dsa_driver = {	/* platform driver "dsa" */
+	.probe		= dsa_probe,
+	.remove		= dsa_remove,
+	.shutdown	= dsa_shutdown,
+	.driver = {
+		.name	= "dsa",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init dsa_init_module(void)
+{	/* module load: register the "dsa" platform driver */
+	return platform_driver_register(&dsa_driver);
+}
+module_init(dsa_init_module);
+
+static void __exit dsa_cleanup_module(void)
+{	/* module unload: unregister the "dsa" platform driver */
+	platform_driver_unregister(&dsa_driver);
+}
+module_exit(dsa_cleanup_module);
+
+MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
+MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:dsa");
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
new file mode 100644
index 00000000..4b0ea054
--- /dev/null
+++ b/net/dsa/dsa_priv.h
@@ -0,0 +1,181 @@
+/*
+ * net/dsa/dsa_priv.h - Hardware switch handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __DSA_PRIV_H
+#define __DSA_PRIV_H
+
+#include <linux/list.h>
+#include <linux/phy.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <net/dsa.h>
+
+struct dsa_switch {
+	/*
+	 * Parent switch tree, and switch index.
+	 */
+	struct dsa_switch_tree	*dst;
+	int			index;
+
+	/*
+	 * Configuration data for this switch.
+	 */
+	struct dsa_chip_data	*pd;
+
+	/*
+	 * The used switch driver.
+	 */
+	struct dsa_switch_driver	*drv;
+
+	/*
+	 * Reference to mii bus to use.
+	 */
+	struct mii_bus		*master_mii_bus;
+
+	/*
+	 * Slave mii_bus and devices for the individual ports.
+	 */
+	u32			dsa_port_mask;	/* bit per port named "dsa" */
+	u32			phys_port_mask;	/* bit per other non-"cpu" port */
+	struct mii_bus		*slave_mii_bus;
+	struct net_device	*ports[DSA_MAX_PORTS];
+};
+
+struct dsa_switch_tree {
+	/*
+	 * Configuration data for the platform device that owns
+	 * this dsa switch tree instance.
+	 */
+	struct dsa_platform_data	*pd;
+
+	/*
+	 * Reference to network device to use, and which tagging
+	 * protocol to use.
+	 */
+	struct net_device	*master_netdev;
+	__be16			tag_protocol;
+
+	/*
+	 * The switch and port to which the CPU is attached.
+	 */
+	s8			cpu_switch;	/* -1 until a "cpu" port is seen */
+	s8			cpu_port;	/* -1 until a "cpu" port is seen */
+
+	/*
+	 * Link state polling.
+	 */
+	int			link_poll_needed;	/* a driver has poll_link */
+	struct work_struct	link_poll_work;
+	struct timer_list	link_poll_timer;
+
+	/*
+	 * Data for the individual switch chips.
+	 */
+	struct dsa_switch	*ds[DSA_MAX_SWITCHES];
+};
+
+static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
+{
+	return p == ds->dst->cpu_port && ds->index == ds->dst->cpu_switch;
+}
+
+static inline u8 dsa_upstream_port(struct dsa_switch *ds)
+{
+	struct dsa_switch_tree *dst = ds->dst;
+
+	/*
+	 * On the root switch (the one that connects to the CPU) the
+	 * upstream port is the cpu port itself.  On any other switch
+	 * it is the (DSA) port leading to the switch one hop closer
+	 * to the cpu, as given by this chip's routing table.
+	 */
+	if (dst->cpu_switch != ds->index)
+		return ds->pd->rtable[dst->cpu_switch];
+
+	return dst->cpu_port;
+}
+
+struct dsa_slave_priv {
+	/*
+	 * The linux network interface corresponding to this
+	 * switch port.
+	 */
+	struct net_device	*dev;
+
+	/*
+	 * Which switch this port is a part of, and the port index
+	 * for this port.
+	 */
+	struct dsa_switch	*parent;
+	u8			port;
+
+	/*
+	 * The phylib phy_device pointer for the PHY connected
+	 * to this port.
+	 */
+	struct phy_device	*phy;
+};
+
+struct dsa_switch_driver {
+	struct list_head	list;
+
+	__be16			tag_protocol;
+	int			priv_size;	/* extra bytes after dsa_switch */
+
+	/*
+	 * Probing and setup; probe returns a chip name, or NULL if no match.
+	 */
+	char	*(*probe)(struct mii_bus *bus, int sw_addr);
+	int	(*setup)(struct dsa_switch *ds);
+	int	(*set_addr)(struct dsa_switch *ds, u8 *addr);
+
+	/*
+	 * Access to the switch's PHY registers.
+	 */
+	int	(*phy_read)(struct dsa_switch *ds, int port, int regnum);
+	int	(*phy_write)(struct dsa_switch *ds, int port,
+			     int regnum, u16 val);
+
+	/*
+	 * Link state polling and IRQ handling (poll_link may be NULL).
+	 */
+	void	(*poll_link)(struct dsa_switch *ds);
+
+	/*
+	 * ethtool hardware statistics.
+	 */
+	void	(*get_strings)(struct dsa_switch *ds, int port, uint8_t *data);
+	void	(*get_ethtool_stats)(struct dsa_switch *ds,
+				     int port, uint64_t *data);
+	int	(*get_sset_count)(struct dsa_switch *ds);
+};
+
+/* dsa.c */
+extern char dsa_driver_version[];
+void register_switch_driver(struct dsa_switch_driver *type);
+void unregister_switch_driver(struct dsa_switch_driver *type);
+
+/* slave.c */
+void dsa_slave_mii_bus_init(struct dsa_switch *ds);
+struct net_device *dsa_slave_create(struct dsa_switch *ds,
+				    struct device *parent,
+				    int port, char *name);
+
+/* tag_dsa.c */
+netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev);
+
+/* tag_edsa.c */
+netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev);
+
+/* tag_trailer.c */
+netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev);
+
+
+#endif
diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c
new file mode 100644
index 00000000..8f4ff5a2
--- /dev/null
+++ b/net/dsa/mv88e6060.c
@@ -0,0 +1,288 @@
+/*
+ * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+
+#define REG_PORT(p)		(8 + (p))
+#define REG_GLOBAL		0x0f
+
+static int reg_read(struct dsa_switch *ds, int addr, int reg)
+{
+	return mdiobus_read(ds->master_mii_bus, addr + ds->pd->sw_addr, reg);
+}
+
+/* Needs 'ds' in the caller's scope; makes the CALLER return on error. */
+#define REG_READ(addr, reg)					\
+	({							\
+		int __ret = reg_read(ds, addr, reg);		\
+								\
+		if (__ret < 0)					\
+			return __ret;				\
+		__ret;						\
+	})
+
+
+static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
+{
+	return mdiobus_write(ds->master_mii_bus, addr + ds->pd->sw_addr,
+			     reg, val);
+}
+
+/* Needs 'ds' in the caller's scope; makes the CALLER return on error. */
+#define REG_WRITE(addr, reg, val)				\
+	({							\
+		int __ret = reg_write(ds, addr, reg, val);	\
+								\
+		if (__ret < 0)					\
+			return __ret;				\
+	})
+
+/* Identify an 88E6060 from the value of port 0, register 0x03. */
+static char *mv88e6060_probe(struct mii_bus *bus, int sw_addr)
+{
+	int id = mdiobus_read(bus, sw_addr + REG_PORT(0), 0x03);
+
+	/* A negative value is a failed read: nothing to identify. */
+	if (id < 0)
+		return NULL;
+
+	if ((id & 0xfff0) == 0x0600)
+		return "Marvell 88E6060";
+	return NULL;
+}
+
+/*
+ * Disable all ports, reset the switch and wait for the reset to end.
+ */
+static int mv88e6060_switch_reset(struct dsa_switch *ds)
+{
+	int port;
+	int tries;
+
+	/*
+	 * Disable every port: clear the low two bits of register 0x04.
+	 */
+	for (port = 0; port < 6; port++) {
+		int ctrl = REG_READ(REG_PORT(port), 0x04);
+
+		REG_WRITE(REG_PORT(port), 0x04, ctrl & 0xfffc);
+	}
+
+	/* Give the transmit queues time to drain. */
+	msleep(2);
+
+	/* Trigger the switch reset. */
+	REG_WRITE(REG_GLOBAL, 0x0a, 0xa130);
+
+	/*
+	 * Poll global register 0x00 for up to one second; the reset is
+	 * taken as complete once bit 15 reads back as zero.
+	 */
+	for (tries = 0; tries < 1000; tries++) {
+		int status = REG_READ(REG_GLOBAL, 0x00);
+
+		if (!(status & 0x8000))
+			return 0;
+
+		msleep(1);
+	}
+
+	return -ETIMEDOUT;
+}
+
+/* Program the switch-wide registers; returns 0 or a negative bus error. */
+static int mv88e6060_setup_global(struct dsa_switch *ds)
+{
+	int err;
+
+	/*
+	 * Register 0x04: do not discard frames with excessive
+	 * collisions, 1536 byte maximum frame size, all interrupt
+	 * sources masked.
+	 */
+	err = reg_write(ds, REG_GLOBAL, 0x04, 0x0800);
+	if (err < 0)
+		return err;
+
+	/* Register 0x0a: learning on, 1024 entry database, 5 min aging. */
+	err = reg_write(ds, REG_GLOBAL, 0x0a, 0x2130);
+	return err < 0 ? err : 0;
+}
+
+/* Program the registers of port 'p'; returns 0 or a negative bus error. */
+static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
+{
+	int cpu = dsa_is_cpu_port(ds, p);
+	int addr = REG_PORT(p);
+	u16 vlan_map;
+
+	/*
+	 * Port control: no forced flow control, no Ingress or Egress
+	 * Header tagging, no VLAN tunneling, port state Forwarding.
+	 * The CPU port additionally gets Ingress and Egress Trailer
+	 * tagging mode.
+	 */
+	REG_WRITE(addr, 0x04, cpu ? 0x4103 : 0x0003);
+
+	/*
+	 * Port based VLAN map: every port has its own address
+	 * database; the CPU port may talk to each 'real' port, and
+	 * each 'real' port may only talk to the CPU port.
+	 */
+	vlan_map = (p & 0xf) << 12;
+	vlan_map |= cpu ? ds->phys_port_mask : (1 << ds->dst->cpu_port);
+	REG_WRITE(addr, 0x06, vlan_map);
+
+	/*
+	 * Port Association Vector: source addresses learned on this
+	 * port are entered in the address database with a port
+	 * bitmap in which only this port's bit is set and all other
+	 * bits are clear.
+	 */
+	REG_WRITE(addr, 0x0b, 1 << p);
+
+	return 0;
+}
+
+/*
+ * Reset the switch, then program the global and per-port registers.
+ * Returns 0 on success or the first negative error encountered.
+ */
+static int mv88e6060_setup(struct dsa_switch *ds)
+{
+	int port;
+	int err;
+
+	err = mv88e6060_switch_reset(ds);
+	if (err < 0)
+		return err;
+
+	/* @@@ initialise atu */
+
+	err = mv88e6060_setup_global(ds);
+
+	/* Configure all six ports, stopping at the first failure. */
+	for (port = 0; err >= 0 && port < 6; port++)
+		err = mv88e6060_setup_port(ds, port);
+
+	return err < 0 ? err : 0;
+}
+
+static int mv88e6060_set_addr(struct dsa_switch *ds, u8 *addr)
+{
+	int i;	/* global registers 0x01-0x03 take two address bytes each */
+
+	for (i = 0; i < 3; i++, addr += 2)
+		REG_WRITE(REG_GLOBAL, 0x01 + i, (addr[0] << 8) | addr[1]);
+	return 0;
+}
+
+/* Map a port number to its PHY address, or -1 if it has none. */
+static int mv88e6060_port_to_phy_addr(int port)
+{
+	/* Only ports 0 through 5 are accepted. */
+	return (port < 0 || port > 5) ? -1 : port;
+}
+
+/* Read PHY register 'regnum' of 'port'; 0xffff if the port has no PHY. */
+static int mv88e6060_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+	int addr = mv88e6060_port_to_phy_addr(port);
+
+	if (addr != -1)
+		return reg_read(ds, addr, regnum);
+
+	return 0xffff;
+}
+
+/* Write PHY register 'regnum' of 'port'; 0xffff if the port has no PHY. */
+static int
+mv88e6060_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
+{
+	int addr = mv88e6060_port_to_phy_addr(port);
+
+	if (addr != -1)
+		return reg_write(ds, addr, regnum, val);
+
+	return 0xffff;
+}
+
+/*
+ * Poll the link state of every port that has a net_device and update
+ * its carrier state, logging each transition.
+ */
+static void mv88e6060_poll_link(struct dsa_switch *ds)
+{
+	int port;
+
+	for (port = 0; port < DSA_MAX_PORTS; port++) {
+		struct net_device *dev = ds->ports[port];
+		int status = 0;
+
+		if (!dev)
+			continue;
+
+		/*
+		 * The status register is only read while the interface
+		 * is up; a down interface is treated as having no link.
+		 */
+		if (dev->flags & IFF_UP) {
+			status = reg_read(ds, REG_PORT(port), 0x00);
+			if (status < 0)
+				continue;
+		}
+
+		/* Bit 12 of the port status register is the link bit. */
+		if (!(status & 0x1000)) {
+			if (netif_carrier_ok(dev)) {
+				printk(KERN_INFO "%s: link down\n", dev->name);
+				netif_carrier_off(dev);
+			}
+			continue;
+		}
+
+		if (netif_carrier_ok(dev))
+			continue;
+
+		/* Link came up: report speed, duplex and flow control. */
+		printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, "
+				 "flow control %sabled\n", dev->name,
+				 (status & 0x0100) ? 100 : 10,
+				 (status & 0x0200) ? "full" : "half",
+				 ((status & 0xc000) == 0xc000) ? "en" : "dis");
+		netif_carrier_on(dev);
+	}
+}
+
+static struct dsa_switch_driver mv88e6060_switch_driver = {
+	.tag_protocol	= htons(ETH_P_TRAILER),
+	.probe		= mv88e6060_probe,
+	.setup		= mv88e6060_setup,
+	.set_addr	= mv88e6060_set_addr,
+	.phy_read	= mv88e6060_phy_read,
+	.phy_write	= mv88e6060_phy_write,
+	.poll_link	= mv88e6060_poll_link,
+};
+
+static int __init mv88e6060_init(void)
+{
+	register_switch_driver(&mv88e6060_switch_driver);
+	return 0;
+}
+module_init(mv88e6060_init);
+
+static void __exit mv88e6060_cleanup(void)
+{
+	unregister_switch_driver(&mv88e6060_switch_driver);
+}
+module_exit(mv88e6060_cleanup);
diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c
new file mode 100644
index 00000000..52faaa21
--- /dev/null
+++ b/net/dsa/mv88e6123_61_65.c
@@ -0,0 +1,447 @@
+/*
+ * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+#include "mv88e6xxx.h"
+
+/* Match the ID in port 0 register 0x03 against the supported chips. */
+static char *mv88e6123_61_65_probe(struct mii_bus *bus, int sw_addr)
+{
+	int id = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
+
+	if (id < 0)
+		return NULL;
+	switch (id & 0xfff0) {
+	case 0x1210:
+		return "Marvell 88E6123";
+	case 0x1610:
+		return "Marvell 88E6161";
+	case 0x1650:
+		return "Marvell 88E6165";
+	}
+	return NULL;
+}
+
+/*
+ * Disable all ports, reset the switch and wait for the reset to end.
+ */
+static int mv88e6123_61_65_switch_reset(struct dsa_switch *ds)
+{
+	int port;
+	int tries;
+
+	/*
+	 * Disable every port: clear the low two bits of register 0x04.
+	 */
+	for (port = 0; port < 8; port++) {
+		int ctrl = REG_READ(REG_PORT(port), 0x04);
+
+		REG_WRITE(REG_PORT(port), 0x04, ctrl & 0xfffc);
+	}
+
+	/* Give the transmit queues time to drain. */
+	msleep(2);
+
+	/* Trigger the switch reset. */
+	REG_WRITE(REG_GLOBAL, 0x04, 0xc400);
+
+	/*
+	 * Poll global register 0x00 for up to one second; the reset is
+	 * taken as complete once all bits of mask 0xc800 are set.
+	 */
+	for (tries = 0; tries < 1000; tries++) {
+		int status = REG_READ(REG_GLOBAL, 0x00);
+
+		if ((status & 0xc800) == 0xc800)
+			return 0;
+
+		msleep(1);
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	/*
+	 * Disable the PHY polling unit (since there won't be any
+	 * external PHYs to poll), don't discard packets with
+	 * excessive collisions, and mask all interrupt sources.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0x0000);
+
+	/*
+	 * Set the default address aging time to 5 minutes, and
+	 * enable address learn messages to be sent to all message
+	 * ports.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x0a, 0x0148);
+
+	/*
+	 * Configure the priority mapping registers.
+	 */
+	ret = mv88e6xxx_config_prio(ds);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Configure the upstream port, and configure the upstream
+	 * port as the port to which ingress and egress monitor frames
+	 * are to be sent.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110));
+
+	/*
+	 * Disable remote management for now, and set the switch's
+	 * DSA device number.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f);
+
+	/*
+	 * Send all frames with destination addresses matching
+	 * 01:80:c2:00:00:2x to the CPU port.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x02, 0xffff);
+
+	/*
+	 * Send all frames with destination addresses matching
+	 * 01:80:c2:00:00:0x to the CPU port.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x03, 0xffff);
+
+	/*
+	 * Disable the loopback filter, disable flow control
+	 * messages, disable flood broadcast override, disable
+	 * removing of provider tags, disable ATU age violation
+	 * interrupts, disable tag flow control, force flow
+	 * control priority to the highest, and send all special
+	 * multicast frames to the CPU at the highest priority.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
+
+	/*
+	 * Program the DSA routing table.
+	 */
+	for (i = 0; i < 32; i++) {
+		int nexthop;
+
+		nexthop = 0x1f;
+		if (i != ds->index && i < ds->dst->pd->nr_chips)
+			nexthop = ds->pd->rtable[i] & 0x1f;
+
+		REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
+	}
+
+	/*
+	 * Clear all trunk masks.
+	 */
+	for (i = 0; i < 8; i++)
+		REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff);
+
+	/*
+	 * Clear all trunk mappings.
+	 */
+	for (i = 0; i < 16; i++)
+		REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11));
+
+	/*
+	 * Disable ingress rate limiting by resetting all ingress
+	 * rate limit registers to their initial state.
+	 */
+	for (i = 0; i < 6; i++)
+		REG_WRITE(REG_GLOBAL2, 0x09, 0x9000 | (i << 8));
+
+	/*
+	 * Initialise cross-chip port VLAN table to reset defaults.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x0b, 0x9000);
+
+	/*
+	 * Clear the priority override table.
+	 */
+	for (i = 0; i < 16; i++)
+		REG_WRITE(REG_GLOBAL2, 0x0f, 0x8000 | (i << 8));
+
+	/* @@@ initialise AVB (22/23) watchdog (27) sdet (29) registers */
+
+	return 0;
+}
+
+static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
+{
+	int addr = REG_PORT(p);
+	u16 val;
+
+	/*
+	 * MAC Forcing register: don't force link, speed, duplex
+	 * or flow control state to any particular values on physical
+	 * ports, but force the CPU port and all DSA ports to 1000 Mb/s
+	 * full duplex.
+	 */
+	if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
+		REG_WRITE(addr, 0x01, 0x003e);
+	else
+		REG_WRITE(addr, 0x01, 0x0003);
+
+	/*
+	 * Do not limit the period of time that this port can be
+	 * paused for by the remote end or the period of time that
+	 * this port can pause the remote end.
+	 */
+	REG_WRITE(addr, 0x02, 0x0000);
+
+	/*
+	 * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
+	 * disable Header mode, enable IGMP/MLD snooping, disable VLAN
+	 * tunneling, determine priority by looking at 802.1p and IP
+	 * priority fields (IP prio has precedence), and set STP state
+	 * to Forwarding.
+	 *
+	 * If this is the CPU link, use DSA or EDSA tagging depending
+	 * on which tagging mode was configured.
+	 *
+	 * If this is a link to another switch, use DSA tagging mode.
+	 *
+	 * If this is the upstream port for this switch, enable
+	 * forwarding of unknown unicasts and multicasts.
+	 */
+	val = 0x0433;
+	if (dsa_is_cpu_port(ds, p)) {
+		if (ds->dst->tag_protocol == htons(ETH_P_EDSA))
+			val |= 0x3300;
+		else
+			val |= 0x0100;
+	}
+	if (ds->dsa_port_mask & (1 << p))
+		val |= 0x0100;
+	if (p == dsa_upstream_port(ds))
+		val |= 0x000c;
+	REG_WRITE(addr, 0x04, val);
+
+	/*
+	 * Port Control 1: disable trunking.  Also, if this is the
+	 * CPU port, enable learn messages to be sent to this port.
+	 */
+	REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
+
+	/*
+	 * Port based VLAN map: give each port its own address
+	 * database, allow the CPU port to talk to each of the 'real'
+	 * ports, and allow each of the 'real' ports to only talk to
+	 * the upstream port.
+	 */
+	val = (p & 0xf) << 12;
+	if (dsa_is_cpu_port(ds, p))
+		val |= ds->phys_port_mask;
+	else
+		val |= 1 << dsa_upstream_port(ds);
+	REG_WRITE(addr, 0x06, val);
+
+	/*
+	 * Default VLAN ID and priority: don't set a default VLAN
+	 * ID, and set the default packet priority to zero.
+	 */
+	REG_WRITE(addr, 0x07, 0x0000);
+
+	/*
+	 * Port Control 2: don't force a good FCS, set the maximum
+	 * frame size to 10240 bytes, don't let the switch add or
+	 * strip 802.1q tags, don't discard tagged or untagged frames
+	 * on this port, do a destination address lookup on all
+	 * received packets as usual, disable ARP mirroring and don't
+	 * send a copy of all transmitted/received frames on this port
+	 * to the CPU.
+	 */
+	REG_WRITE(addr, 0x08, 0x2080);
+
+	/*
+	 * Egress rate control: disable egress rate control.
+	 */
+	REG_WRITE(addr, 0x09, 0x0001);
+
+	/*
+	 * Egress rate control 2: disable egress rate control.
+	 */
+	REG_WRITE(addr, 0x0a, 0x0000);
+
+	/*
+	 * Port Association Vector: when learning source addresses
+	 * of packets, add the address to the address database using
+	 * a port bitmap that has only the bit for this port set and
+	 * the other bits clear.
+	 */
+	REG_WRITE(addr, 0x0b, 1 << p);
+
+	/*
+	 * Port ATU control: disable limiting the number of address
+	 * database entries that this port is allowed to use.
+	 */
+	REG_WRITE(addr, 0x0c, 0x0000);
+
+	/*
+	 * Priority Override: disable DA, SA and VTU priority override.
+	 */
+	REG_WRITE(addr, 0x0d, 0x0000);
+
+	/*
+	 * Port Ethertype: use the Ethertype DSA Ethertype value.
+	 */
+	REG_WRITE(addr, 0x0f, ETH_P_EDSA);
+
+	/*
+	 * Tag Remap: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x18, 0x3210);
+
+	/*
+	 * Tag Remap 2: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x19, 0x7654);
+
+	return 0;
+}
+
+/*
+ * Initialise the driver-private mutexes, reset the switch, then
+ * program the global and per-port registers.  Returns 0 on success
+ * or the first negative error encountered.
+ */
+static int mv88e6123_61_65_setup(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int port;
+	int err;
+
+	mutex_init(&ps->smi_mutex);
+	mutex_init(&ps->stats_mutex);
+
+	err = mv88e6123_61_65_switch_reset(ds);
+	if (err < 0)
+		return err;
+
+	/* @@@ initialise vtu and atu */
+
+	err = mv88e6123_61_65_setup_global(ds);
+
+	for (port = 0; err >= 0 && port < 6; port++)
+		err = mv88e6123_61_65_setup_port(ds, port);
+
+	return err < 0 ? err : 0;
+}
+
+/* Map a port number to its PHY address, or -1 if it has none. */
+static int mv88e6123_61_65_port_to_phy_addr(int port)
+{
+	/* Only ports 0 through 4 are accepted. */
+	return (port < 0 || port > 4) ? -1 : port;
+}
+
+static int
+mv88e6123_61_65_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+	int addr = mv88e6123_61_65_port_to_phy_addr(port);
+	return mv88e6xxx_phy_read(ds, addr, regnum);
+}
+
+static int
+mv88e6123_61_65_phy_write(struct dsa_switch *ds,
+			      int port, int regnum, u16 val)
+{
+	int addr = mv88e6123_61_65_port_to_phy_addr(port);
+	return mv88e6xxx_phy_write(ds, addr, regnum, val);
+}
+
+static struct mv88e6xxx_hw_stat mv88e6123_61_65_hw_stats[] = {
+	{ "in_good_octets", 8, 0x00, },
+	{ "in_bad_octets", 4, 0x02, },
+	{ "in_unicast", 4, 0x04, },
+	{ "in_broadcasts", 4, 0x06, },
+	{ "in_multicasts", 4, 0x07, },
+	{ "in_pause", 4, 0x16, },
+	{ "in_undersize", 4, 0x18, },
+	{ "in_fragments", 4, 0x19, },
+	{ "in_oversize", 4, 0x1a, },
+	{ "in_jabber", 4, 0x1b, },
+	{ "in_rx_error", 4, 0x1c, },
+	{ "in_fcs_error", 4, 0x1d, },
+	{ "out_octets", 8, 0x0e, },
+	{ "out_unicast", 4, 0x10, },
+	{ "out_broadcasts", 4, 0x13, },
+	{ "out_multicasts", 4, 0x12, },
+	{ "out_pause", 4, 0x15, },
+	{ "excessive", 4, 0x11, },
+	{ "collisions", 4, 0x1e, },
+	{ "deferred", 4, 0x05, },
+	{ "single", 4, 0x14, },
+	{ "multiple", 4, 0x17, },
+	{ "out_fcs_error", 4, 0x03, },
+	{ "late", 4, 0x1f, },
+	{ "hist_64bytes", 4, 0x08, },
+	{ "hist_65_127bytes", 4, 0x09, },
+	{ "hist_128_255bytes", 4, 0x0a, },
+	{ "hist_256_511bytes", 4, 0x0b, },
+	{ "hist_512_1023bytes", 4, 0x0c, },
+	{ "hist_1024_max_bytes", 4, 0x0d, },
+};
+
+static void
+mv88e6123_61_65_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+{
+	mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats),
+			      mv88e6123_61_65_hw_stats, port, data);
+}
+
+static void
+mv88e6123_61_65_get_ethtool_stats(struct dsa_switch *ds,
+				  int port, uint64_t *data)
+{
+	mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6123_61_65_hw_stats),
+				    mv88e6123_61_65_hw_stats, port, data);
+}
+
+static int mv88e6123_61_65_get_sset_count(struct dsa_switch *ds)
+{
+	return ARRAY_SIZE(mv88e6123_61_65_hw_stats);
+}
+
+static struct dsa_switch_driver mv88e6123_61_65_switch_driver = {
+	.tag_protocol		= cpu_to_be16(ETH_P_EDSA),
+	.priv_size		= sizeof(struct mv88e6xxx_priv_state),
+	.probe			= mv88e6123_61_65_probe,
+	.setup			= mv88e6123_61_65_setup,
+	.set_addr		= mv88e6xxx_set_addr_indirect,
+	.phy_read		= mv88e6123_61_65_phy_read,
+	.phy_write		= mv88e6123_61_65_phy_write,
+	.poll_link		= mv88e6xxx_poll_link,
+	.get_strings		= mv88e6123_61_65_get_strings,
+	.get_ethtool_stats	= mv88e6123_61_65_get_ethtool_stats,
+	.get_sset_count		= mv88e6123_61_65_get_sset_count,
+};
+
+static int __init mv88e6123_61_65_init(void)
+{
+	register_switch_driver(&mv88e6123_61_65_switch_driver);
+	return 0;
+}
+module_init(mv88e6123_61_65_init);
+
+static void __exit mv88e6123_61_65_cleanup(void)
+{
+	unregister_switch_driver(&mv88e6123_61_65_switch_driver);
+}
+module_exit(mv88e6123_61_65_cleanup);
diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c
new file mode 100644
index 00000000..45f7411e
--- /dev/null
+++ b/net/dsa/mv88e6131.c
@@ -0,0 +1,439 @@
+/*
+ * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+#include "mv88e6xxx.h"
+
+/*
+ * Switch product IDs
+ */
+#define ID_6085		0x04a0
+#define ID_6095		0x0950
+#define ID_6131		0x1060
+
+/* Match the ID in port 0 register 0x03 against the supported chips. */
+static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr)
+{
+	int id = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
+
+	if (id < 0)
+		return NULL;
+	switch (id & 0xfff0) {
+	case ID_6085:
+		return "Marvell 88E6085";
+	case ID_6095:
+		return "Marvell 88E6095/88E6095F";
+	case ID_6131:
+		return "Marvell 88E6131";
+	}
+	return NULL;
+}
+
+/*
+ * Disable all ports, reset the switch and wait for the reset to end.
+ */
+static int mv88e6131_switch_reset(struct dsa_switch *ds)
+{
+	int port;
+	int tries;
+
+	/*
+	 * Disable every port: clear the low two bits of register 0x04.
+	 */
+	for (port = 0; port < 11; port++) {
+		int ctrl = REG_READ(REG_PORT(port), 0x04);
+
+		REG_WRITE(REG_PORT(port), 0x04, ctrl & 0xfffc);
+	}
+
+	/* Give the transmit queues time to drain. */
+	msleep(2);
+
+	/* Trigger the switch reset. */
+	REG_WRITE(REG_GLOBAL, 0x04, 0xc400);
+
+	/*
+	 * Poll global register 0x00 for up to one second; the reset is
+	 * taken as complete once all bits of mask 0xc800 are set.
+	 */
+	for (tries = 0; tries < 1000; tries++) {
+		int status = REG_READ(REG_GLOBAL, 0x00);
+
+		if ((status & 0xc800) == 0xc800)
+			return 0;
+
+		msleep(1);
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mv88e6131_setup_global(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	/*
+	 * Enable the PHY polling unit, don't discard packets with
+	 * excessive collisions, use a weighted fair queueing scheme
+	 * to arbitrate between packet queues, set the maximum frame
+	 * size to 1632, and mask all interrupt sources.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x04, 0x4400);
+
+	/*
+	 * Set the default address aging time to 5 minutes, and
+	 * enable address learn messages to be sent to all message
+	 * ports.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x0a, 0x0148);
+
+	/*
+	 * Configure the priority mapping registers.
+	 */
+	ret = mv88e6xxx_config_prio(ds);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Set the VLAN ethertype to 0x8100.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x19, 0x8100);
+
+	/*
+	 * Disable ARP mirroring, and configure the upstream port as
+	 * the port to which ingress and egress monitor frames are to
+	 * be sent.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0);
+
+	/*
+	 * Disable cascade port functionality, and set the switch's
+	 * DSA device number.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f));
+
+	/*
+	 * Send all frames with destination addresses matching
+	 * 01:80:c2:00:00:0x to the CPU port.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x03, 0xffff);
+
+	/*
+	 * Ignore removed tag data on doubly tagged packets, disable
+	 * flow control messages, force flow control priority to the
+	 * highest, and send all special multicast frames to the CPU
+	 * port at the highest priority.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
+
+	/*
+	 * Program the DSA routing table.
+	 */
+	for (i = 0; i < 32; i++) {
+		int nexthop;
+
+		nexthop = 0x1f;
+		if (i != ds->index && i < ds->dst->pd->nr_chips)
+			nexthop = ds->pd->rtable[i] & 0x1f;
+
+		REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
+	}
+
+	/*
+	 * Clear all trunk masks.
+	 */
+	for (i = 0; i < 8; i++)
+		REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff);
+
+	/*
+	 * Clear all trunk mappings.
+	 */
+	for (i = 0; i < 16; i++)
+		REG_WRITE(REG_GLOBAL2, 0x08, 0x8000 | (i << 11));
+
+	/*
+	 * Force the priority of IGMP/MLD snoop frames and ARP frames
+	 * to the highest setting.
+	 */
+	REG_WRITE(REG_GLOBAL2, 0x0f, 0x00ff);
+
+	return 0;
+}
+
+static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int addr = REG_PORT(p);
+	u16 val;
+
+	/*
+	 * MAC Forcing register: don't force link, speed, duplex
+	 * or flow control state to any particular values on physical
+	 * ports, but force the CPU port and all DSA ports to 1000 Mb/s
+	 * (100 Mb/s on 6085) full duplex.
+	 */
+	if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
+		if (ps->id == ID_6085)
+			REG_WRITE(addr, 0x01, 0x003d); /* 100 Mb/s */
+		else
+			REG_WRITE(addr, 0x01, 0x003e); /* 1000 Mb/s */
+	else
+		REG_WRITE(addr, 0x01, 0x0003);
+
+	/*
+	 * Port Control: disable Core Tag, disable Drop-on-Lock,
+	 * transmit frames unmodified, disable Header mode,
+	 * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN
+	 * tunneling, determine priority by looking at 802.1p and
+	 * IP priority fields (IP prio has precedence), and set STP
+	 * state to Forwarding.
+	 *
+	 * If this is the upstream port for this switch, enable
+	 * forwarding of unknown unicasts, and enable DSA tagging
+	 * mode.
+	 *
+	 * If this is the link to another switch, use DSA tagging
+	 * mode, but do not enable forwarding of unknown unicasts.
+	 */
+	val = 0x0433;
+	if (p == dsa_upstream_port(ds)) {
+		val |= 0x0104;
+		/*
+		 * On 6085, unknown multicast forward is controlled
+		 * here rather than in Port Control 2 register.
+		 */
+		if (ps->id == ID_6085)
+			val |= 0x0008;
+	}
+	if (ds->dsa_port_mask & (1 << p))
+		val |= 0x0100;
+	REG_WRITE(addr, 0x04, val);
+
+	/*
+	 * Port Control 1: disable trunking.  Also, if this is the
+	 * CPU port, enable learn messages to be sent to this port.
+	 */
+	REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
+
+	/*
+	 * Port based VLAN map: give each port its own address
+	 * database, allow the CPU port to talk to each of the 'real'
+	 * ports, and allow each of the 'real' ports to only talk to
+	 * the upstream port.
+	 */
+	val = (p & 0xf) << 12;
+	if (dsa_is_cpu_port(ds, p))
+		val |= ds->phys_port_mask;
+	else
+		val |= 1 << dsa_upstream_port(ds);
+	REG_WRITE(addr, 0x06, val);
+
+	/*
+	 * Default VLAN ID and priority: don't set a default VLAN
+	 * ID, and set the default packet priority to zero.
+	 */
+	REG_WRITE(addr, 0x07, 0x0000);
+
+	/*
+	 * Port Control 2: don't force a good FCS, don't use
+	 * VLAN-based, source address-based or destination
+	 * address-based priority overrides, don't let the switch
+	 * add or strip 802.1q tags, don't discard tagged or
+	 * untagged frames on this port, do a destination address
+	 * lookup on received packets as usual, don't send a copy
+	 * of all transmitted/received frames on this port to the
+	 * CPU, and configure the upstream port number.
+	 *
+	 * If this is the upstream port for this switch, enable
+	 * forwarding of unknown multicast addresses.
+	 */
+	if (ps->id == ID_6085)
+		/*
+		 * on 6085, bits 3:0 are reserved, bit 6 controls ARP
+		 * mirroring, and multicast forward is handled in
+		 * Port Control register.
+		 */
+		REG_WRITE(addr, 0x08, 0x0080);
+	else {
+		val = 0x0080 | dsa_upstream_port(ds);
+		if (p == dsa_upstream_port(ds))
+			val |= 0x0040;
+		REG_WRITE(addr, 0x08, val);
+	}
+
+	/*
+	 * Rate Control: disable ingress rate limiting.
+	 */
+	REG_WRITE(addr, 0x09, 0x0000);
+
+	/*
+	 * Rate Control 2: disable egress rate limiting.
+	 */
+	REG_WRITE(addr, 0x0a, 0x0000);
+
+	/*
+	 * Port Association Vector: when learning source addresses
+	 * of packets, add the address to the address database using
+	 * a port bitmap that has only the bit for this port set and
+	 * the other bits clear.
+	 */
+	REG_WRITE(addr, 0x0b, 1 << p);
+
+	/*
+	 * Tag Remap: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x18, 0x3210);
+
+	/*
+	 * Tag Remap 2: use an identity 802.1p prio -> switch prio
+	 * mapping.
+	 */
+	REG_WRITE(addr, 0x19, 0x7654);
+
+	return 0;
+}
+
+/*
+ * Set up driver state, read the switch ID, reset the switch and program
+ * its global and per-port registers.  Returns 0 or the first error.
+ */
+static int mv88e6131_setup(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int port;
+	int err;
+
+	mutex_init(&ps->smi_mutex);
+	mv88e6xxx_ppu_state_init(ds);
+	mutex_init(&ps->stats_mutex);
+
+	/* setup_port() uses the ID to tell the 6085 from the others. */
+	ps->id = REG_READ(REG_PORT(0), 0x03) & 0xfff0;
+
+	err = mv88e6131_switch_reset(ds);
+	if (err < 0)
+		return err;
+
+	/* @@@ initialise vtu and atu */
+
+	err = mv88e6131_setup_global(ds);
+
+	for (port = 0; err >= 0 && port < 11; port++)
+		err = mv88e6131_setup_port(ds, port);
+
+	return err < 0 ? err : 0;
+}
+
+/* Ports 0-10 (the 11 ports set up here) map 1:1 to a PHY address; else -1. */
+static int mv88e6131_port_to_phy_addr(int port)
+{
+	return (port >= 0 && port < 11) ? port : -1;
+}
+
+/* An invalid port yields 0xffff and is never handed to the PPU helpers. */
+static int
+mv88e6131_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+	int addr = mv88e6131_port_to_phy_addr(port);
+	return addr < 0 ? 0xffff : mv88e6xxx_phy_read_ppu(ds, addr, regnum);
+}
+
+static int
+mv88e6131_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
+{
+	int addr = mv88e6131_port_to_phy_addr(port);
+	return addr < 0 ? 0xffff :
+	       mv88e6xxx_phy_write_ppu(ds, addr, regnum, val);
+}
+
+static struct mv88e6xxx_hw_stat mv88e6131_hw_stats[] = {
+	{ "in_good_octets", 8, 0x00, },
+	{ "in_bad_octets", 4, 0x02, },
+	{ "in_unicast", 4, 0x04, },
+	{ "in_broadcasts", 4, 0x06, },
+	{ "in_multicasts", 4, 0x07, },
+	{ "in_pause", 4, 0x16, },
+	{ "in_undersize", 4, 0x18, },
+	{ "in_fragments", 4, 0x19, },
+	{ "in_oversize", 4, 0x1a, },
+	{ "in_jabber", 4, 0x1b, },
+	{ "in_rx_error", 4, 0x1c, },
+	{ "in_fcs_error", 4, 0x1d, },
+	{ "out_octets", 8, 0x0e, },
+	{ "out_unicast", 4, 0x10, },
+	{ "out_broadcasts", 4, 0x13, },
+	{ "out_multicasts", 4, 0x12, },
+	{ "out_pause", 4, 0x15, },
+	{ "excessive", 4, 0x11, },
+	{ "collisions", 4, 0x1e, },
+	{ "deferred", 4, 0x05, },
+	{ "single", 4, 0x14, },
+	{ "multiple", 4, 0x17, },
+	{ "out_fcs_error", 4, 0x03, },
+	{ "late", 4, 0x1f, },
+	{ "hist_64bytes", 4, 0x08, },
+	{ "hist_65_127bytes", 4, 0x09, },
+	{ "hist_128_255bytes", 4, 0x0a, },
+	{ "hist_256_511bytes", 4, 0x0b, },
+	{ "hist_512_1023bytes", 4, 0x0c, },
+	{ "hist_1024_max_bytes", 4, 0x0d, },
+};
+
+static void
+mv88e6131_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+{
+	mv88e6xxx_get_strings(ds, ARRAY_SIZE(mv88e6131_hw_stats),
+			      mv88e6131_hw_stats, port, data);
+}
+
+static void
+mv88e6131_get_ethtool_stats(struct dsa_switch *ds,
+				  int port, uint64_t *data)
+{
+	mv88e6xxx_get_ethtool_stats(ds, ARRAY_SIZE(mv88e6131_hw_stats),
+				    mv88e6131_hw_stats, port, data);
+}
+
+static int mv88e6131_get_sset_count(struct dsa_switch *ds)
+{
+	return ARRAY_SIZE(mv88e6131_hw_stats);
+}
+
+static struct dsa_switch_driver mv88e6131_switch_driver = {
+	.tag_protocol		= cpu_to_be16(ETH_P_DSA),
+	.priv_size		= sizeof(struct mv88e6xxx_priv_state),
+	.probe			= mv88e6131_probe,
+	.setup			= mv88e6131_setup,
+	.set_addr		= mv88e6xxx_set_addr_direct,
+	.phy_read		= mv88e6131_phy_read,
+	.phy_write		= mv88e6131_phy_write,
+	.poll_link		= mv88e6xxx_poll_link,
+	.get_strings		= mv88e6131_get_strings,
+	.get_ethtool_stats	= mv88e6131_get_ethtool_stats,
+	.get_sset_count		= mv88e6131_get_sset_count,
+};
+
+static int __init mv88e6131_init(void)
+{
+	register_switch_driver(&mv88e6131_switch_driver);
+	return 0;
+}
+module_init(mv88e6131_init);
+
+static void __exit mv88e6131_cleanup(void)
+{
+	unregister_switch_driver(&mv88e6131_switch_driver);
+}
+module_exit(mv88e6131_cleanup);
diff --git a/net/dsa/mv88e6xxx.c b/net/dsa/mv88e6xxx.c
new file mode 100644
index 00000000..efe661a9
--- /dev/null
+++ b/net/dsa/mv88e6xxx.c
@@ -0,0 +1,522 @@
+/*
+ * net/dsa/mv88e6xxx.c - Marvell 88e6xxx switch chip support
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+#include "mv88e6xxx.h"
+
+/*
+ * If the switch's ADDR[4:0] strap pins are strapped to zero, it will
+ * use all 32 SMI bus addresses on its SMI bus, and all switch registers
+ * will be directly accessible on some {device address,register address}
+ * pair.  If the ADDR[4:0] pins are not strapped to zero, the switch
+ * will only respond to SMI transactions to that specific address, and
+ * an indirect addressing mechanism needs to be used to access its
+ * registers.
+ */
+static int mv88e6xxx_reg_wait_ready(struct mii_bus *bus, int sw_addr)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		ret = mdiobus_read(bus, sw_addr, 0);
+		if (ret < 0)
+			return ret;
+
+		if ((ret & 0x8000) == 0)
+			return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg)
+{
+	int ret;
+
+	if (sw_addr == 0)
+		return mdiobus_read(bus, addr, reg);
+
+	/*
+	 * Wait for the bus to become free.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Transmit the read command.
+	 */
+	ret = mdiobus_write(bus, sw_addr, 0, 0x9800 | (addr << 5) | reg);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Wait for the read command to complete.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Read the data.
+	 */
+	ret = mdiobus_read(bus, sw_addr, 1);
+	if (ret < 0)
+		return ret;
+
+	return ret & 0xffff;
+}
+
+int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+
+	mutex_lock(&ps->smi_mutex);
+	ret = __mv88e6xxx_reg_read(ds->master_mii_bus,
+				   ds->pd->sw_addr, addr, reg);
+	mutex_unlock(&ps->smi_mutex);
+
+	return ret;
+}
+
+int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
+			  int reg, u16 val)
+{
+	int ret;
+
+	if (sw_addr == 0)
+		return mdiobus_write(bus, addr, reg, val);
+
+	/*
+	 * Wait for the bus to become free.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Transmit the data to write.
+	 */
+	ret = mdiobus_write(bus, sw_addr, 1, val);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Transmit the write command.
+	 */
+	ret = mdiobus_write(bus, sw_addr, 0, 0x9400 | (addr << 5) | reg);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Wait for the write command to complete.
+	 */
+	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+
+	mutex_lock(&ps->smi_mutex);
+	ret = __mv88e6xxx_reg_write(ds->master_mii_bus,
+				    ds->pd->sw_addr, addr, reg, val);
+	mutex_unlock(&ps->smi_mutex);
+
+	return ret;
+}
+
+int mv88e6xxx_config_prio(struct dsa_switch *ds)
+{
+	/*
+	 * Configure the IP ToS mapping registers.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x10, 0x0000);
+	REG_WRITE(REG_GLOBAL, 0x11, 0x0000);
+	REG_WRITE(REG_GLOBAL, 0x12, 0x5555);
+	REG_WRITE(REG_GLOBAL, 0x13, 0x5555);
+	REG_WRITE(REG_GLOBAL, 0x14, 0xaaaa);
+	REG_WRITE(REG_GLOBAL, 0x15, 0xaaaa);
+	REG_WRITE(REG_GLOBAL, 0x16, 0xffff);
+	REG_WRITE(REG_GLOBAL, 0x17, 0xffff);
+
+	/*
+	 * Configure the IEEE 802.1p priority mapping register.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x18, 0xfa41);
+
+	return 0;
+}
+
+int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr)
+{
+	REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]);
+	REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]);
+	REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]);
+
+	return 0;
+}
+
+int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < 6; i++) {
+		int j;
+
+		/*
+		 * Write the MAC address byte.
+		 */
+		REG_WRITE(REG_GLOBAL2, 0x0d, 0x8000 | (i << 8) | addr[i]);
+
+		/*
+		 * Wait for the write to complete.
+		 */
+		for (j = 0; j < 16; j++) {
+			ret = REG_READ(REG_GLOBAL2, 0x0d);
+			if ((ret & 0x8000) == 0)
+				break;
+		}
+		if (j == 16)
+			return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum)
+{
+	if (addr >= 0)
+		return mv88e6xxx_reg_read(ds, addr, regnum);
+	return 0xffff;
+}
+
+int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val)
+{
+	if (addr >= 0)
+		return mv88e6xxx_reg_write(ds, addr, regnum, val);
+	return 0;
+}
+
+#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU
+static int mv88e6xxx_ppu_disable(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	ret = REG_READ(REG_GLOBAL, 0x04);
+	REG_WRITE(REG_GLOBAL, 0x04, ret & ~0x4000);
+
+	for (i = 0; i < 1000; i++) {
+	        ret = REG_READ(REG_GLOBAL, 0x00);
+	        msleep(1);
+	        if ((ret & 0xc000) != 0xc000)
+	                return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mv88e6xxx_ppu_enable(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	ret = REG_READ(REG_GLOBAL, 0x04);
+	REG_WRITE(REG_GLOBAL, 0x04, ret | 0x4000);
+
+	for (i = 0; i < 1000; i++) {
+	        ret = REG_READ(REG_GLOBAL, 0x00);
+	        msleep(1);
+	        if ((ret & 0xc000) == 0xc000)
+	                return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+static void mv88e6xxx_ppu_reenable_work(struct work_struct *ugly)
+{
+	struct mv88e6xxx_priv_state *ps;
+
+	ps = container_of(ugly, struct mv88e6xxx_priv_state, ppu_work);
+	if (mutex_trylock(&ps->ppu_mutex)) {
+	        struct dsa_switch *ds = ((struct dsa_switch *)ps) - 1;
+
+	        if (mv88e6xxx_ppu_enable(ds) == 0)
+	                ps->ppu_disabled = 0;
+	        mutex_unlock(&ps->ppu_mutex);
+	}
+}
+
+static void mv88e6xxx_ppu_reenable_timer(unsigned long _ps)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)_ps;
+
+	schedule_work(&ps->ppu_work);
+}
+
+static int mv88e6xxx_ppu_access_get(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+
+	mutex_lock(&ps->ppu_mutex);
+
+	/*
+	 * If the PHY polling unit is enabled, disable it so that
+	 * we can access the PHY registers.  If it was already
+	 * disabled, cancel the timer that is going to re-enable
+	 * it.
+	 */
+	if (!ps->ppu_disabled) {
+	        ret = mv88e6xxx_ppu_disable(ds);
+	        if (ret < 0) {
+	                mutex_unlock(&ps->ppu_mutex);
+	                return ret;
+	        }
+	        ps->ppu_disabled = 1;
+	} else {
+	        del_timer(&ps->ppu_timer);
+	        ret = 0;
+	}
+
+	return ret;
+}
+
+static void mv88e6xxx_ppu_access_put(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+
+	/*
+	 * Schedule a timer to re-enable the PHY polling unit.
+	 */
+	mod_timer(&ps->ppu_timer, jiffies + msecs_to_jiffies(10));
+	mutex_unlock(&ps->ppu_mutex);
+}
+
+void mv88e6xxx_ppu_state_init(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+
+	mutex_init(&ps->ppu_mutex);
+	INIT_WORK(&ps->ppu_work, mv88e6xxx_ppu_reenable_work);
+	init_timer(&ps->ppu_timer);
+	ps->ppu_timer.data = (unsigned long)ps;
+	ps->ppu_timer.function = mv88e6xxx_ppu_reenable_timer;
+}
+
+int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum)
+{
+	int ret;
+
+	ret = mv88e6xxx_ppu_access_get(ds);
+	if (ret >= 0) {
+	        ret = mv88e6xxx_reg_read(ds, addr, regnum);
+	        mv88e6xxx_ppu_access_put(ds);
+	}
+
+	return ret;
+}
+
+int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr,
+			    int regnum, u16 val)
+{
+	int ret;
+
+	ret = mv88e6xxx_ppu_access_get(ds);
+	if (ret >= 0) {
+	        ret = mv88e6xxx_reg_write(ds, addr, regnum, val);
+	        mv88e6xxx_ppu_access_put(ds);
+	}
+
+	return ret;
+}
+#endif
+
+void mv88e6xxx_poll_link(struct dsa_switch *ds)
+{
+	int i;
+
+	for (i = 0; i < DSA_MAX_PORTS; i++) {
+		struct net_device *dev;
+		int uninitialized_var(port_status);
+		int link;
+		int speed;
+		int duplex;
+		int fc;
+
+		dev = ds->ports[i];
+		if (dev == NULL)
+			continue;
+
+		link = 0;
+		if (dev->flags & IFF_UP) {
+			port_status = mv88e6xxx_reg_read(ds, REG_PORT(i), 0x00);
+			if (port_status < 0)
+				continue;
+
+			link = !!(port_status & 0x0800);
+		}
+
+		if (!link) {
+			if (netif_carrier_ok(dev)) {
+				printk(KERN_INFO "%s: link down\n", dev->name);
+				netif_carrier_off(dev);
+			}
+			continue;
+		}
+
+		switch (port_status & 0x0300) {
+		case 0x0000:
+			speed = 10;
+			break;
+		case 0x0100:
+			speed = 100;
+			break;
+		case 0x0200:
+			speed = 1000;
+			break;
+		default:
+			speed = -1;
+			break;
+		}
+		duplex = (port_status & 0x0400) ? 1 : 0;
+		fc = (port_status & 0x8000) ? 1 : 0;
+
+		if (!netif_carrier_ok(dev)) {
+			printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, "
+					 "flow control %sabled\n", dev->name,
+					 speed, duplex ? "full" : "half",
+					 fc ? "en" : "dis");
+			netif_carrier_on(dev);
+		}
+	}
+}
+
+static int mv88e6xxx_stats_wait(struct dsa_switch *ds)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		ret = REG_READ(REG_GLOBAL, 0x1d);
+		if ((ret & 0x8000) == 0)
+			return 0;
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mv88e6xxx_stats_snapshot(struct dsa_switch *ds, int port)
+{
+	int ret;
+
+	/*
+	 * Snapshot the hardware statistics counters for this port.
+	 */
+	REG_WRITE(REG_GLOBAL, 0x1d, 0xdc00 | port);
+
+	/*
+	 * Wait for the snapshotting to complete.
+	 */
+	ret = mv88e6xxx_stats_wait(ds);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static void mv88e6xxx_stats_read(struct dsa_switch *ds, int stat, u32 *val)
+{
+	u32 _val;
+	int ret;
+
+	*val = 0;
+
+	ret = mv88e6xxx_reg_write(ds, REG_GLOBAL, 0x1d, 0xcc00 | stat);
+	if (ret < 0)
+		return;
+
+	ret = mv88e6xxx_stats_wait(ds);
+	if (ret < 0)
+		return;
+
+	ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1e);
+	if (ret < 0)
+		return;
+
+	_val = ret << 16;
+
+	ret = mv88e6xxx_reg_read(ds, REG_GLOBAL, 0x1f);
+	if (ret < 0)
+		return;
+
+	*val = _val | ret;
+}
+
+void mv88e6xxx_get_strings(struct dsa_switch *ds,
+			   int nr_stats, struct mv88e6xxx_hw_stat *stats,
+			   int port, uint8_t *data)
+{
+	int i;
+
+	for (i = 0; i < nr_stats; i++) {
+		memcpy(data + i * ETH_GSTRING_LEN,
+		       stats[i].string, ETH_GSTRING_LEN);
+	}
+}
+
+/* Fill data[] with one port's counters; all zeroes if the snapshot fails. */
+void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds,
+				 int nr_stats, struct mv88e6xxx_hw_stat *stats,
+				 int port, uint64_t *data)
+{
+	struct mv88e6xxx_priv_state *ps = (void *)(ds + 1);
+	int ret;
+	int i;
+
+	mutex_lock(&ps->stats_mutex);
+
+	ret = mv88e6xxx_stats_snapshot(ds, port);
+	if (ret < 0) {
+		memset(data, 0, nr_stats * sizeof(*data));
+		mutex_unlock(&ps->stats_mutex);
+		return;
+	}
+
+	/* Read each counter; 8-byte ones use two stat slots (reg, reg + 1). */
+	for (i = 0; i < nr_stats; i++) {
+		struct mv88e6xxx_hw_stat *s = stats + i;
+		u32 low;
+		u32 high;
+
+		mv88e6xxx_stats_read(ds, s->reg, &low);
+		if (s->sizeof_stat == 8)
+			mv88e6xxx_stats_read(ds, s->reg + 1, &high);
+		else
+			high = 0;
+
+		data[i] = (((u64)high) << 32) | low;
+	}
+
+	mutex_unlock(&ps->stats_mutex);
+}
diff --git a/net/dsa/mv88e6xxx.h b/net/dsa/mv88e6xxx.h
new file mode 100644
index 00000000..61156ca2
--- /dev/null
+++ b/net/dsa/mv88e6xxx.h
@@ -0,0 +1,95 @@
+/*
+ * net/dsa/mv88e6xxx.h - Marvell 88e6xxx switch chip support
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __MV88E6XXX_H
+#define __MV88E6XXX_H
+
+#define REG_PORT(p)		(0x10 + (p))
+#define REG_GLOBAL		0x1b
+#define REG_GLOBAL2		0x1c
+
+struct mv88e6xxx_priv_state {
+	/*
+	 * When using multi-chip addressing, this mutex protects
+	 * access to the indirect access registers.  (In single-chip
+	 * mode, this mutex is effectively useless.)
+	 */
+	struct mutex	smi_mutex;
+
+#ifdef CONFIG_NET_DSA_MV88E6XXX_NEED_PPU
+	/*
+	 * Handles automatic disabling and re-enabling of the PHY
+	 * polling unit.
+	 */
+	struct mutex		ppu_mutex;
+	int			ppu_disabled;
+	struct work_struct	ppu_work;
+	struct timer_list	ppu_timer;
+#endif
+
+	/*
+	 * This mutex serialises access to the statistics unit.
+	 * Hold this mutex over snapshot + dump sequences.
+	 */
+	struct mutex	stats_mutex;
+
+	int		id; /* switch product id */
+};
+
+struct mv88e6xxx_hw_stat {
+	char string[ETH_GSTRING_LEN];
+	int sizeof_stat;
+	int reg;
+};
+
+int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg);
+int mv88e6xxx_reg_read(struct dsa_switch *ds, int addr, int reg);
+int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
+                          int reg, u16 val);
+int mv88e6xxx_reg_write(struct dsa_switch *ds, int addr, int reg, u16 val);
+int mv88e6xxx_config_prio(struct dsa_switch *ds);
+int mv88e6xxx_set_addr_direct(struct dsa_switch *ds, u8 *addr);
+int mv88e6xxx_set_addr_indirect(struct dsa_switch *ds, u8 *addr);
+int mv88e6xxx_phy_read(struct dsa_switch *ds, int addr, int regnum);
+int mv88e6xxx_phy_write(struct dsa_switch *ds, int addr, int regnum, u16 val);
+void mv88e6xxx_ppu_state_init(struct dsa_switch *ds);
+int mv88e6xxx_phy_read_ppu(struct dsa_switch *ds, int addr, int regnum);
+int mv88e6xxx_phy_write_ppu(struct dsa_switch *ds, int addr,
+			    int regnum, u16 val);
+void mv88e6xxx_poll_link(struct dsa_switch *ds);
+void mv88e6xxx_get_strings(struct dsa_switch *ds,
+			   int nr_stats, struct mv88e6xxx_hw_stat *stats,
+			   int port, uint8_t *data);
+void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds,
+				 int nr_stats, struct mv88e6xxx_hw_stat *stats,
+				 int port, uint64_t *data);
+
+#define REG_READ(addr, reg)						\
+	({								\
+		int __ret;						\
+									\
+		__ret = mv88e6xxx_reg_read(ds, addr, reg);		\
+		if (__ret < 0)						\
+			return __ret;					\
+		__ret;							\
+	})
+
+#define REG_WRITE(addr, reg, val)					\
+	({								\
+		int __ret;						\
+									\
+		__ret = mv88e6xxx_reg_write(ds, addr, reg, val);	\
+		if (__ret < 0)						\
+			return __ret;					\
+	})
+
+
+
+#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
new file mode 100644
index 00000000..0a47b6c3
--- /dev/null
+++ b/net/dsa/slave.c
@@ -0,0 +1,407 @@
+/*
+ * net/dsa/slave.c - Slave device handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+
+/* slave mii_bus handling ***************************************************/
+/* Slave MII read: only addresses set in phys_port_mask reach the driver. */
+static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->phys_port_mask & (1U << addr))	/* unsigned: addr may be 31 */
+		return ds->drv->phy_read(ds, addr, reg);
+	return 0xffff;
+}
+
+/* Slave MII write: only addresses set in phys_port_mask reach the driver. */
+static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+	struct dsa_switch *ds = bus->priv;
+
+	if (ds->phys_port_mask & (1U << addr))	/* unsigned: addr may be 31 */
+		return ds->drv->phy_write(ds, addr, reg, val);
+	return 0;
+}
+
+void dsa_slave_mii_bus_init(struct dsa_switch *ds)
+{
+	ds->slave_mii_bus->priv = (void *)ds;
+	ds->slave_mii_bus->name = "dsa slave smi";
+	ds->slave_mii_bus->read = dsa_slave_phy_read;
+	ds->slave_mii_bus->write = dsa_slave_phy_write;
+	snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x",
+			ds->master_mii_bus->id, ds->pd->sw_addr);
+	ds->slave_mii_bus->parent = &ds->master_mii_bus->dev;
+}
+
+
+/* slave device handling ****************************************************/
+static int dsa_slave_init(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	dev->iflink = p->parent->dst->master_netdev->ifindex;
+
+	return 0;
+}
+
+static int dsa_slave_open(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->dst->master_netdev;
+	int err;
+
+	if (!(master->flags & IFF_UP))
+		return -ENETDOWN;
+
+	if (compare_ether_addr(dev->dev_addr, master->dev_addr)) {
+		err = dev_uc_add(master, dev->dev_addr);
+		if (err < 0)
+			goto out;
+	}
+
+	if (dev->flags & IFF_ALLMULTI) {
+		err = dev_set_allmulti(master, 1);
+		if (err < 0)
+			goto del_unicast;
+	}
+	if (dev->flags & IFF_PROMISC) {
+		err = dev_set_promiscuity(master, 1);
+		if (err < 0)
+			goto clear_allmulti;
+	}
+
+	return 0;
+
+clear_allmulti:
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(master, -1);
+del_unicast:
+	if (compare_ether_addr(dev->dev_addr, master->dev_addr))
+		dev_uc_del(master, dev->dev_addr);
+out:
+	return err;
+}
+
+static int dsa_slave_close(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->dst->master_netdev;
+
+	dev_mc_unsync(master, dev);
+	dev_uc_unsync(master, dev);
+	if (dev->flags & IFF_ALLMULTI)
+		dev_set_allmulti(master, -1);
+	if (dev->flags & IFF_PROMISC)
+		dev_set_promiscuity(master, -1);
+
+	if (compare_ether_addr(dev->dev_addr, master->dev_addr))
+		dev_uc_del(master, dev->dev_addr);
+
+	return 0;
+}
+
+static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->dst->master_netdev;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
+	if (change & IFF_PROMISC)
+		dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1);
+}
+
+static void dsa_slave_set_rx_mode(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->dst->master_netdev;
+
+	dev_mc_sync(master, dev);
+	dev_uc_sync(master, dev);
+}
+
+static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct net_device *master = p->parent->dst->master_netdev;
+	struct sockaddr *addr = a;
+	int err;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	if (!(dev->flags & IFF_UP))
+		goto out;
+
+	if (compare_ether_addr(addr->sa_data, master->dev_addr)) {
+		err = dev_uc_add(master, addr->sa_data);
+		if (err < 0)
+			return err;
+	}
+
+	if (compare_ether_addr(dev->dev_addr, master->dev_addr))
+		dev_uc_del(master, dev->dev_addr);
+
+out:
+	memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+
+	return 0;
+}
+
+static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL)
+		return phy_mii_ioctl(p->phy, ifr, cmd);
+
+	return -EOPNOTSUPP;
+}
+
+
+/* ethtool operations *******************************************************/
+static int
+dsa_slave_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	int err;
+
+	err = -EOPNOTSUPP;
+	if (p->phy != NULL) {
+		err = phy_read_status(p->phy);
+		if (err == 0)
+			err = phy_ethtool_gset(p->phy, cmd);
+	}
+
+	return err;
+}
+
+static int
+dsa_slave_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL)
+		return phy_ethtool_sset(p->phy, cmd);
+
+	return -EOPNOTSUPP;
+}
+
+static void dsa_slave_get_drvinfo(struct net_device *dev,
+				  struct ethtool_drvinfo *drvinfo)
+{
+	strncpy(drvinfo->driver, "dsa", 32);
+	strncpy(drvinfo->version, dsa_driver_version, 32);
+	strncpy(drvinfo->fw_version, "N/A", 32);
+	strncpy(drvinfo->bus_info, "platform", 32);
+}
+
+static int dsa_slave_nway_reset(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL)
+		return genphy_restart_aneg(p->phy);
+
+	return -EOPNOTSUPP;
+}
+
+/* ethtool get_link hook: nonzero when the link is up, 0 when it is down. */
+static u32 dsa_slave_get_link(struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+
+	if (p->phy != NULL) {
+		genphy_update_link(p->phy);
+		return p->phy->link;
+	}
+	return netif_carrier_ok(dev) ? 1 : 0;	/* u32: no room for an errno */
+}
+
+static void dsa_slave_get_strings(struct net_device *dev,
+				  uint32_t stringset, uint8_t *data)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	if (stringset == ETH_SS_STATS) {
+		int len = ETH_GSTRING_LEN;
+
+		strncpy(data, "tx_packets", len);
+		strncpy(data + len, "tx_bytes", len);
+		strncpy(data + 2 * len, "rx_packets", len);
+		strncpy(data + 3 * len, "rx_bytes", len);
+		if (ds->drv->get_strings != NULL)
+			ds->drv->get_strings(ds, p->port, data + 4 * len);
+	}
+}
+
+static void dsa_slave_get_ethtool_stats(struct net_device *dev,
+					struct ethtool_stats *stats,
+					uint64_t *data)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	data[0] = p->dev->stats.tx_packets;
+	data[1] = p->dev->stats.tx_bytes;
+	data[2] = p->dev->stats.rx_packets;
+	data[3] = p->dev->stats.rx_bytes;
+	if (ds->drv->get_ethtool_stats != NULL)
+		ds->drv->get_ethtool_stats(ds, p->port, data + 4);
+}
+
+static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->parent;
+
+	if (sset == ETH_SS_STATS) {
+		int count;
+
+		count = 4;
+		if (ds->drv->get_sset_count != NULL)
+			count += ds->drv->get_sset_count(ds);
+
+		return count;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct ethtool_ops dsa_slave_ethtool_ops = {
+	.get_settings		= dsa_slave_get_settings,
+	.set_settings		= dsa_slave_set_settings,
+	.get_drvinfo		= dsa_slave_get_drvinfo,
+	.nway_reset		= dsa_slave_nway_reset,
+	.get_link		= dsa_slave_get_link,
+	.get_strings		= dsa_slave_get_strings,
+	.get_ethtool_stats	= dsa_slave_get_ethtool_stats,
+	.get_sset_count		= dsa_slave_get_sset_count,
+};
+
+#ifdef CONFIG_NET_DSA_TAG_DSA
+static const struct net_device_ops dsa_netdev_ops = {
+	.ndo_init		= dsa_slave_init,
+	.ndo_open	 	= dsa_slave_open,
+	.ndo_stop		= dsa_slave_close,
+	.ndo_start_xmit		= dsa_xmit,
+	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
+	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
+	.ndo_set_multicast_list = dsa_slave_set_rx_mode,
+	.ndo_set_mac_address	= dsa_slave_set_mac_address,
+	.ndo_do_ioctl		= dsa_slave_ioctl,
+};
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+static const struct net_device_ops edsa_netdev_ops = {
+	.ndo_init		= dsa_slave_init,
+	.ndo_open	 	= dsa_slave_open,
+	.ndo_stop		= dsa_slave_close,
+	.ndo_start_xmit		= edsa_xmit,
+	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
+	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
+	.ndo_set_multicast_list = dsa_slave_set_rx_mode,
+	.ndo_set_mac_address	= dsa_slave_set_mac_address,
+	.ndo_do_ioctl		= dsa_slave_ioctl,
+};
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+static const struct net_device_ops trailer_netdev_ops = {
+	.ndo_init		= dsa_slave_init,
+	.ndo_open	 	= dsa_slave_open,
+	.ndo_stop		= dsa_slave_close,
+	.ndo_start_xmit		= trailer_xmit,
+	.ndo_change_rx_flags	= dsa_slave_change_rx_flags,
+	.ndo_set_rx_mode	= dsa_slave_set_rx_mode,
+	.ndo_set_multicast_list = dsa_slave_set_rx_mode,
+	.ndo_set_mac_address	= dsa_slave_set_mac_address,
+	.ndo_do_ioctl		= dsa_slave_ioctl,
+};
+#endif
+
+/* slave device setup *******************************************************/
+struct net_device *
+dsa_slave_create(struct dsa_switch *ds, struct device *parent,
+		 int port, char *name)
+{
+	struct net_device *master = ds->dst->master_netdev;
+	struct net_device *slave_dev;
+	struct dsa_slave_priv *p;
+	int ret;
+
+	slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv),
+				 name, ether_setup);
+	if (slave_dev == NULL)
+		return slave_dev;
+
+	slave_dev->features = master->vlan_features;
+	SET_ETHTOOL_OPS(slave_dev, &dsa_slave_ethtool_ops);
+	memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
+	slave_dev->tx_queue_len = 0;
+
+	switch (ds->dst->tag_protocol) {
+#ifdef CONFIG_NET_DSA_TAG_DSA
+	case htons(ETH_P_DSA):
+		slave_dev->netdev_ops = &dsa_netdev_ops;
+		break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+	case htons(ETH_P_EDSA):
+		slave_dev->netdev_ops = &edsa_netdev_ops;
+		break;
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+	case htons(ETH_P_TRAILER):
+		slave_dev->netdev_ops = &trailer_netdev_ops;
+		break;
+#endif
+	default:
+		BUG();
+	}
+
+	SET_NETDEV_DEV(slave_dev, parent);
+	slave_dev->vlan_features = master->vlan_features;
+
+	p = netdev_priv(slave_dev);
+	p->dev = slave_dev;
+	p->parent = ds;
+	p->port = port;
+	p->phy = ds->slave_mii_bus->phy_map[port];
+
+	ret = register_netdev(slave_dev);
+	if (ret) {
+		printk(KERN_ERR "%s: error %d registering interface %s\n",
+				master->name, ret, slave_dev->name);
+		free_netdev(slave_dev);
+		return NULL;
+	}
+
+	netif_carrier_off(slave_dev);
+
+	if (p->phy != NULL) {
+		phy_attach(slave_dev, dev_name(&p->phy->dev),
+			   0, PHY_INTERFACE_MODE_GMII);
+
+		p->phy->autoneg = AUTONEG_ENABLE;
+		p->phy->speed = 0;
+		p->phy->duplex = 0;
+		p->phy->advertising = p->phy->supported | ADVERTISED_Autoneg;
+		phy_start_aneg(p->phy);
+	}
+
+	return slave_dev;
+}
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
new file mode 100644
index 00000000..98dfe80b
--- /dev/null
+++ b/net/dsa/tag_dsa.c
@@ -0,0 +1,205 @@
+/*
+ * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+#define DSA_HLEN	4
+
+netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	u8 *dsa_header;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/*
+	 * Convert the outermost 802.1q tag to a DSA tag for tagged
+	 * packets, or insert a DSA tag between the addresses and
+	 * the ethertype field for untagged packets.
+	 */
+	if (skb->protocol == htons(ETH_P_8021Q)) {
+		if (skb_cow_head(skb, 0) < 0)
+			goto out_free;
+
+		/*
+		 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
+		 */
+		dsa_header = skb->data + 2 * ETH_ALEN;
+		dsa_header[0] = 0x60 | p->parent->index;
+		dsa_header[1] = p->port << 3;
+
+		/*
+		 * Move CFI field from byte 2 to byte 1.
+		 */
+		if (dsa_header[2] & 0x10) {
+			dsa_header[1] |= 0x01;
+			dsa_header[2] &= ~0x10;
+		}
+	} else {
+		if (skb_cow_head(skb, DSA_HLEN) < 0)
+			goto out_free;
+		skb_push(skb, DSA_HLEN);
+
+		memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+
+		/*
+		 * Construct untagged FROM_CPU DSA tag.
+		 */
+		dsa_header = skb->data + 2 * ETH_ALEN;
+		dsa_header[0] = 0x40 | p->parent->index;
+		dsa_header[1] = p->port << 3;
+		dsa_header[2] = 0x00;
+		dsa_header[3] = 0x00;
+	}
+
+	skb->protocol = htons(ETH_P_DSA);
+
+	skb->dev = p->parent->dst->master_netdev;
+	dev_queue_xmit(skb);
+
+	return NETDEV_TX_OK;
+
+out_free:
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch_tree *dst = dev->dsa_ptr;
+	struct dsa_switch *ds;
+	u8 *dsa_header;
+	int source_device;
+	int source_port;
+
+	if (unlikely(dst == NULL))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+		goto out_drop;
+
+	/*
+	 * The ethertype field is part of the DSA header.
+	 */
+	dsa_header = skb->data - 2;
+
+	/*
+	 * Check that frame type is either TO_CPU or FORWARD.
+	 */
+	if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
+		goto out_drop;
+
+	/*
+	 * Determine source device and port.
+	 */
+	source_device = dsa_header[0] & 0x1f;
+	source_port = (dsa_header[1] >> 3) & 0x1f;
+
+	/*
+	 * Check that the source device exists and that the source
+	 * port is a registered DSA port.
+	 */
+	if (source_device >= dst->pd->nr_chips)
+		goto out_drop;
+	ds = dst->ds[source_device];
+	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+		goto out_drop;
+
+	/*
+	 * Convert the DSA header to an 802.1q header if the 'tagged'
+	 * bit in the DSA header is set.  If the 'tagged' bit is clear,
+	 * delete the DSA header entirely.
+	 */
+	if (dsa_header[0] & 0x20) {
+		u8 new_header[4];
+
+		/*
+		 * Insert 802.1q ethertype and copy the VLAN-related
+		 * fields, but clear the bit that will hold CFI (since
+		 * DSA uses that bit location for another purpose).
+		 */
+		new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+		new_header[1] = ETH_P_8021Q & 0xff;
+		new_header[2] = dsa_header[2] & ~0x10;
+		new_header[3] = dsa_header[3];
+
+		/*
+		 * Move CFI bit from its place in the DSA header to
+		 * its 802.1q-designated place.
+		 */
+		if (dsa_header[1] & 0x01)
+			new_header[2] |= 0x10;
+
+		/*
+		 * Update packet checksum if skb is CHECKSUM_COMPLETE.
+		 */
+		if (skb->ip_summed == CHECKSUM_COMPLETE) {
+			__wsum c = skb->csum;
+			c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+			c = csum_sub(c, csum_partial(dsa_header + 2, 2, 0));
+			skb->csum = c;
+		}
+
+		memcpy(dsa_header, new_header, DSA_HLEN);
+	} else {
+		/*
+		 * Remove DSA tag and update checksum.
+		 */
+		skb_pull_rcsum(skb, DSA_HLEN);
+		memmove(skb->data - ETH_HLEN,
+			skb->data - ETH_HLEN - DSA_HLEN,
+			2 * ETH_ALEN);
+	}
+
+	skb->dev = ds->ports[source_port];
+	skb_push(skb, ETH_HLEN);
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+static struct packet_type dsa_packet_type __read_mostly = {
+	.type	= cpu_to_be16(ETH_P_DSA),
+	.func	= dsa_rcv,
+};
+
+static int __init dsa_init_module(void)
+{
+	dev_add_pack(&dsa_packet_type);
+	return 0;
+}
+module_init(dsa_init_module);
+
+static void __exit dsa_cleanup_module(void)
+{
+	dev_remove_pack(&dsa_packet_type);
+}
+module_exit(dsa_cleanup_module);
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
new file mode 100644
index 00000000..6f383322
--- /dev/null
+++ b/net/dsa/tag_edsa.c
@@ -0,0 +1,224 @@
+/*
+ * net/dsa/tag_edsa.c - Ethertype DSA tagging
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+#define DSA_HLEN	4
+#define EDSA_HLEN	8
+
+netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	u8 *edsa_header;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/*
+	 * Convert the outermost 802.1q tag to a DSA tag and prepend
+	 * a DSA ethertype field if the packet is tagged, or insert
+	 * a DSA ethertype plus DSA tag between the addresses and the
+	 * current ethertype field if the packet is untagged.
+	 */
+	if (skb->protocol == htons(ETH_P_8021Q)) {
+		if (skb_cow_head(skb, DSA_HLEN) < 0)
+			goto out_free;
+		skb_push(skb, DSA_HLEN);
+
+		memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+
+		/*
+		 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
+		 */
+		edsa_header = skb->data + 2 * ETH_ALEN;
+		edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+		edsa_header[1] = ETH_P_EDSA & 0xff;
+		edsa_header[2] = 0x00;
+		edsa_header[3] = 0x00;
+		edsa_header[4] = 0x60 | p->parent->index;
+		edsa_header[5] = p->port << 3;
+
+		/*
+		 * Move CFI field from byte 6 to byte 5.
+		 */
+		if (edsa_header[6] & 0x10) {
+			edsa_header[5] |= 0x01;
+			edsa_header[6] &= ~0x10;
+		}
+	} else {
+		if (skb_cow_head(skb, EDSA_HLEN) < 0)
+			goto out_free;
+		skb_push(skb, EDSA_HLEN);
+
+		memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN);
+
+		/*
+		 * Construct untagged FROM_CPU DSA tag.
+		 */
+		edsa_header = skb->data + 2 * ETH_ALEN;
+		edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+		edsa_header[1] = ETH_P_EDSA & 0xff;
+		edsa_header[2] = 0x00;
+		edsa_header[3] = 0x00;
+		edsa_header[4] = 0x40 | p->parent->index;
+		edsa_header[5] = p->port << 3;
+		edsa_header[6] = 0x00;
+		edsa_header[7] = 0x00;
+	}
+
+	skb->protocol = htons(ETH_P_EDSA);
+
+	skb->dev = p->parent->dst->master_netdev;
+	dev_queue_xmit(skb);
+
+	return NETDEV_TX_OK;
+
+out_free:
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch_tree *dst = dev->dsa_ptr;
+	struct dsa_switch *ds;
+	u8 *edsa_header;
+	int source_device;
+	int source_port;
+
+	if (unlikely(dst == NULL))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+		goto out_drop;
+
+	/*
+	 * Skip the two null bytes after the ethertype.
+	 */
+	edsa_header = skb->data + 2;
+
+	/*
+	 * Check that frame type is either TO_CPU or FORWARD.
+	 */
+	if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0)
+		goto out_drop;
+
+	/*
+	 * Determine source device and port.
+	 */
+	source_device = edsa_header[0] & 0x1f;
+	source_port = (edsa_header[1] >> 3) & 0x1f;
+
+	/*
+	 * Check that the source device exists and that the source
+	 * port is a registered DSA port.
+	 */
+	if (source_device >= dst->pd->nr_chips)
+		goto out_drop;
+	ds = dst->ds[source_device];
+	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+		goto out_drop;
+
+	/*
+	 * If the 'tagged' bit is set, convert the DSA tag to a 802.1q
+	 * tag and delete the ethertype part.  If the 'tagged' bit is
+	 * clear, delete the ethertype and the DSA tag parts.
+	 */
+	if (edsa_header[0] & 0x20) {
+		u8 new_header[4];
+
+		/*
+		 * Insert 802.1q ethertype and copy the VLAN-related
+		 * fields, but clear the bit that will hold CFI (since
+		 * DSA uses that bit location for another purpose).
+		 */
+		new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
+		new_header[1] = ETH_P_8021Q & 0xff;
+		new_header[2] = edsa_header[2] & ~0x10;
+		new_header[3] = edsa_header[3];
+
+		/*
+		 * Move CFI bit from its place in the DSA header to
+		 * its 802.1q-designated place.
+		 */
+		if (edsa_header[1] & 0x01)
+			new_header[2] |= 0x10;
+
+		skb_pull_rcsum(skb, DSA_HLEN);
+
+		/*
+		 * Update packet checksum if skb is CHECKSUM_COMPLETE.
+		 */
+		if (skb->ip_summed == CHECKSUM_COMPLETE) {
+			__wsum c = skb->csum;
+			c = csum_add(c, csum_partial(new_header + 2, 2, 0));
+			c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0));
+			skb->csum = c;
+		}
+
+		memcpy(edsa_header, new_header, DSA_HLEN);
+
+		memmove(skb->data - ETH_HLEN,
+			skb->data - ETH_HLEN - DSA_HLEN,
+			2 * ETH_ALEN);
+	} else {
+		/*
+		 * Remove DSA tag and update checksum.
+		 */
+		skb_pull_rcsum(skb, EDSA_HLEN);
+		memmove(skb->data - ETH_HLEN,
+			skb->data - ETH_HLEN - EDSA_HLEN,
+			2 * ETH_ALEN);
+	}
+
+	skb->dev = ds->ports[source_port];
+	skb_push(skb, ETH_HLEN);
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+static struct packet_type edsa_packet_type __read_mostly = {
+	.type	= cpu_to_be16(ETH_P_EDSA),
+	.func	= edsa_rcv,
+};
+
+static int __init edsa_init_module(void)
+{
+	dev_add_pack(&edsa_packet_type);
+	return 0;
+}
+module_init(edsa_init_module);
+
+static void __exit edsa_cleanup_module(void)
+{
+	dev_remove_pack(&edsa_packet_type);
+}
+module_exit(edsa_cleanup_module);
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
new file mode 100644
index 00000000..d6d7d0ad
--- /dev/null
+++ b/net/dsa/tag_trailer.c
@@ -0,0 +1,133 @@
+/*
+ * net/dsa/tag_trailer.c - Trailer tag format handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include "dsa_priv.h"
+
+netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct sk_buff *nskb;
+	int padlen;
+	u8 *trailer;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/*
+	 * We have to make sure that the trailer ends up as the very
+	 * last 4 bytes of the packet.  This means that we have to pad
+	 * the packet to the minimum ethernet frame size, if necessary,
+	 * before adding the trailer.
+	 */
+	padlen = 0;
+	if (skb->len < 60)
+		padlen = 60 - skb->len;
+
+	nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
+	if (nskb == NULL) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+	skb_reserve(nskb, NET_IP_ALIGN);
+
+	skb_reset_mac_header(nskb);
+	skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
+	skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
+	skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
+	kfree_skb(skb);
+
+	if (padlen) {
+		u8 *pad = skb_put(nskb, padlen);
+		memset(pad, 0, padlen);
+	}
+
+	trailer = skb_put(nskb, 4);
+	trailer[0] = 0x80;
+	trailer[1] = 1 << p->port;
+	trailer[2] = 0x10;
+	trailer[3] = 0x00;
+
+	nskb->protocol = htons(ETH_P_TRAILER);
+
+	nskb->dev = p->parent->dst->master_netdev;
+	dev_queue_xmit(nskb);
+
+	return NETDEV_TX_OK;
+}
+
+static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch_tree *dst = dev->dsa_ptr;
+	struct dsa_switch *ds;
+	u8 *trailer;
+	int source_port;
+
+	if (unlikely(dst == NULL))
+		goto out_drop;
+	ds = dst->ds[0];
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	if (skb_linearize(skb))
+		goto out_drop;
+	/* Tag = last 4 bytes; in byte 2 only bit 0x10 may be set. */
+	trailer = skb_tail_pointer(skb) - 4;
+	if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 ||
+	    (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00)
+		goto out_drop;
+	/* The low three bits of byte 1 carry the source port number. */
+	source_port = trailer[1] & 7;
+	if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
+		goto out_drop;
+	/* Strip the 4-byte trailer before handing the frame on. */
+	pskb_trim_rcsum(skb, skb->len - 4);
+
+	skb->dev = ds->ports[source_port];
+	skb_push(skb, ETH_HLEN);
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
+out_drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+static struct packet_type trailer_packet_type __read_mostly = {
+	.type	= cpu_to_be16(ETH_P_TRAILER),
+	.func	= trailer_rcv,
+};
+
+static int __init trailer_init_module(void)
+{
+	dev_add_pack(&trailer_packet_type);
+	return 0;
+}
+module_init(trailer_init_module);
+
+static void __exit trailer_cleanup_module(void)
+{
+	dev_remove_pack(&trailer_packet_type);
+}
+module_exit(trailer_cleanup_module);
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 00000000..39a2d297
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
+#
+# Acorn Econet/AUN protocols 
+#
+
+config ECONET
+	tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && INET
+	---help---
+	  Econet is a fairly old and slow networking protocol mainly used by
+	  Acorn computers to access file and print servers. It uses native
+	  Econet network cards. AUN is an implementation of the higher level
+	  parts of Econet that runs over ordinary Ethernet connections, on
+	  top of the UDP packet protocol, which in turn runs on top of the
+	  Internet protocol IP.
+
+	  If you say Y here, you can choose with the next two options whether
+	  to send Econet/AUN traffic over a UDP Ethernet connection or over
+	  a native Econet network card.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called econet.
+
+config ECONET_AUNUDP
+	bool "AUN over UDP"
+	depends on ECONET
+	help
+	  Say Y here if you want to send Econet/AUN traffic over a UDP
+	  connection (UDP is a packet based protocol that runs on top of the
+	  Internet protocol IP) using an ordinary Ethernet network card.
+
+config ECONET_NATIVE
+	bool "Native Econet"
+	depends on ECONET
+	help
+	  Say Y here if you have a native Econet network card installed in
+	  your computer.
diff --git a/net/econet/Makefile b/net/econet/Makefile
new file mode 100644
index 00000000..05fae8be
--- /dev/null
+++ b/net/econet/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for Econet support code.
+#
+
+obj-$(CONFIG_ECONET) += econet.o
+
+econet-y := af_econet.o
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
new file mode 100644
index 00000000..a1d9f378
--- /dev/null
+++ b/net/econet/af_econet.c
@@ -0,0 +1,1174 @@
+/*
+ *	An implementation of the Acorn Econet and AUN protocols.
+ *	Philip Blundell <philb@gnu.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/route.h>
+#include <linux/inet.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/wireless.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/if_ec.h>
+#include <net/udp.h>
+#include <net/ip.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/bitops.h>
+#include <linux/mutex.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+static const struct proto_ops econet_ops;
+static struct hlist_head econet_sklist;
+static DEFINE_SPINLOCK(econet_lock);
+static DEFINE_MUTEX(econet_mutex);
+
+/* Since there are only 256 possible network numbers (or fewer, depends
+   how you count) it makes sense to use a simple lookup table. */
+static struct net_device *net2dev_map[256];
+
+#define EC_PORT_IP	0xd2
+
+#ifdef CONFIG_ECONET_AUNUDP
+static DEFINE_SPINLOCK(aun_queue_lock);
+static struct socket *udpsock;
+#define AUN_PORT	0x8000
+
+
+struct aunhdr
+{
+	unsigned char code;		/* AUN magic protocol byte */
+	unsigned char port;
+	unsigned char cb;
+	unsigned char pad;
+	unsigned long handle;
+};
+
+static unsigned long aun_seq;
+
+/* Queue of packets waiting to be transmitted. */
+static struct sk_buff_head aun_queue;
+static struct timer_list ab_cleanup_timer;
+
+#endif		/* CONFIG_ECONET_AUNUDP */
+
+/* Per-packet information */
+struct ec_cb
+{
+	struct sockaddr_ec sec;
+	unsigned long cookie;		/* Supplied by user. */
+#ifdef CONFIG_ECONET_AUNUDP
+	int done;
+	unsigned long seq;		/* Sequencing */
+	unsigned long timeout;		/* Timeout */
+	unsigned long start;		/* jiffies */
+#endif
+#ifdef CONFIG_ECONET_NATIVE
+	void (*sent)(struct sk_buff *, int result);
+#endif
+};
+
+static void econet_remove_socket(struct hlist_head *list, struct sock *sk)
+{
+	spin_lock_bh(&econet_lock);
+	sk_del_node_init(sk);
+	spin_unlock_bh(&econet_lock);
+}
+
+static void econet_insert_socket(struct hlist_head *list, struct sock *sk)
+{
+	spin_lock_bh(&econet_lock);
+	sk_add_node(sk, list);
+	spin_unlock_bh(&econet_lock);
+}
+
+/*
+ *	Pull a packet from our receive queue and hand it to the user.
+ *	If necessary we block.
+ */
+
+static int econet_recvmsg(struct kiocb *iocb, struct socket *sock,
+			  struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	size_t copied;
+	int err;
+
+	msg->msg_namelen = sizeof(struct sockaddr_ec);
+
+	mutex_lock(&econet_mutex);
+
+	/*
+	 *	Call the generic datagram receiver. This handles all sorts
+	 *	of horrible races and re-entrancy so we can forget about it
+	 *	in the protocol layers.
+	 *
+	 *	Now it will return ENETDOWN if the device has just gone down,
+	 *	but then it will block.
+	 */
+
+	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
+
+	/*
+	 *	An error occurred so return it. Because skb_recv_datagram()
+	 *	handles the blocking we don't see or worry about blocking
+	 *	retries.
+	 */
+
+	if(skb==NULL)
+		goto out;
+
+	/*
+	 *	You lose any data beyond the buffer you gave. If it worries a
+	 *	user program they can ask the device for its MTU anyway.
+	 */
+
+	copied = skb->len;
+	if (copied > len)
+	{
+		copied=len;
+		msg->msg_flags|=MSG_TRUNC;
+	}
+
+	/* We can't use skb_copy_datagram here */
+	err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
+	if (err)
+		goto out_free;
+	sk->sk_stamp = skb->tstamp;
+
+	if (msg->msg_name)
+		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+
+	/*
+	 *	Free or return the buffer as appropriate. Again this
+	 *	hides all the races and re-entrancy issues from us.
+	 */
+	err = copied;
+
+out_free:
+	skb_free_datagram(sk, skb);
+out:
+	mutex_unlock(&econet_mutex);
+	return err;
+}
+
+/*
+ *	Bind an Econet socket.
+ */
+
+static int econet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr;
+	struct sock *sk;
+	struct econet_sock *eo;
+
+	/*
+	 *	Check legality
+	 */
+
+	if (addr_len < sizeof(struct sockaddr_ec) ||
+	    sec->sec_family != AF_ECONET)
+		return -EINVAL;
+
+	mutex_lock(&econet_mutex);
+
+	sk = sock->sk;
+	eo = ec_sk(sk);
+
+	eo->cb	    = sec->cb;
+	eo->port    = sec->port;
+	eo->station = sec->addr.station;
+	eo->net	    = sec->addr.net;
+
+	mutex_unlock(&econet_mutex);
+
+	return 0;
+}
+
+#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
+/*
+ *	Queue a transmit result for the user to be told about.
+ */
+
+static void tx_result(struct sock *sk, unsigned long cookie, int result)
+{
+	struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
+	struct ec_cb *eb;
+	struct sockaddr_ec *sec;
+
+	if (skb == NULL)
+	{
+		printk(KERN_DEBUG "ec: memory squeeze, transmit result dropped.\n");
+		return;
+	}
+
+	eb = (struct ec_cb *)&skb->cb;
+	sec = (struct sockaddr_ec *)&eb->sec;
+	memset(sec, 0, sizeof(struct sockaddr_ec));
+	sec->cookie = cookie;
+	sec->type = ECTYPE_TRANSMIT_STATUS | result;
+	sec->sec_family = AF_ECONET;
+
+	if (sock_queue_rcv_skb(sk, skb) < 0)
+		kfree_skb(skb);
+}
+#endif
+
+#ifdef CONFIG_ECONET_NATIVE
+/*
+ *	Called by the Econet hardware driver when a packet transmit
+ *	has completed.  Tell the user.
+ */
+
+static void ec_tx_done(struct sk_buff *skb, int result)
+{
+	struct ec_cb *eb = (struct ec_cb *)&skb->cb;
+	tx_result(skb->sk, eb->cookie, result);
+}
+#endif
+
+/*
+ *	Send a packet.  We have to work out which device it's going out on
+ *	and hence whether to use real Econet or the UDP emulation.
+ */
+
+static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
+			  struct msghdr *msg, size_t len)
+{
+	struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name;
+	struct net_device *dev;
+	struct ec_addr addr;
+	int err;
+	unsigned char port, cb;
+#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	struct ec_cb *eb;
+#endif
+#ifdef CONFIG_ECONET_AUNUDP
+	struct msghdr udpmsg;
+	struct iovec iov[2];
+	struct aunhdr ah;
+	struct sockaddr_in udpdest;
+	__kernel_size_t size;
+	mm_segment_t oldfs;
+	char *userbuf;
+#endif
+
+	/*
+	 *	Check the flags.
+	 */
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	mutex_lock(&econet_mutex);
+
+	if (saddr == NULL || msg->msg_namelen < sizeof(struct sockaddr_ec)) {
+		mutex_unlock(&econet_mutex);
+		return -EINVAL;
+	}
+	addr.station = saddr->addr.station;
+	addr.net = saddr->addr.net;
+	port = saddr->port;
+	cb = saddr->cb;
+
+	/* Look for a device with the right network number. */
+	dev = net2dev_map[addr.net];
+
+	/* If not directly reachable, use some default */
+	if (dev == NULL) {
+		dev = net2dev_map[0];
+		/* No interfaces at all? */
+		if (dev == NULL) {
+			mutex_unlock(&econet_mutex);
+			return -ENETDOWN;
+		}
+	}
+
+	if (dev->type == ARPHRD_ECONET) {
+		/* Real hardware Econet.  We're not worthy etc. */
+#ifdef CONFIG_ECONET_NATIVE
+		unsigned short proto = 0;
+		int res;
+
+		if (len + 15 > dev->mtu) {
+			mutex_unlock(&econet_mutex);
+			return -EMSGSIZE;
+		}
+
+		dev_hold(dev);
+
+		skb = sock_alloc_send_skb(sk, len+LL_ALLOCATED_SPACE(dev),
+					  msg->msg_flags & MSG_DONTWAIT, &err);
+		if (skb==NULL)
+			goto out_unlock;
+
+		skb_reserve(skb, LL_RESERVED_SPACE(dev));
+		skb_reset_network_header(skb);
+
+		eb = (struct ec_cb *)&skb->cb;
+
+		eb->cookie = saddr->cookie;
+		eb->sec = *saddr;
+		eb->sent = ec_tx_done;
+
+		err = -EINVAL;
+		res = dev_hard_header(skb, dev, ntohs(proto), &addr, NULL, len);
+		if (res < 0)
+			goto out_free;
+		if (res > 0) {
+			struct ec_framehdr *fh;
+			/* Poke in our control byte and
+			   port number.  Hack, hack.  */
+			fh = (struct ec_framehdr *)(skb->data);
+			fh->cb = cb;
+			fh->port = port;
+			if (sock->type != SOCK_DGRAM) {
+				skb_reset_tail_pointer(skb);
+				skb->len = 0;
+			}
+		}
+
+		/* Copy the data. Returns -EFAULT on error */
+		err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+		skb->protocol = proto;
+		skb->dev = dev;
+		skb->priority = sk->sk_priority;
+		if (err)
+			goto out_free;
+
+		err = -ENETDOWN;
+		if (!(dev->flags & IFF_UP))
+			goto out_free;
+
+		/*
+		 *	Now send it
+		 */
+
+		dev_queue_xmit(skb);
+		dev_put(dev);
+		mutex_unlock(&econet_mutex);
+		return len;
+
+	out_free:
+		kfree_skb(skb);
+	out_unlock:
+		if (dev)
+			dev_put(dev);
+#else
+		err = -EPROTOTYPE;
+#endif
+		mutex_unlock(&econet_mutex);
+
+		return err;
+	}
+
+#ifdef CONFIG_ECONET_AUNUDP
+	/* AUN virtual Econet. */
+
+	if (udpsock == NULL) {
+		mutex_unlock(&econet_mutex);
+		return -ENETDOWN;		/* No socket - can't send */
+	}
+
+	if (len > 32768) {
+		err = -E2BIG;
+		goto error;
+	}
+
+	/* Make up a UDP datagram and hand it off to some higher intellect. */
+
+	memset(&udpdest, 0, sizeof(udpdest));
+	udpdest.sin_family = AF_INET;
+	udpdest.sin_port = htons(AUN_PORT);
+
+	/* At the moment we use the stupid Acorn scheme of Econet address
+	   y.x maps to IP a.b.c.x.  This should be replaced with something
+	   more flexible and more aware of subnet masks.  */
+	{
+		struct in_device *idev;
+		unsigned long network = 0;
+
+		rcu_read_lock();
+		idev = __in_dev_get_rcu(dev);
+		if (idev) {
+			if (idev->ifa_list)
+				network = ntohl(idev->ifa_list->ifa_address) &
+					0xffffff00;		/* !!! */
+		}
+		rcu_read_unlock();
+		udpdest.sin_addr.s_addr = htonl(network | addr.station);
+	}
+
+	memset(&ah, 0, sizeof(ah));
+	ah.port = port;
+	ah.cb = cb & 0x7f;
+	ah.code = 2;		/* magic */
+
+	/* tack our header on the front of the iovec */
+	size = sizeof(struct aunhdr);
+	iov[0].iov_base = (void *)&ah;
+	iov[0].iov_len = size;
+
+	userbuf = vmalloc(len);
+	if (userbuf == NULL) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	iov[1].iov_base = userbuf;
+	iov[1].iov_len = len;
+	err = memcpy_fromiovec(userbuf, msg->msg_iov, len);
+	if (err)
+		goto error_free_buf;
+
+	/* Get a skbuff (no data, just holds our cb information) */
+	if ((skb = sock_alloc_send_skb(sk, 0,
+				       msg->msg_flags & MSG_DONTWAIT,
+				       &err)) == NULL)
+		goto error_free_buf;
+
+	eb = (struct ec_cb *)&skb->cb;
+
+	eb->cookie = saddr->cookie;
+	eb->timeout = (5*HZ);
+	eb->start = jiffies;
+	ah.handle = aun_seq;
+	eb->seq = (aun_seq++);
+	eb->sec = *saddr;
+
+	skb_queue_tail(&aun_queue, skb);
+
+	udpmsg.msg_name = (void *)&udpdest;
+	udpmsg.msg_namelen = sizeof(udpdest);
+	udpmsg.msg_iov = &iov[0];
+	udpmsg.msg_iovlen = 2;
+	udpmsg.msg_control = NULL;
+	udpmsg.msg_controllen = 0;
+	udpmsg.msg_flags=0;
+	/* The length must span both iovecs: AUN header plus payload. */
+	oldfs = get_fs(); set_fs(KERNEL_DS);	/* More privs :-) */
+	err = sock_sendmsg(udpsock, &udpmsg, size + len);
+	set_fs(oldfs);
+
+error_free_buf:
+	vfree(userbuf);
+error:
+#else
+	err = -EPROTOTYPE;
+#endif
+	mutex_unlock(&econet_mutex);
+
+	return err;
+}
+
+/*
+ *	Look up the address of a socket.
+ */
+
+static int econet_getname(struct socket *sock, struct sockaddr *uaddr,
+			  int *uaddr_len, int peer)
+{
+	struct sock *sk;
+	struct econet_sock *eo;
+	struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr;
+
+	if (peer)
+		return -EOPNOTSUPP;
+
+	memset(sec, 0, sizeof(*sec));
+	mutex_lock(&econet_mutex);
+
+	sk = sock->sk;
+	eo = ec_sk(sk);
+
+	sec->sec_family	  = AF_ECONET;
+	sec->port	  = eo->port;
+	sec->addr.station = eo->station;
+	sec->addr.net	  = eo->net;
+
+	mutex_unlock(&econet_mutex);
+
+	*uaddr_len = sizeof(*sec);
+	return 0;
+}
+
+static void econet_destroy_timer(unsigned long data)
+{
+	struct sock *sk=(struct sock *)data;
+
+	if (!sk_has_allocations(sk)) {
+		sk_free(sk);
+		return;
+	}
+
+	sk->sk_timer.expires = jiffies + 10 * HZ;
+	add_timer(&sk->sk_timer);
+	printk(KERN_DEBUG "econet socket destroy delayed\n");
+}
+
+/*
+ *	Close an econet socket.
+ */
+
+static int econet_release(struct socket *sock)
+{
+	struct sock *sk;
+
+	mutex_lock(&econet_mutex);
+
+	sk = sock->sk;
+	if (!sk)
+		goto out_unlock;
+
+	econet_remove_socket(&econet_sklist, sk);
+
+	/*
+	 *	Now the socket is dead. No more input will appear.
+	 */
+
+	sk->sk_state_change(sk);	/* It is useless. Just for sanity. */
+
+	sock_orphan(sk);
+
+	/* Purge queues */
+
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	if (sk_has_allocations(sk)) {
+		sk->sk_timer.data     = (unsigned long)sk;
+		sk->sk_timer.expires  = jiffies + HZ;
+		sk->sk_timer.function = econet_destroy_timer;
+		add_timer(&sk->sk_timer);
+
+		goto out_unlock;
+	}
+
+	sk_free(sk);
+
+out_unlock:
+	mutex_unlock(&econet_mutex);
+	return 0;
+}
+
+static struct proto econet_proto = {
+	.name	  = "ECONET",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct econet_sock),
+};
+
+/*
+ *	Create an Econet socket
+ */
+
+static int econet_create(struct net *net, struct socket *sock, int protocol,
+			 int kern)
+{
+	struct sock *sk;
+	struct econet_sock *eo;
+	int err;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	/* Econet only provides datagram services. */
+	if (sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	err = -ENOBUFS;
+	sk = sk_alloc(net, PF_ECONET, GFP_KERNEL, &econet_proto);
+	if (sk == NULL)
+		goto out;
+
+	sk->sk_reuse = 1;
+	sock->ops = &econet_ops;
+	sock_init_data(sock, sk);
+
+	eo = ec_sk(sk);
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	sk->sk_family = PF_ECONET;
+	eo->num = protocol;
+
+	econet_insert_socket(&econet_sklist, sk);
+	return 0;
+out:
+	return err;
+}
+
+/*
+ *	Handle Econet specific ioctls
+ */
+
+static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
+{
+	struct ifreq ifr;
+	struct ec_device *edev;
+	struct net_device *dev;
+	struct sockaddr_ec *sec;
+	int err;
+
+	/*
+	 *	Fetch the caller's info block into kernel space
+	 */
+
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		return -EFAULT;
+
+	if ((dev = dev_get_by_name(&init_net, ifr.ifr_name)) == NULL)
+		return -ENODEV;
+
+	sec = (struct sockaddr_ec *)&ifr.ifr_addr;
+
+	mutex_lock(&econet_mutex);
+
+	err = 0;
+	switch (cmd) {
+	case SIOCSIFADDR:
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+
+		edev = dev->ec_ptr;
+		if (edev == NULL) {
+			/* Magic up a new one. */
+			edev = kzalloc(sizeof(struct ec_device), GFP_KERNEL);
+			if (edev == NULL) {
+				err = -ENOMEM;
+				break;
+			}
+			dev->ec_ptr = edev;
+		} else
+			net2dev_map[edev->net] = NULL;
+		edev->station = sec->addr.station;
+		edev->net = sec->addr.net;
+		net2dev_map[sec->addr.net] = dev;
+		if (!net2dev_map[0])
+			net2dev_map[0] = dev;
+		break;
+
+	case SIOCGIFADDR:
+		edev = dev->ec_ptr;
+		if (edev == NULL) {
+			err = -ENODEV;
+			break;
+		}
+		memset(sec, 0, sizeof(struct sockaddr_ec));
+		sec->addr.station = edev->station;
+		sec->addr.net = edev->net;
+		sec->sec_family = AF_ECONET;
+		/* No dev_put() here: every path releases dev after the switch. */
+		if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&econet_mutex);
+	/* Drop the single reference taken by dev_get_by_name(). */
+	dev_put(dev);
+
+	return err;
+}
+
+/*
+ *	Handle generic ioctls
+ */
+
+static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	void __user *argp = (void __user *)arg;
+
+	switch(cmd) {
+		case SIOCGSTAMP:
+			return sock_get_timestamp(sk, argp);
+
+		case SIOCGSTAMPNS:
+			return sock_get_timestampns(sk, argp);
+
+		case SIOCSIFADDR:
+		case SIOCGIFADDR:
+			return ec_dev_ioctl(sock, cmd, argp);
+			break;
+
+		default:
+			return -ENOIOCTLCMD;
+	}
+	/*NOTREACHED*/
+	return 0;
+}
+
+static const struct net_proto_family econet_family_ops = {
+	.family =	PF_ECONET,
+	.create =	econet_create,
+	.owner	=	THIS_MODULE,
+};
+
+static const struct proto_ops econet_ops = {
+	.family =	PF_ECONET,
+	.owner =	THIS_MODULE,
+	.release =	econet_release,
+	.bind =		econet_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	econet_getname,
+	.poll =		datagram_poll,
+	.ioctl =	econet_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	sock_no_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	econet_sendmsg,
+	.recvmsg =	econet_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE)
+/*
+ *	Find the listening socket, if any, for the given data.
+ */
+
+static struct sock *ec_listening_socket(unsigned char port, unsigned char
+				 station, unsigned char net)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	spin_lock(&econet_lock);
+	sk_for_each(sk, node, &econet_sklist) {
+		struct econet_sock *opt = ec_sk(sk);
+		if ((opt->port == port || opt->port == 0) &&
+		    (opt->station == station || opt->station == 0) &&
+		    (opt->net == net || opt->net == 0)) {
+			sock_hold(sk);
+			goto found;
+		}
+	}
+	sk = NULL;
+found:
+	spin_unlock(&econet_lock);
+	return sk;
+}
+
+/*
+ *	Queue a received packet for a socket.
+ */
+
+static int ec_queue_packet(struct sock *sk, struct sk_buff *skb,
+			   unsigned char stn, unsigned char net,
+			   unsigned char cb, unsigned char port)
+{
+	struct ec_cb *eb = (struct ec_cb *)&skb->cb;
+	struct sockaddr_ec *sec = (struct sockaddr_ec *)&eb->sec;
+
+	memset(sec, 0, sizeof(struct sockaddr_ec));
+	sec->sec_family = AF_ECONET;
+	sec->type = ECTYPE_PACKET_RECEIVED;
+	sec->port = port;
+	sec->cb = cb;
+	sec->addr.net = net;
+	sec->addr.station = stn;
+
+	return sock_queue_rcv_skb(sk, skb);
+}
+#endif
+
+#ifdef CONFIG_ECONET_AUNUDP
+/*
+ *	Send an AUN protocol response.
+ */
+
+static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb)
+{
+	struct sockaddr_in sin = {
+		.sin_family = AF_INET,
+		.sin_port = htons(AUN_PORT),
+		.sin_addr = {.s_addr = addr}
+	};
+	struct aunhdr ah = {.code = code, .cb = cb, .handle = seq};
+	struct kvec iov = {.iov_base = (void *)&ah, .iov_len = sizeof(ah)};
+	struct msghdr udpmsg;
+
+	udpmsg.msg_name = (void *)&sin;
+	udpmsg.msg_namelen = sizeof(sin);
+	udpmsg.msg_control = NULL;
+	udpmsg.msg_controllen = 0;
+	udpmsg.msg_flags=0;
+
+	kernel_sendmsg(udpsock, &udpmsg, &iov, 1, sizeof(ah));
+}
+
+
+/*
+ *	Handle incoming AUN packets.  Work out if anybody wants them,
+ *	and send positive or negative acknowledgements as appropriate.
+ */
+
+static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len)
+{
+	struct iphdr *ip = ip_hdr(skb);
+	unsigned char stn = ntohl(ip->saddr) & 0xff;
+	struct dst_entry *dst = skb_dst(skb);
+	struct ec_device *edev = NULL;
+	struct sock *sk = NULL;
+	struct sk_buff *newskb;
+
+	if (dst)
+		edev = dst->dev->ec_ptr;
+
+	if (! edev)
+		goto bad;
+
+	if ((sk = ec_listening_socket(ah->port, stn, edev->net)) == NULL)
+		goto bad;		/* Nobody wants it */
+
+	newskb = alloc_skb((len - sizeof(struct aunhdr) + 15) & ~15,
+			   GFP_ATOMIC);
+	if (newskb == NULL)
+	{
+		printk(KERN_DEBUG "AUN: memory squeeze, dropping packet.\n");
+		/* Send nack and hope sender tries again */
+		goto bad;
+	}
+
+	memcpy(skb_put(newskb, len - sizeof(struct aunhdr)), (void *)(ah+1),
+	       len - sizeof(struct aunhdr));
+
+	if (ec_queue_packet(sk, newskb, stn, edev->net, ah->cb, ah->port))
+	{
+		/* Socket is bankrupt. */
+		kfree_skb(newskb);
+		goto bad;
+	}
+
+	aun_send_response(ip->saddr, ah->handle, 3, 0);
+	sock_put(sk);
+	return;
+
+bad:
+	aun_send_response(ip->saddr, ah->handle, 4, 0);
+	if (sk)
+		sock_put(sk);
+}
+
+/*
+ *	Handle incoming AUN transmit acknowledgements.  If the sequence
+ *      number matches something in our backlog then kill it and tell
+ *	the user.  If the remote took too long to reply then we may have
+ *	dropped the packet already.
+ */
+
+static void aun_tx_ack(unsigned long seq, int result)
+{
+	struct sk_buff *skb;
+	unsigned long flags;
+	struct ec_cb *eb;
+
+	spin_lock_irqsave(&aun_queue_lock, flags);
+	skb_queue_walk(&aun_queue, skb) {
+		eb = (struct ec_cb *)&skb->cb;
+		if (eb->seq == seq)
+			goto foundit;
+	}
+	spin_unlock_irqrestore(&aun_queue_lock, flags);
+	printk(KERN_DEBUG "AUN: unknown sequence %lu\n", seq);
+	return;
+
+foundit:
+	tx_result(skb->sk, eb->cookie, result);
+	skb_unlink(skb, &aun_queue);
+	spin_unlock_irqrestore(&aun_queue_lock, flags);
+	kfree_skb(skb);
+}
+
+/*
+ *	Deal with received AUN frames - sort out what type of thing it is
+ *	and hand it to the right function.  Runt datagrams are discarded.
+ */
+
+static void aun_data_available(struct sock *sk, int slen)
+{
+	int err;
+	struct sk_buff *skb;
+	unsigned char *data;
+	struct aunhdr *ah;
+	size_t len;
+
+	while ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) {
+		if (err == -EAGAIN) {
+			printk(KERN_ERR "AUN: no data available?!\n");
+			return;
+		}
+		printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err);
+	}
+
+	if (skb->len < sizeof(struct udphdr) + sizeof(struct aunhdr))
+		goto out;	/* runt: no complete AUN header to read */
+	data = skb_transport_header(skb) + sizeof(struct udphdr);
+	ah = (struct aunhdr *)data;
+	len = skb->len - sizeof(struct udphdr);
+	switch (ah->code) {
+	case 2:
+		aun_incoming(skb, ah, len);
+		break;
+	case 3:
+		aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_OK);
+		break;
+	case 4:
+		aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_NOT_LISTENING);
+		break;
+	default:
+		printk(KERN_DEBUG "unknown AUN packet (type %d)\n", data[0]);
+	}
+out:
+	skb_free_datagram(sk, skb);
+}
+
+/*
+ *	Timer handler for the AUN transmit queue.  A packet sent to a dead
+ *	or nonexistent host never gets acknowledged, so every entry whose
+ *	timeout has elapsed is reported as ECTYPE_TRANSMIT_NOT_PRESENT and
+ *	freed.  The timer then re-arms itself for two seconds later.
+ */
+
+static void ab_cleanup(unsigned long h)
+{
+	struct sk_buff *pending, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&aun_queue_lock, flags);
+	skb_queue_walk_safe(&aun_queue, pending, next) {
+		struct ec_cb *eb = (struct ec_cb *)&pending->cb;
+
+		if ((jiffies - eb->start) <= eb->timeout)
+			continue;	/* still waiting for its reply */
+		tx_result(pending->sk, eb->cookie, ECTYPE_TRANSMIT_NOT_PRESENT);
+		skb_unlink(pending, &aun_queue);
+		kfree_skb(pending);
+	}
+	spin_unlock_irqrestore(&aun_queue_lock, flags);
+
+	mod_timer(&ab_cleanup_timer, jiffies + (HZ*2));
+}
+
+static int __init aun_udp_initialise(void)
+{
+	struct sockaddr_in addr;
+	int rc;
+
+	/* Start the two-second sweep of the transmit queue. */
+	skb_queue_head_init(&aun_queue);
+	setup_timer(&ab_cleanup_timer, ab_cleanup, 0);
+	ab_cleanup_timer.expires = jiffies + (HZ*2);
+	add_timer(&ab_cleanup_timer);
+
+	/* We can count ourselves lucky Acorn machines are too dim to
+	   speak IPv6. :-) */
+	rc = sock_create_kern(PF_INET, SOCK_DGRAM, 0, &udpsock);
+	if (rc < 0) {
+		printk("AUN: socket error %d\n", -rc);
+		return rc;
+	}
+
+	/* GFP_ATOMIC because we're going to call it from interrupts;
+	   address reuse is switched on as well. */
+	udpsock->sk->sk_allocation = GFP_ATOMIC;
+	udpsock->sk->sk_reuse = 1;
+
+	/* Bind to the AUN port; every other address field stays zero. */
+	memset(&addr, 0, sizeof(addr));
+	addr.sin_port = htons(AUN_PORT);
+
+	rc = udpsock->ops->bind(udpsock, (struct sockaddr *)&addr,
+				sizeof(addr));
+	if (rc < 0) {
+		printk("AUN: bind error %d\n", -rc);
+		sock_release(udpsock);
+		udpsock = NULL;
+		return rc;
+	}
+
+	/* Received datagrams are handed to aun_data_available(). */
+	udpsock->sk->sk_data_ready = aun_data_available;
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_ECONET_NATIVE
+
+/*
+ *	Receive an Econet frame from a device (econet_packet_type handler).
+ */
+
+static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct ec_framehdr *hdr;
+	struct sock *sk = NULL;
+	struct ec_device *edev = dev->ec_ptr;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	if (!edev)
+		goto drop;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		return NET_RX_DROP;	/* skb is NULL now: nothing to kfree */
+
+	if (!pskb_may_pull(skb, sizeof(struct ec_framehdr)))
+		goto drop;
+
+	hdr = (struct ec_framehdr *) skb->data;
+
+	/* First check for encapsulated IP */
+	if (hdr->port == EC_PORT_IP) {
+		skb->protocol = htons(ETH_P_IP);
+		skb_pull(skb, sizeof(struct ec_framehdr));
+		netif_rx(skb);
+		return NET_RX_SUCCESS;
+	}
+
+	sk = ec_listening_socket(hdr->port, hdr->src_stn, hdr->src_net);
+	if (!sk)
+		goto drop;
+	/* NOTE(review): aun_incoming() passes (stn, net) here - confirm order */
+	if (ec_queue_packet(sk, skb, edev->net, hdr->src_stn, hdr->cb,
+			    hdr->port))
+		goto drop;
+	sock_put(sk);
+	return NET_RX_SUCCESS;
+
+drop:
+	if (sk)
+		sock_put(sk);
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static struct packet_type econet_packet_type __read_mostly = {
+	.type =		cpu_to_be16(ETH_P_ECONET),
+	.func =		econet_rcv,	/* handler for ETH_P_ECONET frames */
+};
+
+static void econet_hw_initialise(void)
+{
+	dev_add_pack(&econet_packet_type);	/* hooks up econet_rcv() */
+}
+
+#endif
+
+static int econet_notifier(struct notifier_block *this, unsigned long msg, void *data)
+{
+	struct net_device *dev = data;
+	struct ec_device *edev;
+
+	/* Only unregistration of a device in init_net is of interest. */
+	if (msg != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	/* A device has gone down - kill any data we hold for it. */
+	edev = dev->ec_ptr;
+	if (!edev)
+		return NOTIFY_DONE;
+
+	/* Slot 0 may point at this device too; clear both map entries. */
+	if (net2dev_map[0] == dev)
+		net2dev_map[0] = NULL;
+	net2dev_map[edev->net] = NULL;
+	kfree(edev);
+	dev->ec_ptr = NULL;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block econet_netdev_notifier = {
+	.notifier_call =econet_notifier,	/* registered in econet_proto_init() */
+};
+
+static void __exit econet_proto_exit(void)
+{
+#ifdef CONFIG_ECONET_AUNUDP
+	del_timer_sync(&ab_cleanup_timer);	/* ab_cleanup() re-arms itself */
+	if (udpsock)	/* may be NULL if aun_udp_initialise() failed */
+		sock_release(udpsock);
+#endif
+	unregister_netdevice_notifier(&econet_netdev_notifier);
+#ifdef CONFIG_ECONET_NATIVE
+	dev_remove_pack(&econet_packet_type);
+#endif
+	sock_unregister(econet_family_ops.family);
+	proto_unregister(&econet_proto);
+}
+
+static int __init econet_proto_init(void)
+{
+	int err = proto_register(&econet_proto, 0);
+	/* Undo the proto registration if the family cannot be registered. */
+	if (err == 0 && (err = sock_register(&econet_family_ops)) != 0)
+		proto_unregister(&econet_proto);
+	if (err != 0)
+		return err;
+#ifdef CONFIG_ECONET_AUNUDP
+	aun_udp_initialise();	/* best effort: its error is not propagated */
+#endif
+#ifdef CONFIG_ECONET_NATIVE
+	econet_hw_initialise();
+#endif
+	register_netdevice_notifier(&econet_netdev_notifier);
+	return 0;
+}
+
+module_init(econet_proto_init);
+module_exit(econet_proto_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_ECONET);
diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile
new file mode 100644
index 00000000..7cef1d8a
--- /dev/null
+++ b/net/ethernet/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux Ethernet layer.
+#
+
+obj-y					+= eth.o
+obj-$(subst m,y,$(CONFIG_IPX))		+= pe2.o
+obj-$(subst m,y,$(CONFIG_ATALK))	+= pe2.o
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
new file mode 100644
index 00000000..2780e9b2
--- /dev/null
+++ b/net/ethernet/eth.c
@@ -0,0 +1,395 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Ethernet-type device handling.
+ *
+ * Version:	@(#)eth.c	1.0.7	05/25/93
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Florian  La Roche, <rzsfl@rz.uni-sb.de>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ * Fixes:
+ *		Mr Linux	: Arp problems
+ *		Alan Cox	: Generic queue tidyup (very tiny here)
+ *		Alan Cox	: eth_header ntohs should be htons
+ *		Alan Cox	: eth_rebuild_header missing an htons and
+ *				  minor other things.
+ *		Tegge		: Arp bug fixes.
+ *		Florian		: Removed many unnecessary functions, code cleanup
+ *				  and changes for new arp and skbuff.
+ *		Alan Cox	: Redid header building to reflect new format.
+ *		Alan Cox	: ARP only when compiled with CONFIG_INET
+ *		Greg Page	: 802.2 and SNAP stuff.
+ *		Alan Cox	: MAC layer pointers/new format.
+ *		Paul Gortmaker	: eth_copy_and_sum shouldn't csum padding.
+ *		Alan Cox	: Protect against forwarding explosions with
+ *				  older network drivers and IFF_ALLMULTI.
+ *	Christer Weinigel	: Better rebuild header message.
+ *             Andrew Morton    : 26Feb01: kill ether_setup() - use netdev_boot_setup().
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/if_ether.h>
+#include <net/dst.h>
+#include <net/arp.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <net/dsa.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+__setup("ether=", netdev_boot_setup);
+
+/**
+ * eth_header - push and fill in an Ethernet header
+ * @skb:	buffer to prepend the header to
+ * @dev:	device the frame is sent from
+ * @type:	Ethernet type field, host byte order
+ * @daddr:	destination address, or NULL if not yet known
+ * @saddr:	source address, or NULL to use the device address
+ * @len:	length stored instead of @type for ETH_P_802_3/ETH_P_802_2
+ *
+ * Returns ETH_HLEN when the header is complete.  Without @daddr the
+ * destination is zeroed for loopback/NOARP devices; on any other device
+ * -ETH_HLEN is returned and the destination is left unset.
+ */
+int eth_header(struct sk_buff *skb, struct net_device *dev,
+	       unsigned short type,
+	       const void *daddr, const void *saddr, unsigned len)
+{
+	struct ethhdr *hdr = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+	const void *src = saddr ? saddr : dev->dev_addr;
+
+	/* 802.3 and 802.2 frames carry a length where other frames
+	 * carry their type. */
+	switch (type) {
+	case ETH_P_802_3:
+	case ETH_P_802_2:
+		hdr->h_proto = htons(len);
+		break;
+	default:
+		hdr->h_proto = htons(type);
+		break;
+	}
+
+	memcpy(hdr->h_source, src, ETH_ALEN);
+
+	if (daddr != NULL) {
+		memcpy(hdr->h_dest, daddr, ETH_ALEN);
+		return ETH_HLEN;
+	}
+
+	/*
+	 *      Anyway, the loopback-device should never use this function...
+	 */
+	if ((dev->flags & (IFF_LOOPBACK | IFF_NOARP)) == 0)
+		return -ETH_HLEN;
+
+	memset(hdr->h_dest, 0, ETH_ALEN);
+	return ETH_HLEN;
+}
+EXPORT_SYMBOL(eth_header);
+
+/**
+ * eth_rebuild_header- rebuild the Ethernet MAC header.
+ * @skb: socket buffer to update
+ *
+ * This is called after an ARP or IPv6 ndisc resolution on this
+ * sk_buff. We now let the protocol (ARP) fill in the other fields.
+ *
+ * This routine CANNOT use cached dst->neigh!
+ * Really, it is used only when dst->neigh is wrong.
+ */
+int eth_rebuild_header(struct sk_buff *skb)
+{
+	struct ethhdr *eth = (struct ethhdr *)skb->data;
+	struct net_device *dev = skb->dev;
+
+	switch (eth->h_proto) {
+#ifdef CONFIG_INET
+	case htons(ETH_P_IP):
+		return arp_find(eth->h_dest, skb);
+#endif
+	default:
+		printk(KERN_DEBUG
+		       "%s: unable to resolve type %X addresses.\n",
+		       dev->name, ntohs(eth->h_proto));
+
+		memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
+		break;
+	}
+
+	return 0;	/* reached only through the default case */
+}
+EXPORT_SYMBOL(eth_rebuild_header);
+
+/**
+ * eth_type_trans - determine the packet's protocol ID.
+ * @skb: received socket data
+ * @dev: receiving network device
+ *
+ * Pulls the Ethernet header, classifies skb->pkt_type from the destination
+ * address and returns the protocol in network byte order.  We assume 802.3
+ * if the type field is short enough (< 1536) to be a length.
+ */
+__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ethhdr *eth;
+
+	skb->dev = dev;
+	skb_reset_mac_header(skb);
+	skb_pull_inline(skb, ETH_HLEN);
+	eth = eth_hdr(skb);
+
+	if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
+		if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
+			skb->pkt_type = PACKET_BROADCAST;
+		else
+			skb->pkt_type = PACKET_MULTICAST;
+	}
+
+	/*
+	 *      This ALLMULTI check should be redundant by 1.4
+	 *      so don't forget to remove it.
+	 *
+	 *      Seems, you forgot to remove it. All silly devices
+	 *      seems to set IFF_PROMISC.
+	 */
+
+	else if (1 /*dev->flags&IFF_PROMISC */ ) {
+		if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr)))
+			skb->pkt_type = PACKET_OTHERHOST;
+	}
+
+	/*
+	 * Some variants of DSA tagging don't have an ethertype field
+	 * at all, so we check here whether one of those tagging
+	 * variants has been configured on the receiving interface,
+	 * and if so, set skb->protocol without looking at the packet.
+	 */
+	if (netdev_uses_dsa_tags(dev))
+		return htons(ETH_P_DSA);
+	if (netdev_uses_trailer_tags(dev))
+		return htons(ETH_P_TRAILER);
+
+	if (ntohs(eth->h_proto) >= 1536)	/* a type, not a length */
+		return eth->h_proto;
+
+	/*
+	 *      This is a magic hack to spot IPX packets. Older Novell breaks
+	 *      the protocol design and runs IPX over 802.3 without an 802.2 LLC
+	 *      layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
+	 *      won't work for fault tolerant netware but does for the rest.
+	 */
+	if (skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF)
+		return htons(ETH_P_802_3);
+
+	/*
+	 *      Real 802.2 LLC
+	 */
+	return htons(ETH_P_802_2);
+}
+EXPORT_SYMBOL(eth_type_trans);
+
+/**
+ * eth_header_parse - extract hardware address from packet
+ * @skb: packet to extract header from
+ * @haddr: destination buffer, receives the ETH_ALEN-byte source address
+ */
+int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+	const struct ethhdr *eth = eth_hdr(skb);
+	memcpy(haddr, eth->h_source, ETH_ALEN);
+	return ETH_ALEN;	/* number of bytes written to @haddr */
+}
+EXPORT_SYMBOL(eth_header_parse);
+
+/**
+ * eth_header_cache - build a cached Ethernet header for a neighbour
+ * @neigh: neighbour supplying the destination address and device
+ * @hh: cache entry that receives the header template
+ * Returns 0 on success, -1 when @hh is for ETH_P_802_3.
+ */
+int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh)
+{
+	u8 *slot = (u8 *)hh->hh_data + HH_DATA_OFF(sizeof(struct ethhdr));
+	struct ethhdr *tmpl = (struct ethhdr *)slot;
+
+	/* No template is built for raw 802.3 framing. */
+	if (hh->hh_type == htons(ETH_P_802_3))
+		return -1;
+
+	/* Destination comes from the neighbour, source from its device. */
+	memcpy(tmpl->h_dest, neigh->ha, ETH_ALEN);
+	memcpy(tmpl->h_source, neigh->dev->dev_addr, ETH_ALEN);
+	tmpl->h_proto = hh->hh_type;
+
+	hh->hh_len = ETH_HLEN;
+
+	return 0;
+}
+EXPORT_SYMBOL(eth_header_cache);
+
+/**
+ * eth_header_cache_update - update cache entry
+ * @hh: destination cache entry
+ * @dev: network device (not used by this implementation)
+ * @haddr: new hardware address
+ *
+ * Called by Address Resolution module to notify changes in address.
+ */
+void eth_header_cache_update(struct hh_cache *hh,
+			     const struct net_device *dev,
+			     const unsigned char *haddr)
+{
+	memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)),
+	       haddr, ETH_ALEN);	/* presumably lands on h_dest - verify */
+}
+EXPORT_SYMBOL(eth_header_cache_update);
+
+/**
+ * eth_mac_addr - set new Ethernet hardware address
+ * @dev: network device
+ * @p: socket address carrying the new address in sa_data
+ * Copy the address into the device; -EBUSY while the device is running,
+ * -EADDRNOTAVAIL when is_valid_ether_addr() rejects it, 0 on success.
+ *
+ * This doesn't change hardware matching, so needs to be overridden
+ * for most real devices.
+ */
+int eth_mac_addr(struct net_device *dev, void *p)
+{
+	const struct sockaddr *sa = p;
+	int busy = netif_running(dev);
+
+	if (busy || !is_valid_ether_addr(sa->sa_data))
+		return busy ? -EBUSY : -EADDRNOTAVAIL;
+
+	memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
+	return 0;
+}
+EXPORT_SYMBOL(eth_mac_addr);
+
+/**
+ * eth_change_mtu - set new MTU size
+ * @dev: network device
+ * @new_mtu: new Maximum Transfer Unit
+ *
+ * Accepts 68..ETH_DATA_LEN and returns 0, otherwise -EINVAL.  Needs to
+ * be overridden for devices supporting jumbo frames.
+ */
+int eth_change_mtu(struct net_device *dev, int new_mtu)
+{
+	int in_range = new_mtu >= 68 && new_mtu <= ETH_DATA_LEN;
+
+	if (in_range)
+		dev->mtu = new_mtu;
+	return in_range ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL(eth_change_mtu);
+
+int eth_validate_addr(struct net_device *dev)
+{
+	/* 0 when is_valid_ether_addr() accepts the device address. */
+	if (is_valid_ether_addr(dev->dev_addr))
+		return 0;
+	return -EADDRNOTAVAIL;
+}
+EXPORT_SYMBOL(eth_validate_addr);
+
+const struct header_ops eth_header_ops ____cacheline_aligned = {
+	.create		= eth_header,	/* installed on devices by ether_setup() */
+	.parse		= eth_header_parse,
+	.rebuild	= eth_rebuild_header,
+	.cache		= eth_header_cache,
+	.cache_update	= eth_header_cache_update,
+};
+
+/**
+ * ether_setup - setup Ethernet network device
+ * @dev: network device
+ * Fill in the fields of the device structure with Ethernet-generic values.
+ */
+void ether_setup(struct net_device *dev)
+{
+	dev->header_ops		= &eth_header_ops;
+	dev->type		= ARPHRD_ETHER;
+	dev->hard_header_len 	= ETH_HLEN;
+	dev->mtu		= ETH_DATA_LEN;
+	dev->addr_len		= ETH_ALEN;
+	dev->tx_queue_len	= 1000;	/* Ethernet wants good queues */
+	dev->flags		= IFF_BROADCAST|IFF_MULTICAST;
+	dev->priv_flags		= IFF_TX_SKB_SHARING;
+
+	memset(dev->broadcast, 0xFF, ETH_ALEN);	/* all-ones broadcast address */
+
+}
+EXPORT_SYMBOL(ether_setup);
+
+/**
+ * alloc_etherdev_mqs - Allocates and sets up an Ethernet device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ *	for this Ethernet device
+ * @txqs: The number of TX queues this device has.
+ * @rxqs: The number of RX queues this device has.
+ *
+ * Fill in the fields of the device structure with Ethernet-generic
+ * values. Basically does everything except registering the device.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size (sizeof_priv).  A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+
+struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
+				      unsigned int rxqs)
+{
+	return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs);	/* "eth%d": name template */
+}
+EXPORT_SYMBOL(alloc_etherdev_mqs);
+
+static size_t _format_mac_addr(char *buf, int buflen,
+			       const unsigned char *addr, int len)
+{
+	size_t used = 0;
+	int idx;
+
+	/* "xx:xx:...": a colon precedes every byte except the first. */
+	for (idx = 0; idx < len; idx++) {
+		if (idx > 0)
+			used += scnprintf(buf + used, buflen - used, ":");
+		used += scnprintf(buf + used, buflen - used, "%02x", addr[idx]);
+	}
+	return used;
+}
+
+ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
+{
+	/* buf is treated as PAGE_SIZE bytes: address first, then "\n". */
+	size_t n = _format_mac_addr(buf, PAGE_SIZE, addr, len);
+
+	n += scnprintf(buf + n, PAGE_SIZE - n, "\n");
+	return (ssize_t)n;
+}
+EXPORT_SYMBOL(sysfs_format_mac);
diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c
new file mode 100644
index 00000000..85d574ad
--- /dev/null
+++ b/net/ethernet/pe2.c
@@ -0,0 +1,37 @@
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/datalink.h>
+
+static int pEII_request(struct datalink_proto *dl,
+			struct sk_buff *skb, unsigned char *dest_node)
+{
+	struct net_device *dev = skb->dev;	/* @dl itself is not used */
+
+	skb->protocol = htons(ETH_P_IPX);
+	dev_hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len);	/* result not checked */
+	return dev_queue_xmit(skb);
+}
+
+struct datalink_proto *make_EII_client(void)
+{
+	struct datalink_proto *client;
+
+	client = kmalloc(sizeof(*client), GFP_ATOMIC);
+	if (client == NULL)
+		return NULL;	/* allocation failed */
+	client->header_length = 0;
+	client->request = pEII_request;
+	return client;
+}
+EXPORT_SYMBOL(make_EII_client);
+
+void destroy_EII_client(struct datalink_proto *dl)
+{
+	kfree(dl);	/* counterpart of the kmalloc() in make_EII_client() */
+}
+EXPORT_SYMBOL(destroy_EII_client);
diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig
new file mode 100644
index 00000000..1c1de97d
--- /dev/null
+++ b/net/ieee802154/Kconfig
@@ -0,0 +1,12 @@
+config IEEE802154
+	tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  IEEE Std 802.15.4 defines low data rate, low power and low
+	  complexity short range wireless personal area networks. It was
+	  designed to organise networks of sensors, switches and similar
+	  automation devices. The maximum data rate is 250 kb/s and the
+	  typical personal operating space is around 10m.
+
+	  Say Y here to compile LR-WPAN support into the kernel or say M to
+	  compile it as modules.
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
new file mode 100644
index 00000000..5761185f
--- /dev/null
+++ b/net/ieee802154/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_IEEE802154) +=	ieee802154.o af_802154.o
+ieee802154-y		:= netlink.o nl-mac.o nl-phy.o nl_policy.o wpan-class.o
+af_802154-y		:= af_ieee802154.o raw.o dgram.o
diff --git a/net/ieee802154/af802154.h b/net/ieee802154/af802154.h
new file mode 100644
index 00000000..b1ec5253
--- /dev/null
+++ b/net/ieee802154/af802154.h
@@ -0,0 +1,36 @@
+/*
+ * Internal interfaces for ieee 802.15.4 address family.
+ *
+ * Copyright 2007, 2008, 2009 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ */
+
+#ifndef AF802154_H
+#define AF802154_H
+
+struct sk_buff;	/* forward declarations for the prototypes below */
+struct net_device;
+extern struct proto ieee802154_raw_prot;
+extern struct proto ieee802154_dgram_prot;
+void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb);
+int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb);
+struct net_device *ieee802154_get_dev(struct net *net,
+		struct ieee802154_addr *addr);
+
+#endif
diff --git a/net/ieee802154/af_ieee802154.c b/net/ieee802154/af_ieee802154.c
new file mode 100644
index 00000000..6df6ecf4
--- /dev/null
+++ b/net/ieee802154/af_ieee802154.c
@@ -0,0 +1,373 @@
+/*
+ * IEEE 802.15.4 socket interface
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
+ */
+
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/if.h>
+#include <linux/termios.h>	/* For TIOCOUTQ/INQ */
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/datalink.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/route.h>
+
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+
+#include "af802154.h"
+
+/*
+ * Find the 802.15.4 device for @addr; dev_hold() is taken on a match.
+ */
+struct net_device *ieee802154_get_dev(struct net *net,
+		struct ieee802154_addr *addr)
+{
+	struct net_device *dev = NULL;	/* stays NULL when nothing matches */
+	struct net_device *tmp;
+	u16 pan_id, short_addr;
+
+	switch (addr->addr_type) {
+	case IEEE802154_ADDR_LONG:
+		rcu_read_lock();
+		dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, addr->hwaddr);
+		if (dev)
+			dev_hold(dev);
+		rcu_read_unlock();
+		break;
+	case IEEE802154_ADDR_SHORT:
+		if (addr->pan_id == 0xffff ||
+		    addr->short_addr == IEEE802154_ADDR_UNDEF ||
+		    addr->short_addr == 0xffff)
+			break;	/* these values are never looked up */
+
+		rtnl_lock();	/* held while walking the device list */
+
+		for_each_netdev(net, tmp) {
+			if (tmp->type != ARPHRD_IEEE802154)
+				continue;
+
+			pan_id = ieee802154_mlme_ops(tmp)->get_pan_id(tmp);
+			short_addr =
+				ieee802154_mlme_ops(tmp)->get_short_addr(tmp);
+
+			if (pan_id == addr->pan_id &&
+			    short_addr == addr->short_addr) {
+				dev = tmp;
+				dev_hold(dev);
+				break;
+			}
+		}
+
+		rtnl_unlock();
+		break;
+	default:
+		pr_warning("Unsupported ieee802154 address type: %d\n",
+				addr->addr_type);
+		break;
+	}
+
+	return dev;
+}
+
+static int ieee802154_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return 0;	/* nothing attached */
+	sock->sk = NULL;
+	sk->sk_prot->close(sk, 0);
+	return 0;
+}
+static int ieee802154_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *msg, size_t len)
+{
+	/* Forward to the sendmsg of the socket's protocol. */
+	struct sock *owner = sock->sk;
+	return owner->sk_prot->sendmsg(iocb, owner, msg, len);
+}
+
+static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr,
+		int addr_len)
+{
+	struct sock *sk = sock->sk;
+
+	/* Fall back to sock_no_bind() when the protocol has no bind. */
+	if (!sk->sk_prot->bind)
+		return sock_no_bind(sock, uaddr, addr_len);
+	return sk->sk_prot->bind(sk, uaddr, addr_len);
+}
+
+static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+
+	/* sa_family must be covered by addr_len before it is inspected. */
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+	/* AF_UNSPEC selects the protocol's disconnect instead of connect. */
+	return uaddr->sa_family == AF_UNSPEC ?
+		sk->sk_prot->disconnect(sk, flags) :
+		sk->sk_prot->connect(sk, uaddr, addr_len);
+}
+
+static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
+		unsigned int cmd)
+{
+	struct ifreq ifr;
+	int ret = -ENOIOCTLCMD;	/* result when the device has no usable handler */
+	struct net_device *dev;
+
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		return -EFAULT;
+
+	ifr.ifr_name[IFNAMSIZ-1] = 0;	/* force NUL termination of the user-supplied name */
+
+	dev_load(sock_net(sk), ifr.ifr_name);
+	dev = dev_get_by_name(sock_net(sk), ifr.ifr_name);
+
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl)
+		ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd);
+
+	if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))	/* copy back only on success */
+		ret = -EFAULT;
+	dev_put(dev);	/* pairs with dev_get_by_name() above */
+
+	return ret;
+}
+
+static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
+		unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+
+	switch (cmd) {
+	case SIOCGSTAMP:
+		return sock_get_timestamp(sk, (struct timeval __user *)arg);
+	case SIOCGSTAMPNS:
+		return sock_get_timestampns(sk, (struct timespec __user *)arg);
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:	/* address get/set is handled by the device */
+		return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg,
+				cmd);
+	default:	/* everything else goes to the protocol, if it has an ioctl */
+		if (!sk->sk_prot->ioctl)
+			return -ENOIOCTLCMD;
+		return sk->sk_prot->ioctl(sk, cmd, arg);
+	}
+}
+
+static const struct proto_ops ieee802154_raw_ops = {	/* used for SOCK_RAW */
+	.family		   = PF_IEEE802154,
+	.owner		   = THIS_MODULE,
+	.release	   = ieee802154_sock_release,
+	.bind		   = ieee802154_sock_bind,
+	.connect	   = ieee802154_sock_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = sock_no_getname,
+	.poll		   = datagram_poll,
+	.ioctl		   = ieee802154_sock_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = sock_no_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = ieee802154_sock_sendmsg,	/* forwards to sk_prot->sendmsg */
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static const struct proto_ops ieee802154_dgram_ops = {	/* used for SOCK_DGRAM */
+	.family		   = PF_IEEE802154,
+	.owner		   = THIS_MODULE,
+	.release	   = ieee802154_sock_release,
+	.bind		   = ieee802154_sock_bind,
+	.connect	   = ieee802154_sock_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = sock_no_getname,
+	.poll		   = datagram_poll,
+	.ioctl		   = ieee802154_sock_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = sock_no_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = ieee802154_sock_sendmsg,	/* forwards to sk_prot->sendmsg */
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+
+/*
+ * Create a socket: pick the proto/ops pair for SOCK_RAW or SOCK_DGRAM,
+ * allocate the sock, hash it and run the protocol's init hook.
+ */
+static int ieee802154_create(struct net *net, struct socket *sock,
+			     int protocol, int kern)
+{
+	struct sock *sk;
+	int rc;
+	struct proto *proto;
+	const struct proto_ops *ops;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	switch (sock->type) {
+	case SOCK_RAW:
+		proto = &ieee802154_raw_prot;
+		ops = &ieee802154_raw_ops;
+		break;
+	case SOCK_DGRAM:
+		proto = &ieee802154_dgram_prot;
+		ops = &ieee802154_dgram_ops;
+		break;
+	default:
+		rc = -ESOCKTNOSUPPORT;
+		goto out;
+	}
+
+	rc = -ENOMEM;
+	sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto);
+	if (!sk)
+		goto out;
+	rc = 0;
+
+	sock->ops = ops;
+
+	sock_init_data(sock, sk);
+	/* FIXME: sk->sk_destruct */
+	sk->sk_family = PF_IEEE802154;
+
+	/* Mark the new socket SOCK_ZAPPED */
+	sock_set_flag(sk, SOCK_ZAPPED);
+
+	if (sk->sk_prot->hash)
+		sk->sk_prot->hash(sk);
+
+	if (sk->sk_prot->init) {
+		rc = sk->sk_prot->init(sk);
+		if (rc)
+			sk_common_release(sk);	/* init failed: give the sock back */
+	}
+out:
+	return rc;
+}
+
+static const struct net_proto_family ieee802154_family_ops = {
+	.family		= PF_IEEE802154,
+	.create		= ieee802154_create,	/* handles SOCK_RAW and SOCK_DGRAM */
+	.owner		= THIS_MODULE,
+};
+
+static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev,
+	struct packet_type *pt, struct net_device *orig_dev)
+{
+	if (!netif_running(dev))
+		goto drop;	/* the skb must be freed here, not leaked */
+	pr_debug("got frame, type %d, dev %p\n", dev->type, dev);
+#ifdef DEBUG
+	print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len);
+#endif
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	ieee802154_raw_deliver(dev, skb);	/* raw sockets see every frame */
+
+	if (dev->type != ARPHRD_IEEE802154)
+		goto drop;
+
+	if (skb->pkt_type != PACKET_OTHERHOST)
+		return ieee802154_dgram_deliver(dev, skb);
+
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+
+static struct packet_type ieee802154_packet_type = {
+	.type = __constant_htons(ETH_P_IEEE802154),
+	.func = ieee802154_rcv,	/* handler for ETH_P_IEEE802154 frames */
+};
+
+static int __init af_ieee802154_init(void)
+{
+	int err;
+
+	/* Register both socket flavours, then the family itself. */
+	err = proto_register(&ieee802154_raw_prot, 1);
+	if (err)
+		return err;
+
+	err = proto_register(&ieee802154_dgram_prot, 1);
+	if (err)
+		goto undo_raw;
+
+	/* Tell SOCKET that we are alive */
+	err = sock_register(&ieee802154_family_ops);
+	if (err)
+		goto undo_dgram;
+
+	/* Everything is in place: hook up the receive handler. */
+	dev_add_pack(&ieee802154_packet_type);
+	return 0;
+
+undo_dgram:
+	proto_unregister(&ieee802154_dgram_prot);
+undo_raw:
+	proto_unregister(&ieee802154_raw_prot);
+	return err;
+}
+static void __exit af_ieee802154_remove(void)
+{
+	dev_remove_pack(&ieee802154_packet_type);	/* reverse of af_ieee802154_init() */
+	sock_unregister(PF_IEEE802154);
+	proto_unregister(&ieee802154_dgram_prot);
+	proto_unregister(&ieee802154_raw_prot);
+}
+
+module_init(af_ieee802154_init);
+module_exit(af_ieee802154_remove);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IEEE802154);
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
new file mode 100644
index 00000000..1a3334c2
--- /dev/null
+++ b/net/ieee802154/dgram.c
@@ -0,0 +1,461 @@
+/*
+ * ZigBee socket interface
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ */
+
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154.h>
+#include <net/ieee802154_netdev.h>
+
+#include <asm/ioctls.h>
+
+#include "af802154.h"
+
+static HLIST_HEAD(dgram_head);	/* all hashed datagram sockets; walked by ieee802154_dgram_deliver() */
+static DEFINE_RWLOCK(dgram_lock);	/* taken around every dgram_head access in this file */
+
+struct dgram_sock {
+	struct sock sk;	/* generic socket; dgram_sk() maps it back to this struct */
+
+	struct ieee802154_addr src_addr;	/* local address stored by dgram_bind() */
+	struct ieee802154_addr dst_addr;	/* peer address used by dgram_sendmsg() */
+
+	unsigned bound:1;	/* set once dgram_bind() has succeeded */
+	unsigned want_ack:1;	/* request an ACK for transmitted frames (WPAN_WANTACK) */
+};
+
+static inline struct dgram_sock *dgram_sk(const struct sock *sk)
+{	/* sk is the member embedded in struct dgram_sock; return its container. */
+	return container_of(sk, struct dgram_sock, sk);
+}
+
+static void dgram_hash(struct sock *sk)
+{	/* Add sk to dgram_head and count it as in use for its protocol. */
+	write_lock_bh(&dgram_lock);
+	sk_add_node(sk, &dgram_head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	write_unlock_bh(&dgram_lock);
+}
+
+static void dgram_unhash(struct sock *sk)
+{	/* Remove sk from dgram_head; the in-use count drops only if it was listed. */
+	write_lock_bh(&dgram_lock);
+	if (sk_del_node_init(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&dgram_lock);
+}
+
+static int dgram_init(struct sock *sk)
+{	/* Set the defaults of a new socket; always returns 0. */
+	struct dgram_sock *ro = dgram_sk(sk);
+
+	ro->dst_addr.addr_type = IEEE802154_ADDR_LONG;	/* default peer: a long address ... */
+	ro->dst_addr.pan_id = 0xffff;
+	ro->want_ack = 1;	/* ACK requests are on until WPAN_WANTACK clears them */
+	memset(&ro->dst_addr.hwaddr, 0xff, sizeof(ro->dst_addr.hwaddr));	/* ... with every bit set */
+	return 0;
+}
+
+static void dgram_close(struct sock *sk, long timeout)
+{	/* Release the socket; timeout is not used. */
+	sk_common_release(sk);
+}
+
+static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
+{	/* Bind sk to the local 802.15.4 address in uaddr; returns 0 or a negative errno. */
+	struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
+	struct dgram_sock *ro = dgram_sk(sk);
+	int err = -EINVAL;	/* result for a short or wrong-family address */
+	struct net_device *dev;
+
+	lock_sock(sk);
+
+	ro->bound = 0;	/* an earlier binding is dropped even if this call fails */
+
+	if (len < sizeof(*addr))
+		goto out;
+
+	if (addr->family != AF_IEEE802154)
+		goto out;
+
+	dev = ieee802154_get_dev(sock_net(sk), &addr->addr);	/* reference is released at out_put */
+	if (!dev) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	if (dev->type != ARPHRD_IEEE802154) {
+		err = -ENODEV;
+		goto out_put;
+	}
+
+	memcpy(&ro->src_addr, &addr->addr, sizeof(struct ieee802154_addr));	/* only the address is kept, not dev */
+
+	ro->bound = 1;
+	err = 0;
+out_put:
+	dev_put(dev);
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{	/* Handle SIOCOUTQ and SIOCINQ; any other cmd returns -ENOIOCTLCMD. */
+	switch (cmd) {
+	case SIOCOUTQ:
+	{
+		int amount = sk_wmem_alloc_get(sk);
+
+		return put_user(amount, (int __user *)arg);	/* arg is a user pointer to int */
+	}
+
+	case SIOCINQ:
+	{
+		struct sk_buff *skb;
+		unsigned long amount;
+
+		amount = 0;	/* reported when the receive queue is empty */
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb != NULL) {
+			/*
+			 * We will only return the amount
+			 * of this packet since that is all
+			 * that will be read.
+			 */
+			/* FIXME: parse the header for more correct value */
+			amount = skb->len - (3+8+8);	/* NOTE(review): wraps if skb->len < 19 - confirm such frames are never queued */
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		return put_user(amount, (int __user *)arg);
+	}
+
+	}
+	return -ENOIOCTLCMD;
+}
+
+/* FIXME: autobind */
+static int dgram_connect(struct sock *sk, struct sockaddr *uaddr,
+			int len)
+{
+	struct sockaddr_ieee802154 *peer = (struct sockaddr_ieee802154 *)uaddr;
+	struct dgram_sock *dsk = dgram_sk(sk);
+	int rc;
+
+	/* Reject short or foreign-family addresses before taking the lock. */
+	if (len < sizeof(*peer) || peer->family != AF_IEEE802154)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (dsk->bound) {
+		/* Remember the peer as the socket's destination address. */
+		memcpy(&dsk->dst_addr, &peer->addr,
+		       sizeof(struct ieee802154_addr));
+		rc = 0;
+	} else {
+		/* Only a bound socket can be connected. */
+		rc = -ENETUNREACH;
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+static int dgram_disconnect(struct sock *sk, int flags)
+{	/* Reset the destination to the all-ones long address; flags is not used. */
+	struct dgram_sock *ro = dgram_sk(sk);
+
+	lock_sock(sk);
+
+	ro->dst_addr.addr_type = IEEE802154_ADDR_LONG;	/* dst_addr.pan_id is left as it was */
+	memset(&ro->dst_addr.hwaddr, 0xff, sizeof(ro->dst_addr.hwaddr));
+
+	release_sock(sk);
+
+	return 0;
+}
+
+/* Build one data frame from msg and queue it; returns size or a negative errno. */
+static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk,
+		struct msghdr *msg, size_t size)
+{
+	struct net_device *dev;
+	unsigned mtu;
+	struct sk_buff *skb;
+	struct dgram_sock *ro = dgram_sk(sk);
+	int err;
+
+	if (msg->msg_flags & MSG_OOB) {
+		pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags);
+		return -EOPNOTSUPP;
+	}
+
+	/* Unbound sockets use the first 802.15.4 device that is found. */
+	if (!ro->bound)
+		dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
+	else
+		dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr);
+
+	if (!dev) {
+		pr_debug("no dev\n");
+		err = -ENXIO;
+		goto out;
+	}
+	mtu = dev->mtu;
+	pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
+
+	/* Reject oversized payloads before allocating or copying anything. */
+	if (size > mtu) {
+		pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+		err = -EINVAL;
+		goto out_dev;
+	}
+
+	skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + size,
+			msg->msg_flags & MSG_DONTWAIT,
+			&err);
+	if (!skb)
+		goto out_dev;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reset_network_header(skb);
+	mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
+	if (ro->want_ack)
+		mac_cb(skb)->flags |= MAC_CB_FLAG_ACKREQ;
+
+	mac_cb(skb)->seq = ieee802154_mlme_ops(dev)->get_dsn(dev);
+	err = dev_hard_header(skb, dev, ETH_P_IEEE802154, &ro->dst_addr,
+			ro->bound ? &ro->src_addr : NULL, size);
+	if (err < 0)
+		goto out_skb;
+
+	skb_reset_mac_header(skb);
+
+	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+	if (err < 0)
+		goto out_skb;
+
+	skb->dev = dev;
+	skb->sk  = sk;
+	skb->protocol = htons(ETH_P_IEEE802154);
+
+	dev_put(dev);	/* drop the lookup reference before queueing the frame */
+	err = dev_queue_xmit(skb);
+	if (err > 0)
+		err = net_xmit_errno(err);
+
+	return err ?: size;
+
+out_skb:
+	kfree_skb(skb);
+out_dev:
+	dev_put(dev);
+out:
+	return err;
+}
+
+static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
+		struct msghdr *msg, size_t len, int noblock, int flags,
+		int *addr_len)
+{	/* Copy one queued frame into msg; returns the byte count or a negative errno. */
+	size_t copied = 0;
+	int err = -EOPNOTSUPP;
+	struct sk_buff *skb;
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;	/* the caller's buffer is smaller than the frame */
+		copied = len;
+	}
+
+	/* FIXME: skip headers if necessary ?! */
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	if (flags & MSG_TRUNC)
+		copied = skb->len;	/* report the full frame length when MSG_TRUNC was passed in */
+done:
+	skb_free_datagram(sk, skb);
+out:
+	if (err)
+		return err;
+	return copied;	/* addr_len is not written anywhere in this function */
+}
+
+static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	/* Queue skb on sk; a frame that cannot be queued is freed here. */
+	if (sock_queue_rcv_skb(sk, skb) >= 0)
+		return NET_RX_SUCCESS;
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static inline int ieee802154_match_sock(u8 *hw_addr, u16 pan_id,
+		u16 short_addr, struct dgram_sock *ro)
+{
+	const struct ieee802154_addr *own = &ro->src_addr;
+
+	if (!ro->bound)		/* unbound sockets match every device */
+		return 1;
+
+	/* A bound socket matches on the address form it was bound with. */
+	switch (own->addr_type) {
+	case IEEE802154_ADDR_LONG:
+		return !memcmp(own->hwaddr, hw_addr, IEEE802154_ADDR_LEN);
+	case IEEE802154_ADDR_SHORT:
+		return pan_id == own->pan_id && short_addr == own->short_addr;
+	}
+	return 0;
+}
+
+int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb)
+{	/* Hand skb to every datagram socket matching dev's addresses; skb is always consumed. */
+	struct sock *sk, *prev = NULL;
+	struct hlist_node *node;
+	int ret = NET_RX_SUCCESS;
+	u16 pan_id, short_addr;
+
+	/* Data frame processing */
+	BUG_ON(dev->type != ARPHRD_IEEE802154);
+
+	pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);
+	short_addr = ieee802154_mlme_ops(dev)->get_short_addr(dev);
+
+	read_lock(&dgram_lock);
+	sk_for_each(sk, node, &dgram_head) {
+		if (ieee802154_match_sock(dev->dev_addr, pan_id, short_addr,
+					dgram_sk(sk))) {
+			if (prev) {	/* earlier matches get a clone ... */
+				struct sk_buff *clone;
+				clone = skb_clone(skb, GFP_ATOMIC);
+				if (clone)	/* a failed clone just skips that socket */
+					dgram_rcv_skb(prev, clone);
+			}
+
+			prev = sk;
+		}
+	}
+
+	if (prev)
+		dgram_rcv_skb(prev, skb);	/* ... and the last match gets the original */
+	else {
+		kfree_skb(skb);	/* no socket matched */
+		ret = NET_RX_DROP;
+	}
+	read_unlock(&dgram_lock);
+
+	return ret;
+}
+
+static int dgram_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{	/* Read a SOL_IEEE802154 option; only WPAN_WANTACK is known. */
+	struct dgram_sock *ro = dgram_sk(sk);
+
+	int val, len;
+
+	if (level != SOL_IEEE802154)
+		return -EOPNOTSUPP;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));	/* compared as unsigned, so at most sizeof(int) */
+
+	switch (optname) {
+	case WPAN_WANTACK:
+		val = ro->want_ack;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))	/* report how many bytes are written to optval */
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+static int dgram_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	struct dgram_sock *ro = dgram_sk(sk);
+	int val, err = 0;
+
+	/* Like dgram_getsockopt(): only SOL_IEEE802154 options are handled. */
+	if (level != SOL_IEEE802154)
+		return -EOPNOTSUPP;
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+	switch (optname) {
+	case WPAN_WANTACK:
+		ro->want_ack = !!val;	/* one-bit field: store 0 or 1 */
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+	release_sock(sk);
+
+	return err;
+}
+
+struct proto ieee802154_dgram_prot = {	/* datagram protocol; registered in af_ieee802154_init() */
+	.name		= "IEEE-802.15.4-MAC",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct dgram_sock),	/* sockets are allocated as struct dgram_sock */
+	.init		= dgram_init,
+	.close		= dgram_close,
+	.bind		= dgram_bind,
+	.sendmsg	= dgram_sendmsg,
+	.recvmsg	= dgram_recvmsg,
+	.hash		= dgram_hash,
+	.unhash		= dgram_unhash,
+	.connect	= dgram_connect,
+	.disconnect	= dgram_disconnect,
+	.ioctl		= dgram_ioctl,
+	.getsockopt	= dgram_getsockopt,
+	.setsockopt	= dgram_setsockopt,
+};
+
diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h
new file mode 100644
index 00000000..aadec428
--- /dev/null
+++ b/net/ieee802154/ieee802154.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef IEEE_802154_LOCAL_H
+#define IEEE_802154_LOCAL_H
+
+int __init ieee802154_nl_init(void);	/* registers nl802154_family plus the MAC and PHY parts */
+void __exit ieee802154_nl_exit(void);	/* unregisters nl802154_family */
+
+#define IEEE802154_OP(_cmd, _func) /* genl_ops entry: doit only */	\
+	{						\
+		.cmd	= _cmd,				\
+		.policy	= ieee802154_policy,		\
+		.doit	= _func,			\
+		.dumpit	= NULL,				\
+		.flags	= GENL_ADMIN_PERM,		\
+	}
+
+#define IEEE802154_DUMP(_cmd, _func, _dump) /* genl_ops entry: doit and dumpit, no .flags set */	\
+	{						\
+		.cmd	= _cmd,				\
+		.policy	= ieee802154_policy,		\
+		.doit	= _func,			\
+		.dumpit	= _dump,			\
+	}
+
+struct genl_info;
+
+struct sk_buff *ieee802154_nl_create(int flags, u8 req);	/* start a message that is not a reply */
+int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group);	/* finish msg and multicast it */
+struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
+		int flags, u8 req);	/* start a reply to the request in info */
+int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info);	/* finish msg and send it as the reply */
+
+extern struct genl_family nl802154_family;
+int nl802154_mac_register(void);
+int nl802154_phy_register(void);
+
+#endif
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
new file mode 100644
index 00000000..c8097ae2
--- /dev/null
+++ b/net/ieee802154/netlink.c
@@ -0,0 +1,139 @@
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <net/genetlink.h>
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+
+static unsigned int ieee802154_seq_num;	/* sequence number for messages built by ieee802154_nl_create() */
+static DEFINE_SPINLOCK(ieee802154_seq_lock);	/* held while ieee802154_seq_num is read and incremented */
+
+struct genl_family nl802154_family = {	/* the one generic netlink family shared by the MAC and PHY code */
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= IEEE802154_NL_NAME,
+	.version	= 1,
+	.maxattr	= IEEE802154_ATTR_MAX,
+};
+
+/* Requests to userspace */
+struct sk_buff *ieee802154_nl_create(int flags, u8 req)
+{	/* Allocate a message and write its genl header for command req; NULL on failure. */
+	void *hdr;
+	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+	unsigned long f;
+
+	if (!msg)
+		return NULL;
+
+	spin_lock_irqsave(&ieee802154_seq_lock, f);	/* makes the read-and-increment of the sequence number atomic */
+	hdr = genlmsg_put(msg, 0, ieee802154_seq_num++,
+			&nl802154_family, flags, req);
+	spin_unlock_irqrestore(&ieee802154_seq_lock, f);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return NULL;
+	}
+
+	return msg;	/* the caller adds attributes and then sends or frees it */
+}
+
+int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group)
+{
+	/* XXX: nlh is right at the start of msg */
+	void *payload = genlmsg_data(NLMSG_DATA(msg->data));
+
+	/* Finalise the message; one that cannot be finalised is freed here. */
+	if (genlmsg_end(msg, payload) < 0) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
+
+	return genlmsg_multicast(msg, 0, group, GFP_ATOMIC);
+}
+
+struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info,
+		int flags, u8 req)
+{	/* Allocate a message and write a reply header for the request in info; NULL on failure. */
+	void *hdr;
+	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
+
+	if (!msg)
+		return NULL;
+
+	hdr = genlmsg_put_reply(msg, info,
+			&nl802154_family, flags, req);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return NULL;
+	}
+
+	return msg;	/* the caller adds attributes and then sends or frees it */
+}
+
+int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
+{
+	/* XXX: nlh is right at the start of msg */
+	void *payload = genlmsg_data(NLMSG_DATA(msg->data));
+
+	/* Finalise the message; one that cannot be finalised is freed here. */
+	if (genlmsg_end(msg, payload) < 0) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+int __init ieee802154_nl_init(void)
+{	/* Register the family, then its MAC and PHY parts; returns 0 or the first error. */
+	int rc;
+
+	rc = genl_register_family(&nl802154_family);
+	if (rc)
+		return rc;	/* the family was never registered: nothing to undo */
+
+	rc = nl802154_mac_register();
+	if (rc)
+		goto fail;
+
+	rc = nl802154_phy_register();
+	if (rc)
+		goto fail;
+
+	return 0;
+
+fail:	/* reached only once the family itself is registered */
+	genl_unregister_family(&nl802154_family);
+	return rc;
+}
+
+void __exit ieee802154_nl_exit(void)
+{	/* Counterpart of ieee802154_nl_init(): unregister the family. */
+	genl_unregister_family(&nl802154_family);
+}
+
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
new file mode 100644
index 00000000..71ee1108
--- /dev/null
+++ b/net/ieee802154/nl-mac.c
@@ -0,0 +1,618 @@
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include <linux/nl802154.h>
+#include <net/af_ieee802154.h>
+#include <net/nl802154.h>
+#include <net/ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/wpan-phy.h>
+
+#include "ieee802154.h"
+
+static struct genl_multicast_group ieee802154_coord_mcgrp = {	/* target group of every ieee802154_nl_mcast() call in this file */
+	.name		= IEEE802154_MCAST_COORD_NAME,
+};
+
+static struct genl_multicast_group ieee802154_beacon_mcgrp = {	/* registered in nl802154_mac_register(); no sender in this file uses it */
+	.name		= IEEE802154_MCAST_BEACON_NAME,
+};
+
+int ieee802154_nl_assoc_indic(struct net_device *dev,
+		struct ieee802154_addr *addr, u8 cap)
+{	/* Multicast an IEEE802154_ASSOCIATE_INDIC message describing dev, addr and cap. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	if (addr->addr_type != IEEE802154_ADDR_LONG) {	/* only long source addresses are accepted */
+		pr_err("%s: received non-long source address!\n", __func__);
+		return -EINVAL;
+	}
+
+	msg = ieee802154_nl_create(0, IEEE802154_ASSOCIATE_INDIC);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+
+	NLA_PUT(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN,
+			addr->hwaddr);
+
+	NLA_PUT_U8(msg, IEEE802154_ATTR_CAPABILITY, cap);
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* msg is handed over here */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_assoc_indic);
+
+int ieee802154_nl_assoc_confirm(struct net_device *dev, u16 short_addr,
+		u8 status)
+{	/* Multicast an IEEE802154_ASSOCIATE_CONF message with short_addr and status. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	msg = ieee802154_nl_create(0, IEEE802154_ASSOCIATE_CONF);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+
+	NLA_PUT_U16(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr);
+	NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status);
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* msg is handed over here */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_assoc_confirm);
+
+int ieee802154_nl_disassoc_indic(struct net_device *dev,
+		struct ieee802154_addr *addr, u8 reason)
+{	/* Multicast an IEEE802154_DISASSOCIATE_INDIC message with the source address and reason. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	msg = ieee802154_nl_create(0, IEEE802154_DISASSOCIATE_INDIC);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+
+	if (addr->addr_type == IEEE802154_ADDR_LONG)	/* source goes out in the form it arrived in */
+		NLA_PUT(msg, IEEE802154_ATTR_SRC_HW_ADDR, IEEE802154_ADDR_LEN,
+				addr->hwaddr);
+	else	/* every non-long type is reported as a short address */
+		NLA_PUT_U16(msg, IEEE802154_ATTR_SRC_SHORT_ADDR,
+				addr->short_addr);
+
+	NLA_PUT_U8(msg, IEEE802154_ATTR_REASON, reason);
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* msg is handed over here */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_disassoc_indic);
+
+int ieee802154_nl_disassoc_confirm(struct net_device *dev, u8 status)
+{	/* Multicast an IEEE802154_DISASSOCIATE_CONF message carrying status. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	msg = ieee802154_nl_create(0, IEEE802154_DISASSOCIATE_CONF);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+
+	NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status);
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* msg is handed over here */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_disassoc_confirm);
+
+int ieee802154_nl_beacon_indic(struct net_device *dev,
+		u16 panid, u16 coord_addr)
+{	/* Multicast an IEEE802154_BEACON_NOTIFY_INDIC message with the coordinator's short address and PAN id. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	msg = ieee802154_nl_create(0, IEEE802154_BEACON_NOTIFY_INDIC);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+	NLA_PUT_U16(msg, IEEE802154_ATTR_COORD_SHORT_ADDR, coord_addr);
+	NLA_PUT_U16(msg, IEEE802154_ATTR_COORD_PAN_ID, panid);
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* NOTE(review): coordinator group, not ieee802154_beacon_mcgrp - confirm intended */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_beacon_indic);
+
+int ieee802154_nl_scan_confirm(struct net_device *dev,
+		u8 status, u8 scan_type, u32 unscanned, u8 page,
+		u8 *edl/* , struct list_head *pan_desc_list */)
+{	/* Multicast an IEEE802154_SCAN_CONF message; edl may be NULL. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	msg = ieee802154_nl_create(0, IEEE802154_SCAN_CONF);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+
+	NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status);
+	NLA_PUT_U8(msg, IEEE802154_ATTR_SCAN_TYPE, scan_type);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_CHANNELS, unscanned);	/* the unscanned value goes out as the CHANNELS attribute */
+	NLA_PUT_U8(msg, IEEE802154_ATTR_PAGE, page);
+
+	if (edl)
+		NLA_PUT(msg, IEEE802154_ATTR_ED_LIST, 27, edl);	/* NOTE(review): edl must hold at least 27 bytes - confirm against callers */
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* msg is handed over here */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_scan_confirm);
+
+int ieee802154_nl_start_confirm(struct net_device *dev, u8 status)
+{	/* Multicast an IEEE802154_START_CONF message carrying status. */
+	struct sk_buff *msg;
+
+	pr_debug("%s\n", __func__);
+
+	msg = ieee802154_nl_create(0, IEEE802154_START_CONF);
+	if (!msg)
+		return -ENOBUFS;
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+			dev->dev_addr);
+
+	NLA_PUT_U8(msg, IEEE802154_ATTR_STATUS, status);
+
+	return ieee802154_nl_mcast(msg, ieee802154_coord_mcgrp.id);	/* msg is handed over here */
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(ieee802154_nl_start_confirm);
+
+static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 pid,
+	u32 seq, int flags, struct net_device *dev)
+{	/* Append one IEEE802154_LIST_IFACE record for dev to msg; -EMSGSIZE on failure. */
+	void *hdr;
+	struct wpan_phy *phy;
+
+	pr_debug("%s\n", __func__);
+
+	hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+		IEEE802154_LIST_IFACE);	/* the pid parameter is not used; 0 is passed instead */
+	if (!hdr)
+		goto out;
+
+	phy = ieee802154_mlme_ops(dev)->get_phy(dev);	/* released with wpan_phy_put() on both paths below */
+	BUG_ON(!phy);
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+	NLA_PUT_U32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex);
+
+	NLA_PUT(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN,
+		dev->dev_addr);
+	NLA_PUT_U16(msg, IEEE802154_ATTR_SHORT_ADDR,
+		ieee802154_mlme_ops(dev)->get_short_addr(dev));
+	NLA_PUT_U16(msg, IEEE802154_ATTR_PAN_ID,
+		ieee802154_mlme_ops(dev)->get_pan_id(dev));
+	wpan_phy_put(phy);
+	return genlmsg_end(msg, hdr);
+
+nla_put_failure:	/* no explicit goto here; presumably reached from the NLA_PUT*() macros */
+	wpan_phy_put(phy);
+	genlmsg_cancel(msg, hdr);	/* roll back the partly written record */
+out:
+	return -EMSGSIZE;
+}
+
+/* Requests from userspace */
+static struct net_device *ieee802154_nl_get_dev(struct genl_info *info)
+{
+	struct nlattr *by_name = info->attrs[IEEE802154_ATTR_DEV_NAME];
+	struct nlattr *by_index = info->attrs[IEEE802154_ATTR_DEV_INDEX];
+	struct net_device *dev = NULL;
+	char name[IFNAMSIZ + 1];
+
+	/* A device name, when given, takes precedence over the index. */
+	if (by_name) {
+		nla_strlcpy(name, by_name, sizeof(name));
+		dev = dev_get_by_name(&init_net, name);
+	} else if (by_index) {
+		dev = dev_get_by_index(&init_net, nla_get_u32(by_index));
+	}
+	if (!dev)
+		return NULL;
+
+	/* Only IEEE 802.15.4 devices are returned; callers dev_put() them. */
+	if (dev->type != ARPHRD_IEEE802154) {
+		dev_put(dev);
+		dev = NULL;
+	}
+
+	return dev;
+}
+
+static int ieee802154_associate_req(struct sk_buff *skb,
+		struct genl_info *info)
+{	/* Netlink handler: forward an associate request to the device's assoc_req op. */
+	struct net_device *dev;
+	struct ieee802154_addr addr;
+	u8 page;
+	int ret = -EINVAL;
+
+	if (!info->attrs[IEEE802154_ATTR_CHANNEL] ||
+	    !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
+	    (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] &&
+		!info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) ||
+	    !info->attrs[IEEE802154_ATTR_CAPABILITY])
+		return -EINVAL;	/* channel, PAN id, capability and one coordinator address are mandatory */
+
+	dev = ieee802154_nl_get_dev(info);	/* held reference, dropped before returning */
+	if (!dev)
+		return -ENODEV;
+
+	if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) {	/* the long address wins when both are given */
+		addr.addr_type = IEEE802154_ADDR_LONG;
+		nla_memcpy(addr.hwaddr,
+				info->attrs[IEEE802154_ATTR_COORD_HW_ADDR],
+				IEEE802154_ADDR_LEN);
+	} else {
+		addr.addr_type = IEEE802154_ADDR_SHORT;
+		addr.short_addr = nla_get_u16(
+				info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]);
+	}
+	addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
+
+	if (info->attrs[IEEE802154_ATTR_PAGE])
+		page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
+	else
+		page = 0;	/* page is optional and defaults to 0 */
+
+	ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr,
+			nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]),
+			page,
+			nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY]));
+
+	dev_put(dev);
+	return ret;
+}
+
+static int ieee802154_associate_resp(struct sk_buff *skb,
+		struct genl_info *info)
+{	/* Netlink handler: forward an associate response to the device's assoc_resp op. */
+	struct net_device *dev;
+	struct ieee802154_addr addr;
+	int ret = -EINVAL;
+
+	if (!info->attrs[IEEE802154_ATTR_STATUS] ||
+	    !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] ||
+	    !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR])
+		return -EINVAL;	/* all three attributes are mandatory */
+
+	dev = ieee802154_nl_get_dev(info);	/* held reference, dropped before returning */
+	if (!dev)
+		return -ENODEV;
+
+	addr.addr_type = IEEE802154_ADDR_LONG;
+	nla_memcpy(addr.hwaddr, info->attrs[IEEE802154_ATTR_DEST_HW_ADDR],
+			IEEE802154_ADDR_LEN);
+	addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);	/* PAN id comes from the device, not the request */
+
+
+	ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr,
+		nla_get_u16(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]),
+		nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS]));
+
+	dev_put(dev);
+	return ret;
+}
+
+static int ieee802154_disassociate_req(struct sk_buff *skb,
+		struct genl_info *info)
+{	/* Netlink handler: forward a disassociate request to the device's disassoc_req op. */
+	struct net_device *dev;
+	struct ieee802154_addr addr;
+	int ret = -EINVAL;
+
+	if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] &&
+		!info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) ||
+	    !info->attrs[IEEE802154_ATTR_REASON])
+		return -EINVAL;	/* a reason and one destination address are mandatory */
+
+	dev = ieee802154_nl_get_dev(info);	/* held reference, dropped before returning */
+	if (!dev)
+		return -ENODEV;
+
+	if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) {	/* the long address wins when both are given */
+		addr.addr_type = IEEE802154_ADDR_LONG;
+		nla_memcpy(addr.hwaddr,
+				info->attrs[IEEE802154_ATTR_DEST_HW_ADDR],
+				IEEE802154_ADDR_LEN);
+	} else {
+		addr.addr_type = IEEE802154_ADDR_SHORT;
+		addr.short_addr = nla_get_u16(
+				info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]);
+	}
+	addr.pan_id = ieee802154_mlme_ops(dev)->get_pan_id(dev);	/* PAN id comes from the device, not the request */
+
+	ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr,
+			nla_get_u8(info->attrs[IEEE802154_ATTR_REASON]));
+
+	dev_put(dev);
+	return ret;
+}
+
+/*
+ * PANid, channel, beacon_order = 15, superframe_order = 15,
+ * PAN_coordinator, battery_life_extension = 0,
+ * coord_realignment = 0, security_enable = 0
+*/
+static int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
+{	/* Netlink handler: validate a start request and pass it to the device's start_req op. */
+	struct net_device *dev;
+	struct ieee802154_addr addr;
+
+	u8 channel, bcn_ord, sf_ord;
+	u8 page;
+	int pan_coord, blx, coord_realign;
+	int ret;
+
+	if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] ||
+	    !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] ||
+	    !info->attrs[IEEE802154_ATTR_CHANNEL] ||
+	    !info->attrs[IEEE802154_ATTR_BCN_ORD] ||
+	    !info->attrs[IEEE802154_ATTR_SF_ORD] ||
+	    !info->attrs[IEEE802154_ATTR_PAN_COORD] ||
+	    !info->attrs[IEEE802154_ATTR_BAT_EXT] ||
+	    !info->attrs[IEEE802154_ATTR_COORD_REALIGN]
+	 )
+		return -EINVAL;	/* every attribute above is mandatory */
+
+	dev = ieee802154_nl_get_dev(info);	/* held reference, dropped on both return paths below */
+	if (!dev)
+		return -ENODEV;
+
+	addr.addr_type = IEEE802154_ADDR_SHORT;
+	addr.short_addr = nla_get_u16(
+			info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]);
+	addr.pan_id = nla_get_u16(info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
+
+	channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]);
+	bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]);
+	sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]);
+	pan_coord = nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]);
+	blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]);
+	coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]);
+
+	if (info->attrs[IEEE802154_ATTR_PAGE])
+		page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
+	else
+		page = 0;	/* page is optional and defaults to 0 */
+
+
+	if (addr.short_addr == IEEE802154_ADDR_BROADCAST) {	/* rejected here, with a START_CONF sent to listeners */
+		ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS);
+		dev_put(dev);
+		return -EINVAL;
+	}
+
+	ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page,
+		bcn_ord, sf_ord, pan_coord, blx, coord_realign);
+
+	dev_put(dev);
+	return ret;
+}
+
+static int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
+{	/* Netlink handler: forward a scan request to the device's scan_req op. */
+	struct net_device *dev;
+	int ret;
+	u8 type;
+	u32 channels;
+	u8 duration;
+	u8 page;
+
+	if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] ||
+	    !info->attrs[IEEE802154_ATTR_CHANNELS] ||
+	    !info->attrs[IEEE802154_ATTR_DURATION])
+		return -EINVAL;	/* scan type, channels and duration are mandatory */
+
+	dev = ieee802154_nl_get_dev(info);	/* held reference, dropped before returning */
+	if (!dev)
+		return -ENODEV;
+
+	type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]);
+	channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]);
+	duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]);
+
+	if (info->attrs[IEEE802154_ATTR_PAGE])
+		page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
+	else
+		page = 0;	/* page is optional and defaults to 0 */
+
+
+	ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page,
+			duration);
+
+	dev_put(dev);
+	return ret;
+}
+
+static int ieee802154_list_iface(struct sk_buff *skb,
+	struct genl_info *info)
+{
+	/* Request for interface name, index, type, IEEE address,
+	   PAN Id, short address */
+	struct sk_buff *msg;
+	struct net_device *dev = NULL;
+	int rc = -ENOBUFS;	/* returned if the reply cannot be allocated */
+
+	pr_debug("%s\n", __func__);
+
+	dev = ieee802154_nl_get_dev(info);	/* held reference, dropped on every path below */
+	if (!dev)
+		return -ENODEV;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		goto out_dev;
+
+	rc = ieee802154_nl_fill_iface(msg, info->snd_pid, info->snd_seq,
+			0, dev);
+	if (rc < 0)
+		goto out_free;
+
+	dev_put(dev);
+
+	return genlmsg_reply(msg, info);	/* msg is handed over here */
+out_free:
+	nlmsg_free(msg);
+out_dev:
+	dev_put(dev);
+	return rc;
+
+}
+
+static int ieee802154_dump_iface(struct sk_buff *skb,
+	struct netlink_callback *cb)
+{	/* Dump callback: add one record per 802.15.4 device, resuming from cb->args[0]. */
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	int idx;
+	int s_idx = cb->args[0];	/* position saved by the previous invocation */
+
+	pr_debug("%s\n", __func__);
+
+	idx = 0;
+	for_each_netdev(net, dev) {
+		if (idx < s_idx || (dev->type != ARPHRD_IEEE802154))
+			goto cont;	/* already dumped, or not an 802.15.4 device */
+
+		if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).pid,
+			cb->nlh->nlmsg_seq, NLM_F_MULTI, dev) < 0)
+			break;	/* idx is not advanced, so this device is tried again next time */
+cont:
+		idx++;
+	}
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static struct genl_ops ieee802154_coordinator_ops[] = {	/* each entry is registered by nl802154_mac_register() */
+	IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req),
+	IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp),
+	IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req),
+	IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req),
+	IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req),
+	IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface,
+							ieee802154_dump_iface),	/* the only entry with a dump callback */
+};
+
+/* Register the MAC multicast groups and operations with nl802154_family.
+ * No need to unregister as family unregistration will do it.
+ */
+int nl802154_mac_register(void)
+{
+	int i;
+	int rc;
+
+	rc = genl_register_mc_group(&nl802154_family,
+			&ieee802154_coord_mcgrp);
+	if (rc)
+		return rc;
+
+	rc = genl_register_mc_group(&nl802154_family,
+			&ieee802154_beacon_mcgrp);
+	if (rc)
+		return rc;	/* earlier registrations are left for the caller to undo */
+
+	for (i = 0; i < ARRAY_SIZE(ieee802154_coordinator_ops); i++) {
+		rc = genl_register_ops(&nl802154_family,
+				&ieee802154_coordinator_ops[i]);
+		if (rc)
+			return rc;	/* stop at the first operation that fails to register */
+	}
+
+	return 0;
+}
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
new file mode 100644
index 00000000..02548b29
--- /dev/null
+++ b/net/ieee802154/nl-phy.c
@@ -0,0 +1,346 @@
+/*
+ * Netlink interface for IEEE 802.15.4 stack
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ * Maxim Osipov <maxim.osipov@siemens.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/wpan-phy.h>
+#include <net/af_ieee802154.h>
+#include <net/ieee802154_netdev.h>
+#include <net/rtnetlink.h> /* for rtnl_{un,}lock */
+#include <linux/nl802154.h>
+
+#include "ieee802154.h"
+
+/* Append one IEEE802154_LIST_PHY message describing @phy; @pid is unused. */
+static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 pid,
+	u32 seq, int flags, struct wpan_phy *phy)
+{
+	void *hdr;
+	uint32_t i, pages = 0;	/* unsigned so that i << 27 cannot overflow */
+	uint32_t *buf = kzalloc(32 * sizeof(uint32_t), GFP_KERNEL);
+
+	pr_debug("%s\n", __func__);
+	if (!buf)
+		return -ENOMEM;	/* allocation failure, not a full message */
+
+	hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags,
+		IEEE802154_LIST_PHY);
+	if (!hdr)
+		goto out;
+
+	mutex_lock(&phy->pib_lock);
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+	NLA_PUT_U8(msg, IEEE802154_ATTR_PAGE, phy->current_page);
+	NLA_PUT_U8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel);
+	/* Pack every non-empty page as (page number << 27) | channel bitmap. */
+	for (i = 0; i < 32; i++) {
+		if (phy->channels_supported[i])
+			buf[pages++] = phy->channels_supported[i] | (i << 27);
+	}
+	if (pages)
+		NLA_PUT(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST,
+				pages * sizeof(uint32_t), buf);
+
+	mutex_unlock(&phy->pib_lock);
+	kfree(buf);
+	return genlmsg_end(msg, hdr);
+
+nla_put_failure:	/* error target of the NLA_PUT* macros above */
+	mutex_unlock(&phy->pib_lock);
+	genlmsg_cancel(msg, hdr);
+out:
+	kfree(buf);
+	return -EMSGSIZE;
+}
+
+static int ieee802154_list_phy(struct sk_buff *skb,
+	struct genl_info *info)
+{
+	/* Request for the attributes of one PHY, selected by its name
+	   (IEEE802154_ATTR_PHY_NAME); answered with a LIST_PHY message */
+	struct sk_buff *msg;
+	struct wpan_phy *phy;
+	const char *name;
+	int rc = -ENOBUFS;
+
+	pr_debug("%s\n", __func__);
+
+	if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+	if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+		return -EINVAL; /* phy name should be null-terminated */
+
+	/* from here on every exit has to call wpan_phy_put() on phy */
+	phy = wpan_phy_find(name);
+	if (!phy)
+		return -ENODEV;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		goto out_dev;	/* rc still holds -ENOBUFS */
+
+	rc = ieee802154_nl_fill_phy(msg, info->snd_pid, info->snd_seq,
+			0, phy);
+	if (rc < 0)
+		goto out_free;
+
+	wpan_phy_put(phy);
+
+	return genlmsg_reply(msg, info);
+out_free:
+	nlmsg_free(msg);
+out_dev:
+	wpan_phy_put(phy);
+	return rc;
+
+}
+
+struct dump_phy_data {
+	struct sk_buff *skb;		/* buffer the PHY entries are put into */
+	struct netlink_callback *cb;	/* dump callback the request came with */
+	int idx, s_idx;			/* PHYs seen so far, index to resume at */
+};
+
+/*
+ * wpan_phy_for_each() callback: dump @phy unless it precedes the resume
+ * index; on failure the PHY is not counted and the error is returned.
+ */
+static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data)
+{
+	struct dump_phy_data *d = _data;
+	struct netlink_callback *cb = d->cb;
+	int pos = d->idx++;
+	int err;
+
+	pr_debug("%s\n", __func__);
+
+	if (pos < d->s_idx)
+		return 0;
+
+	err = ieee802154_nl_fill_phy(d->skb, NETLINK_CB(cb->skb).pid,
+			cb->nlh->nlmsg_seq, NLM_F_MULTI, phy);
+	if (err >= 0)
+		return 0;
+	d->idx--;
+	return err;
+}
+
+/* Netlink dump handler: walk all PHYs, resuming at the index in cb->args[0]. */
+static int ieee802154_dump_phy(struct sk_buff *skb,
+	struct netlink_callback *cb)
+{
+	struct dump_phy_data state;
+
+	state.skb = skb;
+	state.cb = cb;
+	state.idx = 0;
+	state.s_idx = cb->args[0];
+
+	pr_debug("%s\n", __func__);
+	wpan_phy_for_each(ieee802154_dump_phy_iter, &state);
+
+	/* Store how far this pass got back into the callback state. */
+	cb->args[0] = state.idx;
+	return skb->len;
+}
+
+/* ADD_IFACE handler: create an interface on the named PHY and report it. */
+static int ieee802154_add_iface(struct sk_buff *skb,
+		struct genl_info *info)
+{
+	struct sk_buff *msg;
+	struct wpan_phy *phy;
+	const char *name;
+	const char *devname;
+	int rc = -ENOBUFS;
+	struct net_device *dev;
+
+	pr_debug("%s\n", __func__);
+
+	if (!info->attrs[IEEE802154_ATTR_PHY_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+	if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0')
+		return -EINVAL; /* phy name should be null-terminated */
+
+	if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
+		devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+		if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1]
+				!= '\0')
+			return -EINVAL; /* dev name should be null-terminated */
+	} else  {
+		devname = "wpan%d";	/* default when no name is supplied */
+	}
+
+	if (strlen(devname) >= IFNAMSIZ)
+		return -ENAMETOOLONG;
+
+	phy = wpan_phy_find(name);
+	if (!phy)
+		return -ENODEV;
+
+	msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE);
+	if (!msg)
+		goto out_dev;
+
+	if (!phy->add_iface) {
+		rc = -EINVAL;
+		goto out_free;
+	}
+
+	dev = phy->add_iface(phy, devname);
+	if (IS_ERR(dev)) {
+		rc = PTR_ERR(dev);
+		goto out_free;
+	}
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, dev->name);
+	dev_put(dev);
+	wpan_phy_put(phy);
+	return ieee802154_nl_reply(msg, info);
+
+nla_put_failure:
+	dev_put(dev);	/* was leaked: only reached once add_iface() succeeded */
+out_free:
+	nlmsg_free(msg);
+out_dev:
+	wpan_phy_put(phy);
+	return rc;
+}
+
+/*
+ * DEL_IFACE handler: remove the named interface from its PHY. If a PHY name
+ * is supplied as well, it must refer to the PHY the device belongs to.
+ */
+static int ieee802154_del_iface(struct sk_buff *skb,
+		struct genl_info *info)
+{
+	struct sk_buff *msg;
+	struct wpan_phy *phy;
+	const char *name;
+	int rc;
+	struct net_device *dev;
+
+	pr_debug("%s\n", __func__);
+
+	if (!info->attrs[IEEE802154_ATTR_DEV_NAME])
+		return -EINVAL;
+
+	name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]);
+	if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0')
+		return -EINVAL; /* name should be null-terminated */
+
+	dev = dev_get_by_name(genl_info_net(info), name);
+	if (!dev)
+		return -ENODEV;
+
+	/* NOTE(review): dev->type is not checked before its mlme ops - confirm */
+	phy = ieee802154_mlme_ops(dev)->get_phy(dev);
+	BUG_ON(!phy);
+
+	rc = -EINVAL;
+	/* phy name is optional, but should be checked if it's given */
+	if (info->attrs[IEEE802154_ATTR_PHY_NAME]) {
+		struct wpan_phy *phy2;
+		const char *pname =
+			nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]);
+
+		if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1]
+				!= '\0')
+			/* name should be null-terminated */
+			goto out_dev;
+
+		phy2 = wpan_phy_find(pname);
+		if (!phy2)
+			goto out_dev;
+
+		/* only compared below; its reference leaked when phy2 == phy */
+		wpan_phy_put(phy2);
+		if (phy != phy2)
+			goto out_dev;
+	}
+
+	rc = -ENOBUFS;
+	msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE);
+	if (!msg)
+		goto out_dev;
+
+	if (!phy->del_iface) {
+		rc = -EINVAL;
+		goto nla_put_failure;
+	}
+
+	rtnl_lock();
+	phy->del_iface(phy, dev);
+
+	/* We don't have device anymore */
+	dev_put(dev);
+	dev = NULL;
+
+	rtnl_unlock();
+
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy));
+	NLA_PUT_STRING(msg, IEEE802154_ATTR_DEV_NAME, name);
+	wpan_phy_put(phy);
+	return ieee802154_nl_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+out_dev:
+	wpan_phy_put(phy);
+	if (dev)
+		dev_put(dev);
+	return rc;
+}
+
+static struct genl_ops ieee802154_phy_ops[] = {
+	IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
+							ieee802154_dump_phy),
+	IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface),
+	IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface),
+};	/* registered entry by entry in nl802154_phy_register() */
+
+/*
+ * Register each entry of ieee802154_phy_ops with nl802154_family and return
+ * the first error, or 0 once all of them are registered.
+ *
+ * No need to unregister as family unregistration will do it.
+ */
+int nl802154_phy_register(void)
+{
+	struct genl_ops *op = ieee802154_phy_ops;
+	int left = ARRAY_SIZE(ieee802154_phy_ops);
+	int rc = 0;
+
+	while (left-- > 0 && !rc)
+		rc = genl_register_ops(&nl802154_family, op++);
+
+	return rc;
+}
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
new file mode 100644
index 00000000..6adda4d4
--- /dev/null
+++ b/net/ieee802154/nl_policy.c
@@ -0,0 +1,56 @@
+/*
+ * nl_policy.c
+ *
+ * Copyright (C) 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/nl802154.h>
+
+#define NLA_HW_ADDR NLA_U64	/* hardware addresses are validated as u64 */
+
+const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
+	[IEEE802154_ATTR_DEV_NAME] = { .type = NLA_STRING, },
+	[IEEE802154_ATTR_DEV_INDEX] = { .type = NLA_U32, },
+	[IEEE802154_ATTR_PHY_NAME] = { .type = NLA_STRING, },
+
+	[IEEE802154_ATTR_STATUS] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, },
+	[IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_PAGE] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, },
+	[IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_SRC_SHORT_ADDR] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_SRC_HW_ADDR] = { .type = NLA_HW_ADDR, },
+	[IEEE802154_ATTR_SRC_PAN_ID] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_DEST_SHORT_ADDR] = { .type = NLA_U16, },
+	[IEEE802154_ATTR_DEST_HW_ADDR] = { .type = NLA_HW_ADDR, },
+	[IEEE802154_ATTR_DEST_PAN_ID] = { .type = NLA_U16, },
+
+	[IEEE802154_ATTR_CAPABILITY] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_REASON] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_SCAN_TYPE] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_CHANNELS] = { .type = NLA_U32, },
+	[IEEE802154_ATTR_DURATION] = { .type = NLA_U8, },
+	[IEEE802154_ATTR_ED_LIST] = { .len = 27 },	/* length in bytes */
+	[IEEE802154_ATTR_CHANNEL_PAGE_LIST] = { .len = 32 * 4, }, /* 32 u32s */
+};
+
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
new file mode 100644
index 00000000..10970ca8
--- /dev/null
+++ b/net/ieee802154/raw.c
@@ -0,0 +1,267 @@
+/*
+ * Raw IEEE 802.15.4 sockets
+ *
+ * Copyright 2007, 2008 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Written by:
+ * Sergey Lapin <slapin@ossfans.org>
+ * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+ */
+
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_ieee802154.h>
+
+#include "af802154.h"
+
+static HLIST_HEAD(raw_head);	/* sockets added by raw_hash() */
+static DEFINE_RWLOCK(raw_lock);	/* taken around every raw_head access */
+
+static void raw_hash(struct sock *sk)
+{
+	write_lock_bh(&raw_lock);	/* list update is done as a writer */
+	sk_add_node(sk, &raw_head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); /* one more user */
+	write_unlock_bh(&raw_lock);
+}
+
+static void raw_unhash(struct sock *sk)
+{
+	write_lock_bh(&raw_lock);
+	if (sk_del_node_init(sk))	/* count down only if sk was removed */
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&raw_lock);
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+	sk_common_release(sk);	/* @timeout is not used */
+}
+
+/*
+ * Bind @sk to the device that ieee802154_get_dev() finds for the address in
+ * @uaddr. Returns 0 on success, -EINVAL for a short address or a family
+ * other than AF_IEEE802154, and -ENODEV when no 802.15.4 device is found.
+ */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int len)
+{
+	struct sockaddr_ieee802154 *sa = (struct sockaddr_ieee802154 *)uaddr;
+	struct net_device *ndev;
+	int rc = -ENODEV;
+
+	if (len < sizeof(*sa) || sa->family != AF_IEEE802154)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	ndev = ieee802154_get_dev(sock_net(sk), &sa->addr);
+	if (!ndev)
+		goto unlock;
+
+	/* Only a device of type ARPHRD_IEEE802154 may be bound to. */
+	if (ndev->type == ARPHRD_IEEE802154) {
+		sk->sk_bound_dev_if = ndev->ifindex;
+		sk_dst_reset(sk);
+		rc = 0;
+	}
+
+	/* Only the ifindex is kept, so the device reference can go. */
+	dev_put(ndev);
+
+unlock:
+	release_sock(sk);
+
+	return rc;
+}
+
+static int raw_connect(struct sock *sk, struct sockaddr *uaddr,
+			int addr_len)
+{
+	return -ENOTSUPP; /* NOTE(review): kernel-internal errno; EOPNOTSUPP? */
+}
+
+static int raw_disconnect(struct sock *sk, int flags)
+{
+	return 0;	/* nothing is done; success is always reported */
+}
+
+static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t size)
+{
+	struct net_device *dev;
+	unsigned mtu;
+	struct sk_buff *skb;
+	int err;
+
+	if (msg->msg_flags & MSG_OOB) {
+		pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags);
+		return -EOPNOTSUPP;
+	}
+
+	/* unbound sockets transmit on the first ARPHRD_IEEE802154 device */
+	lock_sock(sk);
+	if (!sk->sk_bound_dev_if)
+		dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
+	else
+		dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if);
+	release_sock(sk);
+
+	if (!dev) {
+		pr_debug("no dev\n");
+		err = -ENXIO;
+		goto out;
+	}
+
+	mtu = dev->mtu;
+	pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
+
+	if (size > mtu) {	/* the payload has to fit into a single frame */
+		pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+		err = -EINVAL;
+		goto out_dev;
+	}
+
+	skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + size,
+			msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		goto out_dev;	/* err was filled in through &err above */
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+
+	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+	if (err < 0)
+		goto out_skb;
+
+	skb->dev = dev;
+	skb->sk  = sk;
+	skb->protocol = htons(ETH_P_IEEE802154);
+
+	dev_put(dev);	/* reference is dropped before the frame is queued */
+
+	err = dev_queue_xmit(skb);
+	if (err > 0)
+		err = net_xmit_errno(err);
+
+	return err ?: size;	/* @size on success, otherwise the error */
+
+out_skb:
+	kfree_skb(skb);
+out_dev:
+	dev_put(dev);
+out:
+	return err;
+}
+
+static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t len, int noblock, int flags, int *addr_len)
+{
+	size_t copied = 0;
+	int err = -EOPNOTSUPP;
+	struct sk_buff *skb;
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	/* a frame longer than @len is cut short and flagged MSG_TRUNC */
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	if (flags & MSG_TRUNC)	/* caller asked for the full frame length */
+		copied = skb->len;
+done:
+	skb_free_datagram(sk, skb);
+out:
+	if (err)
+		return err;
+	return copied;
+}
+
+static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (sock_queue_rcv_skb(sk, skb) < 0) {
+		kfree_skb(skb);	/* not queued, so the skb is freed here */
+		return NET_RX_DROP;
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+
+void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+	read_lock(&raw_lock);
+	sk_for_each(sk, node, &raw_head) {
+		bh_lock_sock(sk);
+		if (!sk->sk_bound_dev_if ||
+		    sk->sk_bound_dev_if == dev->ifindex) {
+			/* unbound or bound to @dev: queue a clone, keep @skb */
+			struct sk_buff *clone;
+
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (clone)
+				raw_rcv_skb(sk, clone);
+		}
+		bh_unlock_sock(sk);
+	}
+	read_unlock(&raw_lock);
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	return -EOPNOTSUPP;	/* every option is rejected */
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	return -EOPNOTSUPP;	/* every option is rejected */
+}
+
+struct proto ieee802154_raw_prot = {
+	.name		= "IEEE-802.15.4-RAW",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct sock),	/* plain struct sock is used */
+	.close		= raw_close,
+	.bind		= raw_bind,
+	.sendmsg	= raw_sendmsg,
+	.recvmsg	= raw_recvmsg,
+	.hash		= raw_hash,
+	.unhash		= raw_unhash,
+	.connect	= raw_connect,
+	.disconnect	= raw_disconnect,
+	.getsockopt	= raw_getsockopt,
+	.setsockopt	= raw_setsockopt,
+};
+
diff --git a/net/ieee802154/wpan-class.c b/net/ieee802154/wpan-class.c
new file mode 100644
index 00000000..1627ef2e
--- /dev/null
+++ b/net/ieee802154/wpan-class.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2007, 2008, 2009 Siemens AG
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+
+#include <net/wpan-phy.h>
+
+#include "ieee802154.h"
+
+#define MASTER_SHOW_COMPLEX(name, format_string, args...)		\
+static ssize_t name ## _show(struct device *dev,			\
+			    struct device_attribute *attr, char *buf)	\
+{	/* defines name_show(): prints args with format_string */	\
+	struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev);	\
+	int ret;							\
+	/* args may refer to phy; they are read under pib_lock */	\
+	mutex_lock(&phy->pib_lock);					\
+	ret = snprintf(buf, PAGE_SIZE, format_string "\n", args);	\
+	mutex_unlock(&phy->pib_lock);					\
+	return ret;							\
+}
+
+#define MASTER_SHOW(field, format_string)	/* one phy field */	\
+	MASTER_SHOW_COMPLEX(field, format_string, phy->field)
+
+MASTER_SHOW(current_channel, "%d");
+MASTER_SHOW(current_page, "%d");
+MASTER_SHOW_COMPLEX(transmit_power, "%d +- %d dB",
+	((signed char) (phy->transmit_power << 2)) >> 2, /* low 6 bits, signed */
+	(phy->transmit_power >> 6) ? (phy->transmit_power >> 6) * 3 : 1 );
+MASTER_SHOW(cca_mode, "%d");
+
+/* sysfs show(): one "%#09x" line per channel page, printed under pib_lock. */
+static ssize_t channels_supported_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev);
+	int page = 0, used = 0;
+
+	mutex_lock(&phy->pib_lock);
+	while (page < 32) {
+		int n = snprintf(buf + used, PAGE_SIZE - used,
+				"%#09x\n", phy->channels_supported[page++]);
+		if (n < 0)
+			break;
+		used += n;
+	}
+	mutex_unlock(&phy->pib_lock);
+	return used;
+}
+
+static struct device_attribute pmib_attrs[] = {
+	__ATTR_RO(current_channel),
+	__ATTR_RO(current_page),
+	__ATTR_RO(channels_supported),
+	__ATTR_RO(transmit_power),
+	__ATTR_RO(cca_mode),
+	{},	/* empty entry closes the list */
+};
+
+static void wpan_phy_release(struct device *d)
+{
+	struct wpan_phy *phy = container_of(d, struct wpan_phy, dev);
+	kfree(phy);	/* the phy was kzalloc()ed in wpan_phy_alloc() */
+}
+
+static struct class wpan_phy_class = {
+	.name = "ieee802154",
+	.dev_release = wpan_phy_release,	/* frees the wpan_phy */
+	.dev_attrs = pmib_attrs,
+};
+
+static DEFINE_MUTEX(wpan_phy_mutex);	/* held while wpan_phy_idx changes */
+static int wpan_phy_idx;	/* index given to the next allocated phy */
+
+static int wpan_phy_match(struct device *dev, void *data)
+{
+	return !strcmp(dev_name(dev), (const char *)data); /* 1 on equal names */
+}
+
+/*
+ * Find a PHY by its device name. Returns NULL when @str is NULL (which also
+ * triggers a WARN) or when no device of wpan_phy_class has that name.
+ */
+struct wpan_phy *wpan_phy_find(const char *str)
+{
+	struct device *found = NULL;
+
+	if (!WARN_ON(!str))
+		found = class_find_device(&wpan_phy_class, NULL,
+				(void *)str, wpan_phy_match);
+
+	return found ? container_of(found, struct wpan_phy, dev) : NULL;
+}
+EXPORT_SYMBOL(wpan_phy_find);
+
+struct wpan_phy_iter_data {
+	int (*fn)(struct wpan_phy *phy, void *data);	/* user callback */
+	void *data;	/* passed to fn unchanged */
+};
+
+static int wpan_phy_iter(struct device *dev, void *_data)
+{
+	struct wpan_phy_iter_data *wpid = _data;
+	struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev);
+	return wpid->fn(phy, wpid->data);	/* device -> phy adapter */
+}
+
+/* Run @fn(phy, @data) over wpan_phy_class via class_for_each_device(). */
+int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data),
+		void *data)
+{
+	struct wpan_phy_iter_data ctx;
+
+	ctx.fn = fn;
+	ctx.data = data;
+	return class_for_each_device(&wpan_phy_class, NULL, &ctx,
+			wpan_phy_iter);
+}
+EXPORT_SYMBOL(wpan_phy_for_each);
+
+static int wpan_phy_idx_valid(int idx)
+{
+	return idx >= 0;	/* presumably guards against counter wrap */
+}
+
+struct wpan_phy *wpan_phy_alloc(size_t priv_size)
+{
+	struct wpan_phy *phy = kzalloc(sizeof(*phy) + priv_size,
+			GFP_KERNEL);
+	/* Zeroed wpan_phy plus priv_size extra bytes; NULL on any failure. */
+	if (!phy)
+		goto out;
+	mutex_lock(&wpan_phy_mutex);
+	phy->idx = wpan_phy_idx++;	/* take the next index */
+	if (unlikely(!wpan_phy_idx_valid(phy->idx))) {
+		wpan_phy_idx--;	/* undo the increment, the index is unusable */
+		mutex_unlock(&wpan_phy_mutex);
+		kfree(phy);
+		goto out;
+	}
+	mutex_unlock(&wpan_phy_mutex);
+
+	mutex_init(&phy->pib_lock);
+
+	device_initialize(&phy->dev);
+	dev_set_name(&phy->dev, "wpan-phy%d", phy->idx);
+
+	phy->dev.class = &wpan_phy_class;
+
+	phy->current_channel = -1; /* not initialised */
+	phy->current_page = 0; /* for compatibility */
+
+	return phy;
+
+out:
+	return NULL;
+}
+EXPORT_SYMBOL(wpan_phy_alloc);
+
+int wpan_phy_register(struct wpan_phy *phy)
+{
+	return device_add(&phy->dev);	/* result of device_add() is returned */
+}
+EXPORT_SYMBOL(wpan_phy_register);
+
+void wpan_phy_unregister(struct wpan_phy *phy)
+{
+	device_del(&phy->dev);	/* counterpart of wpan_phy_register() */
+}
+EXPORT_SYMBOL(wpan_phy_unregister);
+
+void wpan_phy_free(struct wpan_phy *phy)
+{
+	put_device(&phy->dev);	/* memory is freed by wpan_phy_release() */
+}
+EXPORT_SYMBOL(wpan_phy_free);
+
+/*
+ * Register the PHY class, then netlink; a netlink failure undoes the class.
+ */
+static int __init wpan_phy_class_init(void)
+{
+	int rc = class_register(&wpan_phy_class);
+
+	if (rc)
+		return rc;
+
+	rc = ieee802154_nl_init();
+	if (!rc)
+		return 0;
+
+	class_unregister(&wpan_phy_class);
+	return rc;
+}
+subsys_initcall(wpan_phy_class_init);
+
+static void __exit wpan_phy_class_exit(void)
+{
+	ieee802154_nl_exit();	/* reverse order of wpan_phy_class_init() */
+	class_unregister(&wpan_phy_class);
+}
+module_exit(wpan_phy_class_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface");
+MODULE_AUTHOR("Dmitry Eremin-Solenikov");
+
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 00000000..cbb505ba
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,626 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+	bool "IP: multicasting"
+	help
+	  This is code for addressing several networked computers at once,
+	  enlarging your kernel by about 2 KB. You need multicasting if you
+	  intend to participate in the MBONE, a high bandwidth network on top
+	  of the Internet which carries audio and video broadcasts. More
+	  information about the MBONE is on the WWW at
+	  <http://www.savetz.com/mbone/>. Information about the multicast
+	  capabilities of the various network cards is contained in
+	  <file:Documentation/networking/multicast.txt>. For most people, it's
+	  safe to say N.
+
+config IP_ADVANCED_ROUTER
+	bool "IP: advanced router"
+	---help---
+	  If you intend to run your Linux box mostly as a router, i.e. as a
+	  computer that forwards and redistributes network packets, say Y; you
+	  will then be presented with several options that allow more precise
+	  control over the routing process.
+
+	  The answer to this question won't directly affect the kernel:
+	  answering N will just cause the configurator to skip all the
+	  questions about advanced routing.
+
+	  Note that your box can only act as a router if you enable IP
+	  forwarding in your kernel; you can do that by saying Y to "/proc
+	  file system support" and "Sysctl support" below and executing the
+	  line
+
+	  echo "1" > /proc/sys/net/ipv4/ip_forward
+
+	  at boot time after the /proc file system has been mounted.
+
+	  If you turn on IP forwarding, you should consider the rp_filter, which
+	  automatically rejects incoming packets if the routing table entry
+	  for their source address doesn't match the network interface they're
+	  arriving on. This has security advantages because it prevents the
+	  so-called IP spoofing, however it can pose problems if you use
+	  asymmetric routing (packets from you to a host take a different path
+	  than packets from that host to you) or if you operate a non-routing
+	  host which has several IP addresses on different interfaces. To turn
+	  rp_filter on use:
+
+	  echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+	   or
+	  echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+	  Note that some distributions enable it in startup scripts.
+	  For details about rp_filter strict and loose mode read
+	  <file:Documentation/networking/ip-sysctl.txt>.
+
+	  If unsure, say N here.
+
+config IP_FIB_TRIE_STATS
+	bool "FIB TRIE statistics"
+	depends on IP_ADVANCED_ROUTER
+	---help---
+	  Keep track of statistics on structure of FIB TRIE table.
+	  Useful for testing and measuring TRIE performance.
+
+config IP_MULTIPLE_TABLES
+	bool "IP: policy routing"
+	depends on IP_ADVANCED_ROUTER
+	select FIB_RULES
+	---help---
+	  Normally, a router decides what to do with a received packet based
+	  solely on the packet's final destination address. If you say Y here,
+	  the Linux router will also be able to take the packet's source
+	  address into account. Furthermore, the TOS (Type-Of-Service) field
+	  of the packet can be used for routing decisions as well.
+
+	  If you are interested in this, please see the preliminary
+	  documentation at <http://www.compendium.com.ar/policy-routing.txt>
+	  and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+	  You will need supporting software from
+	  <ftp://ftp.tux.org/pub/net/ip-routing/>.
+
+	  If unsure, say N.
+
+config IP_ROUTE_MULTIPATH
+	bool "IP: equal cost multipath"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  Normally, the routing tables specify a single action to be taken in
+	  a deterministic manner for a given packet. If you say Y here
+	  however, it becomes possible to attach several actions to a packet
+	  pattern, in effect specifying several alternative paths to travel
+	  for those packets. The router considers all these paths to be of
+	  equal "cost" and chooses one of them in a non-deterministic fashion
+	  if a matching packet arrives.
+
+config IP_ROUTE_VERBOSE
+	bool "IP: verbose route monitoring"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  If you say Y here, which is recommended, then the kernel will print
+	  verbose messages regarding the routing, for example warnings about
+	  received packets which look strange and could be evidence of an
+	  attack or a misconfigured system somewhere. The information is
+	  handled by the klogd daemon which is responsible for kernel messages
+	  ("man klogd").
+
+config IP_ROUTE_CLASSID
+	bool
+
+config IP_PNP
+	bool "IP: kernel level autoconfiguration"
+	help
+	  This enables automatic configuration of IP addresses of devices and
+	  of the routing table during kernel boot, based on either information
+	  supplied on the kernel command line or by BOOTP or RARP protocols.
+	  You need to say Y only for diskless machines requiring network
+	  access to boot (in which case you want to say Y to "Root file system
+	  on NFS" as well), because all other machines configure the network
+	  in their startup scripts.
+
+config IP_PNP_DHCP
+	bool "IP: DHCP support"
+	depends on IP_PNP
+	---help---
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the DHCP protocol (a
+	  special protocol designed for doing this job), say Y here. In case
+	  the boot ROM of your network card was designed for booting Linux and
+	  does DHCP itself, providing all necessary information on the kernel
+	  command line, you can say N here.
+
+	  If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+	  must be operating on your network.  Read
+	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+	bool "IP: BOOTP support"
+	depends on IP_PNP
+	---help---
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the BOOTP protocol (a
+	  special protocol designed for doing this job), say Y here. In case
+	  the boot ROM of your network card was designed for booting Linux and
+	  does BOOTP itself, providing all necessary information on the kernel
+	  command line, you can say N here. If unsure, say Y. Note that if you
+	  want to use BOOTP, a BOOTP server must be operating on your network.
+	  Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+	bool "IP: RARP support"
+	depends on IP_PNP
+	help
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the RARP protocol (an
+	  older protocol which is being obsoleted by BOOTP and DHCP), say Y
+	  here. Note that if you want to use RARP, a RARP server must be
+	  operating on your network. Read
+	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+# not yet ready..
+#   bool '    IP: ARP support' CONFIG_IP_PNP_ARP
+config NET_IPIP
+	tristate "IP: tunneling"
+	select INET_TUNNEL
+	---help---
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This particular tunneling driver implements
+	  encapsulation of IP within IP, which sounds kind of pointless, but
+	  can be useful if you want to make your (or some other) machine
+	  appear on a different network than it physically is, or to use
+	  mobile-IP facilities (allowing laptops to seamlessly move between
+	  networks without changing their IP addresses).
+
+	  Saying Y to this option will produce two modules ( = code which can
+	  be inserted in and removed from the running kernel whenever you
+	  want). Most people won't need this and can say N.
+
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	 This is a helper module to demultiplex GRE packets on GRE version field criteria.
+	 Required by ip_gre and pptp modules.
+
+config NET_IPGRE
+	tristate "IP: GRE tunnels over IP"
+	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+	help
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This particular tunneling driver implements
+	  GRE (Generic Routing Encapsulation) and at this time allows
+	  encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+	  This driver is useful if the other endpoint is a Cisco router: Cisco
+	  likes GRE much better than the other Linux tunneling driver ("IP
+	  tunneling" above). In addition, GRE allows multicast redistribution
+	  through the tunnel.
+
+config NET_IPGRE_BROADCAST
+	bool "IP: broadcast GRE over IP"
+	depends on IP_MULTICAST && NET_IPGRE
+	help
+	  One application of GRE/IP is to construct a broadcast WAN (Wide Area
+	  Network), which looks like a normal Ethernet LAN (Local Area
+	  Network), but can be distributed all over the Internet. If you want
+	  to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+	bool "IP: multicast routing"
+	depends on IP_MULTICAST
+	help
+	  This is used if you want your machine to act as a router for IP
+	  packets that have several destination addresses. It is needed on the
+	  MBONE, a high bandwidth network on top of the Internet which carries
+	  audio and video broadcasts. In order to do that, you would most
+	  likely run the program mrouted. Information about the multicast
+	  capabilities of the various network cards is contained in
+	  <file:Documentation/networking/multicast.txt>. If you haven't heard
+	  about it, you don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+	bool "IP: multicast policy routing"
+	depends on IP_MROUTE && IP_ADVANCED_ROUTER
+	select FIB_RULES
+	help
+	  Normally, a multicast router runs a userspace daemon and decides
+	  what to do with a multicast packet based on the source and
+	  destination addresses. If you say Y here, the multicast router
+	  will also be able to take interfaces and packet marks into
+	  account and run multiple instances of userspace daemons
+	  simultaneously, each one handling a single table.
+
+	  If unsure, say N.
+
+config IP_PIMSM_V1
+	bool "IP: PIM-SM version 1 support"
+	depends on IP_MROUTE
+	help
+	  Kernel side support for Sparse Mode PIM (Protocol Independent
+	  Multicast) version 1. This multicast routing protocol is used widely
+	  because Cisco supports it. You need special software to use it
+	  (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+	  information about PIM.
+
+	  Say Y if you want to use PIM-SM v1. Note that you can say N here if
+	  you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+	bool "IP: PIM-SM version 2 support"
+	depends on IP_MROUTE
+	help
+	  Kernel side support for Sparse Mode PIM version 2. In order to use
+	  this, you need an experimental routing daemon supporting it (pimd or
+	  gated-5). This routing protocol is not used widely, so say N unless
+	  you want to play with it.
+
+config ARPD
+	bool "IP: ARP daemon support"
+	---help---
+	  The kernel maintains an internal cache which maps IP addresses to
+	  hardware addresses on the local network, so that Ethernet/Token Ring/
+	  etc. frames are sent to the proper address on the physical networking
+	  layer. Normally, kernel uses the ARP protocol to resolve these
+	  mappings.
+
+	  Saying Y here adds support to have a user space daemon to do this
+	  resolution instead. This is useful for implementing an alternate
+	  address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
+	  testing purposes.
+
+	  If unsure, say N.
+
+config SYN_COOKIES
+	bool "IP: TCP syncookie support"
+	---help---
+	  Normal TCP/IP networking is open to an attack known as "SYN
+	  flooding". This denial-of-service attack prevents legitimate remote
+	  users from being able to connect to your computer during an ongoing
+	  attack and requires very little work from the attacker, who can
+	  operate from anywhere on the Internet.
+
+	  SYN cookies provide protection against this type of attack. If you
+	  say Y here, the TCP/IP stack will use a cryptographic challenge
+	  protocol known as "SYN cookies" to enable legitimate users to
+	  continue to connect, even when your machine is under attack. There
+	  is no need for the legitimate users to change their TCP/IP software;
+	  SYN cookies work transparently to them. For technical information
+	  about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+	  If you are SYN flooded, the source address reported by the kernel is
+	  likely to have been forged by the attacker; it is only reported as
+	  an aid in tracing the packets to their actual source and should not
+	  be taken as absolute truth.
+
+	  SYN cookies may prevent correct error reporting on clients when the
+	  server is really overloaded. If this happens frequently, it is better
+	  to turn them off.
+
+	  If you say Y here, you can disable SYN cookies at run time by
+	  saying Y to "/proc file system support" and
+	  "Sysctl support" below and executing the command
+
+	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
+
+	  after the /proc file system has been mounted.
+
+	  If unsure, say N.
+
+config INET_AH
+	tristate "IP: AH transformation"
+	select XFRM
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	---help---
+	  Support for IPsec AH.
+
+	  If unsure, say Y.
+
+config INET_ESP
+	tristate "IP: ESP transformation"
+	select XFRM
+	select CRYPTO
+	select CRYPTO_AUTHENC
+	select CRYPTO_HMAC
+	select CRYPTO_MD5
+	select CRYPTO_CBC
+	select CRYPTO_SHA1
+	select CRYPTO_DES
+	---help---
+	  Support for IPsec ESP.
+
+	  If unsure, say Y.
+
+config INET_IPCOMP
+	tristate "IP: IPComp transformation"
+	select INET_XFRM_TUNNEL
+	select XFRM_IPCOMP
+	---help---
+	  Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+	  typically needed for IPsec.
+
+	  If unsure, say Y.
+
+config INET_XFRM_TUNNEL
+	tristate
+	select INET_TUNNEL
+	default n
+
+config INET_TUNNEL
+	tristate
+	default n
+
+config INET_XFRM_MODE_TRANSPORT
+	tristate "IP: IPsec transport mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+	tristate "IP: IPsec tunnel mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
+config INET_XFRM_MODE_BEET
+	tristate "IP: IPsec BEET mode"
+	default y
+	select XFRM
+	---help---
+	  Support for IPsec BEET mode.
+
+	  If unsure, say Y.
+
+config INET_LRO
+	tristate "Large Receive Offload (ipv4/tcp)"
+	default y
+	---help---
+	  Support for Large Receive Offload (ipv4/tcp).
+
+	  If unsure, say Y.
+
+config INET_DIAG
+	tristate "INET: socket monitoring interface"
+	default y
+	---help---
+	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+	  native Linux tools such as ss. ss is included in iproute2, currently
+	  downloadable at:
+	  
+	    http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
+
+	  If unsure, say Y.
+
+config INET_TCP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
+
+menuconfig TCP_CONG_ADVANCED
+	bool "TCP: advanced congestion control"
+	---help---
+	  Support for selection of various TCP congestion control
+	  modules.
+
+	  Nearly all users can safely say no here, and a safe default
+	  selection will be made (CUBIC with new Reno as a fallback).
+
+	  If unsure, say N.
+
+if TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+	tristate "Binary Increase Congestion (BIC) control"
+	default m
+	---help---
+	BIC-TCP is a sender-side only change that ensures a linear RTT
+	fairness under large windows while offering both scalability and
+	bounded TCP-friendliness. The protocol combines two schemes
+	called additive increase and binary search increase. When the
+	congestion window is large, additive increase with a large
+	increment ensures linear RTT fairness as well as good
+	scalability. Under small congestion windows, binary search
+	increase provides TCP friendliness.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_CUBIC
+	tristate "CUBIC TCP"
+	default y
+	---help---
+	This is version 2.0 of BIC-TCP which uses a cubic growth function
+	among other techniques.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
+config TCP_CONG_WESTWOOD
+	tristate "TCP Westwood+"
+	default m
+	---help---
+	TCP Westwood+ is a sender-side only modification of the TCP Reno
+	protocol stack that optimizes the performance of TCP congestion
+	control. It is based on end-to-end bandwidth estimation to set
+	congestion window and slow start threshold after a congestion
+	episode. Using this estimation, TCP Westwood+ adaptively sets a
+	slow start threshold and a congestion window which takes into
+	account the bandwidth used at the time congestion is experienced.
+	TCP Westwood+ significantly increases fairness wrt TCP Reno in
+	wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+        tristate "H-TCP"
+        default m
+	---help---
+	H-TCP is a sender-side only modification of the TCP Reno
+	protocol stack that optimizes the performance of TCP
+	congestion control for high speed network links. It uses a
+	modeswitch to change the alpha and beta parameters of TCP Reno
+	based on network conditions and in a way so as to be fair with
+	other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+	tristate "High Speed TCP"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+	A modification to TCP's congestion control mechanism for use
+	with large congestion windows. A table indicates how much to
+	increase the congestion window by when an ACK is received.
+	For more detail see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+	tristate "TCP-Hybla congestion control algorithm"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP-Hybla is a sender-side only change that eliminates penalization of
+	long-RTT, large-bandwidth connections, like when satellite legs are
+	involved, especially when sharing a common bottleneck with normal
+	terrestrial connections.
+
+config TCP_CONG_VEGAS
+	tristate "TCP Vegas"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP Vegas is a sender-side only change to TCP that anticipates
+	the onset of congestion by estimating the bandwidth. TCP Vegas
+	adjusts the sending rate by modifying the congestion
+	window. TCP Vegas should provide less packet loss, but it is
+	not as aggressive as TCP Reno.
+
+config TCP_CONG_SCALABLE
+	tristate "Scalable TCP"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	Scalable TCP is a sender-side only change to TCP which uses a
+	MIMD congestion control algorithm which has some nice scaling
+	properties, though is known to have fairness issues.
+	See http://www.deneholme.net/tom/scalable/
+
+config TCP_CONG_LP
+	tristate "TCP Low Priority"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+	to utilize only the excess network bandwidth as compared to the
+	``fair share`` of bandwidth as targeted by TCP.
+	See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+	tristate "TCP Veno"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP Veno is a sender-side only enhancement of TCP to obtain better
+	throughput over wireless networks. TCP Veno makes use of state
+	distinguishing to circumvent the difficult judgment of the packet loss
+	type. TCP Veno cuts down less congestion window in response to random
+	loss packets.
+	See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> 
+
+config TCP_CONG_YEAH
+	tristate "YeAH TCP"
+	depends on EXPERIMENTAL
+	select TCP_CONG_VEGAS
+	default n
+	---help---
+	YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+	algorithm, which uses a mixed loss/delay approach to compute the
+	congestion window. Its design goals target high efficiency,
+	internal, RTT and Reno fairness, resilience to link loss while
+	keeping network elements load as low as possible.
+
+	For further details look here:
+	  http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+	tristate "TCP Illinois"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	TCP-Illinois is a sender-side modification of TCP Reno for
+	high speed long delay links. It uses round-trip-time to
+	adjust the alpha and beta parameters to achieve a higher average
+	throughput and maintain fairness.
+
+	For further details see:
+	  http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+choice
+	prompt "Default TCP congestion control"
+	default DEFAULT_CUBIC
+	help
+	  Select the TCP congestion control that will be used by default
+	  for all connections.
+
+	config DEFAULT_BIC
+		bool "Bic" if TCP_CONG_BIC=y
+
+	config DEFAULT_CUBIC
+		bool "Cubic" if TCP_CONG_CUBIC=y
+
+	config DEFAULT_HTCP
+		bool "Htcp" if TCP_CONG_HTCP=y
+
+	config DEFAULT_HYBLA
+		bool "Hybla" if TCP_CONG_HYBLA=y
+
+	config DEFAULT_VEGAS
+		bool "Vegas" if TCP_CONG_VEGAS=y
+
+	config DEFAULT_VENO
+		bool "Veno" if TCP_CONG_VENO=y
+
+	config DEFAULT_WESTWOOD
+		bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+	config DEFAULT_RENO
+		bool "Reno"
+
+endchoice
+
+endif
+
+config TCP_CONG_CUBIC
+	tristate
+	depends on !TCP_CONG_ADVANCED
+	default y
+
+config DEFAULT_TCP_CONG
+	string
+	default "bic" if DEFAULT_BIC
+	default "cubic" if DEFAULT_CUBIC
+	default "htcp" if DEFAULT_HTCP
+	default "hybla" if DEFAULT_HYBLA
+	default "vegas" if DEFAULT_VEGAS
+	default "westwood" if DEFAULT_WESTWOOD
+	default "veno" if DEFAULT_VENO
+	default "reno" if DEFAULT_RENO
+	default "cubic"
+
+config TCP_MD5SIG
+	bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select CRYPTO
+	select CRYPTO_MD5
+	---help---
+	  RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+	  Its main (only?) use is to protect BGP sessions between core routers
+	  on the Internet.
+
+	  If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 00000000..681084d7
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,54 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y     := route.o inetpeer.o protocol.o \
+	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+	     ip_output.o ip_sockglue.o inet_hashtables.o \
+	     inet_timewait_sock.o inet_connection_sock.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp_minisocks.o tcp_cong.o \
+	     datagram.o raw.o udp.o udplite.o \
+	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o \
+	     inet_fragment.o ping.o
+
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
+obj-$(CONFIG_INET_LRO) += inet_lro.o
+obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
+obj-$(CONFIG_INET_DIAG) += inet_diag.o 
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+		      xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 00000000..4d60f12c
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1823 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		PF_INET protocol family socket handler.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ *		piggy,
+ *		Karl Knutson	:	Socket protocol table
+ *		A.N.Kuznetsov	:	Socket death error in accept().
+ *		John Richardson :	Fix non blocking error in connect()
+ *					so sockets that fail to connect
+ *					don't return -EINPROGRESS.
+ *		Alan Cox	:	Asynchronous I/O support
+ *		Alan Cox	:	Keep correct socket pointer on sock
+ *					structures
+ *					when accept() ed
+ *		Alan Cox	:	Semantics of SO_LINGER aren't state
+ *					moved to close when you look carefully.
+ *					With this fixed and the accept bug fixed
+ *					some RPC stuff seems happier.
+ *		Niibe Yutaka	:	4.4BSD style write async I/O
+ *		Alan Cox,
+ *		Tony Gale 	:	Fixed reuse semantics.
+ *		Alan Cox	:	bind() shouldn't abort existing but dead
+ *					sockets. Stops FTP netin:.. I hope.
+ *		Alan Cox	:	bind() works correctly for RAW sockets.
+ *					Note that FreeBSD at least was broken
+ *					in this respect so be careful with
+ *					compatibility tests...
+ *		Alan Cox	:	routing cache support
+ *		Alan Cox	:	memzero the socket structure for
+ *					compactness.
+ *		Matt Day	:	nonblock connect error handler
+ *		Alan Cox	:	Allow large numbers of pending sockets
+ *					(eg for big web sites), but only if
+ *					specifically application requested.
+ *		Alan Cox	:	New buffering throughout IP. Used
+ *					dumbly.
+ *		Alan Cox	:	New buffering now used smartly.
+ *		Alan Cox	:	BSD rather than common sense
+ *					interpretation of listen.
+ *		Germano Caronni	:	Assorted small races.
+ *		Alan Cox	:	sendmsg/recvmsg basic support.
+ *		Alan Cox	:	Only sendmsg/recvmsg now supported.
+ *		Alan Cox	:	Locked down bind (see security list).
+ *		Alan Cox	:	Loosened bind a little.
+ *		Mike McLagan	:	ADD/DEL DLCI Ioctls
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Some other random speedups.
+ *		Cyrus Durgin	:	Cleaned up file for kmod hacks.
+ *		Andi Kleen	:	Fix inet_stream_connect TCP race.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+/* Nonzero if the caller is in effective group AID_INET or has CAP_NET_RAW. */
+static inline int current_has_network(void)
+{
+	return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else /* !CONFIG_ANDROID_PARANOID_NETWORK */
+static inline int current_has_network(void)
+{
+	return 1;	/* no restriction: every caller may create sockets */
+}
+#endif /* CONFIG_ANDROID_PARANOID_NETWORK */
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+struct ipv4_config ipv4_config;
+EXPORT_SYMBOL(ipv4_config);
+
+/* Destructor for inet sockets: purges the receive and error queues, then
+ * frees the IP options and cached route unless the socket is still live. */
+void inet_sock_destruct(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	__skb_queue_purge(&sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_error_queue);
+
+	sk_mem_reclaim(sk);
+
+	if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+		pr_err("Attempt to release TCP socket in state %d %p\n",
+		       sk->sk_state, sk);
+		return;	/* options and cached route are left untouched */
+	}
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		pr_err("Attempt to release alive inet socket %p\n", sk);
+		return;	/* options and cached route are left untouched */
+	}
+
+	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+	WARN_ON(sk->sk_wmem_queued);
+	WARN_ON(sk->sk_forward_alloc);
+
+	kfree(rcu_dereference_protected(inet->inet_opt, 1));
+	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+	sk_refcnt_debug_dec(sk);
+}
+EXPORT_SYMBOL(inet_sock_destruct);
+
+/*
+ *	The routines beyond this point handle the behaviour of an AF_INET
+ *	socket object. Mostly it punts to the subprotocols of IP to do
+ *	the work.
+ */
+
+/*
+ *	Give a socket without a local port one via get_port(); 0 or -EAGAIN.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int err = 0;
+
+	lock_sock(sk);
+	/* Nothing to do if a local port is already assigned. */
+	if (!inet->inet_num) {
+		if (sk->sk_prot->get_port(sk, 0))
+			err = -EAGAIN;
+		else
+			inet->inet_sport = htons(inet->inet_num);
+	}
+	release_sock(sk);
+	return err;
+}
+
+/*
+ *	Put a stream socket into the listening state, or adjust its backlog.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	unsigned char state;
+	int err = -EINVAL;
+
+	lock_sock(sk);
+
+	/* Only an unconnected SOCK_STREAM socket may listen. */
+	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+		goto unlock;
+
+	state = sk->sk_state;
+	if (!((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
+		goto unlock;
+
+	/* A socket that is already listening only has its backlog
+	 * adjusted; any other socket must be started first.
+	 */
+	if (state != TCP_LISTEN) {
+		err = inet_csk_listen_start(sk, backlog);
+		if (err)
+			goto unlock;
+	}
+	sk->sk_max_ack_backlog = backlog;
+	err = 0;
+
+unlock:
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(inet_listen);
+
+u32 inet_ehash_secret __read_mostly;
+EXPORT_SYMBOL(inet_ehash_secret);
+
+/*
+ * Seed inet_ehash_secret with a non-zero random value, unless already set.
+ */
+void build_ehash_secret(void)
+{
+	u32 secret = 0;
+
+	/* zero means "not set yet", so draw again until non-zero */
+	while (!secret)
+		get_random_bytes(&secret, sizeof(secret));
+
+	cmpxchg(&inet_ehash_secret, 0, secret);
+}
+EXPORT_SYMBOL(build_ehash_secret);
+
+static inline int inet_netns_ok(struct net *net, int protocol)
+{
+	const struct net_protocol *proto;
+
+	/* The initial namespace may use every protocol. */
+	if (net_eq(net, &init_net))
+		return 1;
+
+	proto = rcu_dereference(inet_protos[protocol & (MAX_INET_PROTOS - 1)]);
+
+	/* No handler registered: raw IP is OK. */
+	if (!proto)
+		return 1;
+
+	return proto->netns_ok;
+}
+
+
+/*
+ *	Create an inet socket.
+ *
+ *	@net:      network namespace the socket is created in
+ *	@sock:     generic socket being set up; sock->type selects the inetsw[]
+ *	           list that is searched
+ *	@protocol: requested IP protocol; IPPROTO_IP acts as a wild card that
+ *	           is replaced by the protocol of the first matching entry
+ *	@kern:     non-zero for kernel-internal sockets (skips the CAP_NET_RAW
+ *	           check for SOCK_RAW)
+ *
+ *	Returns 0 on success or a negative errno:
+ *	  -EACCES          caller has no network access
+ *	  -EINVAL          @protocol is outside [0, IPPROTO_MAX)
+ *	  -ESOCKTNOSUPPORT no entry is registered for sock->type
+ *	  -EPROTONOSUPPORT no entry matches @protocol, even after trying to
+ *	                   load a module
+ *	  -EPERM           SOCK_RAW requested without CAP_NET_RAW
+ *	  -EAFNOSUPPORT    protocol is not allowed in this namespace
+ *	  -ENOBUFS         sk_alloc() failed
+ *	  or whatever the protocol's init() hook returns.
+ */
+
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	struct sock *sk;
+	struct inet_protosw *answer;
+	struct inet_sock *inet;
+	struct proto *answer_prot;
+	unsigned char answer_flags;
+	char answer_no_check;
+	int try_loading_module = 0;
+	int err;
+
+	if (!current_has_network())
+		return -EACCES;
+
+	/* The SOCK_RAW wild-card entry below matches any protocol number,
+	 * so an unchecked value would be stored verbatim in inet_num and
+	 * sk_protocol and handed to request_module(). Reject anything that
+	 * is not a valid IP protocol number up front.
+	 */
+	if (protocol < 0 || protocol >= IPPROTO_MAX)
+		return -EINVAL;
+
+	/* Only hashed (non raw, non datagram) sockets need the secret. */
+	if (unlikely(!inet_ehash_secret))
+		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+			build_ehash_secret();
+
+	sock->state = SS_UNCONNECTED;
+
+	/* Look for the requested type/protocol pair. */
+lookup_protocol:
+	err = -ESOCKTNOSUPPORT;
+	rcu_read_lock();
+	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+
+		err = 0;
+		/* Check the non-wild match. */
+		if (protocol == answer->protocol) {
+			if (protocol != IPPROTO_IP)
+				break;
+		} else {
+			/* Check for the two wild cases. */
+			if (IPPROTO_IP == protocol) {
+				protocol = answer->protocol;
+				break;
+			}
+			if (IPPROTO_IP == answer->protocol)
+				break;
+		}
+		err = -EPROTONOSUPPORT;
+	}
+
+	if (unlikely(err)) {
+		/* Up to two module load attempts, then give up. */
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-2-proto-132-type-1
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+					       PF_INET, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g. net-pf-2-proto-132
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+			 */
+			else
+				request_module("net-pf-%d-proto-%d",
+					       PF_INET, protocol);
+			goto lookup_protocol;
+		} else
+			goto out_rcu_unlock;
+	}
+
+	err = -EPERM;
+	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+		goto out_rcu_unlock;
+
+	err = -EAFNOSUPPORT;
+	if (!inet_netns_ok(net, protocol))
+		goto out_rcu_unlock;
+
+	/* Copy what we need out of the entry before leaving the RCU
+	 * read-side section; 'answer' is not touched afterwards.
+	 */
+	sock->ops = answer->ops;
+	answer_prot = answer->prot;
+	answer_no_check = answer->no_check;
+	answer_flags = answer->flags;
+	rcu_read_unlock();
+
+	WARN_ON(answer_prot->slab == NULL);
+
+	err = -ENOBUFS;
+	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+	if (sk == NULL)
+		goto out;
+
+	err = 0;
+	sk->sk_no_check = answer_no_check;
+	if (INET_PROTOSW_REUSE & answer_flags)
+		sk->sk_reuse = 1;
+
+	inet = inet_sk(sk);
+	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+
+	inet->nodefrag = 0;
+
+	/* Raw sockets use the protocol number as their "port". */
+	if (SOCK_RAW == sock->type) {
+		inet->inet_num = protocol;
+		if (IPPROTO_RAW == protocol)
+			inet->hdrincl = 1;
+	}
+
+	if (ipv4_config.no_pmtu_disc)
+		inet->pmtudisc = IP_PMTUDISC_DONT;
+	else
+		inet->pmtudisc = IP_PMTUDISC_WANT;
+
+	inet->inet_id = 0;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_destruct	   = inet_sock_destruct;
+	sk->sk_protocol	   = protocol;
+	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+	inet->uc_ttl	= -1;
+	inet->mc_loop	= 1;
+	inet->mc_ttl	= 1;
+	inet->mc_all	= 1;
+	inet->mc_index	= 0;
+	inet->mc_list	= NULL;
+
+	sk_refcnt_debug_inc(sk);
+
+	if (inet->inet_num) {
+		/* It assumes that any protocol which allows
+		 * the user to assign a number at socket
+		 * creation time automatically
+		 * shares.
+		 */
+		inet->inet_sport = htons(inet->inet_num);
+		/* Add to protocol hash chains. */
+		sk->sk_prot->hash(sk);
+	}
+
+	if (sk->sk_prot->init) {
+		err = sk->sk_prot->init(sk);
+		if (err)
+			sk_common_release(sk);
+	}
+out:
+	return err;
+out_rcu_unlock:
+	rcu_read_unlock();
+	goto out;
+}
+
+
+/*
+ *	The peer socket should always be NULL (or else). When we call this
+ *	function we are destroying the object and from then on nobody
+ *	should refer to it.
+ *
+ *	Detaches the struct sock from @sock and hands it to the protocol's
+ *	close() hook. Always returns 0.
+ */
+int inet_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	long linger = 0;
+
+	if (!sk)
+		return 0;
+
+	sock_rps_reset_flow(sk);
+
+	/* Applications forget to leave groups before exiting */
+	ip_mc_drop_socket(sk);
+
+	/* With linger set we do not return until the close is complete;
+	 * otherwise we return immediately. The actual closing is done the
+	 * same way in both cases. A process that is exiting never lingers.
+	 */
+	if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+		linger = sk->sk_lingertime;
+
+	sock->sk = NULL;
+	sk->sk_prot->close(sk, linger);
+	return 0;
+}
+EXPORT_SYMBOL(inet_release);
+
+/* Non-zero lets inet_bind() accept addresses that are not local to this
+ * host. It is off by default (zero-initialised); the rationale is given in
+ * the comment inside inet_bind().
+ */
+int sysctl_ip_nonlocal_bind __read_mostly;
+EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
+
+/*
+ *	Bind an AF_INET socket to a local address and port.
+ *
+ *	If the protocol supplies its own bind() hook, the call is delegated to
+ *	it unchanged. Otherwise the address is validated, the port is reserved
+ *	through the protocol's get_port() hook and the socket's local address
+ *	fields are filled in.
+ *
+ *	Returns 0 on success or:
+ *	  -EINVAL        @addr_len too short, or the socket is not in
+ *	                 TCP_CLOSE or already has a local port
+ *	  -EAFNOSUPPORT  sin_family is not AF_INET
+ *	  -EADDRNOTAVAIL address is neither INADDR_ANY nor local, multicast or
+ *	                 broadcast, and non-local binds are not enabled
+ *	  -EACCES        port below PROT_SOCK without CAP_NET_BIND_SERVICE
+ *	  -EADDRINUSE    get_port() refused the requested port
+ */
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	int err;
+
+	/* If the socket has its own bind function then use it. (RAW) */
+	if (sk->sk_prot->bind) {
+		err = sk->sk_prot->bind(sk, uaddr, addr_len);
+		goto out;
+	}
+	err = -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_in))
+		goto out;
+
+	if (addr->sin_family != AF_INET) {
+		err = -EAFNOSUPPORT;
+		goto out;
+	}
+
+	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+
+	/* Not specified by any standard per-se, however it breaks too
+	 * many applications when removed.  It is unfortunate since
+	 * allowing applications to make a non-local bind solves
+	 * several problems with systems using dynamic addressing.
+	 * (ie. your servers still start up even if your ISDN link
+	 *  is temporarily down)
+	 */
+	err = -EADDRNOTAVAIL;
+	if (!sysctl_ip_nonlocal_bind &&
+	    !(inet->freebind || inet->transparent) &&
+	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
+	    chk_addr_ret != RTN_LOCAL &&
+	    chk_addr_ret != RTN_MULTICAST &&
+	    chk_addr_ret != RTN_BROADCAST)
+		goto out;
+
+	/* Port 0 means "pick one for me" and is exempt from the check. */
+	snum = ntohs(addr->sin_port);
+	err = -EACCES;
+	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		goto out;
+
+	/*      We keep a pair of addresses. rcv_saddr is the one
+	 *      used by hash lookups, and saddr is used for transmit.
+	 *
+	 *      In the BSD API these are the same except where it
+	 *      would be illegal to use them (multicast/broadcast) in
+	 *      which case the sending device address is used.
+	 */
+	lock_sock(sk);
+
+	/* Check these errors (active socket, double bind). */
+	err = -EINVAL;
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
+		goto out_release_sock;
+
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+		inet->inet_saddr = 0;  /* Use device */
+
+	/* Make sure we are allowed to bind here. */
+	if (sk->sk_prot->get_port(sk, snum)) {
+		/* Undo the address assignment made above. */
+		inet->inet_saddr = inet->inet_rcv_saddr = 0;
+		err = -EADDRINUSE;
+		goto out_release_sock;
+	}
+
+	/* Record which parts of the binding were chosen by the user. */
+	if (inet->inet_rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_daddr = 0;
+	inet->inet_dport = 0;
+	sk_dst_reset(sk);
+	err = 0;
+out_release_sock:
+	release_sock(sk);
+out:
+	return err;
+}
+EXPORT_SYMBOL(inet_bind);
+
+/*
+ *	connect() for datagram-style sockets.
+ *
+ *	An AF_UNSPEC address disconnects the socket through the protocol's
+ *	disconnect() hook. Otherwise the socket is auto-bound if it has no
+ *	local port yet and the protocol's connect() hook is called.
+ *
+ *	Returns -EINVAL when @addr_len cannot even hold sa_family, -EAGAIN when
+ *	auto-binding fails, else the result of the protocol hook.
+ */
+int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+		       int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+
+	if (uaddr->sa_family == AF_UNSPEC)
+		return sk->sk_prot->disconnect(sk, flags);
+
+	if (inet_sk(sk)->inet_num == 0) {
+		if (inet_autobind(sk))
+			return -EAGAIN;
+	}
+
+	return sk->sk_prot->connect(sk, uaddr, addr_len);
+}
+EXPORT_SYMBOL(inet_dgram_connect);
+
+/*
+ * Sleep until the socket leaves TCP_SYN_SENT/TCP_SYN_RECV, a signal is
+ * pending, or @timeo expires.
+ *
+ * Called with the socket lock held: the lock is dropped around
+ * schedule_timeout() and re-taken before the state is looked at again.
+ * Returns the remaining timeout as reported by schedule_timeout() (zero
+ * means it ran out).
+ */
+static long inet_wait_for_connect(struct sock *sk, long timeo)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+	/* Basic assumption: if someone sets sk->sk_err, he _must_
+	 * change state of the socket from TCP_SYN_*.
+	 * Connect() does not allow to get error notifications
+	 * without closing the socket.
+	 */
+	while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		if (signal_pending(current) || !timeo)
+			break;
+		/* Re-arm the wait entry before re-checking the state. */
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return timeo;
+}
+
+/*
+ *	Connect to a remote host. There is regrettably still a little
+ *	TCP 'magic' in here.
+ *
+ *	Drives the socket-level state (sock->state) around the protocol's
+ *	connect() hook, all under the socket lock:
+ *	  - AF_UNSPEC disconnects the socket;
+ *	  - SS_UNCONNECTED starts a connect and moves to SS_CONNECTING;
+ *	  - SS_CONNECTING re-enters the wait below;
+ *	  - SS_CONNECTED fails with -EISCONN; other states with -EINVAL.
+ *
+ *	While the protocol state is SYN_SENT/SYN_RECV the caller sleeps for at
+ *	most the send timeout; O_NONBLOCK in @flags gives a zero timeout, in
+ *	which case -EINPROGRESS (first call) or -EALREADY (later calls) is
+ *	returned at once.
+ *
+ *	Returns 0 once connected, otherwise a negative errno.
+ */
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err;
+	long timeo;
+
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (uaddr->sa_family == AF_UNSPEC) {
+		err = sk->sk_prot->disconnect(sk, flags);
+		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+		goto out;
+	}
+
+	switch (sock->state) {
+	default:
+		err = -EINVAL;
+		goto out;
+	case SS_CONNECTED:
+		err = -EISCONN;
+		goto out;
+	case SS_CONNECTING:
+		err = -EALREADY;
+		/* Fall out of switch with err, set for this state */
+		break;
+	case SS_UNCONNECTED:
+		err = -EISCONN;
+		if (sk->sk_state != TCP_CLOSE)
+			goto out;
+
+		err = sk->sk_prot->connect(sk, uaddr, addr_len);
+		if (err < 0)
+			goto out;
+
+		sock->state = SS_CONNECTING;
+
+		/* Just entered SS_CONNECTING state; the only
+		 * difference is that return value in non-blocking
+		 * case is EINPROGRESS, rather than EALREADY.
+		 */
+		err = -EINPROGRESS;
+		break;
+	}
+
+	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		/* Error code is set above */
+		if (!timeo || !inet_wait_for_connect(sk, timeo))
+			goto out;
+
+		/* NOTE(review): timeo is not updated by the call above, so
+		 * sock_intr_errno() sees the original timeout value here.
+		 */
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			goto out;
+	}
+
+	/* Connection was closed by RST, timeout, ICMP error
+	 * or another process disconnected us.
+	 */
+	if (sk->sk_state == TCP_CLOSE)
+		goto sock_error;
+
+	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
+	 * and error was received after socket entered established state.
+	 * Hence, it is handled normally after connect() return successfully.
+	 */
+
+	sock->state = SS_CONNECTED;
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+
+sock_error:
+	/* Report the pending socket error, or -ECONNABORTED if none. */
+	err = sock_error(sk) ? : -ECONNABORTED;
+	sock->state = SS_UNCONNECTED;
+	if (sk->sk_prot->disconnect(sk, flags))
+		sock->state = SS_DISCONNECTING;
+	goto out;
+}
+EXPORT_SYMBOL(inet_stream_connect);
+
+/*
+ *	Accept a pending connection. The TCP layer now gives BSD semantics.
+ *
+ *	Asks the protocol's accept() hook for a new struct sock, grafts it onto
+ *	@newsock and marks @newsock SS_CONNECTED. Returns 0 on success or the
+ *	error left behind by the accept() hook (-EINVAL if it set none).
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *listener = sock->sk;
+	struct sock *child;
+	int err = -EINVAL;
+
+	child = listener->sk_prot->accept(listener, flags, &err);
+	if (child == NULL)
+		return err;
+
+	lock_sock(child);
+
+	sock_rps_record_flow(child);
+	WARN_ON(!((1 << child->sk_state) &
+		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+
+	sock_graft(child, newsock);
+	newsock->state = SS_CONNECTED;
+
+	release_sock(child);
+	return 0;
+}
+EXPORT_SYMBOL(inet_accept);
+
+
+/*
+ *	This does both peername and sockname.
+ *
+ *	With @peer zero the local address is reported (inet_rcv_saddr, or
+ *	inet_saddr when that is unset) together with inet_sport. With @peer
+ *	non-zero the remote address/port is reported; this fails with -ENOTCONN
+ *	when there is no destination port, or when @peer is exactly 1 and the
+ *	socket is in TCP_CLOSE or TCP_SYN_SENT.
+ *
+ *	On success *@uaddr_len is set to sizeof(struct sockaddr_in) and 0 is
+ *	returned.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+			int *uaddr_len, int peer)
+{
+	struct sock *sk		= sock->sk;
+	struct inet_sock *inet	= inet_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+
+	sin->sin_family = AF_INET;
+
+	if (!peer) {
+		__be32 local = inet->inet_rcv_saddr;
+
+		if (!local)
+			local = inet->inet_saddr;
+		sin->sin_port = inet->inet_sport;
+		sin->sin_addr.s_addr = local;
+	} else {
+		if (!inet->inet_dport)
+			return -ENOTCONN;
+		if (peer == 1 &&
+		    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
+			return -ENOTCONN;
+		sin->sin_port = inet->inet_dport;
+		sin->sin_addr.s_addr = inet->inet_daddr;
+	}
+
+	/* Do not leak stale bytes through the padding. */
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	*uaddr_len = sizeof(*sin);
+	return 0;
+}
+EXPORT_SYMBOL(inet_getname);
+
+/*
+ * sendmsg() entry point shared by the AF_INET proto_ops tables.
+ *
+ * Records the RPS flow, auto-binds a socket that has no local port (unless
+ * the protocol sets no_autobind) and forwards to the protocol's sendmsg().
+ * Returns -EAGAIN when auto-binding fails, else the protocol's result.
+ */
+int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		 size_t size)
+{
+	struct sock *sk = sock->sk;
+
+	sock_rps_record_flow(sk);
+
+	/* We may need to bind the socket. */
+	if (inet_sk(sk)->inet_num == 0 && !sk->sk_prot->no_autobind) {
+		if (inet_autobind(sk))
+			return -EAGAIN;
+	}
+
+	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+}
+EXPORT_SYMBOL(inet_sendmsg);
+
+/*
+ * sendpage() entry point shared by the AF_INET proto_ops tables.
+ *
+ * Same preamble as inet_sendmsg() (RPS flow record, optional auto-bind).
+ * Protocols without a sendpage() hook fall back to sock_no_sendpage().
+ * Returns -EAGAIN when auto-binding fails.
+ */
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+		      size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+
+	sock_rps_record_flow(sk);
+
+	/* We may need to bind the socket. */
+	if (inet_sk(sk)->inet_num == 0 && !sk->sk_prot->no_autobind) {
+		if (inet_autobind(sk))
+			return -EAGAIN;
+	}
+
+	if (!sk->sk_prot->sendpage)
+		return sock_no_sendpage(sock, page, offset, size, flags);
+
+	return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+}
+EXPORT_SYMBOL(inet_sendpage);
+
+/*
+ * recvmsg() entry point shared by the AF_INET proto_ops tables.
+ *
+ * MSG_DONTWAIT is split out of @flags and passed to the protocol's
+ * recvmsg() as a separate non-blocking argument. On success (return value
+ * >= 0) the address length reported by the protocol is stored in
+ * msg->msg_namelen; on failure msg_namelen is left untouched.
+ */
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		 size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int namelen = 0;
+	int rc;
+
+	sock_rps_record_flow(sk);
+
+	rc = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+				  flags & ~MSG_DONTWAIT, &namelen);
+	if (rc < 0)
+		return rc;
+
+	msg->msg_namelen = namelen;
+	return rc;
+}
+EXPORT_SYMBOL(inet_recvmsg);
+
+/*
+ * shutdown() for AF_INET sockets.
+ *
+ * @how arrives as 0, 1 or 2 and is incremented so that it becomes a bit
+ * mask checked against SHUTDOWN_MASK and RCV_SHUTDOWN below. Anything that
+ * does not fit the mask is rejected with -EINVAL before the lock is taken.
+ *
+ * Returns 0, -EINVAL, -ENOTCONN when the socket is in TCP_CLOSE (the
+ * shutdown flags are still recorded in that case), or the result of the
+ * protocol's disconnect() for listening / SYN_SENT sockets.
+ */
+int inet_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	/* This should really check to make sure
+	 * the socket is a TCP socket. (WHY AC...)
+	 */
+	how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+		       1->2 bit 2 snds.
+		       2->3 */
+	if ((how & ~SHUTDOWN_MASK) || !how)	/* MAXINT->0 */
+		return -EINVAL;
+
+	lock_sock(sk);
+	/* Resolve a half-finished connect into a definite socket state. */
+	if (sock->state == SS_CONNECTING) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+			sock->state = SS_DISCONNECTING;
+		else
+			sock->state = SS_CONNECTED;
+	}
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		err = -ENOTCONN;
+		/* Hack to wake up other listeners, who can poll for
+		   POLLHUP, even on eg. unconnected UDP sockets -- RR */
+		/* Deliberate fall through into the default case. */
+	default:
+		sk->sk_shutdown |= how;
+		if (sk->sk_prot->shutdown)
+			sk->sk_prot->shutdown(sk, how);
+		break;
+
+	/* Remaining two branches are temporary solution for missing
+	 * close() in multithreaded environment. It is _not_ a good idea,
+	 * but we have no choice until close() is repaired at VFS level.
+	 */
+	case TCP_LISTEN:
+		if (!(how & RCV_SHUTDOWN))
+			break;
+		/* Fall through */
+	case TCP_SYN_SENT:
+		err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+		break;
+	}
+
+	/* Wake up anyone sleeping in poll. */
+	sk->sk_state_change(sk);
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(inet_shutdown);
+
+/*
+ *	ioctl() calls you can issue on an INET socket. Most of these are
+ *	device configuration and stuff and very rarely used. Some ioctls
+ *	pass on to the socket itself.
+ *
+ *	Timestamp, routing, ARP and interface-address commands are dispatched
+ *	to their subsystem handlers; everything else goes to the protocol's
+ *	ioctl() hook, or yields -ENOIOCTLCMD when the protocol has none.
+ *
+ *	NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ *	loads the devconfigure module does its configuring and unloads it.
+ *	There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+
+	switch (cmd) {
+	case SIOCGSTAMP:
+		return sock_get_timestamp(sk, (struct timeval __user *)arg);
+	case SIOCGSTAMPNS:
+		return sock_get_timestampns(sk, (struct timespec __user *)arg);
+
+	/* Routing table manipulation. */
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCRTMSG:
+		return ip_rt_ioctl(net, cmd, (void __user *)arg);
+
+	/* ARP cache manipulation. */
+	case SIOCDARP:
+	case SIOCGARP:
+	case SIOCSARP:
+		return arp_ioctl(net, cmd, (void __user *)arg);
+
+	/* Interface address and flag configuration. */
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCSIFPFLAGS:
+	case SIOCGIFPFLAGS:
+	case SIOCSIFFLAGS:
+	case SIOCKILLADDR:
+		return devinet_ioctl(net, cmd, (void __user *)arg);
+
+	default:
+		break;
+	}
+
+	/* Anything else is up to the protocol, if it cares. */
+	if (!sk->sk_prot->ioctl)
+		return -ENOIOCTLCMD;
+
+	return sk->sk_prot->ioctl(sk, cmd, arg);
+}
+EXPORT_SYMBOL(inet_ioctl);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: forward to the protocol's compat_ioctl() hook,
+ * or report -ENOIOCTLCMD when the protocol does not provide one.
+ */
+int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk->sk_prot->compat_ioctl)
+		return -ENOIOCTLCMD;
+
+	return sk->sk_prot->compat_ioctl(sk, cmd, arg);
+}
+#endif
+
+/*
+ * Socket-layer operations for stream sockets; inetsw_array[] pairs this
+ * table with SOCK_STREAM / IPPROTO_TCP. Unsupported calls (socketpair,
+ * mmap) are routed to the sock_no_*() stubs.
+ */
+const struct proto_ops inet_stream_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,
+	.bind		   = inet_bind,
+	.connect	   = inet_stream_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = inet_accept,
+	.getname	   = inet_getname,
+	.poll		   = tcp_poll,
+	.ioctl		   = inet_ioctl,
+	.listen		   = inet_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = inet_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = inet_sendpage,
+	.splice_read	   = tcp_splice_read,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl	   = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_stream_ops);
+
+/*
+ * Socket-layer operations for datagram sockets; inetsw_array[] uses this
+ * table for both the UDP and the ICMP (ping) SOCK_DGRAM entries. accept()
+ * and listen() are not supported and map to the sock_no_*() stubs.
+ */
+const struct proto_ops inet_dgram_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,
+	.bind		   = inet_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = inet_getname,
+	.poll		   = udp_poll,
+	.ioctl		   = inet_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = inet_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = inet_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl	   = inet_compat_ioctl,
+#endif
+};
+EXPORT_SYMBOL(inet_dgram_ops);
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll (the generic datagram_poll is used instead). Keep the two
+ * tables in step when changing either one.
+ */
+static const struct proto_ops inet_sockraw_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,
+	.bind		   = inet_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = inet_getname,
+	.poll		   = datagram_poll,
+	.ioctl		   = inet_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = inet_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = inet_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl	   = inet_compat_ioctl,
+#endif
+};
+
+/* Address-family descriptor for PF_INET: socket creation goes to inet_create(). */
+static const struct net_proto_family inet_family_ops = {
+	.family = PF_INET,
+	.create = inet_create,
+	.owner	= THIS_MODULE,
+};
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ *
+ * Each entry ties a (socket type, IP protocol) pair to the struct proto
+ * and proto_ops that implement it; inet_create() copies the fields out of
+ * the matching entry.
+ */
+static struct inet_protosw inetsw_array[] =
+{
+	/* TCP: connection-oriented stream sockets. */
+	{
+		.type =       SOCK_STREAM,
+		.protocol =   IPPROTO_TCP,
+		.prot =       &tcp_prot,
+		.ops =        &inet_stream_ops,
+		.no_check =   0,
+		.flags =      INET_PROTOSW_PERMANENT |
+			      INET_PROTOSW_ICSK,
+	},
+
+	/* UDP: datagram sockets. */
+	{
+		.type =       SOCK_DGRAM,
+		.protocol =   IPPROTO_UDP,
+		.prot =       &udp_prot,
+		.ops =        &inet_dgram_ops,
+		.no_check =   UDP_CSUM_DEFAULT,
+		.flags =      INET_PROTOSW_PERMANENT,
+	},
+
+	/* ICMP over SOCK_DGRAM, served by ping_prot. */
+	{
+		.type =       SOCK_DGRAM,
+		.protocol =   IPPROTO_ICMP,
+		.prot =       &ping_prot,
+		.ops =        &inet_dgram_ops,
+		.no_check =   UDP_CSUM_DEFAULT,
+		.flags =      INET_PROTOSW_REUSE,
+	},
+
+	/* Raw IP: matches any protocol number. */
+	{
+		.type =       SOCK_RAW,
+		.protocol =   IPPROTO_IP,	/* wild card */
+		.prot =       &raw_prot,
+		.ops =        &inet_sockraw_ops,
+		.no_check =   UDP_CSUM_DEFAULT,
+		.flags =      INET_PROTOSW_REUSE,
+	}
+};
+
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
+
+/*
+ * Add @p to the inetsw[] list for its socket type.
+ *
+ * The request is refused (with a KERN_ERR message, nothing else) when the
+ * socket type is out of range or when a permanent entry already exists for
+ * the same protocol. The list is modified under inetsw_lock.
+ */
+void inet_register_protosw(struct inet_protosw *p)
+{
+	struct inet_protosw *cur;
+	struct list_head *insert_after;
+	int protocol = p->protocol;
+
+	spin_lock_bh(&inetsw_lock);
+
+	if (p->type >= SOCK_MAX) {
+		printk(KERN_ERR
+		       "Ignoring attempt to register invalid socket type %d.\n",
+		       p->type);
+		goto unlock;
+	}
+
+	/* If we are trying to override a permanent protocol, bail.
+	 * Only the non-wild match is checked. Along the way remember the
+	 * last permanent entry seen.
+	 */
+	insert_after = &inetsw[p->type];
+	list_for_each_entry(cur, &inetsw[p->type], list) {
+		if (!(INET_PROTOSW_PERMANENT & cur->flags))
+			continue;
+
+		if (protocol == cur->protocol) {
+			printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+			       protocol);
+			goto unlock;
+		}
+		insert_after = &cur->list;
+	}
+
+	/* Add the new entry after the last permanent entry if any, so that
+	 * the new entry does not override a permanent entry when matched with
+	 * a wild-card protocol. But it is allowed to override any existing
+	 * non-permanent entry.  This means that when we remove this entry, the
+	 * system automatically returns to the old behavior.
+	 */
+	list_add_rcu(&p->list, insert_after);
+
+unlock:
+	spin_unlock_bh(&inetsw_lock);
+}
+EXPORT_SYMBOL(inet_register_protosw);
+
+/*
+ * Remove @p from its inetsw[] list. Permanent entries cannot be removed;
+ * the attempt is only logged. After unlinking, synchronize_net() is called
+ * before returning.
+ */
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+	if (INET_PROTOSW_PERMANENT & p->flags) {
+		printk(KERN_ERR
+		       "Attempt to unregister permanent protocol %d.\n",
+		       p->protocol);
+		return;
+	}
+
+	spin_lock_bh(&inetsw_lock);
+	list_del_rcu(&p->list);
+	spin_unlock_bh(&inetsw_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(inet_unregister_protosw);
+
+/*
+ *      Shall we try to damage output packets if routing dev changes?
+ *
+ *      Zero (the default) disables source address reselection in
+ *      inet_sk_rebuild_header(); a value greater than 1 additionally logs
+ *      each address change from inet_sk_reselect_saddr().
+ */
+
+int sysctl_ip_dynaddr __read_mostly;
+
+/*
+ * Pick a new source address for @sk after a routing change.
+ *
+ * Looks up a fresh route to the destination (the first hop of a source
+ * route, if the socket carries an SRR option), installs it on the socket
+ * and, when the route's source address differs from the current one,
+ * rewrites inet_saddr/inet_rcv_saddr and rehashes the socket.
+ *
+ * Returns 0 on success or the error from ip_route_connect().
+ */
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	__be32 old_saddr = inet->inet_saddr;
+	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 new_saddr;
+	struct ip_options_rcu *inet_opt;
+
+	/* The lockdep condition expects the socket to be owned by the caller. */
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+
+	/* Query new route. */
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if, sk->sk_protocol,
+			      inet->inet_sport, inet->inet_dport, sk, false);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	sk_setup_caps(sk, &rt->dst);
+
+	/* The route lookup filled in the source address it selected. */
+	new_saddr = fl4->saddr;
+
+	if (new_saddr == old_saddr)
+		return 0;
+
+	if (sysctl_ip_dynaddr > 1) {
+		printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
+		       __func__, &old_saddr, &new_saddr);
+	}
+
+	inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
+
+	/*
+	 * XXX The only one ugly spot where we need to
+	 * XXX really change the sockets identity after
+	 * XXX it has entered the hashes. -DaveM
+	 *
+	 * Besides that, it does not check for connection
+	 * uniqueness. Wait for troubles.
+	 */
+	__sk_prot_rehash(sk);
+	return 0;
+}
+
+/*
+ * Make sure @sk has a usable route, re-routing if the cached one is gone.
+ *
+ * Returns 0 when the cached route is still valid or a new one was
+ * installed. When routing fails, sk_route_caps is cleared and, if
+ * sysctl_ip_dynaddr is set, the socket is in TCP_SYN_SENT and its source
+ * address was not fixed by the user, a new source address is tried via
+ * inet_sk_reselect_saddr(). If that succeeds 0 is returned; in every other
+ * failure case the error is recorded in sk_err_soft and returned.
+ */
+int inet_sk_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+	__be32 daddr;
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
+	int err;
+
+	/* Route is OK, nothing to do. */
+	if (rt)
+		return 0;
+
+	/* Reroute. */
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	daddr = inet->inet_daddr;
+	/* With a source route option, route towards its first hop. */
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	rcu_read_unlock();
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
+		sk_setup_caps(sk, &rt->dst);
+	} else {
+		err = PTR_ERR(rt);
+
+		/* Routing failed... */
+		sk->sk_route_caps = 0;
+		/*
+		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+		 */
+		/* The last operand overwrites err with the reselect result;
+		 * it is only evaluated when the three checks before it are
+		 * all false.
+		 */
+		if (!sysctl_ip_dynaddr ||
+		    sk->sk_state != TCP_SYN_SENT ||
+		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+		    (err = inet_sk_reselect_saddr(sk)) != 0)
+			sk->sk_err_soft = -err;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
+/*
+ * GSO pre-send check for IPv4.
+ *
+ * Validates and pulls the IP header, points the transport header at what
+ * follows, then delegates to the gso_send_check() hook of the protocol
+ * found in inet_protos[].
+ *
+ * Returns -EINVAL for a truncated or malformed IP header,
+ * -EPROTONOSUPPORT when the protocol has no hook, else the hook's result.
+ */
+static int inet_gso_send_check(struct sk_buff *skb)
+{
+	const struct net_protocol *handler;
+	const struct iphdr *iph;
+	int hdr_len;
+	int proto;
+	int err;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+	hdr_len = iph->ihl * 4;
+	if (hdr_len < sizeof(*iph))
+		return -EINVAL;
+
+	if (unlikely(!pskb_may_pull(skb, hdr_len)))
+		return -EINVAL;
+
+	__skb_pull(skb, hdr_len);
+	skb_reset_transport_header(skb);
+
+	/* Fetch the header pointer again after the pulls above. */
+	iph = ip_hdr(skb);
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
+	err = -EPROTONOSUPPORT;
+	rcu_read_lock();
+	handler = rcu_dereference(inet_protos[proto]);
+	if (likely(handler && handler->gso_send_check))
+		err = handler->gso_send_check(skb);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * Segment a GSO packet at the IPv4 layer.
+ *
+ * After validating and pulling the IP header, the transport protocol's
+ * gso_segment() hook splits the payload; this function then rewrites the
+ * IP header of every resulting segment (id, fragment fields for UDP,
+ * tot_len, checksum).
+ *
+ * Returns the segment list from the hook (which may be NULL), or an
+ * ERR_PTR: -EINVAL for unsupported gso_type bits or a bad IP header,
+ * -EPROTONOSUPPORT when the protocol has no gso_segment() hook.
+ */
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct iphdr *iph;
+	const struct net_protocol *ops;
+	int proto;
+	int ihl;
+	int id;
+	unsigned int offset = 0;
+
+	/* Scatter-gather is dropped when NETIF_F_V4_CSUM is not offered. */
+	if (!(features & NETIF_F_V4_CSUM))
+		features &= ~NETIF_F_SG;
+
+	/* Refuse GSO types this layer does not know about. */
+	if (unlikely(skb_shinfo(skb)->gso_type &
+		     ~(SKB_GSO_TCPV4 |
+		       SKB_GSO_UDP |
+		       SKB_GSO_DODGY |
+		       SKB_GSO_TCP_ECN |
+		       0)))
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		goto out;
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, ihl)))
+		goto out;
+
+	__skb_pull(skb, ihl);
+	skb_reset_transport_header(skb);
+	iph = ip_hdr(skb);
+	id = ntohs(iph->id);
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (likely(ops && ops->gso_segment))
+		segs = ops->gso_segment(skb, features);
+	rcu_read_unlock();
+
+	if (!segs || IS_ERR(segs))
+		goto out;
+
+	skb = segs;
+	do {
+		iph = ip_hdr(skb);
+		if (proto == IPPROTO_UDP) {
+			/* UDP segments become IP fragments: same id, running
+			 * offset in 8-byte units, IP_MF on all but the last.
+			 */
+			iph->id = htons(id);
+			iph->frag_off = htons(offset >> 3);
+			if (skb->next != NULL)
+				iph->frag_off |= htons(IP_MF);
+			offset += (skb->len - skb->mac_len - iph->ihl * 4);
+		} else
+			/* Other protocols get consecutive ids. */
+			iph->id = htons(id++);
+		iph->tot_len = htons(skb->len - skb->mac_len);
+		iph->check = 0;
+		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+	} while ((skb = skb->next));
+
+out:
+	return segs;
+}
+
+/*
+ * GRO receive hook for IPv4.
+ *
+ * Checks that @skb carries a plain IPv4 header (first byte 0x45, valid
+ * checksum), compares it against the packets already held on @head to
+ * decide which of them still belong to the same flow and which must be
+ * flushed, then hands over to the transport protocol's gro_receive().
+ *
+ * Returns whatever the transport hook returns, or NULL when the packet is
+ * not eligible; in every case the computed flush value is OR-ed into
+ * NAPI_GRO_CB(skb)->flush (it starts out as 1, so early exits mark the
+ * packet for flushing).
+ */
+static struct sk_buff **inet_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	const struct net_protocol *ops;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	const struct iphdr *iph;
+	unsigned int hlen;
+	unsigned int off;
+	unsigned int id;
+	int flush = 1;
+	int proto;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+	iph = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		iph = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!iph))
+			goto out;
+	}
+
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (!ops || !ops->gro_receive)
+		goto out_unlock;
+
+	/* Only version 4 with a 20-byte header (no options) is handled. */
+	if (*(u8 *)iph != 0x45)
+		goto out_unlock;
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto out_unlock;
+
+	/* 'id' temporarily holds the 32-bit word starting at iph->id, i.e.
+	 * id in the upper and frag_off in the lower 16 bits. After the
+	 * (u16) truncation flush is non-zero when tot_len differs from the
+	 * GRO length or frag_off is anything other than exactly IP_DF.
+	 */
+	id = ntohl(*(__be32 *)&iph->id);
+	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+	id >>= 16;
+
+	for (p = *head; p; p = p->next) {
+		struct iphdr *iph2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		iph2 = ip_hdr(p);
+
+		/* Different protocol, TOS or addresses: not our flow. */
+		if ((iph->protocol ^ iph2->protocol) |
+		    (iph->tos ^ iph2->tos) |
+		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* All fields must match except length and checksum. */
+		/* The id must be the held packet's id plus its merge count. */
+		NAPI_GRO_CB(p)->flush |=
+			(iph->ttl ^ iph2->ttl) |
+			((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+
+		NAPI_GRO_CB(p)->flush |= flush;
+	}
+
+	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_pull(skb, sizeof(*iph));
+	skb_set_transport_header(skb, skb_gro_offset(skb));
+
+	pp = ops->gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+/*
+ * GRO completion hook for IPv4.
+ *
+ * Patches tot_len (and the header checksum, incrementally) to cover the
+ * merged packet, then calls the transport protocol's gro_complete().
+ * Returns the hook's result, or -ENOSYS (with a warning) when the protocol
+ * has no such hook.
+ */
+static int inet_gro_complete(struct sk_buff *skb)
+{
+	const struct net_protocol *handler;
+	struct iphdr *iph = ip_hdr(skb);
+	int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	__be16 merged_len = htons(skb->len - skb_network_offset(skb));
+	int err = -ENOSYS;
+
+	csum_replace2(&iph->check, iph->tot_len, merged_len);
+	iph->tot_len = merged_len;
+
+	rcu_read_lock();
+	handler = rcu_dereference(inet_protos[proto]);
+	if (!WARN_ON(!handler || !handler->gro_complete))
+		err = handler->gro_complete(skb);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * Create a kernel-internal control socket and attach it to @net.
+ *
+ * On success *@sk points at the new struct sock, which uses GFP_ATOMIC
+ * allocations and has been unhashed. On failure the error from
+ * sock_create_kern() is returned and *@sk is not written.
+ */
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+			 unsigned short type, unsigned char protocol,
+			 struct net *net)
+{
+	struct socket *sock;
+	struct sock *ctl;
+	int rc;
+
+	rc = sock_create_kern(family, type, protocol, &sock);
+	if (rc != 0)
+		return rc;
+
+	ctl = sock->sk;
+	*sk = ctl;
+	ctl->sk_allocation = GFP_ATOMIC;
+
+	/*
+	 * Unhash it so that IP input processing does not even see it,
+	 * we do not wish this socket to see incoming packets.
+	 */
+	ctl->sk_prot->unhash(ctl);
+
+	sk_change_net(ctl, net);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+/*
+ * Sum one counter over all possible CPUs and over both per-cpu copies
+ * (mib[0] and mib[1]). @offt is the counter's index in units of
+ * unsigned long.
+ */
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const unsigned long *first =
+			(unsigned long *) per_cpu_ptr(mib[0], cpu);
+		const unsigned long *second =
+			(unsigned long *) per_cpu_ptr(mib[1], cpu);
+
+		total += first[offt];
+		total += second[offt];
+	}
+	return total;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+#if BITS_PER_LONG==32
+
+/*
+ * 64-bit variant of snmp_fold_field(), only built when BITS_PER_LONG is 32.
+ *
+ * @offt is the counter's index in units of u64; @syncp_offset is the byte
+ * offset of the struct u64_stats_sync inside each per-cpu block. Every read
+ * is wrapped in a u64_stats fetch_begin/fetch_retry loop and repeated until
+ * the retry check passes.
+ */
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+	u64 res = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *bhptr, *userptr;
+		struct u64_stats_sync *syncp;
+		u64 v_bh, v_user;
+		unsigned int start;
+
+		/* first mib used by softirq context, we must use _bh() accessors */
+		bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin_bh(syncp);
+			v_bh = *(((u64 *) bhptr) + offt);
+		} while (u64_stats_fetch_retry_bh(syncp, start));
+
+		/* second mib used in USER context */
+		userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin(syncp);
+			v_user = *(((u64 *) userptr) + offt);
+		} while (u64_stats_fetch_retry(syncp, start));
+
+		res += v_bh + v_user;
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+/*
+ * Allocate the two per-cpu areas backing a MIB.
+ *
+ * Returns 0 on success. On -ENOMEM nothing stays allocated: if the second
+ * allocation fails the first one is freed and ptr[0] reset to NULL.
+ */
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
+{
+	BUG_ON(ptr == NULL);
+
+	ptr[0] = __alloc_percpu(mibsize, align);
+	if (ptr[0] == NULL)
+		return -ENOMEM;
+
+	ptr[1] = __alloc_percpu(mibsize, align);
+	if (ptr[1] == NULL) {
+		free_percpu(ptr[0]);
+		ptr[0] = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(snmp_mib_init);
+
+/*
+ * Release both per-cpu areas of a MIB and clear the pointers so that a
+ * stale reference cannot be used afterwards.
+ */
+void snmp_mib_free(void __percpu *ptr[2])
+{
+	BUG_ON(ptr == NULL);
+
+	free_percpu(ptr[0]);
+	free_percpu(ptr[1]);
+
+	ptr[1] = NULL;
+	ptr[0] = NULL;
+}
+EXPORT_SYMBOL_GPL(snmp_mib_free);
+
+#ifdef CONFIG_IP_MULTICAST
+/* IGMP input handler; only built with multicast support. Usable in all
+ * network namespaces (netns_ok).
+ */
+static const struct net_protocol igmp_protocol = {
+	.handler =	igmp_rcv,
+	.netns_ok =	1,
+};
+#endif
+
+/*
+ * TCP receive, error and GSO/GRO hooks; inet_init() adds this table for
+ * IPPROTO_TCP.
+ */
+static const struct net_protocol tcp_protocol = {
+	.handler =	tcp_v4_rcv,
+	.err_handler =	tcp_v4_err,
+	.gso_send_check = tcp_v4_gso_send_check,
+	.gso_segment =	tcp_tso_segment,
+	.gro_receive =	tcp4_gro_receive,
+	.gro_complete =	tcp4_gro_complete,
+	.no_policy =	1,
+	.netns_ok =	1,
+};
+
+/*
+ * UDP receive, error and UFO segmentation hooks; inet_init() adds this
+ * table for IPPROTO_UDP.
+ */
+static const struct net_protocol udp_protocol = {
+	.handler =	udp_rcv,
+	.err_handler =	udp_err,
+	.gso_send_check = udp4_ufo_send_check,
+	.gso_segment = udp4_ufo_fragment,
+	.no_policy =	1,
+	.netns_ok =	1,
+};
+
+/*
+ * ICMP receive hook; inet_init() adds this table for IPPROTO_ICMP.
+ * Note that the error handler installed here is ping_err.
+ */
+static const struct net_protocol icmp_protocol = {
+	.handler =	icmp_rcv,
+	.err_handler =	ping_err,
+	.no_policy =	1,
+	.netns_ok =	1,
+};
+
+/*
+ * Per-namespace init: allocate the per-cpu SNMP MIBs held in net->mib
+ * (tcp, ip, net/linux, udp, udplite, icmp, icmpmsg, in that order) and
+ * then call tcp_mib_init().
+ *
+ * Returns 0 on success.  If any allocation fails, every MIB allocated so
+ * far is released in reverse order and -ENOMEM is returned.
+ */
+static __net_init int ipv4_mib_init_net(struct net *net)
+{
+	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
+			  sizeof(struct tcp_mib),
+			  __alignof__(struct tcp_mib)) < 0)
+		goto err_tcp_mib;
+	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
+		goto err_ip_mib;
+	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
+			  sizeof(struct linux_mib),
+			  __alignof__(struct linux_mib)) < 0)
+		goto err_net_mib;
+	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
+		goto err_udp_mib;
+	/* UDP-Lite uses the same struct udp_mib layout as UDP. */
+	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
+		goto err_udplite_mib;
+	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
+			  sizeof(struct icmp_mib),
+			  __alignof__(struct icmp_mib)) < 0)
+		goto err_icmp_mib;
+	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
+			  sizeof(struct icmpmsg_mib),
+			  __alignof__(struct icmpmsg_mib)) < 0)
+		goto err_icmpmsg_mib;
+
+	tcp_mib_init(net);
+	return 0;
+
+	/*
+	 * Each label is named after the allocation that failed and frees
+	 * the MIB that was set up immediately before it.
+	 */
+err_icmpmsg_mib:
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+err_icmp_mib:
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+err_udplite_mib:
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+err_udp_mib:
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+err_net_mib:
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+err_ip_mib:
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+err_tcp_mib:
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace exit: release every MIB allocated by ipv4_mib_init_net(),
+ * in the reverse of the allocation order.
+ */
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+}
+
+/* Per-namespace hooks that set up and tear down the IPv4 SNMP MIBs. */
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+	.init = ipv4_mib_init_net,
+	.exit = ipv4_mib_exit_net,
+};
+
+/*
+ * Register the per-namespace MIB operations; returns whatever
+ * register_pernet_subsys() reports.
+ */
+static int __init init_ipv4_mibs(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&ipv4_mib_ops);
+	return err;
+}
+
+static int ipv4_proc_init(void);
+
+/*
+ *	IP protocol layer initialiser
+ */
+
+/*
+ * Packet handler for ETH_P_IP frames: receive entry point plus the
+ * GSO/GRO callbacks.  inet_init() installs it with dev_add_pack() as its
+ * last step.
+ */
+static struct packet_type ip_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_IP),
+	.func = ip_rcv,
+	.gso_send_check = inet_gso_send_check,
+	.gso_segment = inet_gso_segment,
+	.gro_receive = inet_gro_receive,
+	.gro_complete = inet_gro_complete,
+};
+
+/*
+ * Boot-time initialiser for the IPv4 stack (hooked up with fs_initcall()).
+ *
+ * Registers the TCP, UDP, raw and ping protos, the PF_INET socket family,
+ * the base transport protocol handlers and the inetsw table, then brings
+ * up ARP, IP, TCP, UDP, UDP-Lite, ping, ICMP, (optionally) multicast
+ * routing, the per-cpu MIBs, the /proc entries, fragment handling and
+ * finally the ETH_P_IP packet handler.
+ *
+ * Only the reserved-port allocation and the proto_register() calls are
+ * treated as fatal and unwound.  Later failures are merely logged (or,
+ * for the ICMP control socket, panic) and initialisation carries on.
+ *
+ * Returns 0 on success, -ENOMEM if the reserved-port bitmap cannot be
+ * allocated, or the error reported by proto_register().
+ */
+static int __init inet_init(void)
+{
+	struct sk_buff *dummy_skb;
+	struct inet_protosw *q;
+	struct list_head *r;
+	int rc;
+
+	/* dummy_skb is never assigned; it only names the type for sizeof. */
+	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+
+	/* One bit per port number: 65536 / 8 bytes. */
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = proto_register(&tcp_prot, 1);
+	if (rc)
+		goto out_free_reserved_ports;
+
+	rc = proto_register(&udp_prot, 1);
+	if (rc)
+		goto out_unregister_tcp_proto;
+
+	rc = proto_register(&raw_prot, 1);
+	if (rc)
+		goto out_unregister_udp_proto;
+
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+
+	/*
+	 *	Tell SOCKET that we are alive...
+	 */
+
+	(void)sock_register(&inet_family_ops);
+
+#ifdef CONFIG_SYSCTL
+	ip_static_sysctl_init();
+#endif
+
+	/*
+	 *	Add all the base protocols.
+	 */
+
+	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+#ifdef CONFIG_IP_MULTICAST
+	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+#endif
+
+	/* Register the socket-side information for inet_create. */
+	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+		inet_register_protosw(q);
+
+	/*
+	 *	Set the ARP module up
+	 */
+
+	arp_init();
+
+	/*
+	 *	Set the IP module up
+	 */
+
+	ip_init();
+
+	tcp_v4_init();
+
+	/* Setup TCP slab cache for open requests. */
+	tcp_init();
+
+	/* Setup UDP memory threshold */
+	udp_init();
+
+	/* Add UDP-Lite (RFC 3828) */
+	udplite4_register();
+
+	ping_init();
+
+	/*
+	 *	Set the ICMP layer up
+	 */
+
+	if (icmp_init() < 0)
+		panic("Failed to create the ICMP control socket.\n");
+
+	/*
+	 *	Initialise the multicast router
+	 */
+#if defined(CONFIG_IP_MROUTE)
+	if (ip_mr_init())
+		printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
+#endif
+	/*
+	 *	Initialise per-cpu ipv4 mibs
+	 */
+
+	if (init_ipv4_mibs())
+		printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
+
+	ipv4_proc_init();
+
+	ipfrag_init();
+
+	dev_add_pack(&ip_packet_type);
+
+	rc = 0;
+out:
+	return rc;
+
+	/* Error unwinding: undo the registrations in reverse order. */
+out_unregister_raw_proto:
+	proto_unregister(&raw_prot);
+out_unregister_udp_proto:
+	proto_unregister(&udp_prot);
+out_unregister_tcp_proto:
+	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
+	goto out;
+}
+
+fs_initcall(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Create the raw, tcp4, udp4, ping and misc /proc entries, in that order.
+ *
+ * Returns 0 on success.  If any step fails, the entries created so far
+ * are removed again in reverse order and -ENOMEM is returned.
+ */
+static int __init ipv4_proc_init(void)
+{
+	if (raw_proc_init())
+		goto fail_raw;
+	if (tcp4_proc_init())
+		goto fail_tcp;
+	if (udp4_proc_init())
+		goto fail_udp;
+	if (ping_proc_init())
+		goto fail_ping;
+	if (ip_misc_proc_init())
+		goto fail_misc;
+
+	return 0;
+
+fail_misc:
+	ping_proc_exit();
+fail_ping:
+	udp4_proc_exit();
+fail_udp:
+	tcp4_proc_exit();
+fail_tcp:
+	raw_proc_exit();
+fail_raw:
+	return -ENOMEM;
+}
+
+#else /* CONFIG_PROC_FS */
+/* Without CONFIG_PROC_FS there is nothing to create; always succeeds. */
+static int __init ipv4_proc_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 00000000..36d14406
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,535 @@
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/scatterlist.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+
+/*
+ * AH view of the skb control buffer: the generic xfrm part followed by a
+ * pointer to the scratch buffer obtained from ah_alloc_tmp().  The pointer
+ * is stored before the hash is started so that the asynchronous completion
+ * callbacks can find and free the buffer.
+ */
+struct ah_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+
+/* Access skb->cb as a struct ah_skb_cb. */
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate one scratch buffer holding everything a single hash operation
+ * needs.  The length is computed for the following consecutive regions:
+ *
+ *   - @size bytes of caller data (saved header fields / received ICV),
+ *   - room for a digest of crypto_ahash_digestsize() bytes plus
+ *     alignment slack,
+ *   - a struct ahash_request followed by crypto_ahash_reqsize() bytes,
+ *   - @nfrags struct scatterlist entries.
+ *
+ * ah_tmp_icv(), ah_tmp_req() and ah_req_sg() carve these regions back out
+ * of the buffer.
+ *
+ * The buffer comes from kmalloc(GFP_ATOMIC); callers check the result for
+ * NULL and release it with kfree().
+ */
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+			  unsigned int size)
+{
+	unsigned int len;
+
+	/* Caller data, digest, and slack for aligning the digest. */
+	len = size + crypto_ahash_digestsize(ahash) +
+	      (crypto_ahash_alignmask(ahash) &
+	       ~(crypto_tfm_ctx_alignment() - 1));
+
+	len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+	/* Request structure plus the transform's private request context. */
+	len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	/* Scatterlist describing the packet data. */
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+
+/* Return the byte address @offset bytes into the scratch buffer @tmp. */
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+	u8 *base = tmp;
+
+	return base + offset;
+}
+
+/*
+ * Return the ICV slot of the scratch buffer: @tmp + @offset rounded up to
+ * the alignment the hash transform asks for (alignmask + 1).
+ */
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+			     unsigned int offset)
+{
+	u8 *unaligned = (u8 *)tmp + offset;
+
+	return PTR_ALIGN(unaligned, crypto_ahash_alignmask(ahash) + 1);
+}
+
+/*
+ * Return the ahash request that lives behind the ICV slot in the scratch
+ * buffer, already bound to @ahash.
+ */
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+					       u8 *icv)
+{
+	u8 *after_icv = icv + crypto_ahash_digestsize(ahash);
+	struct ahash_request *req;
+
+	req = (void *)PTR_ALIGN(after_icv, crypto_tfm_ctx_alignment());
+	ahash_request_set_tfm(req, ahash);
+
+	return req;
+}
+
+/*
+ * Return the scatterlist array that follows the request (and its private
+ * context of crypto_ahash_reqsize() bytes) in the scratch buffer.
+ */
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+					     struct ahash_request *req)
+{
+	unsigned long end_of_req = (unsigned long)(req + 1) +
+				   crypto_ahash_reqsize(ahash);
+
+	return (void *)ALIGN(end_of_req, __alignof__(struct scatterlist));
+}
+
+/* Clear mutable options and find final destination to substitute
+ * into IP header for icv calculation. Options are already checked
+ * for validity, so paranoia is not required. */
+
+/*
+ * @iph:   IP header whose options (the bytes after the fixed header) are
+ *         walked; note that options are zeroed in place even though the
+ *         parameter is const-qualified
+ * @daddr: receives the last four bytes of an LSRR/SSRR option, if one is
+ *         present
+ *
+ * Returns 0 on success, -EINVAL if an option length is out of range.
+ */
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
+{
+	unsigned char * optptr = (unsigned char*)(iph+1);
+	int  l = iph->ihl*4 - sizeof(struct iphdr);
+	int  optlen;
+
+	while (l > 0) {
+		/* Single-byte options carry no length field. */
+		switch (*optptr) {
+		case IPOPT_END:
+			return 0;
+		case IPOPT_NOOP:
+			l--;
+			optptr++;
+			continue;
+		}
+		optlen = optptr[1];
+		if (optlen<2 || optlen>l)
+			return -EINVAL;
+		switch (*optptr) {
+		/* These options are left untouched. */
+		case IPOPT_SEC:
+		case 0x85:	/* Some "Extended Security" crap. */
+		case IPOPT_CIPSO:
+		case IPOPT_RA:
+		case 0x80|21:	/* RFC1770 */
+			break;
+		case IPOPT_LSRR:
+		case IPOPT_SSRR:
+			if (optlen < 6)
+				return -EINVAL;
+			/* The final entry of the route is handed back in *daddr. */
+			memcpy(daddr, optptr+optlen-4, 4);
+			/* Fall through */
+		default:
+			/* Everything else is zeroed, type and length included. */
+			memset(optptr, 0, optlen);
+		}
+		l -= optlen;
+		optptr += optlen;
+	}
+	return 0;
+}
+
+/*
+ * Completion callback for a hash started asynchronously by ah_output().
+ *
+ * Copies the computed ICV into the AH header, restores the IP header
+ * fields that ah_output() saved in the scratch buffer (tos, ttl, frag_off
+ * and, when options are present, daddr and the options), frees the
+ * scratch buffer and resumes xfrm output with @err.
+ */
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+	u8 *icv;
+	struct iphdr *iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	struct ah_data *ahp = x->data;
+	struct iphdr *top_iph = ip_hdr(skb);
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+
+	/* The scratch buffer starts with the saved copy of the IP header. */
+	iph = AH_SKB_CB(skb)->tmp;
+	icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_output_resume(skb, err);
+}
+
+/*
+ * xfrm output hook: fill in the AH header and compute its ICV.
+ *
+ * The mutable IP header fields (tos, ttl, frag_off, plus daddr and the
+ * options when the header is longer than 20 bytes) are saved in a scratch
+ * buffer and zeroed or cleared in the packet, the digest is computed over
+ * the whole packet, and the saved fields are restored afterwards.
+ *
+ * Returns 0 when the ICV was computed synchronously, -EINPROGRESS when the
+ * hash completes later through ah_output_done() (which then owns the
+ * scratch buffer), NET_XMIT_DROP when the hash returned -EBUSY, or a
+ * negative error code.
+ */
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	int nfrags;
+	int ihl;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct iphdr *iph, *top_iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	/* A non-negative result is used as the scatterlist entry count. */
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
+
+	skb_push(skb, -skb_network_offset(skb));
+	ah = ip_auth_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	err = -ENOMEM;
+	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	if (!iph)
+		goto out;
+
+	icv = ah_tmp_icv(ahash, iph, ihl);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	/* The ICV field itself is hashed as zeroes. */
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	top_iph = ip_hdr(skb);
+
+	/* Save the mutable fields so they can be restored after hashing. */
+	iph->tos = top_iph->tos;
+	iph->ttl = top_iph->ttl;
+	iph->frag_off = top_iph->frag_off;
+
+	if (top_iph->ihl != 5) {
+		iph->daddr = top_iph->daddr;
+		memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+		err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+		if (err)
+			goto out_free;
+	}
+
+	/*
+	 * NOTE(review): mac_header is used here as a pointer to the byte
+	 * holding the next-protocol value; presumably the xfrm mode code
+	 * set it up that way -- confirm against the caller.
+	 */
+	ah->nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_AH;
+
+	top_iph->tos = 0;
+	top_iph->tot_len = htons(skb->len);
+	top_iph->frag_off = 0;
+	top_iph->ttl = 0;
+	top_iph->check = 0;
+
+	/* hdrlen is the padded AH length in 32-bit words, minus 2. */
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+	ah->reserved = 0;
+	ah->spi = x->id.spi;
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+	/* Must be stored before the digest starts: the callback reads it. */
+	AH_SKB_CB(skb)->tmp = iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		/* Async completion: ah_output_done() finishes and frees iph. */
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
+	}
+
+	/* Synchronous completion: same work as ah_output_done(). */
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+out_free:
+	kfree(iph);
+out:
+	return err;
+}
+
+/*
+ * Completion callback for a hash started asynchronously by ah_input().
+ *
+ * @base: the async request; base->data is the skb being processed
+ * @err:  completion status reported by the crypto layer
+ *
+ * If the digest itself failed, that error is passed on unchanged.
+ * Otherwise the computed ICV is compared with the one received in the
+ * packet; on a match the AH header is removed and the saved IP header is
+ * put back in front of the payload, exactly as the synchronous path of
+ * ah_input() does.  The scratch buffer is freed in every case and xfrm
+ * input is resumed with the next-header value on success or a negative
+ * error code on failure.
+ */
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+	u8 *auth_data;
+	u8 *icv;
+	struct iphdr *work_iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ah_data *ahp = x->data;
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+	int ah_hlen = (ah->hdrlen + 2) << 2;
+
+	/*
+	 * A failed digest leaves the ICV buffer unspecified, so do not
+	 * compare against it; report the crypto layer's error instead of
+	 * replacing it with the outcome of the comparison.
+	 */
+	if (err)
+		goto out;
+
+	work_iph = AH_SKB_CB(skb)->tmp;
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out;
+
+	/* Must be read before the memcpy() below overwrites the AH header. */
+	err = ah->nexthdr;
+
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
+out:
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_input_resume(skb, err);
+}
+
+/*
+ * xfrm input hook: verify the AH ICV and strip the AH header.
+ *
+ * The received ICV and the original IP header are saved in a scratch
+ * buffer, the mutable header fields and the ICV field are zeroed in the
+ * packet, and the digest is computed over IP header + AH + payload.
+ *
+ * Returns the next-header value on success, -EINPROGRESS when the hash
+ * completes later through ah_input_done() (which then owns the scratch
+ * buffer), -EBADMSG on an ICV mismatch, or another negative error code.
+ * Note that the early length and clone checks fail with -ENOMEM, the
+ * initial value of err.
+ */
+static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int ah_hlen;
+	int ihl;
+	int nexthdr;
+	int nfrags;
+	u8 *auth_data;
+	u8 *icv;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct iphdr *iph, *work_iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+	int err = -ENOMEM;
+
+	if (!pskb_may_pull(skb, sizeof(*ah)))
+		goto out;
+
+	ah = (struct ip_auth_hdr *)skb->data;
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	nexthdr = ah->nexthdr;
+	/* hdrlen counts 32-bit words minus 2; convert to bytes. */
+	ah_hlen = (ah->hdrlen + 2) << 2;
+
+	/* Accept a header sized for either the full or the truncated ICV. */
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
+
+	if (!pskb_may_pull(skb, ah_hlen))
+		goto out;
+
+	/* We are going to _remove_ AH header to keep sockets happy,
+	 * so... Later this can change. */
+	if (skb_cloned(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		goto out;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+
+	/* A non-negative result is used as the scatterlist entry count. */
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
+
+	/* Re-read the pointers after the calls above. */
+	ah = (struct ip_auth_hdr *)skb->data;
+	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	/* NOTE(review): err still holds nfrags here, so an allocation
+	 * failure returns that count rather than -ENOMEM -- confirm
+	 * whether callers can tell it apart from a next-header value. */
+	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (!work_iph)
+		goto out;
+
+	/* Scratch layout: saved IP header, received ICV, computed ICV, ... */
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	memcpy(work_iph, iph, ihl);
+	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	/* Mutable fields are hashed as zero. */
+	iph->ttl = 0;
+	iph->tos = 0;
+	iph->frag_off = 0;
+	iph->check = 0;
+	if (ihl > sizeof(*iph)) {
+		__be32 dummy;
+		err = ip_clear_mutable_options(iph, &dummy);
+		if (err)
+			goto out_free;
+	}
+
+	/* Bring the IP header back into the data area so it is hashed too. */
+	skb_push(skb, ihl);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+	/* Must be stored before the digest starts: the callback reads it. */
+	AH_SKB_CB(skb)->tmp = work_iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		/* Async completion: ah_input_done() finishes and frees work_iph. */
+		if (err == -EINPROGRESS)
+			goto out;
+
+		goto out_free;
+	}
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out_free;
+
+	/* Drop the AH header and put the original IP header before the payload. */
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
+
+	err = nexthdr;
+
+out_free:
+	kfree (work_iph);
+out:
+	return err;
+}
+
+/*
+ * ICMP error hook for AH.  Only "destination unreachable / fragmentation
+ * needed" is of interest: if an SA matching the offending packet's
+ * destination and SPI exists, a debug line is logged and the reference
+ * taken by the lookup is dropped again.
+ */
+static void ah4_err(struct sk_buff *skb, u32 info)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+	struct xfrm_state *sa;
+
+	if (!(icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
+	      icmp_hdr(skb)->code == ICMP_FRAG_NEEDED))
+		return;
+
+	sa = xfrm_state_lookup(net, skb->mark,
+			       (const xfrm_address_t *)&iph->daddr,
+			       ah->spi, IPPROTO_AH, AF_INET);
+	if (sa == NULL)
+		return;
+
+	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
+	       ntohl(ah->spi), ntohl(iph->daddr));
+	xfrm_state_put(sa);
+}
+
+/*
+ * xfrm init_state hook: set up the per-SA AH data.
+ *
+ * Requires an authentication algorithm and no encapsulation.  Allocates
+ * the ahash transform, sets its key, checks the digest size against the
+ * xfrm algorithm description, records the full and truncated ICV lengths
+ * and computes x->props.header_len.
+ *
+ * Returns 0 on success, -ENOMEM if the ah_data allocation fails, and
+ * -EINVAL for every other failure.
+ */
+static int ah_init_state(struct xfrm_state *x)
+{
+	struct ah_data *ahp = NULL;
+	struct xfrm_algo_desc *aalg_desc;
+	struct crypto_ahash *ahash;
+
+	if (!x->aalg)
+		goto error;
+
+	if (x->encap)
+		goto error;
+
+	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+	if (!ahp)
+		return -ENOMEM;
+
+	/* On failure ahp->ahash is still NULL from kzalloc(). */
+	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+	if (IS_ERR(ahash))
+		goto error;
+
+	ahp->ahash = ahash;
+	/* alg_key_len is in bits; round up to whole bytes. */
+	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+				(x->aalg->alg_key_len + 7) / 8))
+		goto error;
+
+	/*
+	 * Lookup the algorithm description maintained by xfrm_algo,
+	 * verify crypto transform properties, and store information
+	 * we need for AH processing.  This lookup cannot fail here
+	 * after a successful crypto_alloc_ahash().
+	 */
+	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+	BUG_ON(!aalg_desc);
+
+	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+	    crypto_ahash_digestsize(ahash)) {
+		printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
+		       x->aalg->alg_name, crypto_ahash_digestsize(ahash),
+		       aalg_desc->uinfo.auth.icv_fullbits/8);
+		goto error;
+	}
+
+	/* Both lengths are kept in bytes. */
+	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
+
+	/* Header length: AH header plus truncated ICV, padded to 4 or 8. */
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	/* Tunnel mode adds an outer IP header. */
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		x->props.header_len += sizeof(struct iphdr);
+	x->data = ahp;
+
+	return 0;
+
+error:
+	/* ahp is NULL when the checks before the allocation failed. */
+	if (ahp) {
+		crypto_free_ahash(ahp->ahash);
+		kfree(ahp);
+	}
+	return -EINVAL;
+}
+
+/*
+ * xfrm destructor hook: release the hash transform and the per-SA AH
+ * data, if ah_init_state() got far enough to attach any.
+ */
+static void ah_destroy(struct xfrm_state *x)
+{
+	struct ah_data *ahp = x->data;
+
+	if (ahp != NULL) {
+		crypto_free_ahash(ahp->ahash);
+		kfree(ahp);
+	}
+}
+
+
+/*
+ * xfrm type descriptor for IPv4 AH; registered for AF_INET by ah4_init()
+ * and unregistered by ah4_fini().
+ */
+static const struct xfrm_type ah_type =
+{
+	.description	= "AH4",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_AH,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= ah_init_state,
+	.destructor	= ah_destroy,
+	.input		= ah_input,
+	.output		= ah_output
+};
+
+/*
+ * Protocol hooks for IPPROTO_AH: packets are handed to xfrm4_rcv and
+ * errors to ah4_err.
+ */
+static const struct net_protocol ah4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.err_handler	=	ah4_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+/*
+ * Module init: register the AH xfrm type, then the IPPROTO_AH protocol
+ * handler.  If the second step fails, the first is undone.  Returns 0 on
+ * success and -EAGAIN on either failure.
+ */
+static int __init ah4_init(void)
+{
+	int err;
+
+	err = xfrm_register_type(&ah_type, AF_INET);
+	if (err < 0) {
+		printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+
+	err = inet_add_protocol(&ah4_protocol, IPPROTO_AH);
+	if (err >= 0)
+		return 0;
+
+	printk(KERN_INFO "ip ah init: can't add protocol\n");
+	xfrm_unregister_type(&ah_type, AF_INET);
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: undo ah4_init() in reverse order.  Failures can only be
+ * logged at this point.
+ */
+static void __exit ah4_fini(void)
+{
+	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+		printk(KERN_INFO "ip ah close: can't remove protocol\n");
+	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+		printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 00000000..d8f852db
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1490 @@
+/* linux/net/ipv4/arp.c
+ *
+ * Copyright (C) 1994 by Florian  La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *		Alan Cox	:	Removed the Ethernet assumptions in
+ *					Florian's code
+ *		Alan Cox	:	Fixed some small errors in the ARP
+ *					logic
+ *		Alan Cox	:	Allow >4K in /proc
+ *		Alan Cox	:	Make ARP add its own protocol entry
+ *		Ross Martin     :       Rewrote arp_rcv() and arp_get_info()
+ *		Stephen Henson	:	Add AX25 support to arp_get_info()
+ *		Alan Cox	:	Drop data when a device is downed.
+ *		Alan Cox	:	Use init_timer().
+ *		Alan Cox	:	Double lock fixes.
+ *		Martin Seine	:	Move the arphdr structure
+ *					to if_arp.h for compatibility.
+ *					with BSD based programs.
+ *		Andrew Tridgell :       Added ARP netmask code and
+ *					re-arranged proxy handling.
+ *		Alan Cox	:	Changed to use notifiers.
+ *		Niibe Yutaka	:	Reply for this device or proxies only.
+ *		Alan Cox	:	Don't proxy across hardware types!
+ *		Jonathan Naylor :	Added support for NET/ROM.
+ *		Mike Shaver     :       RFC1122 checks.
+ *		Jonathan Naylor :	Only lookup the hardware address for
+ *					the correct hardware type.
+ *		Germano Caronni	:	Assorted subtle races.
+ *		Craig Schlenter :	Don't modify permanent entry
+ *					during arp_rcv.
+ *		Russ Nelson	:	Tidied up a few bits.
+ *		Alexey Kuznetsov:	Major changes to caching and behaviour,
+ *					eg intelligent arp probing and
+ *					generation
+ *					of host down events.
+ *		Alan Cox	:	Missing unlock in device events.
+ *		Eckes		:	ARP ioctl control errors.
+ *		Alexey Kuznetsov:	Arp free fix.
+ *		Manuel Rodriguez:	Gratuitous ARP.
+ *              Jonathan Layes  :       Added arpd support through kerneld
+ *                                      message queue (960314)
+ *		Mike Shaver	:	/proc/sys/net/ipv4/arp_* support
+ *		Mike McLagan    :	Routing by source
+ *		Stuart Cheshire	:	Metricom and grat arp fixes
+ *					*** FOR 2.1 clean this up ***
+ *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ *		Alan Cox	:	Took the AP1000 nasty FDDI hack and
+ *					folded into the mainstream FDDI code.
+ *					Ack spit, Linus how did you allow that
+ *					one in...
+ *		Jes Sorensen	:	Make FDDI work again in 2.1.x and
+ *					clean up the APFDDI & gen. FDDI bits.
+ *		Alexey Kuznetsov:	new arp state machine;
+ *					now it is in net/core/neighbour.c.
+ *		Krzysztof Halasa:	Added Frame Relay ARP support.
+ *		Arnaldo C. Melo :	convert /proc/net/arp to seq_file
+ *		Shmulik Hen:		Split arp_send to arp_create and
+ *					arp_xmit so intermediate drivers like
+ *					bonding can change the skb before
+ *					sending (e.g. insert 8021q tag).
+ *		Harald Welte	:	convert to make use of jenkins hash
+ *		Jesper D. Brouer:       Proxy ARP PVLAN RFC 3069 support.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ax25.h>
+#include <net/netrom.h>
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+#include <net/atmclip.h>
+struct neigh_table *clip_tbl_hook;
+EXPORT_SYMBOL(clip_tbl_hook);
+#endif
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ *	Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+/*
+ * Neighbour ops chosen by arp_constructor() for devices that have
+ * header_ops but no header cache callback.
+ */
+static const struct neigh_ops arp_generic_ops = {
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_connected_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * Neighbour ops chosen by arp_constructor() for devices whose header_ops
+ * provide a cache callback.  Unlike arp_generic_ops, the connected path
+ * also goes through neigh_resolve_output.
+ */
+static const struct neigh_ops arp_hh_ops = {
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_resolve_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * Neighbour ops chosen by arp_constructor() for devices without
+ * header_ops: no solicit or error_report hook is installed and every
+ * output path is dev_queue_xmit.
+ */
+static const struct neigh_ops arp_direct_ops = {
+	.family =		AF_INET,
+	.output =		dev_queue_xmit,
+	.connected_output =	dev_queue_xmit,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * Neighbour ops chosen by arp_constructor() for ROSE, AX.25 and NET/ROM
+ * devices; both output paths use neigh_compat_output.
+ */
+static const struct neigh_ops arp_broken_ops = {
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_compat_output,
+	.connected_output =	neigh_compat_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * The IPv4 ARP neighbour table.  Keys are 4 bytes long (key_len), and
+ * each entry reserves 4 bytes beyond struct neighbour (entry_size).
+ * Time values in .parms and gc_interval are expressed in multiples of HZ.
+ */
+struct neigh_table arp_tbl = {
+	.family		= AF_INET,
+	.entry_size	= sizeof(struct neighbour) + 4,
+	.key_len	= 4,
+	.hash		= arp_hash,
+	.constructor	= arp_constructor,
+	.proxy_redo	= parp_redo,
+	.id		= "arp_cache",
+	.parms		= {
+		.tbl			= &arp_tbl,
+		.base_reachable_time	= 30 * HZ,
+		.retrans_time		= 1 * HZ,
+		.gc_staletime		= 60 * HZ,
+		.reachable_time		= 30 * HZ,
+		.delay_probe_time	= 5 * HZ,
+		.queue_len		= 3,
+		.ucast_probes		= 3,
+		.mcast_probes		= 3,
+		.anycast_delay		= 1 * HZ,
+		.proxy_delay		= (8 * HZ) / 10,
+		.proxy_qlen		= 64,
+		.locktime		= 1 * HZ,
+	},
+	.gc_interval	= 30 * HZ,
+	.gc_thresh1	= 128,
+	.gc_thresh2	= 512,
+	.gc_thresh3	= 1024,
+};
+EXPORT_SYMBOL(arp_tbl);
+
+/*
+ * Map an IPv4 multicast address to a link-layer address for @dev.
+ *
+ * @addr:  IPv4 multicast address
+ * @haddr: output buffer for the hardware address
+ * @dev:   device whose type selects the mapping
+ * @dir:   for device types without a dedicated mapping, non-zero means
+ *         "use the device broadcast address"
+ *
+ * Returns 0 when @haddr was filled in, -EINVAL when the device type has
+ * no mapping and @dir is zero.
+ */
+int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802:
+		ip_eth_mc_map(addr, haddr);
+		break;
+	case ARPHRD_IEEE802_TR:
+		ip_tr_mc_map(addr, haddr);
+		break;
+	case ARPHRD_INFINIBAND:
+		ip_ib_mc_map(addr, dev->broadcast, haddr);
+		break;
+	case ARPHRD_IPGRE:
+		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+		break;
+	default:
+		if (!dir)
+			return -EINVAL;
+		memcpy(haddr, dev->broadcast, dev->addr_len);
+		break;
+	}
+	return 0;
+}
+
+
+/*
+ * Hash callback of arp_tbl: mixes the 32-bit key with the interface
+ * index of @dev, using @hash_rnd as the jhash initial value.
+ */
+static u32 arp_hash(const void *pkey,
+		    const struct net_device *dev,
+		    __u32 hash_rnd)
+{
+	u32 key = *(u32 *)pkey;
+
+	return jhash_2words(key, dev->ifindex, hash_rnd);
+}
+
+/*
+ * Constructor callback of arp_tbl: initialise a new neighbour entry.
+ *
+ * Classifies the address, switches the entry to the device's own ARP
+ * parameters, and then picks the neigh_ops, the initial NUD state and
+ * the output function according to the device type and flags.
+ *
+ * Returns 0 on success, -EINVAL if the device has no in_device.
+ */
+static int arp_constructor(struct neighbour *neigh)
+{
+	__be32 addr = *(__be32 *)neigh->primary_key;
+	struct net_device *dev = neigh->dev;
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (in_dev == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	neigh->type = inet_addr_type(dev_net(dev), addr);
+
+	/* Replace the current parms reference with the device's ARP parms. */
+	parms = in_dev->arp_parms;
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
+	rcu_read_unlock();
+
+	if (!dev->header_ops) {
+		/* No link-layer header to build: nothing to resolve. */
+		neigh->nud_state = NUD_NOARP;
+		neigh->ops = &arp_direct_ops;
+		neigh->output = neigh->ops->queue_xmit;
+	} else {
+		/* Good devices (checked by reading texts, but only Ethernet is
+		   tested)
+
+		   ARPHRD_ETHER: (ethernet, apfddi)
+		   ARPHRD_FDDI: (fddi)
+		   ARPHRD_IEEE802: (tr)
+		   ARPHRD_METRICOM: (strip)
+		   ARPHRD_ARCNET:
+		   etc. etc. etc.
+
+		   ARPHRD_IPDDP will also work, if its author repairs it.
+		   I did not do it, because this driver does not work even
+		   in the old paradigm.
+		 */
+
+#if 1
+		/* So... these "amateur" devices are hopeless.
+		   The only thing that I can say now:
+		   It is very sad that we need to keep ugly obsolete
+		   code to make them happy.
+
+		   They should be moved to a more reasonable state; for now
+		   they use rebuild_header INSTEAD OF hard_start_xmit!!!
+		   Besides that, they are sort of out of date
+		   (a lot of redundant clones/copies, useless in 2.1),
+		   I wonder why people believe that they work.
+		 */
+		switch (dev->type) {
+		default:
+			break;
+		case ARPHRD_ROSE:
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+		case ARPHRD_AX25:
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+		case ARPHRD_NETROM:
+#endif
+			neigh->ops = &arp_broken_ops;
+			neigh->output = neigh->ops->output;
+			return 0;
+#else
+			/* Without AX.25 support ROSE is handled like any other type. */
+			break;
+#endif
+		}
+#endif
+		/* Addresses that never need resolving get a fixed hardware address. */
+		if (neigh->type == RTN_MULTICAST) {
+			neigh->nud_state = NUD_NOARP;
+			arp_mc_map(addr, neigh->ha, dev, 1);
+		} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+		} else if (neigh->type == RTN_BROADCAST ||
+			   (dev->flags & IFF_POINTOPOINT)) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+		}
+
+		if (dev->header_ops->cache)
+			neigh->ops = &arp_hh_ops;
+		else
+			neigh->ops = &arp_generic_ops;
+
+		/* Start on the connected path only if the state is already valid. */
+		if (neigh->nud_state & NUD_VALID)
+			neigh->output = neigh->ops->connected_output;
+		else
+			neigh->output = neigh->ops->output;
+	}
+	return 0;
+}
+
+/*
+ * error_report hook of the ARP neigh_ops: report a link failure for the
+ * skb's destination and free the skb.  @neigh is not used.
+ */
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+	dst_link_failure(skb);
+	kfree_skb(skb);
+}
+
+/*
+ * solicit hook of the ARP neigh_ops: send an ARP request for @neigh.
+ *
+ * @neigh: neighbour entry to probe
+ * @skb:   packet that triggered the probe, or NULL; its source address
+ *         may be used as the sender address of the request
+ *
+ * The sender address is picked according to the device's arp_announce
+ * setting.  While fewer than ucast_probes probes have been sent the
+ * request is unicast to the hardware address already stored in the
+ * entry; after that it is either handed to the userspace ARP daemon
+ * (CONFIG_ARPD) or sent without a destination hardware address.
+ */
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	__be32 saddr = 0;
+	u8  *dst_ha = NULL;
+	struct net_device *dev = neigh->dev;
+	__be32 target = *(__be32 *)neigh->primary_key;
+	int probes = atomic_read(&neigh->probes);
+	struct in_device *in_dev;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return;
+	}
+	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+	default:
+	case 0:		/* By default announce any local IP */
+		if (skb && inet_addr_type(dev_net(dev),
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
+			saddr = ip_hdr(skb)->saddr;
+		break;
+	case 1:		/* Restrict announcements of saddr in same subnet */
+		if (!skb)
+			break;
+		saddr = ip_hdr(skb)->saddr;
+		if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+			/* saddr should be known to target */
+			if (inet_addr_onlink(in_dev, target, saddr))
+				break;
+		}
+		saddr = 0;
+		break;
+	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
+		break;
+	}
+	rcu_read_unlock();
+
+	/* No usable source address from the packet: let the stack choose one. */
+	if (!saddr)
+		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+	probes -= neigh->parms->ucast_probes;
+	if (probes < 0) {
+		/* Still within the unicast probe budget. */
+		if (!(neigh->nud_state & NUD_VALID))
+			printk(KERN_DEBUG
+			       "trying to ucast probe in NUD_INVALID\n");
+		dst_ha = neigh->ha;
+		/* Held across arp_send(); released below when dst_ha is set. */
+		read_lock_bh(&neigh->lock);
+	} else {
+		probes -= neigh->parms->app_probes;
+		if (probes < 0) {
+#ifdef CONFIG_ARPD
+			neigh_app_ns(neigh);
+#endif
+			return;
+		}
+	}
+
+	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+		 dst_ha, dev->dev_addr, NULL);
+	if (dst_ha)
+		read_unlock_bh(&neigh->lock);
+}
+
+/*
+ * Apply the device's arp_ignore setting to a request from @sip for @tip.
+ *
+ * Returns non-zero if the request must be ignored, 0 if it may be
+ * answered.
+ */
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
+{
+	switch (IN_DEV_ARP_IGNORE(in_dev)) {
+	case 1:
+		/* Reply only if tip is configured on the incoming interface */
+		return !inet_confirm_addr(in_dev, 0, tip, RT_SCOPE_HOST);
+	case 2:
+		/*
+		 * Reply only if tip is configured on the incoming interface
+		 * and is in same subnet as sip
+		 */
+		return !inet_confirm_addr(in_dev, sip, tip, RT_SCOPE_HOST);
+	case 3:
+		/* Do not reply for scope host addresses */
+		return !inet_confirm_addr(in_dev, 0, tip, RT_SCOPE_LINK);
+	case 8:
+		/* Do not reply */
+		return 1;
+	default:
+		/*
+		 * 0: reply, the tip is already validated.
+		 * 4-7 are reserved; they and any other value also reply.
+		 */
+		return 0;
+	}
+}
+
+static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct rtable *rt = ip_route_output(net, sip, tip, 0, 0);
+	int wrong_dev;
+
+	/* No route at all: filter the request out. */
+	if (IS_ERR(rt))
+		return 1;
+
+	/* Filter (and count) when the route uses a different device. */
+	wrong_dev = rt->dst.dev != dev;
+	if (wrong_dev)
+		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
+	ip_rt_put(rt);
+	return wrong_dev;
+}
+
+/* OBSOLETE FUNCTIONS */
+
+/*
+ *	Find an ARP mapping in the cache. If not found, post a request.
+ *
+ *	This is a very UGLY routine: it DOES NOT use skb->dst->neighbour,
+ *	even if it exists. It assumes that skb->dev was mangled
+ *	by a virtual device (eql, shaper). Nobody but broken devices
+ *	may use this function; it is scheduled to be removed. --ANK
+ */
+
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+			      __be32 paddr, struct net_device *dev)
+{
+	if (addr_hint == RTN_LOCAL) {	/* our own address: use dev_addr */
+		printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+		memcpy(haddr, dev->dev_addr, dev->addr_len);
+		return 1;
+	}
+	if (addr_hint == RTN_MULTICAST) {	/* haddr is filled by arp_mc_map() */
+		arp_mc_map(paddr, haddr, dev, 1);
+		return 1;
+	}
+	if (addr_hint != RTN_BROADCAST)
+		return 0;	/* no predefined answer; haddr is left untouched */
+	memcpy(haddr, dev->broadcast, dev->addr_len);
+	return 1;	/* haddr now holds the device broadcast address */
+}
+
+
+int arp_find(unsigned char *haddr, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	__be32 paddr;
+	struct neighbour *n;
+
+	if (!skb_dst(skb)) {
+		printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+		kfree_skb(skb);
+		return 1;	/* failure; the skb has been freed */
+	}
+
+	paddr = skb_rtable(skb)->rt_gateway;	/* the address to resolve is the route's gateway */
+
+	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+			       paddr, dev))
+		return 0;	/* haddr was filled in without a cache lookup */
+
+	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
+
+	if (n) {
+		n->used = jiffies;
+		if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+			neigh_ha_snapshot(haddr, n, dev);
+			neigh_release(n);
+			return 0;	/* success; haddr copied from the entry */
+		}
+		neigh_release(n);	/* skb is not freed here; it was passed to neigh_event_send() */
+	} else
+		kfree_skb(skb);	/* lookup returned nothing: drop the skb */
+	return 1;
+}
+EXPORT_SYMBOL(arp_find);
+
+/* END OF OBSOLETE FUNCTIONS */
+
+struct neighbour *__arp_bind_neighbour(struct dst_entry *dst, __be32 nexthop)
+{
+	struct net_device *dev = dst->dev;
+
+	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+		nexthop = 0;	/* loopback and point-to-point devices look up key 0 */
+	return __neigh_lookup_errno(	/* arp_bind_neighbour() tests the result with IS_ERR() */
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+		dev->type == ARPHRD_ATM ?	/* ATM devices use clip_tbl_hook, not arp_tbl */
+		clip_tbl_hook :
+#endif
+		&arp_tbl, &nexthop, dev);
+}
+
+int arp_bind_neighbour(struct dst_entry *dst)
+{
+	struct net_device *dev = dst->dev;
+	struct neighbour *n = dst_get_neighbour(dst);
+
+	if (!dev)
+		return -EINVAL;
+	if (n)
+		return 0;	/* a neighbour is already attached to this dst */
+	n = __arp_bind_neighbour(dst, ((struct rtable *)dst)->rt_gateway);
+	if (IS_ERR(n))
+		return PTR_ERR(n);
+	dst_set_neighbour(dst, n);
+	return 0;
+}
+
+/*
+ * Check if we can use proxy ARP for this path.
+ * Returns 1 if the request may be answered by proxy, 0 otherwise.
+ */
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+				struct net_device *dev,	struct rtable *rt)
+{
+	struct in_device *out_dev;
+	int in_medium, out_medium;
+
+	/* Needs a route out of another device and proxy ARP enabled. */
+	if (rt->dst.dev == dev || !IN_DEV_PROXY_ARP(in_dev))
+		return 0;
+
+	/* Incoming medium id 0 always proxies, -1 never does. */
+	in_medium = IN_DEV_MEDIUM_ID(in_dev);
+	if (in_medium == 0)
+		return 1;
+	if (in_medium == -1)
+		return 0;
+
+	/* place to check for proxy_arp for routes */
+
+	/* Otherwise proxy only onto a known, different medium. */
+	out_dev = __in_dev_get_rcu(rt->dst.dev);
+	out_medium = out_dev ? IN_DEV_MEDIUM_ID(out_dev) : -1;
+	return out_medium != -1 && out_medium != in_medium;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface.  This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router.  As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ *  This technology is known by different names:
+ *    In RFC 3069 it is called VLAN Aggregation.
+ *    Cisco and Allied Telesyn call it Private VLAN.
+ *    Hewlett-Packard call it Source-Port filtering or port-isolation.
+ *    Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+				struct net_device *dev,	struct rtable *rt,
+				__be32 sip, __be32 tip)
+{
+	/*
+	 * Private VLAN is only about replying back onto the very device
+	 * the request arrived on.
+	 */
+	if (rt->dst.dev != dev)
+		return 0;
+
+	/* Never answer self probes (sender IP equal to target IP). */
+	if (sip == tip)
+		return 0;
+
+	return IN_DEV_PROXY_ARP_PVLAN(in_dev) ? 1 : 0;
+}
+
+/*
+ *	Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ *	Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ *	message.
+ */
+struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
+			   struct net_device *dev, __be32 src_ip,
+			   const unsigned char *dest_hw,
+			   const unsigned char *src_hw,
+			   const unsigned char *target_hw)
+{
+	struct sk_buff *skb;
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+
+	/*
+	 *	Allocate a buffer; NULL is returned if that fails
+	 */
+
+	skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reset_network_header(skb);
+	arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_ARP);
+	if (src_hw == NULL)
+		src_hw = dev->dev_addr;	/* default: the device's own address */
+	if (dest_hw == NULL)
+		dest_hw = dev->broadcast;	/* default: the device broadcast address */
+
+	/*
+	 *	Fill the device header for the ARP frame
+	 */
+	if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
+		goto out;	/* header could not be built: free skb, return NULL */
+
+	/*
+	 * Fill out the arp protocol part.
+	 *
+	 * The arp hardware type should match the device type, except for FDDI,
+	 * which (according to RFC 1390) should always equal 1 (Ethernet).
+	 */
+	/*
+	 *	Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+	 *	DIX code for the protocol. Make these device structure fields.
+	 */
+	switch (dev->type) {
+	default:
+		arp->ar_hrd = htons(dev->type);
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	case ARPHRD_AX25:
+		arp->ar_hrd = htons(ARPHRD_AX25);
+		arp->ar_pro = htons(AX25_P_IP);
+		break;
+
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+	case ARPHRD_NETROM:
+		arp->ar_hrd = htons(ARPHRD_NETROM);
+		arp->ar_pro = htons(AX25_P_IP);
+		break;
+#endif
+#endif
+
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
+	case ARPHRD_FDDI:
+		arp->ar_hrd = htons(ARPHRD_ETHER);	/* FDDI is announced as Ethernet */
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+#endif
+#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
+	case ARPHRD_IEEE802_TR:
+		arp->ar_hrd = htons(ARPHRD_IEEE802);	/* Token Ring is announced as IEEE 802 */
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+#endif
+	}
+
+	arp->ar_hln = dev->addr_len;
+	arp->ar_pln = 4;	/* protocol address length: 4 bytes */
+	arp->ar_op = htons(type);
+
+	arp_ptr = (unsigned char *)(arp + 1);	/* variable part starts right after struct arphdr */
+
+	memcpy(arp_ptr, src_hw, dev->addr_len);	/* sender hardware address */
+	arp_ptr += dev->addr_len;
+	memcpy(arp_ptr, &src_ip, 4);	/* sender IP address */
+	arp_ptr += 4;
+	if (target_hw != NULL)
+		memcpy(arp_ptr, target_hw, dev->addr_len);	/* target hardware address */
+	else
+		memset(arp_ptr, 0, dev->addr_len);	/* none given: all zeroes */
+	arp_ptr += dev->addr_len;
+	memcpy(arp_ptr, &dest_ip, 4);	/* target IP address */
+
+	return skb;
+
+out:
+	kfree_skb(skb);
+	return NULL;
+}
+EXPORT_SYMBOL(arp_create);
+
+/*
+ *	Send an arp packet.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+	/* Pass the skb to the NF_ARP_OUT hook, with dev_queue_xmit as its continuation.  */
+	NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+}
+EXPORT_SYMBOL(arp_xmit);
+
+/*
+ *	Create and send an arp packet.
+ */
+void arp_send(int type, int ptype, __be32 dest_ip,
+	      struct net_device *dev, __be32 src_ip,
+	      const unsigned char *dest_hw, const unsigned char *src_hw,
+	      const unsigned char *target_hw)
+{
+	struct sk_buff *skb;
+
+	/*
+	 *	Nothing is sent on an interface flagged IFF_NOARP.
+	 */
+	if (dev->flags & IFF_NOARP)
+		return;
+
+	/*
+	 *	Build the packet and transmit it; a failed build sends nothing.
+	 */
+	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+			 dest_hw, src_hw, target_hw);
+	if (skb != NULL)
+		arp_xmit(skb);
+}
+EXPORT_SYMBOL(arp_send);
+
+/*
+ *	Process an arp request.
+ */
+
+static int arp_process(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+	struct rtable *rt;
+	unsigned char *sha;	/* sender hardware address, points into the packet */
+	__be32 sip, tip;	/* sender and target IP addresses */
+	u16 dev_type = dev->type;
+	int addr_type;
+	struct neighbour *n;
+	struct net *net = dev_net(dev);
+
+	/* arp_rcv below verifies the ARP header and verifies the device
+	 * is ARP'able.
+	 */
+
+	if (in_dev == NULL)
+		goto out;
+
+	arp = arp_hdr(skb);
+
+	switch (dev_type) {	/* drop packets whose hardware/protocol type does not fit the device */
+	default:
+		if (arp->ar_pro != htons(ETH_P_IP) ||
+		    htons(dev_type) != arp->ar_hrd)
+			goto out;
+		break;
+	case ARPHRD_ETHER:
+	case ARPHRD_IEEE802_TR:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802:
+		/*
+		 * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+		 * devices, according to RFC 2625) devices will accept ARP
+		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+		 * This is the case also of FDDI, where the RFC 1390 says that
+		 * FDDI devices should accept ARP hardware of (1) Ethernet,
+		 * however, to be more robust, we'll accept both 1 (Ethernet)
+		 * or 6 (IEEE 802.2)
+		 */
+		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+		    arp->ar_pro != htons(ETH_P_IP))
+			goto out;
+		break;
+	case ARPHRD_AX25:
+		if (arp->ar_pro != htons(AX25_P_IP) ||
+		    arp->ar_hrd != htons(ARPHRD_AX25))
+			goto out;
+		break;
+	case ARPHRD_NETROM:
+		if (arp->ar_pro != htons(AX25_P_IP) ||
+		    arp->ar_hrd != htons(ARPHRD_NETROM))
+			goto out;
+		break;
+	}
+
+	/* Understand only these message types */
+
+	if (arp->ar_op != htons(ARPOP_REPLY) &&
+	    arp->ar_op != htons(ARPOP_REQUEST))
+		goto out;
+
+/*
+ *	Extract fields
+ */
+	arp_ptr = (unsigned char *)(arp + 1);
+	sha	= arp_ptr;
+	arp_ptr += dev->addr_len;
+	memcpy(&sip, arp_ptr, 4);
+	arp_ptr += 4;
+	arp_ptr += dev->addr_len;	/* skip the target hardware address */
+	memcpy(&tip, arp_ptr, 4);
+/*
+ *	Check for bad requests for 127.x.x.x and requests for multicast
+ *	addresses.  If this is one such, delete it.
+ */
+	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+		goto out;
+
+/*
+ *     Special case: We must set Frame Relay source Q.922 address
+ */
+	if (dev_type == ARPHRD_DLCI)
+		sha = dev->broadcast;
+
+/*
+ *  Process entry.  The idea here is we want to send a reply if it is a
+ *  request for us or if it is a request for someone else that we hold
+ *  a proxy for.  We want to add an entry to our cache if it is a reply
+ *  to us or if it is a request for our address.
+ *  (The assumption for this last is that if someone is requesting our
+ *  address, they are probably intending to talk to us, so it saves time
+ *  if we cache their address.  Their address is also probably not in
+ *  our cache, since ours is not in their cache.)
+ *
+ *  Putting this another way, we only care about replies if they are to
+ *  us, in which case we add them to the cache.  For requests, we care
+ *  about those for us and those for our proxies.  We reply to both,
+ *  and in the case of requests for us we add the requester to the arp
+ *  cache.
+ */
+
+	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
+	if (sip == 0) {
+		if (arp->ar_op == htons(ARPOP_REQUEST) &&
+		    inet_addr_type(net, tip) == RTN_LOCAL &&
+		    !arp_ignore(in_dev, sip, tip))
+			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+				 dev->dev_addr, sha);
+		goto out;	/* a zero sender IP never updates the ARP table */
+	}
+
+	if (arp->ar_op == htons(ARPOP_REQUEST) &&
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+
+		rt = skb_rtable(skb);
+		addr_type = rt->rt_type;
+
+		if (addr_type == RTN_LOCAL) {	/* request for one of our own addresses */
+			int dont_send;
+
+			dont_send = arp_ignore(in_dev, sip, tip);
+			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+				dont_send = arp_filter(sip, tip, dev);
+			if (!dont_send) {
+				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+				if (n) {	/* reply only if the requester could be recorded */
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+						 dev, tip, sha, dev->dev_addr,
+						 sha);
+					neigh_release(n);
+				}
+			}
+			goto out;
+		} else if (IN_DEV_FORWARD(in_dev)) {	/* proxy ARP is considered only when forwarding */
+			if (addr_type == RTN_UNICAST  &&
+			    (arp_fwd_proxy(in_dev, dev, rt) ||
+			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+			     (rt->dst.dev != dev &&
+			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+				if (n)
+					neigh_release(n);
+
+				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
+				    skb->pkt_type == PACKET_HOST ||
+				    in_dev->arp_parms->proxy_delay == 0) {
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+						 dev, tip, sha, dev->dev_addr,
+						 sha);
+				} else {
+					pneigh_enqueue(&arp_tbl,
+						       in_dev->arp_parms, skb);
+					return 0;	/* skb went to pneigh_enqueue(); not consumed here */
+				}
+				goto out;
+			}
+		}
+	}
+
+	/* Update our ARP tables */
+
+	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+	if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) {
+		/* Unsolicited ARP is not accepted by default.
+		   It is possible, that this option should be enabled for some
+		   devices (strip is candidate)
+		 */
+		if (n == NULL &&
+		    (arp->ar_op == htons(ARPOP_REPLY) ||
+		     (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
+		    inet_addr_type(net, sip) == RTN_UNICAST)
+			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
+	}
+
+	if (n) {
+		int state = NUD_REACHABLE;
+		int override;
+
+		/* If several different ARP replies follows back-to-back,
+		   use the FIRST one. It is possible, if several proxy
+		   agents are active. Taking the first reply prevents
+		   arp thrashing and chooses the fastest router.
+		 */
+		override = time_after(jiffies, n->updated + n->parms->locktime);
+
+		/* Broadcast replies and request packets
+		   do not assert neighbour reachability.
+		 */
+		if (arp->ar_op != htons(ARPOP_REPLY) ||
+		    skb->pkt_type != PACKET_HOST)
+			state = NUD_STALE;
+		neigh_update(n, sha, state,
+			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+		neigh_release(n);
+	}
+
+out:
+	consume_skb(skb);	/* every path except the pneigh_enqueue() one ends here */
+	return 0;
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+	arp_process(skb);	/* run the skb through arp_process(); its return value is dropped */
+}
+
+
+/*
+ *	Receive an arp request from the device layer.
+ */
+
+static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct arphdr *arp;
+
+	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
+	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+		goto freeskb;
+
+	arp = arp_hdr(skb);
+	if (arp->ar_hln != dev->addr_len ||
+	    dev->flags & IFF_NOARP ||
+	    skb->pkt_type == PACKET_OTHERHOST ||
+	    skb->pkt_type == PACKET_LOOPBACK ||
+	    arp->ar_pln != 4)
+		goto freeskb;	/* wrong address lengths, no-ARP device, or unwanted packet type */
+
+	skb = skb_share_check(skb, GFP_ATOMIC);	/* skb may change here; arp is not used afterwards */
+	if (skb == NULL)
+		goto out_of_mem;	/* skb is NULL, so there is nothing to free */
+
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));	/* clear the control block arp_process() reads flags from */
+
+	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+
+freeskb:
+	kfree_skb(skb);
+out_of_mem:
+	return 0;	/* 0 is returned on every dropped-packet path */
+}
+
+/*
+ *	User level interface (ioctl)
+ */
+
+/*
+ *	Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
+{
+	struct in_device *in_dev = dev ? __in_dev_get_rtnl(dev) : NULL;
+
+	if (!dev)
+		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+	else if (in_dev)
+		IN_DEV_CONF_SET(in_dev, PROXY_ARP, on);
+	else
+		return -ENXIO;
+	return 0;
+}
+
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+		struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+	if (mask && mask != htonl(0xFFFFFFFF))
+		return -EINVAL;	/* only a zero or an all-ones netmask is accepted */
+	if (!dev && (r->arp_flags & ATF_COM)) {	/* no device given: find one by hardware address */
+		dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+				      r->arp_ha.sa_data);
+		if (!dev)
+			return -ENODEV;
+	}
+	if (mask) {	/* all-ones mask: proxy entry for this single IP */
+		if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL)
+			return -ENOBUFS;
+		return 0;
+	}
+
+	return arp_req_set_proxy(net, dev, 1);	/* zero mask: switch proxy ARP on */
+}
+
+static int arp_req_set(struct net *net, struct arpreq *r,
+		       struct net_device *dev)
+{
+	__be32 ip;
+	struct neighbour *neigh;
+	int err;
+
+	if (r->arp_flags & ATF_PUBL)	/* published entries are handled separately */
+		return arp_req_set_public(net, r, dev);
+
+	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	if (r->arp_flags & ATF_PERM)
+		r->arp_flags |= ATF_COM;	/* permanent implies complete */
+	if (dev == NULL) {	/* no device given: take the one the route to ip uses */
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
+		dev = rt->dst.dev;
+		ip_rt_put(rt);
+		if (!dev)
+			return -EINVAL;
+	}
+	switch (dev->type) {	/* the supplied hardware type must suit the device */
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
+	case ARPHRD_FDDI:
+		/*
+		 * According to RFC 1390, FDDI devices should accept ARP
+		 * hardware types of 1 (Ethernet).  However, to be more
+		 * robust, we'll accept hardware types of either 1 (Ethernet)
+		 * or 6 (IEEE 802.2).
+		 */
+		if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+		    r->arp_ha.sa_family != ARPHRD_ETHER &&
+		    r->arp_ha.sa_family != ARPHRD_IEEE802)
+			return -EINVAL;
+		break;
+#endif
+	default:
+		if (r->arp_ha.sa_family != dev->type)
+			return -EINVAL;
+		break;
+	}
+
+	neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+	err = PTR_ERR(neigh);	/* only meaningful if neigh is an error pointer; overwritten below otherwise */
+	if (!IS_ERR(neigh)) {
+		unsigned state = NUD_STALE;
+		if (r->arp_flags & ATF_PERM)
+			state = NUD_PERMANENT;
+		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
+				   r->arp_ha.sa_data : NULL, state,
+				   NEIGH_UPDATE_F_OVERRIDE |
+				   NEIGH_UPDATE_F_ADMIN);
+		neigh_release(neigh);
+	}
+	return err;
+}
+
+static unsigned arp_state_to_flags(struct neighbour *neigh)
+{
+	/* Permanent entries report as both permanent and complete. */
+	if (neigh->nud_state & NUD_PERMANENT)
+		return ATF_PERM | ATF_COM;
+
+	/* Any other NUD_VALID state is complete; the rest get no flags. */
+	return (neigh->nud_state & NUD_VALID) ? ATF_COM : 0;
+}
+
+/*
+ *	Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+
+	if (!neigh)
+		return -ENXIO;
+
+	/* Hardware address and flags are read together under the lock. */
+	read_lock_bh(&neigh->lock);
+	memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+	r->arp_flags = arp_state_to_flags(neigh);
+	read_unlock_bh(&neigh->lock);
+
+	r->arp_ha.sa_family = dev->type;
+	strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+	neigh_release(neigh);
+	return 0;
+}
+
+int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	int err = -ENXIO;
+
+	if (!neigh)
+		return err;
+	/* With no state bit other than NUD_NOARP set, err stays -ENXIO. */
+	if (neigh->nud_state & ~NUD_NOARP)
+		err = neigh_update(neigh, NULL, NUD_FAILED,
+				   NEIGH_UPDATE_F_OVERRIDE |
+				   NEIGH_UPDATE_F_ADMIN);
+	neigh_release(neigh);
+	return err;
+}
+EXPORT_SYMBOL(arp_invalidate);
+
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+		struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+	/* All-ones mask: delete the proxy entry for this one address. */
+	if (mask == htonl(0xFFFFFFFF))
+		return pneigh_delete(&arp_tbl, net, &ip, dev);
+
+	/* Zero mask: switch proxy ARP off.  Any other mask is invalid. */
+	return mask ? -EINVAL :
+		      arp_req_set_proxy(net, dev, 0);
+}
+
+static int arp_req_delete(struct net *net, struct arpreq *r,
+			  struct net_device *dev)
+{
+	__be32 ip;
+
+	if (r->arp_flags & ATF_PUBL)	/* published entries are handled separately */
+		return arp_req_delete_public(net, r, dev);
+
+	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	if (dev == NULL) {	/* no device given: take the one the route to ip uses */
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
+		dev = rt->dst.dev;
+		ip_rt_put(rt);
+		if (!dev)
+			return -EINVAL;
+	}
+	return arp_invalidate(dev, ip);	/* -ENXIO if there is no such entry */
+}
+
+/*
+ *	Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	int err;
+	struct arpreq r;	/* kernel copy of the user's request */
+	struct net_device *dev = NULL;
+
+	switch (cmd) {
+	case SIOCDARP:
+	case SIOCSARP:
+		if (!capable(CAP_NET_ADMIN))	/* delete and set need CAP_NET_ADMIN */
+			return -EPERM;
+	case SIOCGARP:	/* SIOCDARP/SIOCSARP deliberately fall through to here */
+		err = copy_from_user(&r, arg, sizeof(struct arpreq));
+		if (err)
+			return -EFAULT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (r.arp_pa.sa_family != AF_INET)
+		return -EPFNOSUPPORT;
+
+	if (!(r.arp_flags & ATF_PUBL) &&
+	    (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
+		return -EINVAL;	/* these two flags are only valid together with ATF_PUBL */
+	if (!(r.arp_flags & ATF_NETMASK))	/* no netmask supplied: default to all ones */
+		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+							   htonl(0xFFFFFFFFUL);
+	rtnl_lock();	/* held until the "out" label */
+	if (r.arp_dev[0]) {
+		err = -ENODEV;
+		dev = __dev_get_by_name(net, r.arp_dev);
+		if (dev == NULL)
+			goto out;
+
+		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+		if (!r.arp_ha.sa_family)
+			r.arp_ha.sa_family = dev->type;
+		err = -EINVAL;
+		if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+			goto out;
+	} else if (cmd == SIOCGARP) {	/* a get request must name a device */
+		err = -ENODEV;
+		goto out;
+	}
+
+	switch (cmd) {
+	case SIOCDARP:
+		err = arp_req_delete(net, &r, dev);
+		break;
+	case SIOCSARP:
+		err = arp_req_set(net, &r, dev);
+		break;
+	case SIOCGARP:
+		err = arp_req_get(&r, dev);
+		break;
+	}
+out:
+	rtnl_unlock();
+	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))	/* copy back only after a successful get */
+		err = -EFAULT;
+	return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	/*
+	 * Only NETDEV_CHANGEADDR is acted upon; every other event is
+	 * ignored.  The result is NOTIFY_DONE in all cases.
+	 */
+	if (event == NETDEV_CHANGEADDR) {
+		neigh_changeaddr(&arp_tbl, dev);
+		rt_cache_flush(dev_net(dev), 0);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block arp_netdev_notifier = {	/* registered by arp_init() */
+	.notifier_call = arp_netdev_event,
+};
+
+/* Note that this is not on the notifier chain.
+   This routine has to be called after the route cache has been
+   flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+	neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ *	Called once on startup.
+ */
+
+static struct packet_type arp_packet_type __read_mostly = {	/* added by arp_init() via dev_add_pack() */
+	.type =	cpu_to_be16(ETH_P_ARP),
+	.func =	arp_rcv,	/* receive handler for ETH_P_ARP */
+};
+
+static int arp_proc_init(void);
+
+void __init arp_init(void)
+{
+	neigh_table_init(&arp_tbl);
+
+	dev_add_pack(&arp_packet_type);	/* hooks arp_rcv() up for ETH_P_ARP */
+	arp_proc_init();	/* return value is not checked */
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
+#endif
+	register_netdevice_notifier(&arp_netdev_notifier);	/* return value is not checked */
+}
+
+#ifdef CONFIG_PROC_FS
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+
+/* ------------------------------------------------------------------------ */
+/*
+ *	ax25 -> ASCII conversion
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+	char c, *s;
+	int n;
+
+	for (n = 0, s = buf; n < 6; n++) {	/* first six bytes of ax25_call */
+		c = (a->ax25_call[n] >> 1) & 0x7F;	/* character is held in the upper seven bits */
+
+		if (c != ' ')	/* spaces are not copied */
+			*s++ = c;
+	}
+
+	*s++ = '-';
+	n = (a->ax25_call[6] >> 1) & 0x0F;	/* 4-bit number taken from the seventh byte */
+	if (n > 9) {	/* two decimal digits: emit the leading '1' */
+		*s++ = '1';
+		n -= 10;
+	}
+
+	*s++ = n + '0';
+	*s++ = '\0';
+
+	if (*buf == '\0' || *buf == '-')	/* no usable character before the '-' */
+		return "*";
+
+	return buf;	/* at most 6 + 1 + 2 + 1 = 10 bytes were written */
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+				   struct neighbour *n)
+{
+	char hbuffer[HBUFFERLEN];	/* hardware address as text */
+	int k, j;
+	char tbuf[16];	/* dotted-quad text: "255.255.255.255" plus the NUL */
+	struct net_device *dev = n->dev;
+	int hatype = dev->type;
+
+	read_lock(&n->lock);	/* n->ha and nud_state are read under the entry lock */
+	/* Convert hardware address to XX:XX:XX:XX ... form. */
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+		ax2asc2((ax25_address *)n->ha, hbuffer);
+	else {
+#endif
+	for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+		hbuffer[k++] = hex_asc_hi(n->ha[j]);
+		hbuffer[k++] = hex_asc_lo(n->ha[j]);
+		hbuffer[k++] = ':';
+	}
+	if (k != 0)
+		--k;	/* the terminator overwrites the trailing ':' */
+	hbuffer[k] = 0;
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	}
+#endif
+	snprintf(tbuf, sizeof(tbuf), "%pI4", n->primary_key);	/* write bounded by tbuf's size */
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
+		   tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+	read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+				    struct pneigh_entry *n)
+{
+	struct net_device *dev = n->dev;	/* may be NULL; printed as "*" then */
+	int hatype = dev ? dev->type : 0;
+	char tbuf[16];	/* dotted-quad text: "255.255.255.255" plus the NUL */
+
+	snprintf(tbuf, sizeof(tbuf), "%pI4", n->key);	/* write bounded by tbuf's size */
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
+		   tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+		   dev ? dev->name : "*");
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+	struct neigh_seq_state *state;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "IP address       HW type     Flags       "
+			      "HW address            Mask     Device\n");
+		return 0;
+	}
+	/* v is a proxy entry or a neighbour, as told by the iterator flags. */
+	state = seq->private;
+	if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+		arp_format_pneigh_entry(seq, v);
+	else
+		arp_format_neigh_entry(seq, v);
+	return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* Don't want to confuse "arp -a" with magic entries, so
+	 * NEIGH_SEQ_SKIP_NOARP tells the generic iterator to skip NUD_NOARP.
+	 */
+	return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static const struct seq_operations arp_seq_ops = {	/* used by arp_seq_open() */
+	.start	= arp_seq_start,
+	.next	= neigh_seq_next,
+	.stop	= neigh_seq_stop,
+	.show	= arp_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &arp_seq_ops,	/* arp_seq_show() reads seq->private as this state */
+			    sizeof(struct neigh_seq_state));
+}
+
+static const struct file_operations arp_seq_fops = {	/* file operations of the "arp" proc entry */
+	.owner		= THIS_MODULE,
+	.open           = arp_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_net,	/* counterpart of seq_open_net() in arp_seq_open() */
+};
+
+
+static int __net_init arp_net_init(struct net *net)
+{
+	/* Create this namespace's "arp" proc entry; -ENOMEM on failure. */
+	return proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops) ?
+		0 : -ENOMEM;
+}
+
+static void __net_exit arp_net_exit(struct net *net)
+{
+	proc_net_remove(net, "arp");	/* removes the entry created in arp_net_init() */
+}
+
+static struct pernet_operations arp_net_ops = {	/* registered by arp_proc_init() */
+	.init = arp_net_init,
+	.exit = arp_net_exit,
+};
+
+static int __init arp_proc_init(void)
+{
+	return register_pernet_subsys(&arp_net_ops);	/* CONFIG_PROC_FS variant; result is passed back unchanged */
+}
+
+#else /* CONFIG_PROC_FS */
+
+static int __init arp_proc_init(void)
+{
+	return 0;	/* stub used when CONFIG_PROC_FS is not set */
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
new file mode 100644
index 00000000..2b3c23c2
--- /dev/null
+++ b/net/ipv4/cipso_ipv4.c
@@ -0,0 +1,2368 @@
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188.  While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ *   http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ *   http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
+#include <asm/bug.h>
+#include <asm/unaligned.h>
+
+/* List of available DOI definitions */
+/* XXX - This currently assumes a minimal number of different DOIs in use,
+ * if in practice there are a lot of different DOIs this list should
+ * probably be turned into a hash table or something similar so we
+ * can do quick lookups. */
+static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
+static LIST_HEAD(cipso_v4_doi_list);
+
+/* Label mapping cache */
+int cipso_v4_cache_enabled = 1;
+int cipso_v4_cache_bucketsize = 10;
+#define CIPSO_V4_CACHE_BUCKETBITS     7
+#define CIPSO_V4_CACHE_BUCKETS        (1 << CIPSO_V4_CACHE_BUCKETBITS)
+#define CIPSO_V4_CACHE_REORDERLIMIT   10
+/* One hash bucket of the label mapping cache. */
+struct cipso_v4_map_cache_bkt {
+	/* held (with BHs disabled) by the cache check/add/invalidate paths */
+	spinlock_t lock;
+	/* entry count; bumped on add, reset to zero on invalidate */
+	u32 size;
+	/* list of struct cipso_v4_map_cache_entry, linked through ->list */
+	struct list_head list;
+};
+/* A single cached mapping from a raw CIPSO option to LSM cache data. */
+struct cipso_v4_map_cache_entry {
+	/* cipso_v4_map_cache_hash() of @key; also selects the bucket */
+	u32 hash;
+	/* private copy of the option bytes, owned (and freed) by the entry */
+	unsigned char *key;
+	/* length of @key in bytes */
+	size_t key_len;
+
+	/* reference-counted LSM data handed out on a cache hit */
+	struct netlbl_lsm_cache *lsm_data;
+
+	/* hit counter used to reorder entries within a bucket */
+	u32 activity;
+	/* linkage on the owning bucket's list */
+	struct list_head list;
+};
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL;
+
+/* Restricted bitmap (tag #1) flags */
+int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_strictvalid = 1;
+
+/*
+ * Protocol Constants
+ */
+
+/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
+ * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
+#define CIPSO_V4_OPT_LEN_MAX          40
+
+/* Length of the base CIPSO option, this includes the option type (1 byte), the
+ * option length (1 byte), and the DOI (4 bytes). */
+#define CIPSO_V4_HDR_LEN              6
+
+/* Base length of the restrictive category bitmap tag (tag #1). */
+#define CIPSO_V4_TAG_RBM_BLEN         4
+
+/* Base length of the enumerated category tag (tag #2). */
+#define CIPSO_V4_TAG_ENUM_BLEN        4
+
+/* Base length of the ranged categories bitmap tag (tag #5). */
+#define CIPSO_V4_TAG_RNG_BLEN         4
+/* The maximum number of category ranges permitted in the ranged category tag
+ * (tag #5).  You may note that the IETF draft states that the maximum number
+ * of category ranges is 7, but if the low end of the last category range is
+ * zero then it is possible to fit 8 category ranges because the zero should
+ * be omitted. */
+#define CIPSO_V4_TAG_RNG_CAT_MAX      8
+
+/* Base length of the local tag (non-standard tag).
+ *  Tag definition (may change between kernel versions)
+ *
+ * 0          8          16         24         32
+ * +----------+----------+----------+----------+
+ * | 10000000 | 00000110 | 32-bit secid value  |
+ * +----------+----------+----------+----------+
+ * | in (host byte order)|
+ * +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN         6
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right (most significant
+ * bit of each byte first) until either the desired bit is found or we reach
+ * the end.  Returns the bit offset, or -1 if no matching bit exists at or
+ * after @offset; an @offset at or beyond @bitmap_len is treated as "not
+ * found".  Some callers also test for -2 (error), which this implementation
+ * never returns.  No byte outside the first @bitmap_len bits is ever read.
+ */
+static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
+				u32 bitmap_len,
+				u32 offset,
+				u8 state)
+{
+	u32 bit_spot;
+	u32 byte_offset;
+	unsigned char bitmask;
+	unsigned char byte;
+
+	/* callers resume with "last hit + 1", which may equal bitmap_len;
+	 * bail out before touching bitmap[offset / 8] in that case */
+	if (offset >= bitmap_len)
+		return -1;
+
+	/* gcc always rounds to zero when doing integer division */
+	byte_offset = offset / 8;
+	byte = bitmap[byte_offset];
+	bit_spot = offset;
+	bitmask = 0x80 >> (offset % 8);
+
+	while (bit_spot < bitmap_len) {
+		if ((state && (byte & bitmask) == bitmask) ||
+		    (state == 0 && (byte & bitmask) == 0))
+			return bit_spot;
+
+		/* stop before loading the next byte if it lies past the end */
+		if (++bit_spot >= bitmap_len)
+			return -1;
+		bitmask >>= 1;
+		if (bitmask == 0) {
+			byte = bitmap[++byte_offset];
+			bitmask = 0x80;
+		}
+	}
+
+	return -1;
+}
+
+/**
+ * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set or clear bit number @bit of @bitmap, counting from the most
+ * significant bit of the first byte.  No bounds checking is done here and
+ * nothing is returned.
+ */
+static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
+				   u32 bit,
+				   u8 state)
+{
+	unsigned char *target = &bitmap[bit / 8];
+	u8 mask = 0x80 >> (bit % 8);
+
+	if (!state) {
+		*target &= ~mask;
+		return;
+	}
+	*target |= mask;
+}
+
+/**
+ * cipso_v4_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry: the LSM
+ * cache data is released through netlbl_secattr_cache_free() (if the entry
+ * has any), then the key copy and the entry itself are freed.  The entry
+ * must already be unlinked from its bucket list.
+ *
+ */
+static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry)
+{
+	/* a partially built entry (see cipso_v4_cache_add()) has no LSM data */
+	if (entry->lsm_data)
+		netlbl_secattr_cache_free(entry->lsm_data);
+	kfree(entry->key);
+	kfree(entry);
+}
+
+/**
+ * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CIPSO tag hashing function.  Returns the 32-bit jhash() of the key,
+ * computed with an initial value of zero.
+ *
+ */
+static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+/**
+ * cipso_v4_cache_init - Initialize the CIPSO cache
+ *
+ * Description:
+ * Initializes the CIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file.  Allocates the
+ * array of CIPSO_V4_CACHE_BUCKETS buckets and sets up each bucket's lock and
+ * (empty) list.  Returns zero on success, -ENOMEM if the bucket array cannot
+ * be allocated.
+ *
+ */
+static int cipso_v4_cache_init(void)
+{
+	u32 iter;
+
+	cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
+				 sizeof(struct cipso_v4_map_cache_bkt),
+				 GFP_KERNEL);
+	if (cipso_v4_cache == NULL)
+		return -ENOMEM;
+
+	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+		spin_lock_init(&cipso_v4_cache[iter].lock);
+		cipso_v4_cache[iter].size = 0;
+		INIT_LIST_HEAD(&cipso_v4_cache[iter].list);
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CIPSO cache.  Each bucket is
+ * emptied under its own lock, one bucket at a time.  This function does not
+ * return a value.
+ *
+ */
+void cipso_v4_cache_invalidate(void)
+{
+	struct cipso_v4_map_cache_entry *entry, *tmp_entry;
+	u32 iter;
+
+	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+		spin_lock_bh(&cipso_v4_cache[iter].lock);
+		/* _safe variant: the current entry is freed inside the loop */
+		list_for_each_entry_safe(entry,
+					 tmp_entry,
+					 &cipso_v4_cache[iter].list, list) {
+			list_del(&entry->list);
+			cipso_v4_cache_entry_free(entry);
+		}
+		cipso_v4_cache[iter].size = 0;
+		spin_unlock_bh(&cipso_v4_cache[iter].lock);
+	}
+}
+
+/**
+ * cipso_v4_cache_check - Check the CIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key.  If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes; a
+ * reference is taken on the cached LSM data on behalf of @secattr.  The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ *  1. The cache entry's activity counter is incremented
+ *  2. The previous (higher ranking) entry's activity counter is decremented
+ *  3. If the difference between the two activity counters is greater than
+ *     CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success and -ENOENT for a cache miss (which is also what
+ * is returned when the cache is disabled).
+ *
+ */
+static int cipso_v4_cache_check(const unsigned char *key,
+				u32 key_len,
+				struct netlbl_lsm_secattr *secattr)
+{
+	u32 bkt;
+	struct cipso_v4_map_cache_entry *entry;
+	struct cipso_v4_map_cache_entry *prev_entry = NULL;
+	u32 hash;
+
+	if (!cipso_v4_cache_enabled)
+		return -ENOENT;
+
+	hash = cipso_v4_map_cache_hash(key, key_len);
+	/* the bucket count is a power of two, so masking picks the bucket */
+	bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1);
+	spin_lock_bh(&cipso_v4_cache[bkt].lock);
+	list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
+		/* cheap comparisons first, full key compare last */
+		if (entry->hash == hash &&
+		    entry->key_len == key_len &&
+		    memcmp(entry->key, key, key_len) == 0) {
+			entry->activity += 1;
+			atomic_inc(&entry->lsm_data->refcount);
+			secattr->cache = entry->lsm_data;
+			secattr->flags |= NETLBL_SECATTR_CACHE;
+			secattr->type = NETLBL_NLTYPE_CIPSOV4;
+			/* already at the head of the bucket: nothing to reorder */
+			if (prev_entry == NULL) {
+				spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+				return 0;
+			}
+
+			if (prev_entry->activity > 0)
+				prev_entry->activity -= 1;
+			if (entry->activity > prev_entry->activity &&
+			    entry->activity - prev_entry->activity >
+			    CIPSO_V4_CACHE_REORDERLIMIT) {
+				/* move the hit one position towards the head,
+				 * i.e. directly in front of prev_entry */
+				__list_del(entry->list.prev, entry->list.next);
+				__list_add(&entry->list,
+					   prev_entry->list.prev,
+					   &prev_entry->list);
+			}
+
+			spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+			return 0;
+		}
+		prev_entry = entry;
+	}
+	spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+
+	return -ENOENT;
+}
+
+/**
+ * cipso_v4_cache_add - Add an entry to the CIPSO cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CIPSO label mapping cache.  Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first.  It is important to note that there is
+ * currently no checking for duplicate keys.  The cache key is a copy of the
+ * packet's CIPSO option and the entry takes its own reference on
+ * @secattr->cache.  Returns zero on success (including when the cache is
+ * disabled and nothing is added), negative values on failure.
+ *
+ */
+int cipso_v4_cache_add(const struct sk_buff *skb,
+		       const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	u32 bkt;
+	struct cipso_v4_map_cache_entry *entry = NULL;
+	struct cipso_v4_map_cache_entry *old_entry = NULL;
+	unsigned char *cipso_ptr;
+	u32 cipso_ptr_len;
+
+	if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+		return 0;
+
+	cipso_ptr = CIPSO_V4_OPTPTR(skb);
+	/* second byte of the option is its total length */
+	cipso_ptr_len = cipso_ptr[1];
+
+	/* GFP_ATOMIC allocations: this function never sleeps */
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+	entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC);
+	if (entry->key == NULL) {
+		ret_val = -ENOMEM;
+		goto cache_add_failure;
+	}
+	entry->key_len = cipso_ptr_len;
+	entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len);
+	atomic_inc(&secattr->cache->refcount);
+	entry->lsm_data = secattr->cache;
+
+	bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
+	spin_lock_bh(&cipso_v4_cache[bkt].lock);
+	if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+		list_add(&entry->list, &cipso_v4_cache[bkt].list);
+		cipso_v4_cache[bkt].size += 1;
+	} else {
+		/* bucket full: evict the tail entry, size stays the same */
+		old_entry = list_entry(cipso_v4_cache[bkt].list.prev,
+				       struct cipso_v4_map_cache_entry, list);
+		list_del(&old_entry->list);
+		list_add(&entry->list, &cipso_v4_cache[bkt].list);
+		cipso_v4_cache_entry_free(old_entry);
+	}
+	spin_unlock_bh(&cipso_v4_cache[bkt].lock);
+
+	return 0;
+
+cache_add_failure:
+	if (entry)
+		cipso_v4_cache_entry_free(entry);
+	return ret_val;
+}
+
+/*
+ * DOI List Functions
+ */
+
+/**
+ * cipso_v4_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi.  Definitions whose reference count has already dropped to
+ * zero are skipped.  The caller is responsible for calling
+ * rcu_read_[un]lock().  Returns a pointer to the DOI definition on success
+ * and NULL on failure; no reference is taken on the returned definition.
+ */
+static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
+{
+	struct cipso_v4_doi *iter;
+
+	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
+		if (iter->doi == doi && atomic_read(&iter->refcount))
+			return iter;
+	return NULL;
+}
+
+/**
+ * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CIPSO engine and calls this
+ * function to add it to the list of acceptable domains.  The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see cipso_ipv4.h for details).  The tag
+ * list is checked against the mapping type before the definition is linked
+ * in with an initial reference count of one.  An audit record is emitted
+ * for both successful and failed attempts.  Returns zero on success,
+ * -EINVAL for a NULL or malformed definition and -EEXIST if the DOI is
+ * already defined.
+ *
+ */
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+		     struct netlbl_audit *audit_info)
+{
+	int ret_val = -EINVAL;
+	u32 iter;
+	u32 doi = CIPSO_V4_DOI_UNKNOWN;
+	u32 doi_type = CIPSO_V4_MAP_UNKNOWN;
+	struct audit_buffer *audit_buf;
+
+	/* check the pointer before reading through it; the failure is still
+	 * audited, using the "unknown" placeholders set above */
+	if (doi_def == NULL)
+		goto doi_add_return;
+
+	/* cache these now: once the definition is on the list it is shared
+	 * and must not be relied upon for the audit record below */
+	doi = doi_def->doi;
+	doi_type = doi_def->type;
+
+	if (doi == CIPSO_V4_DOI_UNKNOWN)
+		goto doi_add_return;
+	for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
+		switch (doi_def->tags[iter]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			break;
+		case CIPSO_V4_TAG_RANGE:
+		case CIPSO_V4_TAG_ENUM:
+			/* these tags are only handled in pass-through mode */
+			if (doi_def->type != CIPSO_V4_MAP_PASS)
+				goto doi_add_return;
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			if (doi_def->type != CIPSO_V4_MAP_LOCAL)
+				goto doi_add_return;
+			break;
+		case CIPSO_V4_TAG_INVALID:
+			/* at least the first tag slot must be usable */
+			if (iter == 0)
+				goto doi_add_return;
+			break;
+		default:
+			goto doi_add_return;
+		}
+	}
+
+	atomic_set(&doi_def->refcount, 1);
+
+	spin_lock(&cipso_v4_doi_list_lock);
+	if (cipso_v4_doi_search(doi_def->doi) != NULL) {
+		spin_unlock(&cipso_v4_doi_list_lock);
+		ret_val = -EEXIST;
+		goto doi_add_return;
+	}
+	list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+	ret_val = 0;
+
+doi_add_return:
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+	if (audit_buf != NULL) {
+		const char *type_str;
+		switch (doi_type) {
+		case CIPSO_V4_MAP_TRANS:
+			type_str = "trans";
+			break;
+		case CIPSO_V4_MAP_PASS:
+			type_str = "pass";
+			break;
+		case CIPSO_V4_MAP_LOCAL:
+			type_str = "local";
+			break;
+		default:
+			type_str = "(unknown)";
+		}
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u cipso_type=%s res=%u",
+				 doi, type_str, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ * For translated (CIPSO_V4_MAP_TRANS) definitions that includes the four
+ * level/category mapping arrays and the mapping table that holds them.
+ * Passing NULL is a no-op.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL)
+		return;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_TRANS:
+		/* tolerate a definition whose table was never attached */
+		if (doi_def->map.std == NULL)
+			break;
+		kfree(doi_def->map.std->lvl.cipso);
+		kfree(doi_def->map.std->lvl.local);
+		kfree(doi_def->map.std->cat.cipso);
+		kfree(doi_def->map.std->cat.local);
+		/* the table container is its own allocation; without this
+		 * it is leaked on every translated DOI that gets freed */
+		kfree(doi_def->map.std);
+		break;
+	}
+	kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.  It recovers the enclosing definition from its embedded rcu_head
+ * and hands it to cipso_v4_doi_free().
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+	struct cipso_v4_doi *doi_def;
+
+	doi_def = container_of(entry, struct cipso_v4_doi, rcu);
+	cipso_v4_doi_free(doi_def);
+}
+
+/**
+ * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a DOI definition from the CIPSO engine.  The definition is only
+ * unlinked if dropping one reference brings its count to zero; it is then
+ * freed after an RCU grace period and the label mapping cache is flushed.
+ * An audit record is emitted whatever the outcome.  Returns zero on
+ * success, -ENOENT if the DOI is not defined and -EBUSY if the definition
+ * is still referenced.
+ *
+ */
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct cipso_v4_doi *doi_def;
+	struct audit_buffer *audit_buf;
+
+	spin_lock(&cipso_v4_doi_list_lock);
+	doi_def = cipso_v4_doi_search(doi);
+	if (doi_def == NULL) {
+		spin_unlock(&cipso_v4_doi_list_lock);
+		ret_val = -ENOENT;
+		goto doi_remove_return;
+	}
+	/* NOTE(review): on the -EBUSY path the reference dropped here is not
+	 * given back in this function - confirm this is what callers expect */
+	if (!atomic_dec_and_test(&doi_def->refcount)) {
+		spin_unlock(&cipso_v4_doi_list_lock);
+		ret_val = -EBUSY;
+		goto doi_remove_return;
+	}
+	list_del_rcu(&doi_def->list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+
+	/* flush all cached mappings after unlinking the definition */
+	cipso_v4_cache_invalidate();
+	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+	ret_val = 0;
+
+doi_remove_return:
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+	if (audit_buf != NULL) {
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u res=%u",
+				 doi, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Looks up the DOI definition for @doi under rcu_read_lock() and, if one is
+ * found, tries to take a reference on it.  Returns the definition with its
+ * reference count raised, or NULL if there is no such DOI or its reference
+ * count had already reached zero.  The caller must drop the reference when
+ * done with the definition.
+ *
+ */
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+	struct cipso_v4_doi *found;
+
+	rcu_read_lock();
+	found = cipso_v4_doi_search(doi);
+	/* a definition on its way out cannot be pinned any more */
+	if (found != NULL && !atomic_inc_not_zero(&found->refcount))
+		found = NULL;
+	rcu_read_unlock();
+
+	return found;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef().
+ * When the last reference goes away the definition is unlinked from the DOI
+ * list, the label mapping cache is flushed and the definition is freed after
+ * an RCU grace period.  Passing NULL is a no-op.
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL)
+		return;
+
+	/* not the last reference: nothing more to do */
+	if (!atomic_dec_and_test(&doi_def->refcount))
+		return;
+	spin_lock(&cipso_v4_doi_list_lock);
+	list_del_rcu(&doi_def->list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+
+	cipso_v4_cache_invalidate();
+	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+}
+
+/**
+ * cipso_v4_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return.  Updates the value in @skip_cnt upon
+ * return so that it counts the entries skipped or successfully visited.
+ * Returns the value of the last @callback invocation, or -ENOENT if
+ * @callback was never called.
+ *
+ */
+int cipso_v4_doi_walk(u32 *skip_cnt,
+		     int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+		     void *cb_arg)
+{
+	int ret_val = -ENOENT;
+	u32 doi_cnt = 0;
+	struct cipso_v4_doi *iter_doi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
+		/* only definitions that are still referenced are counted */
+		if (atomic_read(&iter_doi->refcount) > 0) {
+			if (doi_cnt++ < *skip_cnt)
+				continue;
+			ret_val = callback(iter_doi, cb_arg);
+			if (ret_val < 0) {
+				/* do not count the entry the callback failed
+				 * on, so it is not skipped the next time */
+				doi_cnt--;
+				goto doi_walk_return;
+			}
+		}
+
+doi_walk_return:
+	rcu_read_unlock();
+	*skip_cnt = doi_cnt;
+	return ret_val;
+}
+
+/*
+ * Label Mapping Functions
+ */
+
+/**
+ * cipso_v4_map_lvl_valid - Checks to see if the given level is understood
+ * @doi_def: the DOI definition
+ * @level: the level to check
+ *
+ * Description:
+ * Checks the given level against the given DOI definition and returns a
+ * negative value if the level does not have a valid mapping and a zero value
+ * if the level is defined by the DOI.  Pass-through DOIs accept every level;
+ * translated DOIs accept a level only if it lies inside the CIPSO level
+ * table and maps to something other than CIPSO_V4_INV_LVL.  Any other
+ * mapping type yields -EFAULT.
+ *
+ */
+static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
+{
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		return 0;
+	case CIPSO_V4_MAP_TRANS:
+		/* bound the index first: the table may be shorter than the
+		 * 256 values a u8 level can take */
+		if (level < doi_def->map.std->lvl.cipso_size &&
+		    doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+			return 0;
+		break;
+	}
+
+	return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network
+ * @doi_def: the DOI definition
+ * @host_lvl: the host MLS level
+ * @net_lvl: the network/CIPSO MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS level to the correct
+ * CIPSO level using the given DOI definition.  Returns zero on success,
+ * -EPERM if a translated DOI has no valid mapping for @host_lvl and -EINVAL
+ * for any other mapping type.  @net_lvl is only written on success.
+ *
+ */
+static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
+				 u32 host_lvl,
+				 u32 *net_lvl)
+{
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		/* pass-through: the level is used unchanged */
+		*net_lvl = host_lvl;
+		return 0;
+	case CIPSO_V4_MAP_TRANS:
+		/* index check first, then reject unmapped table slots */
+		if (host_lvl < doi_def->map.std->lvl.local_size &&
+		    doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) {
+			*net_lvl = doi_def->map.std->lvl.local[host_lvl];
+			return 0;
+		}
+		return -EPERM;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host
+ * @doi_def: the DOI definition
+ * @net_lvl: the network/CIPSO MLS level
+ * @host_lvl: the host MLS level
+ *
+ * Description:
+ * Translate a CIPSO level into the local MLS level for the given DOI
+ * definition.  Pass-through DOIs copy the level unchanged; translated DOIs
+ * look it up in the CIPSO level table.  Returns zero on success, -EPERM if
+ * a translated DOI has no valid mapping for @net_lvl and -EINVAL for any
+ * other mapping type.  @host_lvl is only written on success.
+ *
+ */
+static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
+				 u32 net_lvl,
+				 u32 *host_lvl)
+{
+	const struct cipso_v4_std_map_tbl *tbl;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS) {
+		*host_lvl = net_lvl;
+		return 0;
+	}
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EINVAL;
+
+	tbl = doi_def->map.std;
+	/* out of range, or a slot marked as having no mapping */
+	if (net_lvl >= tbl->lvl.cipso_size ||
+	    tbl->lvl.cipso[net_lvl] >= CIPSO_V4_INV_LVL)
+		return -EPERM;
+
+	*host_lvl = tbl->lvl.cipso[net_lvl];
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid
+ * @doi_def: the DOI definition
+ * @bitmap: category bitmap
+ * @bitmap_len: bitmap length in bytes
+ *
+ * Description:
+ * Checks the given category bitmap against the given DOI definition and
+ * returns a negative value if any of the categories in the bitmap do not have
+ * a valid mapping and a zero value if all of the categories are valid.
+ * Pass-through DOIs accept any bitmap; mapping types other than pass-through
+ * and translated are rejected with -EFAULT.
+ *
+ */
+static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *bitmap,
+				      u32 bitmap_len)
+{
+	int cat = -1;
+	u32 bitmap_len_bits = bitmap_len * 8;
+	u32 cipso_cat_size;
+	u32 *cipso_array;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		return 0;
+	case CIPSO_V4_MAP_TRANS:
+		cipso_cat_size = doi_def->map.std->cat.cipso_size;
+		cipso_array = doi_def->map.std->cat.cipso;
+		/* visit every set bit, resuming just after the previous hit */
+		for (;;) {
+			cat = cipso_v4_bitmap_walk(bitmap,
+						   bitmap_len_bits,
+						   cat + 1,
+						   1);
+			if (cat < 0)
+				break;
+			if (cat >= cipso_cat_size ||
+			    cipso_array[cat] >= CIPSO_V4_INV_CAT)
+				return -EFAULT;
+		}
+
+		/* -1 means the walk ran off the end without finding a bad bit;
+		 * any other negative value falls through to the error return */
+		if (cat == -1)
+			return 0;
+		break;
+	}
+
+	return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO bitmap using the given DOI definition.  Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise:
+ * -EPERM if a translated DOI has no mapping for a host category and -ENOSPC
+ * if a mapped category does not fit in @net_cat_len bytes.  Note that the
+ * returned size is one byte even when no category is set at all.
+ *
+ */
+static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
+				     const struct netlbl_lsm_secattr *secattr,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	int host_spot = -1;
+	u32 net_spot = CIPSO_V4_INV_CAT;
+	u32 net_spot_max = 0;
+	u32 net_clen_bits = net_cat_len * 8;
+	u32 host_cat_size = 0;
+	u32 *host_cat_array = NULL;
+
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+		host_cat_size = doi_def->map.std->cat.local_size;
+		host_cat_array = doi_def->map.std->cat.local;
+	}
+
+	/* walk every category set in the host catmap */
+	for (;;) {
+		host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+						       host_spot + 1);
+		if (host_spot < 0)
+			break;
+
+		switch (doi_def->type) {
+		case CIPSO_V4_MAP_PASS:
+			net_spot = host_spot;
+			break;
+		case CIPSO_V4_MAP_TRANS:
+			if (host_spot >= host_cat_size)
+				return -EPERM;
+			net_spot = host_cat_array[host_spot];
+			if (net_spot >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+			break;
+		}
+		if (net_spot >= net_clen_bits)
+			return -ENOSPC;
+		cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+
+		if (net_spot > net_spot_max)
+			net_spot_max = net_spot;
+	}
+
+	/* highest bit index -> bit count, then round up to whole bytes */
+	if (++net_spot_max % 8)
+		return net_spot_max / 8 + 1;
+	return net_spot_max / 8;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure: -EPERM if a translated DOI has no
+ * mapping for a network category, -EFAULT if the bitmap walk reports an
+ * error, or whatever netlbl_secattr_catmap_setbit() returns on failure.
+ *
+ */
+static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	int net_spot = -1;
+	u32 host_spot = CIPSO_V4_INV_CAT;
+	u32 net_clen_bits = net_cat_len * 8;
+	u32 net_cat_size = 0;
+	u32 *net_cat_array = NULL;
+
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+		net_cat_size = doi_def->map.std->cat.cipso_size;
+		net_cat_array = doi_def->map.std->cat.cipso;
+	}
+
+	/* visit every set bit of the network bitmap */
+	for (;;) {
+		net_spot = cipso_v4_bitmap_walk(net_cat,
+						net_clen_bits,
+						net_spot + 1,
+						1);
+		if (net_spot < 0) {
+			/* -2 is the walk's error value, anything else
+			 * negative means the end of the bitmap was reached */
+			if (net_spot == -2)
+				return -EFAULT;
+			return 0;
+		}
+
+		switch (doi_def->type) {
+		case CIPSO_V4_MAP_PASS:
+			host_spot = net_spot;
+			break;
+		case CIPSO_V4_MAP_TRANS:
+			if (net_spot >= net_cat_size)
+				return -EPERM;
+			host_spot = net_cat_array[net_spot];
+			if (host_spot >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+			break;
+		}
+		ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
+						       host_spot,
+						       GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	/* not reached: the loop above only exits through a return */
+	return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @enumcat: category list
+ * @enumcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.  The list is only accepted
+ * for pass-through DOIs, must have an even length (16-bit big-endian
+ * entries) and must be in strictly ascending order.
+ *
+ */
+static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
+				       const unsigned char *enumcat,
+				       u32 enumcat_len)
+{
+	u16 cat;
+	int cat_prev = -1;
+	u32 iter;
+
+	if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01)
+		return -EFAULT;
+
+	for (iter = 0; iter < enumcat_len; iter += 2) {
+		cat = get_unaligned_be16(&enumcat[iter]);
+		/* cat_prev starts at -1 so that a first category of 0 passes */
+		if (cat <= cat_prev)
+			return -EFAULT;
+		cat_prev = cat;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition.  Each set
+ * category is written as a 16-bit big-endian value, in ascending order.
+ * Returns the size in bytes of the network category list on success and
+ * -ENOSPC if the categories do not fit in @net_cat_len bytes.  @doi_def is
+ * not consulted here.
+ *
+ */
+static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
+				      const struct netlbl_lsm_secattr *secattr,
+				      unsigned char *net_cat,
+				      u32 net_cat_len)
+{
+	int cat = -1;
+	u32 cat_iter = 0;
+
+	for (;;) {
+		cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+						 cat + 1);
+		if (cat < 0)
+			break;
+		if ((cat_iter + 2) > net_cat_len)
+			return -ENOSPC;
+
+		/* @net_cat is a plain byte buffer with no alignment promise,
+		 * so store through the unaligned helper (the readers already
+		 * use get_unaligned_be16()) rather than a cast pointer */
+		put_unaligned_be16(cat, &net_cat[cat_iter]);
+		cat_iter += 2;
+	}
+
+	return cat_iter;
+}
+
+/**
+ * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure.  @doi_def is not consulted here.
+ *
+ */
+static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *net_cat,
+				      u32 net_cat_len,
+				      struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u32 iter;
+
+	/* NOTE(review): reads two bytes per step, so this presumably relies
+	 * on the length having been checked as even beforehand - confirm */
+	for (iter = 0; iter < net_cat_len; iter += 2) {
+		ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
+				get_unaligned_be16(&net_cat[iter]),
+				GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @rngcat: category list
+ * @rngcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if any of the categories do not have a valid mapping and a
+ * zero value if all of the categories are valid.  The list is only accepted
+ * for pass-through DOIs and must have an even length.  It is read as
+ * (high, low) pairs of 16-bit big-endian values; the final pair may omit its
+ * low end, which is then taken to be zero.  Each range's high end must not
+ * exceed the low end of the range before it.
+ *
+ */
+static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *rngcat,
+				      u32 rngcat_len)
+{
+	u16 cat_high;
+	u16 cat_low;
+	u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1;
+	u32 iter;
+
+	if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01)
+		return -EFAULT;
+
+	for (iter = 0; iter < rngcat_len; iter += 4) {
+		cat_high = get_unaligned_be16(&rngcat[iter]);
+		/* a trailing two-byte entry has an implicit low end of zero */
+		if ((iter + 4) <= rngcat_len)
+			cat_low = get_unaligned_be16(&rngcat[iter + 2]);
+		else
+			cat_low = 0;
+
+		if (cat_high > cat_prev)
+			return -EFAULT;
+
+		cat_prev = cat_low;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition.   Returns the
+ * size in bytes of the network category bitmap on success, negative values
+ * otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
+				     const struct netlbl_lsm_secattr *secattr,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	int iter = -1;
+	u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
+	u32 array_cnt = 0;
+	u32 cat_size = 0;
+
+	/* make sure we don't overflow the 'array[]' variable */
+	if (net_cat_len >
+	    (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
+		return -ENOSPC;
+
+	/* collect pairs of values from the local category map: the value
+	 * returned by the walk first, then the value returned by the range
+	 * walk; a first value of zero costs no space because it is left out
+	 * of the network format below */
+	for (;;) {
+		iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+						  iter + 1);
+		if (iter < 0)
+			break;
+		cat_size += (iter == 0 ? 0 : sizeof(u16));
+		if (cat_size > net_cat_len)
+			return -ENOSPC;
+		array[array_cnt++] = iter;
+
+		iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
+						      iter);
+		if (iter < 0)
+			return -EFAULT;
+		cat_size += sizeof(u16);
+		if (cat_size > net_cat_len)
+			return -ENOSPC;
+		array[array_cnt++] = iter;
+	}
+
+	/* write the pairs out in reverse order, second value of each pair
+	 * first, skipping a first value of zero; @net_cat is a plain byte
+	 * buffer so use the unaligned helpers rather than storing through a
+	 * cast __be16 pointer */
+	for (iter = 0; array_cnt > 0;) {
+		put_unaligned_be16(array[--array_cnt], &net_cat[iter]);
+		iter += 2;
+		array_cnt--;
+		if (array[array_cnt] != 0) {
+			put_unaligned_be16(array[array_cnt], &net_cat[iter]);
+			iter += 2;
+		}
+	}
+
+	return cat_size;
+}
+
+/**
+ * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     struct netlbl_lsm_secattr *secattr)
+{
+	u32 off;
+
+	for (off = 0; off < net_cat_len; off += 4) {
+		int err;
+		u16 high = get_unaligned_be16(&net_cat[off]);
+		u16 low = 0;
+
+		/* a missing low bound on the last range means zero */
+		if (off + 4 <= net_cat_len)
+			low = get_unaligned_be16(&net_cat[off + 2]);
+
+		err = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
+						   low,
+						   high,
+						   GFP_ATOMIC);
+		if (err != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Protocol Handling Functions
+ */
+
+/**
+ * cipso_v4_gentag_hdr - Generate a CIPSO option header
+ * @doi_def: the DOI definition
+ * @len: the total tag length in bytes, not including this header
+ * @buf: the CIPSO option buffer
+ *
+ * Description:
+ * Write a CIPSO header into the beginning of @buf.
+ *
+ */
+static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
+				unsigned char *buf,
+				u32 len)
+{
+	/* option type, then total option length (header plus tags) */
+	buf[0] = IPOPT_CIPSO;
+	buf[1] = CIPSO_V4_HDR_LEN + len;
+	/* the DOI sits at byte offset 2 of a byte buffer, so store it with
+	 * the unaligned helper instead of through a cast __be32 pointer */
+	put_unaligned_be32(doi_def->doi, &buf[2]);
+}
+
+/**
+ * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the restricted bitmap tag, tag type #1.  The
+ * actual buffer length may be larger than the indicated size due to
+ * translation between host and network category bitmaps.  Returns the size of
+ * the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	/* a tag without categories is just the four byte tag header */
+	u32 tag_len = 4;
+	u32 level;
+	int rc;
+
+	if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+		return -EPERM;
+
+	rc = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level);
+	if (rc != 0)
+		return rc;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		rc = cipso_v4_map_cat_rbm_hton(doi_def, secattr,
+					       &buffer[4], buffer_len - 4);
+		if (rc < 0)
+			return rc;
+
+		/* This will send packets using the "optimized" format when
+		 * possible as specified in section 3.4.2.6 of the
+		 * CIPSO draft. */
+		if (cipso_v4_rbm_optfmt && rc > 0 && rc <= 10)
+			tag_len = 14;
+		else
+			tag_len = 4 + rc;
+	}
+
+	/* byte 2 of the tag is not written here */
+	buffer[0] = CIPSO_V4_TAG_RBITMAP;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security
+ * attributes in @secattr.  Return zero on success, negative values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	u8 len = tag[1];
+	u32 lvl;
+	int rc;
+
+	/* translate the sensitivity level carried in byte 3 of the tag */
+	rc = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &lvl);
+	if (rc != 0)
+		return rc;
+	secattr->attr.mls.lvl = lvl;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	/* nothing beyond the four byte tag header means no categories */
+	if (len <= 4)
+		return 0;
+
+	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (secattr->attr.mls.cat == NULL)
+		return -ENOMEM;
+
+	rc = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], len - 4, secattr);
+	if (rc != 0) {
+		netlbl_secattr_catmap_free(secattr->attr.mls.cat);
+		return rc;
+	}
+	secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the enumerated tag, tag type #2.  Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
+				const struct netlbl_lsm_secattr *secattr,
+				unsigned char *buffer,
+				u32 buffer_len)
+{
+	/* a tag without categories is just the four byte tag header */
+	u32 tag_len = 4;
+	u32 level;
+	int rc;
+
+	if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+		return -EPERM;
+
+	rc = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level);
+	if (rc != 0)
+		return rc;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		/* the category list is written directly after the header */
+		rc = cipso_v4_map_cat_enum_hton(doi_def, secattr,
+						&buffer[4], buffer_len - 4);
+		if (rc < 0)
+			return rc;
+
+		tag_len = 4 + rc;
+	}
+
+	/* byte 2 of the tag is not written here */
+	buffer[0] = CIPSO_V4_TAG_ENUM;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO enumerated tag (tag type #2) and return the security
+ * attributes in @secattr.  Return zero on success, negative values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
+				  const unsigned char *tag,
+				  struct netlbl_lsm_secattr *secattr)
+{
+	u8 len = tag[1];
+	u32 lvl;
+	int rc;
+
+	/* translate the sensitivity level carried in byte 3 of the tag */
+	rc = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &lvl);
+	if (rc != 0)
+		return rc;
+	secattr->attr.mls.lvl = lvl;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	/* nothing beyond the four byte tag header means no categories */
+	if (len <= 4)
+		return 0;
+
+	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (secattr->attr.mls.cat == NULL)
+		return -ENOMEM;
+
+	rc = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], len - 4, secattr);
+	if (rc != 0) {
+		netlbl_secattr_catmap_free(secattr->attr.mls.cat);
+		return rc;
+	}
+	secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the ranged tag, tag type #5.  Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	/* a tag without categories is just the four byte tag header */
+	u32 tag_len = 4;
+	u32 level;
+	int rc;
+
+	if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+		return -EPERM;
+
+	rc = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level);
+	if (rc != 0)
+		return rc;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		/* the category ranges are written directly after the header */
+		rc = cipso_v4_map_cat_rng_hton(doi_def, secattr,
+					       &buffer[4], buffer_len - 4);
+		if (rc < 0)
+			return rc;
+
+		tag_len = 4 + rc;
+	}
+
+	/* byte 2 of the tag is not written here */
+	buffer[0] = CIPSO_V4_TAG_RANGE;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO ranged tag (tag type #5) and return the security attributes
+ * in @secattr.  Return zero on success, negative values on failure.
+ *
+ */
+static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	u8 len = tag[1];
+	u32 lvl;
+	int rc;
+
+	/* translate the sensitivity level carried in byte 3 of the tag */
+	rc = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &lvl);
+	if (rc != 0)
+		return rc;
+	secattr->attr.mls.lvl = lvl;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	/* nothing beyond the four byte tag header means no categories */
+	if (len <= 4)
+		return 0;
+
+	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (secattr->attr.mls.cat == NULL)
+		return -ENOMEM;
+
+	rc = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], len - 4, secattr);
+	if (rc != 0) {
+		netlbl_secattr_catmap_free(secattr->attr.mls.cat);
+		return rc;
+	}
+	secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag.  Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	u32 secid;
+
+	/* the local tag carries a secid, so one must be present */
+	if (!(secattr->flags & NETLBL_SECATTR_SECID))
+		return -EPERM;
+
+	buffer[0] = CIPSO_V4_TAG_LOCAL;
+	buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+	/* copy the secid in host byte order, byte by byte, rather than
+	 * storing through a cast u32 pointer into a byte buffer whose
+	 * alignment is not guaranteed */
+	secid = secattr->attr.secid;
+	memcpy(&buffer[2], &secid, sizeof(secid));
+
+	return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negative values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	u32 secid;
+
+	/* the secid follows the two byte tag header in host byte order;
+	 * copy it out byte by byte rather than loading through a cast u32
+	 * pointer since @tag points into raw option bytes */
+	memcpy(&secid, &tag[2], sizeof(secid));
+	secattr->attr.secid = secid;
+	secattr->flags |= NETLBL_SECATTR_SECID;
+
+	return 0;
+}
+
+/**
+ * cipso_v4_validate - Validate a CIPSO option
+ * @skb: the packet; used to check the receiving device for the local tag
+ * @option: the start of the option, on error it is set to point to the error
+ *
+ * Description:
+ * This routine is called to validate a CIPSO option, it checks all of the
+ * fields to ensure that they are at least valid, see the draft snippet below
+ * for details.  If the option is valid then a zero value is returned and
+ * the value of @option is unchanged.  If the option is invalid then a
+ * non-zero value is returned and @option is adjusted to point to the
+ * offending portion of the option.  From the IETF draft ...
+ *
+ *  "If any field within the CIPSO options, such as the DOI identifier, is not
+ *   recognized the IP datagram is discarded and an ICMP 'parameter problem'
+ *   (type 12) is generated and returned.  The ICMP code field is set to 'bad
+ *   parameter' (code 0) and the pointer is set to the start of the CIPSO field
+ *   that is unrecognized."
+ *
+ */
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
+{
+	unsigned char *opt = *option;
+	unsigned char *tag;
+	unsigned char opt_iter;
+	unsigned char err_offset = 0;
+	u8 opt_len;
+	u8 tag_len;
+	struct cipso_v4_doi *doi_def = NULL;
+	u32 tag_iter;
+
+	/* caller already checks for length values that are too large */
+	opt_len = opt[1];
+	if (opt_len < 8) {
+		err_offset = 1;
+		goto validate_return;
+	}
+
+	/* the DOI definition is looked up and used under the RCU read lock */
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
+	if (doi_def == NULL) {
+		err_offset = 2;
+		goto validate_return_locked;
+	}
+
+	/* walk the tags that follow the option header one at a time */
+	opt_iter = CIPSO_V4_HDR_LEN;
+	tag = opt + opt_iter;
+	while (opt_iter < opt_len) {
+		/* the tag type must be one of the types listed in the DOI */
+		for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
+			if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID ||
+			    ++tag_iter == CIPSO_V4_TAG_MAXCNT) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+
+		/* the tag length octet must itself lie inside the option,
+		 * otherwise tag[1] below would be read past the option end */
+		if (opt_iter + 1 == opt_len) {
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
+		tag_len = tag[1];
+		if (tag_len > (opt_len - opt_iter)) {
+			err_offset = opt_iter + 1;
+			goto validate_return_locked;
+		}
+
+		switch (tag[0]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			/* We are already going to do all the verification
+			 * necessary at the socket layer so from our point of
+			 * view it is safe to turn these checks off (and less
+			 * work), however, the CIPSO draft says we should do
+			 * all the CIPSO validations here but it doesn't
+			 * really specify _exactly_ what we need to validate
+			 * ... so, just make it a sysctl tunable. */
+			if (cipso_v4_rbm_strictvalid) {
+				if (cipso_v4_map_lvl_valid(doi_def,
+							   tag[3]) < 0) {
+					err_offset = opt_iter + 3;
+					goto validate_return_locked;
+				}
+				if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
+				    cipso_v4_map_cat_rbm_valid(doi_def,
+							    &tag[4],
+							    tag_len - 4) < 0) {
+					err_offset = opt_iter + 4;
+					goto validate_return_locked;
+				}
+			}
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			if (cipso_v4_map_lvl_valid(doi_def,
+						   tag[3]) < 0) {
+				err_offset = opt_iter + 3;
+				goto validate_return_locked;
+			}
+			if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
+			    cipso_v4_map_cat_enum_valid(doi_def,
+							&tag[4],
+							tag_len - 4) < 0) {
+				err_offset = opt_iter + 4;
+				goto validate_return_locked;
+			}
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			if (cipso_v4_map_lvl_valid(doi_def,
+						   tag[3]) < 0) {
+				err_offset = opt_iter + 3;
+				goto validate_return_locked;
+			}
+			if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
+			    cipso_v4_map_cat_rng_valid(doi_def,
+						       &tag[4],
+						       tag_len - 4) < 0) {
+				err_offset = opt_iter + 4;
+				goto validate_return_locked;
+			}
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			/* This is a non-standard tag that we only allow for
+			 * local connections, so if the incoming interface is
+			 * not the loopback device drop the packet.  Further,
+			 * there is no packet at all when the options are
+			 * being validated without one (presumably when they
+			 * are set on a socket), so reject a NULL @skb rather
+			 * than dereferencing it. */
+			if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+			if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+			break;
+		default:
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
+
+		tag += tag_len;
+		opt_iter += tag_len;
+	}
+
+validate_return_locked:
+	rcu_read_unlock();
+validate_return:
+	/* on success err_offset is zero so @option is left unchanged */
+	*option = opt + err_offset;
+	return err_offset;
+}
+
+/**
+ * cipso_v4_error - Send the correct response for a bad packet
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: CIPSO gateway flag
+ *
+ * Description:
+ * Based on the error code given in @error, send an ICMP error message back to
+ * the originating host.  From the IETF draft ...
+ *
+ *  "If the contents of the CIPSO [option] are valid but the security label is
+ *   outside of the configured host or port label range, the datagram is
+ *   discarded and an ICMP 'destination unreachable' (type 3) is generated and
+ *   returned.  The code field of the ICMP is set to 'communication with
+ *   destination network administratively prohibited' (code 9) or to
+ *   'communication with destination host administratively prohibited'
+ *   (code 10).  The value of the code is dependent on whether the originator
+ *   of the ICMP message is acting as a CIPSO host or a CIPSO gateway.  The
+ *   recipient of the ICMP message MUST be able to handle either value.  The
+ *   same procedure is performed if a CIPSO [option] can not be added to an
+ *   IP packet because it is too large to fit in the IP options area."
+ *
+ *  "If the error is triggered by receipt of an ICMP message, the message is
+ *   discarded and no response is permitted (consistent with general ICMP
+ *   processing rules)."
+ *
+ */
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
+{
+	/* never answer an ICMP packet with another ICMP packet */
+	if (ip_hdr(skb)->protocol == IPPROTO_ICMP)
+		return;
+	/* only access denials generate a response */
+	if (error != -EACCES)
+		return;
+
+	/* gateways report "network prohibited", hosts "host prohibited" */
+	icmp_send(skb, ICMP_DEST_UNREACH,
+		  gateway ? ICMP_NET_ANO : ICMP_HOST_ANO, 0);
+}
+
+/**
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of @buf
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function.  Returns the length of the option on success and
+ * negative values on failure.
+ *
+ */
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+			   const struct cipso_v4_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr)
+{
+	unsigned char *tag_buf;
+	u32 tag_buf_len;
+	u32 iter = 0;
+	int ret_val;
+
+	/* there must be room for at least one byte after the header */
+	if (buf_len <= CIPSO_V4_HDR_LEN)
+		return -ENOSPC;
+	tag_buf = &buf[CIPSO_V4_HDR_LEN];
+	tag_buf_len = buf_len - CIPSO_V4_HDR_LEN;
+
+	/* XXX - This code assumes only one tag per CIPSO option which isn't
+	 * really a good assumption to make but since we only support the MAC
+	 * tags right now it is a safe assumption. */
+
+	/* try the DOI's tag types in order until one of them can be
+	 * generated; the buffer is cleared before every attempt */
+	for (;;) {
+		memset(buf, 0, buf_len);
+		switch (doi_def->tags[iter]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			ret_val = cipso_v4_gentag_rbm(doi_def, secattr,
+						      tag_buf, tag_buf_len);
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			ret_val = cipso_v4_gentag_enum(doi_def, secattr,
+						       tag_buf, tag_buf_len);
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			ret_val = cipso_v4_gentag_rng(doi_def, secattr,
+						      tag_buf, tag_buf_len);
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			ret_val = cipso_v4_gentag_loc(doi_def, secattr,
+						      tag_buf, tag_buf_len);
+			break;
+		default:
+			return -EPERM;
+		}
+
+		iter++;
+		if (ret_val >= 0)
+			break;
+		if (iter >= CIPSO_V4_TAG_MAXCNT)
+			break;
+		if (doi_def->tags[iter] == CIPSO_V4_TAG_INVALID)
+			break;
+	}
+	if (ret_val < 0)
+		return ret_val;
+
+	/* the tag is in place, now prepend the option header */
+	cipso_v4_gentag_hdr(doi_def, buf, ret_val);
+	return CIPSO_V4_HDR_LEN + ret_val;
+}
+
+/* RCU callback: free the ip_options_rcu structure that embeds @head */
+static void opt_kfree_rcu(struct rcu_head *head)
+{
+	struct ip_options_rcu *opt;
+
+	opt = container_of(head, struct ip_options_rcu, rcu);
+	kfree(opt);
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked.  Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+			  const struct cipso_v4_doi *doi_def,
+			  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	unsigned char *buf = NULL;
+	u32 buf_len;
+	u32 opt_len;
+	struct ip_options_rcu *old, *opt = NULL;
+	struct inet_sock *sk_inet;
+	struct inet_connection_sock *sk_conn;
+
+	/* In the case of sock_create_lite(), the sock->sk field is not
+	 * defined yet but it is not a problem as the only users of these
+	 * "lite" PF_INET sockets are functions which do an accept() call
+	 * afterwards so we will label the socket as part of the accept(). */
+	if (sk == NULL)
+		return 0;
+
+	/* We allocate the maximum CIPSO option size here so we are probably
+	 * being a little wasteful, but it makes our life _much_ easier later
+	 * on and after all we are only talking about 40 bytes. */
+	buf_len = CIPSO_V4_OPT_LEN_MAX;
+	buf = kmalloc(buf_len, GFP_ATOMIC);
+	if (buf == NULL) {
+		ret_val = -ENOMEM;
+		goto socket_setattr_failure;
+	}
+
+	/* on success buf_len becomes the actual length of the option */
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		goto socket_setattr_failure;
+	buf_len = ret_val;
+
+	/* We can't use ip_options_get() directly because it makes a call to
+	 * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+	 * we won't always have CAP_NET_RAW even though we _always_ want to
+	 * set the IPOPT_CIPSO option. */
+	/* round the option length up to a multiple of four bytes; the
+	 * structure is zero-allocated so the padding bytes stay zero */
+	opt_len = (buf_len + 3) & ~3;
+	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+	if (opt == NULL) {
+		ret_val = -ENOMEM;
+		goto socket_setattr_failure;
+	}
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	/* the CIPSO option is the first option, right after the IP header */
+	opt->opt.cipso = sizeof(struct iphdr);
+	kfree(buf);
+	buf = NULL;
+
+	sk_inet = inet_sk(sk);
+
+	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+	/* for connection sockets swap the old option length for the new one
+	 * in the extension header length and resync the MSS */
+	if (sk_inet->is_icsk) {
+		sk_conn = inet_csk(sk);
+		if (old)
+			sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+		sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
+		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+	}
+	/* publish the new options; any previous options are replaced as a
+	 * whole and freed through call_rcu() */
+	rcu_assign_pointer(sk_inet->inet_opt, opt);
+	if (old)
+		call_rcu(&old->rcu, opt_kfree_rcu);
+
+	return 0;
+
+socket_setattr_failure:
+	/* either pointer may still be NULL here */
+	kfree(buf);
+	kfree(opt);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+			 const struct cipso_v4_doi *doi_def,
+			 const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	unsigned char *buf = NULL;
+	u32 buf_len;
+	u32 opt_len;
+	struct ip_options_rcu *opt = NULL;
+	struct inet_request_sock *req_inet;
+
+	/* We allocate the maximum CIPSO option size here so we are probably
+	 * being a little wasteful, but it makes our life _much_ easier later
+	 * on and after all we are only talking about 40 bytes. */
+	buf_len = CIPSO_V4_OPT_LEN_MAX;
+	buf = kmalloc(buf_len, GFP_ATOMIC);
+	if (buf == NULL) {
+		ret_val = -ENOMEM;
+		goto req_setattr_failure;
+	}
+
+	/* on success buf_len becomes the actual length of the option */
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		goto req_setattr_failure;
+	buf_len = ret_val;
+
+	/* We can't use ip_options_get() directly because it makes a call to
+	 * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+	 * we won't always have CAP_NET_RAW even though we _always_ want to
+	 * set the IPOPT_CIPSO option. */
+	/* round the option length up to a multiple of four bytes; the
+	 * structure is zero-allocated so the padding bytes stay zero */
+	opt_len = (buf_len + 3) & ~3;
+	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+	if (opt == NULL) {
+		ret_val = -ENOMEM;
+		goto req_setattr_failure;
+	}
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	/* the CIPSO option is the first option, right after the IP header */
+	opt->opt.cipso = sizeof(struct iphdr);
+	kfree(buf);
+	buf = NULL;
+
+	req_inet = inet_rsk(req);
+	/* install the new options; from here on 'opt' holds the previous
+	 * options (if any), which are freed through call_rcu() */
+	opt = xchg(&req_inet->opt, opt);
+	if (opt)
+		call_rcu(&opt->rcu, opt_kfree_rcu);
+
+	return 0;
+
+req_setattr_failure:
+	/* either pointer may still be NULL here */
+	kfree(buf);
+	kfree(opt);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure.  Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+{
+	int hdr_delta = 0;
+	struct ip_options_rcu *opt = *opt_ptr;
+
+	if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+		u8 cipso_len;
+		u8 cipso_off;
+		unsigned char *cipso_ptr;
+		int iter;
+		int optlen_new;
+
+		/* other options are present: cut the CIPSO option out of the
+		 * option data and keep the rest */
+		cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+		cipso_ptr = &opt->opt.__data[cipso_off];
+		cipso_len = cipso_ptr[1];
+
+		/* options located after the CIPSO option move up by its
+		 * length */
+		if (opt->opt.srr > opt->opt.cipso)
+			opt->opt.srr -= cipso_len;
+		if (opt->opt.rr > opt->opt.cipso)
+			opt->opt.rr -= cipso_len;
+		if (opt->opt.ts > opt->opt.cipso)
+			opt->opt.ts -= cipso_len;
+		if (opt->opt.router_alert > opt->opt.cipso)
+			opt->opt.router_alert -= cipso_len;
+		opt->opt.cipso = 0;
+
+		memmove(cipso_ptr, cipso_ptr + cipso_len,
+			opt->opt.optlen - cipso_off - cipso_len);
+
+		/* determining the new total option length is tricky because of
+		 * the padding necessary, the only thing i can think to do at
+		 * this point is walk the options one-by-one, skipping the
+		 * padding at the end to determine the actual option size and
+		 * from there we can determine the new total option length */
+		iter = 0;
+		optlen_new = 0;
+		while (iter < opt->opt.optlen) {
+			if (opt->opt.__data[iter] == IPOPT_END) {
+				/* end-of-list is a single byte with no length
+				 * octet; everything after it is padding, so
+				 * stop instead of reading a bogus length */
+				break;
+			} else if (opt->opt.__data[iter] == IPOPT_NOP) {
+				iter++;
+			} else {
+				iter += opt->opt.__data[iter + 1];
+				optlen_new = iter;
+			}
+		}
+		/* report how much shorter the padded option area became */
+		hdr_delta = opt->opt.optlen;
+		opt->opt.optlen = (optlen_new + 3) & ~3;
+		hdr_delta -= opt->opt.optlen;
+	} else {
+		/* only the cipso option was present on the socket so we can
+		 * remove the entire option struct */
+		*opt_ptr = NULL;
+		hdr_delta = opt->opt.optlen;
+		call_rcu(&opt->rcu, opt_kfree_rcu);
+	}
+
+	return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+	struct inet_sock *sk_inet = inet_sk(sk);
+	struct inet_connection_sock *sk_conn;
+	struct ip_options_rcu *opt;
+	int hdr_delta;
+
+	/* nothing to remove without options or without a CIPSO option */
+	opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+	if (opt == NULL)
+		return;
+	if (opt->opt.cipso == 0)
+		return;
+
+	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+	if (!sk_inet->is_icsk || hdr_delta <= 0)
+		return;
+
+	/* the options shrank on a connection socket: update the extension
+	 * header length and resync the MSS */
+	sk_conn = inet_csk(sk);
+	sk_conn->icsk_ext_hdr_len -= hdr_delta;
+	sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @req: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+	struct inet_request_sock *req_inet = inet_rsk(req);
+	struct ip_options_rcu *opt = req_inet->opt;
+
+	/* only act when options exist and include a CIPSO option */
+	if (opt != NULL && opt->opt.cipso != 0)
+		cipso_v4_delopt(&req_inet->opt);
+}
+
+/**
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @cipso and return the security attributes in @secattr.  Returns zero
+ * on success and negative values on failure.
+ *
+ */
+static int cipso_v4_getattr(const unsigned char *cipso,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	const unsigned char *tag = &cipso[6];
+	struct cipso_v4_doi *doi_def;
+	int ret_val = -ENOMSG;
+
+	/* if the cache lookup succeeds there is nothing more to do */
+	if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+		return 0;
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(get_unaligned_be32(&cipso[2]));
+	if (doi_def != NULL) {
+		/* XXX - This code assumes only one tag per CIPSO option
+		 * which isn't really a good assumption to make but since we
+		 * only support the MAC tags right now it is a safe
+		 * assumption. */
+		switch (tag[0]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			ret_val = cipso_v4_parsetag_rbm(doi_def, tag, secattr);
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			ret_val = cipso_v4_parsetag_enum(doi_def, tag, secattr);
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			ret_val = cipso_v4_parsetag_rng(doi_def, tag, secattr);
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			ret_val = cipso_v4_parsetag_loc(doi_def, tag, secattr);
+			break;
+		}
+		/* an unknown tag type leaves ret_val at -ENOMSG */
+		if (ret_val == 0)
+			secattr->type = NETLBL_NLTYPE_CIPSOV4;
+	}
+	rcu_read_unlock();
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr.  This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself.  Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+	const unsigned char *cipso;
+	struct ip_options_rcu *opt;
+	int res = -ENOMSG;
+
+	rcu_read_lock();
+	opt = rcu_dereference(inet_sk(sk)->inet_opt);
+	if (opt && opt->opt.cipso) {
+		/* opt.cipso counts from the start of the IP header while
+		 * __data starts right after it */
+		cipso = opt->opt.__data + opt->opt.cipso - sizeof(struct iphdr);
+		res = cipso_v4_getattr(cipso, secattr);
+	}
+	rcu_read_unlock();
+
+	return res;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+			    const struct cipso_v4_doi *doi_def,
+			    const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct iphdr *iph;
+	struct ip_options *opt = &IPCB(skb)->opt;
+	unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+	u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+	u32 opt_len;
+	int len_delta;
+
+	/* build the option in a local buffer; buf_len becomes its actual
+	 * length and opt_len that length rounded up to four bytes */
+	ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+	if (ret_val < 0)
+		return ret_val;
+	buf_len = ret_val;
+	opt_len = (buf_len + 3) & ~3;
+
+	/* we overwrite any existing options to ensure that we have enough
+	 * room for the CIPSO option, the reason is that we _need_ to guarantee
+	 * that the security label is applied to the packet - we do the same
+	 * thing when using the socket options and it hasn't caused a problem,
+	 * if we need to we can always revisit this choice later */
+
+	/* positive when the new option area is larger than the current one */
+	len_delta = opt_len - opt->optlen;
+	/* if we don't ensure enough headroom we could panic on the skb_push()
+	 * call below so make sure we have enough, we are also "mangling" the
+	 * packet so we should probably do a copy-on-write call anyway */
+	ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+	if (ret_val < 0)
+		return ret_val;
+
+	if (len_delta > 0) {
+		/* we assume that the header + opt->optlen have already been
+		 * "pushed" in ip_options_build() or similar */
+		iph = ip_hdr(skb);
+		skb_push(skb, len_delta);
+		/* move the existing header down by len_delta bytes to open
+		 * up room for the larger option area behind it */
+		memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+		skb_reset_network_header(skb);
+		iph = ip_hdr(skb);
+	} else if (len_delta < 0) {
+		/* the new option area is smaller: fill the old one with
+		 * no-ops before the new option is copied in below.
+		 * NOTE(review): the skb is not shortened in this branch, yet
+		 * ihl is later set from the smaller opt_len - confirm the
+		 * leftover bytes are handled as intended. */
+		iph = ip_hdr(skb);
+		memset(iph + 1, IPOPT_NOP, opt->optlen);
+	} else
+		iph = ip_hdr(skb);
+
+	/* forget any previously parsed options; only CIPSO remains */
+	if (opt->optlen > 0)
+		memset(opt, 0, sizeof(*opt));
+	opt->optlen = opt_len;
+	opt->cipso = sizeof(struct iphdr);
+	opt->is_changed = 1;
+
+	/* we have to do the following because we are being called from a
+	 * netfilter hook which means the packet already has had the header
+	 * fields populated and the checksum calculated - yes this means we
+	 * are doing more work than needed but we do it to keep the core
+	 * stack clean and tidy */
+	memcpy(iph + 1, buf, buf_len);
+	/* zero the padding between the option and the rounded-up length */
+	if (opt_len > buf_len)
+		memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+	if (len_delta != 0) {
+		iph->ihl = 5 + (opt_len >> 2);
+		iph->tot_len = htons(skb->len);
+	}
+	/* the header changed so the checksum has to be recomputed */
+	ip_send_check(iph);
+
+	return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+	struct ip_options *opt = &IPCB(skb)->opt;
+	unsigned char *cipso;
+	struct iphdr *iph;
+	int err;
+
+	/* nothing to do when the packet carries no CIPSO option */
+	if (opt->cipso == 0)
+		return 0;
+
+	/* since we are changing the packet we should make a copy */
+	err = skb_cow(skb, skb_headroom(skb));
+	if (err < 0)
+		return err;
+
+	/* overwrite the whole option with no-op options so the size of the
+	 * packet stays the same; only the checksum needs to be redone */
+	iph = ip_hdr(skb);
+	cipso = (unsigned char *)iph + opt->cipso;
+	memset(cipso, IPOPT_NOOP, cipso[1]);
+	opt->cipso = 0;
+	opt->is_changed = 1;
+
+	ip_send_check(iph);
+
+	return 0;
+}
+
+/**
+ * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse the given packet's CIPSO option and return the security attributes.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	/* locate the CIPSO option in the packet and parse it */
+	const unsigned char *cipso = CIPSO_V4_OPTPTR(skb);
+
+	return cipso_v4_getattr(cipso, secattr);
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * cipso_v4_init - Initialize the CIPSO module
+ *
+ * Description:
+ * Initialize the CIPSO module and prepare it for use.  Returns zero on success
+ * and negative values on failure.
+ *
+ */
+static int __init cipso_v4_init(void)
+{
+	int err = cipso_v4_cache_init();
+
+	/* a failed cache initialization is treated as fatal */
+	if (err != 0)
+		panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
+		      err);
+
+	return 0;
+}
+
+subsys_initcall(cipso_v4_init);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 00000000..424fafbc
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,87 @@
+/*
+ *	common UDP/RAW code
+ *	Linux INET implementation
+ *
+ * Authors:
+ * 	Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ *
+ * 	This program is free software; you can redistribute it and/or
+ * 	modify it under the terms of the GNU General Public License
+ * 	as published by the Free Software Foundation; either version
+ * 	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 saddr;
+	int oif;
+	int err;
+	/* Route to the peer in @uaddr and record it on the socket (daddr,
+	 * dport, cached route).  Returns 0 or a negative errno. */
+	if (addr_len < sizeof(*usin))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	sk_dst_reset(sk);	/* NOTE(review): runs before lock_sock() - confirm */
+
+	lock_sock(sk);
+
+	oif = sk->sk_bound_dev_if;
+	saddr = inet->inet_saddr;
+	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
+		if (!oif)	/* unbound: use the socket's multicast settings */
+			oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
+			      RT_CONN_FLAGS(sk), oif,
+			      sk->sk_protocol,
+			      inet->inet_sport, usin->sin_port, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		if (err == -ENETUNREACH)
+			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+	/* broadcast routes are only allowed on SOCK_BROADCAST sockets */
+	if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+		ip_rt_put(rt);
+		err = -EACCES;
+		goto out;
+	}
+	if (!inet->inet_saddr)
+		inet->inet_saddr = fl4->saddr;	/* Update source address */
+	if (!inet->inet_rcv_saddr) {
+		inet->inet_rcv_saddr = fl4->saddr;
+		if (sk->sk_prot->rehash)	/* optional per-protocol hook */
+			sk->sk_prot->rehash(sk);
+	}
+	inet->inet_daddr = fl4->daddr;
+	inet->inet_dport = usin->sin_port;
+	sk->sk_state = TCP_ESTABLISHED;
+	inet->inet_id = jiffies;
+
+	sk_dst_set(sk, &rt->dst);	/* cache the route on the socket */
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 00000000..c48323ad
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1851 @@
+/*
+ *	NET3	IP device support routines.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Derived from the IP parts of dev.c 1.0.19
+ * 		Authors:	Ross Biro
+ *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *				Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ *	Additional Authors:
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	Changes:
+ *		Alexey Kuznetsov:	pa_* fields are replaced with ifaddr
+ *					lists.
+ *		Cyrus Durgin:		updated for kmod
+ *		Matthias Andree:	in devinet_ioctl, compare label and
+ *					address (4.4BSD alias style support),
+ *					fall back to comparing just the label
+ *					if no match found.
+ */
+
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_addr.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+
+#include "fib_lookup.h"
+
+static struct ipv4_devconf ipv4_devconf = {
+	.data = {	/* slot for IPV4_DEVCONF_x is x - 1; unlisted slots are 0 */
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+	},
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+	.data = {	/* slot for IPV4_DEVCONF_x is x - 1; unlisted slots are 0 */
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,	/* differs from ipv4_devconf */
+	},
+};
+
+#define IPV4_DEVCONF_DFLT(net, attr) \
+	IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr)
+
+static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {	/* for nlmsg_parse() */
+	[IFA_LOCAL]     	= { .type = NLA_U32 },
+	[IFA_ADDRESS]   	= { .type = NLA_U32 },
+	[IFA_BROADCAST] 	= { .type = NLA_U32 },
+	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+};
+
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value.  So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE	256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+	u32 mixed = hash_ptr(net, 8) ^ (__force u32) addr;
+	u32 folded = mixed ^ (mixed >> 8) ^ (mixed >> 16) ^ (mixed >> 24);
+
+	return folded & (IN4_ADDR_HSIZE - 1);	/* fold bytes, keep bucket bits */
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	unsigned int slot = inet_addr_hash(net, ifa->ifa_local);
+	struct hlist_head *bucket = &inet_addr_lst[slot];
+	spin_lock(&inet_addr_hash_lock);	/* serializes writers only */
+	hlist_add_head_rcu(&ifa->hash, bucket);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{	/* take @ifa off its inet_addr_lst chain under the hash spinlock */
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(&ifa->hash);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	struct hlist_head *chain = &inet_addr_lst[inet_addr_hash(net, addr)];
+	struct net_device *found = NULL;
+	struct hlist_node *pos;
+	struct in_ifaddr *ifa;
+
+	rcu_read_lock();
+	/* first choice: an exact local address hashed for this namespace */
+	hlist_for_each_entry_rcu(ifa, pos, chain, hash) {
+		struct net_device *dev = ifa->ifa_dev->dev;
+
+		if (net_eq(dev_net(dev), net) && ifa->ifa_local == addr) {
+			found = dev;
+			break;
+		}
+	}
+
+	if (found == NULL) {
+		struct fib_table *local = fib_get_table(net, RT_TABLE_LOCAL);
+		struct fib_result res = { 0 };
+		struct flowi4 fl4 = { .daddr = addr };
+
+		/* Not hashed: fall back to the FIB local table so that
+		 * communication over loopback subnets works.
+		 */
+		if (local != NULL &&
+		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+		    res.type == RTN_LOCAL)
+			found = FIB_RES_DEV(res);
+	}
+
+	if (devref && found != NULL)
+		dev_hold(found);
+	rcu_read_unlock();
+	return found;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
+static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
+
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+			 int destroy);
+#ifdef CONFIG_SYSCTL
+static void devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static inline void devinet_sysctl_register(struct in_device *idev)
+{
+}
+static inline void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
+#endif
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{	/* zeroed allocation; callers in this file check the result for NULL */
+	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+	struct in_ifaddr *old = container_of(head, struct in_ifaddr, rcu_head);
+	if (old->ifa_dev != NULL)	/* drop the in_device reference, if any */
+		in_dev_put(old->ifa_dev);
+	kfree(old);
+}
+
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
+{	/* deferred free: inet_rcu_free_ifa() runs as the call_rcu() callback */
+	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+	struct net_device *netdev = idev->dev;
+	/* warn if an address or multicast entry is still attached */
+	WARN_ON(idev->ifa_list);
+	WARN_ON(idev->mc_list);
+#ifdef NET_REFCNT_DEBUG
+	printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
+	       idev, netdev ? netdev->name : "NIL");
+#endif
+	dev_put(netdev);
+	if (idev->dead)
+		kfree(idev);
+	else
+		pr_err("Freeing alive in_device %p\n", idev);
+}
+EXPORT_SYMBOL(in_dev_finish_destroy);
+
+static struct in_device *inetdev_init(struct net_device *dev)
+{
+	struct in_device *idev;
+
+	ASSERT_RTNL();
+
+	idev = kzalloc(sizeof(*idev), GFP_KERNEL);
+	if (idev == NULL)
+		return NULL;
+
+	/* start from the namespace defaults, with no sysctl table attached */
+	memcpy(&idev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(idev->cnf));
+	idev->cnf.sysctl = NULL;
+	idev->dev = dev;
+
+	idev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+	if (idev->arp_parms == NULL) {
+		kfree(idev);
+		return NULL;
+	}
+
+	if (IPV4_DEVCONF(idev->cnf, FORWARDING))
+		dev_disable_lro(dev);
+
+	dev_hold(dev);		/* reference for idev->dev */
+	in_dev_hold(idev);	/* reference for dev->ip_ptr, set below */
+
+	devinet_sysctl_register(idev);
+	ip_mc_init_dev(idev);
+	if (dev->flags & IFF_UP)
+		ip_mc_up(idev);
+
+	/* we can receive as soon as ip_ptr is set, so publish it last */
+	rcu_assign_pointer(dev->ip_ptr, idev);
+
+	return idev;
+}
+
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+	/* RCU callback queued by inetdev_destroy(): drop one reference */
+	in_dev_put(container_of(head, struct in_device, rcu_head));
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *head;
+
+	ASSERT_RTNL();
+
+	/* flag the in_device as dead before tearing anything down */
+	in_dev->dead = 1;
+
+	ip_mc_destroy_dev(in_dev);
+
+	/* unlink and free the addresses one by one, always from the head */
+	for (head = in_dev->ifa_list; head; head = in_dev->ifa_list) {
+		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+		inet_free_ifa(head);
+	}
+
+	rcu_assign_pointer(dev->ip_ptr, NULL);
+
+	devinet_sysctl_unregister(in_dev);
+	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+	arp_ifdown(dev);
+
+	call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
+{
+	int onlink = 0;
+
+	rcu_read_lock();
+	for_primary_ifa(in_dev) {
+		if (inet_ifa_match(a, ifa) && (!b || inet_ifa_match(b, ifa))) {
+			onlink = 1;	/* a (and b, when non-zero) match this ifa */
+			break;
+		}
+	} endfor_ifa(in_dev);
+	rcu_read_unlock();
+	return onlink;
+}
+
+static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+			 int destroy, struct nlmsghdr *nlh, u32 pid)
+{
+	struct in_ifaddr *promote = NULL;
+	struct in_ifaddr *ifa, *ifa1 = *ifap;
+	struct in_ifaddr *last_prim = in_dev->ifa_list;
+	struct in_ifaddr *prev_prom = NULL;
+	int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
+	/* Unlink *ifap from in_dev, announce it, and free it if @destroy. */
+	ASSERT_RTNL();
+
+	/* 1. Deleting a primary ifaddr forces deletion of all its
+	 * secondaries unless alias promotion is set
+	 */
+
+	if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+		struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+		while ((ifa = *ifap1) != NULL) {
+			if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+			    ifa1->ifa_scope <= ifa->ifa_scope)
+				last_prim = ifa;
+
+			if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+			    ifa1->ifa_mask != ifa->ifa_mask ||
+			    !inet_ifa_match(ifa1->ifa_address, ifa)) {
+				ifap1 = &ifa->ifa_next;	/* not one of ours: skip */
+				prev_prom = ifa;
+				continue;
+			}
+
+			if (!do_promote) {
+				inet_hash_remove(ifa);
+				*ifap1 = ifa->ifa_next;
+
+				rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+				blocking_notifier_call_chain(&inetaddr_chain,
+						NETDEV_DOWN, ifa);
+				inet_free_ifa(ifa);
+			} else {
+				promote = ifa;	/* first matching secondary */
+				break;
+			}
+		}
+	}
+
+	/* On promotion all secondaries from subnet are changing
+	 * the primary IP, we must remove all their routes silently
+	 * and later to add them back with new prefsrc. Do this
+	 * while all addresses are on the device list.
+	 */
+	for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa))
+			fib_del_ifaddr(ifa, ifa1);
+	}
+
+	/* 2. Unlink it */
+
+	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
+
+	/* 3. Announce address deletion */
+
+	/* Send message first, then call notifier.
+	   At first sight, FIB update triggered by notifier
+	   will refer to already deleted ifaddr, that could confuse
+	   netlink listeners. It is not true: look, gated sees
+	   that route deleted and if it still thinks that ifaddr
+	   is valid, it will try to restore deleted routes... Grr.
+	   So that, this order is correct.
+	 */
+	rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+
+	if (promote) {
+		struct in_ifaddr *next_sec = promote->ifa_next;
+
+		if (prev_prom) {	/* relink promote right after last_prim */
+			prev_prom->ifa_next = promote->ifa_next;
+			promote->ifa_next = last_prim->ifa_next;
+			last_prim->ifa_next = promote;
+		}
+
+		promote->ifa_flags &= ~IFA_F_SECONDARY;
+		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+		blocking_notifier_call_chain(&inetaddr_chain,
+				NETDEV_UP, promote);
+		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+			if (ifa1->ifa_mask != ifa->ifa_mask ||
+			    !inet_ifa_match(ifa1->ifa_address, ifa))
+					continue;
+			fib_add_ifaddr(ifa);	/* re-add the routes dropped above */
+		}
+
+	}
+	if (destroy)
+		inet_free_ifa(ifa1);
+}
+
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+			 int destroy)
+{	/* __inet_del_ifa() with no originating netlink request (NULL, pid 0) */
+	__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
+}
+
+static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+			     u32 pid)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct in_ifaddr *ifa1, **ifap, **last_primary;
+	/* Link @ifa into its in_device list and announce it; frees it on error. */
+	ASSERT_RTNL();
+
+	if (!ifa->ifa_local) {	/* no local address: drop it, still return 0 */
+		inet_free_ifa(ifa);
+		return 0;
+	}
+	/* assume primary until an existing address in the same subnet is seen */
+	ifa->ifa_flags &= ~IFA_F_SECONDARY;
+	last_primary = &in_dev->ifa_list;
+
+	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+	     ifap = &ifa1->ifa_next) {
+		if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+		    ifa->ifa_scope <= ifa1->ifa_scope)
+			last_primary = &ifa1->ifa_next;
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa)) {
+			if (ifa1->ifa_local == ifa->ifa_local) {
+				inet_free_ifa(ifa);
+				return -EEXIST;
+			}
+			if (ifa1->ifa_scope != ifa->ifa_scope) {
+				inet_free_ifa(ifa);
+				return -EINVAL;
+			}
+			ifa->ifa_flags |= IFA_F_SECONDARY;
+		}
+	}
+	/* secondaries go at the list tail (ifap), primaries at last_primary */
+	if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+		net_srandom(ifa->ifa_local);
+		ifap = last_primary;
+	}
+
+	ifa->ifa_next = *ifap;
+	*ifap = ifa;
+
+	inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+	/* Send message first, then call notifier.
+	   Notifier will trigger FIB update, so that
+	   listeners of netlink will know about new ifaddr */
+	rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+	return 0;
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{	/* __inet_insert_ifa() with no originating netlink request */
+	return __inet_insert_ifa(ifa, NULL, 0);
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+	struct in_device *idev = __in_dev_get_rtnl(dev);
+
+	ASSERT_RTNL();
+
+	if (idev == NULL) {	/* device has no in_device: give the address up */
+		inet_free_ifa(ifa);
+		return -ENOBUFS;
+	}
+	ipv4_devconf_setall(idev);
+	if (idev != ifa->ifa_dev) {
+		WARN_ON(ifa->ifa_dev);	/* expected to be unattached so far */
+		in_dev_hold(idev);
+		ifa->ifa_dev = idev;
+	}
+	if (ipv4_is_loopback(ifa->ifa_local))
+		ifa->ifa_scope = RT_SCOPE_HOST;
+	return inet_insert_ifa(ifa);
+}
+
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
+{
+	struct in_device *idev;
+	struct net_device *netdev;
+
+	rcu_read_lock();
+	netdev = dev_get_by_index_rcu(net, ifindex);
+	/* no reference is taken here; the caller's RCU/RTNL keeps it valid */
+	idev = netdev ? rcu_dereference_rtnl(netdev->ip_ptr) : NULL;
+	rcu_read_unlock();
+	return idev;
+}
+EXPORT_SYMBOL(inetdev_by_index);
+
+/* Called only from RTNL semaphored context. No locks. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
+				    __be32 mask)
+{
+	ASSERT_RTNL();
+	for_primary_ifa(in_dev) {
+		if (ifa->ifa_mask != mask || !inet_ifa_match(prefix, ifa))
+			continue;
+		return ifa;	/* first primary with this mask matching @prefix */
+	} endfor_ifa(in_dev);
+	return NULL;
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct in_ifaddr *cur, **link;
+	struct nlattr *tb[IFA_MAX+1];
+	struct ifaddrmsg *ifm;
+	struct in_device *idev;
+	int rc;
+
+	ASSERT_RTNL();
+
+	rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+	if (rc < 0)
+		return rc;
+
+	ifm = nlmsg_data(nlh);
+	idev = inetdev_by_index(net, ifm->ifa_index);
+	if (idev == NULL)
+		return -ENODEV;
+
+	/* Delete the first address that satisfies every attribute supplied;
+	 * attributes absent from the request are not compared.
+	 */
+	for (link = &idev->ifa_list; (cur = *link) != NULL;
+	     link = &cur->ifa_next) {
+		if (tb[IFA_LOCAL] &&
+		    nla_get_be32(tb[IFA_LOCAL]) != cur->ifa_local)
+			continue;
+
+		if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], cur->ifa_label))
+			continue;
+
+		if (tb[IFA_ADDRESS] &&
+		    (ifm->ifa_prefixlen != cur->ifa_prefixlen ||
+		     !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), cur)))
+			continue;
+
+		/* unlink it, announce the deletion and free it */
+		__inet_del_ifa(idev, link, 1, nlh, NETLINK_CB(skb).pid);
+		return 0;
+	}
+
+	return -EADDRNOTAVAIL;
+}
+
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[IFA_MAX+1];
+	struct in_device *idev;
+	struct net_device *dev;
+	struct ifaddrmsg *ifm;
+	struct in_ifaddr *ifa;
+	int rc;
+
+	/* Build a new in_ifaddr from a netlink address request.  Returns the
+	 * address (holding a reference on its in_device) or an ERR_PTR().
+	 */
+	rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* a local address and a prefix length of at most 32 are mandatory */
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifa_prefixlen > 32 || tb[IFA_LOCAL] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	if (dev == NULL)
+		return ERR_PTR(-ENODEV);
+
+	idev = __in_dev_get_rtnl(dev);
+	if (idev == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	/*
+	 * On allocation failure the in_device is simply left alive: it
+	 * stays assigned to its device and is destroyed together with it.
+	 */
+	ifa = inet_alloc_ifa();
+	if (ifa == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	ipv4_devconf_setall(idev);
+	in_dev_hold(idev);	/* reference stored in ifa->ifa_dev below */
+
+	/* without IFA_ADDRESS the address attribute defaults to IFA_LOCAL */
+	if (tb[IFA_ADDRESS] == NULL)
+		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+	INIT_HLIST_NODE(&ifa->hash);
+	ifa->ifa_dev = idev;
+	ifa->ifa_flags = ifm->ifa_flags;
+	ifa->ifa_scope = ifm->ifa_scope;
+	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+
+	ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]);
+	ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]);
+
+	if (tb[IFA_BROADCAST])
+		ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
+
+	/* the label defaults to the device name */
+	if (tb[IFA_LABEL] == NULL)
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+	else
+		nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+
+	return ifa;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct in_ifaddr *fresh;
+
+	ASSERT_RTNL();
+
+	/* turn the request into an in_ifaddr, then link and announce it */
+	fresh = rtm_to_ifaddr(sock_net(skb->sk), nlh);
+	if (!IS_ERR(fresh))
+		return __inet_insert_ifa(fresh, nlh, NETLINK_CB(skb).pid);
+
+	return PTR_ERR(fresh);
+}
+
+/*
+ *	Determine a default network mask, based on the IP address.
+ */
+
+static inline int inet_abc_len(__be32 addr)
+{
+	__u32 host_order;
+
+	/* the zero network gets prefix length 0 */
+	if (ipv4_is_zeronet(addr))
+		return 0;
+
+	host_order = ntohl(addr);
+	if (IN_CLASSA(host_order))
+		return 8;
+	if (IN_CLASSB(host_order))
+		return 16;
+	if (IN_CLASSC(host_order))
+		return 24;
+
+	/* Something else, probably a multicast. */
+	return -1;
+}
+
+
+int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct ifreq ifr;
+	struct sockaddr_in sin_orig;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+	struct in_device *in_dev;
+	struct in_ifaddr **ifap = NULL;
+	struct in_ifaddr *ifa = NULL;
+	struct net_device *dev;
+	char *colon;
+	int ret = -EFAULT;
+	int tryaddrmatch = 0;
+
+	/* Handle the SIOC[GS]IF* address ioctls and SIOCKILLADDR; returns 0
+	 * or a negative errno.  Start by fetching the caller's ifreq.
+	 */
+
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		goto out;
+	ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+	/* save original address for comparison */
+	memcpy(&sin_orig, sin, sizeof(*sin));
+
+	colon = strchr(ifr.ifr_name, ':');
+	if (colon)
+		*colon = 0;
+
+	dev_load(net, ifr.ifr_name);
+
+	switch (cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		/* Note that these ioctls will not sleep,
+		   so that we do not impose a lock.
+		   One day we will be forced to put shlock here (I mean SMP)
+		 */
+		tryaddrmatch = (sin_orig.sin_family == AF_INET);
+		memset(sin, 0, sizeof(*sin));
+		sin->sin_family = AF_INET;
+		break;
+
+	case SIOCSIFFLAGS:
+		ret = -EACCES;
+		if (!capable(CAP_NET_ADMIN))
+			goto out;
+		break;
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+	case SIOCKILLADDR:	/* Nuke all sockets on this address */
+		ret = -EACCES;
+		if (!capable(CAP_NET_ADMIN))
+			goto out;
+		ret = -EINVAL;
+		if (sin->sin_family != AF_INET)
+			goto out;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rtnl_lock();
+
+	ret = -ENODEV;
+	dev = __dev_get_by_name(net, ifr.ifr_name);
+	if (!dev)
+		goto done;
+
+	if (colon)
+		*colon = ':';
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
+		if (tryaddrmatch) {
+			/* Matthias Andree */
+			/* compare label and address (4.4BSD style) */
+			/* note: we only do this for a limited set of ioctls
+			   and only if the original address family was AF_INET.
+			   This is checked above. */
+			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			     ifap = &ifa->ifa_next) {
+				if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+				    sin_orig.sin_addr.s_addr ==
+							ifa->ifa_local) {
+					break; /* found */
+				}
+			}
+		}
+		/* we didn't get a match, maybe the application is
+		   4.3BSD-style and passed in junk so we fall back to
+		   comparing just the label */
+		if (!ifa) {
+			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			     ifap = &ifa->ifa_next)
+				if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+					break;
+		}
+	}
+
+	ret = -EADDRNOTAVAIL;
+	if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS
+	    && cmd != SIOCKILLADDR)
+		goto done;
+
+	switch (cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+		sin->sin_addr.s_addr = ifa->ifa_local;
+		goto rarok;
+
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+		sin->sin_addr.s_addr = ifa->ifa_broadcast;
+		goto rarok;
+
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+		sin->sin_addr.s_addr = ifa->ifa_address;
+		goto rarok;
+
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		sin->sin_addr.s_addr = ifa->ifa_mask;
+		goto rarok;
+
+	case SIOCSIFFLAGS:
+		if (colon) {
+			ret = -EADDRNOTAVAIL;
+			if (!ifa)
+				break;
+			ret = 0;
+			if (!(ifr.ifr_flags & IFF_UP))
+				inet_del_ifa(in_dev, ifap, 1);
+			break;
+		}
+		ret = dev_change_flags(dev, ifr.ifr_flags);
+		break;
+
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+
+		if (!ifa) {
+			ret = -ENOBUFS;
+			ifa = inet_alloc_ifa();
+			if (!ifa)	/* must be checked before ifa->hash is used */
+				break;
+			INIT_HLIST_NODE(&ifa->hash);
+			if (colon)
+				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+			else
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+		} else {
+			ret = 0;
+			if (ifa->ifa_local == sin->sin_addr.s_addr)
+				break;
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = 0;
+			ifa->ifa_scope = 0;
+		}
+
+		ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+		if (!(dev->flags & IFF_POINTOPOINT)) {
+			ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+			ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+			if ((dev->flags & IFF_BROADCAST) &&
+			    ifa->ifa_prefixlen < 31)
+				ifa->ifa_broadcast = ifa->ifa_address |
+						     ~ifa->ifa_mask;
+		} else {
+			ifa->ifa_prefixlen = 32;
+			ifa->ifa_mask = inet_make_mask(32);
+		}
+		ret = inet_set_ifa(dev, ifa);
+		break;
+
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+		ret = 0;
+		if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = sin->sin_addr.s_addr;
+			inet_insert_ifa(ifa);
+		}
+		break;
+
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+		ret = 0;
+		if (ifa->ifa_address == sin->sin_addr.s_addr)
+			break;
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+		ret = 0;
+		inet_del_ifa(in_dev, ifap, 0);
+		ifa->ifa_address = sin->sin_addr.s_addr;
+		inet_insert_ifa(ifa);
+		break;
+
+	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+
+		/*
+		 *	The mask we set must be legal.
+		 */
+		ret = -EINVAL;
+		if (bad_mask(sin->sin_addr.s_addr, 0))
+			break;
+		ret = 0;
+		if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+			__be32 old_mask = ifa->ifa_mask;
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_mask = sin->sin_addr.s_addr;
+			ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+			/* See if current broadcast address matches
+			 * with current netmask, then recalculate
+			 * the broadcast address. Otherwise it's a
+			 * funny address, so don't touch it since
+			 * the user seems to know what (s)he's doing...
+			 */
+			if ((dev->flags & IFF_BROADCAST) &&
+			    (ifa->ifa_prefixlen < 31) &&
+			    (ifa->ifa_broadcast ==
+			     (ifa->ifa_local|~old_mask))) {
+				ifa->ifa_broadcast = (ifa->ifa_local |
+						      ~sin->sin_addr.s_addr);
+			}
+			inet_insert_ifa(ifa);
+		}
+		break;
+	case SIOCKILLADDR:	/* Nuke all connections on this address */
+		ret = tcp_nuke_addr(net, (struct sockaddr *) sin);
+		break;
+	}
+done:
+	rtnl_unlock();
+out:
+	return ret;
+rarok:
+	rtnl_unlock();
+	ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+	goto out;
+}
+
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_ifaddr *ifa;
+	struct ifreq ifr;
+	int done = 0;
+
+	if (!in_dev)
+		goto out;
+
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+		if (!buf) {	/* size query: only count what would be written */
+			done += sizeof(ifr);
+			continue;
+		}
+		if (len < (int) sizeof(ifr))
+			break;
+		memset(&ifr, 0, sizeof(struct ifreq));
+		/* ifa_label is an array embedded in the in_ifaddr (this file
+		 * fills it with IFNAMSIZ-byte copies), so it is never NULL
+		 * and a fallback to dev->name would be dead code. */
+		strcpy(ifr.ifr_name, ifa->ifa_label);
+
+		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+								ifa->ifa_local;
+
+		if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+			done = -EFAULT;
+			break;
+		}
+		buf  += sizeof(struct ifreq);
+		len  -= sizeof(struct ifreq);
+		done += sizeof(struct ifreq);
+	}
+out:
+	return done;	/* bytes written/needed, or -EFAULT */
+}
+
+__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
+{
+	__be32 addr = 0;
+	struct in_device *in_dev;
+	struct net *net = dev_net(dev);
+	/* Pick a primary local address for @dst within @scope; 0 if none. */
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		goto no_in_dev;
+
+	for_primary_ifa(in_dev) {
+		if (ifa->ifa_scope > scope)
+			continue;
+		if (!dst || inet_ifa_match(dst, ifa)) {
+			addr = ifa->ifa_local;
+			break;
+		}
+		if (!addr)	/* remember the first in-scope address as fallback */
+			addr = ifa->ifa_local;
+	} endfor_ifa(in_dev);
+
+	if (addr)
+		goto out_unlock;
+no_in_dev:
+
+	/* Non-loopback addresses configured on loopback should be preferred
+	   in this case. It is important that lo is the first interface
+	   in dev_base list.
+	 */
+	for_each_netdev_rcu(net, dev) {
+		in_dev = __in_dev_get_rcu(dev);
+		if (!in_dev)
+			continue;
+
+		for_primary_ifa(in_dev) {
+			if (ifa->ifa_scope != RT_SCOPE_LINK &&
+			    ifa->ifa_scope <= scope) {
+				addr = ifa->ifa_local;
+				goto out_unlock;
+			}
+		} endfor_ifa(in_dev);
+	}
+out_unlock:
+	rcu_read_unlock();
+	return addr;
+}
+EXPORT_SYMBOL(inet_select_addr);
+
+static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
+			      __be32 local, int scope)
+{
+	int same = 0;
+	__be32 addr = 0;
+	/* Returns a local address of in_dev fitting dst/local/scope, else 0. */
+	for_ifa(in_dev) {
+		if (!addr &&
+		    (local == ifa->ifa_local || !local) &&
+		    ifa->ifa_scope <= scope) {
+			addr = ifa->ifa_local;	/* candidate local address */
+			if (same)
+				break;
+		}
+		if (!same) {
+			same = (!local || inet_ifa_match(local, ifa)) &&
+				(!dst || inet_ifa_match(dst, ifa));
+			if (same && addr) {
+				if (local || !dst)
+					break;
+				/* Is the selected addr into dst subnet? */
+				if (inet_ifa_match(addr, ifa))
+					break;
+				/* No, then can we use new local src? */
+				if (ifa->ifa_scope <= scope) {
+					addr = ifa->ifa_local;
+					break;
+				}
+				/* search for large dst subnet for addr */
+				same = 0;
+			}
+		}
+	} endfor_ifa(in_dev);
+
+	return same ? addr : 0;	/* a candidate counts only with a subnet match */
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - in_dev: only on this interface, 0=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ */
+__be32 inet_confirm_addr(struct in_device *in_dev,
+			 __be32 dst, __be32 local, int scope)
+{
+	struct net_device *netdev;
+	__be32 confirmed = 0;
+	struct net *net;
+
+	if (scope != RT_SCOPE_LINK)
+		return confirm_addr_indev(in_dev, dst, local, scope);
+
+	/* link scope: try every device of the namespace in turn */
+	net = dev_net(in_dev->dev);
+	rcu_read_lock();
+	for_each_netdev_rcu(net, netdev) {
+		in_dev = __in_dev_get_rcu(netdev);
+		if (in_dev == NULL)
+			continue;
+		confirmed = confirm_addr_indev(in_dev, dst, local, scope);
+		if (confirmed)
+			break;
+	}
+	rcu_read_unlock();
+	return confirmed;
+}
+
+/*
+ *	Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{	/* add @nb to the chain notified with NETDEV_UP/NETDEV_DOWN per address */
+	return blocking_notifier_chain_register(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_notifier);
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{	/* remove @nb from the address notifier chain */
+	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
+*/
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+	struct in_ifaddr *ifa;
+	int named = 0;
+
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+		char old[IFNAMSIZ], *dot;
+
+		memcpy(old, ifa->ifa_label, IFNAMSIZ);
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+		if (named++ == 0)	/* first address gets the bare device name */
+			goto skip;
+		dot = strchr(old, ':');	/* reuse the old ":alias" suffix if any */
+		if (dot == NULL) {
+			sprintf(old, ":%d", named);	/* else number it */
+			dot = old;
+		}
+		if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
+			strcat(ifa->ifa_label, dot);
+		else	/* too long: overwrite the name's tail so the suffix fits */
+			strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
+skip:
+		rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+	}
+}
+
+static inline bool inetdev_valid_mtu(unsigned mtu)
+{	/* 68 octets: the smallest datagram RFC 791 requires every link to carry */
+	return mtu >= 68;
+}
+
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+					struct in_device *in_dev)
+{
+	struct in_ifaddr *ifa = in_dev->ifa_list;
+
+	/* one ARP request per address, passing it as both IP arguments */
+	while (ifa != NULL) {
+		__be32 self = ifa->ifa_local;
+
+		arp_send(ARPOP_REQUEST, ETH_P_ARP, self, dev, self, NULL,
+			 dev->dev_addr, NULL);
+		ifa = ifa->ifa_next;
+	}
+}
+
+/* Called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+			 void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	/* netdevice notifier: keep the device's in_device in step with @event */
+	ASSERT_RTNL();
+
+	if (!in_dev) {	/* no IPv4 state yet: only these two events create it */
+		if (event == NETDEV_REGISTER) {
+			in_dev = inetdev_init(dev);
+			if (!in_dev)
+				return notifier_from_errno(-ENOMEM);
+			if (dev->flags & IFF_LOOPBACK) {
+				IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
+				IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
+			}
+		} else if (event == NETDEV_CHANGEMTU) {
+			/* Re-enabling IP */
+			if (inetdev_valid_mtu(dev->mtu))
+				in_dev = inetdev_init(dev);
+		}
+		goto out;
+	}
+
+	switch (event) {
+	case NETDEV_REGISTER:	/* in_dev already set on register: unexpected */
+		printk(KERN_DEBUG "inetdev_event: bug\n");
+		rcu_assign_pointer(dev->ip_ptr, NULL);
+		break;
+	case NETDEV_UP:
+		if (!inetdev_valid_mtu(dev->mtu))
+			break;
+		if (dev->flags & IFF_LOOPBACK) {
+			struct in_ifaddr *ifa = inet_alloc_ifa();
+
+			if (ifa) {	/* give loopback INADDR_LOOPBACK/8, host scope */
+				INIT_HLIST_NODE(&ifa->hash);
+				ifa->ifa_local =
+				  ifa->ifa_address = htonl(INADDR_LOOPBACK);
+				ifa->ifa_prefixlen = 8;
+				ifa->ifa_mask = inet_make_mask(8);
+				in_dev_hold(in_dev);
+				ifa->ifa_dev = in_dev;
+				ifa->ifa_scope = RT_SCOPE_HOST;
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+				inet_insert_ifa(ifa);
+			}
+		}
+		ip_mc_up(in_dev);
+		/* fall through */
+	case NETDEV_CHANGEADDR:
+		if (!IN_DEV_ARP_NOTIFY(in_dev))
+			break;
+		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
+		/* Send gratuitous ARP to notify of link change */
+		inetdev_send_gratuitous_arp(dev, in_dev);
+		break;
+	case NETDEV_DOWN:
+		ip_mc_down(in_dev);
+		break;
+	case NETDEV_PRE_TYPE_CHANGE:
+		ip_mc_unmap(in_dev);
+		break;
+	case NETDEV_POST_TYPE_CHANGE:
+		ip_mc_remap(in_dev);
+		break;
+	case NETDEV_CHANGEMTU:
+		if (inetdev_valid_mtu(dev->mtu))
+			break;
+		/* disable IP when MTU is not enough; fall through */
+	case NETDEV_UNREGISTER:
+		inetdev_destroy(in_dev);
+		break;
+	case NETDEV_CHANGENAME:
+		/* Do not notify about label change, this event is
+		 * not interesting to applications using netlink.
+		 */
+		inetdev_changename(dev, in_dev);
+
+		devinet_sysctl_unregister(in_dev);
+		devinet_sysctl_register(in_dev);
+		break;
+	}
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+	.notifier_call = inetdev_event,	/* registered in devinet_init() */
+};
+
+static inline size_t inet_nlmsg_size(void)
+{
+	size_t len = NLMSG_ALIGN(sizeof(struct ifaddrmsg));
+	len += nla_total_size(4);		/* IFA_ADDRESS */
+	len += nla_total_size(4);		/* IFA_LOCAL */
+	len += nla_total_size(4);		/* IFA_BROADCAST */
+	return len + nla_total_size(IFNAMSIZ);	/* IFA_LABEL */
+}
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+			    u32 pid, u32 seq, int event, unsigned int flags)
+{
+	struct nlmsghdr *msg = nlmsg_put(skb, pid, seq, event,
+					 sizeof(struct ifaddrmsg), flags);
+	struct ifaddrmsg *hdr;
+
+	if (!msg)
+		return -EMSGSIZE;
+
+	/* Fixed header first; IFA_F_PERMANENT is always reported. */
+	hdr = nlmsg_data(msg);
+	hdr->ifa_index = ifa->ifa_dev->dev->ifindex;
+	hdr->ifa_scope = ifa->ifa_scope;
+	hdr->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT;
+	hdr->ifa_prefixlen = ifa->ifa_prefixlen;
+	hdr->ifa_family = AF_INET;
+
+	/* Optional attributes: only non-zero / non-empty fields are sent. */
+	if (ifa->ifa_address)
+		NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
+	if (ifa->ifa_local)
+		NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
+	if (ifa->ifa_broadcast)
+		NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
+	if (ifa->ifa_label[0])
+		NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+
+	return nlmsg_end(skb, msg);
+
+nla_put_failure:
+	/* Target of the NLA_PUT_* macros when an attribute does not fit. */
+	nlmsg_cancel(skb, msg);
+	return -EMSGSIZE;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;		/* hash bucket: current / resume point */
+	int idx, s_idx;		/* device index in bucket: current / resume */
+	int ip_idx, s_ip_idx;	/* address index on device: current / resume */
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	/* Pick up the position that the done: label stored in cb->args. */
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+	s_ip_idx = ip_idx = cb->args[2];
+
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+			if (idx < s_idx)	/* already dumped earlier */
+				goto cont;
+			if (h > s_h || idx > s_idx)	/* past resume device */
+				s_ip_idx = 0;
+			in_dev = __in_dev_get_rcu(dev);
+			if (!in_dev)
+				goto cont;
+
+			for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+			     ifa = ifa->ifa_next, ip_idx++) {
+				if (ip_idx < s_ip_idx)
+					continue;
+				if (inet_fill_ifaddr(skb, ifa,
+					     NETLINK_CB(cb->skb).pid,
+					     cb->nlh->nlmsg_seq,
+					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
+					rcu_read_unlock();
+					goto done;	/* stop; position saved below */
+				}
+			}
+cont:
+			idx++;
+		}
+		rcu_read_unlock();
+	}
+
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+	cb->args[2] = ip_idx;
+
+	return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+		      u32 pid)
+{
+	struct net *net = dev_net(ifa->ifa_dev->dev);
+	struct sk_buff *msg = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
+	int rc = -ENOBUFS;
+	u32 seq = 0;
+
+	/* Send the address event to the RTNLGRP_IPV4_IFADDR group; on
+	 * failure report the error through rtnl_set_sk_err() instead. */
+	if (nlh)
+		seq = nlh->nlmsg_seq;
+	if (msg) {
+		rc = inet_fill_ifaddr(msg, ifa, pid, seq, event, 0);
+		if (rc >= 0) {
+			rtnl_notify(msg, net, pid, RTNLGRP_IPV4_IFADDR, nlh,
+				    GFP_KERNEL);
+			return;
+		}
+		/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
+		WARN_ON(rc == -EMSGSIZE);
+		kfree_skb(msg);
+	}
+	/* rc is negative on every path that reaches this point. */
+	rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, rc);
+}
+
+static size_t inet_get_link_af_size(const struct net_device *dev)
+{
+	/* Devices without IPv4 state contribute nothing to the message. */
+	if (!rcu_dereference_rtnl(dev->ip_ptr))
+		return 0;
+
+	/* IFLA_INET_CONF: four bytes per devconf entry. */
+	return nla_total_size(IPV4_DEVCONF_MAX * 4);
+}
+
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct in_device *idev = rcu_dereference_rtnl(dev->ip_ptr);
+	struct nlattr *attr;
+	u32 *out;
+	int n;
+
+	if (!idev)
+		return -ENODATA;
+	/* Reserve IFLA_INET_CONF, then copy every devconf value into it. */
+	attr = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+	if (!attr)
+		return -EMSGSIZE;
+	out = nla_data(attr);
+	for (n = 0; n < IPV4_DEVCONF_MAX; n++)
+		out[n] = idev->cnf.data[n];
+	return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
+	[IFLA_INET_CONF]	= { .type = NLA_NESTED }, /* used by validate */
+};
+
+static int inet_validate_link_af(const struct net_device *dev,
+				 const struct nlattr *nla)
+{
+	struct nlattr *tb[IFLA_INET_MAX+1];
+	struct nlattr *cur;
+	int left, rc;
+
+	/* A device, when given, must already have IPv4 state attached. */
+	if (dev && !__in_dev_get_rtnl(dev))
+		return -EAFNOSUPPORT;
+
+	rc = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+	if (rc < 0)
+		return rc;
+
+	if (!tb[IFLA_INET_CONF])
+		return 0;
+
+	/* Each entry needs >= 4 payload bytes and an id in 1..IPV4_DEVCONF_MAX. */
+	nla_for_each_nested(cur, tb[IFLA_INET_CONF], left) {
+		int id = nla_type(cur);
+
+		if (nla_len(cur) < 4 || id <= 0 || id > IPV4_DEVCONF_MAX)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+	struct nlattr *tb[IFLA_INET_MAX+1];
+	struct nlattr *cur;
+	struct in_device *idev;
+	int left;
+
+	idev = __in_dev_get_rtnl(dev);
+	if (!idev)
+		return -EAFNOSUPPORT;
+	if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
+		BUG();
+	if (!tb[IFLA_INET_CONF])
+		return 0;
+	/* Apply each nested attribute: id = nla_type, value = u32 payload. */
+	nla_for_each_nested(cur, tb[IFLA_INET_CONF], left)
+		ipv4_devconf_set(idev, nla_type(cur), nla_get_u32(cur));
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+	struct net_device *dev;
+	/* Copy default[i] to every device whose state bit i is not set. */
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		struct in_device *idev = __in_dev_get_rcu(dev);
+
+		if (!idev || test_bit(i, idev->cnf.state))
+			continue;
+		idev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+	}
+	rcu_read_unlock();
+}
+
+/* Apply the net-wide "all" forwarding value everywhere; needs RTNL held. */
+static void inet_forward_change(struct net *net)
+{
+	int enable = IPV4_DEVCONF_ALL(net, FORWARDING);
+	struct net_device *dev;
+
+	IPV4_DEVCONF_DFLT(net, FORWARDING) = enable;
+	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !enable;
+
+	for_each_netdev(net, dev) {
+		struct in_device *idev;
+		if (enable)
+			dev_disable_lro(dev);
+		rcu_read_lock();
+		idev = __in_dev_get_rcu(dev);
+		if (idev)
+			IN_DEV_CONF_SET(idev, FORWARDING, enable);
+		rcu_read_unlock();
+	}
+}
+
+static int devinet_conf_proc(ctl_table *ctl, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int before = *valp;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	int after = *valp;
+	struct ipv4_devconf *cnf = ctl->extra1;
+	struct net *net = ctl->extra2;
+	int idx;
+
+	if (!write)
+		return ret;
+	/* Mark the entry as explicitly set; idx is its slot in cnf->data. */
+	idx = valp - cnf->data;
+	set_bit(idx, cnf->state);
+	if (cnf == net->ipv4.devconf_dflt)
+		devinet_copy_dflt_conf(net, idx);
+	/* Switching accept_local from non-zero to zero flushes the cache. */
+	if (idx == IPV4_DEVCONF_ACCEPT_LOCAL - 1 && before != 0 && after == 0)
+		rt_cache_flush(net, 0);
+	return ret;
+}
+
+static int devinet_sysctl_forward(ctl_table *ctl, int write,
+				  void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;	/* value before the write */
+	loff_t pos = *ppos;	/* file position before the write */
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	/* Act only when a write actually changed the forwarding value. */
+	if (write && *valp != val) {
+		struct net *net = ctl->extra2;
+		/* A change to the "default" entry needs nothing further. */
+		if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
+			if (!rtnl_trylock()) {
+				/* Restore the original values before restarting */
+				*valp = val;
+				*ppos = pos;
+				return restart_syscall();
+			}
+			if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+				inet_forward_change(net);
+			} else if (*valp) {	/* one device switched on */
+				struct ipv4_devconf *cnf = ctl->extra1;
+				struct in_device *idev =
+					container_of(cnf, struct in_device, cnf);
+				dev_disable_lro(idev->dev);
+			}
+			rtnl_unlock();
+			rt_cache_flush(net, 0);
+		}
+	}
+
+	return ret;
+}
+
+static int ipv4_doint_and_flush(ctl_table *ctl, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+	struct net *net = ctl->extra2;
+	int *cur = ctl->data;
+	int before = *cur;
+	int ret;
+	/* Flush the routing cache only when a write changed the value. */
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	if (write && before != *cur)
+		rt_cache_flush(net, 0);
+	return ret;
+}
+/* One ctl_table entry bound to slot IPV4_DEVCONF_<attr> - 1 of ipv4_devconf. */
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
+	{ \
+		.procname	= name, \
+		.data		= ipv4_devconf.data + \
+				  IPV4_DEVCONF_ ## attr - 1, \
+		.maxlen		= sizeof(int), \
+		.mode		= mval, \
+		.proc_handler	= proc, \
+		.extra1		= &ipv4_devconf, \
+	}
+/* Mode 0644 entry handled by devinet_conf_proc. */
+#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
+	DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
+/* Mode 0444 entry handled by devinet_conf_proc. */
+#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
+	DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
+/* Mode 0644 entry with a caller-supplied handler. */
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+	DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
+/* Mode 0644 entry handled by ipv4_doint_and_flush. */
+#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
+	DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
+
+static struct devinet_sysctl_table {
+	struct ctl_table_header *sysctl_header;	/* set on registration */
+	struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
+	char *dev_name;		/* private copy of the directory name */
+} devinet_sysctl = {	/* template; kmemdup'ed per registration */
+	.devinet_vars = {
+		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+					     devinet_sysctl_forward),
+		DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+		/* Plain read/write integers handled by devinet_conf_proc. */
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+		DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+		DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+		DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+		DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+					"accept_source_route"),
+		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+		DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+		DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+		DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+		DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+		DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+		DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+		/* Entries whose change also flushes the routing cache. */
+		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
+					      "force_igmp_version"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+					      "promote_secondaries"),
+	},
+};
+
+static int __devinet_sysctl_register(struct net *net, char *dev_name,
+					struct ipv4_devconf *p)
+{
+	struct devinet_sysctl_table *tbl;
+	struct ctl_table *var;
+
+#define DEVINET_CTL_PATH_DEV	3
+
+	/* net/ipv4/conf/<name>; slot DEVINET_CTL_PATH_DEV is filled in below. */
+	struct ctl_path devinet_ctl_path[] = {
+		{ .procname = "net",  },
+		{ .procname = "ipv4", },
+		{ .procname = "conf", },
+		{ /* to be set */ },
+		{ },
+	};
+
+	/* Start from a private copy of the template table. */
+	tbl = kmemdup(&devinet_sysctl, sizeof(*tbl), GFP_KERNEL);
+	if (!tbl)
+		return -ENOBUFS;
+
+	/* Rebase each entry from ipv4_devconf onto p; the last slot is skipped. */
+	for (var = tbl->devinet_vars;
+	     var < &tbl->devinet_vars[ARRAY_SIZE(tbl->devinet_vars) - 1]; var++) {
+		var->data += (char *)p - (char *)&ipv4_devconf;
+		var->extra1 = p;
+		var->extra2 = net;
+	}
+
+	/*
+	 * sysctl treats '.procname' as const, so register under a private
+	 * copy of dev_name that a later rename (see SIOCSIFNAME) cannot
+	 * change under our feet.
+	 */
+	tbl->dev_name = kstrdup(dev_name, GFP_KERNEL);
+	if (!tbl->dev_name)
+		goto free_table;
+	devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = tbl->dev_name;
+
+	tbl->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
+						       tbl->devinet_vars);
+	if (tbl->sysctl_header) {
+		p->sysctl = tbl;
+		return 0;
+	}
+
+	kfree(tbl->dev_name);
+free_table:
+	kfree(tbl);
+	return -ENOBUFS;
+}
+
+static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
+{
+	struct devinet_sysctl_table *tbl = cnf->sysctl;
+
+	if (!tbl)
+		return;
+	/* Detach from cnf first, then tear the table down. */
+	cnf->sysctl = NULL;
+	unregister_net_sysctl_table(tbl->sysctl_header);
+	kfree(tbl->dev_name);
+	kfree(tbl);
+}
+
+static void devinet_sysctl_register(struct in_device *idev)
+{	/* Registers the neigh sysctls, then the devconf table as dev->name. */
+	neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
+	__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+					&idev->cnf); /* NOTE(review): result ignored */
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev)
+{	/* Undoes devinet_sysctl_register() in the reverse order. */
+	__devinet_sysctl_unregister(&idev->cnf);
+	neigh_sysctl_unregister(idev->arp_parms);
+}
+
+static struct ctl_table ctl_forward_entry[] = {
+	{	/* registered under net_ipv4_path by devinet_init_net() */
+		.procname	= "ip_forward",
+		.data		= &ipv4_devconf.data[
+					IPV4_DEVCONF_FORWARDING - 1],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= devinet_sysctl_forward,
+		.extra1		= &ipv4_devconf,
+		.extra2		= &init_net,	/* copies get their own net */
+	},
+	{ },
+};
+
+static __net_initdata struct ctl_path net_ipv4_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ipv4", },	/* path used for ctl_forward_entry */
+	{ },
+};
+#endif
+
+static __net_init int devinet_init_net(struct net *net)
+{
+	int err;
+	struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *tbl = ctl_forward_entry;
+	struct ctl_table_header *forw_hdr;
+#endif
+	/* init_net uses the static tables; other namespaces get copies. */
+	err = -ENOMEM;
+	all = &ipv4_devconf;
+	dflt = &ipv4_devconf_dflt;
+
+	if (!net_eq(net, &init_net)) {
+		all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
+		if (all == NULL)
+			goto err_alloc_all;
+
+		dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+		if (dflt == NULL)
+			goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+		tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
+		if (tbl == NULL)
+			goto err_alloc_ctl;
+		/* Point the copied ip_forward entry at this net's "all". */
+		tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+		tbl[0].extra1 = all;
+		tbl[0].extra2 = net;
+#endif
+	}
+
+#ifdef CONFIG_SYSCTL
+	err = __devinet_sysctl_register(net, "all", all);
+	if (err < 0)
+		goto err_reg_all;
+
+	err = __devinet_sysctl_register(net, "default", dflt);
+	if (err < 0)
+		goto err_reg_dflt;
+
+	err = -ENOMEM;
+	forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
+	if (forw_hdr == NULL)
+		goto err_reg_ctl;
+	net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+	net->ipv4.devconf_all = all;
+	net->ipv4.devconf_dflt = dflt;
+	return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl:	/* unwinding: undo the steps above in reverse order */
+	__devinet_sysctl_unregister(dflt);
+err_reg_dflt:
+	__devinet_sysctl_unregister(all);
+err_reg_all:
+	if (tbl != ctl_forward_entry)	/* never free the static table */
+		kfree(tbl);
+err_alloc_ctl:
+#endif
+	if (dflt != &ipv4_devconf_dflt)
+		kfree(dflt);
+err_alloc_dflt:
+	if (all != &ipv4_devconf)
+		kfree(all);
+err_alloc_all:
+	return err;
+}
+
+static __net_exit void devinet_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *tbl;
+	/* Fetch the table pointer before its header is unregistered. */
+	tbl = net->ipv4.forw_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->ipv4.forw_hdr);
+	__devinet_sysctl_unregister(net->ipv4.devconf_dflt);
+	__devinet_sysctl_unregister(net->ipv4.devconf_all);
+	kfree(tbl);	/* NOTE(review): presumably never run for init_net's static table - confirm */
+#endif
+	kfree(net->ipv4.devconf_dflt);
+	kfree(net->ipv4.devconf_all);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = {
+	.init = devinet_init_net,	/* registered in devinet_init() */
+	.exit = devinet_exit_net,
+};
+
+static struct rtnl_af_ops inet_af_ops = {
+	.family		  = AF_INET,	/* registered in devinet_init() */
+	.fill_link_af	  = inet_fill_link_af,
+	.get_link_af_size = inet_get_link_af_size,
+	.validate_link_af = inet_validate_link_af,
+	.set_link_af	  = inet_set_link_af,
+};
+
+void __init devinet_init(void)
+{
+	int i;
+	/* Empty every bucket of the inet_addr_lst hash table. */
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+	/* NOTE(review): the results of the registrations below are ignored. */
+	register_pernet_subsys(&devinet_ops);
+
+	register_gifconf(PF_INET, inet_gifconf);
+	register_netdevice_notifier(&ip_netdev_notifier);
+
+	rtnl_af_register(&inet_af_ops);
+	/* rtnetlink handlers: add and delete an address, dump addresses. */
+	rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
+	rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
+	rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
+}
+
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 00000000..530787bc
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,725 @@
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/in6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+
+struct esp_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;	/* buffer from esp_alloc_tmp(), freed on completion */
+};
+/* View the skb control buffer (cb[]) as a struct esp_skb_cb. */
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate the scratch buffer for one AEAD operation: room for the
+ * optional ESN high sequence word, the IV, the request and the SG list.
+ *
+ * For alignment considerations the IV is placed ahead of the request,
+ * and the SG list comes last.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+{
+	unsigned int ctx_align = crypto_tfm_ctx_alignment();
+	unsigned int size = seqhilen;
+
+	size += crypto_aead_ivsize(aead);
+	if (size) {
+		/* Slack for aligning the IV, then round up for the request. */
+		size += crypto_aead_alignmask(aead) & ~(ctx_align - 1);
+		size = ALIGN(size, ctx_align);
+	}
+
+	size += sizeof(struct aead_givcrypt_request);
+	size += crypto_aead_reqsize(aead);
+	size = ALIGN(size, __alignof__(struct scatterlist));
+
+	/* nfrags scatterlist entries close the buffer. */
+	size += sizeof(struct scatterlist) * nfrags;
+	return kmalloc(size, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+	return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); /* seqhi leads tmp */
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+	u8 *iv = (u8 *)tmp + seqhilen;	/* IV follows the seqhi word, if any */
+	return crypto_aead_ivsize(aead) ?
+	       PTR_ALIGN(iv, crypto_aead_alignmask(aead) + 1) : iv;
+}
+
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+	struct crypto_aead *aead, u8 *iv)
+{
+	/* The request begins at the first ctx-aligned byte past the IV. */
+	u8 *end_of_iv = iv + crypto_aead_ivsize(aead);
+	struct aead_givcrypt_request *givreq;
+	givreq = (void *)PTR_ALIGN(end_of_iv, crypto_tfm_ctx_alignment());
+	aead_givcrypt_set_tfm(givreq, aead);
+	return givreq;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+	/* The request begins at the first ctx-aligned byte past the IV. */
+	u8 *end_of_iv = iv + crypto_aead_ivsize(aead);
+	struct aead_request *areq;
+	areq = (void *)PTR_ALIGN(end_of_iv, crypto_tfm_ctx_alignment());
+	aead_request_set_tfm(areq, aead);
+	return areq;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+					     struct aead_request *req)
+{
+	/* SG array: first scatterlist-aligned address past request + reqsize. */
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+	return (void *)ALIGN(end, __alignof__(struct scatterlist));
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+	struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+	/* SG array: first scatterlist-aligned address past request + reqsize. */
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+	return (void *)ALIGN(end, __alignof__(struct scatterlist));
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err)
+{
+	struct sk_buff *skb = base->data;	/* set by esp_output() */
+	/* Completion callback: free the scratch buffer, resume xfrm output. */
+	kfree(ESP_SKB_CB(skb)->tmp);
+	xfrm_output_resume(skb, err);
+}
+
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct aead_givcrypt_request *req;
+	struct scatterlist *sg;		/* data to encrypt */
+	struct scatterlist *asg;	/* associated data */
+	struct esp_data *esp;
+	struct sk_buff *trailer;
+	void *tmp;
+	u8 *iv;
+	u8 *tail;
+	int blksize;
+	int clen;	/* length passed to the cipher: payload + padding */
+	int alen;	/* authentication tag (ICV) size */
+	int plen;	/* padding incl. the two trailer bytes */
+	int tfclen;	/* extra zero padding requested via x->tfcpad */
+	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
+
+	/* skb is pure payload to encrypt */
+
+	err = -ENOMEM;
+
+	esp = x->data;
+	aead = esp->aead;
+	alen = crypto_aead_authsize(aead);
+	/* Pad short packets up to min(x->tfcpad, esp4_get_mtu()). */
+	tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
+	if (esp->padlen)
+		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
+
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;	/* non-negative result is used as fragment count */
+
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+	/* ESN adds the high sequence word to the associated data. */
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+	if (!tmp)
+		goto error;
+	/* Carve seqhi, IV, request and both SG arrays out of tmp. */
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_givreq(aead, iv);
+	asg = esp_givreq_sg(aead, req);
+	sg = asg + sglists;
+
+	/* Fill padding: tfclen zero bytes, then 1, 2, 3, ... */
+	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;	/* pad length byte */
+	tail[plen - 1] = *skb_mac_header(skb);	/* next header byte */
+	pskb_put(skb, trailer, clen - skb->len + alen);
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph = ip_esp_hdr(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	/* this is non-NULL only with UDP Encapsulation */
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+		struct udphdr *uh;
+		__be32 *udpdata32;
+		__be16 sport, dport;
+		int encap_type;
+		/* Snapshot the template under x->lock. */
+		spin_lock_bh(&x->lock);
+		sport = encap->encap_sport;
+		dport = encap->encap_dport;
+		encap_type = encap->encap_type;
+		spin_unlock_bh(&x->lock);
+
+		uh = (struct udphdr *)esph;
+		uh->source = sport;
+		uh->dest = dport;
+		uh->len = htons(skb->len - skb_transport_offset(skb));
+		uh->check = 0;
+
+		switch (encap_type) {
+		default:
+		case UDP_ENCAP_ESPINUDP:
+			esph = (struct ip_esp_hdr *)(uh + 1);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:	/* 8 zero bytes first */
+			udpdata32 = (__be32 *)(uh + 1);
+			udpdata32[0] = udpdata32[1] = 0;
+			esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+			break;
+		}
+
+		*skb_mac_header(skb) = IPPROTO_UDP;
+	}
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	/* sg maps what follows the IV: clen data bytes plus alen for the ICV. */
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg,
+		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+		     clen + alen);
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);	/* spi, seqhi, seq_no */
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
+
+	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+	aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+	aead_givcrypt_set_assoc(req, asg, assoclen);
+	aead_givcrypt_set_giv(req, esph->enc_data,
+			      XFRM_SKB_CB(skb)->seq.output.low);
+	/* If the request goes async, esp_output_done() frees tmp instead. */
+	ESP_SKB_CB(skb)->tmp = tmp;
+	err = crypto_aead_givencrypt(req);
+	if (err == -EINPROGRESS)
+		goto error;
+
+	if (err == -EBUSY)
+		err = NET_XMIT_DROP;
+
+	kfree(tmp);
+
+error:
+	return err;
+}
+
+static int esp_input_done2(struct sk_buff *skb, int err)
+{
+	const struct iphdr *iph;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct esp_data *esp = x->data;
+	struct crypto_aead *aead = esp->aead;
+	int alen = crypto_aead_authsize(aead);
+	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	int elen = skb->len - hlen;
+	int ihl;
+	u8 nexthdr[2];	/* [0] = pad length, [1] = next header */
+	int padlen;
+	/* The scratch buffer set up by esp_input() is no longer needed. */
+	kfree(ESP_SKB_CB(skb)->tmp);
+
+	if (unlikely(err))
+		goto out;
+	/* Read the two trailer bytes that sit just before the ICV. */
+	if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+		BUG();
+
+	err = -EINVAL;
+	padlen = nexthdr[0];
+	if (padlen + 2 + alen >= elen)
+		goto out;
+
+	/* ... check padding bits here. Silly. :-) */
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+		struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+
+		/*
+		 * 1) if the NAT-T peer's IP or port changed then
+		 *    advertize the change to the keying daemon.
+		 *    This is an inbound SA, so just compare
+		 *    SRC ports.
+		 */
+		if (iph->saddr != x->props.saddr.a4 ||
+		    uh->source != encap->encap_sport) {
+			xfrm_address_t ipaddr;
+
+			ipaddr.a4 = iph->saddr;
+			km_new_mapping(x, &ipaddr, uh->source);
+
+			/* XXX: perhaps add an extra
+			 * policy check here, to see
+			 * if we should allow or
+			 * reject a packet from a
+			 * different source
+			 * address/port.
+			 */
+		}
+
+		/*
+		 * 2) ignore UDP/TCP checksums in case
+		 *    of NAT-T in Transport Mode, or
+		 *    perform other post-processing fixes
+		 *    as per draft-ietf-ipsec-udp-encaps-06,
+		 *    section 3.1.2
+		 */
+		if (x->props.mode == XFRM_MODE_TRANSPORT)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	/* Drop ICV, padding and trailer bytes, then the ESP header and IV. */
+	pskb_trim(skb, skb->len - alen - padlen - 2);
+	__skb_pull(skb, hlen);
+	skb_set_transport_header(skb, -ihl);
+
+	err = nexthdr[1];	/* success: return the next-header value */
+
+	/* RFC4303: Drop dummy packets without any error */
+	if (err == IPPROTO_NONE)
+		err = -EINVAL;
+
+out:
+	return err;
+}
+
+static void esp_input_done(struct crypto_async_request *base, int err)
+{
+	struct sk_buff *skb = base->data;	/* set by esp_input() */
+	/* Completion callback: finish input processing, resume xfrm input. */
+	xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_esp_hdr *esph;
+	struct esp_data *esp = x->data;
+	struct crypto_aead *aead = esp->aead;
+	struct aead_request *req;
+	struct sk_buff *trailer;
+	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
+	void *tmp;
+	u8 *iv;
+	struct scatterlist *sg;		/* data to decrypt */
+	struct scatterlist *asg;	/* associated data */
+	int err = -EINVAL;
+	/* The ESP header and IV must be present in the linear area. */
+	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+		goto out;
+
+	if (elen <= 0)
+		goto out;
+
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;	/* non-negative result is used as fragment count */
+
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+	/* ESN adds the high sequence word to the associated data. */
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	err = -ENOMEM;
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+	if (!tmp)
+		goto out;
+	/* esp_input_done2() frees tmp through the skb control buffer. */
+	ESP_SKB_CB(skb)->tmp = tmp;
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	asg = esp_req_sg(aead, req);
+	sg = asg + sglists;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	esph = (struct ip_esp_hdr *)skb->data;
+
+	/* Get ivec. This can be wrong, check against another impls. */
+	iv = esph->enc_data;
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);	/* spi, seqhi, seq_no */
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
+
+	aead_request_set_callback(req, 0, esp_input_done, skb);
+	aead_request_set_crypt(req, sg, sg, elen, iv);
+	aead_request_set_assoc(req, asg, assoclen);
+
+	err = crypto_aead_decrypt(req);
+	if (err == -EINPROGRESS)	/* esp_input_done() will finish up */
+		goto out;
+
+	err = esp_input_done2(skb, err);
+
+out:
+	return err;
+}
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
+{
+	struct esp_data *esp = x->data;
+	u32 cipher_blk = ALIGN(crypto_aead_blocksize(esp->aead), 4);
+	u32 align = max_t(u32, cipher_blk, esp->padlen);
+	unsigned int net_adj;
+	unsigned int room;
+
+	/* net_adj is subtracted before aligning and added back afterwards. */
+	if (x->props.mode == XFRM_MODE_TRANSPORT ||
+	    x->props.mode == XFRM_MODE_BEET)
+		net_adj = sizeof(struct iphdr);
+	else if (x->props.mode == XFRM_MODE_TUNNEL)
+		net_adj = 0;
+	else
+		BUG();
+
+	/* Round what is left after headers and ICV down to the alignment;
+	 * the 2 is presumably the pad-length and next-header bytes. */
+	room = mtu - x->props.header_len - crypto_aead_authsize(esp->aead);
+	return ((room - net_adj) & ~(align - 1)) + (net_adj - 2);
+}
+
+static void esp4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_state *x;
+
+	/* Only "fragmentation needed" ICMP errors are of interest here. */
+	if (!(icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
+	      icmp_hdr(skb)->code == ICMP_FRAG_NEEDED))
+		return;
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      esph->spi, IPPROTO_ESP, AF_INET);
+	if (x) {
+		NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+			 ntohl(esph->spi), ntohl(iph->daddr));
+		xfrm_state_put(x);
+	}
+}
+
+static void esp_destroy(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+
+	/* x->data may be unset; otherwise free the transform, then esp. */
+	if (esp) {
+		crypto_free_aead(esp->aead);
+		kfree(esp);
+	}
+}
+
+static int esp_init_aead(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+	struct crypto_aead *tfm;
+	int rc;
+
+	/* Allocate the transform named by the state's aead algorithm. */
+	tfm = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	/* Stored before keying, so it stays in esp->aead if keying fails. */
+	esp->aead = tfm;
+
+	/* Key length is divided by 8, rounding up (bits to bytes). */
+	rc = crypto_aead_setkey(tfm, x->aead->alg_key,
+				(x->aead->alg_key_len + 7) / 8);
+	if (rc)
+		return rc;
+
+	/* The ICV length is likewise divided by 8. */
+	rc = crypto_aead_setauthsize(tfm, x->aead->alg_icv_len / 8);
+
+	return rc;
+}
+
+static int esp_init_authenc(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+	struct crypto_aead *aead;
+	struct crypto_authenc_key_param *param;
+	struct rtattr *rta;
+	char *key;	/* rtattr header + auth key + encryption key */
+	char *p;	/* write cursor into key */
+	char authenc_name[CRYPTO_MAX_ALG_NAME];
+	unsigned int keylen;
+	int err;
+
+	err = -EINVAL;
+	if (x->ealg == NULL)
+		goto error;
+
+	err = -ENAMETOOLONG;
+	/* Build "authenc(auth,enc)", or "authencesn(...)" when ESN is on. */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authencesn(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authenc(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	}
+
+	aead = crypto_alloc_aead(authenc_name, 0, 0);
+	err = PTR_ERR(aead);
+	if (IS_ERR(aead))
+		goto error;
+
+	esp->aead = aead;
+	/* Key lengths are divided by 8, rounding up (bits to bytes). */
+	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+	err = -ENOMEM;
+	key = kmalloc(keylen, GFP_KERNEL);
+	if (!key)
+		goto error;
+	/* The blob starts with an rtattr that carries the enc key length. */
+	p = key;
+	rta = (void *)p;
+	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+	rta->rta_len = RTA_LENGTH(sizeof(*param));
+	param = RTA_DATA(rta);
+	p += RTA_SPACE(sizeof(*param));
+
+	if (x->aalg) {
+		struct xfrm_algo_desc *aalg_desc;
+
+		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+		p += (x->aalg->alg_key_len + 7) / 8;
+
+		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+		BUG_ON(!aalg_desc);
+
+		err = -EINVAL;
+		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+		    crypto_aead_authsize(aead)) {
+			NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+				 x->aalg->alg_name,
+				 crypto_aead_authsize(aead),
+				 aalg_desc->uinfo.auth.icv_fullbits/8);
+			goto free_key;
+		}
+
+		err = crypto_aead_setauthsize(
+			aead, x->aalg->alg_trunc_len / 8);
+		if (err)
+			goto free_key;
+	}
+	/* The encryption key follows the (optional) authentication key. */
+	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+	err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+	kfree(key);
+
+error:
+	return err;
+}
+
+static int esp_init_state(struct xfrm_state *x)
+{
+	struct esp_data *esp;
+	struct crypto_aead *aead;
+	u32 align;
+	int err;
+	/* Build esp_data for x and size its header/trailer; 0 or -errno. */
+	esp = kzalloc(sizeof(*esp), GFP_KERNEL);
+	if (esp == NULL)
+		return -ENOMEM;
+	/* Owned by x from here on; esp_destroy() releases it. */
+	x->data = esp;
+
+	if (x->aead)
+		err = esp_init_aead(x);
+	else
+		err = esp_init_authenc(x);
+
+	if (err)
+		goto error;
+
+	aead = esp->aead;
+	esp->padlen = 0;
+	/* Header room: ESP header + IV, plus mode and encapsulation extras. */
+	x->props.header_len = sizeof(struct ip_esp_hdr) +
+			      crypto_aead_ivsize(aead);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		x->props.header_len += sizeof(struct iphdr);
+	else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
+		x->props.header_len += IPV4_BEET_PHMAXLEN;
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+
+		switch (encap->encap_type) {
+		default:
+			err = -EINVAL;	/* unknown type must not report success */
+			goto error;
+		case UDP_ENCAP_ESPINUDP:
+			x->props.header_len += sizeof(struct udphdr);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+			break;
+		}
+	}
+	/* Trailer room: alignment + 1, plus the ICV. */
+	align = ALIGN(crypto_aead_blocksize(aead), 4);
+	if (esp->padlen)
+		align = max_t(u32, align, esp->padlen);
+	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+
+error:
+	return err;
+}
+
+static const struct xfrm_type esp_type =
+{	/* registered for AF_INET in esp4_init() */
+	.description	= "ESP4",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_ESP,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= esp_init_state,
+	.destructor	= esp_destroy,
+	.get_mtu	= esp4_get_mtu,
+	.input		= esp_input,
+	.output		= esp_output
+};
+
+static const struct net_protocol esp4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.err_handler	=	esp4_err,	/* handles ICMP frag-needed only */
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+static int __init esp4_init(void)
+{
+	/* Register the xfrm type first; undo it if the protocol hook fails. */
+	if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+		printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) >= 0)
+		return 0;
+	printk(KERN_INFO "ip esp init: can't add protocol\n");
+	xfrm_unregister_type(&esp_type, AF_INET);
+	return -EAGAIN;
+}
+
+static void __exit esp4_fini(void)
+{	/* Reverse of esp4_init(); failures are only logged. */
+	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+		printk(KERN_INFO "ip esp close: can't remove protocol\n");
+	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+		printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 00000000..22524716
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,1136 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/rtnetlink.h>
+#include <net/xfrm.h>
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+/*
+ * Built only when CONFIG_IP_MULTIPLE_TABLES is off: allocate the LOCAL and
+ * MAIN tables and hash them into @net.  Returns 0 or -ENOMEM.
+ */
+static int __net_init fib4_rules_init(struct net *net)
+{
+	struct hlist_head *hash = net->ipv4.fib_table_hash;
+	struct fib_table *local_tb = fib_trie_table(RT_TABLE_LOCAL);
+	struct fib_table *main_tb;
+
+	if (!local_tb)
+		return -ENOMEM;
+	main_tb = fib_trie_table(RT_TABLE_MAIN);
+	if (!main_tb) {
+		kfree(local_tb);	/* nothing has been hashed yet */
+		return -ENOMEM;
+	}
+
+	hlist_add_head_rcu(&local_tb->tb_hlist, &hash[TABLE_LOCAL_INDEX]);
+	hlist_add_head_rcu(&main_tb->tb_hlist, &hash[TABLE_MAIN_INDEX]);
+	return 0;
+}
+#else
+
+/* Get table @id (0 = RT_TABLE_MAIN), creating it if absent; NULL on failure. */
+struct fib_table *fib_new_table(struct net *net, u32 id)
+{
+	struct hlist_head *bucket;
+	struct fib_table *table;
+
+	if (!id)
+		id = RT_TABLE_MAIN;
+	table = fib_get_table(net, id);
+	if (table)
+		return table;
+	table = fib_trie_table(id);
+	if (table) {
+		bucket = &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)];
+		hlist_add_head_rcu(&table->tb_hlist, bucket);
+	}
+	return table;
+}
+
+/* Look up table @id (0 selects RT_TABLE_MAIN); NULL when it is not hashed. */
+struct fib_table *fib_get_table(struct net *net, u32 id)
+{
+	struct fib_table *found = NULL;
+	struct fib_table *cur;
+	struct hlist_node *pos;
+	struct hlist_head *bucket;
+
+	if (!id)
+		id = RT_TABLE_MAIN;
+	bucket = &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(cur, pos, bucket, tb_hlist) {
+		if (cur->tb_id == id) {
+			found = cur;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+/* Flush every FIB table of @net, then the route cache if needed. */
+static void fib_flush(struct net *net)
+{
+	struct hlist_head *bucket = net->ipv4.fib_table_hash;
+	struct hlist_head *end = bucket + FIB_TABLE_HASHSZ;
+	struct hlist_node *pos;
+	struct fib_table *table;
+	int total = 0;
+
+	for (; bucket < end; bucket++)
+		hlist_for_each_entry(table, pos, bucket, tb_hlist)
+			total += fib_table_flush(table);
+
+	if (!total)
+		return;
+	rt_cache_flush(net, -1);
+}
+
+/*
+ * Find address type as if only "dev" was present in the system. If
+ * dev is NULL then all interfaces are taken into consideration.
+ */
+static inline unsigned __inet_dev_addr_type(struct net *net,
+					    const struct net_device *dev,
+					    __be32 addr)
+{
+	struct flowi4		fl4 = { .daddr = addr };
+	struct fib_result	res;
+	unsigned ret = RTN_BROADCAST;	/* result when there is no local table */
+	struct fib_table *local_table;
+
+	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+		return RTN_BROADCAST;
+	if (ipv4_is_multicast(addr))
+		return RTN_MULTICAST;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+
+	local_table = fib_get_table(net, RT_TABLE_LOCAL);
+	if (local_table) {
+		ret = RTN_UNICAST;	/* kept on a miss or a device mismatch */
+		rcu_read_lock();
+		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
+			if (!dev || dev == res.fi->fib_dev)
+				ret = res.type;
+		}
+		rcu_read_unlock();
+	}
+	return ret;
+}
+
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+	return __inet_dev_addr_type(net, NULL, addr);
+}
+EXPORT_SYMBOL(inet_addr_type);
+
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+				__be32 addr)
+{
+	return __inet_dev_addr_type(net, dev, addr);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check that the source is valid, i.e. not broadcast or our local
+ *   address (RTN_LOCAL is tolerated only when accept_local is set).
+ * - figure out on which "logical" interface the packet arrived and
+ *   calculate the "specific destination" address (*spec_dst).
+ * - check that the packet arrived from the expected physical interface.
+ * Returns 0 or 1, or -EINVAL / -EXDEV. Called with rcu_read_lock() held.
+ */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
+			int oif, struct net_device *dev, __be32 *spec_dst,
+			u32 *itag)
+{
+	struct in_device *in_dev;
+	struct flowi4 fl4;
+	struct fib_result res;
+	int no_addr, rpf, accept_local;
+	bool dev_match;
+	int ret;
+	struct net *net;
+
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = oif;
+	fl4.daddr = src;	/* reverse lookup: route towards the sender */
+	fl4.saddr = dst;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+	no_addr = rpf = accept_local = 0;
+	in_dev = __in_dev_get_rcu(dev);
+	if (in_dev) {
+		no_addr = in_dev->ifa_list == NULL;
+
+		/* Ignore rp_filter for packets protected by IPsec. */
+		rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
+
+		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
+		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+	}
+
+	if (in_dev == NULL)
+		goto e_inval;
+
+	net = dev_net(dev);
+	if (fib_lookup(net, &fl4, &res))
+		goto last_resort;
+	if (res.type != RTN_UNICAST) {
+		if (res.type != RTN_LOCAL || !accept_local)
+			goto e_inval;
+	}
+	*spec_dst = FIB_RES_PREFSRC(net, res);
+	fib_combine_itag(itag, &res);
+	dev_match = false;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+		struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+		if (nh->nh_dev == dev) {
+			dev_match = true;
+			break;
+		}
+	}
+#else
+	if (FIB_RES_DEV(res) == dev)
+		dev_match = true;
+#endif
+	if (dev_match) {
+		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+		return ret;
+	}
+	if (no_addr)
+		goto last_resort;
+	if (rpf == 1)
+		goto e_rpf;
+	fl4.flowi4_oif = dev->ifindex;	/* retry, bound to the input device */
+
+	ret = 0;
+	if (fib_lookup(net, &fl4, &res) == 0) {
+		if (res.type == RTN_UNICAST) {
+			*spec_dst = FIB_RES_PREFSRC(net, res);
+			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+		}
+	}
+	return ret;
+
+last_resort:
+	if (rpf)
+		goto e_rpf;
+	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	*itag = 0;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+e_rpf:
+	return -EXDEV;
+}
+
+static inline __be32 sk_extract_addr(struct sockaddr *addr)
+{
+	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+}
+
+/* Append a u32 attribute at byte offset @len of @mx; return the new offset. */
+static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
+{
+	struct nlattr *attr = (struct nlattr *) ((char *) mx + len);
+	u32 *payload = nla_data(attr);
+
+	attr->nla_len = nla_attr_size(sizeof(u32));
+	attr->nla_type = type;
+	*payload = value;
+	return len + nla_total_size(sizeof(u32));
+}
+
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
+				 struct fib_config *cfg)
+{
+	__be32 addr;
+	int plen;
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->fc_nlinfo.nl_net = net;
+
+	if (rt->rt_dst.sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/*
+	 * Check mask for validity:
+	 * a) it must be contiguous.
+	 * b) destination must have all host bits clear.
+	 * c) if application forgot to set correct family (AF_INET),
+	 *    reject request unless it is absolutely clear i.e.
+	 *    both family and mask are zero.
+	 */
+	plen = 32;	/* RTF_HOST routes keep the full /32 prefix */
+	addr = sk_extract_addr(&rt->rt_dst);
+	if (!(rt->rt_flags & RTF_HOST)) {
+		__be32 mask = sk_extract_addr(&rt->rt_genmask);
+
+		if (rt->rt_genmask.sa_family != AF_INET) {
+			if (mask || rt->rt_genmask.sa_family)
+				return -EAFNOSUPPORT;
+		}
+
+		if (bad_mask(mask, addr))
+			return -EINVAL;
+
+		plen = inet_mask_len(mask);
+	}
+
+	cfg->fc_dst_len = plen;
+	cfg->fc_dst = addr;
+
+	if (cmd != SIOCDELRT) {
+		cfg->fc_nlflags = NLM_F_CREATE;
+		cfg->fc_protocol = RTPROT_BOOT;
+	}
+
+	if (rt->rt_metric)
+		cfg->fc_priority = rt->rt_metric - 1;	/* ioctl metric is offset by one */
+
+	if (rt->rt_flags & RTF_REJECT) {
+		cfg->fc_scope = RT_SCOPE_HOST;
+		cfg->fc_type = RTN_UNREACHABLE;
+		return 0;
+	}
+
+	cfg->fc_scope = RT_SCOPE_NOWHERE;	/* placeholder, resolved below */
+	cfg->fc_type = RTN_UNICAST;
+
+	if (rt->rt_dev) {
+		char *colon;
+		struct net_device *dev;
+		char devname[IFNAMSIZ];
+
+		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
+			return -EFAULT;
+
+		devname[IFNAMSIZ-1] = 0;
+		colon = strchr(devname, ':');
+		if (colon)
+			*colon = 0;	/* look the device up by the part before ':' */
+		dev = __dev_get_by_name(net, devname);
+		if (!dev)
+			return -ENODEV;
+		cfg->fc_oif = dev->ifindex;
+		if (colon) {
+			struct in_ifaddr *ifa;
+			struct in_device *in_dev = __in_dev_get_rtnl(dev);
+			if (!in_dev)
+				return -ENODEV;
+			*colon = ':';	/* restore the full label for the search */
+			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+				if (strcmp(ifa->ifa_label, devname) == 0)
+					break;
+			if (ifa == NULL)
+				return -ENODEV;
+			cfg->fc_prefsrc = ifa->ifa_local;
+		}
+	}
+
+	addr = sk_extract_addr(&rt->rt_gateway);
+	if (rt->rt_gateway.sa_family == AF_INET && addr) {
+		cfg->fc_gw = addr;
+		if (rt->rt_flags & RTF_GATEWAY &&
+		    inet_addr_type(net, addr) == RTN_UNICAST)
+			cfg->fc_scope = RT_SCOPE_UNIVERSE;
+	}
+
+	if (cmd == SIOCDELRT)
+		return 0;
+
+	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+		return -EINVAL;
+
+	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
+		cfg->fc_scope = RT_SCOPE_LINK;
+
+	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
+		struct nlattr *mx;
+		int len = 0;
+
+		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);	/* room for all three */
+		if (mx == NULL)
+			return -ENOMEM;
+
+		if (rt->rt_flags & RTF_MTU)
+			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
+
+		if (rt->rt_flags & RTF_WINDOW)
+			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
+
+		if (rt->rt_flags & RTF_IRTT)
+			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
+
+		cfg->fc_mx = mx;	/* freed by the caller, see ip_rt_ioctl() */
+		cfg->fc_mx_len = len;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle the IP routing ioctls.
+ *
+ * SIOCADDRT and SIOCDELRT add or delete the route described by the
+ * struct rtentry at @arg; the caller needs CAP_NET_ADMIN.  The table
+ * update runs under the RTNL lock.  Every other @cmd gets -EINVAL.
+ * Returns the result of the table operation or a negative errno.
+ */
+int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct fib_table *table;
+	struct fib_config cfg;
+	struct rtentry rt;
+	int err;
+
+	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+		return -EINVAL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&rt, arg, sizeof(rt)))
+		return -EFAULT;
+
+	rtnl_lock();
+
+	err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
+	if (err != 0)
+		goto unlock;
+
+	if (cmd == SIOCDELRT) {
+		/* Deleting never creates a table. */
+		table = fib_get_table(net, cfg.fc_table);
+		err = table ? fib_table_delete(table, &cfg) : -ESRCH;
+	} else {
+		table = fib_new_table(net, cfg.fc_table);
+		err = table ? fib_table_insert(table, &cfg) : -ENOBUFS;
+	}
+
+	/* The metrics buffer was allocated by rtentry_to_fib_config(). */
+	kfree(cfg.fc_mx);
+
+unlock:
+	rtnl_unlock();
+	return err;
+}
+
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+	[RTA_DST]		= { .type = NLA_U32 },
+	[RTA_SRC]		= { .type = NLA_U32 },
+	[RTA_IIF]		= { .type = NLA_U32 },
+	[RTA_OIF]		= { .type = NLA_U32 },
+	[RTA_GATEWAY]		= { .type = NLA_U32 },
+	[RTA_PRIORITY]		= { .type = NLA_U32 },
+	[RTA_PREFSRC]		= { .type = NLA_U32 },
+	[RTA_METRICS]		= { .type = NLA_NESTED },
+	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
+	[RTA_FLOW]		= { .type = NLA_U32 },
+};
+
+/*
+ * Translate an rtnetlink route request into a struct fib_config.  The
+ * message is validated against rtm_ipv4_policy, then @cfg is zeroed and
+ * filled from the rtmsg header and the attributes present.  fc_mx and
+ * fc_mp point into the message; no data is copied.  Returns 0, the
+ * nlmsg_validate() error, or -EINVAL when rtm_type is above RTN_MAX.
+ */
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+			     struct nlmsghdr *nlh, struct fib_config *cfg)
+{
+	struct nlattr *nla;
+	struct rtmsg *rtm;
+	int rem;
+	int err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+
+	if (err < 0)
+		return err;
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.nlh = nlh;
+	cfg->fc_nlinfo.nl_net = net;
+	cfg->fc_nlflags = nlh->nlmsg_flags;
+
+	rtm = nlmsg_data(nlh);
+	cfg->fc_dst_len = rtm->rtm_dst_len;
+	cfg->fc_tos = rtm->rtm_tos;
+	cfg->fc_table = rtm->rtm_table;	/* RTA_TABLE may override this */
+	cfg->fc_protocol = rtm->rtm_protocol;
+	cfg->fc_scope = rtm->rtm_scope;
+	cfg->fc_type = rtm->rtm_type;
+	cfg->fc_flags = rtm->rtm_flags;
+	if (cfg->fc_type > RTN_MAX)
+		return -EINVAL;
+
+	nlmsg_for_each_attr(nla, nlh, sizeof(struct rtmsg), rem) {
+		switch (nla_type(nla)) {
+		case RTA_DST:
+			cfg->fc_dst = nla_get_be32(nla);
+			break;
+		case RTA_OIF:
+			cfg->fc_oif = nla_get_u32(nla);
+			break;
+		case RTA_GATEWAY:
+			cfg->fc_gw = nla_get_be32(nla);
+			break;
+		case RTA_PRIORITY:
+			cfg->fc_priority = nla_get_u32(nla);
+			break;
+		case RTA_PREFSRC:
+			cfg->fc_prefsrc = nla_get_be32(nla);
+			break;
+		case RTA_METRICS:
+			cfg->fc_mx = nla_data(nla);
+			cfg->fc_mx_len = nla_len(nla);
+			break;
+		case RTA_MULTIPATH:
+			cfg->fc_mp = nla_data(nla);
+			cfg->fc_mp_len = nla_len(nla);
+			break;
+		case RTA_FLOW:
+			cfg->fc_flow = nla_get_u32(nla);
+			break;
+		case RTA_TABLE:
+			cfg->fc_table = nla_get_u32(nla);
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ * RTM_DELROUTE handler (registered in ip_fib_init()): delete the route
+ * described by @nlh.  Returns -ESRCH if the table does not exist.
+ */
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_table *table;
+	struct fib_config cfg;
+	int err;
+
+	err = rtm_to_fib_config(net, skb, nlh, &cfg);
+	if (err < 0)
+		return err;
+
+	table = fib_get_table(net, cfg.fc_table);
+	if (!table)
+		return -ESRCH;
+
+	return fib_table_delete(table, &cfg);
+}
+
+/*
+ * RTM_NEWROUTE handler (registered in ip_fib_init()): insert the route
+ * described by @nlh.  Returns -ENOBUFS if fib_new_table() yields NULL.
+ */
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_table *table;
+	struct fib_config cfg;
+	int err;
+
+	err = rtm_to_fib_config(net, skb, nlh, &cfg);
+	if (err < 0)
+		return err;
+
+	table = fib_new_table(net, cfg.fc_table);
+	if (!table)
+		return -ENOBUFS;
+
+	return fib_table_insert(table, &cfg);
+}
+
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
+	struct fib_table *tb;
+	struct hlist_node *node;
+	struct hlist_head *head;
+	int dumped = 0;
+
+	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
+	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+		return ip_rt_dump(skb, cb);
+
+	s_h = cb->args[0];	/* resume point: hash bucket ... */
+	s_e = cb->args[1];	/* ... and table index within that bucket */
+
+	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+		e = 0;
+		head = &net->ipv4.fib_table_hash[h];
+		hlist_for_each_entry(tb, node, head, tb_hlist) {
+			if (e < s_e)
+				goto next;
+			if (dumped)	/* moving on to another table: clear args[2..] */
+				memset(&cb->args[2], 0, sizeof(cb->args) -
+						 2 * sizeof(cb->args[0]));
+			if (fib_table_dump(tb, skb, cb) < 0)
+				goto out;
+			dumped = 1;
+next:
+			e++;
+		}
+	}
+out:
+	cb->args[1] = e;	/* save the cursor for the next call */
+	cb->args[0] = h;
+
+	return skb->len;
+}
+
+/* Prepare and feed an intra-kernel routing request.
+ * Really, it should be a netlink message, but :-( netlink
+ * may not be configured, so we feed it directly
+ * to the fib engine. This is legal because all events occur
+ * only when netlink is already locked.
+ *
+ * @cmd selects insert (RTM_NEWROUTE) or delete (anything else); @type,
+ * @dst and @dst_len describe the route; @ifa supplies the preferred
+ * source address and the output device.  Errors from the table
+ * operation are ignored.
+ */
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+	struct fib_table *table;
+	struct fib_config cfg = {
+		.fc_protocol = RTPROT_KERNEL,
+		.fc_type = type,
+		.fc_dst = dst,
+		.fc_dst_len = dst_len,
+		.fc_prefsrc = ifa->ifa_local,
+		.fc_oif = dev->ifindex,
+		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+		.fc_nlinfo = {
+			.nl_net = net,
+		},
+	};
+	u32 tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
+
+	/* Unicast routes go to the main table, everything else to local. */
+	table = fib_new_table(net, tb_id);
+	if (!table)
+		return;
+
+	cfg.fc_table = table->tb_id;
+	cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;
+
+	if (cmd == RTM_NEWROUTE)
+		fib_table_insert(table, &cfg);
+	else
+		fib_table_delete(table, &cfg);
+}
+
+void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *prim = ifa;	/* supplies prefsrc and oif to fib_magic() */
+	__be32 mask = ifa->ifa_mask;
+	__be32 addr = ifa->ifa_local;
+	__be32 prefix = ifa->ifa_address & mask;
+
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
+		prim = inet_ifa_byprefix(in_dev, prefix, mask);
+		if (prim == NULL) {
+			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
+			return;
+		}
+	}
+
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+	if (!(dev->flags & IFF_UP))
+		return;
+
+	/* Add broadcast address, if it is explicitly assigned. */
+	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
+	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
+		fib_magic(RTM_NEWROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  prefix, ifa->ifa_prefixlen, prim);
+
+		/* Add network specific broadcasts, when it makes sense */
+		if (ifa->ifa_prefixlen < 31) {
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+				  32, prim);
+		}
+	}
+}
+
+/* Delete a primary or secondary address.
+ * Optionally, on secondary address promotion, consider the addresses
+ * from the subnet of iprim as deleted, even if they are in the device list.
+ * In this case the secondary ifa can be in the device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *ifa1;
+	struct in_ifaddr *prim = ifa, *prim1 = NULL;
+	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+	__be32 any = ifa->ifa_address & ifa->ifa_mask;
+#define LOCAL_OK	1
+#define BRD_OK		2
+#define BRD0_OK		4
+#define BRD1_OK		8
+	unsigned ok = 0;
+	int subnet = 0;		/* Primary network */
+	int gone = 1;		/* Address is missing */
+	int same_prefsrc = 0;	/* Another primary with same IP */
+
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
+		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+		if (prim == NULL) {
+			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
+			return;
+		}
+		if (iprim && iprim != prim) {
+			printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
+			return;
+		}
+	} else if (!ipv4_is_zeronet(any) &&
+		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+		fib_magic(RTM_DELROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  any, ifa->ifa_prefixlen, prim);
+		subnet = 1;
+	}
+
+	/* Deletion is more complicated than addition.
+	 * We must take care not to delete too much :-)
+	 *
+	 * Scan the address list to be sure that the addresses are really gone.
+	 */
+
+	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+		if (ifa1 == ifa) {
+			/* promotion, keep the IP */
+			gone = 0;
+			continue;
+		}
+		/* Ignore IFAs from our subnet */
+		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, iprim))
+			continue;
+
+		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
+		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+			/* Another address from our subnet? */
+			if (ifa1->ifa_mask == prim->ifa_mask &&
+			    inet_ifa_match(ifa1->ifa_address, prim))
+				prim1 = prim;
+			else {
+				/* We reached the secondaries, so
+				 * same_prefsrc should be determined.
+				 */
+				if (!same_prefsrc)
+					continue;
+				/* Search new prim1 if ifa1 is not
+				 * using the current prim1
+				 */
+				if (!prim1 ||
+				    ifa1->ifa_mask != prim1->ifa_mask ||
+				    !inet_ifa_match(ifa1->ifa_address, prim1))
+					prim1 = inet_ifa_byprefix(in_dev,
+							ifa1->ifa_address,
+							ifa1->ifa_mask);
+				if (!prim1)
+					continue;
+				if (prim1->ifa_local != prim->ifa_local)
+					continue;
+			}
+		} else {
+			if (prim->ifa_local != ifa1->ifa_local)
+				continue;
+			prim1 = ifa1;
+			if (prim != prim1)
+				same_prefsrc = 1;
+		}
+		if (ifa->ifa_local == ifa1->ifa_local)
+			ok |= LOCAL_OK;
+		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+			ok |= BRD_OK;
+		if (brd == ifa1->ifa_broadcast)
+			ok |= BRD1_OK;
+		if (any == ifa1->ifa_broadcast)
+			ok |= BRD0_OK;
+		/* primary has network specific broadcasts */
+		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+			if (!ipv4_is_zeronet(any1)) {
+				if (ifa->ifa_broadcast == brd1 ||
+				    ifa->ifa_broadcast == any1)
+					ok |= BRD_OK;
+				if (brd == brd1 || brd == any1)
+					ok |= BRD1_OK;
+				if (any == brd1 || any == any1)
+					ok |= BRD0_OK;
+			}
+		}
+	}
+
+	if (!(ok & BRD_OK))
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+	if (subnet && ifa->ifa_prefixlen < 31) {
+		if (!(ok & BRD1_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+		if (!(ok & BRD0_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+	}
+	if (!(ok & LOCAL_OK)) {
+		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+		/* Check that this local address has finally disappeared. */
+		if (gone &&
+		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+			/* And the last, but not the least thing.
+			 * We must flush stray FIB entries.
+			 *
+			 * First of all, we scan fib_info list searching
+			 * for stray nexthop entries, then ignite fib_flush.
+			 */
+			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+				fib_flush(dev_net(dev));
+		}
+	}
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+/*
+ * Look @frn's key up in @tb and write the outcome back into @frn.
+ * frn->err is -ENOENT when @tb is NULL, else fib_table_lookup()'s result.
+ */
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
+{
+	struct fib_result res;
+	struct flowi4 key = {
+		.flowi4_mark = frn->fl_mark,
+		.daddr = frn->fl_addr,
+		.flowi4_tos = frn->fl_tos,
+		.flowi4_scope = frn->fl_scope,
+	};
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+	frn->err = -ENOENT;
+	if (!tb)
+		return;
+	local_bh_disable();
+	frn->tb_id = tb->tb_id;
+	rcu_read_lock();
+	frn->err = fib_table_lookup(tb, &key, &res, FIB_LOOKUP_NOREF);
+	if (frn->err == 0) {
+		frn->prefixlen = res.prefixlen;
+		frn->nh_sel = res.nh_sel;
+		frn->type = res.type;
+		frn->scope = res.scope;
+	}
+	rcu_read_unlock();
+	local_bh_enable();
+}
+
+/*
+ * Input handler of the NETLINK_FIB_LOOKUP kernel socket: run the lookup
+ * requested in the message and unicast the answer, written into a clone
+ * of @skb, back to the sending pid.  Too-short messages are dropped.
+ */
+static void nl_fib_input(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	struct fib_result_nl *frn;
+	struct sk_buff *reply;
+	u32 pid;
+
+	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
+	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
+		return;
+
+	reply = skb_clone(skb, GFP_KERNEL);
+	if (!reply)
+		return;
+
+	frn = (struct fib_result_nl *) NLMSG_DATA(nlmsg_hdr(reply));
+	nl_fib_lookup(frn, fib_get_table(net, frn->tb_id_in));
+
+	pid = NETLINK_CB(reply).pid;      /* pid of sending process */
+	NETLINK_CB(reply).pid = 0;        /* from kernel */
+	NETLINK_CB(reply).dst_group = 0;  /* unicast */
+	netlink_unicast(net->ipv4.fibnl, reply, pid, MSG_DONTWAIT);
+}
+
+/* Create the NETLINK_FIB_LOOKUP kernel socket of @net (-EAFNOSUPPORT on failure). */
+static int __net_init nl_fib_lookup_init(struct net *net)
+{
+	struct sock *nlsk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
+					nl_fib_input, NULL, THIS_MODULE);
+	if (!nlsk)
+		return -EAFNOSUPPORT;
+	net->ipv4.fibnl = nlsk;
+	return 0;
+}
+
+static void nl_fib_lookup_exit(struct net *net)
+{
+	netlink_kernel_release(net->ipv4.fibnl);
+	net->ipv4.fibnl = NULL;
+}
+
+static void fib_disable_ip(struct net_device *dev, int force, int delay)
+{
+	if (fib_sync_down_dev(dev, force))
+		fib_flush(dev_net(dev));
+	rt_cache_flush(dev_net(dev), delay);
+	arp_ifdown(dev);
+}
+
+/*
+ * inetaddr notifier callback; @ptr is the struct in_ifaddr concerned.
+ * NETDEV_UP adds the routes implied by the address, NETDEV_DOWN removes
+ * them; both bump dev_addr_genid.  Always returns NOTIFY_DONE.
+ */
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+
+	if (event == NETDEV_UP) {
+		fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev);
+#endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		rt_cache_flush(net, -1);
+	} else if (event == NETDEV_DOWN) {
+		fib_del_ifaddr(ifa, NULL);
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		/* Last address was deleted from this interface: disable IP. */
+		if (ifa->ifa_dev->ifa_list == NULL)
+			fib_disable_ip(dev, 1, 0);
+		else
+			rt_cache_flush(net, -1);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct net *net = dev_net(dev);
+
+	if (event == NETDEV_UNREGISTER) {	/* handled even without an in_device */
+		fib_disable_ip(dev, 2, -1);
+		return NOTIFY_DONE;
+	}
+
+	if (!in_dev)	/* every other event needs IPv4 state on the device */
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		for_ifa(in_dev) {
+			fib_add_ifaddr(ifa);
+		} endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev);
+#endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		rt_cache_flush(dev_net(dev), -1);
+		break;
+	case NETDEV_DOWN:
+		fib_disable_ip(dev, 0, 0);
+		break;
+	case NETDEV_CHANGEMTU:
+	case NETDEV_CHANGE:
+		rt_cache_flush(dev_net(dev), 0);
+		break;
+	case NETDEV_UNREGISTER_BATCH:
+		/* The batch unregister is only called on the first
+		 * device in the list of devices being unregistered.
+		 * Therefore we should not pass dev_net(dev) in here.
+		 */
+		rt_cache_flush_batch(NULL);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {
+	.notifier_call = fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {
+	.notifier_call = fib_netdev_event,
+};
+
+/*
+ * Allocate the zeroed per-namespace table hash, then run fib4_rules_init().
+ * Returns 0 or a negative errno; the hash is freed again on error.
+ */
+static int __net_init ip_fib_net_init(struct net *net)
+{
+	/* Avoid false sharing : Use at least a full cache line */
+	size_t size = max_t(size_t, L1_CACHE_BYTES,
+			    sizeof(struct hlist_head) * FIB_TABLE_HASHSZ);
+	int err;
+
+	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
+	if (!net->ipv4.fib_table_hash)
+		return -ENOMEM;
+
+	err = fib4_rules_init(net);
+	if (err >= 0)
+		return 0;
+	kfree(net->ipv4.fib_table_hash);
+	return err;
+}
+
+/* Under RTNL, unhash, flush and free every table of @net, then free the hash. */
+static void ip_fib_net_exit(struct net *net)
+{
+	struct hlist_node *pos, *next;
+	struct hlist_head *bucket;
+	struct fib_table *table;
+	unsigned int i;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	fib4_rules_exit(net);
+#endif
+
+	rtnl_lock();
+	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+		bucket = &net->ipv4.fib_table_hash[i];
+		hlist_for_each_entry_safe(table, pos, next, bucket, tb_hlist) {
+			hlist_del(pos);
+			fib_table_flush(table);
+			fib_free_table(table);
+		}
+	}
+	rtnl_unlock();
+	kfree(net->ipv4.fib_table_hash);
+}
+
+/*
+ * pernet init: run ip_fib_net_init(), nl_fib_lookup_init() and
+ * fib_proc_init() in order, unwinding earlier steps if a later one fails.
+ */
+static int __net_init fib_net_init(struct net *net)
+{
+	int error = ip_fib_net_init(net);
+
+	if (error < 0)
+		return error;
+	error = nl_fib_lookup_init(net);
+	if (error < 0)
+		goto out_nlfl;
+	error = fib_proc_init(net);
+	if (error >= 0)
+		return error;
+
+	nl_fib_lookup_exit(net);
+out_nlfl:
+	ip_fib_net_exit(net);
+	return error;
+}
+
+static void __net_exit fib_net_exit(struct net *net)
+{
+	fib_proc_exit(net);
+	nl_fib_lookup_exit(net);
+	ip_fib_net_exit(net);
+}
+
+static struct pernet_operations fib_net_ops = {
+	.init = fib_net_init,
+	.exit = fib_net_exit,
+};
+
+void __init ip_fib_init(void)
+{
+	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
+	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
+	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
+
+	register_pernet_subsys(&fib_net_ops);
+	register_netdevice_notifier(&fib_netdev_notifier);
+	register_inetaddr_notifier(&fib_inetaddr_notifier);
+
+	fib_trie_init();
+}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 00000000..af0f14ab
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,57 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/ip_fib.h>
+
+struct fib_alias {
+	struct list_head	fa_list;
+	struct fib_info		*fa_info;
+	u8			fa_tos;
+	u8			fa_type;
+	u8			fa_state;
+	struct rcu_head		rcu;
+};
+
+#define FA_S_ACCESSED	0x01
+
+/* Don't write to fa_state unless needed, to keep it shared on all CPUs */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+	if (!(fa->fa_state & FA_S_ACCESSED))
+		fa->fa_state |= FA_S_ACCESSED;
+}
+
+/* Exported by fib_semantics.c */
+extern void fib_release_info(struct fib_info *);
+extern struct fib_info *fib_create_info(struct fib_config *cfg);
+extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+			 u32 tb_id, u8 type, __be32 dst,
+			 int dst_len, u8 tos, struct fib_info *fi,
+			 unsigned int);
+extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+		      int dst_len, u32 tb_id, struct nl_info *info,
+		      unsigned int nlm_flags);
+extern struct fib_alias *fib_find_alias(struct list_head *fah,
+					u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+			    struct fib_info **last_resort,
+			    int *last_idx, int dflt);
+
+static inline void fib_result_assign(struct fib_result *res,
+				     struct fib_info *fi)
+{
+	/* we used to play games with refcounts, but we now use RCU */
+	res->fi = fi;
+}
+
+struct fib_prop {
+	int	error;
+	u8	scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 00000000..a53bb1b5
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,307 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: policy rules.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *		Thomas Graf <tgraf@suug.ch>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *		Rani Assaf	:	local_rule cannot be deleted
+ *		Marc Boucher	:	routing by fwmark
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip_fib.h>
+#include <net/fib_rules.h>
+
+struct fib4_rule {
+	struct fib_rule		common;		/* kept first: hooks cast struct fib_rule * to this type */
+	u8			dst_len;	/* destination prefix length */
+	u8			src_len;	/* source prefix length */
+	u8			tos;		/* 0 matches any TOS */
+	__be32			src;
+	__be32			srcmask;	/* inet_make_mask(src_len) */
+	__be32			dst;
+	__be32			dstmask;	/* inet_make_mask(dst_len) */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	u32			tclassid;	/* taken from FRA_FLOW */
+#endif
+};
+/* Return the tclassid of the rule stored in res->r, or 0 when no rule is attached */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+u32 fib_rules_tclass(const struct fib_result *res)
+{
+	return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
+}
+#endif
+/* Run @flp through the netns' IPv4 rules; arg.rule from that lookup is handed back in res->r */
+int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+{
+	struct fib_lookup_arg arg = {
+		.result = res,
+		.flags = FIB_LOOKUP_NOREF,
+	};
+	int err;
+
+	err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
+	res->r = arg.rule;
+
+	return err;
+}
+/* ->action hook: turn the rule's action into an errno, or look the flow up in the rule's table */
+static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	int err = -EAGAIN;	/* also the result when the rule's table does not exist */
+	struct fib_table *tbl;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+
+	case FR_ACT_UNREACHABLE:
+		err = -ENETUNREACH;
+		goto errout;
+
+	case FR_ACT_PROHIBIT:
+		err = -EACCES;
+		goto errout;
+
+	case FR_ACT_BLACKHOLE:
+	default:
+		err = -EINVAL;
+		goto errout;
+	}
+
+	tbl = fib_get_table(rule->fr_net, rule->table);
+	if (!tbl)
+		goto errout;
+
+	err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
+	if (err > 0)	/* positive lookup results are reported as -EAGAIN */
+		err = -EAGAIN;
+errout:
+	return err;
+}
+
+/* ->match hook: 1 when the flow's masked addresses and TOS fit the rule (rule tos 0 = any), else 0 */
+static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	struct fib4_rule *r = (struct fib4_rule *) rule;
+	struct flowi4 *fl4 = &fl->u.ip4;
+	__be32 daddr = fl4->daddr;
+	__be32 saddr = fl4->saddr;
+
+	if (((saddr ^ r->src) & r->srcmask) ||
+	    ((daddr ^ r->dst) & r->dstmask))
+		return 0;
+
+	if (r->tos && (r->tos != fl4->flowi4_tos))
+		return 0;
+
+	return 1;
+}
+/* Return fib_new_table() for the first id in 1..RT_TABLE_MAX without a table; NULL if every id has one */
+static struct fib_table *fib_empty_table(struct net *net)
+{
+	u32 id;
+
+	for (id = 1; id <= RT_TABLE_MAX; id++)
+		if (fib_get_table(net, id) == NULL)
+			return fib_new_table(net, id);
+	return NULL;
+}
+/* Netlink attribute policy: the generic rule attributes plus a 32-bit FRA_FLOW */
+static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
+	FRA_GENERIC_POLICY,
+	[FRA_FLOW]	= { .type = NLA_U32 },
+};
+/* ->configure hook: validate @frh and fill the IPv4 part of @rule; returns 0 or a negative errno */
+static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
+{
+	struct net *net = sock_net(skb->sk);
+	int err = -EINVAL;
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+	if (frh->tos & ~IPTOS_TOS_MASK)	/* bits outside the TOS mask are rejected */
+		goto errout;
+
+	if (rule->table == RT_TABLE_UNSPEC) {	/* no table given */
+		if (rule->action == FR_ACT_TO_TBL) {
+			struct fib_table *table;
+
+			table = fib_empty_table(net);	/* pick an id that has no table yet */
+			if (table == NULL) {
+				err = -ENOBUFS;
+				goto errout;
+			}
+
+			rule->table = table->tb_id;
+		}
+	}
+
+	if (frh->src_len)	/* NOTE(review): tb[FRA_SRC] is not NULL-checked here - confirm the caller validates it */
+		rule4->src = nla_get_be32(tb[FRA_SRC]);
+
+	if (frh->dst_len)	/* NOTE(review): same assumption for tb[FRA_DST] */
+		rule4->dst = nla_get_be32(tb[FRA_DST]);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (tb[FRA_FLOW])
+		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+#endif
+
+	rule4->src_len = frh->src_len;
+	rule4->srcmask = inet_make_mask(rule4->src_len);
+	rule4->dst_len = frh->dst_len;
+	rule4->dstmask = inet_make_mask(rule4->dst_len);
+	rule4->tos = frh->tos;
+
+	err = 0;
+errout:
+	return err;
+}
+/* ->compare hook: 1 if every selector given in @frh/@tb equals the rule's (unset ones are skipped), else 0 */
+static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+	if (frh->src_len && (rule4->src_len != frh->src_len))
+		return 0;
+
+	if (frh->dst_len && (rule4->dst_len != frh->dst_len))
+		return 0;
+
+	if (frh->tos && (rule4->tos != frh->tos))
+		return 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
+		return 0;
+#endif
+
+	if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC])))
+		return 0;
+
+	if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST])))
+		return 0;
+
+	return 1;
+}
+/* ->fill hook: copy the IPv4 selectors into @frh and append FRA_DST/FRA_SRC/FRA_FLOW to @skb */
+static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct fib_rule_hdr *frh)
+{
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+	frh->dst_len = rule4->dst_len;
+	frh->src_len = rule4->src_len;
+	frh->tos = rule4->tos;
+
+	if (rule4->dst_len)	/* address attributes are only emitted for non-zero prefix lengths */
+		NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
+
+	if (rule4->src_len)
+		NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (rule4->tclassid)
+		NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+#endif
+	return 0;
+
+nla_put_failure:	/* no goto is visible here; presumably the target of the NLA_PUT_*() macros */
+	return -ENOBUFS;
+}
+/* Attribute space for the three 4-byte attributes fib4_rule_fill() may add (dst, src, flow) */
+static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+{
+	return nla_total_size(4) /* dst */
+	       + nla_total_size(4) /* src */
+	       + nla_total_size(4); /* flow */
+}
+/* ->flush_cache hook: call rt_cache_flush() for the namespace the ops belong to */
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+{
+	rt_cache_flush(ops->fro_net, -1);
+}
+/* Template passed to fib_rules_register() by fib4_rules_init(); wires up the hooks above */
+static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
+	.family		= AF_INET,
+	.rule_size	= sizeof(struct fib4_rule),
+	.addr_size	= sizeof(u32),
+	.action		= fib4_rule_action,
+	.match		= fib4_rule_match,
+	.configure	= fib4_rule_configure,
+	.compare	= fib4_rule_compare,
+	.fill		= fib4_rule_fill,
+	.default_pref	= fib_default_rule_pref,
+	.nlmsg_payload	= fib4_rule_nlmsg_payload,
+	.flush_cache	= fib4_rule_flush_cache,
+	.nlgroup	= RTNLGRP_IPV4_RULE,
+	.policy		= fib4_rule_policy,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Add the default IPv4 rules for RT_TABLE_LOCAL, RT_TABLE_MAIN and
+ * RT_TABLE_DEFAULT, with second arguments 0, 0x7FFE and 0x7FFF.
+ * Stops at the first failure and returns its negative error, else 0.
+ */
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+	int rc = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+
+	if (rc >= 0)
+		rc = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+	if (rc >= 0)
+		rc = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+	return rc < 0 ? rc : 0;
+}
+/* Register the IPv4 rule ops for @net and install the default rules; returns 0 or a negative errno */
+int __net_init fib4_rules_init(struct net *net)
+{
+	int err;
+	struct fib_rules_ops *ops;
+
+	ops = fib_rules_register(&fib4_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	err = fib_default_rules_init(ops);
+	if (err < 0)
+		goto fail;
+	net->ipv4.rules_ops = ops;	/* published only after the default rules are in place */
+	return 0;
+
+fail:
+	/* also cleans all rules already added */
+	fib_rules_unregister(ops);
+	return err;
+}
+/* Unregister the rule ops that fib4_rules_init() stored in net->ipv4.rules_ops */
+void __net_exit fib4_rules_exit(struct net *net)
+{
+	fib_rules_unregister(net->ipv4.rules_ops);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 00000000..7e454ba8
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1248 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: semantics.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/netlink.h>
+#include <net/nexthop.h>
+
+#include "fib_lookup.h"
+
+static DEFINE_SPINLOCK(fib_info_lock);	/* taken around updates of the hash tables below */
+static struct hlist_head *fib_info_hash;	/* fib_info entries, bucket from fib_info_hashfn() */
+static struct hlist_head *fib_info_laddrhash;	/* fib_info entries with a prefsrc, by fib_laddr_hashfn() */
+static unsigned int fib_info_hash_size;	/* bucket count of both tables above */
+static unsigned int fib_info_cnt;	/* fib_info objects created and not yet freed */
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];	/* nexthops by device ifindex */
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static DEFINE_SPINLOCK(fib_multipath_lock);
+/* for_nexthops(): read-only walk over fi->fib_nh[]; declares nhsel and a const nh */
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh;				\
+	for (nhsel = 0, nh = (fi)->fib_nh;				\
+	     nhsel < (fi)->fib_nhs;					\
+	     nh++, nhsel++)
+/* change_nexthops(): the same walk with a writable nexthop_nh */
+#define change_nexthops(fi) {						\
+	int nhsel; struct fib_nh *nexthop_nh;				\
+	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	     nhsel < (fi)->fib_nhs;					\
+	     nexthop_nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Single nexthop only: hope that gcc will optimize away the dummy loop */
+
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
+	for (nhsel = 0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) {						\
+	int nhsel;							\
+	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	for (nhsel = 0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+/* Closes the block opened by for_nexthops()/change_nexthops() */
+#define endfor_nexthops(fi) }
+
+/* Per route type: a non-zero .error forbids gw/oif/multipath in fib_create_info(); .scope is checked against fc_scope */
+const struct fib_prop fib_props[RTN_MAX + 1] = {
+	[RTN_UNSPEC] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_NOWHERE,
+	},
+	[RTN_UNICAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_LOCAL] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_HOST,
+	},
+	[RTN_BROADCAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_LINK,
+	},
+	[RTN_ANYCAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_LINK,
+	},
+	[RTN_MULTICAST] = {
+		.error	= 0,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_BLACKHOLE] = {
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_UNREACHABLE] = {
+		.error	= -EHOSTUNREACH,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_PROHIBIT] = {
+		.error	= -EACCES,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_THROW] = {
+		.error	= -EAGAIN,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},
+	[RTN_NAT] = {
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_NOWHERE,
+	},
+	[RTN_XRESOLVE] = {
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_NOWHERE,
+	},
+};
+
+/* RCU callback: drop the nexthop device refs, free private metrics, then the fib_info itself */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+	struct fib_info *fi = container_of(head, struct fib_info, rcu);
+	change_nexthops(fi) {
+		if (nexthop_nh->nh_dev)
+			dev_put(nexthop_nh->nh_dev);
+	} endfor_nexthops(fi);
+	release_net(fi->fib_net);
+	if (fi->fib_metrics != (u32 *) dst_default_metrics)
+		kfree(fi->fib_metrics);	/* kzalloc'ed by fib_create_info(); NULL is fine */
+	kfree(fi);
+}
+/* Queue @fi for freeing via call_rcu(); only warns and returns if @fi is not marked dead */
+void free_fib_info(struct fib_info *fi)
+{
+	if (fi->fib_dead == 0) {
+		pr_warning("Freeing alive fib_info %p\n", fi);
+		return;
+	}
+	fib_info_cnt--;
+	call_rcu(&fi->rcu, free_fib_info_rcu);
+}
+/* Drop a tree reference; on the last one unhash @fi, mark it dead and call fib_info_put() */
+void fib_release_info(struct fib_info *fi)
+{
+	spin_lock_bh(&fib_info_lock);
+	if (fi && --fi->fib_treeref == 0) {
+		hlist_del(&fi->fib_hash);
+		if (fi->fib_prefsrc)	/* hashed by address only when a prefsrc is set */
+			hlist_del(&fi->fib_lhash);
+		change_nexthops(fi) {
+			if (!nexthop_nh->nh_dev)	/* device-less nexthops are never put on the dev hash */
+				continue;
+			hlist_del(&nexthop_nh->nh_hash);
+		} endfor_nexthops(fi)
+		fi->fib_dead = 1;
+		fib_info_put(fi);
+	}
+	spin_unlock_bh(&fib_info_lock);
+}
+/* Compare the nexthops of @fi and @ofi pairwise: 0 if all equal (RTNH_F_DEAD ignored), -1 otherwise */
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+	const struct fib_nh *onh = ofi->fib_nh;
+
+	for_nexthops(fi) {
+		if (nh->nh_oif != onh->nh_oif ||
+		    nh->nh_gw  != onh->nh_gw ||
+		    nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		    nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		    nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+			return -1;
+		onh++;	/* walks in step with nh; the caller has checked both have the same fib_nhs */
+	} endfor_nexthops(fi);
+	return 0;
+}
+/* Fold @val (callers pass an interface index) into a DEVINDEX_HASHBITS-wide bucket number */
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+	unsigned int folded = val;
+
+	folded ^= val >> DEVINDEX_HASHBITS;
+	folded ^= val >> (DEVINDEX_HASHBITS * 2);
+	return folded & (DEVINDEX_HASHSIZE - 1);
+}
+/* Bucket of @fi in fib_info_hash[]: mixes nexthop count, protocol, scope, prefsrc, priority and oifs */
+static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+{
+	unsigned int mask = (fib_info_hash_size - 1);
+	unsigned int val = fi->fib_nhs;
+
+	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
+	val ^= (__force u32)fi->fib_prefsrc;
+	val ^= fi->fib_priority;
+	for_nexthops(fi) {
+		val ^= fib_devindex_hashfn(nh->nh_oif);
+	} endfor_nexthops(fi)
+
+	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+/* Find a hashed fib_info equal to @nfi (same netns, fields, metrics and nexthops); NULL if none */
+static struct fib_info *fib_find_info(const struct fib_info *nfi)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_info *fi;
+	unsigned int hash;
+
+	hash = fib_info_hashfn(nfi);
+	head = &fib_info_hash[hash];
+
+	hlist_for_each_entry(fi, node, head, fib_hash) {
+		if (!net_eq(fi->fib_net, nfi->fib_net))
+			continue;
+		if (fi->fib_nhs != nfi->fib_nhs)
+			continue;
+		if (nfi->fib_protocol == fi->fib_protocol &&
+		    nfi->fib_scope == fi->fib_scope &&
+		    nfi->fib_prefsrc == fi->fib_prefsrc &&
+		    nfi->fib_priority == fi->fib_priority &&
+		    memcmp(nfi->fib_metrics, fi->fib_metrics,
+			   sizeof(u32) * RTAX_MAX) == 0 &&
+		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
+		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+			return fi;
+	}
+
+	return NULL;
+}
+
+/* Check that @gw is already configured as a non-dead nexthop on @dev: returns 0
+ * if so, -1 otherwise.  Used only by the redirect accept routine.
+ */
+int ip_fib_check_default(__be32 gw, struct net_device *dev)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_nh *nh;
+	unsigned int hash;
+
+	spin_lock(&fib_info_lock);
+
+	hash = fib_devindex_hashfn(dev->ifindex);
+	head = &fib_info_devhash[hash];
+	hlist_for_each_entry(nh, node, head, nh_hash) {
+		if (nh->nh_dev == dev &&
+		    nh->nh_gw == gw &&
+		    !(nh->nh_flags & RTNH_F_DEAD)) {
+			spin_unlock(&fib_info_lock);
+			return 0;
+		}
+	}
+
+	spin_unlock(&fib_info_lock);
+
+	return -1;
+}
+/* Payload size rtmsg_fib() reserves for the fib_dump_info() message describing @fi */
+static inline size_t fib_nlmsg_size(struct fib_info *fi)
+{
+	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+			 + nla_total_size(4) /* RTA_TABLE */
+			 + nla_total_size(4) /* RTA_DST */
+			 + nla_total_size(4) /* RTA_PRIORITY */
+			 + nla_total_size(4); /* RTA_PREFSRC */
+
+	/* space for nested metrics */
+	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
+
+	if (fi->fib_nhs) {
+		/* Also handles the special case fib_nhs == 1 */
+
+		/* each nexthop is packed in an attribute */
+		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+
+		/* may contain flow and gateway attribute */
+		nhsize += 2 * nla_total_size(4);
+
+		/* all nexthops are packed in a nested attribute */
+		payload += nla_total_size(fi->fib_nhs * nhsize);
+	}
+
+	return payload;
+}
+/* Build a route message for @fa and send it to RTNLGRP_IPV4_ROUTE; errors go to rtnl_set_sk_err() */
+void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+	       int dst_len, u32 tb_id, struct nl_info *info,
+	       unsigned int nlm_flags)
+{
+	struct sk_buff *skb;
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+	int err = -ENOBUFS;	/* reported if the skb allocation below fails */
+
+	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
+	if (skb == NULL)
+		goto errout;
+
+	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
+			    fa->fa_type, key, dst_len,
+			    fa->fa_tos, fa->fa_info, nlm_flags);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
+		    info->nlh, GFP_KERNEL);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+}
+
+/* Return the first alias in @fah whose TOS is not above @tos and that either
+ * has a lower TOS or a fib_priority >= @prio; NULL if @fah is NULL or none fits.
+ */
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+{
+	if (fah) {
+		struct fib_alias *fa;
+		list_for_each_entry(fa, fah, fa_list) {
+			if (fa->fa_tos > tos)
+				continue;
+			if (fa->fa_info->fib_priority >= prio ||
+			    fa->fa_tos < tos)
+				return fa;
+		}
+	}
+	return NULL;
+}
+/* Judge @fi by the ARP neighbour state of its first gateway: returns 0 or 1, may record *last_resort */
+int fib_detect_death(struct fib_info *fi, int order,
+		     struct fib_info **last_resort, int *last_idx, int dflt)
+{
+	struct neighbour *n;
+	int state = NUD_NONE;	/* used when no neighbour entry exists */
+
+	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+	if (n) {
+		state = n->nud_state;
+		neigh_release(n);
+	}
+	if (state == NUD_REACHABLE)
+		return 0;
+	if ((state & NUD_VALID) && order != dflt)
+		return 0;
+	if ((state & NUD_VALID) ||
+	    (*last_idx < 0 && order > dflt)) {
+		*last_resort = fi;	/* remember this entry and its position for the caller */
+		*last_idx = order;
+	}
+	return 1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Count the rtnexthop records found in @remaining bytes at @rtnh; 0 if bytes are left over */
+static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
+{
+	int count;
+
+	for (count = 0; rtnh_ok(rtnh, remaining); count++)
+		rtnh = rtnh_next(rtnh, &remaining);
+	/* leftover implies invalid nexthop configuration, discard it */
+	if (remaining > 0)
+		return 0;
+	return count;
+}
+/* Fill @fi's nexthops from the rtnexthop records at @rtnh; -EINVAL if a record does not fit */
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+		       int remaining, struct fib_config *cfg)
+{
+	change_nexthops(fi) {
+		int attrlen;
+
+		if (!rtnh_ok(rtnh, remaining))
+			return -EINVAL;
+
+		nexthop_nh->nh_flags =
+			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;	/* low byte comes from the record */
+		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
+		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;	/* fib_dump_info() writes back weight - 1 */
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+			nla = nla_find(attrs, attrlen, RTA_FLOW);
+			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+#endif
+		}
+
+		rtnh = rtnh_next(rtnh, &remaining);
+	} endfor_nexthops(fi);
+
+	return 0;
+}
+
+#endif
+/* Does @fi match the selectors in @cfg?  0 = match, 1 = mismatch, -EINVAL = malformed multipath data */
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	struct rtnexthop *rtnh;
+	int remaining;
+#endif
+
+	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
+		return 1;
+
+	if (cfg->fc_oif || cfg->fc_gw) {	/* plain selectors are checked against the first nexthop only */
+		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
+		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
+			return 0;
+		return 1;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (cfg->fc_mp == NULL)
+		return 0;
+
+	rtnh = cfg->fc_mp;
+	remaining = cfg->fc_mp_len;
+
+	for_nexthops(fi) {
+		int attrlen;
+
+		if (!rtnh_ok(rtnh, remaining))
+			return -EINVAL;
+
+		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
+			return 1;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {	/* attributes present (same test as fib_get_nhs()) */
+			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			if (nla && nla_get_be32(nla) != nh->nh_gw)
+				return 1;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+			nla = nla_find(attrs, attrlen, RTA_FLOW);
+			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
+				return 1;
+#endif
+		}
+
+		rtnh = rtnh_next(rtnh, &remaining);
+	} endfor_nexthops(fi);
+#endif
+	return 0;
+}
+
+
+/*
+ * Picture
+ * -------
+ *
+ * The semantics of a nexthop are very messy for historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ *    so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ *    described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ *    contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalize it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. E.g. as a by-product it allows
+ * independent exterior and interior routing processes to
+ * co-exist in peace.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix}  -> (gw, oif) [scope link]
+ *		  |
+ *		  |-> {link prefix} -> (gw, oif) [scope local]
+ *					|
+ *					|-> {local prefix} (terminal node)
+ */
+static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
+			struct fib_nh *nh)
+{
+	int err;
+	struct net *net;
+	struct net_device *dev;
+
+	net = cfg->fc_nlinfo.nl_net;
+	if (nh->nh_gw) {
+		struct fib_result res;
+
+		if (nh->nh_flags & RTNH_F_ONLINK) {	/* gateway declared on-link: no route lookup */
+
+			if (cfg->fc_scope >= RT_SCOPE_LINK)
+				return -EINVAL;
+			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
+				return -EINVAL;
+			dev = __dev_get_by_index(net, nh->nh_oif);
+			if (!dev)
+				return -ENODEV;
+			if (!(dev->flags & IFF_UP))
+				return -ENETDOWN;
+			nh->nh_dev = dev;
+			dev_hold(dev);
+			nh->nh_scope = RT_SCOPE_LINK;
+			return 0;
+		}
+		rcu_read_lock();	/* released at out:, or right below on lookup failure */
+		{
+			struct flowi4 fl4 = {
+				.daddr = nh->nh_gw,
+				.flowi4_scope = cfg->fc_scope + 1,	/* route to the gateway must have a narrower scope */
+				.flowi4_oif = nh->nh_oif,
+			};
+
+			/* It is not necessary, but requires a bit of thinking */
+			if (fl4.flowi4_scope < RT_SCOPE_LINK)
+				fl4.flowi4_scope = RT_SCOPE_LINK;
+			err = fib_lookup(net, &fl4, &res);
+			if (err) {
+				rcu_read_unlock();
+				return err;
+			}
+		}
+		err = -EINVAL;
+		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+			goto out;
+		nh->nh_scope = res.scope;
+		nh->nh_oif = FIB_RES_OIF(res);
+		nh->nh_dev = dev = FIB_RES_DEV(res);
+		if (!dev)
+			goto out;
+		dev_hold(dev);	/* taken even if the device turns out to be down */
+		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
+	} else {	/* no gateway: the nexthop is the device nh_oif itself */
+		struct in_device *in_dev;
+
+		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
+			return -EINVAL;
+
+		rcu_read_lock();
+		err = -ENODEV;
+		in_dev = inetdev_by_index(net, nh->nh_oif);
+		if (in_dev == NULL)
+			goto out;
+		err = -ENETDOWN;
+		if (!(in_dev->dev->flags & IFF_UP))
+			goto out;
+		nh->nh_dev = in_dev->dev;
+		dev_hold(nh->nh_dev);
+		nh->nh_scope = RT_SCOPE_HOST;
+		err = 0;
+	}
+out:
+	rcu_read_unlock();
+	return err;
+}
+/* Bucket number of address @val in a table of fib_info_hash_size entries (fib_info_laddrhash[]) */
+static inline unsigned int fib_laddr_hashfn(__be32 val)
+{
+	unsigned int mask = fib_info_hash_size - 1;
+	u32 addr = (__force u32)val;
+
+	addr ^= (addr >> 7) ^ (addr >> 14);
+	return addr & mask;
+}
+/* Allocate a zeroed hash table of @bytes: kzalloc() up to PAGE_SIZE, whole pages beyond that */
+static struct hlist_head *fib_info_hash_alloc(int bytes)
+{
+	unsigned long pages;
+
+	if (bytes <= PAGE_SIZE)
+		return kzalloc(bytes, GFP_KERNEL);
+	pages = __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+	return (struct hlist_head *)pages;
+}
+/* Free a table of @bytes obtained from fib_info_hash_alloc(); a NULL @hash is ignored */
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
+{
+	if (hash == NULL)
+		return;
+
+	if (bytes > PAGE_SIZE)
+		free_pages((unsigned long) hash, get_order(bytes));
+	else
+		kfree(hash);
+}
+/* Rehash every fib_info into the new tables under fib_info_lock, then free the old tables */
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+			       struct hlist_head *new_laddrhash,
+			       unsigned int new_size)
+{
+	struct hlist_head *old_info_hash, *old_laddrhash;
+	unsigned int old_size = fib_info_hash_size;
+	unsigned int i, bytes;
+
+	spin_lock_bh(&fib_info_lock);
+	old_info_hash = fib_info_hash;
+	old_laddrhash = fib_info_laddrhash;
+	fib_info_hash_size = new_size;	/* the hash functions below must already use the new mask */
+
+	for (i = 0; i < old_size; i++) {
+		struct hlist_head *head = &fib_info_hash[i];
+		struct hlist_node *node, *n;
+		struct fib_info *fi;
+
+		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+			struct hlist_head *dest;
+			unsigned int new_hash;
+
+			hlist_del(&fi->fib_hash);
+
+			new_hash = fib_info_hashfn(fi);
+			dest = &new_info_hash[new_hash];
+			hlist_add_head(&fi->fib_hash, dest);
+		}
+	}
+	fib_info_hash = new_info_hash;
+
+	for (i = 0; i < old_size; i++) {
+		struct hlist_head *lhead = &fib_info_laddrhash[i];
+		struct hlist_node *node, *n;
+		struct fib_info *fi;
+
+		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+			struct hlist_head *ldest;
+			unsigned int new_hash;
+
+			hlist_del(&fi->fib_lhash);
+
+			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
+			ldest = &new_laddrhash[new_hash];
+			hlist_add_head(&fi->fib_lhash, ldest);
+		}
+	}
+	fib_info_laddrhash = new_laddrhash;
+
+	spin_unlock_bh(&fib_info_lock);
+
+	bytes = old_size * sizeof(struct hlist_head *);	/* same size formula as the allocation in fib_create_info() */
+	fib_info_hash_free(old_info_hash, bytes);
+	fib_info_hash_free(old_laddrhash, bytes);
+}
+/* Recompute nh->nh_saddr via inet_select_addr(), record the netns' dev_addr_genid, return the address */
+__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
+{
+	nh->nh_saddr = inet_select_addr(nh->nh_dev,
+					nh->nh_gw,
+					nh->nh_parent->fib_scope);
+	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+
+	return nh->nh_saddr;
+}
+/* Build the fib_info described by @cfg, or reuse an identical hashed one; returns it or ERR_PTR() */
+struct fib_info *fib_create_info(struct fib_config *cfg)
+{
+	int err;
+	struct fib_info *fi = NULL;
+	struct fib_info *ofi;
+	int nhs = 1;
+	struct net *net = cfg->fc_nlinfo.nl_net;
+
+	if (cfg->fc_type > RTN_MAX)
+		goto err_inval;
+
+	/* Fast check to catch the most weird cases */
+	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
+		goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (cfg->fc_mp) {
+		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
+		if (nhs == 0)
+			goto err_inval;
+	}
+#endif
+
+	err = -ENOBUFS;
+	if (fib_info_cnt >= fib_info_hash_size) {	/* try to double both hash tables first */
+		unsigned int new_size = fib_info_hash_size << 1;
+		struct hlist_head *new_info_hash;
+		struct hlist_head *new_laddrhash;
+		unsigned int bytes;
+
+		if (!new_size)
+			new_size = 1;
+		bytes = new_size * sizeof(struct hlist_head *);
+		new_info_hash = fib_info_hash_alloc(bytes);
+		new_laddrhash = fib_info_hash_alloc(bytes);
+		if (!new_info_hash || !new_laddrhash) {
+			fib_info_hash_free(new_info_hash, bytes);
+			fib_info_hash_free(new_laddrhash, bytes);
+		} else
+			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
+
+		if (!fib_info_hash_size)	/* a failed resize is only fatal when no table exists yet */
+			goto failure;
+	}
+
+	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+	if (fi == NULL)
+		goto failure;
+	fib_info_cnt++;	/* before any later failure: the failure path's free_fib_info() decrements it */
+	if (cfg->fc_mx) {
+		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+		if (!fi->fib_metrics)
+			goto failure;
+	} else
+		fi->fib_metrics = (u32 *) dst_default_metrics;
+
+	fi->fib_net = hold_net(net);
+	fi->fib_protocol = cfg->fc_protocol;
+	fi->fib_scope = cfg->fc_scope;
+	fi->fib_flags = cfg->fc_flags;
+	fi->fib_priority = cfg->fc_priority;
+	fi->fib_prefsrc = cfg->fc_prefsrc;
+
+	fi->fib_nhs = nhs;
+	change_nexthops(fi) {
+		nexthop_nh->nh_parent = fi;
+	} endfor_nexthops(fi)
+
+	if (cfg->fc_mx) {	/* copy the supplied metrics; attribute type N lands in slot N - 1 */
+		struct nlattr *nla;
+		int remaining;
+
+		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+			int type = nla_type(nla);
+
+			if (type) {
+				if (type > RTAX_MAX)
+					goto err_inval;
+				fi->fib_metrics[type - 1] = nla_get_u32(nla);
+			}
+		}
+	}
+
+	if (cfg->fc_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
+		if (err != 0)
+			goto failure;
+		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
+			goto err_inval;
+		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
+			goto err_inval;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
+			goto err_inval;
+#endif
+#else
+		goto err_inval;
+#endif
+	} else {
+		struct fib_nh *nh = fi->fib_nh;
+
+		nh->nh_oif = cfg->fc_oif;
+		nh->nh_gw = cfg->fc_gw;
+		nh->nh_flags = cfg->fc_flags;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		nh->nh_tclassid = cfg->fc_flow;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		nh->nh_weight = 1;
+#endif
+	}
+
+	if (fib_props[cfg->fc_type].error) {	/* error-type routes carry no nexthop data */
+		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
+			goto err_inval;
+		goto link_it;
+	} else {
+		switch (cfg->fc_type) {
+		case RTN_UNICAST:
+		case RTN_LOCAL:
+		case RTN_BROADCAST:
+		case RTN_ANYCAST:
+		case RTN_MULTICAST:
+			break;
+		default:
+			goto err_inval;
+		}
+	}
+
+	if (cfg->fc_scope > RT_SCOPE_HOST)
+		goto err_inval;
+
+	if (cfg->fc_scope == RT_SCOPE_HOST) {
+		struct fib_nh *nh = fi->fib_nh;
+
+		/* Local address is added. */
+		if (nhs != 1 || nh->nh_gw)
+			goto err_inval;
+		nh->nh_scope = RT_SCOPE_NOWHERE;
+		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
+		err = -ENODEV;
+		if (nh->nh_dev == NULL)
+			goto failure;
+	} else {
+		change_nexthops(fi) {
+			err = fib_check_nh(cfg, fi, nexthop_nh);
+			if (err != 0)
+				goto failure;
+		} endfor_nexthops(fi)
+	}
+
+	if (fi->fib_prefsrc) {	/* prefsrc must be a local address, unless it is the local route's own dst */
+		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+		    fi->fib_prefsrc != cfg->fc_dst)
+			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
+				goto err_inval;
+	}
+
+	change_nexthops(fi) {
+		fib_info_update_nh_saddr(net, nexthop_nh);
+	} endfor_nexthops(fi)
+
+link_it:
+	ofi = fib_find_info(fi);
+	if (ofi) {	/* an identical fib_info is already hashed: drop ours and share it */
+		fi->fib_dead = 1;
+		free_fib_info(fi);
+		ofi->fib_treeref++;
+		return ofi;
+	}
+
+	fi->fib_treeref++;
+	atomic_inc(&fi->fib_clntref);
+	spin_lock_bh(&fib_info_lock);
+	hlist_add_head(&fi->fib_hash,
+		       &fib_info_hash[fib_info_hashfn(fi)]);
+	if (fi->fib_prefsrc) {
+		struct hlist_head *head;
+
+		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+		hlist_add_head(&fi->fib_lhash, head);
+	}
+	change_nexthops(fi) {
+		struct hlist_head *head;
+		unsigned int hash;
+
+		if (!nexthop_nh->nh_dev)
+			continue;
+		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
+		head = &fib_info_devhash[hash];
+		hlist_add_head(&nexthop_nh->nh_hash, head);
+	} endfor_nexthops(fi)
+	spin_unlock_bh(&fib_info_lock);
+	return fi;
+
+err_inval:
+	err = -EINVAL;
+
+failure:
+	if (fi) {	/* free_fib_info() only accepts entries marked dead */
+		fi->fib_dead = 1;
+		free_fib_info(fi);
+	}
+
+	return ERR_PTR(err);
+}
+/* Append one route message describing @fi to @skb; returns nlmsg_end()'s value or -EMSGSIZE */
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
+		  struct fib_info *fi, unsigned int flags)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family = AF_INET;
+	rtm->rtm_dst_len = dst_len;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = tos;
+	if (tb_id < 256)	/* ids below 256 are stored directly in rtm_table */
+		rtm->rtm_table = tb_id;
+	else
+		rtm->rtm_table = RT_TABLE_COMPAT;
+	NLA_PUT_U32(skb, RTA_TABLE, tb_id);	/* the full id is always sent as an attribute */
+	rtm->rtm_type = type;
+	rtm->rtm_flags = fi->fib_flags;
+	rtm->rtm_scope = fi->fib_scope;
+	rtm->rtm_protocol = fi->fib_protocol;
+
+	if (rtm->rtm_dst_len)
+		NLA_PUT_BE32(skb, RTA_DST, dst);
+
+	if (fi->fib_priority)
+		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
+
+	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+		goto nla_put_failure;
+
+	if (fi->fib_prefsrc)
+		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
+
+	if (fi->fib_nhs == 1) {	/* single nexthop: its fields become top-level attributes */
+		if (fi->fib_nh->nh_gw)
+			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
+
+		if (fi->fib_nh->nh_oif)
+			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		if (fi->fib_nh[0].nh_tclassid)
+			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
+#endif
+	}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (fi->fib_nhs > 1) {	/* several nexthops: one rtnexthop record each, nested in RTA_MULTIPATH */
+		struct rtnexthop *rtnh;
+		struct nlattr *mp;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (mp == NULL)
+			goto nla_put_failure;
+
+		for_nexthops(fi) {
+			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+			if (rtnh == NULL)
+				goto nla_put_failure;
+
+			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+			rtnh->rtnh_hops = nh->nh_weight - 1;	/* inverse of the + 1 in fib_get_nhs() */
+			rtnh->rtnh_ifindex = nh->nh_oif;
+
+			if (nh->nh_gw)
+				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+			if (nh->nh_tclassid)
+				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
+#endif
+			/* length of rtnetlink header + attributes */
+			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
+		} endfor_nexthops(fi);
+
+		nla_nest_end(skb, mp);
+	}
+#endif
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);	/* discard the partly built message */
+	return -EMSGSIZE;
+}
+
+/*
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ *   referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
+ *
+ * fib_sync_down_addr() handles the first case: every fib_info of 'net'
+ * whose preferred source address equals 'local' gets RTNH_F_DEAD set.
+ *
+ * Returns the number of fib_infos flagged, or 0 when the local-address
+ * hash table does not exist or 'local' is 0.
+ */
+int fib_sync_down_addr(struct net *net, __be32 local)
+{
+	int ret = 0;
+	unsigned int hash;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_info *fi;
+
+	if (fib_info_laddrhash == NULL || local == 0)
+		return 0;
+
+	/* Form the bucket address only once the table is known to exist. */
+	hash = fib_laddr_hashfn(local);
+	head = &fib_info_laddrhash[hash];
+
+	hlist_for_each_entry(fi, node, head, fib_lhash) {
+		/* The hash table is shared; skip other namespaces' entries */
+		if (!net_eq(fi->fib_net, net))
+			continue;
+		if (fi->fib_prefsrc == local) {
+			fi->fib_flags |= RTNH_F_DEAD;
+			ret++;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Mark the nexthops that go through 'dev' as dead.
+ *
+ * Walks the fib_info_devhash bucket selected by dev->ifindex. For each
+ * fib_info owning a nexthop on 'dev', every live nexthop on 'dev' whose
+ * scope differs from 'scope' is flagged RTNH_F_DEAD. 'scope' is
+ * RT_SCOPE_NOWHERE by default and -1 when 'force' is non-zero. When all
+ * nexthops of a fib_info are dead, the fib_info itself is flagged too.
+ *
+ * With CONFIG_IP_ROUTE_MULTIPATH and force > 1, one nexthop on 'dev' is
+ * enough to declare the whole fib_info dead.
+ *
+ * Returns the number of fib_infos flagged RTNH_F_DEAD.
+ */
+int fib_sync_down_dev(struct net_device *dev, int force)
+{
+	int ret = 0;
+	int scope = RT_SCOPE_NOWHERE;
+	struct fib_info *prev_fi = NULL;
+	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+	struct hlist_head *head = &fib_info_devhash[hash];
+	struct hlist_node *node;
+	struct fib_nh *nh;
+
+	if (force)
+		scope = -1;
+
+	hlist_for_each_entry(nh, node, head, nh_hash) {
+		struct fib_info *fi = nh->nh_parent;
+		int dead;
+
+		BUG_ON(!fi->fib_nhs);
+		/* prev_fi skips a fib_info seen on the previous iteration */
+		if (nh->nh_dev != dev || fi == prev_fi)
+			continue;
+		prev_fi = fi;
+		dead = 0;
+		change_nexthops(fi) {
+			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+				dead++;
+			else if (nexthop_nh->nh_dev == dev &&
+				 nexthop_nh->nh_scope != scope) {
+				nexthop_nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+				/* take the dead nexthop's share out of the total */
+				spin_lock_bh(&fib_multipath_lock);
+				fi->fib_power -= nexthop_nh->nh_power;
+				nexthop_nh->nh_power = 0;
+				spin_unlock_bh(&fib_multipath_lock);
+#endif
+				dead++;
+			}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+			if (force > 1 && nexthop_nh->nh_dev == dev) {
+				dead = fi->fib_nhs;
+				break;
+			}
+#endif
+		} endfor_nexthops(fi)
+		if (dead == fi->fib_nhs) {
+			fi->fib_flags |= RTNH_F_DEAD;
+			ret++;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Pick which of several candidate routes in res->fa_head to use and record
+ * the choice in 'res' and in res->table->tb_default.
+ *
+ * Candidates are the aliases with the same scope as 'res', type
+ * RTN_UNICAST, a priority not above that of res->fi, and a first nexthop
+ * that has a gateway with RT_SCOPE_LINK scope. Liveness of each candidate
+ * is judged by fib_detect_death().
+ *
+ * NOTE(review): fib_detect_death() is not visible here; it presumably
+ * returns 0 for a usable route and updates last_resort/last_idx otherwise.
+ *
+ * Must be invoked inside of an RCU protected region.
+ */
+void fib_select_default(struct fib_result *res)
+{
+	struct fib_info *fi = NULL, *last_resort = NULL;
+	struct list_head *fa_head = res->fa_head;
+	struct fib_table *tb = res->table;
+	int order = -1, last_idx = -1;
+	struct fib_alias *fa;
+
+	list_for_each_entry_rcu(fa, fa_head, fa_list) {
+		struct fib_info *next_fi = fa->fa_info;
+
+		if (next_fi->fib_scope != res->scope ||
+		    fa->fa_type != RTN_UNICAST)
+			continue;
+
+		if (next_fi->fib_priority > res->fi->fib_priority)
+			break;
+		if (!next_fi->fib_nh[0].nh_gw ||
+		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+			continue;
+
+		fib_alias_accessed(fa);
+
+		if (fi == NULL) {
+			/* the first candidate must be the route already in res */
+			if (next_fi != res->fi)
+				break;
+		} else if (!fib_detect_death(fi, order, &last_resort,
+					     &last_idx, tb->tb_default)) {
+			fib_result_assign(res, fi);
+			tb->tb_default = order;
+			goto out;
+		}
+		/* 'fi' trails one step behind: it is tested on the next pass */
+		fi = next_fi;
+		order++;
+	}
+
+	if (order <= 0 || fi == NULL) {
+		tb->tb_default = -1;
+		goto out;
+	}
+
+	/* test the last candidate, which the loop left unexamined */
+	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+				tb->tb_default)) {
+		fib_result_assign(res, fi);
+		tb->tb_default = order;
+		goto out;
+	}
+
+	if (last_idx >= 0)
+		fib_result_assign(res, last_resort);
+	tb->tb_default = last_idx;
+out:
+	return;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+ * Dead device goes up. We wake up dead nexthops.
+ * It makes sense only for multipath routes.
+ *
+ * Walks the fib_info_devhash bucket selected by dev->ifindex and clears
+ * RTNH_F_DEAD on the dead nexthops that use 'dev'. A fib_info with at
+ * least one live nexthop afterwards has its own RTNH_F_DEAD flag cleared.
+ *
+ * Returns the number of such fib_infos; 0 when 'dev' is not IFF_UP.
+ */
+int fib_sync_up(struct net_device *dev)
+{
+	struct fib_info *prev_fi;
+	unsigned int hash;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_nh *nh;
+	int ret;
+
+	if (!(dev->flags & IFF_UP))
+		return 0;
+
+	prev_fi = NULL;
+	hash = fib_devindex_hashfn(dev->ifindex);
+	head = &fib_info_devhash[hash];
+	ret = 0;
+
+	hlist_for_each_entry(nh, node, head, nh_hash) {
+		struct fib_info *fi = nh->nh_parent;
+		int alive;
+
+		BUG_ON(!fi->fib_nhs);
+		/* prev_fi skips a fib_info seen on the previous iteration */
+		if (nh->nh_dev != dev || fi == prev_fi)
+			continue;
+
+		prev_fi = fi;
+		alive = 0;
+		change_nexthops(fi) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+				alive++;
+				continue;
+			}
+			if (nexthop_nh->nh_dev == NULL ||
+			    !(nexthop_nh->nh_dev->flags & IFF_UP))
+				continue;
+			/* revive only nexthops on 'dev', and only if it has an in_device */
+			if (nexthop_nh->nh_dev != dev ||
+			    !__in_dev_get_rtnl(dev))
+				continue;
+			alive++;
+			spin_lock_bh(&fib_multipath_lock);
+			nexthop_nh->nh_power = 0;
+			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
+			spin_unlock_bh(&fib_multipath_lock);
+		} endfor_nexthops(fi)
+
+		if (alive > 0) {
+			fi->fib_flags &= ~RTNH_F_DEAD;
+			ret++;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Choose one nexthop of the multipath route res->fi and store its index
+ * in res->nh_sel.
+ *
+ * Each live nexthop holds a budget (nh_power) refilled from its weight
+ * whenever the route total (fib_power) drops to zero or below. A
+ * selection takes one unit from the chosen nexthop and from the total.
+ * All budget accounting is done under fib_multipath_lock.
+ *
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
+ */
+void fib_select_multipath(struct fib_result *res)
+{
+	struct fib_info *fi = res->fi;
+	int w;
+
+	spin_lock_bh(&fib_multipath_lock);
+	if (fi->fib_power <= 0) {
+		/* budget exhausted: refill every live nexthop from its weight */
+		int power = 0;
+		change_nexthops(fi) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
+				power += nexthop_nh->nh_weight;
+				nexthop_nh->nh_power = nexthop_nh->nh_weight;
+			}
+		} endfor_nexthops(fi);
+		fi->fib_power = power;
+		if (power <= 0) {
+			spin_unlock_bh(&fib_multipath_lock);
+			/* Race condition: route has just become dead. */
+			res->nh_sel = 0;
+			return;
+		}
+	}
+
+
+	/* w should be random number [0..fi->fib_power-1],
+	 * it is pretty bad approximation.
+	 */
+
+	w = jiffies % fi->fib_power;
+
+	/* pick the first live nexthop whose cumulative budget reaches w */
+	change_nexthops(fi) {
+		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
+		    nexthop_nh->nh_power) {
+			w -= nexthop_nh->nh_power;
+			if (w <= 0) {
+				nexthop_nh->nh_power--;
+				fi->fib_power--;
+				res->nh_sel = nhsel;
+				spin_unlock_bh(&fib_multipath_lock);
+				return;
+			}
+		}
+	} endfor_nexthops(fi);
+
+	/* Race condition: route has just become dead. */
+	res->nh_sel = 0;
+	spin_unlock_bh(&fib_multipath_lock);
+}
+#endif
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 00000000..0d884eb2
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2636 @@
+/*
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ *
+ *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
+ *     & Swedish University of Agricultural Sciences.
+ *
+ *   Jens Laas <jens.laas@data.slu.se> Swedish University of
+ *     Agricultural Sciences.
+ *
+ *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
+ *
+ * This work is based on the LPC-trie which is originally described in:
+ *
+ * An experimental study of compression methods for dynamic tries
+ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
+ *
+ *
+ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ *
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ *
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 FIB: lookup engine and maintenance routines.
+ *
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work come from:
+ *
+ *		David S. Miller, <davem@davemloft.net>
+ *		Stephen Hemminger <shemminger@osdl.org>
+ *		Paul E. McKenney <paulmck@us.ibm.com>
+ *		Patrick McHardy <kaber@trash.net>
+ */
+
+#define VERSION "0.409"
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include "fib_lookup.h"
+
+/* Number of slots in trie_stat.nodesizes[] */
+#define MAX_STAT_DEPTH 32
+
+/* Width of a key in bits */
+#define KEYLENGTH (8*sizeof(t_key))
+
+typedef unsigned int t_key;
+
+/*
+ * The node type is kept in the lowest bit of a node's ->parent field;
+ * the remaining bits hold the parent pointer (see node_parent()).
+ */
+#define T_TNODE 0
+#define T_LEAF  1
+#define NODE_TYPE_MASK	0x1UL
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+/* The argument is parenthesized so any pointer expression may be passed */
+#define IS_TNODE(n) (!((n)->parent & T_LEAF))
+#define IS_LEAF(n) ((n)->parent & T_LEAF)
+
+/*
+ * Generic view of a trie node. struct leaf and struct tnode both start
+ * with these two fields, so either can be accessed through this type.
+ */
+struct rt_trie_node {
+	unsigned long parent;	/* parent pointer, node type in the low bit */
+	t_key key;
+};
+
+/* Leaf node: carries a full key and the list of leaf_info entries for it. */
+struct leaf {
+	unsigned long parent;	/* parent pointer, T_LEAF set in the low bit */
+	t_key key;
+	struct hlist_head list;	/* struct leaf_info entries, linked via ->hlist */
+	struct rcu_head rcu;	/* used by free_leaf() for the deferred free */
+};
+
+/* Per-prefix-length entry hanging off a leaf. */
+struct leaf_info {
+	struct hlist_node hlist;	/* link in leaf->list */
+	struct rcu_head rcu;		/* used by free_leaf_info() */
+	int plen;			/* prefix length this entry stands for */
+	struct list_head falh;		/* list head handed out by get_fa_head() */
+};
+
+/*
+ * Internal trie node. It indexes 'bits' key bits starting at bit 'pos'
+ * and has 1 << bits child slots allocated right behind the structure.
+ */
+struct tnode {
+	unsigned long parent;
+	t_key key;
+	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
+	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
+	unsigned int full_children;	/* KEYLENGTH bits needed */
+	unsigned int empty_children;	/* KEYLENGTH bits needed */
+	/* the three members are used at different stages of freeing a tnode */
+	union {
+		struct rcu_head rcu;		/* call_rcu() in tnode_free() */
+		struct work_struct work;	/* deferred vfree() of large nodes */
+		struct tnode *tnode_free;	/* link in the tnode_free_head list */
+	};
+	struct rt_trie_node __rcu *child[0];
+};
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+/* Usage counters kept per trie, only with CONFIG_IP_FIB_TRIE_STATS. */
+struct trie_use_stats {
+	unsigned int gets;
+	unsigned int backtrack;
+	unsigned int semantic_match_passed;
+	unsigned int semantic_match_miss;
+	unsigned int null_node_hit;
+	unsigned int resize_node_skipped;	/* failed inflate()/halve() in resize() */
+};
+#endif
+
+/*
+ * Shape statistics of a trie.
+ * NOTE(review): the code that fills these in is outside this chunk; the
+ * meaning of the fields is inferred from their names only.
+ */
+struct trie_stat {
+	unsigned int totdepth;
+	unsigned int maxdepth;
+	unsigned int tnodes;
+	unsigned int leaves;
+	unsigned int nullpointers;
+	unsigned int prefixes;
+	unsigned int nodesizes[MAX_STAT_DEPTH];
+};
+
+/* One trie: the RCU-protected root pointer plus optional usage counters. */
+struct trie {
+	struct rt_trie_node __rcu *trie;	/* root node; NULL checks are done by readers */
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie_use_stats stats;
+#endif
+};
+
+/* Forward declarations of helpers that are used before their definition */
+static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+				  int wasfull);
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
+/* tnodes to free after resize(); protected by RTNL */
+static struct tnode *tnode_free_head;
+static size_t tnode_free_size;	/* bytes queued by tnode_free_safe() */
+
+/*
+ * synchronize_rcu after call_rcu for that many pages; it should be especially
+ * useful before resizing the root node with PREEMPT_NONE configs; the value was
+ * obtained experimentally, aiming to avoid visible slowdown.
+ */
+static const int sync_pages = 128;
+
+/* slab caches for struct fib_alias and struct leaf objects */
+static struct kmem_cache *fn_alias_kmem __read_mostly;
+static struct kmem_cache *trie_leaf_kmem __read_mostly;
+
+/*
+ * Return the parent tnode of 'node': the ->parent word with the node
+ * type bit masked off.
+ *
+ * caller must hold RTNL
+ */
+static inline struct tnode *node_parent(const struct rt_trie_node *node)
+{
+	unsigned long parent;
+
+	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
+
+	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
+}
+
+/*
+ * Same as node_parent(), for use on the read side as well.
+ *
+ * caller must hold RCU read lock or RTNL
+ */
+static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
+{
+	unsigned long parent;
+
+	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
+							   lockdep_rtnl_is_held());
+
+	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
+}
+
+/* Same as rcu_assign_pointer
+ * but that macro() assumes that value is a pointer.
+ *
+ * Stores 'ptr' as the parent of 'node' and keeps the node type bit that
+ * is already in node->parent. The write barrier is issued before the
+ * store, like rcu_assign_pointer() does.
+ */
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+{
+	smp_wmb();
+	node->parent = (unsigned long)ptr | NODE_TYPE(node);
+}
+
+/*
+ * Return child number 'i' of 'tn'. An index outside the child array
+ * triggers BUG().
+ *
+ * caller must hold RTNL
+ */
+static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
+{
+	BUG_ON(i >= 1U << tn->bits);
+
+	return rtnl_dereference(tn->child[i]);
+}
+
+/*
+ * Same as tnode_get_child(), for use on the read side as well.
+ *
+ * caller must hold RCU read lock or RTNL
+ */
+static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
+{
+	BUG_ON(i >= 1U << tn->bits);
+
+	return rcu_dereference_rtnl(tn->child[i]);
+}
+
+/* Number of child slots of 'tn', i.e. 2 to the power of tn->bits. */
+static inline int tnode_child_length(const struct tnode *tn)
+{
+	const int slots = 1 << tn->bits;
+
+	return slots;
+}
+
+/* Keep the 'l' most significant bits of 'k' and clear all lower bits. */
+static inline t_key mask_pfx(t_key k, unsigned int l)
+{
+	if (l == 0)
+		return 0;
+
+	return k >> (KEYLENGTH-l) << (KEYLENGTH-l);
+}
+
+/*
+ * Return 'bits' bits of key 'a', starting 'offset' bits from the most
+ * significant end, as a right-aligned value.
+ *
+ * Returns 0 when 'offset' lies outside the key or when no bits are
+ * requested; the latter would otherwise shift by the full width of
+ * t_key, which C leaves undefined.
+ */
+static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
+{
+	if (offset < KEYLENGTH && bits != 0)
+		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
+	else
+		return 0;
+}
+
+/* Return 1 when the two keys are identical, 0 otherwise. */
+static inline int tkey_equals(t_key a, t_key b)
+{
+	return !(a ^ b);
+}
+
+/*
+ * Compare 'bits' bits of keys 'a' and 'b', starting 'offset' bits from
+ * the most significant end. Returns 1 when they match, 0 otherwise.
+ * An empty range or an offset at or beyond KEYLENGTH counts as a match.
+ */
+static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
+{
+	if (bits == 0 || offset >= KEYLENGTH)
+		return 1;
+	/* clamp so that the right shift below stays within the key width */
+	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+}
+
+/*
+ * Return the position, counted from the most significant bit, of the
+ * first bit at or after 'offset' in which 'a' and 'b' differ.
+ * Returns 0 when the keys are identical.
+ */
+static inline int tkey_mismatch(t_key a, int offset, t_key b)
+{
+	int pos;
+
+	if (a == b)
+		return 0;
+
+	/* advance until the bit moved to the top of the word is set */
+	for (pos = offset; !(((t_key)(a ^ b) << pos) >> (KEYLENGTH-1)); pos++)
+		;
+
+	return pos;
+}
+
+/*
+  To understand this stuff, an understanding of keys and all their bits is
+  necessary. Every node in the trie has a key associated with it, but not
+  all of the bits in that key are significant.
+
+  Consider a node 'n' and its parent 'tp'.
+
+  If n is a leaf, every bit in its key is significant. Its presence is
+  necessitated by path compression, since during a tree traversal (when
+  searching for a leaf - unless we are doing an insertion) we will completely
+  ignore all skipped bits we encounter. Thus we need to verify, at the end of
+  a potentially successful search, that we have indeed been walking the
+  correct key path.
+
+  Note that we can never "miss" the correct key in the tree if present by
+  following the wrong path. Path compression ensures that segments of the key
+  that are the same for all keys with a given prefix are skipped, but the
+  skipped part *is* identical for each node in the subtrie below the skipped
+  bit! trie_insert() in this implementation takes care of that - note the
+  call to tkey_sub_equals() in trie_insert().
+
+  if n is an internal node - a 'tnode' here, the various parts of its key
+  have many different meanings.
+
+  Example:
+  _________________________________________________________________
+  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+  -----------------------------------------------------------------
+    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
+
+  _________________________________________________________________
+  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+  -----------------------------------------------------------------
+   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
+
+  tp->pos = 7
+  tp->bits = 3
+  n->pos = 15
+  n->bits = 4
+
+  First, let's just ignore the bits that come before the parent tp, that is
+  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
+  not use them for anything.
+
+  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+  index into the parent's child array. That is, they will be used to find
+  'n' among tp's children.
+
+  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
+  for the node n.
+
+  All the bits we have seen so far are significant to the node n. The rest
+  of the bits are really not needed or indeed known in n->key.
+
+  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+  n's child array, and will of course be different for each child.
+
+
+  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
+  at this point.
+
+*/
+
+/* Warn when a tnode indexes bits beyond position 32 of the key. */
+static inline void check_tnode(const struct tnode *tn)
+{
+	if (!tn)
+		return;
+
+	WARN_ON(tn->pos + tn->bits > 32);
+}
+
+/*
+ * Fill thresholds used by resize(), expressed as percentages. The _root
+ * variants apply to a node without a parent.
+ */
+static const int halve_threshold = 25;
+static const int inflate_threshold = 50;
+static const int halve_threshold_root = 15;
+static const int inflate_threshold_root = 30;
+
+/* RCU callback: give the fib_alias embedding 'head' back to its slab cache. */
+static void __alias_free_mem(struct rcu_head *head)
+{
+	kmem_cache_free(fn_alias_kmem,
+			container_of(head, struct fib_alias, rcu));
+}
+
+/* Queue 'fa' for freeing by __alias_free_mem() through call_rcu(). */
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+	call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+/* RCU callback: give the leaf embedding 'head' back to its slab cache. */
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(trie_leaf_kmem,
+			container_of(head, struct leaf, rcu));
+}
+
+/*
+ * Queue leaf 'l' for freeing once current RCU readers are done.
+ *
+ * Plain call_rcu() is used, as for tnodes, fib_alias and leaf_info in
+ * this file: the accessors here check rcu_read_lock_held(), so the leaf
+ * has to wait for the same kind of grace period.
+ */
+static inline void free_leaf(struct leaf *l)
+{
+	call_rcu(&l->rcu, __leaf_free_rcu);
+}
+
+/* Free 'leaf' (a leaf_info) through kfree_rcu(), using its 'rcu' member. */
+static inline void free_leaf_info(struct leaf_info *leaf)
+{
+	kfree_rcu(leaf, rcu);
+}
+
+/*
+ * Allocate 'size' zeroed bytes for a tnode: kzalloc() for sizes up to
+ * one page, vzalloc() for anything larger. Returns NULL on failure.
+ */
+static struct tnode *tnode_alloc(size_t size)
+{
+	if (size > PAGE_SIZE)
+		return vzalloc(size);
+
+	return kzalloc(size, GFP_KERNEL);
+}
+
+/* Work item handler: vfree() the tnode that embeds 'arg'. */
+static void __tnode_vfree(struct work_struct *arg)
+{
+	vfree(container_of(arg, struct tnode, work));
+}
+
+/*
+ * RCU callback releasing a tnode. The size test mirrors tnode_alloc():
+ * nodes of up to one page are kfree()d directly, larger ones are handed
+ * to a work item that calls vfree().
+ */
+static void __tnode_free_rcu(struct rcu_head *head)
+{
+	struct tnode *tn = container_of(head, struct tnode, rcu);
+	size_t child_bytes = sizeof(struct rt_trie_node *) << tn->bits;
+
+	if (sizeof(struct tnode) + child_bytes <= PAGE_SIZE) {
+		kfree(tn);
+		return;
+	}
+
+	INIT_WORK(&tn->work, __tnode_vfree);
+	schedule_work(&tn->work);
+}
+
+/*
+ * Release a node after an RCU grace period. 'tn' may in fact be a leaf,
+ * in which case it is passed on to free_leaf().
+ */
+static inline void tnode_free(struct tnode *tn)
+{
+	if (!IS_LEAF(tn)) {
+		call_rcu(&tn->rcu, __tnode_free_rcu);
+		return;
+	}
+
+	free_leaf((struct leaf *) tn);
+}
+
+/*
+ * Put tnode 'tn' on the tnode_free_head list instead of freeing it right
+ * away, and add its size to tnode_free_size. The list is emptied by
+ * tnode_free_flush(). Must not be called with a leaf.
+ */
+static void tnode_free_safe(struct tnode *tn)
+{
+	BUG_ON(IS_LEAF(tn));
+	tn->tnode_free = tnode_free_head;
+	tnode_free_head = tn;
+	tnode_free_size += sizeof(struct tnode) +
+			   (sizeof(struct rt_trie_node *) << tn->bits);
+}
+
+/*
+ * Hand every tnode queued by tnode_free_safe() to tnode_free(). Once at
+ * least sync_pages pages worth of nodes have been queued, the byte
+ * counter is reset and synchronize_rcu() is called.
+ */
+static void tnode_free_flush(void)
+{
+	struct tnode *tn;
+
+	while ((tn = tnode_free_head)) {
+		tnode_free_head = tn->tnode_free;
+		/* the link shares a union with the rcu head used next */
+		tn->tnode_free = NULL;
+		tnode_free(tn);
+	}
+
+	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+		tnode_free_size = 0;
+		synchronize_rcu();
+	}
+}
+
+/*
+ * Allocate a leaf from trie_leaf_kmem, tagged T_LEAF and with an empty
+ * leaf_info list. The key is left for the caller to set.
+ * Returns NULL when the allocation fails.
+ */
+static struct leaf *leaf_new(void)
+{
+	struct leaf *l;
+
+	l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+	if (!l)
+		return NULL;
+
+	l->parent = T_LEAF;
+	INIT_HLIST_HEAD(&l->list);
+	return l;
+}
+
+/*
+ * Allocate a leaf_info for prefix length 'plen' with an empty 'falh'
+ * list. Returns NULL when the allocation fails.
+ */
+static struct leaf_info *leaf_info_new(int plen)
+{
+	struct leaf_info *li;
+
+	li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+	if (!li)
+		return NULL;
+
+	li->plen = plen;
+	INIT_LIST_HEAD(&li->falh);
+	return li;
+}
+
+/*
+ * Allocate a tnode for 'key' that indexes 'bits' bits starting at bit
+ * 'pos'. The memory comes zeroed from tnode_alloc(), so all 1 << bits
+ * child slots start out empty.
+ *
+ * Returns the new node, or NULL when the allocation fails.
+ */
+static struct tnode *tnode_new(t_key key, int pos, int bits)
+{
+	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
+	struct tnode *tn = tnode_alloc(sz);
+
+	if (tn) {
+		tn->parent = T_TNODE;
+		tn->pos = pos;
+		tn->bits = bits;
+		tn->key = key;
+		tn->full_children = 0;
+		tn->empty_children = 1<<bits;
+	}
+
+	/* report the size of the child pointer array that was allocated */
+	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+		 sizeof(struct rt_trie_node *) << bits);
+	return tn;
+}
+
+/*
+ * Tell whether child 'n' of tnode 'tn' is "full": an internal node that
+ * starts exactly where tn's index bits end, i.e. with no skipped bits.
+ * See discussion in dyntree paper p. 6
+ */
+
+static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
+{
+	const struct tnode *child;
+
+	if (!n)
+		return 0;
+	if (IS_LEAF(n))
+		return 0;
+
+	child = (const struct tnode *) n;
+	return child->pos == tn->pos + tn->bits;
+}
+
+/*
+ * Store 'n' as child 'i' of 'tn'. Passing -1 as 'wasfull' makes
+ * tnode_put_child_reorg() work out itself whether the old child was full.
+ * 't' is not used.
+ */
+static inline void put_child(struct trie *t, struct tnode *tn, int i,
+			     struct rt_trie_node *n)
+{
+	tnode_put_child_reorg(tn, i, n, -1);
+}
+
+/*
+ * Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ *
+ * 'wasfull' tells whether the child being replaced was full; pass -1 to
+ * have it computed here. A non-NULL 'n' gets 'tn' as its parent before
+ * it is published in the child array.
+ */
+
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+				  int wasfull)
+{
+	struct rt_trie_node *chi;
+	int isfull;
+
+	/* validate the index before it is used to read the child array */
+	BUG_ON(i < 0 || i >= 1<<tn->bits);
+
+	chi = rtnl_dereference(tn->child[i]);
+
+	/* update emptyChildren */
+	if (n == NULL && chi != NULL)
+		tn->empty_children++;
+	else if (n != NULL && chi == NULL)
+		tn->empty_children--;
+
+	/* update fullChildren */
+	if (wasfull == -1)
+		wasfull = tnode_full(tn, chi);
+
+	isfull = tnode_full(tn, n);
+	if (wasfull && !isfull)
+		tn->full_children--;
+	else if (!wasfull && isfull)
+		tn->full_children++;
+
+	if (n)
+		node_set_parent(n, tn);
+
+	rcu_assign_pointer(tn->child[i], n);
+}
+
+/* Upper bound on inflate() resp. halve() rounds per resize() call */
+#define MAX_WORK 10
+/*
+ * Rebalance tnode 'tn': inflate it while it is dense enough, otherwise
+ * halve it while it is sparse enough, and finally replace a node with a
+ * single child by that child.
+ *
+ * Returns the node that takes the place of 'tn' (possibly 'tn' itself),
+ * or NULL when 'tn' is NULL or has no children. Replaced tnodes are
+ * queued with tnode_free_safe(); the caller flushes them.
+ */
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
+{
+	int i;
+	struct tnode *old_tn;
+	int inflate_threshold_use;
+	int halve_threshold_use;
+	int max_work;
+
+	if (!tn)
+		return NULL;
+
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
+
+	/* No children */
+	if (tn->empty_children == tnode_child_length(tn)) {
+		tnode_free_safe(tn);
+		return NULL;
+	}
+	/* One child */
+	if (tn->empty_children == tnode_child_length(tn) - 1)
+		goto one_child;
+	/*
+	 * Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
+	 */
+
+	/*
+	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
+	 * Telecommunications, page 6:
+	 * "A node is doubled if the ratio of non-empty children to all
+	 * children in the *doubled* node is at least 'high'."
+	 *
+	 * 'high' in this instance is the variable 'inflate_threshold'. It
+	 * is expressed as a percentage, so we multiply it with
+	 * tnode_child_length() and instead of multiplying by 2 (since the
+	 * child array will be doubled by inflate()) and multiplying
+	 * the left-hand side by 100 (to handle the percentage thing) we
+	 * multiply the left-hand side by 50.
+	 *
+	 * The left-hand side may look a bit weird: tnode_child_length(tn)
+	 * - tn->empty_children is of course the number of non-null children
+	 * in the current node. tn->full_children is the number of "full"
+	 * children, that is non-null tnodes with a skip value of 0.
+	 * All of those will be doubled in the resulting inflated tnode, so
+	 * we just count them one extra time here.
+	 *
+	 * A clearer way to write this would be:
+	 *
+	 * to_be_doubled = tn->full_children;
+	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+	 *     tn->full_children;
+	 *
+	 * new_child_length = tnode_child_length(tn) * 2;
+	 *
+	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+	 *      new_child_length;
+	 * if (new_fill_factor >= inflate_threshold)
+	 *
+	 * ...and so on, tho it would mess up the while () loop.
+	 *
+	 * anyway,
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+	 *      inflate_threshold
+	 *
+	 * avoid a division:
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+	 *      inflate_threshold * new_child_length
+	 *
+	 * expand not_to_be_doubled and to_be_doubled, and shorten:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children +
+	 *    tn->full_children) >= inflate_threshold * new_child_length
+	 *
+	 * expand new_child_length:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children +
+	 *    tn->full_children) >=
+	 *      inflate_threshold * tnode_child_length(tn) * 2
+	 *
+	 * shorten again:
+	 * 50 * (tn->full_children + tnode_child_length(tn) -
+	 *    tn->empty_children) >= inflate_threshold *
+	 *    tnode_child_length(tn)
+	 *
+	 */
+
+	check_tnode(tn);
+
+	/* Keep root node larger  */
+
+	if (!node_parent((struct rt_trie_node *)tn)) {
+		inflate_threshold_use = inflate_threshold_root;
+		halve_threshold_use = halve_threshold_root;
+	} else {
+		inflate_threshold_use = inflate_threshold;
+		halve_threshold_use = halve_threshold;
+	}
+
+	/*
+	 * The work budget is tested last, so that it is only consumed when
+	 * an inflate is really attempted. Otherwise a node that merely has
+	 * a full child would look as if it had been inflated and would
+	 * never reach the halving step below.
+	 */
+	max_work = MAX_WORK;
+	while ((tn->full_children > 0 &&
+		50 * (tn->full_children + tnode_child_length(tn)
+		      - tn->empty_children)
+		>= inflate_threshold_use * tnode_child_length(tn) &&
+		max_work--)) {
+
+		old_tn = tn;
+		tn = inflate(t, tn);
+
+		if (IS_ERR(tn)) {
+			tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
+	}
+
+	check_tnode(tn);
+
+	/* Return if at least one inflate is run */
+	if (max_work != MAX_WORK)
+		return (struct rt_trie_node *) tn;
+
+	/*
+	 * Halve as long as the number of empty children in this
+	 * node is above threshold.
+	 */
+
+	max_work = MAX_WORK;
+	while (tn->bits > 1 &&  max_work-- &&
+	       100 * (tnode_child_length(tn) - tn->empty_children) <
+	       halve_threshold_use * tnode_child_length(tn)) {
+
+		old_tn = tn;
+		tn = halve(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
+	}
+
+
+	/* Only one child remains */
+	if (tn->empty_children == tnode_child_length(tn) - 1) {
+one_child:
+		for (i = 0; i < tnode_child_length(tn); i++) {
+			struct rt_trie_node *n;
+
+			n = rtnl_dereference(tn->child[i]);
+			if (!n)
+				continue;
+
+			/* compress one level */
+
+			node_set_parent(n, NULL);
+			tnode_free_safe(tn);
+			return n;
+		}
+	}
+	return (struct rt_trie_node *) tn;
+}
+
+
+/*
+ * Free 'tn' together with every node still present in its child array.
+ * Used by inflate() and halve() to drop a half-built replacement node.
+ */
+static void tnode_clean_free(struct tnode *tn)
+{
+	const int nchildren = tnode_child_length(tn);
+	int slot;
+
+	for (slot = 0; slot < nchildren; slot++) {
+		struct rt_trie_node *child = rtnl_dereference(tn->child[slot]);
+
+		if (!child)
+			continue;
+		tnode_free((struct tnode *) child);
+	}
+
+	tnode_free(tn);
+}
+
+/*
+ * Build a replacement for 'tn' that indexes one more key bit, i.e. has
+ * twice as many child slots, and move all children over.
+ *
+ * Returns the new tnode, or ERR_PTR(-ENOMEM) when an allocation fails;
+ * the allocations are all done in the first loop, before any child is
+ * moved. The old tnode and the child tnodes that were split up are
+ * queued with tnode_free_safe().
+ */
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
+{
+	struct tnode *oldtnode = tn;
+	int olen = tnode_child_length(tn);
+	int i;
+
+	pr_debug("In inflate\n");
+
+	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
+
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Preallocate and store tnodes before the actual work so we
+	 * don't get into an inconsistent state if memory allocation
+	 * fails. In case of failure we return the oldnode and  inflate
+	 * of tnode is ignored.
+	 */
+
+	for (i = 0; i < olen; i++) {
+		struct tnode *inode;
+
+		inode = (struct tnode *) tnode_get_child(oldtnode, i);
+		if (inode &&
+		    IS_TNODE(inode) &&
+		    inode->pos == oldtnode->pos + oldtnode->bits &&
+		    inode->bits > 1) {
+			struct tnode *left, *right;
+			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
+
+			left = tnode_new(inode->key&(~m), inode->pos + 1,
+					 inode->bits - 1);
+			if (!left)
+				goto nomem;
+
+			right = tnode_new(inode->key|m, inode->pos + 1,
+					  inode->bits - 1);
+
+			if (!right) {
+				tnode_free(left);
+				goto nomem;
+			}
+
+			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
+			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
+		}
+	}
+
+	for (i = 0; i < olen; i++) {
+		struct tnode *inode;
+		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
+		struct tnode *left, *right;
+		int size, j;
+
+		/* An empty child */
+		if (node == NULL)
+			continue;
+
+		/* A leaf or an internal node with skipped bits */
+
+		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
+		   tn->pos + tn->bits - 1) {
+			if (tkey_extract_bits(node->key,
+					      oldtnode->pos + oldtnode->bits,
+					      1) == 0)
+				put_child(t, tn, 2*i, node);
+			else
+				put_child(t, tn, 2*i+1, node);
+			continue;
+		}
+
+		/* An internal node with two children */
+		inode = (struct tnode *) node;
+
+		if (inode->bits == 1) {
+			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
+			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
+
+			tnode_free_safe(inode);
+			continue;
+		}
+
+		/* An internal node with more than two children */
+
+		/* We will replace this node 'inode' with two new
+		 * ones, 'left' and 'right', each with half of the
+		 * original children. The two new nodes will have
+		 * a position one bit further down the key and this
+		 * means that the "significant" part of their keys
+		 * (see the discussion near the top of this file)
+		 * will differ by one bit, which will be "0" in
+		 * left's key and "1" in right's key. Since we are
+		 * moving the key position by one step, the bit that
+		 * we are moving away from - the bit at position
+		 * (inode->pos) - is the one that will differ between
+		 * left and right. So... we synthesize that bit in the
+		 * two  new keys.
+		 * The mask 'm' below will be a single "one" bit at
+		 * the position (inode->pos)
+		 */
+
+		/* Use the old key, but set the new significant
+		 *   bit to zero.
+		 */
+
+		left = (struct tnode *) tnode_get_child(tn, 2*i);
+		put_child(t, tn, 2*i, NULL);
+
+		BUG_ON(!left);
+
+		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+		put_child(t, tn, 2*i+1, NULL);
+
+		BUG_ON(!right);
+
+		size = tnode_child_length(left);
+		for (j = 0; j < size; j++) {
+			put_child(t, left, j, rtnl_dereference(inode->child[j]));
+			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
+		}
+		put_child(t, tn, 2*i, resize(t, left));
+		put_child(t, tn, 2*i+1, resize(t, right));
+
+		tnode_free_safe(inode);
+	}
+	tnode_free_safe(oldtnode);
+	return tn;
+nomem:
+	tnode_clean_free(tn);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Build a replacement for 'tn' that indexes one key bit less, i.e. has
+ * half as many child slots. Each pair of adjacent old children shares
+ * one new slot: a lone child is moved as is, two children are put under
+ * a new one-bit tnode.
+ *
+ * Returns the new tnode, or ERR_PTR(-ENOMEM) when an allocation fails;
+ * the allocations are all done in the first loop, before any child is
+ * moved. The old tnode is queued with tnode_free_safe().
+ */
+static struct tnode *halve(struct trie *t, struct tnode *tn)
+{
+	struct tnode *oldtnode = tn;
+	struct rt_trie_node *left, *right;
+	int i;
+	int olen = tnode_child_length(tn);
+
+	pr_debug("In halve\n");
+
+	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Preallocate and store tnodes before the actual work so we
+	 * don't get into an inconsistent state if memory allocation
+	 * fails. In case of failure we return the oldnode and halve
+	 * of tnode is ignored.
+	 */
+
+	for (i = 0; i < olen; i += 2) {
+		left = tnode_get_child(oldtnode, i);
+		right = tnode_get_child(oldtnode, i+1);
+
+		/* Two nonempty children */
+		if (left && right) {
+			struct tnode *newn;
+
+			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
+
+			if (!newn)
+				goto nomem;
+
+			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
+		}
+
+	}
+
+	for (i = 0; i < olen; i += 2) {
+		struct tnode *newBinNode;
+
+		left = tnode_get_child(oldtnode, i);
+		right = tnode_get_child(oldtnode, i+1);
+
+		/* At least one of the children is empty */
+		if (left == NULL) {
+			if (right == NULL)    /* Both are empty */
+				continue;
+			put_child(t, tn, i/2, right);
+			continue;
+		}
+
+		if (right == NULL) {
+			put_child(t, tn, i/2, left);
+			continue;
+		}
+
+		/* Two nonempty children */
+		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+		put_child(t, tn, i/2, NULL);
+		put_child(t, newBinNode, 0, left);
+		put_child(t, newBinNode, 1, right);
+		put_child(t, tn, i/2, resize(t, newBinNode));
+	}
+	tnode_free_safe(oldtnode);
+	return tn;
+nomem:
+	tnode_clean_free(tn);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Look up the leaf_info with prefix length 'plen' on leaf 'l'.
+ * Returns NULL when there is none.
+ *
+ * The read side must hold rcu_read_lock; currently these are the dump
+ * routines, which come here via get_fa_head.
+ */
+
+static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
+{
+	struct hlist_node *node;
+	struct leaf_info *li;
+
+	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+		if (li->plen != plen)
+			continue;
+		return li;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the 'falh' list head of the leaf_info with prefix length 'plen'
+ * on leaf 'l', or NULL when the leaf has no such leaf_info.
+ */
+static inline struct list_head *get_fa_head(struct leaf *l, int plen)
+{
+	struct leaf_info *li = find_leaf_info(l, plen);
+
+	return li ? &li->falh : NULL;
+}
+
+/*
+ * Link 'new' into the leaf_info list 'head'. The new entry goes in front
+ * of the first entry with a smaller prefix length, or at the tail when
+ * there is none, which keeps the list ordered by decreasing plen.
+ */
+static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+{
+	struct leaf_info *li = NULL, *last = NULL;
+	struct hlist_node *node;
+
+	if (hlist_empty(head)) {
+		hlist_add_head_rcu(&new->hlist, head);
+	} else {
+		hlist_for_each_entry(li, node, head, hlist) {
+			if (new->plen > li->plen)
+				break;
+
+			last = li;
+		}
+		/* 'last' is the entry to insert after; NULL means 'new' goes first */
+		if (last)
+			hlist_add_after_rcu(&last->hlist, &new->hlist);
+		else
+			hlist_add_before_rcu(&new->hlist, &li->hlist);
+	}
+}
+
+/*
+ * Find the leaf whose key is exactly @key, or return NULL.
+ * Descends through the tnodes as long as the bits skipped by each tnode
+ * agree with @key. Read-side callers must hold rcu_read_lock().
+ */
+static struct leaf *
+fib_find_node(struct trie *t, u32 key)
+{
+	struct rt_trie_node *node = rcu_dereference_rtnl(t->trie);
+	int checked = 0;	/* number of leading key bits verified so far */
+
+	while (node != NULL && NODE_TYPE(node) == T_TNODE) {
+		struct tnode *cur = (struct tnode *) node;
+
+		check_tnode(cur);
+
+		/* skipped bits differ from the key: nothing to find below */
+		if (!tkey_sub_equals(cur->key, checked, cur->pos - checked, key))
+			break;
+
+		checked = cur->pos + cur->bits;
+		node = tnode_get_child_rcu(cur,
+					   tkey_extract_bits(key,
+							     cur->pos,
+							     cur->bits));
+	}
+
+	/* Only a leaf carrying the very same key counts as a hit */
+	if (node == NULL || !IS_LEAF(node) || !tkey_equals(key, node->key))
+		return NULL;
+
+	return (struct leaf *)node;
+}
+
+/*
+ * Walk from @tn up towards the root, running resize() on every tnode on the
+ * way and re-linking the node returned by resize() into its parent slot.
+ * Whichever node ends up without a parent is published as t->trie.
+ * Called from fib_insert_node() and trie_leaf_remove().
+ */
+static void trie_rebalance(struct trie *t, struct tnode *tn)
+{
+	int wasfull;
+	t_key cindex, key;
+	struct tnode *tp;
+
+	/* NOTE(review): tn is dereferenced here, before the NULL test below */
+	key = tn->key;
+
+	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
+		/* slot of tn inside its parent, derived from the starting key */
+		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		/* sample the "full" state before resize() may replace the child */
+		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
+		tn = (struct tnode *) resize(t, (struct tnode *)tn);
+
+		tnode_put_child_reorg((struct tnode *)tp, cindex,
+				      (struct rt_trie_node *)tn, wasfull);
+
+		/* no parent: the resized node is the new root, publish it */
+		tp = node_parent((struct rt_trie_node *) tn);
+		if (!tp)
+			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+
+		/* presumably releases tnodes queued for freeing - TODO confirm */
+		tnode_free_flush();
+		if (!tp)
+			break;
+		tn = tp;
+	}
+
+	/* Handle last (top) tnode */
+	if (IS_TNODE(tn))
+		tn = (struct tnode *)resize(t, (struct tnode *)tn);
+
+	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+	tnode_free_flush();
+}
+
+/*
+ * Make sure the trie @t has a leaf for @key carrying a leaf_info for
+ * prefix length @plen, and return the fib_alias list head (li->falh) of
+ * that new leaf_info.
+ *
+ * Returns NULL when a leaf, leaf_info or tnode allocation fails.
+ * When a leaf with the same key already exists only a leaf_info is added
+ * and the trie is not rebalanced; otherwise a new leaf is linked in and
+ * trie_rebalance() is run from its parent.
+ *
+ * Only used from the updater side.
+ */
+
+static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
+{
+	int pos, newpos;
+	struct tnode *tp = NULL, *tn = NULL;
+	struct rt_trie_node *n;
+	struct leaf *l;
+	int missbit;
+	struct list_head *fa_head = NULL;
+	struct leaf_info *li;
+	t_key cindex;
+
+	pos = 0;
+	n = rtnl_dereference(t->trie);
+
+	/* If we point to NULL, stop. Either the tree is empty and we should
+	 * just put a new leaf in it, or we have reached an empty child slot,
+	 * and we should just put our new leaf in that.
+	 * If we point to a T_TNODE, check if it matches our key. Note that
+	 * a T_TNODE might be skipping any number of bits - its 'pos' need
+	 * not be the parent's 'pos'+'bits'!
+	 *
+	 * If it does match the current key, get pos/bits from it, extract
+	 * the index from our key, push the T_TNODE and walk the tree.
+	 *
+	 * If it doesn't, we have to replace it with a new T_TNODE.
+	 *
+	 * If we point to a T_LEAF, it might or might not have the same key
+	 * as we do. If it does, just change the value, update the T_LEAF's
+	 * value, and return it.
+	 * If it doesn't, we need to replace it with a T_TNODE.
+	 */
+
+	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
+		tn = (struct tnode *) n;
+
+		check_tnode(tn);
+
+		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
+			tp = tn;
+			pos = tn->pos + tn->bits;
+			n = tnode_get_child(tn,
+					    tkey_extract_bits(key,
+							      tn->pos,
+							      tn->bits));
+
+			BUG_ON(n && node_parent(n) != tn);
+		} else
+			break;
+	}
+
+	/*
+	 * n  ----> NULL, LEAF or TNODE
+	 *
+	 * tp is n's (parent) ----> NULL or TNODE
+	 */
+
+	BUG_ON(tp && IS_LEAF(tp));
+
+	/* Case 1: n is a leaf. Compare prefixes */
+
+	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
+		l = (struct leaf *) n;
+		li = leaf_info_new(plen);
+
+		if (!li)
+			return NULL;
+
+		fa_head = &li->falh;
+		insert_leaf_info(&l->list, li);
+		/* existing leaf reused: trie shape unchanged, skip rebalance */
+		goto done;
+	}
+	l = leaf_new();
+
+	if (!l)
+		return NULL;
+
+	l->key = key;
+	li = leaf_info_new(plen);
+
+	if (!li) {
+		free_leaf(l);
+		return NULL;
+	}
+
+	fa_head = &li->falh;
+	insert_leaf_info(&l->list, li);
+
+	if (t->trie && n == NULL) {
+		/* Case 2: n is NULL, and will just insert a new leaf */
+
+		node_set_parent((struct rt_trie_node *)l, tp);
+
+		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
+	} else {
+		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
+		/*
+		 *  Add a new tnode here
+		 *  first tnode need some special handling
+		 */
+
+		if (tp)
+			pos = tp->pos+tp->bits;
+		else
+			pos = 0;
+
+		if (n) {
+			/* split at the first bit where key and n->key differ */
+			newpos = tkey_mismatch(key, pos, n->key);
+			tn = tnode_new(n->key, newpos, 1);
+		} else {
+			newpos = 0;
+			tn = tnode_new(key, newpos, 1); /* First tnode */
+		}
+
+		if (!tn) {
+			free_leaf_info(li);
+			free_leaf(l);
+			return NULL;
+		}
+
+		node_set_parent((struct rt_trie_node *)tn, tp);
+
+		/* the new leaf takes the slot of its bit at newpos, n the other */
+		missbit = tkey_extract_bits(key, newpos, 1);
+		put_child(t, tn, missbit, (struct rt_trie_node *)l);
+		put_child(t, tn, 1-missbit, n);
+
+		if (tp) {
+			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+			put_child(t, (struct tnode *)tp, cindex,
+				  (struct rt_trie_node *)tn);
+		} else {
+			/* no parent: the new tnode becomes the root */
+			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+			tp = tn;
+		}
+	}
+
+	if (tp && tp->pos + tp->bits > 32)
+		pr_warning("fib_trie"
+			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+			   tp, tp->pos, tp->bits, key, plen);
+
+	/* Rebalance the trie */
+
+	trie_rebalance(t, tp);
+done:
+	return fa_head;
+}
+
+/*
+ * Add the route described by @cfg to table @tb.
+ *
+ * Returns 0 on success, otherwise a negative errno:
+ *   -EINVAL  prefix length above 32, or destination has bits set outside
+ *            the prefix mask
+ *   PTR_ERR  whatever fib_create_info() reported
+ *   -EEXIST  an alias with the same tos/priority exists and NLM_F_EXCL was
+ *            given, or an identical route (same type and fib_info) exists
+ *   -ENOENT  a new entry would be needed but NLM_F_CREATE is not set
+ *   -ENOBUFS fib_alias allocation failed
+ *   -ENOMEM  fib_insert_node() failed
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	struct fib_alias *fa, *new_fa;
+	struct list_head *fa_head = NULL;
+	struct fib_info *fi;
+	int plen = cfg->fc_dst_len;
+	u8 tos = cfg->fc_tos;
+	u32 key, mask;
+	int err;
+	struct leaf *l;
+
+	if (plen > 32)
+		return -EINVAL;
+
+	key = ntohl(cfg->fc_dst);
+
+	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
+
+	mask = ntohl(inet_make_mask(plen));
+
+	/* reject destinations with bits set beyond the prefix length */
+	if (key & ~mask)
+		return -EINVAL;
+
+	key = key & mask;
+
+	fi = fib_create_info(cfg);
+	if (IS_ERR(fi)) {
+		err = PTR_ERR(fi);
+		goto err;
+	}
+
+	l = fib_find_node(t, key);
+	fa = NULL;
+
+	if (l) {
+		fa_head = get_fa_head(l, plen);
+		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
+	}
+
+	/* Now fa, if non-NULL, points to the first fib alias
+	 * with the same keys [prefix,tos,priority], if such key already
+	 * exists or to the node before which we will insert new one.
+	 *
+	 * If fa is NULL, we will need to allocate a new one and
+	 * append it to the alias list fa_head.
+	 *
+	 * If fa_head is NULL, no leaf_info matched the destination key
+	 * and prefix length, and fib_insert_node() has to create one.
+	 */
+
+	if (fa && fa->fa_tos == tos &&
+	    fa->fa_info->fib_priority == fi->fib_priority) {
+		struct fib_alias *fa_first, *fa_match;
+
+		err = -EEXIST;
+		if (cfg->fc_nlflags & NLM_F_EXCL)
+			goto out;
+
+		/* We have 2 goals:
+		 * 1. Find exact match for type, scope, fib_info to avoid
+		 * duplicate routes
+		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+		 */
+		fa_match = NULL;
+		fa_first = fa;
+		/* step back one entry so the _continue walk starts at fa_first */
+		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+		list_for_each_entry_continue(fa, fa_head, fa_list) {
+			if (fa->fa_tos != tos)
+				break;
+			if (fa->fa_info->fib_priority != fi->fib_priority)
+				break;
+			if (fa->fa_type == cfg->fc_type &&
+			    fa->fa_info == fi) {
+				fa_match = fa;
+				break;
+			}
+		}
+
+		if (cfg->fc_nlflags & NLM_F_REPLACE) {
+			struct fib_info *fi_drop;
+			u8 state;
+
+			fa = fa_first;
+			if (fa_match) {
+				/*
+				 * Replacing the first alias with an identical
+				 * route is a no-op success; an identical route
+				 * further down the list keeps err at -EEXIST.
+				 */
+				if (fa == fa_match)
+					err = 0;
+				goto out;
+			}
+			err = -ENOBUFS;
+			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+			if (new_fa == NULL)
+				goto out;
+
+			fi_drop = fa->fa_info;
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = cfg->fc_type;
+			state = fa->fa_state;
+			new_fa->fa_state = state & ~FA_S_ACCESSED;
+
+			/* swap the alias in place, free the old one after RCU */
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+			alias_free_mem_rcu(fa);
+
+			fib_release_info(fi_drop);
+			if (state & FA_S_ACCESSED)
+				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+
+			goto succeeded;
+		}
+		/* Error if we find a perfect match which
+		 * uses the same scope, type, and nexthop
+		 * information.
+		 */
+		if (fa_match)
+			goto out;
+
+		/*
+		 * Without NLM_F_APPEND insert before the first matching
+		 * alias; with it, fa stays where the walk above stopped.
+		 */
+		if (!(cfg->fc_nlflags & NLM_F_APPEND))
+			fa = fa_first;
+	}
+	err = -ENOENT;
+	if (!(cfg->fc_nlflags & NLM_F_CREATE))
+		goto out;
+
+	err = -ENOBUFS;
+	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+	if (new_fa == NULL)
+		goto out;
+
+	new_fa->fa_info = fi;
+	new_fa->fa_tos = tos;
+	new_fa->fa_type = cfg->fc_type;
+	new_fa->fa_state = 0;
+	/*
+	 * Insert new entry to the list.
+	 */
+
+	if (!fa_head) {
+		fa_head = fib_insert_node(t, key, plen);
+		if (unlikely(!fa_head)) {
+			err = -ENOMEM;
+			goto out_free_new_fa;
+		}
+	}
+
+	/* plen 0 routes are counted in tb_num_default */
+	if (!plen)
+		tb->tb_num_default++;
+
+	/* add_tail relative to fa's node places new_fa right before fa */
+	list_add_tail_rcu(&new_fa->fa_list,
+			  (fa ? &fa->fa_list : fa_head));
+
+	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
+		  &cfg->fc_nlinfo, 0);
+succeeded:
+	return 0;
+
+out_free_new_fa:
+	kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+	fib_release_info(fi);
+err:
+	return err;
+}
+
+/*
+ * Try to match the flow @flp (destination @key) against the prefixes
+ * stored in leaf @l.
+ *
+ * Returns 0 with @res filled in when a usable nexthop was found, the
+ * (non-zero) fib_props[] error of the first acceptable alias whose route
+ * type carries one, or 1 when nothing in this leaf matched.
+ *
+ * Should be called with rcu_read_lock held.
+ */
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+		      t_key key,  const struct flowi4 *flp,
+		      struct fib_result *res, int fib_flags)
+{
+	struct leaf_info *li;
+	struct hlist_head *hhead = &l->list;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
+		struct fib_alias *fa;
+		int plen = li->plen;
+		__be32 mask = inet_make_mask(plen);
+
+		/* the leaf key must equal the lookup key cut to this prefix */
+		if (l->key != (key & ntohl(mask)))
+			continue;
+
+		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+			int nhsel, err;
+
+			/* a tos of 0 on the alias matches every flow tos */
+			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+				continue;
+			if (fi->fib_dead)
+				continue;
+			if (fa->fa_info->fib_scope < flp->flowi4_scope)
+				continue;
+			fib_alias_accessed(fa);
+			err = fib_props[fa->fa_type].error;
+			if (err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				t->stats.semantic_match_passed++;
+#endif
+				return err;
+			}
+			if (fi->fib_flags & RTNH_F_DEAD)
+				continue;
+			/* pick the first live nexthop that fits the requested oif */
+			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+				const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+				if (nh->nh_flags & RTNH_F_DEAD)
+					continue;
+				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+					continue;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+				t->stats.semantic_match_passed++;
+#endif
+				res->prefixlen = plen;
+				res->nh_sel = nhsel;
+				res->type = fa->fa_type;
+				res->scope = fa->fa_info->fib_scope;
+				res->fi = fi;
+				res->table = tb;
+				res->fa_head = &li->falh;
+				/* take a client reference unless the caller opted out */
+				if (!(fib_flags & FIB_LOOKUP_NOREF))
+					atomic_inc(&res->fi->fib_clntref);
+				return 0;
+			}
+		}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+		t->stats.semantic_match_miss++;
+#endif
+	}
+
+	return 1;
+}
+
+/*
+ * Longest-prefix lookup of flp->daddr in table @tb.
+ *
+ * Descends the trie using the key bits; whenever a branch turns out to be
+ * a dead end the "backtrace" path clears set index bits (shortening the
+ * prefix being searched for) and, once a tnode is exhausted, climbs to its
+ * parent. Leaves are evaluated with check_leaf().
+ *
+ * Returns the check_leaf() result for the leaf that ended the search
+ * (0 with @res filled in, or a route-type error), or 1 when no matching
+ * route exists. Takes and releases rcu_read_lock() itself.
+ */
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+		     struct fib_result *res, int fib_flags)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	int ret;
+	struct rt_trie_node *n;
+	struct tnode *pn;
+	unsigned int pos, bits;
+	t_key key = ntohl(flp->daddr);
+	unsigned int chopped_off;
+	t_key cindex = 0;
+	unsigned int current_prefix_length = KEYLENGTH;
+	struct tnode *cn;
+	t_key pref_mismatch;
+
+	rcu_read_lock();
+
+	n = rcu_dereference(t->trie);
+	if (!n)
+		goto failed;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	t->stats.gets++;
+#endif
+
+	/* Just a leaf? */
+	if (IS_LEAF(n)) {
+		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+		goto found;
+	}
+
+	pn = (struct tnode *) n;
+	chopped_off = 0;
+
+	while (pn) {
+		pos = pn->pos;
+		bits = pn->bits;
+
+		/* while backtracking, cindex is maintained by the code below */
+		if (!chopped_off)
+			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
+						   pos, bits);
+
+		n = tnode_get_child_rcu(pn, cindex);
+
+		if (n == NULL) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.null_node_hit++;
+#endif
+			goto backtrace;
+		}
+
+		if (IS_LEAF(n)) {
+			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+			/* positive result: no match in this leaf, keep searching */
+			if (ret > 0)
+				goto backtrace;
+			goto found;
+		}
+
+		cn = (struct tnode *)n;
+
+		/*
+		 * It's a tnode, and we can do some extra checks here if we
+		 * like, to avoid descending into a dead-end branch.
+		 * This tnode is in the parent's child array at index
+		 * key[p_pos..p_pos+p_bits] but potentially with some bits
+		 * chopped off, so in reality the index may be just a
+		 * subprefix, padded with zero at the end.
+		 * We can also take a look at any skipped bits in this
+		 * tnode - everything up to p_pos is supposed to be ok,
+		 * and the non-chopped bits of the index (see previous
+		 * paragraph) are also guaranteed ok, but the rest is
+		 * considered unknown.
+		 *
+		 * The skipped bits are key[pos+bits..cn->pos].
+		 */
+
+		/* If current_prefix_length < pos+bits, we are already doing
+		 * actual prefix matching, which means everything from
+		 * pos+(bits-chopped_off) onward must be zero along some
+		 * branch of this subtree - otherwise there is *no* valid
+		 * prefix present. Here we can only check the skipped
+		 * bits. Remember, since we have already indexed into the
+		 * parent's child array, we know that the bits we chopped off
+		 * *are* zero.
+		 */
+
+		/* NOTA BENE: Checking only skipped bits
+		   for the new node here */
+
+		if (current_prefix_length < pos+bits) {
+			if (tkey_extract_bits(cn->key, current_prefix_length,
+						cn->pos - current_prefix_length)
+			    || !(cn->child[0]))
+				goto backtrace;
+		}
+
+		/*
+		 * If chopped_off=0, the index is fully validated and we
+		 * only need to look at the skipped bits for this, the new,
+		 * tnode. What we actually want to do is to find out if
+		 * these skipped bits match our key perfectly, or if we will
+		 * have to count on finding a matching prefix further down,
+		 * because if we do, we would like to have some way of
+		 * verifying the existence of such a prefix at this point.
+		 */
+
+		/* The only thing we can do at this point is to verify that
+		 * any such matching prefix can indeed be a prefix to our
+		 * key, and if the bits in the node we are inspecting that
+		 * do not match our key are not ZERO, this cannot be true.
+		 * Thus, find out where there is a mismatch (before cn->pos)
+		 * and verify that all the mismatching bits are zero in the
+		 * new tnode's key.
+		 */
+
+		/*
+		 * Note: We aren't very concerned about the piece of
+		 * the key that precedes pn->pos+pn->bits, since these
+		 * have already been checked. The bits after cn->pos
+		 * aren't checked since these are by definition
+		 * "unknown" at this point. Thus, what we want to see
+		 * is if we are about to enter the "prefix matching"
+		 * state, and in that case verify that the skipped
+		 * bits that will prevail throughout this subtree are
+		 * zero, as they have to be if we are to find a
+		 * matching prefix.
+		 */
+
+		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
+
+		/*
+		 * In short: If skipped bits in this node do not match
+		 * the search key, enter the "prefix matching"
+		 * state directly.
+		 */
+		if (pref_mismatch) {
+			/* mp: position of the first (most significant) mismatching bit */
+			int mp = KEYLENGTH - fls(pref_mismatch);
+
+			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
+				goto backtrace;
+
+			if (current_prefix_length >= cn->pos)
+				current_prefix_length = mp;
+		}
+
+		pn = (struct tnode *)n; /* Descend */
+		chopped_off = 0;
+		continue;
+
+backtrace:
+		chopped_off++;
+
+		/* As zero don't change the child key (cindex) */
+		while ((chopped_off <= pn->bits)
+		       && !(cindex & (1<<(chopped_off-1))))
+			chopped_off++;
+
+		/* Decrease current_... with bits chopped off */
+		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
+			current_prefix_length = pn->pos + pn->bits
+				- chopped_off;
+
+		/*
+		 * Either we do the actual chop off accordingly, or if we have
+		 * chopped off all bits in this tnode walk up to our parent.
+		 */
+
+		if (chopped_off <= pn->bits) {
+			/* clear the lowest set index bit and retry that slot */
+			cindex &= ~(1 << (chopped_off-1));
+		} else {
+			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
+			/* ran out of parents: nothing in the trie matches */
+			if (!parent)
+				goto failed;
+
+			/* Get Child's index */
+			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
+			pn = parent;
+			chopped_off = 0;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.backtrack++;
+#endif
+			goto backtrace;
+		}
+	}
+failed:
+	ret = 1;
+found:
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Unlink leaf @l from trie @t and free it.
+ * With a parent tnode the leaf's slot is cleared and the trie is
+ * rebalanced from that parent; a parentless leaf was the whole trie, so
+ * the root pointer is reset to NULL instead.
+ */
+static void trie_leaf_remove(struct trie *t, struct leaf *l)
+{
+	struct tnode *parent = node_parent((struct rt_trie_node *) l);
+
+	pr_debug("entering trie_leaf_remove(%p)\n", l);
+
+	if (!parent) {
+		rcu_assign_pointer(t->trie, NULL);
+	} else {
+		t_key slot = tkey_extract_bits(l->key, parent->pos,
+					       parent->bits);
+
+		put_child(t, parent, slot, NULL);
+		trie_rebalance(t, parent);
+	}
+
+	free_leaf(l);
+}
+
+/*
+ * Remove the route described by @cfg from table @tb.
+ *
+ * Returns 0 on success, -EINVAL for a prefix length above 32 or a
+ * destination with bits set outside the prefix mask, and -ESRCH when no
+ * leaf, no alias with the requested tos, or no alias satisfying the
+ * selectors in @cfg is found.
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	u32 key, mask;
+	int plen = cfg->fc_dst_len;
+	u8 tos = cfg->fc_tos;
+	struct fib_alias *fa, *fa_to_delete;
+	struct list_head *fa_head;
+	struct leaf *l;
+	struct leaf_info *li;
+
+	if (plen > 32)
+		return -EINVAL;
+
+	key = ntohl(cfg->fc_dst);
+	mask = ntohl(inet_make_mask(plen));
+
+	if (key & ~mask)
+		return -EINVAL;
+
+	key = key & mask;
+	l = fib_find_node(t, key);
+
+	if (!l)
+		return -ESRCH;
+
+	fa_head = get_fa_head(l, plen);
+	/* NOTE(review): fa_head may be NULL here; relies on fib_find_alias() coping - confirm */
+	fa = fib_find_alias(fa_head, tos, 0);
+
+	if (!fa)
+		return -ESRCH;
+
+	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+
+	fa_to_delete = NULL;
+	/* step back one entry so the _continue walk starts at fa itself */
+	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+	list_for_each_entry_continue(fa, fa_head, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (fa->fa_tos != tos)
+			break;
+
+		/* zero / RT_SCOPE_NOWHERE selectors in cfg act as wildcards */
+		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
+		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_info->fib_scope == cfg->fc_scope) &&
+		    (!cfg->fc_prefsrc ||
+		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
+		    (!cfg->fc_protocol ||
+		     fi->fib_protocol == cfg->fc_protocol) &&
+		    fib_nh_match(cfg, fi) == 0) {
+			fa_to_delete = fa;
+			break;
+		}
+	}
+
+	if (!fa_to_delete)
+		return -ESRCH;
+
+	fa = fa_to_delete;
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
+		  &cfg->fc_nlinfo, 0);
+
+	l = fib_find_node(t, key);
+	li = find_leaf_info(l, plen);
+
+	list_del_rcu(&fa->fa_list);
+
+	if (!plen)
+		tb->tb_num_default--;
+
+	/* last alias of this prefix length gone: drop its leaf_info */
+	if (list_empty(fa_head)) {
+		hlist_del_rcu(&li->hlist);
+		free_leaf_info(li);
+	}
+
+	/* last leaf_info gone: drop the leaf itself */
+	if (hlist_empty(&l->list))
+		trie_leaf_remove(t, l);
+
+	if (fa->fa_state & FA_S_ACCESSED)
+		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
+
+	fib_release_info(fa->fa_info);
+	alias_free_mem_rcu(fa);
+	return 0;
+}
+
+/*
+ * Drop every alias on @head whose fib_info is flagged RTNH_F_DEAD.
+ * Returns the number of aliases removed.
+ */
+static int trie_flush_list(struct list_head *head)
+{
+	struct fib_alias *cur, *next;
+	int removed = 0;
+
+	list_for_each_entry_safe(cur, next, head, fa_list) {
+		struct fib_info *info = cur->fa_info;
+
+		if (!info || !(info->fib_flags & RTNH_F_DEAD))
+			continue;
+
+		list_del_rcu(&cur->fa_list);
+		fib_release_info(info);
+		alias_free_mem_rcu(cur);
+		removed++;
+	}
+
+	return removed;
+}
+
+/*
+ * Flush the dead aliases of every leaf_info on leaf @l and free each
+ * leaf_info whose alias list became (or already was) empty.
+ * Returns the number of aliases removed.
+ */
+static int trie_flush_leaf(struct leaf *l)
+{
+	struct leaf_info *info = NULL;
+	struct hlist_node *pos, *next;
+	int removed = 0;
+
+	hlist_for_each_entry_safe(info, pos, next, &l->list, hlist) {
+		removed += trie_flush_list(&info->falh);
+
+		if (!list_empty(&info->falh))
+			continue;
+
+		hlist_del_rcu(&info->hlist);
+		free_leaf_info(info);
+	}
+
+	return removed;
+}
+
+/*
+ * Find the next leaf to the right of child @c of tnode @p; with @c NULL
+ * the scan starts at the first slot of @p. Child tnodes are descended
+ * into and exhausted tnodes are left through their parent pointer, so no
+ * recursion is necessary.
+ * Returns NULL once the walk runs out of parents (end of the trie).
+ */
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
+{
+	do {
+		t_key idx;
+
+		/* resume just after c's slot, or from the start of p */
+		if (c)
+			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
+		else
+			idx = 0;
+
+		while (idx < 1u << p->bits) {
+			c = tnode_get_child_rcu(p, idx++);
+			if (!c)
+				continue;
+
+			if (IS_LEAF(c)) {
+				/*
+				 * idx already names the slot after c. Only
+				 * prefetch it while it is still inside the
+				 * child array; for the last slot the load
+				 * would read one entry past the end.
+				 */
+				if (idx < 1u << p->bits)
+					prefetch(rcu_dereference_rtnl(p->child[idx]));
+				return (struct leaf *) c;
+			}
+
+			/* Rescan start scanning in new node */
+			p = (struct tnode *) c;
+			idx = 0;
+		}
+
+		/* Node empty, walk back up to parent */
+		c = (struct rt_trie_node *) p;
+	} while ((p = node_parent_rcu(c)) != NULL);
+
+	return NULL; /* Root of trie */
+}
+
+/*
+ * Return the first leaf of trie @t in walk order, or NULL for an empty trie.
+ */
+static struct leaf *trie_firstleaf(struct trie *t)
+{
+	struct tnode *root = (struct tnode *)rcu_dereference_rtnl(t->trie);
+
+	if (root == NULL)
+		return NULL;
+
+	/* a trie that is just one leaf: the root itself is the answer */
+	return IS_LEAF(root) ? (struct leaf *) root
+			     : leaf_walk_rcu(root, NULL);
+}
+
+/*
+ * Return the leaf following @l in walk order, or NULL when @l is the last
+ * one (which includes the case of a trie consisting of @l alone).
+ */
+static struct leaf *trie_nextleaf(struct leaf *l)
+{
+	struct rt_trie_node *self = (struct rt_trie_node *) l;
+	struct tnode *parent = node_parent_rcu(self);
+
+	return parent ? leaf_walk_rcu(parent, self) : NULL;
+}
+
+/*
+ * Return the leaf at zero-based position @index in walk order, or NULL
+ * when the trie holds fewer leaves than that.
+ */
+static struct leaf *trie_leafindex(struct trie *t, int index)
+{
+	struct leaf *cur;
+
+	for (cur = trie_firstleaf(t); cur && index > 0; index--)
+		cur = trie_nextleaf(cur);
+
+	return cur;
+}
+
+
+/*
+ * Remove all aliases with a dead fib_info from table @tb and drop the
+ * leaves that end up empty. Returns the number of aliases removed.
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_flush(struct fib_table *tb)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	struct leaf *cur = trie_firstleaf(t);
+	struct leaf *prev = NULL;
+	int flushed = 0;
+
+	while (cur) {
+		flushed += trie_flush_leaf(cur);
+
+		/*
+		 * Removal trails the walk by one leaf: the current leaf is
+		 * still needed to find its successor.
+		 */
+		if (prev && hlist_empty(&prev->list))
+			trie_leaf_remove(t, prev);
+
+		prev = cur;
+		cur = trie_nextleaf(cur);
+	}
+
+	/* the final leaf of the walk has not been looked at yet */
+	if (prev && hlist_empty(&prev->list))
+		trie_leaf_remove(t, prev);
+
+	pr_debug("trie_flush found=%d\n", flushed);
+	return flushed;
+}
+
+/*
+ * Release the memory of table @tb with kfree(). Nothing stored in the
+ * trie is walked or released here.
+ */
+void fib_free_table(struct fib_table *tb)
+{
+	kfree(tb);
+}
+
+/*
+ * Dump the aliases on list @fah (prefix @key/@plen) into @skb, skipping
+ * the first cb->args[5] entries, which an earlier pass already emitted.
+ * cb->args[5] is left at the index of the entry to resume from.
+ * Returns skb->len on success and -1 when fib_dump_info() failed.
+ * rcu_read_lock is held by the caller.
+ */
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
+			   struct fib_table *tb,
+			   struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const int resume = cb->args[5];
+	__be32 dst = htonl(key);
+	struct fib_alias *fa;
+	int idx = 0;
+
+	list_for_each_entry_rcu(fa, fah, fa_list) {
+		if (idx >= resume &&
+		    fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+				  cb->nlh->nlmsg_seq,
+				  RTM_NEWROUTE,
+				  tb->tb_id,
+				  fa->fa_type,
+				  dst,
+				  plen,
+				  fa->fa_tos,
+				  fa->fa_info, NLM_F_MULTI) < 0) {
+			cb->args[5] = idx;
+			return -1;
+		}
+		idx++;
+	}
+
+	cb->args[5] = idx;
+	return skb->len;
+}
+
+/*
+ * Dump every leaf_info of leaf @l into @skb, resuming at the entry index
+ * stored in cb->args[4]. The alias cursor cb->args[5] is reset as soon as
+ * the walk moves past the entry it belonged to.
+ * Returns skb->len on success and -1 when the alias dump failed, in which
+ * case cb->args[4] holds the index to resume from.
+ * rcu_read_lock is held by the caller.
+ */
+static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+			struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const int resume = cb->args[4];
+	struct leaf_info *info;
+	struct hlist_node *pos;
+	int idx = 0;
+
+	hlist_for_each_entry_rcu(info, pos, &l->list, hlist) {
+		if (idx < resume) {
+			idx++;
+			continue;
+		}
+
+		/* beyond the resume entry the alias cursor starts over */
+		if (idx > resume)
+			cb->args[5] = 0;
+
+		/* entries without aliases are skipped without being counted */
+		if (list_empty(&info->falh))
+			continue;
+
+		if (fn_trie_dump_fa(l->key, info->plen, &info->falh,
+				    tb, skb, cb) < 0) {
+			cb->args[4] = idx;
+			return -1;
+		}
+		idx++;
+	}
+
+	cb->args[4] = idx;
+	return skb->len;
+}
+
+/*
+ * Dump table @tb into @skb. cb->args[2] holds the key of the leaf being
+ * dumped and cb->args[3] the number of leaves completed, so that a later
+ * call can pick up where this one stopped.
+ * Returns skb->len on success and -1 when a leaf could not be dumped.
+ */
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+		   struct netlink_callback *cb)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	t_key last_key = cb->args[2];
+	int leaves = cb->args[3];
+	struct leaf *l;
+	int rc = 0;
+
+	rcu_read_lock();
+	/* Dump starting at last key.
+	 * Note: 0.0.0.0/0 (ie default) is first key.
+	 */
+	if (leaves != 0) {
+		/* Normally, continue from last key, but if that is missing
+		 * fallback to using slow rescan
+		 */
+		l = fib_find_node(t, last_key);
+		if (l == NULL)
+			l = trie_leafindex(t, leaves);
+	} else {
+		l = trie_firstleaf(t);
+	}
+
+	for (; l; l = trie_nextleaf(l)) {
+		cb->args[2] = l->key;
+		rc = fn_trie_dump_leaf(l, tb, skb, cb);
+		if (rc < 0)
+			break;
+
+		++leaves;
+		/* per-leaf cursors (args[4] onwards) start over for the next leaf */
+		memset(&cb->args[4], 0,
+		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
+	}
+
+	cb->args[3] = leaves;
+	rcu_read_unlock();
+
+	return rc < 0 ? -1 : skb->len;
+}
+
+/*
+ * Create the two slab caches used by this file: one for fib_alias
+ * objects and one sized for the larger of struct leaf and
+ * struct leaf_info. Both are created with SLAB_PANIC.
+ */
+void __init fib_trie_init(void)
+{
+	const size_t trie_obj_size = max(sizeof(struct leaf),
+					 sizeof(struct leaf_info));
+
+	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+					  sizeof(struct fib_alias), 0,
+					  SLAB_PANIC, NULL);
+
+	trie_leaf_kmem = kmem_cache_create("ip_fib_trie", trie_obj_size, 0,
+					   SLAB_PANIC, NULL);
+}
+
+
+/*
+ * Allocate a fib_table with id @id together with room for its trie, which
+ * is reached through tb->tb_data and starts out zeroed (empty).
+ * Returns NULL when the allocation fails.
+ */
+struct fib_table *fib_trie_table(u32 id)
+{
+	struct fib_table *tb = kmalloc(sizeof(*tb) + sizeof(struct trie),
+				       GFP_KERNEL);
+
+	if (!tb)
+		return NULL;
+
+	tb->tb_id = id;
+	tb->tb_default = -1;
+	tb->tb_num_default = 0;
+
+	memset(tb->tb_data, 0, sizeof(struct trie));
+
+	return tb;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Depth first Trie walk iterator, kept as the seq_file private data */
+struct fib_trie_iter {
+	struct seq_net_private p;	/* presumably required first by the seq_file net helpers - TODO confirm */
+	struct fib_table *tb;		/* table the last returned node belongs to */
+	struct tnode *tnode;		/* tnode currently being scanned; NULL if the trie is a lone leaf */
+	unsigned int index;		/* next child slot of tnode to look at */
+	unsigned int depth;		/* nesting depth of tnode; 0 for a lone leaf, 1 for the root tnode */
+};
+
+/*
+ * Advance the depth-first walk held in @iter and return the next node
+ * (leaf or tnode), or NULL once the walk has climbed past the root.
+ * A returned tnode becomes the node to scan on the following call.
+ */
+static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
+{
+	struct tnode *cur = iter->tnode;
+	unsigned int slot = iter->index;
+
+	/* A single entry routing table */
+	if (cur == NULL)
+		return NULL;
+
+	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
+		 iter->tnode, iter->index, iter->depth);
+
+	for (;;) {
+		struct tnode *parent;
+
+		for (; slot < (1<<cur->bits); ++slot) {
+			struct rt_trie_node *child = tnode_get_child_rcu(cur, slot);
+
+			if (!child)
+				continue;
+
+			if (IS_LEAF(child)) {
+				/* stay in this tnode, resume after the leaf */
+				iter->tnode = cur;
+				iter->index = slot + 1;
+			} else {
+				/* push down one level */
+				iter->tnode = (struct tnode *) child;
+				iter->index = 0;
+				++iter->depth;
+			}
+			return child;
+		}
+
+		/* Current node exhausted, pop back up */
+		parent = node_parent_rcu((struct rt_trie_node *)cur);
+		if (!parent)
+			return NULL;	/* got root */
+
+		slot = tkey_extract_bits(cur->key, parent->pos, parent->bits)+1;
+		cur = parent;
+		--iter->depth;
+	}
+}
+
+/*
+ * Point @iter at the start of trie @t and return the root node, or NULL
+ * when @t is NULL or empty. For a root tnode the walk state is set up at
+ * depth 1; for a lone leaf iter->tnode is NULL, so fib_trie_get_next()
+ * ends the walk immediately.
+ */
+static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
+				       struct trie *t)
+{
+	struct rt_trie_node *root;
+	int is_tnode;
+
+	if (t == NULL)
+		return NULL;
+
+	root = rcu_dereference(t->trie);
+	if (root == NULL)
+		return NULL;
+
+	is_tnode = IS_TNODE(root);
+
+	iter->tnode = is_tnode ? (struct tnode *) root : NULL;
+	iter->index = 0;
+	iter->depth = is_tnode ? 1 : 0;
+
+	return root;
+}
+
+/*
+ * Walk the whole trie @t under rcu_read_lock() and fill @s (zeroed first)
+ * with leaf, prefix and tnode counts, leaf depths, the tnode size
+ * histogram and the number of empty child slots.
+ */
+static void trie_collect_stats(struct trie *t, struct trie_stat *s)
+{
+	struct fib_trie_iter iter;
+	struct rt_trie_node *node;
+
+	memset(s, 0, sizeof(*s));
+
+	rcu_read_lock();
+	node = fib_trie_get_first(&iter, t);
+	while (node) {
+		if (!IS_LEAF(node)) {
+			const struct tnode *tn = (const struct tnode *) node;
+			int slot;
+
+			s->tnodes++;
+			/* sizes beyond the histogram are not recorded */
+			if (tn->bits < MAX_STAT_DEPTH)
+				s->nodesizes[tn->bits]++;
+
+			for (slot = 0; slot < (1<<tn->bits); slot++) {
+				if (!tn->child[slot])
+					s->nullpointers++;
+			}
+		} else {
+			struct leaf *l = (struct leaf *)node;
+			struct leaf_info *li;
+			struct hlist_node *pos;
+
+			s->leaves++;
+			s->totdepth += iter.depth;
+			if (s->maxdepth < iter.depth)
+				s->maxdepth = iter.depth;
+
+			/* one prefix per leaf_info hanging off the leaf */
+			hlist_for_each_entry_rcu(li, pos, &l->list, hlist)
+				++s->prefixes;
+		}
+
+		node = fib_trie_get_next(&iter);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ *	This outputs /proc/net/fib_triestats
+ *
+ *	Prints the figures gathered in @stat to @seq: average and maximum
+ *	leaf depth, leaf/prefix/tnode counts, the histogram of tnode sizes,
+ *	the pointer totals and an estimate of the memory used.
+ */
+static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
+{
+	unsigned int i, max, pointers, bytes, avdepth;
+
+	/* average depth scaled by 100 to print two decimals */
+	if (stat->leaves)
+		avdepth = stat->totdepth*100 / stat->leaves;
+	else
+		avdepth = 0;
+
+	seq_printf(seq, "\tAver depth:     %u.%02d\n",
+		   avdepth / 100, avdepth % 100);
+	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
+
+	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
+	bytes = sizeof(struct leaf) * stat->leaves;
+
+	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
+	bytes += sizeof(struct leaf_info) * stat->prefixes;
+
+	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
+	bytes += sizeof(struct tnode) * stat->tnodes;
+
+	/* max ends up one past the highest non-empty histogram bucket */
+	max = MAX_STAT_DEPTH;
+	while (max > 0 && stat->nodesizes[max-1] == 0)
+		max--;
+
+	pointers = 0;
+	/*
+	 * Stop before max: nodesizes[max] is either known to be zero or,
+	 * when max == MAX_STAT_DEPTH, lies outside the buckets that
+	 * trie_collect_stats() fills (indices below MAX_STAT_DEPTH).
+	 */
+	for (i = 1; i < max; i++)
+		if (stat->nodesizes[i] != 0) {
+			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
+			pointers += (1<<i) * stat->nodesizes[i];
+		}
+	seq_putc(seq, '\n');
+	seq_printf(seq, "\tPointers: %u\n", pointers);
+
+	bytes += sizeof(struct rt_trie_node *) * pointers;
+	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
+	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
+}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+/*
+ * Print the lookup usage counters held in @stats to @seq.
+ */
+static void trie_show_usage(struct seq_file *seq,
+			    const struct trie_use_stats *stats)
+{
+	seq_puts(seq, "\nCounters:\n---------\n");
+
+	seq_printf(seq, "gets = %u\n", stats->gets);
+	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+	seq_printf(seq, "semantic match passed = %u\n",
+		   stats->semantic_match_passed);
+	seq_printf(seq, "semantic match miss = %u\n",
+		   stats->semantic_match_miss);
+	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
+	seq_printf(seq, "skipped node resize = %u\n\n",
+		   stats->resize_node_skipped);
+}
+#endif /*  CONFIG_IP_FIB_TRIE_STATS */
+
+/*
+ * Print a heading for table @tb: "Local:" or "Main:" for the two
+ * well-known tables, the numeric table id for any other.
+ */
+static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
+{
+	switch (tb->tb_id) {
+	case RT_TABLE_LOCAL:
+		seq_puts(seq, "Local:\n");
+		break;
+	case RT_TABLE_MAIN:
+		seq_puts(seq, "Main:\n");
+		break;
+	default:
+		seq_printf(seq, "Id %d:\n", tb->tb_id);
+		break;
+	}
+}
+
+
+/*
+ * seq_file show routine for the trie statistics: prints the basic node
+ * sizes once, then the collected statistics (and, with
+ * CONFIG_IP_FIB_TRIE_STATS, the usage counters) of every table in the
+ * namespace's table hash. Always returns 0.
+ */
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = (struct net *)seq->private;
+	unsigned int bucket;
+
+	seq_printf(seq,
+		   "Basic info: size of leaf:"
+		   " %Zd bytes, size of tnode: %Zd bytes.\n",
+		   sizeof(struct leaf), sizeof(struct tnode));
+
+	for (bucket = 0; bucket < FIB_TABLE_HASHSZ; bucket++) {
+		struct hlist_node *pos;
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, pos,
+					 &net->ipv4.fib_table_hash[bucket],
+					 tb_hlist) {
+			struct trie *t = (struct trie *) tb->tb_data;
+			struct trie_stat stat;
+
+			if (t == NULL)
+				continue;
+
+			fib_table_print(seq, tb);
+
+			trie_collect_stats(t, &stat);
+			trie_show_stats(seq, &stat);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			trie_show_usage(seq, &t->stats);
+#endif
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * open() handler of the statistics file: hands fib_triestat_seq_show to
+ * single_open_net() and returns its result.
+ */
+static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, fib_triestat_seq_show);
+}
+
+/*
+ * File operations of the trie statistics file: opened through
+ * fib_triestat_seq_open(), read and seeked with the generic seq_file
+ * helpers, released with single_release_net().
+ */
+static const struct file_operations fib_triestat_fops = {
+	.owner	= THIS_MODULE,
+	.open	= fib_triestat_seq_open,
+	.read	= seq_read,
+	.llseek	= seq_lseek,
+	.release = single_release_net,
+};
+
+/*
+ * Return the node at zero-based position @pos when the tries of all tables
+ * in the namespace are walked one after another, or NULL when there are
+ * fewer nodes. On success the iterator in seq->private is positioned on
+ * that node and iter->tb names its table.
+ */
+static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct fib_trie_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	unsigned int bucket;
+	loff_t seen = 0;
+
+	for (bucket = 0; bucket < FIB_TABLE_HASHSZ; bucket++) {
+		struct hlist_node *cursor;
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, cursor,
+					 &net->ipv4.fib_table_hash[bucket],
+					 tb_hlist) {
+			struct rt_trie_node *n;
+
+			n = fib_trie_get_first(iter,
+					       (struct trie *) tb->tb_data);
+			while (n) {
+				if (seen++ == pos) {
+					iter->tb = tb;
+					return n;
+				}
+				n = fib_trie_get_next(iter);
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file ->start: enter the RCU read-side section (left again in
+ * fib_trie_seq_stop()) and return the node at position *pos, or NULL when
+ * there is none.
+ */
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return fib_trie_get_idx(seq, *pos);
+}
+
+/*
+ * seq_file ->next: increment *pos and return the node following the one
+ * last returned. The walk continues inside the current table first, then
+ * through the remaining tables of the same hash chain, then through the
+ * following hash chains. Returns NULL when no table has a node left.
+ */
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct fib_trie_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct fib_table *tb = iter->tb;
+	struct hlist_node *tb_node;
+	unsigned int h;
+	struct rt_trie_node *n;
+
+	++*pos;
+	/* next node in same table */
+	n = fib_trie_get_next(iter);
+	if (n)
+		return n;
+
+	/* walk rest of this hash chain */
+	/* the chain index is derived from the id of the current table */
+	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
+	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
+		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
+		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+		if (n)
+			goto found;
+	}
+
+	/* new hash chain */
+	while (++h < FIB_TABLE_HASHSZ) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
+			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
+			if (n)
+				goto found;
+		}
+	}
+	return NULL;
+
+found:
+	/* remember which table the returned node belongs to */
+	iter->tb = tb;
+	return n;
+}
+
+/* seq_file ->stop: leave the RCU read-side section taken in ->start. */
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/* Emit @n indentation steps of three spaces each; nothing when @n <= 0. */
+static void seq_indent(struct seq_file *seq, int n)
+{
+	int level;
+
+	for (level = 0; level < n; level++)
+		seq_puts(seq, "   ");
+}
+
+/*
+ * Map a route scope to a printable name.  Known scopes yield a string
+ * constant; any other value is formatted as "scope=<n>" into @buf (at most
+ * @len bytes) and @buf is returned.
+ */
+static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
+{
+	const char *name;
+
+	switch (s) {
+	case RT_SCOPE_UNIVERSE:
+		name = "universe";
+		break;
+	case RT_SCOPE_SITE:
+		name = "site";
+		break;
+	case RT_SCOPE_LINK:
+		name = "link";
+		break;
+	case RT_SCOPE_HOST:
+		name = "host";
+		break;
+	case RT_SCOPE_NOWHERE:
+		name = "nowhere";
+		break;
+	default:
+		snprintf(buf, len, "scope=%d", s);
+		name = buf;
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Printable names for route types, indexed by RTN_* value.  Slots without an
+ * initializer stay NULL; rtn_type() falls back to a numeric form for those.
+ */
+static const char *const rtn_type_names[__RTN_MAX] = {
+	[RTN_UNSPEC] = "UNSPEC",
+	[RTN_UNICAST] = "UNICAST",
+	[RTN_LOCAL] = "LOCAL",
+	[RTN_BROADCAST] = "BROADCAST",
+	[RTN_ANYCAST] = "ANYCAST",
+	[RTN_MULTICAST] = "MULTICAST",
+	[RTN_BLACKHOLE] = "BLACKHOLE",
+	[RTN_UNREACHABLE] = "UNREACHABLE",
+	[RTN_PROHIBIT] = "PROHIBIT",
+	[RTN_THROW] = "THROW",
+	[RTN_NAT] = "NAT",
+	[RTN_XRESOLVE] = "XRESOLVE",
+};
+
+/*
+ * Map a route type to a printable name.  Types present in rtn_type_names[]
+ * yield the table entry; anything else is formatted as "type <n>" into @buf
+ * (at most @len bytes) and @buf is returned.
+ */
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
+{
+	const char *name = NULL;
+
+	if (t < __RTN_MAX)
+		name = rtn_type_names[t];
+
+	if (!name) {
+		snprintf(buf, len, "type %u", t);
+		name = buf;
+	}
+
+	return name;
+}
+
+/*
+ * seq_file ->show for the trie dump: print one trie node per call.
+ * Internal nodes are shown as "+-- prefix/pos bits full empty"; leaves as
+ * "|-- address" followed by one "/plen scope type [tos=N]" line per alias.
+ */
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+	const struct fib_trie_iter *iter = seq->private;
+	struct rt_trie_node *n = v;
+	struct leaf_info *li;
+	struct hlist_node *cursor;
+	struct leaf *l;
+	__be32 addr;
+
+	/* Parentless node: announce the table it belongs to first. */
+	if (!node_parent_rcu(n))
+		fib_table_print(seq, iter->tb);
+
+	if (IS_TNODE(n)) {
+		struct tnode *tn = (struct tnode *) n;
+
+		addr = htonl(mask_pfx(tn->key, tn->pos));
+		seq_indent(seq, iter->depth-1);
+		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
+			   &addr, tn->pos, tn->bits, tn->full_children,
+			   tn->empty_children);
+		return 0;
+	}
+
+	l = (struct leaf *) n;
+	addr = htonl(l->key);
+
+	seq_indent(seq, iter->depth);
+	seq_printf(seq, "  |-- %pI4\n", &addr);
+
+	hlist_for_each_entry_rcu(li, cursor, &l->list, hlist) {
+		struct fib_alias *fa;
+
+		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+			char scope_buf[32], type_buf[32];
+			const char *scope_name, *type_name;
+
+			seq_indent(seq, iter->depth+1);
+
+			scope_name = rtn_scope(scope_buf, sizeof(scope_buf),
+					       fa->fa_info->fib_scope);
+			type_name = rtn_type(type_buf, sizeof(type_buf),
+					     fa->fa_type);
+			seq_printf(seq, "  /%d %s %s", li->plen,
+				   scope_name, type_name);
+
+			if (fa->fa_tos)
+				seq_printf(seq, " tos=%d", fa->fa_tos);
+			seq_putc(seq, '\n');
+		}
+	}
+
+	return 0;
+}
+
+/* seq_file iterator callbacks for the trie dump; start/stop bracket RCU. */
+static const struct seq_operations fib_trie_seq_ops = {
+	.start  = fib_trie_seq_start,
+	.next   = fib_trie_seq_next,
+	.stop   = fib_trie_seq_stop,
+	.show   = fib_trie_seq_show,
+};
+
+/*
+ * open() handler behind fib_trie_fops: creates a seq_file driven by
+ * fib_trie_seq_ops with a struct fib_trie_iter sized private area.
+ */
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &fib_trie_seq_ops,
+			    sizeof(struct fib_trie_iter));
+}
+
+/*
+ * File operations for the "fib_trie" proc entry registered in
+ * fib_proc_init().  Opened with seq_open_net(), hence released with
+ * seq_release_net().
+ */
+static const struct file_operations fib_trie_fops = {
+	.owner  = THIS_MODULE,
+	.open   = fib_trie_seq_open,
+	.read   = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * Iterator state for the "route" proc file; allocated by seq_open_net() in
+ * fib_route_seq_open() and reached through seq->private.
+ */
+struct fib_route_iter {
+	/* NOTE(review): presumably has to stay first for seq_open_net() - confirm */
+	struct seq_net_private p;
+	/* trie of RT_TABLE_MAIN, refreshed on every ->start */
+	struct trie *main_trie;
+	/* leaf index the iterator is currently at; 0 also means "no cache" */
+	loff_t	pos;
+	/* leaf key cached so a later ->start can resume via fib_find_node() */
+	t_key	key;
+};
+
+/*
+ * Return the leaf at zero-based position @pos of the main trie, or NULL if
+ * the trie holds fewer leaves.
+ *
+ * iter->pos and iter->key cache the index and key of the last leaf handed
+ * out, so that a read resuming at or beyond that index can restart from
+ * fib_find_node() instead of walking from the first leaf again.  If the
+ * cached key can no longer be found, the walk restarts from the beginning.
+ */
+static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+{
+	struct leaf *l = NULL;
+	struct trie *t = iter->main_trie;
+
+	/* use cache location of last found key */
+	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
+		pos -= iter->pos;
+	else {
+		iter->pos = 0;
+		l = trie_firstleaf(t);
+	}
+
+	/* Advance the remaining distance, keeping iter->pos in step. */
+	while (l && pos-- > 0) {
+		iter->pos++;
+		l = trie_nextleaf(l);
+	}
+
+	/*
+	 * Cache the key of the leaf we stopped on.  The loop counter is
+	 * exhausted at this point and must not be stored here, otherwise
+	 * fib_find_node() could never locate this leaf on the next call.
+	 */
+	if (l)
+		iter->key = l->key;	/* remember it */
+	else
+		iter->pos = 0;		/* forget it */
+
+	return l;
+}
+
+/*
+ * seq_file ->start for the route dump.  Takes the RCU read lock (dropped in
+ * fib_route_seq_stop()), looks up the main table and returns the start token
+ * for position 0, otherwise the leaf at index *pos - 1.  Returns NULL when
+ * the main table does not exist.
+ */
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct fib_table *main_tb;
+
+	rcu_read_lock();
+
+	main_tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+	if (main_tb == NULL)
+		return NULL;
+
+	iter->main_trie = (struct trie *) main_tb->tb_data;
+
+	/* Position 0 is the header line; leaves start at position 1. */
+	if (*pos != 0)
+		return fib_route_get_idx(iter, *pos - 1);
+
+	return SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next for the route dump: move from the header token to the
+ * first leaf, or from a leaf to its successor, keeping the iterator's cached
+ * index and key up to date.  The cache is reset when the end is reached.
+ */
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct leaf *succ;
+
+	++*pos;
+
+	if (v != SEQ_START_TOKEN) {
+		struct leaf *cur = v;
+
+		iter->pos++;
+		succ = trie_nextleaf(cur);
+	} else {
+		iter->pos = 0;
+		succ = trie_firstleaf(iter->main_trie);
+	}
+
+	if (!succ) {
+		iter->pos = 0;
+		return NULL;
+	}
+
+	iter->key = succ->key;
+	return succ;
+}
+
+/* seq_file ->stop: drop the RCU read lock taken by fib_route_seq_start(). */
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/*
+ * Build the RTF_* flag word shown in the route dump: always RTF_UP, plus
+ * RTF_REJECT for unreachable/prohibit routes, RTF_GATEWAY when the first
+ * nexthop has a gateway, and RTF_HOST for an all-ones mask.
+ */
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+{
+	unsigned int flags = RTF_UP;
+
+	switch (type) {
+	case RTN_UNREACHABLE:
+	case RTN_PROHIBIT:
+		flags |= RTF_REJECT;
+		break;
+	default:
+		break;
+	}
+
+	if (fi && fi->fib_nh->nh_gw)
+		flags |= RTF_GATEWAY;
+
+	if (mask == htonl(0xFFFFFFFF))
+		flags |= RTF_HOST;
+
+	return flags;
+}
+
+/*
+ *	This outputs /proc/net/route.
+ *	The format of the file is not supposed to be changed
+ *	and needs to be same as fib_hash output to avoid breaking
+ *	legacy utilities
+ *
+ *	@v is either SEQ_START_TOKEN (emit the column header) or a leaf of
+ *	the main trie (emit one line per alias hanging off that leaf).
+ *	Every line, header included, is padded to 127 characters plus '\n'.
+ */
+static int fib_route_seq_show(struct seq_file *seq, void *v)
+{
+	struct leaf *l = v;
+	struct leaf_info *li;
+	struct hlist_node *node;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+			   "\tWindow\tIRTT");
+		return 0;
+	}
+
+	/* One leaf_info per prefix length stored under this key. */
+	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
+		struct fib_alias *fa;
+		__be32 mask, prefix;
+
+		mask = inet_make_mask(li->plen);
+		prefix = htonl(l->key);
+
+		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+			const struct fib_info *fi = fa->fa_info;
+			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+			int len;
+
+			/* Broadcast and multicast aliases are not listed. */
+			if (fa->fa_type == RTN_BROADCAST
+			    || fa->fa_type == RTN_MULTICAST)
+				continue;
+
+			/*
+			 * %n stores the number of characters written so far
+			 * in len; it is used below to pad the record.
+			 */
+			if (fi)
+				seq_printf(seq,
+					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+					 "%d\t%08X\t%d\t%u\t%u%n",
+					 fi->fib_dev ? fi->fib_dev->name : "*",
+					 prefix,
+					 fi->fib_nh->nh_gw, flags, 0, 0,
+					 fi->fib_priority,
+					 mask,
+					 (fi->fib_advmss ?
+					  fi->fib_advmss + 40 : 0),
+					 fi->fib_window,
+					 fi->fib_rtt >> 3, &len);
+			else
+				seq_printf(seq,
+					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+					 "%d\t%08X\t%d\t%u\t%u%n",
+					 prefix, 0, flags, 0, 0, 0,
+					 mask, 0, 0, 0, &len);
+
+			/* Pad with blanks up to column 127, then newline. */
+			seq_printf(seq, "%*s\n", 127 - len, "");
+		}
+	}
+
+	return 0;
+}
+
+/* seq_file iterator callbacks for the route dump; start/stop bracket RCU. */
+static const struct seq_operations fib_route_seq_ops = {
+	.start  = fib_route_seq_start,
+	.next   = fib_route_seq_next,
+	.stop   = fib_route_seq_stop,
+	.show   = fib_route_seq_show,
+};
+
+/*
+ * open() handler behind fib_route_fops: creates a seq_file driven by
+ * fib_route_seq_ops with a struct fib_route_iter sized private area.
+ */
+static int fib_route_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &fib_route_seq_ops,
+			    sizeof(struct fib_route_iter));
+}
+
+/*
+ * File operations for the "route" proc entry registered in fib_proc_init().
+ * Opened with seq_open_net(), hence released with seq_release_net().
+ */
+static const struct file_operations fib_route_fops = {
+	.owner  = THIS_MODULE,
+	.open   = fib_route_seq_open,
+	.read   = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * Create the per-net proc entries "fib_trie", "fib_triestat" and "route".
+ * Returns 0 on success.  If any entry cannot be created, the ones already
+ * registered are removed again and -ENOMEM is returned.
+ */
+int __net_init fib_proc_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
+		return -ENOMEM;
+
+	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
+				  &fib_triestat_fops))
+		goto undo_trie;
+
+	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
+		goto undo_triestat;
+
+	return 0;
+
+undo_triestat:
+	proc_net_remove(net, "fib_triestat");
+undo_trie:
+	proc_net_remove(net, "fib_trie");
+	return -ENOMEM;
+}
+
+/* Remove the three proc entries created by fib_proc_init() for @net. */
+void __net_exit fib_proc_exit(struct net *net)
+{
+	proc_net_remove(net, "fib_trie");
+	proc_net_remove(net, "fib_triestat");
+	proc_net_remove(net, "route");
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 00000000..3e3f75d9
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,143 @@
+/*
+ *	GRE over IPv4 demultiplexer driver
+ *
+ *	Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+
+/*
+ * Registered GRE handlers, one slot per GRE version.  Readers access the
+ * slots under rcu_read_lock(); writers serialize on gre_proto_lock.
+ */
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static DEFINE_SPINLOCK(gre_proto_lock);
+
+/*
+ * Register @proto as the handler for GRE version @version.
+ *
+ * Returns 0 on success, or -1 if @version is out of range or a handler is
+ * already registered for that version.
+ */
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+	int ret = -1;
+
+	if (version >= GREPROTO_MAX)
+		return ret;
+
+	spin_lock(&gre_proto_lock);
+	/*
+	 * The slot is an __rcu pointer: read it through the RCU accessor,
+	 * stating that gre_proto_lock is what protects it here, exactly as
+	 * gre_del_protocol() does.
+	 */
+	if (!rcu_dereference_protected(gre_proto[version],
+				       lockdep_is_held(&gre_proto_lock))) {
+		rcu_assign_pointer(gre_proto[version], proto);
+		ret = 0;
+	}
+	spin_unlock(&gre_proto_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+/*
+ * Unregister @proto as the handler for GRE version @version.
+ *
+ * Returns 0 on success after waiting for an RCU grace period, or -1 if
+ * @version is out of range or @proto is not the registered handler.
+ */
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+	const struct gre_protocol *registered;
+
+	if (version >= GREPROTO_MAX)
+		return -1;
+
+	spin_lock(&gre_proto_lock);
+	registered = rcu_dereference_protected(gre_proto[version],
+			lockdep_is_held(&gre_proto_lock));
+	if (registered != proto) {
+		spin_unlock(&gre_proto_lock);
+		return -1;
+	}
+	rcu_assign_pointer(gre_proto[version], NULL);
+	spin_unlock(&gre_proto_lock);
+
+	/* Let readers that may still hold the old pointer finish. */
+	synchronize_rcu();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+/*
+ * Receive hook for IPPROTO_GRE: pick the handler registered for the version
+ * found in the low seven bits of the second header byte and pass the packet
+ * on.  Packets that are too short, carry an unsupported version or have no
+ * handler are freed and NET_RX_DROP is returned.
+ */
+static int gre_rcv(struct sk_buff *skb)
+{
+	const struct gre_protocol *proto;
+	u8 ver;
+	int ret;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+
+	ver = skb->data[1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (proto && proto->handler) {
+		ret = proto->handler(skb);
+		rcu_read_unlock();
+		return ret;
+	}
+	rcu_read_unlock();
+
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * Error hook for IPPROTO_GRE: skb->data points at an IP header; the GRE
+ * header follows it.  Forward the error to the err_handler registered for
+ * the GRE version found there, if any.
+ */
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct gre_protocol *proto;
+	u8 ver;
+
+	/* Second byte of the GRE header, right after the IP header. */
+	ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		return;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (proto != NULL && proto->err_handler != NULL)
+		proto->err_handler(skb, info);
+	rcu_read_unlock();
+}
+
+/* inet protocol hooks registered for IPPROTO_GRE by gre_init(). */
+static const struct net_protocol net_gre_protocol = {
+	.handler     = gre_rcv,
+	.err_handler = gre_err,
+	.netns_ok    = 1,
+};
+
+/*
+ * Module init: hook the demultiplexer into the IPv4 stack as the handler
+ * for IPPROTO_GRE.  Returns 0 on success or -EAGAIN if the protocol slot
+ * could not be claimed.
+ */
+static int __init gre_init(void)
+{
+	/* Terminate the message so the log line is not left open. */
+	pr_info("GRE over IPv4 demultiplexor driver\n");
+
+	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+		pr_err("gre: can't add protocol\n");
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+/* Module exit: undo the IPPROTO_GRE registration made by gre_init(). */
+static void __exit gre_exit(void)
+{
+	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
+
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 00000000..23ef31ba
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1207 @@
+/*
+ *	NET3:	Implementation of the ICMP protocol layer.
+ *
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Some of the function names and the icmp unreach table for this
+ *	module were derived from [icmp.c 1.0.11 06/02/93] by
+ *	Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ *	Other than that this module is a complete rewrite.
+ *
+ *	Fixes:
+ *	Clemens Fruhwirth	:	introduce global icmp rate limiting
+ *					with icmp type masking ability instead
+ *					of broken per type icmp timeouts.
+ *		Mike Shaver	:	RFC1122 checks.
+ *		Alan Cox	:	Multicast ping reply as self.
+ *		Alan Cox	:	Fix atomicity lockup in ip_build_xmit
+ *					call.
+ *		Alan Cox	:	Added 216,128 byte paths to the MTU
+ *					code.
+ *		Martin Mares	:	RFC1812 checks.
+ *		Martin Mares	:	Can be configured to follow redirects
+ *					if acting as a router _without_ a
+ *					routing protocol (RFC 1812).
+ *		Martin Mares	:	Echo requests may be configured to
+ *					be ignored (RFC 1812).
+ *		Martin Mares	:	Limitation of ICMP error message
+ *					transmit rate (RFC 1812).
+ *		Martin Mares	:	TOS and Precedence set correctly
+ *					(RFC 1812).
+ *		Martin Mares	:	Now copying as much data from the
+ *					original packet as we can without
+ *					exceeding 576 bytes (RFC 1812).
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		Keith Owens	:	RFC1191 correction for 4.2BSD based
+ *					path MTU bug.
+ *		Thomas Quinot	:	ICMP Dest Unreach codes up to 15 are
+ *					valid (RFC 1812).
+ *		Andi Kleen	:	Check all packet lengths properly
+ *					and moved all kfree_skb() up to
+ *					icmp_rcv.
+ *		Andi Kleen	:	Move the rate limit bookkeeping
+ *					into the dest entry and use a token
+ *					bucket filter (thanks to ANK). Make
+ *					the rates sysctl configurable.
+ *		Yu Tianli	:	Fixed two ugly bugs in icmp_send
+ *					- IP option length was accounted wrongly
+ *					- ICMP header length was not accounted
+ *					  at all.
+ *              Tristan Greaves :       Added sysctl option to ignore bogus
+ *              			broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ *	- Should use skb_pull() instead of all the manual checking.
+ *	  This would also greatly simplify some upper layer error handlers. --AK
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/ping.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+
+/*
+ *	Build xmit assembly blocks
+ *
+ *	Describes one outgoing ICMP message: the header to emit (data,
+ *	head_len bytes of it) and the slice of a source skb to append
+ *	as payload (skb, offset, data_len).
+ */
+
+struct icmp_bxm {
+	struct sk_buff *skb;	/* packet the payload is copied from */
+	int offset;		/* where in skb the copied payload starts */
+	int data_len;		/* number of payload bytes to copy */
+
+	struct {
+		struct icmphdr icmph;	/* ICMP header of the message */
+		__be32	       times[3]; /* timestamps, used by timestamp replies */
+	} data;
+	int head_len;		/* bytes of 'data' that are sent as header */
+	struct ip_options_data replyopts; /* IP options echoed into the reply */
+};
+
+/*
+ * An array of errno for error messages from dest unreach.  Entries are
+ * listed in ICMP unreach code order (see the per-entry comments); .fatal
+ * is 0 for the codes treated as transient.
+ *
+ * RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST
+ * be considered 'transient errs'.
+ */
+
+const struct icmp_err icmp_err_convert[] = {
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNREACH */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_UNREACH */
+		.fatal = 0,
+	},
+	{
+		.errno = ENOPROTOOPT	/* ICMP_PROT_UNREACH */,
+		.fatal = 1,
+	},
+	{
+		.errno = ECONNREFUSED,	/* ICMP_PORT_UNREACH */
+		.fatal = 1,
+	},
+	{
+		.errno = EMSGSIZE,	/* ICMP_FRAG_NEEDED */
+		.fatal = 0,
+	},
+	{
+		.errno = EOPNOTSUPP,	/* ICMP_SR_FAILED */
+		.fatal = 0,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNKNOWN */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTDOWN,	/* ICMP_HOST_UNKNOWN */
+		.fatal = 1,
+	},
+	{
+		.errno = ENONET,	/* ICMP_HOST_ISOLATED */
+		.fatal = 1,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_ANO	*/
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_ANO */
+		.fatal = 1,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNR_TOS */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_UNR_TOS */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PKT_FILTERED */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PREC_VIOLATION */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PREC_CUTOFF */
+		.fatal = 1,
+	},
+};
+EXPORT_SYMBOL(icmp_err_convert);
+
+/*
+ *	ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+	void (*handler)(struct sk_buff *skb);	/* per-type handler taking the skb */
+	short   error;		/* This ICMP is classed as an error message */
+};
+
+/*
+ * Forward declaration; the table is indexed by ICMP type by the users in
+ * this file.  NOTE(review): the initializer is presumably further down in
+ * this file - confirm.
+ */
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+
+/*
+ *	The ICMP socket(s). This is the most convenient way to flow control
+ *	our ICMP output as well as maintain a clean interface throughout
+ *	all layers. All Socketless IP sends will soon be gone.
+ *
+ *	On SMP we have one ICMP socket per-cpu.
+ *
+ *	Returns the socket of @net that belongs to the CPU we are running
+ *	on (looked up with smp_processor_id()).
+ */
+static struct sock *icmp_sk(struct net *net)
+{
+	return net->ipv4.icmp_sk[smp_processor_id()];
+}
+
+/*
+ * Disable bottom halves and try to take the lock of this CPU's ICMP socket.
+ * Returns the locked socket, or NULL (with bottom halves re-enabled) when
+ * the lock is already held.  Undo with icmp_xmit_unlock().
+ */
+static inline struct sock *icmp_xmit_lock(struct net *net)
+{
+	struct sock *sk;
+
+	local_bh_disable();
+
+	sk = icmp_sk(net);
+	if (likely(spin_trylock(&sk->sk_lock.slock)))
+		return sk;
+
+	/* This can happen if the output path signals a
+	 * dst_link_failure() for an outgoing ICMP packet.
+	 */
+	local_bh_enable();
+	return NULL;
+}
+
+/*
+ * Release the socket lock taken by icmp_xmit_lock() and re-enable bottom
+ * halves.
+ */
+static inline void icmp_xmit_unlock(struct sock *sk)
+{
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
+/*
+ *	Send an ICMP frame.
+ */
+
+/*
+ * Decide whether an ICMP message of @type/@code may be sent over @rt right
+ * now.  Returns true when sending is allowed.  Only types selected by the
+ * per-net ratemask are subject to the per-peer limit; unknown types, PMTU
+ * discovery messages and loopback traffic are never limited.
+ */
+static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+				      struct flowi4 *fl4, int type, int code)
+{
+	struct dst_entry *dst = &rt->dst;
+
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* No rate limit on loopback */
+	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+		return true;
+
+	/* Types outside the ratemask are not limited. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	if (!rt->peer)
+		rt_bind_peer(rt, fl4->daddr, 1);
+
+	return inet_peer_xrlim_allow(rt->peer,
+				     net->ipv4.sysctl_icmp_ratelimit);
+}
+
+/*
+ *	Maintain the counters used in the SNMP statistics for outgoing ICMP
+ *
+ *	Bumps the per-type outgoing message counter for @type and the
+ *	overall ICMP_MIB_OUTMSGS counter of @net.
+ */
+void icmp_out_count(struct net *net, unsigned char type)
+{
+	ICMPMSGOUT_INC_STATS(net, type);
+	ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
+}
+
+/*
+ *	Fragment fill callback handed to ip_append_data(): copy @len payload
+ *	bytes of the source packet into @to and fold their checksum into the
+ *	fragment's running checksum.  For ICMP types classed as errors the
+ *	conntrack state of the source packet is attached to the new skb.
+ *	Always returns 0.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+			  struct sk_buff *skb)
+{
+	struct icmp_bxm *bxm = from;
+	__wsum part;
+
+	part = skb_copy_and_csum_bits(bxm->skb, bxm->offset + offset,
+				      to, len, 0);
+	skb->csum = csum_block_add(skb->csum, part, odd);
+
+	if (icmp_pointers[bxm->data.icmph.type].error)
+		nf_ct_attach(skb, bxm->skb);
+
+	return 0;
+}
+
+/*
+ * Queue the message described by @icmp_param on this CPU's ICMP socket,
+ * finish the ICMP header (including its checksum) in the first queued
+ * buffer and push the pending frames out.  On a queuing failure the output
+ * error counter is bumped and the pending frames are flushed.
+ */
+static void icmp_push_reply(struct icmp_bxm *icmp_param,
+			    struct flowi4 *fl4,
+			    struct ipcm_cookie *ipc, struct rtable **rt)
+{
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	sk = icmp_sk(dev_net((*rt)->dst.dev));
+	if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
+			   icmp_param->data_len+icmp_param->head_len,
+			   icmp_param->head_len,
+			   ipc, rt, MSG_DONTWAIT) < 0) {
+		ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
+		ip_flush_pending_frames(sk);
+	} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+		struct icmphdr *icmph = icmp_hdr(skb);
+		__wsum csum = 0;
+		struct sk_buff *skb1;
+
+		/* Sum the payload checksums collected by icmp_glue_bits(). */
+		skb_queue_walk(&sk->sk_write_queue, skb1) {
+			csum = csum_add(csum, skb1->csum);
+		}
+		/* Copy the prepared header in place, adding it to the sum. */
+		csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+						 (char *)icmph,
+						 icmp_param->head_len, csum);
+		icmph->checksum = csum_fold(csum);
+		skb->ip_summed = CHECKSUM_NONE;
+		ip_push_pending_frames(sk, fl4);
+	}
+}
+
+/*
+ *	Driving logic for building and sending ICMP messages.
+ *
+ *	Sends the reply described by @icmp_param back to the sender of
+ *	@skb: echoes the IP options, routes towards the source address
+ *	(or the source-route next hop), applies rate limiting and pushes
+ *	the message out.  Silently gives up if the options cannot be
+ *	echoed, the ICMP socket is busy or no route is found.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+	struct ipcm_cookie ipc;
+	struct rtable *rt = skb_rtable(skb);
+	struct net *net = dev_net(rt->dst.dev);
+	struct flowi4 fl4;
+	struct sock *sk;
+	struct inet_sock *inet;
+	__be32 daddr;
+
+	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
+		return;
+
+	sk = icmp_xmit_lock(net);
+	if (sk == NULL)
+		return;
+	inet = inet_sk(sk);
+
+	/* Checksum is computed later, in icmp_push_reply(). */
+	icmp_param->data.icmph.checksum = 0;
+
+	inet->tos = ip_hdr(skb)->tos;
+	daddr = ipc.addr = ip_hdr(skb)->saddr;
+	ipc.opt = NULL;
+	ipc.tx_flags = 0;
+	if (icmp_param->replyopts.opt.opt.optlen) {
+		ipc.opt = &icmp_param->replyopts.opt;
+		/* With a source route, send to its first hop instead. */
+		if (ipc.opt->opt.srr)
+			daddr = icmp_param->replyopts.opt.opt.faddr;
+	}
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = daddr;
+	fl4.saddr = rt->rt_spec_dst;
+	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+	fl4.flowi4_proto = IPPROTO_ICMP;
+	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+	/* From here on rt is the output route of the reply. */
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		goto out_unlock;
+	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
+			       icmp_param->data.icmph.code))
+		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+	ip_rt_put(rt);
+out_unlock:
+	icmp_xmit_unlock(sk);
+}
+
+/*
+ * Find the route over which an ICMP message answering @skb_in is sent.
+ *
+ * @fl4 is filled with the flow key of the reply: destination is the sender
+ * of the original packet (or the source-route first hop from @param),
+ * source is @saddr.  A plain output route is looked up and passed through
+ * xfrm_lookup().  If that yields a different (transformed) route, it is
+ * returned.  Otherwise, or if policy answered -EPERM, the flow of the
+ * original packet is decoded in reverse and a second lookup with
+ * XFRM_LOOKUP_ICMP is attempted; on success that route replaces the first
+ * one and @fl4 is overwritten with the decoded flow.
+ *
+ * Returns the route to use, or an ERR_PTR() on failure.  icmp_send()
+ * releases the returned route with ip_rt_put().
+ */
+static struct rtable *icmp_route_lookup(struct net *net,
+					struct flowi4 *fl4,
+					struct sk_buff *skb_in,
+					const struct iphdr *iph,
+					__be32 saddr, u8 tos,
+					int type, int code,
+					struct icmp_bxm *param)
+{
+	struct rtable *rt, *rt2;
+	struct flowi4 fl4_dec;
+	int err;
+
+	memset(fl4, 0, sizeof(*fl4));
+	fl4->daddr = (param->replyopts.opt.opt.srr ?
+		      param->replyopts.opt.opt.faddr : iph->saddr);
+	fl4->saddr = saddr;
+	fl4->flowi4_tos = RT_TOS(tos);
+	fl4->flowi4_proto = IPPROTO_ICMP;
+	fl4->fl4_icmp_type = type;
+	fl4->fl4_icmp_code = code;
+	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+	rt = __ip_route_output_key(net, fl4);
+	if (IS_ERR(rt))
+		return rt;
+
+	/* No need to clone since we're just using its address. */
+	rt2 = rt;
+
+	rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
+					   flowi4_to_flowi(fl4), NULL, 0);
+	if (!IS_ERR(rt)) {
+		if (rt != rt2)
+			return rt;
+	} else if (PTR_ERR(rt) == -EPERM) {
+		/* Forbidden by policy: only the relookup can still succeed. */
+		rt = NULL;
+	} else
+		return rt;
+
+	err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+	if (err)
+		goto relookup_failed;
+
+	if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
+		rt2 = __ip_route_output_key(net, &fl4_dec);
+		if (IS_ERR(rt2))
+			err = PTR_ERR(rt2);
+	} else {
+		struct flowi4 fl4_2 = {};
+		unsigned long orefdst;
+
+		fl4_2.daddr = fl4_dec.saddr;
+		rt2 = ip_route_output_key(net, &fl4_2);
+		if (IS_ERR(rt2)) {
+			err = PTR_ERR(rt2);
+			goto relookup_failed;
+		}
+		/*
+		 * Ugh!  Run an input route lookup on skb_in to obtain the
+		 * route for the decoded flow, then put the packet's original
+		 * dst reference back.
+		 */
+		orefdst = skb_in->_skb_refdst; /* save old refdst */
+		err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
+				     RT_TOS(tos), rt2->dst.dev);
+
+		dst_release(&rt2->dst);
+		rt2 = skb_rtable(skb_in);
+		skb_in->_skb_refdst = orefdst; /* restore old refdst */
+	}
+
+	if (err)
+		goto relookup_failed;
+
+	rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
+					    flowi4_to_flowi(&fl4_dec), NULL,
+					    XFRM_LOOKUP_ICMP);
+	if (!IS_ERR(rt2)) {
+		/* rt is NULL here when the first lookup returned -EPERM. */
+		if (rt)
+			dst_release(&rt->dst);
+		memcpy(fl4, &fl4_dec, sizeof(*fl4));
+		rt = rt2;
+	} else if (PTR_ERR(rt2) == -EPERM) {
+		if (rt)
+			dst_release(&rt->dst);
+		return rt2;
+	} else {
+		err = PTR_ERR(rt2);
+		goto relookup_failed;
+	}
+	return rt;
+
+relookup_failed:
+	/* Fall back to the first route if there still is one. */
+	if (rt)
+		return rt;
+	return ERR_PTR(err);
+}
+
+/*
+ *	Send an ICMP message in response to a situation
+ *
+ *	RFC 1122: 3.2.2	MUST send at least the IP header and 8 bytes of header.
+ *		  MAY send more (we do).
+ *			MUST NOT change this header information.
+ *			MUST NOT reply to a multicast/broadcast IP address.
+ *			MUST NOT reply to a multicast/broadcast MAC address.
+ *			MUST reply to only the first fragment.
+ *
+ *	@skb_in: the packet that triggered the message; data starting at
+ *		 its network header is quoted in the ICMP payload
+ *	@type:	 ICMP type to send
+ *	@code:	 ICMP code to send
+ *	@info:	 value stored in the header's un.gateway field
+ *
+ *	Nothing is sent (and nothing is reported to the caller) when any
+ *	of the checks below fails, the ICMP socket is busy, no route is
+ *	found or the rate limiter refuses.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+	struct iphdr *iph;
+	int room;
+	struct icmp_bxm icmp_param;
+	struct rtable *rt = skb_rtable(skb_in);
+	struct ipcm_cookie ipc;
+	struct flowi4 fl4;
+	__be32 saddr;
+	u8  tos;
+	struct net *net;
+	struct sock *sk;
+
+	if (!rt)
+		goto out;
+	net = dev_net(rt->dst.dev);
+
+	/*
+	 *	Find the original header. It is expected to be valid, of course.
+	 *	Check this, icmp_send is called from the most obscure devices
+	 *	sometimes.
+	 */
+	iph = ip_hdr(skb_in);
+
+	if ((u8 *)iph < skb_in->head ||
+	    (skb_in->network_header + sizeof(*iph)) > skb_in->tail)
+		goto out;
+
+	/*
+	 *	No replies to physical multicast/broadcast
+	 */
+	if (skb_in->pkt_type != PACKET_HOST)
+		goto out;
+
+	/*
+	 *	Now check at the protocol level
+	 */
+	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto out;
+
+	/*
+	 *	Only reply to fragment 0. We byte re-order the constant
+	 *	mask for efficiency.
+	 */
+	if (iph->frag_off & htons(IP_OFFSET))
+		goto out;
+
+	/*
+	 *	If we send an ICMP error to an ICMP error a mess would result..
+	 */
+	if (icmp_pointers[type].error) {
+		/*
+		 *	We are an error, check if we are replying to an
+		 *	ICMP error
+		 */
+		if (iph->protocol == IPPROTO_ICMP) {
+			u8 _inner_type, *itp;
+
+			/* Fetch the type byte of the offending ICMP header. */
+			itp = skb_header_pointer(skb_in,
+						 skb_network_header(skb_in) +
+						 (iph->ihl << 2) +
+						 offsetof(struct icmphdr,
+							  type) -
+						 skb_in->data,
+						 sizeof(_inner_type),
+						 &_inner_type);
+			if (itp == NULL)
+				goto out;
+
+			/*
+			 *	Assume any unknown ICMP type is an error. This
+			 *	isn't specified by the RFC, but think about it..
+			 */
+			if (*itp > NR_ICMP_TYPES ||
+			    icmp_pointers[*itp].error)
+				goto out;
+		}
+	}
+
+	sk = icmp_xmit_lock(net);
+	if (sk == NULL)
+		return;
+
+	/*
+	 *	Construct source address and options.
+	 */
+
+	saddr = iph->daddr;
+	if (!(rt->rt_flags & RTCF_LOCAL)) {
+		struct net_device *dev = NULL;
+
+		/*
+		 * Packet was not addressed to us: either pick an address of
+		 * the inbound interface (if the sysctl asks for it) or leave
+		 * the source unset (0).
+		 */
+		rcu_read_lock();
+		if (rt_is_input_route(rt) &&
+		    net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
+			dev = dev_get_by_index_rcu(net, rt->rt_iif);
+
+		if (dev)
+			saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+		else
+			saddr = 0;
+		rcu_read_unlock();
+	}
+
+	/* Errors go out with internet-control precedence. */
+	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+					   IPTOS_PREC_INTERNETCONTROL) :
+					  iph->tos;
+
+	if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
+		goto out_unlock;
+
+
+	/*
+	 *	Prepare data for ICMP header.
+	 */
+
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_network_offset(skb_in);
+	inet_sk(sk)->tos = tos;
+	ipc.addr = iph->saddr;
+	ipc.opt = &icmp_param.replyopts.opt;
+	ipc.tx_flags = 0;
+
+	/* From here on rt is the route of the outgoing message. */
+	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
+			       type, code, &icmp_param);
+	if (IS_ERR(rt))
+		goto out_unlock;
+
+	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+		goto ende;
+
+	/* RFC says return as much as we can without exceeding 576 bytes. */
+
+	room = dst_mtu(&rt->dst);
+	if (room > 576)
+		room = 576;
+	/* Leave space for the IP header, echoed options and ICMP header. */
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
+	room -= sizeof(struct icmphdr);
+
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
+
+	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+ende:
+	ip_rt_put(rt);
+out_unlock:
+	icmp_xmit_unlock(sk);
+out:;
+}
+EXPORT_SYMBOL(icmp_send);
+
+
+/*
+ *	Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ *
+ *	On entry skb->data points at the IP header quoted inside the ICMP
+ *	message.  After validation the error is handed to raw sockets and
+ *	to the err_handler of the protocol named in that quoted header.
+ */
+
+static void icmp_unreach(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct icmphdr *icmph;
+	int hash, protocol;
+	const struct net_protocol *ipprot;
+	u32 info = 0;
+	struct net *net;
+
+	net = dev_net(skb_dst(skb)->dev);
+
+	/*
+	 *	Incomplete header ?
+	 * 	Only checks for the IP header, there should be an
+	 *	additional check for longer headers in upper levels.
+	 */
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out_err;
+
+	icmph = icmp_hdr(skb);
+	iph   = (const struct iphdr *)skb->data;
+
+	if (iph->ihl < 5) /* Mangled header, drop. */
+		goto out_err;
+
+	if (icmph->type == ICMP_DEST_UNREACH) {
+		switch (icmph->code & 15) {
+		case ICMP_NET_UNREACH:
+		case ICMP_HOST_UNREACH:
+		case ICMP_PROT_UNREACH:
+		case ICMP_PORT_UNREACH:
+			break;
+		case ICMP_FRAG_NEEDED:
+			if (ipv4_config.no_pmtu_disc) {
+				LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n",
+					       &iph->daddr);
+			} else {
+				/* info becomes the value ip_rt_frag_needed() returns. */
+				info = ip_rt_frag_needed(net, iph,
+							 ntohs(icmph->un.frag.mtu),
+							 skb->dev);
+				if (!info)
+					goto out;
+			}
+			break;
+		case ICMP_SR_FAILED:
+			LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n",
+				       &iph->daddr);
+			break;
+		default:
+			break;
+		}
+		if (icmph->code > NR_ICMP_UNREACH)
+			goto out;
+	} else if (icmph->type == ICMP_PARAMETERPROB)
+		/* Top byte of the gateway word is passed on as info. */
+		info = ntohl(icmph->un.gateway) >> 24;
+
+	/*
+	 *	Throw it at our lower layers
+	 *
+	 *	RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+	 *		  header.
+	 *	RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+	 *		  transport layer.
+	 *	RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+	 *		  transport layer.
+	 */
+
+	/*
+	 *	Check the other end isn't violating RFC 1122. Some routers send
+	 *	bogus responses to broadcast frames. If you see this message
+	 *	first check your netmask matches at both ends, if it does then
+	 *	get the other vendor to fix their kit.
+	 */
+
+	if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+	    inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "%pI4 sent an invalid ICMP "
+					    "type %u, code %u "
+					    "error to a broadcast: %pI4 on %s\n",
+			       &ip_hdr(skb)->saddr,
+			       icmph->type, icmph->code,
+			       &iph->daddr,
+			       skb->dev->name);
+		goto out;
+	}
+
+	/* Checking full IP header plus 8 bytes of protocol to
+	 * avoid additional coding at protocol handlers.
+	 */
+	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+		goto out;
+
+	/* Re-read the header pointer after the pull above. */
+	iph = (const struct iphdr *)skb->data;
+	protocol = iph->protocol;
+
+	/*
+	 *	Deliver ICMP message to raw sockets. Pretty useless feature?
+	 */
+	raw_icmp_error(skb, protocol, info);
+
+	hash = protocol & (MAX_INET_PROTOS - 1);
+	rcu_read_lock();
+	ipprot = rcu_dereference(inet_protos[hash]);
+	if (ipprot && ipprot->err_handler)
+		ipprot->err_handler(skb, info);
+	rcu_read_unlock();
+
+out:
+	return;
+out_err:
+	ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+	goto out;
+}
+
+
+/*
+ *	Handle ICMP_REDIRECT.
+ *
+ *	On entry skb->data points at the IP header quoted inside the
+ *	redirect.  All four redirect codes are handed to ip_rt_redirect();
+ *	redirects that quote an ICMP packet are also reported to ping
+ *	sockets.  A message too short to hold an IP header is counted as
+ *	an input error.
+ */
+
+static void icmp_redirect(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+
+	if (skb->len < sizeof(struct iphdr))
+		goto out_err;
+
+	/*
+	 *	Get the copied header of the packet that caused the redirect
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	iph = (const struct iphdr *)skb->data;
+
+	switch (icmp_hdr(skb)->code & 7) {
+	case ICMP_REDIR_NET:
+	case ICMP_REDIR_NETTOS:
+		/*
+		 * As per RFC recommendations now handle it as a host redirect.
+		 */
+	case ICMP_REDIR_HOST:
+	case ICMP_REDIR_HOSTTOS:
+		ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
+			       icmp_hdr(skb)->un.gateway,
+			       iph->saddr, skb->dev);
+		break;
+	}
+
+	/* Ping wants to see redirects.
+	 * Let's pretend they are errors of sorts... */
+	if (iph->protocol == IPPROTO_ICMP &&
+	    iph->ihl >= 5 &&
+	    pskb_may_pull(skb, (iph->ihl<<2)+8)) {
+		ping_err(skb, icmp_hdr(skb)->un.gateway);
+	}
+
+out:
+	return;
+out_err:
+	ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+	goto out;
+}
+
+/*
+ *	Handle ICMP_ECHO ("ping") requests.
+ *
+ *	RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ *		  requests.
+ *	RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ *		  included in the reply.
+ *	RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ *		  echo requests, MUST have default=NOT.
+ *	See also WRT handling of options once they are done and working.
+ *
+ *	Nothing is sent when the namespace's icmp_echo_ignore_all sysctl is
+ *	set.  Otherwise the request's ICMP header is copied, its type changed
+ *	to ICMP_ECHOREPLY, and the whole remaining skb is used as reply data.
+ */
+
+static void icmp_echo(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct icmp_bxm reply;
+
+	if (net->ipv4.sysctl_icmp_echo_ignore_all)
+		return;
+
+	reply.skb		= skb;
+	reply.offset		= 0;
+	reply.data_len		= skb->len;
+	reply.head_len		= sizeof(struct icmphdr);
+	reply.data.icmph	= *icmp_hdr(skb);
+	reply.data.icmph.type	= ICMP_ECHOREPLY;
+	icmp_reply(&reply, skb);
+}
+
+/*
+ *	Handle ICMP Timestamp requests.
+ *	RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ *		  SHOULD be in the kernel for minimum random latency.
+ *		  MUST be accurate to a few minutes.
+ *		  MUST be updated at least at 15Hz.
+ *
+ *	The first 4 bytes of the request payload are copied into times[0];
+ *	times[1] and times[2] both carry the current time.  A request shorter
+ *	than 4 bytes is counted as an input error and gets no reply.
+ */
+static void icmp_timestamp(struct sk_buff *skb)
+{
+	struct icmp_bxm reply;
+	struct timespec now;
+
+	/*
+	 *	Too short.
+	 */
+	if (skb->len < 4) {
+		ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+		return;
+	}
+
+	/*
+	 *	Fill in the current time as ms since midnight UT:
+	 */
+	getnstimeofday(&now);
+	reply.data.times[1] = htonl((now.tv_sec % 86400) * MSEC_PER_SEC +
+				    now.tv_nsec / NSEC_PER_MSEC);
+	reply.data.times[2] = reply.data.times[1];
+
+	/* The length was checked above, so this copy is not expected to fail. */
+	if (skb_copy_bits(skb, 0, &reply.data.times[0], 4))
+		BUG();
+
+	reply.data.icmph	= *icmp_hdr(skb);
+	reply.data.icmph.type	= ICMP_TIMESTAMPREPLY;
+	reply.data.icmph.code	= 0;
+	reply.skb		= skb;
+	reply.offset		= 0;
+	reply.data_len		= 0;
+	/* ICMP header followed by the three 32-bit timestamps. */
+	reply.head_len		= sizeof(struct icmphdr) + 12;
+	icmp_reply(&reply, skb);
+}
+
+
+/*
+ *	Handle ICMP_ADDRESS_MASK requests.  (RFC950)
+ *
+ * RFC1122 (3.2.2.9).  A host MUST only send replies to
+ * ADDRESS_MASK requests if it's been configured as an address mask
+ * agent.  Receiving a request doesn't constitute implicit permission to
+ * act as one. Of course, implementing this correctly requires (SHOULD)
+ * a way to turn the functionality on and off.  Another one for sysctl(),
+ * I guess. -- MS
+ *
+ * RFC1812 (4.3.3.9).	A router MUST implement it.
+ *			A router SHOULD have switch turning it on/off.
+ *		      	This switch MUST be ON by default.
+ *
+ * Gratuitous replies, zero-source replies are not implemented,
+ * that complies with RFC. DO NOT implement them!!! All the idea
+ * of broadcast addrmask replies as specified in RFC950 is broken.
+ * The problem is that it is not uncommon to have several prefixes
+ * on one physical interface. Moreover, addrmask agent can even be
+ * not aware of existing another prefixes.
+ * If source is zero, addrmask agent cannot choose correct prefix.
+ * Gratuitous mask announcements suffer from the same problem.
+ * RFC1812 explains it, but still allows to use ADDRMASK,
+ * that is pretty silly. --ANK
+ *
+ * All these rules are so bizarre, that I removed kernel addrmask
+ * support at all. It is wrong, it is obsolete, nobody uses it in
+ * any case. --ANK
+ *
+ * Furthermore you can do it with a usermode address agent program
+ * anyway...
+ */
+
+static void icmp_address(struct sk_buff *skb)
+{
+	/*
+	 * Intentionally empty: address mask requests are neither answered
+	 * nor logged (see the rationale in the comment above).
+	 */
+}
+
+/*
+ * RFC1812 (4.3.3.9).	A router SHOULD listen to all replies, and complain
+ *			loudly if an inconsistency is found.
+ * called with rcu_read_lock()
+ *
+ * The check only runs for directly connected sources on devices that have
+ * addresses configured, log martians and forward.  A rate-limited message
+ * is printed when no local address on the receiving device has both the
+ * advertised mask and a prefix matching the sender.
+ */
+
+static void icmp_address_reply(struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa;
+	__be32 _mask, *mp;
+
+	if (skb->len < 4)
+		return;
+	if (!(rt->rt_flags & RTCF_DIRECTSRC))
+		return;
+
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev)
+		return;
+
+	if (!in_dev->ifa_list ||
+	    !IN_DEV_LOG_MARTIANS(in_dev) ||
+	    !IN_DEV_FORWARD(in_dev))
+		return;
+
+	/* skb->len >= 4 was verified above, so the mask must be readable. */
+	mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
+	BUG_ON(mp == NULL);
+
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+		if (*mp == ifa->ifa_mask &&
+		    inet_ifa_match(ip_hdr(skb)->saddr, ifa))
+			return;		/* consistent with a local address */
+	}
+
+	if (net_ratelimit())
+		printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
+		       mp, dev->name, &ip_hdr(skb)->saddr);
+}
+
+/*
+ * Handler for ICMP types that need no processing: does nothing.  The skb
+ * is released by icmp_rcv() after the handler returns.
+ */
+static void icmp_discard(struct sk_buff *skb)
+{
+}
+
+/*
+ *	Deal with incoming ICMP packets.
+ *
+ *	Validates the message (IPsec policy, checksum, header length, type
+ *	range, broadcast/multicast restrictions) and then calls the handler
+ *	registered for its type in icmp_pointers[].  The skb is always
+ *	released with kfree_skb() before returning and the return value is
+ *	always 0.  Failures after the policy check bump ICMP_MIB_INERRORS.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+	struct icmphdr *icmph;
+	struct rtable *rt = skb_rtable(skb);
+	struct net *net = dev_net(rt->dst.dev);
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		struct sec_path *sp = skb_sec_path(skb);
+		int nh;
+
+		/*
+		 * The forward policy check failed.  Carry on only if the
+		 * last xfrm state has XFRM_STATE_ICMP set and the reverse
+		 * check passes.
+		 */
+		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+				 XFRM_STATE_ICMP))
+			goto drop;
+
+		if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
+			goto drop;
+
+		/*
+		 * Temporarily point the network header just past the ICMP
+		 * header for the reverse check, then restore it.
+		 */
+		nh = skb_network_offset(skb);
+		skb_set_network_header(skb, sizeof(*icmph));
+
+		if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
+
+		skb_set_network_header(skb, nh);
+	}
+
+	ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		/* A folded checksum of zero means it is already verified. */
+		if (!csum_fold(skb->csum))
+			break;
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb))
+			goto error;
+	}
+
+	if (!pskb_pull(skb, sizeof(*icmph)))
+		goto error;
+
+	icmph = icmp_hdr(skb);
+
+	ICMPMSGIN_INC_STATS_BH(net, icmph->type);
+	/*
+	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
+	 *
+	 *	RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
+	 *		  discarded.
+	 */
+	if (icmph->type > NR_ICMP_TYPES)
+		goto error;
+
+
+	/*
+	 *	Parse the ICMP message
+	 */
+
+	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		/*
+		 *	RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+		 *	  silently ignored (we let user decide with a sysctl).
+		 *	RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+		 *	  discarded if to broadcast/multicast.
+		 */
+		if ((icmph->type == ICMP_ECHO ||
+		     icmph->type == ICMP_TIMESTAMP) &&
+		    net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
+			goto error;
+		}
+		/* Only these four types are accepted via broadcast/multicast. */
+		if (icmph->type != ICMP_ECHO &&
+		    icmph->type != ICMP_TIMESTAMP &&
+		    icmph->type != ICMP_ADDRESS &&
+		    icmph->type != ICMP_ADDRESSREPLY) {
+			goto error;
+		}
+	}
+
+	/* type <= NR_ICMP_TYPES was checked above, so the index is in range. */
+	icmp_pointers[icmph->type].handler(skb);
+
+drop:
+	kfree_skb(skb);
+	return 0;
+error:
+	ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+	goto drop;
+}
+
+/*
+ *	This table is the definition of how we handle ICMP.
+ *
+ *	It is indexed by ICMP type; icmp_rcv() calls the .handler of the
+ *	received type.  Numeric indices are types without a named constant
+ *	in this table.
+ */
+static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+	[ICMP_ECHOREPLY]	= { .handler = ping_rcv },
+	[1]			= { .handler = icmp_discard,	.error = 1 },
+	[2]			= { .handler = icmp_discard,	.error = 1 },
+	[ICMP_DEST_UNREACH]	= { .handler = icmp_unreach,	.error = 1 },
+	[ICMP_SOURCE_QUENCH]	= { .handler = icmp_unreach,	.error = 1 },
+	[ICMP_REDIRECT]		= { .handler = icmp_redirect,	.error = 1 },
+	[6]			= { .handler = icmp_discard,	.error = 1 },
+	[7]			= { .handler = icmp_discard,	.error = 1 },
+	[ICMP_ECHO]		= { .handler = icmp_echo },
+	[9]			= { .handler = icmp_discard,	.error = 1 },
+	[10]			= { .handler = icmp_discard,	.error = 1 },
+	[ICMP_TIME_EXCEEDED]	= { .handler = icmp_unreach,	.error = 1 },
+	[ICMP_PARAMETERPROB]	= { .handler = icmp_unreach,	.error = 1 },
+	[ICMP_TIMESTAMP]	= { .handler = icmp_timestamp },
+	[ICMP_TIMESTAMPREPLY]	= { .handler = icmp_discard },
+	[ICMP_INFO_REQUEST]	= { .handler = icmp_discard },
+	[ICMP_INFO_REPLY]	= { .handler = icmp_discard },
+	[ICMP_ADDRESS]		= { .handler = icmp_address },
+	[ICMP_ADDRESSREPLY]	= { .handler = icmp_address_reply },
+};
+
+/*
+ * Per-namespace teardown: destroy the ICMP control socket of every possible
+ * CPU, then free the socket array and clear the pointer.
+ */
+static void __net_exit icmp_sk_exit(struct net *net)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		inet_ctl_sock_destroy(net->ipv4.icmp_sk[cpu]);
+	}
+
+	kfree(net->ipv4.icmp_sk);
+	net->ipv4.icmp_sk = NULL;
+}
+
+/*
+ * Per-namespace initialisation: create one raw ICMP control socket for each
+ * possible CPU and set the default values of the ICMP sysctls.
+ *
+ * Returns 0 on success, -ENOMEM if the socket array cannot be allocated, or
+ * the error reported by inet_ctl_sock_create().  On failure the sockets
+ * created so far are destroyed, the array is freed and net->ipv4.icmp_sk
+ * is reset to NULL, matching the state icmp_sk_exit() leaves behind.
+ */
+static int __net_init icmp_sk_init(struct net *net)
+{
+	int i, err;
+
+	/* kcalloc() zeroes the array and checks the count * size product. */
+	net->ipv4.icmp_sk =
+		kcalloc(nr_cpu_ids, sizeof(struct sock *), GFP_KERNEL);
+	if (net->ipv4.icmp_sk == NULL)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		struct sock *sk;
+
+		err = inet_ctl_sock_create(&sk, PF_INET,
+					   SOCK_RAW, IPPROTO_ICMP, net);
+		if (err < 0)
+			goto fail;
+
+		net->ipv4.icmp_sk[i] = sk;
+
+		/* Enough space for 2 64K ICMP packets, including
+		 * sk_buff struct overhead.
+		 */
+		sk->sk_sndbuf =
+			(2 * ((64 * 1024) + sizeof(struct sk_buff)));
+
+		/*
+		 * Speedup sock_wfree()
+		 */
+		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+	}
+
+	/* Control parameters for ECHO replies. */
+	net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+	net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
+
+	/* Control parameter - ignore bogus broadcast responses? */
+	net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
+
+	/*
+	 * 	Configurable global rate limit.
+	 *
+	 *	ratelimit defines tokens/packet consumed for dst->rate_token
+	 *	bucket ratemask defines which icmp types are ratelimited by
+	 *	setting	its bit position.
+	 *
+	 *	default:
+	 *	dest unreachable (3), source quench (4),
+	 *	time exceeded (11), parameter problem (12)
+	 */
+
+	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
+	net->ipv4.sysctl_icmp_ratemask = 0x1818;
+	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+
+	return 0;
+
+fail:
+	/* Slots the loop never reached are still NULL from the zeroed array. */
+	for_each_possible_cpu(i)
+		inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]);
+	kfree(net->ipv4.icmp_sk);
+	/* Do not leave a dangling pointer behind in the namespace. */
+	net->ipv4.icmp_sk = NULL;
+	return err;
+}
+
+/* Per-network-namespace setup/teardown hooks for the ICMP control sockets. */
+static struct pernet_operations __net_initdata icmp_sk_ops = {
+	.init	= icmp_sk_init,
+	.exit	= icmp_sk_exit,
+};
+
+/*
+ * Register the per-namespace ICMP operations (icmp_sk_ops).
+ * Returns the result of register_pernet_subsys().
+ */
+int __init icmp_init(void)
+{
+	return register_pernet_subsys(&icmp_sk_ops);
+}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 00000000..e0d42dbb
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2661 @@
+/*
+ *	Linux NET3:	Internet Group Management Protocol  [IGMP]
+ *
+ *	This code implements the IGMP protocol as defined in RFC1112. There has
+ *	been a further revision of this protocol since which is now supported.
+ *
+ *	If you have trouble with this module be careful what gcc you have used,
+ *	the older version didn't come out right using gcc 2.5.8, the newer one
+ *	seems to fall out with gcc 2.6.2.
+ *
+ *	Authors:
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *
+ *		Alan Cox	:	Added lots of __inline__ to optimise
+ *					the memory usage of all the tiny little
+ *					functions.
+ *		Alan Cox	:	Dumped the header building experiment.
+ *		Alan Cox	:	Minor tweaks ready for multicast routing
+ *					and extended IGMP protocol.
+ *		Alan Cox	:	Removed a load of inline directives. Gcc 2.5.8
+ *					writes utterly bogus code otherwise (sigh)
+ *					fixed IGMP loopback to behave in the manner
+ *					desired by mrouted, fixed the fact it has been
+ *					broken since 1.3.6 and cleaned up a few minor
+ *					points.
+ *
+ *		Chih-Jen Chang	:	Tried to revise IGMP to Version 2
+ *		Tsu-Sheng Tsao		E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ *					The enhancements are mainly based on Steve Deering's
+ * 					ipmulti-3.5 source code.
+ *		Chih-Jen Chang	:	Added the igmp_get_mrouter_info and
+ *		Tsu-Sheng Tsao		igmp_set_mrouter_info to keep track of
+ *					the mrouted version on that device.
+ *		Chih-Jen Chang	:	Added the max_resp_time parameter to
+ *		Tsu-Sheng Tsao		igmp_heard_query(). Using this parameter
+ *					to identify the multicast router version
+ *					and do what the IGMP version 2 specified.
+ *		Chih-Jen Chang	:	Added a timer to revert to IGMP V2 router
+ *		Tsu-Sheng Tsao		if the specified time expired.
+ *		Alan Cox	:	Stop IGMP from 0.0.0.0 being accepted.
+ *		Alan Cox	:	Use GFP_ATOMIC in the right places.
+ *		Christian Daudt :	igmp timer wasn't set for local group
+ *					memberships but was being deleted,
+ *					which caused a "del_timer() called
+ *					from %p with timer not initialized\n"
+ *					message (960131).
+ *		Christian Daudt :	removed del_timer from
+ *					igmp_timer_expire function (960205).
+ *             Christian Daudt :       igmp_heard_report now only calls
+ *                                     igmp_timer_expire if tm->running is
+ *                                     true (960216).
+ *		Malcolm Beattie :	ttl comparison wrong in igmp_rcv made
+ *					igmp_heard_query never trigger. Expiry
+ *					miscalculation fixed in igmp_heard_query
+ *					and random() made to return unsigned to
+ *					prevent negative expiry times.
+ *		Alexey Kuznetsov:	Wrong group leaving behaviour, backport
+ *					fix from pending 2.1.x patches.
+ *		Alan Cox:		Forget to enable FDDI support earlier.
+ *		Alexey Kuznetsov:	Fixed leaving groups on device down.
+ *		Alexey Kuznetsov:	Accordance to igmp-v2-06 draft.
+ *		David L Stevens:	IGMPv3 support, with help from
+ *					Vinay Kulkarni
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+/*
+ * NOTE(review): presumably default limits for group memberships and
+ * source-filter entries; their users are not visible in this part of the
+ * file - confirm.
+ */
+#define IP_MAX_MEMBERSHIPS	20
+#define IP_MAX_MSF		10
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+/* The timeouts and intervals below are multiples of HZ, i.e. in jiffies. */
+#define IGMP_V1_Router_Present_Timeout		(400*HZ)
+#define IGMP_V2_Router_Present_Timeout		(400*HZ)
+#define IGMP_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_Query_Response_Interval		(10*HZ)
+/* A plain count, not a time value. */
+#define IGMP_Unsolicited_Report_Count		2
+
+
+#define IGMP_Initial_Report_Delay		(1)
+
+/* IGMP_Initial_Report_Delay is not from the IGMP specs!
+ * The IGMP specs require membership to be reported immediately
+ * after joining a group, but we delay the first report by a
+ * small interval. This seems more natural and still does not
+ * contradict the specs, provided the delay is small enough.
+ */
+
+/*
+ * IGMP_V1_SEEN / IGMP_V2_SEEN: true when the device must behave as an
+ * IGMPv1 (resp. IGMPv2) host.  That is the case when FORCE_IGMP_VERSION is
+ * set to that version, either for all devices of the namespace or for this
+ * device, or when mr_v1_seen (resp. mr_v2_seen) is non-zero and still lies
+ * in the future relative to jiffies.
+ *
+ * The argument is parenthesized at every use so that any pointer-valued
+ * expression can be passed safely.
+ */
+#define IGMP_V1_SEEN(in_dev) \
+	(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 1 || \
+	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
+	 ((in_dev)->mr_v1_seen && \
+	  time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) \
+	(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 2 || \
+	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
+	 ((in_dev)->mr_v2_seen && \
+	  time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+			 int sfcount, __be32 *psfsrc, int delta);
+
+/*
+ * Drop one reference on a multicast list entry.  When the last reference
+ * goes away, release the entry's hold on its in_device and free the entry
+ * via kfree_rcu().
+ */
+static void ip_ma_put(struct ip_mc_list *im)
+{
+	if (!atomic_dec_and_test(&im->refcnt))
+		return;
+
+	in_dev_put(im->interface);
+	kfree_rcu(im, rcu);
+}
+
+/*
+ * Iterate over the multicast list of @in_dev, using @pmc as the cursor.
+ * The _rcu variant fetches the links with rcu_dereference(), the _rtnl
+ * variant with rtnl_dereference().
+ *
+ * Both arguments are parenthesized at every use so that arbitrary
+ * expressions can be passed safely.
+ */
+#define for_each_pmc_rcu(in_dev, pmc)				\
+	for ((pmc) = rcu_dereference((in_dev)->mc_list);	\
+	     (pmc) != NULL;					\
+	     (pmc) = rcu_dereference((pmc)->next_rcu))
+
+#define for_each_pmc_rtnl(in_dev, pmc)				\
+	for ((pmc) = rtnl_dereference((in_dev)->mc_list);	\
+	     (pmc) != NULL;					\
+	     (pmc) = rtnl_dereference((pmc)->next_rcu))
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ *	Timer management
+ */
+
+/*
+ * Cancel the report timer of @im and reset its reporting state.  If a
+ * pending timer was removed, the reference it held is dropped.
+ */
+static void igmp_stop_timer(struct ip_mc_list *im)
+{
+	spin_lock_bh(&im->lock);
+
+	if (del_timer(&im->timer))
+		atomic_dec(&im->refcnt);
+
+	im->unsolicit_count = 0;
+	im->reporter = 0;
+	im->tm_running = 0;
+
+	spin_unlock_bh(&im->lock);
+}
+
+/*
+ * Arm the report timer of @im at a random point within @max_delay jiffies
+ * (plus 2).  A reference is taken when the timer was not already pending.
+ *
+ * It must be called with locked im->lock.  @max_delay is used as a modulus
+ * and therefore must not be zero.
+ */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+	int delay = net_random() % max_delay;
+
+	im->tm_running = 1;
+
+	/* mod_timer() returns non-zero when the timer was already pending. */
+	if (mod_timer(&im->timer, jiffies + delay + 2))
+		return;
+
+	atomic_inc(&im->refcnt);
+}
+
+/*
+ * Arm the general-query timer of @in_dev at a random point within
+ * in_dev->mr_maxdelay jiffies (plus 2), holding the device when the timer
+ * was not already pending.  mr_maxdelay is used as a modulus and must not
+ * be zero.
+ */
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+	int delay = net_random() % in_dev->mr_maxdelay;
+
+	in_dev->mr_gq_running = 1;
+
+	if (mod_timer(&in_dev->mr_gq_timer, jiffies + delay + 2))
+		return;
+
+	in_dev_hold(in_dev);
+}
+
+/*
+ * Arm the interface-change timer of @in_dev at a random point within
+ * @delay jiffies (plus 2), holding the device when the timer was not
+ * already pending.  @delay is used as a modulus and must not be zero.
+ */
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+	int offset = net_random() % delay;
+
+	if (mod_timer(&in_dev->mr_ifc_timer, jiffies + offset + 2))
+		return;
+
+	in_dev_hold(in_dev);
+}
+
+/*
+ * Make sure the report timer of @im fires within @max_delay jiffies.
+ *
+ * A pending timer that already expires sooner than @max_delay is put back
+ * unchanged; otherwise the timer is restarted with a fresh random delay
+ * via igmp_start_timer().  The unsolicited report count is cleared.
+ */
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+	spin_lock_bh(&im->lock);
+	im->unsolicit_count = 0;
+
+	if (del_timer(&im->timer)) {
+		if ((long)(im->timer.expires - jiffies) < max_delay) {
+			/* Soon enough already: re-add with the old expiry. */
+			add_timer(&im->timer);
+			im->tm_running = 1;
+			goto unlock;
+		}
+		/* Drop the reference the removed timer was holding. */
+		atomic_dec(&im->refcnt);
+	}
+	igmp_start_timer(im, max_delay);
+
+unlock:
+	spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ *	Send an IGMP report.
+ */
+
+/*
+ * Size of an IGMP report built by igmp_send_report(): IP header, the
+ * 4 bytes of IP options that function appends, and the IGMP header.
+ */
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+
+/*
+ * Decide whether source @psf of group @pmc belongs in an IGMPv3 group
+ * record of the given @type.  @gdeleted and @sdeleted are flags supplied
+ * by the caller for a deleted group or a deleted source.
+ *
+ * Returns non-zero when the source is to be listed, 0 otherwise (always 0
+ * for an unknown @type).
+ */
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+	int gdeleted, int sdeleted)
+{
+	const int any_deleted = gdeleted || sdeleted;
+
+	switch (type) {
+	case IGMPV3_MODE_IS_INCLUDE:
+	case IGMPV3_MODE_IS_EXCLUDE:
+		if (any_deleted)
+			return 0;
+		/* gsquery is set and this source carries no sf_gsresp mark */
+		if (pmc->gsquery && !psf->sf_gsresp)
+			return 0;
+		if (pmc->sfmode == MCAST_INCLUDE)
+			return 1;
+		/* don't include if this source is excluded
+		 * in all filters
+		 */
+		if (psf->sf_count[MCAST_INCLUDE])
+			return type == IGMPV3_MODE_IS_INCLUDE;
+		return pmc->sfcount[MCAST_EXCLUDE] ==
+			psf->sf_count[MCAST_EXCLUDE];
+
+	case IGMPV3_CHANGE_TO_INCLUDE:
+		if (any_deleted)
+			return 0;
+		return psf->sf_count[MCAST_INCLUDE] != 0;
+
+	case IGMPV3_CHANGE_TO_EXCLUDE:
+		if (any_deleted)
+			return 0;
+		if (pmc->sfcount[MCAST_EXCLUDE] == 0)
+			return 0;
+		if (psf->sf_count[MCAST_INCLUDE])
+			return 0;
+		return pmc->sfcount[MCAST_EXCLUDE] ==
+			psf->sf_count[MCAST_EXCLUDE];
+
+	case IGMPV3_ALLOW_NEW_SOURCES:
+		if (gdeleted || !psf->sf_crcount)
+			return 0;
+		return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+
+	case IGMPV3_BLOCK_OLD_SOURCES:
+		if (pmc->sfmode == MCAST_INCLUDE)
+			return gdeleted || (psf->sf_crcount && sdeleted);
+		return psf->sf_crcount && !gdeleted && !sdeleted;
+	}
+
+	return 0;
+}
+
+/*
+ * Count the sources on pmc->sources that is_in() selects for a record of
+ * the given @type.
+ */
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+	struct ip_sf_list *src;
+	int selected = 0;
+
+	for (src = pmc->sources; src; src = src->sf_next) {
+		if (is_in(pmc, src, type, gdeleted, sdeleted))
+			selected++;
+	}
+
+	return selected;
+}
+
+/*
+ * Size recorded for a report skb, kept as an unsigned int at the start of
+ * skb->cb.  Written by igmpv3_newpack() and read back by AVAILABLE().
+ */
+#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
+
+/*
+ * Allocate and initialise an skb for an IGMPv3 membership report on @dev.
+ *
+ * @size is the wanted size; if allocation fails it is halved repeatedly
+ * and the function gives up once it drops below 256.  The returned skb
+ * has a route to IGMPV3_ALL_MCR attached, an IP header with 4 option
+ * bytes (starting with IPOPT_RA) and an empty IGMPv3 report header
+ * (ngrec == 0).
+ *
+ * Returns the skb, or NULL on allocation or routing failure.
+ */
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+{
+	struct sk_buff *skb;
+	struct rtable *rt;
+	struct iphdr *pip;
+	struct igmpv3_report *pig;
+	struct net *net = dev_net(dev);
+	struct flowi4 fl4;
+
+	/* Try progressively smaller buffers before giving up. */
+	while (1) {
+		skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
+				GFP_ATOMIC | __GFP_NOWARN);
+		if (skb)
+			break;
+		size >>= 1;
+		if (size < 256)
+			return NULL;
+	}
+	/* Remember the size actually obtained; AVAILABLE() reads it back. */
+	igmp_skb_size(skb) = size;
+
+	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
+				   0, 0,
+				   IPPROTO_IGMP, 0, dev->ifindex);
+	if (IS_ERR(rt)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	skb_dst_set(skb, &rt->dst);
+	skb->dev = dev;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	skb_reset_network_header(skb);
+	pip = ip_hdr(skb);
+	/* IP header plus 4 bytes of options. */
+	skb_put(skb, sizeof(struct iphdr) + 4);
+
+	pip->version  = 4;
+	pip->ihl      = (sizeof(struct iphdr)+4)>>2;
+	pip->tos      = 0xc0;
+	pip->frag_off = htons(IP_DF);
+	pip->ttl      = 1;
+	pip->daddr    = fl4.daddr;
+	pip->saddr    = fl4.saddr;
+	pip->protocol = IPPROTO_IGMP;
+	pip->tot_len  = 0;	/* filled in later */
+	ip_select_ident(pip, &rt->dst, NULL);
+	/* The 4 option bytes directly after the fixed IP header. */
+	((u8*)&pip[1])[0] = IPOPT_RA;
+	((u8*)&pip[1])[1] = 4;
+	((u8*)&pip[1])[2] = 0;
+	((u8*)&pip[1])[3] = 0;
+
+	skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
+	skb_put(skb, sizeof(*pig));
+	pig = igmpv3_report_hdr(skb);
+	pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+	pig->resv1 = 0;
+	pig->csum = 0;		/* computed in igmpv3_sendpack() */
+	pig->resv2 = 0;
+	pig->ngrec = 0;		/* incremented by add_grhead() */
+	return skb;
+}
+
+/*
+ * Finish and transmit a report built with igmpv3_newpack(): compute the
+ * IGMP checksum over everything from the transport header to the tail and
+ * hand the skb to ip_local_out(), whose result is returned.
+ */
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+	struct igmphdr *hdr = igmp_hdr(skb);
+	const int igmplen = skb->tail - skb->transport_header;
+
+	hdr->csum = ip_compute_csum(hdr, igmplen);
+
+	return ip_local_out(skb);
+}
+
+/*
+ * Bytes needed for a group record of @type for @pmc: the record header
+ * plus 4 bytes for every source counted by igmp_scount().
+ */
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+	int nsrcs = igmp_scount(pmc, type, gdel, sdel);
+
+	return sizeof(struct igmpv3_grec) + 4 * nsrcs;
+}
+
+/*
+ * Append an empty group record header of @type for @pmc to @skb and bump
+ * the report's record count.  A new packet is allocated with
+ * igmpv3_newpack() when @skb is NULL.
+ *
+ * On success *@ppgr points at the new record and the skb is returned;
+ * NULL is returned (and *@ppgr is left untouched) if no packet could be
+ * allocated.
+ */
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+	int type, struct igmpv3_grec **ppgr)
+{
+	struct net_device *dev = pmc->interface->dev;
+	struct igmpv3_grec *rec;
+	struct igmpv3_report *report;
+
+	if (!skb) {
+		skb = igmpv3_newpack(dev, dev->mtu);
+		if (!skb)
+			return NULL;
+	}
+
+	rec = (struct igmpv3_grec *)skb_put(skb, sizeof(*rec));
+	rec->grec_type = type;
+	rec->grec_auxwords = 0;
+	rec->grec_nsrcs = 0;
+	rec->grec_mca = pmc->multiaddr;
+
+	/* ngrec is kept in network byte order inside the packet. */
+	report = igmpv3_report_hdr(skb);
+	report->ngrec = htons(ntohs(report->ngrec) + 1);
+
+	*ppgr = rec;
+	return skb;
+}
+
+/*
+ * Room left in a report skb: 0 for a NULL skb; for an skb with ->dev set,
+ * the size recorded in igmp_skb_size() minus the current length; otherwise
+ * skb_tailroom().
+ */
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
+	skb_tailroom(skb)) : 0)
+
+/*
+ * Append a group record of @type for @pmc, with every source selected by
+ * is_in(), to the report being built in @skb (which may be NULL).
+ *
+ * @gdeleted and @sdeleted are passed through to is_in(); @sdeleted also
+ * selects the list that is walked (pmc->tomb instead of pmc->sources).
+ * When the current packet runs out of room it is sent with
+ * igmpv3_sendpack() and a new one is started.  Nothing is added for the
+ * IGMP_ALL_HOSTS group.
+ *
+ * Returns the skb to keep appending to (possibly a different one than was
+ * passed in, possibly NULL).
+ */
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+	int type, int gdeleted, int sdeleted)
+{
+	struct net_device *dev = pmc->interface->dev;
+	struct igmpv3_report *pih;
+	struct igmpv3_grec *pgr = NULL;
+	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+	int scount, stotal, first, isquery, truncate;
+
+	if (pmc->multiaddr == IGMP_ALL_HOSTS)
+		return skb;
+
+	/* MODE_IS_* records are the ones that clear query marks below. */
+	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+		  type == IGMPV3_MODE_IS_EXCLUDE;
+	/* EXCLUDE-type records are truncated rather than split (see loop). */
+	truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+		    type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+	/* scount: sources in the current record; stotal: sources overall. */
+	stotal = scount = 0;
+
+	psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+	if (!*psf_list)
+		goto empty_source;
+
+	pih = skb ? igmpv3_report_hdr(skb) : NULL;
+
+	/* EX and TO_EX get a fresh packet, if needed */
+	if (truncate) {
+		if (pih && pih->ngrec &&
+		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+			if (skb)
+				igmpv3_sendpack(skb);
+			skb = igmpv3_newpack(dev, dev->mtu);
+		}
+	}
+	first = 1;
+	psf_prev = NULL;
+	for (psf=*psf_list; psf; psf=psf_next) {
+		__be32 *psrc;
+
+		/* Saved up front because psf may be freed below. */
+		psf_next = psf->sf_next;
+
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+			psf_prev = psf;
+			continue;
+		}
+
+		/* clear marks on query responses */
+		if (isquery)
+			psf->sf_gsresp = 0;
+
+		/* Need room for one source, plus a record header if first. */
+		if (AVAILABLE(skb) < sizeof(__be32) +
+		    first*sizeof(struct igmpv3_grec)) {
+			if (truncate && !first)
+				break;	 /* truncate these */
+			if (pgr)
+				pgr->grec_nsrcs = htons(scount);
+			if (skb)
+				igmpv3_sendpack(skb);
+			skb = igmpv3_newpack(dev, dev->mtu);
+			first = 1;
+			scount = 0;
+		}
+		if (first) {
+			skb = add_grhead(skb, pmc, type, &pgr);
+			first = 0;
+		}
+		if (!skb)
+			return NULL;
+		psrc = (__be32 *)skb_put(skb, sizeof(__be32));
+		*psrc = psf->sf_inaddr;
+		scount++; stotal++;
+		if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+		     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+			psf->sf_crcount--;
+			/* Unlink and free a deleted source once its count hits 0. */
+			if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+				if (psf_prev)
+					psf_prev->sf_next = psf->sf_next;
+				else
+					*psf_list = psf->sf_next;
+				kfree(psf);
+				continue;
+			}
+		}
+		psf_prev = psf;
+	}
+
+empty_source:
+	if (!stotal) {
+		/* ALLOW/BLOCK records with no sources are not emitted. */
+		if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+		    type == IGMPV3_BLOCK_OLD_SOURCES)
+			return skb;
+		if (pmc->crcount || isquery) {
+			/* make sure we have room for group header */
+			if (skb && AVAILABLE(skb)<sizeof(struct igmpv3_grec)) {
+				igmpv3_sendpack(skb);
+				skb = NULL; /* add_grhead will get a new one */
+			}
+			skb = add_grhead(skb, pmc, type, &pgr);
+		}
+	}
+	if (pgr)
+		pgr->grec_nsrcs = htons(scount);
+
+	if (isquery)
+		pmc->gsquery = 0;	/* clear query state on report */
+	return skb;
+}
+
+/*
+ * Send an IGMPv3 current-state report.
+ *
+ * With @pmc set, only that group is reported.  With @pmc NULL every group
+ * on @in_dev except IGMP_ALL_HOSTS is reported.  Each group is added as
+ * MODE_IS_EXCLUDE when it has EXCLUDE-mode filters and as MODE_IS_INCLUDE
+ * otherwise.
+ *
+ * Returns 0 when there is nothing left to send, otherwise the result of
+ * igmpv3_sendpack() for the last packet.
+ */
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+	struct sk_buff *skb = NULL;
+	int type;
+
+	if (pmc) {
+		spin_lock_bh(&pmc->lock);
+		type = pmc->sfcount[MCAST_EXCLUDE] ? IGMPV3_MODE_IS_EXCLUDE :
+						     IGMPV3_MODE_IS_INCLUDE;
+		skb = add_grec(skb, pmc, type, 0, 0);
+		spin_unlock_bh(&pmc->lock);
+	} else {
+		rcu_read_lock();
+		for_each_pmc_rcu(in_dev, pmc) {
+			if (pmc->multiaddr == IGMP_ALL_HOSTS)
+				continue;
+			spin_lock_bh(&pmc->lock);
+			type = pmc->sfcount[MCAST_EXCLUDE] ?
+				IGMPV3_MODE_IS_EXCLUDE :
+				IGMPV3_MODE_IS_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+			spin_unlock_bh(&pmc->lock);
+		}
+		rcu_read_unlock();
+	}
+
+	return skb ? igmpv3_sendpack(skb) : 0;
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ *
+ * Every entry whose sf_crcount is 0 is unlinked from *@ppsf and freed;
+ * the other entries keep their relative order.
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+	struct ip_sf_list **link = ppsf;
+	struct ip_sf_list *cur;
+
+	/* 'link' always addresses the pointer that leads to 'cur'. */
+	while ((cur = *link) != NULL) {
+		if (cur->sf_crcount == 0) {
+			*link = cur->sf_next;
+			kfree(cur);
+		} else {
+			link = &cur->sf_next;
+		}
+	}
+}
+
+/*
+ * Build and send the IGMPv3 change records for @in_dev.
+ *
+ * First the deleted groups on in_dev->mc_tomb are processed under
+ * mc_tomb_lock; entries with no remaining crcount and no remaining
+ * sources are unlinked and freed.  Then the live groups are walked under
+ * RCU, each under its own lock, to add source-list and filter-mode change
+ * records.  Whatever is left in the last packet is sent at the end.
+ */
+static void igmpv3_send_cr(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+	struct sk_buff *skb = NULL;
+	int type, dtype;
+
+	rcu_read_lock();
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+
+	/* deleted MCA's */
+	pmc_prev = NULL;
+	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+		/* Saved up front because pmc may be freed below. */
+		pmc_next = pmc->next;
+		if (pmc->sfmode == MCAST_INCLUDE) {
+			type = IGMPV3_BLOCK_OLD_SOURCES;
+			dtype = IGMPV3_BLOCK_OLD_SOURCES;
+			skb = add_grec(skb, pmc, type, 1, 0);
+			skb = add_grec(skb, pmc, dtype, 1, 1);
+		}
+		if (pmc->crcount) {
+			if (pmc->sfmode == MCAST_EXCLUDE) {
+				type = IGMPV3_CHANGE_TO_INCLUDE;
+				skb = add_grec(skb, pmc, type, 1, 0);
+			}
+			pmc->crcount--;
+			if (pmc->crcount == 0) {
+				igmpv3_clear_zeros(&pmc->tomb);
+				igmpv3_clear_zeros(&pmc->sources);
+			}
+		}
+		/* Nothing left to report for this group: unlink and free it. */
+		if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
+			if (pmc_prev)
+				pmc_prev->next = pmc_next;
+			else
+				in_dev->mc_tomb = pmc_next;
+			in_dev_put(pmc->interface);
+			kfree(pmc);
+		} else
+			pmc_prev = pmc;
+	}
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	/* change recs */
+	for_each_pmc_rcu(in_dev, pmc) {
+		spin_lock_bh(&pmc->lock);
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			type = IGMPV3_BLOCK_OLD_SOURCES;
+			dtype = IGMPV3_ALLOW_NEW_SOURCES;
+		} else {
+			type = IGMPV3_ALLOW_NEW_SOURCES;
+			dtype = IGMPV3_BLOCK_OLD_SOURCES;
+		}
+		skb = add_grec(skb, pmc, type, 0, 0);
+		skb = add_grec(skb, pmc, dtype, 0, 1);	/* deleted sources */
+
+		/* filter mode changes */
+		if (pmc->crcount) {
+			if (pmc->sfmode == MCAST_EXCLUDE)
+				type = IGMPV3_CHANGE_TO_EXCLUDE;
+			else
+				type = IGMPV3_CHANGE_TO_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+			pmc->crcount--;
+		}
+		spin_unlock_bh(&pmc->lock);
+	}
+	rcu_read_unlock();
+
+	if (!skb)
+		return;
+	(void) igmpv3_sendpack(skb);
+}
+
+/*
+ * Send an IGMP message of the given @type for group @pmc on @in_dev.
+ *
+ * IGMPV3_HOST_MEMBERSHIP_REPORT is delegated to igmpv3_send_report().
+ * Leave messages are addressed to IGMP_ALL_ROUTER, all other types to the
+ * group itself (0 when @pmc is NULL).  The packet carries 4 bytes of IP
+ * options starting with IPOPT_RA and is sent with TTL 1.
+ *
+ * Returns -1 if no route or skb could be obtained, otherwise the result
+ * of ip_local_out() (or of igmpv3_send_report() for v3 reports).
+ */
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+	int type)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+	struct igmphdr *ih;
+	struct rtable *rt;
+	struct net_device *dev = in_dev->dev;
+	struct net *net = dev_net(dev);
+	__be32	group = pmc ? pmc->multiaddr : 0;
+	struct flowi4 fl4;
+	__be32	dst;
+
+	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+		return igmpv3_send_report(in_dev, pmc);
+	else if (type == IGMP_HOST_LEAVE_MESSAGE)
+		dst = IGMP_ALL_ROUTER;
+	else
+		dst = group;
+
+	rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
+				   0, 0,
+				   IPPROTO_IGMP, 0, dev->ifindex);
+	if (IS_ERR(rt))
+		return -1;
+
+	skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
+	if (skb == NULL) {
+		/* The route was not attached to an skb yet: release it here. */
+		ip_rt_put(rt);
+		return -1;
+	}
+
+	skb_dst_set(skb, &rt->dst);
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	/* IP header plus 4 bytes of options. */
+	skb_put(skb, sizeof(struct iphdr) + 4);
+
+	iph->version  = 4;
+	iph->ihl      = (sizeof(struct iphdr)+4)>>2;
+	iph->tos      = 0xc0;
+	iph->frag_off = htons(IP_DF);
+	iph->ttl      = 1;
+	iph->daddr    = dst;
+	iph->saddr    = fl4.saddr;
+	iph->protocol = IPPROTO_IGMP;
+	ip_select_ident(iph, &rt->dst, NULL);
+	/* The 4 option bytes directly after the fixed IP header. */
+	((u8*)&iph[1])[0] = IPOPT_RA;
+	((u8*)&iph[1])[1] = 4;
+	((u8*)&iph[1])[2] = 0;
+	((u8*)&iph[1])[3] = 0;
+
+	ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+	ih->type = type;
+	ih->code = 0;
+	/* The checksum field must be zero while the checksum is computed. */
+	ih->csum = 0;
+	ih->group = group;
+	ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+	return ip_local_out(skb);
+}
+
+/*
+ * General-query timer callback; @data is the struct in_device pointer.
+ * Sends a report covering all groups of the device and drops the device
+ * reference taken when the timer was armed.
+ */
+static void igmp_gq_timer_expire(unsigned long data)
+{
+	struct in_device *idev = (struct in_device *)data;
+
+	idev->mr_gq_running = 0;
+	igmpv3_send_report(idev, NULL);
+
+	__in_dev_put(idev);
+}
+
+/*
+ * Interface-change timer callback; @data is the struct in_device pointer.
+ * Sends the pending change records and, while mr_ifc_count is non-zero,
+ * decrements it and re-arms the timer.  Finally drops the device
+ * reference taken when the timer was armed.
+ */
+static void igmp_ifc_timer_expire(unsigned long data)
+{
+	struct in_device *idev = (struct in_device *)data;
+
+	igmpv3_send_cr(idev);
+
+	if (idev->mr_ifc_count != 0) {
+		idev->mr_ifc_count--;
+		igmp_ifc_start_timer(idev, IGMP_Unsolicited_Report_Interval);
+	}
+
+	__in_dev_put(idev);
+}
+
+/*
+ * Schedule change reports after a change on @in_dev.  Does nothing while
+ * the device is in IGMPv1 or IGMPv2 mode.  The number of reports is
+ * mr_qrv when that is non-zero, else IGMP_Unsolicited_Report_Count.
+ */
+static void igmp_ifc_event(struct in_device *in_dev)
+{
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+		return;
+
+	if (in_dev->mr_qrv)
+		in_dev->mr_ifc_count = in_dev->mr_qrv;
+	else
+		in_dev->mr_ifc_count = IGMP_Unsolicited_Report_Count;
+
+	igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+/*
+ * Per-group report timer callback; @data is the struct ip_mc_list pointer.
+ *
+ * Re-arms the timer while unsolicited reports remain, marks this host as
+ * reporter, sends a report in the IGMP version currently in effect on the
+ * interface and drops the reference held by the expired timer.
+ */
+static void igmp_timer_expire(unsigned long data)
+{
+	struct ip_mc_list *im = (struct ip_mc_list *)data;
+	struct in_device *in_dev = im->interface;
+	int type;
+
+	spin_lock(&im->lock);
+	im->tm_running = 0;
+	if (im->unsolicit_count) {
+		im->unsolicit_count--;
+		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+	}
+	im->reporter = 1;
+	spin_unlock(&im->lock);
+
+	if (IGMP_V1_SEEN(in_dev))
+		type = IGMP_HOST_MEMBERSHIP_REPORT;
+	else if (IGMP_V2_SEEN(in_dev))
+		type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+	else
+		type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+	igmp_send_report(in_dev, im, type);
+
+	ip_ma_put(im);
+}
+
+/*
+ * mark EXCLUDE-mode sources
+ *
+ * Counts how many entries of @srcs (length @nsrcs) match an active exclude
+ * filter of @pmc, i.e. a source with no INCLUDE count whose EXCLUDE count
+ * equals the group's.  Always clears pmc->gsquery.
+ *
+ * Returns 0 when the count reaches @nsrcs (all queried sources are
+ * excluded), 1 otherwise.
+ */
+static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+	struct ip_sf_list *psf;
+	int i, scount;
+
+	scount = 0;
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		if (scount == nsrcs)
+			break;
+		/*
+		 * skip inactive filters
+		 *
+		 * The test does not depend on the queried address, so do it
+		 * once per filter instead of once per queried source.
+		 */
+		if (psf->sf_count[MCAST_INCLUDE] ||
+		    pmc->sfcount[MCAST_EXCLUDE] !=
+		    psf->sf_count[MCAST_EXCLUDE])
+			continue;
+		for (i=0; i<nsrcs; i++) {
+			if (srcs[i] == psf->sf_inaddr) {
+				scount++;
+				break;
+			}
+		}
+	}
+	pmc->gsquery = 0;
+	if (scount == nsrcs)	/* all sources excluded */
+		return 0;
+	return 1;
+}
+
+/*
+ * Mark the sources of @pmc that appear in @srcs (length @nsrcs) so that
+ * they are reported (sf_gsresp = 1).  EXCLUDE-mode groups are delegated to
+ * igmp_xmarksources().
+ *
+ * Returns 1 and sets pmc->gsquery when at least one source was marked;
+ * otherwise clears pmc->gsquery and returns 0.
+ */
+static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
+{
+	struct ip_sf_list *psf;
+	int idx, marked;
+
+	if (pmc->sfmode == MCAST_EXCLUDE)
+		return igmp_xmarksources(pmc, nsrcs, srcs);
+
+	/* mark INCLUDE-mode sources */
+	marked = 0;
+	for (psf = pmc->sources; psf; psf = psf->sf_next) {
+		if (marked == nsrcs)
+			break;
+		for (idx = 0; idx < nsrcs; idx++) {
+			if (srcs[idx] != psf->sf_inaddr)
+				continue;
+			psf->sf_gsresp = 1;
+			marked++;
+			break;
+		}
+	}
+
+	if (marked) {
+		pmc->gsquery = 1;
+		return 1;
+	}
+	pmc->gsquery = 0;
+	return 0;
+}
+
+/*
+ * Another host reported membership of @group: stop our own report
+ * timer for the first matching group on this interface.  Reports for
+ * IGMP_ALL_HOSTS are ignored.
+ */
+static void igmp_heard_report(struct in_device *in_dev, __be32 group)
+{
+	struct ip_mc_list *grp;
+
+	/* Timers are only set for non-local groups */
+	if (group != IGMP_ALL_HOSTS) {
+		rcu_read_lock();
+		for_each_pmc_rcu(in_dev, grp) {
+			if (grp->multiaddr != group)
+				continue;
+			igmp_stop_timer(grp);
+			break;
+		}
+		rcu_read_unlock();
+	}
+}
+
+/*
+ * Process a received membership query.
+ *
+ * @in_dev: interface the query arrived on
+ * @skb:    the packet; never freed here (the caller does that)
+ * @len:    IGMP message length as sampled by the caller
+ *
+ * The first half classifies the query and derives max_delay:
+ *   - len == 8: a v1 query when the code field is 0, a v2 query
+ *     otherwise.  The matching "router present" deadline is refreshed,
+ *     and pending v3 interface-change state is cancelled.
+ *   - len < 12 (and not 8): ignored.
+ *   - longer queries are v3 queries, handled in v1/v2 compatibility
+ *     style while an older querier is still considered present.  A pure
+ *     v3 general query only starts the general-query timer and returns.
+ *
+ * The second half walks the interface's groups and (re)schedules the
+ * report timer of each group the query applies to.
+ */
+static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+	int len)
+{
+	struct igmphdr 		*ih = igmp_hdr(skb);
+	struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
+	struct ip_mc_list	*im;
+	__be32			group = ih->group;
+	int			max_delay;
+	int			mark = 0;
+
+
+	if (len == 8) {
+		if (ih->code == 0) {
+			/* Alas, old v1 router presents here. */
+
+			max_delay = IGMP_Query_Response_Interval;
+			in_dev->mr_v1_seen = jiffies +
+				IGMP_V1_Router_Present_Timeout;
+			/* treated as a general query: applies to every group */
+			group = 0;
+		} else {
+			/* v2 router present */
+			max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+			in_dev->mr_v2_seen = jiffies +
+				IGMP_V2_Router_Present_Timeout;
+		}
+		/* cancel the interface change timer */
+		in_dev->mr_ifc_count = 0;
+		if (del_timer(&in_dev->mr_ifc_timer))
+			__in_dev_put(in_dev);
+		/* clear deleted report items */
+		igmpv3_clear_delrec(in_dev);
+	} else if (len < 12) {
+		return;	/* ignore bogus packet; freed by caller */
+	} else if (IGMP_V1_SEEN(in_dev)) {
+		/* This is a v3 query with v1 queriers present */
+		max_delay = IGMP_Query_Response_Interval;
+		group = 0;
+	} else if (IGMP_V2_SEEN(in_dev)) {
+		/* this is a v3 query with v2 queriers present;
+		 * Interpretation of the max_delay code is problematic here.
+		 * A real v2 host would use ih_code directly, while v3 has a
+		 * different encoding. We use the v3 encoding as more likely
+		 * to be intended in a v3 query.
+		 */
+		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
+	} else { /* v3 */
+		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+			return;
+
+		/* re-read the header pointer after each successful pull */
+		ih3 = igmpv3_query_hdr(skb);
+		if (ih3->nsrcs) {
+			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
+				return;
+			ih3 = igmpv3_query_hdr(skb);
+		}
+
+		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
+		in_dev->mr_maxdelay = max_delay;
+		/* a zero QRV in the query leaves the current value alone */
+		if (ih3->qrv)
+			in_dev->mr_qrv = ih3->qrv;
+		if (!group) { /* general query */
+			if (ih3->nsrcs)
+				return;	/* no sources allowed */
+			igmp_gq_start_timer(in_dev);
+			return;
+		}
+		/* mark sources to include, if group & source-specific */
+		mark = ih3->nsrcs != 0;
+	}
+
+	/*
+	 * - Start the timers in all of our membership records
+	 *   that the query applies to for the interface on
+	 *   which the query arrived excl. those that belong
+	 *   to a "local" group (224.0.0.X)
+	 * - For timers already running check if they need to
+	 *   be reset.
+	 * - Use the igmp->igmp_code field as the maximum
+	 *   delay possible
+	 */
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, im) {
+		int changed;
+
+		if (group && group != im->multiaddr)
+			continue;
+		if (im->multiaddr == IGMP_ALL_HOSTS)
+			continue;
+		spin_lock_bh(&im->lock);
+		/*
+		 * With a response already pending, gsquery stays set only
+		 * if this query is source-specific too; otherwise it simply
+		 * takes the value of mark.
+		 */
+		if (im->tm_running)
+			im->gsquery = im->gsquery && mark;
+		else
+			im->gsquery = mark;
+		/* igmp_marksources() is only reached when gsquery is set */
+		changed = !im->gsquery ||
+			igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+		spin_unlock_bh(&im->lock);
+		if (changed)
+			igmp_mod_timer(im, max_delay);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * IGMP receive entry point; called in rcu_read_lock() section.
+ *
+ * Drops the packet when the device has no in_device, the fixed header
+ * cannot be pulled, or the checksum check fails.  Queries and v1/v2
+ * reports are dispatched to their handlers; with CONFIG_IP_PIMSM_V1 a
+ * PIM message is handed to pim_rcv_v1(), whose result is returned.
+ * In every other case the skb is freed here and 0 is returned.
+ */
+int igmp_rcv(struct sk_buff *skb)
+{
+	/* This basically follows the spec line by line -- see RFC1112 */
+	struct in_device *idev = __in_dev_get_rcu(skb->dev);
+	int pktlen = skb->len;
+	struct igmphdr *hdr;
+	int need_csum;
+
+	if (!idev)
+		goto drop;
+
+	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+		goto drop;
+
+	/*
+	 * Checksum in software for CHECKSUM_NONE, and for
+	 * CHECKSUM_COMPLETE whose folded sum is non-zero.
+	 */
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		need_csum = csum_fold(skb->csum) != 0;
+	else
+		need_csum = skb->ip_summed == CHECKSUM_NONE;
+	if (need_csum) {
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb))
+			goto drop;
+	}
+
+	hdr = igmp_hdr(skb);
+	switch (hdr->type) {
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		igmp_heard_query(idev, skb, pktlen);
+		break;
+	case IGMP_HOST_MEMBERSHIP_REPORT:
+	case IGMPV2_HOST_MEMBERSHIP_REPORT:
+		/* Is it our report looped back? */
+		if (rt_is_output_route(skb_rtable(skb)))
+			break;
+		/* don't rely on MC router hearing unicast reports */
+		if (skb->pkt_type == PACKET_MULTICAST ||
+		    skb->pkt_type == PACKET_BROADCAST)
+			igmp_heard_report(idev, hdr->group);
+		break;
+#ifdef CONFIG_IP_PIMSM_V1
+	case IGMP_PIM:
+		return pim_rcv_v1(skb);
+#endif
+	default:
+		/*
+		 * v3 reports, DVMRP, trace, leave and mtrace messages (and
+		 * PIM without CONFIG_IP_PIMSM_V1) are silently ignored.
+		 */
+		break;
+	}
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+#endif
+
+
+/*
+ *	Add a filter to a device: translate the group address with
+ *	arp_mc_map() and, when that returns 0, pass the resulting
+ *	address to dev_mc_add().
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
+{
+	struct net_device *netdev = in_dev->dev;
+	char hwaddr[MAX_ADDR_LEN];
+
+	/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+	   We will get multicast token leakage, when IFF_MULTICAST
+	   is changed. This check should be done in dev->set_multicast_list
+	   routine. Something sort of:
+	   if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+	   --ANK
+	   */
+	if (arp_mc_map(addr, hwaddr, netdev, 0) != 0)
+		return;
+	dev_mc_add(netdev, hwaddr);
+}
+
+/*
+ *	Remove a filter from a device: translate the group address with
+ *	arp_mc_map() and, when that returns 0, pass the resulting
+ *	address to dev_mc_del().
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
+{
+	struct net_device *netdev = in_dev->dev;
+	char hwaddr[MAX_ADDR_LEN];
+
+	if (arp_mc_map(addr, hwaddr, netdev, 0) != 0)
+		return;
+	dev_mc_del(netdev, hwaddr);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+/*
+ * Queue a "deleted group" record for @im on the device's tomb list.
+ * The record copies the group's interface, address and filter mode,
+ * takes a device reference and gets a fresh change-report count.  For
+ * an INCLUDE-mode group the record also takes over both source lists
+ * of @im.  An allocation failure is silently ignored.
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+	struct ip_mc_list *rec;
+	struct ip_sf_list *sf;
+
+	/* this is an "ip_mc_list" for convenience; only the fields below
+	 * are actually used. In particular, the refcnt and users are not
+	 * used for management of the delete list. Using the same structure
+	 * for deleted items allows change reports to use common code with
+	 * non-deleted or query-response MCA's.
+	 */
+	rec = kzalloc(sizeof(*rec), GFP_KERNEL);
+	if (rec == NULL)
+		return;
+
+	spin_lock_bh(&im->lock);
+	rec->interface = im->interface;
+	in_dev_hold(in_dev);
+	rec->multiaddr = im->multiaddr;
+	if (in_dev->mr_qrv)
+		rec->crcount = in_dev->mr_qrv;
+	else
+		rec->crcount = IGMP_Unsolicited_Report_Count;
+	rec->sfmode = im->sfmode;
+	if (rec->sfmode == MCAST_INCLUDE) {
+		/* move both source lists from the live entry to the record */
+		rec->tomb = im->tomb;
+		rec->sources = im->sources;
+		im->tomb = NULL;
+		im->sources = NULL;
+		for (sf = rec->sources; sf != NULL; sf = sf->sf_next)
+			sf->sf_crcount = rec->crcount;
+	}
+	spin_unlock_bh(&im->lock);
+
+	/* push onto the head of the device's tomb list */
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	rec->next = in_dev->mc_tomb;
+	in_dev->mc_tomb = rec;
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
+/*
+ * Remove the first deleted-group record for @multiaddr from the
+ * device's tomb list, if there is one, and release it.
+ *
+ * The record is unlinked under mc_tomb_lock and freed afterwards.
+ * igmpv3_add_delrec() hands an INCLUDE-mode record both the "tomb" and
+ * the "sources" list of the group it replaces, so both lists are freed
+ * here (through ip_mc_clear_src(), as igmpv3_clear_delrec() does)
+ * before the record itself.  Freeing only the tomb list would leak any
+ * source entries still queued on the record.
+ */
+static void igmpv3_del_delrec(struct in_device *in_dev, __be32 multiaddr)
+{
+	struct ip_mc_list *pmc, *pmc_prev;
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc_prev = NULL;
+	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+		if (pmc->multiaddr == multiaddr)
+			break;
+		pmc_prev = pmc;
+	}
+	if (pmc) {
+		if (pmc_prev)
+			pmc_prev->next = pmc->next;
+		else
+			in_dev->mc_tomb = pmc->next;
+	}
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+	if (pmc) {
+		/* releases pmc->tomb and pmc->sources */
+		ip_mc_clear_src(pmc);
+		in_dev_put(pmc->interface);
+		kfree(pmc);
+	}
+}
+
+/*
+ * Discard all pending deletion state on a device: every record on the
+ * device's tomb list (with its source lists and device reference) and
+ * the tomb source list of every group still on the interface.
+ */
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc, *rec;
+
+	/* detach the whole list under the lock, free it afterwards */
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	rec = in_dev->mc_tomb;
+	in_dev->mc_tomb = NULL;
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	while (rec) {
+		struct ip_mc_list *following = rec->next;
+
+		ip_mc_clear_src(rec);
+		in_dev_put(rec->interface);
+		kfree(rec);
+		rec = following;
+	}
+
+	/* clear dead sources, too */
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
+		struct ip_sf_list *dead;
+
+		spin_lock_bh(&pmc->lock);
+		dead = pmc->tomb;
+		pmc->tomb = NULL;
+		spin_unlock_bh(&pmc->lock);
+
+		while (dead) {
+			struct ip_sf_list *following = dead->sf_next;
+
+			kfree(dead);
+			dead = following;
+		}
+	}
+	rcu_read_unlock();
+}
+#endif
+
+/*
+ * A group is going away on its interface.  Removes the hardware filter
+ * if one was loaded.  With CONFIG_IP_MULTICAST, and for any group other
+ * than IGMP_ALL_HOSTS, it also stops the report timer and, unless the
+ * device is dead, signals the departure: nothing for v1, a leave
+ * message for v2 (only if we were the reporter), or a deletion record
+ * plus an interface-change event for v3.
+ */
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+	struct in_device *idev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+	int was_reporter;
+#endif
+
+	if (im->loaded) {
+		im->loaded = 0;
+		ip_mc_filter_del(idev, im->multiaddr);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+
+	/* sampled before igmp_stop_timer() gets to touch the group */
+	was_reporter = im->reporter;
+	igmp_stop_timer(im);
+
+	if (idev->dead)
+		return;
+
+	if (IGMP_V1_SEEN(idev)) {
+		/* v1 has no way to announce a leave */
+	} else if (IGMP_V2_SEEN(idev)) {
+		if (was_reporter)
+			igmp_send_report(idev, im, IGMP_HOST_LEAVE_MESSAGE);
+	} else {
+		/* IGMPv3 */
+		igmpv3_add_delrec(idev, im);
+		igmp_ifc_event(idev);
+	}
+#endif
+}
+
+/*
+ * A group became active on its interface.  Loads the hardware filter
+ * if it is not loaded yet.  With CONFIG_IP_MULTICAST, for groups other
+ * than IGMP_ALL_HOSTS on a live device, it also schedules the
+ * announcement: the per-group report timer in v1/v2 compatibility
+ * mode, or a reloaded change-report count plus an interface-change
+ * event for v3.
+ */
+static void igmp_group_added(struct ip_mc_list *im)
+{
+	struct in_device *idev = im->interface;
+
+	if (!im->loaded) {
+		im->loaded = 1;
+		ip_mc_filter_add(idev, im->multiaddr);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+	if (idev->dead)
+		return;
+
+	if (!IGMP_V1_SEEN(idev) && !IGMP_V2_SEEN(idev)) {
+		/* v3 */
+		if (idev->mr_qrv)
+			im->crcount = idev->mr_qrv;
+		else
+			im->crcount = IGMP_Unsolicited_Report_Count;
+		igmp_ifc_event(idev);
+		return;
+	}
+
+	/* an older querier is around: use the per-group report timer */
+	spin_lock_bh(&im->lock);
+	igmp_start_timer(im, IGMP_Initial_Report_Delay);
+	spin_unlock_bh(&im->lock);
+#endif
+}
+
+
+/*
+ *	Multicast list managers
+ */
+
+
+/*
+ *	A socket has joined a multicast group on device dev.
+ *
+ *	Must be called with RTNL held.  An existing entry only gains a
+ *	user and an (EXCLUDE, empty) filter reference; otherwise a new
+ *	entry is allocated, put at the head of the device's group list
+ *	and announced.  Allocation failure is silent.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, grp) {
+		if (grp->multiaddr != addr)
+			continue;
+		grp->users++;
+		ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+		return;
+	}
+
+	grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+	if (grp == NULL)
+		return;
+
+	grp->users = 1;
+	grp->interface = in_dev;
+	in_dev_hold(in_dev);
+	grp->multiaddr = addr;
+	/* initial mode is (EX, empty) */
+	grp->sfmode = MCAST_EXCLUDE;
+	grp->sfcount[MCAST_EXCLUDE] = 1;
+	atomic_set(&grp->refcnt, 1);
+	spin_lock_init(&grp->lock);
+#ifdef CONFIG_IP_MULTICAST
+	setup_timer(&grp->timer, &igmp_timer_expire, (unsigned long)grp);
+	grp->unsolicit_count = IGMP_Unsolicited_Report_Count;
+#endif
+
+	/* fully initialised: publish at the head of the device list */
+	grp->next_rcu = in_dev->mc_list;
+	in_dev->mc_count++;
+	rcu_assign_pointer(in_dev->mc_list, grp);
+
+#ifdef CONFIG_IP_MULTICAST
+	igmpv3_del_delrec(in_dev, grp->multiaddr);
+#endif
+	igmp_group_added(grp);
+	if (!in_dev->dead)
+		ip_rt_multicast_event(in_dev);
+}
+EXPORT_SYMBOL(ip_mc_inc_group);
+
+/*
+ *	Resend IGMP JOIN report; used for bonding.
+ *	Called with rcu_read_lock()
+ *
+ *	One report is sent straight away for every group on the device
+ *	except IGMP_ALL_HOSTS, in the format matching the oldest querier
+ *	version currently seen.  Does nothing without CONFIG_IP_MULTICAST.
+ */
+void ip_mc_rejoin_groups(struct in_device *in_dev)
+{
+#ifdef CONFIG_IP_MULTICAST
+	struct ip_mc_list *grp;
+
+	for_each_pmc_rcu(in_dev, grp) {
+		int type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+
+		if (grp->multiaddr == IGMP_ALL_HOSTS)
+			continue;
+
+		/* a failover is happening and switches
+		 * must be notified immediately
+		 */
+		if (IGMP_V1_SEEN(in_dev))
+			type = IGMP_HOST_MEMBERSHIP_REPORT;
+		else if (IGMP_V2_SEEN(in_dev))
+			type = IGMPV2_HOST_MEMBERSHIP_REPORT;
+		igmp_send_report(in_dev, grp, type);
+	}
+#endif
+}
+EXPORT_SYMBOL(ip_mc_rejoin_groups);
+
+/*
+ *	A socket has left a multicast group on device dev
+ *
+ *	Must be called with RTNL held.  Drops one user from the first
+ *	entry matching @addr; when that was the last user the entry is
+ *	unlinked, torn down and released with ip_ma_put().  An unknown
+ *	address is ignored.
+ */
+
+void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
+{
+	struct ip_mc_list __rcu **link = &in_dev->mc_list;
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	/* find the entry together with the pointer that links to it */
+	while ((grp = rtnl_dereference(*link)) != NULL &&
+	       grp->multiaddr != addr)
+		link = &grp->next_rcu;
+
+	if (grp == NULL)
+		return;
+	if (--grp->users != 0)
+		return;
+
+	*link = grp->next_rcu;
+	in_dev->mc_count--;
+	igmp_group_dropped(grp);
+	ip_mc_clear_src(grp);
+
+	if (!in_dev->dead)
+		ip_rt_multicast_event(in_dev);
+
+	ip_ma_put(grp);
+}
+EXPORT_SYMBOL(ip_mc_dec_group);
+
+/*
+ * Device changing type: run igmp_group_dropped() on every group of the
+ * device.  Must be called with RTNL held.
+ */
+
+void ip_mc_unmap(struct in_device *in_dev)
+{
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, grp) {
+		igmp_group_dropped(grp);
+	}
+}
+
+/*
+ * Counterpart of ip_mc_unmap(): run igmp_group_added() on every group
+ * of the device.  Must be called with RTNL held.
+ */
+void ip_mc_remap(struct in_device *in_dev)
+{
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, grp) {
+		igmp_group_added(grp);
+	}
+}
+
+/*
+ * Device going down.  Must be called with RTNL held.  Drops every
+ * group on the device, cancels the interface-change and general-query
+ * timers (calling __in_dev_put() for each timer that was pending),
+ * clears the deletion records and finally leaves IGMP_ALL_HOSTS.
+ */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	for_each_pmc_rtnl(in_dev, grp) {
+		igmp_group_dropped(grp);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	in_dev->mr_ifc_count = 0;
+	if (del_timer(&in_dev->mr_ifc_timer) != 0)
+		__in_dev_put(in_dev);
+
+	in_dev->mr_gq_running = 0;
+	if (del_timer(&in_dev->mr_gq_timer) != 0)
+		__in_dev_put(in_dev);
+
+	igmpv3_clear_delrec(in_dev);
+#endif
+
+	ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+/*
+ * Initialise the multicast state of a new in_device: an empty tomb
+ * list with its lock and, with CONFIG_IP_MULTICAST, the general-query
+ * and interface-change timers, zeroed counters and the default
+ * mr_qrv.  Must be called with RTNL held.
+ */
+void ip_mc_init_dev(struct in_device *in_dev)
+{
+	ASSERT_RTNL();
+
+	in_dev->mc_tomb = NULL;
+	spin_lock_init(&in_dev->mc_tomb_lock);
+
+#ifdef CONFIG_IP_MULTICAST
+	in_dev->mc_count = 0;
+	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
+
+	in_dev->mr_gq_running = 0;
+	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
+		    (unsigned long)in_dev);
+
+	in_dev->mr_ifc_count = 0;
+	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
+		    (unsigned long)in_dev);
+#endif
+}
+
+/*
+ * Device going up: join IGMP_ALL_HOSTS, then run igmp_group_added() on
+ * every group of the device.  Must be called with RTNL held.
+ */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+	for_each_pmc_rtnl(in_dev, grp) {
+		igmp_group_added(grp);
+	}
+}
+
+/*
+ *	Device is about to be destroyed: clean up.
+ *
+ *	Must be called with RTNL held.  Takes the multicast side down
+ *	with ip_mc_down(), then pops every remaining group off the
+ *	device list, freeing its source lists and releasing it with
+ *	ip_ma_put().
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+	struct ip_mc_list *grp;
+
+	ASSERT_RTNL();
+
+	/* Deactivate timers */
+	ip_mc_down(in_dev);
+
+	for (;;) {
+		grp = rtnl_dereference(in_dev->mc_list);
+		if (grp == NULL)
+			break;
+
+		in_dev->mc_list = grp->next_rcu;
+		in_dev->mc_count--;
+
+		/* We've dropped the groups in ip_mc_down already */
+		ip_mc_clear_src(grp);
+		ip_ma_put(grp);
+	}
+}
+
+/*
+ * Resolve the interface a membership request refers to.  RTNL is locked.
+ *
+ * A non-zero imr_ifindex wins and is looked up directly.  Otherwise a
+ * non-zero imr_address selects the device via __ip_dev_find(); with
+ * neither given, the device is taken from an output route lookup for
+ * the group address.  In the last two cases imr->imr_ifindex is filled
+ * in with the index of the device found.  Returns NULL when nothing
+ * suitable is found.
+ */
+static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
+{
+	struct net_device *netdev = NULL;
+
+	if (imr->imr_ifindex)
+		return inetdev_by_index(net, imr->imr_ifindex);
+
+	if (imr->imr_address.s_addr) {
+		netdev = __ip_dev_find(net, imr->imr_address.s_addr, false);
+	} else {
+		struct rtable *rt = ip_route_output(net,
+						    imr->imr_multiaddr.s_addr,
+						    0, 0, 0);
+		if (!IS_ERR(rt)) {
+			netdev = rt->dst.dev;
+			ip_rt_put(rt);
+		}
+	}
+
+	if (!netdev)
+		return NULL;
+
+	imr->imr_ifindex = netdev->ifindex;
+	return __in_dev_get_rtnl(netdev);
+}
+
+/*
+ *	Limits used by the socket-level membership code below:
+ *	sysctl_igmp_max_memberships is compared against the number of
+ *	groups already joined on a socket, sysctl_igmp_max_msf against the
+ *	number of sources in a socket's per-group filter list.
+ */
+int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
+int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
+
+
+/*
+ * Drop one @sfmode reference from the source *psfsrc on a group's
+ * source list.
+ *
+ * Returns -ESRCH when the source is not listed or has no reference in
+ * that mode.  Once a source has no references in either mode it is
+ * unlinked; with CONFIG_IP_MULTICAST it is parked on the group's tomb
+ * list (return value 1) if it was marked sf_oldin and no v1/v2 querier
+ * is present.  Otherwise it is freed.  All remaining cases return 0.
+ */
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+	__be32 *psfsrc)
+{
+	struct ip_sf_list *sf = pmc->sources;
+	struct ip_sf_list *before = NULL;
+#ifdef CONFIG_IP_MULTICAST
+	struct in_device *in_dev;
+#endif
+
+	while (sf && sf->sf_inaddr != *psfsrc) {
+		before = sf;
+		sf = sf->sf_next;
+	}
+	if (!sf || sf->sf_count[sfmode] == 0) {
+		/* source filter not found, or count wrong =>  bug */
+		return -ESRCH;
+	}
+
+	if (--sf->sf_count[sfmode] == 0)
+		ip_rt_multicast_event(pmc->interface);
+
+	if (sf->sf_count[MCAST_INCLUDE] || sf->sf_count[MCAST_EXCLUDE])
+		return 0;
+
+	/* no more filters for this source */
+	if (before)
+		before->sf_next = sf->sf_next;
+	else
+		pmc->sources = sf->sf_next;
+
+#ifdef CONFIG_IP_MULTICAST
+	in_dev = pmc->interface;
+	if (sf->sf_oldin &&
+	    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+		if (in_dev->mr_qrv)
+			sf->sf_crcount = in_dev->mr_qrv;
+		else
+			sf->sf_crcount = IGMP_Unsolicited_Report_Count;
+		sf->sf_next = pmc->tomb;
+		pmc->tomb = sf;
+		return 1;
+	}
+#endif
+	kfree(sf);
+	return 0;
+}
+
+/* Without CONFIG_IP_MULTICAST, igmp_ifc_event() expands to an empty statement. */
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x)	do { } while (0)
+#endif
+
+/*
+ * Remove source filters from a group on an interface.
+ *
+ * @in_dev:  interface; NULL yields -ENODEV
+ * @pmca:    group address to look up on the interface
+ * @sfmode:  filter mode (MCAST_INCLUDE / MCAST_EXCLUDE) being released
+ * @sfcount: number of entries in @psfsrc
+ * @psfsrc:  source addresses to release
+ * @delta:   when zero, the group's sfcount[sfmode] reference is dropped
+ *           as well; when non-zero only per-source counts change
+ *
+ * Returns -ESRCH when the group is not on the interface, and -EINVAL
+ * when @delta is zero but the group holds no reference in @sfmode.
+ * Otherwise it returns the first negative result of ip_mc_del1_src(),
+ * or 0.  The group lock is held for the whole update.
+ */
+static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+			 int sfcount, __be32 *psfsrc, int delta)
+{
+	struct ip_mc_list *pmc;
+	int	changerec = 0;
+	int	i, err;
+
+	if (!in_dev)
+		return -ENODEV;
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
+		if (*pmca == pmc->multiaddr)
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	/* the group lock is taken before the RCU read section ends */
+	spin_lock_bh(&pmc->lock);
+	rcu_read_unlock();
+#ifdef CONFIG_IP_MULTICAST
+	/* record the current state of every source in sf_oldin */
+	sf_markstate(pmc);
+#endif
+	if (!delta) {
+		err = -EINVAL;
+		if (!pmc->sfcount[sfmode])
+			goto out_unlock;
+		pmc->sfcount[sfmode]--;
+	}
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+		/* rv > 0: a source was moved to the tomb list */
+		changerec |= rv > 0;
+		/* keep the first error but go on releasing the rest */
+		if (!err && rv < 0)
+			err = rv;
+	}
+	if (pmc->sfmode == MCAST_EXCLUDE &&
+	    pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+	    pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+		struct ip_sf_list *psf;
+#endif
+
+		/* filter mode change */
+		pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+		pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+			IGMP_Unsolicited_Report_Count;
+		in_dev->mr_ifc_count = pmc->crcount;
+		for (psf=pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		igmp_ifc_event(pmc->interface);
+	} else if (sf_setstate(pmc) || changerec) {
+		igmp_ifc_event(pmc->interface);
+#endif
+	}
+out_unlock:
+	spin_unlock_bh(&pmc->lock);
+	return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ *
+ * Takes one @sfmode reference on the source *psfsrc of a group,
+ * appending a new zeroed entry to the group's source list when the
+ * address is not listed yet.  Returns 0, or -ENOBUFS when that entry
+ * cannot be allocated.  @delta is not used here.
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+	__be32 *psfsrc, int delta)
+{
+	struct ip_sf_list *sf = pmc->sources;
+	struct ip_sf_list *last = NULL;
+
+	while (sf && sf->sf_inaddr != *psfsrc) {
+		last = sf;
+		sf = sf->sf_next;
+	}
+	if (sf == NULL) {
+		sf = kzalloc(sizeof(*sf), GFP_ATOMIC);
+		if (sf == NULL)
+			return -ENOBUFS;
+		sf->sf_inaddr = *psfsrc;
+		if (last)
+			last->sf_next = sf;
+		else
+			pmc->sources = sf;
+	}
+	/* first reference in this mode */
+	if (++sf->sf_count[sfmode] == 1)
+		ip_rt_multicast_event(pmc->interface);
+	return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * Record in sf_oldin, for every source of the group, whether its
+ * filter is currently in effect.  While the group has EXCLUDE users
+ * that means "excluded by all of them and included by nobody";
+ * otherwise it means "included by at least one user".
+ */
+static void sf_markstate(struct ip_mc_list *pmc)
+{
+	int excl_users = pmc->sfcount[MCAST_EXCLUDE];
+	struct ip_sf_list *sf;
+
+	for (sf = pmc->sources; sf != NULL; sf = sf->sf_next) {
+		if (!pmc->sfcount[MCAST_EXCLUDE]) {
+			sf->sf_oldin = sf->sf_count[MCAST_INCLUDE] != 0;
+			continue;
+		}
+		sf->sf_oldin = excl_users == sf->sf_count[MCAST_EXCLUDE] &&
+			!sf->sf_count[MCAST_INCLUDE];
+	}
+}
+
+/*
+ * Compare each source's current filter state with the state recorded
+ * in sf_oldin and queue change reports for the differences.
+ *
+ * A source whose filter became active has any matching entry removed
+ * from the group's tomb list and gets sf_crcount set to the
+ * interface's mr_qrv.  A source whose filter became inactive gets a
+ * tomb-list entry with that count, copied from the live entry if none
+ * existed.
+ *
+ * Returns the number of sources for which a change was queued.  A
+ * newly inactive source whose tomb entry cannot be allocated is
+ * skipped and not counted.
+ */
+static int sf_setstate(struct ip_mc_list *pmc)
+{
+	struct ip_sf_list *psf, *dpsf;
+	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+	int qrv = pmc->interface->mr_qrv;
+	int new_in, rv;
+
+	rv = 0;
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		/* same "is this filter in effect" test as sf_markstate() */
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+		if (new_in) {
+			if (!psf->sf_oldin) {
+				struct ip_sf_list *prev = NULL;
+
+				/* drop a pending "delete" record for this source */
+				for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next) {
+					if (dpsf->sf_inaddr == psf->sf_inaddr)
+						break;
+					prev = dpsf;
+				}
+				if (dpsf) {
+					if (prev)
+						prev->sf_next = dpsf->sf_next;
+					else
+						pmc->tomb = dpsf->sf_next;
+					kfree(dpsf);
+				}
+				psf->sf_crcount = qrv;
+				rv++;
+			}
+		} else if (psf->sf_oldin) {
+
+			psf->sf_crcount = 0;
+			/*
+			 * add or update "delete" records if an active filter
+			 * is now inactive
+			 */
+			for (dpsf=pmc->tomb; dpsf; dpsf=dpsf->sf_next)
+				if (dpsf->sf_inaddr == psf->sf_inaddr)
+					break;
+			if (!dpsf) {
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				if (!dpsf)
+					continue;
+				/* start from a copy of the live entry */
+				*dpsf = *psf;
+				/* pmc->lock held by callers */
+				dpsf->sf_next = pmc->tomb;
+				pmc->tomb = dpsf;
+			}
+			dpsf->sf_crcount = qrv;
+			rv++;
+		}
+	}
+	return rv;
+}
+#endif
+
+/*
+ * Add multicast source filter list to the interface list
+ *
+ * @in_dev:  interface; NULL yields -ENODEV
+ * @pmca:    group address to look up on the interface
+ * @sfmode:  filter mode (MCAST_INCLUDE / MCAST_EXCLUDE) being added
+ * @sfcount: number of entries in @psfsrc
+ * @psfsrc:  source addresses to add
+ * @delta:   when zero, the group's sfcount[sfmode] reference is taken
+ *           as well; when non-zero only per-source counts change
+ *
+ * Returns 0 on success, -ESRCH when the group is not on the interface,
+ * or the error from ip_mc_add1_src().  On such an error everything
+ * done so far is backed out: the sources already added are removed
+ * again and the group reference is dropped.  That reference is only
+ * dropped if this call took it (i.e. @delta is zero), so a failed
+ * delta-style add cannot unbalance the group's per-mode count.
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
+			 int sfcount, __be32 *psfsrc, int delta)
+{
+	struct ip_mc_list *pmc;
+	int	isexclude;
+	int	i, err;
+
+	if (!in_dev)
+		return -ENODEV;
+	rcu_read_lock();
+	for_each_pmc_rcu(in_dev, pmc) {
+		if (*pmca == pmc->multiaddr)
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	/* the group lock is taken before the RCU read section ends */
+	spin_lock_bh(&pmc->lock);
+	rcu_read_unlock();
+
+#ifdef CONFIG_IP_MULTICAST
+	/* record the current state of every source in sf_oldin */
+	sf_markstate(pmc);
+#endif
+	isexclude = pmc->sfmode == MCAST_EXCLUDE;
+	if (!delta)
+		pmc->sfcount[sfmode]++;
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+		if (err)
+			break;
+	}
+	if (err) {
+		int j;
+
+		/* undo only what this call did */
+		if (!delta)
+			pmc->sfcount[sfmode]--;
+		for (j=0; j<i; j++)
+			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+		struct ip_sf_list *psf;
+		in_dev = pmc->interface;
+#endif
+
+		/* filter mode change */
+		if (pmc->sfcount[MCAST_EXCLUDE])
+			pmc->sfmode = MCAST_EXCLUDE;
+		else if (pmc->sfcount[MCAST_INCLUDE])
+			pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+		/* else no filters; keep old mode for reports */
+
+		pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+			IGMP_Unsolicited_Report_Count;
+		in_dev->mr_ifc_count = pmc->crcount;
+		for (psf=pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		igmp_ifc_event(in_dev);
+	} else if (sf_setstate(pmc)) {
+		igmp_ifc_event(in_dev);
+#endif
+	}
+	spin_unlock_bh(&pmc->lock);
+	return err;
+}
+
+/*
+ * Free both source lists of a group entry (tomb first, then sources)
+ * and reset its filter state to (EXCLUDE, empty) with a single
+ * EXCLUDE reference.
+ */
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{
+	struct ip_sf_list *sf, *following;
+
+	sf = pmc->tomb;
+	while (sf) {
+		following = sf->sf_next;
+		kfree(sf);
+		sf = following;
+	}
+	pmc->tomb = NULL;
+
+	sf = pmc->sources;
+	while (sf) {
+		following = sf->sf_next;
+		kfree(sf);
+		sf = following;
+	}
+	pmc->sources = NULL;
+
+	pmc->sfmode = MCAST_EXCLUDE;
+	pmc->sfcount[MCAST_INCLUDE] = 0;
+	pmc->sfcount[MCAST_EXCLUDE] = 1;
+}
+
+
+/*
+ * Join a multicast group
+ *
+ * Takes RTNL itself.  Returns -EINVAL for a non-multicast address,
+ * -ENODEV when no interface can be resolved, -EADDRINUSE when the
+ * socket already joined this group on that interface, and -ENOBUFS
+ * when the membership limit is reached or the allocation fails.
+ * On success the membership is put at the head of the socket's list in
+ * (EXCLUDE, empty) mode and the interface group is referenced.
+ */
+int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+{
+	__be32 group = imr->imr_multiaddr.s_addr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ip_mc_socklist *entry, *cur;
+	struct in_device *idev;
+	int memberships = 0;
+	int ifindex;
+	int err;
+
+	if (!ipv4_is_multicast(group))
+		return -EINVAL;
+
+	rtnl_lock();
+
+	idev = ip_mc_find_dev(net, imr);
+	if (!idev) {
+		err = -ENODEV;
+		goto done;
+	}
+
+	ifindex = imr->imr_ifindex;
+	for_each_pmc_rtnl(inet, cur) {
+		if (cur->multi.imr_multiaddr.s_addr == group &&
+		    cur->multi.imr_ifindex == ifindex) {
+			err = -EADDRINUSE;
+			goto done;
+		}
+		memberships++;
+	}
+
+	err = -ENOBUFS;
+	if (memberships >= sysctl_igmp_max_memberships)
+		goto done;
+	entry = sock_kmalloc(sk, sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		goto done;
+
+	memcpy(&entry->multi, imr, sizeof(*imr));
+	entry->next_rcu = inet->mc_list;
+	entry->sflist = NULL;
+	entry->sfmode = MCAST_EXCLUDE;
+	rcu_assign_pointer(inet->mc_list, entry);
+	ip_mc_inc_group(idev, group);
+	err = 0;
+done:
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL(ip_mc_join_group);
+
+/*
+ * Release the interface-level source filters held by one socket
+ * membership.  Without a socket filter list only the membership's mode
+ * reference is dropped.  Otherwise all listed sources are released as
+ * well, and the list is detached, uncharged from the socket and freed
+ * after an RCU grace period.  Returns the result of ip_mc_del_src().
+ */
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+			   struct in_device *in_dev)
+{
+	struct ip_sf_socklist *filter = rtnl_dereference(iml->sflist);
+	__be32 *group = &iml->multi.imr_multiaddr.s_addr;
+	int err;
+
+	/* any-source empty exclude case */
+	if (!filter)
+		return ip_mc_del_src(in_dev, group, iml->sfmode, 0, NULL, 0);
+
+	err = ip_mc_del_src(in_dev, group, iml->sfmode,
+			    filter->sl_count, filter->sl_addr, 0);
+	rcu_assign_pointer(iml->sflist, NULL);
+	/* decrease mem now to avoid the memleak warning */
+	atomic_sub(IP_SFLSIZE(filter->sl_max), &sk->sk_omem_alloc);
+	kfree_rcu(filter, rcu);
+	return err;
+}
+
+/*
+ *	Ask a socket to leave a group.
+ *
+ *	Takes RTNL itself.  The membership is matched on the group
+ *	address plus either the resolved interface index or, when that
+ *	is zero, the request's interface address (if any).  Returns 0
+ *	after removing the membership.  When nothing matches it returns
+ *	-ENODEV if no interface could be resolved, otherwise
+ *	-EADDRNOTAVAIL.
+ */
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	__be32 group = imr->imr_multiaddr.s_addr;
+	struct ip_mc_socklist __rcu **link;
+	struct ip_mc_socklist *entry;
+	struct in_device *idev;
+	u32 ifindex;
+
+	rtnl_lock();
+	idev = ip_mc_find_dev(net, imr);
+	ifindex = imr->imr_ifindex;
+
+	for (link = &inet->mc_list;
+	     (entry = rtnl_dereference(*link)) != NULL;
+	     link = &entry->next_rcu) {
+		int mismatch;
+
+		if (entry->multi.imr_multiaddr.s_addr != group)
+			continue;
+		if (ifindex)
+			mismatch = entry->multi.imr_ifindex != ifindex;
+		else
+			mismatch = imr->imr_address.s_addr &&
+				   imr->imr_address.s_addr !=
+				   entry->multi.imr_address.s_addr;
+		if (!mismatch)
+			break;
+	}
+
+	if (!entry) {
+		rtnl_unlock();
+		return idev ? -EADDRNOTAVAIL : -ENODEV;
+	}
+
+	(void) ip_mc_leave_src(sk, entry, idev);
+
+	*link = entry->next_rcu;
+
+	if (idev)
+		ip_mc_dec_group(idev, group);
+	rtnl_unlock();
+	/* decrease mem now to avoid the memleak warning */
+	atomic_sub(sizeof(*entry), &sk->sk_omem_alloc);
+	kfree_rcu(entry, rcu);
+	return 0;
+}
+
+/*
+ * Add one source to, or remove one source from, a socket's filter for
+ * a group it has already joined.
+ *
+ * @add:     non-zero to add mreqs->imr_sourceaddr, zero to remove it
+ * @omode:   filter mode the operation applies to
+ * @sk:      socket owning the membership
+ * @mreqs:   group, interface address and source address
+ * @ifindex: interface index, or 0 to resolve from the other fields
+ *
+ * Takes RTNL itself.  Returns 0 on success, or:
+ *   -EINVAL         not a multicast group, no prior join, or @omode
+ *                   differs from the mode of an existing source list
+ *   -ENODEV         no interface could be resolved
+ *   -EADDRNOTAVAIL  removing a source that is not listed, or adding
+ *                   one that already is
+ *   -ENOBUFS        source limit reached or allocation failure
+ * Removing the only source of an INCLUDE filter is turned into
+ * ip_mc_leave_group(), whose result is returned.
+ */
+int ip_mc_source(int add, int omode, struct sock *sk, struct
+	ip_mreq_source *mreqs, int ifindex)
+{
+	int err;
+	struct ip_mreqn imr;
+	__be32 addr = mreqs->imr_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+	struct net *net = sock_net(sk);
+	int leavegroup = 0;
+	int i, j, rv;
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	rtnl_lock();
+
+	imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+	imr.imr_address.s_addr = mreqs->imr_interface;
+	imr.imr_ifindex = ifindex;
+	in_dev = ip_mc_find_dev(net, &imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+	/* default result for the "goto done" paths below */
+	err = -EADDRNOTAVAIL;
+
+	for_each_pmc_rtnl(inet, pmc) {
+		if ((pmc->multi.imr_multiaddr.s_addr ==
+		     imr.imr_multiaddr.s_addr) &&
+		    (pmc->multi.imr_ifindex == imr.imr_ifindex))
+			break;
+	}
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
+		goto done;
+	}
+	/* if a source filter was set, must be the same mode as before */
+	if (pmc->sflist) {
+		if (pmc->sfmode != omode) {
+			err = -EINVAL;
+			goto done;
+		}
+	} else if (pmc->sfmode != omode) {
+		/* allow mode switches for empty-set filters */
+		ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
+			NULL, 0);
+		pmc->sfmode = omode;
+	}
+
+	psl = rtnl_dereference(pmc->sflist);
+	if (!add) {
+		if (!psl)
+			goto done;	/* err = -EADDRNOTAVAIL */
+		rv = !0;
+		for (i=0; i<psl->sl_count; i++) {
+			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+				sizeof(__be32));
+			if (rv == 0)
+				break;
+		}
+		if (rv)		/* source not found */
+			goto done;	/* err = -EADDRNOTAVAIL */
+
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
+			goto done;
+		}
+
+		/* update the interface filter */
+		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+			&mreqs->imr_sourceaddr, 1);
+
+		/* close the gap left at index i */
+		for (j=i+1; j<psl->sl_count; j++)
+			psl->sl_addr[j-1] = psl->sl_addr[j];
+		psl->sl_count--;
+		err = 0;
+		goto done;
+	}
+	/* else, add a new source to the filter */
+
+	if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+		err = -ENOBUFS;
+		goto done;
+	}
+	if (!psl || psl->sl_count == psl->sl_max) {
+		/* no list yet, or list full: grow by IP_SFBLOCK entries */
+		struct ip_sf_socklist *newpsl;
+		int count = IP_SFBLOCK;
+
+		if (psl)
+			count += psl->sl_max;
+		newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = count;
+		/* equals the old sl_max, i.e. the old sl_count; 0 if new */
+		newpsl->sl_count = count - IP_SFBLOCK;
+		if (psl) {
+			for (i=0; i<psl->sl_count; i++)
+				newpsl->sl_addr[i] = psl->sl_addr[i];
+			/* decrease mem now to avoid the memleak warning */
+			atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+			kfree_rcu(psl, rcu);
+		}
+		rcu_assign_pointer(pmc->sflist, newpsl);
+		psl = newpsl;
+	}
+	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */
+	for (i=0; i<psl->sl_count; i++) {
+		rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+			sizeof(__be32));
+		if (rv == 0)
+			break;
+	}
+	if (rv == 0)		/* address already there is an error */
+		goto done;
+	/* shift entries from index i upwards to open a slot at i */
+	for (j=psl->sl_count-1; j>=i; j--)
+		psl->sl_addr[j+1] = psl->sl_addr[j];
+	psl->sl_addr[i] = mreqs->imr_sourceaddr;
+	psl->sl_count++;
+	err = 0;
+	/* update the interface list */
+	ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
+		&mreqs->imr_sourceaddr, 1);
+done:
+	rtnl_unlock();
+	/* the leave is done after RTNL is released */
+	if (leavegroup)
+		return ip_mc_leave_group(sk, &imr);
+	return err;
+}
+
+/*
+ * Replace a socket's complete source filter for a joined group.
+ *
+ * @sk:      socket owning the membership
+ * @msf:     requested mode, group, interface address and source list
+ * @ifindex: interface index, or 0 to resolve from the other fields
+ *
+ * Takes RTNL itself.  The new filter is added to the interface state
+ * before the old one is removed.  Returns 0 on success, -EINVAL for a
+ * non-multicast group, an unknown filter mode or a missing prior join,
+ * -ENODEV when no interface can be resolved, -ENOBUFS on allocation
+ * failure, or the error from ip_mc_add_src().  A request for
+ * (INCLUDE, no sources) is turned into ip_mc_leave_group(), whose
+ * result is returned.
+ */
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{
+	int err = 0;
+	struct ip_mreqn	imr;
+	__be32 addr = msf->imsf_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *newpsl, *psl;
+	struct net *net = sock_net(sk);
+	int leavegroup = 0;
+
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+	if (msf->imsf_fmode != MCAST_INCLUDE &&
+	    msf->imsf_fmode != MCAST_EXCLUDE)
+		return -EINVAL;
+
+	rtnl_lock();
+
+	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+	imr.imr_address.s_addr = msf->imsf_interface;
+	imr.imr_ifindex = ifindex;
+	in_dev = ip_mc_find_dev(net, &imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+
+	/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+	if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+		leavegroup = 1;
+		goto done;
+	}
+
+	for_each_pmc_rtnl(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+		    pmc->multi.imr_ifindex == imr.imr_ifindex)
+			break;
+	}
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
+		goto done;
+	}
+	if (msf->imsf_numsrc) {
+		/* build the socket's new source list and install it first */
+		newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
+							   GFP_KERNEL);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+		memcpy(newpsl->sl_addr, msf->imsf_slist,
+			msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+		err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+			msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+		if (err) {
+			sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+			goto done;
+		}
+	} else {
+		/* empty list: only the mode reference is taken */
+		newpsl = NULL;
+		(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+				     msf->imsf_fmode, 0, NULL, 0);
+	}
+	/* now release whatever the previous filter held */
+	psl = rtnl_dereference(pmc->sflist);
+	if (psl) {
+		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+			psl->sl_count, psl->sl_addr, 0);
+		/* decrease mem now to avoid the memleak warning */
+		atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+		kfree_rcu(psl, rcu);
+	} else
+		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+			0, NULL, 0);
+	rcu_assign_pointer(pmc->sflist, newpsl);
+	pmc->sfmode = msf->imsf_fmode;
+	err = 0;
+done:
+	rtnl_unlock();
+	/* the leave is done after RTNL is released */
+	if (leavegroup)
+		err = ip_mc_leave_group(sk, &imr);
+	return err;
+}
+
+/*
+ * Copy a socket's source filter for a joined group out to user space.
+ *
+ * On entry msf->imsf_numsrc says how many sources the caller can take.
+ * On success msf holds the filter mode and the total number of
+ * sources, *optlen receives the size of what was copied, and up to the
+ * requested number of source addresses follow the header in @optval.
+ *
+ * Returns 0 on success, -EINVAL for a non-multicast group, -ENODEV
+ * when no interface can be resolved, -EADDRNOTAVAIL without a prior
+ * join, or -EFAULT when a copy to user space fails.  RTNL is taken for
+ * the lookup and released before the copies.
+ */
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+	struct ip_msfilter __user *optval, int __user *optlen)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	__be32 group = msf->imsf_multiaddr;
+	struct ip_mc_socklist *entry;
+	struct ip_sf_socklist *filter;
+	struct in_device *idev;
+	struct ip_mreqn imr;
+	int count, copycount, len;
+
+	if (!ipv4_is_multicast(group))
+		return -EINVAL;
+
+	rtnl_lock();
+
+	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+	imr.imr_address.s_addr = msf->imsf_interface;
+	imr.imr_ifindex = 0;
+	idev = ip_mc_find_dev(net, &imr);
+	if (!idev) {
+		rtnl_unlock();
+		return -ENODEV;
+	}
+
+	for_each_pmc_rtnl(inet, entry) {
+		if (entry->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+		    entry->multi.imr_ifindex == imr.imr_ifindex)
+			break;
+	}
+	if (!entry) {		/* must have a prior join */
+		rtnl_unlock();
+		return -EADDRNOTAVAIL;
+	}
+
+	msf->imsf_fmode = entry->sfmode;
+	filter = rtnl_dereference(entry->sflist);
+	rtnl_unlock();
+
+	if (filter)
+		count = filter->sl_count;
+	else
+		count = 0;
+	copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
+	len = copycount * sizeof(filter->sl_addr[0]);
+	msf->imsf_numsrc = count;
+
+	if (put_user(IP_MSFILTER_SIZE(copycount), optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, msf, IP_MSFILTER_SIZE(0)))
+		return -EFAULT;
+	if (len && copy_to_user(&optval->imsf_slist[0], filter->sl_addr, len))
+		return -EFAULT;
+	return 0;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+	struct group_filter __user *optval, int __user *optlen)
+{
+	int err, i, count, copycount;
+	struct sockaddr_in *psin;
+	__be32 addr;
+	struct ip_mc_socklist *pmc;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+	/* Copy this socket's source filter for gsf->gf_group to userspace. */
+	psin = (struct sockaddr_in *)&gsf->gf_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	addr = psin->sin_addr.s_addr;
+	if (!ipv4_is_multicast(addr))
+		return -EINVAL;
+
+	rtnl_lock();
+
+	err = -EADDRNOTAVAIL;
+	/* look for this socket's membership of <addr, gf_interface> */
+	for_each_pmc_rtnl(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == addr &&
+		    pmc->multi.imr_ifindex == gsf->gf_interface)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	gsf->gf_fmode = pmc->sfmode;
+	psl = rtnl_dereference(pmc->sflist);
+	rtnl_unlock();
+	count = psl ? psl->sl_count : 0;	/* no list: zero sources */
+	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+	gsf->gf_numsrc = count;		/* report the total, even if fewer are copied */
+	if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+	    copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+		return -EFAULT;
+	}
+	for (i=0; i<copycount; i++) {
+		struct sockaddr_storage ss;
+
+		psin = (struct sockaddr_in *)&ss;
+		memset(&ss, 0, sizeof(ss));	/* zero-fill so no stack bytes reach userspace */
+		psin->sin_family = AF_INET;
+		psin->sin_addr.s_addr = psl->sl_addr[i];
+		if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+			return -EFAULT;
+	}
+	return 0;
+done:
+	rtnl_unlock();
+	return err;
+}
+
+/*
+ * Return nonzero if sk's source filter allows delivery from rmt_addr to group loc_addr on interface dif.
+ */
+int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *pmc;
+	struct ip_sf_socklist *psl;
+	int i;
+	int ret;
+
+	ret = 1;	/* a non-multicast destination is never filtered here */
+	if (!ipv4_is_multicast(loc_addr))
+		goto out;
+
+	rcu_read_lock();
+	for_each_pmc_rcu(inet, pmc) {
+		if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+		    pmc->multi.imr_ifindex == dif)
+			break;
+	}
+	ret = inet->mc_all;	/* no membership for <group, dif>: the socket's mc_all flag decides */
+	if (!pmc)
+		goto unlock;
+	psl = rcu_dereference(pmc->sflist);
+	ret = (pmc->sfmode == MCAST_EXCLUDE);	/* no source list: allow only in EXCLUDE mode */
+	if (!psl)
+		goto unlock;
+	/* i < sl_count after this loop means rmt_addr is on the list */
+	for (i=0; i<psl->sl_count; i++) {
+		if (psl->sl_addr[i] == rmt_addr)
+			break;
+	}
+	ret = 0;
+	if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)	/* INCLUDE, source not listed: deny */
+		goto unlock;
+	if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)	/* EXCLUDE, source listed: deny */
+		goto unlock;
+	ret = 1;
+unlock:
+	rcu_read_unlock();
+out:
+	return ret;
+}
+
+/*
+ *	A socket is closing: drop every multicast membership it still holds.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml;
+	struct net *net = sock_net(sk);
+
+	if (inet->mc_list == NULL)	/* nothing joined: skip taking RTNL */
+		return;
+
+	rtnl_lock();
+	while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
+		struct in_device *in_dev;
+
+		inet->mc_list = iml->next_rcu;	/* unlink the head entry */
+		in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
+		(void) ip_mc_leave_src(sk, iml, in_dev);
+		if (in_dev != NULL)	/* lookup by ifindex may have found no device */
+			ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+		/* decrease mem now to avoid the memleak warning */
+		atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+		kfree_rcu(iml, rcu);
+	}
+	rtnl_unlock();
+}
+
+/* Called with rcu_read_lock(); returns nonzero if <mc_addr, src_addr> may be delivered on in_dev. */
+int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+{
+	struct ip_mc_list *im;
+	struct ip_sf_list *psf;
+	int rv = 0;
+
+	for_each_pmc_rcu(in_dev, im) {
+		if (im->multiaddr == mc_addr)
+			break;
+	}
+	if (im && proto == IPPROTO_IGMP) {
+		rv = 1;	/* IGMP itself passes for any group on the device list */
+	} else if (im) {
+		if (src_addr) {
+			spin_lock_bh(&im->lock);	/* im->sources and its counts are only stable under im->lock */
+			for (psf = im->sources; psf; psf = psf->sf_next)
+				if (psf->sf_inaddr == src_addr)
+					break;
+			if (psf)
+				rv = psf->sf_count[MCAST_INCLUDE] ||
+				     psf->sf_count[MCAST_EXCLUDE] != im->sfcount[MCAST_EXCLUDE];
+			else
+				rv = im->sfcount[MCAST_EXCLUDE] != 0;
+			spin_unlock_bh(&im->lock);
+		} else
+			rv = 1; /* unspecified source; tentatively allow */
+	}
+	return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {
+	struct seq_net_private p;
+	struct net_device *dev;
+	struct in_device *in_dev;
+};
+
+#define	igmp_mc_seq_private(seq)	((struct igmp_mc_iter_state *)(seq)->private)
+
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct ip_mc_list *im = NULL;
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	state->in_dev = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct in_device *in_dev;
+
+		in_dev = __in_dev_get_rcu(state->dev);
+		if (!in_dev)
+			continue;
+		im = rcu_dereference(in_dev->mc_list);
+		if (im) {
+			state->in_dev = in_dev;
+			break;
+		}
+	}
+	return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	im = rcu_dereference(im->next_rcu);
+	while (!im) {
+		state->dev = next_net_device_rcu(state->dev);
+		if (!state->dev) {
+			state->in_dev = NULL;
+			break;
+		}
+		state->in_dev = __in_dev_get_rcu(state->dev);
+		if (!state->in_dev)
+			continue;
+		im = rcu_dereference(state->in_dev->mc_list);
+	}
+	return im;
+}
+
+/* Return the entry 'pos' steps past the first one, or NULL if the walk ends early. */
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip_mc_list *cur;
+	for (cur = igmp_mc_get_first(seq); cur && pos; --pos)
+		cur = igmp_mc_get_next(seq, cur);
+	return pos ? NULL : cur;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+	return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_mc_list *im;
+	if (v == SEQ_START_TOKEN)
+		im = igmp_mc_get_first(seq);
+	else
+		im = igmp_mc_get_next(seq, v);
+	++*pos;
+	return im;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	state->in_dev = NULL;
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+			 "Idx\tDevice    : Count Querier\tGroup    Users Timer\tReporter\n");
+	else {
+		struct ip_mc_list *im = (struct ip_mc_list *)v;
+		struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+		char   *querier;
+#ifdef CONFIG_IP_MULTICAST
+		querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+			  IGMP_V2_SEEN(state->in_dev) ? "V2" :
+			  "V3";
+#else
+		querier = "NONE";
+#endif
+
+		if (rcu_dereference(state->in_dev->mc_list) == im) {
+			seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+				   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
+		}
+
+		seq_printf(seq,
+			   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
+			   im->multiaddr, im->users,
+			   im->tm_running, im->tm_running ?
+			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+			   im->reporter);
+	}
+	return 0;
+}
+
+static const struct seq_operations igmp_mc_seq_ops = {
+	.start	=	igmp_mc_seq_start,
+	.next	=	igmp_mc_seq_next,
+	.stop	=	igmp_mc_seq_stop,
+	.show	=	igmp_mc_seq_show,
+};
+
+static int igmp_mc_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &igmp_mc_seq_ops,
+			sizeof(struct igmp_mc_iter_state));
+}
+
+static const struct file_operations igmp_mc_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp_mc_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+struct igmp_mcf_iter_state {
+	struct seq_net_private p;
+	struct net_device *dev;
+	struct in_device *idev;
+	struct ip_mc_list *im;
+};
+
+#define igmp_mcf_seq_private(seq)	((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct ip_sf_list *psf = NULL;
+	struct ip_mc_list *im = NULL;
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+	/* Find the first source entry; on success state->im is set and its lock is left held. */
+	state->idev = NULL;
+	state->im = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct in_device *idev;
+		idev = __in_dev_get_rcu(state->dev);
+		if (unlikely(idev == NULL))
+			continue;
+		im = rcu_dereference(idev->mc_list);	/* NOTE(review): only the head group of each device is examined here */
+		if (likely(im != NULL)) {
+			spin_lock_bh(&im->lock);
+			psf = im->sources;
+			if (likely(psf != NULL)) {
+				state->im = im;
+				state->idev = idev;
+				break;	/* leave with im->lock held */
+			}
+			spin_unlock_bh(&im->lock);
+		}
+	}
+	return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+	/* Step to the next source; entered with state->im->lock held. */
+	psf = psf->sf_next;
+	while (!psf) {
+		spin_unlock_bh(&state->im->lock);	/* done with this group */
+		state->im = state->im->next;
+		while (!state->im) {
+			state->dev = next_net_device_rcu(state->dev);
+			if (!state->dev) {
+				state->idev = NULL;
+				goto out;	/* no devices left: NULL returned, no lock held */
+			}
+			state->idev = __in_dev_get_rcu(state->dev);
+			if (!state->idev)
+				continue;
+			state->im = rcu_dereference(state->idev->mc_list);
+		}
+		if (!state->im)	/* appears unreachable: the loop above only falls through with state->im set */
+			break;
+		spin_lock_bh(&state->im->lock);
+		psf = state->im->sources;
+	}
+out:
+	return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip_sf_list *psf = igmp_mcf_get_first(seq);
+	if (psf)
+		while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
+			--pos;
+	return pos ? NULL : psf;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+	return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_sf_list *psf;
+	if (v == SEQ_START_TOKEN)
+		psf = igmp_mcf_get_first(seq);
+	else
+		psf = igmp_mcf_get_next(seq, v);
+	++*pos;
+	return psf;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+	if (likely(state->im != NULL)) {	/* iteration paused inside a group: its lock is still held */
+		spin_unlock_bh(&state->im->lock);
+		state->im = NULL;
+	}
+	state->idev = NULL;
+	state->dev = NULL;
+	rcu_read_unlock();	/* pairs with rcu_read_lock() in igmp_mcf_seq_start() */
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{
+	struct ip_sf_list *psf = (struct ip_sf_list *)v;
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			   "%3s %6s "
+			   "%10s %10s %6s %6s\n", "Idx",
+			   "Device", "MCA",
+			   "SRC", "INC", "EXC");
+	} else {
+		seq_printf(seq,
+			   "%3d %6.6s 0x%08x "
+			   "0x%08x %6lu %6lu\n",
+			   state->dev->ifindex, state->dev->name,
+			   ntohl(state->im->multiaddr),
+			   ntohl(psf->sf_inaddr),
+			   psf->sf_count[MCAST_INCLUDE],
+			   psf->sf_count[MCAST_EXCLUDE]);
+	}
+	return 0;
+}
+
+static const struct seq_operations igmp_mcf_seq_ops = {
+	.start	=	igmp_mcf_seq_start,
+	.next	=	igmp_mcf_seq_next,
+	.stop	=	igmp_mcf_seq_stop,
+	.show	=	igmp_mcf_seq_show,
+};
+
+static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &igmp_mcf_seq_ops,
+			sizeof(struct igmp_mcf_iter_state));
+}
+
+static const struct file_operations igmp_mcf_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp_mcf_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+/*
+ * Per-namespace setup: create the "igmp" and "mcfilter" proc entries.
+ * Returns 0, or -ENOMEM if either entry cannot be created.
+ */
+static int __net_init igmp_net_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops))
+		return -ENOMEM;
+
+	if (!proc_net_fops_create(net, "mcfilter", S_IRUGO,
+				  &igmp_mcf_seq_fops)) {
+		/* Roll back the first entry so nothing is left behind. */
+		proc_net_remove(net, "igmp");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void __net_exit igmp_net_exit(struct net *net)
+{
+	proc_net_remove(net, "mcfilter");
+	proc_net_remove(net, "igmp");
+}
+
+static struct pernet_operations igmp_net_ops = {
+	.init = igmp_net_init,
+	.exit = igmp_net_exit,
+};
+
+int __init igmp_mc_proc_init(void)
+{
+	return register_pernet_subsys(&igmp_net_ops);
+}
+#endif
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 00000000..c14d88ad
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,775 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Support for INET connection oriented protocols.
+ *
+ * Authors:	See the TCP sources
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+/*
+ * This struct holds the first and last local port number.
+ */
+struct local_ports sysctl_local_ports __read_mostly = {
+	.lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
+	.range = { 32768, 61000 },
+};
+
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
+void inet_get_local_port_range(int *low, int *high)
+{
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);
+		/* snapshot both bounds of sysctl_local_ports.range */
+		*low = sysctl_local_ports.range[0];
+		*high = sysctl_local_ports.range[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));	/* retry if a writer raced with the read */
+}
+EXPORT_SYMBOL(inet_get_local_port_range);
+
+int inet_csk_bind_conflict(const struct sock *sk,
+			   const struct inet_bind_bucket *tb)
+{
+	struct sock *sk2;
+	struct hlist_node *node;
+	int reuse = sk->sk_reuse;
+
+	/*
+	 * Return nonzero if an owner of bucket tb conflicts with binding sk.
+	 * Unlike other sk lookup places we do not check for sk_net here,
+	 * since _all_ the socks listed in tb->owners belong to the same
+	 * net - the one this bucket belongs to.
+	 */
+
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk != sk2 &&
+		    !inet_v6_ipv6only(sk2) &&
+		    (!sk->sk_bound_dev_if ||
+		     !sk2->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {	/* device bindings overlap */
+			if (!reuse || !sk2->sk_reuse ||
+			    sk2->sk_state == TCP_LISTEN) {	/* reuse does not excuse this pair */
+				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
+				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
+				    sk2_rcv_saddr == sk_rcv_saddr(sk))	/* either address is 0, or both are equal */
+					break;
+			}
+		}
+	}
+	return node != NULL;	/* presumably non-NULL only when the walk stopped at a conflicting owner */
+}
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	struct inet_bind_hashbucket *head;
+	struct hlist_node *node;
+	struct inet_bind_bucket *tb;
+	int ret, attempts = 5;
+	struct net *net = sock_net(sk);
+	int smallest_size = -1, smallest_rover;
+
+	local_bh_disable();
+	if (!snum) {
+		int remaining, rover, low, high;
+
+again:
+		inet_get_local_port_range(&low, &high);
+		remaining = (high - low) + 1;
+		smallest_rover = rover = net_random() % remaining + low;
+
+		smallest_size = -1;
+		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
+			head = &hashinfo->bhash[inet_bhashfn(net, rover,
+					hashinfo->bhash_size)];
+			spin_lock(&head->lock);
+			inet_bind_bucket_for_each(tb, node, &head->chain)
+				if (net_eq(ib_net(tb), net) && tb->port == rover) {
+					if (tb->fastreuse > 0 &&
+					    sk->sk_reuse &&
+					    sk->sk_state != TCP_LISTEN &&
+					    (tb->num_owners < smallest_size || smallest_size == -1)) {
+						smallest_size = tb->num_owners;
+						smallest_rover = rover;
+						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
+							spin_unlock(&head->lock);
+							snum = smallest_rover;
+							goto have_snum;
+						}
+					}
+					goto next;
+				}
+			break;
+		next:
+			spin_unlock(&head->lock);
+		next_nolock:
+			if (++rover > high)
+				rover = low;
+		} while (--remaining > 0);
+
+		/* Exhausted local port range during search?  It is not
+		 * possible for us to be holding one of the bind hash
+		 * locks if this test triggers, because if 'remaining'
+		 * drops to zero, we broke out of the do/while loop at
+		 * the top level, not from the 'break;' statement.
+		 */
+		ret = 1;
+		if (remaining <= 0) {
+			if (smallest_size != -1) {
+				snum = smallest_rover;
+				goto have_snum;
+			}
+			goto fail;
+		}
+		/* OK, here is the one we will use.  HEAD is
+		 * non-NULL and we hold its lock.
+		 */
+		snum = rover;
+	} else {
+have_snum:
+		head = &hashinfo->bhash[inet_bhashfn(net, snum,
+				hashinfo->bhash_size)];
+		spin_lock(&head->lock);
+		inet_bind_bucket_for_each(tb, node, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == snum)
+				goto tb_found;
+	}
+	tb = NULL;
+	goto tb_not_found;
+tb_found:
+	if (!hlist_empty(&tb->owners)) {
+		if (tb->fastreuse > 0 &&
+		    sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+		    smallest_size == -1) {
+			goto success;
+		} else {
+			ret = 1;
+			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+				    smallest_size != -1 && --attempts >= 0) {
+					spin_unlock(&head->lock);
+					goto again;
+				}
+				goto fail_unlock;
+			}
+		}
+	}
+tb_not_found:
+	ret = 1;
+	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
+					net, head, snum)) == NULL)
+		goto fail_unlock;
+	if (hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+			tb->fastreuse = 1;
+		else
+			tb->fastreuse = 0;
+	} else if (tb->fastreuse &&
+		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+		tb->fastreuse = 0;
+success:
+	if (!inet_csk(sk)->icsk_bind_hash)
+		inet_bind_hash(sk, tb, snum);
+	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+	ret = 0;
+
+fail_unlock:
+	spin_unlock(&head->lock);
+fail:
+	local_bh_enable();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	/*
+	 * True wake-one mechanism for incoming connections: only
+	 * one process gets woken up, not the 'whole herd'.
+	 * Since we do not 'race & poll' for established sockets
+	 * anymore, the common case will execute the loop only once.
+	 *
+	 * Subtle issue: "add_wait_queue_exclusive()" will be added
+	 * after any current non-exclusive waiters, and we know that
+	 * it will always _stay_ after any new non-exclusive waiters
+	 * because all non-exclusive waiters are added at the
+	 * beginning of the wait-queue. As such, it's ok to "drop"
+	 * our exclusiveness temporarily when we get woken up without
+	 * having to remove and re-insert us on the wait queue.
+	 */
+	for (;;) {
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+					  TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+			timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		err = 0;
+		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+			break;
+		err = -EINVAL;
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return err;
+}
+
+/*
+ * Accept the next queued connection on listener sk; returns NULL with *err set on failure.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sock *newsk;
+	int error;
+
+	lock_sock(sk);
+
+	/* We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
+	error = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out_err;
+
+	/* Find already established connection */
+	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* If this is a non blocking socket don't sleep */
+		error = -EAGAIN;
+		if (!timeo)
+			goto out_err;
+		/* Sleep until a child is queued, the timeout expires or a signal arrives. */
+		error = inet_csk_wait_for_connect(sk, timeo);
+		if (error)
+			goto out_err;
+	}
+
+	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+out:
+	release_sock(sk);
+	return newsk;
+out_err:
+	newsk = NULL;
+	*err = error;	/* *err is written only on failure */
+	goto out;
+}
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes.
+ * We may wish to use just one timer maintaining a list of expiry jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+			       void (*retransmit_handler)(unsigned long),
+			       void (*delack_handler)(unsigned long),
+			       void (*keepalive_handler)(unsigned long))
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
+			(unsigned long)sk);
+	setup_timer(&icsk->icsk_delack_timer, delack_handler,
+			(unsigned long)sk);
+	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
+	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+
+	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry *inet_csk_route_req(struct sock *sk,
+				     struct flowi4 *fl4,
+				     const struct request_sock *req)
+{
+	struct rtable *rt;
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct ip_options_rcu *opt = inet_rsk(req)->opt;
+	struct net *net = sock_net(sk);
+
+	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+			   sk->sk_protocol, inet_sk_flowi_flags(sk),
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
+			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
+	security_req_classify_flow(req, flowi4_to_flowi(fl4));
+	rt = ip_route_output_flow(net, fl4, sk);
+	if (IS_ERR(rt))
+		goto no_route;
+	if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+		goto route_err;
+	return &rt->dst;
+
+route_err:
+	ip_rt_put(rt);
+no_route:
+	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
+					    struct sock *newsk,
+					    const struct request_sock *req)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct inet_sock *newinet = inet_sk(newsk);
+	struct ip_options_rcu *opt = ireq->opt;
+	struct net *net = sock_net(sk);
+	struct flowi4 *fl4;
+	struct rtable *rt;
+
+	fl4 = &newinet->cork.fl.u.ip4;
+	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+			   sk->sk_protocol, inet_sk_flowi_flags(sk),
+			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
+			   ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
+	security_req_classify_flow(req, flowi4_to_flowi(fl4));
+	rt = ip_route_output_flow(net, fl4, sk);
+	if (IS_ERR(rt))
+		goto no_route;
+	if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+		goto route_err;
+	return &rt->dst;
+
+route_err:
+	ip_rt_put(rt);
+no_route:
+	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
+
+static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
+				 const u32 rnd, const u32 synq_hsize)
+{
+	return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) 1
+#endif
+
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+					 struct request_sock ***prevp,
+					 const __be16 rport, const __be32 raddr,
+					 const __be32 laddr)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	struct request_sock *req, **prev;
+
+	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
+						    lopt->nr_table_entries)];
+	     (req = *prev) != NULL;
+	     prev = &req->dl_next) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
+
+		if (ireq->rmt_port == rport &&
+		    ireq->rmt_addr == raddr &&
+		    ireq->loc_addr == laddr &&
+		    AF_INET_FAMILY(req->rsk_ops->family)) {
+			WARN_ON(req->sk);
+			*prevp = prev;
+			break;
+		}
+	}
+
+	return req;
+}
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+				   unsigned long timeout)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+				     lopt->hash_rnd, lopt->nr_table_entries);
+
+	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+	inet_csk_reqsk_queue_added(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+
+/* Decide whether a pending request should expire (*expire) and whether to resend its SYN-ACK (*resend) */
+static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
+				  const int max_retries,
+				  const u8 rskq_defer_accept,
+				  int *expire, int *resend)
+{
+	if (!rskq_defer_accept) {	/* no deferred accept: expire at thresh, always resend */
+		*expire = req->retrans >= thresh;
+		*resend = 1;
+		return;
+	}
+	*expire = req->retrans >= thresh &&
+		  (!inet_rsk(req)->acked || req->retrans >= max_retries);	/* an ACKed request is kept until max_retries */
+	/*
+	 * Do not resend while waiting for data after the ACK;
+	 * start resending at the end of the deferral period to give a
+	 * last chance for data or an ACK to create an established socket.
+	 */
+	*resend = !inet_rsk(req)->acked ||
+		  req->retrans >= rskq_defer_accept - 1;
+}
+
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+				const unsigned long interval,
+				const unsigned long timeout,
+				const unsigned long max_rto)
+{
+	struct inet_connection_sock *icsk = inet_csk(parent);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+	struct listen_sock *lopt = queue->listen_opt;
+	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	int thresh = max_retries;
+	unsigned long now = jiffies;
+	struct request_sock **reqp, *req;
+	int i, budget;
+
+	if (lopt == NULL || lopt->qlen == 0)
+		return;
+
+	/* Normally all the openreqs are young and become mature
+	 * (i.e. converted to established socket) for first timeout.
+	 * If synack was not acknowledged for 3 seconds, it means
+	 * one of the following things: synack was lost, ack was lost,
+	 * rtt is high or nobody planned to ack (i.e. synflood).
+	 * When server is a bit loaded, queue is populated with old
+	 * open requests, reducing effective size of queue.
+	 * When server is well loaded, queue size reduces to zero
+	 * after several minutes of work. It is not synflood,
+	 * it is normal operation. The solution is pruning
+	 * too old entries overriding normal timeout, when
+	 * situation becomes dangerous.
+	 *
+	 * Essentially, we reserve half of the room for young
+	 * embryos, and abort old ones without pity if old
+	 * ones are about to clog our table.
+	 */
+	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+		int young = (lopt->qlen_young<<1);
+
+		while (thresh > 2) {
+			if (lopt->qlen < young)
+				break;
+			thresh--;
+			young <<= 1;
+		}
+	}
+
+	if (queue->rskq_defer_accept)
+		max_retries = queue->rskq_defer_accept;
+
+	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+	i = lopt->clock_hand;
+
+	do {
+		reqp=&lopt->syn_table[i];
+		while ((req = *reqp) != NULL) {
+			if (time_after_eq(now, req->expires)) {
+				int expire = 0, resend = 0;
+
+				syn_ack_recalc(req, thresh, max_retries,
+					       queue->rskq_defer_accept,
+					       &expire, &resend);
+				if (req->rsk_ops->syn_ack_timeout)
+					req->rsk_ops->syn_ack_timeout(parent, req);
+				if (!expire &&
+				    (!resend ||
+				     !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
+				     inet_rsk(req)->acked)) {
+					unsigned long timeo;
+
+					if (req->retrans++ == 0)
+						lopt->qlen_young--;
+					timeo = min((timeout << req->retrans), max_rto);
+					req->expires = now + timeo;
+					reqp = &req->dl_next;
+					continue;
+				}
+
+				/* Drop this request */
+				inet_csk_reqsk_queue_unlink(parent, req, reqp);
+				reqsk_queue_removed(queue, req);
+				reqsk_free(req);
+				continue;
+			}
+			reqp = &req->dl_next;
+		}
+
+		i = (i + 1) & (lopt->nr_table_entries - 1);
+
+	} while (--budget > 0);
+
+	lopt->clock_hand = i;
+
+	if (lopt->qlen)
+		inet_csk_reset_keepalive_timer(parent, interval);
+}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+			    const gfp_t priority)
+{
+	struct sock *newsk = sk_clone(sk, priority);
+
+	if (newsk != NULL) {
+		struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+		newsk->sk_state = TCP_SYN_RECV;
+		newicsk->icsk_bind_hash = NULL;
+
+		inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
+		inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
+		inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
+		newsk->sk_write_space = sk_stream_write_space;
+
+		newicsk->icsk_retransmits = 0;
+		newicsk->icsk_backoff	  = 0;
+		newicsk->icsk_probes_out  = 0;
+
+		/* Deinitialize accept_queue to trap illegal accesses. */
+		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+
+		security_inet_csk_clone(newsk, req);
+	}
+	return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all.  Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+	WARN_ON(sk->sk_state != TCP_CLOSE);
+	WARN_ON(!sock_flag(sk, SOCK_DEAD));
+
+	/* It cannot be in hash table! */
+	WARN_ON(!sk_unhashed(sk));
+
+	/* If inet_sk(sk)->inet_num is nonzero, the socket must be bound */
+	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
+
+	sk->sk_prot->destroy(sk);
+
+	sk_stream_kill_queues(sk);
+
+	xfrm_sk_free_policy(sk);
+
+	sk_refcnt_debug_release(sk);
+
+	percpu_counter_dec(sk->sk_prot->orphan_count);
+	sock_put(sk);
+}
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+
+	if (rc != 0)
+		return rc;
+
+	sk->sk_max_ack_backlog = 0;
+	sk->sk_ack_backlog = 0;
+	inet_csk_delack_init(sk);
+
+	/* There is a race window here: we announce ourselves listening,
+	 * but this transition is still not validated by get_port().
+	 * This is OK, because this socket enters the hash table only
+	 * after validation is complete.
+	 */
+	sk->sk_state = TCP_LISTEN;
+	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+		inet->inet_sport = htons(inet->inet_num);
+
+		sk_dst_reset(sk);
+		sk->sk_prot->hash(sk);
+
+		return 0;
+	}
+
+	sk->sk_state = TCP_CLOSE;
+	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	return -EADDRINUSE;
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ *	This routine closes sockets which have been at least partially
+ *	opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *acc_req;
+	struct request_sock *req;
+
+	inet_csk_delete_keepalive_timer(sk);
+
+	/* make all the listen_opt local to us */
+	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+
+	/* Following specs, it would be better either to send FIN
+	 * (and enter FIN-WAIT-1, it is normal close)
+	 * or to send active reset (abort).
+	 * Certainly, it is pretty dangerous while synflood, but it is
+	 * bad justification for our negligence 8)
+	 * To be honest, we are not able to make either
+	 * of the variants now.			--ANK
+	 */
+	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+
+	while ((req = acc_req) != NULL) {
+		struct sock *child = req->sk;
+
+		acc_req = req->dl_next;
+
+		local_bh_disable();
+		bh_lock_sock(child);
+		WARN_ON(sock_owned_by_user(child));
+		sock_hold(child);
+
+		sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+		sock_orphan(child);
+
+		percpu_counter_inc(sk->sk_prot->orphan_count);
+
+		inet_csk_destroy_sock(child);
+
+		bh_unlock_sock(child);
+		local_bh_enable();
+		sock_put(child);
+
+		sk_acceptq_removed(sk);
+		__reqsk_free(req);
+	}
+	WARN_ON(sk->sk_ack_backlog);
+}
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	const struct inet_sock *inet = inet_sk(sk);
+
+	sin->sin_family		= AF_INET;
+	sin->sin_addr.s_addr	= inet->inet_daddr;
+	sin->sin_port		= inet->inet_dport;
+}
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
+
+#ifdef CONFIG_COMPAT
+int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
+			       char __user *optval, int __user *optlen)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_af_ops->compat_getsockopt != NULL)
+		return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
+							    optval, optlen);
+	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+					     optval, optlen);
+}
+EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
+
+int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
+			       char __user *optval, unsigned int optlen)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_af_ops->compat_setsockopt != NULL)
+		return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
+							    optval, optlen);
+	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+					     optval, optlen);
+}
+EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
+#endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 00000000..3267d389
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,951 @@
+/*
+ * inet_diag.c	Module for monitoring INET transport protocols sockets.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/netlink.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+
+/*
+ * Registered diag handlers, indexed by netlink message type.  The table is
+ * allocated in inet_diag_init() with INET_DIAG_GETSOCK_MAX slots.
+ */
+static const struct inet_diag_handler **inet_diag_table;
+
+/* Per-socket values that the filter bytecode in inet_diag_bc_run() tests. */
+struct inet_diag_entry {
+	__be32 *saddr;		/* local address words */
+	__be32 *daddr;		/* remote address words */
+	u16 sport;		/* local port (inet_num / tw_num) */
+	u16 dport;		/* remote port, converted with ntohs() */
+	u16 family;		/* address family of the socket */
+	u16 userlocks;		/* sk_userlocks, tested by INET_DIAG_BC_AUTO */
+};
+
+/* Kernel-side NETLINK_INET_DIAG socket, created in inet_diag_init(). */
+static struct sock *idiagnl;
+
+/*
+ * Add an attribute with @attrlen payload bytes to @skb and yield a pointer
+ * to that payload.  NOTE(review): __RTA_PUT presumably jumps to a local
+ * "rtattr_failure" label when the skb is full -- confirm in rtnetlink.h.
+ */
+#define INET_DIAG_PUT(skb, attrtype, attrlen) \
+	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+
+/* Taken around every inet_diag_table update and while a handler is in use. */
+static DEFINE_MUTEX(inet_diag_table_mutex);
+
+/*
+ * Look up the handler registered for message @type, asking for the module
+ * to be loaded first when the slot is empty.
+ *
+ * inet_diag_table_mutex is held on return in BOTH outcomes (handler or
+ * ERR_PTR(-ENOENT)); the caller must always call inet_diag_unlock_handler().
+ */
+static const struct inet_diag_handler *inet_diag_lock_handler(int type)
+{
+	const struct inet_diag_handler *found;
+
+	if (inet_diag_table[type] == NULL)
+		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+			       NETLINK_INET_DIAG, type);
+
+	mutex_lock(&inet_diag_table_mutex);
+	found = inet_diag_table[type];
+	if (found == NULL)
+		return ERR_PTR(-ENOENT);
+
+	return found;
+}
+
+/*
+ * Counterpart of inet_diag_lock_handler(): drop inet_diag_table_mutex.
+ * @handler is not examined, so an ERR_PTR() value is fine here.
+ */
+static inline void
+inet_diag_unlock_handler(const struct inet_diag_handler *handler)
+{
+	mutex_unlock(&inet_diag_table_mutex);
+}
+
+/*
+ * Append one inet_diag_msg (plus the attributes selected by @ext) that
+ * describes the connection socket @sk to @skb.
+ *
+ * @ext:         bitmask of requested extensions, bit (INET_DIAG_x - 1)
+ * @pid, @seq:   values for the netlink header of the reply
+ * @nlmsg_flags: flags for the netlink header of the reply
+ * @unlh:        request header; its nlmsg_type selects the handler and is
+ *               echoed in the reply
+ *
+ * Must not be called for TIME_WAIT sockets (see inet_twsk_diag_fill()).
+ * Returns skb->len on success.  When the message does not fit, @skb is
+ * trimmed back to its previous tail and -EMSGSIZE is returned.
+ */
+static int inet_csk_diag_fill(struct sock *sk,
+			      struct sk_buff *skb,
+			      int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+			      const struct nlmsghdr *unlh)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	void *info = NULL;
+	struct inet_diag_meminfo  *minfo = NULL;
+	unsigned char	 *b = skb_tail_pointer(skb);
+	const struct inet_diag_handler *handler;
+
+	handler = inet_diag_table[unlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+
+	/* NLMSG_PUT and INET_DIAG_PUT bail out to the labels at the end. */
+	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh->nlmsg_flags = nlmsg_flags;
+
+	r = NLMSG_DATA(nlh);
+	BUG_ON(sk->sk_state == TCP_TIME_WAIT);
+
+	if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
+		minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
+
+	if (ext & (1 << (INET_DIAG_INFO - 1)))
+		info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
+				     handler->idiag_info_size);
+
+	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+		const size_t len = strlen(icsk->icsk_ca_ops->name);
+
+		strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+		       icsk->icsk_ca_ops->name);
+	}
+
+	r->idiag_family = sk->sk_family;
+	r->idiag_state = sk->sk_state;
+	r->idiag_timer = 0;
+	r->idiag_retrans = 0;
+
+	r->id.idiag_if = sk->sk_bound_dev_if;
+	/* The cookie is the socket address split into two 32-bit halves. */
+	r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+	r->id.idiag_sport = inet->inet_sport;
+	r->id.idiag_dport = inet->inet_dport;
+
+	/*
+	 * Only word 0 of each address is written for IPv4 sockets.  Clear
+	 * the whole arrays first so that the remaining words do not carry
+	 * uninitialised skb memory to user space.
+	 */
+	memset(r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+	memset(r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+	r->id.idiag_src[0] = inet->inet_rcv_saddr;
+	r->id.idiag_dst[0] = inet->inet_daddr;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (r->idiag_family == AF_INET6) {
+		const struct ipv6_pinfo *np = inet6_sk(sk);
+
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+			       &np->rcv_saddr);
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+			       &np->daddr);
+	}
+#endif
+
+#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
+
+	/* idiag_timer: 1 retransmit, 4 zero-window probe, 2 sk_timer, 0 none */
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		r->idiag_timer = 4;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (timer_pending(&sk->sk_timer)) {
+		r->idiag_timer = 2;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+	} else {
+		r->idiag_timer = 0;
+		r->idiag_expires = 0;
+	}
+#undef EXPIRES_IN_MS
+
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = sock_i_ino(sk);
+
+	if (minfo) {
+		minfo->idiag_rmem = sk_rmem_alloc_get(sk);
+		minfo->idiag_wmem = sk->sk_wmem_queued;
+		minfo->idiag_fmem = sk->sk_forward_alloc;
+		minfo->idiag_tmem = sk_wmem_alloc_get(sk);
+	}
+
+	/* Protocol specific part (also receives @info, which may be NULL). */
+	handler->idiag_get_info(sk, r, info);
+
+	if (sk->sk_state < TCP_TIME_WAIT &&
+	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
+		icsk->icsk_ca_ops->get_info(sk, ext, skb);
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+rtattr_failure:
+nlmsg_failure:
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+/*
+ * Append one inet_diag_msg that describes the TIME_WAIT socket @tw to @skb.
+ *
+ * @ext is accepted for symmetry with inet_csk_diag_fill() but no extension
+ * attributes are emitted here.  @pid, @seq, @nlmsg_flags and the type taken
+ * from @unlh fill the netlink header of the reply.
+ *
+ * Returns skb->len on success.  When the message does not fit, @skb is
+ * trimmed back to its previous tail and -EMSGSIZE is returned.
+ */
+static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
+			       struct sk_buff *skb, int ext, u32 pid,
+			       u32 seq, u16 nlmsg_flags,
+			       const struct nlmsghdr *unlh)
+{
+	long tmo;
+	struct inet_diag_msg *r;
+	const unsigned char *previous_tail = skb_tail_pointer(skb);
+	/* NLMSG_PUT jumps to nlmsg_failure when @skb has no room left. */
+	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
+					 unlh->nlmsg_type, sizeof(*r));
+
+	r = NLMSG_DATA(nlh);
+	BUG_ON(tw->tw_state != TCP_TIME_WAIT);
+
+	nlh->nlmsg_flags = nlmsg_flags;
+
+	/* Remaining lifetime in jiffies, never negative. */
+	tmo = tw->tw_ttd - jiffies;
+	if (tmo < 0)
+		tmo = 0;
+
+	r->idiag_family	      = tw->tw_family;
+	r->idiag_retrans      = 0;
+	r->id.idiag_if	      = tw->tw_bound_dev_if;
+	r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
+	r->id.idiag_sport     = tw->tw_sport;
+	r->id.idiag_dport     = tw->tw_dport;
+
+	/*
+	 * Only word 0 of each address is written for IPv4 sockets.  Clear
+	 * the whole arrays first so that the remaining words do not carry
+	 * uninitialised skb memory to user space.
+	 */
+	memset(r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+	memset(r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+	r->id.idiag_src[0]    = tw->tw_rcv_saddr;
+	r->id.idiag_dst[0]    = tw->tw_daddr;
+	r->idiag_state	      = tw->tw_substate;
+	r->idiag_timer	      = 3;
+	r->idiag_expires      = DIV_ROUND_UP(tmo * 1000, HZ);
+	r->idiag_rqueue	      = 0;
+	r->idiag_wqueue	      = 0;
+	r->idiag_uid	      = 0;
+	r->idiag_inode	      = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (tw->tw_family == AF_INET6) {
+		const struct inet6_timewait_sock *tw6 =
+						inet6_twsk((struct sock *)tw);
+
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+			       &tw6->tw_v6_rcv_saddr);
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+			       &tw6->tw_v6_daddr);
+	}
+#endif
+	nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
+	return skb->len;
+nlmsg_failure:
+	nlmsg_trim(skb, previous_tail);
+	return -EMSGSIZE;
+}
+
+/*
+ * Fill a diag message for @sk, picking the TIME_WAIT variant when
+ * sk->sk_state is TCP_TIME_WAIT and the connection-socket variant
+ * otherwise.  Arguments and return value are those of the fill helpers.
+ */
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+			int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+			const struct nlmsghdr *unlh)
+{
+	if (sk->sk_state != TCP_TIME_WAIT)
+		return inet_csk_diag_fill(sk, skb, ext, pid, seq,
+					  nlmsg_flags, unlh);
+
+	return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, skb, ext,
+				   pid, seq, nlmsg_flags, unlh);
+}
+
+/*
+ * Answer a non-dump request: look up the single socket identified by the
+ * inet_diag_req in @nlh and unicast its description back to the sender of
+ * @in_skb.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENOENT  no handler for the message type, or no matching socket
+ *   -EINVAL  idiag_family is neither AF_INET nor (if built) AF_INET6
+ *   -ESTALE  a cookie was supplied and does not match the socket found
+ *   -ENOMEM  the reply skb could not be allocated
+ * or whatever sk_diag_fill()/netlink_unicast() report.
+ */
+static int inet_diag_get_exact(struct sk_buff *in_skb,
+			       const struct nlmsghdr *nlh)
+{
+	int err;
+	struct sock *sk;
+	struct inet_diag_req *req = NLMSG_DATA(nlh);
+	struct sk_buff *rep;
+	struct inet_hashinfo *hashinfo;
+	const struct inet_diag_handler *handler;
+
+	/* The table mutex is held from here on, even when this fails. */
+	handler = inet_diag_lock_handler(nlh->nlmsg_type);
+	if (IS_ERR(handler)) {
+		err = PTR_ERR(handler);
+		goto unlock;
+	}
+
+	hashinfo = handler->idiag_hashinfo;
+	err = -EINVAL;
+
+	if (req->idiag_family == AF_INET) {
+		sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
+				 req->id.idiag_dport, req->id.idiag_src[0],
+				 req->id.idiag_sport, req->id.idiag_if);
+	}
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	else if (req->idiag_family == AF_INET6) {
+		sk = inet6_lookup(&init_net, hashinfo,
+				  (struct in6_addr *)req->id.idiag_dst,
+				  req->id.idiag_dport,
+				  (struct in6_addr *)req->id.idiag_src,
+				  req->id.idiag_sport,
+				  req->id.idiag_if);
+	}
+#endif
+	else {
+		goto unlock;
+	}
+
+	err = -ENOENT;
+	if (sk == NULL)
+		goto unlock;
+
+	/*
+	 * A cookie other than INET_DIAG_NOCOOKIE must equal the socket
+	 * address (split in two 32-bit halves) that the fill helpers report.
+	 */
+	err = -ESTALE;
+	if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
+	     req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
+	    ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
+	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+		goto out;
+
+	/* Room for the message, meminfo, handler info and 64 spare bytes. */
+	err = -ENOMEM;
+	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
+				     sizeof(struct inet_diag_meminfo) +
+				     handler->idiag_info_size + 64)),
+			GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	err = sk_diag_fill(sk, rep, req->idiag_ext,
+			   NETLINK_CB(in_skb).pid,
+			   nlh->nlmsg_seq, 0, nlh);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(rep);
+		goto out;
+	}
+	err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+
+out:
+	/* Drop the reference on the socket found by the lookup above. */
+	if (sk) {
+		if (sk->sk_state == TCP_TIME_WAIT)
+			inet_twsk_put((struct inet_timewait_sock *)sk);
+		else
+			sock_put(sk);
+	}
+unlock:
+	inet_diag_unlock_handler(handler);
+	return err;
+}
+
+/*
+ * Compare the leading @bits bits of the big-endian word arrays @a1 and @a2.
+ * Returns 1 when they are equal, 0 otherwise.
+ */
+static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
+{
+	const int whole = bits >> 5;	/* complete 32-bit words */
+	const int rest = bits & 0x1f;	/* bits left in the following word */
+
+	if (whole && memcmp(a1, a2, whole << 2))
+		return 0;
+
+	if (rest) {
+		const __be32 mask = htonl((0xffffffff) << (32 - rest));
+
+		if ((a1[whole] ^ a2[whole]) & mask)
+			return 0;
+	}
+
+	return 1;
+}
+
+
+/*
+ * Run the filter program @bc (@len bytes) against @entry.
+ *
+ * Every instruction is a struct inet_diag_bc_op.  When its test succeeds,
+ * execution advances by op->yes bytes, otherwise by op->no bytes.  Port
+ * comparisons take the port from the "no" field of the following op; host
+ * conditions are followed by a struct inet_diag_hostcond.
+ *
+ * Returns 1 when execution ends exactly at the end of the program (entry
+ * accepted) and 0 otherwise.
+ *
+ * Address prefixes are only compared when the condition's family and
+ * prefix length fit the address that @entry really provides; otherwise the
+ * condition simply does not match.  This keeps bitstring_match() from
+ * reading beyond a 4-byte IPv4 address for e.g. a /128 condition.
+ */
+static int inet_diag_bc_run(const void *bc, int len,
+			    const struct inet_diag_entry *entry)
+{
+	/* Number of address bits available behind entry->saddr/daddr. */
+	const int entry_bits = entry->family == AF_INET6 ? 128 : 32;
+
+	while (len > 0) {
+		int yes = 1;
+		const struct inet_diag_bc_op *op = bc;
+
+		switch (op->code) {
+		case INET_DIAG_BC_NOP:
+			break;
+		case INET_DIAG_BC_JMP:
+			yes = 0;
+			break;
+		case INET_DIAG_BC_S_GE:
+			yes = entry->sport >= op[1].no;
+			break;
+		case INET_DIAG_BC_S_LE:
+			yes = entry->sport <= op[1].no;
+			break;
+		case INET_DIAG_BC_D_GE:
+			yes = entry->dport >= op[1].no;
+			break;
+		case INET_DIAG_BC_D_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case INET_DIAG_BC_AUTO:
+			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+			break;
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND: {
+			const struct inet_diag_hostcond *cond;
+			const __be32 *addr;
+
+			cond = (const struct inet_diag_hostcond *)(op + 1);
+			/* A port of -1 means "any port". */
+			if (cond->port != -1 &&
+			    cond->port != (op->code == INET_DIAG_BC_S_COND ?
+					     entry->sport : entry->dport)) {
+				yes = 0;
+				break;
+			}
+
+			/* Empty prefix: the port test above was all there is. */
+			if (cond->prefix_len == 0)
+				break;
+
+			if (op->code == INET_DIAG_BC_S_COND)
+				addr = entry->saddr;
+			else
+				addr = entry->daddr;
+
+			if (cond->family == AF_UNSPEC ||
+			    cond->family == entry->family) {
+				if (cond->prefix_len <= entry_bits &&
+				    bitstring_match(addr, cond->addr,
+						    cond->prefix_len))
+					break;
+			} else if (entry->family == AF_INET6 &&
+				   cond->family == AF_INET) {
+				/* IPv4 condition against ::ffff:a.b.c.d */
+				if (cond->prefix_len <= 32 &&
+				    addr[0] == 0 && addr[1] == 0 &&
+				    addr[2] == htonl(0xffff) &&
+				    bitstring_match(addr + 3, cond->addr,
+						    cond->prefix_len))
+					break;
+			}
+			yes = 0;
+			break;
+		}
+		}
+
+		if (yes) {
+			len -= op->yes;
+			bc += op->yes;
+		} else {
+			len -= op->no;
+			bc += op->no;
+		}
+	}
+	return len == 0;
+}
+
+/*
+ * Follow the "yes" chain of the program @bc (@len bytes) and report
+ * whether some instruction starts with exactly @cc bytes remaining.
+ * Returns 1 if so, 0 if the chain steps past that point or contains a
+ * "yes" offset that is smaller than 4 or not a multiple of 4.
+ */
+static int valid_cc(const void *bc, int len, int cc)
+{
+	const struct inet_diag_bc_op *op;
+
+	for (; len >= 0; len -= op->yes, bc += op->yes) {
+		op = bc;
+
+		if (cc > len)
+			break;
+		if (cc == len)
+			return 1;
+		if (op->yes < 4 || (op->yes & 3))
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Check that the host condition following @op fits into the @len bytes
+ * that remain: the struct inet_diag_hostcond itself plus an address of the
+ * size implied by its family.  The prefix length may not exceed that
+ * address.  On success *@min_len has grown by the operand size.
+ * Returns 1 when valid, 0 otherwise.
+ */
+static int valid_hostcond(const struct inet_diag_bc_op *op, int len,
+			  int *min_len)
+{
+	const struct inet_diag_hostcond *cond;
+	int addr_len;
+
+	*min_len += sizeof(struct inet_diag_hostcond);
+	if (len < *min_len)
+		return 0;
+	cond = (const struct inet_diag_hostcond *)(op + 1);
+
+	switch (cond->family) {
+	case AF_UNSPEC:
+		addr_len = 0;
+		break;
+	case AF_INET:
+		addr_len = sizeof(struct in_addr);
+		break;
+	case AF_INET6:
+		addr_len = sizeof(struct in6_addr);
+		break;
+	default:
+		return 0;
+	}
+	*min_len += addr_len;
+	if (len < *min_len)
+		return 0;
+
+	/* prefix_len counts bits, addr_len counts bytes. */
+	if (cond->prefix_len > 8 * addr_len)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Port comparisons keep the port in a follow-on inet_diag_bc_op; check
+ * that it fits into the @len bytes that remain and account for it in
+ * *@min_len.  Returns 1 when valid, 0 otherwise.
+ */
+static int valid_port_comparison(const struct inet_diag_bc_op *op,
+				 int len, int *min_len)
+{
+	*min_len += sizeof(struct inet_diag_bc_op);
+	return len >= *min_len;
+}
+
+/*
+ * Validate a user supplied filter program before inet_diag_bc_run() may
+ * interpret it.
+ *
+ * For every instruction on the "yes" chain this checks that
+ *  - the opcode is known,
+ *  - the instruction and its operand lie inside the program,
+ *  - "yes" and "no" are multiples of 4, skip at least the instruction with
+ *    its operand and do not run more than 4 bytes past the end,
+ *  - a "no" jump that stays inside the program lands on an instruction
+ *    boundary of the "yes" chain (see valid_cc()).
+ *
+ * Returns 0 when the program is acceptable, -EINVAL otherwise.
+ */
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+	const void *bc = bytecode;
+	int  len = bytecode_len;
+
+	while (len > 0) {
+		const struct inet_diag_bc_op *op = bc;
+		int min_len = sizeof(struct inet_diag_bc_op);
+
+		/* Do not even look at a truncated instruction. */
+		if (len < min_len)
+			return -EINVAL;
+
+		switch (op->code) {
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND:
+			if (!valid_hostcond(op, len, &min_len))
+				return -EINVAL;
+			break;
+		case INET_DIAG_BC_S_GE:
+		case INET_DIAG_BC_S_LE:
+		case INET_DIAG_BC_D_GE:
+		case INET_DIAG_BC_D_LE:
+			if (!valid_port_comparison(op, len, &min_len))
+				return -EINVAL;
+			break;
+		case INET_DIAG_BC_AUTO:
+		case INET_DIAG_BC_JMP:
+		case INET_DIAG_BC_NOP:
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		/* NOP never takes its "no" branch, so that field is free. */
+		if (op->code != INET_DIAG_BC_NOP) {
+			if (op->no < min_len || op->no > len + 4 || op->no & 3)
+				return -EINVAL;
+			if (op->no < len &&
+			    !valid_cc(bytecode, bytecode_len, len - op->no))
+				return -EINVAL;
+		}
+
+		if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
+			return -EINVAL;
+		bc  += op->yes;
+		len -= op->yes;
+	}
+	return len == 0 ? 0 : -EINVAL;
+}
+
+/*
+ * Dump helper for one connection socket: when the request carries
+ * attributes, run the INET_DIAG_REQ_BYTECODE filter against @sk and return
+ * 0 if it rejects the socket; otherwise append the socket to @skb through
+ * inet_csk_diag_fill() and return that function's result.
+ */
+static int inet_csk_diag_dump(struct sock *sk,
+			      struct sk_buff *skb,
+			      struct netlink_callback *cb)
+{
+	struct inet_diag_req *req = NLMSG_DATA(cb->nlh);
+
+	if (nlmsg_attrlen(cb->nlh, sizeof(*req))) {
+		const struct nlattr *filter = nlmsg_find_attr(cb->nlh,
+						sizeof(*req),
+						INET_DIAG_REQ_BYTECODE);
+		struct inet_sock *inet = inet_sk(sk);
+		struct inet_diag_entry entry;
+
+		entry.family = sk->sk_family;
+		entry.sport = inet->inet_num;
+		entry.dport = ntohs(inet->inet_dport);
+		entry.userlocks = sk->sk_userlocks;
+
+		/* IPv4 addresses by default, replaced below for IPv6. */
+		entry.saddr = &inet->inet_rcv_saddr;
+		entry.daddr = &inet->inet_daddr;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		if (entry.family == AF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			entry.saddr = np->rcv_saddr.s6_addr32;
+			entry.daddr = np->daddr.s6_addr32;
+		}
+#endif
+
+		if (!inet_diag_bc_run(nla_data(filter), nla_len(filter),
+				      &entry))
+			return 0;
+	}
+
+	return inet_csk_diag_fill(sk, skb, req->idiag_ext,
+				  NETLINK_CB(cb->skb).pid,
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+/*
+ * Dump helper for one TIME_WAIT socket: when the request carries
+ * attributes, run the INET_DIAG_REQ_BYTECODE filter against @tw and return
+ * 0 if it rejects the socket; otherwise append the socket to @skb through
+ * inet_twsk_diag_fill() and return that function's result.
+ */
+static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
+			       struct sk_buff *skb,
+			       struct netlink_callback *cb)
+{
+	struct inet_diag_req *req = NLMSG_DATA(cb->nlh);
+
+	if (nlmsg_attrlen(cb->nlh, sizeof(*req))) {
+		const struct nlattr *filter = nlmsg_find_attr(cb->nlh,
+						sizeof(*req),
+						INET_DIAG_REQ_BYTECODE);
+		struct inet_diag_entry entry;
+
+		entry.family = tw->tw_family;
+		entry.sport = tw->tw_num;
+		entry.dport = ntohs(tw->tw_dport);
+		entry.userlocks = 0;
+
+		/* IPv4 addresses by default, replaced below for IPv6. */
+		entry.saddr = &tw->tw_rcv_saddr;
+		entry.daddr = &tw->tw_daddr;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		if (tw->tw_family == AF_INET6) {
+			struct inet6_timewait_sock *tw6 =
+						inet6_twsk((struct sock *)tw);
+
+			entry.saddr = tw6->tw_v6_rcv_saddr.s6_addr32;
+			entry.daddr = tw6->tw_v6_daddr.s6_addr32;
+		}
+#endif
+
+		if (!inet_diag_bc_run(nla_data(filter), nla_len(filter),
+				      &entry))
+			return 0;
+	}
+
+	return inet_twsk_diag_fill(tw, skb, req->idiag_ext,
+				   NETLINK_CB(cb->skb).pid,
+				   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+/*
+ * Append one inet_diag_msg that describes the pending connection request
+ * @req of listener @sk to @skb.  The entry is reported in state
+ * TCP_SYN_RECV with timer type 1, and @req's address serves as cookie.
+ *
+ * Returns skb->len on success.  When the message does not fit, @skb is
+ * trimmed back to its previous tail and -EMSGSIZE is returned (callers
+ * only test for a negative value).
+ */
+static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
+			      struct request_sock *req, u32 pid, u32 seq,
+			      const struct nlmsghdr *unlh)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct inet_sock *inet = inet_sk(sk);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct inet_diag_msg *r;
+	struct nlmsghdr *nlh;
+	long tmo;
+
+	/* NLMSG_PUT jumps to nlmsg_failure when @skb has no room left. */
+	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh->nlmsg_flags = NLM_F_MULTI;
+	r = NLMSG_DATA(nlh);
+
+	r->idiag_family = sk->sk_family;
+	r->idiag_state = TCP_SYN_RECV;
+	r->idiag_timer = 1;
+	r->idiag_retrans = req->retrans;
+
+	r->id.idiag_if = sk->sk_bound_dev_if;
+	r->id.idiag_cookie[0] = (u32)(unsigned long)req;
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+	/* Remaining lifetime in jiffies, never negative. */
+	tmo = req->expires - jiffies;
+	if (tmo < 0)
+		tmo = 0;
+
+	r->id.idiag_sport = inet->inet_sport;
+	r->id.idiag_dport = ireq->rmt_port;
+
+	/*
+	 * Only word 0 of each address is written for IPv4.  Clear the whole
+	 * arrays first so that the remaining words do not carry
+	 * uninitialised skb memory to user space.
+	 */
+	memset(r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+	memset(r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+	r->id.idiag_src[0] = ireq->loc_addr;
+	r->id.idiag_dst[0] = ireq->rmt_addr;
+	r->idiag_expires = jiffies_to_msecs(tmo);
+	r->idiag_rqueue = 0;
+	r->idiag_wqueue = 0;
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (r->idiag_family == AF_INET6) {
+		if (req->rsk_ops->family == AF_INET6) {
+			ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+				       &inet6_rsk(req)->loc_addr);
+			ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+				       &inet6_rsk(req)->rmt_addr);
+		} else {
+			/*
+			 * IPv4 request queued on an AF_INET6 listener: it
+			 * has no IPv6 part, so inet6_rsk() must not be used
+			 * on it.  Report IPv4-mapped addresses instead.
+			 */
+			ipv6_addr_set((struct in6_addr *)r->id.idiag_src,
+				      0, 0, htonl(0x0000FFFF),
+				      ireq->loc_addr);
+			ipv6_addr_set((struct in6_addr *)r->id.idiag_dst,
+				      0, 0, htonl(0x0000FFFF),
+				      ireq->rmt_addr);
+		}
+	}
+#endif
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+
+	return skb->len;
+
+nlmsg_failure:
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump the pending connection requests (SYN_RECV entries) of listener @sk
+ * into @skb.
+ *
+ * The walk is resumable: cb->args[3] holds the SYN table slot to continue
+ * with (stored as slot + 1) and cb->args[4] the number of requests in that
+ * slot that were already dumped.  Both are updated when @skb fills up.
+ *
+ * Requests are skipped when the requested destination port or the filter
+ * bytecode does not match.  Returns a negative value when @skb is full,
+ * otherwise the result of the last fill (0 if nothing was dumped).
+ */
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+			       struct netlink_callback *cb)
+{
+	struct inet_diag_entry entry;
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt;
+	const struct nlattr *bc = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	/* Scratch space for the IPv4-mapped form of IPv4 request addresses. */
+	struct in6_addr mapped_src, mapped_dst;
+#endif
+	int j, s_j;
+	int reqnum, s_reqnum;
+	int err = 0;
+
+	s_j = cb->args[3];
+	s_reqnum = cb->args[4];
+
+	if (s_j > 0)
+		s_j--;
+
+	entry.family = sk->sk_family;
+
+	read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+	lopt = icsk->icsk_accept_queue.listen_opt;
+	if (!lopt || !lopt->qlen)
+		goto out;
+
+	if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
+		bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
+				     INET_DIAG_REQ_BYTECODE);
+		entry.sport = inet->inet_num;
+		entry.userlocks = sk->sk_userlocks;
+	}
+
+	for (j = s_j; j < lopt->nr_table_entries; j++) {
+		struct request_sock *req, *head = lopt->syn_table[j];
+
+		reqnum = 0;
+		for (req = head; req; reqnum++, req = req->dl_next) {
+			struct inet_request_sock *ireq = inet_rsk(req);
+
+			if (reqnum < s_reqnum)
+				continue;
+			if (r->id.idiag_dport != ireq->rmt_port &&
+			    r->id.idiag_dport)
+				continue;
+
+			if (bc) {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+				if (entry.family == AF_INET6 &&
+				    req->rsk_ops->family == AF_INET6) {
+					entry.saddr =
+					    inet6_rsk(req)->loc_addr.s6_addr32;
+					entry.daddr =
+					    inet6_rsk(req)->rmt_addr.s6_addr32;
+				} else if (entry.family == AF_INET6) {
+					/*
+					 * IPv4 request on an AF_INET6
+					 * listener: it has no IPv6 part, so
+					 * inet6_rsk() must not be used.
+					 * Filter on IPv4-mapped addresses.
+					 */
+					ipv6_addr_set(&mapped_src, 0, 0,
+						      htonl(0x0000FFFF),
+						      ireq->loc_addr);
+					ipv6_addr_set(&mapped_dst, 0, 0,
+						      htonl(0x0000FFFF),
+						      ireq->rmt_addr);
+					entry.saddr = mapped_src.s6_addr32;
+					entry.daddr = mapped_dst.s6_addr32;
+				} else
+#endif
+				{
+					entry.saddr = &ireq->loc_addr;
+					entry.daddr = &ireq->rmt_addr;
+				}
+				entry.dport = ntohs(ireq->rmt_port);
+
+				if (!inet_diag_bc_run(nla_data(bc),
+						      nla_len(bc), &entry))
+					continue;
+			}
+
+			err = inet_diag_fill_req(skb, sk, req,
+					       NETLINK_CB(cb->skb).pid,
+					       cb->nlh->nlmsg_seq, cb->nlh);
+			if (err < 0) {
+				/* Remember where to resume on the next call. */
+				cb->args[3] = j + 1;
+				cb->args[4] = reqnum;
+				goto out;
+			}
+		}
+
+		s_reqnum = 0;
+	}
+
+out:
+	read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+	return err;
+}
+
+/*
+ * Netlink dump callback: append as many matching sockets to @skb as fit
+ * and record in @cb where to continue on the next invocation.
+ *
+ * Resume state kept in cb->args[]:
+ *   [0]  0 while walking the listening hash, 1 for the established hash
+ *   [1]  bucket index to continue with
+ *   [2]  number of sockets of that bucket already handled
+ *   [3], [4]  position inside a listener's request queue, see
+ *             inet_diag_dump_reqs()
+ *
+ * Always returns skb->len.
+ */
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int i, num;
+	int s_i, s_num;
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	const struct inet_diag_handler *handler;
+	struct inet_hashinfo *hashinfo;
+
+	/* The table mutex is held from here on, even when this fails. */
+	handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
+	if (IS_ERR(handler))
+		goto unlock;
+
+	hashinfo = handler->idiag_hashinfo;
+
+	s_i = cb->args[1];
+	s_num = num = cb->args[2];
+
+	/* Phase 0: listening sockets and their pending requests. */
+	if (cb->args[0] == 0) {
+		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+			goto skip_listen_ht;
+
+		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+			struct sock *sk;
+			struct hlist_nulls_node *node;
+			struct inet_listen_hashbucket *ilb;
+
+			num = 0;
+			ilb = &hashinfo->listening_hash[i];
+			spin_lock_bh(&ilb->lock);
+			sk_nulls_for_each(sk, node, &ilb->head) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				/* Skip what earlier invocations handled. */
+				if (num < s_num) {
+					num++;
+					continue;
+				}
+
+				if (r->id.idiag_sport != inet->inet_sport &&
+				    r->id.idiag_sport)
+					goto next_listen;
+
+				/*
+				 * cb->args[3] > 0 means the listener itself
+				 * was dumped before and only its request
+				 * queue is left to finish.
+				 */
+				if (!(r->idiag_states & TCPF_LISTEN) ||
+				    r->id.idiag_dport ||
+				    cb->args[3] > 0)
+					goto syn_recv;
+
+				if (inet_csk_diag_dump(sk, skb, cb) < 0) {
+					spin_unlock_bh(&ilb->lock);
+					goto done;
+				}
+
+syn_recv:
+				if (!(r->idiag_states & TCPF_SYN_RECV))
+					goto next_listen;
+
+				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+					spin_unlock_bh(&ilb->lock);
+					goto done;
+				}
+
+next_listen:
+				cb->args[3] = 0;
+				cb->args[4] = 0;
+				++num;
+			}
+			spin_unlock_bh(&ilb->lock);
+
+			s_num = 0;
+			cb->args[3] = 0;
+			cb->args[4] = 0;
+		}
+skip_listen_ht:
+		cb->args[0] = 1;
+		s_i = num = s_num = 0;
+	}
+
+	/* Nothing but LISTEN/SYN_RECV requested: the second phase is moot. */
+	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+		goto unlock;
+
+	/* Phase 1: established hash, normal chain first, then TIME_WAIT. */
+	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+		struct sock *sk;
+		struct hlist_nulls_node *node;
+
+		num = 0;
+
+		if (hlist_nulls_empty(&head->chain) &&
+			hlist_nulls_empty(&head->twchain))
+			continue;
+
+		/* The saved position only applies to the bucket resumed in. */
+		if (i > s_i)
+			s_num = 0;
+
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &head->chain) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (num < s_num)
+				goto next_normal;
+			if (!(r->idiag_states & (1 << sk->sk_state)))
+				goto next_normal;
+			if (r->id.idiag_sport != inet->inet_sport &&
+			    r->id.idiag_sport)
+				goto next_normal;
+			if (r->id.idiag_dport != inet->inet_dport &&
+			    r->id.idiag_dport)
+				goto next_normal;
+			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
+				spin_unlock_bh(lock);
+				goto done;
+			}
+next_normal:
+			++num;
+		}
+
+		if (r->idiag_states & TCPF_TIME_WAIT) {
+			struct inet_timewait_sock *tw;
+
+			inet_twsk_for_each(tw, node,
+				    &head->twchain) {
+
+				if (num < s_num)
+					goto next_dying;
+				if (r->id.idiag_sport != tw->tw_sport &&
+				    r->id.idiag_sport)
+					goto next_dying;
+				if (r->id.idiag_dport != tw->tw_dport &&
+				    r->id.idiag_dport)
+					goto next_dying;
+				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
+					spin_unlock_bh(lock);
+					goto done;
+				}
+next_dying:
+				++num;
+			}
+		}
+		spin_unlock_bh(lock);
+	}
+
+done:
+	/* Save the position for the next invocation. */
+	cb->args[1] = i;
+	cb->args[2] = num;
+unlock:
+	inet_diag_unlock_handler(handler);
+	return skb->len;
+}
+
+/*
+ * Handle one request message.  The message type must be below
+ * INET_DIAG_GETSOCK_MAX and the payload at least a struct inet_diag_req.
+ * Dump requests have their filter bytecode (if any attribute is present)
+ * audited before the dump is started; everything else is treated as an
+ * exact lookup.  Returns -EINVAL for malformed requests, otherwise the
+ * result of the dump start or of inet_diag_get_exact().
+ */
+static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	const int hdrlen = sizeof(struct inet_diag_req);
+	struct nlattr *attr;
+
+	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
+		return -EINVAL;
+	if (nlmsg_len(nlh) < hdrlen)
+		return -EINVAL;
+
+	if (!(nlh->nlmsg_flags & NLM_F_DUMP))
+		return inet_diag_get_exact(skb, nlh);
+
+	if (nlmsg_attrlen(nlh, hdrlen)) {
+		attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+		if (attr == NULL)
+			return -EINVAL;
+		if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
+			return -EINVAL;
+		if (inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
+			return -EINVAL;
+	}
+
+	return netlink_dump_start(idiagnl, skb, nlh, inet_diag_dump, NULL);
+}
+
+/* Held while the messages of one incoming skb are processed. */
+static DEFINE_MUTEX(inet_diag_mutex);
+
+/*
+ * Input callback of the diag netlink socket: hand every message in @skb to
+ * inet_diag_rcv_msg() with inet_diag_mutex held.
+ */
+static void inet_diag_rcv(struct sk_buff *skb)
+{
+	mutex_lock(&inet_diag_mutex);
+	netlink_rcv_skb(skb, inet_diag_rcv_msg);
+	mutex_unlock(&inet_diag_mutex);
+}
+
+/*
+ * Install @h as the handler for message type h->idiag_type.
+ *
+ * Returns 0 on success, -EINVAL when the type is out of range and -EEXIST
+ * when a handler is already registered for that type.
+ */
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+	const __u16 type = h->idiag_type;
+	int err;
+
+	if (type >= INET_DIAG_GETSOCK_MAX)
+		return -EINVAL;
+
+	mutex_lock(&inet_diag_table_mutex);
+	if (inet_diag_table[type] != NULL) {
+		err = -EEXIST;
+	} else {
+		inet_diag_table[type] = h;
+		err = 0;
+	}
+	mutex_unlock(&inet_diag_table_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+/*
+ * Clear the handler slot for message type h->idiag_type.  Out-of-range
+ * types are ignored.  The slot is cleared without checking that it
+ * actually holds @h.
+ */
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+	const __u16 type = h->idiag_type;
+
+	if (type < INET_DIAG_GETSOCK_MAX) {
+		mutex_lock(&inet_diag_table_mutex);
+		inet_diag_table[type] = NULL;
+		mutex_unlock(&inet_diag_table_mutex);
+	}
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+/*
+ * Module init: allocate the zeroed handler table and create the
+ * NETLINK_INET_DIAG kernel socket.  Returns 0 on success; -ENOMEM when
+ * either step fails (the table is freed again if the socket cannot be
+ * created).
+ */
+static int __init inet_diag_init(void)
+{
+	const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+					  sizeof(struct inet_diag_handler *));
+
+	inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);
+	if (inet_diag_table == NULL)
+		return -ENOMEM;
+
+	idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0,
+					inet_diag_rcv, NULL, THIS_MODULE);
+	if (idiagnl != NULL)
+		return 0;
+
+	kfree(inet_diag_table);
+	return -ENOMEM;
+}
+
+/*
+ * Module exit: release the diag netlink socket first, then free the
+ * handler table allocated in inet_diag_init().
+ */
+static void __exit inet_diag_exit(void)
+{
+	netlink_kernel_release(idiagnl);
+	kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
+/* Module alias built from the netlink family and the diag protocol. */
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
new file mode 100644
index 00000000..5ff2a51b
--- /dev/null
+++ b/net/ipv4/inet_fragment.c
@@ -0,0 +1,286 @@
+/*
+ * inet fragments management
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * 		Authors:	Pavel Emelyanov <xemul@openvz.org>
+ *				Started as consolidation of ipv4/ip_fragment.c,
+ *				ipv6/reassembly. and ipv6 nf conntrack reassembly
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+
+#include <net/inet_frag.h>
+
+/*
+ * Timer callback.  @dummy is really the struct inet_frags pointer that
+ * inet_frags_init() passed to setup_timer().
+ *
+ * With f->lock held for writing, draw a new hash seed and move every queue
+ * whose bucket changed under the new seed.  Afterwards re-arm the timer
+ * f->secret_interval jiffies after the time sampled on entry.
+ */
+static void inet_frag_secret_rebuild(unsigned long dummy)
+{
+	struct inet_frags *f = (struct inet_frags *)dummy;
+	const unsigned long start = jiffies;
+	int bucket;
+
+	write_lock(&f->lock);
+	get_random_bytes(&f->rnd, sizeof(u32));
+
+	for (bucket = 0; bucket < INETFRAGS_HASHSZ; bucket++) {
+		struct inet_frag_queue *fq;
+		struct hlist_node *pos, *tmp;
+
+		hlist_for_each_entry_safe(fq, pos, tmp, &f->hash[bucket], list) {
+			const unsigned int target = f->hashfn(fq);
+
+			if (target == bucket)
+				continue;
+
+			/* Hash changed: relink into the new chain. */
+			hlist_del(&fq->list);
+			hlist_add_head(&fq->list, &f->hash[target]);
+		}
+	}
+	write_unlock(&f->lock);
+
+	mod_timer(&f->secret_timer, start + f->secret_interval);
+}
+
+/*
+ * Initialise the protocol-wide fragment state @f: empty hash chains, the
+ * rwlock, an initial hash seed mixed from num_physpages and jiffies, and
+ * the secret-rebuild timer, started f->secret_interval jiffies from now.
+ * f->secret_interval must already be set by the caller.
+ */
+void inet_frags_init(struct inet_frags *f)
+{
+	unsigned long pages_mix, time_mix;
+	int bucket;
+
+	for (bucket = 0; bucket < INETFRAGS_HASHSZ; bucket++)
+		INIT_HLIST_HEAD(&f->hash[bucket]);
+
+	rwlock_init(&f->lock);
+
+	pages_mix = num_physpages ^ (num_physpages>>7);
+	time_mix = jiffies ^ (jiffies >> 6);
+	f->rnd = (u32) (pages_mix ^ time_mix);
+
+	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
+			(unsigned long)f);
+	f->secret_timer.expires = jiffies + f->secret_interval;
+	add_timer(&f->secret_timer);
+}
+EXPORT_SYMBOL(inet_frags_init);
+
+/*
+ * Reset the per-namespace fragment accounting in @nf: empty LRU list, no
+ * queues and zero memory in use.
+ */
+void inet_frags_init_net(struct netns_frags *nf)
+{
+	INIT_LIST_HEAD(&nf->lru_list);
+	atomic_set(&nf->mem, 0);
+	nf->nqueues = 0;
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
+/*
+ * Stop the secret-rebuild timer of @f.
+ *
+ * NOTE(review): this uses del_timer(), while inet_frag_secret_rebuild()
+ * re-arms the timer from its own handler -- confirm that callers cannot
+ * race with a handler that is running on another CPU.
+ */
+void inet_frags_fini(struct inet_frags *f)
+{
+	del_timer(&f->secret_timer);
+}
+EXPORT_SYMBOL(inet_frags_fini);
+
+/*
+ * Tear down the fragment queues of namespace @nf: with the low threshold
+ * forced to zero, the evictor keeps going for as long as any memory is
+ * accounted in @nf and queues remain on its LRU list.
+ */
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+	nf->low_thresh = 0;
+
+	/* The evictor is run with bottom halves disabled. */
+	local_bh_disable();
+	inet_frag_evictor(nf, f);
+	local_bh_enable();
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
+
+/*
+ * With f->lock held for writing, take @fq off its hash chain and off the
+ * LRU list of its namespace, and decrement that namespace's queue count.
+ */
+static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+	write_lock(&f->lock);
+
+	fq->net->nqueues--;
+	list_del(&fq->lru_list);
+	hlist_del(&fq->list);
+
+	write_unlock(&f->lock);
+}
+
+/*
+ * Retire @fq: cancel its timer (dropping a reference if the timer was
+ * still pending) and, unless the queue is already marked
+ * INET_FRAG_COMPLETE, unlink it from the tables, drop one more reference
+ * and set that flag.
+ */
+void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+	if (del_timer(&fq->timer))
+		atomic_dec(&fq->refcnt);
+
+	if (fq->last_in & INET_FRAG_COMPLETE)
+		return;
+
+	fq_unlink(fq, f);
+	atomic_dec(&fq->refcnt);
+	fq->last_in |= INET_FRAG_COMPLETE;
+}
+EXPORT_SYMBOL(inet_frag_kill);
+
+/*
+ * Free fragment @skb: subtract its truesize from *@work (when @work is
+ * given) and from nf->mem, call the optional f->skb_free hook, then
+ * release the skb.
+ */
+static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
+		struct sk_buff *skb, int *work)
+{
+	if (work != NULL)
+		*work -= skb->truesize;
+	atomic_sub(skb->truesize, &nf->mem);
+
+	if (f->skb_free != NULL)
+		f->skb_free(skb);
+
+	kfree_skb(skb);
+}
+
+/*
+ * Free queue @q and every fragment still linked to it.
+ *
+ * The queue is expected to be marked INET_FRAG_COMPLETE with no timer
+ * pending; both are checked with WARN_ON().  The freed memory (fragments
+ * plus f->qsize) is subtracted from the namespace accounting and, when
+ * @work is given, from *@work.  The optional f->destructor runs before the
+ * queue itself is freed.
+ */
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
+					int *work)
+{
+	struct netns_frags *nf;
+	struct sk_buff *frag, *next;
+
+	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
+	WARN_ON(del_timer(&q->timer) != 0);
+
+	/* Release all fragment data. */
+	nf = q->net;
+	for (frag = q->fragments; frag != NULL; frag = next) {
+		next = frag->next;
+		frag_kfree_skb(nf, f, frag, work);
+	}
+
+	if (work != NULL)
+		*work -= f->qsize;
+	atomic_sub(f->qsize, &nf->mem);
+
+	if (f->destructor != NULL)
+		f->destructor(q);
+	kfree(q);
+}
+EXPORT_SYMBOL(inet_frag_destroy);
+
+/*
+ * Evict queues from the head of nf->lru_list until the memory accounted
+ * in @nf is no longer above nf->low_thresh, or the list is empty.
+ *
+ * Returns the number of queues processed.
+ */
+int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
+{
+	struct inet_frag_queue *q;
+	int work, evicted = 0;
+
+	/* Bytes to release; inet_frag_destroy() counts this down. */
+	work = atomic_read(&nf->mem) - nf->low_thresh;
+	while (work > 0) {
+		read_lock(&f->lock);
+		if (list_empty(&nf->lru_list)) {
+			read_unlock(&f->lock);
+			break;
+		}
+
+		q = list_first_entry(&nf->lru_list,
+				struct inet_frag_queue, lru_list);
+		/* Pin the queue before the table lock is dropped. */
+		atomic_inc(&q->refcnt);
+		read_unlock(&f->lock);
+
+		spin_lock(&q->lock);
+		if (!(q->last_in & INET_FRAG_COMPLETE))
+			inet_frag_kill(q, f);
+		spin_unlock(&q->lock);
+
+		/* Only the holder of the last reference frees the queue. */
+		if (atomic_dec_and_test(&q->refcnt))
+			inet_frag_destroy(q, f, &work);
+		evicted++;
+	}
+
+	return evicted;
+}
+EXPORT_SYMBOL(inet_frag_evictor);
+
+/*
+ * Insert the freshly allocated queue @qp_in into the hash table and the
+ * LRU list of @nf and start its expiry timer.
+ *
+ * On SMP the chain is searched again under the write lock.  If a matching
+ * queue is already there, that queue is returned with an extra reference
+ * and @qp_in is marked complete and released.
+ *
+ * @arg is the lookup key handed to f->match().
+ */
+static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
+		struct inet_frag_queue *qp_in, struct inet_frags *f,
+		void *arg)
+{
+	struct inet_frag_queue *qp;
+#ifdef CONFIG_SMP
+	struct hlist_node *n;
+#endif
+	unsigned int hash;
+
+	write_lock(&f->lock);
+	/*
+	 * While we were without the lock, another CPU could have updated
+	 * the rnd seed, so we need to re-calculate the hash
+	 * chain. Fortunately qp_in can be used to get one.
+	 */
+	hash = f->hashfn(qp_in);
+#ifdef CONFIG_SMP
+	/* With SMP race we have to recheck hash table, because
+	 * such entry could be created on other cpu, while we
+	 * promoted read lock to write lock.
+	 */
+	hlist_for_each_entry(qp, n, &f->hash[hash], list) {
+		if (qp->net == nf && f->match(qp, arg)) {
+			atomic_inc(&qp->refcnt);
+			write_unlock(&f->lock);
+			qp_in->last_in |= INET_FRAG_COMPLETE;
+			inet_frag_put(qp_in, f);
+			return qp;
+		}
+	}
+#endif
+	qp = qp_in;
+	/* One reference for the timer when it was not pending before ... */
+	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
+		atomic_inc(&qp->refcnt);
+
+	/* ... and one taken together with the table insertion below. */
+	atomic_inc(&qp->refcnt);
+	hlist_add_head(&qp->list, &f->hash[hash]);
+	list_add_tail(&qp->lru_list, &nf->lru_list);
+	nf->nqueues++;
+	write_unlock(&f->lock);
+	return qp;
+}
+
+/*
+ * Allocate a zeroed queue of f->qsize bytes (GFP_ATOMIC), run the protocol
+ * constructor on it with @arg, account its size in @nf and initialise its
+ * timer, lock, reference count (1) and namespace pointer.
+ * Returns NULL when the allocation fails.
+ */
+static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+		struct inet_frags *f, void *arg)
+{
+	struct inet_frag_queue *fq = kzalloc(f->qsize, GFP_ATOMIC);
+
+	if (!fq)
+		return NULL;
+
+	f->constructor(fq, arg);
+	atomic_add(f->qsize, &nf->mem);
+
+	fq->net = nf;
+	atomic_set(&fq->refcnt, 1);
+	spin_lock_init(&fq->lock);
+	setup_timer(&fq->timer, f->frag_expire, (unsigned long)fq);
+
+	return fq;
+}
+
+/*
+ * Allocate a new queue for @arg and insert it through inet_frag_intern().
+ * Returns the interned queue, or NULL when the allocation fails.
+ */
+static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
+		struct inet_frags *f, void *arg)
+{
+	struct inet_frag_queue *fresh = inet_frag_alloc(nf, f, arg);
+
+	if (fresh != NULL)
+		return inet_frag_intern(nf, fresh, f, arg);
+
+	return NULL;
+}
+
+/*
+ * Find the queue matching @key in bucket @hash of @f within namespace
+ * @nf, creating it when none exists.
+ *
+ * Every path through this function read-unlocks f->lock (see the
+ * __releases annotation), so the caller is expected to enter with it
+ * read-held.  A queue that is found is returned with an extra reference.
+ * Returns NULL when a new queue cannot be allocated.
+ */
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
+		struct inet_frags *f, void *key, unsigned int hash)
+	__releases(&f->lock)
+{
+	struct inet_frag_queue *q;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(q, n, &f->hash[hash], list) {
+		if (q->net == nf && f->match(q, key)) {
+			atomic_inc(&q->refcnt);
+			read_unlock(&f->lock);
+			return q;
+		}
+	}
+	read_unlock(&f->lock);
+
+	/* Not found: allocate and insert a queue (takes the write lock). */
+	return inet_frag_create(nf, f, key);
+}
+EXPORT_SYMBOL(inet_frag_find);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 00000000..984ec656
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,584 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic INET transport hashtables
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/secure_seq.h>
+#include <net/ip.h>
+
+/*
+ * Allocate a bind bucket for local port @snum in namespace @net and link
+ * it at the front of @head's chain.  The new bucket has no owners and
+ * fastreuse cleared.  Returns NULL if the allocation fails.
+ * The bindhash lock for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
+						 struct net *net,
+						 struct inet_bind_hashbucket *head,
+						 const unsigned short snum)
+{
+	struct inet_bind_bucket *bucket;
+
+	bucket = kmem_cache_alloc(cachep, GFP_ATOMIC);
+	if (bucket == NULL)
+		return NULL;
+
+	write_pnet(&bucket->ib_net, hold_net(net));
+	bucket->port       = snum;
+	bucket->fastreuse  = 0;
+	bucket->num_owners = 0;
+	INIT_HLIST_HEAD(&bucket->owners);
+	hlist_add_head(&bucket->node, &head->chain);
+
+	return bucket;
+}
+
+/*
+ * Free @tb once it has no owners left: unlink it from its chain, release
+ * its namespace reference and give it back to @cachep.  A bucket that
+ * still has owners is left untouched.
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+{
+	if (!hlist_empty(&tb->owners))
+		return;
+
+	__hlist_del(&tb->node);
+	release_net(ib_net(tb));
+	kmem_cache_free(cachep, tb);
+}
+
+/*
+ * Record that @sk is bound to local port @snum through bucket @tb:
+ * count it in hashinfo->bsockets, store the port in the socket, add the
+ * socket to the bucket's owner list and remember the bucket in the
+ * socket's icsk_bind_hash.
+ */
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+		    const unsigned short snum)
+{
+	atomic_inc(&sk->sk_prot->h.hashinfo->bsockets);
+
+	inet_sk(sk)->inet_num = snum;
+
+	sk_add_bind_node(sk, &tb->owners);
+	tb->num_owners += 1;
+
+	inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ *
+ * The bind hash slot is computed from the socket's current inet_num
+ * before that field is reset to 0 under the chain lock.  The bucket is
+ * offered to inet_bind_bucket_destroy() after the socket is unlinked.
+ */
+static void __inet_put_port(struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	const int slot = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
+				      hinfo->bhash_size);
+	struct inet_bind_hashbucket *bhead = &hinfo->bhash[slot];
+	struct inet_bind_bucket *bucket;
+
+	atomic_dec(&hinfo->bsockets);
+
+	spin_lock(&bhead->lock);
+	bucket = inet_csk(sk)->icsk_bind_hash;
+	__sk_del_bind_node(sk);
+	bucket->num_owners -= 1;
+	inet_csk(sk)->icsk_bind_hash = NULL;
+	inet_sk(sk)->inet_num = 0;
+	inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, bucket);
+	spin_unlock(&bhead->lock);
+}
+
+/*
+ * Release the local port held by @sk: runs __inet_put_port() with
+ * bottom halves disabled for the duration of the call.
+ */
+void inet_put_port(struct sock *sk)
+{
+	local_bh_disable();
+	__inet_put_port(sk);
+	local_bh_enable();
+}
+EXPORT_SYMBOL(inet_put_port);
+
+/*
+ * Bind @child to the local port stored in its inet_num, holding the lock
+ * of that port's bind hash chain.  The bind bucket of the listener @sk is
+ * reused when its port matches; otherwise a matching bucket is looked up
+ * in the chain, or created if none exists.
+ *
+ * Returns 0 on success, -ENOMEM if a new bucket could not be allocated.
+ */
+int __inet_inherit_port(struct sock *sk, struct sock *child)
+{
+	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+	unsigned short port = inet_sk(child)->inet_num;
+	const int bhash = inet_bhashfn(sock_net(sk), port,
+			table->bhash_size);
+	struct inet_bind_hashbucket *head = &table->bhash[bhash];
+	struct inet_bind_bucket *tb;
+
+	spin_lock(&head->lock);
+	tb = inet_csk(sk)->icsk_bind_hash;
+	if (tb->port != port) {
+		/* NOTE: using tproxy and redirecting skbs to a proxy
+		 * on a different listener port breaks the assumption
+		 * that the listener socket's icsk_bind_hash is the same
+		 * as that of the child socket. We have to look up or
+		 * create a new bind bucket for the child here. */
+		struct hlist_node *node;
+		inet_bind_bucket_for_each(tb, node, &head->chain) {
+			if (net_eq(ib_net(tb), sock_net(sk)) &&
+			    tb->port == port)
+				break;
+		}
+		/* the cursor is used as the "found" flag: a NULL node is
+		 * treated as "no bucket in the chain matched" */
+		if (!node) {
+			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+						     sock_net(sk), head, port);
+			if (!tb) {
+				spin_unlock(&head->lock);
+				return -ENOMEM;
+			}
+		}
+	}
+	inet_bind_hash(child, tb, port);
+	spin_unlock(&head->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__inet_inherit_port);
+
+/*
+ * Rate how well socket @sk matches a lookup for port @hnum and address
+ * @daddr on interface @dif in namespace @net.
+ *
+ * Returns -1 for "no match".  Otherwise the score is 1 for a PF_INET
+ * socket (0 for any other family), plus 2 if the socket has a receive
+ * address and it equals @daddr, plus 2 if the socket is bound to a
+ * device and it equals @dif.
+ */
+static inline int compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum, const __be32 daddr,
+				const int dif)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	__be32 rcv_saddr;
+	int score;
+
+	if (!net_eq(sock_net(sk), net) || inet->inet_num != hnum ||
+	    ipv6_only_sock(sk))
+		return -1;
+
+	rcv_saddr = inet->inet_rcv_saddr;
+	score = (sk->sk_family == PF_INET) ? 1 : 0;
+
+	if (rcv_saddr) {
+		if (rcv_saddr != daddr)
+			return -1;
+		score += 2;
+	}
+
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		score += 2;
+	}
+
+	return score;
+}
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ *
+ * Walks the listening hash chain for @hnum under rcu_read_lock() and
+ * picks the socket with the highest compute_score().  Returns that
+ * socket with a reference held, or NULL if nothing matched or the best
+ * candidate's refcount had already dropped to zero.
+ */
+
+
+struct sock *__inet_lookup_listener(struct net *net,
+				    struct inet_hashinfo *hashinfo,
+				    const __be32 daddr, const unsigned short hnum,
+				    const int dif)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned int hash = inet_lhashfn(net, hnum);
+	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+	int score, hiscore;
+
+	rcu_read_lock();
+begin:
+	/* every restart begins with a clean slate */
+	result = NULL;
+	hiscore = -1;
+	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+		score = compute_score(sk, net, hnum, daddr, dif);
+		/* strict '>' keeps the first socket seen among equal scores */
+		if (score > hiscore) {
+			result = sk;
+			hiscore = score;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+		goto begin;
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		/* re-score now that a reference is held; if the socket no
+		 * longer scores as well as when it was chosen, drop it and
+		 * search again */
+		else if (unlikely(compute_score(result, net, hnum, daddr,
+				  dif) < hiscore)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/*
+ * Look up a socket for the given addresses and ports in the established
+ * hash bucket: first on the bucket's main chain, then on its twchain.
+ * The walk runs under rcu_read_lock().
+ *
+ * Returns the matching socket with a reference held, or NULL if none is
+ * found (or if a matching twchain entry's refcount was already zero).
+ */
+struct sock * __inet_lookup_established(struct net *net,
+				  struct inet_hashinfo *hashinfo,
+				  const __be32 saddr, const __be16 sport,
+				  const __be32 daddr, const u16 hnum,
+				  const int dif)
+{
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+	unsigned int slot = hash & hashinfo->ehash_mask;
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+	rcu_read_lock();
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
+		if (INET_MATCH(sk, net, hash, acookie,
+					saddr, daddr, ports, dif)) {
+			/* refcount already zero: give up on this chain
+			 * and continue with the twchain */
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+				goto begintw;
+			/* match again now that a reference is held */
+			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+				saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begin;
+			}
+			goto out;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+
+begintw:
+	/* Must check for a TIME_WAIT'er before going to listener hash. */
+	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+		if (INET_TW_MATCH(sk, net, hash, acookie,
+					saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+				sk = NULL;
+				goto out;
+			}
+			/* match again now that a reference is held */
+			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+				 saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begintw;
+			}
+			goto out;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begintw;
+	sk = NULL;
+out:
+	rcu_read_unlock();
+	return sk;
+}
+EXPORT_SYMBOL_GPL(__inet_lookup_established);
+
+/* called with local bh disabled */
+/*
+ * Check whether @sk could use local port @lport without clashing with an
+ * entry already present in its established hash bucket, and hash it there
+ * if so.
+ *
+ * A matching TIME-WAIT entry is acceptable only if twsk_unique() says so;
+ * that entry is then unhashed and either handed back through @twp (when
+ * @twp is non-NULL) or descheduled and released here.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL if a conflicting entry exists.
+ */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+				    struct sock *sk, __u16 lport,
+				    struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_sock *inet = inet_sk(sk);
+	/* presumably named from the viewpoint of a received packet: the
+	 * socket's own receive address is the "destination" and its peer
+	 * is the "source" - confirm against INET_MATCH's definition */
+	__be32 daddr = inet->inet_rcv_saddr;
+	__be32 saddr = inet->inet_daddr;
+	int dif = sk->sk_bound_dev_if;
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+	struct net *net = sock_net(sk);
+	unsigned int hash = inet_ehashfn(net, daddr, lport,
+					 saddr, inet->inet_dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_nulls_node *node;
+	struct inet_timewait_sock *tw;
+	int twrefcnt = 0;
+
+	spin_lock(lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_nulls_for_each(sk2, node, &head->twchain) {
+		tw = inet_twsk(sk2);
+
+		if (INET_TW_MATCH(sk2, net, hash, acookie,
+					saddr, daddr, ports, dif)) {
+			/* jumping to unique from here leaves tw pointing
+			 * at the matched timewait socket */
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	/* no timewait match: nothing to recycle below */
+	tw = NULL;
+
+	/* And established part... */
+	sk_nulls_for_each(sk2, node, &head->chain) {
+		if (INET_MATCH(sk2, net, hash, acookie,
+					saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->inet_num = lport;
+	inet->inet_sport = htons(lport);
+	sk->sk_hash = hash;
+	WARN_ON(!sk_unhashed(sk));
+	__sk_nulls_add_node_rcu(sk, &head->chain);
+	if (tw) {
+		twrefcnt = inet_twsk_unhash(tw);
+		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+	}
+	spin_unlock(lock);
+	/* the put requested by inet_twsk_unhash() is done only after the
+	 * bucket lock has been dropped */
+	if (twrefcnt)
+		inet_twsk_put(tw);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+	if (twp) {
+		*twp = tw;
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+
+		inet_twsk_put(tw);
+	}
+	return 0;
+
+not_unique:
+	spin_unlock(lock);
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Derive the port search offset for @sk by feeding its inet_rcv_saddr,
+ * inet_daddr and inet_dport to secure_ipv4_port_ephemeral().
+ */
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *isk = inet_sk(sk);
+	const __be32 laddr = isk->inet_rcv_saddr;
+	const __be32 faddr = isk->inet_daddr;
+	const __be16 fport = isk->inet_dport;
+
+	return secure_ipv4_port_ephemeral(laddr, faddr, fport);
+}
+
+/*
+ * Insert @sk into the established hash chain selected by its ehash value.
+ * When @tw is given, that timewait socket is unhashed under the same
+ * bucket lock.
+ *
+ * Returns the result of inet_twsk_unhash() (0 when @tw is NULL), which
+ * tells the caller whether it still has to call inet_twsk_put().
+ */
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+{
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_ehash_bucket *bucket;
+	spinlock_t *bucket_lock;
+	int tw_unhashed = 0;
+
+	WARN_ON(!sk_unhashed(sk));
+
+	sk->sk_hash = inet_sk_ehashfn(sk);
+	bucket = inet_ehash_bucket(hinfo, sk->sk_hash);
+	bucket_lock = inet_ehash_lockp(hinfo, sk->sk_hash);
+
+	spin_lock(bucket_lock);
+	__sk_nulls_add_node_rcu(sk, &bucket->chain);
+	if (tw) {
+		WARN_ON(sk->sk_hash != tw->tw_hash);
+		tw_unhashed = inet_twsk_unhash(tw);
+	}
+	spin_unlock(bucket_lock);
+
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	return tw_unhashed;
+}
+EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+
+/*
+ * Hash @sk: sockets in TCP_LISTEN state go onto the listening hash chain
+ * for their port, everything else is delegated to
+ * __inet_hash_nolisten().
+ */
+static void __inet_hash(struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	struct inet_listen_hashbucket *lbucket;
+
+	if (sk->sk_state != TCP_LISTEN) {
+		__inet_hash_nolisten(sk, NULL);
+		return;
+	}
+
+	WARN_ON(!sk_unhashed(sk));
+
+	lbucket = &hinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+	spin_lock(&lbucket->lock);
+	__sk_nulls_add_node_rcu(sk, &lbucket->head);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	spin_unlock(&lbucket->lock);
+}
+
+/*
+ * Hash @sk with bottom halves disabled.  Sockets in TCP_CLOSE state are
+ * left alone.
+ */
+void inet_hash(struct sock *sk)
+{
+	if (sk->sk_state == TCP_CLOSE)
+		return;
+
+	local_bh_disable();
+	__inet_hash(sk);
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(inet_hash);
+
+/*
+ * Remove @sk from whichever hash chain it is on.  The lock is that of
+ * the listening bucket for TCP_LISTEN sockets and of the ehash bucket
+ * otherwise.  The protocol's in-use counter is decremented only if the
+ * socket was actually unlinked.  Does nothing for an unhashed socket.
+ */
+void inet_unhash(struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	spinlock_t *chain_lock;
+
+	if (sk_unhashed(sk))
+		return;
+
+	chain_lock = (sk->sk_state == TCP_LISTEN) ?
+		&hinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock :
+		inet_ehash_lockp(hinfo, sk->sk_hash);
+
+	spin_lock_bh(chain_lock);
+	if (__sk_nulls_del_node_init_rcu(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	spin_unlock_bh(chain_lock);
+}
+EXPORT_SYMBOL_GPL(inet_unhash);
+
+/*
+ * Give @sk a local port for an outgoing connection and hash it.
+ *
+ * If the socket has no local port yet (inet_num == 0), the local port
+ * range is scanned starting at an offset derived from @port_offset and a
+ * static hint.  A port is taken either when no bind bucket exists for it
+ * (one is created with fastreuse = -1) or when its bucket has a negative
+ * fastreuse and @check_established reports no conflict.
+ *
+ * If the socket already has a port, it is hashed directly when it is the
+ * only owner of its bind bucket; otherwise @check_established decides.
+ *
+ * @check_established: returns 0 if the port can be used for this socket
+ * @hash: inserts the socket into the established hash; its return value
+ *        is added to the number of timewait references to drop
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL if the scan found no usable port
+ * (or a bucket could not be allocated), or the result of
+ * @check_established on the already-bound path.
+ */
+int __inet_hash_connect(struct inet_timewait_death_row *death_row,
+		struct sock *sk, u32 port_offset,
+		int (*check_established)(struct inet_timewait_death_row *,
+			struct sock *, __u16, struct inet_timewait_sock **),
+		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const unsigned short snum = inet_sk(sk)->inet_num;
+	struct inet_bind_hashbucket *head;
+	struct inet_bind_bucket *tb;
+	int ret;
+	struct net *net = sock_net(sk);
+	/* starts at 1, so one inet_twsk_put() is always issued below when
+	 * a timewait socket was recycled, plus one per unhash that asks
+	 * for it */
+	int twrefcnt = 1;
+
+	if (!snum) {
+		int i, remaining, low, high, port;
+		static u32 hint;
+		u32 offset = hint + port_offset;
+		struct hlist_node *node;
+		struct inet_timewait_sock *tw = NULL;
+
+		inet_get_local_port_range(&low, &high);
+		remaining = (high - low) + 1;
+
+		local_bh_disable();
+		for (i = 1; i <= remaining; i++) {
+			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
+			head = &hinfo->bhash[inet_bhashfn(net, port,
+					hinfo->bhash_size)];
+			spin_lock(&head->lock);
+
+			/* Does not bother with rcv_saddr checks,
+			 * because the established check is already
+			 * unique enough.
+			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+				if (net_eq(ib_net(tb), net) &&
+				    tb->port == port) {
+					/* buckets with non-negative
+					 * fastreuse are never shared by
+					 * this path */
+					if (tb->fastreuse >= 0)
+						goto next_port;
+					WARN_ON(hlist_empty(&tb->owners));
+					if (!check_established(death_row, sk,
+								port, &tw))
+						goto ok;
+					goto next_port;
+				}
+			}
+
+			/* no bucket for this port yet: create one */
+			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+					net, head, port);
+			if (!tb) {
+				spin_unlock(&head->lock);
+				break;
+			}
+			tb->fastreuse = -1;
+			goto ok;
+
+		next_port:
+			spin_unlock(&head->lock);
+		}
+		local_bh_enable();
+
+		return -EADDRNOTAVAIL;
+
+ok:
+		/* advance the shared hint by the number of ports tried */
+		hint += i;
+
+		/* Head lock still held and bh's disabled */
+		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+			inet_sk(sk)->inet_sport = htons(port);
+			twrefcnt += hash(sk, tw);
+		}
+		if (tw)
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+		spin_unlock(&head->lock);
+
+		if (tw) {
+			inet_twsk_deschedule(tw, death_row);
+			while (twrefcnt) {
+				twrefcnt--;
+				inet_twsk_put(tw);
+			}
+		}
+
+		ret = 0;
+		goto out;
+	}
+
+	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
+	tb  = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+	/* sk is the first owner and has no successor: sole owner */
+	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+		hash(sk, NULL);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		/* plain spin_unlock(): bottom halves stay disabled until
+		 * the local_bh_enable() at out: below */
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = check_established(death_row, sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ *
+ * IPv4 front end for __inet_hash_connect(): supplies the socket's port
+ * offset together with __inet_check_established() and
+ * __inet_hash_nolisten() as the callbacks.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+		      struct sock *sk)
+{
+	const u32 port_offset = inet_sk_port_offset(sk);
+
+	return __inet_hash_connect(death_row, sk, port_offset,
+				   __inet_check_established,
+				   __inet_hash_nolisten);
+}
+EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+/*
+ * Initialise the parts of @h handled here: the bound-socket counter is
+ * zeroed and every listening hash bucket gets an initialised lock and an
+ * empty nulls list whose end marker is the bucket index plus
+ * LISTENING_NULLS_BASE.
+ */
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+	int i;
+
+	atomic_set(&h->bsockets, 0);
+
+	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
+		spin_lock_init(&h->listening_hash[i].lock);
+		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+				      i + LISTENING_NULLS_BASE);
+	}
+}
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
new file mode 100644
index 00000000..85a0f75d
--- /dev/null
+++ b/net/ipv4/inet_lro.c
@@ -0,0 +1,600 @@
+/*
+ *  linux/net/ipv4/inet_lro.c
+ *
+ *  Large Receive Offload (ipv4 / tcp)
+ *
+ *  (C) Copyright IBM Corp. 2007
+ *
+ *  Authors:
+ *       Jan-Bernd Themann <themann@de.ibm.com>
+ *       Christoph Raisch <raisch@de.ibm.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+#include <linux/if_vlan.h>
+#include <linux/inet_lro.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
+MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
+
+#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
+#define IP_HDR_LEN(iph) (iph->ihl << 2)
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+	(ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+#define LRO_MAX_PG_HLEN 64
+
+#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
+
+/*
+ * Basic tcp checks whether packet is suitable for LRO
+ *
+ * @len is compared against the IP total length.  @lro_desc is the session
+ * the packet would join, or NULL when it would start a new session (the
+ * timestamp ordering check is skipped in that case).
+ *
+ * Returns 0 if the packet may be aggregated, -1 otherwise.
+ */
+
+static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
+			    int len, const struct net_lro_desc *lro_desc)
+{
+	/* check ip header: don't aggregate padded frames */
+	if (ntohs(iph->tot_len) != len)
+		return -1;
+
+	/* nothing to aggregate in a packet without payload */
+	if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
+		return -1;
+
+	/* IP options are not supported */
+	if (iph->ihl != IPH_LEN_WO_OPTIONS)
+		return -1;
+
+	/* ACK must be set; every other flag tested here must be clear */
+	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+	    tcph->rst || tcph->syn || tcph->fin)
+		return -1;
+
+	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+		return -1;
+
+	/* only two TCP header sizes are accepted */
+	if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+	    tcph->doff != TCPH_LEN_W_TIMESTAMP)
+		return -1;
+
+	/* check tcp options (only timestamp allowed) */
+	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+		__be32 *topt = (__be32 *)(tcph + 1);
+
+		/* first option word must be exactly NOP, NOP, TIMESTAMP,
+		 * TCPOLEN_TIMESTAMP */
+		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				   | (TCPOPT_TIMESTAMP << 8)
+				   | TCPOLEN_TIMESTAMP))
+			return -1;
+
+		/* timestamp should be in right order */
+		topt++;
+		if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
+				      ntohl(*topt)))
+			return -1;
+
+		/* timestamp reply should not be zero */
+		topt++;
+		if (*topt == 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Rewrite the IP and TCP headers referenced by @lro_desc so that they
+ * describe the whole aggregate: the stored ack number and window (and
+ * the stored timestamp echo, if the session saw a timestamp), the
+ * accumulated total length, and freshly computed IP and TCP checksums.
+ *
+ * Side effect: lro_desc->data_csum has the TCP header sum added to it.
+ */
+static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
+{
+	struct iphdr *iph = lro_desc->iph;
+	struct tcphdr *tcph = lro_desc->tcph;
+	__be32 *p;
+	__wsum tcp_hdr_csum;
+
+	tcph->ack_seq = lro_desc->tcp_ack;
+	tcph->window = lro_desc->tcp_window;
+
+	if (lro_desc->tcp_saw_tstamp) {
+		p = (__be32 *)(tcph + 1);
+		/* third 32-bit word after the TCP header; the same slot
+		 * lro_init_desc() reads tcp_rcv_tsecr from */
+		*(p+2) = lro_desc->tcp_rcv_tsecr;
+	}
+
+	iph->tot_len = htons(lro_desc->ip_tot_len);
+
+	/* checksum field must be zero while the checksum is computed */
+	iph->check = 0;
+	iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
+
+	tcph->check = 0;
+	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
+	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
+	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+					lro_desc->ip_tot_len -
+					IP_HDR_LEN(iph), IPPROTO_TCP,
+					lro_desc->data_csum);
+}
+
+/*
+ * Derive a checksum value for the TCP payload of one packet from the
+ * checksum carried in its TCP header: start from the inverted, unfolded
+ * tcph->check and subtract a TCP header sum and a pseudo-header sum.
+ *
+ * @len is the payload length in bytes; callers in this file pass the
+ * value of TCP_PAYLOAD_LENGTH().
+ */
+static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
+{
+	__wsum tcp_csum;
+	__wsum tcp_hdr_csum;
+	__wsum tcp_ps_hdr_csum;
+
+	tcp_csum = ~csum_unfold(tcph->check);
+	/* NOTE(review): the header sum is seeded with tcp_csum rather
+	 * than 0 - confirm this is intended */
+	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
+
+	/* pseudo header covers TCP header plus payload length */
+	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+					     len + TCP_HDR_LEN(tcph),
+					     IPPROTO_TCP, 0);
+
+	return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
+			tcp_ps_hdr_csum);
+}
+
+/*
+ * Start a new LRO session in @lro_desc with @skb as the parent packet.
+ *
+ * Records the header pointers, the next expected sequence number, the
+ * current ack/window, the IP total length, the VLAN information and the
+ * payload checksum of this first packet, and marks the descriptor
+ * active.  Timestamp fields are only filled in when the TCP header has
+ * the timestamp-option length; the other fields are left as they were.
+ */
+static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+			  struct iphdr *iph, struct tcphdr *tcph,
+			  u16 vlan_tag, struct vlan_group *vgrp)
+{
+	int nr_frags;
+	__be32 *ptr;
+	u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+	nr_frags = skb_shinfo(skb)->nr_frags;
+	lro_desc->parent = skb;
+	/* further fragments are appended after the ones already present */
+	lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
+	lro_desc->iph = iph;
+	lro_desc->tcph = tcph;
+	lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+	lro_desc->tcp_ack = tcph->ack_seq;
+	lro_desc->tcp_window = tcph->window;
+
+	lro_desc->pkt_aggr_cnt = 1;
+	lro_desc->ip_tot_len = ntohs(iph->tot_len);
+
+	/* same header length lro_tcp_ip_check() accepts for timestamps */
+	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+		ptr = (__be32 *)(tcph+1);
+		lro_desc->tcp_saw_tstamp = 1;
+		lro_desc->tcp_rcv_tsval = *(ptr+1);
+		lro_desc->tcp_rcv_tsecr = *(ptr+2);
+	}
+
+	lro_desc->mss = tcp_data_len;
+	lro_desc->vgrp = vgrp;
+	lro_desc->vlan_tag = vlan_tag;
+	lro_desc->active = 1;
+
+	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
+						tcp_data_len);
+}
+
+/* Reset @lro_desc to all zeroes, which also marks it inactive. */
+static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
+{
+	memset(lro_desc, 0, sizeof(*lro_desc));
+}
+
+/*
+ * Bookkeeping shared by both ways of adding a packet to a session:
+ * bump the packet count, extend the IP total length and next expected
+ * sequence number by @tcp_data_len, take over the packet's window and
+ * ack number (and timestamp echo), fold the packet's payload checksum
+ * into data_csum, grow the parent skb's len/data_len and track the
+ * largest payload seen in mss.
+ */
+static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
+			   struct tcphdr *tcph, int tcp_data_len)
+{
+	struct sk_buff *parent = lro_desc->parent;
+	__be32 *topt;
+
+	lro_desc->pkt_aggr_cnt++;
+	lro_desc->ip_tot_len += tcp_data_len;
+	lro_desc->tcp_next_seq += tcp_data_len;
+	lro_desc->tcp_window = tcph->window;
+	lro_desc->tcp_ack = tcph->ack_seq;
+
+	/* don't update tcp_rcv_tsval, would not work with PAWS */
+	if (lro_desc->tcp_saw_tstamp) {
+		topt = (__be32 *) (tcph + 1);
+		lro_desc->tcp_rcv_tsecr = *(topt + 2);
+	}
+
+	/* parent->len is used as the block offset here, so this must run
+	 * before parent->len is increased below */
+	lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
+					     lro_tcp_data_csum(iph, tcph,
+							       tcp_data_len),
+					     parent->len);
+
+	parent->len += tcp_data_len;
+	parent->data_len += tcp_data_len;
+	if (tcp_data_len > lro_desc->mss)
+		lro_desc->mss = tcp_data_len;
+}
+
+/*
+ * Append @skb to the session in @lro_desc: update the session
+ * bookkeeping, strip everything in front of the TCP payload from @skb,
+ * and chain it behind the previously added skb (or start the parent's
+ * frag_list if it is the first one added).
+ */
+static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+			   struct iphdr *iph, struct tcphdr *tcph)
+{
+	struct sk_buff *parent = lro_desc->parent;
+	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+	lro_add_common(lro_desc, iph, tcph, tcp_data_len);
+
+	/* leave only the TCP payload in the child skb */
+	skb_pull(skb, (skb->len - tcp_data_len));
+	parent->truesize += skb->truesize;
+
+	if (lro_desc->last_skb)
+		lro_desc->last_skb->next = skb;
+	else
+		skb_shinfo(parent)->frag_list = skb;
+
+	lro_desc->last_skb = skb;
+}
+
+/*
+ * Append the payload of a fragment-based packet to the session in
+ * @lro_desc.  The first fragment in @skb_frags is advanced by @hlen bytes
+ * (modifying the caller's array), then fragments are copied into the
+ * parent skb until the TCP payload length is covered.
+ *
+ * @len is not used by this function.
+ */
+static void lro_add_frags(struct net_lro_desc *lro_desc,
+			  int len, int hlen, int truesize,
+			  struct skb_frag_struct *skb_frags,
+			  struct iphdr *iph, struct tcphdr *tcph)
+{
+	struct sk_buff *skb = lro_desc->parent;
+	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+	lro_add_common(lro_desc, iph, tcph, tcp_data_len);
+
+	skb->truesize += truesize;
+
+	/* skip the headers that sit at the start of the first fragment */
+	skb_frags[0].page_offset += hlen;
+	skb_frags[0].size -= hlen;
+
+	/* NOTE(review): nothing in this loop bounds next_frag/nr_frags
+	 * against the size of the parent's frags array - presumably the
+	 * caller's flush thresholds guarantee room; verify */
+	while (tcp_data_len > 0) {
+		*(lro_desc->next_frag) = *skb_frags;
+		tcp_data_len -= skb_frags->size;
+		lro_desc->next_frag++;
+		skb_frags++;
+		skb_shinfo(skb)->nr_frags++;
+	}
+}
+
+/*
+ * Compare the connection recorded in @lro_desc with the one described by
+ * @iph / @tcph.  Returns 0 when source address, destination address,
+ * source port and destination port are all equal, -1 otherwise.
+ */
+static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
+			      struct iphdr *iph,
+			      struct tcphdr *tcph)
+{
+	const struct iphdr *sess_iph = lro_desc->iph;
+	const struct tcphdr *sess_tcph = lro_desc->tcph;
+
+	if (sess_iph->saddr == iph->saddr &&
+	    sess_iph->daddr == iph->daddr &&
+	    sess_tcph->source == tcph->source &&
+	    sess_tcph->dest == tcph->dest)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Pick the descriptor to use for the connection described by @iph and
+ * @tcph.  An active descriptor for the same connection wins; failing
+ * that, the first inactive descriptor is returned.  If neither exists,
+ * the no_desc statistic is incremented and NULL is returned.
+ */
+static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
+					 struct net_lro_desc *lro_arr,
+					 struct iphdr *iph,
+					 struct tcphdr *tcph)
+{
+	const int limit = lro_mgr->max_desc;
+	int idx;
+
+	/* first pass: an already running session for this connection */
+	for (idx = 0; idx < limit; idx++) {
+		struct net_lro_desc *cand = &lro_arr[idx];
+
+		if (cand->active && !lro_check_tcp_conn(cand, iph, tcph))
+			return cand;
+	}
+
+	/* second pass: any free slot */
+	for (idx = 0; idx < limit; idx++) {
+		if (!lro_arr[idx].active)
+			return &lro_arr[idx];
+	}
+
+	LRO_INC_STATS(lro_mgr, no_desc);
+	return NULL;
+}
+
+/*
+ * Hand the session's parent skb to the stack and reset the descriptor.
+ *
+ * The headers are rewritten only if more than one packet was aggregated.
+ * gso_size is set to the session's mss (the largest payload seen).  The
+ * receive function is chosen by whether the session has a VLAN group
+ * and whether the manager has LRO_F_NAPI set.
+ */
+static void lro_flush(struct net_lro_mgr *lro_mgr,
+		      struct net_lro_desc *lro_desc)
+{
+	if (lro_desc->pkt_aggr_cnt > 1)
+		lro_update_tcp_ip_header(lro_desc);
+
+	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
+
+	if (lro_desc->vgrp) {
+		if (lro_mgr->features & LRO_F_NAPI)
+			vlan_hwaccel_receive_skb(lro_desc->parent,
+						 lro_desc->vgrp,
+						 lro_desc->vlan_tag);
+		else
+			vlan_hwaccel_rx(lro_desc->parent,
+					lro_desc->vgrp,
+					lro_desc->vlan_tag);
+
+	} else {
+		if (lro_mgr->features & LRO_F_NAPI)
+			netif_receive_skb(lro_desc->parent);
+		else
+			netif_rx(lro_desc->parent);
+	}
+
+	LRO_INC_STATS(lro_mgr, flushed);
+	/* zeroing the descriptor also clears its active flag */
+	lro_clear_desc(lro_desc);
+}
+
+/*
+ * Try to aggregate @skb into an LRO session.
+ *
+ * The driver's get_skb_header() callback locates the IP and TCP headers
+ * and reports the packet type in flags.  A packet that passes
+ * lro_tcp_ip_check() either starts a new session (becoming its parent)
+ * or is appended to the running session of its connection.  A packet
+ * that belongs to a running session but cannot be appended causes that
+ * session to be flushed.
+ *
+ * When the skb still carries an in-band VLAN header (protocol is
+ * ETH_P_8021Q and LRO_F_EXTRACT_VLAN_ID is not set), VLAN_HLEN is
+ * subtracted from skb->len for both length checks, so that the
+ * aggregation check agrees with the new-session check.
+ *
+ * Returns 0 if the skb was consumed by LRO, 1 if the caller has to pass
+ * the skb to the stack itself.
+ */
+static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
+			  struct vlan_group *vgrp, u16 vlan_tag, void *priv)
+{
+	struct net_lro_desc *lro_desc;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	u64 flags;
+	int vlan_hdr_len = 0;
+
+	if (!lro_mgr->get_skb_header ||
+	    lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+				    &flags, priv))
+		goto out;
+
+	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
+		goto out;
+
+	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+	if (!lro_desc)
+		goto out;
+
+	if ((skb->protocol == htons(ETH_P_8021Q)) &&
+	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+		vlan_hdr_len = VLAN_HLEN;
+
+	if (!lro_desc->active) { /* start new lro session */
+		if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
+			goto out;
+
+		skb->ip_summed = lro_mgr->ip_summed_aggr;
+		lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
+		LRO_INC_STATS(lro_mgr, aggregated);
+		return 0;
+	}
+
+	/* only in-order segments can be appended */
+	if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
+		goto out2;
+
+	/* same length basis as the new-session check above */
+	if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, lro_desc))
+		goto out2;
+
+	lro_add_packet(lro_desc, skb, iph, tcph);
+	LRO_INC_STATS(lro_mgr, aggregated);
+
+	/* flush once the packet count or the aggregate size gets large */
+	if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
+	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
+		lro_flush(lro_mgr, lro_desc);
+
+	return 0;
+
+out2: /* send aggregated SKBs to stack */
+	lro_flush(lro_mgr, lro_desc);
+
+out:
+	return 1;
+}
+
+
+/*
+ * Build an skb for a packet that was delivered as page fragments.
+ *
+ * Allocates hlen + frag_align_pad bytes of linear space, copies
+ * min(len, hlen) bytes starting at @mac_hdr into it and attaches entries
+ * from @frags until @len bytes are covered.  The first attached fragment
+ * is then advanced past the bytes that were copied into the linear part.
+ * ip_summed, csum and protocol are set before the skb is returned.
+ *
+ * Returns the new skb, or NULL if the allocation fails.
+ */
+static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
+				   struct skb_frag_struct *frags,
+				   int len, int true_size,
+				   void *mac_hdr,
+				   int hlen, __wsum sum,
+				   u32 ip_summed)
+{
+	struct sk_buff *skb;
+	struct skb_frag_struct *skb_frags;
+	int data_len = len;
+	int hdr_len = min(len, hlen);
+
+	skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, lro_mgr->frag_align_pad);
+	/* length fields are set by hand: linear part holds hdr_len bytes,
+	 * the rest of the packet lives in the page fragments */
+	skb->len = len;
+	skb->data_len = len - hdr_len;
+	skb->truesize += true_size;
+	skb->tail += hdr_len;
+
+	memcpy(skb->data, mac_hdr, hdr_len);
+
+	skb_frags = skb_shinfo(skb)->frags;
+	while (data_len > 0) {
+		*skb_frags = *frags;
+		data_len -= frags->size;
+		skb_frags++;
+		frags++;
+		skb_shinfo(skb)->nr_frags++;
+	}
+
+	/* the copied header bytes must not be counted twice */
+	skb_shinfo(skb)->frags[0].page_offset += hdr_len;
+	skb_shinfo(skb)->frags[0].size -= hdr_len;
+
+	skb->ip_summed = ip_summed;
+	skb->csum = sum;
+	skb->protocol = eth_type_trans(skb, lro_mgr->dev);
+	return skb;
+}
+
+/*
+ * Try to aggregate a packet delivered as page fragments.
+ *
+ * The driver's get_frag_header() callback locates the MAC, IP and TCP
+ * headers.  A suitable packet either starts a new session (an skb is
+ * generated for it and becomes the session parent) or has its fragments
+ * appended to the running session of its connection.
+ *
+ * Returns NULL when the packet was taken over by LRO, and also when an
+ * skb could not be allocated for a new session.  Otherwise returns a
+ * freshly generated skb that the caller must post to the stack (this
+ * too may be NULL if that allocation fails).
+ *
+ * @vgrp and @vlan_tag are not referenced in this function; new sessions
+ * are initialised with vlan_tag 0 and a NULL vgrp.
+ */
+static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
+					  struct skb_frag_struct *frags,
+					  int len, int true_size,
+					  struct vlan_group *vgrp,
+					  u16 vlan_tag, void *priv, __wsum sum)
+{
+	struct net_lro_desc *lro_desc;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct sk_buff *skb;
+	u64 flags;
+	void *mac_hdr;
+	int mac_hdr_len;
+	int hdr_len = LRO_MAX_PG_HLEN;
+	int vlan_hdr_len = 0;
+
+	if (!lro_mgr->get_frag_header ||
+	    lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
+				     (void *)&tcph, &flags, priv)) {
+		/* no usable header info: fall back to the start of the
+		 * first fragment and the default header length */
+		mac_hdr = page_address(frags->page) + frags->page_offset;
+		goto out1;
+	}
+
+	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
+		goto out1;
+
+	/* bytes from the MAC header up to the end of the TCP header */
+	hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
+	mac_hdr_len = (int)((void *)(iph) - mac_hdr);
+
+	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+	if (!lro_desc)
+		goto out1;
+
+	if (!lro_desc->active) { /* start new lro session */
+		if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
+			goto out1;
+
+		skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
+				  hdr_len, 0, lro_mgr->ip_summed_aggr);
+		if (!skb)
+			goto out;
+
+		if ((skb->protocol == htons(ETH_P_8021Q)) &&
+		    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+			vlan_hdr_len = VLAN_HLEN;
+
+		/* from here on use the header copies inside the new skb */
+		iph = (void *)(skb->data + vlan_hdr_len);
+		tcph = (void *)((u8 *)skb->data + vlan_hdr_len
+				+ IP_HDR_LEN(iph));
+
+		lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
+		LRO_INC_STATS(lro_mgr, aggregated);
+		return NULL;
+	}
+
+	/* only in-order segments can be appended */
+	if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
+		goto out2;
+
+	if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
+		goto out2;
+
+	lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
+	LRO_INC_STATS(lro_mgr, aggregated);
+
+	/* flush once the fragment count or the aggregate size gets large */
+	if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
+	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
+		lro_flush(lro_mgr, lro_desc);
+
+	return NULL;
+
+out2: /* send aggregated packets to the stack */
+	lro_flush(lro_mgr, lro_desc);
+
+out1:  /* Original packet has to be posted to the stack */
+	skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
+			  hdr_len, sum, lro_mgr->ip_summed);
+out:
+	return skb;
+}
+
+/*
+ * Feed @skb to LRO.  If LRO does not take the packet, it is passed to
+ * the stack right away, using netif_receive_skb() when the manager has
+ * LRO_F_NAPI set and netif_rx() otherwise.
+ */
+void lro_receive_skb(struct net_lro_mgr *lro_mgr,
+		     struct sk_buff *skb,
+		     void *priv)
+{
+	if (!__lro_proc_skb(lro_mgr, skb, NULL, 0, priv))
+		return;
+
+	if (lro_mgr->features & LRO_F_NAPI)
+		netif_receive_skb(skb);
+	else
+		netif_rx(skb);
+}
+EXPORT_SYMBOL(lro_receive_skb);
+
+/*
+ * VLAN-accelerated variant of lro_receive_skb(): @vgrp and @vlan_tag are
+ * recorded with the session, and a packet that LRO does not take is
+ * passed on through the vlan_hwaccel receive functions.
+ */
+void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
+				  struct sk_buff *skb,
+				  struct vlan_group *vgrp,
+				  u16 vlan_tag,
+				  void *priv)
+{
+	if (!__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv))
+		return;
+
+	if (lro_mgr->features & LRO_F_NAPI)
+		vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
+	else
+		vlan_hwaccel_rx(skb, vgrp, vlan_tag);
+}
+EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
+
+/*
+ * Feed a fragment-based packet to LRO.  If __lro_proc_segment() hands
+ * back an skb, that skb is passed to the stack, using
+ * netif_receive_skb() when the manager has LRO_F_NAPI set and netif_rx()
+ * otherwise.
+ */
+void lro_receive_frags(struct net_lro_mgr *lro_mgr,
+		       struct skb_frag_struct *frags,
+		       int len, int true_size, void *priv, __wsum sum)
+{
+	struct sk_buff *pending = __lro_proc_segment(lro_mgr, frags, len,
+						     true_size, NULL, 0,
+						     priv, sum);
+
+	if (pending) {
+		if (lro_mgr->features & LRO_F_NAPI)
+			netif_receive_skb(pending);
+		else
+			netif_rx(pending);
+	}
+}
+EXPORT_SYMBOL(lro_receive_frags);
+
+/*
+ * VLAN-accelerated variant of lro_receive_frags(): an skb handed back by
+ * __lro_proc_segment() is passed on through the vlan_hwaccel receive
+ * functions together with @vgrp and @vlan_tag.
+ */
+void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
+				    struct skb_frag_struct *frags,
+				    int len, int true_size,
+				    struct vlan_group *vgrp,
+				    u16 vlan_tag, void *priv, __wsum sum)
+{
+	struct sk_buff *pending = __lro_proc_segment(lro_mgr, frags, len,
+						     true_size, vgrp,
+						     vlan_tag, priv, sum);
+
+	if (pending) {
+		if (lro_mgr->features & LRO_F_NAPI)
+			vlan_hwaccel_receive_skb(pending, vgrp, vlan_tag);
+		else
+			vlan_hwaccel_rx(pending, vgrp, vlan_tag);
+	}
+}
+EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
+
+/* Flush every active session of @lro_mgr to the stack. */
+void lro_flush_all(struct net_lro_mgr *lro_mgr)
+{
+	struct net_lro_desc *desc = lro_mgr->lro_arr;
+	int idx;
+
+	for (idx = 0; idx < lro_mgr->max_desc; idx++, desc++) {
+		if (!desc->active)
+			continue;
+
+		lro_flush(lro_mgr, desc);
+	}
+}
+EXPORT_SYMBOL(lro_flush_all);
+
+/*
+ * Flush the running session, if any, of the connection described by
+ * @iph and @tcph.
+ *
+ * lro_get_desc() returns NULL when no session matches and no descriptor
+ * is free, and may return an inactive (free) descriptor when no session
+ * matches; in both cases there is nothing to flush.
+ */
+void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
+		  struct iphdr *iph, struct tcphdr *tcph)
+{
+	struct net_lro_desc *lro_desc;
+
+	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+	if (lro_desc && lro_desc->active)
+		lro_flush(lro_mgr, lro_desc);
+}
+EXPORT_SYMBOL(lro_flush_pkt);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 00000000..3c8dfa16
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,523 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic TIME_WAIT sockets functions
+ *
+ *		From code originally in TCP
+ */
+
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/slab.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+
+/**
+ *	inet_twsk_unhash - unhash a timewait socket from established hash
+ *	@tw: timewait socket
+ *
+ *	unhash a timewait socket from established hash, if hashed.
+ *	ehash lock must be held by caller.
+ *	Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+	int was_hashed = !hlist_nulls_unhashed(&tw->tw_node);
+
+	if (was_hashed) {
+		hlist_nulls_del_rcu(&tw->tw_node);
+		sk_nulls_node_init(&tw->tw_node);
+	}
+	/* A result of 1 asks the caller to do the inet_twsk_put() once it
+	 * has released the lock; we must not drop that reference here.
+	 */
+	return was_hashed;
+}
+
+/**
+ *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ *	@tw: timewait socket
+ *	@hashinfo: hashinfo pointer
+ *
+ *	unhash a timewait socket from bind hash, if hashed.
+ *	bind hash lock must be held by caller.
+ *	Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			  struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind_bucket *bucket = tw->tw_tb;
+
+	if (bucket == NULL)
+		return 0;	/* not linked to any bind bucket */
+
+	/* Detach tw first, then call inet_bind_bucket_destroy() on it. */
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, bucket);
+	/* The reference is dropped by the caller, after it has unlocked:
+	 * inet_twsk_put() cannot be called here under the lock.
+	 */
+	return 1;
+}
+
+/* Must be called with locally disabled BHs. */
+static void __inet_twsk_kill(struct inet_timewait_sock *tw,
+			     struct inet_hashinfo *hashinfo)
+{
+	spinlock_t *ehash_lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+	struct inet_bind_hashbucket *bind_head;
+	int nr_puts;
+
+	/* Step 1: take tw off the established hash under its chain lock. */
+	spin_lock(ehash_lock);
+	nr_puts = inet_twsk_unhash(tw);
+	spin_unlock(ehash_lock);
+
+	/* Step 2: detach tw from its bind bucket under the bucket lock. */
+	bind_head = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
+						  hashinfo->bhash_size)];
+	spin_lock(&bind_head->lock);
+	nr_puts += inet_twsk_bind_unhash(tw, hashinfo);
+	spin_unlock(&bind_head->lock);
+
+#ifdef SOCK_REFCNT_DEBUG
+	if (atomic_read(&tw->tw_refcnt) != 1)
+		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
+		       tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+#endif
+	/*
+	 * Step 3: each successful unhash above reported one reference that
+	 * has to be dropped here, now that both locks are released.
+	 */
+	while (nr_puts--)
+		inet_twsk_put(tw);
+}
+
+static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
+{
+	struct module *owner = tw->tw_prot->owner; /* read before tw is freed */
+	twsk_destructor((struct sock *)tw);
+#ifdef SOCK_REFCNT_DEBUG
+	pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
+#endif
+	release_net(twsk_net(tw));	/* pairs with hold_net() in inet_twsk_alloc() */
+	kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+	module_put(owner);	/* pairs with __module_get() in inet_twsk_alloc() */
+}
+
+void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+	if (atomic_dec_and_test(&tw->tw_refcnt))	/* count reached zero */
+		inet_twsk_free(tw);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_put);
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+			   struct inet_hashinfo *hashinfo)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+	struct inet_bind_hashbucket *bhead;
+	/* Step 1: Put TW into bind hash. Original socket stays there too.
+	   Note, that any socket with inet->inet_num != 0 MUST be bound in
+	   binding cache, even if it is closed.
+	 */
+	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
+			hashinfo->bhash_size)];
+	spin_lock(&bhead->lock);
+	tw->tw_tb = icsk->icsk_bind_hash;	/* tw joins sk's bind bucket */
+	WARN_ON(!icsk->icsk_bind_hash);
+	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+	spin_unlock(&bhead->lock);
+
+	spin_lock(lock);
+
+	/*
+	 * Step 2: Hash TW into TIMEWAIT chain.
+	 * Should be done before removing sk from established chain
+	 * because readers are lockless and search established first.
+	 */
+	inet_twsk_add_node_rcu(tw, &ehead->twchain);
+
+	/* Step 3: Remove SK from established hash. */
+	if (__sk_nulls_del_node_init_rcu(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+	/*
+	 * Notes :
+	 * - tw_refcnt was initially set to 0 in inet_twsk_alloc()
+	 * - One reference each for the bhash link and the ehash link; the
+	 *   third is presumably the caller's own reference (TODO confirm).
+	 * - We want this refcnt update done before allowing other
+	 *   threads to find this tw in ehash chain.
+	 */
+	atomic_add(1 + 1 + 1, &tw->tw_refcnt);
+
+	spin_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct inet_timewait_sock *tw;
+
+	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+			      GFP_ATOMIC);
+	if (tw == NULL)
+		return NULL;
+
+	kmemcheck_annotate_bitfield(tw, flags);
+
+	/* Copy the identifying fields over from sk. */
+	tw->tw_daddr = inet->inet_daddr;
+	tw->tw_rcv_saddr = inet->inet_rcv_saddr;
+	tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+	tw->tw_num = inet->inet_num;
+	tw->tw_state = TCP_TIME_WAIT;
+	tw->tw_substate = state;
+	tw->tw_sport = inet->inet_sport;
+	tw->tw_dport = inet->inet_dport;
+	tw->tw_family = sk->sk_family;
+	tw->tw_reuse = sk->sk_reuse;
+	tw->tw_hash = sk->sk_hash;
+	tw->tw_ipv6only = 0;
+	tw->tw_transparent = inet->transparent;
+	tw->tw_prot = sk->sk_prot_creator;
+	twsk_net_set(tw, hold_net(sock_net(sk)));
+	/*
+	 * RCU lookups may run into this socket early, so tw_refcnt has to
+	 * stay at zero until the whole timewait socket has been set up.
+	 */
+	atomic_set(&tw->tw_refcnt, 0);
+	inet_twsk_dead_node_init(tw);
+	__module_get(tw->tw_prot->owner);
+
+	return tw;
+}
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* Returns non-zero if quota exceeded.  */
+static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+				    const int slot)
+{
+	struct inet_timewait_sock *tw;
+	struct hlist_node *node;
+	unsigned int killed;
+	int ret;
+
+	/* Runs with twdr->death_lock held, dropping it only around each kill.
+	 * NOTE: a previous version released the lock after detaching the
+	 * whole chain.  That was racy, because tw buckets are scheduled in
+	 * non-serialized context in 2.3 (with netfilter); with softnet this
+	 * is common, because soft irqs are not sequenced.
+	 */
+	killed = 0;
+	ret = 0;
+rescan:
+	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+		__inet_twsk_del_dead_node(tw);
+		spin_unlock(&twdr->death_lock);
+		__inet_twsk_kill(tw, twdr->hashinfo);
+#ifdef CONFIG_NET_NS
+		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+#endif
+		inet_twsk_put(tw);	/* tw must not be touched after this */
+		killed++;
+		spin_lock(&twdr->death_lock);
+		if (killed > INET_TWDR_TWKILL_QUOTA) {
+			ret = 1;	/* quota exceeded: report unfinished work */
+			break;
+		}
+
+		/* While we dropped twdr->death_lock, another cpu may have
+		 * killed off the next TW bucket in the list, therefore
+		 * do a fresh re-read of the hlist head node with the
+		 * lock reacquired.  We still use the hlist traversal
+		 * macro in order to get the prefetches.
+		 */
+		goto rescan;
+	}
+
+	twdr->tw_count -= killed;
+#ifndef CONFIG_NET_NS
+	NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
+#endif
+	return ret;
+}
+
+void inet_twdr_hangman(unsigned long data)
+{
+	struct inet_timewait_death_row *twdr =
+		(struct inet_timewait_death_row *)data;
+	int rearm;
+
+	spin_lock(&twdr->death_lock);
+
+	/* With no timewait sockets queued the timer is simply not re-armed. */
+	if (twdr->tw_count != 0) {
+		if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
+			/* Quota hit: let the work item finish this slot. */
+			twdr->thread_slots |= (1 << twdr->slot);
+			schedule_work(&twdr->twkill_work);
+			rearm = 1;
+		} else {
+			/* Slot fully purged: re-arm only if buckets remain. */
+			rearm = twdr->tw_count != 0;
+			twdr->slot = (twdr->slot + 1) &
+				     (INET_TWDR_TWKILL_SLOTS - 1);
+		}
+		if (rearm)
+			mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+	}
+
+	spin_unlock(&twdr->death_lock);
+}
+EXPORT_SYMBOL_GPL(inet_twdr_hangman);
+
+void inet_twdr_twkill_work(struct work_struct *work)
+{
+	struct inet_timewait_death_row *twdr =
+		container_of(work, struct inet_timewait_death_row, twkill_work);
+	int i;
+
+	BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
+			(sizeof(twdr->thread_slots) * 8)); /* one bit per slot */
+
+	while (twdr->thread_slots) {	/* bits are set by inet_twdr_hangman() */
+		spin_lock_bh(&twdr->death_lock);
+		for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
+			if (!(twdr->thread_slots & (1 << i)))
+				continue;
+
+			while (inet_twdr_do_twkill_work(twdr, i) != 0) {
+				if (need_resched()) {
+					spin_unlock_bh(&twdr->death_lock);
+					schedule();	/* yield, lock dropped */
+					spin_lock_bh(&twdr->death_lock);
+				}
+			}
+
+			twdr->thread_slots &= ~(1 << i);	/* slot i done */
+		}
+		spin_unlock_bh(&twdr->death_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
+
+/* These are always called from BH context.  See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+			  struct inet_timewait_death_row *twdr)
+{
+	spin_lock(&twdr->death_lock);
+	if (inet_twsk_del_dead_node(tw)) {	/* was it on a death list? */
+		inet_twsk_put(tw);
+		if (--twdr->tw_count == 0)	/* nothing left to expire */
+			del_timer(&twdr->tw_timer);
+	}
+	spin_unlock(&twdr->death_lock);
+	__inet_twsk_kill(tw, twdr->hashinfo);	/* unhash, outside death_lock */
+}
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw,
+		       struct inet_timewait_death_row *twdr,
+		       const int timeo, const int timewait_len)
+{
+	struct hlist_head *list;
+	int slot;
+
+	/* timeout := RTO * 3.5
+	 *
+	 * 3.5 = 1+2+0.5 to wait for two retransmits.
+	 *
+	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
+	 * FINs (or previous segments) are lost (probability of such event
+	 * is p^(N+1), where p is probability to lose single packet and
+	 * time to detect the loss is about RTO*(2^N - 1) with exponential
+	 * backoff). Normal timewait length is calculated so, that we
+	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+	 * [ BTW Linux. following BSD, violates this requirement waiting
+	 *   only for 60sec, we should wait at least for 240 secs.
+	 *   Well, 240 consumes too much of resources 8)
+	 * ]
+	 * This interval is not reduced to catch old duplicate and
+	 * responses to our wandering segments living for two MSLs.
+	 * However, if we use PAWS to detect
+	 * old duplicates, we can reduce the interval to bounds required
+	 * by RTO, rather than MSL. So, if peer understands PAWS, we
+	 * kill tw bucket after 3.5*RTO (it is important that this number
+	 * is greater than TS tick!) and detect old duplicates with help
+	 * of PAWS.
+	 */
+	slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
+
+	spin_lock(&twdr->death_lock);
+
+	/* Unlink it, if it was scheduled */
+	if (inet_twsk_del_dead_node(tw))
+		twdr->tw_count--;
+	else
+		atomic_inc(&tw->tw_refcnt);	/* was not queued yet: take a ref */
+
+	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+		/* Schedule to slow timer */
+		if (timeo >= timewait_len) {
+			slot = INET_TWDR_TWKILL_SLOTS - 1;
+		} else {
+			slot = DIV_ROUND_UP(timeo, twdr->period);
+			if (slot >= INET_TWDR_TWKILL_SLOTS)
+				slot = INET_TWDR_TWKILL_SLOTS - 1;
+		}
+		tw->tw_ttd = jiffies + timeo;
+		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
+		list = &twdr->cells[slot];
+	} else {
+		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+
+		if (twdr->twcal_hand < 0) {	/* recycle calendar not running */
+			twdr->twcal_hand = 0;
+			twdr->twcal_jiffie = jiffies;
+			twdr->twcal_timer.expires = twdr->twcal_jiffie +
+					      (slot << INET_TWDR_RECYCLE_TICK);
+			add_timer(&twdr->twcal_timer);
+		} else {
+			if (time_after(twdr->twcal_timer.expires,
+				       jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
+				mod_timer(&twdr->twcal_timer,
+					  jiffies + (slot << INET_TWDR_RECYCLE_TICK));
+			slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
+		}
+		list = &twdr->twcal_row[slot];
+	}
+
+	hlist_add_head(&tw->tw_death_node, list);
+
+	if (twdr->tw_count++ == 0)	/* first entry: start the slow timer */
+		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+	spin_unlock(&twdr->death_lock);
+}
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twdr_twcal_tick(unsigned long data)
+{
+	struct inet_timewait_death_row *twdr;
+	int n, slot;
+	unsigned long j;
+	unsigned long now = jiffies;
+	int killed = 0;
+	int adv = 0;
+
+	twdr = (struct inet_timewait_death_row *)data;
+
+	spin_lock(&twdr->death_lock);
+	if (twdr->twcal_hand < 0)	/* calendar is not running */
+		goto out;
+
+	slot = twdr->twcal_hand;
+	j = twdr->twcal_jiffie;
+
+	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
+		if (time_before_eq(j, now)) {	/* this slot is due */
+			struct hlist_node *node, *safe;
+			struct inet_timewait_sock *tw;
+
+			inet_twsk_for_each_inmate_safe(tw, node, safe,
+						       &twdr->twcal_row[slot]) {
+				__inet_twsk_del_dead_node(tw);
+				__inet_twsk_kill(tw, twdr->hashinfo);
+#ifdef CONFIG_NET_NS
+				NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+#endif
+				inet_twsk_put(tw);
+				killed++;
+			}
+		} else {
+			if (!adv) {	/* first slot not yet due: new hand */
+				adv = 1;
+				twdr->twcal_jiffie = j;
+				twdr->twcal_hand = slot;
+			}
+
+			if (!hlist_empty(&twdr->twcal_row[slot])) {
+				mod_timer(&twdr->twcal_timer, j);
+				goto out;
+			}
+		}
+		j += 1 << INET_TWDR_RECYCLE_TICK;
+		slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+	}
+	twdr->twcal_hand = -1;	/* no pending slot found: stop the calendar */
+
+out:
+	if ((twdr->tw_count -= killed) == 0)
+		del_timer(&twdr->tw_timer);
+#ifndef CONFIG_NET_NS
+	NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
+#endif
+	spin_unlock(&twdr->death_lock);
+}
+EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
+
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
+		     struct inet_timewait_death_row *twdr, int family)
+{
+	struct inet_timewait_sock *tw;
+	struct sock *sk;
+	struct hlist_nulls_node *node;
+	unsigned int slot;
+
+	for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+		rcu_read_lock();
+restart:
+		sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+			tw = inet_twsk(sk);
+			if ((tw->tw_family != family) ||
+				atomic_read(&twsk_net(tw)->count))
+				continue;	/* other family, or netns count != 0 */
+
+			if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
+				continue;	/* refcnt was already zero */
+
+			if (unlikely((tw->tw_family != family) ||
+				     atomic_read(&twsk_net(tw)->count))) {
+				inet_twsk_put(tw);	/* re-check failed with ref held */
+				goto restart;
+			}
+
+			rcu_read_unlock();
+			local_bh_disable();
+			inet_twsk_deschedule(tw, twdr);
+			local_bh_enable();
+			inet_twsk_put(tw);	/* drop the reference taken above */
+			goto restart_rcu;
+		}
+		/* If the nulls value we got at the end of this lookup is
+		 * not the expected one, we must restart lookup.
+		 * We probably met an item that was moved to another chain.
+		 */
+		if (get_nulls_value(node) != slot)
+			goto restart;
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 00000000..68776454
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,637 @@
+/*
+ *		INETPEER - A storage for permanent information about peers
+ *
+ *  This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ *  Authors:	Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/inetpeer.h>
+#include <net/secure_seq.h>
+
+/*
+ *  Theory of operations.
+ *  We keep one entry for each peer IP address.  Each node contains long-lived
+ *  information about the peer which doesn't depend on routes.
+ *  At this moment this information consists only of ID field for the next
+ *  outgoing IP packet.  This field is incremented with each packet as encoded
+ *  in inet_getid() function (include/net/inetpeer.h).
+ *  At the time of writing these notes, the identifier of IP packets is made
+ *  unpredictable by this code only for packets subjected (actually or
+ *  potentially) to defragmentation.  I.e. DF packets smaller than the PMTU
+ *  use a constant ID and do not use this code (see ip_select_ident() in
+ *  include/net/ip.h).
+ *
+ *  Route cache entries hold references to our nodes.
+ *  New cache entries get references via lookup by destination IP address in
+ *  the avl tree.  The reference is grabbed only when it's needed i.e. only
+ *  when we try to output IP packet which needs an unpredictable ID (see
+ *  __ip_select_ident() in net/ipv4/route.c).
+ *  Nodes are removed only when the reference counter goes to 0.
+ *  Once that has happened, the node may be removed when a sufficient amount
+ *  of time has passed since its last use.  The less-recently-used entry can
+ *  also be removed if the pool is overloaded, i.e. if the total number of
+ *  entries is greater than or equal to the threshold.
+ *
+ *  Node pool is organised as an AVL tree.
+ *  Such an implementation has been chosen not just for fun.  It's a way to
+ *  prevent easy and efficient DoS attacks by creating hash collisions.  A huge
+ *  amount of long living nodes in a single hash slot would significantly delay
+ *  lookups performed with disabled BHs.
+ *
+ *  Serialisation issues.
+ *  1.  Nodes may appear in the tree only with the pool lock held.
+ *  2.  Nodes may disappear from the tree only with the pool lock held
+ *      AND reference count being 0.
+ *  3.  Nodes appear on and disappear from the unused node list only under
+ *      "unused_peers.lock".
+ *  4.  The per-base counter "total" is modified under the pool lock.
+ *  5.  struct inet_peer fields modification:
+ *		avl_left, avl_right, avl_parent, avl_height: pool lock
+ *		unused: unused node list lock
+ *		refcnt: atomically against modifications on other CPU;
+ *		   usually under some other lock to prevent node disappearing
+ *		dtime: unused node list lock
+ *		daddr: unchangeable
+ *		ip_id_count: atomic value (no lock needed)
+ */
+
+static struct kmem_cache *peer_cachep __read_mostly;
+
+#define node_height(x) ((x)->avl_height)	/* x may be any pointer expression */
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) /* "no node" marker */
+#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+	.avl_left	= peer_avl_empty_rcu,	/* the sentinel points at itself */
+	.avl_right	= peer_avl_empty_rcu,
+	.avl_height	= 0	/* real nodes start at height 1 (link_to_pool) */
+};
+
+struct inet_peer_base {
+	struct inet_peer __rcu *root;	/* top of the AVL tree of peers */
+	seqlock_t	lock;		/* the "pool lock" of this tree */
+	int		total;		/* nodes linked; changed under lock */
+};
+
+static struct inet_peer_base v4_peers = {	/* chosen for AF_INET */
+	.root		= peer_avl_empty_rcu,	/* tree starts out empty */
+	.lock		= __SEQLOCK_UNLOCKED(v4_peers.lock),
+	.total		= 0,
+};
+
+static struct inet_peer_base v6_peers = {	/* chosen for any other family */
+	.root		= peer_avl_empty_rcu,	/* tree starts out empty */
+	.lock		= __SEQLOCK_UNLOCKED(v6_peers.lock),
+	.total		= 0,
+};
+
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+/* Exported for sysctl_net_ipv4.  */
+int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
+					 * aggressively at this stage */
+int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
+int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
+int inet_peer_gc_mintime __read_mostly = 10 * HZ;	/* GC timer delay: lower end */
+int inet_peer_gc_maxtime __read_mostly = 120 * HZ;	/* GC timer delay: upper end */
+
+static struct {
+	struct list_head	list;	/* linked through inet_peer.unused */
+	spinlock_t		lock;	/* taken with _bh around list changes */
+} unused_peers = {
+	.list			= LIST_HEAD_INIT(unused_peers.list),
+	.lock			= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
+};
+
+static void peer_check_expire(unsigned long dummy);	/* timer handler, below */
+static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
+
+
+/* Called from ip_output.c:ip_init  */
+void __init inet_initpeers(void)
+{
+	struct sysinfo si;
+
+	/* Read the memory size: the threshold is scaled down on small systems. */
+	si_meminfo(&si);
+	/* The values below were suggested by Alexey Kuznetsov
+	 * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
+	 * myself.  --SAW
+	 */
+	if (si.totalram <= (32768*1024)/PAGE_SIZE)
+		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+	if (si.totalram <= (16384*1024)/PAGE_SIZE)	/* shifts accumulate */
+		inet_peer_threshold >>= 1; /* about 512KB */
+	if (si.totalram <= (8192*1024)/PAGE_SIZE)
+		inet_peer_threshold >>= 2; /* about 128KB */
+
+	peer_cachep = kmem_cache_create("inet_peer_cache",
+			sizeof(struct inet_peer),
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+			NULL);
+
+	/* All the timers started at system startup tend to synchronize.
+	 * Perturb this one: first expiry lands 1x..2x gc_maxtime from now.
+	 */
+	peer_periodic_timer.expires = jiffies
+		+ net_random() % inet_peer_gc_maxtime
+		+ inet_peer_gc_maxtime;
+	add_timer(&peer_periodic_timer);
+}
+
+/* Called with or without local BH being disabled. */
+static void unlink_from_unused(struct inet_peer *p)
+{
+	spin_lock_bh(&unused_peers.lock);
+	list_del_init(&p->unused);	/* _init form: p->unused stays a valid list */
+	spin_unlock_bh(&unused_peers.lock);
+}
+
+static int addr_compare(const struct inetpeer_addr *a,
+			const struct inetpeer_addr *b)
+{
+	/* One a6[] element is compared for AF_INET, four otherwise. */
+	const int words = (a->family == AF_INET) ? 1 : 4;
+	int w;
+
+	/* The first differing element decides: -1 or 1; 0 if none differs. */
+	for (w = 0; w < words; w++) {
+		if (a->addr.a6[w] != b->addr.a6[w])
+			return (a->addr.a6[w] < b->addr.a6[w]) ? -1 : 1;
+	}
+
+	return 0;
+}
+
+#define rcu_deref_locked(X, BASE)	/* X read under BASE's pool lock */ \
+	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
+/* Called with local BH disabled and the pool lock held.  Needs a 'stackptr'
+ * variable in the caller's scope: every link followed is pushed through it
+ * into _stack.  Evaluates to the matching node, or peer_avl_empty if none. */
+#define lookup(_daddr, _stack, _base)				\
+({								\
+	struct inet_peer *u;					\
+	struct inet_peer __rcu **v;				\
+								\
+	stackptr = _stack;					\
+	*stackptr++ = &_base->root;				\
+	for (u = rcu_deref_locked(_base->root, _base);		\
+	     u != peer_avl_empty; ) {				\
+		int cmp = addr_compare(_daddr, &u->daddr);	\
+		if (cmp == 0)					\
+			break;					\
+		if (cmp == -1)					\
+			v = &u->avl_left;			\
+		else						\
+			v = &u->avl_right;			\
+		*stackptr++ = v;				\
+		u = rcu_deref_locked(*v, _base);		\
+	}							\
+	u;							\
+})
+
+static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
+{
+	int seen, expect;
+
+	/* Retry the cmpxchg until it lands or the counter is seen to be u. */
+	for (expect = atomic_read(ptr); expect != u; expect = seen) {
+		*newv = expect + a;
+		seen = atomic_cmpxchg(ptr, expect, *newv);
+		if (seen == expect)
+			return true;	/* *newv holds the stored value */
+	}
+	return false;
+}
+
+/*
+ * Called with rcu_read_lock()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+				    struct inet_peer_base *base,
+				    int *newrefcnt)
+{
+	struct inet_peer *node = rcu_dereference(base->root);
+	int depth;
+
+	/* Follow at most PEER_MAXDEPTH links, then give up. */
+	for (depth = 0; node != peer_avl_empty; ) {
+		int cmp = addr_compare(daddr, &node->daddr);
+
+		if (cmp == 0) {
+			/* refcnt == -1 marks an entry unlink_from_pool() has
+			 * deleted (0 is merely unused): no reference may be
+			 * taken on it, so report "not found" instead.
+			 */
+			if (atomic_add_unless_return(&node->refcnt, 1, -1,
+						     newrefcnt))
+				return node;
+			return NULL;
+		}
+		node = (cmp == -1) ? rcu_dereference(node->avl_left)
+				   : rcu_dereference(node->avl_right);
+		if (unlikely(++depth == PEER_MAXDEPTH))
+			break;
+	}
+	return NULL;
+}
+
+/* BH disabled, pool lock held.  Yields the rightmost node left of start. */
+#define lookup_rightempty(start, base)				\
+({								\
+	struct inet_peer *u;					\
+	struct inet_peer __rcu **v;				\
+	*stackptr++ = &start->avl_left;				\
+	v = &start->avl_left;					\
+	for (u = rcu_deref_locked(*v, base);			\
+	     u->avl_right != peer_avl_empty_rcu; ) {		\
+		v = &u->avl_right;				\
+		*stackptr++ = v;				\
+		u = rcu_deref_locked(*v, base);			\
+	}							\
+	u;							\
+})
+
+/* Called with local BH disabled and the pool lock held.  Walks the link
+ * stack from stackend down to stack, rebalancing and refreshing avl_height.
+ * Variable names are the proof of operation correctness.
+ * Look into mm/map_avl.c for a more detailed description of the ideas. */
+static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
+			       struct inet_peer __rcu ***stackend,
+			       struct inet_peer_base *base)
+{
+	struct inet_peer __rcu **nodep;
+	struct inet_peer *node, *l, *r;
+	int lh, rh;
+
+	while (stackend > stack) {
+		nodep = *--stackend;
+		node = rcu_deref_locked(*nodep, base);
+		l = rcu_deref_locked(node->avl_left, base);
+		r = rcu_deref_locked(node->avl_right, base);
+		lh = node_height(l);
+		rh = node_height(r);
+		if (lh > rh + 1) { /* l: RH+2 */
+			struct inet_peer *ll, *lr, *lrl, *lrr;
+			int lrh;
+			ll = rcu_deref_locked(l->avl_left, base);
+			lr = rcu_deref_locked(l->avl_right, base);
+			lrh = node_height(lr);
+			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
+				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
+				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
+				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+				RCU_INIT_POINTER(l->avl_left, ll);       /* ll: RH+1 */
+				RCU_INIT_POINTER(l->avl_right, node);	/* node: RH+1 or RH+2 */
+				l->avl_height = node->avl_height + 1;
+				RCU_INIT_POINTER(*nodep, l);
+			} else { /* ll: RH, lr: RH+1 */
+				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
+				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
+				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
+				node->avl_height = rh + 1; /* node: RH+1 */
+				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH */
+				RCU_INIT_POINTER(l->avl_right, lrl);	/* lrl: RH or RH-1 */
+				l->avl_height = rh + 1;	/* l: RH+1 */
+				RCU_INIT_POINTER(lr->avl_left, l);	/* l: RH+1 */
+				RCU_INIT_POINTER(lr->avl_right, node);	/* node: RH+1 */
+				lr->avl_height = rh + 2;
+				RCU_INIT_POINTER(*nodep, lr);
+			}
+		} else if (rh > lh + 1) { /* r: LH+2 */
+			struct inet_peer *rr, *rl, *rlr, *rll;
+			int rlh;
+			rr = rcu_deref_locked(r->avl_right, base);
+			rl = rcu_deref_locked(r->avl_left, base);
+			rlh = node_height(rl);
+			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
+				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
+				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
+				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH+1 */
+				RCU_INIT_POINTER(r->avl_left, node);	/* node: LH+1 or LH+2 */
+				r->avl_height = node->avl_height + 1;
+				RCU_INIT_POINTER(*nodep, r);
+			} else { /* rr: LH, rl: LH+1 */
+				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
+				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
+				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
+				node->avl_height = lh + 1; /* node: LH+1 */
+				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH */
+				RCU_INIT_POINTER(r->avl_left, rlr);	/* rlr: LH or LH-1 */
+				r->avl_height = lh + 1;	/* r: LH+1 */
+				RCU_INIT_POINTER(rl->avl_right, r);	/* r: LH+1 */
+				RCU_INIT_POINTER(rl->avl_left, node);	/* node: LH+1 */
+				rl->avl_height = lh + 2;
+				RCU_INIT_POINTER(*nodep, rl);
+			}
+		} else {
+			node->avl_height = (lh > rh ? lh : rh) + 1;
+		}
+	}
+}
+
+/* BH disabled, pool lock held.  Uses the caller's 'stack' and 'stackptr'. */
+#define link_to_pool(n, base)					\
+do {								\
+	n->avl_height = 1;					\
+	n->avl_left = peer_avl_empty_rcu;			\
+	n->avl_right = peer_avl_empty_rcu;			\
+	/* lockless readers can catch us now */			\
+	rcu_assign_pointer(**--stackptr, n);			\
+	peer_avl_rebalance(stack, stackptr, base);		\
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head) /* call_rcu() callback */
+{
+	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
+
+/* May be called with local BH enabled. */
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+{
+	int do_free;
+
+	do_free = 0;
+
+	write_seqlock_bh(&base->lock);
+	/* Check the reference counter.  It was artificially incremented by 1
+	 * in cleanup_once() to prevent sudden disappearing.  If we can
+	 * atomically (because of lockless readers) take this last reference,
+	 * it's safe to remove the node and free it later.
+	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
+	 */
+	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
+		struct inet_peer __rcu ***stackptr, ***delp;
+		if (lookup(&p->daddr, stack, base) != p)
+			BUG();
+		delp = stackptr - 1; /* *delp[0] == p */
+		if (p->avl_left == peer_avl_empty_rcu) {
+			*delp[0] = p->avl_right;	/* right subtree replaces p */
+			--stackptr;
+		} else {
+			/* look for a node to insert instead of p */
+			struct inet_peer *t;
+			t = lookup_rightempty(p, base);
+			BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+			**--stackptr = t->avl_left;
+			/* t is removed, t->daddr > x->daddr for any
+			 * x in p->avl_left subtree.
+			 * Put t in the old place of p. */
+			RCU_INIT_POINTER(*delp[0], t);
+			t->avl_left = p->avl_left;
+			t->avl_right = p->avl_right;
+			t->avl_height = p->avl_height;
+			BUG_ON(delp[1] != &p->avl_left);
+			delp[1] = &t->avl_left; /* was &p->avl_left */
+		}
+		peer_avl_rebalance(stack, stackptr, base);
+		base->total--;
+		do_free = 1;
+	}
+	write_sequnlock_bh(&base->lock);
+
+	if (do_free)
+		call_rcu(&p->rcu, inetpeer_free_rcu);	/* freed after a grace period */
+	else
+		/* The node is used again.  Decrease the reference counter
+		 * back.  The loop "cleanup -> unlink_from_unused
+		 *   -> unlink_from_pool -> putpeer -> link_to_unused
+		 *   -> cleanup (for the same node)"
+		 * doesn't really exist because the entry will have a
+		 * recent deletion time and will not be cleaned again soon.
+		 */
+		inet_putpeer(p);
+}
+
+static struct inet_peer_base *family_to_base(int family)
+{
+	return (family != AF_INET) ? &v6_peers : &v4_peers; /* non-v4 => v6 */
+}
+
+static struct inet_peer_base *peer_to_base(struct inet_peer *p)
+{
+	return family_to_base(p->daddr.family);	/* keyed on the peer's family */
+}
+
+/* May be called with local BH enabled. */
+static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+{
+	struct inet_peer *victim;
+	__u32 age;
+
+	spin_lock_bh(&unused_peers.lock);
+
+	/* An empty unused list means that the number of USED entries has
+	 * grown over inet_peer_threshold.  It shouldn't really happen
+	 * because of entry limits in route cache.
+	 */
+	if (list_empty(&unused_peers.list))
+		goto nothing;
+
+	/* Only the head of the unused list is considered. */
+	victim = list_first_entry(&unused_peers.list, struct inet_peer, unused);
+	age = (__u32)jiffies - victim->dtime;
+	if (age < ttl)
+		goto nothing;	/* fresh entries are not pruned */
+
+	list_del_init(&victim->unused);
+
+	/* The extra reference keeps the node from disappearing before
+	 * unlink_from_pool() gets to it.
+	 */
+	atomic_inc(&victim->refcnt);
+	spin_unlock_bh(&unused_peers.lock);
+
+	unlink_from_pool(victim, peer_to_base(victim), stack);
+	return 0;
+
+nothing:
+	spin_unlock_bh(&unused_peers.lock);
+	return -1;
+}
+
+/* Called with or without local BH being disabled. */
+struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
+{
+	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
+	struct inet_peer_base *base = family_to_base(daddr->family);
+	struct inet_peer *p;
+	unsigned int sequence;
+	int invalidated, newrefcnt = 0;
+
+	/* Look up for the address quickly, lockless.
+	 * Because of a concurrent writer, we might not find an existing entry.
+	 */
+	rcu_read_lock();
+	sequence = read_seqbegin(&base->lock);
+	p = lookup_rcu(daddr, base, &newrefcnt);
+	invalidated = read_seqretry(&base->lock, sequence);
+	rcu_read_unlock();
+
+	if (p) {
+found:		/* The existing node has been found.
+		 * Remove the entry from unused list if it was there.
+		 */
+		if (newrefcnt == 1)	/* we took the count from 0 to 1 */
+			unlink_from_unused(p);
+		return p;
+	}
+
+	/* If no writer did a change during our lookup, we can return early. */
+	if (!create && !invalidated)
+		return NULL;
+
+	/* retry an exact lookup, taking the lock before.
+	 * At least, nodes should be hot in our cache.
+	 */
+	write_seqlock_bh(&base->lock);
+	p = lookup(daddr, stack, base);	/* also sets stackptr */
+	if (p != peer_avl_empty) {
+		newrefcnt = atomic_inc_return(&p->refcnt);
+		write_sequnlock_bh(&base->lock);
+		goto found;
+	}
+	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+	if (p) {
+		p->daddr = *daddr;
+		atomic_set(&p->refcnt, 1);	/* the reference we return */
+		atomic_set(&p->rid, 0);
+		atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
+		p->tcp_ts_stamp = 0;
+		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+		p->rate_tokens = 0;
+		p->rate_last = 0;
+		p->pmtu_expires = 0;
+		p->pmtu_orig = 0;
+		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
+		INIT_LIST_HEAD(&p->unused);
+
+
+		/* Link the node. */
+		link_to_pool(p, base);
+		base->total++;
+	}
+	write_sequnlock_bh(&base->lock);
+
+	if (base->total >= inet_peer_threshold)
+		/* Remove one less-recently-used entry. */
+		cleanup_once(0, stack);
+
+	return p;	/* NULL if nothing was found and nothing was created */
+}
+
+static int compute_total(void)
+{
+	return v4_peers.total + v6_peers.total;	/* combined 'total' of both bases */
+}
+EXPORT_SYMBOL_GPL(inet_getpeer);	/* exports inet_getpeer() defined above, not compute_total() */
+
+/* Periodic GC: prunes aged unused peers, then re-arms peer_periodic_timer.  Called with local BH disabled. */
+static void peer_check_expire(unsigned long dummy)
+{
+	unsigned long now = jiffies;
+	int ttl, total;
+	struct inet_peer __rcu **stack[PEER_MAXDEPTH];
+
+	total = compute_total();
+	if (total >= inet_peer_threshold)
+		ttl = inet_peer_minttl;
+	else
+		ttl = inet_peer_maxttl	/* shrinks from maxttl towards minttl as total nears the threshold */
+				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
+					total / inet_peer_threshold * HZ;
+	while (!cleanup_once(ttl, stack)) {
+		if (jiffies != now)	/* stop as soon as jiffies has advanced */
+			break;
+	}
+
+	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
+	 * interval depending on the total number of entries (more entries,
+	 * less interval). */
+	total = compute_total();
+	if (total >= inet_peer_threshold)
+		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
+	else
+		peer_periodic_timer.expires = jiffies
+			+ inet_peer_gc_maxtime
+			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+				total / inet_peer_threshold * HZ;
+	add_timer(&peer_periodic_timer);
+}
+
+void inet_putpeer(struct inet_peer *p)
+{
+	local_bh_disable();
+
+	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {	/* last reference dropped; lock is now held */
+		list_add_tail(&p->unused, &unused_peers.list);
+		p->dtime = (__u32)jiffies;	/* timestamp later compared against ttl in cleanup_once() */
+		spin_unlock(&unused_peers.lock);
+	}
+
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ *	Check transmit rate limitation for given message.
+ *	The rate information is held in the inet_peer entries now.
+ *	This function is generic and could be used for other purposes
+ *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ *	Note that the same inet_peer fields are modified by functions in
+ *	route.c too, but these work for packet destinations while xrlim_allow
+ *	works for icmp destinations. This means the rate limiting information
+ *	for one "ip object" is shared - and these ICMPs are twice limited:
+ *	by source and by destination.
+ *
+ *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ *			  SHOULD allow setting of rate limits
+ *
+ * 	Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+	unsigned long budget, stamp;
+	bool allowed;
+
+	if (!peer)
+		return true;
+
+	budget = peer->rate_tokens;
+	stamp = jiffies;
+	budget += stamp - peer->rate_last;	/* one token per elapsed jiffy */
+	peer->rate_last = stamp;
+	/* Clamp the bucket so a burst never exceeds XRLIM_BURST_FACTOR messages. */
+	if (budget > XRLIM_BURST_FACTOR * timeout)
+		budget = XRLIM_BURST_FACTOR * timeout;
+	allowed = budget >= timeout;	/* each message costs 'timeout' tokens */
+	if (allowed)
+		budget -= timeout;
+	peer->rate_tokens = budget;
+	return allowed;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 00000000..29a07b6c
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,132 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP forwarding functionality.
+ *
+ * Authors:	see ip.c
+ *
+ * Fixes:
+ *		Many		:	Split from ip.c , see ip_input.c for
+ *					history.
+ *		Dave Gregorich	:	NULL ip_rt_put fix for multicast
+ *					routing.
+ *		Jos Vos		:	Add call_out_firewall before sending,
+ *					use output device for accounting.
+ *		Jos Vos		:	Call forward firewall after routing
+ *					(always use output device).
+ *		Mike McLagan	:	Routing by source
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static int ip_forward_finish(struct sk_buff *skb)
+{
+	struct ip_options * opt	= &(IPCB(skb)->opt);
+
+	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+	if (unlikely(opt->optlen))	/* only packets carrying IP options need ip_forward_options() */
+		ip_forward_options(skb);
+
+	return dst_output(skb);	/* result of the output path is returned unchanged */
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+	struct iphdr *iph;	/* Our header */
+	struct rtable *rt;	/* Route we use */
+	struct ip_options * opt	= &(IPCB(skb)->opt);
+
+	if (skb_warn_if_lro(skb))
+		goto drop;
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+		goto drop;
+
+	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))	/* presumably the RA chain took the skb - it is not freed here */
+		return NET_RX_SUCCESS;
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
+	skb_forward_csum(skb);
+
+	/*
+	 *	According to the RFC, we must first decrease the TTL field. If
+	 *	that reaches zero, we must reply an ICMP control message telling
+	 *	that the packet's lifetime expired.
+	 */
+	if (ip_hdr(skb)->ttl <= 1)	/* the actual decrement happens later, after skb_cow() */
+		goto too_many_hops;
+
+	if (!xfrm4_route_forward(skb))
+		goto drop;
+
+	rt = skb_rtable(skb);
+
+	if (opt->is_strictroute && opt->nexthop != rt->rt_gateway)
+		goto sr_failed;
+
+	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
+		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
+		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);	/* too big, DF set, local_df clear (tested outside unlikely()) */
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(dst_mtu(&rt->dst)));
+		goto drop;
+	}
+
+	/* We are about to mangle packet. Copy it! */
+	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
+		goto drop;
+	iph = ip_hdr(skb);	/* fetched only after skb_cow(), which may have copied the data */
+
+	/* Decrease ttl after skb cow done */
+	ip_decrease_ttl(iph);
+
+	/*
+	 *	We now generate an ICMP HOST REDIRECT giving the route
+	 *	we calculated.
+	 */
+	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
+		ip_rt_send_redirect(skb);
+
+	skb->priority = rt_tos2priority(iph->tos);
+
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
+		       rt->dst.dev, ip_forward_finish);
+
+sr_failed:
+	/*
+	 *	Strict routing permits no gatewaying
+	 */
+	 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+	 goto drop;
+
+too_many_hops:
+	/* Tell the sender its packet died... */
+	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
+	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+	kfree_skb(skb);	/* every error label falls through to here */
+	return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 00000000..0ad6035f
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,835 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP fragmentation functionality.
+ *
+ * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox <alan@lxorguk.ukuu.org.uk>
+ *
+ * Fixes:
+ *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
+ *		David S. Miller :	Begin massive cleanup...
+ *		Andi Kleen	:	Add sysctls.
+ *		xxxx		:	Overlapfrag bug.
+ *		Ultima          :       ip_expire() kernel panic.
+ *		Bill Hawes	:	Frag accounting and evictor fixes.
+ *		John McDonald	:	0 length frag bug.
+ *		Alexey Kuznetsov:	SMP races, threading, cleanup.
+ *		Patrick McHardy :	LRU queue of frag heads for evictor.
+ */
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/inet_frag.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/inet_ecn.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
+static int sysctl_ipfrag_max_dist __read_mostly = 64;	/* 0 disables the per-peer distance check */
+
+struct ipfrag_skb_cb
+{
+	struct inet_skb_parm	h;
+	int			offset;	/* byte offset of this fragment's data within the datagram */
+};
+
+#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))	/* view skb->cb as ipfrag_skb_cb */
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+	struct inet_frag_queue q;
+
+	u32		user;	/* 'user' value given to ip_defrag(); part of the match key */
+	__be32		saddr;
+	__be32		daddr;
+	__be16		id;
+	u8		protocol;
+	u8		ecn; /* RFC3168 support: OR of ip4_frag_ecn() over queued fragments */
+	int             iif;	/* ifindex of the device the latest fragment arrived on */
+	unsigned int    rid;	/* peer->rid value seen at the latest fragment */
+	struct inet_peer *peer;	/* sender's inet_peer; may be NULL */
+};
+
+/* RFC 3168 support :
+ * We want to check ECN values of all fragments, to detect invalid combinations.
+ * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
+ */
+#define	IPFRAG_ECN_NOT_ECT	0x01 /* one frag had ECN_NOT_ECT */
+#define	IPFRAG_ECN_ECT_1	0x02 /* one frag had ECN_ECT_1 */
+#define	IPFRAG_ECN_ECT_0	0x04 /* one frag had ECN_ECT_0 */
+#define	IPFRAG_ECN_CE		0x08 /* one frag had ECN_CE */
+
+static inline u8 ip4_frag_ecn(u8 tos)
+{
+	return 1 << (tos & INET_ECN_MASK);	/* one IPFRAG_ECN_* bit per codepoint - assumes INET_ECN_MASK is the 2-bit ECN field */
+}
+
+/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
+ * Value : 0xff if frame should be dropped.
+ *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field (indexes not listed below are 0)
+ */
+static const u8 ip4_frag_ecn_table[16] = {
+	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]			= INET_ECN_CE,
+	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]	= INET_ECN_CE,
+
+	/* invalid combinations : drop frame */
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
+	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
+};
+
+static struct inet_frags ip4_frags;
+
+int ip_frag_nqueues(struct net *net)
+{
+	return net->ipv4.frags.nqueues;	/* plain read of this namespace's queue counter */
+}
+
+int ip_frag_mem(struct net *net)
+{
+	return atomic_read(&net->ipv4.frags.mem);	/* sum of skb->truesize currently charged to this namespace */
+}
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+			 struct net_device *dev);
+
+struct ip4_create_arg {	/* lookup key handed to ip4_frag_match() and ip4_frag_init() */
+	struct iphdr *iph;
+	u32 user;
+};
+
+static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
+{
+	return jhash_3words((__force u32)id << 16 | prot,	/* id and protocol share the first word */
+			    (__force u32)saddr, (__force u32)daddr,
+			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);	/* mask down to a bucket index */
+}
+
+static unsigned int ip4_hashfn(struct inet_frag_queue *q)
+{
+	/* Recover the enclosing ipq and hash its (id, saddr, daddr, protocol). */
+	const struct ipq *qp = container_of(q, struct ipq, q);
+
+	return ipqhashfn(qp->id, qp->saddr, qp->daddr, qp->protocol);
+}
+
+static int ip4_frag_match(struct inet_frag_queue *q, void *a)
+{
+	const struct ip4_create_arg *key = a;
+	const struct iphdr *hdr = key->iph;
+	const struct ipq *qp = container_of(q, struct ipq, q);
+
+	/* A queue matches only when every field of the reassembly key is equal. */
+	return qp->id == hdr->id && qp->saddr == hdr->saddr &&
+	       qp->daddr == hdr->daddr &&
+	       qp->protocol == hdr->protocol &&
+	       qp->user == key->user;
+}
+
+/* Memory Tracking Functions. */
+static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
+{
+	atomic_sub(skb->truesize, &nf->mem);	/* uncharge the namespace before freeing */
+	kfree_skb(skb);
+}
+
+static void ip4_frag_init(struct inet_frag_queue *q, void *a)
+{
+	struct ipq *qp = container_of(q, struct ipq, q);
+	struct ip4_create_arg *arg = a;
+
+	qp->protocol = arg->iph->protocol;
+	qp->id = arg->iph->id;
+	qp->ecn = ip4_frag_ecn(arg->iph->tos);
+	qp->saddr = arg->iph->saddr;
+	qp->daddr = arg->iph->daddr;
+	qp->user = arg->user;
+	qp->peer = sysctl_ipfrag_max_dist ?	/* look up the sender's peer only when the distance check is enabled */
+		inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
+}
+
+static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
+{
+	struct inet_peer *peer = container_of(q, struct ipq, q)->peer;
+
+	/* Release the peer reference, if this queue holds one. */
+	if (peer != NULL)
+		inet_putpeer(peer);
+}
+
+
+/* Destruction primitives. */
+
+static __inline__ void ipq_put(struct ipq *ipq)
+{
+	inet_frag_put(&ipq->q, &ip4_frags);	/* thin wrapper: release via the generic frag-queue API */
+}
+
+/* Kill ipq entry. It is not destroyed immediately,
+ * because the caller (and possibly others) still hold references to it.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+	inet_frag_kill(&ipq->q, &ip4_frags);
+}
+
+/* Memory limiting on fragments.  Evictor trashes the oldest
+ * fragment queue until we are back under the threshold.
+ */
+static void ip_evictor(struct net *net)
+{
+	int evicted;
+
+	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
+	if (evicted)
+		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);	/* evictor's return value is added to the failure counter */
+}
+
+/*
+ * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+	struct ipq *qp;
+	struct net *net;
+
+	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);	/* arg carries the inet_frag_queue pointer */
+	net = container_of(qp->q.net, struct net, ipv4.frags);
+
+	spin_lock(&qp->q.lock);
+
+	if (qp->q.last_in & INET_FRAG_COMPLETE)	/* queue already completed: nothing to expire */
+		goto out;
+
+	ipq_kill(qp);
+
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+
+	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {	/* reply only if the offset-0 fragment arrived */
+		struct sk_buff *head = qp->q.fragments;
+		const struct iphdr *iph;
+		int err;
+
+		rcu_read_lock();
+		head->dev = dev_get_by_index_rcu(net, qp->iif);	/* skb->dev was cleared when the fragment was queued */
+		if (!head->dev)
+			goto out_rcu_unlock;
+
+		/* skb dst is stale, drop it, and perform route lookup again */
+		skb_dst_drop(head);
+		iph = ip_hdr(head);
+		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+					   iph->tos, head->dev);
+		if (err)
+			goto out_rcu_unlock;
+
+		/*
+		 * Only an end host needs to send an ICMP
+		 * "Fragment Reassembly Timeout" message, per RFC792.
+		 */
+		if (qp->user == IP_DEFRAG_CONNTRACK_IN &&
+		    skb_rtable(head)->rt_type != RTN_LOCAL)
+			goto out_rcu_unlock;
+
+
+		/* Send an ICMP "Fragment Reassembly Timeout" message. */
+		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+out_rcu_unlock:
+		rcu_read_unlock();
+	}
+out:
+	spin_unlock(&qp->q.lock);
+	ipq_put(qp);	/* presumably drops the reference held for the timer - confirm */
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create new one, if nothing is found.
+ */
+static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+{
+	struct inet_frag_queue *q;
+	struct ip4_create_arg arg;
+	unsigned int hash;
+
+	arg.iph = iph;
+	arg.user = user;
+
+	read_lock(&ip4_frags.lock);	/* NOTE(review): no read_unlock here - presumably inet_frag_find() releases it; confirm */
+	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+
+	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
+	if (q == NULL)
+		goto out_nomem;
+
+	return container_of(q, struct ipq, q);
+
+out_nomem:
+	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
+	return NULL;	/* ip_defrag() treats NULL as -ENOMEM */
+}
+
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+	struct inet_peer *peer = qp->peer;
+	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int start, end;
+
+	int rc;
+
+	if (!peer || !max)	/* check disabled, or no peer attached to this queue */
+		return 0;
+
+	start = qp->rid;
+	end = atomic_inc_return(&peer->rid);	/* each call bumps the peer-wide counter */
+	qp->rid = end;
+
+	rc = qp->q.fragments && (end - start) > max;	/* counter moved more than max since this queue's last fragment */
+
+	if (rc) {
+		struct net *net;
+
+		net = container_of(qp->q.net, struct net, ipv4.frags);
+		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+	}
+
+	return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)	/* drop every queued fragment and restart the queue's timer */
+{
+	struct sk_buff *fp;
+
+	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {	/* 0: timer was not pending - presumably it already fired */
+		atomic_inc(&qp->q.refcnt);
+		return -ETIMEDOUT;
+	}
+
+	fp = qp->q.fragments;
+	do {	/* list is non-empty here: ip_frag_too_far() only fires with fragments queued */
+		struct sk_buff *xp = fp->next;
+		frag_kfree_skb(qp->q.net, fp);
+		fp = xp;
+	} while (fp);
+
+	qp->q.last_in = 0;
+	qp->q.len = 0;
+	qp->q.meat = 0;
+	qp->q.fragments = NULL;
+	qp->q.fragments_tail = NULL;
+	qp->iif = 0;
+	qp->ecn = 0;
+
+	return 0;
+}
+
+/* Add new segment to existing queue.  Called by ip_defrag() with qp->q.lock held. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+	struct sk_buff *prev, *next;
+	struct net_device *dev;
+	int flags, offset;
+	int ihl, end;
+	int err = -ENOENT;
+	u8 ecn;
+
+	if (qp->q.last_in & INET_FRAG_COMPLETE)
+		goto err;
+
+	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	    unlikely(ip_frag_too_far(qp)) &&
+	    unlikely(err = ip_frag_reinit(qp))) {
+		ipq_kill(qp);
+		goto err;
+	}
+
+	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
+	offset = ntohs(ip_hdr(skb)->frag_off);
+	flags = offset & ~IP_OFFSET;
+	offset &= IP_OFFSET;
+	offset <<= 3;		/* offset is in 8-byte chunks */
+	ihl = ip_hdrlen(skb);
+
+	/* Determine the position of this fragment. */
+	end = offset + skb->len - ihl;
+	err = -EINVAL;
+
+	/* Is this the final fragment? */
+	if ((flags & IP_MF) == 0) {
+		/* If we already have some bits beyond end
+		 * or have different end, the segment is corrupted.
+		 */
+		if (end < qp->q.len ||
+		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
+			goto err;
+		qp->q.last_in |= INET_FRAG_LAST_IN;
+		qp->q.len = end;
+	} else {
+		if (end&7) {
+			end &= ~7;	/* non-final fragment: round its end down to an 8-byte boundary */
+			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+				skb->ip_summed = CHECKSUM_NONE;
+		}
+		if (end > qp->q.len) {
+			/* Some bits beyond end -> corruption. */
+			if (qp->q.last_in & INET_FRAG_LAST_IN)
+				goto err;
+			qp->q.len = end;
+		}
+	}
+	if (end == offset)	/* fragment carries no usable data */
+		goto err;
+
+	err = -ENOMEM;
+	if (pskb_pull(skb, ihl) == NULL)
+		goto err;
+
+	err = pskb_trim_rcsum(skb, end - offset);
+	if (err)
+		goto err;
+
+	/* Find out which fragments are in front and at the back of us
+	 * in the chain of fragments so far.  We must know where to put
+	 * this fragment, right?
+	 */
+	prev = qp->q.fragments_tail;
+	if (!prev || FRAG_CB(prev)->offset < offset) {	/* fast path: empty list, or append after the tail */
+		next = NULL;
+		goto found;
+	}
+	prev = NULL;
+	for (next = qp->q.fragments; next != NULL; next = next->next) {
+		if (FRAG_CB(next)->offset >= offset)
+			break;	/* bingo! */
+		prev = next;
+	}
+
+found:
+	/* We found where to put this one.  Check for overlap with
+	 * preceding fragment, and, if needed, align things so that
+	 * any overlaps are eliminated.
+	 */
+	if (prev) {
+		int i = (FRAG_CB(prev)->offset + prev->len) - offset;	/* bytes overlapping the end of prev */
+
+		if (i > 0) {
+			offset += i;
+			err = -EINVAL;
+			if (end <= offset)
+				goto err;
+			err = -ENOMEM;
+			if (!pskb_pull(skb, i))
+				goto err;
+			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+				skb->ip_summed = CHECKSUM_NONE;
+		}
+	}
+
+	err = -ENOMEM;
+
+	while (next && FRAG_CB(next)->offset < end) {
+		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+		if (i < next->len) {
+			/* Eat head of the next overlapped fragment
+			 * and leave the loop. The next ones cannot overlap.
+			 */
+			if (!pskb_pull(next, i))
+				goto err;
+			FRAG_CB(next)->offset += i;
+			qp->q.meat -= i;
+			if (next->ip_summed != CHECKSUM_UNNECESSARY)
+				next->ip_summed = CHECKSUM_NONE;
+			break;
+		} else {
+			struct sk_buff *free_it = next;
+
+			/* Old fragment is completely overridden by the
+			 * new one; drop it.
+			 */
+			next = next->next;
+
+			if (prev)
+				prev->next = next;
+			else
+				qp->q.fragments = next;
+
+			qp->q.meat -= free_it->len;
+			frag_kfree_skb(qp->q.net, free_it);
+		}
+	}
+
+	FRAG_CB(skb)->offset = offset;
+
+	/* Insert this fragment in the chain of fragments. */
+	skb->next = next;
+	if (!next)
+		qp->q.fragments_tail = skb;
+	if (prev)
+		prev->next = skb;
+	else
+		qp->q.fragments = skb;
+
+	dev = skb->dev;
+	if (dev) {
+		qp->iif = dev->ifindex;
+		skb->dev = NULL;	/* only the ifindex is remembered, in qp->iif */
+	}
+	qp->q.stamp = skb->tstamp;
+	qp->q.meat += skb->len;
+	qp->ecn |= ecn;
+	atomic_add(skb->truesize, &qp->q.net->mem);
+	if (offset == 0)
+		qp->q.last_in |= INET_FRAG_FIRST_IN;
+
+	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    qp->q.meat == qp->q.len)	/* first and last seen and no holes left */
+		return ip_frag_reasm(qp, prev, dev);
+
+	write_lock(&ip4_frags.lock);
+	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);	/* move this queue to the tail of the LRU list */
+	write_unlock(&ip4_frags.lock);
+	return -EINPROGRESS;
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
+
+/* Build a new IP datagram from all its fragments. */
+
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+			 struct net_device *dev)
+{
+	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+	struct iphdr *iph;
+	struct sk_buff *fp, *head = qp->q.fragments;
+	int len;
+	int ihlen;
+	int err;
+	u8 ecn;
+
+	ipq_kill(qp);	/* the queue is finished whether reassembly succeeds or fails */
+
+	ecn = ip4_frag_ecn_table[qp->ecn];
+	if (unlikely(ecn == 0xff)) {	/* invalid mix of ECN codepoints across fragments */
+		err = -EINVAL;
+		goto out_fail;
+	}
+	/* Make the one we just received the head. */
+	if (prev) {
+		head = prev->next;
+		fp = skb_clone(head, GFP_ATOMIC);
+		if (!fp)
+			goto out_nomem;
+
+		fp->next = head->next;
+		if (!fp->next)
+			qp->q.fragments_tail = fp;
+		prev->next = fp;
+
+		skb_morph(head, qp->q.fragments);
+		head->next = qp->q.fragments->next;
+
+		kfree_skb(qp->q.fragments);
+		qp->q.fragments = head;
+	}
+
+	WARN_ON(head == NULL);
+	WARN_ON(FRAG_CB(head)->offset != 0);
+
+	/* Allocate a new buffer for the datagram. */
+	ihlen = ip_hdrlen(head);
+	len = ihlen + qp->q.len;
+
+	err = -E2BIG;
+	if (len > 65535)	/* len is later stored in iph->tot_len via htons() */
+		goto out_oversize;
+
+	/* Head of list must not be cloned. */
+	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+		goto out_nomem;
+
+	/* If the first fragment is fragmented itself, we split
+	 * it to two chunks: the first with data and paged part
+	 * and the second, holding only fragments. */
+	if (skb_has_frag_list(head)) {
+		struct sk_buff *clone;
+		int i, plen = 0;
+
+		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+			goto out_nomem;
+		clone->next = head->next;
+		head->next = clone;
+		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+		skb_frag_list_init(head);
+		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+			plen += skb_shinfo(head)->frags[i].size;
+		clone->len = clone->data_len = head->data_len - plen;
+		head->data_len -= clone->len;
+		head->len -= clone->len;
+		clone->csum = 0;
+		clone->ip_summed = head->ip_summed;
+		atomic_add(clone->truesize, &qp->q.net->mem);
+	}
+
+	skb_shinfo(head)->frag_list = head->next;	/* remaining fragments become head's frag_list */
+	skb_push(head, head->data - skb_network_header(head));
+
+	for (fp=head->next; fp; fp = fp->next) {
+		head->data_len += fp->len;
+		head->len += fp->len;
+		if (head->ip_summed != fp->ip_summed)
+			head->ip_summed = CHECKSUM_NONE;
+		else if (head->ip_summed == CHECKSUM_COMPLETE)
+			head->csum = csum_add(head->csum, fp->csum);
+		head->truesize += fp->truesize;
+	}
+	atomic_sub(head->truesize, &qp->q.net->mem);	/* head->truesize now sums all fragments: uncharge them together */
+
+	head->next = NULL;
+	head->dev = dev;
+	head->tstamp = qp->q.stamp;
+
+	iph = ip_hdr(head);
+	iph->frag_off = 0;
+	iph->tot_len = htons(len);
+	iph->tos |= ecn;
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+	qp->q.fragments = NULL;
+	qp->q.fragments_tail = NULL;
+	return 0;
+
+out_nomem:
+	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
+			      "queue %p\n", qp);
+	err = -ENOMEM;
+	goto out_fail;
+out_oversize:
+	if (net_ratelimit())
+		printk(KERN_INFO "Oversized IP packet from %pI4.\n",
+			&qp->saddr);
+out_fail:
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+	return err;
+}
+
+/* Process an incoming IP datagram fragment.  Returns 0 when reassembly completed, -EINPROGRESS while fragments are missing, another negative errno on failure. */
+int ip_defrag(struct sk_buff *skb, u32 user)
+{
+	struct ipq *qp;
+	struct net *net;
+
+	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);	/* skb->dev may be NULL: fall back to the dst device */
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+
+	/* Start by cleaning up the memory. */
+	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
+		ip_evictor(net);
+
+	/* Lookup (or create) queue header */
+	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
+		int ret;
+
+		spin_lock(&qp->q.lock);
+
+		ret = ip_frag_queue(qp, skb);
+
+		spin_unlock(&qp->q.lock);
+		ipq_put(qp);	/* balances the reference presumably taken by ip_find() - confirm */
+		return ret;
+	}
+
+	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+	kfree_skb(skb);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(ip_defrag);
+
+#ifdef CONFIG_SYSCTL
+static int zero;	/* lower bound (.extra1) for ipfrag_max_dist */
+
+static struct ctl_table ip4_frags_ns_ctl_table[] = {	/* entry order matters: ip4_frags_ns_ctl_register() patches [0..2] by index */
+	{
+		.procname	= "ipfrag_high_thresh",
+		.data		= &init_net.ipv4.frags.high_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ipfrag_low_thresh",
+		.data		= &init_net.ipv4.frags.low_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ipfrag_time",
+		.data		= &init_net.ipv4.frags.timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+
+static struct ctl_table ip4_frags_ctl_table[] = {	/* knobs backed by file-global variables, not per-namespace fields */
+	{
+		.procname	= "ipfrag_secret_interval",
+		.data		= &ip4_frags.secret_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ipfrag_max_dist",
+		.data		= &sysctl_ipfrag_max_dist,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero	/* minimum only; no .extra2 upper bound is set */
+	},
+	{ }
+};
+
+static int __net_init ip4_frags_ns_ctl_register(struct net *net)
+{
+	struct ctl_table *table;
+	struct ctl_table_header *hdr;
+
+	table = ip4_frags_ns_ctl_table;
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);	/* other namespaces get a private copy */
+		if (table == NULL)
+			goto err_alloc;
+
+		table[0].data = &net->ipv4.frags.high_thresh;	/* re-point the copy at this namespace's fields */
+		table[1].data = &net->ipv4.frags.low_thresh;
+		table[2].data = &net->ipv4.frags.timeout;
+	}
+
+	hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
+	if (hdr == NULL)
+		goto err_reg;
+
+	net->ipv4.frags_hdr = hdr;
+	return 0;
+
+err_reg:
+	if (!net_eq(net, &init_net))	/* only the kmemdup()ed copy may be freed */
+		kfree(table);
+err_alloc:
+	return -ENOMEM;
+}
+
+static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ipv4.frags_hdr->ctl_table_arg;	/* fetch the table pointer before the header is unregistered */
+	unregister_net_sysctl_table(net->ipv4.frags_hdr);
+	kfree(table);	/* NOTE(review): init_net's table is the static array - assumes this never runs for init_net; confirm */
+}
+
+static void ip4_frags_ctl_register(void)
+{
+	register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);	/* return value is not checked */
+}
+#else
+static inline int ip4_frags_ns_ctl_register(struct net *net)
+{
+	return 0;	/* nothing to register when CONFIG_SYSCTL is off */
+}
+
+static inline void ip4_frags_ns_ctl_unregister(struct net *net)
+{
+}
+
+static inline void ip4_frags_ctl_register(void)
+{
+}
+#endif
+
+static int __net_init ipv4_frags_init_net(struct net *net)
+{
+	/*
+	 * Fragment cache limits. We will commit 256K at one time. Should we
+	 * cross that limit we will prune down to 192K. This should cope with
+	 * even the most extreme cases without allowing an attacker to
+	 * measurably harm machine performance.
+	 */
+	net->ipv4.frags.high_thresh = 256 * 1024;
+	net->ipv4.frags.low_thresh = 192 * 1024;
+	/*
+	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
+	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
+	 * by TTL.
+	 */
+	net->ipv4.frags.timeout = IP_FRAG_TIME;
+
+	inet_frags_init_net(&net->ipv4.frags);
+
+	return ip4_frags_ns_ctl_register(net);	/* 0, or -ENOMEM if the sysctl table could not be set up */
+}
+
+static void __net_exit ipv4_frags_exit_net(struct net *net)
+{
+	ip4_frags_ns_ctl_unregister(net);	/* sysctl entries are removed before the fragment state is torn down */
+	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+}
+
+static struct pernet_operations ip4_frags_ops = {	/* per-namespace hooks, registered from ipfrag_init() */
+	.init = ipv4_frags_init_net,
+	.exit = ipv4_frags_exit_net,
+};
+
+void __init ipfrag_init(void)
+{
+	ip4_frags_ctl_register();
+	register_pernet_subsys(&ip4_frags_ops);	/* NOTE(review): runs before ip4_frags is filled in below - confirm the ordering is safe */
+	ip4_frags.hashfn = ip4_hashfn;
+	ip4_frags.constructor = ip4_frag_init;
+	ip4_frags.destructor = ip4_frag_free;
+	ip4_frags.skb_free = NULL;
+	ip4_frags.qsize = sizeof(struct ipq);	/* size of the full ipq, not just the embedded inet_frag_queue */
+	ip4_frags.match = ip4_frag_match;
+	ip4_frags.frag_expire = ip_expire;
+	ip4_frags.secret_interval = 10 * 60 * HZ;
+	inet_frags_init(&ip4_frags);
+}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 00000000..d7bb94c4
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1754 @@
+/*
+ *	Linux NET3:	GRE over IP protocol decoder.
+ *
+ *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+   Problems & solutions
+   --------------------
+
+   1. The most important issue is detecting local dead loops.
+   They would cause complete host lockup in transmit, which
+   would be "resolved" by stack overflow or, if queueing is enabled,
+   with infinite looping in net_bh.
+
+   We cannot track such dead loops during route installation,
+   it is infeasible task. The most general solutions would be
+   to keep skb->encapsulation counter (sort of local ttl),
+   and silently drop packet when it expires. It is a good
+   solution, but it supposes maintaining a new variable in ALL
+   skb, even if no tunneling is used.
+
+   Current solution: xmit_recursion breaks dead loops. This is a percpu
+   counter, since when we enter the first ndo_xmit(), cpu migration is
+   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
+
+   2. Networking dead loops would not kill routers, but would really
+   kill network. IP hop limit plays role of "t->recursion" in this case,
+   if we copy it from packet being encapsulated to upper header.
+   It is very good solution, but it introduces two problems:
+
+   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+     do not work over tunnels.
+   - traceroute does not work. I planned to relay ICMP from tunnel,
+     so that this problem would be solved and traceroute output
+     would be even more informative. This idea appeared to be wrong:
+     only Linux complies to rfc1812 now (yes, guys, Linux is the only
+     true router now :-)), all routers (at least, in neighbourhood of mine)
+     return only 8 bytes of payload. It is the end.
+
+   Hence, if we want that OSPF worked or traceroute said something reasonable,
+   we should search for another solution.
+
+   One of them is to parse packet trying to detect inner encapsulation
+   made by our node. It is difficult or even impossible, especially,
+   taking into account fragmentation. To be short, it is not a solution at all.
+
+   Current solution: The solution was UNEXPECTEDLY SIMPLE.
+   We force DF flag on tunnels with preconfigured hop limit,
+   that is ALL. :-) Well, it does not remove the problem completely,
+   but exponential growth of network traffic is changed to linear
+   (branches, that exceed pmtu are pruned) and tunnel mtu
+   quickly degrades to a value <68, where looping stops.
+   Yes, it is not good if there exists a router in the loop,
+   which does not force DF, even when encapsulating packets have DF set.
+   But it is not our problem! Nobody could accuse us, we made
+   all that we could make. Even if it is your gated who injected
+   fatal route to network, even if it were you who configured
+   fatal static route: you are innocent. :-)
+
+
+
+   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+   practically identical code. It would be good to glue them
+   together, but it is not very evident, how to make them modular.
+   sit is integral part of IPv6, ipip and gre are naturally modular.
+   We could extract common parts (hash table, ioctl etc)
+   to a separate module (ip_tunnel.c).
+
+   Alexey Kuznetsov.
+ */
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static int ipgre_tunnel_init(struct net_device *dev);
+static void ipgre_tunnel_setup(struct net_device *dev);
+static int ipgre_tunnel_bind_dev(struct net_device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+#define HASH_SIZE  16
+
+static int ipgre_net_id __read_mostly;
+struct ipgre_net {
+	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];	/* [table][hash] chains */
+
+	struct net_device *fb_tunnel_dev;	/* fallback when lookup finds none */
+};
+
+/* Tunnel hash table */
+
+/*
+   4 hash tables:
+
+   3: (remote,local)
+   2: (remote,*)
+   1: (*,local)
+   0: (*,*)
+
+   We require exact key match i.e. if a key is present in packet
+   it will match only tunnel with the same key; if it is not present,
+   it will match only keyless tunnel.
+
+   All keyless packets, if not matched by configured keyless tunnels,
+   will match fallback tunnel.
+ */
+
+#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
+
+#define tunnels_r_l	tunnels[3]
+#define tunnels_r	tunnels[2]
+#define tunnels_l	tunnels[1]
+#define tunnels_wc	tunnels[0]
+/*
+ * Locking : hash tables are protected by RCU and RTNL
+ */
+
+#define for_each_ip_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/* Frequently updated counters are per cpu; the rest are shared (netdev->stats). */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{	/* Fold every possible cpu's counters into dev->stats and return it. */
+	unsigned long rx_pkts = 0, rx_octets = 0, tx_pkts = 0, tx_octets = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const struct pcpu_tstats *pc = per_cpu_ptr(dev->tstats, cpu);
+
+		rx_pkts   += pc->rx_packets;
+		rx_octets += pc->rx_bytes;
+		tx_pkts   += pc->tx_packets;
+		tx_octets += pc->tx_bytes;
+	}
+	dev->stats.rx_packets = rx_pkts;
+	dev->stats.rx_bytes   = rx_octets;
+	dev->stats.tx_packets = tx_pkts;
+	dev->stats.tx_bytes   = tx_octets;
+	return &dev->stats;
+}
+
+/* Given src, dst and key, find the appropriate tunnel for input. */
+
+static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
+					      __be32 remote, __be32 local,
+					      __be32 key, __be16 gre_proto)
+{	/* Returns an exact match, else the best candidate, else fallback/NULL. */
+	struct net *net = dev_net(dev);
+	int link = dev->ifindex;	/* compared with each tunnel's parms.link */
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(key);
+	struct ip_tunnel *t, *cand = NULL;	/* cand: best inexact match so far */
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+		       ARPHRD_ETHER : ARPHRD_IPGRE;
+	int score, cand_score = 4;	/* 4 is above the worst score (1|2) */
+
+	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {	/* (remote,local) */
+		if (local != t->parms.iph.saddr ||
+		    remote != t->parms.iph.daddr ||
+		    key != t->parms.i_key ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (t->dev->type != ARPHRD_IPGRE &&
+		    t->dev->type != dev_type)
+			continue;
+
+		score = 0;	/* 0 means link and device type both agree */
+		if (t->parms.link != link)
+			score |= 1;	/* bound to another link */
+		if (t->dev->type != dev_type)
+			score |= 2;	/* device type differs from the packet's */
+		if (score == 0)
+			return t;
+
+		if (score < cand_score) {
+			cand = t;
+			cand_score = score;
+		}
+	}
+
+	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {	/* (remote,*) */
+		if (remote != t->parms.iph.daddr ||
+		    key != t->parms.i_key ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (t->dev->type != ARPHRD_IPGRE &&
+		    t->dev->type != dev_type)
+			continue;
+
+		score = 0;
+		if (t->parms.link != link)
+			score |= 1;
+		if (t->dev->type != dev_type)
+			score |= 2;
+		if (score == 0)
+			return t;
+
+		if (score < cand_score) {
+			cand = t;
+			cand_score = score;
+		}
+	}
+
+	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {	/* (*,local) */
+		if ((local != t->parms.iph.saddr &&
+		     (local != t->parms.iph.daddr ||
+		      !ipv4_is_multicast(local))) ||	/* or multicast local == daddr */
+		    key != t->parms.i_key ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (t->dev->type != ARPHRD_IPGRE &&
+		    t->dev->type != dev_type)
+			continue;
+
+		score = 0;
+		if (t->parms.link != link)
+			score |= 1;
+		if (t->dev->type != dev_type)
+			score |= 2;
+		if (score == 0)
+			return t;
+
+		if (score < cand_score) {
+			cand = t;
+			cand_score = score;
+		}
+	}
+
+	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {	/* (*,*) */
+		if (t->parms.i_key != key ||
+		    !(t->dev->flags & IFF_UP))
+			continue;
+
+		if (t->dev->type != ARPHRD_IPGRE &&
+		    t->dev->type != dev_type)
+			continue;
+
+		score = 0;
+		if (t->parms.link != link)
+			score |= 1;
+		if (t->dev->type != dev_type)
+			score |= 2;
+		if (score == 0)
+			return t;
+
+		if (score < cand_score) {
+			cand = t;
+			cand_score = score;
+		}
+	}
+
+	if (cand != NULL)
+		return cand;
+
+	dev = ign->fb_tunnel_dev;	/* nothing matched: try the fallback device */
+	if (dev->flags & IFF_UP)
+		return netdev_priv(dev);
+
+	return NULL;
+}
+
+static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
+		struct ip_tunnel_parm *parms)
+{	/* Pick the chain for parms: table by address presence, slot by hash. */
+	unsigned int slot = HASH(parms->i_key);
+	int table = 0;
+
+	/* Bit 0: a local address is configured. */
+	if (parms->iph.saddr)
+		table |= 1;
+
+	/* Bit 1: a non-multicast remote is set; only then is it hashed in. */
+	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) {
+		table |= 2;
+		slot ^= HASH(parms->iph.daddr);
+	}
+
+	return &ign->tunnels[table][slot];
+}
+
+static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
+		struct ip_tunnel *t)
+{	/* Chain head for an existing tunnel, derived from its own parms. */
+	return __ipgre_bucket(ign, &t->parms);
+}
+
+static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
+{	/* Insert t at the head of the chain its parms select. */
+	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
+
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));	/* old head */
+	rcu_assign_pointer(*tp, t);	/* t->next is set before t is reachable */
+}
+
+static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
+{	/* Splice t out of its hash chain; a no-op if t is not on the chain. */
+	struct ip_tunnel __rcu **link = ipgre_bucket(ign, t);
+	struct ip_tunnel *cur;
+
+	while ((cur = rtnl_dereference(*link)) != NULL) {
+		if (cur == t) {
+			/* t->next itself is left untouched. */
+			rcu_assign_pointer(*link, t->next);
+			return;
+		}
+		link = &cur->next;
+	}
+}
+
+static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
+					   struct ip_tunnel_parm *parms,
+					   int type)
+{	/* Exact-match search of the one chain that parms hashes to. */
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel __rcu **tp = __ipgre_bucket(ign, parms);
+	struct ip_tunnel *t;
+
+	while ((t = rtnl_dereference(*tp)) != NULL) {
+		const struct ip_tunnel_parm *have = &t->parms;
+
+		/* Addresses, input key, link and device type must all agree. */
+		if (have->iph.saddr == parms->iph.saddr &&
+		    have->iph.daddr == parms->iph.daddr &&
+		    have->i_key == parms->i_key &&
+		    have->link == parms->link &&
+		    t->dev->type == type)
+			return t;
+
+		tp = &t->next;
+	}
+	/* Chain exhausted without a match. */
+	return NULL;
+}
+
+static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
+		struct ip_tunnel_parm *parms, int create)
+{
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel *found, *priv;
+	struct net_device *ndev;
+	char ifname[IFNAMSIZ];
+
+	/* Reuse an existing ARPHRD_IPGRE tunnel with identical parameters. */
+	found = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
+	if (found)
+		return found;
+	if (!create)
+		return NULL;
+
+	/* Without a caller-supplied name the "gre%d" template is used. */
+	if (!parms->name[0])
+		strcpy(ifname, "gre%d");
+	else
+		strlcpy(ifname, parms->name, IFNAMSIZ);
+
+	ndev = alloc_netdev(sizeof(*priv), ifname, ipgre_tunnel_setup);
+	if (!ndev)
+		return NULL;
+
+	dev_net_set(ndev, net);
+	priv = netdev_priv(ndev);
+	priv->parms = *parms;
+	ndev->rtnl_link_ops = &ipgre_link_ops;
+	ndev->mtu = ipgre_tunnel_bind_dev(ndev);
+
+	if (register_netdevice(ndev) < 0) {
+		free_netdev(ndev);
+		return NULL;
+	}
+	/* The reference taken here is dropped by ipgre_tunnel_uninit(). */
+	dev_hold(ndev);
+	ipgre_tunnel_link(ign, priv);
+	return priv;
+}
+
+static void ipgre_tunnel_uninit(struct net_device *dev)
+{	/* ndo_uninit: unhash the tunnel, then drop the device reference. */
+	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	ipgre_tunnel_unlink(ign, tunnel);
+	dev_put(dev);
+}
+
+
+static void ipgre_err(struct sk_buff *skb, u32 info)
+{
+
+/* ICMP error handler: locate the tunnel the quoted GRE packet came from
+   and note the error in its err_count/err_time soft state.
+
+   All the routers (except for Linux) return only 8 bytes of packet
+   payload, so precise relaying of ICMP in the real Internet is
+   absolutely infeasible.
+   Moreover, Cisco "wise men" put GRE key to the third word in GRE header.
+   It makes impossible maintaining even soft state for keyed GRE tunnels
+   with enabled checksum. Tell them "thank you". Well, I wonder, rfc1812
+   was written by Cisco employee, what the hell these idiots break
+   standards established by themselves???
+ */
+
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));	/* GRE header */
+	int grehlen = (iph->ihl<<2) + 4;	/* counted from skb->data */
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct ip_tunnel *t;
+	__be16 flags;
+
+	flags = p[0];
+	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+		if (flags&(GRE_VERSION|GRE_ROUTING))
+			return;
+		if (flags&GRE_KEY) {
+			grehlen += 4;	/* the key word itself */
+			if (flags&GRE_CSUM)
+				grehlen += 4;	/* checksum word precedes the key */
+		}
+	}
+
+	/* If only 8 bytes returned, keyed message will be dropped here */
+	if (skb_headlen(skb) < grehlen)
+		return;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return;
+		case ICMP_FRAG_NEEDED:
+			/* Soft state for pmtu is maintained by IP core. */
+			return;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return;
+		break;
+	}
+
+	rcu_read_lock();
+	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
+				flags & GRE_KEY ?	/* key = last counted word */
+				*(__be32 *)(skb->data + grehlen - 4) : 0,
+				p[1]);
+	if (t == NULL || t->parms.iph.daddr == 0 ||
+	    ipv4_is_multicast(t->parms.iph.daddr))
+		goto out;
+
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+		goto out;
+
+	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+		t->err_count++;	/* still inside the error window */
+	else
+		t->err_count = 1;	/* window expired: start counting afresh */
+	t->err_time = jiffies;
+out:
+	rcu_read_unlock();
+}
+
+static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
+{	/* If iph carries a CE mark, set CE on the skb's current IP header. */
+	if (!INET_ECN_is_ce(iph->tos))
+		return;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		IP_ECN_set_ce(ip_hdr(skb));
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		IP6_ECN_set_ce(ipv6_hdr(skb));
+}
+
+static inline u8
+ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
+{	/* Pass tos and the inner tos/dsfield (0 if not IP) to the ECN helper. */
+	const __be16 proto = skb->protocol;
+	if (proto == htons(ETH_P_IPV6))
+		return INET_ECN_encapsulate(tos,
+			ipv6_get_dsfield((const struct ipv6hdr *)old_iph));
+	return INET_ECN_encapsulate(tos,
+			proto == htons(ETH_P_IP) ? old_iph->tos : 0);
+}
+
+static int ipgre_rcv(struct sk_buff *skb)
+{	/* Receive path: parse the GRE header, find the tunnel, hand skb to it. */
+	const struct iphdr *iph;
+	u8     *h;
+	__be16    flags;
+	__sum16   csum = 0;
+	__be32 key = 0;
+	u32    seqno = 0;
+	struct ip_tunnel *tunnel;
+	int    offset = 4;	/* GRE header length so far: flags + protocol */
+	__be16 gre_proto;
+
+	if (!pskb_may_pull(skb, 16))	/* 16 = 4 base + csum + key + seq words */
+		goto drop_nolock;
+
+	iph = ip_hdr(skb);
+	h = skb->data;
+	flags = *(__be16*)h;
+
+	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
+		/* - Version must be 0.
+		   - We do not support routing headers.
+		 */
+		if (flags&(GRE_VERSION|GRE_ROUTING))
+			goto drop_nolock;
+
+		if (flags&GRE_CSUM) {
+			switch (skb->ip_summed) {
+			case CHECKSUM_COMPLETE:
+				csum = csum_fold(skb->csum);
+				if (!csum)
+					break;
+				/* fall through */
+			case CHECKSUM_NONE:
+				skb->csum = 0;
+				csum = __skb_checksum_complete(skb);
+				skb->ip_summed = CHECKSUM_COMPLETE;
+			}
+			offset += 4;
+		}
+		if (flags&GRE_KEY) {
+			key = *(__be32*)(h + offset);
+			offset += 4;
+		}
+		if (flags&GRE_SEQ) {
+			seqno = ntohl(*(__be32*)(h + offset));
+			offset += 4;
+		}
+	}
+
+	gre_proto = *(__be16 *)(h + 2);
+
+	rcu_read_lock();
+	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
+					  iph->saddr, iph->daddr, key,
+					  gre_proto))) {
+		struct pcpu_tstats *tstats;
+
+		secpath_reset(skb);
+
+		skb->protocol = gre_proto;
+		/* WCCP version 1 and 2 protocol decoding.
+		 * - Change protocol to IP
+		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+		 */
+		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
+			skb->protocol = htons(ETH_P_IP);
+			if ((*(h + offset) & 0xF0) != 0x40)	/* not an IPv4 version nibble */
+				offset += 4;
+		}
+
+		skb->mac_header = skb->network_header;
+		__pskb_pull(skb, offset);	/* strip the GRE header */
+		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
+		skb->pkt_type = PACKET_HOST;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+		if (ipv4_is_multicast(iph->daddr)) {
+			/* Looped back packet, drop it! */
+			if (rt_is_output_route(skb_rtable(skb)))
+				goto drop;
+			tunnel->dev->stats.multicast++;
+			skb->pkt_type = PACKET_BROADCAST;
+		}
+#endif
+
+		if (((flags&GRE_CSUM) && csum) ||	/* checksum present but bad */
+		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {	/* or required, absent */
+			tunnel->dev->stats.rx_crc_errors++;
+			tunnel->dev->stats.rx_errors++;
+			goto drop;
+		}
+		if (tunnel->parms.i_flags&GRE_SEQ) {
+			if (!(flags&GRE_SEQ) ||
+			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {	/* older than expected */
+				tunnel->dev->stats.rx_fifo_errors++;
+				tunnel->dev->stats.rx_errors++;
+				goto drop;
+			}
+			tunnel->i_seqno = seqno + 1;	/* next expected number */
+		}
+
+		/* Warning: All skb pointers will be invalidated! */
+		if (tunnel->dev->type == ARPHRD_ETHER) {
+			if (!pskb_may_pull(skb, ETH_HLEN)) {
+				tunnel->dev->stats.rx_length_errors++;
+				tunnel->dev->stats.rx_errors++;
+				goto drop;
+			}
+
+			iph = ip_hdr(skb);	/* re-read after the pull above */
+			skb->protocol = eth_type_trans(skb, tunnel->dev);
+			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+		}
+
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		skb_reset_network_header(skb);	/* ip_hdr(skb) is the inner header now */
+		ipgre_ecn_decapsulate(iph, skb);	/* iph still points at the outer header */
+
+		netif_rx(skb);
+
+		rcu_read_unlock();
+		return 0;
+	}
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);	/* no tunnel matched */
+
+drop:
+	rcu_read_unlock();
+drop_nolock:
+	kfree_skb(skb);
+	return 0;
+}
+
+static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{	/* ndo_start_xmit: prepend outer IP + GRE and send; skb is always consumed. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct pcpu_tstats *tstats;
+	const struct iphdr  *old_iph = ip_hdr(skb);
+	const struct iphdr  *tiph;		/* Template for the outer header */
+	struct flowi4 fl4;
+	u8     tos, ttl;
+	__be16 df;
+	struct rtable *rt;			/* Route to the other host */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *iph;			/* Our new IP header */
+	unsigned int max_headroom;		/* The extra header space needed */
+	int    gre_hlen;
+	__be32 dst;
+	int    mtu;
+
+	if (dev->type == ARPHRD_ETHER)
+		IPCB(skb)->flags = 0;
+
+	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
+		gre_hlen = 0;	/* header was already pushed via header_ops */
+		tiph = (const struct iphdr *)skb->data;
+	} else {
+		gre_hlen = tunnel->hlen;
+		tiph = &tunnel->parms.iph;
+	}
+
+	if ((dst = tiph->daddr) == 0) {
+		/* NBMA tunnel */
+
+		if (skb_dst(skb) == NULL) {
+			dev->stats.tx_fifo_errors++;
+			goto tx_error;
+		}
+
+		if (skb->protocol == htons(ETH_P_IP)) {
+			rt = skb_rtable(skb);
+			if ((dst = rt->rt_gateway) == 0)
+				goto tx_error_icmp;
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		else if (skb->protocol == htons(ETH_P_IPV6)) {
+			struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
+			const struct in6_addr *addr6;
+			int addr_type;
+
+			if (neigh == NULL)
+				goto tx_error;
+
+			addr6 = (const struct in6_addr *)&neigh->primary_key;
+			addr_type = ipv6_addr_type(addr6);
+
+			if (addr_type == IPV6_ADDR_ANY) {
+				addr6 = &ipv6_hdr(skb)->daddr;
+				addr_type = ipv6_addr_type(addr6);
+			}
+
+			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+				goto tx_error_icmp;
+
+			dst = addr6->s6_addr32[3];
+		}
+#endif
+		else
+			goto tx_error;
+	}
+
+	/* tiph may alias skb->data, freed by the realloc below: copy ttl now. */
+	ttl = tiph->ttl;
+	tos = tiph->tos;
+	if (tos == 1) {	/* tos 1: take the value from the inner header */
+		tos = 0;
+		if (skb->protocol == htons(ETH_P_IP))
+			tos = old_iph->tos;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
+	}
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
+				 tunnel->parms.o_key, RT_TOS(tos),
+				 tunnel->parms.link);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+	tdev = rt->dst.dev;
+
+	if (tdev == dev) {	/* route points back at this tunnel device */
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	df = tiph->frag_off;
+	if (df)
+		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
+	else
+		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		df |= (old_iph->frag_off&htons(IP_DF));
+
+		if ((old_iph->frag_off&htons(IP_DF)) &&
+		    mtu < ntohs(old_iph->tot_len)) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
+			if ((tunnel->parms.iph.daddr &&
+			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+			    rt6->rt6i_dst.plen == 128) {
+				rt6->rt6i_flags |= RTF_MODIFIED;
+				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+			}
+		}
+
+		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#endif
+
+	if (tunnel->err_count > 0) {	/* soft state recorded by ipgre_err() */
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
+
+	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
+	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (max_headroom > dev->needed_headroom)
+			dev->needed_headroom = max_headroom;
+		if (!new_skb) {
+			ip_rt_put(rt);
+			dev->stats.tx_dropped++;
+			dev_kfree_skb(skb);
+			return NETDEV_TX_OK;
+		}
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		dev_kfree_skb(skb);	/* tiph may dangle from here on */
+		skb = new_skb;
+		old_iph = ip_hdr(skb);
+	}
+
+	skb_reset_transport_header(skb);
+	skb_push(skb, gre_hlen);
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Push down and install the outer IP header followed by GRE. */
+
+	iph 			=	ip_hdr(skb);
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr) >> 2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_GRE;
+	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
+	iph->daddr		=	fl4.daddr;
+	iph->saddr		=	fl4.saddr;
+
+	if ((iph->ttl = ttl) == 0) {	/* ttl 0: inherit from the inner packet */
+		if (skb->protocol == htons(ETH_P_IP))
+			iph->ttl = old_iph->ttl;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
+#endif
+		else
+			iph->ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+
+	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
+	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
+				   htons(ETH_P_TEB) : skb->protocol;
+
+	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+		/* Options are filled from the last word backwards. */
+		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
+
+		if (tunnel->parms.o_flags&GRE_SEQ) {
+			++tunnel->o_seqno;
+			*ptr = htonl(tunnel->o_seqno);
+			ptr--;
+		}
+		if (tunnel->parms.o_flags&GRE_KEY) {
+			*ptr = tunnel->parms.o_key;
+			ptr--;
+		}
+		if (tunnel->parms.o_flags&GRE_CSUM) {
+			*ptr = 0;
+			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+		}
+	}
+
+	nf_reset(skb);
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
+	return NETDEV_TX_OK;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+
+tx_error:
+	dev->stats.tx_errors++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int ipgre_tunnel_bind_dev(struct net_device *dev)
+{	/* Sets needed_headroom, iflink and tunnel->hlen; returns the mtu to use. */
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph;
+	int hlen = LL_MAX_HEADER;	/* used when no output device is found */
+	int mtu = ETH_DATA_LEN;	/* likewise the default underlying mtu */
+	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE header */
+
+	tunnel = netdev_priv(dev);
+	iph = &tunnel->parms.iph;
+
+	/* Guess output device to choose reasonable mtu and needed_headroom */
+
+	if (iph->daddr) {
+		struct flowi4 fl4;
+		struct rtable *rt;
+
+		rt = ip_route_output_gre(dev_net(dev), &fl4,
+					 iph->daddr, iph->saddr,
+					 tunnel->parms.o_key,
+					 RT_TOS(iph->tos),
+					 tunnel->parms.link);
+		if (!IS_ERR(rt)) {
+			tdev = rt->dst.dev;
+			ip_rt_put(rt);
+		}
+
+		if (dev->type != ARPHRD_ETHER)
+			dev->flags |= IFF_POINTOPOINT;
+	}
+
+	if (!tdev && tunnel->parms.link)	/* no route: use the configured link */
+		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+	if (tdev) {
+		hlen = tdev->hard_header_len + tdev->needed_headroom;
+		mtu = tdev->mtu;
+	}
+	dev->iflink = tunnel->parms.link;
+
+	/* Precalculate GRE options length */
+	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+		if (tunnel->parms.o_flags&GRE_CSUM)
+			addend += 4;
+		if (tunnel->parms.o_flags&GRE_KEY)
+			addend += 4;
+		if (tunnel->parms.o_flags&GRE_SEQ)
+			addend += 4;
+	}
+	dev->needed_headroom = addend + hlen;
+	mtu -= dev->hard_header_len + addend;
+
+	if (mtu < 68)	/* never report less than 68 */
+		mtu = 68;
+
+	tunnel->hlen = addend;
+
+	return mtu;
+}
+
+static int
+ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{	/* ndo_do_ioctl: SIOC{GET,ADD,CHG,DEL}TUNNEL; returns 0 or -errno. */
+	int err = 0;
+	struct ip_tunnel_parm p;
+	struct ip_tunnel *t;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		t = NULL;
+		if (dev == ign->fb_tunnel_dev) {	/* on the fallback dev, look up by parms */
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = ipgre_tunnel_locate(net, &p, 0);
+		}
+		if (t == NULL)
+			t = netdev_priv(dev);	/* default: the device's own tunnel */
+		memcpy(&p, &t->parms, sizeof(p));
+		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			err = -EFAULT;
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			goto done;
+
+		err = -EINVAL;	/* reject malformed templates and unsupported GRE flags */
+		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+			goto done;
+		if (p.iph.ttl)
+			p.iph.frag_off |= htons(IP_DF);	/* a fixed ttl forces DF */
+
+		if (!(p.i_flags&GRE_KEY))
+			p.i_key = 0;
+		if (!(p.o_flags&GRE_KEY))
+			p.o_key = 0;
+
+		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);	/* creates only on ADD */
+
+		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				if (t->dev != dev) {	/* parms already used by another device */
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				unsigned int nflags = 0;
+
+				t = netdev_priv(dev);
+
+				if (ipv4_is_multicast(p.iph.daddr))
+					nflags = IFF_BROADCAST;
+				else if (p.iph.daddr)
+					nflags = IFF_POINTOPOINT;
+
+				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {	/* kind would change */
+					err = -EINVAL;
+					break;
+				}
+				ipgre_tunnel_unlink(ign, t);	/* rehash under the new parms */
+				synchronize_net();
+				t->parms.iph.saddr = p.iph.saddr;
+				t->parms.iph.daddr = p.iph.daddr;
+				t->parms.i_key = p.i_key;
+				t->parms.o_key = p.o_key;
+				memcpy(dev->dev_addr, &p.iph.saddr, 4);
+				memcpy(dev->broadcast, &p.iph.daddr, 4);
+				ipgre_tunnel_link(ign, t);
+				netdev_state_change(dev);
+			}
+		}
+
+		if (t) {
+			err = 0;
+			if (cmd == SIOCCHGTUNNEL) {
+				t->parms.iph.ttl = p.iph.ttl;
+				t->parms.iph.tos = p.iph.tos;
+				t->parms.iph.frag_off = p.iph.frag_off;
+				if (t->parms.link != p.link) {
+					t->parms.link = p.link;
+					dev->mtu = ipgre_tunnel_bind_dev(dev);	/* NOTE(review): dev, not t->dev -- confirm */
+					netdev_state_change(dev);
+				}
+			}
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+				err = -EFAULT;
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == ign->fb_tunnel_dev) {	/* on the fallback dev, the target comes from parms */
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+				goto done;
+			err = -ENOENT;
+			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
+				goto done;
+			err = -EPERM;
+			if (t == netdev_priv(ign->fb_tunnel_dev))	/* the fallback itself cannot be deleted here */
+				goto done;
+			dev = t->dev;
+		}
+		unregister_netdevice(dev);
+		err = 0;
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+
+static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{	/* Accept new_mtu only within [68, 0xFFF8 - header lengths]. */
+	const struct ip_tunnel *tunnel = netdev_priv(dev);
+	const int ceiling = 0xFFF8 - dev->hard_header_len - tunnel->hlen;
+	if (new_mtu < 68 || new_mtu > ceiling)
+		return -EINVAL;
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/* Nice toy. Unfortunately, useless in real life :-)
+   It allows to construct virtual multiprotocol broadcast "LAN"
+   over the Internet, provided multicast routing is tuned.
+
+
+   I have no idea whether this bicycle was invented before me,
+   so that I had to set ARPHRD_IPGRE to a random value.
+   I have an impression, that Cisco could make something similar,
+   but this feature is apparently missing in IOS<=11.2(8).
+
+   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+   ping -t 255 224.66.66.66
+
+   If nobody answers, mbone does not work.
+
+   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+   ip addr add 10.66.66.<somewhat>/24 dev Universe
+   ifconfig Universe up
+   ifconfig Universe add fe80::<Your_real_addr>/10
+   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+   ftp 10.66.66.66
+   ...
+   ftp fec0:6666:6666::193.233.7.65
+   ...
+
+ */
+
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
+			unsigned short type,
+			const void *daddr, const void *saddr, unsigned int len)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const int hdr_len = tunnel->hlen;
+	struct iphdr *outer = (struct iphdr *)skb_push(skb, hdr_len);
+	__be16 *gre = (__be16 *)(outer + 1);
+
+	/*
+	 * header_ops->create: template IP header, then GRE flags and type.
+	 */
+	memcpy(outer, &tunnel->parms.iph, sizeof(struct iphdr));
+	gre[0] = tunnel->parms.o_flags;
+	gre[1] = htons(type);
+
+	/* Caller-supplied 4-byte addresses override the template's. */
+	if (saddr)
+		memcpy(&outer->saddr, saddr, 4);
+	if (daddr)
+		memcpy(&outer->daddr, daddr, 4);
+
+	/* With no destination filled in, the length is returned negated. */
+	return outer->daddr ? hdr_len : -hdr_len;
+}
+
+static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{	/* Copy the 4-byte IP source address at the mac header into haddr. */
+	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
+	memcpy(haddr, &iph->saddr, 4);
+	return 4;	/* bytes written to haddr */
+}
+
+static const struct header_ops ipgre_header_ops = {
+	.create	= ipgre_header,	/* pushes outer IP + GRE flags/protocol */
+	.parse	= ipgre_header_parse,	/* extracts the outer source address */
+};
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+static int ipgre_open(struct net_device *dev)
+{	/* ndo_open: for a multicast remote, join the group on the output dev. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	const struct iphdr *tiph = &tunnel->parms.iph;
+	struct net_device *out_dev;
+	struct in_device *in_dev;
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	if (!ipv4_is_multicast(tiph->daddr))
+		return 0;
+	rt = ip_route_output_gre(dev_net(dev), &fl4, tiph->daddr, tiph->saddr,
+				 tunnel->parms.o_key, RT_TOS(tiph->tos),
+				 tunnel->parms.link);
+	if (IS_ERR(rt))
+		return -EADDRNOTAVAIL;
+	out_dev = rt->dst.dev;
+	ip_rt_put(rt);
+	in_dev = __in_dev_get_rtnl(out_dev);
+	if (in_dev == NULL)
+		return -EADDRNOTAVAIL;
+	tunnel->mlink = out_dev->ifindex;
+	ip_mc_inc_group(in_dev, tiph->daddr);
+	return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{	/* ndo_stop: leave the group recorded in mlink by ipgre_open(). */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct in_device *in_dev;
+
+	if (!ipv4_is_multicast(tunnel->parms.iph.daddr) || !tunnel->mlink)
+		return 0;
+	in_dev = inetdev_by_index(dev_net(dev), tunnel->mlink);
+	if (in_dev)
+		ip_mc_dec_group(in_dev, tunnel->parms.iph.daddr);
+	return 0;
+}
+
+#endif
+
+static const struct net_device_ops ipgre_netdev_ops = {
+	.ndo_init		= ipgre_tunnel_init,
+	.ndo_uninit		= ipgre_tunnel_uninit,
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+	.ndo_open		= ipgre_open,	/* joins the multicast group */
+	.ndo_stop		= ipgre_close,	/* leaves the multicast group */
+#endif /* CONFIG_NET_IPGRE_BROADCAST */
+	.ndo_start_xmit		= ipgre_tunnel_xmit,
+	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
+	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+	.ndo_get_stats		= ipgre_get_stats,
+};
+
+static void ipgre_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);	/* per-cpu stats set up by the ndo_init hooks */
+	free_netdev(dev);
+}
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{
+	dev->netdev_ops		= &ipgre_netdev_ops;
+	dev->destructor 	= ipgre_dev_free;
+	/* Defaults below budget for an IP header plus 4 further header bytes. */
+	dev->type		= ARPHRD_IPGRE;
+	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
+	dev->flags		= IFF_NOARP;
+	dev->iflink		= 0;
+	dev->addr_len		= 4;	/* device addresses are IPv4 addresses */
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel;
+	struct iphdr *iph;
+
+	tunnel = netdev_priv(dev);
+	iph = &tunnel->parms.iph;
+	/* Link the private data back to its device and mirror the device name. */
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+	/* Local/remote IPv4 addresses double as device and broadcast address. */
+	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+	/* header_ops are installed only for a multicast or an unset remote. */
+	if (iph->daddr) {
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+		if (ipv4_is_multicast(iph->daddr)) {
+			if (!iph->saddr)
+				return -EINVAL;	/* multicast needs a local address */
+			dev->flags = IFF_BROADCAST;
+			dev->header_ops = &ipgre_header_ops;
+		}
+#endif
+	} else
+		dev->header_ops = &ipgre_header_ops;
+	/* Per-cpu counters; ipgre_dev_free() releases them again. */
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void ipgre_fb_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+	/* Template header: IPv4, 5-word header length, protocol GRE. */
+	iph->version		= 4;
+	iph->protocol		= IPPROTO_GRE;
+	iph->ihl		= 5;
+	tunnel->hlen		= sizeof(struct iphdr) + 4;
+	/* NOTE(review): extra device reference; confirm where it is dropped. */
+	dev_hold(dev);
+}
+
+
+static const struct gre_protocol ipgre_protocol = {
+	.handler     = ipgre_rcv,	/* registered for GREPROTO_CISCO in ipgre_init() */
+	.err_handler = ipgre_err,
+};
+
+static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
+{
+	struct ip_tunnel *t;
+	int tbl, slot;
+
+	/*
+	 * Visit every bucket of the four hash tables and pass each tunnel's
+	 * device, together with @head, to unregister_netdevice_queue().
+	 * ipgre_exit_net() finishes the batch afterwards.
+	 */
+	for (tbl = 0; tbl < 4; tbl++) {
+		for (slot = 0; slot < HASH_SIZE; slot++) {
+			for (t = rtnl_dereference(ign->tunnels[tbl][slot]);
+			     t != NULL; t = rtnl_dereference(t->next))
+				unregister_netdevice_queue(t->dev, head);
+		}
+	}
+}
+
+static int __net_init ipgre_init_net(struct net *net)
+{
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct net_device *fb_dev;
+	int err;
+
+	/* Create the fallback device "gre0" for this namespace. */
+	fb_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
+			      ipgre_tunnel_setup);
+	ign->fb_tunnel_dev = fb_dev;
+	if (!fb_dev)
+		return -ENOMEM;
+
+	dev_net_set(fb_dev, net);
+
+	ipgre_fb_tunnel_init(fb_dev);
+	fb_dev->rtnl_link_ops = &ipgre_link_ops;
+
+	err = register_netdev(fb_dev);
+	if (err) {
+		ipgre_dev_free(fb_dev);
+		return err;
+	}
+
+	/* Publish the fallback tunnel in tunnels_wc[0]. */
+	rcu_assign_pointer(ign->tunnels_wc[0], netdev_priv(fb_dev));
+	return 0;
+}
+
+static void __net_exit ipgre_exit_net(struct net *net)
+{
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	LIST_HEAD(doomed);
+
+	/* Collect every tunnel device, then unregister them as one batch. */
+	rtnl_lock();
+	ipgre_destroy_tunnels(ign, &doomed);
+	unregister_netdevice_many(&doomed);
+	rtnl_unlock();
+}
+
+static struct pernet_operations ipgre_net_ops = {
+	.init = ipgre_init_net,		/* creates and registers "gre0" */
+	.exit = ipgre_exit_net,		/* unregisters all tunnel devices */
+	.id   = &ipgre_net_id,
+	.size = sizeof(struct ipgre_net),
+};
+
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	const __be16 unsupported = GRE_VERSION | GRE_ROUTING;
+	__be16 requested = 0;
+
+	/* Nothing to check when no GRE-specific attributes were supplied. */
+	if (!data)
+		return 0;
+
+	if (data[IFLA_GRE_IFLAGS])
+		requested |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (data[IFLA_GRE_OFLAGS])
+		requested |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+
+	/* Reject GRE_VERSION or GRE_ROUTING bits in either flag word. */
+	return (requested & unsupported) ? -EINVAL : 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	const struct nlattr *mac = tb[IFLA_ADDRESS];
+
+	/* An explicit MAC must have Ethernet length and be a valid address. */
+	if (mac) {
+		if (nla_len(mac) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(mac)))
+			return -EADDRNOTAVAIL;
+	}
+
+	/* A remote endpoint, when given, must not be all zeroes. */
+	if (data && data[IFLA_GRE_REMOTE]) {
+		__be32 remote;
+
+		memcpy(&remote, nla_data(data[IFLA_GRE_REMOTE]), 4);
+		if (!remote)
+			return -EINVAL;
+	}
+
+	return ipgre_tunnel_validate(tb, data);
+}
+
+static void ipgre_netlink_parms(struct nlattr *data[],
+				struct ip_tunnel_parm *parms)
+{
+	memset(parms, 0, sizeof(*parms));
+	/* Absent attributes keep the zero from the memset above. */
+	parms->iph.protocol = IPPROTO_GRE;
+
+	if (!data)
+		return;
+	/* Copy each supplied attribute into the matching parameter field. */
+	if (data[IFLA_GRE_LINK])
+		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+	if (data[IFLA_GRE_IFLAGS])
+		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+
+	if (data[IFLA_GRE_OFLAGS])
+		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+
+	if (data[IFLA_GRE_IKEY])
+		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+	if (data[IFLA_GRE_OKEY])
+		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+	if (data[IFLA_GRE_LOCAL])
+		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
+
+	if (data[IFLA_GRE_REMOTE])
+		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
+
+	if (data[IFLA_GRE_TTL])
+		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+	if (data[IFLA_GRE_TOS])
+		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+	/* DF is set by default; only an explicit PMTUDISC of 0 leaves it clear. */
+	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+		parms->iph.frag_off = htons(IP_DF);
+}
+
+static int ipgre_tap_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel;
+
+	tunnel = netdev_priv(dev);
+	/* Link the private data back to its device and mirror the device name. */
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+	/* The value returned by ipgre_tunnel_bind_dev() is not used here. */
+	ipgre_tunnel_bind_dev(dev);
+	/* Per-cpu counters; ipgre_dev_free() releases them again. */
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static const struct net_device_ops ipgre_tap_netdev_ops = {
+	.ndo_init		= ipgre_tap_init,
+	.ndo_uninit		= ipgre_tunnel_uninit,	/* shared with "gre" devices */
+	.ndo_start_xmit		= ipgre_tunnel_xmit,	/* shared with "gre" devices */
+	.ndo_set_mac_address 	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+	.ndo_get_stats		= ipgre_get_stats,
+};
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+	/* Apply ether_setup() first, then install the gretap hooks on top. */
+	ether_setup(dev);
+
+	dev->netdev_ops		= &ipgre_tap_netdev_ops;
+	dev->destructor 	= ipgre_dev_free;
+
+	dev->iflink		= 0;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+}
+
+static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
+			 struct nlattr *data[])
+{
+	struct ip_tunnel *nt;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	int mtu;
+	int err;
+
+	nt = netdev_priv(dev);
+	ipgre_netlink_parms(data, &nt->parms);
+	/* Refuse to create a tunnel whose parameters match an existing one. */
+	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
+		return -EEXIST;
+	/* Ethernet-type devices get a random MAC unless IFLA_ADDRESS was given. */
+	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+		random_ether_addr(dev->dev_addr);
+	/* Use the MTU from ipgre_tunnel_bind_dev() unless IFLA_MTU was given. */
+	mtu = ipgre_tunnel_bind_dev(dev);
+	if (!tb[IFLA_MTU])
+		dev->mtu = mtu;
+
+	/* Can use a lockless transmit, unless we generate output sequences */
+	if (!(nt->parms.o_flags & GRE_SEQ))
+		dev->features |= NETIF_F_LLTX;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto out;
+	/* Only a successfully registered device is held and linked. */
+	dev_hold(dev);
+	ipgre_tunnel_link(ign, nt);
+
+out:
+	return err;
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+			    struct nlattr *data[])
+{
+	struct ip_tunnel *t, *nt;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel_parm p;
+	int mtu;
+	/* The fallback device cannot be changed through this path. */
+	if (dev == ign->fb_tunnel_dev)
+		return -EINVAL;
+
+	nt = netdev_priv(dev);
+	ipgre_netlink_parms(data, &p);
+	/* Is there already a tunnel that uses the requested parameters? */
+	t = ipgre_tunnel_locate(net, &p, 0);
+
+	if (t) {
+		if (t->dev != dev)
+			return -EEXIST;	/* they belong to another device */
+	} else {
+		t = nt;
+		/* The broadcast/point-to-point flags of a device must not change. */
+		if (dev->type != ARPHRD_ETHER) {
+			unsigned int nflags = 0;
+
+			if (ipv4_is_multicast(p.iph.daddr))
+				nflags = IFF_BROADCAST;
+			else if (p.iph.daddr)
+				nflags = IFF_POINTOPOINT;
+
+			if ((dev->flags ^ nflags) &
+			    (IFF_POINTOPOINT | IFF_BROADCAST))
+				return -EINVAL;
+		}
+		/* Unlink, update the endpoints and i_key, then link again. */
+		ipgre_tunnel_unlink(ign, t);
+		t->parms.iph.saddr = p.iph.saddr;
+		t->parms.iph.daddr = p.iph.daddr;
+		t->parms.i_key = p.i_key;
+		if (dev->type != ARPHRD_ETHER) {
+			memcpy(dev->dev_addr, &p.iph.saddr, 4);
+			memcpy(dev->broadcast, &p.iph.daddr, 4);
+		}
+		ipgre_tunnel_link(ign, t);
+		netdev_state_change(dev);
+	}
+	/* These fields are updated in place on both paths. */
+	t->parms.o_key = p.o_key;
+	t->parms.iph.ttl = p.iph.ttl;
+	t->parms.iph.tos = p.iph.tos;
+	t->parms.iph.frag_off = p.iph.frag_off;
+	/* A different link means rebinding and, without IFLA_MTU, a new MTU. */
+	if (t->parms.link != p.link) {
+		t->parms.link = p.link;
+		mtu = ipgre_tunnel_bind_dev(dev);
+		if (!tb[IFLA_MTU])
+			dev->mtu = mtu;
+		netdev_state_change(dev);
+	}
+
+	return 0;
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+	return	/* one term per attribute emitted by ipgre_fill_info() */
+		/* IFLA_GRE_LINK */
+		nla_total_size(4) +
+		/* IFLA_GRE_IFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_OFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_IKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_OKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_LOCAL */
+		nla_total_size(4) +
+		/* IFLA_GRE_REMOTE */
+		nla_total_size(4) +
+		/* IFLA_GRE_TTL */
+		nla_total_size(1) +
+		/* IFLA_GRE_TOS */
+		nla_total_size(1) +
+		/* IFLA_GRE_PMTUDISC */
+		nla_total_size(1) +
+		0;	/* lets every entry above end in '+' */
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm *p = &t->parms;
+	/* NLA_PUT_* presumably jump to nla_put_failure on error - confirm. */
+	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
+	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
+	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
+	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
+	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
+	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
+	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
+	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
+	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
+	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
+	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },	/* length only, no .type */
+	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },	/* length only, no .type */
+	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
+	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
+	.kind		= "gre",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tunnel_setup,
+	.validate	= ipgre_tunnel_validate,
+	.newlink	= ipgre_newlink,	/* same handler as "gretap" */
+	.changelink	= ipgre_changelink,	/* same handler as "gretap" */
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+	.kind		= "gretap",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tap_setup,	/* differs from "gre" */
+	.validate	= ipgre_tap_validate,	/* differs from "gre" */
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+};
+
+/*
+ *	And now the module code and kernel interface.
+ */
+
+static int __init ipgre_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+	/* Order: pernet state, GRE protocol hook, then both link kinds. */
+	err = register_pernet_device(&ipgre_net_ops);
+	if (err < 0)
+		return err;
+
+	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
+	if (err < 0) {
+		printk(KERN_INFO "ipgre init: can't add protocol\n");
+		goto undo_pernet;
+	}
+
+	err = rtnl_link_register(&ipgre_link_ops);
+	if (err < 0)
+		goto undo_protocol;
+
+	err = rtnl_link_register(&ipgre_tap_ops);
+	if (err < 0)
+		goto undo_link_ops;
+
+	return err;
+
+	/* Failure: unwind in the reverse order of registration. */
+undo_link_ops:
+	rtnl_link_unregister(&ipgre_link_ops);
+undo_protocol:
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
+undo_pernet:
+	unregister_pernet_device(&ipgre_net_ops);
+	return err;
+}
+
+static void __exit ipgre_fini(void)
+{
+	rtnl_link_unregister(&ipgre_tap_ops);	/* reverse order of ipgre_init() */
+	rtnl_link_unregister(&ipgre_link_ops);
+	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
+		printk(KERN_INFO "ipgre close: can't remove protocol\n");
+	unregister_pernet_device(&ipgre_net_ops);	/* logged failure above is not fatal */
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("gre");		/* matches ipgre_link_ops.kind */
+MODULE_ALIAS_RTNL_LINK("gretap");	/* matches ipgre_tap_ops.kind */
+MODULE_ALIAS_NETDEV("gre0");		/* name of the fallback device */
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 00000000..c8f48efc
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,452 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The Internet Protocol (IP) module.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Donald Becker, <becker@super.org>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *		Richard Underwood
+ *		Stefan Becker, <stefanb@yello.ping.de>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *
+ *
+ * Fixes:
+ *		Alan Cox	:	Commented a couple of minor bits of surplus code
+ *		Alan Cox	:	Undefining IP_FORWARD doesn't include the code
+ *					(just stops a compiler warning).
+ *		Alan Cox	:	Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ *					are junked rather than corrupting things.
+ *		Alan Cox	:	Frames to bad broadcast subnets are dumped
+ *					We used to process them non broadcast and
+ *					boy could that cause havoc.
+ *		Alan Cox	:	ip_forward sets the free flag on the
+ *					new frame it queues. Still crap because
+ *					it copies the frame but at least it
+ *					doesn't eat memory too.
+ *		Alan Cox	:	Generic queue code and memory fixes.
+ *		Fred Van Kempen :	IP fragment support (borrowed from NET2E)
+ *		Gerhard Koerting:	Forward fragmented frames correctly.
+ *		Gerhard Koerting: 	Fixes to my fix of the above 8-).
+ *		Gerhard Koerting:	IP interface addressing fix.
+ *		Linus Torvalds	:	More robustness checks
+ *		Alan Cox	:	Even more checks: Still not as robust as it ought to be
+ *		Alan Cox	:	Save IP header pointer for later
+ *		Alan Cox	:	ip option setting
+ *		Alan Cox	:	Use ip_tos/ip_ttl settings
+ *		Alan Cox	:	Fragmentation bogosity removed
+ *					(Thanks to Mark.Bush@prg.ox.ac.uk)
+ *		Dmitry Gorodchanin :	Send of a raw packet crash fix.
+ *		Alan Cox	:	Silly ip bug when an overlength
+ *					fragment turns up. Now frees the
+ *					queue.
+ *		Linus Torvalds/ :	Memory leakage on fragmentation
+ *		Alan Cox	:	handling.
+ *		Gerhard Koerting:	Forwarding uses IP priority hints
+ *		Teemu Rantanen	:	Fragment problems.
+ *		Alan Cox	:	General cleanup, comments and reformat
+ *		Alan Cox	:	SNMP statistics
+ *		Alan Cox	:	BSD address rule semantics. Also see
+ *					UDP as there is a nasty checksum issue
+ *					if you do things the wrong way.
+ *		Alan Cox	:	Always defrag, moved IP_FORWARD to the config.in file
+ *		Alan Cox	: 	IP options adjust sk->priority.
+ *		Pedro Roque	:	Fix mtu/length error in ip_forward.
+ *		Alan Cox	:	Avoid ip_chk_addr when possible.
+ *	Richard Underwood	:	IP multicasting.
+ *		Alan Cox	:	Cleaned up multicast handlers.
+ *		Alan Cox	:	RAW sockets demultiplex in the BSD style.
+ *		Gunther Mayer	:	Fix the SNMP reporting typo
+ *		Alan Cox	:	Always in group 224.0.0.1
+ *	Pauline Middelink	:	Fast ip_checksum update when forwarding
+ *					Masquerading support.
+ *		Alan Cox	:	Multicast loopback error for 224.0.0.1
+ *		Alan Cox	:	IP_MULTICAST_LOOP option.
+ *		Alan Cox	:	Use notifiers.
+ *		Bjorn Ekwall	:	Removed ip_csum (from slhc.c too)
+ *		Bjorn Ekwall	:	Moved ip_fast_csum to ip.h (inline!)
+ *		Stefan Becker   :       Send out ICMP HOST REDIRECT
+ *	Arnt Gulbrandsen	:	ip_build_xmit
+ *		Alan Cox	:	Per socket routing cache
+ *		Alan Cox	:	Fixed routing cache, added header cache.
+ *		Alan Cox	:	Loopback didn't work right in original ip_build_xmit - fixed it.
+ *		Alan Cox	:	Only send ICMP_REDIRECT if src/dest are the same net.
+ *		Alan Cox	:	Incoming IP option handling.
+ *		Alan Cox	:	Set saddr on raw output frames as per BSD.
+ *		Alan Cox	:	Stopped broadcast source route explosions.
+ *		Alan Cox	:	Can disable source routing
+ *		Takeshi Sone    :	Masquerading didn't work.
+ *	Dave Bonn,Alan Cox	:	Faster IP forwarding whenever possible.
+ *		Alan Cox	:	Memory leaks, tramples, misc debugging.
+ *		Alan Cox	:	Fixed multicast (by popular demand 8))
+ *		Alan Cox	:	Fixed forwarding (by even more popular demand 8))
+ *		Alan Cox	:	Fixed SNMP statistics [I think]
+ *	Gerhard Koerting	:	IP fragmentation forwarding fix
+ *		Alan Cox	:	Device lock against page fault.
+ *		Alan Cox	:	IP_HDRINCL facility.
+ *	Werner Almesberger	:	Zero fragment bug
+ *		Alan Cox	:	RAW IP frame length bug
+ *		Alan Cox	:	Outgoing firewall on build_xmit
+ *		A.N.Kuznetsov	:	IP_OPTIONS support throughout the kernel
+ *		Alan Cox	:	Multicast routing hooks
+ *		Jos Vos		:	Do accounting *before* call_in_firewall
+ *	Willy Konynenberg	:	Transparent proxying support
+ *
+ *
+ *
+ * To Fix:
+ *		IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ *		and could be made very efficient with the addition of some virtual memory hacks to permit
+ *		the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ *		Output fragmentation wants updating along with the buffer management to use a single
+ *		interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ *		output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ *		fragmentation anyway.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ *	Process the Router Alert IP option (RFC 2113)
+ */
+int ip_call_ra_chain(struct sk_buff *skb)
+{
+	struct ip_ra_chain *ra;
+	u8 protocol = ip_hdr(skb)->protocol;
+	struct sock *last = NULL;
+	struct net_device *dev = skb->dev;
+	/* Every match but the last gets a clone; the last gets the skb itself. */
+	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+		struct sock *sk = ra->sk;
+
+		/* If socket is bound to an interface, only report
+		 * the packet if it came from that interface.
+		 */
+		if (sk && inet_sk(sk)->inet_num == protocol &&
+		    (!sk->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == dev->ifindex) &&
+		    net_eq(sock_net(sk), dev_net(dev))) {
+			if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+					return 1;
+			}
+			if (last) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					raw_rcv(last, skb2);
+			}
+			last = sk;
+		}
+	}
+	/* Returns 1 once the skb was passed to raw_rcv(), 0 if nothing matched. */
+	if (last) {
+		raw_rcv(last, skb);
+		return 1;
+	}
+	return 0;
+}
+
+static int ip_local_deliver_finish(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	/* Hand a locally destined packet to raw sockets and its protocol handler. */
+	__skb_pull(skb, ip_hdrlen(skb));
+
+	/* Point into the IP datagram, just past the header. */
+	skb_reset_transport_header(skb);
+
+	rcu_read_lock();
+	{
+		int protocol = ip_hdr(skb)->protocol;
+		int hash, raw;
+		const struct net_protocol *ipprot;
+
+	resubmit:
+		raw = raw_local_deliver(skb, protocol);
+		/* Raw delivery comes first; then look up the registered handler. */
+		hash = protocol & (MAX_INET_PROTOS - 1);
+		ipprot = rcu_dereference(inet_protos[hash]);
+		if (ipprot != NULL) {
+			int ret;
+			/* Handlers without netns_ok only run in the initial namespace. */
+			if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
+				if (net_ratelimit())
+					printk(KERN_INFO "%s: proto %d isn't netns-ready\n",
+						__func__, protocol);
+				kfree_skb(skb);
+				goto out;
+			}
+
+			if (!ipprot->no_policy) {
+				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+					kfree_skb(skb);
+					goto out;
+				}
+				nf_reset(skb);
+			}
+			ret = ipprot->handler(skb);
+			if (ret < 0) {
+				protocol = -ret;	/* negative return names the next protocol */
+				goto resubmit;
+			}
+			IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+		} else {
+			if (!raw) {	/* no handler and no raw socket took it */
+				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+					IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+					icmp_send(skb, ICMP_DEST_UNREACH,
+						  ICMP_PROT_UNREACH, 0);
+				}
+			} else
+				IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+			kfree_skb(skb);
+		}
+	}
+ out:
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * 	Deliver IP Packets to the higher protocol layers.
+ */
+int ip_local_deliver(struct sk_buff *skb)
+{
+	const __be16 frag_bits = htons(IP_MF | IP_OFFSET);
+
+	/*
+	 * Fragments go through reassembly first.  When ip_defrag() returns
+	 * non-zero this call delivers nothing and simply returns 0.
+	 */
+	if ((ip_hdr(skb)->frag_off & frag_bits) &&
+	    ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+		return 0;
+
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
+		       ip_local_deliver_finish);
+}
+
+static inline int ip_rcv_options(struct sk_buff *skb)
+{
+	struct ip_options *opt;
+	const struct iphdr *iph;
+	struct net_device *dev = skb->dev;
+
+	/* This looks like overkill, because not all
+	   IP options require packet mangling.
+	   But it is the easiest approach for now, especially taking
+	   into account that the combination of IP options
+	   and a running sniffer is an extremely rare condition.
+					      --ANK (980813)
+	*/
+	if (skb_cow(skb, skb_headroom(skb))) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+	/* Header pointer is fetched only after skb_cow() (data may have moved). */
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+	/* optlen = header length in bytes minus the fixed IP header. */
+	if (ip_options_compile(dev_net(dev), opt, skb)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+		goto drop;
+	}
+	/* Source routing is refused when the input device does not allow it. */
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+		if (in_dev) {
+			if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+				if (IN_DEV_LOG_MARTIANS(in_dev) &&
+				    net_ratelimit())
+					printk(KERN_INFO "source route option %pI4 -> %pI4\n",
+					       &iph->saddr, &iph->daddr);
+				goto drop;
+			}
+		}
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+drop:
+	return -1;	/* skb is not freed here; ip_rcv_finish() does that */
+}
+
+static int ip_rcv_finish(struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt;
+
+	/*
+	 *	Initialise the virtual path cache for the packet. It describes
+	 *	how the packet travels inside Linux networking.
+	 */
+	if (skb_dst(skb) == NULL) {
+		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					       iph->tos, skb->dev);
+		if (unlikely(err)) {
+			if (err == -EHOSTUNREACH)
+				IP_INC_STATS_BH(dev_net(skb->dev),
+						IPSTATS_MIB_INADDRERRORS);
+			else if (err == -ENETUNREACH)
+				IP_INC_STATS_BH(dev_net(skb->dev),
+						IPSTATS_MIB_INNOROUTES);
+			else if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
+			goto drop;
+		}
+	}
+	/* tclassid: low byte indexes the o_* counters, bits 16-23 the i_* ones. */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (unlikely(skb_dst(skb)->tclassid)) {
+		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
+		u32 idx = skb_dst(skb)->tclassid;
+		st[idx&0xFF].o_packets++;
+		st[idx&0xFF].o_bytes += skb->len;
+		st[(idx>>16)&0xFF].i_packets++;
+		st[(idx>>16)&0xFF].i_bytes += skb->len;
+	}
+#endif
+	/* A header longer than 5 words carries IP options to process. */
+	if (iph->ihl > 5 && ip_rcv_options(skb))
+		goto drop;
+	/* Count multicast/broadcast input before calling dst_input(). */
+	rt = skb_rtable(skb);
+	if (rt->rt_type == RTN_MULTICAST) {
+		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
+				skb->len);
+	} else if (rt->rt_type == RTN_BROADCAST)
+		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
+				skb->len);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * 	Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	const struct iphdr *iph;
+	u32 len;
+
+	/* When the interface is in promisc. mode, drop all the crap
+	 * that it receives, do not try to analyse it.
+	 */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	/* Count the packet and its skb->len bytes under IPSTATS_MIB_IN. */
+	IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto out;	/* skb is NULL here, so there is nothing to free */
+	}
+	/* The fixed 20-byte header must be accessible before it is read. */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+
+	/*
+	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
+	 *
+	 *	Is the datagram acceptable?
+	 *
+	 *	1.	Length at least the size of an ip header
+	 *	2.	Version of 4
+	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+	 *	4.	Doesn't have a bogus length
+	 */
+
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+	/* Now require the whole header, options included (ihl 32-bit words). */
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+	/* Reload the header pointer after the second pskb_may_pull(). */
+	iph = ip_hdr(skb);
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	/* Our transport medium may have padded the buffer out. Now we know it
+	 * is IP we can trim to the true length of the frame.
+	 * Note this now means skb->len holds ntohs(iph->tot_len).
+	 */
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	/* Remove any debris in the socket control block */
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+
+	/* Must drop socket now because of tproxy. */
+	skb_orphan(skb);
+
+	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
+		       ip_rcv_finish);
+
+inhdr_error:
+	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+	kfree_skb(skb);
+out:
+	return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 00000000..42dd1a90
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,649 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The options processing module for ip.c
+ *
+ * Authors:	A.N.Kuznetsov
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/cipso_ipv4.h>
+
+/*
+ * Write options to the IP header: record the destination address in the
+ * source route option and, unless is_frag is set, the address of the
+ * outgoing interface (we should already know it, so this function may
+ * only be called after the routing decision) and the timestamp, if we
+ * originate this datagram.  With is_frag set, RR/TS are NOPed out instead.
+ *
+ * daddr is the real destination address; next hop is recorded in IP header.
+ * saddr is the address of the outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
+		      __be32 daddr, struct rtable *rt, int is_frag)
+{
+	unsigned char *iph = skb_network_header(skb);
+
+	memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+	memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+	opt = &(IPCB(skb)->opt);	/* from here on use the skb's own copy */
+
+	if (opt->srr)	/* daddr goes into the last 4 bytes of the SRR option */
+		memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+	if (!is_frag) {
+		if (opt->rr_needaddr)	/* slot sits 4 bytes before the RR pointer */
+			ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
+		if (opt->ts_needaddr)	/* address slot precedes the time slot */
+			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
+		if (opt->ts_needtime) {
+			struct timespec tv;
+			__be32 midtime;	/* ms into the current day, network order */
+			getnstimeofday(&tv);
+			midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+			memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+		}
+		return;
+	}
+	if (opt->rr) {	/* fragment: blank the whole RR option with NOPs */
+		memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+		opt->rr = 0;
+		opt->rr_needaddr = 0;
+	}
+	if (opt->ts) {	/* fragment: blank the whole TS option with NOPs */
+		memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+		opt->ts = 0;
+		opt->ts_needaddr = opt->ts_needtime = 0;
+	}
+}
+
+/*
+ * Given the received options of skb (sopt = IPCB(skb)->opt), build in
+ * dopt a compiled option set appropriate for answering, i.e. invert
+ * the SRR option, copy the others, and grab room in RR/TS options.
+ * Returns 0, or -EINVAL when an RR/TS slot that must be filled does
+ * not fit inside the received option.
+ * NOTE: dopt cannot point to skb.
+ */
+
+int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
+{
+	const struct ip_options *sopt;
+	unsigned char *sptr, *dptr;
+	int soffset, doffset;
+	int	optlen;
+	__be32	daddr;
+
+	memset(dopt, 0, sizeof(struct ip_options));
+
+	sopt = &(IPCB(skb)->opt);
+
+	if (sopt->optlen == 0)
+		return 0;	/* no options received, nothing to echo */
+
+	sptr = skb_network_header(skb);	/* sopt offsets are relative to this */
+	dptr = dopt->__data;		/* write cursor into the reply options */
+
+	daddr = skb_rtable(skb)->rt_spec_dst;
+
+	if (sopt->rr) {	/* Record Route: copy, then reserve a slot if wanted */
+		optlen  = sptr[sopt->rr+1];	/* option length byte */
+		soffset = sptr[sopt->rr+2];	/* option pointer byte */
+		dopt->rr = dopt->optlen + sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->rr, optlen);
+		if (sopt->rr_needaddr && soffset <= optlen) {
+			if (soffset + 3 > optlen)
+				return -EINVAL;
+			dptr[2] = soffset + 4;
+			dopt->rr_needaddr = 1;
+		}
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	if (sopt->ts) {	/* Timestamp: copy, then reserve addr/time slots */
+		optlen = sptr[sopt->ts+1];
+		soffset = sptr[sopt->ts+2];
+		dopt->ts = dopt->optlen + sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->ts, optlen);
+		if (soffset <= optlen) {
+			if (sopt->ts_needaddr) {
+				if (soffset + 3 > optlen)
+					return -EINVAL;
+				dopt->ts_needaddr = 1;
+				soffset += 4;
+			}
+			if (sopt->ts_needtime) {
+				if (soffset + 3 > optlen)
+					return -EINVAL;
+				if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+					dopt->ts_needtime = 1;
+					soffset += 4;
+				} else {
+					dopt->ts_needtime = 0;
+					/* prespecified: stamp only if the next listed address is not RTN_UNICAST */
+					if (soffset + 7 <= optlen) {
+						__be32 addr;
+
+						memcpy(&addr, dptr+soffset-1, 4);
+						if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
+							dopt->ts_needtime = 1;
+							soffset += 8;
+						}
+					}
+				}
+			}
+			dptr[2] = soffset;	/* pointer now past the reserved slots */
+		}
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	if (sopt->srr) {	/* Source Route: addresses before the pointer, reversed */
+		unsigned char *start = sptr+sopt->srr;
+		__be32 faddr;
+
+		optlen  = start[1];
+		soffset = start[2];
+		doffset = 0;
+		if (soffset > optlen)
+			soffset = optlen + 1;
+		soffset -= 4;
+		if (soffset > 3) {
+			memcpy(&faddr, &start[soffset-1], 4);	/* first hop of the reply */
+			for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+				memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+			/*
+			 * RFC1812 requires to fix illegal source routes.
+			 */
+			if (memcmp(&ip_hdr(skb)->saddr,
+				   &start[soffset + 3], 4) == 0)
+				doffset -= 4;
+		}
+		if (doffset > 3) {
+			/* NOTE(review): writes daddr into the received option (start), not dopt - confirm */
+			memcpy(&start[doffset-1], &daddr, 4);
+			dopt->faddr = faddr;
+			dptr[0] = start[0];
+			dptr[1] = doffset+3;
+			dptr[2] = 4;
+			dptr += doffset+3;
+			dopt->srr = dopt->optlen + sizeof(struct iphdr);
+			dopt->optlen += doffset+3;
+			dopt->is_strictroute = sopt->is_strictroute;
+		}
+	}
+	if (sopt->cipso) {	/* CIPSO: copied unchanged */
+		optlen  = sptr[sopt->cipso+1];
+		dopt->cipso = dopt->optlen+sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->cipso, optlen);
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	while (dopt->optlen & 3) {	/* pad with END up to a multiple of 4 */
+		*dptr++ = IPOPT_END;
+		dopt->optlen++;
+	}
+	return 0;
+}
+
+/*
+ *	Options "fragmenting": every option for which IPOPT_COPIED()
+ *	is false is overwritten in place with NOOPs.
+ *	Crude, but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff * skb)
+{
+	struct ip_options *cb_opt = &(IPCB(skb)->opt);
+	unsigned char *cur = skb_network_header(skb) + sizeof(struct iphdr);
+	int remaining;
+
+	for (remaining = cb_opt->optlen; remaining > 0; ) {
+		int olen;
+
+		if (*cur == IPOPT_END)
+			return;
+		if (*cur == IPOPT_NOOP) {
+			remaining--;
+			cur++;
+			continue;
+		}
+		olen = cur[1];
+		/* Bad length byte: stop here and leave the cb fields alone. */
+		if (olen < 2 || olen > remaining)
+			return;
+		if (!IPOPT_COPIED(*cur))
+			memset(cur, IPOPT_NOOP, olen);
+		remaining -= olen;
+		cur += olen;
+	}
+	/* Whole option area walked: drop the RR/TS bookkeeping. */
+	cb_opt->ts = cb_opt->rr = 0;
+	cb_opt->rr_needaddr = 0;
+	cb_opt->ts_needaddr = cb_opt->ts_needtime = 0;
+}
+
+/*
+ * Verify options and fill pointers in struct options; returns 0 or -EINVAL.
+ * Caller should clear *opt and set opt->optlen (plus opt->__data if !skb).
+ * If skb != NULL the options after ip_hdr(skb) are parsed, else opt->__data.
+ */
+
+int ip_options_compile(struct net *net,
+		       struct ip_options * opt, struct sk_buff * skb)
+{
+	int l;
+	unsigned char * iph;
+	unsigned char * optptr;
+	int optlen;
+	unsigned char * pp_ptr = NULL;	/* offending byte, reported via ICMP */
+	struct rtable *rt = NULL;
+
+	if (skb != NULL) {
+		rt = skb_rtable(skb);
+		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
+	} else
+		optptr = opt->__data;
+	iph = optptr - sizeof(struct iphdr);	/* base for offsets stored in opt */
+	/* l counts the option bytes that are still to be parsed. */
+	for (l = opt->optlen; l > 0; ) {
+		switch (*optptr) {
+		      case IPOPT_END:	/* force everything after END to END too */
+			for (optptr++, l--; l>0; optptr++, l--) {
+				if (*optptr != IPOPT_END) {
+					*optptr = IPOPT_END;
+					opt->is_changed = 1;
+				}
+			}
+			goto eol;
+		      case IPOPT_NOOP:
+			l--;
+			optptr++;
+			continue;
+		}
+		optlen = optptr[1];
+		if (optlen<2 || optlen>l) {
+			pp_ptr = optptr;
+			goto error;
+		}
+		switch (*optptr) {
+		      case IPOPT_SSRR:
+		      case IPOPT_LSRR:
+			if (optlen < 3) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 4) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			/* NB: cf RFC-1812 5.2.4.1 */
+			if (opt->srr) {	/* a second source route option is an error */
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (!skb) {	/* options from opt->__data: pull first hop into faddr */
+				if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+					pp_ptr = optptr + 1;
+					goto error;
+				}
+				memcpy(&opt->faddr, &optptr[3], 4);
+				if (optlen > 7)
+					memmove(&optptr[3], &optptr[7], optlen-7);
+			}
+			opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+			opt->srr = optptr - iph;
+			break;
+		      case IPOPT_RR:
+			if (opt->rr) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (optlen < 3) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 4) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			if (optptr[2] <= optlen) {	/* pointer still inside the option */
+				if (optptr[2]+3 > optlen) {
+					pp_ptr = optptr + 2;
+					goto error;
+				}
+				if (rt) {
+					memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+					opt->is_changed = 1;
+				}
+				optptr[2] += 4;	/* slot reserved; later users index backwards */
+				opt->rr_needaddr = 1;
+			}
+			opt->rr = optptr - iph;
+			break;
+		      case IPOPT_TIMESTAMP:
+			if (opt->ts) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (optlen < 4) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 5) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			if (optptr[2] <= optlen) {
+				unsigned char *timeptr = NULL;	/* where to store the time, if at all */
+				if (optptr[2]+3 > optptr[1]) {
+					pp_ptr = optptr + 2;
+					goto error;
+				}
+				switch (optptr[3]&0xF) {	/* low nibble selects the TS flavour */
+				      case IPOPT_TS_TSONLY:
+					opt->ts = optptr - iph;
+					if (skb)
+						timeptr = &optptr[optptr[2]-1];
+					opt->ts_needtime = 1;
+					optptr[2] += 4;
+					break;
+				      case IPOPT_TS_TSANDADDR:
+					if (optptr[2]+7 > optptr[1]) {
+						pp_ptr = optptr + 2;
+						goto error;
+					}
+					opt->ts = optptr - iph;
+					if (rt)  {
+						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+						timeptr = &optptr[optptr[2]+3];
+					}
+					opt->ts_needaddr = 1;
+					opt->ts_needtime = 1;
+					optptr[2] += 8;
+					break;
+				      case IPOPT_TS_PRESPEC:
+					if (optptr[2]+7 > optptr[1]) {
+						pp_ptr = optptr + 2;
+						goto error;
+					}
+					opt->ts = optptr - iph;
+					{
+						__be32 addr;
+						memcpy(&addr, &optptr[optptr[2]-1], 4);
+						if (inet_addr_type(net, addr) == RTN_UNICAST)
+							break;	/* leaves the slot and pointer untouched */
+						if (skb)
+							timeptr = &optptr[optptr[2]+3];
+					}
+					opt->ts_needtime = 1;
+					optptr[2] += 8;
+					break;
+				      default:
+					if (!skb && !capable(CAP_NET_RAW)) {
+						pp_ptr = optptr + 3;
+						goto error;
+					}
+					break;
+				}
+				if (timeptr) {
+					struct timespec tv;
+					u32  midtime;	/* ms into the current day */
+					getnstimeofday(&tv);
+					midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
+					put_unaligned_be32(midtime, timeptr);
+					opt->is_changed = 1;
+				}
+			} else {	/* option full: bump the overflow count in the high nibble */
+				unsigned overflow = optptr[3]>>4;
+				if (overflow == 15) {
+					pp_ptr = optptr + 3;
+					goto error;
+				}
+				opt->ts = optptr - iph;
+				if (skb) {
+					optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+					opt->is_changed = 1;
+				}
+			}
+			break;
+		      case IPOPT_RA:
+			if (optlen < 4) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] == 0 && optptr[3] == 0)
+				opt->router_alert = optptr - iph;
+			break;
+		      case IPOPT_CIPSO:
+			if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			opt->cipso = optptr - iph;
+			if (cipso_v4_validate(skb, &optptr)) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			break;
+		      case IPOPT_SEC:
+		      case IPOPT_SID:
+		      default:	/* other options need CAP_NET_RAW when !skb */
+			if (!skb && !capable(CAP_NET_RAW)) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			break;
+		}
+		l -= optlen;
+		optptr += optlen;
+	}
+
+eol:
+	if (!pp_ptr)
+		return 0;
+
+error:
+	if (skb) {	/* ICMP parameter problem carries the offset of the bad byte */
+		icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ip_options_compile);
+
+/*
+ *	Undo all the changes ip_options_compile() made to the bytes in opt->__data.
+ */
+
+void ip_options_undo(struct ip_options * opt)
+{
+	if (opt->srr) {	/* shift the route list up again and put faddr back first */
+		unsigned  char * optptr = opt->__data+opt->srr-sizeof(struct  iphdr);
+		memmove(optptr+7, optptr+3, optptr[1]-7);
+		memcpy(optptr+3, &opt->faddr, 4);
+	}
+	if (opt->rr_needaddr) {	/* step the RR pointer back and zero that slot */
+		unsigned  char * optptr = opt->__data+opt->rr-sizeof(struct  iphdr);
+		optptr[2] -= 4;
+		memset(&optptr[optptr[2]-1], 0, 4);
+	}
+	if (opt->ts) {
+		unsigned  char * optptr = opt->__data+opt->ts-sizeof(struct  iphdr);
+		if (opt->ts_needtime) {	/* time slot first: it lies after the address */
+			optptr[2] -= 4;
+			memset(&optptr[optptr[2]-1], 0, 4);
+			if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+				optptr[2] -= 4;	/* prespecified address is kept, only skipped */
+		}
+		if (opt->ts_needaddr) {
+			optptr[2] -= 4;
+			memset(&optptr[optptr[2]-1], 0, 4);
+		}
+	}
+}
+
+static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
+{
+	size_t bytes = sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3);	/* options padded to 4 */
+	return kzalloc(bytes, GFP_KERNEL);
+}
+
+static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
+				 struct ip_options_rcu *opt, int optlen)
+{
+	for (; optlen & 3; optlen++)	/* pad with END up to a multiple of 4 */
+		opt->opt.__data[optlen] = IPOPT_END;
+	opt->opt.optlen = optlen;
+	if (optlen != 0 && ip_options_compile(net, &opt->opt, NULL) != 0) {
+		kfree(opt);	/* rejected: *optp is left as it was */
+		return -EINVAL;
+	}
+	kfree(*optp);		/* accepted: the previous set is released */
+	*optp = opt;
+	return 0;
+}
+
+int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
+			     unsigned char __user *data, int optlen)
+{
+	struct ip_options_rcu *fresh = ip_options_get_alloc(optlen);
+
+	if (fresh == NULL)
+		return -ENOMEM;
+	/* Raw option bytes come from user space; an empty set skips the copy. */
+	if (optlen == 0 || copy_from_user(fresh->opt.__data, data, optlen) == 0)
+		return ip_options_get_finish(net, optp, fresh, optlen);
+	kfree(fresh);		/* copy_from_user() reported failure */
+	return -EFAULT;
+}
+
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+		   unsigned char *data, int optlen)
+{
+	struct ip_options_rcu *fresh = ip_options_get_alloc(optlen);
+
+	if (fresh == NULL)
+		return -ENOMEM;
+	if (optlen != 0)	/* data carries no __user tag: plain memcpy */
+		memcpy(fresh->opt.__data, data, optlen);
+	return ip_options_get_finish(net, optp, fresh, optlen);
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{
+	struct   ip_options * opt	= &(IPCB(skb)->opt);
+	unsigned char * optptr;
+	struct rtable *rt = skb_rtable(skb);
+	unsigned char *raw = skb_network_header(skb);
+	/* Patch RR/SRR/TS options in the header; re-checksum if anything changed. */
+	if (opt->rr_needaddr) {
+		optptr = (unsigned char *)raw + opt->rr;
+		ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
+		opt->is_changed = 1;
+	}
+	if (opt->srr_is_hit) {
+		int srrptr, srrspace;
+		/* Look for opt->nexthop among the route entries at/after the pointer. */
+		optptr = raw + opt->srr;
+
+		for ( srrptr=optptr[2], srrspace = optptr[1];
+		     srrptr <= srrspace;
+		     srrptr += 4
+		     ) {
+			if (srrptr + 3 > srrspace)
+				break;
+			if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
+				break;
+		}
+		if (srrptr + 3 <= srrspace) {	/* found: make it the destination */
+			opt->is_changed = 1;
+			ip_hdr(skb)->daddr = opt->nexthop;
+			ip_rt_get_source(&optptr[srrptr-1], skb, rt);
+			optptr[2] = srrptr+4;
+		} else if (net_ratelimit())
+			printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+		if (opt->ts_needaddr) {	/* NOTE(review): only reached when srr_is_hit - confirm intended */
+			optptr = raw + opt->ts;
+			ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
+			opt->is_changed = 1;
+		}
+	}
+	if (opt->is_changed) {
+		opt->is_changed = 0;
+		ip_send_check(ip_hdr(skb));
+	}
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+	struct ip_options *opt = &(IPCB(skb)->opt);
+	int srrspace, srrptr;
+	__be32 nexthop;
+	struct iphdr *iph = ip_hdr(skb);
+	unsigned char *optptr = skb_network_header(skb) + opt->srr;
+	struct rtable *rt = skb_rtable(skb);
+	struct rtable *rt2;
+	unsigned long orefdst;
+	int err;
+	/* Process a received source route option; returns 0 or -EINVAL. */
+	if (!rt)
+		return 0;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return -EINVAL;
+	if (rt->rt_type == RTN_UNICAST) {
+		if (!opt->is_strictroute)
+			return 0;
+		icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+		return -EINVAL;
+	}
+	if (rt->rt_type != RTN_LOCAL)
+		return -EINVAL;
+	/* Try each remaining route entry in turn as the new input route target. */
+	for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+		if (srrptr + 3 > srrspace) {
+			icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+			return -EINVAL;
+		}
+		memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+		orefdst = skb->_skb_refdst;	/* saved so it can be restored on failure */
+		skb_dst_set(skb, NULL);
+		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+		rt2 = skb_rtable(skb);
+		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+			skb_dst_drop(skb);
+			skb->_skb_refdst = orefdst;
+			return -EINVAL;
+		}
+		refdst_drop(orefdst);	/* new route accepted, old reference released */
+		if (rt2->rt_type != RTN_LOCAL)
+			break;
+		/* Superfast 8) loopback forward */
+		iph->daddr = nexthop;
+		opt->is_changed = 1;
+	}
+	if (srrptr <= srrspace) {	/* loop left via break: a non-local hop was found */
+		opt->srr_is_hit = 1;
+		opt->nexthop = nexthop;
+		opt->is_changed = 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 00000000..51a3eec2
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1543 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The Internet Protocol (IP) output module.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Donald Becker, <becker@super.org>
+ *		Alan Cox, <Alan.Cox@linux.org>
+ *		Richard Underwood
+ *		Stefan Becker, <stefanb@yello.ping.de>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ *	See ip_input.c for original log
+ *
+ *	Fixes:
+ *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
+ *		Mike Kilburn	:	htons() missing in ip_build_xmit.
+ *		Bradford Johnson:	Fix faulty handling of some frames when
+ *					no route is found.
+ *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
+ *					(in case if packet not accepted by
+ *					output firewall rules)
+ *		Mike McLagan	:	Routing by source
+ *		Alexey Kuznetsov:	use new route cache
+ *		Andi Kleen:		Fix broken PMTU recovery and remove
+ *					some redundant tests.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
+ *		Andi Kleen	:	Split fast and slow ip_build_xmit path
+ *					for decreased register pressure on x86
+ *					and more readability.
+ *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
+ *					silently drop skb instead of failing with -EPERM.
+ *		Detlev Wengorz	:	Copy protocol for fragments.
+ *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *		Hirokazu Takahashi:	sendfile() on UDP works now.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+#include <linux/tcp.h>
+
+int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
+
+/* Generate the header checksum for an outgoing IP datagram (iph->ihl gives the length). */
+__inline__ void ip_send_check(struct iphdr *iph)
+{
+	iph->check = 0;	/* cleared first so the old value is not summed in */
+	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+EXPORT_SYMBOL(ip_send_check);
+
+int __ip_local_out(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	/* Finalise tot_len and the checksum, then run the LOCAL_OUT hook. */
+	iph->tot_len = htons(skb->len);
+	ip_send_check(iph);
+	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
+		       skb_dst(skb)->dev, dst_output);
+}
+
+int ip_local_out(struct sk_buff *skb)
+{
+	int rc = __ip_local_out(skb);
+
+	/* Only a result of exactly 1 hands the packet on to dst_output(). */
+	if (unlikely(rc != 1))
+		return rc;
+
+	return dst_output(skb);
+}
+EXPORT_SYMBOL_GPL(ip_local_out);
+
+/* dev_loopback_xmit for use with netfilter: pass newskb to netif_rx_ni(); always returns 0. */
+static int ip_dev_loopback_xmit(struct sk_buff *newskb)
+{
+	skb_reset_mac_header(newskb);
+	__skb_pull(newskb, skb_network_offset(newskb));	/* data now starts at the network header */
+	newskb->pkt_type = PACKET_LOOPBACK;
+	newskb->ip_summed = CHECKSUM_UNNECESSARY;
+	WARN_ON(!skb_dst(newskb));	/* a dst is expected to be attached already */
+	netif_rx_ni(newskb);
+	return 0;
+}
+
+static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+{
+	const int unicast_ttl = inet->uc_ttl;
+
+	/* A negative socket TTL defers to the hop limit of the route. */
+	return unicast_ttl < 0 ? ip4_dst_hoplimit(dst)
+			       : unicast_ttl;
+}
+
+/*
+ *		Add an ip header (plus the options in opt, if any) to a skbuff,
+ *		using the route already attached to it, and send it out.
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = skb_rtable(skb);
+	struct iphdr *iph;
+
+	/* Build the IP header. */
+	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	iph->version  = 4;
+	iph->ihl      = 5;	/* grown below when options are present */
+	iph->tos      = inet->tos;
+	if (ip_dont_fragment(sk, &rt->dst))
+		iph->frag_off = htons(IP_DF);
+	else
+		iph->frag_off = 0;
+	iph->ttl      = ip_select_ttl(inet, &rt->dst);
+	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);	/* source route: first hop */
+	iph->saddr    = saddr;
+	iph->protocol = sk->sk_protocol;
+	ip_select_ident(iph, &rt->dst, sk);
+
+	if (opt && opt->opt.optlen) {
+		iph->ihl += opt->opt.optlen>>2;	/* optlen is in bytes, ihl in 32-bit words */
+		ip_options_build(skb, &opt->opt, daddr, rt, 0);
+	}
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+
+	/* Send it out. */
+	return ip_local_out(skb);
+}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
+/* Final IPv4 transmit step: update mcast/bcast counters, ensure link-layer
+ * headroom, then pass skb to the cached hw header or the neighbour output.
+ * skb is freed here on the error paths; returns -ENOMEM/-EINVAL for those. */
+static inline int ip_finish_output2(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct rtable *rt = (struct rtable *)dst;
+	struct net_device *dev = dst->dev;
+	unsigned int hh_len = LL_RESERVED_SPACE(dev);
+	struct neighbour *neigh;
+	int res;
+
+	if (rt->rt_type == RTN_MULTICAST) {
+		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
+	} else if (rt->rt_type == RTN_BROADCAST)
+		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
+
+	/* Be paranoid, rather than too clever. */
+	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+		if (skb2 == NULL) {
+			kfree_skb(skb);
+			return -ENOMEM;
+		}
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+		kfree_skb(skb);
+		skb = skb2;
+	}
+
+	rcu_read_lock();
+	if (dst->hh) {
+		res = neigh_hh_output(dst->hh, skb);
+		rcu_read_unlock();
+		return res;
+	}
+	neigh = dst_get_neighbour(dst);
+	if (neigh) {
+		res = neigh->output(skb);
+		rcu_read_unlock();
+		return res;
+	}
+	rcu_read_unlock();
+
+	if (net_ratelimit())
+		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+{
+	/* IP_PMTUDISC_PROBE sockets use the device MTU, everyone else dst_mtu(). */
+	if (skb->sk && inet_sk(skb->sk)->pmtudisc == IP_PMTUDISC_PROBE)
+		return skb_dst(skb)->dev->mtu;
+	return dst_mtu(skb_dst(skb));
+}
+
+static int ip_finish_output(struct sk_buff *skb)
+{
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+	/* A policy lookup after SNAT produced a new policy: go through dst_output() again. */
+	if (skb_dst(skb)->xfrm) {
+		IPCB(skb)->flags |= IPSKB_REROUTED;
+		return dst_output(skb);
+	}
+#endif
+	/* Packets within the MTU, and GSO packets of any size, are not fragmented. */
+	if (skb->len <= ip_skb_dst_mtu(skb) || skb_is_gso(skb))
+		return ip_finish_output2(skb);
+	return ip_fragment(skb, ip_finish_output2);
+}
+
+int ip_mc_output(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = rt->dst.dev;
+	/* Output routine that also loops clones of mcast/bcast packets back locally. */
+	/*
+	 *	If the indicated interface is up and running, send the packet.
+	 */
+	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+
+	/*
+	 *	Multicasts are looped back for other local users
+	 */
+
+	if (rt->rt_flags&RTCF_MULTICAST) {
+		if (sk_mc_loop(sk)
+#ifdef CONFIG_IP_MROUTE
+		/* Small optimization: do not loopback not local frames,
+		   which returned after forwarding; they will be  dropped
+		   by ip_mr_input in any case.
+		   Note, that local frames are looped back to be delivered
+		   to local recipients.
+
+		   This check is duplicated in ip_mr_input at the moment.
+		 */
+		    &&
+		    ((rt->rt_flags & RTCF_LOCAL) ||
+		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
+#endif
+		   ) {
+			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+			if (newskb)	/* clone failure just means no loopback copy */
+				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+					newskb, NULL, newskb->dev,
+					ip_dev_loopback_xmit);
+		}
+
+		/* Multicasts with ttl 0 must not go beyond the host */
+
+		if (ip_hdr(skb)->ttl == 0) {
+			kfree_skb(skb);
+			return 0;
+		}
+	}
+	/* Broadcast routes get a looped-back clone as well. */
+	if (rt->rt_flags&RTCF_BROADCAST) {
+		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+		if (newskb)
+			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
+				NULL, newskb->dev, ip_dev_loopback_xmit);
+	}
+	/* The hook is conditional on the skb not being flagged IPSKB_REROUTED. */
+	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
+			    skb->dev, ip_finish_output,
+			    !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+int ip_output(struct sk_buff *skb)
+{
+	struct net_device *dev = skb_dst(skb)->dev;
+
+	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+	/* Bind the skb to the route's output device and tag it as IPv4. */
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+	/* The hook is conditional on the skb not being flagged IPSKB_REROUTED. */
+	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
+			    ip_finish_output,
+			    !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	struct iphdr *iph;
+	int res;
+	/* Route skb via its socket if needed, prepend the IP header and send it. */
+	/* Skip all of this if the packet is already routed,
+	 * f.e. by something like SCTP.
+	 */
+	rcu_read_lock();	/* held until after ip_local_out(); covers inet_opt */
+	inet_opt = rcu_dereference(inet->inet_opt);
+	fl4 = &fl->u.ip4;
+	rt = skb_rtable(skb);
+	if (rt != NULL)
+		goto packet_routed;
+
+	/* Make sure we can route this packet. */
+	rt = (struct rtable *)__sk_dst_check(sk, 0);
+	if (rt == NULL) {
+		__be32 daddr;
+
+		/* Use correct destination address if we have options. */
+		daddr = inet->inet_daddr;
+		if (inet_opt && inet_opt->opt.srr)
+			daddr = inet_opt->opt.faddr;
+
+		/* If this fails, retransmit mechanism of transport layer will
+		 * keep trying until route appears or the connection times
+		 * itself out.
+		 */
+		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+					   daddr, inet->inet_saddr,
+					   inet->inet_dport,
+					   inet->inet_sport,
+					   sk->sk_protocol,
+					   RT_CONN_FLAGS(sk),
+					   sk->sk_bound_dev_if);
+		if (IS_ERR(rt))
+			goto no_route;
+		sk_setup_caps(sk, &rt->dst);
+	}
+	skb_dst_set_noref(skb, &rt->dst);
+
+packet_routed:
+	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
+		goto no_route;	/* strict route requires the gateway to be the destination */
+
+	/* OK, we know where to send it, allocate and build IP header. */
+	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);	/* next line stores version 4, ihl 5 and tos as one 16-bit word */
+	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
+		iph->frag_off = htons(IP_DF);
+	else
+		iph->frag_off = 0;
+	iph->ttl      = ip_select_ttl(inet, &rt->dst);
+	iph->protocol = sk->sk_protocol;
+	iph->saddr    = fl4->saddr;
+	iph->daddr    = fl4->daddr;
+	/* Transport layer set skb->h.foo itself. */
+
+	if (inet_opt && inet_opt->opt.optlen) {
+		iph->ihl += inet_opt->opt.optlen >> 2;	/* option bytes -> 32-bit words */
+		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+	}
+
+	ip_select_ident_more(iph, &rt->dst, sk,
+			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+
+	res = ip_local_out(skb);
+	rcu_read_unlock();
+	return res;
+
+no_route:
+	rcu_read_unlock();
+	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb(skb);	/* the skb is consumed on this path too */
+	return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(ip_queue_xmit);
+
+/* Copy per-packet metadata (not payload) from skb 'from' to fragment skb 'to'. */
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+	to->pkt_type = from->pkt_type;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	skb_dst_drop(to);	/* release whatever dst 'to' had before copying */
+	skb_dst_copy(to, from);
+	to->dev = from->dev;
+	to->mark = from->mark;
+
+	/* Copy the flags to each fragment. */
+	IPCB(to)->flags = IPCB(from)->flags;
+
+#ifdef CONFIG_NET_SCHED
+	to->tc_index = from->tc_index;
+#endif
+	nf_copy(to, from);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	to->nf_trace = from->nf_trace;
+#endif
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+	to->ipvs_property = from->ipvs_property;
+#endif
+	skb_copy_secmark(to, from);
+}
+
+/*
+ *	This IP datagram is too large to be sent in one piece.  Break it up into
+ *	smaller pieces (each of size equal to IP header plus
+ *	a block of the data of the original IP data part) that will yet fit in a
+ *	single device frame, and queue such a frame for sending.
+ */
+
+int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
+{
+	struct iphdr *iph;
+	int ptr;
+	struct net_device *dev;
+	struct sk_buff *skb2;
+	unsigned int mtu, hlen, left, len, ll_rs;
+	int offset;
+	__be16 not_last_frag;
+	struct rtable *rt = skb_rtable(skb);
+	int err = 0;
+
+	dev = rt->dst.dev;
+
+	/*
+	 *	Point into the IP datagram header.
+	 */
+
+	iph = ip_hdr(skb);
+
+	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(ip_skb_dst_mtu(skb)));
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/*
+	 *	Setup starting values.
+	 */
+
+	hlen = iph->ihl * 4;
+	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge)
+		mtu -= nf_bridge_mtu_reduction(skb);
+#endif
+	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
+
+	/* When frag_list is given, use it. First, check its validity:
+	 * some transformers could create wrong frag_list or break existing
+	 * one, it is not prohibited. In this case fall back to copying.
+	 *
+	 * LATER: this step can be merged to real generation of fragments,
+	 * we can switch to copy when see the first bad fragment.
+	 */
+	if (skb_has_frag_list(skb)) {
+		struct sk_buff *frag, *frag2;
+		int first_len = skb_pagelen(skb);
+
+		if (first_len - hlen > mtu ||
+		    ((first_len - hlen) & 7) ||
+		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+		    skb_cloned(skb))
+			goto slow_path;
+
+		skb_walk_frags(skb, frag) {
+			/* Correct geometry. */
+			if (frag->len > mtu ||
+			    ((frag->len & 7) && frag->next) ||
+			    skb_headroom(frag) < hlen)
+				goto slow_path_clean;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag))
+				goto slow_path_clean;
+
+			BUG_ON(frag->sk);
+			if (skb->sk) {
+				frag->sk = skb->sk;
+				frag->destructor = sock_wfree;
+			}
+			skb->truesize -= frag->truesize;
+		}
+
+		/* Everything is OK. Generate! */
+
+		err = 0;
+		offset = 0;
+		frag = skb_shinfo(skb)->frag_list;
+		skb_frag_list_init(skb);
+		skb->data_len = first_len - skb_headlen(skb);
+		skb->len = first_len;
+		iph->tot_len = htons(first_len);
+		iph->frag_off = htons(IP_MF);
+		ip_send_check(iph);
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down. */
+			if (frag) {
+				frag->ip_summed = CHECKSUM_NONE;
+				skb_reset_transport_header(frag);
+				__skb_push(frag, hlen);
+				skb_reset_network_header(frag);
+				memcpy(skb_network_header(frag), iph, hlen);
+				iph = ip_hdr(frag);
+				iph->tot_len = htons(frag->len);
+				ip_copy_metadata(frag, skb);
+				if (offset == 0)
+					ip_options_fragment(frag);
+				offset += skb->len - hlen;
+				iph->frag_off = htons(offset>>3);
+				if (frag->next != NULL)
+					iph->frag_off |= htons(IP_MF);
+				/* Ready, complete checksum */
+				ip_send_check(iph);
+			}
+
+			err = output(skb);
+
+			if (!err)
+				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+			if (err || !frag)
+				break;
+
+			skb = frag;
+			frag = skb->next;
+			skb->next = NULL;
+		}
+
+		if (err == 0) {
+			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+			return 0;
+		}
+
+		while (frag) {
+			skb = frag->next;
+			kfree_skb(frag);
+			frag = skb;
+		}
+		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+		return err;
+
+slow_path_clean:
+		skb_walk_frags(skb, frag2) {
+			if (frag2 == frag)
+				break;
+			frag2->sk = NULL;
+			frag2->destructor = NULL;
+			skb->truesize += frag2->truesize;
+		}
+	}
+
+slow_path:
+	left = skb->len - hlen;		/* Space per frame */
+	ptr = hlen;		/* Where to start from */
+
+	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
+	 * we need to make room for the encapsulating header
+	 */
+	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
+
+	/*
+	 *	Fragment the datagram.
+	 */
+
+	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+	not_last_frag = iph->frag_off & htons(IP_MF);
+
+	/*
+	 *	Keep copying data until we run out.
+	 */
+
+	while (left > 0) {
+		len = left;
+		/* IF: it doesn't fit, use 'mtu' - the data space left */
+		if (len > mtu)
+			len = mtu;
+		/* IF: we are not sending up to and including the packet end
+		   then align the next start on an eight byte boundary */
+		if (len < left)	{
+			len &= ~7;
+		}
+		/*
+		 *	Allocate buffer.
+		 */
+
+		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
+			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		/*
+		 *	Set up data on packet
+		 */
+
+		ip_copy_metadata(skb2, skb);
+		skb_reserve(skb2, ll_rs);
+		skb_put(skb2, len + hlen);
+		skb_reset_network_header(skb2);
+		skb2->transport_header = skb2->network_header + hlen;
+
+		/*
+		 *	Charge the memory for the fragment to any owner
+		 *	it might possess
+		 */
+
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+
+		/*
+		 *	Copy the packet header into the new buffer.
+		 */
+
+		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
+
+		/*
+		 *	Copy a block of the IP datagram.
+		 */
+		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
+			BUG();
+		left -= len;
+
+		/*
+		 *	Fill in the new header fields.
+		 */
+		iph = ip_hdr(skb2);
+		iph->frag_off = htons((offset >> 3));
+
+		/* ANK: dirty, but effective trick. Upgrade options only if
+		 * the segment to be fragmented was THE FIRST (otherwise,
+		 * options are already fixed) and make it ONCE
+		 * on the initial skb, so that all the following fragments
+		 * will inherit fixed options.
+		 */
+		if (offset == 0)
+			ip_options_fragment(skb);
+
+		/*
+		 *	Added AC : If we are fragmenting a fragment that's not the
+		 *		   last fragment then keep MF on each bit
+		 */
+		if (left > 0 || not_last_frag)
+			iph->frag_off |= htons(IP_MF);
+		ptr += len;
+		offset += len;
+
+		/*
+		 *	Put this fragment into the sending queue.
+		 */
+		iph->tot_len = htons(len + hlen);
+
+		ip_send_check(iph);
+
+		err = output(skb2);
+		if (err)
+			goto fail;
+
+		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
+	}
+	kfree_skb(skb);
+	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
+	return err;
+
+fail:
+	kfree_skb(skb);
+	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+	return err;
+}
+EXPORT_SYMBOL(ip_fragment);
+
+/* getfrag helper: copy @len bytes at @offset of the iovec @from into @to. */
+int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+	struct iovec *vec = from;
+	__wsum partial = 0;
+
+	/* skb marked CHECKSUM_PARTIAL: copy only, skb->csum is left alone. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		return memcpy_fromiovecend(to, vec, offset, len) < 0 ? -EFAULT : 0;
+
+	/* Otherwise copy and checksum together, then fold in at position @odd. */
+	if (csum_partial_copy_fromiovecend(to, vec, offset, len, &partial) < 0)
+		return -EFAULT;
+	skb->csum = csum_block_add(skb->csum, partial, odd);
+	return 0;
+}
+EXPORT_SYMBOL(ip_generic_getfrag);
+
+/* Map @page, checksum @copy bytes starting at @offset, unmap it again. */
+static inline __wsum csum_page(struct page *page, int offset, int copy)
+{
+	char *base = kmap(page);
+	__wsum sum = csum_partial(base + offset, copy, 0);
+
+	kunmap(page);
+
+	return sum;
+}
+
+static inline int ip_ufo_append_data(struct sock *sk,
+			struct sk_buff_head *queue,
+			int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+			void *from, int length, int hh_len, int fragheaderlen,
+			int transhdrlen, int maxfraglen, unsigned int flags)
+{
+	struct sk_buff *skb = skb_peek_tail(queue);
+	int err;
+
+	/*
+	 * The device offloads UDP fragmentation, so the complete datagram
+	 * lives in one single skb: reuse the queued one when there is one.
+	 */
+	if (skb != NULL)
+		goto append;
+
+	/* Queue is empty: allocate the skb that will carry the headers. */
+	skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20,
+				  (flags & MSG_DONTWAIT), &err);
+	if (skb == NULL)
+		return err;
+
+	/* Headroom for the hardware header, then space for the UDP/IP header. */
+	skb_reserve(skb, hh_len);
+	skb_put(skb, fragheaderlen + transhdrlen);
+
+	/* Network header at skb->data, transport header fragheaderlen later. */
+	skb_reset_network_header(skb);
+	skb->transport_header = skb->network_header + fragheaderlen;
+
+	/* Mark the skb CHECKSUM_PARTIAL and start from a zero csum. */
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum = 0;
+
+	/* Length of each IP datagram fragment, and the GSO type. */
+	skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
+	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+	__skb_queue_tail(queue, skb);
+
+append:
+	/* Everything past the transport header is appended as page frags. */
+	return skb_append_datato_frags(sk, skb, getfrag, from,
+				       length - transhdrlen);
+}
+
+static int __ip_append_data(struct sock *sk,
+			    struct flowi4 *fl4,
+			    struct sk_buff_head *queue,
+			    struct inet_cork *cork,
+			    int getfrag(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			    void *from, int length, int transhdrlen,
+			    unsigned int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	/* Queue @length bytes from @getfrag on @queue as fragment-sized skbs. */
+	struct ip_options *opt = cork->opt;
+	int hh_len;
+	int exthdrlen;
+	int mtu;
+	int copy;
+	int err;
+	int offset = 0;
+	unsigned int maxfraglen, fragheaderlen;
+	int csummode = CHECKSUM_NONE;
+	struct rtable *rt = (struct rtable *)cork->dst;
+
+	skb = skb_peek_tail(queue);
+	/* rt->dst.header_len is only reserved when the queue is still empty. */
+	exthdrlen = !skb ? rt->dst.header_len : 0;
+	mtu = cork->fragsize;
+
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+	/* maxfraglen: header plus the largest 8-byte-multiple payload in mtu. */
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	/* Refuse data that would push the datagram past 0xFFFF bytes. */
+	if (cork->length + length > 0xFFFF - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+			       mtu-exthdrlen);
+		return -EMSGSIZE;
+	}
+
+	/*
+	 * transhdrlen > 0 means that this is the first fragment and we wish
+	 * it won't be fragmented in the future.
+	 */
+	if (transhdrlen &&
+	    length + fragheaderlen <= mtu &&
+	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
+	    !exthdrlen)
+		csummode = CHECKSUM_PARTIAL;
+	/* Counted up front; the error path subtracts what was not consumed. */
+	cork->length += length;
+	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+	    (sk->sk_protocol == IPPROTO_UDP) &&
+	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
+		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
+					 hh_len, fragheaderlen, transhdrlen,
+					 maxfraglen, flags);
+		if (err)
+			goto error;
+		return 0;
+	}
+
+	/* So, what's going on in the loop below?
+	 *
+	 * We use calculated fragment length to generate chained skb,
+	 * each of segments is IP fragment ready for sending to network after
+	 * adding appropriate IP header.
+	 */
+	/* Empty queue: enter the loop directly at its allocation branch. */
+	if (!skb)
+		goto alloc_new_skb;
+
+	while (length > 0) {
+		/* Check if the remaining data fits into current packet. */
+		copy = mtu - skb->len;
+		if (copy < length)
+			copy = maxfraglen - skb->len;
+		if (copy <= 0) {
+			char *data;
+			unsigned int datalen;
+			unsigned int fraglen;
+			unsigned int fraggap;
+			unsigned int alloclen;
+			struct sk_buff *skb_prev;
+alloc_new_skb:
+			skb_prev = skb;
+			if (skb_prev)
+				fraggap = skb_prev->len - maxfraglen;
+			else
+				fraggap = 0;
+
+			/*
+			 * If remaining data exceeds the mtu,
+			 * we know we need more fragment(s).
+			 */
+			datalen = length + fraggap;
+			if (datalen > mtu - fragheaderlen)
+				datalen = maxfraglen - fragheaderlen;
+			fraglen = datalen + fragheaderlen;
+
+			if ((flags & MSG_MORE) &&
+			    !(rt->dst.dev->features&NETIF_F_SG))
+				alloclen = mtu;
+			else
+				alloclen = fraglen;
+
+			alloclen += exthdrlen;
+
+			/* The last fragment gets additional space at tail.
+			 * Note, with MSG_MORE we overallocate on fragments,
+			 * because we have no idea what fragment will be
+			 * the last.
+			 */
+			if (datalen == length + fraggap)
+				alloclen += rt->dst.trailer_len;
+
+			if (transhdrlen) {
+				skb = sock_alloc_send_skb(sk,
+						alloclen + hh_len + 15,
+						(flags & MSG_DONTWAIT), &err);
+			} else {
+				skb = NULL;
+				if (atomic_read(&sk->sk_wmem_alloc) <=
+				    2 * sk->sk_sndbuf)
+					skb = sock_wmalloc(sk,
+							   alloclen + hh_len + 15, 1,
+							   sk->sk_allocation);
+				if (unlikely(skb == NULL))
+					err = -ENOBUFS;
+				else
+					/* only the initial fragment is
+					   time stamped */
+					cork->tx_flags = 0;
+			}
+			if (skb == NULL)
+				goto error;
+
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = csummode;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+			skb_shinfo(skb)->tx_flags = cork->tx_flags;
+
+			/*
+			 *	Find where to start putting bytes.
+			 */
+			data = skb_put(skb, fraglen + exthdrlen);
+			skb_set_network_header(skb, exthdrlen);
+			skb->transport_header = (skb->network_header +
+						 fragheaderlen);
+			data += fragheaderlen + exthdrlen;
+
+			if (fraggap) {
+				/* Move the bytes beyond maxfraglen out of the
+				 * previous skb and fix up both checksums. */
+				skb->csum = skb_copy_and_csum_bits(
+					skb_prev, maxfraglen,
+					data + transhdrlen, fraggap, 0);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				data += fraggap;
+				pskb_trim_unique(skb_prev, maxfraglen);
+			}
+
+			copy = datalen - transhdrlen - fraggap;
+			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+				err = -EFAULT;
+				kfree_skb(skb);
+				goto error;
+			}
+
+			offset += copy;
+			length -= datalen - fraggap;
+			transhdrlen = 0;
+			exthdrlen = 0;
+			csummode = CHECKSUM_NONE;
+
+			/*
+			 * Put the packet on the pending queue.
+			 */
+			__skb_queue_tail(queue, skb);
+			continue;
+		}
+		/* Room left in the tail skb: append up to @copy bytes to it. */
+		if (copy > length)
+			copy = length;
+
+		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+			unsigned int off;
+
+			off = skb->len;
+			if (getfrag(from, skb_put(skb, copy),
+					offset, copy, off, skb) < 0) {
+				__skb_trim(skb, off);
+				err = -EFAULT;
+				goto error;
+			}
+		} else {
+			int i = skb_shinfo(skb)->nr_frags;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
+			struct page *page = cork->page;
+			int off = cork->off;
+			unsigned int left;
+
+			if (page && (left = PAGE_SIZE - off) > 0) {
+				if (copy >= left)
+					copy = left;
+				if (page != frag->page) {
+					if (i == MAX_SKB_FRAGS) {
+						err = -EMSGSIZE;
+						goto error;
+					}
+					get_page(page);
+					skb_fill_page_desc(skb, i, page, off, 0);
+					frag = &skb_shinfo(skb)->frags[i];
+				}
+			} else if (i < MAX_SKB_FRAGS) {
+				if (copy > PAGE_SIZE)
+					copy = PAGE_SIZE;
+				page = alloc_pages(sk->sk_allocation, 0);
+				if (page == NULL)  {
+					err = -ENOMEM;
+					goto error;
+				}
+				cork->page = page;
+				cork->off = 0;
+
+				skb_fill_page_desc(skb, i, page, 0, 0);
+				frag = &skb_shinfo(skb)->frags[i];
+			} else {
+				err = -EMSGSIZE;
+				goto error;
+			}
+			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+				err = -EFAULT;
+				goto error;
+			}
+			cork->off += copy;
+			frag->size += copy;
+			skb->len += copy;
+			skb->data_len += copy;
+			skb->truesize += copy;
+			atomic_add(copy, &sk->sk_wmem_alloc);
+		}
+		offset += copy;
+		length -= copy;
+	}
+
+	return 0;
+
+error:
+	cork->length -= length;	/* @length is what is still unqueued here */
+	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+	return err;
+}
+
+static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
+			 struct ipcm_cookie *ipc, struct rtable **rtp)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_rcu *opt = ipc->opt;
+	struct rtable *rt = *rtp;
+
+	/*
+	 * Set up @cork for a new datagram.  The route is checked before
+	 * anything is allocated: ip_make_skb() returns on error without
+	 * freeing cork->opt, so failing after the kmalloc() would leak it.
+	 */
+	if (unlikely(!rt))
+		return -EFAULT;
+
+	/* Copy the options into the cork, allocating the buffer on first use. */
+	if (opt) {
+		if (cork->opt == NULL) {
+			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
+					    sk->sk_allocation);
+			if (unlikely(cork->opt == NULL))
+				return -ENOBUFS;
+		}
+		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
+		cork->flags |= IPCORK_OPT;
+		cork->addr = ipc->addr;
+	}
+	/* We steal the reference to this route; the caller must not release it. */
+	*rtp = NULL;
+	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
+	cork->dst = &rt->dst;
+	cork->length = 0;
+	cork->tx_flags = ipc->tx_flags;
+	cork->page = NULL;
+	cork->off = 0;
+
+	return 0;
+}
+
+/*
+ *	ip_append_data() and ip_append_page() can make one large IP datagram
+ *	from many pieces of data. Each piece is held on the socket
+ *	until ip_push_pending_frames() is called. Each piece can be a page
+ *	or non-page data.
+ *
+ *	Not only UDP: other transport protocols - e.g. raw sockets - can
+ *	potentially use this interface.
+ *
+ *	LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk, struct flowi4 *fl4,
+		   int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+		   void *from, int length, int transhdrlen,
+		   struct ipcm_cookie *ipc, struct rtable **rtp,
+		   unsigned int flags)
+{
+	struct inet_cork *cork = &inet_sk(sk)->cork.base;
+	int err;
+
+	if (flags & MSG_PROBE)
+		return 0;
+
+	if (!skb_queue_empty(&sk->sk_write_queue)) {
+		/* Data already queued: transhdrlen is forced to 0. */
+		transhdrlen = 0;
+	} else if ((err = ip_setup_cork(sk, cork, ipc, rtp)) != 0) {
+		/* Empty queue, but the cork could not be set up. */
+		return err;
+	}
+
+	return __ip_append_data(sk, fl4, &sk->sk_write_queue, cork, getfrag,
+				from, length, transhdrlen, flags);
+}
+
+ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+		       int offset, size_t size, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	struct rtable *rt;
+	struct ip_options *opt = NULL;
+	struct inet_cork *cork;
+	int hh_len;
+	int mtu;
+	int len;
+	int err;
+	unsigned int maxfraglen, fragheaderlen, fraggap;
+	/* Append @size bytes of @page at @offset to the pending datagram. */
+	if (inet->hdrincl)
+		return -EPERM;
+
+	if (flags&MSG_PROBE)
+		return 0;
+	/* Pages can only be added to a datagram that is already queued. */
+	if (skb_queue_empty(&sk->sk_write_queue))
+		return -EINVAL;
+
+	cork = &inet->cork.base;
+	rt = (struct rtable *)cork->dst;
+	if (cork->flags & IPCORK_OPT)
+		opt = cork->opt;
+	/* The page is attached as an skb fragment; require NETIF_F_SG. */
+	if (!(rt->dst.dev->features&NETIF_F_SG))
+		return -EOPNOTSUPP;
+
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+	mtu = cork->fragsize;
+
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+	/* Refuse data that would push the datagram past 0xFFFF bytes. */
+	if (cork->length + size > 0xFFFF - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
+		return -EMSGSIZE;
+	}
+
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+		return -EINVAL;
+
+	cork->length += size;
+	if ((size + skb->len > mtu) &&
+	    (sk->sk_protocol == IPPROTO_UDP) &&
+	    (rt->dst.dev->features & NETIF_F_UFO)) {
+		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+	}
+
+	/* Attach the page piece by piece, opening a new skb when one is full. */
+	while (size > 0) {
+		int i;
+
+		if (skb_is_gso(skb))
+			len = size;
+		else {
+
+			/* Check if the remaining data fits into current packet. */
+			len = mtu - skb->len;
+			if (len < size)
+				len = maxfraglen - skb->len;
+		}
+		if (len <= 0) {
+			struct sk_buff *skb_prev;
+			int alloclen;
+
+			skb_prev = skb;
+			fraggap = skb_prev->len - maxfraglen;
+
+			alloclen = fragheaderlen + hh_len + fraggap + 15;
+			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+			if (unlikely(!skb)) {
+				err = -ENOBUFS;
+				goto error;
+			}
+
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+
+			/*
+			 *	Find where to start putting bytes.
+			 */
+			skb_put(skb, fragheaderlen + fraggap);
+			skb_reset_network_header(skb);
+			skb->transport_header = (skb->network_header +
+						 fragheaderlen);
+			if (fraggap) {
+				skb->csum = skb_copy_and_csum_bits(skb_prev,
+								   maxfraglen,
+						    skb_transport_header(skb),
+								   fraggap, 0);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				pskb_trim_unique(skb_prev, maxfraglen);
+			}
+
+			/*
+			 * Put the packet on the pending queue.
+			 */
+			__skb_queue_tail(&sk->sk_write_queue, skb);
+			continue;
+		}
+		/* Grow the last frag if skb_can_coalesce() allows, else add one. */
+		i = skb_shinfo(skb)->nr_frags;
+		if (len > size)
+			len = size;
+		if (skb_can_coalesce(skb, i, page, offset)) {
+			skb_shinfo(skb)->frags[i-1].size += len;
+		} else if (i < MAX_SKB_FRAGS) {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, len);
+		} else {
+			err = -EMSGSIZE;
+			goto error;
+		}
+		/* CHECKSUM_NONE: add the new bytes to skb->csum at skb->len. */
+		if (skb->ip_summed == CHECKSUM_NONE) {
+			__wsum csum;
+			csum = csum_page(page, offset, len);
+			skb->csum = csum_block_add(skb->csum, csum, skb->len);
+		}
+
+		skb->len += len;
+		skb->data_len += len;
+		skb->truesize += len;
+		atomic_add(len, &sk->sk_wmem_alloc);
+		offset += len;
+		size -= len;
+	}
+	return 0;
+
+error:
+	cork->length -= size;	/* @size is what is still unattached here */
+	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+	return err;
+}
+
+static void ip_cork_release(struct inet_cork *cork)
+{
+	dst_release(cork->dst);		/* release the cork's dst */
+	cork->dst = NULL;
+	kfree(cork->opt);		/* free the options copy, if any */
+	cork->opt = NULL;
+	cork->flags &= ~IPCORK_OPT;
+}
+
+/*
+ *	Combine all pending IP fragments on @queue into one IP datagram
+ *	and build its IP header; returns NULL if the queue is empty.
+ */
+struct sk_buff *__ip_make_skb(struct sock *sk,
+			      struct flowi4 *fl4,
+			      struct sk_buff_head *queue,
+			      struct inet_cork *cork)
+{
+	struct sk_buff *skb, *tmp_skb;
+	struct sk_buff **tail_skb;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ip_options *opt = NULL;
+	struct rtable *rt = (struct rtable *)cork->dst;
+	struct iphdr *iph;
+	__be16 df = 0;
+	__u8 ttl;
+
+	if ((skb = __skb_dequeue(queue)) == NULL)
+		goto out;
+	tail_skb = &(skb_shinfo(skb)->frag_list);
+
+	/* move skb->data to ip header from ext header */
+	if (skb->data < skb_network_header(skb))
+		__skb_pull(skb, skb_network_offset(skb));
+	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
+		__skb_pull(tmp_skb, skb_network_header_len(skb));
+		*tail_skb = tmp_skb;	/* chain onto the head skb's frag_list */
+		tail_skb = &(tmp_skb->next);
+		skb->len += tmp_skb->len;
+		skb->data_len += tmp_skb->len;
+		skb->truesize += tmp_skb->truesize;
+		tmp_skb->destructor = NULL;
+		tmp_skb->sk = NULL;
+	}
+
+	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
+	 * to fragment the frame generated here. No matter what transforms do
+	 * or how they change the size of the packet, it will come out.
+	 */
+	if (inet->pmtudisc < IP_PMTUDISC_DO)
+		skb->local_df = 1;
+
+	/* DF bit is set when we want to see DF on outgoing frames.
+	 * If local_df is set too, we still allow to fragment this frame
+	 * locally. */
+	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
+	    (skb->len <= dst_mtu(&rt->dst) &&
+	     ip_dont_fragment(sk, &rt->dst)))
+		df = htons(IP_DF);
+
+	if (cork->flags & IPCORK_OPT)
+		opt = cork->opt;
+
+	if (rt->rt_type == RTN_MULTICAST)
+		ttl = inet->mc_ttl;
+	else
+		ttl = ip_select_ttl(inet, &rt->dst);
+
+	iph = (struct iphdr *)skb->data;
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->tos = inet->tos;
+	iph->frag_off = df;
+	ip_select_ident(iph, &rt->dst, sk);
+	iph->ttl = ttl;
+	iph->protocol = sk->sk_protocol;
+	iph->saddr = fl4->saddr;
+	iph->daddr = fl4->daddr;
+
+	if (opt) {
+		iph->ihl += opt->optlen>>2;	/* optlen is bytes, ihl 32-bit words */
+		ip_options_build(skb, opt, cork->addr, rt, 0);
+	}
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+	/*
+	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
+	 * on dst refcount
+	 */
+	cork->dst = NULL;
+	skb_dst_set(skb, &rt->dst);
+
+	if (iph->protocol == IPPROTO_ICMP)
+		icmp_out_count(net, ((struct icmphdr *)
+			skb_transport_header(skb))->type);
+	/* cork->dst was cleared above, so this only drops the options copy. */
+	ip_cork_release(cork);
+out:
+	return skb;
+}
+
+/* Send @skb with ip_local_out(); positive codes go through net_xmit_errno(). */
+int ip_send_skb(struct sk_buff *skb)
+{
+	/* The namespace is read from skb->sk before the skb is handed off. */
+	struct net *net = sock_net(skb->sk);
+	int rc = ip_local_out(skb);
+
+	if (rc > 0)
+		rc = net_xmit_errno(rc);
+	/* Whatever error remains is counted as an output discard. */
+	if (rc != 0)
+		IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+
+	return rc;
+}
+
+/* Send the skb produced by ip_finish_skb(); returns 0 when there is none. */
+int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
+{
+	struct sk_buff *skb = ip_finish_skb(sk, fl4);
+
+	/*
+	 * No skb is not an error.  Otherwise Netfilter gets the whole,
+	 * not yet fragmented skb.
+	 */
+	return skb ? ip_send_skb(skb) : 0;
+}
+
+/*
+ *	Free every skb still sitting on @queue, then release @cork.
+ */
+static void __ip_flush_pending_frames(struct sock *sk,
+				      struct sk_buff_head *queue,
+				      struct inet_cork *cork)
+{
+	struct sk_buff *skb;
+
+	for (skb = __skb_dequeue_tail(queue); skb; skb = __skb_dequeue_tail(queue))
+		kfree_skb(skb);
+
+	ip_cork_release(cork);
+}
+
+void ip_flush_pending_frames(struct sock *sk)	/* flush sk's own write queue and cork */
+{
+	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
+}
+
+struct sk_buff *ip_make_skb(struct sock *sk,
+			    struct flowi4 *fl4,
+			    int getfrag(void *from, char *to, int offset,
+					int len, int odd, struct sk_buff *skb),
+			    void *from, int length, int transhdrlen,
+			    struct ipcm_cookie *ipc, struct rtable **rtp,
+			    unsigned int flags)
+{
+	struct sk_buff_head queue;
+	struct inet_cork cork;
+	int rc;
+
+	if (flags & MSG_PROBE)
+		return NULL;
+
+	/* Queue and cork are local: sk->sk_write_queue is not used here. */
+	__skb_queue_head_init(&queue);
+	cork.opt = NULL;
+	cork.addr = 0;
+	cork.flags = 0;
+	rc = ip_setup_cork(sk, &cork, ipc, rtp);
+	if (rc)
+		goto fail;
+
+	rc = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
+			      from, length, transhdrlen, flags);
+	if (!rc)
+		return __ip_make_skb(sk, fl4, &queue, &cork);
+	/* Appending failed: drop what was queued and release the cork. */
+	__ip_flush_pending_frames(sk, &queue, &cork);
+fail:
+	return ERR_PTR(rc);
+}
+
+/*
+ *	getfrag callback for kernel-space data: copy and checksum in one go.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset,
+			      int len, int odd, struct sk_buff *skb)
+{
+	const void *src = dptr + offset;
+	__wsum part = csum_partial_copy_nocheck(src, to, len, 0);
+
+	skb->csum = csum_block_add(skb->csum, part, odd);
+	return 0;
+}
+
+/*
+ *	Generic function to send a packet as reply to another packet.
+ *	Used to send TCP resets so far. ICMP should use this function too.
+ *
+ *	Should run single threaded per socket because it uses the sock
+ *	structure to pass arguments.
+ */
+void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
+		   struct ip_reply_arg *arg, unsigned int len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_data replyopts;
+	struct ipcm_cookie ipc;
+	struct flowi4 fl4;
+	struct rtable *rt = skb_rtable(skb);
+	/* Build the options for the reply; silently give up if that fails. */
+	if (ip_options_echo(&replyopts.opt.opt, skb))
+		return;
+
+	ipc.addr = daddr;
+	ipc.opt = NULL;
+	ipc.tx_flags = 0;
+
+	if (replyopts.opt.opt.optlen) {
+		ipc.opt = &replyopts.opt;
+
+		if (replyopts.opt.opt.srr)
+			daddr = replyopts.opt.opt.faddr;
+	}
+	/* Look up the output route for the reply; no route, no reply. */
+	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
+			   RT_TOS(ip_hdr(skb)->tos),
+			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+			   ip_reply_arg_flowi_flags(arg),
+			   daddr, rt->rt_spec_dst,
+			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_key(sock_net(sk), &fl4);
+	if (IS_ERR(rt))
+		return;
+
+	/* And let IP do all the hard work.
+
+	   This chunk is not reentrant, hence the spinlock.
+	   Note that it relies on the fact that this function is called
+	   with BH disabled locally and that sk cannot already be spinlocked.
+	 */
+	bh_lock_sock(sk);
+	inet->tos = ip_hdr(skb)->tos;
+	sk->sk_priority = skb->priority;
+	sk->sk_protocol = ip_hdr(skb)->protocol;
+	sk->sk_bound_dev_if = arg->bound_dev_if;
+	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+		       &ipc, &rt, MSG_DONTWAIT);
+	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+		if (arg->csumoffset >= 0)
+			*((__sum16 *)skb_transport_header(skb) +
+			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
+								arg->csum));
+		skb->ip_summed = CHECKSUM_NONE;
+		ip_push_pending_frames(sk, &fl4);
+	}
+
+	bh_unlock_sock(sk);
+	/* rt is NULL by now if ip_setup_cork() stole the route reference. */
+	ip_rt_put(rt);
+}
+
+void __init ip_init(void)
+{
+	ip_rt_init();
+	inet_initpeers();
+	/* Only built with both IP multicast and procfs support configured. */
+#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
+	igmp_mc_proc_init();
+#endif
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 00000000..ab0c9efd
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1352 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP to API glue.
+ *
+ * Authors:	see ip.c
+ *
+ * Fixes:
+ *		Many		:	Split from ip.c , see ip.c for history.
+ *		Martin Mares	:	TOS setting fixed.
+ *		Alan Cox	:	Fixed a couple of oopses in Martin's
+ *					TOS tweaks.
+ *		Mike McLagan	:	Routing by source
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp_states.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+#define IP_CMSG_PKTINFO		1
+#define IP_CMSG_TTL		2
+#define IP_CMSG_TOS		4
+#define IP_CMSG_RECVOPTS	8
+#define IP_CMSG_RETOPTS		16
+#define IP_CMSG_PASSSEC		32
+#define IP_CMSG_ORIGDSTADDR     64
+
+/*
+ *	SOL_IP control messages.
+ */
+
+/*
+ * IP_PKTINFO: report the packet's destination address together with
+ * rt_iif and rt_spec_dst of the route attached to the skb, if any.
+ */
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+	const struct rtable *route = skb_rtable(skb);
+	struct in_pktinfo pkti;
+
+	/* Without a route both route-derived fields are reported as 0. */
+	pkti.ipi_ifindex = route ? route->rt_iif : 0;
+	pkti.ipi_spec_dst.s_addr = route ? route->rt_spec_dst : 0;
+	pkti.ipi_addr.s_addr = ip_hdr(skb)->daddr;
+
+	put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(pkti), &pkti);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+	int hops = ip_hdr(skb)->ttl;	/* IP_TTL is delivered as an int */
+	put_cmsg(msg, SOL_IP, IP_TTL, sizeof(hops), &hops);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+	put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos);	/* one byte, straight from the header */
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+	int optlen = IPCB(skb)->opt.optlen;
+
+	/* The options are read from right behind the fixed struct iphdr. */
+	if (optlen)
+		put_cmsg(msg, SOL_IP, IP_RECVOPTS, optlen, ip_hdr(skb) + 1);
+}
+
+
+static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+	/* struct ip_options followed by room for 40 bytes of option data */
+	unsigned char storage[sizeof(struct ip_options) + 40];
+	struct ip_options *echo = (struct ip_options *)storage;
+
+	if (!IPCB(skb)->opt.optlen)
+		return;
+
+	if (!ip_options_echo(echo, skb)) {
+		ip_options_undo(echo);
+		put_cmsg(msg, SOL_IP, IP_RETOPTS, echo->optlen, echo->__data);
+	} else {
+		msg->msg_flags |= MSG_CTRUNC;	/* echo failed: flag truncation */
+	}
+}
+
+/* SCM_SECURITY: pass the security context looked up for @skb to user space. */
+static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
+{
+	u32 secid, ctxlen;
+	char *ctx;
+
+	/* If either lookup fails, no control message is emitted. */
+	if (security_socket_getpeersec_dgram(NULL, skb, &secid))
+		return;
+	if (security_secid_to_secctx(secid, &ctx, &ctxlen))
+		return;
+
+	put_cmsg(msg, SOL_IP, SCM_SECURITY, ctxlen, ctx);
+
+	/* Hand the context back once it has been copied out. */
+	security_release_secctx(ctx, ctxlen);
+}
+
+/*
+ * IP_ORIGDSTADDR: report the destination address and port of the packet.
+ * Written on the assumption that all current transport protocols keep
+ * their port numbers in the first four bytes of the transport header.
+ */
+static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+	const __be16 *ports = (const __be16 *)skb_transport_header(skb);
+	struct sockaddr_in dst;
+
+	/* Give up unless those four bytes lie within the packet. */
+	if (skb_transport_offset(skb) + 4 > skb->len)
+		return;
+
+	memset(dst.sin_zero, 0, sizeof(dst.sin_zero));
+	dst.sin_family = AF_INET;
+	dst.sin_port = ports[1];
+	dst.sin_addr.s_addr = ip_hdr(skb)->daddr;
+
+	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(dst), &dst);
+}
+
+/*
+ * ip_cmsg_recv - emit the SOL_IP control messages a socket asked for
+ * @msg: message header the control messages are added to
+ * @skb: received packet; skb->sk supplies the cmsg_flags bitmask
+ *
+ * Every IP_CMSG_* bit set in cmsg_flags selects one helper, which
+ * adds its message with put_cmsg().  The bits are tested from the
+ * lowest upwards, an order chosen by supposed usage frequency.  Bits
+ * above IP_CMSG_ORIGDSTADDR are ignored.
+ */
+void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+{
+	const unsigned int wanted = inet_sk(skb->sk)->cmsg_flags;
+
+	/* Nothing requested, nothing to do. */
+	if (!wanted)
+		return;
+
+	/* One test per flag bit, lowest bit first. */
+	if (wanted & IP_CMSG_PKTINFO)
+		ip_cmsg_recv_pktinfo(msg, skb);
+
+	if (wanted & IP_CMSG_TTL)
+		ip_cmsg_recv_ttl(msg, skb);
+
+	if (wanted & IP_CMSG_TOS)
+		ip_cmsg_recv_tos(msg, skb);
+
+	if (wanted & IP_CMSG_RECVOPTS)
+		ip_cmsg_recv_opts(msg, skb);
+
+	if (wanted & IP_CMSG_RETOPTS)
+		ip_cmsg_recv_retopts(msg, skb);
+
+	if (wanted & IP_CMSG_PASSSEC)
+		ip_cmsg_recv_security(msg, skb);
+
+	if (wanted & IP_CMSG_ORIGDSTADDR)
+		ip_cmsg_recv_dstaddr(msg, skb);
+}
+EXPORT_SYMBOL(ip_cmsg_recv);
+
+/* Parse the SOL_IP control messages of @msg into @ipc; 0 or a negative errno. */
+int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
+{
+	struct cmsghdr *cmsg;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		struct in_pktinfo *pkti;
+		int optlen, err;
+
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+		if (cmsg->cmsg_level != SOL_IP)
+			continue;	/* other levels are skipped, not rejected */
+
+		if (cmsg->cmsg_type == IP_RETOPTS) {
+			/* payload length, capped at 40 bytes of options */
+			optlen = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+			if (optlen > 40)
+				optlen = 40;
+			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), optlen);
+			if (err)
+				return err;
+		} else if (cmsg->cmsg_type == IP_PKTINFO) {
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+				return -EINVAL;
+			pkti = (struct in_pktinfo *)CMSG_DATA(cmsg);
+			ipc->oif = pkti->ipi_ifindex;
+			ipc->addr = pkti->ipi_spec_dst.s_addr;
+		} else {
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+
+/* Special input handler for packets caught by the router alert option.
+   They are selected only by the protocol field and then processed like
+   local ones, but only if someone wants them! Otherwise, a router
+   not running rsvpd would kill RSVP.
+
+   What is done with them is a user-level problem.
+   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
+   but the receiver should be clever enough, e.g., to forward mtrace requests
+   sent to a multicast group so that they reach the destination designated router.
+ */
+struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
+
+
+/* RCU callback: put the socket saved in the entry, then free the entry. */
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+	struct ip_ra_chain *entry = container_of(head, struct ip_ra_chain, rcu);
+	sock_put(entry->saved_sk);
+	kfree(entry);
+}
+
+int ip_ra_control(struct sock *sk, unsigned char on,
+		  void (*destructor)(struct sock *))
+{
+	struct ip_ra_chain *ra, *new_ra;
+	struct ip_ra_chain __rcu **rap;
+	/* Only SOCK_RAW sockets whose protocol is not IPPROTO_RAW qualify. */
+	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
+		return -EINVAL;
+	/* Allocate the new entry (when registering) before taking the lock. */
+	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+	/* Search the chain for sk; rap ends at the tail link if not found. */
+	spin_lock_bh(&ip_ra_lock);
+	for (rap = &ip_ra_chain;
+	     (ra = rcu_dereference_protected(*rap,
+			lockdep_is_held(&ip_ra_lock))) != NULL;
+	     rap = &ra->next) {
+		if (ra->sk == sk) {
+			if (on) {
+				spin_unlock_bh(&ip_ra_lock);
+				kfree(new_ra);
+				return -EADDRINUSE;
+			}
+			/* dont let ip_call_ra_chain() use sk again */
+			ra->sk = NULL;
+			rcu_assign_pointer(*rap, ra->next);
+			spin_unlock_bh(&ip_ra_lock);
+
+			if (ra->destructor)
+				ra->destructor(sk);
+			/*
+			 * Delay sock_put(sk) and kfree(ra) after one rcu grace
+			 * period. This guarantee ip_call_ra_chain() dont need
+			 * to mess with socket refcounts.
+			 */
+			ra->saved_sk = sk;
+			call_rcu(&ra->rcu, ip_ra_destroy_rcu);
+			return 0;
+		}
+	}
+	if (new_ra == NULL) {	/* kmalloc failed, or "off" for an unlisted sk */
+		spin_unlock_bh(&ip_ra_lock);
+		return -ENOBUFS;
+	}
+	new_ra->sk = sk;
+	new_ra->destructor = destructor;
+	/* ra is NULL here (the loop ran off the end): append at the tail. */
+	new_ra->next = ra;
+	rcu_assign_pointer(*rap, new_ra);
+	sock_hold(sk);
+	spin_unlock_bh(&ip_ra_lock);
+
+	return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+		   __be16 port, u32 info, u8 *payload)
+{
+	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+	struct sock_exterr_skb *serr;
+	struct iphdr *inner;	/* the iphdr located right after the ICMP header */
+
+	if (!clone)
+		return;
+	inner = (struct iphdr *)(icmp_hdr(clone) + 1);
+	serr = SKB_EXT_ERR(clone);
+	serr->port = port;
+	serr->addr_offset = (u8 *)&inner->daddr - skb_network_header(clone);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+	serr->ee.ee_type = icmp_hdr(clone)->type;
+	serr->ee.ee_code = icmp_hdr(clone)->code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	/* If queueing succeeds the clone is kept; otherwise it is freed here. */
+	if (skb_pull(clone, payload - clone->data)) {
+		skb_reset_transport_header(clone);
+		if (!sock_queue_err_skb(sk, clone))
+			return;
+	}
+	kfree_skb(clone);
+}
+
+/* Queue a locally generated error for @daddr/@port on @sk if recverr is set. */
+void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	if (!inet_sk(sk)->recverr)
+		return;
+
+	skb = alloc_skb(sizeof(*iph), GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_put(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	iph->daddr = daddr;
+
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+	serr->ee.ee_type = 0;
+	serr->ee.ee_code = 0;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+	serr->port = port;
+
+	/* Pull everything that was put, so the queued skb has no payload. */
+	__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+	skb_reset_transport_header(skb);
+	if (sock_queue_err_skb(sk, skb))
+		kfree_skb(skb);
+}
+
+/*
+ *	Handle MSG_ERRQUEUE: dequeue one queued error, copy its payload and an
+ *	IP_RECVERR cmsg to @msg.  Returns bytes copied or a negative errno.
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb, *skb2;
+	struct sockaddr_in *sin;
+	struct {
+		struct sock_extended_err ee;
+		struct sockaddr_in	 offender;
+	} errhdr;
+	int err;
+	int copied;
+
+	err = -EAGAIN;
+	skb = skb_dequeue(&sk->sk_error_queue);
+	if (skb == NULL)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto out_free_skb;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	serr = SKB_EXT_ERR(skb);
+
+	sin = (struct sockaddr_in *)msg->msg_name;
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) +
+						   serr->addr_offset);
+		sin->sin_port = serr->port;
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+
+	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+	/* errhdr goes to user space whole: zero offender (AF_UNSPEC is 0) */
+	sin = &errhdr.offender;
+	memset(sin, 0, sizeof(*sin));
+	if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		if (inet->cmsg_flags)
+			ip_cmsg_recv(msg, skb);
+	}
+
+	put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+	/* Now we could try to dump offended packet options */
+
+	msg->msg_flags |= MSG_ERRQUEUE;
+	err = copied;
+
+	/* Reset and regenerate socket error */
+	spin_lock_bh(&sk->sk_error_queue.lock);
+	sk->sk_err = 0;
+	skb2 = skb_peek(&sk->sk_error_queue);
+	if (skb2 != NULL) {
+		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+		sk->sk_error_report(sk);
+	} else
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+
+out_free_skb:
+	kfree_skb(skb);
+out:
+	return err;
+}
+
+/* RCU callback (queued by IP_OPTIONS in do_ip_setsockopt()): free the old options */
+static void opt_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ip_options_rcu, rcu));
+}
+
+/*
+ *	Socket option code for IP. This is the end of the line after any
+ *	TCP,UDP etc options on an IP socket.  Returns 0 or a negative errno.
+ */
+static int do_ip_setsockopt(struct sock *sk, int level,
+			    int optname, char __user *optval, unsigned int optlen)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int val = 0, err;
+
+	/* 1<<optname is undefined unless 0 <= optname < 32: test the range first */
+	if (((unsigned int)optname < 32 &&
+	     ((1U<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
+			       (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
+			       (1<<IP_RETOPTS) | (1<<IP_TOS) |
+			       (1<<IP_TTL) | (1<<IP_HDRINCL) |
+			       (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
+			       (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
+			       (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
+			       (1<<IP_MINTTL) | (1<<IP_NODEFRAG)))) ||
+	    optname == IP_MULTICAST_TTL ||
+	    optname == IP_MULTICAST_ALL ||
+	    optname == IP_MULTICAST_LOOP ||
+	    optname == IP_RECVORIGDSTADDR) {
+		if (optlen >= sizeof(int)) {
+			if (get_user(val, (int __user *) optval))
+				return -EFAULT;
+		} else if (optlen >= sizeof(char)) {
+			unsigned char ucval;
+
+			if (get_user(ucval, (unsigned char __user *) optval))
+				return -EFAULT;
+			val = (int) ucval;
+		}
+	}
+
+	/* If optlen==0, it is equivalent to val == 0 */
+
+	if (ip_mroute_opt(optname))
+		return ip_mroute_setsockopt(sk, optname, optval, optlen);
+
+	err = 0;
+	lock_sock(sk);
+
+	switch (optname) {
+	case IP_OPTIONS:
+	{
+		struct ip_options_rcu *old, *opt = NULL;
+
+		if (optlen > 40)
+			goto e_inval;
+		err = ip_options_get_from_user(sock_net(sk), &opt,
+					       optval, optlen);
+		if (err)
+			break;
+		old = rcu_dereference_protected(inet->inet_opt,
+						sock_owned_by_user(sk));
+		if (inet->is_icsk) {
+			struct inet_connection_sock *icsk = inet_csk(sk);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			if (sk->sk_family == PF_INET ||
+			    (!((1 << sk->sk_state) &
+			       (TCPF_LISTEN | TCPF_CLOSE)) &&
+			     inet->inet_daddr != LOOPBACK4_IPV6)) {
+#endif
+				if (old)
+					icsk->icsk_ext_hdr_len -= old->opt.optlen;
+				if (opt)
+					icsk->icsk_ext_hdr_len += opt->opt.optlen;
+				icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			}
+#endif
+		}
+		rcu_assign_pointer(inet->inet_opt, opt);
+		if (old)
+			call_rcu(&old->rcu, opt_kfree_rcu);
+		break;
+	}
+	case IP_PKTINFO:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_PKTINFO;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+		break;
+	case IP_RECVTTL:
+		if (val)
+			inet->cmsg_flags |=  IP_CMSG_TTL;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_TTL;
+		break;
+	case IP_RECVTOS:
+		if (val)
+			inet->cmsg_flags |=  IP_CMSG_TOS;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_TOS;
+		break;
+	case IP_RECVOPTS:
+		if (val)
+			inet->cmsg_flags |=  IP_CMSG_RECVOPTS;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+		break;
+	case IP_RETOPTS:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_RETOPTS;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+		break;
+	case IP_PASSSEC:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_PASSSEC;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
+		break;
+	case IP_RECVORIGDSTADDR:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
+		break;
+	case IP_TOS:	/* This sets both TOS and Precedence */
+		if (sk->sk_type == SOCK_STREAM) {
+			val &= ~3;
+			val |= inet->tos & 3;
+		}
+		if (inet->tos != val) {
+			inet->tos = val;
+			sk->sk_priority = rt_tos2priority(val);
+			sk_dst_reset(sk);
+		}
+		break;
+	case IP_TTL:
+		if (optlen < 1)
+			goto e_inval;
+		if (val != -1 && (val < 0 || val > 255))
+			goto e_inval;
+		inet->uc_ttl = val;
+		break;
+	case IP_HDRINCL:
+		if (sk->sk_type != SOCK_RAW) {
+			err = -ENOPROTOOPT;
+			break;
+		}
+		inet->hdrincl = val ? 1 : 0;
+		break;
+	case IP_NODEFRAG:
+		if (sk->sk_type != SOCK_RAW) {
+			err = -ENOPROTOOPT;
+			break;
+		}
+		inet->nodefrag = val ? 1 : 0;
+		break;
+	case IP_MTU_DISCOVER:
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
+			goto e_inval;
+		inet->pmtudisc = val;
+		break;
+	case IP_RECVERR:
+		inet->recverr = !!val;
+		if (!val)
+			skb_queue_purge(&sk->sk_error_queue);
+		break;
+	case IP_MULTICAST_TTL:
+		if (sk->sk_type == SOCK_STREAM)
+			goto e_inval;
+		if (optlen < 1)
+			goto e_inval;
+		if (val == -1)
+			val = 1;
+		if (val < 0 || val > 255)
+			goto e_inval;
+		inet->mc_ttl = val;
+		break;
+	case IP_MULTICAST_LOOP:
+		if (optlen < 1)
+			goto e_inval;
+		inet->mc_loop = !!val;
+		break;
+	case IP_MULTICAST_IF:
+	{
+		struct ip_mreqn mreq;
+		struct net_device *dev = NULL;
+
+		if (sk->sk_type == SOCK_STREAM)
+			goto e_inval;
+		/*
+		 *	Check the arguments are allowable
+		 */
+
+		if (optlen < sizeof(struct in_addr))
+			goto e_inval;
+
+		err = -EFAULT;
+		if (optlen >= sizeof(struct ip_mreqn)) {
+			if (copy_from_user(&mreq, optval, sizeof(mreq)))
+				break;
+		} else {
+			memset(&mreq, 0, sizeof(mreq));
+			if (optlen >= sizeof(struct in_addr) &&
+			    copy_from_user(&mreq.imr_address, optval,
+					   sizeof(struct in_addr)))
+				break;
+		}
+
+		if (!mreq.imr_ifindex) {
+			if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
+				inet->mc_index = 0;
+				inet->mc_addr  = 0;
+				err = 0;
+				break;
+			}
+			dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
+			if (dev)
+				mreq.imr_ifindex = dev->ifindex;
+		} else
+			dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
+
+		err = -EADDRNOTAVAIL;
+		if (!dev)
+			break;
+		dev_put(dev);
+
+		err = -EINVAL;
+		if (sk->sk_bound_dev_if &&
+		    mreq.imr_ifindex != sk->sk_bound_dev_if)
+			break;
+
+		inet->mc_index = mreq.imr_ifindex;
+		inet->mc_addr  = mreq.imr_address.s_addr;
+		err = 0;
+		break;
+	}
+
+	case IP_ADD_MEMBERSHIP:
+	case IP_DROP_MEMBERSHIP:
+	{
+		struct ip_mreqn mreq;
+
+		err = -EPROTO;
+		if (inet_sk(sk)->is_icsk)
+			break;
+
+		if (optlen < sizeof(struct ip_mreq))
+			goto e_inval;
+		err = -EFAULT;
+		if (optlen >= sizeof(struct ip_mreqn)) {
+			if (copy_from_user(&mreq, optval, sizeof(mreq)))
+				break;
+		} else {
+			memset(&mreq, 0, sizeof(mreq));
+			if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
+				break;
+		}
+
+		if (optname == IP_ADD_MEMBERSHIP)
+			err = ip_mc_join_group(sk, &mreq);
+		else
+			err = ip_mc_leave_group(sk, &mreq);
+		break;
+	}
+	case IP_MSFILTER:
+	{
+		struct ip_msfilter *msf;
+
+		if (optlen < IP_MSFILTER_SIZE(0))
+			goto e_inval;
+		if (optlen > sysctl_optmem_max) {
+			err = -ENOBUFS;
+			break;
+		}
+		msf = kmalloc(optlen, GFP_KERNEL);
+		if (!msf) {
+			err = -ENOBUFS;
+			break;
+		}
+		err = -EFAULT;
+		if (copy_from_user(msf, optval, optlen)) {
+			kfree(msf);
+			break;
+		}
+		/* numsrc >= (1G-4) overflow in 32 bits */
+		if (msf->imsf_numsrc >= 0x3ffffffcU ||
+		    msf->imsf_numsrc > sysctl_igmp_max_msf) {
+			kfree(msf);
+			err = -ENOBUFS;
+			break;
+		}
+		if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+			kfree(msf);
+			err = -EINVAL;
+			break;
+		}
+		err = ip_mc_msfilter(sk, msf, 0);
+		kfree(msf);
+		break;
+	}
+	case IP_BLOCK_SOURCE:
+	case IP_UNBLOCK_SOURCE:
+	case IP_ADD_SOURCE_MEMBERSHIP:
+	case IP_DROP_SOURCE_MEMBERSHIP:
+	{
+		struct ip_mreq_source mreqs;
+		int omode, add;
+
+		if (optlen != sizeof(struct ip_mreq_source))
+			goto e_inval;
+		if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+			err = -EFAULT;
+			break;
+		}
+		if (optname == IP_BLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 1;
+		} else if (optname == IP_UNBLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 0;
+		} else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+			struct ip_mreqn mreq;
+
+			mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+			mreq.imr_address.s_addr = mreqs.imr_interface;
+			mreq.imr_ifindex = 0;
+			err = ip_mc_join_group(sk, &mreq);
+			if (err && err != -EADDRINUSE)
+				break;
+			omode = MCAST_INCLUDE;
+			add = 1;
+		} else /* IP_DROP_SOURCE_MEMBERSHIP */ {
+			omode = MCAST_INCLUDE;
+			add = 0;
+		}
+		err = ip_mc_source(add, omode, sk, &mreqs, 0);
+		break;
+	}
+	case MCAST_JOIN_GROUP:
+	case MCAST_LEAVE_GROUP:
+	{
+		struct group_req greq;
+		struct sockaddr_in *psin;
+		struct ip_mreqn mreq;
+
+		if (optlen < sizeof(struct group_req))
+			goto e_inval;
+		err = -EFAULT;
+		if (copy_from_user(&greq, optval, sizeof(greq)))
+			break;
+		psin = (struct sockaddr_in *)&greq.gr_group;
+		if (psin->sin_family != AF_INET)
+			goto e_inval;
+		memset(&mreq, 0, sizeof(mreq));
+		mreq.imr_multiaddr = psin->sin_addr;
+		mreq.imr_ifindex = greq.gr_interface;
+
+		if (optname == MCAST_JOIN_GROUP)
+			err = ip_mc_join_group(sk, &mreq);
+		else
+			err = ip_mc_leave_group(sk, &mreq);
+		break;
+	}
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+	{
+		struct group_source_req greqs;
+		struct ip_mreq_source mreqs;
+		struct sockaddr_in *psin;
+		int omode, add;
+
+		if (optlen != sizeof(struct group_source_req))
+			goto e_inval;
+		if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+			err = -EFAULT;
+			break;
+		}
+		if (greqs.gsr_group.ss_family != AF_INET ||
+		    greqs.gsr_source.ss_family != AF_INET) {
+			err = -EADDRNOTAVAIL;
+			break;
+		}
+		psin = (struct sockaddr_in *)&greqs.gsr_group;
+		mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+		psin = (struct sockaddr_in *)&greqs.gsr_source;
+		mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+		mreqs.imr_interface = 0; /* use index for mc_source */
+
+		if (optname == MCAST_BLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 1;
+		} else if (optname == MCAST_UNBLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 0;
+		} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+			struct ip_mreqn mreq;
+
+			psin = (struct sockaddr_in *)&greqs.gsr_group;
+			mreq.imr_multiaddr = psin->sin_addr;
+			mreq.imr_address.s_addr = 0;
+			mreq.imr_ifindex = greqs.gsr_interface;
+			err = ip_mc_join_group(sk, &mreq);
+			if (err && err != -EADDRINUSE)
+				break;
+			greqs.gsr_interface = mreq.imr_ifindex;
+			omode = MCAST_INCLUDE;
+			add = 1;
+		} else /* MCAST_LEAVE_SOURCE_GROUP */ {
+			omode = MCAST_INCLUDE;
+			add = 0;
+		}
+		err = ip_mc_source(add, omode, sk, &mreqs,
+				   greqs.gsr_interface);
+		break;
+	}
+	case MCAST_MSFILTER:
+	{
+		struct sockaddr_in *psin;
+		struct ip_msfilter *msf = NULL;
+		struct group_filter *gsf = NULL;
+		int msize, i, ifindex;
+
+		if (optlen < GROUP_FILTER_SIZE(0))
+			goto e_inval;
+		if (optlen > sysctl_optmem_max) {
+			err = -ENOBUFS;
+			break;
+		}
+		gsf = kmalloc(optlen, GFP_KERNEL);
+		if (!gsf) {
+			err = -ENOBUFS;
+			break;
+		}
+		err = -EFAULT;
+		if (copy_from_user(gsf, optval, optlen))
+			goto mc_msf_out;
+
+		/* numsrc >= (4G-140)/128 overflow in 32 bits */
+		if (gsf->gf_numsrc >= 0x1ffffff ||
+		    gsf->gf_numsrc > sysctl_igmp_max_msf) {
+			err = -ENOBUFS;
+			goto mc_msf_out;
+		}
+		if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+			err = -EINVAL;
+			goto mc_msf_out;
+		}
+		msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+		msf = kmalloc(msize, GFP_KERNEL);
+		if (!msf) {
+			err = -ENOBUFS;
+			goto mc_msf_out;
+		}
+		ifindex = gsf->gf_interface;
+		psin = (struct sockaddr_in *)&gsf->gf_group;
+		if (psin->sin_family != AF_INET) {
+			err = -EADDRNOTAVAIL;
+			goto mc_msf_out;
+		}
+		msf->imsf_multiaddr = psin->sin_addr.s_addr;
+		msf->imsf_interface = 0;
+		msf->imsf_fmode = gsf->gf_fmode;
+		msf->imsf_numsrc = gsf->gf_numsrc;
+		err = -EADDRNOTAVAIL;
+		for (i = 0; i < gsf->gf_numsrc; ++i) {
+			psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+			if (psin->sin_family != AF_INET)
+				goto mc_msf_out;
+			msf->imsf_slist[i] = psin->sin_addr.s_addr;
+		}
+		kfree(gsf);
+		gsf = NULL;
+
+		err = ip_mc_msfilter(sk, msf, ifindex);
+mc_msf_out:
+		kfree(msf);
+		kfree(gsf);
+		break;
+	}
+	case IP_MULTICAST_ALL:
+		if (optlen < 1)
+			goto e_inval;
+		if (val != 0 && val != 1)
+			goto e_inval;
+		inet->mc_all = val;
+		break;
+	case IP_ROUTER_ALERT:
+		err = ip_ra_control(sk, val ? 1 : 0, NULL);
+		break;
+
+	case IP_FREEBIND:
+		if (optlen < 1)
+			goto e_inval;
+		inet->freebind = !!val;
+		break;
+
+	case IP_IPSEC_POLICY:
+	case IP_XFRM_POLICY:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		err = xfrm_user_policy(sk, optname, optval, optlen);
+		break;
+
+	case IP_TRANSPARENT:
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		if (optlen < 1)
+			goto e_inval;
+		inet->transparent = !!val;
+		break;
+
+	case IP_MINTTL:
+		if (optlen < 1)
+			goto e_inval;
+		if (val < 0 || val > 255)
+			goto e_inval;
+		inet->min_ttl = val;
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+	release_sock(sk);
+	return err;
+
+e_inval:
+	release_sock(sk);
+	return -EINVAL;
+}
+
+/**
+ * ip_queue_rcv_skb - Queue an skb into sock receive queue
+ * @sk: socket
+ * @skb: buffer
+ *
+ * Without IP_CMSG_PKTINFO, drop the dst entry now while its cache line is hot.
+ */
+int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)
+		return sock_queue_rcv_skb(sk, skb);
+	skb_dst_drop(skb);
+	return sock_queue_rcv_skb(sk, skb);
+}
+EXPORT_SYMBOL(ip_queue_rcv_skb);
+
+/* SOL_IP setsockopt entry; with CONFIG_NETFILTER unknown options go to netfilter */
+int ip_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, unsigned int optlen)
+{
+	int err;
+
+	if (level != SOL_IP)
+		return -ENOPROTOOPT;
+
+	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err != -ENOPROTOOPT || optname == IP_HDRINCL ||
+	    optname == IP_IPSEC_POLICY || optname == IP_XFRM_POLICY ||
+	    ip_mroute_opt(optname))
+		return err;
+	lock_sock(sk);
+	err = nf_setsockopt(sk, PF_INET, optname, optval, optlen);
+	release_sock(sk);
+#endif
+	return err;
+}
+EXPORT_SYMBOL(ip_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/* Compat flavour of ip_setsockopt(); MCAST_* options get translated first. */
+int compat_ip_setsockopt(struct sock *sk, int level, int optname,
+			 char __user *optval, unsigned int optlen)
+{
+	int err;
+
+	if (level != SOL_IP)
+		return -ENOPROTOOPT;
+
+	if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
+		return compat_mc_setsockopt(sk, level, optname, optval, optlen,
+			ip_setsockopt);
+
+	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err != -ENOPROTOOPT || optname == IP_HDRINCL ||
+	    optname == IP_IPSEC_POLICY || optname == IP_XFRM_POLICY ||
+	    ip_mroute_opt(optname))
+		return err;
+	lock_sock(sk);
+	err = compat_nf_setsockopt(sk, PF_INET, optname,
+				   optval, optlen);
+	release_sock(sk);
+#endif
+	return err;
+}
+EXPORT_SYMBOL(compat_ip_setsockopt);
+#endif
+
+/*
+ *	Get the options. Note for future reference: GET of IP options gets the
+ *	_received_ ones, the set sets the _sent_ ones.  Integer values are copied
+ *	out after the switch, as one byte when len < sizeof(int) and val is 0..255.
+ */
+static int do_ip_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int val;
+	int len;
+
+	if (level != SOL_IP)
+		return -EOPNOTSUPP;
+
+	if (ip_mroute_opt(optname))
+		return ip_mroute_getsockopt(sk, optname, optval, optlen);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case IP_OPTIONS:
+	{
+		unsigned char optbuf[sizeof(struct ip_options)+40];
+		struct ip_options *opt = (struct ip_options *)optbuf;
+		struct ip_options_rcu *inet_opt;
+
+		inet_opt = rcu_dereference_protected(inet->inet_opt,
+						     sock_owned_by_user(sk));
+		opt->optlen = 0;
+		if (inet_opt)
+			memcpy(optbuf, &inet_opt->opt,
+			       sizeof(struct ip_options) +
+			       inet_opt->opt.optlen);
+		release_sock(sk);
+
+		if (opt->optlen == 0)
+			return put_user(0, optlen);
+
+		ip_options_undo(opt);
+
+		len = min_t(unsigned int, len, opt->optlen);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, opt->__data, len))
+			return -EFAULT;
+		return 0;
+	}
+	case IP_PKTINFO:
+		val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+		break;
+	case IP_RECVTTL:
+		val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+		break;
+	case IP_RECVTOS:
+		val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+		break;
+	case IP_RECVOPTS:
+		val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+		break;
+	case IP_RETOPTS:
+		val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+		break;
+	case IP_PASSSEC:
+		val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
+		break;
+	case IP_RECVORIGDSTADDR:
+		val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
+		break;
+	case IP_TOS:
+		val = inet->tos;
+		break;
+	case IP_TTL:
+		val = (inet->uc_ttl == -1 ?
+		       sysctl_ip_default_ttl :
+		       inet->uc_ttl);
+		break;
+	case IP_HDRINCL:
+		val = inet->hdrincl;
+		break;
+	case IP_NODEFRAG:
+		val = inet->nodefrag;
+		break;
+	case IP_MTU_DISCOVER:
+		val = inet->pmtudisc;
+		break;
+	case IP_MTU:
+	{
+		struct dst_entry *dst;
+		val = 0;
+		dst = sk_dst_get(sk);
+		if (dst) {
+			val = dst_mtu(dst);
+			dst_release(dst);
+		}
+		if (!val) {
+			release_sock(sk);
+			return -ENOTCONN;
+		}
+		break;
+	}
+	case IP_RECVERR:
+		val = inet->recverr;
+		break;
+	case IP_MULTICAST_TTL:
+		val = inet->mc_ttl;
+		break;
+	case IP_MULTICAST_LOOP:
+		val = inet->mc_loop;
+		break;
+	case IP_MULTICAST_IF:
+	{
+		struct in_addr addr;
+		len = min_t(unsigned int, len, sizeof(struct in_addr));
+		addr.s_addr = inet->mc_addr;
+		release_sock(sk);
+
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &addr, len))
+			return -EFAULT;
+		return 0;
+	}
+	case IP_MSFILTER:
+	{
+		struct ip_msfilter msf;
+		int err;
+
+		if (len < IP_MSFILTER_SIZE(0)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+		if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+			release_sock(sk);
+			return -EFAULT;
+		}
+		err = ip_mc_msfget(sk, &msf,
+				   (struct ip_msfilter __user *)optval, optlen);
+		release_sock(sk);
+		return err;
+	}
+	case MCAST_MSFILTER:
+	{
+		struct group_filter gsf;
+		int err;
+
+		if (len < GROUP_FILTER_SIZE(0)) {
+			release_sock(sk);
+			return -EINVAL;
+		}
+		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+			release_sock(sk);
+			return -EFAULT;
+		}
+		err = ip_mc_gsfget(sk, &gsf,
+				   (struct group_filter __user *)optval,
+				   optlen);
+		release_sock(sk);
+		return err;
+	}
+	case IP_MULTICAST_ALL:
+		val = inet->mc_all;
+		break;
+	case IP_PKTOPTIONS:
+	{
+		struct msghdr msg;
+
+		release_sock(sk);
+
+		if (sk->sk_type != SOCK_STREAM)
+			return -ENOPROTOOPT;
+
+		msg.msg_control = optval;
+		msg.msg_controllen = len;
+		msg.msg_flags = 0;
+
+		if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+			struct in_pktinfo info;
+
+			info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+			info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
+			info.ipi_ifindex = inet->mc_index;
+			put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+		}
+		if (inet->cmsg_flags & IP_CMSG_TTL) {
+			int hlim = inet->mc_ttl;
+			put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+		}
+		len -= msg.msg_controllen;
+		return put_user(len, optlen);
+	}
+	case IP_FREEBIND:
+		val = inet->freebind;
+		break;
+	case IP_TRANSPARENT:
+		val = inet->transparent;
+		break;
+	case IP_MINTTL:
+		val = inet->min_ttl;
+		break;
+	default:
+		release_sock(sk);
+		return -ENOPROTOOPT;
+	}
+	release_sock(sk);
+
+	if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
+		unsigned char ucval = (unsigned char)val;
+		len = 1;
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &ucval, 1))
+			return -EFAULT;
+	} else {
+		len = min_t(unsigned int, sizeof(int), len);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &val, len))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * getsockopt entry point for IP sockets.  With CONFIG_NETFILTER, options that
+ * do_ip_getsockopt() rejects with -ENOPROTOOPT are offered to netfilter.
+ */
+int ip_getsockopt(struct sock *sk, int level,
+		  int optname, char __user *optval, int __user *optlen)
+{
+	int err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	int len;
+
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err != -ENOPROTOOPT || optname == IP_PKTOPTIONS ||
+	    ip_mroute_opt(optname))
+		return err;
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+	err = nf_getsockopt(sk, PF_INET, optname, optval, &len);
+	release_sock(sk);
+	if (err >= 0)
+		err = put_user(len, optlen);
+#endif
+	return err;
+}
+EXPORT_SYMBOL(ip_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/* Compat flavour of ip_getsockopt(); MCAST_MSFILTER gets translated first. */
+int compat_ip_getsockopt(struct sock *sk, int level, int optname,
+			 char __user *optval, int __user *optlen)
+{
+	int err;
+
+	if (optname == MCAST_MSFILTER)
+		return compat_mc_getsockopt(sk, level, optname, optval, optlen,
+			ip_getsockopt);
+
+	err = do_ip_getsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* we need to exclude all possible ENOPROTOOPTs except default case */
+	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
+	    !ip_mroute_opt(optname)) {
+		int len;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
+		release_sock(sk);
+		if (err < 0)
+			return err;
+		return put_user(len, optlen);
+	}
+#endif
+	return err;
+}
+EXPORT_SYMBOL(compat_ip_getsockopt);
+#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 00000000..c857f6f4
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,185 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ *   - Tunable compression parameters.
+ *   - Compression stats.
+ *   - Adaptive compression.
+ */
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+
+/* ICMP error handler: on "fragmentation needed", look up the IPCOMP SA and log it */
+static void ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct ip_comp_hdr *comph = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_state *x;
+	__be32 spi;
+
+	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
+	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+		return;
+	spi = htonl(ntohs(comph->cpi));
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      spi, IPPROTO_COMP, AF_INET);
+	if (x) {
+		NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
+			 spi, &iph->daddr);
+		xfrm_state_put(x);
+	}
+}
+
+/*
+ * Allocate and initialise an IPPROTO_IPIP state derived from @x.  We always
+ * hold one tunnel user reference to indicate a tunnel.  Returns NULL on failure.
+ */
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+	struct xfrm_state *t;
+
+	t = xfrm_state_alloc(net);
+	if (!t)
+		return NULL;
+
+	t->id.proto = IPPROTO_IPIP;
+	t->id.spi = x->props.saddr.a4;
+	t->id.daddr.a4 = x->id.daddr.a4;
+	memcpy(&t->sel, &x->sel, sizeof(t->sel));
+	t->props.family = AF_INET;
+	t->props.mode = x->props.mode;
+	t->props.saddr.a4 = x->props.saddr.a4;
+	t->props.flags = x->props.flags;
+	memcpy(&t->mark, &x->mark, sizeof(t->mark));
+
+	if (xfrm_init_state(t)) {
+		/* Mark the half-built state dead before dropping it. */
+		t->km.state = XFRM_STATE_DEAD;
+		xfrm_state_put(t);
+		return NULL;
+	}
+
+	atomic_set(&t->tunnel_users, 1);
+	return t;
+}
+
+/*
+ * Must be protected by xfrm_cfg_mutex.  State and tunnel user references are
+ * always incremented on success.  Returns 0, or -EINVAL if no tunnel state
+ * could be created.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+	u32 mark = x->mark.v & x->mark.m;
+	struct xfrm_state *t;
+
+	t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
+			      x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+	if (!t) {
+		/* No matching IPIP state yet: create and insert one. */
+		t = ipcomp_tunnel_create(x);
+		if (!t)
+			return -EINVAL;
+
+		xfrm_state_insert(t);
+		xfrm_state_hold(t);
+	}
+
+	x->tunnel = t;
+	atomic_inc(&t->tunnel_users);
+	return 0;
+}
+
+/*
+ * Init hook for the IPCOMP4 xfrm type: set header_len for the mode, run
+ * ipcomp_init_state() and, in tunnel mode, ipcomp_tunnel_attach().
+ */
+static int ipcomp4_init_state(struct xfrm_state *x)
+{
+	int err;
+
+	switch (x->props.mode) {
+	case XFRM_MODE_TRANSPORT:
+		x->props.header_len = 0;
+		break;
+	case XFRM_MODE_TUNNEL:
+		x->props.header_len = sizeof(struct iphdr);
+		break;
+	default:
+		x->props.header_len = 0;
+		return -EINVAL;
+	}
+
+	err = ipcomp_init_state(x);
+	if (err)
+		return err;
+
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		return ipcomp_tunnel_attach(x);
+
+	return 0;
+}
+
+static const struct xfrm_type ipcomp_type = {
+	.description	= "IPCOMP4",
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_COMP,
+	.init_state	= ipcomp4_init_state,
+	.destructor	= ipcomp_destroy,
+	.input		= ipcomp_input,
+	.output		= ipcomp_output,
+};
+
+static const struct net_protocol ipcomp4_protocol = {
+	.handler	= xfrm4_rcv,
+	.err_handler	= ipcomp4_err,
+	.no_policy	= 1,
+};
+
+static int __init ipcomp4_init(void)
+{
+	if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+		printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) >= 0)
+		return 0;
+	/* Undo the type registration when the protocol cannot be added. */
+	printk(KERN_INFO "ipcomp init: can't add protocol\n");
+	xfrm_unregister_type(&ipcomp_type, AF_INET);
+	return -EAGAIN;
+}
+
+/* Module exit: undo ipcomp4_init() in reverse order; failures are only logged. */
+static void __exit ipcomp4_fini(void)
+{
+	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+		printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
+	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
+		printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+}
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 00000000..7fbcabaf
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1636 @@
+/*
+ *  Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ *  user-supplied information to configure own IP address and routes.
+ *
+ *  Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *  Derived from network configuration code in fs/nfs/nfsroot.c,
+ *  originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ *  BOOTP rewritten to construct and analyse packets itself instead
+ *  of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ *					     -- MJ, December 1998
+ *
+ *  Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ *  initialization scheme.
+ *	- Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ *  DHCP support added.  To users this looks like a whole separate
+ *  protocol, but we know it's just a bag on the side of BOOTP.
+ *		-- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ *  Ported DHCP support from 2.2.16 to 2.4.0-test4
+ *              -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ *  Merged changes from 2.2.19 into 2.4.3
+ *              -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ *  Multiple Nameservers in /proc/net/pnp
+ *              --  Josef Siemes <jsiemes@web.de>, Aug 2002
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <linux/nfs_fs.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+#include <net/route.h>
+
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_POST_OPEN		10	/* After opening: 10 msecs */
+#define CONF_CARRIER_TIMEOUT	120000	/* Wait for carrier timeout, in msecs (fed to msecs_to_jiffies) */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 	2	/* (Re)open devices twice */
+#define CONF_SEND_RETRIES 	6	/* Send six requests per open */
+#define CONF_INTER_TIMEOUT	(HZ/2)	/* Inter-device timeout: 1/2 second */
+#define CONF_BASE_TIMEOUT	(HZ*2)	/* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM	(HZ)	/* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT	*7/4	/* Rate of timeout growth; an operator suffix pasted after an expression */
+#define CONF_TIMEOUT_MAX	(HZ*30)	/* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX   3       /* Maximum number of nameservers
+					   - '3' from resolv.h */
+
+#define NONE cpu_to_be32(INADDR_NONE)	/* "not set" marker for the __be32 address variables below */
+#define ANY cpu_to_be32(INADDR_ANY)
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars.  If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0;		/* IPconfig parameters set manually */
+
+static int ic_enable __initdata = 0;		/* IP config enabled? */
+
+/* Protocol choice: bitmask that starts out with every protocol compiled in */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+			| IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+			| IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+			| IC_RARP
+#endif
+			;
+
+static int ic_host_name_set __initdata = 0;	/* Host name set by us? */
+
+__be32 ic_myaddr = NONE;		/* My IP address */
+static __be32 ic_netmask = NONE;	/* Netmask for local subnet */
+__be32 ic_gateway = NONE;	/* Gateway IP address */
+
+__be32 ic_servaddr = NONE;	/* Boot server IP address */
+
+__be32 root_server_addr = NONE;	/* Address of NFS server */
+u8 root_server_path[256] = { 0, };	/* Path to mount as root */
+
+u32 ic_dev_xid;		/* XID of the device under configuration (set by ic_dynamic, checked by ic_bootp_recv) */
+
+/* vendor class identifier; sent as DHCP option 60 when non-empty */
+static char vendor_class_identifier[253] __initdata;
+
+/* Persistent data (not __initdata; read later by pnp_seq_show): */
+
+static int ic_proto_used;			/* Protocol used, if any */
+static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static u8 ic_domain[64];		/* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device; empty means "any suitable device" */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata = 0;
+
+/* MTU for boot device; 0 means the MTU is left untouched */
+static int ic_dev_mtu __initdata = 0;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);	/* taken by the RARP and BOOTP receive handlers */
+static volatile int ic_got_reply __initdata = 0;    /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata = 0;	/* DHCP msg type received */
+#endif
+
+
+/*
+ *	Network devices
+ */
+
+struct ic_device {	/* one entry per device opened by ic_open_devs() */
+	struct ic_device *next;	/* singly linked list, NULL-terminated */
+	struct net_device *dev;
+	unsigned short flags;	/* dev->flags as they were before we set IFF_UP */
+	short able;	/* IC_BOOTP / IC_RARP bits usable on this device */
+	__be32 xid;	/* random BOOTP transaction ID, 0 when BOOTP is not usable */
+};
+
+static struct ic_device *ic_first_dev __initdata = NULL;/* List of opened devices */
+static struct net_device *ic_dev __initdata = NULL;	/* Selected device */
+
+static bool __init ic_is_init_dev(struct net_device *dev)
+{
+	/* The loopback interface is never a boot device candidate. */
+	if (dev->flags & IFF_LOOPBACK)
+		return false;
+	if (user_dev_name[0])	/* user named a device: exact match only */
+		return !strcmp(dev->name, user_dev_name);
+	return (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && strncmp(dev->name, "dummy", 5);
+}
+
+/* Open loopback and every candidate boot device (kept on the ic_first_dev
+ * list), then wait for a carrier.  Returns 0, -ENOMEM or -ENODEV.
+ */
+static int __init ic_open_devs(void)
+{
+	struct ic_device *d, **last;
+	struct net_device *dev;
+	unsigned short oflags;
+	unsigned long start;
+
+	last = &ic_first_dev;
+	rtnl_lock();
+
+	/* bring loopback device up first */
+	for_each_netdev(&init_net, dev) {
+		if (!(dev->flags & IFF_LOOPBACK))
+			continue;
+		if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+			printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+	}
+
+	for_each_netdev(&init_net, dev) {
+		if (ic_is_init_dev(dev)) {
+			int able = 0;
+			if (dev->mtu >= 364)
+				able |= IC_BOOTP;
+			else
+				printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small\n", dev->name, dev->mtu);
+			if (!(dev->flags & IFF_NOARP))
+				able |= IC_RARP;
+			able &= ic_proto_enabled;
+			if (ic_proto_enabled && !able)
+				continue;
+			oflags = dev->flags;
+			if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+				printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+				continue;
+			}
+			if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+				*last = NULL;	/* never leave the list unterminated */
+				rtnl_unlock();
+				return -ENOMEM;
+			}
+			d->dev = dev;
+			*last = d;
+			last = &d->next;
+			d->flags = oflags;	/* restored by ic_close_devs() */
+			d->able = able;
+			if (able & IC_BOOTP)
+				get_random_bytes(&d->xid, sizeof(__be32));
+			else
+				d->xid = 0;
+			ic_proto_have_if |= able;
+			DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n", dev->name, able, d->xid));
+		}
+	}
+
+	/* no point in waiting if we could not bring up at least one device */
+	if (!ic_first_dev)
+		goto have_carrier;
+
+	/* wait for a carrier on at least one device */
+	start = jiffies;
+	while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
+		for_each_netdev(&init_net, dev)
+			if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
+				goto have_carrier;
+		msleep(1);
+	}
+have_carrier:
+	rtnl_unlock();
+	*last = NULL;
+	if (!ic_first_dev) {
+		if (user_dev_name[0])
+			printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+		else
+			printk(KERN_ERR "IP-Config: No network devices available.\n");
+		return -ENODEV;
+	}
+	return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+	struct ic_device *cur, *following;
+
+	rtnl_lock();
+	for (cur = ic_first_dev; cur; cur = following) {
+		struct net_device *ndev = cur->dev;
+
+		following = cur->next;	/* fetch before the node is freed */
+		/* Every device except the selected one gets its saved flags back. */
+		if (ndev != ic_dev) {
+			DBG(("IP-Config: Downing %s\n", ndev->name));
+			dev_change_flags(ndev, cur->flags);
+		}
+		kfree(cur);
+	}
+	rtnl_unlock();
+}
+
+/*
+ *	Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, __be32 addr, __be16 port)
+{
+	sin->sin_port = port;		/* addr and port are stored exactly as given */
+	sin->sin_addr.s_addr = addr;
+	sin->sin_family = AF_INET;
+}
+
+static int __init ic_devinet_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+	mm_segment_t saved_fs = get_fs();
+	int rc;
+
+	set_fs(get_ds());	/* arg is a kernel pointer handed to a __user interface */
+	rc = devinet_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+	set_fs(saved_fs);	/* restore the previous segment unconditionally */
+	return rc;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+	mm_segment_t saved_fs = get_fs();
+	int rc;
+
+	set_fs(get_ds());	/* arg is a kernel pointer handed to a __user interface */
+	rc = dev_ioctl(&init_net, cmd, (struct ifreq __user *) arg);
+	set_fs(saved_fs);	/* restore the previous segment unconditionally */
+	return rc;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+	mm_segment_t saved_fs = get_fs();
+	int rc;
+
+	set_fs(get_ds());	/* arg is a kernel pointer handed to a __user interface */
+	rc = ip_rt_ioctl(&init_net, cmd, (void __user *) arg);
+	set_fs(saved_fs);	/* restore the previous segment unconditionally */
+	return rc;
+}
+
+/*
+ *	Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+	struct ifreq req;
+	struct sockaddr_in *sa = (void *) &req.ifr_ifru.ifru_addr;
+	int rc;
+
+	memset(&req, 0, sizeof(req));
+	strcpy(req.ifr_ifrn.ifrn_name, ic_dev->name);
+	set_sockaddr(sa, ic_myaddr, 0);
+	rc = ic_devinet_ioctl(SIOCSIFADDR, &req);
+	if (rc < 0) {
+		printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", rc);
+		return -1;
+	}
+	set_sockaddr(sa, ic_netmask, 0);
+	rc = ic_devinet_ioctl(SIOCSIFNETMASK, &req);
+	if (rc < 0) {
+		printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", rc);
+		return -1;
+	}
+	set_sockaddr(sa, ic_myaddr | ~ic_netmask, 0);	/* all host bits set */
+	rc = ic_devinet_ioctl(SIOCSIFBRDADDR, &req);
+	if (rc < 0) {
+		printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", rc);
+		return -1;
+	}
+	/* A non-standard MTU (e.g. jumbo frames) is best effort: failure is only logged. */
+	if (ic_dev_mtu != 0) {
+		strcpy(req.ifr_name, ic_dev->name);
+		req.ifr_mtu = ic_dev_mtu;
+		rc = ic_dev_ioctl(SIOCSIFMTU, &req);
+		if (rc < 0)
+			printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n", ic_dev_mtu, rc);
+	}
+	return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+	struct rtentry route;
+	int rc;
+
+	/* No need to setup device routes, only the default route... */
+	if (ic_gateway == NONE)
+		return 0;
+	memset(&route, 0, sizeof(route));
+	/* Gateway and own address must agree on every netmask bit. */
+	if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+		printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+		return -1;
+	}
+	/* Destination 0 with mask 0, via ic_gateway. */
+	set_sockaddr((struct sockaddr_in *) &route.rt_dst, 0, 0);
+	set_sockaddr((struct sockaddr_in *) &route.rt_genmask, 0, 0);
+	set_sockaddr((struct sockaddr_in *) &route.rt_gateway, ic_gateway, 0);
+	route.rt_flags = RTF_UP | RTF_GATEWAY;
+	rc = ic_route_ioctl(SIOCADDRT, &route);
+	if (rc >= 0)
+		return 0;
+	printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", rc);
+	return -1;
+}
+
+/*
+ *	Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+	u32 host_addr = ntohl(ic_myaddr);
+
+	/*
+	 *	Userspace is not running yet, so the utsname data can be
+	 *	written without claiming any lock.
+	 */
+	if (!ic_host_name_set)
+		sprintf(init_utsname()->nodename, "%pI4", &ic_myaddr);
+
+	if (root_server_addr == NONE)
+		root_server_addr = ic_servaddr;
+
+	if (ic_netmask != NONE)
+		return 0;
+	/* No netmask supplied: guess one from the address class. */
+	if (IN_CLASSA(host_addr)) {
+		ic_netmask = htonl(IN_CLASSA_NET);
+	} else if (IN_CLASSB(host_addr)) {
+		ic_netmask = htonl(IN_CLASSB_NET);
+	} else if (IN_CLASSC(host_addr)) {
+		ic_netmask = htonl(IN_CLASSC_NET);
+	} else {
+		printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n", &ic_myaddr);
+		return -1;
+	}
+	printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
+	return 0;
+}
+
+/*
+ *	RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type rarp_packet_type __initdata = {	/* routes ETH_P_RARP frames to ic_rarp_recv() */
+	.type =	cpu_to_be16(ETH_P_RARP),
+	.func =	ic_rarp_recv,
+};
+
+static inline void __init ic_rarp_init(void)	/* register the RARP packet handler */
+{
+	dev_add_pack(&rarp_packet_type);
+}
+
+static inline void __init ic_rarp_cleanup(void)	/* unregister the RARP packet handler */
+{
+	dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ *  Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct arphdr *rarp;
+	unsigned char *rarp_ptr;
+	__be32 sip, tip;
+	unsigned char *sha, *tha;		/* s for "source", t for "target" */
+	struct ic_device *d;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+		goto drop;
+
+	/* Basic sanity checks can be done without the lock.  */
+	rarp = (struct arphdr *)skb_transport_header(skb);
+
+	/* If the hardware address length or hardware type does not
+	 * match this device, we should ignore it anyway.
+	 */
+	if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+		goto drop;
+
+	/* If it's not a RARP reply, delete it. */
+	if (rarp->ar_op != htons(ARPOP_RREPLY))
+		goto drop;
+
+	/* If the protocol address type is not IP, delete it. */
+	if (rarp->ar_pro != htons(ETH_P_IP))
+		goto drop;
+
+	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+		goto drop;
+
+	/* OK, it is all there and looks valid, process... */
+	rarp = (struct arphdr *)skb_transport_header(skb);	/* re-read the header pointer after the second pull */
+	rarp_ptr = (unsigned char *) (rarp + 1);
+
+	/* One reply at a time, please. */
+	spin_lock(&ic_recv_lock);
+
+	/* If we already have a reply, just drop the packet */
+	if (ic_got_reply)
+		goto drop_unlock;
+
+	/* Find the ic_device that the packet arrived on */
+	d = ic_first_dev;
+	while (d && d->dev != dev)
+		d = d->next;
+	if (!d)
+		goto drop_unlock;	/* should never happen */
+
+	/* Extract variable-width fields: sha, sip, tha, tip in that order */
+	sha = rarp_ptr;	/* recorded but not examined below */
+	rarp_ptr += dev->addr_len;
+	memcpy(&sip, rarp_ptr, 4);
+	rarp_ptr += 4;
+	tha = rarp_ptr;
+	rarp_ptr += dev->addr_len;
+	memcpy(&tip, rarp_ptr, 4);
+
+	/* Discard packets which are not meant for us. */
+	if (memcmp(tha, dev->dev_addr, dev->addr_len))
+		goto drop_unlock;
+
+	/* Discard packets which are not from specified server (if one is set). */
+	if (ic_servaddr != NONE && ic_servaddr != sip)
+		goto drop_unlock;
+
+	/* We have a winner! */
+	ic_dev = dev;
+	if (ic_myaddr == NONE)	/* an address that is already set is kept */
+		ic_myaddr = tip;
+	ic_servaddr = sip;
+	ic_got_reply = IC_RARP;
+
+drop_unlock:
+	/* Show's over.  Nothing to see here.  */
+	spin_unlock(&ic_recv_lock);
+
+drop:
+	/* Throw the packet out; accepted replies are freed here too. */
+	kfree_skb(skb);
+	return 0;
+}
+
+
+/*
+ *  Send RARP request packet over a single interface.
+ */
+static void __init ic_rarp_send_if(struct ic_device *d)
+{
+	struct net_device *dev = d->dev;
+	arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
+		 dev->dev_addr, dev->dev_addr);	/* our own hardware address is passed for both trailing arguments */
+}
+#endif
+
+/*
+ *	DHCP/BOOTP support.
+ */
+
+#ifdef IPCONFIG_BOOTP
+
+struct bootp_pkt {		/* BOOTP packet format, IP and UDP headers included */
+	struct iphdr iph;	/* IP header */
+	struct udphdr udph;	/* UDP header */
+	u8 op;			/* 1=request, 2=reply */
+	u8 htype;		/* HW address type */
+	u8 hlen;		/* HW address length */
+	u8 hops;		/* Used only by gateways */
+	__be32 xid;		/* Transaction ID */
+	__be16 secs;		/* Seconds since we started */
+	__be16 flags;		/* Just what it says */
+	__be32 client_ip;		/* Client's IP address if known */
+	__be32 your_ip;		/* Assigned IP address */
+	__be32 server_ip;		/* (Next, e.g. NFS) Server's IP address */
+	__be32 relay_ip;		/* IP address of BOOTP relay */
+	u8 hw_addr[16];		/* Client's HW address */
+	u8 serv_name[64];	/* Server host name */
+	u8 boot_file[128];	/* Name of boot file */
+	u8 exten[312];		/* DHCP options / BOOTP vendor extensions, led by the 4-byte magic cookie */
+};
+
+/* packet ops */
+#define BOOTP_REQUEST	1
+#define BOOTP_REPLY	2
+
+/* DHCP message types */
+#define DHCPDISCOVER	1
+#define DHCPOFFER	2
+#define DHCPREQUEST	3
+#define DHCPDECLINE	4
+#define DHCPACK		5
+#define DHCPNAK		6
+#define DHCPRELEASE	7
+#define DHCPINFORM	8
+
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type bootp_packet_type __initdata = {	/* routes every ETH_P_IP frame to ic_bootp_recv(), which filters */
+	.type =	cpu_to_be16(ETH_P_IP),
+	.func =	ic_bootp_recv,
+};
+
+
+/*
+ *  Initialize DHCP/BOOTP extension fields in the request.
+ */
+
+static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
+
+#ifdef IPCONFIG_DHCP
+
+static void __init
+ic_dhcp_init_options(u8 *options)
+{
+	u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST;
+	u8 *limit = options + sizeof(((struct bootp_pkt *)NULL)->exten);
+	u8 *e = options;
+	int len;
+
+#ifdef IPCONFIG_DEBUG
+	printk("DHCP: Sending message type %d\n", mt);
+#endif
+
+	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
+	e += 4;
+
+	*e++ = 53;		/* DHCP message type */
+	*e++ = 1;
+	*e++ = mt;
+
+	if (mt == DHCPREQUEST) {
+		*e++ = 54;	/* Server ID (IP address) */
+		*e++ = 4;
+		memcpy(e, &ic_servaddr, 4);
+		e += 4;
+
+		*e++ = 50;	/* Requested IP address */
+		*e++ = 4;
+		memcpy(e, &ic_myaddr, 4);
+		e += 4;
+	}
+
+	/* Fixed-size options (29 bytes at most) always fit; strings are bounded below. */
+	{
+		static const u8 ic_req_params[] = {
+			1,	/* Subnet mask */
+			3,	/* Default gateway */
+			6,	/* DNS server */
+			12,	/* Host name */
+			15,	/* Domain name */
+			17,	/* Boot path */
+			26,	/* MTU */
+			40,	/* NIS domain name */
+		};
+
+		*e++ = 55;	/* Parameter request list */
+		*e++ = sizeof(ic_req_params);
+		memcpy(e, ic_req_params, sizeof(ic_req_params));
+		e += sizeof(ic_req_params);
+
+		len = strlen(utsname()->nodename);
+		if (ic_host_name_set && len + 2 < limit - e) {	/* keep 1 byte for End */
+			*e++ = 12;	/* host-name */
+			*e++ = len;
+			memcpy(e, utsname()->nodename, len);
+			e += len;
+		}
+		len = strlen(vendor_class_identifier);
+		if (len && len + 2 < limit - e) {	/* omitted if it would overrun exten[] */
+			printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
+			       vendor_class_identifier);
+			*e++ = 60;	/* Class-identifier */
+			*e++ = len;
+			memcpy(e, vendor_class_identifier, len);
+			e += len;
+		}
+	}
+
+	*e++ = 255;	/* End of the list */
+}
+
+#endif /* IPCONFIG_DHCP */
+
+static void __init ic_bootp_init_ext(u8 *e)
+{
+	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
+	e += 4;
+	e[0] = 1;		/* Subnet mask request */
+	e[1] = 4;		/* payload area is skipped, never written here */
+	e += 2 + 4;
+	e[0] = 3;		/* Default gateway request */
+	e[1] = 4;
+	e += 2 + 4;
+	e[0] = 5;		/* Name server request */
+	e[1] = 8;
+	e += 2 + 8;
+	e[0] = 12;		/* Host name request */
+	e[1] = 32;
+	e += 2 + 32;
+	e[0] = 40;		/* NIS Domain name request */
+	e[1] = 32;
+	e += 2 + 32;
+	e[0] = 17;		/* Boot path */
+	e[1] = 40;
+	e += 2 + 40;
+
+	e[0] = 57;		/* set extension buffer size for reply */
+	e[1] = 2;
+	e[2] = 1;		/* 128+236+8+20+14, see dhcpd sources */
+	e[3] = 150;
+
+	e[4] = 255;		/* End of the list */
+}
+
+
+/*
+ *  Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void __init ic_bootp_init(void)
+{
+	__be32 *ns = ic_nameservers;
+
+	/* Mark every nameserver slot unset before the receive hook goes live. */
+	while (ns < ic_nameservers + CONF_NAMESERVERS_MAX)
+		*ns++ = NONE;
+	dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ *  DHCP/BOOTP cleanup.
+ */
+static inline void __init ic_bootp_cleanup(void)	/* unregister the handler added by ic_bootp_init() */
+{
+	dev_remove_pack(&bootp_packet_type);
+}
+
+
+/*
+ *  Send DHCP/BOOTP request to single interface.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
+{
+	struct net_device *dev = d->dev;
+	struct sk_buff *skb;
+	struct bootp_pkt *b;
+	struct iphdr *h;
+
+	/* Allocate packet; an allocation failure silently skips this send */
+	skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15,
+			GFP_KERNEL);
+	if (!skb)
+		return;
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
+	memset(b, 0, sizeof(struct bootp_pkt));	/* every field not set below stays zero */
+
+	/* Construct IP header */
+	skb_reset_network_header(skb);
+	h = ip_hdr(skb);
+	h->version = 4;
+	h->ihl = 5;
+	h->tot_len = htons(sizeof(struct bootp_pkt));
+	h->frag_off = htons(IP_DF);
+	h->ttl = 64;
+	h->protocol = IPPROTO_UDP;
+	h->daddr = htonl(INADDR_BROADCAST);	/* source address stays 0 from the memset */
+	h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+	/* Construct UDP header */
+	b->udph.source = htons(68);
+	b->udph.dest = htons(67);
+	b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+	/* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+	/* Construct DHCP/BOOTP header */
+	b->op = BOOTP_REQUEST;
+	if (dev->type < 256) /* check for false types */
+		b->htype = dev->type;
+	else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
+		b->htype = ARPHRD_IEEE802;
+	else if (dev->type == ARPHRD_FDDI)
+		b->htype = ARPHRD_ETHER;
+	else {
+		printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+		b->htype = dev->type; /* value is truncated to 8 bits here; can cause undefined behavior */
+	}
+
+	/* server_ip and your_ip address are both already zero per RFC2131 */
+	b->hlen = dev->addr_len;
+	memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+	b->secs = htons(jiffies_diff / HZ);
+	b->xid = d->xid;	/* per-device ID that ic_bootp_recv() matches replies against */
+
+	/* add DHCP options or BOOTP extensions */
+#ifdef IPCONFIG_DHCP
+	if (ic_proto_enabled & IC_USE_DHCP)
+		ic_dhcp_init_options(b->exten);
+	else
+#endif
+		ic_bootp_init_ext(b->exten);
+
+	/* Chain packet down the line... */
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+	if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+			    dev->broadcast, dev->dev_addr, skb->len) < 0 ||
+	    dev_queue_xmit(skb) < 0)
+		printk("E");	/* NOTE(review): looks like a terse error marker for the progress line - confirm */
+}
+
+
+/*
+ *  Copy BOOTP-supplied string if not already set.
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+	int n = (len > max-1) ? max-1 : len;	/* keep one byte for the terminator */
+
+	if (!len)	/* empty source: dest is left untouched */
+		return 0;
+	memcpy(dest, src, n);
+	dest[n] = '\0';
+	return 1;
+}
+
+
+/*
+ *  Process BOOTP extensions.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+	u8 servers;
+	int i;
+	__be16 mtu;
+
+#ifdef IPCONFIG_DEBUG
+	u8 *c;
+
+	printk("DHCP/BOOTP: Got extension %d:",*ext);
+	for (c=ext+2; c<ext+2+ext[1]; c++)
+		printk(" %02x", *c);
+	printk("\n");
+#endif
+
+	switch (*ext++) {	/* from here on *ext is the option length, ext+1 its payload */
+		case 1:		/* Subnet mask */
+			if (ic_netmask == NONE && *ext >= 4)	/* ignore short options */
+				memcpy(&ic_netmask, ext+1, 4);
+			break;
+		case 3:		/* Default gateway */
+			if (ic_gateway == NONE && *ext >= 4)	/* ignore short options */
+				memcpy(&ic_gateway, ext+1, 4);
+			break;
+		case 6:		/* DNS server */
+			servers = *ext / 4;	/* complete 4-byte addresses only */
+			for (i = 0; i < servers && i < CONF_NAMESERVERS_MAX; i++) {
+				if (ic_nameservers[i] == NONE)
+					memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+			}
+			break;
+		case 12:	/* Host name */
+			ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN);
+			ic_host_name_set = 1;
+			break;
+		case 15:	/* Domain name (DNS) */
+			ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+			break;
+		case 17:	/* Root path */
+			if (!root_server_path[0])
+				ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
+			break;
+		case 26:	/* Interface MTU */
+			if (*ext >= sizeof(mtu)) {	/* ignore short options */
+				memcpy(&mtu, ext+1, sizeof(mtu));
+				ic_dev_mtu = ntohs(mtu);
+			}
+			break;
+		case 40:	/* NIS Domain name (_not_ DNS) */
+			ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN);
+			break;
+	}
+}
+
+
+/*
+ *  Receive BOOTP reply.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct bootp_pkt *b;
+	struct iphdr *h;
+	struct ic_device *d;
+	int len, ext_len;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	/* Perform verifications before taking the lock.  */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb,
+			   sizeof(struct iphdr) +
+			   sizeof(struct udphdr)))
+		goto drop;
+
+	b = (struct bootp_pkt *)skb_network_header(skb);	/* struct bootp_pkt begins at the IP header */
+	h = &b->iph;
+
+	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)	/* IP options are not accepted */
+		goto drop;
+
+	/* Fragments are not supported */
+	if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+		if (net_ratelimit())
+			printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
+			       "reply.\n");
+		goto drop;
+	}
+
+	if (skb->len < ntohs(h->tot_len))
+		goto drop;
+
+	if (ip_fast_csum((char *) h, h->ihl))
+		goto drop;
+
+	if (b->udph.source != htons(67) || b->udph.dest != htons(68))	/* server port to client port only */
+		goto drop;
+
+	if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+		goto drop;
+
+	len = ntohs(b->udph.len) - sizeof(struct udphdr);	/* UDP payload length */
+	ext_len = len - (sizeof(*b) -	/* bytes left for exten[] after the fixed BOOTP fields */
+			 sizeof(struct iphdr) -
+			 sizeof(struct udphdr) -
+			 sizeof(b->exten));
+	if (ext_len < 0)
+		goto drop;
+
+	/* Ok the front looks good, make sure we can get at the rest.  */
+	if (!pskb_may_pull(skb, skb->len))
+		goto drop;
+
+	b = (struct bootp_pkt *)skb_network_header(skb);	/* re-read the pointers after the pull */
+	h = &b->iph;
+
+	/* One reply at a time, please. */
+	spin_lock(&ic_recv_lock);
+
+	/* If we already have a reply, just drop the packet */
+	if (ic_got_reply)
+		goto drop_unlock;
+
+	/* Find the ic_device that the packet arrived on */
+	d = ic_first_dev;
+	while (d && d->dev != dev)
+		d = d->next;
+	if (!d)
+		goto drop_unlock;  /* should never happen */
+
+	/* Is it a reply to our BOOTP request? */
+	if (b->op != BOOTP_REPLY ||
+	    b->xid != d->xid) {
+		if (net_ratelimit())
+			printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
+			       "op[%x] xid[%x]\n",
+			       b->op, b->xid);
+		goto drop_unlock;
+	}
+
+	/* Is it a reply for the device we are configuring? */
+	if (b->xid != ic_dev_xid) {
+		if (net_ratelimit())
+			printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
+		goto drop_unlock;
+	}
+
+	/* Parse extensions */
+	if (ext_len >= 4 &&
+	    !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
+		u8 *end = (u8 *) b + ntohs(b->iph.tot_len);	/* first byte past the IP datagram */
+		u8 *ext;
+
+#ifdef IPCONFIG_DHCP
+		if (ic_proto_enabled & IC_USE_DHCP) {
+			__be32 server_id = NONE;
+			int mt = 0;
+
+			ext = &b->exten[4];	/* first pass: only message type and server ID */
+			while (ext < end && *ext != 0xff) {
+				u8 *opt = ext++;
+				if (*opt == 0)	/* Padding */
+					continue;
+				ext += *ext + 1;	/* skip the length byte and the payload */
+				if (ext >= end)
+					break;
+				switch (*opt) {
+				case 53:	/* Message type */
+					if (opt[1])
+						mt = opt[2];
+					break;
+				case 54:	/* Server ID (IP address) */
+					if (opt[1] >= 4)
+						memcpy(&server_id, opt + 2, 4);
+					break;
+				}
+			}
+
+#ifdef IPCONFIG_DEBUG
+			printk("DHCP: Got message type %d\n", mt);
+#endif
+
+			switch (mt) {
+			case DHCPOFFER:
+				/* While in the process of accepting one offer,
+				 * ignore all others.
+				 */
+				if (ic_myaddr != NONE)
+					goto drop_unlock;
+
+				/* Let's accept that offer. */
+				ic_myaddr = b->your_ip;
+				ic_servaddr = server_id;
+#ifdef IPCONFIG_DEBUG
+				printk("DHCP: Offered address %pI4 by server %pI4\n",
+				       &ic_myaddr, &ic_servaddr);
+#endif
+				/* The DHCP indicated server address takes
+				 * precedence over the bootp header one if
+				 * they are different.
+				 */
+				if ((server_id != NONE) &&
+				    (b->server_ip != server_id))
+					b->server_ip = ic_servaddr;
+				break;
+
+			case DHCPACK:
+				if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
+					goto drop_unlock;
+
+				/* Yeah! */
+				break;
+
+			default:
+				/* Urque.  Forget it: any other message type resets the pending offer */
+				ic_myaddr = NONE;
+				ic_servaddr = NONE;
+				goto drop_unlock;
+			}
+
+			ic_dhcp_msgtype = mt;	/* ic_dynamic() keeps going until this is DHCPACK */
+
+		}
+#endif /* IPCONFIG_DHCP */
+
+		ext = &b->exten[4];	/* second pass: hand each complete option to ic_do_bootp_ext() */
+		while (ext < end && *ext != 0xff) {
+			u8 *opt = ext++;
+			if (*opt == 0)	/* Padding */
+				continue;
+			ext += *ext + 1;
+			if (ext < end)
+				ic_do_bootp_ext(opt);
+		}
+	}
+
+	/* We have a winner! */
+	ic_dev = dev;
+	ic_myaddr = b->your_ip;
+	ic_servaddr = b->server_ip;
+	if (ic_gateway == NONE && b->relay_ip)	/* fall back to the relay as gateway */
+		ic_gateway = b->relay_ip;
+	if (ic_nameservers[0] == NONE)	/* fall back to the boot server as nameserver */
+		ic_nameservers[0] = ic_servaddr;
+	ic_got_reply = IC_BOOTP;
+
+drop_unlock:
+	/* Show's over.  Nothing to see here.  */
+	spin_unlock(&ic_recv_lock);
+
+drop:
+	/* Throw the packet out; accepted replies are freed here too. */
+	kfree_skb(skb);
+
+	return 0;
+}
+
+
+#endif
+
+
+/*
+ *	Dynamic IP configuration -- DHCP, BOOTP, RARP.
+ */
+
+#ifdef IPCONFIG_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+	int retries;
+	struct ic_device *d;
+	unsigned long start_jiffies, timeout, jiff;
+	int do_bootp = ic_proto_have_if & IC_BOOTP;
+	int do_rarp = ic_proto_have_if & IC_RARP;
+
+	/*
+	 * If none of DHCP/BOOTP/RARP was selected, return with an error.
+	 * This routine gets only called when some pieces of information
+	 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
+	 */
+	if (!ic_proto_enabled) {
+		printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+		return -1;
+	}
+
+#ifdef IPCONFIG_BOOTP
+	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)	/* enabled, but no opened device can do it */
+		printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
+#endif
+#ifdef IPCONFIG_RARP
+	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)	/* enabled, but no opened device can do it */
+		printk(KERN_ERR "RARP: No suitable device found.\n");
+#endif
+
+	if (!ic_proto_have_if)
+		/* Error message already printed */
+		return -1;
+
+	/*
+	 * Setup protocols
+	 */
+#ifdef IPCONFIG_BOOTP
+	if (do_bootp)
+		ic_bootp_init();
+#endif
+#ifdef IPCONFIG_RARP
+	if (do_rarp)
+		ic_rarp_init();
+#endif
+
+	/*
+	 * Send requests and wait, until we get an answer. This loop
+	 * seems to be a terrible waste of CPU time, but actually there is
+	 * only one process running at all, so we don't need to use any
+	 * scheduler functions.
+	 * [Actually we could now, but the nothing else running note still
+	 *  applies.. - AC]
+	 */
+	printk(KERN_NOTICE "Sending %s%s%s requests .",
+	       do_bootp
+		? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+	       (do_bootp && do_rarp) ? " and " : "",
+	       do_rarp ? "RARP" : "");
+
+	start_jiffies = jiffies;
+	d = ic_first_dev;
+	retries = CONF_SEND_RETRIES;
+	get_random_bytes(&timeout, sizeof(timeout));
+	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);	/* base plus random jitter */
+	for (;;) {
+		/* Track the device we are configuring */
+		ic_dev_xid = d->xid;
+
+#ifdef IPCONFIG_BOOTP
+		if (do_bootp && (d->able & IC_BOOTP))
+			ic_bootp_send_if(d, jiffies - start_jiffies);
+#endif
+#ifdef IPCONFIG_RARP
+		if (do_rarp && (d->able & IC_RARP))
+			ic_rarp_send_if(d);
+#endif
+
+		jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);	/* full timeout only after the last device */
+		while (time_before(jiffies, jiff) && !ic_got_reply)
+			schedule_timeout_uninterruptible(1);
+#ifdef IPCONFIG_DHCP
+		/* DHCP isn't done until we get a DHCPACK. */
+		if ((ic_got_reply & IC_BOOTP) &&
+		    (ic_proto_enabled & IC_USE_DHCP) &&
+		    ic_dhcp_msgtype != DHCPACK) {
+			ic_got_reply = 0;
+			printk(KERN_CONT ",");
+			continue;	/* send again on the same device, d is unchanged */
+		}
+#endif /* IPCONFIG_DHCP */
+
+		if (ic_got_reply) {
+			printk(KERN_CONT " OK\n");
+			break;
+		}
+
+		if ((d = d->next))	/* no reply yet: try the next device */
+			continue;
+
+		if (! --retries) {
+			printk(KERN_CONT " timed out!\n");
+			break;
+		}
+
+		d = ic_first_dev;	/* start another round over all devices */
+
+		timeout = timeout CONF_TIMEOUT_MULT;	/* expands to: timeout *7/4 */
+		if (timeout > CONF_TIMEOUT_MAX)
+			timeout = CONF_TIMEOUT_MAX;
+
+		printk(KERN_CONT ".");
+	}
+
+#ifdef IPCONFIG_BOOTP
+	if (do_bootp)
+		ic_bootp_cleanup();
+#endif
+#ifdef IPCONFIG_RARP
+	if (do_rarp)
+		ic_rarp_cleanup();
+#endif
+
+	if (!ic_got_reply) {
+		ic_myaddr = NONE;	/* discard any address taken from an unconfirmed offer */
+		return -1;
+	}
+
+	printk("IP-Config: Got %s answer from %pI4, ",
+		((ic_got_reply & IC_RARP) ? "RARP"
+		 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+		&ic_servaddr);
+	printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
+
+	return 0;
+}
+
+#endif /* IPCONFIG_DYNAMIC */
+
+#ifdef CONFIG_PROC_FS
+
+static int pnp_seq_show(struct seq_file *seq, void *v)
+{	/* seq_file show routine: print the recorded IP-Config result; v is not used. */
+	int i;
+
+	if (ic_proto_used & IC_PROTO)	/* a protocol bit was recorded in ic_proto_used */
+		seq_printf(seq, "#PROTO: %s\n",
+			   (ic_proto_used & IC_RARP) ? "RARP"
+			   : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
+	else
+		seq_puts(seq, "#MANUAL\n");
+
+	if (ic_domain[0])	/* omit the line when the domain string is empty */
+		seq_printf(seq,
+			   "domain %s\n", ic_domain);
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+		if (ic_nameservers[i] != NONE)	/* only entries that are not NONE are listed */
+			seq_printf(seq, "nameserver %pI4\n",
+				   &ic_nameservers[i]);
+	}
+	if (ic_servaddr != NONE)
+		seq_printf(seq, "bootserver %pI4\n",
+			   &ic_servaddr);
+	return 0;
+}
+
+static int pnp_seq_open(struct inode *inode, struct file *file)
+{	/* Open hook: attach pnp_seq_show as a single-record seq_file, no private data. */
+	return single_open(file, pnp_seq_show, NULL);
+}
+
+static const struct file_operations pnp_seq_fops = {	/* ops for the "pnp" entry created in ip_auto_config() */
+	.owner		= THIS_MODULE,
+	.open		= pnp_seq_open,	/* calls single_open() with pnp_seq_show */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,	/* counterpart of single_open() */
+};
+#endif /* CONFIG_PROC_FS */
+
+/*
+ *  Extract IP address from the parameter string if needed. Note that we
+ *  need to have root_server_addr set _before_ IPConfig gets called as it
+ *  can override it.
+ */
+__be32 __init root_nfs_parse_addr(char *name)
+{	/* Returns the leading dotted-quad of @name and strips it in place, or NONE. */
+	__be32 addr;
+	int octets = 0;
+	char *cp, *cq;	/* cp scans ahead; cq marks where the current octet began */
+
+	cp = cq = name;
+	while (octets < 4) {
+		while (*cp >= '0' && *cp <= '9')
+			cp++;
+		if (cp == cq || cp - cq > 3)	/* an octet is 1..3 digits */
+			break;
+		if (octets < 3 && *cp != '.')	/* malformed: never step past a NUL */
+			break;
+		if (++octets < 4)	/* skip the '.' after octets 1..3 only */
+			cp++;
+		cq = cp;
+	}
+	if (octets == 4 && (*cp == ':' || *cp == '\0')) {
+		if (*cp == ':')
+			*cp++ = '\0';	/* terminate the address for in_aton() */
+		addr = in_aton(name);
+		memmove(name, cp, strlen(cp) + 1);	/* keep only what followed the address */
+	} else
+		addr = NONE;	/* not an address: @name is left untouched */
+
+	return addr;
+}
+
+#define DEVICE_WAIT_MAX		12 /* 12 seconds */
+
+static int __init wait_for_devices(void)
+{
+	int attempts_left = DEVICE_WAIT_MAX;
+
+	/* Retry with a 1s pause until ic_is_init_dev() accepts some device. */
+	while (attempts_left-- > 0) {
+		struct net_device *dev;
+		bool usable = false;
+
+		rtnl_lock();
+		for_each_netdev(&init_net, dev)
+			if (ic_is_init_dev(dev)) {
+				usable = true;
+				break;
+			}
+		rtnl_unlock();
+		if (usable)
+			return 0;
+		ssleep(1);
+	}
+	return -ENODEV;
+}
+
+/*
+ *	IP Autoconfig dispatcher.
+ */
+
+static int __init ip_auto_config(void)
+{	/* late_initcall: open devices, obtain any missing settings, configure one interface. */
+	__be32 addr;
+#ifdef IPCONFIG_DYNAMIC
+	int retries = CONF_OPEN_RETRIES;
+#endif
+	int err;
+
+#ifdef CONFIG_PROC_FS
+	proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
+#endif /* CONFIG_PROC_FS */
+
+	if (!ic_enable)	/* autoconfig is off; the "pnp" entry above is created regardless */
+		return 0;
+
+	DBG(("IP-Config: Entered.\n"));
+#ifdef IPCONFIG_DYNAMIC
+ try_try_again:
+#endif
+	/* Wait for devices to appear */
+	err = wait_for_devices();
+	if (err)
+		return err;
+
+	/* Setup all network devices */
+	err = ic_open_devs();
+	if (err)
+		return err;
+
+	/* Give drivers a chance to settle */
+	msleep(CONF_POST_OPEN);
+
+	/*
+	 * If the config information is insufficient (e.g., our IP address or
+	 * IP address of the boot server is missing or we have multiple network
+	 * interfaces and no default was set), use BOOTP or RARP to get the
+	 * missing values.
+	 */
+	if (ic_myaddr == NONE ||
+#ifdef CONFIG_ROOT_NFS
+	    (root_server_addr == NONE &&
+	     ic_servaddr == NONE &&
+	     ROOT_DEV == Root_NFS) ||
+#endif
+	    ic_first_dev->next) {
+#ifdef IPCONFIG_DYNAMIC
+		if (ic_dynamic() < 0) {
+			ic_close_devs();
+
+			/*
+			 * I don't know why, but sometimes the
+			 * eepro100 driver (at least) gets upset and
+			 * doesn't work the first time it's opened.
+			 * But then if you close it and reopen it, it
+			 * works just fine.  So we need to try that at
+			 * least once before giving up.
+			 *
+			 * Also, if the root will be NFS-mounted, we
+			 * have nowhere to go if DHCP fails.  So we
+			 * just have to keep trying forever.
+			 *
+			 * 				-- Chip
+			 */
+#ifdef CONFIG_ROOT_NFS
+			if (ROOT_DEV ==  Root_NFS) {
+				printk(KERN_ERR
+					"IP-Config: Retrying forever (NFS root)...\n");
+				goto try_try_again;
+			}
+#endif
+
+			if (--retries) {	/* counts down from CONF_OPEN_RETRIES */
+				printk(KERN_ERR
+				       "IP-Config: Reopening network devices...\n");
+				goto try_try_again;
+			}
+
+			/* Oh, well.  At least we tried. */
+			printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+			return -1;
+		}
+#else /* !DYNAMIC */
+		printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+		ic_close_devs();
+		return -1;
+#endif /* IPCONFIG_DYNAMIC */
+	} else {
+		/* Device selected manually or only one device -> use it */
+		ic_dev = ic_first_dev->dev;
+	}
+
+	addr = root_nfs_parse_addr(root_server_path);	/* also strips the address from the path */
+	if (root_server_addr == NONE)	/* an address set earlier takes precedence */
+		root_server_addr = addr;
+
+	/*
+	 * Use defaults wherever applicable.
+	 */
+	if (ic_defaults() < 0)
+		return -1;
+
+	/*
+	 * Close all network devices except the device we've
+	 * autoconfigured and set up routes.
+	 */
+	ic_close_devs();
+	if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+		return -1;
+
+	/*
+	 * Record which protocol was actually used.
+	 */
+#ifdef IPCONFIG_DYNAMIC
+	ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);	/* read back by pnp_seq_show() */
+#endif
+
+#ifndef IPCONFIG_SILENT
+	/*
+	 * Clue in the operator.
+	 */
+	printk("IP-Config: Complete:\n");
+	printk("     device=%s", ic_dev->name);
+	printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
+	printk(KERN_CONT ", mask=%pI4", &ic_netmask);
+	printk(KERN_CONT ", gw=%pI4", &ic_gateway);
+	printk(KERN_CONT ",\n     host=%s, domain=%s, nis-domain=%s",
+	       utsname()->nodename, ic_domain, utsname()->domainname);
+	printk(KERN_CONT ",\n     bootserver=%pI4", &ic_servaddr);
+	printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
+	printk(KERN_CONT ", rootpath=%s", root_server_path);
+	if (ic_dev_mtu)	/* printed only when non-zero */
+		printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
+	printk(KERN_CONT "\n");
+#endif /* !SILENT */
+
+	return 0;
+}
+
+late_initcall(ip_auto_config);
+
+
+/*
+ *  Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ *  command line parameter.  See Documentation/filesystems/nfs/nfsroot.txt.
+ */
+static int __init ic_proto_name(char *name)
+{	/* Returns 1 if @name is a keyword that enables autoconfig (narrowing ic_proto_enabled), else 0. */
+	if (!strcmp(name, "on") || !strcmp(name, "any")) {
+		return 1;	/* ic_proto_enabled is left untouched */
+	}
+	if (!strcmp(name, "off") || !strcmp(name, "none")) {
+		return 0;
+	}
+#ifdef CONFIG_IP_PNP_DHCP
+	else if (!strcmp(name, "dhcp")) {
+		ic_proto_enabled &= ~IC_RARP;	/* drop RARP only */
+		return 1;
+	}
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+	else if (!strcmp(name, "bootp")) {
+		ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);	/* drop RARP and the DHCP flag */
+		return 1;
+	}
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+	else if (!strcmp(name, "rarp")) {
+		ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);	/* drop BOOTP and the DHCP flag */
+		return 1;
+	}
+#endif
+#ifdef IPCONFIG_DYNAMIC
+	else if (!strcmp(name, "both")) {
+		ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
+		return 1;
+	}
+#endif
+	return 0;	/* unrecognised keyword: treated like "off" by callers */
+}
+
+static int __init ip_auto_config_setup(char *addrs)
+{	/* __setup handler for "ip=": a protocol keyword or a ':'-separated field list. Always returns 1. */
+	char *cp, *ip, *dp;
+	int num = 0;	/* index of the field currently being parsed */
+
+	ic_set_manually = 1;
+	ic_enable = 1;
+
+	/*
+	 * If any dhcp, bootp etc options are set, leave autoconfig on
+	 * and skip the below static IP processing.
+	 */
+	if (ic_proto_name(addrs))
+		return 1;
+
+	/* If no static IP is given, turn off autoconfig and bail.  */
+	if (*addrs == 0 ||
+	    strcmp(addrs, "off") == 0 ||
+	    strcmp(addrs, "none") == 0) {
+		ic_enable = 0;
+		return 1;
+	}
+
+	/* Parse string for static IP assignment.  */
+	ip = addrs;
+	while (ip && *ip) {
+		if ((cp = strchr(ip, ':')))
+			*cp++ = '\0';	/* split the field off in place; cp = next field */
+		if (strlen(ip) > 0) {	/* empty fields keep their current value */
+			DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+			switch (num) {
+			case 0:	/* own address; ANY is stored as NONE */
+				if ((ic_myaddr = in_aton(ip)) == ANY)
+					ic_myaddr = NONE;
+				break;
+			case 1:	/* server address */
+				if ((ic_servaddr = in_aton(ip)) == ANY)
+					ic_servaddr = NONE;
+				break;
+			case 2:	/* gateway */
+				if ((ic_gateway = in_aton(ip)) == ANY)
+					ic_gateway = NONE;
+				break;
+			case 3:	/* netmask */
+				if ((ic_netmask = in_aton(ip)) == ANY)
+					ic_netmask = NONE;
+				break;
+			case 4:	/* host name; text after the first '.' becomes domainname */
+				if ((dp = strchr(ip, '.'))) {
+					*dp++ = '\0';
+					strlcpy(utsname()->domainname, dp,
+						sizeof(utsname()->domainname));
+				}
+				strlcpy(utsname()->nodename, ip,
+					sizeof(utsname()->nodename));
+				ic_host_name_set = 1;
+				break;
+			case 5:	/* device name */
+				strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+				break;
+			case 6:	/* protocol keyword; "off"-like with no address disables autoconfig */
+				if (ic_proto_name(ip) == 0 &&
+				    ic_myaddr == NONE) {
+					ic_enable = 0;
+				}
+				break;
+			}
+		}
+		ip = cp;	/* NULL after the last field, which ends the loop */
+		num++;
+	}
+
+	return 1;
+}
+
+static int __init nfsaddrs_config_setup(char *addrs)
+{	/* "nfsaddrs=" is parsed exactly like "ip=". */
+	return ip_auto_config_setup(addrs);
+}
+
+static int __init vendor_class_identifier_setup(char *addrs)
+{	/* __setup handler for "dhcpclass=": store the string, warning if it was truncated. */
+	if (strlcpy(vendor_class_identifier, addrs,
+		    sizeof(vendor_class_identifier))
+	    >= sizeof(vendor_class_identifier))	/* strlcpy returns the source length */
+		printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"\n",
+		       vendor_class_identifier);
+	return 1;
+}
+
+__setup("ip=", ip_auto_config_setup);
+__setup("nfsaddrs=", nfsaddrs_config_setup);
+__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 00000000..6f06f7f3
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,912 @@
+/*
+ *	Linux NET3:	IP/IP protocol decoder.
+ *
+ *	Authors:
+ *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
+ *
+ *	Fixes:
+ *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
+ *					a module taking up 2 pages).
+ *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ *					to keep ip_forward happy.
+ *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
+ *              David Woodhouse :       Perform some basic ICMP handling.
+ *                                      IPIP Routing without decapsulation.
+ *              Carlos Picoto   :       GRE over IP support
+ *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ *					I do not want to merge them together.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+	The purpose of this driver is to provide an IP tunnel through
+	which you can tunnel network traffic transparently across subnets.
+
+	This was written by looking at Nick Holloway's dummy driver
+	Thanks for the great code!
+
+		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
+
+	Minor tweaks:
+		Cleaned up the code a little and added some pre-1.3.0 tweaks.
+		dev->hard_header/hard_header_len changed to use no headers.
+		Comments/bracketing tweaked.
+		Made the tunnels use dev->name not tunnel: when error reporting.
+		Added tx_dropped stat
+
+		-Alan Cox	(alan@lxorguk.ukuu.org.uk) 21 March 95
+
+	Reworked:
+		Changed to tunnel to destination gateway in addition to the
+			tunnel's pointopoint address
+		Almost completely rewritten
+		Note:  There is currently no firewall or ICMP handling done.
+
+		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
+
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+	When the tunnel_xmit() function is called, the skb contains the
+	packet to be sent (plus a great deal of extra info), and dev
+	contains the tunnel device that _we_ are.
+
+	When we are passed a packet, we are expected to fill in the
+	source address with our source IP address.
+
+	What is the proper way to allocate, copy and free a buffer?
+	After you allocate it, it is a "0 length" chunk of memory
+	starting at zero.  If you want to add headers to the buffer
+	later, you'll have to call "skb_reserve(skb, amount)" with
+	the amount of memory you want reserved.  Then, you call
+	"skb_put(skb, amount)" with the amount of space you want in
+	the buffer.  skb_put() returns a pointer to the top (#0) of
+	that buffer.  skb->len is set to the amount of space you have
+	"allocated" with skb_put().  You can then write up to skb->len
+	bytes to that buffer.  If you need more, you can call skb_put()
+	again with the additional amount of space you need.  You can
+	find out how much more space you can allocate by calling
+	"skb_tailroom(skb)".
+	Now, to add header space, call "skb_push(skb, header_len)".
+	This creates space at the beginning of the buffer and returns
+	a pointer to this new space.  If later you need to strip a
+	header from a buffer, call "skb_pull(skb, header_len)".
+	skb_headroom() will return how much space is left at the top
+	of the buffer (before the main data).  Remember, this headroom
+	space must be reserved before the skb_put() function is called.
+	*/
+
+/*
+   This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
+
+   For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define HASH_SIZE  16	/* buckets per table; HASH() masks with 0xF */
+#define HASH(addr) (((__force u32)(addr)^((__force u32)(addr)>>4))&0xF)	/* (addr) parenthesised for expression args */
+
+static int ipip_net_id __read_mostly;	/* id passed to net_generic() to fetch struct ipip_net */
+struct ipip_net {
+	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];	/* remote and local both set */
+	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];	/* remote set only */
+	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];	/* local set only */
+	struct ip_tunnel __rcu *tunnels_wc[1];		/* neither set */
+	struct ip_tunnel __rcu **tunnels[4];		/* the four tables above, indexed by prio in __ipip_bucket() */
+
+	struct net_device *fb_tunnel_dev;		/* the "tunl0" device allocated in ipip_init_net() */
+};
+
+static int ipip_tunnel_init(struct net_device *dev);
+static void ipip_tunnel_setup(struct net_device *dev);
+static void ipip_dev_free(struct net_device *dev);
+
+/*
+ * Locking : hash tables are protected by RCU and RTNL
+ */
+
+#define for_each_ip_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))	/* uses the caller's variable 't' */
+
+/* Frequently modified stats are kept per cpu; the others are shared (netdev->stats). */
+struct pcpu_tstats {
+	unsigned long	rx_packets;	/* bumped in ipip_rcv() */
+	unsigned long	rx_bytes;	/* bumped in ipip_rcv() */
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+{	/* Fold every possible CPU's counters into dev->stats and hand that back. */
+	unsigned long rx_pkts = 0, rx_octets = 0, tx_pkts = 0, tx_octets = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const struct pcpu_tstats *pc = per_cpu_ptr(dev->tstats, cpu);
+
+		rx_pkts   += pc->rx_packets;
+		rx_octets += pc->rx_bytes;
+		tx_pkts   += pc->tx_packets;
+		tx_octets += pc->tx_bytes;
+	}
+	dev->stats.rx_packets = rx_pkts;
+	dev->stats.rx_bytes   = rx_octets;
+	dev->stats.tx_packets = tx_pkts;
+	dev->stats.tx_bytes   = tx_octets;
+	return &dev->stats;
+}
+
+static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
+		__be32 remote, __be32 local)
+{	/* Find an IFF_UP tunnel for (remote, local), most specific table first; NULL if none. */
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(local);
+	struct ip_tunnel *t;	/* also the cursor used by for_each_ip_tunnel_rcu() */
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])	/* 1: both addresses match */
+		if (local == t->parms.iph.saddr &&
+		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+			return t;
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])	/* 2: remote matches */
+		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+			return t;
+
+	for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])	/* 3: local matches */
+		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+			return t;
+
+	t = rcu_dereference(ipn->tunnels_wc[0]);	/* 4: the wildcard slot */
+	if (t && (t->dev->flags&IFF_UP))
+		return t;
+	return NULL;
+}
+
+static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
+		struct ip_tunnel_parm *parms)
+{
+	const __be32 daddr = parms->iph.daddr;
+	const __be32 saddr = parms->iph.saddr;
+	/*
+	 * Table index: bit 1 is set when a remote address is given,
+	 * bit 0 when a local address is given.
+	 */
+	const int table = (daddr ? 2 : 0) | (saddr ? 1 : 0);
+	unsigned int slot = 0;
+
+	/* Only the addresses that are set contribute to the bucket hash. */
+	if (daddr)
+		slot ^= HASH(daddr);
+	if (saddr)
+		slot ^= HASH(saddr);
+	return &ipn->tunnels[table][slot];
+}
+
+static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
+		struct ip_tunnel *t)
+{	/* Bucket head for an existing tunnel, derived from its stored parms. */
+	return __ipip_bucket(ipn, &t->parms);
+}
+
+static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
+{	/* Remove @t from its hash chain; a no-op if it is not on the chain. */
+	struct ip_tunnel __rcu **tp;	/* the link that points at the current entry */
+	struct ip_tunnel *iter;
+
+	for (tp = ipip_bucket(ipn, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);	/* bypass @t; t->next itself is left intact */
+			break;
+		}
+	}
+}
+
+static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
+{	/* Insert @t at the head of the hash chain chosen by its parms. */
+	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
+
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));	/* set t->next first ... */
+	rcu_assign_pointer(*tp, t);				/* ... then publish t as the head */
+}
+
+static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
+		struct ip_tunnel_parm *parms, int create)
+{	/* Return the tunnel matching @parms; if none and @create, register a new one. NULL on failure. */
+	__be32 remote = parms->iph.daddr;
+	__be32 local = parms->iph.saddr;
+	struct ip_tunnel *t, *nt;
+	struct ip_tunnel __rcu **tp;
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+
+	for (tp = __ipip_bucket(ipn, parms);
+		 (t = rtnl_dereference(*tp)) != NULL;
+		 tp = &t->next) {
+		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
+			return t;	/* exact (local, remote) match already exists */
+	}
+	if (!create)
+		return NULL;
+
+	if (parms->name[0])	/* filled from userspace, may lack a NUL: bound the read */
+		snprintf(name, IFNAMSIZ, "%.*s", IFNAMSIZ - 1, parms->name);
+	else
+		strcpy(name, "tunl%d");	/* template; the final name is read back from dev->name below */
+
+	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
+	if (dev == NULL)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	nt = netdev_priv(dev);
+	nt->parms = *parms;
+
+	if (ipip_tunnel_init(dev) < 0)
+		goto failed_free;
+
+	if (register_netdevice(dev) < 0)
+		goto failed_free;
+
+	strcpy(nt->parms.name, dev->name);	/* record the name the device actually got */
+
+	dev_hold(dev);	/* released by dev_put() in ipip_tunnel_uninit() */
+	ipip_tunnel_link(ipn, nt);
+	return nt;
+
+failed_free:
+	ipip_dev_free(dev);	/* frees tstats (possibly NULL) and the netdev */
+	return NULL;
+}
+
+/* called with RTNL; ndo_uninit hook: unhash the tunnel and drop the device reference */
+static void ipip_tunnel_uninit(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+
+	if (dev == ipn->fb_tunnel_dev)	/* the fallback device lives in the wildcard slot */
+		rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
+	else
+		ipip_tunnel_unlink(ipn, netdev_priv(dev));
+	dev_put(dev);	/* pairs with dev_hold() in ipip_tunnel_locate()/ipip_fb_tunnel_init() */
+}
+
+static int ipip_err(struct sk_buff *skb, u32 info)
+{	/* ICMP error handler: record the error on the matching tunnel. 0, or -ENOENT if no tunnel. */
+
+/* All the routers (except for Linux) return only
+   8 bytes of packet payload. It means, that precise relaying of
+   ICMP in the real Internet is absolutely infeasible.
+ */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct ip_tunnel *t;
+	int err;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return 0;	/* ignored, like every type not listed below */
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return 0;
+		case ICMP_FRAG_NEEDED:
+			/* Soft state for pmtu is maintained by IP core. */
+			return 0;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return 0;
+		break;
+	}
+
+	err = -ENOENT;
+
+	rcu_read_lock();
+	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);	/* daddr is passed as the lookup's remote */
+	if (t == NULL || t->parms.iph.daddr == 0)	/* no tunnel, or one without a fixed remote */
+		goto out;
+
+	err = 0;
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)	/* ttl 0: xmit copies the inner ttl */
+		goto out;
+
+	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+		t->err_count++;	/* another error inside the window */
+	else
+		t->err_count = 1;	/* window expired: start counting afresh */
+	t->err_time = jiffies;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
+					struct sk_buff *skb)
+{	/* Carry a CE mark from the outer header over to the inner header. */
+	struct iphdr *inner_iph = ip_hdr(skb);	/* caller has already reset the network header */
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+static int ipip_rcv(struct sk_buff *skb)
+{	/* Receive handler: returns 0 when the skb was consumed, -1 when no tunnel matches. */
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph = ip_hdr(skb);	/* outer header; still used after the header reset below */
+
+	rcu_read_lock();
+	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	if (tunnel != NULL) {
+		struct pcpu_tstats *tstats;
+
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+			rcu_read_unlock();
+			kfree_skb(skb);	/* policy check failed: drop, but report as handled */
+			return 0;
+		}
+
+		secpath_reset(skb);
+
+		skb->mac_header = skb->network_header;	/* old network header offset becomes the mac header */
+		skb_reset_network_header(skb);
+		skb->protocol = htons(ETH_P_IP);
+		skb->pkt_type = PACKET_HOST;
+
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		ipip_ecn_decapsulate(iph, skb);
+
+		netif_rx(skb);	/* hand the skb to the stack; it is not touched afterwards */
+
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+
+	return -1;
+}
+
+/*
+ *	This function assumes it is being called from dev_queue_xmit()
+ *	and that skb is filled properly by that function.
+ */
+
+static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{	/* ndo_start_xmit: prepend an outer IPv4 header and send. Always NETDEV_TX_OK; failures free the skb. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct pcpu_tstats *tstats;
+	const struct iphdr  *tiph = &tunnel->parms.iph;
+	u8     tos = tunnel->parms.iph.tos;
+	__be16 df = tiph->frag_off;
+	struct rtable *rt;     			/* Route to the other host */
+	struct net_device *tdev;		/* Device to other host */
+	const struct iphdr  *old_iph = ip_hdr(skb);
+	struct iphdr  *iph;			/* Our new IP header */
+	unsigned int max_headroom;		/* The extra header space needed */
+	__be32 dst = tiph->daddr;
+	struct flowi4 fl4;
+	int    mtu;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		goto tx_error;
+
+	if (tos & 1)	/* low bit set in the configured tos: use the inner packet's tos */
+		tos = old_iph->tos;
+
+	if (!dst) {
+		/* NBMA tunnel */
+		if ((rt = skb_rtable(skb)) == NULL) {
+			dev->stats.tx_fifo_errors++;
+			goto tx_error;
+		}
+		if ((dst = rt->rt_gateway) == 0)	/* no fixed remote: use the route's gateway */
+			goto tx_error_icmp;
+	}
+
+	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
+				   dst, tiph->saddr,
+				   0, 0,
+				   IPPROTO_IPIP, RT_TOS(tos),
+				   tunnel->parms.link);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+	tdev = rt->dst.dev;
+
+	if (tdev == dev) {	/* route points back at this tunnel device */
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	df |= old_iph->frag_off & htons(IP_DF);	/* inherit DF from the inner packet */
+
+	if (df) {
+		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);	/* room left for the inner packet */
+
+		if (mtu < 68) {
+			dev->stats.collisions++;
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+
+		if (skb_dst(skb))
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+
+		if ((old_iph->frag_off & htons(IP_DF)) &&
+		    mtu < ntohs(old_iph->tot_len)) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+
+	if (tunnel->err_count > 0) {	/* errors recorded by ipip_err() */
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;	/* window expired: forget old errors */
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
+
+	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			dev->stats.tx_dropped++;
+			dev_kfree_skb(skb);
+			return NETDEV_TX_OK;
+		}
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		dev_kfree_skb(skb);
+		skb = new_skb;
+		old_iph = ip_hdr(skb);	/* re-read: the header now lives in the new buffer */
+	}
+
+	skb->transport_header = skb->network_header;	/* inner IP header becomes the transport header */
+	skb_push(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);	/* the skb now owns the rt reference */
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+
+	iph 			=	ip_hdr(skb);
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPIP;
+	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
+	iph->daddr		=	fl4.daddr;
+	iph->saddr		=	fl4.saddr;
+
+	if ((iph->ttl = tiph->ttl) == 0)	/* configured ttl 0: copy the inner ttl */
+		iph->ttl	=	old_iph->ttl;
+
+	nf_reset(skb);
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);	/* macro not defined here; presumably sends skb and updates tx stats - confirm */
+	return NETDEV_TX_OK;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev->stats.tx_errors++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static void ipip_tunnel_bind_dev(struct net_device *dev)
+{	/* Derive hard_header_len, mtu and iflink from the device the tunnel would send through. */
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph;
+
+	tunnel = netdev_priv(dev);
+	iph = &tunnel->parms.iph;
+
+	if (iph->daddr) {
+		struct rtable *rt;
+		struct flowi4 fl4;
+
+		rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
+					   iph->daddr, iph->saddr,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(iph->tos),
+					   tunnel->parms.link);
+		if (!IS_ERR(rt)) {
+			tdev = rt->dst.dev;	/* only the device is kept; the route is released */
+			ip_rt_put(rt);
+		}
+		dev->flags |= IFF_POINTOPOINT;	/* set whenever a remote is configured, even if routing failed */
+	}
+
+	if (!tdev && tunnel->parms.link)	/* fall back to the explicitly configured link */
+		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+	if (tdev) {
+		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+		dev->mtu = tdev->mtu - sizeof(struct iphdr);
+	}
+	dev->iflink = tunnel->parms.link;
+}
+
+static int
+ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{	/* ndo_do_ioctl: get/add/change/delete a tunnel; parms are exchanged via ifr_ifru.ifru_data. */
+	int err = 0;
+	struct ip_tunnel_parm p;
+	struct ip_tunnel *t;
+	struct net *net = dev_net(dev);
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		t = NULL;
+		if (dev == ipn->fb_tunnel_dev) {	/* on the fallback device, look up by user-supplied parms */
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = ipip_tunnel_locate(net, &p, 0);
+		}
+		if (t == NULL)	/* otherwise report the device's own tunnel */
+			t = netdev_priv(dev);
+		memcpy(&p, &t->parms, sizeof(p));
+		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			err = -EFAULT;
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			goto done;
+
+		err = -EINVAL;
+		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))	/* only the DF bit may be set */
+			goto done;
+		if (p.iph.ttl)	/* a fixed ttl forces DF on */
+			p.iph.frag_off |= htons(IP_DF);
+
+		t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);	/* creates only for ADD */
+
+		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				if (t->dev != dev) {	/* the addresses already belong to another tunnel */
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+					err = -EINVAL;	/* must not switch between having and lacking a remote */
+					break;
+				}
+				t = netdev_priv(dev);
+				ipip_tunnel_unlink(ipn, t);	/* rehash: unlink, update addresses, relink */
+				synchronize_net();
+				t->parms.iph.saddr = p.iph.saddr;
+				t->parms.iph.daddr = p.iph.daddr;
+				memcpy(dev->dev_addr, &p.iph.saddr, 4);
+				memcpy(dev->broadcast, &p.iph.daddr, 4);
+				ipip_tunnel_link(ipn, t);
+				netdev_state_change(dev);
+			}
+		}
+
+		if (t) {
+			err = 0;
+			if (cmd == SIOCCHGTUNNEL) {
+				t->parms.iph.ttl = p.iph.ttl;
+				t->parms.iph.tos = p.iph.tos;
+				t->parms.iph.frag_off = p.iph.frag_off;
+				if (t->parms.link != p.link) {
+					t->parms.link = p.link;
+					ipip_tunnel_bind_dev(dev);
+					netdev_state_change(dev);
+				}
+			}
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+				err = -EFAULT;
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == ipn->fb_tunnel_dev) {	/* via the fallback device: delete the tunnel named by parms */
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+				goto done;
+			err = -ENOENT;
+			if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
+				goto done;
+			err = -EPERM;
+			if (t->dev == ipn->fb_tunnel_dev)	/* the fallback device itself cannot be deleted */
+				goto done;
+			dev = t->dev;
+		}
+		unregister_netdevice(dev);
+		err = 0;
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+
+static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+	bool ok = new_mtu >= 68 && new_mtu <= 0xFFF8 - sizeof(struct iphdr);
+
+	if (ok)	/* inside [68, 0xFFF8 - one IPv4 header]: accept */
+		dev->mtu = new_mtu;
+	return ok ? 0 : -EINVAL;
+}
+
+static const struct net_device_ops ipip_netdev_ops = {	/* installed by ipip_tunnel_setup() */
+	.ndo_uninit	= ipip_tunnel_uninit,
+	.ndo_start_xmit	= ipip_tunnel_xmit,
+	.ndo_do_ioctl	= ipip_tunnel_ioctl,
+	.ndo_change_mtu	= ipip_tunnel_change_mtu,
+	.ndo_get_stats  = ipip_get_stats,	/* sums the per-cpu counters */
+};
+
+static void ipip_dev_free(struct net_device *dev)
+{	/* Device destructor (see ipip_tunnel_setup()); also called directly on setup failure. */
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
+static void ipip_tunnel_setup(struct net_device *dev)
+{	/* alloc_netdev() setup callback: install ops and defaults for an IPIP device. */
+	dev->netdev_ops		= &ipip_netdev_ops;
+	dev->destructor		= ipip_dev_free;
+
+	dev->type		= ARPHRD_TUNNEL;
+	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
+	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);	/* defaults; ipip_tunnel_bind_dev() may refine both */
+	dev->flags		= IFF_NOARP;
+	dev->iflink		= 0;
+	dev->addr_len		= 4;	/* dev_addr/broadcast hold IPv4 addresses (4-byte memcpy in ipip_tunnel_init) */
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->features		|= NETIF_F_LLTX;
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
+}
+
+static int ipip_tunnel_init(struct net_device *dev)
+{	/* Init for tunnels created by ipip_tunnel_locate(): returns 0, or -ENOMEM. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+
+	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);	/* local address as device address */
+	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);	/* remote address as broadcast */
+
+	ipip_tunnel_bind_dev(dev);
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)	/* caller cleans up through ipip_dev_free() */
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
+{	/* Init for the fallback device: publish it in the wildcard slot. 0, or -ENOMEM. */
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+	struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	iph->version		= 4;
+	iph->protocol		= IPPROTO_IPIP;
+	iph->ihl		= 5;	/* same values ipip_tunnel_ioctl() requires of user parms */
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	dev_hold(dev);	/* released by dev_put() in ipip_tunnel_uninit() */
+	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+	return 0;
+}
+
+static struct xfrm_tunnel ipip_handler __read_mostly = {	/* registered in ipip_init() */
+	.handler	=	ipip_rcv,
+	.err_handler	=	ipip_err,
+	.priority	=	1,
+};
+
+static const char banner[] __initconst =	/* printed by ipip_init() */
+	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
+
+static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
+{
+	struct ip_tunnel *tun;
+	int table, slot;
+
+	/*
+	 * Walk tables 1..3 (the hashed ones) and queue every tunnel's
+	 * device on @head for unregistration.  Table 0, the wildcard
+	 * slot, is not visited here.
+	 */
+	for (table = 1; table < 4; table++)
+		for (slot = 0; slot < HASH_SIZE; slot++)
+			for (tun = rtnl_dereference(ipn->tunnels[table][slot]);
+			     tun != NULL;
+			     tun = rtnl_dereference(tun->next))
+				unregister_netdevice_queue(tun->dev, head);
+}
+
+static int __net_init ipip_init_net(struct net *net)
+{	/* pernet init: wire up the table index and create/register the "tunl0" fallback device. */
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+	struct ip_tunnel *t;
+	int err;
+
+	ipn->tunnels[0] = ipn->tunnels_wc;	/* index matches prio computed in __ipip_bucket() */
+	ipn->tunnels[1] = ipn->tunnels_l;
+	ipn->tunnels[2] = ipn->tunnels_r;
+	ipn->tunnels[3] = ipn->tunnels_r_l;
+
+	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
+					   "tunl0",
+					   ipip_tunnel_setup);
+	if (!ipn->fb_tunnel_dev) {
+		err = -ENOMEM;
+		goto err_alloc_dev;
+	}
+	dev_net_set(ipn->fb_tunnel_dev, net);
+
+	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+	if (err)
+		goto err_reg_dev;
+
+	if ((err = register_netdev(ipn->fb_tunnel_dev)))
+		goto err_reg_dev;
+
+	t = netdev_priv(ipn->fb_tunnel_dev);
+
+	strcpy(t->parms.name, ipn->fb_tunnel_dev->name);	/* refresh the name after registration */
+	return 0;
+
+err_reg_dev:
+	ipip_dev_free(ipn->fb_tunnel_dev);	/* frees tstats (may be NULL) and the netdev */
+err_alloc_dev:
+	/* nothing */
+	return err;
+}
+
+static void __net_exit ipip_exit_net(struct net *net)
+{	/* pernet exit: unregister every tunnel device of this namespace in one batch. */
+	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+	LIST_HEAD(list);	/* devices queued for unregister_netdevice_many() */
+
+	rtnl_lock();
+	ipip_destroy_tunnels(ipn, &list);	/* hashed tunnels (tables 1..3) */
+	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);	/* plus the fallback device */
+	unregister_netdevice_many(&list);
+	rtnl_unlock();
+}
+
+static struct pernet_operations ipip_net_ops = {	/* registered in ipip_init() */
+	.init = ipip_init_net,
+	.exit = ipip_exit_net,
+	.id   = &ipip_net_id,
+	.size = sizeof(struct ipip_net),	/* size of the per-net data fetched via net_generic() */
+};
+
+static int __init ipip_init(void)
+{	/* Module init: register pernet ops, then the tunnel handler; undo the first if the second fails. */
+	int err;
+
+	printk(banner);	/* banner is a constant string that carries its own KERN_INFO level */
+
+	err = register_pernet_device(&ipip_net_ops);
+	if (err < 0)
+		return err;
+	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+	if (err < 0) {
+		unregister_pernet_device(&ipip_net_ops);
+		printk(KERN_INFO "ipip init: can't register tunnel\n");
+	}
+	return err;
+}
+
+static void __exit ipip_fini(void)
+{	/* Module exit: undo ipip_init() in reverse order. */
+	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
+		printk(KERN_INFO "ipip close: can't deregister tunnel\n");	/* logged only; teardown continues */
+
+	unregister_pernet_device(&ipip_net_ops);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 00000000..f81af8dd
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,2559 @@
+/*
+ *	IP multicast routing support for mrouted 3.6/3.8
+ *
+ *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *	  Linux Consultancy and Custom Driver Development
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *	Michael Chastain	:	Incorrect size of copying.
+ *	Alan Cox		:	Added the cache manager code
+ *	Alan Cox		:	Fixed the clone/copy bug and device race.
+ *	Mike McLagan		:	Routing by source
+ *	Malcolm Beattie		:	Buffer handling fixes.
+ *	Alexey Kuznetsov	:	Double buffer free and other fixes.
+ *	SVR Anand		:	Fixed several multicast bugs and problems.
+ *	Alexey Kuznetsov	:	Status, optimisations and more.
+ *	Brad Parker		:	Better behaviour on mrouted upcall
+ *					overflow.
+ *      Carlos Picoto           :       PIMv1 Support
+ *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
+ *					Relax this requirement to work with older peers.
+ *
+ */
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+
+/* CONFIG_IP_PIMSM is a local shorthand: set when either PIM-SM version
+ * is configured, and tested by the PIM-specific code below.
+ */
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+#define CONFIG_IP_PIMSM	1
+#endif
+
+/*
+ * One multicast routing table.
+ *
+ * list:                    linkage in net->ipv4.mr_tables (multi-table builds)
+ * net:                     owning namespace (written with write_pnet())
+ * id:                      table id given to ipmr_new_table()
+ * mroute_sk:               controlling socket set by MRT_INIT, or NULL
+ * ipmr_expire_timer:       runs ipmr_expire_process() on the unresolved queue
+ * mfc_unres_queue:         unresolved cache entries (mfc_unres_lock)
+ * mfc_cache_array:         resolved cache entries, bucketed by MFC_HASH()
+ * vif_table / maxvif:      vif slots; maxvif is one past the highest slot
+ *                          in use (see vif_add()/vif_delete())
+ * cache_resolve_queue_len: number of entries on mfc_unres_queue
+ * mroute_do_assert:        set through MRT_ASSERT (and MRT_PIM)
+ * mroute_do_pim:           set through MRT_PIM
+ * mroute_reg_vif_num:      index of the VIFF_REGISTER vif, or -1
+ */
+struct mr_table {
+	struct list_head	list;
+#ifdef CONFIG_NET_NS
+	struct net		*net;
+#endif
+	u32			id;
+	struct sock __rcu	*mroute_sk;
+	struct timer_list	ipmr_expire_timer;
+	struct list_head	mfc_unres_queue;
+	struct list_head	mfc_cache_array[MFC_LINES];
+	struct vif_device	vif_table[MAXVIFS];
+	int			maxvif;
+	atomic_t		cache_resolve_queue_len;
+	int			mroute_do_assert;
+	int			mroute_do_pim;
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+	int			mroute_reg_vif_num;
+#endif
+};
+
+/* An IPMR policy rule: carries nothing beyond the generic fib_rule. */
+struct ipmr_rule {
+	struct fib_rule		common;
+};
+
+/* Result of a rule lookup: the table chosen by ipmr_rule_action(). */
+struct ipmr_result {
+	struct mr_table		*mrt;
+};
+
+/* Big lock, protecting the vif table, the mrt cache and the mroute socket
+ * state.  Note that changes are additionally serialised via rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ *	Multicast router control variables
+ */
+
+/* True when slot _idx of _mrt's vif table has a device attached. */
+#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
+
+/* Special spinlock for the queue of unresolved entries.  It is taken
+ * with spin_lock_bh() from the cache code and with spin_trylock() from
+ * the expiry timer (ipmr_expire_process()).
+ */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to original Alan's scheme. Hash table of resolved
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
+ */
+
+/* Slab cache backing struct mfc_cache (see ipmr_cache_alloc()). */
+static struct kmem_cache *mrt_cachep __read_mostly;
+
+/* Forward declarations for functions used before they are defined. */
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static int ip_mr_forward(struct net *net, struct mr_table *mrt,
+			 struct sk_buff *skb, struct mfc_cache *cache,
+			 int local);
+static int ipmr_cache_report(struct mr_table *mrt,
+			     struct sk_buff *pkt, vifi_t vifi, int assert);
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+			      struct mfc_cache *c, struct rtmsg *rtm);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+/* Multi-table build: walk every table on net->ipv4.mr_tables. */
+#define ipmr_for_each_table(mrt, net) \
+	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+
+/*
+ * Look up the multicast routing table with the given @id in @net.
+ * Returns the table, or NULL if no table on net->ipv4.mr_tables has
+ * that id.
+ */
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	struct mr_table *match = NULL;
+	struct mr_table *mrt;
+
+	ipmr_for_each_table(mrt, net) {
+		if (mrt->id != id)
+			continue;
+		match = mrt;
+		break;
+	}
+	return match;
+}
+
+/*
+ * Resolve the multicast routing table for flow @flp4 through the IPMR
+ * policy rules of @net.
+ *
+ * On success stores the table in *@mrt and returns 0; on failure returns
+ * the negative error from fib_rules_lookup() and leaves *@mrt untouched.
+ */
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+			   struct mr_table **mrt)
+{
+	struct ipmr_result res;
+	struct fib_lookup_arg arg = { .result = &res, };
+	int err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+				   flowi4_to_flowi(flp4), 0, &arg);
+
+	if (err >= 0) {
+		*mrt = res.mrt;
+		err = 0;
+	}
+	return err;
+}
+
+/*
+ * fib_rules ->action callback: map a matched rule onto a table.
+ *
+ * Only FR_ACT_TO_TBL rules resolve to a table.  FR_ACT_UNREACHABLE gives
+ * -ENETUNREACH, FR_ACT_PROHIBIT gives -EACCES and every other action
+ * (FR_ACT_BLACKHOLE included) gives -EINVAL.  If the rule's table does
+ * not exist -EAGAIN is returned; otherwise the table is stored in the
+ * struct ipmr_result behind @arg->result and 0 is returned.
+ */
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	struct ipmr_result *out = arg->result;
+	struct mr_table *table;
+
+	if (rule->action == FR_ACT_UNREACHABLE)
+		return -ENETUNREACH;
+	if (rule->action == FR_ACT_PROHIBIT)
+		return -EACCES;
+	if (rule->action != FR_ACT_TO_TBL)
+		return -EINVAL;
+
+	table = ipmr_get_table(rule->fr_net, rule->table);
+	if (table == NULL)
+		return -EAGAIN;
+
+	out->mrt = table;
+	return 0;
+}
+
+/* fib_rules ->match callback: IPMR rules have no extra selectors, so
+ * every flow matches (always returns 1).
+ */
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	return 1;
+}
+
+/* Netlink attribute policy for IPMR rules: FRA_GENERIC_POLICY only. */
+static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
+	FRA_GENERIC_POLICY,
+};
+
+/* fib_rules ->configure callback: nothing IPMR-specific to parse, so
+ * this always succeeds (returns 0).
+ */
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+	return 0;
+}
+
+/* fib_rules ->compare callback: there are no IPMR-specific fields to
+ * compare, so this always returns 1.
+ */
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	return 1;
+}
+
+/* fib_rules ->fill callback: IPMR rules carry no address or TOS
+ * selectors, so the corresponding header fields are reported as zero.
+ * Always returns 0.
+ */
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct fib_rule_hdr *frh)
+{
+	frh->dst_len = 0;
+	frh->src_len = 0;
+	frh->tos     = 0;
+	return 0;
+}
+
+/* Template passed to fib_rules_register() by ipmr_rules_init() to set up
+ * the per-namespace IPMR rule operations.
+ */
+static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
+	.family		= RTNL_FAMILY_IPMR,
+	.rule_size	= sizeof(struct ipmr_rule),
+	.addr_size	= sizeof(u32),
+	.action		= ipmr_rule_action,
+	.match		= ipmr_rule_match,
+	.configure	= ipmr_rule_configure,
+	.compare	= ipmr_rule_compare,
+	.default_pref	= fib_default_rule_pref,
+	.fill		= ipmr_rule_fill,
+	.nlgroup	= RTNLGRP_IPV4_RULE,
+	.policy		= ipmr_rule_policy,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Per-namespace setup for the multi-table build: register the IPMR rule
+ * operations, create the default table and add the default rule that
+ * points at it.
+ *
+ * Returns 0 on success.  On failure everything done so far is undone and
+ * a negative errno is returned (-ENOMEM if the table cannot be created,
+ * otherwise the error from fib_rules_register()/fib_default_rule_add()).
+ */
+static int __net_init ipmr_rules_init(struct net *net)
+{
+	struct fib_rules_ops *ops;
+	struct mr_table *mrt;
+	int err;
+
+	ops = fib_rules_register(&ipmr_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	INIT_LIST_HEAD(&net->ipv4.mr_tables);
+
+	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+	if (err < 0)
+		goto err2;
+
+	net->ipv4.mr_rules_ops = ops;
+	return 0;
+
+err2:
+	/* ipmr_new_table() linked the table into net->ipv4.mr_tables;
+	 * unlink it first so the list never points at freed memory.
+	 */
+	list_del(&mrt->list);
+	kfree(mrt);
+err1:
+	fib_rules_unregister(ops);
+	return err;
+}
+
+/*
+ * Per-namespace teardown for the multi-table build: unlink and free every
+ * table, then unregister the rule operations.
+ *
+ * Each table's expiry timer is stopped before the table is freed;
+ * otherwise a still-pending ipmr_expire_process() would run on freed
+ * memory.
+ */
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+	struct mr_table *mrt, *next;
+
+	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
+		list_del(&mrt->list);
+		del_timer_sync(&mrt->ipmr_expire_timer);
+		kfree(mrt);
+	}
+	fib_rules_unregister(net->ipv4.mr_rules_ops);
+}
+#else
+/* Single-table build: "iterate" over the one table in net->ipv4.mrt.
+ * NOTE: the macro parameter "mrt" is also substituted into the member
+ * name in net->ipv4.mrt, so the iteration variable passed by callers
+ * must itself be called "mrt".
+ */
+#define ipmr_for_each_table(mrt, net) \
+	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
+
+/* Single-table build: @id is ignored, the namespace's only table
+ * (net->ipv4.mrt, possibly NULL) is returned.
+ */
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+	return net->ipv4.mrt;
+}
+
+/* Single-table build: no rules to consult, so @flp4 is ignored and the
+ * namespace's only table is stored in *@mrt.  Always returns 0.
+ */
+static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
+			   struct mr_table **mrt)
+{
+	*mrt = net->ipv4.mrt;
+	return 0;
+}
+
+/* Single-table build: create the namespace's only table and store it in
+ * net->ipv4.mrt.  Returns 0 on success, -ENOMEM if it cannot be created.
+ */
+static int __net_init ipmr_rules_init(struct net *net)
+{
+	struct mr_table *mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+
+	net->ipv4.mrt = mrt;
+	if (mrt == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Single-table build: free the namespace's only table.  Its expiry timer
+ * is stopped first so that a pending ipmr_expire_process() cannot run on
+ * freed memory.
+ */
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+	del_timer_sync(&net->ipv4.mrt->ipmr_expire_timer);
+	kfree(net->ipv4.mrt);
+}
+#endif
+
+/*
+ * Return the multicast routing table @id of @net, creating it if it does
+ * not exist yet.
+ *
+ * A new table starts zeroed, with empty cache lists, an (unarmed) expiry
+ * timer and, with PIM support, no register vif.  In multi-table builds it
+ * is appended to net->ipv4.mr_tables.
+ *
+ * Returns the table, or NULL if memory allocation fails.
+ */
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+	struct mr_table *tbl = ipmr_get_table(net, id);
+	unsigned int line;
+
+	if (tbl != NULL)
+		return tbl;
+
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+	if (tbl == NULL)
+		return NULL;
+
+	tbl->id = id;
+	write_pnet(&tbl->net, net);
+
+	/* Resolved entries (hash lines) and the unresolved queue */
+	INIT_LIST_HEAD(&tbl->mfc_unres_queue);
+	for (line = 0; line < MFC_LINES; line++)
+		INIT_LIST_HEAD(&tbl->mfc_cache_array[line]);
+
+	setup_timer(&tbl->ipmr_expire_timer, ipmr_expire_process,
+		    (unsigned long)tbl);
+
+#ifdef CONFIG_IP_PIMSM
+	tbl->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+	list_add_tail_rcu(&tbl->list, &net->ipv4.mr_tables);
+#endif
+	return tbl;
+}
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+/*
+ * Close the DVMRP tunnel device @dev and ask the "tunl0" device of the
+ * same namespace to delete the tunnel described by @v.
+ *
+ * The tunnel parameters are rebuilt from @v (local/remote address and
+ * the name "dvmrp<vifi>") and passed to tunl0's ndo_do_ioctl with
+ * SIOCDELTUNNEL.  Nothing is reported back; if "tunl0" or its ioctl
+ * handler is missing the tunnel is only closed.
+ */
+static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
+{
+	struct net *net = dev_net(dev);
+
+	dev_close(dev);
+
+	dev = __dev_get_by_name(net, "tunl0");
+	if (dev) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+		struct ifreq ifr;
+		struct ip_tunnel_parm p;
+
+		memset(&p, 0, sizeof(p));
+		p.iph.daddr = v->vifc_rmt_addr.s_addr;
+		p.iph.saddr = v->vifc_lcl_addr.s_addr;
+		p.iph.version = 4;
+		p.iph.ihl = 5;
+		p.iph.protocol = IPPROTO_IPIP;
+		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+		if (ops->ndo_do_ioctl) {
+			mm_segment_t oldfs = get_fs();
+
+			/* The ioctl takes a user pointer; p lives in kernel
+			 * memory, so widen the address limit for the call.
+			 */
+			set_fs(KERNEL_DS);
+			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
+			set_fs(oldfs);
+		}
+	}
+}
+
+/*
+ * Create and bring up the DVMRP tunnel device for vif @v.
+ *
+ * The tunnel is created by issuing SIOCADDTUNNEL on the namespace's
+ * "tunl0" device with parameters built from @v; the new device is named
+ * "dvmrp<vifi>".  It is then flagged IFF_MULTICAST, its IPv4 devconf has
+ * RP_FILTER cleared, and it is opened.
+ *
+ * Returns the device with a reference held (dev_hold()), or NULL if
+ * "tunl0" does not exist, the ioctl fails, or the new device cannot be
+ * found.  If the device was created but has no in_device or cannot be
+ * opened, it is unregistered again and NULL is returned.
+ */
+static
+struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+{
+	struct net_device  *dev;
+
+	dev = __dev_get_by_name(net, "tunl0");
+
+	if (dev) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+		int err;
+		struct ifreq ifr;
+		struct ip_tunnel_parm p;
+		struct in_device  *in_dev;
+
+		memset(&p, 0, sizeof(p));
+		p.iph.daddr = v->vifc_rmt_addr.s_addr;
+		p.iph.saddr = v->vifc_lcl_addr.s_addr;
+		p.iph.version = 4;
+		p.iph.ihl = 5;
+		p.iph.protocol = IPPROTO_IPIP;
+		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+		if (ops->ndo_do_ioctl) {
+			mm_segment_t oldfs = get_fs();
+
+			/* p is a kernel buffer handed over as a user pointer */
+			set_fs(KERNEL_DS);
+			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+			set_fs(oldfs);
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		/* From here on dev refers to the new tunnel, not to tunl0 */
+		dev = NULL;
+
+		if (err == 0 &&
+		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
+			dev->flags |= IFF_MULTICAST;
+
+			in_dev = __in_dev_get_rtnl(dev);
+			if (in_dev == NULL)
+				goto failure;
+
+			ipv4_devconf_setall(in_dev);
+			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+
+			if (dev_open(dev))
+				goto failure;
+			dev_hold(dev);
+		}
+	}
+	return dev;
+
+failure:
+	/* allow the register to be completed before unregistering. */
+	rtnl_unlock();
+	rtnl_lock();
+
+	unregister_netdevice(dev);
+	return NULL;
+}
+
+#ifdef CONFIG_IP_PIMSM
+
+/*
+ * Transmit hook of the PIM register device.
+ *
+ * Nothing is sent on a wire: the table for the packet is looked up, the
+ * device's tx counters are bumped and the whole packet is reported to
+ * the mroute socket as IGMPMSG_WHOLEPKT.  The skb is always freed here.
+ *
+ * Returns NETDEV_TX_OK, or the negative error from ipmr_fib_lookup()
+ * if no table could be determined.
+ */
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct mr_table *mrt;
+	struct flowi4 fl4 = {
+		.flowi4_oif	= dev->ifindex,
+		.flowi4_iif	= skb->skb_iif,
+		.flowi4_mark	= skb->mark,
+	};
+	int err;
+
+	err = ipmr_fib_lookup(net, &fl4, &mrt);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	/* ipmr_cache_report() must be called under mrt_lock */
+	read_lock(&mrt_lock);
+	dev->stats.tx_bytes += skb->len;
+	dev->stats.tx_packets++;
+	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
+	read_unlock(&mrt_lock);
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Device operations of the PIM register vif: transmit only. */
+static const struct net_device_ops reg_vif_netdev_ops = {
+	.ndo_start_xmit	= reg_vif_xmit,
+};
+
+/*
+ * alloc_netdev() setup callback for the PIM register device: an
+ * ARPHRD_PIMREG, no-ARP device that cannot be moved between namespaces
+ * and is released with free_netdev().  The MTU leaves room for an outer
+ * IP header plus 8 more bytes within an Ethernet payload.
+ */
+static void reg_vif_setup(struct net_device *dev)
+{
+	dev->type		= ARPHRD_PIMREG;
+	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
+	dev->flags		= IFF_NOARP;
+	/* This line used to end in ',' which silently chained it to the
+	 * next assignment through the comma operator.
+	 */
+	dev->netdev_ops		= &reg_vif_netdev_ops;
+	dev->destructor		= free_netdev;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+}
+
+/*
+ * Create, register and open the PIM register device for table @mrt in
+ * namespace @net.  Must be called with the RTNL lock held (it uses
+ * register_netdevice() and may drop and retake the lock on failure).
+ *
+ * The device is called "pimreg" for the default table and
+ * "pimreg<table id>" otherwise.  The name is built with snprintf():
+ * "pimreg" plus a 10-digit table id plus the terminating NUL needs 17
+ * bytes, one more than IFNAMSIZ, so an unbounded sprintf() could write
+ * past the end of the buffer.  A truncated name may collide with an
+ * existing device, in which case registration fails and NULL is
+ * returned.
+ *
+ * Returns the device with a reference held, or NULL on any failure.
+ */
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+	struct net_device *dev;
+	struct in_device *in_dev;
+	char name[IFNAMSIZ];
+
+	if (mrt->id == RT_TABLE_DEFAULT)
+		snprintf(name, sizeof(name), "pimreg");
+	else
+		snprintf(name, sizeof(name), "pimreg%u", mrt->id);
+
+	dev = alloc_netdev(0, name, reg_vif_setup);
+
+	if (dev == NULL)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	if (register_netdevice(dev)) {
+		free_netdev(dev);
+		return NULL;
+	}
+	dev->iflink = 0;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		goto failure;
+	}
+
+	/* Reverse-path filtering is switched off on the register device. */
+	ipv4_devconf_setall(in_dev);
+	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+	rcu_read_unlock();
+
+	if (dev_open(dev))
+		goto failure;
+
+	dev_hold(dev);
+
+	return dev;
+
+failure:
+	/* allow the register to be completed before unregistering. */
+	rtnl_unlock();
+	rtnl_lock();
+
+	unregister_netdevice(dev);
+	return NULL;
+}
+#endif
+
+/*
+ *	Delete a VIF entry
+ *
+ *	@mrt:    table owning the vif
+ *	@vifi:   index into mrt->vif_table
+ *	@notify: Set to 1, if the caller is a notifier_call; a
+ *		 tunnel/register device is then not unregistered here
+ *	@head:   passed to unregister_netdevice_queue() when a
+ *		 tunnel/register device is unregistered
+ *
+ *	Returns 0 on success, or -EADDRNOTAVAIL if @vifi is out of range
+ *	or the slot has no device attached.
+ */
+
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
+		      struct list_head *head)
+{
+	struct vif_device *v;
+	struct net_device *dev;
+	struct in_device *in_dev;
+
+	if (vifi < 0 || vifi >= mrt->maxvif)
+		return -EADDRNOTAVAIL;
+
+	v = &mrt->vif_table[vifi];
+
+	/* Detach the device from the slot under the write lock */
+	write_lock_bh(&mrt_lock);
+	dev = v->dev;
+	v->dev = NULL;
+
+	if (!dev) {
+		write_unlock_bh(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	}
+
+#ifdef CONFIG_IP_PIMSM
+	if (vifi == mrt->mroute_reg_vif_num)
+		mrt->mroute_reg_vif_num = -1;
+#endif
+
+	/* If the highest slot went away, shrink maxvif down to one past
+	 * the highest slot that is still in use.
+	 */
+	if (vifi + 1 == mrt->maxvif) {
+		int tmp;
+
+		for (tmp = vifi - 1; tmp >= 0; tmp--) {
+			if (VIF_EXISTS(mrt, tmp))
+				break;
+		}
+		mrt->maxvif = tmp+1;
+	}
+
+	write_unlock_bh(&mrt_lock);
+
+	/* Undo what vif_add() did to the device */
+	dev_set_allmulti(dev, -1);
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
+		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
+		ip_rt_multicast_event(in_dev);
+	}
+
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
+		unregister_netdevice_queue(dev, head);
+
+	dev_put(dev);
+	return 0;
+}
+
+/* RCU callback: return the mfc_cache containing @head to its slab. */
+static void ipmr_cache_free_rcu(struct rcu_head *head)
+{
+	struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
+	kmem_cache_free(mrt_cachep, c);
+}
+
+/* Free cache entry @c through call_rcu(), i.e. not immediately. */
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+	call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+ * and reporting error to netlink readers.
+ *
+ * The entry must already have been removed from the unresolved queue by
+ * the caller; this function only adjusts cache_resolve_queue_len,
+ * drains the entry's skb queue and frees the entry.
+ */
+
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct sk_buff *skb;
+	struct nlmsgerr *e;
+
+	atomic_dec(&mrt->cache_resolve_queue_len);
+
+	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
+		/* An IP version of 0 marks a queued netlink request rather
+		 * than a real packet: turn it into an NLMSG_ERROR reply
+		 * carrying -ETIMEDOUT and send it back to the requester.
+		 */
+		if (ip_hdr(skb)->version == 0) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+			nlh->nlmsg_type = NLMSG_ERROR;
+			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+			skb_trim(skb, nlh->nlmsg_len);
+			e = NLMSG_DATA(nlh);
+			e->error = -ETIMEDOUT;
+			memset(&e->msg, 0, sizeof(e->msg));
+
+			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+		} else {
+			kfree_skb(skb);
+		}
+	}
+
+	ipmr_cache_free(c);
+}
+
+
+/* Timer process for the unresolved queue.
+ *
+ * @arg is the struct mr_table the timer belongs to.  Entries whose
+ * expiry time has passed are destroyed; if any entries remain the timer
+ * is re-armed for the earliest remaining expiry, capped at 10 seconds.
+ */
+
+static void ipmr_expire_process(unsigned long arg)
+{
+	struct mr_table *mrt = (struct mr_table *)arg;
+	unsigned long now;
+	unsigned long expires;
+	struct mfc_cache *c, *next;
+
+	/* Do not spin in timer context: if the lock is busy, retry in
+	 * HZ/10 jiffies instead.
+	 */
+	if (!spin_trylock(&mfc_unres_lock)) {
+		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
+		return;
+	}
+
+	if (list_empty(&mrt->mfc_unres_queue))
+		goto out;
+
+	now = jiffies;
+	expires = 10*HZ;
+
+	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+		if (time_after(c->mfc_un.unres.expires, now)) {
+			/* Not expired yet: track the shortest remaining time */
+			unsigned long interval = c->mfc_un.unres.expires - now;
+			if (interval < expires)
+				expires = interval;
+			continue;
+		}
+
+		list_del(&c->list);
+		ipmr_destroy_unres(mrt, c);
+	}
+
+	if (!list_empty(&mrt->mfc_unres_queue))
+		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+
+out:
+	spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill the output interface list of @cache from @ttls.  It is called
+ * under write locked mrt_lock.
+ *
+ * All thresholds are first set to 255.  For every existing vif below
+ * mrt->maxvif whose entry in @ttls is neither 0 nor 255 the threshold is
+ * copied, and minvif/maxvif are adjusted so that [minvif, maxvif) covers
+ * all copied vifs.
+ */
+
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
+				   unsigned char *ttls)
+{
+	int vifi;
+
+	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+	cache->mfc_un.res.maxvif = 0;
+	cache->mfc_un.res.minvif = MAXVIFS;
+
+	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+		unsigned char ttl;
+
+		if (!VIF_EXISTS(mrt, vifi))
+			continue;
+
+		ttl = ttls[vifi];
+		if (ttl == 0 || ttl >= 255)
+			continue;
+
+		cache->mfc_un.res.ttls[vifi] = ttl;
+		if (vifi < cache->mfc_un.res.minvif)
+			cache->mfc_un.res.minvif = vifi;
+		if (vifi >= cache->mfc_un.res.maxvif)
+			cache->mfc_un.res.maxvif = vifi + 1;
+	}
+}
+
+/*
+ * Add the virtual interface described by @vifc to table @mrt.
+ *
+ * @mrtsock is non-zero when the request comes from the table's own
+ * mroute socket; otherwise the vif is marked VIFF_STATIC.
+ *
+ * Depending on vifc_flags the underlying device is a freshly created
+ * PIM register device (VIFF_REGISTER), a freshly created DVMRP tunnel
+ * (VIFF_TUNNEL), an existing device looked up by ifindex
+ * (VIFF_USE_IFINDEX) or by local address (0).  The device is put into
+ * all-multicast mode and its MC_FORWARDING devconf is incremented.
+ *
+ * The caller is expected to have range-checked vifc_vifi; it is used
+ * as an index into mrt->vif_table without a check here.
+ *
+ * Returns 0 on success or a negative errno: -EADDRINUSE (slot or
+ * register vif already in use), -ENOBUFS (device creation failed),
+ * -EADDRNOTAVAIL (no such device / no in_device), -EINVAL (unknown
+ * flags) or the error from dev_set_allmulti().
+ */
+static int vif_add(struct net *net, struct mr_table *mrt,
+		   struct vifctl *vifc, int mrtsock)
+{
+	int vifi = vifc->vifc_vifi;
+	struct vif_device *v = &mrt->vif_table[vifi];
+	struct net_device *dev;
+	struct in_device *in_dev;
+	int err;
+
+	/* Is vif busy ? */
+	if (VIF_EXISTS(mrt, vifi))
+		return -EADDRINUSE;
+
+	switch (vifc->vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+	case VIFF_REGISTER:
+		/*
+		 * Special Purpose VIF in PIM
+		 * All the packets will be sent to the daemon
+		 */
+		if (mrt->mroute_reg_vif_num >= 0)
+			return -EADDRINUSE;
+		dev = ipmr_reg_vif(net, mrt);
+		if (!dev)
+			return -ENOBUFS;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			unregister_netdevice(dev);
+			dev_put(dev);
+			return err;
+		}
+		break;
+#endif
+	case VIFF_TUNNEL:
+		dev = ipmr_new_tunnel(net, vifc);
+		if (!dev)
+			return -ENOBUFS;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			ipmr_del_tunnel(dev, vifc);
+			dev_put(dev);
+			return err;
+		}
+		break;
+
+	case VIFF_USE_IFINDEX:
+	case 0:
+		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+			if (dev && __in_dev_get_rtnl(dev) == NULL) {
+				dev_put(dev);
+				return -EADDRNOTAVAIL;
+			}
+		} else {
+			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+		}
+		if (!dev)
+			return -EADDRNOTAVAIL;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			dev_put(dev);
+			return err;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* NOTE(review): this error path drops the device reference but does
+	 * not undo dev_set_allmulti(dev, 1) above, nor does it remove a
+	 * tunnel/register device created above - confirm this is intended.
+	 */
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev) {
+		dev_put(dev);
+		return -EADDRNOTAVAIL;
+	}
+	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
+	ip_rt_multicast_event(in_dev);
+
+	/* Fill in the VIF structures */
+
+	v->rate_limit = vifc->vifc_rate_limit;
+	v->local = vifc->vifc_lcl_addr.s_addr;
+	v->remote = vifc->vifc_rmt_addr.s_addr;
+	v->flags = vifc->vifc_flags;
+	if (!mrtsock)
+		v->flags |= VIFF_STATIC;
+	v->threshold = vifc->vifc_threshold;
+	v->bytes_in = 0;
+	v->bytes_out = 0;
+	v->pkt_in = 0;
+	v->pkt_out = 0;
+	v->link = dev->ifindex;
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
+		v->link = dev->iflink;
+
+	/* And finish update writing critical data */
+	write_lock_bh(&mrt_lock);
+	v->dev = dev;
+#ifdef CONFIG_IP_PIMSM
+	if (v->flags & VIFF_REGISTER)
+		mrt->mroute_reg_vif_num = vifi;
+#endif
+	if (vifi+1 > mrt->maxvif)
+		mrt->maxvif = vifi+1;
+	write_unlock_bh(&mrt_lock);
+	return 0;
+}
+
+/* Find the resolved cache entry for (@origin, @mcastgrp) in @mrt.
+ * Returns the entry, or NULL if there is none.
+ *
+ * called with rcu_read_lock()
+ */
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
+					 __be32 origin,
+					 __be32 mcastgrp)
+{
+	int line = MFC_HASH(mcastgrp, origin);
+	struct mfc_cache *match = NULL;
+	struct mfc_cache *c;
+
+	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
+		if (c->mfc_origin != origin || c->mfc_mcastgrp != mcastgrp)
+			continue;
+		match = c;
+		break;
+	}
+	return match;
+}
+
+/*
+ *	Allocate a zeroed multicast cache entry for a resolved route
+ *	(GFP_KERNEL).  minvif starts at MAXVIFS, i.e. "no output vif".
+ *	Returns NULL if the allocation fails.
+ */
+static struct mfc_cache *ipmr_cache_alloc(void)
+{
+	struct mfc_cache *entry;
+
+	entry = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+	if (entry == NULL)
+		return NULL;
+
+	entry->mfc_un.res.minvif = MAXVIFS;
+	return entry;
+}
+
+/*
+ *	Allocate a zeroed cache entry for an unresolved route (GFP_ATOMIC)
+ *	with an empty skb queue and an expiry 10 seconds from now.
+ *	Returns NULL if the allocation fails.
+ */
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+	struct mfc_cache *entry;
+
+	entry = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+	if (entry == NULL)
+		return NULL;
+
+	skb_queue_head_init(&entry->mfc_un.unres.unresolved);
+	entry->mfc_un.unres.expires = jiffies + 10*HZ;
+	return entry;
+}
+
+/*
+ *	A cache entry has gone into a resolved state from queued
+ *
+ *	@uc is the unresolved entry whose queued skbs are drained,
+ *	@c the resolved entry that now answers for the same route.
+ *	Queued netlink requests are answered, queued packets forwarded.
+ */
+
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+			       struct mfc_cache *uc, struct mfc_cache *c)
+{
+	struct sk_buff *skb;
+	struct nlmsgerr *e;
+
+	/* Play the pending entries through our router */
+
+	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+		/* An IP version of 0 marks a queued netlink request */
+		if (ip_hdr(skb)->version == 0) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+
+			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
+				nlh->nlmsg_len = skb_tail_pointer(skb) -
+						 (u8 *)nlh;
+			} else {
+				/* Could not fill in the route: reply with
+				 * an NLMSG_ERROR carrying -EMSGSIZE.
+				 */
+				nlh->nlmsg_type = NLMSG_ERROR;
+				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+				skb_trim(skb, nlh->nlmsg_len);
+				e = NLMSG_DATA(nlh);
+				e->error = -EMSGSIZE;
+				memset(&e->msg, 0, sizeof(e->msg));
+			}
+
+			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+		} else {
+			ip_mr_forward(net, mrt, skb, c, 0);
+		}
+	}
+}
+
+/*
+ *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ *	expects the following bizarre scheme.
+ *
+ *	@pkt is the packet that triggered the report, @vifi the vif it
+ *	is reported for and @assert the message type (IGMPMSG_*).
+ *	@pkt itself is not consumed.
+ *
+ *	Returns -ENOBUFS if no skb could be allocated, -EINVAL if the
+ *	table has no mroute socket, otherwise the result of
+ *	sock_queue_rcv_skb().
+ *
+ *	Called under mrt_lock.
+ */
+
+static int ipmr_cache_report(struct mr_table *mrt,
+			     struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+	struct sk_buff *skb;
+	const int ihl = ip_hdrlen(pkt);
+	struct igmphdr *igmp;
+	struct igmpmsg *msg;
+	struct sock *mroute_sk;
+	int ret;
+
+	/* IGMPMSG_WHOLEPKT carries the complete packet, so copy it with
+	 * headroom for one more IP header; every other report is a small
+	 * freshly allocated message.
+	 */
+#ifdef CONFIG_IP_PIMSM
+	if (assert == IGMPMSG_WHOLEPKT)
+		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+	else
+#endif
+		skb = alloc_skb(128, GFP_ATOMIC);
+
+	if (!skb)
+		return -ENOBUFS;
+
+#ifdef CONFIG_IP_PIMSM
+	if (assert == IGMPMSG_WHOLEPKT) {
+		/* Ugly, but we have no choice with this interface.
+		 * Duplicate old header, fix ihl, length etc.
+		 * And all this only to mangle msg->im_msgtype and
+		 * to set msg->im_mbz to "mbz" :-)
+		 */
+		skb_push(skb, sizeof(struct iphdr));
+		skb_reset_network_header(skb);
+		skb_reset_transport_header(skb);
+		msg = (struct igmpmsg *)skb_network_header(skb);
+		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
+		msg->im_msgtype = IGMPMSG_WHOLEPKT;
+		msg->im_mbz = 0;
+		msg->im_vif = mrt->mroute_reg_vif_num;
+		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
+		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
+					     sizeof(struct iphdr));
+	} else
+#endif
+	{
+
+	/* Copy the IP header */
+
+	skb->network_header = skb->tail;
+	skb_put(skb, ihl);
+	skb_copy_to_linear_data(skb, pkt->data, ihl);
+	ip_hdr(skb)->protocol = 0;	/* Flag to the kernel this is a route add */
+	msg = (struct igmpmsg *)skb_network_header(skb);
+	msg->im_vif = vifi;
+	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+
+	/* Add our header */
+
+	igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+	igmp->type	=
+	msg->im_msgtype = assert;
+	igmp->code	= 0;
+	ip_hdr(skb)->tot_len = htons(skb->len);		/* Fix the length */
+	skb->transport_header = skb->network_header;
+	}
+
+	rcu_read_lock();
+	mroute_sk = rcu_dereference(mrt->mroute_sk);
+	if (mroute_sk == NULL) {
+		rcu_read_unlock();
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	/* Deliver to mrouted */
+
+	ret = sock_queue_rcv_skb(mroute_sk, skb);
+	rcu_read_unlock();
+	if (ret < 0) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+		kfree_skb(skb);
+	}
+
+	return ret;
+}
+
+/*
+ *	Queue a packet for resolution. It gets locked cache entry!
+ *
+ *	If no unresolved entry exists for the packet's (source, group)
+ *	pair, one is created (at most 10 per table) and the packet is
+ *	reported to the mroute socket as IGMPMSG_NOCACHE.  The packet
+ *	is then appended to the entry's queue unless more than 3
+ *	packets are already waiting.
+ *
+ *	@skb is consumed in every case.  Returns 0 if it was queued,
+ *	-ENOBUFS if it was dropped for lack of room, or the error from
+ *	ipmr_cache_report().
+ */
+
+static int
+ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
+{
+	bool found = false;
+	int err;
+	struct mfc_cache *c;
+	const struct iphdr *iph = ip_hdr(skb);
+
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
+		if (c->mfc_mcastgrp == iph->daddr &&
+		    c->mfc_origin == iph->saddr) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		/* Create a new entry if allowable */
+
+		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+		    (c = ipmr_cache_alloc_unres()) == NULL) {
+			spin_unlock_bh(&mfc_unres_lock);
+
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+
+		/* Fill in the new cache entry */
+
+		c->mfc_parent	= -1;
+		c->mfc_origin	= iph->saddr;
+		c->mfc_mcastgrp	= iph->daddr;
+
+		/* Reflect first query at mrouted. */
+
+		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
+		if (err < 0) {
+			/* If the report failed throw the cache entry
+			   out - Brad Parker
+			 */
+			spin_unlock_bh(&mfc_unres_lock);
+
+			ipmr_cache_free(c);
+			kfree_skb(skb);
+			return err;
+		}
+
+		atomic_inc(&mrt->cache_resolve_queue_len);
+		list_add(&c->list, &mrt->mfc_unres_queue);
+
+		/* First entry on the queue: start the expiry timer */
+		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
+	}
+
+	/* See if we can append the packet */
+
+	if (c->mfc_un.unres.unresolved.qlen > 3) {
+		kfree_skb(skb);
+		err = -ENOBUFS;
+	} else {
+		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+		err = 0;
+	}
+
+	spin_unlock_bh(&mfc_unres_lock);
+	return err;
+}
+
+/*
+ *	MFC cache manipulation by user space mroute daemon
+ */
+
+/*
+ * Remove the resolved cache entry matching the origin and group in @mfc
+ * from @mrt.  The entry is unlinked with list_del_rcu() and freed via
+ * ipmr_cache_free().
+ *
+ * Returns 0 if an entry was removed, -ENOENT if none matched.
+ */
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
+{
+	__be32 origin = mfc->mfcc_origin.s_addr;
+	__be32 group = mfc->mfcc_mcastgrp.s_addr;
+	struct mfc_cache *c, *next;
+	int line;
+
+	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
+		if (c->mfc_origin != origin || c->mfc_mcastgrp != group)
+			continue;
+
+		list_del_rcu(&c->list);
+		ipmr_cache_free(c);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+/*
+ * Add or update the resolved cache entry described by @mfc in @mrt.
+ *
+ * @mrtsock is non-zero when the request comes from the table's own
+ * mroute socket; otherwise the entry is marked MFC_STATIC.
+ *
+ * An existing entry for the same (origin, group) has its parent and
+ * thresholds updated in place.  Otherwise a new entry is created and,
+ * if an unresolved entry for the same pair was waiting, its queued
+ * packets are played through the new entry.
+ *
+ * Returns 0 on success, -ENFILE if mfcc_parent is out of range,
+ * -EINVAL if a new entry's group is not a multicast address, or
+ * -ENOMEM if allocation fails.
+ */
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+			struct mfcctl *mfc, int mrtsock)
+{
+	bool found = false;
+	int line;
+	struct mfc_cache *uc, *c;
+
+	if (mfc->mfcc_parent >= MAXVIFS)
+		return -ENFILE;
+
+	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+			found = true;
+			break;
+		}
+	}
+
+	if (found) {
+		/* Update the existing entry in place */
+		write_lock_bh(&mrt_lock);
+		c->mfc_parent = mfc->mfcc_parent;
+		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+		if (!mrtsock)
+			c->mfc_flags |= MFC_STATIC;
+		write_unlock_bh(&mrt_lock);
+		return 0;
+	}
+
+	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
+		return -EINVAL;
+
+	c = ipmr_cache_alloc();
+	if (c == NULL)
+		return -ENOMEM;
+
+	c->mfc_origin = mfc->mfcc_origin.s_addr;
+	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
+	c->mfc_parent = mfc->mfcc_parent;
+	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+	if (!mrtsock)
+		c->mfc_flags |= MFC_STATIC;
+
+	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
+
+	/*
+	 *	Check to see if we resolved a queued list. If so we
+	 *	need to send on the frames and tidy up.
+	 */
+	found = false;
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
+		if (uc->mfc_origin == c->mfc_origin &&
+		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+			list_del(&uc->list);
+			atomic_dec(&mrt->cache_resolve_queue_len);
+			found = true;
+			break;
+		}
+	}
+	/* Nothing left to expire: stop the timer */
+	if (list_empty(&mrt->mfc_unres_queue))
+		del_timer(&mrt->ipmr_expire_timer);
+	spin_unlock_bh(&mfc_unres_lock);
+
+	if (found) {
+		ipmr_cache_resolve(net, mrt, uc, c);
+		ipmr_cache_free(uc);
+	}
+	return 0;
+}
+
+/*
+ *	Close the multicast socket, and clear the vif tables etc
+ *
+ *	Removes every vif and resolved cache entry of @mrt that is not
+ *	marked static (VIFF_STATIC / MFC_STATIC) and destroys all
+ *	unresolved entries.
+ */
+
+static void mroute_clean_tables(struct mr_table *mrt)
+{
+	int i;
+	LIST_HEAD(list);
+	struct mfc_cache *c, *next;
+
+	/* Shut down all active vif entries */
+
+	for (i = 0; i < mrt->maxvif; i++) {
+		if (!(mrt->vif_table[i].flags & VIFF_STATIC))
+			vif_delete(mrt, i, 0, &list);
+	}
+	/* Unregister the tunnel/register devices queued by vif_delete() */
+	unregister_netdevice_many(&list);
+
+	/* Wipe the cache */
+
+	for (i = 0; i < MFC_LINES; i++) {
+		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
+			if (c->mfc_flags & MFC_STATIC)
+				continue;
+			list_del_rcu(&c->list);
+			ipmr_cache_free(c);
+		}
+	}
+
+	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+		spin_lock_bh(&mfc_unres_lock);
+		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+			list_del(&c->list);
+			ipmr_destroy_unres(mrt, c);
+		}
+		spin_unlock_bh(&mfc_unres_lock);
+	}
+}
+
+/* Destructor installed through ip_ra_control() by MRT_INIT: detach @sk
+ * from every table it controls and clean those tables.
+ *
+ * called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
+static void mrtsock_destruct(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+
+	rtnl_lock();
+	ipmr_for_each_table(mrt, net) {
+		if (sk != rtnl_dereference(mrt->mroute_sk))
+			continue;
+
+		IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+		rcu_assign_pointer(mrt->mroute_sk, NULL);
+		mroute_clean_tables(mrt);
+	}
+	rtnl_unlock();
+}
+
+/*
+ *	Socket options and virtual interface manipulation. The whole
+ *	virtual interface system is a complete heap, but unfortunately
+ *	that's how BSD mrouted happens to think. Maybe one day with a proper
+ *	MOSPF/PIM router set up we can clean this up.
+ *
+ *	@sk must be a raw IGMP socket: the table is selected through
+ *	raw_sk(sk)->ipmr_table, which only exists on raw sockets, so
+ *	any other socket is refused with -EOPNOTSUPP before that field
+ *	is touched.
+ *
+ *	Except for MRT_INIT, the caller must either own the table's
+ *	mroute socket or have CAP_NET_ADMIN (-EACCES otherwise).
+ *
+ *	Returns 0 on success or a negative errno; unknown options give
+ *	-ENOPROTOOPT.
+ */
+
+int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+{
+	int ret;
+	struct vifctl vif;
+	struct mfcctl mfc;
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+
+	/* raw_sk() is only meaningful on a raw socket; on any other socket
+	 * type it would read (and, for MRT_TABLE, write) unrelated memory.
+	 */
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
+		return -EOPNOTSUPP;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	if (optname != MRT_INIT) {
+		if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
+		    !capable(CAP_NET_ADMIN))
+			return -EACCES;
+	}
+
+	switch (optname) {
+	case MRT_INIT:
+		if (optlen != sizeof(int))
+			return -ENOPROTOOPT;
+
+		rtnl_lock();
+		if (rtnl_dereference(mrt->mroute_sk)) {
+			rtnl_unlock();
+			return -EADDRINUSE;
+		}
+
+		ret = ip_ra_control(sk, 1, mrtsock_destruct);
+		if (ret == 0) {
+			rcu_assign_pointer(mrt->mroute_sk, sk);
+			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
+		}
+		rtnl_unlock();
+		return ret;
+	case MRT_DONE:
+		if (sk != rcu_dereference_raw(mrt->mroute_sk))
+			return -EACCES;
+		return ip_ra_control(sk, 0, NULL);
+	case MRT_ADD_VIF:
+	case MRT_DEL_VIF:
+		if (optlen != sizeof(vif))
+			return -EINVAL;
+		if (copy_from_user(&vif, optval, sizeof(vif)))
+			return -EFAULT;
+		if (vif.vifc_vifi >= MAXVIFS)
+			return -ENFILE;
+		rtnl_lock();
+		if (optname == MRT_ADD_VIF) {
+			ret = vif_add(net, mrt, &vif,
+				      sk == rtnl_dereference(mrt->mroute_sk));
+		} else {
+			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
+		}
+		rtnl_unlock();
+		return ret;
+
+		/*
+		 *	Manipulate the forwarding caches. These live
+		 *	in a sort of kernel/user symbiosis.
+		 */
+	case MRT_ADD_MFC:
+	case MRT_DEL_MFC:
+		if (optlen != sizeof(mfc))
+			return -EINVAL;
+		if (copy_from_user(&mfc, optval, sizeof(mfc)))
+			return -EFAULT;
+		rtnl_lock();
+		if (optname == MRT_DEL_MFC)
+			ret = ipmr_mfc_delete(mrt, &mfc);
+		else
+			ret = ipmr_mfc_add(net, mrt, &mfc,
+					   sk == rtnl_dereference(mrt->mroute_sk));
+		rtnl_unlock();
+		return ret;
+		/*
+		 *	Control PIM assert.
+		 */
+	case MRT_ASSERT:
+	{
+		int v;
+
+		/* Do not read an int the caller did not supply */
+		if (optlen < sizeof(v))
+			return -EINVAL;
+		if (get_user(v, (int __user *)optval))
+			return -EFAULT;
+		mrt->mroute_do_assert = (v) ? 1 : 0;
+		return 0;
+	}
+#ifdef CONFIG_IP_PIMSM
+	case MRT_PIM:
+	{
+		int v;
+
+		/* Do not read an int the caller did not supply */
+		if (optlen < sizeof(v))
+			return -EINVAL;
+		if (get_user(v, (int __user *)optval))
+			return -EFAULT;
+		v = (v) ? 1 : 0;
+
+		rtnl_lock();
+		ret = 0;
+		if (v != mrt->mroute_do_pim) {
+			mrt->mroute_do_pim = v;
+			mrt->mroute_do_assert = v;
+		}
+		rtnl_unlock();
+		return ret;
+	}
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+	case MRT_TABLE:
+	{
+		u32 v;
+
+		if (optlen != sizeof(u32))
+			return -EINVAL;
+		if (get_user(v, (u32 __user *)optval))
+			return -EFAULT;
+
+		rtnl_lock();
+		ret = 0;
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
+			ret = -EBUSY;
+		} else {
+			/* Only switch the socket to the new table if that
+			 * table actually exists now; otherwise every later
+			 * call on this socket would fail with -ENOENT.
+			 */
+			if (!ipmr_new_table(net, v))
+				ret = -ENOMEM;
+			else
+				raw_sk(sk)->ipmr_table = v;
+		}
+		rtnl_unlock();
+		return ret;
+	}
+#endif
+	/*
+	 *	Spurious command, or MRT_VERSION which you cannot
+	 *	set.
+	 */
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ *	Getsock opt support for the multicast routing system.
+ *
+ *	Supports MRT_VERSION, MRT_ASSERT and (with PIM) MRT_PIM.  At
+ *	most sizeof(int) bytes are copied out and the length actually
+ *	used is written back to @optlen.
+ *
+ *	@sk must be a raw IGMP socket, because the table is selected
+ *	through raw_sk(sk)->ipmr_table (-EOPNOTSUPP otherwise).
+ *
+ *	Returns 0 on success, -ENOENT if the socket's table does not
+ *	exist, -ENOPROTOOPT for other options, -EINVAL for a negative
+ *	length and -EFAULT on a bad user pointer.
+ */
+
+int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
+{
+	int olr;
+	int val;
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+
+	/* raw_sk() is only meaningful on a raw socket */
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
+		return -EOPNOTSUPP;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	if (optname != MRT_VERSION &&
+#ifdef CONFIG_IP_PIMSM
+	   optname != MRT_PIM &&
+#endif
+	   optname != MRT_ASSERT)
+		return -ENOPROTOOPT;
+
+	if (get_user(olr, optlen))
+		return -EFAULT;
+
+	/* Reject a negative length before clamping: after the unsigned
+	 * min_t() below the value can no longer be negative, so a check
+	 * placed after it would never fire.
+	 */
+	if (olr < 0)
+		return -EINVAL;
+	olr = min_t(unsigned int, olr, sizeof(int));
+
+	if (put_user(olr, optlen))
+		return -EFAULT;
+	if (optname == MRT_VERSION)
+		val = 0x0305;
+#ifdef CONFIG_IP_PIMSM
+	else if (optname == MRT_PIM)
+		val = mrt->mroute_do_pim;
+#endif
+	else
+		val = mrt->mroute_do_assert;
+	if (copy_to_user(optval, &val, olr))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ *	The IP multicast ioctl support routines.
+ *
+ *	SIOCGETVIFCNT reports the packet/byte counters of one vif,
+ *	SIOCGETSGCNT those of one (source, group) cache entry.  Any other
+ *	command yields -ENOIOCTLCMD.
+ */
+
+int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+	struct sioc_vif_req vr;
+	struct sioc_sg_req sr;
+	struct vif_device *vif;
+	struct mfc_cache *c;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	switch (cmd) {
+	case SIOCGETVIFCNT:
+		if (copy_from_user(&vr, arg, sizeof(vr)))
+			return -EFAULT;
+		if (vr.vifi >= mrt->maxvif)
+			return -EINVAL;
+
+		read_lock(&mrt_lock);
+		vif = &mrt->vif_table[vr.vifi];
+		if (!VIF_EXISTS(mrt, vr.vifi)) {
+			read_unlock(&mrt_lock);
+			return -EADDRNOTAVAIL;
+		}
+		/* Snapshot the counters under the lock, copy out afterwards. */
+		vr.icount = vif->pkt_in;
+		vr.ocount = vif->pkt_out;
+		vr.ibytes = vif->bytes_in;
+		vr.obytes = vif->bytes_out;
+		read_unlock(&mrt_lock);
+
+		return copy_to_user(arg, &vr, sizeof(vr)) ? -EFAULT : 0;
+
+	case SIOCGETSGCNT:
+		if (copy_from_user(&sr, arg, sizeof(sr)))
+			return -EFAULT;
+
+		rcu_read_lock();
+		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+		if (!c) {
+			rcu_read_unlock();
+			return -EADDRNOTAVAIL;
+		}
+		/* Snapshot the counters inside the RCU section. */
+		sr.pktcnt = c->mfc_un.res.pkt;
+		sr.bytecnt = c->mfc_un.res.bytes;
+		sr.wrong_if = c->mfc_un.res.wrong_if;
+		rcu_read_unlock();
+
+		return copy_to_user(arg, &sr, sizeof(sr)) ? -EFAULT : 0;
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Request layout used by ipmr_compat_ioctl() for SIOCGETSGCNT: same fields
+ * as the struct sioc_sg_req used by ipmr_ioctl(), with the counters declared
+ * as compat_ulong_t.
+ */
+struct compat_sioc_sg_req {
+	struct in_addr src;		/* source address of the (S,G) entry */
+	struct in_addr grp;		/* multicast group of the (S,G) entry */
+	compat_ulong_t pktcnt;		/* out: packets counted for the entry */
+	compat_ulong_t bytecnt;		/* out: bytes counted for the entry */
+	compat_ulong_t wrong_if;	/* out: wrong-interface arrivals */
+};
+
+/*
+ * Request layout used by ipmr_compat_ioctl() for SIOCGETVIFCNT: same fields
+ * as the struct sioc_vif_req used by ipmr_ioctl(), with the counters declared
+ * as compat_ulong_t.
+ */
+struct compat_sioc_vif_req {
+	vifi_t	vifi;		/* Which iface */
+	compat_ulong_t icount;	/* out: packets received on the vif */
+	compat_ulong_t ocount;	/* out: packets sent on the vif */
+	compat_ulong_t ibytes;	/* out: bytes received on the vif */
+	compat_ulong_t obytes;	/* out: bytes sent on the vif */
+};
+
+/*
+ * Compat counterpart of ipmr_ioctl(): identical logic, but the request
+ * structures exchanged with user space use the compat layouts above.
+ */
+int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+	struct net *net = sock_net(sk);
+	struct mr_table *mrt;
+	struct compat_sioc_vif_req vr;
+	struct compat_sioc_sg_req sr;
+	struct vif_device *vif;
+	struct mfc_cache *c;
+
+	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+	if (!mrt)
+		return -ENOENT;
+
+	switch (cmd) {
+	case SIOCGETVIFCNT:
+		if (copy_from_user(&vr, arg, sizeof(vr)))
+			return -EFAULT;
+		if (vr.vifi >= mrt->maxvif)
+			return -EINVAL;
+
+		read_lock(&mrt_lock);
+		vif = &mrt->vif_table[vr.vifi];
+		if (!VIF_EXISTS(mrt, vr.vifi)) {
+			read_unlock(&mrt_lock);
+			return -EADDRNOTAVAIL;
+		}
+		/* Snapshot the counters under the lock, copy out afterwards. */
+		vr.icount = vif->pkt_in;
+		vr.ocount = vif->pkt_out;
+		vr.ibytes = vif->bytes_in;
+		vr.obytes = vif->bytes_out;
+		read_unlock(&mrt_lock);
+
+		return copy_to_user(arg, &vr, sizeof(vr)) ? -EFAULT : 0;
+
+	case SIOCGETSGCNT:
+		if (copy_from_user(&sr, arg, sizeof(sr)))
+			return -EFAULT;
+
+		rcu_read_lock();
+		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+		if (!c) {
+			rcu_read_unlock();
+			return -EADDRNOTAVAIL;
+		}
+		/* Snapshot the counters inside the RCU section. */
+		sr.pktcnt = c->mfc_un.res.pkt;
+		sr.bytecnt = c->mfc_un.res.bytes;
+		sr.wrong_if = c->mfc_un.res.wrong_if;
+		rcu_read_unlock();
+
+		return copy_to_user(arg, &sr, sizeof(sr)) ? -EFAULT : 0;
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+
+/*
+ * Netdevice notifier: when a device is unregistered, delete every vif in
+ * every table of the device's namespace that still points at it.  Devices
+ * collected on the local list by vif_delete() are then unregistered in one
+ * batch.  Always returns NOTIFY_DONE.
+ */
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+	struct mr_table *mrt;
+	LIST_HEAD(list);
+	int vifi;
+
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	ipmr_for_each_table(mrt, net) {
+		for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+			if (mrt->vif_table[vifi].dev != dev)
+				continue;
+			vif_delete(mrt, vifi, 1, &list);
+		}
+	}
+	unregister_netdevice_many(&list);
+	return NOTIFY_DONE;
+}
+
+
+/* Notifier block registered from ip_mr_init(); dispatches to ipmr_device_event(). */
+static struct notifier_block ip_mr_notifier = {
+	.notifier_call = ipmr_device_event,
+};
+
+/*
+ *	Encapsulate a packet by attaching a valid IPIP header to it.
+ *	This avoids tunnel drivers and other mess and gives us the speed so
+ *	important for multicast video.
+ *
+ *	@skb:   packet to encapsulate; sizeof(struct iphdr) bytes are pushed
+ *		in front of it (ipmr_queue_xmit() reserves them via skb_cow())
+ *	@saddr: source address written into the outer header
+ *	@daddr: destination address written into the outer header
+ */
+
+static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+{
+	struct iphdr *iph;
+	const struct iphdr *old_iph = ip_hdr(skb);
+
+	/* Prepend the outer header; the original IP header becomes the
+	 * transport header of the resulting packet.
+	 */
+	skb_push(skb, sizeof(struct iphdr));
+	skb->transport_header = skb->network_header;
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+
+	/* TOS and TTL are inherited from the inner header. */
+	iph->version	=	4;
+	iph->tos	=	old_iph->tos;
+	iph->ttl	=	old_iph->ttl;
+	iph->frag_off	=	0;
+	iph->daddr	=	daddr;
+	iph->saddr	=	saddr;
+	iph->protocol	=	IPPROTO_IPIP;
+	iph->ihl	=	5;
+	/* skb->len already includes the header pushed above. */
+	iph->tot_len	=	htons(skb->len);
+	ip_select_ident(iph, skb_dst(skb), NULL);
+	ip_send_check(iph);
+
+	/* The outer header carries no IP options. */
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	nf_reset(skb);
+}
+
+/*
+ * Final step of multicast forwarding (passed to NF_HOOK as the okfn):
+ * account the forwarded datagram, process IP options if the packet has
+ * any, and hand the packet to the output path of its dst.
+ */
+static inline int ipmr_forward_finish(struct sk_buff *skb)
+{
+	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+	if (unlikely(IPCB(skb)->opt.optlen))
+		ip_forward_options(skb);
+
+	return dst_output(skb);
+}
+
+/*
+ *	Processing handlers for ipmr_forward
+ */
+
+/*
+ * Transmit one copy of @skb on virtual interface @vifi of table @mrt.
+ *
+ * @skb is consumed on every path: it is either passed on through NF_HOOK
+ * or freed at out_free.  @c is not referenced in this function body.
+ */
+static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
+			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct vif_device *vif = &mrt->vif_table[vifi];
+	struct net_device *dev;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	int    encap = 0;
+
+	if (vif->dev == NULL)
+		goto out_free;
+
+#ifdef CONFIG_IP_PIMSM
+	/* Register vif: the packet is not transmitted; it is reported via
+	 * ipmr_cache_report() as IGMPMSG_WHOLEPKT and then dropped here.
+	 */
+	if (vif->flags & VIFF_REGISTER) {
+		vif->pkt_out++;
+		vif->bytes_out += skb->len;
+		vif->dev->stats.tx_bytes += skb->len;
+		vif->dev->stats.tx_packets++;
+		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
+		goto out_free;
+	}
+#endif
+
+	/* Tunnel vifs route towards the tunnel endpoint and need room for an
+	 * extra IP header; other vifs route towards the packet's own daddr.
+	 */
+	if (vif->flags & VIFF_TUNNEL) {
+		rt = ip_route_output_ports(net, &fl4, NULL,
+					   vif->remote, vif->local,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(iph->tos), vif->link);
+		if (IS_ERR(rt))
+			goto out_free;
+		encap = sizeof(struct iphdr);
+	} else {
+		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(iph->tos), vif->link);
+		if (IS_ERR(rt))
+			goto out_free;
+	}
+
+	dev = rt->dst.dev;
+
+	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+		/* Do not fragment multicasts. Alas, IPv4 does not
+		 * allow to send ICMP, so that packets will disappear
+		 * to blackhole.
+		 */
+
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+		ip_rt_put(rt);
+		goto out_free;
+	}
+
+	/* From here on encap is the total headroom needed in front of the
+	 * packet: link-layer space plus any dst header, plus the tunnel
+	 * header counted above.
+	 */
+	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
+
+	if (skb_cow(skb, encap)) {
+		ip_rt_put(rt);
+		goto out_free;
+	}
+
+	vif->pkt_out++;
+	vif->bytes_out += skb->len;
+
+	/* Replace the skb's dst with the output route looked up above. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	ip_decrease_ttl(ip_hdr(skb));
+
+	/* FIXME: forward and output firewalls used to be called here.
+	 * What do we do with netfilter? -- RR
+	 */
+	if (vif->flags & VIFF_TUNNEL) {
+		ip_encap(skb, vif->local, vif->remote);
+		/* FIXME: extra output firewall step used to be here. --RR */
+		vif->dev->stats.tx_packets++;
+		vif->dev->stats.tx_bytes += skb->len;
+	}
+
+	/* ip_mr_input() checks this flag to avoid forwarding a second time. */
+	IPCB(skb)->flags |= IPSKB_FORWARDED;
+
+	/*
+	 * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+	 * not only before forwarding, but after forwarding on all output
+	 * interfaces. It is clear, if mrouter runs a multicasting
+	 * program, it should receive packets not depending to what interface
+	 * program is joined.
+	 * If we will not make it, the program will have to join on all
+	 * interfaces. On the other hand, multihoming host (or router, but
+	 * not mrouter) cannot join to more than one interface - it will
+	 * result in receiving multiple packets.
+	 */
+	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
+		ipmr_forward_finish);
+	return;
+
+out_free:
+	kfree_skb(skb);
+}
+
+/*
+ * Return the highest vif index in @mrt whose device is @dev, or -1 when
+ * no vif of the table uses that device.
+ */
+static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
+{
+	int vifi = mrt->maxvif;
+
+	while (--vifi >= 0) {
+		if (mrt->vif_table[vifi].dev == dev)
+			return vifi;
+	}
+	return -1;
+}
+
+/*
+ * Forward @skb according to the resolved cache entry @cache.
+ *
+ * "local" means that we should preserve one skb (for local delivery):
+ * when @local is nonzero only clones are transmitted and @skb itself is
+ * left to the caller; otherwise @skb is either transmitted or freed here.
+ * Always returns 0.
+ */
+
+static int ip_mr_forward(struct net *net, struct mr_table *mrt,
+			 struct sk_buff *skb, struct mfc_cache *cache,
+			 int local)
+{
+	int psend = -1;
+	int vif, ct;
+
+	vif = cache->mfc_parent;
+	cache->mfc_un.res.pkt++;
+	cache->mfc_un.res.bytes += skb->len;
+
+	/*
+	 * Wrong interface: drop packet and (maybe) send PIM assert.
+	 */
+	if (mrt->vif_table[vif].dev != skb->dev) {
+		int true_vifi;
+
+		if (rt_is_output_route(skb_rtable(skb))) {
+			/* It is our own packet, looped back.
+			 * Very complicated situation...
+			 *
+			 * The best workaround until routing daemons will be
+			 * fixed is not to redistribute packet, if it was
+			 * send through wrong interface. It means, that
+			 * multicast applications WILL NOT work for
+			 * (S,G), which have default multicast route pointing
+			 * to wrong oif. In any case, it is not a good
+			 * idea to use multicasting applications on router.
+			 */
+			goto dont_forward;
+		}
+
+		cache->mfc_un.res.wrong_if++;
+		true_vifi = ipmr_find_vif(mrt, skb->dev);
+
+		/* The report is rate limited: at most one per
+		 * MFC_ASSERT_THRESH jiffies for this cache entry.
+		 */
+		if (true_vifi >= 0 && mrt->mroute_do_assert &&
+		    /* pimsm uses asserts, when switching from RPT to SPT,
+		     * so that we cannot check that packet arrived on an oif.
+		     * It is bad, but otherwise we would need to move pretty
+		     * large chunk of pimd to kernel. Ough... --ANK
+		     */
+		    (mrt->mroute_do_pim ||
+		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
+		    time_after(jiffies,
+			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+			cache->mfc_un.res.last_assert = jiffies;
+			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+		}
+		goto dont_forward;
+	}
+
+	mrt->vif_table[vif].pkt_in++;
+	mrt->vif_table[vif].bytes_in += skb->len;
+
+	/*
+	 *	Forward the frame
+	 *
+	 *	Transmission is deferred by one step: psend remembers the
+	 *	previous eligible vif, so every vif but the last one found
+	 *	gets a clone and the last one can take the original skb.
+	 */
+	for (ct = cache->mfc_un.res.maxvif - 1;
+	     ct >= cache->mfc_un.res.minvif; ct--) {
+		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+			if (psend != -1) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+				if (skb2)
+					ipmr_queue_xmit(net, mrt, skb2, cache,
+							psend);
+			}
+			psend = ct;
+		}
+	}
+	if (psend != -1) {
+		if (local) {
+			/* The caller still needs skb: send a clone instead. */
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+			if (skb2)
+				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
+		} else {
+			/* ipmr_queue_xmit() consumes skb. */
+			ipmr_queue_xmit(net, mrt, skb, cache, psend);
+			return 0;
+		}
+	}
+
+dont_forward:
+	if (!local)
+		kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Find the multicast routing table responsible for @skb.  The lookup key
+ * is built from the packet's IP header and from the route attached to the
+ * skb.  Returns the table, or an ERR_PTR() carrying the error reported by
+ * ipmr_fib_lookup().
+ */
+static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct iphdr *iph = ip_hdr(skb);
+	struct mr_table *mrt;
+	struct flowi4 fl4;
+	int err;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = iph->daddr;
+	fl4.saddr = iph->saddr;
+	fl4.flowi4_tos = RT_TOS(iph->tos);
+	fl4.flowi4_oif = rt->rt_oif;
+	fl4.flowi4_iif = rt->rt_iif;
+	fl4.flowi4_mark = rt->rt_mark;
+
+	err = ipmr_fib_lookup(net, &fl4, &mrt);
+	return err ? ERR_PTR(err) : mrt;
+}
+
+/*
+ *	Multicast packets for forwarding arrive here
+ *	Called with rcu_read_lock();
+ *
+ *	The skb is consumed on every path: delivered locally, handed to the
+ *	mroute socket or the router-alert chain, queued as unresolved,
+ *	forwarded, or freed.
+ */
+
+int ip_mr_input(struct sk_buff *skb)
+{
+	struct mfc_cache *cache;
+	struct net *net = dev_net(skb->dev);
+	/* Nonzero when the route says the packet is also for this host. */
+	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+	struct mr_table *mrt;
+
+	/* Packet is looped back after forward, it should not be
+	 * forwarded second time, but still can be delivered locally.
+	 */
+	if (IPCB(skb)->flags & IPSKB_FORWARDED)
+		goto dont_forward;
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (IS_ERR(mrt)) {
+		kfree_skb(skb);
+		return PTR_ERR(mrt);
+	}
+	if (!local) {
+		if (IPCB(skb)->opt.router_alert) {
+			/* A nonzero return is treated as "skb taken over". */
+			if (ip_call_ra_chain(skb))
+				return 0;
+		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+			/* IGMPv1 (and broken IGMPv2 implementations sort of
+			 * Cisco IOS <= 11.2(8)) do not put router alert
+			 * option to IGMP packets destined to routable
+			 * groups. It is very bad, because it means
+			 * that we can forward NO IGMP messages.
+			 */
+			struct sock *mroute_sk;
+
+			mroute_sk = rcu_dereference(mrt->mroute_sk);
+			if (mroute_sk) {
+				nf_reset(skb);
+				raw_rcv(mroute_sk, skb);
+				return 0;
+			}
+		    }
+	}
+
+	/* already under rcu_read_lock() */
+	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+
+	/*
+	 *	No usable cache entry
+	 */
+	if (cache == NULL) {
+		int vif;
+
+		if (local) {
+			/* Deliver the original locally and continue with a
+			 * clone; the clone is taken before delivery because
+			 * skb must not be touched afterwards.
+			 */
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+			ip_local_deliver(skb);
+			if (skb2 == NULL)
+				return -ENOBUFS;
+			skb = skb2;
+		}
+
+		read_lock(&mrt_lock);
+		vif = ipmr_find_vif(mrt, skb->dev);
+		if (vif >= 0) {
+			/* Queue the packet until the entry gets resolved. */
+			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
+			read_unlock(&mrt_lock);
+
+			return err2;
+		}
+		read_unlock(&mrt_lock);
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	read_lock(&mrt_lock);
+	ip_mr_forward(net, mrt, skb, cache, local);
+	read_unlock(&mrt_lock);
+
+	/* With local set, ip_mr_forward() left skb untouched for us. */
+	if (local)
+		return ip_local_deliver(skb);
+
+	return 0;
+
+dont_forward:
+	if (local)
+		return ip_local_deliver(skb);
+	kfree_skb(skb);
+	return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM
+/*
+ * Common part of PIM register reception: strip the outer headers and
+ * re-inject the encapsulated packet as if received on the register vif.
+ *
+ * @pimlen is the size of the PIM header that precedes the inner IP header.
+ * Returns 1 when the packet is rejected (both callers then free the skb)
+ * and NET_RX_SUCCESS after the skb has been passed to netif_rx().
+ *
+ * called with rcu_read_lock()
+ */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+		     unsigned int pimlen)
+{
+	struct net_device *reg_dev = NULL;
+	struct iphdr *encap;
+
+	/* The inner IP header starts right after the PIM header. */
+	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+	/*
+	 * Check that:
+	 * a. packet is really sent to a multicast group
+	 * b. packet is not a NULL-REGISTER
+	 * c. packet is not truncated
+	 */
+	if (!ipv4_is_multicast(encap->daddr) ||
+	    encap->tot_len == 0 ||
+	    ntohs(encap->tot_len) + pimlen > skb->len)
+		return 1;
+
+	read_lock(&mrt_lock);
+	if (mrt->mroute_reg_vif_num >= 0)
+		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
+	read_unlock(&mrt_lock);
+
+	if (reg_dev == NULL)
+		return 1;
+
+	/* Drop everything in front of the inner IP header and make it the
+	 * new network header.
+	 */
+	skb->mac_header = skb->network_header;
+	skb_pull(skb, (u8 *)encap - skb->data);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	skb->ip_summed = CHECKSUM_NONE;
+	skb->pkt_type = PACKET_HOST;
+
+	skb_tunnel_rx(skb, reg_dev);
+
+	netif_rx(skb);
+
+	return NET_RX_SUCCESS;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V1
+/*
+ * Handle IGMP messages of PIMv1
+ *
+ * Only PIMv1 REGISTER messages are accepted, and only when PIM is enabled
+ * on the table selected for the packet; everything else is dropped.
+ * Always returns 0.
+ */
+
+int pim_rcv_v1(struct sk_buff *skb)
+{
+	struct igmphdr *pim;
+	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt;
+
+	/* Need the PIM header plus the encapsulated IP header. */
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+		goto drop;
+
+	pim = igmp_hdr(skb);
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (IS_ERR(mrt))
+		goto drop;
+	if (!mrt->mroute_do_pim ||
+	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
+		goto drop;
+
+	/* The drop label sits inside the if-body so that the early exits
+	 * above and a rejection by __pim_rcv() share the same kfree_skb().
+	 */
+	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+		kfree_skb(skb);
+	}
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+/*
+ * Receive handler installed through pim_protocol.  Accepts only non-NULL
+ * PIM REGISTER messages with a valid checksum and passes them to
+ * __pim_rcv(); everything else is dropped.  Always returns 0.
+ */
+static int pim_rcv(struct sk_buff *skb)
+{
+	struct pimreghdr *pim;
+	struct net *net = dev_net(skb->dev);
+	struct mr_table *mrt;
+
+	/* Need the PIM header plus the encapsulated IP header. */
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
+		goto drop;
+
+	pim = (struct pimreghdr *)skb_transport_header(skb);
+	/* The checksum is accepted if it is valid either over the PIM
+	 * header alone or over the whole packet.
+	 */
+	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+	    (pim->flags & PIM_NULL_REGISTER) ||
+	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+		goto drop;
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (IS_ERR(mrt))
+		goto drop;
+	/* The drop label sits inside the if-body so that the early exits
+	 * above and a rejection by __pim_rcv() share the same kfree_skb().
+	 */
+	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
+drop:
+		kfree_skb(skb);
+	}
+	return 0;
+}
+#endif
+
+/*
+ * Append the routing attributes describing cache entry @c to @skb: the
+ * incoming interface (RTA_IIF) and an RTA_MULTIPATH attribute holding one
+ * rtnexthop per usable outgoing vif.  Also sets rtm->rtm_type.
+ *
+ * Returns 1 on success, -ENOENT for an unresolved entry and -EMSGSIZE when
+ * @skb runs out of room (the skb is then trimmed back to its length on
+ * entry).
+ */
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+			      struct mfc_cache *c, struct rtmsg *rtm)
+{
+	int ct;
+	struct rtnexthop *nhp;
+	u8 *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct rtattr *mp_head;
+
+	/* If cache is unresolved, don't try to parse IIF and OIF */
+	if (c->mfc_parent >= MAXVIFS)
+		return -ENOENT;
+
+	/* NOTE(review): RTA_PUT presumably jumps to rtattr_failure when the
+	 * attribute does not fit - confirm against the macro definition.
+	 */
+	if (VIF_EXISTS(mrt, c->mfc_parent))
+		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
+
+	/* Reserve the RTA_MULTIPATH header; type and length are filled in
+	 * once all next hops have been appended.
+	 */
+	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		/* A TTL threshold of 255 marks a vif that is not an output. */
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+				goto rtattr_failure;
+			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+			nhp->rtnh_flags = 0;
+			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
+			nhp->rtnh_len = sizeof(*nhp);
+		}
+	}
+	mp_head->rta_type = RTA_MULTIPATH;
+	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
+	rtm->rtm_type = RTN_MULTICAST;
+	return 1;
+
+rtattr_failure:
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+/*
+ * Fill @skb with the multicast route for (@saddr, @daddr) from the default
+ * table.
+ *
+ * If no cache entry exists and @nowait is zero, a clone of @skb carrying a
+ * minimal pseudo IP header is queued as an unresolved entry instead.
+ *
+ * Returns the result of __ipmr_fill_mroute() when an entry exists;
+ * otherwise -EAGAIN (@nowait set), -ENODEV (skb's device is not a vif),
+ * -ENOMEM (clone failed) or the result of ipmr_cache_unresolved().
+ * -ENOENT is returned when the default table is missing.
+ */
+int ipmr_get_route(struct net *net, struct sk_buff *skb,
+		   __be32 saddr, __be32 daddr,
+		   struct rtmsg *rtm, int nowait)
+{
+	struct mfc_cache *cache;
+	struct mr_table *mrt;
+	int err;
+
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	rcu_read_lock();
+	cache = ipmr_cache_find(mrt, saddr, daddr);
+
+	if (cache == NULL) {
+		struct sk_buff *skb2;
+		struct iphdr *iph;
+		struct net_device *dev;
+		int vif = -1;
+
+		if (nowait) {
+			rcu_read_unlock();
+			return -EAGAIN;
+		}
+
+		dev = skb->dev;
+		read_lock(&mrt_lock);
+		if (dev)
+			vif = ipmr_find_vif(mrt, dev);
+		if (vif < 0) {
+			read_unlock(&mrt_lock);
+			rcu_read_unlock();
+			return -ENODEV;
+		}
+		skb2 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb2) {
+			read_unlock(&mrt_lock);
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+
+		/* Build a pseudo IP header on the clone.  Only ihl and the
+		 * addresses are set; version is written as 0 here.
+		 */
+		skb_push(skb2, sizeof(struct iphdr));
+		skb_reset_network_header(skb2);
+		iph = ip_hdr(skb2);
+		iph->ihl = sizeof(struct iphdr) >> 2;
+		iph->saddr = saddr;
+		iph->daddr = daddr;
+		iph->version = 0;
+		err = ipmr_cache_unresolved(mrt, vif, skb2);
+		read_unlock(&mrt_lock);
+		rcu_read_unlock();
+		return err;
+	}
+
+	read_lock(&mrt_lock);
+	if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
+		cache->mfc_flags |= MFC_NOTIFY;
+	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
+	read_unlock(&mrt_lock);
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * Emit one RTM_NEWROUTE netlink message describing cache entry @c of table
+ * @mrt into @skb (used by the dump callback below).
+ *
+ * Returns the value of nlmsg_end() on success.  Returns -EMSGSIZE when the
+ * message does not fit or when __ipmr_fill_mroute() fails; a partially
+ * built message is cancelled first.
+ */
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+			    u32 pid, u32 seq, struct mfc_cache *c)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+
+	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	/* NOTE(review): the NLA_PUT_* macros presumably jump to
+	 * nla_put_failure when the attribute does not fit - confirm against
+	 * their definition.
+	 */
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family   = RTNL_FAMILY_IPMR;
+	rtm->rtm_dst_len  = 32;
+	rtm->rtm_src_len  = 32;
+	rtm->rtm_tos      = 0;
+	rtm->rtm_table    = mrt->id;
+	NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+	rtm->rtm_type     = RTN_MULTICAST;
+	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
+	rtm->rtm_protocol = RTPROT_UNSPEC;
+	rtm->rtm_flags    = 0;
+
+	NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
+	NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
+
+	/* Any failure here, including -ENOENT, is reported as -EMSGSIZE. */
+	if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback (registered for RTM_GETROUTE in ip_mr_init()):
+ * walks every table, every hash line and every resolved cache entry and
+ * emits one message per entry.
+ *
+ * The position is kept across invocations in cb->args[]:
+ *   args[0] - table index, args[1] - hash line, args[2] - entry in line.
+ * Returns skb->len.
+ */
+static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct mr_table *mrt;
+	struct mfc_cache *mfc;
+	unsigned int t = 0, s_t;
+	unsigned int h = 0, s_h;
+	unsigned int e = 0, s_e;
+
+	/* Resume point saved by the previous invocation. */
+	s_t = cb->args[0];
+	s_h = cb->args[1];
+	s_e = cb->args[2];
+
+	rcu_read_lock();
+	ipmr_for_each_table(mrt, net) {
+		if (t < s_t)
+			goto next_table;
+		/* Past the resume table: start from the first hash line. */
+		if (t > s_t)
+			s_h = 0;
+		for (h = s_h; h < MFC_LINES; h++) {
+			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
+				if (e < s_e)
+					goto next_entry;
+				/* skb full: stop, t/h/e mark where to resume. */
+				if (ipmr_fill_mroute(mrt, skb,
+						     NETLINK_CB(cb->skb).pid,
+						     cb->nlh->nlmsg_seq,
+						     mfc) < 0)
+					goto done;
+next_entry:
+				e++;
+			}
+			/* The entry offset only applies to the resume line. */
+			e = s_e = 0;
+		}
+		s_h = 0;
+next_table:
+		t++;
+	}
+done:
+	rcu_read_unlock();
+
+	cb->args[2] = e;
+	cb->args[1] = h;
+	cb->args[0] = t;
+
+	return skb->len;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ *	The /proc interfaces to multicast routing :
+ *	/proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+ */
+/* Per-open iterator state for the ip_mr_vif seq_file. */
+struct ipmr_vif_iter {
+	struct seq_net_private p;
+	struct mr_table *mrt;	/* table being listed, set in ->start() */
+	int ct;			/* index of the current vif in mrt->vif_table */
+};
+
+/*
+ * Return the @pos-th existing vif (counting from 0) of the iterator's
+ * table, leaving iter->ct at its index; NULL when there are fewer vifs.
+ */
+static struct vif_device *ipmr_vif_seq_idx(struct net *net,
+					   struct ipmr_vif_iter *iter,
+					   loff_t pos)
+{
+	struct mr_table *mrt = iter->mrt;
+
+	iter->ct = 0;
+	while (iter->ct < mrt->maxvif) {
+		/* Only slots holding a vif count towards the position. */
+		if (VIF_EXISTS(mrt, iter->ct) && pos-- == 0)
+			return &mrt->vif_table[iter->ct];
+		++iter->ct;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file ->start(): select the default table, take mrt_lock for reading
+ * and position the iterator.  Position 0 is the header token; position n
+ * maps to the (n-1)-th existing vif.  The lock is released in
+ * ipmr_vif_seq_stop().
+ *
+ * NOTE(review): the -ENOENT return happens before read_lock(); if the
+ * seq_file core calls ->stop() after a failed ->start(), the unlock in
+ * ipmr_vif_seq_stop() would be unbalanced - confirm.
+ */
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(mrt_lock)
+{
+	struct ipmr_vif_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return ERR_PTR(-ENOENT);
+
+	iter->mrt = mrt;
+
+	read_lock(&mrt_lock);
+	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
+		: SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next(): advance *pos and return the next existing vif after
+ * the current one (or the first one after the header token); NULL at the
+ * end of the table.
+ */
+static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ipmr_vif_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = iter->mrt;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ipmr_vif_seq_idx(net, iter, 0);
+
+	for (++iter->ct; iter->ct < mrt->maxvif; ++iter->ct) {
+		if (VIF_EXISTS(mrt, iter->ct))
+			return &mrt->vif_table[iter->ct];
+	}
+	return NULL;
+}
+
+/* seq_file ->stop(): drop the read lock taken in ipmr_vif_seq_start(). */
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+	__releases(mrt_lock)
+{
+	read_unlock(&mrt_lock);
+}
+
+/*
+ * seq_file ->show(): print the column header for the start token, or one
+ * line with index, device name, counters, flags and addresses for a vif.
+ */
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+{
+	struct ipmr_vif_iter *iter = seq->private;
+	struct mr_table *mrt = iter->mrt;
+	const struct vif_device *vif = v;
+	const char *name;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
+		return 0;
+	}
+
+	name = vif->dev ? vif->dev->name : "none";
+	seq_printf(seq,
+		   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
+		   vif - mrt->vif_table,
+		   name, vif->bytes_in, vif->pkt_in,
+		   vif->bytes_out, vif->pkt_out,
+		   vif->flags, vif->local, vif->remote);
+	return 0;
+}
+
+/* Iterator callbacks for the ip_mr_vif seq_file. */
+static const struct seq_operations ipmr_vif_seq_ops = {
+	.start = ipmr_vif_seq_start,
+	.next  = ipmr_vif_seq_next,
+	.stop  = ipmr_vif_seq_stop,
+	.show  = ipmr_vif_seq_show,
+};
+
+/* open() handler: set up a seq_file with a struct ipmr_vif_iter as private data. */
+static int ipmr_vif_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
+			    sizeof(struct ipmr_vif_iter));
+}
+
+/* File operations for the "ip_mr_vif" proc entry created in ipmr_net_init(). */
+static const struct file_operations ipmr_vif_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ipmr_vif_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/* Per-open iterator state for the ip_mr_cache seq_file. */
+struct ipmr_mfc_iter {
+	struct seq_net_private p;
+	struct mr_table *mrt;		/* table being listed */
+	/*
+	 * List currently being walked: one line of mrt->mfc_cache_array,
+	 * mrt->mfc_unres_queue, or NULL when no list (and no lock) is held.
+	 */
+	struct list_head *cache;
+	int ct;				/* index of the current hash line */
+};
+
+
+/*
+ * Return the @pos-th cache entry, counting the resolved hash lines first
+ * and the unresolved queue afterwards.
+ *
+ * Locking: on a hit in the resolved cache the function returns with
+ * rcu_read_lock() held; on a hit in the unresolved queue it returns with
+ * mfc_unres_lock held.  it->cache records which list was hit so that
+ * ipmr_mfc_seq_stop() can release the matching lock.  When nothing is
+ * found, no lock is held and it->cache is NULL.
+ */
+static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
+					  struct ipmr_mfc_iter *it, loff_t pos)
+{
+	struct mr_table *mrt = it->mrt;
+	struct mfc_cache *mfc;
+
+	rcu_read_lock();
+	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
+		it->cache = &mrt->mfc_cache_array[it->ct];
+		list_for_each_entry_rcu(mfc, it->cache, list)
+			if (pos-- == 0)
+				return mfc;
+	}
+	rcu_read_unlock();
+
+	spin_lock_bh(&mfc_unres_lock);
+	it->cache = &mrt->mfc_unres_queue;
+	list_for_each_entry(mfc, it->cache, list)
+		if (pos-- == 0)
+			return mfc;
+	spin_unlock_bh(&mfc_unres_lock);
+
+	it->cache = NULL;
+	return NULL;
+}
+
+
+/*
+ * seq_file ->start(): select the default table, reset the iterator and
+ * position it.  Position 0 is the header token (no lock taken); position n
+ * maps to the (n-1)-th cache entry, see ipmr_mfc_seq_idx() for the locks
+ * held on return.
+ */
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ipmr_mfc_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt;
+
+	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+	if (mrt == NULL)
+		return ERR_PTR(-ENOENT);
+
+	it->mrt = mrt;
+	it->cache = NULL;
+	it->ct = 0;
+	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
+		: SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next(): advance *pos and return the entry following @v.
+ *
+ * Walks the rest of the current list, then the remaining non-empty hash
+ * lines, and finally switches from the resolved cache (RCU) to the
+ * unresolved queue (mfc_unres_lock).  Returns NULL, with no lock held and
+ * it->cache cleared, once the unresolved queue is exhausted.
+ */
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct mfc_cache *mfc = v;
+	struct ipmr_mfc_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = it->mrt;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return ipmr_mfc_seq_idx(net, seq->private, 0);
+
+	/* More entries on the list we are currently walking? */
+	if (mfc->list.next != it->cache)
+		return list_entry(mfc->list.next, struct mfc_cache, list);
+
+	if (it->cache == &mrt->mfc_unres_queue)
+		goto end_of_list;
+
+	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
+
+	/* Move on to the next non-empty hash line. */
+	while (++it->ct < MFC_LINES) {
+		it->cache = &mrt->mfc_cache_array[it->ct];
+		if (list_empty(it->cache))
+			continue;
+		return list_first_entry(it->cache, struct mfc_cache, list);
+	}
+
+	/* exhausted cache_array, show unresolved */
+	rcu_read_unlock();
+	it->cache = &mrt->mfc_unres_queue;
+	it->ct = 0;
+
+	spin_lock_bh(&mfc_unres_lock);
+	if (!list_empty(it->cache))
+		return list_first_entry(it->cache, struct mfc_cache, list);
+
+end_of_list:
+	spin_unlock_bh(&mfc_unres_lock);
+	it->cache = NULL;
+
+	return NULL;
+}
+
+/*
+ * seq_file ->stop(): release whichever lock the iterator still holds, as
+ * indicated by it->cache (unresolved queue: mfc_unres_lock; current hash
+ * line: RCU read lock; anything else: nothing to release).
+ */
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+	struct ipmr_mfc_iter *it = seq->private;
+	struct mr_table *mrt = it->mrt;
+
+	if (it->cache == &mrt->mfc_unres_queue)
+		spin_unlock_bh(&mfc_unres_lock);
+	else if (it->cache == &mrt->mfc_cache_array[it->ct])
+		rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show(): print the column header for the start token, or one
+ * line per cache entry: group, origin, parent vif, counters and the list
+ * of "vif:ttl" outputs.  Unresolved entries are shown with zero counters
+ * and no outputs.
+ */
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+	int n;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
+	} else {
+		const struct mfc_cache *mfc = v;
+		const struct ipmr_mfc_iter *it = seq->private;
+		const struct mr_table *mrt = it->mrt;
+
+		seq_printf(seq, "%08X %08X %-3hd",
+			   (__force u32) mfc->mfc_mcastgrp,
+			   (__force u32) mfc->mfc_origin,
+			   mfc->mfc_parent);
+
+		/* it->cache tells which list the entry came from. */
+		if (it->cache != &mrt->mfc_unres_queue) {
+			seq_printf(seq, " %8lu %8lu %8lu",
+				   mfc->mfc_un.res.pkt,
+				   mfc->mfc_un.res.bytes,
+				   mfc->mfc_un.res.wrong_if);
+			for (n = mfc->mfc_un.res.minvif;
+			     n < mfc->mfc_un.res.maxvif; n++) {
+				if (VIF_EXISTS(mrt, n) &&
+				    mfc->mfc_un.res.ttls[n] < 255)
+					seq_printf(seq,
+					   " %2d:%-3d",
+					   n, mfc->mfc_un.res.ttls[n]);
+			}
+		} else {
+			/* unresolved mfc_caches don't contain
+			 * pkt, bytes and wrong_if values
+			 */
+			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+		}
+		seq_putc(seq, '\n');
+	}
+	return 0;
+}
+
+/* Iterator callbacks for the ip_mr_cache seq_file. */
+static const struct seq_operations ipmr_mfc_seq_ops = {
+	.start = ipmr_mfc_seq_start,
+	.next  = ipmr_mfc_seq_next,
+	.stop  = ipmr_mfc_seq_stop,
+	.show  = ipmr_mfc_seq_show,
+};
+
+/* open() handler: set up a seq_file with a struct ipmr_mfc_iter as private data. */
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
+			    sizeof(struct ipmr_mfc_iter));
+}
+
+/* File operations for the "ip_mr_cache" proc entry created in ipmr_net_init(). */
+static const struct file_operations ipmr_mfc_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ipmr_mfc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+/* Protocol handler registered for IPPROTO_PIM in ip_mr_init(). */
+static const struct net_protocol pim_protocol = {
+	.handler	=	pim_rcv,
+	.netns_ok	=	1,
+};
+#endif
+
+
+/*
+ *	Setup for IP multicast routing
+ *
+ *	Per-namespace initialisation: set up the routing rules/tables and,
+ *	with CONFIG_PROC_FS, create the "ip_mr_vif" and "ip_mr_cache" proc
+ *	entries.  Returns 0 on success or a negative errno; on failure
+ *	everything set up so far is undone.
+ */
+static int __net_init ipmr_net_init(struct net *net)
+{
+	int err;
+
+	err = ipmr_rules_init(net);
+	if (err < 0)
+		goto fail;
+
+#ifdef CONFIG_PROC_FS
+	/* Both proc creation failures are reported as -ENOMEM. */
+	err = -ENOMEM;
+	if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
+		goto proc_vif_fail;
+	if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
+		goto proc_cache_fail;
+#endif
+	return 0;
+
+	/* Unwind in reverse order of setup. */
+#ifdef CONFIG_PROC_FS
+proc_cache_fail:
+	proc_net_remove(net, "ip_mr_vif");
+proc_vif_fail:
+	ipmr_rules_exit(net);
+#endif
+fail:
+	return err;
+}
+
+/*
+ * Per-namespace teardown: remove the proc entries (if built in) and then
+ * release the routing rules/tables, mirroring ipmr_net_init() in reverse.
+ */
+static void __net_exit ipmr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	proc_net_remove(net, "ip_mr_cache");
+	proc_net_remove(net, "ip_mr_vif");
+#endif
+	ipmr_rules_exit(net);
+}
+
+/* Per-namespace init/exit hooks, registered from ip_mr_init(). */
+static struct pernet_operations ipmr_net_ops = {
+	.init = ipmr_net_init,
+	.exit = ipmr_net_exit,
+};
+
+/*
+ * Subsystem initialisation: create the mfc_cache slab, register the
+ * per-namespace operations and the netdevice notifier, add the PIM
+ * protocol handler (CONFIG_IP_PIMSM_V2) and register the netlink dump
+ * callback.  Returns 0 on success or a negative errno after undoing the
+ * steps already completed.
+ */
+int __init ip_mr_init(void)
+{
+	int err;
+
+	/* NOTE(review): SLAB_PANIC is passed, so the NULL check below is
+	 * presumably never taken - confirm against the slab API.
+	 */
+	mrt_cachep = kmem_cache_create("ip_mrt_cache",
+				       sizeof(struct mfc_cache),
+				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+				       NULL);
+	if (!mrt_cachep)
+		return -ENOMEM;
+
+	err = register_pernet_subsys(&ipmr_net_ops);
+	if (err)
+		goto reg_pernet_fail;
+
+	err = register_netdevice_notifier(&ip_mr_notifier);
+	if (err)
+		goto reg_notif_fail;
+#ifdef CONFIG_IP_PIMSM_V2
+	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
+		printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
+		err = -EAGAIN;
+		goto add_proto_fail;
+	}
+#endif
+	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
+	return 0;
+
+	/* Error unwinding, in reverse order of registration. */
+#ifdef CONFIG_IP_PIMSM_V2
+add_proto_fail:
+	unregister_netdevice_notifier(&ip_mr_notifier);
+#endif
+reg_notif_fail:
+	unregister_pernet_subsys(&ipmr_net_ops);
+reg_pernet_fail:
+	kmem_cache_destroy(mrt_cachep);
+	return err;
+}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 00000000..929b27bd
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,247 @@
+/* IPv4 specific functions of netfilter core */
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_queue.h>
+
+/* Re-route skb from its current IP header; 0 on success, -1 on failure.  Used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	const struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt;
+	struct flowi4 fl4 = {};
+	__be32 saddr = iph->saddr;
+	__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+	unsigned int hh_len;
+
+	if (addr_type == RTN_UNSPEC)
+		addr_type = inet_addr_type(net, saddr);	/* caller did not classify the source */
+	if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
+		flags |= FLOWI_FLAG_ANYSRC;
+	else
+		saddr = 0;	/* other address types: leave the source out of the lookup key */
+
+	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+	 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
+	 */
+	fl4.daddr = iph->daddr;
+	fl4.saddr = saddr;
+	fl4.flowi4_tos = RT_TOS(iph->tos);
+	fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_flags = flags;
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return -1;
+
+	/* Drop old route. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	if (skb_dst(skb)->error)
+		return -1;
+
+#ifdef CONFIG_XFRM
+	if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
+		struct dst_entry *dst = skb_dst(skb);
+		skb_dst_set(skb, NULL);	/* detach first; the lookup result is attached below */
+		dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
+		if (IS_ERR(dst))
+			return -1;
+		skb_dst_set(skb, dst);
+	}
+#endif
+
+	/* Change in oif may mean change in hh_len. */
+	hh_len = skb_dst(skb)->dev->hard_header_len;
+	if (skb_headroom(skb) < hh_len &&
+	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+		return -1;
+
+	return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+#ifdef CONFIG_XFRM
+int ip_xfrm_me_harder(struct sk_buff *skb)
+{	/* Redo the xfrm lookup for skb unless it is already transformed; 0 on success, -1 on failure. */
+	struct flowi fl;
+	unsigned int hh_len;
+	struct dst_entry *dst;
+
+	if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
+		return 0;
+	if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
+		return -1;
+
+	dst = skb_dst(skb);
+	if (dst->xfrm)
+		dst = ((struct xfrm_dst *)dst)->route;	/* use the route underneath the xfrm dst */
+	dst_hold(dst);	/* NOTE(review): reference presumably consumed by xfrm_lookup() - confirm */
+
+	dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
+	if (IS_ERR(dst))
+		return -1;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	/* Change in oif may mean change in hh_len. */
+	hh_len = skb_dst(skb)->dev->hard_header_len;
+	if (skb_headroom(skb) < hh_len &&
+	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(ip_xfrm_me_harder);
+#endif
+
+void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);	/* exported hook pointer; not assigned in this file */
+EXPORT_SYMBOL(ip_nat_decode_session);
+
+/*
+ * Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {	/* routing key saved per queued packet (see nf_ip_saveroute) */
+	__be32 daddr;
+	__be32 saddr;
+	u_int8_t tos;
+	u_int32_t mark;	/* copy of skb->mark */
+};
+
+static void nf_ip_saveroute(const struct sk_buff *skb,
+			    struct nf_queue_entry *entry)
+{
+	struct ip_rt_info *saved = nf_queue_entry_reroute(entry);	/* per-entry key storage */
+	const struct iphdr *hdr;
+
+	if (entry->hook != NF_INET_LOCAL_OUT)
+		return;	/* the key is only recorded for the LOCAL_OUT hook */
+	hdr = ip_hdr(skb);
+	saved->tos = hdr->tos;
+	saved->daddr = hdr->daddr;
+	saved->saddr = hdr->saddr;
+	saved->mark = skb->mark;
+}
+
+static int nf_ip_reroute(struct sk_buff *skb,
+			 const struct nf_queue_entry *entry)
+{
+	const struct ip_rt_info *saved = nf_queue_entry_reroute(entry);
+	const struct iphdr *hdr;
+
+	/* Only LOCAL_OUT packets had their routing key saved. */
+	if (entry->hook != NF_INET_LOCAL_OUT)
+		return 0;
+
+	hdr = ip_hdr(skb);
+	if (hdr->tos != saved->tos || skb->mark != saved->mark ||
+	    hdr->daddr != saved->daddr || hdr->saddr != saved->saddr)
+		return ip_route_me_harder(skb, RTN_UNSPEC);	/* key changed: route again */
+	return 0;
+}
+
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+			    unsigned int dataoff, u_int8_t protocol)
+{	/* Returns the result of __skb_checksum_complete(), or 0 when no software check is run. */
+	const struct iphdr *iph = ip_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+			break;	/* skb->csum is only consulted on the two input hooks */
+		if ((protocol == 0 && !csum_fold(skb->csum)) ||
+		    !csum_tcpudp_magic(iph->saddr, iph->daddr,
+				       skb->len - dataoff, protocol,
+				       skb->csum)) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;	/* existing sum verified; skip later checks */
+			break;
+		}
+		/* fall through */
+	case CHECKSUM_NONE:
+		if (protocol == 0)
+			skb->csum = 0;	/* protocol 0: no pseudo-header is added */
+		else
+			skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+						       skb->len - dataoff,
+						       protocol, 0);
+		csum = __skb_checksum_complete(skb);
+	}
+	return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+
+/* Checksum only the first @len bytes from @dataoff; returns 0 if skb->ip_summed is neither COMPLETE nor NONE. */
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+				      unsigned int dataoff, unsigned int len,
+				      u_int8_t protocol)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (len == skb->len - dataoff)	/* coverage is the whole payload */
+			return nf_ip_checksum(skb, hook, dataoff, protocol);
+		/* fall through */
+	case CHECKSUM_NONE:	/* seed with the pseudo-header: (saddr, daddr, len, proto, sum) */
+		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+					       skb->len - dataoff, protocol, 0);
+		skb->ip_summed = CHECKSUM_NONE;
+		return __skb_checksum_complete_head(skb, dataoff + len);
+	}
+	return 0;
+}
+
+static int nf_ip_route(struct net *net, struct dst_entry **dst,
+		       struct flowi *fl, bool strict __always_unused)
+{
+	struct rtable *route = ip_route_output_key(net, &fl->u.ip4);
+
+	if (!IS_ERR(route))
+		*dst = &route->dst;	/* hand the route's dst back to the caller */
+	return IS_ERR(route) ? PTR_ERR(route) : 0;
+}
+
+static const struct nf_afinfo nf_ip_afinfo = {	/* IPv4 callbacks registered by ipv4_netfilter_init() */
+	.family			= AF_INET,
+	.checksum		= nf_ip_checksum,
+	.checksum_partial	= nf_ip_checksum_partial,
+	.route			= nf_ip_route,
+	.saveroute		= nf_ip_saveroute,
+	.reroute		= nf_ip_reroute,
+	.route_key_size		= sizeof(struct ip_rt_info),	/* size of the key written by nf_ip_saveroute() */
+};
+
+static int ipv4_netfilter_init(void)
+{	/* module_init hook: registers the IPv4 afinfo and returns that call's result */
+	return nf_register_afinfo(&nf_ip_afinfo);
+}
+
+static void ipv4_netfilter_fini(void)
+{	/* module_exit hook: unregisters what ipv4_netfilter_init() registered */
+	nf_unregister_afinfo(&nf_ip_afinfo);
+}
+
+module_init(ipv4_netfilter_init);
+module_exit(ipv4_netfilter_fini);
+
+#ifdef CONFIG_SYSCTL
+struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {	/* path components: net, ipv4, netfilter */
+	{ .procname = "net", },
+	{ .procname = "ipv4", },
+	{ .procname = "netfilter", },
+	{ }	/* empty terminating entry */
+};
+EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
+#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 00000000..73b4e91a
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,395 @@
+#
+# IP netfilter configuration
+#
+
+menu "IP: Netfilter Configuration"
+	depends on INET && NETFILTER
+
+config NF_DEFRAG_IPV4
+	tristate
+	default n
+
+config NF_CONNTRACK_IPV4
+	tristate "IPv4 connection tracking support (required for NAT)"
+	depends on NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
+	select NF_DEFRAG_IPV4
+	---help---
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections.
+
+	  This is IPv4 support on Layer 3 independent connection tracking.
+	  Layer 3 independent connection tracking is an experimental scheme
+	  which generalizes ip_conntrack to support other layer 3 protocols.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_PROC_COMPAT
+	bool "proc/sysctl compatibility with old connection tracking"
+	depends on NF_CONNTRACK_IPV4
+	default y
+	help
+	  This option enables /proc and sysctl compatibility with the old
+	  layer 3 dependent connection tracking. This is needed to keep
+	  old programs that have not been adapted to the new names working.
+
+	  If unsure, say Y.
+
+config IP_NF_QUEUE
+	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
+	depends on NETFILTER_ADVANCED
+	help
+	  Netfilter has the ability to queue packets to user space: the
+	  netlink device can be used to access them using this driver.
+
+	  This option enables the old IPv4-only "ip_queue" implementation
+	  which has been obsoleted by the new "nfnetlink_queue" code (see
+	  CONFIG_NETFILTER_NETLINK_QUEUE).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_IPTABLES
+	tristate "IP tables support (required for filtering/masq/NAT)"
+	default m if NETFILTER_ADVANCED=n
+	select NETFILTER_XTABLES
+	help
+	  iptables is a general, extensible packet identification framework.
+	  The packet filtering and full NAT (masquerading, port forwarding,
+	  etc) subsystems now use this: say `Y' or `M' here if you want to use
+	  either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP_NF_IPTABLES
+
+# The matches.
+config IP_NF_MATCH_AH
+	tristate '"ah" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This match extension allows you to match a range of SPIs
+	  inside AH header of IPSec packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_ECN
+	tristate '"ecn" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `ECN' match, which allows you to match against
+	  the IPv4 and TCP header ECN fields.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_TTL
+	tristate '"ttl" match support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_MATCH_HL
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_MATCH_HL.
+
+# `filter', generic and specific targets
+config IP_NF_FILTER
+	tristate "Packet filtering"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Packet filtering defines a table `filter', which has a series of
+	  rules for simple packet filtering at local input, forwarding and
+	  local output.  See the man page for iptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_REJECT
+	tristate "REJECT target support"
+	depends on IP_NF_FILTER
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The REJECT target allows a filtering rule to specify that an ICMP
+	  error should be issued in response to an incoming packet, rather
+	  than silently being dropped.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_REJECT_SKERR
+	bool "Force socket error when rejecting with icmp*"
+	depends on IP_NF_TARGET_REJECT
+	default n
+	help
+          This option enables turning a "--reject-with icmp*" into a matching
+          socket error also.
+	  The REJECT target normally allows sending an ICMP message. But it
+          leaves the local socket unaware of any ingress rejects.
+
+	  If unsure, say N.
+
+config IP_NF_TARGET_LOG
+	tristate "LOG target support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option adds a `LOG' target, which allows you to create rules in
+	  any iptables table which records the packet header to the syslog.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_ULOG
+	tristate "ULOG target support"
+	default m if NETFILTER_ADVANCED=n
+	---help---
+
+	  This option enables the old IPv4-only "ipt_ULOG" implementation
+	  which has been obsoleted by the new "nfnetlink_log" code (see
+	  CONFIG_NETFILTER_NETLINK_LOG).
+
+	  This option adds a `ULOG' target, which allows you to create rules in
+	  any iptables table. The packet is passed to a userspace logging
+	  daemon using netlink multicast sockets; unlike the LOG target
+	  which can only be viewed through syslog.
+
+	  The appropriate userspace logging daemon (ulogd) may be obtained from
+	  <http://www.netfilter.org/projects/ulogd/index.html>
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# NAT + specific targets: nf_conntrack
+config NF_NAT
+	tristate "Full NAT"
+	depends on NF_CONNTRACK_IPV4
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The Full NAT option allows masquerading, port forwarding and other
+	  forms of full Network Address Port Translation.  It is controlled by
+	  the `nat' table in iptables: see the man page for iptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_NAT_NEEDED
+	bool
+	depends on NF_NAT
+	default y
+
+config IP_NF_TARGET_MASQUERADE
+	tristate "MASQUERADE target support"
+	depends on NF_NAT
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Masquerading is a special case of NAT: all outgoing connections are
+	  changed to seem to come from a particular interface's address, and
+	  if the interface goes down, those connections are lost.  This is
+	  only useful for dialup accounts with dynamic IP address (ie. your IP
+	  address will be different on next dialup).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_NETMAP
+	tristate "NETMAP target support"
+	depends on NF_NAT
+	depends on NETFILTER_ADVANCED
+	help
+	  NETMAP is an implementation of static 1:1 NAT mapping of network
+	  addresses. It maps the network address part, while keeping the host
+	  address part intact.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_REDIRECT
+	tristate "REDIRECT target support"
+	depends on NF_NAT
+	depends on NETFILTER_ADVANCED
+	help
+	  REDIRECT is a special case of NAT: all incoming connections are
+	  mapped onto the incoming interface's address, causing the packets to
+	  come to the local machine instead of passing through.  This is
+	  useful for transparent proxies.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_NAT_SNMP_BASIC
+	tristate "Basic SNMP-ALG support"
+	depends on NF_CONNTRACK_SNMP && NF_NAT
+	depends on NETFILTER_ADVANCED
+	default NF_NAT && NF_CONNTRACK_SNMP
+	---help---
+
+	  This module implements an Application Layer Gateway (ALG) for
+	  SNMP payloads.  In conjunction with NAT, it allows a network
+	  management system to access multiple private networks with
+	  conflicting addresses.  It works by modifying IP addresses
+	  inside SNMP payloads to match IP-layer NAT mapping.
+
+	  This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
+# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker.
+# From kconfig-language.txt:
+#
+#           <expr> '&&' <expr>                   (6)
+#
+# (6) Returns the result of min(/expr/, /expr/).
+config NF_NAT_PROTO_DCCP
+	tristate
+	depends on NF_NAT && NF_CT_PROTO_DCCP
+	default NF_NAT && NF_CT_PROTO_DCCP
+
+config NF_NAT_PROTO_GRE
+	tristate
+	depends on NF_NAT && NF_CT_PROTO_GRE
+
+config NF_NAT_PROTO_UDPLITE
+	tristate
+	depends on NF_NAT && NF_CT_PROTO_UDPLITE
+	default NF_NAT && NF_CT_PROTO_UDPLITE
+
+config NF_NAT_PROTO_SCTP
+	tristate
+	default NF_NAT && NF_CT_PROTO_SCTP
+	depends on NF_NAT && NF_CT_PROTO_SCTP
+	select LIBCRC32C
+
+config NF_NAT_FTP
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_FTP
+
+config NF_NAT_IRC
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_IRC
+
+config NF_NAT_TFTP
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_TFTP
+
+config NF_NAT_AMANDA
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_AMANDA
+
+config NF_NAT_PPTP
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_PPTP
+	select NF_NAT_PROTO_GRE
+
+config NF_NAT_H323
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_H323
+
+config NF_NAT_SIP
+	tristate
+	depends on NF_CONNTRACK && NF_NAT
+	default NF_NAT && NF_CONNTRACK_SIP
+
+# mangle + specific targets
+config IP_NF_MANGLE
+	tristate "Packet mangling"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option adds a `mangle' table to iptables: see the man page for
+	  iptables(8).  This table is used for various packet alterations
+	  which can affect how the packet is routed.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_CLUSTERIP
+	tristate "CLUSTERIP target support (EXPERIMENTAL)"
+	depends on IP_NF_MANGLE && EXPERIMENTAL
+	depends on NF_CONNTRACK_IPV4
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_MARK
+	help
+	  The CLUSTERIP target allows you to build load-balancing clusters of
+	  network servers without having a dedicated load-balancing
+	  router/server/switch.
+	
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_ECN
+	tristate "ECN target support"
+	depends on IP_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `ECN' target, which can be used in the iptables mangle
+	  table.  
+
+	  You can use this target to remove the ECN bits from the IPv4 header of
+	  an IP packet.  This is particularly useful, if you need to work around
+	  existing ECN blackholes on the internet, but don't want to disable
+	  ECN support in general.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_TTL
+	tristate '"TTL" target support'
+	depends on NETFILTER_ADVANCED && IP_NF_MANGLE
+	select NETFILTER_XT_TARGET_HL
+	---help---
+	This is a backwards-compatible option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_HL.
+
+# raw + specific targets
+config IP_NF_RAW
+	tristate  'raw table support (required for NOTRACK/TRACE)'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `raw' table to iptables. This table is the very
+	  first in the netfilter framework and hooks in at the PREROUTING
+	  and OUTPUT chains.
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+# security table for MAC policy
+config IP_NF_SECURITY
+	tristate "Security table"
+	depends on SECURITY
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `security' table to iptables, for use
+	  with Mandatory Access Control (MAC) policy.
+	 
+	  If unsure, say N.
+
+endif # IP_NF_IPTABLES
+
+# ARP tables
+config IP_NF_ARPTABLES
+	tristate "ARP tables support"
+	select NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	help
+	  arptables is a general, extensible packet identification framework.
+	  The ARP packet filtering and mangling (manipulation) subsystems
+	  use this: say Y or M here if you want to use either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP_NF_ARPTABLES
+
+config IP_NF_ARPFILTER
+	tristate "ARP packet filtering"
+	help
+	  ARP packet filtering defines a table `filter', which has a series of
+	  rules for simple ARP packet filtering at local input and
+	  local output.  On a bridge, you can also specify filtering rules
+	  for forwarded ARP packets. See the man page for arptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+	tristate "ARP payload mangling"
+	help
+	  Allows altering the ARP packet payload: source and destination
+	  hardware and network addresses.
+
+endif # IP_NF_ARPTABLES
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 00000000..dca2082e
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,72 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv4-y	:=  nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
+ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
+ifeq ($(CONFIG_PROC_FS),y)
+nf_conntrack_ipv4-objs	+= nf_conntrack_l3proto_ipv4_compat.o
+endif
+endif
+
+nf_nat-y		:= nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
+iptable_nat-y	:= nf_nat_rule.o nf_nat_standalone.o
+
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
+
+obj-$(CONFIG_NF_NAT) += nf_nat.o
+
+# defrag
+obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+
+# NAT helpers (nf_conntrack)
+obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
+obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
+obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
+obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
+obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
+obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
+obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
+obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
+
+# NAT protocols (nf_nat)
+obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
+obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
+obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+
+# generic IP tables 
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the instances of ip_tables (filter, mangle, nat, raw, security)
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
+obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
+obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
+obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
+obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 00000000..fd7a3f68
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1914 @@
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/err.h>
+#include <net/compat.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables core");
+
+/*#define DEBUG_ARP_TABLES*/
+/*#define DEBUG_ARP_TABLES_USER*/
+
+#ifdef DEBUG_ARP_TABLES
+#define dprintf(format, args...)  printk(format , ## args)
+#else
+#define dprintf(format, args...)	/* expands to nothing unless DEBUG_ARP_TABLES is defined */
+#endif
+
+#ifdef DEBUG_ARP_TABLES_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)	/* expands to nothing unless DEBUG_ARP_TABLES_USER is defined */
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define ARP_NF_ASSERT(x)	WARN_ON(!(x))
+#else
+#define ARP_NF_ASSERT(x)	/* assertion removed without CONFIG_NETFILTER_DEBUG */
+#endif
+
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+	return xt_alloc_initial_table(arpt, ARPT);	/* NOTE(review): looks like a macro (xt_repldata.h?) that uses `info` implicitly - confirm */
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
+
+/* Masked compare of a hardware address against @ap: 1 on mismatch, 0 on match. */
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+				      const char *hdr_addr, int len)
+{
+	int diff = 0;
+	int idx;
+
+	/* Never compare more than ARPT_DEV_ADDR_LEN_MAX bytes. */
+	if (len > ARPT_DEV_ADDR_LEN_MAX)
+		len = ARPT_DEV_ADDR_LEN_MAX;
+	for (idx = 0; idx < len; idx++)
+		diff |= (hdr_addr[idx] ^ ap->addr[idx]) & ap->mask[idx];
+	return diff != 0;
+}
+
+/*
+ * Masked name compare (0 == match).  _b and _mask are unfortunately not aligned to an int (or long int).
+ * Some arches don't care; unrolling the loop is a win on them.
+ * For other arches, we only have 16-bit alignment, so compare u16 units.
+ */
+static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	unsigned long ret = ifname_compare_aligned(_a, _b, _mask);
+#else
+	unsigned long ret = 0;
+	const u16 *a = (const u16 *)_a;
+	const u16 *b = (const u16 *)_b;
+	const u16 *mask = (const u16 *)_mask;
+	int i;
+
+	for (i = 0; i < IFNAMSIZ/sizeof(u16); i++)
+		ret |= (a[i] ^ b[i]) & mask[i];	/* accumulate every differing masked bit */
+#endif
+	return ret;
+}
+
+/* Returns 1 if the ARP packet matches the rule in arpinfo, 0 otherwise. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+				   struct net_device *dev,
+				   const char *indev,
+				   const char *outdev,
+				   const struct arpt_arp *arpinfo)
+{
+	const char *arpptr = (char *)(arphdr + 1);
+	const char *src_devaddr, *tgt_devaddr;
+	__be32 src_ipaddr, tgt_ipaddr;
+	long ret;
+
+#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))	/* flip the test when the rule's inversion flag is set */
+
+	if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
+		  ARPT_INV_ARPOP)) {
+		dprintf("ARP operation field mismatch.\n");
+		dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
+			arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+		return 0;
+	}
+
+	if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
+		  ARPT_INV_ARPHRD)) {
+		dprintf("ARP hardware address format mismatch.\n");
+		dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
+			arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+		return 0;
+	}
+
+	if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
+		  ARPT_INV_ARPPRO)) {
+		dprintf("ARP protocol address format mismatch.\n");
+		dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
+			arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+		return 0;
+	}
+
+	if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
+		  ARPT_INV_ARPHLN)) {
+		dprintf("ARP hardware address length mismatch.\n");
+		dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
+			arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+		return 0;
+	}
+
+	src_devaddr = arpptr;	/* walk the payload: source hw addr, source IP, target hw addr, target IP */
+	arpptr += dev->addr_len;
+	memcpy(&src_ipaddr, arpptr, sizeof(u32));
+	arpptr += sizeof(u32);
+	tgt_devaddr = arpptr;
+	arpptr += dev->addr_len;
+	memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
+
+	if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
+		  ARPT_INV_SRCDEVADDR) ||
+	    FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
+		  ARPT_INV_TGTDEVADDR)) {
+		dprintf("Source or target device address mismatch.\n");
+
+		return 0;
+	}
+
+	if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
+		  ARPT_INV_SRCIP) ||
+	    FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
+		  ARPT_INV_TGTIP)) {
+		dprintf("Source or target IP address mismatch.\n");
+
+		dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
+			&src_ipaddr,
+			&arpinfo->smsk.s_addr,
+			&arpinfo->src.s_addr,
+			arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
+		dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n",
+			&tgt_ipaddr,
+			&arpinfo->tmsk.s_addr,
+			&arpinfo->tgt.s_addr,
+			arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+		return 0;
+	}
+
+	/* Look for ifname matches.  */
+	ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
+
+	if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, arpinfo->iniface,
+			arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+		return 0;
+	}
+
+	ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
+
+	if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, arpinfo->outiface,
+			arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+		return 0;
+	}
+
+	return 1;	/* every field test passed */
+#undef FWINV
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+	int ok = 1;	/* cleared when an unknown flag or invflag bit is found */
+
+	if (arp->flags & ~ARPT_F_MASK) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 arp->flags & ~ARPT_F_MASK);
+		ok = 0;
+	} else if (arp->invflags & ~ARPT_INV_MASK) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 arp->invflags & ~ARPT_INV_MASK);
+		ok = 0;
+	}
+	return ok;
+}
+
+static unsigned int
+arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const char *msg = par->targinfo;	/* the target data is logged as a string */
+
+	if (net_ratelimit())
+		pr_err("arp_tables: error: '%s'\n", msg);
+	return NF_DROP;	/* this target always drops */
+}
+
+static inline const struct xt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{	/* const-preserving wrapper around arpt_get_target() */
+	return arpt_get_target((struct arpt_entry *)e);
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
+{	/* entry located `offset` bytes past `base` */
+	return (struct arpt_entry *)(base + offset);
+}
+
+static inline __pure
+struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
+{	/* entries are laid out back to back; next_offset is the byte distance to the following one */
+	return (void *)entry + entry->next_offset;
+}
+
+unsigned int arpt_do_table(struct sk_buff *skb,
+			   unsigned int hook,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   struct xt_table *table)
+{	/* Walk the table's rules for `hook` against skb and return the resulting verdict. */
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));	/* all-zero name used when in/out is NULL */
+	unsigned int verdict = NF_DROP;
+	const struct arphdr *arp;
+	struct arpt_entry *e, *back;
+	const char *indev, *outdev;
+	void *table_base;
+	const struct xt_table_info *private;
+	struct xt_action_param acpar;
+	unsigned int addend;
+
+	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
+		return NF_DROP;
+
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];	/* this CPU's copy of the rules */
+
+	e = get_entry(table_base, private->hook_entry[hook]);
+	back = get_entry(table_base, private->underflow[hook]);
+
+	acpar.in      = in;
+	acpar.out     = out;
+	acpar.hooknum = hook;
+	acpar.family  = NFPROTO_ARP;
+	acpar.hotdrop = false;
+
+	arp = arp_hdr(skb);
+	do {
+		const struct xt_entry_target *t;
+
+		if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
+			e = arpt_next_entry(e);
+			continue;
+		}
+
+		ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+
+		t = arpt_get_target_c(e);
+
+		/* Standard target? */
+		if (!t->u.kernel.target->target) {
+			int v;
+
+			v = ((struct xt_standard_target *)t)->verdict;
+			if (v < 0) {
+				/* Pop from stack? */
+				if (v != XT_RETURN) {
+					verdict = (unsigned)(-v) - 1;	/* negative v encodes the final verdict as -v - 1 */
+					break;
+				}
+				e = back;	/* XT_RETURN: resume at the saved return point */
+				back = get_entry(table_base, back->comefrom);
+				continue;
+			}
+			if (table_base + v
+			    != arpt_next_entry(e)) {
+				/* Save old back ptr in next entry */
+				struct arpt_entry *next = arpt_next_entry(e);
+				next->comefrom = (void *)back - table_base;
+
+				/* set back pointer to next entry */
+				back = next;
+			}
+
+			e = get_entry(table_base, v);	/* non-negative v is the byte offset of the next rule */
+			continue;
+		}
+
+		/* Targets which reenter must return
+		 * abs. verdicts
+		 */
+		acpar.target   = t->u.kernel.target;
+		acpar.targinfo = t->data;
+		verdict = t->u.kernel.target->target(skb, &acpar);
+
+		/* Target might have changed stuff. */
+		arp = arp_hdr(skb);
+
+		if (verdict == XT_CONTINUE)
+			e = arpt_next_entry(e);
+		else
+			/* Verdict */
+			break;
+	} while (!acpar.hotdrop);
+	xt_write_recseq_end(addend);
+	local_bh_enable();
+
+	if (acpar.hotdrop)
+		return NF_DROP;
+	else
+		return verdict;
+}
+
+/* An all-zero match part marks an unconditional rule. */
+static inline bool unconditional(const struct arpt_arp *arp)
+{
+	static const struct arpt_arp all_zero;	/* static storage, hence zero-filled */
+
+	return !memcmp(arp, &all_zero, sizeof(all_zero));
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops.  Puts hook bitmask in comefrom.
+ *
+ * The rule graph is walked iteratively, once per valid hook, starting
+ * at newinfo->hook_entry[hook].  While an entry is on the current walk
+ * path, bit NF_ARP_NUMHOOKS of its comefrom is set and its
+ * counters.pcnt holds the offset the walk arrived from; both are
+ * cleared again when the walk backtracks over the entry.
+ *
+ * Returns 1 on success, 0 if a loop or an out-of-range verdict is found.
+ */
+static int mark_source_chains(const struct xt_table_info *newinfo,
+			      unsigned int valid_hooks, void *entry0)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	 * to 0 as we leave), and comefrom to save source hook bitmask.
+	 */
+	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct arpt_entry *e
+			= (struct arpt_entry *)(entry0 + pos);
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			const struct xt_standard_target *t
+				= (void *)arpt_get_target_c(e);
+			/* Already reached from this hook on an earlier path? */
+			int visited = e->comefrom & (1 << hook);
+
+			/* The "on current path" bit is still set: we came
+			 * back to an entry we have not left yet.
+			 */
+			if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+				pr_notice("arptables: loop hook %u pos %u %08X.\n",
+				       hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom
+				|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if ((e->target_offset == sizeof(struct arpt_entry) &&
+			     (strcmp(t->target.u.user.name,
+				     XT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0 && unconditional(&e->arp)) ||
+			    visited) {
+				unsigned int oldpos, size;
+
+				/* Reject negative verdicts below the encoded
+				 * range -NF_MAX_VERDICT - 1.
+				 */
+				if ((strcmp(t->target.u.user.name,
+					    XT_STANDARD_TARGET) == 0) &&
+				    t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						"negative verdict (%i)\n",
+								t->verdict);
+					return 0;
+				}
+
+				/* Return: backtrack through the last
+				 * big jump.
+				 */
+				do {
+					/* Leaving e: drop the path marker and
+					 * the saved back pointer.
+					 */
+					e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct arpt_entry *)
+						(entry0 + pos);
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct arpt_entry *)
+					(entry0 + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   XT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
+					/* A jump must leave room for at least
+					 * one entry header inside the table.
+					 */
+					if (newpos > newinfo->size -
+						sizeof(struct arpt_entry)) {
+						duprintf("mark_source_chains: "
+							"bad verdict (%i)\n",
+								newpos);
+						return 0;
+					}
+
+					/* This a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+				}
+				/* Step to the next entry, remembering where
+				 * we came from.
+				 */
+				e = (struct arpt_entry *)
+					(entry0 + newpos);
+				e->counters.pcnt = pos;
+				pos = newpos;
+			}
+		}
+		next:
+		duprintf("Finished chain %u\n", hook);
+	}
+	return 1;
+}
+
+/*
+ * Basic sanity checks on one entry: the ARP match part must pass
+ * arp_checkentry() and the target must fit between target_offset and
+ * next_offset.  Returns 0 if the entry looks sane, -EINVAL otherwise.
+ */
+static inline int check_entry(const struct arpt_entry *e, const char *name)
+{
+	const struct xt_entry_target *tgt;
+
+	if (!arp_checkentry(&e->arp)) {
+		duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+		return -EINVAL;
+	}
+
+	/* There must be room for at least a target header ... */
+	if (e->next_offset < e->target_offset + sizeof(struct xt_entry_target))
+		return -EINVAL;
+
+	/* ... and for the full size which that header claims. */
+	tgt = arpt_get_target_c(e);
+	if (e->next_offset < e->target_offset + tgt->u.target_size)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Run the xtables check for the target of entry @e in table @name.
+ * Returns 0 on success or the negative error from xt_check_target().
+ */
+static inline int check_target(struct arpt_entry *e, const char *name)
+{
+	struct xt_entry_target *tgt = arpt_get_target(e);
+	struct xt_tgchk_param par = {
+		.table     = name,
+		.entryinfo = e,
+		.target    = tgt->u.kernel.target,
+		.targinfo  = tgt->data,
+		.hook_mask = e->comefrom,
+		.family    = NFPROTO_ARP,
+	};
+	int err;
+
+	/* The size handed over excludes the generic target header. */
+	err = xt_check_target(&par, tgt->u.target_size - sizeof(*tgt), 0, false);
+	if (err >= 0)
+		return 0;
+
+	duprintf("arp_tables: check failed for `%s'.\n",
+		 tgt->u.kernel.target->name);
+	return err;
+}
+
+/*
+ * Validate entry @e, look up its target extension by name/revision and
+ * run the target's check.  On success the kernel target pointer is
+ * stored in the entry.  Returns 0 or a negative errno.  @size is unused.
+ */
+static inline int
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+{
+	struct xt_entry_target *tgt;
+	struct xt_target *ext;
+	int err;
+
+	err = check_entry(e, name);
+	if (err)
+		return err;
+
+	tgt = arpt_get_target(e);
+	ext = xt_request_find_target(NFPROTO_ARP, tgt->u.user.name,
+				     tgt->u.user.revision);
+	if (IS_ERR(ext)) {
+		duprintf("find_check_entry: `%s' not found\n", tgt->u.user.name);
+		return PTR_ERR(ext);
+	}
+	tgt->u.kernel.target = ext;
+
+	err = check_target(e, name);
+	if (err) {
+		/* Give back the module reference (presumably taken by the
+		 * lookup above) before failing.
+		 */
+		module_put(tgt->u.kernel.target->me);
+		return err;
+	}
+	return 0;
+}
+
+/*
+ * An underflow entry is acceptable only if it matches everything, uses
+ * the standard target and its verdict decodes to NF_DROP or NF_ACCEPT.
+ */
+static bool check_underflow(const struct arpt_entry *e)
+{
+	const struct xt_entry_target *tgt;
+	unsigned int decoded;
+
+	if (!unconditional(&e->arp))
+		return false;
+
+	tgt = arpt_get_target_c(e);
+	if (strcmp(tgt->u.user.name, XT_STANDARD_TARGET) != 0)
+		return false;
+
+	/* Undo the -(verdict) - 1 encoding, in unsigned arithmetic. */
+	decoded = ((struct xt_standard_target *)tgt)->verdict;
+	decoded = -decoded - 1;
+
+	if (decoded == NF_DROP)
+		return true;
+	return decoded == NF_ACCEPT;
+}
+
+/*
+ * Validate the placement and size of one entry of a user-supplied table
+ * and record it in @newinfo if it is a hook entry point or an underflow.
+ *
+ * @e:            entry to check
+ * @newinfo:      table info receiving hook_entry[]/underflow[] offsets
+ * @base:         start of the entry blob
+ * @limit:        first byte past the entry blob
+ * @hook_entries: hook entry offsets supplied with the table
+ * @underflows:   underflow offsets supplied with the table
+ * @valid_hooks:  bitmask of hooks to consider
+ *
+ * Returns 0 on success, -EINVAL for a misaligned, truncated, oversized
+ * or undersized entry, or for an underflow rejected by check_underflow().
+ * On success the entry's counters and comefrom are reset to zero.
+ */
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+					     struct xt_table_info *newinfo,
+					     const unsigned char *base,
+					     const unsigned char *limit,
+					     const unsigned int *hook_entries,
+					     const unsigned int *underflows,
+					     unsigned int valid_hooks)
+{
+	unsigned int h;
+
+	/*
+	 * The header must be aligned and lie inside the blob before
+	 * next_offset may be read; next_offset itself must not reach past
+	 * the end of the blob, otherwise the entry (and its target) would
+	 * cover memory that was never copied from userspace.
+	 */
+	if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit) {
+		duprintf("Bad offset %p\n", e);
+		return -EINVAL;
+	}
+
+	/* An entry holds at least its header plus a target header. */
+	if (e->next_offset
+	    < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+		if (!(valid_hooks & (1 << h)))
+			continue;
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h]) {
+			if (!check_underflow(e)) {
+				pr_err("Underflows must be unconditional and "
+				       "use the STANDARD target with "
+				       "ACCEPT/DROP\n");
+				return -EINVAL;
+			}
+			newinfo->underflow[h] = underflows[h];
+		}
+	}
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct xt_counters) { 0, 0 });
+	e->comefrom = 0;
+	return 0;
+}
+
+/*
+ * Tear down the target of entry @e: call its destroy hook, if it has
+ * one, and drop one reference on the module providing the target.
+ */
+static inline void cleanup_entry(struct arpt_entry *e)
+{
+	struct xt_entry_target *tgt = arpt_get_target(e);
+	struct xt_tgdtor_param par;
+
+	par.target   = tgt->u.kernel.target;
+	par.targinfo = tgt->data;
+	par.family   = NFPROTO_ARP;
+
+	if (par.target->destroy)
+		par.target->destroy(&par);
+
+	module_put(par.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ *
+ * @newinfo: table info being filled in; size/number are taken from @repl
+ * @entry0:  start of the entry blob to validate
+ * @repl:    replace request header describing the blob
+ *
+ * Phases: per-entry size/hook checks, entry count and hook assignment
+ * checks, loop detection via mark_source_chains(), then target lookup
+ * and check for every entry.  Returns 0 on success or a negative errno;
+ * on a failure in the last phase the entries already checked are
+ * cleaned up again.
+ */
+static int translate_table(struct xt_table_info *newinfo, void *entry0,
+                           const struct arpt_replace *repl)
+{
+	struct arpt_entry *iter;
+	unsigned int i;
+	int ret = 0;
+
+	newinfo->size = repl->size;
+	newinfo->number = repl->num_entries;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_table: size %u\n", newinfo->size);
+	i = 0;
+
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+						 entry0 + repl->size,
+						 repl->hook_entry,
+						 repl->underflow,
+						 repl->valid_hooks);
+		if (ret != 0)
+			break;
+		++i;
+		/* Every entry using the error target bumps stacksize. */
+		if (strcmp(arpt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+	duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
+	if (ret != 0)
+		return ret;
+
+	/* The blob must hold exactly the announced number of entries. */
+	if (i != repl->num_entries) {
+		duprintf("translate_table: %u not %u entries\n",
+			 i, repl->num_entries);
+		return -EINVAL;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(repl->valid_hooks & (1 << i)))
+			continue;
+		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, repl->hook_entry[i]);
+			return -EINVAL;
+		}
+		if (newinfo->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, repl->underflow[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
+		duprintf("Looping hook\n");
+		return -ELOOP;
+	}
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = find_check_entry(iter, repl->name, repl->size);
+		if (ret != 0)
+			break;
+		++i;
+	}
+
+	if (ret != 0) {
+		/* Undo only the first i entries, i.e. those whose
+		 * find_check_entry() call succeeded.
+		 */
+		xt_entry_foreach(iter, entry0, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter);
+		}
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for_each_possible_cpu(i) {
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
+	}
+
+	return ret;
+}
+
+/*
+ * Sum the per-CPU rule counters of table @t into @counters, which has
+ * one slot per rule.  Each rule's counter pair is read under the owning
+ * CPU's xt_recseq sequence counter and re-read if the sequence changed.
+ */
+static void get_counters(const struct xt_table_info *t,
+			 struct xt_counters counters[])
+{
+	struct arpt_entry *rule;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		seqcount_t *seq = &per_cpu(xt_recseq, cpu);
+		unsigned int idx = 0;
+
+		xt_entry_foreach(rule, t->entries[cpu], t->size) {
+			unsigned int begin;
+			u64 bytes, packets;
+
+			do {
+				begin = read_seqcount_begin(seq);
+				bytes = rule->counters.bcnt;
+				packets = rule->counters.pcnt;
+			} while (read_seqcount_retry(seq, begin));
+
+			ADD_COUNTER(counters[idx], bytes, packets);
+			idx++;
+		}
+	}
+}
+
+/*
+ * Allocate a zeroed array with one counter slot per rule of @table and
+ * fill it via get_counters().  Returns the array (to be released with
+ * vfree() by the caller) or ERR_PTR(-ENOMEM).
+ */
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+	const struct xt_table_info *private = table->private;
+	struct xt_counters *snapshot;
+	unsigned int bytes;
+
+	/* We need atomic snapshot of counters: rest doesn't change
+	 * (other than comefrom, which userspace doesn't care
+	 * about).
+	 */
+	bytes = sizeof(struct xt_counters) * private->number;
+	snapshot = vzalloc(bytes);
+	if (!snapshot)
+		return ERR_PTR(-ENOMEM);
+
+	get_counters(private, snapshot);
+	return snapshot;
+}
+
+/*
+ * Copy the rule blob of @table (total_size bytes) to @userptr, then
+ * patch each copied entry with its summed counters and with the name of
+ * the kernel target.  Returns 0, -EFAULT or the alloc_counters() error.
+ */
+static int copy_entries_to_user(unsigned int total_size,
+				const struct xt_table *table,
+				void __user *userptr)
+{
+	struct xt_table_info *private = table->private;
+	struct xt_counters *counters;
+	const struct arpt_entry *e;
+	unsigned int off = 0, num = 0;
+	void *loc_cpu_entry;
+	int ret = 0;
+
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	/* First the raw blob, taken from this CPU's copy ... */
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+		ret = -EFAULT;
+		goto free_counters;
+	}
+
+	/* ... then go back and fix counters and names, entry by entry. */
+	while (off < total_size) {
+		const struct xt_entry_target *t;
+		void __user *dst;
+
+		e = (struct arpt_entry *)(loc_cpu_entry + off);
+
+		dst = userptr + off + offsetof(struct arpt_entry, counters);
+		if (copy_to_user(dst, &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+
+		t = arpt_get_target_c(e);
+		dst = userptr + off + e->target_offset
+			+ offsetof(struct xt_entry_target, u.user.name);
+		if (copy_to_user(dst, t->u.kernel.target->name,
+				 strlen(t->u.kernel.target->name)+1) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+
+		off += e->next_offset;
+		num++;
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Convert a compat standard-target verdict at @src into a native int at
+ * @dst.  Positive values are adjusted by xt_compat_calc_jump().
+ */
+static void compat_standard_from_user(void *dst, const void *src)
+{
+	int verdict = *(compat_int_t *)src;
+
+	if (verdict > 0)
+		verdict += xt_compat_calc_jump(NFPROTO_ARP, verdict);
+
+	memcpy(dst, &verdict, sizeof(verdict));
+}
+
+/*
+ * Convert a native standard-target verdict at @src into its compat form
+ * and copy it to user memory at @dst.  Positive values are adjusted by
+ * xt_compat_calc_jump().  Returns 0 or -EFAULT.
+ */
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+	compat_int_t verdict = *(int *)src;
+
+	if (verdict > 0)
+		verdict -= xt_compat_calc_jump(NFPROTO_ARP, verdict);
+
+	if (copy_to_user(dst, &verdict, sizeof(verdict)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Account for the size difference between the native and the compat
+ * layout of entry @e: shrink newinfo->size, register the delta with
+ * xt_compat_add_offset() and pull down every hook entry/underflow
+ * offset in @newinfo that lies behind @e.  Returns 0 or the error from
+ * xt_compat_add_offset().
+ */
+static int compat_calc_entry(const struct arpt_entry *e,
+			     const struct xt_table_info *info,
+			     const void *base, struct xt_table_info *newinfo)
+{
+	const struct xt_entry_target *tgt = arpt_get_target_c(e);
+	unsigned int entry_offset = (void *)e - base;
+	int delta, hook, err;
+
+	/* Entry header difference plus whatever the target contributes. */
+	delta = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+	delta += xt_compat_target_offset(tgt->u.kernel.target);
+
+	newinfo->size -= delta;
+	err = xt_compat_add_offset(NFPROTO_ARP, entry_offset, delta);
+	if (err)
+		return err;
+
+	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+		if (info->hook_entry[hook] != 0 &&
+		    e < (struct arpt_entry *)(base + info->hook_entry[hook]))
+			newinfo->hook_entry[hook] -= delta;
+
+		if (info->underflow[hook] != 0 &&
+		    e < (struct arpt_entry *)(base + info->underflow[hook]))
+			newinfo->underflow[hook] -= delta;
+	}
+	return 0;
+}
+
+/*
+ * Build in @newinfo the compat view of @info: same header fields, with
+ * size and hook offsets recomputed entry by entry via
+ * compat_calc_entry().  Returns 0, -EINVAL for a NULL argument, or the
+ * first error from compat_calc_entry().
+ */
+static int compat_table_info(const struct xt_table_info *info,
+			     struct xt_table_info *newinfo)
+{
+	struct arpt_entry *rule;
+	void *loc_cpu_entry;
+	int err = 0;
+
+	if (info == NULL || newinfo == NULL)
+		return -EINVAL;
+
+	/* we dont care about newinfo->entries[] */
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	newinfo->initial_entries = 0;
+
+	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	xt_compat_init_offsets(NFPROTO_ARP, info->number);
+
+	xt_entry_foreach(rule, loc_cpu_entry, info->size) {
+		err = compat_calc_entry(rule, info, loc_cpu_entry, newinfo);
+		if (err)
+			break;
+	}
+	return err;
+}
+#endif
+
+/*
+ * Report the hooks, hook entry/underflow offsets, entry count and blob
+ * size of the table named in the user buffer.
+ *
+ * @net:    network namespace to look the table up in
+ * @user:   in: table name; out: struct arpt_getinfo
+ * @len:    length of the user buffer; must equal sizeof(struct arpt_getinfo)
+ * @compat: non-zero to report sizes/offsets of the compat layout
+ *
+ * Returns 0, -EINVAL on a wrong length, -EFAULT on a failed user copy,
+ * -ENOENT or the lookup error if the table cannot be found, or the
+ * error from compat_table_info() when the compat view cannot be built.
+ */
+static int get_info(struct net *net, void __user *user,
+                    const int *len, int compat)
+{
+	char name[XT_TABLE_MAXNAMELEN];
+	struct xt_table *t;
+	int ret;
+
+	if (*len != sizeof(struct arpt_getinfo)) {
+		duprintf("length %u != %Zu\n", *len,
+			 sizeof(struct arpt_getinfo));
+		return -EINVAL;
+	}
+
+	if (copy_from_user(name, user, sizeof(name)) != 0)
+		return -EFAULT;
+
+	/* The name comes from userspace: force termination. */
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+	if (compat)
+		xt_compat_lock(NFPROTO_ARP);
+#endif
+	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
+				    "arptable_%s", name);
+	if (t && !IS_ERR(t)) {
+		struct arpt_getinfo info;
+		const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+		struct xt_table_info tmp;
+#endif
+
+		ret = 0;
+#ifdef CONFIG_COMPAT
+		if (compat) {
+			ret = compat_table_info(private, &tmp);
+			xt_compat_flush_offsets(NFPROTO_ARP);
+			private = &tmp;
+		}
+#endif
+		/*
+		 * Only report the numbers if the compat translation (when
+		 * requested) succeeded; otherwise tmp holds partially
+		 * adjusted sizes and offsets.
+		 */
+		if (ret == 0) {
+			memset(&info, 0, sizeof(info));
+			info.valid_hooks = t->valid_hooks;
+			memcpy(info.hook_entry, private->hook_entry,
+			       sizeof(info.hook_entry));
+			memcpy(info.underflow, private->underflow,
+			       sizeof(info.underflow));
+			info.num_entries = private->number;
+			info.size = private->size;
+			strcpy(info.name, name);
+
+			if (copy_to_user(user, &info, *len) != 0)
+				ret = -EFAULT;
+		}
+		xt_table_unlock(t);
+		module_put(t->me);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+	if (compat)
+		xt_compat_unlock(NFPROTO_ARP);
+#endif
+	return ret;
+}
+
+/*
+ * Copy the rule blob of a table to userspace.
+ *
+ * @net:  network namespace to look the table up in
+ * @uptr: user buffer: struct arpt_get_entries header followed by room
+ *        for get.size bytes of entries
+ * @len:  total length of the user buffer
+ *
+ * Returns 0 on success, -EINVAL if @len does not match the header plus
+ * the announced size, -EFAULT on a failed user copy, -EAGAIN if the
+ * announced size differs from the table's current size, -ENOENT or the
+ * lookup error if the table cannot be found.
+ */
+static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
+		       const int *len)
+{
+	int ret;
+	struct arpt_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get)) {
+		duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+		return -EINVAL;
+	}
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+	if (*len != sizeof(struct arpt_get_entries) + get.size) {
+		duprintf("get_entries: %u != %Zu\n", *len,
+			 sizeof(struct arpt_get_entries) + get.size);
+		return -EINVAL;
+	}
+	/*
+	 * The name comes straight from userspace and is used as a C
+	 * string by the lookup below: force termination.
+	 */
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+	if (t && !IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+
+		duprintf("t->private->number = %u\n",
+			 private->number);
+		if (get.size == private->size)
+			ret = copy_entries_to_user(private->size,
+						   t, uptr->entrytable);
+		else {
+			duprintf("get_entries: I've got %u not %u!\n",
+				 private->size, get.size);
+			ret = -EAGAIN;
+		}
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	return ret;
+}
+
+/*
+ * Swap the already translated @newinfo in as the rule set of table
+ * @name and hand the counters of the replaced rule set to userspace.
+ *
+ * @net:          network namespace to look the table up in
+ * @name:         table name
+ * @valid_hooks:  hook mask announced by userspace; must match the table
+ * @newinfo:      new table contents
+ * @num_counters: number of counter slots userspace provided
+ * @counters_ptr: user buffer receiving the old counters
+ *
+ * Returns 0 once the table has been replaced, or a negative errno if it
+ * has not.  A non-zero return therefore always means that @newinfo is
+ * still owned by the caller, which may clean it up and free it.
+ */
+static int __do_replace(struct net *net, const char *name,
+			unsigned int valid_hooks,
+			struct xt_table_info *newinfo,
+			unsigned int num_counters,
+			void __user *counters_ptr)
+{
+	int ret;
+	struct xt_table *t;
+	struct xt_table_info *oldinfo;
+	struct xt_counters *counters;
+	void *loc_cpu_old_entry;
+	struct arpt_entry *iter;
+
+	ret = 0;
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	if (!counters) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
+				    "arptable_%s", name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free_newinfo_counters_untrans;
+	}
+
+	/* You lied! */
+	if (valid_hooks != t->valid_hooks) {
+		duprintf("Valid hook crap: %08X vs %08X\n",
+			 valid_hooks, t->valid_hooks);
+		ret = -EINVAL;
+		goto put_module;
+	}
+
+	oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;
+
+	/* Update module usage count based on number of rules */
+	duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+		oldinfo->number, oldinfo->initial_entries, newinfo->number);
+	if ((oldinfo->number > oldinfo->initial_entries) ||
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	/* Get the old counters, and synchronize with replace */
+	get_counters(oldinfo, counters);
+
+	/* Decrease module usage counts and free resource */
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+		cleanup_entry(iter);
+
+	xt_free_table_info(oldinfo);
+	if (copy_to_user(counters_ptr, counters,
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/*
+		 * Must not fail here: the new table is already in place.
+		 * Reporting an error would make the caller clean up and
+		 * free newinfo while the table is using it.
+		 */
+		duprintf("do_replace: copying old counters to user failed\n");
+	}
+	vfree(counters);
+	xt_table_unlock(t);
+	return ret;
+
+ put_module:
+	module_put(t->me);
+	xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+	vfree(counters);
+ out:
+	return ret;
+}
+
+/*
+ * Replace a table with the rule set supplied by userspace: copy in the
+ * struct arpt_replace header and the entry blob behind it, translate
+ * and validate the blob, then install it through __do_replace().
+ * Returns 0 or a negative errno.  @len is not used.
+ */
+static int do_replace(struct net *net, const void __user *user,
+                      unsigned int len)
+{
+	struct arpt_replace tmp;
+	struct xt_table_info *newinfo;
+	struct arpt_entry *iter;
+	void *loc_cpu_entry;
+	int ret;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	/* overflow check */
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	tmp.name[sizeof(tmp.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (newinfo == NULL)
+		return -ENOMEM;
+
+	/* choose the copy that is on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size)) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	ret = translate_table(newinfo, loc_cpu_entry, &tmp);
+	if (ret)
+		goto free_newinfo;
+
+	duprintf("arp_tables: Translated table\n");
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, tmp.counters);
+	if (!ret)
+		return 0;
+
+	/* The entries were translated already: clean each one up. */
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * Add user-supplied byte/packet counts to the counters of every rule
+ * of a table.
+ *
+ * @net:    network namespace to look the table up in
+ * @user:   user buffer: counters-info header followed by one
+ *          struct xt_counters per rule
+ * @len:    total length of the user buffer
+ * @compat: non-zero if the header uses the compat layout
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL if @len
+ * or the counter count does not match, -ENOMEM if the temporary buffer
+ * cannot be allocated, -ENOENT or the lookup error if the table cannot
+ * be found.
+ */
+static int do_add_counters(struct net *net, const void __user *user,
+			   unsigned int len, int compat)
+{
+	unsigned int i, curcpu;
+	struct xt_counters_info tmp;
+	struct xt_counters *paddc;
+	unsigned int num_counters;
+	const char *name;
+	int size;
+	void *ptmp;
+	struct xt_table *t;
+	const struct xt_table_info *private;
+	int ret = 0;
+	void *loc_cpu_entry;
+	struct arpt_entry *iter;
+	unsigned int addend;
+#ifdef CONFIG_COMPAT
+	struct compat_xt_counters_info compat_tmp;
+
+	if (compat) {
+		ptmp = &compat_tmp;
+		size = sizeof(struct compat_xt_counters_info);
+	} else
+#endif
+	{
+		ptmp = &tmp;
+		size = sizeof(struct xt_counters_info);
+	}
+
+	if (copy_from_user(ptmp, user, size) != 0)
+		return -EFAULT;
+
+	/*
+	 * The table name comes straight from userspace and is used as a
+	 * C string by the lookup below: force termination.
+	 */
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		num_counters = compat_tmp.num_counters;
+		compat_tmp.name[sizeof(compat_tmp.name) - 1] = '\0';
+		name = compat_tmp.name;
+	} else
+#endif
+	{
+		num_counters = tmp.num_counters;
+		tmp.name[sizeof(tmp.name) - 1] = '\0';
+		name = tmp.name;
+	}
+
+	if (len != size + num_counters * sizeof(struct xt_counters))
+		return -EINVAL;
+
+	paddc = vmalloc(len - size);
+	if (!paddc)
+		return -ENOMEM;
+
+	if (copy_from_user(paddc, user + size, len - size) != 0) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	t = xt_find_table_lock(net, NFPROTO_ARP, name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free;
+	}
+
+	local_bh_disable();
+	private = t->private;
+	/* One increment per rule, no more and no less. */
+	if (private->number != num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	/* Choose the copy that is on our node */
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	addend = xt_write_recseq_begin();
+	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		++i;
+	}
+	xt_write_recseq_end(addend);
+ unlock_up_free:
+	local_bh_enable();
+	xt_table_unlock(t);
+	module_put(t->me);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/* Drop one reference on the module providing the target of compat entry @e. */
+static inline void compat_release_entry(struct compat_arpt_entry *e)
+{
+	struct xt_entry_target *tgt = compat_arpt_get_target(e);
+
+	module_put(tgt->u.kernel.target->me);
+}
+
+/*
+ * Validate one compat entry of a user-supplied table, look up its
+ * target, register the native/compat size delta and record hook entry
+ * and underflow offsets in @newinfo.
+ *
+ * @e:            compat entry to check
+ * @newinfo:      table info receiving hook_entry[]/underflow[] offsets
+ * @size:         running native size of the table; grown by this
+ *                entry's delta
+ * @base:         start of the compat entry blob
+ * @limit:        first byte past the compat entry blob
+ * @hook_entries: hook entry offsets supplied with the table
+ * @underflows:   underflow offsets supplied with the table
+ * @name:         table name, used for diagnostics
+ *
+ * Returns 0 on success (the entry then stores the kernel target
+ * pointer) or a negative errno.
+ */
+static inline int
+check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
+				  struct xt_table_info *newinfo,
+				  unsigned int *size,
+				  const unsigned char *base,
+				  const unsigned char *limit,
+				  const unsigned int *hook_entries,
+				  const unsigned int *underflows,
+				  const char *name)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	unsigned int entry_offset;
+	int ret, off, h;
+
+	duprintf("check_compat_entry_size_and_hooks %p\n", e);
+	/*
+	 * The header must be aligned and lie inside the blob before
+	 * next_offset may be read; next_offset itself must not reach past
+	 * the end of the blob, otherwise the entry (and its target) would
+	 * cover memory that was never copied from userspace.
+	 */
+	if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit) {
+		duprintf("Bad offset %p, limit = %p\n", e, limit);
+		return -EINVAL;
+	}
+
+	if (e->next_offset < sizeof(struct compat_arpt_entry) +
+			     sizeof(struct compat_xt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* For purposes of check_entry casting the compat entry is fine */
+	ret = check_entry((struct arpt_entry *)e, name);
+	if (ret)
+		return ret;
+
+	off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+	entry_offset = (void *)e - (void *)base;
+
+	t = compat_arpt_get_target(e);
+	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+			 t->u.user.name);
+		ret = PTR_ERR(target);
+		goto out;
+	}
+	t->u.kernel.target = target;
+
+	/* Native entries are larger: account for header and target. */
+	off += xt_compat_target_offset(target);
+	*size += off;
+	ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
+	if (ret)
+		goto release_target;
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h])
+			newinfo->underflow[h] = underflows[h];
+	}
+
+	/* Clear counters and comefrom */
+	memset(&e->counters, 0, sizeof(e->counters));
+	e->comefrom = 0;
+	return 0;
+
+release_target:
+	module_put(t->u.kernel.target->me);
+out:
+	return ret;
+}
+
+/*
+ * Expand compat entry @e into native layout at *dstptr.  *dstptr and
+ * *size are advanced, the new entry's target_offset/next_offset are
+ * adjusted for the growth, and every hook entry/underflow offset in
+ * @newinfo lying behind the new entry is shifted by the same amount.
+ * Always returns 0.  @name is not used.
+ */
+static int
+compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
+			    unsigned int *size, const char *name,
+			    struct xt_table_info *newinfo, unsigned char *base)
+{
+	struct arpt_entry *de = (struct arpt_entry *)*dstptr;
+	const unsigned int origsize = *size;
+	struct xt_entry_target *t;
+	unsigned int shift;
+	int h;
+
+	/* Header first; the counters are copied again at their new place. */
+	memcpy(de, e, sizeof(struct arpt_entry));
+	memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+	*dstptr += sizeof(struct arpt_entry);
+	*size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+	de->target_offset = e->target_offset - (origsize - *size);
+
+	t = compat_arpt_get_target(e);
+	xt_compat_target_from_user(t, dstptr, size);
+
+	/* origsize - *size in unsigned arithmetic, as applied below. */
+	shift = origsize - *size;
+	de->next_offset = e->next_offset - shift;
+
+	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+		if ((unsigned char *)de - base < newinfo->hook_entry[h])
+			newinfo->hook_entry[h] -= shift;
+		if ((unsigned char *)de - base < newinfo->underflow[h])
+			newinfo->underflow[h] -= shift;
+	}
+	return 0;
+}
+
+/*
+ * Validate a compat (32-bit layout) entry blob and convert it into a
+ * freshly allocated native table.
+ *
+ * @name:         table name
+ * @valid_hooks:  bitmask of hooks that must have entry/underflow points
+ * @pinfo:        in: table info holding the compat blob; out (on
+ *                success): the new native table info, the old one
+ *                having been freed
+ * @pentry0:      in: start of the compat blob; out (on success): start
+ *                of the native blob
+ * @total_size:   size of the compat blob in bytes
+ * @number:       number of entries announced by userspace
+ * @hook_entries: hook entry offsets supplied with the table
+ * @underflows:   underflow offsets supplied with the table
+ *
+ * Returns 0 on success or a negative errno; on failure *pinfo and
+ * *pentry0 are left untouched.
+ */
+static int translate_compat_table(const char *name,
+				  unsigned int valid_hooks,
+				  struct xt_table_info **pinfo,
+				  void **pentry0,
+				  unsigned int total_size,
+				  unsigned int number,
+				  unsigned int *hook_entries,
+				  unsigned int *underflows)
+{
+	unsigned int i, j;
+	struct xt_table_info *newinfo, *info;
+	void *pos, *entry0, *entry1;
+	struct compat_arpt_entry *iter0;
+	struct arpt_entry *iter1;
+	unsigned int size;
+	int ret = 0;
+
+	info = *pinfo;
+	entry0 = *pentry0;
+	size = total_size;
+	info->number = number;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		info->hook_entry[i] = 0xFFFFFFFF;
+		info->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_compat_table: size %u\n", info->size);
+	/* j counts the compat entries that passed the check below. */
+	j = 0;
+	xt_compat_lock(NFPROTO_ARP);
+	xt_compat_init_offsets(NFPROTO_ARP, number);
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+							entry0,
+							entry0 + total_size,
+							hook_entries,
+							underflows,
+							name);
+		if (ret != 0)
+			goto out_unlock;
+		++j;
+	}
+
+	ret = -EINVAL;
+	if (j != number) {
+		duprintf("translate_compat_table: %u not %u entries\n",
+			 j, number);
+		goto out_unlock;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(valid_hooks & (1 << i)))
+			continue;
+		if (info->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, hook_entries[i]);
+			goto out_unlock;
+		}
+		if (info->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, underflows[i]);
+			goto out_unlock;
+		}
+	}
+
+	/* size now holds the native size accumulated by the checks above. */
+	ret = -ENOMEM;
+	newinfo = xt_alloc_table_info(size);
+	if (!newinfo)
+		goto out_unlock;
+
+	newinfo->number = number;
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = info->hook_entry[i];
+		newinfo->underflow[i] = info->underflow[i];
+	}
+	entry1 = newinfo->entries[raw_smp_processor_id()];
+	pos = entry1;
+	/* Restart from the compat size; the copy loop grows it again. */
+	size = total_size;
+	xt_entry_foreach(iter0, entry0, total_size) {
+		ret = compat_copy_entry_from_user(iter0, &pos, &size,
+						  name, newinfo, entry1);
+		if (ret != 0)
+			break;
+	}
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
+	if (ret)
+		goto free_newinfo;
+
+	ret = -ELOOP;
+	if (!mark_source_chains(newinfo, valid_hooks, entry1))
+		goto free_newinfo;
+
+	/* i counts the native entries whose target check succeeded. */
+	i = 0;
+	xt_entry_foreach(iter1, entry1, newinfo->size) {
+		ret = check_target(iter1, name);
+		if (ret != 0)
+			break;
+		++i;
+		if (strcmp(arpt_get_target(iter1)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+	if (ret) {
+		/*
+		 * The first i matches need cleanup_entry (calls ->destroy)
+		 * because they had called ->check already. The other j-i
+		 * entries need only release.
+		 */
+		int skip = i;
+		j -= i;
+		xt_entry_foreach(iter0, entry0, newinfo->size) {
+			if (skip-- > 0)
+				continue;
+			if (j-- == 0)
+				break;
+			compat_release_entry(iter0);
+		}
+		xt_entry_foreach(iter1, entry1, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter1);
+		}
+		xt_free_table_info(newinfo);
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for_each_possible_cpu(i)
+		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+			memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+	/* Hand the native table to the caller and drop the compat one. */
+	*pinfo = newinfo;
+	*pentry0 = entry1;
+	xt_free_table_info(info);
+	return 0;
+
+free_newinfo:
+	xt_free_table_info(newinfo);
+out:
+	/* Release the targets of the first j compat entries. */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		if (j-- == 0)
+			break;
+		compat_release_entry(iter0);
+	}
+	return ret;
+out_unlock:
+	xt_compat_flush_offsets(NFPROTO_ARP);
+	xt_compat_unlock(NFPROTO_ARP);
+	goto out;
+}
+
+/*
+ * Compat layout of the table replace request as read from userspace by
+ * compat_do_replace(): fixed-width fields and a compat user pointer for
+ * the counters, followed by the compat entry blob.
+ */
+struct compat_arpt_replace {
+	char				name[XT_TABLE_MAXNAMELEN];
+	u32				valid_hooks;
+	u32				num_entries;
+	u32				size;		/* bytes in entries[] */
+	u32				hook_entry[NF_ARP_NUMHOOKS];
+	u32				underflow[NF_ARP_NUMHOOKS];
+	u32				num_counters;
+	compat_uptr_t			counters;	/* user buffer for old counters */
+	struct compat_arpt_entry	entries[0];
+};
+
+/*
+ * Compat variant of the table replace request: copy in the
+ * struct compat_arpt_replace header and the compat entry blob behind
+ * it, convert the blob to native layout, then install it through
+ * __do_replace().  Returns 0 or a negative errno.  @len is not used.
+ */
+static int compat_do_replace(struct net *net, void __user *user,
+			     unsigned int len)
+{
+	struct compat_arpt_replace tmp;
+	struct xt_table_info *newinfo;
+	struct arpt_entry *iter;
+	void *loc_cpu_entry;
+	int ret;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	/* overflow check */
+	if (tmp.size >= INT_MAX / num_possible_cpus())
+		return -ENOMEM;
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	tmp.name[sizeof(tmp.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (newinfo == NULL)
+		return -ENOMEM;
+
+	/* choose the copy that is on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size)) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	/* On success newinfo and loc_cpu_entry refer to the native table. */
+	ret = translate_compat_table(tmp.name, tmp.valid_hooks,
+				     &newinfo, &loc_cpu_entry, tmp.size,
+				     tmp.num_entries, tmp.hook_entry,
+				     tmp.underflow);
+	if (ret)
+		goto free_newinfo;
+
+	duprintf("compat_do_replace: Translated table\n");
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, compat_ptr(tmp.counters));
+	if (!ret)
+		return 0;
+
+	/* The entries were translated already: clean each one up. */
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * Compat set-sockopt dispatcher.  Requires CAP_NET_ADMIN; handles
+ * ARPT_SO_SET_REPLACE and ARPT_SO_SET_ADD_COUNTERS and returns -EINVAL
+ * for any other command.
+ */
+static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
+				  unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == ARPT_SO_SET_REPLACE)
+		return compat_do_replace(sock_net(sk), user, len);
+
+	if (cmd == ARPT_SO_SET_ADD_COUNTERS)
+		return do_add_counters(sock_net(sk), user, len, 1);
+
+	duprintf("do_arpt_set_ctl:  unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * Write native entry @e to userspace at *dstptr in compat layout, with
+ * counters[i] as its counters.  *dstptr and *size are advanced and the
+ * target_offset/next_offset stored in the user copy are reduced by the
+ * amount the entry shrank.  Returns 0, -EFAULT, or the error from
+ * xt_compat_target_to_user().
+ */
+static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
+				     compat_uint_t *size,
+				     struct xt_counters *counters,
+				     unsigned int i)
+{
+	struct compat_arpt_entry __user *ce;
+	const compat_uint_t origsize = *size;
+	u_int16_t target_offset, next_offset;
+	struct xt_entry_target *t;
+	int ret;
+
+	ce = (struct compat_arpt_entry __user *)*dstptr;
+	if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0)
+		return -EFAULT;
+	if (copy_to_user(&ce->counters, &counters[i],
+			 sizeof(counters[i])) != 0)
+		return -EFAULT;
+
+	*dstptr += sizeof(struct compat_arpt_entry);
+	*size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
+	target_offset = e->target_offset - (origsize - *size);
+
+	t = arpt_get_target(e);
+	ret = xt_compat_target_to_user(t, dstptr, size);
+	if (ret)
+		return ret;
+
+	/* Only now is the full shrinkage of this entry known. */
+	next_offset = e->next_offset - (origsize - *size);
+
+	if (put_user(target_offset, &ce->target_offset) != 0)
+		return -EFAULT;
+	if (put_user(next_offset, &ce->next_offset) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Copy all entries of @table to @userptr in compat layout, together
+ * with a snapshot of their counters.  Returns 0, the alloc_counters()
+ * error, or the first error from compat_copy_entry_to_user().
+ */
+static int compat_copy_entries_to_user(unsigned int total_size,
+				       struct xt_table *table,
+				       void __user *userptr)
+{
+	const struct xt_table_info *private = table->private;
+	struct xt_counters *counters;
+	struct arpt_entry *rule;
+	void __user *pos = userptr;
+	unsigned int size = total_size;
+	unsigned int idx = 0;
+	void *loc_cpu_entry;
+	int ret = 0;
+
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	/* choose the copy on our node/cpu */
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+
+	xt_entry_foreach(rule, loc_cpu_entry, total_size) {
+		ret = compat_copy_entry_to_user(rule, &pos, &size,
+						counters, idx);
+		if (ret)
+			break;
+		idx++;
+	}
+
+	vfree(counters);
+	return ret;
+}
+
+/*
+ * Request header copied from user space by compat_get_entries(); the rule
+ * blob is written back starting at entrytable.
+ */
+struct compat_arpt_get_entries {
+	char name[XT_TABLE_MAXNAMELEN];		/* table to dump */
+	compat_uint_t size;			/* bytes available in entrytable */
+	struct compat_arpt_entry entrytable[0];	/* trailing variable-length data */
+};
+
+/*
+ * Handle ARPT_SO_GET_ENTRIES for compat callers.
+ *
+ * @net:  namespace in which the table is looked up
+ * @uptr: user request; its header names the table and gives the size of the
+ *        entry area that follows it
+ * @len:  total length supplied by the caller; must equal header + get.size
+ *
+ * Returns 0 on success, -EINVAL on a bad length, -EFAULT on a failed user
+ * copy, -EAGAIN when get.size does not match the compat size of the table,
+ * -ENOENT or the lookup error when the table is not found, or an error from
+ * compat_table_info() / compat_copy_entries_to_user().
+ */
+static int compat_get_entries(struct net *net,
+			      struct compat_arpt_get_entries __user *uptr,
+			      int *len)
+{
+	int ret;
+	struct compat_arpt_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get)) {
+		duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+		return -EINVAL;
+	}
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+	if (*len != sizeof(struct compat_arpt_get_entries) + get.size) {
+		duprintf("compat_get_entries: %u != %zu\n",
+			 *len, sizeof(get) + get.size);
+		return -EINVAL;
+	}
+
+	/*
+	 * The name comes straight from user space and is handed on as a C
+	 * string; force termination so the lookup cannot run off the buffer.
+	 */
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	xt_compat_lock(NFPROTO_ARP);
+	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
+	if (t && !IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+		struct xt_table_info info;
+
+		duprintf("t->private->number = %u\n", private->number);
+		ret = compat_table_info(private, &info);
+		if (!ret && get.size == info.size) {
+			ret = compat_copy_entries_to_user(private->size,
+							  t, uptr->entrytable);
+		} else if (!ret) {
+			duprintf("compat_get_entries: I've got %u not %u!\n",
+				 private->size, get.size);
+			ret = -EAGAIN;
+		}
+		xt_compat_flush_offsets(NFPROTO_ARP);
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	xt_compat_unlock(NFPROTO_ARP);
+	return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *, int, void __user *, int *);
+
+/*
+ * Compat getsockopt() dispatcher for arptables.  Requires CAP_NET_ADMIN.
+ * ARPT_SO_GET_INFO and ARPT_SO_GET_ENTRIES take the compat paths; every
+ * other command is passed to the native do_arpt_get_ctl().
+ */
+static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
+				  int *len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == ARPT_SO_GET_INFO)
+		return get_info(sock_net(sk), user, len, 1);
+
+	if (cmd == ARPT_SO_GET_ENTRIES)
+		return compat_get_entries(sock_net(sk), user, len);
+
+	return do_arpt_get_ctl(sk, cmd, user, len);
+}
+#endif
+
+/*
+ * Native setsockopt() dispatcher for arptables.  Requires CAP_NET_ADMIN;
+ * handles table replacement and counter updates, -EINVAL for anything else.
+ */
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == ARPT_SO_SET_REPLACE)
+		return do_replace(sock_net(sk), user, len);
+
+	if (cmd == ARPT_SO_SET_ADD_COUNTERS)
+		return do_add_counters(sock_net(sk), user, len, 0);
+
+	duprintf("do_arpt_set_ctl:  unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * Native getsockopt() dispatcher for arptables.  Requires CAP_NET_ADMIN.
+ *
+ * ARPT_SO_GET_INFO and ARPT_SO_GET_ENTRIES are forwarded to their helpers.
+ * ARPT_SO_GET_REVISION_TARGET expects exactly one struct xt_get_revision,
+ * looks the target revision up (requesting module "arpt_<name>" if the
+ * lookup fails) and returns the result stored by xt_find_revision().
+ * Unknown commands yield -EINVAL.
+ */
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	struct xt_get_revision rev;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == ARPT_SO_GET_INFO)
+		return get_info(sock_net(sk), user, len, 0);
+
+	if (cmd == ARPT_SO_GET_ENTRIES)
+		return get_entries(sock_net(sk), user, len);
+
+	if (cmd != ARPT_SO_GET_REVISION_TARGET) {
+		duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
+		return -EINVAL;
+	}
+
+	if (*len != sizeof(rev))
+		return -EINVAL;
+	if (copy_from_user(&rev, user, sizeof(rev)) != 0)
+		return -EFAULT;
+	/* user-supplied name: force termination before using it as a string */
+	rev.name[sizeof(rev.name)-1] = 0;
+
+	try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
+						 rev.revision, 1, &ret),
+				"arpt_%s", rev.name);
+	return ret;
+}
+
+/*
+ * Build and register an ARP table in @net from the initial ruleset @repl.
+ *
+ * A table-info blob of repl->size bytes is allocated, the entries are copied
+ * into this CPU's slot and passed through translate_table(), and the result
+ * is handed to xt_register_table().
+ *
+ * Returns the registered table, or ERR_PTR(-ENOMEM) / ERR_PTR(error) on
+ * failure; the table info is freed on the later error paths.
+ */
+struct xt_table *arpt_register_table(struct net *net,
+				     const struct xt_table *table,
+				     const struct arpt_replace *repl)
+{
+	int ret;
+	struct xt_table_info *newinfo;
+	struct xt_table_info bootstrap = {0};
+	void *loc_cpu_entry;
+	struct xt_table *new_table;
+
+	newinfo = xt_alloc_table_info(repl->size);
+	if (!newinfo) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* choose the copy on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+	ret = translate_table(newinfo, loc_cpu_entry, repl);
+	duprintf("arpt_register_table: translate table gives %d\n", ret);
+	if (ret != 0)
+		goto out_free;
+
+	new_table = xt_register_table(net, table, &bootstrap, newinfo);
+	if (IS_ERR(new_table)) {
+		ret = PTR_ERR(new_table);
+		goto out_free;
+	}
+	return new_table;
+
+out_free:
+	xt_free_table_info(newinfo);
+out:
+	return ERR_PTR(ret);
+}
+
+/*
+ * Unregister @table and release what hangs off it: every entry goes through
+ * cleanup_entry(), one reference on the owning module is dropped when the
+ * table holds more entries than its initial set, and the table info is freed.
+ */
+void arpt_unregister_table(struct xt_table *table)
+{
+	struct xt_table_info *private;
+	void *loc_cpu_entry;
+	/* read before unregistering; presumably @table is not usable afterwards */
+	struct module *table_owner = table->me;
+	struct arpt_entry *iter;
+
+	private = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	xt_entry_foreach(iter, loc_cpu_entry, private->size)
+		cleanup_entry(iter);
+	if (private->number > private->initial_entries)
+		module_put(table_owner);
+	xt_free_table_info(private);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+/* Registered in arp_tables_init() and unregistered in arp_tables_fini(). */
+static struct xt_target arpt_builtin_tg[] __read_mostly = {
+	{
+		/* standard target: payload is a single int */
+		.name             = XT_STANDARD_TARGET,
+		.targetsize       = sizeof(int),
+		.family           = NFPROTO_ARP,
+#ifdef CONFIG_COMPAT
+		.compatsize       = sizeof(compat_int_t),
+		.compat_from_user = compat_standard_from_user,
+		.compat_to_user   = compat_standard_to_user,
+#endif
+	},
+	{
+		/* error target: payload is a name of XT_FUNCTION_MAXNAMELEN bytes */
+		.name             = XT_ERROR_TARGET,
+		.target           = arpt_error,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
+		.family           = NFPROTO_ARP,
+	},
+};
+
+/*
+ * Socket option hooks for arptables: set/get handlers for the option ranges
+ * starting at ARPT_BASE_CTL, plus compat variants when CONFIG_COMPAT is set.
+ */
+static struct nf_sockopt_ops arpt_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= ARPT_BASE_CTL,
+	.set_optmax	= ARPT_SO_SET_MAX+1,
+	.set		= do_arpt_set_ctl,
+#ifdef CONFIG_COMPAT
+	.compat_set	= compat_do_arpt_set_ctl,
+#endif
+	.get_optmin	= ARPT_BASE_CTL,
+	.get_optmax	= ARPT_SO_GET_MAX+1,
+	.get		= do_arpt_get_ctl,
+#ifdef CONFIG_COMPAT
+	.compat_get	= compat_do_arpt_get_ctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+/* Per-namespace setup: initialise x_tables state for NFPROTO_ARP in @net. */
+static int __net_init arp_tables_net_init(struct net *net)
+{
+	return xt_proto_init(net, NFPROTO_ARP);
+}
+
+/* Per-namespace teardown: undo arp_tables_net_init() for @net. */
+static void __net_exit arp_tables_net_exit(struct net *net)
+{
+	xt_proto_fini(net, NFPROTO_ARP);
+}
+
+/* Per-namespace callbacks registered by arp_tables_init(). */
+static struct pernet_operations arp_tables_net_ops = {
+	.init = arp_tables_net_init,
+	.exit = arp_tables_net_exit,
+};
+
+/*
+ * Module init: register the per-namespace ops, the built-in targets and the
+ * sockopt interface, in that order.  On failure everything registered so far
+ * is unwound in reverse and the error is returned.
+ */
+static int __init arp_tables_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&arp_tables_net_ops);
+	if (err < 0)
+		return err;
+
+	/* No one else will be downing sem now, so we won't sleep */
+	err = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+	if (err < 0)
+		goto undo_pernet;
+
+	/* Register setsockopt */
+	err = nf_register_sockopt(&arpt_sockopts);
+	if (err < 0)
+		goto undo_targets;
+
+	printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n");
+	return 0;
+
+undo_targets:
+	xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+undo_pernet:
+	unregister_pernet_subsys(&arp_tables_net_ops);
+	return err;
+}
+
+/* Module exit: undo arp_tables_init() in reverse registration order. */
+static void __exit arp_tables_fini(void)
+{
+	nf_unregister_sockopt(&arpt_sockopts);
+	xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
+	unregister_pernet_subsys(&arp_tables_net_ops);
+}
+
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+
+module_init(arp_tables_init);
+module_exit(arp_tables_fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 00000000..a5e52a9f
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,91 @@
+/* module that allows mangling of the arp payload */
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp/arpt_mangle.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("arptables arp payload mangle target");
+
+/*
+ * Target handler: overwrite selected ARP payload addresses with the values
+ * configured in struct arpt_mangle, then return the configured verdict.
+ *
+ * The payload after the ARP header is walked in order: source device
+ * address, source IP, target device address, target IP.  A field is
+ * rewritten only when its ARPT_MANGLE_* flag is set.  NF_DROP is returned if
+ * the skb cannot be made writable, or if a field about to be rewritten is
+ * longer than the allowed maximum or would extend past the skb tail.
+ */
+static unsigned int
+target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct arpt_mangle *mangle = par->targinfo;
+	const struct arphdr *arp;
+	unsigned char *arpptr;
+	int pln, hln;
+
+	if (!skb_make_writable(skb, skb->len))
+		return NF_DROP;
+
+	/* header pointers are taken only after the skb was made writable */
+	arp = arp_hdr(skb);
+	arpptr = skb_network_header(skb) + sizeof(*arp);
+	pln = arp->ar_pln;
+	hln = arp->ar_hln;
+	/* We assume that pln and hln were checked in the match */
+	if (mangle->flags & ARPT_MANGLE_SDEV) {
+		if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+		   (arpptr + hln > skb_tail_pointer(skb)))
+			return NF_DROP;
+		memcpy(arpptr, mangle->src_devaddr, hln);
+	}
+	/* the cursor advances over each field whether or not it was rewritten */
+	arpptr += hln;
+	if (mangle->flags & ARPT_MANGLE_SIP) {
+		if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+		   (arpptr + pln > skb_tail_pointer(skb)))
+			return NF_DROP;
+		memcpy(arpptr, &mangle->u_s.src_ip, pln);
+	}
+	arpptr += pln;
+	if (mangle->flags & ARPT_MANGLE_TDEV) {
+		if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+		   (arpptr + hln > skb_tail_pointer(skb)))
+			return NF_DROP;
+		memcpy(arpptr, mangle->tgt_devaddr, hln);
+	}
+	arpptr += hln;
+	if (mangle->flags & ARPT_MANGLE_TIP) {
+		if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+		   (arpptr + pln > skb_tail_pointer(skb)))
+			return NF_DROP;
+		memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
+	}
+	return mangle->target;
+}
+
+/*
+ * Validate a mangle rule: at least one known ARPT_MANGLE_* flag must be set
+ * and no unknown ones, and the follow-up verdict must be NF_DROP, NF_ACCEPT
+ * or XT_CONTINUE.  Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int checkentry(const struct xt_tgchk_param *par)
+{
+	const struct arpt_mangle *info = par->targinfo;
+
+	if (!(info->flags & ARPT_MANGLE_MASK) ||
+	    (info->flags & ~ARPT_MANGLE_MASK))
+		return -EINVAL;
+
+	if (info->target == NF_DROP || info->target == NF_ACCEPT ||
+	    info->target == XT_CONTINUE)
+		return 0;
+
+	return -EINVAL;
+}
+
+/* Registration record for the "mangle" target in the ARP family. */
+static struct xt_target arpt_mangle_reg __read_mostly = {
+	.name		= "mangle",
+	.family		= NFPROTO_ARP,
+	.target		= target,
+	.targetsize	= sizeof(struct arpt_mangle),
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the mangle target; returns the registration result. */
+static int __init arpt_mangle_init(void)
+{
+	return xt_register_target(&arpt_mangle_reg);
+}
+
+/* Module exit: unregister the mangle target. */
+static void __exit arpt_mangle_fini(void)
+{
+	xt_unregister_target(&arpt_mangle_reg);
+}
+
+module_init(arpt_mangle_init);
+module_exit(arpt_mangle_fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 00000000..79ca5e70
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,93 @@
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
+			   (1 << NF_ARP_FORWARD))
+
+/* Template for the ARP "filter" table, valid on the IN, OUT and FORWARD hooks. */
+static const struct xt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_ARP,
+	.priority	= NF_IP_PRI_FILTER,
+};
+
+/* The work comes in here from netfilter.c */
+static unsigned int
+arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	/* the namespace comes from the input device, else the output device */
+	const struct net_device *dev = in;
+	const struct net *net;
+
+	if (dev == NULL)
+		dev = out;
+	net = dev_net(dev);
+
+	return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
+}
+
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
+
+/*
+ * Per-namespace setup: build the initial ruleset for the filter table,
+ * register the table in @net and free the temporary ruleset.  Returns 0,
+ * -ENOMEM if the ruleset could not be allocated, or the registration error.
+ */
+static int __net_init arptable_filter_net_init(struct net *net)
+{
+	struct arpt_replace *initial;
+
+	initial = arpt_alloc_initial_table(&packet_filter);
+	if (!initial)
+		return -ENOMEM;
+
+	net->ipv4.arptable_filter =
+		arpt_register_table(net, &packet_filter, initial);
+	kfree(initial);
+
+	if (!IS_ERR(net->ipv4.arptable_filter))
+		return 0;
+	return PTR_ERR(net->ipv4.arptable_filter);
+}
+
+/* Per-namespace teardown: unregister this namespace's filter table. */
+static void __net_exit arptable_filter_net_exit(struct net *net)
+{
+	arpt_unregister_table(net->ipv4.arptable_filter);
+}
+
+/* Per-namespace callbacks registered by arptable_filter_init(). */
+static struct pernet_operations arptable_filter_net_ops = {
+	.init = arptable_filter_net_init,
+	.exit = arptable_filter_net_exit,
+};
+
+/*
+ * Module init: register the per-namespace ops, then link the hook function
+ * for the filter table.  If linking fails the per-namespace ops are
+ * unregistered again and the error is returned.
+ */
+static int __init arptable_filter_init(void)
+{
+	int err = register_pernet_subsys(&arptable_filter_net_ops);
+
+	if (err < 0)
+		return err;
+
+	arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
+	if (!IS_ERR(arpfilter_ops))
+		return err;
+
+	err = PTR_ERR(arpfilter_ops);
+	unregister_pernet_subsys(&arptable_filter_net_ops);
+	return err;
+}
+
+/* Module exit: unlink the hooks, then drop the per-namespace ops. */
+static void __exit arptable_filter_fini(void)
+{
+	xt_hook_unlink(&packet_filter, arpfilter_ops);
+	unregister_pernet_subsys(&arptable_filter_net_ops);
+}
+
+module_init(arptable_filter_init);
+module_exit(arptable_filter_fini);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
new file mode 100644
index 00000000..5c9b9d96
--- /dev/null
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -0,0 +1,639 @@
+/*
+ * This is a module which is used for queueing IPv4 packets and
+ * communicating with userspace via netlink.
+ *
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_queue.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/security.h>
+#include <linux/net.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/ip.h>
+
+#define IPQ_QMAX_DEFAULT 1024
+#define IPQ_PROC_FS_NAME "ip_queue"
+#define NET_IPQ_QMAX 2088
+#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
+
+typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
+
+static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
+static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
+static DEFINE_SPINLOCK(queue_lock);
+static int peer_pid __read_mostly;
+static unsigned int copy_range __read_mostly;
+static unsigned int queue_total;
+static unsigned int queue_dropped = 0;
+static unsigned int queue_user_dropped = 0;
+static struct sock *ipqnl __read_mostly;
+static LIST_HEAD(queue_list);
+static DEFINE_MUTEX(ipqnl_mutex);
+
+/*
+ * Append @entry to queue_list and bump queue_total.  The visible caller,
+ * ipq_enqueue_packet(), invokes this with queue_lock held.
+ */
+static inline void
+__ipq_enqueue_entry(struct nf_queue_entry *entry)
+{
+       list_add_tail(&entry->list, &queue_list);
+       queue_total++;
+}
+
+/*
+ * Set the copy mode and range.  IPQ_COPY_NONE and IPQ_COPY_META reset the
+ * range to 0; IPQ_COPY_PACKET stores @range clamped to 0xFFFF.  Returns 0,
+ * or -EINVAL for an unknown mode (leaving the settings untouched).
+ */
+static inline int
+__ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	if (mode == IPQ_COPY_NONE || mode == IPQ_COPY_META) {
+		copy_mode = mode;
+		copy_range = 0;
+		return 0;
+	}
+
+	if (mode == IPQ_COPY_PACKET) {
+		copy_range = (range > 0xFFFF) ? 0xFFFF : range;
+		copy_mode = mode;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
+
+/*
+ * Forget the current peer: clear peer_pid, drop the timestamp request,
+ * return to IPQ_COPY_NONE and flush every queued entry.  The visible caller,
+ * ipq_rcv_nl_event(), invokes this with queue_lock held.
+ */
+static inline void
+__ipq_reset(void)
+{
+	peer_pid = 0;
+	net_disable_timestamp();
+	__ipq_set_mode(IPQ_COPY_NONE, 0);
+	__ipq_flush(NULL, 0);
+}
+
+/*
+ * Look up the queued entry whose address equals @id and unlink it from
+ * queue_list under queue_lock.  Returns the entry, or NULL if none matches.
+ */
+static struct nf_queue_entry *
+ipq_find_dequeue_entry(unsigned long id)
+{
+	struct nf_queue_entry *cur, *found = NULL;
+
+	spin_lock_bh(&queue_lock);
+
+	list_for_each_entry(cur, &queue_list, list) {
+		if ((unsigned long)cur != id)
+			continue;
+
+		/* unlink and stop walking straight away */
+		list_del(&cur->list);
+		queue_total--;
+		found = cur;
+		break;
+	}
+
+	spin_unlock_bh(&queue_lock);
+	return found;
+}
+
+/*
+ * Drop queued entries.  With @cmpfn NULL every entry is dropped; otherwise
+ * only those for which cmpfn(entry, data) is non-zero.  Each dropped entry
+ * is unlinked and reinjected with NF_DROP.  Does no locking itself.
+ */
+static void
+__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct nf_queue_entry *entry, *next;
+
+	/* _safe variant: entries are removed while iterating */
+	list_for_each_entry_safe(entry, next, &queue_list, list) {
+		if (!cmpfn || cmpfn(entry, data)) {
+			list_del(&entry->list);
+			queue_total--;
+			nf_reinject(entry, NF_DROP);
+		}
+	}
+}
+
+/* Locked wrapper: run __ipq_flush() with queue_lock held. */
+static void
+ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
+{
+	spin_lock_bh(&queue_lock);
+	__ipq_flush(cmpfn, data);
+	spin_unlock_bh(&queue_lock);
+}
+
+/*
+ * Build the netlink IPQM_PACKET message that describes a queued packet.
+ *
+ * The message carries a struct ipq_packet_msg and, in IPQ_COPY_PACKET mode,
+ * up to copy_range bytes of packet data (the whole packet when copy_range is
+ * 0 or exceeds the packet length).
+ *
+ * Returns the new skb, or NULL with *errp set: the skb_checksum_help()
+ * error, or -EINVAL for an unknown copy mode and for the nlmsg_failure path
+ * (which is also where a failed allocation ends up).
+ */
+static struct sk_buff *
+ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
+{
+	sk_buff_data_t old_tail;
+	size_t size = 0;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct ipq_packet_msg *pmsg;
+	struct nlmsghdr *nlh;
+	struct timeval tv;
+
+	/* copy_mode/copy_range are sampled once each; no lock is taken here */
+	switch (ACCESS_ONCE(copy_mode)) {
+	case IPQ_COPY_META:
+	case IPQ_COPY_NONE:
+		size = NLMSG_SPACE(sizeof(*pmsg));
+		break;
+
+	case IPQ_COPY_PACKET:
+		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
+		    (*errp = skb_checksum_help(entry->skb)))
+			return NULL;
+
+		data_len = ACCESS_ONCE(copy_range);
+		if (data_len == 0 || data_len > entry->skb->len)
+			data_len = entry->skb->len;
+
+		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
+		break;
+
+	default:
+		*errp = -EINVAL;
+		return NULL;
+	}
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+
+	old_tail = skb->tail;
+	/* NOTE(review): NLMSG_PUT presumably jumps to nlmsg_failure when the
+	 * message does not fit - confirm against linux/netlink.h */
+	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
+	pmsg = NLMSG_DATA(nlh);
+	memset(pmsg, 0, sizeof(*pmsg));
+
+	/* the id is the entry's address; ipq_find_dequeue_entry() matches on it */
+	pmsg->packet_id       = (unsigned long )entry;
+	pmsg->data_len        = data_len;
+	tv = ktime_to_timeval(entry->skb->tstamp);
+	pmsg->timestamp_sec   = tv.tv_sec;
+	pmsg->timestamp_usec  = tv.tv_usec;
+	pmsg->mark            = entry->skb->mark;
+	pmsg->hook            = entry->hook;
+	pmsg->hw_protocol     = entry->skb->protocol;
+
+	if (entry->indev)
+		strcpy(pmsg->indev_name, entry->indev->name);
+	else
+		pmsg->indev_name[0] = '\0';
+
+	if (entry->outdev)
+		strcpy(pmsg->outdev_name, entry->outdev->name);
+	else
+		pmsg->outdev_name[0] = '\0';
+
+	/* hardware address only when a separate MAC header is present */
+	if (entry->indev && entry->skb->dev &&
+	    entry->skb->mac_header != entry->skb->network_header) {
+		pmsg->hw_type = entry->skb->dev->type;
+		pmsg->hw_addrlen = dev_parse_header(entry->skb,
+						    pmsg->hw_addr);
+	}
+
+	if (data_len)
+		if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
+			BUG();
+
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+	*errp = -EINVAL;
+	printk(KERN_ERR "ip_queue: error creating packet message\n");
+	return NULL;
+}
+
+/*
+ * Queue handler (nfqh.outfn): send a packet description to the user-space
+ * peer and, if that succeeds, keep @entry on queue_list until a verdict
+ * arrives.
+ *
+ * Returns the netlink_unicast() result on success.  Errors: -EAGAIN when
+ * copy mode is IPQ_COPY_NONE, the message-building error, -EINVAL when no
+ * peer is bound, -ENOSPC when the queue is full, or the netlink_unicast()
+ * error.
+ */
+static int
+ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+	int status = -EINVAL;
+	struct sk_buff *nskb;
+
+	if (copy_mode == IPQ_COPY_NONE)
+		return -EAGAIN;
+
+	/* built before taking queue_lock */
+	nskb = ipq_build_packet_message(entry, &status);
+	if (nskb == NULL)
+		return status;
+
+	spin_lock_bh(&queue_lock);
+
+	/* status is still -EINVAL here unless the builder changed it */
+	if (!peer_pid)
+		goto err_out_free_nskb;
+
+	if (queue_total >= queue_maxlen) {
+		queue_dropped++;
+		status = -ENOSPC;
+		if (net_ratelimit())
+			  printk (KERN_WARNING "ip_queue: full at %d entries, "
+				  "dropping packets(s). Dropped: %d\n", queue_total,
+				  queue_dropped);
+		goto err_out_free_nskb;
+	}
+
+	/* netlink_unicast will either free the nskb or attach it to a socket */
+	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+		queue_user_dropped++;
+		goto err_out_unlock;
+	}
+
+	__ipq_enqueue_entry(entry);
+
+	spin_unlock_bh(&queue_lock);
+	return status;
+
+err_out_free_nskb:
+	kfree_skb(nskb);
+
+err_out_unlock:
+	spin_unlock_bh(&queue_lock);
+	return status;
+}
+
+/*
+ * Replace the queued packet's data with the payload supplied in verdict @v.
+ *
+ * Payloads shorter than an IP header are ignored (returns 0, packet left
+ * untouched).  Otherwise the skb is trimmed or grown to v->data_len,
+ * reallocating when the tailroom is too small, and the payload is copied in.
+ *
+ * Returns 0 on success, -EINVAL if a grown packet would exceed 0xFFFF bytes,
+ * or -ENOMEM if trimming, reallocation or making the skb writable fails.
+ */
+static int
+ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
+{
+	int diff;
+	struct iphdr *user_iph = (struct iphdr *)v->payload;
+	struct sk_buff *nskb;
+
+	if (v->data_len < sizeof(*user_iph))
+		return 0;
+	diff = v->data_len - e->skb->len;
+	if (diff < 0) {
+		if (pskb_trim(e->skb, v->data_len))
+			return -ENOMEM;
+	} else if (diff > 0) {
+		if (v->data_len > 0xFFFF)
+			return -EINVAL;
+		if (diff > skb_tailroom(e->skb)) {
+			/* not enough room: copy into a larger skb, swap it in */
+			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
+					       diff, GFP_ATOMIC);
+			if (!nskb) {
+				printk(KERN_WARNING "ip_queue: error "
+				      "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			kfree_skb(e->skb);
+			e->skb = nskb;
+		}
+		skb_put(e->skb, diff);
+	}
+	if (!skb_make_writable(e->skb, v->data_len))
+		return -ENOMEM;
+	skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
+	/* contents changed, so any earlier checksum state no longer applies */
+	e->skb->ip_summed = CHECKSUM_NONE;
+
+	return 0;
+}
+
+/*
+ * Apply a user-space verdict: dequeue the entry named by vmsg->id,
+ * optionally replace its payload, and reinject it.  A failed payload
+ * replacement turns the verdict into NF_DROP.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range verdict, or -ENOENT
+ * when no queued entry matches the id.
+ */
+static int
+ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
+{
+	struct nf_queue_entry *entry;
+	int verdict;
+
+	if (vmsg->value > NF_MAX_VERDICT)
+		return -EINVAL;
+
+	entry = ipq_find_dequeue_entry(vmsg->id);
+	if (!entry)
+		return -ENOENT;
+
+	verdict = vmsg->value;
+
+	/* payload is used only when its stated length matches what arrived */
+	if (vmsg->data_len && vmsg->data_len == len &&
+	    ipq_mangle_ipv4(vmsg, entry) < 0)
+		verdict = NF_DROP;
+
+	nf_reinject(entry, verdict);
+	return 0;
+}
+
+/* Locked wrapper: run __ipq_set_mode() with queue_lock held. */
+static int
+ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	int status;
+
+	spin_lock_bh(&queue_lock);
+	status = __ipq_set_mode(mode, range);
+	spin_unlock_bh(&queue_lock);
+	return status;
+}
+
+/*
+ * Dispatch one message from the user-space peer.  @len is the payload
+ * length and must cover a struct ipq_peer_msg.  IPQM_MODE changes the copy
+ * mode, IPQM_VERDICT delivers a verdict (any bytes beyond the fixed message
+ * are passed on as the replacement payload length); other types are
+ * rejected with -EINVAL.
+ */
+static int
+ipq_receive_peer(struct ipq_peer_msg *pmsg,
+		 unsigned char type, unsigned int len)
+{
+	if (len < sizeof(*pmsg))
+		return -EINVAL;
+
+	if (type == IPQM_MODE)
+		return ipq_set_mode(pmsg->msg.mode.value,
+				    pmsg->msg.mode.range);
+
+	if (type == IPQM_VERDICT) {
+		if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
+			return -EINVAL;
+
+		return ipq_set_verdict(&pmsg->msg.verdict,
+				       len - sizeof(*pmsg));
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * ipq_cmpfn callback: returns 1 when @entry references the device with
+ * index @ifindex as input or output device (or, with bridge netfilter, as
+ * physical input/output device), 0 otherwise.
+ */
+static int
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->indev && entry->indev->ifindex == ifindex)
+		return 1;
+
+	if (entry->outdev && entry->outdev->ifindex == ifindex)
+		return 1;
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (entry->skb->nf_bridge) {
+		if (entry->skb->nf_bridge->physindev &&
+		    entry->skb->nf_bridge->physindev->ifindex == ifindex)
+			return 1;
+
+		if (entry->skb->nf_bridge->physoutdev &&
+		    entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
+			return 1;
+	}
+#endif
+	return 0;
+}
+
+/* Drop every queued entry that references the device with index @ifindex. */
+static void
+ipq_dev_drop(int ifindex)
+{
+	ipq_flush(dev_cmp, ifindex);
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/*
+ * Validate and process one netlink message from user space.
+ *
+ * Messages that are too short or whose stated length is inconsistent are
+ * silently ignored.  Other rejections are reported with netlink_ack() via
+ * RCV_SKB_FAIL, which also returns from this function.  The first accepted
+ * sender becomes the peer (peer_pid); messages from any other pid get
+ * -EBUSY while a peer is bound.
+ */
+static inline void
+__ipq_rcv_skb(struct sk_buff *skb)
+{
+	int status, type, pid, flags;
+	unsigned int nlmsglen, skblen;
+	struct nlmsghdr *nlh;
+
+	skblen = skb->len;
+	if (skblen < sizeof(*nlh))
+		return;
+
+	nlh = nlmsg_hdr(skb);
+	nlmsglen = nlh->nlmsg_len;
+	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
+		return;
+
+	pid = nlh->nlmsg_pid;
+	flags = nlh->nlmsg_flags;
+
+	if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
+		RCV_SKB_FAIL(-EINVAL);
+
+	if (flags & MSG_TRUNC)
+		RCV_SKB_FAIL(-ECOMM);
+
+	type = nlh->nlmsg_type;
+	if (type < NLMSG_NOOP || type >= IPQM_MAX)
+		RCV_SKB_FAIL(-EINVAL);
+
+	/* types up to IPQM_BASE are not ip_queue requests: ignore quietly */
+	if (type <= IPQM_BASE)
+		return;
+
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		RCV_SKB_FAIL(-EPERM);
+
+	spin_lock_bh(&queue_lock);
+
+	if (peer_pid) {
+		if (peer_pid != pid) {
+			/* unlock first: RCV_SKB_FAIL returns from the function */
+			spin_unlock_bh(&queue_lock);
+			RCV_SKB_FAIL(-EBUSY);
+		}
+	} else {
+		/* first sender becomes the peer */
+		net_enable_timestamp();
+		peer_pid = pid;
+	}
+
+	spin_unlock_bh(&queue_lock);
+
+	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
+				  nlmsglen - NLMSG_LENGTH(0));
+	if (status < 0)
+		RCV_SKB_FAIL(status);
+
+	if (flags & NLM_F_ACK)
+		netlink_ack(skb, nlh, 0);
+}
+
+/* Netlink input callback: process @skb with ipqnl_mutex held. */
+static void
+ipq_rcv_skb(struct sk_buff *skb)
+{
+	mutex_lock(&ipqnl_mutex);
+	__ipq_rcv_skb(skb);
+	mutex_unlock(&ipqnl_mutex);
+}
+
+/*
+ * Netdevice notifier: when a device in init_net goes down, drop every
+ * queued packet that references it.  Always returns NOTIFY_DONE.
+ */
+static int
+ipq_rcv_dev_event(struct notifier_block *this,
+		  unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	/* Drop any packets associated with the downed device */
+	if (net_eq(dev_net(dev), &init_net) && event == NETDEV_DOWN)
+		ipq_dev_drop(dev->ifindex);
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier block; registered in ip_queue_init(). */
+static struct notifier_block ipq_dev_notifier = {
+	.notifier_call	= ipq_rcv_dev_event,
+};
+
+/*
+ * Netlink notifier: when the NETLINK_FIREWALL socket of the current peer
+ * in init_net is released, reset the queue state.  Always returns
+ * NOTIFY_DONE.
+ */
+static int
+ipq_rcv_nl_event(struct notifier_block *this,
+		 unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event != NETLINK_URELEASE || n->protocol != NETLINK_FIREWALL)
+		return NOTIFY_DONE;
+
+	spin_lock_bh(&queue_lock);
+	if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
+		__ipq_reset();
+	spin_unlock_bh(&queue_lock);
+
+	return NOTIFY_DONE;
+}
+
+/* Netlink notifier block; registered in ip_queue_init(). */
+static struct notifier_block ipq_nl_notifier = {
+	.notifier_call	= ipq_rcv_nl_event,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *ipq_sysctl_header;
+
+/*
+ * sysctl "ip_queue_maxlen": exposes queue_maxlen read/write (mode 0644).
+ * NOTE(review): queue_maxlen is unsigned int but is handled by
+ * proc_dointvec - confirm whether negative input should be rejected.
+ */
+static ctl_table ipq_table[] = {
+	{
+		.procname	= NET_IPQ_QMAX_NAME,
+		.data		= &queue_maxlen,
+		.maxlen		= sizeof(queue_maxlen),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }
+};
+#endif
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file show callback for the proc entry: prints the peer pid, copy
+ * settings, queue length/limit and both drop counters, read under
+ * queue_lock.
+ */
+static int ip_queue_show(struct seq_file *m, void *v)
+{
+	spin_lock_bh(&queue_lock);
+
+	seq_printf(m,
+		      "Peer PID          : %d\n"
+		      "Copy mode         : %hu\n"
+		      "Copy range        : %u\n"
+		      "Queue length      : %u\n"
+		      "Queue max. length : %u\n"
+		      "Queue dropped     : %u\n"
+		      "Netlink dropped   : %u\n",
+		      peer_pid,
+		      copy_mode,
+		      copy_range,
+		      queue_total,
+		      queue_maxlen,
+		      queue_dropped,
+		      queue_user_dropped);
+
+	spin_unlock_bh(&queue_lock);
+	return 0;
+}
+
+/* open() for the proc entry: single-record seq_file using ip_queue_show(). */
+static int ip_queue_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ip_queue_show, NULL);
+}
+
+/* File operations for the proc entry created in ip_queue_init(). */
+static const struct file_operations ip_queue_proc_fops = {
+	.open		= ip_queue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.owner		= THIS_MODULE,
+};
+#endif
+
+/* Queue handler registered for NFPROTO_IPV4 in ip_queue_init(). */
+static const struct nf_queue_handler nfqh = {
+	.name	= "ip_queue",
+	.outfn	= &ipq_enqueue_packet,
+};
+
+/*
+ * Module init: register the netlink notifier, create the NETLINK_FIREWALL
+ * kernel socket, the proc entry, the netdevice notifier and the sysctl, and
+ * finally register the IPv4 queue handler.
+ *
+ * Returns the nf_register_queue_handler() result on success.  If the socket
+ * or proc entry cannot be created, -ENOMEM is returned; if handler
+ * registration fails, its error is returned.  Earlier steps are undone on
+ * every failure path.
+ */
+static int __init ip_queue_init(void)
+{
+	int status = -ENOMEM;
+	struct proc_dir_entry *proc __maybe_unused;
+
+	netlink_register_notifier(&ipq_nl_notifier);
+	ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
+				      ipq_rcv_skb, NULL, THIS_MODULE);
+	if (ipqnl == NULL) {
+		printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
+			   &ip_queue_proc_fops);
+	if (!proc) {
+		printk(KERN_ERR "ip_queue: failed to create proc entry\n");
+		goto cleanup_ipqnl;
+	}
+#endif
+	register_netdevice_notifier(&ipq_dev_notifier);
+#ifdef CONFIG_SYSCTL
+	/* the returned header is not checked here */
+	ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table);
+#endif
+	status = nf_register_queue_handler(NFPROTO_IPV4, &nfqh);
+	if (status < 0) {
+		printk(KERN_ERR "ip_queue: failed to register queue handler\n");
+		goto cleanup_sysctl;
+	}
+	return status;
+
+cleanup_sysctl:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(ipq_sysctl_header);
+#endif
+	unregister_netdevice_notifier(&ipq_dev_notifier);
+	proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
+cleanup_ipqnl: __maybe_unused
+	netlink_kernel_release(ipqnl);
+	/* empty lock/unlock pair: presumably waits for an ipq_rcv_skb() still
+	 * inside ipqnl_mutex to finish - TODO confirm */
+	mutex_lock(&ipqnl_mutex);
+	mutex_unlock(&ipqnl_mutex);
+
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&ipq_nl_notifier);
+	return status;
+}
+
+/*
+ * Module exit: unregister the queue handler first, flush whatever is still
+ * queued, then remove the sysctl, netdevice notifier, proc entry, netlink
+ * socket and netlink notifier.
+ */
+static void __exit ip_queue_fini(void)
+{
+	nf_unregister_queue_handlers(&nfqh);
+
+	ipq_flush(NULL, 0);
+
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(ipq_sysctl_header);
+#endif
+	unregister_netdevice_notifier(&ipq_dev_notifier);
+	proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
+
+	netlink_kernel_release(ipqnl);
+	/* empty lock/unlock pair: presumably waits for an ipq_rcv_skb() still
+	 * inside ipqnl_mutex to finish - TODO confirm */
+	mutex_lock(&ipqnl_mutex);
+	mutex_unlock(&ipqnl_mutex);
+
+	netlink_unregister_notifier(&ipq_nl_notifier);
+}
+
+MODULE_DESCRIPTION("IPv4 packet queue handler");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_FIREWALL);
+
+module_init(ip_queue_init);
+module_exit(ip_queue_fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 00000000..24e556e8
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,2271 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cache.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/icmp.h>
+#include <net/ip.h>
+#include <net/compat.h>
+#include <asm/uaccess.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
+
+/* Module metadata for the IPv4 packet filter core. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv4 packet filter");
+
+/* Compile-time debug switches; all are disabled (commented out) by default. */
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+/* dprintf(): debug output, compiled away unless DEBUG_IP_FIREWALL is set. */
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) pr_info(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+/* duprintf(): debug output, compiled away unless DEBUG_IP_FIREWALL_USER
+ * is set. */
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) pr_info(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* IP_NF_ASSERT(): warns when x is false, but only if CONFIG_NETFILTER_DEBUG
+ * is enabled; otherwise the argument is not evaluated at all. */
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x)		WARN_ON(!(x))
+#else
+#define IP_NF_ASSERT(x)
+#endif
+
+/* Disabled block: when enabled it defines away 'static' and 'inline'. */
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+/*
+ * Exported (GPL) wrapper around xt_alloc_initial_table(ipt, IPT).
+ *
+ * NOTE(review): 'info' is not passed explicitly; xt_alloc_initial_table
+ * looks like a macro (see the xt_repldata.h include above) that picks the
+ * parameter up by name -- confirm before renaming it.
+ */
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+	return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
+/*
+ * Returns true if the packet matches the IP part of a rule, false otherwise.
+ * Performance critical - called for every packet.
+ *
+ * @ip:      IPv4 header of the packet
+ * @indev:   name of the input device
+ * @outdev:  name of the output device
+ * @ipinfo:  the rule's address/interface/protocol/fragment criteria
+ * @isfrag:  non-zero if the packet is to be treated as a fragment
+ *
+ * Each test can be inverted by the corresponding IPT_INV_* bit in
+ * ipinfo->invflags.
+ */
+static inline bool
+ip_packet_match(const struct iphdr *ip,
+		const char *indev,
+		const char *outdev,
+		const struct ipt_ip *ipinfo,
+		int isfrag)
+{
+	unsigned long ret;
+
+/* XOR a test result with the rule's inversion flag.  The macro stays
+ * defined past the end of this function; it is #undef'd further down. */
+#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
+
+	/* Masked source and destination addresses must both match. */
+	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+		  IPT_INV_SRCIP) ||
+	    FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+		  IPT_INV_DSTIP)) {
+		dprintf("Source or dest mismatch.\n");
+
+		dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
+			&ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr,
+			ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+		dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n",
+			&ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr,
+			ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+		return false;
+	}
+
+	/* A non-zero result is treated as an interface name mismatch. */
+	ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
+
+	if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, ipinfo->iniface,
+			ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+		return false;
+	}
+
+	ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
+
+	if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, ipinfo->outiface,
+			ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+		return false;
+	}
+
+	/* Check specific protocol; a rule protocol of 0 skips the test. */
+	if (ipinfo->proto &&
+	    FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+		dprintf("Packet protocol %hi does not match %hi.%s\n",
+			ip->protocol, ipinfo->proto,
+			ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+		return false;
+	}
+
+	/* If we have a fragment rule but the packet is not a fragment
+	 * then we return false */
+	if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+		dprintf("Fragment rule but not fragment.%s\n",
+			ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Validate the flag words of a rule's IP part.
+ * Returns false if either 'flags' or 'invflags' carries a bit outside
+ * IPT_F_MASK / IPT_INV_MASK respectively, true otherwise.
+ */
+static bool
+ip_checkentry(const struct ipt_ip *ip)
+{
+	if ((ip->flags & ~IPT_F_MASK) != 0) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 ip->flags & ~IPT_F_MASK);
+		return false;
+	}
+
+	if ((ip->invflags & ~IPT_INV_MASK) != 0) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 ip->invflags & ~IPT_INV_MASK);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Target handler that logs the target's info string (rate limited) and
+ * always drops the packet.
+ */
+static unsigned int
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const char *msg = (const char *)par->targinfo;
+
+	if (net_ratelimit())
+		pr_info("error: `%s'\n", msg);
+
+	return NF_DROP;
+}
+
+/* Performance critical: returns the rule located 'offset' bytes into the
+ * table blob that starts at 'base'. */
+static inline struct ipt_entry *
+get_entry(const void *base, unsigned int offset)
+{
+	const unsigned char *start = base;
+
+	return (struct ipt_entry *)(start + offset);
+}
+
+/* A rule whose IP part is entirely zero bytes is unconditional. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ipt_ip *ip)
+{
+	static const struct ipt_ip all_zero;
+
+	return !memcmp(ip, &all_zero, sizeof(all_zero));
+#undef FWINV
+}
+
+/* Const-correct wrapper around ipt_get_target(): takes a const entry and
+ * returns a const target pointer, casting the const away only for the call. */
+static inline const struct xt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+	return ipt_get_target((struct ipt_entry *)e);
+}
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+/* Built-in chain name for each IPv4 netfilter hook, indexed by hook number. */
+static const char *const hooknames[] = {
+	[NF_INET_PRE_ROUTING]		= "PREROUTING",
+	[NF_INET_LOCAL_IN]		= "INPUT",
+	[NF_INET_FORWARD]		= "FORWARD",
+	[NF_INET_LOCAL_OUT]		= "OUTPUT",
+	[NF_INET_POST_ROUTING]		= "POSTROUTING",
+};
+
+/* Index into comments[]: what kind of rule a trace line refers to. */
+enum nf_ip_trace_comments {
+	NF_IP_TRACE_COMMENT_RULE,
+	NF_IP_TRACE_COMMENT_RETURN,
+	NF_IP_TRACE_COMMENT_POLICY,
+};
+
+/* Text printed in the trace line for each nf_ip_trace_comments value. */
+static const char *const comments[] = {
+	[NF_IP_TRACE_COMMENT_RULE]	= "rule",
+	[NF_IP_TRACE_COMMENT_RETURN]	= "return",
+	[NF_IP_TRACE_COMMENT_POLICY]	= "policy",
+};
+
+/* Log parameters handed to nf_log_packet() by trace_packet(). */
+static struct nf_loginfo trace_loginfo = {
+	.type = NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level = 4,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+/*
+ * Trace helper, called for each rule 's' while walking towards rule 'e'.
+ * Mildly perf critical (only if packet tracing is on).
+ *
+ * Tracks the current chain name and the rule number within that chain.
+ * Returns 1 once 's' is the traced rule 'e' (the walk can stop), else 0.
+ */
+static inline int
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
+		      const char *hookname, const char **chainname,
+		      const char **comment, unsigned int *rulenum)
+{
+	const struct xt_standard_target *st = (void *)ipt_get_target_c(s);
+	const char *tgname = st->target.u.kernel.target->name;
+
+	/* Head of user chain: ERROR target carrying the chain name */
+	if (strcmp(tgname, XT_ERROR_TARGET) == 0) {
+		*chainname = st->target.data;
+		*rulenum = 0;
+		return 0;
+	}
+
+	/* Any other rule counts towards the position in the chain. */
+	++*rulenum;
+	if (s != e)
+		return 0;
+
+	/* Tail of chains: STANDARD target (return/policy) */
+	if (s->target_offset == sizeof(struct ipt_entry) &&
+	    strcmp(tgname, XT_STANDARD_TARGET) == 0 &&
+	    st->verdict < 0 &&
+	    unconditional(&s->ip)) {
+		if (*chainname == hookname)
+			*comment = comments[NF_IP_TRACE_COMMENT_POLICY];
+		else
+			*comment = comments[NF_IP_TRACE_COMMENT_RETURN];
+	}
+	return 1;
+}
+
+/*
+ * Emit a "TRACE:" log line for a packet that reached rule 'e'.
+ *
+ * Walks the current CPU's copy of the table from the hook's entry point
+ * to work out the chain name, the rule number and whether the rule is a
+ * plain rule, a return or a policy, then logs table:chain:comment:rulenum.
+ */
+static void trace_packet(const struct sk_buff *skb,
+			 unsigned int hook,
+			 const struct net_device *in,
+			 const struct net_device *out,
+			 const char *tablename,
+			 const struct xt_table_info *private,
+			 const struct ipt_entry *e)
+{
+	const void *table_base = private->entries[smp_processor_id()];
+	const struct ipt_entry *root =
+		get_entry(table_base, private->hook_entry[hook]);
+	const char *hookname = hooknames[hook];
+	const char *chainname = hookname;
+	const char *comment = comments[NF_IP_TRACE_COMMENT_RULE];
+	const struct ipt_entry *iter;
+	unsigned int rulenum = 0;
+
+	xt_entry_foreach(iter, root,
+			 private->size - private->hook_entry[hook]) {
+		if (get_chainname_rulenum(iter, e, hookname,
+					  &chainname, &comment, &rulenum))
+			break;
+	}
+
+	nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
+		      "TRACE: %s:%s:%s:%u ",
+		      tablename, chainname, comment, rulenum);
+}
+#endif
+
+/* Returns the rule that follows 'entry' in the blob, i.e. the address
+ * entry->next_offset bytes past it. */
+static inline __pure
+struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
+{
+	const char *here = (const char *)entry;
+
+	return (struct ipt_entry *)(here + entry->next_offset);
+}
+
+/*
+ * Run a packet through one hook of an IPv4 table.
+ *
+ * @skb:   the packet
+ * @hook:  netfilter hook number the packet is traversing
+ * @in:    input device, may be NULL
+ * @out:   output device, may be NULL
+ * @table: the table to evaluate
+ *
+ * Returns one of the generic firewall policies, like NF_ACCEPT.
+ * NF_DROP is returned if a match or target set acpar.hotdrop, or if the
+ * jump stack would overflow.
+ */
+unsigned int
+ipt_do_table(struct sk_buff *skb,
+	     unsigned int hook,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     struct xt_table *table)
+{
+	/* All-zero device name used when 'in' or 'out' is NULL. */
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	const struct iphdr *ip;
+	/* Initializing verdict to NF_DROP keeps gcc happy. */
+	unsigned int verdict = NF_DROP;
+	const char *indev, *outdev;
+	const void *table_base;
+	struct ipt_entry *e, **jumpstack;
+	unsigned int *stackptr, origptr, cpu;
+	const struct xt_table_info *private;
+	struct xt_action_param acpar;
+	unsigned int addend;
+
+	/* Initialization */
+	ip = ip_hdr(skb);
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+	/* We handle fragments by dealing with the first fragment as
+	 * if it was a normal packet.  All other fragments are treated
+	 * normally, except that they will NEVER match rules that ask
+	 * things we don't know, ie. tcp syn flag or ports).  If the
+	 * rule is also a fragment-specific rule, non-fragments won't
+	 * match it. */
+	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+	acpar.thoff   = ip_hdrlen(skb);
+	acpar.hotdrop = false;
+	acpar.in      = in;
+	acpar.out     = out;
+	acpar.family  = NFPROTO_IPV4;
+	acpar.hooknum = hook;
+
+	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
+	private = table->private;
+	/* Work on this CPU's copy of the rules and its own jump stack. */
+	cpu        = smp_processor_id();
+	table_base = private->entries[cpu];
+	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
+	/* Remember the stack depth on entry; it is restored on exit. */
+	origptr    = *stackptr;
+
+	e = get_entry(table_base, private->hook_entry[hook]);
+
+	pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
+		 table->name, hook, origptr,
+		 get_entry(table_base, private->underflow[hook]));
+
+	do {
+		const struct xt_entry_target *t;
+		const struct xt_entry_match *ematch;
+
+		IP_NF_ASSERT(e);
+		if (!ip_packet_match(ip, indev, outdev,
+		    &e->ip, acpar.fragoff)) {
+ no_match:
+			/* Rule does not apply: move on to the next one. */
+			e = ipt_next_entry(e);
+			continue;
+		}
+
+		/* Every extension match attached to the rule must agree. */
+		xt_ematch_foreach(ematch, e) {
+			acpar.match     = ematch->u.kernel.match;
+			acpar.matchinfo = ematch->data;
+			if (!acpar.match->match(skb, &acpar))
+				goto no_match;
+		}
+
+		ADD_COUNTER(e->counters, skb->len, 1);
+
+		t = ipt_get_target(e);
+		IP_NF_ASSERT(t->u.kernel.target);
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+		/* The packet is traced: log it */
+		if (unlikely(skb->nf_trace))
+			trace_packet(skb, hook, in, out,
+				     table->name, private, e);
+#endif
+		/* Standard target? (it has no target function) */
+		if (!t->u.kernel.target->target) {
+			int v;
+
+			v = ((struct xt_standard_target *)t)->verdict;
+			if (v < 0) {
+				/* Pop from stack? */
+				if (v != XT_RETURN) {
+					/* Negative values other than
+					 * XT_RETURN encode a final verdict
+					 * as -verdict - 1. */
+					verdict = (unsigned)(-v) - 1;
+					break;
+				}
+				if (*stackptr <= origptr) {
+					/* Nothing of ours on the stack:
+					 * continue at the hook's underflow
+					 * entry. */
+					e = get_entry(table_base,
+					    private->underflow[hook]);
+					pr_debug("Underflow (this is normal) "
+						 "to %p\n", e);
+				} else {
+					/* Resume after the rule that jumped. */
+					e = jumpstack[--*stackptr];
+					pr_debug("Pulled %p out from pos %u\n",
+						 e, *stackptr);
+					e = ipt_next_entry(e);
+				}
+				continue;
+			}
+			/* Non-negative verdict is an offset to jump to.
+			 * Save the return point unless the destination is
+			 * simply the next rule or the rule is a GOTO. */
+			if (table_base + v != ipt_next_entry(e) &&
+			    !(e->ip.flags & IPT_F_GOTO)) {
+				if (*stackptr >= private->stacksize) {
+					/* Jump stack full: drop the packet. */
+					verdict = NF_DROP;
+					break;
+				}
+				jumpstack[(*stackptr)++] = e;
+				pr_debug("Pushed %p into pos %u\n",
+					 e, *stackptr - 1);
+			}
+
+			e = get_entry(table_base, v);
+			continue;
+		}
+
+		acpar.target   = t->u.kernel.target;
+		acpar.targinfo = t->data;
+
+		verdict = t->u.kernel.target->target(skb, &acpar);
+		/* Target might have changed stuff. */
+		ip = ip_hdr(skb);
+		if (verdict == XT_CONTINUE)
+			e = ipt_next_entry(e);
+		else
+			/* Verdict */
+			break;
+	} while (!acpar.hotdrop);
+	pr_debug("Exiting %s; resetting sp from %u to %u\n",
+		 __func__, *stackptr, origptr);
+	*stackptr = origptr;
+ 	xt_write_recseq_end(addend);
+ 	local_bh_enable();
+
+#ifdef DEBUG_ALLOW_ALL
+	return NF_ACCEPT;
+#else
+	/* hotdrop overrides whatever verdict was computed. */
+	if (acpar.hotdrop)
+		return NF_DROP;
+	else return verdict;
+#endif
+}
+
+/*
+ * Figures out from what hook each rule can be called: returns 0 if
+ * there are loops (or a bad verdict is found), 1 on success.  Puts the
+ * hook bitmask in each entry's comefrom.
+ *
+ * @newinfo:     table info providing hook_entry[] and size
+ * @valid_hooks: bitmask of hooks the table is attached to
+ * @entry0:      start of the rule blob
+ *
+ * Bit NF_INET_NUMHOOKS of comefrom is used as an "on the current path"
+ * marker while walking; meeting it again means a loop.
+ */
+static int
+mark_source_chains(const struct xt_table_info *newinfo,
+		   unsigned int valid_hooks, void *entry0)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	   to 0 as we leave), and comefrom to save source hook bitmask */
+	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos);
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			const struct xt_standard_target *t
+				= (void *)ipt_get_target_c(e);
+			/* Already reached from this hook on an earlier path? */
+			int visited = e->comefrom & (1 << hook);
+
+			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
+				pr_err("iptables: loop hook %u pos %u %08X.\n",
+				       hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if ((e->target_offset == sizeof(struct ipt_entry) &&
+			     (strcmp(t->target.u.user.name,
+				     XT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0 && unconditional(&e->ip)) ||
+			    visited) {
+				unsigned int oldpos, size;
+
+				if ((strcmp(t->target.u.user.name,
+			    		    XT_STANDARD_TARGET) == 0) &&
+				    t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						"negative verdict (%i)\n",
+								t->verdict);
+					return 0;
+				}
+
+				/* Return: backtrack through the last
+				   big jump. */
+				do {
+					/* Clear the "on current path" marker. */
+					e->comefrom ^= (1<<NF_INET_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+					if (e->comefrom
+					    & (1 << NF_INET_NUMHOOKS)) {
+						duprintf("Back unset "
+							 "on hook %u "
+							 "rule %u\n",
+							 hook, pos);
+					}
+#endif
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct ipt_entry *)
+						(entry0 + pos);
+				/* Keep unwinding while the step back was a
+				 * plain fallthrough rather than a jump. */
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct ipt_entry *)
+					(entry0 + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   XT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
+					if (newpos > newinfo->size -
+						sizeof(struct ipt_entry)) {
+						duprintf("mark_source_chains: "
+							"bad verdict (%i)\n",
+								newpos);
+						return 0;
+					}
+					/* This is a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+				}
+				e = (struct ipt_entry *)
+					(entry0 + newpos);
+				/* Record where we came from. */
+				e->counters.pcnt = pos;
+				pos = newpos;
+			}
+		}
+		next:
+		duprintf("Finished chain %u\n", hook);
+	}
+	return 1;
+}
+
+/*
+ * Release one extension match of a rule: call its destroy hook, if it
+ * has one, and drop the reference on the module that provides it.
+ */
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
+{
+	struct xt_match *match = m->u.kernel.match;
+	struct xt_mtdtor_param par;
+
+	par.net       = net;
+	par.match     = match;
+	par.matchinfo = m->data;
+	par.family    = NFPROTO_IPV4;
+
+	if (match->destroy)
+		match->destroy(&par);
+
+	module_put(match->me);
+}
+
+/*
+ * Basic sanity checks on a single rule: its flag words must be valid and
+ * its target, header and payload, must fit inside the rule.
+ * Returns 0 if the rule looks sane, -EINVAL otherwise.
+ */
+static int
+check_entry(const struct ipt_entry *e, const char *name)
+{
+	const struct xt_entry_target *t;
+
+	if (!ip_checkentry(&e->ip)) {
+		duprintf("ip check failed %p %s.\n", e, name);
+		return -EINVAL;
+	}
+
+	/* The target header must lie inside the entry before we read it. */
+	if (e->next_offset <
+	    e->target_offset + sizeof(struct xt_entry_target))
+		return -EINVAL;
+
+	t = ipt_get_target_c(e);
+	if (e->next_offset < e->target_offset + t->u.target_size)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Fill in the match-specific fields of 'par' and validate the match via
+ * xt_check_match().  Returns 0 on success or the negative error from
+ * xt_check_match().
+ */
+static int
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+	const struct ipt_ip *ip = par->entryinfo;
+	int ret;
+
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+
+	ret = xt_check_match(par, m->u.match_size - sizeof(*m),
+			     ip->proto, ip->invflags & IPT_INV_PROTO);
+	if (ret >= 0)
+		return 0;
+
+	duprintf("check failed for `%s'.\n", par->match->name);
+	return ret;
+}
+
+/*
+ * Look up the match extension named in 'm', store it in the entry and
+ * validate it.  If validation fails, the module reference taken by the
+ * lookup is dropped again.  Returns 0 on success or a negative errno.
+ */
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+	struct xt_match *match;
+	int ret;
+
+	match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+				      m->u.user.revision);
+	if (IS_ERR(match)) {
+		duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+		return PTR_ERR(match);
+	}
+	m->u.kernel.match = match;
+
+	ret = check_match(m, par);
+	if (ret != 0)
+		module_put(m->u.kernel.match->me);
+
+	return ret;
+}
+
+/*
+ * Validate the (already resolved) target of rule 'e' via
+ * xt_check_target().  Returns 0 on success or the negative error from
+ * xt_check_target().
+ */
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
+{
+	struct xt_entry_target *t = ipt_get_target(e);
+	struct xt_tgchk_param par = {
+		.net       = net,
+		.table     = name,
+		.entryinfo = e,
+		.target    = t->u.kernel.target,
+		.targinfo  = t->data,
+		.hook_mask = e->comefrom,
+		.family    = NFPROTO_IPV4,
+	};
+	int ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
+				  e->ip.proto,
+				  e->ip.invflags & IPT_INV_PROTO);
+
+	if (ret >= 0)
+		return 0;
+
+	duprintf("check failed for `%s'.\n",
+		 t->u.kernel.target->name);
+	return ret;
+}
+
+/*
+ * Resolve and validate every match and the target of one rule.
+ *
+ * @e:    the rule
+ * @net:  network namespace passed on to the extensions
+ * @name: table name
+ * @size: not referenced in this function
+ *
+ * Returns 0 on success.  On failure returns a negative errno after
+ * releasing whatever was acquired up to that point.
+ */
+static int
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+		 unsigned int size)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	int ret;
+	unsigned int j;
+	struct xt_mtchk_param mtpar;
+	struct xt_entry_match *ematch;
+
+	ret = check_entry(e, name);
+	if (ret)
+		return ret;
+
+	/* j counts the matches that passed, so that the error path only
+	 * cleans up those. */
+	j = 0;
+	mtpar.net	= net;
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ip;
+	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV4;
+	xt_ematch_foreach(ematch, e) {
+		ret = find_check_match(ematch, &mtpar);
+		if (ret != 0)
+			goto cleanup_matches;
+		++j;
+	}
+
+	t = ipt_get_target(e);
+	target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+		ret = PTR_ERR(target);
+		goto cleanup_matches;
+	}
+	t->u.kernel.target = target;
+
+	ret = check_target(e, net, name);
+	if (ret)
+		goto err;
+	return 0;
+ err:
+	/* Target was found but rejected: drop its module reference. */
+	module_put(t->u.kernel.target->me);
+ cleanup_matches:
+	/* Undo only the first j matches, i.e. those that were set up. */
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		cleanup_match(ematch, net);
+	}
+	return ret;
+}
+
+/*
+ * An underflow rule is acceptable only if it is unconditional and uses
+ * the STANDARD target with a verdict that decodes to NF_DROP or NF_ACCEPT.
+ */
+static bool check_underflow(const struct ipt_entry *e)
+{
+	const struct xt_entry_target *t;
+	unsigned int decoded;
+
+	if (!unconditional(&e->ip))
+		return false;
+
+	t = ipt_get_target_c(e);
+	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
+		return false;
+
+	/* Stored verdicts are encoded as -verdict - 1; decode in unsigned
+	 * arithmetic. */
+	decoded = ((struct xt_standard_target *)t)->verdict;
+	decoded = -decoded - 1;
+
+	switch (decoded) {
+	case NF_DROP:
+	case NF_ACCEPT:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Validate the position and size of one rule inside the user-supplied
+ * blob and record hook entry points / underflows that start at it.
+ *
+ * @e:            the rule being checked
+ * @newinfo:      table info whose hook_entry[]/underflow[] get filled in
+ * @base:         start of the rule blob
+ * @limit:        one past the end of the rule blob
+ * @hook_entries: user-supplied hook entry offsets
+ * @underflows:   user-supplied underflow offsets
+ * @valid_hooks:  bitmask of hooks the table is attached to
+ *
+ * Returns 0 on success, -EINVAL if the rule is misaligned, does not fit
+ * inside the blob, is too small, or is an invalid underflow.
+ * On success the rule's counters and comefrom are zeroed.
+ */
+static int
+check_entry_size_and_hooks(struct ipt_entry *e,
+			   struct xt_table_info *newinfo,
+			   const unsigned char *base,
+			   const unsigned char *limit,
+			   const unsigned int *hook_entries,
+			   const unsigned int *underflows,
+			   unsigned int valid_hooks)
+{
+	unsigned int h;
+
+	/* The fixed header must be aligned and inside the blob, and the
+	 * whole rule (as claimed by next_offset) must not run past the end
+	 * of the blob; otherwise later walkers would step outside it. */
+	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit) {
+		duprintf("Bad offset %p\n", e);
+		return -EINVAL;
+	}
+
+	/* A rule must at least hold its header plus a target header. */
+	if (e->next_offset
+	    < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if (!(valid_hooks & (1 << h)))
+			continue;
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h]) {
+			if (!check_underflow(e)) {
+				pr_err("Underflows must be unconditional and "
+				       "use the STANDARD target with "
+				       "ACCEPT/DROP\n");
+				return -EINVAL;
+			}
+			newinfo->underflow[h] = underflows[h];
+		}
+	}
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct xt_counters) { 0, 0 });
+	e->comefrom = 0;
+	return 0;
+}
+
+/*
+ * Release everything a fully set-up rule holds: all of its matches,
+ * then its target (destroy hook if present, plus the module reference).
+ */
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
+{
+	struct xt_entry_match *ematch;
+	struct xt_entry_target *t;
+	struct xt_tgdtor_param par;
+
+	/* Matches first ... */
+	xt_ematch_foreach(ematch, e) {
+		cleanup_match(ematch, net);
+	}
+
+	/* ... then the target. */
+	t = ipt_get_target(e);
+
+	par.net      = net;
+	par.target   = t->u.kernel.target;
+	par.targinfo = t->data;
+	par.family   = NFPROTO_IPV4;
+
+	if (par.target->destroy)
+		par.target->destroy(&par);
+
+	module_put(par.target->me);
+}
+
+/*
+ * Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ *
+ * @net:     network namespace passed on to the extensions
+ * @newinfo: table info being built
+ * @entry0:  start of the rule blob to validate
+ * @repl:    the replace request (size, entry count, hooks, underflows)
+ *
+ * Returns 0 on success.  Returns -EINVAL for structural problems, -ELOOP
+ * if mark_source_chains() fails, or the error from find_check_entry().
+ * If find_check_entry() fails, the rules that were already set up are
+ * cleaned up again before returning.
+ */
+static int
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+                const struct ipt_replace *repl)
+{
+	struct ipt_entry *iter;
+	unsigned int i;
+	int ret = 0;
+
+	newinfo->size = repl->size;
+	newinfo->number = repl->num_entries;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_table: size %u\n", newinfo->size);
+	i = 0;
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+						 entry0 + repl->size,
+						 repl->hook_entry,
+						 repl->underflow,
+						 repl->valid_hooks);
+		if (ret != 0)
+			return ret;
+		++i;
+		/* Each ERROR target adds one slot to the jump stack size. */
+		if (strcmp(ipt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+
+	/* The number of rules found must match what userspace claimed. */
+	if (i != repl->num_entries) {
+		duprintf("translate_table: %u not %u entries\n",
+			 i, repl->num_entries);
+		return -EINVAL;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(repl->valid_hooks & (1 << i)))
+			continue;
+		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, repl->hook_entry[i]);
+			return -EINVAL;
+		}
+		if (newinfo->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, repl->underflow[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
+		return -ELOOP;
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = find_check_entry(iter, net, repl->name, repl->size);
+		if (ret != 0)
+			break;
+		++i;
+	}
+
+	if (ret != 0) {
+		/* Undo only the first i rules, i.e. those fully set up. */
+		xt_entry_foreach(iter, entry0, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter, net);
+		}
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for_each_possible_cpu(i) {
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
+	}
+
+	return ret;
+}
+
+/*
+ * Sum the per-CPU rule counters of table 't' into counters[].
+ *
+ * counters[] is indexed by rule position and is only added to, so the
+ * caller must pass it in zeroed.  Each per-CPU value is read under that
+ * CPU's xt_recseq sequence counter and re-read if a writer interfered.
+ */
+static void
+get_counters(const struct xt_table_info *t,
+	     struct xt_counters counters[])
+{
+	struct ipt_entry *iter;
+	unsigned int cpu;
+	unsigned int i;
+
+	for_each_possible_cpu(cpu) {
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+		i = 0;
+		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			/* Retry until byte and packet counts were read
+			 * without a concurrent update. */
+			do {
+				start = read_seqcount_begin(s);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqcount_retry(s, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
+			++i; /* macro does multi eval of i */
+		}
+	}
+}
+
+/*
+ * Allocate a zeroed counter array with one slot per rule of 'table' and
+ * fill it with the summed per-CPU counters.
+ *
+ * Returns the array (to be released with vfree()) or ERR_PTR(-ENOMEM).
+ */
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+	const struct xt_table_info *private = table->private;
+	/* We need atomic snapshot of counters: rest doesn't change
+	   (other than comefrom, which userspace doesn't care
+	   about). */
+	unsigned int countersize =
+		sizeof(struct xt_counters) * private->number;
+	struct xt_counters *counters = vzalloc(countersize);
+
+	if (!counters)
+		return ERR_PTR(-ENOMEM);
+
+	get_counters(private, counters);
+	return counters;
+}
+
+/*
+ * Copy a table's rule blob to userspace.
+ *
+ * @total_size: number of bytes of rules to copy
+ * @table:      the table to dump
+ * @userptr:    destination buffer in userspace
+ *
+ * The raw blob is copied first; then, rule by rule, the counters are
+ * overwritten with the summed snapshot and the match/target names are
+ * rewritten from the kernel extension names.
+ *
+ * Returns 0 on success, -EFAULT if a copy to userspace fails, or the
+ * error from alloc_counters().
+ */
+static int
+copy_entries_to_user(unsigned int total_size,
+		     const struct xt_table *table,
+		     void __user *userptr)
+{
+	unsigned int off, num;
+	const struct ipt_entry *e;
+	struct xt_counters *counters;
+	const struct xt_table_info *private = table->private;
+	int ret = 0;
+	const void *loc_cpu_entry;
+
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	/* choose the copy that is on our node/cpu, ...
+	 * This choice is lazy (because current thread is
+	 * allowed to migrate to another cpu)
+	 */
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+		ret = -EFAULT;
+		goto free_counters;
+	}
+
+	/* FIXME: use iterator macros --RR */
+	/* ... then go back and fix counters and names */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		unsigned int i;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
+
+		e = (struct ipt_entry *)(loc_cpu_entry + off);
+		/* Replace this rule's counters with the summed snapshot. */
+		if (copy_to_user(userptr + off
+				 + offsetof(struct ipt_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+
+		/* Walk the matches, which sit between the rule header and
+		 * the target, and write each one's name. */
+		for (i = sizeof(struct ipt_entry);
+		     i < e->target_offset;
+		     i += m->u.match_size) {
+			m = (void *)e + i;
+
+			if (copy_to_user(userptr + off + i
+					 + offsetof(struct xt_entry_match,
+						    u.user.name),
+					 m->u.kernel.match->name,
+					 strlen(m->u.kernel.match->name)+1)
+			    != 0) {
+				ret = -EFAULT;
+				goto free_counters;
+			}
+		}
+
+		/* Same for the target name. */
+		t = ipt_get_target_c(e);
+		if (copy_to_user(userptr + off + e->target_offset
+				 + offsetof(struct xt_entry_target,
+					    u.user.name),
+				 t->u.kernel.target->name,
+				 strlen(t->u.kernel.target->name)+1) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Convert a standard-target verdict from its compat representation.
+ * Positive values are adjusted by xt_compat_calc_jump(); other values
+ * are copied unchanged.
+ */
+static void compat_standard_from_user(void *dst, const void *src)
+{
+	int verdict = *(compat_int_t *)src;
+
+	if (verdict > 0) {
+		int delta = xt_compat_calc_jump(AF_INET, verdict);
+
+		verdict += delta;
+	}
+	memcpy(dst, &verdict, sizeof(verdict));
+}
+
+/*
+ * Convert a standard-target verdict to its compat representation and
+ * copy it to userspace.  Positive values are adjusted by
+ * xt_compat_calc_jump().  Returns 0 on success or -EFAULT.
+ */
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+	compat_int_t cv = *(int *)src;
+
+	if (cv > 0)
+		cv -= xt_compat_calc_jump(AF_INET, cv);
+
+	if (copy_to_user(dst, &cv, sizeof(cv)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Account for how much smaller rule 'e' is in the compat layout.
+ *
+ * @e:       the rule
+ * @info:    the table info the rule belongs to
+ * @base:    start of the rule blob containing 'e'
+ * @newinfo: compat view of the table being computed
+ *
+ * Shrinks newinfo->size by the size difference, records the difference
+ * via xt_compat_add_offset(), and lowers every hook entry / underflow
+ * offset that lies after this rule.  Returns 0 or the error from
+ * xt_compat_add_offset().
+ */
+static int compat_calc_entry(const struct ipt_entry *e,
+			     const struct xt_table_info *info,
+			     const void *base, struct xt_table_info *newinfo)
+{
+	const struct xt_entry_match *ematch;
+	const struct xt_entry_target *t;
+	unsigned int entry_offset;
+	int off, i, ret;
+
+	/* Difference from the rule header itself ... */
+	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+	entry_offset = (void *)e - base;
+	/* ... plus that of each match and of the target. */
+	xt_ematch_foreach(ematch, e)
+		off += xt_compat_match_offset(ematch->u.kernel.match);
+	t = ipt_get_target_c(e);
+	off += xt_compat_target_offset(t->u.kernel.target);
+	newinfo->size -= off;
+	ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+	if (ret)
+		return ret;
+
+	/* Offsets of 0 are left untouched. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		if (info->hook_entry[i] &&
+		    (e < (struct ipt_entry *)(base + info->hook_entry[i])))
+			newinfo->hook_entry[i] -= off;
+		if (info->underflow[i] &&
+		    (e < (struct ipt_entry *)(base + info->underflow[i])))
+			newinfo->underflow[i] -= off;
+	}
+	return 0;
+}
+
+/*
+ * Build in 'newinfo' the compat view (sizes and hook offsets) of the
+ * table described by 'info'.
+ *
+ * Returns 0 on success, -EINVAL if either pointer is NULL, or the first
+ * error reported by compat_calc_entry().
+ */
+static int compat_table_info(const struct xt_table_info *info,
+			     struct xt_table_info *newinfo)
+{
+	struct ipt_entry *iter;
+	void *loc_cpu_entry;
+	int ret = 0;
+
+	if (info == NULL || newinfo == NULL)
+		return -EINVAL;
+
+	/* we dont care about newinfo->entries[] */
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	newinfo->initial_entries = 0;
+
+	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	xt_compat_init_offsets(AF_INET, info->number);
+
+	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+		if (ret != 0)
+			break;
+	}
+	return ret;
+}
+#endif
+
+/*
+ * Report a table's layout (valid hooks, hook entry and underflow
+ * offsets, number of entries, size) to userspace.
+ *
+ * @net:    network namespace to look the table up in
+ * @user:   userspace buffer; on entry it starts with the table name, on
+ *          success it receives a struct ipt_getinfo
+ * @len:    length of the userspace buffer; must equal
+ *          sizeof(struct ipt_getinfo)
+ * @compat: non-zero to report offsets/sizes in the compat layout
+ *
+ * Returns 0 on success, -EINVAL on a length mismatch, -EFAULT on a failed
+ * user copy, -ENOENT or the lookup error if the table cannot be found.
+ */
+static int get_info(struct net *net, void __user *user,
+                    const int *len, int compat)
+{
+	char name[XT_TABLE_MAXNAMELEN];
+	struct xt_table *t;
+	int ret;
+
+	if (*len != sizeof(struct ipt_getinfo)) {
+		duprintf("length %u != %zu\n", *len,
+			 sizeof(struct ipt_getinfo));
+		return -EINVAL;
+	}
+
+	if (copy_from_user(name, user, sizeof(name)) != 0)
+		return -EFAULT;
+
+	/* Userspace may not have terminated the name. */
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+	if (compat)
+		xt_compat_lock(AF_INET);
+#endif
+	/* If the table is not found, try loading iptable_<name> and look
+	 * again. */
+	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+				    "iptable_%s", name);
+	if (t && !IS_ERR(t)) {
+		struct ipt_getinfo info;
+		const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+		struct xt_table_info tmp;
+
+		if (compat) {
+			/* NOTE(review): this ret is overwritten below, so a
+			 * failure of compat_table_info() is not reported. */
+			ret = compat_table_info(private, &tmp);
+			xt_compat_flush_offsets(AF_INET);
+			private = &tmp;
+		}
+#endif
+		/* Zero first so no uninitialized bytes reach userspace. */
+		memset(&info, 0, sizeof(info));
+		info.valid_hooks = t->valid_hooks;
+		memcpy(info.hook_entry, private->hook_entry,
+		       sizeof(info.hook_entry));
+		memcpy(info.underflow, private->underflow,
+		       sizeof(info.underflow));
+		info.num_entries = private->number;
+		info.size = private->size;
+		strcpy(info.name, name);
+
+		if (copy_to_user(user, &info, *len) != 0)
+			ret = -EFAULT;
+		else
+			ret = 0;
+
+		xt_table_unlock(t);
+		module_put(t->me);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+	if (compat)
+		xt_compat_unlock(AF_INET);
+#endif
+	return ret;
+}
+
+/*
+ * Copy the rules of a table to userspace.
+ *
+ * @net:  network namespace to look the table up in
+ * @uptr: userspace request; starts with a struct ipt_get_entries header
+ *        (table name and expected size) followed by room for the rules
+ * @len:  total length of the userspace buffer
+ *
+ * Returns 0 on success, -EINVAL if the lengths are inconsistent, -EFAULT
+ * on a failed user copy, -EAGAIN if the table size no longer matches the
+ * size userspace expects, -ENOENT or the lookup error if the table cannot
+ * be found.
+ */
+static int
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+	    const int *len)
+{
+	int ret;
+	struct ipt_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get)) {
+		duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+		return -EINVAL;
+	}
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+	if (*len != sizeof(struct ipt_get_entries) + get.size) {
+		duprintf("get_entries: %u != %zu\n",
+			 *len, sizeof(get) + get.size);
+		return -EINVAL;
+	}
+	/* The name comes from userspace and may lack a terminator; make
+	 * sure the string-based table lookup cannot read past the array. */
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	t = xt_find_table_lock(net, AF_INET, get.name);
+	if (t && !IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+		duprintf("t->private->number = %u\n", private->number);
+		if (get.size == private->size)
+			ret = copy_entries_to_user(private->size,
+						   t, uptr->entrytable);
+		else {
+			/* Table changed since userspace asked for its size. */
+			duprintf("get_entries: I've got %u not %u!\n",
+				 private->size, get.size);
+			ret = -EAGAIN;
+		}
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	return ret;
+}
+
+/*
+ * Swap a fully translated rule set into a table.
+ *
+ * @net:          network namespace to look the table up in
+ * @name:         table name
+ * @valid_hooks:  hook bitmask claimed by userspace; must match the table
+ * @newinfo:      the new, already validated table info
+ * @num_counters: number of counters userspace has room for
+ * @counters_ptr: userspace buffer receiving the old rules' counters
+ *
+ * Returns 0 once the new table is in place.  Returns a negative errno,
+ * with newinfo untouched, if the swap did not happen (allocation failure,
+ * table not found, hook mismatch, or xt_replace_table() failure).
+ *
+ * A failure to copy the old counters to userspace is only logged: by
+ * then newinfo is the live table, and returning an error would make the
+ * caller's error path clean up and free a table that is in use.
+ */
+static int
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+	     struct xt_table_info *newinfo, unsigned int num_counters,
+	     void __user *counters_ptr)
+{
+	int ret;
+	struct xt_table *t;
+	struct xt_table_info *oldinfo;
+	struct xt_counters *counters;
+	void *loc_cpu_old_entry;
+	struct ipt_entry *iter;
+
+	ret = 0;
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	if (!counters) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* If the table is not found, try loading iptable_<name> and look
+	 * again. */
+	t = try_then_request_module(xt_find_table_lock(net, AF_INET, name),
+				    "iptable_%s", name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free_newinfo_counters_untrans;
+	}
+
+	/* You lied! */
+	if (valid_hooks != t->valid_hooks) {
+		duprintf("Valid hook crap: %08X vs %08X\n",
+			 valid_hooks, t->valid_hooks);
+		ret = -EINVAL;
+		goto put_module;
+	}
+
+	/* On failure xt_replace_table() returns NULL and sets ret. */
+	oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;
+
+	/* Update module usage count based on number of rules */
+	duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+		oldinfo->number, oldinfo->initial_entries, newinfo->number);
+	if ((oldinfo->number > oldinfo->initial_entries) ||
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	/* Get the old counters, and synchronize with replace */
+	get_counters(oldinfo, counters);
+
+	/* Decrease module usage counts and free resource */
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+		cleanup_entry(iter, net);
+
+	xt_free_table_info(oldinfo);
+	if (copy_to_user(counters_ptr, counters,
+			 sizeof(struct xt_counters) * num_counters) != 0) {
+		/* Can't fail here: the new table is already in place. */
+		if (net_ratelimit())
+			pr_warning("counters copy to user failed while "
+				   "replacing table\n");
+	}
+	vfree(counters);
+	xt_table_unlock(t);
+	return ret;
+
+ put_module:
+	module_put(t->me);
+	xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+	vfree(counters);
+ out:
+	return ret;
+}
+
+/*
+ * IPT_SO_SET_REPLACE for native callers: read the struct ipt_replace header
+ * and the rule blob that follows it from @user, translate the blob and swap
+ * it into the named table via __do_replace().
+ *
+ * Returns 0 on success or a negative errno; on any error the freshly
+ * allocated table info is released again before returning.
+ */
+static int
+do_replace(struct net *net, const void __user *user, unsigned int len)
+{
+	struct ipt_replace repl;
+	struct xt_table_info *info;
+	struct ipt_entry *e;
+	void *blob;
+	int err;
+
+	if (copy_from_user(&repl, user, sizeof(repl)))
+		return -EFAULT;
+
+	/* refuse counter counts whose byte size would not fit in an int */
+	if (repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	/* the name came from userspace: force termination */
+	repl.name[sizeof(repl.name) - 1] = 0;
+
+	info = xt_alloc_table_info(repl.size);
+	if (info == NULL)
+		return -ENOMEM;
+
+	/* fill the per-cpu copy belonging to the cpu we happen to run on */
+	blob = info->entries[raw_smp_processor_id()];
+	if (copy_from_user(blob, user + sizeof(repl), repl.size)) {
+		err = -EFAULT;
+		goto release_info;
+	}
+
+	err = translate_table(net, info, blob, &repl);
+	if (err)
+		goto release_info;
+
+	duprintf("Translated table\n");
+
+	err = __do_replace(net, repl.name, repl.valid_hooks, info,
+			   repl.num_counters, repl.counters);
+	if (!err)
+		return 0;
+
+	/* translated but not installed: undo the per-entry setup first */
+	xt_entry_foreach(e, blob, info->size)
+		cleanup_entry(e, net);
+ release_info:
+	xt_free_table_info(info);
+	return err;
+}
+
+/*
+ * IPT_SO_SET_ADD_COUNTERS: add user supplied byte/packet counts to the
+ * counters of every rule of a table.
+ *
+ * @net:    network namespace the table is looked up in
+ * @user:   user buffer: a (compat_)xt_counters_info header followed by
+ *          num_counters struct xt_counters
+ * @len:    total length of @user as passed to setsockopt
+ * @compat: non-zero when the header uses the compat layout
+ *
+ * Returns 0 on success, -EFAULT if a user copy fails, -EINVAL if @len or
+ * the counter count does not match, -ENOMEM if the temporary buffer cannot
+ * be allocated, or the lookup error (-ENOENT when no table was returned).
+ */
+static int
+do_add_counters(struct net *net, const void __user *user,
+		unsigned int len, int compat)
+{
+	unsigned int i, curcpu;
+	struct xt_counters_info tmp;
+	struct xt_counters *paddc;
+	unsigned int num_counters;
+	const char *name;
+	int size;
+	void *ptmp;
+	struct xt_table *t;
+	const struct xt_table_info *private;
+	int ret = 0;
+	void *loc_cpu_entry;
+	struct ipt_entry *iter;
+	unsigned int addend;
+#ifdef CONFIG_COMPAT
+	struct compat_xt_counters_info compat_tmp;
+
+	if (compat) {
+		ptmp = &compat_tmp;
+		size = sizeof(struct compat_xt_counters_info);
+	} else
+#endif
+	{
+		ptmp = &tmp;
+		size = sizeof(struct xt_counters_info);
+	}
+
+	if (copy_from_user(ptmp, user, size) != 0)
+		return -EFAULT;
+
+	/*
+	 * The name is raw user data and is later handed to the table lookup
+	 * as a C string, so force termination (do_replace() does the same
+	 * for its header).
+	 */
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		compat_tmp.name[sizeof(compat_tmp.name) - 1] = '\0';
+		num_counters = compat_tmp.num_counters;
+		name = compat_tmp.name;
+	} else
+#endif
+	{
+		tmp.name[sizeof(tmp.name) - 1] = '\0';
+		num_counters = tmp.num_counters;
+		name = tmp.name;
+	}
+
+	/* 64-bit arithmetic so a huge num_counters cannot wrap on 32-bit */
+	if (len != size + (u64)num_counters * sizeof(struct xt_counters))
+		return -EINVAL;
+
+	paddc = vmalloc(len - size);
+	if (!paddc)
+		return -ENOMEM;
+
+	if (copy_from_user(paddc, user + size, len - size) != 0) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	t = xt_find_table_lock(net, AF_INET, name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free;
+	}
+
+	local_bh_disable();
+	private = t->private;
+	/* one addend per rule: this also bounds the paddc[] walk below */
+	if (private->number != num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	/* Choose the copy that is on our node */
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	addend = xt_write_recseq_begin();
+	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		++i;
+	}
+	xt_write_recseq_end(addend);
+ unlock_up_free:
+	local_bh_enable();
+	xt_table_unlock(t);
+	module_put(t->me);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Table-replace request header as read from a compat caller by
+ * compat_do_replace(): every scalar is a fixed-width u32 and the counters
+ * pointer is carried as a compat_uptr_t.  The rule blob follows the header.
+ */
+struct compat_ipt_replace {
+	char			name[XT_TABLE_MAXNAMELEN];
+	u32			valid_hooks;
+	u32			num_entries;
+	u32			size;		/* bytes of entries[] */
+	u32			hook_entry[NF_INET_NUMHOOKS];
+	u32			underflow[NF_INET_NUMHOOKS];
+	u32			num_counters;
+	compat_uptr_t		counters;	/* struct xt_counters * */
+	struct compat_ipt_entry	entries[0];
+};
+
+/*
+ * Write one kernel rule @e to userspace in compat layout.
+ *
+ * @e:        kernel-side entry to convert
+ * @dstptr:   in/out cursor into the user buffer, advanced past the header
+ *            here (and handed on to the match/target converters)
+ * @size:     in/out running size; lowered here by the header size
+ *            difference and passed on to the match/target converters
+ * @counters: counter snapshot, indexed by rule number
+ * @i:        index of this rule in @counters
+ *
+ * Returns 0 on success, -EFAULT if a user copy fails, or the error returned
+ * by xt_compat_match_to_user()/xt_compat_target_to_user().
+ */
+static int
+compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
+			  unsigned int *size, struct xt_counters *counters,
+			  unsigned int i)
+{
+	struct xt_entry_target *t;
+	struct compat_ipt_entry __user *ce;
+	u_int16_t target_offset, next_offset;
+	compat_uint_t origsize;
+	const struct xt_entry_match *ematch;
+	int ret = 0;
+
+	origsize = *size;
+	ce = (struct compat_ipt_entry __user *)*dstptr;
+	/*
+	 * Copy the header, then overwrite its counters with the snapshot
+	 * taken for rule i.
+	 */
+	if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+	    copy_to_user(&ce->counters, &counters[i],
+	    sizeof(counters[i])) != 0)
+		return -EFAULT;
+
+	*dstptr += sizeof(struct compat_ipt_entry);
+	*size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_to_user(ematch, dstptr, size);
+		if (ret != 0)
+			return ret;
+	}
+	/*
+	 * (origsize - *size) is how far *size has dropped since entry to
+	 * this function; the offsets stored in the user copy are pulled in
+	 * by that amount.
+	 */
+	target_offset = e->target_offset - (origsize - *size);
+	t = ipt_get_target(e);
+	ret = xt_compat_target_to_user(t, dstptr, size);
+	if (ret)
+		return ret;
+	next_offset = e->next_offset - (origsize - *size);
+	if (put_user(target_offset, &ce->target_offset) != 0 ||
+	    put_user(next_offset, &ce->next_offset) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Look up (requesting the module if needed) the match named in @m, store it
+ * in m->u.kernel.match and add its compat size offset to *@size.
+ *
+ * @name, @ip and @hookmask are accepted but not used here.
+ *
+ * Returns 0 on success or the lookup error.
+ */
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+		       const char *name,
+		       const struct ipt_ip *ip,
+		       unsigned int hookmask,
+		       int *size)
+{
+	struct xt_match *found;
+
+	found = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+				      m->u.user.revision);
+	if (!IS_ERR(found)) {
+		m->u.kernel.match = found;
+		*size += xt_compat_match_offset(found);
+		return 0;
+	}
+
+	duprintf("compat_check_calc_match: `%s' not found\n",
+		 m->u.user.name);
+	return PTR_ERR(found);
+}
+
+/*
+ * Drop the module references held by a compat entry: one module_put() per
+ * match, then one for the target.
+ */
+static void compat_release_entry(struct compat_ipt_entry *e)
+{
+	struct xt_entry_match *m;
+	struct xt_entry_target *tgt;
+
+	xt_ematch_foreach(m, e) {
+		module_put(m->u.kernel.match->me);
+	}
+
+	tgt = compat_ipt_get_target(e);
+	module_put(tgt->u.kernel.target->me);
+}
+
+/*
+ * Validate one compat entry of a user supplied blob and prepare it for
+ * conversion.
+ *
+ * @e:            entry to check
+ * @newinfo:      table info whose hook_entry[]/underflow[] are filled in
+ *                when @e sits exactly at one of the requested offsets
+ * @size:         running total; grows by the extra bytes this entry needs
+ *                in native layout
+ * @base:         start of the compat blob
+ * @limit:        first byte past the compat blob
+ * @hook_entries: hook entry offsets requested by userspace
+ * @underflows:   underflow offsets requested by userspace
+ * @name:         table name, passed on to check_entry()
+ *
+ * On success the entry's matches and target have been looked up (stored in
+ * u.kernel.*), its offset delta has been recorded with
+ * xt_compat_add_offset(), and its counters and comefrom are cleared.
+ *
+ * Returns 0 on success or a negative errno; on error the match/target
+ * references taken by this call are dropped again.
+ */
+static int
+check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
+				  struct xt_table_info *newinfo,
+				  unsigned int *size,
+				  const unsigned char *base,
+				  const unsigned char *limit,
+				  const unsigned int *hook_entries,
+				  const unsigned int *underflows,
+				  const char *name)
+{
+	struct xt_entry_match *ematch;
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	unsigned int entry_offset;
+	unsigned int j;
+	int ret, off, h;
+
+	duprintf("check_compat_entry_size_and_hooks %p\n", e);
+	/*
+	 * The header must be aligned and lie inside the blob, and the whole
+	 * entry (header, matches, target) as described by next_offset must
+	 * end at or before @limit.  Without the last test a bogus
+	 * next_offset lets the match walk and target lookup below run past
+	 * the end of the blob.  next_offset is only read once the header is
+	 * known to be in bounds.
+	 */
+	if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit) {
+		duprintf("Bad offset %p, limit = %p\n", e, limit);
+		return -EINVAL;
+	}
+
+	if (e->next_offset < sizeof(struct compat_ipt_entry) +
+			     sizeof(struct compat_xt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* For purposes of check_entry casting the compat entry is fine */
+	ret = check_entry((struct ipt_entry *)e, name);
+	if (ret)
+		return ret;
+
+	/* off accumulates how much larger the native form of this entry is */
+	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+	entry_offset = (void *)e - (void *)base;
+	/* j counts the matches that now hold a module reference */
+	j = 0;
+	xt_ematch_foreach(ematch, e) {
+		ret = compat_find_calc_match(ematch, name,
+					     &e->ip, e->comefrom, &off);
+		if (ret != 0)
+			goto release_matches;
+		++j;
+	}
+
+	t = compat_ipt_get_target(e);
+	target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+			 t->u.user.name);
+		ret = PTR_ERR(target);
+		goto release_matches;
+	}
+	t->u.kernel.target = target;
+
+	off += xt_compat_target_offset(target);
+	*size += off;
+	ret = xt_compat_add_offset(AF_INET, entry_offset, off);
+	if (ret)
+		goto out;
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h])
+			newinfo->underflow[h] = underflows[h];
+	}
+
+	/* Clear counters and comefrom */
+	memset(&e->counters, 0, sizeof(e->counters));
+	e->comefrom = 0;
+	return 0;
+
+out:
+	module_put(t->u.kernel.target->me);
+release_matches:
+	/* only the first j matches took a reference */
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		module_put(ematch->u.kernel.match->me);
+	}
+	return ret;
+}
+
+/*
+ * Convert one already validated compat entry @e into native layout at
+ * *@dstptr.
+ *
+ * @e:       compat entry (matches/target already resolved to u.kernel.*)
+ * @dstptr:  in/out write cursor into the native blob
+ * @size:    in/out running size; raised here by the header size difference
+ *           and passed on to the match/target converters
+ * @name:    table name (not used in this function)
+ * @newinfo: table info whose hook_entry[]/underflow[] offsets are adjusted
+ * @base:    start of the native blob
+ *
+ * Returns 0, or the error from xt_compat_match_from_user().
+ */
+static int
+compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
+			    unsigned int *size, const char *name,
+			    struct xt_table_info *newinfo, unsigned char *base)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	struct ipt_entry *de;
+	unsigned int origsize;
+	int ret, h;
+	struct xt_entry_match *ematch;
+
+	ret = 0;
+	origsize = *size;
+	de = (struct ipt_entry *)*dstptr;
+	/* header first, then the counters into their native position */
+	memcpy(de, e, sizeof(struct ipt_entry));
+	memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+	*dstptr += sizeof(struct ipt_entry);
+	*size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
+
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_from_user(ematch, dstptr, size);
+		if (ret != 0)
+			return ret;
+	}
+	/*
+	 * origsize - *size is computed in unsigned arithmetic; subtracting
+	 * it adjusts the offset by the change in *size since entry to this
+	 * function.
+	 */
+	de->target_offset = e->target_offset - (origsize - *size);
+	t = compat_ipt_get_target(e);
+	/* NOTE(review): target is assigned but not read again below */
+	target = t->u.kernel.target;
+	xt_compat_target_from_user(t, dstptr, size);
+
+	de->next_offset = e->next_offset - (origsize - *size);
+	/* adjust every hook/underflow offset that lies beyond this entry */
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if ((unsigned char *)de - base < newinfo->hook_entry[h])
+			newinfo->hook_entry[h] -= origsize - *size;
+		if ((unsigned char *)de - base < newinfo->underflow[h])
+			newinfo->underflow[h] -= origsize - *size;
+	}
+	return ret;
+}
+
+/*
+ * Run check_match() on every match of the converted entry @e and then
+ * check_target() on its target.
+ *
+ * Returns 0 on success.  On failure the matches that had already passed
+ * check_match() are undone with cleanup_match() and the error is returned.
+ */
+static int
+compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
+{
+	struct xt_entry_match *ematch;
+	struct xt_mtchk_param mtpar;
+	unsigned int j;
+	int ret = 0;
+
+	/* j counts the matches that passed check_match() */
+	j = 0;
+	mtpar.net	= net;
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ip;
+	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV4;
+	xt_ematch_foreach(ematch, e) {
+		ret = check_match(ematch, &mtpar);
+		if (ret != 0)
+			goto cleanup_matches;
+		++j;
+	}
+
+	ret = check_target(e, net, name);
+	if (ret)
+		goto cleanup_matches;
+	return 0;
+
+ cleanup_matches:
+	/* undo only the first j matches */
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		cleanup_match(ematch, net);
+	}
+	return ret;
+}
+
+/*
+ * Turn a compat rule blob into a native table.
+ *
+ * @net:          network namespace, passed to the per-entry checks
+ * @name:         table name
+ * @valid_hooks:  bitmask of hooks that must have an entry and an underflow
+ * @pinfo:        in: table info holding the compat blob; out (on success):
+ *                newly allocated table info holding the native blob
+ * @pentry0:      in: start of the compat blob; out (on success): start of
+ *                the native blob
+ * @total_size:   size of the compat blob in bytes
+ * @number:       number of entries userspace claims the blob contains
+ * @hook_entries: hook entry offsets requested by userspace
+ * @underflows:   underflow offsets requested by userspace
+ *
+ * Returns 0 on success; the old *pinfo has then been freed and replaced.
+ * On error a negative errno is returned, *pinfo and *pentry0 are left
+ * untouched and still belong to the caller.
+ */
+static int
+translate_compat_table(struct net *net,
+		       const char *name,
+		       unsigned int valid_hooks,
+		       struct xt_table_info **pinfo,
+		       void **pentry0,
+		       unsigned int total_size,
+		       unsigned int number,
+		       unsigned int *hook_entries,
+		       unsigned int *underflows)
+{
+	unsigned int i, j;
+	struct xt_table_info *newinfo, *info;
+	void *pos, *entry0, *entry1;
+	struct compat_ipt_entry *iter0;
+	struct ipt_entry *iter1;
+	unsigned int size;
+	int ret;
+
+	info = *pinfo;
+	entry0 = *pentry0;
+	size = total_size;
+	info->number = number;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		info->hook_entry[i] = 0xFFFFFFFF;
+		info->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_compat_table: size %u\n", info->size);
+	/* j counts the compat entries that passed the first pass */
+	j = 0;
+	xt_compat_lock(AF_INET);
+	xt_compat_init_offsets(AF_INET, number);
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+							entry0,
+							entry0 + total_size,
+							hook_entries,
+							underflows,
+							name);
+		if (ret != 0)
+			goto out_unlock;
+		++j;
+	}
+
+	ret = -EINVAL;
+	if (j != number) {
+		duprintf("translate_compat_table: %u not %u entries\n",
+			 j, number);
+		goto out_unlock;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(valid_hooks & (1 << i)))
+			continue;
+		if (info->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, hook_entries[i]);
+			goto out_unlock;
+		}
+		if (info->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, underflows[i]);
+			goto out_unlock;
+		}
+	}
+
+	/* size now includes what the first pass added per entry */
+	ret = -ENOMEM;
+	newinfo = xt_alloc_table_info(size);
+	if (!newinfo)
+		goto out_unlock;
+
+	newinfo->number = number;
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = info->hook_entry[i];
+		newinfo->underflow[i] = info->underflow[i];
+	}
+	entry1 = newinfo->entries[raw_smp_processor_id()];
+	pos = entry1;
+	size = total_size;
+	/* Second pass: convert every compat entry into the native blob. */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		ret = compat_copy_entry_from_user(iter0, &pos, &size,
+						  name, newinfo, entry1);
+		if (ret != 0)
+			break;
+	}
+	xt_compat_flush_offsets(AF_INET);
+	xt_compat_unlock(AF_INET);
+	if (ret)
+		goto free_newinfo;
+
+	ret = -ELOOP;
+	if (!mark_source_chains(newinfo, valid_hooks, entry1))
+		goto free_newinfo;
+
+	/* i counts the native entries that passed compat_check_entry() */
+	i = 0;
+	xt_entry_foreach(iter1, entry1, newinfo->size) {
+		ret = compat_check_entry(iter1, net, name);
+		if (ret != 0)
+			break;
+		++i;
+		if (strcmp(ipt_get_target(iter1)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+	if (ret) {
+		/*
+		 * The first i matches need cleanup_entry (calls ->destroy)
+		 * because they had called ->check already. The other j-i
+		 * entries need only release.
+		 */
+		int skip = i;
+		j -= i;
+		/*
+		 * NOTE(review): this walk is over the compat blob (entry0)
+		 * but is bounded by newinfo->size, while the other walks of
+		 * entry0 in this function use total_size - confirm intended.
+		 */
+		xt_entry_foreach(iter0, entry0, newinfo->size) {
+			if (skip-- > 0)
+				continue;
+			if (j-- == 0)
+				break;
+			compat_release_entry(iter0);
+		}
+		xt_entry_foreach(iter1, entry1, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter1, net);
+		}
+		xt_free_table_info(newinfo);
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for_each_possible_cpu(i)
+		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+			memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+	/* Success: hand the native table back and drop the compat one. */
+	*pinfo = newinfo;
+	*pentry0 = entry1;
+	xt_free_table_info(info);
+	return 0;
+
+free_newinfo:
+	xt_free_table_info(newinfo);
+out:
+	/* release the first j compat entries that passed the first pass */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		if (j-- == 0)
+			break;
+		compat_release_entry(iter0);
+	}
+	return ret;
+out_unlock:
+	/* error while the compat lock is still held: drop it, then unwind */
+	xt_compat_flush_offsets(AF_INET);
+	xt_compat_unlock(AF_INET);
+	goto out;
+}
+
+/*
+ * IPT_SO_SET_REPLACE for compat callers: read the compat header and rule
+ * blob from @user, convert the blob to native layout with
+ * translate_compat_table() and install it via __do_replace().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+compat_do_replace(struct net *net, void __user *user, unsigned int len)
+{
+	int ret;
+	struct compat_ipt_replace tmp;
+	struct xt_table_info *newinfo;
+	void *loc_cpu_entry;
+	struct ipt_entry *iter;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* overflow check */
+	if (tmp.size >= INT_MAX / num_possible_cpus())
+		return -ENOMEM;
+	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	/* the name came from userspace: force termination */
+	tmp.name[sizeof(tmp.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(tmp.size);
+	if (!newinfo)
+		return -ENOMEM;
+
+	/* choose the copy that is on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+			   tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	/*
+	 * On success newinfo and loc_cpu_entry are replaced by the native
+	 * table and its blob; on failure both are left as they were, so
+	 * free_newinfo below releases the original allocation.
+	 */
+	ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
+				     &newinfo, &loc_cpu_entry, tmp.size,
+				     tmp.num_entries, tmp.hook_entry,
+				     tmp.underflow);
+	if (ret != 0)
+		goto free_newinfo;
+
+	duprintf("compat_do_replace: Translated table\n");
+
+	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
+			   tmp.num_counters, compat_ptr(tmp.counters));
+	if (ret)
+		goto free_newinfo_untrans;
+	return 0;
+
+ free_newinfo_untrans:
+	/* translated but not installed: undo the per-entry setup first */
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * setsockopt() entry point for compat callers.  Requires CAP_NET_ADMIN and
+ * dispatches on @cmd; unknown commands yield -EINVAL.
+ */
+static int
+compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
+		      unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == IPT_SO_SET_REPLACE)
+		return compat_do_replace(sock_net(sk), user, len);
+
+	if (cmd == IPT_SO_SET_ADD_COUNTERS)
+		return do_add_counters(sock_net(sk), user, len, 1);
+
+	duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * Request header read from a compat caller by compat_get_entries(): table
+ * name and the size the caller expects, followed by the buffer the entries
+ * are written to.
+ */
+struct compat_ipt_get_entries {
+	char name[XT_TABLE_MAXNAMELEN];
+	compat_uint_t size;	/* compared against the compat table size */
+	struct compat_ipt_entry entrytable[0];
+};
+
+/*
+ * Dump every rule of @table to @userptr in compat layout, together with a
+ * snapshot of the counters.
+ *
+ * @total_size: size in bytes of the kernel-side rule blob to walk
+ * @table:      table to dump
+ * @userptr:    user buffer receiving the compat entries
+ *
+ * Returns 0 on success, the alloc_counters() error, or the first error
+ * returned by compat_copy_entry_to_user().
+ */
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+			    void __user *userptr)
+{
+	struct xt_counters *counters;
+	const struct xt_table_info *private = table->private;
+	void __user *pos;
+	unsigned int size;
+	int ret = 0;
+	const void *loc_cpu_entry;
+	unsigned int i = 0;
+	struct ipt_entry *iter;
+
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	/* choose the copy that is on our node/cpu, ...
+	 * This choice is lazy (because current thread is
+	 * allowed to migrate to another cpu)
+	 */
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	pos = userptr;
+	size = total_size;
+	/* i is the rule index into the counter snapshot */
+	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+		ret = compat_copy_entry_to_user(iter, &pos,
+						&size, counters, i++);
+		if (ret != 0)
+			break;
+	}
+
+	vfree(counters);
+	return ret;
+}
+
+/*
+ * IPT_SO_GET_ENTRIES for compat callers.
+ *
+ * @net:  network namespace the table is looked up in
+ * @uptr: user buffer: request header followed by room for the entries
+ * @len:  points at the total length of @uptr
+ *
+ * Returns 0 on success, -EINVAL if *@len does not match the header,
+ * -EFAULT if the header cannot be read, -EAGAIN if the size the caller
+ * expects differs from the table's compat size, the compat_table_info()
+ * error, or the lookup error (-ENOENT when no table was returned).
+ */
+static int
+compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
+		   int *len)
+{
+	int ret;
+	struct compat_ipt_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get)) {
+		duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+
+	if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
+		duprintf("compat_get_entries: %u != %zu\n",
+			 *len, sizeof(get) + get.size);
+		return -EINVAL;
+	}
+
+	/* the compat lock is taken before, and dropped after, the table lock */
+	xt_compat_lock(AF_INET);
+	t = xt_find_table_lock(net, AF_INET, get.name);
+	if (t && !IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+		struct xt_table_info info;
+		duprintf("t->private->number = %u\n", private->number);
+		/* info.size is the size the caller's get.size must match */
+		ret = compat_table_info(private, &info);
+		if (!ret && get.size == info.size) {
+			ret = compat_copy_entries_to_user(private->size,
+							  t, uptr->entrytable);
+		} else if (!ret) {
+			duprintf("compat_get_entries: I've got %u not %u!\n",
+				 private->size, get.size);
+			ret = -EAGAIN;
+		}
+		xt_compat_flush_offsets(AF_INET);
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	xt_compat_unlock(AF_INET);
+	return ret;
+}
+
+static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
+
+/*
+ * getsockopt() entry point for compat callers.  Requires CAP_NET_ADMIN.
+ * IPT_SO_GET_INFO and IPT_SO_GET_ENTRIES get compat handling; every other
+ * command is forwarded to do_ipt_get_ctl().
+ */
+static int
+compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == IPT_SO_GET_INFO)
+		return get_info(sock_net(sk), user, len, 1);
+
+	if (cmd == IPT_SO_GET_ENTRIES)
+		return compat_get_entries(sock_net(sk), user, len);
+
+	return do_ipt_get_ctl(sk, cmd, user, len);
+}
+#endif
+
+/*
+ * setsockopt() entry point for native callers.  Requires CAP_NET_ADMIN and
+ * dispatches on @cmd; unknown commands yield -EINVAL.
+ */
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == IPT_SO_SET_REPLACE)
+		return do_replace(sock_net(sk), user, len);
+
+	if (cmd == IPT_SO_SET_ADD_COUNTERS)
+		return do_add_counters(sock_net(sk), user, len, 0);
+
+	duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * getsockopt() entry point for native callers.  Requires CAP_NET_ADMIN.
+ *
+ * Handles IPT_SO_GET_INFO, IPT_SO_GET_ENTRIES and the two revision
+ * queries; any other command yields -EINVAL.
+ */
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_GET_INFO:
+		ret = get_info(sock_net(sk), user, len, 0);
+		break;
+
+	case IPT_SO_GET_ENTRIES:
+		ret = get_entries(sock_net(sk), user, len);
+		break;
+
+	case IPT_SO_GET_REVISION_MATCH:
+	case IPT_SO_GET_REVISION_TARGET: {
+		struct xt_get_revision rev;
+		int target;
+
+		if (*len != sizeof(rev)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		/* the name came from userspace: force termination */
+		rev.name[sizeof(rev.name)-1] = 0;
+
+		if (cmd == IPT_SO_GET_REVISION_TARGET)
+			target = 1;
+		else
+			target = 0;
+
+		/*
+		 * The value of the macro expression is discarded; ret is
+		 * presumably filled in by xt_find_revision() through its
+		 * last argument - confirm against its definition.
+		 */
+		try_then_request_module(xt_find_revision(AF_INET, rev.name,
+							 rev.revision,
+							 target, &ret),
+					"ipt_%s", rev.name);
+		break;
+	}
+
+	default:
+		duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/*
+ * Register @table in @net with the initial ruleset described by @repl.
+ *
+ * The rules in @repl are copied into a freshly allocated table info,
+ * translated, and handed to xt_register_table().
+ *
+ * Returns the registered table, or an ERR_PTR() carrying -ENOMEM, the
+ * translate_table() error or the xt_register_table() error.
+ */
+struct xt_table *ipt_register_table(struct net *net,
+				    const struct xt_table *table,
+				    const struct ipt_replace *repl)
+{
+	int ret;
+	struct xt_table_info *newinfo;
+	struct xt_table_info bootstrap = {0};
+	void *loc_cpu_entry;
+	struct xt_table *new_table;
+
+	newinfo = xt_alloc_table_info(repl->size);
+	if (!newinfo) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* choose the copy on our node/cpu, but don't care about preemption */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
+
+	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
+	if (ret != 0)
+		goto out_free;
+
+	new_table = xt_register_table(net, table, &bootstrap, newinfo);
+	if (IS_ERR(new_table)) {
+		ret = PTR_ERR(new_table);
+		goto out_free;
+	}
+
+	return new_table;
+
+out_free:
+	xt_free_table_info(newinfo);
+out:
+	return ERR_PTR(ret);
+}
+
+/*
+ * Unregister @table and release its ruleset: run cleanup_entry() on every
+ * rule, drop the owner reference when the table holds more rules than its
+ * initial set, and free the table info.
+ */
+void ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+	/* read the owner first; @table is not touched after unregistering */
+	struct module *owner = table->me;
+	struct xt_table_info *info = xt_unregister_table(table);
+	void *blob = info->entries[raw_smp_processor_id()];
+	struct ipt_entry *e;
+
+	/* Decrease module usage counts and free resources */
+	xt_entry_foreach(e, blob, info->size) {
+		cleanup_entry(e, net);
+	}
+
+	if (info->number > info->initial_entries)
+		module_put(owner);
+
+	xt_free_table_info(info);
+}
+
+/*
+ * Decide whether an ICMP (type, code) pair is selected by a rule.
+ *
+ * A rule type of 0xFF selects everything; otherwise the type must be equal
+ * and the code must lie in [min_code, max_code].  @invert flips the result.
+ */
+static inline bool
+icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+		     u_int8_t type, u_int8_t code,
+		     bool invert)
+{
+	bool hit;
+
+	if (test_type == 0xFF)
+		hit = true;
+	else
+		hit = type == test_type &&
+		      min_code <= code && code <= max_code;
+
+	return hit != invert;
+}
+
+/*
+ * Match callback of the built-in "icmp" match.
+ *
+ * Fragments (par->fragoff != 0) never match.  If the ICMP header cannot be
+ * obtained from @skb, par->hotdrop is set and false is returned.  Otherwise
+ * the packet's type/code are compared with the rule's via
+ * icmp_type_code_match().
+ */
+static bool
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct icmphdr *ic;
+	struct icmphdr _icmph;
+	const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+	/* Must not be a fragment. */
+	if (par->fragoff != 0)
+		return false;
+
+	/* _icmph is scratch space skb_header_pointer() may use */
+	ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+	if (ic == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("Dropping evil ICMP tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	/* code[0]..code[1] is the accepted code range */
+	return icmp_type_code_match(icmpinfo->type,
+				    icmpinfo->code[0],
+				    icmpinfo->code[1],
+				    ic->type, ic->code,
+				    !!(icmpinfo->invflags&IPT_ICMP_INV));
+}
+
+/*
+ * checkentry callback of the built-in "icmp" match: reject rules that set
+ * any inversion flag other than IPT_ICMP_INV.
+ */
+static int icmp_checkentry(const struct xt_mtchk_param *par)
+{
+	const struct ipt_icmp *info = par->matchinfo;
+
+	/* Must specify no unknown invflags */
+	if (info->invflags & ~IPT_ICMP_INV)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Targets provided by this file itself: the standard (verdict) target and
+ * the error target.  Registered in ip_tables_init().
+ */
+static struct xt_target ipt_builtin_tg[] __read_mostly = {
+	{
+		.name             = XT_STANDARD_TARGET,
+		.targetsize       = sizeof(int),
+		.family           = NFPROTO_IPV4,
+#ifdef CONFIG_COMPAT
+		/* the verdict is a compat_int_t for compat callers */
+		.compatsize       = sizeof(compat_int_t),
+		.compat_from_user = compat_standard_from_user,
+		.compat_to_user   = compat_standard_to_user,
+#endif
+	},
+	{
+		.name             = XT_ERROR_TARGET,
+		.target           = ipt_error,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
+		.family           = NFPROTO_IPV4,
+	},
+};
+
+/*
+ * Socket option hooks for PF_INET: the set/get option ranges handled by
+ * this file and the handlers serving them (plus their compat variants when
+ * CONFIG_COMPAT is set).  Registered in ip_tables_init().
+ */
+static struct nf_sockopt_ops ipt_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= IPT_BASE_CTL,
+	.set_optmax	= IPT_SO_SET_MAX+1,
+	.set		= do_ipt_set_ctl,
+#ifdef CONFIG_COMPAT
+	.compat_set	= compat_do_ipt_set_ctl,
+#endif
+	.get_optmin	= IPT_BASE_CTL,
+	.get_optmax	= IPT_SO_GET_MAX+1,
+	.get		= do_ipt_get_ctl,
+#ifdef CONFIG_COMPAT
+	.compat_get	= compat_do_ipt_get_ctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Matches provided by this file itself: only "icmp", restricted to
+ * IPPROTO_ICMP.  Registered in ip_tables_init().
+ */
+static struct xt_match ipt_builtin_mt[] __read_mostly = {
+	{
+		.name       = "icmp",
+		.match      = icmp_match,
+		.matchsize  = sizeof(struct ipt_icmp),
+		.checkentry = icmp_checkentry,
+		.proto      = IPPROTO_ICMP,
+		.family     = NFPROTO_IPV4,
+	},
+};
+
+/* Per-namespace setup: xt_proto_init() for NFPROTO_IPV4 in @net. */
+static int __net_init ip_tables_net_init(struct net *net)
+{
+	return xt_proto_init(net, NFPROTO_IPV4);
+}
+
+/* Per-namespace teardown: the xt_proto_fini() counterpart of the above. */
+static void __net_exit ip_tables_net_exit(struct net *net)
+{
+	xt_proto_fini(net, NFPROTO_IPV4);
+}
+
+/* Hooked up by ip_tables_init() via register_pernet_subsys(). */
+static struct pernet_operations ip_tables_net_ops = {
+	.init = ip_tables_net_init,
+	.exit = ip_tables_net_exit,
+};
+
+/*
+ * Module init: register the per-namespace operations, the built-in targets
+ * and matches, and finally the socket option handlers.
+ *
+ * Returns 0 on success.  If a step fails, the steps that already succeeded
+ * are undone in reverse order and the error is returned.
+ */
+static int __init ip_tables_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&ip_tables_net_ops);
+	if (ret < 0)
+		goto err1;
+
+	/* No one else will be downing sem now, so we won't sleep */
+	ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+	if (ret < 0)
+		goto err2;
+	ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+	if (ret < 0)
+		goto err4;
+
+	/* Register setsockopt */
+	ret = nf_register_sockopt(&ipt_sockopts);
+	if (ret < 0)
+		goto err5;
+
+	pr_info("(C) 2000-2006 Netfilter Core Team\n");
+	return 0;
+
+	/* unwind ladder; there is no err3 label */
+err5:
+	xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+err4:
+	xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+err2:
+	unregister_pernet_subsys(&ip_tables_net_ops);
+err1:
+	return ret;
+}
+
+/*
+ * Module exit: undo ip_tables_init() in reverse order - socket options,
+ * matches, targets, then the per-namespace operations.
+ */
+static void __exit ip_tables_fini(void)
+{
+	nf_unregister_sockopt(&ipt_sockopts);
+
+	xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+	xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
+	unregister_pernet_subsys(&ip_tables_net_ops);
+}
+
+EXPORT_SYMBOL(ipt_register_table);
+EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_do_table);
+module_init(ip_tables_init);
+module_exit(ip_tables_fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 00000000..5c9e97c7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,746 @@
+/* Cluster IP hashmark target
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/net_namespace.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+
+#define CLUSTERIP_VERSION "0.8"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
+
+struct clusterip_config {
+	struct list_head list;			/* link in clusterip_configs */
+	atomic_t refcount;			/* freed via RCU at zero */
+	atomic_t entries;			/* number of entries/rules
+						 * referencing us */
+
+	__be32 clusterip;			/* the shared cluster IP */
+	u_int8_t clustermac[ETH_ALEN];		/* the cluster MAC address */
+	struct net_device *dev;			/* device joined to clustermac */
+	u_int16_t num_total_nodes;		/* total number of nodes */
+	unsigned long local_nodes;		/* bitmap: bit n-1 = node n */
+
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;		/* proc dir entry */
+#endif
+	enum clusterip_hashmode hash_mode;	/* which hashing mode */
+	u_int32_t hash_initval;			/* hash initialization */
+	struct rcu_head rcu;			/* for the call_rcu_bh() free */
+};
+
+static LIST_HEAD(clusterip_configs);
+
+/* clusterip_lock protects the clusterip_configs list */
+static DEFINE_SPINLOCK(clusterip_lock);
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+static struct proc_dir_entry *clusterip_procdir;
+#endif
+
+static inline void
+clusterip_config_get(struct clusterip_config *c)
+{
+	atomic_inc(&c->refcount);
+}
+
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+	kfree(container_of(head, struct clusterip_config, rcu));
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c)
+{
+	if (atomic_dec_and_test(&c->refcount))
+		call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
+}
+
+/* decrease the count of entries using/referencing this config.  If last
+ * entry(rule) is removed, remove the config from lists, but don't free it
+ * yet, since proc-files could still be holding references */
+static inline void
+clusterip_config_entry_put(struct clusterip_config *c)
+{
+	local_bh_disable();
+	if (!atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+		/* other rules still reference this config */
+		local_bh_enable();
+		return;
+	}
+
+	/* last rule gone: unlink under the lock, tear down after unlocking */
+	list_del_rcu(&c->list);
+	spin_unlock(&clusterip_lock);
+	local_bh_enable();
+
+	dev_mc_del(c->dev, c->clustermac);
+	dev_put(c->dev);
+#ifdef CONFIG_PROC_FS
+	/* open proc files hold their own refcount, so removal is safe */
+	remove_proc_entry(c->pde->name, c->pde->parent);
+#endif
+}
+
+static struct clusterip_config *
+__clusterip_config_find(__be32 clusterip)
+{
+	struct clusterip_config *c;
+
+	list_for_each_entry_rcu(c, &clusterip_configs, list) {
+		if (c->clusterip == clusterip)
+			return c;
+	}
+
+	return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(__be32 clusterip, int entry)
+{
+	struct clusterip_config *cfg;
+
+	rcu_read_lock_bh();
+	cfg = __clusterip_config_find(clusterip);
+	/* a refcount that already hit zero means the config is going away */
+	if (cfg && unlikely(!atomic_inc_not_zero(&cfg->refcount)))
+		cfg = NULL;
+	/* callers acting for a rule (entry != 0) also count as an entry */
+	if (cfg && entry)
+		atomic_inc(&cfg->entries);
+	rcu_read_unlock_bh();
+
+	return cfg;
+}
+
+static void
+clusterip_config_init_nodelist(struct clusterip_config *c,
+			       const struct ipt_clusterip_tgt_info *i)
+{
+	int n;
+
+	for (n = 0; n < i->num_local_nodes; n++)
+		set_bit(i->local_nodes[n] - 1, &c->local_nodes);
+}
+
+static struct clusterip_config *
+clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
+			struct net_device *dev)
+{
+	struct clusterip_config *c;
+	/* returns the new config, linked into clusterip_configs, or NULL */
+	c = kzalloc(sizeof(*c), GFP_ATOMIC);
+	if (!c)
+		return NULL;
+
+	c->dev = dev;
+	c->clusterip = ip;
+	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+	c->num_total_nodes = i->num_total_nodes;
+	clusterip_config_init_nodelist(c, i);	/* fill local_nodes bitmap */
+	c->hash_mode = i->hash_mode;
+	c->hash_initval = i->hash_initval;
+	atomic_set(&c->refcount, 1);	/* one reference for the caller */
+	atomic_set(&c->entries, 1);	/* and one rule entry */
+
+#ifdef CONFIG_PROC_FS
+	{
+		char buffer[16];	/* "255.255.255.255" plus NUL */
+
+		/* create the proc file, named after the dotted-quad IP */
+		sprintf(buffer, "%pI4", &ip);
+		c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
+					  clusterip_procdir,
+					  &clusterip_proc_fops, c);
+		if (!c->pde) {
+			kfree(c);
+			return NULL;
+		}
+	}
+#endif
+	/* publish the config to RCU readers of clusterip_configs */
+	spin_lock_bh(&clusterip_lock);
+	list_add_rcu(&c->list, &clusterip_configs);
+	spin_unlock_bh(&clusterip_lock);
+
+	return c;
+}
+
+#ifdef CONFIG_PROC_FS
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	/* mark nodenum as local: 0 on success, 1 if invalid or already set */
+	if (nodenum == 0 ||
+	    nodenum > c->num_total_nodes)
+		return 1;
+
+	/* check if we already have this number in our bitfield */
+	if (test_and_set_bit(nodenum - 1, &c->local_nodes))
+		return 1;
+
+	return 0;
+}
+
+static bool
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	if (nodenum == 0 ||
+	    nodenum > c->num_total_nodes)
+		return true;	/* failure: node number out of range */
+
+	if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
+		return false;	/* success: bit was set, now cleared */
+
+	return true;		/* failure: node was not marked local */
+}
+#endif
+
+static inline u_int32_t
+clusterip_hashfn(const struct sk_buff *skb,
+		 const struct clusterip_config *config)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	unsigned long hashval;
+	u_int16_t sport = 0, dport = 0;
+	int poff;
+	/* hash the packet to a node number using the config's hash mode */
+	poff = proto_ports_offset(iph->protocol);
+	if (poff >= 0) {
+		const u_int16_t *ports;
+		u16 _ports[2];
+
+		ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+		if (ports) {
+			sport = ports[0];	/* used as read, no ntohs() */
+			dport = ports[1];
+		}
+	} else {
+		if (net_ratelimit())	/* ports stay 0 for this packet */
+			pr_info("unknown protocol %u\n", iph->protocol);
+	}
+
+	switch (config->hash_mode) {
+	case CLUSTERIP_HASHMODE_SIP:
+		hashval = jhash_1word(ntohl(iph->saddr),
+				      config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT:
+		hashval = jhash_2words(ntohl(iph->saddr), sport,
+				       config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+		hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+				       config->hash_initval);
+		break;
+	default:
+		/* to make gcc happy */
+		hashval = 0;
+		/* This cannot happen, unless the check function wasn't called
+		 * at rule load time */
+		pr_info("unknown mode %u\n", config->hash_mode);
+		BUG();
+		break;
+	}
+
+	/* scale the hash to 0..n-1, then add 1: node numbers are 1..n */
+	return (((u64)hashval * config->num_total_nodes) >> 32) + 1;
+}
+
+static inline int
+clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
+{
+	return test_bit(hash - 1, &config->local_nodes);
+}
+
+/***********************************************************************
+ * IPTABLES TARGET
+ ***********************************************************************/
+
+static unsigned int
+clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	u_int32_t hash;
+
+	/* don't need to clusterip_config_get() here, since refcount
+	 * is only decremented by destroy() - and ip_tables guarantees
+	 * that the ->target() function isn't called after ->destroy() */
+
+	/* no conntrack entry for this packet: drop it */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return NF_DROP;
+
+	/* special case: ICMP error handling. conntrack distinguishes between
+	 * error messages (RELATED) and information requests (see below) */
+	if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+		switch (ctinfo) {
+		case IP_CT_RELATED:
+		case IP_CT_RELATED_REPLY:
+			return XT_CONTINUE;
+		default:
+			break;
+		}
+	}
+
+	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
+	 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+	 * on, which all have an ID field [relevant for hashing]. */
+
+	hash = clusterip_hashfn(skb, cipinfo->config);
+
+	/* Only the first packet of a connection records the hash in the
+	 * conntrack mark; every other conntrack state leaves it untouched.
+	 *
+	 * FIXME: we don't handle expectations at the moment.  they can
+	 * arrive on a different node than the master connection (e.g. FTP
+	 * passive mode) */
+	if (ctinfo == IP_CT_NEW)
+		ct->mark = hash;
+
+#ifdef DEBUG
+	nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+	pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
+	if (clusterip_responsible(cipinfo->config, hash)) {
+		pr_debug("responsible\n");
+
+		/* despite being received via linklayer multicast, this is
+		 * actually a unicast IP packet. TCP doesn't like
+		 * PACKET_MULTICAST */
+		skb->pkt_type = PACKET_HOST;
+		return XT_CONTINUE;
+	}
+
+	pr_debug("not responsible\n");
+	return NF_DROP;
+}
+
+static int clusterip_tg_check(const struct xt_tgchk_param *par)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+	const struct ipt_entry *e = par->entryinfo;
+	struct clusterip_config *config;
+	struct net_device *dev;
+	int ret, i;
+
+	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+		pr_info("unknown mode %u\n", cipinfo->hash_mode);
+		return -EINVAL;
+	}
+	if (e->ip.dmsk.s_addr != htonl(0xffffffff) || e->ip.dst.s_addr == 0) {
+		pr_info("Please specify destination IP\n");
+		return -EINVAL;
+	}
+	/* userspace node list feeds set_bit(): bound count and bit numbers */
+	if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes))
+		return -EINVAL;
+	for (i = 0; i < cipinfo->num_local_nodes; i++)
+		if ((u_int16_t)(cipinfo->local_nodes[i] - 1) >= BITS_PER_LONG)
+			return -EINVAL;
+	/* reuse the config of this cluster IP, or create it when 'new' */
+	config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+	if (!config) {
+		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+			pr_info("no config found for %pI4, need 'new'\n",
+				&e->ip.dst.s_addr);
+			return -EINVAL;
+		}
+		if (e->ip.iniface[0] == '\0') {
+			pr_info("Please specify an interface name\n");
+			return -EINVAL;
+		}
+		dev = dev_get_by_name(&init_net, e->ip.iniface);
+		if (!dev) {
+			pr_info("no such interface %s\n", e->ip.iniface);
+			return -ENOENT;
+		}
+		config = clusterip_config_init(cipinfo, e->ip.dst.s_addr, dev);
+		if (!config) {
+			pr_info("cannot allocate config\n");
+			dev_put(dev);
+			return -ENOMEM;
+		}
+		dev_mc_add(config->dev, config->clustermac);
+	}
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0) {
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+		/* check failed: release the references taken above */
+		clusterip_config_entry_put(config);
+		clusterip_config_put(config);
+		return ret;
+	}
+	cipinfo->config = config;
+	return ret;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
+
+	/* if no more entries are referencing the config, remove it
+	 * from the list and destroy the proc entry */
+	clusterip_config_entry_put(cipinfo->config);
+
+	clusterip_config_put(cipinfo->config);
+
+	nf_ct_l3proto_module_put(par->family);
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ipt_clusterip_tgt_info
+{
+	u_int32_t	flags;
+	u_int8_t	clustermac[6];
+	u_int16_t	num_total_nodes;
+	u_int16_t	num_local_nodes;
+	u_int16_t	local_nodes[CLUSTERIP_MAX_NODES];
+	u_int32_t	hash_mode;
+	u_int32_t	hash_initval;
+	compat_uptr_t	config;
+};
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target clusterip_tg_reg __read_mostly = {
+	.name		= "CLUSTERIP",
+	.family		= NFPROTO_IPV4,
+	.target		= clusterip_tg,
+	.checkentry	= clusterip_tg_check,
+	.destroy	= clusterip_tg_destroy,
+	.targetsize	= sizeof(struct ipt_clusterip_tgt_info),
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct compat_ipt_clusterip_tgt_info),
+#endif /* CONFIG_COMPAT */
+	.me		= THIS_MODULE
+};
+
+
+/***********************************************************************
+ * ARP MANGLING CODE
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+	u_int8_t src_hw[ETH_ALEN];
+	__be32 src_ip;
+	u_int8_t dst_hw[ETH_ALEN];
+	__be32 dst_ip;
+} __packed;
+
+#ifdef DEBUG
+static void arp_print(struct arp_payload *payload)
+{
+#define HBUFFERLEN 30
+	char hbuffer[HBUFFERLEN];
+	int j,k;
+
+	for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+		hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
+		hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
+		hbuffer[k++]=':';
+	}
+	hbuffer[--k]='\0';
+
+	pr_debug("src %pI4@%s, dst %pI4\n",
+		 &payload->src_ip, hbuffer, &payload->dst_ip);
+}
+#endif
+
+static unsigned int
+arp_mangle(unsigned int hook,
+	   struct sk_buff *skb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
+{
+	struct arphdr *arp = arp_hdr(skb);
+	struct arp_payload *payload;
+	struct clusterip_config *c;
+	/* rewrites the sender MAC of cluster-IP ARP; always NF_ACCEPT */
+	/* we don't care about non-ethernet and non-ipv4 ARP */
+	if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+	    arp->ar_pro != htons(ETH_P_IP) ||
+	    arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+		return NF_ACCEPT;
+
+	/* we only want to mangle arp requests and replies */
+	if (arp->ar_op != htons(ARPOP_REPLY) &&
+	    arp->ar_op != htons(ARPOP_REQUEST))
+		return NF_ACCEPT;
+
+	payload = (void *)(arp+1);	/* addresses follow the ARP header */
+
+	/* if there is no clusterip configuration for the arp reply's
+	 * source ip, we don't want to mangle it */
+	c = clusterip_config_find_get(payload->src_ip, 0);
+	if (!c)
+		return NF_ACCEPT;
+
+	/* normally the linux kernel always replies to arp queries of
+	 * addresses on different interfaces.  However, in the CLUSTERIP case
+	 * this wouldn't work, since we didn't subscribe the mcast group on
+	 * other interfaces */
+	if (c->dev != out) {
+		pr_debug("not mangling arp reply on different "
+			 "interface: cip'%s'-skb'%s'\n",
+			 c->dev->name, out->name);
+		clusterip_config_put(c);
+		return NF_ACCEPT;
+	}
+
+	/* mangle reply hardware address */
+	memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef DEBUG
+	pr_debug("mangled arp reply: ");
+	arp_print(payload);
+#endif
+
+	clusterip_config_put(c);	/* drop the find_get() reference */
+
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops __read_mostly = {
+	.hook = arp_mangle,
+	.pf = NFPROTO_ARP,
+	.hooknum = NF_ARP_OUT,
+	.priority = -1
+};
+
+/***********************************************************************
+ * PROC DIR HANDLING
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+struct clusterip_seq_position {
+	unsigned int pos;	/* position */
+	unsigned int weight;	/* number of bits set == size */
+	unsigned int bit;	/* current bit */
+	unsigned long val;	/* current value */
+};
+
+static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct clusterip_config *c = s->private;
+	unsigned int weight;
+	u_int32_t local_nodes;
+	struct clusterip_seq_position *idx;
+
+	/* FIXME: possible race */
+	local_nodes = c->local_nodes;
+	weight = hweight32(local_nodes);
+	if (*pos >= weight)
+		return NULL;
+
+	idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
+	if (!idx)
+		return ERR_PTR(-ENOMEM);
+
+	idx->pos = *pos;
+	idx->weight = weight;
+	idx->bit = ffs(local_nodes);
+	idx->val = local_nodes;
+	clear_bit(idx->bit - 1, &idx->val);
+
+	return idx;
+}
+
+static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct clusterip_seq_position *idx = v;
+
+	*pos = ++idx->pos;
+	if (*pos >= idx->weight) {
+		kfree(v);
+		return NULL;
+	}
+	idx->bit = ffs(idx->val);
+	clear_bit(idx->bit - 1, &idx->val);
+	return idx;
+}
+
+static void clusterip_seq_stop(struct seq_file *s, void *v)
+{
+	if (!IS_ERR(v))
+		kfree(v);
+}
+
+static int clusterip_seq_show(struct seq_file *s, void *v)
+{
+	struct clusterip_seq_position *idx = v;
+
+	if (idx->pos != 0)
+		seq_putc(s, ',');
+
+	seq_printf(s, "%u", idx->bit);
+
+	if (idx->pos == idx->weight - 1)
+		seq_putc(s, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations clusterip_seq_ops = {
+	.start	= clusterip_seq_start,
+	.next	= clusterip_seq_next,
+	.stop	= clusterip_seq_stop,
+	.show	= clusterip_seq_show,
+};
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &clusterip_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+		struct clusterip_config *c = PDE(inode)->data;
+
+		sf->private = c;
+
+		clusterip_config_get(c);
+	}
+
+	return ret;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+	struct clusterip_config *c = PDE(inode)->data;
+	int ret;
+
+	ret = seq_release(inode, file);
+
+	if (!ret)
+		clusterip_config_put(c);
+
+	return ret;
+}
+
+static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
+				size_t size, loff_t *ofs)
+{
+	struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
+#define PROC_WRITELEN	10
+	char buffer[PROC_WRITELEN+1];
+	unsigned long nodenum;
+
+	if (size > PROC_WRITELEN)
+		return -EIO;
+	if (copy_from_user(buffer, input, size))
+		return -EFAULT;
+	buffer[size] = 0;
+
+	/* accepted input is "+N" (add local node N) or "-N" (remove it) */
+	if (*buffer != '+' && *buffer != '-')
+		return -EIO;
+	nodenum = simple_strtoul(buffer + 1, NULL, 10);
+	/* reject large values before the helpers narrow them to u_int16_t */
+	if (nodenum > c->num_total_nodes)
+		return *buffer == '+' ? -ENOMEM : -ENOENT;
+	if (*buffer == '+' && clusterip_add_node(c, nodenum))
+		return -ENOMEM;
+	if (*buffer == '-' && clusterip_del_node(c, nodenum))
+		return -ENOENT;
+	return size;
+}
+
+static const struct file_operations clusterip_proc_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = clusterip_proc_open,
+	.read	 = seq_read,
+	.write	 = clusterip_proc_write,
+	.llseek	 = seq_lseek,
+	.release = clusterip_proc_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int __init clusterip_tg_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&clusterip_tg_reg);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hook(&cip_arp_ops);
+	if (ret < 0)
+		goto cleanup_target;
+
+#ifdef CONFIG_PROC_FS
+	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
+	if (!clusterip_procdir) {
+		pr_err("Unable to proc dir entry\n");
+		ret = -ENOMEM;
+		goto cleanup_hook;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	pr_info("ClusterIP Version %s loaded successfully\n",
+		CLUSTERIP_VERSION);
+	return 0;
+
+#ifdef CONFIG_PROC_FS
+cleanup_hook:
+	nf_unregister_hook(&cip_arp_ops);
+#endif /* CONFIG_PROC_FS */
+cleanup_target:
+	xt_unregister_target(&clusterip_tg_reg);
+	return ret;
+}
+
+static void __exit clusterip_tg_exit(void)
+{
+	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
+#endif
+	nf_unregister_hook(&cip_arp_ops);
+	xt_unregister_target(&clusterip_tg_reg);
+
+	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
+	rcu_barrier_bh();
+}
+
+module_init(clusterip_tg_init);
+module_exit(clusterip_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
new file mode 100644
index 00000000..d76d6c9e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -0,0 +1,516 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ipt_LOG.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
+
+/* One level of recursion won't kill us */
+static void dump_packet(struct sbuff *m,
+			const struct nf_loginfo *info,
+			const struct sk_buff *skb,
+			unsigned int iphoff)
+{
+	struct iphdr _iph;
+	const struct iphdr *ih;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
+
+	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+	if (ih == NULL) {
+		sb_add(m, "TRUNCATED");
+		return;
+	}
+
+	/* Important fields:
+	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+	sb_add(m, "SRC=%pI4 DST=%pI4 ",
+	       &ih->saddr, &ih->daddr);
+
+	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+	sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+	/* Max length: 6 "CE DF MF " */
+	if (ntohs(ih->frag_off) & IP_CE)
+		sb_add(m, "CE ");
+	if (ntohs(ih->frag_off) & IP_DF)
+		sb_add(m, "DF ");
+	if (ntohs(ih->frag_off) & IP_MF)
+		sb_add(m, "MF ");
+
+	/* Max length: 11 "FRAG:65535 " */
+	if (ntohs(ih->frag_off) & IP_OFFSET)
+		sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+	if ((logflags & IPT_LOG_IPOPT) &&
+	    ih->ihl * 4 > sizeof(struct iphdr)) {
+		const unsigned char *op;
+		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+		unsigned int i, optsize;
+
+		optsize = ih->ihl * 4 - sizeof(struct iphdr);
+		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+					optsize, _opt);
+		if (op == NULL) {
+			sb_add(m, "TRUNCATED");
+			return;
+		}
+
+		/* Max length: 127 "OPT (" 15*4*2chars ") " */
+		sb_add(m, "OPT (");
+		for (i = 0; i < optsize; i++)
+			sb_add(m, "%02X", op[i]);
+		sb_add(m, ") ");
+	}
+
+	switch (ih->protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr _tcph;
+		const struct tcphdr *th;
+
+		/* Max length: 10 "PROTO=TCP " */
+		sb_add(m, "PROTO=TCP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					sizeof(_tcph), &_tcph);
+		if (th == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 20 "SPT=65535 DPT=65535 " */
+		sb_add(m, "SPT=%u DPT=%u ",
+		       ntohs(th->source), ntohs(th->dest));
+		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+		if (logflags & IPT_LOG_TCPSEQ)
+			sb_add(m, "SEQ=%u ACK=%u ",
+			       ntohl(th->seq), ntohl(th->ack_seq));
+		/* Max length: 13 "WINDOW=65535 " */
+		sb_add(m, "WINDOW=%u ", ntohs(th->window));
+		/* Max length: 9 "RES=0x3F " */
+		sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+		if (th->cwr)
+			sb_add(m, "CWR ");
+		if (th->ece)
+			sb_add(m, "ECE ");
+		if (th->urg)
+			sb_add(m, "URG ");
+		if (th->ack)
+			sb_add(m, "ACK ");
+		if (th->psh)
+			sb_add(m, "PSH ");
+		if (th->rst)
+			sb_add(m, "RST ");
+		if (th->syn)
+			sb_add(m, "SYN ");
+		if (th->fin)
+			sb_add(m, "FIN ");
+		/* Max length: 11 "URGP=65535 " */
+		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+		if ((logflags & IPT_LOG_TCPOPT) &&
+		    th->doff * 4 > sizeof(struct tcphdr)) {
+			unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
+			const unsigned char *op;
+			unsigned int i, optsize;
+
+			optsize = th->doff * 4 - sizeof(struct tcphdr);
+			op = skb_header_pointer(skb,
+						iphoff+ih->ihl*4+sizeof(_tcph),
+						optsize, _opt);
+			if (op == NULL) {
+				sb_add(m, "TRUNCATED");
+				return;
+			}
+
+			/* Max length: 127 "OPT (" 15*4*2chars ") " */
+			sb_add(m, "OPT (");
+			for (i = 0; i < optsize; i++)
+				sb_add(m, "%02X", op[i]);
+			sb_add(m, ") ");
+		}
+		break;
+	}
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE: {
+		struct udphdr _udph;
+		const struct udphdr *uh;
+
+		if (ih->protocol == IPPROTO_UDP)
+			/* Max length: 10 "PROTO=UDP "     */
+			sb_add(m, "PROTO=UDP " );
+		else	/* Max length: 14 "PROTO=UDPLITE " */
+			sb_add(m, "PROTO=UDPLITE ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_udph), &_udph);
+		if (uh == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 30 "SPT=65535 DPT=65535 LEN=65535 " */
+		sb_add(m, "SPT=%u DPT=%u LEN=%u ",
+		       ntohs(uh->source), ntohs(uh->dest),
+		       ntohs(uh->len));
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr _icmph;
+		const struct icmphdr *ich;
+		static const size_t required_len[NR_ICMP_TYPES+1]
+			= { [ICMP_ECHOREPLY] = 4,
+			    [ICMP_DEST_UNREACH]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_SOURCE_QUENCH]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_REDIRECT]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_ECHO] = 4,
+			    [ICMP_TIME_EXCEEDED]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_PARAMETERPROB]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_TIMESTAMP] = 20,
+			    [ICMP_TIMESTAMPREPLY] = 20,
+			    [ICMP_ADDRESS] = 12,
+			    [ICMP_ADDRESSREPLY] = 12 };
+
+		/* Max length: 11 "PROTO=ICMP " */
+		sb_add(m, "PROTO=ICMP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					 sizeof(_icmph), &_icmph);
+		if (ich == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 18 "TYPE=255 CODE=255 " */
+		sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		if (ich->type <= NR_ICMP_TYPES &&
+		    required_len[ich->type] &&
+		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+			sb_add(m, "INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		switch (ich->type) {
+		case ICMP_ECHOREPLY:
+		case ICMP_ECHO:
+			/* Max length: 19 "ID=65535 SEQ=65535 " */
+			sb_add(m, "ID=%u SEQ=%u ",
+			       ntohs(ich->un.echo.id),
+			       ntohs(ich->un.echo.sequence));
+			break;
+
+		case ICMP_PARAMETERPROB:
+			/* Max length: 14 "PARAMETER=255 " */
+			sb_add(m, "PARAMETER=%u ",
+			       ntohl(ich->un.gateway) >> 24);
+			break;
+		case ICMP_REDIRECT:
+			/* Max length: 24 "GATEWAY=255.255.255.255 " */
+			sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+			/* Fall through */
+		case ICMP_DEST_UNREACH:
+		case ICMP_SOURCE_QUENCH:
+		case ICMP_TIME_EXCEEDED:
+			/* Max length: 3+maxlen */
+			if (!iphoff) { /* Only recurse once. */
+				sb_add(m, "[");
+				dump_packet(m, info, skb,
+					    iphoff + ih->ihl*4+sizeof(_icmph));
+				sb_add(m, "] ");
+			}
+
+			/* Max length: 10 "MTU=65535 " */
+			if (ich->type == ICMP_DEST_UNREACH &&
+			    ich->code == ICMP_FRAG_NEEDED)
+				sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
+		}
+		break;
+	}
+	/* Max Length */
+	case IPPROTO_AH: {
+		struct ip_auth_hdr _ahdr;
+		const struct ip_auth_hdr *ah;
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 9 "PROTO=AH " */
+		sb_add(m, "PROTO=AH ");
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_ahdr), &_ahdr);
+		if (ah == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
+		break;
+	}
+	case IPPROTO_ESP: {
+		struct ip_esp_hdr _esph;
+		const struct ip_esp_hdr *eh;
+
+		/* Max length: 10 "PROTO=ESP " */
+		sb_add(m, "PROTO=ESP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_esph), &_esph);
+		if (eh == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
+		break;
+	}
+	/* Max length: 10 "PROTO=255 " */
+	default:
+		sb_add(m, "PROTO=%u ", ih->protocol);
+	}
+
+	/* Max length: 30 "UID=4294967295 GID=4294967295 " */
+	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+			sb_add(m, "UID=%u GID=%u ",
+				skb->sk->sk_socket->file->f_cred->fsuid,
+				skb->sk->sk_socket->file->f_cred->fsgid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+	}
+
+	/* Max length: 16 "MARK=0xFFFFFFFF " */
+	if (!iphoff && skb->mark)
+		sb_add(m, "MARK=0x%x ", skb->mark);
+
+	/* Proto    Max log string length */
+	/* IP:      40+46+6+11+127 = 230 */
+	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */
+	/* UDP:     10+max(25,30) = 40 */
+	/* UDPLITE: 14+max(25,30) = 44 */
+	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+	/* ESP:     10+max(25)+15 = 50 */
+	/* AH:      9+max(25)+15 = 49 */
+	/* unknown: 10 */
+
+	/* (ICMP allows recursion one level deep) */
+	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */
+	/* maxlen = 230+   91  + 230 + 252 = 803 */
+}
+
+static void dump_mac_header(struct sbuff *m,
+			    const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+	/* append the link-layer header to m: decoded fields or raw hex */
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & IPT_LOG_MACDECODE))
+		goto fallback;
+	/* only ARPHRD_ETHER is decoded; other types take the hex dump */
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	sb_add(m, "MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int i;
+
+		sb_add(m, "%02x", *p++);	/* first byte has no ':' */
+		for (i = 1; i < dev->hard_header_len; i++, p++)
+			sb_add(m, ":%02x", *p);
+	}
+	sb_add(m, " ");	/* field separator, even if no bytes were printed */
+}
+
+static struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level    = 5,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+static void
+ipt_log_packet(u_int8_t pf,
+	       unsigned int hooknum,
+	       const struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       const struct nf_loginfo *loginfo,
+	       const char *prefix)
+{
+	struct sbuff *m = sb_open();
+
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
+	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	       prefix,
+	       in ? in->name : "",
+	       out ? out->name : "");
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge) {
+		const struct net_device *physindev;
+		const struct net_device *physoutdev;
+
+		physindev = skb->nf_bridge->physindev;
+		if (physindev && in != physindev)
+			sb_add(m, "PHYSIN=%s ", physindev->name);
+		physoutdev = skb->nf_bridge->physoutdev;
+		if (physoutdev && out != physoutdev)
+			sb_add(m, "PHYSOUT=%s ", physoutdev->name);
+	}
+#endif
+
+	if (in != NULL)
+		dump_mac_header(m, loginfo, skb);
+
+	dump_packet(m, loginfo, skb, 0);
+
+	sb_close(m);
+}
+
+static unsigned int
+log_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_log_info *loginfo = par->targinfo;
+	struct nf_loginfo li;
+
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.level = loginfo->level;
+	li.u.log.logflags = loginfo->logflags;
+
+	ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li,
+		       loginfo->prefix);
+	return XT_CONTINUE;
+}
+
+static int log_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_log_info *loginfo = par->targinfo;
+
+	if (loginfo->level >= 8) {
+		pr_debug("level %u >= 8\n", loginfo->level);
+		return -EINVAL;
+	}
+	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
+		pr_debug("prefix is not null-terminated\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target log_tg_reg __read_mostly = {
+	.name		= "LOG",
+	.family		= NFPROTO_IPV4,
+	.target		= log_tg,
+	.targetsize	= sizeof(struct ipt_log_info),
+	.checkentry	= log_tg_check,
+	.me		= THIS_MODULE,
+};
+
+static struct nf_logger ipt_log_logger __read_mostly = {
+	.name		= "ipt_LOG",
+	.logfn		= &ipt_log_packet,
+	.me		= THIS_MODULE,
+};
+
+static int __init log_tg_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&log_tg_reg);
+	if (ret < 0)
+		return ret;
+	nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
+	return 0;
+}
+
+static void __exit log_tg_exit(void)
+{
+	nf_log_unregister(&ipt_log_logger);
+	xt_unregister_target(&log_tg_reg);
+}
+
+module_init(log_tg_init);
+module_exit(log_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 00000000..9931152a
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,173 @@
+/* Masquerade.  Simple mapping which alters range to a local IP address
+   (depending on route). */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
+
+/* FIXME: Multiple targets. --RR
+ * Rule check: refuse an explicit address mapping or rangesize != 1. */
+static int masquerade_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct nf_nat_multi_range_compat *cfg = par->targinfo;
+
+	if (cfg->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+		pr_debug("bad MAP_IPS.\n");
+		return -EINVAL;
+	}
+	if (cfg->rangesize == 1)
+		return 0;
+	pr_debug("bad rangesize %u\n", cfg->rangesize);
+	return -EINVAL;
+}
+
+/* MASQUERADE target: source-NAT the connection to an address that
+ * inet_select_addr() picks for the outgoing device. */
+static unsigned int
+masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_nat_range range;
+	struct nf_conn_nat *nat;
+	struct nf_conn *ct;
+	__be32 addr;
+
+	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
+
+	ct = nf_ct_get(skb, &ctinfo);
+	nat = nfct_nat(ct);
+
+	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+			    ctinfo == IP_CT_RELATED_REPLY));
+
+	/* A 0.0.0.0 source means a locally generated packet that is
+	 * probably not supposed to be masqueraded: accept it untouched. */
+	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
+		return NF_ACCEPT;
+
+	addr = inet_select_addr(par->out, skb_rtable(skb)->rt_gateway,
+				RT_SCOPE_UNIVERSE);
+	if (!addr) {
+		pr_info("%s ate my IP address\n", par->out->name);
+		return NF_DROP;
+	}
+
+	/* device_cmp() matches on this index when the device goes down. */
+	nat->masq_index = par->out->ifindex;
+
+	/* Keep the rule's flags and min/max, pin the address to addr. */
+	range = ((struct nf_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+		  addr, addr,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+}
+
+/* Nonzero if conntrack @i has masq_index equal to the encoded @ifindex. */
+static int device_cmp(struct nf_conn *i, void *ifindex)
+{
+	const struct nf_conn_nat *nat_ext = nfct_nat(i);
+	const int wanted = (int)(long)ifindex;
+
+	if (nat_ext == NULL)
+		return 0;
+	return nat_ext->masq_index == wanted;
+}
+
+/* Notifier callback: on NETDEV_DOWN, clean up every conntrack that
+ * device_cmp() ties to the device.  Always returns NOTIFY_DONE. */
+static int masq_device_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	const struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+
+	if (event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	/* Device was downed.  Search the entire table for conntracks
+	 * which were associated with that device, and forget them. */
+	NF_CT_ASSERT(dev->ifindex != 0);
+
+	nf_ct_iterate_cleanup(net, device_cmp, (void *)(long)dev->ifindex);
+
+	return NOTIFY_DONE;
+}
+
+/* Inet address notifier: forward the event for the address's device. */
+static int masq_inet_event(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	return masq_device_event(this, event, ifa->ifa_dev->dev);
+}
+
+static struct notifier_block masq_dev_notifier = {
+	.notifier_call	= masq_device_event,	/* netdevice events */
+};
+
+static struct notifier_block masq_inet_notifier = {
+	.notifier_call	= masq_inet_event,	/* inet address events */
+};
+
+static struct xt_target masquerade_tg_reg __read_mostly = {
+	.name		= "MASQUERADE",
+	.family		= NFPROTO_IPV4,
+	.target		= masquerade_tg,	/* per-packet handler */
+	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
+	.table		= "nat",
+	.hooks		= 1 << NF_INET_POST_ROUTING, /* asserted in handler */
+	.checkentry	= masquerade_tg_check,	/* rule validation */
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the target and, on success, both notifiers. */
+static int __init masquerade_tg_init(void)
+{
+	int err = xt_register_target(&masquerade_tg_reg);
+
+	if (err != 0)
+		return err;
+
+	/* Register for device down reports */
+	register_netdevice_notifier(&masq_dev_notifier);
+	/* Register IP address change reports */
+	register_inetaddr_notifier(&masq_inet_notifier);
+
+	return 0;
+}
+
+static void __exit masquerade_tg_exit(void)
+{
+	xt_unregister_target(&masquerade_tg_reg); /* undo masquerade_tg_init() */
+	unregister_netdevice_notifier(&masq_dev_notifier);
+	unregister_inetaddr_notifier(&masq_inet_notifier);
+}
+
+module_init(masquerade_tg_init);
+module_exit(masquerade_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
new file mode 100644
index 00000000..6cdb298f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -0,0 +1,98 @@
+/* NETMAP - static NAT mapping of IP network addresses (1:1).
+ * The mapping can be applied to source (POSTROUTING),
+ * destination (PREROUTING), or both (with separate rules).
+ */
+
+/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat_rule.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
+MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
+
+/* Rule check: an address mapping is mandatory and rangesize must be 1. */
+static int netmap_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct nf_nat_multi_range_compat *cfg = par->targinfo;
+
+	if ((cfg->range[0].flags & IP_NAT_RANGE_MAP_IPS) == 0) {
+		pr_debug("bad MAP_IPS.\n");
+		return -EINVAL;
+	}
+	if (cfg->rangesize == 1)
+		return 0;
+	pr_debug("bad rangesize %u.\n", cfg->rangesize);
+	return -EINVAL;
+}
+
+/* NETMAP target: replace the network bits, keep the host bits. */
+static unsigned int
+netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_nat_range range;
+	__be32 mask, orig, mapped;
+	struct nf_conn *ct;
+
+	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+		     par->hooknum == NF_INET_POST_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_OUT ||
+		     par->hooknum == NF_INET_LOCAL_IN);
+	ct = nf_ct_get(skb, &ctinfo);
+
+	/* Bits on which min_ip and max_ip agree form the network mask. */
+	mask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
+
+	/* Start from daddr in PRE_ROUTING/LOCAL_OUT, from saddr otherwise. */
+	orig = (par->hooknum == NF_INET_PRE_ROUTING ||
+		par->hooknum == NF_INET_LOCAL_OUT) ?
+		ip_hdr(skb)->daddr : ip_hdr(skb)->saddr;
+	mapped = (orig & ~mask) | (mr->range[0].min_ip & mask);
+
+	range = ((struct nf_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, mapped, mapped,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return nf_nat_setup_info(ct, &range, HOOK2MANIP(par->hooknum));
+}
+
+static struct xt_target netmap_tg_reg __read_mostly = {
+	.name 		= "NETMAP",
+	.family		= NFPROTO_IPV4,
+	.target 	= netmap_tg,		/* per-packet handler */
+	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
+	.table		= "nat",
+	.hooks		= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.checkentry 	= netmap_tg_check,	/* rule validation */
+	.me 		= THIS_MODULE
+};
+
+static int __init netmap_tg_init(void)
+{
+	return xt_register_target(&netmap_tg_reg);	/* load status */
+}
+
+static void __exit netmap_tg_exit(void)
+{
+	xt_unregister_target(&netmap_tg_reg);	/* undo netmap_tg_init() */
+}
+
+module_init(netmap_tg_init);
+module_exit(netmap_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 00000000..18a06565
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,110 @@
+/* Redirect.  Simple mapping which alters dst to a local IP address. */
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat_rule.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
+
+/* FIXME: Take multiple ranges --RR
+ * Rule check: refuse an explicit address mapping or rangesize != 1. */
+static int redirect_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct nf_nat_multi_range_compat *cfg = par->targinfo;
+
+	if (cfg->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+		pr_debug("bad MAP_IPS.\n");
+		return -EINVAL;
+	}
+	if (cfg->rangesize == 1)
+		return 0;
+	pr_debug("bad rangesize %u.\n", cfg->rangesize);
+	return -EINVAL;
+}
+
+/* REDIRECT target: destination-NAT to 127.0.0.1 for local output, else
+ * to the head of skb->dev's address list; drop if that list is empty. */
+static unsigned int
+redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_nat_range range;
+	struct nf_conn *ct;
+	__be32 addr = 0;
+
+	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_OUT);
+
+	ct = nf_ct_get(skb, &ctinfo);
+	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+	if (par->hooknum == NF_INET_LOCAL_OUT) {
+		/* Local packets: make them go to loopback */
+		addr = htonl(0x7F000001);
+	} else {
+		struct in_device *indev;
+		struct in_ifaddr *ifa;
+
+		rcu_read_lock();
+		indev = __in_dev_get_rcu(skb->dev);
+		ifa = indev ? indev->ifa_list : NULL;
+		if (ifa)
+			addr = ifa->ifa_local;
+		rcu_read_unlock();
+
+		if (!addr)
+			return NF_DROP;
+	}
+
+	/* Transfer from original range. */
+	range = ((struct nf_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, addr, addr,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+}
+
+static struct xt_target redirect_tg_reg __read_mostly = {
+	.name		= "REDIRECT",
+	.family		= NFPROTO_IPV4,
+	.target		= redirect_tg,		/* per-packet handler */
+	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
+	.table		= "nat",
+	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
+	.checkentry	= redirect_tg_check,	/* rule validation */
+	.me		= THIS_MODULE,
+};
+
+static int __init redirect_tg_init(void)
+{
+	return xt_register_target(&redirect_tg_reg);	/* load status */
+}
+
+static void __exit redirect_tg_exit(void)
+{
+	xt_unregister_target(&redirect_tg_reg);	/* undo redirect_tg_init() */
+}
+
+module_init(redirect_tg_init);
+module_exit(redirect_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 00000000..9dd754c7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,220 @@
+/*
+ * This is a module which is used for rejecting packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include <linux/netfilter_bridge.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
+
+/* Send RST reply: build a bare TCP RST answering oldskb and transmit it. */
+static void send_reset(struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	const struct iphdr *oiph;
+	struct iphdr *niph;
+	const struct tcphdr *oth;
+	struct tcphdr _otcph, *tcph;
+
+	/* IP header checks: fragment. */
+	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+		return;
+
+	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
+				 sizeof(_otcph), &_otcph);
+	if (oth == NULL)
+		return;
+
+	/* No RST for RST. */
+	if (oth->rst)
+		return;
+
+	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return;	/* no reply on broadcast or multicast routes */
+
+	/* Check checksum */
+	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
+		return;
+	oiph = ip_hdr(oldskb);
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+			 LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		return;
+
+	skb_reserve(nskb, LL_MAX_HEADER);
+
+	skb_reset_network_header(nskb);
+	niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
+	niph->version	= 4;
+	niph->ihl	= sizeof(struct iphdr) / 4;
+	niph->tos	= 0;
+	niph->id	= 0;
+	niph->frag_off	= htons(IP_DF);
+	niph->protocol	= IPPROTO_TCP;
+	niph->check	= 0;
+	niph->saddr	= oiph->daddr;	/* addresses swapped: reply to sender */
+	niph->daddr	= oiph->saddr;
+
+	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+	memset(tcph, 0, sizeof(*tcph));
+	tcph->source	= oth->dest;	/* ports swapped as well */
+	tcph->dest	= oth->source;
+	tcph->doff	= sizeof(struct tcphdr) / 4;
+
+	if (oth->ack)
+		tcph->seq = oth->ack_seq;	/* seq := peer's ack */
+	else {
+		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
+				      oldskb->len - ip_hdrlen(oldskb) -
+				      (oth->doff << 2));
+		tcph->ack = 1;	/* ack seq + SYN/FIN + payload length */
+	}
+
+	tcph->rst	= 1;
+	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+				    niph->daddr, 0);
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+	nskb->csum_start = (unsigned char *)tcph - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+
+	/* ip_route_me_harder expects skb->dst to be set */
+	skb_dst_set_noref(nskb, skb_dst(oldskb));
+
+	nskb->protocol = htons(ETH_P_IP);
+	if (ip_route_me_harder(nskb, RTN_UNSPEC))
+		goto free_nskb;
+
+	niph->ttl	= ip4_dst_hoplimit(skb_dst(nskb));
+
+	/* "Never happens" */
+	if (nskb->len > dst_mtu(skb_dst(nskb)))
+		goto free_nskb;
+
+	nf_ct_attach(nskb, oldskb);
+
+	ip_local_out(nskb);
+	return;
+
+ free_nskb:
+	kfree_skb(nskb);	/* reached only before ip_local_out() */
+}
+
+static inline void send_unreach(struct sk_buff *skb_in, int code)
+{
+	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+#ifdef CONFIG_IP_NF_TARGET_REJECT_SKERR
+	if (skb_in->sk) {	/* skb has an owning socket: flag the error on it */
+		skb_in->sk->sk_err = icmp_err_convert[code].errno;
+		skb_in->sk->sk_error_report(skb_in->sk);
+		pr_debug("ipt_REJECT: sk_err=%d for skb=%p sk=%p\n",
+			skb_in->sk->sk_err, skb_in, skb_in->sk);
+	}
+#endif
+}
+
+static unsigned int
+reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_reject_info *reject = par->targinfo;
+
+	switch (reject->with) {	/* pick the reply configured in the rule */
+	case IPT_ICMP_NET_UNREACHABLE:
+		send_unreach(skb, ICMP_NET_UNREACH);
+		break;
+	case IPT_ICMP_HOST_UNREACHABLE:
+		send_unreach(skb, ICMP_HOST_UNREACH);
+		break;
+	case IPT_ICMP_PROT_UNREACHABLE:
+		send_unreach(skb, ICMP_PROT_UNREACH);
+		break;
+	case IPT_ICMP_PORT_UNREACHABLE:
+		send_unreach(skb, ICMP_PORT_UNREACH);
+		break;
+	case IPT_ICMP_NET_PROHIBITED:
+		send_unreach(skb, ICMP_NET_ANO);
+		break;
+	case IPT_ICMP_HOST_PROHIBITED:
+		send_unreach(skb, ICMP_HOST_ANO);
+		break;
+	case IPT_ICMP_ADMIN_PROHIBITED:
+		send_unreach(skb, ICMP_PKT_FILTERED);
+		break;
+	case IPT_TCP_RESET:
+		send_reset(skb, par->hooknum);	/* falls through to the break */
+	case IPT_ICMP_ECHOREPLY:
+		/* Doesn't happen. */
+		break;
+	}
+
+	return NF_DROP;	/* the packet is dropped whichever reply was sent */
+}
+
+/* Rule check: no ECHOREPLY; TCP_RESET needs a non-inverted TCP match. */
+static int reject_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_reject_info *info = par->targinfo;
+	const struct ipt_entry *entry = par->entryinfo;
+
+	if (info->with == IPT_ICMP_ECHOREPLY) {
+		pr_info("ECHOREPLY no longer supported.\n");
+		return -EINVAL;
+	}
+	if (info->with != IPT_TCP_RESET)
+		return 0;
+	if (entry->ip.proto == IPPROTO_TCP &&
+	    !(entry->ip.invflags & XT_INV_PROTO))
+		return 0;
+	pr_info("TCP_RESET invalid for non-tcp\n");
+	return -EINVAL;
+}
+
+static struct xt_target reject_tg_reg __read_mostly = {
+	.name		= "REJECT",
+	.family		= NFPROTO_IPV4,
+	.target		= reject_tg,		/* per-packet handler */
+	.targetsize	= sizeof(struct ipt_reject_info),
+	.table		= "filter",
+	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_LOCAL_OUT),
+	.checkentry	= reject_tg_check,	/* rule validation */
+	.me		= THIS_MODULE,
+};
+
+static int __init reject_tg_init(void)
+{
+	return xt_register_target(&reject_tg_reg);	/* load status */
+}
+
+static void __exit reject_tg_exit(void)
+{
+	xt_unregister_target(&reject_tg_reg);	/* undo reject_tg_init() */
+}
+
+module_init(reject_tg_init);
+module_exit(reject_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
new file mode 100644
index 00000000..446e0f46
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -0,0 +1,442 @@
+/*
+ * netfilter module for userspace packet logging daemons
+ *
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This module accepts two parameters:
+ *
+ * nlbufsiz:
+ *   The parameter specifies how big the buffer for each netlink multicast
+ * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
+ * get accumulated in the kernel until they are sent to userspace. It is
+ * NOT possible to allocate more than 128kB, and it is strongly discouraged,
+ * because atomically allocating 128kB inside the network rx softirq is not
+ * reliable. Please also keep in mind that this buffer size is allocated for
+ * each nlgroup you are using, so the total kernel memory usage increases
+ * by that factor.
+ *
+ * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
+ * nlbufsiz is used with alloc_skb, which adds another
+ * sizeof(struct skb_shared_info).  Use NLMSG_GOODSIZE instead.
+ *
+ * flushtimeout:
+ *   Specify, after how many hundredths of a second the queue should be
+ *   flushed even if it is not full yet.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/netlink.h>
+#include <linux/netdevice.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#include <net/netfilter/nf_log.h>
+#include <net/sock.h>
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
+
+#define ULOG_NL_EVENT		111		/* Harald's favorite number */
+#define ULOG_MAXNLGROUPS	32		/* number of nlgroups */
+
+static unsigned int nlbufsiz = NLMSG_GOODSIZE;
+module_param(nlbufsiz, uint, 0400);
+MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
+
+static unsigned int flushtimeout = 10;
+module_param(flushtimeout, uint, 0600);
+MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
+
+static int nflog = 1;
+module_param(nflog, bool, 0400);
+MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
+
+/* global data structures */
+
+typedef struct {
+	unsigned int qlen;		/* number of nlmsgs queued in the skb */
+	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
+	struct sk_buff *skb;		/* pending skb, NULL if nothing queued */
+	struct timer_list timer;	/* flush timer, runs ulog_timer() */
+} ulog_buff_t;
+
+static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];	/* array of buffers */
+
+static struct sock *nflognl;		/* our socket */
+static DEFINE_SPINLOCK(ulog_lock);	/* spinlock */
+
+/* Broadcast one group's queued skb to userspace and reset its buffer. */
+static void ulog_send(unsigned int nlgroupnum)
+{
+	ulog_buff_t *buf = &ulog_buffers[nlgroupnum];
+	const unsigned int group = nlgroupnum + 1;
+
+	if (timer_pending(&buf->timer)) {
+		pr_debug("ulog_send: timer was pending, deleting\n");
+		del_timer(&buf->timer);
+	}
+
+	if (buf->skb == NULL) {
+		pr_debug("ulog_send: nothing to send\n");
+		return;
+	}
+
+	/* last nlmsg needs NLMSG_DONE */
+	if (buf->qlen > 1)
+		buf->lastnlh->nlmsg_type = NLMSG_DONE;
+
+	NETLINK_CB(buf->skb).dst_group = group;
+	pr_debug("throwing %d packets to netlink group %u\n", buf->qlen, group);
+	netlink_broadcast(nflognl, buf->skb, 0, group, GFP_ATOMIC);
+
+	buf->qlen = 0;
+	buf->skb = NULL;
+	buf->lastnlh = NULL;
+}
+
+
+/* timer callback: flush the queue of group number 'data' to userspace */
+static void ulog_timer(unsigned long data)
+{
+	pr_debug("timer function called, calling ulog_send\n");
+
+	/* lock to protect against ipt_ulog_packet() modifying the
+	 * same buffer at the same time */
+	spin_lock_bh(&ulog_lock);
+	ulog_send(data);
+	spin_unlock_bh(&ulog_lock);
+}
+
+/* Allocate the netlink skb for one group.  First try max(size, nlbufsiz);
+ * if that fails and was more than @size, retry with just @size.
+ * Returns NULL when no skb could be allocated. */
+static struct sk_buff *ulog_alloc_skb(unsigned int size)
+{
+	struct sk_buff *nskb;
+	unsigned int want;
+
+	/* The buffer should be big enough for a whole multipart message.
+	 * WARNING: has to be <= 131000 due to slab allocator restrictions */
+	want = max(size, nlbufsiz);
+	nskb = alloc_skb(want, GFP_ATOMIC);
+	if (nskb)
+		return nskb;
+
+	pr_debug("cannot alloc whole buffer %ub!\n", want);
+	if (want <= size)
+		return NULL;
+
+	/* try to allocate only as much as we need for current packet */
+	nskb = alloc_skb(size, GFP_ATOMIC);
+	if (!nskb)
+		pr_debug("cannot even allocate %ub\n", size);
+
+	return nskb;
+}
+
+/* Append one packet to the pending netlink message of its group and flush
+ * the group once qthreshold is reached.  A non-NULL @prefix overrides
+ * loginfo->prefix.  Nothing is queued if allocation fails. */
+static void ipt_ulog_packet(unsigned int hooknum, const struct sk_buff *skb,
+			    const struct net_device *in,
+			    const struct net_device *out,
+			    const struct ipt_ulog_info *loginfo,
+			    const char *prefix)
+{
+	ulog_buff_t *ub;
+	ulog_packet_msg_t *pm;
+	size_t size, copy_len;
+	struct nlmsghdr *nlh;
+	struct timeval tv;
+
+	/* ffs == find first bit set, necessary because userspace
+	 * is already shifting groupnumber, but we need unshifted.
+	 * ffs() returns [1..32], we need [0..31] */
+	unsigned int groupnum = ffs(loginfo->nl_group) - 1;
+
+	/* ffs(0) is 0: without this an empty nl_group indexes out of bounds */
+	if (groupnum >= ULOG_MAXNLGROUPS)
+		return;
+
+	/* calculate the size of the skb needed */
+	if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len)
+		copy_len = skb->len;
+	else
+		copy_len = loginfo->copy_range;
+
+	size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+
+	ub = &ulog_buffers[groupnum];
+
+	spin_lock_bh(&ulog_lock);
+
+	if (!ub->skb) {
+		if (!(ub->skb = ulog_alloc_skb(size)))
+			goto alloc_failure;
+	} else if (ub->qlen >= loginfo->qthreshold ||
+		   size > skb_tailroom(ub->skb)) {
+		/* either the queue len is too high or we don't have
+		 * enough room in nlskb left. send it to userspace. */
+		ulog_send(groupnum);
+
+		if (!(ub->skb = ulog_alloc_skb(size)))
+			goto alloc_failure;
+	}
+
+	pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
+
+	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
+	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+			sizeof(*pm)+copy_len);
+	ub->qlen++;
+
+	pm = NLMSG_DATA(nlh);
+	/* zero it all: padding and string tails must not leak heap bytes */
+	memset(pm, 0, sizeof(*pm));
+
+	/* We might not have a timestamp, get one */
+	if (skb->tstamp.tv64 == 0)
+		__net_timestamp((struct sk_buff *)skb);
+
+	/* copy hook, prefix, timestamp, payload, etc. */
+	pm->data_len = copy_len;
+	tv = ktime_to_timeval(skb->tstamp);
+	put_unaligned(tv.tv_sec, &pm->timestamp_sec);
+	put_unaligned(tv.tv_usec, &pm->timestamp_usec);
+	put_unaligned(skb->mark, &pm->mark);
+	pm->hook = hooknum;
+	if (prefix != NULL)
+		strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
+	else if (loginfo->prefix[0] != '\0')
+		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix) - 1);
+
+	if (in && in->hard_header_len > 0 &&
+	    skb->mac_header != skb->network_header &&
+	    in->hard_header_len <= ULOG_MAC_LEN) {
+		memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
+		pm->mac_len = in->hard_header_len;
+	}
+
+	if (in)
+		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+
+	if (out)
+		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+
+	/* copy_len <= skb->len, so can't fail. */
+	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
+		BUG();
+
+	/* check if we are building multi-part messages */
+	if (ub->qlen > 1)
+		ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
+
+	ub->lastnlh = nlh;
+
+	/* if timer isn't already running, start it */
+	if (!timer_pending(&ub->timer)) {
+		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
+		add_timer(&ub->timer);
+	}
+
+	/* if threshold is reached, send message to userspace */
+	if (ub->qlen >= loginfo->qthreshold) {
+		if (loginfo->qthreshold > 1)
+			nlh->nlmsg_type = NLMSG_DONE;
+		ulog_send(groupnum);
+	}
+
+	spin_unlock_bh(&ulog_lock);
+
+	return;
+
+nlmsg_failure:
+	pr_debug("error during NLMSG_PUT\n");
+alloc_failure:
+	pr_debug("Error building netlink message\n");
+	spin_unlock_bh(&ulog_lock);
+}
+
+static unsigned int
+ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
+	                par->targinfo, NULL);	/* NULL: use the rule's prefix */
+	return XT_CONTINUE;
+}
+
+/* nf_logger backend: build ULOG settings from @li, or use the defaults
+ * when @li is missing or not of ULOG type, and queue the packet. */
+static void ipt_logfn(u_int8_t pf, unsigned int hooknum,
+		      const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct nf_loginfo *li, const char *prefix)
+{
+	struct ipt_ulog_info cfg;
+
+	if (li && li->type == NF_LOG_TYPE_ULOG) {
+		cfg.nl_group = li->u.ulog.group;
+		cfg.copy_range = li->u.ulog.copy_len;
+		cfg.qthreshold = li->u.ulog.qthreshold;
+		strlcpy(cfg.prefix, prefix, sizeof(cfg.prefix));
+	} else {
+		cfg.nl_group = ULOG_DEFAULT_NLGROUP;
+		cfg.copy_range = 0;
+		cfg.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
+		cfg.prefix[0] = '\0';
+	}
+
+	ipt_ulog_packet(hooknum, skb, in, out, &cfg, prefix);
+}
+
+/* Rule check: prefix must be NUL-terminated, qthreshold <= ULOG_MAX_QLEN. */
+static int ulog_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_ulog_info *info = par->targinfo;
+
+	if (info->prefix[sizeof(info->prefix) - 1] != '\0') {
+		pr_debug("prefix not null-terminated\n");
+		return -EINVAL;
+	}
+	if (info->qthreshold <= ULOG_MAX_QLEN)
+		return 0;
+
+	pr_debug("queue threshold %Zu > MAX_QLEN\n", info->qthreshold);
+	return -EINVAL;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_ipt_ulog_info {	/* compat layout of struct ipt_ulog_info */
+	compat_uint_t	nl_group;
+	compat_size_t	copy_range;
+	compat_size_t	qthreshold;
+	char		prefix[ULOG_PREFIX_LEN];
+};
+
+static void ulog_tg_compat_from_user(void *dst, const void *src)
+{
+	const struct compat_ipt_ulog_info *cl = src;
+	struct ipt_ulog_info l = {	/* native copy of the compat fields */
+		.nl_group	= cl->nl_group,
+		.copy_range	= cl->copy_range,
+		.qthreshold	= cl->qthreshold,
+	};
+
+	memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
+	memcpy(dst, &l, sizeof(l));	/* dst receives the native layout */
+}
+
+static int ulog_tg_compat_to_user(void __user *dst, const void *src)
+{
+	const struct ipt_ulog_info *l = src;
+	struct compat_ipt_ulog_info cl = {	/* compat copy of the native fields */
+		.nl_group	= l->nl_group,
+		.copy_range	= l->copy_range,
+		.qthreshold	= l->qthreshold,
+	};
+
+	memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
+	return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target ulog_tg_reg __read_mostly = {
+	.name		= "ULOG",
+	.family		= NFPROTO_IPV4,
+	.target		= ulog_tg,		/* per-packet handler */
+	.targetsize	= sizeof(struct ipt_ulog_info),
+	.checkentry	= ulog_tg_check,	/* rule validation */
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct compat_ipt_ulog_info),
+	.compat_from_user = ulog_tg_compat_from_user,
+	.compat_to_user	= ulog_tg_compat_to_user,
+#endif
+	.me		= THIS_MODULE,
+};
+
+static struct nf_logger ipt_ulog_logger __read_mostly = {
+	.name		= "ipt_ULOG",
+	.logfn		= ipt_logfn,	/* registered only if nflog is set */
+	.me		= THIS_MODULE,
+};
+
+/* Module load: check nlbufsiz, set up timers, socket, target and logger. */
+static int __init ulog_tg_init(void)
+{
+	int err, grp;
+
+	pr_debug("init module\n");
+	if (nlbufsiz > 128*1024) {
+		pr_warning("Netlink buffer has to be <= 128kB\n");
+		return -EINVAL;
+	}
+
+	/* initialize ulog_buffers */
+	for (grp = 0; grp < ULOG_MAXNLGROUPS; grp++)
+		setup_timer(&ulog_buffers[grp].timer, ulog_timer, grp);
+
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG,
+					ULOG_MAXNLGROUPS, NULL, NULL,
+					THIS_MODULE);
+	if (nflognl == NULL)
+		return -ENOMEM;
+
+	err = xt_register_target(&ulog_tg_reg);
+	if (err < 0) {
+		netlink_kernel_release(nflognl);
+		return err;
+	}
+	if (nflog)
+		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
+
+	return 0;
+}
+
+/* Module unload: unregister everything, then drop per-group state. */
+static void __exit ulog_tg_exit(void)
+{
+	int grp;
+
+	pr_debug("cleanup_module\n");
+
+	if (nflog)
+		nf_log_unregister(&ipt_ulog_logger);
+	xt_unregister_target(&ulog_tg_reg);
+	netlink_kernel_release(nflognl);
+
+	/* remove pending timers and free allocated skb's */
+	for (grp = 0; grp < ULOG_MAXNLGROUPS; grp++) {
+		ulog_buff_t *buf = &ulog_buffers[grp];
+
+		if (timer_pending(&buf->timer)) {
+			pr_debug("timer was pending, deleting\n");
+			del_timer(&buf->timer);
+		}
+		if (buf->skb == NULL)
+			continue;
+		kfree_skb(buf->skb);
+		buf->skb = NULL;
+	}
+}
+
+module_init(ulog_tg_init);
+module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 00000000..14a2aa8b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,91 @@
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
+
+/* True if spi lies in [min, max]; the result is flipped when invert is set */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+	const bool verdict = (min <= spi && spi <= max) != invert;
+
+	pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+		 invert ? '!' : ' ', min, spi, max);
+	pr_debug(" result %s\n", verdict ? "PASS" : "FAILED");
+	return verdict;
+}
+
+/* Match on the AH SPI: true when the packet's SPI passes spi_match()
+ * against the rule's range.  Fragments never match. */
+static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ipt_ah *rule = par->matchinfo;
+	const struct ip_auth_hdr *hdr;
+	struct ip_auth_hdr hdr_buf;
+
+	/* Must not be a fragment. */
+	if (par->fragoff != 0)
+		return false;
+
+	hdr = skb_header_pointer(skb, par->thoff, sizeof(hdr_buf), &hdr_buf);
+	if (hdr == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop. */
+		pr_debug("Dropping evil AH tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	return spi_match(rule->spis[0], rule->spis[1], ntohl(hdr->spi),
+			 !!(rule->invflags & IPT_AH_INV_SPI));
+}
+
+/* Rule check: only invflags bits within IPT_AH_INV_MASK are accepted. */
+static int ah_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ipt_ah *rule = par->matchinfo;
+
+	if ((rule->invflags & ~IPT_AH_INV_MASK) == 0)
+		return 0;
+
+	pr_debug("unknown flags %X\n", rule->invflags);
+	return -EINVAL;
+}
+
+static struct xt_match ah_mt_reg __read_mostly = {
+	.name		= "ah",
+	.family		= NFPROTO_IPV4,
+	.match		= ah_mt,		/* per-packet match function */
+	.matchsize	= sizeof(struct ipt_ah),
+	.proto		= IPPROTO_AH,
+	.checkentry	= ah_mt_check,		/* rule validation */
+	.me		= THIS_MODULE,
+};
+
+static int __init ah_mt_init(void)
+{
+	return xt_register_match(&ah_mt_reg);	/* load status */
+}
+
+static void __exit ah_mt_exit(void)
+{
+	xt_unregister_match(&ah_mt_reg);	/* undo ah_mt_init() */
+}
+
+module_init(ah_mt_init);
+module_exit(ah_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
new file mode 100644
index 00000000..2b57e52c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -0,0 +1,127 @@
+/* IP tables module for matching the value of the IPv4 and TCP ECN bits
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ecn.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4");
+MODULE_LICENSE("GPL");
+
+static inline bool match_ip(const struct sk_buff *skb,
+			    const struct ipt_ecn_info *einfo)
+{
+	const bool same = (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect;
+	return same != !!(einfo->invert & IPT_ECN_OP_MATCH_IP); /* invert flips */
+}
+
+static inline bool match_tcp(const struct sk_buff *skb,
+			     const struct ipt_ecn_info *einfo,
+			     bool *hotdrop)
+{
+	struct tcphdr hdr_copy;
+	const struct tcphdr *tcph;
+	bool want_set;
+
+	/*
+	 * Read the TCP header located right after the IP header.  If it
+	 * cannot be read the packet simply does not match; *hotdrop is
+	 * written as false on that path.
+	 */
+	tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(hdr_copy),
+				  &hdr_copy);
+	if (!tcph) {
+		*hotdrop = false;
+		return false;
+	}
+
+	/*
+	 * For each requested flag the bit must be set, or clear when
+	 * the same bit is present in einfo->invert.
+	 */
+	if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
+		want_set = !(einfo->invert & IPT_ECN_OP_MATCH_ECE);
+		if (tcph->ece != want_set)
+			return false;
+	}
+
+	if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
+		want_set = !(einfo->invert & IPT_ECN_OP_MATCH_CWR);
+		if (tcph->cwr != want_set)
+			return false;
+	}
+
+	return true;
+}
+
+static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ipt_ecn_info *cfg = par->matchinfo;
+	const bool want_ip = cfg->operation & IPT_ECN_OP_MATCH_IP;
+	const bool want_tcp = cfg->operation &
+			      (IPT_ECN_OP_MATCH_ECE | IPT_ECN_OP_MATCH_CWR);
+
+	/* IP-level ECN test first; a miss ends evaluation right away. */
+	if (want_ip && !match_ip(skb, cfg))
+		return false;
+
+	/* The TCP flag tests run only when ECE or CWR was requested. */
+	return !want_tcp ||
+	       match_tcp(skb, cfg, &par->hotdrop);
+}
+
+static int ecn_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct ipt_ecn_info *cfg = par->matchinfo;
+	const struct ipt_ip *l3 = par->entryinfo;
+	const bool plain_tcp = l3->proto == IPPROTO_TCP &&
+			       !(l3->invflags & IPT_INV_PROTO);
+
+	/* No bit covered by IPT_ECN_OP_MATCH_MASK may appear in either field. */
+	if ((cfg->operation | cfg->invert) & IPT_ECN_OP_MATCH_MASK)
+		return -EINVAL;
+
+	/* ECE/CWR tests are only accepted on non-inverted TCP rules. */
+	if (!(cfg->operation & (IPT_ECN_OP_MATCH_ECE | IPT_ECN_OP_MATCH_CWR)))
+		return 0;
+	if (plain_tcp)
+		return 0;
+	pr_info("cannot match TCP bits in rule for non-tcp packets\n");
+	return -EINVAL;
+}
+
+static struct xt_match ecn_mt_reg __read_mostly = {
+	.name		= "ecn",
+	.family		= NFPROTO_IPV4,
+	.match		= ecn_mt,	/* IP ECN field and/or TCP ECE/CWR tests */
+	.matchsize	= sizeof(struct ipt_ecn_info),
+	.checkentry	= ecn_mt_check,	/* also enforces TCP-only for ECE/CWR */
+	.me		= THIS_MODULE,
+};
+
+static int __init ecn_mt_init(void)
+{
+	return xt_register_match(&ecn_mt_reg);	/* result is the init status */
+}
+
+static void __exit ecn_mt_exit(void)
+{
+	xt_unregister_match(&ecn_mt_reg);	/* undoes ecn_mt_init() */
+}
+
+module_init(ecn_mt_init);
+module_exit(ecn_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 00000000..c37641e8
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,121 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+			    (1 << NF_INET_FORWARD) | \
+			    (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,	/* bitmask, one bit per hook */
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+	.priority	= NF_IP_PRI_FILTER,
+};
+
+static unsigned int
+iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+		    const struct net_device *in, const struct net_device *out,
+		    int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = in ? in : out;
+
+	/* Too-short locally generated (raw socket) packets bypass the table. */
+	if (hook == NF_INET_LOCAL_OUT)
+		if (skb->len < sizeof(struct iphdr) ||
+		    ip_hdrlen(skb) < sizeof(struct iphdr))
+			return NF_ACCEPT;
+
+	return ipt_do_table(skb, hook, in, out,
+			    dev_net(dev)->ipv4.iptable_filter);
+}
+
+static struct nf_hook_ops *filter_ops __read_mostly;	/* set by xt_hook_link() */
+
+/* Module parameter: verdict used as the initial FORWARD policy (accept). */
+static int forward = NF_ACCEPT;
+module_param(forward, bool, 0000);
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+	struct ipt_replace *tmpl = ipt_alloc_initial_table(&packet_filter);
+	struct ipt_standard *std;
+
+	if (!tmpl)
+		return -ENOMEM;
+
+	/* Entry 1 is the FORWARD hook: store the policy as -verdict - 1. */
+	std = (struct ipt_standard *)tmpl->entries;
+	std[1].target.verdict = -forward - 1;
+
+	net->ipv4.iptable_filter = ipt_register_table(net, &packet_filter,
+						      tmpl);
+	kfree(tmpl);
+	return IS_ERR(net->ipv4.iptable_filter) ?
+	       PTR_ERR(net->ipv4.iptable_filter) : 0;
+}
+
+static void __net_exit iptable_filter_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.iptable_filter);
+}
+
+static struct pernet_operations iptable_filter_net_ops = {
+	.init = iptable_filter_net_init,	/* registers the namespace's table */
+	.exit = iptable_filter_net_exit,	/* unregisters it again */
+};
+
+static int __init iptable_filter_init(void)
+{
+	int err;
+
+	/* Reject a "forward" value outside the verdict range up front. */
+	if (forward < 0 || forward > NF_MAX_VERDICT) {
+		pr_err("iptables forward must be 0 or 1\n");
+		return -EINVAL;
+	}
+
+	/* Per-namespace tables first ... */
+	err = register_pernet_subsys(&iptable_filter_net_ops);
+	if (err < 0)
+		return err;
+
+	/* ... then link iptable_filter_hook to the table's hooks. */
+	filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
+	if (!IS_ERR(filter_ops))
+		return err;
+
+	/* Hook setup failed: undo the pernet registration and hand the
+	 * error from xt_hook_link() back to the caller. */
+	err = PTR_ERR(filter_ops);
+	unregister_pernet_subsys(&iptable_filter_net_ops);
+	return err;
+}
+
+static void __exit iptable_filter_fini(void)
+{
+	xt_hook_unlink(&packet_filter, filter_ops);	/* hooks go first ... */
+	unregister_pernet_subsys(&iptable_filter_net_ops); /* ... then tables */
+}
+
+module_init(iptable_filter_init);
+module_exit(iptable_filter_fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 00000000..aef5d1fb
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,151 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+			    (1 << NF_INET_LOCAL_IN) | \
+			    (1 << NF_INET_FORWARD) | \
+			    (1 << NF_INET_LOCAL_OUT) | \
+			    (1 << NF_INET_POST_ROUTING))
+
+static const struct xt_table packet_mangler = {
+	.name		= "mangle",
+	.valid_hooks	= MANGLE_VALID_HOOKS,	/* bitmask, one bit per hook */
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+	.priority	= NF_IP_PRI_MANGLE,
+};
+
+static unsigned int
+ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
+{
+	const struct iphdr *hdr;
+	unsigned int verdict;
+	__be32 old_src, old_dst;
+	u_int32_t old_mark;
+	u_int8_t old_tos;
+
+	/* Raw-socket packets too short for an IP header are let through. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	/* Snapshot every field whose modification could change the route. */
+	hdr = ip_hdr(skb);
+	old_src = hdr->saddr;
+	old_dst = hdr->daddr;
+	old_tos = hdr->tos;
+	old_mark = skb->mark;
+
+	verdict = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
+			       dev_net(out)->ipv4.iptable_mangle);
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+
+	/* Re-read the header after the table ran; reroute on any change. */
+	hdr = ip_hdr(skb);
+	if (hdr->saddr == old_src && hdr->daddr == old_dst &&
+	    skb->mark == old_mark && hdr->tos == old_tos)
+		return verdict;
+
+	if (ip_route_me_harder(skb, RTN_UNSPEC))
+		return NF_DROP;
+
+	return verdict;
+}
+
+/* Netfilter entry point for the mangle table. */
+static unsigned int
+iptable_mangle_hook(unsigned int hook,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = in;	/* PREROUTING/INPUT/FORWARD */
+
+	if (hook == NF_INET_LOCAL_OUT)
+		return ipt_mangle_out(skb, out);	/* may also reroute */
+	if (hook == NF_INET_POST_ROUTING)
+		dev = out;
+	return ipt_do_table(skb, hook, in, out,
+			    dev_net(dev)->ipv4.iptable_mangle);
+}
+
+static struct nf_hook_ops *mangle_ops __read_mostly;
+
+static int __net_init iptable_mangle_net_init(struct net *net)
+{
+	struct ipt_replace *tmpl = ipt_alloc_initial_table(&packet_mangler);
+
+	if (!tmpl)
+		return -ENOMEM;
+
+	/* The template is freed as soon as the registration call returns. */
+	net->ipv4.iptable_mangle = ipt_register_table(net, &packet_mangler,
+						      tmpl);
+	kfree(tmpl);
+	return IS_ERR(net->ipv4.iptable_mangle) ?
+	       PTR_ERR(net->ipv4.iptable_mangle) : 0;
+}
+
+static void __net_exit iptable_mangle_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.iptable_mangle);
+}
+
+static struct pernet_operations iptable_mangle_net_ops = {
+	.init = iptable_mangle_net_init,	/* registers the namespace's table */
+	.exit = iptable_mangle_net_exit,	/* unregisters it again */
+};
+
+static int __init iptable_mangle_init(void)
+{
+	int err;
+
+	/* Step 1: one mangle table per network namespace. */
+	err = register_pernet_subsys(&iptable_mangle_net_ops);
+	if (err < 0)
+		return err;
+
+	/* Step 2: link iptable_mangle_hook to the table's hooks. */
+	mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
+	if (!IS_ERR(mangle_ops))
+		return err;
+
+	/*
+	 * Linking failed: roll back step 1 and return xt_hook_link()'s error.
+	 */
+	err = PTR_ERR(mangle_ops);
+	unregister_pernet_subsys(&iptable_mangle_net_ops);
+	return err;
+}
+
+static void __exit iptable_mangle_fini(void)
+{
+	xt_hook_unlink(&packet_mangler, mangle_ops);	/* hooks go first ... */
+	unregister_pernet_subsys(&iptable_mangle_net_ops); /* ... then tables */
+}
+
+module_init(iptable_mangle_init);
+module_exit(iptable_mangle_fini);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 00000000..07fb710c
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,96 @@
+/*
+ * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+
+static const struct xt_table packet_raw = {
+	.name = "raw",
+	.valid_hooks =  RAW_VALID_HOOKS,	/* bitmask, one bit per hook */
+	.me = THIS_MODULE,
+	.af = NFPROTO_IPV4,
+	.priority = NF_IP_PRI_RAW,
+};
+
+/* Netfilter entry point for the raw table. */
+static unsigned int
+iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
+		 const struct net_device *in, const struct net_device *out,
+		 int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = (in != NULL) ? in : out;
+	const bool local_out = (hook == NF_INET_LOCAL_OUT);
+
+	/* root is playing with raw sockets: let header-less packets pass. */
+	if (local_out && (skb->len < sizeof(struct iphdr) ||
+			  ip_hdrlen(skb) < sizeof(struct iphdr)))
+		return NF_ACCEPT;
+
+	return ipt_do_table(skb, hook, in, out,
+			    dev_net(dev)->ipv4.iptable_raw);
+}
+
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+static int __net_init iptable_raw_net_init(struct net *net)
+{
+	struct ipt_replace *initial;
+	int err = 0;
+
+	initial = ipt_alloc_initial_table(&packet_raw);
+	if (!initial)
+		return -ENOMEM;
+	net->ipv4.iptable_raw = ipt_register_table(net, &packet_raw, initial);
+	if (IS_ERR(net->ipv4.iptable_raw))
+		err = PTR_ERR(net->ipv4.iptable_raw);
+	kfree(initial);	/* freed whether or not registration succeeded */
+	return err;
+}
+
+static void __net_exit iptable_raw_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.iptable_raw);
+}
+
+static struct pernet_operations iptable_raw_net_ops = {
+	.init = iptable_raw_net_init,	/* registers the namespace's table */
+	.exit = iptable_raw_net_exit,	/* unregisters it again */
+};
+
+static int __init iptable_raw_init(void)
+{
+	int rc = register_pernet_subsys(&iptable_raw_net_ops);
+
+	if (rc < 0)
+		return rc;
+
+	/* Pernet registration done; now link iptable_raw_hook. */
+	rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
+	if (IS_ERR(rawtable_ops))
+		goto unwind;
+
+	return rc;
+
+ unwind:
+	/* xt_hook_link() failed: pass its error on once the pernet
+	 * registration has been undone again. */
+	rc = PTR_ERR(rawtable_ops);
+	unregister_pernet_subsys(&iptable_raw_net_ops);
+	return rc;
+}
+
+static void __exit iptable_raw_fini(void)
+{
+	xt_hook_unlink(&packet_raw, rawtable_ops);	/* hooks go first ... */
+	unregister_pernet_subsys(&iptable_raw_net_ops);	/* ... then tables */
+}
+
+module_init(iptable_raw_init);
+module_exit(iptable_raw_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
new file mode 100644
index 00000000..be45bdc4
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -0,0 +1,113 @@
+/*
+ * "security" table
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("iptables security table, for MAC rules");
+
+#define SECURITY_VALID_HOOKS	((1 << NF_INET_LOCAL_IN) | \
+				 (1 << NF_INET_FORWARD) | \
+				 (1 << NF_INET_LOCAL_OUT))	/* hook bitmask */
+
+static const struct xt_table security_table = {
+	.name		= "security",
+	.valid_hooks	= SECURITY_VALID_HOOKS,	/* bitmask, one bit per hook */
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+	.priority	= NF_IP_PRI_SECURITY,
+};
+
+static unsigned int
+iptable_security_hook(unsigned int hook, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = in ? in : out;
+
+	if (hook == NF_INET_LOCAL_OUT) {
+		/* Somebody is playing with raw sockets. */
+		if (skb->len < sizeof(struct iphdr) ||
+		    ip_hdrlen(skb) < sizeof(struct iphdr))
+			return NF_ACCEPT;
+	}
+	return ipt_do_table(skb, hook, in, out,
+			    dev_net(dev)->ipv4.iptable_security);
+}
+
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+static int __net_init iptable_security_net_init(struct net *net)
+{
+	struct ipt_replace *tmpl = ipt_alloc_initial_table(&security_table);
+
+	if (tmpl == NULL)
+		return -ENOMEM;
+
+	/* Register this namespace's table, then drop the template. */
+	net->ipv4.iptable_security =
+		ipt_register_table(net, &security_table, tmpl);
+	kfree(tmpl);
+	if (!IS_ERR(net->ipv4.iptable_security))
+		return 0;
+	return PTR_ERR(net->ipv4.iptable_security);
+}
+
+static void __net_exit iptable_security_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.iptable_security);
+}
+
+static struct pernet_operations iptable_security_net_ops = {
+	.init = iptable_security_net_init,	/* registers the namespace's table */
+	.exit = iptable_security_net_exit,	/* unregisters it again */
+};
+
+static int __init iptable_security_init(void)
+{
+	int err;
+
+	/* The per-namespace tables are registered first. */
+	err = register_pernet_subsys(&iptable_security_net_ops);
+	if (err < 0)
+		return err;
+
+	/* Then iptable_security_hook is linked to the table's hooks. */
+	sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
+	if (IS_ERR(sectbl_ops)) {
+		/* Linking failed: keep its error code and undo the
+		 * pernet registration made above. */
+		err = PTR_ERR(sectbl_ops);
+		unregister_pernet_subsys(&iptable_security_net_ops);
+	}
+
+	return err;
+}
+
+static void __exit iptable_security_fini(void)
+{
+	xt_hook_unlink(&security_table, sectbl_ops);	/* hooks go first ... */
+	unregister_pernet_subsys(&iptable_security_net_ops); /* ... then tables */
+}
+
+module_init(iptable_security_init);
+module_exit(iptable_security_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
new file mode 100644
index 00000000..de9da211
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -0,0 +1,459 @@
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/nf_log.h>
+
+int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
+			      struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo);	/* read via RCU in ipv4_confirm() */
+EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);	/* presumably installed by the NAT code - TODO confirm */
+
+static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+			      struct nf_conntrack_tuple *tuple)
+{
+	const unsigned int addr_off = nhoff + offsetof(struct iphdr, saddr);
+	__be32 scratch[2];
+	const __be32 *pair;
+
+	/* One 8-byte read starting at saddr yields source, then destination. */
+	pair = skb_header_pointer(skb, addr_off, sizeof(scratch), scratch);
+	if (!pair)
+		return false;
+	tuple->src.u3.ip = pair[0];
+	tuple->dst.u3.ip = pair[1];
+	return true;
+}
+
+static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
+			      const struct nf_conntrack_tuple *orig)
+{
+	tuple->src.u3.ip = orig->dst.u3.ip;	/* swap the two IPv4 addresses */
+	tuple->dst.u3.ip = orig->src.u3.ip;
+
+	return true;	/* this step cannot fail */
+}
+
+static int ipv4_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "src=%pI4 dst=%pI4 ",	/* %pI4 takes a pointer */
+			  &tuple->src.u3.ip, &tuple->dst.u3.ip);
+}
+
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			    unsigned int *dataoff, u_int8_t *protonum)
+{
+	struct iphdr hdr_buf;
+	const struct iphdr *ip4;
+
+	ip4 = skb_header_pointer(skb, nhoff, sizeof(hdr_buf), &hdr_buf);
+
+	/* An unreadable header is rejected, and so is a non-first fragment:
+	 * conntrack defragments packets, but fragments may still be seen
+	 * inside ICMP packets. */
+	if (!ip4 || (ip4->frag_off & htons(IP_OFFSET)))
+		return -NF_DROP;
+
+	/* ihl counts 32-bit words, so ihl * 4 is the header length in bytes. */
+	*dataoff = nhoff + ip4->ihl * 4;
+	*protonum = ip4->protocol;
+
+	return NF_ACCEPT;
+}
+
+static unsigned int ipv4_confirm(unsigned int hooknum,
+				 struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 int (*okfn)(struct sk_buff *))
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_helper *helper;
+	unsigned int ret;
+
+	/* This is where we call the helper: as the packet goes out. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		goto out;	/* nothing to help with: just confirm */
+
+	help = nfct_help(ct);
+	if (!help)
+		goto out;
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	helper = rcu_dereference(help->helper);
+	if (!helper)
+		goto out;
+
+	ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+			   ct, ctinfo);
+	if (ret != NF_ACCEPT) {
+		nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
+			      "nf_ct_%s: dropping packet", helper->name);
+		return ret;	/* the helper's verdict is returned unchanged */
+	}
+
+	/* adjust seqs for loopback traffic only in outgoing direction */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_is_loopback_packet(skb)) {
+		typeof(nf_nat_seq_adjust_hook) seq_adjust;
+
+		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
+		if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
+			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+			return NF_DROP;	/* hook unset, or it returned 0 */
+		}
+	}
+out:
+	/* We've seen it coming out the other side: confirm it */
+	return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv4_conntrack_in(unsigned int hooknum,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); /* netns taken from "in" */
+}
+
+static unsigned int ipv4_conntrack_local(unsigned int hooknum,
+					 struct sk_buff *skb,
+					 const struct net_device *in,
+					 const struct net_device *out,
+					 int (*okfn)(struct sk_buff *))
+{
+	/* root is playing with raw sockets: too short for an IP header. */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;	/* accepted without being tracked */
+	return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); /* netns taken from "out" */
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+   make it the first hook. */
+static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
+	{
+		.hook		= ipv4_conntrack_in,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv4_conntrack_local,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv4_confirm,	/* helper call + confirmation */
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
+	},
+	{
+		.hook		= ipv4_confirm,	/* same handler, local delivery */
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
+	},
+};
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+static int log_invalid_proto_min = 0;	/* bounds for ip_conntrack_log_invalid */
+static int log_invalid_proto_max = 255;
+
+static ctl_table ip_ct_sysctl_table[] = {
+	{
+		.procname	= "ip_conntrack_max",
+		.data		= &nf_conntrack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ip_conntrack_count",
+		.data		= &init_net.ct.count,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,	/* read-only */
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ip_conntrack_buckets",
+		.data		= &init_net.ct.htable_size,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0444,	/* read-only */
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ip_conntrack_checksum",
+		.data		= &init_net.ct.sysctl_checksum,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ip_conntrack_log_invalid",
+		.data		= &init_net.ct.sysctl_log_invalid,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &log_invalid_proto_min,
+		.extra2		= &log_invalid_proto_max,
+	},
+	{ }	/* empty terminating entry */
+};
+#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
+
+/* SO_ORIGINAL_DST getsockopt handler: finds the conntrack entry that matches
+   the socket's own addresses and ports (the reply direction mapping) and
+   copies the destination of its ORIGINAL-direction tuple to userspace as a
+   struct sockaddr_in.  Returns 0, -ENOPROTOOPT, -EINVAL, -ENOENT, -EFAULT. */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+
+	memset(&tuple, 0, sizeof(tuple));
+	tuple.src.u3.ip = inet->inet_rcv_saddr;
+	tuple.src.u.tcp.port = inet->inet_sport;
+	tuple.dst.u3.ip = inet->inet_daddr;
+	tuple.dst.u.tcp.port = inet->inet_dport;
+	tuple.src.l3num = PF_INET;
+	tuple.dst.protonum = sk->sk_protocol;
+
+	/* We only do TCP and SCTP at the moment: is there a better way? */
+	if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) {
+		pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+		return -ENOPROTOOPT;
+	}
+
+	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
+		pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
+			 *len, sizeof(struct sockaddr_in));
+		return -EINVAL;	/* user buffer too small for a sockaddr_in */
+	}
+
+	h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
+	if (h) {
+		struct sockaddr_in sin;
+		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+		sin.sin_family = AF_INET;
+		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+			.tuple.dst.u.tcp.port;
+		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+			.tuple.dst.u3.ip;
+		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+		pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
+			 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+		nf_ct_put(ct);	/* sin is a local copy, ct is not needed below */
+		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+			return -EFAULT;
+		else
+			return 0;
+	}
+	pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
+		 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
+		 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
+	return -ENOENT;
+}
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *tuple)
+{
+	NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);	/* jumps to the label on failure */
+	NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
+	return 0;
+
+nla_put_failure:	/* reached only through the NLA_PUT_BE32() macros */
+	return -1;
+}
+
+static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
+	[CTA_IP_V4_SRC]	= { .type = NLA_U32 },	/* both addresses are 32-bit attributes */
+	[CTA_IP_V4_DST]	= { .type = NLA_U32 },
+};
+
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
+				struct nf_conntrack_tuple *t)
+{
+	struct nlattr *src = tb[CTA_IP_V4_SRC], *dst = tb[CTA_IP_V4_DST];
+
+	if (src == NULL || dst == NULL)	/* both addresses are mandatory */
+		return -EINVAL;
+	t->src.u3.ip = nla_get_be32(src);
+	t->dst.u3.ip = nla_get_be32(dst);
+	return 0;
+}
+
+static int ipv4_nlattr_tuple_size(void)
+{
+	return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1);	/* size derived from the policy table */
+}
+#endif
+
+static struct nf_sockopt_ops so_getorigdst = {
+	.pf		= PF_INET,
+	.get_optmin	= SO_ORIGINAL_DST,
+	.get_optmax	= SO_ORIGINAL_DST+1,	/* presumably an exclusive bound - TODO confirm */
+	.get		= &getorigdst,
+	.owner		= THIS_MODULE,
+};
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
+	.l3proto	 = PF_INET,
+	.name		 = "ipv4",
+	.pkt_to_tuple	 = ipv4_pkt_to_tuple,
+	.invert_tuple	 = ipv4_invert_tuple,
+	.print_tuple	 = ipv4_print_tuple,
+	.get_l4proto	 = ipv4_get_l4proto,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr = ipv4_tuple_to_nlattr,	/* netlink callbacks: only with ctnetlink */
+	.nlattr_tuple_size = ipv4_nlattr_tuple_size,
+	.nlattr_to_tuple = ipv4_nlattr_to_tuple,
+	.nla_policy	 = ipv4_nla_policy,
+#endif
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	.ctl_table_path  = nf_net_ipv4_netfilter_sysctl_path,
+	.ctl_table	 = ip_ct_sysctl_table,	/* ip_conntrack_* sysctl entries */
+#endif
+	.me		 = THIS_MODULE,
+};
+
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+		  &nf_conntrack_htable_size, 0600);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+MODULE_ALIAS("ip_conntrack");
+MODULE_LICENSE("GPL");
+
+static int __init nf_conntrack_l3proto_ipv4_init(void)
+{
+	int ret = 0;
+
+	need_conntrack();
+	nf_defrag_ipv4_enable();
+
+	ret = nf_register_sockopt(&so_getorigdst);
+	if (ret < 0) {
+		pr_err("Unable to register netfilter socket option\n");
+		return ret;
+	}
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv4: can't register tcp.\n");
+		goto cleanup_sockopt;
+	}
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv4: can't register udp.\n");
+		goto cleanup_tcp;
+	}
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv4: can't register icmp.\n");
+		goto cleanup_udp;
+	}
+
+	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv4: can't register ipv4\n");
+		goto cleanup_icmp;
+	}
+
+	ret = nf_register_hooks(ipv4_conntrack_ops,
+				ARRAY_SIZE(ipv4_conntrack_ops));
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv4: can't register hooks.\n");
+		goto cleanup_ipv4;
+	}
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	ret = nf_conntrack_ipv4_compat_init();
+	if (ret < 0)
+		goto cleanup_hooks;
+#endif
+	return ret;
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+ cleanup_hooks:	/* labels below unwind in reverse registration order */
+	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+#endif
+ cleanup_ipv4:
+	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ cleanup_icmp:
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+ cleanup_udp:
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+ cleanup_tcp:
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+ cleanup_sockopt:
+	nf_unregister_sockopt(&so_getorigdst);
+	return ret;
+}
+
+static void __exit nf_conntrack_l3proto_ipv4_fini(void)
+{
+	synchronize_net();
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	nf_conntrack_ipv4_compat_fini();
+#endif
+	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
+	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
+	nf_unregister_sockopt(&so_getorigdst);	/* registered first, removed last */
+}
+
+module_init(nf_conntrack_l3proto_ipv4_init);
+module_exit(nf_conntrack_l3proto_ipv4_fini);
+
+void need_ipv4_conntrack(void)
+{
+	/* Deliberately empty; presumably referenced only to depend on this module. */
+}
+EXPORT_SYMBOL_GPL(need_ipv4_conntrack);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
new file mode 100644
index 00000000..5585980f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -0,0 +1,462 @@
+/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
+
+struct ct_iter_state {
+	struct seq_net_private p;	/* NOTE(review): presumably must stay first for seq_open_net() - confirm */
+	unsigned int bucket;	/* conntrack hash bucket currently being walked */
+};
+
+/* Return the first node of the first non-empty conntrack hash bucket, or NULL
+ * when every bucket is empty.  The iterator's bucket index is left there. */
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+{
+	struct ct_iter_state *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct hlist_nulls_node *node;
+
+	for (it->bucket = 0; it->bucket < net->ct.htable_size; it->bucket++) {
+		node = rcu_dereference(
+			hlist_nulls_first_rcu(&net->ct.hash[it->bucket]));
+		if (!is_a_nulls(node))
+			return node;
+	}
+	return NULL;
+}
+
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+				      struct hlist_nulls_node *head)
+{
+	struct net *net = seq_file_net(seq);
+	struct ct_iter_state *st = seq->private;
+
+	head = rcu_dereference(hlist_nulls_next_rcu(head));
+	while (is_a_nulls(head)) {	/* hit an end-of-chain marker, not an entry */
+		if (likely(get_nulls_value(head) == st->bucket)) {
+			if (++st->bucket >= net->ct.htable_size)
+				return NULL;	/* walked past the last bucket */
+		}	/* else: marker of another bucket, so st->bucket is rescanned */
+		head = rcu_dereference(
+			hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+	}
+	return head;
+}
+
+/* Return the pos-th entry (0-based) of the table walk, or NULL if too short. */
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_nulls_node *node;
+
+	for (node = ct_get_first(seq); node && pos; pos--)
+		node = ct_get_next(seq, node);
+	return pos ? NULL : node;
+}
+
+/* seq_file ->start: enter the RCU read side and seek to entry number *pos. */
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU)
+{
+	rcu_read_lock();
+	return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	++*pos;				/* one position per hash entry visited */
+	return ct_get_next(s, v);	/* v: the entry to continue from */
+}
+
+/* seq_file ->stop: leave the RCU read side entered by ct_seq_start(). */
+static void ct_seq_stop(struct seq_file *s, void *v) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+/* Print "secctx=<ctx> " for ct's secmark.  Returns seq_printf()'s result, or
+ * 0 (printing nothing) when the secid cannot be converted to a context. */
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	char *ctx;
+	u32 ctx_len;
+	int err;
+
+	if (security_secid_to_secctx(ct->secmark, &ctx, &ctx_len))
+		return 0;
+	err = seq_printf(s, "secctx=%s ", ctx);
+	security_release_secctx(ctx, ctx_len);
+	return err;
+}
+#else
+/* Security marks are compiled out: print nothing and report success. */
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif /* CONFIG_NF_CONNTRACK_SECMARK */
+
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+	struct nf_conntrack_tuple_hash *hash = v;
+	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+	const struct nf_conntrack_l3proto *l3proto;
+	const struct nf_conntrack_l4proto *l4proto;
+	int ret = 0;
+
+	NF_CT_ASSERT(ct);
+	if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+		return 0;	/* use count already zero: print nothing for this entry */
+
+
+	/* print a conntrack only via its DIR_ORIGINAL hash entry; skip the others */
+	if (NF_CT_DIRECTION(hash))
+		goto release;
+	if (nf_ct_l3num(ct) != AF_INET)
+		goto release;	/* this compat file lists IPv4 entries only */
+
+	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+	NF_CT_ASSERT(l3proto);
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	NF_CT_ASSERT(l4proto);
+
+	ret = -ENOSPC;	/* returned if any print step below reports non-zero */
+	if (seq_printf(s, "%-8s %u %ld ",
+		      l4proto->name, nf_ct_protonum(ct),
+		      timer_pending(&ct->timeout)
+		      ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
+		goto release;
+
+	if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
+		goto release;
+
+	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+			l3proto, l4proto))
+		goto release;
+
+	if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
+		goto release;
+
+	if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
+		if (seq_printf(s, "[UNREPLIED] "))
+			goto release;
+
+	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+			l3proto, l4proto))
+		goto release;
+
+	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
+		goto release;
+
+	if (test_bit(IPS_ASSURED_BIT, &ct->status))
+		if (seq_printf(s, "[ASSURED] "))
+			goto release;
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	if (seq_printf(s, "mark=%u ", ct->mark))
+		goto release;
+#endif
+
+	if (ct_show_secctx(s, ct))
+		goto release;
+
+	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
+		goto release;
+	ret = 0;	/* the whole line was emitted */
+release:
+	nf_ct_put(ct);	/* drop the reference taken by atomic_inc_not_zero() above */
+	return ret;
+}
+
+static const struct seq_operations ct_seq_ops = {
+	.start = ct_seq_start,	/* takes rcu_read_lock() */
+	.next  = ct_seq_next,
+	.stop  = ct_seq_stop,	/* drops rcu_read_lock() */
+	.show  = ct_seq_show
+};
+
+static int ct_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ct_seq_ops,
+			    sizeof(struct ct_iter_state));	/* size of the per-open private state */
+}
+
+static const struct file_operations ct_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = ct_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,	/* counterpart of seq_open_net() in ct_open() */
+};
+
+/* expects: iteration state for walking the expectation hash table */
+struct ct_expect_iter_state {
+	struct seq_net_private p;
+	unsigned int bucket;	/* expect_hash bucket currently being walked */
+};
+
+/* First expectation of the first non-empty expect-hash bucket, or NULL. */
+static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+{
+	struct ct_expect_iter_state *it = seq->private;
+	struct hlist_head *table = seq_file_net(seq)->ct.expect_hash;
+	struct hlist_node *node;
+
+	for (it->bucket = 0; it->bucket < nf_ct_expect_hsize; it->bucket++) {
+		node = rcu_dereference(hlist_first_rcu(&table[it->bucket]));
+		if (node)
+			return node;
+	}
+	return NULL;
+}
+
+static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+					     struct hlist_node *head)
+{
+	struct net *net = seq_file_net(seq);
+	struct ct_expect_iter_state *st = seq->private;
+
+	head = rcu_dereference(hlist_next_rcu(head));
+	while (head == NULL) {	/* end of this chain: try the following buckets */
+		if (++st->bucket >= nf_ct_expect_hsize)
+			return NULL;	/* no buckets left */
+		head = rcu_dereference(
+			hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+	}
+	return head;	/* next expectation; st->bucket names its bucket */
+}
+
+/* Return the pos-th expectation (0-based) of the walk, or NULL if too short. */
+static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node;
+
+	for (node = ct_expect_get_first(seq); node && pos; pos--)
+		node = ct_expect_get_next(seq, node);
+	return pos ? NULL : node;
+}
+
+/* seq_file ->start: enter the RCU read side, seek to expectation number *pos. */
+static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU)
+{
+	rcu_read_lock();
+	return ct_expect_get_idx(seq, *pos);
+}
+
+static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;					/* one position per expectation visited */
+	return ct_expect_get_next(seq, v);	/* v: the node to continue from */
+}
+
+/* seq_file ->stop: leave the RCU read side entered by exp_seq_start(). */
+static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/* Emit one line for an IPv4 expectation: seconds left on its timer ("-" if it
+ * has no timer function), protocol number and tuple.  Other families: nothing. */
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+	struct nf_conntrack_expect *exp =
+		hlist_entry(v, struct nf_conntrack_expect, hnode);
+	const struct nf_conntrack_tuple *t = &exp->tuple;
+	long secs = 0;
+
+	if (t->src.l3num != AF_INET)
+		return 0;
+
+	if (!exp->timeout.function) {
+		seq_printf(s, "- ");
+	} else {
+		if (timer_pending(&exp->timeout))
+			secs = (long)(exp->timeout.expires - jiffies)/HZ;
+		seq_printf(s, "%ld ", secs);
+	}
+	seq_printf(s, "proto=%u ", t->dst.protonum);
+	print_tuple(s, t, __nf_ct_l3proto_find(t->src.l3num),
+		    __nf_ct_l4proto_find(t->src.l3num, t->dst.protonum));
+	return seq_putc(s, '\n');
+}
+
+static const struct seq_operations exp_seq_ops = {
+	.start = exp_seq_start,	/* takes rcu_read_lock() */
+	.next = exp_seq_next,
+	.stop = exp_seq_stop,	/* drops rcu_read_lock() */
+	.show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &exp_seq_ops,
+			    sizeof(struct ct_expect_iter_state));	/* size of the per-open private state */
+}
+
+static const struct file_operations ip_exp_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = exp_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,	/* counterpart of seq_open_net() in exp_open() */
+};
+
+/* Position 0 yields the header token; position N > 0 yields the statistics of
+ * the first possible CPU with id >= N - 1 and sets *pos to that id + 1. */
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+	cpu = *pos - 1;
+	while (cpu < nr_cpu_ids && !cpu_possible(cpu))
+		cpu++;
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+	*pos = cpu + 1;
+	return per_cpu_ptr(net->ct.stat, cpu);
+}
+
+/* Statistics of the first possible CPU with id >= *pos (then *pos = id + 1),
+ * or NULL, leaving *pos untouched, once no such CPU remains. */
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu = *pos;
+
+	while (cpu < nr_cpu_ids && !cpu_possible(cpu))
+		cpu++;
+	if (cpu >= nr_cpu_ids)
+		return NULL;
+	*pos = cpu + 1;
+	return per_cpu_ptr(net->ct.stat, cpu);
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}	/* no-op: ct_cpu_seq_start()/ct_cpu_seq_next() take no lock or reference */
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq_file_net(seq);
+	unsigned int nr_conntracks = atomic_read(&net->ct.count);
+	const struct ip_conntrack_stat *st = v;	/* only dereferenced when v is not the start token */
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete search_restart\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
+			"%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
+		   nr_conntracks,	/* "entries": per-netns count, repeated on every CPU row */
+		   st->searched,
+		   st->found,
+		   st->new,
+		   st->invalid,
+		   st->ignore,
+		   st->delete,
+		   st->delete_list,
+		   st->insert,
+		   st->insert_failed,
+		   st->drop,
+		   st->early_drop,
+		   st->error,	/* printed under the "icmp_error" column */
+
+		   st->expect_new,
+		   st->expect_create,
+		   st->expect_delete,
+		   st->search_restart
+		);
+	return 0;
+}
+
+static const struct seq_operations ct_cpu_seq_ops = {
+	.start  = ct_cpu_seq_start,	/* position 0 is the header row */
+	.next   = ct_cpu_seq_next,
+	.stop   = ct_cpu_seq_stop,
+	.show   = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ct_cpu_seq_ops,
+			    sizeof(struct seq_net_private));	/* no iterator state beyond the netns part */
+}
+
+static const struct file_operations ct_cpu_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ct_cpu_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,	/* counterpart of seq_open_net() in ct_cpu_seq_open() */
+};
+
+/* Per-netns init: create <proc_net>/ip_conntrack, <proc_net>/ip_conntrack_expect
+ * and <proc_net_stat>/ip_conntrack.  On any failure the entries created so far
+ * are removed again and -ENOMEM is returned. */
+static int __net_init ip_conntrack_net_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops))
+		goto fail;
+
+	if (!proc_net_fops_create(net, "ip_conntrack_expect", 0440,
+				  &ip_exp_file_ops))
+		goto undo_conntrack;
+
+	if (!proc_create("ip_conntrack", S_IRUGO, net->proc_net_stat,
+			 &ct_cpu_seq_fops))
+		goto undo_expect;
+
+	return 0;
+
+	/* Error unwind: remove what was created, newest entry first. */
+undo_expect:
+	proc_net_remove(net, "ip_conntrack_expect");
+undo_conntrack:
+	proc_net_remove(net, "ip_conntrack");
+fail:
+	return -ENOMEM;
+}
+
+static void __net_exit ip_conntrack_net_exit(struct net *net)
+{
+	remove_proc_entry("ip_conntrack", net->proc_net_stat);	/* the per-CPU statistics file */
+	proc_net_remove(net, "ip_conntrack_expect");
+	proc_net_remove(net, "ip_conntrack");	/* reverse of the creation order in ip_conntrack_net_init() */
+}
+
+static struct pernet_operations ip_conntrack_net_ops = {
+	.init = ip_conntrack_net_init,	/* creates the three compat /proc files */
+	.exit = ip_conntrack_net_exit,	/* removes them again */
+};
+
+int __init nf_conntrack_ipv4_compat_init(void)
+{
+	return register_pernet_subsys(&ip_conntrack_net_ops);	/* result is returned to the caller unchanged */
+}
+
+void __exit nf_conntrack_ipv4_compat_fini(void)
+{
+	unregister_pernet_subsys(&ip_conntrack_net_ops);	/* undoes nf_conntrack_ipv4_compat_init() */
+}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
new file mode 100644
index 00000000..ab5b27a2
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -0,0 +1,318 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/icmp.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;	/* in jiffies; exposed through the sysctl tables in this file */
+
+/* Fill the ICMP part of tuple (type, code, echo id) from the header found at
+ * dataoff in skb.  Returns false if the header cannot be read. */
+static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			      struct nf_conntrack_tuple *tuple)
+{
+	struct icmphdr buf;
+	const struct icmphdr *icmph;
+
+	icmph = skb_header_pointer(skb, dataoff, sizeof(buf), &buf);
+	if (!icmph)
+		return false;
+	tuple->src.u.icmp.id = icmph->un.echo.id;
+	tuple->dst.u.icmp.type = icmph->type;
+	tuple->dst.u.icmp.code = icmph->code;
+	return true;
+}
+
+/* Request<->reply type map, stored as (partner type + 1); unlisted types stay 0. */
+static const u_int8_t invmap[] = {
+	[ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+	[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+	[ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+	[ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+	[ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+	[ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+	[ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+	[ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
+};
+
+/* Build in tuple the reply counterpart of orig; false if its type has none. */
+static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
+			      const struct nf_conntrack_tuple *orig)
+{
+	if (orig->dst.u.icmp.type >= sizeof(invmap) ||
+	    !invmap[orig->dst.u.icmp.type])
+		return false;
+	tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+	tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+	tuple->src.u.icmp.id = orig->src.u.icmp.id;
+	return true;
+}
+
+/* Write the ICMP-specific tuple fields (type, code, id) to the seq_file.
+ * The id is passed through ntohs() before it is printed. */
+static int icmp_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "type=%u code=%u id=%u ",
+			  tuple->dst.u.icmp.type, tuple->dst.u.icmp.code,
+			  ntohs(tuple->src.u.icmp.id));
+}
+
+/* Per-packet handler: refreshes the conntrack timeout and always returns NF_ACCEPT. */
+static int icmp_packet(struct nf_conn *ct,
+		       const struct sk_buff *skb,
+		       unsigned int dataoff,
+		       enum ip_conntrack_info ctinfo,
+		       u_int8_t pf,
+		       unsigned int hooknum)
+{
+	/* Do not immediately delete the connection after the first
+	   successful reply, to avoid excessive conntrackd traffic
+	   and also to correctly handle ICMP echo reply duplicates. */
+	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
+
+	return NF_ACCEPT;
+}
+
+/* Decide whether a packet may start a new ICMP "connection": only the four
+ * request types listed in valid_new may; anything else is reported and refused. */
+static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
+		     unsigned int dataoff)
+{
+	static const u_int8_t valid_new[] = {
+		[ICMP_ECHO] = 1,
+		[ICMP_TIMESTAMP] = 1,
+		[ICMP_INFO_REQUEST] = 1,
+		[ICMP_ADDRESS] = 1
+	};
+	const struct nf_conntrack_tuple *orig = &ct->tuplehash[0].tuple;
+
+	if (orig->dst.u.icmp.type < sizeof(valid_new) &&
+	    valid_new[orig->dst.u.icmp.type])
+		return true;
+	pr_debug("icmp: can't create new conn with type %u\n",
+		 orig->dst.u.icmp.type);
+	nf_ct_dump_tuple_ip(orig);
+	return false;
+}
+
+/* Tie an ICMP error to the conntrack of its embedded packet: NF_ACCEPT if found, else -NF_ACCEPT. */
+static int
+icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+		 enum ip_conntrack_info *ctinfo,
+		 unsigned int hooknum)
+{
+	struct nf_conntrack_tuple innertuple, origtuple;
+	const struct nf_conntrack_l4proto *innerproto;
+	const struct nf_conntrack_tuple_hash *h;
+	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;	/* zone of the template, if one was given */
+
+	NF_CT_ASSERT(skb->nfct == NULL);
+
+	/* Are they talking about one of our connections? */
+	if (!nf_ct_get_tuplepr(skb,
+			       skb_network_offset(skb) + ip_hdrlen(skb)
+						       + sizeof(struct icmphdr),
+			       PF_INET, &origtuple)) {
+		pr_debug("icmp_error_message: failed to get tuple\n");
+		return -NF_ACCEPT;
+	}
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
+
+	/* Ordinarily, we'd expect the inverted tupleproto, but it's
+	   been preserved inside the ICMP. */
+	if (!nf_ct_invert_tuple(&innertuple, &origtuple,
+				&nf_conntrack_l3proto_ipv4, innerproto)) {
+		pr_debug("icmp_error_message: no match\n");
+		return -NF_ACCEPT;
+	}
+
+	*ctinfo = IP_CT_RELATED;
+
+	h = nf_conntrack_find_get(net, zone, &innertuple);
+	if (!h) {
+		pr_debug("icmp_error_message: no match\n");
+		return -NF_ACCEPT;
+	}
+
+	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+		*ctinfo += IP_CT_IS_REPLY;	/* matched via the reply-direction hash entry */
+
+	/* Update skb to refer to this connection */
+	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return NF_ACCEPT;
+}
+
+/* Small and modified version of icmp_rcv: validate the header, then look up error messages. */
+static int
+icmp_error(struct net *net, struct nf_conn *tmpl,
+	   struct sk_buff *skb, unsigned int dataoff,
+	   enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+{
+	const struct icmphdr *icmph;
+	struct icmphdr _ih;
+
+	/* Not enough header? */
+	icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
+	if (icmph == NULL) {
+		if (LOG_INVALID(net, IPPROTO_ICMP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_icmp: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	/* See ip_conntrack_proto_tcp.c */
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_ip_checksum(skb, hooknum, dataoff, 0)) {
+		if (LOG_INVALID(net, IPPROTO_ICMP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_icmp: bad HW ICMP checksum ");
+		return -NF_ACCEPT;
+	}
+
+	/*
+	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
+	 *
+	 *	RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
+	 *		  discarded.
+	 */
+	if (icmph->type > NR_ICMP_TYPES) {
+		if (LOG_INVALID(net, IPPROTO_ICMP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_icmp: invalid ICMP type ");
+		return -NF_ACCEPT;
+	}
+
+	/* Need to track icmp error message? */
+	if (icmph->type != ICMP_DEST_UNREACH &&
+	    icmph->type != ICMP_SOURCE_QUENCH &&
+	    icmph->type != ICMP_TIME_EXCEEDED &&
+	    icmph->type != ICMP_PARAMETERPROB &&
+	    icmph->type != ICMP_REDIRECT)
+		return NF_ACCEPT;	/* none of the five error types: no lookup here */
+
+	return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
+}
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int icmp_tuple_to_nlattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *t)
+{
+	NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
+	NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
+	NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
+
+	return 0;	/* all three attributes were added */
+
+nla_put_failure:	/* NOTE(review): presumably the NLA_PUT_* macros jump here on failure - confirm */
+	return -1;
+}
+
+static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
+	[CTA_PROTO_ICMP_TYPE]	= { .type = NLA_U8 },
+	[CTA_PROTO_ICMP_CODE]	= { .type = NLA_U8 },
+	[CTA_PROTO_ICMP_ID]	= { .type = NLA_U16 },	/* read with nla_get_be16() in icmp_nlattr_to_tuple() */
+};
+
+/* Parse the ICMP type, code and id attributes into tuple.  All three must be
+ * present and the type must have an invmap entry, otherwise -EINVAL. */
+static int icmp_nlattr_to_tuple(struct nlattr *tb[],
+				struct nf_conntrack_tuple *tuple)
+{
+	const struct nlattr *type = tb[CTA_PROTO_ICMP_TYPE];
+	const struct nlattr *code = tb[CTA_PROTO_ICMP_CODE];
+	const struct nlattr *id = tb[CTA_PROTO_ICMP_ID];
+
+	if (!type || !code || !id)
+		return -EINVAL;
+	tuple->dst.u.icmp.type = nla_get_u8(type);
+	tuple->dst.u.icmp.code = nla_get_u8(code);
+	tuple->src.u.icmp.id = nla_get_be16(id);
+
+	return (tuple->dst.u.icmp.type < sizeof(invmap) &&
+		invmap[tuple->dst.u.icmp.type]) ? 0 : -EINVAL;
+}
+
+static int icmp_nlattr_tuple_size(void)
+{
+	return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);	/* length derived from the whole policy table */
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *icmp_sysctl_header;
+static struct ctl_table icmp_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_icmp_timeout",
+		.data		= &nf_ct_icmp_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }	/* empty terminating entry */
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table icmp_compat_sysctl_table[] = {
+	{
+		.procname	= "ip_conntrack_icmp_timeout",
+		.data		= &nf_ct_icmp_timeout,	/* same variable as nf_conntrack_icmp_timeout */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }	/* empty terminating entry */
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
+{
+	.l3proto		= PF_INET,
+	.l4proto		= IPPROTO_ICMP,
+	.name			= "icmp",
+	.pkt_to_tuple		= icmp_pkt_to_tuple,
+	.invert_tuple		= icmp_invert_tuple,
+	.print_tuple		= icmp_print_tuple,
+	.packet			= icmp_packet,
+	.new			= icmp_new,
+	.error			= icmp_error,
+	.destroy		= NULL,	/* no destroy callback is provided */
+	.me			= NULL,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= icmp_tuple_to_nlattr,
+	.nlattr_tuple_size	= icmp_nlattr_tuple_size,
+	.nlattr_to_tuple	= icmp_nlattr_to_tuple,
+	.nla_policy		= icmp_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_header	= &icmp_sysctl_header,
+	.ctl_table		= icmp_sysctl_table,
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	.ctl_compat_table	= icmp_compat_sysctl_table,	/* the ip_conntrack_* sysctl name */
+#endif
+#endif
+};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 00000000..f3a9b42b
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,128 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/* Feed skb to ip_defrag() with bottom halves disabled and return its result.
+ * When that result is 0 the IP header checksum of skb is recomputed. */
+static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	int rc;
+
+	skb_orphan(skb);
+
+	local_bh_disable();
+	rc = ip_defrag(skb, user);
+	local_bh_enable();
+
+	if (rc == 0)
+		ip_send_check(ip_hdr(skb));
+	return rc;
+}
+
+/* Pick the defrag user id for skb: a bridge-in, in or out base value plus the
+ * conntrack zone of skb (the default zone if no conntrack is attached). */
+static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
+					      struct sk_buff *skb)
+{
+	u16 zone = NF_CT_DEFAULT_ZONE;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (skb->nfct)
+		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge &&
+	    (skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING))
+		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+	return (hooknum == NF_INET_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN
+					       : IP_DEFRAG_CONNTRACK_OUT) + zone;
+}
+
+static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+					  struct sk_buff *skb,
+					  const struct net_device *in,
+					  const struct net_device *out,
+					  int (*okfn)(struct sk_buff *))
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(skb->sk);	/* only dereferenced after the sk check below */
+
+	if (sk && (sk->sk_family == PF_INET) &&
+	    inet->nodefrag)
+		return NF_ACCEPT;	/* owning IPv4 socket has nodefrag set: leave skb alone */
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
+	/* Previously seen (loopback)?  Ignore.  Do this before
+	   fragment check. */
+	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+		return NF_ACCEPT;
+#endif
+#endif
+	/* Gather fragments: MF set or a non-zero offset marks a fragment. */
+	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+		enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
+		if (nf_ct_ipv4_gather_frags(skb, user))
+			return NF_STOLEN;	/* non-zero result: skb is not passed on */
+	}
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops ipv4_defrag_ops[] = {
+	{
+		.hook		= ipv4_conntrack_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_PRE_ROUTING,	/* first of the two hook points */
+		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+	},
+	{
+		.hook           = ipv4_conntrack_defrag,
+		.owner          = THIS_MODULE,
+		.pf             = PF_INET,
+		.hooknum        = NF_INET_LOCAL_OUT,	/* same handler, second hook point */
+		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
+	},
+};
+
+static int __init nf_defrag_init(void)
+{
+	return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));	/* registers both entries of the table */
+}
+
+static void __exit nf_defrag_fini(void)
+{
+	nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));	/* undoes nf_defrag_init() */
+}
+
+void nf_defrag_ipv4_enable(void)
+{
+}	/* empty body; NOTE(review): presumably exported only so users depend on this module - confirm */
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+
+module_init(nf_defrag_init);	/* hooks are registered at module load */
+module_exit(nf_defrag_fini);	/* and unregistered at module removal */
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
new file mode 100644
index 00000000..703f366f
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -0,0 +1,85 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_amanda");	/* NOTE(review): presumably the older module name - confirm */
+
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct nf_conntrack_expect *exp)
+{
+	char buffer[sizeof("65535")];	/* room for the largest 16-bit port in decimal plus NUL */
+	u_int16_t port;
+	unsigned int ret;
+
+	/* Connection comes from client. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_ORIGINAL;
+
+	/* When you see the packet, we need to NAT it the same as
+	 * this one (ie. same IP: it will be TCP and master is UDP). */
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {	/* 16-bit port wraps to 0 after 65535 */
+		int res;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		res = nf_ct_expect_related(exp);
+		if (res == 0)
+			break;
+		else if (res != -EBUSY) {
+			port = 0;	/* any error other than -EBUSY: stop trying */
+			break;
+		}
+	}
+
+	if (port == 0)
+		return NF_DROP;	/* no expectation could be registered */
+
+	sprintf(buffer, "%u", port);
+	ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo,
+				       matchoff, matchlen,
+				       buffer, strlen(buffer));
+	if (ret != NF_ACCEPT)
+		nf_ct_unexpect_related(exp);	/* mangling failed: withdraw the expectation again */
+	return ret;
+}
+
+static void __exit nf_nat_amanda_fini(void)
+{
+	rcu_assign_pointer(nf_nat_amanda_hook, NULL);	/* unpublish help() */
+	synchronize_rcu();	/* wait for RCU readers before the module goes away */
+}
+
+static int __init nf_nat_amanda_init(void)
+{
+	BUG_ON(nf_nat_amanda_hook != NULL);	/* the hook must not already be set */
+	rcu_assign_pointer(nf_nat_amanda_hook, help);	/* publish help() as the hook */
+	return 0;
+}
+
+module_init(nf_nat_amanda_init);	/* installs the hook at module load */
+module_exit(nf_nat_amanda_fini);	/* clears it at module removal */
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
new file mode 100644
index 00000000..3346de5d
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -0,0 +1,779 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>  /* For tcp_prot in getorigdst */
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/jhash.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static DEFINE_SPINLOCK(nf_nat_lock);
+
+static struct nf_conntrack_l3proto *l3proto __read_mostly;
+
+#define MAX_IP_NAT_PROTO 256
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
+						__read_mostly;
+
+static inline const struct nf_nat_protocol *
+__nf_nat_proto_find(u_int8_t protonum)
+{
+	return rcu_dereference(nf_nat_protos[protonum]);
+}
+
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *net, u16 zone,
+	    const struct nf_conntrack_tuple *tuple)
+{
+	unsigned int hash;
+
+	/* Original src, to ensure we map it consistently if poss. */
+	hash = jhash_3words((__force u32)tuple->src.u3.ip,
+			    (__force u32)tuple->src.u.all ^ zone,
+			    tuple->dst.protonum, 0);
+	return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
+		  const struct nf_conn *ignored_conntrack)
+{
+	/* Conntrack tracking doesn't keep track of outgoing tuples; only
+	   incoming ones.  NAT means they don't have a fixed mapping,
+	   so we invert the tuple and look for the incoming reply.
+
+	   We could keep a separate hash if this proves too slow. */
+	struct nf_conntrack_tuple reply;
+
+	nf_ct_invert_tuplepr(&reply, tuple);
+	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+EXPORT_SYMBOL(nf_nat_used_tuple);
+
+/* If we source map this tuple so reply looks like reply_tuple, will
+ * that meet the constraints of range. */
+static int
+in_range(const struct nf_conntrack_tuple *tuple,
+	 const struct nf_nat_range *range)
+{
+	const struct nf_nat_protocol *proto;
+	int ret = 0;
+
+	/* If we are supposed to map IPs, then we must be in the
+	   range specified, otherwise let this drag us onto a new src IP. */
+	if (range->flags & IP_NAT_RANGE_MAP_IPS) {
+		if (ntohl(tuple->src.u3.ip) < ntohl(range->min_ip) ||
+		    ntohl(tuple->src.u3.ip) > ntohl(range->max_ip))
+			return 0;
+	}
+
+	rcu_read_lock();
+	proto = __nf_nat_proto_find(tuple->dst.protonum);
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
+	    proto->in_range(tuple, IP_NAT_MANIP_SRC,
+			    &range->min, &range->max))
+		ret = 1;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static inline int
+same_src(const struct nf_conn *ct,
+	 const struct nf_conntrack_tuple *tuple)
+{
+	const struct nf_conntrack_tuple *t;
+
+	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	return (t->dst.protonum == tuple->dst.protonum &&
+		t->src.u3.ip == tuple->src.u3.ip &&
+		t->src.u.all == tuple->src.u.all);
+}
+
+/* Only called for SRC manip */
+static int
+find_appropriate_src(struct net *net, u16 zone,
+		     const struct nf_conntrack_tuple *tuple,
+		     struct nf_conntrack_tuple *result,
+		     const struct nf_nat_range *range)
+{
+	unsigned int h = hash_by_src(net, zone, tuple);
+	const struct nf_conn_nat *nat;
+	const struct nf_conn *ct;
+	const struct hlist_node *n;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
+		ct = nat->ct;
+		if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
+			/* Copy source part from reply tuple. */
+			nf_ct_invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
+
+			if (in_range(result, range)) {
+				rcu_read_unlock();
+				return 1;
+			}
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
+   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+   1-65535, we don't do pro-rata allocation based on ports; we choose
+   the ip with the lowest src-ip/dst-ip/proto usage.
+*/
+static void
+find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
+		    const struct nf_nat_range *range,
+		    const struct nf_conn *ct,
+		    enum nf_nat_manip_type maniptype)
+{
+	__be32 *var_ipp;
+	/* Host order */
+	u_int32_t minip, maxip, j;
+
+	/* No IP mapping?  Do nothing. */
+	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
+		return;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		var_ipp = &tuple->src.u3.ip;
+	else
+		var_ipp = &tuple->dst.u3.ip;
+
+	/* Fast path: only one choice. */
+	if (range->min_ip == range->max_ip) {
+		*var_ipp = range->min_ip;
+		return;
+	}
+
+	/* Hashing source and destination IPs gives a fairly even
+	 * spread in practice (if there are a small number of IPs
+	 * involved, there usually aren't that many connections
+	 * anyway).  The consistency means that servers see the same
+	 * client coming from the same IP (some Internet Banking sites
+	 * like this), even across reboots. */
+	minip = ntohl(range->min_ip);
+	maxip = ntohl(range->max_ip);
+	j = jhash_2words((__force u32)tuple->src.u3.ip,
+			 range->flags & IP_NAT_RANGE_PERSISTENT ?
+				0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
+	j = ((u64)j * (maxip - minip + 1)) >> 32;
+	*var_ipp = htonl(minip + j);
+}
+
+/* Manipulate the tuple into the range given.  For NF_INET_POST_ROUTING,
+ * we change the source to map into the range.  For NF_INET_PRE_ROUTING
+ * and NF_INET_LOCAL_OUT, we change the destination to map into the
+ * range.  It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet. */
+static void
+get_unique_tuple(struct nf_conntrack_tuple *tuple,
+		 const struct nf_conntrack_tuple *orig_tuple,
+		 const struct nf_nat_range *range,
+		 struct nf_conn *ct,
+		 enum nf_nat_manip_type maniptype)
+{
+	struct net *net = nf_ct_net(ct);
+	const struct nf_nat_protocol *proto;
+	u16 zone = nf_ct_zone(ct);
+
+	/* 1) If this srcip/proto/src-proto-part is currently mapped,
+	   and that same mapping gives a unique tuple within the given
+	   range, use that.
+
+	   This is only required for source (ie. NAT/masq) mappings.
+	   So far, we don't do local source mappings, so multiple
+	   manips not an issue.  */
+	if (maniptype == IP_NAT_MANIP_SRC &&
+	    !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+		/* try the original tuple first */
+		if (in_range(orig_tuple, range)) {
+			if (!nf_nat_used_tuple(orig_tuple, ct)) {
+				*tuple = *orig_tuple;
+				return;
+			}
+		} else if (find_appropriate_src(net, zone, orig_tuple, tuple,
+			   range)) {
+			pr_debug("get_unique_tuple: Found current src map\n");
+			if (!nf_nat_used_tuple(tuple, ct))
+				return;
+		}
+	}
+
+	/* 2) Select the least-used IP/proto combination in the given
+	   range. */
+	*tuple = *orig_tuple;
+	find_best_ips_proto(zone, tuple, range, ct, maniptype);
+
+	/* 3) The per-protocol part of the manip is made to map into
+	   the range to make a unique tuple. */
+
+	rcu_read_lock();
+	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
+
+	/* Only bother mapping if it's not already in range and unique */
+	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+		if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+			if (proto->in_range(tuple, maniptype, &range->min,
+					    &range->max) &&
+			    (range->min.all == range->max.all ||
+			     !nf_nat_used_tuple(tuple, ct)))
+				goto out;
+		} else if (!nf_nat_used_tuple(tuple, ct)) {
+			goto out;
+		}
+	}
+
+	/* Last chance: get protocol to try to obtain unique tuple. */
+	proto->unique_tuple(tuple, range, maniptype, ct);
+out:
+	rcu_read_unlock();
+}
+
+unsigned int
+nf_nat_setup_info(struct nf_conn *ct,
+		  const struct nf_nat_range *range,
+		  enum nf_nat_manip_type maniptype)
+{
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_tuple curr_tuple, new_tuple;
+	struct nf_conn_nat *nat;
+
+	/* nat helper or nfctnetlink also setup binding */
+	nat = nfct_nat(ct);
+	if (!nat) {
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL) {
+			pr_debug("failed to add NAT extension\n");
+			return NF_ACCEPT;
+		}
+	}
+
+	NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC ||
+		     maniptype == IP_NAT_MANIP_DST);
+	BUG_ON(nf_nat_initialized(ct, maniptype));
+
+	/* What we've got will look like inverse of reply. Normally
+	   this is what is in the conntrack, except for prior
+	   manipulations (future optimization: if num_manips == 0,
+	   orig_tp =
+	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+	nf_ct_invert_tuplepr(&curr_tuple,
+			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+
+	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+		struct nf_conntrack_tuple reply;
+
+		/* Alter conntrack table so will recognize replies. */
+		nf_ct_invert_tuplepr(&reply, &new_tuple);
+		nf_conntrack_alter_reply(ct, &reply);
+
+		/* Non-atomic: we own this at the moment. */
+		if (maniptype == IP_NAT_MANIP_SRC)
+			ct->status |= IPS_SRC_NAT;
+		else
+			ct->status |= IPS_DST_NAT;
+	}
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		unsigned int srchash;
+
+		srchash = hash_by_src(net, nf_ct_zone(ct),
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		spin_lock_bh(&nf_nat_lock);
+		/* nf_conntrack_alter_reply might re-allocate extension area */
+		nat = nfct_nat(ct);
+		nat->ct = ct;
+		hlist_add_head_rcu(&nat->bysource,
+				   &net->ipv4.nat_bysource[srchash]);
+		spin_unlock_bh(&nf_nat_lock);
+	}
+
+	/* It's done. */
+	if (maniptype == IP_NAT_MANIP_DST)
+		ct->status |= IPS_DST_NAT_DONE;
+	else
+		ct->status |= IPS_SRC_NAT_DONE;
+
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL(nf_nat_setup_info);
+
+/* Returns true if succeeded. */
+static bool
+manip_pkt(u_int16_t proto,
+	  struct sk_buff *skb,
+	  unsigned int iphdroff,
+	  const struct nf_conntrack_tuple *target,
+	  enum nf_nat_manip_type maniptype)
+{
+	struct iphdr *iph;
+	const struct nf_nat_protocol *p;
+
+	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
+		return false;
+
+	iph = (void *)skb->data + iphdroff;
+
+	/* Manipulate protocol part. */
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	p = __nf_nat_proto_find(proto);
+	if (!p->manip_pkt(skb, iphdroff, target, maniptype))
+		return false;
+
+	iph = (void *)skb->data + iphdroff;
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+		iph->saddr = target->src.u3.ip;
+	} else {
+		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+		iph->daddr = target->dst.u3.ip;
+	}
+	return true;
+}
+
+/* Do packet manipulations according to nf_nat_setup_info. */
+unsigned int nf_nat_packet(struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned int hooknum,
+			   struct sk_buff *skb)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned long statusbit;
+	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+	if (mtype == IP_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	/* Non-atomic: these bits don't change. */
+	if (ct->status & statusbit) {
+		struct nf_conntrack_tuple target;
+
+		/* We are aiming to look like inverse of other direction. */
+		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+		if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
+			return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_nat_packet);
+
+/* Dir is direction ICMP is coming from (opposite to packet it contains) */
+int nf_nat_icmp_reply_translation(struct nf_conn *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned int hooknum,
+				  struct sk_buff *skb)
+{
+	struct {
+		struct icmphdr icmp;
+		struct iphdr ip;
+	} *inside;
+	const struct nf_conntrack_l4proto *l4proto;
+	struct nf_conntrack_tuple inner, target;
+	int hdrlen = ip_hdrlen(skb);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned long statusbit;
+	enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+
+	if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
+		return 0;
+
+	inside = (void *)skb->data + hdrlen;
+
+	/* We're actually going to mangle it beyond trivial checksum
+	   adjustment, so make sure the current checksum is correct. */
+	if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
+		return 0;
+
+	/* Must be RELATED */
+	NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
+		     skb->nfctinfo == IP_CT_RELATED_REPLY);
+
+	/* Redirects on non-null nats must be dropped, else they'll
+	   start talking to each other without our translation, and be
+	   confused... --RR */
+	if (inside->icmp.type == ICMP_REDIRECT) {
+		/* If NAT isn't finished, assume it and drop. */
+		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+			return 0;
+
+		if (ct->status & IPS_NAT_MASK)
+			return 0;
+	}
+
+	if (manip == IP_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	if (!(ct->status & statusbit))
+		return 1;
+
+	pr_debug("icmp_reply_translation: translating error %p manip %u "
+		 "dir %s\n", skb, manip,
+		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
+
+	if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
+			     (hdrlen +
+			      sizeof(struct icmphdr) + inside->ip.ihl * 4),
+			     (u_int16_t)AF_INET, inside->ip.protocol,
+			     &inner, l3proto, l4proto))
+		return 0;
+
+	/* Change inner back to look like incoming packet.  We do the
+	   opposite manip on this hook to normal, because it might not
+	   pass all hooks (locally-generated ICMP).  Consider incoming
+	   packet: PREROUTING (DST manip), routing produces ICMP, goes
+	   through POSTROUTING (which must correct the DST manip). */
+	if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
+		       &ct->tuplehash[!dir].tuple, !manip))
+		return 0;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		/* Reloading "inside" here since manip_pkt may reallocate. */
+		inside = (void *)skb->data + hdrlen;
+		inside->icmp.checksum = 0;
+		inside->icmp.checksum =
+			csum_fold(skb_checksum(skb, hdrlen,
+					       skb->len - hdrlen, 0));
+	}
+
+	/* Change outer to look like the reply to an incoming packet
+	 * (proto 0 means don't invert per-proto part). */
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+	if (!manip_pkt(0, skb, 0, &target, manip))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+
+/* Protocol registration. */
+int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
+{
+	int ret = 0;
+
+	spin_lock_bh(&nf_nat_lock);
+	if (rcu_dereference_protected(
+			nf_nat_protos[proto->protonum],
+			lockdep_is_held(&nf_nat_lock)
+			) != &nf_nat_unknown_protocol) {
+		ret = -EBUSY;
+		goto out;
+	}
+	rcu_assign_pointer(nf_nat_protos[proto->protonum], proto);
+ out:
+	spin_unlock_bh(&nf_nat_lock);
+	return ret;
+}
+EXPORT_SYMBOL(nf_nat_protocol_register);
+
+/* No one stores the protocol anywhere; simply delete it. */
+void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
+{
+	spin_lock_bh(&nf_nat_lock);
+	rcu_assign_pointer(nf_nat_protos[proto->protonum],
+			   &nf_nat_unknown_protocol);
+	spin_unlock_bh(&nf_nat_lock);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_nat_protocol_unregister);
+
+/* No one is using the conntrack by the time this is called. */
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
+{
+	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
+
+	if (nat == NULL || nat->ct == NULL)
+		return;
+
+	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&nat->bysource);
+	spin_unlock_bh(&nf_nat_lock);
+}
+
+static void nf_nat_move_storage(void *new, void *old)
+{
+	struct nf_conn_nat *new_nat = new;
+	struct nf_conn_nat *old_nat = old;
+	struct nf_conn *ct = old_nat->ct;
+
+	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
+		return;
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
+	spin_unlock_bh(&nf_nat_lock);
+}
+
+static struct nf_ct_ext_type nat_extend __read_mostly = {
+	.len		= sizeof(struct nf_conn_nat),
+	.align		= __alignof__(struct nf_conn_nat),
+	.destroy	= nf_nat_cleanup_conntrack,
+	.move		= nf_nat_move_storage,
+	.id		= NF_CT_EXT_NAT,
+	.flags		= NF_CT_EXT_F_PREALLOC,
+};
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static const struct nf_nat_protocol *
+nf_nat_proto_find_get(u_int8_t protonum)
+{
+	const struct nf_nat_protocol *p;
+
+	rcu_read_lock();
+	p = __nf_nat_proto_find(protonum);
+	if (!try_module_get(p->me))
+		p = &nf_nat_unknown_protocol;
+	rcu_read_unlock();
+
+	return p;
+}
+
+static void
+nf_nat_proto_put(const struct nf_nat_protocol *p)
+{
+	module_put(p->me);
+}
+
+static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
+	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
+	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
+};
+
+static int nfnetlink_parse_nat_proto(struct nlattr *attr,
+				     const struct nf_conn *ct,
+				     struct nf_nat_range *range)
+{
+	struct nlattr *tb[CTA_PROTONAT_MAX+1];
+	const struct nf_nat_protocol *npt;
+	int err;
+
+	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
+	if (err < 0)
+		return err;
+
+	npt = nf_nat_proto_find_get(nf_ct_protonum(ct));
+	if (npt->nlattr_to_range)
+		err = npt->nlattr_to_range(tb, range);
+	nf_nat_proto_put(npt);
+	return err;
+}
+
+static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
+	[CTA_NAT_MINIP]		= { .type = NLA_U32 },
+	[CTA_NAT_MAXIP]		= { .type = NLA_U32 },
+};
+
+static int
+nfnetlink_parse_nat(const struct nlattr *nat,
+		    const struct nf_conn *ct, struct nf_nat_range *range)
+{
+	struct nlattr *tb[CTA_NAT_MAX+1];
+	int err;
+
+	memset(range, 0, sizeof(*range));
+
+	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[CTA_NAT_MINIP])
+		range->min_ip = nla_get_be32(tb[CTA_NAT_MINIP]);
+
+	if (!tb[CTA_NAT_MAXIP])
+		range->max_ip = range->min_ip;
+	else
+		range->max_ip = nla_get_be32(tb[CTA_NAT_MAXIP]);
+
+	if (range->min_ip)
+		range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+	if (!tb[CTA_NAT_PROTO])
+		return 0;
+
+	err = nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  const struct nlattr *attr)
+{
+	struct nf_nat_range range;
+
+	if (nfnetlink_parse_nat(attr, ct, &range) < 0)
+		return -EINVAL;
+	if (nf_nat_initialized(ct, manip))
+		return -EEXIST;
+
+	return nf_nat_setup_info(ct, &range, manip);
+}
+#else
+static int
+nfnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  const struct nlattr *attr)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+static int __net_init nf_nat_net_init(struct net *net)
+{
+	/* Leave them the same for the moment. */
+	net->ipv4.nat_htable_size = net->ct.htable_size;
+	net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
+	if (!net->ipv4.nat_bysource)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Clear NAT section of all conntracks, in case we're loaded again. */
+static int clean_nat(struct nf_conn *i, void *data)
+{
+	struct nf_conn_nat *nat = nfct_nat(i);
+
+	if (!nat)
+		return 0;
+	memset(nat, 0, sizeof(*nat));
+	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
+	return 0;
+}
+
+static void __net_exit nf_nat_net_exit(struct net *net)
+{
+	nf_ct_iterate_cleanup(net, &clean_nat, NULL);
+	synchronize_rcu();
+	nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
+}
+
+static struct pernet_operations nf_nat_net_ops = {
+	.init = nf_nat_net_init,
+	.exit = nf_nat_net_exit,
+};
+
+static int __init nf_nat_init(void)
+{
+	size_t i;
+	int ret;
+
+	need_ipv4_conntrack();
+
+	ret = nf_ct_extend_register(&nat_extend);
+	if (ret < 0) {
+		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
+		return ret;
+	}
+
+	ret = register_pernet_subsys(&nf_nat_net_ops);
+	if (ret < 0)
+		goto cleanup_extend;
+
+	/* Sew in builtin protocols. */
+	spin_lock_bh(&nf_nat_lock);
+	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
+		rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol);
+	rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
+	rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp);
+	rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
+	spin_unlock_bh(&nf_nat_lock);
+
+	/* Initialize fake conntrack so that NAT will skip it */
+	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
+
+	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
+
+	BUG_ON(nf_nat_seq_adjust_hook != NULL);
+	rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
+	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
+	rcu_assign_pointer(nfnetlink_parse_nat_setup_hook,
+			   nfnetlink_parse_nat_setup);
+	BUG_ON(nf_ct_nat_offset != NULL);
+	rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset);
+	return 0;
+
+ cleanup_extend:
+	nf_ct_extend_unregister(&nat_extend);
+	return ret;
+}
+
+static void __exit nf_nat_cleanup(void)
+{
+	unregister_pernet_subsys(&nf_nat_net_ops);
+	nf_ct_l3proto_put(l3proto);
+	nf_ct_extend_unregister(&nat_extend);
+	rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
+	rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL);
+	rcu_assign_pointer(nf_ct_nat_offset, NULL);
+	synchronize_net();
+}
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf-nat-ipv4");
+
+module_init(nf_nat_init);
+module_exit(nf_nat_cleanup);
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
new file mode 100644
index 00000000..dc73abb3
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -0,0 +1,137 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+MODULE_ALIAS("ip_nat_ftp");
+
+/* FIXME: Time out? --RR */
+
+static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
+			      char *buffer, size_t buflen,
+			      __be32 addr, u16 port)
+{
+	switch (type) {
+	case NF_CT_FTP_PORT:
+	case NF_CT_FTP_PASV:
+		return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+				((unsigned char *)&addr)[0],
+				((unsigned char *)&addr)[1],
+				((unsigned char *)&addr)[2],
+				((unsigned char *)&addr)[3],
+				port >> 8,
+				port & 0xFF);
+	case NF_CT_FTP_EPRT:
+		return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
+	case NF_CT_FTP_EPSV:
+		return snprintf(buffer, buflen, "|||%u|", port);
+	}
+
+	return 0;
+}
+
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version. */
+static unsigned int nf_nat_ftp(struct sk_buff *skb,
+			       enum ip_conntrack_info ctinfo,
+			       enum nf_ct_ftp_type type,
+			       unsigned int matchoff,
+			       unsigned int matchlen,
+			       struct nf_conntrack_expect *exp)
+{
+	__be32 newip;
+	u_int16_t port;
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conn *ct = exp->master;
+	char buffer[sizeof("|1|255.255.255.255|65535|")];
+	unsigned int buflen;
+
+	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+	/* Connection will come from wherever this packet goes, hence !dir */
+	newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = !dir;
+
+	/* When you see the packet, we need to NAT it the same as
+	 * this one. */
+	exp->expectfn = nf_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0)
+		return NF_DROP;
+
+	buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
+	if (!buflen)
+		goto out;
+
+	pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
+				      matchlen, buffer, buflen))
+		goto out;
+
+	return NF_ACCEPT;
+
+out:
+	nf_ct_unexpect_related(exp);
+	return NF_DROP;
+}
+
+static void __exit nf_nat_ftp_fini(void)
+{
+	rcu_assign_pointer(nf_nat_ftp_hook, NULL);
+	synchronize_rcu();
+}
+
+static int __init nf_nat_ftp_init(void)
+{
+	BUG_ON(nf_nat_ftp_hook != NULL);
+	rcu_assign_pointer(nf_nat_ftp_hook, nf_nat_ftp);
+	return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+	printk(KERN_INFO KBUILD_MODNAME
+	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_ftp_init);
+module_exit(nf_nat_ftp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
new file mode 100644
index 00000000..790f3160
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -0,0 +1,618 @@
+/*
+ * H.323 extension for NAT alteration.
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 NAT module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/****************************************************************************/
+/*
+ * Overwrite the 6-byte IPv4 address + port at offset dataoff + addroff of
+ * the TCP or UDP payload with ip/port and repoint *data at the packet data.
+ * Returns 0 on success, -1 if mangling or re-reading the TCP header fails.
+ */
+static int set_addr(struct sk_buff *skb, unsigned char **data, int dataoff,
+		    unsigned int addroff, __be32 ip, __be16 port)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct {
+		__be32 ip;
+		__be16 port;
+	} __attribute__ ((__packed__)) buf;
+	const unsigned int off = addroff + dataoff;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+
+	buf.ip = ip;
+	buf.port = port;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_TCP) {
+		/* Anything but TCP is mangled as UDP */
+		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, off, sizeof(buf),
+					      (char *) &buf, sizeof(buf))) {
+			if (net_ratelimit())
+				pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
+			return -1;
+		}
+		/* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
+		 * or pull everything in a linear buffer, so we can safely
+		 * use the skb pointers now */
+		*data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+		return 0;
+	}
+
+	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, off, sizeof(buf),
+				      (char *) &buf, sizeof(buf))) {
+		if (net_ratelimit())
+			pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
+		return -1;
+	}
+
+	/* Relocate data pointer */
+	th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return -1;
+	*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
+	return 0;
+}
+
+/****************************************************************************/
+/* Rewrite the IPv4 address/port of an H.225 TransportAddress via set_addr() */
+static int set_h225_addr(struct sk_buff *skb, unsigned char **data, int dataoff,
+			 TransportAddress *taddr, union nf_inet_addr *addr,
+			 __be16 port)
+{
+	/* ipAddress.ip is handed to set_addr() as the address offset */
+	return set_addr(skb, data, dataoff, taddr->ipAddress.ip, addr->ip, port);
+}
+
+/****************************************************************************/
+/* Rewrite the unicast IPv4 address/port of an H.245 address via set_addr() */
+static int set_h245_addr(struct sk_buff *skb, unsigned char **data, int dataoff,
+			 H245_TransportAddress *taddr,
+			 union nf_inet_addr *addr, __be16 port)
+{
+	const unsigned int addroff = taddr->unicastAddress.iPAddress.network;
+
+	return set_addr(skb, data, dataoff, addroff, addr->ip, port);
+}
+
+/****************************************************************************/
+/*
+ * Rewrite the first call-signalling address in taddr[0..count) whose IP is
+ * the source or destination of this direction's tuple and whose port is the
+ * recorded signal port.  Returns set_h225_addr()'s result, or 0 if none.
+ */
+static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo, unsigned char **data,
+			TransportAddress *taddr, int count)
+{
+	const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr addr, addr0, *newaddr;
+	__be16 port, port0;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (!get_h225_addr(ct, *data, &taddr[i], &addr, &port) ||
+		    port != info->sig_port[dir])
+			continue;
+
+		if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip) {
+			/* GW->GK */
+			newaddr = &ct->tuplehash[!dir].tuple.dst.u3;
+			/* Fix for Gnomemeeting: if the first entry is a
+			 * loopback (127/8) address, rewrite that one instead */
+			if (i > 0 &&
+			    get_h225_addr(ct, *data, &taddr[0], &addr0, &port0) &&
+			    (ntohl(addr0.ip) & 0xff000000) == 0x7f000000) {
+				addr = addr0;
+				port = port0;
+				i = 0;
+			}
+		} else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip) {
+			/* GK->GW */
+			newaddr = &ct->tuplehash[!dir].tuple.src.u3;
+		} else {
+			continue;
+		}
+
+		/* Ports are __be16; convert for printing like set_ras_addr() */
+		pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
+			 &addr.ip, ntohs(port), &newaddr->ip,
+			 ntohs(info->sig_port[!dir]));
+		return set_h225_addr(skb, data, 0, &taddr[i], newaddr,
+				     info->sig_port[!dir]);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/* Replace the first RAS address in taddr[0..count) that equals this direction's
+ * source IP/UDP port with the opposite direction's destination IP/port. */
+static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo, unsigned char **data,
+			TransportAddress *taddr, int count)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conntrack_tuple *here = &ct->tuplehash[dir].tuple;
+	struct nf_conntrack_tuple *peer = &ct->tuplehash[!dir].tuple;
+	union nf_inet_addr addr;
+	__be16 port;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (!get_h225_addr(ct, *data, &taddr[i], &addr, &port) ||
+		    addr.ip != here->src.u3.ip || port != here->src.u.udp.port)
+			continue;
+
+		pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
+			 &addr.ip, ntohs(port), &peer->dst.u3.ip,
+			 ntohs(peer->dst.u.udp.port));
+		return set_h225_addr(skb, data, 0, &taddr[i], &peer->dst.u3,
+				     peer->dst.u.udp.port);
+	}
+
+	return 0;	/* no matching address: nothing to rewrite */
+}
+
+/****************************************************************************/
+/*
+ * Register NAT expectations for an RTP/RTCP port pair and rewrite the H.245
+ * address in the payload: to the NATed RTCP port (RTP + 1) if port is odd,
+ * else to the NATed RTP port.  Returns -1 if the rewrite fails, otherwise 0
+ * (also when no channel slot or port pair is free and nothing is rewritten).
+ */
+static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo, unsigned char **data,
+			int dataoff, H245_TransportAddress *taddr,
+			__be16 port, __be16 rtp_port,
+			struct nf_conntrack_expect *rtp_exp,
+			struct nf_conntrack_expect *rtcp_exp)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int i;
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+	rtp_exp->expectfn = nf_nat_follow_master;
+	rtp_exp->dir = !dir;
+	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+	rtcp_exp->expectfn = nf_nat_follow_master;
+	rtcp_exp->dir = !dir;
+
+	/* Lookup existing expects */
+	for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
+		if (info->rtp_port[i][dir] == rtp_port) {
+			/* Expected: reuse the allocated ports, refreshing the expects */
+			rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir];
+			rtcp_exp->tuple.dst.u.udp.port =
+			    htons(ntohs(info->rtp_port[i][dir]) + 1);
+			break;
+		} else if (info->rtp_port[i][dir] == 0) {
+			/* Not expected */
+			break;
+		}
+	}
+
+	/* Run out of expectations */
+	if (i >= H323_RTP_CHANNEL_MAX) {
+		if (net_ratelimit())
+			pr_notice("nf_nat_h323: out of expectations\n");
+		return 0;
+	}
+
+	/* Try to get a pair of ports. */
+	for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+	     nated_port != 0; nated_port += 2) {
+		int ret;
+
+		rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == 0) {
+			rtcp_exp->tuple.dst.u.udp.port = htons(nated_port + 1);
+			ret = nf_ct_expect_related(rtcp_exp);
+			if (ret == 0)
+				break;
+			/* No RTCP port: drop the RTP expectation before its
+			 * tuple is changed and it is registered again */
+			nf_ct_unexpect_related(rtp_exp);
+		}
+		if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
+		}
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			pr_notice("nf_nat_h323: out of RTP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h245_addr(skb, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons((port & htons(1)) ? nated_port + 1 :
+						    nated_port)) != 0) {
+		nf_ct_unexpect_related(rtp_exp);
+		nf_ct_unexpect_related(rtcp_exp);
+		return -1;
+	}
+
+	/* Save ports */
+	info->rtp_port[i][dir] = rtp_port;
+	info->rtp_port[i][!dir] = htons(nated_port);
+
+	/* Success */
+	pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
+		 &rtp_exp->tuple.src.u3.ip,
+		 ntohs(rtp_exp->tuple.src.u.udp.port),
+		 &rtp_exp->tuple.dst.u3.ip,
+		 ntohs(rtp_exp->tuple.dst.u.udp.port));
+	pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
+		 &rtcp_exp->tuple.src.u3.ip,
+		 ntohs(rtcp_exp->tuple.src.u.udp.port),
+		 &rtcp_exp->tuple.dst.u3.ip,
+		 ntohs(rtcp_exp->tuple.dst.u.udp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Expect the T.120 TCP connection announced in an H.245 address and rewrite
+ * that address to the NATed one.  Returns -1 if the rewrite fails, else 0
+ * (including when no port could be reserved and nothing is rewritten).
+ */
+static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo, unsigned char **data,
+		    int dataoff, H245_TransportAddress *taddr, __be16 port,
+		    struct nf_conntrack_expect *exp)
+{
+	const int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = ntohs(port);
+
+	/* Set expectations for NAT */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = nf_nat_follow_master;
+	exp->dir = !dir;
+
+	/* Try to get same port: if not, try to change it. */
+	while (nated_port != 0) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		/* -EBUSY: try the next port (u16 wraps to 0); else give up */
+		nated_port = (ret == -EBUSY) ? nated_port + 1 : 0;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			pr_notice("nf_nat_h323: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h245_addr(skb, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port)) < 0) {
+		nf_ct_unexpect_related(exp);
+		return -1;
+	}
+
+	pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip, ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip, ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Expect the H.245 TCP connection announced in an H.225 TransportAddress,
+ * rewrite that address to the NATed one and remember the port mapping.
+ * Returns -1 if the rewrite fails, else 0 (also when no port is free).
+ */
+static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo, unsigned char **data,
+		    int dataoff, TransportAddress *taddr, __be16 port,
+		    struct nf_conntrack_expect *exp)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	const int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = nf_nat_follow_master;
+	exp->dir = !dir;
+
+	/* Check existing expects */
+	nated_port = ntohs(info->sig_port[dir] == port ?
+			   info->sig_port[!dir] : port);
+
+	/* Try to get same port: if not, try to change it. */
+	while (nated_port != 0) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		/* -EBUSY: try the next port (u16 wraps to 0); else give up */
+		nated_port = (ret == -EBUSY) ? nated_port + 1 : 0;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			pr_notice("nf_nat_q931: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h225_addr(skb, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port)) != 0) {
+		nf_ct_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Save ports */
+	info->sig_port[dir] = port;
+	info->sig_port[!dir] = htons(nated_port);
+
+	pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip, ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip, ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************
+ * expectfn replacing nf_conntrack_q931_expect() (set by nf_conntrack_h323.c):
+ * sets up SRC and DST NAT for a connection matching a Q.931 expectation.
+ ****************************************************************************/
+static void ip_nat_q931_expect(struct nf_conn *new,
+			       struct nf_conntrack_expect *this)
+{
+	struct nf_nat_range src_range, dst_range;
+
+	if (this->tuple.src.u3.ip != 0) {	/* Only accept calls from GK */
+		nf_nat_follow_master(new, this);
+		return;
+	}
+
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);	/* must be a fresh one */
+
+	/* Change src to where master sends to */
+	src_range.flags = IP_NAT_RANGE_MAP_IPS;
+	src_range.min_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
+	src_range.max_ip = src_range.min_ip;
+	nf_nat_setup_info(new, &src_range, IP_NAT_MANIP_SRC);
+
+	/* For DST manip, map port here to where it's expected. */
+	dst_range.flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED;
+	dst_range.min = dst_range.max = this->saved_proto;
+	dst_range.min_ip = new->master->tuplehash[!this->dir].tuple.src.u3.ip;
+	dst_range.max_ip = dst_range.min_ip;
+	nf_nat_setup_info(new, &dst_range, IP_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+/*
+ * Expect the Q.931 TCP connection announced in taddr[idx], rewrite that
+ * address to the NATed one and remember the port mapping; a loopback
+ * address in taddr[0] is rewritten as well.  Returns -1 if the rewrite of
+ * taddr[idx] fails, else 0 (also when no port is free).
+ */
+static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo,
+		    unsigned char **data, TransportAddress *taddr, int idx,
+		    __be16 port, struct nf_conntrack_expect *exp)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	const int dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr *nat_addr = &ct->tuplehash[!dir].tuple.dst.u3;
+	union nf_inet_addr addr;
+	u_int16_t nated_port;
+
+	/* Set expectations for NAT */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_q931_expect;
+	exp->dir = !dir;
+
+	/* Check existing expects */
+	nated_port = ntohs(info->sig_port[dir] == port ?
+			   info->sig_port[!dir] : port);
+
+	/* Try to get same port: if not, try to change it. */
+	while (nated_port != 0) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		/* -EBUSY: try the next port (u16 wraps to 0); else give up */
+		nated_port = (ret == -EBUSY) ? nated_port + 1 : 0;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			pr_notice("nf_nat_ras: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal */
+	if (set_h225_addr(skb, data, 0, &taddr[idx], nat_addr,
+			  htons(nated_port)) != 0) {
+		nf_ct_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Save ports */
+	info->sig_port[dir] = port;
+	info->sig_port[!dir] = htons(nated_port);
+
+	/* Fix for Gnomemeeting */
+	if (idx > 0 &&
+	    get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
+	    (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
+		set_h225_addr(skb, data, 0, &taddr[0], nat_addr,
+			      info->sig_port[!dir]);
+
+	/* Success */
+	pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip, ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip, ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+/* expectfn for call forwarding: SRC NAT, then DST NAT to the saved IP/port */
+static void ip_nat_callforwarding_expect(struct nf_conn *new,
+					 struct nf_conntrack_expect *this)
+{
+	struct nf_nat_range src_range, dst_range;
+
+	BUG_ON(new->status & IPS_NAT_DONE_MASK);	/* must be a fresh one */
+	/* Change src to where master sends to */
+	src_range.flags = IP_NAT_RANGE_MAP_IPS;
+	src_range.min_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
+	src_range.max_ip = src_range.min_ip;
+	nf_nat_setup_info(new, &src_range, IP_NAT_MANIP_SRC);
+
+	/* For DST manip, map port here to where it's expected. */
+	dst_range.flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED;
+	dst_range.min = dst_range.max = this->saved_proto;
+	dst_range.min_ip = dst_range.max_ip = this->saved_ip;
+	nf_nat_setup_info(new, &dst_range, IP_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+/* Expect a forwarded call: the expectation's destination IP becomes the NATed
+ * address (the original IP and port are saved for the expectfn) and the
+ * address in the payload is rewritten.  Returns -1 if the rewrite fails,
+ * else 0 (including when no port is free and nothing is rewritten). */
+static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      unsigned char **data, int dataoff,
+			      TransportAddress *taddr, __be16 port,
+			      struct nf_conntrack_expect *exp)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	u_int16_t nated_port = ntohs(port);
+
+	/* Set expectations for NAT */
+	exp->saved_ip = exp->tuple.dst.u3.ip;
+	exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->expectfn = ip_nat_callforwarding_expect;
+	exp->dir = !dir;
+
+	/* Try to get same port: if not, try to change it. */
+	while (nated_port != 0) {
+		int ret;
+
+		exp->tuple.dst.u.tcp.port = htons(nated_port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		/* -EBUSY: try the next port (u16 wraps to 0); else give up */
+		nated_port = (ret == -EBUSY) ? nated_port + 1 : 0;
+	}
+
+	if (nated_port == 0) {	/* No port available */
+		if (net_ratelimit())
+			pr_notice("nf_nat_q931: out of TCP ports\n");
+		return 0;
+	}
+
+	/* Modify signal; any non-zero result means the rewrite failed */
+	if (set_h225_addr(skb, data, dataoff, taddr,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  htons(nated_port)) != 0) {
+		nf_ct_unexpect_related(exp);
+		return -1;
+	}
+
+	/* Success */
+	pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
+		 &exp->tuple.src.u3.ip, ntohs(exp->tuple.src.u.tcp.port),
+		 &exp->tuple.dst.u3.ip, ntohs(exp->tuple.dst.u.tcp.port));
+
+	return 0;
+}
+
+/****************************************************************************/
+static int __init init(void)
+{
+	BUG_ON(set_h245_addr_hook != NULL);	/* no hook may be set already */
+	BUG_ON(set_h225_addr_hook != NULL);
+	BUG_ON(set_sig_addr_hook != NULL);
+	BUG_ON(set_ras_addr_hook != NULL);
+	BUG_ON(nat_rtp_rtcp_hook != NULL);
+	BUG_ON(nat_t120_hook != NULL);
+	BUG_ON(nat_h245_hook != NULL);
+	BUG_ON(nat_callforwarding_hook != NULL);
+	BUG_ON(nat_q931_hook != NULL);
+
+	rcu_assign_pointer(set_h245_addr_hook, set_h245_addr);	/* publish handlers */
+	rcu_assign_pointer(set_h225_addr_hook, set_h225_addr);
+	rcu_assign_pointer(set_sig_addr_hook, set_sig_addr);
+	rcu_assign_pointer(set_ras_addr_hook, set_ras_addr);
+	rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp);
+	rcu_assign_pointer(nat_t120_hook, nat_t120);
+	rcu_assign_pointer(nat_h245_hook, nat_h245);
+	rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
+	rcu_assign_pointer(nat_q931_hook, nat_q931);
+	return 0;	/* always succeeds */
+}
+
+/****************************************************************************/
+static void __exit fini(void)
+{
+	rcu_assign_pointer(set_h245_addr_hook, NULL);	/* unpublish all hooks */
+	rcu_assign_pointer(set_h225_addr_hook, NULL);
+	rcu_assign_pointer(set_sig_addr_hook, NULL);
+	rcu_assign_pointer(set_ras_addr_hook, NULL);
+	rcu_assign_pointer(nat_rtp_rtcp_hook, NULL);
+	rcu_assign_pointer(nat_t120_hook, NULL);
+	rcu_assign_pointer(nat_h245_hook, NULL);
+	rcu_assign_pointer(nat_callforwarding_hook, NULL);
+	rcu_assign_pointer(nat_q931_hook, NULL);
+	synchronize_rcu();	/* wait for an RCU grace period before returning */
+}
+
+/****************************************************************************/
+module_init(init);	/* run init() on module load */
+module_exit(fini);	/* run fini() on module unload */
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_h323");	/* module can also be requested as ip_nat_h323 */
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
new file mode 100644
index 00000000..ebc5f889
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -0,0 +1,451 @@
+/* ip_nat_helper.c - generic support functions for NAT helpers
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/kmod.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+
+#define DUMP_OFFSET(x)	/* pr_debug the three fields of a struct nf_nat_seq */ \
+	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
+		 (x)->offset_before, (x)->offset_after, (x)->correction_pos)
+
+static DEFINE_SPINLOCK(nf_nat_seqofs_lock);	/* guards the nf_nat_seq offsets */
+
+/* Setup TCP sequence correction given this change at this sequence: seq
+ * becomes the correction position and sizediff is added to the offset used
+ * after it, unless a correction at or after seq is already recorded. */
+static inline void
+adjust_tcp_sequence(u32 seq, int sizediff, struct nf_conn *ct,
+		    enum ip_conntrack_info ctinfo)
+{
+	struct nf_nat_seq *this_way = &nfct_nat(ct)->seq[CTINFO2DIR(ctinfo)];
+	bool unset;
+
+	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+		 seq, sizediff);
+
+	pr_debug("adjust_tcp_sequence: Seq_offset before: ");
+	DUMP_OFFSET(this_way);
+
+	spin_lock_bh(&nf_nat_seqofs_lock);
+
+	/* SYN adjust. If it's uninitialized, or this is after last
+	 * correction, record it: we don't handle more than one
+	 * adjustment in the window, but do deal with common case of a
+	 * retransmit */
+	unset = this_way->offset_before == this_way->offset_after;
+	if (unset || before(this_way->correction_pos, seq)) {
+		this_way->offset_before = this_way->offset_after;
+		this_way->offset_after += sizediff;
+		this_way->correction_pos = seq;
+	}
+
+	spin_unlock_bh(&nf_nat_seqofs_lock);
+
+	pr_debug("adjust_tcp_sequence: Seq_offset after: ");
+	DUMP_OFFSET(this_way);
+}
+
+/* Get the offset value, for conntrack (0 when ct has no NAT extension) */
+s16 nf_nat_get_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir,
+		      u32 seq)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	const struct nf_nat_seq *adj;
+	s16 offset;
+
+	if (nat == NULL)
+		return 0;
+
+	adj = &nat->seq[dir];
+	spin_lock_bh(&nf_nat_seqofs_lock);
+	if (after(seq, adj->correction_pos))
+		offset = adj->offset_after;
+	else
+		offset = adj->offset_before;
+	spin_unlock_bh(&nf_nat_seqofs_lock);
+	return offset;
+}
+EXPORT_SYMBOL_GPL(nf_nat_get_offset);
+
+/* Frobs data inside this packet, which is linear: the match_len bytes found
+ * match_offset bytes past dataoff (counted from the network header) are
+ * replaced by rep_len bytes of rep_buffer and the rest of the packet is
+ * moved accordingly; skb length, IP total length and IP header checksum
+ * are then updated. */
+static void mangle_contents(struct sk_buff *skb, unsigned int dataoff,
+			    unsigned int match_offset, unsigned int match_len,
+			    const char *rep_buffer, unsigned int rep_len)
+{
+	unsigned char *match;
+
+	BUG_ON(skb_is_nonlinear(skb));
+	match = skb_network_header(skb) + dataoff + match_offset;
+
+	/* move post-replacement */
+	memmove(match + rep_len, match + match_len,
+		skb->tail - (skb->network_header + dataoff +
+			     match_offset + match_len));
+
+	/* insert data from buffer */
+	memcpy(match, rep_buffer, rep_len);
+
+	/* update skb info */
+	if (rep_len > match_len) {
+		pr_debug("nf_nat_mangle_packet: Extending packet by %u from %u bytes\n",
+			 rep_len - match_len, skb->len);
+		skb_put(skb, rep_len - match_len);
+	} else {
+		pr_debug("nf_nat_mangle_packet: Shrinking packet from %u from %u bytes\n",
+			 match_len - rep_len, skb->len);
+		__skb_trim(skb, skb->len + rep_len - match_len);
+	}
+
+	/* fix IP hdr checksum information */
+	ip_hdr(skb)->tot_len = htons(skb->len);
+	ip_send_check(ip_hdr(skb));
+}
+
+/* Unusual, but possible case: expand the skb head for extra more bytes.
+ * Returns 1 on success, 0 if the packet would exceed 65535 bytes or
+ * pskb_expand_head() fails. */
+static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
+{
+	if (skb->len + extra > 65535 ||
+	    pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC))
+		return 0;
+
+	return 1;
+}
+
+void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			   __be32 seq, s16 off)
+{
+	if (off != 0) {	/* record a shift of off bytes at seq; 0 is a no-op */
+		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+		adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
+		nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+
+/* Fix the TCP/UDP checksum at check after the L4 length changed from oldlen to
+ * datalen; data points at the L4 header that contains check. */
+static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
+			int datalen, __sum16 *check, int oldlen)
+{
+	struct rtable *rt = skb_rtable(skb);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace2(check, skb,
+					 htons(oldlen), htons(datalen), 1);
+	} else if (!(rt->rt_flags & RTCF_LOCAL) &&
+		   (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+				  iph->ihl * 4;
+		skb->csum_offset = (void *)check - data;
+		*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+					    datalen, iph->protocol, 0);
+	} else {
+		/* Full recompute; the checksum field is zeroed before summing */
+		*check = 0;
+		*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+					   datalen, iph->protocol,
+					   csum_partial(data, datalen, 0));
+		if (iph->protocol == IPPROTO_UDP && !*check)
+			*check = CSUM_MANGLED_0;
+	}
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).
+ *
+ * Replaces match_len payload bytes at match_offset by rep_buffer, enlarging
+ * the skb if needed, fixes the checksum and, if adjust is set and the length
+ * changed, records the sequence shift.  Returns 1 on success, 0 on failure. */
+int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct nf_conn *ct,
+			       enum ip_conntrack_info ctinfo,
+			       unsigned int match_offset,
+			       unsigned int match_len, const char *rep_buffer,
+			       unsigned int rep_len, bool adjust)
+{
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	unsigned int iphlen;
+	int oldlen, datalen;
+
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	/* Enlarge only when the replacement needs more than the tailroom */
+	if (rep_len > match_len && rep_len - match_len > skb_tailroom(skb)) {
+		if (!enlarge_skb(skb, rep_len - match_len))
+			return 0;
+	}
+
+	SKB_LINEAR_ASSERT(skb);
+
+	iph = ip_hdr(skb);
+	iphlen = iph->ihl * 4;
+	tcph = (void *)iph + iphlen;
+
+	oldlen = skb->len - iphlen;
+	mangle_contents(skb, iphlen + tcph->doff*4,
+			match_offset, match_len, rep_buffer, rep_len);
+
+	datalen = skb->len - iphlen;
+	nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
+
+	if (adjust && rep_len != match_len)
+		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
+				      (int)rep_len - (int)match_len);
+
+	return 1;
+}
+EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol)
+ *
+ * Takes care of checksumming, the UDP length field and skb enlargement;
+ * returns 1 on success, 0 if the match lies outside the packet or the skb
+ * cannot be made writable or enlarged.
+ *
+ * XXX - This function could be merged with nf_nat_mangle_tcp_packet which
+ *       should be fairly easy to do.
+ */
+int
+nf_nat_mangle_udp_packet(struct sk_buff *skb,
+			 struct nf_conn *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int match_offset,
+			 unsigned int match_len,
+			 const char *rep_buffer,
+			 unsigned int rep_len)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct udphdr *udph;
+	unsigned int iphlen;
+	int datalen, oldlen;
+
+	/* UDP helpers might accidentally mangle the wrong packet */
+	if (skb->len < iph->ihl*4 + sizeof(*udph) + match_offset + match_len)
+		return 0;
+
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	if (rep_len > match_len && rep_len - match_len > skb_tailroom(skb)) {
+		if (!enlarge_skb(skb, rep_len - match_len))
+			return 0;
+	}
+
+	/* The skb data may have been reallocated above: fetch the header again */
+	iph = ip_hdr(skb);
+	iphlen = iph->ihl * 4;
+	udph = (void *)iph + iphlen;
+
+	oldlen = skb->len - iphlen;
+	mangle_contents(skb, iphlen + sizeof(*udph),
+			match_offset, match_len, rep_buffer, rep_len);
+
+	/* update the length of the UDP packet */
+	datalen = skb->len - iphlen;
+	udph->len = htons(datalen);
+
+	/* fix udp checksum if udp checksum was previously calculated */
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL)
+		nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
+
+	return 1;
+}
+EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
+
+/* Remove the NAT sequence offset from one SACK edge: offset_after applies
+ * if the un-offset sequence number lies after the correction position,
+ * offset_before otherwise. */
+static __be32 sack_undo_offset(__be32 seq, const struct nf_nat_seq *natseq)
+{
+	u32 host_seq = ntohl(seq);
+
+	if (after(host_seq - natseq->offset_before, natseq->correction_pos))
+		return htonl(host_seq - natseq->offset_after);
+	return htonl(host_seq - natseq->offset_before);
+}
+
+/* Adjust one found SACK option including checksum correction: every SACK
+ * block between offsets sackoff and sackend of skb->data is rewritten. */
+static void
+sack_adjust(struct sk_buff *skb,
+	    struct tcphdr *tcph,
+	    unsigned int sackoff,
+	    unsigned int sackend,
+	    struct nf_nat_seq *natseq)
+{
+	while (sackoff < sackend) {
+		struct tcp_sack_block_wire *sack = (void *)skb->data + sackoff;
+		__be32 new_start_seq, new_end_seq;
+
+		new_start_seq = sack_undo_offset(sack->start_seq, natseq);
+		new_end_seq = sack_undo_offset(sack->end_seq, natseq);
+
+		/* All four values are unsigned and printed in host order */
+		pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n",
+			 ntohl(sack->start_seq), ntohl(new_start_seq),
+			 ntohl(sack->end_seq), ntohl(new_end_seq));
+
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->start_seq, new_start_seq, 0);
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->end_seq, new_end_seq, 0);
+		sack->start_seq = new_start_seq;
+		sack->end_seq = new_end_seq;
+		sackoff += sizeof(*sack);
+	}
+}
+
+/* TCP SACK sequence number adjustment: walks the TCP options and corrects each
+ * well-formed SACK option with the opposite direction's offsets.  Returns 1 on
+ * success, 0 if the options are malformed or cannot be made writable. */
+static inline unsigned int
+nf_nat_sack_adjust(struct sk_buff *skb, struct tcphdr *tcph,
+		   struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	unsigned int dir, optoff, optend;
+	struct nf_conn_nat *nat = nfct_nat(ct);
+
+	optoff = ip_hdrlen(skb) + sizeof(struct tcphdr);
+	optend = ip_hdrlen(skb) + tcph->doff * 4;
+
+	if (!skb_make_writable(skb, optend))
+		return 0;
+
+	/* skb_make_writable() may have moved the data: reload the header */
+	tcph = (void *)skb->data + ip_hdrlen(skb);
+	dir = CTINFO2DIR(ctinfo);
+
+	while (optoff < optend) {
+		/* Usually: option, length. */
+		unsigned char *op = skb->data + optoff;
+
+		if (op[0] == TCPOPT_EOL)
+			return 1;
+		if (op[0] == TCPOPT_NOP) {
+			optoff++;
+			continue;
+		}
+
+		/* no partial options */
+		if (optoff + 1 == optend || optoff + op[1] > optend ||
+		    op[1] < 2)
+			return 0;
+		if (op[0] == TCPOPT_SACK &&
+		    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
+		    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+			sack_adjust(skb, tcph, optoff+2,
+				    optoff+op[1], &nat->seq[!dir]);
+		optoff += op[1];
+	}
+	return 1;
+}
+
+/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure.
+ * seq is shifted by this direction's offset, ack_seq (and SACK blocks) are
+ * shifted back by the other direction's offset; the checksum is updated. */
+int
+nf_nat_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
+		  enum ip_conntrack_info ctinfo)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_nat_seq *this_way = &nat->seq[dir];
+	struct nf_nat_seq *other_way = &nat->seq[!dir];
+	struct tcphdr *tcph;
+	__be32 newseq, newack;
+	s16 seqoff, ackoff;
+	int res;
+
+	if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+		return 0;
+
+	tcph = (void *)skb->data + ip_hdrlen(skb);
+
+	/* adjust_tcp_sequence() changes the offsets under this lock; take it
+	 * so that position and offsets are read as one consistent set */
+	spin_lock_bh(&nf_nat_seqofs_lock);
+	seqoff = after(ntohl(tcph->seq), this_way->correction_pos) ?
+		 this_way->offset_after : this_way->offset_before;
+	ackoff = after(ntohl(tcph->ack_seq) - other_way->offset_before,
+		       other_way->correction_pos) ?
+		 other_way->offset_after : other_way->offset_before;
+
+	newseq = htonl(ntohl(tcph->seq) + seqoff);
+	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
+
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+
+	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+		 ntohl(newack));
+
+	tcph->seq = newseq;
+	tcph->ack_seq = newack;
+
+	res = nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
+	spin_unlock_bh(&nf_nat_seqofs_lock);
+
+	return res;
+}
+
+/* Setup NAT on this expected conntrack so it follows master (used as expectfn).
+ * If we fail to get a free NAT slot, we'll get dropped on confirm */
+void nf_nat_follow_master(struct nf_conn *ct,
+			  struct nf_conntrack_expect *exp)
+{
+	const struct nf_conntrack_tuple *master_tuple;
+	struct nf_nat_range src_range, dst_range;
+
+	/* This must be a fresh one. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+	master_tuple = &ct->master->tuplehash[!exp->dir].tuple;
+
+	/* Change src to where master sends to */
+	src_range.flags = IP_NAT_RANGE_MAP_IPS;
+	src_range.min_ip = src_range.max_ip = master_tuple->dst.u3.ip;
+	nf_nat_setup_info(ct, &src_range, IP_NAT_MANIP_SRC);
+
+	/* For DST manip, map port here to where it's expected. */
+	dst_range.flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED;
+	dst_range.min = dst_range.max = exp->saved_proto;
+	dst_range.min_ip = dst_range.max_ip = master_tuple->src.u3.ip;
+	nf_nat_setup_info(ct, &dst_range, IP_NAT_MANIP_DST);
+}
+EXPORT_SYMBOL(nf_nat_follow_master);
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
new file mode 100644
index 00000000..535e1a80
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -0,0 +1,99 @@
+/* IRC extension for TCP NAT alteration.
+ *
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_irc");
+
+/*
+ * NAT hook for the IRC conntrack helper (installed as nf_nat_irc_hook).
+ *
+ * Registers the expectation @exp for the related connection, picking a
+ * free port starting from the one originally advertised, and then
+ * replaces the "<ip> <port>" text found at @matchoff/@matchlen in the TCP
+ * payload of @skb with the decimal address/port of this NAT box as seen
+ * in the master's reply tuple.
+ *
+ * Returns NF_DROP when no port could be reserved, otherwise the result
+ * of nf_nat_mangle_tcp_packet().  If mangling does not return NF_ACCEPT
+ * the expectation is removed again.
+ */
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct nf_conntrack_expect *exp)
+{
+	/* Large enough for the biggest 32-bit address and 16-bit port. */
+	char buffer[sizeof("4294967295 65535")];
+	__be32 newip;
+	u_int16_t port;
+	unsigned int ret;
+
+	/* Reply comes from server. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_REPLY;
+	exp->expectfn = nf_nat_follow_master;
+
+	/*
+	 * Try to get same port: if not, try to change it.  The 16-bit
+	 * counter wraps to 0 after 65535, which terminates the search.
+	 */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int err;
+
+		exp->tuple.dst.u.tcp.port = htons(port);
+		err = nf_ct_expect_related(exp);
+		if (err == 0)
+			break;
+		else if (err != -EBUSY) {
+			/* Any error other than "busy" is final. */
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0)
+		return NF_DROP;
+
+	/*
+	 * Keep the address in network byte order: %pI4 expects that,
+	 * while the text inserted into the packet is the host-order
+	 * decimal value.
+	 */
+	newip = exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip;
+	snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newip), port);
+	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
+		 buffer, &newip, port);
+
+	ret = nf_nat_mangle_tcp_packet(skb, exp->master, ctinfo,
+				       matchoff, matchlen, buffer,
+				       strlen(buffer));
+	if (ret != NF_ACCEPT)
+		nf_ct_unexpect_related(exp);
+	return ret;
+}
+
+/*
+ * Module exit: unpublish the NAT hook and wait for an RCU grace period
+ * (presumably so that callers still running help() finish first).
+ */
+static void __exit nf_nat_irc_fini(void)
+{
+	rcu_assign_pointer(nf_nat_irc_hook, NULL);
+	synchronize_rcu();
+}
+
+/*
+ * Module init: publish help() as the IRC NAT hook.  It is a bug if a
+ * hook is already installed.  Always returns 0.
+ */
+static int __init nf_nat_irc_init(void)
+{
+	BUG_ON(nf_nat_irc_hook != NULL);
+	rcu_assign_pointer(nf_nat_irc_hook, help);
+	return 0;
+}
+
+/*
+ * Prior to 2.6.11, we had a ports param.  No longer, but don't break users.
+ *
+ * Setter for the obsolete 'ports' parameter: the value is ignored, a
+ * notice is logged and 0 is returned so that setting it still succeeds.
+ */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+	printk(KERN_INFO KBUILD_MODNAME
+	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	return 0;
+}
+/* 'ports' has a setter only: no getter, no backing storage, mode 0. */
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(nf_nat_irc_init);
+module_exit(nf_nat_irc_fini);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
new file mode 100644
index 00000000..4c060038
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -0,0 +1,308 @@
+/*
+ * nf_nat_pptp.c
+ *
+ * NAT support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft.  PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702.  Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * TODO: - NAT to a unique tuple, not to TCP source port
+ * 	   (needs netfilter tuple reservation)
+ */
+
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_NAT_PPTP_VERSION "3.0"
+
+/*
+ * Access the __be16 located @off bytes from the start of @req.  Used
+ * below to read the call ID field of a PPTP control message.
+ */
+#define REQ_CID(req, off)		(*(__be16 *)((char *)(req) + (off)))
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
+MODULE_ALIAS("ip_nat_pptp");
+
+/*
+ * Expectation callback for an expected GRE connection @ct of a PPTP
+ * session.
+ *
+ * First looks up the expectation for the opposite direction and removes
+ * it if it exists, then sets up source and destination NAT on @ct so that
+ * it follows the master connection.  The protocol part (@exp->saved_proto)
+ * is applied to the SRC manip for original-direction expectations and to
+ * the DST manip for reply-direction ones.
+ */
+static void pptp_nat_expected(struct nf_conn *ct,
+			      struct nf_conntrack_expect *exp)
+{
+	struct net *net = nf_ct_net(ct);
+	const struct nf_conn *master = ct->master;
+	const struct nf_conntrack_tuple *rev;
+	const struct nf_ct_pptp_master *ct_pptp_info;
+	const struct nf_nat_pptp *nat_pptp_info;
+	struct nf_conntrack_expect *other_exp;
+	struct nf_conntrack_tuple t;
+	struct nf_nat_range range;
+
+	ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
+	nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
+
+	/* Master's tuple for the direction opposite to the expectation. */
+	rev = &master->tuplehash[!exp->dir].tuple;
+
+	/* Addresses and protocol of the other-direction tuple are the same
+	 * in both cases; only the GRE keys differ. */
+	t.src.l3num = AF_INET;
+	t.src.u3.ip = rev->src.u3.ip;
+	t.dst.u3.ip = rev->dst.u3.ip;
+	t.dst.protonum = IPPROTO_GRE;
+
+	/* And here goes the grand finale of corrosion... */
+	if (exp->dir == IP_CT_DIR_ORIGINAL) {
+		pr_debug("we are PNS->PAC\n");
+		/* therefore, keys for PAC->PNS */
+		t.src.u.gre.key = ct_pptp_info->pac_call_id;
+		t.dst.u.gre.key = ct_pptp_info->pns_call_id;
+	} else {
+		pr_debug("we are PAC->PNS\n");
+		/* keys for PNS->PAC */
+		t.src.u.gre.key = nat_pptp_info->pns_call_id;
+		t.dst.u.gre.key = nat_pptp_info->pac_call_id;
+	}
+
+	pr_debug("trying to unexpect other dir: ");
+	nf_ct_dump_tuple_ip(&t);
+	other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
+	if (!other_exp) {
+		pr_debug("not found!\n");
+	} else {
+		nf_ct_unexpect_related(other_exp);
+		nf_ct_expect_put(other_exp);
+		pr_debug("success\n");
+	}
+
+	/* This must be a fresh one. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+	/* SRC manip: change src to where master sends to. */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = rev->dst.u3.ip;
+	range.max_ip = rev->dst.u3.ip;
+	if (exp->dir == IP_CT_DIR_ORIGINAL) {
+		range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+		range.min = exp->saved_proto;
+		range.max = exp->saved_proto;
+	}
+	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+
+	/* DST manip: map to the master's source address. */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = rev->src.u3.ip;
+	range.max_ip = rev->src.u3.ip;
+	if (exp->dir == IP_CT_DIR_REPLY) {
+		range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+		range.min = exp->saved_proto;
+		range.max = exp->saved_proto;
+	}
+	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+}
+
+/*
+ * outbound packets == from PNS to PAC
+ *
+ * Rewrites the call ID carried in an outbound PPTP control message.
+ * @ctlh is the control header (its messageType selects the handling) and
+ * @pptpReq the message body that follows it.
+ *
+ * Returns NF_ACCEPT when the message needs no change or was mangled
+ * successfully, NF_DROP when nf_nat_mangle_tcp_packet() returns 0.
+ */
+static int
+pptp_outbound_pkt(struct sk_buff *skb,
+		  struct nf_conn *ct,
+		  enum ip_conntrack_info ctinfo,
+		  struct PptpControlHeader *ctlh,
+		  union pptp_ctrl_union *pptpReq)
+
+{
+	struct nf_ct_pptp_master *ct_pptp_info;
+	struct nf_nat_pptp *nat_pptp_info;
+	u_int16_t msg;
+	__be16 new_callid;
+	unsigned int cid_off;
+
+	ct_pptp_info  = &nfct_help(ct)->help.ct_pptp_info;
+	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+
+	/* default replacement; overridden for OUT_CALL_REQUEST below */
+	new_callid = ct_pptp_info->pns_call_id;
+
+	switch (msg = ntohs(ctlh->messageType)) {
+	case PPTP_OUT_CALL_REQUEST:
+		cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
+		/* FIXME: ideally we would want to reserve a call ID
+		 * here.  current netfilter NAT core is not able to do
+		 * this :( For now we use TCP source port. This breaks
+		 * multiple calls within one control session */
+
+		/* save original call ID in nat_info */
+		nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+		/* don't use tcph->source since we are at a DSTmanip
+		 * hook (e.g. PREROUTING) and pkt is not mangled yet */
+		new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+		/* save new call ID in ct info */
+		ct_pptp_info->pns_call_id = new_callid;
+		break;
+	case PPTP_IN_CALL_REPLY:
+		cid_off = offsetof(union pptp_ctrl_union, icack.callID);
+		break;
+	case PPTP_CALL_CLEAR_REQUEST:
+		cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
+		break;
+	default:
+		/* unknown types are only logged, then accepted unmodified */
+		pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
+			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+					       pptp_msg_name[0]);
+		/* fall through */
+	case PPTP_SET_LINK_INFO:
+		/* only need to NAT in case PAC is behind NAT box */
+	case PPTP_START_SESSION_REQUEST:
+	case PPTP_START_SESSION_REPLY:
+	case PPTP_STOP_SESSION_REQUEST:
+	case PPTP_STOP_SESSION_REPLY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* no need to alter packet */
+		return NF_ACCEPT;
+	}
+
+	/* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
+	 * down to here */
+	pr_debug("altering call id from 0x%04x to 0x%04x\n",
+		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
+
+	/* mangle packet: cid_off is relative to the message body, so the
+	 * sizes of the two preceding headers are added to obtain the
+	 * offset handed to the mangling helper */
+	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+				     cid_off + sizeof(struct pptp_pkt_hdr) +
+				     sizeof(struct PptpControlHeader),
+				     sizeof(new_callid), (char *)&new_callid,
+				     sizeof(new_callid)) == 0)
+		return NF_DROP;
+	return NF_ACCEPT;
+}
+
+/*
+ * Adjust the two GRE expectations of a PPTP call for NAT.
+ *
+ * @expect_orig:  expectation for the PNS->PAC direction
+ * @expect_reply: expectation for the PAC->PNS direction
+ *
+ * The master conntrack is taken from @expect_orig.  The PAC call ID is
+ * copied from the conntrack helper info into the NAT info first, then
+ * the GRE keys, saved key and direction of both expectations are filled
+ * in from the two info structures.
+ */
+static void
+pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
+	     struct nf_conntrack_expect *expect_reply)
+{
+	const struct nf_conn *ct = expect_orig->master;
+	struct nf_ct_pptp_master *ct_pptp_info;
+	struct nf_nat_pptp *nat_pptp_info;
+
+	ct_pptp_info  = &nfct_help(ct)->help.ct_pptp_info;
+	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+
+	/* save original PAC call ID in nat_info */
+	nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
+
+	/* alter expectation for PNS->PAC direction */
+	expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
+	expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
+	expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
+	expect_orig->dir = IP_CT_DIR_ORIGINAL;
+
+	/* alter expectation for PAC->PNS direction */
+	expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
+	expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
+	expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
+	expect_reply->dir = IP_CT_DIR_REPLY;
+}
+
+/*
+ * inbound packets == from PAC to PNS
+ *
+ * Rewrites the call ID field of an inbound PPTP control message with
+ * the PNS call ID stored in the NAT info.  For most handled message
+ * types that field is peersCallID; for CALL_DISCONNECT_NOTIFY it is
+ * disc.callID.
+ *
+ * Returns NF_ACCEPT when the message needs no change or was mangled
+ * successfully, NF_DROP when nf_nat_mangle_tcp_packet() returns 0.
+ */
+static int
+pptp_inbound_pkt(struct sk_buff *skb,
+		 struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo,
+		 struct PptpControlHeader *ctlh,
+		 union pptp_ctrl_union *pptpReq)
+{
+	const struct nf_nat_pptp *nat_pptp_info;
+	u_int16_t msg;
+	__be16 new_pcid;
+	unsigned int pcid_off;
+
+	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
+	new_pcid = nat_pptp_info->pns_call_id;
+
+	switch (msg = ntohs(ctlh->messageType)) {
+	case PPTP_OUT_CALL_REPLY:
+		pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
+		break;
+	case PPTP_IN_CALL_CONNECT:
+		pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
+		break;
+	case PPTP_IN_CALL_REQUEST:
+		/* only need to nat in case PAC is behind NAT box */
+		return NF_ACCEPT;
+	case PPTP_WAN_ERROR_NOTIFY:
+		pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
+		break;
+	case PPTP_CALL_DISCONNECT_NOTIFY:
+		pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
+		break;
+	case PPTP_SET_LINK_INFO:
+		pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
+		break;
+	default:
+		/* unknown types are only logged, then accepted unmodified */
+		pr_debug("unknown inbound packet %s\n",
+			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
+					       pptp_msg_name[0]);
+		/* fall through */
+	case PPTP_START_SESSION_REQUEST:
+	case PPTP_START_SESSION_REPLY:
+	case PPTP_STOP_SESSION_REQUEST:
+	case PPTP_STOP_SESSION_REPLY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* no need to alter packet */
+		return NF_ACCEPT;
+	}
+
+	/* only OUT_CALL_REPLY, IN_CALL_CONNECT, WAN_ERROR_NOTIFY,
+	 * CALL_DISCONNECT_NOTIFY, SET_LINK_INFO pass down here */
+
+	/* mangle packet */
+	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
+		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
+
+	/* pcid_off is relative to the message body, so the sizes of the
+	 * two preceding headers are added to obtain the offset handed to
+	 * the mangling helper */
+	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+				     pcid_off + sizeof(struct pptp_pkt_hdr) +
+				     sizeof(struct PptpControlHeader),
+				     sizeof(new_pcid), (char *)&new_pcid,
+				     sizeof(new_pcid)) == 0)
+		return NF_DROP;
+	return NF_ACCEPT;
+}
+
+/*
+ * Module init: publish the four PPTP NAT hooks (outbound, inbound,
+ * GRE expectation setup, expectation callback).  It is a bug if any of
+ * them is already set.  Always returns 0.
+ */
+static int __init nf_nat_helper_pptp_init(void)
+{
+	/* call into the GRE NAT protocol module (the callee is empty;
+	 * presumably this only creates a module dependency) */
+	nf_nat_need_gre();
+
+	BUG_ON(nf_nat_pptp_hook_outbound != NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
+
+	BUG_ON(nf_nat_pptp_hook_inbound != NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
+
+	BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+
+	BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
+	return 0;
+}
+
+/*
+ * Module exit: clear the four hooks in the reverse order of their
+ * installation, then wait for an RCU grace period.
+ */
+static void __exit nf_nat_helper_pptp_fini(void)
+{
+	rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL);
+	rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL);
+	synchronize_rcu();
+}
+
+module_init(nf_nat_helper_pptp_init);
+module_exit(nf_nat_helper_pptp_fini);
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
new file mode 100644
index 00000000..f52d41ea
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -0,0 +1,125 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter.h>
+#include <net/secure_seq.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+
+/*
+ * Generic range check for port-like protocols.
+ *
+ * Picks the source or destination protocol value of @tuple according to
+ * @maniptype and reports whether it lies within [@min, @max] (inclusive),
+ * compared in host byte order.
+ */
+bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
+			   enum nf_nat_manip_type maniptype,
+			   const union nf_conntrack_man_proto *min,
+			   const union nf_conntrack_man_proto *max)
+{
+	const __be16 port = (maniptype == IP_NAT_MANIP_SRC) ?
+			    tuple->src.u.all : tuple->dst.u.all;
+	const unsigned int val = ntohs(port);
+
+	if (val < ntohs(min->all))
+		return false;
+
+	return val <= ntohs(max->all);
+}
+EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
+
+/*
+ * Generic port selection for port-based NAT protocols.
+ *
+ * @tuple:     tuple whose source or destination port is rewritten in place
+ * @range:     requested NAT range and flags
+ * @maniptype: selects whether the source or the destination port is changed
+ * @ct:        conntrack passed on to nf_nat_used_tuple()
+ * @rover:     per-protocol rolling offset; read as the starting offset and
+ *             written back unless IP_NAT_RANGE_PROTO_RANDOM is set
+ *
+ * Walks through the candidate ports of the range until one is found for
+ * which nf_nat_used_tuple() reports no clash, or until range_size
+ * candidates have been tried; in the latter case the last candidate is
+ * left in @tuple without having been checked.
+ */
+void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+			       const struct nf_nat_range *range,
+			       enum nf_nat_manip_type maniptype,
+			       const struct nf_conn *ct,
+			       u_int16_t *rover)
+{
+	unsigned int range_size, min, i;
+	__be16 *portptr;
+	u_int16_t off;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		portptr = &tuple->src.u.all;
+	else
+		portptr = &tuple->dst.u.all;
+
+	/* If no range specified... */
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+		/* If it's dst rewrite, can't change port */
+		if (maniptype == IP_NAT_MANIP_DST)
+			return;
+
+		/* keep the new port in the same class as the original:
+		 * 1-511, 600-1023 or 1024-65535 */
+		if (ntohs(*portptr) < 1024) {
+			/* Loose convention: >> 512 is credential passing */
+			if (ntohs(*portptr) < 512) {
+				min = 1;
+				range_size = 511 - min + 1;
+			} else {
+				min = 600;
+				range_size = 1023 - min + 1;
+			}
+		} else {
+			min = 1024;
+			range_size = 65535 - 1024 + 1;
+		}
+	} else {
+		min = ntohs(range->min.all);
+		range_size = ntohs(range->max.all) - min + 1;
+	}
+
+	/* starting offset: derived from the addresses and the port that is
+	 * not being rewritten when random selection is requested, otherwise
+	 * the rover value left by the previous call */
+	if (range->flags & IP_NAT_RANGE_PROTO_RANDOM)
+		off = secure_ipv4_port_ephemeral(tuple->src.u3.ip, tuple->dst.u3.ip,
+						 maniptype == IP_NAT_MANIP_SRC
+						 ? tuple->dst.u.all
+						 : tuple->src.u.all);
+	else
+		off = *rover;
+
+	/* i counts the candidates tried so far; the loop is only left
+	 * through the return inside it */
+	for (i = 0; ; ++off) {
+		*portptr = htons(min + off % range_size);
+		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
+			continue;
+		if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
+			*rover = off;
+		return;
+	}
+	return;
+}
+EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+/*
+ * Dump the protocol part of @range into @skb as the netlink attributes
+ * CTA_PROTONAT_PORT_MIN and CTA_PROTONAT_PORT_MAX.
+ *
+ * Returns 0 on success and -1 when an attribute could not be added
+ * (the NLA_PUT_BE16() macro is expected to jump to nla_put_failure).
+ */
+int nf_nat_proto_range_to_nlattr(struct sk_buff *skb,
+				 const struct nf_nat_range *range)
+{
+	NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.all);
+	NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.all);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_proto_range_to_nlattr);
+
+/*
+ * Fill the protocol part of @range from the netlink attribute table @tb.
+ *
+ * A minimum on its own yields a single-value range (max = min); an
+ * explicit maximum overrides that.  IP_NAT_RANGE_PROTO_SPECIFIED is set
+ * whenever either attribute is present.  Always returns 0.
+ */
+int nf_nat_proto_nlattr_to_range(struct nlattr *tb[],
+				 struct nf_nat_range *range)
+{
+	if (tb[CTA_PROTONAT_PORT_MIN]) {
+		range->min.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
+		/* protocol-independent helper: use the generic member */
+		range->max.all = range->min.all;
+		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+	}
+	if (tb[CTA_PROTONAT_PORT_MAX]) {
+		range->max.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]);
+		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_proto_nlattr_to_range);
+#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
new file mode 100644
index 00000000..570faf26
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -0,0 +1,108 @@
+/*
+ * DCCP NAT protocol helper
+ *
+ * Copyright (c) 2005, 2006. 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/dccp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_protocol.h>
+
+/* Rolling port offset handed to the generic port selection below. */
+static u_int16_t dccp_port_rover;
+
+/*
+ * unique_tuple callback for DCCP: delegates to the generic port
+ * selection, using the DCCP-specific rover.
+ */
+static void
+dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
+		  const struct nf_nat_range *range,
+		  enum nf_nat_manip_type maniptype,
+		  const struct nf_conn *ct)
+{
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &dccp_port_rover);
+}
+
+/*
+ * manip_pkt callback for DCCP: rewrite the source or destination port
+ * of the DCCP header that follows the IP header at @iphdroff, taking the
+ * new value from @tuple.
+ *
+ * When the packet holds a complete struct dccp_hdr the checksum is
+ * adjusted for both the address and the port change; otherwise only the
+ * port is rewritten.  Returns false if the header could not be made
+ * writable, true otherwise.
+ */
+static bool
+dccp_manip_pkt(struct sk_buff *skb,
+	       unsigned int iphdroff,
+	       const struct nf_conntrack_tuple *tuple,
+	       enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (const void *)(skb->data + iphdroff);
+	const unsigned int hdroff = iphdroff + iph->ihl * 4;
+	/* DCCP connection tracking guarantees 8 bytes of header */
+	const unsigned int need =
+		(skb->len >= hdroff + sizeof(struct dccp_hdr)) ?
+		sizeof(struct dccp_hdr) : 8;
+	struct dccp_hdr *hdr;
+	__be16 *portptr, oldport, newport;
+	__be32 oldip, newip;
+
+	if (!skb_make_writable(skb, hdroff + need))
+		return false;
+
+	/* reload both header pointers after making the skb writable */
+	iph = (struct iphdr *)(skb->data + iphdroff);
+	hdr = (struct dccp_hdr *)(skb->data + hdroff);
+
+	if (maniptype != IP_NAT_MANIP_SRC) {
+		portptr = &hdr->dccph_dport;
+		newport = tuple->dst.u.dccp.port;
+		oldip = iph->daddr;
+		newip = tuple->dst.u3.ip;
+	} else {
+		portptr = &hdr->dccph_sport;
+		newport = tuple->src.u.dccp.port;
+		oldip = iph->saddr;
+		newip = tuple->src.u3.ip;
+	}
+
+	oldport = *portptr;
+	*portptr = newport;
+
+	/* checksum field is outside of the part we made writable */
+	if (need < sizeof(*hdr))
+		return true;
+
+	inet_proto_csum_replace4(&hdr->dccph_checksum, skb, oldip, newip, 1);
+	inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
+				 0);
+	return true;
+}
+
+/*
+ * NAT protocol operations for IPPROTO_DCCP.  Range checking and the
+ * netlink conversions use the generic port helpers.
+ */
+static const struct nf_nat_protocol nf_nat_protocol_dccp = {
+	.protonum		= IPPROTO_DCCP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= dccp_manip_pkt,
+	.in_range		= nf_nat_proto_in_range,
+	.unique_tuple		= dccp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
+
+/* Module init: register the DCCP NAT protocol; returns the result of
+ * nf_nat_protocol_register(). */
+static int __init nf_nat_proto_dccp_init(void)
+{
+	return nf_nat_protocol_register(&nf_nat_protocol_dccp);
+}
+
+/* Module exit: unregister the DCCP NAT protocol. */
+static void __exit nf_nat_proto_dccp_fini(void)
+{
+	nf_nat_protocol_unregister(&nf_nat_protocol_dccp);
+}
+
+module_init(nf_nat_proto_dccp_init);
+module_exit(nf_nat_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP NAT protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
new file mode 100644
index 00000000..bc8d83a3
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -0,0 +1,149 @@
+/*
+ * nf_nat_proto_gre.c
+ *
+ * NAT protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol, which is generally not very
+ * suited for NAT, as it has no protocol-specific part as port numbers.
+ *
+ * It has an optional key field, which may help us distinguishing two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
+
+/*
+ * generate unique tuple ...
+ *
+ * Picks a GRE key (PPTP call ID) for the source or destination side of
+ * @tuple, as selected by @maniptype, such that nf_nat_used_tuple() does
+ * not report a clash.  Candidates come from @range when a protocol range
+ * is specified and from 1..0xffff otherwise.
+ *
+ * Conntracks without a master are not PPTP and are left untouched.  If
+ * every candidate is in use, the last one tried stays in @tuple and a
+ * debug message is emitted.
+ */
+static void
+gre_unique_tuple(struct nf_conntrack_tuple *tuple,
+		 const struct nf_nat_range *range,
+		 enum nf_nat_manip_type maniptype,
+		 const struct nf_conn *ct)
+{
+	/* rolling start offset, kept across calls */
+	static u_int16_t key;
+	__be16 *keyptr;
+	unsigned int min, i, range_size;
+
+	/* If there is no master conntrack we are not PPTP,
+	   do not change tuples */
+	if (!ct->master)
+		return;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		keyptr = &tuple->src.u.gre.key;
+	else
+		keyptr = &tuple->dst.u.gre.key;
+
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+		pr_debug("%p: NATing GRE PPTP\n", ct);
+		min = 1;
+		range_size = 0xffff;
+	} else {
+		min = ntohs(range->min.gre.key);
+		range_size = ntohs(range->max.gre.key) - min + 1;
+	}
+
+	pr_debug("min = %u, range_size = %u\n", min, range_size);
+
+	/* An empty range has no candidates; it would also make the
+	 * modulo below divide by zero. */
+	if (range_size == 0) {
+		pr_debug("%p: no NAT mapping\n", ct);
+		return;
+	}
+
+	for (i = 0; ; ++key) {
+		*keyptr = htons(min + key % range_size);
+		if (!nf_nat_used_tuple(tuple, ct))
+			return;
+		/* stop once every candidate has been tried */
+		if (++i == range_size)
+			break;
+	}
+
+	pr_debug("%p: no NAT mapping\n", ct);
+}
+
+/*
+ * manipulate a GRE packet according to maniptype
+ *
+ * Only destination manipulation touches the packet: for PPTP (version 1)
+ * GRE the call ID is replaced by the destination key of @tuple, version 0
+ * packets are left alone, and any other version is rejected.
+ *
+ * Returns false if the header could not be made writable or the GRE
+ * version is unknown, true otherwise.
+ */
+static bool
+gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
+	      const struct nf_conntrack_tuple *tuple,
+	      enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	const unsigned int greoff = iphdroff + iph->ihl * 4;
+	const struct gre_hdr *base_hdr;
+	struct gre_hdr_pptp *pptp_hdr;
+
+	/* pptp_hdr includes two optional 32bit fields which are not
+	 * required to be there.  That's where the magic '8' comes from */
+	if (!skb_make_writable(skb, greoff + sizeof(*pptp_hdr) - 8))
+		return false;
+
+	/* we only have destination manip of a packet, since 'source key'
+	 * is not present in the packet itself */
+	if (maniptype != IP_NAT_MANIP_DST)
+		return true;
+
+	base_hdr = (void *)skb->data + greoff;
+
+	/* We do not currently NAT any GREv0 packets.
+	 * Try to behave like "nf_nat_proto_unknown" */
+	if (base_hdr->version == GRE_VERSION_1701)
+		return true;
+
+	if (base_hdr->version != GRE_VERSION_PPTP) {
+		pr_debug("can't nat unknown GRE version\n");
+		return false;
+	}
+
+	pptp_hdr = (struct gre_hdr_pptp *)base_hdr;
+	pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
+	pptp_hdr->call_id = tuple->dst.u.gre.key;
+	return true;
+}
+
+/*
+ * NAT protocol operations for IPPROTO_GRE.  Range checking and the
+ * netlink conversions use the generic port helpers.
+ */
+static const struct nf_nat_protocol gre = {
+	.protonum		= IPPROTO_GRE,
+	.me			= THIS_MODULE,
+	.manip_pkt		= gre_manip_pkt,
+	.in_range		= nf_nat_proto_in_range,
+	.unique_tuple		= gre_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
+
+/* Module init: register the GRE NAT protocol; returns the result of
+ * nf_nat_protocol_register(). */
+static int __init nf_nat_proto_gre_init(void)
+{
+	return nf_nat_protocol_register(&gre);
+}
+
+/* Module exit: unregister the GRE NAT protocol. */
+static void __exit nf_nat_proto_gre_fini(void)
+{
+	nf_nat_protocol_unregister(&gre);
+}
+
+module_init(nf_nat_proto_gre_init);
+module_exit(nf_nat_proto_gre_fini);
+
+/*
+ * Intentionally empty exported function.  Presumably it exists only so
+ * that a module calling it depends on (and thus loads) this one --
+ * TODO confirm.
+ */
+void nf_nat_need_gre(void)
+{
+	return;
+}
+EXPORT_SYMBOL_GPL(nf_nat_need_gre);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
new file mode 100644
index 00000000..5744c3ec
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -0,0 +1,84 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+
+/*
+ * in_range callback for ICMP: check whether the ICMP id of @tuple lies
+ * within [@min, @max] (inclusive), compared in host byte order.  The id
+ * is always taken from the source part; @maniptype is not used.
+ */
+static bool
+icmp_in_range(const struct nf_conntrack_tuple *tuple,
+	      enum nf_nat_manip_type maniptype,
+	      const union nf_conntrack_man_proto *min,
+	      const union nf_conntrack_man_proto *max)
+{
+	const unsigned int id = ntohs(tuple->src.u.icmp.id);
+
+	if (id < ntohs(min->icmp.id))
+		return false;
+
+	return id <= ntohs(max->icmp.id);
+}
+
+/*
+ * unique_tuple callback for ICMP: choose an ICMP id for @tuple for which
+ * nf_nat_used_tuple() reports no clash.
+ *
+ * Candidates are range->min.icmp.id plus an offset below the range size,
+ * where the size is taken from @range if a protocol range is specified
+ * and is 0xFFFF otherwise.  The search stops after as many candidates as
+ * the range holds; the last one is then left in @tuple unchecked.
+ */
+static void
+icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
+		  const struct nf_nat_range *range,
+		  enum nf_nat_manip_type maniptype,
+		  const struct nf_conn *ct)
+{
+	/* rolling start offset, kept across calls */
+	static u_int16_t id;
+	const unsigned int base = ntohs(range->min.icmp.id);
+	unsigned int span;
+	unsigned int tried = 0;
+
+	if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+		span = ntohs(range->max.icmp.id) - base + 1;
+	else
+		/* If no range specified... */
+		span = 0xFFFF;
+
+	for (;;) {
+		tuple->src.u.icmp.id = htons(base + (id % span));
+		if (++tried == span || !nf_nat_used_tuple(tuple, ct))
+			return;
+		++id;
+	}
+}
+
+/*
+ * manip_pkt callback for ICMP: replace the echo id of the ICMP header
+ * that follows the IP header at @iphdroff with the id from @tuple and
+ * adjust the ICMP checksum accordingly.  @maniptype is not used.
+ *
+ * Returns false if the header could not be made writable.
+ */
+static bool
+icmp_manip_pkt(struct sk_buff *skb,
+	       unsigned int iphdroff,
+	       const struct nf_conntrack_tuple *tuple,
+	       enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	const unsigned int icmpoff = iphdroff + iph->ihl*4;
+	const __be16 new_id = tuple->src.u.icmp.id;
+	struct icmphdr *hdr;
+
+	if (!skb_make_writable(skb, icmpoff + sizeof(*hdr)))
+		return false;
+
+	hdr = (struct icmphdr *)(skb->data + icmpoff);
+
+	/* fix up the checksum before the old id is overwritten */
+	inet_proto_csum_replace2(&hdr->checksum, skb,
+				 hdr->un.echo.id, new_id, 0);
+	hdr->un.echo.id = new_id;
+
+	return true;
+}
+
+/*
+ * NAT protocol operations for IPPROTO_ICMP.  Unlike the other protocol
+ * tables in this directory this one is not static; no registration code
+ * is visible in this file.
+ */
+const struct nf_nat_protocol nf_nat_protocol_icmp = {
+	.protonum		= IPPROTO_ICMP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= icmp_manip_pkt,
+	.in_range		= icmp_in_range,
+	.unique_tuple		= icmp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
new file mode 100644
index 00000000..756331d4
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <net/sctp/checksum.h>
+
+#include <net/netfilter/nf_nat_protocol.h>
+
+/* Rolling port offset handed to the generic port selection below. */
+static u_int16_t nf_sctp_port_rover;
+
+/*
+ * unique_tuple callback for SCTP: delegates to the generic port
+ * selection, using the SCTP-specific rover.
+ */
+static void
+sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
+		  const struct nf_nat_range *range,
+		  enum nf_nat_manip_type maniptype,
+		  const struct nf_conn *ct)
+{
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &nf_sctp_port_rover);
+}
+
+/*
+ * manip_pkt callback for SCTP: rewrite the source or destination port
+ * of the SCTP header that follows the IP header at @iphdroff, taking the
+ * new value from @tuple, and recompute the SCTP checksum.
+ *
+ * The checksum below is computed from the SCTP header onwards only, so
+ * the IP addresses are not needed here.
+ *
+ * Returns false if the header could not be made writable.
+ */
+static bool
+sctp_manip_pkt(struct sk_buff *skb,
+	       unsigned int iphdroff,
+	       const struct nf_conntrack_tuple *tuple,
+	       enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	struct sk_buff *frag;
+	sctp_sctphdr_t *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+	__be32 crc32;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	hdr = (struct sctphdr *)(skb->data + hdroff);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		/* Get rid of src pt */
+		hdr->source = tuple->src.u.sctp.port;
+	} else {
+		/* Get rid of dst pt */
+		hdr->dest = tuple->dst.u.sctp.port;
+	}
+
+	/* Recompute the checksum over the linear part of the skb starting
+	 * at the SCTP header, then over the head of each fragment. */
+	crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
+	skb_walk_frags(skb, frag)
+		crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
+					  crc32);
+	crc32 = sctp_end_cksum(crc32);
+	hdr->checksum = crc32;
+
+	return true;
+}
+
+/*
+ * NAT protocol operations for IPPROTO_SCTP.  Range checking and the
+ * netlink conversions use the generic port helpers.
+ */
+static const struct nf_nat_protocol nf_nat_protocol_sctp = {
+	.protonum		= IPPROTO_SCTP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= sctp_manip_pkt,
+	.in_range		= nf_nat_proto_in_range,
+	.unique_tuple		= sctp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
+
+/* Module init: register the SCTP NAT protocol; returns the result of
+ * nf_nat_protocol_register(). */
+static int __init nf_nat_proto_sctp_init(void)
+{
+	return nf_nat_protocol_register(&nf_nat_protocol_sctp);
+}
+
+/* Module exit: unregister the SCTP NAT protocol. */
+static void __exit nf_nat_proto_sctp_exit(void)
+{
+	nf_nat_protocol_unregister(&nf_nat_protocol_sctp);
+}
+
+module_init(nf_nat_proto_sctp_init);
+module_exit(nf_nat_proto_sctp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SCTP NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
new file mode 100644
index 00000000..aa460a59
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -0,0 +1,92 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_core.h>
+
+static u_int16_t tcp_port_rover;
+
+static void
+tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
+		 const struct nf_nat_range *range,
+		 enum nf_nat_manip_type maniptype,
+		 const struct nf_conn *ct)
+{
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
+}
+
+static bool
+tcp_manip_pkt(struct sk_buff *skb,
+	      unsigned int iphdroff,
+	      const struct nf_conntrack_tuple *tuple,
+	      enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	struct tcphdr *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+	__be32 oldip, newip;
+	__be16 *portptr, newport, oldport;
+	int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+	/* This could be an inner header returned in an ICMP packet; in such
+	   cases we cannot update the checksum field since it is outside of
+	   the 8 bytes of transport layer headers we are guaranteed. */
+	if (skb->len >= hdroff + sizeof(struct tcphdr))
+		hdrsize = sizeof(struct tcphdr);
+
+	if (!skb_make_writable(skb, hdroff + hdrsize))
+		return false;
+
+	iph = (struct iphdr *)(skb->data + iphdroff);
+	hdr = (struct tcphdr *)(skb->data + hdroff);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		/* Src manip: take the new src port; IPs feed the csum fixup */
+		oldip = iph->saddr;
+		newip = tuple->src.u3.ip;
+		newport = tuple->src.u.tcp.port;
+		portptr = &hdr->source;
+	} else {
+		/* Dst manip: take the new dst port; IPs feed the csum fixup */
+		oldip = iph->daddr;
+		newip = tuple->dst.u3.ip;
+		newport = tuple->dst.u.tcp.port;
+		portptr = &hdr->dest;
+	}
+
+	oldport = *portptr;
+	*portptr = newport;
+
+	if (hdrsize < sizeof(*hdr))
+		return true;
+
+	inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
+	inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0);
+	return true;
+}
+
+const struct nf_nat_protocol nf_nat_protocol_tcp = {
+	.protonum		= IPPROTO_TCP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= tcp_manip_pkt,
+	.in_range		= nf_nat_proto_in_range,
+	.unique_tuple		= tcp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
new file mode 100644
index 00000000..dfe65c7e
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -0,0 +1,83 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+
+static u_int16_t udp_port_rover;
+
+static void
+udp_unique_tuple(struct nf_conntrack_tuple *tuple,
+		 const struct nf_nat_range *range,
+		 enum nf_nat_manip_type maniptype,
+		 const struct nf_conn *ct)
+{
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
+}
+
+static bool
+udp_manip_pkt(struct sk_buff *skb,
+	      unsigned int iphdroff,
+	      const struct nf_conntrack_tuple *tuple,
+	      enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	struct udphdr *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+	__be32 oldip, newip;
+	__be16 *portptr, newport;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	iph = (struct iphdr *)(skb->data + iphdroff);
+	hdr = (struct udphdr *)(skb->data + hdroff);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		/* Get rid of src ip and src pt */
+		oldip = iph->saddr;
+		newip = tuple->src.u3.ip;
+		newport = tuple->src.u.udp.port;
+		portptr = &hdr->source;
+	} else {
+		/* Get rid of dst ip and dst pt */
+		oldip = iph->daddr;
+		newip = tuple->dst.u3.ip;
+		newport = tuple->dst.u.udp.port;
+		portptr = &hdr->dest;
+	}
+	if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
+		inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
+					 0);
+		if (!hdr->check)
+			hdr->check = CSUM_MANGLED_0;
+	}
+	*portptr = newport;
+	return true;
+}
+
+const struct nf_nat_protocol nf_nat_protocol_udp = {
+	.protonum		= IPPROTO_UDP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= udp_manip_pkt,
+	.in_range		= nf_nat_proto_in_range,
+	.unique_tuple		= udp_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
new file mode 100644
index 00000000..3cc8c8af
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -0,0 +1,99 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_protocol.h>
+
+static u_int16_t udplite_port_rover;
+
+static void
+udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
+		     const struct nf_nat_range *range,
+		     enum nf_nat_manip_type maniptype,
+		     const struct nf_conn *ct)
+{
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &udplite_port_rover);
+}
+
+static bool
+udplite_manip_pkt(struct sk_buff *skb,
+		  unsigned int iphdroff,
+		  const struct nf_conntrack_tuple *tuple,
+		  enum nf_nat_manip_type maniptype)
+{
+	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	struct udphdr *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+	__be32 oldip, newip;
+	__be16 *portptr, newport;
+
+	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+		return false;
+
+	iph = (struct iphdr *)(skb->data + iphdroff);
+	hdr = (struct udphdr *)(skb->data + hdroff);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		/* Get rid of src ip and src pt */
+		oldip = iph->saddr;
+		newip = tuple->src.u3.ip;
+		newport = tuple->src.u.udp.port;
+		portptr = &hdr->source;
+	} else {
+		/* Get rid of dst ip and dst pt */
+		oldip = iph->daddr;
+		newip = tuple->dst.u3.ip;
+		newport = tuple->dst.u.udp.port;
+		portptr = &hdr->dest;
+	}
+
+	inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1);
+	inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0);
+	if (!hdr->check)
+		hdr->check = CSUM_MANGLED_0;
+
+	*portptr = newport;
+	return true;
+}
+
+static const struct nf_nat_protocol nf_nat_protocol_udplite = {
+	.protonum		= IPPROTO_UDPLITE,
+	.me			= THIS_MODULE,
+	.manip_pkt		= udplite_manip_pkt,
+	.in_range		= nf_nat_proto_in_range,
+	.unique_tuple		= udplite_unique_tuple,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.range_to_nlattr	= nf_nat_proto_range_to_nlattr,
+	.nlattr_to_range	= nf_nat_proto_nlattr_to_range,
+#endif
+};
+
+static int __init nf_nat_proto_udplite_init(void)
+{
+	return nf_nat_protocol_register(&nf_nat_protocol_udplite);
+}
+
+static void __exit nf_nat_proto_udplite_fini(void)
+{
+	nf_nat_protocol_unregister(&nf_nat_protocol_udplite);
+}
+
+module_init(nf_nat_proto_udplite_init);
+module_exit(nf_nat_proto_udplite_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UDP-Lite NAT protocol helper");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
new file mode 100644
index 00000000..a50f2bc1
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -0,0 +1,53 @@
+/* The "unknown" protocol.  This is what is used for protocols we
+ * don't understand.  It's returned by ip_ct_find_proto().
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+
+static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
+			     enum nf_nat_manip_type manip_type,
+			     const union nf_conntrack_man_proto *min,
+			     const union nf_conntrack_man_proto *max)
+{
+	return true;
+}
+
+static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
+				 const struct nf_nat_range *range,
+				 enum nf_nat_manip_type maniptype,
+				 const struct nf_conn *ct)
+{
+	/* Sorry: we can't help you; if it's not unique, we can't frob
+	   anything. */
+	return;
+}
+
+static bool
+unknown_manip_pkt(struct sk_buff *skb,
+		  unsigned int iphdroff,
+		  const struct nf_conntrack_tuple *tuple,
+		  enum nf_nat_manip_type maniptype)
+{
+	return true;
+}
+
+const struct nf_nat_protocol nf_nat_unknown_protocol = {
+	/* .me isn't set: getting a ref to this cannot fail. */
+	.manip_pkt		= unknown_manip_pkt,
+	.in_range		= unknown_in_range,
+	.unique_tuple		= unknown_unique_tuple,
+};
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
new file mode 100644
index 00000000..733c9abc
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -0,0 +1,214 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Everything about the rules for NAT. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/bitops.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_rule.h>
+
+#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+			 (1 << NF_INET_POST_ROUTING) | \
+			 (1 << NF_INET_LOCAL_OUT) | \
+			 (1 << NF_INET_LOCAL_IN))
+
+static const struct xt_table nat_table = {
+	.name		= "nat",
+	.valid_hooks	= NAT_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV4,
+};
+
+/* Source NAT */
+static unsigned int
+ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
+
+	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_IN);
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+	/* Connection must be valid and new. */
+	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+			    ctinfo == IP_CT_RELATED_REPLY));
+	NF_CT_ASSERT(par->out != NULL);
+
+	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
+}
+
+static unsigned int
+ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_nat_multi_range_compat *mr = par->targinfo;
+
+	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_OUT);
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+	/* Connection must be valid and new. */
+	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+	return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
+}
+
+/* checkentry hook for SNAT: accept only a single-range target spec. */
+static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
+{
+	const struct nf_nat_multi_range_compat *ranges = par->targinfo;
+
+	if (ranges->rangesize == 1)
+		return 0;
+
+	pr_info("SNAT: multiple ranges no longer supported\n");
+	return -EINVAL;
+}
+
+/* checkentry hook for DNAT: accept only a single-range target spec. */
+static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
+{
+	const struct nf_nat_multi_range_compat *ranges = par->targinfo;
+
+	if (ranges->rangesize == 1)
+		return 0;
+
+	pr_info("DNAT: multiple ranges no longer supported\n");
+	return -EINVAL;
+}
+
+static unsigned int
+alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	/* Force range to this IP; let proto decide mapping for
+	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	*/
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
+	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
+
+/* Run the nat table; set up a null binding if the ct is still unmapped. */
+int nf_nat_rule_find(struct sk_buff *skb,
+		     unsigned int hooknum,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	int verdict = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
+
+	if (verdict != NF_ACCEPT)
+		return verdict;
+	/* NAT for this manip type is already initialized: nothing to add. */
+	if (nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
+		return verdict;
+
+	return alloc_null_binding(ct, hooknum);
+}
+
+static struct xt_target ipt_snat_reg __read_mostly = {
+	.name		= "SNAT",
+	.target		= ipt_snat_target,
+	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
+	.table		= "nat",
+	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
+	.checkentry	= ipt_snat_checkentry,
+	.family		= AF_INET,
+};
+
+static struct xt_target ipt_dnat_reg __read_mostly = {
+	.name		= "DNAT",
+	.target		= ipt_dnat_target,
+	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
+	.table		= "nat",
+	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT),
+	.checkentry	= ipt_dnat_checkentry,
+	.family		= AF_INET,
+};
+
+/* Per-netns init: register the nat table built from the initial ruleset. */
+static int __net_init nf_nat_rule_net_init(struct net *net)
+{
+	struct ipt_replace *initial = ipt_alloc_initial_table(&nat_table);
+
+	if (!initial)
+		return -ENOMEM;
+	net->ipv4.nat_table = ipt_register_table(net, &nat_table, initial);
+	kfree(initial);
+	if (!IS_ERR(net->ipv4.nat_table))
+		return 0;
+	return PTR_ERR(net->ipv4.nat_table);
+}
+
+static void __net_exit nf_nat_rule_net_exit(struct net *net)
+{
+	ipt_unregister_table(net, net->ipv4.nat_table);
+}
+
+static struct pernet_operations nf_nat_rule_net_ops = {
+	.init = nf_nat_rule_net_init,
+	.exit = nf_nat_rule_net_exit,
+};
+
+int __init nf_nat_rule_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&nf_nat_rule_net_ops);
+	if (ret != 0)
+		goto out;
+	ret = xt_register_target(&ipt_snat_reg);
+	if (ret != 0)
+		goto unregister_table;
+
+	ret = xt_register_target(&ipt_dnat_reg);
+	if (ret != 0)
+		goto unregister_snat;
+
+	return ret;
+
+ unregister_snat:
+	xt_unregister_target(&ipt_snat_reg);
+ unregister_table:
+	unregister_pernet_subsys(&nf_nat_rule_net_ops);
+ out:
+	return ret;
+}
+
+void nf_nat_rule_cleanup(void)
+{
+	xt_unregister_target(&ipt_dnat_reg);
+	xt_unregister_target(&ipt_snat_reg);
+	unregister_pernet_subsys(&nf_nat_rule_net_ops);
+}
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
new file mode 100644
index 00000000..e40cf781
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -0,0 +1,561 @@
+/* SIP extension for NAT alteration.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_nat_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP NAT helper");
+MODULE_ALIAS("ip_nat_sip");
+
+
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
+				  const char **dptr, unsigned int *datalen,
+				  unsigned int matchoff, unsigned int matchlen,
+				  const char *buffer, unsigned int buflen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct tcphdr *th;
+	unsigned int baseoff;
+
+	if (nf_ct_protonum(ct) == IPPROTO_TCP) {
+		th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+		baseoff = ip_hdrlen(skb) + th->doff * 4;
+		matchoff += dataoff - baseoff;
+
+		if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						matchoff, matchlen,
+						buffer, buflen, false))
+			return 0;
+	} else {
+		baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
+		matchoff += dataoff - baseoff;
+
+		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+					      matchoff, matchlen,
+					      buffer, buflen))
+			return 0;
+	}
+
+	/* Reload data pointer and adjust datalen value */
+	*dptr = skb->data + dataoff;
+	*datalen += buflen - matchlen;
+	return 1;
+}
+
+static int map_addr(struct sk_buff *skb, unsigned int dataoff,
+		    const char **dptr, unsigned int *datalen,
+		    unsigned int matchoff, unsigned int matchlen,
+		    union nf_inet_addr *addr, __be16 port)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+	unsigned int buflen;
+	__be32 newaddr;
+	__be16 newport;
+
+	if (ct->tuplehash[dir].tuple.src.u3.ip == addr->ip &&
+	    ct->tuplehash[dir].tuple.src.u.udp.port == port) {
+		newaddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
+		newport = ct->tuplehash[!dir].tuple.dst.u.udp.port;
+	} else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
+		   ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
+		newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
+		newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
+	} else
+		return 1;
+
+	if (newaddr == addr->ip && newport == port)
+		return 1;
+
+	buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
+
+	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			     buffer, buflen);
+}
+
+static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
+			const char **dptr, unsigned int *datalen,
+			enum sip_header_types type)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int matchlen, matchoff;
+	union nf_inet_addr addr;
+	__be16 port;
+
+	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
+				    &matchoff, &matchlen, &addr, &port) <= 0)
+		return 1;
+	return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			&addr, port);
+}
+
+/* NAT hook for SIP signalling: rewrite addresses in the request line, Via,
+ * Contact, From and To headers.  Returns NF_ACCEPT, or NF_DROP on failure. */
+static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
+			       const char **dptr, unsigned int *datalen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned int coff, matchoff, matchlen;
+	enum sip_header_types hdr;
+	union nf_inet_addr addr;
+	__be16 port;
+	int request, in_header;
+
+	/* Anything not starting with "SIP/2.0" is a request: map its URI. */
+	request = strnicmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0;
+	if (request &&
+	    ct_sip_parse_request(ct, *dptr, *datalen, &matchoff, &matchlen,
+				 &addr, &port) > 0 &&
+	    !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+		      &addr, port))
+		return NF_DROP;
+
+	if (nf_ct_protonum(ct) == IPPROTO_TCP)
+		hdr = SIP_HDR_VIA_TCP;
+	else
+		hdr = SIP_HDR_VIA_UDP;
+
+	/* Translate topmost Via header and parameters */
+	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+				    hdr, NULL, &matchoff, &matchlen,
+				    &addr, &port) > 0) {
+		unsigned int olen, matchend, poff, plen, buflen, n;
+		char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+
+		/* Only handle Via headers that belong to this connection */
+		if (request) {
+			if (addr.ip != ct->tuplehash[dir].tuple.src.u3.ip ||
+			    port != ct->tuplehash[dir].tuple.src.u.udp.port)
+				goto next;
+		} else {
+			if (addr.ip != ct->tuplehash[dir].tuple.dst.u3.ip ||
+			    port != ct->tuplehash[dir].tuple.dst.u.udp.port)
+				goto next;
+		}
+
+		olen = *datalen;
+		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			      &addr, port))
+			return NF_DROP;
+
+		/* map_addr() may have resized the address: track the shift */
+		matchend = matchoff + matchlen + *datalen - olen;
+
+		/* The maddr= parameter (RFC 3261) specifies where to send
+		 * the reply. */
+		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+					       "maddr=", &poff, &plen,
+					       &addr) > 0 &&
+		    addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
+		    addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
+			buflen = sprintf(buffer, "%pI4",
+					&ct->tuplehash[!dir].tuple.dst.u3.ip);
+			if (!mangle_packet(skb, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen))
+				return NF_DROP;
+		}
+
+		/* The received= parameter (RFC 3261) contains the address
+		 * from which the server received the request. */
+		if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
+					       "received=", &poff, &plen,
+					       &addr) > 0 &&
+		    addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
+		    addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
+			buflen = sprintf(buffer, "%pI4",
+					&ct->tuplehash[!dir].tuple.src.u3.ip);
+			if (!mangle_packet(skb, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen))
+				return NF_DROP;
+		}
+
+		/* The rport= parameter (RFC 3581) contains the port number
+		 * from which the server received the request. */
+		if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen,
+						 "rport=", &poff, &plen,
+						 &n) > 0 &&
+		    htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port &&
+		    htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
+			__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
+			buflen = sprintf(buffer, "%u", ntohs(p));
+			if (!mangle_packet(skb, dataoff, dptr, datalen,
+					   poff, plen, buffer, buflen))
+				return NF_DROP;
+		}
+	}
+
+next:
+	/* Translate Contact headers */
+	coff = 0;
+	in_header = 0;
+	while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+				       SIP_HDR_CONTACT, &in_header,
+				       &matchoff, &matchlen,
+				       &addr, &port) > 0) {
+		if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			      &addr, port))
+			return NF_DROP;
+	}
+
+	if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+	    !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
+		return NF_DROP;
+
+	return NF_ACCEPT;
+}
+
+static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	const struct tcphdr *th;
+
+	if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+		return;
+
+	th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+}
+
+/* Handles expected signalling connections and media streams */
+static void ip_nat_sip_expected(struct nf_conn *ct,
+				struct nf_conntrack_expect *exp)
+{
+	struct nf_nat_range range;
+
+	/* This must be a fresh one. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+	range.min = range.max = exp->saved_proto;
+	range.min_ip = range.max_ip = exp->saved_ip;
+	nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+
+	/* Change src to where master sends to, but only if the connection
+	 * actually came from the same source. */
+	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip ==
+	    ct->master->tuplehash[exp->dir].tuple.src.u3.ip) {
+		range.flags = IP_NAT_RANGE_MAP_IPS;
+		range.min_ip = range.max_ip
+			= ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
+		nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+	}
+}
+
+static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
+				      const char **dptr, unsigned int *datalen,
+				      struct nf_conntrack_expect *exp,
+				      unsigned int matchoff,
+				      unsigned int matchlen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	__be32 newip;
+	u_int16_t port;
+	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
+	unsigned buflen;
+
+	/* Connection will come from reply */
+	if (ct->tuplehash[dir].tuple.src.u3.ip == ct->tuplehash[!dir].tuple.dst.u3.ip)
+		newip = exp->tuple.dst.u3.ip;
+	else
+		newip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+
+	/* If the signalling port matches the connection's source port in the
+	 * original direction, try to use the destination port in the opposite
+	 * direction. */
+	if (exp->tuple.dst.u.udp.port ==
+	    ct->tuplehash[dir].tuple.src.u.udp.port)
+		port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
+	else
+		port = ntohs(exp->tuple.dst.u.udp.port);
+
+	exp->saved_ip = exp->tuple.dst.u3.ip;
+	exp->tuple.dst.u3.ip = newip;
+	exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port;
+	exp->dir = !dir;
+	exp->expectfn = ip_nat_sip_expected;
+
+	for (; port != 0; port++) {
+		int ret;
+
+		exp->tuple.dst.u.udp.port = htons(port);
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0)
+		return NF_DROP;
+
+	if (exp->tuple.dst.u3.ip != exp->saved_ip ||
+	    exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
+		buflen = sprintf(buffer, "%pI4:%u", &newip, port);
+		if (!mangle_packet(skb, dataoff, dptr, datalen,
+				   matchoff, matchlen, buffer, buflen))
+			goto err;
+	}
+	return NF_ACCEPT;
+
+err:
+	nf_ct_unexpect_related(exp);
+	return NF_DROP;
+}
+
+static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
+			      const char **dptr, unsigned int *datalen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int matchoff, matchlen;
+	char buffer[sizeof("65536")];
+	int buflen, c_len;
+
+	/* Get actual SDP length */
+	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+				  SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+				  &matchoff, &matchlen) <= 0)
+		return 0;
+	c_len = *datalen - matchoff + strlen("v=");
+
+	/* Now, update SDP length */
+	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CONTENT_LENGTH,
+			      &matchoff, &matchlen) <= 0)
+		return 0;
+
+	buflen = sprintf(buffer, "%u", c_len);
+	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			     buffer, buflen);
+}
+
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
+			     const char **dptr, unsigned int *datalen,
+			     unsigned int sdpoff,
+			     enum sdp_header_types type,
+			     enum sdp_header_types term,
+			     char *buffer, int buflen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int matchlen, matchoff;
+
+	if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
+				  &matchoff, &matchlen) <= 0)
+		return -ENOENT;
+	return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			     buffer, buflen) ? 0 : -EINVAL;
+}
+
+static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
+				    const char **dptr, unsigned int *datalen,
+				    unsigned int sdpoff,
+				    enum sdp_header_types type,
+				    enum sdp_header_types term,
+				    const union nf_inet_addr *addr)
+{
+	char buffer[sizeof("nnn.nnn.nnn.nnn")];
+	unsigned int buflen;
+
+	buflen = sprintf(buffer, "%pI4", &addr->ip);
+	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
+			      buffer, buflen))
+		return 0;
+
+	return mangle_content_len(skb, dataoff, dptr, datalen);
+}
+
+static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
+				    const char **dptr, unsigned int *datalen,
+				    unsigned int matchoff,
+				    unsigned int matchlen,
+				    u_int16_t port)
+{
+	char buffer[sizeof("nnnnn")];
+	unsigned int buflen;
+
+	buflen = sprintf(buffer, "%u", port);
+	if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
+			   buffer, buflen))
+		return 0;
+
+	return mangle_content_len(skb, dataoff, dptr, datalen);
+}
+
+static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
+				       const char **dptr, unsigned int *datalen,
+				       unsigned int sdpoff,
+				       const union nf_inet_addr *addr)
+{
+	char buffer[sizeof("nnn.nnn.nnn.nnn")];
+	unsigned int buflen;
+
+	/* Mangle session description owner and contact addresses */
+	buflen = sprintf(buffer, "%pI4", &addr->ip);
+	if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
+			       SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
+			       buffer, buflen))
+		return 0;
+
+	switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
+				  SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
+				  buffer, buflen)) {
+	case 0:
+	/*
+	 * RFC 2327:
+	 *
+	 * Session description
+	 *
+	 * c=* (connection information - not required if included in all media)
+	 */
+	case -ENOENT:
+		break;
+	default:
+		return 0;
+	}
+
+	return mangle_content_len(skb, dataoff, dptr, datalen);
+}
+
+/* Set up RTP/RTCP expectations on a free port pair for an SDP media line and
+ * rewrite the media port if it changed.  Returns NF_ACCEPT or NF_DROP. */
+static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
+				     const char **dptr, unsigned int *datalen,
+				     struct nf_conntrack_expect *rtp_exp,
+				     struct nf_conntrack_expect *rtcp_exp,
+				     unsigned int mediaoff,
+				     unsigned int medialen,
+				     union nf_inet_addr *rtp_addr)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	u_int16_t port;
+	int ret;
+
+	/* Connection will come from reply */
+	if (ct->tuplehash[dir].tuple.src.u3.ip ==
+	    ct->tuplehash[!dir].tuple.dst.u3.ip)
+		rtp_addr->ip = rtp_exp->tuple.dst.u3.ip;
+	else
+		rtp_addr->ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
+
+	rtp_exp->saved_ip = rtp_exp->tuple.dst.u3.ip;
+	rtp_exp->tuple.dst.u3.ip = rtp_addr->ip;
+	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
+	rtp_exp->dir = !dir;
+	rtp_exp->expectfn = ip_nat_sip_expected;
+	rtcp_exp->saved_ip = rtcp_exp->tuple.dst.u3.ip;
+	rtcp_exp->tuple.dst.u3.ip = rtp_addr->ip;
+	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
+	rtcp_exp->dir = !dir;
+	rtcp_exp->expectfn = ip_nat_sip_expected;
+
+	/* Try to get same pair of ports: if not, try to change them. */
+	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
+	     port != 0; port += 2) {
+		rtp_exp->tuple.dst.u.udp.port = htons(port);
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == -EBUSY)
+			continue;
+		else if (ret < 0) {
+			port = 0;
+			break;
+		}
+		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
+		ret = nf_ct_expect_related(rtcp_exp);
+		if (ret == 0)
+			break;
+		/* RTCP failed: drop the RTP expectation registered above so
+		 * its tuple is not rewritten while it is still registered. */
+		nf_ct_unexpect_related(rtp_exp);
+		if (ret != -EBUSY) {
+			port = 0;
+			break;
+		}
+	}
+
+	if (port == 0)
+		goto err1;
+
+	/* Update media port. */
+	if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
+	    !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
+			     mediaoff, medialen, port))
+		goto err2;
+
+	return NF_ACCEPT;
+
+err2:
+	nf_ct_unexpect_related(rtp_exp);
+	nf_ct_unexpect_related(rtcp_exp);
+err1:
+	return NF_DROP;
+}
+
+static void __exit nf_nat_sip_fini(void)
+{
+	rcu_assign_pointer(nf_nat_sip_hook, NULL);
+	rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL);
+	rcu_assign_pointer(nf_nat_sip_expect_hook, NULL);
+	rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL);
+	rcu_assign_pointer(nf_nat_sdp_port_hook, NULL);
+	rcu_assign_pointer(nf_nat_sdp_session_hook, NULL);
+	rcu_assign_pointer(nf_nat_sdp_media_hook, NULL);
+	synchronize_rcu();
+}
+
+static int __init nf_nat_sip_init(void)
+{
+	BUG_ON(nf_nat_sip_hook != NULL);
+	BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
+	BUG_ON(nf_nat_sip_expect_hook != NULL);
+	BUG_ON(nf_nat_sdp_addr_hook != NULL);
+	BUG_ON(nf_nat_sdp_port_hook != NULL);
+	BUG_ON(nf_nat_sdp_session_hook != NULL);
+	BUG_ON(nf_nat_sdp_media_hook != NULL);
+	rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip);
+	rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
+	rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect);
+	rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
+	rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port);
+	rcu_assign_pointer(nf_nat_sdp_session_hook, ip_nat_sdp_session);
+	rcu_assign_pointer(nf_nat_sdp_media_hook, ip_nat_sdp_media);
+	return 0;
+}
+
+module_init(nf_nat_sip_init);
+module_exit(nf_nat_sip_fini);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
new file mode 100644
index 00000000..8812a020
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -0,0 +1,1334 @@
+/*
+ * nf_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network
+ * discovery and monitoring applications where target networks use
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified.  The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+MODULE_ALIAS("ip_nat_snmp_basic");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+#define NOCT1(n) (*(u8 *)(n))
+
+static int debug;
+static DEFINE_SPINLOCK(snmp_lock);
+
+/*
+ * Application layer address mapping mimics the NAT mapping, but
+ * only for the first octet in this case (a more flexible system
+ * can be implemented if needed).
+ */
+struct oct1_map
+{
+	u_int8_t from;	/* first address octet that gets replaced */
+	u_int8_t to;	/* octet written in its place */
+};
+
+
+/*****************************************************************************
+ *
+ * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* Class */
+#define ASN1_UNI	0	/* Universal */
+#define ASN1_APL	1	/* Application */
+#define ASN1_CTX	2	/* Context */
+#define ASN1_PRV	3	/* Private */
+
+/* Tag */
+#define ASN1_EOC	0	/* End Of Contents */
+#define ASN1_BOL	1	/* Boolean */
+#define ASN1_INT	2	/* Integer */
+#define ASN1_BTS	3	/* Bit String */
+#define ASN1_OTS	4	/* Octet String */
+#define ASN1_NUL	5	/* Null */
+#define ASN1_OJI	6	/* Object Identifier  */
+#define ASN1_OJD	7	/* Object Description */
+#define ASN1_EXT	8	/* External */
+#define ASN1_SEQ	16	/* Sequence */
+#define ASN1_SET	17	/* Set */
+#define ASN1_NUMSTR	18	/* Numerical String */
+#define ASN1_PRNSTR	19	/* Printable String */
+#define ASN1_TEXSTR	20	/* Teletext String */
+#define ASN1_VIDSTR	21	/* Video String */
+#define ASN1_IA5STR	22	/* IA5 String */
+#define ASN1_UNITIM	23	/* Universal Time */
+#define ASN1_GENTIM	24	/* General Time */
+#define ASN1_GRASTR	25	/* Graphical String */
+#define ASN1_VISSTR	26	/* Visible String */
+#define ASN1_GENSTR	27	/* General String */
+
+/* Primitive / Constructed methods*/
+#define ASN1_PRI	0	/* Primitive */
+#define ASN1_CON	1	/* Constructed */
+
+/*
+ * Error codes.
+ */
+#define ASN1_ERR_NOERROR		0
+#define ASN1_ERR_DEC_EMPTY		2
+#define ASN1_ERR_DEC_EOC_MISMATCH	3
+#define ASN1_ERR_DEC_LENGTH_MISMATCH	4
+#define ASN1_ERR_DEC_BADVALUE		5
+
+/*
+ * ASN.1 context.
+ */
+struct asn1_ctx
+{
+	int error;			/* Error condition */
+	unsigned char *pointer;		/* Octet just to be decoded */
+	unsigned char *begin;		/* First octet */
+	unsigned char *end;		/* Octet after last octet */
+};
+
+/*
+ * Octet string (not null terminated)
+ */
+struct asn1_octstr
+{
+	unsigned char *data;
+	unsigned int len;
+};
+
+static void asn1_open(struct asn1_ctx *ctx,
+		      unsigned char *buf,
+		      unsigned int len)
+{
+	ctx->begin = buf;
+	ctx->end = buf + len;
+	ctx->pointer = buf;
+	ctx->error = ASN1_ERR_NOERROR;
+}
+
+static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
+{
+	if (ctx->pointer < ctx->end) {
+		*ch = *ctx->pointer++;	/* hand out one octet and advance */
+		return 1;
+	}
+	ctx->error = ASN1_ERR_DEC_EMPTY;	/* nothing left to decode */
+	return 0;
+}
+
+/* Decode a multi-octet tag number: 7 bits per octet, top bit set = more. */
+static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
+{
+	unsigned char octet;
+
+	*tag = 0;
+	for (;;) {
+		if (!asn1_octet_decode(ctx, &octet))
+			return 0;
+		*tag = (*tag << 7) | (octet & 0x7F);
+		if (!(octet & 0x80))
+			break;
+	}
+	return 1;
+}
+
+/* Split the identifier octet into class, constructed flag and tag number. */
+static unsigned char asn1_id_decode(struct asn1_ctx *ctx,
+				    unsigned int *cls,
+				    unsigned int *con,
+				    unsigned int *tag)
+{
+	unsigned char id;
+
+	if (!asn1_octet_decode(ctx, &id))
+		return 0;
+
+	*cls = (id >> 6) & 0x03;
+	*con = (id >> 5) & 0x01;
+	*tag = id & 0x1F;
+
+	/* 0x1F means the real tag number follows in further octets */
+	if (*tag != 0x1F)
+		return 1;
+	return asn1_tag_decode(ctx, tag);
+}
+
+/*
+ * Decode a BER length field.
+ *
+ * Sets *def to 0 for the indefinite form (0x80; *len is left untouched) or
+ * to 1 for a definite length, which is stored in *len.  Fails when the
+ * octets run out or when *len exceeds what is left in the buffer.
+ */
+static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
+					unsigned int *def,
+					unsigned int *len)
+{
+	unsigned char octet, remaining;
+
+	if (!asn1_octet_decode(ctx, &octet))
+		return 0;
+
+	*def = (octet != 0x80);
+	if (*def && octet < 0x80) {
+		/* short form: the octet is the length itself */
+		*len = octet;
+	} else if (*def) {
+		/* long form: low 7 bits count the length octets */
+		*len = 0;
+		for (remaining = octet & 0x7F; remaining > 0; remaining--) {
+			if (!asn1_octet_decode(ctx, &octet))
+				return 0;
+			*len = (*len << 8) | octet;
+		}
+	}
+
+	/* don't trust len bigger than ctx buffer */
+	if (*len > ctx->end - ctx->pointer)
+		return 0;
+
+	return 1;
+}
+
+/* Decode identifier + length; *eoc is the content end, NULL if indefinite. */
+static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
+					unsigned char **eoc,
+					unsigned int *cls,
+					unsigned int *con,
+					unsigned int *tag)
+{
+	unsigned int definite = 0, length = 0;
+
+	if (!asn1_id_decode(ctx, cls, con, tag))
+		return 0;
+	if (!asn1_length_decode(ctx, &definite, &length))
+		return 0;
+
+	if (!definite) {
+		/* primitive shall be definite, indefinite shall be constructed */
+		if (*con == ASN1_PRI)
+			return 0;
+		*eoc = NULL;
+		return 1;
+	}
+
+	*eoc = ctx->pointer + length;
+	return 1;
+}
+
+/*
+ * Check that a value ends here.  With a definite length the read pointer
+ * must sit exactly on eoc; for the indefinite form (eoc == NULL) the next
+ * two octets are read and must both be zero.
+ */
+static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+	unsigned char octet;
+	int i;
+
+	if (eoc != NULL) {
+		if (ctx->pointer == eoc)
+			return 1;
+		ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
+		return 0;
+	}
+
+	/* indefinite form: expect the 0x00 0x00 end-of-contents marker */
+	for (i = 0; i < 2; i++) {
+		if (!asn1_octet_decode(ctx, &octet))
+			return 0;
+		if (octet != 0x00) {
+			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+	ctx->pointer = eoc;	/* nothing to decode: jump to the end of the value */
+	return 1;
+}
+
+/* Decode a signed big-endian integer that ends at eoc into *integer. */
+static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
+				      unsigned char *eoc,
+				      long *integer)
+{
+	unsigned char octet;
+	unsigned int  count = 1;
+
+	if (!asn1_octet_decode(ctx, &octet))
+		return 0;
+
+	/* the first octet carries the sign */
+	*integer = (signed char) octet;
+
+	while (ctx->pointer < eoc) {
+		count++;
+		if (count > sizeof (long)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &octet))
+			return 0;
+		*integer = (*integer << 8) | octet;
+	}
+	return 1;
+}
+
+/* Decode an unsigned big-endian integer that ends at eoc into *integer. */
+static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
+				      unsigned char *eoc,
+				      unsigned int *integer)
+{
+	unsigned char octet;
+	unsigned int  count;
+
+	if (!asn1_octet_decode(ctx, &octet))
+		return 0;
+
+	*integer = octet;
+	/* a leading zero octet does not count against the size limit */
+	count = octet ? 1 : 0;
+
+	while (ctx->pointer < eoc) {
+		count++;
+		if (count > sizeof (unsigned int)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &octet))
+			return 0;
+		*integer = (*integer << 8) | octet;
+	}
+	return 1;
+}
+
+/* Decode an unsigned big-endian integer that ends at eoc into *integer. */
+static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
+				       unsigned char *eoc,
+				       unsigned long *integer)
+{
+	unsigned char octet;
+	unsigned int  count;
+
+	if (!asn1_octet_decode(ctx, &octet))
+		return 0;
+
+	*integer = octet;
+	/* a leading zero octet does not count against the size limit */
+	count = octet ? 1 : 0;
+
+	while (ctx->pointer < eoc) {
+		count++;
+		if (count > sizeof (unsigned long)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+
+		if (!asn1_octet_decode(ctx, &octet))
+			return 0;
+		*integer = (*integer << 8) | octet;
+	}
+	return 1;
+}
+
+/* Copy the octets up to eoc into a newly kmalloc'ed buffer, *len counts them. */
+static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
+					unsigned char *eoc,
+					unsigned char **octets,
+					unsigned int *len)
+{
+	unsigned char *out;
+
+	*len = 0;
+	*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
+	if (*octets == NULL) {
+		if (net_ratelimit())
+			pr_notice("OOM in bsalg (%d)\n", __LINE__);
+		return 0;
+	}
+
+	/* on success the caller owns *octets and has to kfree() it */
+	for (out = *octets; ctx->pointer < eoc; out++, (*len)++) {
+		if (asn1_octet_decode(ctx, out))
+			continue;
+		/* ran out of data: hand nothing back */
+		kfree(*octets);
+		*octets = NULL;
+		return 0;
+	}
+	return 1;
+}
+
+/* Decode one base-128 sub-identifier (top bit set = more octets follow). */
+static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
+				       unsigned long *subid)
+{
+	unsigned char octet;
+
+	*subid = 0;
+
+	for (;;) {
+		if (!asn1_octet_decode(ctx, &octet))
+			return 0;
+		*subid = (*subid << 7) | (octet & 0x7F);
+		if ((octet & 0x80) != 0x80)
+			return 1;
+	}
+}
+
+static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
+				     unsigned char *eoc,
+				     unsigned long **oid,
+				     unsigned int *len)
+{
+	unsigned long subid;
+	unsigned long *optr;
+	size_t size;
+
+	size = eoc - ctx->pointer + 1;
+	/* worst case is one subid per content octet, plus one (see below) */
+	/* first subid actually encodes first two subids */
+	if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+		return 0;
+	/* on success the caller owns *oid and has to kfree() it */
+	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
+	if (*oid == NULL) {
+		if (net_ratelimit())
+			pr_notice("OOM in bsalg (%d)\n", __LINE__);
+		return 0;
+	}
+
+	optr = *oid;
+
+	if (!asn1_subid_decode(ctx, &subid)) {
+		kfree(*oid);
+		*oid = NULL;
+		return 0;
+	}
+	/* undo the packing subid = 40 * first + second (first is 0, 1 or 2) */
+	if (subid < 40) {
+		optr [0] = 0;
+		optr [1] = subid;
+	} else if (subid < 80) {
+		optr [0] = 1;
+		optr [1] = subid - 40;
+	} else {
+		optr [0] = 2;
+		optr [1] = subid - 80;
+	}
+
+	*len = 2;
+	optr += 2;
+
+	while (ctx->pointer < eoc) {
+		if (++(*len) > size) {	/* never write past the allocation */
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			kfree(*oid);
+			*oid = NULL;
+			return 0;
+		}
+
+		if (!asn1_subid_decode(ctx, optr++)) {
+			kfree(*oid);
+			*oid = NULL;
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*****************************************************************************
+ *
+ * SNMP decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* SNMP Versions */
+#define SNMP_V1				0
+#define SNMP_V2C			1
+#define SNMP_V2				2
+#define SNMP_V3				3
+
+/* Default Sizes */
+#define SNMP_SIZE_COMM			256
+#define SNMP_SIZE_OBJECTID		128
+#define SNMP_SIZE_BUFCHR		256
+#define SNMP_SIZE_BUFINT		128
+#define SNMP_SIZE_SMALLOBJECTID		16
+
+/* Requests */
+#define SNMP_PDU_GET			0
+#define SNMP_PDU_NEXT			1
+#define SNMP_PDU_RESPONSE		2
+#define SNMP_PDU_SET			3
+#define SNMP_PDU_TRAP1			4
+#define SNMP_PDU_BULK			5
+#define SNMP_PDU_INFORM			6
+#define SNMP_PDU_TRAP2			7
+
+/* Errors */
+#define SNMP_NOERROR			0
+#define SNMP_TOOBIG			1
+#define SNMP_NOSUCHNAME			2
+#define SNMP_BADVALUE			3
+#define SNMP_READONLY			4
+#define SNMP_GENERROR			5
+#define SNMP_NOACCESS			6
+#define SNMP_WRONGTYPE			7
+#define SNMP_WRONGLENGTH		8
+#define SNMP_WRONGENCODING		9
+#define SNMP_WRONGVALUE			10
+#define SNMP_NOCREATION			11
+#define SNMP_INCONSISTENTVALUE		12
+#define SNMP_RESOURCEUNAVAILABLE	13
+#define SNMP_COMMITFAILED		14
+#define SNMP_UNDOFAILED			15
+#define SNMP_AUTHORIZATIONERROR		16
+#define SNMP_NOTWRITABLE		17
+#define SNMP_INCONSISTENTNAME		18
+
+/* General SNMP V1 Traps */
+#define SNMP_TRAP_COLDSTART		0
+#define SNMP_TRAP_WARMSTART		1
+#define SNMP_TRAP_LINKDOWN		2
+#define SNMP_TRAP_LINKUP		3
+#define SNMP_TRAP_AUTFAILURE		4
+#define SNMP_TRAP_EQPNEIGHBORLOSS	5
+#define SNMP_TRAP_ENTSPECIFIC		6
+
+/* SNMPv1 Types */
+#define SNMP_NULL                0
+#define SNMP_INTEGER             1    /* l  */
+#define SNMP_OCTETSTR            2    /* c  */
+#define SNMP_DISPLAYSTR          2    /* c  */
+#define SNMP_OBJECTID            3    /* ul */
+#define SNMP_IPADDR              4    /* uc */
+#define SNMP_COUNTER             5    /* ul */
+#define SNMP_GAUGE               6    /* ul */
+#define SNMP_TIMETICKS           7    /* ul */
+#define SNMP_OPAQUE              8    /* c  */
+
+/* Additional SNMPv2 Types */
+#define SNMP_UINTEGER            5    /* ul */
+#define SNMP_BITSTR              9    /* uc */
+#define SNMP_NSAP               10    /* uc */
+#define SNMP_COUNTER64          11    /* ul */
+#define SNMP_NOSUCHOBJECT       12
+#define SNMP_NOSUCHINSTANCE     13
+#define SNMP_ENDOFMIBVIEW       14
+
+union snmp_syntax
+{
+	unsigned char uc[0];	/* 8 bit unsigned */
+	char c[0];		/* 8 bit, plain char */
+	unsigned long ul[0];	/* unsigned long (wider than 32 bit on LP64) */
+	long l[0];		/* long (wider than 32 bit on LP64) */
+};
+
+struct snmp_object
+{
+	unsigned long *id;
+	unsigned int id_len;
+	unsigned short type;
+	unsigned int syntax_len;
+	union snmp_syntax syntax;
+};
+
+struct snmp_request
+{
+	unsigned long id;
+	unsigned int error_status;
+	unsigned int error_index;
+};
+
+struct snmp_v1_trap
+{
+	unsigned long *id;
+	unsigned int id_len;
+	unsigned long ip_address;	/* pointer  */
+	unsigned int general;
+	unsigned int specific;
+	unsigned long time;
+};
+
+/* SNMP types */
+#define SNMP_IPA    0
+#define SNMP_CNT    1
+#define SNMP_GGE    2
+#define SNMP_TIT    3
+#define SNMP_OPQ    4
+#define SNMP_C64    6
+
+/* SNMP errors */
+#define SERR_NSO    0
+#define SERR_NSI    1
+#define SERR_EOM    2
+
+static inline void mangle_address(unsigned char *begin,
+				  unsigned char *addr,
+				  const struct oct1_map *map,
+				  __sum16 *check);
+struct snmp_cnv
+{
+	unsigned int class;
+	unsigned int tag;
+	int syntax;
+};
+
+static const struct snmp_cnv snmp_conv[] = {
+	{ASN1_UNI, ASN1_NUL, SNMP_NULL},
+	{ASN1_UNI, ASN1_INT, SNMP_INTEGER},
+	{ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
+	{ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
+	{ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
+	{ASN1_APL, SNMP_IPA, SNMP_IPADDR},
+	{ASN1_APL, SNMP_CNT, SNMP_COUNTER},	/* Counter32 */
+	{ASN1_APL, SNMP_GGE, SNMP_GAUGE},	/* Gauge32 == Unsigned32  */
+	{ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
+	{ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
+
+	/* SNMPv2 data types and errors */
+	{ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
+	{ASN1_APL, SNMP_C64, SNMP_COUNTER64},
+	{ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
+	{ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
+	{ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
+	{0,       0,       -1}
+};
+
+/* Look up the (class, tag) pair in snmp_conv; store its SNMP type on a hit. */
+static unsigned char snmp_tag_cls2syntax(unsigned int tag,
+					 unsigned int cls,
+					 unsigned short *syntax)
+{
+	const struct snmp_cnv *entry;
+
+	/* the table ends with a syntax of -1 */
+	for (entry = snmp_conv; entry->syntax != -1; entry++) {
+		if (entry->tag != tag || entry->class != cls)
+			continue;
+
+		*syntax = entry->syntax;
+		return 1;
+	}
+	return 0;
+}
+
+static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
+					struct snmp_object **obj)
+{
+	unsigned int cls, con, tag, len, idlen;
+	unsigned short type;
+	unsigned char *eoc, *end, *p;
+	unsigned long *lp, *id;
+	unsigned long ul;
+	long l;
+
+	*obj = NULL;	/* is NULL again on every failure return below */
+	id = NULL;
+	/* outer wrapper: must be a constructed universal SEQUENCE */
+	if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
+		return 0;
+
+	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+		return 0;
+	/* name: a primitive OBJECT IDENTIFIER, decoded into id (kmalloc'ed) */
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+		return 0;
+
+	if (!asn1_oid_decode(ctx, end, &id, &idlen))
+		return 0;
+	/* value header: primitive, with a (class, tag) listed in snmp_conv */
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
+		kfree(id);
+		return 0;
+	}
+
+	if (con != ASN1_PRI) {
+		kfree(id);
+		return 0;
+	}
+
+	type = 0;
+	if (!snmp_tag_cls2syntax(tag, cls, &type)) {
+		kfree(id);
+		return 0;
+	}
+
+	l = 0;
+	switch (type) {
+		case SNMP_INTEGER:
+			len = sizeof(long);
+			if (!asn1_long_decode(ctx, end, &l)) {
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len,
+				       GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					pr_notice("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			(*obj)->syntax.l[0] = l;
+			break;
+		case SNMP_OCTETSTR:
+		case SNMP_OPAQUE:
+			if (!asn1_octets_decode(ctx, end, &p, &len)) {
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len,
+				       GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(p);
+				kfree(id);
+				if (net_ratelimit())
+					pr_notice("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			memcpy((*obj)->syntax.c, p, len);
+			kfree(p);
+			break;
+		case SNMP_NULL:
+		case SNMP_NOSUCHOBJECT:
+		case SNMP_NOSUCHINSTANCE:
+		case SNMP_ENDOFMIBVIEW:
+			len = 0;
+			*obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					pr_notice("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			if (!asn1_null_decode(ctx, end)) {
+				kfree(id);
+				kfree(*obj);
+				*obj = NULL;
+				return 0;
+			}
+			break;
+		case SNMP_OBJECTID:
+			if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
+				kfree(id);
+				return 0;
+			}
+			len *= sizeof(unsigned long);	/* subid count -> bytes */
+			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(lp);
+				kfree(id);
+				if (net_ratelimit())
+					pr_notice("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			memcpy((*obj)->syntax.ul, lp, len);
+			kfree(lp);
+			break;
+		case SNMP_IPADDR:
+			if (!asn1_octets_decode(ctx, end, &p, &len)) {
+				kfree(id);
+				return 0;
+			}
+			if (len != 4) {
+				kfree(p);
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(p);
+				kfree(id);
+				if (net_ratelimit())
+					pr_notice("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			memcpy((*obj)->syntax.uc, p, len);
+			kfree(p);
+			break;
+		case SNMP_COUNTER:
+		case SNMP_GAUGE:
+		case SNMP_TIMETICKS:
+			len = sizeof(unsigned long);
+			if (!asn1_ulong_decode(ctx, end, &ul)) {
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					pr_notice("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			(*obj)->syntax.ul[0] = ul;
+			break;
+		default:	/* in snmp_conv but not decoded here, e.g. SNMP_COUNTER64 */
+			kfree(id);
+			return 0;
+	}
+	/* from here on *obj carries id; the caller frees obj->id and obj */
+	(*obj)->syntax_len = len;
+	(*obj)->type = type;
+	(*obj)->id = id;
+	(*obj)->id_len = idlen;
+
+	if (!asn1_eoc_decode(ctx, eoc)) {
+		kfree(id);
+		kfree(*obj);
+		*obj = NULL;
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Decode the three INTEGER fields that start a request PDU, in order:
+ * request id, error status and error index.
+ */
+static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
+					 struct snmp_request *request)
+{
+	unsigned int cls, con, tag;
+	unsigned char *end;
+
+	/* request id */
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (!(cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))
+		return 0;
+	if (!asn1_ulong_decode(ctx, end, &request->id))
+		return 0;
+
+	/* error status */
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (!(cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))
+		return 0;
+	if (!asn1_uint_decode(ctx, end, &request->error_status))
+		return 0;
+
+	/* error index */
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (!(cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))
+		return 0;
+	if (!asn1_uint_decode(ctx, end, &request->error_index))
+		return 0;
+	return 1;
+}
+
+/*
+ * Fast checksum update for possibly oddly-aligned UDP byte, from the
+ * code example in the draft.
+ */
+static void fast_csum(__sum16 *csum,
+		      const unsigned char *optr,
+		      const unsigned char *nptr,
+		      int offset)
+{
+	/*
+	 * s[0..1] holds the complemented old octet next to a complemented
+	 * zero octet, s[2..3] the new octet next to a zero octet.  Which of
+	 * the two byte positions the octet takes follows the parity of
+	 * offset.
+	 */
+	const unsigned int odd = offset & 1;
+	unsigned char s[4];
+
+	s[odd] = ~*optr;
+	s[!odd] = ~0;
+	s[2 + odd] = *nptr;
+	s[2 + !odd] = 0;
+
+	*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
+}
+
+/*
+ * Mangle IP address.
+ *	- begin points to the start of the snmp message
+ *	- addr points to the start of the address
+ */
+static inline void mangle_address(unsigned char *begin,
+				  unsigned char *addr,
+				  const struct oct1_map *map,
+				  __sum16 *check)
+{
+	if (map->from == NOCT1(addr)) {
+		u_int32_t old;
+
+		/* Snapshot unconditionally: "debug" is a writable module
+		 * parameter and may be switched on before the printk below. */
+		memcpy(&old, addr, sizeof(old));
+
+		*addr = map->to;
+
+		/* Update UDP checksum if being used */
+		if (*check) {
+			fast_csum(check,
+				  &map->from, &map->to, addr - begin);
+		}
+
+		if (debug)
+			printk(KERN_DEBUG "bsalg: mapped %pI4 to %pI4\n",
+			       &old, addr);
+	}
+}
+
+static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
+				      struct snmp_v1_trap *trap,
+				      const struct oct1_map *map,
+				      __sum16 *check)
+{
+	unsigned int cls, con, tag, len;
+	unsigned char *end;
+
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+		return 0;
+	/* trap->id is allocated here; err_id_free releases it on failure */
+	if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
+		return 0;
+
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_id_free;
+
+	if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
+	      (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
+		goto err_id_free;
+
+	if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
+		goto err_id_free;
+
+	/* IPv4 only */
+	if (len != 4)
+		goto err_addr_free;
+	/* the four octets just consumed sit right before ctx->pointer */
+	mangle_address(ctx->begin, ctx->pointer - 4, map, check);
+
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_addr_free;
+
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		goto err_addr_free;
+
+	if (!asn1_uint_decode(ctx, end, &trap->general))
+		goto err_addr_free;
+
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_addr_free;
+
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		goto err_addr_free;
+
+	if (!asn1_uint_decode(ctx, end, &trap->specific))
+		goto err_addr_free;
+
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_addr_free;
+
+	if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
+	      (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
+		goto err_addr_free;
+
+	if (!asn1_ulong_decode(ctx, end, &trap->time))
+		goto err_addr_free;
+	/* success: the caller has to kfree trap->id and trap->ip_address */
+	return 1;
+
+err_addr_free:
+	kfree((unsigned long *)trap->ip_address);
+
+err_id_free:
+	kfree(trap->id);
+
+	return 0;
+}
+
+/*****************************************************************************
+ *
+ * Misc. routines
+ *
+ *****************************************************************************/
+
+static void hex_dump(const unsigned char *buf, size_t len)
+{
+	size_t pos;
+
+	for (pos = 0; pos < len; pos++) {	/* sixteen hex bytes per line */
+		if (pos != 0 && (pos % 16) == 0)
+			printk("\n");
+		printk("%02x ", buf[pos]);
+	}
+	printk("\n");
+}
+
+/*
+ * Parse and mangle SNMP message according to mapping.
+ * (And this is the 'basic' method.)  Returns 0 on a parse error, else 1.
+ */
+static int snmp_parse_mangle(unsigned char *msg,
+			     u_int16_t len,
+			     const struct oct1_map *map,
+			     __sum16 *check)
+{
+	unsigned char *eoc, *end;
+	unsigned int cls, con, tag, vers, pdutype;
+	struct asn1_ctx ctx;
+	struct asn1_octstr comm;
+	struct snmp_object *obj;
+
+	if (debug > 1)
+		hex_dump(msg, len);
+
+	asn1_open(&ctx, msg, len);
+
+	/*
+	 * Start of SNMP message.
+	 */
+	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+		return 0;
+	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+		return 0;
+
+	/*
+	 * Version 1 or 2 handled.
+	 */
+	if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		return 0;
+	if (!asn1_uint_decode (&ctx, end, &vers))
+		return 0;
+	if (debug > 1)
+		printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+	if (vers > 1)	/* above SNMP_V2C: accept without touching the packet */
+		return 1;
+
+	/*
+	 * Community.
+	 */
+	if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
+		return 0;
+	if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
+		return 0;
+	if (debug > 1) {
+		unsigned int i;
+
+		printk(KERN_DEBUG "bsalg: community: ");
+		for (i = 0; i < comm.len; i++)
+			printk("%c", comm.data[i]);
+		printk("\n");
+	}
+	kfree(comm.data);	/* only needed for the debug output above */
+
+	/*
+	 * PDU type; only responses and traps are parsed any further.
+	 */
+	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
+		return 0;
+	if (cls != ASN1_CTX || con != ASN1_CON)
+		return 0;
+	if (debug > 1) {
+		static const unsigned char *const pdus[] = {
+			[SNMP_PDU_GET] = "get",
+			[SNMP_PDU_NEXT] = "get-next",
+			[SNMP_PDU_RESPONSE] = "response",
+			[SNMP_PDU_SET] = "set",
+			[SNMP_PDU_TRAP1] = "trapv1",
+			[SNMP_PDU_BULK] = "bulk",
+			[SNMP_PDU_INFORM] = "inform",
+			[SNMP_PDU_TRAP2] = "trapv2"
+		};
+
+		if (pdutype > SNMP_PDU_TRAP2)
+			printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+		else
+			printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+	}
+	if (pdutype != SNMP_PDU_RESPONSE &&
+	    pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
+		return 1;
+
+	/*
+	 * Request header or v1 trap
+	 */
+	if (pdutype == SNMP_PDU_TRAP1) {
+		struct snmp_v1_trap trap;
+		unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
+
+		if (ret) {
+			kfree(trap.id);
+			kfree((unsigned long *)trap.ip_address);
+		} else
+			return ret;
+
+	} else {
+		struct snmp_request req;
+
+		if (!snmp_request_decode(&ctx, &req))
+			return 0;
+
+		if (debug > 1)
+			printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+			"error_index=%u\n", req.id, req.error_status,
+			req.error_index);
+	}
+
+	/*
+	 * Loop through objects, look for IP addresses to mangle.
+	 */
+	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+		return 0;
+
+	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+		return 0;
+	/* keep going until asn1_eoc_decode() reports the end of the list */
+	while (!asn1_eoc_decode(&ctx, eoc)) {
+		unsigned int i;
+
+		if (!snmp_object_decode(&ctx, &obj)) {
+			if (obj) {
+				kfree(obj->id);
+				kfree(obj);
+			}
+			return 0;
+		}
+
+		if (debug > 1) {
+			printk(KERN_DEBUG "bsalg: object: ");
+			for (i = 0; i < obj->id_len; i++) {
+				if (i > 0)
+					printk(".");
+				printk("%lu", obj->id[i]);
+			}
+			printk(": type=%u\n", obj->type);
+
+		}
+		/* the address octets are the last four bytes just decoded */
+		if (obj->type == SNMP_IPADDR)
+			mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+
+		kfree(obj->id);
+		kfree(obj);
+	}
+
+	if (!asn1_eoc_decode(&ctx, eoc))
+		return 0;
+
+	return 1;
+}
+
+/*****************************************************************************
+ *
+ * NAT routines.
+ *
+ *****************************************************************************/
+
+/*
+ * SNMP translation routine.
+ */
+static int snmp_translate(struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+	u_int16_t udplen = ntohs(udph->len);
+	u_int16_t paylen = udplen - sizeof(struct udphdr);
+	int dir = CTINFO2DIR(ctinfo);
+	struct oct1_map map;
+
+	/*
+	 * Determine mapping for application layer addresses based
+	 * on NAT manipulations for the packet.
+	 */
+	if (dir == IP_CT_DIR_ORIGINAL) {
+		/* SNAT traps */
+		map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
+		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
+	} else {
+		/* DNAT replies -- NOTE(review): same expressions as above */
+		map.from = NOCT1(&ct->tuplehash[dir].tuple.src.u3.ip);
+		map.to = NOCT1(&ct->tuplehash[!dir].tuple.dst.u3.ip);
+	}
+
+	if (map.from == map.to)	/* first octets agree: nothing to rewrite */
+		return NF_ACCEPT;
+
+	if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
+			       paylen, &map, &udph->check)) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "bsalg: parser failed\n");
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted */
+static int help(struct sk_buff *skb, unsigned int protoff,
+		struct nf_conn *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	unsigned int ret;
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct udphdr *udph = (struct udphdr *)((__be32 *)iph + iph->ihl);
+
+	/* SNMP replies and originating SNMP traps get mangled */
+	if (udph->source == htons(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+		return NF_ACCEPT;
+	if (udph->dest == htons(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	/* No NAT? */
+	if (!(ct->status & IPS_NAT_MASK))
+		return NF_ACCEPT;
+
+	/*
+	 * Make sure the packet length is ok.  So far, we were only guaranteed
+	 * to have a valid length IP header plus 8 bytes, which means we have
+	 * enough room for a UDP header.  Just verify the UDP length field so we
+	 * can mess around with the payload.
+	 */
+	if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
+		 if (net_ratelimit())
+			 printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
+				&iph->saddr, &iph->daddr);
+		 return NF_DROP;
+	}
+	/* the payload is rewritten in place, so the skb must be writable */
+	if (!skb_make_writable(skb, skb->len))
+		return NF_DROP;
+	/* NOTE(review): what snmp_lock protects is not visible in this file */
+	spin_lock_bh(&snmp_lock);
+	ret = snmp_translate(ct, ctinfo, skb);
+	spin_unlock_bh(&snmp_lock);
+	return ret;
+}
+
+static const struct nf_conntrack_expect_policy snmp_exp_policy = {
+	.max_expected	= 0,
+	.timeout	= 180,
+};
+/* NOTE(review): init registers snmp_trap_helper only, never this helper */
+static struct nf_conntrack_helper snmp_helper __read_mostly = {
+	.me			= THIS_MODULE,
+	.help			= help,
+	.expect_policy		= &snmp_exp_policy,
+	.name			= "snmp",
+	.tuple.src.l3num	= AF_INET,
+	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_PORT),
+	.tuple.dst.protonum	= IPPROTO_UDP,
+};
+
+static struct nf_conntrack_helper snmp_trap_helper __read_mostly = {
+	.me			= THIS_MODULE,
+	.help			= help,
+	.expect_policy		= &snmp_exp_policy,
+	.name			= "snmp_trap",
+	.tuple.src.l3num	= AF_INET,
+	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_TRAP_PORT),
+	.tuple.dst.protonum	= IPPROTO_UDP,
+};
+
+/*****************************************************************************
+ *
+ * Module stuff.
+ *
+ *****************************************************************************/
+
+/* Install hook and trap helper; on failure take the hook back out again. */
+static int __init nf_nat_snmp_basic_init(void)
+{
+	int ret;
+
+	BUG_ON(nf_nat_snmp_hook != NULL);
+	rcu_assign_pointer(nf_nat_snmp_hook, help);
+	ret = nf_conntrack_helper_register(&snmp_trap_helper);
+	if (ret < 0) {
+		rcu_assign_pointer(nf_nat_snmp_hook, NULL);	/* all we set up */
+		synchronize_rcu();	/* no reader may still hold help() */
+	}
+	return ret;
+}
+
+static void __exit nf_nat_snmp_basic_fini(void)
+{
+	rcu_assign_pointer(nf_nat_snmp_hook, NULL);	/* unpublish help() */
+	nf_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(nf_nat_snmp_basic_init);
+module_exit(nf_nat_snmp_basic_fini);
+
+module_param(debug, int, 0600);
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
new file mode 100644
index 00000000..483b76d0
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -0,0 +1,326 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/spinlock.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_nat_protocol.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#ifdef CONFIG_XFRM
+static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	struct flowi4 *fl4 = &fl->u.ip4;
+	const struct nf_conntrack_tuple *tuple;
+	enum ip_conntrack_info ctinfo;
+	enum ip_conntrack_dir dir;
+	unsigned long dst_bit, src_bit;
+	const struct nf_conn *ct;
+	bool has_ports;
+
+	/* Nothing to rewrite when the packet has no conntrack entry. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return;
+
+	dir = CTINFO2DIR(ctinfo);
+	tuple = &ct->tuplehash[dir].tuple;
+
+	/* The status bit guarding the destination side depends on direction;
+	 * the one guarding the source side is that bit XOR IPS_NAT_MASK. */
+	dst_bit = (dir == IP_CT_DIR_ORIGINAL) ? IPS_DST_NAT : IPS_SRC_NAT;
+	src_bit = dst_bit ^ IPS_NAT_MASK;
+
+	/* Port fields are copied into the flow only for these protocols. */
+	has_ports = tuple->dst.protonum == IPPROTO_TCP ||
+		    tuple->dst.protonum == IPPROTO_UDP ||
+		    tuple->dst.protonum == IPPROTO_UDPLITE ||
+		    tuple->dst.protonum == IPPROTO_DCCP ||
+		    tuple->dst.protonum == IPPROTO_SCTP;
+
+	if (ct->status & dst_bit) {
+		fl4->daddr = tuple->dst.u3.ip;
+		if (has_ports)
+			fl4->fl4_dport = tuple->dst.u.tcp.port;
+	}
+
+	if (ct->status & src_bit) {
+		fl4->saddr = tuple->src.u3.ip;
+		if (has_ports)
+			fl4->fl4_sport = tuple->src.u.tcp.port;
+	}
+}
+#endif
+
+static unsigned int
+nf_nat_fn(unsigned int hooknum,
+	  struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  int (*okfn)(struct sk_buff *))
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_nat *nat;
+	/* maniptype == SRC for postrouting. */
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+	/* We never see fragments: conntrack defrags on pre-routing
+	   and local-out, and nf_nat_out protects post-routing. */
+	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));
+
+	ct = nf_ct_get(skb, &ctinfo);
+	/* Can't track?  It's not due to stress, or conntrack would
+	   have dropped it.  Hence it's the user's responsibility to
+	   packet filter it out, or implement conntrack/NAT for that
+	   protocol. 8) --RR */
+	if (!ct)
+		return NF_ACCEPT;
+
+	/* Don't try to NAT if this packet is not conntracked */
+	if (nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+	if (!nat) {
+		/* NAT module was loaded late: confirmed entries are left alone. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL) {
+			pr_debug("failed to add NAT extension\n");
+			return NF_ACCEPT;
+		}
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(ct, ctinfo,
+							   hooknum, skb))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+	case IP_CT_NEW:
+
+		/* Seen it before?  This can happen for loopback, retrans,
+		   or local packets.  Otherwise look up a rule for it now. */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			unsigned int ret;
+
+			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			if (ret != NF_ACCEPT)
+				return ret;
+		} else
+			pr_debug("Already setup manip %s for ct %p\n",
+				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
+				 ct);
+		break;
+
+	default:
+		/* ESTABLISHED (either direction); nothing to set up. */
+		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
+			     ctinfo == IP_CT_ESTABLISHED_REPLY);
+	}
+
+	return nf_nat_packet(ct, ctinfo, hooknum, skb);	/* its verdict is ours */
+}
+
+static unsigned int
+nf_nat_in(unsigned int hooknum,
+	  struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  int (*okfn)(struct sk_buff *))
+{
+	const __be32 daddr_before = ip_hdr(skb)->daddr;
+	unsigned int verdict = nf_nat_fn(hooknum, skb, in, out, okfn);
+
+	/* For kept packets, drop the dst once NAT changed the destination. */
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+	if (daddr_before != ip_hdr(skb)->daddr)
+		skb_dst_drop(skb);
+	return verdict;
+}
+
+static unsigned int
+nf_nat_out(unsigned int hooknum,
+	   struct sk_buff *skb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
+{
+#ifdef CONFIG_XFRM
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+#endif
+	unsigned int ret;
+
+	/* Too short for an IPv4 header (root playing with raw sockets). */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all)
+		   )	/* source address or protocol id differs between directions */
+			return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int
+nf_nat_local_fn(unsigned int hooknum,
+		struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
+{
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int ret;
+
+	/* Too short for an IPv4 header (root playing with raw sockets). */
+	if (skb->len < sizeof(struct iphdr) ||
+	    ip_hdrlen(skb) < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+		    ct->tuplehash[!dir].tuple.src.u3.ip) {
+			if (ip_route_me_harder(skb, RTN_UNSPEC))	/* address differs: reroute */
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (ip_xfrm_me_harder(skb))	/* only the protocol id differs */
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+/* We must be after connection tracking and before packet filtering. */
+
+static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
+	/* PRE_ROUTING at destination-NAT priority: handled by nf_nat_in */
+	{
+		.hook		= nf_nat_in,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* POST_ROUTING at source-NAT priority: handled by nf_nat_out */
+	{
+		.hook		= nf_nat_out,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+	/* LOCAL_OUT at destination-NAT priority: handled by nf_nat_local_fn */
+	{
+		.hook		= nf_nat_local_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* LOCAL_IN at source-NAT priority: nf_nat_fn is invoked directly */
+	{
+		.hook		= nf_nat_fn,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+};
+
+static int __init nf_nat_standalone_init(void)
+{
+	int ret = 0;
+
+	need_ipv4_conntrack();
+
+#ifdef CONFIG_XFRM
+	BUG_ON(ip_nat_decode_session != NULL);	/* nobody else may own the hook */
+	rcu_assign_pointer(ip_nat_decode_session, nat_decode_session);
+#endif
+	ret = nf_nat_rule_init();
+	if (ret < 0) {
+		pr_err("nf_nat_init: can't setup rules.\n");
+		goto cleanup_decode_session;
+	}
+	ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
+	if (ret < 0) {
+		pr_err("nf_nat_init: can't register hooks.\n");
+		goto cleanup_rule_init;
+	}
+	return ret;
+
+ cleanup_rule_init:	/* unwind in reverse order of setup */
+	nf_nat_rule_cleanup();
+ cleanup_decode_session:
+#ifdef CONFIG_XFRM
+	rcu_assign_pointer(ip_nat_decode_session, NULL);
+	synchronize_net();	/* wait after retracting the hook */
+#endif
+	return ret;
+}
+
+static void __exit nf_nat_standalone_fini(void)
+{
+	nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));	/* reverse of init */
+	nf_nat_rule_cleanup();
+#ifdef CONFIG_XFRM
+	rcu_assign_pointer(ip_nat_decode_session, NULL);
+	synchronize_net();	/* wait after retracting the hook */
+#endif
+	/* Conntrack caches are unregistered in nf_conntrack_cleanup */
+}
+
+module_init(nf_nat_standalone_init);
+module_exit(nf_nat_standalone_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat");
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
new file mode 100644
index 00000000..7274a43c
--- /dev/null
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -0,0 +1,51 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/udp.h>
+
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/nf_nat_rule.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP NAT helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_nat_tftp");
+
+static unsigned int help(struct sk_buff *skb,
+			 enum ip_conntrack_info ctinfo,
+			 struct nf_conntrack_expect *exp)
+{
+	const struct nf_conn *master = exp->master;
+
+	/* Save the master's original source port, expect the related flow
+	 * in the reply direction and let nf_nat_follow_master set it up. */
+	exp->saved_proto.udp.port =
+		master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+	exp->dir = IP_CT_DIR_REPLY;
+	exp->expectfn = nf_nat_follow_master;
+	return nf_ct_expect_related(exp) != 0 ? NF_DROP : NF_ACCEPT;
+}
+
+static void __exit nf_nat_tftp_fini(void)
+{
+	rcu_assign_pointer(nf_nat_tftp_hook, NULL);	/* retract the NAT hook */
+	synchronize_rcu();	/* wait for readers that may still hold help() */
+}
+
+static int __init nf_nat_tftp_init(void)
+{
+	BUG_ON(nf_nat_tftp_hook != NULL);	/* the hook must not be owned yet */
+	rcu_assign_pointer(nf_nat_tftp_hook, help);	/* publish help() */
+	return 0;
+}
+
+module_init(nf_nat_tftp_init);
+module_exit(nf_nat_tftp_fini);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 00000000..39b403f8
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,931 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		"Ping" sockets
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors:	Vasiliy Kulikov / Openwall (for Linux 2.6),
+ *		Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ *
+ */
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+
+static struct ping_table ping_table;	/* hash buckets plus the rwlock guarding them */
+
+static u16 ping_port_rover;	/* last ident auto-assigned by ping_v4_get_port() */
+
+static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
+{
+	int res = (num + net_hash_mix(net)) & mask;	/* never exceeds mask */
+	pr_debug("hash(%d) = %d\n", num, res);
+	return res;
+}
+
+static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
+					     struct net *net, unsigned num)
+{
+	return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];	/* bucket head for num */
+}
+
+static int ping_v4_get_port(struct sock *sk, unsigned short ident)
+{
+	struct hlist_nulls_node *node;
+	struct hlist_nulls_head *hlist;
+	struct inet_sock *isk, *isk2;
+	struct sock *sk2 = NULL;
+
+	isk = inet_sk(sk);
+	write_lock_bh(&ping_table.lock);
+	if (ident == 0) {	/* caller wants any free ident */
+		u32 i;
+		u16 result = ping_port_rover + 1;	/* resume after the last one */
+
+		for (i = 0; i < (1L << 16); i++, result++) {
+			if (!result)
+				result++; /* avoid zero */
+			hlist = ping_hashslot(&ping_table, sock_net(sk),
+					    result);
+			ping_portaddr_for_each_entry(sk2, node, hlist) {
+				isk2 = inet_sk(sk2);
+
+				if (isk2->inet_num == result)
+					goto next_port;
+			}
+
+			/* found: no socket listed in the slot owns this ident */
+			ping_port_rover = ident = result;
+			break;
+next_port:
+			;
+		}
+		if (i >= (1L << 16))	/* every candidate was taken */
+			goto fail;
+	} else {
+		hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+		ping_portaddr_for_each_entry(sk2, node, hlist) {
+			isk2 = inet_sk(sk2);
+
+			if ((isk2->inet_num == ident) &&
+			    (sk2 != sk) &&
+			    (!sk2->sk_reuse || !sk->sk_reuse))	/* sharing needs reuse on both */
+				goto fail;
+		}
+	}
+
+	pr_debug("found port/ident = %d\n", ident);
+	isk->inet_num = ident;
+	if (sk_unhashed(sk)) {
+		pr_debug("was not hashed\n");
+		sock_hold(sk);	/* dropped again by ping_v4_unhash() */
+		hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	}
+	write_unlock_bh(&ping_table.lock);
+	return 0;
+
+fail:
+	write_unlock_bh(&ping_table.lock);
+	return 1;	/* non-zero means failure; ping_bind() maps it to -EADDRINUSE */
+}
+
+static void ping_v4_hash(struct sock *sk)
+{
+	pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+	BUG(); /* must never run: sockets are hashed in ping_v4_get_port() */
+}
+
+static void ping_v4_unhash(struct sock *sk)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+	if (sk_hashed(sk)) {
+		write_lock_bh(&ping_table.lock);
+		hlist_nulls_del(&sk->sk_nulls_node);
+		sock_put(sk);	/* pairs with sock_hold() in ping_v4_get_port() */
+		isk->inet_num = isk->inet_sport = 0;	/* socket is unbound again */
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		write_unlock_bh(&ping_table.lock);
+	}
+}
+
+static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,	/* saddr is unused */
+				   u16 ident, int dif)
+{
+	struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+	struct sock *sk = NULL;
+	struct inet_sock *isk;
+	struct hlist_nulls_node *hnode;
+
+	pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
+			 (int)ident, (unsigned long)daddr, dif);
+	read_lock_bh(&ping_table.lock);
+
+	ping_portaddr_for_each_entry(sk, hnode, hslot) {
+		isk = inet_sk(sk);
+
+		pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
+			 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
+			 sk->sk_bound_dev_if);
+
+		pr_debug("iterate\n");
+		if (isk->inet_num != ident)
+			continue;
+		if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr)	/* 0 matches any */
+			continue;
+		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)	/* 0 matches any */
+			continue;
+
+		sock_hold(sk);	/* returned with a reference; callers sock_put() it */
+		goto exit;
+	}
+
+	sk = NULL;
+exit:
+	read_unlock_bh(&ping_table.lock);
+
+	return sk;
+}
+
+static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
+					  gid_t *high)
+{
+	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);	/* lock shared with local ports */
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));	/* reread if retry is signalled */
+}
+
+
+static int ping_init_sock(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	gid_t group = current_egid();
+	gid_t range[2];
+	struct group_info *group_info = get_current_groups();
+	int i, j, cp_count, ret = 0, count = group_info->ngroups;
+
+	/* Permit if the egid or any supplementary gid is inside the range. */
+	inet_get_ping_group_range_net(net, range, range+1);
+	if (range[0] <= group && group <= range[1])
+		goto out_release_group;
+	for (i = 0; i < group_info->nblocks; i++) {
+		cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
+		for (j = 0; j < cp_count; j++) {
+			group = group_info->blocks[i][j];
+			if (range[0] <= group && group <= range[1])
+				goto out_release_group;
+		}
+		count -= cp_count;
+	}
+	ret = -EACCES;
+out_release_group:
+	put_group_info(group_info);	/* drop ref from get_current_groups() */
+	return ret;
+}
+
+static void ping_close(struct sock *sk, long timeout)	/* timeout is unused */
+{
+	pr_debug("ping_close(sk=%p,sk->num=%u)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num);
+	pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
+
+	sk_common_release(sk);	/* all teardown is delegated to the common helper */
+}
+
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct inet_sock *isk = inet_sk(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	int err;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
+		sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+	if (addr->sin_addr.s_addr == INADDR_ANY)
+		chk_addr_ret = RTN_LOCAL;	/* the wildcard always counts as local */
+
+	if ((sysctl_ip_nonlocal_bind == 0 &&
+	    isk->freebind == 0 && isk->transparent == 0 &&
+	     chk_addr_ret != RTN_LOCAL) ||
+	    chk_addr_ret == RTN_MULTICAST ||
+	    chk_addr_ret == RTN_BROADCAST)	/* never bindable, whatever the knobs say */
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (isk->inet_num != 0)	/* already bound */
+		goto out;
+
+	err = -EADDRINUSE;
+	isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+	snum = ntohs(addr->sin_port);	/* the "port" is the ICMP ident; 0 = pick one */
+	if (ping_v4_get_port(sk, snum) != 0) {
+		isk->inet_saddr = isk->inet_rcv_saddr = 0;	/* undo the tentative address */
+		goto out;
+	}
+
+	pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
+		(int)isk->inet_num,
+		(unsigned long) isk->inet_rcv_saddr,
+		(int)sk->sk_bound_dev_if);
+
+	err = 0;
+	if (isk->inet_rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	isk->inet_sport = htons(isk->inet_num);
+	isk->inet_daddr = 0;
+	isk->inet_dport = 0;
+	sk_dst_reset(sk);
+out:
+	release_sock(sk);
+	pr_debug("ping_v4_bind -> %d\n", err);
+	return err;
+}
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int type, int code)
+{
+	/* Only ICMP_ECHO with code 0 is accepted. */
+	/* The comparison chain itself yields the 1 / 0 result. */
+	return type == ICMP_ECHO && code == 0;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+void ping_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+	struct inet_sock *inet_sock;
+	int type = icmp_hdr(skb)->type;	/* of the error, not the quoted echo */
+	int code = icmp_hdr(skb)->code;
+	struct net *net = dev_net(skb->dev);
+	struct sock *sk;
+	int harderr;
+	int err;
+
+	/* We assume the packet has already been checked by icmp_unreach */
+
+	if (!ping_supported(icmph->type, icmph->code))	/* quoted packet not ours */
+		return;
+
+	pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
+		code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
+			    ntohs(icmph->un.echo.id), skb->dev->ifindex);
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		pr_debug("no socket, dropping\n");
+		return;	/* No socket for error */
+	}
+	pr_debug("err on socket %p\n", sk);
+
+	err = 0;
+	harderr = 0;
+	inet_sock = inet_sk(sk);
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		/* This is not a real error but ping wants to see it.
+		 * Report it with some fake errno. */
+		err = EREMOTEIO;
+		break;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+				err = EMSGSIZE;
+				harderr = 1;
+				break;
+			}
+			goto out;
+		}
+		err = EHOSTUNREACH;
+		if (code <= NR_ICMP_UNREACH) {
+			harderr = icmp_err_convert[code].fatal;
+			err = icmp_err_convert[code].errno;
+		}
+		break;
+	case ICMP_REDIRECT:
+		/* See ICMP_SOURCE_QUENCH */
+		err = EREMOTEIO;
+		break;
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
+	 *	4.1.3.3.
+	 */
+	if (!inet_sock->recverr) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+	} else {
+		ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+			 info, (u8 *)icmph);
+	}
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+out:
+	sock_put(sk);	/* drop the reference taken by ping_v4_lookup() */
+}
+
+/*
+ *	Copy and checksum an ICMP Echo packet from user space into a buffer.
+ */
+
+struct pingfakehdr {
+	struct icmphdr icmph;	/* header copied into the skb once checksummed */
+	struct iovec *iov;	/* payload source read by ping_getfrag() */
+	u32 wcheck;		/* running checksum, folded in ping_push_pending_frames() */
+};
+
+static int ping_getfrag(void *from, char * to,
+			int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+	struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+	int failed;
+
+	if (offset == 0) {
+		/* First fragment: skip the room for the ICMP header, which is
+		 * written later, and copy the payload right behind it. */
+		BUG_ON(fraglen < sizeof(struct icmphdr));
+		failed = csum_partial_copy_fromiovecend(
+				to + sizeof(struct icmphdr), pfh->iov, 0,
+				fraglen - sizeof(struct icmphdr), &pfh->wcheck);
+	} else {
+		/* Later fragments: offset counts the header, the iovec not. */
+		BUG_ON(offset < sizeof(struct icmphdr));
+		failed = csum_partial_copy_fromiovecend(
+				to, pfh->iov, offset - sizeof(struct icmphdr),
+				fraglen, &pfh->wcheck);
+	}
+
+	return failed ? -EFAULT : 0;
+}
+
+static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
+				    struct flowi4 *fl4)
+{
+	struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+	pfh->wcheck = csum_partial((char *)&pfh->icmph,
+		sizeof(struct icmphdr), pfh->wcheck);	/* add the header to the payload sum */
+	pfh->icmph.checksum = csum_fold(pfh->wcheck);
+	memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));	/* fill the room left by ping_getfrag() */
+	skb->ip_summed = CHECKSUM_NONE;
+	return ip_push_pending_frames(sk, fl4);
+}
+
+static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+			size_t len)
+{
+	struct net *net = sock_net(sk);
+	struct flowi4 fl4;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct icmphdr user_icmph;
+	struct pingfakehdr pfh;
+	struct rtable *rt = NULL;
+	struct ip_options_data opt_copy;
+	int free = 0;
+	u32 saddr, daddr, faddr;
+	u8  tos;
+	int err;
+
+	pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+	/* ping_getfrag() BUG()s unless a whole ICMP header is supplied. */
+	if (len < sizeof(struct icmphdr))
+		return -EINVAL;
+
+	/* Check the flags. */
+
+	/* Mirror BSD error message compatibility */
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/*
+	 *	Fetch the ICMP header provided by the userland.
+	 *	iovec is modified!
+	 */
+
+	if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov,
+			     sizeof(struct icmphdr)))
+		return -EFAULT;
+	if (!ping_supported(user_icmph.type, user_icmph.code))
+		return -EINVAL;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_name) {
+		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET)
+			return -EINVAL;
+		daddr = usin->sin_addr.s_addr;
+		/* no remote port */
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->inet_daddr;
+		/* no remote port */
+	}
+
+	ipc.addr = inet->inet_saddr;
+	ipc.opt = NULL;
+	ipc.oif = sk->sk_bound_dev_if;
+	ipc.tx_flags = 0;
+	err = sock_tx_timestamp(sk, &ipc.tx_flags);
+	if (err)
+		return err;
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		if (err)
+			return err;
+		if (ipc.opt)
+			free = 1;
+	}
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+
+	if (ipc.opt && ipc.opt->opt.srr) {
+		if (!daddr)
+			return -EINVAL;
+		faddr = ipc.opt->opt.faddr;
+	}
+	tos = RT_TOS(inet->tos);
+	if (sock_flag(sk, SOCK_LOCALROUTE) ||
+	    (msg->msg_flags & MSG_DONTROUTE) ||
+	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
+		tos |= RTO_ONLINK;
+	}
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+
+	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+			   inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
+
+	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_flow(net, &fl4, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
+		if (err == -ENETUNREACH)
+			IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	err = -EACCES;
+	if ((rt->rt_flags & RTCF_BROADCAST) &&
+	    !sock_flag(sk, SOCK_BROADCAST))
+		goto out;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (!ipc.addr)
+		ipc.addr = fl4.daddr;
+
+	lock_sock(sk);
+
+	pfh.icmph.type = user_icmph.type; /* already checked */
+	pfh.icmph.code = user_icmph.code; /* ditto */
+	pfh.icmph.checksum = 0;
+	pfh.icmph.un.echo.id = inet->inet_sport;	/* socket's ident, not the user's */
+	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+	pfh.iov = msg->msg_iov;
+	pfh.wcheck = 0;
+
+	err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
+			0, &ipc, &rt, msg->msg_flags);
+	if (err)
+		ip_flush_pending_frames(sk);
+	else
+		err = ping_push_pending_frames(sk, &pfh, &fl4);
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+	if (free)
+		kfree(ipc.opt);
+	if (!err) {
+		icmp_out_count(sock_net(sk), user_icmph.type);
+		return len;
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(&rt->dst);
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+
+static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+			size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct sk_buff *skb;
+	int copied, err = -EOPNOTSUPP;	/* result of the MSG_OOB path below */
+
+	pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+	if (flags & MSG_OOB)
+		goto out;
+
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE)
+		return ip_recv_error(sk, msg, len);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	/* Don't bother checking the checksum */
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_port = 0 /* skb->h.uh->source */;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+	if (isk->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+	err = copied;	/* success: report the number of bytes copied */
+
+done:
+	skb_free_datagram(sk, skb);
+out:
+	pr_debug("ping_recvmsg -> %d\n", err);
+	return err;
+}
+
+static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num, skb);
+	if (sock_queue_rcv_skb(sk, skb) < 0) {
+		ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
+		kfree_skb(skb);	/* the skb is freed here when queueing fails */
+		pr_debug("ping_queue_rcv_skb -> failed\n");
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ *	All we need to do is get the socket.
+ */
+
+void ping_rcv(struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct net *net = dev_net(skb->dev);
+	struct iphdr *iph = ip_hdr(skb);
+	struct icmphdr *icmph = icmp_hdr(skb);
+	u32 saddr = iph->saddr;
+	u32 daddr = iph->daddr;
+
+	/* We assume the packet has already been checked by icmp_rcv */
+
+	pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+		skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	/* Push ICMP header back so skb->data starts at it again */
+	skb_push(skb, skb->data - (u8 *)icmph);
+
+	sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id),
+			    skb->dev->ifindex);
+	if (sk != NULL) {
+		pr_debug("rcv on socket %p\n", sk);
+		ping_queue_rcv_skb(sk, skb_get(skb));	/* extra ref: icmp_rcv() frees its own */
+		sock_put(sk);	/* drop the reference taken by ping_v4_lookup() */
+		return;
+	}
+	pr_debug("no socket, dropping\n");
+
+	/* We're called from icmp_rcv(). kfree_skb() is done there. */
+}
+
+struct proto ping_prot = {
+	.name =		"PING",
+	.owner =	THIS_MODULE,
+	.init =		ping_init_sock,	/* rejects callers outside the group range */
+	.close =	ping_close,
+	.connect =	ip4_datagram_connect,
+	.disconnect =	udp_disconnect,
+	.setsockopt =	ip_setsockopt,
+	.getsockopt =	ip_getsockopt,
+	.sendmsg =	ping_sendmsg,
+	.recvmsg =	ping_recvmsg,
+	.bind =		ping_bind,
+	.backlog_rcv =	ping_queue_rcv_skb,
+	.hash =		ping_v4_hash,	/* BUG()s if ever called */
+	.unhash =	ping_v4_unhash,
+	.get_port =	ping_v4_get_port,	/* also inserts into the hash table */
+	.obj_size =	sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+	struct sock *sk;
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
+	     ++state->bucket) {	/* state->bucket ends up at the bucket that matched */
+		struct hlist_nulls_node *node;
+		struct hlist_nulls_head *hslot;
+
+		hslot = &ping_table.hash[state->bucket];
+
+		if (hlist_nulls_empty(hslot))
+			continue;
+
+		sk_nulls_for_each(sk, node, hslot) {
+			if (net_eq(sock_net(sk), net))	/* skip other namespaces */
+				goto found;
+		}
+	}
+	sk = NULL;	/* no socket of this namespace from bucket start onwards */
+found:
+	return sk;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	do {
+		sk = sk_nulls_next(sk);
+	} while (sk && (!net_eq(sock_net(sk), net)));	/* skip other namespaces */
+
+	if (!sk)	/* chain exhausted: continue with the following bucket */
+		return ping_get_first(seq, state->bucket + 1);
+	return sk;
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *cur;
+
+	/* Step forward pos entries; NULL when the table holds fewer. */
+	for (cur = ping_get_first(seq, 0); cur && pos; --pos)
+		cur = ping_get_next(seq, cur);
+	return pos ? NULL : cur;
+}
+
+static void *ping_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ping_iter_state *state = seq->private;
+	state->bucket = 0;
+
+	read_lock_bh(&ping_table.lock);	/* released in ping_seq_stop() */
+
+	return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;	/* position 0 is the header row */
+}
+
+static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk;
+
+	if (v == SEQ_START_TOKEN)	/* header was just shown: begin with entry 0 */
+		sk = ping_get_idx(seq, 0);
+	else
+		sk = ping_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+
+static void ping_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock_bh(&ping_table.lock);	/* pairs with ping_seq_start() */
+}
+
+static void ping_format_sock(struct sock *sp, struct seq_file *f,
+		int bucket, int *len)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
+
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+		bucket, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		sk_rmem_alloc_get(sp),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),	/* constant zeros fill unused columns */
+		atomic_read(&sp->sk_refcnt), sp,
+		atomic_read(&sp->sk_drops), len);	/* %n stores the row length in *len */
+}
+
+static int ping_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)	/* first call prints the column header */
+		seq_printf(seq, "%-127s\n",
+			   "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode ref pointer drops");
+	else {
+		struct ping_iter_state *state = seq->private;
+		int len;
+
+		ping_format_sock(v, seq, state->bucket, &len);
+		seq_printf(seq, "%*s\n", 127 - len, "");	/* pad the row to 127 columns */
+	}
+	return 0;
+}
+
+static const struct seq_operations ping_seq_ops = {
+	.show		= ping_seq_show,
+	.start		= ping_seq_start,	/* takes ping_table.lock for reading */
+	.next		= ping_seq_next,
+	.stop		= ping_seq_stop,	/* releases ping_table.lock */
+};
+
+static int ping_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ping_seq_ops,
+			   sizeof(struct ping_iter_state));	/* size of the per-open seq->private */
+}
+
+static const struct file_operations ping_seq_fops = {
+	.open		= ping_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,	/* counterpart of seq_open_net() in open */
+};
+
+static int ping_proc_register(struct net *net)
+{
+	struct proc_dir_entry *entry;
+
+	/* Expose the socket table as a read-only per-net "icmp" entry. */
+	entry = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops);
+	if (entry == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void ping_proc_unregister(struct net *net)
+{
+	proc_net_remove(net, "icmp");	/* same name as used in ping_proc_register() */
+}
+
+
+static int __net_init ping_proc_init_net(struct net *net)
+{
+	return ping_proc_register(net);	/* .init callback of ping_net_ops */
+}
+
+static void __net_exit ping_proc_exit_net(struct net *net)
+{
+	ping_proc_unregister(net);	/* .exit callback of ping_net_ops */
+}
+
+static struct pernet_operations ping_net_ops = {
+	.init = ping_proc_init_net,	/* creates the "icmp" proc entry */
+	.exit = ping_proc_exit_net,	/* removes it again */
+};
+
+int __init ping_proc_init(void)
+{
+	return register_pernet_subsys(&ping_net_ops);	/* result is passed straight through */
+}
+
+void ping_proc_exit(void)
+{
+	unregister_pernet_subsys(&ping_net_ops);	/* undoes ping_proc_init() */
+}
+
+#endif
+
+void __init ping_init(void)
+{
+	int i;
+
+	for (i = 0; i < PING_HTABLE_SIZE; i++)
+		INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);	/* bucket index is the nulls value */
+	rwlock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 00000000..b14ec7d0
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,495 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		This file implements the various access functions for the
+ *		PROC file system.  It is mainly used for debugging and
+ *		statistics.
+ *
+ * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ *		Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ *		Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ *		Alan Cox	:	UDP sockets show the rxqueue/txqueue
+ *					using hint flag for the netinfo.
+ *	Pauline Middelink	:	identd support
+ *		Alan Cox	:	Make /proc safer.
+ *	Erik Schoenfelder	:	/proc/net/snmp
+ *		Alan Cox	:	Handle dead sockets properly.
+ *	Gerhard Koerting	:	Show both timers
+ *		Alan Cox	:	Allow inode to be NULL (kernel socket)
+ *	Andi Kleen		:	Add support for open_requests and
+ *					split functions for more readability.
+ *	Andi Kleen		:	Add support for /proc/net/netstat
+ *	Arnaldo C. Melo		:	Convert to seq_file
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <linux/bottom_half.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+/*
+ *	Report socket allocation statistics [mea@utu.fi]
+ *
+ *	Writes the socket_seq_show() output followed by one line each for
+ *	TCP, UDP, UDPLITE, RAW and FRAG.  Always returns 0.
+ */
+static int sockstat_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+	int nr_orphans;
+	int nr_allocated;
+
+	/* both per-cpu sums are taken with bottom halves disabled */
+	local_bh_disable();
+	nr_orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+	nr_allocated = percpu_counter_sum_positive(&tcp_sockets_allocated);
+	local_bh_enable();
+
+	socket_seq_show(seq);
+
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+		   sock_prot_inuse_get(net, &tcp_prot),
+		   nr_orphans,
+		   tcp_death_row.tw_count,
+		   nr_allocated,
+		   atomic_long_read(&tcp_memory_allocated));
+
+	seq_printf(seq, "UDP: inuse %d mem %ld\n",
+		   sock_prot_inuse_get(net, &udp_prot),
+		   atomic_long_read(&udp_memory_allocated));
+
+	seq_printf(seq, "UDPLITE: inuse %d\n",
+		   sock_prot_inuse_get(net, &udplite_prot));
+
+	seq_printf(seq, "RAW: inuse %d\n",
+		   sock_prot_inuse_get(net, &raw_prot));
+
+	seq_printf(seq, "FRAG: inuse %d memory %d\n",
+		   ip_frag_nqueues(net), ip_frag_mem(net));
+
+	return 0;
+}
+
+/* ->open for the sockstat entry: pass sockstat_seq_show() to single_open_net() */
+static int sockstat_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = single_open_net(inode, file, sockstat_seq_show);
+
+	return rc;
+}
+
+static const struct file_operations sockstat_seq_fops = {	/* file operations of the "sockstat" proc entry */
+	.owner	 = THIS_MODULE,
+	.open	 = sockstat_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
+/* snmp items: columns of the "Ip:" lines printed by snmp_seq_show(), in output order */
+static const struct snmp_mib snmp4_ipstats_list[] = {
+	SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
+	SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+	SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+	SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+	SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+	SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
+	SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
+	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
+	SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+	SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+	SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+	SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
+	SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
+	SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
+	SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
+	SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
+	SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+	SNMP_MIB_SENTINEL
+};
+
+/* The following RFC4293 items are displayed on the "IpExt:" lines of /proc/net/netstat */
+static const struct snmp_mib snmp4_ipextstats_list[] = {
+	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
+	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+	SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+	SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+	SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
+	SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
+	SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
+	SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
+	SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct {
+	const char *name;	/* column name; icmp_put() prints it with an "In" or "Out" prefix */
+	int index;		/* ICMP type, used by icmp_put() as index into icmpmsg_statistics */
+} icmpmibmap[] = {
+	{ "DestUnreachs", ICMP_DEST_UNREACH },
+	{ "TimeExcds", ICMP_TIME_EXCEEDED },
+	{ "ParmProbs", ICMP_PARAMETERPROB },
+	{ "SrcQuenchs", ICMP_SOURCE_QUENCH },
+	{ "Redirects", ICMP_REDIRECT },
+	{ "Echos", ICMP_ECHO },
+	{ "EchoReps", ICMP_ECHOREPLY },
+	{ "Timestamps", ICMP_TIMESTAMP },
+	{ "TimestampReps", ICMP_TIMESTAMPREPLY },
+	{ "AddrMasks", ICMP_ADDRESS },
+	{ "AddrMaskReps", ICMP_ADDRESSREPLY },
+	{ NULL, 0 }		/* terminator: a NULL name ends the walk */
+};
+
+
+static const struct snmp_mib snmp4_tcp_list[] = {	/* columns of the "Tcp:" lines in snmp_seq_show() */
+	SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
+	SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
+	SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
+	SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
+	SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
+	SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
+	SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
+	SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
+	SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
+	SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
+	SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
+	SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
+	SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
+	SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_udp_list[] = {	/* columns shared by the "Udp:" and "UdpLite:" lines */
+	SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp4_net_list[] = {	/* columns of the "TcpExt:" lines in netstat_seq_show() */
+	SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
+	SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
+	SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
+	SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
+	SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
+	SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
+	SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
+	SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
+	SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
+	SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
+	SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
+	SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
+	SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
+	SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
+	SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
+	SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+	SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
+	SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
+	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
+	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
+	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+	SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
+	SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
+	SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
+	SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
+	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
+	SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
+	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
+	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
+	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
+	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
+	SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
+	SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
+	SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
+	SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
+	SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
+	SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
+	SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
+	SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
+	SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
+	SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
+	SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
+	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
+	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
+	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
+	SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
+	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
+	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
+	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
+	SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
+	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
+	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
+	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
+	SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
+	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
+	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
+	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
+	SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
+	SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
+	SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
+	SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+	SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
+	SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
+	SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
+	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
+	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
+	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
+	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
+	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
+	SNMP_MIB_SENTINEL
+};
+
+/*
+ * Write one pair of "IcmpMsg:" lines for @count collected counters: first
+ * the names, then the values.  type[] holds the message type in its low
+ * byte; bit 0x100 selects the "Out" prefix instead of "In".  Nothing is
+ * written when @count is zero.
+ */
+static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
+			     unsigned short *type, int count)
+{
+	int idx;
+
+	if (!count)
+		return;
+
+	seq_printf(seq, "\nIcmpMsg:");
+	for (idx = 0; idx < count; ++idx) {
+		const char *dir = (type[idx] & 0x100) ? "Out" : "In";
+
+		seq_printf(seq, " %sType%u", dir, type[idx] & 0xff);
+	}
+
+	seq_printf(seq, "\nIcmpMsg:");
+	for (idx = 0; idx < count; ++idx)
+		seq_printf(seq, " %lu", vals[idx]);
+}
+
+/*
+ * Print every non-zero icmpmsg_statistics counter of the seq_file's
+ * namespace, batching at most PERLINE counters per icmpmsg_put_line() call.
+ */
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE	16
+
+	struct net *net = seq->private;
+	unsigned short type[PERLINE];
+	unsigned long vals[PERLINE];
+	int filled = 0;
+	int idx;
+
+	for (idx = 0; idx < ICMPMSG_MIB_MAX; idx++) {
+		unsigned long cur;
+
+		cur = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, idx);
+		if (!cur)
+			continue;
+
+		type[filled] = idx;
+		vals[filled] = cur;
+		/* flush as soon as the batch is full */
+		if (++filled == PERLINE) {
+			icmpmsg_put_line(seq, vals, type, filled);
+			filled = 0;
+		}
+	}
+	/* whatever is left over; a no-op for an empty batch */
+	icmpmsg_put_line(seq, vals, type, filled);
+
+#undef PERLINE
+}
+
+/*
+ * Emit the two "Icmp:" lines.  The header names InMsgs/InErrors, one "In"
+ * column per icmpmibmap entry, OutMsgs/OutErrors and one "Out" column per
+ * entry; the value line follows the same order.  Per-type values are read
+ * from icmpmsg_statistics, outgoing ones at (index | 0x100).
+ */
+static void icmp_put(struct seq_file *seq)
+{
+	struct net *net = seq->private;
+	int k;
+
+	/* header line */
+	seq_puts(seq, "\nIcmp: InMsgs InErrors");
+	for (k = 0; icmpmibmap[k].name != NULL; k++)
+		seq_printf(seq, " In%s", icmpmibmap[k].name);
+	seq_printf(seq, " OutMsgs OutErrors");
+	for (k = 0; icmpmibmap[k].name != NULL; k++)
+		seq_printf(seq, " Out%s", icmpmibmap[k].name);
+
+	/* value line: incoming half */
+	seq_printf(seq, "\nIcmp: %lu %lu",
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+	for (k = 0; icmpmibmap[k].name != NULL; k++) {
+		const int slot = icmpmibmap[k].index;
+
+		seq_printf(seq, " %lu",
+			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
+				slot));
+	}
+
+	/* value line: outgoing half */
+	seq_printf(seq, " %lu %lu",
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+	for (k = 0; icmpmibmap[k].name != NULL; k++) {
+		const int slot = icmpmibmap[k].index | 0x100;
+
+		seq_printf(seq, " %lu",
+			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
+				slot));
+	}
+}
+
+/*
+ *	Called from the PROCfs module. This outputs /proc/net/snmp.
+ *
+ *	Each of the "Ip:", "Tcp:", "Udp:" and "UdpLite:" sections is written
+ *	as two lines with the same prefix: the column names first, then the
+ *	values in the same order.  The Icmp and IcmpMsg sections in between
+ *	come from icmp_put() and icmpmsg_put().  Ip counters are folded with
+ *	snmp_fold_field64(), all others with snmp_fold_field(); MaxConn is
+ *	printed with a signed format.  Always returns 0.
+ */
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+	struct net *net = seq->private;
+
+	seq_puts(seq, "Ip: Forwarding DefaultTTL");
+
+	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+	seq_printf(seq, "\nIp: %d %d",
+		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
+		   sysctl_ip_default_ttl);
+
+	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
+	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+		seq_printf(seq, " %llu",
+			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+					     snmp4_ipstats_list[i].entry,
+					     offsetof(struct ipstats_mib, syncp)));
+
+	icmp_put(seq);	/* RFC 2011 compatibility */
+	icmpmsg_put(seq);
+
+	seq_puts(seq, "\nTcp:");
+	for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+
+	seq_puts(seq, "\nTcp:");
+	for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+		/* MaxConn field is signed, RFC 2012 */
+		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+			seq_printf(seq, " %ld",
+				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+						   snmp4_tcp_list[i].entry));
+		else
+			seq_printf(seq, " %lu",
+				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
+						   snmp4_tcp_list[i].entry));
+	}
+
+	seq_puts(seq, "\nUdp:");
+	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+	seq_puts(seq, "\nUdp:");
+	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			   snmp_fold_field((void __percpu **)net->mib.udp_statistics,
+					   snmp4_udp_list[i].entry));
+
+	/* the UDP and UDP-Lite MIBs are the same */
+	seq_puts(seq, "\nUdpLite:");
+	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+	seq_puts(seq, "\nUdpLite:");
+	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			   snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
+					   snmp4_udp_list[i].entry));
+
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+/* ->open for the snmp entry: pass snmp_seq_show() to single_open_net() */
+static int snmp_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = single_open_net(inode, file, snmp_seq_show);
+
+	return rc;
+}
+
+static const struct file_operations snmp_seq_fops = {	/* file operations of the "snmp" proc entry */
+	.owner	 = THIS_MODULE,
+	.open	 = snmp_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
+
+
+/*
+ *	Output /proc/net/netstat
+ *
+ *	Two sections, "TcpExt:" (snmp4_net_list) and "IpExt:"
+ *	(snmp4_ipextstats_list), each written as a line of column names
+ *	followed by a line of values in the same order.  Always returns 0.
+ */
+static int netstat_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+	const struct snmp_mib *item;
+
+	seq_puts(seq, "TcpExt:");
+	for (item = snmp4_net_list; item->name != NULL; item++)
+		seq_printf(seq, " %s", item->name);
+
+	seq_puts(seq, "\nTcpExt:");
+	for (item = snmp4_net_list; item->name != NULL; item++)
+		seq_printf(seq, " %lu",
+			   snmp_fold_field((void __percpu **)net->mib.net_statistics,
+					   item->entry));
+
+	seq_puts(seq, "\nIpExt:");
+	for (item = snmp4_ipextstats_list; item->name != NULL; item++)
+		seq_printf(seq, " %s", item->name);
+
+	seq_puts(seq, "\nIpExt:");
+	for (item = snmp4_ipextstats_list; item->name != NULL; item++)
+		seq_printf(seq, " %llu",
+			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+					     item->entry,
+					     offsetof(struct ipstats_mib, syncp)));
+
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+/* ->open for the netstat entry: pass netstat_seq_show() to single_open_net() */
+static int netstat_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = single_open_net(inode, file, netstat_seq_show);
+
+	return rc;
+}
+
+static const struct file_operations netstat_seq_fops = {	/* file operations of the "netstat" proc entry */
+	.owner	 = THIS_MODULE,
+	.open	 = netstat_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
+/*
+ * pernet ->init hook: create the "sockstat", "netstat" and "snmp" proc
+ * entries for @net.  On failure the entries created so far are removed
+ * again and -ENOMEM is returned; returns 0 on success.
+ */
+static __net_init int ip_proc_init_net(struct net *net)
+{
+	if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops))
+		return -ENOMEM;
+	if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops))
+		goto undo_sockstat;
+	if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops))
+		goto undo_netstat;
+
+	return 0;
+
+undo_netstat:
+	proc_net_remove(net, "netstat");
+undo_sockstat:
+	proc_net_remove(net, "sockstat");
+	return -ENOMEM;
+}
+
+/* pernet ->exit hook: remove the three entries in reverse order of creation */
+static __net_exit void ip_proc_exit_net(struct net *net)
+{
+	static const char * const entries[] = { "snmp", "netstat", "sockstat" };
+	unsigned int k;
+
+	for (k = 0; k < ARRAY_SIZE(entries); k++)
+		proc_net_remove(net, entries[k]);
+}
+
+static __net_initdata struct pernet_operations ip_proc_ops = {	/* per-namespace setup/teardown of sockstat, netstat and snmp */
+	.init = ip_proc_init_net,
+	.exit = ip_proc_exit_net,
+};
+
+/* Register ip_proc_ops as a pernet subsystem; returns that call's result */
+int __init ip_misc_proc_init(void)
+{
+	int rc = register_pernet_subsys(&ip_proc_ops);
+
+	return rc;
+}
+
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 00000000..9ae5c01c
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,61 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		INET protocol dispatch tables.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	: Ahah! udp icmp errors don't work because
+ *				  udp_err is never called!
+ *		Alan Cox	: Added new fields for init and ready for
+ *				  proper fragmentation (_NO_ 4K limits!)
+ *		Richard Colella	: Hang on hash collision
+ *		Vince Laviano	: Modified inet_del_protocol() to correctly
+ *				  maintain copy bit.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+
+/* Protocol handler slots, indexed by (protocol & (MAX_INET_PROTOS - 1)) */
+const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]
+	__read_mostly;
+
+/*
+ *	Add a protocol handler to the hash tables.
+ *
+ *	The slot selected by @protocol is claimed with cmpxchg() only if it
+ *	is currently NULL.  Returns 0 on success, -1 if the slot was taken.
+ */
+
+int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+	const struct net_protocol **slot;
+
+	slot = (const struct net_protocol **)
+			&inet_protos[protocol & (MAX_INET_PROTOS - 1)];
+
+	if (cmpxchg(slot, NULL, prot) != NULL)
+		return -1;
+
+	return 0;
+}
+EXPORT_SYMBOL(inet_add_protocol);
+
+/*
+ *	Remove a protocol from the hash tables.
+ *
+ *	The slot is cleared with cmpxchg() only if it still holds @prot.
+ *	synchronize_net() is called in either case before returning 0 on
+ *	success or -1 if the slot held something else.
+ */
+
+int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
+{
+	const struct net_protocol **slot;
+	const struct net_protocol *old;
+
+	slot = (const struct net_protocol **)
+			&inet_protos[protocol & (MAX_INET_PROTOS - 1)];
+	old = cmpxchg(slot, prot, NULL);
+
+	synchronize_net();
+
+	return old == prot ? 0 : -1;
+}
+EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 00000000..c9893d43
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,1064 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		RAW - implementation of IP "raw" sockets.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() fixed up
+ *		Alan Cox	:	ICMP error handling
+ *		Alan Cox	:	EMSGSIZE if you send too big a packet
+ *		Alan Cox	: 	Now uses generic datagrams and shared
+ *					skbuff library. No more peek crashes,
+ *					no more backlogs
+ *		Alan Cox	:	Checks sk->broadcast.
+ *		Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
+ *		Alan Cox	:	Raw passes ip options too
+ *		Alan Cox	:	Setsocketopt added
+ *		Alan Cox	:	Fixed error return for broadcasts
+ *		Alan Cox	:	Removed wake_up calls
+ *		Alan Cox	:	Use ttl/tos
+ *		Alan Cox	:	Cleaned up old debugging
+ *		Alan Cox	:	Use new kernel side addresses
+ *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
+ *		Alan Cox	:	BSD style RAW socket demultiplexing.
+ *		Alan Cox	:	Beginnings of mrouted support.
+ *		Alan Cox	:	Added IP_HDRINCL option.
+ *		Alan Cox	:	Skip broadcast check if BSDism set.
+ *		David S. Miller	:	New socket lookup architecture.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <asm/atomic.h>
+#include <asm/byteorder.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/aio.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sockios.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/mroute.h>
+#include <linux/netdevice.h>
+#include <linux/in_route.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <net/tcp_states.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/compat.h>
+
+static struct raw_hashinfo raw_v4_hashinfo = {	/* IPv4 raw socket hash table together with its rwlock */
+	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
+};
+
+/*
+ * Insert @sk into the raw hash table of its protocol, in the chain chosen
+ * by inet_num, and increment the protocol's in-use counter.  Both steps
+ * happen under the table's write lock.
+ */
+void raw_hash_sk(struct sock *sk)
+{
+	struct raw_hashinfo *table = sk->sk_prot->h.raw_hash;
+	unsigned int slot = inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1);
+
+	write_lock_bh(&table->lock);
+	sk_add_node(sk, &table->ht[slot]);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	write_unlock_bh(&table->lock);
+}
+EXPORT_SYMBOL_GPL(raw_hash_sk);
+
+/*
+ * Remove @sk from the raw hash table of its protocol.  The in-use counter
+ * is decremented only when sk_del_node_init() reports a removal.
+ */
+void raw_unhash_sk(struct sock *sk)
+{
+	struct raw_hashinfo *table = sk->sk_prot->h.raw_hash;
+	int was_hashed;
+
+	write_lock_bh(&table->lock);
+	was_hashed = sk_del_node_init(sk);
+	if (was_hashed)
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	write_unlock_bh(&table->lock);
+}
+EXPORT_SYMBOL_GPL(raw_unhash_sk);
+
+/*
+ * Starting at @sk, walk the rest of the chain and return the first socket
+ * that is in @net, has inet_num == @num and whose remote address, local
+ * address and bound device are each either unset (zero) or equal to
+ * @raddr, @laddr and @dif.  Returns NULL when nothing matches.
+ */
+static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+		unsigned short num, __be32 raddr, __be32 laddr, int dif)
+{
+	struct hlist_node *node;
+
+	sk_for_each_from(sk, node) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (!net_eq(sock_net(sk), net) || inet->inet_num != num)
+			continue;
+		if (inet->inet_daddr && inet->inet_daddr != raddr)
+			continue;
+		if (inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr)
+			continue;
+		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+			continue;
+
+		return sk; /* gotcha */
+	}
+
+	return NULL;
+}
+
+/*
+ *	Decide whether the ICMP filter of raw socket @sk blocks @skb.
+ *
+ *	The ICMP header is read with skb_header_pointer() so that the skb is
+ *	never modified: raw_v4_input() keeps using an iphdr pointer into the
+ *	skb after this call, which would go stale if skb->head were
+ *	reallocated (as pskb_may_pull() may do).
+ *
+ *	Returns:
+ *	0 - deliver
+ *	1 - block (also when the ICMP header cannot be read)
+ */
+static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct icmphdr _hdr;
+	const struct icmphdr *hdr;
+
+	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+				 sizeof(_hdr), &_hdr);
+	if (!hdr)
+		return 1;
+
+	if (hdr->type < 32) {
+		__u32 data = raw_sk(sk)->filter.data;
+
+		/* unsigned constant: 1 << 31 would overflow a signed int */
+		return ((1U << hdr->type) & data) != 0;
+	}
+
+	/* Do not block unknown ICMP types */
+	return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ *
+ * Walks chain @hash of raw_v4_hashinfo under the read lock and passes a
+ * clone of @skb to raw_rcv() for every socket that matches the packet's
+ * protocol, addresses and incoming interface.  ICMP packets are first
+ * checked against the socket's icmp_filter().
+ *
+ * Returns 1 if at least one socket matched (even if its filter blocked
+ * the packet or the clone could not be allocated), 0 otherwise.
+ */
+static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
+{
+	struct sock *sk;
+	struct hlist_head *head;
+	int delivered = 0;
+	struct net *net;
+
+	read_lock(&raw_v4_hashinfo.lock);
+	head = &raw_v4_hashinfo.ht[hash];
+	if (hlist_empty(head))
+		goto out;
+
+	net = dev_net(skb->dev);
+	sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
+			     iph->saddr, iph->daddr,
+			     skb->dev->ifindex);
+
+	while (sk) {
+		delivered = 1;
+		if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+			/* Not releasing hash table! */
+			if (clone)
+				raw_rcv(sk, clone);
+		}
+		sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
+				     iph->saddr, iph->daddr,
+				     skb->dev->ifindex);
+	}
+out:
+	read_unlock(&raw_v4_hashinfo.lock);
+	return delivered;
+}
+
+/*
+ * Offer @skb to the raw sockets hashed under @protocol.
+ * Returns 1 if raw_v4_input() found a matching socket, 0 otherwise.
+ */
+int raw_local_deliver(struct sk_buff *skb, int protocol)
+{
+	const int hash = protocol & (RAW_HTABLE_SIZE - 1);
+
+	/* An empty chain means there cannot be a raw socket: nothing to do */
+	if (!sk_head(&raw_v4_hashinfo.ht[hash]))
+		return 0;
+
+	/* There may be a raw socket, so the chain must be checked */
+	return raw_v4_input(skb, ip_hdr(skb), hash) != 0;
+}
+
+static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	int err = 0;
+	int harderr = 0;
+
+	/* Map the ICMP type/code of skb to an errno and report it on sk.
+
+	   Report error on raw socket, if:
+	   1. User requested ip_recverr.
+	   2. Socket is connected (otherwise the error indication
+	      is useless without ip_recverr and error is hard).
+	 */
+	if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+		return;
+
+	switch (type) {
+	default:	/* types not listed below share the time-exceeded handling */
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		return;	/* source quench is never reported */
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		err = EHOSTUNREACH;
+		if (code > NR_ICMP_UNREACH)	/* do not index icmp_err_convert[] with such a code */
+			break;
+		err = icmp_err_convert[code].errno;
+		harderr = icmp_err_convert[code].fatal;
+		if (code == ICMP_FRAG_NEEDED) {
+			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+			err = EMSGSIZE;
+		}
+	}
+
+	if (inet->recverr) {
+		const struct iphdr *iph = (const struct iphdr *)skb->data;
+		u8 *payload = skb->data + (iph->ihl << 2);
+
+		if (inet->hdrincl)	/* hdrincl sockets get the IP header as well */
+			payload = skb->data;
+		ip_icmp_error(sk, skb, err, 0, info, payload);
+	}
+
+	if (inet->recverr || harderr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	}
+}
+
+/*
+ * Pass an ICMP error to every raw socket hashed under @protocol that
+ * matches the IP header found at skb->data (looked up with daddr as the
+ * remote and saddr as the local address).  Runs under the table's read
+ * lock.
+ */
+void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
+{
+	const int hash = protocol & (RAW_HTABLE_SIZE - 1);
+	const struct iphdr *iph;
+	struct sock *cur;
+	struct net *net;
+
+	read_lock(&raw_v4_hashinfo.lock);
+
+	cur = sk_head(&raw_v4_hashinfo.ht[hash]);
+	if (cur == NULL)
+		goto unlock;
+
+	iph = (const struct iphdr *)skb->data;
+	net = dev_net(skb->dev);
+
+	for (;;) {
+		cur = __raw_v4_lookup(net, cur, protocol,
+				      iph->daddr, iph->saddr,
+				      skb->dev->ifindex);
+		if (cur == NULL)
+			break;
+
+		raw_err(cur, skb, info);
+		cur = sk_next(cur);
+		/* skb->data is read again after every report */
+		iph = (const struct iphdr *)skb->data;
+	}
+
+unlock:
+	read_unlock(&raw_v4_hashinfo.lock);
+}
+
+/*
+ * Queue @skb on @sk's receive queue, charging it to the socket.  If
+ * queueing fails the skb is freed and NET_RX_DROP is returned; otherwise
+ * NET_RX_SUCCESS.
+ */
+static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+{
+	if (ip_queue_rcv_skb(sk, skb) >= 0)
+		return NET_RX_SUCCESS;
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * Receive path for one raw socket.  Packets failing the xfrm input policy
+ * check are counted in sk_drops, freed and reported as NET_RX_DROP.
+ * Otherwise skb->data is moved back to the network header and the skb is
+ * handed to raw_rcv_skb(); 0 is returned whatever that call reports.
+ */
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	if (xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		nf_reset(skb);
+		skb_push(skb, skb->data - skb_network_header(skb));
+		raw_rcv_skb(sk, skb);
+		return 0;
+	}
+
+	atomic_inc(&sk->sk_drops);
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ *	Send a packet whose IP header is part of the user supplied buffer.
+ *
+ *	@sk:	 sending socket
+ *	@fl4:	 flow of the route; provides the source address when the
+ *		 header carries a zero saddr
+ *	@from:	 iovec with the complete packet, IP header included
+ *	@length: packet length in bytes
+ *	@rtp:	 route to use; once the skb is set up the route belongs to
+ *		 the skb and *rtp is set to NULL
+ *	@flags:	 message flags; MSG_PROBE and MSG_DONTWAIT are examined
+ *
+ *	Returns 0 on success or a negative errno: -EMSGSIZE if @length
+ *	exceeds the device MTU, -EINVAL if the buffer is shorter than an IP
+ *	header or than the header length it announces, -EFAULT if the user
+ *	data cannot be copied.  -ENOBUFS is turned into 0 unless
+ *	inet->recverr is set.
+ */
+static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
+			   void *from, size_t length,
+			   struct rtable **rtp,
+			   unsigned int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	struct iphdr *iph;
+	struct sk_buff *skb;
+	unsigned int iphlen;
+	int err;
+	struct rtable *rt = *rtp;
+
+	if (length > rt->dst.dev->mtu) {
+		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
+			       rt->dst.dev->mtu);
+		return -EMSGSIZE;
+	}
+	/*
+	 * iph->ihl and iph->protocol are read below, so the buffer has to
+	 * hold at least a basic IP header; otherwise those reads would hit
+	 * bytes the user never supplied.
+	 */
+	if (length < sizeof(struct iphdr))
+		return -EINVAL;
+
+	if (flags&MSG_PROBE)
+		goto out;
+
+	skb = sock_alloc_send_skb(sk,
+				  length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
+				  flags & MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto error;
+	skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+	skb_dst_set(skb, &rt->dst);
+	*rtp = NULL;
+
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	skb_put(skb, length);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	skb->transport_header = skb->network_header;
+	err = -EFAULT;
+	if (memcpy_fromiovecend((void *)iph, from, 0, length))
+		goto error_free;
+
+	iphlen = iph->ihl * 4;
+
+	/*
+	 * We don't want to modify the ip header, but we do need to
+	 * be sure that it won't cause problems later along the network
+	 * stack.  Specifically we want to make sure that iph->ihl is a
+	 * sane value.  If ihl points beyond the length of the buffer passed
+	 * in, reject the frame as invalid
+	 */
+	err = -EINVAL;
+	if (iphlen > length)
+		goto error_free;
+
+	if (iphlen >= sizeof(*iph)) {
+		if (!iph->saddr)
+			iph->saddr = fl4->saddr;
+		iph->check   = 0;
+		iph->tot_len = htons(length);
+		if (!iph->id)
+			ip_select_ident(iph, &rt->dst, NULL);
+
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+		/*
+		 * Account the ICMP type located behind the IP header, and
+		 * only when the packet is long enough to contain one.  The
+		 * transport header offset still equals the network header
+		 * offset here, so it cannot be used to find the ICMP header.
+		 */
+		if (iph->protocol == IPPROTO_ICMP &&
+		    length >= iphlen + sizeof(struct icmphdr))
+			icmp_out_count(net, ((struct icmphdr *)
+				((u8 *)iph + iphlen))->type);
+	}
+
+	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
+		      rt->dst.dev, dst_output);
+	if (err > 0)
+		err = net_xmit_errno(err);
+	if (err)
+		goto error;
+out:
+	return 0;
+
+error_free:
+	kfree_skb(skb);
+error:
+	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
+	if (err == -ENOBUFS && !inet->recverr)
+		err = 0;
+	return err;
+}
+
+/*
+ * For ICMP flows, locate the first two bytes of the user iovec and store
+ * them in @fl4 as ICMP type and code.  Flows of any other protocol are
+ * left untouched.  Returns 0, or -EFAULT if one of the bytes cannot be
+ * read from user space.
+ */
+static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
+{
+	u8 __user *type = NULL;
+	u8 __user *code = NULL;
+	unsigned int idx;
+
+	/* nothing to probe without an iovec or for non-ICMP flows */
+	if (!msg->msg_iov || fl4->flowi4_proto != IPPROTO_ICMP)
+		return 0;
+
+	for (idx = 0; idx < msg->msg_iovlen; idx++) {
+		struct iovec *iov = &msg->msg_iov[idx];
+
+		/* check if one-byte field is readable or not. */
+		if (iov->iov_base && iov->iov_len < 1)
+			continue;
+
+		if (!type) {
+			type = iov->iov_base;
+			/* check if code field is readable or not. */
+			if (iov->iov_len > 1)
+				code = type + 1;
+		} else if (!code) {
+			code = iov->iov_base;
+		}
+
+		/* keep looking until both bytes have been located */
+		if (!type || !code)
+			continue;
+
+		if (get_user(fl4->fl4_icmp_type, type) ||
+		    get_user(fl4->fl4_icmp_code, code))
+			return -EFAULT;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * raw_sendmsg - transmit one message on a raw IPv4 socket.
+ *
+ * The destination is taken from msg->msg_name when a name is supplied,
+ * otherwise from the connected peer (the socket must then be in
+ * TCP_ESTABLISHED state).  When inet->hdrincl is set the message is handed
+ * to raw_send_hdrinc(); otherwise it is queued with ip_append_data() and,
+ * unless MSG_MORE is set, pushed with ip_push_pending_frames().
+ *
+ * Returns len on success or a negative errno:
+ *   -EMSGSIZE      len exceeds 0xFFFF
+ *   -EOPNOTSUPP    MSG_OOB requested
+ *   -EINVAL        short address, options together with hdrincl, or
+ *                  source routing without a destination
+ *   -EAFNOSUPPORT  non-zero address family other than AF_INET
+ *   -EDESTADDRREQ  no address given and socket not connected
+ *   -EACCES        broadcast route without SOCK_BROADCAST
+ * plus whatever the helpers called below report.
+ */
+static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+	int free = 0;	/* set when ipc.opt came from ip_cmsg_send() and must be kfree()d */
+	__be32 daddr;
+	__be32 saddr;
+	u8  tos;
+	int err;
+	struct ip_options_data opt_copy;
+
+	err = -EMSGSIZE;
+	if (len > 0xFFFF)
+		goto out;
+
+	/*
+	 *	Check the flags.
+	 */
+
+	err = -EOPNOTSUPP;
+	if (msg->msg_flags & MSG_OOB)	/* Mirror BSD error message */
+		goto out;               /* compatibility */
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_namelen) {
+		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+		err = -EINVAL;
+		if (msg->msg_namelen < sizeof(*usin))
+			goto out;
+		if (usin->sin_family != AF_INET) {
+			/*
+			 * Warn exactly once.  printk_once() keeps its own
+			 * flag, so there is no signed counter that keeps
+			 * incrementing (and could eventually wrap) on every
+			 * malformed call.
+			 */
+			printk_once(KERN_INFO "%s forgot to set AF_INET in "
+					      "raw sendmsg. Fix it!\n",
+					      current->comm);
+			err = -EAFNOSUPPORT;
+			/* A zero family is tolerated; anything else is not. */
+			if (usin->sin_family)
+				goto out;
+		}
+		daddr = usin->sin_addr.s_addr;
+		/* ANK: I did not forget to get protocol from port field.
+		 * I just do not know, who uses this weirdness.
+		 * IP_HDRINCL is much more convenient.
+		 */
+	} else {
+		err = -EDESTADDRREQ;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+		daddr = inet->inet_daddr;
+	}
+
+	ipc.addr = inet->inet_saddr;
+	ipc.opt = NULL;
+	ipc.tx_flags = 0;
+	ipc.oif = sk->sk_bound_dev_if;
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		if (err)
+			goto out;
+		if (ipc.opt)
+			free = 1;
+	}
+
+	/* ipc.addr held the source so far; from here on it is the destination. */
+	saddr = ipc.addr;
+	ipc.addr = daddr;
+
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		/* Copy the socket's options to the stack while under RCU. */
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	if (ipc.opt) {
+		err = -EINVAL;
+		/* Linux does not mangle headers on raw sockets,
+		 * so that IP options + IP_HDRINCL is non-sense.
+		 */
+		if (inet->hdrincl)
+			goto done;
+		if (ipc.opt->opt.srr) {
+			if (!daddr)
+				goto done;
+			/* Route towards the first hop of the source route. */
+			daddr = ipc.opt->opt.faddr;
+		}
+	}
+	tos = RT_CONN_FLAGS(sk);
+	if (msg->msg_flags & MSG_DONTROUTE)
+		tos |= RTO_ONLINK;
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+
+	flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+			   RT_SCOPE_UNIVERSE,
+			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+			   FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0);
+
+	if (!inet->hdrincl) {
+		err = raw_probe_proto_opt(&fl4, msg);
+		if (err)
+			goto done;
+	}
+
+	security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;	/* so that ip_rt_put() below sees no route */
+		goto done;
+	}
+
+	err = -EACCES;
+	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
+		goto done;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (inet->hdrincl) {
+		err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
+				      &rt, msg->msg_flags);
+	} else {
+		if (!ipc.addr)
+			ipc.addr = fl4.daddr;
+		lock_sock(sk);
+		err = ip_append_data(sk, &fl4, ip_generic_getfrag,
+				     msg->msg_iov, len, 0,
+				     &ipc, &rt, msg->msg_flags);
+		if (err)
+			ip_flush_pending_frames(sk);
+		else if (!(msg->msg_flags & MSG_MORE)) {
+			err = ip_push_pending_frames(sk, &fl4);
+			/* -ENOBUFS is only reported when recverr is enabled. */
+			if (err == -ENOBUFS && !inet->recverr)
+				err = 0;
+		}
+		release_sock(sk);
+	}
+done:
+	if (free)
+		kfree(ipc.opt);
+	ip_rt_put(rt);
+
+out:
+	if (err < 0)
+		return err;
+	return len;
+
+do_confirm:
+	dst_confirm(&rt->dst);
+	/* A zero-length MSG_PROBE only confirms the route; nothing is sent. */
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto done;
+}
+
+/*
+ * Close handler for raw sockets: calls ip_ra_control(sk, 0, NULL) and then
+ * releases the socket through sk_common_release().  @timeout is unused.
+ */
+static void raw_close(struct sock *sk, long timeout)
+{
+	/*
+	 * Raw sockets may have direct kernel references. Kill them.
+	 */
+	ip_ra_control(sk, 0, NULL);
+
+	sk_common_release(sk);
+}
+
+/*
+ * Destroy handler: flush the socket's pending frames while holding the
+ * socket lock.
+ */
+static void raw_destroy(struct sock *sk)
+{
+	lock_sock(sk);
+	ip_flush_pending_frames(sk);
+	release_sock(sk);
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+/*
+ * Bind a raw socket to a local IPv4 address.
+ *
+ * Returns 0 on success, -EINVAL if the socket is not in TCP_CLOSE state or
+ * the address is too short, and -EADDRNOTAVAIL if a non-zero address is
+ * neither local, multicast nor broadcast.
+ */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
+	struct inet_sock *isk = inet_sk(sk);
+	int addr_type;
+
+	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	addr_type = inet_addr_type(sock_net(sk), sin->sin_addr.s_addr);
+	if (sin->sin_addr.s_addr && addr_type != RTN_LOCAL &&
+	    addr_type != RTN_MULTICAST && addr_type != RTN_BROADCAST)
+		return -EADDRNOTAVAIL;
+
+	isk->inet_rcv_saddr = isk->inet_saddr = sin->sin_addr.s_addr;
+	/* For multicast/broadcast the source is left to the device. */
+	if (addr_type == RTN_MULTICAST || addr_type == RTN_BROADCAST)
+		isk->inet_saddr = 0;
+	sk_dst_reset(sk);
+	return 0;
+}
+
+/*
+ *	This should be easy, if there is something there
+ *	we return it, otherwise we block.
+ */
+
+/*
+ * Receive one datagram from a raw socket.
+ *
+ * Returns the number of bytes copied (or the full datagram length when
+ * MSG_TRUNC was passed in @flags), or a negative errno.  MSG_OOB is
+ * rejected with -EOPNOTSUPP; MSG_ERRQUEUE is served by ip_recv_error().
+ */
+static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t len, int noblock, int flags, int *addr_len)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	size_t copied;
+	int err = -EOPNOTSUPP;
+
+	if (flags & MSG_OOB)
+		return err;
+
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE)
+		return ip_recv_error(sk, msg, len);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		return err;
+
+	/* Clamp to the caller's buffer and flag the truncation. */
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (!err) {
+		sock_recv_ts_and_drops(msg, sk, skb);
+
+		/* Report the sender; raw sockets have no port. */
+		if (sin) {
+			sin->sin_family = AF_INET;
+			sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+			sin->sin_port = 0;
+			memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		}
+		if (inet->cmsg_flags)
+			ip_cmsg_recv(msg, skb);
+		/* With MSG_TRUNC the caller wants the real datagram size. */
+		if (flags & MSG_TRUNC)
+			copied = skb->len;
+	}
+
+	skb_free_datagram(sk, skb);
+	if (err)
+		return err;
+	return copied;
+}
+
+/* Socket init hook: ICMP raw sockets start with a zeroed filter. */
+static int raw_init(struct sock *sk)
+{
+	if (inet_sk(sk)->inet_num == IPPROTO_ICMP) {
+		struct raw_sock *rsk = raw_sk(sk);
+
+		memset(&rsk->filter, 0, sizeof(rsk->filter));
+	}
+	return 0;
+}
+
+/*
+ * Copy at most sizeof(struct icmp_filter) bytes from user space into the
+ * socket's filter.  Returns 0 or -EFAULT.
+ */
+static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+{
+	struct raw_sock *rsk = raw_sk(sk);
+
+	if (optlen > sizeof(struct icmp_filter))
+		optlen = sizeof(struct icmp_filter);
+
+	return copy_from_user(&rsk->filter, optval, optlen) ? -EFAULT : 0;
+}
+
+/*
+ * Copy the socket's filter to user space.  The length read from @optlen is
+ * clamped to sizeof(struct icmp_filter) and written back.
+ *
+ * Returns 0, -EINVAL for a negative length, or -EFAULT on a failed
+ * user-space access.
+ */
+static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
+{
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+	if (len > sizeof(struct icmp_filter))
+		len = sizeof(struct icmp_filter);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &raw_sk(sk)->filter, len))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Raw-level setsockopt: only ICMP_FILTER is known, and only on sockets
+ * whose inet_num is IPPROTO_ICMP.
+ */
+static int do_raw_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	if (optname != ICMP_FILTER)
+		return -ENOPROTOOPT;
+	if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
+		return -EOPNOTSUPP;
+
+	return raw_seticmpfilter(sk, optval, optlen);
+}
+
+/* SOL_RAW options are handled here; every other level goes to ip_setsockopt(). */
+static int raw_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_RAW)
+		return do_raw_setsockopt(sk, level, optname, optval, optlen);
+
+	return ip_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+/* Compat variant: non-SOL_RAW levels are passed to compat_ip_setsockopt(). */
+static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
+				 char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_RAW)
+		return do_raw_setsockopt(sk, level, optname, optval, optlen);
+
+	return compat_ip_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+/*
+ * Raw-level getsockopt: only ICMP_FILTER is known, and only on sockets
+ * whose inet_num is IPPROTO_ICMP.
+ */
+static int do_raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (optname != ICMP_FILTER)
+		return -ENOPROTOOPT;
+	if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
+		return -EOPNOTSUPP;
+
+	return raw_geticmpfilter(sk, optval, optlen);
+}
+
+/* SOL_RAW options are handled here; every other level goes to ip_getsockopt(). */
+static int raw_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level == SOL_RAW)
+		return do_raw_getsockopt(sk, level, optname, optval, optlen);
+
+	return ip_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+/* Compat variant: non-SOL_RAW levels are passed to compat_ip_getsockopt(). */
+static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
+				 char __user *optval, int __user *optlen)
+{
+	if (level == SOL_RAW)
+		return do_raw_getsockopt(sk, level, optname, optval, optlen);
+
+	return compat_ip_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+/*
+ * ioctl handler for raw sockets.
+ *
+ * SIOCOUTQ reports sk_wmem_alloc_get(sk); SIOCINQ reports the length of the
+ * first skb on the receive queue (0 if the queue is empty).  Anything else
+ * goes to ipmr_ioctl() when CONFIG_IP_MROUTE is set, otherwise
+ * -ENOIOCTLCMD is returned.
+ */
+static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct sk_buff *head;
+	int amount;
+
+	switch (cmd) {
+	case SIOCOUTQ:
+		amount = sk_wmem_alloc_get(sk);
+		return put_user(amount, (int __user *)arg);
+
+	case SIOCINQ:
+		amount = 0;
+		/* Peek under the queue lock so the skb cannot go away. */
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		head = skb_peek(&sk->sk_receive_queue);
+		if (head != NULL)
+			amount = head->len;
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		return put_user(amount, (int __user *)arg);
+
+	default:
+#ifdef CONFIG_IP_MROUTE
+		return ipmr_ioctl(sk, cmd, (void __user *)arg);
+#else
+		return -ENOIOCTLCMD;
+#endif
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl handler.  SIOCOUTQ and SIOCINQ return -ENOIOCTLCMD here;
+ * other commands go to ipmr_compat_ioctl() when CONFIG_IP_MROUTE is set.
+ */
+static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == SIOCOUTQ || cmd == SIOCINQ)
+		return -ENOIOCTLCMD;
+
+#ifdef CONFIG_IP_MROUTE
+	return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+	return -ENOIOCTLCMD;
+#endif
+}
+#endif
+
+/*
+ * Protocol operations for raw IPv4 sockets.  connect and disconnect reuse
+ * ip4_datagram_connect() and udp_disconnect(); the compat hooks are only
+ * present with CONFIG_COMPAT.
+ */
+struct proto raw_prot = {
+	.name		   = "RAW",
+	.owner		   = THIS_MODULE,
+	.close		   = raw_close,
+	.destroy	   = raw_destroy,
+	.connect	   = ip4_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = raw_ioctl,
+	.init		   = raw_init,
+	.setsockopt	   = raw_setsockopt,
+	.getsockopt	   = raw_getsockopt,
+	.sendmsg	   = raw_sendmsg,
+	.recvmsg	   = raw_recvmsg,
+	.bind		   = raw_bind,
+	.backlog_rcv	   = raw_rcv_skb,
+	.hash		   = raw_hash_sk,
+	.unhash		   = raw_unhash_sk,
+	.obj_size	   = sizeof(struct raw_sock),
+	.h.raw_hash	   = &raw_v4_hashinfo,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_raw_setsockopt,
+	.compat_getsockopt = compat_raw_getsockopt,
+	.compat_ioctl	   = compat_raw_ioctl,
+#endif
+};
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Find the first socket in the hash table that belongs to the seq_file's
+ * network namespace.  state->bucket is left at the bucket of the match, or
+ * at RAW_HTABLE_SIZE when nothing matches and NULL is returned.
+ */
+static struct sock *raw_get_first(struct seq_file *seq)
+{
+	struct raw_iter_state *state = raw_seq_private(seq);
+	struct sock *sk;
+
+	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+			++state->bucket) {
+		struct hlist_node *node;
+
+		sk_for_each(sk, node, &state->h->ht[state->bucket])
+			if (sock_net(sk) == seq_file_net(seq))
+				return sk;
+	}
+	return NULL;
+}
+
+/*
+ * Advance from @sk to the next socket in the seq_file's network namespace,
+ * moving on to later hash buckets once the current chain is exhausted.
+ * Returns NULL after the last bucket.
+ */
+static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct raw_iter_state *state = raw_seq_private(seq);
+
+	sk = sk_next(sk);
+	for (;;) {
+		/* Skip sockets that live in another namespace. */
+		while (sk && sock_net(sk) != seq_file_net(seq))
+			sk = sk_next(sk);
+
+		/* Done on a match, or when no bucket is left to scan. */
+		if (sk || ++state->bucket >= RAW_HTABLE_SIZE)
+			break;
+
+		sk = sk_head(&state->h->ht[state->bucket]);
+	}
+	return sk;
+}
+
+/* Return the socket @pos steps after the first one, or NULL if there are fewer. */
+static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk;
+
+	for (sk = raw_get_first(seq); sk && pos; --pos)
+		sk = raw_get_next(seq, sk);
+
+	return pos ? NULL : sk;
+}
+
+/*
+ * seq_file start: take the hash table read lock (dropped in
+ * raw_seq_stop()) and position the iterator at *pos.
+ */
+void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct raw_iter_state *state = raw_seq_private(seq);
+
+	read_lock(&state->h->lock);
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	return raw_get_idx(seq, *pos - 1);
+}
+EXPORT_SYMBOL_GPL(raw_seq_start);
+
+/* seq_file next: step to the following socket and bump *pos. */
+void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk = (v == SEQ_START_TOKEN) ? raw_get_first(seq)
+						 : raw_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+EXPORT_SYMBOL_GPL(raw_seq_next);
+
+/* seq_file stop: drop the hash table read lock taken in raw_seq_start(). */
+void raw_seq_stop(struct seq_file *seq, void *v)
+{
+	struct raw_iter_state *state = raw_seq_private(seq);
+
+	read_unlock(&state->h->lock);
+}
+EXPORT_SYMBOL_GPL(raw_seq_stop);
+
+/*
+ * Emit one line describing socket @sp; @i is printed as the leading slot
+ * number.  The remote port column is always 0 and the local port column
+ * carries inet_num.
+ */
+static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
+{
+	struct inet_sock *isk = inet_sk(sp);
+	__be32 remote = isk->inet_daddr;
+	__be32 local = isk->inet_rcv_saddr;
+	__u16 remote_port = 0;
+	__u16 local_port = isk->inet_num;
+
+	seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
+		i, local, local_port, remote, remote_port, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		sk_rmem_alloc_get(sp),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+}
+
+/* seq_file show: column header for the start token, otherwise one socket line. */
+static int raw_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN) {
+		raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
+		return 0;
+	}
+
+	seq_printf(seq, "  sl  local_address rem_address   st tx_queue "
+			"rx_queue tr tm->when retrnsmt   uid  timeout "
+			"inode ref pointer drops\n");
+	return 0;
+}
+
+/* Iterator callbacks used by raw_v4_seq_open(). */
+static const struct seq_operations raw_seq_ops = {
+	.start = raw_seq_start,
+	.next  = raw_seq_next,
+	.stop  = raw_seq_stop,
+	.show  = raw_seq_show,
+};
+
+/*
+ * Open a per-namespace seq_file iterating over hash table @h with @ops.
+ * Returns 0, or the negative error from seq_open_net().
+ */
+int raw_seq_open(struct inode *ino, struct file *file,
+		 struct raw_hashinfo *h, const struct seq_operations *ops)
+{
+	struct raw_iter_state *state;
+	int err = seq_open_net(ino, file, ops, sizeof(struct raw_iter_state));
+
+	if (err < 0)
+		return err;
+
+	/* Remember which hash table this iterator walks. */
+	state = raw_seq_private((struct seq_file *)file->private_data);
+	state->h = h;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(raw_seq_open);
+
+/* Open handler: iterate raw_v4_hashinfo with raw_seq_ops. */
+static int raw_v4_seq_open(struct inode *inode, struct file *file)
+{
+	return raw_seq_open(inode, file, &raw_v4_hashinfo, &raw_seq_ops);
+}
+
+/* File operations for the "raw" proc entry created in raw_init_net(). */
+static const struct file_operations raw_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = raw_v4_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_net,
+};
+
+/* Per-namespace init: create the "raw" proc entry; -ENOMEM if that fails. */
+static __net_init int raw_init_net(struct net *net)
+{
+	return proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops) ?
+		0 : -ENOMEM;
+}
+
+/* Per-namespace exit: remove the "raw" proc entry. */
+static __net_exit void raw_exit_net(struct net *net)
+{
+	proc_net_remove(net, "raw");
+}
+
+/* Per-namespace hooks registered by raw_proc_init(). */
+static __net_initdata struct pernet_operations raw_net_ops = {
+	.init = raw_init_net,
+	.exit = raw_exit_net,
+};
+
+/* Register the per-namespace proc hooks; returns register_pernet_subsys()'s result. */
+int __init raw_proc_init(void)
+{
+	return register_pernet_subsys(&raw_net_ops);
+}
+
+/*
+ * Unregister the per-namespace proc hooks.
+ * NOTE(review): annotated __init although it is a teardown helper;
+ * presumably only called from init-time error unwinding - confirm callers.
+ */
+void __init raw_proc_exit(void)
+{
+	unregister_pernet_subsys(&raw_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 00000000..6b95f74a
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,3471 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		ROUTE - implementation of the IP router.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *		Alan Cox	:	Verify area fixes.
+ *		Alan Cox	:	cli() protects routing changes
+ *		Rui Oliveira	:	ICMP routing table updates
+ *		(rco@di.uminho.pt)	Routing table insertion and update
+ *		Linus Torvalds	:	Rewrote bits to be sensible
+ *		Alan Cox	:	Added BSD route gw semantics
+ *		Alan Cox	:	Super /proc >4K
+ *		Alan Cox	:	MTU in route table
+ *		Alan Cox	: 	MSS actually. Also added the window
+ *					clamper.
+ *		Sam Lantinga	:	Fixed route matching in rt_del()
+ *		Alan Cox	:	Routing cache support.
+ *		Alan Cox	:	Removed compatibility cruft.
+ *		Alan Cox	:	RTF_REJECT support.
+ *		Alan Cox	:	TCP irtt support.
+ *		Jonathan Naylor	:	Added Metric support.
+ *	Miquel van Smoorenburg	:	BSD API fixes.
+ *	Miquel van Smoorenburg	:	Metrics.
+ *		Alan Cox	:	Use __u32 properly
+ *		Alan Cox	:	Aligned routing errors more closely with BSD
+ *					our system is still very different.
+ *		Alan Cox	:	Faster /proc handling
+ *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
+ *					routing caches and better behaviour.
+ *
+ *		Olaf Erb	:	irtt wasn't being copied right.
+ *		Bjorn Ekwall	:	Kerneld route support.
+ *		Alan Cox	:	Multicast fixed (I hope)
+ * 		Pavel Krauz	:	Limited broadcast fixed
+ *		Mike McLagan	:	Routing by source
+ *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
+ *					route.c and rewritten from scratch.
+ *		Andi Kleen	:	Load-limit warning messages.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
+ *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
+ *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
+ *		Marc Boucher	:	routing by fwmark
+ *	Robert Olsson		:	Added rt_cache statistics
+ *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
+ * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
+ * 	Ilia Sotnikov		:	Removed TOS from hash calculations
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/netevent.h>
+#include <net/rtnetlink.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <net/secure_seq.h>
+
+/*
+ * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus RTO_ONLINK)
+ * from a flow key.  The argument is parenthesised so that any pointer
+ * expression, not just a plain identifier, can be passed safely.
+ */
+#define RT_FL_TOS(oldflp4) \
+    ((u32)((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+
+#define IP_MAX_MTU	0xFFF0
+
+#define RT_GC_TIMEOUT (300*HZ)
+
+static int ip_rt_max_size;
+static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
+static int ip_rt_redirect_number __read_mostly	= 9;
+static int ip_rt_redirect_load __read_mostly	= HZ / 50;
+static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost __read_mostly	= HZ;
+static int ip_rt_error_burst __read_mostly	= 5 * HZ;
+static int ip_rt_gc_elasticity __read_mostly	= 8;
+static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
+static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
+static int ip_rt_min_advmss __read_mostly	= 256;
+static int rt_chain_length_max __read_mostly	= 20;
+
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
+
+/*
+ *	Interface to generic destination cache.
+ */
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
+static void		 ipv4_dst_destroy(struct dst_entry *dst);
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void		 ipv4_link_failure(struct sk_buff *skb);
+static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+static int rt_garbage_collect(struct dst_ops *ops);
+
+/* ifdown hook for ipv4_dst_ops: intentionally does nothing. */
+static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			    int how)
+{
+}
+
+/*
+ * cow_metrics hook: switch the route's metrics pointer from @old to the
+ * metrics array stored in the route's inet_peer.
+ *
+ * Returns the metrics array now in effect, or NULL when no peer could be
+ * bound or when a concurrent update installed a read-only pointer.
+ */
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+	u32 *p = NULL;
+
+	/* Try to attach a peer if the route does not have one yet. */
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+
+	peer = rt->peer;
+	if (peer) {
+		u32 *old_p = __DST_METRICS_PTR(old);
+		unsigned long prev, new;
+
+		p = peer->metrics;
+		/* Seed the peer's metrics from the old values when new. */
+		if (inet_metrics_new(peer))
+			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+		/* Install the new pointer only if nobody changed it meanwhile. */
+		new = (unsigned long) p;
+		prev = cmpxchg(&dst->_metrics, old, new);
+
+		if (prev != old) {
+			/* Lost the race: use whatever pointer is there now. */
+			p = __DST_METRICS_PTR(prev);
+			if (prev & DST_METRICS_READ_ONLY)
+				p = NULL;
+		} else {
+			/* Won the race: drop the reference on rt->fi. */
+			if (rt->fi) {
+				fib_info_put(rt->fi);
+				rt->fi = NULL;
+			}
+		}
+	}
+	return p;
+}
+
+/* Destination-cache operations for IPv4 routes. */
+static struct dst_ops ipv4_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		cpu_to_be16(ETH_P_IP),
+	.gc =			rt_garbage_collect,
+	.check =		ipv4_dst_check,
+	.default_advmss =	ipv4_default_advmss,
+	.default_mtu =		ipv4_default_mtu,
+	.cow_metrics =		ipv4_cow_metrics,
+	.destroy =		ipv4_dst_destroy,
+	.ifdown =		ipv4_dst_ifdown,
+	.negative_advice =	ipv4_negative_advice,
+	.link_failure =		ipv4_link_failure,
+	.update_pmtu =		ip_rt_update_pmtu,
+	.local_out =		__ip_local_out,
+};
+
+/* ECN_OR_COST(x) expands to TC_PRIO_x, i.e. the same value as the plain entry. */
+#define ECN_OR_COST(class)	TC_PRIO_##class
+
+/*
+ * 16-entry table of TC_PRIO_* priorities.  Entries come in pairs: each
+ * odd slot (written with ECN_OR_COST) holds the same priority as the even
+ * slot before it.
+ */
+const __u8 ip_tos2prio[16] = {
+	TC_PRIO_BESTEFFORT,
+	ECN_OR_COST(BESTEFFORT),
+	TC_PRIO_BESTEFFORT,
+	ECN_OR_COST(BESTEFFORT),
+	TC_PRIO_BULK,
+	ECN_OR_COST(BULK),
+	TC_PRIO_BULK,
+	ECN_OR_COST(BULK),
+	TC_PRIO_INTERACTIVE,
+	ECN_OR_COST(INTERACTIVE),
+	TC_PRIO_INTERACTIVE,
+	ECN_OR_COST(INTERACTIVE),
+	TC_PRIO_INTERACTIVE_BULK,
+	ECN_OR_COST(INTERACTIVE_BULK),
+	TC_PRIO_INTERACTIVE_BULK,
+	ECN_OR_COST(INTERACTIVE_BULK)
+};
+
+
+/*
+ * Route cache.
+ */
+
+/* The locking scheme is rather straightforward:
+ *
+ * 1) Read-Copy Update protects the buckets of the central route hash.
+ * 2) Only writers remove entries, and they hold the lock
+ *    as they look at rtable reference counts.
+ * 3) Only readers acquire references to rtable entries,
+ *    they do so with atomic increments and with the
+ *    lock held.
+ */
+
+/* One slot of the route cache hash table: head of an RCU-annotated chain. */
+struct rt_hash_bucket {
+	struct rtable __rcu	*chain;
+};
+
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
+	defined(CONFIG_PROVE_LOCKING)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
+ * The size of this table is a power of two and depends on the number of CPUs.
+ * (With lockdep, spinlock_t is quite big, so keep the table small there.)
+ */
+#ifdef CONFIG_LOCKDEP
+# define RT_HASH_LOCK_SZ	256
+#else
+# if NR_CPUS >= 32
+#  define RT_HASH_LOCK_SZ	4096
+# elif NR_CPUS >= 16
+#  define RT_HASH_LOCK_SZ	2048
+# elif NR_CPUS >= 8
+#  define RT_HASH_LOCK_SZ	1024
+# elif NR_CPUS >= 4
+#  define RT_HASH_LOCK_SZ	512
+# else
+#  define RT_HASH_LOCK_SZ	256
+# endif
+#endif
+
+static spinlock_t	*rt_hash_locks;
+/*
+ * Map a hash slot to its spinlock; RT_HASH_LOCK_SZ is a power of two, so
+ * the mask folds the slot into the lock table.  The expansion is wrapped
+ * in parentheses so it stays one expression wherever it is used.
+ */
+# define rt_hash_lock_addr(slot) (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
+
+/*
+ * Allocate and initialise the table of RT_HASH_LOCK_SZ bucket spinlocks.
+ * Panics if the allocation fails.
+ */
+static __init void rt_hash_lock_init(void)
+{
+	size_t bytes = sizeof(spinlock_t) * RT_HASH_LOCK_SZ;
+	int i;
+
+	rt_hash_locks = kmalloc(bytes, GFP_KERNEL);
+	if (!rt_hash_locks)
+		panic("IP: failed to allocate rt_hash_locks\n");
+
+	i = 0;
+	while (i < RT_HASH_LOCK_SZ) {
+		spin_lock_init(&rt_hash_locks[i]);
+		i++;
+	}
+}
+#else
+# define rt_hash_lock_addr(slot) NULL
+
+/* No lock table in this configuration, so there is nothing to set up. */
+static inline void rt_hash_lock_init(void)
+{
+}
+#endif
+
+static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
+static unsigned			rt_hash_mask __read_mostly;
+static unsigned int		rt_hash_log  __read_mostly;
+
+static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
+
+/*
+ * Hash (daddr, saddr, idx), seeded with genid, into a route cache bucket
+ * index in the range 0..rt_hash_mask.
+ */
+static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
+				   int genid)
+{
+	u32 dst = (__force u32)daddr;
+	u32 src = (__force u32)saddr;
+
+	return jhash_3words(dst, src, idx, genid) & rt_hash_mask;
+}
+
+/* Read the namespace's current route generation id. */
+static inline int rt_genid(struct net *net)
+{
+	return atomic_read(&net->ipv4.rt_genid);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Private iterator state for the rt_cache seq_file. */
+struct rt_cache_iter_state {
+	struct seq_net_private p;
+	int bucket;	/* hash bucket currently being walked */
+	int genid;	/* generation id sampled in rt_cache_seq_start() */
+};
+
+/*
+ * Find the first cache entry to show, scanning buckets from rt_hash_mask
+ * down to 0.  An entry qualifies when its device belongs to the seq_file's
+ * namespace and its generation id equals st->genid.
+ *
+ * On a match the function returns with rcu_read_lock_bh() still held; the
+ * lock is released again by the iterator (see rt_cache_seq_stop()).
+ * Returns NULL, with no lock held, when nothing matches.
+ */
+static struct rtable *rt_cache_get_first(struct seq_file *seq)
+{
+	struct rt_cache_iter_state *st = seq->private;
+	struct rtable *r = NULL;
+
+	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+		/* Cheap unlocked peek: skip buckets that look empty. */
+		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
+			continue;
+		rcu_read_lock_bh();
+		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
+		while (r) {
+			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
+			    r->rt_genid == st->genid)
+				return r;	/* lock stays held */
+			r = rcu_dereference_bh(r->dst.rt_next);
+		}
+		rcu_read_unlock_bh();
+	}
+	return r;
+}
+
+/*
+ * Step to the entry after @r without any filtering.  Called with
+ * rcu_read_lock_bh() held.  When the current chain ends, the lock is
+ * dropped, lower-numbered buckets are searched for a non-empty chain, and
+ * the lock is retaken before that chain is read.
+ *
+ * Returns NULL, with the lock released, once bucket 0 has been passed.
+ */
+static struct rtable *__rt_cache_get_next(struct seq_file *seq,
+					  struct rtable *r)
+{
+	struct rt_cache_iter_state *st = seq->private;
+
+	r = rcu_dereference_bh(r->dst.rt_next);
+	while (!r) {
+		rcu_read_unlock_bh();
+		do {
+			if (--st->bucket < 0)
+				return NULL;
+		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
+		rcu_read_lock_bh();
+		/* The chain may have emptied since the unlocked peek. */
+		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
+	}
+	return r;
+}
+
+/*
+ * Step to the next entry that belongs to the seq_file's namespace and
+ * carries the generation id recorded in the iterator state.
+ */
+static struct rtable *rt_cache_get_next(struct seq_file *seq,
+					struct rtable *r)
+{
+	struct rt_cache_iter_state *st = seq->private;
+
+	do {
+		r = __rt_cache_get_next(seq, r);
+	} while (r && (dev_net(r->dst.dev) != seq_file_net(seq) ||
+		       r->rt_genid != st->genid));
+
+	return r;
+}
+
+/* Return the entry @pos steps after the first one, or NULL if there are fewer. */
+static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct rtable *r;
+
+	for (r = rt_cache_get_first(seq); r && pos; --pos)
+		r = rt_cache_get_next(seq, r);
+
+	return pos ? NULL : r;
+}
+
+/*
+ * seq_file start.  At position 0 the current generation id is sampled and
+ * the header token returned; otherwise the iterator seeks to entry *pos - 1.
+ */
+static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct rt_cache_iter_state *st = seq->private;
+
+	if (!*pos) {
+		st->genid = rt_genid(seq_file_net(seq));
+		return SEQ_START_TOKEN;
+	}
+	return rt_cache_get_idx(seq, *pos - 1);
+}
+
+/* seq_file next: step to the following cache entry and bump *pos. */
+static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct rtable *r = (v == SEQ_START_TOKEN) ? rt_cache_get_first(seq)
+						  : rt_cache_get_next(seq, v);
+
+	++*pos;
+	return r;
+}
+
+/*
+ * seq_file stop: a real entry means the iterator still holds
+ * rcu_read_lock_bh(), so release it.
+ */
+static void rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v || v == SEQ_START_TOKEN)
+		return;
+
+	rcu_read_unlock_bh();
+}
+
+/*
+ * seq_file show: print the column header for the start token, otherwise
+ * one cache entry.  Every line is padded to 127 characters.
+ */
+static int rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+	struct rtable *r = v;
+	struct neighbour *n;
+	int len, HHUptod;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-127s\n",
+			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
+			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
+			   "HHUptod\tSpecDst");
+		return 0;
+	}
+
+	/* Sample the neighbour state under RCU. */
+	rcu_read_lock();
+	n = dst_get_neighbour(&r->dst);
+	HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
+	rcu_read_unlock();
+
+	/* The trailing %n records how many characters were written. */
+	seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
+		      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+		r->dst.dev ? r->dst.dev->name : "*",
+		(__force u32)r->rt_dst,
+		(__force u32)r->rt_gateway,
+		r->rt_flags, atomic_read(&r->dst.__refcnt),
+		r->dst.__use, 0, (__force u32)r->rt_src,
+		dst_metric_advmss(&r->dst) + 40,
+		dst_metric(&r->dst, RTAX_WINDOW),
+		(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
+		      dst_metric(&r->dst, RTAX_RTTVAR)),
+		r->rt_key_tos,
+		r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
+		HHUptod,
+		r->rt_spec_dst, &len);
+
+	/* Pad the record out to the fixed line width. */
+	seq_printf(seq, "%*s\n", 127 - len, "");
+	return 0;
+}
+
+/* Iterator callbacks for the route cache listing. */
+static const struct seq_operations rt_cache_seq_ops = {
+	.start  = rt_cache_seq_start,
+	.next   = rt_cache_seq_next,
+	.stop   = rt_cache_seq_stop,
+	.show   = rt_cache_seq_show,
+};
+
+/* Open handler: per-namespace seq_file with rt_cache_iter_state as private data. */
+static int rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &rt_cache_seq_ops,
+			sizeof(struct rt_cache_iter_state));
+}
+
+/* File operations for the "rt_cache" entry created under net->proc_net. */
+static const struct file_operations rt_cache_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = rt_cache_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_net,
+};
+
+
+/*
+ * seq_file start for the per-CPU statistics.  Position 0 yields the header
+ * token; position p > 0 yields the stats of the first possible CPU with
+ * index >= p - 1, and *pos is set to that CPU's index + 1.
+ */
+static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	cpu = *pos-1;
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu+1;
+			return &per_cpu(rt_cache_stat, cpu);
+		}
+		++cpu;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file next: return the stats of the first possible CPU with index
+ * >= *pos and set *pos to that CPU's index + 1; NULL when none is left.
+ */
+static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	int cpu = *pos;
+
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu+1;
+			return &per_cpu(rt_cache_stat, cpu);
+		}
+		++cpu;
+	}
+	return NULL;
+}
+
+/* seq_file stop: the per-CPU iterator holds nothing, so there is nothing to release. */
+static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+/*
+ * seq_file show: the column header for the start token, otherwise one line
+ * of hexadecimal counters for a CPU.  The first column is the global entry
+ * count from dst_entries_get_slow(), not a per-CPU value.
+ */
+static int rt_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct rt_cache_stat *st = v;
+
+	if (v != SEQ_START_TOKEN) {
+		seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
+			   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+			   dst_entries_get_slow(&ipv4_dst_ops),
+			   /* input path */
+			   st->in_hit,
+			   st->in_slow_tot,
+			   st->in_slow_mc,
+			   st->in_no_route,
+			   st->in_brd,
+			   st->in_martian_dst,
+			   st->in_martian_src,
+			   /* output path */
+			   st->out_hit,
+			   st->out_slow_tot,
+			   st->out_slow_mc,
+			   /* garbage collection and chain searches */
+			   st->gc_total,
+			   st->gc_ignored,
+			   st->gc_goal_miss,
+			   st->gc_dst_overflow,
+			   st->in_hlist_search,
+			   st->out_hlist_search
+			);
+		return 0;
+	}
+
+	seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+	return 0;
+}
+
+/* Iterator callbacks for the per-CPU route cache statistics. */
+static const struct seq_operations rt_cpu_seq_ops = {
+	.start  = rt_cpu_seq_start,
+	.next   = rt_cpu_seq_next,
+	.stop   = rt_cpu_seq_stop,
+	.show   = rt_cpu_seq_show,
+};
+
+
+/* Open handler: plain seq_file using rt_cpu_seq_ops. */
+static int rt_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rt_cpu_seq_ops);
+}
+
+/* File operations for the "rt_cache" entry created under net->proc_net_stat. */
+static const struct file_operations rt_cpu_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = rt_cpu_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+/*
+ * Sum the 256 per-CPU accounting records over all possible CPUs and write
+ * the totals to the seq_file as raw binary.  Returns 0, or -ENOMEM when
+ * the scratch buffer cannot be allocated.
+ */
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{
+	struct ip_rt_acct *total, *percpu;
+	unsigned int cpu, slot;
+
+	total = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+	if (!total)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		percpu = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, cpu);
+		for (slot = 0; slot < 256; slot++) {
+			total[slot].o_bytes   += percpu[slot].o_bytes;
+			total[slot].o_packets += percpu[slot].o_packets;
+			total[slot].i_bytes   += percpu[slot].i_bytes;
+			total[slot].i_packets += percpu[slot].i_packets;
+		}
+	}
+
+	seq_write(m, total, 256 * sizeof(struct ip_rt_acct));
+	kfree(total);
+	return 0;
+}
+
+/* Open handler: single-record seq_file backed by rt_acct_proc_show(). */
+static int rt_acct_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rt_acct_proc_show, NULL);
+}
+
+/* File operations for the "rt_acct" proc entry. */
+static const struct file_operations rt_acct_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rt_acct_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+/*
+ * Per-namespace init: create "rt_cache" under net->proc_net, "rt_cache"
+ * under net->proc_net_stat and, with CONFIG_IP_ROUTE_CLASSID, "rt_acct"
+ * under net->proc_net.
+ *
+ * Returns 0 on success.  On failure the entries created so far are removed
+ * again and -ENOMEM is returned.
+ */
+static int __net_init ip_rt_do_proc_init(struct net *net)
+{
+	struct proc_dir_entry *pde;
+
+	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
+			&rt_cache_seq_fops);
+	if (!pde)
+		goto err1;
+
+	pde = proc_create("rt_cache", S_IRUGO,
+			  net->proc_net_stat, &rt_cpu_seq_fops);
+	if (!pde)
+		goto err2;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+	if (!pde)
+		goto err3;
+#endif
+	return 0;
+
+	/* Unwind in reverse order of creation. */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+err3:
+	remove_proc_entry("rt_cache", net->proc_net_stat);
+#endif
+err2:
+	remove_proc_entry("rt_cache", net->proc_net);
+err1:
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace exit: remove the proc entries that
+ * ip_rt_do_proc_init() created for @net.
+ */
+static void __net_exit ip_rt_do_proc_exit(struct net *net)
+{
+	remove_proc_entry("rt_cache", net->proc_net_stat);
+	remove_proc_entry("rt_cache", net->proc_net);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	remove_proc_entry("rt_acct", net->proc_net);
+#endif
+}
+
+/* Per-namespace hooks that create and remove the routing proc entries. */
+static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
+	.init = ip_rt_do_proc_init,
+	.exit = ip_rt_do_proc_exit,
+};
+
+/*
+ * Register ip_rt_proc_ops as a pernet subsystem.
+ * Returns the result of register_pernet_subsys().
+ */
+static int __init ip_rt_proc_init(void)
+{
+	return register_pernet_subsys(&ip_rt_proc_ops);
+}
+
+#else
+/* Stub for builds without CONFIG_PROC_FS: nothing to register. */
+static inline int ip_rt_proc_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Hand @rt to call_rcu_bh() with dst_rcu_free as the callback, so the
+ * actual free is deferred rather than done synchronously here.
+ */
+static inline void rt_free(struct rtable *rt)
+{
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+/*
+ * Release the caller's reference on @rt with ip_rt_put() and then queue
+ * the entry for deferred freeing, exactly as rt_free() does.
+ */
+static inline void rt_drop(struct rtable *rt)
+{
+	ip_rt_put(rt);
+	rt_free(rt);
+}
+
+/*
+ * Kill broadcast/multicast entries very aggressively if they collide
+ * in the hash table with more useful entries.
+ *
+ * Returns non-zero only for a broadcast or multicast input route that
+ * has at least one successor on its hash chain.
+ */
+static inline int rt_fast_clean(struct rtable *rth)
+{
+	if (!(rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)))
+		return 0;
+	if (!rt_is_input_route(rth))
+		return 0;
+	return rth->dst.rt_next ? 1 : 0;
+}
+
+/*
+ * An entry is "valuable" when it carries RTCF_REDIRECTED or RTCF_NOTIFY,
+ * or when its inet_peer has a non-zero pmtu_expires.
+ */
+static inline int rt_valuable(struct rtable *rth)
+{
+	if (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY))
+		return 1;
+
+	return rth->peer && rth->peer->pmtu_expires;
+}
+
+/*
+ * Decide whether @rth may be reclaimed.
+ *
+ * @tmo1: idle time up to which an entry is kept unless rt_fast_clean()
+ *        singles it out.
+ * @tmo2: idle time up to which a valuable entry (rt_valuable()) is kept.
+ *
+ * Returns 0 if the entry is still referenced or protected by one of the
+ * two timeouts, 1 if it may be expired.
+ */
+static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+{
+	unsigned long idle;
+
+	/* Never reclaim an entry somebody still holds. */
+	if (atomic_read(&rth->dst.__refcnt))
+		return 0;
+
+	idle = jiffies - rth->dst.lastuse;
+
+	if (idle <= tmo1 && !rt_fast_clean(rth))
+		return 0;
+	if (idle <= tmo2 && rt_valuable(rth))
+		return 0;
+
+	return 1;
+}
+
+/* Bits of score are:
+ * 31: very valuable (rt_valuable() is true)
+ * 30: not quite useless (output route, or not broadcast/multicast/local)
+ * 29..0: bitwise inverse of the idle time in jiffies, so a more
+ *        recently used entry scores higher
+ *
+ * rt_intern_hash() evicts the unreferenced entry with the lowest score.
+ *
+ * The masks are built from unsigned constants: 3<<30 and 1<<31 on a
+ * signed int shift into/past the sign bit, which ISO C leaves undefined.
+ */
+static inline u32 rt_score(struct rtable *rt)
+{
+	u32 score = jiffies - rt->dst.lastuse;
+
+	score = ~score & ~(3U << 30);
+
+	if (rt_valuable(rt))
+		score |= (1U << 31);
+
+	if (rt_is_output_route(rt) ||
+	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
+		score |= (1U << 30);
+
+	return score;
+}
+
+/*
+ * Route caching stays enabled for @net as long as the number of
+ * emergency rebuilds performed so far does not exceed the
+ * sysctl_rt_cache_rebuild_count limit.
+ */
+static inline bool rt_caching(const struct net *net)
+{
+	return net->ipv4.current_rt_cache_rebuild_count <=
+		net->ipv4.sysctl_rt_cache_rebuild_count;
+}
+
+/*
+ * True when @rt1 and @rt2 agree on key destination, key source and
+ * rt_route_iif.  The XOR of each pair is OR-ed together so a single
+ * comparison against zero covers all three fields.
+ */
+static inline bool compare_hash_inputs(const struct rtable *rt1,
+				       const struct rtable *rt2)
+{
+	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
+		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
+		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
+}
+
+/*
+ * Non-zero when @rt1 and @rt2 have identical lookup keys: destination,
+ * source, mark, TOS, rt_route_iif and rt_oif.  Uses the same XOR/OR
+ * accumulation as compare_hash_inputs().
+ */
+static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
+{
+	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
+		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
+		(rt1->rt_mark ^ rt2->rt_mark) |
+		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
+		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
+		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
+}
+
+/* Non-zero when the devices of @rt1 and @rt2 are in the same namespace. */
+static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
+{
+	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
+}
+
+/*
+ * An entry is expired when the generation id stored in it no longer
+ * matches the current rt_genid() of its device's namespace.
+ */
+static inline int rt_is_expired(struct rtable *rth)
+{
+	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
+}
+
+/*
+ * Perform a full scan of the hash table and free all matching entries.
+ * Can be called by a softirq or a process.
+ * In the latter case, we want to reschedule if necessary.
+ *
+ * @net: only entries whose device belongs to this namespace are removed;
+ *       NULL removes every entry.
+ * @process_context: non-zero allows cond_resched() between buckets.
+ */
+static void rt_do_flush(struct net *net, int process_context)
+{
+	unsigned int i;
+	struct rtable *rth, *next;
+
+	for (i = 0; i <= rt_hash_mask; i++) {
+		struct rtable __rcu **pprev;
+		struct rtable *list;
+
+		if (process_context && need_resched())
+			cond_resched();
+		/* Unlocked peek: skip empty buckets without taking the lock. */
+		rth = rcu_dereference_raw(rt_hash_table[i].chain);
+		if (!rth)
+			continue;
+
+		spin_lock_bh(rt_hash_lock_addr(i));
+
+		list = NULL;
+		pprev = &rt_hash_table[i].chain;
+		rth = rcu_dereference_protected(*pprev,
+			lockdep_is_held(rt_hash_lock_addr(i)));
+
+		while (rth) {
+			next = rcu_dereference_protected(rth->dst.rt_next,
+				lockdep_is_held(rt_hash_lock_addr(i)));
+
+			if (!net ||
+			    net_eq(dev_net(rth->dst.dev), net)) {
+				/* Unlink from the chain and push onto the private list. */
+				rcu_assign_pointer(*pprev, next);
+				rcu_assign_pointer(rth->dst.rt_next, list);
+				list = rth;
+			} else {
+				pprev = &rth->dst.rt_next;
+			}
+			rth = next;
+		}
+
+		spin_unlock_bh(rt_hash_lock_addr(i));
+
+		/* Free the unlinked entries after the bucket lock is dropped. */
+		for (; list; list = next) {
+			next = rcu_dereference_protected(list->dst.rt_next, 1);
+			rt_free(list);
+		}
+	}
+}
+
+/*
+ * While freeing expired entries, we compute the average chain length
+ * and its standard deviation, using fixed-point arithmetic.
+ * This gives an estimate of rt_chain_length_max:
+ *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
+/*
+ * Walk the hash chain from @head up to (but not including) @rth and
+ * check whether an earlier entry shares rth's hash inputs (it may still
+ * differ in tos, mark or oif).
+ *
+ * Returns 0 if such an alias precedes @rth.
+ * Returns ONE if @rth has no alias before itself.
+ */
+static int has_noalias(const struct rtable *head, const struct rtable *rth)
+{
+	const struct rtable *cur;
+
+	for (cur = head; cur != rth;
+	     cur = rcu_dereference_protected(cur->dst.rt_next, 1)) {
+		if (compare_hash_inputs(cur, rth))
+			return 0;
+	}
+
+	return ONE;
+}
+
+/*
+ * Periodic partial scan of the route cache hash table.
+ *
+ * The number of buckets visited ("goal") is proportional to the time
+ * elapsed since the previous run: (delta << rt_hash_log), divided by
+ * ip_rt_gc_timeout when that is greater than 1, capped at the table
+ * size.  The scan resumes after the bucket remembered in the static
+ * "rover".
+ *
+ * In each bucket an entry is freed when its generation id is stale,
+ * when dst.expires is set and has passed, or (with no dst.expires)
+ * when rt_may_expire() allows it.  The chain lengths seen are then used
+ * to recompute rt_chain_length_max.
+ */
+static void rt_check_expire(void)
+{
+	static unsigned int rover;
+	unsigned int i = rover, goal;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
+	unsigned long samples = 0;
+	unsigned long sum = 0, sum2 = 0;
+	unsigned long delta;
+	u64 mult;
+
+	delta = jiffies - expires_ljiffies;
+	expires_ljiffies = jiffies;
+	mult = ((u64)delta) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask)
+		goal = rt_hash_mask + 1;
+	for (; goal > 0; goal--) {
+		/* Halved for every entry kept, so later entries age out sooner. */
+		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
+
+		i = (i + 1) & rt_hash_mask;
+		rthp = &rt_hash_table[i].chain;
+
+		if (need_resched())
+			cond_resched();
+
+		samples++;
+
+		/* Empty buckets count as a sample of length 0. */
+		if (rcu_dereference_raw(*rthp) == NULL)
+			continue;
+		length = 0;
+		spin_lock_bh(rt_hash_lock_addr(i));
+		while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+			prefetch(rth->dst.rt_next);
+			if (rt_is_expired(rth)) {
+				*rthp = rth->dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
+			if (rth->dst.expires) {
+				/* Entry is expired even if it is in use */
+				if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+					tmo >>= 1;
+					rthp = &rth->dst.rt_next;
+					/*
+					 * We only count entries on
+					 * a chain with equal hash inputs once
+					 * so that entries for different QOS
+					 * levels, and other non-hash input
+					 * attributes don't unfairly skew
+					 * the length computation
+					 */
+					length += has_noalias(rt_hash_table[i].chain, rth);
+					continue;
+				}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
+
+			/* Cleanup aged off entries. */
+			*rthp = rth->dst.rt_next;
+			rt_free(rth);
+		}
+		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		/* length is in ONE units, hence the >> FRACT_BITS below. */
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
+	}
+	rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * It calls rt_check_expire() to scan part of the hash table, then
+ * re-queues expires_work to run again after ip_rt_gc_interval.
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+	rt_check_expire();
+	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
+/*
+ * Perturbation of rt_genid by a small quantity [1..256].
+ * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
+ * many times (2^24) without giving a recent rt_genid.
+ * Jenkins hash is strong enough that little changes of rt_genid are OK.
+ *
+ * Once the genid has changed, rt_is_expired() reports every entry that
+ * still carries the old value as expired.
+ */
+static void rt_cache_invalidate(struct net *net)
+{
+	unsigned char shuffle;
+
+	get_random_bytes(&shuffle, sizeof(shuffle));
+	/* shuffle is 0..255, so the increment is 1..256 and never zero. */
+	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
+}
+
+/*
+ * Invalidate the route cache of @net, optionally flushing it as well.
+ *
+ * delay < 0  : invalidate cache (fast : entries will be deleted later)
+ * delay >= 0 : invalidate & flush cache (can be long)
+ *
+ * Only the sign of @delay is looked at.  The flush may reschedule when
+ * not called from softirq context.
+ */
+void rt_cache_flush(struct net *net, int delay)
+{
+	rt_cache_invalidate(net);
+	if (delay >= 0)
+		rt_do_flush(net, !in_softirq());
+}
+
+/*
+ * Flush previously invalidated entries from the cache.
+ * Unlike rt_cache_flush() this does not change rt_genid; it only runs
+ * rt_do_flush() for @net.
+ */
+void rt_cache_flush_batch(struct net *net)
+{
+	rt_do_flush(net, !in_softirq());
+}
+
+/*
+ * Called when a hash chain has grown too long: log a rate-limited
+ * warning and invalidate the cache of @net so that a new rt_genid is
+ * used from now on.
+ */
+static void rt_emergency_hash_rebuild(struct net *net)
+{
+	if (net_ratelimit())
+		printk(KERN_WARNING "Route hash chain too long!\n");
+	rt_cache_invalidate(net);
+}
+
+/*
+   Short description of GC goals.
+
+   We want to build an algorithm which keeps the routing cache
+   at some equilibrium point, where the number of aged-off entries
+   is kept approximately equal to the number of newly generated ones.
+
+   The current expiration strength is the variable "expire".
+   We try to adjust it dynamically, so that if networking
+   is idle, expire is large enough to keep enough warm entries,
+   and when load increases, it is reduced to limit the cache size.
+ */
+
+/*
+ * Garbage-collect the route cache.
+ *
+ * @ops is not referenced; the function works on ipv4_dst_ops directly.
+ *
+ * State kept across calls (function-static):
+ *   expire      - current idle timeout handed to rt_may_expire()
+ *   last_gc     - jiffies of the last run that was not rate-limited
+ *   rover       - bucket where the previous scan stopped
+ *   equilibrium - entry count the collector currently aims for
+ *
+ * Returns 1 if the cache still holds at least ip_rt_max_size entries
+ * after collecting ("dst cache overflow"), 0 otherwise.
+ */
+static int rt_garbage_collect(struct dst_ops *ops)
+{
+	static unsigned long expire = RT_GC_TIMEOUT;
+	static unsigned long last_gc;
+	static int rover;
+	static int equilibrium;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
+	unsigned long now = jiffies;
+	int goal;
+	int entries = dst_entries_get_fast(&ipv4_dst_ops);
+
+	/*
+	 * Garbage collection is pretty expensive,
+	 * do not make it too frequently.
+	 */
+
+	RT_CACHE_STAT_INC(gc_total);
+
+	if (now - last_gc < ip_rt_gc_min_interval &&
+	    entries < ip_rt_max_size) {
+		RT_CACHE_STAT_INC(gc_ignored);
+		goto out;
+	}
+
+	entries = dst_entries_get_slow(&ipv4_dst_ops);
+	/* Calculate number of entries, which we want to expire now. */
+	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
+	if (goal <= 0) {
+		if (equilibrium < ipv4_dst_ops.gc_thresh)
+			equilibrium = ipv4_dst_ops.gc_thresh;
+		goal = entries - equilibrium;
+		if (goal > 0) {
+			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
+			goal = entries - equilibrium;
+		}
+	} else {
+		/* We are in dangerous area. Try to reduce cache really
+		 * aggressively.
+		 */
+		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
+		equilibrium = entries - goal;
+	}
+
+	if (now - last_gc >= ip_rt_gc_min_interval)
+		last_gc = now;
+
+	if (goal <= 0) {
+		equilibrium += goal;
+		goto work_done;
+	}
+
+	do {
+		int i, k;
+
+		/* One pass over at most the whole table, starting after rover. */
+		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
+			unsigned long tmo = expire;
+
+			k = (k + 1) & rt_hash_mask;
+			rthp = &rt_hash_table[k].chain;
+			spin_lock_bh(rt_hash_lock_addr(k));
+			while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
+				if (!rt_is_expired(rth) &&
+					!rt_may_expire(rth, tmo, expire)) {
+					/* Kept: halve the timeout for the rest of the chain. */
+					tmo >>= 1;
+					rthp = &rth->dst.rt_next;
+					continue;
+				}
+				*rthp = rth->dst.rt_next;
+				rt_free(rth);
+				goal--;
+			}
+			spin_unlock_bh(rt_hash_lock_addr(k));
+			if (goal <= 0)
+				break;
+		}
+		rover = k;
+
+		if (goal <= 0)
+			goto work_done;
+
+		/* Goal is not achieved. We stop process if:
+
+		   - if expire reduced to zero. Otherwise, expire is halfed.
+		   - if table is not full.
+		   - if we are called from interrupt.
+		   - jiffies check is just fallback/debug loop breaker.
+		     We will not spin here for long time in any case.
+		 */
+
+		RT_CACHE_STAT_INC(gc_goal_miss);
+
+		if (expire == 0)
+			break;
+
+		expire >>= 1;
+
+		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+			goto out;
+	} while (!in_softirq() && time_before_eq(jiffies, now));
+
+	/* Cheap estimate first; take the slow count only if it still looks full. */
+	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+		goto out;
+	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
+		goto out;
+	if (net_ratelimit())
+		printk(KERN_WARNING "dst cache overflow\n");
+	RT_CACHE_STAT_INC(gc_dst_overflow);
+	return 1;
+
+work_done:
+	/* Goal met: relax the timeout again, capped at ip_rt_gc_timeout. */
+	expire += ip_rt_gc_min_interval;
+	if (expire > ip_rt_gc_timeout ||
+	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
+		expire = ip_rt_gc_timeout;
+out:	return 0;
+}
+
+/*
+ * Count the entries of the chain starting at @head that have distinct
+ * hash inputs: every entry contributes has_noalias() (ONE or 0) and the
+ * fixed-point total is scaled back down by FRACT_BITS.
+ */
+static int slow_chain_length(const struct rtable *head)
+{
+	const struct rtable *cur;
+	int total = 0;
+
+	for (cur = head; cur;
+	     cur = rcu_dereference_protected(cur->dst.rt_next, 1))
+		total += has_noalias(head, cur);
+
+	return total >> FRACT_BITS;
+}
+
+/*
+ * Insert the freshly built route @rt into hash bucket @hash.
+ *
+ * @hash:    bucket index for @rt
+ * @rt:      new route; the caller holds a reference on it
+ * @skb:     if non-NULL, its dst is set to the route that is returned
+ * @ifindex: used to recompute @hash after an emergency rebuild changed
+ *           rt_genid
+ *
+ * Returns:
+ *   - an already cached entry with the same keys and namespace (it is
+ *     moved to the chain head and @rt is dropped),
+ *   - @rt itself, either inserted at the chain head or, when caching is
+ *     disabled, left unhashed with DST_NOCACHE set,
+ *   - ERR_PTR() if binding the neighbour failed; @rt is released.
+ *
+ * While walking the chain, expired entries are freed, and the
+ * unreferenced entry with the lowest rt_score() becomes the eviction
+ * candidate.
+ */
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
+				     struct sk_buff *skb, int ifindex)
+{
+	struct rtable	*rth, *cand;
+	struct rtable __rcu **rthp, **candp;
+	unsigned long	now;
+	u32 		min_score;
+	int		chain_length;
+	/* One garbage-collect-and-retry is allowed outside softirq, none inside. */
+	int attempts = !in_softirq();
+
+restart:
+	chain_length = 0;
+	min_score = ~(u32)0;
+	cand = NULL;
+	candp = NULL;
+	now = jiffies;
+
+	if (!rt_caching(dev_net(rt->dst.dev))) {
+		/*
+		 * If we're not caching, just tell the caller we
+		 * were successful and don't touch the route.  The
+		 * caller holds the sole reference to the cache entry, and
+		 * it will be released when the caller is done with it.
+		 * If we dropped it here, the callers would have no way to
+		 * resolve routes when we're not caching.  Instead, just
+		 * return rt, so the caller gets a single use out of the
+		 * route.
+		 * Note: To avoid expensive rcu stuff for this uncached dst,
+		 * we set DST_NOCACHE so that dst_release() can free dst without
+		 * waiting a grace period.
+		 */
+
+		rt->dst.flags |= DST_NOCACHE;
+		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
+			int err = arp_bind_neighbour(&rt->dst);
+			if (err) {
+				if (net_ratelimit())
+					printk(KERN_WARNING
+					    "Neighbour table failure & not caching routes.\n");
+				ip_rt_put(rt);
+				return ERR_PTR(err);
+			}
+		}
+
+		goto skip_hashing;
+	}
+
+	rthp = &rt_hash_table[hash].chain;
+
+	spin_lock_bh(rt_hash_lock_addr(hash));
+	while ((rth = rcu_dereference_protected(*rthp,
+			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
+		if (rt_is_expired(rth)) {
+			*rthp = rth->dst.rt_next;
+			rt_free(rth);
+			continue;
+		}
+		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
+			/* Put it first */
+			*rthp = rth->dst.rt_next;
+			/*
+			 * Since lookup is lockfree, the deletion
+			 * must be visible to another weakly ordered CPU before
+			 * the insertion at the start of the hash chain.
+			 */
+			rcu_assign_pointer(rth->dst.rt_next,
+					   rt_hash_table[hash].chain);
+			/*
+			 * Since lookup is lockfree, the update writes
+			 * must be ordered for consistency on SMP.
+			 */
+			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+
+			dst_use(&rth->dst, now);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
+
+			/* The cached entry wins; the caller's new route is discarded. */
+			rt_drop(rt);
+			if (skb)
+				skb_dst_set(skb, &rth->dst);
+			return rth;
+		}
+
+		if (!atomic_read(&rth->dst.__refcnt)) {
+			u32 score = rt_score(rth);
+
+			/* "<=" means the last of equally scored entries is chosen. */
+			if (score <= min_score) {
+				cand = rth;
+				candp = rthp;
+				min_score = score;
+			}
+		}
+
+		chain_length++;
+
+		rthp = &rth->dst.rt_next;
+	}
+
+	if (cand) {
+		/* ip_rt_gc_elasticity used to be average length of chain
+		 * length, when exceeded gc becomes really aggressive.
+		 *
+		 * The second limit is less certain. At the moment it allows
+		 * only 2 entries per bucket. We will see.
+		 */
+		if (chain_length > ip_rt_gc_elasticity) {
+			*candp = cand->dst.rt_next;
+			rt_free(cand);
+		}
+	} else {
+		/*
+		 * Nothing can be evicted.  If the chain is too long even
+		 * when aliases are counted once, force a rebuild and retry
+		 * with the hash computed from the new genid.
+		 */
+		if (chain_length > rt_chain_length_max &&
+		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
+			struct net *net = dev_net(rt->dst.dev);
+			int num = ++net->ipv4.current_rt_cache_rebuild_count;
+			if (!rt_caching(net)) {
+				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+					rt->dst.dev->name, num);
+			}
+			rt_emergency_hash_rebuild(net);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
+
+			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
+					ifindex, rt_genid(net));
+			goto restart;
+		}
+	}
+
+	/* Try to bind route to arp only if it is output
+	   route or unicast forwarding path.
+	 */
+	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
+		int err = arp_bind_neighbour(&rt->dst);
+		if (err) {
+			spin_unlock_bh(rt_hash_lock_addr(hash));
+
+			if (err != -ENOBUFS) {
+				rt_drop(rt);
+				return ERR_PTR(err);
+			}
+
+			/* Neighbour tables are full and nothing
+			   can be released. Try to shrink route cache,
+			   it is most likely it holds some neighbour records.
+			 */
+			if (attempts-- > 0) {
+				/*
+				 * Temporarily override the GC tunables to
+				 * force an aggressive, non-rate-limited run,
+				 * then restore them.
+				 */
+				int saved_elasticity = ip_rt_gc_elasticity;
+				int saved_int = ip_rt_gc_min_interval;
+				ip_rt_gc_elasticity	= 1;
+				ip_rt_gc_min_interval	= 0;
+				rt_garbage_collect(&ipv4_dst_ops);
+				ip_rt_gc_min_interval	= saved_int;
+				ip_rt_gc_elasticity	= saved_elasticity;
+				goto restart;
+			}
+
+			if (net_ratelimit())
+				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
+			rt_drop(rt);
+			return ERR_PTR(-ENOBUFS);
+		}
+	}
+
+	rt->dst.rt_next = rt_hash_table[hash].chain;
+
+	/*
+	 * Since lookup is lockfree, we must make sure
+	 * previous writes to rt are committed to memory
+	 * before making rt visible to other CPUS.
+	 */
+	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+
+	spin_unlock_bh(rt_hash_lock_addr(hash));
+
+skip_hashing:
+	if (skb)
+		skb_dst_set(skb, &rt->dst);
+	return rt;
+}
+
+/*
+ * Generation counter for peer-derived route state.  In this file it is
+ * incremented when a redirect or PMTU is learned; routes remember the
+ * value they last synchronised with in rt->rt_peer_genid.
+ */
+static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
+
+/* Snapshot of the current peer generation id. */
+static u32 rt_peer_genid(void)
+{
+	return atomic_read(&__rt_peer_genid);
+}
+
+/*
+ * Attach the inet_peer for @daddr to @rt if the route has none yet.
+ *
+ * @create is passed through to inet_getpeer_v4().
+ *
+ * The peer is installed with cmpxchg(), so when another CPU attached
+ * one first the reference obtained here is dropped again.  In every
+ * other case - including a failed peer lookup - rt->rt_peer_genid is
+ * refreshed to the current generation.
+ */
+void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
+{
+	struct inet_peer *peer;
+
+	peer = inet_getpeer_v4(daddr, create);
+
+	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
+		inet_putpeer(peer);
+	else
+		rt->rt_peer_genid = rt_peer_genid();
+}
+
+/*
+ * Fallback IP ID selection, used when no inet_peer is available.
+ *
+ * Peer allocation may fail only in serious out-of-memory conditions.  However
+ * we can still generate some output.
+ * Random ID selection looks a bit dangerous because we have no chance of
+ * selecting an ID that is unique over a reasonable period of time.
+ * But a broken packet identifier may be better than no packet at all.
+ *
+ * The ID is the low 16 bits of secure_ip_id() applied to the previous
+ * result XOR the destination address; the running value is kept in a
+ * function-static variable protected by a spinlock.
+ */
+static void ip_select_fb_ident(struct iphdr *iph)
+{
+	static DEFINE_SPINLOCK(ip_fb_id_lock);
+	static u32 ip_fallback_id;
+	u32 salt;
+
+	spin_lock_bh(&ip_fb_id_lock);
+	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
+	iph->id = htons(salt & 0xFFFF);
+	ip_fallback_id = salt;
+	spin_unlock_bh(&ip_fb_id_lock);
+}
+
+/*
+ * Fill in iph->id for an outgoing packet.
+ *
+ * @dst is treated as a struct rtable.  If it has (or can be given) an
+ * inet_peer, the ID comes from inet_getid(peer, @more); otherwise, or
+ * when @dst is NULL (which is also logged), ip_select_fb_ident() is
+ * used.
+ */
+void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+{
+	struct rtable *rt = (struct rtable *) dst;
+
+	if (rt) {
+		if (rt->peer == NULL)
+			rt_bind_peer(rt, rt->rt_dst, 1);
+
+		/* If peer is attached to destination, it is never detached,
+		   so that we need not to grab a lock to dereference it.
+		 */
+		if (rt->peer) {
+			iph->id = htons(inet_getid(rt->peer, more));
+			return;
+		}
+	} else
+		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
+		       __builtin_return_address(0));
+
+	ip_select_fb_ident(iph);
+}
+EXPORT_SYMBOL(__ip_select_ident);
+
+/*
+ * Remove @rt from hash bucket @hash.
+ *
+ * The caller's reference on @rt is released with ip_rt_put() first.
+ * While walking the chain under the bucket lock, @rt itself and any
+ * entry that rt_is_expired() are unlinked and freed.
+ */
+static void rt_del(unsigned hash, struct rtable *rt)
+{
+	struct rtable __rcu **rthp;
+	struct rtable *aux;
+
+	rthp = &rt_hash_table[hash].chain;
+	spin_lock_bh(rt_hash_lock_addr(hash));
+	ip_rt_put(rt);
+	while ((aux = rcu_dereference_protected(*rthp,
+			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
+		if (aux == rt || rt_is_expired(aux)) {
+			*rthp = aux->dst.rt_next;
+			rt_free(aux);
+			continue;
+		}
+		rthp = &aux->dst.rt_next;
+	}
+	spin_unlock_bh(rt_hash_lock_addr(hash));
+}
+
+/*
+ * Apply the redirect learned in @peer to the cached route @dst.
+ *
+ * The route's gateway is switched to peer->redirect_learned.a4 and a
+ * neighbour entry for that gateway replaces the one previously bound.
+ *
+ * Returns:
+ *   0        the new neighbour is NUD_VALID; the route is marked
+ *            RTCF_REDIRECTED and NETEVENT_NEIGH_UPDATE is raised.
+ *   -EAGAIN  the new neighbour is missing or not yet valid; resolution
+ *            is kicked off with neigh_event_send().
+ *   <0       error from __arp_bind_neighbour().
+ *
+ * On both failure paths rt_gateway is restored to its previous value,
+ * so the route never keeps a gateway that was not accepted.  On the
+ * -EAGAIN path the neighbour has already been swapped at that point.
+ */
+static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	__be32 orig_gw = rt->rt_gateway;
+	struct neighbour *n, *old_n;
+
+	dst_confirm(&rt->dst);
+
+	/* The neighbour lookup below keys on the new gateway address. */
+	rt->rt_gateway = peer->redirect_learned.a4;
+	n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway);
+	if (IS_ERR(n)) {
+		/* No neighbour was bound: undo the gateway change. */
+		rt->rt_gateway = orig_gw;
+		return PTR_ERR(n);
+	}
+	old_n = xchg(&rt->dst._neighbour, n);
+	if (old_n)
+		neigh_release(old_n);
+	if (!n || !(n->nud_state & NUD_VALID)) {
+		if (n)
+			neigh_event_send(n, NULL);
+		rt->rt_gateway = orig_gw;
+		return -EAGAIN;
+	} else {
+		rt->rt_flags |= RTCF_REDIRECTED;
+		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
+	}
+	return 0;
+}
+
+/*
+ * Handle a received redirect.  Called in rcu_read_lock() section.
+ *
+ * @old_gw: gateway the redirect came from
+ * @daddr:  destination the redirect is about
+ * @new_gw: gateway advised for @daddr
+ * @saddr:  source address used for the cache lookup
+ * @dev:    device the redirect arrived on
+ *
+ * The redirect is rejected (and optionally logged) unless @new_gw
+ * passes the sanity checks below.  Otherwise the matching output
+ * routes are looked up for each combination of {saddr, 0} and
+ * {dev->ifindex, 0}; for each match the new gateway is recorded in the
+ * route's inet_peer and check_peer_redir() is run.
+ */
+void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
+		    __be32 saddr, struct net_device *dev)
+{
+	int s, i;
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	__be32 skeys[2] = { saddr, 0 };
+	int    ikeys[2] = { dev->ifindex, 0 };
+	struct inet_peer *peer;
+	struct net *net;
+
+	if (!in_dev)
+		return;
+
+	net = dev_net(dev);
+	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+	    ipv4_is_zeronet(new_gw))
+		goto reject_redirect;
+
+	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+			goto reject_redirect;
+		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+			goto reject_redirect;
+	} else {
+		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
+			goto reject_redirect;
+	}
+
+	for (s = 0; s < 2; s++) {
+		for (i = 0; i < 2; i++) {
+			unsigned int hash;
+			struct rtable __rcu **rthp;
+			struct rtable *rt;
+
+			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
+
+			rthp = &rt_hash_table[hash].chain;
+
+			/* Lockless walk; nothing is unlinked from the chain here. */
+			while ((rt = rcu_dereference(*rthp)) != NULL) {
+				rthp = &rt->dst.rt_next;
+
+				if (rt->rt_key_dst != daddr ||
+				    rt->rt_key_src != skeys[s] ||
+				    rt->rt_oif != ikeys[i] ||
+				    rt_is_input_route(rt) ||
+				    rt_is_expired(rt) ||
+				    !net_eq(dev_net(rt->dst.dev), net) ||
+				    rt->dst.error ||
+				    rt->dst.dev != dev ||
+				    rt->rt_gateway != old_gw)
+					continue;
+
+				if (!rt->peer)
+					rt_bind_peer(rt, rt->rt_dst, 1);
+
+				peer = rt->peer;
+				if (peer) {
+					if (peer->redirect_learned.a4 != new_gw) {
+						peer->redirect_learned.a4 = new_gw;
+						/* Makes routes re-check their peer in ipv4_dst_check(). */
+						atomic_inc(&__rt_peer_genid);
+					}
+					/* Return value deliberately unused here. */
+					check_peer_redir(&rt->dst, peer);
+				}
+			}
+		}
+	}
+	return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
+			"  Advised path = %pI4 -> %pI4\n",
+		       &old_gw, dev->name, &new_gw,
+		       &saddr, &daddr);
+#endif
+	;
+}
+
+/*
+ * Returns true only if @peer has a learned PMTU whose expiry time has
+ * been reached AND this caller is the one that reset pmtu_expires to 0;
+ * the cmpxchg() ensures a single winner among concurrent callers.
+ */
+static bool peer_pmtu_expired(struct inet_peer *peer)
+{
+	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
+
+	return orig &&
+	       time_after_eq(jiffies, orig) &&
+	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
+}
+
+/*
+ * Unconditionally clear a pending PMTU expiry on @peer.  Returns true
+ * if pmtu_expires was non-zero and this caller reset it to 0.
+ */
+static bool peer_pmtu_cleaned(struct inet_peer *peer)
+{
+	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
+
+	return orig &&
+	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
+}
+
+/*
+ * React to a report that @dst is not working well.
+ *
+ * Returns NULL when the caller must stop using @dst: either the entry
+ * is obsolete (its reference is released) or it was created by a
+ * redirect (it is removed from the cache via rt_del()).  Otherwise @dst
+ * is returned unchanged, after restoring the original MTU if the
+ * peer's learned PMTU has just expired.  A NULL @dst is returned as is.
+ */
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{
+	struct rtable *rt = (struct rtable *)dst;
+
+	if (!rt)
+		return dst;
+
+	if (dst->obsolete > 0) {
+		ip_rt_put(rt);
+		return NULL;
+	}
+
+	if (rt->rt_flags & RTCF_REDIRECTED) {
+		unsigned bucket = rt_hash(rt->rt_key_dst, rt->rt_key_src,
+					  rt->rt_oif,
+					  rt_genid(dev_net(dst->dev)));
+
+		rt_del(bucket, rt);
+		return NULL;
+	}
+
+	if (rt->peer && peer_pmtu_expired(rt->peer))
+		dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
+
+	return dst;
+}
+
+/*
+ * Algorithm:
+ *	1. The first ip_rt_redirect_number redirects are sent
+ *	   with exponential backoff, then we stop sending them at all,
+ *	   assuming that the host ignores our redirects.
+ *	2. If we did not see packets requiring redirects
+ *	   during ip_rt_redirect_silence, we assume that the host
+ *	   forgot redirected route and start to send redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ *
+ * The per-destination state (rate_tokens = redirects sent so far,
+ * rate_last = time of the last event) lives in the route's inet_peer.
+ * If no peer can be obtained the redirect is sent without rate limiting.
+ */
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct in_device *in_dev;
+	struct inet_peer *peer;
+	int log_martians;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
+	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
+		rcu_read_unlock();
+		return;
+	}
+	/* Sampled here because in_dev is not used after rcu_read_unlock(). */
+	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+	rcu_read_unlock();
+
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+	peer = rt->peer;
+	if (!peer) {
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		return;
+	}
+
+	/* No redirected packets during ip_rt_redirect_silence;
+	 * reset the algorithm.
+	 */
+	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+		peer->rate_tokens = 0;
+
+	/* Too many ignored redirects; do not send anything
+	 * set dst.rate_last to the last seen redirected packet.
+	 */
+	if (peer->rate_tokens >= ip_rt_redirect_number) {
+		peer->rate_last = jiffies;
+		return;
+	}
+
+	/* Check for load limit; set rate_last to the latest sent
+	 * redirect.
+	 */
+	if (peer->rate_tokens == 0 ||
+	    time_after(jiffies,
+		       (peer->rate_last +
+			(ip_rt_redirect_load << peer->rate_tokens)))) {
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		peer->rate_last = jiffies;
+		++peer->rate_tokens;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+		if (log_martians &&
+		    peer->rate_tokens == ip_rt_redirect_number &&
+		    net_ratelimit())
+			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
+			       &ip_hdr(skb)->saddr, rt->rt_iif,
+				&rt->rt_dst, &rt->rt_gateway);
+#endif
+	}
+}
+
+/*
+ * Handler for packets that hit an error route.
+ *
+ * Maps rt->dst.error to an ICMP destination-unreachable code, sends the
+ * ICMP message subject to a per-peer token bucket, and always frees
+ * @skb.  EINVAL and unknown errors are dropped silently.
+ *
+ * Token bucket: one token accrues per jiffy since rate_last, capped at
+ * ip_rt_error_burst; each message costs ip_rt_error_cost tokens.  When
+ * no inet_peer is available the message is sent unconditionally.
+ *
+ * Always returns 0.
+ */
+static int ip_error(struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct inet_peer *peer;
+	unsigned long now;
+	bool send;
+	int code;
+
+	switch (rt->dst.error) {
+		case EINVAL:
+		default:
+			goto out;
+		case EHOSTUNREACH:
+			code = ICMP_HOST_UNREACH;
+			break;
+		case ENETUNREACH:
+			code = ICMP_NET_UNREACH;
+			IP_INC_STATS_BH(dev_net(rt->dst.dev),
+					IPSTATS_MIB_INNOROUTES);
+			break;
+		case EACCES:
+			code = ICMP_PKT_FILTERED;
+			break;
+	}
+
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+	peer = rt->peer;
+
+	send = true;
+	if (peer) {
+		now = jiffies;
+		peer->rate_tokens += now - peer->rate_last;
+		if (peer->rate_tokens > ip_rt_error_burst)
+			peer->rate_tokens = ip_rt_error_burst;
+		peer->rate_last = now;
+		if (peer->rate_tokens >= ip_rt_error_cost)
+			peer->rate_tokens -= ip_rt_error_cost;
+		else
+			send = false;
+	}
+	if (send)
+		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+
+out:	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ *	MTU plateau table, in strictly descending order; guess_mtu()
+ *	relies on that order.
+ *
+ *	The last two values are not from the RFC but
+ *	are needed for AMPRnet AX.25 paths.
+ */
+
+static const unsigned short mtu_plateau[] =
+{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+
+/*
+ * Return the largest mtu_plateau[] value that is strictly smaller than
+ * @old_mtu, or 68 when @old_mtu does not exceed any table entry.
+ */
+static inline unsigned short guess_mtu(unsigned short old_mtu)
+{
+	const unsigned short *plateau = mtu_plateau;
+	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);
+
+	while (plateau < end) {
+		if (old_mtu > *plateau)
+			return *plateau;
+		plateau++;
+	}
+
+	return 68;
+}
+
+/*
+ * Record a path MTU reported for iph->daddr.
+ *
+ * @net and @dev are not referenced in this function.
+ * @iph:     IP header the report refers to; tot_len is taken as the
+ *           old MTU
+ * @new_mtu: MTU reported; values below 68 or not smaller than the old
+ *           MTU are replaced by a guess from the plateau table
+ *
+ * The resulting MTU (never below ip_rt_min_pmtu) is stored in the
+ * destination's inet_peer if no PMTU is currently pending or the new
+ * one is smaller, with an expiry of ip_rt_mtu_expires from now.
+ *
+ * Returns the MTU that was stored, or @new_mtu if nothing was stored.
+ */
+unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
+				 unsigned short new_mtu,
+				 struct net_device *dev)
+{
+	unsigned short old_mtu = ntohs(iph->tot_len);
+	unsigned short est_mtu = 0;
+	struct inet_peer *peer;
+
+	peer = inet_getpeer_v4(iph->daddr, 1);
+	if (peer) {
+		unsigned short mtu = new_mtu;
+
+		if (new_mtu < 68 || new_mtu >= old_mtu) {
+			/* BSD 4.2 derived systems incorrectly adjust
+			 * tot_len by the IP header length, and report
+			 * a zero MTU in the ICMP message.
+			 */
+			if (mtu == 0 &&
+			    old_mtu >= 68 + (iph->ihl << 2))
+				old_mtu -= iph->ihl << 2;
+			mtu = guess_mtu(old_mtu);
+		}
+
+		if (mtu < ip_rt_min_pmtu)
+			mtu = ip_rt_min_pmtu;
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			unsigned long pmtu_expires;
+
+			pmtu_expires = jiffies + ip_rt_mtu_expires;
+			/* 0 means "no PMTU pending", so avoid it as an expiry time. */
+			if (!pmtu_expires)
+				pmtu_expires = 1UL;
+
+			est_mtu = mtu;
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = pmtu_expires;
+		}
+
+		inet_putpeer(peer);
+
+		atomic_inc(&__rt_peer_genid);
+	}
+	return est_mtu ? : new_mtu;
+}
+
+/*
+ * Synchronise the MTU metric of @dst with the PMTU state in @peer.
+ *
+ * - No PMTU pending (pmtu_expires == 0): nothing to do.
+ * - PMTU still valid: if the learned value is below the current dst
+ *   MTU, remember the original raw metric in pmtu_orig (first time
+ *   only) and lower the metric to pmtu_learned.
+ * - PMTU expired: the caller that wins the cmpxchg() resetting
+ *   pmtu_expires restores the metric from pmtu_orig.
+ */
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
+
+	if (!expires)
+		return;
+	if (time_before(jiffies, expires)) {
+		u32 orig_dst_mtu = dst_mtu(dst);
+		if (peer->pmtu_learned < orig_dst_mtu) {
+			if (!peer->pmtu_orig)
+				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+		}
+	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
+/*
+ * Record a new path MTU for the route @dst.
+ *
+ * @mtu is raised to ip_rt_min_pmtu if smaller.  It is stored in the
+ * route's inet_peer when no PMTU is pending or when it is smaller than
+ * the one already learned; in that case the global peer generation is
+ * bumped and this route is marked as up to date with it.  Finally the
+ * dst MTU metric is synchronised through check_peer_pmtu().
+ *
+ * If no inet_peer can be bound, only dst_confirm() has any effect.
+ */
+static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+
+	dst_confirm(dst);
+
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+	peer = rt->peer;
+	if (peer) {
+		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
+
+		if (mtu < ip_rt_min_pmtu)
+			mtu = ip_rt_min_pmtu;
+		if (!pmtu_expires || mtu < peer->pmtu_learned) {
+
+			pmtu_expires = jiffies + ip_rt_mtu_expires;
+			/* 0 means "no PMTU pending", so avoid it as an expiry time. */
+			if (!pmtu_expires)
+				pmtu_expires = 1UL;
+
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = pmtu_expires;
+
+			atomic_inc(&__rt_peer_genid);
+			rt->rt_peer_genid = rt_peer_genid();
+		}
+		check_peer_pmtu(dst, peer);
+	}
+}
+
+/*
+ * Validate a cached route before reuse.  @cookie is not used.
+ *
+ * Returns NULL if the route's generation id is stale, or if a learned
+ * redirect could not be applied by check_peer_redir(); otherwise @dst.
+ *
+ * When the route is behind the global peer generation, its inet_peer
+ * (looked up without creating one) is consulted to refresh the PMTU
+ * metric and to apply a learned redirect that differs from the current
+ * gateway.
+ */
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	struct rtable *rt = (struct rtable *) dst;
+
+	if (rt_is_expired(rt))
+		return NULL;
+	if (rt->rt_peer_genid != rt_peer_genid()) {
+		struct inet_peer *peer;
+
+		if (!rt->peer)
+			rt_bind_peer(rt, rt->rt_dst, 0);
+
+		peer = rt->peer;
+		if (peer) {
+			check_peer_pmtu(dst, peer);
+
+			if (peer->redirect_learned.a4 &&
+			    peer->redirect_learned.a4 != rt->rt_gateway) {
+				if (check_peer_redir(dst, peer))
+					return NULL;
+			}
+		}
+
+		rt->rt_peer_genid = rt_peer_genid();
+	}
+	return dst;
+}
+
+/*
+ * Release what a route holds besides the dst itself: the fib_info
+ * reference in rt->fi and the inet_peer reference in rt->peer.  Both
+ * pointers are cleared.
+ */
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer = rt->peer;
+
+	if (rt->fi) {
+		fib_info_put(rt->fi);
+		rt->fi = NULL;
+	}
+	if (peer) {
+		rt->peer = NULL;
+		inet_putpeer(peer);
+	}
+}
+
+
+/*
+ * Link failure notification for @skb: send an ICMP host-unreachable
+ * and, if the packet's route has a peer with a pending learned PMTU,
+ * discard that PMTU and restore the original MTU metric.
+ */
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+	struct rtable *rt;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+	rt = skb_rtable(skb);
+	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
+		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+}
+
+/*
+ * Handler that is not expected to run: logs the packet's addresses and
+ * device, frees @skb, and triggers WARN_ON(1).  Always returns 0.
+ */
+static int ip_rt_bug(struct sk_buff *skb)
+{
+	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
+		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+		skb->dev ? skb->dev->name : "?");
+	kfree_skb(skb);
+	WARN_ON(1);
+	return 0;
+}
+
+/*
+   We do not cache the source address of the outgoing interface,
+   because it is used only by IP RR, TS and SRR options,
+   so it is out of the fast path.
+
+   BTW remember: "addr" is allowed to be not aligned
+   in IP options!
+
+   For an output route the source is taken from the packet's IP header.
+   Otherwise a FIB lookup keyed on the packet is done and its preferred
+   source is used; if the lookup fails, inet_select_addr() picks an
+   address on the route's device.  The 4-byte result is stored with
+   memcpy() because of the alignment caveat above.
+ */
+
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
+{
+	__be32 src;
+
+	if (rt_is_output_route(rt))
+		src = ip_hdr(skb)->saddr;
+	else {
+		struct fib_result res;
+		struct flowi4 fl4;
+		struct iphdr *iph;
+
+		iph = ip_hdr(skb);
+
+		memset(&fl4, 0, sizeof(fl4));
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_oif = rt->dst.dev->ifindex;
+		fl4.flowi4_iif = skb->dev->ifindex;
+		fl4.flowi4_mark = skb->mark;
+
+		rcu_read_lock();
+		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+		else
+			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
+					RT_SCOPE_UNIVERSE);
+		rcu_read_unlock();
+	}
+	memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+	if (!(rt->dst.tclassid & 0xFFFF))	/* low 16 bits still zero */
+		rt->dst.tclassid |= tag & 0xFFFF;
+	if (!(rt->dst.tclassid & 0xFFFF0000))	/* high 16 bits still zero */
+		rt->dst.tclassid |= tag & 0xFFFF0000;
+}
+#endif
+
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+{
+	unsigned int mss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+	/* A non-zero ADVMSS metric on the route is used unchanged. */
+	if (mss != 0)
+		return mss;
+
+	/* Otherwise derive it: device MTU less 40, floored, then capped. */
+	mss = max_t(unsigned int, dst->dev->mtu - 40, ip_rt_min_advmss);
+	return mss > 65535 - 40 ? 65535 - 40 : mss;
+}
+
+static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
+{
+	const struct rtable *route = (const struct rtable *) dst;
+	unsigned int dev_mtu = dst->dev->mtu;
+
+	/*
+	 * With a locked MTU metric, a route whose gateway differs from
+	 * its destination starts out at 576 at most.
+	 */
+	if (unlikely(dst_metric_locked(dst, RTAX_MTU)) &&
+	    route->rt_gateway != route->rt_dst && dev_mtu > 576)
+		dev_mtu = 576;
+
+	/* Whatever the device reports, never exceed IP_MAX_MTU. */
+	return dev_mtu > IP_MAX_MTU ? IP_MAX_MTU : dev_mtu;
+}
+
+static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
+			    struct fib_info *fi)
+{
+	struct inet_peer *peer;
+	int create = 0;
+
+	/* If a peer entry exists for this destination, we must hook
+	 * it up in order to get at cached metrics.
+	 */
+	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
+		create = 1;
+	/* 'create' is handed to inet_getpeer_v4(); set only for PRECOW flows. */
+	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
+	if (peer) {
+		rt->rt_peer_genid = rt_peer_genid();
+		if (inet_metrics_new(peer))
+			memcpy(peer->metrics, fi->fib_metrics,
+			       sizeof(u32) * RTAX_MAX);
+		dst_init_metrics(&rt->dst, peer->metrics, false);
+		/* Apply what the peer carries: its PMTU and a learned redirect. */
+		check_peer_pmtu(&rt->dst, peer);
+		if (peer->redirect_learned.a4 &&
+		    peer->redirect_learned.a4 != rt->rt_gateway) {
+			rt->rt_gateway = peer->redirect_learned.a4;
+			rt->rt_flags |= RTCF_REDIRECTED;
+		}
+	} else {
+		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+			rt->fi = fi;	/* presumably paired with fib_info_put() */
+			atomic_inc(&fi->fib_clntref);
+		}
+		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+	}
+}
+
+static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
+			   const struct fib_result *res,
+			   struct fib_info *fi, u16 type, u32 itag)
+{
+	struct dst_entry *dst = &rt->dst;
+
+	if (fi) {	/* note: the 'type' argument is not used in this body */
+		if (FIB_RES_GW(*res) &&
+		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+			rt->rt_gateway = FIB_RES_GW(*res);
+		rt_init_metrics(rt, fl4, fi);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
+#endif
+	}
+	/* Cap the MTU at IP_MAX_MTU and the ADVMSS metric at 65535 - 40. */
+	if (dst_mtu(dst) > IP_MAX_MTU)
+		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
+	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
+		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
+	/* Fill whichever 16-bit halves of tclassid are still zero. */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	set_class_tag(rt, fib_rules_tclass(res));
+#endif
+	set_class_tag(rt, itag);
+#endif
+}
+
+static struct rtable *rt_dst_alloc(struct net_device *dev,
+				   bool nopolicy, bool noxfrm)
+{
+	int dst_flags = DST_HOST;	/* always set; the other two are opt-in */
+
+	dst_flags |= (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0);
+	return dst_alloc(&ipv4_dst_ops, dev, 1, -1, dst_flags);
+}
+
+/*
+ * Build and cache an input route for a multicast packet received on @dev.
+ * A non-zero @our selects local delivery (ip_local_deliver, RTCF_LOCAL).
+ * Returns 0 or a negative errno.  Called in rcu_read_lock() section.
+ */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+				u8 tos, struct net_device *dev, int our)
+{
+	unsigned int hash;
+	struct rtable *rth;
+	__be32 spec_dst;
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	u32 itag = 0;
+	int err;
+
+	/* Primary sanity checks. */
+
+	if (in_dev == NULL)
+		return -EINVAL;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
+		return -EINVAL;
+
+	if (ipv4_is_zeronet(saddr)) {
+		if (!ipv4_is_local_multicast(daddr))
+			return -EINVAL;
+		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+	} else {
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag);
+		if (err < 0)
+			return err;
+	}
+	/* Use the loopback device of the receiving device's namespace, not
+	 * init_net's: the cache lookup matches on dev_net(rth->dst.dev).
+	 */
+	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+	if (!rth)
+		return -ENOBUFS;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	rth->dst.tclassid = itag;
+#endif
+	rth->dst.output = ip_rt_bug;
+
+	rth->rt_key_dst	= daddr;
+	rth->rt_key_src	= saddr;
+	rth->rt_genid	= rt_genid(dev_net(dev));
+	rth->rt_flags	= RTCF_MULTICAST;
+	rth->rt_type	= RTN_MULTICAST;
+	rth->rt_key_tos	= tos;
+	rth->rt_dst	= daddr;
+	rth->rt_src	= saddr;
+	rth->rt_route_iif = dev->ifindex;
+	rth->rt_iif	= dev->ifindex;
+	rth->rt_oif	= 0;
+	rth->rt_mark    = skb->mark;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->rt_peer_genid = 0;
+	rth->peer = NULL;
+	rth->fi = NULL;
+	if (our) {
+		rth->dst.input= ip_local_deliver;
+		rth->rt_flags |= RTCF_LOCAL;
+	}
+
+#ifdef CONFIG_IP_MROUTE
+	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
+		rth->dst.input = ip_mr_input;
+#endif
+	RT_CACHE_STAT_INC(in_slow_mc);
+
+	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
+	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
+	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+}
+
+/* Count a martian-source packet and, when configured, log it rate-limited. */
+static void ip_handle_martian_source(struct net_device *dev,
+				     struct in_device *in_dev,
+				     struct sk_buff *skb,
+				     __be32 daddr,
+				     __be32 saddr)
+{
+	RT_CACHE_STAT_INC(in_martian_src);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+		/*
+		 *	RFC1812 recommendation: if the source is martian, the
+		 *	only hint is the MAC header.  daddr is printed first.
+		 */
+		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
+			&daddr, &saddr, dev->name);
+		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
+			int i;
+			const unsigned char *p = skb_mac_header(skb);
+			printk(KERN_WARNING "ll header: ");
+			for (i = 0; i < dev->hard_header_len; i++, p++) {
+				printk("%02x", *p);
+				if (i < (dev->hard_header_len - 1))
+					printk(":");
+			}
+			printk("\n");
+		}
+	}
+#endif
+}
+
+/* Build a forwarding route into *result; called in rcu_read_lock() section */
+static int __mkroute_input(struct sk_buff *skb,
+			   const struct fib_result *res,
+			   struct in_device *in_dev,
+			   __be32 daddr, __be32 saddr, u32 tos,
+			   struct rtable **result)
+{
+	struct rtable *rth;
+	int err;
+	struct in_device *out_dev;
+	unsigned int flags = 0;
+	__be32 spec_dst;
+	u32 itag;
+
+	/* get a working reference to the output device */
+	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
+	if (out_dev == NULL) {
+		if (net_ratelimit())
+			printk(KERN_CRIT "Bug in ip_route_input" \
+			       "_slow(). Please, report\n");
+		return -EINVAL;
+	}
+
+	/* Validate the source; a negative result means it is martian. */
+	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
+				  in_dev->dev, &spec_dst, &itag);
+	if (err < 0) {
+		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
+					 saddr);
+
+		goto cleanup;
+	}
+
+	if (err)
+		flags |= RTCF_DIRECTSRC;
+	/* In and out on the same device, source on-link: flag DOREDIRECT. */
+	if (out_dev == in_dev && err &&
+	    (IN_DEV_SHARED_MEDIA(out_dev) ||
+	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+		flags |= RTCF_DOREDIRECT;
+
+	if (skb->protocol != htons(ETH_P_IP)) {
+		/* Not IP (i.e. ARP). Do not create route, if it is
+		 * invalid for proxy arp. DNAT routes are always valid.
+		 *
+		 * Proxy arp feature have been extended to allow, ARP
+		 * replies back to the same interface, to support
+		 * Private VLAN switch technologies. See arp.c.
+		 */
+		if (out_dev == in_dev &&
+		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+	}
+
+	rth = rt_dst_alloc(out_dev->dev,
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
+			   IN_DEV_CONF_GET(out_dev, NOXFRM));
+	if (!rth) {
+		err = -ENOBUFS;
+		goto cleanup;
+	}
+
+	rth->rt_key_dst	= daddr;
+	rth->rt_key_src	= saddr;
+	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
+	rth->rt_flags = flags;
+	rth->rt_type = res->type;
+	rth->rt_key_tos	= tos;
+	rth->rt_dst	= daddr;
+	rth->rt_src	= saddr;
+	rth->rt_route_iif = in_dev->dev->ifindex;
+	rth->rt_iif 	= in_dev->dev->ifindex;
+	rth->rt_oif 	= 0;
+	rth->rt_mark    = skb->mark;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->rt_peer_genid = 0;
+	rth->peer = NULL;
+	rth->fi = NULL;
+	/* Handlers for a forwarded packet: ip_forward in, ip_output out. */
+	rth->dst.input = ip_forward;
+	rth->dst.output = ip_output;
+
+	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
+
+	*result = rth;
+	err = 0;
+ cleanup:
+	return err;
+}
+
+static int ip_mkroute_input(struct sk_buff *skb,
+			    struct fib_result *res,
+			    const struct flowi4 *fl4,
+			    struct in_device *in_dev,
+			    __be32 daddr, __be32 saddr, u32 tos)
+{
+	struct rtable *route = NULL;
+	unsigned bucket;
+	int rc;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	/* More than one next hop: have one selected for this lookup. */
+	if (res->fi && res->fi->fib_nhs > 1)
+		fib_select_multipath(res);
+#endif
+
+	/* Build the cache entry; any error is passed straight back. */
+	rc = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &route);
+	if (rc)
+		return rc;
+
+	/* Insert it into the cache, keyed by the flow's input interface. */
+	bucket = rt_hash(daddr, saddr, fl4->flowi4_iif,
+			 rt_genid(dev_net(route->dst.dev)));
+	route = rt_intern_hash(bucket, route, skb, fl4->flowi4_iif);
+
+	return IS_ERR(route) ? PTR_ERR(route) : 0;
+}
+
+/*
+ *	NOTE. We drop all packets that have a local source address,
+ *	because every properly looped-back packet must already have the
+ *	correct destination attached by the output routine.
+ *
+ *	This approach solves two big problems:
+ *	1. Non-simplex devices are handled properly.
+ *	2. IP spoofing attempts are filtered with a 100% guarantee.
+ *	Called with rcu_read_lock() held.
+ */
+
+static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			       u8 tos, struct net_device *dev)
+{
+	struct fib_result res;
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct flowi4	fl4;
+	unsigned	flags = 0;
+	u32		itag = 0;
+	struct rtable * rth;
+	unsigned	hash;
+	__be32		spec_dst;
+	int		err = -EINVAL;
+	struct net    * net = dev_net(dev);
+
+	/* IP on this device is disabled. */
+
+	if (!in_dev)
+		goto out;
+
+	/* Check for the most weird martians, which can be not detected
+	   by fib_lookup.
+	 */
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_loopback(saddr))
+		goto martian_source;
+
+	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
+		goto brd_input;
+
+	/* Accept zero addresses only to limited broadcast;
+	 * I even do not know to fix it or not. Waiting for complains :-)
+	 */
+	if (ipv4_is_zeronet(saddr))
+		goto martian_source;
+
+	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
+		goto martian_destination;
+
+	/*
+	 *	Now we are ready to route packet.
+	 */
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = dev->ifindex;
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.daddr = daddr;
+	fl4.saddr = saddr;
+	err = fib_lookup(net, &fl4, &res);
+	if (err != 0) {
+		if (!IN_DEV_FORWARD(in_dev))
+			goto e_hostunreach;
+		goto no_route;
+	}
+
+	RT_CACHE_STAT_INC(in_slow_tot);
+
+	if (res.type == RTN_BROADCAST)
+		goto brd_input;
+	/* Local destination: validate the source, then deliver locally. */
+	if (res.type == RTN_LOCAL) {
+		err = fib_validate_source(skb, saddr, daddr, tos,
+					  net->loopback_dev->ifindex,
+					  dev, &spec_dst, &itag);
+		if (err < 0)
+			goto martian_source_keep_err;
+		if (err)
+			flags |= RTCF_DIRECTSRC;
+		spec_dst = daddr;
+		goto local_input;
+	}
+	/* Anything else is forwarded, which requires forwarding on in_dev. */
+	if (!IN_DEV_FORWARD(in_dev))
+		goto e_hostunreach;
+	if (res.type != RTN_UNICAST)
+		goto martian_destination;
+
+	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
+out:	return err;
+
+brd_input:
+	if (skb->protocol != htons(ETH_P_IP))
+		goto e_inval;
+
+	if (ipv4_is_zeronet(saddr))
+		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+	else {
+		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag);
+		if (err < 0)
+			goto martian_source_keep_err;
+		if (err)
+			flags |= RTCF_DIRECTSRC;
+	}
+	flags |= RTCF_BROADCAST;
+	res.type = RTN_BROADCAST;
+	RT_CACHE_STAT_INC(in_brd);
+
+local_input:
+	rth = rt_dst_alloc(net->loopback_dev,
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+	if (!rth)
+		goto e_nobufs;
+
+	rth->dst.input= ip_local_deliver;
+	rth->dst.output= ip_rt_bug;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	rth->dst.tclassid = itag;
+#endif
+
+	rth->rt_key_dst	= daddr;
+	rth->rt_key_src	= saddr;
+	rth->rt_genid = rt_genid(net);
+	rth->rt_flags 	= flags|RTCF_LOCAL;
+	rth->rt_type	= res.type;
+	rth->rt_key_tos	= tos;
+	rth->rt_dst	= daddr;
+	rth->rt_src	= saddr;
+	rth->rt_route_iif = dev->ifindex;
+	rth->rt_iif	= dev->ifindex;
+	rth->rt_oif	= 0;
+	rth->rt_mark    = skb->mark;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->rt_peer_genid = 0;
+	rth->peer = NULL;
+	rth->fi = NULL;
+	if (res.type == RTN_UNREACHABLE) {
+		rth->dst.input= ip_error;
+		rth->dst.error= -err;
+		rth->rt_flags 	&= ~RTCF_LOCAL;
+	}
+	/* Key the cache on dev->ifindex, not fl4.flowi4_iif: the limited
+	 * broadcast path jumps here before fl4 has been filled in.
+	 */
+	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(net));
+	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
+	err = 0;
+	if (IS_ERR(rth))
+		err = PTR_ERR(rth);
+	goto out;
+
+no_route:
+	RT_CACHE_STAT_INC(in_no_route);
+	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	res.type = RTN_UNREACHABLE;
+	if (err == -ESRCH)
+		err = -ENETUNREACH;
+	goto local_input;
+
+	/*
+	 *	Do not cache martian addresses: they should be logged (RFC1812)
+	 */
+martian_destination:
+	RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
+			&daddr, &saddr, dev->name);
+#endif
+
+e_hostunreach:
+	err = -EHOSTUNREACH;
+	goto out;
+
+e_inval:
+	err = -EINVAL;
+	goto out;
+
+e_nobufs:
+	err = -ENOBUFS;
+	goto out;
+
+martian_source:
+	err = -EINVAL;
+martian_source_keep_err:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	goto out;
+}
+
+int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			   u8 tos, struct net_device *dev, bool noref)
+{
+	struct rtable * rth;
+	unsigned	hash;
+	int iif = dev->ifindex;
+	struct net *net = dev_net(dev);
+	int res;
+
+	/* Cache lookup first; on a miss, the multicast or slow-path resolver
+	 * builds the route.  Returns 0 or a negative errno. */
+	rcu_read_lock();
+
+	if (!rt_caching(net))
+		goto skip_cache;
+	/* NOTE(review): tos is masked only when the cache is consulted. */
+	tos &= IPTOS_RT_MASK;
+	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
+	/* Walk the hash chain for an exact, unexpired match in this netns. */
+	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+	     rth = rcu_dereference(rth->dst.rt_next)) {
+		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
+		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
+		     (rth->rt_route_iif ^ iif) |
+		     (rth->rt_key_tos ^ tos)) == 0 &&
+		    rth->rt_mark == skb->mark &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
+		    !rt_is_expired(rth)) {
+			if (noref) {
+				dst_use_noref(&rth->dst, jiffies);
+				skb_dst_set_noref(skb, &rth->dst);
+			} else {
+				dst_use(&rth->dst, jiffies);
+				skb_dst_set(skb, &rth->dst);
+			}
+			RT_CACHE_STAT_INC(in_hit);
+			rcu_read_unlock();
+			return 0;
+		}
+		RT_CACHE_STAT_INC(in_hlist_search);
+	}
+
+skip_cache:
+	/* Multicast recognition logic is moved from route cache to here.
+	   The problem was that too many Ethernet cards have broken/missing
+	   hardware multicast filters :-( As result the host on multicasting
+	   network acquires a lot of useless route cache entries, sort of
+	   SDR messages from all the world. Now we try to get rid of them.
+	   Really, provided software IP multicast filter is organized
+	   reasonably (at least, hashed), it does not result in a slowdown
+	   comparing with route cache reject entries.
+	   Note, that multicast routers are not affected, because
+	   route cache entry is created eventually.
+	 */
+	if (ipv4_is_multicast(daddr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+		if (in_dev) {
+			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
+						  ip_hdr(skb)->protocol);
+			if (our
+#ifdef CONFIG_IP_MROUTE
+				||
+			    (!ipv4_is_local_multicast(daddr) &&
+			     IN_DEV_MFORWARD(in_dev))
+#endif
+			   ) {
+				res = ip_route_input_mc(skb, daddr, saddr,
+							tos, dev, our);
+				rcu_read_unlock();
+				return res;
+			}
+		}
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	rcu_read_unlock();
+	return res;
+}
+EXPORT_SYMBOL(ip_route_input_common);
+
+/* Allocate (without caching) an output route; called with rcu_read_lock() */
+static struct rtable *__mkroute_output(const struct fib_result *res,
+				       const struct flowi4 *fl4,
+				       __be32 orig_daddr, __be32 orig_saddr,
+				       int orig_oif, struct net_device *dev_out,
+				       unsigned int flags)
+{
+	struct fib_info *fi = res->fi;
+	u32 tos = RT_FL_TOS(fl4);
+	struct in_device *in_dev;
+	u16 type = res->type;
+	struct rtable *rth;
+
+	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
+		return ERR_PTR(-EINVAL);
+	/* The destination address class overrides the FIB result type. */
+	if (ipv4_is_lbcast(fl4->daddr))
+		type = RTN_BROADCAST;
+	else if (ipv4_is_multicast(fl4->daddr))
+		type = RTN_MULTICAST;
+	else if (ipv4_is_zeronet(fl4->daddr))
+		return ERR_PTR(-EINVAL);
+
+	if (dev_out->flags & IFF_LOOPBACK)
+		flags |= RTCF_LOCAL;
+
+	in_dev = __in_dev_get_rcu(dev_out);
+	if (!in_dev)
+		return ERR_PTR(-EINVAL);
+
+	if (type == RTN_BROADCAST) {
+		flags |= RTCF_BROADCAST | RTCF_LOCAL;
+		fi = NULL;
+	} else if (type == RTN_MULTICAST) {
+		flags |= RTCF_MULTICAST | RTCF_LOCAL;
+		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
+				     fl4->flowi4_proto))
+			flags &= ~RTCF_LOCAL;
+		/* If multicast route do not exist use
+		 * default one, but do not gateway in this case.
+		 * Yes, it is hack.
+		 */
+		if (fi && res->prefixlen < 4)
+			fi = NULL;
+	}
+
+	rth = rt_dst_alloc(dev_out,
+			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
+			   IN_DEV_CONF_GET(in_dev, NOXFRM));
+	if (!rth)
+		return ERR_PTR(-ENOBUFS);
+
+	rth->dst.output = ip_output;
+	/* The key fields keep the caller's original values; rt_dst/rt_src use fl4. */
+	rth->rt_key_dst	= orig_daddr;
+	rth->rt_key_src	= orig_saddr;
+	rth->rt_genid = rt_genid(dev_net(dev_out));
+	rth->rt_flags	= flags;
+	rth->rt_type	= type;
+	rth->rt_key_tos	= tos;
+	rth->rt_dst	= fl4->daddr;
+	rth->rt_src	= fl4->saddr;
+	rth->rt_route_iif = 0;
+	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
+	rth->rt_oif	= orig_oif;
+	rth->rt_mark    = fl4->flowi4_mark;
+	rth->rt_gateway = fl4->daddr;
+	rth->rt_spec_dst= fl4->saddr;
+	rth->rt_peer_genid = 0;
+	rth->peer = NULL;
+	rth->fi = NULL;
+
+	RT_CACHE_STAT_INC(out_slow_tot);
+	/* RTCF_LOCAL routes also get an input handler for local delivery. */
+	if (flags & RTCF_LOCAL) {
+		rth->dst.input = ip_local_deliver;
+		rth->rt_spec_dst = fl4->daddr;
+	}
+	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		rth->rt_spec_dst = fl4->saddr;
+		if (flags & RTCF_LOCAL &&
+		    !(dev_out->flags & IFF_LOOPBACK)) {
+			rth->dst.output = ip_mc_output;
+			RT_CACHE_STAT_INC(out_slow_mc);
+		}
+#ifdef CONFIG_IP_MROUTE
+		if (type == RTN_MULTICAST) {
+			if (IN_DEV_MFORWARD(in_dev) &&
+			    !ipv4_is_local_multicast(fl4->daddr)) {
+				rth->dst.input = ip_mr_input;
+				rth->dst.output = ip_mc_output;
+			}
+		}
+#endif
+	}
+	/* fi may have been cleared above; rt_set_nexthop() accepts NULL. */
+	rt_set_nexthop(rth, fl4, res, fi, type, 0);
+
+	return rth;
+}
+
+/*
+ * Major route resolver routine.
+ * Takes rcu_read_lock() itself around the lookup.
+ */
+
+static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+{
+	struct net_device *dev_out = NULL;
+	u32 tos	= RT_FL_TOS(fl4);
+	unsigned int flags = 0;
+	struct fib_result res;
+	struct rtable *rth;
+	__be32 orig_daddr;
+	__be32 orig_saddr;
+	int orig_oif;
+
+	res.fi		= NULL;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r		= NULL;
+#endif
+	/* fl4 is updated in place below; the cache key uses these originals. */
+	orig_daddr = fl4->daddr;
+	orig_saddr = fl4->saddr;
+	orig_oif = fl4->flowi4_oif;
+
+	fl4->flowi4_iif = net->loopback_dev->ifindex;
+	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
+	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
+			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
+
+	rcu_read_lock();
+	if (fl4->saddr) {
+		rth = ERR_PTR(-EINVAL);
+		if (ipv4_is_multicast(fl4->saddr) ||
+		    ipv4_is_lbcast(fl4->saddr) ||
+		    ipv4_is_zeronet(fl4->saddr))
+			goto out;
+
+		/* I removed check for oif == dev_out->oif here.
+		   It was wrong for two reasons:
+		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+		      is assigned to multiple interfaces.
+		   2. Moreover, we are allowed to send packets with saddr
+		      of another iface. --ANK
+		 */
+
+		if (fl4->flowi4_oif == 0 &&
+		    (ipv4_is_multicast(fl4->daddr) ||
+		     ipv4_is_lbcast(fl4->daddr))) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			dev_out = __ip_dev_find(net, fl4->saddr, false);
+			if (dev_out == NULL)
+				goto out;
+
+			/* Special hack: user can direct multicasts
+			   and limited broadcast via necessary interface
+			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+			   This hack is not just for fun, it allows
+			   vic,vat and friends to work.
+			   They bind socket to loopback, set ttl to zero
+			   and expect that it will work.
+			   From the viewpoint of routing cache they are broken,
+			   because we are not allowed to build multicast path
+			   with loopback source addr (look, routing cache
+			   cannot know, that ttl is zero, so that packet
+			   will not leave this host and route is valid).
+			   Luckily, this hack is good workaround.
+			 */
+
+			fl4->flowi4_oif = dev_out->ifindex;
+			goto make_route;
+		}
+
+		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			if (!__ip_dev_find(net, fl4->saddr, false))
+				goto out;
+		}
+	}
+
+	/* The caller named an output interface: it must exist and be up. */
+	if (fl4->flowi4_oif) {
+		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
+		rth = ERR_PTR(-ENODEV);
+		if (dev_out == NULL)
+			goto out;
+
+		/* RACE: Check return value of inet_select_addr instead. */
+		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+			rth = ERR_PTR(-ENETUNREACH);
+			goto out;
+		}
+		if (ipv4_is_local_multicast(fl4->daddr) ||
+		    ipv4_is_lbcast(fl4->daddr)) {
+			if (!fl4->saddr)
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_LINK);
+			goto make_route;
+		}
+		if (!fl4->saddr) {	/* no source given: pick one */
+			if (ipv4_is_multicast(fl4->daddr))
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      fl4->flowi4_scope);
+			else if (!fl4->daddr)
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_HOST);
+		}
+	}
+
+	if (!fl4->daddr) {
+		fl4->daddr = fl4->saddr;
+		if (!fl4->daddr)
+			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
+		dev_out = net->loopback_dev;
+		fl4->flowi4_oif = net->loopback_dev->ifindex;
+		res.type = RTN_LOCAL;
+		flags |= RTCF_LOCAL;
+		goto make_route;
+	}
+
+	if (fib_lookup(net, fl4, &res)) {
+		res.fi = NULL;
+		if (fl4->flowi4_oif) {
+			/* Apparently, routing tables are wrong. Assume,
+			   that the destination is on link.
+
+			   WHY? DW.
+			   Because we are allowed to send to iface
+			   even if it has NO routes and NO assigned
+			   addresses. When oif is specified, routing
+			   tables are looked up with only one purpose:
+			   to catch if destination is gatewayed, rather than
+			   direct. Moreover, if MSG_DONTROUTE is set,
+			   we send packet, ignoring both routing tables
+			   and ifaddr state. --ANK
+
+
+			   We could make it even if oif is unknown,
+			   likely IPv6, but we do not.
+			 */
+
+			if (fl4->saddr == 0)
+				fl4->saddr = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_LINK);
+			res.type = RTN_UNICAST;
+			goto make_route;
+		}
+		rth = ERR_PTR(-ENETUNREACH);
+		goto out;
+	}
+
+	if (res.type == RTN_LOCAL) {
+		if (!fl4->saddr) {
+			if (res.fi->fib_prefsrc)
+				fl4->saddr = res.fi->fib_prefsrc;
+			else
+				fl4->saddr = fl4->daddr;
+		}
+		dev_out = net->loopback_dev;
+		fl4->flowi4_oif = dev_out->ifindex;
+		res.fi = NULL;
+		flags |= RTCF_LOCAL;
+		goto make_route;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
+		fib_select_multipath(&res);
+	else
+#endif
+	if (!res.prefixlen &&
+	    res.table->tb_num_default > 1 &&
+	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
+		fib_select_default(&res);
+
+	if (!fl4->saddr)
+		fl4->saddr = FIB_RES_PREFSRC(net, res);
+
+	dev_out = FIB_RES_DEV(res);
+	fl4->flowi4_oif = dev_out->ifindex;
+
+	/* Build the entry and insert it under the caller's original key. */
+make_route:
+	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
+			       dev_out, flags);
+	if (!IS_ERR(rth)) {
+		unsigned int hash;
+
+		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
+			       rt_genid(dev_net(dev_out)));
+		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
+	}
+
+out:
+	rcu_read_unlock();
+	return rth;
+}
+
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
+{
+	struct rtable *rth;
+	unsigned int hash;
+
+	if (!rt_caching(net))
+		goto slow_output;
+
+	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
+	/* Scan the hash chain for a matching, unexpired output route. */
+	rcu_read_lock_bh();
+	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
+		rth = rcu_dereference_bh(rth->dst.rt_next)) {
+		if (rth->rt_key_dst == flp4->daddr &&
+		    rth->rt_key_src == flp4->saddr &&
+		    rt_is_output_route(rth) &&
+		    rth->rt_oif == flp4->flowi4_oif &&
+		    rth->rt_mark == flp4->flowi4_mark &&
+		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
+			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
+		    !rt_is_expired(rth)) {
+			dst_use(&rth->dst, jiffies);
+			RT_CACHE_STAT_INC(out_hit);
+			rcu_read_unlock_bh();
+			if (!flp4->saddr)	/* fill in zero key addresses */
+				flp4->saddr = rth->rt_src;
+			if (!flp4->daddr)
+				flp4->daddr = rth->rt_dst;
+			return rth;
+		}
+		RT_CACHE_STAT_INC(out_hlist_search);
+	}
+	rcu_read_unlock_bh();
+	/* Cache miss, or caching disabled: resolve via ip_route_output_slow(). */
+slow_output:
+	return ip_route_output_slow(net, flp4);
+}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	return NULL;	/* unconditionally NULL; dst and cookie are ignored */
+}
+
+static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
+{
+	return 0;	/* always 0, independent of dst */
+}
+
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+{	/* empty on purpose: the mtu argument is ignored */
+}
+
+static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
+					  unsigned long old)
+{
+	return NULL;	/* always NULL: no metrics copy is made */
+}
+
+static struct dst_ops ipv4_dst_blackhole_ops = {
+	.family			=	AF_INET,
+	.protocol		=	cpu_to_be16(ETH_P_IP),
+	.destroy		=	ipv4_dst_destroy, /* frees rt->fi, rt->peer */
+	.check			=	ipv4_blackhole_dst_check,
+	.default_mtu		=	ipv4_blackhole_default_mtu,
+	.default_advmss		=	ipv4_default_advmss, /* not a stub */
+	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
+	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
+};
+
+struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
+	struct rtable *ort = (struct rtable *) dst_orig;
+
+	if (rt) {
+		struct dst_entry *new = &rt->dst;
+		/* Both handlers are dst_discard; metrics are copied from dst_orig. */
+		new->__use = 1;
+		new->input = dst_discard;
+		new->output = dst_discard;
+		dst_copy_metrics(new, &ort->dst);
+
+		new->dev = ort->dst.dev;
+		if (new->dev)
+			dev_hold(new->dev);
+
+		rt->rt_key_dst = ort->rt_key_dst;
+		rt->rt_key_src = ort->rt_key_src;
+		rt->rt_key_tos = ort->rt_key_tos;
+		rt->rt_route_iif = ort->rt_route_iif;
+		rt->rt_iif = ort->rt_iif;
+		rt->rt_oif = ort->rt_oif;
+		rt->rt_mark = ort->rt_mark;
+
+		rt->rt_genid = rt_genid(net);
+		rt->rt_flags = ort->rt_flags;
+		rt->rt_type = ort->rt_type;
+		rt->rt_dst = ort->rt_dst;
+		rt->rt_src = ort->rt_src;
+		rt->rt_gateway = ort->rt_gateway;
+		rt->rt_spec_dst = ort->rt_spec_dst;
+		rt->peer = ort->peer;
+		if (rt->peer)
+			atomic_inc(&rt->peer->refcnt);
+		rt->fi = ort->fi;
+		if (rt->fi)
+			atomic_inc(&rt->fi->fib_clntref);
+		/* NOTE(review): presumably defers the free until unreferenced. */
+		dst_free(new);
+	}
+	/* dst_orig is released whether or not the allocation succeeded. */
+	dst_release(dst_orig);
+
+	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
+}
+
+struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
+				    struct sock *sk)
+{
+	struct rtable *route = __ip_route_output_key(net, flp4);
+
+	/*
+	 * A failed lookup is returned as-is (ERR_PTR), and a flow with
+	 * no protocol set is returned without going through xfrm_lookup().
+	 */
+	if (IS_ERR(route) || !flp4->flowi4_proto)
+		return route;
+
+	return (struct rtable *) xfrm_lookup(net, &route->dst,
+					     flowi4_to_flowi(flp4), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
+static int rt_fill_info(struct net *net,
+			struct sk_buff *skb, u32 pid, u32 seq, int event,
+			int nowait, unsigned int flags)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct rtmsg *r;
+	struct nlmsghdr *nlh;
+	long expires = 0;
+	const struct inet_peer *peer = rt->peer;
+	u32 id = 0, ts = 0, tsage = 0, error;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+	/* The route described is the one attached to skb as its dst. */
+	r = nlmsg_data(nlh);
+	r->rtm_family	 = AF_INET;
+	r->rtm_dst_len	= 32;
+	r->rtm_src_len	= 0;
+	r->rtm_tos	= rt->rt_key_tos;
+	r->rtm_table	= RT_TABLE_MAIN;
+	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
+	r->rtm_type	= rt->rt_type;
+	r->rtm_scope	= RT_SCOPE_UNIVERSE;
+	r->rtm_protocol = RTPROT_UNSPEC;
+	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+	if (rt->rt_flags & RTCF_NOTIFY)
+		r->rtm_flags |= RTM_F_NOTIFY;
+	/* NLA_PUT_* presumably bail out through the nla_put_failure label. */
+	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
+
+	if (rt->rt_key_src) {
+		r->rtm_src_len = 32;
+		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
+	}
+	if (rt->dst.dev)
+		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (rt->dst.tclassid)
+		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
+#endif
+	if (rt_is_input_route(rt))
+		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
+	else if (rt->rt_src != rt->rt_key_src)
+		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
+
+	if (rt->rt_dst != rt->rt_gateway)
+		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
+
+	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+		goto nla_put_failure;
+	/* NOTE(review): inet_rtm_getroute() reads RTA_MARK with nla_get_u32(). */
+	if (rt->rt_mark)
+		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
+	/* Peer data, when present: IP ID counter, TCP timestamp, PMTU expiry. */
+	error = rt->dst.error;
+	if (peer) {
+		inet_peer_refcheck(rt->peer);
+		id = atomic_read(&peer->ip_id_count) & 0xffff;
+		if (peer->tcp_ts_stamp) {
+			ts = peer->tcp_ts;
+			tsage = get_seconds() - peer->tcp_ts_stamp;
+		}
+		expires = ACCESS_ONCE(peer->pmtu_expires);
+		if (expires)
+			expires -= jiffies;
+	}
+
+	if (rt_is_input_route(rt)) {
+#ifdef CONFIG_IP_MROUTE
+		__be32 dst = rt->rt_dst;
+
+		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
+		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+			int err = ipmr_get_route(net, skb,
+						 rt->rt_src, rt->rt_dst,
+						 r, nowait);
+			if (err <= 0) {
+				if (!nowait) {
+					if (err == 0)
+						return 0;
+					goto nla_put_failure;
+				} else {
+					if (err == -EMSGSIZE)
+						goto nla_put_failure;
+					error = err;
+				}
+			}
+		} else
+#endif
+			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
+	}
+
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
+			       expires, error) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+	/* Any failure cancels the partial message and reports -EMSGSIZE. */
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct rtmsg *rtm;
+	struct nlattr *tb[RTA_MAX+1];
+	struct rtable *rt = NULL;
+	__be32 dst = 0;
+	__be32 src = 0;
+	u32 iif;
+	int err;
+	int mark;
+	struct sk_buff *skb;
+
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
+	if (err < 0)
+		goto errout;
+
+	rtm = nlmsg_data(nlh);
+	/* This skb is both the dummy packet for the lookup and the reply. */
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL) {
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	/* Reserve room for dummy headers, this skb can pass
+	   through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+
+	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
+	ip_hdr(skb)->protocol = IPPROTO_ICMP;
+	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
+	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
+	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+	/* With RTA_IIF: an input lookup on that device; else an output lookup. */
+	if (iif) {
+		struct net_device *dev;
+
+		dev = __dev_get_by_index(net, iif);
+		if (dev == NULL) {
+			err = -ENODEV;
+			goto errout_free;
+		}
+
+		skb->protocol	= htons(ETH_P_IP);
+		skb->dev	= dev;
+		skb->mark	= mark;
+		local_bh_disable();
+		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+		local_bh_enable();
+
+		rt = skb_rtable(skb);
+		if (err == 0 && rt->dst.error)
+			err = -rt->dst.error;
+	} else {
+		struct flowi4 fl4 = {
+			.daddr = dst,
+			.saddr = src,
+			.flowi4_tos = rtm->rtm_tos,
+			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+			.flowi4_mark = mark,
+		};
+		rt = ip_route_output_key(net, &fl4);
+
+		err = 0;
+		if (IS_ERR(rt))
+			err = PTR_ERR(rt);
+	}
+
+	if (err)
+		goto errout_free;
+
+	skb_dst_set(skb, &rt->dst);
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
+		rt->rt_flags |= RTCF_NOTIFY;
+	/* Describe the route as RTM_NEWROUTE and unicast it to the requester. */
+	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+			   RTM_NEWROUTE, 0, 0);
+	if (err <= 0)
+		goto errout_free;
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+errout:
+	return err;
+
+errout_free:
+	kfree_skb(skb);
+	goto errout;
+}
+
+/*
+ * ip_rt_dump - netlink dump callback that emits route cache entries.
+ *
+ * Walks rt_hash_table bucket by bucket and appends one RTM_NEWROUTE
+ * (NLM_F_MULTI) record per cached route to skb via rt_fill_info().  Entries
+ * that belong to another network namespace or that rt_is_expired() reports
+ * as expired are skipped.
+ *
+ * The resume position is kept in cb->args[0] (bucket index) and
+ * cb->args[1] (index inside that bucket); both are stored again before
+ * returning.  Returns skb->len.
+ */
+int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
+{
+	struct rtable *rt;
+	int h, s_h;
+	int idx, s_idx;
+	struct net *net;
+
+	net = sock_net(skb->sk);
+
+	s_h = cb->args[0];
+	if (s_h < 0)
+		s_h = 0;
+	s_idx = idx = cb->args[1];
+	/* s_idx only applies to the first bucket visited; it is reset to 0 afterwards. */
+	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
+		if (!rt_hash_table[h].chain)
+			continue;
+		rcu_read_lock_bh();
+		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
+		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
+			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
+				continue;
+			if (rt_is_expired(rt))
+				continue;
+			/* Attach the route to skb without taking a reference while it is filled in. */
+			skb_dst_set_noref(skb, &rt->dst);
+			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
+					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+					 1, NLM_F_MULTI) <= 0) {
+				/* Stop here; h and idx record where to resume. */
+				skb_dst_drop(skb);
+				rcu_read_unlock_bh();
+				goto done;
+			}
+			skb_dst_drop(skb);
+		}
+		rcu_read_unlock_bh();
+	}
+
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+	return skb->len;
+}
+
+/*
+ * Multicast event hook: call rt_cache_flush() with a delay argument of 0
+ * for the network namespace that owns in_dev's device.
+ */
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+	struct net *net = dev_net(in_dev->dev);
+
+	rt_cache_flush(net, 0);
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * ipv4_sysctl_rtcache_flush - handler for the write-only "flush" sysctl.
+ *
+ * The integer written by user space is parsed into a local variable (via a
+ * stack copy of the table entry so the shared entry is never modified) and
+ * passed as the delay argument to rt_cache_flush() for the namespace stored
+ * in __ctl->extra1 by sysctl_route_net_init().
+ *
+ * Returns 0 after a successful flush, the error reported by proc_dointvec()
+ * when the written value cannot be parsed, and -EINVAL for reads.
+ */
+static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
+					void __user *buffer,
+					size_t *lenp, loff_t *ppos)
+{
+	if (write) {
+		/*
+		 * Start from 0 (the delay ip_rt_multicast_event() uses) so a
+		 * write that stores no value never feeds stack garbage to
+		 * rt_cache_flush().
+		 */
+		int flush_delay = 0;
+		ctl_table ctl;
+		struct net *net;
+		int ret;
+
+		memcpy(&ctl, __ctl, sizeof(ctl));
+		ctl.data = &flush_delay;
+		ret = proc_dointvec(&ctl, write, buffer, lenp, ppos);
+		if (ret)
+			return ret;
+
+		net = (struct net *)__ctl->extra1;
+		rt_cache_flush(net, flush_delay);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Tunables of the IPv4 routing cache.  The table is the child of the
+ * "route" directory in ipv4_skeleton.  Entries using proc_dointvec_jiffies
+ * or proc_dointvec_ms_jiffies store jiffies internally; each procname must
+ * appear only once in the table.
+ */
+static ctl_table ipv4_route_table[] = {
+	{
+		.procname	= "gc_thresh",
+		.data		= &ipv4_dst_ops.gc_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "max_size",
+		.data		= &ip_rt_max_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		/*  Deprecated. Use gc_min_interval_ms */
+
+		.procname	= "gc_min_interval",
+		.data		= &ip_rt_gc_min_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		/* Same variable as gc_min_interval, exposed in milliseconds. */
+		.procname	= "gc_min_interval_ms",
+		.data		= &ip_rt_gc_min_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
+		.procname	= "gc_timeout",
+		.data		= &ip_rt_gc_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "gc_interval",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "redirect_load",
+		.data		= &ip_rt_redirect_load,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "redirect_number",
+		.data		= &ip_rt_redirect_number,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "redirect_silence",
+		.data		= &ip_rt_redirect_silence,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "error_cost",
+		.data		= &ip_rt_error_cost,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "error_burst",
+		.data		= &ip_rt_error_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "gc_elasticity",
+		.data		= &ip_rt_gc_elasticity,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "mtu_expires",
+		.data		= &ip_rt_mtu_expires,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "min_pmtu",
+		.data		= &ip_rt_min_pmtu,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "min_adv_mss",
+		.data		= &ip_rt_min_advmss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+/* Zero-filled table used as the child list of the "neigh" entry in ipv4_skeleton. */
+static struct ctl_table empty[1];
+
+/*
+ * Directory skeleton registered under ipv4_path by ip_static_sysctl_init():
+ * a "route" directory holding ipv4_route_table and a "neigh" directory
+ * whose child table is the zero-filled "empty" array.
+ */
+static struct ctl_table ipv4_skeleton[] = {
+	{
+		.procname	= "route",
+		.mode		= 0555,
+		.child		= ipv4_route_table,
+	},
+	{
+		.procname	= "neigh",
+		.mode		= 0555,
+		.child		= empty,
+	},
+	{ }
+};
+
+/* sysctl path "net"/"ipv4" under which ip_static_sysctl_init() registers ipv4_skeleton. */
+static __net_initdata struct ctl_path ipv4_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ipv4", },
+	{ },
+};
+
+/*
+ * The write-only (mode 0200) "flush" entry.  sysctl_route_net_init() uses
+ * this table directly for init_net and a kmemdup() copy for every other
+ * namespace, storing the owning struct net in entry 0's extra1 field.
+ */
+static struct ctl_table ipv4_route_flush_table[] = {
+	{
+		.procname	= "flush",
+		.maxlen		= sizeof(int),
+		.mode		= 0200,
+		.proc_handler	= ipv4_sysctl_rtcache_flush,
+	},
+	{ },
+};
+
+/* sysctl path "net"/"ipv4"/"route" used by sysctl_route_net_init() for the flush table. */
+static __net_initdata struct ctl_path ipv4_route_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ipv4", },
+	{ .procname = "route", },
+	{ },
+};
+
+/*
+ * Per-namespace setup of the "flush" sysctl.
+ *
+ * init_net uses ipv4_route_flush_table in place; any other namespace gets
+ * its own kmemdup() copy.  In both cases entry 0's extra1 is pointed at the
+ * namespace so the handler knows which cache to flush.  The registration
+ * header is kept in net->ipv4.route_hdr.
+ *
+ * Returns 0 on success, -ENOMEM if the copy or the registration fails.
+ */
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+	struct ctl_table *table = ipv4_route_flush_table;
+
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(ipv4_route_flush_table,
+				sizeof(ipv4_route_flush_table), GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+	}
+	table[0].extra1 = net;
+
+	net->ipv4.route_hdr =
+		register_net_sysctl_table(net, ipv4_route_path, table);
+	if (net->ipv4.route_hdr)
+		return 0;
+
+	/* Registration failed: release the private copy, never the static table. */
+	if (table != ipv4_route_flush_table)
+		kfree(table);
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace teardown of the "flush" sysctl: unregister the header saved
+ * in net->ipv4.route_hdr and free the table copy it was registered with.
+ * The table pointer is read before unregistering.  Reaching this with the
+ * static ipv4_route_flush_table is treated as a bug.
+ */
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+	struct ctl_table *table = net->ipv4.route_hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(net->ipv4.route_hdr);
+	BUG_ON(table == ipv4_route_flush_table);
+	kfree(table);
+}
+
+/* Per-namespace hooks for the "flush" sysctl; registered by ip_rt_init(). */
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+	.init = sysctl_route_net_init,
+	.exit = sysctl_route_net_exit,
+};
+#endif
+
+/*
+ * Per-namespace init: fill net->ipv4.rt_genid and net->ipv4.dev_addr_genid
+ * with random bytes.  Always returns 0.
+ */
+static __net_init int rt_genid_init(struct net *net)
+{
+	get_random_bytes(&net->ipv4.rt_genid,
+			 sizeof(net->ipv4.rt_genid));
+	get_random_bytes(&net->ipv4.dev_addr_genid,
+			 sizeof(net->ipv4.dev_addr_genid));
+	return 0;
+}
+
+/* Per-namespace hook that seeds the generation ids; registered by ip_rt_init(). */
+static __net_initdata struct pernet_operations rt_genid_ops = {
+	.init = rt_genid_init,
+};
+
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+/* Per-cpu accounting area; ip_rt_init() allocates 256 struct ip_rt_acct slots. */
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
+#endif /* CONFIG_IP_ROUTE_CLASSID */
+
+/* Value of the "rhash_entries=" boot parameter; 0 when it was not given. */
+static __initdata unsigned long rhash_entries;
+
+/*
+ * Boot-parameter parser for "rhash_entries=".  The number is read with
+ * simple_strtoul() using base 0; ip_rt_init() later passes it to
+ * alloc_large_system_hash().  Returns 1 when a string was supplied and 0
+ * when str is NULL.
+ */
+static int __init set_rhash_entries(char *str)
+{
+	char *end;
+
+	if (str == NULL)
+		return 0;
+
+	rhash_entries = simple_strtoul(str, &end, 0);
+	return 1;
+}
+__setup("rhash_entries=", set_rhash_entries);
+
+/*
+ * ip_rt_init - boot-time initialisation of the IPv4 routing layer.
+ *
+ * Creates the dst slab cache (shared with the blackhole dst ops), allocates
+ * and clears the route cache hash table, derives gc_thresh and
+ * ip_rt_max_size from the table size, initialises devinet and the FIB,
+ * schedules the expiry worker, and registers the proc files, the
+ * RTM_GETROUTE handler and the per-namespace operations.
+ *
+ * Allocation failures for the accounting area or the dst counters panic.
+ * Always returns 0, since rc is never changed after its initialisation.
+ */
+int __init ip_rt_init(void)
+{
+	int rc = 0;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
+	if (!ip_rt_acct)
+		panic("IP: failed to allocate ip_rt_acct\n");
+#endif
+
+	/* SLAB_PANIC is passed, so the result is not checked here. */
+	ipv4_dst_ops.kmem_cachep =
+		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
+
+	if (dst_entries_init(&ipv4_dst_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
+	/*
+	 * Size the hash from rhash_entries when the boot parameter was
+	 * given; otherwise the last argument (512 * 1024) is passed as the
+	 * limit.  The scale argument depends on the amount of RAM.
+	 */
+	rt_hash_table = (struct rt_hash_bucket *)
+		alloc_large_system_hash("IP route cache",
+					sizeof(struct rt_hash_bucket),
+					rhash_entries,
+					(totalram_pages >= 128 * 1024) ?
+					15 : 17,
+					0,
+					&rt_hash_log,
+					&rt_hash_mask,
+					rhash_entries ? 0 : 512 * 1024);
+	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+	rt_hash_lock_init();
+
+	/* Defaults: gc_thresh is one entry per bucket, max_size sixteen per bucket. */
+	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
+	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+
+	devinet_init();
+	ip_fib_init();
+
+	/* First run is delayed by between one and two ip_rt_gc_interval periods. */
+	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+	expires_ljiffies = jiffies;
+	schedule_delayed_work(&expires_work,
+		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
+	/* Failure to create the proc files is logged but not fatal. */
+	if (ip_rt_proc_init())
+		printk(KERN_ERR "Unable to create route proc files\n");
+#ifdef CONFIG_XFRM
+	xfrm_init();
+	xfrm4_init(ip_rt_max_size);
+#endif
+	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
+
+#ifdef CONFIG_SYSCTL
+	register_pernet_subsys(&sysctl_route_ops);
+#endif
+	register_pernet_subsys(&rt_genid_ops);
+	return rc;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the static "route" and "neigh" directories (ipv4_skeleton)
+ * under the "net"/"ipv4" path.
+ *
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+	register_sysctl_paths(ipv4_path, ipv4_skeleton);
+}
+#endif
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 00000000..895f2157
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,377 @@
+/*
+ *  Syncookies implementation for the Linux kernel
+ *
+ *  Copyright (C) 1997 Andi Kleen
+ *  Based on ideas by D.J.Bernstein and Eric Schenk.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+/* Timestamps: lowest bits store TCP options */
+#define TSBITS 6
+/* Mask selecting the TSBITS low-order bits of a timestamp. */
+#define TSMASK (((__u32)1 << TSBITS) - 1)
+
+extern int sysctl_tcp_syncookies;
+
+/*
+ * Two secret keys, selected by the last argument of cookie_hash().  They are
+ * filled with random bytes by init_syncookies().
+ */
+__u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+EXPORT_SYMBOL(syncookie_secret);
+
+/* Initcall: fill both syncookie secrets with random bytes.  Always returns 0. */
+static __init int init_syncookies(void)
+{
+	get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
+	return 0;
+}
+__initcall(init_syncookies);
+
+#define COOKIEBITS 24	/* Upper bits store count */
+/* Mask selecting the COOKIEBITS low-order bits of a cookie. */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+/*
+ * Per-cpu work area for cookie_hash(): 16 words of input, 5 words of digest
+ * and SHA_WORKSPACE_WORDS words of scratch space for sha_transform().
+ */
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
+		      ipv4_cookie_scratch);
+
+/*
+ * Hash the connection 4-tuple and a counter with secret number c.
+ *
+ * The per-cpu scratch buffer is laid out as: words 0-3 the addresses, the
+ * packed ports and count; words 4 onwards a copy of syncookie_secret[c];
+ * the digest starts at word 16 and the sha_transform() workspace at word 21.
+ * Returns the second digest word.
+ */
+static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
+		       u32 count, int c)
+{
+	__u32 *scratch = __get_cpu_var(ipv4_cookie_scratch);
+	__u32 *digest = scratch + 16;
+	__u32 *workspace = digest + 5;
+
+	scratch[0] = (__force u32)saddr;
+	scratch[1] = (__force u32)daddr;
+	scratch[2] = ((__force u32)sport << 16) + (__force u32)dport;
+	scratch[3] = count;
+	memcpy(&scratch[4], syncookie_secret[c], sizeof(syncookie_secret[c]));
+	sha_transform(digest, (__u8 *)scratch, workspace);
+
+	return digest[1];
+}
+
+
+/*
+ * With syncookies in effect and TCP timestamps enabled, the negotiated TCP
+ * options are hidden in the TSBITS low-order bits of the timestamp sent in
+ * the SYN-ACK: bits 0-3 carry snd_wscale (0xf when window scaling is off),
+ * bit 4 the SACK flag and bit 5 the ECN flag.
+ *
+ * Later segments use the plain tcp_time_stamp value, so the value returned
+ * here must not be ahead of it: if merging the option bits pushes the result
+ * past the current timestamp, the upper part is stepped back by one unit.
+ */
+__u32 cookie_init_timestamp(struct request_sock *req)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	u32 now = tcp_time_stamp;
+	u32 options;
+	u32 ts;
+
+	options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
+	options |= ireq->sack_ok << 4;
+	options |= ireq->ecn_ok << 5;
+
+	ts = (now & ~TSMASK) | options;
+	if (ts > now)
+		ts = (((ts >> TSBITS) - 1) << TSBITS) | options;
+
+	return ts;
+}
+
+
+/*
+ * Compute the secure sequence number used as a SYN cookie:
+ *
+ *   HASH(secret 0, saddr, daddr, sport, dport, 0) + sseq + (count << 24)
+ *      + ((HASH(secret 1, saddr, daddr, sport, dport, count) + data) % 2^24)
+ *
+ * sseq is the peer's sequence number and count is the caller's coarse time
+ * counter.  The small "data" value (the MSS table index) is folded into the
+ * second hash so that check_tcp_syn_cookie() can recover it.
+ */
+static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
+				   __be16 dport, __u32 sseq, __u32 count,
+				   __u32 data)
+{
+	__u32 base = cookie_hash(saddr, daddr, sport, dport, 0, 0);
+	__u32 keyed = cookie_hash(saddr, daddr, sport, dport, count, 1);
+
+	return base + sseq + (count << COOKIEBITS) +
+	       ((keyed + data) & COOKIEMASK);
+}
+
+/*
+ * Recover the small "data" value from a syncookie.
+ * If the syncookie is bad, the data returned will be out of
+ * range.  This must be checked by the caller.
+ *
+ * The count value used to generate the cookie must be within
+ * "maxdiff" of the current (passed-in) "count".  The return value
+ * is (__u32)-1 if this test fails.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
+				  __u32 count, __u32 maxdiff)
+{
+	__u32 stamped;
+	__u32 age;
+
+	/* Peel off the first hash and the peer's sequence number. */
+	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+	/* The bits above COOKIEBITS now hold the count the cookie was made with. */
+	stamped = cookie >> COOKIEBITS;
+	age = (count - stamped) & ((__u32)-1 >> COOKIEBITS);
+	if (age >= maxdiff)
+		return (__u32)-1;
+
+	/* Remove the second hash, leaving the data behind. */
+	cookie -= cookie_hash(saddr, daddr, sport, dport, count - age, 1);
+	return cookie & COOKIEMASK;
+}
+
+/*
+ * MSS Values are taken from the 2009 paper
+ * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
+ *  - values 1440 to 1460 accounted for 80% of observed mss values
+ *  - values outside the 536-1460 range are rare (<0.2%).
+ *
+ * Table must be sorted.
+ *
+ * The index into this table, not the MSS itself, is what
+ * cookie_v4_init_sequence() encodes in the cookie and what cookie_check()
+ * decodes again.
+ */
+static __u16 const msstab[] = {
+	64,
+	512,
+	536,
+	1024,
+	1440,
+	1460,
+	4312,
+	8960,
+};
+
+/*
+ * Generate a syncookie for the SYN in skb.
+ *
+ * On entry *mssp holds the peer's MSS; on return it holds that value
+ * rounded down to the msstab entry whose index is encoded in the cookie
+ * (entry 0 is used when the MSS is below every table value).  The cookie
+ * counter advances once per minute (jiffies / (HZ * 60)).
+ */
+__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+	const __u16 mss = *mssp;
+	int mssind = ARRAY_SIZE(msstab) - 1;
+
+	tcp_synq_overflow(sk);
+
+	/* Walk down from the largest entry until one fits or index 0 is reached. */
+	while (mssind && mss < msstab[mssind])
+		mssind--;
+	*mssp = msstab[mssind];
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
+				     th->source, th->dest, ntohl(th->seq),
+				     jiffies / (HZ * 60), mssind);
+}
+
+/*
+ * This (misnamed) value is the age of syncookie which is permitted.
+ * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1. It's a rather complicated formula (exponential
+ * backoff) to compute at runtime so it's currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+/*
+ * Check whether an ACK's sequence number is a valid syncookie.
+ * Returns the MSS decoded from the cookie, or 0 when the cookie is invalid
+ * (the decoded index falls outside msstab, which includes the (__u32)-1
+ * "too old" result of check_tcp_syn_cookie()).
+ */
+static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+	__u32 seq = ntohl(th->seq) - 1;
+	__u32 mssind;
+
+	mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+				      th->source, th->dest, seq,
+				      jiffies / (HZ * 60), COUNTER_TRIES);
+	if (mssind >= ARRAY_SIZE(msstab))
+		return 0;
+
+	return msstab[mssind];
+}
+
+/*
+ * Create the child socket for a validated cookie through the address
+ * family's syn_recv_sock() operation.  On success the request is added to
+ * the listener's queue together with the child; on failure the request is
+ * freed.  Returns the child socket or NULL.
+ */
+static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+					   struct request_sock *req,
+					   struct dst_entry *dst)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sock *newsk;
+
+	newsk = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+	if (!newsk) {
+		reqsk_free(req);
+		return NULL;
+	}
+
+	inet_csk_reqsk_queue_add(sk, req, newsk);
+	return newsk;
+}
+
+
+/*
+ * when syncookies are in effect and tcp timestamps are enabled we stored
+ * additional tcp options in the timestamp.
+ * This extracts these options from the timestamp echo.
+ *
+ * The lowest 4 bits store snd_wscale.
+ * next 2 bits indicate SACK and ECN support.
+ *
+ * return false if we decode an option that should not be.
+ */
+/*
+ * Decode the options stored in the echoed timestamp into tcp_opt and
+ * *ecn_ok.  Without a timestamp option the parsed options are cleared and
+ * true is returned.  Returns false when timestamps are disabled by sysctl
+ * or when the decoded SACK, ECN or window-scaling option is one the
+ * corresponding sysctl does not allow.
+ */
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
+{
+	/* echoed timestamp, lowest bits contain options */
+	u32 options = tcp_opt->rcv_tsecr & TSMASK;
+	u32 wscale = options & 0xf;
+
+	if (!tcp_opt->saw_tstamp) {
+		tcp_clear_options(tcp_opt);
+		return true;
+	}
+
+	if (!sysctl_tcp_timestamps)
+		return false;
+
+	tcp_opt->sack_ok = (options >> 4) & 0x1;
+	*ecn_ok = (options >> 5) & 1;
+	if ((*ecn_ok && !sysctl_tcp_ecn) ||
+	    (tcp_opt->sack_ok && !sysctl_tcp_sack))
+		return false;
+
+	if (wscale == 0xf)
+		return true; /* no window scaling */
+
+	tcp_opt->wscale_ok = 1;
+	tcp_opt->snd_wscale = wscale;
+	return sysctl_tcp_window_scaling != 0;
+}
+EXPORT_SYMBOL(cookie_check_timestamp);
+
+/*
+ * cookie_v4_check - try to complete a connection from a syncookie ACK.
+ *
+ * Return value:
+ *  - sk itself when the segment is not handled as a cookie: syncookies are
+ *    disabled, the segment is not a plain ACK, the cookie does not validate,
+ *    or the timestamp-encoded options are rejected;
+ *  - NULL when the cookie validated but the request could not be set up
+ *    (allocation failure, security hook veto, route lookup failure, or
+ *    failure to create the child);
+ *  - the newly created child socket otherwise.
+ */
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+			     struct ip_options *opt)
+{
+	struct tcp_options_received tcp_opt;
+	u8 *hash_location;
+	struct inet_request_sock *ireq;
+	struct tcp_request_sock *treq;
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcphdr *th = tcp_hdr(skb);
+	/* The cookie was sent as our initial sequence number, so the ACK carries cookie + 1. */
+	__u32 cookie = ntohl(th->ack_seq) - 1;
+	struct sock *ret = sk;
+	struct request_sock *req;
+	int mss;
+	struct rtable *rt;
+	__u8 rcv_wscale;
+	bool ecn_ok = false;
+	struct flowi4 fl4;
+
+	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+		goto out;
+
+	if (tcp_synq_no_recent_overflow(sk) ||
+	    (mss = cookie_check(skb, cookie)) == 0) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+		goto out;
+	}
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+
+	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+		goto out;
+
+	/* From here on every failure returns NULL instead of the listener. */
+	ret = NULL;
+	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
+	if (!req)
+		goto out;
+
+	/* Rebuild the request from the ACK and the options decoded above. */
+	ireq = inet_rsk(req);
+	treq = tcp_rsk(req);
+	treq->rcv_isn		= ntohl(th->seq) - 1;
+	treq->snt_isn		= cookie;
+	req->mss		= mss;
+	ireq->loc_port		= th->dest;
+	ireq->rmt_port		= th->source;
+	ireq->loc_addr		= ip_hdr(skb)->daddr;
+	ireq->rmt_addr		= ip_hdr(skb)->saddr;
+	ireq->ecn_ok		= ecn_ok;
+	ireq->snd_wscale	= tcp_opt.snd_wscale;
+	ireq->sack_ok		= tcp_opt.sack_ok;
+	ireq->wscale_ok		= tcp_opt.wscale_ok;
+	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
+	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+
+	/* We threw away the options of the initial SYN, so we hope
+	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
+	 */
+	if (opt && opt->optlen) {
+		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
+
+		/* Best effort: a failed allocation or echo leaves ireq->opt unset/NULL. */
+		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
+		if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
+			kfree(ireq->opt);
+			ireq->opt = NULL;
+		}
+	}
+
+	if (security_inet_conn_request(sk, skb, req)) {
+		reqsk_free(req);
+		goto out;
+	}
+
+	req->expires	= 0UL;
+	req->retrans	= 0;
+
+	/*
+	 * We need to lookup the route here to get at the correct
+	 * window size. We should better make sure that the window size
+	 * hasn't changed since we received the original syn, but I see
+	 * no easy way to do this.
+	 */
+	flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
+			   RT_SCOPE_UNIVERSE, IPPROTO_TCP,
+			   inet_sk_flowi_flags(sk),
+			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
+			   ireq->loc_addr, th->source, th->dest);
+	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
+	rt = ip_route_output_key(sock_net(sk), &fl4);
+	if (IS_ERR(rt)) {
+		reqsk_free(req);
+		goto out;
+	}
+
+	/* Try to redo what tcp_v4_send_synack did. */
+	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+
+	tcp_select_initial_window(tcp_full_space(sk), req->mss,
+				  &req->rcv_wnd, &req->window_clamp,
+				  ireq->wscale_ok, &rcv_wscale,
+				  dst_metric(&rt->dst, RTAX_INITRWND));
+
+	ireq->rcv_wscale  = rcv_wscale;
+
+	/* get_cookie_sock() frees req itself when no child can be created. */
+	ret = get_cookie_sock(sk, skb, req, &rt->dst);
+	/* ip_queue_xmit() depends on our flow being setup
+	 * Normal sockets get it right from inet_csk_route_child_sock()
+	 */
+	if (ret)
+		inet_sk(ret)->cork.fl.u.ip4 = fl4;
+out:	return ret;
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 00000000..57d0752e
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,840 @@
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/seqlock.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <net/snmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/cipso_ipv4.h>
+#include <net/inet_frag.h>
+#include <net/ping.h>
+
+/*
+ * Limits handed to proc_dointvec_minmax() through the extra1 (minimum) and
+ * extra2 (maximum) fields of the sysctl entries and temporary tables below.
+ * The two-element arrays bound both values of a range sysctl.
+ */
+static int zero;
+static int tcp_retr1_max = 255;
+static int ip_local_port_range_min[] = { 1, 1 };
+static int ip_local_port_range_max[] = { 65535, 65535 };
+static int tcp_adv_win_scale_min = -31;
+static int tcp_adv_win_scale_max = 31;
+static int ip_ttl_min = 1;
+static int ip_ttl_max = 255;
+static int ip_ping_group_range_min[] = { 0, 0 };
+static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+
+/*
+ * Update system visible IP port range.
+ * Both bounds are stored inside one write section of the
+ * sysctl_local_ports seqlock.
+ */
+static void set_local_port_range(int range[2])
+{
+	write_seqlock(&sysctl_local_ports.lock);
+	sysctl_local_ports.range[0] = range[0];
+	sysctl_local_ports.range[1] = range[1];
+	write_sequnlock(&sysctl_local_ports.lock);
+}
+
+/*
+ * sysctl handler for ip_local_port_range.
+ *
+ * Works on a stack copy of the current range so that the live values are
+ * only replaced after validation: each bound must pass the min/max limits
+ * enforced by proc_dointvec_minmax() and the upper bound must not be below
+ * the lower one (-EINVAL otherwise).
+ */
+static int ipv4_local_port_range(ctl_table *table, int write,
+				 void __user *buffer,
+				 size_t *lenp, loff_t *ppos)
+{
+	int bounds[2];
+	int ret;
+	ctl_table tmp = {
+		.data = &bounds,
+		.maxlen = sizeof(bounds),
+		.mode = table->mode,
+		.extra1 = &ip_local_port_range_min,
+		.extra2 = &ip_local_port_range_max,
+	};
+
+	inet_get_local_port_range(bounds, bounds + 1);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (!write || ret != 0)
+		return ret;
+
+	if (bounds[1] < bounds[0])
+		return -EINVAL;
+
+	set_local_port_range(bounds);
+	return 0;
+}
+
+
+/*
+ * Read the ping group range stored in table->data into *low and *high.
+ * The two values are read inside a seqlock read section and the read is
+ * retried if a writer interfered.  Note that the lock used is the one in
+ * sysctl_local_ports, which the local port range also uses.
+ */
+void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+{
+	gid_t *data = table->data;
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+}
+
+/*
+ * Update the system visible ping group range held in table->data.
+ * Both bounds are stored inside one write section of the
+ * sysctl_local_ports seqlock, matching the reader in
+ * inet_get_ping_group_range_table().
+ */
+static void set_ping_group_range(struct ctl_table *table, int range[2])
+{
+	gid_t *data = table->data;
+	write_seqlock(&sysctl_local_ports.lock);
+	data[0] = range[0];
+	data[1] = range[1];
+	write_sequnlock(&sysctl_local_ports.lock);
+}
+
+/*
+ * sysctl handler for the ping group range.
+ *
+ * Works on a stack copy of the current range; the live values are replaced
+ * only after proc_dointvec_minmax() accepted a write against the
+ * ip_ping_group_range_min/max limits.
+ */
+static int ipv4_ping_group_range(ctl_table *table, int write,
+				 void __user *buffer,
+				 size_t *lenp, loff_t *ppos)
+{
+	gid_t gids[2];
+	int ret;
+	ctl_table tmp = {
+		.data = &gids,
+		.maxlen = sizeof(gids),
+		.mode = table->mode,
+		.extra1 = &ip_ping_group_range_min,
+		.extra2 = &ip_ping_group_range_max,
+	};
+
+	inet_get_ping_group_range_table(table, gids, gids + 1);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (!write || ret != 0)
+		return ret;
+
+	set_ping_group_range(table, gids);
+	return ret;
+}
+
+/*
+ * sysctl handler for tcp_congestion_control.
+ *
+ * The current default algorithm name is copied into a stack buffer that
+ * proc_dostring() reads from or writes to; after a successful write the new
+ * name is handed to tcp_set_default_congestion_control(), whose result is
+ * returned.
+ */
+static int proc_tcp_congestion_control(ctl_table *ctl, int write,
+				       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char name[TCP_CA_NAME_MAX];
+	ctl_table tbl = {
+		.data = name,
+		.maxlen = TCP_CA_NAME_MAX,
+	};
+	int ret;
+
+	tcp_get_default_congestion_control(name);
+
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	if (!write || ret != 0)
+		return ret;
+
+	return tcp_set_default_congestion_control(name);
+}
+
+/*
+ * sysctl handler listing the available congestion control algorithms.
+ *
+ * The list is rendered into a temporary TCP_CA_BUF_MAX-byte buffer which
+ * proc_dostring() then serves; the buffer is freed before returning.
+ * Returns -ENOMEM when the buffer cannot be allocated, otherwise the result
+ * of proc_dostring().
+ */
+static int proc_tcp_available_congestion_control(ctl_table *ctl,
+						 int write,
+						 void __user *buffer, size_t *lenp,
+						 loff_t *ppos)
+{
+	ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
+	char *names;
+	int ret;
+
+	names = kmalloc(TCP_CA_BUF_MAX, GFP_USER);
+	if (names == NULL)
+		return -ENOMEM;
+	tbl.data = names;
+
+	tcp_get_available_congestion_control(names, TCP_CA_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+	kfree(names);
+	return ret;
+}
+
+/*
+ * sysctl handler for the list of allowed congestion control algorithms.
+ *
+ * The current list is rendered into a temporary TCP_CA_BUF_MAX-byte buffer
+ * for proc_dostring(); after a successful write the buffer contents are
+ * passed to tcp_set_allowed_congestion_control().  The buffer is freed on
+ * every path.  Returns -ENOMEM when the buffer cannot be allocated.
+ */
+static int proc_allowed_congestion_control(ctl_table *ctl,
+					   int write,
+					   void __user *buffer, size_t *lenp,
+					   loff_t *ppos)
+{
+	ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
+	char *names;
+	int ret;
+
+	names = kmalloc(TCP_CA_BUF_MAX, GFP_USER);
+	if (names == NULL)
+		return -ENOMEM;
+	tbl.data = names;
+
+	tcp_get_allowed_congestion_control(names, TCP_CA_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		ret = tcp_set_allowed_congestion_control(names);
+
+	kfree(names);
+	return ret;
+}
+
+static struct ctl_table ipv4_table[] = {
+	{
+		.procname	= "tcp_timestamps",
+		.data		= &sysctl_tcp_timestamps,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_window_scaling",
+		.data		= &sysctl_tcp_window_scaling,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_sack",
+		.data		= &sysctl_tcp_sack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_retrans_collapse",
+		.data		= &sysctl_tcp_retrans_collapse,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ip_default_ttl",
+		.data		= &sysctl_ip_default_ttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &ip_ttl_min,
+		.extra2		= &ip_ttl_max,
+	},
+	{
+		.procname	= "ip_no_pmtu_disc",
+		.data		= &ipv4_config.no_pmtu_disc,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ip_nonlocal_bind",
+		.data		= &sysctl_ip_nonlocal_bind,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_syn_retries",
+		.data		= &sysctl_tcp_syn_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_synack_retries",
+		.data		= &sysctl_tcp_synack_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_max_orphans",
+		.data		= &sysctl_tcp_max_orphans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_max_tw_buckets",
+		.data		= &tcp_death_row.sysctl_max_tw_buckets,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ip_dynaddr",
+		.data		= &sysctl_ip_dynaddr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_keepalive_time",
+		.data		= &sysctl_tcp_keepalive_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_keepalive_probes",
+		.data		= &sysctl_tcp_keepalive_probes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_keepalive_intvl",
+		.data		= &sysctl_tcp_keepalive_intvl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_retries1",
+		.data		= &sysctl_tcp_retries1,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra2		= &tcp_retr1_max
+	},
+	{
+		.procname	= "tcp_retries2",
+		.data		= &sysctl_tcp_retries2,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_fin_timeout",
+		.data		= &sysctl_tcp_fin_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+#ifdef CONFIG_SYN_COOKIES
+	{
+		.procname	= "tcp_syncookies",
+		.data		= &sysctl_tcp_syncookies,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
+	{
+		.procname	= "tcp_tw_recycle",
+		.data		= &tcp_death_row.sysctl_tw_recycle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_abort_on_overflow",
+		.data		= &sysctl_tcp_abort_on_overflow,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_stdurg",
+		.data		= &sysctl_tcp_stdurg,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_rfc1337",
+		.data		= &sysctl_tcp_rfc1337,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_max_syn_backlog",
+		.data		= &sysctl_max_syn_backlog,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "ip_local_port_range",
+		.data		= &sysctl_local_ports.range,
+		.maxlen		= sizeof(sysctl_local_ports.range),
+		.mode		= 0644,
+		.proc_handler	= ipv4_local_port_range,
+	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
+	{
+		.procname	= "igmp_max_memberships",
+		.data		= &sysctl_igmp_max_memberships,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "igmp_max_msf",
+		.data		= &sysctl_igmp_max_msf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "inet_peer_threshold",
+		.data		= &inet_peer_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "inet_peer_minttl",
+		.data		= &inet_peer_minttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "inet_peer_maxttl",
+		.data		= &inet_peer_maxttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "inet_peer_gc_mintime",
+		.data		= &inet_peer_gc_mintime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "inet_peer_gc_maxtime",
+		.data		= &inet_peer_gc_maxtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "tcp_orphan_retries",
+		.data		= &sysctl_tcp_orphan_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_fack",
+		.data		= &sysctl_tcp_fack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_reordering",
+		.data		= &sysctl_tcp_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_ecn",
+		.data		= &sysctl_tcp_ecn,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_dsack",
+		.data		= &sysctl_tcp_dsack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_mem",
+		.data		= &sysctl_tcp_mem,
+		.maxlen		= sizeof(sysctl_tcp_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax
+	},
+	{
+		.procname	= "tcp_wmem",
+		.data		= &sysctl_tcp_wmem,
+		.maxlen		= sizeof(sysctl_tcp_wmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_rmem",
+		.data		= &sysctl_tcp_rmem,
+		.maxlen		= sizeof(sysctl_tcp_rmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_app_win",
+		.data		= &sysctl_tcp_app_win,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_adv_win_scale",
+		.data		= &sysctl_tcp_adv_win_scale,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &tcp_adv_win_scale_min,
+		.extra2		= &tcp_adv_win_scale_max,
+	},
+	{
+		.procname	= "tcp_tw_reuse",
+		.data		= &sysctl_tcp_tw_reuse,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_frto",
+		.data		= &sysctl_tcp_frto,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_frto_response",
+		.data		= &sysctl_tcp_frto_response,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_low_latency",
+		.data		= &sysctl_tcp_low_latency,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_no_metrics_save",
+		.data		= &sysctl_tcp_nometrics_save,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_moderate_rcvbuf",
+		.data		= &sysctl_tcp_moderate_rcvbuf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_tso_win_divisor",
+		.data		= &sysctl_tcp_tso_win_divisor,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_congestion_control",
+		.mode		= 0644,
+		.maxlen		= TCP_CA_NAME_MAX,
+		.proc_handler	= proc_tcp_congestion_control,
+	},
+	{
+		.procname	= "tcp_abc",
+		.data		= &sysctl_tcp_abc,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_mtu_probing",
+		.data		= &sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_base_mss",
+		.data		= &sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_workaround_signed_windows",
+		.data		= &sysctl_tcp_workaround_signed_windows,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#ifdef CONFIG_NET_DMA
+	{
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
+	{
+		.procname	= "tcp_slow_start_after_idle",
+		.data		= &sysctl_tcp_slow_start_after_idle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#ifdef CONFIG_NETLABEL
+	{
+		.procname	= "cipso_cache_enable",
+		.data		= &cipso_v4_cache_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cipso_cache_bucket_size",
+		.data		= &cipso_v4_cache_bucketsize,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cipso_rbm_optfmt",
+		.data		= &cipso_v4_rbm_optfmt,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "cipso_rbm_strictvalid",
+		.data		= &cipso_v4_rbm_strictvalid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NETLABEL */
+	{
+		.procname	= "tcp_available_congestion_control",
+		.maxlen		= TCP_CA_BUF_MAX,
+		.mode		= 0444,
+		.proc_handler   = proc_tcp_available_congestion_control,
+	},
+	{
+		.procname	= "tcp_allowed_congestion_control",
+		.maxlen		= TCP_CA_BUF_MAX,
+		.mode		= 0644,
+		.proc_handler   = proc_allowed_congestion_control,
+	},
+	{
+		.procname	= "tcp_max_ssthresh",
+		.data		= &sysctl_tcp_max_ssthresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_cookie_size",
+		.data		= &sysctl_tcp_cookie_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname       = "tcp_thin_linear_timeouts",
+		.data           = &sysctl_tcp_thin_linear_timeouts,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec
+	},
+        {
+		.procname       = "tcp_thin_dupack",
+		.data           = &sysctl_tcp_thin_dupack,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec
+	},
+	{
+		.procname	= "udp_mem",
+		.data		= &sysctl_udp_mem,
+		.maxlen		= sizeof(sysctl_udp_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "udp_rmem_min",
+		.data		= &sysctl_udp_rmem_min,
+		.maxlen		= sizeof(sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero
+	},
+	{
+		.procname	= "udp_wmem_min",
+		.data		= &sysctl_udp_wmem_min,
+		.maxlen		= sizeof(sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero
+	},
+	{ }
+};
+
+static struct ctl_table ipv4_net_table[] = {	/* ipv4_sysctl_init_net() addresses these slots by index */
+	{	/* [0] */
+		.procname	= "icmp_echo_ignore_all",
+		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [1] */
+		.procname	= "icmp_echo_ignore_broadcasts",
+		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [2] */
+		.procname	= "icmp_ignore_bogus_error_responses",
+		.data		= &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [3] */
+		.procname	= "icmp_errors_use_inbound_ifaddr",
+		.data		= &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [4] */
+		.procname	= "icmp_ratelimit",
+		.data		= &init_net.ipv4.sysctl_icmp_ratelimit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{	/* [5] */
+		.procname	= "icmp_ratemask",
+		.data		= &init_net.ipv4.sysctl_icmp_ratemask,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [6] */
+		.procname	= "rt_cache_rebuild_count",
+		.data		= &init_net.ipv4.sysctl_rt_cache_rebuild_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [7] */
+		.procname	= "ping_group_range",
+		.data		= &init_net.ipv4.sysctl_ping_group_range,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_ping_group_range),
+		.mode		= 0644,
+		.proc_handler	= ipv4_ping_group_range,
+	},
+	{ }	/* empty terminating entry */
+};
+
+struct ctl_path net_ipv4_ctl_path[] = {	/* path ("net", "ipv4") passed to the sysctl registrations in this file */
+	{ .procname = "net", },
+	{ .procname = "ipv4", },
+	{ },	/* empty terminating entry */
+};
+EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
+
+static __net_init int ipv4_sysctl_init_net(struct net *net)	/* returns 0 or -ENOMEM */
+{
+	struct ctl_table *table;
+
+	table = ipv4_net_table;	/* init_net registers the static table as-is */
+	if (!net_eq(net, &init_net)) {	/* any other namespace gets a private copy */
+		table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
+		if (table == NULL)
+			goto err_alloc;
+
+		table[0].data =
+			&net->ipv4.sysctl_icmp_echo_ignore_all;
+		table[1].data =
+			&net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
+		table[2].data =
+			&net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
+		table[3].data =
+			&net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
+		table[4].data =
+			&net->ipv4.sysctl_icmp_ratelimit;
+		table[5].data =
+			&net->ipv4.sysctl_icmp_ratemask;
+		table[6].data =
+			&net->ipv4.sysctl_rt_cache_rebuild_count;
+		table[7].data =
+			&net->ipv4.sysctl_ping_group_range;
+
+	}
+
+	/*
+	 * Sane defaults - the range {1, 0} means nobody may create ping
+	 * sockets. Boot scripts should set this to a distro-specific group.
+	 */
+	net->ipv4.sysctl_ping_group_range[0] = 1;
+	net->ipv4.sysctl_ping_group_range[1] = 0;
+
+	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
+
+	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
+			net_ipv4_ctl_path, table);
+	if (net->ipv4.ipv4_hdr == NULL)
+		goto err_reg;
+
+	return 0;
+
+err_reg:
+	if (!net_eq(net, &init_net))	/* only the kmemdup()ed copy may be freed */
+		kfree(table);
+err_alloc:
+	return -ENOMEM;
+}
+
+static __net_exit void ipv4_sysctl_exit_net(struct net *net)
+{
+	struct ctl_table *table = net->ipv4.ipv4_hdr->ctl_table_arg; /* fetch before the header goes away */
+
+	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
+	if (!net_eq(net, &init_net))	/* init_net registered the static ipv4_net_table: never kfree() it */
+		kfree(table);
+}
+
+static __net_initdata struct pernet_operations ipv4_sysctl_ops = {	/* handed to register_pernet_subsys() */
+	.init = ipv4_sysctl_init_net,
+	.exit = ipv4_sysctl_exit_net,
+};
+
+static __init int sysctl_ipv4_init(void)
+{
+	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {	/* walk up to the empty terminator */
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;	/* .data of this entry is filled in here, at init time */
+			break;
+		}
+	}
+	if (!i->procname)	/* reached the terminator: the entry is missing */
+		return -EINVAL;
+
+	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
+	if (hdr == NULL)
+		return -ENOMEM;
+
+	if (register_pernet_subsys(&ipv4_sysctl_ops)) {
+		unregister_sysctl_table(hdr);	/* undo the registration above */
+		return -ENOMEM;	/* NOTE(review): the code returned by register_pernet_subsys() is discarded */
+	}
+
+	return 0;
+}
+
+__initcall(sysctl_ipv4_init);
diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c
new file mode 100644
index 00000000..0cbbf100
--- /dev/null
+++ b/net/ipv4/sysfs_net_ipv4.c
@@ -0,0 +1,88 @@
+/*
+ * net/ipv4/sysfs_net_ipv4.c
+ *
+ * sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <net/tcp.h>
+
+#define CREATE_IPV4_FILE(_name, _var) /* emits show/store handlers and a 0644 attribute for int _var */ \
+static ssize_t _name##_show(struct kobject *kobj, \
+			    struct kobj_attribute *attr, char *buf) \
+{ \
+	return sprintf(buf, "%d\n", _var); \
+} \
+static ssize_t _name##_store(struct kobject *kobj, \
+			     struct kobj_attribute *attr, \
+			     const char *buf, size_t count) \
+{ \
+	int val, ret; \
+	ret = sscanf(buf, "%d", &val); \
+	if (ret != 1) /* no integer could be parsed */ \
+		return -EINVAL; \
+	if (val < 0) /* negative values are refused */ \
+		return -EINVAL; \
+	_var = val; \
+	return count; /* report the whole write as consumed */ \
+} \
+static struct kobj_attribute _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]);	/* one attribute per element of sysctl_tcp_wmem[] */
+CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]);
+CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]);
+
+CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]);	/* one attribute per element of sysctl_tcp_rmem[] */
+CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]);
+CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]);
+
+static struct attribute *ipv4_attrs[] = {	/* every attribute generated by CREATE_IPV4_FILE above */
+	&tcp_wmem_min_attr.attr,
+	&tcp_wmem_def_attr.attr,
+	&tcp_wmem_max_attr.attr,
+	&tcp_rmem_min_attr.attr,
+	&tcp_rmem_def_attr.attr,
+	&tcp_rmem_max_attr.attr,
+	NULL	/* list terminator */
+};
+
+static struct attribute_group ipv4_attr_group = {	/* passed to sysfs_create_group() in sysfs_ipv4_init() */
+	.attrs = ipv4_attrs,
+};
+
+static __init int sysfs_ipv4_init(void)	/* returns 0, -ENOMEM, or the sysfs_create_group() error */
+{
+	struct kobject *ipv4_kobject;
+	int ret;
+
+	ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj);	/* kobject "ipv4" with kernel_kobj as parent */
+	if (!ipv4_kobject)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group);
+	if (ret) {
+		kobject_put(ipv4_kobject);	/* drop the kobject created above */
+		return ret;
+	}
+
+	return 0;	/* on success the kobject is kept; nothing here ever puts it */
+}
+
+subsys_initcall(sysfs_ipv4_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 00000000..30eda52c
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,3431 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ *		Alan Cox	:	Numerous verify_area() calls
+ *		Alan Cox	:	Set the ACK bit on a reset
+ *		Alan Cox	:	Stopped it crashing if it closed while
+ *					sk->inuse=1 and was trying to connect
+ *					(tcp_err()).
+ *		Alan Cox	:	All icmp error handling was broken
+ *					pointers passed where wrong and the
+ *					socket was looked up backwards. Nobody
+ *					tested any icmp error code obviously.
+ *		Alan Cox	:	tcp_err() now handled properly. It
+ *					wakes people on errors. poll
+ *					behaves and the icmp error race
+ *					has gone by moving it into sock.c
+ *		Alan Cox	:	tcp_send_reset() fixed to work for
+ *					everything not just packets for
+ *					unknown sockets.
+ *		Alan Cox	:	tcp option processing.
+ *		Alan Cox	:	Reset tweaked (still not 100%) [Had
+ *					syn rule wrong]
+ *		Herp Rosmanith  :	More reset fixes
+ *		Alan Cox	:	No longer acks invalid rst frames.
+ *					Acking any kind of RST is right out.
+ *		Alan Cox	:	Sets an ignore me flag on an rst
+ *					receive otherwise odd bits of prattle
+ *					escape still
+ *		Alan Cox	:	Fixed another acking RST frame bug.
+ *					Should stop LAN workplace lockups.
+ *		Alan Cox	: 	Some tidyups using the new skb list
+ *					facilities
+ *		Alan Cox	:	sk->keepopen now seems to work
+ *		Alan Cox	:	Pulls options out correctly on accepts
+ *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
+ *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
+ *					bit to skb ops.
+ *		Alan Cox	:	Tidied tcp_data to avoid a potential
+ *					nasty.
+ *		Alan Cox	:	Added some better commenting, as the
+ *					tcp is hard to follow
+ *		Alan Cox	:	Removed incorrect check for 20 * psh
+ *	Michael O'Reilly	:	ack < copied bug fix.
+ *	Johannes Stille		:	Misc tcp fixes (not all in yet).
+ *		Alan Cox	:	FIN with no memory -> CRASH
+ *		Alan Cox	:	Added socket option proto entries.
+ *					Also added awareness of them to accept.
+ *		Alan Cox	:	Added TCP options (SOL_TCP)
+ *		Alan Cox	:	Switched wakeup calls to callbacks,
+ *					so the kernel can layer network
+ *					sockets.
+ *		Alan Cox	:	Use ip_tos/ip_ttl settings.
+ *		Alan Cox	:	Handle FIN (more) properly (we hope).
+ *		Alan Cox	:	RST frames sent on unsynchronised
+ *					state ack error.
+ *		Alan Cox	:	Put in missing check for SYN bit.
+ *		Alan Cox	:	Added tcp_select_window() aka NET2E
+ *					window non shrink trick.
+ *		Alan Cox	:	Added a couple of small NET2E timer
+ *					fixes
+ *		Charles Hedrick :	TCP fixes
+ *		Toomas Tamm	:	TCP window fixes
+ *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
+ *		Charles Hedrick	:	Rewrote most of it to actually work
+ *		Linus		:	Rewrote tcp_read() and URG handling
+ *					completely
+ *		Gerhard Koerting:	Fixed some missing timer handling
+ *		Matthew Dillon  :	Reworked TCP machine states as per RFC
+ *		Gerhard Koerting:	PC/TCP workarounds
+ *		Adam Caldwell	:	Assorted timer/timing errors
+ *		Matthew Dillon	:	Fixed another RST bug
+ *		Alan Cox	:	Move to kernel side addressing changes.
+ *		Alan Cox	:	Beginning work on TCP fastpathing
+ *					(not yet usable)
+ *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
+ *		Alan Cox	:	TCP fast path debugging
+ *		Alan Cox	:	Window clamping
+ *		Michael Riepe	:	Bug in tcp_check()
+ *		Matt Dillon	:	More TCP improvements and RST bug fixes
+ *		Matt Dillon	:	Yet more small nasties removed from the
+ *					TCP code (Be very nice to this man if
+ *					tcp finally works 100%) 8)
+ *		Alan Cox	:	BSD accept semantics.
+ *		Alan Cox	:	Reset on closedown bug.
+ *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
+ *		Michael Pall	:	Handle poll() after URG properly in
+ *					all cases.
+ *		Michael Pall	:	Undo the last fix in tcp_read_urg()
+ *					(multi URG PUSH broke rlogin).
+ *		Michael Pall	:	Fix the multi URG PUSH problem in
+ *					tcp_readable(), poll() after URG
+ *					works now.
+ *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
+ *					BSD api.
+ *		Alan Cox	:	Changed the semantics of sk->socket to
+ *					fix a race and a signal problem with
+ *					accept() and async I/O.
+ *		Alan Cox	:	Relaxed the rules on tcp_sendto().
+ *		Yury Shevchuk	:	Really fixed accept() blocking problem.
+ *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
+ *					clients/servers which listen in on
+ *					fixed ports.
+ *		Alan Cox	:	Cleaned the above up and shrank it to
+ *					a sensible code size.
+ *		Alan Cox	:	Self connect lockup fix.
+ *		Alan Cox	:	No connect to multicast.
+ *		Ross Biro	:	Close unaccepted children on master
+ *					socket close.
+ *		Alan Cox	:	Reset tracing code.
+ *		Alan Cox	:	Spurious resets on shutdown.
+ *		Alan Cox	:	Giant 15 minute/60 second timer error
+ *		Alan Cox	:	Small whoops in polling before an
+ *					accept.
+ *		Alan Cox	:	Kept the state trace facility since
+ *					it's handy for debugging.
+ *		Alan Cox	:	More reset handler fixes.
+ *		Alan Cox	:	Started rewriting the code based on
+ *					the RFC's for other useful protocol
+ *					references see: Comer, KA9Q NOS, and
+ *					for a reference on the difference
+ *					between specifications and how BSD
+ *					works see the 4.4lite source.
+ *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
+ *					close.
+ *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
+ *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
+ *		Alan Cox	:	Reimplemented timers as per the RFC
+ *					and using multiple timers for sanity.
+ *		Alan Cox	:	Small bug fixes, and a lot of new
+ *					comments.
+ *		Alan Cox	:	Fixed dual reader crash by locking
+ *					the buffers (much like datagram.c)
+ *		Alan Cox	:	Fixed stuck sockets in probe. A probe
+ *					now gets fed up of retrying without
+ *					(even a no space) answer.
+ *		Alan Cox	:	Extracted closing code better
+ *		Alan Cox	:	Fixed the closing state machine to
+ *					resemble the RFC.
+ *		Alan Cox	:	More 'per spec' fixes.
+ *		Jorge Cwik	:	Even faster checksumming.
+ *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
+ *					only frames. At least one pc tcp stack
+ *					generates them.
+ *		Alan Cox	:	Cache last socket.
+ *		Alan Cox	:	Per route irtt.
+ *		Matt Day	:	poll()->select() match BSD precisely on error
+ *		Alan Cox	:	New buffers
+ *		Marc Tamsky	:	Various sk->prot->retransmits and
+ *					sk->retransmits misupdating fixed.
+ *					Fixed tcp_write_timeout: stuck close,
+ *					and TCP syn retries gets used now.
+ *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
+ *					ack if state is TCP_CLOSED.
+ *		Alan Cox	:	Look up device on a retransmit - routes may
+ *					change. Doesn't yet cope with MSS shrink right
+ *					but it's a start!
+ *		Marc Tamsky	:	Closing in closing fixes.
+ *		Mike Shaver	:	RFC1122 verifications.
+ *		Alan Cox	:	rcv_saddr errors.
+ *		Alan Cox	:	Block double connect().
+ *		Alan Cox	:	Small hooks for enSKIP.
+ *		Alexey Kuznetsov:	Path MTU discovery.
+ *		Alan Cox	:	Support soft errors.
+ *		Alan Cox	:	Fix MTU discovery pathological case
+ *					when the remote claims no mtu!
+ *		Marc Tamsky	:	TCP_CLOSE fix.
+ *		Colin (G3TNE)	:	Send a reset on syn ack replies in
+ *					window but wrong (fixes NT lpd problems)
+ *		Pedro Roque	:	Better TCP window handling, delayed ack.
+ *		Joerg Reuter	:	No modification of locked buffers in
+ *					tcp_do_retransmit()
+ *		Eric Schenk	:	Changed receiver side silly window
+ *					avoidance algorithm to BSD style
+ *					algorithm. This doubles throughput
+ *					against machines running Solaris,
+ *					and seems to result in general
+ *					improvement.
+ *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *	Mike McLagan		:	Routing by source
+ *		Keith Owens	:	Do proper merging with partial SKB's in
+ *					tcp_do_sendmsg to avoid burstiness.
+ *		Eric Schenk	:	Fix fast close down bug with
+ *					shutdown() followed by close().
+ *		Andi Kleen 	:	Make poll agree with SIGIO
+ *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
+ *					lingertime == 0 (RFC 793 ABORT Call)
+ *	Hirokazu Takahashi	:	Use copy_from_user() instead of
+ *					csum_and_copy_from_user() if possible.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Description of States:
+ *
+ *	TCP_SYN_SENT		sent a connection request, waiting for ack
+ *
+ *	TCP_SYN_RECV		received a connection request, sent ack,
+ *				waiting for final ack in three-way handshake.
+ *
+ *	TCP_ESTABLISHED		connection established
+ *
+ *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
+ *				transmission of remaining buffered data
+ *
+ *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
+ *				to shutdown
+ *
+ *	TCP_CLOSING		both sides have shutdown but we still have
+ *				data we have to finish sending
+ *
+ *	TCP_TIME_WAIT		timeout to catch resent junk before entering
+ *				closed, can only be entered from FIN_WAIT2
+ *				or CLOSING.  Required because the other end
+ *				may not have gotten our last ACK causing it
+ *				to retransmit the data packet (which we ignore)
+ *
+ *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
+ *				us to finish writing our data and to shutdown
+ *				(we have to close() to move on to LAST_ACK)
+ *
+ *	TCP_LAST_ACK		our side has shutdown after remote has
+ *				shutdown.  There may still be data in our
+ *				buffer that we have to finish sending
+ *
+ *	TCP_CLOSE		socket is finished
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+#include <linux/splice.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/random.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/uid_stat.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#include <net/netdma.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+
+struct percpu_counter tcp_orphan_count;
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
+long sysctl_tcp_mem[3] __read_mostly;	/* sysctl "tcp_mem"; long[] because it is handled by proc_doulongvec_minmax */
+int sysctl_tcp_wmem[3] __read_mostly;	/* sysctl "tcp_wmem" (proc_dointvec) */
+int sysctl_tcp_rmem[3] __read_mostly;	/* sysctl "tcp_rmem" (proc_dointvec) */
+
+EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_SYMBOL(sysctl_tcp_rmem);
+EXPORT_SYMBOL(sysctl_tcp_wmem);
+
+atomic_long_t tcp_memory_allocated;	/* Currently allocated memory. */
+EXPORT_SYMBOL(tcp_memory_allocated);
+
+/*
+ * Current number of TCP sockets.
+ */
+struct percpu_counter tcp_sockets_allocated;
+EXPORT_SYMBOL(tcp_sockets_allocated);
+
+/*
+ * TCP splice context, carried through read_descriptor_t.arg.data
+ */
+struct tcp_splice_state {
+	struct pipe_inode_info *pipe;	/* pipe handed to skb_splice_bits() */
+	size_t len;			/* bytes still to be spliced */
+	unsigned int flags;		/* splice flags handed to skb_splice_bits() */
+};
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts without atomic ops.
+ * Everything around __sk_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+int tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+	if (!tcp_memory_pressure) {	/* act only on the 0 -> 1 transition */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+		tcp_memory_pressure = 1;	/* plain store, no atomics (see the flag's comment) */
+	}
+}
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+/* Convert seconds to a retransmit count (at most 255) from initial and max timeout */
+static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
+{
+	int elapsed = timeout;	/* time covered by the retransmits counted so far */
+	u8 count = 1;
+
+	if (seconds <= 0)
+		return 0;
+
+	for (; seconds > elapsed && count < 255; count++) {
+		/* each further retransmit doubles the timeout, clamped to rto_max */
+		timeout <<= 1;
+		if (timeout > rto_max)
+			timeout = rto_max;
+		elapsed += timeout;
+	}
+
+	return count;
+}
+
+/* Sum the first @retrans timeouts, doubling from @timeout and clamping at @rto_max */
+static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
+{
+	int total = timeout;
+
+	if (!retrans)
+		return 0;
+
+	for (retrans--; retrans; retrans--) {
+		timeout <<= 1;
+		if (timeout > rto_max)
+			timeout = rto_max;
+		total += timeout;
+	}
+	return total;
+}
+
+/*
+ *	Wait for a TCP event: build and return the poll mask for the socket.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ */
+unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	unsigned int mask;
+	struct sock *sk = sock->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	if (sk->sk_state == TCP_LISTEN)
+		return inet_csk_listen_poll(sk);	/* listening sockets are handled separately */
+
+	/* Socket is not locked. We are protected from async events
+	 * by poll logic and correct handling of state changes
+	 * made by other threads is impossible in any case.
+	 */
+
+	mask = 0;
+
+	/*
+	 * POLLHUP is certainly not done right. But poll() doesn't
+	 * have a notion of HUP in just one direction, and for a
+	 * socket the read side is more interesting.
+	 *
+	 * Some poll() documentation says that POLLHUP is incompatible
+	 * with the POLLOUT/POLLWR flags, so somebody should check this
+	 * all. But careful, it tends to be safer to return too many
+	 * bits than too few, and you can easily break real applications
+	 * if you don't tell them that something has hung up!
+	 *
+	 * Check-me.
+	 *
+	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+	 * our fs/select.c). It means that after we received EOF,
+	 * poll always returns immediately, making impossible poll() on write()
+	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+	 * if and only if shutdown has been made in both directions.
+	 * Actually, it is interesting to look how Solaris and DUX
+	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
+	 * then we could set it on SND_SHUTDOWN. BTW examples given
+	 * in Stevens' books assume exactly this behaviour, it explains
+	 * why POLLHUP is incompatible with POLLOUT.	--ANK
+	 *
+	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+	 * blocking on fresh not-connected or disconnected socket. --ANK
+	 */
+	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+		mask |= POLLHUP;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+
+	/* Connected? (any state other than SYN_SENT / SYN_RECV) */
+	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		int target = sock_rcvlowat(sk, 0, INT_MAX);
+
+		if (tp->urg_seq == tp->copied_seq &&
+		    !sock_flag(sk, SOCK_URGINLINE) &&
+		    tp->urg_data)
+			target++;	/* NOTE(review): presumably discounts the pending out-of-band byte - confirm */
+
+		/* Potential race condition. If the read of tp below
+		 * escapes above sk->sk_state, we can be illegally awakened
+		 * in SYN_* states. */
+		if (tp->rcv_nxt - tp->copied_seq >= target)
+			mask |= POLLIN | POLLRDNORM;
+
+		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+				mask |= POLLOUT | POLLWRNORM;
+			} else {  /* send SIGIO later */
+				set_bit(SOCK_ASYNC_NOSPACE,
+					&sk->sk_socket->flags);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+				/* Race breaker. If space is freed after
+				 * wspace test but before the flags are set,
+				 * IO signal will be lost.
+				 */
+				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+					mask |= POLLOUT | POLLWRNORM;
+			}
+		} else	/* send side already shut down: always report writable */
+			mask |= POLLOUT | POLLWRNORM;
+
+		if (tp->urg_data & TCP_URG_VALID)
+			mask |= POLLPRI;
+	}
+	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	smp_rmb();
+	if (sk->sk_err)
+		mask |= POLLERR;
+
+	return mask;
+}
+EXPORT_SYMBOL(tcp_poll);
+
+int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)	/* writes an int answer to the user pointer in arg */
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int answ;
+
+	switch (cmd) {
+	case SIOCINQ:	/* unread byte count; this is the only case that takes the socket lock */
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		lock_sock(sk);
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else if (sock_flag(sk, SOCK_URGINLINE) ||
+			 !tp->urg_data ||
+			 before(tp->urg_seq, tp->copied_seq) ||
+			 !before(tp->urg_seq, tp->rcv_nxt)) {
+			struct sk_buff *skb;
+
+			answ = tp->rcv_nxt - tp->copied_seq;
+
+			/* Subtract 1 if the last queued skb carries a FIN. */
+			skb = skb_peek_tail(&sk->sk_receive_queue);
+			if (answ && skb)
+				answ -= tcp_hdr(skb)->fin;
+		} else	/* urgent mark lies inside the unread range: count only up to it */
+			answ = tp->urg_seq - tp->copied_seq;
+		release_sock(sk);
+		break;
+	case SIOCATMARK:	/* non-zero when the next byte to copy is the urgent mark */
+		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+		break;
+	case SIOCOUTQ:	/* queued bytes not yet acknowledged (write_seq - snd_una) */
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else
+			answ = tp->write_seq - tp->snd_una;
+		break;
+	case SIOCOUTQNSD:	/* queued bytes not yet sent (write_seq - snd_nxt) */
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else
+			answ = tp->write_seq - tp->snd_nxt;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return put_user(answ, (int __user *)arg);
+}
+EXPORT_SYMBOL(tcp_ioctl);
+
+static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;	/* set PSH on this skb */
+	tp->pushed_seq = tp->write_seq;	/* record where the push mark was placed */
+}
+
+static inline int forced_push(struct tcp_sock *tp)
+{
+	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));	/* over half of max_window written since pushed_seq */
+}
+
+static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	skb->csum    = 0;
+	tcb->seq     = tcb->end_seq = tp->write_seq;	/* zero-length range at the current write position */
+	tcb->flags   = TCPHDR_ACK;
+	tcb->sacked  = 0;
+	skb_header_release(skb);
+	tcp_add_write_queue_tail(sk, skb);
+	sk->sk_wmem_queued += skb->truesize;	/* account the skb against the socket */
+	sk_mem_charge(sk, skb->truesize);
+	if (tp->nonagle & TCP_NAGLE_PUSH)	/* clear the flag only when it is set */
+		tp->nonagle &= ~TCP_NAGLE_PUSH;
+}
+
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
+{
+	if (flags & MSG_OOB)	/* only out-of-band sends move snd_up */
+		tp->snd_up = tp->write_seq;
+}
+
+static inline void tcp_push(struct sock *sk, int flags, int mss_now,
+			    int nonagle)
+{
+	if (tcp_send_head(sk)) {	/* does nothing when tcp_send_head() returns NULL */
+		struct tcp_sock *tp = tcp_sk(sk);
+
+		if (!(flags & MSG_MORE) || forced_push(tp))	/* mark PSH on the queue tail */
+			tcp_mark_push(tp, tcp_write_queue_tail(sk));
+
+		tcp_mark_urg(tp, flags);
+		__tcp_push_pending_frames(sk, mss_now,	/* MSG_MORE replaces nonagle with TCP_NAGLE_CORK */
+					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+	}
+}
+
+static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+				unsigned int offset, size_t len)	/* callback handed to tcp_read_sock() */
+{
+	struct tcp_splice_state *tss = rd_desc->arg.data;
+	int ret;
+
+	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),	/* capped at the remaining budget */
+			      tss->flags);
+	if (ret > 0)
+		rd_desc->count -= ret;	/* shrink the budget by what was spliced */
+	return ret;
+}
+
+static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+	/* Pass the splice context and byte budget via read_descriptor_t. */
+	read_descriptor_t rd_desc = {
+		.arg.data = tss,
+		.count	  = tss->len,
+	};
+
+	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
+/**
+ *  tcp_splice_read - splice data from TCP socket to a pipe
+ * @sock:	socket to splice from
+ * @ppos:	position (not valid); a non-zero value yields -ESPIPE
+ * @pipe:	pipe to splice to
+ * @len:	maximum number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will read pages from given socket and fill them into a pipe. Returns the
+ *    number of bytes spliced if any, else 0 or the last negative error code.
+ **/
+ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
+			struct pipe_inode_info *pipe, size_t len,
+			unsigned int flags)
+{
+	struct sock *sk = sock->sk;
+	struct tcp_splice_state tss = {
+		.pipe = pipe,
+		.len = len,
+		.flags = flags,
+	};
+	long timeo;
+	ssize_t spliced;
+	int ret;
+
+	sock_rps_record_flow(sk);
+	/*
+	 * We can't seek on a socket input
+	 */
+	if (unlikely(*ppos))
+		return -ESPIPE;
+
+	ret = spliced = 0;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+	while (tss.len) {
+		ret = __tcp_splice_read(sk, &tss);
+		if (ret < 0)
+			break;
+		else if (!ret) {	/* nothing spliced this round */
+			if (spliced)	/* earlier rounds moved data: return that */
+				break;
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+			if (sk->sk_err) {
+				ret = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+			if (sk->sk_state == TCP_CLOSE) {
+				/*
+				 * This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				if (!sock_flag(sk, SOCK_DONE))
+					ret = -ENOTCONN;
+				break;
+			}
+			if (!timeo) {	/* zero timeout: do not wait */
+				ret = -EAGAIN;
+				break;
+			}
+			sk_wait_data(sk, &timeo);
+			if (signal_pending(current)) {
+				ret = sock_intr_errno(timeo);
+				break;
+			}
+			continue;
+		}
+		tss.len -= ret;
+		spliced += ret;
+
+		if (!timeo)
+			break;
+		release_sock(sk);	/* drop and immediately retake the socket lock */
+		lock_sock(sk);
+
+		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current))
+			break;
+	}
+
+	release_sock(sk);
+
+	if (spliced)	/* data moved takes precedence over a later error */
+		return spliced;
+
+	return ret;
+}
+EXPORT_SYMBOL(tcp_splice_read);
+
+/* Allocate an skb for the send queue with @size bytes of payload room
+ * (rounded up to a multiple of four) plus the protocol's max_header.
+ *
+ * Returns the skb, or NULL when either the allocation itself failed or
+ * sk_wmem_schedule() refused to account the skb's truesize to @sk.
+ */
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+{
+	struct sk_buff *skb;
+
+	/* The TCP header must be at least 32-bit aligned.  */
+	size = ALIGN(size, 4);
+
+	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+	if (!skb) {
+		/* Allocation failed: report memory pressure and call the
+		 * send-buffer moderation helper.
+		 */
+		sk->sk_prot->enter_memory_pressure(sk);
+		sk_stream_moderate_sndbuf(sk);
+		return NULL;
+	}
+
+	if (!sk_wmem_schedule(sk, skb->truesize)) {
+		__kfree_skb(skb);
+		return NULL;
+	}
+
+	/*
+	 * Reserve everything but @size bytes, so the caller sees exactly
+	 * size bytes of tailroom, no more, no less.
+	 */
+	skb_reserve(skb, skb_tailroom(skb) - size);
+	return skb;
+}
+
+/* Compute how many payload bytes we would like to place in one skb.
+ *
+ * Without GSO (or when @large_allowed is zero) the goal is @mss_now.
+ * Otherwise it is derived from sk_gso_max_size minus the header lengths,
+ * passed through tcp_bound_to_half_wnd(), and rounded down to a whole
+ * number of @mss_now segments.  The segment count is cached in
+ * tp->xmit_size_goal_segs so the division is only redone when the cached
+ * value no longer fits.  The result is never below @mss_now.
+ */
+static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+				       int large_allowed)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 goal = mss_now;
+	u32 cached_goal;
+
+	if (!large_allowed || !sk_can_gso(sk))
+		return max(goal, mss_now);
+
+	goal = sk->sk_gso_max_size - 1;
+	goal -= inet_csk(sk)->icsk_af_ops->net_header_len;
+	goal -= inet_csk(sk)->icsk_ext_hdr_len;
+	goal -= tp->tcp_header_len;
+
+	goal = tcp_bound_to_half_wnd(tp, goal);
+
+	/* We try hard to avoid divides here */
+	cached_goal = tp->xmit_size_goal_segs * mss_now;
+
+	if (likely(cached_goal <= goal && cached_goal + mss_now > goal)) {
+		/* Cached segment count is still the right floor. */
+		goal = cached_goal;
+	} else {
+		tp->xmit_size_goal_segs = goal / mss_now;
+		goal = tp->xmit_size_goal_segs * mss_now;
+	}
+
+	return max(goal, mss_now);
+}
+
+/* Return the current MSS and store the matching per-skb size goal in
+ * *@size_goal.  Large (multi-segment) goals are disallowed for MSG_OOB.
+ */
+static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+{
+	int mss_now = tcp_current_mss(sk);
+	int large_allowed = !(flags & MSG_OOB);
+
+	*size_goal = tcp_xmit_size_goal(sk, mss_now, large_allowed);
+	return mss_now;
+}
+
+/* Queue @psize bytes, starting @poffset bytes into the @pages array, on
+ * the socket's write queue.  The pages are attached to skbs as page
+ * fragments (a reference is taken with get_page()); the data is not copied.
+ *
+ * tcp_sendpage() calls this with the socket locked.
+ *
+ * Returns the number of bytes queued if any were, otherwise the value of
+ * sk_stream_error() for the error that stopped us.
+ */
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+			 size_t psize, int flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now, size_goal;
+	int err;
+	ssize_t copied;
+	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_send_mss(sk, &size_goal, flags);
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto out_err;
+
+	while (psize > 0) {
+		struct sk_buff *skb = tcp_write_queue_tail(sk);
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		/* Never cross a page boundary in one iteration. */
+		int size = min_t(size_t, psize, PAGE_SIZE - offset);
+
+		/* Start a new skb when nothing is pending or the tail skb
+		 * has already reached size_goal.
+		 */
+		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, skb);
+			copy = size_goal;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		/* All fragment slots used and no merge possible: new skb. */
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (!sk_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+
+		if (can_coalesce) {
+			/* Extend the last fragment in place. */
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		/* Account the new bytes on the skb, the socket and the
+		 * sequence space.
+		 */
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk_mem_charge(sk, copy);
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(psize -= copy))
+			goto out;
+
+		/* Keep filling this skb until it reaches size_goal. */
+		if (skb->len < size_goal || (flags & MSG_OOB))
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == tcp_send_head(sk))
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		/* Push what is queued (ignoring MSG_MORE) before waiting. */
+		tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+			goto do_error;
+
+		/* Recompute MSS and size goal after the wait. */
+		mss_now = tcp_send_mss(sk, &size_goal, flags);
+	}
+
+out:
+	/* Skip the final push when the caller flagged more pages to come. */
+	if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
+		tcp_push(sk, flags, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	/* A partial transfer is reported as success. */
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, flags, err);
+}
+
+/* sendpage entry point for TCP.
+ *
+ * When the route supports both scatter-gather and checksum offload the
+ * page is queued by reference through do_tcp_sendpages() under the socket
+ * lock.  Otherwise the request is delegated to sock_no_sendpage().
+ */
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+		 size_t size, int flags)
+{
+	ssize_t res;
+
+	if ((sk->sk_route_caps & NETIF_F_SG) &&
+	    (sk->sk_route_caps & NETIF_F_ALL_CSUM)) {
+		lock_sock(sk);
+		res = do_tcp_sendpages(sk, &page, offset, size, flags);
+		release_sock(sk);
+		return res;
+	}
+
+	return sock_no_sendpage(sk->sk_socket, page, offset, size, flags);
+}
+EXPORT_SYMBOL(tcp_sendpage);
+
+/* Shorthand for the socket's sk_sndmsg_page and sk_sndmsg_off fields:
+ * tcp_sendmsg() uses them as its current partially filled page and the
+ * fill offset inside that page.
+ */
+#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+
+/* Choose the linear (head) size for a new skb allocated by tcp_sendmsg().
+ *
+ * Without scatter-gather (@sg == 0) the head is the cached MSS.  With
+ * scatter-gather and GSO the head is empty.  With scatter-gather only,
+ * the head is capped at SKB_MAX_HEAD(MAX_TCP_HEADER) when the MSS lies
+ * between that cap and the cap plus (MAX_SKB_FRAGS - 1) pages.
+ */
+static inline int select_size(struct sock *sk, int sg)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int head = tp->mss_cache;
+	int pgbreak;
+
+	if (!sg)
+		return head;
+
+	if (sk_can_gso(sk))
+		return 0;
+
+	pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+	if (head >= pgbreak &&
+	    head <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+		head = pgbreak;
+
+	return head;
+}
+
+/* Copy the user data described by @msg's iovec onto the socket's write
+ * queue, appending to the tail skb where possible and allocating new skbs
+ * otherwise, and push segments out as they fill up.
+ *
+ * The socket is locked on entry and released before returning.  @iocb and
+ * @size are not referenced by this function; the amount sent is taken from
+ * the iovec lengths.
+ *
+ * Returns the number of bytes queued if any were, otherwise the value of
+ * sk_stream_error() for the error that stopped us.
+ */
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t size)
+{
+	struct iovec *iov;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int iovlen, flags;
+	int mss_now, size_goal;
+	int sg, err, copied;
+	long timeo;
+
+	lock_sock(sk);
+
+	flags = msg->msg_flags;
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_err;
+
+	/* This should be in poll */
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_send_mss(sk, &size_goal, flags);
+
+	/* Ok commence sending. */
+	iovlen = msg->msg_iovlen;
+	iov = msg->msg_iov;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto out_err;
+
+	/* Non-zero when the route supports scatter-gather. */
+	sg = sk->sk_route_caps & NETIF_F_SG;
+
+	/* Outer loop: one iovec segment per iteration. */
+	while (--iovlen >= 0) {
+		size_t seglen = iov->iov_len;
+		unsigned char __user *from = iov->iov_base;
+
+		iov++;
+
+		/* Inner loop: one copy (into skb head or a page) per pass. */
+		while (seglen > 0) {
+			int copy = 0;
+			int max = size_goal;
+
+			skb = tcp_write_queue_tail(sk);
+			if (tcp_send_head(sk)) {
+				/* Without checksum offload on this skb,
+				 * fill it only up to one MSS.
+				 */
+				if (skb->ip_summed == CHECKSUM_NONE)
+					max = mss_now;
+				copy = max - skb->len;
+			}
+
+			if (copy <= 0) {
+new_segment:
+				/* Allocate new segment. If the interface is SG,
+				 * allocate skb fitting to single page.
+				 */
+				if (!sk_stream_memory_free(sk))
+					goto wait_for_sndbuf;
+
+				skb = sk_stream_alloc_skb(sk,
+							  select_size(sk, sg),
+							  sk->sk_allocation);
+				if (!skb)
+					goto wait_for_memory;
+
+				/*
+				 * Check whether we can use HW checksum.
+				 */
+				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+					skb->ip_summed = CHECKSUM_PARTIAL;
+
+				skb_entail(sk, skb);
+				copy = size_goal;
+				max = size_goal;
+			}
+
+			/* Try to append data to the end of skb. */
+			if (copy > seglen)
+				copy = seglen;
+
+			/* Where to copy to? */
+			if (skb_tailroom(skb) > 0) {
+				/* We have some space in skb head. Superb! */
+				if (copy > skb_tailroom(skb))
+					copy = skb_tailroom(skb);
+				err = skb_add_data_nocache(sk, skb, from, copy);
+				if (err)
+					goto do_fault;
+			} else {
+				int merge = 0;
+				int i = skb_shinfo(skb)->nr_frags;
+				struct page *page = TCP_PAGE(sk);
+				int off = TCP_OFF(sk);
+
+				if (skb_can_coalesce(skb, i, page, off) &&
+				    off != PAGE_SIZE) {
+					/* We can extend the last page
+					 * fragment. */
+					merge = 1;
+				} else if (i == MAX_SKB_FRAGS || !sg) {
+					/* Need to add new fragment and cannot
+					 * do this because interface is non-SG,
+					 * or because all the page slots are
+					 * busy. */
+					tcp_mark_push(tp, skb);
+					goto new_segment;
+				} else if (page) {
+					/* Cached page is full: drop the
+					 * socket's reference and start over.
+					 */
+					if (off == PAGE_SIZE) {
+						put_page(page);
+						TCP_PAGE(sk) = page = NULL;
+						off = 0;
+					}
+				} else
+					off = 0;
+
+				if (copy > PAGE_SIZE - off)
+					copy = PAGE_SIZE - off;
+
+				if (!sk_wmem_schedule(sk, copy))
+					goto wait_for_memory;
+
+				if (!page) {
+					/* Allocate new cache page. */
+					if (!(page = sk_stream_alloc_page(sk)))
+						goto wait_for_memory;
+				}
+
+				/* Time to copy data. We are close to
+				 * the end! */
+				err = skb_copy_to_page_nocache(sk, from, skb,
+							       page, off, copy);
+				if (err) {
+					/* If this page was new, give it to the
+					 * socket so it does not get leaked.
+					 */
+					if (!TCP_PAGE(sk)) {
+						TCP_PAGE(sk) = page;
+						TCP_OFF(sk) = 0;
+					}
+					goto do_error;
+				}
+
+				/* Update the skb. */
+				if (merge) {
+					skb_shinfo(skb)->frags[i - 1].size +=
+									copy;
+				} else {
+					skb_fill_page_desc(skb, i, page, off, copy);
+					/* Take a page reference for the skb
+					 * when the socket keeps the page
+					 * cached, or for the socket when a
+					 * new page still has room left.
+					 */
+					if (TCP_PAGE(sk)) {
+						get_page(page);
+					} else if (off + copy < PAGE_SIZE) {
+						get_page(page);
+						TCP_PAGE(sk) = page;
+					}
+				}
+
+				TCP_OFF(sk) = off + copy;
+			}
+
+			if (!copied)
+				TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
+
+			tp->write_seq += copy;
+			TCP_SKB_CB(skb)->end_seq += copy;
+			skb_shinfo(skb)->gso_segs = 0;
+
+			from += copy;
+			copied += copy;
+			/* Last byte of the last segment: finish up. */
+			if ((seglen -= copy) == 0 && iovlen == 0)
+				goto out;
+
+			if (skb->len < max || (flags & MSG_OOB))
+				continue;
+
+			if (forced_push(tp)) {
+				tcp_mark_push(tp, skb);
+				__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
+			} else if (skb == tcp_send_head(sk))
+				tcp_push_one(sk, mss_now);
+			continue;
+
+wait_for_sndbuf:
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+			/* Push what is queued (ignoring MSG_MORE) before
+			 * waiting for memory.
+			 */
+			if (copied)
+				tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+				goto do_error;
+
+			/* Recompute MSS and size goal after the wait. */
+			mss_now = tcp_send_mss(sk, &size_goal, flags);
+		}
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, flags, mss_now, tp->nonagle);
+	release_sock(sk);
+
+	if (copied > 0)
+		uid_stat_tcp_snd(current_uid(), copied);
+	return copied;
+
+do_fault:
+	/* The copy into the skb head failed; drop the skb if it is empty. */
+	if (!skb->len) {
+		tcp_unlink_write_queue(skb, sk);
+		/* It is the one place in all of TCP, except connection
+		 * reset, where we can be unlinking the send_head.
+		 */
+		tcp_check_send_head(sk, skb);
+		sk_wmem_free_skb(sk, skb);
+	}
+
+do_error:
+	/* A partial transfer is reported as success. */
+	if (copied)
+		goto out;
+out_err:
+	err = sk_stream_error(sk, flags, err);
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(tcp_sendmsg);
+
+/*
+ *	Read the single byte of urgent (out-of-band) data, BSD style: the
+ *	call never blocks and the error codes are deliberately odd 8)
+ *
+ *	Returns 1 when the urgent byte was delivered, @len when @len <= 0
+ *	(MSG_TRUNC is then set in msg->msg_flags), 0 on a closed or
+ *	receive-shutdown socket, or a negative errno: -EINVAL when there is
+ *	no urgent byte to read or it is delivered inline, -ENOTCONN,
+ *	-EFAULT, or -EAGAIN when the byte has not arrived yet.
+ */
+
+static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err = 0;
+	char c;
+
+	/* No URG data to read. */
+	if (sock_flag(sk, SOCK_URGINLINE))
+		return -EINVAL;	/* Yes this is right ! */
+	if (!tp->urg_data || tp->urg_data == TCP_URG_READ)
+		return -EINVAL;	/* Yes this is right ! */
+
+	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+		return -ENOTCONN;
+
+	if (!(tp->urg_data & TCP_URG_VALID)) {
+		if (sk->sk_state == TCP_CLOSE ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN))
+			return 0;
+
+		/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
+		 * the available implementations agree in this case:
+		 * this call should never block, independent of the
+		 * blocking state of the socket.
+		 * Mike <pall@rz.uni-karlsruhe.de>
+		 */
+		return -EAGAIN;
+	}
+
+	/* The urgent byte is kept in the low bits of urg_data. */
+	c = tp->urg_data;
+
+	/* Consume it unless the caller is only peeking. */
+	if (!(flags & MSG_PEEK))
+		tp->urg_data = TCP_URG_READ;
+
+	/* Read urgent data. */
+	msg->msg_flags |= MSG_OOB;
+
+	if (len <= 0) {
+		msg->msg_flags |= MSG_TRUNC;
+		return len;
+	}
+
+	if (!(flags & MSG_TRUNC))
+		err = memcpy_toiovec(msg->msg_iov, &c, 1);
+
+	return err ? -EFAULT : 1;
+}
+
+/* Called after the user has consumed data from the receive queue: decide
+ * whether an ACK should go out right now and send it if so.  COPIED is
+ * the number of bytes handed to the user so far; it lets us skip the
+ * window-update computation when nothing was read.
+ */
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int time_to_ack = 0;
+
+#if TCP_DEBUG
+	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+#endif
+
+	if (inet_csk_ack_scheduled(sk)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+
+		if (icsk->icsk_ack.blocked) {
+			/* Delayed ACKs frequently hit locked sockets during
+			 * bulk receive.
+			 */
+			time_to_ack = 1;
+		} else if (tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss) {
+			/* Once-per-two-segments ACK was not sent by
+			 * tcp_input.c
+			 */
+			time_to_ack = 1;
+		} else if (copied > 0) {
+			/*
+			 * If this read emptied read buffer, we send ACK, if
+			 * connection is not bidirectional, user drained
+			 * receive buffer and there was a small segment
+			 * in queue.
+			 */
+			int pushed =
+				(icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+				((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+				 !icsk->icsk_ack.pingpong);
+
+			if (pushed && !atomic_read(&sk->sk_rmem_alloc))
+				time_to_ack = 1;
+		}
+	}
+
+	/* Otherwise send an ACK if we can now advertise a non-zero window
+	 * that has grown "significantly" (at least doubled).
+	 *
+	 * Even if window raised up to infinity, do not send window open ACK
+	 * in states, where we will not receive more. It is useless.
+	 */
+	if (!time_to_ack && copied > 0 && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+		__u32 cur_window = tcp_receive_window(tp);
+
+		/* Optimize, __tcp_select_window() is not cheap. */
+		if (2 * cur_window <= tp->window_clamp) {
+			__u32 new_window = __tcp_select_window(sk);
+
+			if (new_window && new_window >= 2 * cur_window)
+				time_to_ack = 1;
+		}
+	}
+
+	if (time_to_ack)
+		tcp_send_ack(sk);
+}
+
+/* Drain the prequeue: feed every queued skb to sk_backlog_rcv() with
+ * bottom halves disabled, then reset the prequeue memory counter.
+ */
+static void tcp_prequeue_process(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
+
+	/* RX process wants to run with disabled BHs, though it is not
+	 * necessary */
+	local_bh_disable();
+	for (;;) {
+		skb = __skb_dequeue(&tp->ucopy.prequeue);
+		if (!skb)
+			break;
+		sk_backlog_rcv(sk, skb);
+	}
+	local_bh_enable();
+
+	/* Clear memory counter. */
+	tp->ucopy.memory = 0;
+}
+
+#ifdef CONFIG_NET_DMA
+/* Issue pending DMA copies on the socket's channel and free the skbs on
+ * sk_async_wait_queue whose copies have completed.  With @wait set, keep
+ * polling until the last issued cookie reports completion; otherwise make
+ * a single pass.  Does nothing when no DMA channel is in use.
+ */
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	dma_cookie_t last_issued;
+	dma_cookie_t done, used;
+	struct sk_buff *skb;
+
+	if (!tp->ucopy.dma_chan)
+		return;
+
+	last_issued = tp->ucopy.dma_cookie;
+	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+	do {
+		if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+					      last_issued, &done,
+					      &used) == DMA_SUCCESS) {
+			/* Safe to free early-copied skbs now */
+			__skb_queue_purge(&sk->sk_async_wait_queue);
+			return;
+		}
+
+		/* Not everything is done yet: release the skbs at the head
+		 * of the queue whose own cookies have completed.
+		 */
+		for (;;) {
+			skb = skb_peek(&sk->sk_async_wait_queue);
+			if (!skb)
+				break;
+			if (dma_async_is_complete(skb->dma_cookie, done,
+						  used) != DMA_SUCCESS)
+				break;
+			__skb_dequeue(&sk->sk_async_wait_queue);
+			kfree_skb(skb);
+		}
+	} while (wait);
+}
+#endif
+
+/* Find the skb on the receive queue that holds sequence number @seq.
+ *
+ * On success the byte offset of @seq inside the skb is stored in *@off
+ * (a SYN counts for one sequence number and is skipped).  An skb carrying
+ * a FIN is returned even when the offset lies beyond its data.  Returns
+ * NULL when no skb matches.
+ */
+static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+{
+	struct sk_buff *skb;
+
+	skb_queue_walk(&sk->sk_receive_queue, skb) {
+		u32 pos = seq - TCP_SKB_CB(skb)->seq;
+
+		if (tcp_hdr(skb)->syn)
+			pos--;
+
+		if (pos >= skb->len && !tcp_hdr(skb)->fin)
+			continue;
+
+		*off = pos;
+		return skb;
+	}
+
+	return NULL;
+}
+
+/*
+ * This routine provides an alternative to tcp_recvmsg() for routines
+ * that would like to handle copying from skbuffs directly in 'sendfile'
+ * fashion.
+ *
+ * @recv_actor is called for each run of readable bytes as
+ * recv_actor(desc, skb, offset, len) and returns how many of them it
+ * consumed, or a negative error.  Reading stops when the queue is
+ * exhausted, urgent data or a FIN is reached, desc->count drops to zero,
+ * or the actor makes no progress (returns <= 0).
+ *
+ * Returns the number of bytes consumed, -ENOTCONN on a listening socket,
+ * or the actor's negative error when nothing was consumed before it failed.
+ *
+ * Note:
+ *	- It is assumed that the socket was locked by the caller.
+ *	- The routine does not block.
+ *	- At present, there is no support for reading OOB data
+ *	  or for 'peeking' the socket using this routine
+ *	  (although both would be easy to implement).
+ */
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor)
+{
+	struct sk_buff *skb;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq;
+	u32 offset;
+	int copied = 0;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		if (offset < skb->len) {
+			int used;
+			size_t len;
+
+			len = skb->len - offset;
+			/* Stop reading if we hit a patch of urgent data */
+			if (tp->urg_data) {
+				u32 urg_offset = tp->urg_seq - seq;
+				if (urg_offset < len)
+					len = urg_offset;
+				if (!len)
+					break;
+			}
+			used = recv_actor(desc, skb, offset, len);
+			/* Stop on error and also when the actor consumed
+			 * nothing (e.g. the destination is full): without
+			 * progress there is no point in going on, and the
+			 * seq-1 lookup below is only meaningful after at
+			 * least one byte has been consumed.
+			 */
+			if (used <= 0) {
+				if (!copied)
+					copied = used;
+				break;
+			} else if (used <= len) {
+				seq += used;
+				copied += used;
+				offset += used;
+			}
+			/*
+			 * If recv_actor drops the lock (e.g. TCP splice
+			 * receive) the skb pointer might be invalid when
+			 * getting here: tcp_collapse might have deleted it
+			 * while aggregating skbs from the socket queue.
+			 */
+			skb = tcp_recv_skb(sk, seq-1, &offset);
+			/* Only fall through to eat the skb when the last
+			 * consumed byte was also its last byte.
+			 */
+			if (!skb || (offset+1 != skb->len))
+				break;
+		}
+		if (tcp_hdr(skb)->fin) {
+			/* The FIN occupies one sequence number. */
+			sk_eat_skb(sk, skb, 0);
+			++seq;
+			break;
+		}
+		sk_eat_skb(sk, skb, 0);
+		if (!desc->count)
+			break;
+		tp->copied_seq = seq;
+	}
+	tp->copied_seq = seq;
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (copied > 0) {
+		tcp_cleanup_rbuf(sk, copied);
+		uid_stat_tcp_rcv(current_uid(), copied);
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL(tcp_read_sock);
+
+/*
+ *	This routine copies from a sock struct into the user buffer.
+ *
+ *	Technical note: in 2.3 we work on _locked_ socket, so that
+ *	tricks with *seq access order and skb->users are not required.
+ *	Probably, code can be easily improved even more.
+ *
+ *	The socket is locked on entry and released before returning.
+ *	MSG_OOB requests are handed to tcp_recv_urg().  With MSG_PEEK the
+ *	read position is advanced in a local copy (peek_seq) instead of
+ *	tp->copied_seq, and skbs are not removed from the receive queue.
+ *	@iocb and @addr_len are not referenced by this function.
+ *
+ *	Returns the number of bytes copied, or a negative errno when nothing
+ *	was copied (-ENOTCONN on a listening socket).
+ */
+
+int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t len, int nonblock, int flags, int *addr_len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 peek_seq;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	long timeo;
+	struct task_struct *user_recv = NULL;
+	int copied_early = 0;
+	struct sk_buff *skb;
+	u32 urg_hole = 0;
+
+	lock_sock(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	/* Urgent data needs to be handled specially. */
+	if (flags & MSG_OOB)
+		goto recv_urg;
+
+	/* seq points at the read position to advance: the real one, or a
+	 * private copy when peeking.
+	 */
+	seq = &tp->copied_seq;
+	if (flags & MSG_PEEK) {
+		peek_seq = tp->copied_seq;
+		seq = &peek_seq;
+	}
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	preempt_disable();
+	skb = skb_peek_tail(&sk->sk_receive_queue);
+	{
+		int available = 0;
+
+		if (skb)
+			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
+		/* Pin the user pages for DMA only when queued data alone
+		 * cannot satisfy the request and the read is large enough.
+		 */
+		if ((available < target) &&
+		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
+		    !sysctl_tcp_low_latency &&
+		    dma_find_channel(DMA_MEMCPY)) {
+			preempt_enable_no_resched();
+			tp->ucopy.pinned_list =
+					dma_pin_iovec_pages(msg->msg_iov, len);
+		} else {
+			preempt_enable_no_resched();
+		}
+	}
+#endif
+
+	do {
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				break;
+			}
+		}
+
+		/* Next get a buffer. */
+
+		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
+				 flags))
+				break;
+
+			/* Offset of *seq inside this skb; a SYN takes one
+			 * sequence number but carries no data.
+			 */
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (tcp_hdr(skb)->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (tcp_hdr(skb)->fin)
+				goto found_fin_ok;
+			WARN(!(flags & MSG_PEEK),
+			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+		}
+
+		/* Well, if we have backlog, try to process it now yet. */
+
+		if (copied >= target && !sk->sk_backlog.tail)
+			break;
+
+		if (copied) {
+			if (sk->sk_err ||
+			    sk->sk_state == TCP_CLOSE ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    signal_pending(current))
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			if (sk->sk_state == TCP_CLOSE) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/* This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					copied = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+
+			if (signal_pending(current)) {
+				copied = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		tcp_cleanup_rbuf(sk, copied);
+
+		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+			/* Install new reader */
+			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
+				user_recv = current;
+				tp->ucopy.task = user_recv;
+				tp->ucopy.iov = msg->msg_iov;
+			}
+
+			tp->ucopy.len = len;
+
+			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+				!(flags & (MSG_PEEK | MSG_TRUNC)));
+
+			/* Ugly... If prequeue is not empty, we have to
+			 * process it before releasing socket, otherwise
+			 * order will be broken at second iteration.
+			 * More elegant solution is required!!!
+			 *
+			 * Look: we have the following (pseudo)queues:
+			 *
+			 * 1. packets in flight
+			 * 2. backlog
+			 * 3. prequeue
+			 * 4. receive_queue
+			 *
+			 * Each queue can be processed only if the next ones
+			 * are empty. At this point we have empty receive_queue.
+			 * But prequeue _can_ be not empty after 2nd iteration,
+			 * when we jumped to start of loop because backlog
+			 * processing added something to receive_queue.
+			 * We cannot release_sock(), because backlog contains
+			 * packets arrived _after_ prequeued ones.
+			 *
+			 * Shortly, algorithm is clear --- to process all
+			 * the queues in order. We could make it more directly,
+			 * requeueing packets from backlog to prequeue, if
+			 * is not empty. It is more elegant, but eats cycles,
+			 * unfortunately.
+			 */
+			if (!skb_queue_empty(&tp->ucopy.prequeue))
+				goto do_prequeue;
+
+			/* __ Set realtime policy in scheduler __ */
+		}
+
+#ifdef CONFIG_NET_DMA
+		if (tp->ucopy.dma_chan)
+			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
+		if (copied >= target) {
+			/* Do not sleep, just process backlog. */
+			release_sock(sk);
+			lock_sock(sk);
+		} else
+			sk_wait_data(sk, &timeo);
+
+#ifdef CONFIG_NET_DMA
+		tcp_service_net_dma(sk, false);  /* Don't block */
+		tp->ucopy.wakeup = 0;
+#endif
+
+		if (user_recv) {
+			int chunk;
+
+			/* __ Restore normal policy in scheduler __ */
+
+			/* tp->ucopy.len shrank by whatever was copied to the
+			 * user buffer on our behalf while we were waiting.
+			 */
+			if ((chunk = len - tp->ucopy.len) != 0) {
+				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+				len -= chunk;
+				copied += chunk;
+			}
+
+			if (tp->rcv_nxt == tp->copied_seq &&
+			    !skb_queue_empty(&tp->ucopy.prequeue)) {
+do_prequeue:
+				tcp_prequeue_process(sk);
+
+				if ((chunk = len - tp->ucopy.len) != 0) {
+					NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+					len -= chunk;
+					copied += chunk;
+				}
+			}
+		}
+		/* Someone else moved tp->copied_seq while we were peeking:
+		 * log it and resynchronise the private read position.
+		 */
+		if ((flags & MSG_PEEK) &&
+		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
+				       current->comm, task_pid_nr(current));
+			peek_seq = tp->copied_seq;
+		}
+		continue;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					/* Positioned on the urgent byte: skip
+					 * it unless it is delivered inline.
+					 */
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						urg_hole++;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+
+		/* With MSG_TRUNC the data is consumed without being copied. */
+		if (!(flags & MSG_TRUNC)) {
+#ifdef CONFIG_NET_DMA
+			if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+				tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+
+			if (tp->ucopy.dma_chan) {
+				tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+					tp->ucopy.dma_chan, skb, offset,
+					msg->msg_iov, used,
+					tp->ucopy.pinned_list);
+
+				if (tp->ucopy.dma_cookie < 0) {
+
+					printk(KERN_ALERT "dma_cookie < 0\n");
+
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
+
+				dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+				/* Whole skb handed to DMA: flag it for
+				 * sk_eat_skb() below.
+				 */
+				if ((offset + used) == skb->len)
+					copied_early = 1;
+
+			} else
+#endif
+			{
+				err = skb_copy_datagram_iovec(skb, offset,
+						msg->msg_iov, used);
+				if (err) {
+					/* Exception. Bailout! */
+					if (!copied)
+						copied = -EFAULT;
+					break;
+				}
+			}
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		/* Read position is past the urgent byte: forget about it. */
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk);
+		}
+		/* skb not fully consumed yet: keep it queued. */
+		if (used + offset < skb->len)
+			continue;
+
+		if (tcp_hdr(skb)->fin)
+			goto found_fin_ok;
+		if (!(flags & MSG_PEEK)) {
+			sk_eat_skb(sk, skb, copied_early);
+			copied_early = 0;
+		}
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		if (!(flags & MSG_PEEK)) {
+			sk_eat_skb(sk, skb, copied_early);
+			copied_early = 0;
+		}
+		break;
+	} while (len > 0);
+
+	if (user_recv) {
+		/* Drain what is left on the prequeue before uninstalling
+		 * ourselves as the direct-copy reader.
+		 */
+		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
+			int chunk;
+
+			tp->ucopy.len = copied > 0 ? len : 0;
+
+			tcp_prequeue_process(sk);
+
+			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+				NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+				len -= chunk;
+				copied += chunk;
+			}
+		}
+
+		tp->ucopy.task = NULL;
+		tp->ucopy.len = 0;
+	}
+
+#ifdef CONFIG_NET_DMA
+	tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
+	tp->ucopy.dma_chan = NULL;
+
+	if (tp->ucopy.pinned_list) {
+		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
+		tp->ucopy.pinned_list = NULL;
+	}
+#endif
+
+	/* According to UNIX98, msg_name/msg_namelen are ignored
+	 * on connected socket. I was just happy when found this 8) --ANK
+	 */
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_cleanup_rbuf(sk, copied);
+
+	release_sock(sk);
+
+	if (copied > 0)
+		uid_stat_tcp_rcv(current_uid(), copied);
+	return copied;
+
+out:
+	release_sock(sk);
+	return err;
+
+recv_urg:
+	err = tcp_recv_urg(sk, msg, len, flags);
+	if (err > 0)
+		uid_stat_tcp_rcv(current_uid(), err);
+	goto out;
+}
+EXPORT_SYMBOL(tcp_recvmsg);
+
+/* Move @sk to TCP state @state, keeping the CURRESTAB / ESTABRESETS MIB
+ * counters in step.  Entering TCP_CLOSE also unhashes the socket and, when
+ * a bind bucket is held and the port is not locked by the user, releases
+ * the port.
+ */
+void tcp_set_state(struct sock *sk, int state)
+{
+	int oldstate = sk->sk_state;
+
+	if (state == TCP_ESTABLISHED) {
+		if (oldstate != TCP_ESTABLISHED)
+			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+	} else {
+		if (state == TCP_CLOSE) {
+			if (oldstate == TCP_CLOSE_WAIT ||
+			    oldstate == TCP_ESTABLISHED)
+				TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
+
+			sk->sk_prot->unhash(sk);
+			if (inet_csk(sk)->icsk_bind_hash &&
+			    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+				inet_put_port(sk);
+		}
+
+		/* Any transition away from ESTABLISHED, TCP_CLOSE included. */
+		if (oldstate == TCP_ESTABLISHED)
+			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+	}
+
+	/* Change state AFTER socket is unhashed to avoid closed
+	 * socket sitting in hash tables.
+	 */
+	sk->sk_state = state;
+
+#ifdef STATE_TRACE
+	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_set_state);
+
+/*
+ *	State processing on a close. This implements the state shift for
+ *	sending our FIN frame. Note that we only send a FIN for some
+ *	states. A shutdown() may have already sent the FIN, or we may be
+ *	closed.
+ */
+
+/* Close-time transition table, indexed by the current TCP state.  Each
+ * entry is the state to move to, optionally OR-ed with TCP_ACTION_FIN
+ * when a FIN has to be sent for that transition.
+ */
+static const unsigned char new_state[16] = {
+	[0 /* (Invalid) */]	= TCP_CLOSE,
+	[TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+	[TCP_SYN_SENT]		= TCP_CLOSE,
+	[TCP_SYN_RECV]		= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+	[TCP_FIN_WAIT1]		= TCP_FIN_WAIT1,
+	[TCP_FIN_WAIT2]		= TCP_FIN_WAIT2,
+	[TCP_TIME_WAIT]		= TCP_CLOSE,
+	[TCP_CLOSE]		= TCP_CLOSE,
+	[TCP_CLOSE_WAIT]	= TCP_LAST_ACK | TCP_ACTION_FIN,
+	[TCP_LAST_ACK]		= TCP_LAST_ACK,
+	[TCP_LISTEN]		= TCP_CLOSE,
+	[TCP_CLOSING]		= TCP_CLOSING,
+};
+
+/* Apply the close-time transition from new_state[] to @sk.
+ * Returns non-zero (TCP_ACTION_FIN) when the caller must send a FIN.
+ */
+static int tcp_close_state(struct sock *sk)
+{
+	int entry = (int)new_state[sk->sk_state];
+
+	tcp_set_state(sk, entry & TCP_STATE_MASK);
+
+	return entry & TCP_ACTION_FIN;
+}
+
+/*
+ *	Shut down the sending side of a connection.  Works like close,
+ *	except that the receive side is left alone and the socket is not
+ *	marked SOCK_DEAD.  Only SEND_SHUTDOWN in @how has any effect here.
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+	/*	We need to grab some memory, and put together a FIN,
+	 *	and then put it into the queue to be sent.
+	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+	 */
+	if (!(how & SEND_SHUTDOWN))
+		return;
+
+	/* If we've already sent a FIN, or it's a closed state, skip this. */
+	if (!((1 << sk->sk_state) &
+	      (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+	       TCPF_SYN_RECV | TCPF_CLOSE_WAIT)))
+		return;
+
+	/* Clear out any half completed packets.  FIN if needed. */
+	if (tcp_close_state(sk))
+		tcp_send_fin(sk);
+}
+EXPORT_SYMBOL(tcp_shutdown);
+
+/*
+ *	Close a TCP socket.
+ *
+ *	@sk:      socket being closed
+ *	@timeout: passed unchanged to sk_stream_wait_close()
+ *
+ *	Phase 1 (under lock_sock): mark both directions shut down, stop a
+ *	listener or discard unread receive data, then either reset the
+ *	connection (unread data was dropped, or zero linger time) or start
+ *	an orderly close with a FIN.
+ *	Phase 2 (under bh_lock_sock, BHs disabled): orphan the socket,
+ *	account it in orphan_count and, depending on the resulting state,
+ *	arm the FIN_WAIT2 handling, abort because of orphan pressure, or
+ *	destroy the socket right away when it is already TCP_CLOSE.
+ *
+ *	A reference is held across the hand-over (sock_hold/sock_put).
+ */
+void tcp_close(struct sock *sk, long timeout)
+{
+	struct sk_buff *skb;
+	int data_was_unread = 0;
+	int state;
+
+	lock_sock(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		tcp_set_state(sk, TCP_CLOSE);
+
+		/* Special case. */
+		inet_csk_listen_stop(sk);
+
+		goto adjudge_to_death;
+	}
+
+	/*  We need to flush the recv. buffs.  We do this only on the
+	 *  descriptor close, not protocol-sourced closes, because the
+	 *  reader process may not have drained the data yet!
+	 */
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		/* Payload bytes only: the FIN flag takes one sequence number. */
+		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
+			  tcp_hdr(skb)->fin;
+		data_was_unread += len;
+		__kfree_skb(skb);
+	}
+
+	sk_mem_reclaim(sk);
+
+	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+	if (sk->sk_state == TCP_CLOSE)
+		goto adjudge_to_death;
+
+	/* As outlined in RFC 2525, section 2.17, we send a RST here because
+	 * data was lost. To witness the awful effects of the old behavior of
+	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+	 * GET in an FTP client, suspend the process, wait for the client to
+	 * advertise a zero window, then kill -9 the FTP client, wheee...
+	 * Note: timeout is always zero in such a case.
+	 */
+	if (data_was_unread) {
+		/* Unread data was tossed, zap the connection. */
+		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+		tcp_set_state(sk, TCP_CLOSE);
+		tcp_send_active_reset(sk, sk->sk_allocation);
+	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->sk_prot->disconnect(sk, 0);
+		NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+	} else if (tcp_close_state(sk)) {
+		/* We FIN if the application ate all the data before
+		 * zapping the connection.
+		 */
+
+		/* RED-PEN. Formally speaking, we have broken TCP state
+		 * machine. State transitions:
+		 *
+		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
+		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+		 *
+		 * are legal only when FIN has been sent (i.e. in window),
+		 * rather than queued out of window. Purists blame.
+		 *
+		 * F.e. "RFC state" is ESTABLISHED,
+		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+		 *
+		 * The visible declinations are that sometimes
+		 * we enter time-wait state, when it is not required really
+		 * (harmless), do not send active resets, when they are
+		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+		 * they look as CLOSING or LAST_ACK for Linux)
+		 * Probably, I missed some more holelets.
+		 * 						--ANK
+		 */
+		tcp_send_fin(sk);
+	}
+
+	sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+	/* Remember the state seen under the socket lock; it is compared
+	 * with the state found after the lock has been dropped below.
+	 */
+	state = sk->sk_state;
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	/* It is the last release_sock in its life. It will remove backlog. */
+	release_sock(sk);
+
+
+	/* Now socket is owned by kernel and we acquire BH lock
+	   to finish close. No need to check for user refs.
+	 */
+	local_bh_disable();
+	bh_lock_sock(sk);
+	WARN_ON(sock_owned_by_user(sk));
+
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+
+	/* Have we already been destroyed by a softirq or backlog? */
+	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	/*	This is a (useful) BSD violating of the RFC. There is a
+	 *	problem with TCP as specified in that the other end could
+	 *	keep a socket open forever with no application left this end.
+	 *	We use a 3 minute timeout (about the same as BSD) then kill
+	 *	our end. If they send after that then tough - BUT: long enough
+	 *	that we won't make the old 4*rto = almost no time - whoops
+	 *	reset mistake.
+	 *
+	 *	Nope, it was not mistake. It is really desired behaviour
+	 *	f.e. on http servers, when such sockets are useless, but
+	 *	consume significant resources. Let's do it with special
+	 *	linger2	option.					--ANK
+	 */
+
+	if (sk->sk_state == TCP_FIN_WAIT2) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (tp->linger2 < 0) {
+			/* Negative linger2: abort instead of lingering. */
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			NET_INC_STATS_BH(sock_net(sk),
+					LINUX_MIB_TCPABORTONLINGER);
+		} else {
+			const int tmo = tcp_fin_time(sk);
+
+			/* Long timeouts are split: the part exceeding
+			 * TCP_TIMEWAIT_LEN is spent on the keepalive timer,
+			 * anything shorter goes straight to time-wait.
+			 */
+			if (tmo > TCP_TIMEWAIT_LEN) {
+				inet_csk_reset_keepalive_timer(sk,
+						tmo - TCP_TIMEWAIT_LEN);
+			} else {
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+	}
+	if (sk->sk_state != TCP_CLOSE) {
+		sk_mem_reclaim(sk);
+		/* Under orphan pressure the connection is reset at once. */
+		if (tcp_too_many_orphans(sk, 0)) {
+			if (net_ratelimit())
+				printk(KERN_INFO "TCP: too many of orphaned "
+				       "sockets\n");
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			NET_INC_STATS_BH(sock_net(sk),
+					LINUX_MIB_TCPABORTONMEMORY);
+		}
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		inet_csk_destroy_sock(sk);
+	/* Otherwise, socket is reprieved until protocol close. */
+
+out:
+	bh_unlock_sock(sk);
+	local_bh_enable();
+	sock_put(sk);
+}
+EXPORT_SYMBOL(tcp_close);
+
+/* Non-zero when @state is one of the states that need an RST on ABORT
+ * according to RFC793.
+ */
+
+static inline int tcp_need_reset(int state)
+{
+	const int rst_states = TCPF_ESTABLISHED | TCPF_CLOSE_WAIT |
+			       TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
+			       TCPF_SYN_RECV;
+
+	return (1 << state) & rst_states;
+}
+
+/*
+ *	Abort the current connection (or stop listening) and bring @sk back
+ *	to a reusable TCP_CLOSE state.
+ *
+ *	Depending on the previous state this stops the listener, sends an
+ *	active reset and/or records ECONNRESET in sk->sk_err.  Afterwards all
+ *	timers and queues are purged and the per-connection fields touched
+ *	below are re-initialised.
+ *
+ *	@flags is not used in this function.  The return value is @err,
+ *	which is never changed from 0 here.
+ */
+int tcp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err = 0;
+	int old_state = sk->sk_state;
+
+	if (old_state != TCP_CLOSE)
+		tcp_set_state(sk, TCP_CLOSE);
+
+	/* ABORT function of RFC793 */
+	if (old_state == TCP_LISTEN) {
+		inet_csk_listen_stop(sk);
+	} else if (tcp_need_reset(old_state) ||
+		   (tp->snd_nxt != tp->write_seq &&
+		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+		/* The last check adjusts for discrepancy of Linux wrt. RFC
+		 * states
+		 */
+		tcp_send_active_reset(sk, gfp_any());
+		sk->sk_err = ECONNRESET;
+	} else if (old_state == TCP_SYN_SENT)
+		sk->sk_err = ECONNRESET;
+
+	/* Drop every timer and all queued data. */
+	tcp_clear_xmit_timers(sk);
+	__skb_queue_purge(&sk->sk_receive_queue);
+	tcp_write_queue_purge(sk);
+	__skb_queue_purge(&tp->out_of_order_queue);
+#ifdef CONFIG_NET_DMA
+	__skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
+
+	inet->inet_dport = 0;
+
+	/* Keep the source address only if the user bound it explicitly. */
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
+
+	sk->sk_shutdown = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+	tp->srtt = 0;
+	/* Advance write_seq past the old window; 0 is avoided. */
+	if ((tp->write_seq += tp->max_window + 2) == 0)
+		tp->write_seq = 1;
+	icsk->icsk_backoff = 0;
+	tp->snd_cwnd = 2;
+	icsk->icsk_probes_out = 0;
+	tp->packets_out = 0;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	tp->snd_cwnd_cnt = 0;
+	tp->bytes_acked = 0;
+	tp->window_clamp = 0;
+	tcp_set_ca_state(sk, TCP_CA_Open);
+	tcp_clear_retrans(tp);
+	inet_csk_delack_init(sk);
+	tcp_init_send_head(sk);
+	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
+	__sk_dst_reset(sk);
+
+	/* A socket that still owns a port must still have its bind bucket. */
+	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+
+	sk->sk_error_report(sk);
+	return err;
+}
+EXPORT_SYMBOL(tcp_disconnect);
+
+/*
+ *	Socket option code for TCP.
+ *
+ *	do_tcp_setsockopt() applies one TCP-level option to @sk.
+ *
+ *	@level:   not examined here; the callers dispatch on it
+ *	@optname: TCP_* option to set
+ *	@optval:  user buffer holding the new value
+ *	@optlen:  size of that buffer in bytes
+ *
+ *	TCP_CONGESTION and TCP_COOKIE_TRANSACTIONS carry a string / struct
+ *	payload and are handled first; each takes the socket lock itself.
+ *	Every other option is read as one int and applied under the socket
+ *	lock.
+ *
+ *	Returns 0 on success or a negative errno: -EINVAL for a short buffer
+ *	or out-of-range value, -EFAULT when the user buffer cannot be read,
+ *	-ENOMEM when cookie storage cannot be allocated, -ENOPROTOOPT for an
+ *	unknown option, or whatever the congestion-control / MD5 helpers
+ *	return.
+ */
+static int do_tcp_setsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, unsigned int optlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int val;
+	int err = 0;
+
+	/* These are data/string values, all the others are ints */
+	switch (optname) {
+	case TCP_CONGESTION: {
+		char name[TCP_CA_NAME_MAX];
+
+		if (optlen < 1)
+			return -EINVAL;
+
+		/* Copy at most TCP_CA_NAME_MAX-1 bytes so that the
+		 * terminator written below always fits.
+		 */
+		val = strncpy_from_user(name, optval,
+					min_t(long, TCP_CA_NAME_MAX-1, optlen));
+		if (val < 0)
+			return -EFAULT;
+		name[val] = 0;
+
+		lock_sock(sk);
+		err = tcp_set_congestion_control(sk, name);
+		release_sock(sk);
+		return err;
+	}
+	case TCP_COOKIE_TRANSACTIONS: {
+		struct tcp_cookie_transactions ctd;
+		struct tcp_cookie_values *cvp = NULL;
+
+		if (sizeof(ctd) > optlen)
+			return -EINVAL;
+		if (copy_from_user(&ctd, optval, sizeof(ctd)))
+			return -EFAULT;
+
+		if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+		    ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+			return -EINVAL;
+
+		if (ctd.tcpct_cookie_desired == 0) {
+			/* default to global value */
+		} else if ((0x1 & ctd.tcpct_cookie_desired) ||
+			   ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+			   ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+			/* Must be even and within [MIN, MAX]. */
+			return -EINVAL;
+		}
+
+		if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+			/* Supercedes all other values */
+			lock_sock(sk);
+			if (tp->cookie_values != NULL) {
+				kref_put(&tp->cookie_values->kref,
+					 tcp_cookie_values_release);
+				tp->cookie_values = NULL;
+			}
+			tp->rx_opt.cookie_in_always = 0; /* false */
+			tp->rx_opt.cookie_out_never = 1; /* true */
+			release_sock(sk);
+			return err;
+		}
+
+		/* Allocate ancillary memory before locking.
+		 */
+		if (ctd.tcpct_used > 0 ||
+		    (tp->cookie_values == NULL &&
+		     (sysctl_tcp_cookie_size > 0 ||
+		      ctd.tcpct_cookie_desired > 0 ||
+		      ctd.tcpct_s_data_desired > 0))) {
+			cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+				      GFP_KERNEL);
+			if (cvp == NULL)
+				return -ENOMEM;
+
+			kref_init(&cvp->kref);
+		}
+		lock_sock(sk);
+		tp->rx_opt.cookie_in_always =
+			(TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+		tp->rx_opt.cookie_out_never = 0; /* false */
+
+		if (tp->cookie_values != NULL) {
+			if (cvp != NULL) {
+				/* Changed values are recorded by a changed
+				 * pointer, ensuring the cookie will differ,
+				 * without separately hashing each value later.
+				 */
+				kref_put(&tp->cookie_values->kref,
+					 tcp_cookie_values_release);
+			} else {
+				/* Nothing new allocated: update in place. */
+				cvp = tp->cookie_values;
+			}
+		}
+
+		if (cvp != NULL) {
+			cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+			if (ctd.tcpct_used > 0) {
+				memcpy(cvp->s_data_payload, ctd.tcpct_value,
+				       ctd.tcpct_used);
+				cvp->s_data_desired = ctd.tcpct_used;
+				cvp->s_data_constant = 1; /* true */
+			} else {
+				/* No constant payload data. */
+				cvp->s_data_desired = ctd.tcpct_s_data_desired;
+				cvp->s_data_constant = 0; /* false */
+			}
+
+			tp->cookie_values = cvp;
+		}
+		release_sock(sk);
+		return err;
+	}
+	default:
+		/* fallthru */
+		break;
+	}
+
+	/* Everything below takes a single int argument. */
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case TCP_MAXSEG:
+		/* Values greater than interface MTU won't take effect. However
+		 * at the point when this call is done we typically don't yet
+		 * know which interface is going to be used */
+		if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
+			err = -EINVAL;
+			break;
+		}
+		tp->rx_opt.user_mss = val;
+		break;
+
+	case TCP_NODELAY:
+		if (val) {
+			/* TCP_NODELAY is weaker than TCP_CORK, so that
+			 * this option on corked socket is remembered, but
+			 * it is not activated until cork is cleared.
+			 *
+			 * However, when TCP_NODELAY is set we make
+			 * an explicit push, which overrides even TCP_CORK
+			 * for currently queued segments.
+			 */
+			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+			tcp_push_pending_frames(sk);
+		} else {
+			tp->nonagle &= ~TCP_NAGLE_OFF;
+		}
+		break;
+
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_lto = val;
+		break;
+
+	case TCP_THIN_DUPACK:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->thin_dupack = val;
+		break;
+
+	case TCP_CORK:
+		/* When set indicates to always queue non-full frames.
+		 * Later the user clears this option and we transmit
+		 * any pending partial frames in the queue.  This is
+		 * meant to be used alongside sendfile() to get properly
+		 * filled frames when the user (for example) must write
+		 * out headers with a write() call first and then use
+		 * sendfile to send out the data parts.
+		 *
+		 * TCP_CORK can be set together with TCP_NODELAY and it is
+		 * stronger than TCP_NODELAY.
+		 */
+		if (val) {
+			tp->nonagle |= TCP_NAGLE_CORK;
+		} else {
+			tp->nonagle &= ~TCP_NAGLE_CORK;
+			if (tp->nonagle&TCP_NAGLE_OFF)
+				tp->nonagle |= TCP_NAGLE_PUSH;
+			tcp_push_pending_frames(sk);
+		}
+		break;
+
+	case TCP_KEEPIDLE:
+		if (val < 1 || val > MAX_TCP_KEEPIDLE)
+			err = -EINVAL;
+		else {
+			tp->keepalive_time = val * HZ;
+			/* Re-arm a running keepalive timer so that the new
+			 * idle time applies to the time already elapsed.
+			 */
+			if (sock_flag(sk, SOCK_KEEPOPEN) &&
+			    !((1 << sk->sk_state) &
+			      (TCPF_CLOSE | TCPF_LISTEN))) {
+				u32 elapsed = keepalive_time_elapsed(tp);
+				if (tp->keepalive_time > elapsed)
+					elapsed = tp->keepalive_time - elapsed;
+				else
+					elapsed = 0;
+				inet_csk_reset_keepalive_timer(sk, elapsed);
+			}
+		}
+		break;
+	case TCP_KEEPINTVL:
+		if (val < 1 || val > MAX_TCP_KEEPINTVL)
+			err = -EINVAL;
+		else
+			tp->keepalive_intvl = val * HZ;
+		break;
+	case TCP_KEEPCNT:
+		if (val < 1 || val > MAX_TCP_KEEPCNT)
+			err = -EINVAL;
+		else
+			tp->keepalive_probes = val;
+		break;
+	case TCP_SYNCNT:
+		if (val < 1 || val > MAX_TCP_SYNCNT)
+			err = -EINVAL;
+		else
+			icsk->icsk_syn_retries = val;
+		break;
+
+	case TCP_LINGER2:
+		/* Negative: stored as -1.  Larger than the sysctl limit:
+		 * stored as 0.  Otherwise stored as val * HZ.
+		 */
+		if (val < 0)
+			tp->linger2 = -1;
+		else if (val > sysctl_tcp_fin_timeout / HZ)
+			tp->linger2 = 0;
+		else
+			tp->linger2 = val * HZ;
+		break;
+
+	case TCP_DEFER_ACCEPT:
+		/* Translate value in seconds to number of retransmits */
+		icsk->icsk_accept_queue.rskq_defer_accept =
+			secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+					TCP_RTO_MAX / HZ);
+		break;
+
+	case TCP_WINDOW_CLAMP:
+		if (!val) {
+			/* The clamp may only be removed on a closed socket. */
+			if (sk->sk_state != TCP_CLOSE) {
+				err = -EINVAL;
+				break;
+			}
+			tp->window_clamp = 0;
+		} else
+			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+						SOCK_MIN_RCVBUF / 2 : val;
+		break;
+
+	case TCP_QUICKACK:
+		if (!val) {
+			icsk->icsk_ack.pingpong = 1;
+		} else {
+			icsk->icsk_ack.pingpong = 0;
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+			    inet_csk_ack_scheduled(sk)) {
+				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+				tcp_cleanup_rbuf(sk, 1);
+				/* An even value flushes the pending ACK
+				 * but goes back to pingpong mode.
+				 */
+				if (!(val & 1))
+					icsk->icsk_ack.pingpong = 1;
+			}
+		}
+		break;
+
+#ifdef CONFIG_TCP_MD5SIG
+	case TCP_MD5SIG:
+		/* Read the IP->Key mappings from userspace */
+		err = tp->af_specific->md5_parse(sk, optval, optlen);
+		break;
+#endif
+	case TCP_USER_TIMEOUT:
+		/* Cap the max timeout in ms TCP will retry/retrans
+		 * before giving up and aborting (ETIMEDOUT) a connection.
+		 * The value is a duration, and msecs_to_jiffies() takes an
+		 * unsigned argument, so a negative int is rejected rather
+		 * than being converted.
+		 */
+		if (val < 0)
+			err = -EINVAL;
+		else
+			icsk->icsk_user_timeout = msecs_to_jiffies(val);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* setsockopt() entry point for TCP sockets: SOL_TCP options are handled
+ * by do_tcp_setsockopt(), every other level is handed to the
+ * address-family operations of the connection socket.
+ */
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   unsigned int optlen)
+{
+	if (level == SOL_TCP)
+		return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname,
+						     optval, optlen);
+}
+EXPORT_SYMBOL(tcp_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/* Compat flavour of tcp_setsockopt(): non-SOL_TCP levels go through
+ * inet_csk_compat_setsockopt(), SOL_TCP is handled as usual.
+ */
+int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_TCP)
+		return do_tcp_setsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk_compat_setsockopt(sk, level, optname,
+					  optval, optlen);
+}
+EXPORT_SYMBOL(compat_tcp_setsockopt);
+#endif
+
+/* Snapshot the state of a TCP endpoint into @info, using the layout that
+ * is exported through the socket API (struct tcp_info).
+ */
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 stamp = tcp_time_stamp;
+	u8 opts = 0;
+
+	/* Start from all-zero; fields not assigned below stay 0. */
+	memset(info, 0, sizeof(*info));
+
+	/* Connection and retransmission state. */
+	info->tcpi_state = sk->sk_state;
+	info->tcpi_ca_state = icsk->icsk_ca_state;
+	info->tcpi_retransmits = icsk->icsk_retransmits;
+	info->tcpi_probes = icsk->icsk_probes_out;
+	info->tcpi_backoff = icsk->icsk_backoff;
+
+	/* Options in effect on this connection. */
+	if (tp->rx_opt.tstamp_ok)
+		opts |= TCPI_OPT_TIMESTAMPS;
+	if (tcp_is_sack(tp))
+		opts |= TCPI_OPT_SACK;
+	if (tp->rx_opt.wscale_ok) {
+		opts |= TCPI_OPT_WSCALE;
+		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
+		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
+	}
+	if (tp->ecn_flags & TCP_ECN_OK)
+		opts |= TCPI_OPT_ECN;
+	info->tcpi_options = opts;
+
+	/* Timeouts (converted from jiffies to usecs) and segment sizes. */
+	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+	info->tcpi_snd_mss = tp->mss_cache;
+	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
+
+	if (sk->sk_state != TCP_LISTEN) {
+		info->tcpi_unacked = tp->packets_out;
+		info->tcpi_sacked = tp->sacked_out;
+	} else {
+		/* A listener reports its accept backlog in these fields. */
+		info->tcpi_unacked = sk->sk_ack_backlog;
+		info->tcpi_sacked = sk->sk_max_ack_backlog;
+	}
+	info->tcpi_lost = tp->lost_out;
+	info->tcpi_retrans = tp->retrans_out;
+	info->tcpi_fackets = tp->fackets_out;
+
+	/* Ages relative to the timestamp sampled on entry, in msecs. */
+	info->tcpi_last_data_sent = jiffies_to_msecs(stamp - tp->lsndtime);
+	info->tcpi_last_data_recv =
+		jiffies_to_msecs(stamp - icsk->icsk_ack.lrcvtime);
+	info->tcpi_last_ack_recv = jiffies_to_msecs(stamp - tp->rcv_tstamp);
+
+	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
+	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+	info->tcpi_rtt = jiffies_to_usecs(tp->srtt) >> 3;
+	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev) >> 2;
+	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+	info->tcpi_snd_cwnd = tp->snd_cwnd;
+	info->tcpi_advmss = tp->advmss;
+	info->tcpi_reordering = tp->reordering;
+
+	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt) >> 3;
+	info->tcpi_rcv_space = tp->rcvq_space.space;
+
+	info->tcpi_total_retrans = tp->total_retrans;
+}
+EXPORT_SYMBOL_GPL(tcp_get_info);
+
+/*
+ *	do_tcp_getsockopt() reads one TCP-level option of @sk.
+ *
+ *	@level:   not examined here; the callers dispatch on it
+ *	@optname: TCP_* option to read
+ *	@optval:  user buffer receiving the value
+ *	@optlen:  in: size of @optval; out: number of bytes written
+ *
+ *	Integer options are truncated to at most sizeof(int) bytes.
+ *	TCP_INFO, TCP_CONGESTION and TCP_COOKIE_TRANSACTIONS re-read the
+ *	user supplied length because they return larger objects.
+ *
+ *	Returns 0 on success, -EFAULT when user memory cannot be accessed,
+ *	-EINVAL for a negative (or, for cookie transactions, too short)
+ *	length and -ENOPROTOOPT for an unknown option.
+ */
+static int do_tcp_getsockopt(struct sock *sk, int level,
+		int optname, char __user *optval, int __user *optlen)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int val, len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	/* Validate before clamping: min_t() below compares as unsigned, so
+	 * a negative length would otherwise be turned into sizeof(int) and
+	 * the check could never trigger.
+	 */
+	if (len < 0)
+		return -EINVAL;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	switch (optname) {
+	case TCP_MAXSEG:
+		val = tp->mss_cache;
+		/* No cached MSS yet on a closed/listening socket: report
+		 * the user configured value instead.
+		 */
+		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+			val = tp->rx_opt.user_mss;
+		break;
+	case TCP_NODELAY:
+		val = !!(tp->nonagle&TCP_NAGLE_OFF);
+		break;
+	case TCP_CORK:
+		val = !!(tp->nonagle&TCP_NAGLE_CORK);
+		break;
+	case TCP_KEEPIDLE:
+		val = keepalive_time_when(tp) / HZ;
+		break;
+	case TCP_KEEPINTVL:
+		val = keepalive_intvl_when(tp) / HZ;
+		break;
+	case TCP_KEEPCNT:
+		val = keepalive_probes(tp);
+		break;
+	case TCP_SYNCNT:
+		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		break;
+	case TCP_LINGER2:
+		val = tp->linger2;
+		/* Negative is reported as is; 0 means "use the sysctl". */
+		if (val >= 0)
+			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+		break;
+	case TCP_DEFER_ACCEPT:
+		val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
+				      TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+		break;
+	case TCP_WINDOW_CLAMP:
+		val = tp->window_clamp;
+		break;
+	case TCP_INFO: {
+		struct tcp_info info;
+
+		/* Re-read: len was clamped to sizeof(int) above. */
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		tcp_get_info(sk, &info);
+
+		len = min_t(unsigned int, len, sizeof(info));
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &info, len))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_QUICKACK:
+		val = !icsk->icsk_ack.pingpong;
+		break;
+
+	case TCP_CONGESTION:
+		/* Re-read: len was clamped to sizeof(int) above. */
+		if (get_user(len, optlen))
+			return -EFAULT;
+		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+			return -EFAULT;
+		return 0;
+
+	case TCP_COOKIE_TRANSACTIONS: {
+		struct tcp_cookie_transactions ctd;
+		struct tcp_cookie_values *cvp = tp->cookie_values;
+
+		/* Re-read: the whole structure has to fit. */
+		if (get_user(len, optlen))
+			return -EFAULT;
+		if (len < sizeof(ctd))
+			return -EINVAL;
+
+		memset(&ctd, 0, sizeof(ctd));
+		ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+				   TCP_COOKIE_IN_ALWAYS : 0)
+				| (tp->rx_opt.cookie_out_never ?
+				   TCP_COOKIE_OUT_NEVER : 0);
+
+		if (cvp != NULL) {
+			ctd.tcpct_flags |= (cvp->s_data_in ?
+					    TCP_S_DATA_IN : 0)
+					 | (cvp->s_data_out ?
+					    TCP_S_DATA_OUT : 0);
+
+			ctd.tcpct_cookie_desired = cvp->cookie_desired;
+			ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+			       cvp->cookie_pair_size);
+			ctd.tcpct_used = cvp->cookie_pair_size;
+		}
+
+		if (put_user(sizeof(ctd), optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &ctd, sizeof(ctd)))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		val = tp->thin_lto;
+		break;
+	case TCP_THIN_DUPACK:
+		val = tp->thin_dupack;
+		break;
+
+	case TCP_USER_TIMEOUT:
+		val = jiffies_to_msecs(icsk->icsk_user_timeout);
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	/* Common tail for the integer options. */
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* getsockopt() entry point for TCP sockets: SOL_TCP options are handled
+ * by do_tcp_getsockopt(), every other level is handed to the
+ * address-family operations of the connection socket.
+ */
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   int __user *optlen)
+{
+	if (level == SOL_TCP)
+		return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname,
+						     optval, optlen);
+}
+EXPORT_SYMBOL(tcp_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/* Compat flavour of tcp_getsockopt(): non-SOL_TCP levels go through
+ * inet_csk_compat_getsockopt(), SOL_TCP is handled as usual.
+ */
+int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level == SOL_TCP)
+		return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+
+	return inet_csk_compat_getsockopt(sk, level, optname,
+					  optval, optlen);
+}
+EXPORT_SYMBOL(compat_tcp_getsockopt);
+#endif
+
+/*
+ *	Split a GSO TCP packet into a list of segments of at most gso_size
+ *	payload bytes each and fix up the TCP header of every segment.
+ *
+ *	Returns the segment list from skb_segment(), NULL when the packet
+ *	only needed its gso_segs recomputed (skb_gso_ok() with
+ *	NETIF_F_GSO_ROBUST), or an ERR_PTR: -EINVAL for a malformed header,
+ *	a packet not larger than one MSS or an unexpected gso_type,
+ *	otherwise whatever skb_segment() returned.
+ */
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct tcphdr *th;
+	unsigned thlen;
+	unsigned int seq;
+	__be32 delta;
+	unsigned int oldlen;
+	unsigned int mss;
+
+	/* Make sure the fixed header, then the full header, is linear. */
+	if (!pskb_may_pull(skb, sizeof(*th)))
+		goto out;
+
+	th = tcp_hdr(skb);
+	thlen = th->doff * 4;
+	if (thlen < sizeof(*th))
+		goto out;
+
+	if (!pskb_may_pull(skb, thlen))
+		goto out;
+
+	/* Complement of the original length (header included); it is
+	 * combined with each segment's new length to form the checksum
+	 * adjustment below.
+	 */
+	oldlen = (u16)~skb->len;
+	__skb_pull(skb, thlen);
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))
+		goto out;
+
+	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+		/* Packet is from an untrusted source, reset gso_segs. */
+		int type = skb_shinfo(skb)->gso_type;
+
+		/* Only known TCP GSO type bits, and at least one of
+		 * TCPV4/TCPV6, are acceptable.
+		 */
+		if (unlikely(type &
+			     ~(SKB_GSO_TCPV4 |
+			       SKB_GSO_DODGY |
+			       SKB_GSO_TCP_ECN |
+			       SKB_GSO_TCPV6 |
+			       0) ||
+			     !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
+			goto out;
+
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+		segs = NULL;
+		goto out;
+	}
+
+	segs = skb_segment(skb, features);
+	if (IS_ERR(segs))
+		goto out;
+
+	/* Adjustment for a full sized segment: header plus one MSS. */
+	delta = htonl(oldlen + (thlen + mss));
+
+	skb = segs;
+	th = tcp_hdr(skb);
+	seq = ntohl(th->seq);
+
+	/* Every segment except the last: FIN and PSH are cleared, the
+	 * checksum is adjusted, and the following segment gets its
+	 * sequence number and has CWR cleared.
+	 */
+	do {
+		th->fin = th->psh = 0;
+
+		th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+				       (__force u32)delta));
+		if (skb->ip_summed != CHECKSUM_PARTIAL)
+			th->check =
+			     csum_fold(csum_partial(skb_transport_header(skb),
+						    thlen, skb->csum));
+
+		seq += mss;
+		skb = skb->next;
+		th = tcp_hdr(skb);
+
+		th->seq = htonl(seq);
+		th->cwr = 0;
+	} while (skb->next);
+
+	/* Last segment: its length is taken from the skb itself, and its
+	 * FIN/PSH bits are left untouched.
+	 */
+	delta = htonl(oldlen + (skb->tail - skb->transport_header) +
+		      skb->data_len);
+	th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
+				(__force u32)delta));
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		th->check = csum_fold(csum_partial(skb_transport_header(skb),
+						   thlen, skb->csum));
+
+out:
+	return segs;
+}
+EXPORT_SYMBOL(tcp_tso_segment);
+
+/*
+ *	GRO receive hook for TCP: try to merge @skb into a packet of the
+ *	same flow that is already held on the list at @head.
+ *
+ *	Returns NULL, or the list position of a held packet when the code
+ *	below selects one (a same-flow packet was found and either @skb is
+ *	no longer marked same_flow or a flush condition was detected).
+ *	The flush decision for @skb itself is OR'ed into
+ *	NAPI_GRO_CB(skb)->flush.
+ */
+struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	struct tcphdr *th;
+	struct tcphdr *th2;
+	unsigned int len;
+	unsigned int thlen;
+	__be32 flags;
+	unsigned int mss = 1;
+	unsigned int hlen;
+	unsigned int off;
+	int flush = 1;
+	int i;
+
+	/* Get at the fixed part of the TCP header first ... */
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*th);
+	th = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		th = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!th))
+			goto out;
+	}
+
+	thlen = th->doff * 4;
+	if (thlen < sizeof(*th))
+		goto out;
+
+	/* ... then at the complete header including options. */
+	hlen = off + thlen;
+	if (skb_gro_header_hard(skb, hlen)) {
+		th = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!th))
+			goto out;
+	}
+
+	skb_gro_pull(skb, thlen);
+
+	len = skb_gro_len(skb);
+	flags = tcp_flag_word(th);
+
+	/* Look for a held packet with the same source and dest ports
+	 * (compared as one 32-bit word starting at th->source).
+	 */
+	for (; (p = *head); head = &p->next) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		th2 = tcp_hdr(p);
+
+		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		goto found;
+	}
+
+	/* No match: p is NULL here. */
+	goto out_check_final;
+
+found:
+	/* Any non-zero contribution below prevents merging:
+	 * CWR set, flags differing apart from CWR/FIN/PSH, different
+	 * ack_seq, different option bytes, payload larger than the held
+	 * packet's gso_size, or a sequence number that does not directly
+	 * follow the held data.
+	 */
+	flush = NAPI_GRO_CB(p)->flush;
+	flush |= (__force int)(flags & TCP_FLAG_CWR);
+	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+	for (i = sizeof(*th); i < thlen; i += 4)
+		flush |= *(u32 *)((u8 *)th + i) ^
+			 *(u32 *)((u8 *)th2 + i);
+
+	mss = skb_shinfo(p)->gso_size;
+
+	flush |= (len - 1) >= mss;
+	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+
+	if (flush || skb_gro_receive(head, skb)) {
+		/* Not merged; mss = 1 keeps "len < mss" below false
+		 * unless the packet carries no payload.
+		 */
+		mss = 1;
+		goto out_check_final;
+	}
+
+	/* Merged: carry FIN/PSH over to the held packet's header. */
+	p = *head;
+	th2 = tcp_hdr(p);
+	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
+
+out_check_final:
+	flush = len < mss;
+	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+					TCP_FLAG_RST | TCP_FLAG_SYN |
+					TCP_FLAG_FIN));
+
+	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+		pp = head;
+
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+EXPORT_SYMBOL(tcp_gro_receive);
+
+/* Finish a TCP packet that GRO is done with: set it up as
+ * CHECKSUM_PARTIAL over the TCP header's check field, record the number
+ * of merged segments and mark it SKB_GSO_TCP_ECN when CWR is set.
+ * Always returns 0.
+ */
+int tcp_gro_complete(struct sk_buff *skb)
+{
+	const struct tcphdr *hdr = tcp_hdr(skb);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+
+	shinfo->gso_segs = NAPI_GRO_CB(skb)->count;
+	if (hdr->cwr)
+		shinfo->gso_type |= SKB_GSO_TCP_ECN;
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_gro_complete);
+
+#ifdef CONFIG_TCP_MD5SIG
+/* Number of references currently held on tcp_md5sig_pool. */
+static unsigned long tcp_md5sig_users;
+/* Per-cpu pointers to the MD5 signature contexts; NULL while no pool
+ * has been published.
+ */
+static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
+/* Protects tcp_md5sig_users and tcp_md5sig_pool. */
+static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
+
+/* Release every per-cpu context of @pool (hash transform first, then the
+ * context itself) and finally the per-cpu pointer array.  Slots that were
+ * never populated are skipped.
+ */
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct tcp_md5sig_pool *entry = *per_cpu_ptr(pool, cpu);
+
+		if (!entry)
+			continue;
+
+		if (entry->md5_desc.tfm)
+			crypto_free_hash(entry->md5_desc.tfm);
+		kfree(entry);
+	}
+	free_percpu(pool);
+}
+
+/* Drop one reference on the MD5 signature pool.  When the last reference
+ * goes away the pool is unpublished under the lock and freed after the
+ * lock has been released.
+ */
+void tcp_free_md5sig_pool(void)
+{
+	struct tcp_md5sig_pool * __percpu *doomed = NULL;
+
+	spin_lock_bh(&tcp_md5sig_pool_lock);
+	tcp_md5sig_users--;
+	if (tcp_md5sig_users == 0) {
+		doomed = tcp_md5sig_pool;
+		tcp_md5sig_pool = NULL;
+	}
+	spin_unlock_bh(&tcp_md5sig_pool_lock);
+
+	if (doomed)
+		__tcp_free_md5sig_pool(doomed);
+}
+EXPORT_SYMBOL(tcp_free_md5sig_pool);
+
+/* Build a complete MD5 signature pool: one zeroed context plus one "md5"
+ * hash transform for every possible cpu.  Contexts are allocated with
+ * sk->sk_allocation.  Returns the pool, or NULL after undoing any
+ * partial work when an allocation fails.
+ */
+static struct tcp_md5sig_pool * __percpu *
+__tcp_alloc_md5sig_pool(struct sock *sk)
+{
+	struct tcp_md5sig_pool * __percpu *pool;
+	int cpu;
+
+	pool = alloc_percpu(struct tcp_md5sig_pool *);
+	if (!pool)
+		return NULL;
+
+	for_each_possible_cpu(cpu) {
+		struct tcp_md5sig_pool *entry;
+		struct crypto_hash *tfm;
+
+		entry = kzalloc(sizeof(*entry), sk->sk_allocation);
+		if (!entry)
+			goto fail;
+		/* Publish the slot first so that cleanup can find it. */
+		*per_cpu_ptr(pool, cpu) = entry;
+
+		tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+		if (!tfm || IS_ERR(tfm))
+			goto fail;
+
+		entry->md5_desc.tfm = tfm;
+	}
+
+	return pool;
+
+fail:
+	__tcp_free_md5sig_pool(pool);
+	return NULL;
+}
+
+/*
+ *	Take a reference on the global MD5 signature pool, creating it on
+ *	first use.
+ *
+ *	The caller that raises tcp_md5sig_users from 0 performs the
+ *	allocation with the lock dropped.  Callers that find a non-zero
+ *	user count but no pool yet give their reference back and retry
+ *	after cpu_relax() until the pool has been published.
+ *
+ *	Returns the pool, or NULL when the allocation failed (the reference
+ *	is dropped again in that case).
+ */
+struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
+{
+	struct tcp_md5sig_pool * __percpu *pool;
+	int alloc = 0;
+
+retry:
+	spin_lock_bh(&tcp_md5sig_pool_lock);
+	pool = tcp_md5sig_pool;
+	if (tcp_md5sig_users++ == 0) {
+		/* First user: we are responsible for allocating. */
+		alloc = 1;
+		spin_unlock_bh(&tcp_md5sig_pool_lock);
+	} else if (!pool) {
+		/* Someone else is allocating right now; try again. */
+		tcp_md5sig_users--;
+		spin_unlock_bh(&tcp_md5sig_pool_lock);
+		cpu_relax();
+		goto retry;
+	} else
+		spin_unlock_bh(&tcp_md5sig_pool_lock);
+
+	if (alloc) {
+		/* we cannot hold spinlock here because this may sleep. */
+		struct tcp_md5sig_pool * __percpu *p;
+
+		p = __tcp_alloc_md5sig_pool(sk);
+		spin_lock_bh(&tcp_md5sig_pool_lock);
+		if (!p) {
+			tcp_md5sig_users--;
+			spin_unlock_bh(&tcp_md5sig_pool_lock);
+			return NULL;
+		}
+		pool = tcp_md5sig_pool;
+		if (pool) {
+			/* oops, it has already been assigned. */
+			spin_unlock_bh(&tcp_md5sig_pool_lock);
+			__tcp_free_md5sig_pool(p);
+		} else {
+			tcp_md5sig_pool = pool = p;
+			spin_unlock_bh(&tcp_md5sig_pool_lock);
+		}
+	}
+	return pool;
+}
+EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
+
+
+/**
+ *	tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ *	The pool is a percpu structure, so on success we return with
+ *	preemption and BH still disabled, to make sure that another thread
+ *	or softirq handler won't try to use the same context.  A user
+ *	reference is taken as well.  On failure (no pool published) BHs
+ *	are re-enabled and NULL is returned.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
+{
+	struct tcp_md5sig_pool * __percpu *pool;
+
+	local_bh_disable();
+
+	spin_lock(&tcp_md5sig_pool_lock);
+	pool = tcp_md5sig_pool;
+	if (pool)
+		tcp_md5sig_users++;
+	spin_unlock(&tcp_md5sig_pool_lock);
+
+	if (!pool) {
+		local_bh_enable();
+		return NULL;
+	}
+
+	return *this_cpu_ptr(pool);
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
+
+/* Counterpart of tcp_get_md5sig_pool(): re-enable BHs, then drop the
+ * user reference taken there.
+ */
+void tcp_put_md5sig_pool(void)
+{
+	local_bh_enable();
+	tcp_free_md5sig_pool();
+}
+EXPORT_SYMBOL(tcp_put_md5sig_pool);
+
+/* Feed the fixed TCP header @th into the running MD5 hash of @hp.
+ * The checksum field is hashed as zero and restored afterwards; TCP
+ * options are not part of the hash.
+ * Returns the result of crypto_hash_update().
+ */
+int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
+			struct tcphdr *th)
+{
+	const __sum16 saved_check = th->check;
+	struct scatterlist sg;
+	int rc;
+
+	th->check = 0;
+	/* options aren't included in the hash */
+	sg_init_one(&sg, th, sizeof(struct tcphdr));
+	rc = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
+	th->check = saved_check;
+
+	return rc;
+}
+EXPORT_SYMBOL(tcp_md5_hash_header);
+
+/* Feed the payload of @skb into the running MD5 hash of @hp: first the
+ * linear data that follows @header_len bytes of header, then every page
+ * fragment, then (recursively, with no header to skip) every skb on the
+ * frag list.
+ * Returns 0 on success and 1 as soon as a hash update fails.
+ */
+int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
+			  struct sk_buff *skb, unsigned header_len)
+{
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct hash_desc *desc = &hp->md5_desc;
+	u8 *payload = (u8 *)tcp_hdr(skb) + header_len;
+	struct sk_buff *child;
+	struct scatterlist sg;
+	unsigned linear_len = 0;
+	unsigned i;
+
+	/* Linear bytes beyond the header, if there are any. */
+	if (skb_headlen(skb) > header_len)
+		linear_len = skb_headlen(skb) - header_len;
+
+	sg_init_table(&sg, 1);
+
+	sg_set_buf(&sg, payload, linear_len);
+	if (crypto_hash_update(desc, &sg, linear_len))
+		return 1;
+
+	for (i = 0; i < shinfo->nr_frags; ++i) {
+		const struct skb_frag_struct *frag = &shinfo->frags[i];
+
+		sg_set_page(&sg, frag->page, frag->size, frag->page_offset);
+		if (crypto_hash_update(desc, &sg, frag->size))
+			return 1;
+	}
+
+	skb_walk_frags(skb, child) {
+		if (tcp_md5_hash_skb_data(hp, child, 0))
+			return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_md5_hash_skb_data);
+
+/* Feed the key->keylen bytes of @key into the running MD5 hash of @hp.
+ * Returns the result of crypto_hash_update().
+ */
+int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
+{
+	struct hash_desc *desc = &hp->md5_desc;
+	struct scatterlist sg;
+
+	sg_init_one(&sg, key->key, key->keylen);
+
+	return crypto_hash_update(desc, &sg, key->keylen);
+}
+EXPORT_SYMBOL(tcp_md5_hash_key);
+
+#endif
+
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover.  Each secret value has 4 states:
+ *
+ * Generating.  (tcp_secret_generating != tcp_secret_primary)
+ *    Generates new Responder-Cookies, but not yet used for primary
+ *    verification.  This is a short-term state, typically lasting only
+ *    one round trip time (RTT).
+ *
+ * Primary.  (tcp_secret_generating == tcp_secret_primary)
+ *    Used both for generation and primary verification.
+ *
+ * Retiring.  (tcp_secret_retiring != tcp_secret_secondary)
+ *    Used for verification, until the first failure that can be
+ *    verified by the newer Generating secret.  At that time, this
+ *    cookie's state is changed to Secondary, and the Generating
+ *    cookie's state is changed to Primary.  This is a short-term state,
+ *    typically lasting only one round trip time (RTT).
+ *
+ * Secondary.  (tcp_secret_retiring == tcp_secret_secondary)
+ *    Used for secondary verification, after primary verification
+ *    failures.  This state lasts no more than twice the Maximum Segment
+ *    Lifetime (2MSL).  Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+	/* The secret is divided into two parts.  The digest part is the
+	 * equivalent of previously hashing a secret and saving the state,
+	 * and serves as an initialization vector (IV).  The message part
+	 * serves as the trailing secret.
+	 */
+	u32				secrets[COOKIE_WORKSPACE_WORDS];
+	/* Time, compared against jiffies by tcp_cookie_generator(), from
+	 * which this secret is due for a refresh.
+	 */
+	unsigned long			expires;
+};
+
+/* Secret lifetimes, each a multiple of HZ: one and two TCP_PAWS_MSL
+ * periods, and a fixed HZ * 600.
+ */
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+/* Backing storage for the two concurrently maintained secrets. */
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+/* Taken by tcp_cookie_generator() while a secret is being refreshed. */
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Pick a pseudo-random word from the message part of workspace @ws.
+ * Word @n of the workspace, masked to the message size, is the selector.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+	u32 selector = ws[n] & (COOKIE_MESSAGE_WORDS - 1);
+
+	return ws[COOKIE_DIGEST_WORDS + selector];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ *
+ * @bakery: caller's buffer of COOKIE_WORKSPACE_WORDS u32 words; every word
+ *	is written.
+ *
+ * When the generating secret has expired, a new one is produced under
+ * tcp_secret_locker, stored in the secondary slot and published as the new
+ * generating secret; otherwise the current generating secret is copied out
+ * under RCU.
+ *
+ * All copies and the random fill are sized in bytes.  COOKIE_WORKSPACE_WORDS
+ * counts u32 words, so passing it directly as a length would cover only the
+ * first quarter of the workspace and leave the words that tcp_cookie_work()
+ * selects from uninitialized.
+ *
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+	const size_t secret_bytes = COOKIE_WORKSPACE_WORDS * sizeof(u32);
+	unsigned long jiffy = jiffies;
+
+	if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+		spin_lock_bh(&tcp_secret_locker);
+		if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+			/* refreshed by another */
+			memcpy(bakery,
+			       &tcp_secret_generating->secrets[0],
+			       secret_bytes);
+		} else {
+			/* still needs refreshing */
+			get_random_bytes(bakery, secret_bytes);
+
+			/* The first time, paranoia assumes that the
+			 * randomization function isn't as strong.  But,
+			 * this secret initialization is delayed until
+			 * the last possible moment (packet arrival).
+			 * Although that time is observable, it is
+			 * unpredictably variable.  Mash in the most
+			 * volatile clock bits available, and expire the
+			 * secret extra quickly.
+			 */
+			if (unlikely(tcp_secret_primary->expires ==
+				     tcp_secret_secondary->expires)) {
+				struct timespec tv;
+
+				getnstimeofday(&tv);
+				bakery[COOKIE_DIGEST_WORDS+0] ^=
+					(u32)tv.tv_nsec;
+
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_1MSL
+					+ (0x0f & tcp_cookie_work(bakery, 0));
+			} else {
+				tcp_secret_secondary->expires = jiffy
+					+ TCP_SECRET_LIFE
+					+ (0xff & tcp_cookie_work(bakery, 1));
+				tcp_secret_primary->expires = jiffy
+					+ TCP_SECRET_2MSL
+					+ (0x1f & tcp_cookie_work(bakery, 2));
+			}
+			memcpy(&tcp_secret_secondary->secrets[0],
+			       bakery, secret_bytes);
+
+			rcu_assign_pointer(tcp_secret_generating,
+					   tcp_secret_secondary);
+			rcu_assign_pointer(tcp_secret_retiring,
+					   tcp_secret_primary);
+			/*
+			 * Neither call_rcu() nor synchronize_rcu() needed.
+			 * Retiring data is not freed.  It is replaced after
+			 * further (locked) pointer updates, and a quiet time
+			 * (minimum 1MSL, maximum LIFE - 2MSL).
+			 */
+		}
+		spin_unlock_bh(&tcp_secret_locker);
+	} else {
+		rcu_read_lock_bh();
+		memcpy(bakery,
+		       &rcu_dereference(tcp_secret_generating)->secrets[0],
+		       secret_bytes);
+		rcu_read_unlock_bh();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
+/*
+ * Finish off @sk: count a failed connection attempt if it never got past
+ * SYN_SENT/SYN_RECV, move it to TCP_CLOSE, stop its transmit timers and
+ * mark both directions shut down.  A socket already flagged SOCK_DEAD is
+ * destroyed; otherwise its owner is notified of the state change.
+ */
+void tcp_done(struct sock *sk)
+{
+	const int attempt_failed = sk->sk_state == TCP_SYN_SENT ||
+				   sk->sk_state == TCP_SYN_RECV;
+
+	if (attempt_failed)
+		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+
+	tcp_set_state(sk, TCP_CLOSE);
+	tcp_clear_xmit_timers(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sock_flag(sk, SOCK_DEAD))
+		inet_csk_destroy_sock(sk);
+	else
+		sk->sk_state_change(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_done);
+
+extern struct tcp_congestion_ops tcp_reno;
+
+/* Value of the "thash_entries=" boot parameter; tcp_init() hands it to
+ * alloc_large_system_hash() as the requested established-table size.
+ */
+static __initdata unsigned long thash_entries;
+
+/* Parse "thash_entries=<n>".  Returns 1 once a value has been stored,
+ * 0 when no string was supplied.
+ */
+static int __init set_thash_entries(char *str)
+{
+	char *end;
+
+	if (str == NULL)
+		return 0;
+
+	thash_entries = simple_strtoul(str, &end, 0);
+	return 1;
+}
+__setup("thash_entries=", set_thash_entries);
+
+/*
+ * Boot-time TCP setup: creates the bind-bucket slab cache, allocates and
+ * initializes the established and bind hash tables, derives the default
+ * memory/buffer sysctls from the table size and free memory, registers
+ * Reno congestion control and primes the two cookie secrets.
+ */
+void __init tcp_init(void)
+{
+	struct sk_buff *skb = NULL;
+	unsigned long limit;
+	int i, max_rshare, max_wshare, cnt;
+	unsigned long jiffy = jiffies;
+
+	/* TCP's control block must fit in skb->cb; skb is used only for
+	 * this sizeof.
+	 */
+	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
+
+	percpu_counter_init(&tcp_sockets_allocated, 0);
+	percpu_counter_init(&tcp_orphan_count, 0);
+	tcp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("tcp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+	/* Size and allocate the main established and bind bucket
+	 * hash tables.
+	 *
+	 * The methodology is similar to that of the buffer cache.
+	 */
+	tcp_hashinfo.ehash =
+		alloc_large_system_hash("TCP established",
+					sizeof(struct inet_ehash_bucket),
+					thash_entries,
+					(totalram_pages >= 128 * 1024) ?
+					13 : 15,
+					0,
+					NULL,
+					&tcp_hashinfo.ehash_mask,
+					thash_entries ? 0 : 512 * 1024);
+	/* ehash_mask is the highest valid index, hence the inclusive bound.
+	 * Each chain's nulls marker is initialized with its bucket index.
+	 */
+	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
+	}
+	if (inet_ehash_locks_alloc(&tcp_hashinfo))
+		panic("TCP: failed to alloc ehash_locks");
+	tcp_hashinfo.bhash =
+		alloc_large_system_hash("TCP bind",
+					sizeof(struct inet_bind_hashbucket),
+					tcp_hashinfo.ehash_mask + 1,
+					(totalram_pages >= 128 * 1024) ?
+					13 : 15,
+					0,
+					&tcp_hashinfo.bhash_size,
+					NULL,
+					64 * 1024);
+	/* The allocator stored a shift in bhash_size; turn it into a count. */
+	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+	}
+
+
+	/* Number of established-hash buckets; scales the defaults below. */
+	cnt = tcp_hashinfo.ehash_mask + 1;
+
+	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+	sysctl_tcp_max_orphans = cnt / 2;
+	sysctl_max_syn_backlog = max(128, cnt / 256);
+
+	/* tcp_mem[] = { 3/4 * limit, limit, 2 * tcp_mem[0] }, where limit is
+	 * one eighth of the free buffer pages but at least 128.
+	 */
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	sysctl_tcp_mem[0] = limit / 4 * 3;
+	sysctl_tcp_mem[1] = limit;
+	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+
+	/* Set per-socket limits to no more than 1/128 the pressure threshold */
+	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
+	max_wshare = min(4UL*1024*1024, limit);
+	max_rshare = min(6UL*1024*1024, limit);
+
+	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+	sysctl_tcp_wmem[1] = 16*1024;
+	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
+
+	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+	sysctl_tcp_rmem[1] = 87380;
+	sysctl_tcp_rmem[2] = max(87380, max_rshare);
+
+	printk(KERN_INFO "TCP: Hash tables configured "
+	       "(established %u bind %u)\n",
+	       tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
+
+	tcp_register_congestion_control(&tcp_reno);
+
+	/* Both cookie secrets start zeroed with identical, already-due
+	 * expiry times; tcp_cookie_generator() recognizes equal expiries as
+	 * the first-use case and generates the real secret then.
+	 */
+	memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+	memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+	tcp_secret_one.expires = jiffy; /* past due */
+	tcp_secret_two.expires = jiffy; /* past due */
+	tcp_secret_generating = &tcp_secret_one;
+	tcp_secret_primary = &tcp_secret_one;
+	tcp_secret_retiring = &tcp_secret_two;
+	tcp_secret_secondary = &tcp_secret_two;
+}
+
+/*
+ * tcp_is_local - does the route to IPv4 address @addr use a loopback device?
+ * @net:  network namespace to do the lookup in
+ * @addr: address to look up (network byte order)
+ *
+ * Returns 1 when the output route for @addr exists and its device has
+ * IFF_LOOPBACK set, 0 otherwise (including when the lookup fails).
+ *
+ * The reference taken by ip_route_output_key() is dropped before
+ * returning; without that every call leaks a route entry.
+ */
+static int tcp_is_local(struct net *net, __be32 addr)
+{
+	struct rtable *rt;
+	struct flowi4 fl4 = { .daddr = addr };
+	int is_local;
+
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR_OR_NULL(rt))
+		return 0;
+
+	is_local = rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK);
+	ip_rt_put(rt);
+
+	return is_local;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * tcp_is_local6 - IPv6 counterpart of tcp_is_local()
+ * @net:  network namespace to do the lookup in
+ * @addr: address to look up; passed as both arguments of rt6_lookup()
+ *
+ * Returns 1 when a route is found and its device has IFF_LOOPBACK set,
+ * 0 otherwise.  The dst reference returned by rt6_lookup() is released
+ * before returning so the route entry is not leaked.
+ */
+static int tcp_is_local6(struct net *net, struct in6_addr *addr)
+{
+	struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0);
+	int is_local;
+
+	if (!rt6)
+		return 0;
+
+	is_local = rt6->rt6i_dev && (rt6->rt6i_dev->flags & IFF_LOOPBACK);
+	dst_release(&rt6->dst);
+
+	return is_local;
+}
+#endif
+
+/*
+ * tcp_nuke_addr - destroy all sockets on the given local address
+ * if local address is the unspecified address (0.0.0.0 or ::), destroy all
+ * sockets with local addresses that are not configured.
+ *
+ * @net:  namespace used for the route lookups on the unspecified address
+ * @addr: AF_INET address, or AF_INET6 address when IPv6 is built
+ *
+ * Every bucket of the established hash is scanned, index 0 through
+ * ehash_mask inclusive (ehash_mask is the last valid index, as in
+ * tcp_init()).  A matching socket gets sk_err = ETIMEDOUT, has its error
+ * reported and is closed with tcp_done().  The bucket lock is released
+ * while that happens, so the bucket is rescanned from its head afterwards.
+ *
+ * Sockets in SYN_SENT are skipped when sysctl_ip_dynaddr is set, as are
+ * sockets already flagged SOCK_DEAD.
+ *
+ * Returns 0, or -EAFNOSUPPORT for an unsupported address family.
+ */
+int tcp_nuke_addr(struct net *net, struct sockaddr *addr)
+{
+	int family = addr->sa_family;
+	unsigned int bucket;
+
+	/* Only the pointer matching @family is set and later dereferenced. */
+	struct in_addr *in = NULL;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct in6_addr *in6 = NULL;
+#endif
+	if (family == AF_INET) {
+		in = &((struct sockaddr_in *)addr)->sin_addr;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	} else if (family == AF_INET6) {
+		in6 = &((struct sockaddr_in6 *)addr)->sin6_addr;
+#endif
+	} else {
+		return -EAFNOSUPPORT;
+	}
+
+	for (bucket = 0; bucket <= tcp_hashinfo.ehash_mask; bucket++) {
+		struct hlist_nulls_node *node;
+		struct sock *sk;
+		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
+
+restart:
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
+				continue;
+			if (sock_flag(sk, SOCK_DEAD))
+				continue;
+
+			if (family == AF_INET) {
+				__be32 s4 = inet->inet_rcv_saddr;
+				if (s4 == LOOPBACK4_IPV6)
+					continue;
+
+				/* Keep the socket unless its address equals
+				 * the target, or the target is 0.0.0.0 and
+				 * the socket's address is not local.
+				 */
+				if (in->s_addr != s4 &&
+				    !(in->s_addr == INADDR_ANY &&
+				      !tcp_is_local(net, s4)))
+					continue;
+			}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			if (family == AF_INET6) {
+				struct in6_addr *s6;
+				if (!inet->pinet6)
+					continue;
+
+				s6 = &inet->pinet6->rcv_saddr;
+				if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED)
+					continue;
+
+				/* Same rule as for IPv4, with :: as the
+				 * unspecified address.
+				 */
+				if (!ipv6_addr_equal(in6, s6) &&
+				    !(ipv6_addr_equal(in6, &in6addr_any) &&
+				      !tcp_is_local6(net, s6)))
+					continue;
+			}
+#endif
+
+			/* Pin the socket, then drop the bucket lock before
+			 * taking the socket lock.
+			 */
+			sock_hold(sk);
+			spin_unlock_bh(lock);
+
+			local_bh_disable();
+			bh_lock_sock(sk);
+			sk->sk_err = ETIMEDOUT;
+			sk->sk_error_report(sk);
+
+			tcp_done(sk);
+			bh_unlock_sock(sk);
+			local_bh_enable();
+			sock_put(sk);
+
+			/* The chain was walked without the lock for a while;
+			 * start this bucket over.
+			 */
+			goto restart;
+		}
+		spin_unlock_bh(lock);
+	}
+
+	return 0;
+}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 00000000..6187eb4d
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,239 @@
+/*
+ * Binary Increase Congestion control for TCP
+ * Home page:
+ *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of BICTCP in
+ * Lisong Xu, Khaled Harfoush, and Injong Rhee.
+ *  "Binary Increase Congestion Control for Fast, Long Distance
+ *  Networks" in InfoComm 2004
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
+					 * max_cwnd = snd_cwnd * beta
+					 */
+#define BICTCP_B		4	 /*
+					  * In binary search,
+					  * go to point (max+min)/N
+					  */
+
+/* Tunables; each is exported below as a module parameter with mode 0644. */
+static int fast_convergence = 1;
+static int max_increment = 16;	/* divisor/limit used by bictcp_update() */
+static int low_window = 14;	/* at or below this cwnd, behave like Reno */
+static int beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh;	/* 0: bictcp_init() leaves snd_ssthresh alone */
+static int smooth_part = 20;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+
+/* BIC TCP Parameters
+ *
+ * Per-socket state, kept in the congestion-control private area obtained
+ * with inet_csk_ca(); bictcp_register() checks at build time that it fits
+ * in ICSK_CA_PRIV_SIZE.
+ */
+struct bictcp {
+	u32	cnt;		/* increase cwnd by 1 after ACKs */
+	u32 	last_max_cwnd;	/* last maximum snd_cwnd */
+	u32	loss_cwnd;	/* congestion window at last loss */
+	u32	last_cwnd;	/* the last snd_cwnd */
+	u32	last_time;	/* time when updated last_cwnd */
+	u32	epoch_start;	/* beginning of an epoch */
+#define ACK_RATIO_SHIFT	4
+	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+};
+
+/* Put @ca back into its initial state: every field cleared except the
+ * delayed-ACK ratio, which is seeded with 2 in ACK_RATIO_SHIFT fixed point.
+ */
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+
+	ca->epoch_start = 0;
+	ca->last_time = 0;
+	ca->last_cwnd = 0;
+	ca->loss_cwnd = 0;
+	ca->last_max_cwnd = 0;
+	ca->cnt = 0;
+}
+
+/* .init hook: reset the private BIC state of @sk and, when the
+ * initial_ssthresh parameter is non-zero, apply it to the socket.
+ */
+static void bictcp_init(struct sock *sk)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	bictcp_reset(ca);
+
+	if (initial_ssthresh != 0)
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ *
+ * Recomputes ca->cnt, the value bictcp_cong_avoid() passes to
+ * tcp_cong_avoid_ai() as the ACK count needed per one-segment cwnd
+ * increase.  @cwnd is the current congestion window.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+	/* Nothing to do if cwnd is unchanged and the last update was no
+	 * more than HZ/32 ticks ago.
+	 */
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+		return;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_time_stamp;
+
+	if (ca->epoch_start == 0) /* record the beginning of an epoch */
+		ca->epoch_start = tcp_time_stamp;
+
+	/* start off normal */
+	if (cwnd <= low_window) {
+		ca->cnt = cwnd;
+		return;
+	}
+
+	/* binary increase */
+	if (cwnd < ca->last_max_cwnd) {
+		/* distance to the previous maximum, divided by BICTCP_B */
+		__u32 	dist = (ca->last_max_cwnd - cwnd)
+			/ BICTCP_B;
+
+		if (dist > max_increment)
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+		else if (dist <= 1U)
+			/* binary search increase */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else
+			/* binary search increase */
+			ca->cnt = cwnd / dist;
+	} else {
+		/* slow start and linear increase */
+		if (cwnd < ca->last_max_cwnd + BICTCP_B)
+			/* slow start */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+			/* slow start */
+			ca->cnt = (cwnd * (BICTCP_B-1))
+				/ (cwnd - ca->last_max_cwnd);
+		else
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+	}
+
+	/* if in slow start or link utilization is very low */
+	if (ca->loss_cwnd == 0) {
+		if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+			ca->cnt = 20;
+	}
+
+	/* Scale by the delayed-ACK ratio estimate, which is kept shifted
+	 * left by ACK_RATIO_SHIFT.
+	 */
+	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+	if (ca->cnt == 0)			/* cannot be zero */
+		ca->cnt = 1;
+}
+
+/* .cong_avoid hook.  Does nothing unless the connection is cwnd-limited;
+ * uses slow start while snd_cwnd <= snd_ssthresh and the BIC increase
+ * computed by bictcp_update() otherwise.
+ */
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca;
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		tcp_slow_start(tp);
+		return;
+	}
+
+	ca = inet_csk_ca(sk);
+	bictcp_update(ca, tp->snd_cwnd);
+	tcp_cong_avoid_ai(tp, ca->cnt);
+}
+
+/*
+ *	.ssthresh hook: end the current epoch, remember the window at which
+ *	the loss happened and return the new slow start threshold (never
+ *	below 2).  Windows up to low_window are halved as Reno would; larger
+ *	ones are scaled by beta / BICTCP_BETA_SCALE.
+ */
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tp->snd_cwnd;
+	u32 reduced;
+
+	/* end of epoch */
+	ca->epoch_start = 0;
+
+	/* Wmax and fast convergence */
+	if (fast_convergence && cwnd < ca->last_max_cwnd)
+		ca->last_max_cwnd = (cwnd * (BICTCP_BETA_SCALE + beta))
+			/ (2 * BICTCP_BETA_SCALE);
+	else
+		ca->last_max_cwnd = cwnd;
+
+	ca->loss_cwnd = cwnd;
+
+	if (cwnd <= low_window)
+		reduced = cwnd >> 1U;
+	else
+		reduced = (cwnd * beta) / BICTCP_BETA_SCALE;
+
+	return max(reduced, 2U);
+}
+
+/* .undo_cwnd hook: the larger of the current window and last_max_cwnd. */
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct bictcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tp->snd_cwnd;
+
+	if (ca->last_max_cwnd > cwnd)
+		cwnd = ca->last_max_cwnd;
+
+	return cwnd;
+}
+
+/* .set_state hook: entering TCP_CA_Loss discards all BIC history. */
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+	if (new_state != TCP_CA_Loss)
+		return;
+
+	bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ *
+ * delayed_ack holds the ratio shifted left by ACK_RATIO_SHIFT, so the
+ * update adds the sample @cnt and subtracts one sixteenth of the stored
+ * value.  Only applied while the connection is in TCP_CA_Open.
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct bictcp *ca;
+
+	if (icsk->icsk_ca_state != TCP_CA_Open)
+		return;
+
+	ca = inet_csk_ca(sk);
+	ca->delayed_ack += cnt - (ca->delayed_ack >> ACK_RATIO_SHIFT);
+}
+
+
+/* Operations table for the "bic" algorithm; added to the list of available
+ * algorithms by bictcp_register() and removed by bictcp_unregister().
+ */
+static struct tcp_congestion_ops bictcp __read_mostly = {
+	.init		= bictcp_init,
+	.ssthresh	= bictcp_recalc_ssthresh,
+	.cong_avoid	= bictcp_cong_avoid,
+	.set_state	= bictcp_state,
+	.undo_cwnd	= bictcp_undo_cwnd,
+	.pkts_acked     = bictcp_acked,
+	.owner		= THIS_MODULE,
+	.name		= "bic",
+};
+
+/* Module init: make "bic" available.  Returns the result of
+ * tcp_register_congestion_control().
+ */
+static int __init bictcp_register(void)
+{
+	int err;
+
+	/* The private state must fit in the per-socket CA area. */
+	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+	err = tcp_register_congestion_control(&bictcp);
+	return err;
+}
+
+/* Module exit: take "bic" off the list of available algorithms. */
+static void __exit bictcp_unregister(void)
+{
+	struct tcp_congestion_ops *ops = &bictcp;
+
+	tcp_unregister_congestion_control(ops);
+}
+
+module_init(bictcp_register);
+module_exit(bictcp_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 00000000..850c737e
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,424 @@
+/*
+ * Pluggable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler support and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+/* Limited slow start threshold used by tcp_slow_start(); values <= 0
+ * disable the limit.
+ */
+int sysctl_tcp_max_ssthresh = 0;
+
+/* Taken around every modification of tcp_cong_list. */
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+/* Registered algorithms; the first entry is reported as the default. */
+static LIST_HEAD(tcp_cong_list);
+
+/* Look up a registered algorithm by exact name with a linear walk of
+ * tcp_cong_list (few entries are expected).  Returns NULL if none matches.
+ */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	struct tcp_congestion_ops *match = NULL;
+
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (!strcmp(ca->name, name)) {
+			match = ca;
+			break;
+		}
+	}
+
+	return match;
+}
+
+/*
+ * Add @ca to the tail of the list of available congestion control
+ * algorithms.
+ *
+ * Returns 0 on success, -EINVAL when @ca lacks the mandatory ssthresh or
+ * cong_avoid op, and -EEXIST when the name is already registered.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret;
+
+	/* all algorithms must implement ssthresh and cong_avoid ops */
+	if (!(ca->ssthresh && ca->cong_avoid)) {
+		printk(KERN_ERR "TCP %s does not implement required ops\n",
+		       ca->name);
+		return -EINVAL;
+	}
+
+	spin_lock(&tcp_cong_list_lock);
+	if (tcp_ca_find(ca->name) == NULL) {
+		list_add_tail_rcu(&ca->list, &tcp_cong_list);
+		printk(KERN_INFO "TCP %s registered\n", ca->name);
+		ret = 0;
+	} else {
+		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+		ret = -EEXIST;
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Take @ca off the list of available algorithms.  Meant to be called from
+ * the owning module's remove function; module reference counts keep that
+ * from happening while any socket still uses the algorithm.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+	struct list_head *entry = &ca->list;
+
+	spin_lock(&tcp_cong_list_lock);
+	list_del_rcu(entry);
+	spin_unlock(&tcp_cong_list_lock);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+/* Assign choice of congestion control.
+ *
+ * A socket still pointing at the tcp_init_congestion_ops placeholder gets
+ * the first listed algorithm whose module reference can be taken.  The
+ * selected algorithm's init hook, if any, is then run.
+ */
+void tcp_init_congestion_control(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_congestion_ops *ops;
+	struct tcp_congestion_ops *ca;
+
+	/* if no choice made yet assign the current value set as default */
+	if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+			/* fallback to next available */
+			if (!try_module_get(ca->owner))
+				continue;
+
+			icsk->icsk_ca_ops = ca;
+			break;
+		}
+		rcu_read_unlock();
+	}
+
+	ops = icsk->icsk_ca_ops;
+	if (ops->init)
+		ops->init(sk);
+}
+
+/* Manage refcounts on socket close: run the algorithm's release hook,
+ * if it has one, then drop the module reference held for this socket.
+ */
+void tcp_cleanup_congestion_control(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	void (*release)(struct sock *sk) = icsk->icsk_ca_ops->release;
+
+	if (release)
+		release(sk);
+
+	module_put(icsk->icsk_ca_ops->owner);
+}
+
+/* Used by sysctl to change default congestion control.
+ *
+ * The named algorithm is marked non-restricted and moved to the head of
+ * tcp_cong_list.  With CONFIG_MODULES, a CAP_NET_ADMIN caller triggers a
+ * load attempt for module "tcp_<name>" when the name is not yet known.
+ *
+ * Returns 0 on success, -ENOENT if the algorithm cannot be found.
+ */
+int tcp_set_default_congestion_control(const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	int ret = 0;
+
+	spin_lock(&tcp_cong_list_lock);
+	ca = tcp_ca_find(name);
+#ifdef CONFIG_MODULES
+	if (ca == NULL && capable(CAP_NET_ADMIN)) {
+		/* The list lock is not held across the load attempt. */
+		spin_unlock(&tcp_cong_list_lock);
+		request_module("tcp_%s", name);
+		spin_lock(&tcp_cong_list_lock);
+
+		ca = tcp_ca_find(name);
+	}
+#endif
+
+	if (ca == NULL) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	/* default is always allowed */
+	ca->flags |= TCP_CONG_NON_RESTRICTED;
+	list_move(&ca->list, &tcp_cong_list);
+
+unlock:
+	spin_unlock(&tcp_cong_list_lock);
+	return ret;
+}
+
+/* Late initcall: make CONFIG_DEFAULT_TCP_CONG the default algorithm. */
+static int __init tcp_congestion_default(void)
+{
+	const char *configured = CONFIG_DEFAULT_TCP_CONG;
+
+	return tcp_set_default_congestion_control(configured);
+}
+late_initcall(tcp_congestion_default);
+
+
+/* Write the names of all registered algorithms into @buf, separated by
+ * single spaces, using at most @maxlen bytes per snprintf() call.
+ */
+void tcp_get_available_congestion_control(char *buf, size_t maxlen)
+{
+	struct tcp_congestion_ops *ca;
+	size_t offs = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		/* no separator until something has been written */
+		const char *sep = offs ? " " : "";
+
+		offs += snprintf(buf + offs, maxlen - offs,
+				 "%s%s", sep, ca->name);
+	}
+	rcu_read_unlock();
+}
+
+/* Copy the name of the current default algorithm, i.e. the first entry of
+ * tcp_cong_list, into @name (up to TCP_CA_NAME_MAX bytes).
+ */
+void tcp_get_default_congestion_control(char *name)
+{
+	struct tcp_congestion_ops *first;
+
+	/* We will always have reno... */
+	BUG_ON(list_empty(&tcp_cong_list));
+
+	rcu_read_lock();
+	first = list_first_entry(&tcp_cong_list,
+				 struct tcp_congestion_ops, list);
+	strncpy(name, first->name, TCP_CA_NAME_MAX);
+	rcu_read_unlock();
+}
+
+/* Build, in @buf, the space-separated list of algorithms that carry the
+ * TCP_CONG_NON_RESTRICTED flag.  @buf is an empty string when none do.
+ */
+void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
+{
+	struct tcp_congestion_ops *ca;
+	size_t offs = 0;
+
+	*buf = '\0';
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (ca->flags & TCP_CONG_NON_RESTRICTED) {
+			const char *sep = offs ? " " : "";
+
+			offs += snprintf(buf + offs, maxlen - offs,
+					 "%s%s", sep, ca->name);
+		}
+	}
+	rcu_read_unlock();
+}
+
+/* Change list of non-restricted congestion control.
+ *
+ * @val is a space-separated list of algorithm names; it is split in place.
+ * Nothing is changed unless every name is registered.
+ *
+ * Returns 0 on success, -ENOMEM if the working copy cannot be allocated,
+ * -ENOENT if a name is unknown.
+ */
+int tcp_set_allowed_congestion_control(char *val)
+{
+	struct tcp_congestion_ops *ca;
+	char *saved_clone, *clone, *name;
+	int ret = 0;
+
+	clone = kstrdup(val, GFP_USER);
+	if (clone == NULL)
+		return -ENOMEM;
+	/* strsep() moves clone along; keep the start for kfree() */
+	saved_clone = clone;
+
+	spin_lock(&tcp_cong_list_lock);
+
+	/* pass 1 check for bad entries */
+	for (name = strsep(&clone, " "); name && *name;
+	     name = strsep(&clone, " ")) {
+		if (tcp_ca_find(name) == NULL) {
+			ret = -ENOENT;
+			goto out;
+		}
+	}
+
+	/* pass 2 clear old values */
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
+		ca->flags &= ~TCP_CONG_NON_RESTRICTED;
+
+	/* pass 3 mark as allowed */
+	for (name = strsep(&val, " "); name && *name;
+	     name = strsep(&val, " ")) {
+		ca = tcp_ca_find(name);
+		if (WARN_ON(!ca))
+			continue;
+		ca->flags |= TCP_CONG_NON_RESTRICTED;
+	}
+out:
+	spin_unlock(&tcp_cong_list_lock);
+	kfree(saved_clone);
+
+	return ret;
+}
+
+
+/* Change congestion control for socket.
+ *
+ * Returns 0 on success or when @name is already the socket's algorithm,
+ * -ENOENT if the algorithm is unknown, -EPERM if it is restricted and the
+ * caller lacks CAP_NET_ADMIN, -EBUSY if its module reference cannot be
+ * taken.
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_congestion_ops *ca;
+	int err = 0;
+
+	rcu_read_lock();
+	ca = tcp_ca_find(name);
+
+	/* no change asking for existing value */
+	if (ca == icsk->icsk_ca_ops)
+		goto out;
+
+#ifdef CONFIG_MODULES
+	/* not found attempt to autoload module */
+	if (ca == NULL && capable(CAP_NET_ADMIN)) {
+		rcu_read_unlock();
+		request_module("tcp_%s", name);
+		rcu_read_lock();
+		ca = tcp_ca_find(name);
+	}
+#endif
+	if (ca == NULL) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	if (!(ca->flags & TCP_CONG_NON_RESTRICTED) &&
+	    !capable(CAP_NET_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	if (!try_module_get(ca->owner)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* Release the old algorithm before switching to the new one. */
+	tcp_cleanup_congestion_control(sk);
+	icsk->icsk_ca_ops = ca;
+
+	if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
+		icsk->icsk_ca_ops->init(sk);
+ out:
+	rcu_read_unlock();
+	return err;
+}
+
+/* RFC2861 Check whether we are limited by application or congestion window
+ * This is the inverse of cwnd check in tcp_tso_should_defer
+ *
+ * Returns non-zero when @in_flight has reached snd_cwnd, or when the
+ * remaining room is small enough to count as cwnd-limited.
+ */
+int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 room;
+
+	if (in_flight >= tp->snd_cwnd)
+		return 1;
+
+	/* unused part of the window, in packets */
+	room = tp->snd_cwnd - in_flight;
+
+	if (sk_can_gso(sk)) {
+		if (room * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
+		    room * tp->mss_cache < sk->sk_gso_max_size)
+			return 1;
+	}
+
+	return room <= tcp_max_burst(tp);
+}
+EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
+
+/*
+ * Slow start is used when congestion window is less than slow start
+ * threshold. This version implements the basic RFC2581 version
+ * and optionally supports:
+ * 	RFC3742 Limited Slow Start  	  - growth limited to max_ssthresh
+ *	RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
+ */
+void tcp_slow_start(struct tcp_sock *tp)
+{
+	int step; /* increase in packets */
+
+	/* RFC3465: ABC Slow start
+	 * Nothing happens until a full MSS worth of bytes has been acked.
+	 */
+	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
+		return;
+
+	/* exponential increase, unless limited slow start applies */
+	step = tp->snd_cwnd;
+	if (sysctl_tcp_max_ssthresh > 0 &&
+	    tp->snd_cwnd > sysctl_tcp_max_ssthresh)
+		step = sysctl_tcp_max_ssthresh >> 1;
+
+	/* RFC3465: ABC
+	 * We MAY increase by 2 if discovered delayed ack
+	 */
+	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
+		step <<= 1;
+	tp->bytes_acked = 0;
+
+	/* Bank the credit, then turn each full window's worth into one
+	 * more segment of cwnd, up to the clamp.
+	 */
+	tp->snd_cwnd_cnt += step;
+	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+		tp->snd_cwnd_cnt -= tp->snd_cwnd;
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
+/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w)
+ *
+ * In practice: count calls in snd_cwnd_cnt and, once the count has
+ * reached @w, grow cwnd by one (up to the clamp) and restart the count.
+ */
+void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
+{
+	if (tp->snd_cwnd_cnt < w) {
+		tp->snd_cwnd_cnt++;
+		return;
+	}
+
+	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+		tp->snd_cwnd++;
+	tp->snd_cwnd_cnt = 0;
+}
+EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
+
+/*
+ * TCP Reno congestion control
+ * This is special case used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 full_window;
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	/* In "safe" area, increase. */
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		tcp_slow_start(tp);
+		return;
+	}
+
+	/* In dangerous area, increase slowly. */
+	if (!sysctl_tcp_abc) {
+		tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+		return;
+	}
+
+	/* RFC3465: Appropriate Byte Count
+	 * increase once for each full cwnd acked
+	 */
+	full_window = tp->snd_cwnd*tp->mss_cache;
+	if (tp->bytes_acked >= full_window) {
+		tp->bytes_acked -= full_window;
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* Slow start threshold is half the congestion window (min 2) */
+u32 tcp_reno_ssthresh(struct sock *sk)
+{
+	u32 half = tcp_sk(sk)->snd_cwnd >> 1U;
+
+	return max(half, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+/* Lower bound on congestion window with halving: half of snd_ssthresh. */
+u32 tcp_reno_min_cwnd(const struct sock *sk)
+{
+	u32 ssthresh = tcp_sk(sk)->snd_ssthresh;
+
+	return ssthresh / 2;
+}
+EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
+
+/* The built-in Reno algorithm.  It carries TCP_CONG_NON_RESTRICTED, the
+ * flag tcp_set_congestion_control() requires from callers without
+ * CAP_NET_ADMIN.
+ */
+struct tcp_congestion_ops tcp_reno = {
+	.flags		= TCP_CONG_NON_RESTRICTED,
+	.name		= "reno",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+
+/* Initial congestion control used (until SYN)
+ * really reno under another name so we can tell difference
+ * during tcp_set_default_congestion_control
+ *
+ * tcp_init_congestion_control() compares a socket's ops pointer against
+ * this placeholder to detect that no algorithm has been chosen yet.
+ */
+struct tcp_congestion_ops tcp_init_congestion_ops  = {
+	.name		= "",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 00000000..f376b05c
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,492 @@
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
+ * Home page:
+ *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of CUBIC TCP in
+ * Sangtae Ha, Injong Rhee and Lisong Xu,
+ *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
+ *  in ACM SIGOPS Operating System Review, July 2008.
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
+ *
+ * CUBIC integrates a new slow start algorithm, called HyStart.
+ * The details of HyStart are presented in
+ *  Sangtae Ha and Injong Rhee,
+ *  "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
+ *
+ * All testing results are available from:
+ * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+#include <net/tcp.h>
+
+#define BICTCP_BETA_SCALE    1024	/* Fixed-point scale for beta:
+					 * max_cwnd = snd_cwnd * beta,
+					 * with beta stored as beta/1024
+					 */
+#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024; used as a shift count */
+
+/* Two methods of hybrid slow start (bits of hystart_detect and bictcp.found) */
+#define HYSTART_ACK_TRAIN	0x1
+#define HYSTART_DELAY		0x2
+
+/* Number of delay samples for detecting the increase of delay */
+#define HYSTART_MIN_SAMPLES	8
+#define HYSTART_DELAY_MIN	(4U<<3)	/* 4 ms, in the (msec << 3) unit of delay_min */
+#define HYSTART_DELAY_MAX	(16U<<3)	/* 16 ms, same unit */
+#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
+
+static int fast_convergence __read_mostly = 1;
+static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh __read_mostly;	/* 0 (default): bictcp_init() leaves snd_ssthresh alone */
+static int bic_scale __read_mostly = 41;	/* c = bic_scale/1024 */
+static int tcp_friendliness __read_mostly = 1;
+
+static int hystart __read_mostly = 1;
+static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
+static int hystart_low_window __read_mostly = 16;
+static int hystart_ack_delta __read_mostly = 2;
+
+static u32 cube_rtt_scale __read_mostly;	/* computed once in cubictcp_register() */
+static u32 beta_scale __read_mostly;	/* computed once in cubictcp_register() */
+static u64 cube_factor __read_mostly;	/* computed once in cubictcp_register() */
+
+/* Note parameters that are used for precomputing scale factors are read-only
+ * NOTE(review): beta is registered 0644 (writable) although beta_scale is
+ * derived from it only once, in cubictcp_register(); a later write to beta
+ * changes bictcp_recalc_ssthresh() but does not refresh beta_scale.
+ */
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+module_param(hystart, int, 0644);
+MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
+module_param(hystart_detect, int, 0644);
+MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
+		 " 1: packet-train 2: delay 3: both packet-train and delay");
+module_param(hystart_low_window, int, 0644);
+MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
+module_param(hystart_ack_delta, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+
+/* BIC TCP Parameters: per-connection CUBIC state, reached through
+ * inet_csk_ca(); cubictcp_register() checks at build time that it fits
+ * within ICSK_CA_PRIV_SIZE.
+ */
+struct bictcp {
+	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
+	u32 	last_max_cwnd;	/* last maximum snd_cwnd */
+	u32	loss_cwnd;	/* congestion window at last loss; 0 until bictcp_recalc_ssthresh() runs */
+	u32	last_cwnd;	/* the last snd_cwnd */
+	u32	last_time;	/* time when updated last_cwnd (tcp_time_stamp) */
+	u32	bic_origin_point;/* origin point of bic function */
+	u32	bic_K;		/* time to origin point from the beginning of the current epoch */
+	u32	delay_min;	/* min delay (msec << 3); 0 = no sample yet */
+	u32	epoch_start;	/* beginning of an epoch (tcp_time_stamp); 0 = no epoch running */
+	u32	ack_cnt;	/* number of acks */
+	u32	tcp_cwnd;	/* estimated tcp cwnd */
+#define ACK_RATIO_SHIFT	4
+#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
+	u16	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+	u8	sample_cnt;	/* number of samples to decide curr_rtt */
+	u8	found;		/* the exit point is found? (mask of HYSTART_* bits) */
+	u32	round_start;	/* beginning of each round (bictcp_clock() value) */
+	u32	end_seq;	/* end_seq of the round (snd_nxt when the round began) */
+	u32	last_ack;	/* last time when the ACK spacing is close (bictcp_clock() value) */
+	u32	curr_rtt;	/* the minimum rtt of current round */
+};
+
+/* Put the CUBIC window-growth state back to its initial values.
+ * The HyStart round fields (round_start, end_seq, last_ack, curr_rtt,
+ * sample_cnt) are not touched here.
+ */
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	/* window history */
+	ca->last_max_cwnd = 0;
+	ca->loss_cwnd = 0;
+	ca->last_cwnd = 0;
+	ca->last_time = 0;
+	ca->tcp_cwnd = 0;
+
+	/* cubic curve of the current epoch */
+	ca->epoch_start = 0;
+	ca->bic_origin_point = 0;
+	ca->bic_K = 0;
+
+	/* ACK accounting; the Packets/ACKs estimate restarts at 2 */
+	ca->cnt = 0;
+	ca->ack_cnt = 0;
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+
+	/* delay tracking and slow start exit detection */
+	ca->delay_min = 0;
+	ca->found = 0;
+}
+
+/* Millisecond clock for HyStart: derived from jiffies when HZ is at
+ * least 1000, otherwise read from the real-time clock.
+ */
+static inline u32 bictcp_clock(void)
+{
+#if HZ >= 1000
+	return jiffies_to_msecs(jiffies);
+#else
+	return ktime_to_ms(ktime_get_real());
+#endif
+}
+
+/* Begin a new HyStart round: stamp its start time, remember where the
+ * round ends (current snd_nxt) and discard the RTT samples collected so far.
+ */
+static inline void bictcp_hystart_reset(struct sock *sk)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 now = bictcp_clock();
+
+	ca->sample_cnt = 0;
+	ca->curr_rtt = 0;
+	ca->end_seq = tcp_sk(sk)->snd_nxt;
+	ca->last_ack = now;
+	ca->round_start = now;
+}
+
+/* .init hook: reset the private state, then either arm HyStart or, when
+ * HyStart is off, apply the initial_ssthresh module parameter if it is set.
+ */
+static void bictcp_init(struct sock *sk)
+{
+	bictcp_reset(inet_csk_ca(sk));
+
+	if (hystart)
+		bictcp_hystart_reset(sk);
+	else if (initial_ssthresh)
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* calculate the cubic root of a using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
+ *
+ * Inputs with fls64(a) < 7, i.e. a in [0..63], are answered from the
+ * table alone, without the Newton-Raphson step.
+ * Returns the approximate cube root as a u32.
+ */
+static u32 cubic_root(u64 a)
+{
+	u32 x, b, shift;
+	/*
+	 * cbrt(x) MSB values for x MSB values in [0..63].
+	 * Precomputed then refined by hand - Willy Tarreau
+	 *
+	 * For x in [0..63],
+	 *   v = cbrt(x << 18) - 1
+	 *   cbrt(x) = (v[x] + 10) >> 6
+	 *
+	 * NOTE(review): the small-input path below adds 35, not 10, before
+	 * the shift; presumably a rounding bias for direct lookups - confirm.
+	 */
+	static const u8 v[] = {
+		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
+		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
+		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
+		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
+		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
+		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
+		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
+		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
+	};
+
+	b = fls64(a);
+	if (b < 7) {
+		/* a in [0..63] */
+		return ((u32)v[(u32)a] + 35) >> 6;
+	}
+
+	b = ((b * 84) >> 8) - 1;	/* 84/256 ~= 1/3 of the bit length, minus one */
+	shift = (a >> (b * 3));	/* leading bits of a, used as the table index */
+
+	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;	/* first estimate: table value scaled by 2^b */
+
+	/*
+	 * Newton-Raphson iteration
+	 *                         2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 *
+	 * NOTE(review): the divisor used below is x * (x - 1) rather than
+	 * x * x; presumably a deliberate bias - confirm before changing.
+	 */
+	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
+	x = ((x * 341) >> 10);	/* 341/1024 ~= 1/3 */
+	return x;
+}
+
+/*
+ * Compute congestion window to use.
+ *
+ * Recomputes ca->cnt, the number of ACKs needed before cwnd grows by one
+ * (the caller hands it to tcp_cong_avoid_ai()), from the cubic curve of the
+ * current epoch and, if tcp_friendliness is set, from an estimate of what
+ * standard TCP would have reached.
+ *
+ * @ca:   private CUBIC state of the connection
+ * @cwnd: current congestion window
+ *
+ * The result is cached: nothing but ack_cnt is updated while cwnd is
+ * unchanged and at most HZ/32 ticks passed since the last computation.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+	u32 delta, bic_target, max_cnt;
+	u64 offs, t;
+
+	ca->ack_cnt++;	/* count the number of ACKs */
+
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+		return;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_time_stamp;
+
+	if (ca->epoch_start == 0) {
+		ca->epoch_start = tcp_time_stamp;	/* record the beginning of an epoch */
+		ca->ack_cnt = 1;			/* start counting */
+		ca->tcp_cwnd = cwnd;			/* sync with cubic */
+
+		if (ca->last_max_cwnd <= cwnd) {
+			ca->bic_K = 0;
+			ca->bic_origin_point = cwnd;
+		} else {
+			/* Compute new K based on
+			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+			 */
+			ca->bic_K = cubic_root(cube_factor
+					       * (ca->last_max_cwnd - cwnd));
+			ca->bic_origin_point = ca->last_max_cwnd;
+		}
+	}
+
+	/* cubic function - calc*/
+	/* calculate c * time^3 / rtt,
+	 *  while considering overflow in calculation of time^3
+	 * (so time^3 is done by using 64 bit)
+	 * and without the support of division of 64bit numbers
+	 * (so all divisions are done by using 32 bit)
+	 *  also NOTE the unit of those variables
+	 *	  time  = (t - K) / 2^bictcp_HZ
+	 *	  c = bic_scale >> 10
+	 * rtt  = (srtt >> 3) / HZ
+	 * !!! The following code does not have overflow problems,
+	 * if the cwnd < 1 million packets !!!
+	 */
+
+	/* Time elapsed in this epoch.  The difference of the two 32 bit
+	 * stamps is taken as a signed 32 bit value before anything wider
+	 * is mixed in, and the scaling below is carried out in 64 bit, so
+	 * neither a wider msecs_to_jiffies() result nor the left shift can
+	 * corrupt the value.
+	 */
+	t = (s32)(tcp_time_stamp - ca->epoch_start);
+	t += msecs_to_jiffies(ca->delay_min >> 3);
+	/* change the unit from HZ to bictcp_HZ */
+	t <<= BICTCP_HZ;
+	do_div(t, HZ);
+
+	if (t < ca->bic_K)		/* t - K */
+		offs = ca->bic_K - t;
+	else
+		offs = t - ca->bic_K;
+
+	/* c/rtt * (t-K)^3 */
+	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+	if (t < ca->bic_K)                                	/* below origin*/
+		bic_target = ca->bic_origin_point - delta;
+	else                                                	/* above origin*/
+		bic_target = ca->bic_origin_point + delta;
+
+	/* cubic function - calc bictcp_cnt*/
+	if (bic_target > cwnd) {
+		ca->cnt = cwnd / (bic_target - cwnd);
+	} else {
+		ca->cnt = 100 * cwnd;              /* very small increment*/
+	}
+
+	/*
+	 * The initial growth of cubic function may be too conservative
+	 * when the available bandwidth is still unknown.
+	 */
+	if (ca->loss_cwnd == 0 && ca->cnt > 20)
+		ca->cnt = 20;	/* increase cwnd 5% per RTT */
+
+	/* TCP Friendly */
+	if (tcp_friendliness) {
+		u32 scale = beta_scale;
+		delta = (cwnd * scale) >> 3;
+		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
+			ca->ack_cnt -= delta;
+			ca->tcp_cwnd++;
+		}
+
+		if (ca->tcp_cwnd > cwnd){	/* if bic is slower than tcp */
+			delta = ca->tcp_cwnd - cwnd;
+			max_cnt = cwnd / delta;
+			if (ca->cnt > max_cnt)
+				ca->cnt = max_cnt;
+		}
+	}
+
+	/* scale by the estimated Packets/ACKs ratio (delayed_ack is << 4) */
+	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+	if (ca->cnt == 0)			/* cannot be zero */
+		ca->cnt = 1;
+}
+
+/* .cong_avoid hook: slow start (restarting the HyStart round once the ACK
+ * passes end_seq) at or below ssthresh, cubic growth above it.
+ */
+static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd > tp->snd_ssthresh) {
+		/* congestion avoidance: one increment per ca->cnt ACKs */
+		bictcp_update(ca, tp->snd_cwnd);
+		tcp_cong_avoid_ai(tp, ca->cnt);
+		return;
+	}
+
+	if (hystart && after(ack, ca->end_seq))
+		bictcp_hystart_reset(sk);
+	tcp_slow_start(tp);
+}
+
+/* .ssthresh hook: close the current epoch, remember the window at which
+ * the loss occurred and return cwnd * beta / BICTCP_BETA_SCALE (at least 2).
+ */
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	ca->epoch_start = 0;	/* end of epoch */
+	ca->loss_cwnd = tp->snd_cwnd;
+
+	/* Wmax and fast convergence: when the window did not get back to
+	 * the previous maximum, remember a value below the current window.
+	 */
+	if (fast_convergence && tp->snd_cwnd < ca->last_max_cwnd)
+		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+			/ (2 * BICTCP_BETA_SCALE);
+	else
+		ca->last_max_cwnd = tp->snd_cwnd;
+
+	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+/* .undo_cwnd hook: the larger of the current window and last_max_cwnd. */
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+	const struct bictcp *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_sk(sk)->snd_cwnd;
+
+	return max(cwnd, ca->last_max_cwnd);
+}
+
+/* .set_state hook: entering TCP_CA_Loss wipes both the cubic state and the
+ * HyStart round; every other state change is ignored.
+ */
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+	if (new_state != TCP_CA_Loss)
+		return;
+
+	bictcp_reset(inet_csk_ca(sk));
+	bictcp_hystart_reset(sk);
+}
+
+/* Run the HyStart detectors on one delay sample (same unit as delay_min)
+ * and leave slow start, by setting snd_ssthresh to snd_cwnd, as soon as a
+ * detector enabled in hystart_detect has fired.
+ */
+static void hystart_update(struct sock *sk, u32 delay)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 now;
+
+	/* an enabled detector already found the exit point */
+	if (ca->found & hystart_detect)
+		return;
+
+	now = bictcp_clock();
+
+	/* first detection parameter - ack-train detection */
+	if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+		ca->last_ack = now;
+		if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
+			ca->found |= HYSTART_ACK_TRAIN;
+	}
+
+	/* second detection parameter - delay increase: judge the round's
+	 * minimum RTT once enough samples were gathered, otherwise keep
+	 * collecting the minimum
+	 */
+	if (ca->sample_cnt >= HYSTART_MIN_SAMPLES) {
+		if (ca->curr_rtt > ca->delay_min +
+		    HYSTART_DELAY_THRESH(ca->delay_min>>4))
+			ca->found |= HYSTART_DELAY;
+	} else {
+		if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+			ca->curr_rtt = delay;
+
+		ca->sample_cnt++;
+	}
+
+	/*
+	 * Either one of two conditions are met,
+	 * we exit from slow start immediately.
+	 */
+	if (ca->found & hystart_detect)
+		tp->snd_ssthresh = tp->snd_cwnd;
+}
+
+/* .pkts_acked hook.
+ *
+ * Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ * then use the RTT sample to maintain delay_min and, while in slow start
+ * with a large enough window, to drive HyStart.
+ *
+ * @cnt:    sample folded into the delayed-ACK ratio
+ * @rtt_us: RTT sample in microseconds; negative when there is none
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 delay;
+
+	if (icsk->icsk_ca_state == TCP_CA_Open) {
+		u32 ratio = ca->delayed_ack;
+
+		ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+		ratio += cnt;
+
+		/* delayed_ack is a divisor in bictcp_update(): keep it at
+		 * least 1 even if the unsigned sum above wrapped to 0
+		 */
+		ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
+	}
+
+	/* Some calls are for duplicates without timestamps */
+	if (rtt_us < 0)
+		return;
+
+	/* Discard delay samples right after fast recovery.
+	 * epoch_start == 0 means no epoch is running; without that check
+	 * the signed difference is negative whenever the high bit of
+	 * tcp_time_stamp is set and every sample would be thrown away.
+	 */
+	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+		return;
+
+	/* usec -> (msec << 3), never 0 because 0 means "no sample" */
+	delay = (rtt_us << 3) / USEC_PER_MSEC;
+	if (delay == 0)
+		delay = 1;
+
+	/* first time call or link delay decreases */
+	if (ca->delay_min == 0 || ca->delay_min > delay)
+		ca->delay_min = delay;
+
+	/* hystart triggers when cwnd is larger than some threshold */
+	if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
+	    tp->snd_cwnd >= hystart_low_window)
+		hystart_update(sk, delay);
+}
+
+/* Operations table for "cubic"; .flags may be amended in cubictcp_register(). */
+static struct tcp_congestion_ops cubictcp __read_mostly = {
+	.name		= "cubic",
+	.owner		= THIS_MODULE,
+	.init		= bictcp_init,
+	.set_state	= bictcp_state,
+	.cong_avoid	= bictcp_cong_avoid,
+	.pkts_acked     = bictcp_acked,
+	.ssthresh	= bictcp_recalc_ssthresh,
+	.undo_cwnd	= bictcp_undo_cwnd,
+};
+
+/* Module init: precompute the scale factors used per packet (for a fixed
+ * SRTT of 100ms) and register the algorithm.  Returns whatever
+ * tcp_register_congestion_control() returns.
+ */
+static int __init cubictcp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+	/* 1024*c/rtt */
+	cube_rtt_scale = (bic_scale * 10);
+
+	beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+
+	/* "K" solves (wmax-cwnd) = c/rtt * K^3,
+	 *  i.e. K = cubic_root( (wmax-cwnd)*rtt/c )
+	 * with K in units of bictcp_HZ=2^10, not HZ, and
+	 *
+	 *  c = bic_scale >> 10
+	 *  rtt = 100ms
+	 *
+	 * the following code has been designed and tested for
+	 * cwnd < 1 million packets
+	 * RTT < 100 seconds
+	 * HZ < 1,000,00  (corresponding to 10 nano-second)
+	 */
+
+	/* start from 2^40, then divide by bic_scale and by the constant
+	 * Srtt (100ms)
+	 */
+	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+	do_div(cube_factor, bic_scale * 10);
+
+	/* hystart needs ms clock resolution */
+	if (HZ < 1000 && hystart)
+		cubictcp.flags |= TCP_CONG_RTT_STAMP;
+
+	return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)	/* module exit hook, wired up by module_exit() below */
+{
+	tcp_unregister_congestion_control(&cubictcp);	/* hand back the table given to tcp_register_congestion_control() */
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.3");
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 00000000..939edb3b
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,57 @@
+/*
+ * tcp_diag.c	Module for monitoring TCP transport protocols sockets.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+
+/* Fill the queue fields of an inet_diag reply for a TCP socket and, when the
+ * caller supplied a buffer, the tcp_info block as well.
+ *
+ * Listening sockets report the accept backlog and its limit; all other
+ * sockets report received-but-uncopied and written-but-unacked sequence
+ * space.
+ */
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *_info)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	if (sk->sk_state != TCP_LISTEN) {
+		r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+		r->idiag_wqueue = tp->write_seq - tp->snd_una;
+	} else {
+		r->idiag_rqueue = sk->sk_ack_backlog;
+		r->idiag_wqueue = sk->sk_max_ack_backlog;
+	}
+
+	if (_info)
+		tcp_get_info(sk, _info);
+}
+
+/* inet_diag handler description for TCP (TCPDIAG_GETSOCK requests). */
+static const struct inet_diag_handler tcp_diag_handler = {
+	.idiag_type	 = TCPDIAG_GETSOCK,
+	.idiag_hashinfo	 = &tcp_hashinfo,
+	.idiag_info_size = sizeof(struct tcp_info),
+	.idiag_get_info	 = tcp_diag_get_info,
+};
+
+static int __init tcp_diag_init(void)	/* module init hook, wired up by module_init() below */
+{
+	return inet_diag_register(&tcp_diag_handler);	/* registration result becomes the init status */
+}
+
+static void __exit tcp_diag_exit(void)	/* module exit hook, wired up by module_exit() below */
+{
+	inet_diag_unregister(&tcp_diag_handler);	/* same handler that tcp_diag_init() registered */
+}
+
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 00000000..30f27f6b
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,187 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner <jheffner@psc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+/* From AIMD tables from RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ *
+ * Each row holds a cwnd bound and the multiplicative-decrease factor md;
+ * the comment on a row is md / 256 rounded to two decimals.
+ * hstcp_cong_avoid() keeps the row index in hstcp.ai so that snd_cwnd lies
+ * between the cwnd of the previous row and the cwnd of the current one,
+ * and adds (ai + 1) to snd_cwnd_cnt per call; hstcp_ssthresh() applies md.
+ */
+static const struct hstcp_aimd_val {
+	unsigned int cwnd;
+	unsigned int md;
+} hstcp_aimd_vals[] = {
+ {     38,  128, /*  0.50 */ },
+ {    118,  112, /*  0.44 */ },
+ {    221,  104, /*  0.41 */ },
+ {    347,   98, /*  0.38 */ },
+ {    495,   93, /*  0.37 */ },
+ {    663,   89, /*  0.35 */ },
+ {    851,   86, /*  0.34 */ },
+ {   1058,   83, /*  0.33 */ },
+ {   1284,   81, /*  0.32 */ },
+ {   1529,   78, /*  0.31 */ },
+ {   1793,   76, /*  0.30 */ },
+ {   2076,   74, /*  0.29 */ },
+ {   2378,   72, /*  0.28 */ },
+ {   2699,   71, /*  0.28 */ },
+ {   3039,   69, /*  0.27 */ },
+ {   3399,   68, /*  0.27 */ },
+ {   3778,   66, /*  0.26 */ },
+ {   4177,   65, /*  0.26 */ },
+ {   4596,   64, /*  0.25 */ },
+ {   5036,   62, /*  0.25 */ },
+ {   5497,   61, /*  0.24 */ },
+ {   5979,   60, /*  0.24 */ },
+ {   6483,   59, /*  0.23 */ },
+ {   7009,   58, /*  0.23 */ },
+ {   7558,   57, /*  0.22 */ },
+ {   8130,   56, /*  0.22 */ },
+ {   8726,   55, /*  0.22 */ },
+ {   9346,   54, /*  0.21 */ },
+ {   9991,   53, /*  0.21 */ },
+ {  10661,   52, /*  0.21 */ },
+ {  11358,   52, /*  0.20 */ },
+ {  12082,   51, /*  0.20 */ },
+ {  12834,   50, /*  0.20 */ },
+ {  13614,   49, /*  0.19 */ },
+ {  14424,   48, /*  0.19 */ },
+ {  15265,   48, /*  0.19 */ },
+ {  16137,   47, /*  0.19 */ },
+ {  17042,   46, /*  0.18 */ },
+ {  17981,   45, /*  0.18 */ },
+ {  18955,   45, /*  0.18 */ },
+ {  19965,   44, /*  0.17 */ },
+ {  21013,   43, /*  0.17 */ },
+ {  22101,   43, /*  0.17 */ },
+ {  23230,   42, /*  0.17 */ },
+ {  24402,   41, /*  0.16 */ },
+ {  25618,   41, /*  0.16 */ },
+ {  26881,   40, /*  0.16 */ },
+ {  28193,   39, /*  0.16 */ },
+ {  29557,   39, /*  0.15 */ },
+ {  30975,   38, /*  0.15 */ },
+ {  32450,   38, /*  0.15 */ },
+ {  33986,   37, /*  0.15 */ },
+ {  35586,   36, /*  0.14 */ },
+ {  37253,   36, /*  0.14 */ },
+ {  38992,   35, /*  0.14 */ },
+ {  40808,   35, /*  0.14 */ },
+ {  42707,   34, /*  0.13 */ },
+ {  44694,   33, /*  0.13 */ },
+ {  46776,   33, /*  0.13 */ },
+ {  48961,   32, /*  0.13 */ },
+ {  51258,   32, /*  0.13 */ },
+ {  53677,   31, /*  0.12 */ },
+ {  56230,   30, /*  0.12 */ },
+ {  58932,   30, /*  0.12 */ },
+ {  61799,   29, /*  0.12 */ },
+ {  64851,   28, /*  0.11 */ },
+ {  68113,   28, /*  0.11 */ },
+ {  71617,   27, /*  0.11 */ },
+ {  75401,   26, /*  0.10 */ },
+ {  79517,   26, /*  0.10 */ },
+ {  84035,   25, /*  0.10 */ },
+ {  89053,   24, /*  0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX	ARRAY_SIZE(hstcp_aimd_vals)	/* number of rows in the AIMD table */
+
+struct hstcp {
+	u32	ai;	/* current row index into hstcp_aimd_vals[]; ai + 1 is the additive increase */
+};
+
+/* .init hook: start at the first AIMD row and bound the window clamp. */
+static void hstcp_init(struct sock *sk)
+{
+	/* largest clamp for which the MD arithmetic still works */
+	const u32 md_safe_clamp = 0xffffffff/128;
+	struct hstcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->ai = 0;
+
+	/* Somewhat pedantic, since a cwnd this large is not expected
+	 * to be seen. :)
+	 */
+	if (tp->snd_cwnd_clamp > md_safe_clamp)
+		tp->snd_cwnd_clamp = md_safe_clamp;
+}
+
+/* .cong_avoid hook: slow start at or below ssthresh, otherwise additive
+ * increase by a(w) = ai + 1 taken from the AIMD table.
+ */
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		tcp_slow_start(tp);
+		return;
+	}
+
+	/* Update AIMD parameters so that, whenever the table allows it,
+	 *     hstcp_aimd_vals[ca->ai-1].cwnd <
+	 *     snd_cwnd <=
+	 *     hstcp_aimd_vals[ca->ai].cwnd
+	 */
+	if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+		/* move up, stopping at the last row */
+		while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+		       ca->ai < HSTCP_AIMD_MAX - 1)
+			ca->ai++;
+	} else {
+		/* move down while the row below still covers snd_cwnd */
+		while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+			ca->ai--;
+	}
+
+	/* No additive increase once the clamp is reached. */
+	if (tp->snd_cwnd >= tp->snd_cwnd_clamp)
+		return;
+
+	/* cwnd = cwnd + a(w) / cwnd */
+	tp->snd_cwnd_cnt += ca->ai + 1;
+	if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+		tp->snd_cwnd_cnt -= tp->snd_cwnd;
+		tp->snd_cwnd++;
+	}
+}
+
+/* .ssthresh hook: multiplicative decrease by the md factor (scaled <<8) of
+ * the current AIMD row; the result is never below 2.
+ */
+static u32 hstcp_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct hstcp *ca = inet_csk_ca(sk);
+	u32 decrease = (tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8;
+
+	return max(tp->snd_cwnd - decrease, 2U);
+}
+
+
+/* Operations table for "highspeed"; min_cwnd is shared with reno. */
+static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
+	.name		= "highspeed",
+	.owner		= THIS_MODULE,
+
+	.init		= hstcp_init,
+	.cong_avoid	= hstcp_cong_avoid,
+	.ssthresh	= hstcp_ssthresh,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+
+static int __init hstcp_register(void)	/* module init hook, wired up by module_init() below */
+{
+	BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);	/* build-time check of the private state size */
+	return tcp_register_congestion_control(&tcp_highspeed);	/* registration result becomes the init status */
+}
+
+static void __exit hstcp_unregister(void)	/* module exit hook, wired up by module_exit() below */
+{
+	tcp_unregister_congestion_control(&tcp_highspeed);	/* same table that hstcp_register() registered */
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 00000000..c1a81753
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,315 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ *   "H-TCP: TCP for high-speed and long-distance networks"
+ *   Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE	(1<<7)	/* 1.0 with shift << 7 (= 128) */
+#define BETA_MIN	(1<<6)	/* 0.5 with shift << 7 (= 64) */
+#define BETA_MAX	102	/* 0.8 with shift << 7 (102/128) */
+
+static int use_rtt_scaling __read_mostly = 1;	/* consulted by htcp_alpha_update() */
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch __read_mostly = 1;	/* consulted by measure_achieved_throughput() and htcp_beta_update() */
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+	u32	alpha;		/* Fixed point arith, << 7 */
+	u8	beta;           /* Fixed point arith, << 7 */
+	u8	modeswitch;	/* Delay modeswitch
+				   until we had at least one congestion event */
+	u16	pkts_acked;	/* pkts_acked of the latest callback seen in TCP_CA_Open; htcp_cong_avoid() resets it to 1 */
+	u32	packetcount;	/* packets acked since lasttime */
+	u32	minRTT;		/* smallest RTT seen, in jiffies; 0 = no sample yet */
+	u32	maxRTT;		/* largest RTT accepted by measure_rtt(), in jiffies */
+	u32	last_cong;	/* jiffies stamp taken at the last congestion event (see htcp_reset()) */
+	u32	undo_last_cong;	/* last_cong saved by htcp_reset(); 0 = nothing to undo */
+
+	u32	undo_maxRTT;	/* maxRTT saved by htcp_reset() */
+	u32	undo_old_maxB;	/* old_maxB saved by htcp_reset() */
+
+	/* Bandwidth estimation (packets per second: packetcount * HZ / elapsed jiffies) */
+	u32	minB;
+	u32	maxB;
+	u32	old_maxB;	/* maxB as of the previous htcp_beta_update() */
+	u32	Bi;		/* smoothed estimate: (3 * Bi + sample) / 4 */
+	u32	lasttime;	/* tcp_time_stamp at which the current measurement began */
+};
+
+static inline u32 htcp_cong_time(const struct htcp *ca)
+{
+	return jiffies - ca->last_cong;	/* jiffies elapsed since last_cong was stamped */
+}
+
+static inline u32 htcp_ccount(const struct htcp *ca)
+{
+	return htcp_cong_time(ca) / ca->minRTT;	/* elapsed time in units of minRTT; callers must ensure minRTT != 0 */
+}
+
+/* Note a congestion event: snapshot the values htcp_cwnd_undo() restores,
+ * then stamp the event with the current jiffies.
+ */
+static inline void htcp_reset(struct htcp *ca)
+{
+	ca->undo_old_maxB = ca->old_maxB;
+	ca->undo_maxRTT = ca->maxRTT;
+	ca->undo_last_cong = ca->last_cong;
+
+	ca->last_cong = jiffies;
+}
+
+/* .undo_cwnd hook: restore the state saved by htcp_reset(), if any, and
+ * return the larger of the current window and ssthresh scaled back by beta.
+ */
+static u32 htcp_cwnd_undo(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+	u32 unscaled_ssthresh;
+
+	if (ca->undo_last_cong) {
+		ca->old_maxB = ca->undo_old_maxB;
+		ca->maxRTT = ca->undo_maxRTT;
+		ca->last_cong = ca->undo_last_cong;
+		ca->undo_last_cong = 0;	/* the snapshot is consumed */
+	}
+
+	/* beta is << 7 fixed point, hence the shift before dividing */
+	unscaled_ssthresh = (tp->snd_ssthresh << 7) / ca->beta;
+	return max(tp->snd_cwnd, unscaled_ssthresh);
+}
+
+/* Fold one RTT sample (in jiffies) into minRTT and, while the connection is
+ * in TCP_CA_Open, into maxRTT.
+ */
+static inline void measure_rtt(struct sock *sk, u32 srtt)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+
+	/* minimum RTT seen so far; minRTT is zero before the first sample */
+	if (!ca->minRTT || srtt < ca->minRTT)
+		ca->minRTT = srtt;
+
+	if (icsk->icsk_ca_state != TCP_CA_Open)
+		return;
+
+	/* max RTT: never below minRTT, and raised only by samples that
+	 * exceed it by at most 20ms
+	 */
+	if (ca->maxRTT < ca->minRTT)
+		ca->maxRTT = ca->minRTT;
+
+	if (srtt > ca->maxRTT &&
+	    srtt <= ca->maxRTT + msecs_to_jiffies(20))
+		ca->maxRTT = srtt;
+}
+
+/* .pkts_acked hook: record the ACK size, feed the RTT sample (microseconds,
+ * converted to jiffies) to measure_rtt() and, when the bandwidth switch is
+ * enabled, maintain the achieved-throughput estimates minB/maxB/Bi.
+ */
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+	u32 now = tcp_time_stamp;
+	__u32 cur_Bi;
+
+	if (icsk->icsk_ca_state == TCP_CA_Open)
+		ca->pkts_acked = pkts_acked;
+
+	if (rtt > 0)
+		measure_rtt(sk, usecs_to_jiffies(rtt));
+
+	if (!use_bandwidth_switch)
+		return;
+
+	/* achieved throughput calculations: outside Open/Disorder the
+	 * measurement simply restarts
+	 */
+	if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
+		ca->packetcount = 0;
+		ca->lasttime = now;
+		return;
+	}
+
+	ca->packetcount += pkts_acked;
+
+	/* Keep accumulating until about a window of packets was acked over
+	 * at least one (known) minRTT.
+	 */
+	if (ca->packetcount < tp->snd_cwnd - (ca->alpha >> 7 ? : 1) ||
+	    now - ca->lasttime < ca->minRTT ||
+	    ca->minRTT == 0)
+		return;
+
+	cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
+
+	if (htcp_ccount(ca) <= 3) {
+		/* just after backoff */
+		ca->minB = ca->maxB = ca->Bi = cur_Bi;
+	} else {
+		ca->Bi = (3 * ca->Bi + cur_Bi) / 4;
+		if (ca->Bi > ca->maxB)
+			ca->maxB = ca->Bi;
+		if (ca->minB > ca->maxB)
+			ca->minB = ca->maxB;
+	}
+
+	/* start the next measurement */
+	ca->packetcount = 0;
+	ca->lasttime = now;
+}
+
+/* Choose the backoff factor beta (<< 7 fixed point).
+ *
+ * With the bandwidth switch on, beta falls back to BETA_MIN whenever
+ * 5 * maxB is outside [4 * old_maxB, 6 * old_maxB].  Otherwise beta is
+ * minRTT / maxRTT limited to [BETA_MIN, BETA_MAX], but only once the mode
+ * switch is armed and minRTT exceeds 10ms.
+ */
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+	if (use_bandwidth_switch) {
+		u32 prev_maxB = ca->old_maxB;
+		u32 cur_maxB = ca->maxB;
+
+		ca->old_maxB = cur_maxB;
+
+		if (!between(5 * cur_maxB, 4 * prev_maxB, 6 * prev_maxB)) {
+			ca->modeswitch = 0;
+			ca->beta = BETA_MIN;
+			return;
+		}
+	}
+
+	if (!ca->modeswitch || minRTT <= msecs_to_jiffies(10) || !maxRTT) {
+		/* not adaptive (yet): arm the mode switch for next time */
+		ca->modeswitch = 1;
+		ca->beta = BETA_MIN;
+		return;
+	}
+
+	ca->beta = (minRTT << 7) / maxRTT;
+	if (ca->beta < BETA_MIN)
+		ca->beta = BETA_MIN;
+	else if (ca->beta > BETA_MAX)
+		ca->beta = BETA_MAX;
+}
+
+/* Recompute the increase factor alpha (<< 7 fixed point) from the time
+ * since the last congestion event, optionally scaled by minRTT, and from
+ * the current beta.  alpha never ends up 0.
+ */
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+	u32 min_rtt = ca->minRTT;
+	u32 elapsed = htcp_cong_time(ca);
+	u32 factor = 1;
+
+	/* beyond the first second the factor grows with the excess time */
+	if (elapsed > HZ) {
+		u32 over = elapsed - HZ;
+
+		factor = 1 + (10 * over + ((over / 2) * (over / 2) / HZ)) / HZ;
+	}
+
+	if (use_rtt_scaling && min_rtt) {
+		u32 scale = (HZ << 3) / (10 * min_rtt);
+
+		/* clamping ratio to interval [0.5,10]<<3 */
+		scale = min(max(scale, 1U << 2), 10U << 3);
+		factor = (factor << 3) / scale;
+		if (!factor)
+			factor = 1;
+	}
+
+	ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
+	if (!ca->alpha)
+		ca->alpha = ALPHA_BASE;
+}
+
+/*
+ * Refresh beta and alpha, then age maxRTT.
+ *
+ * Even once there is rtt data to calculate beta from, we prefer to wait one
+ * rtt before adjusting beta, so that we work from consistent data.
+ *
+ * This is meant to be called on a congestion event, since only then do we
+ * have a real sense of maxRTT (the queues en route were getting just too
+ * full).
+ */
+static void htcp_param_update(struct sock *sk)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+	const u32 rtt_lo = ca->minRTT;
+	const u32 rtt_hi = ca->maxRTT;
+
+	htcp_beta_update(ca, rtt_lo, rtt_hi);
+	htcp_alpha_update(ca);
+
+	if (rtt_lo == 0 || rtt_hi <= rtt_lo)
+		return;
+
+	/* slowly fading memory for maxRTT, to accommodate routing changes */
+	ca->maxRTT = rtt_lo + ((rtt_hi - rtt_lo) * 95) / 100;
+}
+
+/* .ssthresh hook: update alpha/beta, then return cwnd * beta (beta being
+ * << 7 fixed point), never less than 2.
+ */
+static u32 htcp_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct htcp *ca = inet_csk_ca(sk);
+	u32 backed_off;
+
+	htcp_param_update(sk);
+
+	backed_off = (tp->snd_cwnd * ca->beta) >> 7;
+	return max(backed_off, 2U);
+}
+
+/* .cong_avoid hook: slow start at or below ssthresh; above it the window
+ * grows by one each time the alpha-weighted ACK counter reaches cwnd.
+ */
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		tcp_slow_start(tp);
+		return;
+	}
+
+	/* In dangerous area, increase slowly.
+	 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
+	 */
+	if ((tp->snd_cwnd_cnt * ca->alpha)>>7 < tp->snd_cwnd) {
+		tp->snd_cwnd_cnt += ca->pkts_acked;
+	} else {
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+		tp->snd_cwnd_cnt = 0;
+		htcp_alpha_update(ca);
+	}
+
+	ca->pkts_acked = 1;
+}
+
+/* .init hook: zero the private state, then set the non-zero defaults. */
+static void htcp_init(struct sock *sk)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+
+	memset(ca, 0, sizeof(*ca));
+
+	ca->last_cong = jiffies;
+	ca->pkts_acked = 1;
+	ca->beta = BETA_MIN;
+	ca->alpha = ALPHA_BASE;
+}
+
+/* .set_state hook.
+ * Back in TCP_CA_Open with an unconsumed undo snapshot: restamp last_cong
+ * and drop the snapshot.  Entering CWR, Recovery or Loss: note a congestion
+ * event.  Any other state is ignored.
+ */
+static void htcp_state(struct sock *sk, u8 new_state)
+{
+	struct htcp *ca = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Open) {
+		if (ca->undo_last_cong) {
+			ca->last_cong = jiffies;
+			ca->undo_last_cong = 0;
+		}
+	} else if (new_state == TCP_CA_CWR ||
+		   new_state == TCP_CA_Recovery ||
+		   new_state == TCP_CA_Loss) {
+		htcp_reset(ca);
+	}
+}
+
+/* Operations table for "htcp". */
+static struct tcp_congestion_ops htcp __read_mostly = {
+	.name		= "htcp",
+	.owner		= THIS_MODULE,
+	.init		= htcp_init,
+	.set_state	= htcp_state,
+	.cong_avoid	= htcp_cong_avoid,
+	.pkts_acked	= measure_achieved_throughput,
+	.ssthresh	= htcp_recalc_ssthresh,
+	.undo_cwnd	= htcp_cwnd_undo,
+};
+
+static int __init htcp_register(void)	/* module init hook, wired up by module_init() below */
+{
+	BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);	/* build-time check of the private state size */
+	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);	/* the beta limits must form a non-empty range */
+	return tcp_register_congestion_control(&htcp);	/* registration result becomes the init status */
+}
+
+static void __exit htcp_unregister(void)	/* module exit hook, wired up by module_exit() below */
+{
+	tcp_unregister_congestion_control(&htcp);	/* same table that htcp_register() registered */
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 00000000..fe3ecf48
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,192 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ *   C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ *   for Heterogeneous Networks",
+ *   International Journal on satellite Communications,
+ *				       September 2004
+ *    Daniele Lacamera
+ *    root at danielinux.net
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+	u8    hybla_en;
+	u32   snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
+	u32   rho;	      /* Rho parameter, integer part  */
+	u32   rho2;	      /* Rho * Rho, integer part */
+	u32   rho_3ls;	      /* Rho parameter, <<3 */
+	u32   rho2_7ls;	      /* Rho^2, <<7	*/
+	u32   minrtt;	      /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
+   expressed in milliseconds */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
+
+/* Refresh rho from srtt; rtt0 (ms) is writable, so clamp it to >= 1 jiffy. */
+static inline void hybla_recalc_param (struct sock *sk)
+{
+	struct hybla *ca = inet_csk_ca(sk);
+	u32 rtt0_jiffies = max_t(u32, msecs_to_jiffies(rtt0), 1);
+
+	ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / rtt0_jiffies, 8);
+	ca->rho = ca->rho_3ls >> 3;
+	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+	ca->rho2 = ca->rho2_7ls >>7;
+}
+
+static void hybla_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
+
+	ca->rho = 0;
+	ca->rho2 = 0;
+	ca->rho_3ls = 0;
+	ca->rho2_7ls = 0;
+	ca->snd_cwnd_cents = 0;
+	ca->hybla_en = 1;
+	tp->snd_cwnd = 2;
+	tp->snd_cwnd_clamp = 65535;
+
+	/* 1st Rho measurement based on initial srtt */
+	hybla_recalc_param(sk);
+
+	/* set minimum rtt as this is the 1st ever seen */
+	ca->minrtt = tp->srtt;
+	tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct sock *sk, u8 ca_state)
+{
+	struct hybla *ca = inet_csk_ca(sk);
+	ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+	/* floor(128 * 2^(i/8)) for i = 0..7, i.e. 2^fraction kept <<7 */
+	static const u32 pow2q7[] = { 128, 139, 152, 165, 181, 197, 215, 234 };
+
+	/* Indexes beyond the table fall back to 128, the entry for 2^0. */
+	return odds >= ARRAY_SIZE(pow2q7) ? 128 : pow2q7[odds];
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ *     o Recalc Hybla parameters if min_rtt has changed
+ *     o Give cwnd a new value based on the model proposed
+ *     o remember increments <1
+ */
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
+	u32 increment, odd, rho_fractions;
+	int is_slowstart = 0;
+
+	/*  Recalculate rho only if this srtt is the lowest */
+	if (tp->srtt < ca->minrtt){
+		hybla_recalc_param(sk);
+		ca->minrtt = tp->srtt;
+	}
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (!ca->hybla_en) {
+		tcp_reno_cong_avoid(sk, ack, in_flight);
+		return;
+	}
+
+	if (ca->rho == 0)
+		hybla_recalc_param(sk);
+
+	rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+	if (tp->snd_cwnd < tp->snd_ssthresh) {
+		/*
+		 * slow start
+		 *      INC = 2^RHO - 1
+		 * This is done by splitting the rho parameter
+		 * into 2 parts: an integer part and a fraction part.
+		 * Increment<<7 is estimated by doing:
+		 *	       [2^(int+fract)]<<7
+		 * that is equal to:
+		 *	       (2^int)	*  [(2^fract) <<7]
+		 * 2^int is computed directly as 1<<int,
+		 * while we will use hybla_fraction() to
+		 * calculate 2^fract in a <<7 value.
+		 */
+		is_slowstart = 1;
+		increment = ((1 << min(ca->rho, 16U)) *
+			hybla_fraction(rho_fractions)) - 128;
+	} else {
+		/*
+		 * congestion avoidance
+		 * INC = RHO^2 / W
+		 * as long as increment is estimated as (rho^2 << 7)/window
+		 * it already is <<7 and we can easily count its fractions.
+		 */
+		increment = ca->rho2_7ls / tp->snd_cwnd;
+		if (increment < 128)
+			tp->snd_cwnd_cnt++;
+	}
+
+	odd = increment % 128;
+	tp->snd_cwnd += increment >> 7;
+	ca->snd_cwnd_cents += odd;
+
+	/* check when fractions goes >=128 and increase cwnd by 1. */
+	while (ca->snd_cwnd_cents >= 128) {
+		tp->snd_cwnd++;
+		ca->snd_cwnd_cents -= 128;
+		tp->snd_cwnd_cnt = 0;
+	}
+	/* check when cwnd has not been incremented for a while */
+	if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+		tp->snd_cwnd++;
+		tp->snd_cwnd_cnt = 0;
+	}
+	/* clamp down slowstart cwnd to ssthresh value. */
+	if (is_slowstart)
+		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla __read_mostly = {
+	.init		= hybla_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.cong_avoid	= hybla_cong_avoid,
+	.set_state	= hybla_state,
+
+	.owner		= THIS_MODULE,
+	.name		= "hybla"
+};
+
+static int __init hybla_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
new file mode 100644
index 00000000..813b43a7
--- /dev/null
+++ b/net/ipv4/tcp_illinois.c
@@ -0,0 +1,356 @@
+/*
+ * TCP Illinois congestion control.
+ * Home page:
+ *	http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ *
+ * The algorithm is described in:
+ * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
+ *  for High-Speed Networks"
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
+ *
+ * Implemented from description in paper and ns-2 simulation.
+ * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <asm/div64.h>
+#include <net/tcp.h>
+
+#define ALPHA_SHIFT	7
+#define ALPHA_SCALE	(1u<<ALPHA_SHIFT)
+#define ALPHA_MIN	((3*ALPHA_SCALE)/10)	/* ~0.3 */
+#define ALPHA_MAX	(10*ALPHA_SCALE)	/* 10.0 */
+#define ALPHA_BASE	ALPHA_SCALE		/* 1.0 */
+#define U32_MAX		((u32)~0U)
+#define RTT_MAX		(U32_MAX / ALPHA_MAX)	/* 3.3 secs */
+
+#define BETA_SHIFT	6
+#define BETA_SCALE	(1u<<BETA_SHIFT)
+#define BETA_MIN	(BETA_SCALE/8)		/* 0.125 */
+#define BETA_MAX	(BETA_SCALE/2)		/* 0.5 */
+#define BETA_BASE	BETA_MAX
+
+static int win_thresh __read_mostly = 15;
+module_param(win_thresh, int, 0);
+MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");
+
+static int theta __read_mostly = 5;
+module_param(theta, int, 0);
+MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
+
+/* TCP Illinois per-connection state (kept in the icsk CA private area) */
+struct illinois {
+	u64	sum_rtt;	/* sum of rtt's measured within last rtt */
+	u16	cnt_rtt;	/* # of rtts measured within last rtt */
+	u32	base_rtt;	/* min of all rtt in usec */
+	u32	max_rtt;	/* max of all rtt in usec */
+	u32	end_seq;	/* right edge of current RTT */
+	u32	alpha;		/* Additive increase (<<ALPHA_SHIFT) */
+	u32	beta;		/* Multiplicative decrease (<<BETA_SHIFT) */
+	u16	acked;		/* # packets acked by current ACK */
+	u8	rtt_above;	/* average rtt has gone above threshold */
+	u8	rtt_low;	/* # of rtts measurements below threshold */
+};
+
+static void rtt_reset(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct illinois *ca = inet_csk_ca(sk);
+
+	ca->end_seq = tp->snd_nxt;
+	ca->cnt_rtt = 0;
+	ca->sum_rtt = 0;
+
+	/* TODO: age max_rtt? */
+}
+
+static void tcp_illinois_init(struct sock *sk)
+{
+	struct illinois *ca = inet_csk_ca(sk);
+
+	ca->alpha = ALPHA_MAX;
+	ca->beta = BETA_BASE;
+	ca->base_rtt = 0x7fffffff;
+	ca->max_rtt = 0;
+
+	ca->acked = 0;
+	ca->rtt_low = 0;
+	ca->rtt_above = 0;
+
+	rtt_reset(sk);
+}
+
+/* Measure RTT for each ack. */
+static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+{
+	struct illinois *ca = inet_csk_ca(sk);
+
+	ca->acked = pkts_acked;
+
+	/* dup ack, no rtt sample */
+	if (rtt < 0)
+		return;
+
+	/* ignore bogus values, this prevents wraparound in alpha math */
+	if (rtt > RTT_MAX)
+		rtt = RTT_MAX;
+
+	/* keep track of minimum RTT seen so far */
+	if (ca->base_rtt > rtt)
+		ca->base_rtt = rtt;
+
+	/* and max */
+	if (ca->max_rtt < rtt)
+		ca->max_rtt = rtt;
+
+	++ca->cnt_rtt;
+	ca->sum_rtt += rtt;
+}
+
+/* Maximum queuing delay */
+static inline u32 max_delay(const struct illinois *ca)
+{
+	return ca->max_rtt - ca->base_rtt;
+}
+
+/* Average queuing delay; caller must ensure cnt_rtt > 0 (the divisor) */
+static inline u32 avg_delay(const struct illinois *ca)
+{
+	u64 t = ca->sum_rtt;
+
+	do_div(t, ca->cnt_rtt);
+	return t - ca->base_rtt;
+}
+
+/*
+ * Compute value of alpha used for additive increase.
+ * If small window then use 1.0, equivalent to Reno.
+ *
+ * For larger windows, adjust based on average delay.
+ * A. If average delay is at minimum (we are uncongested),
+ *    then use large alpha (10.0) to increase faster.
+ * B. If average delay is at maximum (getting congested)
+ *    then use small alpha (0.3)
+ *
+ * The result is a convex window growth curve.
+ */
+static u32 alpha(struct illinois *ca, u32 da, u32 dm)
+{
+	u32 d1 = dm / 100;	/* Low threshold */
+
+	if (da <= d1) {
+		/* If never got out of low delay zone, then use max */
+		if (!ca->rtt_above)
+			return ALPHA_MAX;
+
+		/* Wait for theta (default 5) good RTT's before allowing alpha max.
+		 * This prevents one good RTT from causing sudden window increase.
+		 */
+		if (++ca->rtt_low < theta)
+			return ca->alpha;
+
+		ca->rtt_low = 0;
+		ca->rtt_above = 0;
+		return ALPHA_MAX;
+	}
+
+	ca->rtt_above = 1;
+
+	/*
+	 * Based on:
+	 *
+	 *      (dm - d1) amin amax
+	 * k1 = -------------------
+	 *         amax - amin
+	 *
+	 *       (dm - d1) amin
+	 * k2 = ----------------  - d1
+	 *        amax - amin
+	 *
+	 *             k1
+	 * alpha = ----------
+	 *          k2 + da
+	 */
+
+	dm -= d1;
+	da -= d1;
+	return (dm * ALPHA_MAX) /
+		(dm + (da  * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
+}
+
+/*
+ * Beta used for multiplicative decrease.
+ * For small window sizes returns same value as Reno (0.5)
+ *
+ * If delay is small (at most 10% of max) then beta = 1/8
+ * If delay is large (80% of max or more) then beta = 1/2
+ * In between is a linear function
+ */
+static u32 beta(u32 da, u32 dm)
+{
+	u32 d2, d3;
+
+	d2 = dm / 10;
+	if (da <= d2)
+		return BETA_MIN;
+
+	d3 = (8 * dm) / 10;
+	if (da >= d3 || d3 <= d2)
+		return BETA_MAX;
+
+	/*
+	 * Based on:
+	 *
+	 *       bmin d3 - bmax d2
+	 * k3 = -------------------
+	 *         d3 - d2
+	 *
+	 *       bmax - bmin
+	 * k4 = -------------
+	 *         d3 - d2
+	 *
+	 * b = k3 + k4 da
+	 */
+	return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da)
+		/ (d3 - d2);
+}
+
+/* Update alpha and beta values once per RTT */
+static void update_params(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct illinois *ca = inet_csk_ca(sk);
+
+	if (tp->snd_cwnd < win_thresh) {
+		ca->alpha = ALPHA_BASE;
+		ca->beta = BETA_BASE;
+	} else if (ca->cnt_rtt > 0) {
+		u32 dm = max_delay(ca);
+		u32 da = avg_delay(ca);
+
+		ca->alpha = alpha(ca, da, dm);
+		ca->beta = beta(da, dm);
+	}
+
+	rtt_reset(sk);
+}
+
+/*
+ * In case of loss, reset to default values
+ */
+static void tcp_illinois_state(struct sock *sk, u8 new_state)
+{
+	struct illinois *ca = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Loss) {
+		ca->alpha = ALPHA_BASE;
+		ca->beta = BETA_BASE;
+		ca->rtt_low = 0;
+		ca->rtt_above = 0;
+		rtt_reset(sk);
+	}
+}
+
+/*
+ * Increase window in response to successful acknowledgment.
+ */
+static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct illinois *ca = inet_csk_ca(sk);
+
+	if (after(ack, ca->end_seq))
+		update_params(sk);
+
+	/* RFC2861 only increase cwnd if fully utilized */
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	/* In slow start */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+
+	else {
+		u32 delta;
+
+		/* snd_cwnd_cnt is # of packets since last cwnd increment */
+		tp->snd_cwnd_cnt += ca->acked;
+		ca->acked = 1;
+
+		/* This is close approximation of:
+		 * tp->snd_cwnd += alpha/tp->snd_cwnd
+		*/
+		delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
+		if (delta >= tp->snd_cwnd) {
+			tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
+					   (u32) tp->snd_cwnd_clamp);
+			tp->snd_cwnd_cnt = 0;
+		}
+	}
+}
+
+static u32 tcp_illinois_ssthresh(struct sock *sk)
+{
+	const struct illinois *ca = inet_csk_ca(sk);
+	u32 cwnd = tcp_sk(sk)->snd_cwnd;
+	u32 cut = (cwnd * ca->beta) >> BETA_SHIFT;	/* cwnd * beta */
+
+	return max(cwnd - cut, 2U);	/* floor the result at 2 */
+}
+
+
+/* Export Vegas-style RTT stats via inet_diag; avg stays 0 with no samples. */
+static void tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+	const struct illinois *ca = inet_csk_ca(sk);
+
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		struct tcpvegas_info info = {
+			.tcpv_enabled = 1,
+			.tcpv_rttcnt = ca->cnt_rtt,
+			.tcpv_minrtt = ca->base_rtt,
+		};
+		u64 t = ca->sum_rtt;
+
+		if (info.tcpv_rttcnt > 0) {	/* 0 after rtt_reset() */
+			do_div(t, info.tcpv_rttcnt);
+			info.tcpv_rtt = t;
+		}
+		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+	}
+}
+
+static struct tcp_congestion_ops tcp_illinois __read_mostly = {
+	.flags		= TCP_CONG_RTT_STAMP,
+	.init		= tcp_illinois_init,
+	.ssthresh	= tcp_illinois_ssthresh,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.cong_avoid	= tcp_illinois_cong_avoid,
+	.set_state	= tcp_illinois_state,
+	.get_info	= tcp_illinois_info,
+	.pkts_acked	= tcp_illinois_acked,
+
+	.owner		= THIS_MODULE,
+	.name		= "illinois",
+};
+
+static int __init tcp_illinois_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_illinois);
+}
+
+static void __exit tcp_illinois_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_illinois);
+}
+
+module_init(tcp_illinois_register);
+module_exit(tcp_illinois_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Illinois");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 00000000..7410a8c2
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,5963 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ *		Pedro Roque	:	Fast Retransmit/Recovery.
+ *					Two receive queues.
+ *					Retransmit queue handled by TCP.
+ *					Better retransmit timer handling.
+ *					New congestion avoidance.
+ *					Header prediction.
+ *					Variable renaming.
+ *
+ *		Eric		:	Fast Retransmit.
+ *		Randy Scott	:	MSS option defines.
+ *		Eric Schenk	:	Fixes to slow start algorithm.
+ *		Eric Schenk	:	Yet another double ACK bug.
+ *		Eric Schenk	:	Delayed ACK bug fixes.
+ *		Eric Schenk	:	Floyd style fast retrans war avoidance.
+ *		David S. Miller	:	Don't allow zero congestion window.
+ *		Eric Schenk	:	Fix retransmitter so that it sends
+ *					next packet on ack of previous packet.
+ *		Andi Kleen	:	Moved open_request checking here
+ *					and process RSTs for open_requests.
+ *		Andi Kleen	:	Better prune_queue, and other fixes.
+ *		Andrey Savochkin:	Fix RTT measurements in the presence of
+ *					timestamps.
+ *		Andrey Savochkin:	Check sequence numbers correctly when
+ *					removing SACKs due to in sequence incoming
+ *					data segments.
+ *		Andi Kleen:		Make sure we never ack data there is not
+ *					enough room for. Also make this condition
+ *					a fatal error if it might still happen.
+ *		Andi Kleen:		Add tcp_measure_rcv_mss to make
+ *					connections with MSS<min(MTU,ann. MSS)
+ *					work without delayed acks.
+ *		Andi Kleen:		Process packets with PSH set in the
+ *					fast path.
+ *		J Hadi Salim:		ECN support
+ *	 	Andrei Gurtov,
+ *		Pasi Sarolahti,
+ *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
+ *					engine. Lots of bugs are found.
+ *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/kernel.h>
+#include <net/dst.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/ipsec.h>
+#include <asm/unaligned.h>
+#include <net/netdma.h>
+
+int sysctl_tcp_timestamps __read_mostly = 1;
+int sysctl_tcp_window_scaling __read_mostly = 1;
+int sysctl_tcp_sack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
+int sysctl_tcp_ecn __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_ecn);
+int sysctl_tcp_dsack __read_mostly = 1;
+int sysctl_tcp_app_win __read_mostly = 31;
+int sysctl_tcp_adv_win_scale __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
+
+int sysctl_tcp_stdurg __read_mostly;
+int sysctl_tcp_rfc1337 __read_mostly;
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
+int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_frto_response __read_mostly;
+int sysctl_tcp_nometrics_save __read_mostly;
+
+int sysctl_tcp_thin_dupack __read_mostly;
+
+int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+int sysctl_tcp_abc __read_mostly;
+
+#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
+#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
+#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
+#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
+#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
+#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
+#define FLAG_ECE		0x40 /* ECE in this ACK				*/
+#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		*/
+#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
+#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */
+#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
+
+#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
+#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
+
+#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+
+/* Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ */
+static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const unsigned int lss = icsk->icsk_ack.last_seg_size;
+	unsigned int len;
+
+	icsk->icsk_ack.last_seg_size = 0;
+
+	/* skb->len may jitter because of SACKs, even if peer
+	 * sends good full-sized frames.
+	 */
+	len = skb_shinfo(skb)->gso_size ? : skb->len;
+	if (len >= icsk->icsk_ack.rcv_mss) {
+		icsk->icsk_ack.rcv_mss = len;
+	} else {
+		/* Otherwise, we make more careful check taking into account,
+		 * that SACKs block is variable.
+		 *
+		 * "len" is invariant segment length, including TCP header.
+		 */
+		len += skb->data - skb_transport_header(skb);
+		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
+		    /* If PSH is not set, packet should be
+		     * full sized, provided peer TCP is not badly broken.
+		     * This observation (if it is correct 8)) allows
+		     * to handle super-low mtu links fairly.
+		     */
+		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
+		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
+			/* Subtract also invariant (if peer is RFC compliant),
+			 * tcp header plus fixed timestamp option length.
+			 * Resulting "len" is MSS free of SACK jitter.
+			 */
+			len -= tcp_sk(sk)->tcp_header_len;
+			icsk->icsk_ack.last_seg_size = len;
+			if (len == lss) {
+				icsk->icsk_ack.rcv_mss = len;
+				return;
+			}
+		}
+		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
+		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
+	}
+}
+
+static void tcp_incr_quickack(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
+
+	if (quickacks == 0)
+		quickacks = 2;
+	if (quickacks > icsk->icsk_ack.quick)
+		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+}
+
+static void tcp_enter_quickack_mode(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	tcp_incr_quickack(sk);
+	icsk->icsk_ack.pingpong = 0;
+	icsk->icsk_ack.ato = TCP_ATO_MIN;
+}
+
+/* Send ACKs quickly, if "quick" count is not exhausted
+ * and the session is not interactive.
+ */
+
+static inline int tcp_in_quickack_mode(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
+}
+
+static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+{
+	if (tp->ecn_flags & TCP_ECN_OK)
+		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if (tcp_hdr(skb)->cwr)
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+{
+	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if (tp->ecn_flags & TCP_ECN_OK) {
+		if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
+			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		/* Funny extension: if ECT is not set on a segment,
+		 * it is surely retransmit. It is not in ECN RFC,
+		 * but Linux follows this rule. */
+		else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
+			tcp_enter_quickack_mode((struct sock *)tp);
+	}
+}
+
+static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
+{
+	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
+		tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
+{
+	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
+		tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
+{
+	/* Report ECE only for non-SYN segments while TCP_ECN_OK is set. */
+	return th->ece && !th->syn &&
+	       (tp->ecn_flags & TCP_ECN_OK) != 0;
+}
+
+/* Buffer size and advertised window tuning.
+ *
+ * 1. Tuning sk->sk_sndbuf, when connection enters established state.
+ */
+
+static void tcp_fixup_sndbuf(struct sock *sk)
+{
+	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
+		     sizeof(struct sk_buff);
+
+	if (sk->sk_sndbuf < 3 * sndmem) {
+		sk->sk_sndbuf = 3 * sndmem;
+		if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
+			sk->sk_sndbuf = sysctl_tcp_wmem[2];
+	}
+}
+
+/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+ *
+ * All tcp_full_space() is split to two parts: "network" buffer, allocated
+ * forward and advertised in receiver window (tp->rcv_wnd) and
+ * "application buffer", required to isolate scheduling/application
+ * latencies from network.
+ * window_clamp is maximal advertised window. It can be less than
+ * tcp_full_space(), in this case tcp_full_space() - window_clamp
+ * is reserved for "application" buffer. The less window_clamp is
+ * the smoother our behaviour from viewpoint of network, but the lower
+ * throughput and the higher sensitivity of the connection to losses. 8)
+ *
+ * rcv_ssthresh is more strict window_clamp used at "slow start"
+ * phase to predict further behaviour of this connection.
+ * It is used for two goals:
+ * - to enforce header prediction at sender, even when application
+ *   requires some significant "application buffer". It is check #1.
+ * - to prevent pruning of receive queue because of misprediction
+ *   of receiver window. Check #2.
+ *
+ * The scheme does not work when sender sends good segments opening
+ * window and then starts to feed us spaghetti. But it should work
+ * in common situations. Otherwise, we have to rely on queue collapsing.
+ */
+
+/* Slow part of check#2. */
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Optimize this! */
+	int truesize = tcp_win_from_space(skb->truesize) >> 1;
+	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
+
+	while (tp->rcv_ssthresh <= window) {
+		if (truesize <= skb->len)
+			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
+
+		truesize >>= 1;
+		window >>= 1;
+	}
+	return 0;
+}
+
+static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Check #1 */
+	if (tp->rcv_ssthresh < tp->window_clamp &&
+	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
+	    !tcp_memory_pressure) {
+		int incr;
+
+		/* Check #2. Increase window, if skb with such overhead
+		 * will fit to rcvbuf in future.
+		 */
+		if (tcp_win_from_space(skb->truesize) <= skb->len)
+			incr = 2 * tp->advmss;
+		else
+			incr = __tcp_grow_window(sk, skb);
+
+		if (incr) {
+			incr = max_t(int, incr, 2 * skb->len);
+			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
+					       tp->window_clamp);
+			inet_csk(sk)->icsk_ack.quick |= 1;
+		}
+	}
+}
+
+/* 3. Tuning rcvbuf, when connection enters established state. */
+
+static void tcp_fixup_rcvbuf(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+
+	/* Try to select rcvbuf so that 4 mss-sized segments
+	 * will fit to window and corresponding skbs will fit to our rcvbuf.
+	 * (was 3; 4 is minimum to allow fast retransmit to work.)
+	 */
+	while (tcp_win_from_space(rcvmem) < tp->advmss)
+		rcvmem += 128;
+	if (sk->sk_rcvbuf < 4 * rcvmem)
+		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
+}
+
+/* 4. Try to fixup all. It is made immediately after connection enters
+ *    established state.
+ */
+static void tcp_init_buffer_space(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int maxwin;
+
+	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		tcp_fixup_rcvbuf(sk);
+	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
+		tcp_fixup_sndbuf(sk);
+
+	tp->rcvq_space.space = tp->rcv_wnd;
+
+	maxwin = tcp_full_space(sk);
+
+	if (tp->window_clamp >= maxwin) {
+		tp->window_clamp = maxwin;
+
+		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+			tp->window_clamp = max(maxwin -
+					       (maxwin >> sysctl_tcp_app_win),
+					       4 * tp->advmss);
+	}
+
+	/* Force reservation of one segment. */
+	if (sysctl_tcp_app_win &&
+	    tp->window_clamp > 2 * tp->advmss &&
+	    tp->window_clamp + tp->advmss > maxwin)
+		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+
+	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* 5. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_ack.quick = 0;
+
+	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+	    !tcp_memory_pressure &&
+	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+				    sysctl_tcp_rmem[2]);
+	}
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
+}
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is our guess about the MSS used by the peer.
+ * We do not have any direct information about that MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACK less frequently than needed.
+ * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+	hint = min(hint, tp->rcv_wnd / 2);
+	hint = min(hint, TCP_MSS_DEFAULT);
+	hint = max(hint, TCP_MIN_MSS);
+
+	inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
+ *
+ * More detail on this code can be found at
+ * <http://staff.psc.edu/jheffner/>,
+ * though this reference is out of date.  A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
+{
+	u32 new_sample = tp->rcv_rtt_est.rtt;
+	long m = sample;
+
+	if (m == 0)
+		m = 1;
+
+	if (new_sample != 0) {
+		/* If we sample in larger samples in the non-timestamp
+		 * case, we could grossly overestimate the RTT especially
+		 * with chatty applications or bulk transfer apps which
+		 * are stalled on filesystem I/O.
+		 *
+		 * Also, since we are only going for a minimum in the
+		 * non-timestamp case, we do not smooth things out
+		 * else with timestamps disabled convergence takes too
+		 * long.
+		 */
+		if (!win_dep) {
+			m -= (new_sample >> 3);
+			new_sample += m;
+		} else {
+			m <<= 3;
+			if (m < new_sample)
+				new_sample = m;
+		}
+	} else {
+		/* No previous measure. */
+		new_sample = m << 3;
+	}
+
+	if (tp->rcv_rtt_est.rtt != new_sample)
+		tp->rcv_rtt_est.rtt = new_sample;
+}
+
+static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
+{
+	if (tp->rcv_rtt_est.time == 0)
+		goto new_measure;
+	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+		return;
+	tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
+
+new_measure:
+	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
+	tp->rcv_rtt_est.time = tcp_time_stamp;
+}
+
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
+					  const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	if (tp->rx_opt.rcv_tsecr &&
+	    (TCP_SKB_CB(skb)->end_seq -
+	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
+		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int time;	/* elapsed time of the current measurement */
+	int space;	/* bytes at first, later a buffer size */
+
+	if (tp->rcvq_space.time == 0)	/* first call: only start measuring */
+		goto new_measure;
+
+	time = tcp_time_stamp - tp->rcvq_space.time;
+	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
+		return;	/* less than one receiver RTT (stored <<3), or no RTT yet */
+
+	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);	/* 2x bytes copied */
+
+	space = max(tp->rcvq_space.space, space);	/* estimate never shrinks */
+
+	if (tp->rcvq_space.space != space) {
+		int rcvmem;	/* memory charged for one advmss-sized segment */
+
+		tp->rcvq_space.space = space;
+
+		if (sysctl_tcp_moderate_rcvbuf &&
+		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+			int new_clamp = space;	/* clamp uses the raw byte count */
+
+			/* Receive space grows, normalize in order to
+			 * take into account packet headers and sk_buff
+			 * structure overhead.
+			 */
+			space /= tp->advmss;	/* now a number of segments */
+			if (!space)
+				space = 1;
+			rcvmem = (tp->advmss + MAX_TCP_HEADER +
+				  16 + sizeof(struct sk_buff));
+			while (tcp_win_from_space(rcvmem) < tp->advmss)
+				rcvmem += 128;	/* grow until one advmss fits */
+			space *= rcvmem;	/* segments * per-segment memory */
+			space = min(space, sysctl_tcp_rmem[2]);	/* sysctl cap */
+			if (space > sk->sk_rcvbuf) {	/* sk_rcvbuf is only enlarged */
+				sk->sk_rcvbuf = space;
+
+				/* Make the window clamp follow along.  */
+				tp->window_clamp = new_clamp;
+			}
+		}
+	}
+
+new_measure:	/* restart the measurement from the current copy position */
+	tp->rcvq_space.seq = tp->copied_seq;
+	tp->rcvq_space.time = tcp_time_stamp;
+}
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval.  When a
+ * connection starts up, we want to ack as quickly as possible.  The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission.  This means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time.  For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue.  -DaveM
+ */
+static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 now;
+
+	inet_csk_schedule_ack(sk);
+
+	tcp_measure_rcv_mss(sk, skb);
+
+	tcp_rcv_rtt_measure(tp);
+
+	now = tcp_time_stamp;
+
+	if (!icsk->icsk_ack.ato) {
+		/* The _first_ data packet received, initialize
+		 * delayed ACK engine.
+		 */
+		tcp_incr_quickack(sk);
+		icsk->icsk_ack.ato = TCP_ATO_MIN;
+	} else {
+		int m = now - icsk->icsk_ack.lrcvtime;	/* gap since previous data */
+
+		if (m <= TCP_ATO_MIN / 2) {
+			/* The fastest case is the first. */
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+		} else if (m < icsk->icsk_ack.ato) {	/* gap shorter than ato */
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+			if (icsk->icsk_ack.ato > icsk->icsk_rto)	/* cap at rto */
+				icsk->icsk_ack.ato = icsk->icsk_rto;
+		} else if (m > icsk->icsk_rto) {
+			/* Too long gap. Apparently sender failed to
+			 * restart window, so that we send ACKs quickly.
+			 */
+			tcp_incr_quickack(sk);
+			sk_mem_reclaim(sk);
+		}
+	}
+	icsk->icsk_ack.lrcvtime = now;	/* remembered for the next gap */
+
+	TCP_ECN_check_ce(tp, skb);
+
+	if (skb->len >= 128)	/* window growth is only tried for skbs >= 128 bytes */
+		tcp_grow_window(sk, skb);
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	long m = mrtt; /* new RTT measurement */
+
+	/*	The following amusing code comes from Jacobson's
+	 *	article in SIGCOMM '88.  Note that rtt and mdev
+	 *	are scaled versions of rtt and mean deviation.
+	 *	This is designed to be as fast as possible
+	 *	m stands for "measurement".
+	 *
+	 *	In a 1990 paper the rto value is changed to:
+	 *	RTO = rtt + 4 * mdev
+	 *
+	 * Funny. This algorithm seems to be very broken.
+	 * These formulae increase RTO, when it should be decreased, increase
+	 * too slowly, when it should be increased quickly, decrease too quickly
+	 * etc. I guess in BSD RTO takes ONE value, so that it absolutely
+	 * does not matter how to _calculate_ it. It seems it was a trap
+	 * that VJ failed to avoid. 8)
+	 */
+	if (m == 0)	/* a zero measurement is treated as 1 */
+		m = 1;
+	if (tp->srtt != 0) {
+		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
+		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
+		if (m < 0) {
+			m = -m;		/* m is now abs(error) */
+			m -= (tp->mdev >> 2);   /* similar update on mdev */
+			/* This is similar to one of Eifel findings.
+			 * Eifel blocks mdev updates when rtt decreases.
+			 * This solution is a bit different: we use finer gain
+			 * for mdev in this case (alpha*beta).
+			 * Like Eifel it also prevents growth of rto,
+			 * but also it limits too fast rto decreases,
+			 * happening in pure Eifel.
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (tp->mdev >> 2);   /* similar update on mdev */
+		}
+		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
+		if (tp->mdev > tp->mdev_max) {	/* track the largest mdev seen */
+			tp->mdev_max = tp->mdev;
+			if (tp->mdev_max > tp->rttvar)	/* rttvar rises at once */
+				tp->rttvar = tp->mdev_max;
+		}
+		if (after(tp->snd_una, tp->rtt_seq)) {	/* snd_una passed rtt_seq */
+			if (tp->mdev_max < tp->rttvar)	/* rttvar decays by 1/4 of gap */
+				tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
+			tp->rtt_seq = tp->snd_nxt;	/* next period ends here */
+			tp->mdev_max = tcp_rto_min(sk);	/* restart the maximum */
+		}
+	} else {
+		/* no previous measure. */
+		tp->srtt = m << 3;	/* take the measured time to be rtt */
+		tp->mdev = m << 1;	/* make sure rto = 3*rtt */
+		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+		tp->rtt_seq = tp->snd_nxt;
+	}
+}
+
+/* Calculate rto without backoff.  This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static inline void tcp_set_rto(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	/* Old crap is replaced with new one. 8)
+	 *
+	 * More seriously:
+	 * 1. If rtt variance happened to be less than 50msec, it is hallucination.
+	 *    It cannot be less due to utterly erratic ACK generation made
+	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
+	 *    to do with delayed acks, because at cwnd>2 true delack timeout
+	 *    is invisible. Actually, Linux-2.4 also generates erratic
+	 *    ACKs in some circumstances.
+	 */
+	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);	/* store the raw RTO */
+
+	/* 2. Fixups made earlier cannot be right.
+	 *    If we do not estimate RTO correctly without them,
+	 *    all the algo is pure shit and should be replaced
+	 *    with correct one. It is exactly, which we pretend to do.
+	 */
+
+	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+	 * guarantees that rto is higher.
+	 */
+	tcp_bound_rto(sk);	/* presumably applies the upper bound - see helper */
+}
+
+/* Save the metrics learned by this TCP session into its dst entry.
+ * This function is called only when TCP finishes successfully,
+ * i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (sysctl_tcp_nometrics_save)	/* saving switched off via sysctl */
+		return;
+
+	dst_confirm(dst);
+
+	if (dst && (dst->flags & DST_HOST)) {	/* only dsts flagged DST_HOST */
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		int m;	/* stored RTT minus this session's srtt */
+		unsigned long rtt;	/* RTT currently stored in dst */
+
+		if (icsk->icsk_backoff || !tp->srtt) {
+			/* This session failed to estimate rtt. Why?
+			 * Probably, no packets returned in time.
+			 * Reset our results.
+			 */
+			if (!(dst_metric_locked(dst, RTAX_RTT)))
+				dst_metric_set(dst, RTAX_RTT, 0);
+			return;
+		}
+
+		rtt = dst_metric_rtt(dst, RTAX_RTT);
+		m = rtt - tp->srtt;
+
+		/* If newly calculated rtt larger than stored one,
+		 * store new one. Otherwise, use EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+		 */
+		if (!(dst_metric_locked(dst, RTAX_RTT))) {
+			if (m <= 0)
+				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
+			else	/* move the stored value 1/8 of the way down */
+				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
+		}
+
+		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+			unsigned long var;	/* stored RTTVAR being updated */
+			if (m < 0)
+				m = -m;
+
+			/* Scale deviation to rttvar fixed point */
+			m >>= 1;
+			if (m < tp->mdev)	/* use the larger of the two deviations */
+				m = tp->mdev;
+
+			var = dst_metric_rtt(dst, RTAX_RTTVAR);
+			if (m >= var)	/* rise immediately... */
+				var = m;
+			else		/* ...but decay by 1/4 of the difference */
+				var -= (var - m) >> 2;
+
+			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
+		}
+
+		if (tcp_in_initial_slowstart(tp)) {
+			/* Slow start still did not finish. */
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
+			if (!dst_metric_locked(dst, RTAX_CWND) &&
+			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
+				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
+		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+			   icsk->icsk_ca_state == TCP_CA_Open) {
+			/* Cong. avoidance phase, cwnd is reliable. */
+			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH,
+					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+			if (!dst_metric_locked(dst, RTAX_CWND))	/* average old and new */
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_cwnd) >> 1);
+		} else {
+			/* Else slow start did not finish, cwnd is non-sense,
+			   ssthresh may be also invalid.
+			 */
+			if (!dst_metric_locked(dst, RTAX_CWND))	/* average with ssthresh */
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_ssthresh) >> 1);
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
+		}
+
+		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
+			    tp->reordering != sysctl_tcp_reordering)	/* not the default */
+				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
+		}
+	}
+}
+
+__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
+{	/* Initial cwnd: dst's RTAX_INITCWND if set, else TCP_INIT_CWND. */
+	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
+	if (!cwnd)	/* no dst, or metric not set */
+		cwnd = TCP_INIT_CWND;
+	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);	/* never above the clamp */
+}
+
+/* Set slow start threshold and cwnd not falling to slow start */
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	tp->prior_ssthresh = 0;
+	tp->bytes_acked = 0;
+	if (icsk->icsk_ca_state < TCP_CA_CWR) {	/* not yet in CWR or beyond */
+		tp->undo_marker = 0;
+		if (set_ssthresh)	/* caller asks for a fresh ssthresh */
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tp->snd_cwnd = min(tp->snd_cwnd,	/* cwnd <= in flight + 1 */
+				   tcp_packets_in_flight(tp) + 1U);
+		tp->snd_cwnd_cnt = 0;
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+		TCP_ECN_queue_cwr(tp);
+
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+	}
+}
+
+/*
+ * Packet counting of FACK is based on in-order assumptions, therefore TCP
+ * disables it when reordering is detected
+ */
+static void tcp_disable_fack(struct tcp_sock *tp)
+{
+	/* RFC3517 uses different metric in lost marker => reset on change */
+	if (tcp_is_fack(tp))
+		tp->lost_skb_hint = NULL;
+	tp->rx_opt.sack_ok &= ~2;	/* presumably bit 2 is the FACK-enabled flag */
+}
+
+/* Take note that the peer is sending D-SACKs */
+static void tcp_dsack_seen(struct tcp_sock *tp)
+{
+	tp->rx_opt.sack_ok |= 4;	/* bit value 4 of sack_ok records "D-SACK seen" */
+}
+
+/* Initialize metrics on socket from the values stored in its dst entry. */
+
+static void tcp_init_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (dst == NULL)	/* no dst: nothing stored, use defaults */
+		goto reset;
+
+	dst_confirm(dst);
+
+	if (dst_metric_locked(dst, RTAX_CWND))	/* a locked CWND acts as clamp */
+		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
+	if (dst_metric(dst, RTAX_SSTHRESH)) {
+		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)	/* keep <= clamp */
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	}
+	if (dst_metric(dst, RTAX_REORDERING) &&
+	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+		tcp_disable_fack(tp);
+		tp->reordering = dst_metric(dst, RTAX_REORDERING);
+	}
+
+	if (dst_metric(dst, RTAX_RTT) == 0)	/* no RTT stored for this dst */
+		goto reset;
+
+	if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+		goto reset;	/* no own sample and stored RTT below initial timeout */
+
+	/* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and rtt may appear much
+	 * less than real one. Use per-dst memory
+	 * to make it more realistic.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal circumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
+	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {	/* stored RTT is larger */
+		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
+		tp->rtt_seq = tp->snd_nxt;
+	}
+	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {	/* same for variance */
+		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
+		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
+	}
+	tcp_set_rto(sk);
+	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
+reset:	/* also the target of the early exits above */
+		/* Play conservative. If timestamps are not
+		 * supported, TCP will fail to recalculate correct
+		 * rtt, if initial rto is too small. FORGET ALL AND RESET!
+		 */
+		if (!tp->rx_opt.saw_tstamp && tp->srtt) {
+			tp->srtt = 0;
+			tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
+			inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+		}
+	}
+	tp->snd_cwnd = tcp_init_cwnd(tp, dst);	/* runs on every path */
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_update_reordering(struct sock *sk, const int metric,
+				  const int ts)
+{	/* Raise tp->reordering to @metric (capped) and count the event. */
+	struct tcp_sock *tp = tcp_sk(sk);
+	if (metric > tp->reordering) {	/* the estimate only ever grows here */
+		int mib_idx;	/* which reordering counter to bump */
+
+		tp->reordering = min(TCP_MAX_REORDERING, metric);
+
+		/* This exciting event is worth to be remembered. 8) */
+		if (ts)
+			mib_idx = LINUX_MIB_TCPTSREORDER;
+		else if (tcp_is_reno(tp))
+			mib_idx = LINUX_MIB_TCPRENOREORDER;
+		else if (tcp_is_fack(tp))
+			mib_idx = LINUX_MIB_TCPFACKREORDER;
+		else
+			mib_idx = LINUX_MIB_TCPSACKREORDER;
+
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+#if FASTRETRANS_DEBUG > 1
+		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
+		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
+		       tp->reordering,
+		       tp->fackets_out,
+		       tp->sacked_out,
+		       tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+		tcp_disable_fack(tp);	/* FACK counting assumes in-order delivery */
+	}
+}
+
+/* This must be called before lost_out is incremented */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if ((tp->retransmit_skb_hint == NULL) ||	/* no hint yet, or skb is */
+	    before(TCP_SKB_CB(skb)->seq,		/* earlier than the hint */
+		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+		tp->retransmit_skb_hint = skb;
+
+	if (!tp->lost_out ||	/* first lost skb, or it ends beyond the mark */
+	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
+		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+}
+
+static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
+{	/* Tag skb LOST unless it is already tagged LOST or SACKed. */
+	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+		tcp_verify_retransmit_hint(tp, skb);	/* before lost_out changes */
+
+		tp->lost_out += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+	}
+}
+
+static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
+					    struct sk_buff *skb)
+{	/* As tcp_skb_mark_lost(), but the hints are refreshed unconditionally. */
+	tcp_verify_retransmit_hint(tp, skb);
+
+	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+		tp->lost_out += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+	}
+}
+
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag  InFlight	Description
+ * 0	1		- orig segment is in flight.
+ * S	0		- nothing flies, orig reached receiver.
+ * L	0		- nothing flies, orig lost by net.
+ * R	2		- both orig and retransmit are in flight.
+ * L|R	1		- orig is lost, retransmit is in flight.
+ * S|R  1		- orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ *  but it is equivalent to plain S and code short-circuits it to S.
+ *  L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of one of three flavors:
+ *	A. Scoreboard estimator decided the packet is lost.
+ *	   A'. Reno "three dupacks" marks head of queue lost.
+ *	   A''. Its FACK modification, head until snd.fack is lost.
+ *	B. SACK arrives sacking data transmitted after never retransmitted
+ *	   hole was sent out.
+ *	C. SACK arrives sacking SND.NXT at the moment, when the
+ *	   segment was retransmitted.
+ * 4. D-SACK added new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note, that state diagram turns out to be commutative,
+ * so that we are allowed not to be bothered by order of our actions,
+ * when multiple events arrive simultaneously. (see the function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * Reordering metric is maximal distance, which a packet can be displaced
+ * in packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills old hole and the corresponding segment was not
+ *    ever retransmitted -> reordering. Alas, we cannot use it
+ *    when segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. D-SACK arrives
+ *    for retransmitted and already SACKed segment -> reordering..
+ * Both of these heuristics are not used in Loss state, when we cannot
+ * account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included to the range though being valid because
+ * it means that the receiver is rather inconsistent with itself reporting
+ * SACK reneging when it should advance SND.UNA. Such a SACK block is
+ * perfectly valid, however, in light of RFC2018 which explicitly states
+ * that "SACK block MUST reflect the newest segment.  Even if the newest
+ * segment is going to be discarded ...", not that it looks very clever
+ * in case of head skb. Due to potential receiver driven attacks, we
+ * choose to avoid immediate execution of a walk in write queue due to
+ * reneging and defer head skb's loss recovery to standard loss recovery
+ * procedure that will eventually trigger (nothing forbids us doing this).
+ *
+ * Implements also blockage to start_seq wrap-around. Problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ *         <- outs wnd ->                          <- wrapzone ->
+ *         u     e      n                         u_w   e_w  s n_w
+ *         |     |      |                          |     |   |  |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->|                                           |<--------...
+ * ...---- >2^31 ------>|                                    |<--------...
+ *
+ * Current code wouldn't be vulnerable but it's better still to discard such
+ * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
+ * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
+ * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
+ * equal to the ideal case (infinite seqno space without wrap caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, D-SACK block must not go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends, TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in any way and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
+ */
+static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
+				  u32 start_seq, u32 end_seq)
+{	/* Returns 1 if the block should be processed, 0 if it must be discarded. */
+	/* Too far in future, or reversed (interpretation is ambiguous) */
+	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+		return 0;
+
+	/* Nasty start_seq wrap-around check (see comments above) */
+	if (!before(start_seq, tp->snd_nxt))
+		return 0;
+
+	/* In outstanding window? ...This is valid exit for D-SACKs too.
+	 * start_seq == snd_una is non-sensical (see comments above)
+	 */
+	if (after(start_seq, tp->snd_una))
+		return 1;
+
+	if (!is_dsack || !tp->undo_marker)	/* below snd_una: D-SACK only */
+		return 0;
+
+	/* ...Then it's D-SACK, and must reside below snd_una completely */
+	if (after(end_seq, tp->snd_una))
+		return 0;
+
+	if (!before(start_seq, tp->undo_marker))	/* fully above undo_marker */
+		return 1;
+
+	/* Too old */
+	if (!after(end_seq, tp->undo_marker))
+		return 0;
+
+	/* Undo_marker boundary crossing (overestimates a lot). Known already:
+	 *   start_seq < undo_marker and end_seq >= undo_marker.
+	 */
+	return !before(start_seq, end_seq - tp->max_window);
+}
+
+/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
+ * Event "C". Later note: FACK people cheated me again 8), we have to account
+ * for reordering! Ugly, but should help.
+ *
+ * Search retransmitted skbs from write_queue that were sent when snd_nxt was
+ * less than what is now known to be received by the other end (derived from
+ * highest SACK block). Also calculate the lowest snd_nxt among the remaining
+ * retransmitted skbs to avoid some costly processing per ACKs.
+ */
+static void tcp_mark_lost_retrans(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int cnt = 0;	/* retransmitted packets found that are still outstanding */
+	u32 new_low_seq = tp->snd_nxt;	/* lowest ack_seq among those */
+	u32 received_upto = tcp_highest_sack_seq(tp);
+
+	if (!tcp_is_fack(tp) || !tp->retrans_out ||	/* FACK + retransmits only */
+	    !after(received_upto, tp->lost_retrans_low) ||	/* nothing new SACKed */
+	    icsk->icsk_ca_state != TCP_CA_Recovery)
+		return;
+
+	tcp_for_write_queue(skb, sk) {
+		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;	/* compared with SACK edge */
+
+		if (skb == tcp_send_head(sk))	/* reached the send head: stop */
+			break;
+		if (cnt == tp->retrans_out)	/* all retransmissions accounted for */
+			break;
+		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+			continue;	/* already cumulatively ACKed */
+
+		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
+			continue;	/* no retransmission outstanding for this skb */
+
+		/* TODO: We would like to get rid of tcp_is_fack(tp) only
+		 * constraint here (see above) but figuring out that at
+		 * least tp->reordering SACK blocks reside between ack_seq
+		 * and received_upto is not easy task to do cheaply with
+		 * the available datastructures.
+		 *
+		 * Whether FACK should check here for tp->reordering segs
+		 * in-between one could argue for either way (it would be
+		 * rather simple to implement as we could count fack_count
+		 * during the walk and do tp->fackets_out - fack_count).
+		 */
+		if (after(received_upto, ack_seq)) {	/* retransmission was lost */
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+			tp->retrans_out -= tcp_skb_pcount(skb);
+
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
+		} else {	/* still possibly in flight: keep it and track the minimum */
+			if (before(ack_seq, new_low_seq))
+				new_low_seq = ack_seq;
+			cnt += tcp_skb_pcount(skb);
+		}
+	}
+
+	if (tp->retrans_out)	/* threshold for the early exit on later calls */
+		tp->lost_retrans_low = new_low_seq;
+}
+
+static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
+			   struct tcp_sack_block_wire *sp, int num_sacks,
+			   u32 prior_snd_una)
+{	/* Returns 1 if the first SACK block is a D-SACK, 0 otherwise. */
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
+	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
+	int dup_sack = 0;
+
+	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {	/* below the ACK */
+		dup_sack = 1;
+		tcp_dsack_seen(tp);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+	} else if (num_sacks > 1) {
+		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
+
+		if (!after(end_seq_0, end_seq_1) &&	/* block 0 lies inside block 1 */
+		    !before(start_seq_0, start_seq_1)) {
+			dup_sack = 1;
+			tcp_dsack_seen(tp);
+			NET_INC_STATS_BH(sock_net(sk),
+					LINUX_MIB_TCPDSACKOFORECV);
+		}
+	}
+
+	/* D-SACK for already forgotten data... Do dumb counting. */
+	if (dup_sack && tp->undo_marker && tp->undo_retrans &&
+	    !after(end_seq_0, prior_snd_una) &&	/* ends at or below old snd_una */
+	    after(end_seq_0, tp->undo_marker))	/* ...but above undo_marker */
+		tp->undo_retrans--;
+
+	return dup_sack;
+}
+
+struct tcp_sacktag_state {	/* scratch state threaded through the SACK taggers */
+	int reord;	/* lowest fack_count at which reordering was observed */
+	int fack_count;	/* running packet count; seeds tcp_sacktag_one()'s counter */
+	int flag;	/* FLAG_* bits accumulated while tagging */
+};
+
+/* Check if skb is fully within the SACK block [start_seq, end_seq). With GSO
+ * skbs the SACK may cover only part of the skb; the skb is then split at an
+ * MSS-aligned point so that the covered part stands alone. Returns 1 if skb
+ * (or the head left after the split) is covered, 0 if not, or a negative
+ * errno if the covered prefix is below one MSS or tcp_fragment() fails.
+ *
+ * FIXME: this could be merged to shift decision code
+ */
+static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
+				 u32 start_seq, u32 end_seq)
+{
+	int in_sack, err;
+	unsigned int pkt_len;	/* length of the head part to split off */
+	unsigned int mss;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (tcp_skb_pcount(skb) > 1 && !in_sack &&	/* partial overlap, GSO skb */
+	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+		mss = tcp_skb_mss(skb);
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+
+		if (!in_sack) {	/* SACK starts inside skb: head is NOT covered */
+			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
+			if (pkt_len < mss)
+				pkt_len = mss;
+		} else {	/* SACK ends inside skb: head IS covered */
+			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
+			if (pkt_len < mss)
+				return -EINVAL;
+		}
+
+		/* Round if necessary so that SACKs cover only full MSSes
+		 * and/or the remaining small portion (if present)
+		 */
+		if (pkt_len > mss) {
+			unsigned int new_len = (pkt_len / mss) * mss;
+			if (!in_sack && new_len < pkt_len) {
+				new_len += mss;
+				if (new_len >= skb->len)	/* no tail left to split off */
+					return 0;
+			}
+			pkt_len = new_len;
+		}
+		err = tcp_fragment(sk, skb, pkt_len, mss);
+		if (err < 0)
+			return err;
+	}
+
+	return in_sack;
+}
+
+/* Tag a newly-SACKed range, fix up counters/hints; returns the new tag bits. */
+static u8 tcp_sacktag_one(struct sock *sk,
+			  struct tcp_sacktag_state *state, u8 sacked,
+			  u32 start_seq, u32 end_seq,
+			  int dup_sack, int pcount)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int fack_count = state->fack_count;	/* local copy, not written back */
+
+	/* Account D-SACK for retransmitted packet. */
+	if (dup_sack && (sacked & TCPCB_RETRANS)) {
+		if (tp->undo_marker && tp->undo_retrans &&
+		    after(end_seq, tp->undo_marker))
+			tp->undo_retrans--;
+		if (sacked & TCPCB_SACKED_ACKED)	/* D-SACK on SACKed data */
+			state->reord = min(fack_count, state->reord);
+	}
+
+	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
+	if (!after(end_seq, tp->snd_una))
+		return sacked;
+
+	if (!(sacked & TCPCB_SACKED_ACKED)) {	/* range was not SACKed before */
+		if (sacked & TCPCB_SACKED_RETRANS) {
+			/* If the segment is not tagged as lost,
+			 * we do not clear RETRANS, believing
+			 * that retransmission is still in flight.
+			 */
+			if (sacked & TCPCB_LOST) {
+				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+				tp->lost_out -= pcount;
+				tp->retrans_out -= pcount;
+			}
+		} else {
+			if (!(sacked & TCPCB_RETRANS)) {
+				/* New sack for not retransmitted frame,
+				 * which was in hole. It is reordering.
+				 */
+				if (before(start_seq,
+					   tcp_highest_sack_seq(tp)))
+					state->reord = min(fack_count,
+							   state->reord);
+
+				/* SACK enhanced F-RTO (RFC4138; Appendix B) */
+				if (!after(end_seq, tp->frto_highmark))
+					state->flag |= FLAG_ONLY_ORIG_SACKED;
+			}
+
+			if (sacked & TCPCB_LOST) {	/* it was not lost after all */
+				sacked &= ~TCPCB_LOST;
+				tp->lost_out -= pcount;
+			}
+		}
+
+		sacked |= TCPCB_SACKED_ACKED;
+		state->flag |= FLAG_DATA_SACKED;
+		tp->sacked_out += pcount;
+
+		fack_count += pcount;	/* count now includes this range */
+
+		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
+		if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
+		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
+			tp->lost_cnt_hint += pcount;
+
+		if (fack_count > tp->fackets_out)	/* fackets_out only grows here */
+			tp->fackets_out = fack_count;
+	}
+
+	/* D-SACK. We can detect redundant retransmission in S|R and plain R
+	 * frames and clear it. undo_retrans is decreased above, L|R frames
+	 * are accounted above as well.
+	 */
+	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
+		sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= pcount;
+	}
+
+	return sacked;
+}
+
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff and mark them SACKed. Returns 1 if skb was
+ * emptied (and therefore unlinked and freed), 0 if data remains in it.
+ */
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+			   struct tcp_sacktag_state *state,
+			   unsigned int pcount, int shifted, int mss,
+			   int dup_sack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
+	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */
+
+	BUG_ON(!pcount);
+
+	/* Adjust counters and hints for the newly sacked sequence
+	 * range but discard the return value since prev is already
+	 * marked. We must tag the range first because the seq
+	 * advancement below implicitly advances
+	 * tcp_highest_sack_seq() when skb is highest_sack.
+	 */
+	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+			start_seq, end_seq, dup_sack, pcount);
+
+	if (skb == tp->lost_skb_hint)
+		tp->lost_cnt_hint += pcount;
+
+	TCP_SKB_CB(prev)->end_seq += shifted;	/* prev grows by the moved bytes */
+	TCP_SKB_CB(skb)->seq += shifted;	/* skb shrinks from its head */
+
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	if (skb->len > 0) {	/* partial shift: skb stays on the queue */
+		BUG_ON(!tcp_skb_pcount(skb));
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) Repoint hints, merge flags, then free it. */
+	if (skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = prev;
+	if (skb == tp->scoreboard_skb_hint)
+		tp->scoreboard_skb_hint = prev;
+	if (skb == tp->lost_skb_hint) {
+		tp->lost_skb_hint = prev;
+		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
+	}
+	TCP_SKB_CB(prev)->flags |= TCP_SKB_CB(skb)->flags;	/* skb dies, prev lives */
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
+		TCP_SKB_CB(prev)->end_seq++;	/* FIN occupies one sequence number */
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+
+	return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_skb_seglen(struct sk_buff *skb)
+{	/* Segment length: skb->len for a one-packet skb, otherwise its MSS. */
+	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{	/* Non-zero only for a nonlinear skb with no bytes in its head area. */
+	return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ *
+ * The SACKed part of @skb (bounded by @start_seq/@end_seq) is shifted into
+ * the previous skb in the write queue when that one is already purely
+ * SACKed and uses the same segment length.
+ *
+ * Return value:
+ *   prev  - data was shifted; state->fack_count has been advanced by the
+ *           number of packets shifted
+ *   skb   - nothing to do for this skb ("noop")
+ *   NULL  - shifting is not possible here ("fallback"); the
+ *           LINUX_MIB_SACKSHIFTFALLBACK counter is incremented
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  struct tcp_sacktag_state *state,
+					  u32 start_seq, u32 end_seq,
+					  int dup_sack)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;
+	int len;
+	int in_sack;
+
+	if (!sk_can_gso(sk))
+		goto fallback;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	if (unlikely(skb == tcp_write_queue_head(sk)))
+		goto fallback;
+	prev = tcp_write_queue_prev(sk, skb);
+
+	/* The merge target must carry the S tag and no other tag bits */
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	/* Is the whole skb covered by [start_seq, end_seq]? */
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_skb_seglen(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_skb_seglen(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: This is non-MSS split case only?, this will
+		 * cause skipped skbs due to advancing loop btw, original
+		 * has that feature too
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth of the additional hassle
+			 *
+			 * ...we can probably just fallback to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to buy off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		/* Only the head of the skb, up to end_seq, is SACKed */
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things bit trickier.
+		 * Optimize common case to avoid most of the divides
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_skb_seglen(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			/* Round the shifted length down to whole segments */
+			pcount = len / mss;
+			len = pcount * mss;
+		}
+	}
+
+	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+		goto fallback;
+
+	if (!skb_shift(prev, skb, len))
+		goto fallback;
+	/* A zero return here ends the attempt: return prev without trying
+	 * to merge the following skb as well.
+	 */
+	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+		goto out;
+
+	/* Hole filled allows collapsing with the next as well, this is very
+	 * useful when hole on every nth skb pattern happens
+	 */
+	if (prev == tcp_write_queue_tail(sk))
+		goto out;
+	skb = tcp_write_queue_next(sk, prev);
+
+	if (!skb_can_shift(skb) ||
+	    (skb == tcp_send_head(sk)) ||
+	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
+	    (mss != tcp_skb_seglen(skb)))
+		goto out;
+
+	len = skb->len;
+	if (skb_shift(prev, skb, len)) {
+		pcount += tcp_skb_pcount(skb);
+		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+	}
+
+out:
+	state->fack_count += pcount;
+	return prev;
+
+noop:
+	return skb;
+
+fallback:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+	return NULL;
+}
+
+/* Walk the write queue starting at @skb and SACK-tag every skb matched by
+ * the block [@start_seq, @end_seq).
+ *
+ * @next_dup:    optional DSACK block that is checked first for each skb
+ *               that starts before its end_seq; a match turns on dup_sack
+ *               for that skb
+ * @state:       tagging state; fack_count is advanced for each skb passed
+ * @dup_sack_in: initial dup_sack value used for every skb
+ *
+ * The walk stops at the send head, at the first skb starting at or after
+ * @end_seq, or when matching reports an error (negative in_sack).
+ * Returns the skb at which the walk stopped.
+ */
+static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
+					struct tcp_sack_block *next_dup,
+					struct tcp_sacktag_state *state,
+					u32 start_seq, u32 end_seq,
+					int dup_sack_in)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+
+	tcp_for_write_queue_from(skb, sk) {
+		int in_sack = 0;
+		int dup_sack = dup_sack_in;
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* queue is in-order => we can short-circuit the walk early */
+		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
+			break;
+
+		if ((next_dup != NULL) &&
+		    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
+			in_sack = tcp_match_skb_to_sack(sk, skb,
+							next_dup->start_seq,
+							next_dup->end_seq);
+			if (in_sack > 0)
+				dup_sack = 1;
+		}
+
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, state,
+						 start_seq, end_seq, dup_sack);
+			if (tmp != NULL) {
+				/* Shift happened: resume from the returned skb */
+				if (tmp != skb) {
+					skb = tmp;
+					continue;
+				}
+
+				/* Same skb returned: leave it untagged */
+				in_sack = 0;
+			} else {
+				/* No shifting possible: match the skb itself */
+				in_sack = tcp_match_skb_to_sack(sk, skb,
+								start_seq,
+								end_seq);
+			}
+		}
+
+		if (unlikely(in_sack < 0))
+			break;
+
+		if (in_sack) {
+			TCP_SKB_CB(skb)->sacked =
+				tcp_sacktag_one(sk,
+						state,
+						TCP_SKB_CB(skb)->sacked,
+						TCP_SKB_CB(skb)->seq,
+						TCP_SKB_CB(skb)->end_seq,
+						dup_sack,
+						tcp_skb_pcount(skb));
+
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
+
+		state->fack_count += tcp_skb_pcount(skb);
+	}
+	return skb;
+}
+
+/* Cheap fast-forward: step over skbs ending at or before @skip_to_seq
+ * without any of the sacktag work, only adding their packet counts to
+ * state->fack_count. Also stops at the send head. Returns the skb at
+ * which the skipping stopped.
+ */
+static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
+					struct tcp_sacktag_state *state,
+					u32 skip_to_seq)
+{
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk) ||
+		    after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
+			break;
+
+		state->fack_count += tcp_skb_pcount(skb);
+	}
+	return skb;
+}
+
+/* If a DSACK block (@next_dup) starts before @skip_to_seq, skip up to it
+ * and walk it as a duplicate SACK before the caller skips further.
+ * Returns the skb the queue position ended up at (@skb unchanged when
+ * there is nothing to do).
+ */
+static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
+						struct sock *sk,
+						struct tcp_sack_block *next_dup,
+						struct tcp_sacktag_state *state,
+						u32 skip_to_seq)
+{
+	if (next_dup != NULL && before(next_dup->start_seq, skip_to_seq)) {
+		skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
+		skb = tcp_sacktag_walk(skb, sk, NULL, state,
+				       next_dup->start_seq, next_dup->end_seq,
+				       1);
+	}
+
+	return skb;
+}
+
+/* Nonzero while @cache still points at an entry of tp->recv_sack_cache[],
+ * i.e. it has not yet reached the one-past-the-end position.
+ */
+static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
+{
+	struct tcp_sack_block *cache_end;
+
+	cache_end = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+	return cache < cache_end;
+}
+
+/* Process the SACK option carried by @ack_skb and tag the write queue
+ * accordingly.
+ *
+ * The wire blocks are validated and copied into sp[], sorted by start_seq
+ * so the queue can be walked in order, and then walked; ranges already
+ * covered by tp->recv_sack_cache[] are skipped. Afterwards the cache is
+ * refreshed with the blocks used this time.
+ *
+ * @prior_snd_una: snd_una value used for the DSACK check and for dropping
+ *                 blocks that lie entirely at or below it
+ *
+ * Returns the accumulated state.flag bits, or 0 when the ACK is too old
+ * to be considered at all.
+ */
+static int
+tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
+			u32 prior_snd_una)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned char *ptr = (skb_transport_header(ack_skb) +
+			      TCP_SKB_CB(ack_skb)->sacked);
+	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
+	struct tcp_sack_block sp[TCP_NUM_SACKS];
+	struct tcp_sack_block *cache;
+	struct tcp_sacktag_state state;
+	struct sk_buff *skb;
+	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
+	int used_sacks;
+	int found_dup_sack = 0;
+	int i, j;
+	int first_sack_index;
+
+	state.flag = 0;
+	state.reord = tp->packets_out;
+
+	if (!tp->sacked_out) {
+		if (WARN_ON(tp->fackets_out))
+			tp->fackets_out = 0;
+		tcp_highest_sack_reset(sk);
+	}
+
+	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
+					 num_sacks, prior_snd_una);
+	if (found_dup_sack)
+		state.flag |= FLAG_DSACKING_ACK;
+
+	/* Eliminate too old ACKs, but take into
+	 * account more or less fresh ones, they can
+	 * contain valid SACK info.
+	 */
+	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
+		return 0;
+
+	if (!tp->packets_out)
+		goto out;
+
+	/* first_sack_index tracks where the first wire block (the only one
+	 * that can be a DSACK) lives in sp[]; -1 means it was discarded.
+	 */
+	used_sacks = 0;
+	first_sack_index = 0;
+	for (i = 0; i < num_sacks; i++) {
+		int dup_sack = !i && found_dup_sack;
+
+		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
+
+		if (!tcp_is_sackblock_valid(tp, dup_sack,
+					    sp[used_sacks].start_seq,
+					    sp[used_sacks].end_seq)) {
+			int mib_idx;
+
+			if (dup_sack) {
+				if (!tp->undo_marker)
+					mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
+				else
+					mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
+			} else {
+				/* Don't count olds caused by ACK reordering */
+				if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
+				    !after(sp[used_sacks].end_seq, tp->snd_una))
+					continue;
+				mib_idx = LINUX_MIB_TCPSACKDISCARD;
+			}
+
+			NET_INC_STATS_BH(sock_net(sk), mib_idx);
+			if (i == 0)
+				first_sack_index = -1;
+			continue;
+		}
+
+		/* Ignore very old stuff early */
+		if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+			/* The first block is dropped here too, so slot 0 of
+			 * sp[] will be reused by a later block. Invalidate
+			 * the index or that block would wrongly be treated
+			 * as the DSACK below.
+			 */
+			if (i == 0)
+				first_sack_index = -1;
+			continue;
+		}
+
+		used_sacks++;
+	}
+
+	/* order SACK blocks to allow in order walk of the retrans queue */
+	for (i = used_sacks - 1; i > 0; i--) {
+		for (j = 0; j < i; j++) {
+			if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
+				swap(sp[j], sp[j + 1]);
+
+				/* Track where the first SACK block goes to */
+				if (j == first_sack_index)
+					first_sack_index = j + 1;
+			}
+		}
+	}
+
+	skb = tcp_write_queue_head(sk);
+	state.fack_count = 0;
+	i = 0;
+
+	if (!tp->sacked_out) {
+		/* It's already past, so skip checking against it */
+		cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
+	} else {
+		cache = tp->recv_sack_cache;
+		/* Skip empty blocks in at head of the cache */
+		while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
+		       !cache->end_seq)
+			cache++;
+	}
+
+	while (i < used_sacks) {
+		u32 start_seq = sp[i].start_seq;
+		u32 end_seq = sp[i].end_seq;
+		int dup_sack = (found_dup_sack && (i == first_sack_index));
+		struct tcp_sack_block *next_dup = NULL;
+
+		if (found_dup_sack && ((i + 1) == first_sack_index))
+			next_dup = &sp[i + 1];
+
+		/* Event "B" in the comment above. */
+		if (after(end_seq, tp->high_seq))
+			state.flag |= FLAG_DATA_LOST;
+
+		/* Skip too early cached blocks */
+		while (tcp_sack_cache_ok(tp, cache) &&
+		       !before(start_seq, cache->end_seq))
+			cache++;
+
+		/* Can skip some work by looking recv_sack_cache? */
+		if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
+		    after(end_seq, cache->start_seq)) {
+
+			/* Head todo? */
+			if (before(start_seq, cache->start_seq)) {
+				skb = tcp_sacktag_skip(skb, sk, &state,
+						       start_seq);
+				skb = tcp_sacktag_walk(skb, sk, next_dup,
+						       &state,
+						       start_seq,
+						       cache->start_seq,
+						       dup_sack);
+			}
+
+			/* Rest of the block already fully processed? */
+			if (!after(end_seq, cache->end_seq))
+				goto advance_sp;
+
+			skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
+						       &state,
+						       cache->end_seq);
+
+			/* ...tail remains todo... */
+			if (tcp_highest_sack_seq(tp) == cache->end_seq) {
+				/* ...but better entrypoint exists! */
+				skb = tcp_highest_sack(sk);
+				if (skb == NULL)
+					break;
+				state.fack_count = tp->fackets_out;
+				cache++;
+				goto walk;
+			}
+
+			skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
+			/* Check overlap against next cached too (past this one already) */
+			cache++;
+			continue;
+		}
+
+		if (!before(start_seq, tcp_highest_sack_seq(tp))) {
+			skb = tcp_highest_sack(sk);
+			if (skb == NULL)
+				break;
+			state.fack_count = tp->fackets_out;
+		}
+		skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
+
+walk:
+		skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
+				       start_seq, end_seq, dup_sack);
+
+advance_sp:
+		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
+		 * due to in-order walk
+		 */
+		if (after(end_seq, tp->frto_highmark))
+			state.flag &= ~FLAG_ONLY_ORIG_SACKED;
+
+		i++;
+	}
+
+	/* Clear the head of the cache sack blocks so we can skip it next time */
+	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
+		tp->recv_sack_cache[i].start_seq = 0;
+		tp->recv_sack_cache[i].end_seq = 0;
+	}
+	/* ...and store the blocks used this time in the remaining tail slots */
+	for (j = 0; j < used_sacks; j++)
+		tp->recv_sack_cache[i++] = sp[j];
+
+	tcp_mark_lost_retrans(sk);
+
+	tcp_verify_left_out(tp);
+
+	if ((state.reord < tp->fackets_out) &&
+	    ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
+	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
+		tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
+
+out:
+
+#if FASTRETRANS_DEBUG > 0
+	WARN_ON((int)tp->sacked_out < 0);
+	WARN_ON((int)tp->lost_out < 0);
+	WARN_ON((int)tp->retrans_out < 0);
+	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
+#endif
+	return state.flag;
+}
+
+/* Clamp sacked_out so that, added to the assumed number of holes, it never
+ * exceeds packets_out. Returns 1 when sacked_out had to be adjusted and
+ * 0 when it was already within bounds.
+ */
+static int tcp_limit_reno_sacked(struct tcp_sock *tp)
+{
+	u32 holes = tp->lost_out;
+
+	/* At least one hole, but never more than what is outstanding */
+	if (!holes)
+		holes = 1U;
+	if (holes > tp->packets_out)
+		holes = tp->packets_out;
+
+	if ((tp->sacked_out + holes) <= tp->packets_out)
+		return 0;
+
+	tp->sacked_out = tp->packets_out - holes;
+	return 1;
+}
+
+/* Receiving more dupacks than the outstanding segments can explain (under
+ * the assumption of no reordering) is interpreted as reordering; the only
+ * other explanation would be a bug in the receiver TCP.
+ *
+ * @addend is added to packets_out to form the reordering metric passed on.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_limit_reno_sacked(tp))
+		return;
+
+	tcp_update_reordering(sk, tp->packets_out + addend, 0);
+}
+
+/* SACK emulation for a SACKless connection: one more dupack arrived, so
+ * count one more segment in sacked_out and re-check the Reno limits.
+ */
+static void tcp_add_reno_sack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->sacked_out += 1;
+	tcp_check_reno_reordering(sk, 0);
+	tcp_verify_left_out(tp);
+}
+
+/* Reno Recovery accounting for an ACK that acknowledged @acked segments:
+ * shrink the emulated sacked_out count accordingly (never below zero).
+ */
+static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (acked > 0) {
+		/* One ACK acked hole. The rest eat duplicate ACKs. */
+		u32 eaten = acked - 1;
+
+		tp->sacked_out = (eaten >= tp->sacked_out) ?
+				 0 : tp->sacked_out - eaten;
+	}
+	tcp_check_reno_reordering(sk, acked);
+	tcp_verify_left_out(tp);
+}
+
+/* Reset the Reno (dupack-emulated) SACK count by zeroing sacked_out. */
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+	tp->sacked_out = 0;
+}
+
+/* Nonzero when the SACK variant of F-RTO applies: sysctl_tcp_frto is set
+ * to 0x2 and the connection is not a Reno (SACKless) one.
+ */
+static int tcp_is_sackfrto(const struct tcp_sock *tp)
+{
+	if (sysctl_tcp_frto != 0x2)
+		return 0;
+
+	return !tcp_is_reno(tp);
+}
+
+/* F-RTO can only be used if TCP has never retransmitted anything other than
+ * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
+ *
+ * Returns 1 when F-RTO may be used for this socket and 0 otherwise.
+ */
+int tcp_use_frto(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb;
+
+	/* F-RTO disabled by sysctl */
+	if (!sysctl_tcp_frto)
+		return 0;
+
+	/* MTU probe and F-RTO won't really play nicely along currently */
+	if (icsk->icsk_mtup.probe_size)
+		return 0;
+
+	/* The SACK variant needs no inspection of the queue */
+	if (tcp_is_sackfrto(tp))
+		return 1;
+
+	/* Avoid expensive walking of rexmit queue if possible */
+	if (tp->retrans_out > 1)
+		return 0;
+
+	skb = tcp_write_queue_head(sk);
+	if (tcp_skb_is_last(sk, skb))
+		return 1;
+	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+		/* Something other than the head was retransmitted */
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+			return 0;
+		/* Short-circuit when first non-SACKed skb has been checked */
+		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+			break;
+	}
+	return 1;
+}
+
+/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
+ * recovery a bit and use heuristics in tcp_process_frto() to detect if
+ * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
+ * keep retrans_out counting accurate (with SACK F-RTO, other than head
+ * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
+ * bits are handled if the Loss state is really to be entered (in
+ * tcp_enter_frto_loss).
+ *
+ * Do like tcp_enter_loss() would; when RTO expires the second time it
+ * does:
+ *  "Reduce ssthresh if it has not yet been made inside this window."
+ *
+ * On return the socket is in TCP_CA_Disorder with frto_counter == 1,
+ * high_seq == snd_nxt and frto_highmark set up.
+ */
+void tcp_enter_frto(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
+	    tp->snd_una == tp->high_seq ||
+	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
+	     !icsk->icsk_retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		/* Our state is too optimistic in ssthresh() call because cwnd
+		 * is not reduced until tcp_enter_frto_loss() when previous F-RTO
+		 * recovery has not yet completed. Pattern would be this: RTO,
+		 * Cumulative ACK, RTO (2xRTO for the same segment does not end
+		 * up here twice).
+		 * RFC4138 should be more specific on what to do, even though
+		 * RTO is quite unlikely to occur after the first Cumulative ACK
+		 * due to back-off and complexity of triggering events ...
+		 */
+		if (tp->frto_counter) {
+			u32 stored_cwnd;
+			/* Call ssthresh() with cwnd temporarily forced to 2,
+			 * then restore the real value.
+			 */
+			stored_cwnd = tp->snd_cwnd;
+			tp->snd_cwnd = 2;
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+			tp->snd_cwnd = stored_cwnd;
+		} else {
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		}
+		/* ... in theory, cong.control module could do "any tricks" in
+		 * ssthresh(), which means that ca_state, lost bits and lost_out
+		 * counter would have to be faked before the call occurs. We
+		 * consider that too expensive, unlikely and hacky, so modules
+		 * using these in ssthresh() must deal these incompatibility
+		 * issues if they receives CA_EVENT_FRTO and frto_counter != 0
+		 */
+		tcp_ca_event(sk, CA_EVENT_FRTO);
+	}
+
+	tp->undo_marker = tp->snd_una;
+	tp->undo_retrans = 0;
+
+	skb = tcp_write_queue_head(sk);
+	/* Clear undo_marker again if the head was retransmitted before */
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+		tp->undo_marker = 0;
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+		tp->retrans_out -= tcp_skb_pcount(skb);
+	}
+	tcp_verify_left_out(tp);
+
+	/* Too bad if TCP was application limited */
+	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
+
+	/* Earlier loss recovery underway (see RFC4138; Appendix B).
+	 * The last condition is necessary at least in tp->frto_counter case.
+	 */
+	if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
+	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
+	    after(tp->high_seq, tp->snd_una)) {
+		tp->frto_highmark = tp->high_seq;
+	} else {
+		tp->frto_highmark = tp->snd_nxt;
+	}
+	tcp_set_ca_state(sk, TCP_CA_Disorder);
+	tp->high_seq = tp->snd_nxt;
+	tp->frto_counter = 1;
+}
+
+/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
+ * which indicates that we should follow the traditional RTO recovery,
+ * i.e. mark everything lost and do go-back-N retransmission.
+ *
+ * @allowed_segments: added to the packets in flight to form the new
+ *                    snd_cwnd
+ * @flag:             ACK processing flags; only FLAG_DATA_ACKED is
+ *                    examined here
+ */
+static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	/* Both counters are rebuilt from scratch in the walk below */
+	tp->lost_out = 0;
+	tp->retrans_out = 0;
+	if (tcp_is_reno(tp))
+		tcp_reset_reno_sack(tp);
+
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+		/*
+		 * Count the retransmission made on RTO correctly (only when
+		 * waiting for the first ACK and did not get it)...
+		 */
+		if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
+			/* For some reason this R-bit might get cleared? */
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+				tp->retrans_out += tcp_skb_pcount(skb);
+			/* ...enter this if branch just for the first segment */
+			flag |= FLAG_DATA_ACKED;
+		} else {
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+				tp->undo_marker = 0;
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+		}
+
+		/* Marking forward transmissions that were made after RTO lost
+		 * can cause unnecessary retransmissions in some scenarios,
+		 * SACK blocks will mitigate that in some but not in all cases.
+		 * We used to not mark them but it was causing break-ups with
+		 * receivers that do only in-order receival.
+		 *
+		 * TODO: we could detect presence of such receiver and select
+		 * different behavior per flow.
+		 */
+		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+			tp->lost_out += tcp_skb_pcount(skb);
+			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+		}
+	}
+	tcp_verify_left_out(tp);
+
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	/* F-RTO is over */
+	tp->frto_counter = 0;
+	tp->bytes_acked = 0;
+
+	tp->reordering = min_t(unsigned int, tp->reordering,
+			       sysctl_tcp_reordering);
+	tcp_set_ca_state(sk, TCP_CA_Loss);
+	tp->high_seq = tp->snd_nxt;
+	TCP_ECN_queue_cwr(tp);
+
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Zero the retransmission/loss counters and the undo state, leaving
+ * sacked_out and fackets_out untouched.
+ */
+static void tcp_clear_retrans_partial(struct tcp_sock *tp)
+{
+	/* undo bookkeeping */
+	tp->undo_marker = 0;
+	tp->undo_retrans = 0;
+
+	/* loss / retransmit counters */
+	tp->lost_out = 0;
+	tp->retrans_out = 0;
+}
+
+/* Full reset of the retransmit bookkeeping: everything that
+ * tcp_clear_retrans_partial() clears plus the SACK/FACK counters.
+ */
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+	tcp_clear_retrans_partial(tp);
+
+	tp->sacked_out = 0;
+	tp->fackets_out = 0;
+}
+
+/* Enter Loss state. If "how" is not zero, forget all SACK information
+ * and reset tags completely, otherwise preserve SACKs. If receiver
+ * dropped its ofo queue, we will know this due to reneging detection.
+ *
+ * snd_cwnd is set to 1, every skb before the send head that is not kept
+ * as SACKed is tagged lost, and any F-RTO in progress is aborted.
+ */
+void tcp_enter_loss(struct sock *sk, int how)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	/* Reduce ssthresh if it has not yet been made inside this window. */
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tcp_ca_event(sk, CA_EVENT_LOSS);
+	}
+	tp->snd_cwnd	   = 1;
+	tp->snd_cwnd_cnt   = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+
+	tp->bytes_acked = 0;
+	tcp_clear_retrans_partial(tp);
+
+	if (tcp_is_reno(tp))
+		tcp_reset_reno_sack(tp);
+
+	if (!how) {
+		/* Push undo marker, if it was plain RTO and nothing
+		 * was retransmitted. */
+		tp->undo_marker = tp->snd_una;
+	} else {
+		tp->sacked_out = 0;
+		tp->fackets_out = 0;
+	}
+	tcp_clear_all_retrans_hints(tp);
+
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+			tp->undo_marker = 0;
+		/* Of the tag bits, keep only SACKED_ACKED for now */
+		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+			tp->lost_out += tcp_skb_pcount(skb);
+			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
+		}
+	}
+	tcp_verify_left_out(tp);
+
+	tp->reordering = min_t(unsigned int, tp->reordering,
+			       sysctl_tcp_reordering);
+	tcp_set_ca_state(sk, TCP_CA_Loss);
+	tp->high_seq = tp->snd_nxt;
+	TCP_ECN_queue_cwr(tp);
+	/* Abort F-RTO algorithm if one is in progress */
+	tp->frto_counter = 0;
+}
+
+/* An ACK pointing at a remembered SACK means our remembered SACKs do not
+ * reflect the real state of the receiver, i.e. the receiver _host_ is
+ * heavily congested (or buggy).
+ *
+ * When @flag carries FLAG_SACK_RENEGING, do processing similar to an RTO
+ * timeout: enter Loss forgetting all SACK information, retransmit the
+ * queue head and re-arm the retransmit timer. Returns 1 in that case and
+ * 0 when there was no reneging.
+ */
+static int tcp_check_sack_reneging(struct sock *sk, int flag)
+{
+	struct inet_connection_sock *icsk;
+
+	if (!(flag & FLAG_SACK_RENEGING))
+		return 0;
+
+	icsk = inet_csk(sk);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+
+	tcp_enter_loss(sk, 1);
+	icsk->icsk_retransmits++;
+	tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  icsk->icsk_rto, TCP_RTO_MAX);
+	return 1;
+}
+
+/* fackets_out for SACK connections; for Reno it is approximated as
+ * sacked_out + 1.
+ */
+static inline int tcp_fackets_out(struct tcp_sock *tp)
+{
+	if (tcp_is_reno(tp))
+		return tp->sacked_out + 1;
+
+	return tp->fackets_out;
+}
+
+/* Heuristics to calculate the number of duplicate ACKs. There is no
+ * dupACK counter when SACK is enabled (without SACK, sacked_out is used
+ * for that purpose).
+ *
+ * Instead, with FACK TCP uses fackets_out, which includes both the SACKed
+ * segments up to the highest received SACK block so far and the holes in
+ * between them.
+ *
+ * With reordering, holes may still be in flight, so RFC3517 recovery
+ * uses pure sacked_out (total number of SACKed segments) even though
+ * that violates the RFC, which uses duplicate ACKs. Often the two are
+ * equal, but they differ when e.g. out-of-window ACKs or packet
+ * duplication occur. Since neither is caused by loss, TCP should really
+ * ignore them.
+ */
+static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
+{
+	if (tcp_is_fack(tp))
+		return tp->fackets_out;
+
+	return tp->sacked_out + 1;
+}
+
+/* Nonzero when more than the current RTO (icsk_rto) has elapsed since
+ * the time recorded in the skb's ->when stamp.
+ */
+static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
+{
+	u32 age = tcp_time_stamp - TCP_SKB_CB(skb)->when;
+
+	return age > inet_csk(sk)->icsk_rto;
+}
+
+/* Nonzero when packets are outstanding and the head of the write queue
+ * has timed out according to tcp_skb_timedout().
+ */
+static inline int tcp_head_timedout(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->packets_out)
+		return 0;
+
+	return tcp_skb_timedout(sk, tcp_write_queue_head(sk));
+}
+
+/* Linux NewReno/SACK/FACK/ECN state machine.
+ * --------------------------------------
+ *
+ * "Open"	Normal state, no dubious events, fast path.
+ * "Disorder"   In all the respects it is "Open",
+ *		but requires a bit more attention. It is entered when
+ *		we see some SACKs or dupacks. It is split off from "Open"
+ *		mainly to move some processing from fast path to slow one.
+ * "CWR"	CWND was reduced due to some Congestion Notification event.
+ *		It can be ECN, ICMP source quench, local device congestion.
+ * "Recovery"	CWND was reduced, we are fast-retransmitting.
+ * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
+ *
+ * tcp_fastretrans_alert() is entered:
+ * - each incoming ACK, if state is not "Open"
+ * - when arrived ACK is unusual, namely:
+ *	* SACK
+ *	* Duplicate ACK.
+ *	* ECN ECE.
+ *
+ * Counting packets in flight is pretty simple.
+ *
+ *	in_flight = packets_out - left_out + retrans_out
+ *
+ *	packets_out is SND.NXT-SND.UNA counted in packets.
+ *
+ *	retrans_out is number of retransmitted segments.
+ *
+ *	left_out is number of segments that have left the network, but are not ACKed yet.
+ *
+ *		left_out = sacked_out + lost_out
+ *
+ *     sacked_out: Packets, which arrived to receiver out of order
+ *		   and hence not ACKed. With SACKs this number is simply
+ *		   amount of SACKed data. Even without SACKs
+ *		   it is easy to give pretty reliable estimate of this number,
+ *		   counting duplicate ACKs.
+ *
+ *       lost_out: Packets lost by network. TCP has no explicit
+ *		   "loss notification" feedback from network (for now).
+ *		   It means that this number can be only _guessed_.
+ *		   Actually, it is the heuristics to predict lossage that
+ *		   distinguishes different algorithms.
+ *
+ *	F.e. after RTO, when all the queue is considered as lost,
+ *	lost_out = packets_out and in_flight = retrans_out.
+ *
+ *		Essentially, we have now two algorithms counting
+ *		lost packets.
+ *
+ *		FACK: It is the simplest heuristics. As soon as we decided
+ *		that something is lost, we decide that _all_ not SACKed
+ *		packets until the most forward SACK are lost. I.e.
+ *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
+ *		It is absolutely correct estimate, if network does not reorder
+ *		packets. And it loses any connection to reality when reordering
+ *		takes place. We use FACK by default until reordering
+ *		is suspected on the path to this destination.
+ *
+ *		NewReno: when Recovery is entered, we assume that one segment
+ *		is lost (classic Reno). While we are in Recovery and
+ *		a partial ACK arrives, we assume that one more packet
+ *		is lost (NewReno). These heuristics are the same in NewReno
+ *		and SACK.
+ *
+ *  Imagine, that's all! Forget about all this shamanism about CWND inflation
+ *  deflation etc. CWND is real congestion window, never inflated, changes
+ *  only according to classic VJ rules.
+ *
+ * Really tricky (and requiring careful tuning) part of algorithm
+ * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The first determines the moment _when_ we should reduce CWND and,
+ * hence, slow down forward transmission. In fact, it determines the moment
+ * when we decide that hole is caused by loss, rather than by a reorder.
+ *
+ * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
+ * holes, caused by lost packets.
+ *
+ * And the most logically complicated part of algorithm is undo
+ * heuristics. We detect false retransmits due to both too early
+ * fast retransmit (reordering) and underestimated RTO, analyzing
+ * timestamps and D-SACKs. When we detect that some segments were
+ * retransmitted by mistake and CWND reduction was wrong, we undo
+ * window reduction and abort recovery phase. This logic is hidden
+ * inside several functions named tcp_try_undo_<something>.
+ */
+
+/* This function decides, when we should leave Disordered state
+ * and enter Recovery phase, reducing congestion window.
+ *
+ * Main question: may we further continue forward transmission
+ * with the same cwnd?
+ *
+ * Returns 1 when it is time to enter Recovery and 0 when forward
+ * transmission may continue as before.
+ */
+static int tcp_time_to_recover(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 packets_out;
+
+	/* Do not perform any recovery during F-RTO algorithm */
+	if (tp->frto_counter)
+		return 0;
+
+	/* Trick#1: The loss is proven. */
+	if (tp->lost_out)
+		return 1;
+
+	/* Not-A-Trick#2 : Classic rule... */
+	if (tcp_dupack_heuristics(tp) > tp->reordering)
+		return 1;
+
+	/* Trick#3 : when we use RFC2988 timer restart, fast
+	 * retransmit can be triggered by timeout of queue head.
+	 */
+	if (tcp_is_fack(tp) && tcp_head_timedout(sk))
+		return 1;
+
+	/* Trick#4: It is still not OK... But will it be useful to delay
+	 * recovery more?
+	 */
+	packets_out = tp->packets_out;
+	if (packets_out <= tp->reordering &&
+	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+	    !tcp_may_send_now(sk)) {
+		/* We have nothing to send. This connection is limited
+		 * either by receiver window or by application.
+		 */
+		return 1;
+	}
+
+	/* If a thin stream is detected, retransmit after first
+	 * received dupack. Employ only if SACK is supported in order
+	 * to avoid possible corner-case series of spurious retransmissions
+	 * Use only if there are no unsent data.
+	 */
+	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+	    tcp_is_sack(tp) && !tcp_send_head(sk))
+		return 1;
+
+	return 0;
+}
+
+/* New heuristics: possible only since the timer is restarted each time
+ * something is ACKed. Hence, timed out packets can be detected during
+ * fast retransmit without falling back to slow start.
+ *
+ * Usefulness of this as is very questionable, since we should know which of
+ * the segments is the next to timeout which is relatively expensive to find
+ * in general case unless we add some data structure just for that. The
+ * current approach certainly won't find the right one too often and when it
+ * finally does find _something_ it usually marks large part of the window
+ * right away (because a retransmission with a larger timestamp blocks the
+ * loop from advancing). -ij
+ *
+ * Only acts for FACK connections whose queue head has timed out. Walks
+ * from scoreboard_skb_hint (or the queue head when no hint is set),
+ * marks every timed out skb lost and leaves the hint at the skb where
+ * the walk stopped.
+ */
+static void tcp_timeout_skbs(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
+		return;
+
+	skb = tp->scoreboard_skb_hint;
+	if (skb == NULL)
+		skb = tcp_write_queue_head(sk);
+
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk) || !tcp_skb_timedout(sk, skb))
+			break;
+
+		tcp_skb_mark_lost(tp, skb);
+	}
+
+	tp->scoreboard_skb_hint = skb;
+
+	tcp_verify_left_out(tp);
+}
+
+/* Mark head of queue up as lost. With RFC3517 SACK, the packet count
+ * is against sacked "cnt", otherwise it's against facked "cnt".
+ *
+ * @packets:   number of packets, counted from the head of the write
+ *             queue, to consider when marking
+ * @mark_head: when nonzero, at most the head skb is marked; nothing is
+ *             done if the lost hint has already moved past the head
+ *
+ * The walk resumes from lost_skb_hint/lost_cnt_hint when set and keeps
+ * them updated, and never goes beyond high_seq or the send head.
+ */
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int cnt, oldcnt;
+	unsigned int mss;
+	unsigned int lost;
+
+	WARN_ON(packets > tp->packets_out);
+	if (tp->lost_skb_hint) {
+		skb = tp->lost_skb_hint;
+		cnt = tp->lost_cnt_hint;
+		/* Head already handled? */
+		if (mark_head && skb != tcp_write_queue_head(sk))
+			return;
+	} else {
+		skb = tcp_write_queue_head(sk);
+		cnt = 0;
+	}
+
+	tcp_for_write_queue_from(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+		/* TODO: do this better */
+		/* this is not the most efficient way to do this... */
+		tp->lost_skb_hint = skb;
+		tp->lost_cnt_hint = cnt;
+
+		if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+			break;
+
+		oldcnt = cnt;
+		if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
+		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+			cnt += tcp_skb_pcount(skb);
+
+		if (cnt > packets) {
+			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+			    (oldcnt >= packets))
+				break;
+
+			/* Chop off the prefix to mark as lost, but only when
+			 * that prefix is really shorter than the skb: the
+			 * pcount of an skb does not guarantee that it holds
+			 * pcount * mss bytes (skbs merged by SACK shifting
+			 * can be shorter), and asking tcp_fragment() for
+			 * len >= skb->len would leave the skb unmarked.
+			 */
+			mss = skb_shinfo(skb)->gso_size;
+			lost = (packets - oldcnt) * mss;
+			if (lost < skb->len &&
+			    tcp_fragment(sk, skb, lost, mss) < 0)
+				break;
+			cnt = packets;
+		}
+
+		tcp_skb_mark_lost(tp, skb);
+
+		if (mark_head)
+			break;
+	}
+	tcp_verify_left_out(tp);
+}
+
+/* Account newly detected lost packet(s). */
+/* fast_rexmit is nonzero when tcp_fastretrans_alert() just entered Recovery. */
+static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tcp_is_reno(tp)) {
+		tcp_mark_head_lost(sk, 1, 1);	/* Reno: only the head skb */
+	} else if (tcp_is_fack(tp)) {
+		int lost = tp->fackets_out - tp->reordering;
+		if (lost <= 0)
+			lost = 1;	/* always ask for at least one packet */
+		tcp_mark_head_lost(sk, lost, 0);
+	} else {
+		int sacked_upto = tp->sacked_out - tp->reordering;
+		if (sacked_upto >= 0)
+			tcp_mark_head_lost(sk, sacked_upto, 0);
+		else if (fast_rexmit)
+			tcp_mark_head_lost(sk, 1, 1);
+	}
+
+	tcp_timeout_skbs(sk);
+}
+
+/* CWND moderation, preventing bursts due to too big ACKs in dubious
+ * situations: clamp snd_cwnd to packets in flight plus tcp_max_burst().
+ */
+static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+{
+	tp->snd_cwnd = min(tp->snd_cwnd,
+			   tcp_packets_in_flight(tp) + tcp_max_burst(tp));
+	tp->snd_cwnd_stamp = tcp_time_stamp;	/* stamp the cwnd change */
+}
+
+/* Floor for cwnd: ssthresh, unless the congestion module overrides it. */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+	const struct tcp_congestion_ops *ops = inet_csk(sk)->icsk_ca_ops;
+
+	if (ops->min_cwnd)
+		return ops->min_cwnd(sk);
+	return tcp_sk(sk)->snd_ssthresh;
+}
+
+/* Decrease cwnd each second ack. */
+static void tcp_cwnd_down(struct sock *sk, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int decr = tp->snd_cwnd_cnt + 1;
+
+	if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
+	    (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
+		tp->snd_cwnd_cnt = decr & 1;	/* odd remainder carries to the next call */
+		decr >>= 1;	/* half of the counted ACKs */
+
+		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
+			tp->snd_cwnd -= decr;
+
+		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+	}
+}
+
+/* Nothing was retransmitted, or the echoed timestamp predates the first
+ * retransmission's timestamp. */
+static inline int tcp_packet_delayed(struct tcp_sock *tp)
+{
+	if (!tp->retrans_stamp)
+		return 1;
+	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	       before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp);
+}
+
+/* Undo procedures. */
+
+#if FASTRETRANS_DEBUG > 1	/* debug build: log each undo event via printk */
+static void DBGUNDO(struct sock *sk, const char *msg)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (sk->sk_family == AF_INET) {
+		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
+		       msg,
+		       &inet->inet_daddr, ntohs(inet->inet_dport),
+		       tp->snd_cwnd, tcp_left_out(tp),
+		       tp->snd_ssthresh, tp->prior_ssthresh,
+		       tp->packets_out);
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
+		       msg,
+		       &np->daddr, ntohs(inet->inet_dport),
+		       tp->snd_cwnd, tcp_left_out(tp),
+		       tp->snd_ssthresh, tp->prior_ssthresh,
+		       tp->packets_out);
+	}
+#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
+}
+#else	/* otherwise DBGUNDO() expands to an empty statement */
+#define DBGUNDO(x...) do { } while (0)
+#endif /* FASTRETRANS_DEBUG > 1 */
+
+static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Undo a cwnd reduction; may also restore ssthresh if undo_ssthresh. */
+	if (tp->prior_ssthresh) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+
+		if (icsk->icsk_ca_ops->undo_cwnd)	/* CA module supplies its own undo */
+			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
+		else
+			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
+
+		if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
+			tp->snd_ssthresh = tp->prior_ssthresh;
+			TCP_ECN_withdraw_cwr(tp);
+		}
+	} else {	/* no saved ssthresh: only lift cwnd up to ssthresh */
+		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
+	}
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static inline int tcp_may_undo(struct tcp_sock *tp)
+{	/* needs undo_marker, plus zero undo_retrans or tcp_packet_delayed() */
+	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
+}
+
+/* People celebrate: "We love our President!" (full undo at end of recovery) */
+static int tcp_try_undo_recovery(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Returns 1 when the caller has to hold the current state, else 0. */
+	if (tcp_may_undo(tp)) {
+		int mib_idx;
+
+		/* Happy end! We did not retransmit anything
+		 * or our original transmission succeeded.
+		 */
+		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+		tcp_undo_cwr(sk, true);
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+			mib_idx = LINUX_MIB_TCPLOSSUNDO;
+		else
+			mib_idx = LINUX_MIB_TCPFULLUNDO;
+
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+		tp->undo_marker = 0;
+	}
+	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+		/* Hold old state until something *above* high_seq
+		 * is ACKed. For Reno it is MUST to prevent false
+		 * fast retransmits (RFC2582). SACK TCP is safe. */
+		tcp_moderate_cwnd(tp);
+		return 1;
+	}
+	tcp_set_ca_state(sk, TCP_CA_Open);
+	return 0;
+}
+
+/* D-SACKs covered every retransmission: try to undo the cwnd reduction. */
+static void tcp_try_undo_dsack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->undo_marker || tp->undo_retrans)
+		return;
+	DBGUNDO(sk, "D-SACK");
+	tcp_undo_cwr(sk, true);
+	tp->undo_marker = 0;
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+}
+
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumption doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could be that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static int tcp_any_retrans_done(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (tp->retrans_out)	/* retransmissions are still counted as out */
+		return 1;
+
+	skb = tcp_write_queue_head(sk);	/* may be NULL, hence the skb test below */
+	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+		return 1;
+
+	return 0;
+}
+
+/* Undo during fast recovery after partial ACK. */
+/* The result becomes do_lost in tcp_fastretrans_alert(). */
+static int tcp_try_undo_partial(struct sock *sk, int acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Partial ACK arrived. Force Hoe's retransmit. */
+	int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
+
+	if (tcp_may_undo(tp)) {
+		/* Plain luck! Hole is filled with delayed
+		 * packet, rather than with a retransmit.
+		 */
+		if (!tcp_any_retrans_done(sk))
+			tp->retrans_stamp = 0;
+
+		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
+
+		DBGUNDO(sk, "Hoe");
+		tcp_undo_cwr(sk, false);	/* cwnd only; ssthresh is left as is */
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+
+		/* So... Do not make Hoe's retransmit yet.
+		 * If the first packet was delayed, the rest
+		 * ones are most probably delayed as well.
+		 */
+		failed = 0;
+	}
+	return failed;
+}
+
+/* Undo during loss recovery after partial ACK. */
+static int tcp_try_undo_loss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Returns 1 if the loss state was undone, 0 otherwise. */
+	if (tcp_may_undo(tp)) {
+		struct sk_buff *skb;
+		tcp_for_write_queue(skb, sk) {
+			if (skb == tcp_send_head(sk))
+				break;
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;	/* drop the LOST mark */
+		}
+
+		tcp_clear_all_retrans_hints(tp);
+
+		DBGUNDO(sk, "partial loss");
+		tp->lost_out = 0;
+		tcp_undo_cwr(sk, true);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+		inet_csk(sk)->icsk_retransmits = 0;
+		tp->undo_marker = 0;
+		if (tcp_is_sack(tp))	/* only SACK connections go straight to Open */
+			tcp_set_ca_state(sk, TCP_CA_Open);
+		return 1;
+	}
+	return 0;
+}
+
+static inline void tcp_complete_cwr(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Do not moderate cwnd if it's already undone in cwr or recovery */
+	if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
+		tp->snd_cwnd = tp->snd_ssthresh;	/* bring cwnd down to ssthresh */
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+	}
+	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);	/* raised unconditionally */
+}
+
+static void tcp_try_keep_open(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int state = TCP_CA_Open;
+	/* Disorder if anything is left out, was retransmitted, or undo_marker is set. */
+	if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
+		state = TCP_CA_Disorder;
+
+	if (inet_csk(sk)->icsk_ca_state != state) {
+		tcp_set_ca_state(sk, state);
+		tp->high_seq = tp->snd_nxt;	/* updated only when the state changes */
+	}
+}
+
+static void tcp_try_to_open(struct sock *sk, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* Used when it is not time to recover: settle in Open, Disorder or CWR. */
+	tcp_verify_left_out(tp);
+
+	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
+		tp->retrans_stamp = 0;	/* no retransmissions left in the window */
+
+	if (flag & FLAG_ECE)
+		tcp_enter_cwr(sk, 1);
+
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
+		tcp_try_keep_open(sk);
+		tcp_moderate_cwnd(tp);
+	} else {
+		tcp_cwnd_down(sk, flag);	/* in CWR: keep reducing cwnd */
+	}
+}
+
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	/* Probe failed: cap the search just below probe_size, end the probe. */
+	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+	icsk->icsk_mtup.probe_size = 0;	/* zero means no probe outstanding */
+}
+
+static void tcp_mtup_probe_success(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* FIXME: breaks with very large cwnd (presumably the product overflows) */
+	tp->prior_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_cwnd = tp->snd_cwnd *
+		       tcp_mss_to_mtu(sk, tp->mss_cache) /
+		       icsk->icsk_mtup.probe_size;	/* rescale cwnd: old MTU / probed MTU */
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
+
+	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;	/* new lower bound */
+	icsk->icsk_mtup.probe_size = 0;
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery: every un-SACKed skb whose
+ * segment length exceeds the current MSS is marked lost and retransmitted.
+ * The socket is already locked here. */
+void tcp_simple_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int mss = tcp_current_mss(sk);
+	u32 prior_lost = tp->lost_out;
+
+	tcp_for_write_queue(skb, sk) {
+		if (skb == tcp_send_head(sk))
+			break;
+		if (tcp_skb_seglen(skb) > mss &&
+		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+			}
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+		}
+	}
+
+	tcp_clear_retrans_hints_partial(tp);
+
+	if (prior_lost == tp->lost_out)	/* nothing newly marked: nothing to send */
+		return;
+
+	if (tcp_is_reno(tp))
+		tcp_limit_reno_sacked(tp);
+
+	tcp_verify_left_out(tp);
+
+	/* Don't muck with the congestion window here.
+	 * Reason is that we do not increase amount of _data_
+	 * in network, but units changed and effective
+	 * cwnd/ssthresh really reduced now.
+	 */
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		tp->prior_ssthresh = 0;	/* together with undo_marker = 0: no undo */
+		tp->undo_marker = 0;
+		tcp_set_ca_state(sk, TCP_CA_Loss);
+	}
+	tcp_xmit_retransmit_queue(sk);
+}
+EXPORT_SYMBOL(tcp_simple_retransmit);
+
+/* Process an event, which can update packets-in-flight not trivially.
+ * Main goal of this function is to calculate new estimate for left_out,
+ * taking into account both packets sitting in receiver's buffer and
+ * packets lost by network.
+ *
+ * Besides that it does CWND reduction, when packet loss is detected
+ * and changes state of machine.
+ *
+ * It does _not_ decide what to send, that is done in function
+ * tcp_xmit_retransmit_queue().
+ */
+static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+				    (tcp_fackets_out(tp) > tp->reordering));
+	int fast_rexmit = 0, mib_idx;
+
+	if (WARN_ON(!tp->packets_out && tp->sacked_out))
+		tp->sacked_out = 0;
+	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+		tp->fackets_out = 0;
+
+	/* Now state machine starts.
+	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+	if (flag & FLAG_ECE)
+		tp->prior_ssthresh = 0;
+
+	/* B. In all the states check for reneging SACKs. */
+	if (tcp_check_sack_reneging(sk, flag))
+		return;
+
+	/* C. Process data loss notification, provided it is valid. */
+	if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
+	    before(tp->snd_una, tp->high_seq) &&
+	    icsk->icsk_ca_state != TCP_CA_Open &&
+	    tp->fackets_out > tp->reordering) {
+		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
+	}
+
+	/* D. Check consistency of the current state. */
+	tcp_verify_left_out(tp);
+
+	/* E. Check state exit conditions. State can be terminated
+	 *    when high_seq is ACKed. */
+	if (icsk->icsk_ca_state == TCP_CA_Open) {
+		WARN_ON(tp->retrans_out != 0);
+		tp->retrans_stamp = 0;
+	} else if (!before(tp->snd_una, tp->high_seq)) {
+		switch (icsk->icsk_ca_state) {
+		case TCP_CA_Loss:
+			icsk->icsk_retransmits = 0;
+			if (tcp_try_undo_recovery(sk))
+				return;
+			break;
+
+		case TCP_CA_CWR:
+			/* CWR is to be held until something *above* high_seq
+			 * is ACKed for CWR bit to reach receiver. */
+			if (tp->snd_una != tp->high_seq) {
+				tcp_complete_cwr(sk);
+				tcp_set_ca_state(sk, TCP_CA_Open);
+			}
+			break;
+
+		case TCP_CA_Disorder:
+			tcp_try_undo_dsack(sk);
+			if (!tp->undo_marker ||
+			    /* For SACK case do not Open to allow to undo
+			     * catching for all duplicate ACKs. */
+			    tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
+				tp->undo_marker = 0;
+				tcp_set_ca_state(sk, TCP_CA_Open);
+			}
+			break;
+
+		case TCP_CA_Recovery:
+			if (tcp_is_reno(tp))
+				tcp_reset_reno_sack(tp);
+			if (tcp_try_undo_recovery(sk))
+				return;
+			tcp_complete_cwr(sk);
+			break;
+		}
+	}
+
+	/* F. Process state. */
+	switch (icsk->icsk_ca_state) {
+	case TCP_CA_Recovery:
+		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+			if (tcp_is_reno(tp) && is_dupack)
+				tcp_add_reno_sack(sk);
+		} else
+			do_lost = tcp_try_undo_partial(sk, pkts_acked);
+		break;
+	case TCP_CA_Loss:
+		if (flag & FLAG_DATA_ACKED)
+			icsk->icsk_retransmits = 0;
+		if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
+			tcp_reset_reno_sack(tp);
+		if (!tcp_try_undo_loss(sk)) {
+			tcp_moderate_cwnd(tp);
+			tcp_xmit_retransmit_queue(sk);
+			return;
+		}
+		if (icsk->icsk_ca_state != TCP_CA_Open)
+			return;
+		/* Loss is undone; fall through to processing in Open state. */
+	default:
+		if (tcp_is_reno(tp)) {
+			if (flag & FLAG_SND_UNA_ADVANCED)
+				tcp_reset_reno_sack(tp);
+			if (is_dupack)
+				tcp_add_reno_sack(sk);
+		}
+
+		if (icsk->icsk_ca_state == TCP_CA_Disorder)
+			tcp_try_undo_dsack(sk);
+
+		if (!tcp_time_to_recover(sk)) {
+			tcp_try_to_open(sk, flag);
+			return;
+		}
+
+		/* MTU probe failure: don't reduce cwnd */
+		if (icsk->icsk_ca_state < TCP_CA_CWR &&
+		    icsk->icsk_mtup.probe_size &&
+		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
+			tcp_mtup_probe_failed(sk);
+			/* Restores the reduction we did in tcp_mtup_probe() */
+			tp->snd_cwnd++;
+			tcp_simple_retransmit(sk);
+			return;
+		}
+
+		/* Otherwise enter Recovery state */
+
+		if (tcp_is_reno(tp))
+			mib_idx = LINUX_MIB_TCPRENORECOVERY;
+		else
+			mib_idx = LINUX_MIB_TCPSACKRECOVERY;
+
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+		tp->high_seq = tp->snd_nxt;
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = tp->snd_una;	/* arms the undo checks */
+		tp->undo_retrans = tp->retrans_out;
+
+		if (icsk->icsk_ca_state < TCP_CA_CWR) {
+			if (!(flag & FLAG_ECE))
+				tp->prior_ssthresh = tcp_current_ssthresh(sk);
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+			TCP_ECN_queue_cwr(tp);
+		}
+
+		tp->bytes_acked = 0;
+		tp->snd_cwnd_cnt = 0;
+		tcp_set_ca_state(sk, TCP_CA_Recovery);
+		fast_rexmit = 1;	/* passed on to tcp_update_scoreboard() */
+	}
+
+	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
+		tcp_update_scoreboard(sk, fast_rexmit);
+	tcp_cwnd_down(sk, flag);
+	tcp_xmit_retransmit_queue(sk);
+}
+
+static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+{
+	tcp_rtt_estimator(sk, seq_rtt);	/* feed the sample to the estimator */
+	tcp_set_rto(sk);
+	inet_csk(sk)->icsk_backoff = 0;	/* a valid sample resets the backoff */
+}
+
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Supersedes RFC1323)
+ */
+static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
+{
+	/* RTTM Rule: A TSecr value received in a segment is used to
+	 * update the averaged RTT measurement only if the segment
+	 * acknowledges some new data, i.e., only if it advances the
+	 * left edge of the send window.
+	 *
+	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
+	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+	 *
+	 * Changed: reset backoff as soon as we see the first valid sample.
+	 * If we do not, we get strongly overestimated rto. With timestamps
+	 * samples are accepted even from very old segments: f.e., when rtt=1
+	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
+	 * answer arrives rto becomes 120 seconds! If at least one of segments
+	 * in window is lost... Voila.	 			--ANK (010210)
+	 */
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);	/* flag is unused here */
+}
+
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
+{
+	/* Karn's algorithm (SIGCOMM '87, p5.): without a timestamp
+	 * only packets that were never retransmitted may feed the
+	 * rtt estimator. Likewise the rto backoff must not be reset
+	 * until a non-retransmitted packet is ACKed, so that a
+	 * sudden increase of the network delay is handled properly.
+	 *
+	 * Hence an ACK covering retransmitted data yields no sample.
+	 */
+
+	if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
+		/* unambiguous sample: feed estimator, clear backoff */
+		tcp_valid_rtt_meas(sk, seq_rtt);
+	}
+}
+
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      const s32 seq_rtt)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
+	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		tcp_ack_saw_tstamp(sk, flag);	/* timestamp echo takes precedence */
+	else if (seq_rtt >= 0)	/* a negative seq_rtt means no sample */
+		tcp_ack_no_tstamp(sk, seq_rtt, flag);
+}
+
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight);	/* delegate to the CA module */
+	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Restart the retransmit timer after forward progress on the connection.
+ * RFC2988 recommends to restart timer to now+rto.
+ */
+static void tcp_rearm_rto(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->packets_out) {
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+	} else {
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+	}
+}
+
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 packets_acked;
+	/* Trims the acked head off skb; returns how many packets that removed. */
+	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
+
+	packets_acked = tcp_skb_pcount(skb);
+	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+		return 0;	/* nonzero from tcp_trim_head(): report nothing acked */
+	packets_acked -= tcp_skb_pcount(skb);	/* pcount before minus pcount after */
+
+	if (packets_acked) {
+		BUG_ON(tcp_skb_pcount(skb) == 0);
+		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
+	}
+
+	return packets_acked;
+}
+
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end. Returns the FLAG_* bits collected on the way.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
+			       u32 prior_snd_una)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb;
+	u32 now = tcp_time_stamp;
+	int fully_acked = 1;
+	int flag = 0;
+	u32 pkts_acked = 0;
+	u32 reord = tp->packets_out;
+	u32 prior_sacked = tp->sacked_out;
+	s32 seq_rtt = -1;
+	s32 ca_seq_rtt = -1;
+	ktime_t last_ackt = net_invalid_timestamp();
+
+	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		u32 acked_pcount;
+		u8 sacked = scb->sacked;
+
+		/* Determine how many packets and what bytes were acked, tso and else */
+		if (after(scb->end_seq, tp->snd_una)) {
+			if (tcp_skb_pcount(skb) == 1 ||
+			    !after(tp->snd_una, scb->seq))
+				break;
+
+			acked_pcount = tcp_tso_acked(sk, skb);
+			if (!acked_pcount)
+				break;
+
+			fully_acked = 0;	/* skb stays queued, trimmed */
+		} else {
+			acked_pcount = tcp_skb_pcount(skb);
+		}
+
+		if (sacked & TCPCB_RETRANS) {
+			if (sacked & TCPCB_SACKED_RETRANS)
+				tp->retrans_out -= acked_pcount;
+			flag |= FLAG_RETRANS_DATA_ACKED;
+			ca_seq_rtt = -1;	/* retransmitted data: discard rtt samples */
+			seq_rtt = -1;
+			if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
+				flag |= FLAG_NONHEAD_RETRANS_ACKED;
+		} else {
+			ca_seq_rtt = now - scb->when;
+			last_ackt = skb->tstamp;
+			if (seq_rtt < 0) {
+				seq_rtt = ca_seq_rtt;
+			}
+			if (!(sacked & TCPCB_SACKED_ACKED))
+				reord = min(pkts_acked, reord);
+		}
+
+		if (sacked & TCPCB_SACKED_ACKED)
+			tp->sacked_out -= acked_pcount;
+		if (sacked & TCPCB_LOST)
+			tp->lost_out -= acked_pcount;
+
+		tp->packets_out -= acked_pcount;
+		pkts_acked += acked_pcount;
+
+		/* Initial outgoing SYN's get put onto the write_queue
+		 * just like anything else we transmit.  It is not
+		 * true data, and if we misinform our callers that
+		 * this ACK acks real data, we will erroneously exit
+		 * connection startup slow start one packet too
+		 * quickly.  This is severely frowned upon behavior.
+		 */
+		if (!(scb->flags & TCPHDR_SYN)) {
+			flag |= FLAG_DATA_ACKED;
+		} else {
+			flag |= FLAG_SYN_ACKED;
+			tp->retrans_stamp = 0;
+		}
+
+		if (!fully_acked)
+			break;
+
+		tcp_unlink_write_queue(skb, sk);
+		sk_wmem_free_skb(sk, skb);
+		tp->scoreboard_skb_hint = NULL;
+		if (skb == tp->retransmit_skb_hint)	/* drop hints that pointed at skb */
+			tp->retransmit_skb_hint = NULL;
+		if (skb == tp->lost_skb_hint)
+			tp->lost_skb_hint = NULL;
+	}
+
+	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
+		tp->snd_up = tp->snd_una;
+
+	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+		flag |= FLAG_SACK_RENEGING;
+
+	if (flag & FLAG_ACKED) {
+		const struct tcp_congestion_ops *ca_ops
+			= inet_csk(sk)->icsk_ca_ops;
+
+		if (unlikely(icsk->icsk_mtup.probe_size &&
+			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
+			tcp_mtup_probe_success(sk);
+		}
+
+		tcp_ack_update_rtt(sk, flag, seq_rtt);
+		tcp_rearm_rto(sk);
+
+		if (tcp_is_reno(tp)) {
+			tcp_remove_reno_sacks(sk, pkts_acked);
+		} else {
+			int delta;
+
+			/* Non-retransmitted hole got filled? That's reordering */
+			if (reord < prior_fackets)
+				tcp_update_reordering(sk, tp->fackets_out - reord, 0);
+
+			delta = tcp_is_fack(tp) ? pkts_acked :
+						  prior_sacked - tp->sacked_out;
+			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
+		}
+
+		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
+
+		if (ca_ops->pkts_acked) {
+			s32 rtt_us = -1;	/* -1 is passed on when no sample is usable */
+
+			/* Is the ACK triggering packet unambiguous? */
+			if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
+				/* High resolution needed and available? */
+				if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
+				    !ktime_equal(last_ackt,
+						 net_invalid_timestamp()))
+					rtt_us = ktime_us_delta(ktime_get_real(),
+								last_ackt);
+				else if (ca_seq_rtt >= 0)
+					rtt_us = jiffies_to_usecs(ca_seq_rtt);
+			}
+
+			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
+		}
+	}
+
+#if FASTRETRANS_DEBUG > 0
+	WARN_ON((int)tp->sacked_out < 0);
+	WARN_ON((int)tp->lost_out < 0);
+	WARN_ON((int)tp->retrans_out < 0);
+	if (!tp->packets_out && tcp_is_sack(tp)) {
+		icsk = inet_csk(sk);
+		if (tp->lost_out) {
+			printk(KERN_DEBUG "Leak l=%u %d\n",
+			       tp->lost_out, icsk->icsk_ca_state);
+			tp->lost_out = 0;
+		}
+		if (tp->sacked_out) {
+			printk(KERN_DEBUG "Leak s=%u %d\n",
+			       tp->sacked_out, icsk->icsk_ca_state);
+			tp->sacked_out = 0;
+		}
+		if (tp->retrans_out) {
+			printk(KERN_DEBUG "Leak r=%u %d\n",
+			       tp->retrans_out, icsk->icsk_ca_state);
+			tp->retrans_out = 0;
+		}
+	}
+#endif
+	return flag;
+}
+
+static void tcp_ack_probe(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* Window-probe ACK handling. Was it a usable window open? */
+	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+		icsk->icsk_backoff = 0;
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
+		/* Socket must be waked up by subsequent tcp_data_snd_check().
+		 * This function is not for random using!
+		 */
+	} else {
+		/* Shift in 64 bits: icsk_rto << icsk_backoff can wrap in 32. */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min_t(u64, (u64)icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
+	}
+}
+
+static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
+{	/* dubious: no NOT_DUP bit, any CA_ALERT bit, or CA state is not Open */
+	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
+}
+
+static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
+{	/* never in Recovery/CWR; with ECE only while cwnd is below ssthresh */
+	const struct tcp_sock *tp = tcp_sk(sk);
+	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
+		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
+}
+
+/* Check that window update is acceptable: the ACK advances snd_una, or the
+ * segment is newer than snd_wl1, or it equals snd_wl1 with a larger window.
+ * The function assumes that snd_una<=ack<=snd_nxt. */
+static inline int tcp_may_update_window(const struct tcp_sock *tp,
+					const u32 ack, const u32 ack_seq,
+					const u32 nwin)
+{
+	return	after(ack, tp->snd_una) ||
+		after(ack_seq, tp->snd_wl1) ||
+		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
+}
+
+/* Update our send window and set snd_una to ack. Returns FLAG_WIN_UPDATE
+ * when the window update was accepted, 0 otherwise.
+ * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
+ * and in FreeBSD. NetBSD's one is even worse.) is wrong.
+ */
+static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
+				 u32 ack_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int flag = 0;
+	u32 nwin = ntohs(tcp_hdr(skb)->window);
+
+	if (likely(!tcp_hdr(skb)->syn))	/* window is not scaled on SYN segments */
+		nwin <<= tp->rx_opt.snd_wscale;
+
+	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
+		flag |= FLAG_WIN_UPDATE;
+		tcp_update_wl(tp, ack_seq);
+
+		if (tp->snd_wnd != nwin) {
+			tp->snd_wnd = nwin;
+
+			/* Note, it is the only place, where
+			 * fast path is recovered for sending TCP.
+			 */
+			tp->pred_flags = 0;
+			tcp_fast_path_check(sk);
+
+			if (nwin > tp->max_window) {
+				tp->max_window = nwin;
+				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
+			}
+		}
+	}
+
+	tp->snd_una = ack;	/* done whether or not the window was updated */
+
+	return flag;
+}
+
+/* A very conservative spurious RTO response algorithm: reduce cwnd to at
+ * most ssthresh, queue CWR, and continue in congestion avoidance.
+ */
+static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
+{
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd_cnt = 0;
+	tp->bytes_acked = 0;
+	TCP_ECN_queue_cwr(tp);
+	tcp_moderate_cwnd(tp);	/* additionally cap cwnd against bursts */
+}
+
+/* A conservative spurious RTO response algorithm: reduce cwnd using
+ * rate halving and continue in congestion avoidance (via tcp_enter_cwr()).
+ */
+static void tcp_ratehalving_spur_to_response(struct sock *sk)
+{
+	tcp_enter_cwr(sk, 0);
+}
+
+static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+{
+	if (!(flag & FLAG_ECE))
+		tcp_undo_cwr(sk, true);	/* no ECE: undo cwnd and ssthresh */
+	else
+		tcp_ratehalving_spur_to_response(sk);
+}
+
+/* F-RTO spurious RTO detection algorithm (RFC4138)
+ *
+ * F-RTO affects during two new ACKs following RTO (well, almost, see inline
+ * comments). State (ACK number) is kept in frto_counter. When ACK advances
+ * window (but not to or beyond highest sequence sent before RTO):
+ *   On First ACK,  send two new segments out.
+ *   On Second ACK, RTO was likely spurious. Do spurious response (response
+ *                  algorithm is not part of the F-RTO detection algorithm
+ *                  given in RFC4138 but can be selected separately).
+ * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
+ * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
+ * of Nagle, this is done using frto_counter states 2 and 3, when a new data
+ * segment of any size sent during F-RTO, state 2 is upgraded to 3.
+ *
+ * Rationale: if the RTO was spurious, new ACKs should arrive from the
+ * original window even after we transmit two new data segments.
+ *
+ * SACK version:
+ *   on first step, wait until first cumulative ACK arrives, then move to
+ *   the second step. In second step, the next ACK decides.
+ *
+ * F-RTO is implemented (mainly) in four functions:
+ *   - tcp_use_frto() is used to determine if TCP can use F-RTO
+ *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
+ *     called when tcp_use_frto() showed green light
+ *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
+ *   - tcp_enter_frto_loss() is called if there is not enough evidence
+ *     to prove that the RTO is indeed spurious. It transfers the control
+ *     from F-RTO to the conventional RTO recovery
+ */
+static int tcp_process_frto(struct sock *sk, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_verify_left_out(tp);
+
+	/* Duplicate the behavior from Loss state (fastretrans_alert) */
+	if (flag & FLAG_DATA_ACKED)
+		inet_csk(sk)->icsk_retransmits = 0;
+
+	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
+	    ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
+		tp->undo_marker = 0;
+
+	if (!before(tp->snd_una, tp->frto_highmark)) {	/* ACK reached frto_highmark */
+		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
+		return 1;
+	}
+
+	if (!tcp_is_sackfrto(tp)) {
+		/* RFC4138 shortcoming in step 2; should also have case c):
+		 * ACK isn't duplicate nor advances window, e.g., opposite dir
+		 * data, winupdate
+		 */
+		if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
+			return 1;
+
+		if (!(flag & FLAG_DATA_ACKED)) {
+			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
+					    flag);
+			return 1;
+		}
+	} else {
+		if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+			/* Prevent sending of new data. */
+			tp->snd_cwnd = min(tp->snd_cwnd,
+					   tcp_packets_in_flight(tp));
+			return 1;
+		}
+
+		if ((tp->frto_counter >= 2) &&
+		    (!(flag & FLAG_FORWARD_PROGRESS) ||
+		     ((flag & FLAG_DATA_SACKED) &&
+		      !(flag & FLAG_ONLY_ORIG_SACKED)))) {
+			/* RFC4138 shortcoming (see comment above) */
+			if (!(flag & FLAG_FORWARD_PROGRESS) &&
+			    (flag & FLAG_NOT_DUP))
+				return 1;
+
+			tcp_enter_frto_loss(sk, 3, flag);
+			return 1;
+		}
+	}
+
+	if (tp->frto_counter == 1) {
+		/* tcp_may_send_now needs to see updated state */
+		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+		tp->frto_counter = 2;
+
+		if (!tcp_may_send_now(sk))
+			tcp_enter_frto_loss(sk, 2, flag);
+
+		return 1;
+	} else {
+		switch (sysctl_tcp_frto_response) {	/* selectable spurious-RTO response */
+		case 2:
+			tcp_undo_spur_to_response(sk, flag);
+			break;
+		case 1:
+			tcp_conservative_spur_to_response(tp);
+			break;
+		default:
+			tcp_ratehalving_spur_to_response(sk);
+			break;
+		}
+		tp->frto_counter = 0;	/* F-RTO is finished */
+		tp->undo_marker = 0;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
+	}
+	return 0;
+}
+
+/* Deal with an incoming ACK. Returns 1 if processed, 0 if old, -1 if invalid. */
+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_snd_una = tp->snd_una; /* snd_una before this ACK is applied */
+	u32 ack_seq = TCP_SKB_CB(skb)->seq; /* sequence number of the ACK segment */
+	u32 ack = TCP_SKB_CB(skb)->ack_seq; /* sequence number being acknowledged */
+	u32 prior_in_flight;
+	u32 prior_fackets;
+	int prior_packets;
+	int frto_cwnd = 0; /* tcp_process_frto() result; nonzero skips tcp_cong_avoid() */
+
+	/* If the ack is older than previous acks
+	 * then we can probably ignore it.
+	 */
+	if (before(ack, prior_snd_una))
+		goto old_ack;
+
+	/* If the ack includes data we haven't sent yet, discard
+	 * this segment (RFC793 Section 3.9).
+	 */
+	if (after(ack, tp->snd_nxt))
+		goto invalid_ack;
+
+	if (after(ack, prior_snd_una))
+		flag |= FLAG_SND_UNA_ADVANCED;
+
+	if (sysctl_tcp_abc) {
+		if (icsk->icsk_ca_state < TCP_CA_CWR)
+			tp->bytes_acked += ack - prior_snd_una;
+		else if (icsk->icsk_ca_state == TCP_CA_Loss)
+			/* we assume just one segment has left the network */
+			tp->bytes_acked += min(ack - prior_snd_una,
+					       tp->mss_cache);
+	}
+
+	prior_fackets = tp->fackets_out;
+	prior_in_flight = tcp_packets_in_flight(tp);
+
+	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+		/* Window is constant, pure forward advance.
+		 * No more checks are required.
+		 * Note, we use the fact that SND.UNA>=SND.WL2.
+		 */
+		tcp_update_wl(tp, ack_seq);
+		tp->snd_una = ack;
+		flag |= FLAG_WIN_UPDATE;
+
+		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
+	} else {
+		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+			flag |= FLAG_DATA;
+		else
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+
+		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+
+		if (TCP_SKB_CB(skb)->sacked)
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+
+		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
+			flag |= FLAG_ECE;
+
+		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+	}
+
+	/* We passed data and got it acked, remove any soft error
+	 * log. Something worked...
+	 */
+	sk->sk_err_soft = 0;
+	icsk->icsk_probes_out = 0;
+	tp->rcv_tstamp = tcp_time_stamp;
+	prior_packets = tp->packets_out;
+	if (!prior_packets)
+		goto no_queue; /* nothing was outstanding, so nothing to clean up */
+
+	/* See if we can take anything off of the retransmit queue. */
+	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
+
+	if (tp->frto_counter)
+		frto_cwnd = tcp_process_frto(sk, flag);
+	/* Guarantee sacktag reordering detection against wrap-arounds */
+	if (before(tp->frto_highmark, tp->snd_una))
+		tp->frto_highmark = 0;
+
+	if (tcp_ack_is_dubious(sk, flag)) {
+		/* Advance CWND, if state allows this. */
+		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
+		    tcp_may_raise_cwnd(sk, flag))
+			tcp_cong_avoid(sk, ack, prior_in_flight);
+		tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
+				      flag);
+	} else {
+		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
+			tcp_cong_avoid(sk, ack, prior_in_flight);
+	}
+
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+		dst_confirm(__sk_dst_get(sk));
+
+	return 1;
+
+no_queue:
+	/* If this ack opens up a zero window, clear backoff.  It was
+	 * being used to time the probes, and is probably far higher than
+	 * it needs to be for normal retransmission.
+	 */
+	if (tcp_send_head(sk))
+		tcp_ack_probe(sk);
+	return 1;
+
+invalid_ack: /* ACK for data beyond snd_nxt */
+	SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+	return -1;
+
+old_ack: /* ACK below snd_una: still process any SACK information it carries */
+	if (TCP_SKB_CB(skb)->sacked) {
+		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		if (icsk->icsk_ca_state == TCP_CA_Open)
+			tcp_try_keep_open(sk);
+	}
+
+	SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+	return 0;
+}
+
+/* Look for TCP options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails. Results are stored in *opt_rx (and *hvpp).
+ */
+void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
+		       u8 **hvpp, int estab)
+{
+	unsigned char *ptr;
+	struct tcphdr *th = tcp_hdr(skb);
+	int length = (th->doff * 4) - sizeof(struct tcphdr); /* bytes of options */
+
+	ptr = (unsigned char *)(th + 1); /* options start right after the fixed header */
+	opt_rx->saw_tstamp = 0;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			opsize = *ptr++; /* NOTE(review): no length >= 2 check first; looks like a 1-byte read past the options when length == 1 */
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				return;	/* don't parse partial options */
+			switch (opcode) {
+			case TCPOPT_MSS:
+				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
+					u16 in_mss = get_unaligned_be16(ptr);
+					if (in_mss) {
+						if (opt_rx->user_mss &&
+						    opt_rx->user_mss < in_mss)
+							in_mss = opt_rx->user_mss;
+						opt_rx->mss_clamp = in_mss;
+					}
+				}
+				break;
+			case TCPOPT_WINDOW:
+				if (opsize == TCPOLEN_WINDOW && th->syn &&
+				    !estab && sysctl_tcp_window_scaling) {
+					__u8 snd_wscale = *(__u8 *)ptr;
+					opt_rx->wscale_ok = 1;
+					if (snd_wscale > 14) {
+						if (net_ratelimit())
+							printk(KERN_INFO "tcp_parse_options: Illegal window "
+							       "scaling value %d >14 received.\n",
+							       snd_wscale);
+						snd_wscale = 14; /* clamp to the maximum accepted shift */
+					}
+					opt_rx->snd_wscale = snd_wscale;
+				}
+				break;
+			case TCPOPT_TIMESTAMP:
+				if ((opsize == TCPOLEN_TIMESTAMP) &&
+				    ((estab && opt_rx->tstamp_ok) ||
+				     (!estab && sysctl_tcp_timestamps))) {
+					opt_rx->saw_tstamp = 1;
+					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
+				}
+				break;
+			case TCPOPT_SACK_PERM:
+				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
+				    !estab && sysctl_tcp_sack) {
+					opt_rx->sack_ok = 1;
+					tcp_sack_reset(opt_rx);
+				}
+				break;
+
+			case TCPOPT_SACK:
+				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+				   opt_rx->sack_ok) {
+					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; /* offset of the SACK option from the header start */
+				}
+				break;
+#ifdef CONFIG_TCP_MD5SIG
+			case TCPOPT_MD5SIG:
+				/*
+				 * The MD5 Hash has already been
+				 * checked (see tcp_v{4,6}_do_rcv()).
+				 */
+				break;
+#endif
+			case TCPOPT_COOKIE:
+				/* This option is variable length.
+				 */
+				switch (opsize) {
+				case TCPOLEN_COOKIE_BASE:
+					/* not yet implemented */
+					break;
+				case TCPOLEN_COOKIE_PAIR:
+					/* not yet implemented */
+					break;
+				case TCPOLEN_COOKIE_MIN+0:
+				case TCPOLEN_COOKIE_MIN+2:
+				case TCPOLEN_COOKIE_MIN+4:
+				case TCPOLEN_COOKIE_MIN+6:
+				case TCPOLEN_COOKIE_MAX:
+					/* 16-bit multiple: record size and payload location */
+					opt_rx->cookie_plus = opsize;
+					*hvpp = ptr;
+					break;
+				default:
+					/* ignore option */
+					break;
+				}
+				break;
+			}
+
+			ptr += opsize-2; /* kind and length bytes were already consumed */
+			length -= opsize;
+		}
+	}
+}
+EXPORT_SYMBOL(tcp_parse_options);
+
+static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
+{
+	__be32 *ptr = (__be32 *)(th + 1); /* first 32-bit word after the fixed header */
+
+	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { /* NOP, NOP, TIMESTAMP, length */
+		tp->rx_opt.saw_tstamp = 1;
+		++ptr;
+		tp->rx_opt.rcv_tsval = ntohl(*ptr);
+		++ptr;
+		tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+		return 1; /* timestamp stored in tp->rx_opt */
+	}
+	return 0; /* first word is not the aligned timestamp pattern */
+}
+
+/* Fast option parsing: expects no options or just an aligned timestamp.
+ * Anything else falls back on tcp_parse_options(). Returns 0 only if no options.
+ */
+static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
+				  struct tcp_sock *tp, u8 **hvpp)
+{
+	/* In the spirit of fast parsing, compare doff directly to constant
+	 * values.  Because equality is used, short doff can be ignored here.
+	 */
+	if (th->doff == (sizeof(*th) / 4)) {
+		tp->rx_opt.saw_tstamp = 0;
+		return 0; /* header carries no options */
+	} else if (tp->rx_opt.tstamp_ok &&
+		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
+		if (tcp_parse_aligned_timestamp(tp, th))
+			return 1;
+	}
+	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); /* slow path, estab = 1 */
+	return 1;
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * Find the MD5 signature option; return a pointer to its payload, or NULL.
+ */
+u8 *tcp_parse_md5sig_option(struct tcphdr *th)
+{
+	int length = (th->doff << 2) - sizeof (*th);
+	u8 *ptr = (u8*)(th + 1);
+
+	/* If the option area cannot hold an MD5 option, we can short cut */
+	if (length < TCPOLEN_MD5SIG)
+		return NULL;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch(opcode) {
+		case TCPOPT_EOL:
+			return NULL;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		default:
+			opsize = *ptr++; /* NOTE(review): no length >= 2 check first; looks like a 1-byte read past the options when length == 1 */
+			if (opsize < 2 || opsize > length)
+				return NULL;
+			if (opcode == TCPOPT_MD5SIG)
+				return opsize == TCPOLEN_MD5SIG ? ptr : NULL; /* ptr is just past the kind/length bytes */
+		}
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
+#endif
+
+static inline void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; /* remember the timestamp value just received */
+	tp->rx_opt.ts_recent_stamp = get_seconds(); /* and when it was recorded */
+}
+
+static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
+		 * extra check below makes sure this can only happen
+		 * for pure ACK frames.  -DaveM
+		 *
+		 * Not only that: it also occurs for expired timestamps.
+		 */
+
+		if (tcp_paws_check(&tp->rx_opt, 0))
+			tcp_store_ts_recent(tp); /* adopt rcv_tsval as the new ts_recent */
+	}
+}
+
+/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+ *
+ * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
+ * it can pass through the stack. So, the following predicate verifies that
+ * this segment is not used for anything but congestion avoidance or
+ * fast retransmit. Moreover, we are even able to eliminate most of such
+ * second order effects, if we apply some small "replay" window (~RTO)
+ * to timestamp space.
+ *
+ * All these measures still do not guarantee that we reject wrapped ACKs
+ * on networks with high bandwidth, when sequence space is recycled quickly,
+ * but they guarantee that such events will be very rare and do not affect
+ * the connection seriously. This doesn't look nice, but alas, PAWS is really
+ * a buggy extension.
+ *
+ * [ Later note. Even worse! It is buggy for segments _with_ data. The RFC
+ * states that events when a retransmit arrives after original data are rare.
+ * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
+ * the biggest problem on large power networks even with minor reordering.
+ * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
+ * up to bandwidth of 18Gigabit/sec. 8) ]
+ */
+
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
+	u32 seq = TCP_SKB_CB(skb)->seq;
+	u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+	return (/* 1. Pure ACK with correct sequence number. */
+		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+
+		/* 2. ... and duplicate ACK. */
+		ack == tp->snd_una &&
+
+		/* 3. ... and does not update window. */
+		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+
+		/* 4. ... and sits in replay window (timestamp lags by at most ~RTO). */
+		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
+}
+
+static inline int tcp_paws_discard(const struct sock *sk,
+				   const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && /* PAWS check failed ... */
+	       !tcp_disordered_ack(sk, skb); /* ... and it is not a tolerated disordered ACK */
+}
+
+/* Check segment sequence number for validity.
+ *
+ * Segment controls are considered valid, if the segment
+ * fits in the window after truncation to the window. Acceptability
+ * of data (and SYN, FIN, of course) is checked separately.
+ * See tcp_data_queue(), for example.
+ *
+ * Also, controls (RST is main one) are accepted using RCV.WUP instead
+ * of RCV.NXT. Peer still did not advance his SND.UNA when we
+ * delayed ACK, so that his SND.UNA <= our RCV.WUP.
+ * (borrowed from freebsd)
+ */
+
+static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+	return	!before(end_seq, tp->rcv_wup) && /* does not end before RCV.WUP */
+		!after(seq, tp->rcv_nxt + tcp_receive_window(tp)); /* does not start past the window's right edge */
+}
+
+/* When we get a reset we do this: set sk_err by state, report it, finish. */
+static void tcp_reset(struct sock *sk)
+{
+	/* We want the right error as BSD sees it (and indeed as we do). */
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+		sk->sk_err = ECONNREFUSED;
+		break;
+	case TCP_CLOSE_WAIT:
+		sk->sk_err = EPIPE;
+		break;
+	case TCP_CLOSE:
+		return; /* already closed: no error set, no tcp_done() */
+	default:
+		sk->sk_err = ECONNRESET;
+	}
+	/* This barrier is coupled with smp_rmb() in tcp_poll() */
+	smp_wmb();
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_error_report(sk);
+
+	tcp_done(sk);
+}
+
+/*
+ *	Process the FIN bit. This now behaves as it is supposed to work
+ *	and the FIN takes effect when it is validly part of sequence
+ *	space, not earlier while there are still holes.
+ *
+ *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
+ *	TIME-WAIT)
+ *
+ *	If we are in FINWAIT-1, a received FIN indicates simultaneous
+ *	close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ */
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	inet_csk_schedule_ack(sk);
+
+	sk->sk_shutdown |= RCV_SHUTDOWN; /* no more data will arrive from the peer */
+	sock_set_flag(sk, SOCK_DONE);
+
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_ESTABLISHED:
+		/* Move to CLOSE_WAIT */
+		tcp_set_state(sk, TCP_CLOSE_WAIT);
+		inet_csk(sk)->icsk_ack.pingpong = 1;
+		break;
+
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+		/* Received a retransmission of the FIN, do
+		 * nothing.
+		 */
+		break;
+	case TCP_LAST_ACK:
+		/* RFC793: Remain in the LAST-ACK state. */
+		break;
+
+	case TCP_FIN_WAIT1:
+		/* This case occurs when a simultaneous close
+		 * happens, we must ack the received FIN and
+		 * enter the CLOSING state.
+		 */
+		tcp_send_ack(sk);
+		tcp_set_state(sk, TCP_CLOSING);
+		break;
+	case TCP_FIN_WAIT2:
+		/* Received a FIN -- send ACK and enter TIME_WAIT. */
+		tcp_send_ack(sk);
+		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+		break;
+	default:
+		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
+		 * cases we should never reach this piece of code.
+		 */
+		printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+		       __func__, sk->sk_state);
+		break;
+	}
+
+	/* It _is_ possible that we have something out-of-order _after_ FIN.
+	 * Probably, we should reset in this case. For now drop them.
+	 */
+	__skb_queue_purge(&tp->out_of_order_queue);
+	if (tcp_is_sack(tp))
+		tcp_sack_reset(&tp->rx_opt);
+	sk_mem_reclaim(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+
+		/* Do not send POLL_HUP for half duplex close. */
+		if (sk->sk_shutdown == SHUTDOWN_MASK ||
+		    sk->sk_state == TCP_CLOSE)
+			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+		else
+			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	}
+}
+
+static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
+				  u32 end_seq)
+{
+	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { /* ranges overlap or touch */
+		if (before(seq, sp->start_seq))
+			sp->start_seq = seq;
+		if (after(end_seq, sp->end_seq))
+			sp->end_seq = end_seq;
+		return 1; /* *sp now covers the union of both ranges */
+	}
+	return 0; /* disjoint: *sp left untouched */
+}
+
+static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+		int mib_idx;
+
+		if (before(seq, tp->rcv_nxt)) /* duplicate starts below rcv_nxt */
+			mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
+		else
+			mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
+
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+		tp->rx_opt.dsack = 1; /* duplicate_sack[0] now holds a range */
+		tp->duplicate_sack[0].start_seq = seq;
+		tp->duplicate_sack[0].end_seq = end_seq;
+	}
+}
+
+static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rx_opt.dsack)
+		tcp_dsack_set(sk, seq, end_seq); /* nothing recorded yet: start a new range */
+	else
+		tcp_sack_extend(tp->duplicate_sack, seq, end_seq); /* grow the recorded range if they touch */
+}
+
+static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* non-empty segment starting below rcv_nxt */
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+		tcp_enter_quickack_mode(sk);
+
+		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
+			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
+				end_seq = tp->rcv_nxt; /* report only the part below rcv_nxt */
+			tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
+		}
+	}
+
+	tcp_send_ack(sk); /* the ACK itself is sent unconditionally */
+}
+
+/* These routines update the SACK blocks as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
+{
+	int this_sack;
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	struct tcp_sack_block *swalk = sp + 1;
+
+	/* See if the recent change to the first SACK eats into
+	 * or hits the sequence space of other SACK blocks, if so coalesce.
+	 */
+	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
+		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+			int i;
+
+			/* Zap SWALK, by moving every further SACK up by one slot.
+			 * Decrease num_sacks.
+			 */
+			tp->rx_opt.num_sacks--;
+			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
+				sp[i] = sp[i + 1];
+			continue; /* swalk now refers to the block shifted into this slot */
+		}
+		this_sack++, swalk++;
+	}
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	int cur_sacks = tp->rx_opt.num_sacks;
+	int this_sack;
+
+	if (!cur_sacks)
+		goto new_sack; /* empty list: sp already points at slot 0 */
+
+	for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
+		if (tcp_sack_extend(sp, seq, end_seq)) {
+			/* Rotate this_sack to the first one. */
+			for (; this_sack > 0; this_sack--, sp--)
+				swap(*sp, *(sp - 1));
+			if (cur_sacks > 1)
+				tcp_sack_maybe_coalesce(tp);
+			return;
+		}
+	}
+
+	/* Could not find an adjacent existing SACK, build a new one,
+	 * put it at the front, and shift everyone else down.  We
+	 * always know there is at least one SACK present already here.
+	 *
+	 * If the sack array is full, forget about the last one.
+	 */
+	if (this_sack >= TCP_NUM_SACKS) {
+		this_sack--;
+		tp->rx_opt.num_sacks--;
+		sp--;
+	}
+	for (; this_sack > 0; this_sack--, sp--)
+		*sp = *(sp - 1);
+
+new_sack:
+	/* Build the new head SACK, and we're done. */
+	sp->start_seq = seq;
+	sp->end_seq = end_seq;
+	tp->rx_opt.num_sacks++;
+}
+
+/* RCV.NXT advanced: drop the SACK blocks whose start it now covers. */
+
+static void tcp_sack_remove(struct tcp_sock *tp)
+{
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	int num_sacks = tp->rx_opt.num_sacks;
+	int this_sack;
+
+	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+	if (skb_queue_empty(&tp->out_of_order_queue)) {
+		tp->rx_opt.num_sacks = 0;
+		return;
+	}
+
+	for (this_sack = 0; this_sack < num_sacks;) {
+		/* Check if the start of the sack is covered by RCV.NXT. */
+		if (!before(tp->rcv_nxt, sp->start_seq)) {
+			int i;
+
+			/* RCV.NXT must cover all the block! */
+			WARN_ON(before(tp->rcv_nxt, sp->end_seq));
+
+			/* Zap this SACK, by moving forward any other SACKS. */
+			for (i=this_sack+1; i < num_sacks; i++)
+				tp->selective_acks[i-1] = tp->selective_acks[i];
+			num_sacks--;
+			continue; /* sp now refers to the block shifted into this slot */
+		}
+		this_sack++;
+		sp++;
+	}
+	tp->rx_opt.num_sacks = num_sacks;
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue, advancing rcv_nxt as it goes.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 dsack_high = tp->rcv_nxt;
+	struct sk_buff *skb;
+
+	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+			break; /* still a hole before the first queued skb */
+
+		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
+			__u32 dsack = dsack_high;
+			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
+				dsack_high = TCP_SKB_CB(skb)->end_seq;
+			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
+		}
+
+		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+			SOCK_DEBUG(sk, "ofo packet was already received\n");
+			__skb_unlink(skb, &tp->out_of_order_queue);
+			__kfree_skb(skb);
+			continue;
+		}
+		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
+			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(skb)->end_seq);
+
+		__skb_unlink(skb, &tp->out_of_order_queue);
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		if (tcp_hdr(skb)->fin)
+			tcp_fin(skb, sk, tcp_hdr(skb));
+	}
+}
+
+static int tcp_prune_ofo_queue(struct sock *sk); /* defined further down */
+static int tcp_prune_queue(struct sock *sk); /* defined further down */
+
+static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+{
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	    !sk_rmem_schedule(sk, size)) {
+
+		if (tcp_prune_queue(sk) < 0)
+			return -1; /* pruning says: drop incoming data */
+
+		if (!sk_rmem_schedule(sk, size)) {
+			if (!tcp_prune_ofo_queue(sk))
+				return -1; /* nothing left to purge */
+
+			if (!sk_rmem_schedule(sk, size))
+				return -1; /* still no room after purging */
+		}
+	}
+	return 0; /* size bytes can be charged to the socket */
+}
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int eaten = -1; /* -1: not copied to user, 0: partly copied, 1: fully copied */
+
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
+		goto drop; /* segment occupies no sequence space */
+
+	skb_dst_drop(skb);
+	__skb_pull(skb, th->doff * 4);
+
+	TCP_ECN_accept_cwr(tp, skb);
+
+	tp->rx_opt.dsack = 0;
+
+	/*  Queue data for delivery to the user.
+	 *  Packets in sequence go to the receive queue.
+	 *  Out of sequence packets to the out_of_order_queue.
+	 */
+	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		if (tcp_receive_window(tp) == 0)
+			goto out_of_window;
+
+		/* Ok. In sequence. In window. */
+		if (tp->ucopy.task == current &&
+		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
+		    sock_owned_by_user(sk) && !tp->urg_data) {
+			int chunk = min_t(unsigned int, skb->len,
+					  tp->ucopy.len);
+
+			__set_current_state(TASK_RUNNING);
+
+			local_bh_enable();
+			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
+				tp->ucopy.len -= chunk;
+				tp->copied_seq += chunk;
+				eaten = (chunk == skb->len);
+				tcp_rcv_space_adjust(sk);
+			}
+			local_bh_disable();
+		}
+
+		if (eaten <= 0) {
+queue_and_out:
+			if (eaten < 0 &&
+			    tcp_try_rmem_schedule(sk, skb->truesize))
+				goto drop;
+
+			skb_set_owner_r(skb, sk);
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		}
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		if (skb->len)
+			tcp_event_data_recv(sk, skb);
+		if (th->fin)
+			tcp_fin(skb, sk, th);
+
+		if (!skb_queue_empty(&tp->out_of_order_queue)) {
+			tcp_ofo_queue(sk);
+
+			/* RFC2581. 4.2. SHOULD send immediate ACK, when
+			 * gap in queue is filled.
+			 */
+			if (skb_queue_empty(&tp->out_of_order_queue))
+				inet_csk(sk)->icsk_ack.pingpong = 0;
+		}
+
+		if (tp->rx_opt.num_sacks)
+			tcp_sack_remove(tp);
+
+		tcp_fast_path_check(sk);
+
+		if (eaten > 0)
+			__kfree_skb(skb); /* fully copied to user: skb was never queued */
+		else if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_data_ready(sk, 0);
+		return;
+	}
+
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		/* A retransmit, 2nd most common case.  Force an immediate ack. */
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+out_of_window:
+		tcp_enter_quickack_mode(sk);
+		inet_csk_schedule_ack(sk);
+drop:
+		__kfree_skb(skb);
+		return;
+	}
+
+	/* Out of window. F.e. zero window probe. */
+	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+		goto out_of_window;
+
+	tcp_enter_quickack_mode(sk);
+
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		/* Partial packet, seq < rcv_next < end_seq */
+		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
+			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(skb)->end_seq);
+
+		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
+
+		/* If window is closed, drop tail of packet. But after
+		 * remembering D-SACK for its head made in previous line.
+		 */
+		if (!tcp_receive_window(tp))
+			goto out_of_window;
+		goto queue_and_out;
+	}
+
+	TCP_ECN_check_ce(tp, skb);
+
+	if (tcp_try_rmem_schedule(sk, skb->truesize))
+		goto drop;
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+	inet_csk_schedule_ack(sk);
+
+	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+	skb_set_owner_r(skb, sk);
+
+	if (!skb_peek(&tp->out_of_order_queue)) {
+		/* Initial out of order segment, build 1 SACK. */
+		if (tcp_is_sack(tp)) {
+			tp->rx_opt.num_sacks = 1;
+			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+			tp->selective_acks[0].end_seq =
+						TCP_SKB_CB(skb)->end_seq;
+		}
+		__skb_queue_head(&tp->out_of_order_queue, skb);
+	} else {
+		struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
+		u32 seq = TCP_SKB_CB(skb)->seq;
+		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (seq == TCP_SKB_CB(skb1)->end_seq) {
+			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+			if (!tp->rx_opt.num_sacks ||
+			    tp->selective_acks[0].end_seq != seq)
+				goto add_sack;
+
+			/* Common case: data arrive in order after hole. */
+			tp->selective_acks[0].end_seq = end_seq;
+			return;
+		}
+
+		/* Find place to insert this segment (walking back from the tail). */
+		while (1) {
+			if (!after(TCP_SKB_CB(skb1)->seq, seq))
+				break;
+			if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+				skb1 = NULL;
+				break;
+			}
+			skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+		}
+
+		/* Does skb overlap the previous one? */
+		if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* All the bits are present. Drop. */
+				__kfree_skb(skb);
+				tcp_dsack_set(sk, seq, end_seq); /* uses the saved seq/end_seq, not the freed skb */
+				goto add_sack;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* Partial overlap. */
+				tcp_dsack_set(sk, seq,
+					      TCP_SKB_CB(skb1)->end_seq);
+			} else {
+				if (skb_queue_is_first(&tp->out_of_order_queue,
+						       skb1))
+					skb1 = NULL;
+				else
+					skb1 = skb_queue_prev(
+						&tp->out_of_order_queue,
+						skb1);
+			}
+		}
+		if (!skb1)
+			__skb_queue_head(&tp->out_of_order_queue, skb);
+		else
+			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+		/* And clean segments covered by new one as whole. */
+		while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+			skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+
+			if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+				break;
+			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+						 end_seq);
+				break;
+			}
+			__skb_unlink(skb1, &tp->out_of_order_queue);
+			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+					 TCP_SKB_CB(skb1)->end_seq);
+			__kfree_skb(skb1);
+		}
+
+add_sack:
+		if (tcp_is_sack(tp))
+			tcp_sack_new_ofo_skb(sk, seq, end_seq);
+	}
+}
+
+static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
+					struct sk_buff_head *list)
+{
+	struct sk_buff *next = NULL;
+
+	if (!skb_queue_is_last(list, skb))
+		next = skb_queue_next(list, skb); /* fetch the successor before unlinking */
+
+	__skb_unlink(skb, list);
+	__kfree_skb(skb);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+
+	return next; /* NULL if skb was the last one in the list */
+}
+
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ *
+ * If tail is NULL, this means until the end of the list.
+ *
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies the code).
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+	     struct sk_buff *head, struct sk_buff *tail,
+	     u32 start, u32 end)
+{
+	struct sk_buff *skb, *n;
+	bool end_of_skbs;
+
+	/* First, check that queue is collapsible and find
+	 * the point where collapsing can be useful. */
+	skb = head;
+restart:
+	end_of_skbs = true;
+	skb_queue_walk_from_safe(list, skb, n) {
+		if (skb == tail)
+			break;
+		/* No new bits? It is possible on ofo queue. */
+		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+			skb = tcp_collapse_one(sk, skb, list);
+			if (!skb)
+				break;
+			goto restart;
+		}
+
+		/* The first skb to collapse is:
+		 * - not SYN/FIN and
+		 * - bloated or contains data before "start" or
+		 *   overlaps to the next one.
+		 */
+		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
+		    (tcp_win_from_space(skb->truesize) > skb->len ||
+		     before(TCP_SKB_CB(skb)->seq, start))) {
+			end_of_skbs = false;
+			break;
+		}
+
+		if (!skb_queue_is_last(list, skb)) {
+			struct sk_buff *next = skb_queue_next(list, skb);
+			if (next != tail &&
+			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
+				end_of_skbs = false;
+				break;
+			}
+		}
+
+		/* Decided to skip this, advance start seq. */
+		start = TCP_SKB_CB(skb)->end_seq;
+	}
+	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+		return;
+
+	while (before(start, end)) {
+		struct sk_buff *nskb;
+		unsigned int header = skb_headroom(skb);
+		int copy = SKB_MAX_ORDER(header, 0);
+
+		/* Too big header? This can happen with IPv6. */
+		if (copy < 0)
+			return;
+		if (end - start < copy)
+			copy = end - start; /* never copy past the requested end */
+		nskb = alloc_skb(copy + header, GFP_ATOMIC);
+		if (!nskb)
+			return;
+
+		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
+		skb_set_network_header(nskb, (skb_network_header(skb) -
+					      skb->head));
+		skb_set_transport_header(nskb, (skb_transport_header(skb) -
+						skb->head));
+		skb_reserve(nskb, header);
+		memcpy(nskb->head, skb->head, header);
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; /* starts empty; end_seq grows as data is copied */
+		__skb_queue_before(list, skb, nskb);
+		skb_set_owner_r(nskb, sk);
+
+		/* Copy data, releasing collapsed skbs. */
+		while (copy > 0) {
+			int offset = start - TCP_SKB_CB(skb)->seq;
+			int size = TCP_SKB_CB(skb)->end_seq - start;
+
+			BUG_ON(offset < 0);
+			if (size > 0) {
+				size = min(copy, size);
+				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+					BUG();
+				TCP_SKB_CB(nskb)->end_seq += size;
+				copy -= size;
+				start += size;
+			}
+			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+				skb = tcp_collapse_one(sk, skb, list);
+				if (!skb ||
+				    skb == tail ||
+				    tcp_hdr(skb)->syn ||
+				    tcp_hdr(skb)->fin)
+					return;
+			}
+		}
+	}
+}
+
+/* Collapse ofo queue. Algorithm: select a contiguous sequence of skbs
+ * and tcp_collapse() them, until the whole queue has been processed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+	struct sk_buff *head;
+	u32 start, end;
+
+	if (skb == NULL)
+		return;
+
+	start = TCP_SKB_CB(skb)->seq;
+	end = TCP_SKB_CB(skb)->end_seq;
+	head = skb;
+
+	for (;;) {
+		struct sk_buff *next = NULL;
+
+		if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
+			next = skb_queue_next(&tp->out_of_order_queue, skb);
+		skb = next;
+
+		/* Segment is terminated when we see a gap or when
+		 * we are at the end of the whole queue. */
+		if (!skb ||
+		    after(TCP_SKB_CB(skb)->seq, end) ||
+		    before(TCP_SKB_CB(skb)->end_seq, start)) {
+			tcp_collapse(sk, &tp->out_of_order_queue,
+				     head, skb, start, end);
+			head = skb;
+			if (!skb)
+				break;
+			/* Start new segment */
+			start = TCP_SKB_CB(skb)->seq;
+			end = TCP_SKB_CB(skb)->end_seq;
+		} else {
+			if (before(TCP_SKB_CB(skb)->seq, start))
+				start = TCP_SKB_CB(skb)->seq;
+			if (after(TCP_SKB_CB(skb)->end_seq, end))
+				end = TCP_SKB_CB(skb)->end_seq;
+		}
+	}
+}
+
+/*
+ * Purge the out-of-order queue.
+ * Returns 1 if the queue was purged, 0 if it was already empty.
+ */
+static int tcp_prune_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int res = 0;
+
+	if (!skb_queue_empty(&tp->out_of_order_queue)) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+		__skb_queue_purge(&tp->out_of_order_queue);
+
+		/* Reset SACK state.  A conforming SACK implementation will
+		 * do the same at a timeout based retransmit.  When a connection
+		 * is in a sad state like this, we care only about integrity
+		 * of the connection not performance.
+		 */
+		if (tp->rx_opt.sack_ok)
+			tcp_sack_reset(&tp->rx_opt);
+		sk_mem_reclaim(sk);
+		res = 1;
+	}
+	return res;
+}
+
+/* Try to bring the socket's receive memory back under sk_rcvbuf.
+ *
+ * Returns 0 once sk_rmem_alloc no longer exceeds sk_rcvbuf.  Returns -1
+ * when even purging the out-of-order queue did not get there; the caller
+ * should then drop incoming frames until the owning process has read
+ * some of the queued data.
+ */
+static int tcp_prune_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
+
+	if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
+		if (tcp_memory_pressure)
+			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
+					       4U * tp->advmss);
+	} else {
+		tcp_clamp_window(sk);
+	}
+
+	/* Non-destructive step first: collapse both queues. */
+	tcp_collapse_ofo_queue(sk);
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		tcp_collapse(sk, &sk->sk_receive_queue,
+			     skb_peek(&sk->sk_receive_queue),
+			     NULL,
+			     tp->copied_seq, tp->rcv_nxt);
+	sk_mem_reclaim(sk);
+
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
+		/* Collapsing was not enough, so destructive measures
+		 * follow.  This is not supposed to ever happen.
+		 */
+		tcp_prune_ofo_queue(sk);
+
+		if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
+			/* We are really being abused: have the caller
+			 * silently drop received data.  It will be
+			 * retransmitted, hopefully when there is room.
+			 */
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
+
+			/* Massive buffer overcommit. */
+			tp->pred_flags = 0;
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* RFC2861 slow part: adjust cwnd after it stayed unfilled for one RTO.
+ * cwnd is left untouched outside TCP_CA_Open (retransmission phases) and
+ * when the application recently hit its sndbuf limit (SOCK_NOSPACE set);
+ * the cwnd timestamp is refreshed in every case.
+ */
+void tcp_cwnd_application_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 init_win, win_used;
+
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+		goto stamp;
+	if (!sk->sk_socket || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		goto stamp;
+
+	/* Limited by the application or by the receiver window. */
+	init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+	win_used = max(tp->snd_cwnd_used, init_win);
+	if (win_used < tp->snd_cwnd) {
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		/* Move cwnd halfway towards what was actually used. */
+		tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+	}
+	tp->snd_cwnd_used = 0;
+
+stamp:
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Decide whether tcp_new_space() may grow sk_sndbuf.
+ *
+ * Returns 1 when expansion is allowed.  Returns 0 when the user locked
+ * the send buffer size, when TCP is under hard or soft global memory
+ * pressure, or when the congestion window is already filled.
+ */
+static int tcp_should_expand_sndbuf(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If the user specified a specific send buffer setting, do
+	 * not modify it.
+	 */
+	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		return 0;
+
+	/* If we are under global TCP memory pressure, do not expand.  */
+	if (tcp_memory_pressure)
+		return 0;
+
+	/* If we are under soft global TCP memory pressure, do not expand.  */
+	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+		return 0;
+
+	/* If we filled the congestion window, do not expand.  The test
+	 * must look at the packets actually in flight: packets_out also
+	 * counts segments that were already SACKed or marked lost, which
+	 * would needlessly block expansion while many packets are SACKed.
+	 */
+	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		return 0;
+
+	return 1;
+}
+
+/* Runs after an incoming ACK has freed skbs from the write queue (the
+ * event is remembered in SOCK_QUEUE_SHRUNK and picked up on exit from
+ * the tcp input handler): grow sk_sndbuf if that is allowed, then let
+ * writers know through sk_write_space().
+ *
+ * PROBLEM: sndbuf expansion does not work well with largesend.
+ */
+static void tcp_new_space(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int sndmem, demanded;
+
+	if (!tcp_should_expand_sndbuf(sk))
+		goto wake;
+
+	/* Footprint of one segment: the larger MSS plus header room
+	 * and skb overhead ...
+	 */
+	sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
+		 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+	/* ... for twice the larger of cwnd and reordering + 1 segments. */
+	demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1);
+	sndmem *= 2 * demanded;
+
+	/* Only ever grow, and never beyond sysctl_tcp_wmem[2]. */
+	if (sndmem > sk->sk_sndbuf)
+		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+
+wake:
+	sk->sk_write_space(sk);
+}
+
+/* If the write queue shrank since the last check (SOCK_QUEUE_SHRUNK),
+ * clear that flag and, when a socket is attached and flagged
+ * SOCK_NOSPACE, run tcp_new_space().
+ */
+static void tcp_check_space(struct sock *sk)
+{
+	if (!sock_flag(sk, SOCK_QUEUE_SHRUNK))
+		return;
+
+	sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+
+	if (!sk->sk_socket)
+		return;
+	if (test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		tcp_new_space(sk);
+}
+
+static inline void tcp_data_snd_check(struct sock *sk)
+{
+	tcp_push_pending_frames(sk);	/* push out pending frames (helper not shown here) */
+	tcp_check_space(sk);		/* then react to a shrunken write queue, see above */
+}
+
+/*
+ * Choose between an immediate ACK and a delayed ACK.
+ * @ofo_possible: when non-zero, a non-empty out-of-order queue also
+ *		  forces an immediate ACK.
+ */
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* More than one full frame was received and the right edge of
+	 * the window advances far enough (tcp_recvmsg() will send the
+	 * ACK otherwise).
+	 */
+	if ((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	    __tcp_select_window(sk) >= tp->rcv_wnd)
+		goto ack_now;
+
+	/* Quick-ack mode: every frame gets its ACK. */
+	if (tcp_in_quickack_mode(sk))
+		goto ack_now;
+
+	/* There is out of order data. */
+	if (ofo_possible && skb_peek(&tp->out_of_order_queue))
+		goto ack_now;
+
+	/* None of the above: a delayed ACK will do. */
+	tcp_send_delayed_ack(sk);
+	return;
+
+ack_now:
+	tcp_send_ack(sk);
+}
+
+/* Run the ACK decision only while an ACK is still scheduled; none is
+ * left when we already sent a data segment.
+ */
+static inline void tcp_ack_snd_check(struct sock *sk)
+{
+	if (inet_csk_ack_scheduled(sk))
+		__tcp_ack_snd_check(sk, 1);
+}
+
+/*
+ *	The 'slow' part of tcp_urg(), called only when urgent data has
+ *	been signaled.  tcp_urg() is the single caller, so this could be
+ *	moved inline.  We handle URGent data wrong.  We have to - as
+ *	BSD still doesn't use the correction from RFC961.
+ *	For 1003.1g we should support a new option TCP_STDURG to permit
+ *	either form (or just set the sysctl tcp_stdurg).
+ */
+
+static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 urg_at = ntohs(th->urg_ptr);
+
+	/* Unless tcp_stdurg is set, a non-zero pointer is taken to
+	 * point one past the urgent byte.
+	 */
+	if (urg_at && !sysctl_tcp_stdurg)
+		urg_at--;
+	urg_at += ntohl(th->seq);
+
+	/* Nothing to do when
+	 *  - the urgent data was already seen and read, or
+	 *  - the pointer lies before rcv_nxt (do not replay urg ptr), or
+	 *  - a newer (or duplicate) urgent pointer is already recorded.
+	 *
+	 * NOTE on the second case: the specs do not cover a misbehaving
+	 * sender whose urg ptr points into a segment we already hold in
+	 * the ofo queue.  We cannot fetch such data and stay in
+	 * TCP_URG_NOTYET until recvmsg() eats it.  We do not seem obliged
+	 * to handle such wicked situations, but the possibility of a DoS
+	 * through some hypothetical application level deadlock is worth
+	 * keeping in mind.
+	 */
+	if (after(tp->copied_seq, urg_at) ||
+	    before(urg_at, tp->rcv_nxt) ||
+	    (tp->urg_data && !after(urg_at, tp->urg_seq)))
+		return;
+
+	/* Tell the world about our new urgent pointer. */
+	sk_send_sigurg(sk);
+
+	/* New urgent data may be announced while the last byte read was
+	 * itself urgent.  copied_seq cannot simply be ignored, or the
+	 * last urgent byte would be read again as data; it cannot be
+	 * altered before this data arrives either, or the semantics of
+	 * SIOCATMARK (and thus sockatmark()) break.
+	 *
+	 * NOTE, in plain English: the author of the paragraph above did
+	 * something like send("A", MSG_OOB); send("B", MSG_OOB); and
+	 * expected both A and B to disappear from the stream.  This is
+	 * _wrong_.  It happens in BSD with high probability, but only
+	 * occasionally, and any application relying on it is buggy.  The
+	 * fix also "works" only in this artificial test: put normal data
+	 * between A and B and we deviate from BSD again.  Verdict: it
+	 * would be better removed, to trap buggy users.
+	 */
+	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
+		struct sk_buff *first = skb_peek(&sk->sk_receive_queue);
+
+		tp->copied_seq++;
+		if (first &&
+		    !before(tp->copied_seq, TCP_SKB_CB(first)->end_seq)) {
+			__skb_unlink(first, &sk->sk_receive_queue);
+			__kfree_skb(first);
+		}
+	}
+
+	tp->urg_data = TCP_URG_NOTYET;
+	tp->urg_seq = urg_at;
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+}
+
+/* The 'fast' part of urgent handling: normally neither a new urgent
+ * pointer nor pending urgent data is found and this returns at once.
+ */
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 off;
+	u8 byte;
+
+	/* A new urgent pointer? */
+	if (th->urg)
+		tcp_check_urg(sk, th);
+
+	/* Are we waiting for urgent data at all? */
+	if (tp->urg_data != TCP_URG_NOTYET)
+		return;
+
+	/* Offset of the urgent byte within the skb; doff * 4 accounts
+	 * for the TCP header.
+	 */
+	off = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
+	      th->syn;
+
+	/* The urgent pointer does not point into this packet. */
+	if (off >= skb->len)
+		return;
+
+	if (skb_copy_bits(skb, off, &byte, 1))
+		BUG();
+	tp->urg_data = TCP_URG_VALID | byte;
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, 0);
+}
+
+/* Copy the part of @skb that follows its first @hlen bytes into the
+ * iovec recorded in tp->ucopy, checksumming on the way unless the
+ * checksum is already known to be unnecessary.  Bottom halves are
+ * enabled for the duration of the copy.  On success ucopy.len and
+ * copied_seq are advanced by the number of bytes copied.
+ *
+ * Returns 0 on success, otherwise the error from the copy helper.
+ */
+static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int payload = skb->len - hlen;
+	int err;
+
+	local_bh_enable();
+
+	err = skb_csum_unnecessary(skb) ?
+		skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, payload) :
+		skb_copy_and_csum_datagram_iovec(skb, hlen, tp->ucopy.iov);
+	if (err == 0) {
+		tp->ucopy.len -= payload;
+		tp->copied_seq += payload;
+		tcp_rcv_space_adjust(sk);
+	}
+
+	local_bh_disable();
+	return err;
+}
+
+/* Complete checksum verification of @skb.  When the socket is owned by
+ * a user context, bottom halves are enabled while the checksum is
+ * computed and disabled again afterwards.
+ *
+ * Returns whatever __tcp_checksum_complete() returns.
+ */
+static __sum16 __tcp_checksum_complete_user(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	__sum16 csum;
+
+	if (!sock_owned_by_user(sk))
+		return __tcp_checksum_complete(skb);
+
+	local_bh_enable();
+	csum = __tcp_checksum_complete(skb);
+	local_bh_disable();
+	return csum;
+}
+
+/* Returns 1 if @skb still needed checksum verification and
+ * __tcp_checksum_complete_user() reported a non-zero result, 0 otherwise.
+ */
+static inline int tcp_checksum_complete_user(struct sock *sk,
+					     struct sk_buff *skb)
+{
+	if (skb_csum_unnecessary(skb))
+		return 0;
+
+	return __tcp_checksum_complete_user(sk, skb) != 0;
+}
+
+#ifdef CONFIG_NET_DMA
+/* Try to start a DMA copy of the part of @skb that follows its first
+ * @hlen bytes into the reader's iovec (tp->ucopy).
+ *
+ * Returns 1 if the copy was started, 0 otherwise.  When no DMA copy is
+ * attempted but the skb carries payload, the reader is woken up instead.
+ */
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
+				  int hlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int chunk = skb->len - hlen;
+	int cookie;
+
+	/* A wakeup has already been requested. */
+	if (tp->ucopy.wakeup)
+		return 0;
+
+	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+		tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+
+	if (!tp->ucopy.dma_chan || !skb_csum_unnecessary(skb)) {
+		/* No channel, or the checksum is still to be verified. */
+		if (chunk > 0) {
+			tp->ucopy.wakeup = 1;
+			sk->sk_data_ready(sk, 0);
+		}
+		return 0;
+	}
+
+	cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+					     skb, hlen,
+					     tp->ucopy.iov, chunk,
+					     tp->ucopy.pinned_list);
+	if (cookie < 0)
+		return 0;
+
+	tp->ucopy.dma_cookie = cookie;
+
+	tp->ucopy.len -= chunk;
+	tp->copied_seq += chunk;
+	tcp_rcv_space_adjust(sk);
+
+	/* Wake the reader once its buffer is full, on PSH, or when more
+	 * than half of the receive buffer is in use.
+	 */
+	if (tp->ucopy.len == 0 ||
+	    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
+	    atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1)) {
+		tp->ucopy.wakeup = 1;
+		sk->sk_data_ready(sk, 0);
+	}
+
+	return 1;
+}
+#endif /* CONFIG_NET_DMA */
+
+/* Does PAWS and seqno based validation of an incoming segment, flags will
+ * play significant role here.
+ *
+ * @syn_inerr: when non-zero, a SYN found in the window is additionally
+ *	       counted in TCP_MIB_INERRS.
+ *
+ * Returns 1 if the segment passed every check and processing should go
+ * on, 0 if the segment was discarded (the skb has been freed here), and
+ * -1 if a SYN in the window made us reset the connection (the skb is NOT
+ * freed on that path).  Both callers in this file return -res for
+ * res <= 0.
+ */
+static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
+			      struct tcphdr *th, int syn_inerr)
+{
+	u8 *hash_location;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* RFC1323: H1. Apply PAWS check first. */
+	if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
+	    tp->rx_opt.saw_tstamp &&
+	    tcp_paws_discard(sk, skb)) {
+		if (!th->rst) {
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+			tcp_send_dupack(sk, skb);
+			goto discard;
+		}
+		/* Reset is accepted even if it did not pass PAWS. */
+	}
+
+	/* Step 1: check sequence number */
+	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+		/* RFC793, page 37: "In all states except SYN-SENT, all reset
+		 * (RST) segments are validated by checking their SEQ-fields."
+		 * And page 69: "If an incoming segment is not acceptable,
+		 * an acknowledgment should be sent in reply (unless the RST
+		 * bit is set, if so drop the segment and return)".
+		 */
+		if (!th->rst)
+			tcp_send_dupack(sk, skb);
+		goto discard;
+	}
+
+	/* Step 2: check RST bit */
+	if (th->rst) {
+		tcp_reset(sk);
+		goto discard;
+	}
+
+	/* ts_recent update must be made after we are sure that the packet
+	 * is in window.
+	 */
+	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+	/* step 3: check security and precedence [ignored] */
+
+	/* step 4: Check for a SYN in window. */
+	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		if (syn_inerr)
+			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
+		tcp_reset(sk);
+		return -1;	/* skb intentionally not freed here */
+	}
+
+	return 1;
+
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ *	TCP receive function for the ESTABLISHED state.
+ *
+ *	It is split into a fast path and a slow path. The fast path is
+ *	disabled when:
+ *	- A zero window was announced from us - zero window probing
+ *	  is only handled properly in the slow path.
+ *	- Out of order segments arrived.
+ *	- Urgent data is expected.
+ *	- There is no buffer space left
+ *	- Unexpected TCP flags/window values/header lengths are received
+ *	  (detected by checking the TCP header against pred_flags)
+ *	- Data is sent in both directions. Fast path only supports pure senders
+ *	  or pure receivers (this means either the sequence number or the ack
+ *	  value must stay constant)
+ *	- Unexpected TCP option.
+ *
+ *	When these conditions are not satisfied it drops into a standard
+ *	receive procedure patterned after RFC793 to handle all cases.
+ *	The first three cases are guaranteed by proper pred_flags setting,
+ *	the rest is checked inline. Fast processing is turned on in
+ *	tcp_data_queue when everything is OK.
+ *
+ *	@len is compared against the TCP header length below, i.e. it is
+ *	the length of the segment including its TCP header.
+ *
+ *	Returns 0, except that a -1 result from tcp_validate_incoming()
+ *	(connection reset because of a SYN in the window) is handed back
+ *	to the caller as 1.
+ */
+int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			struct tcphdr *th, unsigned len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int res;
+
+	/*
+	 *	Header prediction.
+	 *	The code loosely follows the one in the famous
+	 *	"30 instruction TCP receive" Van Jacobson mail.
+	 *
+	 *	Van's trick is to deposit buffers into socket queue
+	 *	on a device interrupt, to call tcp_recv function
+	 *	on the receive process context and checksum and copy
+	 *	the buffer to user space. smart...
+	 *
+	 *	Our current scheme is not silly either but we take the
+	 *	extra cost of the net_bh soft interrupt processing...
+	 *	We do checksum and copy also but from device to kernel.
+	 */
+
+	tp->rx_opt.saw_tstamp = 0;
+
+	/*	pred_flags is 0xS?10 << 16 + snd_wnd
+	 *	if header_prediction is to be made
+	 *	'S' will always be tp->tcp_header_len >> 2
+	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
+	 *	turn it off (when there are holes in the receive
+	 *	space for instance)
+	 *	PSH flag is ignored.
+	 */
+
+	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
+	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+		int tcp_header_len = tp->tcp_header_len;
+
+		/* Timestamp header prediction: tcp_header_len
+		 * is automatically equal to th->doff*4 due to pred_flags
+		 * match.
+		 */
+
+		/* Check timestamp */
+		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+			/* No? Slow path! */
+			if (!tcp_parse_aligned_timestamp(tp, th))
+				goto slow_path;
+
+			/* If PAWS failed, check it more carefully in slow path */
+			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+				goto slow_path;
+
+			/* DO NOT update ts_recent here, if checksum fails
+			 * and timestamp was corrupted part, it will result
+			 * in a hung connection since we will drop all
+			 * future packets due to the PAWS test.
+			 */
+		}
+
+		if (len <= tcp_header_len) {
+			/* Bulk data transfer: sender */
+			if (len == tcp_header_len) {
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				/* We know that such packets are checksummed
+				 * on entry.
+				 */
+				tcp_ack(sk, skb, 0);
+				__kfree_skb(skb);
+				tcp_data_snd_check(sk);
+				return 0;
+			} else { /* Header too small */
+				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+				goto discard;
+			}
+		} else {
+			int eaten = 0;		/* payload already copied to the reader */
+			int copied_early = 0;	/* ... and that copy was started via DMA */
+
+			if (tp->copied_seq == tp->rcv_nxt &&
+			    len - tcp_header_len <= tp->ucopy.len) {
+#ifdef CONFIG_NET_DMA
+				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+					copied_early = 1;
+					eaten = 1;
+				}
+#endif
+				if (tp->ucopy.task == current &&
+				    sock_owned_by_user(sk) && !copied_early) {
+					__set_current_state(TASK_RUNNING);
+
+					if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+						eaten = 1;
+				}
+				if (eaten) {
+					/* Predicted packet is in window by definition.
+					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+					 * Hence, check seq<=rcv_wup reduces to:
+					 */
+					if (tcp_header_len ==
+					    (sizeof(struct tcphdr) +
+					     TCPOLEN_TSTAMP_ALIGNED) &&
+					    tp->rcv_nxt == tp->rcv_wup)
+						tcp_store_ts_recent(tp);
+
+					tcp_rcv_rtt_measure_ts(sk, skb);
+
+					__skb_pull(skb, tcp_header_len);
+					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+					NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+				}
+				if (copied_early)
+					tcp_cleanup_rbuf(sk, skb->len);
+			}
+			if (!eaten) {
+				if (tcp_checksum_complete_user(sk, skb))
+					goto csum_error;
+
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				tcp_rcv_rtt_measure_ts(sk, skb);
+
+				if ((int)skb->truesize > sk->sk_forward_alloc)
+					goto step5;	/* skb exceeds sk_forward_alloc: continue on the slow path */
+
+				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
+
+				/* Bulk data transfer: receiver */
+				__skb_pull(skb, tcp_header_len);
+				__skb_queue_tail(&sk->sk_receive_queue, skb);
+				skb_set_owner_r(skb, sk);
+				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+			}
+
+			tcp_event_data_recv(sk, skb);
+
+			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+				/* Well, only one small jumplet in fast path... */
+				tcp_ack(sk, skb, FLAG_DATA);
+				tcp_data_snd_check(sk);
+				if (!inet_csk_ack_scheduled(sk))
+					goto no_ack;
+			}
+
+			if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
+				__tcp_ack_snd_check(sk, 0);
+no_ack:
+#ifdef CONFIG_NET_DMA
+			if (copied_early)
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+			else
+#endif
+			if (eaten)
+				__kfree_skb(skb);
+			else
+				sk->sk_data_ready(sk, 0);
+			return 0;
+		}
+	}
+
+slow_path:
+	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+		goto csum_error;
+
+	/*
+	 *	Standard slow path.
+	 */
+
+	res = tcp_validate_incoming(sk, skb, th, 1);
+	if (res <= 0)
+		return -res;	/* 0 stays 0 (skb already freed), -1 becomes 1 */
+
+step5:
+	if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
+		goto discard;
+
+	tcp_rcv_rtt_measure_ts(sk, skb);
+
+	/* Process urgent data. */
+	tcp_urg(sk, skb, th);
+
+	/* step 7: process the segment text */
+	tcp_data_queue(sk, skb);
+
+	tcp_data_snd_check(sk);
+	tcp_ack_snd_check(sk);
+	return 0;
+
+csum_error:
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_rcv_established);
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+					 struct tcphdr *th, unsigned len)
+{	/* Process a segment received while the socket is in SYN-SENT.
+	 *
+	 * Returns 0 when the skb has been freed here, -1 when the
+	 * connection became ESTABLISHED and an ACK was sent at once (the
+	 * skb is not freed; tcp_rcv_state_process() finishes the job), and
+	 * 1 on the reset_and_undo exit (skb not freed, parsed options
+	 * undone).
+	 */
+	u8 *hash_location;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_cookie_values *cvp = tp->cookie_values;
+	int saved_clamp = tp->rx_opt.mss_clamp;	/* put back on the *_undo exits */
+
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+
+	if (th->ack) {
+		/* rfc793:
+		 * "If the state is SYN-SENT then
+		 *    first check the ACK bit
+		 *      If the ACK bit is set
+		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+		 *        a reset (unless the RST bit is set, if so drop
+		 *        the segment and return)"
+		 *
+		 *  We do not send data with SYN, so that RFC-correct
+		 *  test reduces to:
+		 */
+		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+			goto reset_and_undo;
+
+		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
+			     tcp_time_stamp)) {
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
+			goto reset_and_undo;
+		}
+
+		/* Now ACK is acceptable.
+		 *
+		 * "If the RST bit is set
+		 *    If the ACK was acceptable then signal the user "error:
+		 *    connection reset", drop the segment, enter CLOSED state,
+		 *    delete TCB, and return."
+		 */
+
+		if (th->rst) {
+			tcp_reset(sk);
+			goto discard;
+		}
+
+		/* rfc793:
+		 *   "fifth, if neither of the SYN or RST bits is set then
+		 *    drop the segment and return."
+		 *
+		 *    See note below!
+		 *                                        --ANK(990513)
+		 */
+		if (!th->syn)
+			goto discard_and_undo;
+
+		/* rfc793:
+		 *   "If the SYN bit is on ...
+		 *    are acceptable then ...
+		 *    (our SYN has been ACKed), change the connection
+		 *    state to ESTABLISHED..."
+		 */
+
+		TCP_ECN_rcv_synack(tp, th);
+
+		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+		tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+		/* Ok.. it's good. Set up sequence numbers and
+		 * move to established.
+		 */
+		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+		/* RFC1323: The window in SYN & SYN/ACK segments is
+		 * never scaled.
+		 */
+		tp->snd_wnd = ntohs(th->window);
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+
+		if (!tp->rx_opt.wscale_ok) {
+			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
+			tp->window_clamp = min(tp->window_clamp, 65535U);
+		}
+
+		if (tp->rx_opt.saw_tstamp) {
+			tp->rx_opt.tstamp_ok	   = 1;
+			tp->tcp_header_len =
+				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
+			tcp_store_ts_recent(tp);
+		} else {
+			tp->tcp_header_len = sizeof(struct tcphdr);
+		}
+
+		if (tcp_is_sack(tp) && sysctl_tcp_fack)
+			tcp_enable_fack(tp);
+
+		tcp_mtup_init(sk);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+		tcp_initialize_rcv_mss(sk);
+
+		/* Remember, tcp_poll() does not lock socket!
+		 * Change state from SYN-SENT only after copied_seq
+		 * is initialized. */
+		tp->copied_seq = tp->rcv_nxt;
+
+		if (cvp != NULL &&
+		    cvp->cookie_pair_size > 0 &&
+		    tp->rx_opt.cookie_plus > 0) {
+			int cookie_size = tp->rx_opt.cookie_plus
+					- TCPOLEN_COOKIE_BASE;
+			int cookie_pair_size = cookie_size
+					     + cvp->cookie_desired;
+
+			/* A cookie extension option was sent and returned.
+			 * Note that each incoming SYNACK replaces the
+			 * Responder cookie.  The initial exchange is most
+			 * fragile, as protection against spoofing relies
+			 * entirely upon the sequence and timestamp (above).
+			 * This replacement strategy allows the correct pair to
+			 * pass through, while any others will be filtered via
+			 * Responder verification later.
+			 *
+			 * The copy below is done only if the resulting pair
+			 * fits into cvp->cookie_pair.
+			 */
+			if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
+				memcpy(&cvp->cookie_pair[cvp->cookie_desired],
+				       hash_location, cookie_size);
+				cvp->cookie_pair_size = cookie_pair_size;
+			}
+		}
+
+		smp_mb();
+		tcp_set_state(sk, TCP_ESTABLISHED);
+
+		security_inet_conn_established(sk, skb);
+
+		/* Make sure socket is routed, for correct metrics.  */
+		icsk->icsk_af_ops->rebuild_header(sk);
+
+		tcp_init_metrics(sk);
+
+		tcp_init_congestion_control(sk);
+
+		/* Prevent spurious tcp_cwnd_restart() on first data
+		 * packet.
+		 */
+		tp->lsndtime = tcp_time_stamp;
+
+		tcp_init_buffer_space(sk);
+
+		if (sock_flag(sk, SOCK_KEEPOPEN))
+			inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+		if (!tp->rx_opt.snd_wscale)
+			__tcp_fast_path_on(tp, tp->snd_wnd);
+		else
+			tp->pred_flags = 0;
+
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+		}
+
+		if (sk->sk_write_pending ||
+		    icsk->icsk_accept_queue.rskq_defer_accept ||
+		    icsk->icsk_ack.pingpong) {
+			/* Save one ACK. Data will be ready after
+			 * several ticks, if write_pending is set.
+			 *
+			 * It may be deleted, but with this feature tcpdumps
+			 * look so _wonderfully_ clever, that I was not able
+			 * to stand against the temptation 8)     --ANK
+			 */
+			inet_csk_schedule_ack(sk);
+			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+			icsk->icsk_ack.ato	 = TCP_ATO_MIN;
+			tcp_incr_quickack(sk);
+			tcp_enter_quickack_mode(sk);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX, TCP_RTO_MAX);
+
+discard:		/* shared exit: also reached by goto from outside this branch */
+			__kfree_skb(skb);
+			return 0;
+		} else {
+			tcp_send_ack(sk);
+		}
+		return -1;	/* skb not freed: the caller does step 6 onward by hand */
+	}
+
+	/* No ACK in the segment */
+
+	if (th->rst) {
+		/* rfc793:
+		 * "If the RST bit is set
+		 *
+		 *      Otherwise (no ACK) drop the segment and return."
+		 */
+
+		goto discard_and_undo;
+	}
+
+	/* PAWS check. */
+	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
+	    tcp_paws_reject(&tp->rx_opt, 0))
+		goto discard_and_undo;
+
+	if (th->syn) {
+		/* We see a SYN without ACK. It is an attempt at a
+		 * simultaneous connect with crossed SYNs.
+		 * In particular, it can be a connect to self.
+		 */
+		tcp_set_state(sk, TCP_SYN_RECV);
+
+		if (tp->rx_opt.saw_tstamp) {
+			tp->rx_opt.tstamp_ok = 1;
+			tcp_store_ts_recent(tp);
+			tp->tcp_header_len =
+				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+		} else {
+			tp->tcp_header_len = sizeof(struct tcphdr);
+		}
+
+		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+		/* RFC1323: The window in SYN & SYN/ACK segments is
+		 * never scaled.
+		 */
+		tp->snd_wnd    = ntohs(th->window);
+		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
+		tp->max_window = tp->snd_wnd;
+
+		TCP_ECN_rcv_syn(tp, th);
+
+		tcp_mtup_init(sk);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+		tcp_initialize_rcv_mss(sk);
+
+		tcp_send_synack(sk);
+#if 0
+		/* Note, we could accept data and URG from this segment.
+		 * There are no obstacles to make this.
+		 *
+		 * However, if we ignore data in ACKless segments sometimes,
+		 * we have no reasons to accept it sometimes.
+		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
+		 * is not flawless. So, discard packet for sanity.
+		 * Uncomment this return to process the data.
+		 */
+		return -1;
+#else
+		goto discard;
+#endif
+	}
+	/* "fifth, if neither of the SYN or RST bits is set then
+	 * drop the segment and return."
+	 */
+
+discard_and_undo:
+	tcp_clear_options(&tp->rx_opt);
+	tp->rx_opt.mss_clamp = saved_clamp;
+	goto discard;
+
+reset_and_undo:
+	tcp_clear_options(&tp->rx_opt);
+	tp->rx_opt.mss_clamp = saved_clamp;
+	return 1;	/* skb is not freed on this exit */
+}
+
+/*
+ *	This function implements the receiving procedure of RFC 793 for
+ *	all states except ESTABLISHED and TIME_WAIT.
+ *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ *	address independent.
+ *
+ *	Returns 0 when the segment has been dealt with here (queued or
+ *	freed).  A return of 1 leaves the skb to the caller; it happens
+ *	for an ACK received in LISTEN, a failed conn_request(), the
+ *	reset_and_undo exit of tcp_rcv_synsent_state_process(), a SYN in
+ *	the window (tcp_validate_incoming() returned -1), an unacceptable
+ *	ACK in SYN-RECV, and data arriving after the receive side has been
+ *	closed.  (Callers live outside this section and are expected to
+ *	answer a non-zero return with a reset - confirm there.)
+ */
+
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+			  struct tcphdr *th, unsigned len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int queued = 0;
+	int res;
+
+	tp->rx_opt.saw_tstamp = 0;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		goto discard;
+
+	case TCP_LISTEN:
+		if (th->ack)
+			return 1;
+
+		if (th->rst)
+			goto discard;
+
+		if (th->syn) {
+			/* A SYN that also carries FIN is bogus.  Drop it
+			 * before conn_request() spends any resources on
+			 * it; SYN+FIN floods rely on us doing that work.
+			 */
+			if (th->fin)
+				goto discard;
+
+			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
+				return 1;
+
+			/* Now we have several options: In theory there is
+			 * nothing else in the frame. KA9Q has an option to
+			 * send data with the syn, BSD accepts data with the
+			 * syn up to the [to be] advertised window and
+			 * Solaris 2.1 gives you a protocol error. For now
+			 * we just ignore it, that fits the spec precisely
+			 * and avoids incompatibilities. It would be nice in
+			 * future to drop through and process the data.
+			 *
+			 * Now that TTCP is starting to be used we ought to
+			 * queue this data.
+			 * But, this leaves one open to an easy denial of
+			 * service attack, and SYN cookies can't defend
+			 * against this problem. So, we drop the data
+			 * in the interest of security over speed unless
+			 * it's still in use.
+			 */
+			kfree_skb(skb);
+			return 0;
+		}
+		goto discard;
+
+	case TCP_SYN_SENT:
+		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+		if (queued >= 0)
+			return queued;
+
+		/* Do step6 onward by hand. */
+		tcp_urg(sk, skb, th);
+		__kfree_skb(skb);
+		tcp_data_snd_check(sk);
+		return 0;
+	}
+
+	/* 0: segment discarded and freed; -1: reset, reported as 1. */
+	res = tcp_validate_incoming(sk, skb, th, 0);
+	if (res <= 0)
+		return -res;
+
+	/* step 5: check the ACK field */
+	if (th->ack) {
+		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
+
+		switch (sk->sk_state) {
+		case TCP_SYN_RECV:
+			if (acceptable) {
+				tp->copied_seq = tp->rcv_nxt;
+				smp_mb();
+				tcp_set_state(sk, TCP_ESTABLISHED);
+				sk->sk_state_change(sk);
+
+				/* Note, that this wakeup is only for marginal
+				 * crossed SYN case. Passively open sockets
+				 * are not waked up, because sk->sk_sleep ==
+				 * NULL and sk->sk_socket == NULL.
+				 */
+				if (sk->sk_socket)
+					sk_wake_async(sk,
+						      SOCK_WAKE_IO, POLL_OUT);
+
+				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+				tp->snd_wnd = ntohs(th->window) <<
+					      tp->rx_opt.snd_wscale;
+				tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+
+				/* tcp_ack considers this ACK as duplicate
+				 * and does not calculate rtt.
+				 * Force it here.
+				 */
+				tcp_ack_update_rtt(sk, 0, 0);
+
+				if (tp->rx_opt.tstamp_ok)
+					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+				/* Make sure socket is routed, for
+				 * correct metrics.
+				 */
+				icsk->icsk_af_ops->rebuild_header(sk);
+
+				tcp_init_metrics(sk);
+
+				tcp_init_congestion_control(sk);
+
+				/* Prevent spurious tcp_cwnd_restart() on
+				 * first data packet.
+				 */
+				tp->lsndtime = tcp_time_stamp;
+
+				tcp_mtup_init(sk);
+				tcp_initialize_rcv_mss(sk);
+				tcp_init_buffer_space(sk);
+				tcp_fast_path_on(tp);
+			} else {
+				return 1;
+			}
+			break;
+
+		case TCP_FIN_WAIT1:
+			if (tp->snd_una == tp->write_seq) {
+				tcp_set_state(sk, TCP_FIN_WAIT2);
+				sk->sk_shutdown |= SEND_SHUTDOWN;
+				dst_confirm(__sk_dst_get(sk));
+
+				if (!sock_flag(sk, SOCK_DEAD))
+					/* Wake up lingering close() */
+					sk->sk_state_change(sk);
+				else {
+					int tmo;
+
+					if (tp->linger2 < 0 ||
+					    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+					     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+						tcp_done(sk);
+						NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+						return 1;
+					}
+
+					tmo = tcp_fin_time(sk);
+					if (tmo > TCP_TIMEWAIT_LEN) {
+						inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+					} else if (th->fin || sock_owned_by_user(sk)) {
+						/* Bad case. We could lose such FIN otherwise.
+						 * It is not a big problem, but it looks confusing
+						 * and not so rare event. We still can lose it now,
+						 * if it spins in bh_lock_sock(), but it is really
+						 * marginal case.
+						 */
+						inet_csk_reset_keepalive_timer(sk, tmo);
+					} else {
+						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+						goto discard;
+					}
+				}
+			}
+			break;
+
+		case TCP_CLOSING:
+			if (tp->snd_una == tp->write_seq) {
+				tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+				goto discard;
+			}
+			break;
+
+		case TCP_LAST_ACK:
+			if (tp->snd_una == tp->write_seq) {
+				tcp_update_metrics(sk);
+				tcp_done(sk);
+				goto discard;
+			}
+			break;
+		}
+	} else
+		goto discard;
+
+	/* step 6: check the URG bit */
+	tcp_urg(sk, skb, th);
+
+	/* step 7: process the segment text */
+	switch (sk->sk_state) {
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+			break;
+		/* Fall through */
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+		/* RFC 793 says to queue data in these states,
+		 * RFC 1122 says we MUST send a reset.
+		 * BSD 4.4 also does reset.
+		 */
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+				tcp_reset(sk);
+				return 1;
+			}
+		}
+		/* Fall through */
+	case TCP_ESTABLISHED:
+		tcp_data_queue(sk, skb);
+		queued = 1;
+		break;
+	}
+
+	/* tcp_data could move socket to TIME-WAIT */
+	if (sk->sk_state != TCP_CLOSE) {
+		tcp_data_snd_check(sk);
+		tcp_ack_snd_check(sk);
+	}
+
+	/* The skb is freed here unless tcp_data_queue() took it. */
+	if (!queued) {
+discard:
+		__kfree_skb(skb);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 00000000..53a5af66
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2657 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ *		IPv4 specific functions
+ *
+ *
+ *		code split from:
+ *		linux/ipv4/tcp.c
+ *		linux/ipv4/tcp_input.c
+ *		linux/ipv4/tcp_output.c
+ *
+ *		See tcp.c for author information
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ *		David S. Miller	:	New socket lookup architecture.
+ *					This code is dedicated to John Dyson.
+ *		David S. Miller :	Change semantics of established hash,
+ *					half is devoted to TIME_WAIT sockets
+ *					and the rest go in the other half.
+ *		Andi Kleen :		Add support for syncookies and fixed
+ *					some bugs: ip options weren't passed to
+ *					the TCP layer, missed a check for an
+ *					ACK bit.
+ *		Andi Kleen :		Implemented fast path mtu discovery.
+ *	     				Fixed many serious bugs in the
+ *					request_sock handling and moved
+ *					most of it into the af independent code.
+ *					Added tail drop and some other bugfixes.
+ *					Added new listen semantics.
+ *		Mike McLagan	:	Routing by source
+ *	Juan Jose Ciarlante:		ip_dynaddr bits
+ *		Andi Kleen:		various fixes.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year
+ *					coma.
+ *	Andi Kleen		:	Fix new listen.
+ *	Andi Kleen		:	Fix accept error reporting.
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ */
+
+
+#include <linux/bottom_half.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/timewait_sock.h>
+#include <net/xfrm.h>
+#include <net/netdma.h>
+#include <net/secure_seq.h>
+
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/stddef.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+int sysctl_tcp_tw_reuse __read_mostly;
+int sysctl_tcp_low_latency __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
+
+
+#ifdef CONFIG_TCP_MD5SIG
+static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
+						   __be32 addr);
+static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+			       __be32 daddr, __be32 saddr, struct tcphdr *th);
+#else
+static inline
+struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
+{
+	return NULL;
+}
+#endif
+
+struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
+
+static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
+{
+	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
+					  ip_hdr(skb)->saddr,
+					  tcp_hdr(skb)->dest,
+					  tcp_hdr(skb)->source);
+}
+
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Reusing the TIME-WAIT bucket is safe for data integrity when PAWS
+	   is in use. Even without PAWS it is safe provided the sequence
+	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
+
+	   The idea is close to VJ's, except that the timestamp cache is
+	   kept per port pair rather than per host, with the TW bucket
+	   acting as the state holder. If the TW bucket has already been
+	   destroyed we fall back to VJ's scheme and use the initial
+	   timestamp retrieved from the peer table.
+	   write_seq is kept non-zero; tcp_v4_connect() picks a new one if 0.
+	 */
+	if (tcptw->tw_ts_recent_stamp &&
+	    (twp == NULL || (sysctl_tcp_tw_reuse &&
+			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+		if (tp->write_seq == 0)
+			tp->write_seq = 1;
+		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		sock_hold(sktw);
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+
+/* Initiate an outgoing connection to uaddr; returns 0 or an error code. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	__be16 orig_sport, orig_dport;
+	__be32 daddr, nexthop;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	int err;
+	struct ip_options_rcu *inet_opt;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	nexthop = daddr = usin->sin_addr.s_addr;
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt && inet_opt->opt.srr) {
+		if (!daddr)
+			return -EINVAL;
+		nexthop = inet_opt->opt.faddr;
+	}
+
+	orig_sport = inet->inet_sport;
+	orig_dport = usin->sin_port;
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+			      IPPROTO_TCP,
+			      orig_sport, orig_dport, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		if (err == -ENETUNREACH)
+			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+		return err;
+	}
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (!inet_opt || !inet_opt->opt.srr)
+		daddr = fl4->daddr;
+
+	if (!inet->inet_saddr)
+		inet->inet_saddr = fl4->saddr;
+	inet->inet_rcv_saddr = inet->inet_saddr;
+
+	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
+		/* Reset inherited state */
+		tp->rx_opt.ts_recent	   = 0;
+		tp->rx_opt.ts_recent_stamp = 0;
+		tp->write_seq		   = 0;
+	}
+
+	if (tcp_death_row.sysctl_tw_recycle &&
+	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
+		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
+		/*
+		 * VJ's idea. We save the last timestamp seen from
+		 * the destination in the peer table when entering
+		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
+		 * when trying a new connection.
+		 */
+		if (peer) {
+			inet_peer_refcheck(peer);
+			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
+				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+				tp->rx_opt.ts_recent = peer->tcp_ts;
+			}
+		}
+	}
+
+	inet->inet_dport = usin->sin_port;
+	inet->inet_daddr = daddr;
+
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
+	if (inet_opt)
+		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+
+	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
+
+	/* Socket identity is still unknown (sport may be zero).
+	 * However, we set the state to SYN-SENT and, without releasing the
+	 * socket lock, select a source port, enter ourselves into the hash
+	 * tables and complete initialization after this.
+	 */
+	tcp_set_state(sk, TCP_SYN_SENT);
+	err = inet_hash_connect(&tcp_death_row, sk);
+	if (err)
+		goto failure;
+
+	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+			       inet->inet_sport, inet->inet_dport, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
+		goto failure;
+	}
+	/* OK, now commit destination to socket.  */
+	sk->sk_gso_type = SKB_GSO_TCPV4;
+	sk_setup_caps(sk, &rt->dst);
+
+	if (!tp->write_seq)
+		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
+							   inet->inet_daddr,
+							   inet->inet_sport,
+							   usin->sin_port);
+
+	inet->inet_id = tp->write_seq ^ jiffies;
+
+	err = tcp_connect(sk);
+	rt = NULL;
+	if (err)
+		goto failure;
+
+	return 0;
+
+failure:
+	/*
+	 * This unhashes the socket and releases the local port,
+	 * if necessary.
+	 */
+	tcp_set_state(sk, TCP_CLOSE);
+	ip_rt_put(rt);
+	sk->sk_route_caps = 0;
+	inet->inet_dport = 0;
+	return err;
+}
+EXPORT_SYMBOL(tcp_v4_connect);
+
+/*
+ * This routine does path MTU discovery as defined in RFC1191.
+ */
+static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
+{
+	struct dst_entry *dst;
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
+	 * sent out by Linux are always <576bytes so they should go through
+	 * unfragmented).
+	 */
+	if (sk->sk_state == TCP_LISTEN)
+		return;
+
+	/* We don't check in the destentry if pmtu discovery is forbidden
+	 * on this route. We just assume that no packet_too_big packets
+	 * are sent back when pmtu discovery is not active.
+	 * There is a small race when the user changes this flag in the
+	 * route, but I think that's acceptable.
+	 */
+	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+		return;
+
+	dst->ops->update_pmtu(dst, mtu);
+
+	/* Something is about to go wrong... Remember the soft error
+	 * in case this connection is not able to recover.
+	 */
+	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+		sk->sk_err_soft = EMSGSIZE;
+
+	mtu = dst_mtu(dst);
+
+	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
+		tcp_sync_mss(sk, mtu);
+
+		/* Resend the TCP packet because it's
+		 * clear that the old packet has been
+		 * dropped. This is the new "fast" path mtu
+		 * discovery.
+		 */
+		tcp_simple_retransmit(sk);
+	} /* else let the usual retransmit timer handle it */
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  If err < 0 then the socket should
+ * be closed and the error returned to the user.  If err > 0
+ * it's just the icmp type << 8 | icmp code.  After adjustment
+ * header points to the first 8 bytes of the tcp header.  We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+
+void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
+	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
+	struct inet_connection_sock *icsk;
+	struct tcp_sock *tp;
+	struct inet_sock *inet;
+	const int type = icmp_hdr(icmp_skb)->type;
+	const int code = icmp_hdr(icmp_skb)->code;
+	struct sock *sk;
+	struct sk_buff *skb;
+	__u32 seq;
+	__u32 remaining;
+	int err;
+	struct net *net = dev_net(icmp_skb->dev);
+
+	if (icmp_skb->len < (iph->ihl << 2) + 8) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		return;
+	}
+
+	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
+			iph->saddr, th->source, inet_iif(icmp_skb));
+	if (!sk) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		return;
+	}
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(sk));
+		return;
+	}
+
+	bh_lock_sock(sk);
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 */
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+		goto out;
+	}
+
+	icsk = inet_csk(sk);
+	tp = tcp_sk(sk);
+	seq = ntohl(th->seq);
+	if (sk->sk_state != TCP_LISTEN &&
+	    !between(seq, tp->snd_una, tp->snd_nxt)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	switch (type) {
+	case ICMP_SOURCE_QUENCH:
+		/* Just silently ignore these. */
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out;
+
+		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			if (!sock_owned_by_user(sk))
+				do_pmtu_discovery(sk, iph, info);
+			goto out;
+		}
+
+		err = icmp_err_convert[code].errno;
+		/* check if icmp_skb allows revert of backoff
+		 * (see draft-zimmermann-tcp-lcd) */
+		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
+			break;
+		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
+		    !icsk->icsk_backoff)
+			break;
+
+		if (sock_owned_by_user(sk))
+			break;
+
+		icsk->icsk_backoff--;
+		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
+					 icsk->icsk_backoff;
+		tcp_bound_rto(sk);
+
+		skb = tcp_write_queue_head(sk);
+		BUG_ON(!skb);
+
+		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
+				tcp_time_stamp - TCP_SKB_CB(skb)->when);
+
+		if (remaining) {
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  remaining, TCP_RTO_MAX);
+		} else {
+			/* RTO revert clocked out retransmission.
+			 * Will retransmit now */
+			tcp_retransmit_timer(sk);
+		}
+
+		break;
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (sk->sk_state) {
+		struct request_sock *req, **prev;
+	case TCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+
+		req = inet_csk_search_req(sk, &prev, th->dest,
+					  iph->daddr, iph->saddr);
+		if (!req)
+			goto out;
+
+		/* ICMPs are not backlogged, hence we cannot get
+		   an established socket here.
+		 */
+		WARN_ON(req->sk);
+
+		if (seq != tcp_rsk(req)->snt_isn) {
+			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+
+		/*
+		 * Still in SYN_RECV, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:  /* Should not normally happen, but
+			       it can, e.g. if SYNs crossed.
+			     */
+		if (!sock_owned_by_user(sk)) {
+			sk->sk_err = err;
+
+			sk->sk_error_report(sk);
+
+			tcp_done(sk);
+		} else {
+			sk->sk_err_soft = err;
+		}
+		goto out;
+	}
+
+	/* If we've already connected we will keep trying
+	 * until we time out, or the user gives up.
+	 *
+	 * rfc1122 4.2.3.9 allows to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by pmtu discovery).
+	 *
+	 * Note, that in modern internet, where routing is unreliable
+	 * and in each dark corner broken firewalls sit, sending random
+	 * errors ordered by their masters even these two messages finally lose
+	 * their original sense (even Linux sends invalid PORT_UNREACHs)
+	 *
+	 * Now we are in compliance with RFCs.
+	 *							--ANK (980905)
+	 */
+
+	inet = inet_sk(sk);
+	if (!sock_owned_by_user(sk) && inet->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else	{ /* Only an error on timeout */
+		sk->sk_err_soft = err;
+	}
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+static void __tcp_v4_send_check(struct sk_buff *skb,
+				__be32 saddr, __be32 daddr)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct tcphdr, check);
+	} else {
+		th->check = tcp_v4_check(skb->len, saddr, daddr,
+					 csum_partial(th,
+						      th->doff << 2,
+						      skb->csum));
+	}
+}
+
+/* Fill in the TCP checksum of skb from the socket's IPv4 address pair. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
+}
+EXPORT_SYMBOL(tcp_v4_send_check);
+
+int tcp_v4_gso_send_check(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct tcphdr *th;
+
+	if (!pskb_may_pull(skb, sizeof(*th)))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+	th = tcp_hdr(skb);
+
+	th->check = 0;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
+	return 0;
+}
+
+/*
+ *	This routine will send an RST to the other tcp.
+ *
+ *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
+ *		      for reset.
+ *	Answer: if a packet caused RST, it is not for a socket
+ *		existing in our system, if it is matched to a socket,
+ *		it is just duplicate segment or bug in other side's TCP.
+ *		So that we build reply only basing on parameters
+ *		arrived with segment.
+ *	Exception: precedence violation. We do not implement it in any case.
+ */
+
+static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	struct {
+		struct tcphdr th;
+#ifdef CONFIG_TCP_MD5SIG
+		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
+#endif
+	} rep;
+	struct ip_reply_arg arg;
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_md5sig_key *key;
+#endif
+	struct net *net;
+
+	/* Never send a reset in response to a reset. */
+	if (th->rst)
+		return;
+
+	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
+		return;
+
+	/* Swap the send and the receive. */
+	memset(&rep, 0, sizeof(rep));
+	rep.th.dest   = th->source;
+	rep.th.source = th->dest;
+	rep.th.doff   = sizeof(struct tcphdr) / 4;
+	rep.th.rst    = 1;
+
+	if (th->ack) {
+		rep.th.seq = th->ack_seq;
+	} else {
+		rep.th.ack = 1;
+		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
+				       skb->len - (th->doff << 2));
+	}
+
+	memset(&arg, 0, sizeof(arg));
+	arg.iov[0].iov_base = (unsigned char *)&rep;
+	arg.iov[0].iov_len  = sizeof(rep.th);
+
+#ifdef CONFIG_TCP_MD5SIG
+	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
+	if (key) {
+		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
+				   (TCPOPT_NOP << 16) |
+				   (TCPOPT_MD5SIG << 8) |
+				   TCPOLEN_MD5SIG);
+		/* Update length and the length the header thinks exists */
+		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+		rep.th.doff = arg.iov[0].iov_len / 4;
+
+		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+				     key, ip_hdr(skb)->saddr,
+				     ip_hdr(skb)->daddr, &rep.th);
+	}
+#endif
+	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+				      ip_hdr(skb)->saddr, /* XXX */
+				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
+	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+	/* When socket is gone, all binding information is lost.
+	 * routing might fail in this case. using iif for oif to
+	 * make sure we can deliver it
+	 */
+	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
+
+	net = dev_net(skb_dst(skb)->dev);
+	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+		      &arg, arg.iov[0].iov_len);
+
+	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+}
+
+/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
+   outside socket context is ugly, certainly. What can I do?
+ */
+
+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+			    u32 win, u32 ts, int oif,
+			    struct tcp_md5sig_key *key,
+			    int reply_flags)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	struct {
+		struct tcphdr th;
+		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
+#ifdef CONFIG_TCP_MD5SIG
+			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+#endif
+			];
+	} rep;
+	struct ip_reply_arg arg;
+	struct net *net = dev_net(skb_dst(skb)->dev);
+
+	memset(&rep.th, 0, sizeof(struct tcphdr));
+	memset(&arg, 0, sizeof(arg));
+
+	arg.iov[0].iov_base = (unsigned char *)&rep;
+	arg.iov[0].iov_len  = sizeof(rep.th);
+	if (ts) {
+		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				   (TCPOPT_TIMESTAMP << 8) |
+				   TCPOLEN_TIMESTAMP);
+		rep.opt[1] = htonl(tcp_time_stamp);
+		rep.opt[2] = htonl(ts);
+		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
+	}
+
+	/* Swap the send and the receive. */
+	rep.th.dest    = th->source;
+	rep.th.source  = th->dest;
+	rep.th.doff    = arg.iov[0].iov_len / 4;
+	rep.th.seq     = htonl(seq);
+	rep.th.ack_seq = htonl(ack);
+	rep.th.ack     = 1;
+	rep.th.window  = htons(win);
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (key) {
+		int offset = (ts) ? 3 : 0;
+
+		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
+					  (TCPOPT_NOP << 16) |
+					  (TCPOPT_MD5SIG << 8) |
+					  TCPOLEN_MD5SIG);
+		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
+		rep.th.doff = arg.iov[0].iov_len/4;
+
+		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
+				    key, ip_hdr(skb)->saddr,
+				    ip_hdr(skb)->daddr, &rep.th);
+	}
+#endif
+	arg.flags = reply_flags;
+	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
+				      ip_hdr(skb)->saddr, /* XXX */
+				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
+	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+	if (oif)
+		arg.bound_dev_if = oif;
+
+	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
+		      &arg, arg.iov[0].iov_len);
+
+	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+}
+
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+
+	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+			tcptw->tw_ts_recent,
+			tw->tw_bound_dev_if,
+			tcp_twsk_md5_key(tcptw),
+			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
+			);
+
+	inet_twsk_put(tw);
+}
+
+static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req)
+{
+	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
+			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
+			req->ts_recent,
+			0,
+			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
+			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
+}
+
+/*
+ *	Send a SYN-ACK after having received a SYN.
+ *	This still operates on a request_sock only, not on a big
+ *	socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+			      struct request_sock *req,
+			      struct request_values *rvp)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct flowi4 fl4;
+	int err = -1;
+	struct sk_buff * skb;
+
+	/* First, grab a route. */
+	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
+		return -1;
+
+	skb = tcp_make_synack(sk, dst, req, rvp);
+
+	if (skb) {
+		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
+
+		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+					    ireq->rmt_addr,
+					    ireq->opt);
+		err = net_xmit_eval(err);
+	}
+
+	dst_release(dst);
+	return err;
+}
+
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
+			      struct request_values *rvp)
+{
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+	return tcp_v4_send_synack(sk, NULL, req, rvp);
+}
+
+/*
+ *	IPv4 request_sock destructor.
+ */
+static void tcp_v4_reqsk_destructor(struct request_sock *req)
+{
+	kfree(inet_rsk(req)->opt);
+}
+
+static void syn_flood_warning(const struct sk_buff *skb)
+{
+	const char *msg;
+
+#ifdef CONFIG_SYN_COOKIES
+	if (sysctl_tcp_syncookies)
+		msg = "Sending cookies";
+	else
+#endif
+		msg = "Dropping request";
+
+	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
+				ntohs(tcp_hdr(skb)->dest), msg);
+}
+
+/*
+ * Build the IPv4 options for skb, via ip_options_echo(), in a newly
+ * allocated ip_options_rcu.  Returns NULL when skb carries no options,
+ * when the allocation fails, or when ip_options_echo() reports an error.
+ */
+static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
+						  struct sk_buff *skb)
+{
+	const struct ip_options *opt = &(IPCB(skb)->opt);
+	struct ip_options_rcu *dopt;
+
+	if (!opt || !opt->optlen)
+		return NULL;
+
+	dopt = kmalloc(sizeof(*dopt) + opt->optlen, GFP_ATOMIC);
+	if (dopt && ip_options_echo(&dopt->opt, skb)) {
+		kfree(dopt);
+		dopt = NULL;
+	}
+
+	return dopt;
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * RFC2385 MD5 checksumming requires a mapping of
+ * IP address->MD5 Key.
+ * We need to maintain these in the sk structure.
+ */
+
+/* Return the MD5 key configured on sk for IPv4 address addr, or NULL. */
+static struct tcp_md5sig_key *
+			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
+{
+	struct tcp_md5sig_info *md5sig = tcp_sk(sk)->md5sig_info;
+	int idx;
+
+	if (md5sig == NULL || md5sig->entries4 == 0)
+		return NULL;
+	for (idx = 0; idx < md5sig->entries4; idx++) {
+		if (md5sig->keys4[idx].addr == addr)
+			return &md5sig->keys4[idx].base;
+	}
+	return NULL;
+}
+
+struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
+					 struct sock *addr_sk)
+{
+	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
+}
+EXPORT_SYMBOL(tcp_v4_md5_lookup);
+
+static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
+						      struct request_sock *req)
+{
+	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
+}
+
+/* Set the MD5 key for addr; newkey is stored on success, freed on error. */
+int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
+		      u8 *newkey, u8 newkeylen)
+{
+	/* May be called from other files, on a newly created socket. */
+	struct tcp_md5sig_key *key;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp4_md5sig_key *keys;
+
+	key = tcp_v4_md5_do_lookup(sk, addr);
+	if (key) {
+		/* Pre-existing entry - just update that one. */
+		kfree(key->key);
+		key->key = newkey;
+		key->keylen = newkeylen;
+	} else {
+		struct tcp_md5sig_info *md5sig;
+
+		if (!tp->md5sig_info) {
+			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
+						  GFP_ATOMIC);
+			if (!tp->md5sig_info) {
+				kfree(newkey);
+				return -ENOMEM;
+			}
+			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+		}
+
+		md5sig = tp->md5sig_info;
+		if (md5sig->entries4 == 0 &&
+		    tcp_alloc_md5sig_pool(sk) == NULL) {
+			kfree(newkey);
+			return -ENOMEM;
+		}
+
+		if (md5sig->alloced4 == md5sig->entries4) {
+			keys = kmalloc((sizeof(*keys) *
+					(md5sig->entries4 + 1)), GFP_ATOMIC);
+			if (!keys) {
+				kfree(newkey);
+				if (md5sig->entries4 == 0)
+					tcp_free_md5sig_pool();
+				return -ENOMEM;
+			}
+
+			if (md5sig->entries4)
+				memcpy(keys, md5sig->keys4,
+				       sizeof(*keys) * md5sig->entries4);
+
+			/* Free the old key array and switch to the new one */
+			kfree(md5sig->keys4);
+			md5sig->keys4 = keys;
+			md5sig->alloced4++;
+		}
+		md5sig->entries4++;
+		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
+		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
+		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_v4_md5_do_add);
+
+static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
+			       u8 *newkey, u8 newkeylen)
+{
+	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
+				 newkey, newkeylen);
+}
+
+int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int i;
+
+	for (i = 0; i < tp->md5sig_info->entries4; i++) {
+		if (tp->md5sig_info->keys4[i].addr == addr) {
+			/* Free the key material of the matching entry */
+			kfree(tp->md5sig_info->keys4[i].base.key);
+			tp->md5sig_info->entries4--;
+
+			if (tp->md5sig_info->entries4 == 0) {
+				kfree(tp->md5sig_info->keys4);
+				tp->md5sig_info->keys4 = NULL;
+				tp->md5sig_info->alloced4 = 0;
+				tcp_free_md5sig_pool();
+			} else if (tp->md5sig_info->entries4 != i) {
+				/* Shift later entries down over the hole */
+				memmove(&tp->md5sig_info->keys4[i],
+					&tp->md5sig_info->keys4[i+1],
+					(tp->md5sig_info->entries4 - i) *
+					 sizeof(struct tcp4_md5sig_key));
+			}
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(tcp_v4_md5_do_del);
+
+static void tcp_v4_clear_md5_list(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Free each key and release the MD5 pool through
+	 * tcp_free_md5sig_pool(), then free the key array
+	 * itself and reset its bookkeeping.
+	 */
+	if (tp->md5sig_info->entries4) {
+		int i;
+		for (i = 0; i < tp->md5sig_info->entries4; i++)
+			kfree(tp->md5sig_info->keys4[i].base.key);
+		tp->md5sig_info->entries4 = 0;
+		tcp_free_md5sig_pool();
+	}
+	if (tp->md5sig_info->keys4) {
+		kfree(tp->md5sig_info->keys4);
+		tp->md5sig_info->keys4 = NULL;
+		tp->md5sig_info->alloced4  = 0;
+	}
+}
+
+/* Parse a user-supplied struct tcp_md5sig and add/replace or delete a key. */
+static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
+				 int optlen)
+{
+	struct tcp_md5sig cmd;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+	u8 *newkey;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	if (copy_from_user(&cmd, optval, sizeof(cmd)))
+		return -EFAULT;
+
+	if (sin->sin_family != AF_INET)
+		return -EINVAL;
+	/* No key or a zero key length: delete the key for this address. */
+	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
+		if (!tcp_sk(sk)->md5sig_info)
+			return -ENOENT;
+		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
+	}
+
+	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+		return -EINVAL;
+	/* No MD5 state on this socket yet: allocate it before adding a key. */
+	if (!tcp_sk(sk)->md5sig_info) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct tcp_md5sig_info *p;
+
+		p = kzalloc(sizeof(*p), sk->sk_allocation);
+		if (!p)
+			return -ENOMEM;	/* out of memory, not bad input */
+		tp->md5sig_info = p;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+	}
+	/* tcp_v4_md5_do_add() takes ownership of newkey (stores or frees it). */
+	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
+	if (!newkey)
+		return -ENOMEM;
+	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
+				 newkey, cmd.tcpm_keylen);
+}
+
+static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
+					__be32 daddr, __be32 saddr, int nbytes)
+{
+	struct tcp4_pseudohdr *bp;
+	struct scatterlist sg;
+
+	bp = &hp->md5_blk.ip4;
+
+	/*
+	 * 1. the TCP pseudo-header (in the order: source IP address,
+	 * destination IP address, zero-padded protocol number, and
+	 * segment length)
+	 */
+	bp->saddr = saddr;
+	bp->daddr = daddr;
+	bp->pad = 0;
+	bp->protocol = IPPROTO_TCP;
+	bp->len = cpu_to_be16(nbytes);
+
+	sg_init_one(&sg, bp, sizeof(*bp));
+	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+}
+
+static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+			       __be32 daddr, __be32 saddr, struct tcphdr *th)
+{
+	struct tcp_md5sig_pool *hp;
+	struct hash_desc *desc;
+
+	hp = tcp_get_md5sig_pool();
+	if (!hp)
+		goto clear_hash_noput;
+	desc = &hp->md5_desc;
+
+	if (crypto_hash_init(desc))
+		goto clear_hash;
+	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
+		goto clear_hash;
+	if (tcp_md5_hash_header(hp, th))
+		goto clear_hash;
+	if (tcp_md5_hash_key(hp, key))
+		goto clear_hash;
+	if (crypto_hash_final(desc, md5_hash))
+		goto clear_hash;
+
+	tcp_put_md5sig_pool();
+	return 0;
+
+clear_hash:
+	tcp_put_md5sig_pool();
+clear_hash_noput:
+	memset(md5_hash, 0, 16);
+	return 1;
+}
+
+int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
+			struct sock *sk, struct request_sock *req,
+			struct sk_buff *skb)
+{
+	struct tcp_md5sig_pool *hp;
+	struct hash_desc *desc;
+	struct tcphdr *th = tcp_hdr(skb);
+	__be32 saddr, daddr;
+
+	if (sk) {
+		saddr = inet_sk(sk)->inet_saddr;
+		daddr = inet_sk(sk)->inet_daddr;
+	} else if (req) {
+		saddr = inet_rsk(req)->loc_addr;
+		daddr = inet_rsk(req)->rmt_addr;
+	} else {
+		const struct iphdr *iph = ip_hdr(skb);
+		saddr = iph->saddr;
+		daddr = iph->daddr;
+	}
+
+	hp = tcp_get_md5sig_pool();
+	if (!hp)
+		goto clear_hash_noput;
+	desc = &hp->md5_desc;
+
+	if (crypto_hash_init(desc))
+		goto clear_hash;
+
+	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
+		goto clear_hash;
+	if (tcp_md5_hash_header(hp, th))
+		goto clear_hash;
+	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+		goto clear_hash;
+	if (tcp_md5_hash_key(hp, key))
+		goto clear_hash;
+	if (crypto_hash_final(desc, md5_hash))
+		goto clear_hash;
+
+	tcp_put_md5sig_pool();
+	return 0;
+
+clear_hash:
+	tcp_put_md5sig_pool();
+clear_hash_noput:
+	memset(md5_hash, 0, 16);
+	return 1;
+}
+EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+
+static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
+{
+	/*
+	 * This gets called for each TCP segment that arrives
+	 * so we want to be efficient.
+	 * We have 3 drop cases (each returns 1; 0 means accept):
+	 * o No MD5 hash and one expected.
+	 * o MD5 hash and we're not expecting one.
+	 * o MD5 hash and it is wrong.
+	 */
+	__u8 *hash_location = NULL;
+	struct tcp_md5sig_key *hash_expected;
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr *th = tcp_hdr(skb);
+	int genhash;
+	unsigned char newhash[16];
+
+	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
+	hash_location = tcp_parse_md5sig_option(th);
+
+	/* Neither a key configured nor a hash in the segment: accept. */
+	if (!hash_expected && !hash_location)
+		return 0;
+
+	if (hash_expected && !hash_location) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+		return 1;
+	}
+
+	if (!hash_expected && hash_location) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+		return 1;
+	}
+
+	/* Okay, so this is hash_expected and hash_location -
+	 * so we need to calculate the hash and compare it.
+	 */
+	genhash = tcp_v4_md5_hash_skb(newhash,
+				      hash_expected,
+				      NULL, NULL, skb);
+
+	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+		if (net_ratelimit()) {
+			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+			       &iph->saddr, ntohs(th->source),
+			       &iph->daddr, ntohs(th->dest),
+			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
+		}
+		return 1;
+	}
+	return 0;
+}
+
+#endif
+
+/*
+ * Request-sock operations for IPv4 TCP; handed to inet_reqsk_alloc() by
+ * tcp_v4_conn_request() when a new connection request is created.
+ */
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
+	.family		=	PF_INET,
+	.obj_size	=	sizeof(struct tcp_request_sock),
+	.rtx_syn_ack	=	tcp_v4_rtx_synack,
+	.send_ack	=	tcp_v4_reqsk_send_ack,
+	.destructor	=	tcp_v4_reqsk_destructor,
+	.send_reset	=	tcp_v4_send_reset,
+	.syn_ack_timeout = 	tcp_syn_ack_timeout,
+};
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * IPv4 MD5 helpers for request socks; installed as
+ * tcp_rsk(req)->af_specific in tcp_v4_conn_request().
+ */
+static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
+	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
+};
+#endif
+
+/*
+ * Handle a connection request carried by @skb for listening socket @sk.
+ *
+ * Allocates a request sock, parses the TCP options of the segment, chooses
+ * the initial sequence number (syncookie, value inherited through
+ * TCP_SKB_CB(skb)->when, or tcp_v4_init_sequence()) and sends a SYN-ACK.
+ * Unless a syncookie was used, the request is then hashed into the
+ * listener's request queue.
+ *
+ * Always returns 0: every failure path just drops the request.
+ */
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_extend_values tmp_ext;
+	struct tcp_options_received tmp_opt;
+	u8 *hash_location;
+	struct request_sock *req;
+	struct inet_request_sock *ireq;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = NULL;
+	__be32 saddr = ip_hdr(skb)->saddr;
+	__be32 daddr = ip_hdr(skb)->daddr;
+	__u32 isn = TCP_SKB_CB(skb)->when;
+#ifdef CONFIG_SYN_COOKIES
+	int want_cookie = 0;
+#else
+#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
+#endif
+
+	/* Never answer SYNs sent to broadcast or multicast addresses */
+	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto drop;
+
+	/* TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+		if (net_ratelimit())
+			syn_flood_warning(skb);
+#ifdef CONFIG_SYN_COOKIES
+		if (sysctl_tcp_syncookies) {
+			want_cookie = 1;
+		} else
+#endif
+		/* Note: with CONFIG_SYN_COOKIES this goto is the else branch. */
+		goto drop;
+	}
+
+	/* Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+		goto drop;
+
+	req = inet_reqsk_alloc(&tcp_request_sock_ops);
+	if (!req)
+		goto drop;
+
+#ifdef CONFIG_TCP_MD5SIG
+	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
+#endif
+
+	tcp_clear_options(&tmp_opt);
+	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+	tmp_opt.user_mss  = tp->rx_opt.user_mss;
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+	if (tmp_opt.cookie_plus > 0 &&
+	    tmp_opt.saw_tstamp &&
+	    !tp->rx_opt.cookie_out_never &&
+	    (sysctl_tcp_cookie_size > 0 ||
+	     (tp->cookie_values != NULL &&
+	      tp->cookie_values->cookie_desired > 0))) {
+		u8 *c;
+		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
+		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
+
+		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
+			goto drop_and_release;
+
+		/* Secret recipe starts with IP addresses */
+		*mess++ ^= (__force u32)daddr;
+		*mess++ ^= (__force u32)saddr;
+
+		/* plus variable length Initiator Cookie */
+		c = (u8 *)mess;
+		while (l-- > 0)
+			*c++ ^= *hash_location++;
+
+#ifdef CONFIG_SYN_COOKIES
+		want_cookie = 0;	/* not our kind of cookie */
+#endif
+		tmp_ext.cookie_out_never = 0; /* false */
+		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
+	} else if (!tp->rx_opt.cookie_in_always) {
+		/* redundant indications, but ensure initialization. */
+		tmp_ext.cookie_out_never = 1; /* true */
+		tmp_ext.cookie_plus = 0;
+	} else {
+		/* cookie_in_always is set but no usable cookie arrived. */
+		goto drop_and_release;
+	}
+	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
+
+	if (want_cookie && !tmp_opt.saw_tstamp)
+		tcp_clear_options(&tmp_opt);
+
+	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+	tcp_openreq_init(req, &tmp_opt, skb);
+
+	ireq = inet_rsk(req);
+	ireq->loc_addr = daddr;
+	ireq->rmt_addr = saddr;
+	ireq->no_srccheck = inet_sk(sk)->transparent;
+	ireq->opt = tcp_v4_save_options(sk, skb);
+
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
+	if (!want_cookie || tmp_opt.tstamp_ok)
+		TCP_ECN_create_request(req, tcp_hdr(skb));
+
+	if (want_cookie) {
+		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
+	} else if (!isn) {
+		struct inet_peer *peer = NULL;
+		struct flowi4 fl4;
+
+		/* VJ's idea. We save last timestamp seen
+		 * from the destination in peer table, when entering
+		 * state TIME-WAIT, and check against it before
+		 * accepting new connection request.
+		 *
+		 * If "isn" is not zero, this request hit alive
+		 * timewait bucket, so that all the necessary checks
+		 * are made in the function processing timewait state.
+		 */
+		if (tmp_opt.saw_tstamp &&
+		    tcp_death_row.sysctl_tw_recycle &&
+		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
+		    fl4.daddr == saddr &&
+		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
+			inet_peer_refcheck(peer);
+			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
+			    (s32)(peer->tcp_ts - req->ts_recent) >
+							TCP_PAWS_WINDOW) {
+				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+				goto drop_and_release;
+			}
+		}
+		/* Kill the following clause, if you dislike this way. */
+		else if (!sysctl_tcp_syncookies &&
+			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (sysctl_max_syn_backlog >> 2)) &&
+			 (!peer || !peer->tcp_ts_stamp) &&
+			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			/* Without syncookies last quarter of
+			 * backlog is filled with destinations,
+			 * proven to be alive.
+			 * It means that we continue to communicate
+			 * to destinations, already remembered
+			 * to the moment of synflood.
+			 */
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
+				       &saddr, ntohs(tcp_hdr(skb)->source));
+			goto drop_and_release;
+		}
+
+		isn = tcp_v4_init_sequence(skb);
+	}
+	tcp_rsk(req)->snt_isn = isn;
+
+	/* With a syncookie the request is freed right after the SYN-ACK
+	 * has been sent instead of being queued.
+	 */
+	if (tcp_v4_send_synack(sk, dst, req,
+			       (struct request_values *)&tmp_ext) ||
+	    want_cookie)
+		goto drop_and_free;
+
+	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+	return 0;
+
+	/* Cleanup labels fall through: release route, free request, return. */
+drop_and_release:
+	dst_release(dst);
+drop_and_free:
+	reqsk_free(req);
+drop:
+	return 0;
+}
+EXPORT_SYMBOL(tcp_v4_conn_request);
+
+
+/*
+ * The three way handshake has completed - we got a valid synack -
+ * now create the new socket.
+ *
+ * @sk:  the listening socket
+ * @skb: the segment that completed the handshake
+ * @req: the pending connection request being promoted
+ * @dst: route to use for the child, or NULL to look one up here
+ *
+ * Returns the new child socket, or NULL on failure.  Every failure path
+ * bumps LINUX_MIB_LISTENDROPS; a full accept queue additionally bumps
+ * LINUX_MIB_LISTENOVERFLOWS.
+ */
+struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req,
+				  struct dst_entry *dst)
+{
+	struct inet_request_sock *ireq;
+	struct inet_sock *newinet;
+	struct tcp_sock *newtp;
+	struct sock *newsk;
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_md5sig_key *key;
+#endif
+	struct ip_options_rcu *inet_opt;
+
+	if (sk_acceptq_is_full(sk))
+		goto exit_overflow;
+
+	newsk = tcp_create_openreq_child(sk, req, skb);
+	if (!newsk)
+		goto exit_nonewsk;
+
+	newsk->sk_gso_type = SKB_GSO_TCPV4;
+
+	newtp		      = tcp_sk(newsk);
+	newinet		      = inet_sk(newsk);
+	ireq		      = inet_rsk(req);
+	newinet->inet_daddr   = ireq->rmt_addr;
+	newinet->inet_rcv_saddr = ireq->loc_addr;
+	newinet->inet_saddr	      = ireq->loc_addr;
+	/* Ownership of the saved IP options moves from the request to newsk. */
+	inet_opt	      = ireq->opt;
+	rcu_assign_pointer(newinet->inet_opt, inet_opt);
+	ireq->opt	      = NULL;
+	newinet->mc_index     = inet_iif(skb);
+	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
+	if (inet_opt)
+		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+	newinet->inet_id = newtp->write_seq ^ jiffies;
+
+	if (!dst) {
+		dst = inet_csk_route_child_sock(sk, newsk, req);
+		if (!dst)
+			goto put_and_exit;
+	} else {
+		/* syncookie case : see end of cookie_v4_check() */
+	}
+	sk_setup_caps(newsk, dst);
+
+	tcp_mtup_init(newsk);
+	tcp_sync_mss(newsk, dst_mtu(dst));
+	newtp->advmss = dst_metric_advmss(dst);
+	/* A user-configured MSS on the listener caps the advertised MSS. */
+	if (tcp_sk(sk)->rx_opt.user_mss &&
+	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
+		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+
+	tcp_initialize_rcv_mss(newsk);
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Copy over the MD5 key from the original socket */
+	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
+	if (key != NULL) {
+		/*
+		 * We're using one, so create a matching key
+		 * on the newsk structure. If we fail to get
+		 * memory, then we end up not copying the key
+		 * across. Shucks.
+		 */
+		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
+		if (newkey != NULL)
+			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
+					  newkey, key->keylen);
+		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+	}
+#endif
+
+	if (__inet_inherit_port(sk, newsk) < 0)
+		goto put_and_exit;
+	__inet_hash_nolisten(newsk, NULL);
+
+	return newsk;
+
+exit_overflow:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	dst_release(dst);
+exit:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	return NULL;
+put_and_exit:
+	/*
+	 * The child comes back from tcp_create_openreq_child() with its
+	 * bh lock held (the clone helper locks the new socket), so the lock
+	 * must be released before the reference is dropped; otherwise the
+	 * socket is freed while still locked.
+	 */
+	bh_unlock_sock(newsk);
+	sock_put(newsk);
+	goto exit;
+}
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+/*
+ * Find the socket that should process segment @skb, which arrived for
+ * listening socket @sk.
+ *
+ * Returns:
+ *  - the result of tcp_check_req() if a pending request matches;
+ *  - a matching established socket, returned with bh_lock_sock() held;
+ *  - NULL if the match is a TIME_WAIT socket (its reference is dropped);
+ *  - otherwise @sk itself or, with CONFIG_SYN_COOKIES and a non-SYN
+ *    segment, whatever cookie_v4_check() returns.
+ */
+static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	const struct iphdr *iph = ip_hdr(skb);
+	struct sock *nsk;
+	struct request_sock **prev;
+	/* Find possible connection requests. */
+	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+						       iph->saddr, iph->daddr);
+	if (req)
+		return tcp_check_req(sk, skb, req, prev);
+
+	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
+			th->source, iph->daddr, th->dest, inet_iif(skb));
+
+	if (nsk) {
+		if (nsk->sk_state != TCP_TIME_WAIT) {
+			/* Hand the socket to the caller locked. */
+			bh_lock_sock(nsk);
+			return nsk;
+		}
+		inet_twsk_put(inet_twsk(nsk));
+		return NULL;
+	}
+
+#ifdef CONFIG_SYN_COOKIES
+	if (!th->syn)
+		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+#endif
+	return sk;
+}
+
+/*
+ * Prepare checksum state for a received segment.
+ *
+ * A CHECKSUM_COMPLETE skb whose sum verifies is marked
+ * CHECKSUM_UNNECESSARY and 0 is returned.  Otherwise skb->csum is seeded
+ * with the pseudo-header sum; segments of at most 76 bytes are verified on
+ * the spot, longer ones return 0 with verification left for later.
+ */
+static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
+{
+	const struct iphdr *ip4 = ip_hdr(skb);
+	const __be32 src = ip4->saddr;
+	const __be32 dst = ip4->daddr;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !tcp_v4_check(skb->len, src, dst, skb->csum)) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		return 0;
+	}
+
+	skb->csum = csum_tcpudp_nofold(src, dst, skb->len, IPPROTO_TCP, 0);
+
+	if (skb->len > 76)
+		return 0;
+
+	return __skb_checksum_complete(skb);
+}
+
+
+/* The socket must have its spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ *
+ * Dispatches @skb according to the state of @sk: established fast path,
+ * listener (request/child handling via tcp_v4_hnd_req()), or the generic
+ * state machine.  Always returns 0; on the reset, discard and checksum
+ * error paths the skb is freed here.
+ */
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct sock *rsk;
+#ifdef CONFIG_TCP_MD5SIG
+	/*
+	 * We really want to reject the packet as early as possible
+	 * if:
+	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
+	 *  o There is an MD5 option and we're not expecting one
+	 */
+	if (tcp_v4_inbound_md5_hash(sk, skb))
+		goto discard;
+#endif
+
+	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		sock_rps_save_rxhash(sk, skb->rxhash);
+		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
+			rsk = sk;
+			goto reset;
+		}
+		return 0;
+	}
+
+	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+		goto csum_err;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+		if (!nsk)
+			goto discard;
+
+		/* A different socket came back: let it process the segment. */
+		if (nsk != sk) {
+			sock_rps_save_rxhash(nsk, skb->rxhash);
+			if (tcp_child_process(sk, nsk, skb)) {
+				rsk = nsk;
+				goto reset;
+			}
+			return 0;
+		}
+	} else
+		sock_rps_save_rxhash(sk, skb->rxhash);
+
+	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
+		rsk = sk;
+		goto reset;
+	}
+	return 0;
+
+reset:
+	tcp_v4_send_reset(rsk, skb);
+discard:
+	kfree_skb(skb);
+	/* Be careful here. If this function gets more complicated and
+	 * gcc suffers from register pressure on the x86, sk (in %ebx)
+	 * might be destroyed here. This current version compiles correctly,
+	 * but you have been warned.
+	 */
+	return 0;
+
+csum_err:
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+	goto discard;
+}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+
+/*
+ *	From tcp_input.c
+ *
+ *	Receive one TCP segment.  Validates the header length and checksum,
+ *	fills in TCP_SKB_CB(skb), looks up the owning socket and hands the
+ *	segment to tcp_v4_do_rcv(), the prequeue or the socket backlog.
+ *	Segments with no socket, and TIME_WAIT sockets, are handled at the
+ *	labels at the end.
+ *
+ *	Returns the result of tcp_v4_do_rcv() when it is called directly,
+ *	0 otherwise.
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct tcphdr *th;
+	struct sock *sk;
+	int ret;
+	struct net *net = dev_net(skb->dev);
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto discard_it;
+
+	/* Count it even if it's bad */
+	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+
+	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+		goto discard_it;
+
+	th = tcp_hdr(skb);
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto bad_packet;
+	if (!pskb_may_pull(skb, th->doff * 4))
+		goto discard_it;
+
+	/* An explanation is required here, I think.
+	 * Packet length and doff are validated by header prediction,
+	 * provided case of th->doff==0 is eliminated.
+	 * So, we defer the checks. */
+	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
+		goto bad_packet;
+
+	/* Header pointers are fetched again after the pskb_may_pull() calls. */
+	th = tcp_hdr(skb);
+	iph = ip_hdr(skb);
+	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+				    skb->len - th->doff * 4);
+	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+	TCP_SKB_CB(skb)->when	 = 0;
+	TCP_SKB_CB(skb)->flags	 = iph->tos;
+	TCP_SKB_CB(skb)->sacked	 = 0;
+
+	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+	if (!sk)
+		goto no_tcp_socket;
+
+process:
+	if (sk->sk_state == TCP_TIME_WAIT)
+		goto do_time_wait;
+
+	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+		goto discard_and_relse;
+	}
+
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_and_relse;
+	nf_reset(skb);
+
+	if (sk_filter(sk, skb))
+		goto discard_and_relse;
+
+	skb->dev = NULL;
+
+	bh_lock_sock_nested(sk);
+	ret = 0;
+	/* Process now unless a user context owns the socket; in that case
+	 * the segment goes to the backlog.
+	 */
+	if (!sock_owned_by_user(sk)) {
+#ifdef CONFIG_NET_DMA
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+		if (tp->ucopy.dma_chan)
+			ret = tcp_v4_do_rcv(sk, skb);
+		else
+#endif
+		{
+			/* tcp_prequeue() returning 0 means: process it here. */
+			if (!tcp_prequeue(sk, skb))
+				ret = tcp_v4_do_rcv(sk, skb);
+		}
+	} else if (unlikely(sk_add_backlog(sk, skb))) {
+		bh_unlock_sock(sk);
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+		goto discard_and_relse;
+	}
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+
+	return ret;
+
+no_tcp_socket:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+
+	/* bad_packet is entered from above: it only counts the error,
+	 * while a well-formed segment with no socket gets a reset.
+	 */
+	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+bad_packet:
+		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+	} else {
+		tcp_v4_send_reset(NULL, skb);
+	}
+
+discard_it:
+	/* Discard frame. */
+	kfree_skb(skb);
+	return 0;
+
+discard_and_relse:
+	sock_put(sk);
+	goto discard_it;
+
+do_time_wait:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		inet_twsk_put(inet_twsk(sk));
+		goto discard_it;
+	}
+
+	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+		inet_twsk_put(inet_twsk(sk));
+		goto discard_it;
+	}
+	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+	case TCP_TW_SYN: {
+		/* If a listener exists for this address and port, retire the
+		 * timewait socket and restart processing with the listener.
+		 */
+		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
+							&tcp_hashinfo,
+							iph->daddr, th->dest,
+							inet_iif(skb));
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+			goto process;
+		}
+		/* Fall through to ACK */
+	}
+	case TCP_TW_ACK:
+		tcp_v4_timewait_ack(sk, skb);
+		break;
+	case TCP_TW_RST:
+		goto no_tcp_socket;
+	case TCP_TW_SUCCESS:;
+	}
+	goto discard_it;
+}
+
+/*
+ * Return the inet_peer entry for the destination address of @sk.
+ *
+ * When the socket has a cached route whose corked flow destination still
+ * equals inet_daddr, the peer bound to that route is used (binding one
+ * first if the route has none) and *@release_it is set to false.
+ * Otherwise the peer is fetched with inet_getpeer_v4() and *@release_it
+ * is set to true.
+ */
+struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	struct rtable *route = (struct rtable *) __sk_dst_get(sk);
+	struct inet_peer *found;
+
+	if (route && isk->cork.fl.u.ip4.daddr == isk->inet_daddr) {
+		if (!route->peer)
+			rt_bind_peer(route, isk->inet_daddr, 1);
+		found = route->peer;
+		*release_it = false;
+		return found;
+	}
+
+	found = inet_getpeer_v4(isk->inet_daddr, 1);
+	*release_it = true;
+	return found;
+}
+EXPORT_SYMBOL(tcp_v4_get_peer);
+
+/* Fetch the inet_peer entry for the remote address of timewait socket @sk. */
+void *tcp_v4_tw_get_peer(struct sock *sk)
+{
+	return inet_getpeer_v4(inet_twsk(sk)->tw_daddr, 1);
+}
+EXPORT_SYMBOL(tcp_v4_tw_get_peer);
+
+/* Timewait-socket operations for TCP over IPv4. */
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+	.twsk_destructor= tcp_twsk_destructor,
+	.twsk_getpeer	= tcp_v4_tw_get_peer,
+};
+
+/*
+ * Address-family specific connection-socket operations for IPv4;
+ * installed as icsk->icsk_af_ops by tcp_v4_init_sock().
+ */
+const struct inet_connection_sock_af_ops ipv4_specific = {
+	.queue_xmit	   = ip_queue_xmit,
+	.send_check	   = tcp_v4_send_check,
+	.rebuild_header	   = inet_sk_rebuild_header,
+	.conn_request	   = tcp_v4_conn_request,
+	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
+	.get_peer	   = tcp_v4_get_peer,
+	.net_header_len	   = sizeof(struct iphdr),
+	.setsockopt	   = ip_setsockopt,
+	.getsockopt	   = ip_getsockopt,
+	.addr2sockaddr	   = inet_csk_addr2sockaddr,
+	.sockaddr_len	   = sizeof(struct sockaddr_in),
+	.bind_conflict	   = inet_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ip_setsockopt,
+	.compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+EXPORT_SYMBOL(ipv4_specific);
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * IPv4 MD5 signature operations for full sockets; installed as
+ * tp->af_specific by tcp_v4_init_sock().
+ */
+static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+	.md5_lookup		= tcp_v4_md5_lookup,
+	.calc_md5_hash		= tcp_v4_md5_hash_skb,
+	.md5_add		= tcp_v4_md5_add_func,
+	.md5_parse		= tcp_v4_parse_md5_keys,
+};
+#endif
+
+/* Initialise the TCP-specific state of a newly allocated IPv4 socket:
+ * queues, timers, initial RTO and congestion values, the address-family
+ * operation tables and the default buffer sizes.  Always returns 0.
+ *
+ * NOTE: A lot of things set to zero explicitly by call to
+ *       sk_alloc() so need not be done here.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	skb_queue_head_init(&tp->out_of_order_queue);
+	tcp_init_xmit_timers(sk);
+	tcp_prequeue_init(tp);
+
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
+	tp->mdev = TCP_TIMEOUT_INIT;
+
+	/* So many TCP implementations out there (incorrectly) count the
+	 * initial SYN frame in their delayed-ACK and congestion control
+	 * algorithms that we must have the following bandaid to talk
+	 * efficiently to them.  -DaveM
+	 */
+	tp->snd_cwnd = 2;
+
+	/* See draft-stevens-tcpca-spec-01 for discussion of the
+	 * initialization of these values.
+	 */
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	tp->snd_cwnd_clamp = ~0;
+	tp->mss_cache = TCP_MSS_DEFAULT;
+
+	tp->reordering = sysctl_tcp_reordering;
+	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+
+	sk->sk_state = TCP_CLOSE;
+
+	sk->sk_write_space = sk_stream_write_space;
+	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+	icsk->icsk_af_ops = &ipv4_specific;
+	icsk->icsk_sync_mss = tcp_sync_mss;
+#ifdef CONFIG_TCP_MD5SIG
+	tp->af_specific = &tcp_sock_ipv4_specific;
+#endif
+
+	/* TCP Cookie Transactions */
+	if (sysctl_tcp_cookie_size > 0) {
+		/* Default, cookies without s_data_payload.
+		 * An allocation failure is tolerated: cookie_values simply
+		 * stays NULL.
+		 */
+		tp->cookie_values =
+			kzalloc(sizeof(*tp->cookie_values),
+				sk->sk_allocation);
+		if (tp->cookie_values != NULL)
+			kref_init(&tp->cookie_values->kref);
+	}
+	/* Presumed zeroed, in order of appearance:
+	 *	cookie_in_always, cookie_out_never,
+	 *	s_data_constant, s_data_in, s_data_out
+	 */
+	sk->sk_sndbuf = sysctl_tcp_wmem[1];
+	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+	/* Balanced by percpu_counter_dec() in tcp_v4_destroy_sock(). */
+	local_bh_disable();
+	percpu_counter_inc(&tcp_sockets_allocated);
+	local_bh_enable();
+
+	return 0;
+}
+
+/*
+ * Release the TCP-specific resources held by @sk: timers, congestion
+ * control state, queued skbs, MD5 keys, the bind bucket, the cached
+ * sendmsg page and the cookie values.  Also decrements
+ * tcp_sockets_allocated.
+ */
+void tcp_v4_destroy_sock(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_clear_xmit_timers(sk);
+
+	tcp_cleanup_congestion_control(sk);
+
+	/* Clean up the write buffer. */
+	tcp_write_queue_purge(sk);
+
+	/* Cleans up our, hopefully empty, out_of_order_queue. */
+	__skb_queue_purge(&tp->out_of_order_queue);
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Clean up the MD5 key list, if any */
+	if (tp->md5sig_info) {
+		tcp_v4_clear_md5_list(sk);
+		kfree(tp->md5sig_info);
+		tp->md5sig_info = NULL;
+	}
+#endif
+
+#ifdef CONFIG_NET_DMA
+	/* Cleans up our sk_async_wait_queue */
+	__skb_queue_purge(&sk->sk_async_wait_queue);
+#endif
+
+	/* Clean prequeue, it must be empty really */
+	__skb_queue_purge(&tp->ucopy.prequeue);
+
+	/* Clean up a referenced TCP bind bucket. */
+	if (inet_csk(sk)->icsk_bind_hash)
+		inet_put_port(sk);
+
+	/*
+	 * If sendmsg cached page exists, toss it.
+	 */
+	if (sk->sk_sndmsg_page) {
+		__free_page(sk->sk_sndmsg_page);
+		sk->sk_sndmsg_page = NULL;
+	}
+
+	/* TCP Cookie Transactions */
+	if (tp->cookie_values != NULL) {
+		kref_put(&tp->cookie_values->kref,
+			 tcp_cookie_values_release);
+		tp->cookie_values = NULL;
+	}
+
+	percpu_counter_dec(&tcp_sockets_allocated);
+}
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCP sock list dumping. */
+
+/* First timewait socket on @head, or NULL if the list is empty. */
+static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
+{
+	if (hlist_nulls_empty(head))
+		return NULL;
+
+	return list_entry(head->first, struct inet_timewait_sock, tw_node);
+}
+
+/* Timewait socket following @tw on its list, or NULL at the end marker. */
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
+{
+	if (is_a_nulls(tw->tw_node.next))
+		return NULL;
+
+	return hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node);
+}
+
+/*
+ * Get next listener socket following cur.  If cur is NULL, get first socket
+ * starting from bucket given in st->bucket; when st->bucket is zero the
+ * very first socket in the hash table is returned.
+ *
+ * The returned pointer is a struct sock, or a struct request_sock when
+ * st->state is TCP_SEQ_STATE_OPENREQ on return.  A non-NULL return leaves
+ * the lock of bucket st->bucket held; when a request is returned the
+ * syn_wait_lock of st->syn_wait_sk is read-held as well.  Both are dropped
+ * here before moving on, or later by tcp_seq_stop().  NULL is returned
+ * with no lock held once every bucket has been walked.
+ */
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+	struct inet_connection_sock *icsk;
+	struct hlist_nulls_node *node;
+	struct sock *sk = cur;
+	struct inet_listen_hashbucket *ilb;
+	struct tcp_iter_state *st = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	if (!sk) {
+		ilb = &tcp_hashinfo.listening_hash[st->bucket];
+		spin_lock_bh(&ilb->lock);
+		sk = sk_nulls_head(&ilb->head);
+		st->offset = 0;
+		goto get_sk;
+	}
+	ilb = &tcp_hashinfo.listening_hash[st->bucket];
+	++st->num;
+	++st->offset;
+
+	if (st->state == TCP_SEQ_STATE_OPENREQ) {
+		/* cur is a request sock: continue in the listener's SYN table. */
+		struct request_sock *req = cur;
+
+		icsk = inet_csk(st->syn_wait_sk);
+		req = req->dl_next;
+		while (1) {
+			while (req) {
+				if (req->rsk_ops->family == st->family) {
+					cur = req;
+					goto out;
+				}
+				req = req->dl_next;
+			}
+			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
+				break;
+get_req:
+			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
+		}
+		/* SYN table exhausted: resume with the next listening socket. */
+		sk	  = sk_nulls_next(st->syn_wait_sk);
+		st->state = TCP_SEQ_STATE_LISTENING;
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+	} else {
+		icsk = inet_csk(sk);
+		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&icsk->icsk_accept_queue))
+			goto start_req;
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		sk = sk_nulls_next(sk);
+	}
+get_sk:
+	sk_nulls_for_each_from(sk, node) {
+		if (!net_eq(sock_net(sk), net))
+			continue;
+		if (sk->sk_family == st->family) {
+			cur = sk;
+			goto out;
+		}
+		icsk = inet_csk(sk);
+		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
+start_req:
+			/* Switch to walking this listener's pending requests;
+			 * syn_wait_lock stays read-held while doing so.
+			 */
+			st->uid		= sock_i_uid(sk);
+			st->syn_wait_sk = sk;
+			st->state	= TCP_SEQ_STATE_OPENREQ;
+			st->sbucket	= 0;
+			goto get_req;
+		}
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+	}
+	/* Bucket finished: release it and move on to the next one, if any. */
+	spin_unlock_bh(&ilb->lock);
+	st->offset = 0;
+	if (++st->bucket < INET_LHTABLE_SIZE) {
+		ilb = &tcp_hashinfo.listening_hash[st->bucket];
+		spin_lock_bh(&ilb->lock);
+		sk = sk_nulls_head(&ilb->head);
+		goto get_sk;
+	}
+	cur = NULL;
+out:
+	return cur;
+}
+
+/*
+ * Walk the listening table from its start and return the entry *@pos
+ * steps in.  *@pos is decremented for every step taken, so on a NULL
+ * return it holds the number of steps still outstanding.
+ */
+static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	struct tcp_iter_state *iter = seq->private;
+	void *entry;
+
+	iter->bucket = 0;
+	iter->offset = 0;
+
+	for (entry = listening_get_next(seq, NULL); entry && *pos; --*pos)
+		entry = listening_get_next(seq, entry);
+
+	return entry;
+}
+
+/*
+ * Non-zero when both the established chain and the timewait chain of
+ * ehash bucket st->bucket are empty.
+ */
+static inline int empty_bucket(struct tcp_iter_state *st)
+{
+	if (!hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain))
+		return 0;
+
+	return !!hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+}
+
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ *
+ * Each bucket's established chain is searched before its timewait chain;
+ * st->state is TCP_SEQ_STATE_TIME_WAIT when the hit is a timewait socket.
+ * A non-NULL return leaves the lock of bucket st->bucket held; NULL is
+ * returned with no lock held.
+ */
+static void *established_get_first(struct seq_file *seq)
+{
+	struct tcp_iter_state *st = seq->private;
+	struct net *net = seq_file_net(seq);
+	void *rc = NULL;
+
+	st->offset = 0;
+	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+		struct sock *sk;
+		struct hlist_nulls_node *node;
+		struct inet_timewait_sock *tw;
+		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+
+		/* Lockless fast path for the common case of empty buckets */
+		if (empty_bucket(st))
+			continue;
+
+		spin_lock_bh(lock);
+		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+			if (sk->sk_family != st->family ||
+			    !net_eq(sock_net(sk), net)) {
+				continue;
+			}
+			rc = sk;
+			goto out;
+		}
+		st->state = TCP_SEQ_STATE_TIME_WAIT;
+		inet_twsk_for_each(tw, node,
+				   &tcp_hashinfo.ehash[st->bucket].twchain) {
+			if (tw->tw_family != st->family ||
+			    !net_eq(twsk_net(tw), net)) {
+				continue;
+			}
+			rc = tw;
+			goto out;
+		}
+		/* Nothing matched in this bucket: unlock and reset the state. */
+		spin_unlock_bh(lock);
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+	}
+out:
+	return rc;
+}
+
+/*
+ * Return the entry after @cur in the established/timewait hash walk.
+ *
+ * @cur is a struct inet_timewait_sock when st->state is
+ * TCP_SEQ_STATE_TIME_WAIT and a struct sock otherwise; the same rule
+ * applies to the returned pointer.  The lock of bucket st->bucket is
+ * expected to be held on entry and is held again on a non-NULL return;
+ * it is dropped before NULL is returned at the end of the table.
+ */
+static void *established_get_next(struct seq_file *seq, void *cur)
+{
+	struct sock *sk = cur;
+	struct inet_timewait_sock *tw;
+	struct hlist_nulls_node *node;
+	struct tcp_iter_state *st = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	++st->num;
+	++st->offset;
+
+	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
+		tw = cur;
+		tw = tw_next(tw);
+get_tw:
+		/* Skip timewait sockets of another family or namespace. */
+		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
+			tw = tw_next(tw);
+		}
+		if (tw) {
+			cur = tw;
+			goto out;
+		}
+		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+
+		/* Look for next non empty bucket */
+		st->offset = 0;
+		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
+				empty_bucket(st))
+			;
+		if (st->bucket > tcp_hashinfo.ehash_mask)
+			return NULL;
+
+		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
+	} else
+		sk = sk_nulls_next(sk);
+
+	sk_nulls_for_each_from(sk, node) {
+		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+			goto found;
+	}
+
+	/* Established chain exhausted: continue on the timewait chain. */
+	st->state = TCP_SEQ_STATE_TIME_WAIT;
+	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
+	goto get_tw;
+found:
+	cur = sk;
+out:
+	return cur;
+}
+
+/*
+ * Return the entry @pos steps into the established/timewait hash walk,
+ * counting from bucket 0, or NULL if the walk ends first.
+ */
+static void *established_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct tcp_iter_state *iter = seq->private;
+	void *entry;
+
+	iter->bucket = 0;
+
+	for (entry = established_get_first(seq); entry && pos; --pos)
+		entry = established_get_next(seq, entry);
+
+	return entry;
+}
+
+/*
+ * Return the entry at index @pos of the combined walk: listening table
+ * first, then the established/timewait table with whatever part of @pos
+ * the listening walk did not consume.
+ */
+static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct tcp_iter_state *iter = seq->private;
+	void *entry;
+
+	iter->state = TCP_SEQ_STATE_LISTENING;
+	entry = listening_get_idx(seq, &pos);
+	if (entry)
+		return entry;
+
+	iter->state = TCP_SEQ_STATE_ESTABLISHED;
+	return established_get_idx(seq, pos);
+}
+
+/*
+ * Try to resume the walk where the previous read stopped, using the
+ * bucket and in-bucket offset saved in the iterator state instead of
+ * restarting from the beginning.  Returns the entry found, or NULL.
+ *
+ * st->num is saved and restored because the *_get_next() helpers used for
+ * stepping increment it.
+ */
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+	struct tcp_iter_state *st = seq->private;
+	int offset = st->offset;
+	int orig_num = st->num;
+	void *rc = NULL;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+	case TCP_SEQ_STATE_LISTENING:
+		if (st->bucket >= INET_LHTABLE_SIZE)
+			break;
+		st->state = TCP_SEQ_STATE_LISTENING;
+		rc = listening_get_next(seq, NULL);
+		while (offset-- && rc)
+			rc = listening_get_next(seq, rc);
+		if (rc)
+			break;
+		/* Listening table exhausted: go on with the first ehash bucket. */
+		st->bucket = 0;
+		/* Fallthrough */
+	case TCP_SEQ_STATE_ESTABLISHED:
+	case TCP_SEQ_STATE_TIME_WAIT:
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		if (st->bucket > tcp_hashinfo.ehash_mask)
+			break;
+		rc = established_get_first(seq);
+		while (offset-- && rc)
+			rc = established_get_next(seq, rc);
+	}
+
+	st->num = orig_num;
+
+	return rc;
+}
+
+/*
+ * seq_file ->start handler.  When *@pos equals the position recorded by
+ * the previous call, first try to resume from the saved iterator state;
+ * otherwise (or if that finds nothing) reset the state and walk to *@pos
+ * from the beginning.  Position 0 yields SEQ_START_TOKEN.
+ */
+static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	if (*pos && *pos == st->last_pos) {
+		rc = tcp_seek_last_pos(seq);
+		if (rc)
+			goto out;
+	}
+
+	st->state = TCP_SEQ_STATE_LISTENING;
+	st->num = 0;
+	st->bucket = 0;
+	st->offset = 0;
+	/* Position 0 is the start token, so entry N sits at position N + 1. */
+	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+	st->last_pos = *pos;
+	return rc;
+}
+
+/*
+ * seq_file ->next handler: advance from @v to the following entry,
+ * switching from the listening table to the established/timewait table
+ * once the former is exhausted.  *@pos is incremented and recorded in
+ * st->last_pos on every call.
+ */
+static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct tcp_iter_state *st = seq->private;
+	void *rc = NULL;
+
+	if (v == SEQ_START_TOKEN) {
+		rc = tcp_get_idx(seq, 0);
+		goto out;
+	}
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+	case TCP_SEQ_STATE_LISTENING:
+		rc = listening_get_next(seq, v);
+		if (!rc) {
+			st->state = TCP_SEQ_STATE_ESTABLISHED;
+			st->bucket = 0;
+			st->offset = 0;
+			rc	  = established_get_first(seq);
+		}
+		break;
+	case TCP_SEQ_STATE_ESTABLISHED:
+	case TCP_SEQ_STATE_TIME_WAIT:
+		rc = established_get_next(seq, v);
+		break;
+	}
+out:
+	++*pos;
+	st->last_pos = *pos;
+	return rc;
+}
+
+/*
+ * seq_file ->stop handler: release whatever locks the walk still holds
+ * for the current entry @v, as determined by st->state.
+ */
+static void tcp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct tcp_iter_state *st = seq->private;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+		if (v) {
+			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		}
+		/* fall through: the listening bucket lock is handled below */
+	case TCP_SEQ_STATE_LISTENING:
+		if (v != SEQ_START_TOKEN)
+			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
+		break;
+	case TCP_SEQ_STATE_TIME_WAIT:
+	case TCP_SEQ_STATE_ESTABLISHED:
+		if (v)
+			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		break;
+	}
+}
+
+/*
+ * Open handler for the proc entries created by tcp_proc_register().
+ * Sets up a seq_file with a struct tcp_iter_state as private data and
+ * initialises its family (from the afinfo stored in the proc entry) and
+ * last_pos.  Returns 0 on success or the negative error from
+ * seq_open_net().
+ */
+static int tcp_seq_open(struct inode *inode, struct file *file)
+{
+	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
+	struct tcp_iter_state *s;
+	int err;
+
+	err = seq_open_net(inode, file, &afinfo->seq_ops,
+			  sizeof(struct tcp_iter_state));
+	if (err < 0)
+		return err;
+
+	s = ((struct seq_file *)file->private_data)->private;
+	s->family		= afinfo->family;
+	s->last_pos 		= 0;
+	return 0;
+}
+
+/*
+ * Fill in the file and seq operations of @afinfo and create its proc
+ * entry (named afinfo->name, mode S_IRUGO) under net->proc_net.
+ * Returns 0 on success, -ENOMEM if the entry could not be created.
+ */
+int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
+{
+	afinfo->seq_fops.open		= tcp_seq_open;
+	afinfo->seq_fops.read		= seq_read;
+	afinfo->seq_fops.llseek		= seq_lseek;
+	afinfo->seq_fops.release	= seq_release_net;
+
+	afinfo->seq_ops.start		= tcp_seq_start;
+	afinfo->seq_ops.next		= tcp_seq_next;
+	afinfo->seq_ops.stop		= tcp_seq_stop;
+
+	if (!proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+			      &afinfo->seq_fops, afinfo))
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_proc_register);
+
+/* Remove the proc entry named afinfo->name from @net. */
+void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
+{
+	proc_net_remove(net, afinfo->name);
+}
+EXPORT_SYMBOL(tcp_proc_unregister);
+
+/*
+ * Print one line describing pending connection request @req of listening
+ * socket @sk to seq file @f.
+ *
+ * @i:   entry number printed in the first column
+ * @uid: user id to print for the entry
+ * @len: passed as the argument of the trailing %n conversion
+ *
+ * The state column is always TCP_SYN_RECV; the two queue columns, the
+ * "non standard timer" column and the inode column are printed as 0.
+ */
+static void get_openreq4(struct sock *sk, struct request_sock *req,
+			 struct seq_file *f, int i, int uid, int *len)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	/* Jiffies remaining until the request expires. */
+	int ttd = req->expires - jiffies;
+
+	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
+		i,
+		ireq->loc_addr,
+		ntohs(inet_sk(sk)->inet_sport),
+		ireq->rmt_addr,
+		ntohs(ireq->rmt_port),
+		TCP_SYN_RECV,
+		0, 0, /* could print option size, but that is af dependent. */
+		1,    /* timers active (only the expire timer) */
+		jiffies_to_clock_t(ttd),
+		req->retrans,
+		uid,
+		0,  /* non standard timer */
+		0, /* open_requests have no inode */
+		atomic_read(&sk->sk_refcnt),
+		req,
+		len);
+}
+
+/*
+ * Emit one /proc/net/tcp row for a full IPv4 TCP socket (used by
+ * tcp4_seq_show() for the listening and established iterator states).
+ *
+ * @sk:  socket to describe
+ * @f:   seq_file receiving the row
+ * @i:   value printed in the leading "sl" column
+ * @len: receives, through %n, the number of characters written
+ *
+ * The "tr" column (timer_active) is 1 when the retransmit timer is pending
+ * (ICSK_TIME_RETRANS), 4 when the probe0 timer is pending
+ * (ICSK_TIME_PROBE0), 2 when sk->sk_timer is pending and 0 otherwise.
+ */
+static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
+{
+	int timer_active;
+	unsigned long timer_expires;
+	long timer_delta;
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
+	int rx_queue;
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		timer_active	= 1;
+		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		timer_active	= 4;
+		timer_expires	= icsk->icsk_timeout;
+	} else if (timer_pending(&sk->sk_timer)) {
+		timer_active	= 2;
+		timer_expires	= sk->sk_timer.expires;
+	} else {
+		timer_active	= 0;
+		timer_expires = jiffies;
+	}
+
+	/*
+	 * The socket is not locked, so the selected timer may already have
+	 * expired by the time we look at it.  Clamp the remaining time at
+	 * zero instead of reporting the wrapped-around unsigned difference.
+	 */
+	timer_delta = (long)(timer_expires - jiffies);
+	if (timer_delta < 0)
+		timer_delta = 0;
+
+	if (sk->sk_state == TCP_LISTEN)
+		rx_queue = sk->sk_ack_backlog;
+	else
+		/*
+		 * because we dont lock socket, we might find a transient negative value
+		 */
+		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
+	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
+		i, src, srcp, dest, destp, sk->sk_state,
+		tp->write_seq - tp->snd_una,
+		rx_queue,
+		timer_active,
+		jiffies_to_clock_t(timer_delta),
+		icsk->icsk_retransmits,
+		sock_i_uid(sk),
+		icsk->icsk_probes_out,
+		sock_i_ino(sk),
+		atomic_read(&sk->sk_refcnt), sk,
+		jiffies_to_clock_t(icsk->icsk_rto),
+		jiffies_to_clock_t(icsk->icsk_ack.ato),
+		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
+		tp->snd_cwnd,
+		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
+		len);
+}
+
+/*
+ * Emit one /proc/net/tcp row for a time-wait socket.  The state column is
+ * tw->tw_substate, the timer column is fixed at 3 and the remaining time is
+ * tw_ttd relative to jiffies, never below zero.  @len receives, through %n,
+ * the number of characters written.
+ */
+static void get_timewait4_sock(struct inet_timewait_sock *tw,
+			       struct seq_file *f, int i, int *len)
+{
+	int remaining = tw->tw_ttd - jiffies;
+	__be32 src = tw->tw_rcv_saddr;
+	__be32 dest = tw->tw_daddr;
+	__u16 srcp = ntohs(tw->tw_sport);
+	__u16 destp = ntohs(tw->tw_dport);
+
+	/* an overdue bucket is reported with no time left */
+	if (remaining < 0)
+		remaining = 0;
+
+	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
+		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+		3, jiffies_to_clock_t(remaining), 0, 0, 0, 0,
+		atomic_read(&tw->tw_refcnt), tw, len);
+}
+
+#define TMPSZ 150
+
+/*
+ * seq_file ->show() for /proc/net/tcp.  The start token produces the column
+ * header; any other element is formatted by the helper that matches the
+ * iterator state.  Every line is padded with blanks to TMPSZ - 1 characters
+ * before the newline.  Always returns 0.
+ */
+static int tcp4_seq_show(struct seq_file *seq, void *v)
+{
+	struct tcp_iter_state *iter;
+	int written;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-*s\n", TMPSZ - 1,
+			   "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode");
+		return 0;
+	}
+
+	iter = seq->private;
+	switch (iter->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+		get_openreq4(iter->syn_wait_sk, v, seq, iter->num, iter->uid,
+			     &written);
+		break;
+	case TCP_SEQ_STATE_TIME_WAIT:
+		get_timewait4_sock(v, seq, iter->num, &written);
+		break;
+	case TCP_SEQ_STATE_LISTENING:
+	case TCP_SEQ_STATE_ESTABLISHED:
+		get_tcp4_sock(v, seq, iter->num, &written);
+		break;
+	}
+
+	/* pad the row out to the fixed width */
+	seq_printf(seq, "%*s\n", TMPSZ - 1 - written, "");
+	return 0;
+}
+
+/*
+ * Descriptor for the IPv4 "tcp" proc file.  Only the owner and the ->show()
+ * callback are set here; tcp_proc_register() fills in the remaining file
+ * and iterator operations.
+ */
+static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+	.name		= "tcp",
+	.family		= AF_INET,
+	.seq_fops	= {
+		.owner		= THIS_MODULE,
+	},
+	.seq_ops	= {
+		.show		= tcp4_seq_show,
+	},
+};
+
+/* Per-namespace init: register the IPv4 "tcp" proc file in @net. */
+static int __net_init tcp4_proc_init_net(struct net *net)
+{
+	return tcp_proc_register(net, &tcp4_seq_afinfo);
+}
+
+/* Per-namespace exit: remove the IPv4 "tcp" proc file from @net. */
+static void __net_exit tcp4_proc_exit_net(struct net *net)
+{
+	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+}
+
+/* Per-namespace hooks that create and remove the IPv4 "tcp" proc file. */
+static struct pernet_operations tcp4_net_ops = {
+	.init = tcp4_proc_init_net,
+	.exit = tcp4_proc_exit_net,
+};
+
+/*
+ * Register tcp4_net_ops as a pernet subsystem.  Returns the result of
+ * register_pernet_subsys().
+ */
+int __init tcp4_proc_init(void)
+{
+	return register_pernet_subsys(&tcp4_net_ops);
+}
+
+/* Undo tcp4_proc_init(): unregister the tcp4_net_ops pernet subsystem. */
+void tcp4_proc_exit(void)
+{
+	unregister_pernet_subsys(&tcp4_net_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * IPv4 front end of TCP GRO receive.
+ *
+ * A CHECKSUM_COMPLETE skb whose tcp_v4_check() result is zero is promoted
+ * to CHECKSUM_UNNECESSARY and passed on.  A CHECKSUM_COMPLETE skb that
+ * fails the check, or a CHECKSUM_NONE skb, is flagged for flush and NULL is
+ * returned.  Any other ip_summed value goes straight to tcp_gro_receive().
+ */
+struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	const struct iphdr *iph = skb_gro_network_header(skb);
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
+			  skb->csum)) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		return tcp_gro_receive(head, skb);
+	}
+
+	/* failed hardware checksum, or no checksum information at all */
+	if (skb->ip_summed == CHECKSUM_COMPLETE ||
+	    skb->ip_summed == CHECKSUM_NONE) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	return tcp_gro_receive(head, skb);
+}
+
+/*
+ * IPv4 front end of TCP GRO completion.  Stores the complement of
+ * tcp_v4_check() -- computed over the transport length and the IP source
+ * and destination addresses with a base sum of 0 -- in th->check, marks the
+ * skb as SKB_GSO_TCPV4 and returns the result of tcp_gro_complete().
+ */
+int tcp4_gro_complete(struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr *th = tcp_hdr(skb);
+
+	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
+				  iph->saddr, iph->daddr, 0);
+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+	return tcp_gro_complete(skb);
+}
+
+/*
+ * Protocol operations table for IPv4 TCP sockets (exported).  It wires the
+ * generic socket entry points to the TCP implementations, points at the
+ * TCP-wide accounting counters and sysctl memory limits, and names the
+ * timewait and request sock operations.  The compat sockopt handlers are
+ * present only when CONFIG_COMPAT is set.
+ */
+struct proto tcp_prot = {
+	.name			= "TCP",
+	.owner			= THIS_MODULE,
+	.close			= tcp_close,
+	.connect		= tcp_v4_connect,
+	.disconnect		= tcp_disconnect,
+	.accept			= inet_csk_accept,
+	.ioctl			= tcp_ioctl,
+	.init			= tcp_v4_init_sock,
+	.destroy		= tcp_v4_destroy_sock,
+	.shutdown		= tcp_shutdown,
+	.setsockopt		= tcp_setsockopt,
+	.getsockopt		= tcp_getsockopt,
+	.recvmsg		= tcp_recvmsg,
+	.sendmsg		= tcp_sendmsg,
+	.sendpage		= tcp_sendpage,
+	.backlog_rcv		= tcp_v4_do_rcv,
+	.hash			= inet_hash,
+	.unhash			= inet_unhash,
+	.get_port		= inet_csk_get_port,
+	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.sockets_allocated	= &tcp_sockets_allocated,
+	.orphan_count		= &tcp_orphan_count,
+	.memory_allocated	= &tcp_memory_allocated,
+	.memory_pressure	= &tcp_memory_pressure,
+	.sysctl_mem		= sysctl_tcp_mem,
+	.sysctl_wmem		= sysctl_tcp_wmem,
+	.sysctl_rmem		= sysctl_tcp_rmem,
+	.max_header		= MAX_TCP_HEADER,
+	.obj_size		= sizeof(struct tcp_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
+	.twsk_prot		= &tcp_timewait_sock_ops,
+	.rsk_prot		= &tcp_request_sock_ops,
+	.h.hashinfo		= &tcp_hashinfo,
+	.no_autobind		= true,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_tcp_setsockopt,
+	.compat_getsockopt	= compat_tcp_getsockopt,
+#endif
+};
+EXPORT_SYMBOL(tcp_prot);
+
+
+/*
+ * Per-namespace init: create the TCP control socket (PF_INET, SOCK_RAW,
+ * IPPROTO_TCP) and store it in net->ipv4.tcp_sock.  Returns the result of
+ * inet_ctl_sock_create().
+ */
+static int __net_init tcp_sk_init(struct net *net)
+{
+	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
+				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
+}
+
+/* Per-namespace exit: destroy the control socket made by tcp_sk_init(). */
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
+}
+
+/*
+ * Batched namespace exit: purge AF_INET time-wait sockets from
+ * tcp_hashinfo / tcp_death_row.  @net_exit_list is not consulted here.
+ */
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+/* Per-namespace hooks for the TCP control socket and time-wait cleanup. */
+static struct pernet_operations __net_initdata tcp_sk_ops = {
+       .init	   = tcp_sk_init,
+       .exit	   = tcp_sk_exit,
+       .exit_batch = tcp_sk_exit_batch,
+};
+
+/*
+ * Boot-time initialisation of IPv4 TCP: set up tcp_hashinfo and register
+ * the per-namespace control-socket operations.  A registration failure is
+ * treated as fatal (panic).
+ */
+void __init tcp_v4_init(void)
+{
+	inet_hashinfo_init(&tcp_hashinfo);
+	if (register_pernet_subsys(&tcp_sk_ops))
+		panic("Failed to create the TCP control socket.\n");
+}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
new file mode 100644
index 00000000..72f7218b
--- /dev/null
+++ b/net/ipv4/tcp_lp.c
@@ -0,0 +1,344 @@
+/*
+ * TCP Low Priority (TCP-LP)
+ *
+ * TCP Low Priority is a distributed algorithm whose goal is to utilize only
+ *   the excess network bandwidth as compared to the ``fair share`` of
+ *   bandwidth as targeted by TCP.
+ *
+ * As of 2.6.13, Linux supports pluggable congestion control algorithms.
+ * Due to the limitation of the API, we take the following changes from
+ * the original TCP-LP implementation:
+ *   o We use newReno in most core CA handling. Only add some checking
+ *     within cong_avoid.
+ *   o Error correction of the remote HZ: the remote HZ estimate is kept
+ *     under continuous checking and updating.
+ *   o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
+ *     OWD has a similar meaning to RTT. Also correct the buggy formula.
+ *   o Handle reaction for Early Congestion Indication (ECI) within
+ *     pkts_acked, as mentioned within the pseudo code.
+ *   o OWD is handled in relative format, where the local time stamp will be
+ *     in tcp_time_stamp format.
+ *
+ * Original Author:
+ *   Aleksandar Kuzmanovic <akuzma@northwestern.edu>
+ * Available from:
+ *   http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * Original implementation for 2.4.19:
+ *   http://www-ece.rice.edu/networks/TCP-LP/
+ *
+ * 2.6.x module Authors:
+ *   Wong Hoi Sing, Edison <hswong3i@gmail.com>
+ *   Hung Hing Lun, Mike <hlhung3i@gmail.com>
+ * SourceForge project page:
+ *   http://tcp-lp-mod.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* resolution of owd */
+#define LP_RESOL       1000
+
+/**
+ * enum tcp_lp_state
+ * @LP_VALID_RHZ: is remote HZ valid?
+ * @LP_VALID_OWD: is OWD valid?
+ * @LP_WITHIN_THR: are we within threshold?
+ * @LP_WITHIN_INF: are we within inference?
+ *
+ * TCP-LP's state flags, stored as a bit mask in struct lp's flag field.
+ * We create this set of state flags mainly for debugging.
+ * Note that bit 2 is not assigned to any flag.
+ */
+enum tcp_lp_state {
+	LP_VALID_RHZ = (1 << 0),
+	LP_VALID_OWD = (1 << 1),
+	LP_WITHIN_THR = (1 << 3),
+	LP_WITHIN_INF = (1 << 4),
+};
+
+/**
+ * struct lp
+ * @flag: TCP-LP state flag (bit mask of enum tcp_lp_state values)
+ * @sowd: smoothed OWD << 3
+ * @owd_min: min OWD
+ * @owd_max: max OWD
+ * @owd_max_rsv: reserved max OWD
+ * @remote_hz: estimated remote HZ
+ * @remote_ref_time: remote reference time
+ * @local_ref_time: local reference time
+ * @last_drop: time for last active drop
+ * @inference: current inference
+ *
+ * TCP-LP's private struct.
+ * The idea comes from the original TCP-LP implementation; only the fields
+ * we found really useful were kept.
+ * tcp_lp_register() checks at build time that it fits in ICSK_CA_PRIV_SIZE.
+ */
+struct lp {
+	u32 flag;
+	u32 sowd;
+	u32 owd_min;
+	u32 owd_max;
+	u32 owd_max_rsv;
+	u32 remote_hz;
+	u32 remote_ref_time;
+	u32 local_ref_time;
+	u32 last_drop;
+	u32 inference;
+};
+
+/**
+ * tcp_lp_init
+ *
+ * Reset the per-connection TCP-LP state.
+ * Every field starts at zero except owd_min, which starts at the largest
+ * u32 value so that any smaller sample replaces it.
+ */
+static void tcp_lp_init(struct sock *sk)
+{
+	struct lp *ca = inet_csk_ca(sk);
+
+	/* state flags and active-drop bookkeeping */
+	ca->flag = 0;
+	ca->last_drop = 0;
+	ca->inference = 0;
+
+	/* remote HZ estimation */
+	ca->remote_hz = 0;
+	ca->remote_ref_time = 0;
+	ca->local_ref_time = 0;
+
+	/* one-way-delay tracking */
+	ca->sowd = 0;
+	ca->owd_max = 0;
+	ca->owd_max_rsv = 0;
+	ca->owd_min = 0xffffffff;
+}
+
+/**
+ * tcp_lp_cong_avoid
+ *
+ * Implementation of cong_avoid.
+ * While the LP_WITHIN_INF flag is set nothing is done; otherwise the
+ * newReno congestion avoidance is applied (additive increase, as in the
+ * TCP-LP paper).
+ */
+static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct lp *ca = inet_csk_ca(sk);
+
+	/* still inside the inference period: leave the window alone */
+	if (ca->flag & LP_WITHIN_INF)
+		return;
+
+	tcp_reno_cong_avoid(sk, ack, in_flight);
+}
+
+/**
+ * tcp_lp_remote_hz_estimator
+ *
+ * Estimate remote HZ.
+ * We keep on updating the estimated value, whereas the original TCP-LP
+ * implementation only guessed it once and used that value forever.
+ *
+ * The estimate is handled here scaled by 64 (<< 6) and smoothed with a
+ * 63/64 old + 1/64 new filter.  Sets or clears LP_VALID_RHZ depending on
+ * whether the resulting estimate is non-zero, always refreshes the
+ * reference time stamps from the last received timestamp option, and
+ * returns the unscaled estimate.
+ *
+ * NOTE(review): the scaled value is rebuilt from lp->remote_hz on every
+ * call, and the caller stores back only the unscaled return value, so the
+ * fractional bits of the filter do not appear to survive between calls.
+ */
+static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	s64 rhz = lp->remote_hz << 6;	/* remote HZ << 6 */
+	s64 m = 0;
+
+	/* no reference time recorded yet:
+	 * just record one below and try again on the next call */
+	if (lp->remote_ref_time == 0 || lp->local_ref_time == 0)
+		goto out;
+
+	/* we can't calculate remote HZ when a time stamp has not advanced */
+	if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+	    tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+		goto out;
+
+	/* remote ticks elapsed per local tick elapsed, scaled to local HZ */
+	m = HZ * (tp->rx_opt.rcv_tsval -
+		  lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr -
+					  lp->local_ref_time);
+	if (m < 0)
+		m = -m;
+
+	if (rhz > 0) {
+		m -= rhz >> 6;	/* m is now error in remote HZ est */
+		rhz += m;	/* 63/64 old + 1/64 new */
+	} else
+		rhz = m << 6;
+
+ out:
+	/* flag whether we hold a usable (non-zero) remote HZ estimate */
+	if ((rhz >> 6) > 0)
+		lp->flag |= LP_VALID_RHZ;
+	else
+		lp->flag &= ~LP_VALID_RHZ;
+
+	/* record reference time stamp */
+	lp->remote_ref_time = tp->rx_opt.rcv_tsval;
+	lp->local_ref_time = tp->rx_opt.rcv_tsecr;
+
+	return rhz >> 6;
+}
+
+/**
+ * tcp_lp_owd_calculator
+ *
+ * Calculate one way delay (in relative format).
+ * The original implementation took OWD directly as the remote time
+ * difference minus the local time difference. As each of those differences
+ * simply equals the RTT, when the network is stable the remote RTT equals
+ * the local RTT and the OWD comes out as zero.
+ * That seems to be a bug, so we fixed it.
+ *
+ * Refreshes lp->remote_hz first.  When the remote HZ is valid, the OWD is
+ * the absolute difference between the received tsval and tsecr, each
+ * converted to LP_RESOL units with an integer per-tick factor.  Sets or
+ * clears LP_VALID_OWD depending on whether the result is non-zero and
+ * returns the result (0 when the remote HZ is not valid).
+ */
+static u32 tcp_lp_owd_calculator(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	s64 owd = 0;
+
+	lp->remote_hz = tcp_lp_remote_hz_estimator(sk);
+
+	/* LP_VALID_RHZ implies remote_hz != 0, so the division is safe */
+	if (lp->flag & LP_VALID_RHZ) {
+		owd =
+		    tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) -
+		    tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ);
+		if (owd < 0)
+			owd = -owd;
+	}
+
+	if (owd > 0)
+		lp->flag |= LP_VALID_OWD;
+	else
+		lp->flag &= ~LP_VALID_OWD;
+
+	return owd;
+}
+
+/**
+ * tcp_lp_rtt_sample
+ *
+ * Implementation of rtt_sample.
+ * Will take the following actions:
+ *   1. calc OWD,
+ *   2. record the min/max OWD,
+ *   3. calc smoothed OWD (SOWD).
+ * Most ideas come from the original TCP-LP implementation.
+ *
+ * The @rtt argument itself is not used; the sample is derived from the
+ * timestamp option via tcp_lp_owd_calculator().
+ */
+static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
+{
+	struct lp *lp = inet_csk_ca(sk);
+	s64 mowd = tcp_lp_owd_calculator(sk);
+
+	/* sorry that we don't have valid data */
+	if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD))
+		return;
+
+	/* record the next min owd */
+	if (mowd < lp->owd_min)
+		lp->owd_min = mowd;
+
+	/* always forget the max of the max:
+	 * owd_max_rsv holds the largest sample seen, and owd_max is kept
+	 * one step below it */
+	if (mowd > lp->owd_max) {
+		if (mowd > lp->owd_max_rsv) {
+			if (lp->owd_max_rsv == 0)
+				lp->owd_max = mowd;
+			else
+				lp->owd_max = lp->owd_max_rsv;
+			lp->owd_max_rsv = mowd;
+		} else
+			lp->owd_max = mowd;
+	}
+
+	/* calc for smoothed owd (sowd is stored << 3) */
+	if (lp->sowd != 0) {
+		mowd -= lp->sowd >> 3;	/* m is now error in owd est */
+		lp->sowd += mowd;	/* owd = 7/8 owd + 1/8 new */
+	} else
+		lp->sowd = mowd << 3;	/* take the measured time be owd */
+}
+
+/**
+ * tcp_lp_pkts_acked
+ *
+ * Implementation of pkts_acked.
+ * Deal with active drop under Early Congestion Indication.
+ * Only the drop to half and the drop to 1 are handled, because we want to
+ * fall back on newReno for the increase case.
+ * We work it out by following the idea from TCP-LP's paper directly.
+ *
+ * @sk: the socket
+ * @num_acked: not used by this implementation
+ * @rtt_us: RTT sample; a OWD sample is taken only when it is positive
+ *
+ * The local clock is sampled once so that every comparison in one
+ * invocation uses the same "now", and the echoed time stamp is compared
+ * through a signed difference so the test stays correct when the 32-bit
+ * time stamp wraps.
+ */
+static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct lp *lp = inet_csk_ca(sk);
+	u32 now;
+	u32 delta;
+
+	if (rtt_us > 0)
+		tcp_lp_rtt_sample(sk, rtt_us);
+
+	now = tcp_time_stamp;
+
+	/* calc inference: 3 times the age of the echoed time stamp */
+	delta = now - tp->rx_opt.rcv_tsecr;
+	if ((s32)delta > 0)
+		lp->inference = 3 * delta;
+
+	/* test if within inference */
+	if (lp->last_drop && (now - lp->last_drop < lp->inference))
+		lp->flag |= LP_WITHIN_INF;
+	else
+		lp->flag &= ~LP_WITHIN_INF;
+
+	/* test if within threshold: below min + 15% of (max - min) */
+	if (lp->sowd >> 3 <
+	    lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100)
+		lp->flag |= LP_WITHIN_THR;
+	else
+		lp->flag &= ~LP_WITHIN_THR;
+
+	pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
+		 tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+		 lp->sowd >> 3);
+
+	if (lp->flag & LP_WITHIN_THR)
+		return;
+
+	/* FIXME: try to reset owd_min and owd_max here
+	 * so decrease the chance the min/max is no longer suitable
+	 * and will usually be within threshold when within inference */
+	lp->owd_min = lp->sowd >> 3;
+	lp->owd_max = lp->sowd >> 2;
+	lp->owd_max_rsv = lp->sowd >> 2;
+
+	/* happened within inference
+	 * drop snd_cwnd into 1 */
+	if (lp->flag & LP_WITHIN_INF)
+		tp->snd_cwnd = 1U;
+
+	/* happened after inference
+	 * cut snd_cwnd into half */
+	else
+		tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+
+	/* record this drop time */
+	lp->last_drop = now;
+}
+
+/*
+ * Congestion control operations registered under the name "lp".  The
+ * ssthresh and min_cwnd hooks are the Reno ones; init, cong_avoid and
+ * pkts_acked are TCP-LP specific.
+ */
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
+	.flags = TCP_CONG_RTT_STAMP,
+	.init = tcp_lp_init,
+	.ssthresh = tcp_reno_ssthresh,
+	.cong_avoid = tcp_lp_cong_avoid,
+	.min_cwnd = tcp_reno_min_cwnd,
+	.pkts_acked = tcp_lp_pkts_acked,
+
+	.owner = THIS_MODULE,
+	.name = "lp"
+};
+
+/*
+ * Module init: check at build time that struct lp fits in the congestion
+ * control private area (ICSK_CA_PRIV_SIZE), then register the algorithm.
+ */
+static int __init tcp_lp_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_lp);
+}
+
+/* Module exit: unregister the "lp" congestion control algorithm. */
+static void __exit tcp_lp_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_lp);
+}
+
+module_init(tcp_lp_register);
+module_exit(tcp_lp_unregister);
+
+MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun Mike");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Low Priority");
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 00000000..80b1f807
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,786 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+/* Syncookies sysctl value; defaults to 1 and is exported to modules. */
+int sysctl_tcp_syncookies __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_syncookies);
+
+/* Abort-on-overflow sysctl value; zero-initialised. */
+int sysctl_tcp_abort_on_overflow __read_mostly;
+
+/*
+ * Time-wait bookkeeping for TCP: bucket limit (NR_FILE * 2), the slot
+ * period (TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS), the lock, the hash
+ * tables the buckets live in, and the timers and work item that reap
+ * them.  The timer callbacks receive this structure's address as their
+ * argument.
+ */
+struct inet_timewait_death_row tcp_death_row = {
+	.sysctl_max_tw_buckets = NR_FILE * 2,
+	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+	.death_lock	= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
+	.hashinfo	= &tcp_hashinfo,
+	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
+					    (unsigned long)&tcp_death_row),
+	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
+					     inet_twdr_twkill_work),
+/* Short-time timewait calendar */
+
+	.twcal_hand	= -1,
+	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+					    (unsigned long)&tcp_death_row),
+};
+EXPORT_SYMBOL_GPL(tcp_death_row);
+
+/* VJ's idea. Save last timestamp seen from this destination
+ * and hold it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter synchronized
+ * state.
+ */
+
+/*
+ * Copy the socket's most recent timestamp (ts_recent / ts_recent_stamp)
+ * into the inet_peer entry for its destination, when the socket's value is
+ * not older than the cached one or the cached one has aged past
+ * TCP_PAWS_MSL.
+ *
+ * Returns 1 if a peer entry was found, 0 otherwise.
+ */
+static int tcp_remember_stamp(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_peer *peer;
+	bool release_it;
+
+	peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
+	if (!peer)
+		return 0;
+
+	if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
+	    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+	     peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+		peer->tcp_ts = tp->rx_opt.ts_recent;
+		peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
+	}
+
+	/* get_peer() tells us whether the reference is ours to drop */
+	if (release_it)
+		inet_putpeer(peer);
+
+	return 1;
+}
+
+/*
+ * Time-wait counterpart of tcp_remember_stamp(): copy the bucket's most
+ * recent timestamp (tw_ts_recent / tw_ts_recent_stamp) into the inet_peer
+ * entry, under the same freshness rules.
+ *
+ * Returns 1 if a peer entry was found, 0 otherwise.
+ */
+static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
+{
+	struct sock *sk = (struct sock *) tw;
+	const struct tcp_timewait_sock *tcptw;
+	struct inet_peer *peer;
+
+	peer = twsk_getpeer(sk);
+	if (!peer)
+		return 0;
+
+	tcptw = tcp_twsk(sk);
+	if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
+	    ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+	     peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+		peer->tcp_ts = tcptw->tw_ts_recent;
+		peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
+	}
+
+	inet_putpeer(peer);
+	return 1;
+}
+
+/*
+ * Return 1 if the segment [seq, end_seq) is acceptable for the window
+ * [s_win, e_win), 0 otherwise.  A segment is acceptable when it starts
+ * exactly at s_win, when it ends after s_win and starts before e_win, or
+ * when it is empty and sits exactly at e_win.
+ */
+static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+	return seq == s_win ||
+	       (after(end_seq, s_win) && before(seq, e_win)) ||
+	       (seq == e_win && seq == end_seq);
+}
+
+/*
+ * * Main purpose of TIME-WAIT state is to close connection gracefully,
+ *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
+ *   (and, probably, tail of data) and one or more our ACKs are lost.
+ * * What is TIME-WAIT timeout? It is associated with maximal packet
+ *   lifetime in the internet, which results in wrong conclusion, that
+ *   it is set to catch "old duplicate segments" wandering out of their path.
+ *   It is not quite correct. This timeout is calculated so that it exceeds
+ *   maximal retransmission timeout enough to allow to lose one (or more)
+ *   segments sent by peer and our ACKs. This time may be calculated from RTO.
+ * * When TIME-WAIT socket receives RST, it means that another end
+ *   finally closed and we are allowed to kill TIME-WAIT too.
+ * * Second purpose of TIME-WAIT is catching old duplicate segments.
+ *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
+ * When you compare it to RFCs, please, read section SEGMENT ARRIVES
+ * from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) TW bucket
+ * is _not_ stateless. It means, that strictly speaking we must
+ * spinlock it. I do not want! Well, probability of misbehaviour
+ * is ridiculously low and, seems, we could use some mb() tricks
+ * to avoid misread sequence numbers, states etc.  --ANK
+ */
+/*
+ * Process a segment received for a time-wait bucket that is either in the
+ * FIN_WAIT2 substate or in real TIME-WAIT.
+ *
+ * @tw:  the time-wait bucket the segment matched
+ * @skb: the received segment
+ * @th:  its TCP header
+ *
+ * Returns:
+ *   TCP_TW_SUCCESS - nothing more to do; the bucket reference was dropped.
+ *   TCP_TW_RST     - the bucket was descheduled and its reference dropped.
+ *   TCP_TW_ACK     - the bucket reference is NOT dropped here; it is
+ *                    released by the caller.
+ *   TCP_TW_SYN     - acceptable new SYN; the chosen initial sequence number
+ *                    is passed back in TCP_SKB_CB(skb)->when and the bucket
+ *                    reference is not dropped here.
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+			   const struct tcphdr *th)
+{
+	struct tcp_options_received tmp_opt;
+	u8 *hash_location;
+	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+	int paws_reject = 0;
+
+	/* Parse options only if the header carries any and we have a
+	 * remembered timestamp to run the PAWS check against.
+	 */
+	tmp_opt.saw_tstamp = 0;
+	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+		if (tmp_opt.saw_tstamp) {
+			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
+			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
+			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+		}
+	}
+
+	if (tw->tw_substate == TCP_FIN_WAIT2) {
+		/* Just repeat all the checks of tcp_rcv_state_process() */
+
+		/* Out of window, send ACK */
+		if (paws_reject ||
+		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+				   tcptw->tw_rcv_nxt,
+				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+			return TCP_TW_ACK;
+
+		if (th->rst)
+			goto kill;
+
+		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+			goto kill_with_rst;
+
+		/* Dup ACK? */
+		if (!th->ack ||
+		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+			inet_twsk_put(tw);
+			return TCP_TW_SUCCESS;
+		}
+
+		/* New data or FIN. If new data arrive after half-duplex close,
+		 * reset.
+		 */
+		if (!th->fin ||
+		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
+kill_with_rst:
+			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_put(tw);
+			return TCP_TW_RST;
+		}
+
+		/* FIN arrived, enter true time-wait state. */
+		tw->tw_substate	  = TCP_TIME_WAIT;
+		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		if (tmp_opt.saw_tstamp) {
+			tcptw->tw_ts_recent_stamp = get_seconds();
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
+		}
+
+		/* With tw_recycle enabled and the timestamp saved in the
+		 * peer entry, the bucket's own (shorter) timeout is used;
+		 * otherwise the full TCP_TIMEWAIT_LEN.
+		 */
+		if (tcp_death_row.sysctl_tw_recycle &&
+		    tcptw->tw_ts_recent_stamp &&
+		    tcp_tw_remember_stamp(tw))
+			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
+					   TCP_TIMEWAIT_LEN);
+		else
+			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+					   TCP_TIMEWAIT_LEN);
+		return TCP_TW_ACK;
+	}
+
+	/*
+	 *	Now real TIME-WAIT state.
+	 *
+	 *	RFC 1122:
+	 *	"When a connection is [...] on TIME-WAIT state [...]
+	 *	[a TCP] MAY accept a new SYN from the remote TCP to
+	 *	reopen the connection directly, if it:
+	 *
+	 *	(1)  assigns its initial sequence number for the new
+	 *	connection to be larger than the largest sequence
+	 *	number it used on the previous connection incarnation,
+	 *	and
+	 *
+	 *	(2)  returns to TIME-WAIT state if the SYN turns out
+	 *	to be an old duplicate".
+	 */
+
+	if (!paws_reject &&
+	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
+		/* In window segment, it may be only reset or bare ack. */
+
+		if (th->rst) {
+			/* This is TIME_WAIT assassination, in two flavors.
+			 * Oh well... nobody has a sufficient solution to this
+			 * protocol bug yet.
+			 */
+			if (sysctl_tcp_rfc1337 == 0) {
+kill:
+				inet_twsk_deschedule(tw, &tcp_death_row);
+				inet_twsk_put(tw);
+				return TCP_TW_SUCCESS;
+			}
+		}
+		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+				   TCP_TIMEWAIT_LEN);
+
+		if (tmp_opt.saw_tstamp) {
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
+			tcptw->tw_ts_recent_stamp = get_seconds();
+		}
+
+		inet_twsk_put(tw);
+		return TCP_TW_SUCCESS;
+	}
+
+	/* Out of window segment.
+
+	   All the segments are ACKed immediately.
+
+	   The only exception is new SYN. We accept it, if it is
+	   not old duplicate and we are not in danger to be killed
+	   by delayed old duplicates. RFC check is that it has
+	   newer sequence number works at rates <40Mbit/sec.
+	   However, if paws works, it is reliable AND even more,
+	   we even may relax silly seq space cutoff.
+
+	   RED-PEN: we violate main RFC requirement, if this SYN will appear
+	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
+	   we must return socket to time-wait state. It is not good,
+	   but not fatal yet.
+	 */
+
+	if (th->syn && !th->rst && !th->ack && !paws_reject &&
+	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+	     (tmp_opt.saw_tstamp &&
+	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+		/* Pick an ISN beyond what the old incarnation used; zero is
+		 * skipped.
+		 */
+		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
+		if (isn == 0)
+			isn++;
+		TCP_SKB_CB(skb)->when = isn;
+		return TCP_TW_SYN;
+	}
+
+	if (paws_reject)
+		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+
+	if (!th->rst) {
+		/* In this case we must reset the TIMEWAIT timer.
+		 *
+		 * If it is ACKless SYN it may be both old duplicate
+		 * and new good SYN with random sequence number <rcv_nxt.
+		 * Do not reschedule in the last case.
+		 */
+		if (paws_reject || th->ack)
+			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+					   TCP_TIMEWAIT_LEN);
+
+		/* Send ACK. Note, we do not put the bucket,
+		 * it will be released by caller.
+		 */
+		return TCP_TW_ACK;
+	}
+	inet_twsk_put(tw);
+	return TCP_TW_SUCCESS;
+}
+EXPORT_SYMBOL(tcp_timewait_state_process);
+
+/*
+ * Move a socket to time-wait or dead fin-wait-2 state.
+ *
+ * @sk:    the socket being closed
+ * @state: substate for the new bucket; TCP_TIME_WAIT gets the full
+ *         TCP_TIMEWAIT_LEN timeout unless recycling applies
+ * @timeo: requested timeout, raised to at least 3.5 * RTO
+ *
+ * A time-wait bucket is allocated only while the bucket count is below
+ * sysctl_max_tw_buckets.  If none is obtained, the overflow counter is
+ * bumped and the socket is simply closed.  In every case the socket's
+ * metrics are updated and the socket is finished with tcp_done().
+ */
+void tcp_time_wait(struct sock *sk, int state, int timeo)
+{
+	struct inet_timewait_sock *tw = NULL;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int recycle_ok = 0;
+
+	/* Recycling needs a timestamp that could be stored in the peer entry. */
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
+		recycle_ok = tcp_remember_stamp(sk);
+
+	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+		tw = inet_twsk_alloc(sk, state);
+
+	if (tw != NULL) {
+		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+		/* 4 * rto - rto / 2 = 3.5 * rto */
+		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+		/* Carry over the state needed to validate later segments. */
+		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
+		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
+		tcptw->tw_snd_nxt	= tp->snd_nxt;
+		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
+		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
+		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (tw->tw_family == PF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+			struct inet6_timewait_sock *tw6;
+
+			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+			tw6 = inet6_twsk((struct sock *)tw);
+			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6only = np->ipv6only;
+		}
+#endif
+
+#ifdef CONFIG_TCP_MD5SIG
+		/*
+		 * The timewait bucket does not have the key DB from the
+		 * sock structure. We just make a quick copy of the
+		 * md5 key being used (if indeed we are using one)
+		 * so the timewait ack generating code has the key.
+		 */
+		do {
+			struct tcp_md5sig_key *key;
+			memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
+			tcptw->tw_md5_keylen = 0;
+			key = tp->af_specific->md5_lookup(sk, sk);
+			if (key != NULL) {
+				memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
+				tcptw->tw_md5_keylen = key->keylen;
+				/* released again in tcp_twsk_destructor() */
+				if (tcp_alloc_md5sig_pool(sk) == NULL)
+					BUG();
+			}
+		} while (0);
+#endif
+
+		/* Linkage updates. */
+		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+
+		/* Get the TIME_WAIT timeout firing. */
+		if (timeo < rto)
+			timeo = rto;
+
+		if (recycle_ok) {
+			tw->tw_timeout = rto;
+		} else {
+			tw->tw_timeout = TCP_TIMEWAIT_LEN;
+			if (state == TCP_TIME_WAIT)
+				timeo = TCP_TIMEWAIT_LEN;
+		}
+
+		inet_twsk_schedule(tw, &tcp_death_row, timeo,
+				   TCP_TIMEWAIT_LEN);
+		inet_twsk_put(tw);
+	} else {
+		/* Sorry, if we're out of memory, just CLOSE this
+		 * socket up.  We've got bigger problems than
+		 * non-graceful socket closings.
+		 */
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+	}
+
+	tcp_update_metrics(sk);
+	tcp_done(sk);
+}
+
+/*
+ * Destructor for TCP time-wait sockets.  With CONFIG_TCP_MD5SIG, a bucket
+ * that holds a copied MD5 key (tw_md5_keylen != 0) releases the md5sig
+ * pool; without that option this function does nothing.
+ */
+void tcp_twsk_destructor(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+	if (twsk->tw_md5_keylen)
+		tcp_free_md5sig_pool();
+#endif
+}
+EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+
+/*
+ * Set the child's ECN flags from the connection request: TCP_ECN_OK when
+ * the request has ecn_ok set, no flags otherwise.
+ */
+static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
+					 struct request_sock *req)
+{
+	if (inet_rsk(req)->ecn_ok)
+		tp->ecn_flags = TCP_ECN_OK;
+	else
+		tp->ecn_flags = 0;
+}
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could avoid lots of memory writes here. The tp of the
+ * listening socket contains all necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
+{
+	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+
+	if (newsk != NULL) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
+		struct tcp_request_sock *treq = tcp_rsk(req);
+		struct inet_connection_sock *newicsk = inet_csk(newsk);
+		struct tcp_sock *newtp = tcp_sk(newsk);
+		struct tcp_sock *oldtp = tcp_sk(sk);
+		struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
+
+		/* TCP Cookie Transactions require space for the cookie pair,
+		 * as it differs for each connection.  There is no need to
+		 * copy any s_data_payload stored at the original socket.
+		 * Failure will prevent resuming the connection.
+		 *
+		 * Only cookie_desired is carried over from the original
+		 * socket's values; the rest of the new structure starts
+		 * zeroed (kzalloc) with a freshly initialised kref.  If the
+		 * allocation fails the child is left without cookie values.
+		 *
+		 * Presumed copied, in order of appearance:
+		 *	cookie_in_always, cookie_out_never
+		 */
+		if (oldcvp != NULL) {
+			struct tcp_cookie_values *newcvp =
+				kzalloc(sizeof(*newtp->cookie_values),
+					GFP_ATOMIC);
+
+			if (newcvp != NULL) {
+				kref_init(&newcvp->kref);
+				newcvp->cookie_desired =
+						oldcvp->cookie_desired;
+				newtp->cookie_values = newcvp;
+			} else {
+				/* Not Yet Implemented: the failure is not reported to the caller */
+				newtp->cookie_values = NULL;
+			}
+		}
+
+		/* Now setup tcp_sock.
+		 *
+		 * Everything below initialises the cloned child (newsk)
+		 * from the request (req/treq/ireq) and from the segment
+		 * passed in as skb.  Receive-side sequence state starts at
+		 * rcv_isn + 1; send-side state starts at snt_isn + 1 plus
+		 * tcp_s_data_size() of the original socket.
+		 */
+		newtp->pred_flags = 0;
+
+		newtp->rcv_wup = newtp->copied_seq =
+		newtp->rcv_nxt = treq->rcv_isn + 1;
+
+		newtp->snd_sml = newtp->snd_una =
+		newtp->snd_nxt = newtp->snd_up =
+			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+
+		tcp_prequeue_init(newtp);
+
+		tcp_init_wl(newtp, treq->rcv_isn);
+
+		newtp->srtt = 0;
+		newtp->mdev = TCP_TIMEOUT_INIT;
+		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+
+		newtp->packets_out = 0;
+		newtp->retrans_out = 0;
+		newtp->sacked_out = 0;
+		newtp->fackets_out = 0;
+		newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+
+		/* So many TCP implementations out there (incorrectly) count the
+		 * initial SYN frame in their delayed-ACK and congestion control
+		 * algorithms that we must have the following bandaid to talk
+		 * efficiently to them.  -DaveM
+		 */
+		newtp->snd_cwnd = 2;
+		newtp->snd_cwnd_cnt = 0;
+		newtp->bytes_acked = 0;
+
+		newtp->frto_counter = 0;
+		newtp->frto_highmark = 0;
+
+		newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
+
+		tcp_set_ca_state(newsk, TCP_CA_Open);
+		tcp_init_xmit_timers(newsk);
+		skb_queue_head_init(&newtp->out_of_order_queue);
+		newtp->write_seq = newtp->pushed_seq =
+			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
+
+		newtp->rx_opt.saw_tstamp = 0;
+
+		newtp->rx_opt.dsack = 0;
+		newtp->rx_opt.num_sacks = 0;
+
+		newtp->urg_data = 0;
+
+		if (sock_flag(newsk, SOCK_KEEPOPEN))
+			inet_csk_reset_keepalive_timer(newsk,
+						       keepalive_time_when(newtp));
+
+		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+		if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
+			if (sysctl_tcp_fack)
+				tcp_enable_fack(newtp);
+		}
+		newtp->window_clamp = req->window_clamp;
+		newtp->rcv_ssthresh = req->rcv_wnd;
+		newtp->rcv_wnd = req->rcv_wnd;
+		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+		if (newtp->rx_opt.wscale_ok) {
+			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+		} else {
+			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+			newtp->window_clamp = min(newtp->window_clamp, 65535U);
+		}
+		newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
+				  newtp->rx_opt.snd_wscale);
+		newtp->max_window = newtp->snd_wnd;
+
+		if (newtp->rx_opt.tstamp_ok) {
+			newtp->rx_opt.ts_recent = req->ts_recent;
+			newtp->rx_opt.ts_recent_stamp = get_seconds();
+			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+		} else {
+			newtp->rx_opt.ts_recent_stamp = 0;
+			newtp->tcp_header_len = sizeof(struct tcphdr);
+		}
+#ifdef CONFIG_TCP_MD5SIG
+		newtp->md5sig_info = NULL;	/*XXX*/
+		if (newtp->af_specific->md5_lookup(sk, newsk))
+			newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+		if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+		newtp->rx_opt.mss_clamp = req->mss;
+		TCP_ECN_openreq_child(newtp, req);
+
+		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+	}
+	return newsk;	/* NULL when inet_csk_clone() failed */
+}
+EXPORT_SYMBOL(tcp_create_openreq_child);
+
+/*
+ *	Process an incoming packet for SYN_RECV sockets represented
+ *	as a request_sock.
+ *
+ *	sk is the listening socket, skb the received segment and req the
+ *	pending request; prev is passed through to
+ *	inet_csk_reqsk_queue_unlink() / inet_csk_reqsk_queue_drop().
+ *
+ *	Returns the newly created child socket when the segment carries
+ *	the valid final ACK, sk itself when the ACK number is unacceptable
+ *	(the reset is then left to the listening socket), and NULL when
+ *	the segment has been answered or dropped here.
+ */
+
+struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
+			   struct request_sock *req,
+			   struct request_sock **prev)
+{
+	struct tcp_options_received tmp_opt;
+	u8 *hash_location;
+	struct sock *child;
+	const struct tcphdr *th = tcp_hdr(skb);
+	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+	int paws_reject = 0;
+
+	tmp_opt.saw_tstamp = 0;
+	if (th->doff > (sizeof(struct tcphdr)>>2)) {
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+		if (tmp_opt.saw_tstamp) {
+			tmp_opt.ts_recent = req->ts_recent;
+			/* We do not store the true stamp, but it is not required,
+			 * it can be estimated (approximately)
+			 * from other data.
+			 */
+			tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
+		}
+	}
+
+	/* Check for pure retransmitted SYN. */
+	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
+	    flg == TCP_FLAG_SYN &&
+	    !paws_reject) {
+		/*
+		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+		 * this case on figure 6 and figure 8, but formal
+		 * protocol description says NOTHING.
+		 * To be more exact, it says that we should send ACK,
+		 * because this segment (at least, if it has no data)
+		 * is out of window.
+		 *
+		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+		 *  describe SYN-RECV state. All the description
+		 *  is wrong, we cannot believe it and should
+		 *  rely only on common sense and implementation
+		 *  experience.
+		 *
+		 * Enforce "SYN-ACK" according to figure 8, figure 6
+		 * of RFC793, fixed by RFC1122.
+		 */
+		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+		return NULL;
+	}
+
+	/* Further reproduces section "SEGMENT ARRIVES"
+	   for state SYN-RECEIVED of RFC793.
+	   It is broken, however, it does not work only
+	   when SYNs are crossed.
+
+	   You would think that SYN crossing is impossible here, since
+	   we should have a SYN_SENT socket (from connect()) on our end,
+	   but this is not true if the crossed SYNs were sent to both
+	   ends by a malicious third party.  We must defend against this,
+	   and to do that we first verify the ACK (as per RFC793, page
+	   36) and reset if it is invalid.  Is this a true full defense?
+	   To convince ourselves, let us consider a way in which the ACK
+	   test can still pass in this 'malicious crossed SYNs' case.
+	   Malicious sender sends identical SYNs (and thus identical sequence
+	   numbers) to both A and B:
+
+		A: gets SYN, seq=7
+		B: gets SYN, seq=7
+
+	   By our good fortune, both A and B select the same initial
+	   send sequence number of seven :-)
+
+		A: sends SYN|ACK, seq=7, ack_seq=8
+		B: sends SYN|ACK, seq=7, ack_seq=8
+
+	   So we are now A eating this SYN|ACK, ACK test passes.  So
+	   does sequence test, SYN is truncated, and thus we consider
+	   it a bare ACK.
+
+	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+	   bare ACK.  Otherwise, we create an established connection.  Both
+	   ends (listening sockets) accept the new incoming connection and try
+	   to talk to each other. 8-)
+
+	   Note: This case is both harmless, and rare.  Possibility is about the
+	   same as us discovering intelligent life on another planet tomorrow.
+
+	   But generally, we should (RFC lies!) accept ACK
+	   from SYNACK both here and in tcp_rcv_state_process().
+	   tcp_rcv_state_process() does not, hence, we do not either.
+
+	   Note that the case is absolutely generic:
+	   we cannot optimize anything here without
+	   violating protocol. All the checks must be made
+	   before attempt to create socket.
+	 */
+
+	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
+	 *                  and the incoming segment acknowledges something not yet
+	 *                  sent (the segment carries an unacceptable ACK) ...
+	 *                  a reset is sent."
+	 *
+	 * Invalid ACK: reset will be sent by listening socket
+	 */
+	if ((flg & TCP_FLAG_ACK) &&
+	    (TCP_SKB_CB(skb)->ack_seq !=
+	     tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
+		return sk;
+
+	/* Also, it would not be such a bad idea to check rcv_tsecr, which
+	 * is essentially ACK extension and too early or too late values
+	 * should cause reset in unsynchronized states.
+	 */
+
+	/* RFC793: "first check sequence number". */
+
+	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
+		/* Out of window: send ACK and drop. */
+		if (!(flg & TCP_FLAG_RST))
+			req->rsk_ops->send_ack(sk, skb, req);
+		if (paws_reject)
+			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+		return NULL;
+	}
+
+	/* In sequence, PAWS is OK. */
+
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
+		req->ts_recent = tmp_opt.rcv_tsval;
+
+	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
+		/* Truncate SYN, it is out of window starting
+		   at tcp_rsk(req)->rcv_isn + 1. */
+		flg &= ~TCP_FLAG_SYN;
+	}
+
+	/* RFC793: "second check the RST bit" and
+	 *	   "fourth, check the SYN bit"
+	 */
+	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
+		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+		goto embryonic_reset;
+	}
+
+	/* ACK sequence verified above, just make sure ACK is
+	 * set.  If ACK not set, just silently drop the packet.
+	 */
+	if (!(flg & TCP_FLAG_ACK))
+		return NULL;
+
+	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
+	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+		inet_rsk(req)->acked = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+		return NULL;
+	}
+
+	/* OK, ACK is valid, create big socket and
+	 * feed this segment to it. It will repeat all
+	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+	 * ESTABLISHED STATE. If it is dropped after the
+	 * socket is created, expect trouble.
+	 */
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+	if (child == NULL)
+		goto listen_overflow;
+
+	inet_csk_reqsk_queue_unlink(sk, req, prev);
+	inet_csk_reqsk_queue_removed(sk, req);
+
+	inet_csk_reqsk_queue_add(sk, req, child);
+	return child;
+
+listen_overflow:
+	if (!sysctl_tcp_abort_on_overflow) {
+		inet_rsk(req)->acked = 1;
+		return NULL;
+	}
+
+embryonic_reset:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+	if (!(flg & TCP_FLAG_RST))
+		req->rsk_ops->send_reset(sk, skb);
+
+	inet_csk_reqsk_queue_drop(sk, req, prev);
+	return NULL;
+}
+EXPORT_SYMBOL(tcp_check_req);
+
+/*
+ * Feed the segment to the child socket.  When user context owns the
+ * child the segment is parked on the child's backlog; otherwise it is
+ * processed immediately and the parent is notified if that moved the
+ * child out of SYN_RECV.
+ *
+ * In both cases the child is unlocked and its reference dropped before
+ * returning.  The return value is that of tcp_rcv_state_process(), or
+ * 0 when the segment was backlogged.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+		      struct sk_buff *skb)
+{
+	const int old_state = child->sk_state;
+	int rc = 0;
+
+	if (sock_owned_by_user(child)) {
+		/* Alas, it is possible again, because we do lookup
+		 * in main socket hash table and lock on listening
+		 * socket does not protect us more.
+		 */
+		__sk_add_backlog(child, skb);
+		goto release;
+	}
+
+	rc = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
+
+	/* Wakeup parent, send SIGIO */
+	if (old_state == TCP_SYN_RECV && child->sk_state != old_state)
+		parent->sk_data_ready(parent, 0);
+
+release:
+	bh_unlock_sock(child);
+	sock_put(child);
+	return rc;
+}
+EXPORT_SYMBOL(tcp_child_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 00000000..faf257b9
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,2853 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
+ *				:	Fragmentation on mtu decrease
+ *				:	Segment collapse on retransmit
+ *				:	AF independence
+ *
+ *		Linus Torvalds	:	send_delayed_ack
+ *		David S. Miller	:	Charge memory using the right skb
+ *					during syn/ack processing.
+ *		David S. Miller :	Output engine completely rewritten.
+ *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
+ *		Cacophonix Gaul :	draft-minshall-nagle-01
+ *		J Hadi Salim	:	ECN support
+ *
+ */
+
+#include <net/tcp.h>
+
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse __read_mostly = 1;
+
+/* People can turn this on to work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ * When set, tcp_select_initial_window() caps the offered window at
+ * MAX_TCP_WINDOW, and tcp_select_window() does the same for
+ * connections whose receive window scale is zero.
+ */
+int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
+
+/* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume.  Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+int sysctl_tcp_tso_win_divisor __read_mostly = 3;
+
+int sysctl_tcp_mtu_probing __read_mostly = 0;
+int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
+
+/* By default, RFC2861 behavior: when set, tcp_event_data_sent() calls
+ * tcp_cwnd_restart() if nothing is in flight and more than one RTO has
+ * passed since the last send.
+ */
+int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+
+int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX; <= 0 means no default, see tcp_cookie_size_check() */
+EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+
+
+/* Bookkeeping for a segment of new data that has just been sent:
+ * advance the send head and SND.NXT, add the segment's packets to
+ * packets_out, and arm the retransmit timer if nothing was outstanding
+ * before this segment.
+ */
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const unsigned int outstanding_before = tp->packets_out;
+
+	tcp_advance_send_head(sk, skb);
+	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+	tp->packets_out += tcp_skb_pcount(skb);
+
+	/* Don't override Nagle indefinitely with F-RTO */
+	if (tp->frto_counter == 2)
+		tp->frto_counter = 3;
+
+	if (outstanding_before)
+		return;
+
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+}
+
+/* Returns SND.NXT as long as it does not lie beyond the end of the
+ * send window.  If the window has been shrunk there is no clearly
+ * right answer: with SND.UNA we would fail to open the window, SND.NXT
+ * is out of window, and anything between SND.UNA and SND.UNA+SND.WND
+ * may already be invalid as well.  For now the end of the send window
+ * is returned in that case.
+ */
+static inline __u32 tcp_acceptable_seq(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const __u32 wnd_end = tcp_wnd_end(tp);
+
+	return before(wnd_end, tp->snd_nxt) ? wnd_end : tp->snd_nxt;
+}
+
+/* Calculate the MSS to advertise in a SYN segment.
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
+ *    attached devices, because some buggy hosts are confused by a
+ *    large MSS.
+ * 4. We do not do 3; we advertise an MSS calculated from the first
+ *    hop device mtu, but allow raising it to ip_rt_min_advmss.
+ *    This may be overridden via information stored in the routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ *    probably even Jumbo".
+ *
+ * If the socket has a cached route whose advmss metric is smaller than
+ * tp->advmss, tp->advmss is lowered to that metric before it is returned.
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *route = __sk_dst_get(sk);
+	int mss = tp->advmss;
+	unsigned int route_advmss;
+
+	if (!route)
+		return (__u16)mss;
+
+	route_advmss = dst_metric_advmss(route);
+	if (route_advmss < mss) {
+		mss = route_advmss;
+		tp->advmss = mss;
+	}
+
+	return (__u16)mss;
+}
+
+/* RFC2861. Reset CWND after an idle period longer than the RTO to the
+ * "restart window".  This is the first part of the cwnd validation
+ * mechanism.
+ *
+ * The congestion window is halved once for every full RTO of idle
+ * time, but never taken below min(initial cwnd, current cwnd).
+ */
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 idle = tcp_time_stamp - tp->lsndtime;
+	u32 restart_wnd = tcp_init_cwnd(tp, dst);
+	u32 cwnd = tp->snd_cwnd;
+
+	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
+
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
+	if (cwnd < restart_wnd)
+		restart_wnd = cwnd;
+
+	for (;;) {
+		idle -= inet_csk(sk)->icsk_rto;
+		if (idle <= 0 || cwnd <= restart_wnd)
+			break;
+		cwnd >>= 1;
+	}
+
+	tp->snd_cwnd = (cwnd > restart_wnd) ? cwnd : restart_wnd;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->snd_cwnd_used = 0;
+}
+
+/* Congestion state accounting after a packet has been sent:
+ * optionally restart the congestion window after an idle period,
+ * record the send time, and enter pingpong mode when this send
+ * follows the last received packet within the ACK timeout.
+ */
+static void tcp_event_data_sent(struct tcp_sock *tp,
+				struct sk_buff *skb, struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 now = tcp_time_stamp;
+
+	if (sysctl_tcp_slow_start_after_idle && !tp->packets_out) {
+		/* Nothing in flight: restart cwnd if idle for more than one RTO. */
+		if ((s32)(now - tp->lsndtime) > icsk->icsk_rto)
+			tcp_cwnd_restart(sk, __sk_dst_get(sk));
+	}
+
+	tp->lsndtime = now;
+
+	/* If it is a reply for ato after last received
+	 * packet, enter pingpong mode.
+	 */
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+		icsk->icsk_ack.pingpong = 1;
+}
+
+/* Account for an ACK we sent: pkts is handed to
+ * tcp_dec_quickack_mode(), and the ICSK_TIME_DACK transmit timer
+ * of the socket is cleared.
+ */
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+{
+	tcp_dec_quickack_mode(sk, pkts);
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results in the tp structure.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ *
+ * __space:	  space on offer; negative values are treated as 0.
+ * mss:		  segment size used to quantize and bound the window.
+ * rcv_wnd:	  out, the initial receive window.
+ * window_clamp:  in/out; 0 on entry means "no clamp" and is replaced
+ *		  by 65535 << 14.  On return it never exceeds
+ *		  65535 << *rcv_wscale.
+ * wscale_ok:	  non-zero if a window scale may be chosen.
+ * rcv_wscale:	  out, the chosen scale (0 when !wscale_ok, at most 14).
+ * init_rcv_wnd:  if non-zero, bound on the initial window in units of
+ *		  mss, used instead of the TCP_DEFAULT_INIT_RCVWND
+ *		  based default.
+ */
+void tcp_select_initial_window(int __space, __u32 mss,
+			       __u32 *rcv_wnd, __u32 *window_clamp,
+			       int wscale_ok, __u8 *rcv_wscale,
+			       __u32 init_rcv_wnd)
+{
+	unsigned int space = (__space < 0 ? 0 : __space);
+
+	/* If no clamp set the clamp to the max possible scaled window */
+	if (*window_clamp == 0)
+		(*window_clamp) = (65535 << 14);
+	space = min(*window_clamp, space);
+
+	/* Quantize space offering to a multiple of mss if possible. */
+	if (space > mss)
+		space = (space / mss) * mss;
+
+	/* NOTE: offering an initial window larger than 32767
+	 * will break some buggy TCP stacks. If the admin tells us
+	 * it is likely we could be speaking with such a buggy stack
+	 * we will truncate our initial window offering to 32K-1
+	 * unless the remote has sent us a window scaling option,
+	 * which we interpret as a sign the remote TCP is not
+	 * misinterpreting the window field as a signed quantity.
+	 */
+	if (sysctl_tcp_workaround_signed_windows)
+		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+	else
+		(*rcv_wnd) = space;
+
+	(*rcv_wscale) = 0;
+	if (wscale_ok) {
+		/* Set window scaling on max possible window
+		 * See RFC1323 for an explanation of the limit to 14
+		 */
+		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+		space = min_t(u32, space, *window_clamp);
+		while (space > 65535 && (*rcv_wscale) < 14) {
+			space >>= 1;
+			(*rcv_wscale)++;
+		}
+	}
+
+	/* Set initial window to a value enough for senders starting with
+	 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
+	 * a limit on the initial window when mss is larger than 1460.
+	 * This bound is only applied when mss exceeds 1 << *rcv_wscale.
+	 */
+	if (mss > (1 << *rcv_wscale)) {
+		int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
+		if (mss > 1460)
+			init_cwnd =
+			max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
+		/* when initializing use the value from init_rcv_wnd
+		 * rather than the default from above
+		 */
+		if (init_rcv_wnd)
+			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+		else
+			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
+	}
+
+	/* Set the clamp no higher than max representable value */
+	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+}
+EXPORT_SYMBOL(tcp_select_initial_window);
+
+/* Choose a new window to advertise, update the state in tcp_sock for
+ * the socket, and return the result with RFC1323 scaling applied.  The
+ * return value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static u16 tcp_select_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 offered = tcp_receive_window(tp);
+	u32 win = __tcp_select_window(sk);
+	u32 ceiling;
+
+	/* Never shrink the offered window.
+	 *
+	 * Danger Will Robinson!
+	 * Don't update rcv_wup/rcv_wnd here or else
+	 * we will not be able to advertise a zero
+	 * window in time.  --DaveM
+	 *
+	 * Relax Will Robinson.
+	 */
+	if (win < offered)
+		win = ALIGN(offered, 1 << tp->rx_opt.rcv_wscale);
+
+	tp->rcv_wnd = win;
+	tp->rcv_wup = tp->rcv_nxt;
+
+	/* Make sure we do not exceed the maximum possible
+	 * scaled window.
+	 */
+	if (tp->rx_opt.rcv_wscale || !sysctl_tcp_workaround_signed_windows)
+		ceiling = 65535U << tp->rx_opt.rcv_wscale;
+	else
+		ceiling = MAX_TCP_WINDOW;
+	if (win > ceiling)
+		win = ceiling;
+
+	/* RFC1323 scaling applied */
+	win >>= tp->rx_opt.rcv_wscale;
+
+	/* If we advertise zero window, disable fast path. */
+	if (!win)
+		tp->pred_flags = 0;
+
+	return win;
+}
+
+/* Packet ECN state for a SYN-ACK: CWR is always cleared from the skb
+ * flags, and ECE as well unless the socket has TCP_ECN_OK set.
+ */
+static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u8 drop = TCPHDR_CWR;
+
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		drop |= TCPHDR_ECE;
+
+	TCP_SKB_CB(skb)->flags &= ~drop;
+}
+
+/* Packet ECN state for a SYN.  Only when sysctl_tcp_ecn is exactly 1
+ * are ECE|CWR set on the SYN and TCP_ECN_OK recorded on the socket;
+ * otherwise the socket's ECN flags are simply reset.
+ */
+static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const int request_ecn = (sysctl_tcp_ecn == 1);
+
+	tp->ecn_flags = request_ecn ? TCP_ECN_OK : 0;
+	if (request_ecn)
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
+}
+
+/* Set ECE in the SYN-ACK header when the request has ecn_ok set. */
+static __inline__ void
+TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+
+	if (!ireq->ecn_ok)
+		return;
+
+	th->ece = 1;
+}
+
+/* Set up ECN state for a packet on an ESTABLISHED socket that is about
+ * to be sent.  Does nothing unless the socket has TCP_ECN_OK set.
+ */
+static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
+				int tcp_header_len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int fresh_data;
+
+	if (!(tp->ecn_flags & TCP_ECN_OK))
+		return;
+
+	fresh_data = skb->len != tcp_header_len &&
+		     !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt);
+
+	if (!fresh_data) {
+		/* ACK or retransmitted segment: clear ECT|CE */
+		INET_ECN_dontxmit(sk);
+	} else {
+		/* Not-retransmitted data segment: set ECT and inject CWR. */
+		INET_ECN_xmit(sk);
+		if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
+			tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+			tcp_hdr(skb)->cwr = 1;
+			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+		}
+	}
+
+	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+		tcp_hdr(skb)->ece = 1;
+}
+
+/* Fill in the common control fields of a non-data skb.  The segment
+ * starts at seq; if SYN or FIN is among the flags its end sequence
+ * number is seq + 1, otherwise seq.
+ */
+static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum = 0;
+
+	shinfo->gso_segs = 1;
+	shinfo->gso_size = 0;
+	shinfo->gso_type = 0;
+
+	tcb->flags = flags;
+	tcb->sacked = 0;
+	tcb->seq = seq;
+	tcb->end_seq = (flags & (TCPHDR_SYN | TCPHDR_FIN)) ? seq + 1 : seq;
+}
+
+/* Non-zero whenever snd_up differs from snd_una. */
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+	return tp->snd_up != tp->snd_una;
+}
+
+#define OPTION_SACK_ADVERTISE	(1 << 0)	/* emit SACK-permitted */
+#define OPTION_TS		(1 << 1)	/* emit timestamps (tsval/tsecr) */
+#define OPTION_MD5		(1 << 2)	/* reserve room for an MD5 signature */
+#define OPTION_WSCALE		(1 << 3)	/* emit window scale (ws) */
+#define OPTION_COOKIE_EXTENSION	(1 << 4)	/* emit cookie (hash_location/hash_size) */
+
+struct tcp_out_options {
+	u8 options;		/* bit field of OPTION_* */
+	u8 ws;			/* window scale; written only if OPTION_WSCALE is set */
+	u8 num_sack_blocks;	/* number of SACK blocks to include */
+	u8 hash_size;		/* bytes in hash_location */
+	u16 mss;		/* 0 to disable */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	__u8 *hash_location;	/* temporary pointer, overloaded: cookie bytes to copy, or where the MD5 signature goes */
+};
+
+/* The sysctl int routines are generic, so check consistency here.
+ *
+ * A previously specified (non-zero) size is returned unchanged.
+ * Otherwise the sysctl default is used: values <= 0 mean "no cookie"
+ * (0 is returned), anything else is clamped to
+ * [TCP_COOKIE_MIN, TCP_COOKIE_MAX] and rounded up to an even number.
+ */
+static u8 tcp_cookie_size_check(u8 desired)
+{
+	int size;
+
+	/* previously specified */
+	if (desired)
+		return desired;
+
+	size = ACCESS_ONCE(sysctl_tcp_cookie_size);
+
+	/* no default specified */
+	if (size <= 0)
+		return 0;
+
+	/* value too small, specify minimum */
+	if (size <= TCP_COOKIE_MIN)
+		return TCP_COOKIE_MIN;
+
+	/* value too large, specify maximum */
+	if (size >= TCP_COOKIE_MAX)
+		return TCP_COOKIE_MAX;
+
+	/* 8-bit multiple is illegal: round odd sizes up to the next even one */
+	return (u8)(size + (size & 1));
+}
+
+/* Write previously computed TCP options to the packet.
+ *
+ * Beware: Something in the Internet is very sensitive to the ordering of
+ * TCP options, we learned this the hard way, so be careful here.
+ * Luckily we can at least blame others for their non-compliance but from
+ * an interoperability perspective it seems that we're somewhat stuck with
+ * the ordering which we have been using if we want to keep working with
+ * those broken things (not that it currently hurts anybody as there isn't
+ * any particular reason why the ordering would need to be changed).
+ *
+ * At least SACK_PERM as the first option is known to lead to a disaster
+ * (but it may well be that other scenarios fail similarly).
+ *
+ * ptr:  where to start writing; advanced in 32-bit words.
+ * tp:   consulted only for the SACK blocks to emit; its rx_opt.dsack
+ *       flag is cleared once they have been written.
+ * opts: the options to emit.  When OPTION_MD5 is set,
+ *       opts->hash_location is overwritten to point at the four words
+ *       left in the header for the signature, and no cookie option is
+ *       written separately.
+ */
+static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+			      struct tcp_out_options *opts)
+{
+	u8 options = opts->options;	/* mungable copy */
+
+	/* Having both authentication and cookies for security is redundant,
+	 * and there's certainly not enough room.  Instead, the cookie-less
+	 * extension variant is proposed.
+	 *
+	 * Consider the pessimal case with authentication.  The options
+	 * could look like:
+	 *   COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
+	 */
+	if (unlikely(OPTION_MD5 & options)) {
+		if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+			*ptr++ = htonl((TCPOPT_COOKIE << 24) |
+				       (TCPOLEN_COOKIE_BASE << 16) |
+				       (TCPOPT_MD5SIG << 8) |
+				       TCPOLEN_MD5SIG);
+		} else {
+			*ptr++ = htonl((TCPOPT_NOP << 24) |
+				       (TCPOPT_NOP << 16) |
+				       (TCPOPT_MD5SIG << 8) |
+				       TCPOLEN_MD5SIG);
+		}
+		options &= ~OPTION_COOKIE_EXTENSION;
+		/* overload cookie hash location */
+		opts->hash_location = (__u8 *)ptr;
+		ptr += 4;
+	}
+
+	if (unlikely(opts->mss)) {
+		*ptr++ = htonl((TCPOPT_MSS << 24) |
+			       (TCPOLEN_MSS << 16) |
+			       opts->mss);
+	}
+
+	if (likely(OPTION_TS & options)) {
+		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+				       (TCPOLEN_SACK_PERM << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+			options &= ~OPTION_SACK_ADVERTISE;
+		} else {
+			*ptr++ = htonl((TCPOPT_NOP << 24) |
+				       (TCPOPT_NOP << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+		}
+		*ptr++ = htonl(opts->tsval);
+		*ptr++ = htonl(opts->tsecr);
+	}
+
+	/* Specification requires after timestamp, so do it now.
+	 *
+	 * Consider the pessimal case without authentication.  The options
+	 * could look like:
+	 *   MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
+	 */
+	if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+		__u8 *cookie_copy = opts->hash_location;
+		u8 cookie_size = opts->hash_size;
+
+		/* 8-bit multiple handled in tcp_cookie_size_check() above,
+		 * and elsewhere.
+		 */
+		if (0x2 & cookie_size) {
+			__u8 *p = (__u8 *)ptr;
+
+			/* 16-bit multiple: kind, length and the first two
+			 * cookie bytes share one word
+			 */
+			*p++ = TCPOPT_COOKIE;
+			*p++ = TCPOLEN_COOKIE_BASE + cookie_size;
+			*p++ = *cookie_copy++;
+			*p++ = *cookie_copy++;
+			ptr++;
+			cookie_size -= 2;
+		} else {
+			/* 32-bit multiple: two NOPs pad the option header */
+			*ptr++ = htonl(((TCPOPT_NOP << 24) |
+					(TCPOPT_NOP << 16) |
+					(TCPOPT_COOKIE << 8) |
+					TCPOLEN_COOKIE_BASE) +
+				       cookie_size);
+		}
+
+		if (cookie_size > 0) {
+			memcpy(ptr, cookie_copy, cookie_size);
+			ptr += (cookie_size / 4);
+		}
+	}
+
+	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_NOP << 16) |
+			       (TCPOPT_SACK_PERM << 8) |
+			       TCPOLEN_SACK_PERM);
+	}
+
+	if (unlikely(OPTION_WSCALE & options)) {
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_WINDOW << 16) |
+			       (TCPOLEN_WINDOW << 8) |
+			       opts->ws);
+	}
+
+	if (unlikely(opts->num_sack_blocks)) {
+		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
+			tp->duplicate_sack : tp->selective_acks;
+		int this_sack;
+
+		*ptr++ = htonl((TCPOPT_NOP  << 24) |
+			       (TCPOPT_NOP  << 16) |
+			       (TCPOPT_SACK <<  8) |
+			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
+						     TCPOLEN_SACK_PERBLOCK)));
+
+		for (this_sack = 0; this_sack < opts->num_sack_blocks;
+		     ++this_sack) {
+			*ptr++ = htonl(sp[this_sack].start_seq);
+			*ptr++ = htonl(sp[this_sack].end_seq);
+		}
+
+		tp->rx_opt.dsack = 0;
+	}
+}
+
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ *
+ * Fills in *opts and *md5 (NULL when no key applies or MD5 support is
+ * not built in) and returns the number of option bytes used out of
+ * MAX_TCP_OPTION_SPACE.
+ *
+ * Side effects: when a cookie is included, cvp->cookie_desired is
+ * updated to the size actually used, and the cookie pair is refilled
+ * with random bytes if that size differs from cvp->cookie_pair_size.
+ */
+static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
+				struct tcp_out_options *opts,
+				struct tcp_md5sig_key **md5) {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_cookie_values *cvp = tp->cookie_values;
+	unsigned remaining = MAX_TCP_OPTION_SPACE;
+	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
+			 tcp_cookie_size_check(cvp->cookie_desired) :
+			 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	*md5 = tp->af_specific->md5_lookup(sk, sk);
+	if (*md5) {
+		opts->options |= OPTION_MD5;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	}
+#else
+	*md5 = NULL;
+#endif
+
+	/* We always get an MSS option.  The option bytes which will be seen in
+	 * normal data packets should timestamps be used, must be in the MSS
+	 * advertised.  But we subtract them from tp->mss_cache so that
+	 * calculations in tcp_sendmsg are simpler etc.  So account for this
+	 * fact here if necessary.  If we don't do this correctly, as a
+	 * receiver we won't recognize data packets as being full sized when we
+	 * should, and thus we won't abide by the delayed ACK rules correctly.
+	 * SACKs don't matter, we never delay an ACK when we have any of those
+	 * going out.  */
+	opts->mss = tcp_advertise_mss(sk);
+	remaining -= TCPOLEN_MSS_ALIGNED;
+
+	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+		opts->options |= OPTION_TS;
+		opts->tsval = TCP_SKB_CB(skb)->when;
+		opts->tsecr = tp->rx_opt.ts_recent;
+		remaining -= TCPOLEN_TSTAMP_ALIGNED;
+	}
+	if (likely(sysctl_tcp_window_scaling)) {
+		opts->ws = tp->rx_opt.rcv_wscale;
+		opts->options |= OPTION_WSCALE;
+		remaining -= TCPOLEN_WSCALE_ALIGNED;
+	}
+	if (likely(sysctl_tcp_sack)) {
+		opts->options |= OPTION_SACK_ADVERTISE;
+		if (unlikely(!(OPTION_TS & opts->options)))
+			remaining -= TCPOLEN_SACKPERM_ALIGNED;
+	}
+
+	/* Note that timestamps are required by the specification.
+	 *
+	 * Odd numbers of bytes are prohibited by the specification, ensuring
+	 * that the cookie is 16-bit aligned, and the resulting cookie pair is
+	 * 32-bit aligned.
+	 *
+	 * The cookie is shrunk until the option fits into the remaining
+	 * space; it is left out entirely once it would have to drop below
+	 * TCP_COOKIE_MIN.
+	 */
+	if (*md5 == NULL &&
+	    (OPTION_TS & opts->options) &&
+	    cookie_size > 0) {
+		int need = TCPOLEN_COOKIE_BASE + cookie_size;
+
+		if (0x2 & need) {
+			/* 32-bit multiple */
+			need += 2; /* NOPs */
+
+			if (need > remaining) {
+				/* try shrinking cookie to fit */
+				cookie_size -= 2;
+				need -= 4;
+			}
+		}
+		while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
+			cookie_size -= 4;
+			need -= 4;
+		}
+		if (TCP_COOKIE_MIN <= cookie_size) {
+			opts->options |= OPTION_COOKIE_EXTENSION;
+			opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
+			opts->hash_size = cookie_size;
+
+			/* Remember for future incarnations. */
+			cvp->cookie_desired = cookie_size;
+
+			if (cvp->cookie_desired != cvp->cookie_pair_size) {
+				/* Currently use random bytes as a nonce,
+				 * assuming these are completely unpredictable
+				 * by hostile users of the same system.
+				 */
+				get_random_bytes(&cvp->cookie_pair[0],
+						 cookie_size);
+				cvp->cookie_pair_size = cookie_size;
+			}
+
+			remaining -= need;
+		}
+	}
+	return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Set up TCP options for SYN-ACKs.
+ *
+ * Fills in *opts and *md5 from the request (req) and returns the
+ * number of option bytes used out of MAX_TCP_OPTION_SPACE.  mss is
+ * the value to put in the MSS option; xvp may be NULL, in which case
+ * no cookie is considered.
+ *
+ * Side effects: with an MD5 key, ireq->tstamp_ok is cleared when
+ * ireq->sack_ok is set; if a requested cookie does not fit,
+ * xvp->cookie_out_never is set.
+ */
+static unsigned tcp_synack_options(struct sock *sk,
+				   struct request_sock *req,
+				   unsigned mss, struct sk_buff *skb,
+				   struct tcp_out_options *opts,
+				   struct tcp_md5sig_key **md5,
+				   struct tcp_extend_values *xvp)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	unsigned remaining = MAX_TCP_OPTION_SPACE;
+	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
+			 xvp->cookie_plus :
+			 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
+	if (*md5) {
+		opts->options |= OPTION_MD5;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+		/* We can't fit any SACK blocks in a packet with MD5 + TS
+		 * options. There was discussion about disabling SACK
+		 * rather than TS in order to fit in better with old,
+		 * buggy kernels, but that was deemed to be unnecessary.
+		 */
+		ireq->tstamp_ok &= !ireq->sack_ok;
+	}
+#else
+	*md5 = NULL;
+#endif
+
+	/* We always send an MSS option. */
+	opts->mss = mss;
+	remaining -= TCPOLEN_MSS_ALIGNED;
+
+	if (likely(ireq->wscale_ok)) {
+		opts->ws = ireq->rcv_wscale;
+		opts->options |= OPTION_WSCALE;
+		remaining -= TCPOLEN_WSCALE_ALIGNED;
+	}
+	if (likely(ireq->tstamp_ok)) {
+		opts->options |= OPTION_TS;
+		opts->tsval = TCP_SKB_CB(skb)->when;
+		opts->tsecr = req->ts_recent;
+		remaining -= TCPOLEN_TSTAMP_ALIGNED;
+	}
+	if (likely(ireq->sack_ok)) {
+		opts->options |= OPTION_SACK_ADVERTISE;
+		if (unlikely(!ireq->tstamp_ok))
+			remaining -= TCPOLEN_SACKPERM_ALIGNED;
+	}
+
+	/* Similar rationale to tcp_syn_options() applies here, too.
+	 * If the <SYN> options fit, the same options should fit now!
+	 */
+	if (*md5 == NULL &&
+	    ireq->tstamp_ok &&
+	    cookie_plus > TCPOLEN_COOKIE_BASE) {
+		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
+
+		if (0x2 & need) {
+			/* 32-bit multiple */
+			need += 2; /* NOPs */
+		}
+		if (need <= remaining) {
+			opts->options |= OPTION_COOKIE_EXTENSION;
+			opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
+			remaining -= need;
+		} else {
+			/* There's no error return, so flag it. */
+			xvp->cookie_out_never = 1; /* true */
+			opts->hash_size = 0;
+		}
+	}
+	return MAX_TCP_OPTION_SPACE - remaining;
+}
+
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ *
+ * Fills @opts and returns the number of option bytes the header will
+ * need.  @skb may be NULL, in which case the timestamp value is
+ * reported as 0; this is how tcp_current_mss() asks for the size only.
+ * *@md5 receives the key returned by the md5_lookup() hook when
+ * CONFIG_TCP_MD5SIG is enabled and is set to NULL otherwise.
+ */
+static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
+					struct tcp_out_options *opts,
+					struct tcp_md5sig_key **md5)
+{
+	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned size = 0;
+	unsigned int eff_sacks;
+
+#ifdef CONFIG_TCP_MD5SIG
+	*md5 = tp->af_specific->md5_lookup(sk, sk);
+	if (unlikely(*md5)) {
+		opts->options |= OPTION_MD5;
+		size += TCPOLEN_MD5SIG_ALIGNED;
+	}
+#else
+	*md5 = NULL;
+#endif
+
+	if (likely(tp->rx_opt.tstamp_ok)) {
+		opts->options |= OPTION_TS;
+		opts->tsval = tcb ? tcb->when : 0;
+		opts->tsecr = tp->rx_opt.ts_recent;
+		size += TCPOLEN_TSTAMP_ALIGNED;
+	}
+
+	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+	if (unlikely(eff_sacks)) {
+		const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
+
+		opts->num_sack_blocks =
+			min_t(unsigned, eff_sacks,
+			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+			      TCPOLEN_SACK_PERBLOCK);
+
+		/* Only count the SACK option when at least one block fits.
+		 * With MD5 and timestamps both present no block fits, and
+		 * charging the base length anyway would make the header
+		 * claim option bytes that nothing fills in.
+		 */
+		if (likely(opts->num_sack_blocks))
+			size += TCPOLEN_SACK_BASE_ALIGNED +
+				opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+	}
+
+	return size;
+}
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg().  This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless.  It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ *
+ * When @clone_it is set, the skb passed in is left alone and a clone
+ * (or a pskb_copy() if it is already cloned) is sent instead, allocated
+ * with @gfp_mask; -ENOBUFS is returned if that allocation fails.
+ * Otherwise the result of the af_ops queue_xmit() hook is returned as
+ * is when it is <= 0; a positive result makes us enter CWR and is
+ * mapped through net_xmit_eval().
+ */
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+			    gfp_t gfp_mask)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet;
+	struct tcp_sock *tp;
+	struct tcp_skb_cb *tcb;
+	struct tcp_out_options opts;
+	unsigned tcp_options_size, tcp_header_size;
+	struct tcp_md5sig_key *md5;
+	struct tcphdr *th;
+	int err;
+
+	BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+	/* If congestion control is doing timestamping, we must
+	 * take such a timestamp before we potentially clone/copy.
+	 */
+	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
+		__net_timestamp(skb);
+
+	if (likely(clone_it)) {
+		if (unlikely(skb_cloned(skb)))
+			skb = pskb_copy(skb, gfp_mask);
+		else
+			skb = skb_clone(skb, gfp_mask);
+		if (unlikely(!skb))
+			return -ENOBUFS;
+	}
+
+	inet = inet_sk(sk);
+	tp = tcp_sk(sk);
+	tcb = TCP_SKB_CB(skb);
+	memset(&opts, 0, sizeof(opts));
+
+	if (unlikely(tcb->flags & TCPHDR_SYN))
+		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+	else
+		tcp_options_size = tcp_established_options(sk, skb, &opts,
+							   &md5);
+	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
+
+	if (tcp_packets_in_flight(tp) == 0) {
+		tcp_ca_event(sk, CA_EVENT_TX_START);
+		skb->ooo_okay = 1;
+	} else
+		skb->ooo_okay = 0;
+
+	skb_push(skb, tcp_header_size);
+	skb_reset_transport_header(skb);
+	skb_set_owner_w(skb, sk);
+
+	/* Build TCP header and checksum it. */
+	th = tcp_hdr(skb);
+	th->source		= inet->inet_sport;
+	th->dest		= inet->inet_dport;
+	th->seq			= htonl(tcb->seq);
+	th->ack_seq		= htonl(tp->rcv_nxt);
+	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
+					tcb->flags);
+
+	if (unlikely(tcb->flags & TCPHDR_SYN)) {
+		/* RFC1323: The window in SYN & SYN/ACK segments
+		 * is never scaled.
+		 */
+		th->window	= htons(min(tp->rcv_wnd, 65535U));
+	} else {
+		th->window	= htons(tcp_select_window(sk));
+	}
+	th->check		= 0;
+	th->urg_ptr		= 0;
+
+	/* The urg_mode check is necessary during a below snd_una win probe */
+	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
+		if (before(tp->snd_up, tcb->seq + 0x10000)) {
+			th->urg_ptr = htons(tp->snd_up - tcb->seq);
+			th->urg = 1;
+		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+			th->urg_ptr = htons(0xFFFF);
+			th->urg = 1;
+		}
+	}
+
+	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	if (likely((tcb->flags & TCPHDR_SYN) == 0))
+		TCP_ECN_send(sk, skb, tcp_header_size);
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Calculate the MD5 hash, as we have all we need now */
+	if (md5) {
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+		tp->af_specific->calc_md5_hash(opts.hash_location,
+					       md5, sk, NULL, skb);
+	}
+#endif
+
+	icsk->icsk_af_ops->send_check(sk, skb);
+
+	if (likely(tcb->flags & TCPHDR_ACK))
+		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+
+	if (skb->len != tcp_header_size)
+		tcp_event_data_sent(tp, skb, sk);
+
+	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+			      tcp_skb_pcount(skb));
+
+	err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
+	if (likely(err <= 0))
+		return err;
+
+	tcp_enter_cwr(sk, 1);
+
+	return net_xmit_eval(err);
+}
+
+/* Append @skb to the tail of the write queue; nothing is sent here.
+ *
+ * NOTE: the probe0 timer is not checked, so do not forget
+ * tcp_push_pending_frames afterwards, otherwise the socket can stall.
+ */
+static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	/* The queued data now ends at this skb's end_seq. */
+	tcp_sk(sk)->write_seq = TCP_SKB_CB(skb)->end_seq;
+
+	skb_header_release(skb);
+	tcp_add_write_queue_tail(sk, skb);
+
+	/* Charge the buffer to the socket's send-side accounting. */
+	sk->sk_wmem_queued += skb->truesize;
+	sk_mem_charge(sk, skb->truesize);
+}
+
+/* Set the GSO segment count, size and type of @skb for the given MSS. */
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
+				 unsigned int mss_now)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (skb->len > mss_now && sk_can_gso(sk) &&
+	    skb->ip_summed != CHECKSUM_NONE) {
+		/* Multi-segment frame: one segment per started MSS. */
+		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+		shinfo->gso_size = mss_now;
+		shinfo->gso_type = sk->sk_gso_type;
+		return;
+	}
+
+	/* The normal non-TSO case: a single segment, and no costly
+	 * divide is needed.
+	 */
+	shinfo->gso_segs = 1;
+	shinfo->gso_size = 0;
+	shinfo->gso_type = 0;
+}
+
+/* Lower fackets_out by @decr, but only when SACK state exists, the
+ * connection is not Reno, and @skb starts below the highest SACKed
+ * sequence (i.e. it is one of the skbs counted in fackets_out).
+ */
+static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
+				   int decr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->sacked_out && !tcp_is_reno(tp) &&
+	    after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
+		tp->fackets_out -= decr;
+}
+
+/* The packet count of an skb in the middle of the write queue changed
+ * by @decr; bring every counter that included it back in line.
+ */
+static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+	tp->packets_out -= decr;
+
+	/* Each per-state counter follows the skb's own state bits. */
+	if (sacked & TCPCB_SACKED_ACKED)
+		tp->sacked_out -= decr;
+	if (sacked & TCPCB_SACKED_RETRANS)
+		tp->retrans_out -= decr;
+	if (sacked & TCPCB_LOST)
+		tp->lost_out -= decr;
+
+	/* Reno case is special: never take more than sacked_out holds. */
+	if (tcp_is_reno(tp) && decr > 0)
+		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+	tcp_adjust_fackets_out(sk, skb, decr);
+
+	/* Keep the cached loss-marking hint consistent as well. */
+	if (tp->lost_skb_hint &&
+	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+	    (tcp_is_fack(tp) || (sacked & TCPCB_SACKED_ACKED)))
+		tp->lost_cnt_hint -= decr;
+
+	tcp_verify_left_out(tp);
+}
+
+/* Function to create two new TCP segments.  Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list.  This won't be called frequently, I hope.
+ * Remember, these are still headerless SKBs at this point.
+ *
+ * @len is the number of bytes that stay in @skb; everything after it
+ * moves to a newly allocated skb, which is linked right behind @skb in
+ * the write queue.  Sequence numbers, flags, checksum state, timestamps
+ * and TSO segment counts are fixed up for both halves.
+ *
+ * Returns 0 on success, -EINVAL (with a warning) if @len is larger than
+ * skb->len, or -ENOMEM if a required allocation fails.
+ */
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+		 unsigned int mss_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *buff;
+	int nsize, old_factor;
+	int nlen;
+	u8 flags;
+
+	if (WARN_ON(len > skb->len))
+		return -EINVAL;
+
+	nsize = skb_headlen(skb) - len;
+	if (nsize < 0)
+		nsize = 0;
+
+	if (skb_cloned(skb) &&
+	    skb_is_nonlinear(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* Get a new skb... force flag on. */
+	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+	if (buff == NULL)
+		return -ENOMEM; /* We'll just try again later. */
+
+	sk->sk_wmem_queued += buff->truesize;
+	sk_mem_charge(sk, buff->truesize);
+	nlen = skb->len - len - nsize;
+	buff->truesize += nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+
+	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
+		/* Copy and checksum data tail into the new buffer. */
+		buff->csum = csum_partial_copy_nocheck(skb->data + len,
+						       skb_put(buff, nsize),
+						       nsize, 0);
+
+		skb_trim(skb, len);
+
+		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
+	} else {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb_split(skb, buff, len);
+	}
+
+	buff->ip_summed = skb->ip_summed;
+
+	/* Looks stupid, but our code really uses when of
+	 * skbs, which it never sent before. --ANK
+	 */
+	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+	buff->tstamp = skb->tstamp;
+
+	old_factor = tcp_skb_pcount(skb);
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+	/* If this packet has been sent out already, we must
+	 * adjust the various packet counters.
+	 */
+	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+		int diff = old_factor - tcp_skb_pcount(skb) -
+			tcp_skb_pcount(buff);
+
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+	}
+
+	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
+	tcp_insert_write_queue_after(skb, buff, sk);
+
+	return 0;
+}
+
+/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
+ * eventually). The difference is that pulled data is not copied, but
+ * immediately discarded.
+ *
+ * @len bytes are eaten from the front of the paged fragments: frags
+ * that are consumed completely have their page released, the first
+ * partially consumed frag is advanced in place, and the surviving
+ * frags are compacted to the start of the frag array.  Afterwards the
+ * tail pointer is reset and skb->len is set equal to skb->data_len, so
+ * only paged data is accounted for in the skb.
+ */
+static void __pskb_trim_head(struct sk_buff *skb, int len)
+{
+	int i, k, eat;
+
+	eat = len;
+	k = 0;
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		if (skb_shinfo(skb)->frags[i].size <= eat) {
+			put_page(skb_shinfo(skb)->frags[i].page);
+			eat -= skb_shinfo(skb)->frags[i].size;
+		} else {
+			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+			if (eat) {
+				skb_shinfo(skb)->frags[k].page_offset += eat;
+				skb_shinfo(skb)->frags[k].size -= eat;
+				eat = 0;
+			}
+			k++;
+		}
+	}
+	skb_shinfo(skb)->nr_frags = k;
+
+	skb_reset_tail_pointer(skb);
+	skb->data_len -= len;
+	skb->len = skb->data_len;
+}
+
+/* Remove acked data from a packet in the transmit queue.
+ *
+ * Drops @len bytes from the front of @skb, advances its starting
+ * sequence number by the same amount and returns the bytes to the
+ * socket's send-side accounting (skb->truesize, sk_wmem_queued and
+ * sk_mem_uncharge()).  The checksum state becomes CHECKSUM_PARTIAL and
+ * the TSO segment count is recomputed for multi-segment skbs.
+ *
+ * Returns 0 on success, or -ENOMEM if @skb is cloned and its head
+ * could not be reallocated.
+ */
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
+	if (unlikely(len < skb_headlen(skb)))
+		__skb_pull(skb, len);
+	else
+		__pskb_trim_head(skb, len - skb_headlen(skb));
+
+	TCP_SKB_CB(skb)->seq += len;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb->truesize	     -= len;
+	sk->sk_wmem_queued   -= len;
+	sk_mem_uncharge(sk, len);
+	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+
+	/* Any change of skb->len requires recalculation of tso factor. */
+	if (tcp_skb_pcount(skb) > 1)
+		tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
+
+	return 0;
+}
+
+/* Turn a path MTU into an MSS.  SACK blocks are not accounted for here;
+ * the fixed TCP options in tp->tcp_header_len are.
+ */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int mss;
+
+	/* Base MSS without TCP options: MMS_S - sizeof(tcphdr) of rfc1122. */
+	mss = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+	/* mss_clamp does not include TCP options either, so clamp first. */
+	mss = min_t(int, mss, tp->rx_opt.mss_clamp);
+
+	/* Take off the optional transport overhead. */
+	mss -= icsk->icsk_ext_hdr_len;
+
+	/* Always leave room for a full set of TCP options plus 8 bytes
+	 * of data.
+	 */
+	mss = max_t(int, mss, 48);
+
+	/* Finally remove the TCP options themselves, SACKs excluded. */
+	mss -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	return mss;
+}
+
+/* Inverse of tcp_mtu_to_mss(): add the TCP header (with its fixed
+ * options), the extension headers and the network header back onto @mss.
+ */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return mss +
+	       tp->tcp_header_len +
+	       icsk->icsk_ext_hdr_len +
+	       icsk->icsk_af_ops->net_header_len;
+}
+
+/* Reset the per-socket MTU probing state.
+ *
+ * Probing is enabled only when sysctl_tcp_mtu_probing is above 1.  The
+ * search range runs from the MTU matching sysctl_tcp_base_mss up to
+ * the MTU matching the negotiated mss_clamp, and no probe is pending.
+ */
+void tcp_mtup_init(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	icsk->icsk_mtup.probe_size = 0;
+	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+				      sizeof(struct tcphdr) +
+				      icsk->icsk_af_ops->net_header_len;
+}
+EXPORT_SYMBOL(tcp_mtup_init);
+
+/* Synchronize the sending MSS with the current pmtu/exthdr settings.
+ *
+ * tp->rx_opt.user_mss is the MSS set by the user through TCP_MAXSEG.
+ * It does NOT count TCP options, only the bare TCP header.
+ *
+ * tp->rx_opt.mss_clamp is the MSS negotiated at connection setup: the
+ * minimum of user_mss and the MSS received with the SYN.  It does not
+ * include TCP options either.
+ *
+ * inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
+ *
+ * tp->mss_cache is the current effective sending MSS, including all
+ * TCP options except SACKs.  It is evaluated against the current pmtu
+ * but never exceeds tp->rx_opt.mss_clamp.
+ *
+ * NOTE1. rfc1122 clearly states that the advertised MSS
+ * DOES NOT include either tcp or ip options.
+ *
+ * NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+ * are READ ONLY outside this function.		--ANK (980731)
+ */
+unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss;
+
+	/* The probing ceiling can never sit above the known path MTU. */
+	if (icsk->icsk_mtup.search_high > pmtu)
+		icsk->icsk_mtup.search_high = pmtu;
+
+	mss = tcp_bound_to_half_wnd(tp, tcp_mtu_to_mss(sk, pmtu));
+
+	/* And store cached results */
+	icsk->icsk_pmtu_cookie = pmtu;
+
+	/* While probing, stay within what the low search bound allows. */
+	if (icsk->icsk_mtup.enabled) {
+		int probe_mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low);
+
+		if (probe_mss < mss)
+			mss = probe_mss;
+	}
+	tp->mss_cache = mss;
+
+	return mss;
+}
+EXPORT_SYMBOL(tcp_sync_mss);
+
+/* Compute the current effective MSS, taking SACKs and IP options,
+ * and even PMTU discovery events into account.
+ */
+unsigned int tcp_current_mss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_out_options opts;
+	struct tcp_md5sig_key *md5;
+	u32 mss_now = tp->mss_cache;
+	unsigned header_len;
+	int delta;
+
+	/* Resynchronize first if the route's MTU moved under us. */
+	if (dst) {
+		u32 mtu = dst_mtu(dst);
+
+		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
+			mss_now = tcp_sync_mss(sk, mtu);
+	}
+
+	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+		     sizeof(struct tcphdr);
+
+	/* mss_cache is sized from tp->tcp_header_len, which assumes some
+	 * common options.  For an odd packet (SACK blocks etc.) the real
+	 * header length differs and the MSS moves by that difference;
+	 * when both match, delta is simply zero.
+	 */
+	delta = (int) header_len - tp->tcp_header_len;
+
+	return mss_now - delta;
+}
+
+/* Congestion window validation. (RFC2861) */
+static void tcp_cwnd_validate(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->packets_out >= tp->snd_cwnd) {
+		/* The network is fully fed: restart the measurement. */
+		tp->snd_cwnd_used = 0;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+		return;
+	}
+
+	/* The network starves: remember the highest usage seen so far. */
+	tp->snd_cwnd_used = max(tp->snd_cwnd_used, tp->packets_out);
+
+	if (!sysctl_tcp_slow_start_after_idle)
+		return;
+
+	/* cwnd went unused for at least one RTO: treat it as app-limited. */
+	if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
+		tcp_cwnd_application_limited(sk);
+}
+
+/* Returns the portion of skb which can be sent right away without
+ * introducing MSS oddities to segment boundaries. In rare cases where
+ * mss_now != mss_cache, we will request caller to create a small skb
+ * per input skb which could be mostly avoided here (if desired).
+ *
+ * We explicitly want to create a request for splitting the write queue
+ * tail to a small skb for Nagle purposes while avoiding unnecessary
+ * modulos, thus all the complexity (the cwnd budget is always an MSS
+ * multiple, which we return whenever the other factors allow it).
+ * Basically we need the modulo only when the receiver window alone is
+ * the limiting factor or when we would be allowed to send the
+ * split-due-to-Nagle skb fully.
+ */
+static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
+					unsigned int mss_now, unsigned int cwnd)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u32 wnd_room = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+	const u32 cwnd_bytes = mss_now * cwnd;
+	u32 sendable;
+
+	/* Common case: cwnd is the limit and skb is not the queue tail. */
+	if (likely(cwnd_bytes <= wnd_room && skb != tcp_write_queue_tail(sk)))
+		return cwnd_bytes;
+
+	sendable = min(skb->len, wnd_room);
+	if (cwnd_bytes <= sendable)
+		return cwnd_bytes;
+
+	/* Round down to a whole number of segments. */
+	return sendable - sendable % mss_now;
+}
+
+/* Congestion window check: returns how many more segments may be put
+ * in flight right now, or 0 if the window is full.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
+					 struct sk_buff *skb)
+{
+	u32 flying, wnd;
+
+	/* The final FIN goes out regardless of the congestion window. */
+	if (tcp_skb_pcount(skb) == 1 &&
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
+		return 1;
+
+	flying = tcp_packets_in_flight(tp);
+	wnd = tp->snd_cwnd;
+
+	return flying < wnd ? wnd - flying : 0;
+}
+
+/* Make sure the TSO state of @skb is valid for @mss_now and return its
+ * segment count.  This must be invoked the first time we consider
+ * transmitting SKB onto the wire.
+ */
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
+			     unsigned int mss_now)
+{
+	int segs = tcp_skb_pcount(skb);
+
+	/* Already set up, and for multi-segment skbs with the same MSS. */
+	if (segs && (segs <= 1 || tcp_skb_mss(skb) == mss_now))
+		return segs;
+
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+
+	return tcp_skb_pcount(skb);
+}
+
+/* Minshall's variant of the Nagle send check: true while snd_sml lies
+ * after snd_una but not after snd_nxt.
+ */
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+	if (!after(tp->snd_sml, tp->snd_una))
+		return 0;
+
+	return !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0 if the packet can be sent now without violating Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+				  const struct sk_buff *skb,
+				  unsigned mss_now, int nonagle)
+{
+	/* Full-sized segments are never held back. */
+	if (skb->len >= mss_now)
+		return 0;
+
+	/* Corked: small segments always wait. */
+	if (nonagle & TCP_NAGLE_CORK)
+		return 1;
+
+	return !nonagle && tp->packets_out && tcp_minshall_check(tp);
+}
+
+/* Return non-zero if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	/* Frames in the middle of the write_queue have no chance to get
+	 * new data, so the Nagle rule does not apply to them.  Callers
+	 * implement this by adjusting the 'nonagle' argument according to
+	 * the location of SKB in the send queue.
+	 */
+	if (nonagle & TCP_NAGLE_PUSH)
+		return 1;
+
+	/* Urgent data and the final FIN are exempt from the nagle rule,
+	 * and Nagle can be ignored during F-RTO too (see RFC4138).
+	 */
+	if (tcp_urg_mode(tp) ||
+	    tp->frto_counter == 2 ||
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
+		return 1;
+
+	return !tcp_nagle_check(tp, skb, cur_mss, nonagle);
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
+				   unsigned int cur_mss)
+{
+	/* For skbs larger than one MSS only the first segment must fit. */
+	u32 first_end = skb->len > cur_mss ?
+			TCP_SKB_CB(skb)->seq + cur_mss :
+			TCP_SKB_CB(skb)->end_seq;
+
+	return !after(first_end, tcp_wnd_end(tp));
+}
+
+/* Decide whether the data bearing packet SKB (usually tcp_send_head(sk))
+ * should be put on the wire right now.  Returns the number of packets
+ * the congestion window allows, or 0 if Nagle, the congestion window
+ * or the send window says no.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int quota;
+
+	tcp_init_tso_segs(sk, skb, cur_mss);
+
+	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+		return 0;
+
+	quota = tcp_cwnd_test(tp, skb);
+	if (!quota)
+		return 0;
+
+	return tcp_snd_wnd_test(tp, skb, cur_mss) ? quota : 0;
+}
+
+/* Test if the skb at the send head, if there is one, may go out now. */
+int tcp_may_send_now(struct sock *sk)
+{
+	struct sk_buff *head = tcp_send_head(sk);
+	int nonagle;
+
+	if (!head)
+		return 0;
+
+	/* Nagle only matters for the last skb in the queue. */
+	nonagle = tcp_skb_is_last(sk, head) ?
+		  tcp_sk(sk)->nonagle : TCP_NAGLE_PUSH;
+
+	return tcp_snd_test(sk, head, tcp_current_mss(sk), nonagle) != 0;
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list.  It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation.  In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ *
+ * If @skb does carry linear data after all, the work is handed to
+ * tcp_fragment() and its result is returned.  Otherwise returns 0 on
+ * success or -ENOMEM if the new skb cannot be allocated with @gfp.
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+			unsigned int mss_now, gfp_t gfp)
+{
+	struct sk_buff *buff;
+	int nlen = skb->len - len;
+	u8 flags;
+
+	/* All of a TSO frame must be composed of paged data.  */
+	if (skb->len != skb->data_len)
+		return tcp_fragment(sk, skb, len, mss_now);
+
+	buff = sk_stream_alloc_skb(sk, 0, gfp);
+	if (unlikely(buff == NULL))
+		return -ENOMEM;
+
+	sk->sk_wmem_queued += buff->truesize;
+	sk_mem_charge(sk, buff->truesize);
+	buff->truesize += nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+
+	/* This packet was never sent out yet, so no SACK bits. */
+	TCP_SKB_CB(buff)->sacked = 0;
+
+	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
+	skb_split(skb, buff, len);
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(sk, skb, mss_now);
+	tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
+	tcp_insert_write_queue_after(skb, buff, sk);
+
+	return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do.  View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ *
+ * Returns 1 if the caller should hold @skb back for now; the time of
+ * the decision is remembered in tp->tso_deferred.  Returns 0 if @skb
+ * should be sent right away, in which case tp->tso_deferred is cleared.
+ * The caller must pass a multi-segment skb and have congestion window
+ * left, otherwise the BUG_ON() below fires.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	u32 send_win, cong_win, limit, in_flight;
+	int win_divisor;
+
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
+		goto send_now;
+
+	if (icsk->icsk_ca_state != TCP_CA_Open)
+		goto send_now;
+
+	/* Defer for less than two clock ticks. */
+	if (tp->tso_deferred &&
+	    (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+		goto send_now;
+
+	in_flight = tcp_packets_in_flight(tp);
+
+	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));
+
+	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+	/* From in_flight test above, we know that cwnd > in_flight.  */
+	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+	limit = min(send_win, cong_win);
+
+	/* If a full-sized TSO skb can be sent, do it. */
+	if (limit >= sk->sk_gso_max_size)
+		goto send_now;
+
+	/* Middle in queue won't get any more data, full sendable already? */
+	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
+		goto send_now;
+
+	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+	if (win_divisor) {
+		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+		/* If at least some fraction of a window is available,
+		 * just use it.
+		 */
+		chunk /= win_divisor;
+		if (limit >= chunk)
+			goto send_now;
+	} else {
+		/* Different approach, try not to defer past a single
+		 * ACK.  Receiver should ACK every other full sized
+		 * frame, so if we have space for more than 3 frames
+		 * then send now.
+		 */
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
+			goto send_now;
+	}
+
+	/* Ok, it looks like it is advisable to defer.  */
+	tp->tso_deferred = 1 | (jiffies << 1);
+
+	return 1;
+
+send_now:
+	tp->tso_deferred = 0;
+	return 0;
+}
+
+/* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets.  This discovers routing
+ * changes resulting in larger path MTUs.
+ *
+ * The probe is a single new skb of twice the cached MSS.  It is placed
+ * in front of the send head and filled by copying data out of the skbs
+ * queued there; skbs that are copied completely are unlinked and
+ * freed, a partially copied one is trimmed at its head.
+ *
+ * Returns 0 if we should wait to probe (no cwnd available),
+ *         1 if a probe was sent,
+ *         -1 otherwise
+ */
+static int tcp_mtu_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *nskb, *next;
+	int len;
+	int probe_size;
+	int size_needed;
+	int copy;
+	int mss_now;
+
+	/* Not currently probing/verifying,
+	 * not in recovery,
+	 * have enough cwnd, and
+	 * not SACKing (the variable headers throw things off) */
+	if (!icsk->icsk_mtup.enabled ||
+	    icsk->icsk_mtup.probe_size ||
+	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+	    tp->snd_cwnd < 11 ||
+	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
+		return -1;
+
+	/* Very simple search strategy: just double the MSS. */
+	mss_now = tcp_current_mss(sk);
+	probe_size = 2 * tp->mss_cache;
+	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+		/* TODO: set timer for probe_converge_event */
+		return -1;
+	}
+
+	/* Have enough data in the send queue to probe? */
+	if (tp->write_seq - tp->snd_nxt < size_needed)
+		return -1;
+
+	if (tp->snd_wnd < size_needed)
+		return -1;
+	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
+		return 0;
+
+	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
+	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
+		if (!tcp_packets_in_flight(tp))
+			return -1;
+		else
+			return 0;
+	}
+
+	/* We're allowed to probe.  Build it now. */
+	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+		return -1;
+	sk->sk_wmem_queued += nskb->truesize;
+	sk_mem_charge(sk, nskb->truesize);
+
+	skb = tcp_send_head(sk);
+
+	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+	TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
+	TCP_SKB_CB(nskb)->sacked = 0;
+	nskb->csum = 0;
+	nskb->ip_summed = skb->ip_summed;
+
+	tcp_insert_write_queue_before(nskb, skb, sk);
+
+	len = 0;
+	tcp_for_write_queue_from_safe(skb, next, sk) {
+		copy = min_t(int, skb->len, probe_size - len);
+		if (nskb->ip_summed)
+			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+		else
+			nskb->csum = skb_copy_and_csum_bits(skb, 0,
+							    skb_put(nskb, copy),
+							    copy, nskb->csum);
+
+		if (skb->len <= copy) {
+			/* We've eaten all the data from this skb.
+			 * Throw it away. */
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+			tcp_unlink_write_queue(skb, sk);
+			sk_wmem_free_skb(sk, skb);
+		} else {
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+						   ~(TCPHDR_FIN|TCPHDR_PSH);
+			if (!skb_shinfo(skb)->nr_frags) {
+				skb_pull(skb, copy);
+				if (skb->ip_summed != CHECKSUM_PARTIAL)
+					skb->csum = csum_partial(skb->data,
+								 skb->len, 0);
+			} else {
+				__pskb_trim_head(skb, copy);
+				tcp_set_skb_tso_segs(sk, skb, mss_now);
+			}
+			TCP_SKB_CB(skb)->seq += copy;
+		}
+
+		len += copy;
+
+		if (len >= probe_size)
+			break;
+	}
+	tcp_init_tso_segs(sk, nskb, nskb->len);
+
+	/* We're ready to send.  If this fails, the probe will
+	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+		/* Decrement cwnd here because we are sending
+		 * effectively two packets. */
+		tp->snd_cwnd--;
+		tcp_event_new_data_sent(sk, nskb);
+
+		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+		return 1;
+	}
+
+	return -1;
+}
+
+/* This routine writes packets to the network.  It advances the
+ * send_head.  This happens as incoming acks open up the remote
+ * window for us.
+ *
+ * LARGESEND note: !tcp_urg_mode is overkill, only frames between
+ * snd_up-64k-mss .. snd_up cannot be large. However, taking into
+ * account rare use of URG, this is not a big flaw.
+ *
+ * When @push_one is set, MTU probing and TSO deferral are skipped and
+ * at most one skb is transmitted.  Otherwise an MTU probe is attempted
+ * first, and if the probe asks us to wait nothing is sent at all.
+ * Sending stops as soon as the congestion window, the send window,
+ * Nagle, TSO deferral, a failed split or a failed transmit says so.
+ *
+ * Returns 1, if no segments are in flight and we have queued segments, but
+ * cannot send anything now because of SWS or another problem.
+ * Returns 0 otherwise.
+ */
+static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			  int push_one, gfp_t gfp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int tso_segs, sent_pkts;
+	int cwnd_quota;
+	int result;
+
+	sent_pkts = 0;
+
+	if (!push_one) {
+		/* Do MTU probing. */
+		result = tcp_mtu_probe(sk);
+		if (!result) {
+			return 0;
+		} else if (result > 0) {
+			sent_pkts = 1;
+		}
+	}
+
+	while ((skb = tcp_send_head(sk))) {
+		unsigned int limit;
+
+		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+		BUG_ON(!tso_segs);
+
+		cwnd_quota = tcp_cwnd_test(tp, skb);
+		if (!cwnd_quota)
+			break;
+
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+			break;
+
+		if (tso_segs == 1) {
+			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+						     (tcp_skb_is_last(sk, skb) ?
+						      nonagle : TCP_NAGLE_PUSH))))
+				break;
+		} else {
+			if (!push_one && tcp_tso_should_defer(sk, skb))
+				break;
+		}
+
+		limit = mss_now;
+		if (tso_segs > 1 && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    cwnd_quota);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+			break;
+
+		/* Advance the send_head.  This one is sent out.
+		 * This call will increment packets_out.
+		 */
+		tcp_event_new_data_sent(sk, skb);
+
+		tcp_minshall_update(tp, mss_now, skb);
+		sent_pkts++;
+
+		if (push_one)
+			break;
+	}
+
+	if (likely(sent_pkts)) {
+		tcp_cwnd_validate(sk);
+		return 0;
+	}
+	return !tp->packets_out && tcp_send_head(sk);
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.  Does nothing in TCP_CLOSE.
+ */
+void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
+			       int nonagle)
+{
+	/* If we are closed, the bytes will have to remain here.
+	 * In time closedown will finish, we empty the write queue and
+	 * all will be happy.
+	 */
+	if (unlikely(sk->sk_state == TCP_CLOSE))
+		return;
+
+	if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+		tcp_check_probe_timer(sk);	/* nothing in flight but data is still queued */
+}
+
+/* Send _single_ skb sitting at the send head. Unlike the full push of
+ * pending frames, the result is ignored here, so no probe timer is set up.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+	struct sk_buff *skb = tcp_send_head(sk);
+
+	BUG_ON(!skb || skb->len < mss_now);	/* needs a queued skb of at least one MSS */
+
+	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);	/* push_one = 1 */
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ *  RECV.NEXT + RCV.WIN fixed until:
+ *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ *	If the free space is less than the 1/4 of the maximum
+ *	space available and the free space is less than 1/2 mss,
+ *	then set the window to 0.
+ *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
+ *	Otherwise, just prevent the window from shrinking
+ *	and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* MSS for the peer's data.  Previous versions used mss_clamp
+	 * here.  I don't know if the value based on our guesses
+	 * of peer's MSS is better for the performance.  It's more correct
+	 * but may be worse for the performance because of rcv_mss
+	 * fluctuations.  --SAW  1998/11/1
+	 */
+	int mss = icsk->icsk_ack.rcv_mss;
+	int free_space = tcp_space(sk);
+	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+	int window;
+
+	if (mss > full_space)
+		mss = full_space;	/* may leave mss <= 0; the division below is guarded */
+
+	if (free_space < (full_space >> 1)) {
+		icsk->icsk_ack.quick = 0;
+
+		if (tcp_memory_pressure)
+			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
+					       4U * tp->advmss);
+
+		if (free_space < mss)
+			return 0;
+	}
+
+	if (free_space > tp->rcv_ssthresh)
+		free_space = tp->rcv_ssthresh;
+
+	/* Don't do rounding if we are using window scaling, since the
+	 * scaled window will not line up with the MSS boundary anyway.
+	 */
+	window = tp->rcv_wnd;
+	if (tp->rx_opt.rcv_wscale) {
+		window = free_space;
+
+		/* Advertise enough space so that it won't get scaled away.
+		 * Important case: prevent zero window announcement if
+		 * 1<<rcv_wscale > mss.
+		 */
+		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
+			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+				  << tp->rx_opt.rcv_wscale);
+	} else {
+		/* Get the largest window that is a nice multiple of mss.
+		 * Window clamp already applied above.
+		 * If our current window offering is within 1 mss of the
+		 * free space we just keep it. This prevents the divide
+		 * and multiply from happening most of the time.
+		 * We also don't do any window rounding when the free space
+		 * is too small.  A non-positive mss offers a zero window.
+		 */
+		if (window <= free_space - mss || window > free_space)
+			window = mss > 0 ? (free_space / mss) * mss : 0;
+		else if (mss == full_space &&
+			 free_space > window + (full_space >> 1))
+			window = free_space;
+	}
+
+	return window;
+}
+
+/* Collapses two adjacent SKBs during retransmission: the next skb is merged into skb and freed. */
+static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+	int skb_size, next_skb_size;
+
+	skb_size = skb->len;
+	next_skb_size = next_skb->len;
+
+	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+
+	tcp_highest_sack_combine(sk, next_skb, skb);
+
+	tcp_unlink_write_queue(next_skb, sk);
+
+	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
+				  next_skb_size);	/* append next_skb's linear data to skb */
+
+	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
+		skb->ip_summed = CHECKSUM_PARTIAL;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+
+	/* Update sequence range on original skb. */
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+	/* Merge over control information. This moves PSH/FIN etc. over */
+	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
+
+	/* Carry over the "ever retransmitted" mark; the second SKB is
+	 * released and accounted for at the end of this function.
+	 */
+	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+
+	/* changed transmit queue under us so clear hints */
+	tcp_clear_retrans_hints_partial(tp);
+	if (next_skb == tp->retransmit_skb_hint)
+		tp->retransmit_skb_hint = skb;	/* hint must not point at the skb being freed */
+
+	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
+	sk_wmem_free_skb(sk, next_skb);
+}
+
+/* Check if coalescing SKBs is legal.  Returns 1 if skb may be collapsed, 0 otherwise. */
+static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
+{
+	if (tcp_skb_pcount(skb) > 1)
+		return 0;
+	/* TODO: SACK collapsing could be used to remove this condition */
+	if (skb_shinfo(skb)->nr_frags != 0)
+		return 0;
+	if (skb_cloned(skb))
+		return 0;
+	if (skb == tcp_send_head(sk))
+		return 0;
+	/* Some heuristics for collapsing over SACK'd could be invented */
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+		return 0;
+
+	return 1;
+}
+
+/* Collapse packets in the retransmit queue to create fewer packets
+ * on the wire. This is only done on retransmission.
+ */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+				     int space)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = to, *tmp;
+	int first = 1;
+
+	if (!sysctl_tcp_retrans_collapse)
+		return;
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
+		return;
+
+	tcp_for_write_queue_from_safe(skb, tmp, sk) {
+		if (!tcp_can_collapse(sk, skb))
+			break;
+
+		space -= skb->len;	/* space: byte budget left for the merged skb */
+
+		if (first) {	/* first pass is 'to' itself: only charge its length */
+			first = 0;
+			continue;
+		}
+
+		if (space < 0)
+			break;
+		/* Punt if not enough space exists in the first SKB for
+		 * the data in the second
+		 */
+		if (skb->len > skb_tailroom(to))
+			break;
+
+		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+			break;
+
+		tcp_collapse_retrans(sk, to);	/* merges the skb following 'to' into 'to' */
+	}
+}
+
+/* This retransmits one SKB.  Policy decisions and retransmit queue
+ * state updates are done by the caller.  Returns non-zero if an
+ * error occurred which prevented the send (a negative errno here).
+ */
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned int cur_mss;
+	int err;
+
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size) {
+		icsk->icsk_mtup.probe_size = 0;
+	}
+
+	/* Do not send more than we queued. 1/4 is reserved for possible
+	 * copying overhead: fragmentation, tunneling, mangling etc.
+	 */
+	if (atomic_read(&sk->sk_wmem_alloc) >
+	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+		return -EAGAIN;
+
+	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {	/* head partially acked */
+		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+			BUG();
+		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+			return -ENOMEM;
+	}
+
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	cur_mss = tcp_current_mss(sk);
+
+	/* If receiver has shrunk his window, and skb is out of
+	 * new window, do not retransmit it. The exception is the
+	 * case, when window is shrunk to zero. In this case
+	 * our retransmit serves as a zero window probe.
+	 */
+	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+	    TCP_SKB_CB(skb)->seq != tp->snd_una)
+		return -EAGAIN;
+
+	if (skb->len > cur_mss) {
+		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
+			return -ENOMEM; /* We'll try again later. */
+	} else {
+		int oldpcount = tcp_skb_pcount(skb);
+
+		if (unlikely(oldpcount > 1)) {	/* recompute pcount for the current MSS */
+			tcp_init_tso_segs(sk, skb, cur_mss);
+			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
+		}
+	}
+
+	tcp_retrans_try_collapse(sk, skb, cur_mss);
+
+	/* Some Solaris stacks overoptimize and ignore the FIN on a
+	 * retransmit when old data is attached.  So strip it off
+	 * since it is cheap to do so and saves bytes on the network.
+	 */
+	if (skb->len > 0 &&
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
+	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+		if (!pskb_trim(skb, 0)) {
+			/* Reuse, even though it does some unnecessary work */
+			tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
+					     TCP_SKB_CB(skb)->flags);
+			skb->ip_summed = CHECKSUM_NONE;
+		}
+	}
+
+	/* Make a copy, if the first transmission SKB clone we made
+	 * is still in somebody's hands, else make a clone.
+	 */
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+
+	if (err == 0) {
+		/* Update global TCP statistics. */
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+
+		tp->total_retrans++;
+
+#if FASTRETRANS_DEBUG > 0
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "retrans_out leaked.\n");
+		}
+#endif
+		if (!tp->retrans_out)
+			tp->lost_retrans_low = tp->snd_nxt;
+		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+		tp->retrans_out += tcp_skb_pcount(skb);
+
+		/* Save stamp of the first retransmit. */
+		if (!tp->retrans_stamp)
+			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
+
+		tp->undo_retrans += tcp_skb_pcount(skb);
+
+		/* snd_nxt is stored to detect loss of retransmitted segment,
+		 * see tcp_input.c tcp_sacktag_write_queue().
+		 */
+		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+	}
+	return err;
+}
+
+/* Check if forward retransmits are possible in the current
+ * window/congestion state.  Returns 1 if they are, 0 otherwise.
+ */
+static int tcp_can_forward_retransmit(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Forward retransmissions are possible only during Recovery. */
+	if (icsk->icsk_ca_state != TCP_CA_Recovery)
+		return 0;
+
+	/* No forward retransmissions in Reno are possible. */
+	if (tcp_is_reno(tp))
+		return 0;
+
+	/* Yeah, we have to make difficult choice between forward transmission
+	 * and retransmission... Both ways have their merits...
+	 *
+	 * For now we do not retransmit anything, while we have some new
+	 * segments to send. In the other cases, follow rule 3 for
+	 * NextSeg() specified in RFC3517.
+	 */
+
+	if (tcp_may_send_now(sk))
+		return 0;
+
+	return 1;
+}
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged.  It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessary retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	struct sk_buff *hole = NULL;
+	u32 last_lost;
+	int mib_idx;
+	int fwd_rexmitting = 0;
+
+	if (!tp->packets_out)
+		return;
+
+	if (!tp->lost_out)
+		tp->retransmit_high = tp->snd_una;
+
+	if (tp->retransmit_skb_hint) {	/* resume from the cached position */
+		skb = tp->retransmit_skb_hint;
+		last_lost = TCP_SKB_CB(skb)->end_seq;
+		if (after(last_lost, tp->retransmit_high))
+			last_lost = tp->retransmit_high;
+	} else {
+		skb = tcp_write_queue_head(sk);
+		last_lost = tp->snd_una;
+	}
+
+	tcp_for_write_queue_from(skb, sk) {
+		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+		if (skb == tcp_send_head(sk))
+			break;
+		/* we could do better than to assign each time */
+		if (hole == NULL)
+			tp->retransmit_skb_hint = skb;
+
+		/* Assume this retransmit will generate
+		 * only one packet for congestion window
+		 * calculation purposes.  This works because
+		 * tcp_retransmit_skb() will chop up the
+		 * packet to be MSS sized and all the
+		 * packet counting works out.
+		 */
+		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+			return;
+
+		if (fwd_rexmitting) {
+begin_fwd:
+			if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
+				break;
+			mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
+
+		} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
+			tp->retransmit_high = last_lost;
+			if (!tcp_can_forward_retransmit(sk))
+				break;
+			/* Backtrack if necessary to non-L'ed skb */
+			if (hole != NULL) {
+				skb = hole;
+				hole = NULL;
+			}
+			fwd_rexmitting = 1;
+			goto begin_fwd;
+
+		} else if (!(sacked & TCPCB_LOST)) {
+			if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+				hole = skb;	/* first skb neither lost, retransmitted nor SACKed */
+			continue;
+
+		} else {
+			last_lost = TCP_SKB_CB(skb)->end_seq;
+			if (icsk->icsk_ca_state != TCP_CA_Loss)
+				mib_idx = LINUX_MIB_TCPFASTRETRANS;
+			else
+				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
+		}
+
+		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
+			continue;
+
+		if (tcp_retransmit_skb(sk, skb))
+			return;
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+
+		if (skb == tcp_write_queue_head(sk))	/* re-arm the retransmit timer for the queue head */
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  inet_csk(sk)->icsk_rto,
+						  TCP_RTO_MAX);
+	}
+}
+
+/* Send a fin.  The caller locks the socket for us.  This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = tcp_write_queue_tail(sk);
+	int mss_now;
+
+	/* Optimization, tack on the FIN if we have a queue of
+	 * unsent frames.  But be careful about outgoing SACKS
+	 * and IP options.
+	 */
+	mss_now = tcp_current_mss(sk);
+
+	if (tcp_send_head(sk) != NULL) {	/* unsent data queued: put the FIN on the tail skb */
+		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
+		TCP_SKB_CB(skb)->end_seq++;
+		tp->write_seq++;
+	} else {
+		/* Socket is locked, keep trying until memory is available. */
+		for (;;) {
+			skb = alloc_skb_fclone(MAX_TCP_HEADER,
+					       sk->sk_allocation);
+			if (skb)
+				break;
+			yield();
+		}
+
+		/* Reserve space for headers and prepare control bits. */
+		skb_reserve(skb, MAX_TCP_HEADER);
+		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+		tcp_init_nondata_skb(skb, tp->write_seq,
+				     TCPHDR_ACK | TCPHDR_FIN);
+		tcp_queue_skb(sk, skb);
+	}
+	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue.  This behavior is recommended
+ * by RFC 2525, section 2.17.  -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+{
+	struct sk_buff *skb;
+
+	/* NOTE: No TCP options attached and we never retransmit this. */
+	skb = alloc_skb(MAX_TCP_HEADER, priority);
+	if (!skb) {	/* no memory: count the failed abort and give up */
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+		return;
+	}
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+			     TCPHDR_ACK | TCPHDR_RST);
+	/* Send it off. */
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	if (tcp_transmit_skb(sk, skb, 0, priority))
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
+
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);	/* counted even if the transmit failed */
+}
+
+/* Send a crossed SYN-ACK during socket establishment.
+ * WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.  Returns 0 or an error.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	skb = tcp_write_queue_head(sk);
+	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
+		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+		return -EFAULT;
+	}
+	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
+		if (skb_cloned(skb)) {	/* swap in a private copy before editing flags */
+			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+			if (nskb == NULL)
+				return -ENOMEM;
+			tcp_unlink_write_queue(skb, sk);
+			skb_header_release(nskb);
+			__tcp_add_write_queue_head(sk, nskb);
+			sk_wmem_free_skb(sk, skb);
+			sk->sk_wmem_queued += nskb->truesize;
+			sk_mem_charge(sk, nskb->truesize);
+			skb = nskb;
+		}
+
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
+		TCP_ECN_send_synack(tcp_sk(sk), skb);
+	}
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+}
+
+/* Prepare a SYN-ACK skb for request 'req'.  Returns NULL if the skb cannot be allocated. */
+struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+				struct request_sock *req,
+				struct request_values *rvp)
+{
+	struct tcp_out_options opts;
+	struct tcp_extend_values *xvp = tcp_xv(rvp);
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_cookie_values *cvp = tp->cookie_values;
+	struct tcphdr *th;
+	struct sk_buff *skb;
+	struct tcp_md5sig_key *md5;
+	int tcp_header_size;
+	int mss;
+	int s_data_desired = 0;
+
+	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
+		s_data_desired = cvp->s_data_desired;
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+
+	skb_dst_set(skb, dst_clone(dst));
+
+	mss = dst_metric_advmss(dst);
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
+		mss = tp->rx_opt.user_mss;
+
+	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+		__u8 rcv_wscale;
+		/* Set this up on the first call only */
+		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+		/* limit the window selection if the user enforces a smaller rx buffer */
+		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+			req->window_clamp = tcp_full_space(sk);
+
+		/* tcp_full_space because it is guaranteed to be the first packet */
+		tcp_select_initial_window(tcp_full_space(sk),
+			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+			&req->rcv_wnd,
+			&req->window_clamp,
+			ireq->wscale_ok,
+			&rcv_wscale,
+			dst_metric(dst, RTAX_INITRWND));
+		ireq->rcv_wscale = rcv_wscale;
+	}
+
+	memset(&opts, 0, sizeof(opts));
+#ifdef CONFIG_SYN_COOKIES
+	if (unlikely(req->cookie_ts))
+		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
+	else
+#endif
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	tcp_header_size = tcp_synack_options(sk, req, mss,
+					     skb, &opts, &md5, xvp)
+			+ sizeof(*th);
+
+	skb_push(skb, tcp_header_size);
+	skb_reset_transport_header(skb);
+
+	th = tcp_hdr(skb);
+	memset(th, 0, sizeof(struct tcphdr));
+	th->syn = 1;
+	th->ack = 1;
+	TCP_ECN_make_synack(req, th);
+	th->source = ireq->loc_port;
+	th->dest = ireq->rmt_port;
+	/* Setting of flags are superfluous here for callers (and ECE is
+	 * not even correctly set)
+	 */
+	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
+			     TCPHDR_SYN | TCPHDR_ACK);
+
+	if (OPTION_COOKIE_EXTENSION & opts.options) {
+		if (s_data_desired) {
+			u8 *buf = skb_put(skb, s_data_desired);
+
+			/* copy data directly from the listening socket. */
+			memcpy(buf, cvp->s_data_payload, s_data_desired);
+			TCP_SKB_CB(skb)->end_seq += s_data_desired;
+		}
+
+		if (opts.hash_size > 0) {
+			__u32 workspace[SHA_WORKSPACE_WORDS];
+			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
+			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
+
+			/* Secret recipe depends on the Timestamp, (future)
+			 * Sequence and Acknowledgment Numbers, Initiator
+			 * Cookie, and others handled by IP variant caller.
+			 */
+			*tail-- ^= opts.tsval;
+			*tail-- ^= tcp_rsk(req)->rcv_isn + 1;
+			*tail-- ^= TCP_SKB_CB(skb)->seq + 1;
+
+			/* recommended */
+			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
+			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
+
+			sha_transform((__u32 *)&xvp->cookie_bakery[0],
+				      (char *)mess,
+				      &workspace[0]);
+			opts.hash_location =
+				(__u8 *)&xvp->cookie_bakery[0];
+		}
+	}
+
+	th->seq = htonl(TCP_SKB_CB(skb)->seq);
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+	th->window = htons(min(req->rcv_wnd, 65535U));
+	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	th->doff = (tcp_header_size >> 2);	/* data offset is in 32-bit words */
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Okay, we have all we need - do the md5 hash if needed */
+	if (md5) {
+		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
+					       md5, NULL, req, skb);
+	}
+#endif
+
+	return skb;
+}
+EXPORT_SYMBOL(tcp_make_synack);
+
+/* Do all connect socket setups that can be done AF independent. */
+static void tcp_connect_init(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u8 rcv_wscale;
+
+	/* We'll fix this up when we get a response from the other end.
+	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+	 */
+	tp->tcp_header_len = sizeof(struct tcphdr) +
+		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+	/* If user gave his TCP_MAXSEG, record it to clamp */
+	if (tp->rx_opt.user_mss)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->max_window = 0;
+	tcp_mtup_init(sk);
+	tcp_sync_mss(sk, dst_mtu(dst));
+
+	if (!tp->window_clamp)
+		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+	tp->advmss = dst_metric_advmss(dst);
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
+		tp->advmss = tp->rx_opt.user_mss;	/* user MSS may only lower advmss */
+
+	tcp_initialize_rcv_mss(sk);
+
+	/* limit the window selection if the user enforces a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+		tp->window_clamp = tcp_full_space(sk);
+
+	tcp_select_initial_window(tcp_full_space(sk),
+				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+				  &tp->rcv_wnd,
+				  &tp->window_clamp,
+				  sysctl_tcp_window_scaling,
+				  &rcv_wscale,
+				  dst_metric(dst, RTAX_INITRWND));
+
+	tp->rx_opt.rcv_wscale = rcv_wscale;
+	tp->rcv_ssthresh = tp->rcv_wnd;
+
+	sk->sk_err = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+	tp->snd_wnd = 0;
+	tcp_init_wl(tp, 0);
+	tp->snd_una = tp->write_seq;	/* send-side sequence state starts at write_seq */
+	tp->snd_sml = tp->write_seq;
+	tp->snd_up = tp->write_seq;
+	tp->rcv_nxt = 0;
+	tp->rcv_wup = 0;
+	tp->copied_seq = 0;
+
+	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+	inet_csk(sk)->icsk_retransmits = 0;
+	tcp_clear_retrans(tp);
+}
+
+/* Build a SYN and send it off.  Returns 0, -ENOBUFS on allocation failure, or -ECONNREFUSED. */
+int tcp_connect(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *buff;
+	int err;
+
+	tcp_connect_init(sk);
+
+	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
+	if (unlikely(buff == NULL))
+		return -ENOBUFS;
+
+	/* Reserve space for headers. */
+	skb_reserve(buff, MAX_TCP_HEADER);
+
+	tp->snd_nxt = tp->write_seq;
+	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+	TCP_ECN_send_syn(sk, buff);
+
+	/* Send it off. */
+	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
+	skb_header_release(buff);
+	__tcp_add_write_queue_tail(sk, buff);
+	sk->sk_wmem_queued += buff->truesize;
+	sk_mem_charge(sk, buff->truesize);
+	tp->packets_out += tcp_skb_pcount(buff);
+	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	if (err == -ECONNREFUSED)	/* any other transmit error is left to the SYN timer below */
+		return err;
+
+	/* We change tp->snd_nxt after the tcp_transmit_skb() call
+	 * in order to make this packet get counted in tcpOutSegs.
+	 */
+	tp->snd_nxt = tp->write_seq;
+	tp->pushed_seq = tp->write_seq;
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
+
+	/* Timer for repeating the SYN until an answer. */
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_connect);
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
+ * for details.  Either arms the delack timer or sends the ACK at once.
+ */
+void tcp_send_delayed_ack(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ato = icsk->icsk_ack.ato;
+	unsigned long timeout;
+
+	if (ato > TCP_DELACK_MIN) {
+		const struct tcp_sock *tp = tcp_sk(sk);
+		int max_ato = HZ / 2;
+
+		if (icsk->icsk_ack.pingpong ||
+		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
+			max_ato = TCP_DELACK_MAX;
+
+		/* Slow path, intersegment interval is "high". */
+
+		/* If some rtt estimate is known, use it to bound delayed ack.
+		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
+		 * directly.
+		 */
+		if (tp->srtt) {
+			int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
+
+			if (rtt < max_ato)
+				max_ato = rtt;
+		}
+
+		ato = min(ato, max_ato);
+	}
+
+	/* Stay within the limit we were given */
+	timeout = jiffies + ato;
+
+	/* Use new timeout only if there wasn't an older one earlier. */
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+		/* If delack timer was blocked or is about to expire,
+		 * send ACK now.
+		 */
+		if (icsk->icsk_ack.blocked ||
+		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+			tcp_send_ack(sk);
+			return;
+		}
+
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;	/* keep the earlier, already armed deadline */
+	}
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+/* This routine sends an ack and also updates the window.  No-op in TCP_CLOSE. */
+void tcp_send_ack(struct sock *sk)
+{
+	struct sk_buff *buff;
+
+	/* If we have been reset, we may not send again. */
+	if (sk->sk_state == TCP_CLOSE)
+		return;
+
+	/* We are not putting this on the write queue, so
+	 * tcp_transmit_skb() will set the ownership to this
+	 * sock.
+	 */
+	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	if (buff == NULL) {	/* no memory: schedule the ACK on the delack timer instead */
+		inet_csk_schedule_ack(sk);
+		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					  TCP_DELACK_MAX, TCP_RTO_MAX);
+		return;
+	}
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(buff, MAX_TCP_HEADER);
+	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
+
+	/* Send it off, this clears delayed acks for us. */
+	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+}
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Question: what should we do in urgent mode?
+ * 4.4BSD forces sending single byte of data. We cannot send
+ * out of window data, because we have SND.NXT==SND.MAX...
+ *
+ * Current solution: to send TWO zero-length segments in urgent mode:
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
+ * out-of-date with SND.UNA-1 to probe window.
+ */
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	/* We don't queue it, tcp_transmit_skb() sets ownership. */
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	if (skb == NULL)
+		return -1;
+
+	/* Reserve space for headers and set control bits. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+	/* Use a previous sequence.  This should cause the other
+	 * end to send an ack.  Don't queue or clone SKB, just
+	 * send it.
+	 */
+	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);	/* urgent: snd_una, else snd_una - 1 */
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+}
+
+/* Initiate keepalive or window probe from timer.  Returns -1 or the transmit result. */
+int tcp_write_wakeup(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (sk->sk_state == TCP_CLOSE)
+		return -1;
+
+	if ((skb = tcp_send_head(sk)) != NULL &&
+	    before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+		int err;
+		unsigned int mss = tcp_current_mss(sk);
+		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+
+		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+		/* We are probing the opening of a window
+		 * but the window size is != 0
+		 * must have been a result of SWS avoidance ( sender )
+		 */
+		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+		    skb->len > mss) {
+			seg_size = min(seg_size, mss);
+			TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
+			if (tcp_fragment(sk, skb, seg_size, mss))
+				return -1;
+		} else if (!tcp_skb_pcount(skb))
+			tcp_set_skb_tso_segs(sk, skb, mss);
+
+		TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+		if (!err)
+			tcp_event_new_data_sent(sk, skb);
+		return err;
+	} else {
+		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
+			tcp_xmit_probe_skb(sk, 1);	/* extra segment at snd_una; its result is ignored */
+		return tcp_xmit_probe_skb(sk, 0);
+	}
+}
+
+/* A window probe timeout has occurred.  If window is not closed send
+ * a partial packet else a zero probe.  Re-arms the probe timer unless
+ * data is in flight or nothing is queued; backoff is computed in 64 bits.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err;
+
+	err = tcp_write_wakeup(sk);
+
+	if (tp->packets_out || !tcp_send_head(sk)) {
+		/* Cancel probe timer, if it is not required. */
+		icsk->icsk_probes_out = 0;
+		icsk->icsk_backoff = 0;
+		return;
+	}
+
+	if (err <= 0) {
+		if (icsk->icsk_backoff < sysctl_tcp_retries2)
+			icsk->icsk_backoff++;
+		icsk->icsk_probes_out++;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min_t(u64, (u64)icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
+	} else {
+		/* If packet was not sent due to local congestion,
+		 * do not backoff and do not remember icsk_probes_out.
+		 * Let local senders to fight for local resources.
+		 *
+		 * Use accumulated backoff yet.
+		 */
+		if (!icsk->icsk_probes_out)
+			icsk->icsk_probes_out = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min_t(u64, (u64)icsk->icsk_rto << icsk->icsk_backoff,
+						TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
+	}
+}
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
new file mode 100644
index 00000000..85ee7eb7
--- /dev/null
+++ b/net/ipv4/tcp_probe.c
@@ -0,0 +1,258 @@
+/*
+ * tcpprobe - Observe the TCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/ktime.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <net/tcp.h>
+
+MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
+MODULE_DESCRIPTION("TCP cwnd snooper");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.1");
+
+static int port __read_mostly = 0;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+static unsigned int bufsize __read_mostly = 4096;
+MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
+module_param(bufsize, uint, 0);
+
+static int full __read_mostly;
+MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
+module_param(full, int, 0);
+
+static const char procname[] = "tcpprobe";
+
+struct tcp_log {
+	ktime_t tstamp;		/* ktime_get() when the record was taken */
+	__be32	saddr, daddr;	/* copied from inet_saddr / inet_daddr */
+	__be16	sport, dport;	/* copied from inet_sport / inet_dport */
+	u16	length;		/* skb->len of the received segment */
+	u32	snd_nxt;
+	u32	snd_una;
+	u32	snd_wnd;
+	u32	snd_cwnd;
+	u32	ssthresh;	/* tcp_current_ssthresh() at sample time */
+	u32	srtt;		/* tp->srtt >> 3 */
+};
+
+static struct {
+	spinlock_t	lock;		/* taken around updates of head, tail, start, lastcwnd */
+	wait_queue_head_t wait;		/* readers sleep here until a record is queued */
+	ktime_t		start;		/* set on open; printed timestamps are relative to it */
+	u32		lastcwnd;	/* snd_cwnd seen by the last matching packet */
+
+	unsigned long	head, tail;	/* ring indices: the probe advances head, read() advances tail */
+	struct tcp_log	*log;		/* ring of bufsize records, allocated at module init */
+} tcp_probe;
+
+
+static inline int tcp_probe_used(void)
+{
+	return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);	/* queued records, modulo bufsize */
+}
+
+static inline int tcp_probe_avail(void)
+{
+	return bufsize - tcp_probe_used() - 1;	/* free slots; one slot is always left unused */
+}
+
+/*
+ * jprobe entry hook for tcp_rcv_established(), called before each received
+ * packet is processed. Note: arguments must match tcp_rcv_established()!
+ */
+static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			       struct tcphdr *th, unsigned len)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
+
+	/* Log if a port matches (or port == 0) and, unless "full", cwnd changed */
+	if ((port == 0 || ntohs(inet->inet_dport) == port ||
+	     ntohs(inet->inet_sport) == port) &&
+	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
+
+		spin_lock(&tcp_probe.lock);
+		/* If log fills, just silently drop */
+		if (tcp_probe_avail() > 1) {
+			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
+
+			p->tstamp = ktime_get();
+			p->saddr = inet->inet_saddr;
+			p->sport = inet->inet_sport;
+			p->daddr = inet->inet_daddr;
+			p->dport = inet->inet_dport;
+			p->length = skb->len;
+			p->snd_nxt = tp->snd_nxt;
+			p->snd_una = tp->snd_una;
+			p->snd_cwnd = tp->snd_cwnd;
+			p->snd_wnd = tp->snd_wnd;
+			p->ssthresh = tcp_current_ssthresh(sk);
+			p->srtt = tp->srtt >> 3;
+
+			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
+		}
+		tcp_probe.lastcwnd = tp->snd_cwnd;
+		spin_unlock(&tcp_probe.lock);
+
+		wake_up(&tcp_probe.wait);
+	}
+
+	jprobe_return();
+	return 0;
+}
+
+static struct jprobe tcp_jprobe = {
+	.kp = {
+		.symbol_name	= "tcp_rcv_established",
+	},
+	.entry	= jtcp_rcv_established,
+};
+
+/* Opening the proc file empties the log and restarts the time base. */
+static int tcpprobe_open(struct inode * inode, struct file * file)
+{
+	spin_lock_bh(&tcp_probe.lock);
+	tcp_probe.tail = 0;
+	tcp_probe.head = 0;
+	tcp_probe.start = ktime_get();
+	spin_unlock_bh(&tcp_probe.lock);
+	return 0;
+}
+
+/* Format the record at the tail of the log into tbuf (at most n bytes). */
+static int tcpprobe_sprint(char *tbuf, int n)
+{
+	const struct tcp_log *rec = &tcp_probe.log[tcp_probe.tail];
+	struct timespec rel =
+		ktime_to_timespec(ktime_sub(rec->tstamp, tcp_probe.start));
+
+	return scnprintf(tbuf, n,
+			"%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
+			(unsigned long) rel.tv_sec,
+			(unsigned long) rel.tv_nsec,
+			&rec->saddr, ntohs(rec->sport),
+			&rec->daddr, ntohs(rec->dport),
+			rec->length, rec->snd_nxt, rec->snd_una,
+			rec->snd_cwnd, rec->ssthresh, rec->snd_wnd, rec->srtt);
+}
+
+/* Read handler: copies formatted records to buf, sleeping until more arrive,
+ * and returns once the next record would not fit or a signal interrupts. */
+static ssize_t tcpprobe_read(struct file *file, char __user *buf,
+			     size_t len, loff_t *ppos)
+{
+	int error = 0;
+	size_t cnt = 0;
+
+	if (!buf)
+		return -EINVAL;
+
+	while (cnt < len) {
+		char tbuf[164];
+		int width;
+
+		/* Sleep until the log holds at least one record */
+		error = wait_event_interruptible(tcp_probe.wait,
+						 tcp_probe_used() > 0);
+		if (error)
+			break;
+
+		spin_lock_bh(&tcp_probe.lock);
+		if (tcp_probe.head == tcp_probe.tail) {
+			/* log is empty again (multiple readers race?): wait again */
+			spin_unlock_bh(&tcp_probe.lock);
+			continue;
+		}
+
+		width = tcpprobe_sprint(tbuf, sizeof(tbuf));
+		/* Consume the record only if it fits in what is left of buf. */
+		if (cnt + width < len)
+			tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
+		spin_unlock_bh(&tcp_probe.lock);
+
+		/* Record does not fit: return the partial buffer (so far). */
+		if (cnt + width >= len)
+			break;
+
+		if (copy_to_user(buf + cnt, tbuf, width))
+			return -EFAULT;
+		cnt += width;
+	}
+
+	return cnt == 0 ? error : cnt;	/* bytes copied, else 0 or the wait error */
+}
+
+static const struct file_operations tcpprobe_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = tcpprobe_open,
+	.read    = tcpprobe_read,
+	.llseek  = noop_llseek,
+};
+
+/* Module init: allocate the log ring, create the proc file, register the jprobe. */
+static __init int tcpprobe_init(void)
+{
+	int ret = -ENOMEM;
+
+	init_waitqueue_head(&tcp_probe.wait);
+	spin_lock_init(&tcp_probe.lock);
+
+	if (bufsize == 0)
+		return -EINVAL;
+
+	bufsize = roundup_pow_of_two(bufsize);	/* ring indices are masked with bufsize - 1 */
+	tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
+	if (!tcp_probe.log)
+		goto err0;
+
+	if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops))
+		goto err0;	/* ret is still -ENOMEM here */
+
+	ret = register_jprobe(&tcp_jprobe);
+	if (ret)
+		goto err1;
+	pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
+	return 0;
+ err1:
+	proc_net_remove(&init_net, procname);
+ err0:
+	kfree(tcp_probe.log);
+	return ret;
+}
+module_init(tcpprobe_init);
+
+static __exit void tcpprobe_exit(void)
+{
+	proc_net_remove(&init_net, procname);	/* undoes proc_net_fops_create() from init */
+	unregister_jprobe(&tcp_jprobe);
+	kfree(tcp_probe.log);	/* freed only after the probe that writes to it is unregistered */
+}
+module_exit(tcpprobe_exit);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 00000000..8ce55b8a
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,62 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See http://www.deneholme.net/tom/scalable/
+ *
+ * John Heffner <jheffner@sc.edu>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the paper:
+ * .01 and 7/8. We use 50 instead of 100 to account for
+ * delayed ack.
+ */
+#define TCP_SCALABLE_AI_CNT	50U
+#define TCP_SCALABLE_MD_SCALE	3
+
+/* Slow start up to ssthresh, then additive increase via tcp_cong_avoid_ai(). */
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+	if (tp->snd_cwnd > tp->snd_ssthresh)
+		tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
+	else
+		tcp_slow_start(tp);
+}
+
+static u32 tcp_scalable_ssthresh(struct sock *sk)
+{
+	const u32 cwnd = tcp_sk(sk)->snd_cwnd;	/* reduced by 1/8 below, floor of 2 */
+	return max(cwnd - (cwnd >> TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
+	.ssthresh	= tcp_scalable_ssthresh,
+	.cong_avoid	= tcp_scalable_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+
+	.owner		= THIS_MODULE,
+	.name		= "scalable",
+};
+
+static int __init tcp_scalable_register(void)
+{
+	return tcp_register_congestion_control(&tcp_scalable);
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_scalable);
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 00000000..ecd44b0c
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,602 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
+int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
+int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
+int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
+int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
+int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_thin_linear_timeouts __read_mostly;
+
+static void tcp_write_timer(unsigned long);
+static void tcp_delack_timer(unsigned long);
+static void tcp_keepalive_timer (unsigned long data);
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
+}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
+
+/* Abort the connection: report sk_err_soft (or ETIMEDOUT), close, count it. */
+static void tcp_write_err(struct sock *sk)
+{
+	sk->sk_err = sk->sk_err_soft ? sk->sk_err_soft : ETIMEDOUT;
+	sk->sk_error_report(sk);
+	tcp_done(sk);
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
+}
+
+/* Do not allow orphaned sockets to eat all our resources.
+ * This is a direct violation of TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on an orphaned socket.
+ *
+ * Criteria are still not confirmed experimentally and may change.
+ * We kill the socket (and return 1; otherwise 0), if:
+ * 1. The number of orphaned sockets exceeds an administratively
+ *    configured limit.
+ * 2. We have strong memory pressure.
+ */
+static int tcp_out_of_resources(struct sock *sk, int do_reset)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int shift = 0;
+
+	/* If peer does not open window for long time, or did not transmit
+	 * anything for long time, penalize it. */
+	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+		shift++;
+
+	/* If some dubious ICMP arrived, penalize even more. */
+	if (sk->sk_err_soft)
+		shift++;
+
+	if (tcp_too_many_orphans(sk, shift)) {
+		if (net_ratelimit())
+			printk(KERN_INFO "Out of socket memory\n");
+
+		/* Catch exceptional cases, when connection requires reset.
+		 *      1. Last segment was sent recently. */
+		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+		    /*  2. Window is closed. */
+		    (!tp->snd_wnd && !tp->packets_out))
+			do_reset = 1;
+		if (do_reset)
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+		tcp_done(sk);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+		return 1;
+	}
+	return 0;
+}
+
+/* Calculate the maximal number of retries on an orphaned socket. */
+static int tcp_orphan_retries(struct sock *sk, int alive)
+{
+	const int limit = sysctl_tcp_orphan_retries; /* May be zero. */
+
+	if (!alive) {
+		/* A soft error (ICMP) tells us something is wrong: no retries. */
+		return sk->sk_err_soft ? 0 : limit;
+	}
+
+	/* The socket sent something recently, so when no limit is
+	 * configured select some safe number of retries. 8 corresponds
+	 * to >100 seconds with minimal RTO of 200msec.
+	 */
+	return limit ? limit : 8;
+}
+
+/* Black hole detection: turn MTU probing on, or lower its search floor. */
+static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss;
+
+	if (!sysctl_tcp_mtu_probing)
+		return;
+	if (icsk->icsk_mtup.enabled) {
+		/* Already probing: halve the MSS of search_low, within bounds. */
+		mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
+		mss = min(sysctl_tcp_base_mss, mss);
+		mss = max(mss, 68 - tp->tcp_header_len);
+		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+	} else {
+		icsk->icsk_mtup.enabled = 1;
+	}
+	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+/* True once the time since retrans_stamp (or, if unset, since the write-queue
+ * head was sent) reaches "timeout". A zero timeout is replaced by the time
+ * "boundary" unsuccessful, exponentially backed-off retransmissions take with
+ * an initial RTO of TCP_RTO_MIN, or TCP_TIMEOUT_INIT if syn_set is set.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+				  unsigned int boundary,
+				  unsigned int timeout,
+				  bool syn_set)
+{
+	unsigned int linear_backoff_thresh, start_ts;
+	unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
+
+	if (!inet_csk(sk)->icsk_retransmits)
+		return false;
+
+	if (unlikely(!tcp_sk(sk)->retrans_stamp))
+		start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
+	else
+		start_ts = tcp_sk(sk)->retrans_stamp;
+
+	if (likely(timeout == 0)) {
+		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+
+		if (boundary <= linear_backoff_thresh)
+			timeout = ((2 << boundary) - 1) * rto_base;
+		else
+			timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+				(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+	}
+	return (tcp_time_stamp - start_ts) >= timeout;
+}
+
+/* A write timeout has occurred: returns 1 if the socket was killed, else 0. */
+static int tcp_write_timeout(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int retry_until;
+	bool do_reset, syn_set = 0;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		if (icsk->icsk_retransmits)
+			dst_negative_advice(sk);
+		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		syn_set = 1;
+	} else {
+		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+			/* Black hole detection */
+			tcp_mtu_probing(icsk, sk);
+
+			dst_negative_advice(sk);
+		}
+
+		retry_until = sysctl_tcp_retries2;
+		if (sock_flag(sk, SOCK_DEAD)) {
+			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
+
+			retry_until = tcp_orphan_retries(sk, alive);
+			do_reset = alive ||
+				!retransmits_timed_out(sk, retry_until, 0, 0);
+
+			if (tcp_out_of_resources(sk, do_reset))
+				return 1;
+		}
+	}
+
+	if (retransmits_timed_out(sk, retry_until,
+				  syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
+		/* Has it gone just too far? */
+		tcp_write_err(sk);
+		return 1;
+	}
+	return 0;
+}
+
+/* Delayed-ACK timer; "data" is the socket. Sends the pending ACK, or re-arms
+ * when the socket is owned by a user or the timeout has not expired yet. */
+static void tcp_delack_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket is owned by a user: mark it blocked and try again later. */
+		icsk->icsk_ack.blocked = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
+		goto out_unlock;
+	}
+
+	sk_mem_reclaim_partial(sk);
+
+	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+		goto out;
+
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+		goto out;
+	}
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
+		struct sk_buff *skb;
+
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
+
+		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+			sk_backlog_rcv(sk, skb);
+
+		tp->ucopy.memory = 0;
+	}
+
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!icsk->icsk_ack.pingpong) {
+			/* Delayed ACK missed: inflate ATO. */
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+		} else {
+			/* Delayed ACK missed: leave pingpong mode and deflate ATO. */
+			icsk->icsk_ack.pingpong = 0;
+			icsk->icsk_ack.ato      = TCP_ATO_MIN;
+		}
+		tcp_send_ack(sk);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+	}
+
+out:
+	if (tcp_memory_pressure)
+		sk_mem_reclaim(sk);
+out_unlock:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/* Zero-window probe timer: abort after too many unanswered probes, else probe again. */
+static void tcp_probe_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int max_probes;
+
+	if (tp->packets_out || !tcp_send_head(sk)) {
+		icsk->icsk_probes_out = 0;
+		return;
+	}
+
+	/* *WARNING* RFC 1122 forbids this
+	 *
+	 * It doesn't AFAIK, because we kill the retransmit timer -AK
+	 *
+	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
+	 * this behaviour in Solaris down as a bug fix. [AC]
+	 *
+	 * Let me explain. icsk_probes_out is zeroed by incoming ACKs
+	 * even if they advertise zero window. Hence, connection is killed only
+	 * if we received no ACKs for normal connection timeout. It is not killed
+	 * only because window stays zero for some time, window may be zero
+	 * until armageddon and even later. We are in full accordance
+	 * with RFCs, only probe timer combines both retransmission timeout
+	 * and probe timeout in one bottle.				--ANK
+	 */
+	max_probes = sysctl_tcp_retries2;
+	if (sock_flag(sk, SOCK_DEAD)) {
+		/* Shift in 64 bits: a wrapped 32-bit value would look "alive". */
+		const int alive = (((u64)icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
+
+		max_probes = tcp_orphan_retries(sk, alive);
+		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
+			return;
+	}
+
+	if (icsk->icsk_probes_out > max_probes) {
+		tcp_write_err(sk);
+	} else {
+		/* Only send another probe if we didn't close things up. */
+		tcp_send_probe0(sk);
+	}
+}
+
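+/* The TCP retransmit timer: resends the head of the write queue and re-arms
+ * itself with a backed-off RTO, unless the connection has timed out.
+ */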
+
+void tcp_retransmit_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!tp->packets_out)
+		goto out;
+
+	WARN_ON(tcp_write_queue_empty(sk));
+
+	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+		/* Receiver dastardly shrinks window. Our retransmits
+		 * become zero probes, but we should not timeout this
+		 * connection. If the socket is an orphan, time it out,
+		 * we cannot allow such beasts to hang infinitely.
+		 */
+#ifdef TCP_DEBUG
+		struct inet_sock *inet = inet_sk(sk);
+		if (sk->sk_family == AF_INET) {
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+			       &inet->inet_daddr, ntohs(inet->inet_dport),
+			       inet->inet_num, tp->snd_una, tp->snd_nxt);
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		else if (sk->sk_family == AF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+			       &np->daddr, ntohs(inet->inet_dport),
+			       inet->inet_num, tp->snd_una, tp->snd_nxt);
+		}
+#endif
+#endif
+		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+			tcp_write_err(sk);
+			goto out;
+		}
+		tcp_enter_loss(sk, 0);
+		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+		__sk_dst_reset(sk);
+		goto out_reset_timer;
+	}
+
+	if (tcp_write_timeout(sk))
+		goto out;
+
+	if (icsk->icsk_retransmits == 0) {
+		int mib_idx;
+
+		if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+			if (tcp_is_sack(tp))
+				mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
+			else
+				mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
+		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
+			mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+		} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+			   tp->sacked_out) {
+			if (tcp_is_sack(tp))
+				mib_idx = LINUX_MIB_TCPSACKFAILURES;
+			else
+				mib_idx = LINUX_MIB_TCPRENOFAILURES;
+		} else {
+			mib_idx = LINUX_MIB_TCPTIMEOUTS;
+		}
+		NET_INC_STATS_BH(sock_net(sk), mib_idx);
+	}
+
+	if (tcp_use_frto(sk)) {
+		tcp_enter_frto(sk);
+	} else {
+		tcp_enter_loss(sk, 0);
+	}
+
+	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+		/* Retransmission failed because of local congestion,
+		 * do not backoff.
+		 */
+		if (!icsk->icsk_retransmits)
+			icsk->icsk_retransmits = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
+		goto out;
+	}
+
+	/* Increase the timeout each time we retransmit.  Note that
+	 * we do not increase the rtt estimate.  rto is initialized
+	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
+	 * that doubling rto each time is the least we can get away with.
+	 * In KA9Q, Karn uses this for the first few times, and then
+	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
+	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
+	 * defined in the protocol as the maximum possible RTT.  I guess
+	 * we'll have to use something other than TCP to talk to the
+	 * University of Mars.
+	 *
+	 * PAWS allows us longer timeouts and large windows, so once
+	 * implemented ftp to mars will work nicely. We will have to fix
+	 * the 120 second clamps though!
+	 */
+	icsk->icsk_backoff++;
+	icsk->icsk_retransmits++;
+
+out_reset_timer:
+	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+	 * might be increased if the stream oscillates between thin and thick,
+	 * thus the old value might already be too high compared to the value
+	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+	 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+	 * exponential backoff behaviour to avoid continue hammering
+	 * linear-timeout retransmissions into a black hole
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED &&
+	    (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+	    tcp_stream_is_thin(tp) &&
+	    icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+		icsk->icsk_backoff = 0;
+		icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+	} else {
+		/* Use normal (exponential) backoff */
+		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	}
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+		__sk_dst_reset(sk);
+
+out:;
+}
+
+/* Timer handler shared by the retransmit and zero-window-probe timers;
+ * "data" is the socket. Dispatches on icsk_pending once the timeout is due.
+ */
+static void tcp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int pending;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Socket is owned by a user: retry in HZ/20 jiffies. */
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
+		goto out_unlock;
+	}
+
+	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
+		goto out;
+
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		/* Not due yet: re-arm for the recorded expiry time. */
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+		goto out;
+	}
+
+	pending = icsk->icsk_pending;
+	icsk->icsk_pending = 0;
+
+	if (pending == ICSK_TIME_RETRANS)
+		tcp_retransmit_timer(sk);
+	else if (pending == ICSK_TIME_PROBE0)
+		tcp_probe_timer(sk);
+
+out:
+	sk_mem_reclaim(sk);
+out_unlock:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	Timer for listening sockets
+ */
+
+static void tcp_synack_timer(struct sock *sk)
+{
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+}
+
+void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+{
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
+}
+EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
+/* Start (val != 0) or stop (val == 0) the keepalive timer; no-op if closed or listening. */
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+		return;
+	if (!val)
+		inet_csk_delete_keepalive_timer(sk);
+	else if (!sock_flag(sk, SOCK_KEEPOPEN))
+		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+}
+
+
+/* Keepalive timer handler ("data" is the socket). It also drives the SYN-ACK
+ * timer of listeners and the FIN_WAIT2 timeout of orphaned sockets. */
+static void tcp_keepalive_timer (unsigned long data)
+{
+	struct sock *sk = (struct sock *) data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 elapsed;
+
+	/* Only process if socket is not in use. */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		inet_csk_reset_keepalive_timer (sk, HZ/20);
+		goto out;
+	}
+
+	if (sk->sk_state == TCP_LISTEN) {
+		tcp_synack_timer(sk);
+		goto out;
+	}
+
+	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+		if (tp->linger2 >= 0) {
+			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
+
+			if (tmo > 0) {
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+		tcp_send_active_reset(sk, GFP_ATOMIC);
+		goto death;
+	}
+
+	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	elapsed = keepalive_time_when(tp);
+
+	/* Data in flight or queued: it is alive without keepalive 8) */
+	if (tp->packets_out || tcp_send_head(sk))
+		goto resched;
+
+	elapsed = keepalive_time_elapsed(tp);
+
+	if (elapsed >= keepalive_time_when(tp)) {
+		/* If the TCP_USER_TIMEOUT option is enabled, use that
+		 * to determine when to timeout instead.
+		 */
+		if ((icsk->icsk_user_timeout != 0 &&
+		    elapsed >= icsk->icsk_user_timeout &&
+		    icsk->icsk_probes_out > 0) ||
+		    (icsk->icsk_user_timeout == 0 &&
+		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_write_err(sk);
+			goto out;
+		}
+		if (tcp_write_wakeup(sk) <= 0) {
+			icsk->icsk_probes_out++;
+			elapsed = keepalive_intvl_when(tp);
+		} else {
+			/* Keepalive was lost due to local congestion: try harder. */
+			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+		}
+	} else {
+		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+		elapsed = keepalive_time_when(tp) - elapsed;
+	}
+
+	sk_mem_reclaim(sk);
+
+resched:
+	inet_csk_reset_keepalive_timer (sk, elapsed);
+	goto out;
+
+death:
+	tcp_done(sk);
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 00000000..80fa2bfd
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,339 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+static int alpha = 2;
+static int beta  = 4;
+static int gamma = 1;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static void vegas_enable(struct sock *sk)
+{
+	struct vegas *ca = inet_csk_ca(sk);
+
+	/* Forget the RTT samples gathered so far. */
+	ca->cntRTT = 0;
+	ca->minRTT = 0x7fffffff;
+
+	/* Set the beginning of the next send window. */
+	ca->beg_snd_nxt = tcp_sk(sk)->snd_nxt;
+
+	/* Begin taking Vegas samples next time we send something. */
+	ca->doing_vegas_now = 1;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct sock *sk)
+{
+	struct vegas *vegas = inet_csk_ca(sk);
+
+	vegas->doing_vegas_now = 0;
+}
+
+void tcp_vegas_init(struct sock *sk)
+{
+	struct vegas *vegas = inet_csk_ca(sk);
+
+	vegas->baseRTT = 0x7fffffff;
+	vegas_enable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_init);
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+	struct vegas *ca = inet_csk_ca(sk);
+	u32 sample;
+
+	/* A negative rtt_us is ignored. */
+	if (rtt_us < 0)
+		return;
+
+	/* Never allow zero rtt or baseRTT */
+	sample = rtt_us + 1;
+
+	/* Long-running minimum: filter to find propagation delay. */
+	if (ca->baseRTT > sample)
+		ca->baseRTT = sample;
+
+	/* Minimum during the last RTT: current prop. delay + queuing delay. */
+	if (ca->minRTT > sample)
+		ca->minRTT = sample;
+	ca->cntRTT++;
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_pkts_acked);
+
+void tcp_vegas_state(struct sock *sk, u8 ca_state)
+{
+	/* Vegas sampling is enabled only in the TCP_CA_Open state. */
+	if (ca_state != TCP_CA_Open)
+		vegas_disable(sk);
+	else
+		vegas_enable(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_state);
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	if (event != CA_EVENT_CWND_RESTART && event != CA_EVENT_TX_START)
+		return;
+	tcp_vegas_init(sk);
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
+
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);
+}
+
+/* Vegas cwnd update: falls back to Reno while Vegas is disabled or there
+ * are too few RTT samples; otherwise adjusts cwnd once per RTT from "diff".
+ */
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
+
+	if (!vegas->doing_vegas_now) {
+		tcp_reno_cong_avoid(sk, ack, in_flight);
+		return;
+	}
+
+	if (after(ack, vegas->beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		vegas->beg_snd_nxt  = tp->snd_nxt;
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+		if (vegas->cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			tcp_reno_cong_avoid(sk, ack, in_flight);
+		} else {
+			u32 rtt, diff;
+			u64 target_cwnd;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = vegas->minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.  This is:
+			 *     (actual rate in segments) * baseRTT
+			 * The product is formed in 64 bits so that a large
+			 * cwnd times a usec baseRTT cannot wrap in u32.
+			 */
+			target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+			do_div(target_cwnd, rtt);
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 */
+			diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
+
+			if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
+				/* Going too fast. Time to slow down
+				 * and switch to congestion avoidance.
+				 */
+				/* Set cwnd to match the actual rate
+				 * exactly:
+				 *   cwnd = (actual rate) * baseRTT
+				 * Then we add 1 because the integer
+				 * truncation robs us of full link
+				 * utilization.
+				 */
+				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
+			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
+				/* Slow start.  */
+				tcp_slow_start(tp);
+			} else {
+				/* Congestion avoidance. */
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					tp->snd_cwnd--;
+					tp->snd_ssthresh
+						= tcp_vegas_ssthresh(tp);
+				} else if (diff < alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					tp->snd_cwnd++;
+				} else {
+					/* Sending just as fast as we
+					 * should be.
+					 */
+				}
+			}
+
+			if (tp->snd_cwnd < 2)
+				tp->snd_cwnd = 2;
+			else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+				tp->snd_cwnd = tp->snd_cwnd_clamp;
+
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		vegas->cntRTT = 0;
+		vegas->minRTT = 0x7fffffff;
+	}
+	/* Use normal slow start */
+	else if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+}
+
+/* Emit the Vegas state as an INET_DIAG_VEGASINFO attribute if @ext asks. */
+void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+	const struct vegas *vegas = inet_csk_ca(sk);
+
+	if ((ext >> (INET_DIAG_VEGASINFO - 1)) & 1) {
+		struct tcpvegas_info info = {
+			.tcpv_enabled = vegas->doing_vegas_now,
+			.tcpv_rttcnt = vegas->cntRTT,
+			.tcpv_rtt = vegas->baseRTT,
+			.tcpv_minrtt = vegas->minRTT,
+		};
+		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
+
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
+	.flags		= TCP_CONG_RTT_STAMP,
+	.init		= tcp_vegas_init,
+	.ssthresh	= tcp_reno_ssthresh,	/* borrowed from Reno */
+	.cong_avoid	= tcp_vegas_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,	/* borrowed from Reno */
+	.pkts_acked	= tcp_vegas_pkts_acked,	/* RTT sampling */
+	.set_state	= tcp_vegas_state,	/* Vegas on only in TCP_CA_Open */
+	.cwnd_event	= tcp_vegas_cwnd_event,	/* reset on restart/tx start */
+	.get_info	= tcp_vegas_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
+	/* Propagate failure so a failed registration aborts module load. */
+	return tcp_register_congestion_control(&tcp_vegas);
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
new file mode 100644
index 00000000..6c0eea2f
--- /dev/null
+++ b/net/ipv4/tcp_vegas.h
@@ -0,0 +1,24 @@
+/*
+ * TCP Vegas congestion control interface
+ */
+#ifndef __TCP_VEGAS_H
+#define __TCP_VEGAS_H 1
+
+/* Vegas per-connection state; tcp_vegas.c checks it fits ICSK_CA_PRIV_SIZE */
+struct vegas {
+	u32	beg_snd_nxt;	/* right edge during last RTT */
+	u32	beg_snd_una;	/* left edge  during last RTT */
+	u32	beg_snd_cwnd;	/* saves the size of the cwnd */
+	u8	doing_vegas_now;/* if true, do vegas for this RTT */
+	u16	cntRTT;		/* # of RTT samples taken within last RTT */
+	u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
+	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+extern void tcp_vegas_init(struct sock *sk);
+extern void tcp_vegas_state(struct sock *sk, u8 ca_state);
+extern void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+extern void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
+extern void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+
+#endif	/* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
new file mode 100644
index 00000000..ac43cd74
--- /dev/null
+++ b/net/ipv4/tcp_veno.c
@@ -0,0 +1,234 @@
+/*
+ * TCP Veno congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    C. P. Fu, S. C. Liew.
+ *    "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
+ *    IEEE Journal on Selected Areas in Communication,
+ *    Feb. 2003.
+ * 	See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Veno variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static const int beta = 3 << V_PARAM_SHIFT;
+
+/* Veno per-connection state; must fit in ICSK_CA_PRIV_SIZE bytes */
+struct veno {
+	u8 doing_veno_now;	/* if true, do veno for this rtt */
+	u16 cntrtt;		/* # of rtt samples taken so far */
+	u32 minrtt;		/* min of rtts measured within last rtt (in usec) */
+	u32 basertt;		/* the min of all Veno rtt measurements seen (in usec) */
+	u32 inc;		/* toggled so cwnd grows every other rtt when congestive */
+	u32 diff;		/* cwnd minus target cwnd, scaled by V_PARAM_SHIFT */
+};
+
+/* There are several situations when we must "re-start" Veno:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ */
+static inline void veno_enable(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+
+	/* Restart the per-rtt minimum from its largest value,
+	 * then turn on Veno. */
+	ca->minrtt = 0x7fffffff;
+	ca->doing_veno_now = 1;
+}
+
+static inline void veno_disable(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+
+	/* Veno off: tcp_veno_cong_avoid() defers to Reno while this is 0. */
+	ca->doing_veno_now = 0;
+}
+
+static void tcp_veno_init(struct sock *sk)
+{
+	struct veno *ca = inet_csk_ca(sk);
+
+	ca->inc = 1;
+	ca->basertt = 0x7fffffff;	/* replaced by the first rtt sample */
+	veno_enable(sk);
+}
+
+/* Record one rtt sample (usec) into the Veno min filters. */
+static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+{
+	struct veno *ca = inet_csk_ca(sk);
+	u32 sample;
+
+	if (rtt_us < 0)
+		return;
+
+	/* Bias by one so that neither minrtt nor basertt can become zero. */
+	sample = rtt_us + 1;
+
+	/* Minimum over all samples: the propagation delay estimate. */
+	ca->basertt = min(ca->basertt, sample);
+
+	/* Minimum since minrtt was last reset (propagation plus queuing
+	 * delay for the current rtt); every sample is counted.
+	 */
+	if (sample < ca->minrtt)
+		ca->minrtt = sample;
+	ca->cntrtt++;
+}
+
+static void tcp_veno_state(struct sock *sk, u8 ca_state)
+{
+	if (ca_state != TCP_CA_Open)
+		veno_disable(sk);
+	else
+		veno_enable(sk);
+}
+
+/* On CA_EVENT_CWND_RESTART or CA_EVENT_TX_START, reset the Veno state via
+ * tcp_veno_init() so that calculations wait for fresh rtt samples.
+ */
+static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	switch (event) {
+	case CA_EVENT_CWND_RESTART:
+	case CA_EVENT_TX_START:
+		tcp_veno_init(sk);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Veno cwnd growth; in the congestive state cwnd grows every other rtt. */
+static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct veno *veno = inet_csk_ca(sk);
+
+	if (!veno->doing_veno_now) {
+		tcp_reno_cong_avoid(sk, ack, in_flight);
+		return;
+	}
+
+	/* limited by applications */
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	/* We do the Veno calculations only if we got enough rtt samples */
+	if (veno->cntrtt <= 2) {
+		/* We don't have enough rtt samples to do the Veno
+		 * calculation, so we'll behave like Reno.
+		 */
+		tcp_reno_cong_avoid(sk, ack, in_flight);
+	} else {
+		u64 target_cwnd;
+		u32 rtt;
+
+		/* We have enough rtt samples, so, using the Veno
+		 * algorithm, we determine the state of the network.
+		 */
+		rtt = veno->minrtt;
+
+		/* 64-bit product: cwnd * basertt (usec) can exceed u32. */
+		target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
+		target_cwnd <<= V_PARAM_SHIFT;
+		do_div(target_cwnd, rtt);
+
+		veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+
+		if (tp->snd_cwnd <= tp->snd_ssthresh) {
+			/* Slow start.  */
+			tcp_slow_start(tp);
+		} else {
+			/* Congestion avoidance. */
+			if (veno->diff < beta) {
+				/* In the "non-congestive state", increase cwnd
+				 *  every rtt.
+				 */
+				tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+			} else {
+				/* In the "congestive state", increase cwnd
+				 * every other rtt.
+				 */
+				if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+					if (veno->inc &&
+					    tp->snd_cwnd < tp->snd_cwnd_clamp) {
+						tp->snd_cwnd++;
+						veno->inc = 0;
+					} else
+						veno->inc = 1;
+					tp->snd_cwnd_cnt = 0;
+				} else
+					tp->snd_cwnd_cnt++;
+			}
+		}
+		if (tp->snd_cwnd < 2)
+			tp->snd_cwnd = 2;
+		else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+			tp->snd_cwnd = tp->snd_cwnd_clamp;
+	}
+	/* Wipe the slate clean for the next rtt. */
+	/* veno->cntrtt = 0; */
+	veno->minrtt = 0x7fffffff;
+}
+
+/* Veno MD phase: compute the reduced ssthresh, never below 2. */
+static u32 tcp_veno_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct veno *ca = inet_csk_ca(sk);
+	u32 reduced;
+
+	if (ca->diff >= beta)
+		reduced = tp->snd_cwnd >> 1U;	/* congestive: cut cwnd by 1/2 */
+	else
+		reduced = tp->snd_cwnd * 4 / 5;	/* non-congestive: cut by 1/5 */
+	return max(reduced, 2U);
+}
+
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
+	.flags		= TCP_CONG_RTT_STAMP,
+	.init		= tcp_veno_init,
+	.ssthresh	= tcp_veno_ssthresh,	/* Veno MD phase */
+	.cong_avoid	= tcp_veno_cong_avoid,
+	.pkts_acked	= tcp_veno_pkts_acked,	/* rtt sampling */
+	.set_state	= tcp_veno_state,	/* Veno on only in TCP_CA_Open */
+	.cwnd_event	= tcp_veno_cwnd_event,	/* reset on restart/tx start */
+
+	.owner		= THIS_MODULE,
+	.name		= "veno",
+};
+
+static int __init tcp_veno_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE);
+	/* Propagate failure so a failed registration aborts module load. */
+	return tcp_register_congestion_control(&tcp_veno);
+}
+
+static void __exit tcp_veno_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_veno);
+}
+
+module_init(tcp_veno_register);
+module_exit(tcp_veno_unregister);
+
+MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Veno");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 00000000..1b91bf48
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,304 @@
+/*
+ * TCP Westwood+: end-to-end bandwidth estimation for TCP
+ *
+ *      Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4
+ *
+ * Support at http://c3lab.poliba.it/index.php/Westwood
+ * Main references in literature:
+ *
+ * - Mascolo S, Casetti, M. Gerla et al.
+ *   "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001
+ *
+ * - A. Grieco, s. Mascolo
+ *   "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer
+ *     Comm. Review, 2004
+ *
+ * - A. Dell'Aera, L. Grieco, S. Mascolo.
+ *   "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving :
+ *    A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004
+ *
+ * Westwood+ employs end-to-end bandwidth measurement to set cwnd and
+ * ssthresh after packet loss. The probing phase is as the original Reno.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood per-connection state; must fit in ICSK_CA_PRIV_SIZE bytes */
+struct westwood {
+	u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
+	u32    bw_est;           /* bandwidth estimate (bw_ns_est filtered again) */
+	u32    rtt_win_sx;       /* tcp_time_stamp where the current evaluation began */
+	u32    bk;               /* bytes acked within the current evaluation window */
+	u32    snd_una;          /* used for evaluating the number of acked bytes */
+	u32    cumul_ack;        /* bytes credited for the most recent ack */
+	u32    accounted;        /* bytes already credited for dupacks */
+	u32    rtt;              /* latest RTT sample, in jiffies */
+	u32    rtt_min;          /* minimum observed RTT, in jiffies */
+	u8     first_ack;        /* set until the first ack has been processed */
+	u8     reset_rtt_min;    /* Reset RTT min to next RTT sample */
+};
+
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN   (HZ/20)	/* 50ms */
+#define TCP_WESTWOOD_INIT_RTT  (20*HZ)	/* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_init
+ * This function initializes fields used in TCP Westwood+.
+ * It is called after the initial SYN, so the sequence numbers
+ * are correct, but for new passive connections we have no
+ * information about RTTmin at this time, so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value is deliberately conservative
+ * so that it is replaced by a real sample as soon as possible.
+ */
+static void tcp_westwood_init(struct sock *sk)
+{
+	struct westwood *ww = inet_csk_ca(sk);
+
+	/* Empty bandwidth filter and byte counters. */
+	ww->bw_est = ww->bw_ns_est = 0;
+	ww->bk = ww->cumul_ack = ww->accounted = 0;
+
+	/* RTT tracking starts from the conservative default. */
+	ww->rtt = ww->rtt_min = TCP_WESTWOOD_INIT_RTT;
+	ww->reset_rtt_min = 1;
+
+	/* Open the first evaluation window at the current snd_una. */
+	ww->rtt_win_sx = tcp_time_stamp;
+	ww->snd_una = tcp_sk(sk)->snd_una;
+	ww->first_ack = 1;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter with constant coefficients: 7/8 of @a plus 1/8 of @b.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+	return (a * 7 + b) / 8;
+}
+
+static void westwood_filter(struct westwood *w, u32 delta)
+{
+	const u32 sample = w->bk / delta;
+
+	if (w->bw_ns_est || w->bw_est) {
+		w->bw_ns_est = westwood_do_filter(w->bw_ns_est, sample);
+		w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+	} else {	/* empty filter: seed it with the first sample */
+		w->bw_est = w->bw_ns_est = sample;
+	}
+}
+
+/*
+ * @tcp_westwood_pkts_acked: called after processing a group of packets;
+ * Westwood only keeps the latest positive RTT sample, converted to jiffies.
+ */
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+{
+	struct westwood *w = inet_csk_ca(sk);
+
+	if (rtt <= 0)
+		return;
+	w->rtt = usecs_to_jiffies(rtt);
+}
+
+/*
+ * @westwood_update_window
+ * Closes the current RTT evaluation window when it has lasted long
+ * enough, feeding the bytes counted in it to the bandwidth filter.
+ */
+static void westwood_update_window(struct sock *sk)
+{
+	struct westwood *w = inet_csk_ca(sk);
+	s32 delta = tcp_time_stamp - w->rtt_win_sx;
+
+	/* On the first ack, resynchronize w->snd_una with tp->snd_una so
+	 * that the first bandwidth sample is not computed from a stale
+	 * left edge.
+	 */
+	if (w->first_ack) {
+		w->first_ack = 0;
+		w->snd_una = tcp_sk(sk)->snd_una;
+	}
+
+	/*
+	 * See if a RTT-window has passed.
+	 * If the RTT is below TCP_WESTWOOD_RTT_MIN (50ms) we do not
+	 * filter yet but keep 'building the sample', because an
+	 * estimation over very small time intervals is better avoided.
+	 * On a LAN we will therefore reasonably always have
+	 * right_bound = left_bound + WESTWOOD_RTT_MIN.
+	 * Nothing is done either while no RTT is known (w->rtt == 0).
+	 */
+	if (!w->rtt || delta <= max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN))
+		return;
+
+	westwood_filter(w, delta);
+	w->bk = 0;
+	w->rtt_win_sx = tcp_time_stamp;
+}
+
+static inline void update_rtt_min(struct westwood *w)
+{
+	/* After a reset request the next sample replaces the minimum outright. */
+	if (!w->reset_rtt_min && w->rtt_min < w->rtt)
+		return;
+	w->rtt_min = w->rtt;
+	w->reset_rtt_min = 0;
+}
+
+
+/*
+ * @westwood_fast_bw: fast path update, used when header prediction
+ * succeeded; the acked bytes are simply the advance of snd_una.
+ */
+static inline void westwood_fast_bw(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
+	u32 newly_acked;
+
+	westwood_update_window(sk);
+
+	newly_acked = tp->snd_una - w->snd_una;
+	w->snd_una = tp->snd_una;
+	w->bk += newly_acked;
+	update_rtt_min(w);
+}
+
+/*
+ * @westwood_acked_count
+ * Returns the number of bytes to credit to bk for this ack, compensating
+ * for bytes already credited on earlier dupacks (w->accounted).
+ */
+static inline u32 westwood_acked_count(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
+
+	w->cumul_ack = tp->snd_una - w->snd_una;
+
+	/* If cumul_ack is 0 this is a dupack since it's not moving
+	 * tp->snd_una: credit one mss now and remember it in accounted.
+	 */
+	if (!w->cumul_ack) {
+		w->accounted += tp->mss_cache;
+		w->cumul_ack = tp->mss_cache;
+	}
+
+	if (w->cumul_ack > tp->mss_cache) {
+		/* Partial or delayed ack: deduct what dupacks already credited */
+		if (w->accounted >= w->cumul_ack) {
+			w->accounted -= w->cumul_ack;
+			w->cumul_ack = tp->mss_cache;
+		} else {
+			w->cumul_ack -= w->accounted;
+			w->accounted = 0;
+		}
+	}
+
+	w->snd_una = tp->snd_una;
+
+	return w->cumul_ack;
+}
+
+
+/*
+ * TCP Westwood
+ * The limit is Bw estimation * RTTmin, converted to packets with
+ * mss_cache.  It is clamped to at least 2, so 0 is never returned.
+ */
+static u32 tcp_westwood_bw_rttmin(const struct sock *sk)
+{
+	const struct westwood *w = inet_csk_ca(sk);
+	u32 pkts = (w->bw_est * w->rtt_min) / tcp_sk(sk)->mss_cache;
+
+	return pkts < 2 ? 2 : pkts;
+}
+
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
+
+	switch (event) {
+	case CA_EVENT_FAST_ACK:
+		westwood_fast_bw(sk);
+		break;
+
+	case CA_EVENT_SLOW_ACK:
+		westwood_update_window(sk);
+		w->bk += westwood_acked_count(sk);
+		update_rtt_min(w);
+		break;
+
+	case CA_EVENT_COMPLETE_CWR:
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+		tp->snd_cwnd = tp->snd_ssthresh;
+		break;
+
+	case CA_EVENT_FRTO:
+		tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+		/* Update RTT_min when next ack arrives */
+		w->reset_rtt_min = 1;
+		break;
+
+	default:	/* don't care */
+		break;
+	}
+}
+
+
+/* Report rtt and rtt_min (in usec) as an INET_DIAG_VEGASINFO attribute. */
+static void tcp_westwood_info(struct sock *sk, u32 ext,
+			      struct sk_buff *skb)
+{
+	const struct westwood *w = inet_csk_ca(sk);
+
+	if ((ext >> (INET_DIAG_VEGASINFO - 1)) & 1) {
+		struct tcpvegas_info info = {
+			.tcpv_enabled = 1,
+			.tcpv_rtt = jiffies_to_usecs(w->rtt),
+			.tcpv_minrtt = jiffies_to_usecs(w->rtt_min),
+		};
+		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+	}
+}
+
+
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
+	.init		= tcp_westwood_init,
+	.ssthresh	= tcp_reno_ssthresh,	/* probing phase is plain Reno */
+	.cong_avoid	= tcp_reno_cong_avoid,	/* probing phase is plain Reno */
+	.min_cwnd	= tcp_westwood_bw_rttmin,	/* bw_est * rtt_min, in packets */
+	.cwnd_event	= tcp_westwood_event,
+	.get_info	= tcp_westwood_info,
+	.pkts_acked	= tcp_westwood_pkts_acked,
+
+	.owner		= THIS_MODULE,
+	.name		= "westwood"
+};
+
+static int __init tcp_westwood_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_westwood);
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
new file mode 100644
index 00000000..05c3b6f0
--- /dev/null
+++ b/net/ipv4/tcp_yeah.c
@@ -0,0 +1,260 @@
+/*
+ *
+ *   YeAH TCP
+ *
+ * For further details look at:
+ *    http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ *
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+#include "tcp_vegas.h"
+
+#define TCP_YEAH_ALPHA       80 //lin number of packets queued at the bottleneck
+#define TCP_YEAH_GAMMA        1 //lin fraction of queue to be removed per rtt (queue / GAMMA)
+#define TCP_YEAH_DELTA        3 //log minimum fraction of cwnd to be removed on loss (cwnd >> DELTA)
+#define TCP_YEAH_EPSILON      1 //log maximum fraction to be removed on early decongestion (cwnd >> EPSILON)
+#define TCP_YEAH_PHY          8 //lin maximum delta from base (baseRTT / PHY)
+#define TCP_YEAH_RHO         16 //lin minimum number of consecutive rtts to consider competition on loss
+#define TCP_YEAH_ZETA        50 //lin minimum number of state switches to reset reno_count
+
+#define TCP_SCALABLE_AI_CNT	 100U
+
+/* YeAH per-connection state; must fit in ICSK_CA_PRIV_SIZE bytes */
+struct yeah {
+	struct vegas vegas;	/* must be first */
+
+	/* YeAH */
+	u32 lastQ;		/* queue estimate from the last evaluation */
+	u32 doing_reno_now;	/* consecutive evaluations spent in Reno mode */
+
+	u32 reno_count;		/* floor for cwnd on early decongestion */
+	u32 fast_count;		/* evaluations in fast mode since last reset */
+
+	u32 pkts_acked;		/* ack count used by the Scalable increase */
+};
+
+static void tcp_yeah_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct yeah *yeah = inet_csk_ca(sk);
+
+	tcp_vegas_init(sk);
+	/* fast_count/pkts_acked are read in cong_avoid before being written */
+	yeah->doing_reno_now = 0;
+	yeah->lastQ = 0;
+	yeah->reno_count = 2;
+	yeah->fast_count = 0;
+	yeah->pkts_acked = 1;
+
+	/* Ensure the MD arithmetic works.  This is somewhat pedantic,
+	 * since I don't think we will see a cwnd this large. :) */
+	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+
+static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+{
+	struct yeah *ca = inet_csk_ca(sk);
+
+	/* Remember the ack count only in Open state; Vegas always samples. */
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open)
+		ca->pkts_acked = pkts_acked;
+
+	tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+}
+
+static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct yeah *yeah = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+
+	else if (!yeah->doing_reno_now) {
+		/* Scalable: one cwnd step per min(cwnd, TCP_SCALABLE_AI_CNT) acked */
+
+		tp->snd_cwnd_cnt += yeah->pkts_acked;
+		if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		}
+
+		yeah->pkts_acked = 1;
+
+	} else {
+		/* Reno */
+		tcp_cong_avoid_ai(tp, tp->snd_cwnd);
+	}
+
+	/* The key players are vegas.beg_snd_una and vegas.beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for vegas.beg_snd_nxt,
+	 * we will calculate that (vegas.beg_snd_nxt - vegas.beg_snd_una)
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (vegas.beg_snd_nxt - vegas.beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, vegas.beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in vegas.beg_snd_cwnd.
+	 */
+
+	if (after(ack, yeah->vegas.beg_snd_nxt)) {
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (yeah->vegas.cntRTT > 2) {
+			u32 rtt, queue;
+			u64 bw;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * estimate of the queue, we determine if we should
+			 * run in fast (Scalable) or slow (Reno) mode.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = yeah->vegas.minRTT;
+
+			/* Compute excess number of packets above bandwidth
+			 * Avoid doing full 64 bit divide.
+			 */
+			bw = tp->snd_cwnd;
+			bw *= rtt - yeah->vegas.baseRTT;
+			do_div(bw, rtt);
+			queue = bw;
+
+			if (queue > TCP_YEAH_ALPHA ||
+			    rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
+				if (queue > TCP_YEAH_ALPHA &&
+				    tp->snd_cwnd > yeah->reno_count) {
+					u32 reduction = min(queue / TCP_YEAH_GAMMA ,
+							    tp->snd_cwnd >> TCP_YEAH_EPSILON);
+
+					tp->snd_cwnd -= reduction;
+
+					tp->snd_cwnd = max(tp->snd_cwnd,
+							   yeah->reno_count);
+
+					tp->snd_ssthresh = tp->snd_cwnd;
+				}
+
+				if (yeah->reno_count <= 2)
+					yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
+				else
+					yeah->reno_count++;
+
+				yeah->doing_reno_now = min(yeah->doing_reno_now + 1,
+							   0xffffffU);
+			} else {
+				yeah->fast_count++;
+
+				if (yeah->fast_count > TCP_YEAH_ZETA) {
+					yeah->reno_count = 2;
+					yeah->fast_count = 0;
+				}
+
+				yeah->doing_reno_now = 0;
+			}
+
+			yeah->lastQ = queue;
+
+		}
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		yeah->vegas.beg_snd_una  = yeah->vegas.beg_snd_nxt;
+		yeah->vegas.beg_snd_nxt  = tp->snd_nxt;
+		yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Wipe the slate clean for the next RTT. */
+		yeah->vegas.cntRTT = 0;
+		yeah->vegas.minRTT = 0x7fffffff;
+	}
+}
+
+static u32 tcp_yeah_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct yeah *yeah = inet_csk_ca(sk);
+	u32 reduction;
+
+	if (yeah->doing_reno_now < TCP_YEAH_RHO) {
+		reduction = yeah->lastQ;
+		reduction = min(reduction, max(tp->snd_cwnd >> 1, 2U));
+		reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+	} else
+		reduction = max(tp->snd_cwnd>>1, 2U);
+
+	yeah->fast_count = 0;
+	yeah->reno_count = max(yeah->reno_count>>1, 2U);
+
+	/* Signed compare: reduction can be >= cwnd; never return below 2. */
+	return max_t(int, tp->snd_cwnd - reduction, 2);
+}
+
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
+	.flags		= TCP_CONG_RTT_STAMP,
+	.init		= tcp_yeah_init,
+	.ssthresh	= tcp_yeah_ssthresh,
+	.cong_avoid	= tcp_yeah_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,	/* borrowed from Reno */
+	.set_state	= tcp_vegas_state,	/* shared with Vegas */
+	.cwnd_event	= tcp_vegas_cwnd_event,	/* shared with Vegas */
+	.get_info	= tcp_vegas_get_info,	/* shared with Vegas */
+	.pkts_acked	= tcp_yeah_pkts_acked,	/* wraps tcp_vegas_pkts_acked() */
+
+	.owner		= THIS_MODULE,
+	.name		= "yeah",
+};
+
+static int __init tcp_yeah_register(void)
+{
+	/* Size check at build time; a failed registration aborts the load. */
+	BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_yeah);
+}
+
+static void __exit tcp_yeah_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_yeah);
+}
+
+module_init(tcp_yeah_register);
+module_exit(tcp_yeah_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("YeAH TCP");
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
new file mode 100644
index 00000000..ac3b3ee4
--- /dev/null
+++ b/net/ipv4/tunnel4.c
@@ -0,0 +1,192 @@
+/* tunnel4.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
+static DEFINE_MUTEX(tunnel4_mutex);
+
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
+{
+	return (family != AF_INET) ? &tunnel64_handlers : &tunnel4_handlers;
+}
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
+{
+	struct xfrm_tunnel __rcu **link;
+	struct xfrm_tunnel *cur;
+	int prio = handler->priority;
+	int err = 0;
+
+	mutex_lock(&tunnel4_mutex);
+	/* The chain is kept sorted by ascending priority; find the slot. */
+	link = fam_handlers(family);
+	while ((cur = rcu_dereference_protected(*link,
+			lockdep_is_held(&tunnel4_mutex))) != NULL) {
+		if (cur->priority == prio) {
+			/* Each priority may be claimed by one handler only. */
+			err = -EEXIST;
+			goto unlock;
+		}
+		if (cur->priority > prio)
+			break;
+		link = &cur->next;
+	}
+
+	/* Link the new handler in front of the first higher priority. */
+	handler->next = *link;
+	rcu_assign_pointer(*link, handler);
+
+unlock:
+	mutex_unlock(&tunnel4_mutex);
+	return err;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_register);
+
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
+{
+	struct xfrm_tunnel __rcu **link;
+	struct xfrm_tunnel *cur;
+	int err = -ENOENT;
+
+	mutex_lock(&tunnel4_mutex);
+	link = fam_handlers(family);
+	while ((cur = rcu_dereference_protected(*link,
+			lockdep_is_held(&tunnel4_mutex))) != NULL) {
+		if (cur != handler) {
+			link = &cur->next;
+			continue;
+		}
+		/* Found it: unlink from the chain. */
+		*link = handler->next;
+		err = 0;
+		break;
+	}
+
+	mutex_unlock(&tunnel4_mutex);
+	/* Wait out readers that may still be walking the chain. */
+	synchronize_net();
+	return err;
+}
+EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+
+#define for_each_tunnel_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))
+
+static int tunnel4_rcv(struct sk_buff *skb)
+{
+	struct xfrm_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct iphdr))) {
+		/* Offer the packet to each handler; 0 means it was taken. */
+		for_each_tunnel_rcu(tunnel4_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+		/* Nobody took it: report port unreachable. */
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static int tunnel64_rcv(struct sk_buff *skb)
+{
+	struct xfrm_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct ipv6hdr))) {
+		/* Offer the packet to each handler; 0 means it was taken. */
+		for_each_tunnel_rcu(tunnel64_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+		/* Nobody took it: report port unreachable. */
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+static void tunnel4_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm_tunnel *t;
+
+	for_each_tunnel_rcu(tunnel4_handlers, t)
+		if (t->err_handler(skb, info) == 0)
+			return;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void tunnel64_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm_tunnel *t;
+
+	for_each_tunnel_rcu(tunnel64_handlers, t)
+		if (t->err_handler(skb, info) == 0)
+			return;
+}
+#endif
+
+static const struct net_protocol tunnel4_protocol = {
+	.handler	=	tunnel4_rcv,
+	.err_handler	=	tunnel4_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const struct net_protocol tunnel64_protocol = {
+	.handler	=	tunnel64_rcv,
+	.err_handler	=	tunnel64_err,
+	.no_policy	=	1,
+	.netns_ok	=	1,
+};
+#endif
+
+static int __init tunnel4_init(void)
+{
+	if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
+		printk(KERN_ERR "tunnel4 init: can't add protocol\n");
+		return -EAGAIN;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
+		printk(KERN_ERR "tunnel64 init: can't add protocol\n");
+		inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);	/* roll back the IPIP hook */
+		return -EAGAIN;
+	}
+#endif
+	return 0;	/* all configured protocol hooks are in place */
+}
+
+static void __exit tunnel4_fini(void)
+{
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
+		printk(KERN_ERR "tunnel64 close: can't remove protocol\n");
+#endif
+	if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
+		printk(KERN_ERR "tunnel4 close: can't remove protocol\n");
+}
+
+module_init(tunnel4_init);
+module_exit(tunnel4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 00000000..198f75b7
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,2285 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The User Datagram Protocol (UDP).
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() calls
+ *		Alan Cox	: 	stopped close while in use off icmp
+ *					messages. Not a fix but a botch that
+ *					for udp at least is 'valid'.
+ *		Alan Cox	:	Fixed icmp handling properly
+ *		Alan Cox	: 	Correct error for oversized datagrams
+ *		Alan Cox	:	Tidied select() semantics.
+ *		Alan Cox	:	udp_err() fixed properly, also now
+ *					select and read wake correctly on errors
+ *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
+ *		Alan Cox	:	UDP can count its memory
+ *		Alan Cox	:	send to an unknown connection causes
+ *					an ECONNREFUSED off the icmp, but
+ *					does NOT close.
+ *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
+ *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
+ *					bug no longer crashes it.
+ *		Fred Van Kempen	: 	Net2e support for sk->broadcast.
+ *		Alan Cox	:	Uses skb_free_datagram
+ *		Alan Cox	:	Added get/set sockopt support.
+ *		Alan Cox	:	Broadcasting without option set returns EACCES.
+ *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
+ *		Alan Cox	:	Use ip_tos and ip_ttl
+ *		Alan Cox	:	SNMP Mibs
+ *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
+ *		Matt Dillon	:	UDP length checks.
+ *		Alan Cox	:	Smarter af_inet used properly.
+ *		Alan Cox	:	Use new kernel side addressing.
+ *		Alan Cox	:	Incorrect return on truncated datagram receive.
+ *	Arnt Gulbrandsen 	:	New udp_send and stuff
+ *		Alan Cox	:	Cache last socket
+ *		Alan Cox	:	Route cache
+ *		Jon Peatfield	:	Minor efficiency fix to sendto().
+ *		Mike Shaver	:	RFC1122 checks.
+ *		Alan Cox	:	Nonblocking error fix.
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		Mike McLagan	:	Routing by source
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Last socket cache retained as it
+ *					does have a high hit rate.
+ *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
+ *		Andi Kleen	:	Some cleanups, cache destination entry
+ *					for connect.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *		Melvin Smith	:	Check msg_name not msg_namelen in sendto(),
+ *					return ENOTCONN for unconnected sockets (POSIX)
+ *		Janos Farkas	:	don't deliver multi/broadcasts to a different
+ *					bound-to-device socket
+ *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *	Hirokazu Takahashi	:	sendfile() on UDP works now.
+ *		Arnaldo C. Melo :	convert /proc/net/udp to seq_file
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
+ *	James Chapman		:	Add L2TP encapsulation type.
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/igmp.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <net/tcp_states.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include "udp_impl.h"
+
+/*
+ * Socket hash table handed to the lookup and error paths of plain UDP in
+ * this file (see udp4_lib_lookup() and udp_err()).  Exported.
+ */
+struct udp_table udp_table __read_mostly;
+EXPORT_SYMBOL(udp_table);
+
+/*
+ * Exported tunables; none of them is read in this part of the file.
+ * NOTE(review): presumably these back the udp_mem / udp_rmem_min /
+ * udp_wmem_min sysctls - confirm against the sysctl table.
+ */
+long sysctl_udp_mem[3] __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_mem);
+
+int sysctl_udp_rmem_min __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_rmem_min);
+
+int sysctl_udp_wmem_min __read_mostly;
+EXPORT_SYMBOL(sysctl_udp_wmem_min);
+
+/* Exported counter. NOTE(review): presumably memory charged to UDP sockets - confirm. */
+atomic_long_t udp_memory_allocated;
+EXPORT_SYMBOL(udp_memory_allocated);
+
+/* Size of the 16-bit port space. */
+#define MAX_UDP_PORTS 65536
+/* Number of bits in the on-stack bitmap that udp_lib_get_port() builds per hash chain. */
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+
+/*
+ * Walk one primary hash chain looking for sockets that would conflict with
+ * @sk binding a local port.
+ *
+ * A socket on the chain conflicts when it is in @net, is not @sk itself,
+ * at least one of the two does not have sk_reuse set, their bound devices
+ * are compatible (either unbound, or both bound to the same one) and
+ * @saddr_comp reports matching local addresses.
+ *
+ * Two modes:
+ *  - @bitmap == NULL: only sockets whose udp_port_hash equals @num are
+ *    considered; returns 1 on the first conflict.
+ *  - @bitmap != NULL: @num is ignored; bit (udp_port_hash >> @log) is set
+ *    for every conflicting socket and 0 is returned.
+ *
+ * Returns 0 when no conflict was reported.
+ */
+static int udp_lib_lport_inuse(struct net *net, __u16 num,
+			       const struct udp_hslot *hslot,
+			       unsigned long *bitmap,
+			       struct sock *sk,
+			       int (*saddr_comp)(const struct sock *sk1,
+						 const struct sock *sk2),
+			       unsigned int log)
+{
+	struct hlist_nulls_node *pos;
+	struct sock *other;
+
+	sk_nulls_for_each(other, pos, &hslot->head) {
+		if (!net_eq(sock_net(other), net) || other == sk)
+			continue;
+		if (!bitmap && udp_sk(other)->udp_port_hash != num)
+			continue;
+		if (other->sk_reuse && sk->sk_reuse)
+			continue;
+		if (other->sk_bound_dev_if && sk->sk_bound_dev_if &&
+		    other->sk_bound_dev_if != sk->sk_bound_dev_if)
+			continue;
+		if (!(*saddr_comp)(sk, other))
+			continue;
+
+		if (!bitmap)
+			return 1;
+		__set_bit(udp_sk(other)->udp_port_hash >> log, bitmap);
+	}
+	return 0;
+}
+
+/*
+ * Check one secondary hash chain for a socket that conflicts with @sk
+ * binding local port @num.  The conflict rules are the same as for the
+ * primary chain: same netns, not @sk itself, same port, not both sk_reuse,
+ * compatible bound devices and matching local address per @saddr_comp.
+ *
+ * The chain is walked under hslot2->lock.
+ *
+ * Note: the caller still holds the spinlock of the primary hash chain, so
+ * no other writer can insert/delete a socket with local_port == num.
+ *
+ * Returns 1 if a conflicting socket exists, 0 otherwise.
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+			       struct udp_hslot *hslot2,
+			       struct sock *sk,
+			       int (*saddr_comp)(const struct sock *sk1,
+						 const struct sock *sk2))
+{
+	struct hlist_nulls_node *pos;
+	struct sock *other;
+	int in_use = 0;
+
+	spin_lock(&hslot2->lock);
+	udp_portaddr_for_each_entry(other, pos, &hslot2->head) {
+		if (!net_eq(sock_net(other), net) || other == sk)
+			continue;
+		if (udp_sk(other)->udp_port_hash != num)
+			continue;
+		if (other->sk_reuse && sk->sk_reuse)
+			continue;
+		if (other->sk_bound_dev_if && sk->sk_bound_dev_if &&
+		    other->sk_bound_dev_if != sk->sk_bound_dev_if)
+			continue;
+		if (!(*saddr_comp)(sk, other))
+			continue;
+
+		in_use = 1;
+		break;
+	}
+	spin_unlock(&hslot2->lock);
+
+	return in_use;
+}
+
+/**
+ *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
+ *
+ *  @sk:          socket struct in question
+ *  @snum:        port number to look up
+ *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
+ *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
+ *                   with NULL address
+ *
+ *  When @snum is zero, a free port is searched for inside the range
+ *  reported by inet_get_local_port_range(); otherwise @snum itself is
+ *  checked for conflicts.  On success the port is stored in @sk and, if
+ *  @sk is not hashed yet, @sk is linked into the primary and secondary
+ *  hash chains.
+ *
+ *  Returns 0 on success, 1 if no usable port was found.
+ */
+int udp_lib_get_port(struct sock *sk, unsigned short snum,
+		       int (*saddr_comp)(const struct sock *sk1,
+					 const struct sock *sk2),
+		     unsigned int hash2_nulladdr)
+{
+	struct udp_hslot *hslot, *hslot2;
+	struct udp_table *udptable = sk->sk_prot->h.udp_table;
+	int    error = 1;
+	struct net *net = sock_net(sk);
+
+	if (!snum) {
+		/* No port requested: pick one ourselves. */
+		int low, high, remaining;
+		unsigned rand;
+		unsigned short first, last;
+		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+
+		inet_get_local_port_range(&low, &high);
+		remaining = (high - low) + 1;
+
+		rand = net_random();
+		/* scale the 32-bit random value into [low, low + remaining) */
+		first = (((u64)rand * remaining) >> 32) + low;
+		/*
+		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
+		 */
+		rand = (rand | 1) * (udptable->mask + 1);
+		last = first + udptable->mask + 1;
+		/* Outer loop: one pass per starting port, (mask + 1) passes in total. */
+		do {
+			hslot = udp_hashslot(udptable, net, first);
+			bitmap_zero(bitmap, PORTS_PER_CHAIN);
+			spin_lock_bh(&hslot->lock);
+			/* bitmap mode: record every conflicting port on this chain */
+			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
+					    saddr_comp, udptable->log);
+
+			snum = first;
+			/*
+			 * Iterate on all possible values of snum for this hash.
+			 * Using steps of an odd multiple of UDP_HTABLE_SIZE
+			 * give us randomization and full range coverage.
+			 */
+			do {
+				if (low <= snum && snum <= high &&
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
+					goto found;	/* hslot->lock stays held */
+				snum += rand;
+			} while (snum != first);
+			spin_unlock_bh(&hslot->lock);
+		} while (++first != last);
+		goto fail;
+	} else {
+		hslot = udp_hashslot(udptable, net, snum);
+		spin_lock_bh(&hslot->lock);
+		if (hslot->count > 10) {
+			/*
+			 * Long primary chain: check the secondary chains
+			 * (this socket's own slot and the NULL-address slot)
+			 * instead, unless the secondary chain is longer.
+			 */
+			int exist;
+			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+			slot2          &= udptable->mask;
+			hash2_nulladdr &= udptable->mask;
+
+			hslot2 = udp_hashslot2(udptable, slot2);
+			if (hslot->count < hslot2->count)
+				goto scan_primary_hash;
+
+			exist = udp_lib_lport_inuse2(net, snum, hslot2,
+						     sk, saddr_comp);
+			if (!exist && (hash2_nulladdr != slot2)) {
+				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+				exist = udp_lib_lport_inuse2(net, snum, hslot2,
+							     sk, saddr_comp);
+			}
+			if (exist)
+				goto fail_unlock;
+			else
+				goto found;
+		}
+scan_primary_hash:
+		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+					saddr_comp, 0))
+			goto fail_unlock;
+	}
+found:
+	/* Every path reaching this label holds hslot->lock. */
+	inet_sk(sk)->inet_num = snum;
+	udp_sk(sk)->udp_port_hash = snum;
+	/* XOR the chosen port into the hash value the caller stored beforehand */
+	udp_sk(sk)->udp_portaddr_hash ^= snum;
+	if (sk_unhashed(sk)) {
+		sk_nulls_add_node_rcu(sk, &hslot->head);
+		hslot->count++;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+		/* secondary chain has its own lock, nested inside the primary one */
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		spin_lock(&hslot2->lock);
+		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+					 &hslot2->head);
+		hslot2->count++;
+		spin_unlock(&hslot2->lock);
+	}
+	error = 0;
+fail_unlock:
+	spin_unlock_bh(&hslot->lock);
+fail:
+	return error;
+}
+EXPORT_SYMBOL(udp_lib_get_port);
+
+/*
+ * IPv4 local-address comparison passed to udp_lib_get_port().
+ *
+ * Returns 1 when @sk2 is not an IPv6-only socket and the two bound receive
+ * addresses overlap, i.e. either one is unset (zero) or both are equal;
+ * returns 0 otherwise.
+ */
+static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+{
+	__be32 addr1, addr2;
+
+	if (ipv6_only_sock(sk2))
+		return 0;
+
+	addr1 = inet_sk(sk1)->inet_rcv_saddr;
+	addr2 = inet_sk(sk2)->inet_rcv_saddr;
+
+	return !addr1 || !addr2 || addr1 == addr2;
+}
+
+/*
+ * Secondary hash value for an (address, port) pair: the address is hashed
+ * with jhash_1word() seeded by the per-netns mix, then the port is XOR-ed in.
+ */
+static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
+				       unsigned int port)
+{
+	unsigned int addr_hash;
+
+	addr_hash = jhash_1word((__force u32)saddr, net_hash_mix(net));
+	return addr_hash ^ port;
+}
+
+/*
+ * IPv4 front end of udp_lib_get_port(): stores the address-only part of
+ * the secondary hash in the socket, computes the secondary hash for the
+ * wildcard address with @snum, and lets udp_lib_get_port() do the rest.
+ *
+ * Returns whatever udp_lib_get_port() returns.
+ */
+int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+	struct net *net = sock_net(sk);
+	unsigned int hash2_nulladdr;
+
+	/* partial secondary hash: bound address only, port left as 0 */
+	udp_sk(sk)->udp_portaddr_hash =
+		udp4_portaddr_hash(net, inet_sk(sk)->inet_rcv_saddr, 0);
+
+	hash2_nulladdr = udp4_portaddr_hash(net, htonl(INADDR_ANY), snum);
+
+	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
+}
+
+/*
+ * Rate how well @sk matches an incoming packet during a primary-chain
+ * lookup.
+ *
+ * Returns -1 when @sk cannot receive the packet (different netns, different
+ * local port, IPv6-only socket, or any bound field that disagrees with the
+ * packet).  Otherwise the score is 1 for a PF_INET socket (0 for others)
+ * plus 2 for each of: bound local address, connected remote address,
+ * connected remote port, bound device - when set and matching.
+ *
+ * @dport is not used; the local port is compared through @hnum.
+ */
+static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
+			 unsigned short hnum,
+			 __be16 sport, __be32 daddr, __be16 dport, int dif)
+{
+	struct inet_sock *inet;
+	int score;
+
+	if (!net_eq(sock_net(sk), net) ||
+	    udp_sk(sk)->udp_port_hash != hnum ||
+	    ipv6_only_sock(sk))
+		return -1;
+
+	inet = inet_sk(sk);
+	score = (sk->sk_family == PF_INET) ? 1 : 0;
+
+	if (inet->inet_rcv_saddr) {
+		if (inet->inet_rcv_saddr != daddr)
+			return -1;
+		score += 2;
+	}
+
+	if (inet->inet_daddr) {
+		if (inet->inet_daddr != saddr)
+			return -1;
+		score += 2;
+	}
+
+	if (inet->inet_dport) {
+		if (inet->inet_dport != sport)
+			return -1;
+		score += 2;
+	}
+
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		score += 2;
+	}
+
+	return score;
+}
+
+/*
+ * Second scoring variant, used on secondary hash chains: (daddr, hnum) must
+ * be exactly equal to the socket's (inet_rcv_saddr, inet_num).
+ *
+ * Returns -1 for a non-match.  Otherwise the score is 1 for a PF_INET
+ * socket (0 for others) plus 2 for each of: connected remote address,
+ * connected remote port, bound device - when set and matching.
+ * SCORE2_MAX is the highest value this can produce.
+ */
+#define SCORE2_MAX (1 + 2 + 2 + 2)
+static inline int compute_score2(struct sock *sk, struct net *net,
+				 __be32 saddr, __be16 sport,
+				 __be32 daddr, unsigned int hnum, int dif)
+{
+	struct inet_sock *inet;
+	int score;
+
+	if (!net_eq(sock_net(sk), net) || ipv6_only_sock(sk))
+		return -1;
+
+	inet = inet_sk(sk);
+
+	if (inet->inet_rcv_saddr != daddr)
+		return -1;
+	if (inet->inet_num != hnum)
+		return -1;
+
+	score = (sk->sk_family == PF_INET) ? 1 : 0;
+
+	if (inet->inet_daddr) {
+		if (inet->inet_daddr != saddr)
+			return -1;
+		score += 2;
+	}
+
+	if (inet->inet_dport) {
+		if (inet->inet_dport != sport)
+			return -1;
+		score += 2;
+	}
+
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		score += 2;
+	}
+
+	return score;
+}
+
+
+/*
+ * Scan one secondary hash chain (@hslot2, whose index is @slot2) for the
+ * socket with the highest compute_score2() value for the given packet
+ * parameters.
+ *
+ * Returns the best socket with a reference taken on sk_refcnt, or NULL.
+ *
+ * called with rcu_read_lock()
+ */
+static struct sock *udp4_lib_lookup2(struct net *net,
+		__be32 saddr, __be16 sport,
+		__be32 daddr, unsigned int hnum, int dif,
+		struct udp_hslot *hslot2, unsigned int slot2)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	int score, badness;
+
+begin:
+	result = NULL;
+	badness = -1;
+	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+		score = compute_score2(sk, net, saddr, sport,
+				      daddr, hnum, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+			/* nothing can score higher, stop scanning */
+			if (score == SCORE2_MAX)
+				goto exact_match;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot2)
+		goto begin;
+
+	if (result) {
+exact_match:
+		/* could not take a reference: the refcount was already zero */
+		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+			result = NULL;
+		/*
+		 * Re-score now that the reference is held; if the socket
+		 * matches less well than when it was picked, drop it and
+		 * start over.
+		 */
+		else if (unlikely(compute_score2(result, net, saddr, sport,
+				  daddr, hnum, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	return result;
+}
+
+/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
+ * harder than this. -DaveM
+ *
+ * Find the best matching socket in @udptable for a packet from
+ * @saddr:@sport to @daddr:@dport that arrived on interface @dif.
+ * @sport and @dport are in network byte order.
+ *
+ * Takes rcu_read_lock() itself.  Returns the socket with a reference
+ * taken on sk_refcnt, or NULL if nothing matches.
+ */
+static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+		__be16 sport, __be32 daddr, __be16 dport,
+		int dif, struct udp_table *udptable)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(dport);
+	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+	int score, badness;
+
+	rcu_read_lock();
+	if (hslot->count > 10) {
+		/*
+		 * Long primary chain: try the secondary chain for
+		 * (daddr, port) first, then the one for (INADDR_ANY, port).
+		 * Fall back to the primary chain whenever it is the
+		 * shorter of the two.
+		 */
+		hash2 = udp4_portaddr_hash(net, daddr, hnum);
+		slot2 = hash2 & udptable->mask;
+		hslot2 = &udptable->hash2[slot2];
+		if (hslot->count < hslot2->count)
+			goto begin;
+
+		result = udp4_lib_lookup2(net, saddr, sport,
+					  daddr, hnum, dif,
+					  hslot2, slot2);
+		if (!result) {
+			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+			slot2 = hash2 & udptable->mask;
+			hslot2 = &udptable->hash2[slot2];
+			if (hslot->count < hslot2->count)
+				goto begin;
+
+			result = udp4_lib_lookup2(net, saddr, sport,
+						  htonl(INADDR_ANY), hnum, dif,
+						  hslot2, slot2);
+		}
+		rcu_read_unlock();
+		return result;
+	}
+begin:
+	/* Primary chain scan: keep the socket with the highest compute_score(). */
+	result = NULL;
+	badness = -1;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+		score = compute_score(sk, net, saddr, hnum, sport,
+				      daddr, dport, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+
+	if (result) {
+		/* could not take a reference: the refcount was already zero */
+		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+			result = NULL;
+		/* re-score with the reference held; restart if it got worse */
+		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+				  daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+
+/*
+ * Find the socket for a received @skb.  A socket returned by
+ * skb_steal_sock() takes precedence; otherwise a table lookup is done with
+ * the addresses from the IP header, the given ports, the netns of the
+ * skb's dst device and the input interface.
+ */
+static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
+						 __be16 sport, __be16 dport,
+						 struct udp_table *udptable)
+{
+	const struct iphdr *iph;
+	struct sock *sk;
+
+	sk = skb_steal_sock(skb);
+	if (unlikely(sk))
+		return sk;
+
+	iph = ip_hdr(skb);
+	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+				 iph->daddr, dport, inet_iif(skb),
+				 udptable);
+}
+
+/*
+ * Exported lookup in the global udp_table.  Thin wrapper around
+ * __udp4_lib_lookup(); a returned socket carries a reference that the
+ * caller has to drop.
+ */
+struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+			     __be32 daddr, __be16 dport, int dif)
+{
+	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+
+/*
+ * Starting at @sk (inclusive), walk the rest of its hash chain and return
+ * the first socket that should receive a multicast/broadcast datagram
+ * addressed to @loc_addr:@loc_port from @rmt_addr:@rmt_port on interface
+ * @dif.  Returns NULL when the chain holds no further match.
+ *
+ * A socket qualifies when it is in @net, is hashed on the local port, is
+ * not IPv6-only, every remote/local address, remote port and device it has
+ * set agrees with the packet, and ip_mc_sf_allow() accepts it.
+ */
+static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
+					     __be16 loc_port, __be32 loc_addr,
+					     __be16 rmt_port, __be32 rmt_addr,
+					     int dif)
+{
+	unsigned short hnum = ntohs(loc_port);
+	struct hlist_nulls_node *pos;
+	struct sock *cand = sk;
+
+	sk_nulls_for_each_from(cand, pos) {
+		struct inet_sock *inet = inet_sk(cand);
+
+		if (!net_eq(sock_net(cand), net))
+			continue;
+		if (udp_sk(cand)->udp_port_hash != hnum)
+			continue;
+		if (inet->inet_daddr && inet->inet_daddr != rmt_addr)
+			continue;
+		if (inet->inet_dport != rmt_port && inet->inet_dport)
+			continue;
+		if (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr)
+			continue;
+		if (ipv6_only_sock(cand))
+			continue;
+		if (cand->sk_bound_dev_if && cand->sk_bound_dev_if != dif)
+			continue;
+
+		if (ip_mc_sf_allow(cand, loc_addr, rmt_addr, dif))
+			return cand;
+	}
+
+	return NULL;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  If err < 0 then the socket should
+ * be closed and the error returned to the user.  If err > 0
+ * it's just the icmp type << 8 | icmp code.
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header.  We need
+ * to find the appropriate port.
+ *
+ * The ICMP type/code is translated into an errno, which is stored in
+ * sk->sk_err and reported through sk_error_report() unless the error is
+ * filtered out (see the recverr check below).
+ */
+
+void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+{
+	struct inet_sock *inet;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct sock *sk;
+	int harderr;
+	int err;
+	struct net *net = dev_net(skb->dev);
+
+	/*
+	 * The embedded header's destination is passed as the remote side
+	 * and its source as the local side of the lookup.
+	 */
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
+			iph->saddr, uh->source, skb->dev->ifindex, udptable);
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		return;	/* No socket for error */
+	}
+
+	err = 0;
+	harderr = 0;
+	inet = inet_sk(sk);
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+				err = EMSGSIZE;
+				harderr = 1;
+				break;
+			}
+			goto out;
+		}
+		err = EHOSTUNREACH;
+		/* known unreachable codes come from the icmp_err_convert[] table */
+		if (code <= NR_ICMP_UNREACH) {
+			harderr = icmp_err_convert[code].fatal;
+			err = icmp_err_convert[code].errno;
+		}
+		break;
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
+	 *	4.1.3.3.
+	 *
+	 *	Without recverr, only hard errors on an established
+	 *	(connected) socket are reported; with recverr, the error is
+	 *	also passed to ip_icmp_error().
+	 */
+	if (!inet->recverr) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+	} else
+		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+out:
+	/* drop the reference taken by __udp4_lib_lookup() */
+	sock_put(sk);
+}
+
+/* Error handler for plain UDP: __udp4_lib_err() on the global udp_table. */
+void udp_err(struct sk_buff *skb, u32 info)
+{
+	__udp4_lib_err(skb, info, &udp_table);
+}
+
+/*
+ * Discard whatever has been queued for a corked datagram and leave the
+ * corked state.  Does nothing when no datagram is pending.
+ * Socket is locked.
+ */
+void udp_flush_pending_frames(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+
+	if (!up->pending)
+		return;
+
+	up->len = 0;
+	up->pending = 0;
+	ip_flush_pending_frames(sk);
+}
+EXPORT_SYMBOL(udp_flush_pending_frames);
+
+/**
+ *	udp4_hwcsum  -  handle outgoing HW checksumming
+ *	@skb:	sk_buff containing the filled-in UDP header
+ *		(checksum field must be zeroed out)
+ *	@src:	source IP address
+ *	@dst:	destination IP address
+ *
+ *	Without a frag_list, only the pseudo-header checksum is written
+ *	and csum_start/csum_offset are set.  With a frag_list the full
+ *	checksum is computed here and ip_summed becomes CHECKSUM_NONE.
+ */
+static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
+{
+	struct udphdr *uh = udp_hdr(skb);
+	struct sk_buff *frags = skb_shinfo(skb)->frag_list;
+	int offset = skb_transport_offset(skb);
+	int len = skb->len - offset;
+	int hlen = len;
+	__wsum csum = 0;
+
+	if (!frags) {
+		/*
+		 * Only one fragment on the socket.
+		 */
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		uh->check = ~csum_tcpudp_magic(src, dst, len,
+					       IPPROTO_UDP, 0);
+	} else {
+		/*
+		 * HW-checksum won't work as there are two or more
+		 * fragments on the socket so that all csums of sk_buffs
+		 * should be together
+		 */
+		do {
+			/* fold in each fragment's csum; hlen ends up as the head skb's share */
+			csum = csum_add(csum, frags->csum);
+			hlen -= frags->len;
+		} while ((frags = frags->next));
+
+		csum = skb_checksum(skb, offset, hlen, csum);
+		skb->ip_summed = CHECKSUM_NONE;
+
+		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
+		/* a computed checksum of 0 is sent as CSUM_MANGLED_0 */
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	}
+}
+
+/*
+ * Fill in the UDP header of @skb (ports from the socket and @fl4, length
+ * from the skb), set up the checksum according to the socket's settings,
+ * and hand the skb to ip_send_skb().
+ *
+ * Returns 0 on success, otherwise the error from ip_send_skb() - except
+ * that -ENOBUFS is turned into 0 (and counted as UDP_MIB_SNDBUFERRORS)
+ * when the socket does not have recverr set.
+ */
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct udphdr *uh;
+	int err = 0;
+	int is_udplite = IS_UDPLITE(sk);
+	int offset = skb_transport_offset(skb);
+	int len = skb->len - offset;
+	__wsum csum = 0;
+
+	/*
+	 * Create a UDP header
+	 */
+	uh = udp_hdr(skb);
+	uh->source = inet->inet_sport;
+	uh->dest = fl4->fl4_dport;
+	uh->len = htons(len);
+	uh->check = 0;
+
+	if (is_udplite)  				 /*     UDP-Lite      */
+		csum = udplite_csum(skb);
+
+	else if (sk->sk_no_check == UDP_CSUM_NOXMIT) {   /* UDP csum disabled */
+
+		skb->ip_summed = CHECKSUM_NONE;
+		goto send;
+
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+
+		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
+		goto send;
+
+	} else
+		csum = udp_csum(skb);
+
+	/* add protocol-dependent pseudo-header */
+	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
+				      sk->sk_protocol, csum);
+	/* a computed checksum of 0 is sent as CSUM_MANGLED_0 */
+	if (uh->check == 0)
+		uh->check = CSUM_MANGLED_0;
+
+send:
+	err = ip_send_skb(skb);
+	if (err) {
+		if (err == -ENOBUFS && !inet->recverr) {
+			UDP_INC_STATS_USER(sock_net(sk),
+					   UDP_MIB_SNDBUFERRORS, is_udplite);
+			err = 0;
+		}
+	} else
+		UDP_INC_STATS_USER(sock_net(sk),
+				   UDP_MIB_OUTDATAGRAMS, is_udplite);
+	return err;
+}
+
+/*
+ * Turn everything queued on a corked socket into one UDP datagram and
+ * transmit it.  The corked state (len, pending) is cleared whether or not
+ * there was anything to send.  Socket is locked.
+ *
+ * Returns 0 if nothing was queued, otherwise the result of udp_send_skb().
+ */
+static int udp_push_pending_frames(struct sock *sk)
+{
+	struct flowi4 *fl4 = &inet_sk(sk)->cork.fl.u.ip4;
+	struct udp_sock *up = udp_sk(sk);
+	struct sk_buff *skb;
+	int err = 0;
+
+	skb = ip_finish_skb(sk, fl4);
+	if (skb)
+		err = udp_send_skb(skb, fl4);
+
+	up->len = 0;
+	up->pending = 0;
+	return err;
+}
+
+/*
+ * udp_sendmsg - send a UDP/UDP-Lite datagram, or append to a corked one
+ * @iocb: not used by this function
+ * @sk:   sending socket
+ * @msg:  destination (msg_name), payload (msg_iov), control data, flags
+ * @len:  payload length in bytes; anything above 0xFFFF is rejected
+ *
+ * If a corked datagram is already pending, the payload is appended to it.
+ * Otherwise the destination is taken from msg_name or, for a connected
+ * socket, from the socket itself; a route is looked up (or the cached one
+ * reused) and the datagram is either built and sent without the socket
+ * lock (no corking requested) or queued under the socket lock.
+ *
+ * Returns @len on success or a negative errno.
+ */
+int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct udp_sock *up = udp_sk(sk);
+	struct flowi4 fl4_stack;
+	struct flowi4 *fl4;
+	int ulen = len;
+	struct ipcm_cookie ipc;
+	struct rtable *rt = NULL;
+	int free = 0;
+	int connected = 0;
+	__be32 daddr, faddr, saddr;
+	__be16 dport;
+	u8  tos;
+	int err, is_udplite = IS_UDPLITE(sk);
+	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+	struct sk_buff *skb;
+	struct ip_options_data opt_copy;
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+
+	/*
+	 *	Check the flags.
+	 */
+
+	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
+		return -EOPNOTSUPP;
+
+	ipc.opt = NULL;
+	ipc.tx_flags = 0;
+
+	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
+	fl4 = &inet->cork.fl.u.ip4;
+	if (up->pending) {
+		/*
+		 * There are pending frames.
+		 * The socket lock must be held while it's corked.
+		 */
+		lock_sock(sk);
+		/* re-check under the lock: the unlocked test above may be stale */
+		if (likely(up->pending)) {
+			if (unlikely(up->pending != AF_INET)) {
+				release_sock(sk);
+				return -EINVAL;
+			}
+			goto do_append_data;
+		}
+		release_sock(sk);
+	}
+	ulen += sizeof(struct udphdr);
+
+	/*
+	 *	Get and verify the address.
+	 */
+	if (msg->msg_name) {
+		struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET) {
+			if (usin->sin_family != AF_UNSPEC)
+				return -EAFNOSUPPORT;
+		}
+
+		daddr = usin->sin_addr.s_addr;
+		dport = usin->sin_port;
+		if (dport == 0)
+			return -EINVAL;
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->inet_daddr;
+		dport = inet->inet_dport;
+		/* Open fast path for connected socket.
+		   Route will not be used, if at least one option is set.
+		 */
+		connected = 1;
+	}
+	ipc.addr = inet->inet_saddr;
+
+	ipc.oif = sk->sk_bound_dev_if;
+	err = sock_tx_timestamp(sk, &ipc.tx_flags);
+	if (err)
+		return err;
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		if (err)
+			return err;
+		/* options set from control data are ours to kfree() at 'out' */
+		if (ipc.opt)
+			free = 1;
+		connected = 0;
+	}
+	if (!ipc.opt) {
+		struct ip_options_rcu *inet_opt;
+
+		/* no per-call options: use a private copy of the socket's */
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+		if (inet_opt) {
+			memcpy(&opt_copy, inet_opt,
+			       sizeof(*inet_opt) + inet_opt->opt.optlen);
+			ipc.opt = &opt_copy.opt;
+		}
+		rcu_read_unlock();
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+
+	if (ipc.opt && ipc.opt->opt.srr) {
+		if (!daddr) {
+			/*
+			 * Leave through 'out' instead of returning directly:
+			 * ipc.opt may have been set by ip_cmsg_send()
+			 * (free == 1), in which case it must be kfree()d.
+			 * rt is still NULL here, as on the other early
+			 * 'goto out' paths.
+			 */
+			err = -EINVAL;
+			goto out;
+		}
+		faddr = ipc.opt->opt.faddr;
+		connected = 0;
+	}
+	tos = RT_TOS(inet->tos);
+	if (sock_flag(sk, SOCK_LOCALROUTE) ||
+	    (msg->msg_flags & MSG_DONTROUTE) ||
+	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
+		tos |= RTO_ONLINK;
+		connected = 0;
+	}
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+		connected = 0;
+	}
+
+	/* connected fast path: try the route cached on the socket */
+	if (connected)
+		rt = (struct rtable *)sk_dst_check(sk, 0);
+
+	if (rt == NULL) {
+		struct net *net = sock_net(sk);
+
+		fl4 = &fl4_stack;
+		flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
+				   inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,
+				   faddr, saddr, dport, inet->inet_sport);
+
+		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+		rt = ip_route_output_flow(net, fl4, sk);
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			rt = NULL;
+			if (err == -ENETUNREACH)
+				IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+			goto out;
+		}
+
+		err = -EACCES;
+		if ((rt->rt_flags & RTCF_BROADCAST) &&
+		    !sock_flag(sk, SOCK_BROADCAST))
+			goto out;
+		if (connected)
+			sk_dst_set(sk, dst_clone(&rt->dst));
+	}
+
+	if (msg->msg_flags&MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	saddr = fl4->saddr;
+	if (!ipc.addr)
+		daddr = ipc.addr = fl4->daddr;
+
+	/* Lockless fast path for the non-corking case. */
+	if (!corkreq) {
+		skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
+				  sizeof(struct udphdr), &ipc, &rt,
+				  msg->msg_flags);
+		/* a NULL skb yields err == 0, i.e. success with nothing sent */
+		err = PTR_ERR(skb);
+		if (skb && !IS_ERR(skb))
+			err = udp_send_skb(skb, fl4);
+		goto out;
+	}
+
+	lock_sock(sk);
+	if (unlikely(up->pending)) {
+		/* The socket is already corked while preparing it. */
+		/* ... which is an evident application bug. --ANK */
+		release_sock(sk);
+
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
+		err = -EINVAL;
+		goto out;
+	}
+	/*
+	 *	Now cork the socket to pend data.
+	 */
+	fl4 = &inet->cork.fl.u.ip4;
+	fl4->daddr = daddr;
+	fl4->saddr = saddr;
+	fl4->fl4_dport = dport;
+	fl4->fl4_sport = inet->inet_sport;
+	up->pending = AF_INET;
+
+do_append_data:
+	up->len += ulen;
+	err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
+			     sizeof(struct udphdr), &ipc, &rt,
+			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+	if (err)
+		udp_flush_pending_frames(sk);
+	else if (!corkreq)
+		err = udp_push_pending_frames(sk);
+	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
+		up->pending = 0;
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+	if (free)
+		kfree(ipc.opt);
+	if (!err)
+		return len;
+	/*
+	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
+	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
+	 * we don't have a good statistic (IpOutDiscards but it can be too many
+	 * things).  We could add another new stat but at least for now that
+	 * seems like overkill.
+	 */
+	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		UDP_INC_STATS_USER(sock_net(sk),
+				UDP_MIB_SNDBUFERRORS, is_udplite);
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(&rt->dst);
+	/* a zero-length MSG_PROBE only confirms the route and sends nothing */
+	if (!(msg->msg_flags&MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+EXPORT_SYMBOL(udp_sendmsg);
+
+/*
+ * Append @size bytes of @page starting at @offset to the datagram being
+ * built on @sk.  If nothing is pending yet, an empty udp_sendmsg() with
+ * MSG_MORE is issued first to set up the corked state.
+ *
+ * The datagram is pushed out unless corking is in effect (the socket's
+ * corkflag or MSG_MORE in @flags).  When ip_append_page() reports
+ * -EOPNOTSUPP, the call is handed over to sock_no_sendpage().
+ *
+ * Returns @size on success or a negative errno.
+ */
+int udp_sendpage(struct sock *sk, struct page *page, int offset,
+		 size_t size, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct udp_sock *up = udp_sk(sk);
+	int ret;
+
+	if (!up->pending) {
+		struct msghdr msg = {	.msg_flags = flags|MSG_MORE };
+
+		/* Call udp_sendmsg to specify destination address which
+		 * sendpage interface can't pass.
+		 * This will succeed only when the socket is connected.
+		 */
+		ret = udp_sendmsg(NULL, sk, &msg, 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	lock_sock(sk);
+
+	/* still nothing pending after the setup above: refuse to go on */
+	if (unlikely(!up->pending)) {
+		release_sock(sk);
+
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
+		return -EINVAL;
+	}
+
+	ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
+			     page, offset, size, flags);
+	if (ret == -EOPNOTSUPP) {
+		release_sock(sk);
+		return sock_no_sendpage(sk->sk_socket, page, offset,
+					size, flags);
+	}
+	if (ret < 0) {
+		udp_flush_pending_frames(sk);
+		goto out;
+	}
+
+	up->len += size;
+	if (!(up->corkflag || (flags&MSG_MORE)))
+		ret = udp_push_pending_frames(sk);
+	/* ret == 0 covers both "queued only" and "pushed successfully" */
+	if (!ret)
+		ret = size;
+out:
+	release_sock(sk);
+	return ret;
+}
+
+
+/**
+ *	first_packet_length	- return length of first packet in receive queue
+ *	@sk: socket
+ *
+ *	Drops all bad checksum frames, until a valid one is found.
+ *	Each dropped frame is counted in UDP_MIB_INERRORS and sk_drops.
+ *	Returns the length of found skb, or 0 if none is found.
+ */
+static unsigned int first_packet_length(struct sock *sk)
+{
+	struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+	struct sk_buff *skb;
+	unsigned int res;
+
+	__skb_queue_head_init(&list_kill);
+
+	/*
+	 * Under the queue lock, bad frames are only moved to the private
+	 * list_kill; they are freed after the lock has been dropped.
+	 */
+	spin_lock_bh(&rcvq->lock);
+	while ((skb = skb_peek(rcvq)) != NULL &&
+		udp_lib_checksum_complete(skb)) {
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+				 IS_UDPLITE(sk));
+		atomic_inc(&sk->sk_drops);
+		__skb_unlink(skb, rcvq);
+		__skb_queue_tail(&list_kill, skb);
+	}
+	res = skb ? skb->len : 0;
+	spin_unlock_bh(&rcvq->lock);
+
+	if (!skb_queue_empty(&list_kill)) {
+		bool slow = lock_sock_fast(sk);
+
+		__skb_queue_purge(&list_kill);
+		sk_mem_reclaim_partial(sk);
+		unlock_sock_fast(sk, slow);
+	}
+	return res;
+}
+
+/*
+ *	IOCTL requests applicable to the UDP protocol
+ *
+ *	SIOCOUTQ stores sk_wmem_alloc_get() at @arg.
+ *	SIOCINQ stores the payload size of the first valid queued datagram
+ *	(its length minus the UDP header), or 0 if there is none.
+ *	Any other @cmd yields -ENOIOCTLCMD.
+ *
+ *	Returns the result of put_user() for the two supported commands.
+ */
+
+int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	if (cmd == SIOCOUTQ) {
+		int queued = sk_wmem_alloc_get(sk);
+
+		return put_user(queued, (int __user *)arg);
+	}
+
+	if (cmd == SIOCINQ) {
+		unsigned int pending = first_packet_length(sk);
+
+		/*
+		 * Only the size of this one packet is reported, since
+		 * that is all a single read will return.
+		 */
+		if (pending)
+			pending -= sizeof(struct udphdr);
+
+		return put_user(pending, (int __user *)arg);
+	}
+
+	return -ENOIOCTLCMD;
+}
+EXPORT_SYMBOL(udp_ioctl);
+
+/*
+ * 	This should be easy, if there is something there we
+ * 	return it, otherwise we block.
+ *
+ *	Receive one datagram from @sk into @msg, copying at most @len
+ *	payload bytes.  MSG_ERRQUEUE requests are passed to ip_recv_error().
+ *	The sender's address is stored in msg_name when one was supplied.
+ *
+ *	Returns the number of payload bytes copied (or the full payload
+ *	length when MSG_TRUNC is set in @flags), or a negative errno.
+ *	A datagram that fails checksum verification is dropped and the
+ *	next one is tried.
+ */
+
+int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct sk_buff *skb;
+	unsigned int ulen;
+	int peeked;
+	int err;
+	int is_udplite = IS_UDPLITE(sk);
+	bool slow;
+
+	/*
+	 *	Check any passed addresses
+	 */
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE)
+		return ip_recv_error(sk, msg, len);
+
+try_again:
+	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				  &peeked, &err);
+	if (!skb)
+		goto out;
+
+	ulen = skb->len - sizeof(struct udphdr);
+	if (len > ulen)
+		len = ulen;
+	else if (len < ulen)
+		msg->msg_flags |= MSG_TRUNC;
+
+	/*
+	 * If checksum is needed at all, try to do it while copying the
+	 * data.  If the data is truncated, or if we only want a partial
+	 * coverage checksum (UDP-Lite), do it before the copy.
+	 */
+
+	if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
+		if (udp_lib_checksum_complete(skb))
+			goto csum_copy_err;
+	}
+
+	if (skb_csum_unnecessary(skb))
+		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
+					      msg->msg_iov, len);
+	else {
+		err = skb_copy_and_csum_datagram_iovec(skb,
+						       sizeof(struct udphdr),
+						       msg->msg_iov);
+
+		/* -EINVAL from the combined copy is treated as a checksum failure */
+		if (err == -EINVAL)
+			goto csum_copy_err;
+	}
+
+	if (err)
+		goto out_free;
+
+	if (!peeked)
+		UDP_INC_STATS_USER(sock_net(sk),
+				UDP_MIB_INDATAGRAMS, is_udplite);
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_port = udp_hdr(skb)->source;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+	if (inet->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+
+	err = len;
+	if (flags & MSG_TRUNC)
+		err = ulen;
+
+out_free:
+	skb_free_datagram_locked(sk, skb);
+out:
+	return err;
+
+csum_copy_err:
+	slow = lock_sock_fast(sk);
+	if (!skb_kill_datagram(sk, skb, flags))
+		UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	unlock_sock_fast(sk, slow);
+
+	/*
+	 * Starting over for a new packet.  Do not report -EAGAIN to a
+	 * non-blocking caller here: valid datagrams may still be queued
+	 * behind the bad one, and __skb_recv_datagram() reports an empty
+	 * queue itself.  Yield first so that a stream of bad-checksum
+	 * packets cannot keep this loop spinning without rescheduling.
+	 */
+	cond_resched();
+	msg->msg_flags &= ~MSG_TRUNC;
+	goto try_again;
+}
+
+
+/*
+ * Break the association of a UDP socket (1003.1g).
+ *
+ * Clears the peer address/port, the saved rx hash, the bound device and the
+ * cached route.  The local address and the local port are reset only when
+ * the matching SOCK_BINDADDR_LOCK / SOCK_BINDPORT_LOCK bit is not set in
+ * sk->sk_userlocks.  Always returns 0; @flags is unused.
+ */
+int udp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_sock *isk = inet_sk(sk);
+
+	/* Forget the peer. */
+	sk->sk_state = TCP_CLOSE;
+	isk->inet_daddr = 0;
+	isk->inet_dport = 0;
+	sock_rps_save_rxhash(sk, 0);
+	sk->sk_bound_dev_if = 0;
+
+	/* Local address. */
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
+
+	/* Local port: leave the hash tables before clearing it. */
+	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
+		sk->sk_prot->unhash(sk);
+		isk->inet_sport = 0;
+	}
+
+	sk_dst_reset(sk);
+	return 0;
+}
+EXPORT_SYMBOL(udp_disconnect);
+
+/*
+ * Remove @sk from both hash chains of its protocol's udp_table.
+ * Does nothing when sk_hashed(sk) is false.
+ */
+void udp_lib_unhash(struct sock *sk)
+{
+	if (sk_hashed(sk)) {
+		struct udp_table *udptable = sk->sk_prot->h.udp_table;
+		struct udp_hslot *hslot, *hslot2;
+
+		hslot  = udp_hashslot(udptable, sock_net(sk),
+				      udp_sk(sk)->udp_port_hash);
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+
+		/* hslot->lock is taken first; hslot2->lock nests inside it */
+		spin_lock_bh(&hslot->lock);
+		if (sk_nulls_del_node_init_rcu(sk)) {
+			hslot->count--;
+			inet_sk(sk)->inet_num = 0;
+			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+			/* drop the secondary-chain entry as well */
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
+		}
+		spin_unlock_bh(&hslot->lock);
+	}
+}
+EXPORT_SYMBOL(udp_lib_unhash);
+
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ *
+ * Records @newhash in udp_sk(sk)->udp_portaddr_hash and, when it maps to a
+ * different secondary slot, moves @sk from the old slot to the new one.
+ * Does nothing when sk_hashed(sk) is false.
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+	if (sk_hashed(sk)) {
+		struct udp_table *udptable = sk->sk_prot->h.udp_table;
+		struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		nhslot2 = udp_hashslot2(udptable, newhash);
+		udp_sk(sk)->udp_portaddr_hash = newhash;
+		if (hslot2 != nhslot2) {
+			hslot = udp_hashslot(udptable, sock_net(sk),
+					     udp_sk(sk)->udp_port_hash);
+			/* we must lock primary chain too */
+			spin_lock_bh(&hslot->lock);
+
+			/* unlink from the old secondary slot ... */
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
+
+			/* ... and link at the head of the new one */
+			spin_lock(&nhslot2->lock);
+			hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+						 &nhslot2->head);
+			nhslot2->count++;
+			spin_unlock(&nhslot2->lock);
+
+			spin_unlock_bh(&hslot->lock);
+		}
+	}
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+/*
+ * Recompute the IPv4 (address, port) hash of @sk from its current
+ * inet_rcv_saddr and inet_num, then hand it to udp_lib_rehash().
+ */
+static void udp_v4_rehash(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	u16 hash;
+
+	hash = udp4_portaddr_hash(sock_net(sk), inet->inet_rcv_saddr,
+				  inet->inet_num);
+	udp_lib_rehash(sk, hash);
+}
+
+/*
+ * Queue @skb on @sk via ip_queue_rcv_skb().
+ *
+ * Returns 0 on success.  On failure the error counters are bumped, @skb is
+ * freed and -1 is returned.
+ */
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int is_udplite;
+	int err;
+
+	/* only sockets with a destination address record the rx hash */
+	if (inet_sk(sk)->inet_daddr)
+		sock_rps_save_rxhash(sk, skb->rxhash);
+
+	err = ip_queue_rcv_skb(sk, skb);
+	if (err >= 0)
+		return 0;
+
+	is_udplite = IS_UDPLITE(sk);
+
+	/* Note that an ENOMEM error is charged twice */
+	if (err == -ENOMEM)
+		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+				 is_udplite);
+	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ * Run the per-socket receive checks on @skb (xfrm policy, encapsulation
+ * hook, UDP-Lite coverage, socket filter checksum, receive queue limits)
+ * and then queue it on @sk or on its backlog.
+ *
+ * returns:
+ *  -1: error
+ *   0: success
+ *  >0: "udp encap" protocol resubmission
+ *
+ * Note that in the success and error cases, the skb is assumed to
+ * have either been requeued or freed.
+ */
+int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int rc;
+	int is_udplite = IS_UDPLITE(sk);
+
+	/*
+	 *	Charge it to the socket, dropping if the queue is full.
+	 */
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto drop;
+	nf_reset(skb);
+
+	if (up->encap_type) {
+		/*
+		 * This is an encapsulation socket so pass the skb to
+		 * the socket's udp_encap_rcv() hook. Otherwise, just
+		 * fall through and pass this up the UDP socket.
+		 * up->encap_rcv() returns the following value:
+		 * =0 if skb was successfully passed to the encap
+		 *    handler or was discarded by it.
+		 * >0 if skb should be passed on to UDP.
+		 * <0 if skb should be resubmitted as proto -N
+		 */
+
+		/* if we're overly short, let UDP handle it */
+		if (skb->len > sizeof(struct udphdr) &&
+		    up->encap_rcv != NULL) {
+			int ret;
+
+			ret = (*up->encap_rcv)(sk, skb);
+			if (ret <= 0) {
+				UDP_INC_STATS_BH(sock_net(sk),
+						 UDP_MIB_INDATAGRAMS,
+						 is_udplite);
+				/* negate: callers expect >0 for resubmission */
+				return -ret;
+			}
+		}
+
+		/* FALLTHROUGH -- it's a UDP Packet */
+	}
+
+	/*
+	 * 	UDP-Lite specific tests, ignored on UDP sockets
+	 */
+	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+
+		/*
+		 * MIB statistics other than incrementing the error count are
+		 * disabled for the following two types of errors: these depend
+		 * on the application settings, not on the functioning of the
+		 * protocol stack as such.
+		 *
+		 * RFC 3828 here recommends (sec 3.3): "There should also be a
+		 * way ... to ... at least let the receiving application block
+		 * delivery of packets with coverage values less than a value
+		 * provided by the application."
+		 */
+		if (up->pcrlen == 0) {          /* full coverage was set  */
+			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
+				"%d while full coverage %d requested\n",
+				UDP_SKB_CB(skb)->cscov, skb->len);
+			goto drop;
+		}
+		/* The next case involves violating the min. coverage requested
+		 * by the receiver. This is subtle: if receiver wants x and x is
+		 * greater than the buffersize/MTU then receiver will complain
+		 * that it wants x while sender emits packets of smaller size y.
+		 * Therefore the above ...()->partial_cov statement is essential.
+		 */
+		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
+			LIMIT_NETDEBUG(KERN_WARNING
+				"UDPLITE: coverage %d too small, need min %d\n",
+				UDP_SKB_CB(skb)->cscov, up->pcrlen);
+			goto drop;
+		}
+	}
+
+	/* with a socket filter attached, verify the checksum now */
+	if (rcu_dereference_raw(sk->sk_filter)) {
+		if (udp_lib_checksum_complete(skb))
+			goto drop;
+	}
+
+
+	if (sk_rcvqueues_full(sk, skb))
+		goto drop;
+
+	rc = 0;
+
+	/*
+	 * Queue directly unless the socket is currently owned by a user
+	 * context; in that case the skb goes to the backlog instead.
+	 */
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		rc = __udp_queue_rcv_skb(sk, skb);
+	else if (sk_add_backlog(sk, skb)) {
+		bh_unlock_sock(sk);
+		goto drop;
+	}
+	bh_unlock_sock(sk);
+
+	return rc;
+
+drop:
+	UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	atomic_inc(&sk->sk_drops);
+	kfree_skb(skb);
+	return -1;
+}
+
+
+/*
+ * Deliver @skb to each of the @count sockets in @stack.
+ *
+ * The socket at index @final is handed @skb itself; every other socket is
+ * handed an skb_clone() of it.  Passing an index that is never reached
+ * (e.g. ~0) therefore leaves @skb untouched for a later call.
+ */
+static void flush_stack(struct sock **stack, unsigned int count,
+			struct sk_buff *skb, unsigned int final)
+{
+	unsigned int i;
+	struct sk_buff *skb1 = NULL;
+	struct sock *sk;
+
+	for (i = 0; i < count; i++) {
+		sk = stack[i];
+		/*
+		 * skb1 is still set here only when the previous
+		 * udp_queue_rcv_skb() returned > 0; it is then reused.
+		 */
+		if (likely(skb1 == NULL))
+			skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+		/* clone failed: account the drop against this socket */
+		if (!skb1) {
+			atomic_inc(&sk->sk_drops);
+			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+					 IS_UDPLITE(sk));
+			UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+					 IS_UDPLITE(sk));
+		}
+
+		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
+			skb1 = NULL;
+	}
+	/* a buffer left over from the last socket is released here */
+	if (unlikely(skb1))
+		kfree_skb(skb1);
+}
+
+/*
+ *	Multicasts and broadcasts go to each listener.
+ *
+ *	Matching sockets are collected from the primary hash slot of the
+ *	destination port into a fixed-size on-stack array and delivered in
+ *	batches through flush_stack().  @skb is consumed; always returns 0.
+ *
+ *	Note: called only from the BH handler context.
+ */
+static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+				    struct udphdr  *uh,
+				    __be32 saddr, __be32 daddr,
+				    struct udp_table *udptable)
+{
+	struct sock *sk, *stack[256 / sizeof(struct sock *)];
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
+	int dif;
+	unsigned int i, count = 0;
+
+	spin_lock(&hslot->lock);
+	sk = sk_nulls_head(&hslot->head);
+	dif = skb->dev->ifindex;
+	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
+	while (sk) {
+		stack[count++] = sk;
+		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
+				       daddr, uh->source, saddr, dif);
+		if (unlikely(count == ARRAY_SIZE(stack))) {
+			if (!sk)
+				break;
+			/*
+			 * Array full and more sockets follow: deliver clones
+			 * now, still under the chain lock.  ~0 matches no
+			 * index, so the original skb is kept for later.
+			 */
+			flush_stack(stack, count, skb, ~0);
+			count = 0;
+		}
+	}
+	/*
+	 * before releasing chain lock, we must take a reference on sockets
+	 */
+	for (i = 0; i < count; i++)
+		sock_hold(stack[i]);
+
+	spin_unlock(&hslot->lock);
+
+	/*
+	 * do the slow work with no lock held
+	 */
+	if (count) {
+		/* the last socket in the batch receives the original skb */
+		flush_stack(stack, count, skb, count - 1);
+
+		for (i = 0; i < count; i++)
+			sock_put(stack[i]);
+	} else {
+		/* no listener matched */
+		kfree_skb(skb);
+	}
+	return 0;
+}
+
+/* Initialize UDP checksum. If exited with zero value (success),
+ * CHECKSUM_UNNECESSARY means that no more checks are required.
+ * Otherwise, csum completion requires checksumming the packet body,
+ * including the udp header, and folding it into skb->csum.
+ */
+static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
+				 int proto)
+{
+	const struct iphdr *iph;
+
+	/* default to full coverage; UDP-Lite may narrow it below */
+	UDP_SKB_CB(skb)->partial_cov = 0;
+	UDP_SKB_CB(skb)->cscov = skb->len;
+
+	if (proto == IPPROTO_UDPLITE) {
+		int err = udplite_checksum_init(skb, uh);
+
+		if (err)
+			return err;
+	}
+
+	iph = ip_hdr(skb);
+
+	/*
+	 * Nothing left to verify when the sender transmitted no checksum,
+	 * or when the checksum supplied with CHECKSUM_COMPLETE already
+	 * folds to zero together with the pseudo-header.
+	 */
+	if (uh->check == 0 ||
+	    (skb->ip_summed == CHECKSUM_COMPLETE &&
+	     !csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
+				proto, skb->csum)))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* otherwise seed skb->csum with the pseudo-header sum */
+	if (!skb_csum_unnecessary(skb))
+		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+					       skb->len, proto, 0);
+	/* Probably, we should checksum udp header (it should be in cache
+	 * in any case) and data in tiny packets (< rx copybreak).
+	 */
+
+	return 0;
+}
+
+/*
+ *	All we need to do is get the socket, and then do a checksum.
+ *
+ *	Common IPv4 receive path for @proto (IPPROTO_UDP or IPPROTO_UDPLITE)
+ *	using @udptable for socket lookup.  @skb is always consumed.
+ *	Returns 0, or a negative value when udp_queue_rcv_skb() asked for
+ *	the packet to be resubmitted.
+ */
+
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+		   int proto)
+{
+	struct sock *sk;
+	struct udphdr *uh;
+	unsigned short ulen;
+	struct rtable *rt = skb_rtable(skb);
+	__be32 saddr, daddr;
+	struct net *net = dev_net(skb->dev);
+
+	/*
+	 *  Validate the packet.
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		goto drop;		/* No space for header. */
+
+	uh   = udp_hdr(skb);
+	ulen = ntohs(uh->len);
+	saddr = ip_hdr(skb)->saddr;
+	daddr = ip_hdr(skb)->daddr;
+
+	if (ulen > skb->len)
+		goto short_packet;
+
+	if (proto == IPPROTO_UDP) {
+		/* UDP validates ulen. */
+		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
+			goto short_packet;
+		/* re-read the header pointer after the trim */
+		uh = udp_hdr(skb);
+	}
+
+	if (udp4_csum_init(skb, uh, proto))
+		goto csum_error;
+
+	/* broadcast/multicast routes fan out to every matching socket */
+	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+		return __udp4_lib_mcast_deliver(net, skb, uh,
+				saddr, daddr, udptable);
+
+	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+
+	if (sk != NULL) {
+		int ret = udp_queue_rcv_skb(sk, skb);
+		sock_put(sk);
+
+		/* a return value > 0 means to resubmit the input, but
+		 * it wants the return to be -protocol, or 0
+		 */
+		if (ret > 0)
+			return -ret;
+		return 0;
+	}
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto drop;
+	nf_reset(skb);
+
+	/* No socket. Drop packet silently, if checksum is wrong */
+	if (udp_lib_checksum_complete(skb))
+		goto csum_error;
+
+	UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	/*
+	 * Hmm.  We got an UDP packet to a port to which we
+	 * don't wanna listen.  Ignore it.
+	 */
+	kfree_skb(skb);
+	return 0;
+
+short_packet:
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
+		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
+		       &saddr,
+		       ntohs(uh->source),
+		       ulen,
+		       skb->len,
+		       &daddr,
+		       ntohs(uh->dest));
+	goto drop;
+
+csum_error:
+	/*
+	 * RFC1122: OK.  Discards the bad packet silently (as far as
+	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+	 */
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
+		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
+		       &saddr,
+		       ntohs(uh->source),
+		       &daddr,
+		       ntohs(uh->dest),
+		       ulen);
+drop:
+	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+	kfree_skb(skb);
+	return 0;
+}
+
+/* IPv4 UDP input: run the common receive path against the UDP table. */
+int udp_rcv(struct sk_buff *skb)
+{
+	struct udp_table *table = &udp_table;
+
+	return __udp4_lib_rcv(skb, table, IPPROTO_UDP);
+}
+
+/* Flush any pending (corked) frames of @sk while holding the socket lock. */
+void udp_destroy_sock(struct sock *sk)
+{
+	bool slow;
+
+	slow = lock_sock_fast(sk);
+	udp_flush_pending_frames(sk);
+	unlock_sock_fast(sk, slow);
+}
+
+/*
+ *	Socket option code for UDP
+ *
+ *	Handles UDP_CORK, UDP_ENCAP and the UDP-Lite coverage options.  The
+ *	option value is read as an int from @optval.  @push_pending_frames is
+ *	called, under the socket lock, when UDP_CORK is cleared.
+ *
+ *	Returns 0 on success, -EINVAL when @optlen is smaller than an int,
+ *	-EFAULT when @optval cannot be read, and -ENOPROTOOPT for an unknown
+ *	option, an unknown encapsulation type, or a UDP-Lite option on a
+ *	socket that is not UDP-Lite.
+ */
+int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+		       char __user *optval, unsigned int optlen,
+		       int (*push_pending_frames)(struct sock *))
+{
+	struct udp_sock *up = udp_sk(sk);
+	int val;
+	int err = 0;
+	int is_udplite = IS_UDPLITE(sk);
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	switch (optname) {
+	case UDP_CORK:
+		if (val != 0) {
+			up->corkflag = 1;
+		} else {
+			up->corkflag = 0;
+			lock_sock(sk);
+			(*push_pending_frames)(sk);
+			release_sock(sk);
+		}
+		break;
+
+	case UDP_ENCAP:
+		switch (val) {
+		case 0:
+		case UDP_ENCAP_ESPINUDP:
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			up->encap_rcv = xfrm4_udp_encap_rcv;
+			/* FALLTHROUGH */
+		case UDP_ENCAP_L2TPINUDP:
+			/* L2TP only records the type; encap_rcv is left as is */
+			up->encap_type = val;
+			break;
+		default:
+			err = -ENOPROTOOPT;
+			break;
+		}
+		break;
+
+	/*
+	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
+	 */
+	/* The sender sets actual checksum coverage length via this option.
+	 * The case coverage > packet length is handled by send module. */
+	case UDPLITE_SEND_CSCOV:
+		if (!is_udplite)         /* Disable the option on UDP sockets */
+			return -ENOPROTOOPT;
+		if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
+			val = 8;
+		else if (val > USHRT_MAX)
+			val = USHRT_MAX;
+		up->pcslen = val;
+		up->pcflag |= UDPLITE_SEND_CC;
+		break;
+
+	/* The receiver specifies a minimum checksum coverage value. To make
+	 * sense, this should be set to at least 8 (as done below). If zero is
+	 * used, this again means full checksum coverage.                     */
+	case UDPLITE_RECV_CSCOV:
+		if (!is_udplite)         /* Disable the option on UDP sockets */
+			return -ENOPROTOOPT;
+		if (val != 0 && val < 8) /* Avoid silly minimal values.       */
+			val = 8;
+		else if (val > USHRT_MAX)
+			val = USHRT_MAX;
+		up->pcrlen = val;
+		up->pcflag |= UDPLITE_RECV_CC;
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(udp_lib_setsockopt);
+
+/*
+ * setsockopt() entry for UDP sockets: SOL_UDP and SOL_UDPLITE options are
+ * handled by udp_lib_setsockopt(), every other level by ip_setsockopt().
+ */
+int udp_setsockopt(struct sock *sk, int level, int optname,
+		   char __user *optval, unsigned int optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+
+	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+				  udp_push_pending_frames);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of udp_setsockopt(): levels other than SOL_UDP and
+ * SOL_UDPLITE go to compat_ip_setsockopt().
+ */
+int compat_udp_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return compat_ip_setsockopt(sk, level, optname, optval, optlen);
+
+	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+				  udp_push_pending_frames);
+}
+#endif
+
+/*
+ * Read a UDP / UDP-Lite socket option.
+ *
+ * The option value is an int; at most sizeof(int) bytes of it are copied to
+ * @optval and the number of bytes copied is written back through @optlen.
+ *
+ * Returns 0 on success, -EFAULT when user memory cannot be accessed,
+ * -EINVAL when the length supplied in *@optlen is negative, and
+ * -ENOPROTOOPT for an unknown @optname.
+ */
+int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+		       char __user *optval, int __user *optlen)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int val, len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	/*
+	 * Reject a negative length before clamping: min_t() compares as
+	 * unsigned int, so a negative value would otherwise be turned into
+	 * sizeof(int) and never be caught.
+	 */
+	if (len < 0)
+		return -EINVAL;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	switch (optname) {
+	case UDP_CORK:
+		val = up->corkflag;
+		break;
+
+	case UDP_ENCAP:
+		val = up->encap_type;
+		break;
+
+	/* The following two cannot be changed on UDP sockets, the return is
+	 * always 0 (which corresponds to the full checksum coverage of UDP). */
+	case UDPLITE_SEND_CSCOV:
+		val = up->pcslen;
+		break;
+
+	case UDPLITE_RECV_CSCOV:
+		val = up->pcrlen;
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL(udp_lib_getsockopt);
+
+/*
+ * getsockopt() entry for UDP sockets: SOL_UDP and SOL_UDPLITE options are
+ * handled by udp_lib_getsockopt(), every other level by ip_getsockopt().
+ */
+int udp_getsockopt(struct sock *sk, int level, int optname,
+		   char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+
+	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of udp_getsockopt(): levels other than SOL_UDP and
+ * SOL_UDPLITE go to compat_ip_getsockopt().
+ */
+int compat_udp_getsockopt(struct sock *sk, int level, int optname,
+				 char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return compat_ip_getsockopt(sk, level, optname, optval, optlen);
+
+	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+/**
+ *	udp_poll - wait for a UDP event.
+ *	@file: file struct
+ *	@sock: socket
+ *	@wait: poll table
+ *
+ *	This is the same as datagram_poll(), except for the special case of
+ *	blocking sockets.  If an application uses a blocking fd and the only
+ *	queued packet has a checksum error, select() could report data as
+ *	available and the following read would then block.  The readable
+ *	bits are therefore cleared when first_packet_length() finds no
+ *	valid packet, to work around these arguably broken applications.
+ */
+unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	unsigned int mask = datagram_poll(file, sock, wait);
+	struct sock *sk = sock->sk;
+
+	/* Only a "readable" report on a blocking, open socket is re-checked */
+	if (!(mask & POLLRDNORM))
+		return mask;
+	if (file->f_flags & O_NONBLOCK)
+		return mask;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return mask;
+
+	/* Check for false positives due to checksum errors */
+	if (!first_packet_length(sk))
+		mask &= ~(POLLIN | POLLRDNORM);
+
+	return mask;
+}
+EXPORT_SYMBOL(udp_poll);
+
+/*
+ * struct proto for IPv4 UDP sockets: wires the handlers of this file and
+ * the UDP memory accounting variables to the global udp_table.
+ */
+struct proto udp_prot = {
+	.name		   = "UDP",
+	.owner		   = THIS_MODULE,
+	.close		   = udp_lib_close,
+	.connect	   = ip4_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = udp_ioctl,
+	.destroy	   = udp_destroy_sock,
+	.setsockopt	   = udp_setsockopt,
+	.getsockopt	   = udp_getsockopt,
+	.sendmsg	   = udp_sendmsg,
+	.recvmsg	   = udp_recvmsg,
+	.sendpage	   = udp_sendpage,
+	.backlog_rcv	   = __udp_queue_rcv_skb,
+	.hash		   = udp_lib_hash,
+	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v4_rehash,
+	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
+	.obj_size	   = sizeof(struct udp_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
+	.h.udp_table	   = &udp_table,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_udp_setsockopt,
+	.compat_getsockopt = compat_udp_getsockopt,
+#endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+};
+EXPORT_SYMBOL(udp_prot);
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Find the first socket, scanning buckets from @start upwards, that belongs
+ * to the seq file's network namespace and to state->family.
+ *
+ * On success the socket is returned with the lock of bucket state->bucket
+ * still held (taken with spin_lock_bh()).  Returns NULL, with no lock held
+ * and state->bucket past the table mask, when nothing matches.
+ */
+static struct sock *udp_get_first(struct seq_file *seq, int start)
+{
+	struct sock *sk;
+	struct udp_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	for (state->bucket = start; state->bucket <= state->udp_table->mask;
+	     ++state->bucket) {
+		struct hlist_nulls_node *node;
+		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+		/* unlocked peek: skip empty buckets without taking the lock */
+		if (hlist_nulls_empty(&hslot->head))
+			continue;
+
+		spin_lock_bh(&hslot->lock);
+		sk_nulls_for_each(sk, node, &hslot->head) {
+			if (!net_eq(sock_net(sk), net))
+				continue;
+			if (sk->sk_family == state->family)
+				goto found;	/* leaves hslot->lock held */
+		}
+		spin_unlock_bh(&hslot->lock);
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+/*
+ * Advance from @sk to the next socket matching the seq file's namespace and
+ * state->family.  When the current bucket is exhausted its lock is released
+ * and the search continues with udp_get_first() on the following bucket.
+ */
+static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct udp_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	do {
+		sk = sk_nulls_next(sk);
+	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+
+	if (!sk) {
+		/* end of this chain: drop its lock before moving on */
+		if (state->bucket <= state->udp_table->mask)
+			spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+		return udp_get_first(seq, state->bucket + 1);
+	}
+	return sk;
+}
+
+/*
+ * Return the matching socket at position @pos (0-based), or NULL when fewer
+ * than @pos + 1 sockets match.
+ */
+static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk = udp_get_first(seq, 0);
+
+	while (sk && pos) {
+		sk = udp_get_next(seq, sk);
+		if (sk)
+			--pos;
+	}
+
+	if (pos)
+		return NULL;
+	return sk;
+}
+
+/*
+ * seq_file ->start: position 0 yields the header token, position N > 0 the
+ * socket at index N - 1.  state->bucket is first set to MAX_UDP_PORTS.
+ */
+static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct udp_iter_state *state = seq->private;
+
+	state->bucket = MAX_UDP_PORTS;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return udp_get_idx(seq, *pos - 1);
+}
+
+/*
+ * seq_file ->next: step from the header token to the first socket, or from
+ * socket @v to its successor, and advance *@pos.
+ */
+static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *next;
+
+	next = (v == SEQ_START_TOKEN) ? udp_get_idx(seq, 0)
+				      : udp_get_next(seq, v);
+	++*pos;
+
+	return next;
+}
+
+/*
+ * seq_file ->stop: release the bucket lock when state->bucket still refers
+ * to a bucket inside the table.
+ */
+static void udp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct udp_iter_state *state = seq->private;
+
+	if (state->bucket > state->udp_table->mask)
+		return;
+
+	spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
+}
+
+/*
+ * open() for the per-family /proc file: set up a per-net seq_file and copy
+ * the family and table from the udp_seq_afinfo stored in the proc entry
+ * into the iterator state.
+ */
+static int udp_seq_open(struct inode *inode, struct file *file)
+{
+	struct udp_seq_afinfo *afinfo = PDE(inode)->data;
+	struct udp_iter_state *state;
+	struct seq_file *seq;
+	int err;
+
+	err = seq_open_net(inode, file, &afinfo->seq_ops,
+			   sizeof(struct udp_iter_state));
+	if (err < 0)
+		return err;
+
+	seq = file->private_data;
+	state = seq->private;
+	state->family = afinfo->family;
+	state->udp_table = afinfo->udp_table;
+
+	return err;
+}
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Fill in the file and seq operations of @afinfo and create its entry under
+ * @net's proc_net directory.  Returns 0, or -ENOMEM when the entry cannot be
+ * created.
+ */
+int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
+{
+	struct proc_dir_entry *entry;
+
+	afinfo->seq_fops.open		= udp_seq_open;
+	afinfo->seq_fops.read		= seq_read;
+	afinfo->seq_fops.llseek		= seq_lseek;
+	afinfo->seq_fops.release	= seq_release_net;
+
+	afinfo->seq_ops.start		= udp_seq_start;
+	afinfo->seq_ops.next		= udp_seq_next;
+	afinfo->seq_ops.stop		= udp_seq_stop;
+
+	entry = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+				 &afinfo->seq_fops, afinfo);
+
+	return entry ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(udp_proc_register);
+
+/* Remove the proc entry named afinfo->name from @net. */
+void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
+{
+	const char *name = afinfo->name;
+
+	proc_net_remove(net, name);
+}
+EXPORT_SYMBOL(udp_proc_unregister);
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Print one socket line to @f: bucket number, local and remote
+ * address:port in hex, state, tx/rx queue sizes, uid, inode, refcount,
+ * socket pointer and drop count.  Several columns are printed as constant
+ * zeroes.  The trailing %n conversion stores the number of characters
+ * printed in *@len so that the caller can pad the line.
+ */
+static void udp4_format_sock(struct sock *sp, struct seq_file *f,
+		int bucket, int *len)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr;
+	__be32 src  = inet->inet_rcv_saddr;
+	__u16 destp	  = ntohs(inet->inet_dport);
+	__u16 srcp	  = ntohs(inet->inet_sport);
+
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
+		bucket, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		sk_rmem_alloc_get(sp),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp,
+		atomic_read(&sp->sk_drops), len);
+}
+
+/*
+ * seq_file ->show: print the column header for the start token, otherwise
+ * one socket line.  Both are padded with spaces to 127 characters before
+ * the newline.  Always returns 0.
+ */
+int udp4_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN) {
+		struct udp_iter_state *state = seq->private;
+		int len;
+
+		udp4_format_sock(v, seq, state->bucket, &len);
+		seq_printf(seq, "%*s\n", 127 - len, "");
+		return 0;
+	}
+
+	seq_printf(seq, "%-127s\n",
+		   "  sl  local_address rem_address   st tx_queue "
+		   "rx_queue tr tm->when retrnsmt   uid  timeout "
+		   "inode ref pointer drops");
+	return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+/*
+ * Description of the "udp" proc file: AF_INET sockets of udp_table.  The
+ * remaining seq_fops/seq_ops members are filled in by udp_proc_register().
+ */
+static struct udp_seq_afinfo udp4_seq_afinfo = {
+	.name		= "udp",
+	.family		= AF_INET,
+	.udp_table	= &udp_table,
+	.seq_fops	= {
+		.owner	=	THIS_MODULE,
+	},
+	.seq_ops	= {
+		.show		= udp4_seq_show,
+	},
+};
+
+/* Per-namespace init: register the "udp" proc file in @net. */
+static int __net_init udp4_proc_init_net(struct net *net)
+{
+	struct udp_seq_afinfo *afinfo = &udp4_seq_afinfo;
+
+	return udp_proc_register(net, afinfo);
+}
+
+/* Per-namespace exit: remove the "udp" proc file from @net. */
+static void __net_exit udp4_proc_exit_net(struct net *net)
+{
+	struct udp_seq_afinfo *afinfo = &udp4_seq_afinfo;
+
+	udp_proc_unregister(net, afinfo);
+}
+
+/* Per-namespace hooks that create and remove the "udp" proc file. */
+static struct pernet_operations udp4_net_ops = {
+	.init = udp4_proc_init_net,
+	.exit = udp4_proc_exit_net,
+};
+
+/* Register the per-namespace proc hooks; returns register_pernet_subsys()'s result. */
+int __init udp4_proc_init(void)
+{
+	int err = register_pernet_subsys(&udp4_net_ops);
+
+	return err;
+}
+
+/* Undo udp4_proc_init(). */
+void udp4_proc_exit(void)
+{
+	struct pernet_operations *ops = &udp4_net_ops;
+
+	unregister_pernet_subsys(ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+static __initdata unsigned long uhash_entries;
+/*
+ * Parse the "uhash_entries=" boot parameter.  A non-zero value below
+ * UDP_HTABLE_SIZE_MIN is raised to that minimum.  Returns 0 when no string
+ * was given and 1 otherwise.
+ */
+static int __init set_uhash_entries(char *str)
+{
+	unsigned long entries;
+
+	if (!str)
+		return 0;
+
+	entries = simple_strtoul(str, &str, 0);
+	if (entries != 0 && entries < UDP_HTABLE_SIZE_MIN)
+		entries = UDP_HTABLE_SIZE_MIN;
+
+	uhash_entries = entries;
+	return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
+
+/*
+ * Allocate and initialise the hash table @table, labelled @name.
+ *
+ * A single allocation holds both hash arrays: the primary array in the
+ * first (mask + 1) slots and the secondary array (table->hash2) right
+ * behind it.  When CONFIG_BASE_SMALL is set, or the system hash allocator
+ * returned fewer than UDP_HTABLE_SIZE_MIN slots, a minimum-size table is
+ * allocated with kmalloc() instead.  Panics when that allocation fails.
+ */
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+	unsigned int i;
+
+	if (!CONFIG_BASE_SMALL)
+		table->hash = alloc_large_system_hash(name,
+			2 * sizeof(struct udp_hslot),
+			uhash_entries,
+			21, /* one slot per 2 MB */
+			0,
+			&table->log,
+			&table->mask,
+			64 * 1024);
+	/*
+	 * Make sure hash table has the minimum size
+	 */
+	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
+		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
+				      2 * sizeof(struct udp_hslot), GFP_KERNEL);
+		if (!table->hash)
+			/* never pass a non-literal as the format string */
+			panic("%s", name);
+		table->log = ilog2(UDP_HTABLE_SIZE_MIN);
+		table->mask = UDP_HTABLE_SIZE_MIN - 1;
+	}
+	table->hash2 = table->hash + (table->mask + 1);
+	for (i = 0; i <= table->mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+		table->hash[i].count = 0;
+		spin_lock_init(&table->hash[i].lock);
+	}
+	for (i = 0; i <= table->mask; i++) {
+		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+		table->hash2[i].count = 0;
+		spin_lock_init(&table->hash2[i].lock);
+	}
+}
+
+/*
+ * Boot-time UDP setup: build the global hash table and derive the three
+ * sysctl_udp_mem thresholds from one eighth of the free buffer pages (at
+ * least 128).  The minimum rmem/wmem values are set to SK_MEM_QUANTUM.
+ */
+void __init udp_init(void)
+{
+	unsigned long pages;
+
+	udp_table_init(&udp_table, "UDP");
+
+	pages = nr_free_buffer_pages() / 8;
+	if (pages < 128UL)
+		pages = 128UL;
+
+	sysctl_udp_mem[0] = pages / 4 * 3;
+	sysctl_udp_mem[1] = pages;
+	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+}
+
+/*
+ * Prepare @skb for checksum completion later on: store the complemented
+ * pseudo-header checksum in the UDP header and mark the skb
+ * CHECKSUM_PARTIAL with csum_start/csum_offset pointing at the UDP
+ * checksum field.  Returns -EINVAL when the UDP header cannot be pulled,
+ * 0 otherwise.
+ */
+int udp4_ufo_send_check(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct udphdr *uh;
+
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		return -EINVAL;
+
+	uh = udp_hdr(skb);
+	iph = ip_hdr(skb);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
+				       IPPROTO_UDP, 0);
+
+	return 0;
+}
+
+/*
+ * Segment a UDP fragmentation offload skb.
+ *
+ * Returns ERR_PTR(-EINVAL) when the skb is not longer than its gso_size or
+ * carries unexpected gso_type bits, NULL when skb_gso_ok() accepts the skb
+ * for @features (only gso_segs is recomputed), and otherwise the result of
+ * skb_segment() after the UDP checksum has been completed in software.
+ */
+struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	unsigned int mss;
+	int offset;
+	__wsum csum;
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))
+		goto out;
+
+	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+		/* Packet is from an untrusted source, reset gso_segs. */
+		int type = skb_shinfo(skb)->gso_type;
+
+		/* SKB_GSO_UDP must be set; only SKB_GSO_DODGY may accompany it */
+		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+			     !(type & (SKB_GSO_UDP))))
+			goto out;
+
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+		segs = NULL;
+		goto out;
+	}
+
+	/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+	 * do checksum of UDP packets sent as multiple IP fragments.
+	 */
+	offset = skb_checksum_start_offset(skb);
+	csum = skb_checksum(skb, offset, skb->len - offset, 0);
+	/* offset now addresses the checksum field itself */
+	offset += skb->csum_offset;
+	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Fragment the skb. IP headers of the fragments are updated in
+	 * inet_gso_segment()
+	 */
+	segs = skb_segment(skb, features);
+out:
+	return segs;
+}
+
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
new file mode 100644
index 00000000..aaad650d
--- /dev/null
+++ b/net/ipv4/udp_impl.h
@@ -0,0 +1,34 @@
+#ifndef _UDP4_IMPL_H
+#define _UDP4_IMPL_H
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+
+extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
+extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+
+extern int	udp_v4_get_port(struct sock *sk, unsigned short snum);
+
+extern int	udp_setsockopt(struct sock *sk, int level, int optname,
+			       char __user *optval, unsigned int optlen);
+extern int	udp_getsockopt(struct sock *sk, int level, int optname,
+			       char __user *optval, int __user *optlen);
+
+#ifdef CONFIG_COMPAT
+extern int	compat_udp_setsockopt(struct sock *sk, int level, int optname,
+				      char __user *optval, unsigned int optlen);
+extern int	compat_udp_getsockopt(struct sock *sk, int level, int optname,
+				      char __user *optval, int __user *optlen);
+#endif /* CONFIG_COMPAT */
+extern int	udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+			    size_t len, int noblock, int flags, int *addr_len);
+extern int	udp_sendpage(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags);
+extern int	udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
+extern void	udp_destroy_sock(struct sock *sk);
+
+#ifdef CONFIG_PROC_FS
+extern int	udp4_seq_show(struct seq_file *seq, void *v);
+#endif /* CONFIG_PROC_FS */
+#endif	/* _UDP4_IMPL_H */
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
new file mode 100644
index 00000000..aee9963f
--- /dev/null
+++ b/net/ipv4/udplite.c
@@ -0,0 +1,131 @@
+/*
+ *  UDPLITE     An implementation of the UDP-Lite protocol (RFC 3828).
+ *
+ *  Authors:    Gerrit Renker       <gerrit@erg.abdn.ac.uk>
+ *
+ *  Changes:
+ *  Fixes:
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include "udp_impl.h"
+
+struct udp_table 	udplite_table __read_mostly;	/* set up by udplite4_register() */
+EXPORT_SYMBOL(udplite_table);
+
+static int udplite_rcv(struct sk_buff *skb)
+{	/* common UDP receive path, run against the UDP-Lite table */
+	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
+}
+
+static void udplite_err(struct sk_buff *skb, u32 info)
+{	/* error counterpart of udplite_rcv(); @info is passed through */
+	__udp4_lib_err(skb, info, &udplite_table);
+}
+
+static const struct net_protocol udplite_protocol = {	/* see udplite4_register() */
+	.handler	= udplite_rcv,
+	.err_handler	= udplite_err,
+	.no_policy	= 1,
+	.netns_ok	= 1,
+};
+
+struct proto 	udplite_prot = {	/* mostly the generic udp_* operations */
+	.name		   = "UDP-Lite",
+	.owner		   = THIS_MODULE,
+	.close		   = udp_lib_close,
+	.connect	   = ip4_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = udp_ioctl,
+	.init		   = udplite_sk_init,
+	.destroy	   = udp_destroy_sock,
+	.setsockopt	   = udp_setsockopt,
+	.getsockopt	   = udp_getsockopt,
+	.sendmsg	   = udp_sendmsg,
+	.recvmsg	   = udp_recvmsg,
+	.sendpage	   = udp_sendpage,
+	.backlog_rcv	   = udp_queue_rcv_skb,
+	.hash		   = udp_lib_hash,
+	.unhash		   = udp_lib_unhash,
+	.get_port	   = udp_v4_get_port,
+	.obj_size	   = sizeof(struct udp_sock),	/* same socket object as UDP */
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
+	.h.udp_table	   = &udplite_table,	/* dedicated UDP-Lite table */
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_udp_setsockopt,
+	.compat_getsockopt = compat_udp_getsockopt,
+#endif /* CONFIG_COMPAT */
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+};
+EXPORT_SYMBOL(udplite_prot);
+
+static struct inet_protosw udplite4_protosw = {	/* used by udplite4_register() */
+	.type		=  SOCK_DGRAM,
+	.protocol	=  IPPROTO_UDPLITE,
+	.prot		=  &udplite_prot,
+	.ops		=  &inet_dgram_ops,
+	.no_check	=  0,		/* must checksum (RFC 3828) */
+	.flags		=  INET_PROTOSW_PERMANENT,
+};
+
+#ifdef CONFIG_PROC_FS
+static struct udp_seq_afinfo udplite4_seq_afinfo = {	/* for udp_proc_register() */
+	.name		= "udplite",
+	.family		= AF_INET,
+	.udp_table 	= &udplite_table,
+	.seq_fops	= {
+		.owner	=	THIS_MODULE,
+	},
+	.seq_ops	= {
+		.show		= udp4_seq_show,	/* shared with plain UDP */
+	},
+};
+
+static int __net_init udplite4_proc_init_net(struct net *net)
+{	/* per-namespace setup: register the proc entry for @net */
+	return udp_proc_register(net, &udplite4_seq_afinfo);
+}
+
+static void __net_exit udplite4_proc_exit_net(struct net *net)
+{	/* per-namespace teardown: mirror of udplite4_proc_init_net() */
+	udp_proc_unregister(net, &udplite4_seq_afinfo);
+}
+
+static struct pernet_operations udplite4_net_ops = {
+	.init = udplite4_proc_init_net,
+	.exit = udplite4_proc_exit_net,
+};
+
+static __init int udplite4_proc_init(void)
+{
+	return register_pernet_subsys(&udplite4_net_ops);
+}
+#else /* !CONFIG_PROC_FS */
+static inline int udplite4_proc_init(void)
+{
+	return 0;	/* nothing to register without procfs */
+}
+#endif /* CONFIG_PROC_FS */
+
+void __init udplite4_register(void)
+{
+	udp_table_init(&udplite_table, "UDP-Lite");
+
+	if (proto_register(&udplite_prot, 1) != 0)
+		goto fail;
+	if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) {
+		proto_unregister(&udplite_prot);
+		goto fail;
+	}
+	inet_register_protosw(&udplite4_protosw);
+
+	/* A /proc registration failure is logged but is not fatal. */
+	if (udplite4_proc_init() != 0)
+		printk(KERN_ERR "%s: Cannot register /proc!\n", __func__);
+	return;
+
+fail:
+	printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
+}
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 00000000..06814b62
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,166 @@
+/*
+ * xfrm4_input.c
+ *
+ * Changes:
+ *	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ *	Derek Atkins <derek@ihtfp.com>
+ *		Add Encapsulation support
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return xfrm4_extract_header(skb);	/* @x is not consulted */
+}
+
+static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
+{
+	const struct iphdr *hdr;
+
+	/* Look up an input route only when no dst is attached yet. */
+	if (skb_dst(skb) != NULL)
+		return dst_input(skb);
+	hdr = ip_hdr(skb);
+	if (ip_route_input_noref(skb, hdr->daddr, hdr->saddr,
+				 hdr->tos, skb->dev) == 0)
+		return dst_input(skb);
+	kfree_skb(skb);		/* route lookup failed: drop the packet */
+	return NET_RX_DROP;
+}
+
+int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+		    int encap_type)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET;	/* stored in the skb control block */
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+	return xfrm_input(skb, nexthdr, spi, encap_type);
+}
+EXPORT_SYMBOL(xfrm4_rcv_encap);
+
+int xfrm4_transport_finish(struct sk_buff *skb, int async)
+{
+	struct iphdr *iph = ip_hdr(skb);
+
+	iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+
+#ifndef CONFIG_NETFILTER
+	if (!async)
+		return -iph->protocol;	/* negated protocol number, no hook to run */
+#endif /* !CONFIG_NETFILTER */
+
+	__skb_push(skb, skb->data - skb_network_header(skb));	/* back to the IP header */
+	iph->tot_len = htons(skb->len);
+	ip_send_check(iph);	/* header was edited above */
+
+	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+		xfrm4_rcv_encap_finish);
+	return 0;
+}
+
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct udp_sock *up = udp_sk(sk);
+	struct udphdr *uh;
+	struct iphdr *iph;
+	int iphlen, len;
+
+	__u8 *udpdata;
+	__be32 *udpdata32;
+	__u16 encap_type = up->encap_type;
+
+	/* if this is not encapsulated socket, then just return now */
+	if (!encap_type)
+		return 1;
+
+	/* If this is a paged skb, make sure we pull up
+	 * whatever data we need to look at (at most 8 payload bytes). */
+	len = skb->len - sizeof(struct udphdr);
+	if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+		return 1;
+
+	/* Now we can get the pointers */
+	uh = udp_hdr(skb);
+	udpdata = (__u8 *)uh + sizeof(struct udphdr);
+	udpdata32 = (__be32 *)udpdata;
+
+	switch (encap_type) {
+	default:	/* unknown types are handled like ESPINUDP */
+	case UDP_ENCAP_ESPINUDP:
+		/* Check if this is a keepalive packet.  If so, eat it. */
+		if (len == 1 && udpdata[0] == 0xff) {
+			goto drop;
+		} else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+			/* ESP Packet without Non-ESP header */
+			len = sizeof(struct udphdr);	/* from here on: bytes to strip */
+		} else
+			/* Must be an IKE packet.. pass it through */
+			return 1;
+		break;
+	case UDP_ENCAP_ESPINUDP_NON_IKE:
+		/* Check if this is a keepalive packet.  If so, eat it. */
+		if (len == 1 && udpdata[0] == 0xff) {
+			goto drop;
+		} else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+			   udpdata32[0] == 0 && udpdata32[1] == 0) {
+
+			/* ESP Packet with Non-IKE marker */
+			len = sizeof(struct udphdr) + 2 * sizeof(u32);
+		} else
+			/* Must be an IKE packet.. pass it through */
+			return 1;
+		break;
+	}
+
+	/* At this point we are sure that this is an ESPinUDP packet,
+	 * so we need to remove 'len' bytes from the packet (the UDP
+	 * header and optional ESP marker bytes) and then modify the
+	 * protocol to ESP, and then call into the transform receiver.
+	 */
+	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		goto drop;	/* could not get a private copy to edit */
+
+	/* Now we can update and verify the packet length... */
+	iph = ip_hdr(skb);
+	iphlen = iph->ihl << 2;
+	iph->tot_len = htons(ntohs(iph->tot_len) - len);
+	if (skb->len < iphlen + len) {
+		/* packet is too small!?! */
+		goto drop;
+	}
+
+	/* pull the data buffer up to the ESP header and set the
+	 * transport header to point to ESP.  Keep UDP on the stack
+	 * for later.
+	 */
+	__skb_pull(skb, len);
+	skb_reset_transport_header(skb);
+
+	/* process ESP */
+	return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
+
+drop:
+	kfree_skb(skb);
+	return 0;	/* skb consumed */
+}
+
+int xfrm4_rcv(struct sk_buff *skb)
+{
+	return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);	/* last argument fixed at 0 */
+}
+EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
new file mode 100644
index 00000000..e3db3f91
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -0,0 +1,156 @@
+/*
+ * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
+ *                    Miika Komu     <miika@iki.fi>
+ *                    Herbert Xu     <herbert@gondor.apana.org.au>
+ *                    Abhinav Pathak <abhinav.pathak@hiit.fi>
+ *                    Jeff Ahrenholz <ahrenholz@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static void xfrm4_beet_make_header(struct sk_buff *skb)
+{
+	struct iphdr *hdr = ip_hdr(skb);
+
+	/* Option-less IPv4 header; the rest comes from the saved cb values. */
+	hdr->version = 4;
+	hdr->ihl = 5;
+	hdr->tos = XFRM_MODE_SKB_CB(skb)->tos;
+	hdr->id = XFRM_MODE_SKB_CB(skb)->id;
+	hdr->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
+	hdr->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
+	hdr->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
+	/* saddr, daddr, tot_len and check are not touched here. */
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
+ */
+static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_beet_phdr *ph;
+	struct iphdr *top_iph;
+	int hdrlen, optlen;
+
+	hdrlen = 0;
+	optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+	if (unlikely(optlen))
+		hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);	/* pseudo header needed */
+
+	skb_set_network_header(skb, -x->props.header_len -
+			            hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
+	if (x->sel.family != AF_INET6)
+		skb->network_header += IPV4_BEET_PHMAXLEN;
+	skb->mac_header = skb->network_header +
+			  offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+	xfrm4_beet_make_header(skb);
+
+	ph = (struct ip_beet_phdr *)
+		__skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
+
+	top_iph = ip_hdr(skb);
+
+	if (unlikely(optlen)) {
+		BUG_ON(optlen < 0);
+
+		ph->padlen = 4 - (optlen & 4);
+		ph->hdrlen = optlen / 8;
+		ph->nexthdr = top_iph->protocol;	/* real protocol moves into ph */
+		if (ph->padlen)
+			memset(ph + 1, IPOPT_NOP, ph->padlen);	/* pad with NOP options */
+
+		top_iph->protocol = IPPROTO_BEETPH;
+		top_iph->ihl = sizeof(struct iphdr) / 4;	/* top header has no options */
+	}
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+
+	return 0;
+}
+
+static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	int optlen = 0;
+	int err = -EINVAL;	/* returned on every early exit below */
+
+	if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
+		struct ip_beet_phdr *ph;
+		int phlen;
+
+		if (!pskb_may_pull(skb, sizeof(*ph)))
+			goto out;
+
+		ph = (struct ip_beet_phdr *)skb->data;
+
+		phlen = sizeof(*ph) + ph->padlen;
+		optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
+		if (optlen < 0 || optlen & 3 || optlen > 250)
+			goto out;	/* option length out of range or unaligned */
+
+		XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
+
+		if (!pskb_may_pull(skb, phlen))
+			goto out;
+		__skb_pull(skb, phlen);	/* strip pseudo header and padding */
+	}
+
+	skb_push(skb, sizeof(*iph));	/* room for the rebuilt IPv4 header */
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+
+	xfrm4_beet_make_header(skb);
+
+	iph = ip_hdr(skb);
+
+	iph->ihl += optlen / 4;	/* make_header set ihl to 5 */
+	iph->tot_len = htons(skb->len);
+	iph->daddr = x->sel.daddr.a4;
+	iph->saddr = x->sel.saddr.a4;
+	iph->check = 0;
+	iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+	err = 0;
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm4_beet_mode = {	/* registered for AF_INET below */
+	.input2 = xfrm4_beet_input,
+	.input = xfrm_prepare_input,
+	.output2 = xfrm4_beet_output,	/* called from xfrm4_prepare_output() */
+	.output = xfrm4_prepare_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_BEET,
+	.flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm4_beet_init(void)
+{
+	return xfrm_register_mode(&xfrm4_beet_mode, AF_INET);	/* result is the module init status */
+}
+
+static void __exit xfrm4_beet_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET);
+	BUG_ON(err);	/* a failed unregister is treated as a bug */
+}
+
+module_init(xfrm4_beet_init);
+module_exit(xfrm4_beet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
new file mode 100644
index 00000000..fd840c7d
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -0,0 +1,80 @@
+/*
+ * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header will be moved forward to make space for the encapsulation
+ * header.
+ */
+static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);	/* header at its old position */
+	int ihl = iph->ihl * 4;	/* IP header length in bytes */
+
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->mac_header = skb->network_header +
+			  offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + ihl;
+	__skb_pull(skb, ihl);
+	memmove(skb_network_header(skb), iph, ihl);	/* copy header to its new place */
+	return 0;
+}
+
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, the transport header (skb->h) shall point to where the IP header
+ * should be and the network header (skb->nh) to where it currently is.
+ * skb->data shall point to the start of the payload.
+ */
+static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int ihl = skb->data - skb_transport_header(skb);
+
+	if (skb->transport_header != skb->network_header) {
+		memmove(skb_transport_header(skb),
+			skb_network_header(skb), ihl);
+		skb->network_header = skb->transport_header;
+	}
+	ip_hdr(skb)->tot_len = htons(skb->len + ihl);	/* payload plus moved header */
+	skb_reset_transport_header(skb);
+	return 0;
+}
+
+static struct xfrm_mode xfrm4_transport_mode = {	/* registered for AF_INET below */
+	.input = xfrm4_transport_input,
+	.output = xfrm4_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+static int __init xfrm4_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);	/* result is the init status */
+}
+
+static void __exit xfrm4_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
+	BUG_ON(err);	/* a failed unregister is treated as a bug */
+}
+
+module_init(xfrm4_transport_init);
+module_exit(xfrm4_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
new file mode 100644
index 00000000..ed4bf11e
--- /dev/null
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -0,0 +1,121 @@
+/*
+ * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
+ *
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+	/* Only a CE mark in the saved TOS is carried to the inner header. */
+	if (!INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+		return;
+	IP_ECN_set_ce(ipip_hdr(skb));
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.
+ */
+static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct iphdr *top_iph;
+	int flags;
+
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->mac_header = skb->network_header +
+			  offsetof(struct iphdr, protocol);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+	top_iph = ip_hdr(skb);
+
+	top_iph->ihl = 5;	/* outer header has no options */
+	top_iph->version = 4;
+
+	top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
+
+	/* DS disclosed: the saved TOS is passed as both outer and inner value */
+	top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos,
+					    XFRM_MODE_SKB_CB(skb)->tos);
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));	/* keep only DF */
+	ip_select_ident(top_iph, dst->child, NULL);
+
+	top_iph->ttl = ip4_dst_hoplimit(dst->child);
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+
+	return 0;
+}
+
+static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
+		goto out;	/* only IPv4-in-IPv4 is accepted here */
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	if (skb_cloned(skb) &&
+	    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		goto out;	/* err now holds pskb_expand_head()'s result */
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip_ecn_decapsulate(skb);
+
+	skb_reset_network_header(skb);	/* network header now at skb->data */
+	skb_mac_header_rebuild(skb);
+
+	err = 0;
+
+out:
+	return err;
+}
+
+static struct xfrm_mode xfrm4_tunnel_mode = {	/* registered for AF_INET below */
+	.input2 = xfrm4_mode_tunnel_input,
+	.input = xfrm_prepare_input,
+	.output2 = xfrm4_mode_tunnel_output,	/* called from xfrm4_prepare_output() */
+	.output = xfrm4_prepare_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+	.flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+static int __init xfrm4_mode_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);	/* result is the init status */
+}
+
+static void __exit xfrm4_mode_tunnel_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
+	BUG_ON(err);	/* a failed unregister is treated as a bug */
+}
+
+module_init(xfrm4_mode_tunnel_init);
+module_exit(xfrm4_mode_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 00000000..327a617d
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,101 @@
+/*
+ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+
+static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+{
+	int mtu;
+
+	/* Flag is set by xfrm4_prepare_output(); nothing to check then. */
+	if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
+		return 0;
+
+	/* DF clear or skb->local_df set: no size limit is enforced. */
+	if (skb->local_df || !(ip_hdr(skb)->frag_off & htons(IP_DF)))
+		return 0;
+
+	mtu = dst_mtu(skb_dst(skb));
+	if (skb->len <= mtu)
+		return 0;
+
+	/* Too big: report to the owning socket if any, else send ICMP. */
+	if (skb->sk)
+		ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr,
+			       inet_sk(skb->sk)->inet_dport, mtu);
+	else
+		icmp_send(skb, ICMP_DEST_UNREACH,
+			  ICMP_FRAG_NEEDED, htonl(mtu));
+	return -EMSGSIZE;
+}
+
+int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+
+	err = xfrm4_tunnel_check_size(skb);
+	if (err)
+		return err;	/* -EMSGSIZE from the size check */
+
+	XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol;	/* save current protocol */
+
+	return xfrm4_extract_header(skb);
+}
+
+int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+
+	err = xfrm_inner_extract_output(x, skb);
+	if (err)
+		return err;
+
+	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));	/* start from a zeroed IPCB */
+	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED;
+
+	skb->protocol = htons(ETH_P_IP);
+
+	return x->outer_mode->output2(x, skb);	/* mode-specific header construction */
+}
+EXPORT_SYMBOL(xfrm4_prepare_output);
+
+int xfrm4_output_finish(struct sk_buff *skb)
+{
+#ifdef CONFIG_NETFILTER
+	if (!skb_dst(skb)->xfrm) {	/* dst carries no transform */
+		IPCB(skb)->flags |= IPSKB_REROUTED;
+		return dst_output(skb);
+	}
+
+	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
+#endif /* CONFIG_NETFILTER */
+
+	skb->protocol = htons(ETH_P_IP);
+	return xfrm_output(skb);
+}
+
+int xfrm4_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+
+	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
+			    NULL, dst->dev,
+			    x->outer_mode->afinfo->output_finish,
+			    !(IPCB(skb)->flags & IPSKB_REROUTED));	/* hook condition */
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 00000000..581fe0ab
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,305 @@
+/*
+ * xfrm4_policy.c
+ *
+ * Changes:
+ *	Kazunori MIYAZAWA @USAGI
+ * 	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/inetdevice.h>
+#include <linux/if_tunnel.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo;	/* initialized further down */
+
+static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
+					    int tos,
+					    const xfrm_address_t *saddr,
+					    const xfrm_address_t *daddr)
+{
+	struct rtable *route;
+
+	/* Build the lookup key from scratch; @saddr is optional. */
+	memset(fl4, 0, sizeof(*fl4));
+	fl4->flowi4_tos = tos;
+	fl4->daddr = daddr->a4;
+	if (saddr != NULL)
+		fl4->saddr = saddr->a4;
+
+	route = __ip_route_output_key(net, fl4);
+	if (IS_ERR(route))
+		return ERR_CAST(route);
+	return &route->dst;
+}
+
+static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
+					  const xfrm_address_t *saddr,
+					  const xfrm_address_t *daddr)
+{
+	struct flowi4 fl4;	/* scratch key, discarded on return */
+
+	return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
+}
+
+static int xfrm4_get_saddr(struct net *net,
+			   xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+	struct dst_entry *dst;
+	struct flowi4 fl4;
+
+	dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);	/* tos 0, no source given */
+	if (IS_ERR(dst))
+		return -EHOSTUNREACH;
+
+	saddr->a4 = fl4.saddr;	/* presumably filled in by the route lookup */
+	dst_release(dst);	/* only the address was needed */
+	return 0;
+}
+
+static int xfrm4_get_tos(const struct flowi *fl)
+{
+	return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
+}
+
+static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+			   int nfheader_len)
+{
+	return 0;	/* no per-path setup is done for IPv4 */
+}
+
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+			  const struct flowi *fl)
+{
+	struct rtable *rt = (struct rtable *)xdst->route;
+	const struct flowi4 *fl4 = &fl->u.ip4;
+
+	xdst->u.rt.rt_key_dst = fl4->daddr;	/* key fields come from the flow */
+	xdst->u.rt.rt_key_src = fl4->saddr;
+	xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
+	xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
+	xdst->u.rt.rt_iif = fl4->flowi4_iif;
+	xdst->u.rt.rt_oif = fl4->flowi4_oif;
+	xdst->u.rt.rt_mark = fl4->flowi4_mark;
+
+	xdst->u.dst.dev = dev;
+	dev_hold(dev);	/* reference for the pointer stored above */
+
+	xdst->u.rt.peer = rt->peer;
+	if (rt->peer)
+		atomic_inc(&rt->peer->refcnt);	/* released in xfrm4_dst_destroy() */
+
+	/* NOTE: the original author flagged this copy of the route fields
+	 * as needing an audit. */
+	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
+					      RTCF_LOCAL);
+	xdst->u.rt.rt_type = rt->rt_type;
+	xdst->u.rt.rt_src = rt->rt_src;
+	xdst->u.rt.rt_dst = rt->rt_dst;
+	xdst->u.rt.rt_gateway = rt->rt_gateway;
+	xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
+
+	return 0;
+}
+
+/* Linearize @len bytes at network-header offset @off; NULL if unavailable. */
+static u8 *xfrm4_pull_xprt(struct sk_buff *skb, int off, int len)
+{
+	int need = skb_network_offset(skb) + off + len;
+
+	if (need >= 0 && !pskb_may_pull(skb, need))
+		return NULL;
+	return skb_network_header(skb) + off;	/* recomputed: head may move */
+}
+
+/* Fill @fl from the IPv4 packet in @skb; @reverse swaps source and dest. */
+static void
+_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	int ihl = iph->ihl * 4;
+	struct flowi4 *fl4 = &fl->u.ip4;
+	u8 *xprth;
+
+	memset(fl4, 0, sizeof(struct flowi4));
+	fl4->flowi4_mark = skb->mark;
+	/* Copy IP header fields now: pskb_may_pull() may reallocate the head. */
+	fl4->flowi4_proto = iph->protocol;
+	fl4->daddr = reverse ? iph->saddr : iph->daddr;
+	fl4->saddr = reverse ? iph->daddr : iph->saddr;
+	fl4->flowi4_tos = iph->tos;
+
+	if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+		return;	/* fragment: upper-layer fields stay zeroed */
+
+	switch (fl4->flowi4_proto) {
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+	case IPPROTO_TCP:
+	case IPPROTO_SCTP:
+	case IPPROTO_DCCP:
+		xprth = xfrm4_pull_xprt(skb, ihl, 4);
+		if (xprth) {
+			fl4->fl4_sport = ((__be16 *)xprth)[!!reverse];
+			fl4->fl4_dport = ((__be16 *)xprth)[!reverse];
+		}
+		break;
+	case IPPROTO_ICMP:
+		xprth = xfrm4_pull_xprt(skb, ihl, 2);
+		if (xprth) {
+			fl4->fl4_icmp_type = xprth[0];
+			fl4->fl4_icmp_code = xprth[1];
+		}
+		break;
+	case IPPROTO_ESP:
+		xprth = xfrm4_pull_xprt(skb, ihl, 4);
+		if (xprth)
+			fl4->fl4_ipsec_spi = ((__be32 *)xprth)[0];
+		break;
+	case IPPROTO_AH:
+		xprth = xfrm4_pull_xprt(skb, ihl, 8);
+		if (xprth)
+			fl4->fl4_ipsec_spi = ((__be32 *)xprth)[1];
+		break;
+	case IPPROTO_COMP:
+		xprth = xfrm4_pull_xprt(skb, ihl, 4);
+		if (xprth) {
+			__be16 *ipcomp_hdr = (__be16 *)xprth;
+
+			fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+		}
+		break;
+	case IPPROTO_GRE:
+		xprth = xfrm4_pull_xprt(skb, ihl, 12);
+		if (xprth && (*(__be16 *)xprth & GRE_KEY)) {
+			__be32 *gre_hdr = (__be32 *)xprth;
+
+			if (*(__be16 *)xprth & GRE_CSUM)
+				gre_hdr++;	/* GRE_CSUM adds a word before the key */
+			fl4->fl4_gre_key = gre_hdr[1];
+		}
+		break;
+	default:
+		fl4->fl4_ipsec_spi = 0;
+		break;
+	}
+}
+
+static inline int xfrm4_garbage_collect(struct dst_ops *ops)
+{
+	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);	/* @ops is embedded in a struct net */
+
+	xfrm4_policy_afinfo.garbage_collect(net);
+	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);	/* nonzero above 2 * gc_thresh */
+}
+
+static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
+
+	path->ops->update_pmtu(path, mtu);	/* delegate to the underlying route */
+}
+
+static void xfrm4_dst_destroy(struct dst_entry *dst)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+	dst_destroy_metrics_generic(dst);
+
+	if (likely(xdst->u.rt.peer))
+		inet_putpeer(xdst->u.rt.peer);	/* pairs with the ref taken in xfrm4_fill_dst() */
+
+	xfrm_dst_destroy(xdst);
+}
+
+static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			     int unregister)
+{
+	if (!unregister)
+		return;	/* nothing is done unless @unregister is set */
+
+	xfrm_dst_ifdown(dst, dev);
+}
+
+static struct dst_ops xfrm4_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		cpu_to_be16(ETH_P_IP),
+	.gc =			xfrm4_garbage_collect,
+	.update_pmtu =		xfrm4_update_pmtu,
+	.cow_metrics =		dst_cow_metrics_generic,
+	.destroy =		xfrm4_dst_destroy,
+	.ifdown =		xfrm4_dst_ifdown,
+	.local_out =		__ip_local_out,
+	.gc_thresh =		1024,	/* default; overwritten in xfrm4_init() */
+};
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {	/* registered by xfrm4_policy_init() */
+	.family = 		AF_INET,
+	.dst_ops =		&xfrm4_dst_ops,
+	.dst_lookup =		xfrm4_dst_lookup,
+	.get_saddr =		xfrm4_get_saddr,
+	.decode_session =	_decode_session4,
+	.get_tos =		xfrm4_get_tos,
+	.init_path =		xfrm4_init_path,
+	.fill_dst =		xfrm4_fill_dst,
+	.blackhole_route =	ipv4_blackhole_route,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm4_policy_table[] = {
+	{
+		.procname       = "xfrm4_gc_thresh",
+		.data           = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,	/* init_net's copy */
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }	/* empty terminating entry */
+};
+
+static struct ctl_table_header *sysctl_hdr;	/* set in xfrm4_init() */
+#endif /* CONFIG_SYSCTL */
+
+static void __init xfrm4_policy_init(void)
+{
+	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);	/* return value is not checked */
+}
+
+static void __exit xfrm4_policy_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (sysctl_hdr)
+		unregister_net_sysctl_table(sysctl_hdr);	/* registered in xfrm4_init() */
+#endif /* CONFIG_SYSCTL */
+	xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+}
+
+void __init xfrm4_init(int rt_max_size)
+{
+	/*
+	 * Select a default value for the gc_thresh based on the main route
+	 * table hash size.  It seems to me the worst case scenario is when
+	 * we have ipsec operating in transport mode, in which we create a
+	 * dst_entry per socket.  The xfrm gc algorithm starts trying to remove
+	 * entries at gc_thresh, and prevents new allocations at 2*gc_thresh,
+	 * so let's set an initial xfrm gc_thresh value at rt_max_size/2.
+	 * That will let us store an ipsec connection per route table entry,
+	 * and start cleaning when we're 1/2 full.
+	 */
+	xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+	dst_entries_init(&xfrm4_dst_ops);
+
+	xfrm4_state_init();
+	xfrm4_policy_init();
+#ifdef CONFIG_SYSCTL
+	sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
+						xfrm4_policy_table);
+#endif /* CONFIG_SYSCTL */
+}
+
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 00000000..d9ac0a00
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,98 @@
+/*
+ * xfrm4_state.c
+ *
+ * Changes:
+ * 	YOSHIFUJI Hideaki @USAGI
+ * 		Split up af-specific portion
+ *
+ */
+
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/netfilter_ipv4.h>
+
+static int xfrm4_init_flags(struct xfrm_state *x)
+{
+	if (ipv4_config.no_pmtu_disc)
+		x->props.flags |= XFRM_STATE_NOPMTUDISC;	/* inherit the global IPv4 setting */
+	return 0;
+}
+
+static void
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+	const struct flowi4 *fl4 = &fl->u.ip4;
+
+	sel->daddr.a4 = fl4->daddr;
+	sel->saddr.a4 = fl4->saddr;
+	sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
+	sel->dport_mask = htons(0xffff);	/* all port bits significant */
+	sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET;
+	sel->prefixlen_d = 32;	/* full 32-bit address match */
+	sel->prefixlen_s = 32;
+	sel->proto = fl4->flowi4_proto;
+	sel->ifindex = fl4->flowi4_oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
+{
+	x->id = tmpl->id;
+	if (x->id.daddr.a4 == 0)	/* template address is zero: use @daddr */
+		x->id.daddr.a4 = daddr->a4;
+	x->props.saddr = tmpl->saddr;
+	if (x->props.saddr.a4 == 0)	/* same fallback for the source */
+		x->props.saddr.a4 = saddr->a4;
+	x->props.mode = tmpl->mode;
+	x->props.reqid = tmpl->reqid;
+	x->props.family = AF_INET;
+}
+
+int xfrm4_extract_header(struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+
+	XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);	/* fixed part only; see optlen */
+	XFRM_MODE_SKB_CB(skb)->id = iph->id;
+	XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
+	XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
+	XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
+	XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);	/* bytes of IP options */
+	memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
+	       sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
+
+	return 0;	/* always succeeds */
+}
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo = {	/* registered by xfrm4_state_init() */
+	.family			= AF_INET,
+	.proto			= IPPROTO_IPIP,
+	.eth_proto		= htons(ETH_P_IP),
+	.owner			= THIS_MODULE,
+	.init_flags		= xfrm4_init_flags,
+	.init_tempsel		= __xfrm4_init_tempsel,
+	.init_temprop		= xfrm4_init_temprop,
+	.output			= xfrm4_output,
+	.output_finish		= xfrm4_output_finish,	/* invoked via xfrm4_output() */
+	.extract_input		= xfrm4_extract_input,
+	.extract_output		= xfrm4_extract_output,
+	.transport_finish	= xfrm4_transport_finish,
+};
+
+void __init xfrm4_state_init(void)
+{
+	xfrm_state_register_afinfo(&xfrm4_state_afinfo);	/* return value is not checked */
+}
+
+#if 0	/* compiled out: no unregister path is built */
+void __exit xfrm4_state_fini(void)
+{
+	xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
+}
+#endif  /*  0  */
+
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 00000000..82806455
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,113 @@
+/* xfrm4_tunnel.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+
+/*
+ * Output hook of the IPIP xfrm type: push the skb data pointer by the
+ * negated network-header offset.  Always returns 0.
+ */
+static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int room = -skb_network_offset(skb);
+
+	skb_push(skb, room);
+
+	return 0;
+}
+
+/*
+ * Input hook of the IPIP xfrm type: report the protocol number found in
+ * the IPv4 header of @skb.
+ */
+static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+
+	return iph->protocol;
+}
+
+/*
+ * Validate and set up a state of the IPIP type.
+ *
+ * Only tunnel mode without encapsulation is accepted; anything else
+ * yields -EINVAL.  On success the header length is set to the size of
+ * an IPv4 header and 0 is returned.
+ */
+static int ipip_init_state(struct xfrm_state *x)
+{
+	if (x->props.mode != XFRM_MODE_TUNNEL || x->encap)
+		return -EINVAL;
+
+	x->props.header_len = sizeof(struct iphdr);
+	return 0;
+}
+
+/*
+ * Destructor of the IPIP xfrm type.  Intentionally empty:
+ * ipip_init_state() allocates nothing that would need releasing.
+ */
+static void ipip_destroy(struct xfrm_state *x)
+{
+}
+
+/*
+ * Description of the IPIP transform type; registered for AF_INET in
+ * ipip_init() and unregistered in ipip_fini().
+ */
+static const struct xfrm_type ipip_type = {
+	.description	= "IPIP",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_IPIP,
+	.init_state	= ipip_init_state,
+	.destructor	= ipip_destroy,
+	.input		= ipip_xfrm_rcv,
+	.output		= ipip_output
+};
+
+/*
+ * Tunnel receive handler: pass @skb to xfrm4_rcv_spi() as IPPROTO_IPIP,
+ * using the IPv4 source address of the packet as the SPI argument.
+ */
+static int xfrm_tunnel_rcv(struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+
+	return xfrm4_rcv_spi(skb, IPPROTO_IPIP, iph->saddr);
+}
+
+/*
+ * Tunnel error handler.  Ignores both arguments and always returns
+ * -ENOENT.
+ */
+static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
+{
+	return -ENOENT;
+}
+
+/* Handler registered for AF_INET by ipip_init(). */
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
+	.handler	=	xfrm_tunnel_rcv,
+	.err_handler	=	xfrm_tunnel_err,
+	.priority	=	2,
+};
+
+/*
+ * Handler registered for AF_INET6 by ipip_init().  Same callbacks and
+ * priority as the AF_INET handler; only built when IPv6 is configured
+ * (built-in or as a module).
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
+	.handler	=	xfrm_tunnel_rcv,
+	.err_handler	=	xfrm_tunnel_err,
+	.priority	=	2,
+};
+#endif
+
+/*
+ * Module init: register the IPIP xfrm type, then the AF_INET tunnel
+ * handler and, when IPv6 is configured, the AF_INET6 tunnel handler.
+ *
+ * Any failure undoes the registrations already made (in reverse order)
+ * and returns -EAGAIN; success returns 0.
+ */
+static int __init ipip_init(void)
+{
+	if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
+		printk(KERN_INFO "ipip init: can't add xfrm type\n");
+		goto fail;
+	}
+
+	if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
+		printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n");
+		goto undo_type;
+	}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
+		printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n");
+		goto undo_handler;
+	}
+#endif
+
+	return 0;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+undo_handler:
+	xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
+#endif
+undo_type:
+	xfrm_unregister_type(&ipip_type, AF_INET);
+fail:
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: undo ipip_init() in reverse order.  A failing step is
+ * only logged; the remaining steps are still attempted.
+ */
+static void __exit ipip_fini(void)
+{
+	int err;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	err = xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6);
+	if (err != 0)
+		printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n");
+#endif
+
+	err = xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
+	if (err != 0)
+		printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n");
+
+	err = xfrm_unregister_type(&ipip_type, AF_INET);
+	if (err < 0)
+		printk(KERN_INFO "ipip close: can't remove xfrm type\n");
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
new file mode 100644
index 00000000..36d7437a
--- /dev/null
+++ b/net/ipv6/Kconfig
@@ -0,0 +1,253 @@
+#
+# IPv6 configuration
+#
+
+#   IPv6 as module will cause a CRASH if you try to unload it
+menuconfig IPV6
+	tristate "The IPv6 protocol"
+	default m
+	---help---
+	  This is complementary support for IP version 6.
+	  You will still be able to do traditional IPv4 networking as well.
+
+	  For general information about IPv6, see
+	  <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
+	  For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
+	  For specific information about IPv6 under Linux, read the HOWTO at
+	  <http://www.bieringer.de/linux/IPv6/>.
+
+	  To compile this protocol support as a module, choose M here: the 
+	  module will be called ipv6.
+
+if IPV6
+
+config IPV6_PRIVACY
+	bool "IPv6: Privacy Extensions (RFC 3041) support"
+	---help---
+	  Privacy Extensions for Stateless Address Autoconfiguration in IPv6
+	  support.  With this option, additional periodically-altered
+	  pseudo-random global-scope unicast address(es) will be assigned to
+	  your interface(s).
+	
+	  We use our standard pseudo-random algorithm to generate the
+          randomized interface identifier, instead of one described in RFC 3041.
+
+	  By default the kernel does not generate temporary addresses.
+	  To use temporary addresses, do
+	
+	        echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr 
+
+	  See <file:Documentation/networking/ip-sysctl.txt> for details.
+
+config IPV6_ROUTER_PREF
+	bool "IPv6: Router Preference (RFC 4191) support"
+	---help---
+	  Router Preference is an optional extension to the Router
+	  Advertisement message which improves the ability of hosts
+	  to pick an appropriate router, especially when the hosts
+	  are placed in a multi-homed network.
+
+	  If unsure, say N.
+
+config IPV6_ROUTE_INFO
+	bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)"
+	depends on IPV6_ROUTER_PREF && EXPERIMENTAL
+	---help---
+	  This is experimental support of Route Information.
+
+	  If unsure, say N.
+
+config IPV6_OPTIMISTIC_DAD
+	bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  This is experimental support for optimistic Duplicate
+	  Address Detection.  It allows for autoconfigured addresses
+	  to be used more quickly.
+
+	  If unsure, say N.
+
+config INET6_AH
+	tristate "IPv6: AH transformation"
+	select XFRM
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	---help---
+	  Support for IPsec AH.
+
+	  If unsure, say Y.
+
+config INET6_ESP
+	tristate "IPv6: ESP transformation"
+	select XFRM
+	select CRYPTO
+	select CRYPTO_AUTHENC
+	select CRYPTO_HMAC
+	select CRYPTO_MD5
+	select CRYPTO_CBC
+	select CRYPTO_SHA1
+	select CRYPTO_DES
+	---help---
+	  Support for IPsec ESP.
+
+	  If unsure, say Y.
+
+config INET6_IPCOMP
+	tristate "IPv6: IPComp transformation"
+	select INET6_XFRM_TUNNEL
+	select XFRM_IPCOMP
+	---help---
+	  Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+	  typically needed for IPsec.
+
+	  If unsure, say Y.
+
+config IPV6_MIP6
+	tristate "IPv6: Mobility (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select XFRM
+	---help---
+	  Support for IPv6 Mobility described in RFC 3775.
+
+	  If unsure, say N.
+
+config INET6_XFRM_TUNNEL
+	tristate
+	select INET6_TUNNEL
+	default n
+
+config INET6_TUNNEL
+	tristate
+	default n
+
+config INET6_XFRM_MODE_TRANSPORT
+	tristate "IPv6: IPsec transport mode"
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec transport mode.
+
+	  If unsure, say Y.
+
+config INET6_XFRM_MODE_TUNNEL
+	tristate "IPv6: IPsec tunnel mode"
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec tunnel mode.
+
+	  If unsure, say Y.
+
+config INET6_XFRM_MODE_BEET
+	tristate "IPv6: IPsec BEET mode"
+	default IPV6
+	select XFRM
+	---help---
+	  Support for IPsec BEET mode.
+
+	  If unsure, say Y.
+
+config INET6_XFRM_MODE_ROUTEOPTIMIZATION
+	tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select XFRM
+	---help---
+	  Support for MIPv6 route optimization mode.
+
+config IPV6_SIT
+	tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)"
+	select INET_TUNNEL
+	select IPV6_NDISC_NODETYPE
+	default y
+	---help---
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This driver implements encapsulation of IPv6
+	  into IPv4 packets. This is useful if you want to connect two IPv6
+	  networks over an IPv4-only path.
+
+	  Saying M here will produce a module called sit. If unsure, say Y.
+
+config IPV6_SIT_6RD
+	bool "IPv6: IPv6 Rapid Deployment (6RD) (EXPERIMENTAL)"
+	depends on IPV6_SIT && EXPERIMENTAL
+	default n
+	---help---
+	  IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
+	  mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
+	  deploy IPv6 unicast service to IPv4 sites to which it provides
+	  customer premise equipment.  Like 6to4, it utilizes stateless IPv6 in
+	  IPv4 encapsulation in order to transit IPv4-only network
+	  infrastructure.  Unlike 6to4, a 6rd service provider uses an IPv6
+	  prefix of its own in place of the fixed 6to4 prefix.
+
+	  With this option enabled, the SIT driver offers 6rd functionality by
+	  providing an additional ioctl API to configure the IPv6 prefix used
+	  instead of the static 2002::/16 of 6to4.
+
+	  If unsure, say N.
+
+config IPV6_NDISC_NODETYPE
+	bool
+
+config IPV6_TUNNEL
+	tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
+	select INET6_TUNNEL
+	---help---
+	  Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
+	  RFC 2473.
+
+	  If unsure, say N.
+
+config IPV6_MULTIPLE_TABLES
+	bool "IPv6: Multiple Routing Tables"
+	depends on EXPERIMENTAL
+	select FIB_RULES
+	---help---
+	  Support multiple routing tables.
+
+config IPV6_SUBTREES
+	bool "IPv6: source address based routing"
+	depends on IPV6_MULTIPLE_TABLES
+	---help---
+	  Enable routing by source address or prefix.
+
+	  The destination address is still the primary routing key, so mixing
+	  normal and source prefix specific routes in the same routing table
+	  may sometimes lead to unintended routing behavior.  This can be
+	  avoided by defining different routing tables for the normal and
+	  source prefix specific routes.
+
+	  If unsure, say N.
+
+config IPV6_MROUTE
+	bool "IPv6: multicast routing (EXPERIMENTAL)"
+	depends on IPV6 && EXPERIMENTAL
+	---help---
+	  Experimental support for IPv6 multicast forwarding.
+	  If unsure, say N.
+
+config IPV6_MROUTE_MULTIPLE_TABLES
+	bool "IPv6: multicast policy routing"
+	depends on IPV6_MROUTE
+	select FIB_RULES
+	help
+	  Normally, a multicast router runs a userspace daemon and decides
+	  what to do with a multicast packet based on the source and
+	  destination addresses. If you say Y here, the multicast router
+	  will also be able to take interfaces and packet marks into
+	  account and run multiple instances of userspace daemons
+	  simultaneously, each one handling a single table.
+
+	  If unsure, say N.
+
+config IPV6_PIMSM_V2
+	bool "IPv6: PIM-SM version 2 support (EXPERIMENTAL)"
+	depends on IPV6_MROUTE
+	---help---
+	  Support for IPv6 PIM multicast routing protocol PIM-SMv2.
+	  If unsure, say N.
+
+endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
new file mode 100644
index 00000000..686934ac
--- /dev/null
+++ b/net/ipv6/Makefile
@@ -0,0 +1,42 @@
+#
+# Makefile for the Linux TCP/IP (INET6) layer.
+#
+
+obj-$(CONFIG_IPV6) += ipv6.o
+
+ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
+		addrlabel.o \
+		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
+		raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
+		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o
+
+ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
+ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
+
+ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
+	xfrm6_output.o
+ipv6-$(CONFIG_NETFILTER) += netfilter.o
+ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
+ipv6-$(CONFIG_PROC_FS) += proc.o
+ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
+
+ipv6-objs += $(ipv6-y)
+
+obj-$(CONFIG_INET6_AH) += ah6.o
+obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
+obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
+obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
+obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
+obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
+obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
+obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
+obj-$(CONFIG_IPV6_MIP6) += mip6.o
+obj-$(CONFIG_NETFILTER)	+= netfilter/
+
+obj-$(CONFIG_IPV6_SIT) += sit.o
+obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+
+obj-y += addrconf_core.o exthdrs_core.o
+
+obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
new file mode 100644
index 00000000..8a4bf719
--- /dev/null
+++ b/net/ipv6/addrconf.c
@@ -0,0 +1,4783 @@
+/*
+ *	IPv6 Address [auto]configuration
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *	Changes:
+ *
+ *	Janos Farkas			:	delete timer on ifdown
+ *	<chexum@bankinf.banki.hu>
+ *	Andi Kleen			:	kill double kfree on module
+ *						unload.
+ *	Maciej W. Rozycki		:	FDDI support
+ *	sekiya@USAGI			:	Don't send too many RS
+ *						packets.
+ *	yoshfuji@USAGI			:       Fixed interval between DAD
+ *						packets.
+ *	YOSHIFUJI Hideaki @USAGI	:	improved accuracy of
+ *						address validation timer.
+ *	YOSHIFUJI Hideaki @USAGI	:	Privacy Extensions (RFC3041)
+ *						support.
+ *	Yuji SEKIYA @USAGI		:	Don't assign a same IPv6
+ *						address on a same interface.
+ *	YOSHIFUJI Hideaki @USAGI	:	ARCnet support
+ *	YOSHIFUJI Hideaki @USAGI	:	convert /proc/net/if_inet6 to
+ *						seq_file.
+ *	YOSHIFUJI Hideaki @USAGI	:	improved source address
+ *						selection; consider scope,
+ *						status etc.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/if_arcnet.h>
+#include <linux/if_infiniband.h>
+#include <linux/route.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/capability.h>
+#include <linux/delay.h>
+#include <linux/notifier.h>
+#include <linux/string.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/if_tunnel.h>
+#include <linux/rtnetlink.h>
+
+#ifdef CONFIG_IPV6_PRIVACY
+#include <linux/random.h>
+#endif
+
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/* Set to 3 to get tracing... */
+#define ACONF_DEBUG 2
+
+/*
+ * ADBG(x) expands to "printk x" when ACONF_DEBUG >= 3, so callers pass the
+ * whole printk argument list wrapped in an extra pair of parentheses, e.g.
+ * ADBG(("...\n")).  At lower debug levels it expands to nothing and its
+ * arguments are not evaluated.
+ */
+#if ACONF_DEBUG >= 3
+#define ADBG(x) printk x
+#else
+#define ADBG(x)
+#endif
+
+#define	INFINITY_LIFE_TIME	0xFFFFFFFF
+
+/*
+ * Convert a jiffies timestamp into the number of 1/100 second units
+ * elapsed since INITIAL_JIFFIES, truncated to 32 bits.
+ */
+static inline u32 cstamp_delta(unsigned long cstamp)
+{
+	unsigned long ticks = cstamp - INITIAL_JIFFIES;
+
+	return ticks * 100UL / HZ;
+}
+
+#define ADDRCONF_TIMER_FUZZ_MINUS	(HZ > 50 ? HZ/50 : 1)
+#define ADDRCONF_TIMER_FUZZ		(HZ / 4)
+#define ADDRCONF_TIMER_FUZZ_MAX		(HZ)
+
+/*
+ * Per-device sysctl registration hooks.  With CONFIG_SYSCTL they are only
+ * declared here (the definitions are not in this part of the file);
+ * without it they are empty inline stubs so callers need no #ifdefs.
+ */
+#ifdef CONFIG_SYSCTL
+static void addrconf_sysctl_register(struct inet6_dev *idev);
+static void addrconf_sysctl_unregister(struct inet6_dev *idev);
+#else
+static inline void addrconf_sysctl_register(struct inet6_dev *idev)
+{
+}
+
+static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
+{
+}
+#endif
+
+#ifdef CONFIG_IPV6_PRIVACY
+static int __ipv6_regen_rndid(struct inet6_dev *idev);
+static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
+static void ipv6_regen_rndid(unsigned long data);
+#endif
+
+static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
+static int ipv6_count_addresses(struct inet6_dev *idev);
+
+/*
+ *	Configured unicast address hash table
+ */
+static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(addrconf_hash_lock);
+
+static void addrconf_verify(unsigned long);
+
+static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0);
+static DEFINE_SPINLOCK(addrconf_verify_lock);
+
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
+
+static void addrconf_type_change(struct net_device *dev,
+				 unsigned long event);
+static int addrconf_ifdown(struct net_device *dev, int how);
+
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
+static void addrconf_dad_timer(unsigned long data);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+static void addrconf_dad_run(struct inet6_dev *idev);
+static void addrconf_rs_timer(unsigned long data);
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
+static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
+
+static void inet6_prefix_notify(int event, struct inet6_dev *idev,
+				struct prefix_info *pinfo);
+static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
+			       struct net_device *dev);
+
+static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
+
+/*
+ * Compile-time initial values for an IPv6 device configuration block.
+ * NOTE(review): presumably the template for the per-namespace "all"
+ * settings (net->ipv6.devconf_all) -- confirm where it is copied.
+ * The field values are kept identical to ipv6_devconf_dflt, except that
+ * force_mld_version is spelled out explicitly here.
+ */
+static struct ipv6_devconf ipv6_devconf __read_mostly = {
+	.forwarding		= 0,
+	.hop_limit		= IPV6_DEFAULT_HOPLIMIT,
+	.mtu6			= IPV6_MIN_MTU,
+	.accept_ra		= 1,
+	.accept_redirects	= 1,
+	.autoconf		= 1,
+	.force_mld_version	= 0,
+	.dad_transmits		= 1,
+	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
+	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
+	.rtr_solicit_delay	= MAX_RTR_SOLICITATION_DELAY,
+#ifdef CONFIG_IPV6_PRIVACY
+	.use_tempaddr 		= 0,
+	.temp_valid_lft		= TEMP_VALID_LIFETIME,
+	.temp_prefered_lft	= TEMP_PREFERRED_LIFETIME,
+	.regen_max_retry	= REGEN_MAX_RETRY,
+	.max_desync_factor	= MAX_DESYNC_FACTOR,
+#endif
+	.max_addresses		= IPV6_MAX_ADDRESSES,
+	.accept_ra_defrtr	= 1,
+	.accept_ra_pinfo	= 1,
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	.accept_ra_rtr_pref	= 1,
+	.rtr_probe_interval	= 60 * HZ,
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	.accept_ra_rt_info_max_plen = 0,
+#endif
+#endif
+	.proxy_ndp		= 0,
+	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
+	.disable_ipv6		= 0,
+	.accept_dad		= 1,
+};
+
+/*
+ * Compile-time initial values for the "default" device configuration.
+ * NOTE(review): presumably the template for net->ipv6.devconf_dflt, which
+ * ipv6_add_dev() copies into every new inet6_dev -- confirm where this
+ * table is installed.
+ */
+static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
+	.forwarding		= 0,
+	.hop_limit		= IPV6_DEFAULT_HOPLIMIT,
+	.mtu6			= IPV6_MIN_MTU,
+	.accept_ra		= 1,
+	.accept_redirects	= 1,
+	.autoconf		= 1,
+	.dad_transmits		= 1,
+	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
+	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
+	.rtr_solicit_delay	= MAX_RTR_SOLICITATION_DELAY,
+#ifdef CONFIG_IPV6_PRIVACY
+	.use_tempaddr		= 0,
+	.temp_valid_lft		= TEMP_VALID_LIFETIME,
+	.temp_prefered_lft	= TEMP_PREFERRED_LIFETIME,
+	.regen_max_retry	= REGEN_MAX_RETRY,
+	.max_desync_factor	= MAX_DESYNC_FACTOR,
+#endif
+	.max_addresses		= IPV6_MAX_ADDRESSES,
+	.accept_ra_defrtr	= 1,
+	.accept_ra_pinfo	= 1,
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	.accept_ra_rtr_pref	= 1,
+	.rtr_probe_interval	= 60 * HZ,
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	.accept_ra_rt_info_max_plen = 0,
+#endif
+#endif
+	.proxy_ndp		= 0,
+	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
+	.disable_ipv6		= 0,
+	.accept_dad		= 1,
+};
+
+/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
+const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
+
+/*
+ * Check if a valid qdisc is available.
+ *
+ * Returns true unless qdisc_tx_is_noop() reports the device's transmit
+ * qdisc as the no-op one.
+ */
+static inline bool addrconf_qdisc_ok(const struct net_device *dev)
+{
+	return !qdisc_tx_is_noop(dev);
+}
+
+/*
+ * A route counts as a prefix route when it is neither a gateway route
+ * nor a default route.  Returns 1 for a prefix route, 0 otherwise.
+ */
+static inline int addrconf_is_prefix_route(const struct rt6_info *rt)
+{
+	return !(rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT));
+}
+
+/*
+ * Cancel the address timer.  addrconf_mod_timer() takes a reference on
+ * the address when it arms a timer that was not pending, so that
+ * reference is dropped here if a pending timer was actually removed.
+ */
+static void addrconf_del_timer(struct inet6_ifaddr *ifp)
+{
+	if (!del_timer(&ifp->timer))
+		return;
+
+	__in6_ifa_put(ifp);
+}
+
+/*
+ * Selects the handler installed by addrconf_mod_timer():
+ * AC_DAD -> addrconf_dad_timer, AC_RS -> addrconf_rs_timer.
+ * AC_NONE leaves the currently installed handler untouched.
+ */
+enum addrconf_timer_t {
+	AC_NONE,
+	AC_DAD,
+	AC_RS,
+};
+
+/*
+ * (Re)arm the per-address timer to fire @when jiffies from now.
+ *
+ * @what picks the handler (DAD or router solicitation); any other value
+ * keeps the handler that is already set.  The timer holds one reference
+ * on @ifp while pending: if no pending timer was cancelled, a new
+ * reference is taken; otherwise the existing one is reused.
+ */
+static void addrconf_mod_timer(struct inet6_ifaddr *ifp,
+			       enum addrconf_timer_t what,
+			       unsigned long when)
+{
+	if (!del_timer(&ifp->timer))
+		in6_ifa_hold(ifp);
+
+	if (what == AC_DAD)
+		ifp->timer.function = addrconf_dad_timer;
+	else if (what == AC_RS)
+		ifp->timer.function = addrconf_rs_timer;
+
+	ifp->timer.expires = jiffies + when;
+	add_timer(&ifp->timer);
+}
+
+/*
+ * Allocate the per-device statistics: the IPv6 MIB (through
+ * snmp_mib_init()) and the zeroed ICMPv6 and ICMPv6-message device
+ * counter blocks.
+ *
+ * Returns 0 on success, -ENOMEM on failure.  On failure everything
+ * allocated here has been released again, and the ICMPv6 pointer is
+ * cleared: ipv6_add_dev() reacts to the failure by calling
+ * in6_dev_finish_destroy(), which runs snmp6_free_dev() on the same
+ * idev, and a stale pointer would be passed to kfree() a second time.
+ */
+static int snmp6_alloc_dev(struct inet6_dev *idev)
+{
+	if (snmp_mib_init((void __percpu **)idev->stats.ipv6,
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
+		goto err_ip;
+	idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device),
+					GFP_KERNEL);
+	if (!idev->stats.icmpv6dev)
+		goto err_icmp;
+	idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
+					   GFP_KERNEL);
+	if (!idev->stats.icmpv6msgdev)
+		goto err_icmpmsg;
+
+	return 0;
+
+err_icmpmsg:
+	kfree(idev->stats.icmpv6dev);
+	/* Leave no dangling pointer behind for a later snmp6_free_dev(). */
+	idev->stats.icmpv6dev = NULL;
+err_icmp:
+	/*
+	 * NOTE(review): snmp6_free_dev() will call snmp_mib_free() on this
+	 * array again; confirm snmp_mib_free() tolerates a repeated call.
+	 */
+	snmp_mib_free((void __percpu **)idev->stats.ipv6);
+err_ip:
+	return -ENOMEM;
+}
+
+/*
+ * Release the statistics set up by snmp6_alloc_dev(), in the reverse
+ * order of their allocation.
+ */
+static void snmp6_free_dev(struct inet6_dev *idev)
+{
+	kfree(idev->stats.icmpv6msgdev);
+	kfree(idev->stats.icmpv6dev);
+	snmp_mib_free((void __percpu **)idev->stats.ipv6);
+}
+
+/*
+ * Nobody refers to this device, we may destroy it.
+ *
+ * Warns if addresses or multicast entries are still attached, drops the
+ * net_device reference held by the inet6_dev, then frees the statistics
+ * and the inet6_dev itself (after an RCU grace period, via kfree_rcu()).
+ *
+ * If the inet6_dev has not been marked dead, a warning is printed and
+ * nothing is freed -- note that the net_device reference has already
+ * been dropped at that point.
+ */
+
+void in6_dev_finish_destroy(struct inet6_dev *idev)
+{
+	struct net_device *dev = idev->dev;
+
+	WARN_ON(!list_empty(&idev->addr_list));
+	WARN_ON(idev->mc_list != NULL);
+
+#ifdef NET_REFCNT_DEBUG
+	printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL");
+#endif
+	dev_put(dev);
+	if (!idev->dead) {
+		pr_warning("Freeing alive inet6 device %p\n", idev);
+		return;
+	}
+	snmp6_free_dev(idev);
+	kfree_rcu(idev, rcu);
+}
+
+EXPORT_SYMBOL(in6_dev_finish_destroy);
+
+/*
+ * Create the IPv6 per-device state (struct inet6_dev) for @dev and
+ * publish it through dev->ip6_ptr.
+ *
+ * Must be called with the RTNL lock held (ASSERT_RTNL()).
+ *
+ * Returns the new inet6_dev, or NULL when the device MTU is below
+ * IPV6_MIN_MTU or when any allocation/registration step fails.
+ */
+static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
+{
+	struct inet6_dev *ndev;
+
+	ASSERT_RTNL();
+
+	if (dev->mtu < IPV6_MIN_MTU)
+		return NULL;
+
+	ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
+
+	if (ndev == NULL)
+		return NULL;
+
+	rwlock_init(&ndev->lock);
+	ndev->dev = dev;
+	INIT_LIST_HEAD(&ndev->addr_list);
+
+	/* Start from the namespace's default configuration. */
+	memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
+	ndev->cnf.mtu6 = dev->mtu;
+	ndev->cnf.sysctl = NULL;
+	ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
+	if (ndev->nd_parms == NULL) {
+		/* No device reference taken yet: a plain kfree() suffices. */
+		kfree(ndev);
+		return NULL;
+	}
+	if (ndev->cnf.forwarding)
+		dev_disable_lro(dev);
+	/* We refer to the device */
+	dev_hold(dev);
+
+	/*
+	 * From here on failures go through in6_dev_finish_destroy(), which
+	 * drops the device reference taken above, frees the statistics and
+	 * frees ndev.  It only does so for a device marked dead, hence the
+	 * explicit ndev->dead = 1.
+	 */
+	if (snmp6_alloc_dev(ndev) < 0) {
+		ADBG((KERN_WARNING
+			"%s(): cannot allocate memory for statistics; dev=%s.\n",
+			__func__, dev->name));
+		neigh_parms_release(&nd_tbl, ndev->nd_parms);
+		ndev->dead = 1;
+		in6_dev_finish_destroy(ndev);
+		return NULL;
+	}
+
+	if (snmp6_register_dev(ndev) < 0) {
+		ADBG((KERN_WARNING
+			"%s(): cannot create /proc/net/dev_snmp6/%s\n",
+			__func__, dev->name));
+		neigh_parms_release(&nd_tbl, ndev->nd_parms);
+		ndev->dead = 1;
+		in6_dev_finish_destroy(ndev);
+		return NULL;
+	}
+
+	/* One reference from device.  We must do this before
+	 * we invoke __ipv6_regen_rndid().
+	 */
+	in6_dev_hold(ndev);
+
+	/*
+	 * NOTE(review): -1 presumably means "DAD not applicable" for
+	 * NOARP/loopback devices -- confirm against the accept_dad users.
+	 */
+	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+		ndev->cnf.accept_dad = -1;
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+	/* ISATAP SIT devices send no router solicitations. */
+	if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
+		printk(KERN_INFO
+		       "%s: Disabled Multicast RS\n",
+		       dev->name);
+		ndev->cnf.rtr_solicits = 0;
+	}
+#endif
+
+#ifdef CONFIG_IPV6_PRIVACY
+	INIT_LIST_HEAD(&ndev->tempaddr_list);
+	setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
+	/*
+	 * Loopback and tunnel-like device types get use_tempaddr = -1;
+	 * every other device has its random identifier generated right away.
+	 */
+	if ((dev->flags&IFF_LOOPBACK) ||
+	    dev->type == ARPHRD_TUNNEL ||
+	    dev->type == ARPHRD_TUNNEL6 ||
+	    dev->type == ARPHRD_SIT ||
+	    dev->type == ARPHRD_NONE) {
+		ndev->cnf.use_tempaddr = -1;
+	} else {
+		/*
+		 * NOTE(review): the extra reference is presumably consumed
+		 * by ipv6_regen_rndid(), which is also the regen_timer
+		 * handler -- confirm in its definition.
+		 */
+		in6_dev_hold(ndev);
+		ipv6_regen_rndid((unsigned long) ndev);
+	}
+#endif
+
+	if (netif_running(dev) && addrconf_qdisc_ok(dev))
+		ndev->if_flags |= IF_READY;
+
+	ipv6_mc_init_dev(ndev);
+	ndev->tstamp = jiffies;
+	addrconf_sysctl_register(ndev);
+	/* protected by rtnl_lock */
+	rcu_assign_pointer(dev->ip6_ptr, ndev);
+
+	/* Join all-node multicast group */
+	ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+
+	/* Join all-router multicast group if forwarding is set */
+	if (ndev->cnf.forwarding && dev && (dev->flags & IFF_MULTICAST))
+		ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+
+	return ndev;
+}
+
+/*
+ * Return the inet6_dev of @dev, creating it with ipv6_add_dev() when the
+ * device has none yet.  If the device is up, ipv6_mc_up() is called on
+ * the result.  Returns NULL when creation fails.  Requires RTNL.
+ */
+static struct inet6_dev * ipv6_find_idev(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = __in6_dev_get(dev);
+	if (idev == NULL)
+		idev = ipv6_add_dev(dev);
+	if (idev == NULL)
+		return NULL;
+
+	if (dev->flags & IFF_UP)
+		ipv6_mc_up(idev);
+
+	return idev;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Apply the current value of idev->cnf.forwarding to the device:
+ *  - forwarding on: disable LRO on the device;
+ *  - multicast-capable device: join (forwarding on) or leave (forwarding
+ *    off) the link-local all-routers group;
+ *  - every non-tentative address: join or leave its anycast group
+ *    accordingly.
+ * A NULL @idev is ignored.
+ */
+static void dev_forward_change(struct inet6_dev *idev)
+{
+	struct inet6_ifaddr *ifa;
+	struct net_device *dev;
+
+	if (idev == NULL)
+		return;
+
+	dev = idev->dev;
+	if (idev->cnf.forwarding)
+		dev_disable_lro(dev);
+
+	if (dev != NULL && (dev->flags & IFF_MULTICAST) != 0) {
+		if (!idev->cnf.forwarding)
+			ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
+		else
+			ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+	}
+
+	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		if (!(ifa->flags & IFA_F_TENTATIVE)) {
+			if (!idev->cnf.forwarding)
+				addrconf_leave_anycast(ifa);
+			else
+				addrconf_join_anycast(ifa);
+		}
+	}
+}
+
+
+/*
+ * Set the forwarding value of every IPv6-enabled device in @net to @newf.
+ * dev_forward_change() is run only for devices whose forwarding state
+ * actually switched between "off" (zero) and "on" (non-zero).
+ */
+static void addrconf_forward_change(struct net *net, __s32 newf)
+{
+	struct inet6_dev *idev;
+	struct net_device *dev;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		int flipped;
+
+		idev = __in6_dev_get(dev);
+		if (idev == NULL)
+			continue;
+
+		flipped = !idev->cnf.forwarding != !newf;
+		idev->cnf.forwarding = newf;
+		if (flipped)
+			dev_forward_change(idev);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Propagate a change of a "forwarding" sysctl value.
+ *
+ * @table: sysctl entry; extra2 carries the struct net, extra1 the
+ *         inet6_dev for a per-device entry
+ * @p:     the forwarding variable that was written
+ * @old:   its value before the write
+ *
+ * Returns 0 when @p is the namespace "default" entry (nothing to
+ * propagate), the result of restart_syscall() when the RTNL lock could
+ * not be taken, and 1 otherwise.
+ */
+static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
+{
+	struct net *net;
+
+	net = (struct net *)table->extra2;
+	if (p == &net->ipv6.devconf_dflt->forwarding)
+		return 0;
+
+	/* Never block on RTNL here: undo the write and retry the syscall. */
+	if (!rtnl_trylock()) {
+		/* Restore the original values before restarting */
+		*p = old;
+		return restart_syscall();
+	}
+
+	if (p == &net->ipv6.devconf_all->forwarding) {
+		/* "all" entry: mirror into the default and push to every device */
+		__s32 newf = net->ipv6.devconf_all->forwarding;
+		net->ipv6.devconf_dflt->forwarding = newf;
+		addrconf_forward_change(net, newf);
+	} else if ((!*p) ^ (!old))
+		/* per-device entry: act only on an off<->on transition */
+		dev_forward_change((struct inet6_dev *)table->extra1);
+	rtnl_unlock();
+
+	/* With forwarding now on, drop the default routers (done outside RTNL). */
+	if (*p)
+		rt6_purge_dflt_routers(net);
+	return 1;
+}
+#endif
+
+/*
+ * Nobody refers to this ifaddr, destroy it.
+ *
+ * Warns if the address is still hashed, drops its reference on the
+ * owning inet6_dev, stops a still-pending timer (with a notice), then
+ * releases the address route and frees the ifaddr via kfree_rcu().
+ *
+ * If the address is not in INET6_IFADDR_STATE_DEAD a warning is printed
+ * and the route/ifaddr are left alone -- the inet6_dev reference has
+ * already been dropped by then.
+ */
+void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
+{
+	WARN_ON(!hlist_unhashed(&ifp->addr_lst));
+
+#ifdef NET_REFCNT_DEBUG
+	printk(KERN_DEBUG "inet6_ifa_finish_destroy\n");
+#endif
+
+	in6_dev_put(ifp->idev);
+
+	if (del_timer(&ifp->timer))
+		pr_notice("Timer is still running, when freeing ifa=%p\n", ifp);
+
+	if (ifp->state != INET6_IFADDR_STATE_DEAD) {
+		pr_warning("Freeing alive inet6 address %p\n", ifp);
+		return;
+	}
+	dst_release(&ifp->rt->dst);
+
+	kfree_rcu(ifp, rcu);
+}
+
+/*
+ * Insert @ifp into the address list of @idev.
+ *
+ * The list is kept ordered by source scope, global before link-local:
+ * the new entry goes in front of the first existing address whose scope
+ * is not larger than its own, or at the tail if there is none.
+ */
+static void
+ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
+{
+	const int new_scope = ipv6_addr_src_scope(&ifp->addr);
+	struct list_head *pos;
+
+	list_for_each(pos, &idev->addr_list) {
+		struct inet6_ifaddr *cur;
+
+		cur = list_entry(pos, struct inet6_ifaddr, if_list);
+		if (ipv6_addr_src_scope(&cur->addr) <= new_scope)
+			break;
+	}
+
+	/* pos is the entry to insert before, or the list head itself */
+	list_add_tail(&ifp->if_list, pos);
+}
+
+/*
+ * Bucket index into inet6_addr_lst[] for @addr.
+ *
+ * Only the low 64 bits of the address are hashed; on links that use it,
+ * this covers the IEEE interface token.
+ */
+static u32 ipv6_addr_hash(const struct in6_addr *addr)
+{
+	u32 hash;
+
+	hash = jhash_2words((__force u32)addr->s6_addr32[2],
+			    (__force u32)addr->s6_addr32[3], 0);
+
+	return hash & (IN6_ADDR_HSIZE - 1);
+}
+
+/*
+ * Create a unicast address on @idev and link it into the global hash
+ * table and the device address list.
+ *
+ * On success it returns ifp with increased reference count (one
+ * reference belongs to the caller).  On failure an ERR_PTR() is returned:
+ *   -EADDRNOTAVAIL  any/multicast address, or loopback address on a
+ *                   non-loopback device
+ *   -ENODEV         the inet6_dev is dead
+ *   -EACCES         IPv6 is disabled on the device
+ *   -EEXIST         ipv6_chk_same_addr() reports the address as present
+ *   -ENOBUFS        allocation of the ifaddr failed
+ *   or the error returned by addrconf_dst_alloc().
+ *
+ * The new address always starts out with IFA_F_TENTATIVE set.
+ */
+static struct inet6_ifaddr *
+ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
+	      int scope, u32 flags)
+{
+	struct inet6_ifaddr *ifa = NULL;
+	struct rt6_info *rt;
+	unsigned int hash;
+	int err = 0;
+	int addr_type = ipv6_addr_type(addr);
+
+	if (addr_type == IPV6_ADDR_ANY ||
+	    addr_type & IPV6_ADDR_MULTICAST ||
+	    (!(idev->dev->flags & IFF_LOOPBACK) &&
+	     addr_type & IPV6_ADDR_LOOPBACK))
+		return ERR_PTR(-EADDRNOTAVAIL);
+
+	rcu_read_lock_bh();
+	if (idev->dead) {
+		err = -ENODEV;			/*XXX*/
+		goto out2;
+	}
+
+	if (idev->cnf.disable_ipv6) {
+		err = -EACCES;
+		goto out2;
+	}
+
+	/* The duplicate check and the hash insertion share one lock hold. */
+	spin_lock(&addrconf_hash_lock);
+
+	/* Ignore adding duplicate addresses on an interface */
+	if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) {
+		ADBG(("ipv6_add_addr: already assigned\n"));
+		err = -EEXIST;
+		goto out;
+	}
+
+	/* GFP_ATOMIC: we are under a spinlock with BHs disabled. */
+	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+
+	if (ifa == NULL) {
+		ADBG(("ipv6_add_addr: malloc failed\n"));
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	rt = addrconf_dst_alloc(idev, addr, 0);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		goto out;
+	}
+
+	ipv6_addr_copy(&ifa->addr, addr);
+
+	spin_lock_init(&ifa->lock);
+	spin_lock_init(&ifa->state_lock);
+	init_timer(&ifa->timer);
+	INIT_HLIST_NODE(&ifa->addr_lst);
+	ifa->timer.data = (unsigned long) ifa;
+	ifa->scope = scope;
+	ifa->prefix_len = pfxlen;
+	ifa->flags = flags | IFA_F_TENTATIVE;
+	ifa->cstamp = ifa->tstamp = jiffies;
+
+	ifa->rt = rt;
+
+	/*
+	 * part one of RFC 4429, section 3.3
+	 * We should not configure an address as
+	 * optimistic if we do not yet know the link
+	 * layer address of our nexthop router
+	 */
+
+	if (dst_get_neighbour_raw(&rt->dst) == NULL)
+		ifa->flags &= ~IFA_F_OPTIMISTIC;
+
+	/* The address pins its inet6_dev. */
+	ifa->idev = idev;
+	in6_dev_hold(idev);
+	/* For caller */
+	in6_ifa_hold(ifa);
+
+	/* Add to big hash table */
+	hash = ipv6_addr_hash(addr);
+
+	hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
+	spin_unlock(&addrconf_hash_lock);
+
+	write_lock(&idev->lock);
+	/* Add to inet6_dev unicast addr list. */
+	ipv6_link_dev_addr(idev, ifa);
+
+#ifdef CONFIG_IPV6_PRIVACY
+	/* Temporary addresses also sit on tempaddr_list, with their own ref. */
+	if (ifa->flags&IFA_F_TEMPORARY) {
+		list_add(&ifa->tmp_list, &idev->tempaddr_list);
+		in6_ifa_hold(ifa);
+	}
+#endif
+
+	/* Reference for the device address list (dropped in ipv6_del_addr()). */
+	in6_ifa_hold(ifa);
+	write_unlock(&idev->lock);
+out2:
+	rcu_read_unlock_bh();
+
+	if (likely(err == 0))
+		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
+	else {
+		/* ifa is NULL or not yet visible to anyone: plain kfree() */
+		kfree(ifa);
+		ifa = ERR_PTR(err);
+	}
+
+	return ifa;
+out:
+	/* Error exit for paths that still hold addrconf_hash_lock. */
+	spin_unlock(&addrconf_hash_lock);
+	goto out2;
+}
+
+/*
+ * Remove an address: mark it dead, unlink it from the global hash table,
+ * the device address list and (for temporary addresses) the temporary
+ * list, stop its timer, send notifications and fix up the matching
+ * prefix route.
+ *
+ * This function wants to get referenced ifp and releases it before return.
+ * Calling it on an address that is already dead only drops that
+ * reference.
+ *
+ * While walking the device list the function also works out "onlink":
+ *    1  another permanent address shares the prefix,
+ *   -1  only non-permanent addresses share it ("expires" then holds the
+ *       latest of their expiry times),
+ *    0  no other address shares it.
+ */
+
+static void ipv6_del_addr(struct inet6_ifaddr *ifp)
+{
+	struct inet6_ifaddr *ifa, *ifn;
+	struct inet6_dev *idev = ifp->idev;
+	int state;
+	int deleted = 0, onlink = 0;
+	unsigned long expires = jiffies;
+
+	/* Atomically claim the DEAD transition; only one caller proceeds. */
+	spin_lock_bh(&ifp->state_lock);
+	state = ifp->state;
+	ifp->state = INET6_IFADDR_STATE_DEAD;
+	spin_unlock_bh(&ifp->state_lock);
+
+	if (state == INET6_IFADDR_STATE_DEAD)
+		goto out;
+
+	spin_lock_bh(&addrconf_hash_lock);
+	hlist_del_init_rcu(&ifp->addr_lst);
+	spin_unlock_bh(&addrconf_hash_lock);
+
+	write_lock_bh(&idev->lock);
+#ifdef CONFIG_IPV6_PRIVACY
+	if (ifp->flags&IFA_F_TEMPORARY) {
+		list_del(&ifp->tmp_list);
+		if (ifp->ifpub) {
+			in6_ifa_put(ifp->ifpub);
+			ifp->ifpub = NULL;
+		}
+		/* drop the tempaddr_list reference */
+		__in6_ifa_put(ifp);
+	}
+#endif
+
+	list_for_each_entry_safe(ifa, ifn, &idev->addr_list, if_list) {
+		if (ifa == ifp) {
+			/* unlink and drop the address-list reference */
+			list_del_init(&ifp->if_list);
+			__in6_ifa_put(ifp);
+
+			if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0)
+				break;
+			deleted = 1;
+			continue;
+		} else if (ifp->flags & IFA_F_PERMANENT) {
+			if (ipv6_prefix_equal(&ifa->addr, &ifp->addr,
+					      ifp->prefix_len)) {
+				if (ifa->flags & IFA_F_PERMANENT) {
+					onlink = 1;
+					if (deleted)
+						break;
+				} else {
+					unsigned long lifetime;
+
+					if (!onlink)
+						onlink = -1;
+
+					spin_lock(&ifa->lock);
+
+					lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
+					/*
+					 * Note: Because this address is
+					 * not permanent, lifetime <
+					 * LONG_MAX / HZ here.
+					 */
+					if (time_before(expires,
+							ifa->tstamp + lifetime * HZ))
+						expires = ifa->tstamp + lifetime * HZ;
+					spin_unlock(&ifa->lock);
+				}
+			}
+		}
+	}
+	write_unlock_bh(&idev->lock);
+
+	addrconf_del_timer(ifp);
+
+	ipv6_ifa_notify(RTM_DELADDR, ifp);
+
+	atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp);
+
+	/*
+	 * Purge or update corresponding prefix
+	 *
+	 * 1) we don't purge prefix here if address was not permanent.
+	 *    prefix is managed by its own lifetime.
+	 * 2) if there're no addresses, delete prefix.
+	 * 3) if there're still other permanent address(es),
+	 *    corresponding prefix is still permanent.
+	 * 4) otherwise, update prefix lifetime to the
+	 *    longest valid lifetime among the corresponding
+	 *    addresses on the device.
+	 *    Note: subsequent RA will update lifetime.
+	 *
+	 * --yoshfuji
+	 */
+	if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) {
+		struct in6_addr prefix;
+		struct rt6_info *rt;
+		struct net *net = dev_net(ifp->idev->dev);
+		ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len);
+		rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1);
+
+		if (rt && addrconf_is_prefix_route(rt)) {
+			if (onlink == 0) {
+				ip6_del_rt(rt);
+				rt = NULL;
+			} else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
+				rt->rt6i_expires = expires;
+				rt->rt6i_flags |= RTF_EXPIRES;
+			}
+		}
+		/*
+		 * rt is NULL when the lookup found nothing or the route was
+		 * just handed to ip6_del_rt(); do not form &rt->dst from a
+		 * NULL pointer in that case.
+		 */
+		if (rt)
+			dst_release(&rt->dst);
+	}
+
+	/* clean up prefsrc entries */
+	rt6_remove_prefsrc(ifp);
+out:
+	in6_ifa_put(ifp);
+}
+
+#ifdef CONFIG_IPV6_PRIVACY
+/*
+ * Create a temporary (privacy) address derived from the public address @ifp.
+ *
+ * The new address takes its first 8 bytes from @ifp->addr and its last 8
+ * bytes from idev->rndid.  When @ift is non-NULL, its interface identifier
+ * is handed to __ipv6_try_regen_rndid() so that rndid is regenerated if it
+ * still equals that identifier.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift)
+{
+	struct inet6_dev *idev = ifp->idev;
+	struct in6_addr addr, *tmpaddr;
+	unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age;
+	unsigned long regen_advance;
+	int tmp_plen;
+	int ret = 0;
+	int max_addresses;
+	u32 addr_flags;
+	unsigned long now = jiffies;
+
+	write_lock(&idev->lock);
+	if (ift) {
+		/* Remember the interface id of the address being replaced. */
+		spin_lock_bh(&ift->lock);
+		memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
+		spin_unlock_bh(&ift->lock);
+		tmpaddr = &addr;
+	} else {
+		tmpaddr = NULL;
+	}
+	/* Entered with idev->lock write-held, both initially and on retry. */
+retry:
+	in6_dev_hold(idev);
+	if (idev->cnf.use_tempaddr <= 0) {
+		write_unlock(&idev->lock);
+		printk(KERN_INFO
+			"ipv6_create_tempaddr(): use_tempaddr is disabled.\n");
+		in6_dev_put(idev);
+		ret = -1;
+		goto out;
+	}
+	spin_lock_bh(&ifp->lock);
+	/*
+	 * Each pass through retry bumps regen_count; once it reaches
+	 * regen_max_retry, temporary addresses are switched off for the
+	 * device, which also bounds the retry loop.
+	 */
+	if (ifp->regen_count++ >= idev->cnf.regen_max_retry) {
+		idev->cnf.use_tempaddr = -1;	/*XXX*/
+		spin_unlock_bh(&ifp->lock);
+		write_unlock(&idev->lock);
+		printk(KERN_WARNING
+			"ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n");
+		in6_dev_put(idev);
+		ret = -1;
+		goto out;
+	}
+	/*
+	 * This reference is dropped on every failure path below; on success
+	 * it is kept and ifp is stored in ift->ifpub.
+	 */
+	in6_ifa_hold(ifp);
+	memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
+	if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) {
+		spin_unlock_bh(&ifp->lock);
+		write_unlock(&idev->lock);
+		printk(KERN_WARNING
+			"ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n");
+		in6_ifa_put(ifp);
+		in6_dev_put(idev);
+		ret = -1;
+		goto out;
+	}
+	memcpy(&addr.s6_addr[8], idev->rndid, 8);
+	/* Seconds elapsed since the public address timestamp. */
+	age = (now - ifp->tstamp) / HZ;
+	tmp_valid_lft = min_t(__u32,
+			      ifp->valid_lft,
+			      idev->cnf.temp_valid_lft + age);
+	tmp_prefered_lft = min_t(__u32,
+				 ifp->prefered_lft,
+				 idev->cnf.temp_prefered_lft + age -
+				 idev->cnf.max_desync_factor);
+	tmp_plen = ifp->prefix_len;
+	max_addresses = idev->cnf.max_addresses;
+	tmp_tstamp = ifp->tstamp;
+	spin_unlock_bh(&ifp->lock);
+
+	regen_advance = idev->cnf.regen_max_retry *
+	                idev->cnf.dad_transmits *
+	                idev->nd_parms->retrans_time / HZ;
+	write_unlock(&idev->lock);
+
+	/* A temporary address is created only if this calculated Preferred
+	 * Lifetime is greater than REGEN_ADVANCE time units.  In particular,
+	 * an implementation must not create a temporary address with a zero
+	 * Preferred Lifetime.
+	 */
+	if (tmp_prefered_lft <= regen_advance) {
+		in6_ifa_put(ifp);
+		in6_dev_put(idev);
+		ret = -1;
+		goto out;
+	}
+
+	addr_flags = IFA_F_TEMPORARY;
+	/* set in addrconf_prefix_rcv() */
+	if (ifp->flags & IFA_F_OPTIMISTIC)
+		addr_flags |= IFA_F_OPTIMISTIC;
+
+	/*
+	 * Only add the address while the device is below max_addresses
+	 * (0 means no limit); otherwise ift is NULL and we retry.
+	 */
+	ift = !max_addresses ||
+	      ipv6_count_addresses(idev) < max_addresses ?
+		ipv6_add_addr(idev, &addr, tmp_plen,
+			      ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK,
+			      addr_flags) : NULL;
+	if (!ift || IS_ERR(ift)) {
+		in6_ifa_put(ifp);
+		in6_dev_put(idev);
+		printk(KERN_INFO
+			"ipv6_create_tempaddr(): retry temporary address regeneration.\n");
+		/* Ask for a fresh rndid if it still matches the one just tried. */
+		tmpaddr = &addr;
+		write_lock(&idev->lock);
+		goto retry;
+	}
+
+	spin_lock_bh(&ift->lock);
+	ift->ifpub = ifp;
+	ift->valid_lft = tmp_valid_lft;
+	ift->prefered_lft = tmp_prefered_lft;
+	ift->cstamp = now;
+	ift->tstamp = tmp_tstamp;
+	spin_unlock_bh(&ift->lock);
+
+	addrconf_dad_start(ift, 0);
+	in6_ifa_put(ift);
+	in6_dev_put(idev);
+out:
+	return ret;
+}
+#endif
+
+/*
+ *	Choose an appropriate source address (RFC3484)
+ *
+ *	The enumerators below index the selection rules in the order in
+ *	which ipv6_dev_get_saddr() evaluates them (0 up to, but excluding,
+ *	IPV6_SADDR_RULE_MAX).  IPV6_SADDR_RULE_MAX also sizes the per-address
+ *	scorebits bitmap.  Rules guarded by #ifdef exist only when the
+ *	matching option is configured.
+ */
+enum {
+	IPV6_SADDR_RULE_INIT = 0,
+	IPV6_SADDR_RULE_LOCAL,
+	IPV6_SADDR_RULE_SCOPE,
+	IPV6_SADDR_RULE_PREFERRED,
+#ifdef CONFIG_IPV6_MIP6
+	IPV6_SADDR_RULE_HOA,
+#endif
+	IPV6_SADDR_RULE_OIF,
+	IPV6_SADDR_RULE_LABEL,
+#ifdef CONFIG_IPV6_PRIVACY
+	IPV6_SADDR_RULE_PRIVACY,
+#endif
+	IPV6_SADDR_RULE_ORCHID,
+	IPV6_SADDR_RULE_PREFIX,
+	IPV6_SADDR_RULE_MAX
+};
+
+/* Per-candidate scoring state used by ipv6_get_saddr_eval(). */
+struct ipv6_saddr_score {
+	int			rule;		/* highest rule index evaluated so far, -1 if none */
+	int			addr_type;	/* __ipv6_addr_type() of ifa->addr */
+	struct inet6_ifaddr	*ifa;		/* candidate source address */
+	DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX);	/* bit i set when rule i gave non-zero */
+	int			scopedist;	/* cached IPV6_SADDR_RULE_SCOPE result */
+	int			matchlen;	/* cached IPV6_SADDR_RULE_PREFIX result */
+};
+
+/* Destination properties that candidates are scored against. */
+struct ipv6_saddr_dst {
+	const struct in6_addr *addr;	/* destination address */
+	int ifindex;			/* outgoing interface index, 0 if none given */
+	int scope;			/* __ipv6_addr_src_scope() of the destination type */
+	int label;			/* ipv6_addr_label() of the destination */
+	unsigned int prefs;		/* caller's IPV6_PREFER_SRC_* preferences */
+};
+
+/*
+ * Return 1 when the address type is v4-mapped, v4-compatible or loopback,
+ * 0 otherwise.
+ */
+static inline int ipv6_saddr_preferred(int type)
+{
+	const int always_preferred = IPV6_ADDR_MAPPED |
+				     IPV6_ADDR_COMPATv4 |
+				     IPV6_ADDR_LOOPBACK;
+
+	return (type & always_preferred) != 0;
+}
+
+/*
+ * Evaluate selection rule @i for the candidate in @score against @dst.
+ *
+ * Results are cached: if rule @i has already been evaluated for this
+ * candidate (i <= score->rule) the stored value is returned.  Otherwise
+ * the rule is computed, its bit is set in score->scorebits when the result
+ * is non-zero, and score->rule is advanced to @i.
+ *
+ * Returns the rule's score; the caller treats the larger value as better.
+ */
+static int ipv6_get_saddr_eval(struct net *net,
+			       struct ipv6_saddr_score *score,
+			       struct ipv6_saddr_dst *dst,
+			       int i)
+{
+	int ret;
+
+	if (i <= score->rule) {
+		/* Already evaluated: return the cached result. */
+		switch (i) {
+		case IPV6_SADDR_RULE_SCOPE:
+			ret = score->scopedist;
+			break;
+		case IPV6_SADDR_RULE_PREFIX:
+			ret = score->matchlen;
+			break;
+		default:
+			ret = !!test_bit(i, score->scorebits);
+		}
+		goto out;
+	}
+
+	switch (i) {
+	case IPV6_SADDR_RULE_INIT:
+		/* Rule 0: remember if hiscore is not ready yet */
+		ret = !!score->ifa;
+		break;
+	case IPV6_SADDR_RULE_LOCAL:
+		/* Rule 1: Prefer same address */
+		ret = ipv6_addr_equal(&score->ifa->addr, dst->addr);
+		break;
+	case IPV6_SADDR_RULE_SCOPE:
+		/* Rule 2: Prefer appropriate scope
+		 *
+		 *      ret
+		 *       ^
+		 *    -1 |  d 15
+		 *    ---+--+-+---> scope
+		 *       |
+		 *       |             d is scope of the destination.
+		 *  B-d  |  \
+		 *       |   \      <- smaller scope is better
+		 *  B-15 |    \        if scope is enough for destination.
+		 *       |             ret = B - scope (-1 <= scope >= d <= 15).
+		 * d-C-1 | /
+		 *       |/         <- greater is better
+		 *   -C  /             if scope is not enough for destination.
+		 *      /|             ret = scope - C (-1 <= d < scope <= 15).
+		 *
+		 * d - C - 1 < B -15 (for all -1 <= d <= 15).
+		 * C > d + 14 - B >= 15 + 14 - B = 29 - B.
+		 * Assume B = 0 and we get C > 29.
+		 */
+		ret = __ipv6_addr_src_scope(score->addr_type);
+		if (ret >= dst->scope)
+			ret = -ret;
+		else
+			ret -= 128;	/* 30 is enough */
+		score->scopedist = ret;
+		break;
+	case IPV6_SADDR_RULE_PREFERRED:
+		/* Rule 3: Avoid deprecated and optimistic addresses */
+		ret = ipv6_saddr_preferred(score->addr_type) ||
+		      !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC));
+		break;
+#ifdef CONFIG_IPV6_MIP6
+	case IPV6_SADDR_RULE_HOA:
+	    {
+		/* Rule 4: Prefer home address */
+		int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA);
+		ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome;
+		break;
+	    }
+#endif
+	case IPV6_SADDR_RULE_OIF:
+		/* Rule 5: Prefer outgoing interface */
+		ret = (!dst->ifindex ||
+		       dst->ifindex == score->ifa->idev->dev->ifindex);
+		break;
+	case IPV6_SADDR_RULE_LABEL:
+		/* Rule 6: Prefer matching label */
+		ret = ipv6_addr_label(net,
+				      &score->ifa->addr, score->addr_type,
+				      score->ifa->idev->dev->ifindex) == dst->label;
+		break;
+#ifdef CONFIG_IPV6_PRIVACY
+	case IPV6_SADDR_RULE_PRIVACY:
+	    {
+		/* Rule 7: Prefer public address
+		 * Note: prefer temporary address if use_tempaddr >= 2
+		 */
+		int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
+				!!(dst->prefs & IPV6_PREFER_SRC_TMP) :
+				score->ifa->idev->cnf.use_tempaddr >= 2;
+		ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
+		break;
+	    }
+#endif
+	case IPV6_SADDR_RULE_ORCHID:
+		/* Rule 8-: Prefer ORCHID vs ORCHID or
+		 *	    non-ORCHID vs non-ORCHID
+		 */
+		ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
+			ipv6_addr_orchid(dst->addr));
+		break;
+	case IPV6_SADDR_RULE_PREFIX:
+		/* Rule 8: Use longest matching prefix */
+		score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr,
+						       dst->addr);
+		break;
+	default:
+		ret = 0;
+	}
+
+	/* Record the outcome so later calls for rule i hit the cache above. */
+	if (ret)
+		__set_bit(i, score->scorebits);
+	score->rule = i;
+out:
+	return ret;
+}
+
+/*
+ * Select a source address for destination @daddr.
+ *
+ * Walks every device in @net (restricted to @dst_dev for multicast and
+ * link-local-or-smaller scope destinations when @dst_dev is given) and
+ * scores each usable address rule by rule against the best one found so far.
+ *
+ * On success the chosen address is copied to @saddr and 0 is returned;
+ * if no candidate was found, -EADDRNOTAVAIL is returned.
+ */
+int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
+		       const struct in6_addr *daddr, unsigned int prefs,
+		       struct in6_addr *saddr)
+{
+	/* Two slots: one for the current candidate, one for the best so far. */
+	struct ipv6_saddr_score scores[2],
+				*score = &scores[0], *hiscore = &scores[1];
+	struct ipv6_saddr_dst dst;
+	struct net_device *dev;
+	int dst_type;
+
+	dst_type = __ipv6_addr_type(daddr);
+	dst.addr = daddr;
+	dst.ifindex = dst_dev ? dst_dev->ifindex : 0;
+	dst.scope = __ipv6_addr_src_scope(dst_type);
+	dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
+	dst.prefs = prefs;
+
+	hiscore->rule = -1;
+	hiscore->ifa = NULL;
+
+	rcu_read_lock();
+
+	for_each_netdev_rcu(net, dev) {
+		struct inet6_dev *idev;
+
+		/* Candidate Source Address (section 4)
+		 *  - multicast and link-local destination address,
+		 *    the set of candidate source address MUST only
+		 *    include addresses assigned to interfaces
+		 *    belonging to the same link as the outgoing
+		 *    interface.
+		 * (- For site-local destination addresses, the
+		 *    set of candidate source addresses MUST only
+		 *    include addresses assigned to interfaces
+		 *    belonging to the same site as the outgoing
+		 *    interface.)
+		 */
+		if (((dst_type & IPV6_ADDR_MULTICAST) ||
+		     dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
+		    dst.ifindex && dev->ifindex != dst.ifindex)
+			continue;
+
+		idev = __in6_dev_get(dev);
+		if (!idev)
+			continue;
+
+		read_lock_bh(&idev->lock);
+		/* score->ifa doubles as the list iterator. */
+		list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+			int i;
+
+			/*
+			 * - Tentative Address (RFC2462 section 5.4)
+			 *  - A tentative address is not considered
+			 *    "assigned to an interface" in the traditional
+			 *    sense, unless it is also flagged as optimistic.
+			 * - Candidate Source Address (section 4)
+			 *  - In any case, anycast addresses, multicast
+			 *    addresses, and the unspecified address MUST
+			 *    NOT be included in a candidate set.
+			 */
+			if ((score->ifa->flags & IFA_F_TENTATIVE) &&
+			    (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
+				continue;
+
+			score->addr_type = __ipv6_addr_type(&score->ifa->addr);
+
+			if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
+				     score->addr_type & IPV6_ADDR_MULTICAST)) {
+				LIMIT_NETDEBUG(KERN_DEBUG
+					       "ADDRCONF: unspecified / multicast address "
+					       "assigned as unicast address on %s",
+					       dev->name);
+				continue;
+			}
+
+			/* Fresh candidate: nothing evaluated or cached yet. */
+			score->rule = -1;
+			bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);
+
+			/* Compare rule by rule; the first difference decides. */
+			for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
+				int minihiscore, miniscore;
+
+				minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i);
+				miniscore = ipv6_get_saddr_eval(net, score, &dst, i);
+
+				if (minihiscore > miniscore) {
+					if (i == IPV6_SADDR_RULE_SCOPE &&
+					    score->scopedist > 0) {
+						/*
+						 * special case:
+						 * each remaining entry
+						 * has too small (not enough)
+						 * scope, because ifa entries
+						 * are sorted by their scope
+						 * values.
+						 */
+						goto try_nextdev;
+					}
+					break;
+				} else if (minihiscore < miniscore) {
+					/* New best: move the held reference to it. */
+					if (hiscore->ifa)
+						in6_ifa_put(hiscore->ifa);
+
+					in6_ifa_hold(score->ifa);
+
+					swap(hiscore, score);
+
+					/* restore our iterator */
+					score->ifa = hiscore->ifa;
+
+					break;
+				}
+			}
+		}
+try_nextdev:
+		read_unlock_bh(&idev->lock);
+	}
+	rcu_read_unlock();
+
+	if (!hiscore->ifa)
+		return -EADDRNOTAVAIL;
+
+	ipv6_addr_copy(saddr, &hiscore->ifa->addr);
+	/* Drop the reference taken when this entry became the best. */
+	in6_ifa_put(hiscore->ifa);
+	return 0;
+}
+EXPORT_SYMBOL(ipv6_dev_get_saddr);
+
+/*
+ * Copy into @addr the first link-scope address on @dev that has none of
+ * @banned_flags set.  Returns 0 on success, -EADDRNOTAVAIL when the device
+ * has no IPv6 state or no such address.
+ */
+int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
+		    unsigned char banned_flags)
+{
+	struct inet6_ifaddr *cur;
+	struct inet6_dev *idev;
+	int err = -EADDRNOTAVAIL;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		goto out_rcu;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(cur, &idev->addr_list, if_list) {
+		if (cur->scope != IFA_LINK)
+			continue;
+		if (cur->flags & banned_flags)
+			continue;
+
+		ipv6_addr_copy(addr, &cur->addr);
+		err = 0;
+		break;
+	}
+	read_unlock_bh(&idev->lock);
+
+out_rcu:
+	rcu_read_unlock();
+	return err;
+}
+
+/* Return the number of entries on @idev's address list. */
+static int ipv6_count_addresses(struct inet6_dev *idev)
+{
+	struct list_head *pos;
+	int total = 0;
+
+	read_lock_bh(&idev->lock);
+	list_for_each(pos, &idev->addr_list) {
+		total++;
+	}
+	read_unlock_bh(&idev->lock);
+
+	return total;
+}
+
+/*
+ * Return 1 if @addr is configured, and not tentative, in @net, else 0.
+ *
+ * With @dev given, an entry on another device only matches when its scope
+ * is neither link nor host and @strict is zero.
+ */
+int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
+		  struct net_device *dev, int strict)
+{
+	unsigned int hash = ipv6_addr_hash(addr);
+	struct inet6_ifaddr *ifp;
+	struct hlist_node *node;
+	int found = 0;
+
+	rcu_read_lock_bh();
+	hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) {
+		if (!net_eq(dev_net(ifp->idev->dev), net))
+			continue;
+		if (!ipv6_addr_equal(&ifp->addr, addr))
+			continue;
+		if (ifp->flags & IFA_F_TENTATIVE)
+			continue;
+
+		if (dev == NULL || ifp->idev->dev == dev ||
+		    !(ifp->scope & (IFA_LINK | IFA_HOST) || strict)) {
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock_bh();
+
+	return found;
+}
+EXPORT_SYMBOL(ipv6_chk_addr);
+
+/*
+ * Return true if @addr is already present in the address hash for @net,
+ * on @dev when @dev is given, or on any device otherwise.
+ */
+static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
+			       struct net_device *dev)
+{
+	struct hlist_node *pos;
+	struct inet6_ifaddr *cur;
+	unsigned int bucket = ipv6_addr_hash(addr);
+
+	hlist_for_each_entry(cur, pos, &inet6_addr_lst[bucket], addr_lst) {
+		if (!net_eq(dev_net(cur->idev->dev), net))
+			continue;
+		if (!ipv6_addr_equal(&cur->addr, addr))
+			continue;
+		if (dev != NULL && cur->idev->dev != dev)
+			continue;
+
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return non-zero if @addr falls inside the prefix of any address
+ * configured on @dev, 0 otherwise (including when @dev has no IPv6 state).
+ */
+int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
+{
+	struct inet6_ifaddr *cur;
+	struct inet6_dev *idev;
+	int onlink = 0;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		goto out_rcu;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(cur, &idev->addr_list, if_list) {
+		onlink = ipv6_prefix_equal(addr, &cur->addr, cur->prefix_len);
+		if (onlink != 0)
+			break;
+	}
+	read_unlock_bh(&idev->lock);
+
+out_rcu:
+	rcu_read_unlock();
+	return onlink;
+}
+
+EXPORT_SYMBOL(ipv6_chk_prefix);
+
+/*
+ * Look up @addr in @net and return the matching entry with a reference
+ * held, or NULL if none matches.
+ *
+ * With @dev given, an entry on another device only matches when its scope
+ * is neither link nor host and @strict is zero.
+ */
+struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
+				     struct net_device *dev, int strict)
+{
+	unsigned int hash = ipv6_addr_hash(addr);
+	struct inet6_ifaddr *result = NULL;
+	struct inet6_ifaddr *ifp;
+	struct hlist_node *node;
+
+	rcu_read_lock_bh();
+	hlist_for_each_entry_rcu_bh(ifp, node, &inet6_addr_lst[hash], addr_lst) {
+		if (!net_eq(dev_net(ifp->idev->dev), net))
+			continue;
+		if (!ipv6_addr_equal(&ifp->addr, addr))
+			continue;
+		/* Reject a foreign-device entry that is scoped or strict. */
+		if (dev != NULL && ifp->idev->dev != dev &&
+		    (ifp->scope & (IFA_LINK | IFA_HOST) || strict))
+			continue;
+
+		in6_ifa_hold(ifp);
+		result = ifp;
+		break;
+	}
+	rcu_read_unlock_bh();
+
+	return result;
+}
+
+/*
+ * Stop duplicate address detection on @ifp.
+ *
+ * Gets a referenced address and consumes that reference on every path:
+ *  - permanent address: kept, flagged tentative (plus IFA_F_DADFAILED when
+ *    @dad_failed is set), then the reference is dropped;
+ *  - temporary address (CONFIG_IPV6_PRIVACY): a replacement is requested
+ *    from its public address, if any, and the address is deleted;
+ *  - anything else: the address is deleted.
+ */
+
+static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
+{
+	if (ifp->flags&IFA_F_PERMANENT) {
+		spin_lock_bh(&ifp->lock);
+		addrconf_del_timer(ifp);
+		ifp->flags |= IFA_F_TENTATIVE;
+		if (dad_failed)
+			ifp->flags |= IFA_F_DADFAILED;
+		spin_unlock_bh(&ifp->lock);
+		if (dad_failed)
+			ipv6_ifa_notify(0, ifp);
+		in6_ifa_put(ifp);
+#ifdef CONFIG_IPV6_PRIVACY
+	} else if (ifp->flags&IFA_F_TEMPORARY) {
+		struct inet6_ifaddr *ifpub;
+		spin_lock_bh(&ifp->lock);
+		ifpub = ifp->ifpub;
+		if (ifpub) {
+			/* Pin the public address while ifp->lock is released. */
+			in6_ifa_hold(ifpub);
+			spin_unlock_bh(&ifp->lock);
+			ipv6_create_tempaddr(ifpub, ifp);
+			in6_ifa_put(ifpub);
+		} else {
+			spin_unlock_bh(&ifp->lock);
+		}
+		/* ipv6_del_addr() drops the caller's reference. */
+		ipv6_del_addr(ifp);
+#endif
+	} else
+		ipv6_del_addr(ifp);
+}
+
+/*
+ * Move @ifp from the DAD state to the POSTDAD state.
+ * Returns 0 if the transition happened, -ENOENT if @ifp was not in DAD.
+ */
+static int addrconf_dad_end(struct inet6_ifaddr *ifp)
+{
+	int err;
+
+	spin_lock(&ifp->state_lock);
+	if (ifp->state != INET6_IFADDR_STATE_DAD) {
+		err = -ENOENT;
+	} else {
+		ifp->state = INET6_IFADDR_STATE_POSTDAD;
+		err = 0;
+	}
+	spin_unlock(&ifp->state_lock);
+
+	return err;
+}
+
+/*
+ * Handle a duplicate address detection failure for @ifp.
+ *
+ * Consumes the caller's reference.  When accept_dad > 1 and the failed
+ * address is the EUI-64 based link-local address of the device, IPv6 is
+ * disabled on that device.
+ */
+void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+{
+	struct inet6_dev *idev = ifp->idev;
+	struct in6_addr lladdr;
+
+	if (addrconf_dad_end(ifp) != 0) {
+		/* Not in DAD state: just drop the reference. */
+		in6_ifa_put(ifp);
+		return;
+	}
+
+	if (net_ratelimit())
+		printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n",
+			idev->dev->name, &ifp->addr);
+
+	if (idev->cnf.accept_dad <= 1 || idev->cnf.disable_ipv6)
+		goto stop;
+
+	/* Build fe80::/64 + EUI-64 and compare it with the failed address. */
+	lladdr.s6_addr32[0] = htonl(0xfe800000);
+	lladdr.s6_addr32[1] = 0;
+
+	if (ipv6_generate_eui64(lladdr.s6_addr + 8, idev->dev) == 0 &&
+	    ipv6_addr_equal(&ifp->addr, &lladdr)) {
+		/* DAD failed for link-local based on MAC address */
+		idev->cnf.disable_ipv6 = 1;
+
+		printk(KERN_INFO "%s: IPv6 being disabled!\n",
+			idev->dev->name);
+	}
+
+stop:
+	addrconf_dad_stop(ifp, 1);
+}
+
+/*
+ * Join the solicited-node multicast group of @addr on @dev.
+ * Nothing is done for loopback and NOARP devices.
+ */
+
+void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
+{
+	if (!(dev->flags & (IFF_LOOPBACK | IFF_NOARP))) {
+		struct in6_addr group;
+
+		addrconf_addr_solict_mult(addr, &group);
+		ipv6_dev_mc_inc(dev, &group);
+	}
+}
+
+/*
+ * Leave the solicited-node multicast group of @addr on @idev's device.
+ * Nothing is done for loopback and NOARP devices.
+ */
+void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+	if (!(idev->dev->flags & (IFF_LOOPBACK | IFF_NOARP))) {
+		struct in6_addr group;
+
+		addrconf_addr_solict_mult(addr, &group);
+		__ipv6_dev_mc_dec(idev, &group);
+	}
+}
+
+/*
+ * Join the anycast address formed by @ifp's prefix (address masked to
+ * prefix_len) unless that prefix is the unspecified address.
+ */
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
+{
+	struct in6_addr prefix;
+
+	ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len);
+	if (!ipv6_addr_any(&prefix))
+		ipv6_dev_ac_inc(ifp->idev->dev, &prefix);
+}
+
+/*
+ * Leave the anycast address formed by @ifp's prefix (address masked to
+ * prefix_len) unless that prefix is the unspecified address.
+ */
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+{
+	struct in6_addr prefix;
+
+	ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len);
+	if (!ipv6_addr_any(&prefix))
+		__ipv6_dev_ac_dec(ifp->idev, &prefix);
+}
+
+/*
+ * Build an 8-byte interface identifier in @eui from a 6-byte hardware
+ * address.  Returns 0 on success, -1 if the device address is not
+ * ETH_ALEN bytes long.
+ */
+static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+{
+	if (dev->addr_len != ETH_ALEN)
+		return -1;
+
+	/* First three and last three address bytes surround eui[3..4]. */
+	memcpy(eui, dev->dev_addr, 3);
+	memcpy(eui + 5, dev->dev_addr + 3, 3);
+
+	if (!dev->dev_id) {
+		/* Regular case: insert 0xFFFE and flip the "u" bit. */
+		eui[3] = 0xFF;
+		eui[4] = 0xFE;
+		eui[0] ^= 2;
+		return 0;
+	}
+
+	/*
+	 * zSeries OSA network cards may be shared between several OS
+	 * instances while owning a single MAC address, which would give
+	 * every instance the same IPv6 interface identifier.
+	 *
+	 * The driver for these cards can supply a unique 16-bit
+	 * identifier per instance.  It takes the place of 0xFFFE in the
+	 * interface identifier, and the "u" bit is left untouched, so
+	 * the resulting identifier has local scope according to RFC2373.
+	 */
+	eui[3] = (dev->dev_id >> 8) & 0xFF;
+	eui[4] = dev->dev_id & 0xFF;
+	return 0;
+}
+
+/*
+ * Build an interface identifier for an ARCnet device: seven zero bytes
+ * followed by the first byte of the hardware address.
+ * Returns 0 on success, -1 if the address length is not ARCNET_ALEN.
+ */
+static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
+{
+	/* XXX: inherit EUI-64 from other interface -- yoshfuji */
+	if (dev->addr_len == ARCNET_ALEN) {
+		memset(eui, 0, 7);
+		eui[7] = ((u8 *)dev->dev_addr)[0];
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * Build an interface identifier from bytes 12..19 of an InfiniBand
+ * hardware address, with bit 0x02 of the first byte forced on.
+ * Returns 0 on success, -1 if the address length is not INFINIBAND_ALEN.
+ */
+static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
+{
+	if (dev->addr_len == INFINIBAND_ALEN) {
+		memcpy(eui, dev->dev_addr + 12, 8);
+		eui[0] |= 2;
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * Build an ISATAP interface identifier (xx-00-5E-FE + IPv4 address) in
+ * @eui.  The first byte is 0x00 for the special IPv4 ranges tested below
+ * and 0x02 otherwise.  Returns 0 on success, -1 for a zero address.
+ */
+static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
+{
+	if (addr == 0)
+		return -1;
+
+	if (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) ||
+	    ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) ||
+	    ipv4_is_private_172(addr) || ipv4_is_test_192(addr) ||
+	    ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) ||
+	    ipv4_is_test_198(addr) || ipv4_is_multicast(addr) ||
+	    ipv4_is_lbcast(addr))
+		eui[0] = 0x00;
+	else
+		eui[0] = 0x02;
+
+	eui[1] = 0;
+	eui[2] = 0x5E;
+	eui[3] = 0xFE;
+	memcpy(eui + 4, &addr, 4);
+
+	return 0;
+}
+
+/*
+ * Interface identifier for a SIT device: only ISATAP devices get one,
+ * derived from the IPv4 address stored in dev_addr.  Returns -1 otherwise.
+ */
+static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
+{
+	if (!(dev->priv_flags & IFF_ISATAP))
+		return -1;
+
+	return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
+}
+
+/*
+ * Fill @eui with an interface identifier derived from @dev's hardware
+ * address, dispatching on the device type.  Returns the helper's result,
+ * or -1 for device types that are not handled.
+ */
+static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
+{
+	int err;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802_TR:
+		err = addrconf_ifid_eui48(eui, dev);
+		break;
+	case ARPHRD_ARCNET:
+		err = addrconf_ifid_arcnet(eui, dev);
+		break;
+	case ARPHRD_INFINIBAND:
+		err = addrconf_ifid_infiniband(eui, dev);
+		break;
+	case ARPHRD_SIT:
+		err = addrconf_ifid_sit(eui, dev);
+		break;
+	default:
+		err = -1;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Copy into @eui the interface identifier (last 8 bytes) of the first
+ * non-tentative link-scope address on @idev.
+ * Returns 0 on success, -1 if there is no such address.
+ */
+static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
+{
+	struct inet6_ifaddr *cur;
+	int err = -1;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(cur, &idev->addr_list, if_list) {
+		if (cur->scope != IFA_LINK)
+			continue;
+		if (cur->flags & IFA_F_TENTATIVE)
+			continue;
+
+		memcpy(eui, cur->addr.s6_addr + 8, 8);
+		err = 0;
+		break;
+	}
+	read_unlock_bh(&idev->lock);
+
+	return err;
+}
+
+#ifdef CONFIG_IPV6_PRIVACY
+/*
+ * (Re)generate the randomized interface identifier in idev->rndid
+ * (RFC 3041 3.2, 3.5).  Random values are drawn until one passes the
+ * checks below.  Always returns 0.
+ */
+static int __ipv6_regen_rndid(struct inet6_dev *idev)
+{
+	for (;;) {
+		get_random_bytes(idev->rndid, sizeof(idev->rndid));
+		idev->rndid[0] &= ~0x02;
+
+		/*
+		 * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
+		 * reject identifiers that are inappropriate:
+		 *
+		 *  - Reserved subnet anycast (RFC 2526)
+		 *	11111101 11....11 1xxxxxxx
+		 *  - ISATAP (RFC4214) 6.1
+		 *	00-00-5E-FE-xx-xx-xx-xx
+		 *  - value 0
+		 *  - XXX: already assigned to an address on the device
+		 */
+		if (idev->rndid[0] == 0xfd &&
+		    (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff &&
+		    (idev->rndid[7]&0x80))
+			continue;
+
+		/* The remaining checks only apply when bytes 0 and 1 are zero. */
+		if ((idev->rndid[0]|idev->rndid[1]) != 0)
+			break;
+
+		if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
+			continue;
+		if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
+			continue;
+
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Regenerate idev->rndid and re-arm idev->regen_timer.
+ *
+ * @data is the struct inet6_dev pointer cast to unsigned long.
+ * NOTE(review): presumably the handler of idev->regen_timer, invoked with a
+ * device reference held on behalf of the timer - confirm at the timer setup.
+ * One reference on idev is dropped before returning.
+ */
+static void ipv6_regen_rndid(unsigned long data)
+{
+	struct inet6_dev *idev = (struct inet6_dev *) data;
+	unsigned long expires;
+
+	rcu_read_lock_bh();
+	write_lock_bh(&idev->lock);
+
+	if (idev->dead)
+		goto out;
+
+	if (__ipv6_regen_rndid(idev) < 0)
+		goto out;
+
+	/*
+	 * Next regeneration: temp_prefered_lft from now, minus the time
+	 * allowed for regen_max_retry rounds of DAD and minus
+	 * max_desync_factor.
+	 */
+	expires = jiffies +
+		idev->cnf.temp_prefered_lft * HZ -
+		idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time -
+		idev->cnf.max_desync_factor * HZ;
+	if (time_before(expires, jiffies)) {
+		printk(KERN_WARNING
+			"ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n",
+			idev->dev->name);
+		goto out;
+	}
+
+	/* Take a device reference only when mod_timer() returns 0. */
+	if (!mod_timer(&idev->regen_timer, expires))
+		in6_dev_hold(idev);
+
+out:
+	write_unlock_bh(&idev->lock);
+	rcu_read_unlock_bh();
+	in6_dev_put(idev);
+}
+
+/*
+ * Regenerate idev->rndid only if @tmpaddr is given and its interface
+ * identifier (last 8 bytes) equals the current rndid.
+ * Returns 0 when nothing was done, else the result of the regeneration.
+ */
+static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) {
+	if (!tmpaddr)
+		return 0;
+	if (memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) != 0)
+		return 0;
+
+	return __ipv6_regen_rndid(idev);
+}
+#endif
+
+/*
+ *	Add a route for prefix @pfx/@plen through @dev to the prefix table,
+ *	with the given @expires value and extra route @flags.
+ */
+
+static void
+addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
+		      unsigned long expires, u32 flags)
+{
+	struct fib6_config route = {
+		.fc_table = RT6_TABLE_PREFIX,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_expires = expires,
+		.fc_dst = *pfx,
+		.fc_dst_len = plen,
+		.fc_flags = RTF_UP | flags,
+		.fc_nlinfo.nl_net = dev_net(dev),
+		.fc_protocol = RTPROT_KERNEL,
+	};
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+	/*
+	 * Avoid pointless cloning on point-to-point SIT.  This is done
+	 * here on the expectation that the whole class of non-broadcast
+	 * devices does not need cloning.
+	 */
+	if ((dev->flags & IFF_POINTOPOINT) && dev->type == ARPHRD_SIT)
+		route.fc_flags |= RTF_NONEXTHOP;
+#endif
+
+	ip6_route_add(&route);
+}
+
+/* Add the "default" multicast route (ff00::/8) through @dev. */
+
+static void addrconf_add_mroute(struct net_device *dev)
+{
+	struct fib6_config mroute = {
+		.fc_table = RT6_TABLE_LOCAL,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_dst_len = 8,
+		.fc_flags = RTF_UP,
+		.fc_nlinfo.nl_net = dev_net(dev),
+	};
+
+	/* Destination ff00::, matched on the first 8 bits. */
+	ipv6_addr_set(&mroute.fc_dst, htonl(0xFF000000), 0, 0, 0);
+	ip6_route_add(&mroute);
+}
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+/*
+ * Add the ::/96 route ("::d.d.d.d" addresses) through SIT device @dev.
+ * The destination is left zero-initialized; only the prefix length is set.
+ */
+static void sit_route_add(struct net_device *dev)
+{
+	struct fib6_config sit_route = {
+		.fc_table = RT6_TABLE_MAIN,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		/* prefix length - 96 bits "::d.d.d.d" */
+		.fc_dst_len = 96,
+		.fc_flags = RTF_UP | RTF_NONEXTHOP,
+		.fc_nlinfo.nl_net = dev_net(dev),
+	};
+
+	ip6_route_add(&sit_route);
+}
+#endif
+
+/* Add the link-local prefix route fe80::/64 through @dev. */
+static void addrconf_add_lroute(struct net_device *dev)
+{
+	struct in6_addr ll_prefix;
+
+	ipv6_addr_set(&ll_prefix, htonl(0xFE800000), 0, 0, 0);
+	addrconf_prefix_route(&ll_prefix, 64, dev, 0, 0);
+}
+
+/*
+ * Find the IPv6 state for @dev and install its default multicast and
+ * link-local routes.  Must be called with RTNL held.
+ *
+ * Returns the inet6_dev, ERR_PTR(-ENOBUFS) if none could be found, or
+ * ERR_PTR(-EACCES) if IPv6 is disabled on the device.
+ */
+static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = ipv6_find_idev(dev);
+	if (!idev) {
+		idev = ERR_PTR(-ENOBUFS);
+	} else if (idev->cnf.disable_ipv6) {
+		idev = ERR_PTR(-EACCES);
+	} else {
+		/* Add default multicast route */
+		addrconf_add_mroute(dev);
+
+		/* Add link local route */
+		addrconf_add_lroute(dev);
+	}
+
+	return idev;
+}
+
+void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
+{
+	struct prefix_info *pinfo;
+	__u32 valid_lft;
+	__u32 prefered_lft;
+	int addr_type;
+	struct inet6_dev *in6_dev;
+	struct net *net = dev_net(dev);
+
+	pinfo = (struct prefix_info *) opt;
+
+	if (len < sizeof(struct prefix_info)) {
+		ADBG(("addrconf: prefix option too short\n"));
+		return;
+	}
+
+	/*
+	 *	Validation checks ([ADDRCONF], page 19)
+	 */
+
+	addr_type = ipv6_addr_type(&pinfo->prefix);
+
+	if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL))
+		return;
+
+	valid_lft = ntohl(pinfo->valid);
+	prefered_lft = ntohl(pinfo->prefered);
+
+	if (prefered_lft > valid_lft) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n");
+		return;
+	}
+
+	in6_dev = in6_dev_get(dev);
+
+	if (in6_dev == NULL) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name);
+		return;
+	}
+
+	/*
+	 *	Two things going on here:
+	 *	1) Add routes for on-link prefixes
+	 *	2) Configure prefixes with the auto flag set
+	 */
+
+	if (pinfo->onlink) {
+		struct rt6_info *rt;
+		unsigned long rt_expires;
+
+		/* Avoid arithmetic overflow. Really, we could
+		 * save rt_expires in seconds, likely valid_lft,
+		 * but it would require division in fib gc, that it
+		 * not good.
+		 */
+		if (HZ > USER_HZ)
+			rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
+		else
+			rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);
+
+		if (addrconf_finite_timeout(rt_expires))
+			rt_expires *= HZ;
+
+		rt = rt6_lookup(net, &pinfo->prefix, NULL,
+				dev->ifindex, 1);
+
+		if (rt && addrconf_is_prefix_route(rt)) {
+			/* Autoconf prefix route */
+			if (valid_lft == 0) {
+				ip6_del_rt(rt);
+				rt = NULL;
+			} else if (addrconf_finite_timeout(rt_expires)) {
+				/* not infinity */
+				rt->rt6i_expires = jiffies + rt_expires;
+				rt->rt6i_flags |= RTF_EXPIRES;
+			} else {
+				rt->rt6i_flags &= ~RTF_EXPIRES;
+				rt->rt6i_expires = 0;
+			}
+		} else if (valid_lft) {
+			clock_t expires = 0;
+			int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
+			if (addrconf_finite_timeout(rt_expires)) {
+				/* not infinity */
+				flags |= RTF_EXPIRES;
+				expires = jiffies_to_clock_t(rt_expires);
+			}
+			addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
+					      dev, expires, flags);
+		}
+		if (rt)
+			dst_release(&rt->dst);
+	}
+
+	/* Try to figure out our local address for this prefix */
+
+	if (pinfo->autoconf && in6_dev->cnf.autoconf) {
+		struct inet6_ifaddr * ifp;
+		struct in6_addr addr;
+		int create = 0, update_lft = 0;
+
+		if (pinfo->prefix_len == 64) {
+			memcpy(&addr, &pinfo->prefix, 8);
+			if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
+			    ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
+				in6_dev_put(in6_dev);
+				return;
+			}
+			goto ok;
+		}
+		if (net_ratelimit())
+			printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n",
+			       pinfo->prefix_len);
+		in6_dev_put(in6_dev);
+		return;
+
+ok:
+
+		ifp = ipv6_get_ifaddr(net, &addr, dev, 1);
+
+		if (ifp == NULL && valid_lft) {
+			int max_addresses = in6_dev->cnf.max_addresses;
+			u32 addr_flags = 0;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+			if (in6_dev->cnf.optimistic_dad &&
+			    !net->ipv6.devconf_all->forwarding)
+				addr_flags = IFA_F_OPTIMISTIC;
+#endif
+
+			/* Do not allow to create too much of autoconfigured
+			 * addresses; this would be too easy way to crash kernel.
+			 */
+			if (!max_addresses ||
+			    ipv6_count_addresses(in6_dev) < max_addresses)
+				ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len,
+						    addr_type&IPV6_ADDR_SCOPE_MASK,
+						    addr_flags);
+
+			if (!ifp || IS_ERR(ifp)) {
+				in6_dev_put(in6_dev);
+				return;
+			}
+
+			update_lft = create = 1;
+			ifp->cstamp = jiffies;
+			addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT);
+		}
+
+		if (ifp) {
+			int flags;
+			unsigned long now;
+#ifdef CONFIG_IPV6_PRIVACY
+			struct inet6_ifaddr *ift;
+#endif
+			u32 stored_lft;
+
+			/* update lifetime (RFC2462 5.5.3 e) */
+			spin_lock(&ifp->lock);
+			now = jiffies;
+			if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
+				stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
+			else
+				stored_lft = 0;
+			if (!update_lft && stored_lft) {
+				if (valid_lft > MIN_VALID_LIFETIME ||
+				    valid_lft > stored_lft)
+					update_lft = 1;
+				else if (stored_lft <= MIN_VALID_LIFETIME) {
+					/* valid_lft <= stored_lft is always true */
+					/*
+					 * RFC 4862 Section 5.5.3e:
+					 * "Note that the preferred lifetime of
+					 *  the corresponding address is always
+					 *  reset to the Preferred Lifetime in
+					 *  the received Prefix Information
+					 *  option, regardless of whether the
+					 *  valid lifetime is also reset or
+					 *  ignored."
+					 *
+					 *  So if the preferred lifetime in
+					 *  this advertisement is different
+					 *  than what we have stored, but the
+					 *  valid lifetime is invalid, just
+					 *  reset prefered_lft.
+					 *
+					 *  We must set the valid lifetime
+					 *  to the stored lifetime since we'll
+					 *  be updating the timestamp below,
+					 *  else we'll set it back to the
+					 *  minimum.
+					 */
+					if (prefered_lft != ifp->prefered_lft) {
+						valid_lft = stored_lft;
+						update_lft = 1;
+					}
+				} else {
+					valid_lft = MIN_VALID_LIFETIME;
+					if (valid_lft < prefered_lft)
+						prefered_lft = valid_lft;
+					update_lft = 1;
+				}
+			}
+
+			if (update_lft) {
+				ifp->valid_lft = valid_lft;
+				ifp->prefered_lft = prefered_lft;
+				ifp->tstamp = now;
+				flags = ifp->flags;
+				ifp->flags &= ~IFA_F_DEPRECATED;
+				spin_unlock(&ifp->lock);
+
+				if (!(flags&IFA_F_TENTATIVE))
+					ipv6_ifa_notify(0, ifp);
+			} else
+				spin_unlock(&ifp->lock);
+
+#ifdef CONFIG_IPV6_PRIVACY
+			read_lock_bh(&in6_dev->lock);
+			/* update all temporary addresses in the list */
+			list_for_each_entry(ift, &in6_dev->tempaddr_list,
+					    tmp_list) {
+				int age, max_valid, max_prefered;
+
+				if (ifp != ift->ifpub)
+					continue;
+
+				/*
+				 * RFC 4941 section 3.3:
+				 * If a received option will extend the lifetime
+				 * of a public address, the lifetimes of
+				 * temporary addresses should be extended,
+				 * subject to the overall constraint that no
+				 * temporary addresses should ever remain
+				 * "valid" or "preferred" for a time longer than
+				 * (TEMP_VALID_LIFETIME) or
+				 * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR),
+				 * respectively.
+				 */
+				age = (now - ift->cstamp) / HZ;
+				max_valid = in6_dev->cnf.temp_valid_lft - age;
+				if (max_valid < 0)
+					max_valid = 0;
+
+				max_prefered = in6_dev->cnf.temp_prefered_lft -
+					       in6_dev->cnf.max_desync_factor -
+					       age;
+				if (max_prefered < 0)
+					max_prefered = 0;
+
+				if (valid_lft > max_valid)
+					valid_lft = max_valid;
+
+				if (prefered_lft > max_prefered)
+					prefered_lft = max_prefered;
+
+				spin_lock(&ift->lock);
+				flags = ift->flags;
+				ift->valid_lft = valid_lft;
+				ift->prefered_lft = prefered_lft;
+				ift->tstamp = now;
+				if (prefered_lft > 0)
+					ift->flags &= ~IFA_F_DEPRECATED;
+
+				spin_unlock(&ift->lock);
+				if (!(flags&IFA_F_TENTATIVE))
+					ipv6_ifa_notify(0, ift);
+			}
+
+			if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) {
+				/*
+				 * When a new public address is created as
+				 * described in [ADDRCONF], also create a new
+				 * temporary address. Also create a temporary
+				 * address if it's enabled but no temporary
+				 * address currently exists.
+				 */
+				read_unlock_bh(&in6_dev->lock);
+				ipv6_create_tempaddr(ifp, NULL);
+			} else {
+				read_unlock_bh(&in6_dev->lock);
+			}
+#endif
+			in6_ifa_put(ifp);
+			addrconf_verify(0);
+		}
+	}
+	inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
+	in6_dev_put(in6_dev);
+}
+
+/*
+ *	Set destination address.
+ *	Special case for SIT interfaces where we create a new "virtual"
+ *	device.
+ *
+ *	Copies a struct in6_ifreq from user space and, under the RTNL lock,
+ *	looks up the device named by ifr6_ifindex.  When SIT support is
+ *	configured and the device is of type ARPHRD_SIT, the IPv4 address
+ *	embedded in an IPv4-compatible IPv6 address is used as the remote
+ *	end of a new tunnel requested through the device's
+ *	ndo_do_ioctl(SIOCADDTUNNEL) hook; on success the device whose name
+ *	is found in p.name afterwards is opened.
+ *
+ *	Returns 0 on success or a negative errno:
+ *	  -EFAULT         the request could not be copied from user space
+ *	  -ENODEV         no device with that ifindex; this is also the value
+ *	                  left in err when the device is not a SIT device
+ *	  -EADDRNOTAVAIL  the address is not IPv4-compatible
+ *	  -EOPNOTSUPP     the device has no ndo_do_ioctl hook
+ *	  -ENOBUFS        no device named p.name exists after the ioctl
+ *	or whatever ndo_do_ioctl()/dev_open() returned.
+ */
+int addrconf_set_dstaddr(struct net *net, void __user *arg)
+{
+	struct in6_ifreq ireq;
+	struct net_device *dev;
+	/* this initial value is always overwritten before it can be returned */
+	int err = -EINVAL;
+
+	rtnl_lock();
+
+	err = -EFAULT;
+	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+		goto err_exit;
+
+	dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
+
+	err = -ENODEV;
+	if (dev == NULL)
+		goto err_exit;
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+	if (dev->type == ARPHRD_SIT) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+		struct ifreq ifr;
+		struct ip_tunnel_parm p;
+
+		err = -EADDRNOTAVAIL;
+		if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4))
+			goto err_exit;
+
+		/* describe an IPv6-in-IPv4 tunnel to the embedded v4 address */
+		memset(&p, 0, sizeof(p));
+		p.iph.daddr = ireq.ifr6_addr.s6_addr32[3];
+		p.iph.saddr = 0;
+		p.iph.version = 4;
+		p.iph.ihl = 5;
+		p.iph.protocol = IPPROTO_IPV6;
+		p.iph.ttl = 64;
+		/* only ifru_data is set; the rest of ifr stays uninitialised */
+		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+
+		if (ops->ndo_do_ioctl) {
+			/*
+			 * p lives on the kernel stack but is passed through a
+			 * __user pointer, so the address limit is lifted for
+			 * the duration of the call.
+			 */
+			mm_segment_t oldfs = get_fs();
+
+			set_fs(KERNEL_DS);
+			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+			set_fs(oldfs);
+		} else
+			err = -EOPNOTSUPP;
+
+		if (err == 0) {
+			/* p.name is presumably filled in by the ioctl -- confirm */
+			err = -ENOBUFS;
+			dev = __dev_get_by_name(net, p.name);
+			if (!dev)
+				goto err_exit;
+			err = dev_open(dev);
+		}
+	}
+#endif
+
+err_exit:
+	rtnl_unlock();
+	return err;
+}
+
+/*
+ *	Manual configuration of address on an interface
+ *
+ *	Adds the address @pfx/@plen to the device with index @ifindex in
+ *	@net.  The caller must hold the RTNL lock (ASSERT_RTNL).
+ *
+ *	@ifa_flags:    IFA_F_* flags for the new address; IFA_F_PERMANENT is
+ *	               added when the valid lifetime is not finite and
+ *	               IFA_F_DEPRECATED when the preferred lifetime is zero.
+ *	@prefered_lft: preferred lifetime, must not exceed @valid_lft.
+ *	@valid_lft:    valid lifetime, must be non-zero.
+ *
+ *	On success the lifetimes are stored, a prefix route is installed
+ *	(expiring with the address when the valid lifetime is finite), DAD
+ *	is started and the address verification pass is re-run.
+ *
+ *	Returns 0 on success, -EINVAL for a bad prefix length or lifetimes,
+ *	-ENODEV if the device does not exist, or the error encoded in the
+ *	pointer returned by addrconf_add_dev()/ipv6_add_addr().
+ */
+static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx,
+			  unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
+			  __u32 valid_lft)
+{
+	struct inet6_ifaddr *ifp;
+	struct inet6_dev *idev;
+	struct net_device *dev;
+	int scope;
+	u32 flags;
+	clock_t expires;
+	unsigned long timeout;
+
+	ASSERT_RTNL();
+
+	if (plen > 128)
+		return -EINVAL;
+
+	/* check the lifetime */
+	if (!valid_lft || prefered_lft > valid_lft)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	idev = addrconf_add_dev(dev);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	scope = ipv6_addr_scope(pfx);
+
+	/* valid lifetime: finite => expiring route, otherwise permanent */
+	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		expires = jiffies_to_clock_t(timeout * HZ);
+		valid_lft = timeout;
+		flags = RTF_EXPIRES;
+	} else {
+		expires = 0;
+		flags = 0;
+		ifa_flags |= IFA_F_PERMANENT;
+	}
+
+	/* preferred lifetime: a zero timeout means already deprecated */
+	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		if (timeout == 0)
+			ifa_flags |= IFA_F_DEPRECATED;
+		prefered_lft = timeout;
+	}
+
+	ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags);
+
+	if (!IS_ERR(ifp)) {
+		spin_lock_bh(&ifp->lock);
+		ifp->valid_lft = valid_lft;
+		ifp->prefered_lft = prefered_lft;
+		ifp->tstamp = jiffies;
+		spin_unlock_bh(&ifp->lock);
+
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
+				      expires, flags);
+		/*
+		 * Note that section 3.1 of RFC 4429 indicates
+		 * that the Optimistic flag should not be set for
+		 * manually configured addresses
+		 */
+		addrconf_dad_start(ifp, 0);
+		/* drop the reference obtained from ipv6_add_addr() */
+		in6_ifa_put(ifp);
+		addrconf_verify(0);
+		return 0;
+	}
+
+	return PTR_ERR(ifp);
+}
+
+/*
+ *	Manual removal of an address from an interface
+ *
+ *	Searches the address list of the device with index @ifindex in @net
+ *	for an entry matching both @pfx and @plen and deletes the first match.
+ *	If the interface has no addresses left afterwards, IPv6 is shut down
+ *	on it via addrconf_ifdown(dev, 1).
+ *
+ *	Returns 0 on success, -EINVAL if @plen > 128, -ENODEV if the device
+ *	does not exist, -ENXIO if it has no inet6_dev, or -EADDRNOTAVAIL if
+ *	no matching address is configured.
+ */
+static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *pfx,
+			  unsigned int plen)
+{
+	struct inet6_ifaddr *ifp;
+	struct inet6_dev *idev;
+	struct net_device *dev;
+
+	if (plen > 128)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	if ((idev = __in6_dev_get(dev)) == NULL)
+		return -ENXIO;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(ifp, &idev->addr_list, if_list) {
+		if (ifp->prefix_len == plen &&
+		    ipv6_addr_equal(pfx, &ifp->addr)) {
+			/*
+			 * Pin the entry, then drop the list lock before
+			 * deleting; the loop is left right after, so the
+			 * iterator is never used on the modified list.
+			 * NOTE(review): the reference taken here is
+			 * presumably consumed by ipv6_del_addr() -- confirm.
+			 */
+			in6_ifa_hold(ifp);
+			read_unlock_bh(&idev->lock);
+
+			ipv6_del_addr(ifp);
+
+			/* If the last address is deleted administratively,
+			   disable IPv6 on this interface.
+			 */
+			if (list_empty(&idev->addr_list))
+				addrconf_ifdown(idev->dev, 1);
+			return 0;
+		}
+	}
+	read_unlock_bh(&idev->lock);
+	return -EADDRNOTAVAIL;
+}
+
+
+/*
+ * ioctl entry point: add the address described by the user-space
+ * struct in6_ifreq at @arg as a permanent address with infinite lifetimes.
+ *
+ * Returns -EPERM without CAP_NET_ADMIN, -EFAULT if @arg cannot be read,
+ * otherwise the result of inet6_addr_add() (called under the RTNL lock).
+ */
+int addrconf_add_ifaddr(struct net *net, void __user *arg)
+{
+	struct in6_ifreq req;
+	int rc;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	rtnl_lock();
+	rc = inet6_addr_add(net, req.ifr6_ifindex, &req.ifr6_addr,
+			    req.ifr6_prefixlen, IFA_F_PERMANENT,
+			    INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+	rtnl_unlock();
+
+	return rc;
+}
+
+/*
+ * ioctl entry point: remove the address described by the user-space
+ * struct in6_ifreq at @arg.
+ *
+ * Returns -EPERM without CAP_NET_ADMIN, -EFAULT if @arg cannot be read,
+ * otherwise the result of inet6_addr_del() (called under the RTNL lock).
+ */
+int addrconf_del_ifaddr(struct net *net, void __user *arg)
+{
+	struct in6_ifreq req;
+	int rc;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	rtnl_lock();
+	rc = inet6_addr_del(net, req.ifr6_ifindex, &req.ifr6_addr,
+			    req.ifr6_prefixlen);
+	rtnl_unlock();
+
+	return rc;
+}
+
+/*
+ * Add @addr/@plen with the given @scope to @idev as a permanent address
+ * that is immediately usable: IFA_F_TENTATIVE is cleared right away and
+ * RTM_NEWADDR is announced.  Failures of ipv6_add_addr() are ignored.
+ */
+static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
+		     int plen, int scope)
+{
+	struct inet6_ifaddr *ifa;
+
+	ifa = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT);
+	if (IS_ERR(ifa))
+		return;
+
+	spin_lock_bh(&ifa->lock);
+	ifa->flags &= ~IFA_F_TENTATIVE;
+	spin_unlock_bh(&ifa->lock);
+
+	ipv6_ifa_notify(RTM_NEWADDR, ifa);
+	in6_ifa_put(ifa);
+}
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+/*
+ * Derive IPv6 addresses for a SIT tunnel from IPv4 addresses.
+ *
+ * If the tunnel device's own dev_addr holds a non-zero IPv4 address, a
+ * single /128 is added for it.  Otherwise one address is added for every
+ * IPv4 address (other than link-scope ones) on every device that is up in
+ * the tunnel's namespace.  Point-to-point tunnels get fe80::/64 based
+ * link-scope addresses, other tunnels get IPv4-compatible /96 addresses.
+ *
+ * Must be called with the RTNL lock held.
+ */
+static void sit_add_v4_addrs(struct inet6_dev *idev)
+{
+	struct net *net = dev_net(idev->dev);
+	struct net_device *dev;
+	struct in6_addr addr;
+	int scope;
+
+	ASSERT_RTNL();
+
+	memset(&addr, 0, sizeof(addr));
+	memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);
+
+	if (idev->dev->flags & IFF_POINTOPOINT) {
+		addr.s6_addr32[0] = htonl(0xfe800000);
+		scope = IFA_LINK;
+	} else {
+		scope = IPV6_ADDR_COMPATv4;
+	}
+
+	/* the tunnel has a local IPv4 address of its own: use just that */
+	if (addr.s6_addr32[3]) {
+		add_addr(idev, &addr, 128, scope);
+		return;
+	}
+
+	for_each_netdev(net, dev) {
+		struct in_device *in_dev = __in_dev_get_rtnl(dev);
+		struct in_ifaddr *ifa;
+		/* accumulates IFA_HOST across all addresses of this device */
+		int flag = scope;
+
+		if (!in_dev || !(dev->flags & IFF_UP))
+			continue;
+
+		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+			int p2p = idev->dev->flags & IFF_POINTOPOINT;
+
+			addr.s6_addr32[3] = ifa->ifa_local;
+
+			if (ifa->ifa_scope == RT_SCOPE_LINK)
+				continue;
+
+			if (ifa->ifa_scope >= RT_SCOPE_HOST) {
+				if (p2p)
+					continue;
+				flag |= IFA_HOST;
+			}
+
+			add_addr(idev, &addr, p2p ? 64 : 96, flag);
+		}
+	}
+}
+#endif
+
+/*
+ * Configure the loopback device: make sure it has an inet6_dev and add
+ * the host-scope loopback address ::1/128.  Requires the RTNL lock.
+ */
+static void init_loopback(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = ipv6_find_idev(dev);
+	if (!idev) {
+		printk(KERN_DEBUG "init loopback: add_dev failed\n");
+		return;
+	}
+
+	/* ::1 */
+	add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
+}
+
+/*
+ * Add @addr as a permanent link-local /64 on @idev, install the matching
+ * prefix route and start duplicate address detection.  With optimistic
+ * DAD configured and enabled for the device (and forwarding off for the
+ * namespace) the address is marked IFA_F_OPTIMISTIC.  Failures of
+ * ipv6_add_addr() are silently ignored.
+ */
+static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+	u32 ll_flags = IFA_F_PERMANENT;
+	struct inet6_ifaddr *ifa;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	if (idev->cnf.optimistic_dad &&
+	    !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
+		ll_flags |= IFA_F_OPTIMISTIC;
+#endif
+
+	ifa = ipv6_add_addr(idev, addr, 64, IFA_LINK, ll_flags);
+	if (IS_ERR(ifa))
+		return;
+
+	addrconf_prefix_route(&ifa->addr, ifa->prefix_len, idev->dev, 0, 0);
+	addrconf_dad_start(ifa, 0);
+	in6_ifa_put(ifa);
+}
+
+/*
+ * Autoconfigure a link-local address on @dev.  Only Ethernet, FDDI,
+ * Token Ring, ARCnet and InfiniBand devices are handled; everything else
+ * is left alone.  The address is fe80::/64 plus the EUI-64 interface
+ * identifier, and is only added if that identifier can be generated.
+ * Requires the RTNL lock.
+ */
+static void addrconf_dev_config(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+	struct in6_addr lladdr;
+
+	ASSERT_RTNL();
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802_TR:
+	case ARPHRD_ARCNET:
+	case ARPHRD_INFINIBAND:
+		break;
+	default:
+		/* Alas, we support only Ethernet autoconfiguration. */
+		return;
+	}
+
+	idev = addrconf_add_dev(dev);
+	if (IS_ERR(idev))
+		return;
+
+	ipv6_addr_set(&lladdr, htonl(0xFE800000), 0, 0, 0);
+
+	if (ipv6_generate_eui64(lladdr.s6_addr + 8, dev) != 0)
+		return;
+
+	addrconf_add_linklocal(idev, &lladdr);
+}
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+/*
+ * Configure a SIT tunnel device.
+ *
+ * ISATAP tunnels get the fe80::/64 prefix route and, if an interface
+ * identifier can be generated, a link-local address.  Other tunnels are
+ * given addresses derived from our IPv4 addresses; point-to-point tunnels
+ * then get the multicast and link-local routes, all others the SIT route.
+ * Requires the RTNL lock.
+ */
+static void addrconf_sit_config(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = ipv6_find_idev(dev);
+	if (!idev) {
+		printk(KERN_DEBUG "init sit: add_dev failed\n");
+		return;
+	}
+
+	if (dev->priv_flags & IFF_ISATAP) {
+		struct in6_addr lladdr;
+
+		ipv6_addr_set(&lladdr, htonl(0xFE800000), 0, 0, 0);
+		addrconf_prefix_route(&lladdr, 64, dev, 0, 0);
+		if (ipv6_generate_eui64(lladdr.s6_addr + 8, dev) == 0)
+			addrconf_add_linklocal(idev, &lladdr);
+		return;
+	}
+
+	/*
+	 * Configure the tunnel with one of our IPv4
+	 * addresses... we should configure all of
+	 * our v4 addrs in the tunnel
+	 */
+	sit_add_v4_addrs(idev);
+
+	if (!(dev->flags & IFF_POINTOPOINT)) {
+		sit_route_add(dev);
+		return;
+	}
+
+	addrconf_add_mroute(dev);
+	addrconf_add_lroute(dev);
+}
+#endif
+
+/*
+ * Copy a link-local address from @link_dev onto @idev.
+ * Returns 0 if an address was obtained from ipv6_get_lladdr() and handed
+ * to addrconf_add_linklocal(), -1 if @link_dev has none to offer.
+ */
+static inline int
+ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
+{
+	struct in6_addr ll;
+
+	if (ipv6_get_lladdr(link_dev, &ll, IFA_F_TENTATIVE))
+		return -1;
+
+	addrconf_add_linklocal(idev, &ll);
+	return 0;
+}
+
+/*
+ * Give an ip6 tunnel a link-local address by borrowing one: first from
+ * the device the tunnel is linked to (iflink), then from the first device
+ * in the namespace that has one.  Logs a debug message if none is found.
+ */
+static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
+{
+	struct net *net = dev_net(idev->dev);
+	struct net_device *candidate;
+
+	/* first try to inherit the link-local address from the link device */
+	if (idev->dev->iflink) {
+		candidate = __dev_get_by_index(net, idev->dev->iflink);
+		if (candidate && ipv6_inherit_linklocal(idev, candidate) == 0)
+			return;
+	}
+
+	/* then try to inherit it from any device */
+	for_each_netdev(net, candidate) {
+		if (ipv6_inherit_linklocal(idev, candidate) == 0)
+			return;
+	}
+
+	printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n");
+}
+
+/*
+ * Autoconfigure tunnel with a link-local address so routing protocols,
+ * DHCPv6, MLD etc. can be run over the virtual link
+ *
+ * Requires the RTNL lock.  Logs a debug message and gives up if no
+ * inet6_dev can be obtained for @dev.
+ */
+
+static void addrconf_ip6_tnl_config(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = addrconf_add_dev(dev);
+	if (!IS_ERR(idev)) {
+		ip6_tnl_add_linklocal(idev);
+		return;
+	}
+
+	printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n");
+}
+
+/*
+ * Netdevice notifier callback (installed through ipv6_dev_notf).
+ *
+ * @data is the struct net_device the event refers to.  Summary:
+ *   NETDEV_REGISTER         create the inet6_dev if the MTU allows IPv6
+ *   NETDEV_UP / _CHANGE     once the link is ready, mark the inet6_dev
+ *                           IF_READY, run the per-type address config,
+ *                           kick pending DAD and resync the MTU
+ *   NETDEV_CHANGEMTU        propagate the new MTU, or fall through to the
+ *                           shutdown path below
+ *   NETDEV_DOWN / _UNREGISTER  remove all addresses via addrconf_ifdown()
+ *   NETDEV_CHANGENAME       re-register the snmp6 and sysctl entries
+ *   NETDEV_{PRE,POST}_TYPE_CHANGE  delegate to addrconf_type_change()
+ *
+ * Returns NOTIFY_OK, or notifier_from_errno() when creating the inet6_dev
+ * on registration or re-registering snmp6 on rename fails.
+ */
+static int addrconf_notify(struct notifier_block *this, unsigned long event,
+			   void * data)
+{
+	struct net_device *dev = (struct net_device *) data;
+	/* may be NULL: no IPv6 state has been attached to this device */
+	struct inet6_dev *idev = __in6_dev_get(dev);
+	int run_pending = 0;
+	int err;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		if (!idev && dev->mtu >= IPV6_MIN_MTU) {
+			idev = ipv6_add_dev(dev);
+			if (!idev)
+				return notifier_from_errno(-ENOMEM);
+		}
+		break;
+
+	case NETDEV_UP:
+	case NETDEV_CHANGE:
+		if (dev->flags & IFF_SLAVE)
+			break;
+
+		if (event == NETDEV_UP) {
+			if (!addrconf_qdisc_ok(dev)) {
+				/* device is not ready yet. */
+				printk(KERN_INFO
+					"ADDRCONF(NETDEV_UP): %s: "
+					"link is not ready\n",
+					dev->name);
+				break;
+			}
+
+			if (!idev && dev->mtu >= IPV6_MIN_MTU)
+				idev = ipv6_add_dev(dev);
+
+			if (idev) {
+				idev->if_flags |= IF_READY;
+				run_pending = 1;
+			}
+		} else {
+			if (!addrconf_qdisc_ok(dev)) {
+				/* device is still not ready. */
+				break;
+			}
+
+			if (idev) {
+				if (idev->if_flags & IF_READY)
+					/* device is already configured. */
+					break;
+				idev->if_flags |= IF_READY;
+			}
+
+			printk(KERN_INFO
+					"ADDRCONF(NETDEV_CHANGE): %s: "
+					"link becomes ready\n",
+					dev->name);
+
+			run_pending = 1;
+		}
+
+		/* per device type address configuration */
+		switch (dev->type) {
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+		case ARPHRD_SIT:
+			addrconf_sit_config(dev);
+			break;
+#endif
+		case ARPHRD_TUNNEL6:
+			addrconf_ip6_tnl_config(dev);
+			break;
+		case ARPHRD_LOOPBACK:
+			init_loopback(dev);
+			break;
+
+		default:
+			addrconf_dev_config(dev);
+			break;
+		}
+
+		if (idev) {
+			if (run_pending)
+				addrconf_dad_run(idev);
+
+			/*
+			 * If the MTU changed during the interface down,
+			 * when the interface up, the changed MTU must be
+			 * reflected in the idev as well as routers.
+			 */
+			if (idev->cnf.mtu6 != dev->mtu &&
+			    dev->mtu >= IPV6_MIN_MTU) {
+				rt6_mtu_change(dev, dev->mtu);
+				idev->cnf.mtu6 = dev->mtu;
+			}
+			idev->tstamp = jiffies;
+			inet6_ifinfo_notify(RTM_NEWLINK, idev);
+
+			/*
+			 * If the changed mtu during down is lower than
+			 * IPV6_MIN_MTU stop IPv6 on this interface.
+			 */
+			if (dev->mtu < IPV6_MIN_MTU)
+				addrconf_ifdown(dev, 1);
+		}
+		break;
+
+	case NETDEV_CHANGEMTU:
+		if (idev && dev->mtu >= IPV6_MIN_MTU) {
+			rt6_mtu_change(dev, dev->mtu);
+			idev->cnf.mtu6 = dev->mtu;
+			break;
+		}
+
+		if (!idev && dev->mtu >= IPV6_MIN_MTU) {
+			idev = ipv6_add_dev(dev);
+			if (idev)
+				break;
+		}
+
+		/*
+		 * MTU fell under IPV6_MIN_MTU (or ipv6_add_dev() failed).
+		 * Stop IPv6 on this interface.
+		 *
+		 * Deliberate fall through into the NETDEV_DOWN /
+		 * NETDEV_UNREGISTER handling; since event is
+		 * NETDEV_CHANGEMTU here, addrconf_ifdown() is called
+		 * with how != 0.
+		 */
+
+	case NETDEV_DOWN:
+	case NETDEV_UNREGISTER:
+		/*
+		 *	Remove all addresses from this interface.
+		 */
+		addrconf_ifdown(dev, event != NETDEV_DOWN);
+		break;
+
+	case NETDEV_CHANGENAME:
+		if (idev) {
+			/* entries are keyed by device name: drop and re-add */
+			snmp6_unregister_dev(idev);
+			addrconf_sysctl_unregister(idev);
+			addrconf_sysctl_register(idev);
+			err = snmp6_register_dev(idev);
+			if (err)
+				return notifier_from_errno(err);
+		}
+		break;
+
+	case NETDEV_PRE_TYPE_CHANGE:
+	case NETDEV_POST_TYPE_CHANGE:
+		/* NOTE(review): called even when idev is NULL */
+		addrconf_type_change(dev, event);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ *	addrconf module should be notified of a device going up
+ *
+ *	Notifier block that routes netdevice events to addrconf_notify().
+ */
+static struct notifier_block ipv6_dev_notf = {
+	.notifier_call = addrconf_notify,
+};
+
+/*
+ * React to a change of the device's hardware type.
+ *
+ * @dev:   device whose type is changing
+ * @event: NETDEV_PRE_TYPE_CHANGE unmaps the IPv6 multicast state of the
+ *         device, NETDEV_POST_TYPE_CHANGE maps it again; any other value
+ *         is ignored.
+ *
+ * Must be called with the RTNL lock held.  Devices without an inet6_dev
+ * are skipped.
+ */
+static void addrconf_type_change(struct net_device *dev, unsigned long event)
+{
+	struct inet6_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = __in6_dev_get(dev);
+	/*
+	 * The notifier calls us for every device, but an inet6_dev is only
+	 * created when the MTU is at least IPV6_MIN_MTU, so there may be
+	 * nothing to remap; do not hand a NULL idev to the multicast code.
+	 */
+	if (!idev)
+		return;
+
+	if (event == NETDEV_POST_TYPE_CHANGE)
+		ipv6_mc_remap(idev);
+	else if (event == NETDEV_PRE_TYPE_CHANGE)
+		ipv6_mc_unmap(idev);
+}
+
+/*
+ * Take IPv6 down on @dev.
+ *
+ * @how: 0     the interface merely went down: addresses are removed and
+ *             the autoconf flags cleared, but the inet6_dev stays
+ *             attached to the device;
+ *       != 0  IPv6 is being removed from the device altogether: the
+ *             inet6_dev is marked dead, detached from the device and its
+ *             snmp6, sysctl, multicast and neighbour state is released.
+ *
+ * Routes and neighbour entries for the device are flushed in either
+ * case, even when the device has no inet6_dev.  Must be called with the
+ * RTNL lock held.
+ *
+ * Returns 0, or -ENODEV if the device has no inet6_dev.
+ */
+static int addrconf_ifdown(struct net_device *dev, int how)
+{
+	struct net *net = dev_net(dev);
+	struct inet6_dev *idev;
+	struct inet6_ifaddr *ifa;
+	int state, i;
+
+	ASSERT_RTNL();
+
+	rt6_ifdown(net, dev);
+	neigh_ifdown(&nd_tbl, dev);
+
+	idev = __in6_dev_get(dev);
+	if (idev == NULL)
+		return -ENODEV;
+
+	/*
+	 * Step 1: remove reference to ipv6 device from parent device.
+	 *	   Do not dev_put!
+	 */
+	if (how) {
+		idev->dead = 1;
+
+		/* protected by rtnl_lock */
+		rcu_assign_pointer(dev->ip6_ptr, NULL);
+
+		/* Step 1.5: remove snmp6 entry */
+		snmp6_unregister_dev(idev);
+
+	}
+
+	/* Step 2: clear hash table */
+	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+		struct hlist_head *h = &inet6_addr_lst[i];
+		struct hlist_node *n;
+
+		spin_lock_bh(&addrconf_hash_lock);
+	restart:
+		/* rescan the bucket from its head after every removal */
+		hlist_for_each_entry_rcu(ifa, n, h, addr_lst) {
+			if (ifa->idev == idev) {
+				hlist_del_init_rcu(&ifa->addr_lst);
+				addrconf_del_timer(ifa);
+				goto restart;
+			}
+		}
+		spin_unlock_bh(&addrconf_hash_lock);
+	}
+
+	write_lock_bh(&idev->lock);
+
+	/* Step 2.5: clear flags for stateless addrconf */
+	if (!how)
+		idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
+
+#ifdef CONFIG_IPV6_PRIVACY
+	/* a pending regen timer held a reference on idev: drop it */
+	if (how && del_timer(&idev->regen_timer))
+		in6_dev_put(idev);
+
+	/* Step 3: clear tempaddr list */
+	while (!list_empty(&idev->tempaddr_list)) {
+		ifa = list_first_entry(&idev->tempaddr_list,
+				       struct inet6_ifaddr, tmp_list);
+		list_del(&ifa->tmp_list);
+		/* idev->lock is released while the entry itself is handled */
+		write_unlock_bh(&idev->lock);
+		spin_lock_bh(&ifa->lock);
+
+		if (ifa->ifpub) {
+			in6_ifa_put(ifa->ifpub);
+			ifa->ifpub = NULL;
+		}
+		spin_unlock_bh(&ifa->lock);
+		in6_ifa_put(ifa);
+		write_lock_bh(&idev->lock);
+	}
+#endif
+
+	/* Step 4: clear the address list, announcing each removal once */
+	while (!list_empty(&idev->addr_list)) {
+		ifa = list_first_entry(&idev->addr_list,
+				       struct inet6_ifaddr, if_list);
+		addrconf_del_timer(ifa);
+
+		list_del(&ifa->if_list);
+
+		write_unlock_bh(&idev->lock);
+
+		spin_lock_bh(&ifa->state_lock);
+		state = ifa->state;
+		ifa->state = INET6_IFADDR_STATE_DEAD;
+		spin_unlock_bh(&ifa->state_lock);
+
+		/* notify only if the address was not already dead */
+		if (state != INET6_IFADDR_STATE_DEAD) {
+			__ipv6_ifa_notify(RTM_DELADDR, ifa);
+			atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
+		}
+		in6_ifa_put(ifa);
+
+		write_lock_bh(&idev->lock);
+	}
+
+	write_unlock_bh(&idev->lock);
+
+	/* Step 5: Discard multicast list */
+	if (how)
+		ipv6_mc_destroy_dev(idev);
+	else
+		ipv6_mc_down(idev);
+
+	idev->tstamp = jiffies;
+
+	/* Last: Shot the device (if unregistered) */
+	if (how) {
+		addrconf_sysctl_unregister(idev);
+		neigh_parms_release(&nd_tbl, idev->nd_parms);
+		neigh_ifdown(&nd_tbl, dev);
+		in6_dev_put(idev);
+	}
+	return 0;
+}
+
+/*
+ * Router solicitation timer callback.
+ *
+ * @data is the struct inet6_ifaddr the solicitations are sent from.
+ * Nothing is sent when the device is dead or not ready, when forwarding
+ * is enabled on it, or once a router advertisement has been received.
+ * Otherwise another solicitation goes out and the timer is re-armed,
+ * until cnf.rtr_solicits probes have been used up, at which point a
+ * debug message is logged instead.
+ *
+ * One reference on the address is dropped on every exit path.
+ * NOTE(review): presumably the reference taken when the timer was armed
+ * -- confirm against addrconf_mod_timer().
+ */
+static void addrconf_rs_timer(unsigned long data)
+{
+	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+	struct inet6_dev *idev = ifp->idev;
+
+	read_lock(&idev->lock);
+	if (idev->dead || !(idev->if_flags & IF_READY))
+		goto out;
+
+	if (idev->cnf.forwarding)
+		goto out;
+
+	/* Announcement received after solicitation was sent */
+	if (idev->if_flags & IF_RA_RCVD)
+		goto out;
+
+	spin_lock(&ifp->lock);
+	/* probes is incremented whether or not another probe is sent */
+	if (ifp->probes++ < idev->cnf.rtr_solicits) {
+		/* The wait after the last probe can be shorter */
+		addrconf_mod_timer(ifp, AC_RS,
+				   (ifp->probes == idev->cnf.rtr_solicits) ?
+				   idev->cnf.rtr_solicit_delay :
+				   idev->cnf.rtr_solicit_interval);
+		spin_unlock(&ifp->lock);
+
+		ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters);
+	} else {
+		spin_unlock(&ifp->lock);
+		/*
+		 * Note: we do not support deprecated "all on-link"
+		 * assumption any longer.
+		 */
+		printk(KERN_DEBUG "%s: no IPv6 routers present\n",
+		       idev->dev->name);
+	}
+
+out:
+	read_unlock(&idev->lock);
+	in6_ifa_put(ifp);
+}
+
+/*
+ *	Duplicate Address Detection
+ */
+/*
+ * (Re)start the DAD probe sequence for @ifp: reload the probe counter
+ * from cnf.dad_transmits and arm the DAD timer.  Optimistic addresses
+ * start immediately; all others wait a random delay below
+ * cnf.rtr_solicit_delay (a zero setting is treated as 1).
+ */
+static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
+{
+	struct inet6_dev *idev = ifp->idev;
+	unsigned long delay = 0;
+
+	if (!(ifp->flags & IFA_F_OPTIMISTIC))
+		delay = net_random() % (idev->cnf.rtr_solicit_delay ? : 1);
+
+	ifp->probes = idev->cnf.dad_transmits;
+	addrconf_mod_timer(ifp, AC_DAD, delay);
+}
+
+/*
+ * Begin duplicate address detection for @ifp.
+ *
+ * Joins the solicited-node multicast group of the address and then
+ * takes one of four paths:
+ *  - the address is already dead: nothing more is done;
+ *  - DAD does not apply (NOARP or loopback device, accept_dad < 1,
+ *    address not tentative, or IFA_F_NODAD): the DAD flags are cleared
+ *    and the address is completed at once;
+ *  - the device is not ready: addrconf_dad_stop() decides its fate;
+ *  - otherwise the probe sequence is kicked off, optimistic addresses
+ *    having their route inserted first.
+ *
+ * @flags is not used by this function.
+ */
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
+{
+	struct inet6_dev *idev = ifp->idev;
+	struct net_device *dev = idev->dev;
+
+	addrconf_join_solict(dev, &ifp->addr);
+
+	/* seed the random delay used by addrconf_dad_kick() */
+	net_srandom(ifp->addr.s6_addr32[3]);
+
+	read_lock_bh(&idev->lock);
+	spin_lock(&ifp->lock);
+	if (ifp->state == INET6_IFADDR_STATE_DEAD)
+		goto out;
+
+	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+	    idev->cnf.accept_dad < 1 ||
+	    !(ifp->flags&IFA_F_TENTATIVE) ||
+	    ifp->flags & IFA_F_NODAD) {
+		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
+		spin_unlock(&ifp->lock);
+		read_unlock_bh(&idev->lock);
+
+		addrconf_dad_completed(ifp);
+		return;
+	}
+
+	if (!(idev->if_flags & IF_READY)) {
+		spin_unlock(&ifp->lock);
+		read_unlock_bh(&idev->lock);
+		/*
+		 * If the device is not ready:
+		 * - keep it tentative if it is a permanent address.
+		 * - otherwise, kill it.
+		 */
+		/*
+		 * NOTE(review): the extra reference is presumably consumed
+		 * by addrconf_dad_stop() -- confirm.
+		 */
+		in6_ifa_hold(ifp);
+		addrconf_dad_stop(ifp, 0);
+		return;
+	}
+
+	/*
+	 * Optimistic nodes can start receiving
+	 * Frames right away
+	 */
+	if (ifp->flags & IFA_F_OPTIMISTIC)
+		ip6_ins_rt(ifp->rt);
+
+	addrconf_dad_kick(ifp);
+out:
+	spin_unlock(&ifp->lock);
+	read_unlock_bh(&idev->lock);
+}
+
+/*
+ * DAD timer callback.
+ *
+ * @data is the struct inet6_ifaddr under test.  While probes remain, one
+ * neighbour solicitation for the address is sent from the unspecified
+ * address to its solicited-node multicast group and the timer is
+ * re-armed with the device's retrans_time.  When the probe counter has
+ * reached zero the DAD flags are cleared and the address is completed.
+ * Nothing happens if the device is dead or not ready, or if the address
+ * is already dead.
+ *
+ * One reference on the address is dropped on every exit path.
+ */
+static void addrconf_dad_timer(unsigned long data)
+{
+	struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+	struct inet6_dev *idev = ifp->idev;
+	struct in6_addr mcaddr;
+
+	/*
+	 * NOTE(review): addrconf_dad_end() is not visible here; a non-zero
+	 * result makes us skip completion -- confirm its exact meaning.
+	 */
+	if (!ifp->probes && addrconf_dad_end(ifp))
+		goto out;
+
+	read_lock(&idev->lock);
+	if (idev->dead || !(idev->if_flags & IF_READY)) {
+		read_unlock(&idev->lock);
+		goto out;
+	}
+
+	spin_lock(&ifp->lock);
+	if (ifp->state == INET6_IFADDR_STATE_DEAD) {
+		spin_unlock(&ifp->lock);
+		read_unlock(&idev->lock);
+		goto out;
+	}
+
+	if (ifp->probes == 0) {
+		/*
+		 * DAD was successful
+		 */
+
+		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
+		spin_unlock(&ifp->lock);
+		read_unlock(&idev->lock);
+
+		addrconf_dad_completed(ifp);
+
+		goto out;
+	}
+
+	ifp->probes--;
+	addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time);
+	spin_unlock(&ifp->lock);
+	read_unlock(&idev->lock);
+
+	/* send a neighbour solicitation for our addr */
+	addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
+	ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
+out:
+	in6_ifa_put(ifp);
+}
+
+/*
+ * Called once DAD has finished (or was not needed) for @ifp.
+ *
+ * Announces the address with RTM_NEWADDR.  If it is a link-local address
+ * on a non-loopback device whose forwarding setting is 0 or 2 and whose
+ * rtr_solicits is positive, the first router solicitation is sent and
+ * the RS timer armed.
+ */
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+{
+	struct inet6_dev *idev = ifp->idev;
+	struct net_device *dev = idev->dev;
+
+	/*
+	 *	Configure the address for reception. Now it is valid.
+	 */
+	ipv6_ifa_notify(RTM_NEWADDR, ifp);
+
+	/* If added prefix is link local and forwarding is off,
+	   start sending router solicitations.
+	 */
+	if (idev->cnf.forwarding != 0 && idev->cnf.forwarding != 2)
+		return;
+	if (idev->cnf.rtr_solicits <= 0)
+		return;
+	if ((dev->flags & IFF_LOOPBACK) != 0)
+		return;
+	if (!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL))
+		return;
+
+	/*
+	 *	If a host as already performed a random delay
+	 *	[...] as part of DAD [...] there is no need
+	 *	to delay again before sending the first RS
+	 */
+	ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters);
+
+	spin_lock_bh(&ifp->lock);
+	ifp->probes = 1;
+	idev->if_flags |= IF_RS_SENT;
+	addrconf_mod_timer(ifp, AC_RS, idev->cnf.rtr_solicit_interval);
+	spin_unlock_bh(&ifp->lock);
+}
+
+/*
+ * Restart DAD for every address on @idev that is still tentative and in
+ * the INET6_IFADDR_STATE_DAD state.
+ */
+static void addrconf_dad_run(struct inet6_dev *idev)
+{
+	struct inet6_ifaddr *ifa;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		int pending;
+
+		spin_lock(&ifa->lock);
+		pending = (ifa->flags & IFA_F_TENTATIVE) &&
+			  ifa->state == INET6_IFADDR_STATE_DAD;
+		if (pending)
+			addrconf_dad_kick(ifa);
+		spin_unlock(&ifa->lock);
+	}
+	read_unlock_bh(&idev->lock);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Per-open iterator state of the if_inet6 seq_file. */
+struct if6_iter_state {
+	struct seq_net_private p;
+	int bucket;	/* index into inet6_addr_lst[] currently being walked */
+};
+
+/*
+ * Find the first address in the global hash table that belongs to the
+ * seq_file's network namespace, leaving state->bucket at the bucket it
+ * was found in.  Returns NULL (with bucket == IN6_ADDR_HSIZE) if there
+ * is none.
+ */
+static struct inet6_ifaddr *if6_get_first(struct seq_file *seq)
+{
+	struct if6_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct inet6_ifaddr *ifa;
+	struct hlist_node *pos;
+
+	state->bucket = 0;
+	while (state->bucket < IN6_ADDR_HSIZE) {
+		hlist_for_each_entry_rcu_bh(ifa, pos,
+					 &inet6_addr_lst[state->bucket],
+					 addr_lst) {
+			if (net_eq(dev_net(ifa->idev->dev), net))
+				return ifa;
+		}
+		++state->bucket;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the address following @ifa that belongs to the seq_file's network
+ * namespace: first in the remainder of the current bucket, then in the
+ * following buckets.  Returns NULL when the table is exhausted.
+ */
+static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
+					 struct inet6_ifaddr *ifa)
+{
+	struct if6_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct hlist_node *n = &ifa->addr_lst;
+
+	/* rest of the bucket @ifa lives in */
+	hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) {
+		if (net_eq(dev_net(ifa->idev->dev), net))
+			return ifa;
+	}
+
+	/* then every later bucket, from its head */
+	for (++state->bucket; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
+		hlist_for_each_entry_rcu_bh(ifa, n,
+				     &inet6_addr_lst[state->bucket], addr_lst) {
+			if (net_eq(dev_net(ifa->idev->dev), net))
+				return ifa;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the address at position @pos (0-based) of the namespace's
+ * address sequence, or NULL if the sequence is shorter than that.
+ */
+static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct inet6_ifaddr *ifa = if6_get_first(seq);
+
+	while (ifa != NULL && pos != 0) {
+		ifa = if6_get_next(seq, ifa);
+		if (ifa != NULL)
+			--pos;
+	}
+
+	return pos != 0 ? NULL : ifa;
+}
+
+/*
+ * seq_file ->start: enter the RCU-bh read-side section (left again in
+ * if6_seq_stop()) and position the iterator at entry *pos.
+ */
+static void *if6_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu_bh)
+{
+	rcu_read_lock_bh();
+	return if6_get_idx(seq, *pos);
+}
+
+/* seq_file ->next: advance to the entry after @v and bump *pos. */
+static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct inet6_ifaddr *following = if6_get_next(seq, v);
+
+	(*pos)++;
+	return following;
+}
+
+/* seq_file ->stop: leave the RCU-bh section entered in if6_seq_start(). */
+static void if6_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu_bh)
+{
+	rcu_read_unlock_bh();
+}
+
+/*
+ * seq_file ->show: print one line for the address @v -- the address,
+ * then ifindex, prefix length, scope and flags in hex, then the device
+ * name.  Always returns 0.
+ */
+static int if6_seq_show(struct seq_file *seq, void *v)
+{
+	struct inet6_ifaddr *ifa = v;
+	struct net_device *dev = ifa->idev->dev;
+
+	seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n",
+		   &ifa->addr, dev->ifindex, ifa->prefix_len,
+		   ifa->scope, ifa->flags, dev->name);
+	return 0;
+}
+
+/* seq_file iterator over the addresses of one network namespace. */
+static const struct seq_operations if6_seq_ops = {
+	.start	= if6_seq_start,
+	.next	= if6_seq_next,
+	.show	= if6_seq_show,
+	.stop	= if6_seq_stop,
+};
+
+/*
+ * ->open for the if_inet6 proc file: set up a per-namespace seq_file
+ * with a struct if6_iter_state as private data.
+ */
+static int if6_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &if6_seq_ops,
+			    sizeof(struct if6_iter_state));
+}
+
+/* File operations of the if_inet6 proc file (read-only seq_file). */
+static const struct file_operations if6_fops = {
+	.owner		= THIS_MODULE,
+	.open		= if6_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+/*
+ * Per-namespace init: create the world-readable "if_inet6" proc entry.
+ * Returns 0 on success or -ENOMEM if the entry cannot be created.
+ */
+static int __net_init if6_proc_net_init(struct net *net)
+{
+	if (proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops))
+		return 0;
+
+	return -ENOMEM;
+}
+
+/* Per-namespace exit: remove the "if_inet6" proc entry again. */
+static void __net_exit if6_proc_net_exit(struct net *net)
+{
+       proc_net_remove(net, "if_inet6");
+}
+
+/* Creates/removes the if_inet6 proc entry for each network namespace. */
+static struct pernet_operations if6_proc_net_ops = {
+       .init = if6_proc_net_init,
+       .exit = if6_proc_net_exit,
+};
+
+/*
+ * Register the per-namespace if_inet6 proc handling.
+ * Returns the result of register_pernet_subsys().
+ */
+int __init if6_proc_init(void)
+{
+	return register_pernet_subsys(&if6_proc_net_ops);
+}
+
+/* Undo if6_proc_init(). */
+void if6_proc_exit(void)
+{
+	unregister_pernet_subsys(&if6_proc_net_ops);
+}
+#endif	/* CONFIG_PROC_FS */
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+/*
+ * Check if address is a home address configured on any interface.
+ *
+ * Returns 1 if @addr is configured in @net with IFA_F_HOMEADDRESS set,
+ * 0 otherwise.
+ */
+int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
+{
+	unsigned int bucket = ipv6_addr_hash(addr);
+	struct inet6_ifaddr *ifa = NULL;
+	struct hlist_node *pos;
+	int found = 0;
+
+	rcu_read_lock_bh();
+	hlist_for_each_entry_rcu_bh(ifa, pos, &inet6_addr_lst[bucket], addr_lst) {
+		if (net_eq(dev_net(ifa->idev->dev), net) &&
+		    ipv6_addr_equal(&ifa->addr, addr) &&
+		    (ifa->flags & IFA_F_HOMEADDRESS)) {
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock_bh();
+
+	return found;
+}
+#endif
+
+/*
+ *	Periodic address status verification
+ *
+ *	Walks every address in the global hash table and, for those that
+ *	are not permanent:
+ *	  - deletes addresses whose valid lifetime has run out;
+ *	  - marks addresses whose preferred lifetime has run out as
+ *	    deprecated and announces the change;
+ *	  - (privacy extensions) creates a replacement for a temporary
+ *	    address that is about to lose its preferred status.
+ *	While doing so it tracks the earliest time at which one of these
+ *	events is due and re-arms addr_chk_timer for that time, subject to
+ *	the rounding and minimum-interval rules below.
+ *
+ *	Used both as the timer callback and called directly after an
+ *	address or lifetime changed; @foo is unused.
+ *
+ *	Whenever the per-address lock had to be dropped to call out, the
+ *	scan of the current hash bucket is restarted from its head.
+ */
+
+static void addrconf_verify(unsigned long foo)
+{
+	unsigned long now, next, next_sec, next_sched;
+	struct inet6_ifaddr *ifp;
+	struct hlist_node *node;
+	int i;
+
+	rcu_read_lock_bh();
+	spin_lock(&addrconf_verify_lock);
+	now = jiffies;
+	/* latest acceptable time for the next run */
+	next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
+
+	del_timer(&addr_chk_timer);
+
+	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
+restart:
+		hlist_for_each_entry_rcu_bh(ifp, node,
+					 &inet6_addr_lst[i], addr_lst) {
+			unsigned long age;
+
+			if (ifp->flags & IFA_F_PERMANENT)
+				continue;
+
+			spin_lock(&ifp->lock);
+			/* We try to batch several events at once. */
+			age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+
+			if (ifp->valid_lft != INFINITY_LIFE_TIME &&
+			    age >= ifp->valid_lft) {
+				/* valid lifetime expired: remove the address */
+				spin_unlock(&ifp->lock);
+				in6_ifa_hold(ifp);
+				ipv6_del_addr(ifp);
+				goto restart;
+			} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
+				/* never deprecated: nothing to schedule */
+				spin_unlock(&ifp->lock);
+				continue;
+			} else if (age >= ifp->prefered_lft) {
+				/* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
+				int deprecate = 0;
+
+				if (!(ifp->flags&IFA_F_DEPRECATED)) {
+					deprecate = 1;
+					ifp->flags |= IFA_F_DEPRECATED;
+				}
+
+				/* next event for this address: end of valid lifetime */
+				if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))
+					next = ifp->tstamp + ifp->valid_lft * HZ;
+
+				spin_unlock(&ifp->lock);
+
+				if (deprecate) {
+					/* announce only the transition to deprecated */
+					in6_ifa_hold(ifp);
+
+					ipv6_ifa_notify(0, ifp);
+					in6_ifa_put(ifp);
+					goto restart;
+				}
+#ifdef CONFIG_IPV6_PRIVACY
+			} else if ((ifp->flags&IFA_F_TEMPORARY) &&
+				   !(ifp->flags&IFA_F_TENTATIVE)) {
+				/*
+				 * Seconds before the end of the preferred
+				 * lifetime at which a successor is generated.
+				 */
+				unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
+					ifp->idev->cnf.dad_transmits *
+					ifp->idev->nd_parms->retrans_time / HZ;
+
+				if (age >= ifp->prefered_lft - regen_advance) {
+					struct inet6_ifaddr *ifpub = ifp->ifpub;
+					if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+						next = ifp->tstamp + ifp->prefered_lft * HZ;
+					/* regenerate at most once per temporary address */
+					if (!ifp->regen_count && ifpub) {
+						ifp->regen_count++;
+						in6_ifa_hold(ifp);
+						in6_ifa_hold(ifpub);
+						spin_unlock(&ifp->lock);
+
+						spin_lock(&ifpub->lock);
+						ifpub->regen_count = 0;
+						spin_unlock(&ifpub->lock);
+						ipv6_create_tempaddr(ifpub, ifp);
+						in6_ifa_put(ifpub);
+						in6_ifa_put(ifp);
+						goto restart;
+					}
+				} else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+					next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+				spin_unlock(&ifp->lock);
+#endif
+			} else {
+				/* ifp->prefered_lft <= ifp->valid_lft */
+				if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+					next = ifp->tstamp + ifp->prefered_lft * HZ;
+				spin_unlock(&ifp->lock);
+			}
+		}
+	}
+
+	next_sec = round_jiffies_up(next);
+	next_sched = next;
+
+	/* If rounded timeout is accurate enough, accept it. */
+	if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
+		next_sched = next_sec;
+
+	/* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
+	if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
+		next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
+
+	ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
+	      now, next, next_sec, next_sched));
+
+	addr_chk_timer.expires = next_sched;
+	add_timer(&addr_chk_timer);
+	spin_unlock(&addrconf_verify_lock);
+	rcu_read_unlock_bh();
+}
+
+/*
+ * Pick the address a netlink address request refers to.
+ *
+ * @addr:  the IFA_ADDRESS attribute, or NULL
+ * @local: the IFA_LOCAL attribute, or NULL
+ *
+ * With only one attribute present its payload is returned.  With both
+ * present they must carry the same address, in which case the payload of
+ * @local is returned.  Returns NULL if neither attribute is present or
+ * if the two disagree.
+ */
+static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local)
+{
+	struct in6_addr *pfx = addr ? nla_data(addr) : NULL;
+
+	if (!local)
+		return pfx;
+
+	if (pfx && nla_memcmp(local, pfx, sizeof(*pfx)))
+		return NULL;
+
+	return nla_data(local);
+}
+
+/*
+ * Netlink attribute policy for IPv6 address messages: gives the payload
+ * length checked for each attribute that is parsed here.
+ */
+static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
+	[IFA_ADDRESS]		= { .len = sizeof(struct in6_addr) },
+	[IFA_LOCAL]		= { .len = sizeof(struct in6_addr) },
+	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
+};
+
+/*
+ * Netlink handler for address deletion requests.
+ *
+ * Parses the message against ifa_ipv6_policy, determines the address
+ * from IFA_ADDRESS/IFA_LOCAL and removes it from the interface given in
+ * the ifaddrmsg header.
+ *
+ * Returns the nlmsg_parse() error, -EINVAL if no usable address is
+ * supplied, or the result of inet6_addr_del().
+ */
+static int
+inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct nlattr *attrs[IFA_MAX+1];
+	struct ifaddrmsg *hdr;
+	struct in6_addr *target;
+	int rc;
+
+	rc = nlmsg_parse(nlh, sizeof(*hdr), attrs, IFA_MAX, ifa_ipv6_policy);
+	if (rc < 0)
+		return rc;
+
+	hdr = nlmsg_data(nlh);
+
+	target = extract_addr(attrs[IFA_ADDRESS], attrs[IFA_LOCAL]);
+	if (!target)
+		return -EINVAL;
+
+	return inet6_addr_del(sock_net(skb->sk), hdr->ifa_index, target,
+			      hdr->ifa_prefixlen);
+}
+
+/*
+ * Update the flags and lifetimes of an existing address.
+ *
+ * @ifp:          address to modify
+ * @ifa_flags:    new IFA_F_* flags requested by the caller
+ * @prefered_lft: new preferred lifetime
+ * @valid_lft:    new valid lifetime
+ *
+ * Returns -EINVAL when @valid_lft is zero or @prefered_lft exceeds
+ * @valid_lft, 0 otherwise.
+ */
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
+			     u32 prefered_lft, u32 valid_lft)
+{
+	u32 flags;
+	clock_t expires;
+	unsigned long timeout;
+
+	if (!valid_lft || (prefered_lft > valid_lft))
+		return -EINVAL;
+
+	/*
+	 * A finite valid lifetime makes the prefix route expire; otherwise
+	 * the address is marked permanent and the route gets no expiry.
+	 */
+	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		expires = jiffies_to_clock_t(timeout * HZ);
+		valid_lft = timeout;
+		flags = RTF_EXPIRES;
+	} else {
+		expires = 0;
+		flags = 0;
+		ifa_flags |= IFA_F_PERMANENT;
+	}
+
+	/* A preferred lifetime of zero deprecates the address right away. */
+	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		if (timeout == 0)
+			ifa_flags |= IFA_F_DEPRECATED;
+		prefered_lft = timeout;
+	}
+
+	/*
+	 * Replace the four caller-controlled flag bits, restart the lifetime
+	 * clock and store the new lifetimes under the address lock.
+	 */
+	spin_lock_bh(&ifp->lock);
+	ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags;
+	ifp->tstamp = jiffies;
+	ifp->valid_lft = valid_lft;
+	ifp->prefered_lft = prefered_lft;
+
+	spin_unlock_bh(&ifp->lock);
+	/* Tentative addresses are not announced here. */
+	if (!(ifp->flags&IFA_F_TENTATIVE))
+		ipv6_ifa_notify(0, ifp);
+
+	/* Refresh the prefix route with the new expiry, then re-run verify. */
+	addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
+			      expires, flags);
+	addrconf_verify(0);
+
+	return 0;
+}
+
+/*
+ * RTM_NEWADDR handler: add a new address, or modify an existing one when
+ * the request carries NLM_F_REPLACE without NLM_F_EXCL.
+ *
+ * Returns the nlmsg_parse() error, -EINVAL when no usable address
+ * attribute is present, -ENODEV for an unknown interface index, -EEXIST
+ * when the address exists and replacement was not requested, or the
+ * result of inet6_addr_add() / inet6_addr_modify().
+ */
+static int
+inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	u32 valid_lft = INFINITY_LIFE_TIME;
+	u32 preferred_lft = INFINITY_LIFE_TIME;
+	struct nlattr *tb[IFA_MAX+1];
+	struct inet6_ifaddr *ifa;
+	struct net_device *dev;
+	struct ifaddrmsg *ifm;
+	struct in6_addr *pfx;
+	u8 ifa_flags;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
+	if (!pfx)
+		return -EINVAL;
+
+	/* Both lifetimes stay infinite unless IFA_CACHEINFO overrides them. */
+	if (tb[IFA_CACHEINFO]) {
+		struct ifa_cacheinfo *ci = nla_data(tb[IFA_CACHEINFO]);
+
+		valid_lft = ci->ifa_valid;
+		preferred_lft = ci->ifa_prefered;
+	}
+
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	if (!dev)
+		return -ENODEV;
+
+	/* We ignore other flags so far. */
+	ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS);
+
+	ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
+	if (!ifa) {
+		/*
+		 * It would be best to check for !NLM_F_CREATE here but
+		 * userspace already relies on not having to provide this.
+		 */
+		return inet6_addr_add(net, ifm->ifa_index, pfx,
+				      ifm->ifa_prefixlen, ifa_flags,
+				      preferred_lft, valid_lft);
+	}
+
+	/* The address exists: only an explicit, non-exclusive replace may touch it. */
+	if ((nlh->nlmsg_flags & NLM_F_REPLACE) &&
+	    !(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+	else
+		err = -EEXIST;
+
+	in6_ifa_put(ifa);
+
+	return err;
+}
+
+/* Fill in the ifaddrmsg header carried by @nlh for an AF_INET6 record. */
+static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags,
+			  u8 scope, int ifindex)
+{
+	struct ifaddrmsg *hdr = nlmsg_data(nlh);
+
+	hdr->ifa_index = ifindex;
+	hdr->ifa_scope = scope;
+	hdr->ifa_flags = flags;
+	hdr->ifa_prefixlen = prefixlen;
+	hdr->ifa_family = AF_INET6;
+}
+
+/*
+ * Append an IFA_CACHEINFO attribute to @skb.  Both timestamps are passed
+ * through cstamp_delta() first; the lifetimes are stored as given.
+ * Returns the result of nla_put().
+ */
+static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
+			 unsigned long tstamp, u32 preferred, u32 valid)
+{
+	struct ifa_cacheinfo info;
+
+	info.ifa_valid = valid;
+	info.ifa_prefered = preferred;
+	info.cstamp = cstamp_delta(cstamp);
+	info.tstamp = cstamp_delta(tstamp);
+
+	return nla_put(skb, IFA_CACHEINFO, sizeof(info), &info);
+}
+
+/*
+ * Translate an IFA_* address scope bitmask into an RT_SCOPE_* value.
+ * The bits are tested in the order host, link, site; anything else is
+ * reported as universe scope.
+ */
+static inline int rt_scope(int ifa_scope)
+{
+	if (ifa_scope & IFA_HOST)
+		return RT_SCOPE_HOST;
+	if (ifa_scope & IFA_LINK)
+		return RT_SCOPE_LINK;
+	if (ifa_scope & IFA_SITE)
+		return RT_SCOPE_SITE;
+
+	return RT_SCOPE_UNIVERSE;
+}
+
+/*
+ * Buffer size needed for one address message: the ifaddrmsg header plus
+ * a 16-byte address attribute and an ifa_cacheinfo attribute.
+ */
+static inline int inet6_ifaddr_msgsize(void)
+{
+	int size = NLMSG_ALIGN(sizeof(struct ifaddrmsg));
+
+	size += nla_total_size(16);	/* IFA_ADDRESS */
+	size += nla_total_size(sizeof(struct ifa_cacheinfo));
+
+	return size;
+}
+
+/*
+ * Build one netlink message describing the unicast address @ifa.
+ *
+ * Addresses flagged IFA_F_PERMANENT report infinite lifetimes.  For the
+ * others the stored lifetimes are reduced by the seconds elapsed since
+ * ifa->tstamp (clamped at zero); the valid lifetime is only adjusted when
+ * the preferred lifetime is finite as well.
+ *
+ * Returns the result of nlmsg_end() on success, -EMSGSIZE if the message
+ * does not fit in @skb.
+ */
+static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
+			     u32 pid, u32 seq, int event, unsigned int flags)
+{
+	u32 preferred = INFINITY_LIFE_TIME;
+	u32 valid = INFINITY_LIFE_TIME;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
+		      ifa->idev->dev->ifindex);
+
+	if (!(ifa->flags & IFA_F_PERMANENT)) {
+		preferred = ifa->prefered_lft;
+		valid = ifa->valid_lft;
+	}
+
+	/* Permanent addresses never get here: preferred is still infinite. */
+	if (preferred != INFINITY_LIFE_TIME) {
+		long elapsed = (jiffies - ifa->tstamp)/HZ;
+
+		if (preferred > elapsed)
+			preferred -= elapsed;
+		else
+			preferred = 0;
+
+		if (valid != INFINITY_LIFE_TIME) {
+			if (valid > elapsed)
+				valid -= elapsed;
+			else
+				valid = 0;
+		}
+	}
+
+	if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0)
+		goto cancel;
+	if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
+		goto cancel;
+
+	return nlmsg_end(skb, nlh);
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Build one netlink message describing the multicast address @ifmca.
+ * It is reported as a permanent /128 with infinite lifetimes; the scope
+ * is site when the address scope has IFA_SITE set, universe otherwise.
+ *
+ * Returns the result of nlmsg_end() on success, -EMSGSIZE if the message
+ * does not fit in @skb.
+ */
+static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
+				u32 pid, u32 seq, int event, u16 flags)
+{
+	int ifindex = ifmca->idev->dev->ifindex;
+	struct nlmsghdr *nlh;
+	u8 scope;
+
+	if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
+		scope = RT_SCOPE_SITE;
+	else
+		scope = RT_SCOPE_UNIVERSE;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
+
+	if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0)
+		goto cancel;
+	if (put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
+			  INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
+		goto cancel;
+
+	return nlmsg_end(skb, nlh);
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Build one netlink message describing the anycast address @ifaca.
+ * It is reported as a permanent /128 with infinite lifetimes; the scope
+ * is site when the address scope has IFA_SITE set, universe otherwise.
+ *
+ * Returns the result of nlmsg_end() on success, -EMSGSIZE if the message
+ * does not fit in @skb.
+ */
+static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
+				u32 pid, u32 seq, int event, unsigned int flags)
+{
+	int ifindex = ifaca->aca_idev->dev->ifindex;
+	struct nlmsghdr *nlh;
+	u8 scope;
+
+	if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
+		scope = RT_SCOPE_SITE;
+	else
+		scope = RT_SCOPE_UNIVERSE;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
+
+	if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0)
+		goto cancel;
+	if (put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
+			  INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
+		goto cancel;
+
+	return nlmsg_end(skb, nlh);
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Selects which per-device address list in6_dump_addrs() walks. */
+enum addr_type_t {
+	UNICAST_ADDR,
+	MULTICAST_ADDR,
+	ANYCAST_ADDR,
+};
+
+/*
+ * Dump the addresses of one kind (@type) configured on @idev into @skb.
+ *
+ * @s_ip_idx: index of the first address to emit (entries before it are
+ *            skipped because an earlier dump pass already sent them)
+ * @p_ip_idx: in: starting counter value; out: counter value reached
+ *
+ * Returns the result of the last fill call, or 1 when nothing was
+ * emitted.  The loops stop as soon as a fill call returns <= 0.
+ *
+ * called with rcu_read_lock()
+ */
+static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
+			  struct netlink_callback *cb, enum addr_type_t type,
+			  int s_ip_idx, int *p_ip_idx)
+{
+	struct ifmcaddr6 *ifmca;
+	struct ifacaddr6 *ifaca;
+	int err = 1;
+	int ip_idx = *p_ip_idx;
+
+	read_lock_bh(&idev->lock);
+	switch (type) {
+	case UNICAST_ADDR: {
+		struct inet6_ifaddr *ifa;
+
+		/* unicast address incl. temp addr */
+		list_for_each_entry(ifa, &idev->addr_list, if_list) {
+			/*
+			 * Note: this loop counts with a pre-increment, unlike
+			 * the multicast/anycast loops below, which advance
+			 * ip_idx in the for-header.
+			 */
+			if (++ip_idx < s_ip_idx)
+				continue;
+			err = inet6_fill_ifaddr(skb, ifa,
+						NETLINK_CB(cb->skb).pid,
+						cb->nlh->nlmsg_seq,
+						RTM_NEWADDR,
+						NLM_F_MULTI);
+			if (err <= 0)
+				break;
+		}
+		break;
+	}
+	case MULTICAST_ADDR:
+		/* multicast address */
+		for (ifmca = idev->mc_list; ifmca;
+		     ifmca = ifmca->next, ip_idx++) {
+			if (ip_idx < s_ip_idx)
+				continue;
+			err = inet6_fill_ifmcaddr(skb, ifmca,
+						  NETLINK_CB(cb->skb).pid,
+						  cb->nlh->nlmsg_seq,
+						  RTM_GETMULTICAST,
+						  NLM_F_MULTI);
+			if (err <= 0)
+				break;
+		}
+		break;
+	case ANYCAST_ADDR:
+		/* anycast address */
+		for (ifaca = idev->ac_list; ifaca;
+		     ifaca = ifaca->aca_next, ip_idx++) {
+			if (ip_idx < s_ip_idx)
+				continue;
+			err = inet6_fill_ifacaddr(skb, ifaca,
+						  NETLINK_CB(cb->skb).pid,
+						  cb->nlh->nlmsg_seq,
+						  RTM_GETANYCAST,
+						  NLM_F_MULTI);
+			if (err <= 0)
+				break;
+		}
+		break;
+	default:
+		break;
+	}
+	read_unlock_bh(&idev->lock);
+	/* Hand the counter back so the caller can store it as resume state. */
+	*p_ip_idx = ip_idx;
+	return err;
+}
+
+/*
+ * Common dump loop behind the unicast, multicast and anycast address
+ * dumps.  Walks every device in the namespace's ifindex hash table and
+ * emits its addresses of the requested @type.
+ *
+ * Resume state is kept in cb->args: [0] hash bucket, [1] device position
+ * within that bucket, [2] address position within that device.
+ *
+ * Returns skb->len.
+ */
+static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
+			   enum addr_type_t type)
+{
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;
+	int idx, ip_idx;
+	int s_idx, s_ip_idx;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+	s_ip_idx = ip_idx = cb->args[2];
+
+	rcu_read_lock();
+	/* The device start index only applies to the first bucket visited. */
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			/* Past the resume point: start at this device's first address. */
+			if (h > s_h || idx > s_idx)
+				s_ip_idx = 0;
+			ip_idx = 0;
+			idev = __in6_dev_get(dev);
+			if (!idev)
+				goto cont;
+
+			/* <= 0 from in6_dump_addrs() ends this pass. */
+			if (in6_dump_addrs(idev, skb, cb, type,
+					   s_ip_idx, &ip_idx) <= 0)
+				goto done;
+cont:
+			idx++;
+		}
+	}
+done:
+	rcu_read_unlock();
+	cb->args[0] = h;
+	cb->args[1] = idx;
+	cb->args[2] = ip_idx;
+
+	return skb->len;
+}
+
+/* Dump callback for unicast addresses; see inet6_dump_addr(). */
+static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return inet6_dump_addr(skb, cb, UNICAST_ADDR);
+}
+
+/* Dump callback for multicast addresses; see inet6_dump_addr(). */
+static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return inet6_dump_addr(skb, cb, MULTICAST_ADDR);
+}
+
+
+/* Dump callback for anycast addresses; see inet6_dump_addr(). */
+static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return inet6_dump_addr(skb, cb, ANYCAST_ADDR);
+}
+
+/*
+ * RTM_GETADDR handler: look up one address and unicast a RTM_NEWADDR
+ * message describing it back to the requester.
+ *
+ * Returns the nlmsg_parse() error, -EINVAL when no usable address
+ * attribute is present, -EADDRNOTAVAIL when the address is not found,
+ * -ENOBUFS when the reply cannot be allocated, the inet6_fill_ifaddr()
+ * error, or the result of rtnl_unicast().
+ */
+static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh,
+			     void *arg)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct ifaddrmsg *ifm;
+	struct nlattr *tb[IFA_MAX+1];
+	struct in6_addr *addr = NULL;
+	struct net_device *dev = NULL;
+	struct inet6_ifaddr *ifa;
+	struct sk_buff *skb;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
+	if (err < 0)
+		goto errout;
+
+	addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]);
+	if (addr == NULL) {
+		err = -EINVAL;
+		goto errout;
+	}
+
+	/* An ifa_index of zero leaves dev NULL for the lookup below. */
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifa_index)
+		dev = __dev_get_by_index(net, ifm->ifa_index);
+
+	ifa = ipv6_get_ifaddr(net, addr, dev, 1);
+	if (!ifa) {
+		err = -EADDRNOTAVAIL;
+		goto errout;
+	}
+
+	skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL);
+	if (!skb) {
+		err = -ENOBUFS;
+		goto errout_ifa;
+	}
+
+	err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid,
+				nlh->nlmsg_seq, RTM_NEWADDR, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout_ifa;
+	}
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+errout_ifa:
+	/* Drop the reference taken by ipv6_get_ifaddr(). */
+	in6_ifa_put(ifa);
+errout:
+	return err;
+}
+
+/*
+ * Send an address @event for @ifa to the RTNLGRP_IPV6_IFADDR group.
+ * If the message cannot be allocated or built, the error is recorded
+ * for the group through rtnl_set_sk_err() instead.
+ */
+static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
+{
+	struct net *net = dev_net(ifa->idev->dev);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, -ENOBUFS);
+		return;
+	}
+
+	err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+	if (err >= 0) {
+		rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
+}
+
+/*
+ * Copy the settings in @cnf into @array, indexed by the DEVCONF_*
+ * constants.  @array is @bytes long and is zeroed first, so entries that
+ * are compiled out stay zero.  The three interval/delay values are stored
+ * in jiffies in @cnf and converted to milliseconds here.
+ *
+ * BUGs if @bytes is smaller than DEVCONF_MAX * 4.
+ */
+static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
+				__s32 *array, int bytes)
+{
+	BUG_ON(bytes < (DEVCONF_MAX * 4));
+
+	memset(array, 0, bytes);
+	array[DEVCONF_FORWARDING] = cnf->forwarding;
+	array[DEVCONF_HOPLIMIT] = cnf->hop_limit;
+	array[DEVCONF_MTU6] = cnf->mtu6;
+	array[DEVCONF_ACCEPT_RA] = cnf->accept_ra;
+	array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects;
+	array[DEVCONF_AUTOCONF] = cnf->autoconf;
+	array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits;
+	array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
+	array[DEVCONF_RTR_SOLICIT_INTERVAL] =
+		jiffies_to_msecs(cnf->rtr_solicit_interval);
+	array[DEVCONF_RTR_SOLICIT_DELAY] =
+		jiffies_to_msecs(cnf->rtr_solicit_delay);
+	array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
+#ifdef CONFIG_IPV6_PRIVACY
+	array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
+	array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
+	array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft;
+	array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry;
+	array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
+#endif
+	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
+	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
+	array[DEVCONF_RTR_PROBE_INTERVAL] =
+		jiffies_to_msecs(cnf->rtr_probe_interval);
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
+#endif
+#endif
+	array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
+	array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
+#endif
+#ifdef CONFIG_IPV6_MROUTE
+	array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+#endif
+	array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
+	array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
+	array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
+}
+
+/*
+ * Space needed for the attributes written by inet6_fill_ifla6_attrs():
+ * flags, cacheinfo, the devconf array and the two statistics arrays.
+ */
+static inline size_t inet6_ifla6_size(void)
+{
+	return nla_total_size(4) /* IFLA_INET6_FLAGS */
+	     + nla_total_size(sizeof(struct ifla_cacheinfo))
+	     + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
+	     + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
+	     + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */
+}
+
+/*
+ * Buffer size needed for one interface-info message: the ifinfomsg
+ * header, the generic link attributes and the nested IFLA_PROTINFO block
+ * sized by inet6_ifla6_size().
+ */
+static inline size_t inet6_if_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+	       + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+	       + nla_total_size(4) /* IFLA_MTU */
+	       + nla_total_size(4) /* IFLA_LINK */
+	       + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
+}
+
+/*
+ * Copy @items counters from the atomic array @mib into @stats, which is
+ * @bytes long.  Slot 0 of @stats receives the item count rather than a
+ * counter; slots 1..items-1 receive mib[1..items-1].  Any space beyond
+ * @items entries is zeroed.  BUGs if @bytes is too small for @items.
+ */
+static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
+				      int items, int bytes)
+{
+	int i;
+	int pad = bytes - sizeof(u64) * items;
+	BUG_ON(pad < 0);
+
+	/* Use put_unaligned() because stats may not be aligned for u64. */
+	put_unaligned(items, &stats[0]);
+	for (i = 1; i < items; i++)
+		put_unaligned(atomic_long_read(&mib[i]), &stats[i]);
+
+	memset(&stats[items], 0, pad);
+}
+
+/*
+ * Same layout as __snmp6_fill_statsdev(), but each counter is obtained
+ * from the per-cpu MIB @mib with snmp_fold_field64(), using @syncpoff as
+ * the offset argument.  Slot 0 of @stats receives the item count; space
+ * beyond @items entries is zeroed.  BUGs if @bytes is too small.
+ */
+static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib,
+				      int items, int bytes, size_t syncpoff)
+{
+	int i;
+	int pad = bytes - sizeof(u64) * items;
+	BUG_ON(pad < 0);
+
+	/* Use put_unaligned() because stats may not be aligned for u64. */
+	put_unaligned(items, &stats[0]);
+	for (i = 1; i < items; i++)
+		put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+
+	memset(&stats[items], 0, pad);
+}
+
+/*
+ * Fill @stats (@bytes long) with the statistics selected by @attrtype:
+ * the IPv6 MIB for IFLA_INET6_STATS, the per-device ICMPv6 MIB for
+ * IFLA_INET6_ICMP6STATS.  Any other @attrtype leaves @stats untouched.
+ */
+static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
+			     int bytes)
+{
+	if (attrtype == IFLA_INET6_STATS) {
+		__snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6,
+				     IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
+	} else if (attrtype == IFLA_INET6_ICMP6STATS) {
+		__snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes);
+	}
+}
+
+/*
+ * Append the IPv6 per-interface attributes of @idev to @skb: flags,
+ * cacheinfo, the devconf array and the IPv6 / ICMPv6 statistics.
+ *
+ * Returns 0 on success, -EMSGSIZE when @skb runs out of room.  Nothing
+ * already written is rolled back here; that is left to the caller.
+ */
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
+{
+	struct nlattr *nla;
+	struct ifla_cacheinfo ci;
+
+	/* The NLA_PUT* macros are expected to jump to nla_put_failure on error. */
+	NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags);
+
+	ci.max_reasm_len = IPV6_MAXPLEN;
+	ci.tstamp = cstamp_delta(idev->tstamp);
+	ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
+	ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time);
+	NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci);
+
+	/* Reserve each array attribute first, then fill it in place. */
+	nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
+	if (nla == NULL)
+		goto nla_put_failure;
+	ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla));
+
+	/* XXX - MC not implemented */
+
+	nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
+	if (nla == NULL)
+		goto nla_put_failure;
+	snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
+
+	nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
+	if (nla == NULL)
+		goto nla_put_failure;
+	snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/*
+ * Size of the IPv6 link attributes for @dev: zero when the device has no
+ * inet6_dev, inet6_ifla6_size() otherwise.
+ */
+static size_t inet6_get_link_af_size(const struct net_device *dev)
+{
+	return __in6_dev_get(dev) ? inet6_ifla6_size() : 0;
+}
+
+/*
+ * Write the IPv6 link attributes of @dev into @skb.
+ *
+ * Returns 0 on success, -ENODATA when the device has no inet6_dev and
+ * -EMSGSIZE when the attributes do not fit.
+ */
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct inet6_dev *idev = __in6_dev_get(dev);
+
+	if (idev == NULL)
+		return -ENODATA;
+
+	return inet6_fill_ifla6_attrs(skb, idev) < 0 ? -EMSGSIZE : 0;
+}
+
+/*
+ * Build one AF_INET6 interface-info message for @idev in @skb: the
+ * ifinfomsg header, name, hardware address (if any), MTU, link index
+ * (only when it differs from the device's own index) and a nested
+ * IFLA_PROTINFO block holding the IPv6 attributes.
+ *
+ * Returns the result of nlmsg_end() on success; on failure the partial
+ * message is cancelled and -EMSGSIZE is returned.
+ */
+static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
+			     u32 pid, u32 seq, int event, unsigned int flags)
+{
+	struct net_device *dev = idev->dev;
+	struct ifinfomsg *hdr;
+	struct nlmsghdr *nlh;
+	void *protoinfo;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	hdr = nlmsg_data(nlh);
+	hdr->ifi_family = AF_INET6;
+	hdr->__ifi_pad = 0;
+	hdr->ifi_type = dev->type;
+	hdr->ifi_index = dev->ifindex;
+	hdr->ifi_flags = dev_get_flags(dev);
+	hdr->ifi_change = 0;
+
+	/* The NLA_PUT* macros are expected to jump to nla_put_failure on error. */
+	NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
+
+	if (dev->addr_len)
+		NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+
+	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	if (dev->ifindex != dev->iflink)
+		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+
+	protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
+	if (protoinfo == NULL)
+		goto nla_put_failure;
+
+	if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, protoinfo);
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump callback emitting one RTM_NEWLINK message per device that has an
+ * inet6_dev.  Walks the namespace's ifindex hash table; resume state is
+ * kept in cb->args: [0] hash bucket, [1] device position in the bucket.
+ *
+ * Returns skb->len.
+ */
+static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int h, s_h;
+	int idx = 0, s_idx;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct hlist_head *head;
+	struct hlist_node *node;
+
+	s_h = cb->args[0];
+	s_idx = cb->args[1];
+
+	rcu_read_lock();
+	/* The device start index only applies to the first bucket visited. */
+	for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+		idx = 0;
+		head = &net->dev_index_head[h];
+		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+			if (idx < s_idx)
+				goto cont;
+			idev = __in6_dev_get(dev);
+			if (!idev)
+				goto cont;
+			/* <= 0 from the fill call ends this pass. */
+			if (inet6_fill_ifinfo(skb, idev,
+					      NETLINK_CB(cb->skb).pid,
+					      cb->nlh->nlmsg_seq,
+					      RTM_NEWLINK, NLM_F_MULTI) <= 0)
+				goto out;
+cont:
+			idx++;
+		}
+	}
+out:
+	rcu_read_unlock();
+	cb->args[1] = idx;
+	cb->args[0] = h;
+
+	return skb->len;
+}
+
+/*
+ * Send an interface-info @event for @idev to the RTNLGRP_IPV6_IFINFO
+ * group.  If the message cannot be allocated or built, the error is
+ * recorded for the group through rtnl_set_sk_err() instead.
+ */
+void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
+{
+	struct net *net = dev_net(idev->dev);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC);
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, -ENOBUFS);
+		return;
+	}
+
+	err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0);
+	if (err >= 0) {
+		rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
+}
+
+/*
+ * Buffer size needed for one prefix message: the prefixmsg header plus
+ * an in6_addr attribute and a prefix_cacheinfo attribute.
+ */
+static inline size_t inet6_prefix_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct prefixmsg))
+	       + nla_total_size(sizeof(struct in6_addr))
+	       + nla_total_size(sizeof(struct prefix_cacheinfo));
+}
+
+/*
+ * Build one prefix message for @pinfo as seen on @idev: the prefixmsg
+ * header (with on-link / autoconf flags taken from @pinfo), the prefix
+ * itself and its lifetimes.
+ *
+ * Returns the result of nlmsg_end() on success; on failure the partial
+ * message is cancelled and -EMSGSIZE is returned.
+ */
+static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
+			     struct prefix_info *pinfo, u32 pid, u32 seq,
+			     int event, unsigned int flags)
+{
+	struct prefixmsg *pmsg;
+	struct nlmsghdr *nlh;
+	struct prefix_cacheinfo	ci;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	pmsg = nlmsg_data(nlh);
+	pmsg->prefix_family = AF_INET6;
+	pmsg->prefix_pad1 = 0;
+	pmsg->prefix_pad2 = 0;
+	pmsg->prefix_ifindex = idev->dev->ifindex;
+	pmsg->prefix_len = pinfo->prefix_len;
+	pmsg->prefix_type = pinfo->type;
+	pmsg->prefix_pad3 = 0;
+	pmsg->prefix_flags = 0;
+	if (pinfo->onlink)
+		pmsg->prefix_flags |= IF_PREFIX_ONLINK;
+	if (pinfo->autoconf)
+		pmsg->prefix_flags |= IF_PREFIX_AUTOCONF;
+
+	/* The NLA_PUT macro is expected to jump to nla_put_failure on error. */
+	NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix);
+
+	/* The lifetimes in @pinfo are converted with ntohl() before export. */
+	ci.preferred_time = ntohl(pinfo->prefered);
+	ci.valid_time = ntohl(pinfo->valid);
+	NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Send a prefix @event for @pinfo on @idev to the RTNLGRP_IPV6_PREFIX
+ * group.  If the message cannot be allocated or built, the error is
+ * recorded for the group through rtnl_set_sk_err() instead.
+ */
+static void inet6_prefix_notify(int event, struct inet6_dev *idev,
+			 struct prefix_info *pinfo)
+{
+	struct net *net = dev_net(idev->dev);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC);
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, -ENOBUFS);
+		return;
+	}
+
+	err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0);
+	if (err >= 0) {
+		rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
+}
+
+/*
+ * Announce an address @event for @ifp over netlink and apply the
+ * matching route / anycast side effects.
+ *
+ * An @event of 0 is announced as RTM_NEWADDR, but the switch below works
+ * on the raw @event, so 0 triggers none of the side effects.
+ */
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
+{
+	inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);
+
+	switch (event) {
+	case RTM_NEWADDR:
+		/*
+		 * If the address was optimistic
+		 * we inserted the route at the start of
+		 * our DAD process, so we don't need
+		 * to do it again
+		 */
+		if (!(ifp->rt->rt6i_node))
+			ip6_ins_rt(ifp->rt);
+		if (ifp->idev->cnf.forwarding)
+			addrconf_join_anycast(ifp);
+		break;
+	case RTM_DELADDR:
+		if (ifp->idev->cnf.forwarding)
+			addrconf_leave_anycast(ifp);
+		addrconf_leave_solict(ifp->idev, &ifp->addr);
+		/* Take a dst reference before the route is deleted. */
+		dst_hold(&ifp->rt->dst);
+
+		/* A non-zero ip6_del_rt() result means we free the dst ourselves. */
+		if (ip6_del_rt(ifp->rt))
+			dst_free(&ifp->rt->dst);
+		break;
+	}
+}
+
+/*
+ * Wrapper around __ipv6_ifa_notify() that runs it inside an
+ * rcu_read_lock_bh() section and skips devices already marked dead.
+ */
+static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
+{
+	rcu_read_lock_bh();
+
+	if (likely(!ifp->idev->dead))
+		__ipv6_ifa_notify(event, ifp);
+
+	rcu_read_unlock_bh();
+}
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * proc handler for the "forwarding" sysctl entries.
+ *
+ * The integer is read or written with proc_dointvec().  After a
+ * successful write, addrconf_fixup_forwarding() is called with the value
+ * pointer and the previous value so it can react to the change.  On any
+ * non-zero result the file position is restored to what it was on entry.
+ *
+ * Returns the proc_dointvec() result for reads and for rejected writes,
+ * otherwise the addrconf_fixup_forwarding() result.
+ */
+static
+int addrconf_sysctl_forward(ctl_table *ctl, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	loff_t pos = *ppos;
+	int ret;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	/*
+	 * Only run the fixup when the write was accepted.  Running it
+	 * unconditionally overwrote the proc_dointvec() error, so a rejected
+	 * write could be reported to the caller as successful.
+	 */
+	if (write && !ret)
+		ret = addrconf_fixup_forwarding(ctl, valp, val);
+	if (ret)
+		*ppos = pos;
+	return ret;
+}
+
+/*
+ * Apply the current disable_ipv6 setting of @idev by feeding
+ * addrconf_notify() a NETDEV_DOWN (disabled) or NETDEV_UP (enabled)
+ * event for its device.  Does nothing without an idev or device.
+ */
+static void dev_disable_change(struct inet6_dev *idev)
+{
+	unsigned long event;
+
+	if (!idev || !idev->dev)
+		return;
+
+	event = idev->cnf.disable_ipv6 ? NETDEV_DOWN : NETDEV_UP;
+	addrconf_notify(NULL, event, idev->dev);
+}
+
+/*
+ * Store @newf as the disable_ipv6 setting of every device in @net that
+ * has an inet6_dev, and apply the change on those devices whose
+ * enabled/disabled state actually flipped.
+ */
+static void addrconf_disable_change(struct net *net, __s32 newf)
+{
+	struct net_device *dev;
+	struct inet6_dev *idev;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		idev = __in6_dev_get(dev);
+		if (idev) {
+			/* Compare as booleans: only zero vs. non-zero matters. */
+			int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
+			idev->cnf.disable_ipv6 = newf;
+			if (changed)
+				dev_disable_change(idev);
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * React to a write to a disable_ipv6 sysctl.
+ *
+ * @table: sysctl entry; extra2 holds the struct net, extra1 the
+ *         inet6_dev for per-device entries
+ * @p:     location of the value that was written
+ * @old:   value before the write
+ *
+ * Writes to the "default" entry need no further action.  A write to the
+ * "all" entry is copied to the default and pushed to every device; a
+ * write to a per-device entry is applied when the state flipped.
+ *
+ * Returns 0, or the result of restart_syscall() when the RTNL lock
+ * could not be taken.
+ */
+static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old)
+{
+	struct net *net;
+
+	net = (struct net *)table->extra2;
+
+	if (p == &net->ipv6.devconf_dflt->disable_ipv6)
+		return 0;
+
+	if (!rtnl_trylock()) {
+		/* Restore the original values before restarting */
+		*p = old;
+		return restart_syscall();
+	}
+
+	if (p == &net->ipv6.devconf_all->disable_ipv6) {
+		__s32 newf = net->ipv6.devconf_all->disable_ipv6;
+		net->ipv6.devconf_dflt->disable_ipv6 = newf;
+		addrconf_disable_change(net, newf);
+	} else if ((!*p) ^ (!old))
+		dev_disable_change((struct inet6_dev *)table->extra1);
+
+	rtnl_unlock();
+	return 0;
+}
+
+/*
+ * proc handler for the "disable_ipv6" sysctl entries.
+ *
+ * The integer is read or written with proc_dointvec().  After a
+ * successful write, addrconf_disable_ipv6() applies the new setting.  On
+ * any non-zero result the file position is restored to what it was on
+ * entry.
+ *
+ * Returns the proc_dointvec() result for reads and for rejected writes,
+ * otherwise the addrconf_disable_ipv6() result.
+ */
+static
+int addrconf_sysctl_disable(ctl_table *ctl, int write,
+			    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	loff_t pos = *ppos;
+	int ret;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	/*
+	 * Only apply the setting when the write was accepted.  Calling
+	 * addrconf_disable_ipv6() unconditionally overwrote the
+	 * proc_dointvec() error, so a rejected write could be reported to
+	 * the caller as successful.
+	 */
+	if (write && !ret)
+		ret = addrconf_disable_ipv6(ctl, valp, val);
+	if (ret)
+		*ppos = pos;
+	return ret;
+}
+
+/*
+ * Template for the per-configuration sysctl directory.  Every entry's
+ * .data points into the global ipv6_devconf; __addrconf_sysctl_register()
+ * duplicates this table and rebases those pointers onto the
+ * ipv6_devconf instance being registered.  The array ends with an empty
+ * sentinel entry.
+ */
+static struct addrconf_sysctl_table
+{
+	struct ctl_table_header *sysctl_header;
+	ctl_table addrconf_vars[DEVCONF_MAX+1];
+	char *dev_name;
+} addrconf_sysctl __read_mostly = {
+	.sysctl_header = NULL,
+	.addrconf_vars = {
+		{
+			.procname	= "forwarding",
+			.data		= &ipv6_devconf.forwarding,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= addrconf_sysctl_forward,
+		},
+		{
+			.procname	= "hop_limit",
+			.data		= &ipv6_devconf.hop_limit,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "mtu",
+			.data		= &ipv6_devconf.mtu6,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "accept_ra",
+			.data		= &ipv6_devconf.accept_ra,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "accept_redirects",
+			.data		= &ipv6_devconf.accept_redirects,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "autoconf",
+			.data		= &ipv6_devconf.autoconf,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "dad_transmits",
+			.data		= &ipv6_devconf.dad_transmits,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "router_solicitations",
+			.data		= &ipv6_devconf.rtr_solicits,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "router_solicitation_interval",
+			.data		= &ipv6_devconf.rtr_solicit_interval,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+		{
+			.procname	= "router_solicitation_delay",
+			.data		= &ipv6_devconf.rtr_solicit_delay,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+		{
+			.procname	= "force_mld_version",
+			.data		= &ipv6_devconf.force_mld_version,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+#ifdef CONFIG_IPV6_PRIVACY
+		{
+			.procname	= "use_tempaddr",
+			.data		= &ipv6_devconf.use_tempaddr,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "temp_valid_lft",
+			.data		= &ipv6_devconf.temp_valid_lft,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "temp_prefered_lft",
+			.data		= &ipv6_devconf.temp_prefered_lft,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "regen_max_retry",
+			.data		= &ipv6_devconf.regen_max_retry,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "max_desync_factor",
+			.data		= &ipv6_devconf.max_desync_factor,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+#endif
+		{
+			.procname	= "max_addresses",
+			.data		= &ipv6_devconf.max_addresses,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "accept_ra_defrtr",
+			.data		= &ipv6_devconf.accept_ra_defrtr,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "accept_ra_pinfo",
+			.data		= &ipv6_devconf.accept_ra_pinfo,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+#ifdef CONFIG_IPV6_ROUTER_PREF
+		{
+			.procname	= "accept_ra_rtr_pref",
+			.data		= &ipv6_devconf.accept_ra_rtr_pref,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "router_probe_interval",
+			.data		= &ipv6_devconf.rtr_probe_interval,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_jiffies,
+		},
+#ifdef CONFIG_IPV6_ROUTE_INFO
+		{
+			.procname	= "accept_ra_rt_info_max_plen",
+			.data		= &ipv6_devconf.accept_ra_rt_info_max_plen,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+#endif
+#endif
+		{
+			.procname	= "proxy_ndp",
+			.data		= &ipv6_devconf.proxy_ndp,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname	= "accept_source_route",
+			.data		= &ipv6_devconf.accept_source_route,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+		{
+			.procname       = "optimistic_dad",
+			.data           = &ipv6_devconf.optimistic_dad,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
+#endif
+#ifdef CONFIG_IPV6_MROUTE
+		{
+			.procname	= "mc_forwarding",
+			.data		= &ipv6_devconf.mc_forwarding,
+			.maxlen		= sizeof(int),
+			.mode		= 0444,
+			.proc_handler	= proc_dointvec,
+		},
+#endif
+		{
+			.procname	= "disable_ipv6",
+			.data		= &ipv6_devconf.disable_ipv6,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= addrconf_sysctl_disable,
+		},
+		{
+			.procname	= "accept_dad",
+			.data		= &ipv6_devconf.accept_dad,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
+		{
+			.procname       = "force_tllao",
+			.data           = &ipv6_devconf.force_tllao,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec
+		},
+		{
+			/* sentinel */
+		}
+	},
+};
+
+/*
+ * Register the sysctl directory net/ipv6/conf/<dev_name> for the
+ * configuration block @p.
+ *
+ * @net:      namespace the table is registered in (stored in extra2)
+ * @dev_name: directory name; a private copy is made
+ * @idev:     stored in extra1 of every entry; the callers in this file
+ *            pass NULL for the "all" and "default" directories
+ * @p:        configuration block the entries read and write
+ *
+ * On success the new table is remembered in p->sysctl and 0 is returned.
+ * Every failure returns -ENOBUFS.
+ */
+static int __addrconf_sysctl_register(struct net *net, char *dev_name,
+		struct inet6_dev *idev, struct ipv6_devconf *p)
+{
+	int i;
+	struct addrconf_sysctl_table *t;
+
+#define ADDRCONF_CTL_PATH_DEV	3
+
+	struct ctl_path addrconf_ctl_path[] = {
+		{ .procname = "net", },
+		{ .procname = "ipv6", },
+		{ .procname = "conf", },
+		{ /* to be set */ },
+		{ },
+	};
+
+
+	t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
+	if (t == NULL)
+		goto out;
+
+	/*
+	 * The template's .data pointers refer to the global ipv6_devconf;
+	 * shift each by the distance between that and @p.  The loop stops at
+	 * the sentinel, whose .data is NULL.
+	 */
+	for (i = 0; t->addrconf_vars[i].data; i++) {
+		t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
+		t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
+		t->addrconf_vars[i].extra2 = net;
+	}
+
+	/*
+	 * Make a copy of dev_name, because '.procname' is regarded as const
+	 * by sysctl and we wouldn't want anyone to change it under our feet
+	 * (see SIOCSIFNAME).
+	 */
+	t->dev_name = kstrdup(dev_name, GFP_KERNEL);
+	if (!t->dev_name)
+		goto free;
+
+	addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name;
+
+	t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path,
+			t->addrconf_vars);
+	if (t->sysctl_header == NULL)
+		goto free_procname;
+
+	p->sysctl = t;
+	return 0;
+
+free_procname:
+	kfree(t->dev_name);
+free:
+	kfree(t);
+out:
+	return -ENOBUFS;
+}
+
+/*
+ * Undo __addrconf_sysctl_register() for @p: unregister the table and
+ * free it together with its copied name.  Does nothing when @p has no
+ * table registered.
+ */
+static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
+{
+	struct addrconf_sysctl_table *table = p->sysctl;
+
+	if (!table)
+		return;
+
+	/* Detach first, then tear the table down. */
+	p->sysctl = NULL;
+	unregister_net_sysctl_table(table->sysctl_header);
+	kfree(table->dev_name);
+	kfree(table);
+}
+
+/*
+ * Register the neighbour sysctls and the net/ipv6/conf/<device name>
+ * directory for @idev.  The result of __addrconf_sysctl_register() is
+ * not checked here.
+ */
+static void addrconf_sysctl_register(struct inet6_dev *idev)
+{
+	neigh_sysctl_register(idev->dev, idev->nd_parms, "ipv6",
+			      &ndisc_ifinfo_sysctl_change);
+	__addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
+					idev, &idev->cnf);
+}
+
+/*
+ * Remove the sysctls set up by addrconf_sysctl_register(), in the
+ * reverse order of their registration.
+ */
+static void addrconf_sysctl_unregister(struct inet6_dev *idev)
+{
+	__addrconf_sysctl_unregister(&idev->cnf);
+	neigh_sysctl_unregister(idev->nd_parms);
+}
+
+
+#endif
+
+/*
+ * Per-namespace init: set up the "all" and "default" ipv6_devconf blocks
+ * and, with CONFIG_SYSCTL, register their sysctl directories.
+ *
+ * init_net uses the static ipv6_devconf / ipv6_devconf_dflt objects
+ * directly (after seeding autoconf and disable_ipv6 from ipv6_defaults);
+ * every other namespace gets kmemdup()ed copies of them.
+ *
+ * Returns 0 on success, -ENOMEM if a copy cannot be allocated, or the
+ * error returned by __addrconf_sysctl_register().
+ */
+static int __net_init addrconf_init_net(struct net *net)
+{
+	int err;
+	struct ipv6_devconf *all, *dflt;
+
+	err = -ENOMEM;
+	all = &ipv6_devconf;
+	dflt = &ipv6_devconf_dflt;
+
+	if (!net_eq(net, &init_net)) {
+		all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL);
+		if (all == NULL)
+			goto err_alloc_all;
+
+		dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
+		if (dflt == NULL)
+			goto err_alloc_dflt;
+	} else {
+		/* these will be inherited by all namespaces */
+		dflt->autoconf = ipv6_defaults.autoconf;
+		dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
+	}
+
+	net->ipv6.devconf_all = all;
+	net->ipv6.devconf_dflt = dflt;
+
+#ifdef CONFIG_SYSCTL
+	err = __addrconf_sysctl_register(net, "all", NULL, all);
+	if (err < 0)
+		goto err_reg_all;
+
+	err = __addrconf_sysctl_register(net, "default", NULL, dflt);
+	if (err < 0)
+		goto err_reg_dflt;
+#endif
+	return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_dflt:
+	__addrconf_sysctl_unregister(all);
+err_reg_all:
+	/*
+	 * For init_net, all/dflt point at the static ipv6_devconf and
+	 * ipv6_devconf_dflt objects, which must never reach kfree().
+	 */
+	if (!net_eq(net, &init_net))
+		kfree(dflt);
+#endif
+err_alloc_dflt:
+	if (!net_eq(net, &init_net))
+		kfree(all);
+err_alloc_all:
+	return err;
+}
+
+/*
+ * Per-namespace teardown: unregister the "default" and "all" sysctl tables
+ * and release the devconf copies owned by the namespace.
+ */
+static void __net_exit addrconf_exit_net(struct net *net)
+{
+#ifdef CONFIG_SYSCTL
+	__addrconf_sysctl_unregister(net->ipv6.devconf_dflt);
+	__addrconf_sysctl_unregister(net->ipv6.devconf_all);
+#endif
+	/* init_net points at the static devconf objects; nothing to free. */
+	if (net_eq(net, &init_net))
+		return;
+
+	kfree(net->ipv6.devconf_dflt);
+	kfree(net->ipv6.devconf_all);
+}
+
+/*
+ * Per-network-namespace hooks for addrconf; registered by addrconf_init()
+ * and unregistered by addrconf_cleanup().
+ */
+static struct pernet_operations addrconf_ops = {
+	.init = addrconf_init_net,
+	.exit = addrconf_exit_net,
+};
+
+/*
+ *      Device notifier
+ */
+
+/*
+ * Add @nb to the atomic inet6addr_chain notifier chain.
+ * Returns the result of atomic_notifier_chain_register().
+ */
+int register_inet6addr_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&inet6addr_chain, nb);
+}
+EXPORT_SYMBOL(register_inet6addr_notifier);
+
+/*
+ * Remove @nb from the atomic inet6addr_chain notifier chain.
+ * Returns the result of atomic_notifier_chain_unregister().
+ */
+int unregister_inet6addr_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&inet6addr_chain, nb);
+}
+EXPORT_SYMBOL(unregister_inet6addr_notifier);
+
+/*
+ * rtnetlink address-family operations for AF_INET6; registered with
+ * rtnl_af_register() in addrconf_init().
+ */
+static struct rtnl_af_ops inet6_ops = {
+	.family		  = AF_INET6,
+	.fill_link_af	  = inet6_fill_link_af,
+	.get_link_af_size = inet6_get_link_af_size,
+};
+
+/*
+ *	Init / cleanup code
+ */
+
+/*
+ * Boot/module-load initialisation of addrconf.
+ *
+ * In order: address-label policy table, per-namespace ops, the IPv6 private
+ * data of init_net's loopback device, the address hash table, the netdevice
+ * notifier, the address verification run, and finally the rtnetlink
+ * AF ops and message handlers.
+ *
+ * Returns 0 on success or a negative errno; on failure the steps covered by
+ * the error labels below are undone in reverse order.
+ */
+int __init addrconf_init(void)
+{
+	int i, err;
+
+	err = ipv6_addr_label_init();
+	if (err < 0) {
+		printk(KERN_CRIT "IPv6 Addrconf:"
+		       " cannot initialize default policy table: %d.\n", err);
+		goto out;
+	}
+
+	err = register_pernet_subsys(&addrconf_ops);
+	if (err < 0)
+		goto out_addrlabel;
+
+	/* The addrconf netdev notifier requires that loopback_dev
+	 * has its ipv6 private information allocated and set up
+	 * before it can bring up and give link-local addresses
+	 * to other devices which are up.
+	 *
+	 * Unfortunately, loopback_dev is not necessarily the first
+	 * entry in the global dev_base list of net devices.  In fact,
+	 * it is likely to be the very last entry on that list.
+	 * So this causes the notifier registry below to try and
+	 * give link-local addresses to all devices besides loopback_dev
+	 * first, then loopback_dev, which causes all the non-loopback_dev
+	 * devices to fail to get a link-local address.
+	 *
+	 * So, as a temporary fix, allocate the ipv6 structure for
+	 * loopback_dev first by hand.
+	 * Longer term, all of the dependencies ipv6 has upon the loopback
+	 * device and it being up should be removed.
+	 */
+	rtnl_lock();
+	if (!ipv6_add_dev(init_net.loopback_dev))
+		err = -ENOMEM;
+	rtnl_unlock();
+	if (err)
+		goto errlo;
+
+	for (i = 0; i < IN6_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet6_addr_lst[i]);
+
+	/* The return value of the notifier registration is not checked. */
+	register_netdevice_notifier(&ipv6_dev_notf);
+
+	addrconf_verify(0);
+
+	err = rtnl_af_register(&inet6_ops);
+	if (err < 0)
+		goto errout_af;
+
+	err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo);
+	if (err < 0)
+		goto errout;
+
+	/* Only the first call to __rtnl_register can fail */
+	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL);
+	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL);
+	__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, inet6_dump_ifaddr);
+	__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr);
+	__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr);
+
+	ipv6_addr_label_rtnl_register();
+
+	return 0;
+	/* Error unwinding: each label undoes the step registered before it. */
+errout:
+	rtnl_af_unregister(&inet6_ops);
+errout_af:
+	unregister_netdevice_notifier(&ipv6_dev_notf);
+errlo:
+	unregister_pernet_subsys(&addrconf_ops);
+out_addrlabel:
+	ipv6_addr_label_cleanup();
+out:
+	return err;
+}
+
+/*
+ * Tear down addrconf: unregister the netdevice notifier, the per-namespace
+ * ops and the address-label table, then, under the RTNL lock, unregister
+ * the rtnetlink AF ops, take IPv6 down on every init_net device, check that
+ * the address hash table is empty and delete the address check timer.
+ */
+void addrconf_cleanup(void)
+{
+	struct net_device *dev;
+	int i;
+
+	unregister_netdevice_notifier(&ipv6_dev_notf);
+	unregister_pernet_subsys(&addrconf_ops);
+	ipv6_addr_label_cleanup();
+
+	rtnl_lock();
+
+	__rtnl_af_unregister(&inet6_ops);
+
+	/* clean dev list */
+	for_each_netdev(&init_net, dev) {
+		/* Skip devices that have no IPv6 private data. */
+		if (__in6_dev_get(dev) == NULL)
+			continue;
+		addrconf_ifdown(dev, 1);
+	}
+	/* The loopback device gets a second pass with how == 2. */
+	addrconf_ifdown(init_net.loopback_dev, 2);
+
+	/*
+	 *	Check hash table.
+	 *	Every bucket is expected to be empty by now; warn otherwise.
+	 */
+	spin_lock_bh(&addrconf_hash_lock);
+	for (i = 0; i < IN6_ADDR_HSIZE; i++)
+		WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
+	spin_unlock_bh(&addrconf_hash_lock);
+
+	del_timer(&addr_chk_timer);
+	rtnl_unlock();
+}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
new file mode 100644
index 00000000..6b038265
--- /dev/null
+++ b/net/ipv6/addrconf_core.c
@@ -0,0 +1,79 @@
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+
+#include <net/ipv6.h>
+
+/* Place a scope value in the bits above the low 16 address-type bits. */
+#define IPV6_ADDR_SCOPE_TYPE(scope)	((scope) << 16)
+
+/*
+ * Convert a multicast scope value into an address-type word: the scope is
+ * encoded with IPV6_ADDR_SCOPE_TYPE() and, for node-, link- and site-local
+ * scope, the matching LOOPBACK/LINKLOCAL/SITELOCAL flag is OR-ed in.
+ */
+static inline unsigned ipv6_addr_scope2type(unsigned scope)
+{
+	unsigned type = IPV6_ADDR_SCOPE_TYPE(scope);
+
+	if (scope == IPV6_ADDR_SCOPE_NODELOCAL)
+		type |= IPV6_ADDR_LOOPBACK;
+	else if (scope == IPV6_ADDR_SCOPE_LINKLOCAL)
+		type |= IPV6_ADDR_LINKLOCAL;
+	else if (scope == IPV6_ADDR_SCOPE_SITELOCAL)
+		type |= IPV6_ADDR_SITELOCAL;
+
+	return type;
+}
+
+/*
+ * Classify @addr.
+ *
+ * Returns a word combining IPV6_ADDR_* type flags in the low bits with the
+ * address scope encoded by IPV6_ADDR_SCOPE_TYPE().  The checks are ordered:
+ * the cheap "first three bits" test comes first, then multicast (ff00::/8),
+ * link-local (fe80::/10), site-local (fec0::/10), ULA (fc00::/7), and
+ * finally the special ::/64-prefixed forms (unspecified, loopback,
+ * v4-compatible, v4-mapped).  Anything else is treated as global unicast.
+ */
+int __ipv6_addr_type(const struct in6_addr *addr)
+{
+	__be32 st;
+
+	/* Only the first 32 bits are needed for most classifications. */
+	st = addr->s6_addr32[0];
+
+	/* Consider all addresses with the first three bits different of
+	   000 and 111 as unicasts.
+	 */
+	if ((st & htonl(0xE0000000)) != htonl(0x00000000) &&
+	    (st & htonl(0xE0000000)) != htonl(0xE0000000))
+		return (IPV6_ADDR_UNICAST |
+			IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));
+
+	if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) {
+		/* multicast */
+		/* addr-select 3.1 */
+		return (IPV6_ADDR_MULTICAST |
+			ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr)));
+	}
+
+	if ((st & htonl(0xFFC00000)) == htonl(0xFE800000))
+		return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST |
+			IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL));		/* addr-select 3.1 */
+	if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000))
+		return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST |
+			IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL));		/* addr-select 3.1 */
+	if ((st & htonl(0xFE000000)) == htonl(0xFC000000))
+		return (IPV6_ADDR_UNICAST |
+			IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));			/* RFC 4193 */
+
+	/* Upper 64 bits all zero: one of the special embedded forms. */
+	if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) {
+		if (addr->s6_addr32[2] == 0) {
+			/* :: (unspecified) */
+			if (addr->s6_addr32[3] == 0)
+				return IPV6_ADDR_ANY;
+
+			/* ::1 (loopback); reported with link-local scope */
+			if (addr->s6_addr32[3] == htonl(0x00000001))
+				return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST |
+					IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL));	/* addr-select 3.4 */
+
+			/* ::a.b.c.d (IPv4-compatible) */
+			return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST |
+				IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));	/* addr-select 3.3 */
+		}
+
+		/* ::ffff:a.b.c.d (IPv4-mapped); note: no UNICAST flag */
+		if (addr->s6_addr32[2] == htonl(0x0000ffff))
+			return (IPV6_ADDR_MAPPED |
+				IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));	/* addr-select 3.3 */
+	}
+
+	return (IPV6_ADDR_UNICAST |
+		IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));	/* addr-select 3.4 */
+}
+EXPORT_SYMBOL(__ipv6_addr_type);
+
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
new file mode 100644
index 00000000..c8993e5a
--- /dev/null
+++ b/net/ipv6/addrlabel.c
@@ -0,0 +1,599 @@
+/*
+ * IPv6 Address Label subsystem
+ * for the IPv6 "Default" Source Address Selection
+ *
+ * Copyright (C)2007 USAGI/WIDE Project
+ */
+/*
+ * Author:
+ * 	YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/addrconf.h>
+#include <linux/if_addrlabel.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+/*
+ * Debug tracing for this file: change "#if 0" to "#if 1" to route
+ * ADDRLABEL() to printk(); by default it expands to an empty statement.
+ */
+#if 0
+#define ADDRLABEL(x...) printk(x)
+#else
+#define ADDRLABEL(x...) do { ; } while(0)
+#endif
+
+/*
+ * Policy Table
+ */
+/* One rule of the address label policy table. */
+struct ip6addrlbl_entry
+{
+#ifdef CONFIG_NET_NS
+	struct net *lbl_net;		/* owning namespace (held via hold_net) */
+#endif
+	struct in6_addr prefix;		/* prefix, truncated to prefixlen bits */
+	int prefixlen;			/* prefix length in bits */
+	int ifindex;			/* interface restriction; 0 matches any */
+	int addrtype;			/* address type restriction; 0 matches any */
+	u32 label;			/* label assigned to matching addresses */
+	struct hlist_node list;		/* link in ip6addrlbl_table.head */
+	atomic_t refcnt;		/* entry is freed via RCU when this hits 0 */
+	struct rcu_head rcu;		/* used by call_rcu() in ip6addrlbl_put() */
+};
+
+/*
+ * The single, global policy table shared by all namespaces; entries carry
+ * their namespace themselves.
+ */
+static struct ip6addrlbl_table
+{
+	struct hlist_head head;	/* entries; readers traverse under RCU */
+	spinlock_t lock;	/* taken around add, delete and netns-exit updates */
+	u32 seq;		/* incremented on every successful add */
+} ip6addrlbl_table;
+
+/* Return the network namespace recorded in @lbl (via read_pnet()). */
+static inline
+struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
+{
+	return read_pnet(&lbl->lbl_net);
+}
+
+/*
+ * Default policy table (RFC3484 + extensions)
+ *
+ * prefix		addr_type	label
+ * -------------------------------------------------------------------------
+ * ::1/128		LOOPBACK	0
+ * ::/0			N/A		1
+ * 2002::/16		N/A		2
+ * ::/96		COMPATv4	3
+ * ::ffff:0:0/96	V4MAPPED	4
+ * fc00::/7		N/A		5		ULA (RFC 4193)
+ * 2001::/32		N/A		6		Teredo (RFC 4380)
+ * 2001:10::/28		N/A		7		ORCHID (RFC 4843)
+ *
+ * Note: 0xffffffff is used if we do not have any policies.
+ */
+
+#define IPV6_ADDR_LABEL_DEFAULT	0xffffffffUL
+
+/*
+ * Default rules installed into every namespace by ip6addrlbl_net_init().
+ * The order here does not determine lookup order: __ip6addrlbl_add()
+ * inserts each rule at its sorted position in the list.
+ * An omitted .prefixlen is 0.
+ */
+static const __net_initdata struct ip6addrlbl_init_table
+{
+	const struct in6_addr *prefix;
+	int prefixlen;
+	u32 label;
+} ip6addrlbl_init_table[] = {
+	{	/* ::/0 */
+		.prefix = &in6addr_any,
+		.label = 1,
+	},{	/* fc00::/7 */
+		.prefix = &(struct in6_addr){{{ 0xfc }}},
+		.prefixlen = 7,
+		.label = 5,
+	},{	/* 2002::/16 */
+		.prefix = &(struct in6_addr){{{ 0x20, 0x02 }}},
+		.prefixlen = 16,
+		.label = 2,
+	},{	/* 2001::/32 */
+		.prefix = &(struct in6_addr){{{ 0x20, 0x01 }}},
+		.prefixlen = 32,
+		.label = 6,
+	},{	/* 2001:10::/28 */
+		.prefix = &(struct in6_addr){{{ 0x20, 0x01, 0x00, 0x10 }}},
+		.prefixlen = 28,
+		.label = 7,
+	},{	/* ::ffff:0:0/96 */
+		.prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}},
+		.prefixlen = 96,
+		.label = 4,
+	},{	/* ::/96 */
+		.prefix = &in6addr_any,
+		.prefixlen = 96,
+		.label = 3,
+	},{	/* ::1/128 */
+		.prefix = &in6addr_loopback,
+		.prefixlen = 128,
+		.label = 0,
+	}
+};
+
+/* Object management */
+/*
+ * Free @p immediately: release the namespace reference taken in
+ * ip6addrlbl_alloc() (CONFIG_NET_NS only) and kfree() the entry.
+ */
+static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p)
+{
+#ifdef CONFIG_NET_NS
+	release_net(p->lbl_net);
+#endif
+	kfree(p);
+}
+
+/* call_rcu() callback: recover the entry embedding @h and free it. */
+static void ip6addrlbl_free_rcu(struct rcu_head *h)
+{
+	struct ip6addrlbl_entry *entry;
+
+	entry = container_of(h, struct ip6addrlbl_entry, rcu);
+	ip6addrlbl_free(entry);
+}
+
+/*
+ * Try to take a reference on @p.
+ * Returns non-zero on success and 0 if the refcount had already dropped to
+ * zero (the entry is on its way to being freed and must not be used).
+ */
+static inline int ip6addrlbl_hold(struct ip6addrlbl_entry *p)
+{
+	return atomic_inc_not_zero(&p->refcnt);
+}
+
+/*
+ * Drop one reference on @p; when the last one goes, queue the entry for
+ * freeing through call_rcu().
+ */
+static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p)
+{
+	if (!atomic_dec_and_test(&p->refcnt))
+		return;
+
+	call_rcu(&p->rcu, ip6addrlbl_free_rcu);
+}
+
+/* Find label */
+/*
+ * Return 1 if rule @p applies to @addr in @net, 0 otherwise.
+ * The rule must belong to @net, its interface and address-type
+ * restrictions (if non-zero) must equal @ifindex / @addrtype, and @addr
+ * must share the rule's prefix.
+ */
+static int __ip6addrlbl_match(struct net *net,
+			      struct ip6addrlbl_entry *p,
+			      const struct in6_addr *addr,
+			      int addrtype, int ifindex)
+{
+	int wrong_if, wrong_type;
+
+	if (!net_eq(ip6addrlbl_net(p), net))
+		return 0;
+
+	/* A zero ifindex/addrtype in the rule acts as a wildcard. */
+	wrong_if = p->ifindex && p->ifindex != ifindex;
+	wrong_type = p->addrtype && p->addrtype != addrtype;
+	if (wrong_if || wrong_type)
+		return 0;
+
+	return ipv6_prefix_equal(addr, &p->prefix, p->prefixlen) ? 1 : 0;
+}
+
+/*
+ * Return the first rule in the table that matches (@net, @addr, @type,
+ * @ifindex), or NULL if none does.  The list is walked with the RCU
+ * iterator and no reference is taken on the result; both callers in this
+ * file invoke it between rcu_read_lock() and rcu_read_unlock().
+ */
+static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net,
+						  const struct in6_addr *addr,
+						  int type, int ifindex)
+{
+	struct hlist_node *pos;
+	struct ip6addrlbl_entry *p;
+	hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) {
+		if (__ip6addrlbl_match(net, p, addr, type, ifindex))
+			return p;
+	}
+	return NULL;
+}
+
+/*
+ * Look up the policy label for @addr on interface @ifindex in @net.
+ * Only the MAPPED, COMPATv4 and LOOPBACK bits of @type take part in the
+ * match.  Returns the label of the first matching rule, or
+ * IPV6_ADDR_LABEL_DEFAULT when no rule matches.
+ */
+u32 ipv6_addr_label(struct net *net,
+		    const struct in6_addr *addr, int type, int ifindex)
+{
+	struct ip6addrlbl_entry *entry;
+	u32 label = IPV6_ADDR_LABEL_DEFAULT;
+
+	type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK;
+
+	/* The entry is only valid inside the RCU read section; copy out. */
+	rcu_read_lock();
+	entry = __ipv6_addr_label(net, addr, type, ifindex);
+	if (entry)
+		label = entry->label;
+	rcu_read_unlock();
+
+	ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n",
+		  __func__, addr, type, ifindex, label);
+
+	return label;
+}
+
+/*
+ * Allocate and initialise one policy entry (not yet linked into the table).
+ *
+ * The address-type restriction is derived from @prefix: it is kept only
+ * when @prefixlen is exactly the natural length of that type (96 for
+ * v4-mapped and v4-compatible, 128 for loopback) and cleared otherwise.
+ * A v4-mapped prefix longer than 96 bits is rejected.
+ *
+ * Returns the new entry with refcnt 1, or ERR_PTR(-EINVAL) /
+ * ERR_PTR(-ENOMEM) on failure.
+ */
+static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
+						 const struct in6_addr *prefix,
+						 int prefixlen, int ifindex,
+						 u32 label)
+{
+	struct ip6addrlbl_entry *newp;
+	int addrtype;
+
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n",
+		  __func__, prefix, prefixlen, ifindex, (unsigned int)label);
+
+	addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK);
+
+	switch (addrtype) {
+	case IPV6_ADDR_MAPPED:
+		if (prefixlen > 96)
+			return ERR_PTR(-EINVAL);
+		if (prefixlen < 96)
+			addrtype = 0;
+		break;
+	case IPV6_ADDR_COMPATv4:
+		if (prefixlen != 96)
+			addrtype = 0;
+		break;
+	case IPV6_ADDR_LOOPBACK:
+		if (prefixlen != 128)
+			addrtype = 0;
+		break;
+	}
+
+	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
+	if (!newp)
+		return ERR_PTR(-ENOMEM);
+
+	/* Store only the first prefixlen bits of the caller's address. */
+	ipv6_addr_prefix(&newp->prefix, prefix, prefixlen);
+	newp->prefixlen = prefixlen;
+	newp->ifindex = ifindex;
+	newp->addrtype = addrtype;
+	newp->label = label;
+	INIT_HLIST_NODE(&newp->list);
+#ifdef CONFIG_NET_NS
+	newp->lbl_net = hold_net(net);
+#endif
+	atomic_set(&newp->refcnt, 1);
+	return newp;
+}
+
+/*
+ * Link @newp into the policy table; ip6addrlbl_add() calls this with
+ * ip6addrlbl_table.lock held.
+ *
+ * The list is kept sorted: @newp is inserted before the first entry that
+ * has a shorter prefix, or the same prefix length and no interface
+ * restriction.  An existing entry with the same prefix, prefix length,
+ * namespace and ifindex is replaced when @replace is set, otherwise the add
+ * fails.
+ *
+ * Returns 0 on success (and bumps the table sequence number) or -EEXIST.
+ */
+static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
+{
+	int ret = 0;
+
+	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n",
+			__func__,
+			newp, replace);
+
+	if (hlist_empty(&ip6addrlbl_table.head)) {
+		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
+	} else {
+		struct hlist_node *pos, *n;
+		struct ip6addrlbl_entry *p = NULL;
+		hlist_for_each_entry_safe(p, pos, n,
+					  &ip6addrlbl_table.head, list) {
+			if (p->prefixlen == newp->prefixlen &&
+			    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
+			    p->ifindex == newp->ifindex &&
+			    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
+				/* Same rule key already present. */
+				if (!replace) {
+					ret = -EEXIST;
+					goto out;
+				}
+				hlist_replace_rcu(&p->list, &newp->list);
+				/* Drop the table's reference on the old entry. */
+				ip6addrlbl_put(p);
+				goto out;
+			} else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
+				   (p->prefixlen < newp->prefixlen)) {
+				hlist_add_before_rcu(&newp->list, &p->list);
+				goto out;
+			}
+		}
+		/* No insertion point found: append after the last entry (p). */
+		hlist_add_after_rcu(&p->list, &newp->list);
+	}
+out:
+	if (!ret)
+		ip6addrlbl_table.seq++;
+	return ret;
+}
+
+/*
+ * Allocate a policy entry for (@net, @prefix/@prefixlen, @ifindex) with
+ * @label and insert it into the table under the table lock.
+ *
+ * Returns 0 on success, the error from ip6addrlbl_alloc(), or the error
+ * from __ip6addrlbl_add() (in which case the new entry is freed again).
+ */
+static int ip6addrlbl_add(struct net *net,
+			  const struct in6_addr *prefix, int prefixlen,
+			  int ifindex, u32 label, int replace)
+{
+	struct ip6addrlbl_entry *newp;
+	int ret = 0;
+
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
+		  __func__, prefix, prefixlen, ifindex, (unsigned int)label,
+		  replace);
+
+	newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label);
+	if (IS_ERR(newp))
+		return PTR_ERR(newp);
+	spin_lock(&ip6addrlbl_table.lock);
+	ret = __ip6addrlbl_add(newp, replace);
+	spin_unlock(&ip6addrlbl_table.lock);
+	/* On failure newp was never linked, so it can be freed directly. */
+	if (ret)
+		ip6addrlbl_free(newp);
+	return ret;
+}
+
+/*
+ * Unlink the entry whose prefix, prefix length, namespace and ifindex all
+ * equal the arguments; ip6addrlbl_del() calls this with
+ * ip6addrlbl_table.lock held.  At most one entry is removed.
+ *
+ * Returns 0 if an entry was removed, -ESRCH if none matched.
+ */
+static int __ip6addrlbl_del(struct net *net,
+			    const struct in6_addr *prefix, int prefixlen,
+			    int ifindex)
+{
+	struct ip6addrlbl_entry *p = NULL;
+	struct hlist_node *pos, *n;
+	int ret = -ESRCH;
+
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
+		  __func__, prefix, prefixlen, ifindex);
+
+	hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) {
+		if (p->prefixlen == prefixlen &&
+		    net_eq(ip6addrlbl_net(p), net) &&
+		    p->ifindex == ifindex &&
+		    ipv6_addr_equal(&p->prefix, prefix)) {
+			hlist_del_rcu(&p->list);
+			/* Drop the table's reference; freeing is RCU-deferred. */
+			ip6addrlbl_put(p);
+			ret = 0;
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Remove the rule for (@net, @prefix/@prefixlen, @ifindex).
+ * @prefix is first truncated to @prefixlen bits so it compares equal to
+ * the truncated prefix stored by ip6addrlbl_alloc().
+ *
+ * Returns 0 on success or -ESRCH if no such rule exists.
+ */
+static int ip6addrlbl_del(struct net *net,
+			  const struct in6_addr *prefix, int prefixlen,
+			  int ifindex)
+{
+	struct in6_addr prefix_buf;
+	int ret;
+
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
+		  __func__, prefix, prefixlen, ifindex);
+
+	ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
+	spin_lock(&ip6addrlbl_table.lock);
+	ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex);
+	spin_unlock(&ip6addrlbl_table.lock);
+	return ret;
+}
+
+/*
+ * Per-namespace init: install every rule of ip6addrlbl_init_table into
+ * @net.  All rules are attempted even if one fails.
+ *
+ * Returns 0 if every add succeeded.  Otherwise returns an error from a
+ * failed add; once -ENOMEM has been recorded it is not overwritten by
+ * later errors.
+ */
+static int __net_init ip6addrlbl_net_init(struct net *net)
+{
+	int err = 0;
+	int i;
+
+	ADDRLABEL(KERN_DEBUG "%s()\n", __func__);
+
+	for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
+		const struct ip6addrlbl_init_table *rule =
+			&ip6addrlbl_init_table[i];
+		int ret;
+
+		ret = ip6addrlbl_add(net, rule->prefix, rule->prefixlen,
+				     0, rule->label, 0);
+		/* XXX: should we free all rules when we catch an error? */
+		if (ret && err != -ENOMEM)
+			err = ret;
+	}
+	return err;
+}
+
+/*
+ * Per-namespace exit: unlink every rule owned by @net from the shared
+ * table and drop the table's reference on it.
+ */
+static void __net_exit ip6addrlbl_net_exit(struct net *net)
+{
+	struct ip6addrlbl_entry *p = NULL;
+	struct hlist_node *pos, *n;
+
+	/* Remove all labels belonging to the exiting net */
+	spin_lock(&ip6addrlbl_table.lock);
+	hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) {
+		if (net_eq(ip6addrlbl_net(p), net)) {
+			hlist_del_rcu(&p->list);
+			ip6addrlbl_put(p);
+		}
+	}
+	spin_unlock(&ip6addrlbl_table.lock);
+}
+
+/*
+ * Per-network-namespace hooks of the address label table; registered by
+ * ipv6_addr_label_init().
+ */
+static struct pernet_operations ipv6_addr_label_ops = {
+	.init = ip6addrlbl_net_init,
+	.exit = ip6addrlbl_net_exit,
+};
+
+/*
+ * Initialise the table lock and register the per-namespace hooks.
+ * Returns the result of register_pernet_subsys().
+ */
+int __init ipv6_addr_label_init(void)
+{
+	spin_lock_init(&ip6addrlbl_table.lock);
+
+	return register_pernet_subsys(&ipv6_addr_label_ops);
+}
+
+/* Undo ipv6_addr_label_init() by unregistering the per-namespace hooks. */
+void ipv6_addr_label_cleanup(void)
+{
+	unregister_pernet_subsys(&ipv6_addr_label_ops);
+}
+
+/*
+ * Netlink attribute policy passed to nlmsg_parse() for address label
+ * messages: IFAL_ADDRESS is sized for a struct in6_addr, IFAL_LABEL for
+ * a u32.
+ */
+static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
+	[IFAL_ADDRESS]		= { .len = sizeof(struct in6_addr), },
+	[IFAL_LABEL]		= { .len = sizeof(u32), },
+};
+
+/*
+ * rtnetlink handler for RTM_NEWADDRLABEL and RTM_DELADDRLABEL.
+ *
+ * Validates the request (family AF_INET6, prefix length <= 128, both
+ * IFAL_ADDRESS and IFAL_LABEL present, label not the reserved default) and
+ * then adds or deletes the rule.  Note that IFAL_LABEL is required for
+ * deletion as well, although the label is not used to select the rule.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ifaddrlblmsg *ifal;
+	struct nlattr *tb[IFAL_MAX+1];
+	struct in6_addr *pfx;
+	u32 label;
+	int err = 0;
+
+	err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy);
+	if (err < 0)
+		return err;
+
+	ifal = nlmsg_data(nlh);
+
+	if (ifal->ifal_family != AF_INET6 ||
+	    ifal->ifal_prefixlen > 128)
+		return -EINVAL;
+
+	if (!tb[IFAL_ADDRESS])
+		return -EINVAL;
+
+	pfx = nla_data(tb[IFAL_ADDRESS]);
+	if (!pfx)
+		return -EINVAL;
+
+	if (!tb[IFAL_LABEL])
+		return -EINVAL;
+	label = nla_get_u32(tb[IFAL_LABEL]);
+	/* 0xffffffff is reserved to mean "no policy matched". */
+	if (label == IPV6_ADDR_LABEL_DEFAULT)
+		return -EINVAL;
+
+	switch(nlh->nlmsg_type) {
+	case RTM_NEWADDRLABEL:
+		/* A non-zero interface index must name an existing device. */
+		if (ifal->ifal_index &&
+		    !__dev_get_by_index(net, ifal->ifal_index))
+			return -EINVAL;
+
+		err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen,
+				     ifal->ifal_index, label,
+				     nlh->nlmsg_flags & NLM_F_REPLACE);
+		break;
+	case RTM_DELADDRLABEL:
+		err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen,
+				     ifal->ifal_index);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+	return err;
+}
+
+/*
+ * Fill the struct ifaddrlblmsg header carried in @nlh: family AF_INET6,
+ * flags 0, and the given prefix length, interface index and table
+ * sequence number.
+ */
+static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
+				     int prefixlen, int ifindex, u32 lseq)
+{
+	struct ifaddrlblmsg *ifal = nlmsg_data(nlh);
+	ifal->ifal_family = AF_INET6;
+	ifal->ifal_prefixlen = prefixlen;
+	ifal->ifal_flags = 0;
+	ifal->ifal_index = ifindex;
+	ifal->ifal_seq = lseq;
+}
+
+/*
+ * Append one address label message describing @p to @skb: the
+ * ifaddrlblmsg header followed by the IFAL_ADDRESS (16-byte prefix) and
+ * IFAL_LABEL attributes.
+ *
+ * Returns the value of nlmsg_end() on success, or -EMSGSIZE if the header
+ * or an attribute does not fit (a partially built message is cancelled).
+ */
+static int ip6addrlbl_fill(struct sk_buff *skb,
+			   struct ip6addrlbl_entry *p,
+			   u32 lseq,
+			   u32 pid, u32 seq, int event,
+			   unsigned int flags)
+{
+	struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq, event,
+					 sizeof(struct ifaddrlblmsg), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq);
+
+	if (nla_put(skb, IFAL_ADDRESS, 16, &p->prefix) < 0 ||
+	    nla_put_u32(skb, IFAL_LABEL, p->label) < 0) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	return nlmsg_end(skb, nlh);
+}
+
+/*
+ * Netlink dump callback for RTM_GETADDRLABEL.
+ *
+ * Walks the table under rcu_read_lock() and emits one RTM_NEWADDRLABEL
+ * message per entry of the caller's namespace.  cb->args[0] holds the
+ * position to resume from; the position counts every list entry,
+ * including those of other namespaces.  The walk stops as soon as
+ * ip6addrlbl_fill() returns a value <= 0.
+ *
+ * Returns skb->len.
+ */
+static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct ip6addrlbl_entry *p;
+	struct hlist_node *pos;
+	int idx = 0, s_idx = cb->args[0];
+	int err;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) {
+		if (idx >= s_idx &&
+		    net_eq(ip6addrlbl_net(p), net)) {
+			if ((err = ip6addrlbl_fill(skb, p,
+						   ip6addrlbl_table.seq,
+						   NETLINK_CB(cb->skb).pid,
+						   cb->nlh->nlmsg_seq,
+						   RTM_NEWADDRLABEL,
+						   NLM_F_MULTI)) <= 0)
+				break;
+		}
+		idx++;
+	}
+	rcu_read_unlock();
+	/* Remember where to continue on the next invocation. */
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+/*
+ * Payload size needed for one address label message: the aligned
+ * ifaddrlblmsg header plus the IFAL_ADDRESS and IFAL_LABEL attributes.
+ */
+static inline int ip6addrlbl_msgsize(void)
+{
+	int size = NLMSG_ALIGN(sizeof(struct ifaddrlblmsg));
+
+	size += nla_total_size(16);	/* IFAL_ADDRESS */
+	size += nla_total_size(4);	/* IFAL_LABEL */
+
+	return size;
+}
+
+/*
+ * rtnetlink handler for RTM_GETADDRLABEL (single lookup).
+ *
+ * The request must be for AF_INET6 with prefix length 128, carry an
+ * IFAL_ADDRESS attribute and, if it names an interface, that interface must
+ * exist.  The matching policy entry is looked up under RCU, pinned with a
+ * reference, and returned to the requester as an RTM_NEWADDRLABEL message.
+ *
+ * Returns 0 / the result of rtnl_unicast() on success, -EINVAL for a
+ * malformed request, -ESRCH if no rule matches, -ENOBUFS if the reply
+ * cannot be allocated, or the error from ip6addrlbl_fill().
+ */
+static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
+			  void *arg)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct ifaddrlblmsg *ifal;
+	struct nlattr *tb[IFAL_MAX+1];
+	struct in6_addr *addr;
+	u32 lseq;
+	int err = 0;
+	struct ip6addrlbl_entry *p;
+	struct sk_buff *skb;
+
+	err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy);
+	if (err < 0)
+		return err;
+
+	ifal = nlmsg_data(nlh);
+
+	if (ifal->ifal_family != AF_INET6 ||
+	    ifal->ifal_prefixlen != 128)
+		return -EINVAL;
+
+	if (ifal->ifal_index &&
+	    !__dev_get_by_index(net, ifal->ifal_index))
+		return -EINVAL;
+
+	if (!tb[IFAL_ADDRESS])
+		return -EINVAL;
+
+	addr = nla_data(tb[IFAL_ADDRESS]);
+	if (!addr)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
+	/*
+	 * Pin the entry so it stays valid after rcu_read_unlock().  If the
+	 * hold fails the entry is already being freed: treat it as absent.
+	 */
+	if (p && !ip6addrlbl_hold(p))
+		p = NULL;
+	lseq = ip6addrlbl_table.seq;
+	rcu_read_unlock();
+
+	if (!p) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL))) {
+		ip6addrlbl_put(p);
+		return -ENOBUFS;
+	}
+
+	err = ip6addrlbl_fill(skb, p, lseq,
+			      NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+			      RTM_NEWADDRLABEL, 0);
+
+	/* The message holds copies of the data; the reference can go. */
+	ip6addrlbl_put(p);
+
+	if (err < 0) {
+		/* The skb was sized by ip6addrlbl_msgsize(); must not happen. */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto out;
+	}
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+out:
+	return err;
+}
+
+/*
+ * Register the rtnetlink handlers for the address label messages:
+ * NEW and DEL share ip6addrlbl_newdel(); GET has a single-lookup handler
+ * and a dump handler.  Return values of __rtnl_register() are not checked.
+ */
+void __init ipv6_addr_label_rtnl_register(void)
+{
+	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, NULL);
+	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, NULL);
+	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, ip6addrlbl_dump);
+}
+
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
new file mode 100644
index 00000000..7e8340ef
--- /dev/null
+++ b/net/ipv6/af_inet6.c
@@ -0,0 +1,1349 @@
+/*
+ *	PF_INET6 socket protocol family
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Adapted from linux/net/ipv4/af_inet.c
+ *
+ * 	Fixes:
+ *	piggy, Karl Knutson	:	Socket protocol table
+ * 	Hideaki YOSHIFUJI	:	sin6_scope_id support
+ * 	Arnaldo Melo		: 	check proc_net_create return, cleanups
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/tcp.h>
+#include <net/ipip.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <net/route.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#ifdef CONFIG_IPV6_TUNNEL
+#include <net/ip6_tunnel.h>
+#endif
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/mroute6.h>
+
+/*
+ * current_has_network() - may the current task create network sockets?
+ *
+ * With CONFIG_ANDROID_PARANOID_NETWORK the task must be in the AID_INET
+ * group or have CAP_NET_RAW; without it the answer is always yes.
+ * inet6_create() returns -EACCES when this returns 0.
+ */
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+	return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+	return 1;
+}
+#endif
+
+MODULE_AUTHOR("Cast of dozens");
+MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
+MODULE_LICENSE("GPL");
+
+/* The inetsw6 table contains everything that inet6_create needs to
+ * build a new socket.  It is indexed by socket type; inet6_create()
+ * walks inetsw6[sock->type] under rcu_read_lock().
+ *
+ * NOTE(review): inetsw6_lock presumably serialises updates to the table;
+ * its users are outside this part of the file - confirm.
+ */
+static struct list_head inetsw6[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw6_lock);
+
+/*
+ * Module-wide defaults, settable through the "disable_ipv6" and "autoconf"
+ * module parameters below (read-only in sysfs, mode 0444).
+ */
+struct ipv6_params ipv6_defaults = {
+	.disable_ipv6 = 0,
+	.autoconf = 1,
+};
+
+/* Backing variable of the "disable" module parameter. */
+static int disable_ipv6_mod = 0;
+
+module_param_named(disable, disable_ipv6_mod, int, 0444);
+MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional");
+
+module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444);
+MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces");
+
+module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444);
+MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces");
+
+/*
+ * Locate the struct ipv6_pinfo of @sk: it occupies the last
+ * sizeof(struct ipv6_pinfo) bytes of the protocol's socket object, whose
+ * total size is sk->sk_prot->obj_size.
+ */
+static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
+{
+	const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
+	u8 *base = (u8 *)sk;
+
+	return (struct ipv6_pinfo *)(base + offset);
+}
+
+/*
+ * Create an AF_INET6 socket for @sock in namespace @net.
+ *
+ * Looks up the (sock->type, @protocol) pair in inetsw6[], trying to load a
+ * protocol module (first the type-specific alias, then the generic one)
+ * when nothing matches.  On a match the sock is allocated and the generic,
+ * IPv6 and embedded IPv4 state is initialised before the protocol's own
+ * init hook runs.
+ *
+ * @kern: non-zero for sockets created by the kernel; those skip the
+ *        CAP_NET_RAW check for SOCK_RAW.
+ *
+ * Returns 0 on success, or -EACCES, -ESOCKTNOSUPPORT, -EPROTONOSUPPORT,
+ * -EPERM, -ENOBUFS, or the error returned by the protocol's init hook.
+ */
+static int inet6_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
+{
+	struct inet_sock *inet;
+	struct ipv6_pinfo *np;
+	struct sock *sk;
+	struct inet_protosw *answer;
+	struct proto *answer_prot;
+	unsigned char answer_flags;
+	char answer_no_check;
+	int try_loading_module = 0;
+	int err;
+
+	if (!current_has_network())
+		return -EACCES;
+
+	if (sock->type != SOCK_RAW &&
+	    sock->type != SOCK_DGRAM &&
+	    !inet_ehash_secret)
+		build_ehash_secret();
+
+	/* Look for the requested type/protocol pair. */
+lookup_protocol:
+	err = -ESOCKTNOSUPPORT;
+	rcu_read_lock();
+	list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) {
+
+		err = 0;
+		/* Check the non-wild match. */
+		if (protocol == answer->protocol) {
+			if (protocol != IPPROTO_IP)
+				break;
+		} else {
+			/* Check for the two wild cases. */
+			if (IPPROTO_IP == protocol) {
+				protocol = answer->protocol;
+				break;
+			}
+			if (IPPROTO_IP == answer->protocol)
+				break;
+		}
+		err = -EPROTONOSUPPORT;
+	}
+
+	if (err) {
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-10-proto-132-type-1
+			 * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+						PF_INET6, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g. net-pf-10-proto-132
+			 * (net-pf-PF_INET6-proto-IPPROTO_SCTP)
+			 */
+			else
+				request_module("net-pf-%d-proto-%d",
+						PF_INET6, protocol);
+			goto lookup_protocol;
+		} else
+			goto out_rcu_unlock;
+	}
+
+	err = -EPERM;
+	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+		goto out_rcu_unlock;
+
+	/*
+	 * Copy everything needed out of the protosw entry while still inside
+	 * the RCU read section; 'answer' must not be touched after the
+	 * unlock below.
+	 */
+	sock->ops = answer->ops;
+	answer_prot = answer->prot;
+	answer_no_check = answer->no_check;
+	answer_flags = answer->flags;
+	rcu_read_unlock();
+
+	WARN_ON(answer_prot->slab == NULL);
+
+	err = -ENOBUFS;
+	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot);
+	if (sk == NULL)
+		goto out;
+
+	sock_init_data(sock, sk);
+
+	err = 0;
+	sk->sk_no_check = answer_no_check;
+	if (INET_PROTOSW_REUSE & answer_flags)
+		sk->sk_reuse = 1;
+
+	inet = inet_sk(sk);
+	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+
+	if (SOCK_RAW == sock->type) {
+		inet->inet_num = protocol;
+		if (IPPROTO_RAW == protocol)
+			inet->hdrincl = 1;
+	}
+
+	sk->sk_destruct		= inet_sock_destruct;
+	sk->sk_family		= PF_INET6;
+	sk->sk_protocol		= protocol;
+
+	/* Use the cached proto pointer: we are outside the RCU section. */
+	sk->sk_backlog_rcv	= answer_prot->backlog_rcv;
+
+	inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
+	np->hop_limit	= -1;
+	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
+	np->mc_loop	= 1;
+	np->pmtudisc	= IPV6_PMTUDISC_WANT;
+	np->ipv6only	= net->ipv6.sysctl.bindv6only;
+
+	/* Init the ipv4 part of the socket since we can have sockets
+	 * using v6 API for ipv4.
+	 */
+	inet->uc_ttl	= -1;
+
+	inet->mc_loop	= 1;
+	inet->mc_ttl	= 1;
+	inet->mc_index	= 0;
+	inet->mc_list	= NULL;
+
+	if (ipv4_config.no_pmtu_disc)
+		inet->pmtudisc = IP_PMTUDISC_DONT;
+	else
+		inet->pmtudisc = IP_PMTUDISC_WANT;
+	/*
+	 * Increment only the relevant sk_prot->socks debug field, this changes
+	 * the previous behaviour of incrementing both the equivalent to
+	 * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
+	 *
+	 * This allows better debug granularity as we'll know exactly how many
+	 * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
+	 * transport protocol socks. -acme
+	 */
+	sk_refcnt_debug_inc(sk);
+
+	if (inet->inet_num) {
+		/* It assumes that any protocol which allows
+		 * the user to assign a number at socket
+		 * creation time automatically shares.
+		 */
+		inet->inet_sport = htons(inet->inet_num);
+		sk->sk_prot->hash(sk);
+	}
+	if (sk->sk_prot->init) {
+		err = sk->sk_prot->init(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+out:
+	return err;
+out_rcu_unlock:
+	rcu_read_unlock();
+	goto out;
+}
+
+
+/*
+ * bind for INET6 API
+ *
+ * Bind @sock to the local address/port in @uaddr (a struct sockaddr_in6 of
+ * at least SIN6_LEN_RFC2133 bytes).  If the protocol supplies its own bind
+ * hook, that hook handles the request entirely.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL        short address, multicast address on a stream socket,
+ *                  socket not closed or already bound, v4-mapped address
+ *                  on a v6-only socket, link-local address without an
+ *                  interface
+ *   -EAFNOSUPPORT  sin6_family is not AF_INET6
+ *   -EACCES        port below PROT_SOCK without CAP_NET_BIND_SERVICE
+ *   -EADDRNOTAVAIL address is not local (and no freebind/transparent
+ *                  exemption applies)
+ *   -ENODEV        bound interface index does not exist
+ *   -EADDRINUSE    the protocol's get_port() refused the port
+ */
+int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr;
+	struct sock *sk = sock->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	__be32 v4addr = 0;
+	unsigned short snum;
+	int addr_type = 0;
+	int err = 0;
+
+	/* If the socket has its own bind function then use it. */
+	if (sk->sk_prot->bind)
+		return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	if (addr->sin6_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	addr_type = ipv6_addr_type(&addr->sin6_addr);
+	if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM)
+		return -EINVAL;
+
+	snum = ntohs(addr->sin6_port);
+	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+
+	lock_sock(sk);
+
+	/* Check these errors (active socket, double bind). */
+	if (sk->sk_state != TCP_CLOSE || inet->inet_num) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Check if the address belongs to the host. */
+	if (addr_type == IPV6_ADDR_MAPPED) {
+		int chk_addr_ret;
+
+		/* Binding to v4-mapped address on a v6-only socket
+		 * makes no sense
+		 */
+		if (np->ipv6only) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Reproduce AF_INET checks to make the bindings consistent */
+		v4addr = addr->sin6_addr.s6_addr32[3];
+		chk_addr_ret = inet_addr_type(net, v4addr);
+		if (!sysctl_ip_nonlocal_bind &&
+		    !(inet->freebind || inet->transparent) &&
+		    v4addr != htonl(INADDR_ANY) &&
+		    chk_addr_ret != RTN_LOCAL &&
+		    chk_addr_ret != RTN_MULTICAST &&
+		    chk_addr_ret != RTN_BROADCAST) {
+			err = -EADDRNOTAVAIL;
+			goto out;
+		}
+	} else {
+		if (addr_type != IPV6_ADDR_ANY) {
+			struct net_device *dev = NULL;
+
+			/* dev, if looked up below, is only valid under RCU. */
+			rcu_read_lock();
+			if (addr_type & IPV6_ADDR_LINKLOCAL) {
+				if (addr_len >= sizeof(struct sockaddr_in6) &&
+				    addr->sin6_scope_id) {
+					/* Override any existing binding, if another one
+					 * is supplied by user.
+					 */
+					sk->sk_bound_dev_if = addr->sin6_scope_id;
+				}
+
+				/* Binding to link-local address requires an interface */
+				if (!sk->sk_bound_dev_if) {
+					err = -EINVAL;
+					goto out_unlock;
+				}
+				dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+				if (!dev) {
+					err = -ENODEV;
+					goto out_unlock;
+				}
+			}
+
+			/* ipv4 addr of the socket is invalid.  Only the
+			 * unspecified and mapped address have a v4 equivalent.
+			 */
+			v4addr = LOOPBACK4_IPV6;
+			if (!(addr_type & IPV6_ADDR_MULTICAST))	{
+				if (!inet->transparent &&
+				    !ipv6_chk_addr(net, &addr->sin6_addr,
+						   dev, 0)) {
+					err = -EADDRNOTAVAIL;
+					goto out_unlock;
+				}
+			}
+			rcu_read_unlock();
+		}
+	}
+
+	inet->inet_rcv_saddr = v4addr;
+	inet->inet_saddr = v4addr;
+
+	ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr);
+
+	/* A multicast address is never used as the source address. */
+	if (!(addr_type & IPV6_ADDR_MULTICAST))
+		ipv6_addr_copy(&np->saddr, &addr->sin6_addr);
+
+	/* Make sure we are allowed to bind here. */
+	if (sk->sk_prot->get_port(sk, snum)) {
+		inet_reset_saddr(sk);
+		err = -EADDRINUSE;
+		goto out;
+	}
+
+	if (addr_type != IPV6_ADDR_ANY) {
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+		/* Bound to a real IPv6 address: the socket becomes v6-only. */
+		if (addr_type != IPV6_ADDR_MAPPED)
+			np->ipv6only = 1;
+	}
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	inet->inet_sport = htons(inet->inet_num);
+	inet->inet_dport = 0;
+	inet->inet_daddr = 0;
+out:
+	release_sock(sk);
+	return err;
+out_unlock:
+	rcu_read_unlock();
+	goto out;
+}
+
+EXPORT_SYMBOL(inet6_bind);
+
+/*
+ * Release an AF_INET6 socket.
+ *
+ * Frees the socket's mc and ac lists and then defers to inet_release() for
+ * the rest of the teardown.
+ *
+ * Returns -EINVAL when @sock has no sock attached, otherwise the result of
+ * inet_release().
+ */
+int inet6_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return -EINVAL;
+
+	ipv6_sock_mc_close(sk);		/* mc lists */
+	ipv6_sock_ac_close(sk);		/* ac lists */
+
+	return inet_release(sock);
+}
+EXPORT_SYMBOL(inet6_release);
+
+/*
+ * Free the IPv6-specific state attached to @sk: pending rx option skbs,
+ * flow labels and tx options.
+ *
+ * Every pointer is detached from the ipv6_pinfo with xchg() before the
+ * object is freed, so the field already reads NULL when the free happens.
+ */
+void inet6_destroy_sock(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_txoptions *txopt;
+	struct sk_buff *pending;
+
+	/* Rx options. */
+	pending = xchg(&np->pktoptions, NULL);
+	if (pending != NULL)
+		kfree_skb(pending);
+
+	pending = xchg(&np->rxpmtu, NULL);
+	if (pending != NULL)
+		kfree_skb(pending);
+
+	/* Flow labels. */
+	fl6_free_socklist(sk);
+
+	/* Tx options. */
+	txopt = xchg(&np->opt, NULL);
+	if (txopt != NULL)
+		sock_kfree_s(sk, txopt, txopt->tot_len);
+}
+EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+
+/*
+ *	Report either the peer name (@peer != 0) or the socket name
+ *	(@peer == 0) of an AF_INET6 socket in @uaddr.
+ *
+ *	For the peer name, -ENOTCONN is returned when no destination port is
+ *	set, or when @peer == 1 and the socket is in the CLOSE or SYN_SENT
+ *	state.  For the socket name, np->saddr is reported when rcv_saddr is
+ *	the unspecified address.  Link-local addresses get the bound device
+ *	index as scope id.  *@uaddr_len is set to sizeof(struct sockaddr_in6)
+ *	on success.
+ */
+
+int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
+		 int *uaddr_len, int peer)
+{
+	struct sock *sk = sock->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+	struct in6_addr *local;
+
+	sin->sin6_family = AF_INET6;
+	sin->sin6_flowinfo = 0;
+	sin->sin6_scope_id = 0;
+
+	if (!peer) {
+		/* Local name: fall back to saddr for a wildcard rcv_saddr. */
+		local = ipv6_addr_any(&np->rcv_saddr) ? &np->saddr
+						      : &np->rcv_saddr;
+		ipv6_addr_copy(&sin->sin6_addr, local);
+		sin->sin6_port = inet->inet_sport;
+	} else {
+		if (!inet->inet_dport)
+			return -ENOTCONN;
+		if (peer == 1 &&
+		    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
+			return -ENOTCONN;
+
+		sin->sin6_port = inet->inet_dport;
+		ipv6_addr_copy(&sin->sin6_addr, &np->daddr);
+		if (np->sndflow)
+			sin->sin6_flowinfo = np->flow_label;
+	}
+
+	if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		sin->sin6_scope_id = sk->sk_bound_dev_if;
+
+	*uaddr_len = sizeof(*sin);
+	return 0;
+}
+EXPORT_SYMBOL(inet6_getname);
+
+/*
+ * SIOCKILLADDR handler: copy a struct in6_ifreq from user space and pass its
+ * address to tcp_nuke_addr() wrapped in an AF_INET6 sockaddr.
+ *
+ * Returns -EACCES without CAP_NET_ADMIN, -EFAULT when the user copy fails,
+ * otherwise the result of tcp_nuke_addr().  Only the family and address
+ * fields of the sockaddr are filled in.
+ */
+int inet6_killaddr_ioctl(struct net *net, void __user *arg)
+{
+	struct sockaddr_in6 target;
+	struct in6_ifreq req;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EACCES;
+
+	if (copy_from_user(&req, arg, sizeof(req)))
+		return -EFAULT;
+
+	target.sin6_family = AF_INET6;
+	ipv6_addr_copy(&target.sin6_addr, &req.ifr6_addr);
+
+	return tcp_nuke_addr(net, (struct sockaddr *)&target);
+}
+
+/*
+ * ioctl entry point for AF_INET6 sockets.
+ *
+ * Timestamp, route, address and SIOCKILLADDR requests are dispatched here;
+ * any other command goes to the protocol's own ioctl handler, or fails with
+ * -ENOIOCTLCMD when the protocol has none.
+ */
+int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	void __user *uarg = (void __user *)arg;
+
+	switch (cmd) {
+	case SIOCGSTAMP:
+		return sock_get_timestamp(sk, (struct timeval __user *)arg);
+	case SIOCGSTAMPNS:
+		return sock_get_timestampns(sk, (struct timespec __user *)arg);
+	case SIOCADDRT:
+	case SIOCDELRT:
+		return ipv6_route_ioctl(net, cmd, uarg);
+	case SIOCSIFADDR:
+		return addrconf_add_ifaddr(net, uarg);
+	case SIOCDIFADDR:
+		return addrconf_del_ifaddr(net, uarg);
+	case SIOCSIFDSTADDR:
+		return addrconf_set_dstaddr(net, uarg);
+	case SIOCKILLADDR:
+		return inet6_killaddr_ioctl(net, uarg);
+	default:
+		break;
+	}
+
+	/* Not handled at the family level: try the protocol. */
+	if (!sk->sk_prot->ioctl)
+		return -ENOIOCTLCMD;
+
+	return sk->sk_prot->ioctl(sk, cmd, arg);
+}
+EXPORT_SYMBOL(inet6_ioctl);
+
+/*
+ * Socket-layer operations for PF_INET6 stream sockets.
+ *
+ * Only release, bind, getname and ioctl point at inet6_* handlers; the other
+ * entries point at inet_*, tcp_* and sock_* helpers.  The compat socket
+ * option handlers are only present with CONFIG_COMPAT.
+ */
+const struct proto_ops inet6_stream_ops = {
+	.family		   = PF_INET6,
+	.owner		   = THIS_MODULE,
+	.release	   = inet6_release,
+	.bind		   = inet6_bind,
+	.connect	   = inet_stream_connect,	/* ok		*/
+	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
+	.accept		   = inet_accept,		/* ok		*/
+	.getname	   = inet6_getname,
+	.poll		   = tcp_poll,			/* ok		*/
+	.ioctl		   = inet6_ioctl,		/* must change  */
+	.listen		   = inet_listen,		/* ok		*/
+	.shutdown	   = inet_shutdown,		/* ok		*/
+	.setsockopt	   = sock_common_setsockopt,	/* ok		*/
+	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
+	.sendmsg	   = inet_sendmsg,		/* ok		*/
+	.recvmsg	   = inet_recvmsg,		/* ok		*/
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = inet_sendpage,
+	.splice_read	   = tcp_splice_read,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/*
+ * Socket-layer operations for PF_INET6 datagram sockets.
+ *
+ * Compared with inet6_stream_ops, accept, listen and sendpage are the
+ * sock_no_* stubs, connect and poll use the datagram/UDP variants, and there
+ * is no splice_read entry.
+ */
+const struct proto_ops inet6_dgram_ops = {
+	.family		   = PF_INET6,
+	.owner		   = THIS_MODULE,
+	.release	   = inet6_release,
+	.bind		   = inet6_bind,
+	.connect	   = inet_dgram_connect,	/* ok		*/
+	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
+	.accept		   = sock_no_accept,		/* a do nothing	*/
+	.getname	   = inet6_getname,
+	.poll		   = udp_poll,			/* ok		*/
+	.ioctl		   = inet6_ioctl,		/* must change  */
+	.listen		   = sock_no_listen,		/* ok		*/
+	.shutdown	   = inet_shutdown,		/* ok		*/
+	.setsockopt	   = sock_common_setsockopt,	/* ok		*/
+	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
+	.sendmsg	   = inet_sendmsg,		/* ok		*/
+	.recvmsg	   = inet_recvmsg,		/* ok		*/
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/*
+ * Protocol family descriptor for PF_INET6; inet6_init() hands it to
+ * sock_register() so that inet6_create() is used to create the sockets.
+ */
+static const struct net_proto_family inet6_family_ops = {
+	.family = PF_INET6,
+	.create = inet6_create,
+	.owner	= THIS_MODULE,
+};
+
+/*
+ * Add @p to the inetsw6 list for its socket type.
+ *
+ * The entry is inserted right after the last permanent entry of that type
+ * (or at the head when there is none), so it can never shadow a permanent
+ * entry on a wild-card protocol match but does take precedence over existing
+ * non-permanent entries.  Removing it later restores the previous behaviour.
+ *
+ * Returns 0 on success, -EINVAL for a socket type >= SOCK_MAX and -EPERM
+ * when a permanent entry already exists for the same protocol number.
+ */
+int inet6_register_protosw(struct inet_protosw *p)
+{
+	struct inet_protosw *entry;
+	struct list_head *insert_after;
+	int protocol = p->protocol;
+	int ret;
+
+	spin_lock_bh(&inetsw6_lock);
+
+	if (p->type >= SOCK_MAX) {
+		printk(KERN_ERR
+		       "Ignoring attempt to register invalid socket type %d.\n",
+		       p->type);
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/* Only permanent entries matter: they may not be overridden. */
+	insert_after = &inetsw6[p->type];
+	list_for_each_entry(entry, &inetsw6[p->type], list) {
+		if (!(entry->flags & INET_PROTOSW_PERMANENT))
+			continue;
+
+		if (entry->protocol == protocol) {
+			printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+			       protocol);
+			ret = -EPERM;
+			goto unlock;
+		}
+
+		insert_after = &entry->list;
+	}
+
+	list_add_rcu(&p->list, insert_after);
+	ret = 0;
+
+unlock:
+	spin_unlock_bh(&inetsw6_lock);
+	return ret;
+}
+EXPORT_SYMBOL(inet6_register_protosw);
+
+/*
+ * Remove @p from the inetsw6 list.  Permanent entries are refused with an
+ * error message.  After the RCU list removal, synchronize_net() is called
+ * before returning.
+ */
+void
+inet6_unregister_protosw(struct inet_protosw *p)
+{
+	if (p->flags & INET_PROTOSW_PERMANENT) {
+		printk(KERN_ERR
+		       "Attempt to unregister permanent protocol %d.\n",
+		       p->protocol);
+		return;
+	}
+
+	spin_lock_bh(&inetsw6_lock);
+	list_del_rcu(&p->list);
+	spin_unlock_bh(&inetsw6_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(inet6_unregister_protosw);
+
+/*
+ * Make sure @sk has a usable cached route.
+ *
+ * When __sk_dst_check() still returns a dst for the socket's cookie nothing
+ * is done.  Otherwise a flow is built from the socket's addresses, ports,
+ * flow label, bound device and mark, a new route is looked up and stored on
+ * the socket.
+ *
+ * Returns 0 on success.  On lookup failure the error is returned, the
+ * socket's route capabilities are cleared and sk_err_soft is set.
+ */
+int inet6_sk_rebuild_header(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct in6_addr final, *final_p;
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+
+	if (__sk_dst_check(sk, np->dst_cookie) != NULL)
+		return 0;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = sk->sk_protocol;
+	fl6.flowi6_oif = sk->sk_bound_dev_if;
+	fl6.flowi6_mark = sk->sk_mark;
+	fl6.flowlabel = np->flow_label;
+	ipv6_addr_copy(&fl6.daddr, &np->daddr);
+	ipv6_addr_copy(&fl6.saddr, &np->saddr);
+	fl6.fl6_dport = inet->inet_dport;
+	fl6.fl6_sport = inet->inet_sport;
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+	if (IS_ERR(dst)) {
+		sk->sk_route_caps = 0;
+		sk->sk_err_soft = -PTR_ERR(dst);
+		return PTR_ERR(dst);
+	}
+
+	__ip6_dst_store(sk, dst, NULL, NULL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
+
+/*
+ * Tell whether @skb carries anything the socket asked to receive as an
+ * option: hop-by-hop options, flow information, a routing header or
+ * destination options, each checked against the matching rxopt bits.
+ *
+ * Returns 1 if so, 0 otherwise (including when no rxopt bit is set at all).
+ */
+int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	if (!np->rxopt.all)
+		return 0;
+
+	if (opt->hop &&
+	    (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts))
+		return 1;
+
+	/* Flow info lives in the first 32-bit word of the IPv6 header. */
+	if ((IPV6_FLOWINFO_MASK & *(__be32 *)skb_network_header(skb)) &&
+	    np->rxopt.bits.rxflow)
+		return 1;
+
+	if (opt->srcrt &&
+	    (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt))
+		return 1;
+
+	if ((opt->dst1 || opt->dst0) &&
+	    (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
+
+/*
+ * Pull leading IPv6 extension headers off @skb.
+ *
+ * Starting with next-header value @proto, each header is pulled as long as
+ * it is hop-by-hop or its registered inet6_protocol carries
+ * INET6_PROTO_GSO_EXTHDR.  The walk stops at the first header that does not
+ * qualify or that cannot be made linear.
+ *
+ * Uses rcu_dereference() on inet6_protos[], so the caller is expected to be
+ * inside an RCU read-side critical section.
+ *
+ * Returns the next-header value of the first header that was not pulled.
+ */
+static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
+{
+	const struct inet6_protocol *ops = NULL;
+
+	for (;;) {
+		struct ipv6_opt_hdr *opth;
+		int len;
+
+		if (proto != NEXTHDR_HOP) {
+			ops = rcu_dereference(inet6_protos[proto]);
+
+			if (unlikely(!ops))
+				break;
+
+			if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+				break;
+		}
+
+		/* The first 8 bytes are enough to read the header length. */
+		if (unlikely(!pskb_may_pull(skb, 8)))
+			break;
+
+		opth = (void *)skb->data;
+		len = ipv6_optlen(opth);
+
+		if (unlikely(!pskb_may_pull(skb, len)))
+			break;
+
+		/*
+		 * pskb_may_pull() may have reallocated the skb head, which
+		 * would leave the earlier opth dangling: reload it before
+		 * reading the next-header field.
+		 */
+		opth = (void *)skb->data;
+		proto = opth->nexthdr;
+		__skb_pull(skb, len);
+	}
+
+	return proto;
+}
+
+/*
+ * GSO send-check hook for IPv6: pull the base header and any GSO-capable
+ * extension headers, then call the transport protocol's gso_send_check.
+ *
+ * Returns -EINVAL when the base header cannot be pulled,
+ * -EPROTONOSUPPORT when the transport protocol has no gso_send_check hook,
+ * otherwise the hook's result.
+ */
+static int ipv6_gso_send_check(struct sk_buff *skb)
+{
+	const struct inet6_protocol *ops;
+	const struct ipv6hdr *ipv6h;
+	int err = -EPROTONOSUPPORT;
+	int proto;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
+		return -EINVAL;
+
+	ipv6h = ipv6_hdr(skb);
+	__skb_pull(skb, sizeof(*ipv6h));
+
+	rcu_read_lock();
+	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
+	ops = rcu_dereference(inet6_protos[proto]);
+	if (likely(ops && ops->gso_send_check)) {
+		skb_reset_transport_header(skb);
+		err = ops->gso_send_check(skb);
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * GSO segmentation hook for IPv6.
+ *
+ * Pulls the base header and any GSO-capable extension headers, lets the
+ * transport protocol's gso_segment hook split the packet, then fixes up
+ * payload_len in every resulting segment.  For UDP the fragment header of
+ * each segment also gets its offset and, for all but the last segment, the
+ * more-fragments flag.
+ *
+ * Returns the segment list, or an ERR_PTR: -EINVAL for an unsupported
+ * gso_type or an unpullable base header, -EPROTONOSUPPORT when the
+ * transport protocol has no gso_segment hook, or the hook's own error.
+ */
+static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct ipv6hdr *ipv6h;
+	const struct inet6_protocol *ops;
+	int proto;
+	struct frag_hdr *fptr;
+	unsigned int unfrag_ip6hlen;
+	u8 *prevhdr;
+	int offset = 0;
+
+	/* No IPv6 checksum feature: do not segment with scatter-gather. */
+	if (!(features & NETIF_F_V6_CSUM))
+		features &= ~NETIF_F_SG;
+
+	if (unlikely(skb_shinfo(skb)->gso_type &
+		     ~(SKB_GSO_UDP |
+		       SKB_GSO_DODGY |
+		       SKB_GSO_TCP_ECN |
+		       SKB_GSO_TCPV6 |
+		       0)))
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
+		goto out;
+
+	ipv6h = ipv6_hdr(skb);
+	__skb_pull(skb, sizeof(*ipv6h));
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	/*
+	 * ipv6_gso_pull_exthdrs() does rcu_dereference() on inet6_protos[],
+	 * so it has to run inside the read-side critical section, as it does
+	 * in ipv6_gso_send_check().
+	 */
+	rcu_read_lock();
+	proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr);
+	ops = rcu_dereference(inet6_protos[proto]);
+	if (likely(ops && ops->gso_segment)) {
+		skb_reset_transport_header(skb);
+		segs = ops->gso_segment(skb, features);
+	}
+	rcu_read_unlock();
+
+	if (IS_ERR(segs))
+		goto out;
+
+	for (skb = segs; skb; skb = skb->next) {
+		ipv6h = ipv6_hdr(skb);
+		ipv6h->payload_len = htons(skb->len - skb->mac_len -
+					   sizeof(*ipv6h));
+		if (proto == IPPROTO_UDP) {
+			unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+			fptr = (struct frag_hdr *)(skb_network_header(skb) +
+				unfrag_ip6hlen);
+			fptr->frag_off = htons(offset);
+			if (skb->next != NULL)
+				fptr->frag_off |= htons(IP6_MF);
+			/* Next offset: this payload minus the frag header. */
+			offset += (ntohs(ipv6h->payload_len) -
+				   sizeof(struct frag_hdr));
+		}
+	}
+
+out:
+	return segs;
+}
+
+/*
+ * IPv6 view of the skb control buffer during GRO.  The generic napi_gro_cb
+ * comes first, followed by the transport protocol number that
+ * ipv6_gro_receive() stores and ipv6_gro_complete() reads back.
+ */
+struct ipv6_gro_cb {
+	struct napi_gro_cb napi;
+	int proto;
+};
+
+/* Interpret skb->cb as a struct ipv6_gro_cb. */
+#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb)
+
+/*
+ * GRO receive hook for IPv6.
+ *
+ * Locates the IPv6 header at the current GRO offset, finds the transport
+ * protocol's gro_receive hook (skipping extension headers when the first
+ * next-header has no such hook), marks every held packet in @head whose
+ * network headers differ as belonging to another flow, and then hands the
+ * packet to the transport hook.
+ *
+ * Returns whatever the transport hook returns, or NULL when the header is
+ * not available or no hook exists; in those cases the skb's flush flag is
+ * set.
+ */
+static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	const struct inet6_protocol *ops;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	struct ipv6hdr *iph;
+	unsigned int nlen;
+	unsigned int hlen;
+	unsigned int off;
+	int flush = 1;
+	int proto;
+	__wsum csum;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+	iph = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		iph = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!iph))
+			goto out;
+	}
+
+	skb_gro_pull(skb, sizeof(*iph));
+	skb_set_transport_header(skb, skb_gro_offset(skb));
+
+	/* A payload_len that disagrees with the GRO length forces a flush. */
+	flush += ntohs(iph->payload_len) != skb_gro_len(skb);
+
+	rcu_read_lock();
+	proto = iph->nexthdr;
+	ops = rcu_dereference(inet6_protos[proto]);
+	if (!ops || !ops->gro_receive) {
+		/*
+		 * No GRO hook for the first next-header: skip over any
+		 * extension headers and re-anchor the transport header and
+		 * the GRO offset behind them.
+		 */
+		__pskb_pull(skb, skb_gro_offset(skb));
+		proto = ipv6_gso_pull_exthdrs(skb, proto);
+		skb_gro_pull(skb, -skb_transport_offset(skb));
+		skb_reset_transport_header(skb);
+		__skb_push(skb, skb_gro_offset(skb));
+
+		/*
+		 * proto now names the header behind the extension headers,
+		 * so the hook has to be looked up again.  Re-testing the old
+		 * ops would always bail out here.
+		 */
+		ops = rcu_dereference(inet6_protos[proto]);
+		if (!ops || !ops->gro_receive)
+			goto out_unlock;
+
+		/* The pulls above may have moved the header data. */
+		iph = ipv6_hdr(skb);
+	}
+
+	/* Remembered for ipv6_gro_complete(). */
+	IPV6_GRO_CB(skb)->proto = proto;
+
+	flush--;
+	nlen = skb_network_header_len(skb);
+
+	for (p = *head; p; p = p->next) {
+		struct ipv6hdr *iph2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		iph2 = ipv6_hdr(p);
+
+		/* All fields must match except length. */
+		if (nlen != skb_network_header_len(p) ||
+		    memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) ||
+		    memcmp(&iph->nexthdr, &iph2->nexthdr,
+			   nlen - offsetof(struct ipv6hdr, nexthdr))) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		NAPI_GRO_CB(p)->flush |= flush;
+	}
+
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	/*
+	 * The transport hook sees skb->csum with the network headers
+	 * removed; the original value is restored afterwards.
+	 */
+	csum = skb->csum;
+	skb_postpull_rcsum(skb, iph, skb_network_header_len(skb));
+
+	pp = ops->gro_receive(head, skb);
+
+	skb->csum = csum;
+
+out_unlock:
+	rcu_read_unlock();
+
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+/*
+ * GRO completion hook for IPv6: rewrite payload_len for the merged packet
+ * and call the gro_complete hook of the protocol recorded by
+ * ipv6_gro_receive().
+ *
+ * Returns the hook's result, or -ENOSYS (with a warning) when the protocol
+ * has no gro_complete hook.
+ */
+static int ipv6_gro_complete(struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct inet6_protocol *ops;
+	int err;
+
+	iph->payload_len = htons(skb->len - skb_network_offset(skb) -
+				 sizeof(*iph));
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]);
+	if (WARN_ON(!ops || !ops->gro_complete))
+		err = -ENOSYS;
+	else
+		err = ops->gro_complete(skb);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * Packet handler for ETH_P_IPV6 frames: ipv6_rcv() is the receive function,
+ * and the GSO/GRO callbacks are the ipv6_gso_* and ipv6_gro_* functions.
+ */
+static struct packet_type ipv6_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_IPV6),
+	.func = ipv6_rcv,
+	.gso_send_check = ipv6_gso_send_check,
+	.gso_segment = ipv6_gso_segment,
+	.gro_receive = ipv6_gro_receive,
+	.gro_complete = ipv6_gro_complete,
+};
+
+/* Register the IPv6 packet handler.  Always returns 0. */
+static int __init ipv6_packet_init(void)
+{
+	dev_add_pack(&ipv6_packet_type);
+	return 0;
+}
+
+/*
+ * Unregister the IPv6 packet handler.  Called both from inet6_exit() and
+ * from the error unwinding in inet6_init().
+ */
+static void ipv6_packet_cleanup(void)
+{
+	dev_remove_pack(&ipv6_packet_type);
+}
+
+/*
+ * Allocate the per-namespace IPv6 SNMP counters: UDP, UDP-Lite, IPv6,
+ * ICMPv6 and ICMPv6 message statistics.
+ *
+ * Returns 0 on success.  On failure everything allocated so far is freed
+ * again and -ENOMEM is returned.
+ */
+static int __net_init ipv6_init_mibs(struct net *net)
+{
+	if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
+		return -ENOMEM;
+
+	if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
+		goto free_udp;
+
+	if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics,
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
+		goto free_udplite;
+
+	if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics,
+			  sizeof(struct icmpv6_mib),
+			  __alignof__(struct icmpv6_mib)) < 0)
+		goto free_ipv6;
+
+	if (snmp_mib_init((void __percpu **)net->mib.icmpv6msg_statistics,
+			  sizeof(struct icmpv6msg_mib),
+			  __alignof__(struct icmpv6msg_mib)) < 0)
+		goto free_icmpv6;
+
+	return 0;
+
+	/* Each label frees the counter set named in it, then falls through. */
+free_icmpv6:
+	snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics);
+free_ipv6:
+	snmp_mib_free((void __percpu **)net->mib.ipv6_statistics);
+free_udplite:
+	snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6);
+free_udp:
+	snmp_mib_free((void __percpu **)net->mib.udp_stats_in6);
+	return -ENOMEM;
+}
+
+/*
+ * Free every counter set allocated by ipv6_init_mibs(), in the reverse
+ * order of their allocation.
+ */
+static void ipv6_cleanup_mibs(struct net *net)
+{
+	snmp_mib_free((void __percpu **)net->mib.icmpv6msg_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ipv6_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6);
+	snmp_mib_free((void __percpu **)net->mib.udp_stats_in6);
+}
+
+/*
+ * Per-namespace initialisation: set the bindv6only and icmpv6_time sysctl
+ * defaults, allocate the SNMP counters and, with CONFIG_PROC_FS, create the
+ * udp6, tcp6 and ac6 proc entries.
+ *
+ * Returns 0 on success or the error of the failing step; anything set up
+ * before the failure is undone again.
+ */
+static int __net_init inet6_net_init(struct net *net)
+{
+	int err = 0;
+
+	net->ipv6.sysctl.bindv6only = 0;
+	net->ipv6.sysctl.icmpv6_time = 1*HZ;
+
+	err = ipv6_init_mibs(net);
+	if (err)
+		return err;
+#ifdef CONFIG_PROC_FS
+	err = udp6_proc_init(net);
+	if (err)
+		goto out;
+	err = tcp6_proc_init(net);
+	if (err)
+		goto proc_tcp6_fail;
+	err = ac6_proc_init(net);
+	if (err)
+		goto proc_ac6_fail;
+#endif
+	return err;
+
+	/* Unwind in reverse order; only reachable with CONFIG_PROC_FS. */
+#ifdef CONFIG_PROC_FS
+proc_ac6_fail:
+	tcp6_proc_exit(net);
+proc_tcp6_fail:
+	udp6_proc_exit(net);
+out:
+	ipv6_cleanup_mibs(net);
+	return err;
+#endif
+}
+
+/*
+ * Per-namespace teardown: remove the proc entries created by
+ * inet6_net_init() (with CONFIG_PROC_FS) and free the SNMP counters.
+ */
+static void __net_exit inet6_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	udp6_proc_exit(net);
+	tcp6_proc_exit(net);
+	ac6_proc_exit(net);
+#endif
+	ipv6_cleanup_mibs(net);
+}
+
+/* Per-namespace init/exit hooks, registered by inet6_init(). */
+static struct pernet_operations inet6_net_ops = {
+	.init = inet6_net_init,
+	.exit = inet6_net_exit,
+};
+
+/*
+ * Module initialisation for IPv6.
+ *
+ * Initialises the inetsw6 lists, then (unless IPv6 is administratively
+ * disabled) registers the protos, the raw socket layer, the socket family,
+ * sysctls, per-namespace operations and every IPv6 sub-component in turn.
+ *
+ * Returns 0 on success, or when disable_ipv6_mod is set, in which case
+ * only the list heads are initialised.  On failure the error of the failing
+ * step is returned after everything registered before it has been torn
+ * down again in reverse order through the label ladder at the end.
+ */
+static int __init inet6_init(void)
+{
+	struct sk_buff *dummy_skb;
+	struct list_head *r;
+	int err = 0;
+
+	/* inet6_skb_parm is stored in skb->cb and therefore must fit. */
+	BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
+
+	/* Register the socket-side information for inet6_create.  */
+	for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	/* err is still 0 here, so a disabled module loads successfully. */
+	if (disable_ipv6_mod) {
+		printk(KERN_INFO
+		       "IPv6: Loaded, but administratively disabled, "
+		       "reboot required to enable\n");
+		goto out;
+	}
+
+	initialize_hashidentrnd();
+
+	err = proto_register(&tcpv6_prot, 1);
+	if (err)
+		goto out;
+
+	err = proto_register(&udpv6_prot, 1);
+	if (err)
+		goto out_unregister_tcp_proto;
+
+	err = proto_register(&udplitev6_prot, 1);
+	if (err)
+		goto out_unregister_udp_proto;
+
+	err = proto_register(&rawv6_prot, 1);
+	if (err)
+		goto out_unregister_udplite_proto;
+
+
+	/* We MUST register RAW sockets before we create the ICMP6,
+	 * IGMP6, or NDISC control sockets.
+	 */
+	err = rawv6_init();
+	if (err)
+		goto out_unregister_raw_proto;
+
+	/* Register the family here so that the init calls below will
+	 * be able to create sockets. (?? is this dangerous ??)
+	 */
+	err = sock_register(&inet6_family_ops);
+	if (err)
+		goto out_sock_register_fail;
+
+#ifdef CONFIG_SYSCTL
+	err = ipv6_static_sysctl_register();
+	if (err)
+		goto static_sysctl_fail;
+#endif
+	/*
+	 *	ipngwg API draft makes clear that the correct semantics
+	 *	for TCP and UDP is to consider one TCP and UDP instance
+	 *	in a host available by both INET and INET6 APIs and
+	 *	able to communicate via both network protocols.
+	 */
+
+	err = register_pernet_subsys(&inet6_net_ops);
+	if (err)
+		goto register_pernet_fail;
+	err = icmpv6_init();
+	if (err)
+		goto icmp_fail;
+	err = ip6_mr_init();
+	if (err)
+		goto ipmr_fail;
+	err = ndisc_init();
+	if (err)
+		goto ndisc_fail;
+	err = igmp6_init();
+	if (err)
+		goto igmp_fail;
+	err = ipv6_netfilter_init();
+	if (err)
+		goto netfilter_fail;
+	/* Create /proc/foo6 entries. */
+#ifdef CONFIG_PROC_FS
+	/* These helpers' results are only tested for non-zero; report -ENOMEM. */
+	err = -ENOMEM;
+	if (raw6_proc_init())
+		goto proc_raw6_fail;
+	if (udplite6_proc_init())
+		goto proc_udplite6_fail;
+	if (ipv6_misc_proc_init())
+		goto proc_misc6_fail;
+	if (if6_proc_init())
+		goto proc_if6_fail;
+#endif
+	err = ip6_route_init();
+	if (err)
+		goto ip6_route_fail;
+	err = ip6_flowlabel_init();
+	if (err)
+		goto ip6_flowlabel_fail;
+	err = addrconf_init();
+	if (err)
+		goto addrconf_fail;
+
+	/* Init v6 extension headers. */
+	err = ipv6_exthdrs_init();
+	if (err)
+		goto ipv6_exthdrs_fail;
+
+	err = ipv6_frag_init();
+	if (err)
+		goto ipv6_frag_fail;
+
+	/* Init v6 transport protocols. */
+	err = udpv6_init();
+	if (err)
+		goto udpv6_fail;
+
+	err = udplitev6_init();
+	if (err)
+		goto udplitev6_fail;
+
+	err = tcpv6_init();
+	if (err)
+		goto tcpv6_fail;
+
+	err = ipv6_packet_init();
+	if (err)
+		goto ipv6_packet_fail;
+
+#ifdef CONFIG_SYSCTL
+	err = ipv6_sysctl_register();
+	if (err)
+		goto sysctl_fail;
+#endif
+out:
+	return err;
+
+	/*
+	 * Error unwinding.  Each label is named after the step that failed
+	 * and undoes the steps that succeeded before it; control falls
+	 * through the remaining labels down to the proto unregistration.
+	 */
+#ifdef CONFIG_SYSCTL
+sysctl_fail:
+	ipv6_packet_cleanup();
+#endif
+ipv6_packet_fail:
+	tcpv6_exit();
+tcpv6_fail:
+	udplitev6_exit();
+udplitev6_fail:
+	udpv6_exit();
+udpv6_fail:
+	ipv6_frag_exit();
+ipv6_frag_fail:
+	ipv6_exthdrs_exit();
+ipv6_exthdrs_fail:
+	addrconf_cleanup();
+addrconf_fail:
+	ip6_flowlabel_cleanup();
+ip6_flowlabel_fail:
+	ip6_route_cleanup();
+ip6_route_fail:
+#ifdef CONFIG_PROC_FS
+	if6_proc_exit();
+proc_if6_fail:
+	ipv6_misc_proc_exit();
+proc_misc6_fail:
+	udplite6_proc_exit();
+proc_udplite6_fail:
+	raw6_proc_exit();
+proc_raw6_fail:
+#endif
+	ipv6_netfilter_fini();
+netfilter_fail:
+	igmp6_cleanup();
+igmp_fail:
+	ndisc_cleanup();
+ndisc_fail:
+	ip6_mr_cleanup();
+ipmr_fail:
+	icmpv6_cleanup();
+icmp_fail:
+	unregister_pernet_subsys(&inet6_net_ops);
+register_pernet_fail:
+#ifdef CONFIG_SYSCTL
+	ipv6_static_sysctl_unregister();
+static_sysctl_fail:
+#endif
+	sock_unregister(PF_INET6);
+	rtnl_unregister_all(PF_INET6);
+out_sock_register_fail:
+	rawv6_exit();
+out_unregister_raw_proto:
+	proto_unregister(&rawv6_prot);
+out_unregister_udplite_proto:
+	proto_unregister(&udplitev6_prot);
+out_unregister_udp_proto:
+	proto_unregister(&udpv6_prot);
+out_unregister_tcp_proto:
+	proto_unregister(&tcpv6_prot);
+	goto out;
+}
+module_init(inet6_init);
+
+/*
+ * Module exit for IPv6.
+ *
+ * Does nothing when the module was loaded with IPv6 disabled, since
+ * inet6_init() registered nothing in that case.  Otherwise the socket
+ * family and rtnetlink handlers are removed first, followed by the
+ * sub-components that inet6_init() set up, and finally the protos.
+ */
+static void __exit inet6_exit(void)
+{
+	if (disable_ipv6_mod)
+		return;
+
+	/* First of all disallow new sockets creation. */
+	sock_unregister(PF_INET6);
+	/* Disallow any further netlink messages */
+	rtnl_unregister_all(PF_INET6);
+
+#ifdef CONFIG_SYSCTL
+	ipv6_sysctl_unregister();
+#endif
+	udpv6_exit();
+	udplitev6_exit();
+	tcpv6_exit();
+
+	/* Cleanup code parts. */
+	ipv6_packet_cleanup();
+	ipv6_frag_exit();
+	ipv6_exthdrs_exit();
+	addrconf_cleanup();
+	ip6_flowlabel_cleanup();
+	ip6_route_cleanup();
+#ifdef CONFIG_PROC_FS
+
+	/* Cleanup code parts. */
+	if6_proc_exit();
+	ipv6_misc_proc_exit();
+	udplite6_proc_exit();
+	raw6_proc_exit();
+#endif
+	ipv6_netfilter_fini();
+	igmp6_cleanup();
+	ndisc_cleanup();
+	ip6_mr_cleanup();
+	icmpv6_cleanup();
+	rawv6_exit();
+
+	unregister_pernet_subsys(&inet6_net_ops);
+#ifdef CONFIG_SYSCTL
+	ipv6_static_sysctl_unregister();
+#endif
+	proto_unregister(&rawv6_prot);
+	proto_unregister(&udplitev6_prot);
+	proto_unregister(&udpv6_prot);
+	proto_unregister(&tcpv6_prot);
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+module_exit(inet6_exit);
+
+MODULE_ALIAS_NETPROTO(PF_INET6);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
new file mode 100644
index 00000000..4c0f894d
--- /dev/null
+++ b/net/ipv6/ah6.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (C)2002 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Authors
+ *
+ *	Mitsuru KANDA @USAGI       : IPv6 Support
+ * 	Kazunori MIYAZAWA @USAGI   :
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *
+ * 	This file is derived from net/ipv4/ah.c.
+ */
+
+#include <crypto/hash.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <net/icmp.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+#define IPV6HDR_BASELEN 8
+
+/*
+ * Saved copy of the part of the IPv6 header that follows the first
+ * IPV6HDR_BASELEN bytes, kept while the header is modified for the ICV
+ * computation.  The source address is only saved with Mobile IPv6 support
+ * configured.  The extension headers are copied in right behind the
+ * addresses, which is why the layout has to mirror the packet's.
+ */
+struct tmp_ext {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		struct in6_addr saddr;
+#endif
+		struct in6_addr daddr;
+		char hdrs[0];
+};
+
+/*
+ * AH view of the skb control buffer: the xfrm control block followed by a
+ * pointer to the scratch buffer allocated with ah_alloc_tmp().
+ */
+struct ah_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+
+/* Interpret skb->cb as a struct ah_skb_cb. */
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate one scratch buffer for an AH operation.
+ *
+ * The buffer has room, in this order, for @size bytes of saved header data,
+ * the digest (with slack for the hash's alignment mask), the ahash request
+ * including its per-transform context, and @nfrags scatterlist entries.
+ * The ah_tmp_*() and ah_req_sg() helpers compute pointers into it.
+ *
+ * Allocated with GFP_ATOMIC; returns NULL on failure.
+ */
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+			  unsigned int size)
+{
+	unsigned int len = size;
+
+	/* Saved data and ICV, padded for the ICV's alignment. */
+	len += crypto_ahash_digestsize(ahash);
+	len += crypto_ahash_alignmask(ahash) &
+	       ~(crypto_tfm_ctx_alignment() - 1);
+	len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+	/* Hash request plus transform-specific context. */
+	len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	/* Scatterlist entries. */
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+
+/*
+ * Return the tmp_ext area of a scratch buffer, which starts
+ * IPV6HDR_BASELEN bytes after @base.
+ */
+static inline struct tmp_ext *ah_tmp_ext(void *base)
+{
+	return base + IPV6HDR_BASELEN;
+}
+
+/* Return the address @offset bytes into the scratch buffer @tmp. */
+static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset)
+{
+	return tmp + offset;
+}
+
+/*
+ * Return the ICV location in a scratch buffer: @offset bytes past @tmp,
+ * rounded up to the hash's alignment (alignmask + 1).
+ */
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+			     unsigned int offset)
+{
+	return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+/*
+ * Return the ahash request stored behind the ICV in a scratch buffer.  The
+ * request sits at the first suitably aligned address after the digest and
+ * is bound to @ahash before it is returned.
+ */
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+					       u8 *icv)
+{
+	u8 *icv_end = icv + crypto_ahash_digestsize(ahash);
+	struct ahash_request *req;
+
+	req = (void *)PTR_ALIGN(icv_end, crypto_tfm_ctx_alignment());
+	ahash_request_set_tfm(req, ahash);
+
+	return req;
+}
+
+/*
+ * Return the scatterlist array of a scratch buffer: the first address after
+ * @req and its per-transform context that is aligned for a scatterlist.
+ */
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+					     struct ahash_request *req)
+{
+	unsigned long req_end = (unsigned long)(req + 1) +
+				crypto_ahash_reqsize(ahash);
+
+	return (void *)ALIGN(req_end, __alignof__(struct scatterlist));
+}
+
+/*
+ * Zero the data of every mutable option (option type bit 0x20 set) in a
+ * hop-by-hop or destination options header.
+ *
+ * Returns 1 when the TLVs fill the header exactly, 0 when an option runs
+ * past the end of the header.
+ */
+static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr)
+{
+	/* The TLVs start behind the two-byte nexthdr/hdrlen prefix. */
+	u8 *tlv = (u8 *)opthdr + 2;
+	int remaining = ipv6_optlen(opthdr) - 2;
+
+	while (remaining > 0) {
+		int tlv_len;
+
+		if (tlv[0] == IPV6_TLV_PAD0) {
+			/* Pad0 is a lone type byte without a length field. */
+			tlv_len = 1;
+		} else {
+			if (remaining < 2)
+				return 0;
+			tlv_len = tlv[1] + 2;
+			if (remaining < tlv_len)
+				return 0;
+			if (tlv[0] & 0x20)
+				memset(tlv + 2, 0, tlv[1]);
+		}
+
+		tlv += tlv_len;
+		remaining -= tlv_len;
+	}
+
+	return remaining == 0;
+}
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+/**
+ *	ipv6_rearrange_destopt - rearrange IPv6 destination options header
+ *	@iph: IPv6 header
+ *	@destopt: destination options header
+ *
+ *	Walks the options in @destopt and, for each home address option,
+ *	swaps its address with the source address in @iph.  The walk stops
+ *	silently at a malformed option; a home address option with an
+ *	unexpected length is additionally reported (rate limited).
+ *	Without Mobile IPv6 support this function is an empty stub.
+ */
+static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt)
+{
+	/* The TLVs start behind the two-byte nexthdr/hdrlen prefix. */
+	u8 *tlv = (u8 *)destopt + 2;
+	int remaining = ipv6_optlen(destopt) - 2;
+
+	while (remaining > 0) {
+		int tlv_len;
+
+		if (tlv[0] == IPV6_TLV_PAD0) {
+			tlv_len = 1;
+		} else {
+			if (remaining < 2)
+				return;
+			tlv_len = tlv[1] + 2;
+			if (remaining < tlv_len)
+				return;
+
+			/* Rearrange the source address in @iph and the
+			 * addresses in home address option for final source.
+			 * See 11.3.2 of RFC 3775 for details.
+			 */
+			if (tlv[0] == IPV6_TLV_HAO) {
+				struct ipv6_destopt_hao *hao;
+				struct in6_addr home;
+
+				hao = (struct ipv6_destopt_hao *)tlv;
+				if (hao->length != sizeof(hao->addr)) {
+					if (net_ratelimit())
+						printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length);
+					return;
+				}
+				ipv6_addr_copy(&home, &hao->addr);
+				ipv6_addr_copy(&hao->addr, &iph->saddr);
+				ipv6_addr_copy(&iph->saddr, &home);
+			}
+		}
+
+		tlv += tlv_len;
+		remaining -= tlv_len;
+	}
+}
+#else
+static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {}
+#endif
+
+/**
+ *	ipv6_rearrange_rthdr - rearrange IPv6 routing header
+ *	@iph: IPv6 header
+ *	@rthdr: routing header
+ *
+ *	Put the destination address in @iph and the addresses in @rthdr into
+ *	the order they will have at the final destination, and set the
+ *	segments-left field to zero.  Nothing is changed when no segments
+ *	are left.  See Appendix A2 of RFC 2402 for details.
+ */
+static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr)
+{
+	struct in6_addr *addrs = ((struct rt0_hdr *)rthdr)->addr;
+	struct in6_addr *pending;
+	struct in6_addr last;
+	int left = rthdr->segments_left;
+	int total;
+
+	if (!left)
+		return;
+	rthdr->segments_left = 0;
+
+	/* The value of rthdr->hdrlen has been verified either by the system
+	 * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming
+	 * packets.  So we can assume that it is even and that the number of
+	 * segments is greater than or equal to the segments left.
+	 *
+	 * For the same reason we can assume that this option is of type 0.
+	 */
+	total = rthdr->hdrlen >> 1;
+
+	/* The last listed address becomes the header's destination. */
+	ipv6_addr_copy(&last, addrs + total - 1);
+
+	/* Shift the unvisited addresses up by one slot and put the current
+	 * destination into the slot that frees up.
+	 */
+	pending = addrs + (total - left);
+	memmove(pending + 1, pending, (left - 1) * sizeof(*pending));
+
+	ipv6_addr_copy(pending, &iph->daddr);
+	ipv6_addr_copy(&iph->daddr, &last);
+}
+
+/*
+ * Bring the extension headers that follow @iph into the form used for the
+ * ICV computation.
+ *
+ * @len is the length of the base header plus extension headers and bounds
+ * the walk.  Hop-by-hop and destination options headers get their mutable
+ * options zeroed; destination options are additionally rearranged when
+ * @dir is XFRM_POLICY_OUT; routing headers are rearranged.  The walk ends at
+ * the first header of any other type.
+ *
+ * Returns 0 on success, -EINVAL when an options header is malformed.
+ */
+static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
+{
+	/* One cursor, viewed as whichever header type is being handled. */
+	union {
+		struct ipv6hdr *iph;
+		struct ipv6_opt_hdr *opth;
+		struct ipv6_rt_hdr *rth;
+		char *raw;
+	} exthdr = { .iph = iph };
+	char *end = exthdr.raw + len;
+	int nexthdr = iph->nexthdr;
+
+	/* Step over the base header to the first extension header. */
+	exthdr.iph++;
+
+	while (exthdr.raw < end) {
+		switch (nexthdr) {
+		case NEXTHDR_DEST:
+			if (dir == XFRM_POLICY_OUT)
+				ipv6_rearrange_destopt(iph, exthdr.opth);
+			/* fall through: mutable options are zeroed as well */
+		case NEXTHDR_HOP:
+			if (!zero_out_mutable_opts(exthdr.opth)) {
+				LIMIT_NETDEBUG(
+					KERN_WARNING "overrun %sopts\n",
+					nexthdr == NEXTHDR_HOP ?
+						"hop" : "dest");
+				return -EINVAL;
+			}
+			break;
+
+		case NEXTHDR_ROUTING:
+			ipv6_rearrange_rthdr(iph, exthdr.rth);
+			break;
+
+		default :
+			return 0;
+		}
+
+		/* Advance to the next header in the chain. */
+		nexthdr = exthdr.opth->nexthdr;
+		exthdr.raw += ipv6_optlen(exthdr.opth);
+	}
+
+	return 0;
+}
+
+/*
+ * Completion callback for the asynchronous hash started on output.
+ *
+ * Copies the truncated ICV into the AH header, restores the header bytes
+ * (and, if present, the addresses and extension headers) that were saved in
+ * the scratch buffer, frees that buffer and resumes xfrm output with @err.
+ */
+static void ah6_output_done(struct crypto_async_request *base, int err)
+{
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	struct ah_data *ahp = x->data;
+	struct ipv6hdr *top_iph = ipv6_hdr(skb);
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	u8 *saved = AH_SKB_CB(skb)->tmp;
+	struct tmp_ext *saved_ext = ah_tmp_ext(saved);
+	int extlen;
+	u8 *icv;
+
+	/* Same layout computation as on the way in. */
+	extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
+	if (extlen)
+		extlen += sizeof(*saved_ext);
+
+	icv = ah_tmp_icv(ahp->ahash, saved_ext, extlen);
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+	memcpy(top_iph, saved, IPV6HDR_BASELEN);
+
+	if (extlen) {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		memcpy(&top_iph->saddr, saved_ext, extlen);
+#else
+		memcpy(&top_iph->daddr, saved_ext, extlen);
+#endif
+	}
+
+	kfree(saved);
+	xfrm_output_resume(skb, err);
+}
+
+/*
+ * Fill in the AH header of an outbound packet and compute its ICV.
+ *
+ * @x:   xfrm state; x->data is the struct ah_data set up at init time
+ * @skb: packet to authenticate
+ *
+ * The mutable parts of the IPv6 header and its extension headers are saved
+ * in a scratch buffer, cleared for the digest, and restored afterwards.
+ *
+ * Returns 0 on success, -EINPROGRESS when the digest completes later (the
+ * completion callback ah6_output_done() then finishes the packet),
+ * NET_XMIT_DROP when the digest reports -EBUSY, or a negative errno.
+ */
+static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	int nfrags;
+	int extlen;
+	u8 *iph_base;
+	u8 *icv;
+	u8 nexthdr;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct ipv6hdr *top_iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+	struct tmp_ext *iph_ext;
+
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	/* A non-negative return is used below as the scatterlist entry count. */
+	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+	nfrags = err;
+
+	/* NOTE(review): presumably moves skb->data back to the network header. */
+	skb_push(skb, -skb_network_offset(skb));
+	/*
+	 * Bytes to save besides the base header: the extension headers plus
+	 * a struct tmp_ext, or nothing when there are no extension headers.
+	 */
+	extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr);
+	if (extlen)
+		extlen += sizeof(*iph_ext);
+
+	err = -ENOMEM;
+	iph_base = ah_alloc_tmp(ahash, nfrags, IPV6HDR_BASELEN + extlen);
+	if (!iph_base)
+		goto out;
+
+	/* Carve the scratch buffer into saved headers, ICV, request and sg. */
+	iph_ext = ah_tmp_ext(iph_base);
+	icv = ah_tmp_icv(ahash, iph_ext, extlen);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	/* The ICV field is hashed as zeroes. */
+	ah = ip_auth_hdr(skb);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	top_iph = ipv6_hdr(skb);
+	top_iph->payload_len = htons(skb->len - sizeof(*top_iph));
+
+	/*
+	 * The byte at skb_mac_header() is read as the next-header value and
+	 * replaced by IPPROTO_AH.
+	 * NOTE(review): presumably the caller points mac_header at the
+	 * next-header field to patch - confirm.
+	 */
+	nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_AH;
+
+	/* When there are no extension headers, we only need to save the first
+	 * 8 bytes of the base IP header.
+	 */
+	memcpy(iph_base, top_iph, IPV6HDR_BASELEN);
+
+	if (extlen) {
+		/* With MIP6 the saved region starts at saddr, else at daddr. */
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		memcpy(iph_ext, &top_iph->saddr, extlen);
+#else
+		memcpy(iph_ext, &top_iph->daddr, extlen);
+#endif
+		/* Length passed is base header plus extension headers only. */
+		err = ipv6_clear_mutable_options(top_iph,
+						 extlen - sizeof(*iph_ext) +
+						 sizeof(*top_iph),
+						 XFRM_POLICY_OUT);
+		if (err)
+			goto out_free;
+	}
+
+	ah->nexthdr = nexthdr;
+
+	/* These base header fields are zeroed for the digest. */
+	top_iph->priority    = 0;
+	top_iph->flow_lbl[0] = 0;
+	top_iph->flow_lbl[1] = 0;
+	top_iph->flow_lbl[2] = 0;
+	top_iph->hop_limit   = 0;
+
+	/* Header length in 32-bit words, minus 2. */
+	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+	ah->reserved = 0;
+	ah->spi = x->id.spi;
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah6_output_done, skb);
+
+	/* The completion callback finds the scratch buffer through the cb. */
+	AH_SKB_CB(skb)->tmp = iph_base;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		/* Async case: the callback owns and frees the scratch buffer. */
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
+	}
+
+	/* Synchronous completion: store the ICV and restore saved headers. */
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+	memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
+
+	if (extlen) {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		memcpy(&top_iph->saddr, iph_ext, extlen);
+#else
+		memcpy(&top_iph->daddr, iph_ext, extlen);
+#endif
+	}
+
+out_free:
+	kfree(iph_base);
+out:
+	return err;
+}
+
+/*
+ * Asynchronous completion callback for the inbound ICV digest.
+ *
+ * @base: completed crypto request; base->data is the received skb
+ * @err:  incoming value is overwritten: the result handed to
+ *	  xfrm_input_resume() is -EBADMSG on ICV mismatch, otherwise the
+ *	  AH next-header value
+ *
+ * On a match the AH header is removed by moving the saved headers forward
+ * by the AH length. The scratch buffer is freed in both cases.
+ */
+static void ah6_input_done(struct crypto_async_request *base, int err)
+{
+	u8 *auth_data;
+	u8 *icv;
+	u8 *work_iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ah_data *ahp = x->data;
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int hdr_len = skb_network_header_len(skb);
+	int ah_hlen = (ah->hdrlen + 2) << 2;
+
+	/* Saved headers, received ICV and computed ICV in the scratch buffer. */
+	work_iph = AH_SKB_CB(skb)->tmp;
+	auth_data = ah_tmp_auth(work_iph, hdr_len);
+	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out;
+
+	/* Read nexthdr before the copy below writes over the AH header. */
+	err = ah->nexthdr;
+
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, hdr_len);
+	__skb_pull(skb, ah_hlen + hdr_len);
+	skb_set_transport_header(skb, -hdr_len);
+out:
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_input_resume(skb, err);
+}
+
+
+
+/*
+ * Verify the ICV of a received AH packet and remove the AH header.
+ *
+ * @x:   xfrm state; x->data is the struct ah_data set up at init time
+ * @skb: received packet, with skb->data at the AH header
+ *
+ * Returns the AH next-header value on success, -EINPROGRESS when the digest
+ * completes later (ah6_input_done() then finishes the packet), -EBADMSG on
+ * ICV mismatch, or another negative errno on failure.
+ */
+static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	/*
+	 * Before process AH
+	 * [IPv6][Ext1][Ext2][AH][Dest][Payload]
+	 * |<-------------->| hdr_len
+	 *
+	 * To erase AH:
+	 * Keeping copy of cleared headers. After AH processing,
+	 * Moving the pointer of skb->network_header by using skb_pull as long
+	 * as AH header length. Then copy back the copy as long as hdr_len
+	 * If destination header following AH exists, copy it into after [Ext2].
+	 *
+	 * |<>|[IPv6][Ext1][Ext2][Dest][Payload]
+	 * There is offset of AH before IPv6 header after the process.
+	 */
+
+	u8 *auth_data;
+	u8 *icv;
+	u8 *work_iph;
+	struct sk_buff *trailer;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct scatterlist *sg;
+	struct ip_auth_hdr *ah;
+	struct ipv6hdr *ip6h;
+	struct ah_data *ahp;
+	u16 hdr_len;
+	u16 ah_hlen;
+	int nexthdr;
+	int nfrags;
+	int err = -ENOMEM;
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+		goto out;
+
+	/* We are going to _remove_ AH header to keep sockets happy,
+	 * so... Later this can change. */
+	if (skb_cloned(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		goto out;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	hdr_len = skb_network_header_len(skb);
+	ah = (struct ip_auth_hdr *)skb->data;
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	nexthdr = ah->nexthdr;
+	ah_hlen = (ah->hdrlen + 2) << 2;
+
+	/* The AH length must match either the full or the truncated ICV. */
+	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+		goto out;
+
+	if (!pskb_may_pull(skb, ah_hlen))
+		goto out;
+
+
+	/* A non-negative return is used below as the scatterlist entry count. */
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+	nfrags = err;
+
+	/* Re-read the pointers: the calls above may have moved the data. */
+	ah = (struct ip_auth_hdr *)skb->data;
+	ip6h = ipv6_hdr(skb);
+
+	skb_push(skb, hdr_len);
+
+	work_iph = ah_alloc_tmp(ahash, nfrags, hdr_len + ahp->icv_trunc_len);
+	if (!work_iph) {
+		/*
+		 * err still holds the positive fragment count here; replace
+		 * it so the failure is not returned as a next-header value.
+		 */
+		err = -ENOMEM;
+		goto out;
+	}
+
+	auth_data = ah_tmp_auth(work_iph, hdr_len);
+	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	/* Save the headers and received ICV, then hash with the ICV zeroed. */
+	memcpy(work_iph, ip6h, hdr_len);
+	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	/* Propagate the real error instead of the stale fragment count. */
+	err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN);
+	if (err)
+		goto out_free;
+
+	ip6h->priority    = 0;
+	ip6h->flow_lbl[0] = 0;
+	ip6h->flow_lbl[1] = 0;
+	ip6h->flow_lbl[2] = 0;
+	ip6h->hop_limit   = 0;
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah6_input_done, skb);
+
+	/* The completion callback finds the scratch buffer through the cb. */
+	AH_SKB_CB(skb)->tmp = work_iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		/* Async case: the callback owns and frees the scratch buffer. */
+		if (err == -EINPROGRESS)
+			goto out;
+
+		goto out_free;
+	}
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out_free;
+
+	/* Slide the saved headers forward over the AH header. */
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, hdr_len);
+	skb->transport_header = skb->network_header;
+	__skb_pull(skb, ah_hlen + hdr_len);
+
+	err = nexthdr;
+
+out_free:
+	kfree(work_iph);
+out:
+	return err;
+}
+
+/*
+ * ICMPv6 error handler for AH.
+ *
+ * Only ICMPV6_DEST_UNREACH and ICMPV6_PKT_TOOBIG are considered. When an
+ * xfrm state matches the destination address and SPI found in the packet,
+ * a debug message is logged and the state reference is dropped again.
+ * @opt, @code and @info are not used.
+ */
+static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		    u8 type, u8 code, int offset, __be32 info)
+{
+	struct ipv6hdr *ip6 = (struct ipv6hdr *)skb->data;
+	struct ip_auth_hdr *auth = (struct ip_auth_hdr *)(skb->data + offset);
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_state *state;
+
+	switch (type) {
+	case ICMPV6_DEST_UNREACH:
+	case ICMPV6_PKT_TOOBIG:
+		break;
+	default:
+		return;
+	}
+
+	state = xfrm_state_lookup(net, skb->mark,
+				  (xfrm_address_t *)&ip6->daddr, auth->spi,
+				  IPPROTO_AH, AF_INET6);
+	if (state == NULL)
+		return;
+
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n",
+		 ntohl(auth->spi), &ip6->daddr);
+
+	xfrm_state_put(state);
+}
+
+/*
+ * Set up the AH private data for xfrm state @x: allocate struct ah_data,
+ * allocate and key the ahash transform named by x->aalg, record the ICV
+ * lengths and compute the header length for the state's mode.
+ *
+ * Returns 0 on success, -ENOMEM when struct ah_data cannot be allocated,
+ * and -EINVAL for every other failure.
+ */
+static int ah6_init_state(struct xfrm_state *x)
+{
+	struct ah_data *ahp = NULL;
+	struct xfrm_algo_desc *aalg_desc;
+	struct crypto_ahash *ahash;
+
+	/* An authentication algorithm is required. */
+	if (!x->aalg)
+		goto error;
+
+	/* States with an encapsulation configured are rejected. */
+	if (x->encap)
+		goto error;
+
+	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+	if (ahp == NULL)
+		return -ENOMEM;
+
+	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+	if (IS_ERR(ahash))
+		goto error;
+
+	ahp->ahash = ahash;
+	/* Key length is rounded up to whole bytes (presumably from bits). */
+	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+			       (x->aalg->alg_key_len + 7) / 8))
+		goto error;
+
+	/*
+	 * Lookup the algorithm description maintained by xfrm_algo,
+	 * verify crypto transform properties, and store information
+	 * we need for AH processing.  This lookup cannot fail here
+	 * after a successful crypto_alloc_ahash().
+	 */
+	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+	BUG_ON(!aalg_desc);
+
+	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+	    crypto_ahash_digestsize(ahash)) {
+		printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
+		       x->aalg->alg_name, crypto_ahash_digestsize(ahash),
+		       aalg_desc->uinfo.auth.icv_fullbits/8);
+		goto error;
+	}
+
+	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
+
+	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+					  ahp->icv_trunc_len);
+	switch (x->props.mode) {
+	case XFRM_MODE_BEET:
+	case XFRM_MODE_TRANSPORT:
+		break;
+	case XFRM_MODE_TUNNEL:
+		/* Tunnel mode also accounts for an IPv6 header. */
+		x->props.header_len += sizeof(struct ipv6hdr);
+		break;
+	default:
+		goto error;
+	}
+	x->data = ahp;
+
+	return 0;
+
+error:
+	/* ahp is still NULL when the checks before the allocation failed. */
+	if (ahp) {
+		crypto_free_ahash(ahp->ahash);
+		kfree(ahp);
+	}
+	return -EINVAL;
+}
+
+/*
+ * Release the AH private data attached to @x, if there is any: free the
+ * ahash transform, then the struct ah_data itself.
+ */
+static void ah6_destroy(struct xfrm_state *x)
+{
+	struct ah_data *data = x->data;
+
+	if (data) {
+		crypto_free_ahash(data->ahash);
+		kfree(data);
+	}
+}
+
+/*
+ * xfrm type descriptor for IPv6 AH; registered for AF_INET6 in ah6_init()
+ * and unregistered in ah6_fini().
+ */
+static const struct xfrm_type ah6_type =
+{
+	.description	= "AH6",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_AH,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= ah6_init_state,
+	.destructor	= ah6_destroy,
+	.input		= ah6_input,
+	.output		= ah6_output,
+	.hdr_offset	= xfrm6_find_1stfragopt,
+};
+
+/*
+ * IPv6 protocol handler entry added for IPPROTO_AH in ah6_init(): packets
+ * go to xfrm6_rcv() and errors to ah6_err().
+ */
+static const struct inet6_protocol ah6_protocol = {
+	.handler	=	xfrm6_rcv,
+	.err_handler	=	ah6_err,
+	.flags		=	INET6_PROTO_NOPOLICY,
+};
+
+/*
+ * Module init: register the AH xfrm type, then the IPPROTO_AH protocol
+ * handler. If the second step fails the first is undone.
+ *
+ * Returns 0 on success and -EAGAIN when either registration fails.
+ */
+static int __init ah6_init(void)
+{
+	int rc;
+
+	rc = xfrm_register_type(&ah6_type, AF_INET6);
+	if (rc < 0) {
+		printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+
+	rc = inet6_add_protocol(&ah6_protocol, IPPROTO_AH);
+	if (rc >= 0)
+		return 0;
+
+	printk(KERN_INFO "ipv6 ah init: can't add protocol\n");
+	xfrm_unregister_type(&ah6_type, AF_INET6);
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: remove the protocol handler first, then the xfrm type.
+ * A failure of either step is only logged.
+ */
+static void __exit ah6_fini(void)
+{
+	int rc;
+
+	rc = inet6_del_protocol(&ah6_protocol, IPPROTO_AH);
+	if (rc < 0)
+		printk(KERN_INFO "ipv6 ah close: can't remove protocol\n");
+
+	rc = xfrm_unregister_type(&ah6_type, AF_INET6);
+	if (rc < 0)
+		printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n");
+}
+
+/* Module entry and exit points. */
+module_init(ah6_init);
+module_exit(ah6_fini);
+
+MODULE_LICENSE("GPL");
+/* Module alias for the (AF_INET6, XFRM_PROTO_AH) xfrm type. */
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
new file mode 100644
index 00000000..674255f5
--- /dev/null
+++ b/net/ipv6/anycast.c
@@ -0,0 +1,549 @@
+/*
+ *	Anycast support for IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	David L Stevens (dlstevens@us.ibm.com)
+ *
+ *	based heavily on net/ipv6/mcast.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/route.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/if_inet6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+
+#include <net/checksum.h>
+
+/* Forward declaration: used by the socket-level code before its definition. */
+static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
+
+/*
+ * Big ac list lock for all the sockets: one global rwlock taken around
+ * every access to a socket's ipv6_ac_list in this file.
+ */
+static DEFINE_RWLOCK(ipv6_sk_ac_lock);
+
+
+/*
+ *	socket join an anycast group
+ *
+ *	@sk:      socket that joins; on success a membership entry is linked
+ *	          onto its ipv6_ac_list
+ *	@ifindex: interface to join on, or 0 to pick one from a route lookup
+ *	          for @addr
+ *	@addr:    anycast address; rejected when it is multicast or when
+ *	          ipv6_chk_addr() reports it
+ *
+ *	The caller needs CAP_NET_ADMIN.
+ *	Returns 0 on success or a negative errno.
+ */
+
+int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net_device *dev = NULL;
+	struct inet6_dev *idev;
+	struct ipv6_ac_socklist *pac;
+	struct net *net = sock_net(sk);
+	/* "host" means forwarding is off; refined per device further down. */
+	int	ishost = !net->ipv6.devconf_all->forwarding;
+	int	err = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (ipv6_addr_is_multicast(addr))
+		return -EINVAL;
+	if (ipv6_chk_addr(net, addr, NULL, 0))
+		return -EINVAL;
+
+	/* Allocate the membership entry before entering the RCU section. */
+	pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
+	if (pac == NULL)
+		return -ENOMEM;
+	pac->acl_next = NULL;
+	ipv6_addr_copy(&pac->acl_addr, addr);
+
+	/* dev is only looked up with the _rcu helpers, so stay under RCU. */
+	rcu_read_lock();
+	if (ifindex == 0) {
+		struct rt6_info *rt;
+
+		rt = rt6_lookup(net, addr, NULL, 0, 0);
+		if (rt) {
+			dev = rt->rt6i_dev;
+			dst_release(&rt->dst);
+		} else if (ishost) {
+			err = -EADDRNOTAVAIL;
+			goto error;
+		} else {
+			/* router, no matching interface: just pick one */
+			dev = dev_get_by_flags_rcu(net, IFF_UP,
+						   IFF_UP | IFF_LOOPBACK);
+		}
+	} else
+		dev = dev_get_by_index_rcu(net, ifindex);
+
+	if (dev == NULL) {
+		err = -ENODEV;
+		goto error;
+	}
+
+	idev = __in6_dev_get(dev);
+	if (!idev) {
+		if (ifindex)
+			err = -ENODEV;
+		else
+			err = -EADDRNOTAVAIL;
+		goto error;
+	}
+	/* reset ishost, now that we have a specific device */
+	ishost = !idev->cnf.forwarding;
+
+	pac->acl_ifindex = dev->ifindex;
+
+	/* XXX
+	 * For hosts, allow link-local or matching prefix anycasts.
+	 * This obviates the need for propagating anycast routes while
+	 * still allowing some non-router anycast participation.
+	 */
+	if (!ipv6_chk_prefix(addr, dev)) {
+		if (ishost)
+			err = -EADDRNOTAVAIL;
+		if (err)
+			goto error;
+	}
+
+	err = ipv6_dev_ac_inc(dev, addr);
+	if (!err) {
+		/* Link the entry at the head of the socket's list. */
+		write_lock_bh(&ipv6_sk_ac_lock);
+		pac->acl_next = np->ipv6_ac_list;
+		np->ipv6_ac_list = pac;
+		write_unlock_bh(&ipv6_sk_ac_lock);
+		/* The list owns the entry now; do not free it below. */
+		pac = NULL;
+	}
+
+error:
+	rcu_read_unlock();
+	if (pac)
+		sock_kfree_s(sk, pac, sizeof(*pac));
+	return err;
+}
+
+/*
+ *	socket leave an anycast group
+ *
+ *	Unlinks the first entry on the socket's list that matches @addr (and
+ *	@ifindex, unless @ifindex is 0), drops the device-level use count if
+ *	the interface still exists, and frees the entry.
+ *
+ *	Returns 0 on success and -ENOENT when no entry matches.
+ */
+int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ipv6_ac_socklist **link;
+	struct ipv6_ac_socklist *pac;
+	struct net_device *dev;
+
+	write_lock_bh(&ipv6_sk_ac_lock);
+	/* Walk the list through the link that points at each entry. */
+	for (link = &np->ipv6_ac_list; (pac = *link) != NULL;
+	     link = &pac->acl_next) {
+		if (ifindex != 0 && pac->acl_ifindex != ifindex)
+			continue;
+		if (ipv6_addr_equal(&pac->acl_addr, addr))
+			break;
+	}
+	if (pac == NULL) {
+		write_unlock_bh(&ipv6_sk_ac_lock);
+		return -ENOENT;
+	}
+	*link = pac->acl_next;
+	write_unlock_bh(&ipv6_sk_ac_lock);
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
+	if (dev != NULL)
+		ipv6_dev_ac_dec(dev, &pac->acl_addr);
+	rcu_read_unlock();
+
+	sock_kfree_s(sk, pac, sizeof(*pac));
+	return 0;
+}
+
+/*
+ * Drop every anycast membership of @sk.
+ *
+ * The whole list is detached from the socket under the lock, then each
+ * entry has its device-level use count dropped (when the interface still
+ * exists) and is freed.
+ */
+void ipv6_sock_ac_close(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	struct net_device *dev = NULL;
+	struct ipv6_ac_socklist *entry, *next;
+	int	cached_ifindex = 0;
+
+	write_lock_bh(&ipv6_sk_ac_lock);
+	entry = np->ipv6_ac_list;
+	np->ipv6_ac_list = NULL;
+	write_unlock_bh(&ipv6_sk_ac_lock);
+
+	rcu_read_lock();
+	for (; entry; entry = next) {
+		next = entry->acl_next;
+
+		/* Look the device up again only when the ifindex changes. */
+		if (entry->acl_ifindex != cached_ifindex) {
+			cached_ifindex = entry->acl_ifindex;
+			dev = dev_get_by_index_rcu(net, cached_ifindex);
+		}
+		if (dev)
+			ipv6_dev_ac_dec(dev, &entry->acl_addr);
+		sock_kfree_s(sk, entry, sizeof(*entry));
+	}
+	rcu_read_unlock();
+}
+
+#if 0
+/* The function is not used, which is funny. Apparently, author
+ * supposed to use it to filter out datagrams inside udp/raw but forgot.
+ *
+ * It is OK, anycasts are not special comparing to delivery to unicasts.
+ *
+ * Compiled out. Returns non-zero when @sk has a membership for @addr;
+ * a non-zero @ifindex restricts the search to entries on that interface.
+ */
+
+int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex)
+{
+	struct ipv6_ac_socklist *pac;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	int	found;
+
+	found = 0;
+	read_lock(&ipv6_sk_ac_lock);
+	for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) {
+		if (ifindex && pac->acl_ifindex != ifindex)
+			continue;
+		found = ipv6_addr_equal(&pac->acl_addr, addr);
+		if (found)
+			break;
+	}
+	read_unlock(&ipv6_sk_ac_lock);
+
+	return found;
+}
+
+#endif
+
+/*
+ * Drop one reference on @ac. The final put releases the inet6_dev and
+ * route references held by the entry and frees it.
+ */
+static void aca_put(struct ifacaddr6 *ac)
+{
+	if (!atomic_dec_and_test(&ac->aca_refcnt))
+		return;
+
+	in6_dev_put(ac->aca_idev);
+	dst_release(&ac->aca_rt->dst);
+	kfree(ac);
+}
+
+/*
+ *	device anycast group inc (add if not found)
+ *
+ *	If @addr is already on the device's anycast list only its user count
+ *	is raised. Otherwise a new entry with its own route is created,
+ *	linked at the head of the list, the route is inserted and the
+ *	solicited-node group for the address is joined.
+ *
+ *	Returns 0 on success, -EINVAL when @dev has no inet6_dev, -ENODEV
+ *	when the inet6_dev is marked dead, -ENOMEM on allocation failure, or
+ *	the error from addrconf_dst_alloc().
+ */
+int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+	struct ifacaddr6 *aca;
+	struct inet6_dev *idev;
+	struct rt6_info *rt;
+	int err;
+
+	/* Takes a reference; dropped at "out" or kept by the new entry. */
+	idev = in6_dev_get(dev);
+
+	if (idev == NULL)
+		return -EINVAL;
+
+	write_lock_bh(&idev->lock);
+	if (idev->dead) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+		if (ipv6_addr_equal(&aca->aca_addr, addr)) {
+			aca->aca_users++;
+			err = 0;
+			goto out;
+		}
+	}
+
+	/*
+	 *	not found: create a new one.
+	 */
+
+	/* GFP_ATOMIC: idev->lock is held with bottom halves disabled. */
+	aca = kzalloc(sizeof(struct ifacaddr6), GFP_ATOMIC);
+
+	if (aca == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rt = addrconf_dst_alloc(idev, addr, 1);
+	if (IS_ERR(rt)) {
+		kfree(aca);
+		err = PTR_ERR(rt);
+		goto out;
+	}
+
+	ipv6_addr_copy(&aca->aca_addr, addr);
+	/* The entry keeps the idev reference; aca_put() releases it. */
+	aca->aca_idev = idev;
+	aca->aca_rt = rt;
+	aca->aca_users = 1;
+	/* aca_tstamp should be updated upon changes */
+	aca->aca_cstamp = aca->aca_tstamp = jiffies;
+	/* Two references: one is dropped by the aca_put() further down. */
+	atomic_set(&aca->aca_refcnt, 2);
+	spin_lock_init(&aca->aca_lock);
+
+	aca->aca_next = idev->ac_list;
+	idev->ac_list = aca;
+	write_unlock_bh(&idev->lock);
+
+	/* Route insertion and the group join happen without idev->lock. */
+	ip6_ins_rt(rt);
+
+	addrconf_join_solict(dev, &aca->aca_addr);
+
+	aca_put(aca);
+	return 0;
+out:
+	write_unlock_bh(&idev->lock);
+	in6_dev_put(idev);
+	return err;
+}
+
+/*
+ *	device anycast group decrement
+ *
+ *	Lowers the user count of @addr on @idev. When the count reaches
+ *	zero the entry is unlinked, the solicited-node group is left, the
+ *	entry's route is deleted and the entry reference is dropped.
+ *
+ *	Returns 0 on success and -ENOENT when @addr is not on the list.
+ */
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+	struct ifacaddr6 *aca, *prev_aca;
+
+	write_lock_bh(&idev->lock);
+	prev_aca = NULL;
+	for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+		if (ipv6_addr_equal(&aca->aca_addr, addr))
+			break;
+		prev_aca = aca;
+	}
+	if (!aca) {
+		write_unlock_bh(&idev->lock);
+		return -ENOENT;
+	}
+	/* Other users remain: keep the entry on the list. */
+	if (--aca->aca_users > 0) {
+		write_unlock_bh(&idev->lock);
+		return 0;
+	}
+	/* Last user: unlink, using prev_aca or the list head. */
+	if (prev_aca)
+		prev_aca->aca_next = aca->aca_next;
+	else
+		idev->ac_list = aca->aca_next;
+	write_unlock_bh(&idev->lock);
+	addrconf_leave_solict(idev, &aca->aca_addr);
+
+	/*
+	 * An extra dst reference is taken before the route is deleted.
+	 * NOTE(review): presumably ip6_del_rt() consumes one - confirm.
+	 */
+	dst_hold(&aca->aca_rt->dst);
+	ip6_del_rt(aca->aca_rt);
+
+	aca_put(aca);
+	return 0;
+}
+
+/*
+ * Device-based wrapper around __ipv6_dev_ac_dec().
+ * Called with rcu_read_lock().
+ * Returns -ENODEV when @dev has no inet6_dev.
+ */
+static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr)
+{
+	struct inet6_dev *in6 = __in6_dev_get(dev);
+
+	if (in6 != NULL)
+		return __ipv6_dev_ac_dec(in6, addr);
+	return -ENODEV;
+}
+
+/*
+ *	check if the interface has this anycast address
+ *	called with rcu_read_lock()
+ *
+ *	Returns 1 when @addr is on the anycast list of @dev, otherwise 0
+ *	(also when @dev has no inet6_dev).
+ */
+static int ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr)
+{
+	struct inet6_dev *in6 = __in6_dev_get(dev);
+	struct ifacaddr6 *entry;
+	int found = 0;
+
+	if (!in6)
+		return 0;
+
+	read_lock_bh(&in6->lock);
+	for (entry = in6->ac_list; entry; entry = entry->aca_next) {
+		if (ipv6_addr_equal(&entry->aca_addr, addr)) {
+			found = 1;
+			break;
+		}
+	}
+	read_unlock_bh(&in6->lock);
+
+	return found;
+}
+
+/*
+ *	check if given interface (or any, if dev==0) has this anycast address
+ *
+ *	Takes rcu_read_lock() itself. Returns 1 when found, otherwise 0.
+ */
+int ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
+			const struct in6_addr *addr)
+{
+	int found = 0;
+
+	rcu_read_lock();
+	if (dev) {
+		found = ipv6_chk_acast_dev(dev, addr);
+	} else {
+		/* No device given: scan every device in the namespace. */
+		for_each_netdev_rcu(net, dev) {
+			if (!ipv6_chk_acast_dev(dev, addr))
+				continue;
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+
+#ifdef CONFIG_PROC_FS
+/* Per-open iterator state for the "anycast6" seq_file. */
+struct ac6_iter_state {
+	struct seq_net_private p;
+	/* device whose anycast list is currently being walked */
+	struct net_device *dev;
+	/* inet6_dev whose lock is currently read-held, or NULL */
+	struct inet6_dev *idev;
+};
+
+/* Fetch the iterator state stored in the seq_file's private pointer. */
+#define ac6_seq_private(seq)	((struct ac6_iter_state *)(seq)->private)
+
+/*
+ * Find the first anycast entry over all devices of the seq_file's
+ * namespace.
+ *
+ * Returns the entry with state->dev / state->idev set and idev->lock left
+ * read-held, or NULL (state->idev NULL, no lock held) when no device has
+ * an entry.
+ */
+static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
+{
+	struct ifacaddr6 *im = NULL;
+	struct ac6_iter_state *state = ac6_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+
+	state->idev = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct inet6_dev *idev;
+		idev = __in6_dev_get(state->dev);
+		if (!idev)
+			continue;
+		read_lock_bh(&idev->lock);
+		im = idev->ac_list;
+		if (im) {
+			/* Leave with the lock held; the iterator drops it later. */
+			state->idev = idev;
+			break;
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	return im;
+}
+
+/*
+ * Advance from @im to the next anycast entry, moving on to the following
+ * devices when the current list is exhausted.
+ *
+ * The read lock of a finished device is released before the next one is
+ * taken. Returns NULL with state->idev cleared after the last device.
+ */
+static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im)
+{
+	struct ac6_iter_state *state = ac6_seq_private(seq);
+
+	im = im->aca_next;
+	while (!im) {
+		/* Done with this device's list: drop its lock. */
+		if (likely(state->idev != NULL))
+			read_unlock_bh(&state->idev->lock);
+
+		state->dev = next_net_device_rcu(state->dev);
+		if (!state->dev) {
+			state->idev = NULL;
+			break;
+		}
+		state->idev = __in6_dev_get(state->dev);
+		if (!state->idev)
+			continue;
+		read_lock_bh(&state->idev->lock);
+		im = state->idev->ac_list;
+	}
+	return im;
+}
+
+/*
+ * Return the entry at zero-based position @pos in the iteration order, or
+ * NULL when there are fewer entries than that.
+ */
+static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ifacaddr6 *cur = ac6_get_first(seq);
+
+	while (cur && pos) {
+		cur = ac6_get_next(seq, cur);
+		if (cur)
+			--pos;
+	}
+	return pos ? NULL : cur;
+}
+
+/*
+ * seq_file start: enter the RCU read side and position at entry *pos.
+ * The RCU read lock is released in ac6_seq_stop().
+ */
+static void *ac6_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return ac6_get_idx(seq, *pos);
+}
+
+/* seq_file next: bump *pos and return the entry following @v. */
+static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return ac6_get_next(seq, v);
+}
+
+/*
+ * seq_file stop: release the inet6_dev lock the iterator may still hold,
+ * then leave the RCU read side.
+ */
+static void ac6_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	struct ac6_iter_state *iter = ac6_seq_private(seq);
+	struct inet6_dev *held = iter->idev;
+
+	if (likely(held != NULL)) {
+		read_unlock_bh(&held->lock);
+		iter->idev = NULL;
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file show: print one line with the interface index, interface name,
+ * anycast address and user count of entry @v. Always returns 0.
+ */
+static int ac6_seq_show(struct seq_file *seq, void *v)
+{
+	struct ac6_iter_state *iter = ac6_seq_private(seq);
+	struct net_device *dev = iter->dev;
+	struct ifacaddr6 *entry = v;
+
+	seq_printf(seq, "%-4d %-15s %pi6 %5d\n",
+		   dev->ifindex, dev->name,
+		   &entry->aca_addr, entry->aca_users);
+	return 0;
+}
+
+/* Iterator callbacks for the "anycast6" seq_file. */
+static const struct seq_operations ac6_seq_ops = {
+	.start	=	ac6_seq_start,
+	.next	=	ac6_seq_next,
+	.stop	=	ac6_seq_stop,
+	.show	=	ac6_seq_show,
+};
+
+/*
+ * open handler: set up a seq_file using ac6_seq_ops with a private area
+ * sized for struct ac6_iter_state.
+ */
+static int ac6_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ac6_seq_ops,
+			    sizeof(struct ac6_iter_state));
+}
+
+/* File operations of the "anycast6" proc entry created in ac6_proc_init(). */
+static const struct file_operations ac6_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	ac6_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+/*
+ * Create the read-only "anycast6" proc entry for namespace @net.
+ * Returns 0 on success and -ENOMEM when the entry cannot be created.
+ */
+int __net_init ac6_proc_init(struct net *net)
+{
+	return proc_net_fops_create(net, "anycast6", S_IRUGO, &ac6_seq_fops) ?
+		0 : -ENOMEM;
+}
+
+/* Remove the "anycast6" proc entry of namespace @net. */
+void ac6_proc_exit(struct net *net)
+{
+	proc_net_remove(net, "anycast6");
+}
+#endif
+
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
new file mode 100644
index 00000000..16560336
--- /dev/null
+++ b/net/ipv6/datagram.c
@@ -0,0 +1,868 @@
+/*
+ *	common UDP/RAW code
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/tcp_states.h>
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+/*
+ * Connect a datagram socket to the peer described by @uaddr.
+ *
+ * @sk:       socket to connect
+ * @uaddr:    destination, read as struct sockaddr_in6. The address may be
+ *            modified in place (replaced by a flow label's destination,
+ *            or the any-address turned into ::1)
+ * @addr_len: length of @uaddr in bytes
+ *
+ * AF_INET and IPv4-mapped destinations are handed to
+ * ip4_datagram_connect() and the resulting IPv4 addresses are mirrored
+ * into the IPv6 socket fields as v4-mapped addresses.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
+	struct inet_sock      	*inet = inet_sk(sk);
+	struct ipv6_pinfo      	*np = inet6_sk(sk);
+	struct in6_addr		*daddr, *final_p, final;
+	struct dst_entry	*dst;
+	struct flowi6		fl6;
+	struct ip6_flowlabel	*flowlabel = NULL;
+	struct ipv6_txoptions   *opt;
+	int			addr_type;
+	int			err;
+
+	/* Plain IPv4 address: connect over IPv4, then share the mapped path. */
+	if (usin->sin6_family == AF_INET) {
+		if (__ipv6_only_sock(sk))
+			return -EAFNOSUPPORT;
+		err = ip4_datagram_connect(sk, uaddr, addr_len);
+		goto ipv4_connected;
+	}
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	if (usin->sin6_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	memset(&fl6, 0, sizeof(fl6));
+	if (np->sndflow) {
+		fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+			if (flowlabel == NULL)
+				return -EINVAL;
+			/* The flow label's destination overrides the caller's. */
+			ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
+		}
+	}
+
+	addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+	if (addr_type == IPV6_ADDR_ANY) {
+		/*
+		 *	connect to self
+		 */
+		usin->sin6_addr.s6_addr[15] = 0x01;
+	}
+
+	daddr = &usin->sin6_addr;
+
+	if (addr_type == IPV6_ADDR_MAPPED) {
+		struct sockaddr_in sin;
+
+		if (__ipv6_only_sock(sk)) {
+			err = -ENETUNREACH;
+			goto out;
+		}
+		/* Build the IPv4 sockaddr from the mapped address and port. */
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = daddr->s6_addr32[3];
+		sin.sin_port = usin->sin6_port;
+
+		err = ip4_datagram_connect(sk,
+					   (struct sockaddr*) &sin,
+					   sizeof(sin));
+
+		/* Also entered directly from the AF_INET case at the top. */
+ipv4_connected:
+		if (err)
+			goto out;
+
+		ipv6_addr_set_v4mapped(inet->inet_daddr, &np->daddr);
+
+		if (ipv6_addr_any(&np->saddr))
+			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+
+		if (ipv6_addr_any(&np->rcv_saddr)) {
+			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
+					       &np->rcv_saddr);
+			/* Receive address changed: rehash if the protocol can. */
+			if (sk->sk_prot->rehash)
+				sk->sk_prot->rehash(sk);
+		}
+
+		goto out;
+	}
+
+	if (addr_type&IPV6_ADDR_LINKLOCAL) {
+		/* sin6_scope_id is only read when the caller supplied it. */
+		if (addr_len >= sizeof(struct sockaddr_in6) &&
+		    usin->sin6_scope_id) {
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != usin->sin6_scope_id) {
+				err = -EINVAL;
+				goto out;
+			}
+			sk->sk_bound_dev_if = usin->sin6_scope_id;
+		}
+
+		if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST))
+			sk->sk_bound_dev_if = np->mcast_oif;
+
+		/* Connect to link-local address requires an interface */
+		if (!sk->sk_bound_dev_if) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	ipv6_addr_copy(&np->daddr, daddr);
+	np->flow_label = fl6.flowlabel;
+
+	inet->inet_dport = usin->sin6_port;
+
+	/*
+	 *	Check for a route to destination an obtain the
+	 *	destination cache for it.
+	 */
+
+	fl6.flowi6_proto = sk->sk_protocol;
+	ipv6_addr_copy(&fl6.daddr, &np->daddr);
+	ipv6_addr_copy(&fl6.saddr, &np->saddr);
+	fl6.flowi6_oif = sk->sk_bound_dev_if;
+	fl6.flowi6_mark = sk->sk_mark;
+	fl6.fl6_dport = inet->inet_dport;
+	fl6.fl6_sport = inet->inet_sport;
+
+	if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
+		fl6.flowi6_oif = np->mcast_oif;
+
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	/* Options of the flow label take precedence over the socket's. */
+	opt = flowlabel ? flowlabel->opt : np->opt;
+	final_p = fl6_update_dst(&fl6, opt, &final);
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	err = 0;
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto out;
+	}
+
+	/* source address lookup done in ip6_dst_lookup */
+
+	if (ipv6_addr_any(&np->saddr))
+		ipv6_addr_copy(&np->saddr, &fl6.saddr);
+
+	if (ipv6_addr_any(&np->rcv_saddr)) {
+		ipv6_addr_copy(&np->rcv_saddr, &fl6.saddr);
+		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
+
+	/* The addresses are passed only when they equal the flow's. */
+	ip6_dst_store(sk, dst,
+		      ipv6_addr_equal(&fl6.daddr, &np->daddr) ?
+		      &np->daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+		      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
+		      &np->saddr :
+#endif
+		      NULL);
+
+	sk->sk_state = TCP_ESTABLISHED;
+out:
+	/* flowlabel is NULL here unless the lookup above succeeded. */
+	fl6_sock_release(flowlabel);
+	return err;
+}
+
+/*
+ * Queue an ICMPv6-originated error on the error queue of @sk.
+ *
+ * @sk:      socket to notify; nothing is done unless np->recverr is set
+ * @skb:     packet carrying the ICMPv6 error; a clone of it is queued
+ * @err:     errno value stored in the extended error
+ * @port:    port stored alongside the error
+ * @info:    value stored in ee_info
+ * @payload: position inside @skb's data the queued clone is pulled to
+ *
+ * Clone failures are silently ignored.
+ */
+void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+		     __be16 port, u32 info, u8 *payload)
+{
+	struct ipv6_pinfo *np  = inet6_sk(sk);
+	struct icmp6hdr *icmph = icmp6_hdr(skb);
+	struct sock_exterr_skb *serr;
+
+	if (!np->recverr)
+		return;
+
+	/* From here on skb refers to the clone, not the caller's packet. */
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb->protocol = htons(ETH_P_IPV6);
+
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6;
+	serr->ee.ee_type = icmph->icmp6_type;
+	serr->ee.ee_code = icmph->icmp6_code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	/*
+	 * Offset, from the network header, of the destination address in
+	 * the IPv6 header that directly follows the ICMPv6 header.
+	 */
+	serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) -
+				  skb_network_header(skb);
+	serr->port = port;
+
+	__skb_pull(skb, payload - skb->data);
+	skb_reset_transport_header(skb);
+
+	/* A non-zero return means the skb was not queued; free it here. */
+	if (sock_queue_err_skb(sk, skb))
+		kfree_skb(skb);
+}
+
+/*
+ * Queue a locally generated error on the error queue of @sk.
+ *
+ * @sk:   socket to notify; nothing is done unless np->recverr is set
+ * @err:  errno value stored in the extended error
+ * @fl6:  flow whose destination address and port are reported
+ * @info: value stored in ee_info
+ *
+ * A fresh skb holding only an IPv6 header with the destination address
+ * filled in is built to carry the error. Allocation failures are silently
+ * ignored.
+ */
+void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sock_exterr_skb *serr;
+	struct ipv6hdr *iph;
+	struct sk_buff *skb;
+
+	if (!np->recverr)
+		return;
+
+	skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb->protocol = htons(ETH_P_IPV6);
+
+	skb_put(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	iph = ipv6_hdr(skb);
+	/* Only daddr is written; other header fields are left as allocated. */
+	ipv6_addr_copy(&iph->daddr, &fl6->daddr);
+
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+	serr->ee.ee_type = 0;
+	serr->ee.ee_code = 0;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb);
+	serr->port = fl6->fl6_dport;
+
+	/* Pull everything so the queued skb carries no payload bytes. */
+	__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+	skb_reset_transport_header(skb);
+
+	/* A non-zero return means the skb was not queued; free it here. */
+	if (sock_queue_err_skb(sk, skb))
+		kfree_skb(skb);
+}
+
+/*
+ * Record a path MTU notification for @sk.
+ *
+ * @sk:  socket to notify; nothing is done unless rxopt.bits.rxpmtu is set
+ * @fl6: flow whose destination address and outgoing interface are reported
+ * @mtu: path MTU value to report
+ *
+ * A fresh skb carrying a struct ip6_mtuinfo in its control block replaces
+ * whatever was stored in np->rxpmtu; the previous one is freed.
+ */
+void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6hdr *iph;
+	struct sk_buff *skb;
+	struct ip6_mtuinfo *mtu_info;
+
+	if (!np->rxopt.bits.rxpmtu)
+		return;
+
+	skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_put(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	iph = ipv6_hdr(skb);
+	ipv6_addr_copy(&iph->daddr, &fl6->daddr);
+
+	mtu_info = IP6CBMTU(skb);
+	if (!mtu_info) {
+		kfree_skb(skb);
+		return;
+	}
+
+	mtu_info->ip6m_mtu = mtu;
+	mtu_info->ip6m_addr.sin6_family = AF_INET6;
+	mtu_info->ip6m_addr.sin6_port = 0;
+	mtu_info->ip6m_addr.sin6_flowinfo = 0;
+	mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif;
+	ipv6_addr_copy(&mtu_info->ip6m_addr.sin6_addr, &ipv6_hdr(skb)->daddr);
+
+	/* Pull everything so the stored skb carries no payload bytes. */
+	__skb_pull(skb, skb_tail_pointer(skb) - skb->data);
+	skb_reset_transport_header(skb);
+
+	/* Swap in the new notification and free the one it replaced. */
+	skb = xchg(&np->rxpmtu, skb);
+	kfree_skb(skb);
+}
+
+/*
+ *	Handle MSG_ERRQUEUE
+ *
+ *	Dequeue one skb from the socket's error queue, copy up to @len bytes
+ *	of it into @msg, fill msg_name (when present) with the address the
+ *	error refers to, and attach an IPV6_RECVERR control message holding
+ *	the extended error and the offender address.
+ *
+ *	Returns the number of bytes copied, -EAGAIN when the error queue is
+ *	empty, or the error from skb_copy_datagram_iovec().
+ */
+int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb, *skb2;
+	struct sockaddr_in6 *sin;
+	struct {
+		struct sock_extended_err ee;
+		struct sockaddr_in6	 offender;
+	} errhdr;
+	int err;
+	int copied;
+
+	err = -EAGAIN;
+	skb = skb_dequeue(&sk->sk_error_queue);
+	if (skb == NULL)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto out_free_skb;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	serr = SKB_EXT_ERR(skb);
+
+	/* Address the failed packet was sent to, read back from the skb. */
+	sin = (struct sockaddr_in6 *)msg->msg_name;
+	if (sin) {
+		const unsigned char *nh = skb_network_header(skb);
+		sin->sin6_family = AF_INET6;
+		sin->sin6_flowinfo = 0;
+		sin->sin6_port = serr->port;
+		sin->sin6_scope_id = 0;
+		if (skb->protocol == htons(ETH_P_IPV6)) {
+			ipv6_addr_copy(&sin->sin6_addr,
+				  (struct in6_addr *)(nh + serr->addr_offset));
+			/* 24 bytes before daddr is the start of the header. */
+			if (np->sndflow)
+				sin->sin6_flowinfo =
+					(*(__be32 *)(nh + serr->addr_offset - 24) &
+					 IPV6_FLOWINFO_MASK);
+			if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+				sin->sin6_scope_id = IP6CB(skb)->iif;
+		} else {
+			ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset),
+					       &sin->sin6_addr);
+		}
+	}
+
+	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+	sin = &errhdr.offender;
+	/*
+	 * errhdr lives on the stack and is copied to user space in full by
+	 * put_cmsg() below. Zero the offender first so that fields which are
+	 * not filled in (sin6_port always, everything but the family for
+	 * local errors) do not leak stack contents.
+	 */
+	memset(sin, 0, sizeof(*sin));
+	sin->sin6_family = AF_UNSPEC;
+	if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) {
+		sin->sin6_family = AF_INET6;
+		if (skb->protocol == htons(ETH_P_IPV6)) {
+			ipv6_addr_copy(&sin->sin6_addr, &ipv6_hdr(skb)->saddr);
+			if (np->rxopt.all)
+				datagram_recv_ctl(sk, msg, skb);
+			if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+				sin->sin6_scope_id = IP6CB(skb)->iif;
+		} else {
+			struct inet_sock *inet = inet_sk(sk);
+
+			ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
+					       &sin->sin6_addr);
+			if (inet->cmsg_flags)
+				ip_cmsg_recv(msg, skb);
+		}
+	}
+
+	put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr);
+
+	/* Now we could try to dump offended packet options */
+
+	msg->msg_flags |= MSG_ERRQUEUE;
+	err = copied;
+
+	/* Reset and regenerate socket error */
+	spin_lock_bh(&sk->sk_error_queue.lock);
+	sk->sk_err = 0;
+	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+		/* Report outside the queue lock. */
+		sk->sk_error_report(sk);
+	} else {
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+	}
+
+out_free_skb:
+	kfree_skb(skb);
+out:
+	return err;
+}
+
+/*
+ *	Handle IPV6_RECVPATHMTU: hand the pending path-MTU notification to the caller
+ */
+int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sk_buff *note = xchg(&np->rxpmtu, NULL);
+	struct sockaddr_in6 *name;
+	struct ip6_mtuinfo info;
+	int nbytes;
+	int rc;
+
+	/* Nothing pending on this socket. */
+	if (!note)
+		return -EAGAIN;
+
+	/* Never copy more than the caller asked for; report truncation instead. */
+	nbytes = note->len;
+	if (nbytes > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		nbytes = len;
+	}
+	rc = skb_copy_datagram_iovec(note, 0, msg->msg_iov, nbytes);
+	if (rc)
+		goto drop;
+
+	sock_recv_timestamp(msg, sk, note);
+
+	memcpy(&info, IP6CBMTU(note), sizeof(info));
+
+	name = (struct sockaddr_in6 *)msg->msg_name;
+	if (name) {
+		name->sin6_family = AF_INET6;
+		ipv6_addr_copy(&name->sin6_addr, &info.ip6m_addr.sin6_addr);
+		name->sin6_scope_id = info.ip6m_addr.sin6_scope_id;
+		name->sin6_port = 0;
+		name->sin6_flowinfo = 0;
+	}
+
+	put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(info), &info);
+
+	rc = nbytes;
+
+drop:
+	/* The notification skb is consumed whether or not the copy succeeded. */
+	kfree_skb(note);
+	return rc;
+}
+
+
+int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet6_skb_parm *opt = IP6CB(skb);
+	unsigned char *nh = skb_network_header(skb);
+	/* Emit one cmsg for each receive option enabled in np->rxopt; always returns 0. */
+	if (np->rxopt.bits.rxinfo) {
+		struct in6_pktinfo src_info;
+
+		src_info.ipi6_ifindex = opt->iif;
+		ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr);
+		put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
+	}
+
+	if (np->rxopt.bits.rxhlim) {
+		int hlim = ipv6_hdr(skb)->hop_limit;
+		put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
+	}
+
+	if (np->rxopt.bits.rxtclass) {
+		int tclass = (ntohl(*(__be32 *)ipv6_hdr(skb)) >> 20) & 0xff;
+		put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
+	}
+
+	if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) {
+		__be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK;
+		put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo);
+	}
+
+	/* HbH is allowed only once */
+	if (np->rxopt.bits.hopopts && opt->hop) {
+		u8 *ptr = nh + opt->hop;
+		put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
+	}
+
+	if (opt->lastopt &&
+	    (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) {
+		/*
+		 * Silly enough, but we need to reparse in order to
+		 * report extension headers (except for HbH)
+		 * in order.
+		 *
+		 * Also note that IPV6_RECVRTHDRDSTOPTS is NOT
+		 * (and WILL NOT be) defined because
+		 * IPV6_RECVDSTOPTS is more generic. --yoshfuji
+		 */
+		unsigned int off = sizeof(struct ipv6hdr);
+		u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+
+		while (off <= opt->lastopt) {	/* walk headers up to the last recorded option */
+			unsigned len;
+			u8 *ptr = nh + off;
+
+			switch(nexthdr) {
+			case IPPROTO_DSTOPTS:
+				nexthdr = ptr[0];
+				len = (ptr[1] + 1) << 3;
+				if (np->rxopt.bits.dstopts)
+					put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr);
+				break;
+			case IPPROTO_ROUTING:
+				nexthdr = ptr[0];
+				len = (ptr[1] + 1) << 3;
+				if (np->rxopt.bits.srcrt)
+					put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr);
+				break;
+			case IPPROTO_AH:
+				nexthdr = ptr[0];
+				len = (ptr[1] + 2) << 2;	/* AH length: (field + 2) * 4 bytes */
+				break;
+			default:
+				nexthdr = ptr[0];
+				len = (ptr[1] + 1) << 3;	/* other headers: (field + 1) * 8 bytes */
+				break;
+			}
+
+			off += len;
+		}
+	}
+
+	/* socket options in old style */
+	if (np->rxopt.bits.rxoinfo) {
+		struct in6_pktinfo src_info;
+
+		src_info.ipi6_ifindex = opt->iif;
+		ipv6_addr_copy(&src_info.ipi6_addr, &ipv6_hdr(skb)->daddr);
+		put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
+	}
+	if (np->rxopt.bits.rxohlim) {
+		int hlim = ipv6_hdr(skb)->hop_limit;
+		put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
+	}
+	if (np->rxopt.bits.ohopopts && opt->hop) {
+		u8 *ptr = nh + opt->hop;
+		put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr);
+	}
+	if (np->rxopt.bits.odstopts && opt->dst0) {
+		u8 *ptr = nh + opt->dst0;
+		put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
+	}
+	if (np->rxopt.bits.osrcrt && opt->srcrt) {
+		struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt);
+		put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
+	}
+	if (np->rxopt.bits.odstopts && opt->dst1) {
+		u8 *ptr = nh + opt->dst1;
+		put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
+	}
+	if (np->rxopt.bits.rxorigdstaddr) {
+		struct sockaddr_in6 sin6;
+		u16 *ports = (u16 *) skb_transport_header(skb);
+
+		if (skb_transport_offset(skb) + 4 <= skb->len) {
+			/* All current transport protocols have the port numbers in the
+			 * first four bytes of the transport header and this function is
+			 * written with this assumption in mind.
+			 */
+
+			sin6.sin6_family = AF_INET6;
+			ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr);
+			sin6.sin6_port = ports[1];	/* second 16-bit word of the transport header */
+			sin6.sin6_flowinfo = 0;
+			sin6.sin6_scope_id = 0;
+
+			put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+		}
+	}
+	return 0;
+}
+
+int datagram_send_ctl(struct net *net,
+		      struct msghdr *msg, struct flowi6 *fl6,
+		      struct ipv6_txoptions *opt,
+		      int *hlimit, int *tclass, int *dontfrag)
+{
+	struct in6_pktinfo *src_info;
+	struct cmsghdr *cmsg;
+	struct ipv6_rt_hdr *rthdr;
+	struct ipv6_opt_hdr *hdr;
+	int len;
+	int err = 0;
+	/* Apply each SOL_IPV6 cmsg to fl6, opt or the out params; returns 0 or -errno. */
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		int addr_type;
+
+		if (!CMSG_OK(msg, cmsg)) {
+			err = -EINVAL;
+			goto exit_f;
+		}
+		/* Control messages of other levels are skipped, not rejected. */
+		if (cmsg->cmsg_level != SOL_IPV6)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case IPV6_PKTINFO:
+		case IPV6_2292PKTINFO:
+		    {
+			struct net_device *dev = NULL;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+
+			if (src_info->ipi6_ifindex) {
+				if (fl6->flowi6_oif &&
+				    src_info->ipi6_ifindex != fl6->flowi6_oif)
+					return -EINVAL;
+				fl6->flowi6_oif = src_info->ipi6_ifindex;
+			}
+
+			addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
+			/* The device lookup and the address check below run under RCU. */
+			rcu_read_lock();
+			if (fl6->flowi6_oif) {
+				dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+				if (!dev) {
+					rcu_read_unlock();
+					return -ENODEV;
+				}
+			} else if (addr_type & IPV6_ADDR_LINKLOCAL) {
+				rcu_read_unlock();
+				return -EINVAL;	/* link-local source needs an interface */
+			}
+
+			if (addr_type != IPV6_ADDR_ANY) {
+				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+				if (!ipv6_chk_addr(net, &src_info->ipi6_addr,
+						   strict ? dev : NULL, 0))
+					err = -EINVAL;
+				else
+					ipv6_addr_copy(&fl6->saddr, &src_info->ipi6_addr);
+			}
+
+			rcu_read_unlock();
+
+			if (err)
+				goto exit_f;
+
+			break;
+		    }
+
+		case IPV6_FLOWINFO:
+			if (cmsg->cmsg_len < CMSG_LEN(4)) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			if (fl6->flowlabel&IPV6_FLOWINFO_MASK) {
+				if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
+					err = -EINVAL;
+					goto exit_f;
+				}
+			}
+			fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
+			break;
+
+		case IPV6_2292HOPOPTS:
+		case IPV6_HOPOPTS:
+			if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+			/* (hdrlen + 1) * 8 bytes must fit inside the control message. */
+			hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+			len = ((hdr->hdrlen + 1) << 3);
+			if (cmsg->cmsg_len < CMSG_LEN(len)) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+			if (!capable(CAP_NET_RAW)) {
+				err = -EPERM;
+				goto exit_f;
+			}
+			opt->opt_nflen += len;
+			opt->hopopt = hdr;
+			break;
+
+		case IPV6_2292DSTOPTS:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+			len = ((hdr->hdrlen + 1) << 3);
+			if (cmsg->cmsg_len < CMSG_LEN(len)) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+			if (!capable(CAP_NET_RAW)) {
+				err = -EPERM;
+				goto exit_f;
+			}
+			if (opt->dst1opt) {	/* only one such header is accepted here */
+				err = -EINVAL;
+				goto exit_f;
+			}
+			opt->opt_flen += len;
+			opt->dst1opt = hdr;
+			break;
+
+		case IPV6_DSTOPTS:
+		case IPV6_RTHDRDSTOPTS:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+			len = ((hdr->hdrlen + 1) << 3);
+			if (cmsg->cmsg_len < CMSG_LEN(len)) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+			if (!capable(CAP_NET_RAW)) {
+				err = -EPERM;
+				goto exit_f;
+			}
+			if (cmsg->cmsg_type == IPV6_DSTOPTS) {
+				opt->opt_flen += len;
+				opt->dst1opt = hdr;
+			} else {
+				opt->opt_nflen += len;
+				opt->dst0opt = hdr;
+			}
+			break;
+
+		case IPV6_2292RTHDR:
+		case IPV6_RTHDR:
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
+
+			switch (rthdr->type) {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+			case IPV6_SRCRT_TYPE_2:
+				if (rthdr->hdrlen != 2 ||
+				    rthdr->segments_left != 1) {
+					err = -EINVAL;
+					goto exit_f;
+				}
+				break;
+#endif
+			default:	/* every other routing header type is rejected */
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			len = ((rthdr->hdrlen + 1) << 3);
+
+			if (cmsg->cmsg_len < CMSG_LEN(len)) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			/* segments left must also match */
+			if ((rthdr->hdrlen >> 1) != rthdr->segments_left) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			opt->opt_nflen += len;
+			opt->srcrt = rthdr;
+			/* 2292 API: an earlier dst1opt is re-filed as dst0opt (flen -> nflen). */
+			if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) {
+				int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3);
+
+				opt->opt_nflen += dsthdrlen;
+				opt->dst0opt = opt->dst1opt;
+				opt->dst1opt = NULL;
+				opt->opt_flen -= dsthdrlen;
+			}
+
+			break;
+
+		case IPV6_2292HOPLIMIT:
+		case IPV6_HOPLIMIT:
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			*hlimit = *(int *)CMSG_DATA(cmsg);
+			if (*hlimit < -1 || *hlimit > 0xff) {	/* accepted range: -1..255 */
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			break;
+
+		case IPV6_TCLASS:
+		    {
+			int tc;
+
+			err = -EINVAL;
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+				goto exit_f;
+			}
+
+			tc = *(int *)CMSG_DATA(cmsg);
+			if (tc < -1 || tc > 0xff)
+				goto exit_f;
+
+			err = 0;
+			*tclass = tc;
+
+			break;
+		    }
+
+		case IPV6_DONTFRAG:
+		    {
+			int df;
+
+			err = -EINVAL;
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
+				goto exit_f;
+			}
+
+			df = *(int *)CMSG_DATA(cmsg);
+			if (df < 0 || df > 1)
+				goto exit_f;
+
+			err = 0;
+			*dontfrag = df;
+
+			break;
+		    }
+		default:
+			LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n",
+				       cmsg->cmsg_type);
+			err = -EINVAL;
+			goto exit_f;
+		}
+	}
+
+exit_f:
+	return err;
+}
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
new file mode 100644
index 00000000..65dd5433
--- /dev/null
+++ b/net/ipv6/esp6.c
@@ -0,0 +1,674 @@
+/*
+ * Copyright (C)2002 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Authors
+ *
+ *	Mitsuru KANDA @USAGI       : IPv6 Support
+ * 	Kazunori MIYAZAWA @USAGI   :
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *
+ * 	This file is derived from net/ipv4/esp.c
+ */
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/icmp.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <linux/icmpv6.h>
+
+struct esp_skb_cb {
+	struct xfrm_skb_cb xfrm;	/* leading xfrm cb; XFRM_SKB_CB() is used on the same skb */
+	void *tmp;			/* scratch buffer from esp_alloc_tmp(), freed on completion */
+};
+/* View the skb control buffer (skb->cb) as a struct esp_skb_cb. */
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate the per-packet scratch buffer used for one AEAD operation.
+ *
+ * Layout, in order: the upper 32 bits of the sequence number (only when
+ * seqihlen is non-zero), the IV, the AEAD request, and last the scatterlist
+ * array with nfrags entries. Returns NULL if the GFP_ATOMIC kmalloc fails.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
+{
+	const int ctx_align = crypto_tfm_ctx_alignment();
+	unsigned int bytes = seqihlen;
+
+	/* Leading area: optional seq-hi word followed by the IV. */
+	bytes += crypto_aead_ivsize(aead);
+	if (bytes) {
+		/* Reserve the part of the alignmask not covered by the ctx alignment. */
+		bytes += crypto_aead_alignmask(aead) & ~(ctx_align - 1);
+		bytes = ALIGN(bytes, ctx_align);
+	}
+
+	/* Request structure plus the transform's private request context. */
+	bytes += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+	bytes = ALIGN(bytes, __alignof__(struct scatterlist));
+
+	/* Finally one scatterlist entry per requested fragment. */
+	bytes += sizeof(struct scatterlist) * nfrags;
+	return kmalloc(bytes, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{	/* seq-hi word lives at the start of the scratch buffer, aligned for __be32 */
+	return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{	/* IV slot follows the seqhilen bytes; aligned to alignmask + 1 only if an IV exists */
+	if (!crypto_aead_ivsize(aead))
+		return tmp + seqhilen;
+	return PTR_ALIGN((u8 *)tmp + seqhilen, crypto_aead_alignmask(aead) + 1);
+}
+
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+	struct crypto_aead *aead, u8 *iv)
+{
+	/* The request sits right after the IV, rounded up to the tfm context alignment. */
+	u8 *iv_end = iv + crypto_aead_ivsize(aead);
+	struct aead_givcrypt_request *givreq = (void *)PTR_ALIGN(iv_end, crypto_tfm_ctx_alignment());
+
+	aead_givcrypt_set_tfm(givreq, aead);
+	return givreq;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+	/* The request sits right after the IV, rounded up to the tfm context alignment. */
+	u8 *iv_end = iv + crypto_aead_ivsize(aead);
+	struct aead_request *areq = (void *)PTR_ALIGN(iv_end, crypto_tfm_ctx_alignment());
+
+	aead_request_set_tfm(areq, aead);
+	return areq;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+					     struct aead_request *req)
+{
+	/* The SG array starts after the request and its reqsize bytes of context. */
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+	return (void *)ALIGN(end, __alignof__(struct scatterlist));
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+	struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+	/* The SG array starts after the givcrypt request and its reqsize bytes of context. */
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+	return (void *)ALIGN(end, __alignof__(struct scatterlist));
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err)
+{
+	struct sk_buff *pkt = base->data;
+	/* Async completion: free the scratch buffer that esp6_output() stored in the cb. */
+	kfree(ESP_SKB_CB(pkt)->tmp);
+	xfrm_output_resume(pkt, err);
+}
+
+static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct aead_givcrypt_request *req;
+	struct scatterlist *sg;
+	struct scatterlist *asg;
+	struct sk_buff *trailer;
+	void *tmp;
+	int blksize;
+	int clen;
+	int alen;
+	int plen;
+	int tfclen;
+	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	u8 *iv;
+	u8 *tail;
+	__be32 *seqhi;
+	struct esp_data *esp = x->data;
+	/* ESP output path. Returns 0, -EINPROGRESS (async crypto) or an error code. */
+	/* skb is pure payload to encrypt */
+	aead = esp->aead;
+	alen = crypto_aead_authsize(aead);
+	/* Optional TFC padding: pad the payload up to min(x->tfcpad, esp6_get_mtu()). */
+	tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);	/* +2: the two trailer bytes set below */
+	if (esp->padlen)
+		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
+	/* Make room for padding and ICV; a non-negative result is used as the SG count. */
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
+
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+	/* ESN: associated data becomes SPI, seq-hi, seq-lo, i.e. two extra SG entries. */
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+	if (!tmp) {
+		err = -ENOMEM;	/* err held nfrags (>= 0) until now; report the failure */
+		goto error;
+	}
+	/* Carve the scratch buffer into seq-hi, IV, request and scatterlists. */
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_givreq(aead, iv);
+	asg = esp_givreq_sg(aead, req);
+	sg = asg + sglists;
+
+	/* Fill padding... */
+	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
+	pskb_put(skb, trailer, clen - skb->len + alen);
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph = ip_esp_hdr(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg,
+		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+		     clen + alen);
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
+
+	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+	aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+	aead_givcrypt_set_assoc(req, asg, assoclen);
+	aead_givcrypt_set_giv(req, esph->enc_data,
+			      XFRM_SKB_CB(skb)->seq.output.low);
+	/* Stash tmp in the skb cb so esp_output_done() can free it after async completion. */
+	ESP_SKB_CB(skb)->tmp = tmp;
+	err = crypto_aead_givencrypt(req);
+	if (err == -EINPROGRESS)
+		goto error;
+
+	if (err == -EBUSY)
+		err = NET_XMIT_DROP;
+
+	kfree(tmp);
+
+error:
+	return err;
+}
+
+static int esp_input_done2(struct sk_buff *skb, int err)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct esp_data *esp = x->data;
+	struct crypto_aead *aead = esp->aead;
+	int alen = crypto_aead_authsize(aead);
+	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	int elen = skb->len - hlen;
+	int hdr_len = skb_network_header_len(skb);
+	int padlen;
+	u8 nexthdr[2];
+	/* Post-decryption step: returns the inner next-header value or a negative errno. */
+	kfree(ESP_SKB_CB(skb)->tmp);
+
+	if (unlikely(err))
+		goto out;
+	/* The two bytes ahead of the ICV are the pad length and the next-header value. */
+	if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
+		BUG();
+
+	err = -EINVAL;
+	padlen = nexthdr[0];
+	if (padlen + 2 + alen >= elen) {
+		LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage "
+			       "padlen=%d, elen=%d\n", padlen + 2, elen - alen);
+		goto out;
+	}
+
+	/* ... check padding bits here. Silly. :-) */
+
+	pskb_trim(skb, skb->len - alen - padlen - 2);	/* drop padding, trailer and ICV */
+	__skb_pull(skb, hlen);				/* drop ESP header and IV */
+	skb_set_transport_header(skb, -hdr_len);
+
+	err = nexthdr[1];
+
+	/* RFC4303: Drop dummy packets without any error */
+	if (err == IPPROTO_NONE)
+		err = -EINVAL;
+
+out:
+	return err;
+}
+
+static void esp_input_done(struct crypto_async_request *base, int err)
+{
+	struct sk_buff *skb = base->data;
+	int verdict = esp_input_done2(skb, err);	/* next-header value or -errno */
+	xfrm_input_resume(skb, verdict);
+}
+
+static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_esp_hdr *esph;
+	struct esp_data *esp = x->data;
+	struct crypto_aead *aead = esp->aead;
+	struct aead_request *req;
+	struct sk_buff *trailer;
+	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
+	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	int ret = 0;
+	void *tmp;
+	__be32 *seqhi;
+	u8 *iv;
+	struct scatterlist *sg;
+	struct scatterlist *asg;
+	/* ESP input path: returns the next-header value, -EINPROGRESS (async) or -errno. */
+	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (elen <= 0) {	/* nothing left after the ESP header and IV */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = -ENOMEM;
+
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+	if (!tmp)
+		goto out;
+	/* From here on tmp is freed by esp_input_done2(), on the sync and async paths. */
+	ESP_SKB_CB(skb)->tmp = tmp;
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	asg = esp_req_sg(aead, req);
+	sg = asg + sglists;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	esph = (struct ip_esp_hdr *)skb->data;
+
+	/* Get ivec. This can be wrong, check against another impls. */
+	iv = esph->enc_data;	/* decryption uses the IV carried in the packet itself */
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
+
+	aead_request_set_callback(req, 0, esp_input_done, skb);
+	aead_request_set_crypt(req, sg, sg, elen, iv);
+	aead_request_set_assoc(req, asg, assoclen);
+
+	ret = crypto_aead_decrypt(req);
+	if (ret == -EINPROGRESS)
+		goto out;
+
+	ret = esp_input_done2(skb, ret);
+
+out:
+	return ret;
+}
+
+static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
+{
+	struct esp_data *esp = x->data;
+	u32 block = ALIGN(crypto_aead_blocksize(esp->aead), 4);
+	u32 pad_unit = max_t(u32, block, esp->padlen);
+	/* Outside tunnel mode an IPv6 header's worth is left out of the alignment. */
+	unsigned int net_adj = x->props.mode != XFRM_MODE_TUNNEL ?
+			       sizeof(struct ipv6hdr) : 0;
+	unsigned int room;
+
+	room = mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - net_adj;
+	room &= ~(pad_unit - 1);
+	/* net_adj is added back; the 2 presumably covers the two ESP trailer bytes. */
+	return room + (net_adj - 2);
+}
+
+static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		     u8 type, u8 code, int offset, __be32 info)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct ipv6hdr *inner = (const struct ipv6hdr *)skb->data;
+	struct ip_esp_hdr *esp_hdr = (struct ip_esp_hdr *)(skb->data + offset);
+	struct xfrm_state *sa;
+
+	/* Only destination-unreachable and packet-too-big errors are looked at. */
+	if (!(type == ICMPV6_DEST_UNREACH || type == ICMPV6_PKT_TOOBIG))
+		return;
+	/* Look up the SA by destination address and SPI; it is only logged, then released. */
+	sa = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&inner->daddr,
+			       esp_hdr->spi, IPPROTO_ESP, AF_INET6);
+	if (!sa)
+		return;
+	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%pI6\n",
+			ntohl(esp_hdr->spi), &inner->daddr);
+	xfrm_state_put(sa);
+}
+
+static void esp6_destroy(struct xfrm_state *x)
+{
+	struct esp_data *priv = x->data;
+
+	/* x->data is still NULL when esp6_init_state() bailed out before allocating it. */
+	if (priv) {
+		crypto_free_aead(priv->aead);
+		kfree(priv);
+	}
+}
+
+static int esp_init_aead(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+	struct crypto_aead *tfm;
+	int rc;
+
+	/* Instantiate the combined-mode algorithm named by the SA. */
+	tfm = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	/* Store the tfm before keying it; it stays in esp->aead even if keying fails. */
+	esp->aead = tfm;
+
+	/* (alg_key_len + 7) / 8 rounds the key length up to whole bytes. */
+	rc = crypto_aead_setkey(tfm, x->aead->alg_key,
+				(x->aead->alg_key_len + 7) / 8);
+	if (rc)
+		return rc;
+
+	/* The ICV length is divided by 8 the same way, without rounding. */
+	rc = crypto_aead_setauthsize(tfm, x->aead->alg_icv_len / 8);
+
+	return rc;
+}
+
+static int esp_init_authenc(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+	struct crypto_aead *aead;
+	struct crypto_authenc_key_param *param;
+	struct rtattr *rta;
+	char *key;
+	char *p;
+	char authenc_name[CRYPTO_MAX_ALG_NAME];
+	unsigned int keylen;
+	int err;
+	/* Build an "authenc(auth,enc)" AEAD from the SA's separate algorithms and key it. */
+	err = -EINVAL;
+	if (x->ealg == NULL)
+		goto error;
+
+	err = -ENAMETOOLONG;
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authencesn(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authenc(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	}
+
+	aead = crypto_alloc_aead(authenc_name, 0, 0);
+	err = PTR_ERR(aead);
+	if (IS_ERR(aead))
+		goto error;
+
+	esp->aead = aead;
+	/* Key blob layout: rtattr carrying enckeylen, then the auth key, then the enc key. */
+	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+	err = -ENOMEM;
+	key = kmalloc(keylen, GFP_KERNEL);
+	if (!key)
+		goto error;
+
+	p = key;
+	rta = (void *)p;
+	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+	rta->rta_len = RTA_LENGTH(sizeof(*param));
+	param = RTA_DATA(rta);
+	p += RTA_SPACE(sizeof(*param));
+
+	if (x->aalg) {
+		struct xfrm_algo_desc *aalg_desc;
+
+		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+		p += (x->aalg->alg_key_len + 7) / 8;
+
+		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+		BUG_ON(!aalg_desc);
+		/* The AEAD's current ICV size must equal the algorithm's full ICV length. */
+		err = -EINVAL;
+		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+		    crypto_aead_authsize(aead)) {
+			NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+				 x->aalg->alg_name,
+				 crypto_aead_authsize(aead),
+				 aalg_desc->uinfo.auth.icv_fullbits/8);
+			goto free_key;
+		}
+		/* Then set the ICV size to the truncated length configured on the SA. */
+		err = crypto_aead_setauthsize(
+			aead, x->aalg->alg_trunc_len / 8);
+		if (err)
+			goto free_key;
+	}
+
+	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+	err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+	kfree(key);	/* the temporary key blob is freed on success and failure alike */
+
+error:
+	return err;
+}
+
+static int esp6_init_state(struct xfrm_state *x)
+{
+	struct esp_data *esp;
+	struct crypto_aead *aead;
+	u32 align;
+	int err;
+	/* xfrm init_state hook: set up x->data and the header/trailer length properties. */
+	if (x->encap)
+		return -EINVAL;
+
+	esp = kzalloc(sizeof(*esp), GFP_KERNEL);
+	if (esp == NULL)
+		return -ENOMEM;
+	/* x->data is set first because the init helpers read it to store the tfm. */
+	x->data = esp;
+
+	if (x->aead)
+		err = esp_init_aead(x);
+	else
+		err = esp_init_authenc(x);
+
+	if (err)
+		goto error;
+
+	aead = esp->aead;
+
+	esp->padlen = 0;
+
+	x->props.header_len = sizeof(struct ip_esp_hdr) +
+			      crypto_aead_ivsize(aead);
+	switch (x->props.mode) {
+	case XFRM_MODE_BEET:
+		if (x->sel.family != AF_INET6)
+			x->props.header_len += IPV4_BEET_PHMAXLEN +
+				               (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+		break;
+	case XFRM_MODE_TRANSPORT:
+		break;
+	case XFRM_MODE_TUNNEL:
+		x->props.header_len += sizeof(struct ipv6hdr);
+		break;
+	default:	/* NOTE(review): err is 0 here, so other modes return success — confirm */
+		goto error;
+	}
+	/* Trailer budget: padding alignment + 1 + ICV length. */
+	align = ALIGN(crypto_aead_blocksize(aead), 4);
+	if (esp->padlen)
+		align = max_t(u32, align, esp->padlen);
+	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+
+error:
+	return err;
+}
+
+static const struct xfrm_type esp6_type =
+{	/* xfrm type descriptor registered for AF_INET6 by esp6_init() */
+	.description	= "ESP6",
+	.owner	     	= THIS_MODULE,
+	.proto	     	= IPPROTO_ESP,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= esp6_init_state,	/* allocates x->data and the AEAD tfm */
+	.destructor	= esp6_destroy,		/* frees what init_state set up */
+	.get_mtu	= esp6_get_mtu,
+	.input		= esp6_input,
+	.output		= esp6_output,
+	.hdr_offset	= xfrm6_find_1stfragopt,
+};
+
+static const struct inet6_protocol esp6_protocol = {	/* added for IPPROTO_ESP in esp6_init() */
+	.handler 	=	xfrm6_rcv,
+	.err_handler	=	esp6_err,	/* ICMPv6 error callback defined in this file */
+	.flags		=	INET6_PROTO_NOPOLICY,
+};
+
+static int __init esp6_init(void)
+{
+	/* Register the xfrm type first, then hook IPPROTO_ESP into the IPv6 stack. */
+	if (xfrm_register_type(&esp6_type, AF_INET6) < 0) {
+		printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) >= 0)
+		return 0;
+	/* Protocol hook failed: roll back the type registration. */
+	printk(KERN_INFO "ipv6 esp init: can't add protocol\n");
+	xfrm_unregister_type(&esp6_type, AF_INET6);
+	return -EAGAIN;
+}
+
+static void __exit esp6_fini(void)
+{	/* Undo esp6_init() in reverse order; failures are only logged. */
+	if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0)
+		printk(KERN_INFO "ipv6 esp close: can't remove protocol\n");
+	if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
+		printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n");
+}
+
+module_init(esp6_init);
+module_exit(esp6_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
new file mode 100644
index 00000000..79a485e8
--- /dev/null
+++ b/net/ipv6/exthdrs.c
@@ -0,0 +1,898 @@
+/*
+ *	Extension Header handling for IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *	Andi Kleen		<ak@muc.de>
+ *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/* Changes:
+ *	yoshfuji		: ensure not to overrun while parsing
+ *				  tlv options.
+ *	Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs().
+ *	YOSHIFUJI Hideaki @USAGI  Register inbound extension header
+ *				  handlers as inet6_protocol{}.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/slab.h>
+
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+#include <net/xfrm.h>
+#endif
+
+#include <asm/uaccess.h>
+
+int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
+{
+	const unsigned char *nh = skb_network_header(skb);
+	int packet_len = skb->tail - skb->network_header;
+	struct ipv6_opt_hdr *hdr;
+	int len;
+	/* Find option @type in the header at @offset; -1 if absent/malformed. */
+	if (offset + 2 > packet_len)
+		goto bad;
+	hdr = (struct ipv6_opt_hdr *)(nh + offset);
+	len = ((hdr->hdrlen + 1) << 3);
+
+	if (offset + len > packet_len)
+		goto bad;
+
+	offset += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int opttype = nh[offset];
+		int optlen;
+
+		if (opttype == type)
+			return offset;
+
+		if (opttype == IPV6_TLV_PAD0) {
+			optlen = 1;	/* a lone type byte, no length field */
+		} else {
+			/* Never read the length byte past the end of the header. */
+			if (len < 2)
+				goto bad;
+			optlen = nh[offset + 1] + 2;
+			if (optlen > len)
+				goto bad;
+		}
+		offset += optlen;
+		len -= optlen;
+	}
+	/* not_found */
+ bad:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(ipv6_find_tlv);
+
+/*
+ *	Parsing tlv encoded headers.
+ *
+ *	Parsing function "func" returns 1 if parsing succeeded
+ *	and 0 if it failed.
+ *	It MUST NOT touch skb->h.
+ */
+
+struct tlvtype_proc {
+	int	type;	/* option type to match; a negative type ends the table */
+	int	(*func)(struct sk_buff *skb, int offset);	/* returns 1 to continue parsing, 0 on failure */
+};
+
+/*********************
+  Generic functions
+ *********************/
+
+/* An unknown option is detected, decide what to do */
+
+static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
+{
+	switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) {	/* top two bits of the option type */
+	case 0: /* ignore */
+		return 1;	/* caller keeps parsing */
+
+	case 1: /* drop packet */
+		break;
+
+	case 3: /* Send ICMP if not a multicast address and drop packet */
+		/* Actually, it is redundant check. icmp_send will recheck
+		   in any case.  A non-multicast destination falls through.
+		 */
+		if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
+			break;
+	case 2: /* send ICMP PARM PROB regardless and drop packet */
+		icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
+		return 0;
+	}
+
+	kfree_skb(skb);	/* "break" paths: drop without sending ICMP */
+	return 0;	/* caller must stop parsing */
+}
+
+/* Parse tlv encoded option header (hop-by-hop or destination) */
+
+static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb)
+{
+	struct tlvtype_proc *curr;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = skb_network_header_len(skb);
+	int len = (skb_transport_header(skb)[1] + 1) << 3;
+
+	if (skb_transport_offset(skb) + len > skb_headlen(skb))
+		goto bad;
+
+	off += 2;	/* skip the nexthdr and hdrlen bytes */
+	len -= 2;
+
+	while (len > 0) {
+		/* With one byte left there is no length byte to read: use 2 (> len). */
+		int optlen = len > 1 ? nh[off + 1] + 2 : 2;
+		switch (nh[off]) {
+		case IPV6_TLV_PAD0:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;	/* an overlong PadN is caught by the len == 0 test below */
+
+		default: /* Other TLV code so scan list */
+			if (optlen > len)
+				goto bad;
+			for (curr=procs; curr->type >= 0; curr++) {
+				if (curr->type == nh[off]) {
+					/* type specific length/alignment
+					   checks will be performed in the
+					   func(). */
+					if (curr->func(skb, off) == 0)
+						return 0;	/* handler failed: do not touch skb again */
+					break;
+				}
+			}
+			if (curr->type < 0) {	/* reached the table terminator: no handler */
+				if (ip6_tlvopt_unknown(skb, off) == 0)
+					return 0;
+			}
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 1;	/* options filled the header exactly */
+bad:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*****************************
+  Destination options header.
+ *****************************/
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+static int ipv6_dest_hao(struct sk_buff *skb, int optoff)
+{
+	struct ipv6_destopt_hao *hao;
+	struct inet6_skb_parm *opt = IP6CB(skb);
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct in6_addr tmp_addr;
+	int ret;
+
+	if (opt->dsthao) {	/* a HAO was already recorded for this packet */
+		LIMIT_NETDEBUG(KERN_DEBUG "hao duplicated\n");
+		goto discard;
+	}
+	opt->dsthao = opt->dst1;	/* move the offset held in dst1 into dsthao */
+	opt->dst1 = 0;
+
+	hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff);
+
+	if (hao->length != 16) {	/* the length field must be exactly 16 */
+		LIMIT_NETDEBUG(
+			KERN_DEBUG "hao invalid option length = %d\n", hao->length);
+		goto discard;
+	}
+
+	if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
+		LIMIT_NETDEBUG(
+			KERN_DEBUG "hao is not an unicast addr: %pI6\n", &hao->addr);
+		goto discard;
+	}
+
+	ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
+			       (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
+	if (unlikely(ret < 0))
+		goto discard;
+
+	if (skb_cloned(skb)) {	/* the header is rewritten below */
+		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			goto discard;
+
+		/* update all variable using below by copied skbuff */
+		hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) +
+						  optoff);
+		ipv6h = ipv6_hdr(skb);
+	}
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->ip_summed = CHECKSUM_NONE;
+
+	ipv6_addr_copy(&tmp_addr, &ipv6h->saddr);	/* swap the IPv6 source address with the HAO address */
+	ipv6_addr_copy(&ipv6h->saddr, &hao->addr);
+	ipv6_addr_copy(&hao->addr, &tmp_addr);
+
+	if (skb->tstamp.tv64 == 0)
+		__net_timestamp(skb);
+
+	return 1;	/* ip6_parse_tlv() keeps parsing */
+
+ discard:
+	kfree_skb(skb);
+	return 0;	/* ip6_parse_tlv() stops without touching skb again */
+}
+#endif
+
+static struct tlvtype_proc tlvprocdestopt_lst[] = {	/* handlers for destination options */
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	{
+		.type	= IPV6_TLV_HAO,
+		.func	= ipv6_dest_hao,
+	},
+#endif
+	{-1,			NULL}	/* terminator: ip6_parse_tlv() stops at a negative type */
+};
+
+static int ipv6_destopt_rcv(struct sk_buff *skb)
+{
+	struct inet6_skb_parm *opt = IP6CB(skb);
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	__u16 dstbuf;
+#endif
+	struct dst_entry *dst;
+
+	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||	/* first 8 bytes, then the whole header */
+	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
+				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
+		IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)),
+				 IPSTATS_MIB_INHDRERRORS);
+		kfree_skb(skb);
+		return -1;
+	}
+
+	opt->lastopt = opt->dst1 = skb_network_header_len(skb);
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	dstbuf = opt->dst1;	/* saved because ipv6_dest_hao() clears opt->dst1 */
+#endif
+
+	dst = dst_clone(skb_dst(skb));	/* held so the failure path below can still count the error */
+	if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
+		dst_release(dst);
+		skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;	/* step over this header */
+		opt = IP6CB(skb);
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		opt->nhoff = dstbuf;
+#else
+		opt->nhoff = opt->dst1;
+#endif
+		return 1;
+	}
+
+	IP6_INC_STATS_BH(dev_net(dst->dev),
+			 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+	dst_release(dst);
+	return -1;	/* ip6_parse_tlv() failed; skb is not used again here */
+}
+
+/********************************
+  Routing header.
+ ********************************/
+
+/* called with rcu_read_lock() */
+static int ipv6_rthdr_rcv(struct sk_buff *skb)
+{
+	struct inet6_skb_parm *opt = IP6CB(skb);
+	struct in6_addr *addr = NULL;	/* set only once a segment has been processed below */
+	struct in6_addr daddr;
+	struct inet6_dev *idev;
+	int n, i;
+	struct ipv6_rt_hdr *hdr;
+	struct rt0_hdr *rthdr;
+	struct net *net = dev_net(skb->dev);
+	int accept_source_route = net->ipv6.devconf_all->accept_source_route;
+
+	idev = __in6_dev_get(skb->dev);
+	if (idev && accept_source_route > idev->cnf.accept_source_route)	/* use the smaller of the two settings */
+		accept_source_route = idev->cnf.accept_source_route;
+
+	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
+	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
+				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+				 IPSTATS_MIB_INHDRERRORS);
+		kfree_skb(skb);
+		return -1;
+	}
+
+	hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);
+
+	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
+	    skb->pkt_type != PACKET_HOST) {
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+				 IPSTATS_MIB_INADDRERRORS);
+		kfree_skb(skb);
+		return -1;
+	}
+
+looped_back:
+	if (hdr->segments_left == 0) {	/* nothing left to visit: step over the header */
+		switch (hdr->type) {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		case IPV6_SRCRT_TYPE_2:
+			/* Silently discard type 2 header unless it was
+			 * processed by us (addr is set only after a loop back)
+			 */
+			if (!addr) {
+				IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+						 IPSTATS_MIB_INADDRERRORS);
+				kfree_skb(skb);
+				return -1;
+			}
+			break;
+#endif
+		default:
+			break;
+		}
+
+		opt->lastopt = opt->srcrt = skb_network_header_len(skb);
+		skb->transport_header += (hdr->hdrlen + 1) << 3;
+		opt->dst0 = opt->dst1;
+		opt->dst1 = 0;
+		opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+		return 1;
+	}
+
+	switch (hdr->type) {	/* only type 2 (with MIP6) is processed further */
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	case IPV6_SRCRT_TYPE_2:
+		if (accept_source_route < 0)
+			goto unknown_rh;
+		/* Silently discard invalid RTH type 2 */
+		if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+					 IPSTATS_MIB_INHDRERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		break;
+#endif
+	default:
+		goto unknown_rh;
+	}
+
+	/*
+	 *	This is the routing header forwarding algorithm from
+	 *	RFC 2460, page 16.
+	 */
+
+	n = hdr->hdrlen >> 1;	/* number of addresses: hdrlen is in 8-byte units */
+
+	if (hdr->segments_left > n) {
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+				 IPSTATS_MIB_INHDRERRORS);
+		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+				  ((&hdr->segments_left) -
+				   skb_network_header(skb)));
+		return -1;
+	}
+
+	/* We are about to mangle packet header. Be careful!
+	   Do not damage packets queued somewhere.
+	 */
+	if (skb_cloned(skb)) {
+		/* the copy is a forwarded packet */
+		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+					 IPSTATS_MIB_OUTDISCARDS);
+			kfree_skb(skb);
+			return -1;
+		}
+		hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);	/* re-read after pskb_expand_head() */
+	}
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->ip_summed = CHECKSUM_NONE;
+
+	i = n - --hdr->segments_left;	/* 1-based index of the address to visit next */
+
+	rthdr = (struct rt0_hdr *) hdr;
+	addr = rthdr->addr;
+	addr += i - 1;
+
+	switch (hdr->type) {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	case IPV6_SRCRT_TYPE_2:
+		if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
+				     (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
+				     IPPROTO_ROUTING) < 0) {
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+					 IPSTATS_MIB_INADDRERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+					 IPSTATS_MIB_INADDRERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		break;
+#endif
+	default:
+		break;
+	}
+
+	if (ipv6_addr_is_multicast(addr)) {
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+				 IPSTATS_MIB_INADDRERRORS);
+		kfree_skb(skb);
+		return -1;
+	}
+
+	ipv6_addr_copy(&daddr, addr);	/* swap the header destination with the selected address */
+	ipv6_addr_copy(addr, &ipv6_hdr(skb)->daddr);
+	ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &daddr);
+
+	skb_dst_drop(skb);	/* route again for the new destination */
+	ip6_route_input(skb);
+	if (skb_dst(skb)->error) {
+		skb_push(skb, skb->data - skb_network_header(skb));
+		dst_input(skb);
+		return -1;
+	}
+
+	if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {	/* new route is via a loopback device: process the header again here */
+		if (ipv6_hdr(skb)->hop_limit <= 1) {
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+					 IPSTATS_MIB_INHDRERRORS);
+			icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+				    0);
+			kfree_skb(skb);
+			return -1;
+		}
+		ipv6_hdr(skb)->hop_limit--;
+		goto looped_back;
+	}
+
+	skb_push(skb, skb->data - skb_network_header(skb));	/* make skb->data point at the IPv6 header again */
+	dst_input(skb);
+	return -1;
+
+unknown_rh:
+	IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+			  (&hdr->type) - skb_network_header(skb));
+	return -1;
+}
+
+static const struct inet6_protocol rthdr_protocol = {
+	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR,
+	.handler	= ipv6_rthdr_rcv,	/* registered for IPPROTO_ROUTING */
+};
+
+static const struct inet6_protocol destopt_protocol = {
+	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR,
+	.handler	= ipv6_destopt_rcv,	/* registered for IPPROTO_DSTOPTS */
+};
+
+static const struct inet6_protocol nodata_protocol = {
+	.flags		= INET6_PROTO_NOPOLICY,
+	.handler	= dst_discard,	/* registered for IPPROTO_NONE */
+};
+
+int __init ipv6_exthdrs_init(void)
+{
+	int ret;	/* 0 on success, else the first inet6_add_protocol() error */
+
+	ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+	if (ret)
+		goto out;
+
+	ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+	if (ret)
+		goto out_rthdr;
+
+	ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE);
+	if (ret)
+		goto out_destopt;
+
+out:
+	return ret;
+out_destopt:	/* nodata failed: remove destopt, then fall through to rthdr */
+	inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+out_rthdr:	/* destopt failed: only rthdr has been registered */
+	inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+	goto out;
+}
+
+void ipv6_exthdrs_exit(void)
+{
+	inet6_del_protocol(&nodata_protocol, IPPROTO_NONE);	/* reverse order of ipv6_exthdrs_init() */
+	inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
+	inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
+}
+
+/**********************************
+  Hop-by-hop options.
+ **********************************/
+
+/*
+ * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input().
+ */
+static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb)
+{
+	return !skb_dst(skb) ? __in6_dev_get(skb->dev) : ip6_dst_idev(skb_dst(skb));
+}
+
+static inline struct net *ipv6_skb_net(struct sk_buff *skb)
+{
+	return !skb_dst(skb) ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
+}
+
+/* Router Alert as of RFC 2711 */
+
+static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
+{
+	const unsigned char optlen = skb_network_header(skb)[optoff + 1];
+
+	/* The length byte must be exactly 2; otherwise the packet is dropped. */
+	if (optlen != 2) {
+		LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", optlen);
+		kfree_skb(skb);
+		return 0;
+	}
+	IP6CB(skb)->ra = optoff;	/* record where the option sits */
+	return 1;
+}
+
+/* Jumbo payload */
+
+static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
+{
+	const unsigned char *nh = skb_network_header(skb);
+	struct net *net = ipv6_skb_net(skb);
+	u32 pkt_len;
+
+	if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {	/* 4 data bytes at an offset of 4n + 2 */
+		LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
+			       nh[optoff+1]);
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INHDRERRORS);
+		goto drop;
+	}
+
+	pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));	/* 32-bit length right after the type and length bytes */
+	if (pkt_len <= IPV6_MAXPLEN) {	/* a jumbo length must exceed IPV6_MAXPLEN */
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INHDRERRORS);
+		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
+		return 0;
+	}
+	if (ipv6_hdr(skb)->payload_len) {	/* payload_len must be zero alongside this option */
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INHDRERRORS);
+		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
+		return 0;
+	}
+
+	if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {	/* fewer bytes received than advertised */
+		IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
+				 IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	}
+
+	if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))	/* trim to header + advertised payload */
+		goto drop;
+
+	return 1;
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+static struct tlvtype_proc tlvprochopopt_lst[] = {	/* handlers for hop-by-hop options */
+	{
+		.type	= IPV6_TLV_ROUTERALERT,
+		.func	= ipv6_hop_ra,
+	},
+	{
+		.type	= IPV6_TLV_JUMBO,
+		.func	= ipv6_hop_jumbo,
+	},
+	{ -1, }	/* terminator: ip6_parse_tlv() stops at a negative type */
+};
+
+int ipv6_parse_hopopts(struct sk_buff *skb)
+{
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	/*
+	 * skb_network_header(skb) is equal to skb->data, and
+	 * skb_network_header_len(skb) is always equal to
+	 * sizeof(struct ipv6hdr) by definition of
+	 * hop-by-hop options.
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) ||	/* first 8 bytes, then the whole header */
+	    !pskb_may_pull(skb, (sizeof(struct ipv6hdr) +
+				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
+		kfree_skb(skb);
+		return -1;
+	}
+
+	opt->hop = sizeof(struct ipv6hdr);	/* offset of the hop-by-hop header */
+	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+		skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3;	/* step over this header */
+		opt = IP6CB(skb);
+		opt->nhoff = sizeof(struct ipv6hdr);
+		return 1;
+	}
+	return -1;	/* ip6_parse_tlv() failed; skb is not used again here */
+}
+
+/*
+ *	Creating outbound headers.
+ *
+ *	"build" functions work when skb is filled from head to tail (datagram)
+ *	"push"	functions work when headers are added from tail to head (tcp)
+ *
+ *	In both cases we assume, that caller reserved enough room
+ *	for headers.
+ */
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+			    struct ipv6_rt_hdr *opt,
+			    struct in6_addr **addr_p)
+{
+	struct rt0_hdr *phdr, *ihdr;	/* phdr: copy pushed onto the skb, ihdr: caller's header */
+	int hops;
+
+	ihdr = (struct rt0_hdr *) opt;
+
+	phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
+	memcpy(phdr, ihdr, sizeof(struct rt0_hdr));	/* fixed part only; addresses are placed below */
+
+	hops = ihdr->rt_hdr.hdrlen >> 1;	/* number of addresses in the header */
+
+	if (hops > 1)
+		memcpy(phdr->addr, ihdr->addr + 1,	/* addresses 1..hops-1 move down one slot */
+		       (hops - 1) * sizeof(struct in6_addr));
+
+	ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p);	/* caller's destination goes in the last slot */
+	*addr_p = ihdr->addr;	/* caller now targets the first listed address */
+
+	phdr->rt_hdr.nexthdr = *proto;
+	*proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
+{
+	struct ipv6_opt_hdr *pushed;	/* copy of @opt prepended to the skb */
+	pushed = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
+	memcpy(pushed, opt, ipv6_optlen(opt));
+	pushed->nexthdr = *proto;	/* new header points at the previous first one */
+	*proto = type;
+}
+
+void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+			  u8 *proto, struct in6_addr **daddr)
+{
+	/*
+	 * Each push lands in front of the previous one, so the pushes
+	 * run in the reverse of the final header order.
+	 */
+	if (opt->srcrt)
+		ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+
+	/* IPV6_RTHDRDSTOPTS is ignored unless IPV6_RTHDR is set (RFC3542). */
+	if (opt->srcrt && opt->dst0opt)
+		ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+	if (opt->hopopt)
+		ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
+}
+
+EXPORT_SYMBOL(ipv6_push_nfrag_opts);
+
+void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto)
+{
+	if (opt->dst1opt)	/* the only header pushed here is dst1opt */
+		ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
+}
+
+struct ipv6_txoptions *
+ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
+{
+	struct ipv6_txoptions *opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
+	long dif;	/* byte distance from the original to the copy */
+
+	if (!opt2)
+		return NULL;
+	dif = (char *)opt2 - (char *)opt;
+	memcpy(opt2, opt, opt->tot_len);
+	if (opt2->hopopt)
+		*((char **)&opt2->hopopt) += dif;
+	if (opt2->dst0opt)
+		*((char **)&opt2->dst0opt) += dif;
+	if (opt2->dst1opt)
+		*((char **)&opt2->dst1opt) += dif;
+	if (opt2->srcrt)
+		*((char **)&opt2->srcrt) += dif;
+	return opt2;
+}
+
+EXPORT_SYMBOL_GPL(ipv6_dup_options);
+
+static int ipv6_renew_option(void *ohdr,
+			     struct ipv6_opt_hdr __user *newopt, int newoptlen,
+			     int inherit,
+			     struct ipv6_opt_hdr **hdr,
+			     char **p)
+{
+	if (inherit) {	/* keep the old header: copy it from ohdr to *p */
+		if (ohdr) {
+			memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
+			*hdr = (struct ipv6_opt_hdr *)*p;
+			*p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr));	/* advance the write cursor */
+		}
+	} else {	/* this slot is being replaced: take it from user space */
+		if (newopt) {
+			if (copy_from_user(*p, newopt, newoptlen))
+				return -EFAULT;
+			*hdr = (struct ipv6_opt_hdr *)*p;
+			if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen)	/* header claims more bytes than were supplied */
+				return -EINVAL;
+			*p += CMSG_ALIGN(newoptlen);
+		}
+	}
+	return 0;
+}
+
+struct ipv6_txoptions *
+ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
+		   int newtype,
+		   struct ipv6_opt_hdr __user *newopt, int newoptlen)
+{
+	int tot_len = 0;
+	char *p;	/* write cursor inside the new allocation */
+	struct ipv6_txoptions *opt2;
+	int err;
+
+	if (opt) {	/* size every header inherited from the old options */
+		if (newtype != IPV6_HOPOPTS && opt->hopopt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt));
+		if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt));
+		if (newtype != IPV6_RTHDR && opt->srcrt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt));
+		if (newtype != IPV6_DSTOPTS && opt->dst1opt)
+			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
+	}
+
+	if (newopt && newoptlen)
+		tot_len += CMSG_ALIGN(newoptlen);
+
+	if (!tot_len)
+		return NULL;	/* no header left to store */
+
+	tot_len += sizeof(*opt2);
+	opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC);
+	if (!opt2)
+		return ERR_PTR(-ENOBUFS);
+
+	memset(opt2, 0, tot_len);
+
+	opt2->tot_len = tot_len;
+	p = (char *)(opt2 + 1);	/* headers are packed right behind the struct */
+
+	err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
+				newtype != IPV6_HOPOPTS,
+				&opt2->hopopt, &p);
+	if (err)
+		goto out;
+
+	err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
+				newtype != IPV6_RTHDRDSTOPTS,
+				&opt2->dst0opt, &p);
+	if (err)
+		goto out;
+
+	err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
+				newtype != IPV6_RTHDR,
+				(struct ipv6_opt_hdr **)&opt2->srcrt, &p);
+	if (err)
+		goto out;
+
+	err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
+				newtype != IPV6_DSTOPTS,
+				&opt2->dst1opt, &p);
+	if (err)
+		goto out;
+
+	opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
+			  (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
+			  (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0);
+	opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
+
+	return opt2;
+out:
+	sock_kfree_s(sk, opt2, opt2->tot_len);	/* a failed renew frees the new block and returns the error */
+	return ERR_PTR(err);
+}
+
+struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+					  struct ipv6_txoptions *opt)
+{
+	/*
+	 * ignore the dest before srcrt unless srcrt is being included.
+	 * --yoshfuji
+	 */
+	if (opt && opt->dst0opt && !opt->srcrt) {
+		if (opt_space != opt) {	/* edit a copy in opt_space, not the caller's original */
+			memcpy(opt_space, opt, sizeof(*opt_space));
+			opt = opt_space;
+		}
+		opt->opt_nflen -= ipv6_optlen(opt->dst0opt);
+		opt->dst0opt = NULL;
+	}
+
+	return opt;	/* either the untouched input or the adjusted copy */
+}
+
+/**
+ * fl6_update_dst - update flowi destination address with info given
+ *                  by srcrt option, if any.
+ *
+ * @fl6: flowi6 for which daddr is to be updated
+ * @opt: struct ipv6_txoptions in which to look for srcrt opt
+ * @orig: copy of original daddr address if modified
+ *
+ * Returns NULL if no txoptions or no srcrt, otherwise returns orig
+ * and initial value of fl6->daddr set in orig
+ */
+struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
+				const struct ipv6_txoptions *opt,
+				struct in6_addr *orig)
+{
+	if (opt && opt->srcrt) {
+		ipv6_addr_copy(orig, &fl6->daddr);	/* save the original daddr */
+		ipv6_addr_copy(&fl6->daddr, ((struct rt0_hdr *)opt->srcrt)->addr);
+		return orig;
+	}
+	return NULL;	/* no routing header: fl6 is left untouched */
+}
+
+EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
new file mode 100644
index 00000000..14ed0a95
--- /dev/null
+++ b/net/ipv6/exthdrs_core.c
@@ -0,0 +1,106 @@
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+#include <net/ipv6.h>
+
+/*
+ * find out if nexthdr is a well-known extension header or a protocol
+ */
+
+int ipv6_ext_hdr(u8 nexthdr)
+{
+	/* 1 for the extension header types listed here, 0 for anything else. */
+	switch (nexthdr) {
+	case NEXTHDR_HOP: case NEXTHDR_ROUTING:
+	case NEXTHDR_FRAGMENT: case NEXTHDR_AUTH:
+	case NEXTHDR_NONE: case NEXTHDR_DEST:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Skip any extension headers. This is used by the ICMP module.
+ *
+ * Note that strictly speaking this conflicts with RFC 2460 4.0:
+ * ...The contents and semantics of each extension header determine whether
+ * or not to proceed to the next header.  Therefore, extension headers must
+ * be processed strictly in the order they appear in the packet; a
+ * receiver must not, for example, scan through a packet looking for a
+ * particular kind of extension header and process that header prior to
+ * processing all preceding ones.
+ *
+ * We do exactly this. This is a protocol bug. We can't decide after
+ * seeing an unknown discard-with-error flavour TLV option if it's an
+ * ICMP error message or not (errors should never be sent in reply to
+ * ICMP error messages).
+ *
+ * But I see no other way to do this. This might need to be reexamined
+ * when Linux implements ESP (and maybe AUTH) headers.
+ * --AK
+ *
+ * This function parses (probably truncated) exthdr set "hdr".
+ * "nexthdrp" initially points to some place,
+ * where type of the first header can be found.
+ *
+ * It skips all well-known exthdrs, and returns pointer to the start
+ * of unparsable area i.e. the first header with unknown type.
+ * If it is not NULL *nexthdr is updated by type/protocol of this header.
+ *
+ * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
+ *        - it may return pointer pointing beyond end of packet,
+ *	    if the last recognized header is truncated in the middle.
+ *        - if packet is truncated, so that all parsed headers are skipped,
+ *	    it returns NULL.
+ *	  - First fragment header is skipped, not-first ones
+ *	    are considered as unparsable.
+ *	  - ESP is unparsable for now and considered like
+ *	    normal payload protocol.
+ *	  - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ *
+ * --ANK (980726)
+ */
+
+int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp)
+{
+	u8 nexthdr = *nexthdrp;
+
+	while (ipv6_ext_hdr(nexthdr)) {
+		struct ipv6_opt_hdr _hdr, *hp;
+		int hdrlen;
+
+		if (nexthdr == NEXTHDR_NONE)
+			return -1;	/* nothing follows */
+		hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return -1;	/* header could not be read from the skb */
+		if (nexthdr == NEXTHDR_FRAGMENT) {
+			__be16 _frag_off, *fp;
+			fp = skb_header_pointer(skb,
+						start+offsetof(struct frag_hdr,
+							       frag_off),
+						sizeof(_frag_off),
+						&_frag_off);
+			if (fp == NULL)
+				return -1;
+
+			if (ntohs(*fp) & ~0x7)	/* non-zero offset: not the first fragment, stop here */
+				break;
+			hdrlen = 8;
+		} else if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hp->hdrlen+2)<<2;	/* AUTH length is counted in 4-byte units */
+		else
+			hdrlen = ipv6_optlen(hp);
+
+		nexthdr = hp->nexthdr;
+		start += hdrlen;
+	}
+
+	*nexthdrp = nexthdr;	/* type of the header found at the returned offset */
+	return start;
+}
+
+EXPORT_SYMBOL(ipv6_ext_hdr);
+EXPORT_SYMBOL(ipv6_skip_exthdr);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
new file mode 100644
index 00000000..34d244df
--- /dev/null
+++ b/net/ipv6/fib6_rules.c
@@ -0,0 +1,307 @@
+/*
+ * net/ipv6/fib6_rules.c	IPv6 Routing Policy Rules
+ *
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation, version 2.
+ *
+ * Authors
+ *	Thomas Graf		<tgraf@suug.ch>
+ *	Ville Nuorvala		<vnuorval@tcs.hut.fi>
+ */
+
+#include <linux/netdevice.h>
+
+#include <net/fib_rules.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/netlink.h>
+
+struct fib6_rule
+{
+	struct fib_rule		common;	/* kept first: code casts between fib_rule and fib6_rule pointers */
+	struct rt6key		src;	/* source prefix; plen == 0 means not checked */
+	struct rt6key		dst;	/* destination prefix; plen == 0 means not checked */
+	u8			tclass;	/* traffic class to match; 0 means not checked */
+};
+
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+				   int flags, pol_lookup_t lookup)
+{
+	struct fib_lookup_arg arg = {
+		.lookup_ptr = lookup,	/* read back by fib6_rule_action() */
+		.flags = FIB_LOOKUP_NOREF,
+	};
+
+	fib_rules_lookup(net->ipv6.fib6_rules_ops,
+			 flowi6_to_flowi(fl6), flags, &arg);
+
+	if (arg.result)
+		return arg.result;
+
+	dst_hold(&net->ipv6.ip6_null_entry->dst);	/* no result: hand back the null entry with a reference */
+	return &net->ipv6.ip6_null_entry->dst;
+}
+
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct rt6_info *rt = NULL;	/* NULL at "out" means -EAGAIN */
+	struct fib6_table *table;
+	struct net *net = rule->fr_net;
+	pol_lookup_t lookup = arg->lookup_ptr;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		rt = net->ipv6.ip6_null_entry;
+		goto discard_pkt;
+	default:
+	case FR_ACT_BLACKHOLE:
+		rt = net->ipv6.ip6_blk_hole_entry;
+		goto discard_pkt;
+	case FR_ACT_PROHIBIT:
+		rt = net->ipv6.ip6_prohibit_entry;
+		goto discard_pkt;
+	}
+
+	table = fib6_get_table(net, rule->table);
+	if (!table)
+		goto out;	/* no such table: rt is NULL, never dereference it */
+	rt = lookup(net, table, flp6, flags);
+	if (rt != net->ipv6.ip6_null_entry) {
+		struct fib6_rule *r = (struct fib6_rule *)rule;
+
+		/*
+		 * If we need to find a source address for this traffic,
+		 * we check the result if it meets requirement of the rule.
+		 */
+		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+			struct in6_addr saddr;
+
+			if (ipv6_dev_get_saddr(net,
+					       ip6_dst_idev(&rt->dst)->dev,
+					       &flp6->daddr,
+					       rt6_flags2srcprefs(flags),
+					       &saddr))
+				goto again;
+			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
+					       r->src.plen))
+				goto again;
+			ipv6_addr_copy(&flp6->saddr, &saddr);
+		}
+		goto out;
+	}
+again:	/* route unusable for this rule: drop it and report -EAGAIN */
+	dst_release(&rt->dst);
+	rt = NULL;
+	goto out;
+
+discard_pkt:	/* fixed per-net entries are returned with an extra reference */
+	dst_hold(&rt->dst);
+out:
+	arg->result = rt;
+	return rt == NULL ? -EAGAIN : 0;
+}
+
+
+static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	struct fib6_rule *r = (struct fib6_rule *) rule;
+	struct flowi6 *fl6 = &fl->u.ip6;
+
+	if (r->dst.plen &&	/* a zero prefix length means "do not check" */
+	    !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
+		return 0;
+
+	/*
+	 * If FIB_RULE_FIND_SADDR is set and we do not have a
+	 * source address for the traffic, we defer check for
+	 * source address.
+	 */
+	if (r->src.plen) {
+		if (flags & RT6_LOOKUP_F_HAS_SADDR) {
+			if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
+					       r->src.plen))
+				return 0;
+		} else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
+			return 0;
+	}
+
+	if (r->tclass && r->tclass != ((ntohl(fl6->flowlabel) >> 20) & 0xff))	/* bits 20-27 of the host-order flowlabel */
+		return 0;
+
+	return 1;	/* every configured selector matched */
+}
+
+static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
+	FRA_GENERIC_POLICY,	/* only the generic policy entries; nothing IPv6-specific is added */
+};
+
+static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
+{
+	int err = -EINVAL;
+	struct net *net = sock_net(skb->sk);
+	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+	if (rule->action == FR_ACT_TO_TBL) {
+		if (rule->table == RT6_TABLE_UNSPEC)
+			goto errout;	/* -EINVAL: a table rule needs a table */
+
+		if (fib6_new_table(net, rule->table) == NULL) {
+			err = -ENOBUFS;
+			goto errout;
+		}
+	}
+
+	if (frh->src_len)	/* NOTE(review): assumes tb[FRA_SRC] was validated by the caller - confirm */
+		nla_memcpy(&rule6->src.addr, tb[FRA_SRC],
+			   sizeof(struct in6_addr));
+
+	if (frh->dst_len)	/* NOTE(review): same assumption for tb[FRA_DST] */
+		nla_memcpy(&rule6->dst.addr, tb[FRA_DST],
+			   sizeof(struct in6_addr));
+
+	rule6->src.plen = frh->src_len;
+	rule6->dst.plen = frh->dst_len;
+	rule6->tclass = frh->tos;
+
+	err = 0;
+errout:
+	return err;
+}
+
+static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+	if (frh->src_len && (rule6->src.plen != frh->src_len))	/* zero fields in frh are not compared */
+		return 0;
+
+	if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
+		return 0;
+
+	if (frh->tos && (rule6->tclass != frh->tos))
+		return 0;
+
+	if (frh->src_len &&
+	    nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
+		return 0;
+
+	if (frh->dst_len &&
+	    nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
+		return 0;
+
+	return 1;	/* rule agrees with every selector given in frh */
+}
+
+static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct fib_rule_hdr *frh)
+{
+	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+	frh->dst_len = rule6->dst.plen;
+	frh->src_len = rule6->src.plen;
+	frh->tos = rule6->tclass;
+
+	if (rule6->dst.plen)	/* address attributes are emitted only for non-zero prefixes */
+		NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr),
+			&rule6->dst.addr);
+
+	if (rule6->src.plen)
+		NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr),
+			&rule6->src.addr);
+
+	return 0;
+
+nla_put_failure:	/* presumably the jump target of NLA_PUT on failure - confirm in netlink.h */
+	return -ENOBUFS;
+}
+
+static u32 fib6_rule_default_pref(struct fib_rules_ops *ops)
+{
+	return 0x3FFF;	/* fixed value; ops is unused. NOTE(review): confirm how the fib_rules core uses it */
+}
+
+static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
+{
+	/* Room for two 16-byte address attributes: dst and src. */
+	return 2 * nla_total_size(16);
+}
+
+static const struct fib_rules_ops __net_initdata fib6_rules_ops_template = {	/* passed to fib_rules_register() for each netns */
+	.family			= AF_INET6,
+	.rule_size		= sizeof(struct fib6_rule),
+	.addr_size		= sizeof(struct in6_addr),
+	.action			= fib6_rule_action,
+	.match			= fib6_rule_match,
+	.configure		= fib6_rule_configure,
+	.compare		= fib6_rule_compare,
+	.fill			= fib6_rule_fill,
+	.default_pref		= fib6_rule_default_pref,
+	.nlmsg_payload		= fib6_rule_nlmsg_payload,
+	.nlgroup		= RTNLGRP_IPV6_RULE,
+	.policy			= fib6_rule_policy,
+	.owner			= THIS_MODULE,
+	.fro_net		= &init_net,
+};
+
+static int __net_init fib6_rules_net_init(struct net *net)
+{
+	struct fib_rules_ops *ops;
+	int err = -ENOMEM;	/* always overwritten before it is returned */
+
+	ops = fib_rules_register(&fib6_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	net->ipv6.fib6_rules_ops = ops;
+
+
+	err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0,	/* default rule for RT6_TABLE_LOCAL */
+				   RT6_TABLE_LOCAL, 0);
+	if (err)
+		goto out_fib6_rules_ops;
+
+	err = fib_default_rule_add(net->ipv6.fib6_rules_ops,	/* default rule for RT6_TABLE_MAIN */
+				   0x7FFE, RT6_TABLE_MAIN, 0);
+	if (err)
+		goto out_fib6_rules_ops;
+
+out:
+	return err;
+
+out_fib6_rules_ops:	/* NOTE(review): net->ipv6.fib6_rules_ops still points at ops after this */
+	fib_rules_unregister(ops);
+	goto out;
+}
+
+static void __net_exit fib6_rules_net_exit(struct net *net)
+{
+	fib_rules_unregister(net->ipv6.fib6_rules_ops);	/* ops stored by fib6_rules_net_init() */
+}
+
+static struct pernet_operations fib6_rules_net_ops = {	/* registered by fib6_rules_init() */
+	.init = fib6_rules_net_init,
+	.exit = fib6_rules_net_exit,
+};
+
+int __init fib6_rules_init(void)
+{
+	return register_pernet_subsys(&fib6_rules_net_ops);	/* result is returned unchanged */
+}
+
+
+void fib6_rules_cleanup(void)
+{
+	unregister_pernet_subsys(&fib6_rules_net_ops);	/* counterpart of fib6_rules_init() */
+}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
new file mode 100644
index 00000000..11900417
--- /dev/null
+++ b/net/ipv6/icmp.c
@@ -0,0 +1,982 @@
+/*
+ *	Internet Control Message Protocol (ICMPv6)
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on net/ipv4/icmp.c
+ *
+ *	RFC 1885
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *	Changes:
+ *
+ *	Andi Kleen		:	exception handling
+ *	Andi Kleen			add rate limits. never reply to a icmp.
+ *					add more length checks and other fixes.
+ *	yoshfuji		:	ensure to send parameter problem for
+ *					fragments.
+ *	YOSHIFUJI Hideaki @USAGI:	added sysctl for icmp rate limit.
+ *	Randy Dunlap and
+ *	YOSHIFUJI Hideaki @USAGI:	Per-interface statistics support
+ *	Kazunori MIYAZAWA @USAGI:       change output process to use ip6_append_data
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmpv6.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/protocol.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/inet_common.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+/*
+ *	The ICMP socket(s). This is the most convenient way to flow control
+ *	our ICMP output as well as maintain a clean interface throughout
+ *	all layers. All Socketless IP sends will soon be gone.
+ *
+ *	On SMP we have one ICMP socket per-cpu.
+ */
+static inline struct sock *icmpv6_sk(struct net *net)
+{
+	return net->ipv6.icmp_sk[smp_processor_id()];	/* socket of the current cpu */
+}
+
+static int icmpv6_rcv(struct sk_buff *skb);
+
+static const struct inet6_protocol icmpv6_protocol = {
+	.handler	=	icmpv6_rcv,	/* registered for IPPROTO_ICMPV6 in icmpv6_init() */
+	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
+{
+	struct sock *icmp_sock;
+
+	local_bh_disable();
+
+	icmp_sock = icmpv6_sk(net);
+	if (likely(spin_trylock(&icmp_sock->sk_lock.slock)))
+		return icmp_sock;
+
+	/* The lock is already held: this can happen if the output path
+	 * (f.e. SIT or ip6ip6 tunnel) signals dst_link_failure() for an
+	 * outgoing ICMP6 packet.  Back out and report failure.
+	 */
+	local_bh_enable();
+	return NULL;
+}
+
+static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
+{
+	spin_unlock_bh(&sk->sk_lock.slock);	/* pairs with the trylock + local_bh_disable() in icmpv6_xmit_lock() */
+}
+
+/*
+ * Slightly more convenient version of icmpv6_send.
+ */
+void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
+{
+	icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos);
+	kfree_skb(skb);	/* skb is released here: callers must not use it afterwards */
+}
+
+/*
+ * Figure out whether we may reply to this packet with an icmp error.
+ *
+ * We do not reply, if:
+ *	- it was icmp error message.
+ *	- it is truncated, so that it is known, that protocol is ICMPV6
+ *	  (i.e. in the middle of some exthdr)
+ *
+ *	--ANK (980726)
+ */
+
+static int is_ineligible(struct sk_buff *skb)
+{
+	int off = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
+	int remaining = skb->len - off;
+	__u8 proto = ipv6_hdr(skb)->nexthdr;
+	u8 type_buf, *type_ptr;
+
+	/* The packet ends before the fixed IPv6 header does. */
+	if (remaining < 0)
+		return 1;
+
+	off = ipv6_skip_exthdr(skb, off, &proto);
+	if (off < 0 || proto != IPPROTO_ICMPV6)
+		return 0;
+
+	/* Inner packet is ICMPv6: only informational types get a reply. */
+	type_ptr = skb_header_pointer(skb,
+			off + offsetof(struct icmp6hdr, icmp6_type),
+			sizeof(type_buf), &type_buf);
+	if (type_ptr == NULL)
+		return 1;
+	return !(*type_ptr & ICMPV6_INFOMSG_MASK);
+}
+
+/*
+ * Check the ICMP output rate limit
+ */
+static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
+				      struct flowi6 *fl6)
+{
+	struct net *net = sock_net(sk);
+	struct dst_entry *route;
+	struct rt6_info *rt6;
+	bool allowed = false;
+	int timeout;
+
+	/*
+	 * Informational messages are never limited, and neither is
+	 * Packet Too Big: limiting it would break pmtu discovery.
+	 */
+	if ((type & ICMPV6_INFOMSG_MASK) || type == ICMPV6_PKT_TOOBIG)
+		return true;
+
+	/*
+	 * Look up the output route.
+	 * XXX: perhaps the expire for routing entries cloned by
+	 * this lookup should be more aggressive (not longer than timeout).
+	 */
+	route = ip6_route_output(net, sk, fl6);
+	if (route->error) {
+		IP6_INC_STATS(net, ip6_dst_idev(route),
+			      IPSTATS_MIB_OUTNOROUTES);
+	} else if (route->dev && (route->dev->flags & IFF_LOOPBACK)) {
+		allowed = true;
+	} else {
+		rt6 = (struct rt6_info *)route;
+		timeout = net->ipv6.sysctl.icmpv6_time;
+
+		/* Give more bandwidth to wider prefixes. */
+		if (rt6->rt6i_dst.plen < 128)
+			timeout >>= (128 - rt6->rt6i_dst.plen) >> 5;
+		if (!rt6->rt6i_peer)
+			rt6_bind_peer(rt6, 1);
+		allowed = inet_peer_xrlim_allow(rt6->rt6i_peer, timeout);
+	}
+	dst_release(route);
+	return allowed;
+}
+
+/*
+ *	an inline helper for the "simple" if statement below
+ *	checks if parameter problem report is caused by an
+ *	unrecognized IPv6 option that has the Option Type
+ *	highest-order two bits set to 10
+ */
+
+static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset)
+{
+	u8 opt_type_buf;
+	const u8 *opt_type;
+
+	opt_type = skb_header_pointer(skb, offset + skb_network_offset(skb),
+				      sizeof(opt_type_buf), &opt_type_buf);
+	/* An unreadable option byte is reported the same as a match. */
+	return opt_type == NULL || (*opt_type & 0xC0) == 0x80;
+}
+
+static int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len)
+{
+	struct sk_buff *head, *frag;
+	struct icmp6hdr *hdr;
+	__wsum sum;
+
+	/* Empty write queue: nothing to checksum or send. */
+	head = skb_peek(&sk->sk_write_queue);
+	if (head == NULL)
+		return 0;
+
+	/* Install the caller's header with a zeroed checksum field. */
+	hdr = icmp6_hdr(head);
+	memcpy(hdr, thdr, sizeof(struct icmp6hdr));
+	hdr->icmp6_cksum = 0;
+
+	if (skb_queue_len(&sk->sk_write_queue) == 1) {
+		/* Single buffer: fold the header into its own csum. */
+		head->csum = csum_partial(hdr, sizeof(struct icmp6hdr),
+					  head->csum);
+		sum = head->csum;
+	} else {
+		/* Several buffers: add up every buffer's partial sum. */
+		sum = 0;
+		skb_queue_walk(&sk->sk_write_queue, frag) {
+			sum = csum_add(sum, frag->csum);
+		}
+		sum = csum_partial(hdr, sizeof(struct icmp6hdr), sum);
+	}
+
+	/* Finish with the IPv6 pseudo-header, then send the queue. */
+	hdr->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+					   len, fl6->flowi6_proto,
+					   sum);
+	ip6_push_pending_frames(sk);
+
+	return 0;
+}
+
+struct icmpv6_msg {
+	struct sk_buff	*skb;		/* packet the payload is copied from */
+	int		offset;		/* start of the copy within skb */
+	uint8_t		type;		/* ICMPv6 type of the outgoing message */
+};
+
+static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+	struct icmpv6_msg *ctx = from;
+	struct sk_buff *src = ctx->skb;
+	__wsum part;
+
+	part = skb_copy_and_csum_bits(src, ctx->offset + offset, to, len, 0);
+	skb->csum = csum_block_add(skb->csum, part, odd);
+	/* Non-informational messages get nf_ct_attach()ed to the source. */
+	if (!(ctx->type & ICMPV6_INFOMSG_MASK))
+		nf_ct_attach(skb, src);
+	return 0;
+}
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+static void mip6_addr_swap(struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct inet6_skb_parm *parm = IP6CB(skb);
+	struct ipv6_destopt_hao *hao_opt;
+	struct in6_addr saved;
+	int off;
+
+	if (!parm->dsthao)
+		return;
+	off = ipv6_find_tlv(skb, parm->dsthao, IPV6_TLV_HAO);
+	if (unlikely(off < 0))
+		return;
+	/* Exchange the source address with the Home Address option's. */
+	hao_opt = (struct ipv6_destopt_hao *)(skb_network_header(skb) + off);
+	ipv6_addr_copy(&saved, &ip6h->saddr);
+	ipv6_addr_copy(&ip6h->saddr, &hao_opt->addr);
+	ipv6_addr_copy(&hao_opt->addr, &saved);
+}
+#else
+static inline void mip6_addr_swap(struct sk_buff *skb) {}
+#endif
+
+static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb,
+					     struct sock *sk, struct flowi6 *fl6)
+{
+	struct dst_entry *dst, *dst2;
+	struct flowi6 fl2;
+	int err;
+
+	err = ip6_dst_lookup(sk, &dst, fl6);
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * We won't send icmp if the route found for the reply is
+	 * flagged RTF_ANYCAST.
+	 */
+	if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) {
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n");
+		dst_release(dst);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Kept only for the pointer comparison below; no reference taken. */
+	dst2 = dst;
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0);
+	if (!IS_ERR(dst)) {
+		if (dst != dst2)	/* xfrm_lookup() handed back a different entry */
+			return dst;
+	} else {
+		if (PTR_ERR(dst) == -EPERM)
+			dst = NULL;	/* try the reverse-session lookup below */
+		else
+			return dst;
+	}
+
+	err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6);
+	if (err)
+		goto relookup_failed;
+
+	err = ip6_dst_lookup(sk, &dst2, &fl2);
+	if (err)
+		goto relookup_failed;
+
+	dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP);
+	if (!IS_ERR(dst2)) {
+		dst_release(dst);	/* second lookup wins; drop the first result */
+		dst = dst2;
+	} else {
+		err = PTR_ERR(dst2);
+		if (err == -EPERM) {
+			dst_release(dst);
+			return dst2;
+		} else
+			goto relookup_failed;
+	}
+
+relookup_failed:	/* also reached by falling through on success */
+	if (dst)
+		return dst;
+	return ERR_PTR(err);
+}
+
+/*
+ *	Send an ICMPv6 error (type, code, info) in reply to skb; skb is not freed.
+ */
+void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+{
+	struct net *net = dev_net(skb->dev);
+	struct inet6_dev *idev = NULL;
+	struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct sock *sk;
+	struct ipv6_pinfo *np;
+	const struct in6_addr *saddr = NULL;
+	struct dst_entry *dst;
+	struct icmp6hdr tmp_hdr;
+	struct flowi6 fl6;
+	struct icmpv6_msg msg;
+	int iif = 0;
+	int addr_type = 0;
+	int len;
+	int hlimit;
+	int err = 0;
+
+	if ((u8 *)hdr < skb->head ||
+	    (skb->network_header + sizeof(*hdr)) > skb->tail)
+		return;
+
+	/*
+	 *	Make sure we respect the rules
+	 *	i.e. RFC 1885 2.4(e)
+	 *	Rule (e.1) is enforced by not using icmpv6_send
+	 *	in any code that processes icmp errors.
+	 */
+	addr_type = ipv6_addr_type(&hdr->daddr);
+
+	if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0))
+		saddr = &hdr->daddr;
+
+	/* Dest addr check */
+
+	if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) {
+		if (type != ICMPV6_PKT_TOOBIG &&
+		    !(type == ICMPV6_PARAMPROB &&
+		      code == ICMPV6_UNK_OPTION &&
+		      (opt_unrec(skb, info))))
+			return;
+
+		saddr = NULL;
+	}
+
+	addr_type = ipv6_addr_type(&hdr->saddr);
+
+	/* Source addr check */
+
+	if (addr_type & IPV6_ADDR_LINKLOCAL)
+		iif = skb->dev->ifindex;
+
+	/*
+	 *	Must not send error if the source does not uniquely
+	 *	identify a single node (RFC2463 Section 2.4).
+	 *	We check unspecified / multicast addresses here,
+	 *	and anycast addresses will be checked later.
+	 */
+	if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
+		return;
+	}
+
+	/*
+	 *	Never answer to a ICMP packet.
+	 */
+	if (is_ineligible(skb)) {
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n");
+		return;
+	}
+
+	mip6_addr_swap(skb);
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_ICMPV6;
+	ipv6_addr_copy(&fl6.daddr, &hdr->saddr);
+	if (saddr)
+		ipv6_addr_copy(&fl6.saddr, saddr);
+	fl6.flowi6_oif = iif;
+	fl6.fl6_icmp_type = type;
+	fl6.fl6_icmp_code = code;
+	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+	sk = icmpv6_xmit_lock(net);
+	if (sk == NULL)
+		return;
+	np = inet6_sk(sk);
+
+	if (!icmpv6_xrlim_allow(sk, type, &fl6))
+		goto out;
+
+	tmp_hdr.icmp6_type = type;
+	tmp_hdr.icmp6_code = code;
+	tmp_hdr.icmp6_cksum = 0;
+	tmp_hdr.icmp6_pointer = htonl(info);
+
+	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+		fl6.flowi6_oif = np->mcast_oif;
+
+	dst = icmpv6_route_lookup(net, skb, sk, &fl6);
+	if (IS_ERR(dst))
+		goto out;
+
+	if (ipv6_addr_is_multicast(&fl6.daddr))
+		hlimit = np->mcast_hops;
+	else
+		hlimit = np->hop_limit;
+	if (hlimit < 0)
+		hlimit = ip6_dst_hoplimit(dst);
+
+	/* The reply quotes the offending packet from its network header. */
+	msg.skb = skb;
+	msg.offset = skb_network_offset(skb);
+	msg.type = type;
+
+	len = skb->len - msg.offset;
+	/* Reject a negative length before min_t() converts it to
+	 * unsigned int, where it would be clamped to a valid size.
+	 */
+	if (len < 0) {
+		LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n");
+		goto out_dst_release;
+	}
+	len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
+
+	idev = in6_dev_get(skb->dev);
+
+	err = ip6_append_data(sk, icmpv6_getfrag, &msg,
+			      len + sizeof(struct icmp6hdr),
+			      sizeof(struct icmp6hdr), hlimit,
+			      np->tclass, NULL, &fl6, (struct rt6_info*)dst,
+			      MSG_DONTWAIT, np->dontfrag);
+	if (err) {
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS);
+		ip6_flush_pending_frames(sk);
+		goto out_put;
+	}
+	err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr));
+
+out_put:
+	if (likely(idev != NULL))
+		in6_dev_put(idev);
+out_dst_release:
+	dst_release(dst);
+out:
+	icmpv6_xmit_unlock(sk);
+}
+
+EXPORT_SYMBOL(icmpv6_send);
+
+static void icmpv6_echo_reply(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	struct sock *sk;
+	struct inet6_dev *idev;
+	struct ipv6_pinfo *np;
+	const struct in6_addr *saddr = NULL;
+	struct icmp6hdr *icmph = icmp6_hdr(skb);
+	struct icmp6hdr tmp_hdr;
+	struct flowi6 fl6;
+	struct icmpv6_msg msg;
+	struct dst_entry *dst;
+	int err = 0;
+	int hlimit;
+
+	saddr = &ipv6_hdr(skb)->daddr;
+
+	if (!ipv6_unicast_destination(skb))
+		saddr = NULL;	/* non-unicast destination: leave fl6.saddr unset */
+
+	memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));	/* reply header = request header ... */
+	tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;	/* ... with only the type replaced */
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_ICMPV6;
+	ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr);
+	if (saddr)
+		ipv6_addr_copy(&fl6.saddr, saddr);
+	fl6.flowi6_oif = skb->dev->ifindex;
+	fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+	sk = icmpv6_xmit_lock(net);
+	if (sk == NULL)
+		return;
+	np = inet6_sk(sk);
+
+	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+		fl6.flowi6_oif = np->mcast_oif;
+
+	err = ip6_dst_lookup(sk, &dst, &fl6);
+	if (err)
+		goto out;
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
+	if (IS_ERR(dst))
+		goto out;
+
+	if (ipv6_addr_is_multicast(&fl6.daddr))
+		hlimit = np->mcast_hops;
+	else
+		hlimit = np->hop_limit;
+	if (hlimit < 0)
+		hlimit = ip6_dst_hoplimit(dst);
+
+	idev = in6_dev_get(skb->dev);
+
+	msg.skb = skb;
+	msg.offset = 0;	/* payload is copied starting at skb->data */
+	msg.type = ICMPV6_ECHO_REPLY;
+
+	err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
+				sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl6,
+				(struct rt6_info*)dst, MSG_DONTWAIT,
+				np->dontfrag);
+
+	if (err) {
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS);
+		ip6_flush_pending_frames(sk);
+		goto out_put;
+	}
+	err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr));
+
+out_put:
+	if (likely(idev != NULL))
+		in6_dev_put(idev);
+	dst_release(dst);
+out:
+	icmpv6_xmit_unlock(sk);
+}
+
+static void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
+{
+	const struct inet6_protocol *proto;
+	int inner_offset = sizeof(struct ipv6hdr);
+	int slot;
+	u8 nexthdr;
+
+	/* An IPv6 header must be readable at skb->data. */
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		return;
+
+	nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
+	if (ipv6_ext_hdr(nexthdr)) {
+		/* now skip over extension headers */
+		inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+						&nexthdr);
+		if (inner_offset < 0)
+			return;
+	}
+
+	/* Checking header including 8 bytes of inner protocol header. */
+	if (!pskb_may_pull(skb, inner_offset + 8))
+		return;
+
+	/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
+	   Without this we will not able f.e. to make source routed
+	   pmtu discovery.
+	   Corresponding argument (opt) to notifiers is already added.
+	   --ANK (980726)
+	 */
+
+	slot = nexthdr & (MAX_INET_PROTOS - 1);
+
+	rcu_read_lock();
+	proto = rcu_dereference(inet6_protos[slot]);
+	if (proto && proto->err_handler)
+		proto->err_handler(skb, NULL, type, code, inner_offset, info);
+	rcu_read_unlock();
+
+	raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
+}
+
+/*
+ *	Handle icmp messages
+ */
+
+static int icmpv6_rcv(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(dev);
+	const struct in6_addr *saddr, *daddr;
+	const struct ipv6hdr *orig_hdr;
+	struct icmp6hdr *hdr;
+	u8 type;
+
+	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		struct sec_path *sp = skb_sec_path(skb);
+		int nh;
+
+		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
+				 XFRM_STATE_ICMP))
+			goto drop_no_count;
+
+		if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr)))
+			goto drop_no_count;
+
+		nh = skb_network_offset(skb);
+		skb_set_network_header(skb, sizeof(*hdr));	/* temporarily point past the ICMPv6 header */
+
+		if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+			goto drop_no_count;
+
+		skb_set_network_header(skb, nh);	/* restore the saved offset */
+	}
+
+	ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS);
+
+	saddr = &ipv6_hdr(skb)->saddr;
+	daddr = &ipv6_hdr(skb)->daddr;
+
+	/* Perform checksum. */
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
+				     skb->csum))
+			break;
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len,
+					     IPPROTO_ICMPV6, 0));
+		if (__skb_checksum_complete(skb)) {
+			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%pI6 > %pI6]\n",
+				       saddr, daddr);
+			goto discard_it;
+		}
+	}
+
+	if (!pskb_pull(skb, sizeof(*hdr)))	/* on success skb->data follows the ICMPv6 header */
+		goto discard_it;
+
+	hdr = icmp6_hdr(skb);
+
+	type = hdr->icmp6_type;
+
+	ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type);
+
+	switch (type) {
+	case ICMPV6_ECHO_REQUEST:
+		icmpv6_echo_reply(skb);
+		break;
+
+	case ICMPV6_ECHO_REPLY:
+		/* we couldn't care less */
+		break;
+
+	case ICMPV6_PKT_TOOBIG:
+		/* BUGGG_FUTURE: if packet contains rthdr, we cannot update
+		   standard destination cache. Seems, only "advanced"
+		   destination cache will allow to solve this problem
+		   --ANK (980726)
+		 */
+		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+			goto discard_it;
+		hdr = icmp6_hdr(skb);	/* re-read after pskb_may_pull() */
+		orig_hdr = (struct ipv6hdr *) (hdr + 1);
+		rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev,
+				   ntohl(hdr->icmp6_mtu));
+
+		/*
+		 *	Drop through to notify
+		 */
+
+	case ICMPV6_DEST_UNREACH:
+	case ICMPV6_TIME_EXCEED:
+	case ICMPV6_PARAMPROB:
+		icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+		break;
+
+	case NDISC_ROUTER_SOLICITATION:
+	case NDISC_ROUTER_ADVERTISEMENT:
+	case NDISC_NEIGHBOUR_SOLICITATION:
+	case NDISC_NEIGHBOUR_ADVERTISEMENT:
+	case NDISC_REDIRECT:
+		ndisc_rcv(skb);
+		break;
+
+	case ICMPV6_MGM_QUERY:
+		igmp6_event_query(skb);
+		break;
+
+	case ICMPV6_MGM_REPORT:
+		igmp6_event_report(skb);
+		break;
+
+	case ICMPV6_MGM_REDUCTION:
+	case ICMPV6_NI_QUERY:
+	case ICMPV6_NI_REPLY:
+	case ICMPV6_MLD2_REPORT:
+	case ICMPV6_DHAAD_REQUEST:
+	case ICMPV6_DHAAD_REPLY:
+	case ICMPV6_MOBILE_PREFIX_SOL:
+	case ICMPV6_MOBILE_PREFIX_ADV:
+		break;	/* accepted, but nothing is done with them here */
+
+	default:
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");
+
+		/* informational */
+		if (type & ICMPV6_INFOMSG_MASK)
+			break;
+
+		/*
+		 * error of unknown type.
+		 * must pass to upper level
+		 */
+
+		icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+	}
+
+	kfree_skb(skb);
+	return 0;
+
+discard_it:
+	ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS);
+drop_no_count:
+	kfree_skb(skb);	/* every path frees skb and returns 0 */
+	return 0;
+}
+
+void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
+		      u8 type,
+		      const struct in6_addr *saddr,
+		      const struct in6_addr *daddr,
+		      int oif)
+{
+	/* Start from an all-zero flow, so the ICMP code stays 0. */
+	memset(fl6, 0, sizeof(*fl6));
+	fl6->flowi6_proto = IPPROTO_ICMPV6;
+	fl6->flowi6_oif = oif;
+	fl6->fl6_icmp_type = type;
+	ipv6_addr_copy(&fl6->saddr, saddr);
+	ipv6_addr_copy(&fl6->daddr, daddr);
+	security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+}
+
+/*
+ * Special lock-class for __icmpv6_sk:
+ */
+static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
+
+static int __net_init icmpv6_sk_init(struct net *net)
+{
+	struct sock *sk;
+	int err, i, j;
+
+	net->ipv6.icmp_sk =	/* zeroed array, one slot per cpu id */
+		kcalloc(nr_cpu_ids, sizeof(struct sock *), GFP_KERNEL);
+	if (net->ipv6.icmp_sk == NULL)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		err = inet_ctl_sock_create(&sk, PF_INET6,
+					   SOCK_RAW, IPPROTO_ICMPV6, net);
+		if (err < 0) {
+			printk(KERN_ERR
+			       "Failed to initialize the ICMP6 control socket "
+			       "(err %d).\n",
+			       err);
+			goto fail;
+		}
+
+		net->ipv6.icmp_sk[i] = sk;
+
+		/*
+		 * Split off their lock-class, because sk->sk_dst_lock
+		 * gets used from softirqs, which is safe for
+		 * __icmpv6_sk (because those never get directly used
+		 * via userspace syscalls), but unsafe for normal sockets.
+		 */
+		lockdep_set_class(&sk->sk_dst_lock,
+				  &icmpv6_socket_sk_dst_lock_key);
+
+		/* Enough space for 2 64K ICMP packets, including
+		 * sk_buff struct overhead.
+		 */
+		sk->sk_sndbuf =
+			(2 * ((64 * 1024) + sizeof(struct sk_buff)));
+	}
+	return 0;
+
+ fail:
+	for (j = 0; j < i; j++)	/* slots below the cpu that failed */
+		inet_ctl_sock_destroy(net->ipv6.icmp_sk[j]);
+	kfree(net->ipv6.icmp_sk);
+	return err;
+}
+
+static void __net_exit icmpv6_sk_exit(struct net *net)
+{
+	int cpu;
+
+	/* Release every per-cpu control socket, then the array itself. */
+	for_each_possible_cpu(cpu)
+		inet_ctl_sock_destroy(net->ipv6.icmp_sk[cpu]);
+	kfree(net->ipv6.icmp_sk);
+}
+
+static struct pernet_operations icmpv6_sk_ops = {
+       .init = icmpv6_sk_init,	/* creates the per-cpu ICMPv6 sockets of a net */
+       .exit = icmpv6_sk_exit,	/* destroys them and frees the array */
+};
+
+int __init icmpv6_init(void)
+{
+	int rc;
+
+	/* Per-namespace sockets first, then the protocol handler. */
+	rc = register_pernet_subsys(&icmpv6_sk_ops);
+	if (rc < 0)
+		return rc;
+
+	if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) >= 0)
+		return 0;
+
+	/* Handler registration failed: report it and undo the
+	 * pernet registration. */
+	printk(KERN_ERR "Failed to register ICMP6 protocol\n");
+	unregister_pernet_subsys(&icmpv6_sk_ops);
+	return -EAGAIN;
+}
+
+void icmpv6_cleanup(void)
+{
+	unregister_pernet_subsys(&icmpv6_sk_ops);	/* ops whose .exit is icmpv6_sk_exit() */
+	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);	/* handler added in icmpv6_init() */
+}
+
+
+static const struct icmp6_err {
+	int err;	/* errno value stored through icmpv6_err_convert()'s @err */
+	int fatal;	/* value icmpv6_err_convert() returns for this code */
+} tab_unreach[] = {
+	{	/* NOROUTE */
+		.err	= ENETUNREACH,
+		.fatal	= 0,
+	},
+	{	/* ADM_PROHIBITED */
+		.err	= EACCES,
+		.fatal	= 1,
+	},
+	{	/* Was NOT_NEIGHBOUR, now reserved */
+		.err	= EHOSTUNREACH,
+		.fatal	= 0,
+	},
+	{	/* ADDR_UNREACH	*/
+		.err	= EHOSTUNREACH,
+		.fatal	= 0,
+	},
+	{	/* PORT_UNREACH	*/
+		.err	= ECONNREFUSED,
+		.fatal	= 1,
+	},
+};
+
+int icmpv6_err_convert(u8 type, u8 code, int *err)
+{
+	/* Anything not handled below is reported as non-fatal EPROTO. */
+	*err = EPROTO;
+
+	switch (type) {
+	case ICMPV6_DEST_UNREACH:
+		/* Codes past the table keep EPROTO and are fatal. */
+		if (code > ICMPV6_PORT_UNREACH)
+			return 1;
+
+		*err = tab_unreach[code].err;
+		return tab_unreach[code].fatal;
+
+	case ICMPV6_PKT_TOOBIG:
+		*err = EMSGSIZE;
+		return 0;
+
+	case ICMPV6_PARAMPROB:
+		*err = EPROTO;
+		return 1;
+
+	case ICMPV6_TIME_EXCEED:
+		*err = EHOSTUNREACH;
+		return 0;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL(icmpv6_err_convert);
+
+#ifdef CONFIG_SYSCTL
+ctl_table ipv6_icmp_table_template[] = {
+	{
+		.procname	= "ratelimit",
+		.data		= &init_net.ipv6.sysctl.icmpv6_time,	/* repointed per net by ipv6_icmp_sysctl_init() */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{ },
+};
+
+struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
+{
+	struct ctl_table *copy;
+
+	/* Duplicate the template for this namespace ... */
+	copy = kmemdup(ipv6_icmp_table_template,
+		       sizeof(ipv6_icmp_table_template), GFP_KERNEL);
+	if (!copy)
+		return NULL;
+	/* ... and aim its first entry at the namespace's own value. */
+	copy[0].data = &net->ipv6.sysctl.icmpv6_time;
+	return copy;
+}
+#endif
+
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 00000000..8a58e8cf
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,252 @@
+/*
+ * INET        An implementation of the TCP/IP protocol suite for the LINUX
+ *             operating system.  INET is implemented using the  BSD Socket
+ *             interface as the means of communication with the user level.
+ *
+ *             Support for INET6 connection oriented protocols.
+ *
+ * Authors:    See the TCPv6 sources
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+
+#include <net/addrconf.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/inet_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/sock.h>
+#include <net/inet6_connection_sock.h>
+
+int inet6_csk_bind_conflict(const struct sock *sk,
+			    const struct inet_bind_bucket *tb)
+{
+	const struct sock *sk2;
+	const struct hlist_node *node;
+
+	/* We must walk the whole port owner list in this case. -DaveM */
+	/*
+	 * See comment in inet_csk_bind_conflict about sock lookup
+	 * vs net namespaces issues.
+	 */
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk != sk2 &&
+		    (!sk->sk_bound_dev_if ||
+		     !sk2->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+		    (!sk->sk_reuse || !sk2->sk_reuse ||
+		     sk2->sk_state == TCP_LISTEN) &&
+		     ipv6_rcv_saddr_equal(sk, sk2))
+			break;	/* sk2 conflicts with sk; node stays on it */
+	}
+
+	return node != NULL;	/* non-NULL only when the walk stopped at the break */
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
+
+struct dst_entry *inet6_csk_route_req(struct sock *sk,
+				      const struct request_sock *req)
+{
+	struct inet6_request_sock *treq = inet6_rsk(req);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct in6_addr final_dst, *final_ptr;
+	struct dst_entry *route;
+	struct flowi6 fl6;
+
+	/* Describe the flow of the connection request. */
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_TCP;
+	fl6.flowi6_oif = sk->sk_bound_dev_if;
+	fl6.flowi6_mark = sk->sk_mark;
+	fl6.fl6_sport = inet_rsk(req)->loc_port;
+	fl6.fl6_dport = inet_rsk(req)->rmt_port;
+	ipv6_addr_copy(&fl6.saddr, &treq->loc_addr);
+	ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr);
+	/* fl6_update_dst() is handed the flow once daddr is filled in. */
+	final_ptr = fl6_update_dst(&fl6, np->opt, &final_dst);
+	security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+	/* Callers get NULL, not an ERR_PTR, when the lookup fails. */
+	route = ip6_dst_lookup_flow(sk, &fl6, final_ptr, false);
+	return IS_ERR(route) ? NULL : route;
+}
+
+/*
+ * request_sock (formerly open request) hash tables.
+ */
+static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
+			   const u32 rnd, const u16 synq_hsize)
+{
+	u32 addr_hash, full_hash;
+
+	/* First three address words, seeded with rnd. */
+	addr_hash = jhash_3words((__force u32)raddr->s6_addr32[0],
+				 (__force u32)raddr->s6_addr32[1],
+				 (__force u32)raddr->s6_addr32[2], rnd);
+
+	/* Then mix in the last address word and the remote port. */
+	full_hash = jhash_2words((__force u32)raddr->s6_addr32[3],
+				 (__force u32)rport, addr_hash);
+
+	return full_hash & (synq_hsize - 1);
+}
+
+struct request_sock *inet6_csk_search_req(const struct sock *sk,
+					  struct request_sock ***prevp,
+					  const __be16 rport,
+					  const struct in6_addr *raddr,
+					  const struct in6_addr *laddr,
+					  const int iif)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	struct request_sock *req, **link;
+	const struct inet6_request_sock *treq;
+	u32 bucket = inet6_synq_hash(raddr, rport, lopt->hash_rnd,
+				     lopt->nr_table_entries);
+
+	/* Walk the bucket's chain, remembering the link that points at req. */
+	for (link = &lopt->syn_table[bucket]; (req = *link) != NULL;
+	     link = &req->dl_next) {
+		treq = inet6_rsk(req);
+		if (inet_rsk(req)->rmt_port != rport ||
+		    req->rsk_ops->family != AF_INET6 ||
+		    !ipv6_addr_equal(&treq->rmt_addr, raddr) ||
+		    !ipv6_addr_equal(&treq->loc_addr, laddr) ||
+		    (treq->iif && treq->iif != iif))
+			continue;
+		WARN_ON(req->sk != NULL);
+		*prevp = link;
+		return req;
+	}
+
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_search_req);
+
+void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+				    struct request_sock *req,
+				    const unsigned long timeout)
+{
+	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+	struct listen_sock *lopt = queue->listen_opt;
+	const u32 bucket = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
+					   inet_rsk(req)->rmt_port,
+					   lopt->hash_rnd, lopt->nr_table_entries);
+
+	reqsk_queue_hash_req(queue, bucket, req, timeout);
+	inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+
+void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = inet_sk(sk)->inet_dport;
+	ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
+	sin6->sin6_flowinfo = 0;	/* received flowlabel is not stored for TCP */
+	/* Scope id: only for a link-local peer on a device-bound socket. */
+	sin6->sin6_scope_id = 0;
+	if (sk->sk_bound_dev_if &&
+	    (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		sin6->sin6_scope_id = sk->sk_bound_dev_if;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+
+static inline
+void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
+			   struct in6_addr *daddr, struct in6_addr *saddr)
+{
+	__ip6_dst_store(sk, dst, daddr, saddr);
+
+#ifdef CONFIG_XFRM
+	{
+		struct rt6_info *rt = (struct rt6_info  *)dst;
+		rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid);	/* compared in __inet6_csk_dst_check() */
+	}
+#endif
+}
+
+static inline
+struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *cached = __sk_dst_check(sk, cookie);
+
+#ifdef CONFIG_XFRM
+	/* Drop a route stored under a different flow-cache generation. */
+	if (cached) {
+		struct rt6_info *rt = (struct rt6_info *)cached;
+
+		if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) {
+			__sk_dst_reset(sk);
+			cached = NULL;
+		}
+	}
+#endif
+
+	return cached;
+}
+
+int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	struct in6_addr *final_p, final;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = sk->sk_protocol;
+	ipv6_addr_copy(&fl6.daddr, &np->daddr);
+	ipv6_addr_copy(&fl6.saddr, &np->saddr);
+	fl6.flowlabel = np->flow_label;
+	IP6_ECN_flow_xmit(sk, fl6.flowlabel);
+	fl6.flowi6_oif = sk->sk_bound_dev_if;
+	fl6.flowi6_mark = sk->sk_mark;
+	fl6.fl6_sport = inet->inet_sport;
+	fl6.fl6_dport = inet->inet_dport;
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+	dst = __inet6_csk_dst_check(sk, np->dst_cookie);	/* reuse the socket's cached route if still valid */
+
+	if (dst == NULL) {
+		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+
+		if (IS_ERR(dst)) {
+			sk->sk_err_soft = -PTR_ERR(dst);	/* stored as a positive errno */
+			sk->sk_route_caps = 0;
+			kfree_skb(skb);	/* skb is freed on this error path */
+			return PTR_ERR(dst);
+		}
+
+		__inet6_csk_dst_store(sk, dst, NULL, NULL);
+	}
+
+	skb_dst_set(skb, dst_clone(dst));
+
+	/* Restore final destination back after routing done */
+	ipv6_addr_copy(&fl6.daddr, &np->daddr);
+
+	return ip6_xmit(sk, skb, &fl6, np->opt);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
new file mode 100644
index 00000000..73f1a00a
--- /dev/null
+++ b/net/ipv6/inet6_hashtables.c
@@ -0,0 +1,304 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic INET6 transport hashtables
+ *
+ * Authors:	Lotsa people, from code originally in tcp, generalised here
+ * 		by Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/secure_seq.h>
+#include <net/ip.h>
+
+int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw)
+{
+	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	int twrefcnt = 0;	/* inet_twsk_unhash() result, else 0 */
+
+	WARN_ON(!sk_unhashed(sk));
+	/* Listening sockets use the listening hash, all others the ehash. */
+	if (sk->sk_state == TCP_LISTEN) {
+		struct inet_listen_hashbucket *ilb;
+
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		spin_lock(&ilb->lock);
+		__sk_nulls_add_node_rcu(sk, &ilb->head);
+		spin_unlock(&ilb->lock);
+	} else {
+		unsigned int hash;
+		struct hlist_nulls_head *list;
+		spinlock_t *lock;
+
+		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
+		list = &inet_ehash_bucket(hashinfo, hash)->chain;
+		lock = inet_ehash_lockp(hashinfo, hash);
+		spin_lock(lock);
+		__sk_nulls_add_node_rcu(sk, list);
+		if (tw) {	/* unhash tw under the same bucket lock */
+			WARN_ON(sk->sk_hash != tw->tw_hash);
+			twrefcnt = inet_twsk_unhash(tw);
+		}
+		spin_unlock(lock);
+	}
+
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	return twrefcnt;
+}
+EXPORT_SYMBOL(__inet6_hash);
+
+/*
+ * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ *
+ * Lockless: both chains are walked under rcu_read_lock(), taken below.
+ */
+struct sock *__inet6_lookup_established(struct net *net,
+					struct inet_hashinfo *hashinfo,
+					const struct in6_addr *saddr,
+					const __be16 sport,
+					const struct in6_addr *daddr,
+					const u16 hnum,
+					const int dif)
+{
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+	unsigned int slot = hash & hashinfo->ehash_mask;
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+	/* Each hit is matched again once its refcount is held. */
+	rcu_read_lock();
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
+		/* For IPV6 do the cheaper port and family tests first. */
+		if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+				goto begintw;
+			if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+				sock_put(sk);
+				goto begin;
+			}
+			goto out;
+		}
+	}
+	if (get_nulls_value(node) != slot)
+		goto begin;
+
+begintw:
+	/* Must check for a TIME_WAIT'er before going to listener hash. */
+	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+		if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+				sk = NULL;
+				goto out;
+			}
+			if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+				sock_put(sk);
+				goto begintw;
+			}
+			goto out;
+		}
+	}
+	if (get_nulls_value(node) != slot)
+		goto begintw;
+	sk = NULL;
+out:
+	rcu_read_unlock();
+	return sk;
+}
+EXPORT_SYMBOL(__inet6_lookup_established);
+
+static inline int compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum,
+				const struct in6_addr *daddr,
+				const int dif)
+{
+	const struct ipv6_pinfo *np;
+	int score = 1;		/* base once netns, port and family match */
+
+	if (!net_eq(sock_net(sk), net) || inet_sk(sk)->inet_num != hnum ||
+	    sk->sk_family != PF_INET6)
+		return -1;
+	np = inet6_sk(sk);
+	if (!ipv6_addr_any(&np->rcv_saddr)) {
+		/* Bound to an address: it must be daddr, and earns a point. */
+		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+			return -1;
+		score++;
+	}
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		score++;
+	}
+	return score;
+}
+
+struct sock *inet6_lookup_listener(struct net *net,
+		struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
+		const unsigned short hnum, const int dif)
+{
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+	struct sock *result;
+	int score, hiscore;
+	unsigned int hash = inet_lhashfn(net, hnum);
+	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+
+	rcu_read_lock();
+begin:
+	result = NULL;
+	hiscore = -1;
+	/* Only rcu_read_lock() is held, so use the RCU-safe iterator. */
+	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+		score = compute_score(sk, net, hnum, daddr, dif);
+		if (score > hiscore) {
+			hiscore = score;
+			result = sk;
+		}
+	}
+	/*
+	 * A nulls value other than this bucket's means we probably met an
+	 * item that was moved to another chain: restart the lookup.
+	 */
+	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+		goto begin;
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, daddr,
+				  dif) < hiscore)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+
+struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+			  const struct in6_addr *saddr, const __be16 sport,
+			  const struct in6_addr *daddr, const __be16 dport,
+			  const int dif)
+{
+	const u16 hnum = ntohs(dport);	/* local port, host byte order */
+	struct sock *match;
+
+	local_bh_disable();
+	match = __inet6_lookup(net, hashinfo, saddr, sport, daddr, hnum, dif);
+	local_bh_enable();
+	return match;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup);
+
+static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+				     struct sock *sk, const __u16 lport,
+				     struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct in6_addr *daddr = &np->rcv_saddr;
+	const struct in6_addr *saddr = &np->daddr;
+	const int dif = sk->sk_bound_dev_if;
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
+	struct net *net = sock_net(sk);
+	const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
+						inet->inet_dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_nulls_node *node;
+	struct inet_timewait_sock *tw;
+	int twrefcnt = 0;
+	/* As on the lookup path: daddr is our rcv_saddr, saddr the peer. */
+	spin_lock(lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_nulls_for_each(sk2, node, &head->twchain) {
+		tw = inet_twsk(sk2);
+
+		if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) {
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_nulls_for_each(sk2, node, &head->chain) {
+		if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->inet_num = lport;
+	inet->inet_sport = htons(lport);
+	sk->sk_hash = hash;
+	WARN_ON(!sk_unhashed(sk));
+	__sk_nulls_add_node_rcu(sk, &head->chain);
+	if (tw) {
+		twrefcnt = inet_twsk_unhash(tw);
+		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+	}
+	spin_unlock(lock);
+	if (twrefcnt)
+		inet_twsk_put(tw);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+	if (twp) {	/* report tw (or NULL) to the caller */
+		*twp = tw;
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+
+		inet_twsk_put(tw);
+	}
+	return 0;
+
+not_unique:
+	spin_unlock(lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet6_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *isk = inet_sk(sk);
+	const struct ipv6_pinfo *pinfo = inet6_sk(sk);
+	const __be32 *local = pinfo->rcv_saddr.s6_addr32;
+	return secure_ipv6_port_ephemeral(local, pinfo->daddr.s6_addr32,
+					  isk->inet_dport);
+}
+
+int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+		       struct sock *sk)
+{
+	return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk),
+			__inet6_check_established, __inet6_hash);
+}
+
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
new file mode 100644
index 00000000..0f9b37a1
--- /dev/null
+++ b/net/ipv6/ip6_fib.c
@@ -0,0 +1,1606 @@
+/*
+ *	Linux INET6 implementation
+ *	Forwarding Information Database
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * 	Changes:
+ * 	Yuji SEKIYA @USAGI:	Support default route on router node;
+ * 				remove ip6_null_entry from the top of
+ * 				routing table.
+ * 	Ville Nuorvala:		Fixed routing subtrees.
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/route.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#ifdef 	CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#endif
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+
+#define RT6_DEBUG 2
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) printk(KERN_DEBUG x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
+
+static struct kmem_cache * fib6_node_kmem __read_mostly;
+
+enum fib_walk_state_t
+{
+#ifdef CONFIG_IPV6_SUBTREES
+	FWS_S,
+#endif
+	FWS_L,
+	FWS_R,
+	FWS_C,
+	FWS_U
+};
+
+struct fib6_cleaner_t
+{
+	struct fib6_walker_t w;
+	struct net *net;
+	int (*func)(struct rt6_info *, void *arg);
+	void *arg;
+};
+
+static DEFINE_RWLOCK(fib6_walker_lock);
+
+#ifdef CONFIG_IPV6_SUBTREES
+#define FWS_INIT FWS_S
+#else
+#define FWS_INIT FWS_L
+#endif
+
+static void fib6_prune_clones(struct net *net, struct fib6_node *fn,
+			      struct rt6_info *rt);
+static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
+static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
+static int fib6_walk(struct fib6_walker_t *w);
+static int fib6_walk_continue(struct fib6_walker_t *w);
+
+/*
+ *	A routing update causes an increase of the serial number on the
+ *	affected subtree. This allows for cached routes to be asynchronously
+ *	tested when modifications are made to the destination cache as a
+ *	result of redirects, path MTU changes, etc.
+ */
+
+static __u32 rt_sernum;
+
+static void fib6_gc_timer_cb(unsigned long arg);
+
+static LIST_HEAD(fib6_walkers);
+#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
+
+static inline void fib6_walker_link(struct fib6_walker_t *w)
+{
+	write_lock_bh(&fib6_walker_lock);
+	list_add(&w->lh, &fib6_walkers);
+	write_unlock_bh(&fib6_walker_lock);
+}
+
+static inline void fib6_walker_unlink(struct fib6_walker_t *w)
+{
+	write_lock_bh(&fib6_walker_lock);
+	list_del(&w->lh);
+	write_unlock_bh(&fib6_walker_lock);
+}
+static __inline__ u32 fib6_new_sernum(void)
+{
+	u32 next = rt_sernum + 1;
+	/* Keep the serial strictly positive when viewed as __s32. */
+	rt_sernum = ((__s32)next <= 0) ? 1 : next;
+	return rt_sernum;
+}
+
+/*
+ *	Auxiliary address test functions for the radix tree.
+ *
+ *	These assume a 32bit processor (although it will work on
+ *	64bit processors)
+ */
+
+/*
+ *	test bit
+ */
+#if defined(__LITTLE_ENDIAN)
+# define BITOP_BE32_SWIZZLE	(0x1F & ~7)
+#else
+# define BITOP_BE32_SWIZZLE	0
+#endif
+
+static __inline__ __be32 addr_bit_set(const void *token, int fn_bit)
+{
+	const __be32 *words = token;
+	/*
+	 * The shift below,
+	 *	(~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f
+	 * makes 1 << shift an optimized version of
+	 *	htonl(1 << ((~fn_bit)&0x1F))
+	 * See include/asm-generic/bitops/le.h.
+	 */
+	const int shift = (~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f;
+	return words[fn_bit >> 5] & (__force __be32)(1 << shift);
+}
+
+static __inline__ struct fib6_node * node_alloc(void)
+{
+	/*
+	 * Zero-filled node from the fib6 slab cache; allocated with
+	 * GFP_ATOMIC, so the result may be NULL and callers must check it.
+	 */
+	return kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
+}
+
+static __inline__ void node_free(struct fib6_node * fn)
+{
+	kmem_cache_free(fib6_node_kmem, fn);
+}
+
+static __inline__ void rt6_release(struct rt6_info *rt)
+{
+	if (atomic_dec_and_test(&rt->rt6i_ref))
+		dst_free(&rt->dst);
+}
+
+static void fib6_link_table(struct net *net, struct fib6_table *tb)
+{
+	unsigned int h;
+
+	/*
+	 * Initialize table lock at a single place to give lockdep a key,
+	 * tables aren't visible prior to being linked to the list.
+	 */
+	rwlock_init(&tb->tb6_lock);
+
+	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
+
+	/*
+	 * No protection necessary, this is the only list mutation
+	 * operation, tables never disappear once they exist.
+	 */
+	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
+}
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
+{
+	struct fib6_table *tb = kzalloc(sizeof(*tb), GFP_ATOMIC);
+
+	if (!tb)
+		return NULL;
+
+	/* The root starts out holding only the namespace's null entry. */
+	tb->tb6_id = id;
+	tb->tb6_root.leaf = net->ipv6.ip6_null_entry;
+	tb->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+	return tb;
+}
+
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
+{
+	struct fib6_table *table;
+
+	if (!id)
+		id = RT6_TABLE_MAIN;	/* 0 means the main table */
+
+	table = fib6_get_table(net, id);
+	if (table == NULL) {
+		/* Not present yet: allocate it and link it into the hash. */
+		table = fib6_alloc_table(net, id);
+		if (table)
+			fib6_link_table(net, table);
+	}
+	return table;
+}
+
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
+{
+	struct fib6_table *tb, *found = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *pos;
+
+	if (!id)
+		id = RT6_TABLE_MAIN;
+
+	/* Tables are hashed by the low bits of their id. */
+	bucket = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tb, pos, bucket, tb6_hlist) {
+		if (tb->tb6_id == id) {
+			found = tb;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+static void __net_init fib6_tables_init(struct net *net)
+{
+	fib6_link_table(net, net->ipv6.fib6_main_tbl);
+	fib6_link_table(net, net->ipv6.fib6_local_tbl);
+}
+#else
+
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
+{
+	return fib6_get_table(net, id);
+}
+
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
+{
+	  return net->ipv6.fib6_main_tbl;
+}
+
+struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+				   int flags, pol_lookup_t lookup)
+{
+	return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+}
+
+static void __net_init fib6_tables_init(struct net *net)
+{
+	fib6_link_table(net, net->ipv6.fib6_main_tbl);
+}
+
+#endif
+
+static int fib6_dump_node(struct fib6_walker_t *w)
+{
+	struct rt6_info *cur = w->leaf;
+
+	while (cur != NULL) {
+		int rc = rt6_dump_route(cur, w->args);
+		if (rc < 0) {
+			/* Frame is full: remember where to resume, suspend. */
+			w->leaf = cur;
+			return 1;
+		}
+		WARN_ON(rc == 0);
+		cur = cur->dst.rt6_next;
+	}
+	w->leaf = NULL;
+	return 0;
+}
+
+static void fib6_dump_end(struct netlink_callback *cb)
+{
+	struct fib6_walker_t *w = (void*)cb->args[2];
+	/* args[2]: walker, args[3]: saved ->done, args[4]: walker linked */
+	if (w) {
+		if (cb->args[4]) {
+			cb->args[4] = 0;
+			fib6_walker_unlink(w);
+		}
+		cb->args[2] = 0;
+		kfree(w);
+	}
+	cb->done = (void*)cb->args[3];
+	cb->args[1] = 3;
+}
+
+static int fib6_dump_done(struct netlink_callback *cb)
+{
+	fib6_dump_end(cb);
+	return cb->done ? cb->done(cb) : 0;
+}
+
+static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
+			   struct netlink_callback *cb)
+{
+	struct fib6_walker_t *w;
+	int res;
+
+	w = (void *)cb->args[2];
+	w->root = &table->tb6_root;
+	/* args[4] == 0 starts a new walk; otherwise the earlier one resumes. */
+	if (cb->args[4] == 0) {
+		w->count = 0;
+		w->skip = 0;
+
+		read_lock_bh(&table->tb6_lock);
+		res = fib6_walk(w);
+		read_unlock_bh(&table->tb6_lock);
+		if (res > 0) {
+			cb->args[4] = 1;	/* resume on the next call */
+			cb->args[5] = w->root->fn_sernum;
+		}
+	} else {
+		if (cb->args[5] != w->root->fn_sernum) {
+			/* Begin at the root if the tree (serial) changed */
+			cb->args[5] = w->root->fn_sernum;
+			w->state = FWS_INIT;
+			w->node = w->root;
+			w->skip = w->count;
+		} else
+			w->skip = 0;
+
+		read_lock_bh(&table->tb6_lock);
+		res = fib6_walk_continue(w);
+		read_unlock_bh(&table->tb6_lock);
+		if (res <= 0) {
+			fib6_walker_unlink(w);
+			cb->args[4] = 0;
+		}
+	}
+
+	return res;
+}
+
+static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
+	struct rt6_rtnl_dump_arg arg;
+	struct fib6_walker_t *w;
+	struct fib6_table *tb;
+	struct hlist_node *node;
+	struct hlist_head *head;
+	int res = 0;
+
+	s_h = cb->args[0];
+	s_e = cb->args[1];
+
+	w = (void *)cb->args[2];
+	if (w == NULL) {
+		/* New dump:
+		 *
+		 * 1. hook callback destructor.
+		 */
+		cb->args[3] = (long)cb->done;
+		cb->done = fib6_dump_done;
+
+		/*
+		 * 2. allocate and initialize walker.
+		 */
+		w = kzalloc(sizeof(*w), GFP_ATOMIC);
+		if (w == NULL)
+			return -ENOMEM;
+		w->func = fib6_dump_node;
+		cb->args[2] = (long)w;
+	}
+
+	arg.skb = skb;
+	arg.cb = cb;
+	arg.net = net;
+	w->args = &arg;
+
+	rcu_read_lock();
+	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
+		e = 0;
+		head = &net->ipv6.fib_table_hash[h];
+		hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) {
+			if (e < s_e)
+				goto next;
+			res = fib6_dump_table(tb, skb, cb);
+			if (res != 0)
+				goto out;
+next:
+			e++;
+		}
+	}
+out:
+	rcu_read_unlock();
+	cb->args[1] = e;
+	cb->args[0] = h;
+
+	res = res < 0 ? res : skb->len;
+	if (res <= 0)
+		fib6_dump_end(cb);
+	return res;
+}
+
+/*
+ *	Routing Table
+ *
+ *	return the appropriate node for a routing tree "add" operation
+ *	by either creating and inserting or by returning an existing
+ *	node.
+ */
+
+static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
+				     int addrlen, int plen,
+				     int offset)
+{
+	struct fib6_node *fn, *in, *ln;
+	struct fib6_node *pn = NULL;
+	struct rt6key *key;
+	int	bit;
+	__be32	dir = 0;
+	__u32	sernum = fib6_new_sernum();
+
+	RT6_TRACE("fib6_add_1\n");
+
+	/* insert node in tree */
+
+	fn = root;
+
+	do {
+		key = (struct rt6key *)((u8 *)fn->leaf + offset);
+
+		/*
+		 *	Prefix match
+		 */
+		if (plen < fn->fn_bit ||
+		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
+			goto insert_above;
+
+		/*
+		 *	Exact match ?
+		 */
+
+		if (plen == fn->fn_bit) {
+			/* clean up an intermediate node */
+			if ((fn->fn_flags & RTN_RTINFO) == 0) {
+				rt6_release(fn->leaf);
+				fn->leaf = NULL;
+			}
+
+			fn->fn_sernum = sernum;
+
+			return fn;
+		}
+
+		/*
+		 *	We have more bits to go
+		 */
+
+		/* Try to walk down on tree. */
+		fn->fn_sernum = sernum;
+		dir = addr_bit_set(addr, fn->fn_bit);
+		pn = fn;
+		fn = dir ? fn->right: fn->left;
+	} while (fn);
+
+	/*
+	 *	We walked to the bottom of tree.
+	 *	Create new leaf node without children.
+	 */
+
+	ln = node_alloc();
+
+	if (ln == NULL)
+		return NULL;
+	ln->fn_bit = plen;
+
+	ln->parent = pn;
+	ln->fn_sernum = sernum;
+
+	if (dir)
+		pn->right = ln;
+	else
+		pn->left  = ln;
+
+	return ln;
+
+
+insert_above:
+	/*
+	 * split since we don't have a common prefix anymore or
+	 * we have a less significant route.
+	 * we've to insert an intermediate node on the list
+	 * this new node will point to the one we need to create
+	 * and the current
+	 */
+
+	pn = fn->parent;
+
+	/* find 1st bit in difference between the 2 addrs.
+
+	   See comment in __ipv6_addr_diff: bit may be an invalid value,
+	   but if it is >= plen, the value is ignored in any case.
+	 */
+
+	bit = __ipv6_addr_diff(addr, &key->addr, addrlen);
+
+	/*
+	 *		(intermediate)[in]
+	 *	          /	   \
+	 *	(new leaf node)[ln] (old node)[fn]
+	 */
+	if (plen > bit) {
+		in = node_alloc();
+		ln = node_alloc();
+
+		if (in == NULL || ln == NULL) {
+			if (in)
+				node_free(in);
+			if (ln)
+				node_free(ln);
+			return NULL;
+		}
+
+		/*
+		 * new intermediate node.
+		 * RTN_RTINFO will
+		 * be off since that an address that chooses one of
+		 * the branches would not match less specific routes
+		 * in the other branch
+		 */
+
+		in->fn_bit = bit;
+
+		in->parent = pn;
+		in->leaf = fn->leaf;
+		atomic_inc(&in->leaf->rt6i_ref);
+
+		in->fn_sernum = sernum;
+
+		/* update parent pointer */
+		if (dir)
+			pn->right = in;
+		else
+			pn->left  = in;
+
+		ln->fn_bit = plen;
+
+		ln->parent = in;
+		fn->parent = in;
+
+		ln->fn_sernum = sernum;
+
+		if (addr_bit_set(addr, bit)) {
+			in->right = ln;
+			in->left  = fn;
+		} else {
+			in->left  = ln;
+			in->right = fn;
+		}
+	} else { /* plen <= bit */
+
+		/*
+		 *		(new leaf node)[ln]
+		 *	          /	   \
+		 *	     (old node)[fn] NULL
+		 */
+
+		ln = node_alloc();
+
+		if (ln == NULL)
+			return NULL;
+
+		ln->fn_bit = plen;
+
+		ln->parent = pn;
+
+		ln->fn_sernum = sernum;
+
+		if (dir)
+			pn->right = ln;
+		else
+			pn->left  = ln;
+
+		if (addr_bit_set(&key->addr, plen))
+			ln->right = fn;
+		else
+			ln->left  = fn;
+
+		fn->parent = ln;
+	}
+	return ln;
+}
+
+/*
+ *	Insert routing information in a node.
+ */
+
+static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
+			    struct nl_info *info)
+{
+	struct rt6_info *iter = NULL;
+	struct rt6_info **ins;
+
+	ins = &fn->leaf;
+	/* ins tracks the link that will point at rt once it is inserted. */
+	for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) {
+		/*
+		 *	Search for duplicates
+		 */
+
+		if (iter->rt6i_metric == rt->rt6i_metric) {
+			/*
+			 * Same priority level.  Equal dev, idev and gateway
+			 * make a duplicate: only its expiry may be updated.
+			 */
+			if (iter->rt6i_dev == rt->rt6i_dev &&
+			    iter->rt6i_idev == rt->rt6i_idev &&
+			    ipv6_addr_equal(&iter->rt6i_gateway,
+					    &rt->rt6i_gateway)) {
+				if (!(iter->rt6i_flags&RTF_EXPIRES))
+					return -EEXIST;
+				iter->rt6i_expires = rt->rt6i_expires;
+				if (!(rt->rt6i_flags&RTF_EXPIRES)) {
+					iter->rt6i_flags &= ~RTF_EXPIRES;
+					iter->rt6i_expires = 0;
+				}
+				return -EEXIST;
+			}
+		}
+
+		if (iter->rt6i_metric > rt->rt6i_metric)
+			break;
+
+		ins = &iter->dst.rt6_next;
+	}
+
+	/* Reset round-robin state, if necessary */
+	if (ins == &fn->leaf)
+		fn->rr_ptr = NULL;
+
+	/*
+	 * Insert rt in front of iter, i.e. behind every route on the
+	 * list whose metric is not larger than its own.
+	 */
+	rt->dst.rt6_next = iter;
+	*ins = rt;
+	rt->rt6i_node = fn;
+	atomic_inc(&rt->rt6i_ref);
+	inet6_rt_notify(RTM_NEWROUTE, rt, info);
+	info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
+
+	if ((fn->fn_flags & RTN_RTINFO) == 0) {
+		info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
+		fn->fn_flags |= RTN_RTINFO;
+	}
+
+	return 0;
+}
+
+static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt)
+{
+	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
+	    (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE)))
+		mod_timer(&net->ipv6.ip6_fib_timer,
+			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+}
+
+void fib6_force_start_gc(struct net *net)
+{
+	if (!timer_pending(&net->ipv6.ip6_fib_timer))
+		mod_timer(&net->ipv6.ip6_fib_timer,
+			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+}
+
+/*
+ *	Add routing information to the routing tree.
+ *	<destination addr>/<source addr>
+ *	with source addr info in sub-trees
+ */
+
+int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
+{
+	struct fib6_node *fn, *pn = NULL;
+	int err = -ENOMEM;
+
+	fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
+			rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst));
+
+	if (fn == NULL)
+		goto out;
+
+	pn = fn;
+
+#ifdef CONFIG_IPV6_SUBTREES
+	if (rt->rt6i_src.plen) {
+		struct fib6_node *sn;
+
+		if (fn->subtree == NULL) {
+			struct fib6_node *sfn;
+
+			/*
+			 * Create subtree.
+			 *
+			 *		fn[main tree]
+			 *		|
+			 *		sfn[subtree root]
+			 *		   \
+			 *		    sn[new leaf node]
+			 */
+
+			/* Create subtree root node */
+			sfn = node_alloc();
+			if (sfn == NULL)
+				goto st_failure;
+
+			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
+			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+			sfn->fn_flags = RTN_ROOT;
+			sfn->fn_sernum = fib6_new_sernum();
+
+			/* Now add the first leaf node to new subtree */
+
+			sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
+					sizeof(struct in6_addr), rt->rt6i_src.plen,
+					offsetof(struct rt6_info, rt6i_src));
+
+			if (sn == NULL) {
+				/* If it is failed, discard just allocated
+				   root, and then (in st_failure) stale node
+				   in main tree.
+				 */
+				node_free(sfn);
+				goto st_failure;
+			}
+
+			/* Now link new subtree to main tree */
+			sfn->parent = fn;
+			fn->subtree = sfn;
+		} else {
+			sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
+					sizeof(struct in6_addr), rt->rt6i_src.plen,
+					offsetof(struct rt6_info, rt6i_src));
+
+			if (sn == NULL)
+				goto st_failure;
+		}
+
+		if (fn->leaf == NULL) {
+			fn->leaf = rt;
+			atomic_inc(&rt->rt6i_ref);
+		}
+		fn = sn;
+	}
+#endif
+
+	err = fib6_add_rt2node(fn, rt, info);
+
+	if (err == 0) {
+		fib6_start_gc(info->nl_net, rt);
+		if (!(rt->rt6i_flags&RTF_CACHE))
+			fib6_prune_clones(info->nl_net, pn, rt);
+	}
+
+out:
+	if (err) {
+#ifdef CONFIG_IPV6_SUBTREES
+		/*
+		 * If fib6_add_1 has cleared the old leaf pointer in the
+		 * super-tree leaf node we have to find a new one for it.
+		 */
+		if (pn != fn && pn->leaf == rt) {
+			pn->leaf = NULL;
+			atomic_dec(&rt->rt6i_ref);
+		}
+		if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
+			pn->leaf = fib6_find_prefix(info->nl_net, pn);
+#if RT6_DEBUG >= 2
+			if (!pn->leaf) {
+				WARN_ON(pn->leaf == NULL);
+				pn->leaf = info->nl_net->ipv6.ip6_null_entry;
+			}
+#endif
+			atomic_inc(&pn->leaf->rt6i_ref);
+		}
+#endif
+		dst_free(&rt->dst);
+	}
+	return err;
+
+#ifdef CONFIG_IPV6_SUBTREES
+	/* Subtree creation failed, probably main tree node
+	   is orphan. If it is, shoot it.
+	 */
+st_failure:
+	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
+		fib6_repair_tree(info->nl_net, fn);
+	dst_free(&rt->dst);
+	return err;
+#endif
+}
+
+/*
+ *	Routing tree lookup
+ *
+ */
+
+struct lookup_args {
+	int		offset;		/* key offset on rt6_info	*/
+	const struct in6_addr	*addr;		/* search key			*/
+};
+
+static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
+					struct lookup_args *args)
+{
+	struct fib6_node *fn, *next;
+
+	if (unlikely(args->offset == 0))
+		return NULL;
+
+	/* Descend on a tree, as far as the bits of the key lead. */
+	fn = root;
+	for (;;) {
+		next = addr_bit_set(args->addr, fn->fn_bit) ?
+			fn->right : fn->left;
+		if (next == NULL)
+			break;
+		fn = next;
+	}
+
+	/*
+	 *	Walk back towards the root until a node holding route
+	 *	info (or a subtree that does) covers the key.
+	 */
+	while (fn) {
+		if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+			struct rt6key *key;
+
+			key = (struct rt6key *) ((u8 *) fn->leaf +
+						 args->offset);
+
+			if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
+#ifdef CONFIG_IPV6_SUBTREES
+				if (fn->subtree) {
+					struct fib6_node *sfn;
+
+					sfn = fib6_lookup_1(fn->subtree, args + 1);
+					if (sfn)
+						return sfn;
+					/* No source match: try shorter prefixes. */
+					goto backtrack;
+				}
+#endif
+				if (fn->fn_flags & RTN_RTINFO)
+					return fn;
+			}
+		}
+#ifdef CONFIG_IPV6_SUBTREES
+backtrack:
+#endif
+		if (fn->fn_flags & RTN_ROOT)
+			break;
+		fn = fn->parent;
+	}
+
+	return NULL;
+}
+
+struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
+			       const struct in6_addr *saddr)
+{
+	struct fib6_node *fn;
+	struct lookup_args args[] = {
+		{
+			.offset = offsetof(struct rt6_info, rt6i_dst),
+			.addr = daddr,
+		},
+#ifdef CONFIG_IPV6_SUBTREES
+		{
+			.offset = offsetof(struct rt6_info, rt6i_src),
+			.addr = saddr,
+		},
+#endif
+		{
+			.offset = 0,	/* sentinel */
+		}
+	};
+
+	fn = fib6_lookup_1(root, daddr ? args : args + 1);
+
+	if (fn == NULL || fn->fn_flags & RTN_TL_ROOT)
+		fn = root;
+
+	return fn;
+}
+
+/*
+ *	Get node with specified destination prefix (and source prefix,
+ *	if subtrees are used)
+ */
+
+
+static struct fib6_node * fib6_locate_1(struct fib6_node *root,
+					const struct in6_addr *addr,
+					int plen, int offset)
+{
+	struct fib6_node *cur = root;
+
+	while (cur != NULL) {
+		const struct rt6key *key;
+
+		/* The key sits 'offset' bytes into the node's leaf route. */
+		key = (struct rt6key *)((u8 *)cur->leaf + offset);
+
+		/*
+		 * A node deeper than plen, or one whose prefix differs
+		 * from addr, means the requested prefix is not in the tree.
+		 */
+		if (plen < cur->fn_bit ||
+		    !ipv6_prefix_equal(&key->addr, addr, cur->fn_bit))
+			return NULL;
+
+		if (plen == cur->fn_bit)
+			return cur;
+
+		/* More bits to go: the next address bit picks the child. */
+		cur = addr_bit_set(addr, cur->fn_bit) ? cur->right : cur->left;
+	}
+
+	return NULL;
+}
+
+struct fib6_node * fib6_locate(struct fib6_node *root,
+			       const struct in6_addr *daddr, int dst_len,
+			       const struct in6_addr *saddr, int src_len)
+{
+	struct fib6_node *fn;
+
+	fn = fib6_locate_1(root, daddr, dst_len,
+			   offsetof(struct rt6_info, rt6i_dst));
+
+#ifdef CONFIG_IPV6_SUBTREES
+	if (src_len) {
+		WARN_ON(saddr == NULL);
+		if (fn && fn->subtree)
+			fn = fib6_locate_1(fn->subtree, saddr, src_len,
+					   offsetof(struct rt6_info, rt6i_src));
+	}
+#endif
+
+	if (fn && fn->fn_flags&RTN_RTINFO)
+		return fn;
+
+	return NULL;
+}
+
+
+/*
+ *	Deletion
+ *
+ */
+
+static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
+{
+	struct fib6_node *child;
+
+	if (fn->fn_flags & RTN_ROOT)
+		return net->ipv6.ip6_null_entry;
+
+	/* Borrow a child's leaf, left first; else retry in the subtree. */
+	for (; fn != NULL; fn = FIB6_SUBTREE(fn)) {
+		child = fn->left ? fn->left : fn->right;
+		if (child)
+			return child->leaf;
+	}
+
+	return NULL;
+}
+
+/*
+ *	Called to trim the tree of intermediate nodes when possible. "fn"
+ *	is the node we want to try and remove.
+ */
+
+static struct fib6_node *fib6_repair_tree(struct net *net,
+					   struct fib6_node *fn)
+{
+	int children;
+	int nstate;
+	struct fib6_node *child, *pn;
+	struct fib6_walker_t *w;
+	int iter = 0;
+
+	for (;;) {
+		RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+		iter++;
+
+		WARN_ON(fn->fn_flags & RTN_RTINFO);
+		WARN_ON(fn->fn_flags & RTN_TL_ROOT);
+		WARN_ON(fn->leaf != NULL);
+
+		children = 0;
+		child = NULL;
+		if (fn->right) child = fn->right, children |= 1;
+		if (fn->left) child = fn->left, children |= 2;
+
+		if (children == 3 || FIB6_SUBTREE(fn)
+#ifdef CONFIG_IPV6_SUBTREES
+		    /* Subtree root (i.e. fn) may have one child */
+		    || (children && fn->fn_flags&RTN_ROOT)
+#endif
+		    ) {
+			fn->leaf = fib6_find_prefix(net, fn);
+#if RT6_DEBUG >= 2
+			if (fn->leaf==NULL) {
+				WARN_ON(!fn->leaf);
+				fn->leaf = net->ipv6.ip6_null_entry;
+			}
+#endif
+			atomic_inc(&fn->leaf->rt6i_ref);
+			return fn->parent;
+		}
+
+		pn = fn->parent;
+#ifdef CONFIG_IPV6_SUBTREES
+		if (FIB6_SUBTREE(pn) == fn) {
+			WARN_ON(!(fn->fn_flags & RTN_ROOT));
+			FIB6_SUBTREE(pn) = NULL;
+			nstate = FWS_L;
+		} else {
+			WARN_ON(fn->fn_flags & RTN_ROOT);
+#endif
+			if (pn->right == fn) pn->right = child;
+			else if (pn->left == fn) pn->left = child;
+#if RT6_DEBUG >= 2
+			else
+				WARN_ON(1);
+#endif
+			if (child)
+				child->parent = pn;
+			nstate = FWS_R;
+#ifdef CONFIG_IPV6_SUBTREES
+		}
+#endif
+
+		read_lock(&fib6_walker_lock);
+		FOR_WALKERS(w) {
+			if (child == NULL) {
+				if (w->root == fn) {
+					w->root = w->node = NULL;
+					RT6_TRACE("W %p adjusted by delroot 1\n", w);
+				} else if (w->node == fn) {
+					RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+					w->node = pn;
+					w->state = nstate;
+				}
+			} else {
+				if (w->root == fn) {
+					w->root = child;
+					RT6_TRACE("W %p adjusted by delroot 2\n", w);
+				}
+				if (w->node == fn) {
+					w->node = child;
+					if (children&2) {
+						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+						w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
+					} else {
+						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+						w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
+					}
+				}
+			}
+		}
+		read_unlock(&fib6_walker_lock);
+
+		node_free(fn);
+		if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn))
+			return pn;
+
+		rt6_release(pn->leaf);
+		pn->leaf = NULL;
+		fn = pn;
+	}
+}
+
+static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
+			   struct nl_info *info)
+{
+	struct fib6_walker_t *w;
+	struct rt6_info *rt = *rtp;
+	struct net *net = info->nl_net;
+	/* Detach the route at *rtp from fn, fix up walkers and the tree, then notify and release it. */
+	RT6_TRACE("fib6_del_route\n");
+
+	/* Unlink it: the slot that pointed at rt now points at rt's successor */
+	*rtp = rt->dst.rt6_next;
+	rt->rt6i_node = NULL;
+	net->ipv6.rt6_stats->fib_rt_entries--;
+	net->ipv6.rt6_stats->fib_discarded_routes++;
+
+	/* Reset round-robin state, if necessary */
+	if (fn->rr_ptr == rt)
+		fn->rr_ptr = NULL;
+
+	/* Adjust walkers: one parked on rt moves to rt's successor, or goes up if there is none */
+	read_lock(&fib6_walker_lock);
+	FOR_WALKERS(w) {
+		if (w->state == FWS_C && w->leaf == rt) {
+			RT6_TRACE("walker %p adjusted by delroute\n", w);
+			w->leaf = rt->dst.rt6_next;
+			if (w->leaf == NULL)
+				w->state = FWS_U;
+		}
+	}
+	read_unlock(&fib6_walker_lock);
+
+	rt->dst.rt6_next = NULL;	/* cleared only after walkers have read the successor */
+
+	/* If it was last route, expunge its radix tree node */
+	if (fn->leaf == NULL) {
+		fn->fn_flags &= ~RTN_RTINFO;
+		net->ipv6.rt6_stats->fib_route_nodes--;
+		fn = fib6_repair_tree(net, fn);	/* continue below from the node the repair returns */
+	}
+
+	if (atomic_read(&rt->rt6i_ref) != 1) {
+		/* This route is used as dummy address holder in some split
+		 * nodes. It is not leaked, but it still holds other resources,
+		 * which must be released in time. So, scan ascendant nodes
+		 * and replace dummy references to this route with references
+		 * to still alive ones.
+		 */
+		while (fn) {
+			if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) {
+				fn->leaf = fib6_find_prefix(net, fn);
+				atomic_inc(&fn->leaf->rt6i_ref);	/* the node now references the replacement */
+				rt6_release(rt);	/* and gives up its reference to rt */
+			}
+			fn = fn->parent;
+		}
+		/* No more references are possible at this point. */
+		BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
+	}
+
+	inet6_rt_notify(RTM_DELROUTE, rt, info);
+	rt6_release(rt);	/* drop the remaining reference checked above */
+}
+
+int fib6_del(struct rt6_info *rt, struct nl_info *info)
+{
+	struct net *net = info->nl_net;
+	struct fib6_node *fn = rt->rt6i_node;
+	struct rt6_info **rtp;
+	/* Delete rt from its FIB node; returns 0 on success, -ENOENT if rt is not found in the tree. */
+#if RT6_DEBUG >= 2
+	if (rt->dst.obsolete>0) {
+		WARN_ON(fn != NULL);
+		return -ENOENT;
+	}
+#endif
+	if (fn == NULL || rt == net->ipv6.ip6_null_entry)	/* not attached to a node, or the null entry */
+		return -ENOENT;
+
+	WARN_ON(!(fn->fn_flags & RTN_RTINFO));
+
+	if (!(rt->rt6i_flags&RTF_CACHE)) {	/* rt is not itself a cached clone: prune clones first */
+		struct fib6_node *pn = fn;
+#ifdef CONFIG_IPV6_SUBTREES
+		/* clones of this route might be in another subtree */
+		if (rt->rt6i_src.plen) {
+			while (!(pn->fn_flags&RTN_ROOT))	/* climb to the nearest RTN_ROOT node ... */
+				pn = pn->parent;
+			pn = pn->parent;	/* ... and start pruning from its parent */
+		}
+#endif
+		fib6_prune_clones(info->nl_net, pn, rt);
+	}
+
+	/*
+	 *	Walk the leaf entries looking for ourselves
+	 */
+
+	for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
+		if (*rtp == rt) {
+			fib6_del_route(fn, rtp, info);	/* rtp is the slot that points at rt */
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+/*
+ *	Tree traversal function.
+ *
+ *	Certainly, it is not interrupt safe.
+ *	However, it is internally reentrant wrt itself and fib6_add/fib6_del.
+ *	This means that we can modify the tree while walking it
+ *	and use this function for garbage collection, clone pruning,
+ *	cleaning the tree when a device goes down, etc.
+ *
+ *	It guarantees that every node will be traversed,
+ *	and that it will be traversed only once.
+ *
+ *	Callback function w->func may return:
+ *	0 -> continue walking.
+ *	positive value -> walking is suspended (used by tree dumps,
+ *	and probably by gc, if it will be split to several slices)
+ *	negative value -> terminate walking.
+ *
+ *	The function itself returns:
+ *	0   -> walk is complete.
+ *	>0  -> walk is incomplete (i.e. suspended)
+ *	<0  -> walk is terminated by an error.
+ */
+
+static int fib6_walk_continue(struct fib6_walker_t *w)
+{
+	struct fib6_node *fn, *pn;
+	/* Resume the walk from w->node/w->state; returns 0 when done, else w->func's non-zero result. */
+	for (;;) {
+		fn = w->node;
+		if (fn == NULL)	/* walker has no current node: nothing left to visit */
+			return 0;
+
+		if (w->prune && fn != w->root &&
+		    fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
+			w->state = FWS_C;	/* prune mode: do not descend below a route-carrying node */
+			w->leaf = fn->leaf;
+		}
+		switch (w->state) {
+#ifdef CONFIG_IPV6_SUBTREES
+		case FWS_S:
+			if (FIB6_SUBTREE(fn)) {
+				w->node = FIB6_SUBTREE(fn);
+				continue;
+			}
+			w->state = FWS_L;	/* fall through */
+#endif
+		case FWS_L:
+			if (fn->left) {
+				w->node = fn->left;
+				w->state = FWS_INIT;
+				continue;
+			}
+			w->state = FWS_R;	/* fall through */
+		case FWS_R:
+			if (fn->right) {
+				w->node = fn->right;
+				w->state = FWS_INIT;
+				continue;
+			}
+			w->state = FWS_C;
+			w->leaf = fn->leaf;	/* fall through */
+		case FWS_C:
+			if (w->leaf && fn->fn_flags&RTN_RTINFO) {
+				int err;
+
+				if (w->skip) {	/* requested skips: step past this node without calling w->func */
+					w->skip--;
+					goto skip;	/* a plain "continue" would revisit this same node */
+				}
+
+				err = w->func(w);
+				if (err)
+					return err;
+				w->count++;	/* one more node handed to the callback */
+				continue;	/* stays in FWS_C until the callback clears w->leaf */
+			}
+skip:
+			w->state = FWS_U;	/* fall through */
+		case FWS_U:
+			if (fn == w->root)	/* came back up to the starting node: walk complete */
+				return 0;
+			pn = fn->parent;
+			w->node = pn;
+#ifdef CONFIG_IPV6_SUBTREES
+			if (FIB6_SUBTREE(pn) == fn) {
+				WARN_ON(!(fn->fn_flags & RTN_ROOT));
+				w->state = FWS_L;
+				continue;
+			}
+#endif
+			if (pn->left == fn) {	/* left side done: try the right child next */
+				w->state = FWS_R;
+				continue;
+			}
+			if (pn->right == fn) {	/* both sides done: handle the parent's own routes */
+				w->state = FWS_C;
+				w->leaf = w->node->leaf;
+				continue;
+			}
+#if RT6_DEBUG >= 2
+			WARN_ON(1);
+#endif
+		}
+	}
+}
+
+static int fib6_walk(struct fib6_walker_t *w)
+{
+	int rc;
+
+	/* Begin a fresh traversal at the walker's root. */
+	w->node = w->root;
+	w->state = FWS_INIT;
+	fib6_walker_link(w);
+	rc = fib6_walk_continue(w);
+	if (rc <= 0)	/* complete or aborted; a suspended (>0) walk stays linked */
+		fib6_walker_unlink(w);
+	return rc;
+}
+
+static int fib6_clean_node(struct fib6_walker_t *w)
+{
+	int res;
+	struct rt6_info *rt;
+	struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w);
+	struct nl_info info = {
+		.nl_net = c->net,
+	};
+	/* Walker callback: run c->func on each route of the current node, deleting those it rejects. */
+	for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+		res = c->func(rt, c->arg);
+		if (res < 0) {	/* negative verdict: delete this route */
+			w->leaf = rt;	/* park the walker on rt so fib6_del_route() moves it to rt's successor */
+			res = fib6_del(rt, &info);
+			if (res) {
+#if RT6_DEBUG >= 2
+				printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
+#endif
+				continue;	/* deletion failed: keep scanning from rt's successor */
+			}
+			return 0;	/* rt is gone; the walker calls back in with the adjusted w->leaf */
+		}
+		WARN_ON(res != 0);	/* c->func is expected to return only 0 or a negative value */
+	}
+	w->leaf = rt;	/* rt is NULL here: tells the walker this node is finished */
+	return 0;
+}
+
+/*
+ *	Convenient frontend to tree walker.
+ *
+ *	func is called on each route.
+ *		It may return -1 -> delete this route.
+ *		              0  -> continue walking
+ *
+ *	prune==1 -> only immediate children of node (certainly,
+ *	ignoring pure split nodes) will be scanned.
+ */
+
+static void fib6_clean_tree(struct net *net, struct fib6_node *root,
+			    int (*func)(struct rt6_info *, void *arg),
+			    int prune, void *arg)
+{
+	struct fib6_cleaner_t cleaner;
+
+	/* Per-route callback state consumed by fib6_clean_node(). */
+	cleaner.net = net;
+	cleaner.func = func;
+	cleaner.arg = arg;
+	cleaner.w.root = root;	/* walker setup: start at root, nothing counted or skipped yet */
+	cleaner.w.func = fib6_clean_node;
+	cleaner.w.prune = prune;
+	cleaner.w.skip = 0;
+	cleaner.w.count = 0;
+	fib6_walk(&cleaner.w);
+}
+
+void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
+		    int prune, void *arg)
+{
+	struct hlist_head *bucket;
+	struct hlist_node *pos;
+	struct fib6_table *tb;
+	unsigned int i;
+
+	/* Visit every table in the hash; each one is cleaned under its own write lock. */
+	rcu_read_lock();
+	for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
+		bucket = &net->ipv6.fib_table_hash[i];
+		hlist_for_each_entry_rcu(tb, pos, bucket, tb6_hlist) {
+			write_lock_bh(&tb->tb6_lock);
+			fib6_clean_tree(net, &tb->tb6_root, func, prune, arg);
+			write_unlock_bh(&tb->tb6_lock);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static int fib6_prune_clone(struct rt6_info *rt, void *arg)
+{
+	/* Cleaner callback: only RTF_CACHE routes are selected (-1); all others are kept (0). */
+	if (!(rt->rt6i_flags & RTF_CACHE))
+		return 0;
+
+	RT6_TRACE("pruning clone %p\n", rt);
+	return -1;
+}
+
+static void fib6_prune_clones(struct net *net, struct fib6_node *fn,
+			      struct rt6_info *rt)
+{
+	fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt);	/* prune=1 walk from fn; rt is passed as arg, which fib6_prune_clone() does not read */
+}
+
+/*
+ *	Garbage collection
+ */
+
+static struct fib6_gc_args
+{
+	int			timeout;	/* added to dst.lastuse in fib6_age() to decide when an unused clone is aged out */
+	int			more;		/* entries seen that may still expire later; non-zero re-arms the GC timer */
+} gc_args;
+
+static int fib6_age(struct rt6_info *rt, void *arg)
+{
+	unsigned long now = jiffies;
+
+	/*
+	 *	check addrconf expiration here.
+	 *	Routes are expired even if they are in use.
+	 *
+	 *	Also age clones. Note, that clones are aged out
+	 *	only if they are not in use now.
+	 */
+	/* Returns -1 to have the route deleted by the cleaner, 0 to keep it. */
+	if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
+		if (time_after(now, rt->rt6i_expires)) {
+			RT6_TRACE("expiring %p\n", rt);
+			return -1;
+		}
+		gc_args.more++;	/* not expired yet: another GC pass will be needed */
+	} else if (rt->rt6i_flags & RTF_CACHE) {
+		if (atomic_read(&rt->dst.__refcnt) == 0 &&
+		    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
+			RT6_TRACE("aging clone %p\n", rt);
+			return -1;
+		} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
+			   (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) {	/* NOTE(review): neighbour pointer is dereferenced unchecked - confirm it cannot be NULL here */
+			RT6_TRACE("purging route %p via non-router but gateway\n",
+				  rt);
+			return -1;
+		}
+		gc_args.more++;	/* clone survives this pass */
+	}
+
+	return 0;
+}
+
+static DEFINE_SPINLOCK(fib6_gc_lock);
+/* fib6_gc_lock is held across a whole GC pass, including the updates of gc_args below. */
+void fib6_run_gc(unsigned long expires, struct net *net)
+{
+	if (expires != ~0UL) {
+		spin_lock_bh(&fib6_gc_lock);	/* ordinary request: wait for any pass already running */
+		gc_args.timeout = expires ? (int)expires :
+			net->ipv6.sysctl.ip6_rt_gc_interval;	/* expires == 0 selects the sysctl interval */
+	} else {
+		if (!spin_trylock_bh(&fib6_gc_lock)) {	/* ~0UL: never wait for the lock */
+			mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);	/* busy: let the timer retry in HZ jiffies */
+			return;
+		}
+		gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval;
+	}
+
+	gc_args.more = icmp6_dst_gc();	/* start the pending count from icmp6_dst_gc()'s result */
+
+	fib6_clean_all(net, fib6_age, 0, NULL);	/* fib6_age() decides per route and bumps gc_args.more */
+
+	if (gc_args.more)	/* something may still expire later: re-arm; otherwise stop the timer */
+		mod_timer(&net->ipv6.ip6_fib_timer,
+			  round_jiffies(jiffies
+					+ net->ipv6.sysctl.ip6_rt_gc_interval));
+	else
+		del_timer(&net->ipv6.ip6_fib_timer);
+	spin_unlock_bh(&fib6_gc_lock);
+}
+
+static void fib6_gc_timer_cb(unsigned long arg)
+{
+	fib6_run_gc(0, (struct net *)arg);	/* arg is the struct net registered in fib6_net_init(); 0 = use the sysctl interval */
+}
+
+static int __net_init fib6_net_init(struct net *net)
+{
+	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+	/* Per-namespace setup: GC timer, statistics, table hash and the built-in tables. Returns 0 or -ENOMEM. */
+	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
+
+	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
+	if (!net->ipv6.rt6_stats)
+		goto out_timer;
+
+	/* Avoid false sharing : Use at least a full cache line */
+	size = max_t(size_t, size, L1_CACHE_BYTES);
+
+	net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
+	if (!net->ipv6.fib_table_hash)
+		goto out_rt6_stats;
+
+	net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
+					  GFP_KERNEL);
+	if (!net->ipv6.fib6_main_tbl)
+		goto out_fib_table_hash;
+
+	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
+	net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;	/* root starts out holding only the null entry */
+	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
+		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
+					   GFP_KERNEL);
+	if (!net->ipv6.fib6_local_tbl)
+		goto out_fib6_main_tbl;
+	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
+	net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;	/* same initial root as the main table */
+	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
+		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+#endif
+	fib6_tables_init(net);
+
+	return 0;
+	/* Error unwinding: each label frees what was allocated before the failing step. */
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_fib6_main_tbl:
+	kfree(net->ipv6.fib6_main_tbl);
+#endif
+out_fib_table_hash:
+	kfree(net->ipv6.fib_table_hash);
+out_rt6_stats:
+	kfree(net->ipv6.rt6_stats);
+out_timer:
+	return -ENOMEM;
+ }
+
+static void fib6_net_exit(struct net *net)
+{
+	rt6_ifdown(net, NULL);
+	del_timer_sync(&net->ipv6.ip6_fib_timer);	/* wait for a running GC callback before freeing what it uses */
+	/* Free everything fib6_net_init() allocated for this namespace. */
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	kfree(net->ipv6.fib6_local_tbl);
+#endif
+	kfree(net->ipv6.fib6_main_tbl);
+	kfree(net->ipv6.fib_table_hash);
+	kfree(net->ipv6.rt6_stats);
+}
+
+static struct pernet_operations fib6_net_ops = {	/* registered from fib6_init() */
+	.init = fib6_net_init,
+	.exit = fib6_net_exit,
+};
+
+int __init fib6_init(void)
+{
+	int ret = -ENOMEM;
+	/* Create the fib6_node cache, register the pernet ops and the RTM_GETROUTE dump handler. */
+	fib6_node_kmem = kmem_cache_create("fib6_nodes",
+					   sizeof(struct fib6_node),
+					   0, SLAB_HWCACHE_ALIGN,
+					   NULL);
+	if (!fib6_node_kmem)
+		goto out;	/* ret is still -ENOMEM */
+
+	ret = register_pernet_subsys(&fib6_net_ops);
+	if (ret)
+		goto out_kmem_cache_create;
+
+	ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib);
+	if (ret)
+		goto out_unregister_subsys;
+out:
+	return ret;	/* shared exit: 0 on success, the failing step's error otherwise */
+
+out_unregister_subsys:
+	unregister_pernet_subsys(&fib6_net_ops);
+out_kmem_cache_create:
+	kmem_cache_destroy(fib6_node_kmem);
+	goto out;
+}
+
+void fib6_gc_cleanup(void)
+{
+	unregister_pernet_subsys(&fib6_net_ops);	/* undo fib6_init() in reverse order */
+	kmem_cache_destroy(fib6_node_kmem);
+}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
new file mode 100644
index 00000000..f3caf1b8
--- /dev/null
+++ b/net/ipv6/ip6_flowlabel.c
@@ -0,0 +1,782 @@
+/*
+ *	ip6_flowlabel.c		IPv6 flowlabel manager.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/route.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/rawv6.h>
+#include <net/icmp.h>
+#include <net/transp_v6.h>
+
+#include <asm/uaccess.h>
+
+#define FL_MIN_LINGER	6	/* Minimal linger, in seconds (check_linger() scales by HZ).
+				   6 sec was the value specified in the old IPv6 RFC.
+				 */
+#define FL_MAX_LINGER	60	/* Maximal linger timeout (seconds) allowed without CAP_NET_ADMIN */
+
+/* FL hash table */
+
+#define FL_MAX_PER_SOCK	32	/* per-socket label count checked in mem_check() */
+#define FL_MAX_SIZE	4096	/* bound on fl_size used by mem_check() */
+#define FL_HASH_MASK	255	/* fl_ht has FL_HASH_MASK+1 buckets */
+#define FL_HASH(l)	(ntohl(l)&FL_HASH_MASK)
+
+static atomic_t fl_size = ATOMIC_INIT(0);	/* number of labels currently linked into fl_ht */
+static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1];
+
+static void ip6_fl_gc(unsigned long dummy);
+static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0);
+
+/* FL hash table lock: protects fl_ht and its chains (lookup, insertion, GC) */
+
+static DEFINE_RWLOCK(ip6_fl_lock);
+
+/* Big socket lock: protects the per-socket ipv6_fl_list of every socket */
+
+static DEFINE_RWLOCK(ip6_sk_fl_lock);
+
+
+static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label)
+{
+	struct ip6_flowlabel *cur = fl_ht[FL_HASH(label)];
+
+	/* Follow the bucket chain until label and namespace both match, or the chain ends. */
+	while (cur && !(cur->label == label && net_eq(cur->fl_net, net)))
+		cur = cur->next;
+
+	return cur;
+}
+
+static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
+{
+	struct ip6_flowlabel *hit;
+	/* Look up under the table lock and take a user reference before dropping it. */
+	read_lock_bh(&ip6_fl_lock);
+	hit = __fl_lookup(net, label);
+	if (hit != NULL)
+		atomic_inc(&hit->users);
+	read_unlock_bh(&ip6_fl_lock);
+	return hit;
+}
+
+
+static void fl_free(struct ip6_flowlabel *fl)
+{
+	if (!fl)	/* NULL is accepted: nothing to release or free */
+		return;
+	release_net(fl->fl_net);
+	kfree(fl->opt);
+	kfree(fl);
+}
+
+static void fl_release(struct ip6_flowlabel *fl)
+{
+	write_lock_bh(&ip6_fl_lock);
+	/* Drop one user reference; when it was the last one, arrange for the GC timer to reap the label. */
+	fl->lastuse = jiffies;
+	if (atomic_dec_and_test(&fl->users)) {
+		unsigned long ttd = fl->lastuse + fl->linger;
+		if (time_after(ttd, fl->expires))
+			fl->expires = ttd;	/* never expire before the linger period has passed */
+		ttd = fl->expires;
+		if (fl->opt && fl->share == IPV6_FL_S_EXCL) {	/* exclusive label: its options are freed right away */
+			struct ipv6_txoptions *opt = fl->opt;
+			fl->opt = NULL;
+			kfree(opt);
+		}
+		if (!timer_pending(&ip6_fl_gc_timer) ||
+		    time_after(ip6_fl_gc_timer.expires, ttd))
+			mod_timer(&ip6_fl_gc_timer, ttd);	/* make sure the GC runs no later than ttd */
+	}
+	write_unlock_bh(&ip6_fl_lock);
+}
+
+static void ip6_fl_gc(unsigned long dummy)
+{
+	int i;
+	unsigned long now = jiffies;
+	unsigned long sched = 0;	/* earliest pending expiry found; 0 means "none yet" */
+	/* Timer callback: free unused labels whose expiry has passed and re-arm for the next one. */
+	write_lock(&ip6_fl_lock);
+
+	for (i=0; i<=FL_HASH_MASK; i++) {
+		struct ip6_flowlabel *fl, **flp;
+		flp = &fl_ht[i];
+		while ((fl=*flp) != NULL) {
+			if (atomic_read(&fl->users) == 0) {	/* only labels without users are candidates */
+				unsigned long ttd = fl->lastuse + fl->linger;
+				if (time_after(ttd, fl->expires))
+					fl->expires = ttd;
+				ttd = fl->expires;
+				if (time_after_eq(now, ttd)) {
+					*flp = fl->next;	/* unlink; flp keeps pointing at the same slot */
+					fl_free(fl);
+					atomic_dec(&fl_size);
+					continue;
+				}
+				if (!sched || time_before(ttd, sched))
+					sched = ttd;
+			}
+			flp = &fl->next;
+		}
+	}
+	if (!sched && atomic_read(&fl_size))
+		sched = now + FL_MAX_LINGER;	/* NOTE(review): FL_MAX_LINGER is scaled by HZ in check_linger() but added raw to jiffies here - confirm intent */
+	if (sched) {
+		mod_timer(&ip6_fl_gc_timer, sched);
+	}
+	write_unlock(&ip6_fl_lock);
+}
+
+static void __net_exit ip6_fl_purge(struct net *net)
+{
+	int i;
+	/* Free every unused (users == 0) label that belongs to net; labels still in use stay in the table. */
+	write_lock_bh(&ip6_fl_lock);	/* _bh: ip6_fl_gc() takes this lock from the timer, so it must not interrupt us */
+	for (i = 0; i <= FL_HASH_MASK; i++) {
+		struct ip6_flowlabel *fl, **flp;
+		flp = &fl_ht[i];
+		while ((fl = *flp) != NULL) {
+			if (net_eq(fl->fl_net, net) &&
+			    atomic_read(&fl->users) == 0) {
+				*flp = fl->next;	/* unlink; flp keeps pointing at the same slot */
+				fl_free(fl);
+				atomic_dec(&fl_size);
+				continue;
+			}
+			flp = &fl->next;
+		}
+	}
+	write_unlock_bh(&ip6_fl_lock);
+}
+
+static struct ip6_flowlabel *fl_intern(struct net *net,
+				       struct ip6_flowlabel *fl, __be32 label)
+{
+	struct ip6_flowlabel *lfl;
+	/* Insert fl into fl_ht. Returns NULL when fl was linked, or the existing entry (users bumped) on a clash. */
+	fl->label = label & IPV6_FLOWLABEL_MASK;
+
+	write_lock_bh(&ip6_fl_lock);
+	if (label == 0) {
+		for (;;) {	/* no label requested: draw random ones until a free non-zero value is found */
+			fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK;
+			if (fl->label) {
+				lfl = __fl_lookup(net, fl->label);
+				if (lfl == NULL)
+					break;
+			}
+		}
+	} else {
+		/*
+		 * we dropped the ip6_fl_lock, so this entry could reappear
+		 * and we need to recheck with it.
+		 *
+		 * OTOH no need to search the active socket first, like it is
+		 * done in ipv6_flowlabel_opt - sock is locked, so new entry
+		 * with the same label can only appear on another sock
+		 */
+		lfl = __fl_lookup(net, fl->label);
+		if (lfl != NULL) {
+			atomic_inc(&lfl->users);	/* reference handed to the caller; fl itself is not linked */
+			write_unlock_bh(&ip6_fl_lock);
+			return lfl;
+		}
+	}
+
+	fl->lastuse = jiffies;
+	fl->next = fl_ht[FL_HASH(fl->label)];	/* push at the head of its bucket */
+	fl_ht[FL_HASH(fl->label)] = fl;
+	atomic_inc(&fl_size);
+	write_unlock_bh(&ip6_fl_lock);
+	return NULL;
+}
+
+
+
+/* Socket flowlabel lists */
+
+struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ip6_flowlabel *found = NULL;
+	struct ipv6_fl_socklist *ent;
+
+	label &= IPV6_FLOWLABEL_MASK;	/* only the flow-label bits take part in the comparison */
+
+	read_lock_bh(&ip6_sk_fl_lock);
+	for (ent = np->ipv6_fl_list; ent; ent = ent->next) {
+		if (ent->fl->label != label)
+			continue;
+		found = ent->fl;	/* first match wins: refresh it and take a user reference */
+		found->lastuse = jiffies;
+		atomic_inc(&found->users);
+		break;
+	}
+	read_unlock_bh(&ip6_sk_fl_lock);
+	return found;
+}
+
+EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+
+void fl6_free_socklist(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_fl_socklist *head;
+	/* Pop entries off the socket's list one at a time, releasing each label reference. */
+	for (head = np->ipv6_fl_list; head; head = np->ipv6_fl_list) {
+		np->ipv6_fl_list = head->next;
+		fl_release(head->fl);
+		kfree(head);
+	}
+}
+
+/* Service routines */
+
+
+/*
+   It is the only difficult place. flowlabel enforces equal headers
+   before and including routing header, however user may supply options
+   following rthdr.
+ */
+
+struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
+					 struct ip6_flowlabel * fl,
+					 struct ipv6_txoptions * fopt)
+{
+	struct ipv6_txoptions *label_opt = fl->opt;
+
+	/* The user supplied nothing after the routing header: the label's options apply as they are. */
+	if (!fopt || !fopt->opt_flen)
+		return label_opt;
+
+	/* The label has no options and the user supplied no leading ones: use the user's set directly. */
+	if (!label_opt && !fopt->opt_nflen)
+		return fopt;
+
+	/*
+	 * Otherwise build the result in opt_space: headers up to and including
+	 * the routing header come from the label (or are empty), the part
+	 * following it comes from the user.
+	 */
+	opt_space->hopopt = label_opt ? label_opt->hopopt : NULL;
+	opt_space->dst0opt = label_opt ? label_opt->dst0opt : NULL;
+	opt_space->srcrt = label_opt ? label_opt->srcrt : NULL;
+	opt_space->opt_nflen = label_opt ? label_opt->opt_nflen : 0;
+	opt_space->dst1opt = fopt->dst1opt;
+	opt_space->opt_flen = fopt->opt_flen; return opt_space;
+}
+
+static unsigned long check_linger(unsigned long ttl)
+{
+	if (ttl >= FL_MIN_LINGER && ttl <= FL_MAX_LINGER)	/* in range: just scale by HZ */
+		return ttl*HZ;
+	if (ttl < FL_MIN_LINGER)	/* too short: raised to the minimum */
+		return FL_MIN_LINGER*HZ;
+	return capable(CAP_NET_ADMIN) ? ttl*HZ : 0;	/* above the maximum: privileged only, 0 = refused */
+}
+
+static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires)
+{
+	linger = check_linger(linger);	/* both inputs are validated and scaled by check_linger() */
+	if (!linger)
+		return -EPERM;	/* check_linger() returned 0: value refused */
+	expires = check_linger(expires);
+	if (!expires)
+		return -EPERM;
+	fl->lastuse = jiffies;
+	if (time_before(fl->linger, linger))	/* linger only ever grows */
+		fl->linger = linger;
+	if (time_before(expires, fl->linger))	/* lifetime is at least the linger period */
+		expires = fl->linger;
+	if (time_before(fl->expires, fl->lastuse + expires))	/* expiry is only ever pushed later */
+		fl->expires = fl->lastuse + expires;
+	return 0;
+}
+
+static struct ip6_flowlabel *
+fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval,
+	  int optlen, int *err_p)
+{
+	struct ip6_flowlabel *fl = NULL;
+	int olen;
+	int addr_type;
+	int err;
+	/* Build a new, not yet linked label from freq. Returns it with users == 1, or NULL with *err_p set. */
+	olen = optlen - CMSG_ALIGN(sizeof(*freq));	/* bytes of option data following the request header */
+	err = -EINVAL;
+	if (olen > 64 * 1024)
+		goto done;
+
+	err = -ENOMEM;
+	fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+	if (fl == NULL)
+		goto done;
+
+	if (olen > 0) {
+		struct msghdr msg;
+		struct flowi6 flowi6;
+		int junk;
+
+		err = -ENOMEM;
+		fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL);	/* header plus room for the raw option bytes */
+		if (fl->opt == NULL)
+			goto done;
+
+		memset(fl->opt, 0, sizeof(*fl->opt));
+		fl->opt->tot_len = sizeof(*fl->opt) + olen;
+		err = -EFAULT;
+		if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen))
+			goto done;
+
+		msg.msg_controllen = olen;	/* only the control fields of msg are set before the call */
+		msg.msg_control = (void*)(fl->opt+1);
+		memset(&flowi6, 0, sizeof(flowi6));
+
+		err = datagram_send_ctl(net, &msg, &flowi6, fl->opt, &junk,
+					&junk, &junk);
+		if (err)
+			goto done;
+		err = -EINVAL;
+		if (fl->opt->opt_flen)	/* a non-zero opt_flen is rejected for a label */
+			goto done;
+		if (fl->opt->opt_nflen == 0) {	/* nothing usable was parsed: keep no option block */
+			kfree(fl->opt);
+			fl->opt = NULL;
+		}
+	}
+
+	fl->fl_net = hold_net(net);
+	fl->expires = jiffies;
+	err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
+	if (err)
+		goto done;
+	fl->share = freq->flr_share;
+	addr_type = ipv6_addr_type(&freq->flr_dst);
+	if ((addr_type & IPV6_ADDR_MAPPED) ||
+	    addr_type == IPV6_ADDR_ANY) {	/* mapped and unspecified destinations are refused */
+		err = -EINVAL;
+		goto done;
+	}
+	ipv6_addr_copy(&fl->dst, &freq->flr_dst);
+	atomic_set(&fl->users, 1);
+	switch (fl->share) {
+	case IPV6_FL_S_EXCL:
+	case IPV6_FL_S_ANY:
+		break;
+	case IPV6_FL_S_PROCESS:
+		fl->owner = current->pid;
+		break;
+	case IPV6_FL_S_USER:
+		fl->owner = current_euid();
+		break;
+	default:
+		err = -EINVAL;
+		goto done;
+	}
+	return fl;	/* success: *err_p is left untouched */
+
+done:
+	fl_free(fl);	/* fl may still be NULL here; fl_free() accepts that */
+	*err_p = err;
+	return NULL;
+}
+
+static int mem_check(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_fl_socklist *sfl;
+	int room = FL_MAX_SIZE - atomic_read(&fl_size);	/* free slots left in the global table */
+	int count = 0;
+	/* Decide whether sk may create one more label: returns 0 if allowed, -ENOBUFS if not. */
+	if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)	/* table nearly empty: no further checks */
+		return 0;
+
+	for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next)
+		count++;	/* labels this socket already holds */
+
+	if (room <= 0 ||	/* table full: refused for everybody */
+	    ((count >= FL_MAX_PER_SOCK ||
+	      (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
+	     !capable(CAP_NET_ADMIN)))	/* the softer limits apply only without CAP_NET_ADMIN */
+		return -ENOBUFS;
+
+	return 0;
+}
+
+static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2)
+{
+	/* 0 when both are the same pointer (including both NULL) or have equal contents. */
+	if (h1 == h2)
+		return 0;
+	if (!h1 || !h2 || h1->hdrlen != h2->hdrlen)
+		return 1;
+	/* Whole header is (hdrlen+1)*8 bytes; compare everything after the fixed part. */
+	return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1));
+}
+
+static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2)
+{
+	/* Returns 0 if opt_nflen and the hopopt/dst0opt/srcrt headers all match, 1 otherwise. */
+	if (o1 == o2)
+		return 0;
+	if (!o1 || !o2)
+		return 1;
+	if (o1->opt_nflen != o2->opt_nflen)
+		return 1;
+	/* Compare header by header; the first mismatch decides. */
+	if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt) ||
+	    ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt) ||
+	    ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt))
+		return 1;
+	return 0;
+}
+
+static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl,
+		struct ip6_flowlabel *fl)
+{
+	write_lock_bh(&ip6_sk_fl_lock);
+	sfl->fl = fl;	/* sfl becomes the socket's list entry for fl */
+	sfl->next = np->ipv6_fl_list;	/* push at the head of the socket's list */
+	np->ipv6_fl_list = sfl;
+	write_unlock_bh(&ip6_sk_fl_lock);
+}
+
+int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
+{
+	int uninitialized_var(err);
+	struct net *net = sock_net(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct in6_flowlabel_req freq;
+	struct ipv6_fl_socklist *sfl1=NULL;
+	struct ipv6_fl_socklist *sfl, **sflp;
+	struct ip6_flowlabel *fl, *fl1 = NULL;
+	/* Handle a flow-label request copied from user space; dispatches on freq.flr_action. */
+
+	if (optlen < sizeof(freq))
+		return -EINVAL;
+
+	if (copy_from_user(&freq, optval, sizeof(freq)))
+		return -EFAULT;
+
+	switch (freq.flr_action) {
+	case IPV6_FL_A_PUT:	/* drop this socket's entry for the label */
+		write_lock_bh(&ip6_sk_fl_lock);
+		for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) {
+			if (sfl->fl->label == freq.flr_label) {
+				if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
+					np->flow_label &= ~IPV6_FLOWLABEL_MASK;	/* socket was using this label: clear it */
+				*sflp = sfl->next;
+				write_unlock_bh(&ip6_sk_fl_lock);
+				fl_release(sfl->fl);	/* done after unlocking; fl_release() takes ip6_fl_lock itself */
+				kfree(sfl);
+				return 0;
+			}
+		}
+		write_unlock_bh(&ip6_sk_fl_lock);
+		return -ESRCH;
+
+	case IPV6_FL_A_RENEW:	/* extend linger/expiry of an existing label */
+		read_lock_bh(&ip6_sk_fl_lock);
+		for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) {
+			if (sfl->fl->label == freq.flr_label) {
+				err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires);
+				read_unlock_bh(&ip6_sk_fl_lock);
+				return err;
+			}
+		}
+		read_unlock_bh(&ip6_sk_fl_lock);
+
+		if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) {	/* privileged: may renew a label not on this socket */
+			fl = fl_lookup(net, freq.flr_label);
+			if (fl) {
+				err = fl6_renew(fl, freq.flr_linger, freq.flr_expires);
+				fl_release(fl);	/* drop the reference fl_lookup() took */
+				return err;
+			}
+		}
+		return -ESRCH;
+
+	case IPV6_FL_A_GET:	/* attach to an existing label or create a new one */
+		if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
+			return -EINVAL;
+
+		fl = fl_create(net, &freq, optval, optlen, &err);
+		if (fl == NULL)
+			return err;
+		sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);	/* may be NULL; checked below before it is used */
+
+		if (freq.flr_label) {
+			err = -EEXIST;
+			read_lock_bh(&ip6_sk_fl_lock);
+			for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) {
+				if (sfl->fl->label == freq.flr_label) {
+					if (freq.flr_flags&IPV6_FL_F_EXCL) {
+						read_unlock_bh(&ip6_sk_fl_lock);
+						goto done;
+					}
+					fl1 = sfl->fl;
+					atomic_inc(&fl1->users);
+					break;
+				}
+			}
+			read_unlock_bh(&ip6_sk_fl_lock);
+
+			if (fl1 == NULL)
+				fl1 = fl_lookup(net, freq.flr_label);	/* not on this socket: try the global table */
+			if (fl1) {
+recheck:			/* fl1 exists: the request (fl) must be compatible with it to share it */
+				err = -EEXIST;
+				if (freq.flr_flags&IPV6_FL_F_EXCL)
+					goto release;
+				err = -EPERM;
+				if (fl1->share == IPV6_FL_S_EXCL ||
+				    fl1->share != fl->share ||
+				    fl1->owner != fl->owner)
+					goto release;
+
+				err = -EINVAL;
+				if (!ipv6_addr_equal(&fl1->dst, &fl->dst) ||
+				    ipv6_opt_cmp(fl1->opt, fl->opt))
+					goto release;
+
+				err = -ENOMEM;
+				if (sfl1 == NULL)
+					goto release;
+				if (fl->linger > fl1->linger)
+					fl1->linger = fl->linger;
+				if ((long)(fl->expires - fl1->expires) > 0)
+					fl1->expires = fl->expires;
+				fl_link(np, sfl1, fl1);	/* the reference taken on fl1 above now belongs to the socket list */
+				fl_free(fl);	/* the template built by fl_create() is no longer needed */
+				return 0;
+
+release:
+				fl_release(fl1);
+				goto done;
+			}
+		}
+		err = -ENOENT;
+		if (!(freq.flr_flags&IPV6_FL_F_CREATE))
+			goto done;
+
+		err = -ENOMEM;
+		if (sfl1 == NULL || (err = mem_check(sk)) != 0)
+			goto done;
+
+		fl1 = fl_intern(net, fl, freq.flr_label);
+		if (fl1 != NULL)	/* lost a race: the label appeared meanwhile, re-run the sharing checks */
+			goto recheck;
+
+		if (!freq.flr_label) {	/* label was chosen by fl_intern(): report it back to user space */
+			if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
+					 &fl->label, sizeof(fl->label))) {
+				/* Intentionally ignore fault. */
+			}
+		}
+
+		fl_link(np, sfl1, fl);
+		return 0;
+
+	default:
+		return -EINVAL;
+	}
+
+done:	/* failure exit of the GET path: fl was never linked, so it is simply freed */
+	fl_free(fl);
+	kfree(sfl1);
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS
+
+struct ip6fl_iter_state {
+	struct seq_net_private p;
+	int bucket;	/* index into fl_ht of the bucket the iterator is currently in */
+};
+
+#define ip6fl_seq_private(seq)	((struct ip6fl_iter_state *)(seq)->private)	/* per-open iterator state */
+
+static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq)
+{
+	struct ip6fl_iter_state *it = ip6fl_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+	struct ip6_flowlabel *cur = NULL;
+
+	/* Scan buckets in order; it->bucket is left at the bucket of the hit, if any. */
+	for (it->bucket = 0; it->bucket <= FL_HASH_MASK; ++it->bucket) {
+		for (cur = fl_ht[it->bucket]; cur; cur = cur->next) {
+			if (net_eq(cur->fl_net, net))
+				return cur;
+		}
+	}
+
+	return cur;
+}
+
+static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl)
+{
+	struct ip6fl_iter_state *it = ip6fl_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+
+	/*
+	 * Continue after fl: finish the current chain, then try the following
+	 * buckets until an entry of this namespace turns up or none are left.
+	 */
+	fl = fl->next;
+	for (;;) {
+		while (fl && !net_eq(fl->fl_net, net))
+			fl = fl->next;
+		if (fl || ++it->bucket > FL_HASH_MASK)
+			break;
+		fl = fl_ht[it->bucket];
+	}
+	return fl;
+}
+
+static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip6_flowlabel *fl;
+	/* Advance pos entries from the first one; NULL if the table holds fewer than that. */
+	for (fl = ip6fl_get_first(seq); fl && pos; --pos)
+		fl = ip6fl_get_next(seq, fl);
+	return pos ? NULL : fl;
+}
+
+static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(ip6_fl_lock)
+{
+	read_lock_bh(&ip6_fl_lock);	/* released again in ip6fl_seq_stop() */
+	return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;	/* position 0 is the header line, entries start at 1 */
+}
+
+static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip6_flowlabel *fl;
+	/* Step from the header token to the first entry, or from one entry to the next. */
+	if (v == SEQ_START_TOKEN)
+		fl = ip6fl_get_first(seq);
+	else
+		fl = ip6fl_get_next(seq, v);
+	++*pos;
+	return fl;
+}
+
+static void ip6fl_seq_stop(struct seq_file *seq, void *v)
+	__releases(ip6_fl_lock)
+{
+	read_unlock_bh(&ip6_fl_lock);	/* pairs with the read_lock_bh() in ip6fl_seq_start() */
+}
+
+static int ip6fl_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)	/* first record: column headings */
+		seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n",
+			   "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt");
+	else {
+		struct ip6_flowlabel *fl = v;
+		seq_printf(seq,	/* NOTE(review): signed conversions are used for the unsigned owner and linger values - confirm this is intended */
+			   "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n",
+			   (unsigned)ntohl(fl->label),
+			   fl->share,
+			   (unsigned)fl->owner,
+			   atomic_read(&fl->users),
+			   fl->linger/HZ,	/* jiffies converted back to seconds */
+			   (long)(fl->expires - jiffies)/HZ,	/* seconds until expiry */
+			   &fl->dst,
+			   fl->opt ? fl->opt->opt_nflen : 0);
+	}
+	return 0;
+}
+
+static const struct seq_operations ip6fl_seq_ops = {	/* iterator handed to seq_open_net() in ip6fl_seq_open() */
+	.start	=	ip6fl_seq_start,
+	.next	=	ip6fl_seq_next,
+	.stop	=	ip6fl_seq_stop,
+	.show	=	ip6fl_seq_show,
+};
+
+static int ip6fl_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ip6fl_seq_ops,
+			    sizeof(struct ip6fl_iter_state));	/* size of the private state later read via ip6fl_seq_private() */
+}
+
+static const struct file_operations ip6fl_seq_fops = {	/* file operations of the "ip6_flowlabel" proc entry */
+	.owner		=	THIS_MODULE,
+	.open		=	ip6fl_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+static int __net_init ip6_flowlabel_proc_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "ip6_flowlabel",
+				  S_IRUGO, &ip6fl_seq_fops))	/* read-only entry in this namespace's proc net directory */
+		return -ENOMEM;
+	return 0;
+}
+
+static void __net_exit ip6_flowlabel_proc_fini(struct net *net)
+{
+	proc_net_remove(net, "ip6_flowlabel");	/* removes the entry created by ip6_flowlabel_proc_init() */
+}
+#else
+static inline int ip6_flowlabel_proc_init(struct net *net)
+{
+	return 0;	/* !CONFIG_PROC_FS: nothing to create, always succeeds */
+}
+static inline void ip6_flowlabel_proc_fini(struct net *net)
+{	/* !CONFIG_PROC_FS: nothing to remove */
+}
+#endif
+
+static void __net_exit ip6_flowlabel_net_exit(struct net *net)
+{
+	ip6_fl_purge(net);	/* free this namespace's unused labels */
+	ip6_flowlabel_proc_fini(net);
+}
+
+static struct pernet_operations ip6_flowlabel_net_ops = {	/* registered from ip6_flowlabel_init() */
+	.init = ip6_flowlabel_proc_init,
+	.exit = ip6_flowlabel_net_exit,
+};
+
+int ip6_flowlabel_init(void)
+{
+	return register_pernet_subsys(&ip6_flowlabel_net_ops);	/* returns whatever the registration returns */
+}
+
+void ip6_flowlabel_cleanup(void)
+{
+	del_timer(&ip6_fl_gc_timer);	/* NOTE(review): not the _sync variant - confirm the GC callback cannot still be running here */
+	unregister_pernet_subsys(&ip6_flowlabel_net_ops);
+}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
new file mode 100644
index 00000000..027c7ff6
--- /dev/null
+++ b/net/ipv6/ip6_input.c
@@ -0,0 +1,335 @@
+/*
+ *	IPv6 input
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *	Ian P. Morris		<I.P.Morris@soton.ac.uk>
+ *
+ *	Based on linux/net/ipv4/ip_input.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+/* Changes
+ *
+ * 	Mitsuru KANDA @USAGI and
+ * 	YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+
+
+
+inline int ip6_rcv_finish(struct sk_buff *skb)
+{
+	/* Look up a route only when no dst entry is attached yet. */
+	if (!skb_dst(skb))
+		ip6_route_input(skb);
+	return dst_input(skb);
+}
+
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	const struct ipv6hdr *hdr;
+	u32 		pkt_len;
+	struct inet6_dev *idev;
+	struct net *net = dev_net(skb->dev);
+	/* Validate a received IPv6 frame, then pass it to the PRE_ROUTING hook. */
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	rcu_read_lock();
+
+	idev = __in6_dev_get(skb->dev);
+
+	IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len);
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
+	    !idev || unlikely(idev->cnf.disable_ipv6)) {
+		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
+		goto drop;	/* skb may be NULL here; it is still passed to kfree_skb() at drop */
+	}
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+
+	/*
+	 * Store incoming device index. When the packet will
+	 * be queued, we cannot refer to skb->dev anymore.
+	 *
+	 * BTW, when we send a packet for our own local address on a
+	 * non-loopback interface (e.g. ethX), it is being delivered
+	 * via the loopback interface (lo) here; skb->dev = loopback_dev.
+	 * It, however, should be considered as if it is being
+	 * arrived via the sending interface (ethX), because of the
+	 * nature of scoping architecture. --yoshfuji
+	 */
+	IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
+		goto err;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto err;
+
+	/*
+	 * RFC4291 2.5.3
+	 * A packet received on an interface with a destination address
+	 * of loopback must be dropped.
+	 */
+	if (!(dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_loopback(&hdr->daddr))
+		goto err;
+
+	skb->transport_header = skb->network_header + sizeof(*hdr);
+	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	/* pkt_len may be zero if Jumbo payload option is present */
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
+			IP6_INC_STATS_BH(net,
+					 idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+			goto drop;
+		}
+		hdr = ipv6_hdr(skb);	/* re-read the header pointer after the trim */
+	}
+
+	if (hdr->nexthdr == NEXTHDR_HOP) {
+		if (ipv6_parse_hopopts(skb) < 0) {
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+			rcu_read_unlock();
+			return NET_RX_DROP;	/* NOTE(review): skb is not freed here; presumably ipv6_parse_hopopts() consumed it - confirm */
+		}
+	}
+
+	rcu_read_unlock();
+
+	/* Must drop socket now because of tproxy. */
+	skb_orphan(skb);
+
+	return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL,
+		       ip6_rcv_finish);
+err:
+	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ *	Deliver the packet to the host
+ */
+
+
+static int ip6_input_finish(struct sk_buff *skb)
+{
+	const struct inet6_protocol *ipprot;
+	unsigned int nhoff;
+	int nexthdr, raw;
+	u8 hash;
+	struct inet6_dev *idev;
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	/* Hand the skb to raw sockets and to the inet6_protos[] handler; always returns 0. */
+	/*
+	 *	Parse extension headers
+	 */
+
+	rcu_read_lock();
+resubmit:
+	idev = ip6_dst_idev(skb_dst(skb));
+	if (!pskb_pull(skb, skb_transport_offset(skb)))
+		goto discard;
+	nhoff = IP6CB(skb)->nhoff;
+	nexthdr = skb_network_header(skb)[nhoff];	/* protocol number read from the packet at nhoff */
+
+	raw = raw6_local_deliver(skb, nexthdr);
+
+	hash = nexthdr & (MAX_INET_PROTOS - 1);
+	if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
+		int ret;
+
+		if (ipprot->flags & INET6_PROTO_FINAL) {
+			const struct ipv6hdr *hdr;
+
+			/* Free reference early: we don't need it any more,
+			   and it may hold ip_conntrack module loaded
+			   indefinitely. */
+			nf_reset(skb);
+
+			skb_postpull_rcsum(skb, skb_network_header(skb),
+					   skb_network_header_len(skb));
+			hdr = ipv6_hdr(skb);
+			if (ipv6_addr_is_multicast(&hdr->daddr) &&
+			    !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
+			    &hdr->saddr) &&
+			    !ipv6_is_mld(skb, nexthdr))
+				goto discard;
+		}
+		if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
+		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto discard;
+
+		ret = ipprot->handler(skb);
+		if (ret > 0)
+			goto resubmit;	/* positive return: run the lookup again for this skb */
+		else if (ret == 0)
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
+	} else {
+		if (!raw) {	/* no handler and no raw socket took it */
+			if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				IP6_INC_STATS_BH(net, idev,
+						 IPSTATS_MIB_INUNKNOWNPROTOS);
+				icmpv6_send(skb, ICMPV6_PARAMPROB,
+					    ICMPV6_UNK_NEXTHDR, nhoff);
+			}
+		} else
+			IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS);
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+	return 0;
+
+discard:
+	IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS);
+	rcu_read_unlock();
+	kfree_skb(skb);
+	return 0;
+}
+
+
+int ip6_input(struct sk_buff *skb)
+{	/* Local delivery: run the LOCAL_IN hook with ip6_input_finish() as its okfn. */
+	return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
+		       ip6_input_finish);
+}
+
+int ip6_mc_input(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	int deliver;
+	/* Multicast input: count it, feed ip6_mr_input() when forwarding applies, deliver locally if 'deliver' is set. */
+	IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev),
+			 ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
+			 skb->len);
+
+	hdr = ipv6_hdr(skb);
+	deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+
+#ifdef CONFIG_IPV6_MROUTE
+	/*
+	 *      IPv6 multicast router mode is now supported ;)
+	 */
+	if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+	    !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) &&
+	    likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
+		/*
+		 * Okay, we try to forward - split and duplicate
+		 * packets.
+		 */
+		struct sk_buff *skb2;
+		struct inet6_skb_parm *opt = IP6CB(skb);
+
+		/* Check for MLD */
+		if (unlikely(opt->ra)) {
+			/* Check if this is a mld message */
+			u8 *ptr = skb_network_header(skb) + opt->ra;
+			struct icmp6hdr *icmp6;
+			u8 nexthdr = hdr->nexthdr;
+			int offset;
+
+			/* Check if the value of Router Alert
+			 * is for MLD (0x0000).
+			 */
+			if ((ptr[2] | ptr[3]) == 0) {
+				deliver = 0;	/* only the ICMPv6 types listed below re-enable delivery */
+
+				if (!ipv6_ext_hdr(nexthdr)) {
+					/* BUG */
+					goto out;
+				}
+				offset = ipv6_skip_exthdr(skb, sizeof(*hdr),
+							  &nexthdr);
+				if (offset < 0)
+					goto out;
+
+				if (nexthdr != IPPROTO_ICMPV6)
+					goto out;
+
+				if (!pskb_may_pull(skb, (skb_network_header(skb) +
+						   offset + 1 - skb->data)))
+					goto out;
+
+				icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
+
+				switch (icmp6->icmp6_type) {
+				case ICMPV6_MGM_QUERY:
+				case ICMPV6_MGM_REPORT:
+				case ICMPV6_MGM_REDUCTION:
+				case ICMPV6_MLD2_REPORT:
+					deliver = 1;
+					break;
+				}
+				goto out;
+			}
+			/* unknown RA - process it normally */
+		}
+
+		if (deliver)
+			skb2 = skb_clone(skb, GFP_ATOMIC);
+		else {
+			skb2 = skb;
+			skb = NULL;	/* ownership moves to skb2; kfree_skb(NULL) is reached below */
+		}
+
+		if (skb2) {
+			ip6_mr_input(skb2);
+		}
+	}
+out:
+#endif
+	if (likely(deliver))
+		ip6_input(skb);
+	else {
+		/* discard */
+		kfree_skb(skb);
+	}
+
+	return 0;
+}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
new file mode 100644
index 00000000..ae9f6d43
--- /dev/null
+++ b/net/ipv6/ip6_output.c
@@ -0,0 +1,1690 @@
+/*
+ *	IPv6 output functions
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on linux/net/ipv4/ip_output.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Changes:
+ *	A.N.Kuznetsov	:	arithmetics in fragmentation.
+ *				extension headers are implemented.
+ *				route changes now work.
+ *				ip6_forward does not confuse sniffers.
+ *				etc.
+ *
+ *      H. von Brand    :       Added missing #include <linux/string.h>
+ *	Imran Patel	: 	frag id should be in NBO
+ *      Kazunori MIYAZAWA @USAGI
+ *			:       add ip6_append_data and related functions
+ *				for datagram xmit
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/tcp.h>
+#include <linux/route.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/rawv6.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/checksum.h>
+#include <linux/mroute6.h>
+
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));	/* forward declaration; defined later in this file */
+
+int __ip6_local_out(struct sk_buff *skb)
+{
+	/* Payload length excludes the fixed IPv6 header. */
+	int payload = skb->len - sizeof(struct ipv6hdr);
+
+	/* Lengths above IPV6_MAXPLEN are written to the header as 0. */
+	ipv6_hdr(skb)->payload_len = htons(payload > IPV6_MAXPLEN ? 0 : payload);
+
+	/* Run the IPv6 LOCAL_OUT netfilter hook towards the dst's device. */
+	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
+		       skb_dst(skb)->dev, dst_output);
+}
+
+int ip6_local_out(struct sk_buff *skb)
+{
+	int verdict = __ip6_local_out(skb);
+
+	/* Only a hook result of 1 continues to dst_output(); any other
+	 * value is handed back to the caller unchanged. */
+	if (unlikely(verdict != 1))
+		return verdict;
+	return dst_output(skb);
+}
+EXPORT_SYMBOL_GPL(ip6_local_out);
+
+/* dev_loopback_xmit for use with netfilter. */
+static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
+{
+	skb_reset_mac_header(newskb);
+	__skb_pull(newskb, skb_network_offset(newskb));	/* advance data to the network header */
+	newskb->pkt_type = PACKET_LOOPBACK;
+	newskb->ip_summed = CHECKSUM_UNNECESSARY;
+	WARN_ON(!skb_dst(newskb));
+	/* Always reports success: the return value of netif_rx_ni() is ignored. */
+	netif_rx_ni(newskb);
+	return 0;
+}
+
+static int ip6_finish_output2(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *dev = dst->dev;
+	struct neighbour *neigh;
+	int res;
+	/* Final output step: loop multicast back when required, then send via the cached hh or the neighbour. */
+	skb->protocol = htons(ETH_P_IPV6);
+	skb->dev = dev;
+
+	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
+		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+
+		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
+		    ((mroute6_socket(dev_net(dev), skb) &&
+		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
+		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
+					 &ipv6_hdr(skb)->saddr))) {
+			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+
+			/* Do not check for IFF_ALLMULTI; multicast routing
+			   is not supported in any case.
+			 */
+			if (newskb)
+				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+					newskb, NULL, newskb->dev,
+					ip6_dev_loopback_xmit);
+
+			if (ipv6_hdr(skb)->hop_limit == 0) {	/* hop limit 0: only the looped copy is kept */
+				IP6_INC_STATS(dev_net(dev), idev,
+					      IPSTATS_MIB_OUTDISCARDS);
+				kfree_skb(skb);
+				return 0;
+			}
+		}
+
+		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
+				skb->len);
+	}
+
+	rcu_read_lock();
+	if (dst->hh) {
+		res = neigh_hh_output(dst->hh, skb);
+
+		rcu_read_unlock();
+		return res;
+	} else {
+		neigh = dst_get_neighbour(dst);
+		if (neigh) {
+			res = neigh->output(skb);
+
+			rcu_read_unlock();
+			return res;
+		}
+		rcu_read_unlock();
+	}
+
+	IP6_INC_STATS_BH(dev_net(dst->dev),	/* neither hh nor neighbour: count, free, fail */
+			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static int ip6_finish_output(struct sk_buff *skb)
+{
+	/* Fragment if over the dst MTU (non-GSO) or if dst wants allfrag. */
+	int too_big = skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb);
+	if (!too_big && !dst_allfrag(skb_dst(skb)))
+		return ip6_finish_output2(skb);
+	return ip6_fragment(skb, ip6_finish_output2);
+}
+
+int ip6_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *odev = dst->dev;
+	struct inet6_dev *in6 = ip6_dst_idev(dst);
+	/* IPv6 disabled on the dst's inet6_dev: count a discard and drop. */
+	if (unlikely(in6->cnf.disable_ipv6)) {
+		IP6_INC_STATS(dev_net(odev), in6, IPSTATS_MIB_OUTDISCARDS);
+		kfree_skb(skb);
+		return 0;
+	}
+	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL,
+			    odev, ip6_finish_output,
+			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+}
+
+/*
+ *	xmit an sk_buff (used by TCP, SCTP and DCCP)
+ */
+
+int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
+	     struct ipv6_txoptions *opt)
+{
+	struct net *net = sock_net(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct in6_addr *first_hop = &fl6->daddr;
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *hdr;
+	u8  proto = fl6->flowi6_proto;
+	int seg_len = skb->len;
+	int hlimit = -1;
+	int tclass = 0;
+	u32 mtu;
+	/* Push extension headers (if any) and the IPv6 header, then send through the LOCAL_OUT hook. */
+	if (opt) {
+		unsigned int head_room;
+
+		/* First: exthdrs may take lots of space (~8K for now)
+		   MAX_HEADER is not enough.
+		 */
+		head_room = opt->opt_nflen + opt->opt_flen;
+		seg_len += head_room;
+		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
+
+		if (skb_headroom(skb) < head_room) {
+			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
+			if (skb2 == NULL) {
+				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+					      IPSTATS_MIB_OUTDISCARDS);
+				kfree_skb(skb);
+				return -ENOBUFS;
+			}
+			kfree_skb(skb);	/* the reallocated copy replaces the original */
+			skb = skb2;
+			skb_set_owner_w(skb, sk);
+		}
+		if (opt->opt_flen)
+			ipv6_push_frag_opts(skb, opt, &proto);
+		if (opt->opt_nflen)
+			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+	}
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	hdr = ipv6_hdr(skb);
+
+	/*
+	 *	Fill in the IPv6 header
+	 */
+	if (np) {
+		tclass = np->tclass;
+		hlimit = np->hop_limit;
+	}
+	if (hlimit < 0)	/* no socket value: take the hop limit from the dst */
+		hlimit = ip6_dst_hoplimit(dst);
+
+	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
+
+	hdr->payload_len = htons(seg_len);
+	hdr->nexthdr = proto;
+	hdr->hop_limit = hlimit;
+
+	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
+	ipv6_addr_copy(&hdr->daddr, first_hop);
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+
+	mtu = dst_mtu(dst);
+	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
+		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
+			      IPSTATS_MIB_OUT, skb->len);
+		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
+			       dst->dev, dst_output);
+	}
+
+	if (net_ratelimit())	/* too big and may not be sent as is: report PKT_TOOBIG, fail with -EMSGSIZE */
+		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
+	skb->dev = dst->dev;
+	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
+	kfree_skb(skb);
+	return -EMSGSIZE;
+}
+
+EXPORT_SYMBOL(ip6_xmit);
+
+/*
+ *	To avoid extra problems ND packets are sent through this
+ *	routine. It's code duplication but I really want to avoid
+ *	extra checks since ipv6_build_header is used by TCP (which
+ *	is for us performance critical)
+ */
+
+int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
+	       const struct in6_addr *saddr, const struct in6_addr *daddr,
+	       int proto, int len)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6hdr *ip6h;
+
+	/* Tag the buffer as an IPv6 frame for the given device. */
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IPV6);
+	/* Mark the network header, then extend the buffer by one IPv6 header. */
+	skb_reset_network_header(skb);
+	skb_put(skb, sizeof(*ip6h));
+	ip6h = ipv6_hdr(skb);
+
+	/* First word carries only the version nibble (6). */
+	*(__be32 *)ip6h = htonl(0x60000000);
+	ip6h->hop_limit = np->hop_limit;
+	ip6h->nexthdr = proto;
+	ip6h->payload_len = htons(len);
+	/* Addresses are copied straight from the caller's arguments. */
+	ipv6_addr_copy(&ip6h->daddr, daddr);
+	ipv6_addr_copy(&ip6h->saddr, saddr);
+	return 0;
+}
+
+static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
+{
+	struct ip6_ra_chain *ra;
+	struct sock *pending = NULL;	/* matching socket not yet delivered to */
+
+	read_lock(&ip6_ra_lock);
+	for (ra = ip6_ra_chain; ra; ra = ra->next) {
+		struct sock *sk = ra->sk;
+		struct sk_buff *copy;
+
+		/* Need a socket with this selector, unbound or bound to skb->dev. */
+		if (!sk || ra->sel != sel)
+			continue;
+		if (sk->sk_bound_dev_if &&
+		    sk->sk_bound_dev_if != skb->dev->ifindex)
+			continue;
+		/* Earlier matches get clones; the last one gets the original. */
+		copy = pending ? skb_clone(skb, GFP_ATOMIC) : NULL;
+		if (copy)
+			rawv6_rcv(pending, copy);
+		pending = sk;
+	}
+	if (pending)
+		rawv6_rcv(pending, skb);
+	read_unlock(&ip6_ra_lock);
+	/* 1 when the skb was handed to a socket, 0 when nobody matched. */
+	return pending ? 1 : 0;
+}
+
+static int ip6_forward_proxy_check(struct sk_buff *skb)
+{
+	struct ipv6hdr *hdr = ipv6_hdr(skb);
+	u8 nexthdr = hdr->nexthdr;
+	int offset;
+	/* Returns 1 for the NDISC types listed below, -1 for a link-local destination, 0 otherwise. */
+	if (ipv6_ext_hdr(nexthdr)) {
+		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
+		if (offset < 0)
+			return 0;
+	} else
+		offset = sizeof(struct ipv6hdr);
+
+	if (nexthdr == IPPROTO_ICMPV6) {
+		struct icmp6hdr *icmp6;
+
+		if (!pskb_may_pull(skb, (skb_network_header(skb) +
+					 offset + 1 - skb->data)))	/* need the ICMPv6 type byte in the linear area */
+			return 0;
+
+		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
+
+		switch (icmp6->icmp6_type) {
+		case NDISC_ROUTER_SOLICITATION:
+		case NDISC_ROUTER_ADVERTISEMENT:
+		case NDISC_NEIGHBOUR_SOLICITATION:
+		case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		case NDISC_REDIRECT:
+			/* For reaction involving unicast neighbor discovery
+			 * message destined to the proxied address, pass it to
+			 * input function.
+			 */
+			return 1;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * The proxying router can't forward traffic sent to a link-local
+	 * address, so signal the sender and discard the packet. This
+	 * behavior is clarified by the MIPv6 specification.
+	 */
+	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
+		dst_link_failure(skb);
+		return -1;
+	}
+
+	return 0;
+}
+
+static inline int ip6_forward_finish(struct sk_buff *skb)
+{
+	return dst_output(skb);	/* okfn of the FORWARD hook in ip6_forward() */
+}
+
+int ip6_forward(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct inet6_skb_parm *opt = IP6CB(skb);
+	struct net *net = dev_net(dst->dev);
+	struct neighbour *n;
+	u32 mtu;
+	/* Forward one packet: policy, hop limit, proxy-NDP, redirect and MTU checks, then the FORWARD hook. */
+	if (net->ipv6.devconf_all->forwarding == 0)
+		goto error;
+
+	if (skb_warn_if_lro(skb))
+		goto drop;
+
+	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
+	skb_forward_csum(skb);
+
+	/*
+	 *	We DO NOT make any processing on
+	 *	RA packets, pushing them to user level AS IS
+	 *	without any WARRANTY that application will be able
+	 *	to interpret them. The reason is that we
+	 *	cannot make anything clever here.
+	 *
+	 *	We are not end-node, so that if packet contains
+	 *	AH/ESP, we cannot make anything.
+	 *	Defragmentation also would be mistake, RA packets
+	 *	cannot be fragmented, because there is no warranty
+	 *	that different fragments will go along one path. --ANK
+	 */
+	if (opt->ra) {
+		u8 *ptr = skb_network_header(skb) + opt->ra;
+		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
+			return 0;
+	}
+
+	/*
+	 *	check and decrement ttl
+	 */
+	if (hdr->hop_limit <= 1) {
+		/* Force OUTPUT device used as source address */
+		skb->dev = dst->dev;
+		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+
+		kfree_skb(skb);
+		return -ETIMEDOUT;
+	}
+
+	/* XXX: idev->cnf.proxy_ndp? */
+	if (net->ipv6.devconf_all->proxy_ndp &&
+	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
+		int proxied = ip6_forward_proxy_check(skb);
+		if (proxied > 0)
+			return ip6_input(skb);
+		else if (proxied < 0) {
+			IP6_INC_STATS(net, ip6_dst_idev(dst),
+				      IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+
+	if (!xfrm6_route_forward(skb)) {
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+	dst = skb_dst(skb);	/* re-read: the dst is fetched again after xfrm6_route_forward() */
+
+	/* IPv6 specs say nothing about it, but it is clear that we cannot
+	   send redirects to source routed frames.
+	   We don't send redirects to frames decapsulated from IPsec.
+	 */
+	n = dst_get_neighbour(dst);
+	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
+		struct in6_addr *target = NULL;
+		struct rt6_info *rt;
+
+		/*
+		 *	incoming and outgoing devices are the same
+		 *	send a redirect.
+		 */
+
+		rt = (struct rt6_info *) dst;
+		if ((rt->rt6i_flags & RTF_GATEWAY))
+			target = (struct in6_addr*)&n->primary_key;
+		else
+			target = &hdr->daddr;
+
+		if (!rt->rt6i_peer)
+			rt6_bind_peer(rt, 1);
+
+		/* Limit redirects both by destination (here)
+		   and by source (inside ndisc_send_redirect)
+		 */
+		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
+			ndisc_send_redirect(skb, n, target);
+	} else {
+		int addrtype = ipv6_addr_type(&hdr->saddr);
+
+		/* This check is security critical. */
+		if (addrtype == IPV6_ADDR_ANY ||
+		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
+			goto error;
+		if (addrtype & IPV6_ADDR_LINKLOCAL) {
+			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
+				    ICMPV6_NOT_NEIGHBOUR, 0);
+			goto error;
+		}
+	}
+
+	mtu = dst_mtu(dst);
+	if (mtu < IPV6_MIN_MTU)
+		mtu = IPV6_MIN_MTU;
+
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		/* Again, force OUTPUT device used as source address */
+		skb->dev = dst->dev;
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	if (skb_cow(skb, dst->dev->hard_header_len)) {
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+		goto drop;
+	}
+
+	hdr = ipv6_hdr(skb);
+
+	/* Mangling hops number delayed to point after skb COW */
+
+	hdr->hop_limit--;
+
+	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
+		       ip6_forward_finish);
+
+error:
+	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+	to->pkt_type = from->pkt_type;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	skb_dst_drop(to);	/* drop any dst 'to' already holds before taking a new one */
+	skb_dst_set(to, dst_clone(skb_dst(from)));	/* 'to' gets its own reference on from's dst */
+	to->dev = from->dev;
+	to->mark = from->mark;
+	/* The remaining fields are copied only when the matching feature is configured. */
+#ifdef CONFIG_NET_SCHED
+	to->tc_index = from->tc_index;
+#endif
+	nf_copy(to, from);
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+	to->nf_trace = from->nf_trace;
+#endif
+	skb_copy_secmark(to, from);
+}
+
+int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
+{
+	unsigned int offset = sizeof(struct ipv6hdr);	/* wider than u16: summed header lengths must not wrap */
+	struct ipv6_opt_hdr *exthdr =
+				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
+	unsigned int packet_len = skb->tail - skb->network_header;
+	int found_rhdr = 0;
+	*nexthdr = &ipv6_hdr(skb)->nexthdr;
+	/* Returns the offset from the network header where a Fragment header goes; *nexthdr is the byte naming it. */
+	while (offset + 1 <= packet_len) {
+		/* Hop-by-hop, routing and some destination headers are stepped over. */
+		switch (**nexthdr) {
+
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+				break;
+#endif
+			if (found_rhdr)
+				return offset;
+			break;
+		default :
+			return offset;
+		}
+
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
+						 offset);
+	}
+
+	return offset;
+}
+
+static u32 hashidentrnd __read_mostly;	/* jhash2() seed, filled by initialize_hashidentrnd() */
+#define FID_HASH_SZ 16	/* number of counters in ipv6_fragmentation_id[] */
+static u32 ipv6_fragmentation_id[FID_HASH_SZ];	/* counters indexed by hash % FID_HASH_SZ */
+
+void __init initialize_hashidentrnd(void)
+{
+	get_random_bytes(&hashidentrnd, sizeof(hashidentrnd));	/* random seed later passed to jhash2() */
+}
+
+static u32 __ipv6_select_ident(const struct in6_addr *addr)
+{
+	u32 newid, oldid, hash = jhash2((u32 *)addr, 4, hashidentrnd);
+	u32 *pid = &ipv6_fragmentation_id[hash % FID_HASH_SZ];
+	/* Bump the bucket counter with a cmpxchg retry loop (no lock is taken here). */
+	do {
+		oldid = *pid;
+		newid = oldid + 1;
+		if (!(hash + newid))	/* skip the counter value that would make the result 0 */
+			newid++;
+	} while (cmpxchg(pid, oldid, newid) != oldid);
+
+	return hash + newid;
+}
+
+void ipv6_select_ident(struct frag_hdr *fhdr, struct in6_addr *addr)
+{
+	fhdr->identification = htonl(__ipv6_select_ident(addr));	/* stored in network byte order */
+}
+
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
+{
+	struct sk_buff *frag;
+	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
+	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+	struct ipv6hdr *tmp_hdr;
+	struct frag_hdr *fh;
+	unsigned int mtu, hlen, left, len;
+	__be32 frag_id = 0;
+	int ptr, offset = 0, err=0;
+	u8 *prevhdr, nexthdr = 0;
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	/* Split skb into fragments and pass each to output(); skb is consumed. Returns 0 or a negative errno. */
+	hlen = ip6_find_1stfragopt(skb, &prevhdr);
+	nexthdr = *prevhdr;
+
+	mtu = ip6_skb_dst_mtu(skb);
+
+	/* We must not fragment if the socket is set to force MTU discovery
+	 * or if the skb it not generated by a local socket.
+	 */
+	if (!skb->local_df && skb->len > mtu) {
+		skb->dev = skb_dst(skb)->dev;
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+			      IPSTATS_MIB_FRAGFAILS);
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	if (np && np->frag_size < mtu) {
+		if (np->frag_size)
+			mtu = np->frag_size;
+	}
+	mtu -= hlen + sizeof(struct frag_hdr);	/* from here on: payload bytes per fragment */
+
+	if (skb_has_frag_list(skb)) {
+		int first_len = skb_pagelen(skb);
+		struct sk_buff *frag2;
+
+		if (first_len - hlen > mtu ||
+		    ((first_len - hlen) & 7) ||
+		    skb_cloned(skb))
+			goto slow_path;
+
+		skb_walk_frags(skb, frag) {
+			/* Correct geometry. */
+			if (frag->len > mtu ||
+			    ((frag->len & 7) && frag->next) ||
+			    skb_headroom(frag) < hlen)
+				goto slow_path_clean;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag))
+				goto slow_path_clean;
+
+			BUG_ON(frag->sk);
+			if (skb->sk) {
+				frag->sk = skb->sk;
+				frag->destructor = sock_wfree;
+			}
+			skb->truesize -= frag->truesize;
+		}
+
+		err = 0;
+		offset = 0;
+		/* BUILD HEADER */
+
+		*prevhdr = NEXTHDR_FRAGMENT;
+		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+		if (!tmp_hdr) {
+			/* frag_list is still attached, so fail: frees head and frags */
+			err = -ENOMEM;
+			goto fail;
+		}
+		frag = skb_shinfo(skb)->frag_list;
+		skb_frag_list_init(skb);
+
+		__skb_pull(skb, hlen);
+		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
+		__skb_push(skb, hlen);
+		skb_reset_network_header(skb);
+		memcpy(skb_network_header(skb), tmp_hdr, hlen);
+
+		ipv6_select_ident(fh, &rt->rt6i_dst.addr);
+		fh->nexthdr = nexthdr;
+		fh->reserved = 0;
+		fh->frag_off = htons(IP6_MF);
+		frag_id = fh->identification;
+
+		first_len = skb_pagelen(skb);
+		skb->data_len = first_len - skb_headlen(skb);
+		skb->len = first_len;
+		ipv6_hdr(skb)->payload_len = htons(first_len -
+						   sizeof(struct ipv6hdr));
+
+		dst_hold(&rt->dst);
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down. */
+			if (frag) {
+				frag->ip_summed = CHECKSUM_NONE;
+				skb_reset_transport_header(frag);
+				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
+				__skb_push(frag, hlen);
+				skb_reset_network_header(frag);
+				memcpy(skb_network_header(frag), tmp_hdr,
+				       hlen);
+				offset += skb->len - hlen - sizeof(struct frag_hdr);
+				fh->nexthdr = nexthdr;
+				fh->reserved = 0;
+				fh->frag_off = htons(offset);
+				if (frag->next != NULL)
+					fh->frag_off |= htons(IP6_MF);
+				fh->identification = frag_id;
+				ipv6_hdr(frag)->payload_len =
+						htons(frag->len -
+						      sizeof(struct ipv6hdr));
+				ip6_copy_metadata(frag, skb);
+			}
+
+			err = output(skb);
+			if(!err)
+				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+					      IPSTATS_MIB_FRAGCREATES);
+
+			if (err || !frag)
+				break;
+
+			skb = frag;
+			frag = skb->next;
+			skb->next = NULL;
+		}
+
+		kfree(tmp_hdr);
+
+		if (err == 0) {
+			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+				      IPSTATS_MIB_FRAGOKS);
+			dst_release(&rt->dst);
+			return 0;
+		}
+
+		while (frag) {	/* output failed: free the fragments not yet sent */
+			skb = frag->next;
+			kfree_skb(frag);
+			frag = skb;
+		}
+
+		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+			      IPSTATS_MIB_FRAGFAILS);
+		dst_release(&rt->dst);
+		return err;
+
+slow_path_clean:
+		skb_walk_frags(skb, frag2) {	/* undo the sk/truesize changes made before the bad frag */
+			if (frag2 == frag)
+				break;
+			frag2->sk = NULL;
+			frag2->destructor = NULL;
+			skb->truesize += frag2->truesize;
+		}
+	}
+
+slow_path:
+	left = skb->len - hlen;		/* Space per frame */
+	ptr = hlen;			/* Where to start from */
+
+	/*
+	 *	Fragment the datagram.
+	 */
+
+	*prevhdr = NEXTHDR_FRAGMENT;
+
+	/*
+	 *	Keep copying data until we run out.
+	 */
+	while(left > 0)	{
+		len = left;
+		/* IF: it doesn't fit, use 'mtu' - the data space left */
+		if (len > mtu)
+			len = mtu;
+		/* IF: we are not sending up to and including the packet end
+		   then align the next start on an eight byte boundary */
+		if (len < left)	{
+			len &= ~7;
+		}
+		/*
+		 *	Allocate buffer.
+		 */
+
+		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
+			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
+			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+				      IPSTATS_MIB_FRAGFAILS);
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		/*
+		 *	Set up data on packet
+		 */
+
+		ip6_copy_metadata(frag, skb);
+		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
+		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
+		skb_reset_network_header(frag);
+		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
+		frag->transport_header = (frag->network_header + hlen +
+					  sizeof(struct frag_hdr));
+
+		/*
+		 *	Charge the memory for the fragment to any owner
+		 *	it might possess
+		 */
+		if (skb->sk)
+			skb_set_owner_w(frag, skb->sk);
+
+		/*
+		 *	Copy the packet header into the new buffer.
+		 */
+		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
+
+		/*
+		 *	Build fragment header.
+		 */
+		fh->nexthdr = nexthdr;
+		fh->reserved = 0;
+		if (!frag_id) {
+			ipv6_select_ident(fh, &rt->rt6i_dst.addr);
+			frag_id = fh->identification;
+		} else
+			fh->identification = frag_id;
+
+		/*
+		 *	Copy a block of the IP datagram.
+		 */
+		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
+			BUG();
+		left -= len;
+
+		fh->frag_off = htons(offset);
+		if (left > 0)
+			fh->frag_off |= htons(IP6_MF);
+		ipv6_hdr(frag)->payload_len = htons(frag->len -
+						    sizeof(struct ipv6hdr));
+
+		ptr += len;
+		offset += len;
+
+		/*
+		 *	Put this fragment into the sending queue.
+		 */
+		err = output(frag);
+		if (err)
+			goto fail;
+
+		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+			      IPSTATS_MIB_FRAGCREATES);
+	}
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		      IPSTATS_MIB_FRAGOKS);
+	kfree_skb(skb);
+	return err;
+
+fail:
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		      IPSTATS_MIB_FRAGFAILS);
+	kfree_skb(skb);
+	return err;
+}
+
+static inline int ip6_rt_check(const struct rt6key *rt_key,
+			       const struct in6_addr *fl_addr,
+			       const struct in6_addr *addr_cache)
+{	/* Non-zero when fl_addr matches neither a /128 route key nor the cached address. */
+	return !(rt_key->plen == 128 && ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+		!(addr_cache != NULL && ipv6_addr_equal(fl_addr, addr_cache));
+}
+
+static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
+					  struct dst_entry *dst,
+					  const struct flowi6 *fl6)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct rt6_info *rt = (struct rt6_info *)dst;
+	/* Returns dst if still usable for fl6; otherwise releases it and returns NULL (NULL in, NULL out). */
+	if (!dst)
+		goto out;
+
+	/* Yes, checking route validity in not connected
+	 * case is not very simple. Take into account,
+	 * that we do not support routing by source, TOS,
+	 * and MSG_DONTROUTE 		--ANK (980726)
+	 *
+	 * 1. ip6_rt_check(): If route was host route,
+	 *    check that cached destination is current.
+	 *    If it is network route, we still may
+	 *    check its validity using saved pointer
+	 *    to the last used address: daddr_cache.
+	 *    We do not want to save whole address now,
+	 *    (because main consumer of this service
+	 *    is tcp, which has not this problem),
+	 *    so that the last trick works only on connected
+	 *    sockets.
+	 * 2. oif also should be the same.
+	 */
+	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
+#ifdef CONFIG_IPV6_SUBTREES
+	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
+#endif
+	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+		dst_release(dst);	/* stale: give up the reference and report no route */
+		dst = NULL;
+	}
+
+out:
+	return dst;
+}
+
+/*
+ * Shared back end of the ip6_dst_lookup*() family.
+ *
+ * If *@dst is NULL a route is looked up for @fl6; otherwise the route
+ * passed in is used.  When the flow has no source address one is
+ * selected and written to fl6->saddr.
+ *
+ * Returns 0 with a usable route in *@dst, or a negative errno with the
+ * route released and *@dst set to NULL.
+ */
+static int ip6_dst_lookup_tail(struct sock *sk,
+			       struct dst_entry **dst, struct flowi6 *fl6)
+{
+	struct net *net = sock_net(sk);
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	struct neighbour *n;
+	int neigh_unresolved;
+#endif
+	int err;
+
+	if (*dst == NULL)
+		*dst = ip6_route_output(net, sk, fl6);
+
+	err = (*dst)->error;
+	if (err)
+		goto out_err_release;
+
+	if (ipv6_addr_any(&fl6->saddr)) {
+		struct rt6_info *rt = (struct rt6_info *) *dst;
+		unsigned int srcprefs = 0;
+
+		if (sk)
+			srcprefs = inet6_sk(sk)->srcprefs;
+
+		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+					  srcprefs, &fl6->saddr);
+		if (err)
+			goto out_err_release;
+	}
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	/*
+	 * If the route found has a neighbour entry that is not in a
+	 * NUD_VALID state and the flow's source address is marked
+	 * OPTIMISTIC, drop that route and use the route of the
+	 * next-hop (default) router instead.
+	 */
+	rcu_read_lock();
+	n = dst_get_neighbour(*dst);
+	neigh_unresolved = n && !(n->nud_state & NUD_VALID);
+	rcu_read_unlock();
+
+	if (neigh_unresolved) {
+		struct inet6_ifaddr *ifp;
+		struct flowi6 fl_gw6;
+		int redirect;
+
+		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
+				      (*dst)->dev, 1);
+
+		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
+		if (ifp)
+			in6_ifa_put(ifp);
+
+		if (redirect) {
+			/* Look up again with an unspecified destination
+			 * to obtain the default router's route.
+			 */
+			dst_release(*dst);
+			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
+			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
+			*dst = ip6_route_output(net, sk, &fl_gw6);
+			err = (*dst)->error;
+			if (err)
+				goto out_err_release;
+		}
+	}
+#endif
+
+	return 0;
+
+out_err_release:
+	if (err == -ENETUNREACH)
+		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
+	dst_release(*dst);
+	*dst = NULL;
+	return err;
+}
+
+/**
+ *	ip6_dst_lookup - perform route lookup on flow
+ *	@sk: socket which provides route info
+ *	@dst: pointer to dst_entry * for result
+ *	@fl6: flow to lookup
+ *
+ *	Looks up a fresh route for @fl6; whatever *@dst held on entry is
+ *	overwritten, never reused.
+ *
+ *	Returns zero on success, or a standard errno code on error (in
+ *	which case *@dst is NULL).
+ */
+int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
+{
+	int err;
+
+	/* Force a new lookup in the common tail. */
+	*dst = NULL;
+	err = ip6_dst_lookup_tail(sk, dst, fl6);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
+/**
+ *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ *	@sk: socket which provides route info
+ *	@fl6: flow to lookup
+ *	@final_dst: final destination address for ipsec lookup
+ *	@can_sleep: we are in a sleepable context
+ *
+ *	Performs a route lookup on the given flow, then hands the result
+ *	to xfrm_lookup().  When @final_dst is given, fl6->daddr is
+ *	replaced by it before the xfrm lookup.
+ *
+ *	Returns a valid dst pointer on success, or a pointer encoded
+ *	error code.
+ */
+struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+				      const struct in6_addr *final_dst,
+				      bool can_sleep)
+{
+	struct dst_entry *dst = NULL;
+	int err = ip6_dst_lookup_tail(sk, &dst, fl6);
+
+	if (err != 0)
+		return ERR_PTR(err);
+
+	if (final_dst != NULL)
+		ipv6_addr_copy(&fl6->daddr, final_dst);
+
+	if (can_sleep)
+		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
+
+	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
+
+/**
+ *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
+ *	@sk: socket which provides the dst cache and route info
+ *	@fl6: flow to lookup
+ *	@final_dst: final destination address for ipsec lookup
+ *	@can_sleep: we are in a sleepable context
+ *
+ *	Performs a route lookup on the given flow, reusing the route
+ *	cached in the socket when it is still valid for @fl6.
+ *	It will take the socket dst lock when operating on the dst cache.
+ *	As a result, this function can only be used in process context.
+ *
+ *	Returns a valid dst pointer on success, or a pointer encoded
+ *	error code.
+ */
+struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
+					 const struct in6_addr *final_dst,
+					 bool can_sleep)
+{
+	struct dst_entry *dst;
+	int err;
+
+	/* Fetch the socket's cached route, then vet it against the flow. */
+	dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
+	dst = ip6_sk_dst_check(sk, dst, fl6);
+
+	err = ip6_dst_lookup_tail(sk, &dst, fl6);
+	if (err != 0)
+		return ERR_PTR(err);
+
+	if (final_dst != NULL)
+		ipv6_addr_copy(&fl6->daddr, final_dst);
+
+	if (can_sleep)
+		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
+
+	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+}
+EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
+
+/*
+ * Append UDP payload to a single large skb for devices that advertise
+ * UDP fragmentation offload (NETIF_F_UFO).
+ *
+ * The skb at the tail of sk->sk_write_queue is reused when there is
+ * one; otherwise a new skb is allocated, its headers are laid out and,
+ * once the payload has been attached, it is queued.
+ *
+ * An skb taken from the queue is still owned by the queue: it must
+ * neither be queued a second time on success nor freed on failure.
+ * Only an skb allocated here is queued/freed by this function.
+ *
+ * Returns 0 on success or a negative errno; an allocation failure
+ * reports the error set by sock_alloc_send_skb().
+ */
+static inline int ip6_ufo_append_data(struct sock *sk,
+			int getfrag(void *from, char *to, int offset, int len,
+			int odd, struct sk_buff *skb),
+			void *from, int length, int hh_len, int fragheaderlen,
+			int transhdrlen, int mtu,unsigned int flags,
+			struct rt6_info *rt)
+
+{
+	struct sk_buff *skb;
+	struct frag_hdr fhdr;
+	int new_skb = 0;
+	int err;
+
+	/* There is support for UDP large send offload by network
+	 * device, so create one single skb packet containing complete
+	 * udp datagram
+	 */
+	skb = skb_peek_tail(&sk->sk_write_queue);
+	if (skb == NULL) {
+		skb = sock_alloc_send_skb(sk,
+			hh_len + fragheaderlen + transhdrlen + 20,
+			(flags & MSG_DONTWAIT), &err);
+		if (skb == NULL)
+			return err;
+
+		new_skb = 1;
+
+		/* reserve space for Hardware header */
+		skb_reserve(skb, hh_len);
+
+		/* create space for UDP/IP header */
+		skb_put(skb,fragheaderlen + transhdrlen);
+
+		/* initialize network header pointer */
+		skb_reset_network_header(skb);
+
+		/* initialize protocol header pointer */
+		skb->transport_header = skb->network_header + fragheaderlen;
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum = 0;
+	}
+
+	err = skb_append_datato_frags(sk,skb, getfrag, from,
+				      (length - transhdrlen));
+	if (err) {
+		/* There is not enough support to do UDP LSO, so the
+		 * caller follows the normal path.  Free only an skb that
+		 * was allocated above; a queued one stays on the queue.
+		 */
+		if (new_skb)
+			kfree_skb(skb);
+		return err;
+	}
+
+	/* Specify the length of each IPv6 datagram fragment.
+	 * It has to be a multiple of 8.
+	 */
+	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
+				     sizeof(struct frag_hdr)) & ~7;
+	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+	ipv6_select_ident(&fhdr, &rt->rt6i_dst.addr);
+	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
+
+	/* An skb peeked from the queue is already linked in. */
+	if (new_skb)
+		__skb_queue_tail(&sk->sk_write_queue, skb);
+
+	return 0;
+}
+
+/*
+ * Duplicate an IPv6 option header of (hdrlen + 1) * 8 bytes with
+ * kmemdup().  A NULL @src yields NULL; so does an allocation failure.
+ */
+static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
+					       gfp_t gfp)
+{
+	if (src == NULL)
+		return NULL;
+
+	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
+}
+
+/*
+ * Duplicate an IPv6 routing header of (hdrlen + 1) * 8 bytes with
+ * kmemdup().  A NULL @src yields NULL; so does an allocation failure.
+ */
+static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
+						gfp_t gfp)
+{
+	if (src == NULL)
+		return NULL;
+
+	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
+}
+
+/*
+ * Recompute *@mtu and *@maxfraglen while ip6_append_data() builds
+ * fragments.  Nothing is changed for routes flagged DST_XFRM_TUNNEL.
+ *
+ * @skb == NULL means the first fragment is being built: the route's
+ * header_len is taken off the mtu.  For any later fragment the mtu is
+ * reloaded from the path, the header space counting as data space.
+ */
+static void ip6_append_data_mtu(int *mtu,
+				int *maxfraglen,
+				unsigned int fragheaderlen,
+				struct sk_buff *skb,
+				struct rt6_info *rt)
+{
+	if (rt->dst.flags & DST_XFRM_TUNNEL)
+		return;
+
+	if (skb != NULL) {
+		/* not the first fragment */
+		*mtu = dst_mtu(rt->dst.path);
+	} else {
+		/* first fragment, reserve header_len */
+		*mtu -= rt->dst.header_len;
+	}
+
+	*maxfraglen = ((*mtu - fragheaderlen) & ~7)
+		      + fragheaderlen - sizeof(struct frag_hdr);
+}
+
+/*
+ * ip6_append_data - queue data on a socket for later transmission
+ *
+ * Copies @length bytes obtained through @getfrag(@from, ...) into skbs
+ * on sk->sk_write_queue, splitting them at fragment boundaries.  The
+ * queue is later sent by ip6_push_pending_frames() or dropped by
+ * ip6_flush_pending_frames().
+ *
+ * When the write queue is empty, the cork state is set up first: @opt
+ * is deep-copied into np->cork.opt, a reference on @rt is taken and
+ * @fl6, @hlimit, @tclass and the fragment size are recorded.  On later
+ * calls the corked route, flow and options replace the arguments.
+ *
+ * The cork option block is allocated zeroed so that a failure while
+ * duplicating one of its members leaves the remaining member pointers
+ * NULL; ip6_cork_release() kfree()s every member unconditionally.
+ *
+ * Returns 0 on success (and immediately for MSG_PROBE), otherwise a
+ * negative errno (-EINVAL, -ENOBUFS, -EMSGSIZE, -EFAULT, -ENOMEM or an
+ * error reported by a helper).
+ */
+int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
+	int offset, int len, int odd, struct sk_buff *skb),
+	void *from, int length, int transhdrlen,
+	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
+	struct rt6_info *rt, unsigned int flags, int dontfrag)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet_cork *cork;
+	struct sk_buff *skb, *skb_prev = NULL;
+	unsigned int maxfraglen, fragheaderlen;
+	int exthdrlen;
+	int hh_len;
+	int mtu;
+	int copy;
+	int err;
+	int offset = 0;
+	int csummode = CHECKSUM_NONE;
+	__u8 tx_flags = 0;
+
+	if (flags&MSG_PROBE)
+		return 0;
+	cork = &inet->cork.base;
+	if (skb_queue_empty(&sk->sk_write_queue)) {
+		/*
+		 * setup for corking
+		 */
+		if (opt) {
+			if (WARN_ON(np->cork.opt))
+				return -EINVAL;
+
+			/* Zeroed: see the note on error recovery above. */
+			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
+			if (unlikely(np->cork.opt == NULL))
+				return -ENOBUFS;
+
+			np->cork.opt->tot_len = opt->tot_len;
+			np->cork.opt->opt_flen = opt->opt_flen;
+			np->cork.opt->opt_nflen = opt->opt_nflen;
+
+			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
+							    sk->sk_allocation);
+			if (opt->dst0opt && !np->cork.opt->dst0opt)
+				return -ENOBUFS;
+
+			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
+							    sk->sk_allocation);
+			if (opt->dst1opt && !np->cork.opt->dst1opt)
+				return -ENOBUFS;
+
+			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
+							   sk->sk_allocation);
+			if (opt->hopopt && !np->cork.opt->hopopt)
+				return -ENOBUFS;
+
+			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
+							    sk->sk_allocation);
+			if (opt->srcrt && !np->cork.opt->srcrt)
+				return -ENOBUFS;
+
+			/* need source address above miyazawa*/
+		}
+		dst_hold(&rt->dst);
+		cork->dst = &rt->dst;
+		inet->cork.fl.u.ip6 = *fl6;
+		np->cork.hop_limit = hlimit;
+		np->cork.tclass = tclass;
+		if (rt->dst.flags & DST_XFRM_TUNNEL)
+			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
+		else
+			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+		/* A non-zero per-socket frag_size below the mtu wins. */
+		if (np->frag_size < mtu) {
+			if (np->frag_size)
+				mtu = np->frag_size;
+		}
+		cork->fragsize = mtu;
+		if (dst_allfrag(rt->dst.path))
+			cork->flags |= IPCORK_ALLFRAG;
+		cork->length = 0;
+		sk->sk_sndmsg_page = NULL;
+		sk->sk_sndmsg_off = 0;
+		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
+			    rt->rt6i_nfheader_len;
+		length += exthdrlen;
+		transhdrlen += exthdrlen;
+	} else {
+		/* Already corked: the recorded state overrides the args. */
+		rt = (struct rt6_info *)cork->dst;
+		fl6 = &inet->cork.fl.u.ip6;
+		opt = np->cork.opt;
+		transhdrlen = 0;
+		exthdrlen = 0;
+		mtu = cork->fragsize;
+	}
+
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+
+	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
+			(opt ? opt->opt_nflen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
+
+	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
+		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
+			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
+			return -EMSGSIZE;
+		}
+	}
+
+	/* For UDP, check if TX timestamp is enabled */
+	if (sk->sk_type == SOCK_DGRAM) {
+		err = sock_tx_timestamp(sk, &tx_flags);
+		if (err)
+			goto error;
+	}
+
+	/*
+	 * Let's try using as much space as possible.
+	 * Use MTU if total length of the message fits into the MTU.
+	 * Otherwise, we need to reserve fragment header and
+	 * fragment alignment (= 8-15 octets, in total).
+	 *
+	 * Note that we may need to "move" the data from the tail
+	 * of the buffer to the new fragment when we split
+	 * the message.
+	 *
+	 * FIXME: It may be fragmented into multiple chunks
+	 *        at once if non-fragmentable extension headers
+	 *        are too large.
+	 * --yoshfuji
+	 */
+
+	cork->length += length;
+	if (length > mtu) {
+		int proto = sk->sk_protocol;
+		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
+			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
+			return -EMSGSIZE;
+		}
+
+		if (proto == IPPROTO_UDP &&
+		    (rt->dst.dev->features & NETIF_F_UFO)) {
+
+			err = ip6_ufo_append_data(sk, getfrag, from, length,
+						  hh_len, fragheaderlen,
+						  transhdrlen, mtu, flags, rt);
+			if (err)
+				goto error;
+			return 0;
+		}
+	}
+
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+		goto alloc_new_skb;
+
+	while (length > 0) {
+		/* Check if the remaining data fits into current packet. */
+		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+		if (copy < length)
+			copy = maxfraglen - skb->len;
+
+		if (copy <= 0) {
+			char *data;
+			unsigned int datalen;
+			unsigned int fraglen;
+			unsigned int fraggap;
+			unsigned int alloclen;
+alloc_new_skb:
+			/* There's no room in the current skb */
+			if (skb)
+				fraggap = skb->len - maxfraglen;
+			else
+				fraggap = 0;
+			/* update mtu and maxfraglen if necessary */
+			if (skb == NULL || skb_prev == NULL)
+				ip6_append_data_mtu(&mtu, &maxfraglen,
+						    fragheaderlen, skb, rt);
+
+			skb_prev = skb;
+
+			/*
+			 * If remaining data exceeds the mtu,
+			 * we know we need more fragment(s).
+			 */
+			datalen = length + fraggap;
+
+			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+			if ((flags & MSG_MORE) &&
+			    !(rt->dst.dev->features&NETIF_F_SG))
+				alloclen = mtu;
+			else
+				alloclen = datalen + fragheaderlen;
+
+			if (datalen != length + fraggap) {
+				/*
+				 * this is not the last fragment, the trailer
+				 * space is regarded as data space.
+				 */
+				datalen += rt->dst.trailer_len;
+			}
+
+			alloclen += rt->dst.trailer_len;
+			fraglen = datalen + fragheaderlen;
+
+			/*
+			 * We just reserve space for fragment header.
+			 * Note: this may be overallocation if the message
+			 * (without MSG_MORE) fits into the MTU.
+			 */
+			alloclen += sizeof(struct frag_hdr);
+
+			if (transhdrlen) {
+				skb = sock_alloc_send_skb(sk,
+						alloclen + hh_len,
+						(flags & MSG_DONTWAIT), &err);
+			} else {
+				skb = NULL;
+				if (atomic_read(&sk->sk_wmem_alloc) <=
+				    2 * sk->sk_sndbuf)
+					skb = sock_wmalloc(sk,
+							   alloclen + hh_len, 1,
+							   sk->sk_allocation);
+				if (unlikely(skb == NULL))
+					err = -ENOBUFS;
+				else {
+					/* Only the initial fragment
+					 * is time stamped.
+					 */
+					tx_flags = 0;
+				}
+			}
+			if (skb == NULL)
+				goto error;
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = csummode;
+			skb->csum = 0;
+			/* reserve for fragmentation */
+			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
+
+			if (sk->sk_type == SOCK_DGRAM)
+				skb_shinfo(skb)->tx_flags = tx_flags;
+
+			/*
+			 *	Find where to start putting bytes
+			 */
+			data = skb_put(skb, fraglen);
+			skb_set_network_header(skb, exthdrlen);
+			data += fragheaderlen;
+			skb->transport_header = (skb->network_header +
+						 fragheaderlen);
+			if (fraggap) {
+				/* Move the overhang of the previous skb
+				 * into this one and fix both checksums.
+				 */
+				skb->csum = skb_copy_and_csum_bits(
+					skb_prev, maxfraglen,
+					data + transhdrlen, fraggap, 0);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				data += fraggap;
+				pskb_trim_unique(skb_prev, maxfraglen);
+			}
+			copy = datalen - transhdrlen - fraggap;
+			if (copy < 0) {
+				err = -EINVAL;
+				kfree_skb(skb);
+				goto error;
+			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+				err = -EFAULT;
+				kfree_skb(skb);
+				goto error;
+			}
+
+			offset += copy;
+			length -= datalen - fraggap;
+			transhdrlen = 0;
+			exthdrlen = 0;
+			csummode = CHECKSUM_NONE;
+
+			/*
+			 * Put the packet on the pending queue
+			 */
+			__skb_queue_tail(&sk->sk_write_queue, skb);
+			continue;
+		}
+
+		if (copy > length)
+			copy = length;
+
+		if (!(rt->dst.dev->features&NETIF_F_SG)) {
+			/* No scatter/gather: copy into the linear area. */
+			unsigned int off;
+
+			off = skb->len;
+			if (getfrag(from, skb_put(skb, copy),
+						offset, copy, off, skb) < 0) {
+				__skb_trim(skb, off);
+				err = -EFAULT;
+				goto error;
+			}
+		} else {
+			/* Scatter/gather: copy into page fragments. */
+			int i = skb_shinfo(skb)->nr_frags;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
+			struct page *page = sk->sk_sndmsg_page;
+			int off = sk->sk_sndmsg_off;
+			unsigned int left;
+
+			if (page && (left = PAGE_SIZE - off) > 0) {
+				if (copy >= left)
+					copy = left;
+				if (page != frag->page) {
+					if (i == MAX_SKB_FRAGS) {
+						err = -EMSGSIZE;
+						goto error;
+					}
+					get_page(page);
+					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
+					frag = &skb_shinfo(skb)->frags[i];
+				}
+			} else if(i < MAX_SKB_FRAGS) {
+				if (copy > PAGE_SIZE)
+					copy = PAGE_SIZE;
+				page = alloc_pages(sk->sk_allocation, 0);
+				if (page == NULL) {
+					err = -ENOMEM;
+					goto error;
+				}
+				sk->sk_sndmsg_page = page;
+				sk->sk_sndmsg_off = 0;
+
+				skb_fill_page_desc(skb, i, page, 0, 0);
+				frag = &skb_shinfo(skb)->frags[i];
+			} else {
+				err = -EMSGSIZE;
+				goto error;
+			}
+			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+				err = -EFAULT;
+				goto error;
+			}
+			sk->sk_sndmsg_off += copy;
+			frag->size += copy;
+			skb->len += copy;
+			skb->data_len += copy;
+			skb->truesize += copy;
+			atomic_add(copy, &sk->sk_wmem_alloc);
+		}
+		offset += copy;
+		length -= copy;
+	}
+	return 0;
+error:
+	/* Take the bytes that were not queued back off the cork total. */
+	cork->length -= length;
+	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	return err;
+}
+
+/*
+ * Drop all cork state of a socket: free the copied tx options and each
+ * of their member headers, release the corked route (clearing
+ * IPCORK_ALLFRAG with it) and wipe the corked flow.
+ */
+static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
+{
+	struct ipv6_txoptions *opt = np->cork.opt;
+	struct dst_entry *dst = inet->cork.base.dst;
+
+	if (opt != NULL) {
+		kfree(opt->dst0opt);
+		kfree(opt->dst1opt);
+		kfree(opt->hopopt);
+		kfree(opt->srcrt);
+		kfree(opt);
+		np->cork.opt = NULL;
+	}
+
+	if (dst != NULL) {
+		dst_release(dst);
+		inet->cork.base.dst = NULL;
+		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
+	}
+
+	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
+}
+
+/*
+ * ip6_push_pending_frames - send what ip6_append_data() has queued
+ *
+ * Dequeues every skb from sk->sk_write_queue, chains all but the first
+ * onto the first skb's frag_list, pushes the extension headers and the
+ * IPv6 header built from the corked flow/options, and hands the result
+ * to ip6_local_out().  The cork state is released on every path.
+ *
+ * Returns 0 on success or when the queue was empty, otherwise a
+ * negative errno (positive return codes of ip6_local_out() are mapped
+ * through net_xmit_errno()).
+ */
+int ip6_push_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb, *tmp_skb;
+	struct sk_buff **tail_skb;
+	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	struct ipv6hdr *hdr;
+	struct ipv6_txoptions *opt = np->cork.opt;
+	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
+	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
+	unsigned char proto = fl6->flowi6_proto;
+	int err = 0;
+
+	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+		goto out;
+	tail_skb = &(skb_shinfo(skb)->frag_list);
+
+	/* move skb->data to ip header from ext header */
+	if (skb->data < skb_network_header(skb))
+		__skb_pull(skb, skb_network_offset(skb));
+	/* Chain the remaining skbs onto the head skb's frag_list and fold
+	 * their length/truesize into the head; they lose their socket
+	 * ownership (destructor and sk are cleared).
+	 */
+	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		__skb_pull(tmp_skb, skb_network_header_len(skb));
+		*tail_skb = tmp_skb;
+		tail_skb = &(tmp_skb->next);
+		skb->len += tmp_skb->len;
+		skb->data_len += tmp_skb->len;
+		skb->truesize += tmp_skb->truesize;
+		tmp_skb->destructor = NULL;
+		tmp_skb->sk = NULL;
+	}
+
+	/* Allow local fragmentation. */
+	if (np->pmtudisc < IPV6_PMTUDISC_DO)
+		skb->local_df = 1;
+
+	/* final_dst starts as the flow destination; it is passed by
+	 * reference to ipv6_push_nfrag_opts() below, which may update it.
+	 */
+	ipv6_addr_copy(final_dst, &fl6->daddr);
+	__skb_pull(skb, skb_network_header_len(skb));
+	if (opt && opt->opt_flen)
+		ipv6_push_frag_opts(skb, opt, &proto);
+	if (opt && opt->opt_nflen)
+		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	hdr = ipv6_hdr(skb);
+
+	/* First 32-bit word: version 6, corked traffic class, flow label. */
+	*(__be32*)hdr = fl6->flowlabel |
+		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
+
+	hdr->hop_limit = np->cork.hop_limit;
+	hdr->nexthdr = proto;
+	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
+	ipv6_addr_copy(&hdr->daddr, final_dst);
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+
+	skb_dst_set(skb, dst_clone(&rt->dst));
+	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+	if (proto == IPPROTO_ICMPV6) {
+		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+
+		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
+	}
+
+	err = ip6_local_out(skb);
+	if (err) {
+		/* Positive values are translated by net_xmit_errno(); only
+		 * a result that is still non-zero counts as a discard.
+		 */
+		if (err > 0)
+			err = net_xmit_errno(err);
+		if (err)
+			goto error;
+	}
+
+out:
+	ip6_cork_release(inet, np);
+	return err;
+error:
+	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	goto out;
+}
+
+/*
+ * Throw away everything queued by ip6_append_data(): each pending skb
+ * is freed (counted as an output discard when it carries a dst) and
+ * the cork state is released.
+ */
+void ip6_flush_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	for (;;) {
+		skb = __skb_dequeue_tail(&sk->sk_write_queue);
+		if (skb == NULL)
+			break;
+
+		if (skb_dst(skb)) {
+			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
+				      IPSTATS_MIB_OUTDISCARDS);
+		}
+		kfree_skb(skb);
+	}
+
+	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
+}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
new file mode 100644
index 00000000..848e494f
--- /dev/null
+++ b/net/ipv6/ip6_tunnel.c
@@ -0,0 +1,1582 @@
+/*
+ *	IPv6 tunneling device
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Ville Nuorvala		<vnuorval@tcs.hut.fi>
+ *	Yasuyuki Kozakai	<kozakai@linux-ipv6.org>
+ *
+ *      Based on:
+ *      linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c
+ *
+ *      RFC 2473
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/xfrm.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+MODULE_AUTHOR("Ville Nuorvala");
+MODULE_DESCRIPTION("IPv6 tunneling device");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETDEV("ip6tnl0");
+
+/*
+ * Debug tracing: with IP6_TNL_DEBUG defined, IP6_TNL_TRACE() prints the
+ * calling function's name followed by the message at KERN_DEBUG level;
+ * otherwise it expands to an empty statement.
+ */
+#ifdef IP6_TNL_DEBUG
+#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__)
+#else
+#define IP6_TNL_TRACE(x...) do {;} while(0)
+#endif
+
+/* Flow-info bits that are not part of the flow label. */
+#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
+#define IPV6_TCLASS_SHIFT 20
+
+/* Number of buckets in the tunnel hash table; HASH() masks with
+ * (HASH_SIZE - 1).
+ */
+#define HASH_SIZE  32
+
+/* Bucket index of an address: XOR of its four 32-bit words, masked to
+ * the range 0 .. HASH_SIZE - 1.
+ */
+#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
+		     (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
+		    (HASH_SIZE - 1))
+
+/* Defined further down in this file. */
+static int ip6_tnl_dev_init(struct net_device *dev);
+static void ip6_tnl_dev_setup(struct net_device *dev);
+
+/* Key passed to net_generic() to fetch the per-namespace ip6_tnl_net. */
+static int ip6_tnl_net_id __read_mostly;
+/* Per network namespace tunnel state. */
+struct ip6_tnl_net {
+	/* the IPv6 tunnel fallback device */
+	struct net_device *fb_tnl_dev;
+	/* lists for storing tunnels in use */
+	/* buckets indexed by HASH(remote) ^ HASH(local) */
+	struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+	/* single slot returned by ip6_tnl_lookup() when no bucket matches */
+	struct ip6_tnl __rcu *tnls_wc[1];
+	/* indexed by ip6_tnl_bucket(): [0] when both end-points are
+	 * unspecified, [1] otherwise.  NOTE(review): presumably these
+	 * point at tnls_wc and tnls_r_l; the initialisation is not in
+	 * this part of the file - confirm.
+	 */
+	struct ip6_tnl __rcu **tnls[2];
+};
+
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+/* One instance per possible CPU; ip6_get_stats() sums them into
+ * dev->stats.
+ */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+/*
+ * Sum the per-cpu packet/byte counters of @dev over all possible CPUs,
+ * store the totals in dev->stats and return a pointer to it.
+ */
+static struct net_device_stats *ip6_get_stats(struct net_device *dev)
+{
+	unsigned long rx_packets = 0, rx_bytes = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const struct pcpu_tstats *ts = per_cpu_ptr(dev->tstats, cpu);
+
+		rx_packets += ts->rx_packets;
+		rx_bytes   += ts->rx_bytes;
+		tx_packets += ts->tx_packets;
+		tx_bytes   += ts->tx_bytes;
+	}
+
+	dev->stats.rx_packets = rx_packets;
+	dev->stats.rx_bytes   = rx_bytes;
+	dev->stats.tx_packets = tx_packets;
+	dev->stats.tx_bytes   = tx_bytes;
+
+	return &dev->stats;
+}
+
+/*
+ * Locking : hash tables are protected by RCU and RTNL
+ */
+
+/*
+ * Return the tunnel's cached route if it is still usable.  A cached
+ * route marked obsolete that fails its ->check() is dropped from the
+ * cache and released, and NULL is returned.
+ */
+static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
+{
+	struct dst_entry *dst = t->dst_cache;
+
+	if (dst == NULL || !dst->obsolete)
+		return dst;
+
+	if (dst->ops->check(dst, t->dst_cookie) != NULL)
+		return dst;
+
+	t->dst_cache = NULL;
+	dst_release(dst);
+	return NULL;
+}
+
+/* Release the tunnel's cached route, if any, and empty the cache. */
+static inline void ip6_tnl_dst_reset(struct ip6_tnl *t)
+{
+	struct dst_entry *cached = t->dst_cache;
+
+	dst_release(cached);
+	t->dst_cache = NULL;
+}
+
+/*
+ * Make @dst the tunnel's cached route.  The cookie is the serial number
+ * of the route's fib node (0 when it has none); the previously cached
+ * route is released.
+ */
+static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
+{
+	struct rt6_info *rt = (struct rt6_info *) dst;
+
+	if (rt->rt6i_node)
+		t->dst_cookie = rt->rt6i_node->fn_sernum;
+	else
+		t->dst_cookie = 0;
+
+	dst_release(t->dst_cache);
+	t->dst_cache = dst;
+}
+
+/* Walk an RCU-protected tunnel chain; the cursor must be named 't'. */
+#define for_each_ip6_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/**
+ * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ *   @net: network namespace whose tunnel tables are searched
+ *   @remote: the address of the tunnel exit-point
+ *   @local: the address of the tunnel entry-point
+ *
+ * Return:
+ *   tunnel matching given end-points if found and its device is up,
+ *   else fallback tunnel if its device is up,
+ *   else %NULL
+ **/
+
+static struct ip6_tnl *
+ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
+{
+	unsigned int hash = HASH(remote) ^ HASH(local);
+	struct ip6_tnl *t;
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
+		if (!ipv6_addr_equal(local, &t->parms.laddr))
+			continue;
+		if (!ipv6_addr_equal(remote, &t->parms.raddr))
+			continue;
+		if (t->dev->flags & IFF_UP)
+			return t;
+	}
+
+	/* No exact match: fall back to the wildcard slot. */
+	t = rcu_dereference(ip6n->tnls_wc[0]);
+	if (t == NULL || !(t->dev->flags & IFF_UP))
+		return NULL;
+
+	return t;
+}
+
+/**
+ * ip6_tnl_bucket - get head of list matching given tunnel parameters
+ *   @ip6n: per-namespace tunnel tables
+ *   @p: parameters containing tunnel end-points
+ *
+ * Description:
+ *   ip6_tnl_bucket() returns the head of the list matching the
+ *   &struct in6_addr entries laddr and raddr in @p.  When both are
+ *   unspecified, slot 0 of table 0 is used; otherwise table 1 is
+ *   indexed by the hash of the two addresses.
+ *
+ * Return: head of IPv6 tunnel list
+ **/
+
+static struct ip6_tnl __rcu **
+ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p)
+{
+	const struct in6_addr *remote = &p->raddr;
+	const struct in6_addr *local = &p->laddr;
+
+	if (ipv6_addr_any(remote) && ipv6_addr_any(local))
+		return &ip6n->tnls[0][0];
+
+	return &ip6n->tnls[1][HASH(remote) ^ HASH(local)];
+}
+
+/**
+ * ip6_tnl_link - add tunnel to hash table
+ *   @ip6n: per-namespace tunnel tables
+ *   @t: tunnel to be added
+ *
+ * Description:
+ *   Inserts @t at the head of the list selected by its parameters.
+ **/
+
+static void
+ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
+{
+	struct ip6_tnl __rcu **bucket = ip6_tnl_bucket(ip6n, &t->parms);
+	struct ip6_tnl *head = rtnl_dereference(*bucket);
+
+	/* Point the new entry at the old head before publishing it. */
+	rcu_assign_pointer(t->next, head);
+	rcu_assign_pointer(*bucket, t);
+}
+
+/**
+ * ip6_tnl_unlink - remove tunnel from hash table
+ *   @ip6n: per-namespace tunnel tables
+ *   @t: tunnel to be removed
+ *
+ * Description:
+ *   Walks the list selected by @t's parameters and unlinks @t if it
+ *   is found there; otherwise nothing is changed.
+ **/
+
+static void
+ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
+{
+	struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
+	struct ip6_tnl *iter;
+
+	while ((iter = rtnl_dereference(*tp)) != NULL) {
+		if (iter == t) {
+			rcu_assign_pointer(*tp, t->next);
+			return;
+		}
+		tp = &iter->next;
+	}
+}
+
+/* Free a tunnel device: its per-cpu statistics first, then the
+ * net_device itself.
+ */
+static void ip6_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
+/**
+ * ip6_tnl_create() - create a new tunnel
+ *   @net: network namespace the tunnel device is created in
+ *   @p: tunnel parameters
+ *
+ * Description:
+ *   Create tunnel matching given parameters: allocate and register a
+ *   net device (named after @p->name, or "ip6tnl%d" when that is
+ *   empty), take a reference on it and link the tunnel into the hash
+ *   table.
+ *
+ * Return:
+ *   created tunnel or NULL
+ **/
+
+static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
+{
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	struct net_device *dev;
+	struct ip6_tnl *t;
+	char name[IFNAMSIZ];
+	int err;
+
+	if (p->name[0] != '\0')
+		strlcpy(name, p->name, IFNAMSIZ);
+	else
+		sprintf(name, "ip6tnl%%d");
+
+	dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup);
+	if (dev == NULL)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	t = netdev_priv(dev);
+	t->parms = *p;
+
+	err = ip6_tnl_dev_init(dev);
+	if (err >= 0)
+		err = register_netdevice(dev);
+	if (err < 0) {
+		ip6_dev_free(dev);
+		return NULL;
+	}
+
+	/* Registration may have picked the final name (e.g. from "%d"). */
+	strcpy(t->parms.name, dev->name);
+
+	dev_hold(dev);
+	ip6_tnl_link(ip6n, t);
+	return t;
+}
+
+/**
+ * ip6_tnl_locate - find or create tunnel matching given parameters
+ *   @net: network namespace to search / create in
+ *   @p: tunnel parameters
+ *   @create: != 0 if allowed to create new tunnel if no match found
+ *
+ * Description:
+ *   ip6_tnl_locate() first tries to locate an existing tunnel
+ *   based on @p. If this is unsuccessful, but @create is set a new
+ *   tunnel device is created and registered for use.
+ *
+ * Return:
+ *   matching tunnel or NULL
+ **/
+
+static struct ip6_tnl *ip6_tnl_locate(struct net *net,
+		struct ip6_tnl_parm *p, int create)
+{
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	const struct in6_addr *remote = &p->raddr;
+	const struct in6_addr *local = &p->laddr;
+	struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, p);
+	struct ip6_tnl *t;
+
+	while ((t = rtnl_dereference(*tp)) != NULL) {
+		if (ipv6_addr_equal(local, &t->parms.laddr) &&
+		    ipv6_addr_equal(remote, &t->parms.raddr))
+			return t;
+		tp = &t->next;
+	}
+
+	if (create)
+		return ip6_tnl_create(net, p);
+
+	return NULL;
+}
+
+/**
+ * ip6_tnl_dev_uninit - tunnel device uninitializer
+ *   @dev: the device to be destroyed
+ *
+ * Description:
+ *   ip6_tnl_dev_uninit() removes tunnel from its list (or clears the
+ *   wildcard slot for the fallback device), drops the cached route
+ *   and puts a reference on @dev.
+ **/
+
+static void
+ip6_tnl_dev_uninit(struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	struct ip6_tnl *t = netdev_priv(dev);
+
+	if (dev != ip6n->fb_tnl_dev)
+		ip6_tnl_unlink(ip6n, t);
+	else
+		rcu_assign_pointer(ip6n->tnls_wc[0], NULL);
+
+	ip6_tnl_dst_reset(t);
+	dev_put(dev);
+}
+
+/**
+ * parse_tlv_tnl_enc_lim - handle encapsulation limit option
+ *   @skb: received socket buffer
+ *   @raw: start of the IPv6 header to parse, inside @skb's data
+ *
+ * Description:
+ *   Walks the extension header chain that follows the IPv6 header at
+ *   @raw, looking for a tunnel encapsulation limit option inside a
+ *   destination options header.
+ *
+ *   pskb_may_pull() may reallocate the skb head, so neither @raw nor
+ *   any header pointer may be used after a pull; every access is
+ *   therefore made through an offset from skb->data.  Callers must
+ *   likewise re-derive pointers into the packet after this returns.
+ *
+ * Return:
+ *   0 if none was found,
+ *   else index to encapsulation limit, relative to @raw
+ **/
+
+static __u16
+parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
+{
+	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw;
+	/* offset of the IPv6 header from skb->data */
+	unsigned int nhoff = raw - skb->data;
+	/* offset of the current extension header from skb->data */
+	unsigned int off = nhoff + sizeof (*ipv6h);
+	__u8 next, nexthdr = ipv6h->nexthdr;
+
+	while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
+		__u16 optlen = 0;
+		struct ipv6_opt_hdr *hdr;
+
+		if (!pskb_may_pull(skb, off + sizeof (*hdr)))
+			break;
+
+		hdr = (struct ipv6_opt_hdr *) (skb->data + off);
+		if (nexthdr == NEXTHDR_FRAGMENT) {
+			struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
+			if (frag_hdr->frag_off)
+				break;
+			optlen = 8;
+		} else if (nexthdr == NEXTHDR_AUTH) {
+			optlen = (hdr->hdrlen + 2) << 2;
+		} else {
+			optlen = ipv6_optlen(hdr);
+		}
+		/* Cache hdr->nexthdr now: the pull below may invalidate
+		 * hdr.
+		 */
+		next = hdr->nexthdr;
+		if (nexthdr == NEXTHDR_DEST) {
+			/* index of the current TLV inside this header */
+			__u16 i = 2;
+
+			/* The whole header must be in the linear area
+			 * before its TLVs are read.  hdr is no longer
+			 * valid after this point.
+			 */
+			if (!pskb_may_pull(skb, off + optlen))
+				break;
+
+			while (1) {
+				struct ipv6_tlv_tnl_enc_lim *tel;
+
+				/* No more room for encapsulation limit */
+				if (i + sizeof (*tel) > optlen)
+					break;
+
+				tel = (struct ipv6_tlv_tnl_enc_lim *)
+					(skb->data + off + i);
+				/* return index of option if found and valid */
+				if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
+				    tel->length == 1)
+					return i + off - nhoff;
+				/* else jump to next option */
+				if (tel->type)
+					i += tel->length + 2;
+				else
+					i++;
+			}
+		}
+		nexthdr = next;
+		off += optlen;
+	}
+	return 0;
+}
+
+/**
+ * ip6_tnl_err - tunnel error handler
+ *   @skb: ICMPv6 error packet; skb->data is read as the outer IPv6 header
+ *         of the tunnel packet that triggered the error
+ *   @ipproto: inner protocol of the calling handler (IPPROTO_IPIP/IPV6)
+ *   @opt: not used by this function
+ *   @type: in: received ICMPv6 type; out: type of the error to relay
+ *   @code: in: received ICMPv6 code; out: code of the error to relay
+ *   @msg: out: non-zero if an error should be relayed to the inner sender
+ *   @info: in: received ICMPv6 info (host order); out: info to relay
+ *   @offset: bytes preceding the inner packet; subtracted from a reported
+ *            path MTU to obtain the tunnel MTU
+ *
+ * Description:
+ *   ip6_tnl_err() should handle errors in the tunnel according
+ *   to the specifications in RFC 2473.
+ *
+ * Return:
+ *   0 on success (the out parameters have been written),
+ *   %-ENOENT if no matching tunnel exists or its protocol does not match
+ *   @ipproto (the out parameters are left untouched)
+ **/
+
+static int
+ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
+	    u8 *type, u8 *code, int *msg, __u32 *info, int offset)
+{
+	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data;
+	struct ip6_tnl *t;
+	int rel_msg = 0;
+	u8 rel_type = ICMPV6_DEST_UNREACH;
+	u8 rel_code = ICMPV6_ADDR_UNREACH;
+	__u32 rel_info = 0;
+	/* 32 bits: header size plus a 16-bit payload length can exceed 65535 */
+	__u32 len;
+	int err = -ENOENT;
+
+	/* If the packet doesn't contain the original IPv6 header we are
+	   in trouble since we might need the source address for further
+	   processing of the error. */
+
+	rcu_read_lock();
+	/* The header is one we sent, so its daddr is the tunnel's remote. */
+	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr,
+					&ipv6h->saddr)) == NULL)
+		goto out;
+
+	if (t->parms.proto != ipproto && t->parms.proto != 0)
+		goto out;
+
+	err = 0;
+
+	switch (*type) {
+		__u32 teli;
+		struct ipv6_tlv_tnl_enc_lim *tel;
+		__u32 mtu;
+	case ICMPV6_DEST_UNREACH:
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "%s: Path to destination invalid "
+			       "or inactive!\n", t->parms.name);
+		rel_msg = 1;
+		break;
+	case ICMPV6_TIME_EXCEED:
+		if ((*code) == ICMPV6_EXC_HOPLIMIT) {
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "%s: Too small hop limit or "
+				       "routing loop in tunnel!\n",
+				       t->parms.name);
+			rel_msg = 1;
+		}
+		break;
+	case ICMPV6_PARAMPROB:
+		teli = 0;
+		if ((*code) == ICMPV6_HDR_FIELD)
+			teli = parse_tlv_tnl_enc_lim(skb, skb->data);
+
+		/*
+		 * The parser may have pulled data, so index from skb->data
+		 * again rather than reusing ipv6h.  *info - 2 is compared
+		 * with the option's start index, so only errors pointing
+		 * two bytes into the encapsulation limit option match.
+		 */
+		if (teli && teli == *info - 2) {
+			tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
+			if (tel->encap_limit == 0) {
+				if (net_ratelimit())
+					printk(KERN_WARNING
+					       "%s: Too small encapsulation "
+					       "limit or routing loop in "
+					       "tunnel!\n", t->parms.name);
+				rel_msg = 1;
+			}
+		} else if (net_ratelimit()) {
+			printk(KERN_WARNING
+			       "%s: Recipient unable to parse tunneled "
+			       "packet!\n", t->parms.name);
+		}
+		break;
+	case ICMPV6_PKT_TOOBIG:
+		mtu = *info - offset;
+		if (mtu < IPV6_MIN_MTU)
+			mtu = IPV6_MIN_MTU;
+		t->dev->mtu = mtu;
+
+		/* Relay "too big" only if the original packet exceeds it. */
+		len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len);
+		if (len > mtu) {
+			rel_type = ICMPV6_PKT_TOOBIG;
+			rel_code = 0;
+			rel_info = mtu;
+			rel_msg = 1;
+		}
+		break;
+	}
+
+	*type = rel_type;
+	*code = rel_code;
+	*info = rel_info;
+	*msg = rel_msg;
+
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * ICMPv6 error handler for IPv4-in-IPv6 tunnels.
+ *
+ * Lets ip6_tnl_err() decide whether the error must be relayed to the
+ * sender of the inner packet, translates the relayed ICMPv6 type/code
+ * into its ICMPv4 counterpart and sends it with icmp_send() on a clone
+ * of @skb that has been pulled by @offset to the inner IPv4 header.
+ *
+ * Returns the negative error from ip6_tnl_err(), otherwise 0 (also when
+ * nothing is relayed or the relay attempt is abandoned).
+ */
+static int
+ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+	   u8 type, u8 code, int offset, __be32 info)
+{
+	int rel_msg = 0;
+	u8 rel_type = type;
+	u8 rel_code = code;
+	__u32 rel_info = ntohl(info);
+	int err;
+	struct sk_buff *skb2;
+	const struct iphdr *eiph;
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
+			  &rel_msg, &rel_info, offset);
+	if (err < 0)
+		return err;
+
+	if (rel_msg == 0)
+		return 0;
+
+	/* Map the ICMPv6 error to ICMPv4; anything else is not relayed. */
+	switch (rel_type) {
+	case ICMPV6_DEST_UNREACH:
+		if (rel_code != ICMPV6_ADDR_UNREACH)
+			return 0;
+		rel_type = ICMP_DEST_UNREACH;
+		rel_code = ICMP_HOST_UNREACH;
+		break;
+	case ICMPV6_PKT_TOOBIG:
+		if (rel_code != 0)
+			return 0;
+		rel_type = ICMP_DEST_UNREACH;
+		rel_code = ICMP_FRAG_NEEDED;
+		break;
+	default:
+		return 0;
+	}
+
+	/* The inner IPv4 header must be readable before it is used below. */
+	if (!pskb_may_pull(skb, offset + sizeof(struct iphdr)))
+		return 0;
+
+	skb2 = skb_clone(skb, GFP_ATOMIC);
+	if (!skb2)
+		return 0;
+
+	skb_dst_drop(skb2);
+
+	/* Make the clone start at the inner IPv4 packet. */
+	skb_pull(skb2, offset);
+	skb_reset_network_header(skb2);
+	eiph = ip_hdr(skb2);
+
+	/* Try to guess incoming interface */
+	rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
+				   eiph->saddr, 0,
+				   0, 0,
+				   IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
+	if (IS_ERR(rt))
+		goto out;
+
+	skb2->dev = rt->dst.dev;
+
+	/* route "incoming" packet */
+	if (rt->rt_flags & RTCF_LOCAL) {
+		/* Inner source is local: route it as an outgoing packet. */
+		ip_rt_put(rt);
+		rt = NULL;
+		rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
+					   eiph->daddr, eiph->saddr,
+					   0, 0,
+					   IPPROTO_IPIP,
+					   RT_TOS(eiph->tos), 0);
+		/* Give up unless the route leaves through a tunnel device. */
+		if (IS_ERR(rt) ||
+		    rt->dst.dev->type != ARPHRD_TUNNEL) {
+			if (!IS_ERR(rt))
+				ip_rt_put(rt);
+			goto out;
+		}
+		skb_dst_set(skb2, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
+				   skb2->dev) ||
+		    skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
+			goto out;
+	}
+
+	/* change mtu on this route */
+	if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) {
+		/* A reported MTU above the route's current MTU is ignored. */
+		if (rel_info > dst_mtu(skb_dst(skb2)))
+			goto out;
+
+		skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info);
+	}
+
+	icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
+
+out:
+	kfree_skb(skb2);
+	return 0;
+}
+
+/*
+ * ICMPv6 error handler for IPv6-in-IPv6 tunnels.
+ *
+ * Asks ip6_tnl_err() whether the error has to be relayed and, if so,
+ * sends it with icmpv6_send() on a clone of @skb pulled by @offset to
+ * the inner IPv6 header.
+ *
+ * Returns the negative error from ip6_tnl_err(), otherwise 0.
+ */
+static int
+ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+	   u8 type, u8 code, int offset, __be32 info)
+{
+	struct sk_buff *inner;
+	struct rt6_info *route;
+	u8 out_type = type;
+	u8 out_code = code;
+	__u32 out_info = ntohl(info);
+	int relay = 0;
+	int rc;
+
+	rc = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &out_type, &out_code,
+			 &relay, &out_info, offset);
+	if (rc < 0)
+		return rc;
+
+	/* Nothing to relay, or the inner header cannot be made readable. */
+	if (!relay || !pskb_may_pull(skb, offset + sizeof(struct ipv6hdr)))
+		return 0;
+
+	inner = skb_clone(skb, GFP_ATOMIC);
+	if (!inner)
+		return 0;
+
+	/* Let the clone begin at the inner IPv6 packet. */
+	skb_dst_drop(inner);
+	skb_pull(inner, offset);
+	skb_reset_network_header(inner);
+
+	/* Try to guess incoming interface */
+	route = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(inner)->saddr,
+			   NULL, 0, 0);
+	if (route && route->rt6i_dev)
+		inner->dev = route->rt6i_dev;
+
+	icmpv6_send(inner, out_type, out_code, out_info);
+
+	if (route)
+		dst_release(&route->dst);
+	kfree_skb(inner);
+
+	return 0;
+}
+
+/*
+ * Propagate DSCP and ECN information from the outer IPv6 header @ipv6h
+ * to the inner IPv4 header of @skb on decapsulation.
+ *
+ * The DSCP bits are copied only when the tunnel has
+ * IP6_TNL_F_RCV_DSCP_COPY set; the inner ECN bits are preserved by that
+ * copy.  If the outer header carries ECN "congestion experienced", it is
+ * set on the inner header as well.
+ */
+static void ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+					const struct ipv6hdr *ipv6h,
+					struct sk_buff *skb)
+{
+	__u8 outer = ipv6_get_dsfield(ipv6h);
+
+	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
+		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK,
+				    outer & ~INET_ECN_MASK);
+
+	/*
+	 * Test the unmasked outer field: once the ECN bits have been
+	 * stripped the CE check can never succeed.
+	 */
+	if (INET_ECN_is_ce(outer))
+		IP_ECN_set_ce(ip_hdr(skb));
+}
+
+/*
+ * Propagate DSCP and ECN information from the outer IPv6 header @ipv6h
+ * to the inner IPv6 header of @skb on decapsulation.  DSCP is copied only
+ * when the tunnel has IP6_TNL_F_RCV_DSCP_COPY set; an outer "congestion
+ * experienced" mark is always carried over.
+ */
+static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+					const struct ipv6hdr *ipv6h,
+					struct sk_buff *skb)
+{
+	const __u8 outer_ds = ipv6_get_dsfield(ipv6h);
+	struct ipv6hdr *inner = ipv6_hdr(skb);
+
+	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
+		ipv6_copy_dscp(outer_ds, inner);
+
+	if (INET_ECN_is_ce(outer_ds))
+		IP6_ECN_set_ce(inner);
+}
+
+/*
+ * Decide whether tunnel @t may accept a received packet.
+ *
+ * Reception is allowed when IP6_TNL_F_CAP_RCV is set, the local endpoint
+ * is either multicast or an address configured on this node (on the
+ * device given by parms.link when that is set), and the remote endpoint
+ * is not an address of this node.
+ *
+ * Returns 1 if reception is allowed, 0 otherwise.
+ *
+ * called with rcu_read_lock()
+ */
+static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t)
+{
+	struct ip6_tnl_parm *parms = &t->parms;
+	struct net *ns = dev_net(t->dev);
+	struct net_device *link_dev = NULL;
+
+	if (!(parms->flags & IP6_TNL_F_CAP_RCV))
+		return 0;
+
+	if (parms->link)
+		link_dev = dev_get_by_index_rcu(ns, parms->link);
+
+	/* Local endpoint must be multicast or one of our own addresses. */
+	if (!ipv6_addr_is_multicast(&parms->laddr) &&
+	    unlikely(!ipv6_chk_addr(ns, &parms->laddr, link_dev, 0)))
+		return 0;
+
+	/* Remote endpoint must not be one of our own addresses. */
+	if (unlikely(ipv6_chk_addr(ns, &parms->raddr, NULL, 0)))
+		return 0;
+
+	return 1;
+}
+
+/**
+ * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally
+ *   @skb: received socket buffer
+ *   @protocol: ethernet protocol ID
+ *   @ipproto: inner IP protocol number the caller handles; the tunnel must
+ *             be configured for it or for 0 (any)
+ *   @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN
+ *
+ * Return:
+ *   0 if a matching tunnel was found (the skb was either passed to
+ *   netif_rx() or freed),
+ *   1 if no tunnel matched (the skb is not freed here)
+ **/
+
+static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
+		       __u8 ipproto,
+		       void (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+						    const struct ipv6hdr *ipv6h,
+						    struct sk_buff *skb))
+{
+	struct ip6_tnl *t;
+	/* Outer header; still referenced after the network header is reset. */
+	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+	rcu_read_lock();
+
+	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
+					&ipv6h->daddr)) != NULL) {
+		struct pcpu_tstats *tstats;
+
+		if (t->parms.proto != ipproto && t->parms.proto != 0) {
+			rcu_read_unlock();
+			goto discard;
+		}
+
+		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+			rcu_read_unlock();
+			goto discard;
+		}
+
+		if (!ip6_tnl_rcv_ctl(t)) {
+			t->dev->stats.rx_dropped++;
+			rcu_read_unlock();
+			goto discard;
+		}
+		secpath_reset(skb);
+		/* The old network header becomes the MAC header. */
+		skb->mac_header = skb->network_header;
+		skb_reset_network_header(skb);
+		skb->protocol = htons(protocol);
+		skb->pkt_type = PACKET_HOST;
+		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+
+		tstats = this_cpu_ptr(t->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, t->dev);
+
+		dscp_ecn_decapsulate(t, ipv6h, skb);
+
+		netif_rx(skb);
+
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+	return 1;
+
+discard:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Receive handler for IPv4-in-IPv6: ip6_tnl_rcv() with the IPv4 settings. */
+static int ip4ip6_rcv(struct sk_buff *skb)
+{
+	return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP,
+			   ip4ip6_dscp_ecn_decapsulate);
+}
+
+/* Receive handler for IPv6-in-IPv6: ip6_tnl_rcv() with the IPv6 settings. */
+static int ip6ip6_rcv(struct sk_buff *skb)
+{
+	return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6,
+			   ip6ip6_dscp_ecn_decapsulate);
+}
+
+/*
+ * Transmit options carrying a single tunnel encapsulation limit option.
+ * @ops.dst0opt is pointed at @dst_opt by init_tel_txopt(), so the 8-byte
+ * destination options header is stored right behind the options struct.
+ */
+struct ipv6_tel_txoption {
+	struct ipv6_txoptions ops;
+	__u8 dst_opt[8];
+};
+
+/*
+ * Fill @opt with an 8-byte destination options header holding a tunnel
+ * encapsulation limit option of value @encap_limit followed by a PadN
+ * option.  Bytes 0, 1 and 7 of the header are left zero.
+ */
+static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
+{
+	__u8 *tlv = opt->dst_opt;
+
+	memset(opt, 0, sizeof(*opt));
+
+	/* Encapsulation limit option: type, length 1, value. */
+	tlv[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
+	tlv[3] = 1;
+	tlv[4] = encap_limit;
+	/* PadN option with one byte of padding data. */
+	tlv[5] = IPV6_TLV_PADN;
+	tlv[6] = 1;
+
+	opt->ops.dst0opt = (struct ipv6_opt_hdr *) tlv;
+	opt->ops.opt_nflen = 8;
+}
+
+/**
+ * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ *   @t: the outgoing tunnel device
+ *   @hdr: IPv6 header from the incoming packet
+ *
+ * Description:
+ *   Avoid trivial tunneling loop by checking that tunnel exit-point
+ *   doesn't match source of incoming packet, i.e. that the tunnel's
+ *   remote address differs from the source address in @hdr.
+ *
+ * Return:
+ *   1 if conflict,
+ *   0 else
+ **/
+
+static inline int
+ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
+{
+	return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
+}
+
+/*
+ * Decide whether tunnel @t may transmit.
+ *
+ * Transmission is allowed when IP6_TNL_F_CAP_XMIT is set, the local
+ * endpoint is an address configured on this node (on the device given by
+ * parms.link when that is set), and the remote endpoint is multicast or
+ * not an address of this node.  The two address failures are logged.
+ *
+ * Returns 1 if transmission is allowed, 0 otherwise.
+ */
+static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t)
+{
+	struct ip6_tnl_parm *p = &t->parms;
+	int ret = 0;
+	struct net *net = dev_net(t->dev);
+
+	if (p->flags & IP6_TNL_F_CAP_XMIT) {
+		struct net_device *ldev = NULL;
+
+		/* dev_get_by_index_rcu() requires the RCU read lock. */
+		rcu_read_lock();
+		if (p->link)
+			ldev = dev_get_by_index_rcu(net, p->link);
+
+		if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0)))
+			printk(KERN_WARNING
+			       "%s xmit: Local address not yet configured!\n",
+			       p->name);
+		else if (!ipv6_addr_is_multicast(&p->raddr) &&
+			 unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0)))
+			printk(KERN_WARNING
+			       "%s xmit: Routing loop! "
+			       "Remote address found on this node!\n",
+			       p->name);
+		else
+			ret = 1;
+		rcu_read_unlock();
+	}
+	return ret;
+}
+/**
+ * ip6_tnl_xmit2 - encapsulate packet and send
+ *   @skb: the outgoing socket buffer
+ *   @dev: the outgoing tunnel device
+ *   @dsfield: dscp code for outer header
+ *   @fl6: flow of tunneled packet
+ *   @encap_limit: encapsulation limit; a negative value means that no
+ *                 encapsulation limit option is added
+ *   @pmtu: Path MTU is stored if packet is too big
+ *
+ * Description:
+ *   Build new header and do some sanity checks on the packet before sending
+ *   it.
+ *
+ * Return:
+ *   0 once the packet was handed to ip6_local_out() (a send error is only
+ *   accounted in the device statistics)
+ *   -1 fail
+ *   %-EMSGSIZE message too big. return mtu in this case.
+ *   the error from xfrm_lookup() if that lookup fails
+ *   On a non-zero return the skb has not been freed by this function.
+ **/
+
+static int ip6_tnl_xmit2(struct sk_buff *skb,
+			 struct net_device *dev,
+			 __u8 dsfield,
+			 struct flowi6 *fl6,
+			 int encap_limit,
+			 __u32 *pmtu)
+{
+	struct net *net = dev_net(dev);
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct net_device_stats *stats = &t->dev->stats;
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ipv6_tel_txoption opt;
+	struct dst_entry *dst;
+	struct net_device *tdev;
+	int mtu;
+	unsigned int max_headroom = sizeof(struct ipv6hdr);
+	u8 proto;
+	int err = -1;
+	int pkt_len;
+
+	/* Use the tunnel's cached route if there is one, else look it up. */
+	if ((dst = ip6_tnl_dst_check(t)) != NULL)
+		dst_hold(dst);
+	else {
+		dst = ip6_route_output(net, NULL, fl6);
+
+		if (dst->error)
+			goto tx_err_link_failure;
+		dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			dst = NULL;
+			goto tx_err_link_failure;
+		}
+	}
+
+	tdev = dst->dev;
+
+	/* The route leads back into this tunnel device. */
+	if (tdev == dev) {
+		stats->collisions++;
+		if (net_ratelimit())
+			printk(KERN_WARNING
+			       "%s: Local routing loop detected!\n",
+			       t->parms.name);
+		goto tx_err_dst_release;
+	}
+	/*
+	 * Room left for the inner packet: route MTU minus the outer IPv6
+	 * header and, if added, the 8-byte encapsulation limit option.
+	 */
+	mtu = dst_mtu(dst) - sizeof (*ipv6h);
+	if (encap_limit >= 0) {
+		max_headroom += 8;
+		mtu -= 8;
+	}
+	if (mtu < IPV6_MIN_MTU)
+		mtu = IPV6_MIN_MTU;
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+	if (skb->len > mtu) {
+		*pmtu = mtu;
+		err = -EMSGSIZE;
+		goto tx_err_dst_release;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom += LL_RESERVED_SPACE(tdev);
+
+	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+		struct sk_buff *new_skb;
+
+		if (!(new_skb = skb_realloc_headroom(skb, max_headroom)))
+			goto tx_err_dst_release;
+
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		/* From here on only the private copy is used. */
+		kfree_skb(skb);
+		skb = new_skb;
+	}
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst_clone(dst));
+
+	/* The inner packet becomes the transport payload. */
+	skb->transport_header = skb->network_header;
+
+	proto = fl6->flowi6_proto;
+	if (encap_limit >= 0) {
+		init_tel_txopt(&opt, encap_limit);
+		ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
+	}
+	/* Prepend and fill in the outer IPv6 header. */
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	ipv6h = ipv6_hdr(skb);
+	/* First 32-bit word: version 6 plus the flow's flowlabel word. */
+	*(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000);
+	dsfield = INET_ECN_encapsulate(0, dsfield);
+	ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield);
+	ipv6h->hop_limit = t->parms.hop_limit;
+	ipv6h->nexthdr = proto;
+	ipv6_addr_copy(&ipv6h->saddr, &fl6->saddr);
+	ipv6_addr_copy(&ipv6h->daddr, &fl6->daddr);
+	nf_reset(skb);
+	/* Read the length before the skb is handed off. */
+	pkt_len = skb->len;
+	err = ip6_local_out(skb);
+
+	if (net_xmit_eval(err) == 0) {
+		struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);
+
+		tstats->tx_bytes += pkt_len;
+		tstats->tx_packets++;
+	} else {
+		stats->tx_errors++;
+		stats->tx_aborted_errors++;
+	}
+	ip6_tnl_dst_store(t, dst);
+	return 0;
+tx_err_link_failure:
+	stats->tx_carrier_errors++;
+	dst_link_failure(skb);
+tx_err_dst_release:
+	dst_release(dst);
+	return err;
+}
+
+/*
+ * Transmit an IPv4 packet through the tunnel (IPv4-in-IPv6).
+ *
+ * Checks that the tunnel carries IPv4 and may transmit, builds the flow
+ * from the tunnel's template and passes the packet to ip6_tnl_xmit2().
+ * If the packet is too big an ICMPv4 "fragmentation needed" error with
+ * the usable MTU is sent to the originator.
+ *
+ * Returns 0 on success, -1 on failure; the skb is not freed here on
+ * failure.
+ */
+static inline int
+ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip6_tnl *t = netdev_priv(dev);
+	const struct iphdr  *iph = ip_hdr(skb);
+	int encap_limit = -1;
+	struct flowi6 fl6;
+	__u8 dsfield;
+	__u32 mtu;
+	int err;
+
+	if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) ||
+	    !ip6_tnl_xmit_ctl(t))
+		return -1;
+
+	/* encap_limit stays -1 (no option) when the limit is ignored. */
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		encap_limit = t->parms.encap_limit;
+
+	memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
+	fl6.flowi6_proto = IPPROTO_IPIP;
+
+	dsfield = ipv4_get_dsfield(iph);
+
+	/* Carry the inner TOS over as the outer traffic class. */
+	if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
+		fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
+					  & IPV6_TCLASS_MASK;
+
+	err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+	if (err != 0) {
+		/* XXX: send ICMP error even if DF is not set. */
+		/* mtu is only valid when ip6_tnl_xmit2() returned -EMSGSIZE. */
+		if (err == -EMSGSIZE)
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+				  htonl(mtu));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Transmit an IPv6 packet through the tunnel (IPv6-in-IPv6).
+ *
+ * Checks that the tunnel carries IPv6, may transmit and would not loop
+ * the packet straight back to its source.  An encapsulation limit found
+ * in the inner packet is honoured: a limit of 0 is answered with an
+ * ICMPv6 parameter problem, otherwise the limit minus one is used for
+ * the outer header.  Without such an option the tunnel's own limit is
+ * used unless IP6_TNL_F_IGN_ENCAP_LIMIT is set.  The packet is then
+ * passed to ip6_tnl_xmit2(); if it is too big an ICMPv6 "packet too big"
+ * error with the usable MTU is sent to the originator.
+ *
+ * Returns 0 on success, -1 on failure; the skb is not freed here on
+ * failure.
+ */
+static inline int
+ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	int encap_limit = -1;
+	__u16 offset;
+	struct flowi6 fl6;
+	__u8 dsfield;
+	__u32 mtu;
+	int err;
+
+	if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
+	    !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h))
+		return -1;
+
+	offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb));
+	/*
+	 * parse_tlv_tnl_enc_lim() can call pskb_may_pull(), which may
+	 * reallocate the skb head: reload the header pointer before it is
+	 * dereferenced again.
+	 */
+	ipv6h = ipv6_hdr(skb);
+	if (offset > 0) {
+		struct ipv6_tlv_tnl_enc_lim *tel;
+		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
+		if (tel->encap_limit == 0) {
+			/* Point at the limit value, two bytes into the option. */
+			icmpv6_send(skb, ICMPV6_PARAMPROB,
+				    ICMPV6_HDR_FIELD, offset + 2);
+			return -1;
+		}
+		encap_limit = tel->encap_limit - 1;
+	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		encap_limit = t->parms.encap_limit;
+
+	memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
+	fl6.flowi6_proto = IPPROTO_IPV6;
+
+	dsfield = ipv6_get_dsfield(ipv6h);
+	/* Optionally inherit traffic class and flow label from the inner header. */
+	if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
+		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+	if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL))
+		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK);
+
+	err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
+	if (err != 0) {
+		/* mtu is only valid when ip6_tnl_xmit2() returned -EMSGSIZE. */
+		if (err == -EMSGSIZE)
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Transmit entry point of the tunnel device (ndo_start_xmit).
+ *
+ * Dispatches on the packet's protocol to the IPv4 or IPv6 encapsulation
+ * path.  Packets of any other protocol, and packets the chosen path
+ * rejects, are counted as errors/drops and freed.  NETDEV_TX_OK is
+ * returned in every case.
+ */
+static netdev_tx_t
+ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip6_tnl *tunnel = netdev_priv(dev);
+	struct net_device_stats *counters = &tunnel->dev->stats;
+	int rc = -1;	/* unknown protocols take the error path */
+
+	if (skb->protocol == htons(ETH_P_IP))
+		rc = ip4ip6_tnl_xmit(skb, dev);
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		rc = ip6ip6_tnl_xmit(skb, dev);
+
+	if (rc >= 0)
+		return NETDEV_TX_OK;
+
+	counters->tx_errors++;
+	counters->tx_dropped++;
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Recompute the IP6_TNL_F_CAP_XMIT / IP6_TNL_F_CAP_RCV capability flags
+ * of tunnel @t from the types of its endpoint addresses.
+ *
+ * Both flags are cleared first.  They are only reconsidered when both
+ * endpoints are unicast or multicast, neither is loopback, and a link
+ * is configured if either is link-local.  Transmit capability then needs
+ * a unicast local address, receive capability a unicast remote address.
+ */
+static void ip6_tnl_set_cap(struct ip6_tnl *t)
+{
+	struct ip6_tnl_parm *parms = &t->parms;
+	const int local = ipv6_addr_type(&parms->laddr);
+	const int remote = ipv6_addr_type(&parms->raddr);
+	const int either = local | remote;
+	const int usable = IPV6_ADDR_UNICAST | IPV6_ADDR_MULTICAST;
+
+	parms->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV);
+
+	if (!(local & usable) || !(remote & usable))
+		return;
+	if (either & IPV6_ADDR_LOOPBACK)
+		return;
+	if ((either & IPV6_ADDR_LINKLOCAL) && !parms->link)
+		return;
+
+	if (local & IPV6_ADDR_UNICAST)
+		parms->flags |= IP6_TNL_F_CAP_XMIT;
+	if (remote & IPV6_ADDR_UNICAST)
+		parms->flags |= IP6_TNL_F_CAP_RCV;
+}
+
+/*
+ * Derive the device and flow settings of tunnel @t from its parameters.
+ *
+ * Copies the endpoint addresses into the device's hardware and broadcast
+ * address, rebuilds the flow template used for transmission, recomputes
+ * the capability flags and IFF_POINTOPOINT, and - if the tunnel can
+ * transmit and a route to the remote endpoint exists - sizes
+ * hard_header_len and mtu after the underlying device.
+ */
+static void ip6_tnl_link_config(struct ip6_tnl *t)
+{
+	struct net_device *dev = t->dev;
+	struct ip6_tnl_parm *p = &t->parms;
+	struct flowi6 *fl6 = &t->fl.u.ip6;
+
+	memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+	memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
+
+	/* Set up flowi template */
+	ipv6_addr_copy(&fl6->saddr, &p->laddr);
+	ipv6_addr_copy(&fl6->daddr, &p->raddr);
+	fl6->flowi6_oif = p->link;
+	fl6->flowlabel = 0;
+
+	/*
+	 * Take traffic class / flow label from the configured flowinfo
+	 * unless they are to be inherited from the inner packet.
+	 */
+	if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
+		fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
+	if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
+		fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
+
+	ip6_tnl_set_cap(t);
+
+	if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
+		dev->flags |= IFF_POINTOPOINT;
+	else
+		dev->flags &= ~IFF_POINTOPOINT;
+
+	dev->iflink = p->link;
+
+	if (p->flags & IP6_TNL_F_CAP_XMIT) {
+		/* Non-zero for a multicast or link-local remote address. */
+		int strict = (ipv6_addr_type(&p->raddr) &
+			      (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));
+
+		struct rt6_info *rt = rt6_lookup(dev_net(dev),
+						 &p->raddr, &p->laddr,
+						 p->link, strict);
+
+		if (rt == NULL)
+			return;
+
+		if (rt->rt6i_dev) {
+			dev->hard_header_len = rt->rt6i_dev->hard_header_len +
+				sizeof (struct ipv6hdr);
+
+			dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr);
+			/* Leave room for the 8-byte encapsulation limit option. */
+			if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+				dev->mtu-=8;
+
+			if (dev->mtu < IPV6_MIN_MTU)
+				dev->mtu = IPV6_MIN_MTU;
+		}
+		dst_release(&rt->dst);
+	}
+}
+
+/**
+ * ip6_tnl_change - update the tunnel parameters
+ *   @t: tunnel to be changed
+ *   @p: tunnel configuration parameters
+ *
+ * Description:
+ *   ip6_tnl_change() updates the tunnel parameters: it copies the
+ *   endpoint addresses, flags, hop limit, encapsulation limit, flowinfo,
+ *   link and protocol from @p (the name is not touched), resets the
+ *   tunnel's cached route and reconfigures the device.
+ *
+ * Return: 0
+ **/
+
+static int
+ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
+{
+	ipv6_addr_copy(&t->parms.laddr, &p->laddr);
+	ipv6_addr_copy(&t->parms.raddr, &p->raddr);
+	t->parms.flags = p->flags;
+	t->parms.hop_limit = p->hop_limit;
+	t->parms.encap_limit = p->encap_limit;
+	t->parms.flowinfo = p->flowinfo;
+	t->parms.link = p->link;
+	t->parms.proto = p->proto;
+	ip6_tnl_dst_reset(t);
+	ip6_tnl_link_config(t);
+	return 0;
+}
+
+/**
+ * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
+ *   @dev: virtual device associated with tunnel
+ *   @ifr: parameters passed from userspace
+ *   @cmd: command to be performed
+ *
+ * Description:
+ *   ip6_tnl_ioctl() is used for managing IPv6 tunnels
+ *   from userspace.
+ *
+ *   The possible commands are the following:
+ *     %SIOCGETTUNNEL: get tunnel parameters for device
+ *     %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
+ *     %SIOCCHGTUNNEL: change tunnel parameters to those given
+ *     %SIOCDELTUNNEL: delete tunnel
+ *
+ *   The fallback device "ip6tnl0", created during module
+ *   initialization, can be used for creating other tunnel devices.
+ *
+ * Return:
+ *   0 on success,
+ *   %-EFAULT if unable to copy data to or from userspace,
+ *   %-EPERM if current process hasn't %CAP_NET_ADMIN set, or if deletion
+ *   of the fallback device itself is requested through it,
+ *   %-EINVAL if passed tunnel parameters are invalid or @cmd is unknown,
+ *   %-EEXIST if changing a tunnel's parameters would cause a conflict
+ *   %-ENOENT if the tunnel to change or delete could not be found
+ *   %-ENOBUFS if %SIOCADDTUNNEL could not locate or create the tunnel
+ **/
+
+static int
+ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	int err = 0;
+	struct ip6_tnl_parm p;
+	struct ip6_tnl *t = NULL;
+	struct net *net = dev_net(dev);
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		/*
+		 * On the fallback device the caller names the tunnel through
+		 * the passed parameters; otherwise, or if none matches, the
+		 * parameters of @dev itself are reported.
+		 */
+		if (dev == ip6n->fb_tnl_dev) {
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = ip6_tnl_locate(net, &p, 0);
+		}
+		if (t == NULL)
+			t = netdev_priv(dev);
+		memcpy(&p, &t->parms, sizeof (p));
+		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) {
+			err = -EFAULT;
+		}
+		break;
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
+			break;
+		err = -EINVAL;
+		if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
+		    p.proto != 0)
+			break;
+		/* The last argument asks for creation, only for SIOCADDTUNNEL. */
+		t = ip6_tnl_locate(net, &p, cmd == SIOCADDTUNNEL);
+		if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				/* The new parameters already belong to another tunnel. */
+				if (t->dev != dev) {
+					err = -EEXIST;
+					break;
+				}
+			} else
+				t = netdev_priv(dev);
+
+			/*
+			 * Take the tunnel out of the lookup tables while its
+			 * parameters change, then link it back in.
+			 */
+			ip6_tnl_unlink(ip6n, t);
+			synchronize_net();
+			err = ip6_tnl_change(t, &p);
+			ip6_tnl_link(ip6n, t);
+			netdev_state_change(dev);
+		}
+		if (t) {
+			err = 0;
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p)))
+				err = -EFAULT;
+
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+
+		/*
+		 * Through the fallback device another tunnel can be deleted
+		 * by its parameters; the fallback device itself cannot.
+		 */
+		if (dev == ip6n->fb_tnl_dev) {
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
+				break;
+			err = -ENOENT;
+			if ((t = ip6_tnl_locate(net, &p, 0)) == NULL)
+				break;
+			err = -EPERM;
+			if (t->dev == ip6n->fb_tnl_dev)
+				break;
+			dev = t->dev;
+		}
+		err = 0;
+		unregister_netdevice(dev);
+		break;
+	default:
+		err = -EINVAL;
+	}
+	return err;
+}
+
+/**
+ * ip6_tnl_change_mtu - change mtu manually for tunnel device
+ *   @dev: virtual device associated with tunnel
+ *   @new_mtu: the new mtu
+ *
+ * Description:
+ *   Accepts any value of at least %IPV6_MIN_MTU; no upper bound is
+ *   enforced here.
+ *
+ * Return:
+ *   0 on success,
+ *   %-EINVAL if mtu too small
+ **/
+
+static int
+ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= IPV6_MIN_MTU) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+
+/* Device operations installed on every tunnel device by ip6_tnl_dev_setup(). */
+static const struct net_device_ops ip6_tnl_netdev_ops = {
+	.ndo_uninit	= ip6_tnl_dev_uninit,
+	.ndo_start_xmit = ip6_tnl_xmit,
+	.ndo_do_ioctl	= ip6_tnl_ioctl,
+	.ndo_change_mtu = ip6_tnl_change_mtu,
+	.ndo_get_stats	= ip6_get_stats,
+};
+
+
+/**
+ * ip6_tnl_dev_setup - setup virtual tunnel device
+ *   @dev: virtual device associated with tunnel
+ *
+ * Description:
+ *   Initialize function pointers and device parameters: device type,
+ *   default header length and mtu, address length and the device flags
+ *   and features of a tunnel device.
+ **/
+
+static void ip6_tnl_dev_setup(struct net_device *dev)
+{
+	struct ip6_tnl *t;
+
+	dev->netdev_ops = &ip6_tnl_netdev_ops;
+	dev->destructor = ip6_dev_free;
+
+	dev->type = ARPHRD_TUNNEL6;
+	dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
+	dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr);
+	t = netdev_priv(dev);
+	/*
+	 * Leave room for the 8-byte encapsulation limit option.
+	 * NOTE(review): the tunnel parameters are presumably not filled in
+	 * yet when this setup callback runs - confirm against the callers.
+	 */
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+		dev->mtu-=8;
+	dev->flags |= IFF_NOARP;
+	dev->addr_len = sizeof(struct in6_addr);
+	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+}
+
+
+/**
+ * ip6_tnl_dev_init_gen - general initializer for all tunnel devices
+ *   @dev: virtual device associated with tunnel
+ *
+ * Description:
+ *   Links the tunnel's private data back to @dev and allocates the
+ *   per-cpu statistics of the device.
+ *
+ * Return:
+ *   0 on success,
+ *   %-ENOMEM if the per-cpu statistics could not be allocated
+ **/
+
+static inline int
+ip6_tnl_dev_init_gen(struct net_device *dev)
+{
+	struct ip6_tnl *tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+
+	return dev->tstats ? 0 : -ENOMEM;
+}
+
+/**
+ * ip6_tnl_dev_init - initializer for all non fallback tunnel devices
+ *   @dev: virtual device associated with tunnel
+ *
+ * Description:
+ *   Runs the general initializer and, if that succeeds, configures the
+ *   device from the tunnel parameters.
+ *
+ * Return:
+ *   0 on success,
+ *   the error returned by ip6_tnl_dev_init_gen() otherwise
+ **/
+
+static int ip6_tnl_dev_init(struct net_device *dev)
+{
+	int rc = ip6_tnl_dev_init_gen(dev);
+
+	if (!rc)
+		ip6_tnl_link_config(netdev_priv(dev));
+
+	return rc;
+}
+
+/**
+ * ip6_fb_tnl_dev_init - initializer for fallback tunnel device
+ *   @dev: fallback device
+ *
+ * Description:
+ *   Runs the general initializer, sets the tunnel protocol to IPv6,
+ *   takes a reference on @dev and publishes the tunnel as the wildcard
+ *   entry of the namespace.
+ *
+ * Return:
+ *   0 on success,
+ *   the error returned by ip6_tnl_dev_init_gen() otherwise
+ **/
+
+static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
+{
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct net *net = dev_net(dev);
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	int err = ip6_tnl_dev_init_gen(dev);
+
+	if (err)
+		return err;
+
+	t->parms.proto = IPPROTO_IPV6;
+	dev_hold(dev);
+	rcu_assign_pointer(ip6n->tnls_wc[0], t);
+	return 0;
+}
+
+/* Receive and error handlers for IPv4-in-IPv6, registered for AF_INET. */
+static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
+	.handler	= ip4ip6_rcv,
+	.err_handler	= ip4ip6_err,
+	.priority	=	1,
+};
+
+/* Receive and error handlers for IPv6-in-IPv6, registered for AF_INET6. */
+static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
+	.handler	= ip6ip6_rcv,
+	.err_handler	= ip6ip6_err,
+	.priority	=	1,
+};
+
+/*
+ * Unregister every tunnel device of a namespace: all tunnels in the
+ * tnls_r_l hash table first, then the wildcard (fallback) tunnel.  The
+ * devices are queued on a local list and unregistered in one batch.
+ * Uses rtnl_dereference(), so the caller must hold the RTNL lock.
+ */
+static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
+{
+	LIST_HEAD(doomed);
+	struct ip6_tnl *tunnel;
+	int bucket;
+
+	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
+		for (tunnel = rtnl_dereference(ip6n->tnls_r_l[bucket]);
+		     tunnel != NULL;
+		     tunnel = rtnl_dereference(tunnel->next)) {
+			unregister_netdevice_queue(tunnel->dev, &doomed);
+		}
+	}
+
+	tunnel = rtnl_dereference(ip6n->tnls_wc[0]);
+	unregister_netdevice_queue(tunnel->dev, &doomed);
+	unregister_netdevice_many(&doomed);
+}
+
+/*
+ * Per-namespace init hook: sets up the tunnel lookup tables of @net and
+ * creates, initializes and registers its fallback device "ip6tnl0".
+ *
+ * Returns 0 on success, -ENOMEM if the device could not be allocated, or
+ * the error from ip6_fb_tnl_dev_init() / register_netdev().
+ */
+static int __net_init ip6_tnl_init_net(struct net *net)
+{
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	struct ip6_tnl *t = NULL;
+	int err;
+
+	/* Index 0: wildcard table, index 1: hash table of tunnels. */
+	ip6n->tnls[0] = ip6n->tnls_wc;
+	ip6n->tnls[1] = ip6n->tnls_r_l;
+
+	err = -ENOMEM;
+	ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
+				      ip6_tnl_dev_setup);
+
+	if (!ip6n->fb_tnl_dev)
+		goto err_alloc_dev;
+	dev_net_set(ip6n->fb_tnl_dev, net);
+
+	err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+	if (err < 0)
+		goto err_register;
+
+	err = register_netdev(ip6n->fb_tnl_dev);
+	if (err < 0)
+		goto err_register;
+
+	t = netdev_priv(ip6n->fb_tnl_dev);
+
+	/* Record the name the device ended up with after registration. */
+	strcpy(t->parms.name, ip6n->fb_tnl_dev->name);
+	return 0;
+
+err_register:
+	ip6_dev_free(ip6n->fb_tnl_dev);
+err_alloc_dev:
+	return err;
+}
+
+/*
+ * Per-namespace exit hook: unregisters all tunnel devices of @net while
+ * holding the RTNL lock.
+ */
+static void __net_exit ip6_tnl_exit_net(struct net *net)
+{
+	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+
+	rtnl_lock();
+	ip6_tnl_destroy_tunnels(ip6n);
+	rtnl_unlock();
+}
+
+/*
+ * Per-namespace operations: init/exit hooks plus the id and size of the
+ * struct ip6_tnl_net that is kept for every namespace.
+ */
+static struct pernet_operations ip6_tnl_net_ops = {
+	.init = ip6_tnl_init_net,
+	.exit = ip6_tnl_exit_net,
+	.id   = &ip6_tnl_net_id,
+	.size = sizeof(struct ip6_tnl_net),
+};
+
+/**
+ * ip6_tunnel_init - register protocol and reserve needed resources
+ *
+ * Description:
+ *   Registers the per-namespace operations and then the IPv4-in-IPv6 and
+ *   IPv6-in-IPv6 tunnel handlers.  If a step fails, the steps already
+ *   completed are undone in reverse order.
+ *
+ * Return:
+ *   0 on success,
+ *   the negative error of the failing registration otherwise
+ **/
+
+static int __init ip6_tunnel_init(void)
+{
+	int  err;
+
+	err = register_pernet_device(&ip6_tnl_net_ops);
+	if (err < 0)
+		goto out_pernet;
+
+	err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET);
+	if (err < 0) {
+		printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n");
+		goto out_ip4ip6;
+	}
+
+	err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6);
+	if (err < 0) {
+		printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n");
+		goto out_ip6ip6;
+	}
+
+	return 0;
+
+	/* Each label undoes the registration that preceded the failing one. */
+out_ip6ip6:
+	xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
+out_ip4ip6:
+	unregister_pernet_device(&ip6_tnl_net_ops);
+out_pernet:
+	return err;
+}
+
+/**
+ * ip6_tunnel_cleanup - free resources and unregister protocol
+ *
+ * Description:
+ *   Deregisters both tunnel handlers, logging a message for any that
+ *   fails to deregister, and then unregisters the per-namespace
+ *   operations.
+ **/
+
+static void __exit ip6_tunnel_cleanup(void)
+{
+	if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET))
+		printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n");
+
+	if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
+		printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n");
+
+	unregister_pernet_device(&ip6_tnl_net_ops);
+}
+
+/* Module load and unload entry points. */
+module_init(ip6_tunnel_init);
+module_exit(ip6_tunnel_cleanup);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
new file mode 100644
index 00000000..86e3cc10
--- /dev/null
+++ b/net/ipv6/ip6mr.c
@@ -0,0 +1,2278 @@
+/*
+ *	Linux IPv6 multicast routing support for BSD pim6sd
+ *	Based on net/ipv4/ipmr.c.
+ *
+ *	(c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
+ *		LSIIT Laboratory, Strasbourg, France
+ *	(c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
+ *		6WIND, Paris, France
+ *	Copyright (C)2007,2008 USAGI/WIDE Project
+ *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <net/checksum.h>
+#include <net/netlink.h>
+#include <net/fib_rules.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <linux/mroute6.h>
+#include <linux/pim.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ip6_checksum.h>
+
+/*
+ * State of one IPv6 multicast routing table.
+ */
+struct mr6_table {
+	/* Link in net->ipv6.mr6_tables (multiple-tables configuration) */
+	struct list_head	list;
+#ifdef CONFIG_NET_NS
+	/* Owning network namespace, accessed via read_pnet()/write_pnet() */
+	struct net		*net;
+#endif
+	/* Table identifier */
+	u32			id;
+	/* Socket that ip6mr_cache_report() queues messages to */
+	struct sock		*mroute6_sk;
+	/* Timer running ipmr_expire_process() for the unresolved queue */
+	struct timer_list	ipmr_expire_timer;
+	/* Unresolved cache entries, protected by mfc_unres_lock */
+	struct list_head	mfc6_unres_queue;
+	/* Resolved cache entries, bucket chosen by MFC6_HASH() */
+	struct list_head	mfc6_cache_array[MFC6_LINES];
+	/* Multicast interfaces; a slot is in use when its dev is non-NULL */
+	struct mif_device	vif6_table[MAXMIFS];
+	/* One past the highest in-use index of vif6_table */
+	int			maxvif;
+	/* Number of entries currently on mfc6_unres_queue */
+	atomic_t		cache_resolve_queue_len;
+	/* NOTE(review): the two flags below are not used in this part of
+	 * the file; presumably toggled through socket options - confirm.
+	 */
+	int			mroute_do_assert;
+	int			mroute_do_pim;
+#ifdef CONFIG_IPV6_PIMSM_V2
+	/* Index of the PIM register interface, or -1 when there is none */
+	int			mroute_reg_vif_num;
+#endif
+};
+
+/* fib rule type for ip6mr: only the generic fib_rule part, no extra fields */
+struct ip6mr_rule {
+	struct fib_rule		common;
+};
+
+/* Result slot of a rule lookup: the table chosen by ip6mr_rule_action() */
+struct ip6mr_result {
+	struct mr6_table	*mrt;
+};
+
+/* Big lock protecting the vif table, the mrt cache and the mroute socket
+   state.  Note that changes are additionally serialized by rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ *	Multicast router control variables
+ */
+
+/* True when slot _idx of _mrt's vif table has a device attached */
+#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL)
+
+/* Special spinlock for the queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to Alan's original scheme.  The hash table of resolved
+   entries is changed only in process context and is protected by the
+   weak lock mrt_lock.  The queue of unresolved entries is protected
+   by the strong spinlock mfc_unres_lock.
+
+   This way the data path is completely free of exclusive locks.
+ */
+
+/* Slab cache that struct mfc6_cache entries are allocated from */
+static struct kmem_cache *mrt_cachep __read_mostly;
+
+/* Forward declarations for functions used before their definition */
+static struct mr6_table *ip6mr_new_table(struct net *net, u32 id);
+static void ip6mr_free_table(struct mr6_table *mrt);
+
+static int ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+			  struct sk_buff *skb, struct mfc6_cache *cache);
+static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
+			      mifi_t mifi, int assert);
+static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+			       struct mfc6_cache *c, struct rtmsg *rtm);
+static int ip6mr_rtm_dumproute(struct sk_buff *skb,
+			       struct netlink_callback *cb);
+static void mroute_clean_tables(struct mr6_table *mrt);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+/* Iterate over every table on the namespace's mr6_tables list */
+#define ip6mr_for_each_table(mrt, net) \
+	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
+
+/*
+ * Find the table with identifier @id in @net.
+ * Returns the table, or NULL when the namespace has no such table.
+ */
+static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
+{
+	struct mr6_table *tbl;
+	struct mr6_table *match = NULL;
+
+	ip6mr_for_each_table(tbl, net) {
+		if (tbl->id != id)
+			continue;
+		match = tbl;
+		break;
+	}
+	return match;
+}
+
+/*
+ * Select the multicast routing table for flow @flp6 by running the
+ * namespace's ip6mr fib rules.
+ *
+ * On success stores the table in *@mrt and returns 0; otherwise returns
+ * the negative error from fib_rules_lookup() and leaves *@mrt untouched.
+ */
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
+			    struct mr6_table **mrt)
+{
+	struct ip6mr_result result;
+	struct fib_lookup_arg lookup = { .result = &result, };
+	int rc;
+
+	rc = fib_rules_lookup(net->ipv6.mr6_rules_ops,
+			      flowi6_to_flowi(flp6), 0, &lookup);
+	if (rc >= 0) {
+		*mrt = result.mrt;
+		rc = 0;
+	}
+	return rc;
+}
+
+/*
+ * fib rule ->action callback.
+ *
+ * FR_ACT_TO_TBL resolves rule->table in the rule's namespace and stores it
+ * in the lookup result (-EAGAIN when the table does not exist).  The other
+ * actions map to fixed errors: unreachable -> -ENETUNREACH,
+ * prohibit -> -EACCES, blackhole and anything else -> -EINVAL.
+ */
+static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
+			     int flags, struct fib_lookup_arg *arg)
+{
+	struct ip6mr_result *out = arg->result;
+	struct mr6_table *table;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		table = ip6mr_get_table(rule->fr_net, rule->table);
+		if (table == NULL)
+			return -EAGAIN;
+		out->mrt = table;
+		return 0;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+}
+
+/* fib rule ->match callback: every ip6mr rule matches every flow */
+static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
+{
+	return 1;
+}
+
+/* Netlink attribute policy: only the generic fib rule attributes */
+static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
+	FRA_GENERIC_POLICY,
+};
+
+/* fib rule ->configure callback: nothing rule-specific to set up */
+static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+				struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+	return 0;
+}
+
+/* fib rule ->compare callback: always reports 1, no rule-specific fields */
+static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			      struct nlattr **tb)
+{
+	return 1;
+}
+
+/*
+ * fib rule ->fill callback: ip6mr rules carry no addresses or TOS, so the
+ * corresponding header fields are reported as zero.  Always returns 0.
+ */
+static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			   struct fib_rule_hdr *frh)
+{
+	frh->tos = 0;
+	frh->src_len = 0;
+	frh->dst_len = 0;
+
+	return 0;
+}
+
+/*
+ * Template handed to fib_rules_register() by ip6mr_rules_init() for each
+ * network namespace.
+ */
+static const struct fib_rules_ops __net_initdata ip6mr_rules_ops_template = {
+	.family		= RTNL_FAMILY_IP6MR,
+	.rule_size	= sizeof(struct ip6mr_rule),
+	.addr_size	= sizeof(struct in6_addr),
+	.action		= ip6mr_rule_action,
+	.match		= ip6mr_rule_match,
+	.configure	= ip6mr_rule_configure,
+	.compare	= ip6mr_rule_compare,
+	.default_pref	= fib_default_rule_pref,
+	.fill		= ip6mr_rule_fill,
+	.nlgroup	= RTNLGRP_IPV6_RULE,
+	.policy		= ip6mr_rule_policy,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Per-namespace setup for the multiple-tables configuration: register the
+ * ip6mr fib rules ops, create the default table and add the default rule
+ * pointing at it.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up
+ * so far is undone.
+ */
+static int __net_init ip6mr_rules_init(struct net *net)
+{
+	struct fib_rules_ops *ops;
+	struct mr6_table *mrt;
+	int err;
+
+	ops = fib_rules_register(&ip6mr_rules_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	INIT_LIST_HEAD(&net->ipv6.mr6_tables);
+
+	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+	if (mrt == NULL) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0);
+	if (err < 0)
+		goto err2;
+
+	net->ipv6.mr6_rules_ops = ops;
+	return 0;
+
+err2:
+	/*
+	 * ip6mr_new_table() linked the table into net->ipv6.mr6_tables, so
+	 * a bare kfree() would leave a dangling list node.  Unlink it and
+	 * release it the same way ip6mr_rules_exit() does.
+	 */
+	list_del(&mrt->list);
+	ip6mr_free_table(mrt);
+err1:
+	fib_rules_unregister(ops);
+	return err;
+}
+
+/*
+ * Per-namespace teardown for the multiple-tables configuration: unlink and
+ * free every table, then unregister the namespace's fib rules ops.
+ */
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+	struct list_head *tables = &net->ipv6.mr6_tables;
+
+	while (!list_empty(tables)) {
+		struct mr6_table *tbl;
+
+		tbl = list_first_entry(tables, struct mr6_table, list);
+		list_del(&tbl->list);
+		ip6mr_free_table(tbl);
+	}
+
+	fib_rules_unregister(net->ipv6.mr6_rules_ops);
+}
+#else
+/* Single-table variant: the loop body runs at most once, for net->ipv6.mrt6 */
+#define ip6mr_for_each_table(mrt, net) \
+	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
+
+/* Single-table variant: @id is ignored, the namespace's only table is returned */
+static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
+{
+	return net->ipv6.mrt6;
+}
+
+/*
+ * Single-table variant: no rules to consult, every flow uses the
+ * namespace's only table.  Always returns 0.
+ */
+static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
+			    struct mr6_table **mrt)
+{
+	*mrt = net->ipv6.mrt6;
+	return 0;
+}
+
+/*
+ * Single-table variant of the per-namespace setup: create the default
+ * table and remember it in net->ipv6.mrt6.
+ * Returns 0 on success, -ENOMEM when the table could not be created.
+ */
+static int __net_init ip6mr_rules_init(struct net *net)
+{
+	struct mr6_table *tbl = ip6mr_new_table(net, RT6_TABLE_DFLT);
+
+	net->ipv6.mrt6 = tbl;
+	if (tbl == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Single-table variant of the per-namespace teardown: free the only table */
+static void __net_exit ip6mr_rules_exit(struct net *net)
+{
+	ip6mr_free_table(net->ipv6.mrt6);
+}
+#endif
+
+/*
+ * Return the table with identifier @id in @net, creating it if needed.
+ *
+ * A new table starts zeroed, with empty cache buckets and unresolved
+ * queue, an initialised (not armed) expiry timer and no PIM register
+ * interface.  With multiple tables enabled it is appended to the
+ * namespace's table list.
+ *
+ * Returns the table, or NULL when allocation fails.
+ */
+static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
+{
+	struct mr6_table *tbl = ip6mr_get_table(net, id);
+	unsigned int line;
+
+	/* Already present: hand back the existing table */
+	if (tbl)
+		return tbl;
+
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+	if (!tbl)
+		return NULL;
+
+	write_pnet(&tbl->net, net);
+	tbl->id = id;
+
+	INIT_LIST_HEAD(&tbl->mfc6_unres_queue);
+
+	/* Forwarding cache */
+	for (line = 0; line < MFC6_LINES; line++)
+		INIT_LIST_HEAD(&tbl->mfc6_cache_array[line]);
+
+	setup_timer(&tbl->ipmr_expire_timer, ipmr_expire_process,
+		    (unsigned long)tbl);
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+	tbl->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+	/* Publish only once the table is fully initialised */
+	list_add_tail_rcu(&tbl->list, &net->ipv6.mr6_tables);
+#endif
+	return tbl;
+}
+
+/*
+ * Release a multicast routing table: stop its expiry timer, flush it via
+ * mroute_clean_tables() and free the memory.
+ */
+static void ip6mr_free_table(struct mr6_table *mrt)
+{
+	/*
+	 * The table is freed below, so wait for a timer handler that may
+	 * already be running on another CPU instead of merely deactivating
+	 * the timer; otherwise ipmr_expire_process() could touch freed
+	 * memory.
+	 */
+	del_timer_sync(&mrt->ipmr_expire_timer);
+	mroute_clean_tables(mrt);
+	kfree(mrt);
+}
+
+#ifdef CONFIG_PROC_FS
+
+/* Iterator state for the /proc ip6_mr_cache seq_file */
+struct ipmr_mfc_iter {
+	struct seq_net_private p;
+	/* Table being dumped */
+	struct mr6_table *mrt;
+	/* List currently being walked: a hash bucket, the unresolved queue,
+	 * or NULL when no list (and therefore no lock) is held
+	 */
+	struct list_head *cache;
+	/* Index of the current hash bucket */
+	int ct;
+};
+
+
+/*
+ * Position the iterator on the @pos-th cache entry, counting the resolved
+ * hash buckets first and the unresolved queue afterwards.
+ *
+ * Locking: when an entry is returned, the lock guarding its list is still
+ * held (mrt_lock for a hash bucket, mfc_unres_lock for the unresolved
+ * queue) and it->cache records which list that is.  When NULL is returned
+ * no lock is held and it->cache is NULL.
+ */
+static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
+					   struct ipmr_mfc_iter *it, loff_t pos)
+{
+	struct mr6_table *mrt = it->mrt;
+	struct mfc6_cache *mfc;
+
+	read_lock(&mrt_lock);
+	for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) {
+		it->cache = &mrt->mfc6_cache_array[it->ct];
+		list_for_each_entry(mfc, it->cache, list)
+			if (pos-- == 0)
+				return mfc;	/* mrt_lock stays held */
+	}
+	read_unlock(&mrt_lock);
+
+	spin_lock_bh(&mfc_unres_lock);
+	it->cache = &mrt->mfc6_unres_queue;
+	list_for_each_entry(mfc, it->cache, list)
+		if (pos-- == 0)
+			return mfc;	/* mfc_unres_lock stays held */
+	spin_unlock_bh(&mfc_unres_lock);
+
+	it->cache = NULL;
+	return NULL;
+}
+
+/*
+ *	The /proc interfaces to multicast routing:
+ *	/proc/ip6_mr_cache and /proc/ip6_mr_vif
+ */
+
+/* Iterator state for the /proc ip6_mr_vif seq_file */
+struct ipmr_vif_iter {
+	struct seq_net_private p;
+	/* Table being dumped */
+	struct mr6_table *mrt;
+	/* Index into mrt->vif6_table of the current entry */
+	int ct;
+};
+
+/*
+ * Return the @pos-th in-use interface of the iterator's table (unused
+ * slots are skipped), leaving iter->ct at its index.  Returns NULL when
+ * the table has fewer in-use interfaces than that.
+ */
+static struct mif_device *ip6mr_vif_seq_idx(struct net *net,
+					    struct ipmr_vif_iter *iter,
+					    loff_t pos)
+{
+	struct mr6_table *tbl = iter->mrt;
+
+	iter->ct = 0;
+	while (iter->ct < tbl->maxvif) {
+		if (MIF_EXISTS(tbl, iter->ct)) {
+			if (pos == 0)
+				return &tbl->vif6_table[iter->ct];
+			pos--;
+		}
+		++iter->ct;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file ->start for /proc ip6_mr_vif.
+ *
+ * Binds the iterator to the default table and takes mrt_lock for reading.
+ * Position 0 yields the header token, position N the (N-1)-th in-use
+ * interface.  Returns ERR_PTR(-ENOENT), without taking the lock, when the
+ * default table does not exist.
+ */
+static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(mrt_lock)
+{
+	struct net *net = seq_file_net(seq);
+	struct ipmr_vif_iter *iter = seq->private;
+	struct mr6_table *tbl = ip6mr_get_table(net, RT6_TABLE_DFLT);
+
+	if (tbl == NULL)
+		return ERR_PTR(-ENOENT);
+
+	iter->mrt = tbl;
+
+	read_lock(&mrt_lock);
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return ip6mr_vif_seq_idx(net, iter, *pos - 1);
+}
+
+/*
+ * seq_file ->next for /proc ip6_mr_vif: advance *pos and return the next
+ * in-use interface after the current one (the first one when coming from
+ * the header token), or NULL at the end of the table.
+ */
+static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ipmr_vif_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr6_table *tbl = iter->mrt;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip6mr_vif_seq_idx(net, iter, 0);
+
+	for (++iter->ct; iter->ct < tbl->maxvif; ++iter->ct) {
+		if (MIF_EXISTS(tbl, iter->ct))
+			return &tbl->vif6_table[iter->ct];
+	}
+	return NULL;
+}
+
+/* seq_file ->stop for /proc ip6_mr_vif: drop the read lock taken in ->start */
+static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
+	__releases(mrt_lock)
+{
+	read_unlock(&mrt_lock);
+}
+
+/*
+ * seq_file ->show for /proc ip6_mr_vif: print the column header for the
+ * start token, otherwise one line with the interface index, device name
+ * ("none" without a device), traffic counters and flags.
+ * Always returns 0.
+ */
+static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
+{
+	struct ipmr_vif_iter *iter = seq->private;
+	struct mr6_table *tbl = iter->mrt;
+	const struct mif_device *mif;
+	const char *devname;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags\n");
+		return 0;
+	}
+
+	mif = v;
+	if (mif->dev)
+		devname = mif->dev->name;
+	else
+		devname = "none";
+
+	/* The index is the entry's offset within the table's vif array */
+	seq_printf(seq,
+		   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
+		   mif - tbl->vif6_table,
+		   devname, mif->bytes_in, mif->pkt_in,
+		   mif->bytes_out, mif->pkt_out,
+		   mif->flags);
+	return 0;
+}
+
+/* seq_file iterator callbacks for /proc ip6_mr_vif */
+static const struct seq_operations ip6mr_vif_seq_ops = {
+	.start = ip6mr_vif_seq_start,
+	.next  = ip6mr_vif_seq_next,
+	.stop  = ip6mr_vif_seq_stop,
+	.show  = ip6mr_vif_seq_show,
+};
+
+/* ->open for ip6_mr_vif: seq_file with a per-open struct ipmr_vif_iter */
+static int ip6mr_vif_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ip6mr_vif_seq_ops,
+			    sizeof(struct ipmr_vif_iter));
+}
+
+/* File operations of the ip6_mr_vif proc entry */
+static const struct file_operations ip6mr_vif_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip6mr_vif_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * seq_file ->start for /proc ip6_mr_cache.
+ *
+ * Binds the iterator to the default table.  Position 0 yields the header
+ * token (no lock held), position N the (N-1)-th cache entry as located by
+ * ipmr_mfc_seq_idx(), which returns with the matching lock held.
+ * Returns ERR_PTR(-ENOENT) when the default table does not exist.
+ */
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ipmr_mfc_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr6_table *mrt;
+
+	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+	if (mrt == NULL)
+		return ERR_PTR(-ENOENT);
+
+	it->mrt = mrt;
+	/*
+	 * The iterator survives across reads on the same descriptor.  Forget
+	 * any list recorded by an earlier pass, otherwise ->stop would take
+	 * the stale pointer as proof that a lock is held and release a lock
+	 * this pass never acquired.
+	 */
+	it->cache = NULL;
+	it->ct = 0;
+	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
+		: SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next for /proc ip6_mr_cache.
+ *
+ * Advances within the current list, then through the remaining hash
+ * buckets, and finally through the unresolved queue.  On the hand-over
+ * from the buckets to the unresolved queue mrt_lock is released and
+ * mfc_unres_lock is taken.  When the end is reached the unresolved lock
+ * is dropped and it->cache is cleared, so no lock is held on NULL return.
+ */
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct mfc6_cache *mfc = v;
+	struct ipmr_mfc_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr6_table *mrt = it->mrt;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return ipmr_mfc_seq_idx(net, seq->private, 0);
+
+	/* More entries on the list we are currently walking? */
+	if (mfc->list.next != it->cache)
+		return list_entry(mfc->list.next, struct mfc6_cache, list);
+
+	if (it->cache == &mrt->mfc6_unres_queue)
+		goto end_of_list;
+
+	BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]);
+
+	/* Move on to the next non-empty hash bucket */
+	while (++it->ct < MFC6_LINES) {
+		it->cache = &mrt->mfc6_cache_array[it->ct];
+		if (list_empty(it->cache))
+			continue;
+		return list_first_entry(it->cache, struct mfc6_cache, list);
+	}
+
+	/* exhausted cache_array, show unresolved */
+	read_unlock(&mrt_lock);
+	it->cache = &mrt->mfc6_unres_queue;
+	it->ct = 0;
+
+	spin_lock_bh(&mfc_unres_lock);
+	if (!list_empty(it->cache))
+		return list_first_entry(it->cache, struct mfc6_cache, list);
+
+ end_of_list:
+	spin_unlock_bh(&mfc_unres_lock);
+	it->cache = NULL;
+
+	return NULL;
+}
+
+/*
+ * seq_file ->stop for /proc ip6_mr_cache: release whichever lock the
+ * iterator still holds, as recorded in it->cache.
+ *
+ *   it->cache == unresolved queue      -> mfc_unres_lock is held
+ *   it->cache == hash bucket it->ct    -> mrt_lock is held for reading
+ *   anything else (e.g. NULL)          -> no lock is held
+ */
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+	struct ipmr_mfc_iter *it = seq->private;
+	struct mr6_table *mrt = it->mrt;
+
+	if (it->cache == &mrt->mfc6_unres_queue)
+		spin_unlock_bh(&mfc_unres_lock);
+	/*
+	 * Compare against the bucket currently being walked.  Comparing
+	 * against the array itself only matches bucket 0 and would leak
+	 * mrt_lock whenever a read stops inside any other bucket.
+	 */
+	else if (it->cache == &mrt->mfc6_cache_array[it->ct])
+		read_unlock(&mrt_lock);
+}
+
+/*
+ * seq_file ->show for /proc ip6_mr_cache: print the column header for the
+ * start token, otherwise one line per cache entry.  Resolved entries show
+ * their counters and every existing outgoing interface whose TTL threshold
+ * is below 255; unresolved entries show zero counters.  Always returns 0.
+ */
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+	const struct ipmr_mfc_iter *it = seq->private;
+	const struct mfc6_cache *entry = v;
+	struct mr6_table *tbl;
+	int vif;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "Group                            "
+			 "Origin                           "
+			 "Iif      Pkts  Bytes     Wrong  Oifs\n");
+		return 0;
+	}
+
+	tbl = it->mrt;
+
+	seq_printf(seq, "%pI6 %pI6 %-3hd",
+		   &entry->mf6c_mcastgrp, &entry->mf6c_origin,
+		   entry->mf6c_parent);
+
+	if (it->cache == &tbl->mfc6_unres_queue) {
+		/* unresolved mfc_caches don't contain
+		 * pkt, bytes and wrong_if values
+		 */
+		seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
+	} else {
+		seq_printf(seq, " %8lu %8lu %8lu",
+			   entry->mfc_un.res.pkt,
+			   entry->mfc_un.res.bytes,
+			   entry->mfc_un.res.wrong_if);
+
+		for (vif = entry->mfc_un.res.minvif;
+		     vif < entry->mfc_un.res.maxvif; vif++) {
+			if (!MIF_EXISTS(tbl, vif))
+				continue;
+			if (entry->mfc_un.res.ttls[vif] >= 255)
+				continue;
+			seq_printf(seq,
+				   " %2d:%-3d",
+				   vif, entry->mfc_un.res.ttls[vif]);
+		}
+	}
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+/* seq_file iterator callbacks for /proc ip6_mr_cache */
+static const struct seq_operations ipmr_mfc_seq_ops = {
+	.start = ipmr_mfc_seq_start,
+	.next  = ipmr_mfc_seq_next,
+	.stop  = ipmr_mfc_seq_stop,
+	.show  = ipmr_mfc_seq_show,
+};
+
+/* ->open for ip6_mr_cache: seq_file with a per-open struct ipmr_mfc_iter */
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
+			    sizeof(struct ipmr_mfc_iter));
+}
+
+/* File operations of the ip6_mr_cache proc entry */
+static const struct file_operations ip6mr_mfc_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ipmr_mfc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+#endif
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+
+/*
+ * Receive handler for IPPROTO_PIM: decapsulate a PIM register message and
+ * re-inject the inner IPv6 multicast packet as if it had arrived on the
+ * table's PIM register interface.
+ *
+ * The packet is dropped when it is too short, is not a (non-null) PIM
+ * register, fails the checksum, the inner packet is not multicast or has
+ * an inconsistent length, no table can be selected, or the table has no
+ * register interface.  The skb is always consumed; the return value is
+ * always 0.
+ */
+static int pim6_rcv(struct sk_buff *skb)
+{
+	struct pimreghdr *pim;
+	struct ipv6hdr   *encap;
+	struct net_device  *reg_dev = NULL;
+	struct net *net = dev_net(skb->dev);
+	struct mr6_table *mrt;
+	struct flowi6 fl6 = {
+		.flowi6_iif	= skb->dev->ifindex,
+		.flowi6_mark	= skb->mark,
+	};
+	int reg_vif_num;
+
+	/* Need the PIM register header plus the inner IPv6 header */
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
+		goto drop;
+
+	pim = (struct pimreghdr *)skb_transport_header(skb);
+	/*
+	 * Accept the checksum if it is valid either over the PIM header
+	 * alone or over the whole packet; drop only when both fail.
+	 */
+	if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
+	    (pim->flags & PIM_NULL_REGISTER) ||
+	    (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+			     sizeof(*pim), IPPROTO_PIM,
+			     csum_partial((void *)pim, sizeof(*pim), 0)) &&
+	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
+		goto drop;
+
+	/* check if the inner packet is destined to mcast group */
+	encap = (struct ipv6hdr *)(skb_transport_header(skb) +
+				   sizeof(*pim));
+
+	if (!ipv6_addr_is_multicast(&encap->daddr) ||
+	    encap->payload_len == 0 ||
+	    ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
+		goto drop;
+
+	if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+		goto drop;
+	reg_vif_num = mrt->mroute_reg_vif_num;
+
+	/* Take a reference on the register device under mrt_lock */
+	read_lock(&mrt_lock);
+	if (reg_vif_num >= 0)
+		reg_dev = mrt->vif6_table[reg_vif_num].dev;
+	if (reg_dev)
+		dev_hold(reg_dev);
+	read_unlock(&mrt_lock);
+
+	if (reg_dev == NULL)
+		goto drop;
+
+	/* Strip everything in front of the inner IPv6 header */
+	skb->mac_header = skb->network_header;
+	skb_pull(skb, (u8 *)encap - skb->data);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IPV6);
+	skb->ip_summed = CHECKSUM_NONE;
+	skb->pkt_type = PACKET_HOST;
+
+	skb_tunnel_rx(skb, reg_dev);
+
+	netif_rx(skb);
+
+	dev_put(reg_dev);
+	return 0;
+ drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* IPv6 protocol handler registered for IPPROTO_PIM by ip6_mr_init() */
+static const struct inet6_protocol pim6_protocol = {
+	.handler	=	pim6_rcv,
+};
+
+/* Service routines creating virtual interfaces: PIMREG */
+
+/*
+ * Transmit hook of the PIM register device.  Nothing goes onto a wire:
+ * the packet is accounted in the device statistics and reported whole to
+ * the multicast routing socket (MRT6MSG_WHOLEPKT).
+ *
+ * The skb is always freed.  Returns NETDEV_TX_OK, or the negative error
+ * of the table lookup when no table could be selected.
+ */
+static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct mr6_table *mrt;
+	struct flowi6 fl6 = {
+		.flowi6_oif	= dev->ifindex,
+		.flowi6_iif	= skb->skb_iif,
+		.flowi6_mark	= skb->mark,
+	};
+	int rc;
+
+	rc = ip6mr_fib_lookup(net, &fl6, &mrt);
+	if (rc >= 0) {
+		read_lock(&mrt_lock);
+		dev->stats.tx_bytes += skb->len;
+		dev->stats.tx_packets++;
+		ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num,
+				   MRT6MSG_WHOLEPKT);
+		read_unlock(&mrt_lock);
+		rc = NETDEV_TX_OK;
+	}
+
+	kfree_skb(skb);
+	return rc;
+}
+
+/* Device operations of the PIM register device: only transmit is provided */
+static const struct net_device_ops reg_vif_netdev_ops = {
+	.ndo_start_xmit	= reg_vif_xmit,
+};
+
+/*
+ * alloc_netdev() setup callback for the PIM register device: a NOARP
+ * device of type ARPHRD_PIMREG that is local to its network namespace and
+ * is released through free_netdev().  The MTU leaves room for an IPv6
+ * header plus 8 further bytes within 1500.
+ */
+static void reg_vif_setup(struct net_device *dev)
+{
+	dev->netdev_ops		= &reg_vif_netdev_ops;
+	dev->destructor		= free_netdev;
+
+	dev->type		= ARPHRD_PIMREG;
+	dev->flags		= IFF_NOARP;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->mtu		= 1500 - sizeof(struct ipv6hdr) - 8;
+}
+
+/*
+ * Create, register and bring up the PIM register device of @mrt in @net.
+ *
+ * The device is named "pim6reg" for the default table and "pim6reg<id>"
+ * for any other table.  On success the device is returned with an extra
+ * reference held (dev_hold); the caller owns that reference.
+ *
+ * Returns NULL when the name does not fit into IFNAMSIZ, or when
+ * allocation, registration or opening the device fails.
+ */
+static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
+{
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+	int len;
+
+	/*
+	 * "pim6reg" plus up to ten digits of a u32 id does not fit into
+	 * IFNAMSIZ bytes, so format with a bound and refuse ids whose name
+	 * would be truncated rather than overflowing the stack buffer.
+	 */
+	if (mrt->id == RT6_TABLE_DFLT)
+		len = snprintf(name, sizeof(name), "pim6reg");
+	else
+		len = snprintf(name, sizeof(name), "pim6reg%u", mrt->id);
+	if (len < 0 || (size_t)len >= sizeof(name))
+		return NULL;
+
+	dev = alloc_netdev(0, name, reg_vif_setup);
+	if (dev == NULL)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	if (register_netdevice(dev)) {
+		free_netdev(dev);
+		return NULL;
+	}
+	dev->iflink = 0;
+
+	if (dev_open(dev))
+		goto failure;
+
+	dev_hold(dev);
+	return dev;
+
+failure:
+	/* allow the register to be completed before unregistering. */
+	rtnl_unlock();
+	rtnl_lock();
+
+	unregister_netdevice(dev);
+	return NULL;
+}
+#endif
+
+/*
+ *	Delete a VIF entry
+ *
+ *	Detaches the device from slot @vifi of @mrt under mrt_lock, shrinks
+ *	maxvif when the highest slot was removed, and then undoes what
+ *	mif6_add() did to the device (allmulti, mc_forwarding count, device
+ *	reference).  A register device is additionally queued on @head for
+ *	unregistration.
+ *
+ *	Returns 0, or -EADDRNOTAVAIL when @vifi is out of range or the slot
+ *	is not in use.
+ */
+
+static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
+{
+	struct mif_device *v;
+	struct net_device *dev;
+	struct inet6_dev *in6_dev;
+
+	if (vifi < 0 || vifi >= mrt->maxvif)
+		return -EADDRNOTAVAIL;
+
+	v = &mrt->vif6_table[vifi];
+
+	write_lock_bh(&mrt_lock);
+	dev = v->dev;
+	v->dev = NULL;
+
+	if (!dev) {
+		write_unlock_bh(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	}
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+	if (vifi == mrt->mroute_reg_vif_num)
+		mrt->mroute_reg_vif_num = -1;
+#endif
+
+	/* Removed the top slot: maxvif drops to just above the next used one */
+	if (vifi + 1 == mrt->maxvif) {
+		int tmp;
+		for (tmp = vifi - 1; tmp >= 0; tmp--) {
+			if (MIF_EXISTS(mrt, tmp))
+				break;
+		}
+		mrt->maxvif = tmp + 1;
+	}
+
+	write_unlock_bh(&mrt_lock);
+
+	dev_set_allmulti(dev, -1);
+
+	in6_dev = __in6_dev_get(dev);
+	if (in6_dev)
+		in6_dev->cnf.mc_forwarding--;
+
+	if (v->flags & MIFF_REGISTER)
+		unregister_netdevice_queue(dev, head);
+
+	/* Drop the reference held by the vif table slot */
+	dev_put(dev);
+	return 0;
+}
+
+/* Return a cache entry to the mrt_cachep slab cache */
+static inline void ip6mr_cache_free(struct mfc6_cache *c)
+{
+	kmem_cache_free(mrt_cachep, c);
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+   and reporting error to netlink readers.
+ */
+
+static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c)
+{
+	struct net *net = read_pnet(&mrt->net);
+	struct sk_buff *skb;
+
+	atomic_dec(&mrt->cache_resolve_queue_len);
+
+	while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) {
+		if (ipv6_hdr(skb)->version == 0) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
+			nlh->nlmsg_type = NLMSG_ERROR;
+			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+			skb_trim(skb, nlh->nlmsg_len);
+			((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+		} else
+			kfree_skb(skb);
+	}
+
+	ip6mr_cache_free(c);
+}
+
+
+/* Timer process for all the unresolved queue. */
+
+static void ipmr_do_expire_process(struct mr6_table *mrt)
+{
+	unsigned long now = jiffies;
+	unsigned long expires = 10 * HZ;
+	struct mfc6_cache *c, *next;
+
+	list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) {
+		if (time_after(c->mfc_un.unres.expires, now)) {
+			/* not yet... */
+			unsigned long interval = c->mfc_un.unres.expires - now;
+			if (interval < expires)
+				expires = interval;
+			continue;
+		}
+
+		list_del(&c->list);
+		ip6mr_destroy_unres(mrt, c);
+	}
+
+	if (!list_empty(&mrt->mfc6_unres_queue))
+		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
+}
+
+/*
+ * Timer callback of a table's expiry timer (@arg is the table).  Processes
+ * the unresolved queue under mfc_unres_lock; when the lock is contended
+ * the work is retried one jiffy later instead of spinning.
+ */
+static void ipmr_expire_process(unsigned long arg)
+{
+	struct mr6_table *tbl = (struct mr6_table *)arg;
+
+	if (spin_trylock(&mfc_unres_lock)) {
+		if (!list_empty(&tbl->mfc6_unres_queue))
+			ipmr_do_expire_process(tbl);
+
+		spin_unlock(&mfc_unres_lock);
+	} else {
+		mod_timer(&tbl->ipmr_expire_timer, jiffies + 1);
+	}
+}
+
+/* Fill the outgoing-interface list of a cache entry from @ttls.  Called
+   with mrt_lock write-locked.
+
+   All thresholds are reset to 255; every existing interface with a @ttls
+   value strictly between 0 and 255 gets that value, and minvif/maxvif are
+   set to bracket those interfaces.
+ */
+
+static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache,
+				    unsigned char *ttls)
+{
+	int idx;
+
+	memset(cache->mfc_un.res.ttls, 255, MAXMIFS);
+	cache->mfc_un.res.maxvif = 0;
+	cache->mfc_un.res.minvif = MAXMIFS;
+
+	for (idx = 0; idx < mrt->maxvif; idx++) {
+		if (!MIF_EXISTS(mrt, idx))
+			continue;
+		if (ttls[idx] == 0 || ttls[idx] >= 255)
+			continue;
+
+		cache->mfc_un.res.ttls[idx] = ttls[idx];
+		if (idx < cache->mfc_un.res.minvif)
+			cache->mfc_un.res.minvif = idx;
+		if (idx >= cache->mfc_un.res.maxvif)
+			cache->mfc_un.res.maxvif = idx + 1;
+	}
+}
+
+/*
+ * Add a multicast interface described by @vifc to @mrt.
+ *
+ * For MIFF_REGISTER (PIM) a register device is created; otherwise the
+ * device with index vifc->mif6c_pifi is looked up.  The device is put into
+ * allmulti mode, its mc_forwarding count is raised, and the slot is
+ * published under mrt_lock.  When @mrtsock is zero the interface is
+ * additionally flagged VIFF_STATIC.
+ *
+ * Returns 0 on success; -EADDRINUSE when the slot (or the register
+ * interface) already exists, -ENOBUFS / -EADDRNOTAVAIL when no device
+ * could be obtained, -EINVAL for unsupported flags, or the error from
+ * dev_set_allmulti().
+ *
+ * NOTE(review): vifc->mif6c_mifi indexes vif6_table without a range check
+ * here; presumably callers validate it against MAXMIFS - confirm.
+ */
+static int mif6_add(struct net *net, struct mr6_table *mrt,
+		    struct mif6ctl *vifc, int mrtsock)
+{
+	int vifi = vifc->mif6c_mifi;
+	struct mif_device *v = &mrt->vif6_table[vifi];
+	struct net_device *dev;
+	struct inet6_dev *in6_dev;
+	int err;
+
+	/* Is vif busy ? */
+	if (MIF_EXISTS(mrt, vifi))
+		return -EADDRINUSE;
+
+	switch (vifc->mif6c_flags) {
+#ifdef CONFIG_IPV6_PIMSM_V2
+	case MIFF_REGISTER:
+		/*
+		 * Special Purpose VIF in PIM
+		 * All the packets will be sent to the daemon
+		 */
+		if (mrt->mroute_reg_vif_num >= 0)
+			return -EADDRINUSE;
+		dev = ip6mr_reg_vif(net, mrt);
+		if (!dev)
+			return -ENOBUFS;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			unregister_netdevice(dev);
+			dev_put(dev);
+			return err;
+		}
+		break;
+#endif
+	case 0:
+		dev = dev_get_by_index(net, vifc->mif6c_pifi);
+		if (!dev)
+			return -EADDRNOTAVAIL;
+		err = dev_set_allmulti(dev, 1);
+		if (err) {
+			dev_put(dev);
+			return err;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	in6_dev = __in6_dev_get(dev);
+	if (in6_dev)
+		in6_dev->cnf.mc_forwarding++;
+
+	/*
+	 *	Fill in the VIF structures
+	 */
+	v->rate_limit = vifc->vifc_rate_limit;
+	v->flags = vifc->mif6c_flags;
+	if (!mrtsock)
+		v->flags |= VIFF_STATIC;
+	v->threshold = vifc->vifc_threshold;
+	v->bytes_in = 0;
+	v->bytes_out = 0;
+	v->pkt_in = 0;
+	v->pkt_out = 0;
+	v->link = dev->ifindex;
+	if (v->flags & MIFF_REGISTER)
+		v->link = dev->iflink;
+
+	/* And finish update writing critical data */
+	write_lock_bh(&mrt_lock);
+	/* Setting v->dev is what makes the slot visible (see MIF_EXISTS) */
+	v->dev = dev;
+#ifdef CONFIG_IPV6_PIMSM_V2
+	if (v->flags & MIFF_REGISTER)
+		mrt->mroute_reg_vif_num = vifi;
+#endif
+	if (vifi + 1 > mrt->maxvif)
+		mrt->maxvif = vifi + 1;
+	write_unlock_bh(&mrt_lock);
+	return 0;
+}
+
+/*
+ * Look up the resolved cache entry for (@origin, @mcastgrp) in @mrt.
+ * Returns the entry, or NULL when the hash bucket holds no match.
+ */
+static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
+					   const struct in6_addr *origin,
+					   const struct in6_addr *mcastgrp)
+{
+	struct list_head *bucket;
+	struct mfc6_cache *entry;
+
+	bucket = &mrt->mfc6_cache_array[MFC6_HASH(mcastgrp, origin)];
+
+	list_for_each_entry(entry, bucket, list) {
+		if (!ipv6_addr_equal(&entry->mf6c_origin, origin))
+			continue;
+		if (ipv6_addr_equal(&entry->mf6c_mcastgrp, mcastgrp))
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ *	Allocate a zeroed multicast cache entry (GFP_KERNEL) with minvif
+ *	preset to MAXMIFS.  Returns NULL when allocation fails.
+ */
+static struct mfc6_cache *ip6mr_cache_alloc(void)
+{
+	struct mfc6_cache *entry;
+
+	entry = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
+	if (entry != NULL)
+		entry->mfc_un.res.minvif = MAXMIFS;
+
+	return entry;
+}
+
+/*
+ * Allocate a zeroed unresolved cache entry (GFP_ATOMIC) with an empty skb
+ * queue and an expiry deadline 10 seconds from now.
+ * Returns NULL when allocation fails.
+ */
+static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
+{
+	struct mfc6_cache *entry;
+
+	entry = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
+	if (entry != NULL) {
+		skb_queue_head_init(&entry->mfc_un.unres.unresolved);
+		entry->mfc_un.unres.expires = jiffies + 10 * HZ;
+	}
+
+	return entry;
+}
+
+/*
+ *	A cache entry has gone into a resolved state from queued.
+ *
+ *	Everything queued on the unresolved entry @uc is replayed against
+ *	the resolved entry @c: data packets are forwarded, pending netlink
+ *	requests (IPv6 version field of 0) are answered with the route or,
+ *	when it cannot be filled in, with an -EMSGSIZE error message.
+ */
+
+static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
+				struct mfc6_cache *uc, struct mfc6_cache *c)
+{
+	struct sk_buff *skb;
+
+	/*
+	 *	Play the pending entries through our router
+	 */
+
+	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved)) != NULL) {
+		struct nlmsghdr *nlh;
+
+		if (ipv6_hdr(skb)->version != 0) {
+			ip6_mr_forward(net, mrt, skb, c);
+			continue;
+		}
+
+		nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
+
+		if (__ip6mr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
+			nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
+		} else {
+			struct nlmsgerr *errmsg;
+
+			nlh->nlmsg_type = NLMSG_ERROR;
+			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+			skb_trim(skb, nlh->nlmsg_len);
+			errmsg = (struct nlmsgerr *)NLMSG_DATA(nlh);
+			errmsg->error = -EMSGSIZE;
+		}
+		rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
+	}
+}
+
+/*
+ *	Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd
+ *	expects the following bizarre scheme.
+ *
+ *	Called under mrt_lock.
+ *
+ *	For MRT6MSG_WHOLEPKT the message is a copy of @pkt with a struct
+ *	mrt6msg pushed in front; for every other @assert value it is a copy
+ *	of @pkt's IPv6 header followed by a struct mrt6msg.  The message is
+ *	queued on the table's mroute6_sk socket.
+ *
+ *	Returns the result of sock_queue_rcv_skb(), -ENOBUFS when no skb
+ *	could be allocated, or -EINVAL when the table has no mroute socket.
+ *	@pkt itself is never consumed.
+ */
+
+static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
+			      mifi_t mifi, int assert)
+{
+	struct sk_buff *skb;
+	struct mrt6msg *msg;
+	int ret;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+	if (assert == MRT6MSG_WHOLEPKT)
+		skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
+						+sizeof(*msg));
+	else
+#endif
+		skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);
+
+	if (!skb)
+		return -ENOBUFS;
+
+	/* I suppose that internal messages
+	 * do not require checksums */
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+	if (assert == MRT6MSG_WHOLEPKT) {
+		/* Ugly, but we have no choice with this interface.
+		   Duplicate old header, fix length etc.
+		   And all this only to mangle msg->im6_msgtype and
+		   to set msg->im6_mbz to "mbz" :-)
+		 */
+		skb_push(skb, -skb_network_offset(pkt));
+
+		skb_push(skb, sizeof(*msg));
+		skb_reset_transport_header(skb);
+		msg = (struct mrt6msg *)skb_transport_header(skb);
+		msg->im6_mbz = 0;
+		msg->im6_msgtype = MRT6MSG_WHOLEPKT;
+		/* The register interface is reported, not the @mifi argument */
+		msg->im6_mif = mrt->mroute_reg_vif_num;
+		msg->im6_pad = 0;
+		ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
+		ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else
+#endif
+	{
+	/*
+	 *	Copy the IP header
+	 */
+
+	skb_put(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));
+
+	/*
+	 *	Add our header
+	 */
+	skb_put(skb, sizeof(*msg));
+	skb_reset_transport_header(skb);
+	msg = (struct mrt6msg *)skb_transport_header(skb);
+
+	msg->im6_mbz = 0;
+	msg->im6_msgtype = assert;
+	msg->im6_mif = mifi;
+	msg->im6_pad = 0;
+	ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
+	ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);
+
+	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+
+	/* No routing daemon socket attached to this table */
+	if (mrt->mroute6_sk == NULL) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Deliver to user space multicast routing algorithms
+	 */
+	ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb);
+	if (ret < 0) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n");
+		kfree_skb(skb);
+	}
+
+	return ret;
+}
+
+/*
+ *	Queue a packet for resolution. It gets locked cache entry!
+ *
+ *	Looks for an unresolved entry matching the packet's source and
+ *	destination; if there is none, a new one is created (at most 10 per
+ *	table) and the first packet is reported to the daemon with
+ *	MRT6MSG_NOCACHE.  The packet is then appended to the entry's queue
+ *	unless more than 3 packets are already queued there.
+ *
+ *	The skb is always consumed: queued on success, freed on failure.
+ *	Returns 0, -ENOBUFS, or the error from ip6mr_cache_report().
+ */
+
+static int
+ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb)
+{
+	bool found = false;
+	int err;
+	struct mfc6_cache *c;
+
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry(c, &mrt->mfc6_unres_queue, list) {
+		if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
+		    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		/*
+		 *	Create a new entry if allowable
+		 */
+
+		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+		    (c = ip6mr_cache_alloc_unres()) == NULL) {
+			spin_unlock_bh(&mfc_unres_lock);
+
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+
+		/*
+		 *	Fill in the new cache entry
+		 */
+		c->mf6c_parent = -1;
+		c->mf6c_origin = ipv6_hdr(skb)->saddr;
+		c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;
+
+		/*
+		 *	Reflect first query at pim6sd
+		 */
+		err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE);
+		if (err < 0) {
+			/* If the report failed throw the cache entry
+			   out - Brad Parker
+			 */
+			spin_unlock_bh(&mfc_unres_lock);
+
+			ip6mr_cache_free(c);
+			kfree_skb(skb);
+			return err;
+		}
+
+		atomic_inc(&mrt->cache_resolve_queue_len);
+		list_add(&c->list, &mrt->mfc6_unres_queue);
+
+		/* Expires stale entries and (re)arms the expiry timer */
+		ipmr_do_expire_process(mrt);
+	}
+
+	/*
+	 *	See if we can append the packet
+	 */
+	if (c->mfc_un.unres.unresolved.qlen > 3) {
+		kfree_skb(skb);
+		err = -ENOBUFS;
+	} else {
+		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+		err = 0;
+	}
+
+	spin_unlock_bh(&mfc_unres_lock);
+	return err;
+}
+
+/*
+ *	MFC6 cache manipulation by user space
+ */
+
+/*
+ * Remove the resolved cache entry matching the origin and group in @mfc
+ * from @mrt.  The entry is unlinked under mrt_lock and then freed.
+ * Returns 0 when an entry was removed, -ENOENT when none matched.
+ */
+static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc)
+{
+	struct mfc6_cache *entry, *tmp;
+	int bucket;
+
+	bucket = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
+
+	list_for_each_entry_safe(entry, tmp, &mrt->mfc6_cache_array[bucket], list) {
+		if (!ipv6_addr_equal(&entry->mf6c_origin, &mfc->mf6cc_origin.sin6_addr))
+			continue;
+		if (!ipv6_addr_equal(&entry->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr))
+			continue;
+
+		write_lock_bh(&mrt_lock);
+		list_del(&entry->list);
+		write_unlock_bh(&mrt_lock);
+
+		ip6mr_cache_free(entry);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Netdevice notifier: when a device is unregistered, delete every
+ * multicast interface that refers to it in all tables of the device's
+ * namespace, then unregister whatever devices mif6_delete() queued.
+ * All other events are ignored.  Always returns NOTIFY_DONE.
+ */
+static int ip6mr_device_event(struct notifier_block *this,
+			      unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+	struct mr6_table *tbl;
+	int idx;
+	LIST_HEAD(pending);
+
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	ip6mr_for_each_table(tbl, net) {
+		/* maxvif is re-read every pass: mif6_delete() may lower it */
+		for (idx = 0; idx < tbl->maxvif; idx++) {
+			if (tbl->vif6_table[idx].dev != dev)
+				continue;
+			mif6_delete(tbl, idx, &pending);
+		}
+	}
+	unregister_netdevice_many(&pending);
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier block registered by ip6_mr_init() */
+static struct notifier_block ip6_mr_notifier = {
+	.notifier_call = ip6mr_device_event
+};
+
+/*
+ *	Setup for IP multicast routing
+ */
+
+/*
+ * Per-namespace initialisation: set up the tables/rules and, with procfs
+ * enabled, create the ip6_mr_vif and ip6_mr_cache proc entries.
+ *
+ * Returns 0 on success, the error from ip6mr_rules_init(), or -ENOMEM
+ * when a proc entry cannot be created (earlier steps are undone).
+ */
+static int __net_init ip6mr_net_init(struct net *net)
+{
+	int rc;
+
+	rc = ip6mr_rules_init(net);
+	if (rc < 0)
+		return rc;
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_net_fops_create(net, "ip6_mr_vif", 0, &ip6mr_vif_fops)) {
+		ip6mr_rules_exit(net);
+		return -ENOMEM;
+	}
+
+	if (!proc_net_fops_create(net, "ip6_mr_cache", 0, &ip6mr_mfc_fops)) {
+		proc_net_remove(net, "ip6_mr_vif");
+		ip6mr_rules_exit(net);
+		return -ENOMEM;
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Per-namespace teardown: remove the proc entries (when procfs is
+ * enabled) and then release the tables/rules.
+ */
+static void __net_exit ip6mr_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	proc_net_remove(net, "ip6_mr_cache");
+	proc_net_remove(net, "ip6_mr_vif");
+#endif
+	ip6mr_rules_exit(net);
+}
+
+/* Per-namespace init/exit hooks, registered by ip6_mr_init(). */
+static struct pernet_operations ip6mr_net_ops = {
+	.init = ip6mr_net_init,
+	.exit = ip6mr_net_exit,
+};
+
+/*
+ *	One-time initialisation: create the mfc6_cache slab, register the
+ *	per-namespace operations and the netdevice notifier, add the PIM
+ *	protocol handler (CONFIG_IPV6_PIMSM_V2 only) and register the
+ *	RTM_GETROUTE dump handler.  Returns 0 on success or a negative
+ *	errno; on failure every step already taken is rolled back.
+ */
+int __init ip6_mr_init(void)
+{
+	int err;
+
+	mrt_cachep = kmem_cache_create("ip6_mrt_cache",
+				       sizeof(struct mfc6_cache),
+				       0, SLAB_HWCACHE_ALIGN,
+				       NULL);
+	if (mrt_cachep == NULL)
+		return -ENOMEM;
+
+	err = register_pernet_subsys(&ip6mr_net_ops);
+	if (err)
+		goto destroy_cache;
+
+	err = register_netdevice_notifier(&ip6_mr_notifier);
+	if (err)
+		goto unreg_pernet;
+#ifdef CONFIG_IPV6_PIMSM_V2
+	if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) {
+		printk(KERN_ERR "ip6_mr_init: can't add PIM protocol\n");
+		err = -EAGAIN;
+		goto unreg_notifier;
+	}
+#endif
+	rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, ip6mr_rtm_dumproute);
+	return 0;
+
+	/* Unwind in reverse order of setup. */
+#ifdef CONFIG_IPV6_PIMSM_V2
+unreg_notifier:
+	unregister_netdevice_notifier(&ip6_mr_notifier);
+#endif
+unreg_pernet:
+	unregister_pernet_subsys(&ip6mr_net_ops);
+destroy_cache:
+	kmem_cache_destroy(mrt_cachep);
+	return err;
+}
+
+void ip6_mr_cleanup(void)
+{
+	unregister_netdevice_notifier(&ip6_mr_notifier);
+	unregister_pernet_subsys(&ip6mr_net_ops);
+	kmem_cache_destroy(mrt_cachep);
+}
+
+/*
+ *	Add or update the cache entry for the (origin, group) pair in @mfc.
+ *
+ *	@mrtsock is non-zero when the request comes from the socket that
+ *	owns the table; otherwise the entry is flagged MFC_STATIC.
+ *
+ *	Returns 0 on success, -ENFILE if the parent mif index is out of
+ *	range, -EINVAL if a new entry is requested for a non-multicast
+ *	group, or -ENOMEM if the entry cannot be allocated.
+ */
+static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
+			 struct mf6cctl *mfc, int mrtsock)
+{
+	bool found = false;
+	int line;
+	struct mfc6_cache *uc, *c;
+	unsigned char ttls[MAXMIFS];
+	int i;
+
+	if (mfc->mf6cc_parent >= MAXMIFS)
+		return -ENFILE;
+
+	/* Threshold 255 by default, 1 for every mif set in mf6cc_ifset. */
+	memset(ttls, 255, MAXMIFS);
+	for (i = 0; i < MAXMIFS; i++) {
+		if (IF_ISSET(i, &mfc->mf6cc_ifset))
+			ttls[i] = 1;
+
+	}
+
+	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
+
+	list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) {
+		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
+		    ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) {
+			found = true;
+			break;
+		}
+	}
+
+	/* Existing entry: update it in place under the writer lock. */
+	if (found) {
+		write_lock_bh(&mrt_lock);
+		c->mf6c_parent = mfc->mf6cc_parent;
+		ip6mr_update_thresholds(mrt, c, ttls);
+		if (!mrtsock)
+			c->mfc_flags |= MFC_STATIC;
+		write_unlock_bh(&mrt_lock);
+		return 0;
+	}
+
+	if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
+		return -EINVAL;
+
+	c = ip6mr_cache_alloc();
+	if (c == NULL)
+		return -ENOMEM;
+
+	/* The new entry is fully set up before it is linked into the table. */
+	c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
+	c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
+	c->mf6c_parent = mfc->mf6cc_parent;
+	ip6mr_update_thresholds(mrt, c, ttls);
+	if (!mrtsock)
+		c->mfc_flags |= MFC_STATIC;
+
+	write_lock_bh(&mrt_lock);
+	list_add(&c->list, &mrt->mfc6_cache_array[line]);
+	write_unlock_bh(&mrt_lock);
+
+	/*
+	 *	Check to see if we resolved a queued list. If so we
+	 *	need to send on the frames and tidy up.
+	 */
+	found = false;
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) {
+		if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
+		    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
+			list_del(&uc->list);
+			atomic_dec(&mrt->cache_resolve_queue_len);
+			found = true;
+			break;
+		}
+	}
+	/* Nothing left unresolved: the expire timer is no longer needed. */
+	if (list_empty(&mrt->mfc6_unres_queue))
+		del_timer(&mrt->ipmr_expire_timer);
+	spin_unlock_bh(&mfc_unres_lock);
+
+	/* uc is only meaningful when found is set (it was unlinked above). */
+	if (found) {
+		ip6mr_cache_resolve(net, mrt, uc, c);
+		ip6mr_cache_free(uc);
+	}
+	return 0;
+}
+
+/*
+ *	Close the multicast socket, and clear the vif tables etc
+ */
+
+/*
+ *	Drop the dynamic state of @mrt: every mif not flagged VIFF_STATIC,
+ *	every cache entry not flagged MFC_STATIC and, if any are pending,
+ *	all entries on the unresolved queue.
+ */
+static void mroute_clean_tables(struct mr6_table *mrt)
+{
+	struct mfc6_cache *entry, *tmp;
+	LIST_HEAD(dying);
+	int idx;
+
+	/*
+	 *	Shut down all active vif entries
+	 */
+	for (idx = 0; idx < mrt->maxvif; idx++) {
+		if (mrt->vif6_table[idx].flags & VIFF_STATIC)
+			continue;
+		mif6_delete(mrt, idx, &dying);
+	}
+	unregister_netdevice_many(&dying);
+
+	/*
+	 *	Wipe the cache
+	 */
+	for (idx = 0; idx < MFC6_LINES; idx++) {
+		list_for_each_entry_safe(entry, tmp, &mrt->mfc6_cache_array[idx], list) {
+			if (!(entry->mfc_flags & MFC_STATIC)) {
+				write_lock_bh(&mrt_lock);
+				list_del(&entry->list);
+				write_unlock_bh(&mrt_lock);
+
+				ip6mr_cache_free(entry);
+			}
+		}
+	}
+
+	/* Nothing queued as unresolved: done. */
+	if (atomic_read(&mrt->cache_resolve_queue_len) == 0)
+		return;
+
+	spin_lock_bh(&mfc_unres_lock);
+	list_for_each_entry_safe(entry, tmp, &mrt->mfc6_unres_queue, list) {
+		list_del(&entry->list);
+		ip6mr_destroy_unres(mrt, entry);
+	}
+	spin_unlock_bh(&mfc_unres_lock);
+}
+
+/*
+ *	Make @sk the owning socket of @mrt and bump the namespace-wide
+ *	mc_forwarding counter.  Returns 0 on success or -EADDRINUSE when
+ *	the table already has an owning socket.
+ */
+static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	int err;
+
+	rtnl_lock();
+	write_lock_bh(&mrt_lock);
+	if (unlikely(mrt->mroute6_sk != NULL)) {
+		err = -EADDRINUSE;
+	} else {
+		mrt->mroute6_sk = sk;
+		net->ipv6.devconf_all->mc_forwarding++;
+		err = 0;
+	}
+	write_unlock_bh(&mrt_lock);
+	rtnl_unlock();
+
+	return err;
+}
+
+/*
+ *	Detach @sk from the table it owns, if any: clear the owner, drop
+ *	the mc_forwarding counter and clean the table.  Returns 0 when a
+ *	table owned by @sk was found, -EACCES otherwise.
+ */
+int ip6mr_sk_done(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	struct mr6_table *mrt;
+	int err = -EACCES;
+
+	rtnl_lock();
+	ip6mr_for_each_table(mrt, net) {
+		if (sk != mrt->mroute6_sk)
+			continue;
+
+		write_lock_bh(&mrt_lock);
+		mrt->mroute6_sk = NULL;
+		net->ipv6.devconf_all->mc_forwarding--;
+		write_unlock_bh(&mrt_lock);
+
+		mroute_clean_tables(mrt);
+		err = 0;
+		break;
+	}
+	rtnl_unlock();
+
+	return err;
+}
+
+/*
+ *	Return the socket owning the multicast routing table selected for
+ *	@skb (lookup keyed on its input interface, device ifindex and
+ *	mark), or NULL when the table lookup fails.
+ */
+struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+{
+	struct flowi6 fl6 = {
+		.flowi6_iif	= skb->skb_iif,
+		.flowi6_oif	= skb->dev->ifindex,
+		.flowi6_mark	= skb->mark,
+	};
+	struct mr6_table *mrt;
+
+	return ip6mr_fib_lookup(net, &fl6, &mrt) < 0 ? NULL : mrt->mroute6_sk;
+}
+
+/*
+ *	Socket options and virtual interface manipulation. The whole
+ *	virtual interface system is a complete heap, but unfortunately
+ *	that's how BSD mrouted happens to think. Maybe one day with a proper
+ *	MOSPF/PIM router set up we can clean this up.
+ */
+
+/*
+ *	setsockopt() entry point for the MRT6_* options.
+ *
+ *	The table is the one selected on the socket (ip6mr_table), or
+ *	RT6_TABLE_DFLT when none is set.  Every option except MRT6_INIT
+ *	requires that @sk owns the table or that the caller has
+ *	CAP_NET_ADMIN.
+ *
+ *	Returns 0 on success or a negative errno: -ENOENT (no such table),
+ *	-EACCES (not permitted), -EINVAL (@optlen too small), -EFAULT (bad
+ *	user pointer), -ENOPROTOOPT (unknown option), or the error of the
+ *	helper that implements the option.
+ */
+int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+{
+	int ret;
+	struct mif6ctl vif;
+	struct mf6cctl mfc;
+	mifi_t mifi;
+	struct net *net = sock_net(sk);
+	struct mr6_table *mrt;
+
+	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	if (optname != MRT6_INIT) {
+		if (sk != mrt->mroute6_sk && !capable(CAP_NET_ADMIN))
+			return -EACCES;
+	}
+
+	switch (optname) {
+	case MRT6_INIT:
+		if (sk->sk_type != SOCK_RAW ||
+		    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+			return -EOPNOTSUPP;
+		if (optlen < sizeof(int))
+			return -EINVAL;
+
+		return ip6mr_sk_init(mrt, sk);
+
+	case MRT6_DONE:
+		return ip6mr_sk_done(sk);
+
+	case MRT6_ADD_MIF:
+		if (optlen < sizeof(vif))
+			return -EINVAL;
+		if (copy_from_user(&vif, optval, sizeof(vif)))
+			return -EFAULT;
+		if (vif.mif6c_mifi >= MAXMIFS)
+			return -ENFILE;
+		rtnl_lock();
+		ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk);
+		rtnl_unlock();
+		return ret;
+
+	case MRT6_DEL_MIF:
+		if (optlen < sizeof(mifi_t))
+			return -EINVAL;
+		if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
+			return -EFAULT;
+		rtnl_lock();
+		ret = mif6_delete(mrt, mifi, NULL);
+		rtnl_unlock();
+		return ret;
+
+	/*
+	 *	Manipulate the forwarding caches. These live
+	 *	in a sort of kernel/user symbiosis.
+	 */
+	case MRT6_ADD_MFC:
+	case MRT6_DEL_MFC:
+		if (optlen < sizeof(mfc))
+			return -EINVAL;
+		if (copy_from_user(&mfc, optval, sizeof(mfc)))
+			return -EFAULT;
+		rtnl_lock();
+		if (optname == MRT6_DEL_MFC)
+			ret = ip6mr_mfc_delete(mrt, &mfc);
+		else
+			ret = ip6mr_mfc_add(net, mrt, &mfc, sk == mrt->mroute6_sk);
+		rtnl_unlock();
+		return ret;
+
+	/*
+	 *	Control PIM assert (to activate pim will activate assert)
+	 */
+	case MRT6_ASSERT:
+	{
+		int v;
+
+		/* Do not read more from user space than the caller supplied. */
+		if (optlen < sizeof(v))
+			return -EINVAL;
+		if (get_user(v, (int __user *)optval))
+			return -EFAULT;
+		mrt->mroute_do_assert = !!v;
+		return 0;
+	}
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+	case MRT6_PIM:
+	{
+		int v;
+
+		/* Do not read more from user space than the caller supplied. */
+		if (optlen < sizeof(v))
+			return -EINVAL;
+		if (get_user(v, (int __user *)optval))
+			return -EFAULT;
+		v = !!v;
+		rtnl_lock();
+		ret = 0;
+		if (v != mrt->mroute_do_pim) {
+			mrt->mroute_do_pim = v;
+			mrt->mroute_do_assert = v;
+		}
+		rtnl_unlock();
+		return ret;
+	}
+
+#endif
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+	case MRT6_TABLE:
+	{
+		u32 v;
+
+		if (optlen != sizeof(u32))
+			return -EINVAL;
+		if (get_user(v, (u32 __user *)optval))
+			return -EFAULT;
+		if (sk == mrt->mroute6_sk)
+			return -EBUSY;
+
+		rtnl_lock();
+		ret = 0;
+		/*
+		 * Only switch the socket to the new table once it exists;
+		 * otherwise later calls on this socket would look up a
+		 * table id that was never created.
+		 */
+		if (!ip6mr_new_table(net, v))
+			ret = -ENOMEM;
+		else
+			raw6_sk(sk)->ip6mr_table = v;
+		rtnl_unlock();
+		return ret;
+	}
+#endif
+	/*
+	 *	Spurious command, or MRT6_VERSION which you cannot
+	 *	set.
+	 */
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ *	getsockopt() support for the multicast routing system.
+ */
+
+/*
+ *	Read back MRT6_VERSION, MRT6_ASSERT or (CONFIG_IPV6_PIMSM_V2 only)
+ *	MRT6_PIM as an int.  At most sizeof(int) bytes are copied out and
+ *	the length actually used is written back through @optlen.
+ *
+ *	Returns 0, -ENOENT (no table), -ENOPROTOOPT (unknown option),
+ *	-EINVAL (negative length) or -EFAULT (bad user pointer).
+ */
+int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
+			  int __user *optlen)
+{
+	struct net *net = sock_net(sk);
+	struct mr6_table *mrt;
+	int value;
+	int len;
+
+	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+	if (!mrt)
+		return -ENOENT;
+
+	switch (optname) {
+	case MRT6_VERSION:
+		value = 0x0305;
+		break;
+#ifdef CONFIG_IPV6_PIMSM_V2
+	case MRT6_PIM:
+		value = mrt->mroute_do_pim;
+		break;
+#endif
+	case MRT6_ASSERT:
+		value = mrt->mroute_do_assert;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	/* Clamp to one int; a negative user-supplied length is rejected. */
+	len = min_t(int, len, sizeof(int));
+	if (len < 0)
+		return -EINVAL;
+
+	if (put_user(len, optlen) || copy_to_user(optval, &value, len))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ *	The IP multicast ioctl support routines.
+ */
+
+/*
+ *	ioctl handler for the multicast routing counters.
+ *
+ *	SIOCGETMIFCNT_IN6 fills a struct sioc_mif_req6 with the packet and
+ *	byte counters of the mif named by its mifi field.
+ *	SIOCGETSGCNT_IN6 fills a struct sioc_sg_req6 with the packet, byte
+ *	and wrong-interface counters of the cache entry for (src, grp).
+ *
+ *	Returns 0 on success, -ENOENT (no table), -EFAULT (bad user
+ *	pointer), -EINVAL (mif index beyond maxvif), -EADDRNOTAVAIL (no
+ *	such mif or cache entry) or -ENOIOCTLCMD (unknown command).
+ */
+int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+	struct sioc_sg_req6 sr;
+	struct sioc_mif_req6 vr;
+	struct mif_device *vif;
+	struct mfc6_cache *c;
+	struct net *net = sock_net(sk);
+	struct mr6_table *mrt;
+
+	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	switch (cmd) {
+	case SIOCGETMIFCNT_IN6:
+		if (copy_from_user(&vr, arg, sizeof(vr)))
+			return -EFAULT;
+		if (vr.mifi >= mrt->maxvif)
+			return -EINVAL;
+		read_lock(&mrt_lock);
+		vif = &mrt->vif6_table[vr.mifi];
+		if (MIF_EXISTS(mrt, vr.mifi)) {
+			vr.icount = vif->pkt_in;
+			vr.ocount = vif->pkt_out;
+			vr.ibytes = vif->bytes_in;
+			vr.obytes = vif->bytes_out;
+			/* Counters are snapshotted; copy out after dropping the lock. */
+			read_unlock(&mrt_lock);
+
+			if (copy_to_user(arg, &vr, sizeof(vr)))
+				return -EFAULT;
+			return 0;
+		}
+		read_unlock(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	case SIOCGETSGCNT_IN6:
+		if (copy_from_user(&sr, arg, sizeof(sr)))
+			return -EFAULT;
+
+		read_lock(&mrt_lock);
+		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+		if (c) {
+			sr.pktcnt = c->mfc_un.res.pkt;
+			sr.bytecnt = c->mfc_un.res.bytes;
+			sr.wrong_if = c->mfc_un.res.wrong_if;
+			/* Counters are snapshotted; copy out after dropping the lock. */
+			read_unlock(&mrt_lock);
+
+			if (copy_to_user(arg, &sr, sizeof(sr)))
+				return -EFAULT;
+			return 0;
+		}
+		read_unlock(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat-layout argument of SIOCGETSGCNT_IN6: same field names as used on
+ * struct sioc_sg_req6 in ip6mr_ioctl(), with compat_ulong_t counters.
+ */
+struct compat_sioc_sg_req6 {
+	struct sockaddr_in6 src;
+	struct sockaddr_in6 grp;
+	compat_ulong_t pktcnt;
+	compat_ulong_t bytecnt;
+	compat_ulong_t wrong_if;
+};
+
+/*
+ * Compat-layout argument of SIOCGETMIFCNT_IN6: same field names as used on
+ * struct sioc_mif_req6 in ip6mr_ioctl(), with compat_ulong_t counters.
+ */
+struct compat_sioc_mif_req6 {
+	mifi_t	mifi;
+	compat_ulong_t icount;
+	compat_ulong_t ocount;
+	compat_ulong_t ibytes;
+	compat_ulong_t obytes;
+};
+
+/*
+ *	Compat counterpart of ip6mr_ioctl(): same commands and the same
+ *	return values, but the user buffers use the compat_sioc_*_req6
+ *	layouts defined above.
+ */
+int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+	struct compat_sioc_sg_req6 sr;
+	struct compat_sioc_mif_req6 vr;
+	struct mif_device *vif;
+	struct mfc6_cache *c;
+	struct net *net = sock_net(sk);
+	struct mr6_table *mrt;
+
+	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	switch (cmd) {
+	case SIOCGETMIFCNT_IN6:
+		if (copy_from_user(&vr, arg, sizeof(vr)))
+			return -EFAULT;
+		if (vr.mifi >= mrt->maxvif)
+			return -EINVAL;
+		read_lock(&mrt_lock);
+		vif = &mrt->vif6_table[vr.mifi];
+		if (MIF_EXISTS(mrt, vr.mifi)) {
+			vr.icount = vif->pkt_in;
+			vr.ocount = vif->pkt_out;
+			vr.ibytes = vif->bytes_in;
+			vr.obytes = vif->bytes_out;
+			/* Counters are snapshotted; copy out after dropping the lock. */
+			read_unlock(&mrt_lock);
+
+			if (copy_to_user(arg, &vr, sizeof(vr)))
+				return -EFAULT;
+			return 0;
+		}
+		read_unlock(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	case SIOCGETSGCNT_IN6:
+		if (copy_from_user(&sr, arg, sizeof(sr)))
+			return -EFAULT;
+
+		read_lock(&mrt_lock);
+		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+		if (c) {
+			sr.pktcnt = c->mfc_un.res.pkt;
+			sr.bytecnt = c->mfc_un.res.bytes;
+			sr.wrong_if = c->mfc_un.res.wrong_if;
+			/* Counters are snapshotted; copy out after dropping the lock. */
+			read_unlock(&mrt_lock);
+
+			if (copy_to_user(arg, &sr, sizeof(sr)))
+				return -EFAULT;
+			return 0;
+		}
+		read_unlock(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+/*
+ *	Final step of forwarding, run after the netfilter FORWARD hook:
+ *	account the datagram as forwarded and pass it to dst_output().
+ */
+static inline int ip6mr_forward2_finish(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst),
+			 IPSTATS_MIB_OUTFORWDATAGRAMS);
+	return dst_output(skb);
+}
+
+/*
+ *	Processing handlers for ip6mr_forward
+ */
+
+/*
+ *	Send @skb out through mif @vifi of @mrt.
+ *
+ *	The skb is always consumed: it is either handed to the netfilter
+ *	FORWARD hook (and from there to ip6mr_forward2_finish()) or freed.
+ *	For a MIFF_REGISTER mif (CONFIG_IPV6_PIMSM_V2) the packet is
+ *	reported to user space as MRT6MSG_WHOLEPKT instead of being sent.
+ *
+ *	Returns the NF_HOOK() result when the packet is forwarded, 0 when
+ *	it is dropped or reported.
+ */
+static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
+			  struct sk_buff *skb, struct mfc6_cache *c, int vifi)
+{
+	struct ipv6hdr *ipv6h;
+	struct mif_device *vif = &mrt->vif6_table[vifi];
+	struct net_device *dev;
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+
+	if (vif->dev == NULL)
+		goto out_free;
+
+#ifdef CONFIG_IPV6_PIMSM_V2
+	if (vif->flags & MIFF_REGISTER) {
+		vif->pkt_out++;
+		vif->bytes_out += skb->len;
+		vif->dev->stats.tx_bytes += skb->len;
+		vif->dev->stats.tx_packets++;
+		ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
+		goto out_free;
+	}
+#endif
+
+	ipv6h = ipv6_hdr(skb);
+
+	fl6 = (struct flowi6) {
+		.flowi6_oif = vif->link,
+		.daddr = ipv6h->daddr,
+	};
+
+	/*
+	 * ip6_route_output() does not return NULL; a failed lookup yields
+	 * a held dst whose ->error is set.  Drop the packet (and the
+	 * reference) in that case rather than counting it as sent and
+	 * pushing it through the FORWARD hook.
+	 */
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst->error) {
+		dst_release(dst);
+		goto out_free;
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	/*
+	 * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+	 * not only before forwarding, but after forwarding on all output
+	 * interfaces. It is clear, if mrouter runs a multicasting
+	 * program, it should receive packets not depending to what interface
+	 * program is joined.
+	 * If we will not make it, the program will have to join on all
+	 * interfaces. On the other hand, multihoming host (or router, but
+	 * not mrouter) cannot join to more than one interface - it will
+	 * result in receiving multiple packets.
+	 */
+	dev = vif->dev;
+	skb->dev = dev;
+	vif->pkt_out++;
+	vif->bytes_out += skb->len;
+
+	/* We are about to write */
+	/* XXX: extension headers? */
+	if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
+		goto out_free;
+
+	/* skb_cow() may have reallocated the header; fetch it again. */
+	ipv6h = ipv6_hdr(skb);
+	ipv6h->hop_limit--;
+
+	IP6CB(skb)->flags |= IP6SKB_FORWARDED;
+
+	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dev,
+		       ip6mr_forward2_finish);
+
+out_free:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ *	Return the highest mif index of @mrt whose device is @dev, or a
+ *	negative value when no mif uses that device.
+ */
+static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev)
+{
+	int idx = mrt->maxvif;
+
+	while (--idx >= 0) {
+		if (mrt->vif6_table[idx].dev == dev)
+			return idx;
+	}
+	return idx;
+}
+
+/*
+ *	Forward @skb according to the resolved cache entry @cache.
+ *
+ *	The entry's packet/byte counters are updated first.  A packet that
+ *	did not arrive on the entry's parent mif is dropped (optionally
+ *	reporting MRT6MSG_WRONGMIF to user space).  Otherwise it is sent
+ *	out of every mif in [minvif, maxvif) whose threshold is below the
+ *	packet's hop limit.  The skb is always consumed; returns 0.
+ */
+static int ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+			  struct sk_buff *skb, struct mfc6_cache *cache)
+{
+	int psend = -1;
+	int vif, ct;
+
+	vif = cache->mf6c_parent;
+	cache->mfc_un.res.pkt++;
+	cache->mfc_un.res.bytes += skb->len;
+
+	/*
+	 * Wrong interface: drop packet and (maybe) send PIM assert.
+	 */
+	if (mrt->vif6_table[vif].dev != skb->dev) {
+		int true_vifi;
+
+		cache->mfc_un.res.wrong_if++;
+		true_vifi = ip6mr_find_vif(mrt, skb->dev);
+
+		if (true_vifi >= 0 && mrt->mroute_do_assert &&
+		    /* pimsm uses asserts, when switching from RPT to SPT,
+		       so that we cannot check that packet arrived on an oif.
+		       It is bad, but otherwise we would need to move pretty
+		       large chunk of pimd to kernel. Ough... --ANK
+		     */
+		    (mrt->mroute_do_pim ||
+		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
+		    time_after(jiffies,
+			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+			/* Rate-limit reports: at most one per MFC_ASSERT_THRESH. */
+			cache->mfc_un.res.last_assert = jiffies;
+			ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
+		}
+		goto dont_forward;
+	}
+
+	mrt->vif6_table[vif].pkt_in++;
+	mrt->vif6_table[vif].bytes_in += skb->len;
+
+	/*
+	 *	Forward the frame
+	 */
+	/*
+	 * psend trails one eligible mif behind the scan: each previously
+	 * found mif gets a clone, and the last one found gets the original
+	 * skb, so no clone is made for the final (or only) destination.
+	 */
+	for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) {
+		if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) {
+			if (psend != -1) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					ip6mr_forward2(net, mrt, skb2, cache, psend);
+			}
+			psend = ct;
+		}
+	}
+	if (psend != -1) {
+		ip6mr_forward2(net, mrt, skb, cache, psend);
+		return 0;
+	}
+
+dont_forward:
+	kfree_skb(skb);
+	return 0;
+}
+
+
+/*
+ *	Multicast packets for forwarding arrive here
+ */
+
+/*
+ *	Entry point for multicast packets to be forwarded.
+ *
+ *	Selects the table for @skb (keyed on input ifindex and mark) and
+ *	looks up the (source, destination) cache entry.  With an entry the
+ *	packet is forwarded; without one it is queued as unresolved if the
+ *	receiving device is a known mif, otherwise dropped.
+ *
+ *	Returns 0 after forwarding, the table-lookup error, the result of
+ *	ip6mr_cache_unresolved(), or -ENODEV when the device is not a mif.
+ */
+int ip6_mr_input(struct sk_buff *skb)
+{
+	struct mfc6_cache *cache;
+	struct net *net = dev_net(skb->dev);
+	struct mr6_table *mrt;
+	struct flowi6 fl6 = {
+		.flowi6_iif	= skb->dev->ifindex,
+		.flowi6_mark	= skb->mark,
+	};
+	int err;
+
+	err = ip6mr_fib_lookup(net, &fl6, &mrt);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	read_lock(&mrt_lock);
+	cache = ip6mr_cache_find(mrt,
+				 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+
+	/*
+	 *	No usable cache entry
+	 */
+	if (cache == NULL) {
+		int vif;
+
+		vif = ip6mr_find_vif(mrt, skb->dev);
+		if (vif >= 0) {
+			/* NOTE: this err shadows the function-scope err above. */
+			int err = ip6mr_cache_unresolved(mrt, vif, skb);
+			read_unlock(&mrt_lock);
+
+			return err;
+		}
+		read_unlock(&mrt_lock);
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	ip6_mr_forward(net, mrt, skb, cache);
+
+	read_unlock(&mrt_lock);
+
+	return 0;
+}
+
+
+/*
+ *	Append the routing attributes of cache entry @c to @skb: RTA_IIF
+ *	for the parent mif (when it exists) and an RTA_MULTIPATH attribute
+ *	holding one rtnexthop per existing output mif whose threshold is
+ *	below 255.  Also sets rtm->rtm_type to RTN_MULTICAST.
+ *
+ *	Returns 1 on success, -ENOENT when the parent index is out of
+ *	range (unresolved entry), or -EMSGSIZE after trimming @skb back to
+ *	its length on entry when there is not enough tailroom.
+ */
+static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+			       struct mfc6_cache *c, struct rtmsg *rtm)
+{
+	int ct;
+	struct rtnexthop *nhp;
+	u8 *b = skb_tail_pointer(skb);
+	struct rtattr *mp_head;
+
+	/* If cache is unresolved, don't try to parse IIF and OIF */
+	if (c->mf6c_parent >= MAXMIFS)
+		return -ENOENT;
+
+	/* NOTE(review): RTA_PUT is expected to jump to rtattr_failure when out of room - confirm against its definition. */
+	if (MIF_EXISTS(mrt, c->mf6c_parent))
+		RTA_PUT(skb, RTA_IIF, 4, &mrt->vif6_table[c->mf6c_parent].dev->ifindex);
+
+	/* Reserve the multipath attribute header; type and length are filled in below. */
+	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+				goto rtattr_failure;
+			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+			nhp->rtnh_flags = 0;
+			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+			nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex;
+			nhp->rtnh_len = sizeof(*nhp);
+		}
+	}
+	mp_head->rta_type = RTA_MULTIPATH;
+	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
+	rtm->rtm_type = RTN_MULTICAST;
+	return 1;
+
+rtattr_failure:
+	nlmsg_trim(skb, b);
+	return -EMSGSIZE;
+}
+
+/*
+ *	Fill @skb with the multicast route for the source/destination of
+ *	the rt6_info attached to @skb, using the default table.
+ *
+ *	When no cache entry exists and @nowait is zero, a dummy packet
+ *	carrying only an IPv6 header with those addresses is queued via
+ *	ip6mr_cache_unresolved().
+ *
+ *	Returns the __ip6mr_fill_mroute() result when an entry exists,
+ *	-ENOENT (no default table), -EAGAIN (no entry and @nowait set),
+ *	-ENODEV (skb has no device or it is not a mif), -ENOMEM, or the
+ *	result of ip6mr_cache_unresolved().
+ */
+int ip6mr_get_route(struct net *net,
+		    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+{
+	int err;
+	struct mr6_table *mrt;
+	struct mfc6_cache *cache;
+	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+
+	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+	if (mrt == NULL)
+		return -ENOENT;
+
+	read_lock(&mrt_lock);
+	cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
+
+	if (!cache) {
+		struct sk_buff *skb2;
+		struct ipv6hdr *iph;
+		struct net_device *dev;
+		int vif;
+
+		if (nowait) {
+			read_unlock(&mrt_lock);
+			return -EAGAIN;
+		}
+
+		dev = skb->dev;
+		if (dev == NULL || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
+			read_unlock(&mrt_lock);
+			return -ENODEV;
+		}
+
+		/* really correct? */
+		skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+		if (!skb2) {
+			read_unlock(&mrt_lock);
+			return -ENOMEM;
+		}
+
+		skb_reset_transport_header(skb2);
+
+		skb_put(skb2, sizeof(struct ipv6hdr));
+		skb_reset_network_header(skb2);
+
+		/* Header-only dummy packet: version 0, no payload, only the addresses are set. */
+		iph = ipv6_hdr(skb2);
+		iph->version = 0;
+		iph->priority = 0;
+		iph->flow_lbl[0] = 0;
+		iph->flow_lbl[1] = 0;
+		iph->flow_lbl[2] = 0;
+		iph->payload_len = 0;
+		iph->nexthdr = IPPROTO_NONE;
+		iph->hop_limit = 0;
+		ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr);
+		ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
+
+		err = ip6mr_cache_unresolved(mrt, vif, skb2);
+		read_unlock(&mrt_lock);
+
+		return err;
+	}
+
+	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+		cache->mfc_flags |= MFC_NOTIFY;
+
+	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
+	read_unlock(&mrt_lock);
+	return err;
+}
+
+/*
+ *	Build one RTM_NEWROUTE (NLM_F_MULTI) netlink message in @skb that
+ *	describes cache entry @c of table @mrt: the rtmsg header, RTA_TABLE,
+ *	RTA_SRC/RTA_DST (16-byte addresses, /128 prefixes) and the
+ *	attributes added by __ip6mr_fill_mroute().
+ *
+ *	Returns the nlmsg_end() result on success, or -EMSGSIZE when the
+ *	message does not fit or the entry cannot be described; in the
+ *	latter case a started message is cancelled.
+ */
+static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+			     u32 pid, u32 seq, struct mfc6_cache *c)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+
+	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	/*
+	 * This dump is registered for RTNL_FAMILY_IP6MR in ip6_mr_init(),
+	 * so report that family rather than the IPv4 RTNL_FAMILY_IPMR.
+	 */
+	rtm->rtm_family   = RTNL_FAMILY_IP6MR;
+	rtm->rtm_dst_len  = 128;
+	rtm->rtm_src_len  = 128;
+	rtm->rtm_tos      = 0;
+	rtm->rtm_table    = mrt->id;
+	/* NOTE(review): the NLA_PUT* macros are expected to jump to nla_put_failure when @skb is full - confirm. */
+	NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
+	rtm->rtm_protocol = RTPROT_UNSPEC;
+	rtm->rtm_flags    = 0;
+
+	NLA_PUT(skb, RTA_SRC, 16, &c->mf6c_origin);
+	NLA_PUT(skb, RTA_DST, 16, &c->mf6c_mcastgrp);
+
+	if (__ip6mr_fill_mroute(mrt, skb, c, rtm) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ *	Netlink dump callback: emit one message per cache entry of every
+ *	table in the requesting socket's namespace.
+ *
+ *	The dump is resumable.  cb->args[0], [1] and [2] hold the table
+ *	index, hash line and entry index at which the previous call
+ *	stopped; they are rewritten before returning.  Returns skb->len.
+ */
+static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct mr6_table *mrt;
+	struct mfc6_cache *mfc;
+	unsigned int t = 0, s_t;
+	unsigned int h = 0, s_h;
+	unsigned int e = 0, s_e;
+
+	s_t = cb->args[0];
+	s_h = cb->args[1];
+	s_e = cb->args[2];
+
+	read_lock(&mrt_lock);
+	ip6mr_for_each_table(mrt, net) {
+		/* Skip tables fully dumped by earlier calls. */
+		if (t < s_t)
+			goto next_table;
+		/* Past the resume table: start from the first hash line. */
+		if (t > s_t)
+			s_h = 0;
+		for (h = s_h; h < MFC6_LINES; h++) {
+			list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) {
+				if (e < s_e)
+					goto next_entry;
+				/* Message did not fit: stop and record the cursor. */
+				if (ip6mr_fill_mroute(mrt, skb,
+						      NETLINK_CB(cb->skb).pid,
+						      cb->nlh->nlmsg_seq,
+						      mfc) < 0)
+					goto done;
+next_entry:
+				e++;
+			}
+			/* The entry offset only applies to the line we resumed in. */
+			e = s_e = 0;
+		}
+		s_h = 0;
+next_table:
+		t++;
+	}
+done:
+	read_unlock(&mrt_lock);
+
+	cb->args[2] = e;
+	cb->args[1] = h;
+	cb->args[0] = t;
+
+	return skb->len;
+}
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
new file mode 100644
index 00000000..bba658d9
--- /dev/null
+++ b/net/ipv6/ipcomp6.c
@@ -0,0 +1,218 @@
+/*
+ * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Author	Mitsuru KANDA  <mk@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * [Memo]
+ *
+ * Outbound:
+ *  The compression of IP datagram MUST be done before AH/ESP processing,
+ *  fragmentation, and the addition of Hop-by-Hop/Routing header.
+ *
+ * Inbound:
+ *  The decompression of IP datagram MUST be done after the reassembly,
+ *  AH/ESP processing.
+ */
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ipcomp.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <net/icmp.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/mutex.h>
+
+/*
+ * ICMPv6 error handler for IPComp.  For destination-unreachable and
+ * packet-too-big errors, look up the SA matching the CPI found at @offset
+ * in the offending packet and log a debug message if one exists.
+ */
+static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+				u8 type, u8 code, int offset, __be32 info)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
+	struct ip_comp_hdr *comp_hdr =
+		(struct ip_comp_hdr *)(skb->data + offset);
+	struct xfrm_state *state;
+	__be32 spi;
+
+	switch (type) {
+	case ICMPV6_DEST_UNREACH:
+	case ICMPV6_PKT_TOOBIG:
+		break;
+	default:
+		return;
+	}
+
+	/* The 16-bit CPI is widened to the 32-bit SPI used for the lookup. */
+	spi = htonl(ntohs(comp_hdr->cpi));
+	state = xfrm_state_lookup(net, skb->mark,
+				  (const xfrm_address_t *)&ip6h->daddr,
+				  spi, IPPROTO_COMP, AF_INET6);
+	if (state == NULL)
+		return;
+
+	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI6\n",
+			spi, &ip6h->daddr);
+	xfrm_state_put(state);
+}
+
+/*
+ * Allocate and initialise an IPPROTO_IPV6 tunnel state derived from @x:
+ * same addresses, selector, mode and mark, with a freshly allocated SPI
+ * and tunnel_users set to 1.  Returns the new state, or NULL on failure.
+ */
+static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+	struct xfrm_state *tun;
+
+	tun = xfrm_state_alloc(net);
+	if (!tun)
+		return NULL;
+
+	tun->id.proto = IPPROTO_IPV6;
+	tun->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr);
+	if (!tun->id.spi)
+		goto discard;
+
+	memcpy(tun->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr));
+	memcpy(&tun->sel, &x->sel, sizeof(tun->sel));
+	tun->props.family = AF_INET6;
+	tun->props.mode = x->props.mode;
+	memcpy(tun->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
+	memcpy(&tun->mark, &x->mark, sizeof(tun->mark));
+
+	if (xfrm_init_state(tun))
+		goto discard;
+
+	atomic_set(&tun->tunnel_users, 1);
+	return tun;
+
+discard:
+	/* Mark the half-built state dead before dropping our reference. */
+	tun->km.state = XFRM_STATE_DEAD;
+	xfrm_state_put(tun);
+	return NULL;
+}
+
+/*
+ * Attach a tunnel state to @x: reuse the one found for x's source address
+ * SPI if there is one, otherwise create, insert and hold a new one.  The
+ * tunnel's user count is incremented.  Returns 0, or -EINVAL when a new
+ * tunnel state cannot be created.
+ */
+static int ipcomp6_tunnel_attach(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+	u32 mark = x->mark.m & x->mark.v;
+	struct xfrm_state *tun = NULL;
+	__be32 spi;
+
+	spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr);
+	if (spi)
+		tun = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr,
+					      spi, IPPROTO_IPV6, AF_INET6);
+	if (!tun) {
+		tun = ipcomp6_tunnel_create(x);
+		if (!tun)
+			return -EINVAL;
+
+		xfrm_state_insert(tun);
+		xfrm_state_hold(tun);
+	}
+
+	x->tunnel = tun;
+	atomic_inc(&tun->tunnel_users);
+	return 0;
+}
+
+/*
+ * xfrm init_state hook for IPComp over IPv6.  Sets header_len for the
+ * state's mode (an IPv6 header in tunnel mode, nothing in transport mode),
+ * runs the generic ipcomp_init_state() and, in tunnel mode, attaches the
+ * tunnel state.  Returns 0, -EINVAL for any other mode, or the error of
+ * the failing helper.
+ */
+static int ipcomp6_init_state(struct xfrm_state *x)
+{
+	int err;
+
+	x->props.header_len = 0;
+	switch (x->props.mode) {
+	case XFRM_MODE_TRANSPORT:
+		break;
+	case XFRM_MODE_TUNNEL:
+		x->props.header_len += sizeof(struct ipv6hdr);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	err = ipcomp_init_state(x);
+	if (err)
+		return err;
+
+	if (x->props.mode != XFRM_MODE_TUNNEL)
+		return 0;
+
+	return ipcomp6_tunnel_attach(x);
+}
+
+/*
+ * xfrm type descriptor for IPComp over IPv6: the IPv6-specific init_state
+ * above combined with the shared ipcomp_* input/output/destroy handlers.
+ */
+static const struct xfrm_type ipcomp6_type =
+{
+	.description	= "IPCOMP6",
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_COMP,
+	.init_state	= ipcomp6_init_state,
+	.destructor	= ipcomp_destroy,
+	.input		= ipcomp_input,
+	.output		= ipcomp_output,
+	.hdr_offset	= xfrm6_find_1stfragopt,
+};
+
+/* inet6 protocol hooks for IPPROTO_COMP: receive via xfrm6_rcv, errors via ipcomp6_err. */
+static const struct inet6_protocol ipcomp6_protocol =
+{
+	.handler	= xfrm6_rcv,
+	.err_handler	= ipcomp6_err,
+	.flags		= INET6_PROTO_NOPOLICY,
+};
+
+/*
+ * Module init: register the xfrm type, then the inet6 protocol handler.
+ * Returns 0 on success or -EAGAIN if either registration fails (the xfrm
+ * type is unregistered again if the protocol cannot be added).
+ */
+static int __init ipcomp6_init(void)
+{
+	if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) {
+		printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n");
+		goto fail;
+	}
+	if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) {
+		printk(KERN_INFO "ipcomp6 init: can't add protocol\n");
+		goto unregister_type;
+	}
+	return 0;
+
+unregister_type:
+	xfrm_unregister_type(&ipcomp6_type, AF_INET6);
+fail:
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: remove the protocol handler, then the xfrm type (reverse of
+ * ipcomp6_init()).  Failures are only logged.
+ */
+static void __exit ipcomp6_fini(void)
+{
+	if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0)
+		printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n");
+	if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
+		printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n");
+}
+
+/* Module entry/exit points and metadata. */
+module_init(ipcomp6_init);
+module_exit(ipcomp6_fini);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173");
+MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>");
+
+/* Module alias for the AF_INET6 / XFRM_PROTO_COMP xfrm type. */
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
new file mode 100644
index 00000000..147ede38
--- /dev/null
+++ b/net/ipv6/ipv6_sockglue.c
@@ -0,0 +1,1290 @@
+/*
+ *	IPv6 BSD socket options interface
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on linux/net/ipv4/ip_sockglue.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	FIXME: Make the setsockopt code POSIX compliant: That is
+ *
+ *	o	Truncate getsockopt returns
+ *	o	Return an optlen of the truncated length if need be
+ *
+ *	Changes:
+ *	David L Stevens <dlstevens@us.ibm.com>:
+ *		- added multicast source filtering API for MLDv2
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/mroute6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+
+#include <asm/uaccess.h>
+
+struct ip6_ra_chain *ip6_ra_chain;	/* head of the singly linked list edited by ip6_ra_control() */
+DEFINE_RWLOCK(ip6_ra_lock);	/* ip6_ra_control() write-locks this (BH off) around list edits */
+
+/* Add (sel >= 0) or remove (sel < 0) sk on the global ip6_ra_chain list. */
+int ip6_ra_control(struct sock *sk, int sel)
+{
+	struct ip6_ra_chain *cur, *fresh = NULL, **link = &ip6_ra_chain;
+
+	/* RA packet may be delivered ONLY to IPPROTO_RAW socket */
+	if (!(sk->sk_type == SOCK_RAW && inet_sk(sk)->inet_num == IPPROTO_RAW))
+		return -ENOPROTOOPT;
+	if (sel >= 0)
+		fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+	write_lock_bh(&ip6_ra_lock);
+	while ((cur = *link) != NULL) {
+		if (cur->sk != sk) {
+			link = &cur->next;
+			continue;
+		}
+		if (sel < 0)
+			*link = cur->next;	/* unlink the existing entry */
+		write_unlock_bh(&ip6_ra_lock);
+		if (sel >= 0) {
+			kfree(fresh);
+			return -EADDRINUSE;
+		}
+		sock_put(sk);
+		kfree(cur);
+		return 0;
+	}
+	if (fresh == NULL) {
+		write_unlock_bh(&ip6_ra_lock);
+		return -ENOBUFS;
+	}
+	fresh->sk = sk;
+	fresh->sel = sel;
+	fresh->next = cur;
+	*link = fresh;
+	sock_hold(sk);
+	write_unlock_bh(&ip6_ra_lock);
+	return 0;
+}
+
+/* Install opt as sk's IPv6 tx options and return the set it replaced. */
+static struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
+						  struct ipv6_txoptions *opt)
+{
+	if (!inet_sk(sk)->is_icsk) {
+		spin_lock(&sk->sk_dst_lock);
+		opt = xchg(&inet6_sk(sk)->opt, opt);
+		spin_unlock(&sk->sk_dst_lock);
+	} else {
+		int idle = (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE);
+
+		/* Neither listening nor closed: record ext header length, resync MSS. */
+		if (opt && !idle && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) {
+			struct inet_connection_sock *icsk = inet_csk(sk);
+			icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
+			icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+		}
+		opt = xchg(&inet6_sk(sk)->opt, opt);
+	}
+	sk_dst_reset(sk);
+	return opt;
+}
+
+/*
+ * Set one SOL_IPV6 option on sk; the switch runs under the socket lock.
+ * Returns 0 or a negative errno (-ENOPROTOOPT when no case accepts optname).
+ */
+static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	int val, valbool;
+	int retv = -ENOPROTOOPT;
+
+	val = 0;	/* stays 0 unless optval supplies at least an int */
+	if (optval != NULL && optlen >= sizeof(int) &&
+	    get_user(val, (int __user *) optval))
+		return -EFAULT;
+
+	valbool = (val!=0);	/* boolean view of val for the flag options */
+
+	if (ip6_mroute_opt(optname))
+		return ip6_mroute_setsockopt(sk, optname, optval, optlen);
+
+	lock_sock(sk);
+
+	switch (optname) {
+
+	case IPV6_ADDRFORM:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val == PF_INET) {
+			struct ipv6_txoptions *opt;
+			struct sk_buff *pktopt;
+
+			if (sk->sk_type == SOCK_RAW)
+				break;
+
+			if (sk->sk_protocol == IPPROTO_UDP ||
+			    sk->sk_protocol == IPPROTO_UDPLITE) {
+				struct udp_sock *up = udp_sk(sk);
+				if (up->pending == AF_INET6) {
+					retv = -EBUSY;
+					break;
+				}
+			} else if (sk->sk_protocol != IPPROTO_TCP)
+				break;
+
+			if (sk->sk_state != TCP_ESTABLISHED) {
+				retv = -ENOTCONN;
+				break;
+			}
+
+			if (ipv6_only_sock(sk) ||
+			    !ipv6_addr_v4mapped(&np->daddr)) {
+				retv = -EADDRNOTAVAIL;
+				break;
+			}
+
+			fl6_free_socklist(sk);
+			ipv6_sock_mc_close(sk);
+
+			/*
+			 * Sock is moving from IPv6 to IPv4 (sk_prot), so
+			 * remove it from the refcnt debug socks count in the
+			 * original family...
+			 */
+			sk_refcnt_debug_dec(sk);
+
+			if (sk->sk_protocol == IPPROTO_TCP) {
+				struct inet_connection_sock *icsk = inet_csk(sk);
+				local_bh_disable();
+				sock_prot_inuse_add(net, sk->sk_prot, -1);
+				sock_prot_inuse_add(net, &tcp_prot, 1);
+				local_bh_enable();
+				sk->sk_prot = &tcp_prot;
+				icsk->icsk_af_ops = &ipv4_specific;
+				sk->sk_socket->ops = &inet_stream_ops;
+				sk->sk_family = PF_INET;
+				tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+			} else {
+				struct proto *prot = &udp_prot;
+
+				if (sk->sk_protocol == IPPROTO_UDPLITE)
+					prot = &udplite_prot;
+				local_bh_disable();
+				sock_prot_inuse_add(net, sk->sk_prot, -1);
+				sock_prot_inuse_add(net, prot, 1);
+				local_bh_enable();
+				sk->sk_prot = prot;
+				sk->sk_socket->ops = &inet_dgram_ops;
+				sk->sk_family = PF_INET;
+			}
+			opt = xchg(&np->opt, NULL);
+			if (opt)
+				sock_kfree_s(sk, opt, opt->tot_len);
+			pktopt = xchg(&np->pktoptions, NULL);
+			kfree_skb(pktopt);
+
+			sk->sk_destruct = inet_sock_destruct;
+			/*
+			 * ... and add it to the refcnt debug socks count
+			 * in the new family. -acme
+			 */
+			sk_refcnt_debug_inc(sk);
+			module_put(THIS_MODULE);
+			retv = 0;
+			break;
+		}
+		goto e_inval;
+
+	case IPV6_V6ONLY:
+		if (optlen < sizeof(int) ||
+		    inet_sk(sk)->inet_num)
+			goto e_inval;
+		np->ipv6only = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVPKTINFO:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxinfo = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_2292PKTINFO:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxoinfo = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVHOPLIMIT:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxhlim = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_2292HOPLIMIT:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxohlim = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVRTHDR:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.srcrt = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_2292RTHDR:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.osrcrt = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVHOPOPTS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.hopopts = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_2292HOPOPTS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.ohopopts = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVDSTOPTS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.dstopts = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_2292DSTOPTS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.odstopts = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_TCLASS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val < -1 || val > 0xff)
+			goto e_inval;
+		/* RFC 3542, 6.5: default traffic class of 0x0 */
+		if (val == -1)
+			val = 0;
+		np->tclass = val;
+		retv = 0;
+		break;
+
+	case IPV6_RECVTCLASS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxtclass = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_FLOWINFO:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxflow = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVPATHMTU:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxpmtu = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_TRANSPARENT:
+		if (!capable(CAP_NET_ADMIN)) {
+			retv = -EPERM;
+			break;
+		}
+		if (optlen < sizeof(int))
+			goto e_inval;
+		/* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
+		inet_sk(sk)->transparent = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVORIGDSTADDR:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxorigdstaddr = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_HOPOPTS:
+	case IPV6_RTHDRDSTOPTS:
+	case IPV6_RTHDR:
+	case IPV6_DSTOPTS:
+	{
+		struct ipv6_txoptions *opt;
+
+		/* remove any sticky options header with a zero option
+		 * length, per RFC3542.
+		 */
+		if (optlen == 0)
+			optval = NULL;
+		else if (optval == NULL)
+			goto e_inval;
+		else if (optlen < sizeof(struct ipv6_opt_hdr) ||
+			 optlen & 0x7 || optlen > 8 * 255)
+			goto e_inval;
+
+		/* hop-by-hop / destination options are privileged option */
+		retv = -EPERM;
+		if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW))
+			break;
+
+		opt = ipv6_renew_options(sk, np->opt, optname,
+					 (struct ipv6_opt_hdr __user *)optval,
+					 optlen);
+		if (IS_ERR(opt)) {
+			retv = PTR_ERR(opt);
+			break;
+		}
+
+		/* routing header option needs extra check */
+		retv = -EINVAL;
+		if (optname == IPV6_RTHDR && opt && opt->srcrt) {
+			struct ipv6_rt_hdr *rthdr = opt->srcrt;
+			switch (rthdr->type) {
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+			case IPV6_SRCRT_TYPE_2:
+				if (rthdr->hdrlen != 2 ||
+				    rthdr->segments_left != 1)
+					goto sticky_done;
+
+				break;
+#endif
+			default:
+				goto sticky_done;
+			}
+		}
+
+		retv = 0;
+		opt = ipv6_update_options(sk, opt);	/* opt now holds the replaced set */
+sticky_done:
+		if (opt)
+			sock_kfree_s(sk, opt, opt->tot_len);
+		break;
+	}
+
+	case IPV6_PKTINFO:
+	{
+		struct in6_pktinfo pkt;
+
+		if (optlen == 0)
+			goto e_inval;
+		else if (optlen < sizeof(struct in6_pktinfo) || optval == NULL)
+			goto e_inval;
+
+		if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) {
+			retv = -EFAULT;
+			break;
+		}
+		if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if)
+			goto e_inval;
+
+		np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
+		ipv6_addr_copy(&np->sticky_pktinfo.ipi6_addr, &pkt.ipi6_addr);
+		retv = 0;
+		break;
+	}
+
+	case IPV6_2292PKTOPTIONS:
+	{
+		struct ipv6_txoptions *opt = NULL;
+		struct msghdr msg;
+		struct flowi6 fl6;
+		int junk;
+
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_oif = sk->sk_bound_dev_if;
+		fl6.flowi6_mark = sk->sk_mark;
+
+		if (optlen == 0)
+			goto update;
+
+		/* 1K is probably excessive
+		 * 1K is surely not enough, 2K per standard header is 16K.
+		 */
+		retv = -EINVAL;
+		if (optlen > 64*1024)
+			break;
+
+		opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
+		retv = -ENOBUFS;
+		if (opt == NULL)
+			break;
+
+		memset(opt, 0, sizeof(*opt));
+		opt->tot_len = sizeof(*opt) + optlen;
+		retv = -EFAULT;
+		if (copy_from_user(opt+1, optval, optlen))
+			goto done;
+
+		msg.msg_controllen = optlen;
+		msg.msg_control = (void*)(opt+1);
+
+		retv = datagram_send_ctl(net, &msg, &fl6, opt, &junk, &junk,
+					 &junk);
+		if (retv)
+			goto done;
+update:
+		retv = 0;
+		opt = ipv6_update_options(sk, opt);	/* opt now holds the replaced set */
+done:
+		if (opt)
+			sock_kfree_s(sk, opt, opt->tot_len);
+		break;
+	}
+	case IPV6_UNICAST_HOPS:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val > 255 || val < -1)
+			goto e_inval;
+		np->hop_limit = val;
+		retv = 0;
+		break;
+
+	case IPV6_MULTICAST_HOPS:
+		if (sk->sk_type == SOCK_STREAM)
+			break;
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val > 255 || val < -1)
+			goto e_inval;
+		np->mcast_hops = val;
+		retv = 0;
+		break;
+
+	case IPV6_MULTICAST_LOOP:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val != valbool)
+			goto e_inval;
+		np->mc_loop = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_MULTICAST_IF:
+		if (sk->sk_type == SOCK_STREAM)
+			break;
+		if (optlen < sizeof(int))
+			goto e_inval;
+
+		if (val) {
+			struct net_device *dev;
+
+			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val)
+				goto e_inval;
+
+			dev = dev_get_by_index(net, val);
+			if (!dev) {
+				retv = -ENODEV;
+				break;
+			}
+			dev_put(dev);	/* only needed to prove the index exists */
+		}
+		np->mcast_oif = val;
+		retv = 0;
+		break;
+	case IPV6_ADD_MEMBERSHIP:
+	case IPV6_DROP_MEMBERSHIP:
+	{
+		struct ipv6_mreq mreq;
+
+		if (optlen < sizeof(struct ipv6_mreq))
+			goto e_inval;
+
+		retv = -EPROTO;
+		if (inet_sk(sk)->is_icsk)
+			break;
+
+		retv = -EFAULT;
+		if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+			break;
+
+		if (optname == IPV6_ADD_MEMBERSHIP)
+			retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr);
+		else
+			retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr);
+		break;
+	}
+	case IPV6_JOIN_ANYCAST:
+	case IPV6_LEAVE_ANYCAST:
+	{
+		struct ipv6_mreq mreq;
+
+		if (optlen < sizeof(struct ipv6_mreq))
+			goto e_inval;
+
+		retv = -EFAULT;
+		if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+			break;
+
+		if (optname == IPV6_JOIN_ANYCAST)
+			retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
+		else
+			retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
+		break;
+	}
+	case MCAST_JOIN_GROUP:
+	case MCAST_LEAVE_GROUP:
+	{
+		struct group_req greq;
+		struct sockaddr_in6 *psin6;
+
+		if (optlen < sizeof(struct group_req))
+			goto e_inval;
+
+		retv = -EFAULT;
+		if (copy_from_user(&greq, optval, sizeof(struct group_req)))
+			break;
+		if (greq.gr_group.ss_family != AF_INET6) {
+			retv = -EADDRNOTAVAIL;
+			break;
+		}
+		psin6 = (struct sockaddr_in6 *)&greq.gr_group;
+		if (optname == MCAST_JOIN_GROUP)
+			retv = ipv6_sock_mc_join(sk, greq.gr_interface,
+				&psin6->sin6_addr);
+		else
+			retv = ipv6_sock_mc_drop(sk, greq.gr_interface,
+				&psin6->sin6_addr);
+		break;
+	}
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+	{
+		struct group_source_req greqs;
+		int omode, add;
+
+		if (optlen < sizeof(struct group_source_req))
+			goto e_inval;
+		if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+			retv = -EFAULT;
+			break;
+		}
+		if (greqs.gsr_group.ss_family != AF_INET6 ||
+		    greqs.gsr_source.ss_family != AF_INET6) {
+			retv = -EADDRNOTAVAIL;
+			break;
+		}
+		if (optname == MCAST_BLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 1;
+		} else if (optname == MCAST_UNBLOCK_SOURCE) {
+			omode = MCAST_EXCLUDE;
+			add = 0;
+		} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+			struct sockaddr_in6 *psin6;
+
+			psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
+			retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
+				&psin6->sin6_addr);
+			/* prior join w/ different source is ok */
+			if (retv && retv != -EADDRINUSE)
+				break;
+			omode = MCAST_INCLUDE;
+			add = 1;
+		} else /* MCAST_LEAVE_SOURCE_GROUP */ {
+			omode = MCAST_INCLUDE;
+			add = 0;
+		}
+		retv = ip6_mc_source(add, omode, sk, &greqs);
+		break;
+	}
+	case MCAST_MSFILTER:
+	{
+		extern int sysctl_mld_max_msf;
+		struct group_filter *gsf;
+
+		if (optlen < GROUP_FILTER_SIZE(0))
+			goto e_inval;
+		if (optlen > sysctl_optmem_max) {
+			retv = -ENOBUFS;
+			break;
+		}
+		gsf = kmalloc(optlen,GFP_KERNEL);
+		if (!gsf) {
+			retv = -ENOBUFS;
+			break;
+		}
+		retv = -EFAULT;
+		if (copy_from_user(gsf, optval, optlen)) {
+			kfree(gsf);
+			break;
+		}
+		/* numsrc >= (4G-140)/128 overflow in 32 bits */
+		if (gsf->gf_numsrc >= 0x1ffffffU ||
+		    gsf->gf_numsrc > sysctl_mld_max_msf) {
+			kfree(gsf);
+			retv = -ENOBUFS;
+			break;
+		}
+		if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+			kfree(gsf);
+			retv = -EINVAL;
+			break;
+		}
+		retv = ip6_mc_msfilter(sk, gsf);
+		kfree(gsf);
+
+		break;
+	}
+	case IPV6_ROUTER_ALERT:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		retv = ip6_ra_control(sk, val);
+		break;
+	case IPV6_MTU_DISCOVER:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
+			goto e_inval;
+		np->pmtudisc = val;
+		retv = 0;
+		break;
+	case IPV6_MTU:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val && val < IPV6_MIN_MTU)
+			goto e_inval;
+		np->frag_size = val;
+		retv = 0;
+		break;
+	case IPV6_RECVERR:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->recverr = valbool;
+		if (!val)
+			skb_queue_purge(&sk->sk_error_queue);
+		retv = 0;
+		break;
+	case IPV6_FLOWINFO_SEND:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->sndflow = valbool;
+		retv = 0;
+		break;
+	case IPV6_FLOWLABEL_MGR:
+		retv = ipv6_flowlabel_opt(sk, optval, optlen);
+		break;
+	case IPV6_IPSEC_POLICY:
+	case IPV6_XFRM_POLICY:
+		retv = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		retv = xfrm_user_policy(sk, optname, optval, optlen);
+		break;
+
+	case IPV6_ADDR_PREFERENCES:
+	    {
+		unsigned int pref = 0;
+		unsigned int prefmask = ~0;
+
+		if (optlen < sizeof(int))
+			goto e_inval;
+
+		retv = -EINVAL;
+
+		/* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
+		switch (val & (IPV6_PREFER_SRC_PUBLIC|
+			       IPV6_PREFER_SRC_TMP|
+			       IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
+		case IPV6_PREFER_SRC_PUBLIC:
+			pref |= IPV6_PREFER_SRC_PUBLIC;
+			break;
+		case IPV6_PREFER_SRC_TMP:
+			pref |= IPV6_PREFER_SRC_TMP;
+			break;
+		case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
+			break;
+		case 0:
+			goto pref_skip_pubtmp;
+		default:
+			goto e_inval;
+		}
+
+		prefmask &= ~(IPV6_PREFER_SRC_PUBLIC|
+			      IPV6_PREFER_SRC_TMP);
+pref_skip_pubtmp:
+
+		/* check HOME/COA conflicts */
+		switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) {
+		case IPV6_PREFER_SRC_HOME:
+			break;
+		case IPV6_PREFER_SRC_COA:
+			pref |= IPV6_PREFER_SRC_COA;	/* fall through */
+		case 0:
+			goto pref_skip_coa;
+		default:
+			goto e_inval;
+		}
+
+		prefmask &= ~IPV6_PREFER_SRC_COA;
+pref_skip_coa:
+
+		/* check CGA/NONCGA conflicts */
+		switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
+		case IPV6_PREFER_SRC_CGA:
+		case IPV6_PREFER_SRC_NONCGA:
+		case 0:
+			break;
+		default:
+			goto e_inval;
+		}
+
+		np->srcprefs = (np->srcprefs & prefmask) | pref;
+		retv = 0;
+
+		break;
+	    }
+	case IPV6_MINHOPCOUNT:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		if (val < 0 || val > 255)
+			goto e_inval;
+		np->min_hopcount = val;
+		retv = 0;
+		break;
+	case IPV6_DONTFRAG:
+		np->dontfrag = valbool;
+		retv = 0;
+		break;
+	}
+
+	release_sock(sk);
+
+	return retv;
+
+e_inval:
+	release_sock(sk);
+	return -EINVAL;
+}
+
+/* SOL_IPV6 setsockopt entry; SOL_IP on a non-raw socket is passed to udp_prot. */
+int ipv6_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, unsigned int optlen)
+{
+	int rc;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
+		return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+
+	if (level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	rc = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* -ENOPROTOOPT, except for the two policy options: let netfilter try. */
+	if (rc != -ENOPROTOOPT)
+		return rc;
+	if (optname == IPV6_IPSEC_POLICY || optname == IPV6_XFRM_POLICY)
+		return rc;
+	lock_sock(sk);
+	rc = nf_setsockopt(sk, PF_INET6, optname, optval, optlen);
+	release_sock(sk);
+#endif
+	return rc;
+}
+EXPORT_SYMBOL(ipv6_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/* CONFIG_COMPAT counterpart of ipv6_setsockopt(). */
+int compat_ipv6_setsockopt(struct sock *sk, int level, int optname,
+			   char __user *optval, unsigned int optlen)
+{
+	int rc;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
+		if (udp_prot.compat_setsockopt)
+			return udp_prot.compat_setsockopt(sk, level, optname,
+							  optval, optlen);
+		return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+	}
+	if (level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	/* The MCAST_* range goes via compat_mc_setsockopt() and the native entry. */
+	if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
+		return compat_mc_setsockopt(sk, level, optname, optval, optlen,
+					    ipv6_setsockopt);
+
+	rc = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_NETFILTER
+	/* -ENOPROTOOPT, except for the two policy options: let netfilter try. */
+	if (rc != -ENOPROTOOPT)
+		return rc;
+	if (optname == IPV6_IPSEC_POLICY || optname == IPV6_XFRM_POLICY)
+		return rc;
+	lock_sock(sk);
+	rc = compat_nf_setsockopt(sk, PF_INET6, optname, optval, optlen);
+	release_sock(sk);
+#endif
+	return rc;
+}
+EXPORT_SYMBOL(compat_ipv6_setsockopt);
+#endif
+
+/*
+ * Copy the sticky extension header selected by optname from opt to optval.
+ * Returns the number of bytes copied (0 when opt or that header is absent),
+ * -EFAULT if the copy faults, or -EINVAL for an unexpected optname.
+ */
+static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
+				  int optname, char __user *optval, int len)
+{
+	struct ipv6_opt_hdr *hdr;
+
+	if (opt == NULL)
+		return 0;
+
+	if (optname == IPV6_HOPOPTS)
+		hdr = opt->hopopt;
+	else if (optname == IPV6_RTHDRDSTOPTS)
+		hdr = opt->dst0opt;
+	else if (optname == IPV6_RTHDR)
+		hdr = (struct ipv6_opt_hdr *)opt->srcrt;
+	else if (optname == IPV6_DSTOPTS)
+		hdr = opt->dst1opt;
+	else
+		return -EINVAL;	/* should not happen */
+
+	if (hdr == NULL)
+		return 0;
+
+	/* Copy at most len bytes, and never more than the header occupies. */
+	len = min_t(unsigned int, len, ipv6_optlen(hdr));
+	if (copy_to_user(optval, hdr, len))
+		return -EFAULT;
+	return len;
+}
+
+static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen, unsigned flags)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	int len;
+	int val;
+	/* Most cases load an int into val and break to the shared copy-out at the end. */
+	if (ip6_mroute_opt(optname))
+		return ip6_mroute_getsockopt(sk, optname, optval, optlen);
+
+	if (get_user(len, optlen))	/* len = size the caller reports for optval */
+		return -EFAULT;
+	switch (optname) {
+	case IPV6_ADDRFORM:
+		if (sk->sk_protocol != IPPROTO_UDP &&
+		    sk->sk_protocol != IPPROTO_UDPLITE &&
+		    sk->sk_protocol != IPPROTO_TCP)
+			return -ENOPROTOOPT;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -ENOTCONN;
+		val = sk->sk_family;
+		break;
+	case MCAST_MSFILTER:
+	{
+		struct group_filter gsf;
+		int err;
+
+		if (len < GROUP_FILTER_SIZE(0))
+			return -EINVAL;
+		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))	/* only GROUP_FILTER_SIZE(0) bytes are read */
+			return -EFAULT;
+		if (gsf.gf_group.ss_family != AF_INET6)
+			return -EADDRNOTAVAIL;
+		lock_sock(sk);
+		err = ip6_mc_msfget(sk, &gsf,
+			(struct group_filter __user *)optval, optlen);
+		release_sock(sk);
+		return err;
+	}
+
+	case IPV6_2292PKTOPTIONS:
+	{
+		struct msghdr msg;
+		struct sk_buff *skb;
+
+		if (sk->sk_type != SOCK_STREAM)
+			return -ENOPROTOOPT;
+
+		msg.msg_control = optval;
+		msg.msg_controllen = len;
+		msg.msg_flags = flags;	/* 0 or MSG_CMSG_COMPAT, as passed by the two callers */
+
+		lock_sock(sk);
+		skb = np->pktoptions;
+		if (skb)
+			atomic_inc(&skb->users);	/* reference taken so skb is usable after unlock */
+		release_sock(sk);
+
+		if (skb) {
+			int err = datagram_recv_ctl(sk, &msg, skb);
+			kfree_skb(skb);
+			if (err)
+				return err;
+		} else {	/* no stored packet options: build cmsgs from socket state */
+			if (np->rxopt.bits.rxinfo) {
+				struct in6_pktinfo src_info;
+				src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
+					np->sticky_pktinfo.ipi6_ifindex;
+				np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) :
+					ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr));
+				put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
+			}
+			if (np->rxopt.bits.rxhlim) {
+				int hlim = np->mcast_hops;
+				put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
+			}
+			if (np->rxopt.bits.rxoinfo) {
+				struct in6_pktinfo src_info;
+				src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
+					np->sticky_pktinfo.ipi6_ifindex;
+				np->mcast_oif? ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr) :
+					ipv6_addr_copy(&src_info.ipi6_addr, &(np->sticky_pktinfo.ipi6_addr));
+				put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
+			}
+			if (np->rxopt.bits.rxohlim) {
+				int hlim = np->mcast_hops;
+				put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
+			}
+		}
+		len -= msg.msg_controllen;	/* NOTE(review): presumably bytes written, if put_cmsg() shrinks msg_controllen - confirm */
+		return put_user(len, optlen);
+	}
+	case IPV6_MTU:
+	{
+		struct dst_entry *dst;
+
+		val = 0;
+		rcu_read_lock();
+		dst = __sk_dst_get(sk);
+		if (dst)
+			val = dst_mtu(dst);
+		rcu_read_unlock();
+		if (!val)	/* no dst on the socket (or it reported 0) */
+			return -ENOTCONN;
+		break;
+	}
+
+	case IPV6_V6ONLY:
+		val = np->ipv6only;
+		break;
+
+	case IPV6_RECVPKTINFO:
+		val = np->rxopt.bits.rxinfo;
+		break;
+
+	case IPV6_2292PKTINFO:
+		val = np->rxopt.bits.rxoinfo;
+		break;
+
+	case IPV6_RECVHOPLIMIT:
+		val = np->rxopt.bits.rxhlim;
+		break;
+
+	case IPV6_2292HOPLIMIT:
+		val = np->rxopt.bits.rxohlim;
+		break;
+
+	case IPV6_RECVRTHDR:
+		val = np->rxopt.bits.srcrt;
+		break;
+
+	case IPV6_2292RTHDR:
+		val = np->rxopt.bits.osrcrt;
+		break;
+
+	case IPV6_HOPOPTS:
+	case IPV6_RTHDRDSTOPTS:
+	case IPV6_RTHDR:
+	case IPV6_DSTOPTS:
+	{
+
+		lock_sock(sk);
+		len = ipv6_getsockopt_sticky(sk, np->opt,
+					     optname, optval, len);
+		release_sock(sk);
+		/* check if ipv6_getsockopt_sticky() returns err code */
+		if (len < 0)
+			return len;
+		return put_user(len, optlen);
+	}
+
+	case IPV6_RECVHOPOPTS:
+		val = np->rxopt.bits.hopopts;
+		break;
+
+	case IPV6_2292HOPOPTS:
+		val = np->rxopt.bits.ohopopts;
+		break;
+
+	case IPV6_RECVDSTOPTS:
+		val = np->rxopt.bits.dstopts;
+		break;
+
+	case IPV6_2292DSTOPTS:
+		val = np->rxopt.bits.odstopts;
+		break;
+
+	case IPV6_TCLASS:
+		val = np->tclass;
+		break;
+
+	case IPV6_RECVTCLASS:
+		val = np->rxopt.bits.rxtclass;
+		break;
+
+	case IPV6_FLOWINFO:
+		val = np->rxopt.bits.rxflow;
+		break;
+
+	case IPV6_RECVPATHMTU:
+		val = np->rxopt.bits.rxpmtu;
+		break;
+
+	case IPV6_PATHMTU:
+	{
+		struct dst_entry *dst;
+		struct ip6_mtuinfo mtuinfo;
+
+		if (len < sizeof(mtuinfo))
+			return -EINVAL;
+
+		len = sizeof(mtuinfo);
+		memset(&mtuinfo, 0, sizeof(mtuinfo));
+
+		rcu_read_lock();
+		dst = __sk_dst_get(sk);
+		if (dst)
+			mtuinfo.ip6m_mtu = dst_mtu(dst);
+		rcu_read_unlock();
+		if (!mtuinfo.ip6m_mtu)
+			return -ENOTCONN;
+
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &mtuinfo, len))
+			return -EFAULT;
+
+		return 0;
+		break;	/* not reached */
+	}
+
+	case IPV6_TRANSPARENT:
+		val = inet_sk(sk)->transparent;
+		break;
+
+	case IPV6_RECVORIGDSTADDR:
+		val = np->rxopt.bits.rxorigdstaddr;
+		break;
+
+	case IPV6_UNICAST_HOPS:
+	case IPV6_MULTICAST_HOPS:
+	{
+		struct dst_entry *dst;
+
+		if (optname == IPV6_UNICAST_HOPS)
+			val = np->hop_limit;
+		else
+			val = np->mcast_hops;
+
+		if (val < 0) {	/* negative stored value: ask the socket's dst, if any */
+			rcu_read_lock();
+			dst = __sk_dst_get(sk);
+			if (dst)
+				val = ip6_dst_hoplimit(dst);
+			rcu_read_unlock();
+		}
+
+		if (val < 0)	/* still negative: use devconf_all of the socket's netns */
+			val = sock_net(sk)->ipv6.devconf_all->hop_limit;
+		break;
+	}
+
+	case IPV6_MULTICAST_LOOP:
+		val = np->mc_loop;
+		break;
+
+	case IPV6_MULTICAST_IF:
+		val = np->mcast_oif;
+		break;
+
+	case IPV6_MTU_DISCOVER:
+		val = np->pmtudisc;
+		break;
+
+	case IPV6_RECVERR:
+		val = np->recverr;
+		break;
+
+	case IPV6_FLOWINFO_SEND:
+		val = np->sndflow;
+		break;
+
+	case IPV6_ADDR_PREFERENCES:
+		val = 0;
+
+		if (np->srcprefs & IPV6_PREFER_SRC_TMP)
+			val |= IPV6_PREFER_SRC_TMP;
+		else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC)
+			val |= IPV6_PREFER_SRC_PUBLIC;
+		else {
+			/* XXX: should we return system default? */
+			val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
+		}
+
+		if (np->srcprefs & IPV6_PREFER_SRC_COA)
+			val |= IPV6_PREFER_SRC_COA;
+		else
+			val |= IPV6_PREFER_SRC_HOME;
+		break;
+
+	case IPV6_MINHOPCOUNT:
+		val = np->min_hopcount;
+		break;
+
+	case IPV6_DONTFRAG:
+		val = np->dontfrag;
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+	len = min_t(unsigned int, sizeof(int), len);	/* never copy out more than sizeof(int) */
+	if(put_user(len, optlen))
+		return -EFAULT;
+	if(copy_to_user(optval,&val,len))
+		return -EFAULT;
+	return 0;
+}
+
+/* SOL_IPV6 getsockopt entry; SOL_IP on a non-raw socket is passed to udp_prot. */
+int ipv6_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	int rc;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW)
+		return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+
+	if (level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	rc = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_NETFILTER
+	/* -ENOPROTOOPT for anything but IPV6_2292PKTOPTIONS: let netfilter try. */
+	if (optname != IPV6_2292PKTOPTIONS && rc == -ENOPROTOOPT) {
+		int nf_len;
+
+		if (get_user(nf_len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		rc = nf_getsockopt(sk, PF_INET6, optname, optval, &nf_len);
+		release_sock(sk);
+		if (rc < 0)
+			return rc;
+		rc = put_user(nf_len, optlen);
+	}
+#endif
+	return rc;
+}
+EXPORT_SYMBOL(ipv6_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/* CONFIG_COMPAT counterpart of ipv6_getsockopt(). */
+int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
+			   char __user *optval, int __user *optlen)
+{
+	int rc;
+
+	if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
+		if (udp_prot.compat_getsockopt)
+			return udp_prot.compat_getsockopt(sk, level, optname,
+							  optval, optlen);
+		return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+	}
+	if (level != SOL_IPV6)
+		return -ENOPROTOOPT;
+
+	if (optname == MCAST_MSFILTER)
+		return compat_mc_getsockopt(sk, level, optname, optval, optlen,
+					    ipv6_getsockopt);
+
+	rc = do_ipv6_getsockopt(sk, level, optname, optval, optlen,
+				MSG_CMSG_COMPAT);
+#ifdef CONFIG_NETFILTER
+	/* -ENOPROTOOPT for anything but IPV6_2292PKTOPTIONS: let netfilter try. */
+	if (optname != IPV6_2292PKTOPTIONS && rc == -ENOPROTOOPT) {
+		int nf_len;
+
+		if (get_user(nf_len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		rc = compat_nf_getsockopt(sk, PF_INET6, optname, optval,
+					  &nf_len);
+		release_sock(sk);
+		if (rc < 0)
+			return rc;
+		rc = put_user(nf_len, optlen);
+	}
+#endif
+	return rc;
+}
+EXPORT_SYMBOL(compat_ipv6_getsockopt);
+#endif
+
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
new file mode 100644
index 00000000..f2d74ea1
--- /dev/null
+++ b/net/ipv6/mcast.c
@@ -0,0 +1,2678 @@
+/*
+ *	Multicast support for IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/* Changes:
+ *
+ *	yoshfuji	: fix format of router-alert option
+ *	YOSHIFUJI Hideaki @USAGI:
+ *		Fixed source address for MLD message based on
+ *		<draft-ietf-magma-mld-source-05.txt>.
+ *	YOSHIFUJI Hideaki @USAGI:
+ *		- Ignore Queries for invalid addresses.
+ *		- MLD for link-local addresses.
+ *	David L Stevens <dlstevens@us.ibm.com>:
+ *		- MLDv2 support
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/times.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/route.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <net/mld.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/if_inet6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/inet_common.h>
+
+#include <net/ip6_checksum.h>
+
+/* Set to 3 to get tracing... */
+#define MCAST_DEBUG 2
+
+#if MCAST_DEBUG >= 3
+#define MDBG(x) printk x
+#else
+#define MDBG(x)
+#endif
+
+/* Build-time check that the listed members start at 4-byte-aligned offsets (keeps struct in6_addr 32-bit aligned). */
+static void *__mld2_query_bugs[] __attribute__((__unused__)) = {
+	BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4),
+	BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4),
+	BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4)
+};
+
+static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
+
+/* Big mc list lock for all the sockets */
+static DEFINE_SPINLOCK(ipv6_sk_mc_lock);
+
+static void igmp6_join_group(struct ifmcaddr6 *ma);
+static void igmp6_leave_group(struct ifmcaddr6 *ma);
+static void igmp6_timer_handler(unsigned long data);
+
+static void mld_gq_timer_expire(unsigned long data);
+static void mld_ifc_timer_expire(unsigned long data);
+static void mld_ifc_event(struct inet6_dev *idev);
+static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
+static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr);
+static void mld_clear_delrec(struct inet6_dev *idev);
+static int sf_setstate(struct ifmcaddr6 *pmc);
+static void sf_markstate(struct ifmcaddr6 *pmc);
+static void ip6_mc_clear_src(struct ifmcaddr6 *pmc);
+static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+			  int sfmode, int sfcount, const struct in6_addr *psfsrc,
+			  int delta);
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+			  int sfmode, int sfcount, const struct in6_addr *psfsrc,
+			  int delta);
+static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
+			    struct inet6_dev *idev);
+
+
+#define IGMP6_UNSOLICITED_IVAL	(10*HZ)
+#define MLD_QRV_DEFAULT		2
+/* True when force_mld_version is 1 (all or this device) or mc_v1_seen is set and still in the future. */
+#define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \
+		(idev)->cnf.force_mld_version == 1 || \
+		((idev)->mc_v1_seen && \
+		time_before(jiffies, (idev)->mc_v1_seen)))
+
+#define IPV6_MLD_MAX_MSF	64
+
+int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
+
+/*
+ *	socket join on multicast group
+ */
+/* Walk np->ipv6_mc_list via rcu_dereference(); pmc is NULL when the loop runs to the end. */
+#define for_each_pmc_rcu(np, pmc)				\
+	for (pmc = rcu_dereference(np->ipv6_mc_list);		\
+	     pmc != NULL;					\
+	     pmc = rcu_dereference(pmc->next))
+
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{	/* returns 0, or -EINVAL / -EADDRINUSE / -ENOMEM / -ENODEV / ipv6_dev_mc_inc() error */
+	struct net_device *dev = NULL;
+	struct ipv6_mc_socklist *mc_lst;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	int err;
+
+	if (!ipv6_addr_is_multicast(addr))
+		return -EINVAL;
+
+	rcu_read_lock();
+	for_each_pmc_rcu(np, mc_lst) {	/* refuse a second join of the same group */
+		if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+		    ipv6_addr_equal(&mc_lst->addr, addr)) {
+			rcu_read_unlock();
+			return -EADDRINUSE;
+		}
+	}
+	rcu_read_unlock();
+
+	mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
+
+	if (mc_lst == NULL)
+		return -ENOMEM;
+
+	mc_lst->next = NULL;
+	ipv6_addr_copy(&mc_lst->addr, addr);
+
+	rcu_read_lock();
+	if (ifindex == 0) {	/* no interface given: use the device of the route to addr */
+		struct rt6_info *rt;
+		rt = rt6_lookup(net, addr, NULL, 0, 0);
+		if (rt) {
+			dev = rt->rt6i_dev;
+			dst_release(&rt->dst);
+		}
+	} else
+		dev = dev_get_by_index_rcu(net, ifindex);
+
+	if (dev == NULL) {
+		rcu_read_unlock();
+		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
+		return -ENODEV;
+	}
+
+	mc_lst->ifindex = dev->ifindex;
+	mc_lst->sfmode = MCAST_EXCLUDE;	/* starts as EXCLUDE with no source list */
+	rwlock_init(&mc_lst->sflock);
+	mc_lst->sflist = NULL;
+
+	/*
+	 *	now add/increase the group membership on the device
+	 */
+
+	err = ipv6_dev_mc_inc(dev, addr);
+
+	if (err) {	/* device-level join failed: release the unused entry */
+		rcu_read_unlock();
+		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
+		return err;
+	}
+
+	spin_lock(&ipv6_sk_mc_lock);	/* link the entry at the head of the socket's list */
+	mc_lst->next = np->ipv6_mc_list;
+	rcu_assign_pointer(np->ipv6_mc_list, mc_lst);
+	spin_unlock(&ipv6_sk_mc_lock);
+
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ *	socket leave on multicast group (0, or -EADDRNOTAVAIL if no entry matches)
+ */
+int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_mc_socklist *mc_lst;
+	struct ipv6_mc_socklist __rcu **lnk;	/* the link that currently points at mc_lst */
+	struct net *net = sock_net(sk);
+
+	spin_lock(&ipv6_sk_mc_lock);
+	for (lnk = &np->ipv6_mc_list;
+	     (mc_lst = rcu_dereference_protected(*lnk,
+			lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ;
+	      lnk = &mc_lst->next) {
+		if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+		    ipv6_addr_equal(&mc_lst->addr, addr)) {
+			struct net_device *dev;
+
+			*lnk = mc_lst->next;	/* unlink first, the rest runs without the list lock */
+			spin_unlock(&ipv6_sk_mc_lock);
+
+			rcu_read_lock();
+			dev = dev_get_by_index_rcu(net, mc_lst->ifindex);
+			if (dev != NULL) {
+				struct inet6_dev *idev = __in6_dev_get(dev);
+
+				(void) ip6_mc_leave_src(sk, mc_lst, idev);
+				if (idev)
+					__ipv6_dev_mc_dec(idev, &mc_lst->addr);
+			} else	/* no such device any more: leave_src is called without an idev */
+				(void) ip6_mc_leave_src(sk, mc_lst, NULL);
+			rcu_read_unlock();
+			atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);	/* uncharge the entry from the socket */
+			kfree_rcu(mc_lst, rcu);
+			return 0;
+		}
+	}
+	spin_unlock(&ipv6_sk_mc_lock);
+
+	return -EADDRNOTAVAIL;
+}
+
+/* called with rcu_read_lock(); a non-NULL result is returned with read_lock_bh(&idev->lock) held */
+static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
+					     const struct in6_addr *group,
+					     int ifindex)
+{
+	struct net_device *dev = NULL;
+	struct inet6_dev *idev = NULL;
+
+	if (ifindex == 0) {	/* no interface given: use the device of the route to group */
+		struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0);
+
+		if (rt) {
+			dev = rt->rt6i_dev;
+			dst_release(&rt->dst);
+		}
+	} else
+		dev = dev_get_by_index_rcu(net, ifindex);
+
+	if (!dev)
+		return NULL;
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return NULL;
+	read_lock_bh(&idev->lock);	/* stays held on success; the caller must unlock */
+	if (idev->dead) {
+		read_unlock_bh(&idev->lock);
+		return NULL;
+	}
+	return idev;
+}
+
+void ipv6_sock_mc_close(struct sock *sk)
+{	/* drop every multicast membership still on the socket's ipv6_mc_list */
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_mc_socklist *mc_lst;
+	struct net *net = sock_net(sk);
+
+	spin_lock(&ipv6_sk_mc_lock);
+	while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list,
+				lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) {
+		struct net_device *dev;
+
+		np->ipv6_mc_list = mc_lst->next;	/* pop the head entry */
+		spin_unlock(&ipv6_sk_mc_lock);	/* the per-entry teardown runs unlocked */
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(net, mc_lst->ifindex);
+		if (dev) {
+			struct inet6_dev *idev = __in6_dev_get(dev);
+
+			(void) ip6_mc_leave_src(sk, mc_lst, idev);
+			if (idev)
+				__ipv6_dev_mc_dec(idev, &mc_lst->addr);
+		} else	/* no such device any more: leave_src is called without an idev */
+			(void) ip6_mc_leave_src(sk, mc_lst, NULL);
+		rcu_read_unlock();
+
+		atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);	/* uncharge the entry from the socket */
+		kfree_rcu(mc_lst, rcu);
+
+		spin_lock(&ipv6_sk_mc_lock);	/* retake the lock before reading the next head */
+	}
+	spin_unlock(&ipv6_sk_mc_lock);
+}
+
+int ip6_mc_source(int add, int omode, struct sock *sk,
+	struct group_source_req *pgsr)
+{	/* add (add != 0) or remove one source in the socket's filter for a joined group */
+	struct in6_addr *source, *group;
+	struct ipv6_mc_socklist *pmc;
+	struct inet6_dev *idev;
+	struct ipv6_pinfo *inet6 = inet6_sk(sk);
+	struct ip6_sf_socklist *psl;
+	struct net *net = sock_net(sk);
+	int i, j, rv;
+	int leavegroup = 0;	/* set when removing the last INCLUDE source */
+	int pmclocked = 0;	/* nonzero once pmc->sflock is write-held */
+	int err;
+
+	source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr;
+	group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr;
+
+	if (!ipv6_addr_is_multicast(group))
+		return -EINVAL;
+
+	rcu_read_lock();
+	idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface);	/* returns with idev->lock read-held */
+	if (!idev) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	err = -EADDRNOTAVAIL;	/* default result for every "goto done" that sets nothing else */
+
+	for_each_pmc_rcu(inet6, pmc) {
+		if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
+			continue;
+		if (ipv6_addr_equal(&pmc->addr, group))
+			break;
+	}
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
+		goto done;
+	}
+	/* if a source filter was set, must be the same mode as before */
+	if (pmc->sflist) {
+		if (pmc->sfmode != omode) {
+			err = -EINVAL;
+			goto done;
+		}
+	} else if (pmc->sfmode != omode) {
+		/* allow mode switches for empty-set filters */
+		ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
+		ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+		pmc->sfmode = omode;
+	}
+
+	write_lock(&pmc->sflock);
+	pmclocked = 1;
+
+	psl = pmc->sflist;
+	if (!add) {
+		if (!psl)
+			goto done;	/* err = -EADDRNOTAVAIL */
+		rv = !0;
+		for (i=0; i<psl->sl_count; i++) {	/* linear search; i is the match index on break */
+			rv = memcmp(&psl->sl_addr[i], source,
+				sizeof(struct in6_addr));
+			if (rv == 0)
+				break;
+		}
+		if (rv)		/* source not found */
+			goto done;	/* err = -EADDRNOTAVAIL */
+
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
+			goto done;
+		}
+
+		/* update the interface filter */
+		ip6_mc_del_src(idev, group, omode, 1, source, 1);
+
+		for (j=i+1; j<psl->sl_count; j++)	/* close the gap left by entry i */
+			psl->sl_addr[j-1] = psl->sl_addr[j];
+		psl->sl_count--;
+		err = 0;
+		goto done;
+	}
+	/* else, add a new source to the filter */
+
+	if (psl && psl->sl_count >= sysctl_mld_max_msf) {
+		err = -ENOBUFS;
+		goto done;
+	}
+	if (!psl || psl->sl_count == psl->sl_max) {	/* no list yet, or list full: grow by IP6_SFBLOCK */
+		struct ip6_sf_socklist *newpsl;
+		int count = IP6_SFBLOCK;
+
+		if (psl)
+			count += psl->sl_max;
+		newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = count;
+		newpsl->sl_count = count - IP6_SFBLOCK;	/* equals the old sl_max, i.e. the old (full) count */
+		if (psl) {
+			for (i=0; i<psl->sl_count; i++)
+				newpsl->sl_addr[i] = psl->sl_addr[i];
+			sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+		}
+		pmc->sflist = psl = newpsl;
+	}
+	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */
+	for (i=0; i<psl->sl_count; i++) {
+		rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr));
+		if (rv == 0)
+			break;
+	}
+	if (rv == 0)		/* address already there is an error */
+		goto done;
+	for (j=psl->sl_count-1; j>=i; j--)	/* shift entries at or after i up by one slot */
+		psl->sl_addr[j+1] = psl->sl_addr[j];
+	psl->sl_addr[i] = *source;
+	psl->sl_count++;
+	err = 0;
+	/* update the interface list */
+	ip6_mc_add_src(idev, group, omode, 1, source, 1);
+done:
+	if (pmclocked)
+		write_unlock(&pmc->sflock);
+	read_unlock_bh(&idev->lock);	/* lock taken inside ip6_mc_find_dev_rcu() */
+	rcu_read_unlock();
+	if (leavegroup)	/* done only after all locks above are released */
+		return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
+	return err;
+}
+
+int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
+{	/* replace the socket's whole source filter for gsf->gf_group with the one in gsf */
+	const struct in6_addr *group;
+	struct ipv6_mc_socklist *pmc;
+	struct inet6_dev *idev;
+	struct ipv6_pinfo *inet6 = inet6_sk(sk);
+	struct ip6_sf_socklist *newpsl, *psl;
+	struct net *net = sock_net(sk);
+	int leavegroup = 0;
+	int i, err;
+
+	group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
+
+	if (!ipv6_addr_is_multicast(group))
+		return -EINVAL;
+	if (gsf->gf_fmode != MCAST_INCLUDE &&
+	    gsf->gf_fmode != MCAST_EXCLUDE)
+		return -EINVAL;
+
+	rcu_read_lock();
+	idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);	/* returns with idev->lock read-held */
+
+	if (!idev) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	err = 0;
+
+	if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {	/* (INCLUDE, empty) is treated as a leave */
+		leavegroup = 1;
+		goto done;
+	}
+
+	for_each_pmc_rcu(inet6, pmc) {
+		if (pmc->ifindex != gsf->gf_interface)
+			continue;
+		if (ipv6_addr_equal(&pmc->addr, group))
+			break;
+	}
+	if (!pmc) {		/* must have a prior join */
+		err = -EINVAL;
+		goto done;
+	}
+	if (gsf->gf_numsrc) {	/* build the new list and register it on the device first */
+		newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc),
+							  GFP_ATOMIC);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc;
+		for (i=0; i<newpsl->sl_count; ++i) {
+			struct sockaddr_in6 *psin6;
+
+			psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i];
+			newpsl->sl_addr[i] = psin6->sin6_addr;
+		}
+		err = ip6_mc_add_src(idev, group, gsf->gf_fmode,
+			newpsl->sl_count, newpsl->sl_addr, 0);
+		if (err) {
+			sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
+			goto done;
+		}
+	} else {
+		newpsl = NULL;
+		(void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+	}
+
+	write_lock(&pmc->sflock);
+	psl = pmc->sflist;
+	if (psl) {	/* remove the previous filter from the device, then free it */
+		(void) ip6_mc_del_src(idev, group, pmc->sfmode,
+			psl->sl_count, psl->sl_addr, 0);
+		sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+	} else
+		(void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+	pmc->sflist = newpsl;
+	pmc->sfmode = gsf->gf_fmode;
+	write_unlock(&pmc->sflock);
+	err = 0;
+done:
+	read_unlock_bh(&idev->lock);	/* lock taken inside ip6_mc_find_dev_rcu() */
+	rcu_read_unlock();
+	if (leavegroup)	/* done only after the locks above are released */
+		err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
+	return err;
+}
+
+int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
+	struct group_filter __user *optval, int __user *optlen)
+{	/* copy the socket's filter mode and source list for gsf->gf_group to user space */
+	int err, i, count, copycount;
+	const struct in6_addr *group;
+	struct ipv6_mc_socklist *pmc;
+	struct inet6_dev *idev;
+	struct ipv6_pinfo *inet6 = inet6_sk(sk);
+	struct ip6_sf_socklist *psl;
+	struct net *net = sock_net(sk);
+
+	group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
+
+	if (!ipv6_addr_is_multicast(group))
+		return -EINVAL;
+
+	rcu_read_lock();
+	idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);	/* returns with idev->lock read-held */
+
+	if (!idev) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
+	err = -EADDRNOTAVAIL;
+	/*
+	 * changes to the ipv6_mc_list require the socket lock and
+	 * the ipv6_sk_mc_lock spinlock. We have the socket lock,
+	 * so reading the list is safe.
+	 */
+
+	for_each_pmc_rcu(inet6, pmc) {
+		if (pmc->ifindex != gsf->gf_interface)
+			continue;
+		if (ipv6_addr_equal(group, &pmc->addr))
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	gsf->gf_fmode = pmc->sfmode;
+	psl = pmc->sflist;
+	count = psl ? psl->sl_count : 0;
+	read_unlock_bh(&idev->lock);	/* locks are dropped before the user-space copies below */
+	rcu_read_unlock();
+
+	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;	/* min(stored, requested) */
+	gsf->gf_numsrc = count;	/* report the full number of stored sources */
+	if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+	    copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+		return -EFAULT;
+	}
+	/* changes to psl require the socket lock, the
+	 * ipv6_sk_mc_lock spinlock and a write lock on pmc->sflock. We
+	 * have the socket lock, so reading here is safe.
+	 */
+	for (i=0; i<copycount; i++) {
+		struct sockaddr_in6 *psin6;
+		struct sockaddr_storage ss;
+
+		psin6 = (struct sockaddr_in6 *)&ss;
+		memset(&ss, 0, sizeof(ss));	/* no uninitialised stack bytes reach user space */
+		psin6->sin6_family = AF_INET6;
+		psin6->sin6_addr = psl->sl_addr[i];
+		if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+			return -EFAULT;
+	}
+	return 0;
+done:
+	read_unlock_bh(&idev->lock);
+	rcu_read_unlock();
+	return err;
+}
+
+int inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+		   const struct in6_addr *src_addr)
+{	/* returns 1 when the socket's filter for mc_addr lets src_addr through, else 0 */
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_mc_socklist *mc;
+	struct ip6_sf_socklist *psl;
+	int rv = 1;
+
+	rcu_read_lock();
+	for_each_pmc_rcu(np, mc) {
+		if (ipv6_addr_equal(&mc->addr, mc_addr))
+			break;
+	}
+	if (!mc) {	/* socket has no entry for this group: answer 1 */
+		rcu_read_unlock();
+		return 1;
+	}
+	read_lock(&mc->sflock);
+	psl = mc->sflist;
+	if (!psl) {	/* no source list: only the filter mode decides */
+		rv = mc->sfmode == MCAST_EXCLUDE;
+	} else {
+		int i;
+
+		for (i=0; i<psl->sl_count; i++) {	/* i == sl_count afterwards means "not listed" */
+			if (ipv6_addr_equal(&psl->sl_addr[i], src_addr))
+				break;
+		}
+		if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+			rv = 0;	/* INCLUDE mode and source not listed */
+		if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+			rv = 0;	/* EXCLUDE mode and source listed */
+	}
+	read_unlock(&mc->sflock);
+	rcu_read_unlock();
+
+	return rv;
+}
+
+static void ma_put(struct ifmcaddr6 *mc)
+{
+	if (!atomic_dec_and_test(&mc->mca_refcnt))
+		return;	/* mca_refcnt has not reached zero yet */
+	in6_dev_put(mc->idev);	/* count hit zero: drop the idev reference, then free */
+	kfree(mc);
+}
+
+static void igmp6_group_added(struct ifmcaddr6 *mc)
+{	/* program the device filter for a new group, then start the v1 or v2 announcement */
+	struct net_device *dev = mc->idev->dev;
+	char buf[MAX_ADDR_LEN];	/* filled by ndisc_mc_map(), passed to dev_mc_add() */
+
+	spin_lock_bh(&mc->mca_lock);
+	if (!(mc->mca_flags&MAF_LOADED)) {	/* MAF_LOADED keeps the device add from repeating */
+		mc->mca_flags |= MAF_LOADED;
+		if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
+			dev_mc_add(dev, buf);
+	}
+	spin_unlock_bh(&mc->mca_lock);
+
+	if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
+		return;	/* device down or MAF_NOREPORT group: skip the announcement step */
+
+	if (MLD_V1_SEEN(mc->idev)) {
+		igmp6_join_group(mc);
+		return;
+	}
+	/* else v2 */
+
+	mc->mca_crcount = mc->idev->mc_qrv;	/* change-report counter starts at the device's mc_qrv */
+	mld_ifc_event(mc->idev);
+}
+
+static void igmp6_group_dropped(struct ifmcaddr6 *mc)
+{	/* undo igmp6_group_added(): remove the device filter entry and leave the group */
+	struct net_device *dev = mc->idev->dev;
+	char buf[MAX_ADDR_LEN];	/* filled by ndisc_mc_map(), passed to dev_mc_del() */
+
+	spin_lock_bh(&mc->mca_lock);
+	if (mc->mca_flags&MAF_LOADED) {
+		mc->mca_flags &= ~MAF_LOADED;
+		if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
+			dev_mc_del(dev, buf);
+	}
+
+	if (mc->mca_flags & MAF_NOREPORT)
+		goto done;	/* jumps with mca_lock still held; "done" unlocks it */
+	spin_unlock_bh(&mc->mca_lock);	/* igmp6_leave_group() is called without mca_lock */
+
+	if (!mc->idev->dead)
+		igmp6_leave_group(mc);
+
+	spin_lock_bh(&mc->mca_lock);
+	if (del_timer(&mc->mca_timer))
+		atomic_dec(&mc->mca_refcnt);	/* drop the count paired with the cancelled timer */
+done:
+	ip6_mc_clear_src(mc);
+	spin_unlock_bh(&mc->mca_lock);
+}
+
+/*
+ * deleted ifmcaddr6 manipulation
+ */
+static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
+{	/* queue a copy of im on idev->mc_tomb; silently does nothing if allocation fails */
+	struct ifmcaddr6 *pmc;
+
+	/* this is an "ifmcaddr6" for convenience; only the fields below
+	 * are actually used. In particular, the refcnt and users are not
+	 * used for management of the delete list. Using the same structure
+	 * for deleted items allows change reports to use common code with
+	 * non-deleted or query-response MCA's.
+	 */
+	pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC);
+	if (!pmc)
+		return;
+
+	spin_lock_bh(&im->mca_lock);
+	spin_lock_init(&pmc->mca_lock);
+	pmc->idev = im->idev;
+	in6_dev_hold(idev);	/* paired with in6_dev_put(pmc->idev) when the record is freed */
+	pmc->mca_addr = im->mca_addr;
+	pmc->mca_crcount = idev->mc_qrv;
+	pmc->mca_sfmode = im->mca_sfmode;
+	if (pmc->mca_sfmode == MCAST_INCLUDE) {	/* INCLUDE mode: the record takes over im's source lists */
+		struct ip6_sf_list *psf;
+
+		pmc->mca_tomb = im->mca_tomb;
+		pmc->mca_sources = im->mca_sources;
+		im->mca_tomb = im->mca_sources = NULL;	/* im no longer owns them */
+		for (psf=pmc->mca_sources; psf; psf=psf->sf_next)
+			psf->sf_crcount = pmc->mca_crcount;
+	}
+	spin_unlock_bh(&im->mca_lock);
+
+	spin_lock_bh(&idev->mc_lock);
+	pmc->next = idev->mc_tomb;	/* push onto the head of the tomb list */
+	idev->mc_tomb = pmc;
+	spin_unlock_bh(&idev->mc_lock);
+}
+
+static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca)
+{
+	struct ifmcaddr6 *pmc, *pmc_prev;
+
+	/* unlink the first tomb record for group pmca, if any, under mc_lock */
+	spin_lock_bh(&idev->mc_lock);
+	pmc_prev = NULL;
+	for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) {
+		if (ipv6_addr_equal(&pmc->mca_addr, pmca))
+			break;
+		pmc_prev = pmc;
+	}
+	if (pmc) {
+		if (pmc_prev)
+			pmc_prev->next = pmc->next;
+		else
+			idev->mc_tomb = pmc->next;
+	}
+	spin_unlock_bh(&idev->mc_lock);
+
+	if (pmc) {
+		/* mld_add_delrec() moves both mca_tomb and mca_sources onto
+		 * the record; release them as mld_clear_delrec() does
+		 */
+		ip6_mc_clear_src(pmc);
+		in6_dev_put(pmc->idev);	/* pairs with in6_dev_hold() in mld_add_delrec() */
+		kfree(pmc);
+	}
+}
+
+static void mld_clear_delrec(struct inet6_dev *idev)
+{	/* free every tomb record of idev and the mca_tomb lists of its live groups */
+	struct ifmcaddr6 *pmc, *nextpmc;
+
+	spin_lock_bh(&idev->mc_lock);
+	pmc = idev->mc_tomb;	/* detach the whole list, then free it outside mc_lock */
+	idev->mc_tomb = NULL;
+	spin_unlock_bh(&idev->mc_lock);
+
+	for (; pmc; pmc = nextpmc) {
+		nextpmc = pmc->next;
+		ip6_mc_clear_src(pmc);
+		in6_dev_put(pmc->idev);	/* pairs with in6_dev_hold() in mld_add_delrec() */
+		kfree(pmc);
+	}
+
+	/* clear dead sources, too */
+	read_lock_bh(&idev->lock);
+	for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+		struct ip6_sf_list *psf, *psf_next;
+
+		spin_lock_bh(&pmc->mca_lock);
+		psf = pmc->mca_tomb;	/* detach under mca_lock, free after unlocking */
+		pmc->mca_tomb = NULL;
+		spin_unlock_bh(&pmc->mca_lock);
+		for (; psf; psf=psf_next) {
+			psf_next = psf->sf_next;
+			kfree(psf);
+		}
+	}
+	read_unlock_bh(&idev->lock);
+}
+
+
+/*
+ *	device multicast group inc (add if not found); returns 0 or a negative errno
+ */
+int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+	struct ifmcaddr6 *mc;
+	struct inet6_dev *idev;
+
+	/* we need to take a reference on idev */
+	idev = in6_dev_get(dev);
+
+	if (idev == NULL)
+		return -EINVAL;
+
+	write_lock_bh(&idev->lock);
+	if (idev->dead) {
+		write_unlock_bh(&idev->lock);
+		in6_dev_put(idev);
+		return -ENODEV;
+	}
+
+	for (mc = idev->mc_list; mc; mc = mc->next) {
+		if (ipv6_addr_equal(&mc->mca_addr, addr)) {	/* already joined: just add a user */
+			mc->mca_users++;
+			write_unlock_bh(&idev->lock);
+			ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0,
+				NULL, 0);
+			in6_dev_put(idev);	/* the existing entry already has its own idev reference */
+			return 0;
+		}
+	}
+
+	/*
+	 *	not found: create a new one.
+	 */
+
+	mc = kzalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC);
+
+	if (mc == NULL) {
+		write_unlock_bh(&idev->lock);
+		in6_dev_put(idev);
+		return -ENOMEM;
+	}
+
+	setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc);
+
+	ipv6_addr_copy(&mc->mca_addr, addr);
+	mc->idev = idev; /* (reference taken) */
+	mc->mca_users = 1;
+	/* mca_cstamp and mca_tstamp start at the creation time */
+	mc->mca_cstamp = mc->mca_tstamp = jiffies;
+	atomic_set(&mc->mca_refcnt, 2);	/* the ma_put() at the end of this function drops one */
+	spin_lock_init(&mc->mca_lock);
+
+	/* initial mode is (EX, empty) */
+	mc->mca_sfmode = MCAST_EXCLUDE;
+	mc->mca_sfcount[MCAST_EXCLUDE] = 1;
+
+	if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
+	    IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
+		mc->mca_flags |= MAF_NOREPORT;	/* all-nodes and below-link-local scope groups */
+
+	mc->next = idev->mc_list;	/* insert at the head while idev->lock is write-held */
+	idev->mc_list = mc;
+	write_unlock_bh(&idev->lock);
+
+	mld_del_delrec(idev, &mc->mca_addr);	/* discard a pending tomb record for this group */
+	igmp6_group_added(mc);
+	ma_put(mc);
+	return 0;
+}
+
+/*
+ *	device multicast group del; returns 0, or -ENOENT if addr is not on idev->mc_list
+ */
+int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
+{
+	struct ifmcaddr6 *ma, **map;	/* map is the link that points at ma */
+
+	write_lock_bh(&idev->lock);
+	for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) {
+		if (ipv6_addr_equal(&ma->mca_addr, addr)) {
+			if (--ma->mca_users == 0) {	/* last user: unlink and tear the group down */
+				*map = ma->next;
+				write_unlock_bh(&idev->lock);
+
+				igmp6_group_dropped(ma);
+
+				ma_put(ma);
+				return 0;
+			}
+			write_unlock_bh(&idev->lock);	/* other users remain: keep the entry */
+			return 0;
+		}
+	}
+	write_unlock_bh(&idev->lock);
+
+	return -ENOENT;
+}
+
+int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
+{
+	struct inet6_dev *in6dev;
+	int ret = -ENODEV;	/* result when dev has no inet6_dev */
+
+	/* Drop one user of multicast address addr on dev; the inet6_dev
+	 * lookup and the decrement both run under rcu_read_lock().
+	 */
+	rcu_read_lock();
+	in6dev = __in6_dev_get(dev);
+	if (in6dev)
+		ret = __ipv6_dev_mc_dec(in6dev, addr);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * identify MLD packets for MLD filter exceptions (1 = MLD message, 0 = anything else)
+ */
+int ipv6_is_mld(struct sk_buff *skb, int nexthdr)
+{
+	int is_mld = 0;
+
+	/* MLD messages are ICMPv6; any other next header is not MLD */
+	if (nexthdr != IPPROTO_ICMPV6)
+		return 0;
+
+	if (!pskb_may_pull(skb, sizeof(struct icmp6hdr)))
+		return 0;
+
+	switch (icmp6_hdr(skb)->icmp6_type) {
+	case ICMPV6_MGM_QUERY:
+	case ICMPV6_MGM_REPORT:
+	case ICMPV6_MGM_REDUCTION:
+	case ICMPV6_MLD2_REPORT:
+		is_mld = 1;
+		break;
+	default:
+		break;
+	}
+	return is_mld;
+}
+
+/*
+ *	check if the interface/address pair is valid (nonzero = accept, 0 = not joined or filtered)
+ */
+int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
+			const struct in6_addr *src_addr)
+{
+	struct inet6_dev *idev;
+	struct ifmcaddr6 *mc;
+	int rv = 0;	/* stays 0 when dev has no inet6_dev or has not joined group */
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (idev) {
+		read_lock_bh(&idev->lock);
+		for (mc = idev->mc_list; mc; mc=mc->next) {
+			if (ipv6_addr_equal(&mc->mca_addr, group))
+				break;
+		}
+		if (mc) {
+			if (src_addr && !ipv6_addr_any(src_addr)) {	/* a concrete source: consult the filter */
+				struct ip6_sf_list *psf;
+
+				spin_lock_bh(&mc->mca_lock);
+				for (psf=mc->mca_sources;psf;psf=psf->sf_next) {
+					if (ipv6_addr_equal(&psf->sf_addr, src_addr))
+						break;
+				}
+				if (psf)	/* listed: accept if included, or not excluded by every EXCLUDE user */
+					rv = psf->sf_count[MCAST_INCLUDE] ||
+						psf->sf_count[MCAST_EXCLUDE] !=
+						mc->mca_sfcount[MCAST_EXCLUDE];
+				else	/* not listed: accept only if some user is in EXCLUDE mode */
+					rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0;
+				spin_unlock_bh(&mc->mca_lock);
+			} else
+				rv = 1; /* don't filter unspecified source */
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	rcu_read_unlock();
+	return rv;
+}
+
+static void mld_gq_start_timer(struct inet6_dev *idev)
+{
+	int jitter = net_random() % idev->mc_maxdelay;	/* random part of the delay */
+
+	idev->mc_gq_running = 1;
+	if (mod_timer(&idev->mc_gq_timer, jiffies + jitter + 2) == 0)
+		in6_dev_hold(idev);	/* reference taken when mod_timer() reports 0 */
+}
+
+static void mld_ifc_start_timer(struct inet6_dev *idev, int delay)
+{
+	int jitter = net_random() % delay;	/* random part of the delay */
+
+	if (mod_timer(&idev->mc_ifc_timer, jiffies + jitter + 2) == 0)
+		in6_dev_hold(idev);	/* reference taken when mod_timer() reports 0 */
+}
+
+/*
+ *	IGMP handling (alias multicast ICMPv6 messages)
+ */
+
+static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
+{	/* (re)arm ma->mca_timer with a delay bounded by resptime (in jiffies) */
+	unsigned long delay = resptime;
+
+	/* Do not start timer for these addresses */
+	if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) ||
+	    IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
+		return;
+
+	if (del_timer(&ma->mca_timer)) {	/* a timer was cancelled: start from its remaining time */
+		atomic_dec(&ma->mca_refcnt);
+		delay = ma->mca_timer.expires - jiffies;
+	}
+
+	if (delay >= resptime) {	/* no timer, or the old one would fire too late: pick a new delay */
+		if (resptime)
+			delay = net_random() % resptime;
+		else
+			delay = 1;	/* resptime 0 would otherwise divide by zero above */
+	}
+	ma->mca_timer.expires = jiffies + delay;
+	if (!mod_timer(&ma->mca_timer, jiffies + delay))
+		atomic_inc(&ma->mca_refcnt);	/* count taken when mod_timer() reports 0 */
+	ma->mca_flags |= MAF_TIMER_RUNNING;
+}
+
+/* EXCLUDE-mode half of mld_marksources(): returns 0 if every queried source is excluded, else 1 */
+static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
+	const struct in6_addr *srcs)
+{
+	struct ip6_sf_list *psf;
+	int i, scount;	/* scount: queried sources matched by an active exclude entry */
+
+	scount = 0;
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+		if (scount == nsrcs)	/* every queried source already matched */
+			break;
+		for (i=0; i<nsrcs; i++) {
+			/* skip inactive filters */
+			if (psf->sf_count[MCAST_INCLUDE] ||
+			    pmc->mca_sfcount[MCAST_EXCLUDE] !=
+			    psf->sf_count[MCAST_EXCLUDE])
+				continue;
+			if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
+				scount++;
+				break;
+			}
+		}
+	}
+	pmc->mca_flags &= ~MAF_GSQUERY;	/* the flag is always cleared on this path */
+	if (scount == nsrcs)	/* all sources excluded */
+		return 0;
+	return 1;
+}
+
+static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
+	const struct in6_addr *srcs)
+{	/* flag the group's sources named in srcs[0..nsrcs); returns 1 if a response is needed */
+	struct ip6_sf_list *psf;
+	int i, scount;	/* scount: sources of this group found in srcs */
+
+	if (pmc->mca_sfmode == MCAST_EXCLUDE)
+		return mld_xmarksources(pmc, nsrcs, srcs);
+
+	/* mark INCLUDE-mode sources */
+
+	scount = 0;
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+		if (scount == nsrcs)	/* every queried source already matched */
+			break;
+		for (i=0; i<nsrcs; i++) {
+			if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
+				psf->sf_gsresp = 1;	/* read later by is_in() when MAF_GSQUERY is set */
+				scount++;
+				break;
+			}
+		}
+	}
+	if (!scount) {	/* none of the queried sources is in our list */
+		pmc->mca_flags &= ~MAF_GSQUERY;
+		return 0;
+	}
+	pmc->mca_flags |= MAF_GSQUERY;
+	return 1;
+}
+
+/* called with rcu_read_lock(); processes a received MLD query, returns 0 or -EINVAL */
+int igmp6_event_query(struct sk_buff *skb)
+{
+	struct mld2_query *mlh2 = NULL;	/* stays NULL on the 24-byte (v1) path */
+	struct ifmcaddr6 *ma;
+	const struct in6_addr *group;
+	unsigned long max_delay;	/* response delay bound, in jiffies */
+	struct inet6_dev *idev;
+	struct mld_msg *mld;
+	int group_type;
+	int mark = 0;	/* set when a v2 query carries a source list */
+	int len;
+
+	if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
+		return -EINVAL;
+
+	/* compute payload length excluding extension headers */
+	len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
+	len -= skb_network_header_len(skb);
+
+	/* Drop queries with not link local source */
+	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
+		return -EINVAL;
+
+	idev = __in6_dev_get(skb->dev);
+
+	if (idev == NULL)
+		return 0;
+
+	mld = (struct mld_msg *)icmp6_hdr(skb);
+	group = &mld->mld_mca;
+	group_type = ipv6_addr_type(group);
+
+	if (group_type != IPV6_ADDR_ANY &&
+	    !(group_type&IPV6_ADDR_MULTICAST))
+		return -EINVAL;	/* queried address is neither unspecified nor multicast */
+
+	if (len == 24) {
+		int switchback;
+		/* MLDv1 router present */
+
+		/* Translate milliseconds to jiffies */
+		max_delay = (ntohs(mld->mld_maxdelay)*HZ)/1000;
+
+		switchback = (idev->mc_qrv + 1) * max_delay;
+		idev->mc_v1_seen = jiffies + switchback;	/* MLD_V1_SEEN() is true until this time */
+
+		/* cancel the interface change timer */
+		idev->mc_ifc_count = 0;
+		if (del_timer(&idev->mc_ifc_timer))
+			__in6_dev_put(idev);
+		/* clear deleted report items */
+		mld_clear_delrec(idev);
+	} else if (len >= 28) {
+		int srcs_offset = sizeof(struct mld2_query) -
+				  sizeof(struct icmp6hdr);
+		if (!pskb_may_pull(skb, srcs_offset))
+			return -EINVAL;
+
+		mlh2 = (struct mld2_query *)skb_transport_header(skb);
+		max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000;
+		if (!max_delay)
+			max_delay = 1;	/* keeps mc_maxdelay nonzero; it is used as a modulus */
+		idev->mc_maxdelay = max_delay;
+		if (mlh2->mld2q_qrv)
+			idev->mc_qrv = mlh2->mld2q_qrv;
+		if (group_type == IPV6_ADDR_ANY) { /* general query */
+			if (mlh2->mld2q_nsrcs)
+				return -EINVAL; /* no sources allowed */
+
+			mld_gq_start_timer(idev);
+			return 0;
+		}
+		/* mark sources to include, if group & source-specific */
+		if (mlh2->mld2q_nsrcs != 0) {
+			if (!pskb_may_pull(skb, srcs_offset +
+			    ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
+				return -EINVAL;
+
+			mlh2 = (struct mld2_query *)skb_transport_header(skb);	/* re-read after the pull */
+			mark = 1;
+		}
+	} else
+		return -EINVAL;	/* any other length is rejected */
+
+	read_lock_bh(&idev->lock);
+	if (group_type == IPV6_ADDR_ANY) {	/* general query (v1 path only): schedule every group */
+		for (ma = idev->mc_list; ma; ma=ma->next) {
+			spin_lock_bh(&ma->mca_lock);
+			igmp6_group_queried(ma, max_delay);
+			spin_unlock_bh(&ma->mca_lock);
+		}
+	} else {
+		for (ma = idev->mc_list; ma; ma=ma->next) {
+			if (!ipv6_addr_equal(group, &ma->mca_addr))
+				continue;
+			spin_lock_bh(&ma->mca_lock);
+			if (ma->mca_flags & MAF_TIMER_RUNNING) {
+				/* gsquery <- gsquery && mark */
+				if (!mark)
+					ma->mca_flags &= ~MAF_GSQUERY;
+			} else {
+				/* gsquery <- mark */
+				if (mark)
+					ma->mca_flags |= MAF_GSQUERY;
+				else
+					ma->mca_flags &= ~MAF_GSQUERY;
+			}
+			if (!(ma->mca_flags & MAF_GSQUERY) ||	/* mlh2 is only read when GSQUERY is set, i.e. on the v2 path */
+			    mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs))
+				igmp6_group_queried(ma, max_delay);
+			spin_unlock_bh(&ma->mca_lock);
+			break;	/* only the first matching entry is handled */
+		}
+	}
+	read_unlock_bh(&idev->lock);
+
+	return 0;
+}
+
+/* called with rcu_read_lock(); a received report stops the mca_timer of the matching local group */
+int igmp6_event_report(struct sk_buff *skb)
+{
+	struct ifmcaddr6 *ma;
+	struct inet6_dev *idev;
+	struct mld_msg *mld;
+	int addr_type;
+
+	/* Our own report looped back. Ignore it. */
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		return 0;
+
+	/* send our report if the MC router may not have heard this report */
+	if (skb->pkt_type != PACKET_MULTICAST &&
+	    skb->pkt_type != PACKET_BROADCAST)
+		return 0;
+
+	if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr)))
+		return -EINVAL;
+
+	mld = (struct mld_msg *)icmp6_hdr(skb);
+
+	/* Drop reports with not link local source */
+	addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
+	if (addr_type != IPV6_ADDR_ANY &&	/* the unspecified source is accepted as well */
+	    !(addr_type&IPV6_ADDR_LINKLOCAL))
+		return -EINVAL;
+
+	idev = __in6_dev_get(skb->dev);
+	if (idev == NULL)
+		return -ENODEV;
+
+	/*
+	 *	Cancel the timer for this group
+	 */
+
+	read_lock_bh(&idev->lock);
+	for (ma = idev->mc_list; ma; ma=ma->next) {
+		if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) {
+			spin_lock(&ma->mca_lock);	/* plain spin_lock: read_lock_bh() above already disabled BHs */
+			if (del_timer(&ma->mca_timer))
+				atomic_dec(&ma->mca_refcnt);	/* drop the count paired with the cancelled timer */
+			ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING);
+			spin_unlock(&ma->mca_lock);
+			break;
+		}
+	}
+	read_unlock_bh(&idev->lock);
+	return 0;
+}
+
+static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
+	int gdeleted, int sdeleted)
+{	/* nonzero when source psf of group pmc belongs in a record of the given MLD2_* type */
+	switch (type) {
+	case MLD2_MODE_IS_INCLUDE:
+	case MLD2_MODE_IS_EXCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) {	/* under GSQUERY only sources with sf_gsresp pass */
+			if (pmc->mca_sfmode == MCAST_INCLUDE)
+				return 1;
+			/* don't include if this source is excluded
+			 * in all filters
+			 */
+			if (psf->sf_count[MCAST_INCLUDE])
+				return type == MLD2_MODE_IS_INCLUDE;
+			return pmc->mca_sfcount[MCAST_EXCLUDE] ==
+				psf->sf_count[MCAST_EXCLUDE];
+		}
+		return 0;
+	case MLD2_CHANGE_TO_INCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		return psf->sf_count[MCAST_INCLUDE] != 0;
+	case MLD2_CHANGE_TO_EXCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 ||
+		    psf->sf_count[MCAST_INCLUDE])
+			return 0;
+		return pmc->mca_sfcount[MCAST_EXCLUDE] ==	/* excluded by every EXCLUDE-mode user */
+			psf->sf_count[MCAST_EXCLUDE];
+	case MLD2_ALLOW_NEW_SOURCES:
+		if (gdeleted || !psf->sf_crcount)
+			return 0;
+		return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted;
+	case MLD2_BLOCK_OLD_SOURCES:
+		if (pmc->mca_sfmode == MCAST_INCLUDE)
+			return gdeleted || (psf->sf_crcount && sdeleted);
+		return psf->sf_crcount && !gdeleted && !sdeleted;
+	}
+	return 0;	/* any other record type selects nothing */
+}
+
+/*
+ * Count the sources on @pmc's active source list that is_in() selects for
+ * a record of @type with the given deleted-group/deleted-source flags.
+ */
+static int
+mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
+{
+	struct ip6_sf_list *src = pmc->mca_sources;
+	int n = 0;
+
+	while (src) {
+		if (is_in(pmc, src, type, gdeleted, sdeleted))
+			n++;
+		src = src->sf_next;
+	}
+	return n;
+}
+
+/*
+ * Allocate and pre-fill an skb for an MLDv2 report on @dev.
+ *
+ * @size is the wanted packet size; link-layer space is added and the
+ * total is capped at SKB_MAX_ORDER(0, 0).  The returned skb already holds
+ * the IPv6 header (destination mld2_all_mcr), the 8-byte hop-by-hop
+ * router-alert block and a zeroed report header with no group records.
+ *
+ * Returns the skb, or NULL if the allocation failed.
+ */
+static struct sk_buff *mld_newpack(struct net_device *dev, int size)
+{
+	struct net *net = dev_net(dev);
+	struct sock *sk = net->ipv6.igmp_sk;
+	struct sk_buff *skb;
+	struct mld2_report *pmr;
+	struct in6_addr addr_buf;
+	const struct in6_addr *saddr;
+	int err;
+	/* hop-by-hop header: next header ICMPv6, router alert, PadN */
+	u8 ra[8] = { IPPROTO_ICMPV6, 0,
+		     IPV6_TLV_ROUTERALERT, 2, 0, 0,
+		     IPV6_TLV_PADN, 0 };
+
+	/* we assume size > sizeof(ra) here */
+	size += LL_ALLOCATED_SPACE(dev);
+	/* limit our allocations to order-0 page */
+	size = min_t(int, size, SKB_MAX_ORDER(0, 0));
+	skb = sock_alloc_send_skb(sk, size, 1, &err);
+
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
+		/* <draft-ietf-magma-mld-source-05.txt>:
+		 * use unspecified address as the source address
+		 * when a valid link-local address is not available.
+		 */
+		saddr = &in6addr_any;
+	} else
+		saddr = &addr_buf;
+
+	/* payload length is passed as 0 here; mld_sendpack() fills it in */
+	ip6_nd_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0);
+
+	memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
+
+	/* the report header starts right after the router-alert block */
+	skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data);
+	skb_put(skb, sizeof(*pmr));
+	pmr = (struct mld2_report *)skb_transport_header(skb);
+	pmr->mld2r_type = ICMPV6_MLD2_REPORT;
+	pmr->mld2r_resv1 = 0;
+	pmr->mld2r_cksum = 0;
+	pmr->mld2r_resv2 = 0;
+	pmr->mld2r_ngrec = 0;
+	return skb;
+}
+
+/*
+ * Finalise and transmit an MLDv2 report built by mld_newpack()/add_grec().
+ *
+ * Fills in the IPv6 payload length and the ICMPv6 checksum, attaches a
+ * destination entry and hands the skb to the LOCAL_OUT netfilter hook.
+ * On every failure path the skb is freed here; output statistics are
+ * updated for both success and failure.
+ */
+static void mld_sendpack(struct sk_buff *skb)
+{
+	struct ipv6hdr *pip6 = ipv6_hdr(skb);
+	struct mld2_report *pmr =
+			      (struct mld2_report *)skb_transport_header(skb);
+	int payload_len, mldlen;
+	struct inet6_dev *idev;
+	struct net *net = dev_net(skb->dev);
+	int err;
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(skb->dev);
+	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+	/* everything after the IPv6 header, and the MLD message alone */
+	payload_len = (skb->tail - skb->network_header) - sizeof(*pip6);
+	mldlen = skb->tail - skb->transport_header;
+	pip6->payload_len = htons(payload_len);
+
+	pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
+					   IPPROTO_ICMPV6,
+					   csum_partial(skb_transport_header(skb),
+							mldlen, 0));
+
+	dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr);
+
+	if (!dst) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT,
+			 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+			 skb->dev->ifindex);
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	err = 0;
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		dst = NULL;
+	}
+	/* set (possibly NULL) dst before bailing out so the skb is consistent */
+	skb_dst_set(skb, dst);
+	if (err)
+		goto err_out;
+
+	/* remember the length now: the skb is handed off by NF_HOOK below */
+	payload_len = skb->len;
+
+	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev,
+		      dst_output);
+out:
+	if (!err) {
+		ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT);
+		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
+		IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_OUTMCAST, payload_len);
+	} else
+		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS);
+
+	rcu_read_unlock();
+	return;
+
+err_out:
+	/* failed before the hook took the skb: free it, then count the discard */
+	kfree_skb(skb);
+	goto out;
+}
+
+/*
+ * Bytes needed for a complete group record of @type for @pmc: the record
+ * header plus 16 bytes for every source mld_scount() selects.
+ */
+static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
+{
+	int nsrcs = mld_scount(pmc, type, gdel, sdel);
+
+	return sizeof(struct mld2_grec) + 16 * nsrcs;
+}
+
+/*
+ * Append an empty group record header of @type for @pmc to @skb,
+ * allocating a fresh report packet first when @skb is NULL.
+ * The new record is returned through @ppgr and the report's record
+ * counter is bumped.  Returns the skb in use, or NULL if a packet had to
+ * be allocated and the allocation failed (@ppgr is then left untouched).
+ */
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
+	int type, struct mld2_grec **ppgr)
+{
+	struct net_device *dev = pmc->idev->dev;
+	struct mld2_report *report;
+	struct mld2_grec *rec;
+
+	if (!skb) {
+		skb = mld_newpack(dev, dev->mtu);
+		if (!skb)
+			return NULL;
+	}
+
+	rec = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec));
+	rec->grec_type = type;
+	rec->grec_auxwords = 0;
+	rec->grec_nsrcs = 0;
+	rec->grec_mca = pmc->mca_addr;	/* structure copy */
+
+	/* the record count is kept in network byte order */
+	report = (struct mld2_report *)skb_transport_header(skb);
+	report->mld2r_ngrec = htons(ntohs(report->mld2r_ngrec)+1);
+
+	*ppgr = rec;
+	return skb;
+}
+
+/*
+ * Room left in @skb for further report data (0 for a NULL skb).
+ *
+ * mld_newpack() caps its allocation at an order-0 page, so on devices
+ * whose MTU exceeds that the buffer is smaller than dev->mtu.  The MTU
+ * budget is therefore additionally bounded by the real tailroom, otherwise
+ * callers would skb_put() past the end of the buffer.
+ */
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? \
+	min_t(int, (skb)->dev->mtu - (skb)->len, skb_tailroom(skb)) : \
+	skb_tailroom(skb)) : 0)
+
+/*
+ * Append a group record of @type for @pmc to the report being built in
+ * @skb (which may be NULL), sending full packets and starting new ones as
+ * required.
+ *
+ * @gdeleted / @sdeleted select the deleted-group / deleted-source case;
+ * with @sdeleted the sources are taken from pmc->mca_tomb instead of
+ * pmc->mca_sources.
+ *
+ * Returns the skb currently being filled (possibly unchanged, possibly a
+ * new one), or NULL when a needed packet could not be allocated.
+ */
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
+	int type, int gdeleted, int sdeleted)
+{
+	struct net_device *dev = pmc->idev->dev;
+	struct mld2_report *pmr;
+	struct mld2_grec *pgr = NULL;
+	struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+	int scount, stotal, first, isquery, truncate;
+
+	if (pmc->mca_flags & MAF_NOREPORT)
+		return skb;
+
+	/* current-state record types */
+	isquery = type == MLD2_MODE_IS_INCLUDE ||
+		  type == MLD2_MODE_IS_EXCLUDE;
+	/* record types whose source list is cut short rather than split */
+	truncate = type == MLD2_MODE_IS_EXCLUDE ||
+		    type == MLD2_CHANGE_TO_EXCLUDE;
+
+	/* scount: sources in the current record; stotal: sources overall */
+	stotal = scount = 0;
+
+	psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;
+
+	if (!*psf_list)
+		goto empty_source;
+
+	pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL;
+
+	/* EX and TO_EX get a fresh packet, if needed */
+	if (truncate) {
+		if (pmr && pmr->mld2r_ngrec &&
+		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+			if (skb)
+				mld_sendpack(skb);
+			skb = mld_newpack(dev, dev->mtu);
+		}
+	}
+	first = 1;
+	psf_prev = NULL;
+	for (psf=*psf_list; psf; psf=psf_next) {
+		struct in6_addr *psrc;
+
+		/* fetched up front: psf may be freed further down */
+		psf_next = psf->sf_next;
+
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+			psf_prev = psf;
+			continue;
+		}
+
+		/* clear marks on query responses */
+		if (isquery)
+			psf->sf_gsresp = 0;
+
+		/* need room for one address, plus a header for a new record */
+		if (AVAILABLE(skb) < sizeof(*psrc) +
+		    first*sizeof(struct mld2_grec)) {
+			if (truncate && !first)
+				break;	 /* truncate these */
+			/* close the record in the full packet and start anew */
+			if (pgr)
+				pgr->grec_nsrcs = htons(scount);
+			if (skb)
+				mld_sendpack(skb);
+			skb = mld_newpack(dev, dev->mtu);
+			first = 1;
+			scount = 0;
+		}
+		if (first) {
+			skb = add_grhead(skb, pmc, type, &pgr);
+			first = 0;
+		}
+		if (!skb)
+			return NULL;
+		psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc));
+		*psrc = psf->sf_addr;
+		scount++; stotal++;
+		if ((type == MLD2_ALLOW_NEW_SOURCES ||
+		     type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+			psf->sf_crcount--;
+			/* last retransmission of a deleted entry: unlink, free */
+			if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+				if (psf_prev)
+					psf_prev->sf_next = psf->sf_next;
+				else
+					*psf_list = psf->sf_next;
+				kfree(psf);
+				continue;
+			}
+		}
+		psf_prev = psf;
+	}
+
+empty_source:
+	if (!stotal) {
+		/* ALLOW/BLOCK records without sources are not sent */
+		if (type == MLD2_ALLOW_NEW_SOURCES ||
+		    type == MLD2_BLOCK_OLD_SOURCES)
+			return skb;
+		if (pmc->mca_crcount || isquery) {
+			/* make sure we have room for group header */
+			if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) {
+				mld_sendpack(skb);
+				skb = NULL; /* add_grhead will get a new one */
+			}
+			skb = add_grhead(skb, pmc, type, &pgr);
+		}
+	}
+	if (pgr)
+		pgr->grec_nsrcs = htons(scount);
+
+	if (isquery)
+		pmc->mca_flags &= ~MAF_GSQUERY;	/* clear query state */
+	return skb;
+}
+
+/*
+ * Send MLDv2 current-state records: for the single group @pmc when it is
+ * given, otherwise for every group on @idev that is not flagged
+ * MAF_NOREPORT.  Each record is MODE_IS_EXCLUDE when the group has any
+ * EXCLUDE filter, else MODE_IS_INCLUDE.
+ */
+static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc)
+{
+	struct sk_buff *skb = NULL;
+	int type;
+
+	if (pmc) {
+		/* report for one group only */
+		spin_lock_bh(&pmc->mca_lock);
+		type = pmc->mca_sfcount[MCAST_EXCLUDE] ?
+			MLD2_MODE_IS_EXCLUDE : MLD2_MODE_IS_INCLUDE;
+		skb = add_grec(skb, pmc, type, 0, 0);
+		spin_unlock_bh(&pmc->mca_lock);
+	} else {
+		/* walk the whole interface list */
+		read_lock_bh(&idev->lock);
+		for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+			if (!(pmc->mca_flags & MAF_NOREPORT)) {
+				spin_lock_bh(&pmc->mca_lock);
+				type = pmc->mca_sfcount[MCAST_EXCLUDE] ?
+					MLD2_MODE_IS_EXCLUDE :
+					MLD2_MODE_IS_INCLUDE;
+				skb = add_grec(skb, pmc, type, 0, 0);
+				spin_unlock_bh(&pmc->mca_lock);
+			}
+		}
+		read_unlock_bh(&idev->lock);
+	}
+
+	/* flush whatever is left in the last, partially filled packet */
+	if (skb)
+		mld_sendpack(skb);
+}
+
+/*
+ * Unlink and free every entry of the source list at *@ppsf whose
+ * sf_crcount has reached zero; the other entries keep their order.
+ */
+static void mld_clear_zeros(struct ip6_sf_list **ppsf)
+{
+	struct ip6_sf_list *psf;
+
+	/* ppsf always points at the link that leads to the current entry */
+	while ((psf = *ppsf) != NULL) {
+		if (psf->sf_crcount == 0) {
+			*ppsf = psf->sf_next;
+			kfree(psf);
+		} else {
+			ppsf = &psf->sf_next;
+		}
+	}
+}
+
+/*
+ * Build and send MLDv2 state-change records for @idev.
+ *
+ * First the tomb list of deleted groups is reported (and entries that
+ * have nothing left to retransmit are freed), then source-list and
+ * filter-mode changes of the live groups.  All records are packed via
+ * add_grec(); the last partially filled packet is sent at the end.
+ */
+static void mld_send_cr(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next;
+	struct sk_buff *skb = NULL;
+	int type, dtype;
+
+	read_lock_bh(&idev->lock);
+	spin_lock(&idev->mc_lock);
+
+	/* deleted MCA's */
+	pmc_prev = NULL;
+	for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) {
+		/* fetched up front: pmc may be freed below */
+		pmc_next = pmc->next;
+		if (pmc->mca_sfmode == MCAST_INCLUDE) {
+			type = MLD2_BLOCK_OLD_SOURCES;
+			dtype = MLD2_BLOCK_OLD_SOURCES;
+			skb = add_grec(skb, pmc, type, 1, 0);
+			skb = add_grec(skb, pmc, dtype, 1, 1);
+		}
+		if (pmc->mca_crcount) {
+			if (pmc->mca_sfmode == MCAST_EXCLUDE) {
+				type = MLD2_CHANGE_TO_INCLUDE;
+				skb = add_grec(skb, pmc, type, 1, 0);
+			}
+			pmc->mca_crcount--;
+			if (pmc->mca_crcount == 0) {
+				mld_clear_zeros(&pmc->mca_tomb);
+				mld_clear_zeros(&pmc->mca_sources);
+			}
+		}
+		/* nothing left to retransmit for this group: unlink and free */
+		if (pmc->mca_crcount == 0 && !pmc->mca_tomb &&
+		    !pmc->mca_sources) {
+			if (pmc_prev)
+				pmc_prev->next = pmc_next;
+			else
+				idev->mc_tomb = pmc_next;
+			in6_dev_put(pmc->idev);
+			kfree(pmc);
+		} else
+			pmc_prev = pmc;
+	}
+	spin_unlock(&idev->mc_lock);
+
+	/* change recs */
+	for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+		spin_lock_bh(&pmc->mca_lock);
+		/* type covers the active list, dtype the deleted sources */
+		if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+			type = MLD2_BLOCK_OLD_SOURCES;
+			dtype = MLD2_ALLOW_NEW_SOURCES;
+		} else {
+			type = MLD2_ALLOW_NEW_SOURCES;
+			dtype = MLD2_BLOCK_OLD_SOURCES;
+		}
+		skb = add_grec(skb, pmc, type, 0, 0);
+		skb = add_grec(skb, pmc, dtype, 0, 1);	/* deleted sources */
+
+		/* filter mode changes */
+		if (pmc->mca_crcount) {
+			if (pmc->mca_sfmode == MCAST_EXCLUDE)
+				type = MLD2_CHANGE_TO_EXCLUDE;
+			else
+				type = MLD2_CHANGE_TO_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+			pmc->mca_crcount--;
+		}
+		spin_unlock_bh(&pmc->mca_lock);
+	}
+	read_unlock_bh(&idev->lock);
+	if (!skb)
+		return;
+	(void) mld_sendpack(skb);
+}
+
+/*
+ * Build and transmit a single MLDv1 message of ICMPv6 @type for group
+ * @addr on @dev.
+ *
+ * The packet is sent to the group itself, except for
+ * ICMPV6_MGM_REDUCTION, which goes to the link-local all-routers address.
+ * It carries a hop-by-hop router-alert option.  Allocation or routing
+ * failures are only counted as output discards; nothing is returned.
+ */
+static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
+{
+	struct net *net = dev_net(dev);
+	struct sock *sk = net->ipv6.igmp_sk;
+	struct inet6_dev *idev;
+	struct sk_buff *skb;
+	struct mld_msg *hdr;
+	const struct in6_addr *snd_addr, *saddr;
+	struct in6_addr addr_buf;
+	int err, len, payload_len, full_len;
+	/* hop-by-hop header: next header ICMPv6, router alert, PadN */
+	u8 ra[8] = { IPPROTO_ICMPV6, 0,
+		     IPV6_TLV_ROUTERALERT, 2, 0, 0,
+		     IPV6_TLV_PADN, 0 };
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+
+	if (type == ICMPV6_MGM_REDUCTION)
+		snd_addr = &in6addr_linklocal_allrouters;
+	else
+		snd_addr = addr;
+
+	/* len: MLD message; payload_len: + option block; full_len: + IPv6 */
+	len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+	payload_len = len + sizeof(ra);
+	full_len = sizeof(struct ipv6hdr) + payload_len;
+
+	rcu_read_lock();
+	IP6_UPD_PO_STATS(net, __in6_dev_get(dev),
+		      IPSTATS_MIB_OUT, full_len);
+	rcu_read_unlock();
+
+	skb = sock_alloc_send_skb(sk, LL_ALLOCATED_SPACE(dev) + full_len, 1, &err);
+
+	if (skb == NULL) {
+		rcu_read_lock();
+		IP6_INC_STATS(net, __in6_dev_get(dev),
+			      IPSTATS_MIB_OUTDISCARDS);
+		rcu_read_unlock();
+		return;
+	}
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
+		/* <draft-ietf-magma-mld-source-05.txt>:
+		 * use unspecified address as the source address
+		 * when a valid link-local address is not available.
+		 */
+		saddr = &in6addr_any;
+	} else
+		saddr = &addr_buf;
+
+	ip6_nd_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len);
+
+	memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
+
+	hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg));
+	memset(hdr, 0, sizeof(struct mld_msg));
+	hdr->mld_type = type;
+	ipv6_addr_copy(&hdr->mld_mca, addr);
+
+	/* checksum field is still zero here thanks to the memset above */
+	hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len,
+					 IPPROTO_ICMPV6,
+					 csum_partial(hdr, len, 0));
+
+	rcu_read_lock();
+	idev = __in6_dev_get(skb->dev);
+
+	dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr);
+	if (!dst) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	icmpv6_flow_init(sk, &fl6, type,
+			 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+			 skb->dev->ifindex);
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto err_out;
+	}
+
+	skb_dst_set(skb, dst);
+	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev,
+		      dst_output);
+out:
+	if (!err) {
+		ICMP6MSGOUT_INC_STATS(net, idev, type);
+		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len);
+	} else
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+
+	rcu_read_unlock();
+	return;
+
+err_out:
+	/* failed before the hook took the skb: free it, then count the discard */
+	kfree_skb(skb);
+	goto out;
+}
+
+/*
+ * Drop one @sfmode reference on source @psfsrc of group @pmc.
+ *
+ * When the source has no INCLUDE and no EXCLUDE references left it is
+ * unlinked from pmc->mca_sources and either moved to pmc->mca_tomb (so
+ * its removal can still be reported) or freed.
+ *
+ * Returns 1 if the source was moved to the tomb list, 0 if nothing needs
+ * reporting, -ESRCH if the source is unknown or its count is already 0.
+ */
+static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
+	const struct in6_addr *psfsrc)
+{
+	struct ip6_sf_list *psf, *psf_prev;
+	int rv = 0;
+
+	psf_prev = NULL;
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+		if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
+			break;
+		psf_prev = psf;
+	}
+	if (!psf || psf->sf_count[sfmode] == 0) {
+		/* source filter not found, or count wrong =>  bug */
+		return -ESRCH;
+	}
+	psf->sf_count[sfmode]--;
+	if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+		struct inet6_dev *idev = pmc->idev;
+
+		/* no more filters for this source */
+		if (psf_prev)
+			psf_prev->sf_next = psf->sf_next;
+		else
+			pmc->mca_sources = psf->sf_next;
+		/*
+		 * Keep the entry for retransmission only if it was marked
+		 * sf_oldin, the group is reportable and MLD_V1_SEEN() is
+		 * false for the interface.
+		 */
+		if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) &&
+		    !MLD_V1_SEEN(idev)) {
+			psf->sf_crcount = idev->mc_qrv;
+			psf->sf_next = pmc->mca_tomb;
+			pmc->mca_tomb = psf;
+			rv = 1;
+		} else
+			kfree(psf);
+	}
+	return rv;
+}
+
+/*
+ * Remove a set of source filters from group @pmca on @idev.
+ *
+ * @sfmode:  filter mode (MCAST_INCLUDE / MCAST_EXCLUDE) being released
+ * @sfcount: number of addresses in @psfsrc
+ * @psfsrc:  source addresses to drop (may be NULL when @sfcount is 0)
+ * @delta:   when zero, the group's per-mode filter count is decremented
+ *           as well; when nonzero only the sources are touched
+ *
+ * Schedules a state-change report if the filter mode or the effective
+ * source set changed.  Returns 0, -ENODEV (no @idev), -ESRCH (group not
+ * on the list), -EINVAL (mode count already zero) or the first error
+ * reported by ip6_mc_del1_src().
+ */
+static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+			  int sfmode, int sfcount, const struct in6_addr *psfsrc,
+			  int delta)
+{
+	struct ifmcaddr6 *pmc;
+	int	changerec = 0;
+	int	i, err;
+
+	if (!idev)
+		return -ENODEV;
+	read_lock_bh(&idev->lock);
+	for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+		if (ipv6_addr_equal(pmca, &pmc->mca_addr))
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		read_unlock_bh(&idev->lock);
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->mca_lock);
+	/* snapshot the current per-source state before changing anything */
+	sf_markstate(pmc);
+	if (!delta) {
+		if (!pmc->mca_sfcount[sfmode]) {
+			spin_unlock_bh(&pmc->mca_lock);
+			read_unlock_bh(&idev->lock);
+			return -EINVAL;
+		}
+		pmc->mca_sfcount[sfmode]--;
+	}
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+		/* rv > 0: a delete record was queued; rv < 0: keep first error */
+		changerec |= rv > 0;
+		if (!err && rv < 0)
+			err = rv;
+	}
+	if (pmc->mca_sfmode == MCAST_EXCLUDE &&
+	    pmc->mca_sfcount[MCAST_EXCLUDE] == 0 &&
+	    pmc->mca_sfcount[MCAST_INCLUDE]) {
+		struct ip6_sf_list *psf;
+
+		/* filter mode change */
+		pmc->mca_sfmode = MCAST_INCLUDE;
+		pmc->mca_crcount = idev->mc_qrv;
+		idev->mc_ifc_count = pmc->mca_crcount;
+		for (psf=pmc->mca_sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		mld_ifc_event(pmc->idev);
+	} else if (sf_setstate(pmc) || changerec)
+		mld_ifc_event(pmc->idev);
+	spin_unlock_bh(&pmc->mca_lock);
+	read_unlock_bh(&idev->lock);
+	return err;
+}
+
+/*
+ * Take one @sfmode reference on source @psfsrc of group @pmc, appending a
+ * new zeroed entry to the tail of pmc->mca_sources when the address is
+ * not yet listed.  @delta is not used here.
+ * Returns 0, or -ENOBUFS if a new entry could not be allocated.
+ */
+static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
+	const struct in6_addr *psfsrc, int delta)
+{
+	struct ip6_sf_list *psf, *tail = NULL;
+
+	/* look the address up, remembering the last entry visited */
+	for (psf = pmc->mca_sources; psf; tail = psf, psf = psf->sf_next)
+		if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
+			break;
+
+	if (!psf) {
+		psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+		if (!psf)
+			return -ENOBUFS;
+
+		psf->sf_addr = *psfsrc;
+		if (tail)
+			tail->sf_next = psf;
+		else
+			pmc->mca_sources = psf;
+	}
+
+	psf->sf_count[sfmode]++;
+	return 0;
+}
+
+/*
+ * Record in sf_oldin, for every source of @pmc, whether the source is
+ * currently "in": with EXCLUDE filters present it must be named by all of
+ * them and by no INCLUDE filter; otherwise any INCLUDE reference counts.
+ */
+static void sf_markstate(struct ifmcaddr6 *pmc)
+{
+	int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+	struct ip6_sf_list *psf;
+
+	for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+		if (!pmc->mca_sfcount[MCAST_EXCLUDE]) {
+			psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+			continue;
+		}
+		psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+	}
+}
+
+/*
+ * Compare each source's new "in" state with the sf_oldin snapshot taken
+ * by sf_markstate() and arm retransmission counters for every change.
+ *
+ * A source that became active gets sf_crcount = qrv and any matching
+ * tomb entry is dropped; a source that became inactive gets a "delete"
+ * entry on pmc->mca_tomb (created if missing) with sf_crcount = qrv.
+ *
+ * Returns the number of sources whose state changed.
+ */
+static int sf_setstate(struct ifmcaddr6 *pmc)
+{
+	struct ip6_sf_list *psf, *dpsf;
+	int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+	int qrv = pmc->idev->mc_qrv;
+	int new_in, rv;
+
+	rv = 0;
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+		/* same "in" rule as sf_markstate() */
+		if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+		if (new_in) {
+			if (!psf->sf_oldin) {
+				struct ip6_sf_list *prev = NULL;
+
+				/* drop a stale delete record for this source */
+				for (dpsf=pmc->mca_tomb; dpsf;
+				     dpsf=dpsf->sf_next) {
+					if (ipv6_addr_equal(&dpsf->sf_addr,
+					    &psf->sf_addr))
+						break;
+					prev = dpsf;
+				}
+				if (dpsf) {
+					if (prev)
+						prev->sf_next = dpsf->sf_next;
+					else
+						pmc->mca_tomb = dpsf->sf_next;
+					kfree(dpsf);
+				}
+				psf->sf_crcount = qrv;
+				rv++;
+			}
+		} else if (psf->sf_oldin) {
+			psf->sf_crcount = 0;
+			/*
+			 * add or update "delete" records if an active filter
+			 * is now inactive
+			 */
+			for (dpsf=pmc->mca_tomb; dpsf; dpsf=dpsf->sf_next)
+				if (ipv6_addr_equal(&dpsf->sf_addr,
+				    &psf->sf_addr))
+					break;
+			if (!dpsf) {
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				/* on allocation failure the change is not counted */
+				if (!dpsf)
+					continue;
+				*dpsf = *psf;
+				/* pmc->mca_lock held by callers */
+				dpsf->sf_next = pmc->mca_tomb;
+				pmc->mca_tomb = dpsf;
+			}
+			dpsf->sf_crcount = qrv;
+			rv++;
+		}
+	}
+	return rv;
+}
+
+/*
+ * Add multicast source filter list to the interface list
+ *
+ * @sfmode:  filter mode (MCAST_INCLUDE / MCAST_EXCLUDE) being added
+ * @sfcount: number of addresses in @psfsrc
+ * @delta:   when zero, the group's per-mode filter count is incremented
+ *           as well; when nonzero only the sources are touched
+ *
+ * If adding any source fails, the sources added so far are removed again
+ * and the mode count is restored.  A state-change report is scheduled
+ * when the filter mode or the effective source set changed.
+ * Returns 0, -ENODEV, -ESRCH (group not on the list) or the error from
+ * ip6_mc_add1_src().
+ */
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+			  int sfmode, int sfcount, const struct in6_addr *psfsrc,
+			  int delta)
+{
+	struct ifmcaddr6 *pmc;
+	int	isexclude;
+	int	i, err;
+
+	if (!idev)
+		return -ENODEV;
+	read_lock_bh(&idev->lock);
+	for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+		if (ipv6_addr_equal(pmca, &pmc->mca_addr))
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		read_unlock_bh(&idev->lock);
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->mca_lock);
+
+	/* snapshot the current per-source state before changing anything */
+	sf_markstate(pmc);
+	isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
+	if (!delta)
+		pmc->mca_sfcount[sfmode]++;
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+		if (err)
+			break;
+	}
+	if (err) {
+		int j;
+
+		/* roll back: undo the count and the i sources already added */
+		if (!delta)
+			pmc->mca_sfcount[sfmode]--;
+		for (j=0; j<i; j++)
+			ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+	} else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
+		struct ip6_sf_list *psf;
+
+		/* filter mode change */
+		if (pmc->mca_sfcount[MCAST_EXCLUDE])
+			pmc->mca_sfmode = MCAST_EXCLUDE;
+		else if (pmc->mca_sfcount[MCAST_INCLUDE])
+			pmc->mca_sfmode = MCAST_INCLUDE;
+		/* else no filters; keep old mode for reports */
+
+		pmc->mca_crcount = idev->mc_qrv;
+		idev->mc_ifc_count = pmc->mca_crcount;
+		for (psf=pmc->mca_sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		mld_ifc_event(idev);
+	} else if (sf_setstate(pmc))
+		mld_ifc_event(idev);
+	spin_unlock_bh(&pmc->mca_lock);
+	read_unlock_bh(&idev->lock);
+	return err;
+}
+
+/*
+ * Free every entry on @pmc's tomb and active source lists and reset the
+ * group to EXCLUDE mode with a single EXCLUDE filter and no INCLUDE
+ * filters.
+ */
+static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
+{
+	struct ip6_sf_list *psf, *nextpsf;
+
+	psf = pmc->mca_tomb;
+	while (psf) {
+		nextpsf = psf->sf_next;
+		kfree(psf);
+		psf = nextpsf;
+	}
+	pmc->mca_tomb = NULL;
+
+	psf = pmc->mca_sources;
+	while (psf) {
+		nextpsf = psf->sf_next;
+		kfree(psf);
+		psf = nextpsf;
+	}
+	pmc->mca_sources = NULL;
+
+	pmc->mca_sfmode = MCAST_EXCLUDE;
+	pmc->mca_sfcount[MCAST_INCLUDE] = 0;
+	pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
+}
+
+
+/*
+ * Announce membership of group @ma with an MLDv1 report and (re)arm the
+ * group's timer for a follow-up.  Does nothing for MAF_NOREPORT groups.
+ */
+static void igmp6_join_group(struct ifmcaddr6 *ma)
+{
+	unsigned long delay;
+
+	if (ma->mca_flags & MAF_NOREPORT)
+		return;
+
+	igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+
+	/* random delay below IGMP6_UNSOLICITED_IVAL for the next report */
+	delay = net_random() % IGMP6_UNSOLICITED_IVAL;
+
+	spin_lock_bh(&ma->mca_lock);
+	/*
+	 * A timer that was still pending held a reference; drop it and
+	 * reuse that timer's remaining time instead of the random delay.
+	 */
+	if (del_timer(&ma->mca_timer)) {
+		atomic_dec(&ma->mca_refcnt);
+		delay = ma->mca_timer.expires - jiffies;
+	}
+
+	/* the armed timer holds a reference on the group */
+	if (!mod_timer(&ma->mca_timer, jiffies + delay))
+		atomic_inc(&ma->mca_refcnt);
+	ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
+	spin_unlock_bh(&ma->mca_lock);
+}
+
+/*
+ * Release the interface-level source filters held by socket membership
+ * @iml and free the socket's source list, if it has one.
+ * Returns the result of ip6_mc_del_src().
+ */
+static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
+			    struct inet6_dev *idev)
+{
+	int err;
+
+	/* callers have the socket lock and a write lock on ipv6_sk_mc_lock,
+	 * so no other readers or writers of iml or its sflist
+	 */
+	if (iml->sflist) {
+		err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+				     iml->sflist->sl_count,
+				     iml->sflist->sl_addr, 0);
+		sock_kfree_s(sk, iml->sflist,
+			     IP6_SFLSIZE(iml->sflist->sl_max));
+		iml->sflist = NULL;
+	} else {
+		/* any-source empty exclude case */
+		err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+				     0, NULL, 0);
+	}
+	return err;
+}
+
+/*
+ * Signal that group @ma is being left.  When MLD_V1_SEEN() is false a
+ * delete record is queued and a state-change event raised; otherwise an
+ * MLDv1 reduction message is sent, but only if MAF_LAST_REPORTER is set.
+ */
+static void igmp6_leave_group(struct ifmcaddr6 *ma)
+{
+	if (!MLD_V1_SEEN(ma->idev)) {
+		mld_add_delrec(ma->idev, ma);
+		mld_ifc_event(ma->idev);
+		return;
+	}
+
+	if (ma->mca_flags & MAF_LAST_REPORTER)
+		igmp6_send(&ma->mca_addr, ma->idev->dev,
+			   ICMPV6_MGM_REDUCTION);
+}
+
+/*
+ * Timer callback for idev->mc_gq_timer: clear the "running" flag and send
+ * a current-state report for every group on the interface.
+ * @data is the struct inet6_dev pointer registered with setup_timer().
+ */
+static void mld_gq_timer_expire(unsigned long data)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)data;
+
+	idev->mc_gq_running = 0;
+	mld_send_report(idev, NULL);
+	/*
+	 * Drop the reference held by the pending timer.  This may be the
+	 * last reference to the inet6_dev, so the full in6_dev_put() is
+	 * required: __in6_dev_put() would only decrement the count and
+	 * leave the inet6_dev (and its net_device reference) leaked.
+	 */
+	in6_dev_put(idev);
+}
+
+/*
+ * Timer callback for idev->mc_ifc_timer: send the pending state-change
+ * records and re-arm the timer while retransmissions remain.
+ * @data is the struct inet6_dev pointer registered with setup_timer().
+ */
+static void mld_ifc_timer_expire(unsigned long data)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)data;
+
+	mld_send_cr(idev);
+	if (idev->mc_ifc_count) {
+		idev->mc_ifc_count--;
+		if (idev->mc_ifc_count)
+			mld_ifc_start_timer(idev, idev->mc_maxdelay);
+	}
+	/*
+	 * Drop the reference held by the pending timer.  This may be the
+	 * last reference to the inet6_dev, so the full in6_dev_put() is
+	 * required: __in6_dev_put() would only decrement the count and
+	 * leave the inet6_dev (and its net_device reference) leaked.
+	 */
+	in6_dev_put(idev);
+}
+
+/*
+ * Note an interface state change on @idev: unless MLD_V1_SEEN() is true,
+ * reset the retransmission count to mc_qrv and start the state-change
+ * timer with a delay argument of 1.
+ */
+static void mld_ifc_event(struct inet6_dev *idev)
+{
+	if (!MLD_V1_SEEN(idev)) {
+		idev->mc_ifc_count = idev->mc_qrv;
+		mld_ifc_start_timer(idev, 1);
+	}
+}
+
+
+/*
+ * Per-group timer callback: send a report for the group (an MLDv1 report
+ * when MLD_V1_SEEN() is true, an MLDv2 current-state record otherwise),
+ * mark us as last reporter, clear the timer-running flag and drop the
+ * group reference with ma_put().
+ */
+static void igmp6_timer_handler(unsigned long data)
+{
+	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+	struct inet6_dev *idev = ma->idev;
+
+	if (!MLD_V1_SEEN(idev))
+		mld_send_report(idev, ma);
+	else
+		igmp6_send(&ma->mca_addr, idev->dev, ICMPV6_MGM_REPORT);
+
+	spin_lock(&ma->mca_lock);
+	ma->mca_flags = (ma->mca_flags | MAF_LAST_REPORTER) &
+			~MAF_TIMER_RUNNING;
+	spin_unlock(&ma->mca_lock);
+
+	ma_put(ma);
+}
+
+/* Device changing type */
+
+void ipv6_mc_unmap(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Call igmp6_group_dropped() for every group on the interface */
+
+	read_lock_bh(&idev->lock);
+	i = idev->mc_list;
+	while (i) {
+		igmp6_group_dropped(i);
+		i = i->next;
+	}
+	read_unlock_bh(&idev->lock);
+}
+
+/* Counterpart of ipv6_mc_unmap(): simply runs the "device up" path. */
+void ipv6_mc_remap(struct inet6_dev *idev)
+{
+	ipv6_mc_up(idev);
+}
+
+/* Device going down */
+
+void ipv6_mc_down(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Withdraw multicast list */
+
+	read_lock_bh(&idev->lock);
+
+	/* stop both per-interface timers; a pending timer held a reference */
+	idev->mc_ifc_count = 0;
+	if (del_timer(&idev->mc_ifc_timer))
+		__in6_dev_put(idev);
+	idev->mc_gq_running = 0;
+	if (del_timer(&idev->mc_gq_timer))
+		__in6_dev_put(idev);
+
+	i = idev->mc_list;
+	while (i) {
+		igmp6_group_dropped(i);
+		i = i->next;
+	}
+	read_unlock_bh(&idev->lock);
+
+	mld_clear_delrec(idev);
+}
+
+
+/* Device going up */
+
+void ipv6_mc_up(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Install multicast list, except for all-nodes (already installed) */
+
+	read_lock_bh(&idev->lock);
+	i = idev->mc_list;
+	while (i) {
+		igmp6_group_added(i);
+		i = i->next;
+	}
+	read_unlock_bh(&idev->lock);
+}
+
+/* IPv6 device initialization. */
+
+void ipv6_mc_init_dev(struct inet6_dev *idev)
+{
+	write_lock_bh(&idev->lock);
+
+	spin_lock_init(&idev->mc_lock);
+
+	/* general-query and state-change timers, both keyed on the idev */
+	setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire,
+		    (unsigned long)idev);
+	setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire,
+		    (unsigned long)idev);
+
+	/* start with nothing pending and default protocol parameters */
+	idev->mc_gq_running = 0;
+	idev->mc_tomb = NULL;
+	idev->mc_ifc_count = 0;
+	idev->mc_qrv = MLD_QRV_DEFAULT;
+	idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL;
+	idev->mc_v1_seen = 0;
+
+	write_unlock_bh(&idev->lock);
+}
+
+/*
+ *	Device is about to be destroyed: clean up.
+ *
+ *	Stops the timers, drops the automatically joined all-nodes (and,
+ *	when forwarding, all-routers) groups, then releases every group
+ *	still on the interface list.
+ */
+
+void ipv6_mc_destroy_dev(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Deactivate timers */
+	ipv6_mc_down(idev);
+
+	/* Delete all-nodes address. */
+	/* We cannot call ipv6_dev_mc_dec() directly, our caller in
+	 * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will
+	 * fail.
+	 */
+	__ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes);
+
+	if (idev->cnf.forwarding)
+		__ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);
+
+	/*
+	 * Pop entries one at a time; the lock is released around the
+	 * per-group teardown and re-taken before the next list access.
+	 */
+	write_lock_bh(&idev->lock);
+	while ((i = idev->mc_list) != NULL) {
+		idev->mc_list = i->next;
+		write_unlock_bh(&idev->lock);
+
+		igmp6_group_dropped(i);
+		ma_put(i);
+
+		write_lock_bh(&idev->lock);
+	}
+	write_unlock_bh(&idev->lock);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Iterator state for the "igmp6" /proc/net seq_file. */
+struct igmp6_mc_iter_state {
+	struct seq_net_private p;
+	struct net_device *dev;		/* device currently being walked */
+	struct inet6_dev *idev;		/* its inet6_dev; lock held while set */
+};
+
+/* Fetch the iterator state stored in a seq_file's private pointer. */
+#define igmp6_mc_seq_private(seq)	((struct igmp6_mc_iter_state *)(seq)->private)
+
+/*
+ * Position the iterator on the first multicast group of the first device
+ * in the namespace that has one.
+ *
+ * On success state->idev is set and idev->lock is left read-locked (it is
+ * released by igmp6_mc_get_next() or igmp6_mc_seq_stop()).  Returns NULL,
+ * with no lock held, when no device has any group.
+ */
+static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
+{
+	struct ifmcaddr6 *im = NULL;
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+
+	state->idev = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct inet6_dev *idev;
+		idev = __in6_dev_get(state->dev);
+		if (!idev)
+			continue;
+		read_lock_bh(&idev->lock);
+		im = idev->mc_list;
+		if (im) {
+			/* keep the lock: the caller goes on using im */
+			state->idev = idev;
+			break;
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	return im;
+}
+
+/*
+ * Advance from @im to the next group, moving on to following devices
+ * when the current device's list is exhausted.
+ *
+ * The read lock of the device being left is dropped and that of the new
+ * device taken.  Returns NULL (state->idev cleared, no lock held) at the
+ * end of the device list.
+ */
+static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im)
+{
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+	im = im->next;
+	while (!im) {
+		if (likely(state->idev != NULL))
+			read_unlock_bh(&state->idev->lock);
+
+		state->dev = next_net_device_rcu(state->dev);
+		if (!state->dev) {
+			state->idev = NULL;
+			break;
+		}
+		state->idev = __in6_dev_get(state->dev);
+		/* device without inet6_dev: nothing locked, try the next */
+		if (!state->idev)
+			continue;
+		read_lock_bh(&state->idev->lock);
+		im = state->idev->mc_list;
+	}
+	return im;
+}
+
+/*
+ * Return the group at zero-based position @pos of the iteration, or NULL
+ * when there are not that many groups.
+ */
+static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ifmcaddr6 *im = igmp6_mc_get_first(seq);
+
+	/* step forward until pos entries were skipped or the list ends */
+	while (im && pos) {
+		im = igmp6_mc_get_next(seq, im);
+		if (im)
+			--pos;
+	}
+	return pos ? NULL : im;
+}
+
+/*
+ * seq_file ->start: enter the RCU read-side section (left again in
+ * igmp6_mc_seq_stop()) and seek to entry *@pos.
+ */
+static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return igmp6_mc_get_idx(seq, *pos);
+}
+
+/* seq_file ->next: bump the position and step to the following group. */
+static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return igmp6_mc_get_next(seq, v);
+}
+
+/*
+ * seq_file ->stop: release the device read lock the iterator may still
+ * hold, reset the iterator state and leave the RCU read-side section
+ * entered in igmp6_mc_seq_start().
+ */
+static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+	if (likely(state->idev != NULL)) {
+		read_unlock_bh(&state->idev->lock);
+		state->idev = NULL;
+	}
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show: print one line per group: interface index and name,
+ * group address, user count, flags (hex) and the time left on the group
+ * timer in clock ticks (0 when MAF_TIMER_RUNNING is not set).
+ */
+static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
+{
+	struct ifmcaddr6 *im = (struct ifmcaddr6 *)v;
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+	seq_printf(seq,
+		   "%-4d %-15s %pi6 %5d %08X %ld\n",
+		   state->dev->ifindex, state->dev->name,
+		   &im->mca_addr,
+		   im->mca_users, im->mca_flags,
+		   (im->mca_flags&MAF_TIMER_RUNNING) ?
+		   jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0);
+	return 0;
+}
+
+/* seq_file iterator callbacks for the "igmp6" proc entry. */
+static const struct seq_operations igmp6_mc_seq_ops = {
+	.start	=	igmp6_mc_seq_start,
+	.next	=	igmp6_mc_seq_next,
+	.stop	=	igmp6_mc_seq_stop,
+	.show	=	igmp6_mc_seq_show,
+};
+
+/*
+ * ->open for the "igmp6" proc entry: set up a per-namespace seq_file with
+ * room for a struct igmp6_mc_iter_state as private data.
+ */
+static int igmp6_mc_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &igmp6_mc_seq_ops,
+			    sizeof(struct igmp6_mc_iter_state));
+}
+
+/* File operations for the "igmp6" proc entry (standard seq_file glue). */
+static const struct file_operations igmp6_mc_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp6_mc_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+/* Iterator state for the "mcfilter6" /proc/net seq_file. */
+struct igmp6_mcf_iter_state {
+	struct seq_net_private p;
+	struct net_device *dev;		/* device currently being walked */
+	struct inet6_dev *idev;		/* its inet6_dev; lock held while set */
+	struct ifmcaddr6 *im;		/* current group; mca_lock held while set */
+};
+
+/* Fetch the iterator state stored in a seq_file's private pointer. */
+#define igmp6_mcf_seq_private(seq)	((struct igmp6_mcf_iter_state *)(seq)->private)
+
+/*
+ * Position the iterator on the first source filter it can find.
+ *
+ * Only the first group of each device is examined here.  On success
+ * state->idev and state->im are set and both idev->lock (read) and
+ * im->mca_lock stay held; they are released by igmp6_mcf_get_next() or
+ * igmp6_mcf_seq_stop().  Returns NULL with no lock held otherwise.
+ */
+static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
+{
+	struct ip6_sf_list *psf = NULL;
+	struct ifmcaddr6 *im = NULL;
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+
+	state->idev = NULL;
+	state->im = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct inet6_dev *idev;
+		idev = __in6_dev_get(state->dev);
+		if (unlikely(idev == NULL))
+			continue;
+		read_lock_bh(&idev->lock);
+		im = idev->mc_list;
+		if (likely(im != NULL)) {
+			spin_lock_bh(&im->mca_lock);
+			psf = im->mca_sources;
+			if (likely(psf != NULL)) {
+				/* keep both locks: the caller goes on using psf */
+				state->im = im;
+				state->idev = idev;
+				break;
+			}
+			spin_unlock_bh(&im->mca_lock);
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	return psf;
+}
+
+/*
+ * Advance from @psf to the next source filter, moving on to the next
+ * group and then the next device as each list runs out.
+ *
+ * Locks are handed over as the iterator moves: the mca_lock of the group
+ * being left and the read lock of the device being left are dropped, and
+ * those of the new position taken.  Returns NULL at the end of the
+ * device list.
+ */
+static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf)
+{
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+
+	psf = psf->sf_next;
+	while (!psf) {
+		/* current group exhausted: move to the next group */
+		spin_unlock_bh(&state->im->mca_lock);
+		state->im = state->im->next;
+		while (!state->im) {
+			/* current device exhausted: move to the next device */
+			if (likely(state->idev != NULL))
+				read_unlock_bh(&state->idev->lock);
+
+			state->dev = next_net_device_rcu(state->dev);
+			if (!state->dev) {
+				state->idev = NULL;
+				goto out;
+			}
+			state->idev = __in6_dev_get(state->dev);
+			if (!state->idev)
+				continue;
+			read_lock_bh(&state->idev->lock);
+			state->im = state->idev->mc_list;
+		}
+		if (!state->im)
+			break;
+		spin_lock_bh(&state->im->mca_lock);
+		psf = state->im->mca_sources;
+	}
+out:
+	return psf;
+}
+
+/*
+ * Return the source filter at zero-based position @pos of the iteration,
+ * or NULL when there are not that many entries.
+ */
+static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip6_sf_list *psf = igmp6_mcf_get_first(seq);
+
+	/* step forward until pos entries were skipped or the list ends */
+	while (psf && pos) {
+		psf = igmp6_mcf_get_next(seq, psf);
+		if (psf)
+			--pos;
+	}
+	return pos ? NULL : psf;
+}
+
+/*
+ * seq_file ->start: enter the RCU read-side section (left again in
+ * igmp6_mcf_seq_stop()).  Position 0 yields SEQ_START_TOKEN so that the
+ * header line is printed; position N > 0 maps to filter entry N - 1.
+ */
+static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next: after the header token comes the first filter entry,
+ * after any other entry its successor.  The position is always advanced.
+ */
+static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip6_sf_list *psf = (v == SEQ_START_TOKEN) ?
+		igmp6_mcf_get_first(seq) : igmp6_mcf_get_next(seq, v);
+
+	++*pos;
+	return psf;
+}
+
+/*
+ * seq_file ->stop: release the group spinlock and the device read lock
+ * the iterator may still hold (in that order), reset the iterator state
+ * and leave the RCU read-side section.
+ */
+static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+	if (likely(state->im != NULL)) {
+		spin_unlock_bh(&state->im->mca_lock);
+		state->im = NULL;
+	}
+	if (likely(state->idev != NULL)) {
+		read_unlock_bh(&state->idev->lock);
+		state->idev = NULL;
+	}
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show: print the column header for SEQ_START_TOKEN, otherwise
+ * one line with interface index and name, group address, source address
+ * and the source's INCLUDE and EXCLUDE reference counts.
+ */
+static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
+{
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+	struct ip6_sf_list *psf = (struct ip6_sf_list *)v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			   "%3s %6s "
+			   "%32s %32s %6s %6s\n", "Idx",
+			   "Device", "Multicast Address",
+			   "Source Address", "INC", "EXC");
+		return 0;
+	}
+
+	seq_printf(seq,
+		   "%3d %6.6s %pi6 %pi6 %6lu %6lu\n",
+		   state->dev->ifindex, state->dev->name,
+		   &state->im->mca_addr,
+		   &psf->sf_addr,
+		   psf->sf_count[MCAST_INCLUDE],
+		   psf->sf_count[MCAST_EXCLUDE]);
+	return 0;
+}
+
+/* seq_file iterator callbacks for the "mcfilter6" proc entry. */
+static const struct seq_operations igmp6_mcf_seq_ops = {
+	.start	=	igmp6_mcf_seq_start,
+	.next	=	igmp6_mcf_seq_next,
+	.stop	=	igmp6_mcf_seq_stop,
+	.show	=	igmp6_mcf_seq_show,
+};
+
+/*
+ * ->open for the "mcfilter6" proc entry: set up a per-namespace seq_file
+ * with room for a struct igmp6_mcf_iter_state as private data.
+ */
+static int igmp6_mcf_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &igmp6_mcf_seq_ops,
+			    sizeof(struct igmp6_mcf_iter_state));
+}
+
+/* File operations for the "mcfilter6" proc entry (standard seq_file glue). */
+static const struct file_operations igmp6_mcf_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp6_mcf_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+/*
+ * Create the per-namespace "igmp6" and "mcfilter6" proc entries.
+ * Returns 0 on success or -ENOMEM; on failure no entry is left behind.
+ */
+static int __net_init igmp6_proc_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "igmp6", S_IRUGO, &igmp6_mc_seq_fops))
+		return -ENOMEM;
+
+	if (!proc_net_fops_create(net, "mcfilter6", S_IRUGO,
+				  &igmp6_mcf_seq_fops)) {
+		/* undo the first entry so that init is all-or-nothing */
+		proc_net_remove(net, "igmp6");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Remove both proc entries, in reverse order of their creation. */
+static void __net_exit igmp6_proc_exit(struct net *net)
+{
+	proc_net_remove(net, "mcfilter6");
+	proc_net_remove(net, "igmp6");
+}
+#else
+/* Without CONFIG_PROC_FS there is nothing to create: report success. */
+static inline int igmp6_proc_init(struct net *net)
+{
+	return 0;
+}
+/* Without CONFIG_PROC_FS there is nothing to remove. */
+static inline void igmp6_proc_exit(struct net *net)
+{
+}
+#endif
+
+/*
+ * Per-namespace setup: create the raw ICMPv6 control socket used for
+ * sending MLD messages (hop limit 1) and register the proc entries.
+ * Returns 0 or a negative errno; the socket is destroyed again when the
+ * proc registration fails.
+ */
+static int __net_init igmp6_net_init(struct net *net)
+{
+	int err;
+
+	err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6,
+				   SOCK_RAW, IPPROTO_ICMPV6, net);
+	if (err < 0) {
+		printk(KERN_ERR
+		       "Failed to initialize the IGMP6 control socket (err %d).\n",
+		       err);
+		return err;
+	}
+
+	inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
+
+	err = igmp6_proc_init(net);
+	if (err)
+		inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+
+	return err;
+}
+
+/* Per-namespace teardown: destroy the control socket, remove proc entries. */
+static void __net_exit igmp6_net_exit(struct net *net)
+{
+	inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+	igmp6_proc_exit(net);
+}
+
+/* Per-network-namespace init/exit hooks registered by igmp6_init(). */
+static struct pernet_operations igmp6_net_ops = {
+	.init = igmp6_net_init,
+	.exit = igmp6_net_exit,
+};
+
+/*
+ * Register the per-namespace operations; returns the result of
+ * register_pernet_subsys().
+ */
+int __init igmp6_init(void)
+{
+	return register_pernet_subsys(&igmp6_net_ops);
+}
+
+/* Undo igmp6_init(): unregister the per-namespace operations. */
+void igmp6_cleanup(void)
+{
+	unregister_pernet_subsys(&igmp6_net_ops);
+}
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
new file mode 100644
index 00000000..43242e6e
--- /dev/null
+++ b/net/ipv6/mip6.c
@@ -0,0 +1,525 @@
+/*
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * Authors:
+ *	Noriaki TAKAMIYA @USAGI
+ *	Masahide NAKAMURA @USAGI
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/time.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/rawv6.h>
+#include <net/xfrm.h>
+#include <net/mip6.h>
+
+/*
+ * Number of padding bytes (0..7) to insert after @len bytes so that the
+ * following data starts at an offset congruent to @n modulo 8, i.e.
+ * (n - len) mod 8.
+ */
+static inline unsigned int calc_padlen(unsigned int len, unsigned int n)
+{
+	const unsigned int delta = n - len + 16;
+
+	return delta % 8;
+}
+
+/*
+ * Write @padlen bytes of IPv6 option padding at @data.
+ *
+ * A single byte is written as IPV6_TLV_PAD0.  Two or more bytes are
+ * written as an IPV6_TLV_PADN option: type, length (padlen - 2), then
+ * that many zero bytes.  A @padlen of 0 writes nothing.
+ *
+ * Returns the address just past the padding, or NULL if @data is NULL.
+ */
+static inline void *mip6_padn(__u8 *data, __u8 padlen)
+{
+	if (!data)
+		return NULL;
+	if (padlen == 1) {
+		data[0] = IPV6_TLV_PAD0;
+	} else if (padlen > 1) {
+		data[0] = IPV6_TLV_PADN;
+		data[1] = padlen - 2;
+		/* Zero the PadN payload; it is empty when padlen == 2. */
+		if (padlen > 2)
+			memset(data+2, 0, data[1]);
+	}
+	return data + padlen;
+}
+
+/*
+ * Thin wrapper: send an ICMPV6_PARAMPROB error for @skb with the given
+ * @code and @pos.
+ */
+static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos)
+{
+	icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos);
+}
+
+/*
+ * Minimum value of the Mobility Header length field accepted for the
+ * given MH @type (compared against ip6mh_hdrlen in mip6_mh_filter()).
+ * Types not listed here yield 0.
+ */
+static int mip6_mh_len(int type)
+{
+	switch (type) {
+	case IP6_MH_TYPE_HOTI:
+	case IP6_MH_TYPE_COTI:
+	case IP6_MH_TYPE_BU:
+	case IP6_MH_TYPE_BACK:
+		return 1;
+	case IP6_MH_TYPE_HOT:
+	case IP6_MH_TYPE_COT:
+	case IP6_MH_TYPE_BERROR:
+		return 2;
+	case IP6_MH_TYPE_BRR:
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Validate the Mobility Header of @skb (registered as the rawv6 MH filter
+ * in mip6_init()).  @sk is unused.
+ *
+ * Returns 0 if the header is acceptable and -1 otherwise.  When the header
+ * length is too small for its type, or the payload proto is not
+ * IPPROTO_NONE, an ICMPv6 parameter problem pointing at the offending
+ * field (offset from the network header) is sent as well.
+ */
+static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct ip6_mh *mh;
+
+	/*
+	 * First make the leading 8 bytes available, then the full length
+	 * advertised in byte 1 of the header: (value + 1) * 8 bytes.
+	 */
+	if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) ||
+	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
+				 ((skb_transport_header(skb)[1] + 1) << 3))))
+		return -1;
+
+	mh = (struct ip6_mh *)skb_transport_header(skb);
+
+	if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) {
+		LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n",
+			       mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type));
+		mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) -
+					 skb_network_header(skb)));
+		return -1;
+	}
+
+	if (mh->ip6mh_proto != IPPROTO_NONE) {
+		LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n",
+			       mh->ip6mh_proto);
+		mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) -
+					 skb_network_header(skb)));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Key of the most recent report let through by mip6_report_rl_allow().
+ * A new report is suppressed only when its timestamp, incoming interface,
+ * source and destination all equal the values stored here.
+ */
+struct mip6_report_rate_limiter {
+	spinlock_t lock;		/* protects the fields below */
+	struct timeval stamp;
+	int iif;
+	struct in6_addr src;
+	struct in6_addr dst;
+};
+
+/* Single global limiter instance; only the lock needs explicit init. */
+static struct mip6_report_rate_limiter mip6_report_rl = {
+	.lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock)
+};
+
+/*
+ * xfrm input handler for the destination options header found at
+ * skb->data.
+ *
+ * Returns the header's next-header value, or -ENOENT when the state has a
+ * non-zero care-of address that differs from the packet's source address.
+ */
+static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data;
+	int err = destopt->nexthdr;
+
+	/* x->coaddr is read under the state lock. */
+	spin_lock(&x->lock);
+	if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		err = -ENOENT;
+	spin_unlock(&x->lock);
+
+	return err;
+}
+
+/*
+ * Fill in the Destination Options header carrying a Home Address Option.
+ *
+ * The packet's current source address is saved in the Home Address Option
+ * and the IP header's source address is replaced with the state's care-of
+ * address (x->coaddr).  Always returns 0.
+ */
+static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	struct ipv6_destopt_hdr *dstopt;
+	struct ipv6_destopt_hao *hao;
+	u8 nexthdr;
+	int len;
+
+	skb_push(skb, -skb_network_offset(skb));
+	iph = ipv6_hdr(skb);
+
+	/*
+	 * The byte at skb_mac_header() is treated as the preceding header's
+	 * next-header field (set up by the caller, not visible here): take
+	 * over its old value and point it at the new header.
+	 */
+	nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_DSTOPTS;
+
+	dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb);
+	dstopt->nexthdr = nexthdr;
+
+	/* Pad after the 2-byte header so the HAO starts at offset 6 mod 8. */
+	hao = mip6_padn((char *)(dstopt + 1),
+			calc_padlen(sizeof(*dstopt), 6));
+
+	hao->type = IPV6_TLV_HAO;
+	BUILD_BUG_ON(sizeof(*hao) != 18);
+	hao->length = sizeof(*hao) - 2;	/* excludes type and length bytes */
+
+	len = ((char *)hao - (char *)dstopt) + sizeof(*hao);
+
+	memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr));
+	spin_lock_bh(&x->lock);
+	memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr));
+	spin_unlock_bh(&x->lock);
+
+	/* Must match what mip6_destopt_init_state() computed. */
+	WARN_ON(len != x->props.header_len);
+	dstopt->hdrlen = (x->props.header_len >> 3) - 1;
+
+	return 0;
+}
+
+/*
+ * Rate-limit check for mip6_destopt_reject().
+ *
+ * Returns 1 and records (@stamp, @iif, @src, @dst) when the tuple differs
+ * in any field from the last recorded one; returns 0 when it is identical.
+ * The comparison and the update happen under mip6_report_rl.lock.
+ */
+static inline int mip6_report_rl_allow(struct timeval *stamp,
+				       const struct in6_addr *dst,
+				       const struct in6_addr *src, int iif)
+{
+	int allow = 0;
+
+	spin_lock_bh(&mip6_report_rl.lock);
+	if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec ||
+	    mip6_report_rl.stamp.tv_usec != stamp->tv_usec ||
+	    mip6_report_rl.iif != iif ||
+	    !ipv6_addr_equal(&mip6_report_rl.src, src) ||
+	    !ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
+		mip6_report_rl.stamp.tv_sec = stamp->tv_sec;
+		mip6_report_rl.stamp.tv_usec = stamp->tv_usec;
+		mip6_report_rl.iif = iif;
+		ipv6_addr_copy(&mip6_report_rl.src, src);
+		ipv6_addr_copy(&mip6_report_rl.dst, dst);
+		allow = 1;
+	}
+	spin_unlock_bh(&mip6_report_rl.lock);
+	return allow;
+}
+
+/*
+ * xfrm reject handler: report a rejected packet to the key manager via
+ * km_report().
+ *
+ * Nothing is reported for Mobility Header packets with a type up to
+ * IP6_MH_TYPE_MAX, nor for packets suppressed by mip6_report_rl_allow().
+ *
+ * Returns 0 when nothing was reported, otherwise the km_report() result.
+ */
+static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
+			       const struct flowi *fl)
+{
+	struct net *net = xs_net(x);
+	struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+	const struct flowi6 *fl6 = &fl->u.ip6;
+	struct ipv6_destopt_hao *hao = NULL;
+	struct xfrm_selector sel;
+	int offset;
+	struct timeval stamp;
+	int err = 0;
+
+	if (unlikely(fl6->flowi6_proto == IPPROTO_MH &&
+		     fl6->fl6_mh_type <= IP6_MH_TYPE_MAX))
+		goto out;
+
+	/* Locate the Home Address Option, if the packet carried one. */
+	if (likely(opt->dsthao)) {
+		offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
+		if (likely(offset >= 0))
+			hao = (struct ipv6_destopt_hao *)
+					(skb_network_header(skb) + offset);
+	}
+
+	skb_get_timestamp(skb, &stamp);
+
+	/* The limiter keys on the HAO address when present, else on saddr. */
+	if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr,
+				  hao ? &hao->addr : &ipv6_hdr(skb)->saddr,
+				  opt->iif))
+		goto out;
+
+	/* Build an exact-match (/128) selector describing this packet. */
+	memset(&sel, 0, sizeof(sel));
+	memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+	       sizeof(sel.daddr));
+	sel.prefixlen_d = 128;
+	memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
+	       sizeof(sel.saddr));
+	sel.prefixlen_s = 128;
+	sel.family = AF_INET6;
+	sel.proto = fl6->flowi6_proto;
+	sel.dport = xfrm_flowi_dport(fl, &fl6->uli);
+	if (sel.dport)
+		sel.dport_mask = htons(~0);
+	sel.sport = xfrm_flowi_sport(fl, &fl6->uli);
+	if (sel.sport)
+		sel.sport_mask = htons(~0);
+	sel.ifindex = fl6->flowi6_oif;
+
+	err = km_report(net, IPPROTO_DSTOPTS, &sel,
+			(hao ? (xfrm_address_t *)&hao->addr : NULL));
+
+ out:
+	return err;
+}
+
+/*
+ * Walk the IPv6 extension header chain in the linear part of @skb and
+ * return the offset (from the network header) at which the walk stops.
+ * *@nexthdr is left pointing at the next-header byte that selects the
+ * header at the returned offset.
+ *
+ * The walk stops at:
+ *   - the first header that is not hop-by-hop, routing or destination
+ *     options;
+ *   - a destination options header that already contains a Home Address
+ *     Option, or that follows a routing header;
+ *   - the point where fewer than sizeof(struct ipv6_opt_hdr) bytes remain,
+ *     so the length byte of an extension header is never read past the
+ *     end of the packet data.
+ *
+ * @x is unused.
+ */
+static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
+			       u8 **nexthdr)
+{
+	/* unsigned int rather than u16: a long header chain must not wrap. */
+	unsigned int offset = sizeof(struct ipv6hdr);
+	struct ipv6_opt_hdr *exthdr;
+	const unsigned char *nh = skb_network_header(skb);
+	unsigned int packet_len = skb->tail - skb->network_header;
+	int found_rhdr = 0;
+
+	*nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+	while (offset + 1 <= packet_len) {
+
+		switch (**nexthdr) {
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+			/*
+			 * HAO MUST NOT appear more than once.
+			 * XXX: It is better to try to find by the end of
+			 * XXX: packet if HAO exists.
+			 */
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
+				LIMIT_NETDEBUG(KERN_WARNING "mip6: hao exists already, override\n");
+				return offset;
+			}
+
+			if (found_rhdr)
+				return offset;
+
+			break;
+		default:
+			return offset;
+		}
+
+		/*
+		 * ipv6_optlen() reads the header's length byte; make sure the
+		 * whole fixed part of the header lies inside the packet first.
+		 */
+		if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
+			break;
+
+		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+	}
+
+	return offset;
+}
+
+/*
+ * Validate a new destination-options state and set its header length.
+ *
+ * Returns -EINVAL if the SPI is non-zero or the mode is not
+ * XFRM_MODE_ROUTEOPTIMIZATION, otherwise 0.
+ */
+static int mip6_destopt_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __func__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		return -EINVAL;
+	}
+
+	/* Header + padding + Home Address Option, as laid out on output. */
+	x->props.header_len = sizeof(struct ipv6_destopt_hdr) +
+		calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) +
+		sizeof(struct ipv6_destopt_hao);
+	WARN_ON(x->props.header_len != 24);
+
+	return 0;
+}
+
+/*
+ * No-op destructor: unlike the IPsec protocols, the destination options
+ * header needs no specific operation when a state is destroyed.
+ */
+static void mip6_destopt_destroy(struct xfrm_state *x)
+{
+}
+
+/*
+ * xfrm type descriptor for IPPROTO_DSTOPTS; registered for AF_INET6 in
+ * mip6_init().
+ */
+static const struct xfrm_type mip6_destopt_type =
+{
+	.description	= "MIP6DESTOPT",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_DSTOPTS,
+	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
+	.init_state	= mip6_destopt_init_state,
+	.destructor	= mip6_destopt_destroy,
+	.input		= mip6_destopt_input,
+	.output		= mip6_destopt_output,
+	.reject		= mip6_destopt_reject,
+	.hdr_offset	= mip6_destopt_offset,
+};
+
+/*
+ * xfrm input handler for the type 2 routing header found at skb->data.
+ *
+ * Returns the header's next-header value, or -ENOENT when the state has a
+ * non-zero care-of address that differs from the packet's destination
+ * address.
+ */
+static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data;
+	int err = rt2->rt_hdr.nexthdr;
+
+	/* x->coaddr is read under the state lock. */
+	spin_lock(&x->lock);
+	if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		err = -ENOENT;
+	spin_unlock(&x->lock);
+
+	return err;
+}
+
+/*
+ * Fill in a type 2 Routing Header.
+ *
+ * The packet's current destination address is saved in the routing header
+ * and the IP header's destination is replaced with the state's care-of
+ * address (x->coaddr).  Always returns 0.
+ */
+static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	struct rt2_hdr *rt2;
+	u8 nexthdr;
+
+	skb_push(skb, -skb_network_offset(skb));
+	iph = ipv6_hdr(skb);
+
+	/*
+	 * The byte at skb_mac_header() is treated as the preceding header's
+	 * next-header field (set up by the caller, not visible here): take
+	 * over its old value and point it at the new header.
+	 */
+	nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_ROUTING;
+
+	rt2 = (struct rt2_hdr *)skb_transport_header(skb);
+	rt2->rt_hdr.nexthdr = nexthdr;
+	rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1;
+	rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2;
+	rt2->rt_hdr.segments_left = 1;
+	memset(&rt2->reserved, 0, sizeof(rt2->reserved));
+
+	WARN_ON(rt2->rt_hdr.hdrlen != 2);
+
+	memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr));
+	spin_lock_bh(&x->lock);
+	memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr));
+	spin_unlock_bh(&x->lock);
+
+	return 0;
+}
+
+/*
+ * Walk the IPv6 extension header chain in the linear part of @skb and
+ * return the offset (from the network header) at which the walk stops.
+ * *@nexthdr is left pointing at the next-header byte that selects the
+ * header at the returned offset.
+ *
+ * The walk stops at:
+ *   - the first header that is not hop-by-hop, routing or destination
+ *     options;
+ *   - a routing header whose type is non-zero;
+ *   - a destination options header that contains a Home Address Option,
+ *     or that follows a routing header;
+ *   - the point where fewer than sizeof(struct ipv6_opt_hdr) bytes remain,
+ *     so the length byte of an extension header is never read past the
+ *     end of the packet data.
+ *
+ * @x is unused.
+ */
+static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
+			     u8 **nexthdr)
+{
+	/* unsigned int rather than u16: a long header chain must not wrap. */
+	unsigned int offset = sizeof(struct ipv6hdr);
+	struct ipv6_opt_hdr *exthdr;
+	const unsigned char *nh = skb_network_header(skb);
+	unsigned int packet_len = skb->tail - skb->network_header;
+	int found_rhdr = 0;
+
+	*nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+	while (offset + 1 <= packet_len) {
+
+		switch (**nexthdr) {
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			/* The type field is the third byte of the header. */
+			if (offset + 3 <= packet_len) {
+				struct ipv6_rt_hdr *rt;
+				rt = (struct ipv6_rt_hdr *)(nh + offset);
+				if (rt->type != 0)
+					return offset;
+			}
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+				return offset;
+
+			if (found_rhdr)
+				return offset;
+
+			break;
+		default:
+			return offset;
+		}
+
+		/*
+		 * ipv6_optlen() reads the header's length byte; make sure the
+		 * whole fixed part of the header lies inside the packet first.
+		 */
+		if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
+			break;
+
+		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+	}
+
+	return offset;
+}
+
+/*
+ * Validate a new routing-header state and set its header length to the
+ * size of a type 2 routing header.
+ *
+ * Returns -EINVAL if the SPI is non-zero or the mode is not
+ * XFRM_MODE_ROUTEOPTIMIZATION, otherwise 0.
+ */
+static int mip6_rthdr_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __func__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		return -EINVAL;
+	}
+
+	x->props.header_len = sizeof(struct rt2_hdr);
+
+	return 0;
+}
+
+/*
+ * No-op destructor: unlike the IPsec protocols, the type 2 routing header
+ * needs no specific operation when a state is destroyed.
+ */
+static void mip6_rthdr_destroy(struct xfrm_state *x)
+{
+}
+
+/*
+ * xfrm type descriptor for IPPROTO_ROUTING; registered for AF_INET6 in
+ * mip6_init().  No reject handler is set for this type.
+ */
+static const struct xfrm_type mip6_rthdr_type =
+{
+	.description	= "MIP6RT",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_ROUTING,
+	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
+	.init_state	= mip6_rthdr_init_state,
+	.destructor	= mip6_rthdr_destroy,
+	.input		= mip6_rthdr_input,
+	.output		= mip6_rthdr_output,
+	.hdr_offset	= mip6_rthdr_offset,
+};
+
+/*
+ * Module init: register both xfrm types and the rawv6 MH filter.
+ *
+ * Returns 0 on success.  On any failure the registrations made so far are
+ * undone in reverse order and -EAGAIN is returned (the underlying error
+ * code is not propagated).
+ */
+static int __init mip6_init(void)
+{
+	printk(KERN_INFO "Mobile IPv6\n");
+
+	if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __func__);
+		goto mip6_destopt_xfrm_fail;
+	}
+	if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __func__);
+		goto mip6_rthdr_xfrm_fail;
+	}
+	if (rawv6_mh_filter_register(mip6_mh_filter) < 0) {
+		printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __func__);
+		goto mip6_rawv6_mh_fail;
+	}
+
+
+	return 0;
+
+ mip6_rawv6_mh_fail:
+	xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ mip6_rthdr_xfrm_fail:
+	xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
+ mip6_destopt_xfrm_fail:
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: undo mip6_init() in reverse order.  Failures are only
+ * logged; teardown continues with the remaining steps.
+ */
+static void __exit mip6_fini(void)
+{
+	if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
+		printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __func__);
+	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __func__);
+	if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __func__);
+}
+
+module_init(mip6_init);
+module_exit(mip6_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS);
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
new file mode 100644
index 00000000..10a8d411
--- /dev/null
+++ b/net/ipv6/ndisc.c
@@ -0,0 +1,1896 @@
+/*
+ *	Neighbour Discovery for IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *	Mike Shaver		<shaver@ingenia.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *	Changes:
+ *
+ *	Pierre Ynard			:	export userland ND options
+ *						through netlink (RDNSS support)
+ *	Lars Fenneberg			:	fixed MTU setting on receipt
+ *						of an RA.
+ *	Janos Farkas			:	kmalloc failure checks
+ *	Alexey Kuznetsov		:	state machine reworked
+ *						and moved to net/core.
+ *	Pekka Savola			:	RFC2461 validation
+ *	YOSHIFUJI Hideaki @USAGI	:	Verify ND options properly
+ */
+
+/* Set to 3 to get tracing... */
+#define ND_DEBUG 1
+
+#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0)
+#define ND_NOPRINTK(x...) do { ; } while(0)
+#define ND_PRINTK0 ND_PRINTK
+#define ND_PRINTK1 ND_NOPRINTK
+#define ND_PRINTK2 ND_NOPRINTK
+#define ND_PRINTK3 ND_NOPRINTK
+#if ND_DEBUG >= 1
+#undef ND_PRINTK1
+#define ND_PRINTK1 ND_PRINTK
+#endif
+#if ND_DEBUG >= 2
+#undef ND_PRINTK2
+#define ND_PRINTK2 ND_PRINTK
+#endif
+#if ND_DEBUG >= 3
+#undef ND_PRINTK3
+#define ND_PRINTK3 ND_PRINTK
+#endif
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/sched.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/route.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/jhash.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/icmp.h>
+
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <net/flow.h>
+#include <net/ip6_checksum.h>
+#include <net/inet_common.h>
+#include <linux/proc_fs.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+static u32 ndisc_hash(const void *pkey,
+		      const struct net_device *dev,
+		      __u32 rnd);
+static int ndisc_constructor(struct neighbour *neigh);
+static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static int pndisc_constructor(struct pneigh_entry *n);
+static void pndisc_destructor(struct pneigh_entry *n);
+static void pndisc_redo(struct sk_buff *skb);
+
+/*
+ * Neighbour ops chosen by ndisc_constructor() for devices that have
+ * header_ops but no header cache callback.
+ */
+static const struct neigh_ops ndisc_generic_ops = {
+	.family =		AF_INET6,
+	.solicit =		ndisc_solicit,
+	.error_report =		ndisc_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_connected_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * Neighbour ops chosen by ndisc_constructor() for devices whose header_ops
+ * provide a cache callback.  Differs from ndisc_generic_ops only in
+ * .connected_output.
+ */
+static const struct neigh_ops ndisc_hh_ops = {
+	.family =		AF_INET6,
+	.solicit =		ndisc_solicit,
+	.error_report =		ndisc_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_resolve_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+
+/*
+ * Neighbour ops chosen by ndisc_constructor() for devices without
+ * header_ops: every output path goes straight to dev_queue_xmit() and no
+ * solicit/error_report callbacks are set.
+ */
+static const struct neigh_ops ndisc_direct_ops = {
+	.family =		AF_INET6,
+	.output =		dev_queue_xmit,
+	.connected_output =	dev_queue_xmit,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+/*
+ * The IPv6 neighbour table.  Entries are keyed by a struct in6_addr, which
+ * is stored directly behind struct neighbour (see .entry_size/.key_len).
+ * .parms holds the table's default parameters; time values are in jiffies.
+ */
+struct neigh_table nd_tbl = {
+	.family =	AF_INET6,
+	.entry_size =	sizeof(struct neighbour) + sizeof(struct in6_addr),
+	.key_len =	sizeof(struct in6_addr),
+	.hash =		ndisc_hash,
+	.constructor =	ndisc_constructor,
+	.pconstructor =	pndisc_constructor,
+	.pdestructor =	pndisc_destructor,
+	.proxy_redo =	pndisc_redo,
+	.id =		"ndisc_cache",
+	.parms = {
+		.tbl			= &nd_tbl,
+		.base_reachable_time	= ND_REACHABLE_TIME,
+		.retrans_time		= ND_RETRANS_TIMER,
+		.gc_staletime		= 60 * HZ,
+		.reachable_time		= ND_REACHABLE_TIME,
+		.delay_probe_time	= 5 * HZ,
+		.queue_len		= 3,
+		.ucast_probes		= 3,
+		.mcast_probes		= 3,
+		.anycast_delay		= 1 * HZ,
+		.proxy_delay		= (8 * HZ) / 10,
+		.proxy_qlen		= 64,
+	},
+	.gc_interval =	  30 * HZ,
+	.gc_thresh1 =	 128,
+	.gc_thresh2 =	 512,
+	.gc_thresh3 =	1024,
+};
+
+/*
+ * ND options: pointers into a message's option area, filled in by
+ * ndisc_parse_options().
+ *
+ * nd_opt_array is indexed by option type.  For option kinds that may
+ * repeat (prefix info, route info, user options) the first and the last
+ * occurrence are recorded; the "_end" pointers refer to the last matching
+ * option itself, not to one past it.
+ */
+struct ndisc_options {
+	struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	struct nd_opt_hdr *nd_opts_ri;
+	struct nd_opt_hdr *nd_opts_ri_end;
+#endif
+	struct nd_opt_hdr *nd_useropts;
+	struct nd_opt_hdr *nd_useropts_end;
+};
+
+#define nd_opts_src_lladdr	nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
+#define nd_opts_tgt_lladdr	nd_opt_array[ND_OPT_TARGET_LL_ADDR]
+#define nd_opts_pi		nd_opt_array[ND_OPT_PREFIX_INFO]
+#define nd_opts_pi_end		nd_opt_array[__ND_OPT_PREFIX_INFO_END]
+#define nd_opts_rh		nd_opt_array[ND_OPT_REDIRECT_HDR]
+#define nd_opts_mtu		nd_opt_array[ND_OPT_MTU]
+
+#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
+
+/*
+ * Number of padding bytes placed between the option length field and the
+ * start of the link-layer address for the given device @type.
+ *
+ * Only IP-over-InfiniBand currently needs this (2 bytes).  Should RFC 3831
+ * IPv6-over-Fibre Channel ever be implemented, it may need a pad of 2 as
+ * well.
+ */
+static int ndisc_addr_option_pad(unsigned short type)
+{
+	if (type == ARPHRD_INFINIBAND)
+		return 2;
+
+	return 0;
+}
+
+/*
+ * Size in bytes of a link-layer address option for @dev: 2 header bytes
+ * plus any pre-pad plus the address, rounded up to a multiple of 8.
+ */
+static inline int ndisc_opt_addr_space(struct net_device *dev)
+{
+	return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type));
+}
+
+/*
+ * Write a link-layer address option at @opt.
+ *
+ * @opt:       destination; must have room for
+ *             NDISC_OPT_SPACE(@data_len + pad) bytes
+ * @type:      ND option type
+ * @data:      link-layer address to copy
+ * @data_len:  length of @data in bytes
+ * @addr_type: device type, used to look up the pre-pad
+ *
+ * Layout: type, length (in units of 8 bytes), pre-pad, address, then zero
+ * fill up to the next multiple of 8.  The pre-pad is included when sizing
+ * the option, so the size agrees with ndisc_opt_addr_space() (used to
+ * allocate the buffer) and ndisc_opt_addr_data() (used to validate
+ * received options).
+ *
+ * Returns a pointer just past the option.
+ */
+static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len,
+				  unsigned short addr_type)
+{
+	int pad   = ndisc_addr_option_pad(addr_type);
+	int space = NDISC_OPT_SPACE(data_len + pad);
+
+	opt[0] = type;
+	opt[1] = space>>3;
+
+	memset(opt + 2, 0, pad);
+	opt   += pad;
+	space -= pad;
+
+	memcpy(opt+2, data, data_len);
+	data_len += 2;
+	opt += data_len;
+	/* Zero whatever is left between the address and the option end. */
+	if ((space -= data_len) > 0)
+		memset(opt, 0, space);
+	return opt + space;
+}
+
+/*
+ * Starting after @cur, find the next option with the same type as @cur.
+ *
+ * @end is the last option that may be returned (it is itself a candidate,
+ * not a one-past-the-end marker).  Returns NULL if @cur or @end is NULL,
+ * if @cur is already at or beyond @end, or if no further option of that
+ * type is found up to @end.
+ */
+static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
+					    struct nd_opt_hdr *end)
+{
+	int type;
+	if (!cur || !end || cur >= end)
+		return NULL;
+	type = cur->nd_opt_type;
+	do {
+		/* nd_opt_len counts units of 8 bytes. */
+		cur = ((void *)cur) + (cur->nd_opt_len << 3);
+	} while(cur < end && cur->nd_opt_type != type);
+	return cur <= end && cur->nd_opt_type == type ? cur : NULL;
+}
+
+/*
+ * Non-zero if @opt is treated as a "user option" by the option parser;
+ * currently only ND_OPT_RDNSS qualifies.
+ */
+static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
+{
+	return opt->nd_opt_type == ND_OPT_RDNSS;
+}
+
+/*
+ * Starting after @cur, find the next option accepted by
+ * ndisc_is_useropt().
+ *
+ * @end is the last option that may be returned (it is itself a
+ * candidate).  Returns NULL if @cur or @end is NULL, if @cur is already at
+ * or beyond @end, or if no further user option is found up to @end.
+ */
+static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
+					     struct nd_opt_hdr *end)
+{
+	if (!cur || !end || cur >= end)
+		return NULL;
+	do {
+		/* nd_opt_len counts units of 8 bytes. */
+		cur = ((void *)cur) + (cur->nd_opt_len << 3);
+	} while(cur < end && !ndisc_is_useropt(cur));
+	return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
+}
+
+/*
+ * Parse the @opt_len bytes of ND options at @opt into @ndopts.
+ *
+ * @ndopts is zeroed first.  For the source/target link-layer address, MTU
+ * and redirect header options only the first occurrence is recorded.  For
+ * prefix info, route info (if configured) and user options the first and
+ * last occurrences are recorded.  Other option types are skipped.
+ *
+ * Returns @ndopts on success.  Returns NULL if an argument is invalid, if
+ * an option has length 0, or if an option extends beyond the remaining
+ * @opt_len.
+ */
+static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
+						 struct ndisc_options *ndopts)
+{
+	struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
+
+	if (!nd_opt || opt_len < 0 || !ndopts)
+		return NULL;
+	memset(ndopts, 0, sizeof(*ndopts));
+	while (opt_len) {
+		int l;
+		if (opt_len < sizeof(struct nd_opt_hdr))
+			return NULL;
+		/* Option length is expressed in units of 8 bytes. */
+		l = nd_opt->nd_opt_len << 3;
+		if (opt_len < l || l == 0)
+			return NULL;
+		switch (nd_opt->nd_opt_type) {
+		case ND_OPT_SOURCE_LL_ADDR:
+		case ND_OPT_TARGET_LL_ADDR:
+		case ND_OPT_MTU:
+		case ND_OPT_REDIRECT_HDR:
+			if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
+				ND_PRINTK2(KERN_WARNING
+					   "%s(): duplicated ND6 option found: type=%d\n",
+					   __func__,
+					   nd_opt->nd_opt_type);
+			} else {
+				ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
+			}
+			break;
+		case ND_OPT_PREFIX_INFO:
+			/* Track last occurrence always, first only once. */
+			ndopts->nd_opts_pi_end = nd_opt;
+			if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
+				ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
+			break;
+#ifdef CONFIG_IPV6_ROUTE_INFO
+		case ND_OPT_ROUTE_INFO:
+			ndopts->nd_opts_ri_end = nd_opt;
+			if (!ndopts->nd_opts_ri)
+				ndopts->nd_opts_ri = nd_opt;
+			break;
+#endif
+		default:
+			if (ndisc_is_useropt(nd_opt)) {
+				ndopts->nd_useropts_end = nd_opt;
+				if (!ndopts->nd_useropts)
+					ndopts->nd_useropts = nd_opt;
+			} else {
+				/*
+				 * Unknown options must be silently ignored,
+				 * to accommodate future extension to the
+				 * protocol.
+				 */
+				ND_PRINTK2(KERN_NOTICE
+					   "%s(): ignored unsupported option; type=%d, len=%d\n",
+					   __func__,
+					   nd_opt->nd_opt_type, nd_opt->nd_opt_len);
+			}
+		}
+		opt_len -= l;
+		nd_opt = ((void *)nd_opt) + l;
+	}
+	return ndopts;
+}
+
+/*
+ * Return a pointer to the link-layer address inside option @p, skipping
+ * the device-specific pre-pad.  Returns NULL if the option's length does
+ * not exactly match what is expected for @dev's address length.
+ */
+static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
+				      struct net_device *dev)
+{
+	u8 *lladdr = (u8 *)(p + 1);
+	int lladdrlen = p->nd_opt_len << 3;
+	int prepad = ndisc_addr_option_pad(dev->type);
+	if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad))
+		return NULL;
+	return lladdr + prepad;
+}
+
+/*
+ * Map the IPv6 multicast address @addr to a link-layer address for @dev
+ * and store it in @buf, using the mapping helper for the device type.
+ *
+ * For device types without a dedicated mapping, the device broadcast
+ * address is used when @dir is non-zero.
+ *
+ * Returns 0 on success, the helper's result for ARPHRD_IPGRE, or -EINVAL
+ * for an unhandled device type with @dir == 0.
+ */
+int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
+{
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_IEEE802:	/* Not sure. Check it later. --ANK */
+	case ARPHRD_FDDI:
+		ipv6_eth_mc_map(addr, buf);
+		return 0;
+	case ARPHRD_IEEE802_TR:
+		ipv6_tr_mc_map(addr,buf);
+		return 0;
+	case ARPHRD_ARCNET:
+		ipv6_arcnet_mc_map(addr, buf);
+		return 0;
+	case ARPHRD_INFINIBAND:
+		ipv6_ib_mc_map(addr, dev->broadcast, buf);
+		return 0;
+	case ARPHRD_IPGRE:
+		return ipv6_ipgre_mc_map(addr, dev->broadcast, buf);
+	default:
+		if (dir) {
+			memcpy(buf, dev->broadcast, dev->addr_len);
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+EXPORT_SYMBOL(ndisc_mc_map);
+
+/*
+ * Hash callback of nd_tbl: XOR-fold the 32-bit words of the IPv6 address
+ * key at @pkey, then mix the result with the device ifindex using
+ * @hash_rnd as the jhash seed.
+ */
+static u32 ndisc_hash(const void *pkey,
+		      const struct net_device *dev,
+		      __u32 hash_rnd)
+{
+	const u32 *word = pkey;
+	const u32 *last = word + sizeof(struct in6_addr) / sizeof(u32);
+	u32 folded = 0;
+
+	while (word < last)
+		folded ^= *word++;
+
+	return jhash_2words(folded, dev->ifindex, hash_rnd);
+}
+
+/*
+ * Constructor callback of nd_tbl: initialise a new neighbour entry.
+ *
+ * Replaces the entry's parms with a clone of the device's nd_parms, sets
+ * the route type, and picks the ops table and output function from the
+ * device's capabilities.
+ *
+ * Returns 0 on success or -EINVAL if the device has no inet6_dev.
+ */
+static int ndisc_constructor(struct neighbour *neigh)
+{
+	struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key;
+	struct net_device *dev = neigh->dev;
+	struct inet6_dev *in6_dev;
+	struct neigh_parms *parms;
+	int is_multicast = ipv6_addr_is_multicast(addr);
+
+	rcu_read_lock();
+	in6_dev = in6_dev_get(dev);
+	if (in6_dev == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	/* Swap the entry's parms for the per-device ones. */
+	parms = in6_dev->nd_parms;
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
+	rcu_read_unlock();
+
+	neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
+	if (!dev->header_ops) {
+		/* No link-layer header: transmit directly. */
+		neigh->nud_state = NUD_NOARP;
+		neigh->ops = &ndisc_direct_ops;
+		neigh->output = neigh->ops->queue_xmit;
+	} else {
+		/*
+		 * In these cases the link-layer address is filled in here
+		 * and the entry is marked NUD_NOARP.
+		 */
+		if (is_multicast) {
+			neigh->nud_state = NUD_NOARP;
+			ndisc_mc_map(addr, neigh->ha, dev, 1);
+		} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+			if (dev->flags&IFF_LOOPBACK)
+				neigh->type = RTN_LOCAL;
+		} else if (dev->flags&IFF_POINTOPOINT) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+		}
+		if (dev->header_ops->cache)
+			neigh->ops = &ndisc_hh_ops;
+		else
+			neigh->ops = &ndisc_generic_ops;
+		if (neigh->nud_state&NUD_VALID)
+			neigh->output = neigh->ops->connected_output;
+		else
+			neigh->output = neigh->ops->output;
+	}
+	in6_dev_put(in6_dev);
+	return 0;
+}
+
+/*
+ * Proxy-entry constructor of nd_tbl: take a reference on the multicast
+ * group that addrconf_addr_solict_mult() derives from the proxied address
+ * on the entry's device.
+ *
+ * Returns 0, or -EINVAL if the entry has no device or the device has no
+ * inet6_dev.  The result of ipv6_dev_mc_inc() is not checked.
+ */
+static int pndisc_constructor(struct pneigh_entry *n)
+{
+	struct in6_addr *addr = (struct in6_addr*)&n->key;
+	struct in6_addr maddr;
+	struct net_device *dev = n->dev;
+
+	if (dev == NULL || __in6_dev_get(dev) == NULL)
+		return -EINVAL;
+	addrconf_addr_solict_mult(addr, &maddr);
+	ipv6_dev_mc_inc(dev, &maddr);
+	return 0;
+}
+
+/*
+ * Proxy-entry destructor of nd_tbl: counterpart of pndisc_constructor().
+ * Drops the reference on the multicast group derived from the proxied
+ * address.  Does nothing if the entry has no device or the device has no
+ * inet6_dev.
+ */
+static void pndisc_destructor(struct pneigh_entry *n)
+{
+	struct in6_addr *addr = (struct in6_addr*)&n->key;
+	struct in6_addr maddr;
+	struct net_device *dev = n->dev;
+
+	if (dev == NULL || __in6_dev_get(dev) == NULL)
+		return;
+	addrconf_addr_solict_mult(addr, &maddr);
+	ipv6_dev_mc_dec(dev, &maddr);
+}
+
+/*
+ * Allocate and fill an skb holding an ND message.
+ *
+ * @dev:    outgoing device
+ * @daddr:  IPv6 destination address
+ * @saddr:  IPv6 source address
+ * @icmp6h: ICMPv6 header template, copied into the packet
+ * @target: optional target address placed right after the ICMPv6 header
+ * @llinfo: option type for a link-layer address option carrying the
+ *          device address, or 0 for none (forced to 0 when the device has
+ *          no link-layer address)
+ *
+ * The ICMPv6 checksum is computed over the finished message.
+ * Returns the skb, or NULL if allocation fails.
+ */
+struct sk_buff *ndisc_build_skb(struct net_device *dev,
+				const struct in6_addr *daddr,
+				const struct in6_addr *saddr,
+				struct icmp6hdr *icmp6h,
+				const struct in6_addr *target,
+				int llinfo)
+{
+	struct net *net = dev_net(dev);
+	struct sock *sk = net->ipv6.ndisc_sk;
+	struct sk_buff *skb;
+	struct icmp6hdr *hdr;
+	int len;
+	int err;
+	u8 *opt;
+
+	if (!dev->addr_len)
+		llinfo = 0;
+
+	/* ICMPv6 payload length: header, optional target, optional option. */
+	len = sizeof(struct icmp6hdr) + (target ? sizeof(*target) : 0);
+	if (llinfo)
+		len += ndisc_opt_addr_space(dev);
+
+	skb = sock_alloc_send_skb(sk,
+				  (MAX_HEADER + sizeof(struct ipv6hdr) +
+				   len + LL_ALLOCATED_SPACE(dev)),
+				  1, &err);
+	if (!skb) {
+		ND_PRINTK0(KERN_ERR
+			   "ICMPv6 ND: %s() failed to allocate an skb, err=%d.\n",
+			   __func__, err);
+		return NULL;
+	}
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len);
+
+	/* The ICMPv6 message starts right after the IPv6 header. */
+	skb->transport_header = skb->tail;
+	skb_put(skb, len);
+
+	hdr = (struct icmp6hdr *)skb_transport_header(skb);
+	memcpy(hdr, icmp6h, sizeof(*hdr));
+
+	opt = skb_transport_header(skb) + sizeof(struct icmp6hdr);
+	if (target) {
+		ipv6_addr_copy((struct in6_addr *)opt, target);
+		opt += sizeof(*target);
+	}
+
+	if (llinfo)
+		ndisc_fill_addr_option(opt, llinfo, dev->dev_addr,
+				       dev->addr_len, dev->type);
+
+	hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len,
+					   IPPROTO_ICMPV6,
+					   csum_partial(hdr,
+							len, 0));
+
+	return skb;
+}
+
+EXPORT_SYMBOL(ndisc_build_skb);
+
+/*
+ * Transmit an ND message previously built by ndisc_build_skb().
+ *
+ * Allocates a dst for @daddr on @dev, runs it through xfrm_lookup(),
+ * attaches it to @skb and sends the packet via the NF_INET_LOCAL_OUT hook.
+ * If the dst cannot be obtained, @skb is freed here.  Output statistics
+ * are updated; the ICMPv6 message counters only when the send succeeds.
+ */
+void ndisc_send_skb(struct sk_buff *skb,
+		    struct net_device *dev,
+		    struct neighbour *neigh,
+		    const struct in6_addr *daddr,
+		    const struct in6_addr *saddr,
+		    struct icmp6hdr *icmp6h)
+{
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	struct net *net = dev_net(dev);
+	struct sock *sk = net->ipv6.ndisc_sk;
+	struct inet6_dev *idev;
+	int err;
+	u8 type;
+
+	type = icmp6h->icmp6_type;
+
+	icmpv6_flow_init(sk, &fl6, type, saddr, daddr, dev->ifindex);
+
+	dst = icmp6_dst_alloc(dev, neigh, daddr);
+	if (!dst) {
+		kfree_skb(skb);
+		return;
+	}
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	skb_dst_set(skb, dst);
+
+	/* Hold the inet6_dev across the send for the statistics below. */
+	idev = in6_dev_get(dst->dev);
+	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
+		      dst_output);
+	if (!err) {
+		ICMP6MSGOUT_INC_STATS(net, idev, type);
+		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+	}
+
+	if (likely(idev != NULL))
+		in6_dev_put(idev);
+}
+
+EXPORT_SYMBOL(ndisc_send_skb);
+
+/*
+ *	Send a Neighbour Discovery packet: build the skb with
+ *	ndisc_build_skb() and hand it to ndisc_send_skb().  Silently does
+ *	nothing if the skb cannot be built.
+ */
+static void __ndisc_send(struct net_device *dev,
+			 struct neighbour *neigh,
+			 const struct in6_addr *daddr,
+			 const struct in6_addr *saddr,
+			 struct icmp6hdr *icmp6h, const struct in6_addr *target,
+			 int llinfo)
+{
+	struct sk_buff *skb;
+
+	skb = ndisc_build_skb(dev, daddr, saddr, icmp6h, target, llinfo);
+	if (!skb)
+		return;
+
+	ndisc_send_skb(skb, dev, neigh, daddr, saddr, icmp6h);
+}
+
+/*
+ * Send a Neighbour Advertisement for @solicited_addr to @daddr.
+ *
+ * @router, @solicited and @override set the corresponding NA flags;
+ * @inc_opt requests a target link-layer address option.
+ *
+ * If @solicited_addr is configured on @dev it is used as the source
+ * address; in that case the override flag is cleared for optimistic
+ * addresses and the device's force_tllao setting can force the option on.
+ * Otherwise a source address is selected with ipv6_dev_get_saddr(), and
+ * nothing is sent if that fails.
+ */
+static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
+			  const struct in6_addr *daddr,
+			  const struct in6_addr *solicited_addr,
+			  int router, int solicited, int override, int inc_opt)
+{
+	struct in6_addr tmpaddr;
+	struct inet6_ifaddr *ifp;
+	const struct in6_addr *src_addr;
+	struct icmp6hdr icmp6h = {
+		.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
+	};
+
+	/* for anycast or proxy, solicited_addr != src_addr */
+	ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1);
+	if (ifp) {
+		src_addr = solicited_addr;
+		if (ifp->flags & IFA_F_OPTIMISTIC)
+			override = 0;
+		inc_opt |= ifp->idev->cnf.force_tllao;
+		in6_ifa_put(ifp);
+	} else {
+		if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
+				       inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs,
+				       &tmpaddr))
+			return;
+		src_addr = &tmpaddr;
+	}
+
+	icmp6h.icmp6_router = router;
+	icmp6h.icmp6_solicited = solicited;
+	icmp6h.icmp6_override = override;
+
+	__ndisc_send(dev, neigh, daddr, src_addr,
+		     &icmp6h, solicited_addr,
+		     inc_opt ? ND_OPT_TARGET_LL_ADDR : 0);
+}
+
+/*
+ * Send an unsolicited Neighbour Advertisement for every address on @dev.
+ *
+ * Each NA goes to the multicast address that addrconf_addr_solict_mult()
+ * derives from the advertised address, with the override flag set, the
+ * solicited flag clear, the router flag reflecting the forwarding
+ * setting, and a target link-layer address option requested.  Does
+ * nothing if the device has no inet6_dev.
+ */
+static void ndisc_send_unsol_na(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+	struct inet6_ifaddr *ifa;
+	struct in6_addr mcaddr;
+
+	idev = in6_dev_get(dev);
+	if (!idev)
+		return;
+
+	/* The address list is walked under the inet6_dev read lock. */
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		addrconf_addr_solict_mult(&ifa->addr, &mcaddr);
+		ndisc_send_na(dev, NULL, &mcaddr, &ifa->addr,
+			      /*router=*/ !!idev->cnf.forwarding,
+			      /*solicited=*/ false, /*override=*/ true,
+			      /*inc_opt=*/ true);
+	}
+	read_unlock_bh(&idev->lock);
+
+	in6_dev_put(idev);
+}
+
+/*
+ * Send a Neighbour Solicitation for @solicit to @daddr.
+ *
+ * If @saddr is NULL, a link-local address of @dev is looked up with
+ * ipv6_get_lladdr() (passing the tentative/optimistic flag mask) and
+ * nothing is sent when that fails.  A source link-layer address option is
+ * included unless the source address is the unspecified address.
+ */
+void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
+		   const struct in6_addr *solicit,
+		   const struct in6_addr *daddr, const struct in6_addr *saddr)
+{
+	struct in6_addr addr_buf;
+	struct icmp6hdr icmp6h = {
+		.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
+	};
+
+	if (saddr == NULL) {
+		if (ipv6_get_lladdr(dev, &addr_buf,
+				   (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
+			return;
+		saddr = &addr_buf;
+	}
+
+	__ndisc_send(dev, neigh, daddr, saddr,
+		     &icmp6h, solicit,
+		     !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0);
+}
+
+/*
+ * Send a Router Solicitation from @saddr to @daddr on @dev.
+ *
+ * A source link-layer address option is included when the device has a
+ * link-layer address.  With CONFIG_IPV6_OPTIMISTIC_DAD the option is
+ * additionally suppressed when @saddr is an optimistic address or is not
+ * found on @dev.
+ */
+void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
+		   const struct in6_addr *daddr)
+{
+	struct icmp6hdr icmp6h = {
+		.icmp6_type = NDISC_ROUTER_SOLICITATION,
+	};
+	int send_sllao = dev->addr_len;
+
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	/*
+	 * According to section 2.2 of RFC 4429, we must not
+	 * send router solicitations with a sllao from
+	 * optimistic addresses, but we may send the solicitation
+	 * if we don't include the sllao.  So here we check
+	 * if our address is optimistic, and if so, we
+	 * suppress the inclusion of the sllao.
+	 */
+	if (send_sllao) {
+		struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr,
+							   dev, 1);
+		if (ifp) {
+			if (ifp->flags & IFA_F_OPTIMISTIC)  {
+				send_sllao = 0;
+			}
+			in6_ifa_put(ifp);
+		} else {
+			send_sllao = 0;
+		}
+	}
+#endif
+	__ndisc_send(dev, NULL, daddr, saddr,
+		     &icmp6h, NULL,
+		     send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0);
+}
+
+
+/*
+ * Report a link failure for @skb via dst_link_failure() and then
+ * free the packet.  @neigh is not used.
+ */
+static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+	/*
+	 *	"The sender MUST return an ICMP
+	 *	 destination unreachable"
+	 */
+	dst_link_failure(skb);
+	kfree_skb(skb);
+}
+
+/*
+ * Send the next probe for @neigh.  Called with locked neigh: either
+ * read or both.
+ *
+ * The probe counter is compared against the unicast and application
+ * probe budgets in turn: first unicast solicitations to the target,
+ * then (with CONFIG_ARPD) a notification through neigh_app_ns(), and
+ * finally solicitations to the target's solicited-node multicast
+ * address.  The source address of @skb is reused when it is one of
+ * our own addresses on the device.
+ */
+
+static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	struct net_device *dev = neigh->dev;
+	struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
+	int budget = atomic_read(&neigh->probes);
+	struct in6_addr *src = NULL;
+	struct in6_addr sol_mcast;
+
+	if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1))
+		src = &ipv6_hdr(skb)->saddr;
+
+	budget -= neigh->parms->ucast_probes;
+	if (budget < 0) {
+		if (!(neigh->nud_state & NUD_VALID)) {
+			ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %pI6\n",
+				   __func__, target);
+		}
+		ndisc_send_ns(dev, neigh, target, target, src);
+		return;
+	}
+
+	budget -= neigh->parms->app_probes;
+	if (budget < 0) {
+#ifdef CONFIG_ARPD
+		neigh_app_ns(neigh);
+#endif
+		return;
+	}
+
+	addrconf_addr_solict_mult(target, &sol_mcast);
+	ndisc_send_ns(dev, NULL, target, &sol_mcast, src);
+}
+
+/*
+ * Look up @pkey in the proxy neighbour table for @dev.
+ *
+ * Returns -1 when there is no proxy entry, otherwise 1 or 0 depending
+ * on whether the entry carries NTF_ROUTER.
+ */
+static int pndisc_is_router(const void *pkey,
+			    struct net_device *dev)
+{
+	struct pneigh_entry *pn;
+	int is_router;
+
+	read_lock_bh(&nd_tbl.lock);
+	pn = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
+	is_router = pn ? !!(pn->flags & NTF_ROUTER) : -1;
+	read_unlock_bh(&nd_tbl.lock);
+
+	return is_router;
+}
+
+/*
+ * Handle a received neighbour solicitation.
+ *
+ * Validates the message, handles DAD collisions for our own tentative
+ * or optimistic addresses, and answers with a neighbour advertisement
+ * when the target is one of our addresses, an anycast address, or a
+ * proxied address.  Anycast/proxy replies to multicast solicitations
+ * may be deferred through pneigh_enqueue().
+ */
+static void ndisc_recv_ns(struct sk_buff *skb)
+{
+	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
+	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+	const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+	u8 *lladdr = NULL;
+	u32 ndoptlen = skb->tail - (skb->transport_header +
+				    offsetof(struct nd_msg, opt));
+	struct ndisc_options ndopts;
+	struct net_device *dev = skb->dev;
+	struct inet6_ifaddr *ifp;
+	struct inet6_dev *idev = NULL;
+	struct neighbour *neigh;
+	int dad = ipv6_addr_any(saddr);
+	int inc;
+	int is_router = -1;
+
+	/*
+	 * A message shorter than struct nd_msg has no complete target
+	 * field, and the unsigned ndoptlen above would have wrapped
+	 * around.  Reject it before anything is read.
+	 */
+	if (skb->len < sizeof(struct nd_msg)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NS: packet too short\n");
+		return;
+	}
+
+	if (ipv6_addr_is_multicast(&msg->target)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NS: multicast target address\n");
+		return;
+	}
+
+	/*
+	 * RFC2461 7.1.1:
+	 * DAD has to be destined for solicited node multicast address.
+	 */
+	if (dad &&
+	    !(daddr->s6_addr32[0] == htonl(0xff020000) &&
+	      daddr->s6_addr32[1] == htonl(0x00000000) &&
+	      daddr->s6_addr32[2] == htonl(0x00000001) &&
+	      daddr->s6_addr [12] == 0xff )) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NS: bad DAD packet (wrong destination)\n");
+		return;
+	}
+
+	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NS: invalid ND options\n");
+		return;
+	}
+
+	if (ndopts.nd_opts_src_lladdr) {
+		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
+		if (!lladdr) {
+			ND_PRINTK2(KERN_WARNING
+				   "ICMPv6 NS: invalid link-layer address length\n");
+			return;
+		}
+
+		/* RFC2461 7.1.1:
+		 *	If the IP source address is the unspecified address,
+		 *	there MUST NOT be source link-layer address option
+		 *	in the message.
+		 */
+		if (dad) {
+			ND_PRINTK2(KERN_WARNING
+				   "ICMPv6 NS: bad DAD packet (link-layer address option)\n");
+			return;
+		}
+	}
+
+	/* inc: the solicitation was sent to a multicast destination */
+	inc = ipv6_addr_is_multicast(daddr);
+
+	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
+	if (ifp) {
+
+		if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
+			if (dad) {
+				if (dev->type == ARPHRD_IEEE802_TR) {
+					const unsigned char *sadr;
+					sadr = skb_mac_header(skb);
+					if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 &&
+					    sadr[9] == dev->dev_addr[1] &&
+					    sadr[10] == dev->dev_addr[2] &&
+					    sadr[11] == dev->dev_addr[3] &&
+					    sadr[12] == dev->dev_addr[4] &&
+					    sadr[13] == dev->dev_addr[5]) {
+						/* looped-back to us */
+						goto out;
+					}
+				}
+
+				/*
+				 * We are colliding with another node
+				 * who is doing DAD
+				 * so fail our DAD process
+				 */
+				/*
+				 * NOTE(review): returns without in6_ifa_put();
+				 * presumably addrconf_dad_failure() consumes
+				 * the reference - confirm.
+				 */
+				addrconf_dad_failure(ifp);
+				return;
+			} else {
+				/*
+				 * This is not a dad solicitation.
+				 * If we are an optimistic node,
+				 * we should respond.
+				 * Otherwise, we should ignore it.
+				 */
+				if (!(ifp->flags & IFA_F_OPTIMISTIC))
+					goto out;
+			}
+		}
+
+		idev = ifp->idev;
+	} else {
+		struct net *net = dev_net(dev);
+
+		idev = in6_dev_get(dev);
+		if (!idev) {
+			/* XXX: count this drop? */
+			return;
+		}
+
+		if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
+		    (idev->cnf.forwarding &&
+		     (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) &&
+		     (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
+			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
+			    skb->pkt_type != PACKET_HOST &&
+			    inc != 0 &&
+			    idev->nd_parms->proxy_delay != 0) {
+				/*
+				 * for anycast or proxy,
+				 * sender should delay its response
+				 * by a random time between 0 and
+				 * MAX_ANYCAST_DELAY_TIME seconds.
+				 * (RFC2461) -- yoshfuji
+				 */
+				struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
+				if (n)
+					pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
+				goto out;
+			}
+		} else
+			goto out;
+	}
+
+	/* no proxy entry decided the router flag: use the forwarding setting */
+	if (is_router < 0)
+		is_router = !!idev->cnf.forwarding;
+
+	if (dad) {
+		ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target,
+			      is_router, 0, (ifp != NULL), 1);
+		goto out;
+	}
+
+	if (inc)
+		NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
+	else
+		NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);
+
+	/*
+	 *	update / create cache entry
+	 *	for the source address
+	 */
+	neigh = __neigh_lookup(&nd_tbl, saddr, dev,
+			       !inc || lladdr || !dev->addr_len);
+	if (neigh)
+		neigh_update(neigh, lladdr, NUD_STALE,
+			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
+			     NEIGH_UPDATE_F_OVERRIDE);
+	if (neigh || !dev->header_ops) {
+		ndisc_send_na(dev, neigh, saddr, &msg->target,
+			      is_router,
+			      1, (ifp != NULL && inc), inc);
+		if (neigh)
+			neigh_release(neigh);
+	}
+
+out:
+	/* exactly one of ifp / idev holds a reference taken above */
+	if (ifp)
+		in6_ifa_put(ifp);
+	else
+		in6_dev_put(idev);
+}
+
+/*
+ * Handle a received neighbour advertisement.
+ *
+ * After validation, an advertisement for one of our own addresses
+ * fails DAD when that address is still tentative and the packet was
+ * not looped back; otherwise it is only logged.  For any other target
+ * an existing neighbour cache entry is updated from the advertisement,
+ * and a default route through the sender is removed if the entry lost
+ * its router flag.
+ */
+static void ndisc_recv_na(struct sk_buff *skb)
+{
+	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
+	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+	const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+	u8 *lladdr = NULL;
+	u32 ndoptlen = skb->tail - (skb->transport_header +
+				    offsetof(struct nd_msg, opt));
+	struct ndisc_options ndopts;
+	struct net_device *dev = skb->dev;
+	struct inet6_ifaddr *ifp;
+	struct neighbour *neigh;
+
+	if (skb->len < sizeof(struct nd_msg)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NA: packet too short\n");
+		return;
+	}
+
+	if (ipv6_addr_is_multicast(&msg->target)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NA: target address is multicast.\n");
+		return;
+	}
+
+	if (ipv6_addr_is_multicast(daddr) &&
+	    msg->icmph.icmp6_solicited) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NA: solicited NA is multicasted.\n");
+		return;
+	}
+
+	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
+		/* this is the NA path: label the message accordingly */
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NA: invalid ND option\n");
+		return;
+	}
+	if (ndopts.nd_opts_tgt_lladdr) {
+		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
+		if (!lladdr) {
+			ND_PRINTK2(KERN_WARNING
+				   "ICMPv6 NA: invalid link-layer address length\n");
+			return;
+		}
+	}
+	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
+	if (ifp) {
+		if (skb->pkt_type != PACKET_LOOPBACK
+		    && (ifp->flags & IFA_F_TENTATIVE)) {
+			/*
+			 * NOTE(review): returns without in6_ifa_put();
+			 * presumably addrconf_dad_failure() consumes the
+			 * reference - confirm.
+			 */
+			addrconf_dad_failure(ifp);
+			return;
+		}
+		/* What should we make now? The advertisement
+		   is invalid, but ndisc specs say nothing
+		   about it. It could be misconfiguration, or
+		   an smart proxy agent tries to help us :-)
+
+		   We should not print the error if NA has been
+		   received from loopback - it is just our own
+		   unsolicited advertisement.
+		 */
+		if (skb->pkt_type != PACKET_LOOPBACK)
+			ND_PRINTK1(KERN_WARNING
+			   "ICMPv6 NA: someone advertises our address %pI6 on %s!\n",
+			   &ifp->addr, ifp->idev->dev->name);
+		in6_ifa_put(ifp);
+		return;
+	}
+	neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
+
+	if (neigh) {
+		u8 old_flags = neigh->flags;
+		struct net *net = dev_net(dev);
+
+		if (neigh->nud_state & NUD_FAILED)
+			goto out;
+
+		/*
+		 * Don't update the neighbor cache entry on a proxy NA from
+		 * ourselves because either the proxied node is off link or it
+		 * has already sent a NA to us.
+		 */
+		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
+		    net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp &&
+		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
+			/* XXX: idev->cnf.proxy_ndp */
+			goto out;
+		}
+
+		neigh_update(neigh, lladdr,
+			     msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
+			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
+			     (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
+			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+			     (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0));
+
+		if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
+			/*
+			 * Change: router to host
+			 */
+			struct rt6_info *rt;
+			rt = rt6_get_dflt_router(saddr, dev);
+			if (rt)
+				ip6_del_rt(rt);
+		}
+
+out:
+		neigh_release(neigh);
+	}
+}
+
+/*
+ * Handle a received router solicitation.
+ *
+ * Only acted upon when forwarding is enabled on the receiving device
+ * and the source address is not the unspecified address.  The sender's
+ * neighbour cache entry is created or updated as STALE from the source
+ * link-layer address option, if one is present.
+ */
+static void ndisc_recv_rs(struct sk_buff *skb)
+{
+	struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
+	unsigned long ndoptlen;
+	struct neighbour *neigh;
+	struct inet6_dev *idev;
+	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
+	struct ndisc_options ndopts;
+	u8 *lladdr = NULL;
+
+	if (skb->len < sizeof(*rs_msg))
+		return;
+
+	/* computed only after the length check so it cannot wrap around */
+	ndoptlen = skb->len - sizeof(*rs_msg);
+
+	idev = in6_dev_get(skb->dev);
+	if (!idev) {
+		if (net_ratelimit())
+			ND_PRINTK1("ICMP6 RS: can't find in6 device\n");
+		return;
+	}
+
+	/* Don't accept RS if we're not in router mode */
+	if (!idev->cnf.forwarding)
+		goto out;
+
+	/*
+	 * Don't update NCE if src = ::;
+	 * this implies that the source node has no ip address assigned yet.
+	 */
+	if (ipv6_addr_any(saddr))
+		goto out;
+
+	/* Parse ND options */
+	if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) {
+		/* this is the RS path: label the message accordingly */
+		if (net_ratelimit())
+			ND_PRINTK2("ICMP6 RS: invalid ND option, ignored\n");
+		goto out;
+	}
+
+	if (ndopts.nd_opts_src_lladdr) {
+		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
+					     skb->dev);
+		if (!lladdr)
+			goto out;
+	}
+
+	neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
+	if (neigh) {
+		neigh_update(neigh, lladdr, NUD_STALE,
+			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
+			     NEIGH_UPDATE_F_OVERRIDE|
+			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
+		neigh_release(neigh);
+	}
+out:
+	in6_dev_put(idev);
+}
+
+/*
+ * Forward one option of the router advertisement @ra to userspace as
+ * an RTM_NEWNDUSEROPT netlink message on RTNLGRP_ND_USEROPT.
+ *
+ * The message carries a struct nduseroptmsg, a raw copy of the option
+ * (nd_opt_len is counted in units of 8 bytes) and the RA's source
+ * address as an NDUSEROPT_SRCADDR attribute.  Failures are reported
+ * to the group's listeners through rtnl_set_sk_err().
+ */
+static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
+{
+	struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
+	struct net *net = dev_net(ra->dev);
+	int opt_bytes = opt->nd_opt_len << 3;
+	int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
+				    + opt_bytes);
+	size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr));
+	struct nduseroptmsg *ndmsg;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(msg_size, GFP_ATOMIC);
+	if (!skb) {
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
+	if (!nlh)
+		goto nla_put_failure;
+
+	ndmsg = nlmsg_data(nlh);
+	ndmsg->nduseropt_family = AF_INET6;
+	ndmsg->nduseropt_ifindex = ra->dev->ifindex;
+	ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
+	ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
+	ndmsg->nduseropt_opts_len = opt_bytes;
+
+	/* the option bytes follow the fixed header directly */
+	memcpy(ndmsg + 1, opt, opt_bytes);
+
+	/* NLA_PUT jumps to nla_put_failure when the attribute does not fit */
+	NLA_PUT(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr),
+		&ipv6_hdr(ra)->saddr);
+	nlmsg_end(skb, nlh);
+
+	rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+	err = -EMSGSIZE;
+errout:
+	rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
+}
+
+/*
+ * Tell whether router advertisements are accepted on @in6_dev.
+ *
+ * Returns the accept_ra setting, except that a forwarding interface
+ * yields 0 unless the hybrid mode (accept_ra=2 or above) is selected.
+ */
+static inline int accept_ra(struct inet6_dev *in6_dev)
+{
+	int mode = in6_dev->cnf.accept_ra;
+
+	if (in6_dev->cnf.forwarding && mode < 2)
+		return 0;
+
+	return mode;
+}
+
+/*
+ * Handle a received router advertisement.
+ *
+ * Stages, in order:
+ *  - sanity checks (link-local source, minimum length, option parsing);
+ *  - unless accept_ra() forbids it: interface flags, default router
+ *    entry and its lifetime/preference/hop limit, then the reachable
+ *    and retransmit timers;
+ *  - neighbour cache update for the advertising router;
+ *  - option processing: route information, prefix information, MTU
+ *    and user options.
+ */
+static void ndisc_router_discovery(struct sk_buff *skb)
+{
+	struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
+	struct neighbour *neigh = NULL;
+	struct inet6_dev *in6_dev;
+	struct rt6_info *rt = NULL;
+	int lifetime;
+	struct ndisc_options ndopts;
+	int optlen;
+	unsigned int pref = 0;
+
+	__u8 * opt = (__u8 *)(ra_msg + 1);
+
+	optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg);
+
+	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 RA: source address is not link-local.\n");
+		return;
+	}
+	if (optlen < 0) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 RA: packet too short\n");
+		return;
+	}
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+	if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 RA: from host or unauthorized router\n");
+		return;
+	}
+#endif
+
+	/*
+	 *	set the RA_RECV flag in the interface
+	 */
+
+	in6_dev = in6_dev_get(skb->dev);
+	if (in6_dev == NULL) {
+		ND_PRINTK0(KERN_ERR
+			   "ICMPv6 RA: can't find inet6 device for %s.\n",
+			   skb->dev->name);
+		return;
+	}
+
+	if (!ndisc_parse_options(opt, optlen, &ndopts)) {
+		in6_dev_put(in6_dev);
+		ND_PRINTK2(KERN_WARNING
+			   "ICMP6 RA: invalid ND options\n");
+		return;
+	}
+
+	if (!accept_ra(in6_dev))
+		goto skip_linkparms;
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+	/* skip link-specific parameters from interior routers */
+	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT)
+		goto skip_linkparms;
+#endif
+
+	if (in6_dev->if_flags & IF_RS_SENT) {
+		/*
+		 *	flag that an RA was received after an RS was sent
+		 *	out on this interface.
+		 */
+		in6_dev->if_flags |= IF_RA_RCVD;
+	}
+
+	/*
+	 * Remember the managed/otherconf flags from most recently
+	 * received RA message (RFC 2462) -- yoshfuji
+	 */
+	in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
+				IF_RA_OTHERCONF)) |
+				(ra_msg->icmph.icmp6_addrconf_managed ?
+					IF_RA_MANAGED : 0) |
+				(ra_msg->icmph.icmp6_addrconf_other ?
+					IF_RA_OTHERCONF : 0);
+
+	if (!in6_dev->cnf.accept_ra_defrtr)
+		goto skip_defrtr;
+
+	lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
+
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	pref = ra_msg->icmph.icmp6_router_pref;
+	/* 10b is handled as if it were 00b (medium) */
+	if (pref == ICMPV6_ROUTER_PREF_INVALID ||
+	    !in6_dev->cnf.accept_ra_rtr_pref)
+		pref = ICMPV6_ROUTER_PREF_MEDIUM;
+#endif
+
+	rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev);
+
+	if (rt)
+		neigh = dst_get_neighbour(&rt->dst);
+
+	/* zero lifetime: the sender stops being a default router */
+	if (rt && lifetime == 0) {
+		/* take our own reference; it is dropped at "out" once rt is NULL */
+		neigh_clone(neigh);
+		ip6_del_rt(rt);
+		rt = NULL;
+	}
+
+	if (rt == NULL && lifetime) {
+		ND_PRINTK3(KERN_DEBUG
+			   "ICMPv6 RA: adding default router.\n");
+
+		rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref);
+		if (rt == NULL) {
+			ND_PRINTK0(KERN_ERR
+				   "ICMPv6 RA: %s() failed to add default route.\n",
+				   __func__);
+			in6_dev_put(in6_dev);
+			return;
+		}
+
+		neigh = dst_get_neighbour(&rt->dst);
+		if (neigh == NULL) {
+			ND_PRINTK0(KERN_ERR
+				   "ICMPv6 RA: %s() got default router without neighbour.\n",
+				   __func__);
+			dst_release(&rt->dst);
+			in6_dev_put(in6_dev);
+			return;
+		}
+		neigh->flags |= NTF_ROUTER;
+	} else if (rt) {
+		rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+	}
+
+	if (rt)
+		rt->rt6i_expires = jiffies + (HZ * lifetime);
+
+	if (ra_msg->icmph.icmp6_hop_limit) {
+		in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+		if (rt)
+			dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
+				       ra_msg->icmph.icmp6_hop_limit);
+	}
+
+skip_defrtr:
+
+	/*
+	 *	Update Reachable Time and Retrans Timer
+	 */
+
+	if (in6_dev->nd_parms) {
+		unsigned long rtime = ntohl(ra_msg->retrans_timer);
+
+		/* value is converted from milliseconds to jiffies, floor HZ/10 */
+		if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
+			rtime = (rtime*HZ)/1000;
+			if (rtime < HZ/10)
+				rtime = HZ/10;
+			in6_dev->nd_parms->retrans_time = rtime;
+			in6_dev->tstamp = jiffies;
+			inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+		}
+
+		rtime = ntohl(ra_msg->reachable_time);
+		if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) {
+			rtime = (rtime*HZ)/1000;
+
+			if (rtime < HZ/10)
+				rtime = HZ/10;
+
+			if (rtime != in6_dev->nd_parms->base_reachable_time) {
+				in6_dev->nd_parms->base_reachable_time = rtime;
+				in6_dev->nd_parms->gc_staletime = 3 * rtime;
+				in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
+				in6_dev->tstamp = jiffies;
+				inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+			}
+		}
+	}
+
+skip_linkparms:
+
+	/*
+	 *	Process options.
+	 */
+
+	if (!neigh)
+		neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
+				       skb->dev, 1);
+	if (neigh) {
+		u8 *lladdr = NULL;
+		if (ndopts.nd_opts_src_lladdr) {
+			lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
+						     skb->dev);
+			if (!lladdr) {
+				ND_PRINTK2(KERN_WARNING
+					   "ICMPv6 RA: invalid link-layer address length\n");
+				goto out;
+			}
+		}
+		neigh_update(neigh, lladdr, NUD_STALE,
+			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
+			     NEIGH_UPDATE_F_OVERRIDE|
+			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+			     NEIGH_UPDATE_F_ISROUTER);
+	}
+
+	if (!accept_ra(in6_dev))
+		goto out;
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+	if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) {
+		struct nd_opt_hdr *p;
+		for (p = ndopts.nd_opts_ri;
+		     p;
+		     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
+			struct route_info *ri = (struct route_info *)p;
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+			if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
+			    ri->prefix_len == 0)
+				continue;
+#endif
+			if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
+				continue;
+			rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3,
+				      &ipv6_hdr(skb)->saddr);
+		}
+	}
+#endif
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+	/* skip link-specific ndopts from interior routers */
+	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT)
+		goto out;
+#endif
+
+	if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
+		struct nd_opt_hdr *p;
+		for (p = ndopts.nd_opts_pi;
+		     p;
+		     p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
+			addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3);
+		}
+	}
+
+	if (ndopts.nd_opts_mtu) {
+		__be32 n;
+		u32 mtu;
+
+		/* the 32-bit MTU sits 2 bytes after the option header */
+		memcpy(&n, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
+		mtu = ntohl(n);
+
+		if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
+			/* mtu is unsigned: print it as such */
+			ND_PRINTK2(KERN_WARNING
+				   "ICMPv6 RA: invalid mtu: %u\n",
+				   mtu);
+		} else if (in6_dev->cnf.mtu6 != mtu) {
+			in6_dev->cnf.mtu6 = mtu;
+
+			if (rt)
+				dst_metric_set(&rt->dst, RTAX_MTU, mtu);
+
+			rt6_mtu_change(skb->dev, mtu);
+		}
+	}
+
+	if (ndopts.nd_useropts) {
+		struct nd_opt_hdr *p;
+		for (p = ndopts.nd_useropts;
+		     p;
+		     p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) {
+			ndisc_ra_useropt(skb, p);
+		}
+	}
+
+	if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 RA: invalid RA options\n");
+	}
+out:
+	/* release the route reference if we hold one, else the neighbour's */
+	if (rt)
+		dst_release(&rt->dst);
+	else if (neigh)
+		neigh_release(neigh);
+	in6_dev_put(in6_dev);
+}
+
+/*
+ * Handle a received ICMPv6 redirect.
+ *
+ * The message must come from a link-local source, be long enough to
+ * hold the target and destination addresses, and name a non-multicast
+ * destination.  The target must either equal the destination (on-link
+ * redirect) or be a link-local unicast address.  Redirects are ignored
+ * on forwarding interfaces and when accept_redirects is off.  A valid
+ * redirect is handed to rt6_redirect() together with the target's
+ * neighbour entry.
+ */
+static void ndisc_redirect_rcv(struct sk_buff *skb)
+{
+	struct ndisc_options ndopts;
+	struct inet6_dev *in6_dev;
+	struct neighbour *neigh;
+	struct icmp6hdr *icmph;
+	const struct in6_addr *target;	/* new first hop to destination */
+	const struct in6_addr *dest;
+	u8 *lladdr = NULL;
+	int on_link;
+	int optlen;
+
+#ifdef CONFIG_IPV6_NDISC_NODETYPE
+	switch (skb->ndisc_nodetype) {
+	case NDISC_NODETYPE_HOST:
+	case NDISC_NODETYPE_NODEFAULT:
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: from host or unauthorized router\n");
+		return;
+	}
+#endif
+
+	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: source address is not link-local.\n");
+		return;
+	}
+
+	/* bytes left for options after the header and the two addresses */
+	optlen = skb->tail - skb->transport_header;
+	optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
+
+	if (optlen < 0) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: packet too short\n");
+		return;
+	}
+
+	icmph = icmp6_hdr(skb);
+	target = (const struct in6_addr *) (icmph + 1);
+	dest = target + 1;
+
+	if (ipv6_addr_is_multicast(dest)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: destination address is multicast.\n");
+		return;
+	}
+
+	on_link = ipv6_addr_equal(dest, target) ? 1 : 0;
+	if (!on_link &&
+	    ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: target address is not link-local unicast.\n");
+		return;
+	}
+
+	in6_dev = in6_dev_get(skb->dev);
+	if (!in6_dev)
+		return;
+
+	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
+		goto put_idev;
+
+	/* RFC2461 8.1:
+	 *	The IP source address of the Redirect MUST be the same as the current
+	 *	first-hop router for the specified ICMP Destination Address.
+	 */
+
+	if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: invalid ND options\n");
+		goto put_idev;
+	}
+
+	if (ndopts.nd_opts_tgt_lladdr) {
+		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
+					     skb->dev);
+		if (!lladdr) {
+			ND_PRINTK2(KERN_WARNING
+				   "ICMPv6 Redirect: invalid link-layer address length\n");
+			goto put_idev;
+		}
+	}
+
+	neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
+	if (neigh) {
+		rt6_redirect(dest, &ipv6_hdr(skb)->daddr,
+			     &ipv6_hdr(skb)->saddr, neigh, lladdr,
+			     on_link);
+		neigh_release(neigh);
+	}
+
+put_idev:
+	in6_dev_put(in6_dev);
+}
+
+/*
+ * Send an ICMPv6 redirect to the sender of @skb, telling it that
+ * @target is a better first hop for the packet's destination.
+ *
+ * @neigh is the neighbour entry whose link-layer address, when valid,
+ * goes into the target link-layer address option.  As much of the
+ * triggering packet as fits within IPV6_MIN_MTU is copied into the
+ * redirected-header option.  Redirects are rate limited through the
+ * route's inet_peer entry.
+ */
+void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
+			 const struct in6_addr *target)
+{
+	struct net_device *dev = skb->dev;
+	struct net *net = dev_net(dev);
+	struct sock *sk = net->ipv6.ndisc_sk;
+	int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
+	struct sk_buff *buff;
+	struct icmp6hdr *icmph;
+	struct in6_addr saddr_buf;
+	struct in6_addr *addrp;
+	struct rt6_info *rt;
+	struct dst_entry *dst;
+	struct inet6_dev *idev;
+	struct flowi6 fl6;
+	u8 *opt;
+	int rd_len;
+	int err;
+	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL;
+
+	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: no link-local address on %s\n",
+			   dev->name);
+		return;
+	}
+
+	if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
+	    ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
+		ND_PRINTK2(KERN_WARNING
+			"ICMPv6 Redirect: target address is not link-local unicast.\n");
+		return;
+	}
+
+	icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
+			 &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
+
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst == NULL)
+		return;
+
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst))
+		return;
+
+	rt = (struct rt6_info *) dst;
+
+	if (rt->rt6i_flags & RTF_GATEWAY) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 Redirect: destination is not a neighbour.\n");
+		goto release;
+	}
+	if (!rt->rt6i_peer)
+		rt6_bind_peer(rt, 1);
+	/*
+	 * inet_peer_xrlim_allow() returns true when sending is permitted,
+	 * so give up only when the rate limit has been exceeded.
+	 */
+	if (!inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
+		goto release;
+
+	if (dev->addr_len) {
+		/* copy the hardware address out while holding the lock */
+		read_lock_bh(&neigh->lock);
+		if (neigh->nud_state & NUD_VALID) {
+			memcpy(ha_buf, neigh->ha, dev->addr_len);
+			read_unlock_bh(&neigh->lock);
+			ha = ha_buf;
+			len += ndisc_opt_addr_space(dev);
+		} else
+			read_unlock_bh(&neigh->lock);
+	}
+
+	/*
+	 * Size of the redirected-header option: its 8 byte header plus the
+	 * triggering packet, capped so the redirect fits in IPV6_MIN_MTU
+	 * and rounded down to a multiple of 8.
+	 */
+	rd_len = min_t(unsigned int,
+		     IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8);
+	rd_len &= ~0x7;
+	len += rd_len;
+
+	buff = sock_alloc_send_skb(sk,
+				   (MAX_HEADER + sizeof(struct ipv6hdr) +
+				    len + LL_ALLOCATED_SPACE(dev)),
+				   1, &err);
+	if (buff == NULL) {
+		ND_PRINTK0(KERN_ERR
+			   "ICMPv6 Redirect: %s() failed to allocate an skb, err=%d.\n",
+			   __func__, err);
+		goto release;
+	}
+
+	skb_reserve(buff, LL_RESERVED_SPACE(dev));
+	ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr,
+		   IPPROTO_ICMPV6, len);
+
+	skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data);
+	skb_put(buff, len);
+	icmph = icmp6_hdr(buff);
+
+	memset(icmph, 0, sizeof(struct icmp6hdr));
+	icmph->icmp6_type = NDISC_REDIRECT;
+
+	/*
+	 *	copy target and destination addresses
+	 */
+
+	addrp = (struct in6_addr *)(icmph + 1);
+	ipv6_addr_copy(addrp, target);
+	addrp++;
+	ipv6_addr_copy(addrp, &ipv6_hdr(skb)->daddr);
+
+	opt = (u8*) (addrp + 1);
+
+	/*
+	 *	include target_address option
+	 */
+
+	if (ha)
+		opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha,
+					     dev->addr_len, dev->type);
+
+	/*
+	 *	build redirect option and copy skb over to the new packet.
+	 */
+
+	memset(opt, 0, 8);
+	*(opt++) = ND_OPT_REDIRECT_HDR;
+	*(opt++) = (rd_len >> 3);
+	opt += 6;
+
+	memcpy(opt, ipv6_hdr(skb), rd_len - 8);
+
+	icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr,
+					     len, IPPROTO_ICMPV6,
+					     csum_partial(icmph, len, 0));
+
+	/* from here on the dst reference travels with buff */
+	skb_dst_set(buff, dst);
+	idev = in6_dev_get(dst->dev);
+	/* account the redirect we are sending, not the packet that caused it */
+	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, buff->len);
+	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev,
+		      dst_output);
+	if (!err) {
+		ICMP6MSGOUT_INC_STATS(net, idev, NDISC_REDIRECT);
+		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+	}
+
+	if (likely(idev != NULL))
+		in6_dev_put(idev);
+	return;
+
+release:
+	dst_release(dst);
+}
+
+/*
+ * Run neighbour solicitation processing on @skb and then free it.
+ * NOTE(review): presumably the callback for solicitations deferred
+ * with pneigh_enqueue() in ndisc_recv_ns() - confirm against nd_tbl.
+ */
+static void pndisc_redo(struct sk_buff *skb)
+{
+	ndisc_recv_ns(skb);
+	kfree_skb(skb);
+}
+
+/*
+ * Entry point for received neighbour discovery messages.
+ *
+ * Linearizes the packet, rejects messages whose hop limit is not 255
+ * or whose ICMPv6 code is not 0, clears the neighbour control block
+ * and dispatches on the ICMPv6 type.  Always returns 0.
+ */
+int ndisc_rcv(struct sk_buff *skb)
+{
+	struct nd_msg *nd;
+
+	if (!pskb_may_pull(skb, skb->len))
+		return 0;
+
+	nd = (struct nd_msg *)skb_transport_header(skb);
+
+	/* make skb->data point at the transport header */
+	__skb_push(skb, skb->data - skb_transport_header(skb));
+
+	if (ipv6_hdr(skb)->hop_limit != 255) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NDISC: invalid hop-limit: %d\n",
+			   ipv6_hdr(skb)->hop_limit);
+		return 0;
+	}
+
+	if (nd->icmph.icmp6_code != 0) {
+		ND_PRINTK2(KERN_WARNING
+			   "ICMPv6 NDISC: invalid ICMPv6 code: %d\n",
+			   nd->icmph.icmp6_code);
+		return 0;
+	}
+
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
+	switch (nd->icmph.icmp6_type) {
+	case NDISC_ROUTER_SOLICITATION:
+		ndisc_recv_rs(skb);
+		break;
+	case NDISC_ROUTER_ADVERTISEMENT:
+		ndisc_router_discovery(skb);
+		break;
+	case NDISC_NEIGHBOUR_SOLICITATION:
+		ndisc_recv_ns(skb);
+		break;
+	case NDISC_NEIGHBOUR_ADVERTISEMENT:
+		ndisc_recv_na(skb);
+		break;
+	case NDISC_REDIRECT:
+		ndisc_redirect_rcv(skb);
+		break;
+	default:
+		/* other ICMPv6 types are not handled here */
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Netdevice notifier callback.
+ *
+ * An address change or the device going down flushes the device's
+ * neighbour entries and runs the IPv6 route garbage collector;
+ * NETDEV_NOTIFY_PEERS sends unsolicited neighbour advertisements.
+ * Every event is answered with NOTIFY_DONE.
+ */
+static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+
+	if (event == NETDEV_NOTIFY_PEERS) {
+		ndisc_send_unsol_na(dev);
+		return NOTIFY_DONE;
+	}
+
+	if (event == NETDEV_CHANGEADDR)
+		neigh_changeaddr(&nd_tbl, dev);
+	else if (event == NETDEV_DOWN)
+		neigh_ifdown(&nd_tbl, dev);
+	else
+		return NOTIFY_DONE;
+
+	/* both flush cases are followed by a route gc run */
+	fib6_run_gc(~0UL, net);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that routes netdevice events to ndisc_netdev_event(). */
+static struct notifier_block ndisc_netdev_notifier = {
+	.notifier_call = ndisc_netdev_event,
+};
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Log a warning that the current process uses a deprecated neighbour
+ * sysctl and name the "_ms" replacement.
+ *
+ * At most five warnings are printed in total, and none while the
+ * calling process name equals the one warned about last.
+ */
+static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
+					 const char *func, const char *dev_name)
+{
+	static char warncomm[TASK_COMM_LEN];
+	static int warned;
+
+	if (warned >= 5)
+		return;
+
+	/* same process name as last time: stay quiet */
+	if (!strcmp(warncomm, current->comm))
+		return;
+
+	strcpy(warncomm, current->comm);
+	printk(KERN_WARNING
+		"process `%s' is using deprecated sysctl (%s) "
+		"net.ipv6.neigh.%s.%s; "
+		"Use net.ipv6.neigh.%s.%s_ms "
+		"instead.\n",
+		warncomm, func,
+		dev_name, ctl->procname,
+		dev_name, ctl->procname);
+	warned++;
+}
+
+/*
+ * sysctl handler for the retransmit and base reachable time entries.
+ *
+ * Picks the proc helper that matches the entry's unit (the legacy
+ * names also trigger a deprecation warning) and returns its result,
+ * or -1 for an unknown entry name.  After a successful write to a
+ * per-device entry the randomized reachable time is recomputed when
+ * the base reachable time was the value written, and a RTM_NEWLINK
+ * notification is sent.
+ */
+int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net_device *dev = ctl->extra1;
+	const char *name = ctl->procname;
+	int legacy_retrans = !strcmp(name, "retrans_time");
+	int legacy_reach = !strcmp(name, "base_reachable_time");
+	struct inet6_dev *idev;
+	int ret;
+
+	if (legacy_retrans || legacy_reach)
+		ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
+
+	if (legacy_retrans)
+		ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	else if (legacy_reach)
+		ret = proc_dointvec_jiffies(ctl, write,
+					    buffer, lenp, ppos);
+	else if (!strcmp(name, "retrans_time_ms") ||
+		 !strcmp(name, "base_reachable_time_ms"))
+		ret = proc_dointvec_ms_jiffies(ctl, write,
+					       buffer, lenp, ppos);
+	else
+		ret = -1;
+
+	/* only a successful write to a per-device entry needs follow-up */
+	if (!write || ret != 0 || !dev)
+		return ret;
+
+	idev = in6_dev_get(dev);
+	if (idev == NULL)
+		return ret;
+
+	if (ctl->data == &idev->nd_parms->base_reachable_time)
+		idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time);
+	idev->tstamp = jiffies;
+	inet6_ifinfo_notify(RTM_NEWLINK, idev);
+	in6_dev_put(idev);
+
+	return ret;
+}
+
+
+#endif
+
+/*
+ * Per-namespace setup: create the raw ICMPv6 control socket used for
+ * neighbour discovery, with hop limit 255 and multicast loopback off.
+ *
+ * Returns 0 on success or the error from inet_ctl_sock_create().
+ */
+static int __net_init ndisc_net_init(struct net *net)
+{
+	struct ipv6_pinfo *np;
+	struct sock *sk;
+	int err = inet_ctl_sock_create(&sk, PF_INET6,
+				       SOCK_RAW, IPPROTO_ICMPV6, net);
+
+	if (err < 0) {
+		ND_PRINTK0(KERN_ERR
+			   "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n",
+			   err);
+		return err;
+	}
+
+	np = inet6_sk(sk);
+	np->hop_limit = 255;
+	/* Do not loopback ndisc messages */
+	np->mc_loop = 0;
+
+	net->ipv6.ndisc_sk = sk;
+
+	return 0;
+}
+
+/* Per-namespace teardown: destroy the control socket made by ndisc_net_init(). */
+static void __net_exit ndisc_net_exit(struct net *net)
+{
+	inet_ctl_sock_destroy(net->ipv6.ndisc_sk);
+}
+
+/* Per-network-namespace init/exit hooks for neighbour discovery. */
+static struct pernet_operations ndisc_net_ops = {
+	.init = ndisc_net_init,
+	.exit = ndisc_net_exit,
+};
+
+/*
+ * Module-level setup for neighbour discovery: register the per-net
+ * operations, initialize the neighbour table, register the sysctl
+ * entries (with CONFIG_SYSCTL) and the netdevice notifier.
+ *
+ * Returns 0 on success.  On failure the steps registered so far are
+ * unregistered again and the error code is returned.
+ */
+int __init ndisc_init(void)
+{
+	int err = register_pernet_subsys(&ndisc_net_ops);
+
+	if (err)
+		return err;
+
+	/*
+	 * Initialize the neighbour table
+	 */
+	neigh_table_init(&nd_tbl);
+
+#ifdef CONFIG_SYSCTL
+	err = neigh_sysctl_register(NULL, &nd_tbl.parms, "ipv6",
+				    &ndisc_ifinfo_sysctl_change);
+	if (err)
+		goto out_unregister_pernet;
+#endif
+	err = register_netdevice_notifier(&ndisc_netdev_notifier);
+	if (err)
+		goto out_unregister_sysctl;
+
+	return 0;
+
+out_unregister_sysctl:
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_unregister(&nd_tbl.parms);
+out_unregister_pernet:
+#endif
+	unregister_pernet_subsys(&ndisc_net_ops);
+	return err;
+}
+
+/*
+ * Undo ndisc_init(): unregister the netdevice notifier and the sysctl
+ * entries, clear the neighbour table and unregister the per-net
+ * operations.
+ */
+void ndisc_cleanup(void)
+{
+	unregister_netdevice_notifier(&ndisc_netdev_notifier);
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_unregister(&nd_tbl.parms);
+#endif
+	neigh_table_clear(&nd_tbl);
+	unregister_pernet_subsys(&ndisc_net_ops);
+}
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
new file mode 100644
index 00000000..30fcee46
--- /dev/null
+++ b/net/ipv6/netfilter.c
@@ -0,0 +1,188 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/xfrm.h>
+#include <net/ip6_checksum.h>
+#include <net/netfilter/nf_queue.h>
+
+int ip6_route_me_harder(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct dst_entry *dst;
+	struct flowi6 fl6 = {
+		.flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+		.flowi6_mark = skb->mark,
+		.daddr = iph->daddr,
+		.saddr = iph->saddr,
+	};
+
+	dst = ip6_route_output(net, skb->sk, &fl6);
+	if (dst->error) {
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
+		dst_release(dst);
+		return -EINVAL;
+	}
+
+	/* Drop old route; the lookup result above replaces it. */
+	skb_dst_drop(skb);
+
+	skb_dst_set(skb, dst);
+
+#ifdef CONFIG_XFRM
+	if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
+		skb_dst_set(skb, NULL);
+		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0);
+		if (IS_ERR(dst))
+			return PTR_ERR(dst);	/* keep the xfrm errno */
+		skb_dst_set(skb, dst);
+	}
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL(ip6_route_me_harder);
+
+/*
+ * Extra routing may be needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip6_rt_info {
+	struct in6_addr daddr;
+	struct in6_addr saddr;
+	u_int32_t mark;
+};
+
+static void nf_ip6_saveroute(const struct sk_buff *skb,
+			     struct nf_queue_entry *entry)
+{
+	struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+	if (entry->hook == NF_INET_LOCAL_OUT) {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		rt_info->daddr = iph->daddr;
+		rt_info->saddr = iph->saddr;
+		rt_info->mark = skb->mark;
+	}
+}
+
+static int nf_ip6_reroute(struct sk_buff *skb,
+			  const struct nf_queue_entry *entry)
+{
+	struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+	if (entry->hook == NF_INET_LOCAL_OUT) {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+		    !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
+		    skb->mark != rt_info->mark)
+			return ip6_route_me_harder(skb);
+	}
+	return 0;
+}
+
+static int nf_ip6_route(struct net *net, struct dst_entry **dst,
+			struct flowi *fl, bool strict)
+{
+	static const struct ipv6_pinfo fake_pinfo;
+	static const struct inet_sock fake_sk = {
+		/* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */
+		.sk.sk_bound_dev_if = 1,
+		.pinet6 = (struct ipv6_pinfo *) &fake_pinfo,
+	};
+	const void *sk = strict ? &fake_sk : NULL;
+
+	*dst = ip6_route_output(net, sk, &fl->u.ip6);
+	return (*dst)->error;
+}
+
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+			     unsigned int dataoff, u_int8_t protocol)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+			break;
+		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+				     skb->len - dataoff, protocol,
+				     csum_sub(skb->csum,
+					      skb_checksum(skb, 0,
+							   dataoff, 0)))) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		}
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = ~csum_unfold(
+				csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					     skb->len - dataoff,
+					     protocol,
+					     csum_sub(0,
+						      skb_checksum(skb, 0,
+								   dataoff, 0))));
+		csum = __skb_checksum_complete(skb);
+	}
+	return csum;
+}
+EXPORT_SYMBOL(nf_ip6_checksum);
+
+static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
+				       unsigned int dataoff, unsigned int len,
+				       u_int8_t protocol)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__wsum hsum;
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (len == skb->len - dataoff)	/* whole payload covered */
+			return nf_ip6_checksum(skb, hook, dataoff, protocol);
+		/* fall through */
+	case CHECKSUM_NONE:
+		hsum = skb_checksum(skb, 0, dataoff, 0); /* [0, dataoff) */
+		skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+							 &ip6h->daddr,
+							 skb->len - dataoff,
+							 protocol,
+							 csum_sub(0, hsum)));
+		skb->ip_summed = CHECKSUM_NONE;
+		return __skb_checksum_complete_head(skb, dataoff + len);
+	}
+	return csum;	/* other ip_summed values: report 0 */
+}
+
+static const struct nf_afinfo nf_ip6_afinfo = {
+	.family			= AF_INET6,
+	.checksum		= nf_ip6_checksum,
+	.checksum_partial	= nf_ip6_checksum_partial,
+	.route			= nf_ip6_route,
+	.saveroute		= nf_ip6_saveroute,
+	.reroute		= nf_ip6_reroute,
+	.route_key_size		= sizeof(struct ip6_rt_info),
+};
+
+int __init ipv6_netfilter_init(void)
+{
+	return nf_register_afinfo(&nf_ip6_afinfo);
+}
+
+/* This can be called from inet6_init() on errors, so it cannot
+ * be marked __exit. -DaveM
+ */
+void ipv6_netfilter_fini(void)
+{
+	nf_unregister_afinfo(&nf_ip6_afinfo);
+}
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
new file mode 100644
index 00000000..5bbf5316
--- /dev/null
+++ b/net/ipv6/netfilter/Kconfig
@@ -0,0 +1,224 @@
+#
+# IP netfilter configuration
+#
+
+menu "IPv6: Netfilter Configuration"
+	depends on INET && IPV6 && NETFILTER
+
+config NF_DEFRAG_IPV6
+	tristate
+	default n
+
+config NF_CONNTRACK_IPV6
+	tristate "IPv6 connection tracking support"
+	depends on INET && IPV6 && NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
+	select NF_DEFRAG_IPV6
+	---help---
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections.
+
+	  This is IPv6 support for Layer 3 independent connection tracking.
+	  Layer 3 independent connection tracking is an experimental scheme
+	  which generalizes ip_conntrack to support other layer 3 protocols.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_QUEUE
+	tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
+	depends on INET && IPV6 && NETFILTER
+	depends on NETFILTER_ADVANCED
+	---help---
+
+	  This option adds a queue handler to the kernel for IPv6
+	  packets which enables users to receive the filtered packets
+	  with QUEUE target using libipq.
+
+	  This option enables the old IPv6-only "ip6_queue" implementation
+	  which has been obsoleted by the new "nfnetlink_queue" code (see
+	  CONFIG_NETFILTER_NETLINK_QUEUE).
+
+	  (C) Fernando Anton 2001
+	  IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
+	  Universidad Carlos III de Madrid
+	  Universidad Politecnica de Alcala de Henares
+	  email: <fanton@it.uc3m.es>.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_IPTABLES
+	tristate "IP6 tables support (required for filtering)"
+	depends on INET && IPV6
+	select NETFILTER_XTABLES
+	default m if NETFILTER_ADVANCED=n
+	help
+	  ip6tables is a general, extensible packet identification framework.
+	  Currently only the packet filtering and packet mangling subsystem
+	  for IPv6 use this, but connection tracking is going to follow.
+	  Say 'Y' or 'M' here if you want to use either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP6_NF_IPTABLES
+
+# The simple matches.
+config IP6_NF_MATCH_AH
+	tristate '"ah" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This module allows one to match AH packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_MATCH_EUI64
+	tristate '"eui64" address check'
+	depends on NETFILTER_ADVANCED
+	help
+	  This module performs checking on the IPv6 source address:
+	  it compares the last 64 bits with the EUI64 address
+	  derived from the MAC address.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_MATCH_FRAG
+	tristate '"frag" Fragmentation header match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  frag matching allows you to match packets based on the fragmentation
+	  header of the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_MATCH_OPTS
+	tristate '"hbh" hop-by-hop and "dst" opts header match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This allows one to match packets based on the hop-by-hop
+	  and destination options headers of a packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_MATCH_HL
+	tristate '"hl" hoplimit match support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_MATCH_HL
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_MATCH_HL.
+
+config IP6_NF_MATCH_IPV6HEADER
+	tristate '"ipv6header" IPv6 Extension Headers Match'
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This module allows one to match packets based upon
+	  the ipv6 extension headers.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_MATCH_MH
+	tristate '"mh" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This module allows one to match MH packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_MATCH_RT
+	tristate '"rt" Routing header match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  rt matching allows you to match packets based on the routing
+	  header of the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# The targets
+config IP6_NF_TARGET_HL
+	tristate '"HL" hoplimit target support'
+	depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
+	select NETFILTER_XT_TARGET_HL
+	---help---
+	This is a backwards-compatible option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_TARGET_HL.
+
+config IP6_NF_TARGET_LOG
+	tristate "LOG target support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option adds a `LOG' target, which allows you to create rules in
+	  any iptables table which records the packet header to the syslog.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_FILTER
+	tristate "Packet filtering"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Packet filtering defines a table `filter', which has a series of
+	  rules for simple packet filtering at local input, forwarding and
+	  local output.  See the man page for iptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_TARGET_REJECT
+	tristate "REJECT target support"
+	depends on IP6_NF_FILTER
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The REJECT target allows a filtering rule to specify that an ICMPv6
+	  error should be issued in response to an incoming packet, rather
+	  than silently being dropped.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_TARGET_REJECT_SKERR
+	bool "Force socket error when rejecting with icmp*"
+	depends on IP6_NF_TARGET_REJECT
+	default n
+	help
+	  This option also turns a "--reject-with icmp*" rule into a matching
+	  socket error.
+	  The REJECT target normally allows sending an ICMP message, but it
+	  leaves the local socket unaware of any ingress rejects.
+
+	  If unsure, say N.
+
+config IP6_NF_MANGLE
+	tristate "Packet mangling"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option adds a `mangle' table to iptables: see the man page for
+	  iptables(8).  This table is used for various packet alterations
+	  which can affect how the packet is routed.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP6_NF_RAW
+	tristate  'raw table support (required for TRACE)'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `raw' table to ip6tables. This table is the very
+	  first in the netfilter framework and hooks in at the PREROUTING
+	  and OUTPUT chains.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+# security table for MAC policy
+config IP6_NF_SECURITY
+       tristate "Security table"
+       depends on SECURITY
+       depends on NETFILTER_ADVANCED
+       help
+         This option adds a `security' table to iptables, for use
+         with Mandatory Access Control (MAC) policy.
+        
+         If unsure, say N.
+
+endif # IP6_NF_IPTABLES
+
+endmenu
+
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
new file mode 100644
index 00000000..abfee91c
--- /dev/null
+++ b/net/ipv6/netfilter/Makefile
@@ -0,0 +1,34 @@
+#
+# Makefile for the netfilter modules on top of IPv6.
+#
+
+# Link order matters here.
+obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
+obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
+obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
+obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
+obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
+obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
+
+# l3 independent conntrack
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
+
+# defrag
+nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
+obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
+
+# matches
+obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
+obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
+obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
+obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
+obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o
+obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o
+obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
+
+# targets
+obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
+obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
new file mode 100644
index 00000000..24939486
--- /dev/null
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -0,0 +1,640 @@
+/*
+ * This is a module which is used for queueing IPv6 packets and
+ * communicating with userspace via netlink.
+ *
+ * (C) 2001 Fernando Anton, this code is GPL.
+ *     IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
+ *     Universidad Carlos III de Madrid - Leganes (Madrid) - Spain
+ *     Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain
+ *     email: fanton@it.uc3m.es
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ipv6.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/netfilter/nf_queue.h>
+#include <linux/netfilter_ipv4/ip_queue.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+#define IPQ_QMAX_DEFAULT 1024
+#define IPQ_PROC_FS_NAME "ip6_queue"
+#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
+
+typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
+
+static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
+static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
+static DEFINE_SPINLOCK(queue_lock);
+static int peer_pid __read_mostly;
+static unsigned int copy_range __read_mostly;
+static unsigned int queue_total;
+static unsigned int queue_dropped = 0;
+static unsigned int queue_user_dropped = 0;
+static struct sock *ipqnl __read_mostly;
+static LIST_HEAD(queue_list);
+static DEFINE_MUTEX(ipqnl_mutex);
+
+static inline void
+__ipq_enqueue_entry(struct nf_queue_entry *entry)
+{
+	list_add_tail(&entry->list, &queue_list);
+	queue_total++;	/* caller holds queue_lock */
+}
+
+static inline int
+__ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	int status = 0;
+
+	switch(mode) {
+	case IPQ_COPY_NONE:
+	case IPQ_COPY_META:
+		copy_mode = mode;
+		copy_range = 0;
+		break;
+
+	case IPQ_COPY_PACKET:
+		if (range > 0xFFFF)
+			range = 0xFFFF;
+		copy_range = range;
+		copy_mode = mode;
+		break;
+
+	default:
+		status = -EINVAL;
+
+	}
+	return status;
+}
+
+static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data);
+
+static inline void
+__ipq_reset(void)
+{
+	peer_pid = 0;
+	net_disable_timestamp();
+	__ipq_set_mode(IPQ_COPY_NONE, 0);
+	__ipq_flush(NULL, 0);
+}
+
+static struct nf_queue_entry *
+ipq_find_dequeue_entry(unsigned long id)
+{
+	struct nf_queue_entry *entry = NULL, *i;
+
+	spin_lock_bh(&queue_lock);
+
+	list_for_each_entry(i, &queue_list, list) {
+		if ((unsigned long)i == id) {
+			entry = i;
+			break;
+		}
+	}
+
+	if (entry) {
+		list_del(&entry->list);
+		queue_total--;
+	}
+
+	spin_unlock_bh(&queue_lock);
+	return entry;
+}
+
+static void
+__ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct nf_queue_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &queue_list, list) {
+		if (!cmpfn || cmpfn(entry, data)) {
+			list_del(&entry->list);
+			queue_total--;
+			nf_reinject(entry, NF_DROP);
+		}
+	}
+}
+
+static void
+ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
+{
+	spin_lock_bh(&queue_lock);
+	__ipq_flush(cmpfn, data);
+	spin_unlock_bh(&queue_lock);
+}
+
+static struct sk_buff *
+ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
+{
+	sk_buff_data_t old_tail;
+	size_t size = 0;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct ipq_packet_msg *pmsg;
+	struct nlmsghdr *nlh;
+	struct timeval tv;
+
+	switch (ACCESS_ONCE(copy_mode)) {
+	case IPQ_COPY_META:
+	case IPQ_COPY_NONE:
+		size = NLMSG_SPACE(sizeof(*pmsg));
+		break;
+
+	case IPQ_COPY_PACKET:
+		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
+		    (*errp = skb_checksum_help(entry->skb)))
+			return NULL;
+
+		data_len = ACCESS_ONCE(copy_range);
+		if (data_len == 0 || data_len > entry->skb->len)
+			data_len = entry->skb->len;
+
+		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
+		break;
+
+	default:
+		*errp = -EINVAL;
+		return NULL;
+	}
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+
+	old_tail = skb->tail;
+	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
+	pmsg = NLMSG_DATA(nlh);
+	memset(pmsg, 0, sizeof(*pmsg));
+
+	pmsg->packet_id       = (unsigned long )entry;
+	pmsg->data_len        = data_len;
+	tv = ktime_to_timeval(entry->skb->tstamp);
+	pmsg->timestamp_sec   = tv.tv_sec;
+	pmsg->timestamp_usec  = tv.tv_usec;
+	pmsg->mark            = entry->skb->mark;
+	pmsg->hook            = entry->hook;
+	pmsg->hw_protocol     = entry->skb->protocol;
+
+	if (entry->indev)
+		strcpy(pmsg->indev_name, entry->indev->name);
+	else
+		pmsg->indev_name[0] = '\0';
+
+	if (entry->outdev)
+		strcpy(pmsg->outdev_name, entry->outdev->name);
+	else
+		pmsg->outdev_name[0] = '\0';
+
+	if (entry->indev && entry->skb->dev &&
+	    entry->skb->mac_header != entry->skb->network_header) {
+		pmsg->hw_type = entry->skb->dev->type;
+		pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr);
+	}
+
+	if (data_len)
+		if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
+			BUG();
+
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+	*errp = -EINVAL;
+	printk(KERN_ERR "ip6_queue: error creating packet message\n");
+	return NULL;
+}
+
+static int
+ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+	int status = -EINVAL;
+	struct sk_buff *nskb;
+
+	if (copy_mode == IPQ_COPY_NONE)
+		return -EAGAIN;
+
+	nskb = ipq_build_packet_message(entry, &status);
+	if (nskb == NULL)
+		return status;
+
+	spin_lock_bh(&queue_lock);
+	status = -EINVAL;	/* the build step may have reset it to 0 */
+	if (!peer_pid)		/* no peer: must not report success */
+		goto err_out_free_nskb;
+
+	if (queue_total >= queue_maxlen) {
+		queue_dropped++;
+		status = -ENOSPC;
+		if (net_ratelimit())
+			printk(KERN_WARNING "ip6_queue: fill at %u entries, "
+				"dropping packet(s).  Dropped: %u\n", queue_total,
+				queue_dropped);
+		goto err_out_free_nskb;
+	}
+
+	/* netlink_unicast will either free the nskb or attach it to a socket */
+	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+		queue_user_dropped++;
+		goto err_out_unlock;
+	}
+
+	__ipq_enqueue_entry(entry);
+
+	spin_unlock_bh(&queue_lock);
+	return status;
+
+err_out_free_nskb:
+	kfree_skb(nskb);
+
+err_out_unlock:
+	spin_unlock_bh(&queue_lock);
+	return status;
+}
+
+static int
+ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e)
+{
+	int diff;
+	struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload;
+	struct sk_buff *nskb;
+
+	if (v->data_len < sizeof(*user_iph))
+		return 0;
+	diff = v->data_len - e->skb->len;
+	if (diff < 0) {
+		if (pskb_trim(e->skb, v->data_len))
+			return -ENOMEM;
+	} else if (diff > 0) {
+		if (v->data_len > 0xFFFF)
+			return -EINVAL;
+		if (diff > skb_tailroom(e->skb)) {
+			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
+					       diff, GFP_ATOMIC);
+			if (!nskb) {
+				printk(KERN_WARNING "ip6_queue: OOM "
+				      "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			kfree_skb(e->skb);
+			e->skb = nskb;
+		}
+		skb_put(e->skb, diff);
+	}
+	if (!skb_make_writable(e->skb, v->data_len))
+		return -ENOMEM;
+	skb_copy_to_linear_data(e->skb, v->payload, v->data_len);
+	e->skb->ip_summed = CHECKSUM_NONE;
+
+	return 0;
+}
+
+static int
+ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
+{
+	struct nf_queue_entry *entry;
+
+	if (vmsg->value > NF_MAX_VERDICT)
+		return -EINVAL;
+
+	entry = ipq_find_dequeue_entry(vmsg->id);
+	if (entry == NULL)
+		return -ENOENT;
+	else {
+		int verdict = vmsg->value;
+
+		if (vmsg->data_len && vmsg->data_len == len)
+			if (ipq_mangle_ipv6(vmsg, entry) < 0)
+				verdict = NF_DROP;
+
+		nf_reinject(entry, verdict);
+		return 0;
+	}
+}
+
+static int
+ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	int status;
+
+	spin_lock_bh(&queue_lock);
+	status = __ipq_set_mode(mode, range);
+	spin_unlock_bh(&queue_lock);
+	return status;
+}
+
+static int
+ipq_receive_peer(struct ipq_peer_msg *pmsg,
+		 unsigned char type, unsigned int len)
+{
+	int status = 0;
+
+	if (len < sizeof(*pmsg))	/* too short for a peer message */
+		return -EINVAL;
+
+	switch (type) {
+	case IPQM_MODE:
+		status = ipq_set_mode(pmsg->msg.mode.value,
+				      pmsg->msg.mode.range);
+		break;
+
+	case IPQM_VERDICT:
+		if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
+			status = -EINVAL;
+		else
+			status = ipq_set_verdict(&pmsg->msg.verdict,
+						 len - sizeof(*pmsg));
+		break;
+	default:	/* unknown message type */
+		status = -EINVAL;
+	}
+	return status;
+}
+
+static int
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->indev)
+		if (entry->indev->ifindex == ifindex)
+			return 1;
+
+	if (entry->outdev)
+		if (entry->outdev->ifindex == ifindex)
+			return 1;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (entry->skb->nf_bridge) {
+		if (entry->skb->nf_bridge->physindev &&
+		    entry->skb->nf_bridge->physindev->ifindex == ifindex)
+			return 1;
+		if (entry->skb->nf_bridge->physoutdev &&
+		    entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
+			return 1;
+	}
+#endif
+	return 0;
+}
+
+static void
+ipq_dev_drop(int ifindex)
+{
+	ipq_flush(dev_cmp, ifindex);
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static inline void
+__ipq_rcv_skb(struct sk_buff *skb)
+{
+	int status, type, pid, flags;
+	unsigned int nlmsglen, skblen;
+	struct nlmsghdr *nlh;
+
+	skblen = skb->len;
+	if (skblen < sizeof(*nlh))
+		return;
+
+	nlh = nlmsg_hdr(skb);
+	nlmsglen = nlh->nlmsg_len;
+	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
+		return;
+
+	pid = nlh->nlmsg_pid;
+	flags = nlh->nlmsg_flags;
+
+	if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
+		RCV_SKB_FAIL(-EINVAL);
+
+	if (flags & MSG_TRUNC)
+		RCV_SKB_FAIL(-ECOMM);
+
+	type = nlh->nlmsg_type;
+	if (type < NLMSG_NOOP || type >= IPQM_MAX)
+		RCV_SKB_FAIL(-EINVAL);
+
+	if (type <= IPQM_BASE)
+		return;
+
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		RCV_SKB_FAIL(-EPERM);
+
+	spin_lock_bh(&queue_lock);
+
+	if (peer_pid) {
+		if (peer_pid != pid) {
+			spin_unlock_bh(&queue_lock);
+			RCV_SKB_FAIL(-EBUSY);
+		}
+	} else {
+		net_enable_timestamp();
+		peer_pid = pid;
+	}
+
+	spin_unlock_bh(&queue_lock);
+
+	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
+				  nlmsglen - NLMSG_LENGTH(0));
+	if (status < 0)
+		RCV_SKB_FAIL(status);
+
+	if (flags & NLM_F_ACK)
+		netlink_ack(skb, nlh, 0);
+}
+
+static void
+ipq_rcv_skb(struct sk_buff *skb)
+{
+	mutex_lock(&ipqnl_mutex);
+	__ipq_rcv_skb(skb);
+	mutex_unlock(&ipqnl_mutex);
+}
+
+static int
+ipq_rcv_dev_event(struct notifier_block *this,
+		  unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	/* Drop any packets associated with the downed device */
+	if (event == NETDEV_DOWN)
+		ipq_dev_drop(dev->ifindex);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ipq_dev_notifier = {
+	.notifier_call	= ipq_rcv_dev_event,
+};
+
+static int
+ipq_rcv_nl_event(struct notifier_block *this,
+		 unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) {
+		spin_lock_bh(&queue_lock);
+		if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
+			__ipq_reset();
+		spin_unlock_bh(&queue_lock);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ipq_nl_notifier = {
+	.notifier_call	= ipq_rcv_nl_event,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *ipq_sysctl_header;
+
+static ctl_table ipq_table[] = {
+	{
+		.procname	= NET_IPQ_QMAX_NAME,
+		.data		= &queue_maxlen,
+		.maxlen		= sizeof(queue_maxlen),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }
+};
+#endif
+
+#ifdef CONFIG_PROC_FS
+static int ip6_queue_show(struct seq_file *m, void *v)
+{
+	spin_lock_bh(&queue_lock);
+
+	seq_printf(m,
+		      "Peer PID          : %d\n"
+		      "Copy mode         : %hu\n"
+		      "Copy range        : %u\n"
+		      "Queue length      : %u\n"
+		      "Queue max. length : %u\n"
+		      "Queue dropped     : %u\n"
+		      "Netfilter dropped : %u\n",
+		      peer_pid,
+		      copy_mode,
+		      copy_range,
+		      queue_total,
+		      queue_maxlen,
+		      queue_dropped,
+		      queue_user_dropped);
+
+	spin_unlock_bh(&queue_lock);
+	return 0;
+}
+
+static int ip6_queue_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ip6_queue_show, NULL);
+}
+
+static const struct file_operations ip6_queue_proc_fops = {
+	.open		= ip6_queue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.owner		= THIS_MODULE,
+};
+#endif
+
+static const struct nf_queue_handler nfqh = {
+	.name	= "ip6_queue",
+	.outfn	= &ipq_enqueue_packet,
+};
+
+static int __init ip6_queue_init(void)
+{
+	int status = -ENOMEM;	/* returned if socket or proc setup fails */
+	struct proc_dir_entry *proc __maybe_unused;
+
+	netlink_register_notifier(&ipq_nl_notifier);
+	ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0,
+			              ipq_rcv_skb, NULL, THIS_MODULE);
+	if (ipqnl == NULL) {
+		printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net,
+			   &ip6_queue_proc_fops);
+	if (!proc) {
+		printk(KERN_ERR "ip6_queue: failed to create proc entry\n");
+		goto cleanup_ipqnl;
+	}
+#endif
+	/* NOTE(review): the notifier registration result is not checked */
+	register_netdevice_notifier(&ipq_dev_notifier);
+#ifdef CONFIG_SYSCTL
+	ipq_sysctl_header = register_sysctl_paths(net_ipv6_ctl_path, ipq_table);
+#endif
+	status = nf_register_queue_handler(NFPROTO_IPV6, &nfqh);
+	if (status < 0) {
+		printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
+		goto cleanup_sysctl;
+	}
+	return status;
+
+cleanup_sysctl:
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(ipq_sysctl_header);
+#endif
+	unregister_netdevice_notifier(&ipq_dev_notifier);
+	proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
+
+cleanup_ipqnl: __maybe_unused	/* only jumped to under CONFIG_PROC_FS */
+	netlink_kernel_release(ipqnl);
+	mutex_lock(&ipqnl_mutex);	/* let a running ipq_rcv_skb() finish */
+	mutex_unlock(&ipqnl_mutex);
+
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&ipq_nl_notifier);
+	return status;
+}
+
+static void __exit ip6_queue_fini(void)
+{
+	nf_unregister_queue_handlers(&nfqh);
+
+	ipq_flush(NULL, 0);	/* NULL cmpfn: drop every queued entry */
+
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(ipq_sysctl_header);
+#endif
+	unregister_netdevice_notifier(&ipq_dev_notifier);
+	proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
+
+	netlink_kernel_release(ipqnl);
+	mutex_lock(&ipqnl_mutex);	/* let a running ipq_rcv_skb() finish */
+	mutex_unlock(&ipqnl_mutex);
+
+	netlink_unregister_notifier(&ipq_nl_notifier);
+}
+
+MODULE_DESCRIPTION("IPv6 packet queue handler");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW);
+
+module_init(ip6_queue_init);
+module_exit(ip6_queue_fini);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
new file mode 100644
index 00000000..14cb3100
--- /dev/null
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -0,0 +1,2375 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/capability.h>
+#include <linux/in.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/poison.h>
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/compat.h>
+#include <asm/uaccess.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv6 packet filter");
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+/* dprintf(): debug output of the packet path; expands to nothing by default */
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) pr_info(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+/* duprintf(): debug output of the rule checking and get/replace code */
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) pr_info(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+/* IP_NF_ASSERT(x): WARN_ON(!(x)) when CONFIG_NETFILTER_DEBUG is set */
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x)	WARN_ON(!(x))
+#else
+#define IP_NF_ASSERT(x)
+#endif
+/* Disabled: would define away 'static' and 'inline' for the rest of the file */
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+void *ip6t_alloc_initial_table(const struct xt_table *info)
+{	/* NOTE(review): the macro presumably reads 'info' by name - confirm */
+	return xt_alloc_initial_table(ip6t, IP6T);
+}
+EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
+
+/*
+   We keep a copy of the rules for each CPU, so the softirq path can
+   update the counters of its own copy without write-locking anything:
+   ip6t_do_table() only brackets its walk with xt_write_recseq_begin()
+   and xt_write_recseq_end(), and get_counters() retries its read of a
+   CPU's counters until that CPU's xt_recseq sequence count is stable.
+
+   Hence the start of any table is private->entries[cpu].  */
+
+/* Is nexthdr one of the extension header values this code knows about? */
+int
+ip6t_ext_hdr(u8 nexthdr)
+{
+	switch (nexthdr) {
+	case IPPROTO_HOPOPTS:  case IPPROTO_ROUTING:  case IPPROTO_FRAGMENT:
+	case IPPROTO_ESP:      case IPPROTO_AH:       case IPPROTO_NONE:
+	case IPPROTO_DSTOPTS:
+		return 1;
+	}
+	return 0;
+}
+
+/* Returns whether the packet matches the ip6t_ip6 part of a rule or not. */
+/* Performance critical - called for every packet */
+static inline bool
+ip6_packet_match(const struct sk_buff *skb,
+		 const char *indev,
+		 const char *outdev,
+		 const struct ip6t_ip6 *ip6info,
+		 unsigned int *protoff,
+		 int *fragoff, bool *hotdrop)
+{
+	unsigned long ret;
+	const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
+	/* FWINV: xor a test result with the rule's inversion flag for it. */
+#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg)))
+
+	if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
+				       &ip6info->src), IP6T_INV_SRCIP) ||
+	    FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
+				       &ip6info->dst), IP6T_INV_DSTIP)) {
+		dprintf("Source or dest mismatch.\n");
+/*
+		dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
+			ipinfo->smsk.s_addr, ipinfo->src.s_addr,
+			ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : "");
+		dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
+			ipinfo->dmsk.s_addr, ipinfo->dst.s_addr,
+			ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/
+		return false;
+	}
+	/* A nonzero ret is treated as an interface name mismatch. */
+	ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask);
+
+	if (FWINV(ret != 0, IP6T_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, ip6info->iniface,
+			ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":"");
+		return false;
+	}
+
+	ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask);
+
+	if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, ip6info->outiface,
+			ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":"");
+		return false;
+	}
+
+/* ... might want to do something with class and flowlabel here ... */
+
+	/* look for the desired protocol header */
+	if((ip6info->flags & IP6T_F_PROTO)) {
+		int protohdr;
+		unsigned short _frag_off;
+		/* A failed lookup with _frag_off == 0 also sets *hotdrop. */
+		protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off);
+		if (protohdr < 0) {
+			if (_frag_off == 0)
+				*hotdrop = true;
+			return false;
+		}
+		*fragoff = _frag_off;
+
+		dprintf("Packet protocol %hi ?= %s%hi.\n",
+				protohdr,
+				ip6info->invflags & IP6T_INV_PROTO ? "!":"",
+				ip6info->proto);
+		/* Same protocol: a match unless the rule inverts the test. */
+		if (ip6info->proto == protohdr) {
+			if(ip6info->invflags & IP6T_INV_PROTO) {
+				return false;
+			}
+			return true;
+		}
+
+		/* We need match for the '-p all', too! */
+		if ((ip6info->proto != 0) &&
+			!(ip6info->invflags & IP6T_INV_PROTO))
+			return false;
+	}
+	return true;
+}
+
+/* should be ip6 safe: only flag and invflag bits inside the masks pass */
+static bool
+ip6_checkentry(const struct ip6t_ip6 *ipv6)
+{
+	const bool bad_flags = (ipv6->flags & ~IP6T_F_MASK) != 0;
+	const bool bad_invflags = (ipv6->invflags & ~IP6T_INV_MASK) != 0;
+
+	if (bad_flags) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 ipv6->flags & ~IP6T_F_MASK);
+	} else if (bad_invflags) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 ipv6->invflags & ~IP6T_INV_MASK);
+	}
+	return !bad_flags && !bad_invflags;
+}
+
+static unsigned int
+ip6t_error(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const char *text = (const char *)par->targinfo;	/* logged below */
+	if (net_ratelimit())
+		pr_info("error: `%s'\n", text);
+	return NF_DROP;	/* always drop, whether or not it was logged */
+}
+
+static inline struct ip6t_entry *
+get_entry(const void *base, unsigned int offset)
+{	/* the entry that starts 'offset' bytes into the blob at 'base' */
+	return (struct ip6t_entry *)((const char *)base + offset);
+}
+
+/* A rule is unconditional when its ip6t_ip6 part is all zero bytes. */
+/* Mildly perf critical (only if packet tracing is on) */
+static inline bool unconditional(const struct ip6t_ip6 *ipv6)
+{
+	static const struct ip6t_ip6 all_zero;
+
+	return !memcmp(&all_zero, ipv6, sizeof(*ipv6));
+}
+
+static inline const struct xt_entry_target *
+ip6t_get_target_c(const struct ip6t_entry *e)
+{	/* const-in, const-out wrapper around ip6t_get_target() */
+	return ip6t_get_target((struct ip6t_entry *)e);
+}
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+/* Chain names of the built-in hooks.  This cries for unification! */
+static const char *const hooknames[] = {
+	[NF_INET_PRE_ROUTING]		= "PREROUTING",
+	[NF_INET_LOCAL_IN]		= "INPUT",
+	[NF_INET_FORWARD]		= "FORWARD",
+	[NF_INET_LOCAL_OUT]		= "OUTPUT",
+	[NF_INET_POST_ROUTING]		= "POSTROUTING",
+};
+/* Indices into comments[] below. */
+enum nf_ip_trace_comments {
+	NF_IP6_TRACE_COMMENT_RULE,
+	NF_IP6_TRACE_COMMENT_RETURN,
+	NF_IP6_TRACE_COMMENT_POLICY,
+};
+/* Third field of the "TRACE: table:chain:comment:rulenum" log prefix. */
+static const char *const comments[] = {
+	[NF_IP6_TRACE_COMMENT_RULE]	= "rule",
+	[NF_IP6_TRACE_COMMENT_RETURN]	= "return",
+	[NF_IP6_TRACE_COMMENT_POLICY]	= "policy",
+};
+/* Log settings trace_packet() hands to nf_log_packet(). */
+static struct nf_loginfo trace_loginfo = {
+	.type = NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level = 4,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+/* Mildly perf critical (only if packet tracing is on) */
+static inline int
+get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
+		      const char *hookname, const char **chainname,
+		      const char **comment, unsigned int *rulenum)
+{
+	const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
+	/* Returns 1 once s is the traced entry e, 0 to keep walking. */
+	if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
+		/* Head of user chain: ERROR target with chainname */
+		*chainname = t->target.data;
+		(*rulenum) = 0;
+	} else if (s == e) {
+		(*rulenum)++;
+		/* No matches, STANDARD target, verdict < 0, unconditional */
+		if (s->target_offset == sizeof(struct ip6t_entry) &&
+		    strcmp(t->target.u.kernel.target->name,
+			   XT_STANDARD_TARGET) == 0 &&
+		    t->verdict < 0 &&
+		    unconditional(&s->ipv6)) {
+			/* Tail of chains: STANDARD target (return/policy) */
+			*comment = *chainname == hookname
+				? comments[NF_IP6_TRACE_COMMENT_POLICY]
+				: comments[NF_IP6_TRACE_COMMENT_RETURN];
+		}
+		return 1;
+	} else
+		(*rulenum)++;
+
+	return 0;
+}
+
+static void trace_packet(const struct sk_buff *skb,
+			 unsigned int hook,
+			 const struct net_device *in,
+			 const struct net_device *out,
+			 const char *tablename,
+			 const struct xt_table_info *private,
+			 const struct ip6t_entry *e)
+{
+	const void *table_base;
+	const struct ip6t_entry *root;
+	const char *hookname, *chainname, *comment;
+	const struct ip6t_entry *iter;
+	unsigned int rulenum = 0;
+	/* Walk this CPU's copy from the hook's first entry until e is met. */
+	table_base = private->entries[smp_processor_id()];
+	root = get_entry(table_base, private->hook_entry[hook]);
+	/* Defaults: built-in chain and "rule"; the walk below refines them. */
+	hookname = chainname = hooknames[hook];
+	comment = comments[NF_IP6_TRACE_COMMENT_RULE];
+
+	xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+		if (get_chainname_rulenum(iter, e, hookname,
+		    &chainname, &comment, &rulenum) != 0)
+			break;
+
+	nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo,
+		      "TRACE: %s:%s:%s:%u ",
+		      tablename, chainname, comment, rulenum);
+}
+#endif
+
+static inline __pure struct ip6t_entry *
+ip6t_next_entry(const struct ip6t_entry *entry)
+{	/* entries are laid out back to back, next_offset bytes apart */
+	return (struct ip6t_entry *)((char *)entry + entry->next_offset);
+}
+
+/* Runs skb through table's rules for hook; returns a verdict like NF_ACCEPT. */
+unsigned int
+ip6t_do_table(struct sk_buff *skb,
+	      unsigned int hook,
+	      const struct net_device *in,
+	      const struct net_device *out,
+	      struct xt_table *table)
+{
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	/* Initializing verdict to NF_DROP keeps gcc happy. */
+	unsigned int verdict = NF_DROP;
+	const char *indev, *outdev;
+	const void *table_base;
+	struct ip6t_entry *e, **jumpstack;
+	unsigned int *stackptr, origptr, cpu;
+	const struct xt_table_info *private;
+	struct xt_action_param acpar;
+	unsigned int addend;
+
+	/* Initialization */
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+	/* We handle fragments by dealing with the first fragment as
+	 * if it was a normal packet.  All other fragments are treated
+	 * normally, except that they will NEVER match rules that ask
+	 * things we don't know, ie. tcp syn flag or ports).  If the
+	 * rule is also a fragment-specific rule, non-fragments won't
+	 * match it. */
+	acpar.hotdrop = false;
+	acpar.in      = in;
+	acpar.out     = out;
+	acpar.family  = NFPROTO_IPV6;
+	acpar.hooknum = hook;
+
+	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	/* Use this CPU's copy of the rules and this CPU's jump stack. */
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
+	private = table->private;
+	cpu        = smp_processor_id();
+	table_base = private->entries[cpu];
+	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
+	origptr    = *stackptr;
+	/* origptr is written back to *stackptr once the walk is over. */
+	e = get_entry(table_base, private->hook_entry[hook]);
+
+	do {
+		const struct xt_entry_target *t;
+		const struct xt_entry_match *ematch;
+
+		IP_NF_ASSERT(e);
+		if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
+		    &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) {
+ no_match:
+			e = ip6t_next_entry(e);
+			continue;
+		}
+		/* Every extension match of the entry has to agree as well. */
+		xt_ematch_foreach(ematch, e) {
+			acpar.match     = ematch->u.kernel.match;
+			acpar.matchinfo = ematch->data;
+			if (!acpar.match->match(skb, &acpar))
+				goto no_match;
+		}
+
+		ADD_COUNTER(e->counters, skb->len, 1);
+
+		t = ip6t_get_target_c(e);
+		IP_NF_ASSERT(t->u.kernel.target);
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
+    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+		/* The packet is traced: log it */
+		if (unlikely(skb->nf_trace))
+			trace_packet(skb, hook, in, out,
+				     table->name, private, e);
+#endif
+		/* Standard target? */
+		if (!t->u.kernel.target->target) {
+			int v;
+			/* v < 0: verdict or RETURN; else an entry offset. */
+			v = ((struct xt_standard_target *)t)->verdict;
+			if (v < 0) {
+				/* Pop from stack? */
+				if (v != XT_RETURN) {
+					verdict = (unsigned)(-v) - 1;
+					break;
+				}
+				if (*stackptr <= origptr)
+					e = get_entry(table_base,
+					    private->underflow[hook]);
+				else
+					e = ip6t_next_entry(jumpstack[--*stackptr]);
+				continue;
+			}
+			if (table_base + v != ip6t_next_entry(e) &&
+			    !(e->ipv6.flags & IP6T_F_GOTO)) {
+				if (*stackptr >= private->stacksize) {
+					verdict = NF_DROP;
+					break;
+				}
+				jumpstack[(*stackptr)++] = e;
+			}
+			/* Continue at offset v (jump, goto or fall through). */
+			e = get_entry(table_base, v);
+			continue;
+		}
+		/* Extension target: its return value decides what follows. */
+		acpar.target   = t->u.kernel.target;
+		acpar.targinfo = t->data;
+
+		verdict = t->u.kernel.target->target(skb, &acpar);
+		if (verdict == XT_CONTINUE)
+			e = ip6t_next_entry(e);
+		else
+			/* Verdict */
+			break;
+	} while (!acpar.hotdrop);
+	/* Leave the per-CPU stack pointer the way it was found. */
+	*stackptr = origptr;
+
+ 	xt_write_recseq_end(addend);
+ 	local_bh_enable();
+
+#ifdef DEBUG_ALLOW_ALL
+	return NF_ACCEPT;
+#else
+	if (acpar.hotdrop)
+		return NF_DROP;
+	else return verdict;
+#endif
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+   there are loops.  Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(const struct xt_table_info *newinfo,
+		   unsigned int valid_hooks, void *entry0)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	   to 0 as we leave), and comefrom to save source hook bitmask */
+	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct ip6t_entry *e = (struct ip6t_entry *)(entry0 + pos);
+		/* Hooks outside valid_hooks are not walked at all. */
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+		/* Leaves via 'goto next' once backtracking reaches the start. */
+		for (;;) {
+			const struct xt_standard_target *t
+				= (void *)ip6t_get_target_c(e);
+			int visited = e->comefrom & (1 << hook);
+			/* Bit NF_INET_NUMHOOKS set: e is on the current path. */
+			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
+				pr_err("iptables: loop hook %u pos %u %08X.\n",
+				       hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if ((e->target_offset == sizeof(struct ip6t_entry) &&
+			     (strcmp(t->target.u.user.name,
+				     XT_STANDARD_TARGET) == 0) &&
+			     t->verdict < 0 &&
+			     unconditional(&e->ipv6)) || visited) {
+				unsigned int oldpos, size;
+
+				if ((strcmp(t->target.u.user.name,
+					    XT_STANDARD_TARGET) == 0) &&
+				    t->verdict < -NF_MAX_VERDICT - 1) {
+					duprintf("mark_source_chains: bad "
+						"negative verdict (%i)\n",
+								t->verdict);
+					return 0;
+				}
+
+				/* Return: backtrack through the last
+				   big jump. */
+				do {
+					e->comefrom ^= (1<<NF_INET_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+					if (e->comefrom
+					    & (1 << NF_INET_NUMHOOKS)) {
+						duprintf("Back unset "
+							 "on hook %u "
+							 "rule %u\n",
+							 hook, pos);
+					}
+#endif
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct ip6t_entry *)
+						(entry0 + pos);
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct ip6t_entry *)
+					(entry0 + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   XT_STANDARD_TARGET) == 0 &&
+				    newpos >= 0) {
+					if (newpos > newinfo->size -
+						sizeof(struct ip6t_entry)) {
+						duprintf("mark_source_chains: "
+							"bad verdict (%i)\n",
+								newpos);
+						return 0;
+					}
+					/* This a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+				}
+				e = (struct ip6t_entry *)
+					(entry0 + newpos);
+				e->counters.pcnt = pos;
+				pos = newpos;
+			}
+		}
+		next:
+		duprintf("Finished chain %u\n", hook);
+	}
+	return 1;
+}
+
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
+{
+	struct xt_mtdtor_param par = {
+		.net       = net,
+		.match     = m->u.kernel.match,
+		.matchinfo = m->data,
+		.family    = NFPROTO_IPV6,
+	};
+	if (par.match->destroy != NULL)	/* the destructor is optional */
+		par.match->destroy(&par);
+	module_put(par.match->me);
+}
+
+static int
+check_entry(const struct ip6t_entry *e, const char *name)
+{
+	const struct xt_entry_target *t;
+
+	if (!ip6_checkentry(&e->ipv6)) {
+		duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+		return -EINVAL;
+	}
+	/* The target header has to fit in front of next_offset ... */
+	if (e->target_offset + sizeof(struct xt_entry_target) >
+	    e->next_offset)
+		return -EINVAL;
+	/* ... and only then is its declared size read and checked too. */
+	t = ip6t_get_target_c(e);
+	if (e->target_offset + t->u.target_size > e->next_offset)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+	const struct ip6t_ip6 *ipv6 = par->entryinfo;
+	int ret;
+	/* Point the caller's check parameters at this particular match. */
+	par->match     = m->u.kernel.match;
+	par->matchinfo = m->data;
+	/* The size handed on excludes the xt_entry_match header itself. */
+	ret = xt_check_match(par, m->u.match_size - sizeof(*m),
+			     ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
+	if (ret < 0) {
+		duprintf("ip_tables: check failed for `%s'.\n",
+			 par->match->name);	/* par is a pointer here */
+		return ret;
+	}
+	return 0;
+}
+
+static int
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
+{
+	struct xt_match *found;
+	int err;
+
+	/* Resolve the extension by the name and revision stored in m. */
+	found = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
+				      m->u.user.revision);
+	if (IS_ERR(found)) {
+		duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+		return PTR_ERR(found);
+	}
+
+	/* Record it in the entry, then let check_match() validate it. */
+	m->u.kernel.match = found;
+	err = check_match(m, par);
+	if (err)
+		/* a failed check is undone with module_put() */
+		module_put(found->me);
+
+	return err;
+}
+
+static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
+{
+	struct xt_entry_target *tgt = ip6t_get_target(e);
+	struct xt_tgchk_param chk = {
+		.net       = net,
+		.table     = name,
+		.entryinfo = e,
+		.target    = tgt->u.kernel.target,
+		.targinfo  = tgt->data,
+		.hook_mask = e->comefrom,
+		.family    = NFPROTO_IPV6,
+	};
+	int err;
+
+	/* The size handed on excludes the xt_entry_target header itself. */
+	err = xt_check_target(&chk, tgt->u.target_size - sizeof(*tgt),
+			      e->ipv6.proto,
+			      e->ipv6.invflags & IP6T_INV_PROTO);
+	if (err >= 0)
+		return 0;
+	duprintf("ip_tables: check failed for `%s'.\n",
+		 tgt->u.kernel.target->name);
+	return err;
+}
+
+static int
+find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
+		 unsigned int size)
+{
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	int ret;
+	unsigned int j;
+	struct xt_mtchk_param mtpar;
+	struct xt_entry_match *ematch;
+	/* NOTE(review): 'size' is not used anywhere in this function. */
+	ret = check_entry(e, name);
+	if (ret)
+		return ret;
+	/* j counts the matches set up so far, for the unwind on error. */
+	j = 0;
+	mtpar.net	= net;
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ipv6;
+	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV6;
+	xt_ematch_foreach(ematch, e) {
+		ret = find_check_match(ematch, &mtpar);
+		if (ret != 0)
+			goto cleanup_matches;
+		++j;
+	}
+
+	t = ip6t_get_target(e);
+	target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
+		ret = PTR_ERR(target);
+		goto cleanup_matches;
+	}
+	t->u.kernel.target = target;
+
+	ret = check_target(e, net, name);
+	if (ret)
+		goto err;
+	return 0;
+ err:
+	module_put(t->u.kernel.target->me);
+ cleanup_matches:
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		cleanup_match(ematch, net);
+	}
+	return ret;
+}
+
+static bool check_underflow(const struct ip6t_entry *e)
+{
+	const struct xt_entry_target *tgt = ip6t_get_target_c(e);
+	unsigned int decoded;
+
+	/* Must be an unconditional rule using the STANDARD target ... */
+	if (!unconditional(&e->ipv6) ||
+	    strcmp(tgt->u.user.name, XT_STANDARD_TARGET) != 0)
+		return false;
+	/* ... whose stored verdict decodes to either DROP or ACCEPT. */
+	decoded = ((struct xt_standard_target *)tgt)->verdict;
+	decoded = -decoded - 1;
+	return decoded == NF_DROP || decoded == NF_ACCEPT;
+}
+
+static int
+check_entry_size_and_hooks(struct ip6t_entry *e,
+			   struct xt_table_info *newinfo,
+			   const unsigned char *base,
+			   const unsigned char *limit,
+			   const unsigned int *hook_entries,
+			   const unsigned int *underflows,
+			   unsigned int valid_hooks)
+{
+	unsigned int h;
+	/* e must be aligned and, with all next_offset bytes, end by limit. */
+	if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct ip6t_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit) {
+		duprintf("Bad offset %p\n", e);
+		return -EINVAL;
+	}
+	if (e->next_offset
+	    < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if (!(valid_hooks & (1 << h)))
+			continue;
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h]) {
+			if (!check_underflow(e)) {
+				pr_err("Underflows must be unconditional and "
+				       "use the STANDARD target with "
+				       "ACCEPT/DROP\n");
+				return -EINVAL;
+			}
+			newinfo->underflow[h] = underflows[h];
+		}
+	}
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct xt_counters) { 0, 0 });
+	e->comefrom = 0;
+	return 0;
+}
+
+static void cleanup_entry(struct ip6t_entry *e, struct net *net)
+{
+	struct xt_entry_target *tgt = ip6t_get_target(e);
+	struct xt_entry_match *m;
+	struct xt_tgdtor_param dtor = {
+		.net      = net,
+		.family   = NFPROTO_IPV6,
+	};
+
+	/* Matches go first, each through cleanup_match() */
+	xt_ematch_foreach(m, e)
+		cleanup_match(m, net);
+	/* Then the target: optional destructor, then module_put() */
+	dtor.target   = tgt->u.kernel.target;
+	dtor.targinfo = tgt->data;
+	if (dtor.target->destroy != NULL)
+		dtor.target->destroy(&dtor);
+	module_put(dtor.target->me);
+}
+
+/* Checks and translates the user-supplied table segment (held in
+   newinfo) */
+static int
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+                const struct ip6t_replace *repl)
+{
+	struct ip6t_entry *iter;
+	unsigned int i;
+	int ret = 0;
+	/* Returns 0, or -EINVAL, -ELOOP or the error of a failed check. */
+	newinfo->size = repl->size;
+	newinfo->number = repl->num_entries;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_table: size %u\n", newinfo->size);
+	i = 0;
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+						 entry0 + repl->size,
+						 repl->hook_entry,
+						 repl->underflow,
+						 repl->valid_hooks);
+		if (ret != 0)
+			return ret;
+		++i;
+		if (strcmp(ip6t_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+	/* The walk must have seen exactly the announced number of entries. */
+	if (i != repl->num_entries) {
+		duprintf("translate_table: %u not %u entries\n",
+			 i, repl->num_entries);
+		return -EINVAL;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(repl->valid_hooks & (1 << i)))
+			continue;
+		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, repl->hook_entry[i]);
+			return -EINVAL;
+		}
+		if (newinfo->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, repl->underflow[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
+		return -ELOOP;
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = find_check_entry(iter, net, repl->name, repl->size);
+		if (ret != 0)
+			break;
+		++i;
+	}
+	/* On failure, clean up only the i entries that passed the check. */
+	if (ret != 0) {
+		xt_entry_foreach(iter, entry0, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter, net);
+		}
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for_each_possible_cpu(i) {
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
+	}
+
+	return ret;
+}
+
+static void
+get_counters(const struct xt_table_info *t,
+	     struct xt_counters counters[])
+{
+	struct ip6t_entry *iter;
+	unsigned int cpu;
+	unsigned int i;
+	/* Adds every CPU's per-entry counters into counters[], by index. */
+	for_each_possible_cpu(cpu) {
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
+
+		i = 0;
+		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			u64 bcnt, pcnt;
+			unsigned int start;
+			/* Re-read until that CPU's sequence count is stable. */
+			do {
+				start = read_seqcount_begin(s);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqcount_retry(s, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
+			++i;
+		}
+	}
+}
+
+static struct xt_counters *alloc_counters(const struct xt_table *table)
+{
+	unsigned int countersize;
+	struct xt_counters *counters;
+	const struct xt_table_info *private = table->private;
+
+	/* We need atomic snapshot of counters: rest doesn't change
+	   (other than comefrom, which userspace doesn't care
+	   about). */
+	countersize = sizeof(struct xt_counters) * private->number;
+	counters = vzalloc(countersize);
+
+	if (counters == NULL)
+		return ERR_PTR(-ENOMEM);
+	/* The array starts zeroed; get_counters() adds the totals into it. */
+	get_counters(private, counters);
+	/* The array is handed to the caller, see the vfree() in this file. */
+	return counters;
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+		     const struct xt_table *table,
+		     void __user *userptr)
+{
+	unsigned int off, num;
+	const struct ip6t_entry *e;
+	struct xt_counters *counters;
+	const struct xt_table_info *private = table->private;
+	int ret = 0;
+	const void *loc_cpu_entry;
+	/* Returns 0, -EFAULT, or the error from alloc_counters(). */
+	counters = alloc_counters(table);
+	if (IS_ERR(counters))
+		return PTR_ERR(counters);
+
+	/* choose the copy that is on our node/cpu, ...
+	 * This choice is lazy (because current thread is
+	 * allowed to migrate to another cpu)
+	 */
+	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
+		ret = -EFAULT;
+		goto free_counters;
+	}
+
+	/* FIXME: use iterator macros --RR */
+	/* ... then go back and fix counters and names */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		unsigned int i;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
+
+		e = (struct ip6t_entry *)(loc_cpu_entry + off);
+		if (copy_to_user(userptr + off
+				 + offsetof(struct ip6t_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+		/* Store each match's name in the user copy's u.user.name. */
+		for (i = sizeof(struct ip6t_entry);
+		     i < e->target_offset;
+		     i += m->u.match_size) {
+			m = (void *)e + i;
+
+			if (copy_to_user(userptr + off + i
+					 + offsetof(struct xt_entry_match,
+						    u.user.name),
+					 m->u.kernel.match->name,
+					 strlen(m->u.kernel.match->name)+1)
+			    != 0) {
+				ret = -EFAULT;
+				goto free_counters;
+			}
+		}
+		/* The target's name is written back the same way. */
+		t = ip6t_get_target_c(e);
+		if (copy_to_user(userptr + off + e->target_offset
+				 + offsetof(struct xt_entry_target,
+					    u.user.name),
+				 t->u.kernel.target->name,
+				 strlen(t->u.kernel.target->name)+1) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static void compat_standard_from_user(void *dst, const void *src)
+{
+	const compat_int_t *cv = src;
+	int native = *cv;
+	if (native > 0)	/* only positive values get the jump adjustment */
+		native += xt_compat_calc_jump(AF_INET6, native);
+	memcpy(dst, &native, sizeof(native));
+}
+
+static int compat_standard_to_user(void __user *dst, const void *src)
+{
+	compat_int_t cv = *(const int *)src;
+	/* Only positive values get the jump adjustment taken off again. */
+	if (cv > 0)
+		cv -= xt_compat_calc_jump(AF_INET6, cv);
+	return copy_to_user(dst, &cv, sizeof(cv)) != 0 ? -EFAULT : 0;
+}
+
+static int compat_calc_entry(const struct ip6t_entry *e,
+			     const struct xt_table_info *info,
+			     const void *base, struct xt_table_info *newinfo)
+{
+	const struct xt_entry_match *ematch;
+	const struct xt_entry_target *t;
+	unsigned int entry_offset;
+	int off, i, ret;
+	/* off: entry header size difference plus match and target offsets. */
+	off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+	entry_offset = (void *)e - base;
+	xt_ematch_foreach(ematch, e)
+		off += xt_compat_match_offset(ematch->u.kernel.match);
+	t = ip6t_get_target_c(e);
+	off += xt_compat_target_offset(t->u.kernel.target);
+	newinfo->size -= off;
+	ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
+	if (ret)
+		return ret;
+	/* Nonzero hook/underflow offsets located after e move down by off. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		if (info->hook_entry[i] &&
+		    (e < (struct ip6t_entry *)(base + info->hook_entry[i])))
+			newinfo->hook_entry[i] -= off;
+		if (info->underflow[i] &&
+		    (e < (struct ip6t_entry *)(base + info->underflow[i])))
+			newinfo->underflow[i] -= off;
+	}
+	return 0;
+}
+
+static int compat_table_info(const struct xt_table_info *info,
+			     struct xt_table_info *newinfo)
+{
+	struct ip6t_entry *iter;
+	void *loc_cpu_entry;
+	int ret;
+	/* Copies info's header into newinfo, then adjusts it entry by entry. */
+	if (!newinfo || !info)
+		return -EINVAL;
+
+	/* we dont care about newinfo->entries[] */
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	newinfo->initial_entries = 0;
+	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	xt_compat_init_offsets(AF_INET6, info->number);
+	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+		if (ret != 0)
+			return ret;
+	}
+	return 0;
+}
+#endif
+
+static int get_info(struct net *net, void __user *user,
+                    const int *len, int compat)
+{
+	char name[XT_TABLE_MAXNAMELEN];
+	struct xt_table *t;
+	int ret;
+	/* 'user' starts with the table name on entry and receives the info. */
+	if (*len != sizeof(struct ip6t_getinfo)) {
+		duprintf("length %u != %zu\n", *len,
+			 sizeof(struct ip6t_getinfo));
+		return -EINVAL;
+	}
+
+	if (copy_from_user(name, user, sizeof(name)) != 0)
+		return -EFAULT;
+
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
+#ifdef CONFIG_COMPAT
+	if (compat)
+		xt_compat_lock(AF_INET6);
+#endif
+	t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
+				    "ip6table_%s", name);
+	if (t && !IS_ERR(t)) {
+		struct ip6t_getinfo info;
+		const struct xt_table_info *private = t->private;
+#ifdef CONFIG_COMPAT
+		struct xt_table_info tmp;
+		/* NOTE(review): this ret is overwritten below, never checked. */
+		if (compat) {
+			ret = compat_table_info(private, &tmp);
+			xt_compat_flush_offsets(AF_INET6);
+			private = &tmp;
+		}
+#endif
+		memset(&info, 0, sizeof(info));
+		info.valid_hooks = t->valid_hooks;
+		memcpy(info.hook_entry, private->hook_entry,
+		       sizeof(info.hook_entry));
+		memcpy(info.underflow, private->underflow,
+		       sizeof(info.underflow));
+		info.num_entries = private->number;
+		info.size = private->size;
+		strcpy(info.name, name);
+
+		if (copy_to_user(user, &info, *len) != 0)
+			ret = -EFAULT;
+		else
+			ret = 0;
+
+		xt_table_unlock(t);
+		module_put(t->me);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+#ifdef CONFIG_COMPAT
+	if (compat)
+		xt_compat_unlock(AF_INET6);
+#endif
+	return ret;
+}
+
+static int
+get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
+            const int *len)
+{
+	int ret;
+	struct ip6t_get_entries get;
+	struct xt_table *t;
+	/* *len must be exactly the header plus the get.size it announces. */
+	if (*len < sizeof(get)) {
+		duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+		return -EINVAL;
+	}
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+	if (*len != sizeof(struct ip6t_get_entries) + get.size) {
+		duprintf("get_entries: %u != %zu\n",
+			 *len, sizeof(get) + get.size);
+		return -EINVAL;
+	}
+	get.name[sizeof(get.name) - 1] = '\0';	/* name is user-supplied */
+	t = xt_find_table_lock(net, AF_INET6, get.name);
+	if (t && !IS_ERR(t)) {
+		struct xt_table_info *private = t->private;
+		duprintf("t->private->number = %u\n", private->number);
+		if (get.size == private->size)
+			ret = copy_entries_to_user(private->size,
+						   t, uptr->entrytable);
+		else {
+			duprintf("get_entries: I've got %u not %u!\n",
+				 private->size, get.size);
+			ret = -EAGAIN;
+		}
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	return ret;
+}
+
+static int
+__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
+	     struct xt_table_info *newinfo, unsigned int num_counters,
+	     void __user *counters_ptr)
+{
+	int ret;
+	struct xt_table *t;
+	struct xt_table_info *oldinfo;
+	struct xt_counters *counters;
+	const void *loc_cpu_old_entry;
+	struct ip6t_entry *iter;
+	/* Swaps newinfo into the table; the old counters go to counters_ptr. */
+	ret = 0;
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	if (!counters) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name),
+				    "ip6table_%s", name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free_newinfo_counters_untrans;
+	}
+
+	/* You lied! */
+	if (valid_hooks != t->valid_hooks) {
+		duprintf("Valid hook crap: %08X vs %08X\n",
+			 valid_hooks, t->valid_hooks);
+		ret = -EINVAL;
+		goto put_module;
+	}
+
+	oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;
+
+	/* Update module usage count based on number of rules */
+	duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+		oldinfo->number, oldinfo->initial_entries, newinfo->number);
+	if ((oldinfo->number > oldinfo->initial_entries) ||
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	/* Get the old counters, and synchronize with replace */
+	get_counters(oldinfo, counters);
+
+	/* Decrease module usage counts and free resource */
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+		cleanup_entry(iter, net);
+	/* NOTE(review): a failed copy yields -EFAULT after the swap is done. */
+	xt_free_table_info(oldinfo);
+	if (copy_to_user(counters_ptr, counters,
+			 sizeof(struct xt_counters) * num_counters) != 0)
+		ret = -EFAULT;
+	vfree(counters);
+	xt_table_unlock(t);
+	return ret;
+
+ put_module:
+	module_put(t->me);
+	xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+	vfree(counters);
+ out:
+	return ret;
+}
+
+/*
+ * IP6T_SO_SET_REPLACE (native layout): read a struct ip6t_replace header and
+ * the rule blob that follows it from @user, run translate_table() on it and
+ * install the result through __do_replace().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+do_replace(struct net *net, const void __user *user, unsigned int len)
+{
+	struct ip6t_replace repl;
+	struct xt_table_info *newinfo;
+	struct ip6t_entry *iter;
+	void *blob;
+	int ret;
+
+	if (copy_from_user(&repl, user, sizeof(repl)))
+		return -EFAULT;
+
+	/* overflow check */
+	if (repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	repl.name[sizeof(repl.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(repl.size);
+	if (newinfo == NULL)
+		return -ENOMEM;
+
+	/* fill the copy that belongs to the CPU we happen to run on */
+	blob = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(blob, user + sizeof(repl), repl.size)) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	ret = translate_table(net, newinfo, blob, &repl);
+	if (ret)
+		goto free_newinfo;
+
+	duprintf("ip_tables: Translated table\n");
+
+	ret = __do_replace(net, repl.name, repl.valid_hooks, newinfo,
+			   repl.num_counters, repl.counters);
+	if (!ret)
+		return 0;
+
+	/* translation succeeded, so every entry needs cleanup_entry() */
+	xt_entry_foreach(iter, blob, newinfo->size)
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * IP6T_SO_SET_ADD_COUNTERS: add user-supplied byte/packet counts to the
+ * counters of every rule in a table.
+ *
+ * @user:   struct xt_counters_info (or its compat form when @compat is
+ *          non-zero) followed by num_counters struct xt_counters
+ * @len:    total length of @user; must match the header plus the array
+ * @compat: non-zero when the header uses the compat layout
+ *
+ * Returns 0 on success, -EFAULT on a bad user pointer, -EINVAL if @len or
+ * the counter count does not match the table, -ENOMEM on allocation
+ * failure, or the table lookup error.
+ */
+static int
+do_add_counters(struct net *net, const void __user *user, unsigned int len,
+		int compat)
+{
+	unsigned int i, curcpu;
+	struct xt_counters_info tmp;
+	struct xt_counters *paddc;
+	unsigned int num_counters;
+	char *name;
+	int size;
+	void *ptmp;
+	struct xt_table *t;
+	const struct xt_table_info *private;
+	int ret = 0;
+	const void *loc_cpu_entry;
+	struct ip6t_entry *iter;
+	unsigned int addend;
+#ifdef CONFIG_COMPAT
+	struct compat_xt_counters_info compat_tmp;
+
+	if (compat) {
+		ptmp = &compat_tmp;
+		size = sizeof(struct compat_xt_counters_info);
+	} else
+#endif
+	{
+		ptmp = &tmp;
+		size = sizeof(struct xt_counters_info);
+	}
+
+	if (copy_from_user(ptmp, user, size) != 0)
+		return -EFAULT;
+
+	/*
+	 * The table name comes straight from user space and is used as a
+	 * C string by the lookup below, so force termination first.
+	 */
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		compat_tmp.name[sizeof(compat_tmp.name) - 1] = '\0';
+		num_counters = compat_tmp.num_counters;
+		name = compat_tmp.name;
+	} else
+#endif
+	{
+		tmp.name[sizeof(tmp.name) - 1] = '\0';
+		num_counters = tmp.num_counters;
+		name = tmp.name;
+	}
+
+	if (len != size + num_counters * sizeof(struct xt_counters))
+		return -EINVAL;
+
+	paddc = vmalloc(len - size);
+	if (!paddc)
+		return -ENOMEM;
+
+	if (copy_from_user(paddc, user + size, len - size) != 0) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	t = xt_find_table_lock(net, AF_INET6, name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free;
+	}
+
+
+	local_bh_disable();
+	private = t->private;
+	/* one counter per rule, otherwise paddc[] would be misindexed */
+	if (private->number != num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	/* Choose the copy that is on our node */
+	curcpu = smp_processor_id();
+	addend = xt_write_recseq_begin();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		++i;
+	}
+	xt_write_recseq_end(addend);
+
+ unlock_up_free:
+	local_bh_enable();
+	xt_table_unlock(t);
+	module_put(t->me);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat layout of the table-replace request read by compat_do_replace():
+ * fixed-width u32 fields and a compat user pointer for the counters,
+ * followed by the compat rule entries.
+ */
+struct compat_ip6t_replace {
+	char			name[XT_TABLE_MAXNAMELEN];
+	u32			valid_hooks;
+	u32			num_entries;
+	u32			size;
+	u32			hook_entry[NF_INET_NUMHOOKS];
+	u32			underflow[NF_INET_NUMHOOKS];
+	u32			num_counters;
+	compat_uptr_t		counters;	/* struct xt_counters * */
+	struct compat_ip6t_entry entries[0];
+};
+
+/*
+ * Write native entry @e to user space in compat layout at *@dstptr, using
+ * counters[@i] as its counter values.
+ *
+ * *@dstptr is advanced past what was written and *@size shrinks by the
+ * number of bytes saved relative to the native layout; the entry's
+ * target_offset/next_offset are rewritten by that same delta.
+ *
+ * Returns 0, -EFAULT on a failed user access, or the error of the match or
+ * target conversion helper.
+ */
+static int
+compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
+			  unsigned int *size, struct xt_counters *counters,
+			  unsigned int i)
+{
+	struct compat_ip6t_entry __user *ce;
+	const struct xt_entry_match *ematch;
+	struct xt_entry_target *t;
+	compat_uint_t origsize = *size;
+	u_int16_t target_offset, next_offset;
+	int ret = 0;
+
+	ce = (struct compat_ip6t_entry __user *)*dstptr;
+	if (copy_to_user(ce, e, sizeof(struct ip6t_entry)))
+		return -EFAULT;
+	if (copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])))
+		return -EFAULT;
+
+	*dstptr += sizeof(struct compat_ip6t_entry);
+	*size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_to_user(ematch, dstptr, size);
+		if (ret)
+			return ret;
+	}
+	/* matches are done: the shrink so far relocates the target */
+	target_offset = e->target_offset - (origsize - *size);
+
+	t = ip6t_get_target(e);
+	ret = xt_compat_target_to_user(t, dstptr, size);
+	if (ret != 0)
+		return ret;
+	next_offset = e->next_offset - (origsize - *size);
+
+	if (put_user(target_offset, &ce->target_offset))
+		return -EFAULT;
+	if (put_user(next_offset, &ce->next_offset))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Look up the match extension named in @m, store it in m->u.kernel.match
+ * and add its compat offset to *@size.
+ *
+ * Returns 0 on success or the lookup error.
+ */
+static int
+compat_find_calc_match(struct xt_entry_match *m,
+		       const char *name,
+		       const struct ip6t_ip6 *ipv6,
+		       unsigned int hookmask,
+		       int *size)
+{
+	struct xt_match *found;
+
+	found = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
+				      m->u.user.revision);
+	if (!IS_ERR(found)) {
+		m->u.kernel.match = found;
+		*size += xt_compat_match_offset(found);
+		return 0;
+	}
+
+	duprintf("compat_check_calc_match: `%s' not found\n",
+		 m->u.user.name);
+	return PTR_ERR(found);
+}
+
+/*
+ * Drop the module references held by a compat entry: one per match, then
+ * one for the target.
+ */
+static void compat_release_entry(struct compat_ip6t_entry *e)
+{
+	struct xt_entry_match *m;
+	struct xt_entry_target *tgt;
+
+	xt_ematch_foreach(m, e) {
+		module_put(m->u.kernel.match->me);
+	}
+
+	tgt = compat_ip6t_get_target(e);
+	module_put(tgt->u.kernel.target->me);
+}
+
+/*
+ * Validate one compat entry of a user-supplied blob and account for the
+ * space it will need in native layout.
+ *
+ * @e:            entry under inspection, inside [@base, @limit)
+ * @newinfo:      receives hook_entry[]/underflow[] when @e sits at one of
+ *                the offsets in @hook_entries/@underflows
+ * @size:         running native size; grown by this entry's extra bytes
+ * @name:         table name, passed on to the check helpers
+ *
+ * On success the entry holds module references for each match and for its
+ * target. Returns 0 or a negative errno; on error all references taken
+ * here have been dropped again.
+ */
+static int
+check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
+				  struct xt_table_info *newinfo,
+				  unsigned int *size,
+				  const unsigned char *base,
+				  const unsigned char *limit,
+				  const unsigned int *hook_entries,
+				  const unsigned int *underflows,
+				  const char *name)
+{
+	struct xt_entry_match *ematch;
+	struct xt_entry_target *t;
+	struct xt_target *target;
+	unsigned int entry_offset;
+	unsigned int j;
+	int ret, off, h;
+
+	duprintf("check_compat_entry_size_and_hooks %p\n", e);
+	/*
+	 * The header must be aligned and inside the blob; only then is it
+	 * safe to read next_offset, which in turn must not point past the
+	 * end of the blob.
+	 */
+	if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 ||
+	    (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit ||
+	    (unsigned char *)e + e->next_offset > limit) {
+		duprintf("Bad offset %p, limit = %p\n", e, limit);
+		return -EINVAL;
+	}
+
+	if (e->next_offset < sizeof(struct compat_ip6t_entry) +
+			     sizeof(struct compat_xt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* For purposes of check_entry casting the compat entry is fine */
+	ret = check_entry((struct ip6t_entry *)e, name);
+	if (ret)
+		return ret;
+
+	off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+	entry_offset = (void *)e - (void *)base;
+	/* j counts matches whose module reference must be undone on error */
+	j = 0;
+	xt_ematch_foreach(ematch, e) {
+		ret = compat_find_calc_match(ematch, name,
+					     &e->ipv6, e->comefrom, &off);
+		if (ret != 0)
+			goto release_matches;
+		++j;
+	}
+
+	t = compat_ip6t_get_target(e);
+	target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
+		duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
+			 t->u.user.name);
+		ret = PTR_ERR(target);
+		goto release_matches;
+	}
+	t->u.kernel.target = target;
+
+	off += xt_compat_target_offset(target);
+	*size += off;
+	ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
+	if (ret)
+		goto out;
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h])
+			newinfo->underflow[h] = underflows[h];
+	}
+
+	/* Clear counters and comefrom */
+	memset(&e->counters, 0, sizeof(e->counters));
+	e->comefrom = 0;
+	return 0;
+
+out:
+	module_put(t->u.kernel.target->me);
+release_matches:
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		module_put(ematch->u.kernel.match->me);
+	}
+	return ret;
+}
+
+/*
+ * Expand compat entry @e into native layout at *@dstptr.
+ *
+ * *@dstptr is advanced past the written entry and *@size is adjusted by the
+ * per-entry size difference. The new entry's target_offset/next_offset are
+ * derived from the compat ones using (origsize - *size), and every
+ * hook_entry[]/underflow[] value in @newinfo that lies beyond the new
+ * entry's offset from @base is shifted by the same amount.
+ *
+ * Returns 0, or the error of xt_compat_match_from_user().
+ */
+static int
+compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
+			    unsigned int *size, const char *name,
+			    struct xt_table_info *newinfo, unsigned char *base)
+{
+	struct xt_entry_target *t;
+	struct ip6t_entry *de;
+	unsigned int origsize;
+	int ret, h;
+	struct xt_entry_match *ematch;
+
+	ret = 0;
+	origsize = *size;
+	de = (struct ip6t_entry *)*dstptr;
+	memcpy(de, e, sizeof(struct ip6t_entry));
+	memcpy(&de->counters, &e->counters, sizeof(e->counters));
+
+	*dstptr += sizeof(struct ip6t_entry);
+	*size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
+
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_from_user(ematch, dstptr, size);
+		if (ret != 0)
+			return ret;
+	}
+	/* *size has grown, so (origsize - *size) wraps: subtracting it adds */
+	de->target_offset = e->target_offset - (origsize - *size);
+	t = compat_ip6t_get_target(e);
+	xt_compat_target_from_user(t, dstptr, size);
+
+	de->next_offset = e->next_offset - (origsize - *size);
+	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
+		if ((unsigned char *)de - base < newinfo->hook_entry[h])
+			newinfo->hook_entry[h] -= origsize - *size;
+		if ((unsigned char *)de - base < newinfo->underflow[h])
+			newinfo->underflow[h] -= origsize - *size;
+	}
+	return ret;
+}
+
+/*
+ * Run check_match() on every match of @e and then check_target() on its
+ * target. If any step fails, the matches that had already passed are undone
+ * with cleanup_match() and the error is returned.
+ */
+static int compat_check_entry(struct ip6t_entry *e, struct net *net,
+			      const char *name)
+{
+	struct xt_entry_match *ematch;
+	struct xt_mtchk_param mtpar;
+	unsigned int checked = 0;
+	int ret = 0;
+
+	mtpar.net	= net;
+	mtpar.table     = name;
+	mtpar.entryinfo = &e->ipv6;
+	mtpar.hook_mask = e->comefrom;
+	mtpar.family    = NFPROTO_IPV6;
+
+	xt_ematch_foreach(ematch, e) {
+		ret = check_match(ematch, &mtpar);
+		if (ret)
+			goto cleanup_matches;
+		checked++;
+	}
+
+	ret = check_target(e, net, name);
+	if (!ret)
+		return 0;
+
+ cleanup_matches:
+	/* only the first 'checked' matches got as far as a successful check */
+	xt_ematch_foreach(ematch, e) {
+		if (checked-- == 0)
+			break;
+		cleanup_match(ematch, net);
+	}
+	return ret;
+}
+
+/*
+ * Convert a compat rule blob into a newly allocated native table.
+ *
+ * On entry *@pinfo and *@pentry0 describe the compat blob (@total_size
+ * bytes, expected to hold @number entries). On success they are replaced by
+ * the new native xt_table_info and its entry blob, and the old info is
+ * freed. On failure they are left unchanged and a negative errno is
+ * returned; the caller still owns the original info.
+ */
+static int
+translate_compat_table(struct net *net,
+		       const char *name,
+		       unsigned int valid_hooks,
+		       struct xt_table_info **pinfo,
+		       void **pentry0,
+		       unsigned int total_size,
+		       unsigned int number,
+		       unsigned int *hook_entries,
+		       unsigned int *underflows)
+{
+	unsigned int i, j;
+	struct xt_table_info *newinfo, *info;
+	void *pos, *entry0, *entry1;
+	struct compat_ip6t_entry *iter0;
+	struct ip6t_entry *iter1;
+	unsigned int size;
+	int ret = 0;
+
+	info = *pinfo;
+	entry0 = *pentry0;
+	size = total_size;
+	info->number = number;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		info->hook_entry[i] = 0xFFFFFFFF;
+		info->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_compat_table: size %u\n", info->size);
+	/* j counts compat entries that passed the check and hold references */
+	j = 0;
+	xt_compat_lock(AF_INET6);
+	xt_compat_init_offsets(AF_INET6, number);
+	/* Walk through entries, checking offsets. */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+							entry0,
+							entry0 + total_size,
+							hook_entries,
+							underflows,
+							name);
+		if (ret != 0)
+			goto out_unlock;
+		++j;
+	}
+
+	ret = -EINVAL;
+	if (j != number) {
+		duprintf("translate_compat_table: %u not %u entries\n",
+			 j, number);
+		goto out_unlock;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(valid_hooks & (1 << i)))
+			continue;
+		if (info->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, hook_entries[i]);
+			goto out_unlock;
+		}
+		if (info->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, underflows[i]);
+			goto out_unlock;
+		}
+	}
+
+	ret = -ENOMEM;
+	/* size now includes the growth accumulated by the check pass */
+	newinfo = xt_alloc_table_info(size);
+	if (!newinfo)
+		goto out_unlock;
+
+	newinfo->number = number;
+	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = info->hook_entry[i];
+		newinfo->underflow[i] = info->underflow[i];
+	}
+	entry1 = newinfo->entries[raw_smp_processor_id()];
+	pos = entry1;
+	size = total_size;
+	/* Second pass: expand each compat entry into the native blob */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		ret = compat_copy_entry_from_user(iter0, &pos, &size,
+						  name, newinfo, entry1);
+		if (ret != 0)
+			break;
+	}
+	xt_compat_flush_offsets(AF_INET6);
+	xt_compat_unlock(AF_INET6);
+	if (ret)
+		goto free_newinfo;
+
+	ret = -ELOOP;
+	if (!mark_source_chains(newinfo, valid_hooks, entry1))
+		goto free_newinfo;
+
+	/* i counts native entries that passed compat_check_entry() */
+	i = 0;
+	xt_entry_foreach(iter1, entry1, newinfo->size) {
+		ret = compat_check_entry(iter1, net, name);
+		if (ret != 0)
+			break;
+		++i;
+		if (strcmp(ip6t_get_target(iter1)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
+	if (ret) {
+		/*
+		 * The first i matches need cleanup_entry (calls ->destroy)
+		 * because they had called ->check already. The other j-i
+		 * entries need only release.
+		 */
+		int skip = i;
+		j -= i;
+		xt_entry_foreach(iter0, entry0, newinfo->size) {
+			if (skip-- > 0)
+				continue;
+			if (j-- == 0)
+				break;
+			compat_release_entry(iter0);
+		}
+		xt_entry_foreach(iter1, entry1, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter1, net);
+		}
+		xt_free_table_info(newinfo);
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for_each_possible_cpu(i)
+		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
+			memcpy(newinfo->entries[i], entry1, newinfo->size);
+
+	*pinfo = newinfo;
+	*pentry0 = entry1;
+	xt_free_table_info(info);
+	return 0;
+
+free_newinfo:
+	xt_free_table_info(newinfo);
+out:
+	/* release the references taken for the first j compat entries */
+	xt_entry_foreach(iter0, entry0, total_size) {
+		if (j-- == 0)
+			break;
+		compat_release_entry(iter0);
+	}
+	return ret;
+out_unlock:
+	/* error while the compat lock is still held: drop it, then clean up */
+	xt_compat_flush_offsets(AF_INET6);
+	xt_compat_unlock(AF_INET6);
+	goto out;
+}
+
+/*
+ * IP6T_SO_SET_REPLACE (compat layout): read a struct compat_ip6t_replace
+ * header and the compat rule blob behind it from @user, convert it with
+ * translate_compat_table() and install it through __do_replace().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+compat_do_replace(struct net *net, void __user *user, unsigned int len)
+{
+	struct compat_ip6t_replace repl;
+	struct xt_table_info *newinfo;
+	struct ip6t_entry *iter;
+	void *blob;
+	int ret;
+
+	if (copy_from_user(&repl, user, sizeof(repl)))
+		return -EFAULT;
+
+	/* overflow check */
+	if (repl.size >= INT_MAX / num_possible_cpus() ||
+	    repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
+		return -ENOMEM;
+	repl.name[sizeof(repl.name)-1] = 0;
+
+	newinfo = xt_alloc_table_info(repl.size);
+	if (newinfo == NULL)
+		return -ENOMEM;
+
+	/* fill the copy that belongs to the CPU we happen to run on */
+	blob = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(blob, user + sizeof(repl), repl.size)) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	/* on success this swaps newinfo/blob for the native versions */
+	ret = translate_compat_table(net, repl.name, repl.valid_hooks,
+				     &newinfo, &blob, repl.size,
+				     repl.num_entries, repl.hook_entry,
+				     repl.underflow);
+	if (ret)
+		goto free_newinfo;
+
+	duprintf("compat_do_replace: Translated table\n");
+
+	ret = __do_replace(net, repl.name, repl.valid_hooks, newinfo,
+			   repl.num_counters, compat_ptr(repl.counters));
+	if (!ret)
+		return 0;
+
+	xt_entry_foreach(iter, blob, newinfo->size)
+		cleanup_entry(iter, net);
+ free_newinfo:
+	xt_free_table_info(newinfo);
+	return ret;
+}
+
+/*
+ * Compat setsockopt entry point: requires CAP_NET_ADMIN, then dispatches
+ * @cmd to the replace or add-counters handler. Unknown commands yield
+ * -EINVAL.
+ */
+static int
+compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
+		       unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IP6T_SO_SET_REPLACE:
+		return compat_do_replace(sock_net(sk), user, len);
+	case IP6T_SO_SET_ADD_COUNTERS:
+		return do_add_counters(sock_net(sk), user, len, 1);
+	default:
+		break;
+	}
+
+	duprintf("do_ip6t_set_ctl:  unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * Compat layout of the get-entries request read by compat_get_entries():
+ * table name, size of the entry area, then the entries themselves.
+ */
+struct compat_ip6t_get_entries {
+	char name[XT_TABLE_MAXNAMELEN];
+	compat_uint_t size;
+	struct compat_ip6t_entry entrytable[0];
+};
+
+/*
+ * Copy every rule of @table to @userptr in compat layout, pairing the n-th
+ * rule with the n-th element of a counter snapshot from alloc_counters().
+ *
+ * Returns 0, the alloc_counters() error, or the first per-entry copy error.
+ */
+static int
+compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
+			    void __user *userptr)
+{
+	const struct xt_table_info *private = table->private;
+	struct xt_counters *snapshot;
+	struct ip6t_entry *iter;
+	const void *entries;
+	void __user *pos = userptr;
+	unsigned int size = total_size;
+	unsigned int idx = 0;
+	int ret = 0;
+
+	snapshot = alloc_counters(table);
+	if (IS_ERR(snapshot))
+		return PTR_ERR(snapshot);
+
+	/*
+	 * Use the copy of the CPU we are on right now; the thread may
+	 * migrate afterwards, which is why this choice is only "lazy".
+	 */
+	entries = private->entries[raw_smp_processor_id()];
+	xt_entry_foreach(iter, entries, total_size) {
+		ret = compat_copy_entry_to_user(iter, &pos, &size,
+						snapshot, idx);
+		idx++;
+		if (ret)
+			break;
+	}
+
+	vfree(snapshot);
+	return ret;
+}
+
+/*
+ * IP6T_SO_GET_ENTRIES (compat layout): copy a table's rules to user space
+ * in compat form.
+ *
+ * @uptr: user buffer holding a struct compat_ip6t_get_entries header
+ *        followed by room for the entries
+ * @len:  total length of the user buffer
+ *
+ * Returns -EINVAL if *len does not match header + get.size, -EFAULT if the
+ * header cannot be read, -EAGAIN if the compat size of the table differs
+ * from get.size, the lookup error (-ENOENT when no table is returned), or
+ * otherwise the result of compat_table_info()/compat_copy_entries_to_user().
+ */
+static int
+compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
+		   int *len)
+{
+	int ret;
+	struct compat_ip6t_get_entries get;
+	struct xt_table *t;
+
+	if (*len < sizeof(get)) {
+		duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&get, uptr, sizeof(get)) != 0)
+		return -EFAULT;
+
+	if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) {
+		duprintf("compat_get_entries: %u != %zu\n",
+			 *len, sizeof(get) + get.size);
+		return -EINVAL;
+	}
+	/* The name is user-controlled: terminate it before using it as a string */
+	get.name[sizeof(get.name) - 1] = '\0';
+
+	xt_compat_lock(AF_INET6);
+	t = xt_find_table_lock(net, AF_INET6, get.name);
+	if (t && !IS_ERR(t)) {
+		const struct xt_table_info *private = t->private;
+		struct xt_table_info info;
+		duprintf("t->private->number = %u\n", private->number);
+		ret = compat_table_info(private, &info);
+		if (!ret && get.size == info.size) {
+			ret = compat_copy_entries_to_user(private->size,
+							  t, uptr->entrytable);
+		} else if (!ret) {
+			duprintf("compat_get_entries: I've got %u not %u!\n",
+				 private->size, get.size);
+			ret = -EAGAIN;
+		}
+		xt_compat_flush_offsets(AF_INET6);
+		module_put(t->me);
+		xt_table_unlock(t);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	xt_compat_unlock(AF_INET6);
+	return ret;
+}
+
+static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *);
+
+/*
+ * Compat getsockopt entry point: requires CAP_NET_ADMIN. GET_INFO and
+ * GET_ENTRIES use the compat paths; every other command is forwarded to
+ * do_ip6t_get_ctl().
+ */
+static int
+compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == IP6T_SO_GET_INFO)
+		return get_info(sock_net(sk), user, len, 1);
+	if (cmd == IP6T_SO_GET_ENTRIES)
+		return compat_get_entries(sock_net(sk), user, len);
+
+	return do_ip6t_get_ctl(sk, cmd, user, len);
+}
+#endif
+
+/*
+ * Native setsockopt entry point: requires CAP_NET_ADMIN, then dispatches
+ * @cmd to the replace or add-counters handler. Unknown commands yield
+ * -EINVAL.
+ */
+static int
+do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IP6T_SO_SET_REPLACE:
+		return do_replace(sock_net(sk), user, len);
+	case IP6T_SO_SET_ADD_COUNTERS:
+		return do_add_counters(sock_net(sk), user, len, 0);
+	default:
+		break;
+	}
+
+	duprintf("do_ip6t_set_ctl:  unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * Native getsockopt entry point: requires CAP_NET_ADMIN, then handles table
+ * info, table entries and match/target revision queries. Unknown commands
+ * yield -EINVAL.
+ */
+static int
+do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd == IP6T_SO_GET_INFO)
+		return get_info(sock_net(sk), user, len, 0);
+
+	if (cmd == IP6T_SO_GET_ENTRIES)
+		return get_entries(sock_net(sk), user, len);
+
+	if (cmd == IP6T_SO_GET_REVISION_MATCH ||
+	    cmd == IP6T_SO_GET_REVISION_TARGET) {
+		struct xt_get_revision rev;
+		int target = (cmd == IP6T_SO_GET_REVISION_TARGET);
+
+		if (*len != sizeof(rev))
+			return -EINVAL;
+		if (copy_from_user(&rev, user, sizeof(rev)))
+			return -EFAULT;
+		rev.name[sizeof(rev.name)-1] = 0;
+
+		/* the lookup stores its verdict in ret */
+		try_then_request_module(xt_find_revision(AF_INET6, rev.name,
+							 rev.revision,
+							 target, &ret),
+					"ip6t_%s", rev.name);
+		return ret;
+	}
+
+	duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd);
+	return -EINVAL;
+}
+
+/*
+ * Build a table from the initial rule set @repl and register it for @net.
+ *
+ * Returns the registered table, or an ERR_PTR() carrying -ENOMEM, the
+ * translate_table() error or the xt_register_table() error.
+ */
+struct xt_table *ip6t_register_table(struct net *net,
+				     const struct xt_table *table,
+				     const struct ip6t_replace *repl)
+{
+	struct xt_table_info bootstrap = {0};
+	struct xt_table_info *newinfo;
+	struct xt_table *new_table;
+	void *blob;
+	int ret;
+
+	newinfo = xt_alloc_table_info(repl->size);
+	if (newinfo == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* any CPU's copy will do here, so preemption is not a concern */
+	blob = newinfo->entries[raw_smp_processor_id()];
+	memcpy(blob, repl->entries, repl->size);
+
+	ret = translate_table(net, newinfo, blob, repl);
+	if (ret == 0) {
+		new_table = xt_register_table(net, table, &bootstrap, newinfo);
+		if (!IS_ERR(new_table))
+			return new_table;
+		ret = PTR_ERR(new_table);
+	}
+
+	xt_free_table_info(newinfo);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Unregister @table, run cleanup_entry() on each of its rules and free the
+ * rule storage. The owner module is remembered up front because @table is
+ * not touched again after xt_unregister_table().
+ */
+void ip6t_unregister_table(struct net *net, struct xt_table *table)
+{
+	struct module *owner = table->me;
+	struct xt_table_info *info;
+	struct ip6t_entry *iter;
+	void *entries;
+
+	info = xt_unregister_table(table);
+
+	/* Decrease module usage counts and free resources */
+	entries = info->entries[raw_smp_processor_id()];
+	xt_entry_foreach(iter, entries, info->size) {
+		cleanup_entry(iter, net);
+	}
+
+	/* more rules than the initial set: drop the owner reference too */
+	if (info->number > info->initial_entries)
+		module_put(owner);
+
+	xt_free_table_info(info);
+}
+
+/*
+ * True when (@type, @code) matches @test_type with a code inside
+ * [@min_code, @max_code]; @invert flips the result.
+ */
+static inline bool
+icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+		     u_int8_t type, u_int8_t code,
+		     bool invert)
+{
+	bool in_range = type == test_type &&
+			code >= min_code && code <= max_code;
+
+	return in_range != invert;
+}
+
+/*
+ * Match callback of the built-in "icmp6" match: compares the packet's
+ * ICMPv6 type/code with the rule's settings. Fragments never match; a
+ * packet whose ICMPv6 header cannot be read is marked for hotdrop.
+ */
+static bool
+icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_icmp *icmpinfo = par->matchinfo;
+	const struct icmp6hdr *ic;
+	struct icmp6hdr hdr_buf;
+	bool invert;
+
+	/* Must not be a fragment. */
+	if (par->fragoff != 0)
+		return false;
+
+	ic = skb_header_pointer(skb, par->thoff, sizeof(hdr_buf), &hdr_buf);
+	if (!ic) {
+		/* Asked to examine the packet but unable to: drop it. */
+		duprintf("Dropping evil ICMP tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	invert = !!(icmpinfo->invflags&IP6T_ICMP_INV);
+	return icmp6_type_code_match(icmpinfo->type,
+				     icmpinfo->code[0], icmpinfo->code[1],
+				     ic->icmp6_type, ic->icmp6_code,
+				     invert);
+}
+
+/*
+ * Rule-insertion check for the "icmp6" match: reject any inversion flag
+ * other than IP6T_ICMP_INV.
+ */
+static int icmp6_checkentry(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+	if (icmpinfo->invflags & ~IP6T_ICMP_INV)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * The built-in targets: standard (NULL) and error.
+ * Registered in ip6_tables_init() and removed in ip6_tables_fini().
+ */
+static struct xt_target ip6t_builtin_tg[] __read_mostly = {
+	{
+		/* standard target: no .target callback, payload is one int */
+		.name             = XT_STANDARD_TARGET,
+		.targetsize       = sizeof(int),
+		.family           = NFPROTO_IPV6,
+#ifdef CONFIG_COMPAT
+		.compatsize       = sizeof(compat_int_t),
+		.compat_from_user = compat_standard_from_user,
+		.compat_to_user   = compat_standard_to_user,
+#endif
+	},
+	{
+		/* error target: handled by ip6t_error() */
+		.name             = XT_ERROR_TARGET,
+		.target           = ip6t_error,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
+		.family           = NFPROTO_IPV6,
+	},
+};
+
+/*
+ * Socket option registration for PF_INET6: maps the IP6T_* set/get option
+ * ranges to the handlers in this file (plus the compat variants when
+ * CONFIG_COMPAT is enabled).
+ */
+static struct nf_sockopt_ops ip6t_sockopts = {
+	.pf		= PF_INET6,
+	.set_optmin	= IP6T_BASE_CTL,
+	.set_optmax	= IP6T_SO_SET_MAX+1,
+	.set		= do_ip6t_set_ctl,
+#ifdef CONFIG_COMPAT
+	.compat_set	= compat_do_ip6t_set_ctl,
+#endif
+	.get_optmin	= IP6T_BASE_CTL,
+	.get_optmax	= IP6T_SO_GET_MAX+1,
+	.get		= do_ip6t_get_ctl,
+#ifdef CONFIG_COMPAT
+	.compat_get	= compat_do_ip6t_get_ctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * The built-in matches; currently only "icmp6", restricted to
+ * IPPROTO_ICMPV6 rules. Registered in ip6_tables_init().
+ */
+static struct xt_match ip6t_builtin_mt[] __read_mostly = {
+	{
+		.name       = "icmp6",
+		.match      = icmp6_match,
+		.matchsize  = sizeof(struct ip6t_icmp),
+		.checkentry = icmp6_checkentry,
+		.proto      = IPPROTO_ICMPV6,
+		.family     = NFPROTO_IPV6,
+	},
+};
+
+/* Per-namespace setup: forwards to xt_proto_init() for NFPROTO_IPV6. */
+static int __net_init ip6_tables_net_init(struct net *net)
+{
+	return xt_proto_init(net, NFPROTO_IPV6);
+}
+
+/* Per-namespace teardown: forwards to xt_proto_fini() for NFPROTO_IPV6. */
+static void __net_exit ip6_tables_net_exit(struct net *net)
+{
+	xt_proto_fini(net, NFPROTO_IPV6);
+}
+
+/* Namespace hooks registered by ip6_tables_init(). */
+static struct pernet_operations ip6_tables_net_ops = {
+	.init = ip6_tables_net_init,
+	.exit = ip6_tables_net_exit,
+};
+
+/*
+ * Module init: register the per-namespace hooks, the built-in targets and
+ * matches, and finally the sockopt interface. A failure at any step unwinds
+ * the earlier registrations in reverse order and returns the error.
+ */
+static int __init ip6_tables_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&ip6_tables_net_ops);
+	if (err < 0)
+		return err;
+
+	/* No one else will be downing sem now, so we won't sleep */
+	err = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+	if (err < 0)
+		goto unreg_pernet;
+
+	err = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
+	if (err < 0)
+		goto unreg_targets;
+
+	/* Register setsockopt */
+	err = nf_register_sockopt(&ip6t_sockopts);
+	if (err < 0)
+		goto unreg_matches;
+
+	pr_info("(C) 2000-2006 Netfilter Core Team\n");
+	return 0;
+
+unreg_matches:
+	xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
+unreg_targets:
+	xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+unreg_pernet:
+	unregister_pernet_subsys(&ip6_tables_net_ops);
+	return err;
+}
+
+/*
+ * Module exit: undo ip6_tables_init() in reverse order (sockopts, matches,
+ * targets, per-namespace hooks).
+ */
+static void __exit ip6_tables_fini(void)
+{
+	nf_unregister_sockopt(&ip6t_sockopts);
+
+	xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
+	xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
+	unregister_pernet_subsys(&ip6_tables_net_ops);
+}
+
+/*
+ * Find the offset of the specified header, or, if target < 0, the protocol
+ * number of the last header. "Last header" means the transport protocol
+ * header, ESP, or "No next header".
+ *
+ * If the target header is found, its offset is stored in *offset and its
+ * protocol number is returned. Otherwise -ENOENT or -EBADMSG is returned.
+ *
+ * If the first fragment doesn't contain the final protocol header or
+ * NEXTHDR_NONE it is considered invalid.
+ *
+ * A non-first fragment is a special case: "the protocol number of the last
+ * header" is the "next header" field of the Fragment header, and *offset is
+ * meaningless. If fragoff is not NULL the fragment offset is stored in
+ * *fragoff; if it is NULL, -EINVAL is returned.
+ */
+int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
+		  int target, unsigned short *fragoff)
+{
+	unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
+	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+
+	if (fragoff)
+		*fragoff = 0;
+
+	/* walk the extension header chain until the wanted header shows up */
+	while (nexthdr != target) {
+		struct ipv6_opt_hdr _hdr, *hp;
+		unsigned int hdrlen;
+
+		if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
+			/* end of chain: fine only when asked for the last one */
+			if (target < 0)
+				break;
+			return -ENOENT;
+		}
+
+		hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return -EBADMSG;
+		if (nexthdr == NEXTHDR_FRAGMENT) {
+			unsigned short _frag_off;
+			__be16 *fp;
+			fp = skb_header_pointer(skb,
+						start+offsetof(struct frag_hdr,
+							       frag_off),
+						sizeof(_frag_off),
+						&_frag_off);
+			if (fp == NULL)
+				return -EBADMSG;
+
+			/* low three bits are not part of the offset */
+			_frag_off = ntohs(*fp) & ~0x7;
+			if (_frag_off) {
+				if (target < 0 &&
+				    ((!ipv6_ext_hdr(hp->nexthdr)) ||
+				     hp->nexthdr == NEXTHDR_NONE)) {
+					if (fragoff) {
+						*fragoff = _frag_off;
+						return hp->nexthdr;
+					} else {
+						return -EINVAL;
+					}
+				}
+				return -ENOENT;
+			}
+			hdrlen = 8;
+		} else if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hp->hdrlen + 2) << 2;
+		else
+			hdrlen = ipv6_optlen(hp);
+
+		nexthdr = hp->nexthdr;
+		start += hdrlen;
+	}
+
+	*offset = start;
+	return nexthdr;
+}
+
+EXPORT_SYMBOL(ip6t_register_table);
+EXPORT_SYMBOL(ip6t_unregister_table);
+EXPORT_SYMBOL(ip6t_do_table);
+EXPORT_SYMBOL(ip6t_ext_hdr);
+EXPORT_SYMBOL(ipv6_find_hdr);
+
+module_init(ip6_tables_init);
+module_exit(ip6_tables_fini);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
new file mode 100644
index 00000000..e6af8d72
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -0,0 +1,527 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+
+/* (C) 2001 Jan Rekorajski <baggins@pld.org.pl>
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <linux/spinlock.h>
+#include <linux/icmpv6.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
+
+MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
+MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
+MODULE_LICENSE("GPL");
+
+struct in_device;
+#include <net/route.h>
+#include <linux/netfilter_ipv6/ip6t_LOG.h>
+
+/*
+ * Append a textual description of the IPv6 packet that starts at offset
+ * @ip6hoff inside @skb to the log buffer @m.
+ *
+ * @info selects which optional fields are printed (logflags); for a
+ * non-LOG loginfo type every flag in NF_LOG_MASK is used.
+ *
+ * @recurse is non-zero for the packet actually being logged and zero when
+ * dumping the packet embedded in an ICMPv6 error message.  The embedded
+ * packet is dumped through one nested call with @recurse == 0, so:
+ * one level of recursion won't kill us.
+ *
+ * Socket (UID/GID) and mark information belong to @skb itself, so they are
+ * only printed for the outer packet (@recurse != 0).
+ */
+static void dump_packet(struct sbuff *m,
+			const struct nf_loginfo *info,
+			const struct sk_buff *skb, unsigned int ip6hoff,
+			int recurse)
+{
+	u_int8_t currenthdr;
+	int fragment;
+	struct ipv6hdr _ip6h;
+	const struct ipv6hdr *ih;
+	unsigned int ptr;
+	unsigned int hdrlen = 0;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
+
+	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+	if (ih == NULL) {
+		sb_add(m, "TRUNCATED");
+		return;
+	}
+
+	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+	sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+	sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+	       ih->hop_limit,
+	       (ntohl(*(__be32 *)ih) & 0x000fffff));
+
+	/*
+	 * Walk the extension header chain.  'fragment' is set once a
+	 * Fragment header with a non-zero offset has been seen; from then on
+	 * the following headers are not present in this packet and must not
+	 * be parsed.
+	 */
+	fragment = 0;
+	ptr = ip6hoff + sizeof(struct ipv6hdr);
+	currenthdr = ih->nexthdr;
+	while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
+		struct ipv6_opt_hdr _hdr;
+		const struct ipv6_opt_hdr *hp;
+
+		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+		if (hp == NULL) {
+			sb_add(m, "TRUNCATED");
+			return;
+		}
+
+		/* Max length: 48 "OPT (...) " */
+		if (logflags & IP6T_LOG_IPOPT)
+			sb_add(m, "OPT ( ");
+
+		switch (currenthdr) {
+		case IPPROTO_FRAGMENT: {
+			struct frag_hdr _fhdr;
+			const struct frag_hdr *fh;
+
+			sb_add(m, "FRAG:");
+			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+						&_fhdr);
+			if (fh == NULL) {
+				sb_add(m, "TRUNCATED ");
+				return;
+			}
+
+			/* Max length: 6 "65535 " */
+			sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+			/* Max length: 11 "INCOMPLETE " */
+			if (fh->frag_off & htons(0x0001))
+				sb_add(m, "INCOMPLETE ");
+
+			sb_add(m, "ID:%08x ", ntohl(fh->identification));
+
+			if (ntohs(fh->frag_off) & 0xFFF8)
+				fragment = 1;
+
+			/* The Fragment header has a fixed size of 8 bytes. */
+			hdrlen = 8;
+
+			break;
+		}
+		case IPPROTO_DSTOPTS:
+		case IPPROTO_ROUTING:
+		case IPPROTO_HOPOPTS:
+			if (fragment) {
+				if (logflags & IP6T_LOG_IPOPT)
+					sb_add(m, ")");
+				return;
+			}
+			hdrlen = ipv6_optlen(hp);
+			break;
+		case IPPROTO_AH:
+			if (logflags & IP6T_LOG_IPOPT) {
+				struct ip_auth_hdr _ahdr;
+				const struct ip_auth_hdr *ah;
+
+				/* Max length: 3 "AH " */
+				sb_add(m, "AH ");
+
+				if (fragment) {
+					sb_add(m, ")");
+					return;
+				}
+
+				ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+							&_ahdr);
+				if (ah == NULL) {
+					/*
+					 * Max length: 26 "INCOMPLETE [65535
+					 *  bytes] )"
+					 */
+					sb_add(m, "INCOMPLETE [%u bytes] )",
+					       skb->len - ptr);
+					return;
+				}
+
+				/* Length: 15 "SPI=0xF1234567 " */
+				sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
+
+			}
+
+			/* AH length is expressed in 4-byte units, minus 2. */
+			hdrlen = (hp->hdrlen+2)<<2;
+			break;
+		case IPPROTO_ESP:
+			if (logflags & IP6T_LOG_IPOPT) {
+				struct ip_esp_hdr _esph;
+				const struct ip_esp_hdr *eh;
+
+				/* Max length: 4 "ESP " */
+				sb_add(m, "ESP ");
+
+				if (fragment) {
+					sb_add(m, ")");
+					return;
+				}
+
+				/*
+				 * Max length: 26 "INCOMPLETE [65535 bytes] )"
+				 */
+				eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+							&_esph);
+				if (eh == NULL) {
+					sb_add(m, "INCOMPLETE [%u bytes] )",
+					       skb->len - ptr);
+					return;
+				}
+
+				/* Length: 16 "SPI=0xF1234567 )" */
+				sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
+
+			}
+			/* Nothing after an ESP header is parsed. */
+			return;
+		default:
+			/* Max length: 20 "Unknown Ext Hdr 255" */
+			sb_add(m, "Unknown Ext Hdr %u", currenthdr);
+			return;
+		}
+		if (logflags & IP6T_LOG_IPOPT)
+			sb_add(m, ") ");
+
+		currenthdr = hp->nexthdr;
+		ptr += hdrlen;
+	}
+
+	/* Upper-layer protocol; its header starts at 'ptr'. */
+	switch (currenthdr) {
+	case IPPROTO_TCP: {
+		struct tcphdr _tcph;
+		const struct tcphdr *th;
+
+		/* Max length: 10 "PROTO=TCP " */
+		sb_add(m, "PROTO=TCP ");
+
+		if (fragment)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
+		if (th == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
+			return;
+		}
+
+		/* Max length: 20 "SPT=65535 DPT=65535 " */
+		sb_add(m, "SPT=%u DPT=%u ",
+		       ntohs(th->source), ntohs(th->dest));
+		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+		if (logflags & IP6T_LOG_TCPSEQ)
+			sb_add(m, "SEQ=%u ACK=%u ",
+			       ntohl(th->seq), ntohl(th->ack_seq));
+		/* Max length: 13 "WINDOW=65535 " */
+		sb_add(m, "WINDOW=%u ", ntohs(th->window));
+		/* Max length: 9 "RES=0x3C " */
+		sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+		if (th->cwr)
+			sb_add(m, "CWR ");
+		if (th->ece)
+			sb_add(m, "ECE ");
+		if (th->urg)
+			sb_add(m, "URG ");
+		if (th->ack)
+			sb_add(m, "ACK ");
+		if (th->psh)
+			sb_add(m, "PSH ");
+		if (th->rst)
+			sb_add(m, "RST ");
+		if (th->syn)
+			sb_add(m, "SYN ");
+		if (th->fin)
+			sb_add(m, "FIN ");
+		/* Max length: 11 "URGP=65535 " */
+		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+		if ((logflags & IP6T_LOG_TCPOPT) &&
+		    th->doff * 4 > sizeof(struct tcphdr)) {
+			/* doff is a 4-bit field, so options are at most 40 bytes. */
+			u_int8_t _opt[60 - sizeof(struct tcphdr)];
+			const u_int8_t *op;
+			unsigned int i;
+			unsigned int optsize = th->doff * 4
+					       - sizeof(struct tcphdr);
+
+			op = skb_header_pointer(skb,
+						ptr + sizeof(struct tcphdr),
+						optsize, _opt);
+			if (op == NULL) {
+				sb_add(m, "OPT (TRUNCATED)");
+				return;
+			}
+
+			/* Max length: 127 "OPT (" 15*4*2chars ") " */
+			sb_add(m, "OPT (");
+			for (i =0; i < optsize; i++)
+				sb_add(m, "%02X", op[i]);
+			sb_add(m, ") ");
+		}
+		break;
+	}
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE: {
+		struct udphdr _udph;
+		const struct udphdr *uh;
+
+		if (currenthdr == IPPROTO_UDP)
+			/* Max length: 10 "PROTO=UDP "     */
+			sb_add(m, "PROTO=UDP " );
+		else	/* Max length: 14 "PROTO=UDPLITE " */
+			sb_add(m, "PROTO=UDPLITE ");
+
+		if (fragment)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
+		if (uh == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
+			return;
+		}
+
+		/* Max length: 20 "SPT=65535 DPT=65535 " */
+		sb_add(m, "SPT=%u DPT=%u LEN=%u ",
+		       ntohs(uh->source), ntohs(uh->dest),
+		       ntohs(uh->len));
+		break;
+	}
+	case IPPROTO_ICMPV6: {
+		struct icmp6hdr _icmp6h;
+		const struct icmp6hdr *ic;
+
+		/* Max length: 13 "PROTO=ICMPv6 " */
+		sb_add(m, "PROTO=ICMPv6 ");
+
+		if (fragment)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+		if (ic == NULL) {
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
+			return;
+		}
+
+		/* Max length: 18 "TYPE=255 CODE=255 " */
+		sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+
+		switch (ic->icmp6_type) {
+		case ICMPV6_ECHO_REQUEST:
+		case ICMPV6_ECHO_REPLY:
+			/* Max length: 19 "ID=65535 SEQ=65535 " */
+			sb_add(m, "ID=%u SEQ=%u ",
+				ntohs(ic->icmp6_identifier),
+				ntohs(ic->icmp6_sequence));
+			break;
+		case ICMPV6_MGM_QUERY:
+		case ICMPV6_MGM_REPORT:
+		case ICMPV6_MGM_REDUCTION:
+			break;
+
+		case ICMPV6_PARAMPROB:
+			/* Max length: 17 "POINTER=ffffffff " */
+			sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
+			/* Fall through */
+		case ICMPV6_DEST_UNREACH:
+		case ICMPV6_PKT_TOOBIG:
+		case ICMPV6_TIME_EXCEED:
+			/*
+			 * Error messages embed the offending packet right
+			 * after the ICMPv6 header; dump it once, with
+			 * recursion disabled for the nested call.
+			 */
+			/* Max length: 3+maxlen */
+			if (recurse) {
+				sb_add(m, "[");
+				dump_packet(m, info, skb,
+					    ptr + sizeof(_icmp6h), 0);
+				sb_add(m, "] ");
+			}
+
+			/* Max length: 10 "MTU=65535 " */
+			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
+				sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
+		}
+		break;
+	}
+	/* Max length: 10 "PROTO=255 " */
+	default:
+		sb_add(m, "PROTO=%u ", currenthdr);
+	}
+
+	/* Max length: 15 "UID=4294967295 " */
+	if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+			sb_add(m, "UID=%u GID=%u ",
+				skb->sk->sk_socket->file->f_cred->fsuid,
+				skb->sk->sk_socket->file->f_cred->fsgid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+	}
+
+	/*
+	 * The mark is a property of the skb being logged, so print it for
+	 * the outer packet only (recurse != 0), not inside the dump of a
+	 * packet embedded in an ICMPv6 error.
+	 */
+	/* Max length: 16 "MARK=0xFFFFFFFF " */
+	if (recurse && skb->mark)
+		sb_add(m, "MARK=0x%x ", skb->mark);
+}
+
+/*
+ * Append a description of the link-layer header of @skb to @m.
+ *
+ * With IP6T_LOG_MACDECODE set (LOG-type loginfo only) and an Ethernet
+ * device, the header is decoded into MACSRC/MACDST/MACPROTO.  Otherwise the
+ * raw header bytes are printed as "MAC=xx:xx:... ", or just "MAC= " when
+ * the device has no hard header or the MAC and network headers coincide.
+ */
+static void dump_mac_header(struct sbuff *m,
+			    const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & IP6T_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	sb_add(m, "MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int len = dev->hard_header_len;
+		unsigned int i;
+
+		/*
+		 * For SIT devices the dump starts ETH_HLEN bytes before the
+		 * MAC header pointer; note that p is moved as a side effect
+		 * of the test.  If that would fall before the start of the
+		 * buffer, no bytes are printed at all.
+		 */
+		if (dev->type == ARPHRD_SIT &&
+		    (p -= ETH_HLEN) < skb->head)
+			p = NULL;
+
+		/* Hex dump of 'len' bytes, colon separated. */
+		if (p != NULL) {
+			sb_add(m, "%02x", *p++);
+			for (i = 1; i < len; i++)
+				sb_add(m, ":%02x", *p++);
+		}
+		sb_add(m, " ");
+
+		/*
+		 * For SIT the bytes at the MAC header pointer are read as an
+		 * IPv4 header and its addresses are logged as the tunnel
+		 * endpoints.
+		 */
+		if (dev->type == ARPHRD_SIT) {
+			const struct iphdr *iph =
+				(struct iphdr *)skb_mac_header(skb);
+			sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+		}
+	} else
+		sb_add(m, " ");
+}
+
+/*
+ * Log settings used by ip6t_log_packet() when the caller supplies no
+ * loginfo: level 5 with every flag in NF_LOG_MASK enabled.
+ */
+static struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level	  = 5,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+/*
+ * Format one log line for @skb: "<level>prefixIN=... OUT=... ", followed by
+ * the link-layer header (only when an input device is given) and the IPv6
+ * packet dump.  A NULL @loginfo selects default_loginfo.  @pf and @hooknum
+ * are not used here.
+ */
+static void
+ip6t_log_packet(u_int8_t pf,
+		unsigned int hooknum,
+		const struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		const struct nf_loginfo *loginfo,
+		const char *prefix)
+{
+	struct sbuff *m;
+	const char *in_name;
+	const char *out_name;
+
+	m = sb_open();
+
+	if (loginfo == NULL)
+		loginfo = &default_loginfo;
+
+	/* Missing devices are logged as empty names. */
+	in_name = (in != NULL) ? in->name : "";
+	out_name = (out != NULL) ? out->name : "";
+
+	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	       prefix, in_name, out_name);
+
+	if (in)
+		dump_mac_header(m, loginfo, skb);
+
+	/* recurse=1: this is the outer packet being logged. */
+	dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
+
+	sb_close(m);
+}
+
+/*
+ * LOG target: log the packet using the level, flags and prefix configured
+ * in the rule, then let rule traversal continue.
+ */
+static unsigned int
+log_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ip6t_log_info *cfg = par->targinfo;
+	struct nf_loginfo li;
+
+	/* Translate the rule's settings into the generic loginfo form. */
+	li.u.log.logflags = cfg->logflags;
+	li.u.log.level = cfg->level;
+	li.type = NF_LOG_TYPE_LOG;
+
+	ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, par->out,
+			&li, cfg->prefix);
+
+	return XT_CONTINUE;
+}
+
+
+/*
+ * Validate a LOG rule: the level must be below 8 and the prefix buffer must
+ * end in a NUL byte.  Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int log_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_log_info *loginfo = par->targinfo;
+	const size_t last = sizeof(loginfo->prefix) - 1;
+
+	if (loginfo->level >= 8) {
+		pr_debug("level %u >= 8\n", loginfo->level);
+		return -EINVAL;
+	}
+
+	if (loginfo->prefix[last] == '\0')
+		return 0;
+
+	pr_debug("prefix not null-terminated\n");
+	return -EINVAL;
+}
+
+/* Xtables registration record for the IPv6 "LOG" target. */
+static struct xt_target log_tg6_reg __read_mostly = {
+	.name 		= "LOG",
+	.family		= NFPROTO_IPV6,
+	.target 	= log_tg6,
+	.targetsize	= sizeof(struct ip6t_log_info),
+	.checkentry	= log_tg6_check,
+	.me 		= THIS_MODULE,
+};
+
+/* nf_log backend record; its log function is ip6t_log_packet(). */
+static struct nf_logger ip6t_logger __read_mostly = {
+	.name		= "ip6t_LOG",
+	.logfn		= &ip6t_log_packet,
+	.me		= THIS_MODULE,
+};
+
+/*
+ * Module init: register the target, then the logger.  Only a failure of
+ * the target registration is reported; the return value of
+ * nf_log_register() is not checked.
+ */
+static int __init log_tg6_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&log_tg6_reg);
+	if (ret < 0)
+		return ret;
+	nf_log_register(NFPROTO_IPV6, &ip6t_logger);
+	return 0;
+}
+
+/* Module exit: undo the registrations in reverse order. */
+static void __exit log_tg6_exit(void)
+{
+	nf_log_unregister(&ip6t_logger);
+	xt_unregister_target(&log_tg6_reg);
+}
+
+module_init(log_tg6_init);
+module_exit(log_tg6_exit);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
new file mode 100644
index 00000000..09d30498
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -0,0 +1,271 @@
+/*
+ * IP6 tables REJECT target module
+ * Linux INET6 implementation
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Authors:
+ *	Yasuyuki Kozakai	<yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on net/ipv4/netfilter/ipt_REJECT.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netdevice.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/flow.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_REJECT.h>
+
+MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
+MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6");
+MODULE_LICENSE("GPL");
+
+/*
+ * Send RST reply
+ *
+ * Build a bare TCP RST segment answering @oldskb and send it back to the
+ * original sender.  Nothing is sent when either address is not unicast,
+ * when no complete TCP header can be located, when the original segment is
+ * itself a RST, when its checksum is invalid, or when no route or buffer
+ * can be obtained.
+ */
+static void send_reset(struct net *net, struct sk_buff *oldskb)
+{
+	struct sk_buff *nskb;
+	struct tcphdr otcph, *tcph;
+	unsigned int otcplen, hh_len;
+	int tcphoff, needs_ack;
+	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
+	struct ipv6hdr *ip6h;
+#define DEFAULT_TOS_VALUE	0x0U
+	const __u8 tclass = DEFAULT_TOS_VALUE;
+	struct dst_entry *dst = NULL;
+	u8 proto;
+	struct flowi6 fl6;
+
+	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+		pr_debug("addr is not unicast.\n");
+		return;
+	}
+
+	/* Skip the extension headers to find the transport header. */
+	proto = oip6h->nexthdr;
+	tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
+
+	if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
+		pr_debug("Cannot get TCP header.\n");
+		return;
+	}
+
+	/* Length of the original TCP header plus payload. */
+	otcplen = oldskb->len - tcphoff;
+
+	/* IP header checks: fragment, too short. */
+	if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) {
+		pr_debug("proto(%d) != IPPROTO_TCP, "
+			 "or too short. otcplen = %d\n",
+			 proto, otcplen);
+		return;
+	}
+
+	/* The length was checked above, so a failing copy is a bug. */
+	if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
+		BUG();
+
+	/* No RST for RST. */
+	if (otcph.rst) {
+		pr_debug("RST is set\n");
+		return;
+	}
+
+	/* Check checksum. */
+	if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
+			    skb_checksum(oldskb, tcphoff, otcplen, 0))) {
+		pr_debug("TCP checksum is invalid\n");
+		return;
+	}
+
+	/* Route lookup for the reply: addresses and ports are swapped. */
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = IPPROTO_TCP;
+	ipv6_addr_copy(&fl6.saddr, &oip6h->daddr);
+	ipv6_addr_copy(&fl6.daddr, &oip6h->saddr);
+	fl6.fl6_sport = otcph.dest;
+	fl6.fl6_dport = otcph.source;
+	security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst == NULL || dst->error) {
+		dst_release(dst);
+		return;
+	}
+	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+	if (IS_ERR(dst))
+		return;
+
+	/* Link-layer header room, rounded up to a multiple of 16 bytes. */
+	hh_len = (dst->dev->hard_header_len + 15)&~15;
+	nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+			 + sizeof(struct tcphdr) + dst->trailer_len,
+			 GFP_ATOMIC);
+
+	if (!nskb) {
+		if (net_ratelimit())
+			pr_debug("cannot alloc skb\n");
+		dst_release(dst);
+		return;
+	}
+
+	/* From here on the dst reference is held through nskb. */
+	skb_dst_set(nskb, dst);
+
+	skb_reserve(nskb, hh_len + dst->header_len);
+
+	/* IPv6 header: version 6, traffic class, addresses swapped. */
+	skb_put(nskb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(nskb);
+	ip6h = ipv6_hdr(nskb);
+	*(__be32 *)ip6h =  htonl(0x60000000 | (tclass << 20));
+	ip6h->hop_limit = ip6_dst_hoplimit(dst);
+	ip6h->nexthdr = IPPROTO_TCP;
+	ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr);
+	ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr);
+
+	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+	/* Truncate to length (no data) */
+	tcph->doff = sizeof(struct tcphdr)/4;
+	tcph->source = otcph.dest;
+	tcph->dest = otcph.source;
+
+	/*
+	 * If the original segment carried an ACK, the RST takes its sequence
+	 * number from that ACK.  Otherwise the RST acknowledges everything
+	 * the original segment occupied: payload length plus one for each of
+	 * SYN and FIN.
+	 */
+	if (otcph.ack) {
+		needs_ack = 0;
+		tcph->seq = otcph.ack_seq;
+		tcph->ack_seq = 0;
+	} else {
+		needs_ack = 1;
+		tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
+				      + otcplen - (otcph.doff<<2));
+		tcph->seq = 0;
+	}
+
+	/* Reset flags */
+	((u_int8_t *)tcph)[13] = 0;
+	tcph->rst = 1;
+	tcph->ack = needs_ack;
+	tcph->window = 0;
+	tcph->urg_ptr = 0;
+	tcph->check = 0;
+
+	/* Adjust TCP checksum */
+	tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr,
+				      &ipv6_hdr(nskb)->daddr,
+				      sizeof(struct tcphdr), IPPROTO_TCP,
+				      csum_partial(tcph,
+						   sizeof(struct tcphdr), 0));
+
+	nf_ct_attach(nskb, oldskb);
+
+	ip6_local_out(nskb);
+}
+
+/*
+ * Answer @skb_in with an ICMPv6 Destination Unreachable message carrying
+ * @code.  On the LOCAL_OUT hook a packet without a device is assigned the
+ * namespace's loopback device first.  When
+ * CONFIG_IP6_NF_TARGET_REJECT_SKERR is set, the error is also reported on
+ * the socket that owns the packet, if any.
+ */
+static inline void
+send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code,
+	     unsigned int hooknum)
+{
+	if (skb_in->dev == NULL && hooknum == NF_INET_LOCAL_OUT)
+		skb_in->dev = net->loopback_dev;
+
+	icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
+#ifdef CONFIG_IP6_NF_TARGET_REJECT_SKERR
+	if (skb_in->sk) {
+		struct sock *sk = skb_in->sk;
+
+		icmpv6_err_convert(ICMPV6_DEST_UNREACH, code, &sk->sk_err);
+		sk->sk_error_report(sk);
+		pr_debug("ip6t_REJECT: sk_err=%d for skb=%p sk=%p\n",
+			sk->sk_err, skb_in, sk);
+	}
+#endif
+}
+
+/*
+ * REJECT target: answer the packet according to the rule's 'with' setting
+ * (an ICMPv6 Destination Unreachable code or a TCP RST) and drop it.  The
+ * verdict is always NF_DROP, including for unhandled settings.
+ */
+static unsigned int
+reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ip6t_reject_info *reject = par->targinfo;
+	struct net *net;
+	unsigned char code;
+
+	/* Network namespace of the input device, else of the output one. */
+	if (par->in != NULL)
+		net = dev_net(par->in);
+	else
+		net = dev_net(par->out);
+
+	pr_debug("%s: medium point\n", __func__);
+
+	/* Cases that do not send an unreachable message return directly. */
+	switch (reject->with) {
+	case IP6T_TCP_RESET:
+		send_reset(net, skb);
+		return NF_DROP;
+	case IP6T_ICMP6_ECHOREPLY:
+		/* Do nothing */
+		return NF_DROP;
+	case IP6T_ICMP6_NO_ROUTE:
+		code = ICMPV6_NOROUTE;
+		break;
+	case IP6T_ICMP6_ADM_PROHIBITED:
+		code = ICMPV6_ADM_PROHIBITED;
+		break;
+	case IP6T_ICMP6_NOT_NEIGHBOUR:
+		code = ICMPV6_NOT_NEIGHBOUR;
+		break;
+	case IP6T_ICMP6_ADDR_UNREACH:
+		code = ICMPV6_ADDR_UNREACH;
+		break;
+	case IP6T_ICMP6_PORT_UNREACH:
+		code = ICMPV6_PORT_UNREACH;
+		break;
+	default:
+		if (net_ratelimit())
+			pr_info("case %u not handled yet\n", reject->with);
+		return NF_DROP;
+	}
+
+	send_unreach(net, skb, code, par->hooknum);
+	return NF_DROP;
+}
+
+/*
+ * Validate a REJECT rule: ECHOREPLY is refused, and TCP_RESET is only
+ * allowed when the rule matches the TCP protocol without inversion.
+ * Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int reject_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_reject_info *rejinfo = par->targinfo;
+	const struct ip6t_entry *e = par->entryinfo;
+
+	switch (rejinfo->with) {
+	case IP6T_ICMP6_ECHOREPLY:
+		pr_info("ECHOREPLY is not supported.\n");
+		return -EINVAL;
+	case IP6T_TCP_RESET:
+		/* Must specify that it's a TCP packet */
+		if (e->ipv6.proto == IPPROTO_TCP &&
+		    !(e->ipv6.invflags & XT_INV_PROTO))
+			break;
+		pr_info("TCP_RESET illegal for non-tcp\n");
+		return -EINVAL;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Xtables registration record for the IPv6 "REJECT" target; restricted to
+ * the "filter" table and the LOCAL_IN, FORWARD and LOCAL_OUT hooks.
+ */
+static struct xt_target reject_tg6_reg __read_mostly = {
+	.name		= "REJECT",
+	.family		= NFPROTO_IPV6,
+	.target		= reject_tg6,
+	.targetsize	= sizeof(struct ip6t_reject_info),
+	.table		= "filter",
+	.hooks		= (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_LOCAL_OUT),
+	.checkentry	= reject_tg6_check,
+	.me		= THIS_MODULE
+};
+
+/* Module init: register the target. */
+static int __init reject_tg6_init(void)
+{
+	return xt_register_target(&reject_tg6_reg);
+}
+
+/* Module exit: unregister the target. */
+static void __exit reject_tg6_exit(void)
+{
+	xt_unregister_target(&reject_tg6_reg);
+}
+
+module_init(reject_tg6_init);
+module_exit(reject_tg6_exit);
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
new file mode 100644
index 00000000..89cccc5a
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -0,0 +1,121 @@
+/* Kernel module to match AH parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_ah.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/*
+ * Range test for an SPI value: true when min <= spi <= max, with the
+ * outcome flipped when 'invert' is set.
+ */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+	const bool in_range = (min <= spi) && (spi <= max);
+	bool verdict;
+
+	pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+		 invert ? '!' : ' ', min, spi, max);
+	verdict = (in_range != invert);
+	pr_debug(" result %s\n", verdict ? "PASS" : "FAILED");
+	return verdict;
+}
+
+/*
+ * Match on the IPv6 Authentication Header of @skb.
+ *
+ * Returns true when the SPI lies in the configured range, the header
+ * length equals the configured one (if any), and the reserved field is zero
+ * whenever the rule asks for that check.  A packet without an AH simply
+ * does not match; any other lookup failure, or a truncated AH, also sets
+ * par->hotdrop.
+ */
+static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_ah *ahinfo = par->matchinfo;
+	const struct ip_auth_hdr *ah;
+	struct ip_auth_hdr _ah;
+	unsigned int hdrlen;
+	unsigned int ptr;
+	int err;
+
+	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL);
+	if (err < 0) {
+		if (err != -ENOENT)
+			par->hotdrop = true;
+		return false;
+	}
+
+	ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah);
+	if (ah == NULL) {
+		par->hotdrop = true;
+		return false;
+	}
+
+	/* AH length is expressed in 4-byte units, minus 2. */
+	hdrlen = (ah->hdrlen + 2) << 2;
+
+	pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
+	pr_debug("RES %04X ", ah->reserved);
+	pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi));
+
+	pr_debug("IPv6 AH spi %02X ",
+		 spi_match(ahinfo->spis[0], ahinfo->spis[1],
+			   ntohl(ah->spi),
+			   !!(ahinfo->invflags & IP6T_AH_INV_SPI)));
+	pr_debug("len %02X %04X %02X ",
+		 ahinfo->hdrlen, hdrlen,
+		 (!ahinfo->hdrlen ||
+		  (ahinfo->hdrlen == hdrlen) ^
+		  !!(ahinfo->invflags & IP6T_AH_INV_LEN)));
+	pr_debug("res %02X %04X %02X\n",
+		 ahinfo->hdrres, ah->reserved,
+		 !(ahinfo->hdrres && ah->reserved));
+
+	/* SPI range (optionally inverted). */
+	if (!spi_match(ahinfo->spis[0], ahinfo->spis[1], ntohl(ah->spi),
+		       !!(ahinfo->invflags & IP6T_AH_INV_SPI)))
+		return false;
+
+	/* Header length (optionally inverted); a zero hdrlen skips the test. */
+	if (ahinfo->hdrlen &&
+	    !((ahinfo->hdrlen == hdrlen) ^
+	      !!(ahinfo->invflags & IP6T_AH_INV_LEN)))
+		return false;
+
+	/* With hdrres set, the reserved field has to be zero. */
+	return !(ahinfo->hdrres && ah->reserved);
+}
+
+/*
+ * Validate an "ah" rule: reject inversion flags outside
+ * IP6T_AH_INV_MASK.  Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int ah_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_ah *ahinfo = par->matchinfo;
+
+	if (!(ahinfo->invflags & ~IP6T_AH_INV_MASK))
+		return 0;
+
+	pr_debug("unknown flags %X\n", ahinfo->invflags);
+	return -EINVAL;
+}
+
+/* Xtables registration record for the IPv6 "ah" match. */
+static struct xt_match ah_mt6_reg __read_mostly = {
+	.name		= "ah",
+	.family		= NFPROTO_IPV6,
+	.match		= ah_mt6,
+	.matchsize	= sizeof(struct ip6t_ah),
+	.checkentry	= ah_mt6_check,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the match. */
+static int __init ah_mt6_init(void)
+{
+	return xt_register_match(&ah_mt6_reg);
+}
+
+/* Module exit: unregister the match. */
+static void __exit ah_mt6_exit(void)
+{
+	xt_unregister_match(&ah_mt6_reg);
+}
+
+module_init(ah_mt6_init);
+module_exit(ah_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
new file mode 100644
index 00000000..aab07069
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -0,0 +1,74 @@
+/* Kernel module to match EUI64 address parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/if_ether.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_DESCRIPTION("Xtables: IPv6 EUI64 address match");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/*
+ * Match packets whose IPv6 source address has, as its low 64 bits, the
+ * modified EUI-64 identifier derived from the Ethernet source address.
+ *
+ * Returns true on a match.  Packets that do not carry a complete Ethernet
+ * header in front of skb->data are dropped (par->hotdrop), because the
+ * header is read below and must lie inside the buffer.
+ */
+static bool
+eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	unsigned char eui64[8];
+
+	/*
+	 * The MAC header must lie within [skb->head, skb->data) and be a
+	 * full ETH_HLEN long.  This has to hold for every packet, not only
+	 * for non-first fragments: eth_hdr() is dereferenced unconditionally
+	 * below.
+	 */
+	if (!(skb_mac_header(skb) >= skb->head &&
+	      skb_mac_header(skb) + ETH_HLEN <= skb->data)) {
+		par->hotdrop = true;
+		return false;
+	}
+
+	memset(eui64, 0, sizeof(eui64));
+
+	if (eth_hdr(skb)->h_proto == htons(ETH_P_IPV6)) {
+		if (ipv6_hdr(skb)->version == 0x6) {
+			/*
+			 * Build the identifier: the first three MAC bytes,
+			 * ff:fe, the last three MAC bytes, then flip bit
+			 * 0x02 of the first byte.
+			 */
+			memcpy(eui64, eth_hdr(skb)->h_source, 3);
+			memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3);
+			eui64[3] = 0xff;
+			eui64[4] = 0xfe;
+			eui64[0] ^= 0x02;
+
+			/* Compare against the low 8 bytes of the source. */
+			if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64,
+				    sizeof(eui64)))
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Xtables registration record for the IPv6 "eui64" match; usable only on
+ * the PRE_ROUTING, LOCAL_IN and FORWARD hooks.
+ */
+static struct xt_match eui64_mt6_reg __read_mostly = {
+	.name		= "eui64",
+	.family		= NFPROTO_IPV6,
+	.match		= eui64_mt6,
+	.matchsize	= sizeof(int),
+	.hooks		= (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_FORWARD),
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the match. */
+static int __init eui64_mt6_init(void)
+{
+	return xt_register_match(&eui64_mt6_reg);
+}
+
+/* Module exit: unregister the match. */
+static void __exit eui64_mt6_exit(void)
+{
+	xt_unregister_match(&eui64_mt6_reg);
+}
+
+module_init(eui64_mt6_init);
+module_exit(eui64_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
new file mode 100644
index 00000000..eda898fd
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -0,0 +1,136 @@
+/* Kernel module to match FRAG parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_frag.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 fragment match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/*
+ * Range test for a fragment identification value: true when
+ * min <= id <= max, with the outcome flipped when 'invert' is set.
+ */
+static inline bool
+id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
+{
+	const bool in_range = (min <= id) && (id <= max);
+	bool verdict;
+
+	pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ',
+		 min, id, max);
+	verdict = (in_range != invert);
+	pr_debug(" result %s\n", verdict ? "PASS" : "FAILED");
+	return verdict;
+}
+
+/*
+ * Match on the IPv6 Fragment header of @skb.
+ *
+ * Returns true when the identification lies in the configured range and
+ * every check requested through fraginfo->flags passes (reserved bits
+ * clear, first fragment, more-fragments set, more-fragments clear).  A
+ * packet without a Fragment header simply does not match; any other lookup
+ * failure, or a truncated header, also sets par->hotdrop.
+ */
+static bool
+frag_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_frag *fraginfo = par->matchinfo;
+	const struct frag_hdr *fh;
+	struct frag_hdr _frag;
+	unsigned int off_flags;
+	unsigned int ptr;
+	int err;
+
+	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL);
+	if (err < 0) {
+		if (err != -ENOENT)
+			par->hotdrop = true;
+		return false;
+	}
+
+	fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag);
+	if (fh == NULL) {
+		par->hotdrop = true;
+		return false;
+	}
+
+	pr_debug("INFO %04X ", fh->frag_off);
+	pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7);
+	pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6);
+	pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF));
+	pr_debug("ID %u %08X\n", ntohl(fh->identification),
+		 ntohl(fh->identification));
+
+	pr_debug("IPv6 FRAG id %02X ",
+		 id_match(fraginfo->ids[0], fraginfo->ids[1],
+			  ntohl(fh->identification),
+			  !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)));
+	pr_debug("res %02X %02X%04X %02X ",
+		 fraginfo->flags & IP6T_FRAG_RES, fh->reserved,
+		 ntohs(fh->frag_off) & 0x6,
+		 !((fraginfo->flags & IP6T_FRAG_RES) &&
+		   (fh->reserved || (ntohs(fh->frag_off) & 0x06))));
+	pr_debug("first %02X %02X %02X ",
+		 fraginfo->flags & IP6T_FRAG_FST,
+		 ntohs(fh->frag_off) & ~0x7,
+		 !((fraginfo->flags & IP6T_FRAG_FST) &&
+		   (ntohs(fh->frag_off) & ~0x7)));
+	pr_debug("mf %02X %02X %02X ",
+		 fraginfo->flags & IP6T_FRAG_MF,
+		 ntohs(fh->frag_off) & IP6_MF,
+		 !((fraginfo->flags & IP6T_FRAG_MF) &&
+		   !((ntohs(fh->frag_off) & IP6_MF))));
+	pr_debug("last %02X %02X %02X\n",
+		 fraginfo->flags & IP6T_FRAG_NMF,
+		 ntohs(fh->frag_off) & IP6_MF,
+		 !((fraginfo->flags & IP6T_FRAG_NMF) &&
+		   (ntohs(fh->frag_off) & IP6_MF)));
+
+	/* Identification range (optionally inverted). */
+	if (!id_match(fraginfo->ids[0], fraginfo->ids[1],
+		      ntohl(fh->identification),
+		      !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)))
+		return false;
+
+	/* Host-order copy of the combined offset/flags field. */
+	off_flags = ntohs(fh->frag_off);
+
+	/* RES: the reserved byte and the two reserved bits must be zero. */
+	if ((fraginfo->flags & IP6T_FRAG_RES) &&
+	    (fh->reserved || (off_flags & 0x6)))
+		return false;
+
+	/* FST: only the first fragment (offset zero) matches. */
+	if ((fraginfo->flags & IP6T_FRAG_FST) && (off_flags & ~0x7))
+		return false;
+
+	/* MF: the more-fragments bit must be set. */
+	if ((fraginfo->flags & IP6T_FRAG_MF) && !(off_flags & IP6_MF))
+		return false;
+
+	/* NMF: the more-fragments bit must be clear. */
+	if ((fraginfo->flags & IP6T_FRAG_NMF) && (off_flags & IP6_MF))
+		return false;
+
+	return true;
+}
+
+/*
+ * Validate a "frag" rule: reject inversion flags outside
+ * IP6T_FRAG_INV_MASK.  Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int frag_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_frag *fraginfo = par->matchinfo;
+
+	if (!(fraginfo->invflags & ~IP6T_FRAG_INV_MASK))
+		return 0;
+
+	pr_debug("unknown flags %X\n", fraginfo->invflags);
+	return -EINVAL;
+}
+
+/* Xtables registration record for the IPv6 "frag" match. */
+static struct xt_match frag_mt6_reg __read_mostly = {
+	.name		= "frag",
+	.family		= NFPROTO_IPV6,
+	.match		= frag_mt6,
+	.matchsize	= sizeof(struct ip6t_frag),
+	.checkentry	= frag_mt6_check,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the match. */
+static int __init frag_mt6_init(void)
+{
+	return xt_register_match(&frag_mt6_reg);
+}
+
+/* Module exit: unregister the match. */
+static void __exit frag_mt6_exit(void)
+{
+	xt_unregister_match(&frag_mt6_reg);
+}
+
+module_init(frag_mt6_init);
+module_exit(frag_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
new file mode 100644
index 00000000..59df051e
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -0,0 +1,215 @@
+/* Kernel module to match Hop-by-Hop and Destination parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <asm/byteorder.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_opts.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+MODULE_ALIAS("ip6t_dst");
+
+/*
+ *  (Type & 0xC0) >> 6
+ *	0	-> ignorable
+ *	1	-> must drop the packet
+ *	2	-> send ICMP PARM PROB regardless and drop packet
+ *	3	-> Send ICMP if not a multicast address and drop packet
+ *  (Type & 0x20) >> 5
+ *	0	-> invariant
+ *	1	-> can change the routing
+ *  (Type & 0x1F) Type
+ *	0	-> Pad1 (only 1 byte!)
+ *	1	-> PadN LENGTH info (total length = length + 2)
+ *	C0 | 2	-> JUMBO 4 x x x x ( xxxx > 64k )
+ *	5	-> RTALERT 2 x x
+ */
+
+static struct xt_match hbh_mt6_reg[] __read_mostly;
+
+/*
+ * Shared match callback for the "hbh" and "dst" matches.
+ *
+ * Finds the options header selected by the registration entry in use,
+ * evaluates the optional header-length test (IP6T_OPTS_LEN, invertible via
+ * IP6T_OPTS_INV_LEN) and, when IP6T_OPTS_OPTS is set, compares the first
+ * optsnr options of the header, in order, against optinfo->opts[].  Each
+ * opts[] entry holds the option type in its high byte and the expected
+ * length in its low byte, where 0xFF accepts any length.
+ *
+ * par->hotdrop is set when the header lookup fails with anything other
+ * than -ENOENT or when the header itself cannot be read.
+ */
+static bool
+hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_opts *optinfo = par->matchinfo;
+	const struct ipv6_opt_hdr *oh;
+	struct ipv6_opt_hdr _optsh;
+	unsigned int off;
+	unsigned int hdrlen;
+	unsigned int left;
+	unsigned int idx;
+	bool len_ok;
+	int err;
+
+	/* hbh_mt6_reg[0] is the "hbh" match, the other entry is "dst". */
+	if (par->match == &hbh_mt6_reg[0])
+		err = ipv6_find_hdr(skb, &off, NEXTHDR_HOP, NULL);
+	else
+		err = ipv6_find_hdr(skb, &off, NEXTHDR_DEST, NULL);
+	if (err < 0) {
+		if (err != -ENOENT)
+			par->hotdrop = true;
+		return false;
+	}
+
+	oh = skb_header_pointer(skb, off, sizeof(_optsh), &_optsh);
+	if (!oh) {
+		par->hotdrop = true;
+		return false;
+	}
+
+	hdrlen = ipv6_optlen(oh);
+	/* Packet smaller than its length field */
+	if (skb->len - off < hdrlen)
+		return false;
+
+	len_ok = !(optinfo->flags & IP6T_OPTS_LEN) ||
+		 ((optinfo->hdrlen == hdrlen) !=
+		  !!(optinfo->invflags & IP6T_OPTS_INV_LEN));
+
+	pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen);
+
+	pr_debug("len %02X %04X %02X ",
+		 optinfo->hdrlen, hdrlen, len_ok);
+
+	/* Step over the two leading header bytes to reach the first option. */
+	off += 2;
+	left = hdrlen - 2;
+
+	if (!(optinfo->flags & IP6T_OPTS_OPTS))
+		return len_ok;
+
+	pr_debug("Strict ");
+	pr_debug("#%d ", optinfo->optsnr);
+	for (idx = 0; idx < optinfo->optsnr; idx++) {
+		const u_int8_t *tp;
+		unsigned int optlen;
+		u8 _opttype;
+
+		/* type field exists ? */
+		if (left < 1)
+			break;
+		tp = skb_header_pointer(skb, off, sizeof(_opttype),
+					&_opttype);
+		if (!tp)
+			break;
+
+		/* Type check */
+		if (*tp != (optinfo->opts[idx] & 0xFF00) >> 8) {
+			pr_debug("Tbad %02X %02X\n", *tp,
+				 (optinfo->opts[idx] & 0xFF00) >> 8);
+			return false;
+		}
+		pr_debug("Tok ");
+
+		/* Length check; option type 0 is a single byte with no length. */
+		if (*tp == 0) {
+			pr_debug("Pad1\n");
+			optlen = 1;
+		} else {
+			const u_int8_t *lp;
+			u16 spec_len;
+			u8 _optlen;
+
+			/* length field exists ? */
+			if (left < 2)
+				break;
+			lp = skb_header_pointer(skb, off + 1,
+						sizeof(_optlen), &_optlen);
+			if (!lp)
+				break;
+			spec_len = optinfo->opts[idx] & 0x00FF;
+
+			if (spec_len != 0x00FF && spec_len != *lp) {
+				pr_debug("Lbad %02X %04X\n", *lp,
+					 spec_len);
+				return false;
+			}
+			pr_debug("Lok ");
+			optlen = *lp + 2;
+		}
+
+		/* Step to the next */
+		pr_debug("len%04X\n", optlen);
+
+		/* An overrun only matters while further options remain. */
+		if ((off > skb->len - optlen || left < optlen) &&
+		    idx < optinfo->optsnr - 1) {
+			pr_debug("new pointer is too large!\n");
+			break;
+		}
+		off += optlen;
+		left -= optlen;
+	}
+
+	/* Leaving the loop early means some requested option was not seen. */
+	return (idx == optinfo->optsnr) ? len_ok : false;
+}
+
+/*
+ * checkentry callback for the "hbh" and "dst" matches.
+ *
+ * Returns 0 when the rule can be installed, -EINVAL when it
+ *  - carries inversion bits outside IP6T_OPTS_INV_MASK,
+ *  - asks for the unimplemented IP6T_OPTS_NSTRICT mode, or
+ *  - requests option matching with more entries than opts[] can hold.
+ */
+static int hbh_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_opts *optsinfo = par->matchinfo;
+
+	if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) {
+		pr_debug("unknown flags %X\n", optsinfo->invflags);
+		return -EINVAL;
+	}
+
+	if (optsinfo->flags & IP6T_OPTS_NSTRICT) {
+		pr_debug("Not strict - not implemented\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * hbh_mt6() reads opts[0..optsnr-1] when IP6T_OPTS_OPTS is set, so
+	 * a larger count would make it index past the end of the array.
+	 */
+	if ((optsinfo->flags & IP6T_OPTS_OPTS) &&
+	    optsinfo->optsnr > ARRAY_SIZE(optsinfo->opts)) {
+		pr_debug("too many options %u\n", optsinfo->optsnr);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Registration records for the "hbh" and "dst" matches.  Both use the same
+ * callbacks; hbh_mt6() tells them apart by comparing par->match with the
+ * address of element 0.
+ */
+static struct xt_match hbh_mt6_reg[] __read_mostly = {
+	{
+		/* Note, hbh_mt6 relies on the order of hbh_mt6_reg */
+		.name		= "hbh",
+		.family		= NFPROTO_IPV6,
+		.match		= hbh_mt6,
+		.matchsize	= sizeof(struct ip6t_opts),
+		.checkentry	= hbh_mt6_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "dst",
+		.family		= NFPROTO_IPV6,
+		.match		= hbh_mt6,
+		.matchsize	= sizeof(struct ip6t_opts),
+		.checkentry	= hbh_mt6_check,
+		.me		= THIS_MODULE,
+	},
+};
+
+/* Module load: register both matches in one call. */
+static int __init hbh_mt6_init(void)
+{
+	return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg));
+}
+
+/* Module unload: unregister both matches. */
+static void __exit hbh_mt6_exit(void)
+{
+	xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg));
+}
+
+module_init(hbh_mt6_init);
+module_exit(hbh_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
new file mode 100644
index 00000000..54bd9790
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -0,0 +1,154 @@
+/* ipv6header match - matches IPv6 packets based
+   on whether they contain certain headers */
+
+/* Original idea: Brad Chapman
+ * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_ipv6header.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 header types match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/*
+ * Match callback for "ipv6header": walk the extension headers that follow
+ * the fixed IPv6 header, collect one MASK_* bit per header type seen, and
+ * compare the collected mask with the rule.
+ *
+ * With info->modeflag set, only the bits selected by info->matchflags are
+ * compared, and info->invflags flips the expected value of individual
+ * bits.  Without it the mask must equal info->matchflags exactly, or must
+ * differ from it when info->invflags is non-zero.
+ *
+ * Returns false without evaluating the rule when fewer bytes remain than
+ * an option header needs, when the header cannot be read, or when the
+ * chain contains an extension header type the switch below does not know.
+ */
+static bool
+ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_ipv6header_info *info = par->matchinfo;
+	unsigned int temp;
+	int len;
+	u8 nexthdr;
+	unsigned int ptr;
+
+	/* Make sure this isn't an evil packet */
+
+	/* type of the 1st exthdr */
+	nexthdr = ipv6_hdr(skb)->nexthdr;
+	/* pointer to the 1st exthdr */
+	ptr = sizeof(struct ipv6hdr);
+	/* available length */
+	len = skb->len - ptr;
+	temp = 0;
+
+	while (ip6t_ext_hdr(nexthdr)) {
+		const struct ipv6_opt_hdr *hp;
+		struct ipv6_opt_hdr _hdr;
+		int hdrlen;
+
+		/* No more exthdr -> evaluate */
+		if (nexthdr == NEXTHDR_NONE) {
+			temp |= MASK_NONE;
+			break;
+		}
+		/* Is there enough space for the next ext header? */
+		if (len < (int)sizeof(struct ipv6_opt_hdr))
+			return false;
+		/* ESP -> evaluate */
+		if (nexthdr == NEXTHDR_ESP) {
+			temp |= MASK_ESP;
+			break;
+		}
+
+		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+		/*
+		 * A header that cannot be read is a property of the packet,
+		 * not a kernel bug: report a non-match instead of BUG().
+		 */
+		if (hp == NULL)
+			return false;
+
+		/* Calculate the header length */
+		if (nexthdr == NEXTHDR_FRAGMENT)
+			hdrlen = 8;
+		else if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hp->hdrlen + 2) << 2;
+		else
+			hdrlen = ipv6_optlen(hp);
+
+		/* set the flag */
+		switch (nexthdr) {
+		case NEXTHDR_HOP:
+			temp |= MASK_HOPOPTS;
+			break;
+		case NEXTHDR_ROUTING:
+			temp |= MASK_ROUTING;
+			break;
+		case NEXTHDR_FRAGMENT:
+			temp |= MASK_FRAGMENT;
+			break;
+		case NEXTHDR_AUTH:
+			temp |= MASK_AH;
+			break;
+		case NEXTHDR_DEST:
+			temp |= MASK_DSTOPTS;
+			break;
+		default:
+			return false;
+		}
+
+		nexthdr = hp->nexthdr;
+		len -= hdrlen;
+		ptr += hdrlen;
+		/* Header claims to extend past the packet: stop and evaluate. */
+		if (ptr > skb->len)
+			break;
+	}
+
+	/* Anything other than "no next header" or ESP counts as a protocol. */
+	if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP)
+		temp |= MASK_PROTO;
+
+	if (info->modeflag)
+		return !((temp ^ info->matchflags ^ info->invflags)
+			 & info->matchflags);
+	else {
+		if (info->invflags)
+			return temp != info->matchflags;
+		else
+			return temp == info->matchflags;
+	}
+}
+
+/*
+ * checkentry callback for "ipv6header".  With modeflag set any invflags
+ * value is accepted; with modeflag clear ("hard mode") invflags has to be
+ * either 0x00 or 0xFF, anything else yields -EINVAL.
+ */
+static int ipv6header_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_ipv6header_info *info = par->matchinfo;
+
+	if (info->modeflag)
+		return 0;
+
+	/* invflags is 0 or 0xff in hard mode */
+	if (info->invflags == 0x00 || info->invflags == 0xFF)
+		return 0;
+
+	return -EINVAL;
+}
+
+/* Registration record for the IPv6 "ipv6header" match; no destroy hook. */
+static struct xt_match ipv6header_mt6_reg __read_mostly = {
+	.name		= "ipv6header",
+	.family		= NFPROTO_IPV6,
+	.match		= ipv6header_mt6,
+	.matchsize	= sizeof(struct ip6t_ipv6header_info),
+	.checkentry	= ipv6header_mt6_check,
+	.destroy	= NULL,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match and hand back xt_register_match()'s result. */
+static int __init ipv6header_mt6_init(void)
+{
+	return xt_register_match(&ipv6header_mt6_reg);
+}
+
+/* Module unload: undo the registration made by ipv6header_mt6_init(). */
+static void __exit ipv6header_mt6_exit(void)
+{
+	xt_unregister_match(&ipv6header_mt6_reg);
+}
+
+module_init(ipv6header_mt6_init);
+module_exit(ipv6header_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
new file mode 100644
index 00000000..0c90c66b
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C)2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ *	Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com>
+ *
+ * Based on net/netfilter/xt_tcpudp.c
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/mip6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6t_mh.h>
+
+MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match");
+MODULE_LICENSE("GPL");
+
+/*
+ * Range test with optional inversion: true when type lies in [min, max]
+ * and invert is false, or when it lies outside and invert is true.
+ */
+static inline bool
+type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert)
+{
+	const bool in_range = (min <= type) && (type <= max);
+
+	return in_range != invert;
+}
+
+/*
+ * Match callback for "mh": compares the ip6mh_type field of the header
+ * found at par->thoff with the [types[0], types[1]] range of the rule,
+ * inverted when IP6T_MH_INV_TYPE is set.
+ *
+ * The packet is hot-dropped when the header cannot be read or when its
+ * ip6mh_proto field is not IPPROTO_NONE.
+ */
+static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_mh *mhinfo = par->matchinfo;
+	const struct ip6_mh *mh;
+	struct ip6_mh _mh;
+	bool invert;
+
+	/* A packet with a non-zero fragment offset never matches. */
+	if (par->fragoff)
+		return false;
+
+	mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh);
+	if (!mh) {
+		/* Asked to examine the packet but unable to: drop it. */
+		pr_debug("Dropping evil MH tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	if (mh->ip6mh_proto != IPPROTO_NONE) {
+		pr_debug("Dropping invalid MH Payload Proto: %u\n",
+			 mh->ip6mh_proto);
+		par->hotdrop = true;
+		return false;
+	}
+
+	invert = !!(mhinfo->invflags & IP6T_MH_INV_TYPE);
+	return type_match(mhinfo->types[0], mhinfo->types[1],
+			  mh->ip6mh_type, invert);
+}
+
+/*
+ * checkentry callback for "mh": -EINVAL when invflags has a bit outside
+ * IP6T_MH_INV_MASK, 0 otherwise.
+ */
+static int mh_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_mh *mhinfo = par->matchinfo;
+
+	/* Must specify no unknown invflags */
+	if (mhinfo->invflags & ~IP6T_MH_INV_MASK)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Registration record for the IPv6 "mh" match, bound to IPPROTO_MH. */
+static struct xt_match mh_mt6_reg __read_mostly = {
+	.name		= "mh",
+	.family		= NFPROTO_IPV6,
+	.checkentry	= mh_mt6_check,
+	.match		= mh_mt6,
+	.matchsize	= sizeof(struct ip6t_mh),
+	.proto		= IPPROTO_MH,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match and hand back xt_register_match()'s result. */
+static int __init mh_mt6_init(void)
+{
+	return xt_register_match(&mh_mt6_reg);
+}
+
+/* Module unload: undo the registration made by mh_mt6_init(). */
+static void __exit mh_mt6_exit(void)
+{
+	xt_unregister_match(&mh_mt6_reg);
+}
+
+module_init(mh_mt6_init);
+module_exit(mh_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
new file mode 100644
index 00000000..d8488c50
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -0,0 +1,225 @@
+/* Kernel module to match ROUTING parameters. */
+
+/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ipv6.h>
+
+#include <asm/byteorder.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_rt.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match");
+MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
+
+/*
+ * Range test with optional inversion for the segments-left field: true
+ * when id lies in [min, max] and invert is false, or when it lies outside
+ * and invert is true.  Both the inputs and the outcome are logged through
+ * pr_debug().
+ */
+static inline bool
+segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
+{
+	const bool in_range = (min <= id) && (id <= max);
+	bool verdict;
+
+	pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n",
+		 invert ? '!' : ' ', min, id, max);
+	verdict = (in_range != invert);
+	pr_debug(" result %s\n", verdict ? "PASS" : "FAILED");
+
+	return verdict;
+}
+
+/*
+ * Match callback for the "rt" match.
+ *
+ * Locates the routing header and checks, in this order:
+ *  - segments-left against the rule's range (IP6T_RT_INV_SGS inverts),
+ *  - the header length when IP6T_RT_LEN is set (IP6T_RT_INV_LEN inverts),
+ *  - the routing type when IP6T_RT_TYP is set (IP6T_RT_INV_TYP inverts),
+ *  - that the reserved word is zero when IP6T_RT_RES is set,
+ *  - the address list when IP6T_RT_FST is set: with IP6T_RT_FST_NSTRICT
+ *    the rule's addresses must occur in order somewhere in the header,
+ *    otherwise the header must hold exactly the rule's addresses.
+ *
+ * par->hotdrop is set whenever a part of the header that the length check
+ * promised to be present cannot be read from the skb.
+ */
+static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ipv6_rt_hdr _route;
+	const struct ipv6_rt_hdr *rh;
+	const struct ip6t_rt *rtinfo = par->matchinfo;
+	unsigned int temp;
+	unsigned int ptr;
+	unsigned int hdrlen = 0;
+	unsigned int naddrs;
+	bool ret = false;
+	struct in6_addr _addr;
+	const struct in6_addr *ap;
+	int err;
+
+	err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL);
+	if (err < 0) {
+		if (err != -ENOENT)
+			par->hotdrop = true;
+		return false;
+	}
+
+	rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route);
+	if (rh == NULL) {
+		par->hotdrop = true;
+		return false;
+	}
+
+	hdrlen = ipv6_optlen(rh);
+	if (skb->len - ptr < hdrlen) {
+		/* Packet smaller than its length field */
+		return false;
+	}
+
+	/* Address slots in the header: 16 bytes each after the first 8. */
+	naddrs = (hdrlen - 8) / 16;
+
+	pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
+	pr_debug("TYPE %04X ", rh->type);
+	pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
+
+	pr_debug("IPv6 RT segsleft %02X ",
+		 segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+				rh->segments_left,
+				!!(rtinfo->invflags & IP6T_RT_INV_SGS)));
+	pr_debug("type %02X %02X %02X ",
+		 rtinfo->rt_type, rh->type,
+		 (!(rtinfo->flags & IP6T_RT_TYP) ||
+		  ((rtinfo->rt_type == rh->type) ^
+		   !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
+	pr_debug("len %02X %04X %02X ",
+		 rtinfo->hdrlen, hdrlen,
+		 !(rtinfo->flags & IP6T_RT_LEN) ||
+		  ((rtinfo->hdrlen == hdrlen) ^
+		   !!(rtinfo->invflags & IP6T_RT_INV_LEN)));
+
+	ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+			      rh->segments_left,
+			      !!(rtinfo->invflags & IP6T_RT_INV_SGS))) &&
+	      (!(rtinfo->flags & IP6T_RT_LEN) ||
+	       ((rtinfo->hdrlen == hdrlen) ^
+		!!(rtinfo->invflags & IP6T_RT_INV_LEN))) &&
+	      (!(rtinfo->flags & IP6T_RT_TYP) ||
+	       ((rtinfo->rt_type == rh->type) ^
+		!!(rtinfo->invflags & IP6T_RT_INV_TYP)));
+
+	if (ret && (rtinfo->flags & IP6T_RT_RES)) {
+		const u_int32_t *rp;
+		u_int32_t _reserved;
+		rp = skb_header_pointer(skb,
+					ptr + offsetof(struct rt0_hdr,
+						       reserved),
+					sizeof(_reserved),
+					&_reserved);
+		if (rp == NULL) {
+			par->hotdrop = true;
+			return false;
+		}
+
+		/*
+		 * The reserved word is logged from rp rather than through
+		 * rh: rh may point at _route, which only holds a struct
+		 * ipv6_rt_hdr and does not reach the reserved field.
+		 */
+		pr_debug("res %02X %02X %02X ",
+			 rtinfo->flags & IP6T_RT_RES, *rp, *rp == 0);
+
+		ret = (*rp == 0);
+	}
+
+	pr_debug("#%d ", rtinfo->addrnr);
+	if (!(rtinfo->flags & IP6T_RT_FST)) {
+		return ret;
+	} else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) {
+		pr_debug("Not strict ");
+		if (rtinfo->addrnr > naddrs) {
+			pr_debug("There isn't enough space\n");
+			return false;
+		} else {
+			unsigned int i = 0;
+
+			pr_debug("#%d ", rtinfo->addrnr);
+			/*
+			 * Stop as soon as every rule address was found, so
+			 * addrs[i] is never read with i == addrnr (this also
+			 * covers addrnr == 0).
+			 */
+			for (temp = 0;
+			     temp < naddrs && i < rtinfo->addrnr;
+			     temp++) {
+				ap = skb_header_pointer(skb,
+							ptr
+							+ sizeof(struct rt0_hdr)
+							+ temp * sizeof(_addr),
+							sizeof(_addr),
+							&_addr);
+				if (ap == NULL) {
+					par->hotdrop = true;
+					return false;
+				}
+
+				if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
+					pr_debug("i=%d temp=%d;\n", i, temp);
+					i++;
+				}
+			}
+			pr_debug("i=%d #%d\n", i, rtinfo->addrnr);
+			if (i == rtinfo->addrnr)
+				return ret;
+			else
+				return false;
+		}
+	} else {
+		pr_debug("Strict ");
+		if (rtinfo->addrnr > naddrs) {
+			pr_debug("There isn't enough space\n");
+			return false;
+		} else {
+			pr_debug("#%d ", rtinfo->addrnr);
+			for (temp = 0; temp < rtinfo->addrnr; temp++) {
+				ap = skb_header_pointer(skb,
+							ptr
+							+ sizeof(struct rt0_hdr)
+							+ temp * sizeof(_addr),
+							sizeof(_addr),
+							&_addr);
+				if (ap == NULL) {
+					par->hotdrop = true;
+					return false;
+				}
+
+				if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
+					break;
+			}
+			pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr);
+			/* All rule addresses matched and none are left over. */
+			if (temp == rtinfo->addrnr && temp == naddrs)
+				return ret;
+			else
+				return false;
+		}
+	}
+}
+
+/*
+ * checkentry callback for the "rt" match.
+ *
+ * Returns 0 when the rule can be installed, -EINVAL when it
+ *  - carries inversion bits outside IP6T_RT_INV_MASK,
+ *  - uses IP6T_RT_RES or any IP6T_RT_FST_MASK option without a
+ *    non-inverted match on routing type 0, or
+ *  - requests address matching with more entries than addrs[] can hold.
+ */
+static int rt_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_rt *rtinfo = par->matchinfo;
+
+	if (rtinfo->invflags & ~IP6T_RT_INV_MASK) {
+		pr_debug("unknown flags %X\n", rtinfo->invflags);
+		return -EINVAL;
+	}
+	if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) &&
+	    (!(rtinfo->flags & IP6T_RT_TYP) ||
+	     (rtinfo->rt_type != 0) ||
+	     (rtinfo->invflags & IP6T_RT_INV_TYP))) {
+		pr_debug("`--rt-type 0' required before `--rt-0-*'\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * rt_mt6() indexes addrs[] with values below addrnr when IP6T_RT_FST
+	 * is set; a larger count would make it read past the array.
+	 */
+	if ((rtinfo->flags & IP6T_RT_FST) &&
+	    rtinfo->addrnr > ARRAY_SIZE(rtinfo->addrs)) {
+		pr_debug("too many addresses %u\n", rtinfo->addrnr);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Registration record for the IPv6 "rt" match. */
+static struct xt_match rt_mt6_reg __read_mostly = {
+	.name		= "rt",
+	.family		= NFPROTO_IPV6,
+	.match		= rt_mt6,
+	.matchsize	= sizeof(struct ip6t_rt),
+	.checkentry	= rt_mt6_check,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the match and hand back xt_register_match()'s result. */
+static int __init rt_mt6_init(void)
+{
+	return xt_register_match(&rt_mt6_reg);
+}
+
+/* Module unload: undo the registration made by rt_mt6_init(). */
+static void __exit rt_mt6_exit(void)
+{
+	xt_unregister_match(&rt_mt6_reg);
+}
+
+module_init(rt_mt6_init);
+module_exit(rt_mt6_exit);
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
new file mode 100644
index 00000000..c9e37c8f
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -0,0 +1,113 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables filter table");
+
+/* Bitmask of the hooks the filter table is valid on. */
+#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
+			    (1 << NF_INET_FORWARD) | \
+			    (1 << NF_INET_LOCAL_OUT))
+
+/*
+ * Description of the IPv6 "filter" table; handed to
+ * ip6t_alloc_initial_table(), ip6t_register_table() and xt_hook_link().
+ */
+static const struct xt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV6,
+	.priority	= NF_IP6_PRI_FILTER,
+};
+
+/*
+ * Netfilter hook entry point for the filter table.  The namespace is taken
+ * from the input device when there is one and from the output device
+ * otherwise; the packet is then run through that namespace's filter table.
+ */
+static unsigned int
+ip6table_filter_hook(unsigned int hook, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = in ? in : out;
+	const struct net *net = dev_net(dev);
+
+	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_filter);
+}
+
+/* Hook registrations returned by xt_hook_link(); released in the fini path. */
+static struct nf_hook_ops *filter_ops __read_mostly;
+
+/* Default to forward because I got too much mail already. */
+/*
+ * Module parameter: ip6table_filter_net_init() stores -forward - 1 as the
+ * verdict of the FORWARD hook's entry in the initial table.
+ */
+static int forward = NF_ACCEPT;
+module_param(forward, bool, 0000);
+
+/*
+ * Per-namespace setup: build the initial filter table, patch the FORWARD
+ * entry's verdict from the "forward" parameter, and register the table.
+ * Returns 0, -ENOMEM when the initial table cannot be allocated, or the
+ * error encoded in the pointer returned by ip6t_register_table().
+ */
+static int __net_init ip6table_filter_net_init(struct net *net)
+{
+	struct ip6t_replace *repl = ip6t_alloc_initial_table(&packet_filter);
+	struct ip6t_standard *entries;
+
+	if (!repl)
+		return -ENOMEM;
+
+	/* Entry 1 is the FORWARD hook */
+	entries = (struct ip6t_standard *)repl->entries;
+	entries[1].target.verdict = -forward - 1;
+
+	net->ipv6.ip6table_filter =
+		ip6t_register_table(net, &packet_filter, repl);
+	/* The replace blob is released whether registration worked or not. */
+	kfree(repl);
+
+	return IS_ERR(net->ipv6.ip6table_filter) ?
+	       PTR_ERR(net->ipv6.ip6table_filter) : 0;
+}
+
+/* Per-namespace teardown: unregister this namespace's filter table. */
+static void __net_exit ip6table_filter_net_exit(struct net *net)
+{
+	ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+}
+
+/* Per-namespace callbacks registered by ip6table_filter_init(). */
+static struct pernet_operations ip6table_filter_net_ops = {
+	.init = ip6table_filter_net_init,
+	.exit = ip6table_filter_net_exit,
+};
+
+/*
+ * Module entry point: validate the "forward" parameter, register the
+ * per-namespace operations and link the netfilter hooks.  When the hooks
+ * cannot be linked the pernet registration is rolled back and the error
+ * is returned.
+ */
+static int __init ip6table_filter_init(void)
+{
+	int err;
+
+	if (forward < 0 || forward > NF_MAX_VERDICT) {
+		pr_err("iptables forward must be 0 or 1\n");
+		return -EINVAL;
+	}
+
+	err = register_pernet_subsys(&ip6table_filter_net_ops);
+	if (err < 0)
+		return err;
+
+	/* Register hooks */
+	filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook);
+	if (!IS_ERR(filter_ops))
+		return err;
+
+	err = PTR_ERR(filter_ops);
+	unregister_pernet_subsys(&ip6table_filter_net_ops);
+	return err;
+}
+
+/* Module unload: unlink the hooks, then drop the per-namespace operations. */
+static void __exit ip6table_filter_fini(void)
+{
+	xt_hook_unlink(&packet_filter, filter_ops);
+	unregister_pernet_subsys(&ip6table_filter_net_ops);
+}
+
+module_init(ip6table_filter_init);
+module_exit(ip6table_filter_fini);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
new file mode 100644
index 00000000..00d19173
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -0,0 +1,145 @@
+/*
+ * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6
+ *
+ * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables mangle table");
+
+/* Bitmask of the hooks the mangle table is valid on. */
+#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
+			    (1 << NF_INET_LOCAL_IN) | \
+			    (1 << NF_INET_FORWARD) | \
+			    (1 << NF_INET_LOCAL_OUT) | \
+			    (1 << NF_INET_POST_ROUTING))
+
+/*
+ * Description of the IPv6 "mangle" table; handed to
+ * ip6t_alloc_initial_table(), ip6t_register_table() and xt_hook_link().
+ */
+static const struct xt_table packet_mangler = {
+	.name		= "mangle",
+	.valid_hooks	= MANGLE_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV6,
+	.priority	= NF_IP6_PRI_MANGLE,
+};
+
+/*
+ * LOCAL_OUT handling for the mangle table.  Source and destination
+ * address, skb mark, hop limit and the first 32-bit word of the header are
+ * remembered before the table runs; if the packet survives and any of them
+ * changed, ip6_route_me_harder() is called and its failure turns the
+ * verdict into NF_DROP.
+ */
+static unsigned int
+ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out)
+{
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct in6_addr old_src, old_dst;
+	u_int32_t old_word0, old_mark;
+	u_int8_t old_hlim;
+	unsigned int verdict;
+
+	/* save source/dest address, mark, hoplimit, flowlabel, priority */
+	memcpy(&old_src, &hdr->saddr, sizeof(old_src));
+	memcpy(&old_dst, &hdr->daddr, sizeof(old_dst));
+	old_mark = skb->mark;
+	old_hlim = hdr->hop_limit;
+	/* flowlabel and prio (includes version, which shouldn't change either) */
+	old_word0 = *((const u_int32_t *)hdr);
+
+	verdict = ip6t_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
+				dev_net(out)->ipv6.ip6table_mangle);
+
+	/* Do not look at the packet again once it is dropped or stolen. */
+	if (verdict == NF_DROP || verdict == NF_STOLEN)
+		return verdict;
+
+	hdr = ipv6_hdr(skb);
+	if (!memcmp(&hdr->saddr, &old_src, sizeof(old_src)) &&
+	    !memcmp(&hdr->daddr, &old_dst, sizeof(old_dst)) &&
+	    skb->mark == old_mark &&
+	    hdr->hop_limit == old_hlim &&
+	    old_word0 == *((const u_int32_t *)hdr))
+		return verdict;
+
+	return ip6_route_me_harder(skb) == 0 ? verdict : NF_DROP;
+}
+
+/*
+ * Netfilter hook entry point for the mangle table.  LOCAL_OUT goes through
+ * ip6t_mangle_out(); POST_ROUTING looks the table up via the output
+ * device, every other hook via the input device.
+ */
+static unsigned int
+ip6table_mangle_hook(unsigned int hook, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	switch (hook) {
+	case NF_INET_LOCAL_OUT:
+		return ip6t_mangle_out(skb, out);
+	case NF_INET_POST_ROUTING:
+		return ip6t_do_table(skb, hook, in, out,
+				     dev_net(out)->ipv6.ip6table_mangle);
+	default:
+		/* INPUT/FORWARD */
+		return ip6t_do_table(skb, hook, in, out,
+				     dev_net(in)->ipv6.ip6table_mangle);
+	}
+}
+
+/* Hook registrations returned by xt_hook_link(). */
+static struct nf_hook_ops *mangle_ops __read_mostly;
+
+/*
+ * Per-namespace setup: build the initial mangle table and register it.
+ * Returns 0, -ENOMEM when the initial table cannot be allocated, or the
+ * error encoded in the pointer returned by ip6t_register_table().
+ */
+static int __net_init ip6table_mangle_net_init(struct net *net)
+{
+	struct ip6t_replace *repl = ip6t_alloc_initial_table(&packet_mangler);
+
+	if (!repl)
+		return -ENOMEM;
+
+	net->ipv6.ip6table_mangle =
+		ip6t_register_table(net, &packet_mangler, repl);
+	/* The replace blob is released whether registration worked or not. */
+	kfree(repl);
+
+	return IS_ERR(net->ipv6.ip6table_mangle) ?
+	       PTR_ERR(net->ipv6.ip6table_mangle) : 0;
+}
+
+/* Per-namespace teardown: unregister this namespace's mangle table. */
+static void __net_exit ip6table_mangle_net_exit(struct net *net)
+{
+	ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+}
+
+/* Per-namespace callbacks registered by ip6table_mangle_init(). */
+static struct pernet_operations ip6table_mangle_net_ops = {
+	.init = ip6table_mangle_net_init,
+	.exit = ip6table_mangle_net_exit,
+};
+
+/*
+ * Module entry point: register the per-namespace operations and link the
+ * netfilter hooks.  When the hooks cannot be linked the pernet
+ * registration is rolled back and the error is returned.
+ */
+static int __init ip6table_mangle_init(void)
+{
+	int err = register_pernet_subsys(&ip6table_mangle_net_ops);
+
+	if (err < 0)
+		return err;
+
+	/* Register hooks */
+	mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook);
+	if (!IS_ERR(mangle_ops))
+		return err;
+
+	err = PTR_ERR(mangle_ops);
+	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+	return err;
+}
+
+/* Module unload: unlink the hooks, then drop the per-namespace operations. */
+static void __exit ip6table_mangle_fini(void)
+{
+	xt_hook_unlink(&packet_mangler, mangle_ops);
+	unregister_pernet_subsys(&ip6table_mangle_net_ops);
+}
+
+module_init(ip6table_mangle_init);
+module_exit(ip6table_mangle_fini);
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
new file mode 100644
index 00000000..5b9926a0
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -0,0 +1,88 @@
+/*
+ * IPv6 raw table, a port of the IPv4 raw table to IPv6
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+/* Bitmask of the hooks the raw table is valid on. */
+#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+
+/*
+ * Description of the IPv6 "raw" table; handed to
+ * ip6t_alloc_initial_table(), ip6t_register_table() and xt_hook_link().
+ */
+static const struct xt_table packet_raw = {
+	.name = "raw",
+	.valid_hooks = RAW_VALID_HOOKS,
+	.me = THIS_MODULE,
+	.af = NFPROTO_IPV6,
+	.priority = NF_IP6_PRI_RAW,
+};
+
+/*
+ * Netfilter hook entry point for the raw table.  The namespace is taken
+ * from the input device when there is one and from the output device
+ * otherwise; the packet is then run through that namespace's raw table.
+ */
+static unsigned int
+ip6table_raw_hook(unsigned int hook, struct sk_buff *skb,
+		  const struct net_device *in, const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = in ? in : out;
+	const struct net *net = dev_net(dev);
+
+	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_raw);
+}
+
+/* Hook registrations returned by xt_hook_link(). */
+static struct nf_hook_ops *rawtable_ops __read_mostly;
+
+/*
+ * Per-namespace setup: build the initial raw table and register it.
+ * Returns 0, -ENOMEM when the initial table cannot be allocated, or the
+ * error encoded in the pointer returned by ip6t_register_table().
+ */
+static int __net_init ip6table_raw_net_init(struct net *net)
+{
+	struct ip6t_replace *repl = ip6t_alloc_initial_table(&packet_raw);
+
+	if (!repl)
+		return -ENOMEM;
+
+	net->ipv6.ip6table_raw =
+		ip6t_register_table(net, &packet_raw, repl);
+	/* The replace blob is released whether registration worked or not. */
+	kfree(repl);
+
+	return IS_ERR(net->ipv6.ip6table_raw) ?
+	       PTR_ERR(net->ipv6.ip6table_raw) : 0;
+}
+
+/* Per-namespace teardown: unregister this namespace's raw table. */
+static void __net_exit ip6table_raw_net_exit(struct net *net)
+{
+	ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+}
+
+/* Per-namespace callbacks registered by ip6table_raw_init(). */
+static struct pernet_operations ip6table_raw_net_ops = {
+	.init = ip6table_raw_net_init,
+	.exit = ip6table_raw_net_exit,
+};
+
+/*
+ * Module entry point: register the per-namespace operations and link the
+ * netfilter hooks.  When the hooks cannot be linked the pernet
+ * registration is rolled back and the error is returned.
+ */
+static int __init ip6table_raw_init(void)
+{
+	int err = register_pernet_subsys(&ip6table_raw_net_ops);
+
+	if (err < 0)
+		return err;
+
+	/* Register hooks */
+	rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook);
+	if (!IS_ERR(rawtable_ops))
+		return err;
+
+	err = PTR_ERR(rawtable_ops);
+	unregister_pernet_subsys(&ip6table_raw_net_ops);
+	return err;
+}
+
+/* Module unload: unlink the hooks, then drop the per-namespace operations. */
+static void __exit ip6table_raw_fini(void)
+{
+	xt_hook_unlink(&packet_raw, rawtable_ops);
+	unregister_pernet_subsys(&ip6table_raw_net_ops);
+}
+
+module_init(ip6table_raw_init);
+module_exit(ip6table_raw_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
new file mode 100644
index 00000000..91aa2b4d
--- /dev/null
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -0,0 +1,105 @@
+/*
+ * "security" table for IPv6
+ *
+ * This is for use by Mandatory Access Control (MAC) security models,
+ * which need to be able to manage security policy in separate context
+ * to DAC.
+ *
+ * Based on iptable_mangle.c
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
+ * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
+MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
+
+/*
+ * Bitmask of the hooks the security table is valid on.  The replacement
+ * list is wrapped in parentheses so the macro always expands as a single
+ * expression, whatever operators surround it at the point of use.
+ */
+#define SECURITY_VALID_HOOKS	((1 << NF_INET_LOCAL_IN) | \
+				 (1 << NF_INET_FORWARD) | \
+				 (1 << NF_INET_LOCAL_OUT))
+
+/*
+ * Description of the IPv6 "security" table; handed to
+ * ip6t_alloc_initial_table(), ip6t_register_table() and xt_hook_link().
+ */
+static const struct xt_table security_table = {
+	.name		= "security",
+	.valid_hooks	= SECURITY_VALID_HOOKS,
+	.me		= THIS_MODULE,
+	.af		= NFPROTO_IPV6,
+	.priority	= NF_IP6_PRI_SECURITY,
+};
+
+/*
+ * Netfilter hook entry point for the security table.  The namespace is
+ * taken from the input device when there is one and from the output device
+ * otherwise; the packet is then run through that namespace's security
+ * table.
+ */
+static unsigned int
+ip6table_security_hook(unsigned int hook, struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       int (*okfn)(struct sk_buff *))
+{
+	const struct net_device *dev = in ? in : out;
+	const struct net *net = dev_net(dev);
+
+	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_security);
+}
+
+/* Hook registrations returned by xt_hook_link(). */
+static struct nf_hook_ops *sectbl_ops __read_mostly;
+
+/*
+ * Per-namespace setup: build the initial security table and register it.
+ * Returns 0, -ENOMEM when the initial table cannot be allocated, or the
+ * error encoded in the pointer returned by ip6t_register_table().
+ */
+static int __net_init ip6table_security_net_init(struct net *net)
+{
+	struct ip6t_replace *repl = ip6t_alloc_initial_table(&security_table);
+
+	if (!repl)
+		return -ENOMEM;
+
+	net->ipv6.ip6table_security =
+		ip6t_register_table(net, &security_table, repl);
+	/* The replace blob is released whether registration worked or not. */
+	kfree(repl);
+
+	return IS_ERR(net->ipv6.ip6table_security) ?
+	       PTR_ERR(net->ipv6.ip6table_security) : 0;
+}
+
+/* Per-namespace teardown: unregister this namespace's security table. */
+static void __net_exit ip6table_security_net_exit(struct net *net)
+{
+	ip6t_unregister_table(net, net->ipv6.ip6table_security);
+}
+
+/* Per-namespace callbacks registered by ip6table_security_init(). */
+static struct pernet_operations ip6table_security_net_ops = {
+	.init = ip6table_security_net_init,
+	.exit = ip6table_security_net_exit,
+};
+
+/*
+ * Module entry point: register the per-namespace operations and link the
+ * netfilter hooks.  When the hooks cannot be linked the pernet
+ * registration is rolled back and the error is returned.
+ */
+static int __init ip6table_security_init(void)
+{
+	int err = register_pernet_subsys(&ip6table_security_net_ops);
+
+	if (err < 0)
+		return err;
+
+	sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook);
+	if (!IS_ERR(sectbl_ops))
+		return err;
+
+	err = PTR_ERR(sectbl_ops);
+	unregister_pernet_subsys(&ip6table_security_net_ops);
+	return err;
+}
+
+/* Module unload: unlink the hooks, then drop the per-namespace operations. */
+static void __exit ip6table_security_fini(void)
+{
+	xt_hook_unlink(&security_table, sectbl_ops);
+	unregister_pernet_subsys(&ip6table_security_net_ops);
+}
+
+module_init(ip6table_security_init);
+module_exit(ip6table_security_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
new file mode 100644
index 00000000..4111050a
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (C)2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ *	Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netfilter/nf_log.h>
+
+static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+			      struct nf_conntrack_tuple *tuple)
+{
+	u_int32_t _addrs[8];	/* first 4 words -> src, next 4 words -> dst */
+	const u_int32_t *ap = skb_header_pointer(skb,
+				nhoff + offsetof(struct ipv6hdr, saddr),
+				sizeof(_addrs), _addrs);
+
+	if (!ap)
+		return false;
+
+	memcpy(tuple->src.u3.ip6, &ap[0], sizeof(tuple->src.u3.ip6));
+	memcpy(tuple->dst.u3.ip6, &ap[4], sizeof(tuple->dst.u3.ip6));
+
+	return true;
+}
+
+static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
+			      const struct nf_conntrack_tuple *orig)
+{	/* inverse tuple = orig with src/dst addresses swapped; always returns true */
+	memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
+	memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
+
+	return true;
+}
+
+static int ipv6_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{	/* writes "src=<addr> dst=<addr> " to s; returns seq_printf()'s result */
+	return seq_printf(s, "src=%pI6 dst=%pI6 ",
+			  tuple->src.u3.ip6, tuple->dst.u3.ip6);
+}
+
+/*
+ * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c
+ *
+ * This function parses the (possibly truncated) extension header chain
+ * of length "len" that begins at offset "start" in "skb". "nexthdrp"
+ * initially points to the type of the first header.
+ *
+ * It skips all well-known exthdrs and returns the offset of the start
+ * of the unparsable area, i.e. the first header with an unknown type.
+ * On success, *nexthdrp is updated to the type/protocol of that header.
+ *
+ * NOTES: - it may return an offset pointing beyond the end of the packet,
+ *          if the last recognized header is truncated in the middle.
+ *        - if the packet is truncated, so that all parsed headers are
+ *          skipped, it returns -1.
+ *        - if packet is fragmented, returns offset of the fragment header.
+ *        - ESP is unparsable for now and considered like
+ *          normal payload protocol.
+ *        - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ */
+
+static int nf_ct_ipv6_skip_exthdr(const struct sk_buff *skb, int start,
+				  u8 *nexthdrp, int len)
+{
+	u8 nexthdr = *nexthdrp;
+
+	while (ipv6_ext_hdr(nexthdr)) {
+		struct ipv6_opt_hdr hdr;
+		int hdrlen;
+
+		if (len < (int)sizeof(struct ipv6_opt_hdr))
+			return -1;	/* not enough data left for another header */
+		if (nexthdr == NEXTHDR_NONE)
+			break;
+		if (nexthdr == NEXTHDR_FRAGMENT)
+			break;		/* caller sees *nexthdrp == NEXTHDR_FRAGMENT */
+		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+			BUG();
+		if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hdr.hdrlen+2)<<2;	/* AUTH: (hdrlen + 2) * 4 bytes */
+		else
+			hdrlen = ipv6_optlen(&hdr);
+
+		nexthdr = hdr.nexthdr;
+		len -= hdrlen;
+		start += hdrlen;
+	}
+
+	*nexthdrp = nexthdr;
+	return start;
+}
+
+static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			    unsigned int *dataoff, u_int8_t *protonum)
+{
+	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
+	unsigned char pnum;
+	int protoff;
+
+	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
+			  &pnum, sizeof(pnum)) != 0) {
+		pr_debug("ip6_conntrack_core: can't get nexthdr\n");
+		return -NF_ACCEPT;
+	}
+	protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, skb->len - extoff);
+	/*
+	 * (protoff == skb->len) means that the packet has no data
+	 * except the IPv6 & ext headers, but it's tracked anyway. - YK
+	 */
+	if ((protoff < 0) || (protoff > skb->len)) {
+		pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
+		return -NF_ACCEPT;
+	}
+
+	*dataoff = protoff;	/* offset of the first non-skipped header */
+	*protonum = pnum;	/* its type, as updated by the skip helper */
+	return NF_ACCEPT;
+}
+
+static unsigned int ipv6_confirm(unsigned int hooknum,
+				 struct sk_buff *skb,
+				 const struct net_device *in,
+				 const struct net_device *out,
+				 int (*okfn)(struct sk_buff *))
+{
+	struct nf_conn *ct;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_helper *helper;
+	enum ip_conntrack_info ctinfo;
+	unsigned int ret, protoff;
+	unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
+	unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+
+
+	/* This is where we call the helper: as the packet goes out. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		goto out;
+
+	help = nfct_help(ct);
+	if (!help)
+		goto out;
+	/* rcu_read_lock()ed by nf_hook_slow */
+	helper = rcu_dereference(help->helper);
+	if (!helper)
+		goto out;
+
+	protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum,
+					 skb->len - extoff);
+	if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) {	/* a -1 result wraps (unsigned) and is caught too */
+		pr_debug("proto header not found\n");
+		return NF_ACCEPT;	/* accepted without helper and without confirm */
+	}
+
+	ret = helper->help(skb, protoff, ct, ctinfo);
+	if (ret != NF_ACCEPT) {
+		nf_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, NULL,
+			      "nf_ct_%s: dropping packet", helper->name);
+		return ret;
+	}
+out:
+	/* We've seen it coming out the other side: confirm it */
+	return nf_conntrack_confirm(skb);
+}
+
+static unsigned int __ipv6_conntrack_in(struct net *net,
+					unsigned int hooknum,
+					struct sk_buff *skb,
+					int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *reasm = skb->nfct_reasm;
+
+	/* This packet is a fragment and has a reassembled packet attached. */
+	if (reasm) {
+		/* Has the reassembled packet not been tracked yet? */
+		if (!reasm->nfct) {
+			unsigned int ret;
+
+			ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+		nf_conntrack_get(reasm->nfct);	/* extra reference for this fragment */
+		skb->nfct = reasm->nfct;	/* fragment shares the reassembled packet's entry */
+		skb->nfctinfo = reasm->nfctinfo;
+		return NF_ACCEPT;
+	}
+
+	return nf_conntrack_in(net, PF_INET6, hooknum, skb);
+}
+
+static unsigned int ipv6_conntrack_in(unsigned int hooknum,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn);	/* namespace of the input device */
+}
+
+static unsigned int ipv6_conntrack_local(unsigned int hooknum,
+					 struct sk_buff *skb,
+					 const struct net_device *in,
+					 const struct net_device *out,
+					 int (*okfn)(struct sk_buff *))
+{
+	/* root is playing with raw sockets: packet shorter than an IPv6 header. */
+	if (skb->len < sizeof(struct ipv6hdr)) {
+		if (net_ratelimit())
+			pr_notice("ipv6_conntrack_local: packet too short\n");
+		return NF_ACCEPT;	/* accepted, but not tracked */
+	}
+	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn);	/* namespace of the output device */
+}
+
+static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
+	{
+		.hook		= ipv6_conntrack_in,	/* track packets at PRE_ROUTING */
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv6_conntrack_local,	/* track packets at LOCAL_OUT */
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv6_confirm,		/* helper + confirm at POST_ROUTING */
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP6_PRI_LAST,
+	},
+	{
+		.hook		= ipv6_confirm,		/* helper + confirm at LOCAL_IN */
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_LAST-1,
+	},
+};
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *tuple)
+{	/* adds both addresses as netlink attributes; returns 0, or -1 via nla_put_failure */
+	NLA_PUT(skb, CTA_IP_V6_SRC, sizeof(u_int32_t) * 4,
+		&tuple->src.u3.ip6);
+	NLA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4,
+		&tuple->dst.u3.ip6);
+	return 0;
+
+nla_put_failure:	/* NOTE(review): presumably the NLA_PUT macro jumps here on failure - confirm */
+	return -1;
+}
+
+static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = {
+	[CTA_IP_V6_SRC]	= { .len = sizeof(u_int32_t)*4 },	/* four 32-bit words */
+	[CTA_IP_V6_DST]	= { .len = sizeof(u_int32_t)*4 },	/* four 32-bit words */
+};
+
+static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
+				struct nf_conntrack_tuple *t)
+{	/* fills t's addresses from the attributes; both must be present */
+	if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
+		return -EINVAL;
+
+	memcpy(&t->src.u3.ip6, nla_data(tb[CTA_IP_V6_SRC]),
+	       sizeof(u_int32_t) * 4);
+	memcpy(&t->dst.u3.ip6, nla_data(tb[CTA_IP_V6_DST]),
+	       sizeof(u_int32_t) * 4);
+
+	return 0;
+}
+
+static int ipv6_nlattr_tuple_size(void)
+{
+	return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1);	/* size derived from the policy table */
+}
+#endif
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
+	.l3proto		= PF_INET6,
+	.name			= "ipv6",
+	.pkt_to_tuple		= ipv6_pkt_to_tuple,	/* addresses from packet */
+	.invert_tuple		= ipv6_invert_tuple,	/* swap src/dst */
+	.print_tuple		= ipv6_print_tuple,
+	.get_l4proto		= ipv6_get_l4proto,	/* skip extension headers */
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= ipv6_tuple_to_nlattr,
+	.nlattr_tuple_size	= ipv6_nlattr_tuple_size,
+	.nlattr_to_tuple	= ipv6_nlattr_to_tuple,
+	.nla_policy		= ipv6_nla_policy,
+#endif
+	.me			= THIS_MODULE,
+};
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
+
+static int __init nf_conntrack_l3proto_ipv6_init(void)
+{	/* registers tcp6, udp6, icmpv6, the ipv6 l3proto, then the hooks; unwinds in reverse on error */
+	int ret = 0;
+
+	need_conntrack();
+	nf_defrag_ipv6_enable();
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv6: can't register tcp.\n");
+		return ret;	/* nothing registered yet, nothing to undo */
+	}
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv6: can't register udp.\n");
+		goto cleanup_tcp;
+	}
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv6: can't register icmpv6.\n");
+		goto cleanup_udp;
+	}
+
+	ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6);
+	if (ret < 0) {
+		pr_err("nf_conntrack_ipv6: can't register ipv6\n");
+		goto cleanup_icmpv6;
+	}
+
+	ret = nf_register_hooks(ipv6_conntrack_ops,
+				ARRAY_SIZE(ipv6_conntrack_ops));
+	if (ret < 0) {	/* these are the conntrack hooks, not a defrag hook */
+		pr_err("nf_conntrack_ipv6: can't register conntrack "
+		       "hooks.\n");
+		goto cleanup_ipv6;
+	}
+	return ret;
+
+ cleanup_ipv6:
+	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+ cleanup_icmpv6:
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+ cleanup_udp:
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+ cleanup_tcp:
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+	return ret;
+}
+
+static void __exit nf_conntrack_l3proto_ipv6_fini(void)
+{	/* unregisters everything in the reverse order of module init */
+	synchronize_net();
+	nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops));
+	nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
+}
+
+module_init(nf_conntrack_l3proto_ipv6_init);
+module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
new file mode 100644
index 00000000..7c05e7ea
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ *	Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ;	/* 30 seconds, in jiffies */
+
+static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
+				unsigned int dataoff,
+				struct nf_conntrack_tuple *tuple)
+{
+	struct icmp6hdr _hdr;
+	const struct icmp6hdr *hp =
+		skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+
+	if (!hp)
+		return false;
+	tuple->src.u.icmp.id = hp->icmp6_identifier;
+	tuple->dst.u.icmp.type = hp->icmp6_type;
+	tuple->dst.u.icmp.code = hp->icmp6_code;
+
+	return true;
+}
+
+/* Indexed by (type - 128): inverse type plus 1; 0 (unset slot) means no inverse. */
+static const u_int8_t invmap[] = {
+	[ICMPV6_ECHO_REQUEST - 128]	= ICMPV6_ECHO_REPLY + 1,
+	[ICMPV6_ECHO_REPLY - 128]	= ICMPV6_ECHO_REQUEST + 1,
+	[ICMPV6_NI_QUERY - 128]		= ICMPV6_NI_REPLY + 1,
+	[ICMPV6_NI_REPLY - 128]		= ICMPV6_NI_QUERY +1
+};
+
+static const u_int8_t noct_valid_new[] = {	/* indexed by (type - 130); 1 = accept as untracked */
+	[ICMPV6_MGM_QUERY - 130] = 1,
+	[ICMPV6_MGM_REPORT -130] = 1,
+	[ICMPV6_MGM_REDUCTION - 130] = 1,
+	[NDISC_ROUTER_SOLICITATION - 130] = 1,
+	[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
+	[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
+	[NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
+	[ICMPV6_MLD2_REPORT - 130] = 1
+};
+
+static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
+				const struct nf_conntrack_tuple *orig)
+{
+	int idx = orig->dst.u.icmp.type - 128;
+	if (idx < 0 || idx >= sizeof(invmap) || invmap[idx] == 0)
+		return false;	/* type has no inverse in invmap */
+
+	tuple->dst.u.icmp.type = invmap[idx] - 1;
+	tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+	tuple->src.u.icmp.id   = orig->src.u.icmp.id;
+	return true;
+}
+
+/* Print the per-protocol part of the tuple: type, code and id (id via ntohs()). */
+static int icmpv6_print_tuple(struct seq_file *s,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "type=%u code=%u id=%u ",
+			  tuple->dst.u.icmp.type,
+			  tuple->dst.u.icmp.code,
+			  ntohs(tuple->src.u.icmp.id));
+}
+
+/* Refreshes the conntrack timeout for this packet; always returns NF_ACCEPT. */
+static int icmpv6_packet(struct nf_conn *ct,
+		       const struct sk_buff *skb,
+		       unsigned int dataoff,
+		       enum ip_conntrack_info ctinfo,
+		       u_int8_t pf,
+		       unsigned int hooknum)
+{
+	/* Do not immediately delete the connection after the first
+	   successful reply to avoid excessive conntrackd traffic
+	   and also to handle correctly ICMP echo reply duplicates. */
+	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout);
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found; false rejects it. */
+static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
+		       unsigned int dataoff)
+{
+	static const u_int8_t valid_new[] = {	/* indexed by (type - 128) */
+		[ICMPV6_ECHO_REQUEST - 128] = 1,
+		[ICMPV6_NI_QUERY - 128] = 1
+	};
+	int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
+
+	if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
+		/* Can't create a new ICMPv6 `conn' with this. */
+		pr_debug("icmpv6: can't create new conn with type %u\n",
+			 type + 128);
+		nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
+		if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6))
+			nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_icmpv6: invalid new with type %d ",
+				      type + 128);
+		return false;
+	}
+	return true;
+}
+
+static int
+icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
+		     struct sk_buff *skb,
+		     unsigned int icmp6off,
+		     enum ip_conntrack_info *ctinfo,
+		     unsigned int hooknum)
+{
+	struct nf_conntrack_tuple intuple, origtuple;
+	const struct nf_conntrack_tuple_hash *h;
+	const struct nf_conntrack_l4proto *inproto;
+	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+
+	NF_CT_ASSERT(skb->nfct == NULL);
+
+	/* Are they talking about one of our connections? */
+	if (!nf_ct_get_tuplepr(skb,
+			       skb_network_offset(skb)
+				+ sizeof(struct ipv6hdr)
+				+ sizeof(struct icmp6hdr),
+			       PF_INET6, &origtuple)) {
+		pr_debug("icmpv6_error: Can't get tuple\n");
+		return -NF_ACCEPT;
+	}
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
+
+	/* Ordinarily, we'd expect the inverted tupleproto, but it's
+	   been preserved inside the ICMP. */
+	if (!nf_ct_invert_tuple(&intuple, &origtuple,
+				&nf_conntrack_l3proto_ipv6, inproto)) {
+		pr_debug("icmpv6_error: Can't invert tuple\n");
+		return -NF_ACCEPT;
+	}
+
+	*ctinfo = IP_CT_RELATED;
+
+	h = nf_conntrack_find_get(net, zone, &intuple);
+	if (!h) {
+		pr_debug("icmpv6_error: no match\n");
+		return -NF_ACCEPT;
+	} else {
+		if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+			*ctinfo += IP_CT_IS_REPLY;	/* related, in reply direction */
+	}
+
+	/* Update skb to refer to the connection that was found */
+	skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return NF_ACCEPT;
+}
+
+static int
+icmpv6_error(struct net *net, struct nf_conn *tmpl,
+	     struct sk_buff *skb, unsigned int dataoff,
+	     enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
+{	/* validates the ICMPv6 header, marks listed types untracked, matches error messages */
+	const struct icmp6hdr *icmp6h;
+	struct icmp6hdr _ih;
+	int type;
+
+	icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
+	if (icmp6h == NULL) {
+		if (LOG_INVALID(net, IPPROTO_ICMPV6))
+			nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_icmpv6: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
+		if (LOG_INVALID(net, IPPROTO_ICMPV6))
+			nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_icmpv6: ICMPv6 checksum failed ");
+		return -NF_ACCEPT;
+	}
+
+	type = icmp6h->icmp6_type - 130;	/* noct_valid_new is indexed from 130 */
+	if (type >= 0 && type < sizeof(noct_valid_new) &&
+	    noct_valid_new[type]) {
+		skb->nfct = &nf_ct_untracked_get()->ct_general;
+		skb->nfctinfo = IP_CT_NEW;
+		nf_conntrack_get(skb->nfct);
+		return NF_ACCEPT;
+	}
+
+	/* Types >= 128 are not error messages: nothing more to do here. */
+	if (icmp6h->icmp6_type >= 128)
+		return NF_ACCEPT;
+
+	return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum);
+}
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+static int icmpv6_tuple_to_nlattr(struct sk_buff *skb,
+				  const struct nf_conntrack_tuple *t)
+{	/* adds id, type and code as netlink attributes; returns 0, or -1 via nla_put_failure */
+	NLA_PUT_BE16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id);
+	NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type);
+	NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code);
+
+	return 0;
+
+nla_put_failure:	/* NOTE(review): presumably the NLA_PUT_* macros jump here on failure - confirm */
+	return -1;
+}
+
+static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = {
+	[CTA_PROTO_ICMPV6_TYPE]	= { .type = NLA_U8 },
+	[CTA_PROTO_ICMPV6_CODE]	= { .type = NLA_U8 },
+	[CTA_PROTO_ICMPV6_ID]	= { .type = NLA_U16 },	/* read with nla_get_be16() */
+};
+
+static int icmpv6_nlattr_to_tuple(struct nlattr *tb[],
+				struct nf_conntrack_tuple *tuple)
+{	/* fills type, code and id from the attributes; all three must be present */
+	if (!tb[CTA_PROTO_ICMPV6_TYPE] ||
+	    !tb[CTA_PROTO_ICMPV6_CODE] ||
+	    !tb[CTA_PROTO_ICMPV6_ID])
+		return -EINVAL;
+
+	tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]);
+	tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]);
+	tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]);
+
+	if (tuple->dst.u.icmp.type < 128 ||
+	    tuple->dst.u.icmp.type - 128 >= sizeof(invmap) ||
+	    !invmap[tuple->dst.u.icmp.type - 128])
+		return -EINVAL;	/* only types that have an inverse in invmap are accepted */
+
+	return 0;
+}
+
+static int icmpv6_nlattr_tuple_size(void)
+{
+	return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1);	/* size derived from the policy table */
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *icmpv6_sysctl_header;
+static struct ctl_table icmpv6_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_icmpv6_timeout",
+		.data		= &nf_ct_icmpv6_timeout,	/* initialised to 30*HZ */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }	/* empty terminating entry */
+};
+#endif /* CONFIG_SYSCTL */
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
+{
+	.l3proto		= PF_INET6,
+	.l4proto		= IPPROTO_ICMPV6,
+	.name			= "icmpv6",
+	.pkt_to_tuple		= icmpv6_pkt_to_tuple,	/* type/code/id from header */
+	.invert_tuple		= icmpv6_invert_tuple,	/* request <-> reply via invmap */
+	.print_tuple		= icmpv6_print_tuple,
+	.packet			= icmpv6_packet,	/* refreshes the timeout */
+	.new			= icmpv6_new,		/* only echo request / NI query */
+	.error			= icmpv6_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= icmpv6_tuple_to_nlattr,
+	.nlattr_tuple_size	= icmpv6_nlattr_tuple_size,
+	.nlattr_to_tuple	= icmpv6_nlattr_to_tuple,
+	.nla_policy		= icmpv6_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_header	= &icmpv6_sysctl_header,
+	.ctl_table		= icmpv6_sysctl_table,
+#endif
+};
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
new file mode 100644
index 00000000..08572726
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -0,0 +1,651 @@
+/*
+ * IPv6 fragment reassembly for connection tracking
+ *
+ * Copyright (C)2004 USAGI/WIDE Project
+ *
+ * Author:
+ *	Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on: net/ipv6/reassembly.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+#include <net/inet_frag.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <linux/sysctl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+
+struct nf_ct_frag6_skb_cb
+{
+	struct inet6_skb_parm	h;
+	int			offset;	/* fragment offset, set when the skb is queued */
+	struct sk_buff		*orig;	/* freed by nf_skb_free() when non-NULL */
+};
+
+#define NFCT_FRAG6_CB(skb)	((struct nf_ct_frag6_skb_cb*)((skb)->cb))
+
+struct nf_ct_frag6_queue
+{
+	struct inet_frag_queue	q;		/* generic queue; container_of() base */
+
+	__be32			id;		/* fragment id		*/
+	u32			user;
+	struct in6_addr		saddr;
+	struct in6_addr		daddr;
+
+	unsigned int		csum;
+	__u16			nhoffset;	/* nhoff taken from the offset-0 fragment */
+};
+
+static struct inet_frags nf_frags;
+static struct netns_frags nf_init_frags;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table nf_ct_frag6_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_frag6_timeout",
+		.data		= &nf_init_frags.timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_frag6_low_thresh",
+		.data		= &nf_init_frags.low_thresh,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "nf_conntrack_frag6_high_thresh",
+		.data		= &nf_init_frags.high_thresh,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }	/* empty terminating entry */
+};
+
+static struct ctl_table_header *nf_ct_frag6_sysctl_header;
+#endif
+
+static unsigned int nf_hashfn(struct inet_frag_queue *q)
+{
+	/* q is embedded in nf_ct_frag6_queue; recover the outer object. */
+	const struct nf_ct_frag6_queue *fq =
+		container_of(q, struct nf_ct_frag6_queue, q);
+	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr, nf_frags.rnd);
+}
+
+static void nf_skb_free(struct sk_buff *skb)
+{	/* releases the skb referenced by the control block's orig field, if any */
+	if (NFCT_FRAG6_CB(skb)->orig)
+		kfree_skb(NFCT_FRAG6_CB(skb)->orig);
+}
+
+/* Destruction primitives. */
+
+static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
+{
+	inet_frag_put(&fq->q, &nf_frags);	/* thin wrapper over the generic frag queue put */
+}
+
+/* Kill fq entry. It is not destroyed immediately,
+ * because the caller (and possibly others) still hold references to it.
+ */
+static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
+{
+	inet_frag_kill(&fq->q, &nf_frags);
+}
+
+static void nf_ct_frag6_evictor(void)
+{
+	local_bh_disable();	/* the generic evictor runs with bottom halves off */
+	inet_frag_evictor(&nf_init_frags, &nf_frags);
+	local_bh_enable();
+}
+
+static void nf_ct_frag6_expire(unsigned long data)
+{
+	struct inet_frag_queue *q = (struct inet_frag_queue *)data;
+	struct nf_ct_frag6_queue *fq;
+
+	fq = container_of(q, struct nf_ct_frag6_queue, q);
+
+	spin_lock(&fq->q.lock);
+
+	/* Nothing to kill if the queue already completed. */
+	if (!(fq->q.last_in & INET_FRAG_COMPLETE))
+		fq_kill(fq);
+
+	spin_unlock(&fq->q.lock);
+
+	/* Drop a reference on the queue in either case. */
+	fq_put(fq);
+}
+
+/* Creation primitives. */
+
+static __inline__ struct nf_ct_frag6_queue *
+fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst)
+{
+	struct inet_frag_queue *q;
+	struct ip6_create_arg arg;
+	unsigned int hash;
+
+	arg.id = id;
+	arg.user = user;
+	arg.src = src;
+	arg.dst = dst;
+
+	read_lock_bh(&nf_frags.lock);
+	hash = inet6_hash_frag(id, src, dst, nf_frags.rnd);
+
+	q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
+	local_bh_enable();	/* NOTE(review): no read_unlock here - presumably inet_frag_find() drops nf_frags.lock; confirm */
+	if (q == NULL)
+		goto oom;
+
+	return container_of(q, struct nf_ct_frag6_queue, q);
+
+oom:
+	pr_debug("Can't alloc new queue\n");
+	return NULL;
+}
+
+
+static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
+			     const struct frag_hdr *fhdr, int nhoff)
+{	/* inserts skb into fq's offset-ordered fragment chain; returns 0, or -1 on rejection */
+	struct sk_buff *prev, *next;
+	int offset, end;
+
+	if (fq->q.last_in & INET_FRAG_COMPLETE) {
+		pr_debug("Already completed\n");
+		goto err;
+	}
+
+	offset = ntohs(fhdr->frag_off) & ~0x7;
+	end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
+			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
+
+	if ((unsigned int)end > IPV6_MAXPLEN) {
+		pr_debug("offset is too large.\n");
+		return -1;
+	}
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		const unsigned char *nh = skb_network_header(skb);
+		skb->csum = csum_sub(skb->csum,
+				     csum_partial(nh, (u8 *)(fhdr + 1) - nh,
+						  0));
+	}
+
+	/* Is this the final fragment? */
+	if (!(fhdr->frag_off & htons(IP6_MF))) {
+		/* If we already have some bits beyond end
+		 * or have different end, the segment is corrupted.
+		 */
+		if (end < fq->q.len ||
+		    ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) {
+			pr_debug("already received last fragment\n");
+			goto err;
+		}
+		fq->q.last_in |= INET_FRAG_LAST_IN;
+		fq->q.len = end;
+	} else {
+		/* Check if the fragment is rounded to 8 bytes.
+		 * Required by the RFC.
+		 */
+		if (end & 0x7) {
+			/* RFC2460 says always send parameter problem in
+			 * this case. -DaveM
+			 */
+			pr_debug("end of fragment not rounded to 8 bytes.\n");
+			return -1;
+		}
+		if (end > fq->q.len) {
+			/* Some bits beyond end -> corruption. */
+			if (fq->q.last_in & INET_FRAG_LAST_IN) {
+				pr_debug("last packet already reached.\n");
+				goto err;
+			}
+			fq->q.len = end;
+		}
+	}
+
+	if (end == offset)	/* fragment carries no payload bytes */
+		goto err;
+
+	/* Point into the IP datagram 'data' part. */
+	if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
+		pr_debug("queue: message is too short.\n");
+		goto err;
+	}
+	if (pskb_trim_rcsum(skb, end - offset)) {
+		pr_debug("Can't trim\n");
+		goto err;
+	}
+
+	/* Find out which fragments are in front and at the back of us
+	 * in the chain of fragments so far.  We must know where to put
+	 * this fragment, right?
+	 */
+	prev = fq->q.fragments_tail;
+	if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
+		next = NULL;	/* fast path: append after the current tail */
+		goto found;
+	}
+	prev = NULL;
+	for (next = fq->q.fragments; next != NULL; next = next->next) {
+		if (NFCT_FRAG6_CB(next)->offset >= offset)
+			break;	/* bingo! */
+		prev = next;
+	}
+
+found:
+	/* RFC5722, Section 4:
+	 *                                  When reassembling an IPv6 datagram, if
+	 *   one or more its constituent fragments is determined to be an
+	 *   overlapping fragment, the entire datagram (and any constituent
+	 *   fragments, including those not yet received) MUST be silently
+	 *   discarded.
+	 */
+
+	/* Check for overlap with preceding fragment. */
+	if (prev &&
+	    (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset)
+		goto discard_fq;
+
+	/* Look for overlap with succeeding segment. */
+	if (next && NFCT_FRAG6_CB(next)->offset < end)
+		goto discard_fq;
+
+	NFCT_FRAG6_CB(skb)->offset = offset;
+
+	/* Insert this fragment in the chain of fragments. */
+	skb->next = next;
+	if (!next)
+		fq->q.fragments_tail = skb;
+	if (prev)
+		prev->next = skb;
+	else
+		fq->q.fragments = skb;
+
+	skb->dev = NULL;
+	fq->q.stamp = skb->tstamp;
+	fq->q.meat += skb->len;
+	atomic_add(skb->truesize, &nf_init_frags.mem);
+
+	/* The first fragment.
+	 * nhoffset is obtained from the first fragment, of course.
+	 */
+	if (offset == 0) {
+		fq->nhoffset = nhoff;
+		fq->q.last_in |= INET_FRAG_FIRST_IN;
+	}
+	write_lock(&nf_frags.lock);
+	list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list);
+	write_unlock(&nf_frags.lock);
+	return 0;
+
+discard_fq:
+	fq_kill(fq);	/* overlap detected: kill the whole queue */
+err:
+	return -1;
+}
+
+/*
+ *	Check if this packet is complete.
+ *	Returns NULL on failure for any reason, otherwise a pointer
+ *	to the head skb of the reassembled frame.
+ *
+ *	It is called with locked fq, and caller must check that
+ *	queue is eligible for reassembly i.e. it is not COMPLETE,
+ *	the last and the first frames arrived and all the bits are here.
+ */
+static struct sk_buff *
+nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+{
+	struct sk_buff *fp, *op, *head = fq->q.fragments;
+	int    payload_len;
+
+	fq_kill(fq);
+
+	WARN_ON(head == NULL);
+	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
+
+	/* Unfragmented part is taken from the first segment. */
+	payload_len = ((head->data - skb_network_header(head)) -
+		       sizeof(struct ipv6hdr) + fq->q.len -
+		       sizeof(struct frag_hdr));
+	if (payload_len > IPV6_MAXPLEN) {
+		pr_debug("payload len is too large.\n");
+		goto out_oversize;
+	}
+
+	/* Head of list must not be cloned. */
+	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) {
+		pr_debug("skb is cloned but can't expand head\n");
+		goto out_oom;
+	}
+
+	/* If the first fragment is fragmented itself, we split
+	 * it to two chunks: the first with data and paged part
+	 * and the second, holding only fragments. */
+	if (skb_has_frag_list(head)) {
+		struct sk_buff *clone;
+		int i, plen = 0;
+
+		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) {
+			pr_debug("Can't alloc skb\n");
+			goto out_oom;
+		}
+		clone->next = head->next;
+		head->next = clone;
+		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+		skb_frag_list_init(head);
+		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+			plen += skb_shinfo(head)->frags[i].size;
+		clone->len = clone->data_len = head->data_len - plen;
+		head->data_len -= clone->len;
+		head->len -= clone->len;
+		clone->csum = 0;
+		clone->ip_summed = head->ip_summed;
+
+		NFCT_FRAG6_CB(clone)->orig = NULL;	/* marks the clone for the orig-linking loop below */
+		atomic_add(clone->truesize, &nf_init_frags.mem);
+	}
+
+	/* We have to remove fragment header from datagram and to relocate
+	 * header in order to calculate ICV correctly. */
+	skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0];
+	memmove(head->head + sizeof(struct frag_hdr), head->head,
+		(head->data - head->head) - sizeof(struct frag_hdr));
+	head->mac_header += sizeof(struct frag_hdr);
+	head->network_header += sizeof(struct frag_hdr);
+
+	skb_shinfo(head)->frag_list = head->next;
+	skb_reset_transport_header(head);
+	skb_push(head, head->data - skb_network_header(head));
+
+	for (fp=head->next; fp; fp = fp->next) {
+		head->data_len += fp->len;
+		head->len += fp->len;
+		if (head->ip_summed != fp->ip_summed)
+			head->ip_summed = CHECKSUM_NONE;
+		else if (head->ip_summed == CHECKSUM_COMPLETE)
+			head->csum = csum_add(head->csum, fp->csum);
+		head->truesize += fp->truesize;
+	}
+	atomic_sub(head->truesize, &nf_init_frags.mem);
+
+	head->next = NULL;
+	head->dev = dev;
+	head->tstamp = fq->q.stamp;
+	ipv6_hdr(head)->payload_len = htons(payload_len);
+
+	/* Yes, and fold redundant checksum back. 8) */
+	if (head->ip_summed == CHECKSUM_COMPLETE)
+		head->csum = csum_partial(skb_network_header(head),
+					  skb_network_header_len(head),
+					  head->csum);
+
+	fq->q.fragments = NULL;
+	fq->q.fragments_tail = NULL;
+
+	/* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
+	fp = skb_shinfo(head)->frag_list;
+	if (fp && NFCT_FRAG6_CB(fp)->orig == NULL)
+		/* at above code, head skb is divided into two skbs. */
+		fp = fp->next;
+
+	op = NFCT_FRAG6_CB(head)->orig;
+	for (; fp; fp = fp->next) {
+		struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;
+
+		op->next = orig;
+		op = orig;
+		NFCT_FRAG6_CB(fp)->orig = NULL;
+	}
+
+	return head;
+
+out_oversize:
+	if (net_ratelimit())
+		printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len);
+	goto out_fail;
+out_oom:
+	if (net_ratelimit())
+		printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n");
+out_fail:
+	return NULL;
+}
+
+/*
+ * Find the header just before the Fragment Header.
+ *
+ * Returns 0 on success and sets ...
+ * (*prevhdrp): the type of the header just before the Fragment Header
+ *		(NEXTHDR_IPV6 when no extension header precedes it).
+ * (*prevhoff): the offset of the "Next Header" field that announces the
+ *		Fragment Header; an int, as chains can run past 255 bytes.
+ * (*fhoff)   : the offset of the Fragment Header.
+ * Returns -1 when no Fragment Header is reached or the packet is too short.
+ *
+ * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c
+ */
+static int
+find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
+{
+	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+	const int netoff = skb_network_offset(skb);
+	int prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr);
+	int start = netoff + sizeof(struct ipv6hdr);
+	int len = skb->len - start;	/* bytes left after the header at start */
+	u8 prevhdr = NEXTHDR_IPV6;
+
+	while (nexthdr != NEXTHDR_FRAGMENT) {
+		struct ipv6_opt_hdr hdr;
+		int hdrlen;
+
+		if (!ipv6_ext_hdr(nexthdr)) {
+			return -1;	/* reached a non-extension header */
+		}
+		if (nexthdr == NEXTHDR_NONE) {
+			pr_debug("next header is none\n");
+			return -1;
+		}
+		if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+			pr_debug("too short\n");
+			return -1;
+		}
+		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+			BUG();
+		if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hdr.hdrlen+2)<<2;	/* AUTH counts 4-byte units */
+		else
+			hdrlen = ipv6_optlen(&hdr);
+
+		prevhdr = nexthdr;
+		prev_nhoff = start;	/* may exceed 255, hence not a u8 */
+
+		nexthdr = hdr.nexthdr;
+		len -= hdrlen;
+		start += hdrlen;
+	}
+
+	if (len < 0)	/* the last header claimed more than the skb holds */
+		return -1;
+
+	*prevhdrp = prevhdr;
+	*prevhoff = prev_nhoff;
+	*fhoff = start;
+
+	return 0;
+}
+
+struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
+{	/* returns skb itself (pass it on), NULL, or the reassembled head skb */
+	struct sk_buff *clone;
+	struct net_device *dev = skb->dev;
+	struct frag_hdr *fhdr;
+	struct nf_ct_frag6_queue *fq;
+	struct ipv6hdr *hdr;
+	int fhoff, nhoff;
+	u8 prevhdr;
+	struct sk_buff *ret_skb = NULL;	/* stays NULL unless reassembly completes */
+
+	/* Jumbo payload inhibits frag. header: return the skb untouched */
+	if (ipv6_hdr(skb)->payload_len == 0) {
+		pr_debug("payload len = 0\n");
+		return skb;
+	}
+
+	if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
+		return skb;	/* no Fragment Header found: nothing to gather */
+
+	clone = skb_clone(skb, GFP_ATOMIC);	/* the clone is what gets queued */
+	if (clone == NULL) {
+		pr_debug("Can't clone skb\n");
+		return skb;
+	}
+
+	NFCT_FRAG6_CB(clone)->orig = skb;	/* keep a link back to the original */
+
+	if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
+		pr_debug("message is too short.\n");
+		goto ret_orig;
+	}
+
+	skb_set_transport_header(clone, fhoff);	/* transport header = frag header */
+	hdr = ipv6_hdr(clone);
+	fhdr = (struct frag_hdr *)skb_transport_header(clone);
+
+	if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh)
+		nf_ct_frag6_evictor();	/* memory use is above the high threshold */
+
+	fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr);
+	if (fq == NULL) {
+		pr_debug("Can't find and can't create new queue\n");
+		goto ret_orig;
+	}
+
+	spin_lock_bh(&fq->q.lock);	/* queueing and reassembly run under this lock */
+
+	if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
+		spin_unlock_bh(&fq->q.lock);
+		pr_debug("Can't insert skb to queue\n");
+		fq_put(fq);
+		goto ret_orig;
+	}
+
+	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    fq->q.meat == fq->q.len) {	/* both ends seen; presumably no gaps left */
+		ret_skb = nf_ct_frag6_reasm(fq, dev);
+		if (ret_skb == NULL)
+			pr_debug("Can't reassemble fragmented packets\n");
+	}
+	spin_unlock_bh(&fq->q.lock);
+
+	fq_put(fq);
+	return ret_skb;	/* NULL here also covers a failed reassembly */
+
+ret_orig:
+	kfree_skb(clone);	/* only the clone is freed; skb goes back to the caller */
+	return skb;
+}
+
+void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
+			struct net_device *in, struct net_device *out,
+			int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *frag = NFCT_FRAG6_CB(skb)->orig;
+
+	/* Send each original fragment on, made to reference the reassembled skb. */
+	while (frag) {
+		struct sk_buff *following = frag->next;
+
+		nf_conntrack_put_reasm(frag->nfct_reasm);
+		nf_conntrack_get_reasm(skb);
+		frag->nfct_reasm = skb;
+		frag->next = NULL;	/* unlink before handing it to the hooks */
+		NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, frag, in, out, okfn,
+			       NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
+		frag = following;
+	}
+	nf_conntrack_put_reasm(skb);
+}
+
+int nf_ct_frag6_init(void)
+{
+	/* Fill in both descriptors completely before either init call runs. */
+	nf_init_frags.timeout = IPV6_FRAG_TIMEOUT;
+	nf_init_frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+	nf_init_frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	nf_frags.qsize = sizeof(struct nf_ct_frag6_queue);
+	nf_frags.secret_interval = 10 * 60 * HZ;
+	nf_frags.hashfn = nf_hashfn;
+	nf_frags.match = ip6_frag_match;
+	nf_frags.constructor = ip6_frag_init;
+	nf_frags.destructor = NULL;
+	nf_frags.skb_free = nf_skb_free;
+	nf_frags.frag_expire = nf_ct_frag6_expire;
+	inet_frags_init_net(&nf_init_frags);
+	inet_frags_init(&nf_frags);
+
+#ifdef CONFIG_SYSCTL
+	nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
+							  nf_ct_frag6_sysctl_table);
+	if (nf_ct_frag6_sysctl_header == NULL) {
+		inet_frags_fini(&nf_frags);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+void nf_ct_frag6_cleanup(void)	/* teardown counterpart of nf_ct_frag6_init() */
+{
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(nf_ct_frag6_sysctl_header);
+	nf_ct_frag6_sysctl_header = NULL;
+#endif
+	inet_frags_fini(&nf_frags);
+
+	nf_init_frags.low_thresh = 0;	/* drop the low threshold to zero ... */
+	nf_ct_frag6_evictor();	/* ... presumably so this evicts every queue - confirm */
+}
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 00000000..cdd6d045
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,137 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+						struct sk_buff *skb)
+{
+	/* The reassembly "user" id is a per-direction base plus the ct zone. */
+	u16 ct_zone = NF_CT_DEFAULT_ZONE;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (skb->nfct != NULL)
+		ct_zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+	/* Packets flagged BRNF_NF_BRIDGE_PREROUTING use the bridge-in base. */
+	if (skb->nf_bridge != NULL &&
+	    (skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) != 0)
+		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + ct_zone;
+#endif
+	if (hooknum != NF_INET_PRE_ROUTING)
+		return IP6_DEFRAG_CONNTRACK_OUT + ct_zone;
+
+	return IP6_DEFRAG_CONNTRACK_IN + ct_zone;
+}
+
+static unsigned int ipv6_defrag(unsigned int hooknum,
+				struct sk_buff *skb,
+				const struct net_device *in,
+				const struct net_device *out,
+				int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *whole;
+	u32 user;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	/* A non-template conntrack is attached: previously seen (loopback). */
+	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+		return NF_ACCEPT;
+#endif
+
+	user = nf_ct6_defrag_user(hooknum, skb);
+	whole = nf_ct_frag6_gather(skb, user);
+
+	/* Same skb handed back: not fragmented, or an error occurred. */
+	if (whole == skb)
+		return NF_ACCEPT;
+
+	/* Reassembly finished: pass the original fragments on through the hooks. */
+	if (whole != NULL)
+		nf_ct_frag6_output(hooknum, whole, (struct net_device *)in,
+				   (struct net_device *)out, okfn);
+	return NF_STOLEN;	/* a NULL result means the fragment was queued */
+}
+
+static struct nf_hook_ops ipv6_defrag_ops[] = {	/* ipv6_defrag on two IPv6 hooks */
+	{
+		.hook		= ipv6_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
+	},
+	{	/* same hook function and priority, on LOCAL_OUT */
+		.hook		= ipv6_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
+	},
+};
+
+static int __init nf_defrag_init(void)
+{
+	int err;
+
+	/* Set up the fragment queues before any hook can run. */
+	err = nf_ct_frag6_init();
+	if (err < 0) {
+		pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+		return err;
+	}
+
+	err = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+	if (err >= 0)
+		return err;
+
+	/* Hook registration failed: undo the frag6 setup before bailing out. */
+	pr_err("nf_defrag_ipv6: can't register hooks\n");
+	nf_ct_frag6_cleanup();
+
+	return err;
+}
+
+static void __exit nf_defrag_fini(void)	/* reverse of nf_defrag_init() */
+{
+	nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));	/* hooks first */
+	nf_ct_frag6_cleanup();	/* then the frag6 state they used */
+}
+
+void nf_defrag_ipv6_enable(void)	/* empty body, only exported */
+{	/* NOTE(review): presumably exists so callers depend on this module - confirm */
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
new file mode 100644
index 00000000..18ff5df7
--- /dev/null
+++ b/net/ipv6/proc.c
@@ -0,0 +1,342 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		This file implements the various access functions for the
+ *		PROC file system.  This is very similar to the IPv4 version,
+ *		except it reports the sockets in the INET6 address family.
+ *
+ * Authors:	David S. Miller (davem@caip.rutgers.edu)
+ * 		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+
+static int sockstat6_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *ns = seq->private;	/* private data is the struct net */
+
+	seq_printf(seq, "TCP6: inuse %d\n",
+		   sock_prot_inuse_get(ns, &tcpv6_prot));
+	seq_printf(seq, "UDP6: inuse %d\n",
+		   sock_prot_inuse_get(ns, &udpv6_prot));
+	seq_printf(seq, "UDPLITE6: inuse %d\n",
+		   sock_prot_inuse_get(ns, &udplitev6_prot));
+	seq_printf(seq, "RAW6: inuse %d\n",
+		   sock_prot_inuse_get(ns, &rawv6_prot));
+	seq_printf(seq, "FRAG6: inuse %d memory %d\n",
+		   ip6_frag_nqueues(ns), ip6_frag_mem(ns));
+	return 0;
+}
+
+static int sockstat6_seq_open(struct inode *inode, struct file *file)
+{	/* open handler of sockstat6_seq_fops */
+	return single_open_net(inode, file, sockstat6_seq_show);	/* show callback */
+}
+
+static const struct file_operations sockstat6_seq_fops = {	/* used for "sockstat6" */
+	.owner	 = THIS_MODULE,
+	.open	 = sockstat6_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,	/* pairs with single_open_net() in open */
+};
+
+static const struct snmp_mib snmp6_ipstats_list[] = {
+/* ipv6 mib according to RFC 2465; entries are printed in this order */
+	SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS),
+	SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+	SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS),
+	SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES),
+	SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+	SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+	SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+	SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS),
+	SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS),
+	SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+	SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS),
+	SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+	SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+	SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+	SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS),
+	SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS),
+	SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS),
+	SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS),
+	SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS),
+	SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES),
+	SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+	SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+	SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS),
+	SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS),
+	SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+	SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+	SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+	SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+	SNMP_MIB_SENTINEL	/* terminates the walk in the show helpers */
+};
+
+static const struct snmp_mib snmp6_icmp6_list[] = {
+/* icmpv6 mib according to RFC 2466: aggregate counters, not per message type */
+	SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS),
+	SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS),
+	SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
+	SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+static const char *const icmp6type2name[256] = {	/* indexed by type; NULL = un-named */
+	[ICMPV6_DEST_UNREACH] = "DestUnreachs",
+	[ICMPV6_PKT_TOOBIG] = "PktTooBigs",
+	[ICMPV6_TIME_EXCEED] = "TimeExcds",
+	[ICMPV6_PARAMPROB] = "ParmProblems",
+	[ICMPV6_ECHO_REQUEST] = "Echos",
+	[ICMPV6_ECHO_REPLY] = "EchoReplies",
+	[ICMPV6_MGM_QUERY] = "GroupMembQueries",
+	[ICMPV6_MGM_REPORT] = "GroupMembResponses",
+	[ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
+	[ICMPV6_MLD2_REPORT] = "MLDv2Reports",
+	[NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
+	[NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
+	[NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
+	[NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
+	[NDISC_REDIRECT] = "Redirects",
+};
+
+
+static const struct snmp_mib snmp6_udp6_list[] = {	/* Udp6* names for UDP_MIB_* */
+	SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp6_udplite6_list[] = {	/* same UDP_MIB_* indices as udp6 */
+	SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+/* Dump the per-type ICMPv6 message counters, reading the percpu mib when
+ * pcpumib != NULL and the shared one (smib) otherwise.
+ */
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, void __percpu **pcpumib,
+				     atomic_long_t *smib)
+{
+	char label[32];
+	unsigned long count;
+	int idx;
+
+	/* Pass 1, deprecated named items; index bit 0x100 selects "Out". */
+	for (idx = 0; idx < ICMP6MSG_MIB_MAX; idx++) {
+		const char *type_name = icmp6type2name[idx & 0xff];
+
+		if (type_name == NULL)	/* un-named types are skipped here */
+			continue;
+
+		snprintf(label, sizeof(label), "Icmp6%s%s",
+			 (idx & 0x100) ? "Out" : "In", type_name);
+		count = pcpumib ? snmp_fold_field(pcpumib, idx) :
+				  atomic_long_read(smib + idx);
+		seq_printf(seq, "%-32s\t%lu\n", label, count);
+	}
+
+	/* Pass 2, ICMPMsgStat format: by type number, non-zero counters only. */
+	for (idx = 0; idx < ICMP6MSG_MIB_MAX; idx++) {
+		count = pcpumib ? snmp_fold_field(pcpumib, idx) :
+				  atomic_long_read(smib + idx);
+		if (count == 0)
+			continue;
+
+		snprintf(label, sizeof(label), "Icmp6%sType%u",
+			 (idx & 0x100) ? "Out" : "In", idx & 0xff);
+		seq_printf(seq, "%-32s\t%lu\n", label, count);
+	}
+}
+
+/* Print every counter named in itemlist (it ends at a NULL name), read from
+ * the percpu mib when pcpumib != NULL and from the shared mib smib otherwise.
+ */
+static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **pcpumib,
+				atomic_long_t *smib,
+				const struct snmp_mib *itemlist)
+{
+	const struct snmp_mib *item;
+
+	for (item = itemlist; item->name != NULL; item++) {
+		unsigned long count = pcpumib ?
+			snmp_fold_field(pcpumib, item->entry) :
+			atomic_long_read(smib + item->entry);
+
+		seq_printf(seq, "%-32s\t%lu\n", item->name, count);
+	}
+}
+
+static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib,
+				  const struct snmp_mib *itemlist, size_t syncpoff)
+{
+	const struct snmp_mib *item;	/* walks itemlist up to the NULL name */
+
+	for (item = itemlist; item->name != NULL; item++)
+		seq_printf(seq, "%-32s\t%llu\n", item->name,
+			   snmp_fold_field64(mib, item->entry, syncpoff));
+}
+
+static int snmp6_seq_show(struct seq_file *seq, void *v)	/* per-netns counters */
+{
+	struct net *net = (struct net *)seq->private;
+
+	snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics,	/* 64-bit fold */
+			    snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics,
+			    NULL, snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq,
+			    (void __percpu **)net->mib.icmpv6msg_statistics, NULL);
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.udp_stats_in6,
+			    NULL, snmp6_udp6_list);
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.udplite_stats_in6,
+			    NULL, snmp6_udplite6_list);
+	return 0;	/* every source above is a percpu mib (smib argument is NULL) */
+}
+
+static int snmp6_seq_open(struct inode *inode, struct file *file)
+{	/* open handler of snmp6_seq_fops */
+	return single_open_net(inode, file, snmp6_seq_show);	/* show callback */
+}
+
+static const struct file_operations snmp6_seq_fops = {	/* used for "snmp6" */
+	.owner	 = THIS_MODULE,
+	.open	 = snmp6_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,	/* pairs with single_open_net() in open */
+};
+
+static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)seq->private;
+
+	seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
+	/* Fold the IP counters as 64-bit, exactly as snmp6_seq_show() does. */
+	snmp6_seq_show_item64(seq, (void __percpu **)idev->stats.ipv6,
+			    snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+	snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, NULL, idev->stats.icmpv6msgdev->mibs);
+	return 0;
+}
+
+static int snmp6_dev_seq_open(struct inode *inode, struct file *file)
+{	/* data is the inet6_dev that snmp6_register_dev() gave to proc_create_data() */
+	return single_open(file, snmp6_dev_seq_show, PDE(inode)->data);
+}
+
+static const struct file_operations snmp6_dev_seq_fops = {	/* per-device files */
+	.owner	 = THIS_MODULE,
+	.open	 = snmp6_dev_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,	/* pairs with single_open() in open */
+};
+
+int snmp6_register_dev(struct inet6_dev *idev)
+{
+	struct proc_dir_entry *parent, *entry;
+
+	if (idev == NULL || idev->dev == NULL)
+		return -EINVAL;
+
+	/* The netns' dev_snmp6 directory has to exist already. */
+	parent = dev_net(idev->dev)->mib.proc_net_devsnmp6;
+	if (parent == NULL)
+		return -ENOENT;
+
+	/* One read-only file per device, named after the device. */
+	entry = proc_create_data(idev->dev->name, S_IRUGO, parent,
+				 &snmp6_dev_seq_fops, idev);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	idev->stats.proc_dir_entry = entry;
+	return 0;
+}
+
+int snmp6_unregister_dev(struct inet6_dev *idev)
+{
+	struct proc_dir_entry *parent = dev_net(idev->dev)->mib.proc_net_devsnmp6;
+	if (parent == NULL)
+		return -ENOENT;
+	if (idev->stats.proc_dir_entry == NULL)
+		return -EINVAL;
+	/* Remove the per-device file; a repeated call then returns -EINVAL. */
+	remove_proc_entry(idev->stats.proc_dir_entry->name, parent);
+	idev->stats.proc_dir_entry = NULL;
+	return 0;
+}
+
+static int __net_init ipv6_proc_init_net(struct net *net)
+{
+	/* Create sockstat6, snmp6 and the dev_snmp6 directory, in that order. */
+	if (!proc_net_fops_create(net, "sockstat6", S_IRUGO,
+			&sockstat6_seq_fops))
+		return -ENOMEM;
+	if (!proc_net_fops_create(net, "snmp6", S_IRUGO, &snmp6_seq_fops))
+		goto proc_snmp6_fail;
+	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+	if (!net->mib.proc_net_devsnmp6)
+		goto proc_dev_snmp6_fail;
+	return 0;
+
+	/* Unwind in reverse order; each label removes only what was created. */
+proc_dev_snmp6_fail:
+	proc_net_remove(net, "snmp6");
+proc_snmp6_fail:
+	proc_net_remove(net, "sockstat6");
+	return -ENOMEM;
+}
+
+static void __net_exit ipv6_proc_exit_net(struct net *net)
+{	/* removes the three entries created by ipv6_proc_init_net() */
+	proc_net_remove(net, "sockstat6");
+	proc_net_remove(net, "dev_snmp6");
+	proc_net_remove(net, "snmp6");
+}
+
+static struct pernet_operations ipv6_proc_ops = {	/* registered by ipv6_misc_proc_init() */
+	.init = ipv6_proc_init_net,
+	.exit = ipv6_proc_exit_net,
+};
+
+int __init ipv6_misc_proc_init(void)
+{	/* returns whatever register_pernet_subsys() returns */
+	return register_pernet_subsys(&ipv6_proc_ops);
+}
+
+void ipv6_misc_proc_exit(void)
+{	/* undoes ipv6_misc_proc_init() */
+	unregister_pernet_subsys(&ipv6_proc_ops);
+}
+
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
new file mode 100644
index 00000000..9a7978fd
--- /dev/null
+++ b/net/ipv6/protocol.c
@@ -0,0 +1,54 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		PF_INET6 protocol dispatch tables.
+ *
+ * Authors:	Pedro Roque	<roque@di.fc.ul.pt>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *      Changes:
+ *
+ *      Vince Laviano (vince@cs.stanford.edu)       16 May 2001
+ *      - Removed unused variable 'inet6_protocol_base'
+ *      - Modified inet6_del_protocol() to correctly maintain copy bit.
+ */
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+/* Registered IPv6 protocol handlers; slot = protocol & (MAX_INET_PROTOS - 1). */
+const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
+
+int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+	const struct inet6_protocol **slot = (const struct inet6_protocol **)
+		&inet6_protos[protocol & (MAX_INET_PROTOS - 1)];
+	/* Claim the slot only if it is still empty; -1 means it was taken. */
+	return cmpxchg(slot, NULL, prot) == NULL ? 0 : -1;
+}
+EXPORT_SYMBOL(inet6_add_protocol);
+
+/*
+ *	Unregister prot: clear its slot only if prot is what currently sits there.
+ *	Returns 0 on success, -1 if the slot held something else.
+ */
+int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+	const struct inet6_protocol **slot = (const struct inet6_protocol **)
+		&inet6_protos[protocol & (MAX_INET_PROTOS - 1)];
+	int err = -1;
+
+	if (cmpxchg(slot, prot, NULL) == prot)
+		err = 0;
+	synchronize_net();	/* runs whether or not the slot was cleared */
+	return err;
+}
+EXPORT_SYMBOL(inet6_del_protocol);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
new file mode 100644
index 00000000..cc7313b8
--- /dev/null
+++ b/net/ipv6/raw.c
@@ -0,0 +1,1370 @@
+/*
+ *	RAW sockets for IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Adapted from linux/net/ipv4/raw.c
+ *
+ *	Fixes:
+ *	Hideaki YOSHIFUJI	:	sin6_scope_id support
+ *	YOSHIFUJI,H.@USAGI	:	raw checksum (RFC2292(bis) compliance)
+ *	Kazunori MIYAZAWA @USAGI:	change process style to use ip6_append_data
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
+#include <linux/compat.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/ip6_route.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#include <net/transp_v6.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/tcp_states.h>
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+#include <net/mip6.h>
+#endif
+#include <linux/mroute6.h>
+
+#include <net/raw.h>
+#include <net/rawv6.h>
+#include <net/xfrm.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static struct raw_hashinfo raw_v6_hashinfo = {	/* raw IPv6 socket chains (.ht) */
+	.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),	/* read-locked by the chain walkers */
+};
+
+static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+		unsigned short num, const struct in6_addr *loc_addr,
+		const struct in6_addr *rmt_addr, int dif)
+{
+	struct hlist_node *node;
+	int mcast_dst = ipv6_addr_is_multicast(loc_addr);
+
+	/* Resume the chain walk at sk; return the first socket that matches. */
+	sk_for_each_from(sk, node) {
+		struct ipv6_pinfo *np;
+
+		if (inet_sk(sk)->inet_num != num)
+			continue;
+		if (!net_eq(sock_net(sk), net))
+			continue;
+
+		np = inet6_sk(sk);
+		/* A socket with a remote address set only matches that peer. */
+		if (!ipv6_addr_any(&np->daddr) &&
+		    !ipv6_addr_equal(&np->daddr, rmt_addr))
+			continue;
+		/* A socket bound to a device only matches that interface. */
+		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+			continue;
+		/* No local address set: any destination is acceptable. */
+		if (ipv6_addr_any(&np->rcv_saddr))
+			return sk;
+		if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+			return sk;
+		if (mcast_dst && inet6_mc_check(sk, loc_addr, rmt_addr))
+			return sk;
+	}
+
+	return NULL;
+}
+
+/*
+ *	Apply the socket's ICMPv6 type filter bitmap to skb.
+ *	Returns 0 to deliver, 1 to block (the bit for the type is set).
+ */
+static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct icmp6hdr *icmph;
+	struct raw6_sock *rp = raw6_sk(sk);
+
+	if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) {
+		__u32 *data = &rp->filter.data[0];
+		int bit_nr;
+
+		icmph = (struct icmp6hdr *) skb->data;
+		bit_nr = icmph->icmp6_type;
+		/* 1U: shifting a signed 1 into bit 31 is undefined behaviour */
+		return (data[bit_nr >> 5] & (1U << (bit_nr & 31))) != 0;
+	}
+	return 0;	/* header could not be pulled: deliver */
+}
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
+
+static mh_filter_t __rcu *mh_filter __read_mostly;	/* NULL when no filter is set */
+
+int rawv6_mh_filter_register(mh_filter_t filter)
+{
+	rcu_assign_pointer(mh_filter, filter);	/* overwrites any earlier filter */
+	return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_register);
+
+int rawv6_mh_filter_unregister(mh_filter_t filter)
+{
+	rcu_assign_pointer(mh_filter, NULL);	/* the filter argument is not compared */
+	synchronize_rcu();	/* called before returning, after the pointer is cleared */
+	return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_unregister);
+
+#endif
+
+/*
+ *	Demultiplex skb to every matching raw socket for nexthdr; returns 1
+ *	if at least one socket matched, else 0.  (Should consider queueing
+ *	the skb in the sock receive_queue without calling rawv6.c)
+ *
+ *	Caller owns SKB so we must make clones.
+ */
+static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+{
+	const struct in6_addr *saddr;
+	const struct in6_addr *daddr;
+	struct sock *sk;
+	int delivered = 0;
+	__u8 hash;
+	struct net *net;
+
+	saddr = &ipv6_hdr(skb)->saddr;
+	daddr = saddr + 1;	/* relies on daddr directly following saddr */
+
+	hash = nexthdr & (MAX_INET_PROTOS - 1);
+
+	read_lock(&raw_v6_hashinfo.lock);	/* held for the whole chain walk */
+	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
+
+	if (sk == NULL)
+		goto out;
+
+	net = dev_net(skb->dev);
+	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
+
+	while (sk) {
+		int filtered;	/* 0: deliver, >0: skip this socket, <0: stop */
+
+		delivered = 1;	/* set on a match, even if the filter then blocks */
+		switch (nexthdr) {
+		case IPPROTO_ICMPV6:
+			filtered = icmpv6_filter(sk, skb);
+			break;
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		case IPPROTO_MH:
+		{
+			/* XXX: To validate MH only once for each packet,
+			 * this is placed here. It should be after checking
+			 * xfrm policy, however it doesn't. The checking xfrm
+			 * policy is placed in rawv6_rcv() because it is
+			 * required for each socket.
+			 */
+			mh_filter_t *filter;
+
+			filter = rcu_dereference(mh_filter);
+			filtered = filter ? (*filter)(sk, skb) : 0;
+			break;
+		}
+#endif
+		default:
+			filtered = 0;
+			break;
+		}
+
+		if (filtered < 0)
+			break;
+		if (filtered == 0) {
+			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+			/* Not releasing hash table! */
+			if (clone) {
+				nf_reset(clone);
+				rawv6_rcv(sk, clone);
+			}
+		}
+		sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
+				     IP6CB(skb)->iif);	/* continue after this socket */
+	}
+out:
+	read_unlock(&raw_v6_hashinfo.lock);
+	return delivered;
+}
+
+int raw6_local_deliver(struct sk_buff *skb, int nexthdr)
+{
+	int slot = nexthdr & (MAX_INET_PROTOS - 1);
+
+	/* Peek without the lock: an empty chain needs no locked walk at all. */
+	if (sk_head(&raw_v6_hashinfo.ht[slot]) == NULL)
+		return 0;
+
+	return ipv6_raw_deliver(skb, nexthdr) != 0;
+}
+
+/* Bind a raw IPv6 socket to a local address.  This cleans up af_inet6 a bit. -DaveM */
+static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+	__be32 v4addr = 0;
+	int addr_type;
+	int err;
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+	addr_type = ipv6_addr_type(&addr->sin6_addr);
+
+	/* Raw sockets are IPv6 only */
+	if (addr_type == IPV6_ADDR_MAPPED)
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+
+	err = -EINVAL;	/* result of the next two failure paths */
+	if (sk->sk_state != TCP_CLOSE)
+		goto out;
+
+	rcu_read_lock();	/* held across dev_get_by_index_rcu() and the address check */
+	/* Check if the address belongs to the host. */
+	if (addr_type != IPV6_ADDR_ANY) {
+		struct net_device *dev = NULL;
+
+		if (addr_type & IPV6_ADDR_LINKLOCAL) {
+			if (addr_len >= sizeof(struct sockaddr_in6) &&
+			    addr->sin6_scope_id) {
+				/* Override any existing binding, if another
+				 * one is supplied by user.
+				 */
+				sk->sk_bound_dev_if = addr->sin6_scope_id;
+			}
+
+			/* Binding to link-local address requires an interface */
+			if (!sk->sk_bound_dev_if)
+				goto out_unlock;	/* err is still -EINVAL here */
+
+			err = -ENODEV;
+			dev = dev_get_by_index_rcu(sock_net(sk),
+						   sk->sk_bound_dev_if);
+			if (!dev)
+				goto out_unlock;
+		}
+
+		/* ipv4 addr of the socket is invalid.  Only the
+		 * unspecified and mapped address have a v4 equivalent.
+		 */
+		v4addr = LOOPBACK4_IPV6;
+		if (!(addr_type & IPV6_ADDR_MULTICAST))	{
+			err = -EADDRNOTAVAIL;
+			if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
+					   dev, 0)) {
+				goto out_unlock;
+			}
+		}
+	}
+
+	inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+	ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr);
+	if (!(addr_type & IPV6_ADDR_MULTICAST))	/* a multicast address is not used as source */
+		ipv6_addr_copy(&np->saddr, &addr->sin6_addr);
+	err = 0;
+out_unlock:
+	rcu_read_unlock();
+out:
+	release_sock(sk);
+	return err;
+}
+
+static void rawv6_err(struct sock *sk, struct sk_buff *skb,
+	       struct inet6_skb_parm *opt,
+	       u8 type, u8 code, int offset, __be32 info)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int hard, err;
+
+	/*
+	 * An ICMPv6 error is only reported when the user asked for it
+	 * (recverr) or the socket is connected; anything else is ignored.
+	 */
+	if (!(np->recverr || sk->sk_state == TCP_ESTABLISHED))
+		return;
+
+	/* "Packet too big" is hard only when pmtudisc is IPV6_PMTUDISC_DO. */
+	hard = icmpv6_err_convert(type, code, &err);
+	if (type == ICMPV6_PKT_TOOBIG)
+		hard = (np->pmtudisc == IPV6_PMTUDISC_DO);
+
+	if (np->recverr) {
+		/* Payload starts at offset unless the socket is hdrincl. */
+		u8 *payload = inet->hdrincl ? skb->data : skb->data + offset;
+
+		ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
+	}
+
+	if (!np->recverr && !hard)
+		return;
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
+		u8 type, u8 code, int inner_offset, __be32 info)
+{
+	/* Note: the addresses are read at skb->data, which != ipv6_hdr(skb). */
+	const struct ipv6hdr *inner = (const struct ipv6hdr *)skb->data;
+	int slot = nexthdr & (RAW_HTABLE_SIZE - 1);
+	struct net *net;
+	struct sock *sk;
+
+	read_lock(&raw_v6_hashinfo.lock);
+	sk = sk_head(&raw_v6_hashinfo.ht[slot]);
+	if (sk == NULL)
+		goto unlock;
+
+	/* That header's source is looked up as local, its destination as remote. */
+	net = dev_net(skb->dev);
+	for (;;) {
+		sk = __raw_v6_lookup(net, sk, nexthdr, &inner->saddr,
+				     &inner->daddr, IP6CB(skb)->iif);
+		if (sk == NULL)
+			break;
+		rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
+		sk = sk_next(sk);
+	}
+
+unlock:
+	read_unlock(&raw_v6_hashinfo.lock);
+}
+
+/*
+ * Queue @skb on @sk's receive queue.
+ *
+ * When the socket has checksumming enabled or a filter attached, the
+ * checksum is completed first and a bad packet is counted in sk_drops.
+ * Returns 0 when queued, NET_RX_DROP (skb freed) otherwise.
+ */
+static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
+{
+	int must_verify = raw6_sk(sk)->checksum ||
+			  rcu_dereference_raw(sk->sk_filter);
+
+	if (must_verify && skb_checksum_complete(skb)) {
+		atomic_inc(&sk->sk_drops);
+		goto drop;
+	}
+
+	/* Charge it to the socket. */
+	if (ip_queue_rcv_skb(sk, skb) < 0)
+		goto drop;
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ *	This is next to useless...
+ *	if we demultiplex in network layer we don't need the extra call
+ *	just to queue the skb...
+ *	maybe we could have the network decide upon a hint if it
+ *	should call raw_rcv for demultiplexing
+ *
+ *	Receive one packet for raw socket @sk: apply the inbound xfrm
+ *	policy check, prepare the checksum state of @skb and hand it to
+ *	rawv6_rcv_skb() for queueing.
+ *
+ *	Returns NET_RX_DROP when the packet is rejected here (the skb is
+ *	freed and sk_drops incremented), otherwise 0; the result of
+ *	rawv6_rcv_skb() is not propagated.
+ */
+int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct raw6_sock *rp = raw6_sk(sk);
+
+	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		atomic_inc(&sk->sk_drops);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	/* Socket does not checksum: mark the packet as already verified. */
+	if (!rp->checksum)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		/* Take the network header out of the precomputed sum ... */
+		skb_postpull_rcsum(skb, skb_network_header(skb),
+				   skb_network_header_len(skb));
+		/* ... and accept the packet if the pseudo-header sum is 0. */
+		if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+				     &ipv6_hdr(skb)->daddr,
+				     skb->len, inet->inet_num, skb->csum))
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	/* Still unverified: seed skb->csum with the pseudo-header sum so a
+	 * later skb_checksum_complete() only has to add the payload.
+	 */
+	if (!skb_csum_unnecessary(skb))
+		skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+							 &ipv6_hdr(skb)->daddr,
+							 skb->len,
+							 inet->inet_num, 0));
+
+	/* hdrincl sockets get the checksum verified right away. */
+	if (inet->hdrincl) {
+		if (skb_checksum_complete(skb)) {
+			atomic_inc(&sk->sk_drops);
+			kfree_skb(skb);
+			return NET_RX_DROP;
+		}
+	}
+
+	rawv6_rcv_skb(sk, skb);
+	return 0;
+}
+
+
+/*
+ *	This should be easy, if there is something there
+ *	we return it, otherwise we block.
+ *
+ *	Returns the number of bytes copied (or the full datagram length
+ *	when the caller passed MSG_TRUNC), or a negative errno.
+ *	MSG_OOB is rejected with -EOPNOTSUPP.  MSG_ERRQUEUE and pending
+ *	rxpmtu notifications are served by ipv6_recv_error() and
+ *	ipv6_recv_rxpmtu() respectively instead of the receive queue.
+ */
+
+static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
+		  struct msghdr *msg, size_t len,
+		  int noblock, int flags, int *addr_len)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name;
+	struct sk_buff *skb;
+	size_t copied;
+	int err;
+
+	if (flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/* The reported address length is always that of a sockaddr_in6. */
+	if (addr_len)
+		*addr_len=sizeof(*sin6);
+
+	if (flags & MSG_ERRQUEUE)
+		return ipv6_recv_error(sk, msg, len);
+
+	if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+		return ipv6_recv_rxpmtu(sk, msg, len);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	/* Copy at most @len bytes; flag the datagram as truncated if longer. */
+	copied = skb->len;
+	if (copied > len) {
+		copied = len;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	if (skb_csum_unnecessary(skb)) {
+		/* Checksum already settled: plain copy. */
+		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	} else if (msg->msg_flags&MSG_TRUNC) {
+		/* Partial copy: verify the whole datagram first, then copy. */
+		if (__skb_checksum_complete(skb))
+			goto csum_copy_err;
+		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	} else {
+		/* Full copy: verify the checksum while copying. */
+		err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
+		if (err == -EINVAL)
+			goto csum_copy_err;
+	}
+	if (err)
+		goto out_free;
+
+	/* Copy the address. */
+	if (sin6) {
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = 0;
+		ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr);
+		sin6->sin6_flowinfo = 0;
+		sin6->sin6_scope_id = 0;
+		/* Link-local sources are scoped by the receiving interface. */
+		if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+			sin6->sin6_scope_id = IP6CB(skb)->iif;
+	}
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	/* Deliver ancillary data if any receive option is enabled. */
+	if (np->rxopt.all)
+		datagram_recv_ctl(sk, msg, skb);
+
+	err = copied;
+	if (flags & MSG_TRUNC)
+		err = skb->len;
+
+out_free:
+	skb_free_datagram(sk, skb);
+out:
+	return err;
+
+csum_copy_err:
+	/* Bad checksum: the datagram is discarded via skb_kill_datagram(). */
+	skb_kill_datagram(sk, skb, flags);
+
+	/* Error for blocking case is chosen to masquerade
+	   as some normal condition.
+	 */
+	err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+	goto out;
+}
+
+/*
+ * Finish the datagram corked on sk->sk_write_queue and send it.
+ *
+ * When the socket has checksumming enabled (rp->checksum), the 16-bit
+ * checksum is computed over all queued fragments and stored at byte
+ * offset rp->offset of the transport payload before the frames are
+ * pushed with ip6_push_pending_frames().
+ *
+ * Returns 0 on success or a negative errno.  When the checksum offset
+ * does not fit into the queued data the pending frames are flushed and
+ * an error is returned; a user-controlled offset/length combination
+ * must never be able to crash the kernel.
+ */
+static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+				     struct raw6_sock *rp)
+{
+	struct sk_buff *skb;
+	int err = 0;
+	int offset;
+	int len;
+	int total_len;
+	__wsum tmp_csum;
+	__sum16 csum;
+
+	if (!rp->checksum)
+		goto send;
+
+	if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
+		goto out;
+
+	offset = rp->offset;
+	total_len = inet_sk(sk)->cork.base.length - (skb_network_header(skb) -
+						     skb->data);
+	/* The two checksum bytes must lie inside the payload. */
+	if (offset >= total_len - 1) {
+		err = -EINVAL;
+		ip6_flush_pending_frames(sk);
+		goto out;
+	}
+
+	/* should be check HW csum miyazawa */
+	if (skb_queue_len(&sk->sk_write_queue) == 1) {
+		/*
+		 * Only one fragment on the socket.
+		 */
+		tmp_csum = skb->csum;
+	} else {
+		struct sk_buff *csum_skb = NULL;
+		tmp_csum = 0;
+
+		/* Sum every fragment and remember the one that holds the
+		 * checksum field, rebasing offset to that fragment.
+		 */
+		skb_queue_walk(&sk->sk_write_queue, skb) {
+			tmp_csum = csum_add(tmp_csum, skb->csum);
+
+			if (csum_skb)
+				continue;
+
+			len = skb->len - skb_transport_offset(skb);
+			if (offset >= len) {
+				offset -= len;
+				continue;
+			}
+
+			csum_skb = skb;
+		}
+
+		skb = csum_skb;
+	}
+
+	/* No queued fragment covers the checksum offset: refuse to send
+	 * instead of dereferencing a NULL skb below.
+	 */
+	if (skb == NULL) {
+		err = -EINVAL;
+		ip6_flush_pending_frames(sk);
+		goto out;
+	}
+
+	offset += skb_transport_offset(skb);
+	/* The offset can still point outside this skb when the length
+	 * accounting above was confused by socket options; fail the send
+	 * rather than BUG().
+	 */
+	err = skb_copy_bits(skb, offset, &csum, 2);
+	if (err < 0) {
+		ip6_flush_pending_frames(sk);
+		goto out;
+	}
+
+	/* in case cksum was not initialized */
+	if (unlikely(csum))
+		tmp_csum = csum_sub(tmp_csum, csum_unfold(csum));
+
+	csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+			       total_len, fl6->flowi6_proto, tmp_csum);
+
+	/* UDP transmits a computed checksum of zero as CSUM_MANGLED_0. */
+	if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP)
+		csum = CSUM_MANGLED_0;
+
+	/* The same two bytes were just read successfully, so a failure
+	 * here indicates real corruption.
+	 */
+	if (skb_store_bits(skb, offset, &csum, 2))
+		BUG();
+
+send:
+	err = ip6_push_pending_frames(sk);
+out:
+	return err;
+}
+
+/*
+ * Send a datagram whose user buffer already starts with the IPv6 header
+ * (socket has hdrincl set).
+ *
+ * @from is the user iovec, @length the total number of bytes to send.
+ * On the transmit path the route reference in *dstp is handed over to
+ * the skb and *dstp is set to NULL.
+ *
+ * Returns 0 on success (and for MSG_PROBE, which sends nothing),
+ * -EMSGSIZE when @length exceeds the device MTU, -EINVAL when @length
+ * is too short to contain an IPv6 header, -EFAULT when the user buffer
+ * cannot be copied, or the error from allocation/output.  -ENOBUFS is
+ * reported as success unless the socket enabled recverr.
+ */
+static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
+			struct flowi6 *fl6, struct dst_entry **dstp,
+			unsigned int flags)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6hdr *iph;
+	struct sk_buff *skb;
+	int err;
+	struct rt6_info *rt = (struct rt6_info *)*dstp;
+
+	if (length > rt->dst.dev->mtu) {
+		ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
+		return -EMSGSIZE;
+	}
+	/* The buffer is treated as a complete IPv6 header below; anything
+	 * shorter would leave header fields as uninitialised skb memory.
+	 */
+	if (length < sizeof(struct ipv6hdr))
+		return -EINVAL;
+	if (flags&MSG_PROBE)
+		goto out;
+
+	skb = sock_alloc_send_skb(sk,
+				  length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
+				  flags & MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto error;
+	skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
+
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+	/* The skb now owns the route reference. */
+	skb_dst_set(skb, &rt->dst);
+	*dstp = NULL;
+
+	skb_put(skb, length);
+	skb_reset_network_header(skb);
+	iph = ipv6_hdr(skb);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	skb->transport_header = skb->network_header;
+	err = memcpy_fromiovecend((void *)iph, from, 0, length);
+	if (err)
+		goto error_fault;
+
+	IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
+		      rt->dst.dev, dst_output);
+	if (err > 0)
+		err = net_xmit_errno(err);
+	if (err)
+		goto error;
+out:
+	return 0;
+
+error_fault:
+	err = -EFAULT;
+	kfree_skb(skb);
+error:
+	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	if (err == -ENOBUFS && !np->recverr)
+		err = 0;
+	return err;
+}
+
+/*
+ * Peek into the user iovec of @msg to fill protocol-specific flow fields
+ * in @fl6 before the route lookup:
+ *   - IPPROTO_ICMPV6: fl6_icmp_type and fl6_icmp_code are read from the
+ *     first two readable bytes;
+ *   - IPPROTO_MH: fl6_mh_type is read from the byte at index 2, counted
+ *     across iovec entries;
+ *   - any other protocol: nothing is read.
+ *
+ * Returns 0, or -EFAULT when a user byte cannot be read.
+ */
+static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg)
+{
+	struct iovec *iov;
+	u8 __user *type = NULL;
+	u8 __user *code = NULL;
+	u8 len = 0;		/* MH: bytes skipped in earlier iovecs */
+	int probed = 0;
+	int i;
+
+	if (!msg->msg_iov)
+		return 0;
+
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		iov = &msg->msg_iov[i];
+		if (!iov)
+			continue;
+
+		switch (fl6->flowi6_proto) {
+		case IPPROTO_ICMPV6:
+			/* check if one-byte field is readable or not. */
+			if (iov->iov_base && iov->iov_len < 1)
+				break;
+
+			/* type is the first byte seen; code is the byte
+			 * after it, possibly at the start of a later iovec.
+			 */
+			if (!type) {
+				type = iov->iov_base;
+				/* check if code field is readable or not. */
+				if (iov->iov_len > 1)
+					code = type + 1;
+			} else if (!code)
+				code = iov->iov_base;
+
+			if (type && code) {
+				if (get_user(fl6->fl6_icmp_type, type) ||
+				    get_user(fl6->fl6_icmp_code, code))
+					return -EFAULT;
+				probed = 1;
+			}
+			break;
+		case IPPROTO_MH:
+			if (iov->iov_base && iov->iov_len < 1)
+				break;
+			/* check if type field is readable or not. */
+			if (iov->iov_len > 2 - len) {
+				u8 __user *p = iov->iov_base;
+				if (get_user(fl6->fl6_mh_type, &p[2 - len]))
+					return -EFAULT;
+				probed = 1;
+			} else
+				len += iov->iov_len;
+
+			break;
+		default:
+			probed = 1;
+			break;
+		}
+		/* Stop scanning as soon as the fields have been obtained. */
+		if (probed)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * sendmsg() for raw IPv6 sockets.
+ *
+ * Builds the flow description from the destination address (or the
+ * connected peer), the socket state and any ancillary data, resolves the
+ * route, and then either sends the user buffer as a complete packet
+ * (hdrincl) or appends it to the corked queue and pushes it unless
+ * MSG_MORE is set.
+ *
+ * Returns @len on success or a negative errno.
+ */
+static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
+		   struct msghdr *msg, size_t len)
+{
+	struct ipv6_txoptions opt_space;
+	struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
+	struct in6_addr *daddr, *final_p, final;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct raw6_sock *rp = raw6_sk(sk);
+	struct ipv6_txoptions *opt = NULL;
+	struct ip6_flowlabel *flowlabel = NULL;
+	struct dst_entry *dst = NULL;
+	struct flowi6 fl6;
+	int addr_len = msg->msg_namelen;
+	int hlimit = -1;	/* -1: not supplied via ancillary data */
+	int tclass = -1;
+	int dontfrag = -1;
+	u16 proto;
+	int err;
+
+	/* Rough check on arithmetic overflow,
+	   better check is made in ip6_append_data().
+	 */
+	if (len > INT_MAX)
+		return -EMSGSIZE;
+
+	/* Mirror BSD error message compatibility */
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/*
+	 *	Get and verify the address.
+	 */
+	memset(&fl6, 0, sizeof(fl6));
+
+	fl6.flowi6_mark = sk->sk_mark;
+
+	if (sin6) {
+		if (addr_len < SIN6_LEN_RFC2133)
+			return -EINVAL;
+
+		if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
+			return -EAFNOSUPPORT;
+
+		/* port is the proto value [0..255] carried in nexthdr */
+		proto = ntohs(sin6->sin6_port);
+
+		/* A zero port means "use the socket's protocol"; any other
+		 * value must match it.
+		 */
+		if (!proto)
+			proto = inet->inet_num;
+		else if (proto != inet->inet_num)
+			return -EINVAL;
+
+		if (proto > 255)
+			return -EINVAL;
+
+		daddr = &sin6->sin6_addr;
+		if (np->sndflow) {
+			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+				if (flowlabel == NULL)
+					return -EINVAL;
+				/* The flow label dictates the destination. */
+				daddr = &flowlabel->dst;
+			}
+		}
+
+		/*
+		 * Otherwise it will be difficult to maintain
+		 * sk->sk_dst_cache.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED &&
+		    ipv6_addr_equal(daddr, &np->daddr))
+			daddr = &np->daddr;
+
+		/* Honour the scope id only for link-local destinations. */
+		if (addr_len >= sizeof(struct sockaddr_in6) &&
+		    sin6->sin6_scope_id &&
+		    ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
+			fl6.flowi6_oif = sin6->sin6_scope_id;
+	} else {
+		/* No address given: only valid on a connected socket. */
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+
+		proto = inet->inet_num;
+		daddr = &np->daddr;
+		fl6.flowlabel = np->flow_label;
+	}
+
+	if (fl6.flowi6_oif == 0)
+		fl6.flowi6_oif = sk->sk_bound_dev_if;
+
+	/* Parse ancillary data; it may override hop limit, traffic class,
+	 * dontfrag, the flow label and the extension-header options.
+	 */
+	if (msg->msg_controllen) {
+		opt = &opt_space;
+		memset(opt, 0, sizeof(struct ipv6_txoptions));
+		opt->tot_len = sizeof(struct ipv6_txoptions);
+
+		err = datagram_send_ctl(sock_net(sk), msg, &fl6, opt, &hlimit,
+					&tclass, &dontfrag);
+		if (err < 0) {
+			fl6_sock_release(flowlabel);
+			return err;
+		}
+		if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+			if (flowlabel == NULL)
+				return -EINVAL;
+		}
+		/* No extension headers supplied: fall back to socket options. */
+		if (!(opt->opt_nflen|opt->opt_flen))
+			opt = NULL;
+	}
+	if (opt == NULL)
+		opt = np->opt;
+	if (flowlabel)
+		opt = fl6_merge_options(&opt_space, flowlabel, opt);
+	opt = ipv6_fixup_options(&opt_space, opt);
+
+	fl6.flowi6_proto = proto;
+	err = rawv6_probe_proto_opt(&fl6, msg);
+	if (err)
+		goto out;
+
+	if (!ipv6_addr_any(daddr))
+		ipv6_addr_copy(&fl6.daddr, daddr);
+	else
+		fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+	if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+		ipv6_addr_copy(&fl6.saddr, &np->saddr);
+
+	final_p = fl6_update_dst(&fl6, opt, &final);
+
+	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+		fl6.flowi6_oif = np->mcast_oif;
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto out;
+	}
+	/* Defaults for values not supplied via ancillary data. */
+	if (hlimit < 0) {
+		if (ipv6_addr_is_multicast(&fl6.daddr))
+			hlimit = np->mcast_hops;
+		else
+			hlimit = np->hop_limit;
+		if (hlimit < 0)
+			hlimit = ip6_dst_hoplimit(dst);
+	}
+
+	if (tclass < 0)
+		tclass = np->tclass;
+
+	if (dontfrag < 0)
+		dontfrag = np->dontfrag;
+
+	if (msg->msg_flags&MSG_CONFIRM)
+		goto do_confirm;
+
+back_from_confirm:
+	if (inet->hdrincl)
+		err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags);
+	else {
+		lock_sock(sk);
+		err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
+			len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst,
+			msg->msg_flags, dontfrag);
+
+		/* Drop everything queued on failure; otherwise send now
+		 * unless the caller announced more data with MSG_MORE.
+		 */
+		if (err)
+			ip6_flush_pending_frames(sk);
+		else if (!(msg->msg_flags & MSG_MORE))
+			err = rawv6_push_pending_frames(sk, &fl6, rp);
+		release_sock(sk);
+	}
+done:
+	dst_release(dst);
+out:
+	fl6_sock_release(flowlabel);
+	return err<0?err:len;
+do_confirm:
+	dst_confirm(dst);
+	/* A zero-length MSG_PROBE only confirms the route; nothing is sent. */
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto done;
+}
+
+/*
+ * Set the socket's ICMPv6 filter from user space.
+ *
+ * Only ICMPV6_FILTER is understood (-ENOPROTOOPT otherwise).  At most
+ * sizeof(struct icmp6_filter) bytes are copied; a longer @optlen is
+ * clamped.  Returns 0 or -EFAULT.
+ */
+static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
+			       char __user *optval, int optlen)
+{
+	if (optname != ICMPV6_FILTER)
+		return -ENOPROTOOPT;
+
+	if (optlen > sizeof(struct icmp6_filter))
+		optlen = sizeof(struct icmp6_filter);
+
+	if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Copy the socket's ICMPv6 filter to user space.
+ *
+ * Only ICMPV6_FILTER is understood (-ENOPROTOOPT otherwise).  The user
+ * supplied length must not be negative (-EINVAL) and is clamped to
+ * sizeof(struct icmp6_filter); the length actually used is written back
+ * to @optlen.  Returns 0 or -EFAULT.
+ */
+static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
+			       char __user *optval, int __user *optlen)
+{
+	int len;
+
+	if (optname != ICMPV6_FILTER)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+	if (len > sizeof(struct icmp6_filter))
+		len = sizeof(struct icmp6_filter);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &raw6_sk(sk)->filter, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+
+/*
+ * Raw-socket specific setsockopt(): only IPV6_CHECKSUM is handled.
+ *
+ * A negative value disables kernel checksumming; a non-negative even
+ * value enables it and is stored as the checksum offset.  Positive odd
+ * offsets are rejected with -EINVAL.  Returns -EFAULT when the value
+ * cannot be read and -ENOPROTOOPT for any other option.
+ */
+static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, unsigned int optlen)
+{
+	struct raw6_sock *rp = raw6_sk(sk);
+	int val;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	if (optname != IPV6_CHECKSUM)
+		return -ENOPROTOOPT;
+
+	/*
+	 * RFC3542 tells that IPV6_CHECKSUM socket option in the
+	 * IPPROTO_IPV6 level is not allowed on ICMPv6 sockets.
+	 * If you want to set it, use IPPROTO_RAW level IPV6_CHECKSUM
+	 * socket option (Linux extension).
+	 */
+	if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6)
+		return -EINVAL;
+
+	/* You may get strange result with a positive odd offset;
+	   RFC2292bis agrees with me. */
+	if (val > 0 && (val & 1))
+		return -EINVAL;
+
+	if (val < 0) {
+		rp->checksum = 0;
+		return 0;
+	}
+
+	rp->checksum = 1;
+	rp->offset = val;
+	return 0;
+}
+
+/*
+ * setsockopt() dispatcher for raw IPv6 sockets:
+ *   SOL_ICMPV6            -> ICMPv6 filter (ICMPv6 sockets only,
+ *                            -EOPNOTSUPP otherwise)
+ *   SOL_RAW, or SOL_IPV6
+ *   with IPV6_CHECKSUM    -> do_rawv6_setsockopt()
+ *   everything else       -> ipv6_setsockopt()
+ */
+static int rawv6_setsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_ICMPV6) {
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+			return -EOPNOTSUPP;
+		return rawv6_seticmpfilter(sk, level, optname, optval,
+					   optlen);
+	}
+
+	if (level == SOL_RAW ||
+	    (level == SOL_IPV6 && optname == IPV6_CHECKSUM))
+		return do_rawv6_setsockopt(sk, level, optname, optval,
+					   optlen);
+
+	return ipv6_setsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of rawv6_setsockopt(): identical dispatch, except that
+ * options not handled here go to compat_ipv6_setsockopt().
+ */
+static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
+				   char __user *optval, unsigned int optlen)
+{
+	if (level == SOL_ICMPV6) {
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+			return -EOPNOTSUPP;
+		return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
+	}
+
+	if (level == SOL_RAW ||
+	    (level == SOL_IPV6 && optname == IPV6_CHECKSUM))
+		return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
+
+	return compat_ipv6_setsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+/*
+ * Raw-socket specific getsockopt(): only IPV6_CHECKSUM is handled.
+ *
+ * Reports -1 when kernel checksumming is disabled, otherwise the
+ * configured checksum offset.  At most sizeof(int) bytes are written and
+ * the length used is stored back into @optlen.  Returns 0, -EFAULT, or
+ * -ENOPROTOOPT for any other option.
+ */
+static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	struct raw6_sock *rp = raw6_sk(sk);
+	int val, len;
+
+	if (get_user(len,optlen))
+		return -EFAULT;
+
+	if (optname != IPV6_CHECKSUM)
+		return -ENOPROTOOPT;
+
+	/*
+	 * We allow getsockopt() for IPPROTO_IPV6-level
+	 * IPV6_CHECKSUM socket option on ICMPv6 sockets
+	 * since RFC3542 is silent about it.
+	 */
+	val = (rp->checksum == 0) ? -1 : rp->offset;
+
+	len = min_t(unsigned int, sizeof(int), len);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval,&val,len))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * getsockopt() dispatcher for raw IPv6 sockets:
+ *   SOL_ICMPV6            -> ICMPv6 filter (ICMPv6 sockets only,
+ *                            -EOPNOTSUPP otherwise)
+ *   SOL_RAW, or SOL_IPV6
+ *   with IPV6_CHECKSUM    -> do_rawv6_getsockopt()
+ *   everything else       -> ipv6_getsockopt()
+ */
+static int rawv6_getsockopt(struct sock *sk, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	if (level == SOL_ICMPV6) {
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+			return -EOPNOTSUPP;
+		return rawv6_geticmpfilter(sk, level, optname, optval,
+					   optlen);
+	}
+
+	if (level == SOL_RAW ||
+	    (level == SOL_IPV6 && optname == IPV6_CHECKSUM))
+		return do_rawv6_getsockopt(sk, level, optname, optval,
+					   optlen);
+
+	return ipv6_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of rawv6_getsockopt(): identical dispatch, except that
+ * options not handled here go to compat_ipv6_getsockopt().
+ */
+static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
+				   char __user *optval, int __user *optlen)
+{
+	if (level == SOL_ICMPV6) {
+		if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+			return -EOPNOTSUPP;
+		return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
+	}
+
+	if (level == SOL_RAW ||
+	    (level == SOL_IPV6 && optname == IPV6_CHECKSUM))
+		return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
+
+	return compat_ipv6_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+/*
+ * ioctl() for raw IPv6 sockets.
+ *
+ * SIOCOUTQ reports the value of sk_wmem_alloc_get(); SIOCINQ reports the
+ * distance from the transport header to the tail of the first queued
+ * skb (0 when the receive queue is empty).  Every other command goes to
+ * ip6mr_ioctl() when CONFIG_IPV6_MROUTE is enabled and yields
+ * -ENOIOCTLCMD otherwise.
+ */
+static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	if (cmd == SIOCOUTQ) {
+		int amount = sk_wmem_alloc_get(sk);
+
+		return put_user(amount, (int __user *)arg);
+	}
+
+	if (cmd == SIOCINQ) {
+		struct sk_buff *skb;
+		int amount = 0;
+
+		/* Peek under the queue lock so the skb cannot go away. */
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb != NULL)
+			amount = skb->tail - skb->transport_header;
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		return put_user(amount, (int __user *)arg);
+	}
+
+#ifdef CONFIG_IPV6_MROUTE
+	return ip6mr_ioctl(sk, cmd, (void __user *)arg);
+#else
+	return -ENOIOCTLCMD;
+#endif
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl(): SIOCOUTQ and SIOCINQ return -ENOIOCTLCMD here; other
+ * commands go to ip6mr_compat_ioctl() when CONFIG_IPV6_MROUTE is enabled
+ * and yield -ENOIOCTLCMD otherwise.
+ */
+static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == SIOCOUTQ || cmd == SIOCINQ)
+		return -ENOIOCTLCMD;
+
+#ifdef CONFIG_IPV6_MROUTE
+	return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg));
+#else
+	return -ENOIOCTLCMD;
+#endif
+}
+#endif
+
+/*
+ * close() handler: for IPPROTO_RAW sockets call ip6_ra_control(sk, -1)
+ * (presumably dropping a router-alert registration -- confirm against
+ * ip6_ra_control()), then run ip6mr_sk_done() and release the socket via
+ * sk_common_release().  @timeout is not used.
+ */
+static void rawv6_close(struct sock *sk, long timeout)
+{
+	if (inet_sk(sk)->inet_num == IPPROTO_RAW)
+		ip6_ra_control(sk, -1);
+	ip6mr_sk_done(sk);
+	sk_common_release(sk);
+}
+
+/*
+ * destroy handler: discard any frames still corked on the socket (under
+ * the socket lock) and then run the generic inet6_destroy_sock().
+ */
+static void raw6_destroy(struct sock *sk)
+{
+	lock_sock(sk);
+	ip6_flush_pending_frames(sk);
+	release_sock(sk);
+
+	inet6_destroy_sock(sk);
+}
+
+/*
+ * Per-socket init: ICMPv6 and Mobility Header sockets get kernel
+ * checksumming enabled by default, with the checksum stored at payload
+ * offset 2 and 4 respectively.  Other protocols keep the defaults.
+ * Always returns 0.
+ */
+static int rawv6_init_sk(struct sock *sk)
+{
+	struct raw6_sock *rp = raw6_sk(sk);
+	int proto = inet_sk(sk)->inet_num;
+
+	if (proto == IPPROTO_ICMPV6) {
+		rp->checksum = 1;
+		rp->offset   = 2;
+	} else if (proto == IPPROTO_MH) {
+		rp->checksum = 1;
+		rp->offset   = 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Protocol operations for raw IPv6 sockets.  Connect and disconnect use
+ * the shared datagram helpers; everything else is implemented in this
+ * file or by the raw_* hash helpers.
+ */
+struct proto rawv6_prot = {
+	.name		   = "RAWv6",
+	.owner		   = THIS_MODULE,
+	.close		   = rawv6_close,
+	.destroy	   = raw6_destroy,
+	.connect	   = ip6_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = rawv6_ioctl,
+	.init		   = rawv6_init_sk,
+	.setsockopt	   = rawv6_setsockopt,
+	.getsockopt	   = rawv6_getsockopt,
+	.sendmsg	   = rawv6_sendmsg,
+	.recvmsg	   = rawv6_recvmsg,
+	.bind		   = rawv6_bind,
+	.backlog_rcv	   = rawv6_rcv_skb,
+	.hash		   = raw_hash_sk,
+	.unhash		   = raw_unhash_sk,
+	.obj_size	   = sizeof(struct raw6_sock),
+	.h.raw_hash	   = &raw_v6_hashinfo,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_rawv6_setsockopt,
+	.compat_getsockopt = compat_rawv6_getsockopt,
+	.compat_ioctl	   = compat_rawv6_ioctl,
+#endif
+};
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Print one socket as a line of the "raw6" proc file.  @i is printed as
+ * the slot number.  The local "port" column carries the socket's
+ * protocol number (inet_num); the remote port and the timer-related
+ * columns are printed as constant zeros.
+ */
+static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
+{
+	struct ipv6_pinfo *np = inet6_sk(sp);
+	const struct in6_addr *dest, *src;
+	__u16 destp, srcp;
+
+	dest  = &np->daddr;
+	src   = &np->rcv_saddr;
+	destp = 0;
+	srcp  = inet_sk(sp)->inet_num;
+	/* Argument order must match the column header in raw6_seq_show(). */
+	seq_printf(seq,
+		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
+		   i,
+		   src->s6_addr32[0], src->s6_addr32[1],
+		   src->s6_addr32[2], src->s6_addr32[3], srcp,
+		   dest->s6_addr32[0], dest->s6_addr32[1],
+		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
+		   sp->sk_state,
+		   sk_wmem_alloc_get(sp),
+		   sk_rmem_alloc_get(sp),
+		   0, 0L, 0,
+		   sock_i_uid(sp), 0,
+		   sock_i_ino(sp),
+		   atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+}
+
+/*
+ * seq_file show callback: print the column header for the start token,
+ * otherwise print the socket @v together with the current hash bucket.
+ * Always returns 0.
+ */
+static int raw6_seq_show(struct seq_file *seq, void *v)
+{
+	if (v != SEQ_START_TOKEN) {
+		raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket);
+		return 0;
+	}
+
+	seq_printf(seq,
+		   "  sl  "
+		   "local_address                         "
+		   "remote_address                        "
+		   "st tx_queue rx_queue tr tm->when retrnsmt"
+		   "   uid  timeout inode ref pointer drops\n");
+	return 0;
+}
+
+/* Iteration uses the shared raw_seq_* helpers; only ->show is v6-specific. */
+static const struct seq_operations raw6_seq_ops = {
+	.start =	raw_seq_start,
+	.next =		raw_seq_next,
+	.stop =		raw_seq_stop,
+	.show =		raw6_seq_show,
+};
+
+/* Open the "raw6" proc file over the IPv6 raw socket hash table. */
+static int raw6_seq_open(struct inode *inode, struct file *file)
+{
+	return raw_seq_open(inode, file, &raw_v6_hashinfo, &raw6_seq_ops);
+}
+
+/* File operations of the "raw6" proc entry (seq_file based). */
+static const struct file_operations raw6_seq_fops = {
+	.owner =	THIS_MODULE,
+	.open =		raw6_seq_open,
+	.read =		seq_read,
+	.llseek =	seq_lseek,
+	.release =	seq_release_net,
+};
+
+/*
+ * Per-namespace init: create the world-readable "raw6" proc entry.
+ * Returns 0, or -ENOMEM when the entry cannot be created.
+ */
+static int __net_init raw6_init_net(struct net *net)
+{
+	if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Per-namespace exit: remove the "raw6" proc entry again. */
+static void __net_exit raw6_exit_net(struct net *net)
+{
+	proc_net_remove(net, "raw6");
+}
+
+/* Hooks that create/remove the "raw6" proc entry for each namespace. */
+static struct pernet_operations raw6_net_ops = {
+	.init = raw6_init_net,
+	.exit = raw6_exit_net,
+};
+
+/* Register the per-namespace proc hooks; returns the registration result. */
+int __init raw6_proc_init(void)
+{
+	return register_pernet_subsys(&raw6_net_ops);
+}
+
+/* Undo raw6_proc_init(). */
+void raw6_proc_exit(void)
+{
+	unregister_pernet_subsys(&raw6_net_ops);
+}
+#endif	/* CONFIG_PROC_FS */
+
+/*
+ * Socket-layer operations for SOCK_RAW on PF_INET6.
+ * Same as inet6_dgram_ops, sans udp_poll: polling uses the generic
+ * datagram_poll instead.
+ */
+static const struct proto_ops inet6_sockraw_ops = {
+	.family		   = PF_INET6,
+	.owner		   = THIS_MODULE,
+	.release	   = inet6_release,
+	.bind		   = inet6_bind,
+	.connect	   = inet_dgram_connect,	/* ok		*/
+	.socketpair	   = sock_no_socketpair,	/* a do nothing	*/
+	.accept		   = sock_no_accept,		/* a do nothing	*/
+	.getname	   = inet6_getname,
+	.poll		   = datagram_poll,		/* ok		*/
+	.ioctl		   = inet6_ioctl,		/* must change  */
+	.listen		   = sock_no_listen,		/* ok		*/
+	.shutdown	   = inet_shutdown,		/* ok		*/
+	.setsockopt	   = sock_common_setsockopt,	/* ok		*/
+	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
+	.sendmsg	   = inet_sendmsg,		/* ok		*/
+	.recvmsg	   = sock_common_recvmsg,	/* ok		*/
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/*
+ * protosw entry tying SOCK_RAW to rawv6_prot / inet6_sockraw_ops.  The
+ * protocol is a wild card, so one entry serves every protocol number.
+ */
+static struct inet_protosw rawv6_protosw = {
+	.type		= SOCK_RAW,
+	.protocol	= IPPROTO_IP,	/* wild card */
+	.prot		= &rawv6_prot,
+	.ops		= &inet6_sockraw_ops,
+	.no_check	= UDP_CSUM_DEFAULT,
+	.flags		= INET_PROTOSW_REUSE,
+};
+
+/*
+ * Register the raw socket protosw entry with the IPv6 stack.
+ * Returns the result of inet6_register_protosw().
+ */
+int __init rawv6_init(void)
+{
+	return inet6_register_protosw(&rawv6_protosw);
+}
+
+/* Undo rawv6_init(). */
+void rawv6_exit(void)
+{
+	inet6_unregister_protosw(&rawv6_protosw);
+}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
new file mode 100644
index 00000000..7b954e25
--- /dev/null
+++ b/net/ipv6/reassembly.c
@@ -0,0 +1,769 @@
+/*
+ *	IPv6 fragment reassembly
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on: net/ipv4/ip_fragment.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *	Fixes:
+ *	Andi Kleen	Make it work with multiple hosts.
+ *			More RFC compliance.
+ *
+ *      Horst von Brand Add missing #include <linux/string.h>
+ *	Alexey Kuznetsov	SMP races, threading, cleanup.
+ *	Patrick McHardy		LRU queue of frag heads for evictor.
+ *	Mitsuru KANDA @USAGI	Register inet6_protocol{}.
+ *	David Stevens and
+ *	YOSHIFUJI,H. @USAGI	Always remove fragment header to
+ *				calculate ICV correctly.
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/inet_frag.h>
+
+/*
+ * Per-skb reassembly state kept in skb->cb: the regular IPv6 control
+ * block followed by the fragment's offset within the datagram.
+ */
+struct ip6frag_skb_cb
+{
+	struct inet6_skb_parm	h;
+	int			offset;
+};
+
+/* Access skb->cb as a struct ip6frag_skb_cb. */
+#define FRAG6_CB(skb)	((struct ip6frag_skb_cb*)((skb)->cb))
+
+
+/*
+ *	Equivalent of ipv4 struct ipq
+ *
+ *	One reassembly queue.  id, user, saddr and daddr form the lookup
+ *	key (see ip6_frag_match()); q is the generic inet_frag state.
+ */
+
+struct frag_queue
+{
+	struct inet_frag_queue	q;
+
+	__be32			id;		/* fragment id		*/
+	u32			user;		/* defrag user, part of the key */
+	struct in6_addr		saddr;
+	struct in6_addr		daddr;
+
+	int			iif;		/* ifindex, looked up on expiry */
+	unsigned int		csum;
+	__u16			nhoffset;
+};
+
+/* Shared inet_frags state (lock, hash seed, callbacks) for IPv6 reassembly. */
+static struct inet_frags ip6_frags;
+
+/* Return the reassembly queue counter (frags.nqueues) of @net. */
+int ip6_frag_nqueues(struct net *net)
+{
+	return net->ipv6.frags.nqueues;
+}
+
+/* Return the reassembly memory counter (frags.mem) of @net. */
+int ip6_frag_mem(struct net *net)
+{
+	return atomic_read(&net->ipv6.frags.mem);
+}
+
+/* Forward declaration; the definition appears later in this file. */
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+			  struct net_device *dev);
+
+/*
+ * Hash a fragment key (id, source and destination address) into a
+ * bucket index in [0, INETFRAGS_HASHSZ), seeded with @rnd.
+ *
+ * callers should be careful not to use the hash value outside the ipfrag_lock
+ * as doing so could race with ipfrag_hash_rnd being recalculated.
+ */
+unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
+			     const struct in6_addr *daddr, u32 rnd)
+{
+	u32 hash = rnd;
+
+	/* Nine 32-bit words are mixed three at a time, chaining the result. */
+	hash = jhash_3words((__force u32)saddr->s6_addr32[0],
+			    (__force u32)saddr->s6_addr32[1],
+			    (__force u32)saddr->s6_addr32[2],
+			    hash);
+
+	hash = jhash_3words((__force u32)saddr->s6_addr32[3],
+			    (__force u32)daddr->s6_addr32[0],
+			    (__force u32)daddr->s6_addr32[1],
+			    hash);
+
+	hash = jhash_3words((__force u32)daddr->s6_addr32[2],
+			    (__force u32)daddr->s6_addr32[3],
+			    (__force u32)id,
+			    hash);
+
+	return hash & (INETFRAGS_HASHSZ - 1);
+}
+EXPORT_SYMBOL_GPL(inet6_hash_frag);
+
+/* Hash callback: bucket index of an existing queue, from its stored key. */
+static unsigned int ip6_hashfn(struct inet_frag_queue *q)
+{
+	struct frag_queue *fq = container_of(q, struct frag_queue, q);
+
+	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr, ip6_frags.rnd);
+}
+
+/*
+ * Match callback: returns non-zero when queue @q has the same id, user,
+ * source and destination address as the struct ip6_create_arg in @a.
+ */
+int ip6_frag_match(struct inet_frag_queue *q, void *a)
+{
+	struct frag_queue *fq = container_of(q, struct frag_queue, q);
+	struct ip6_create_arg *arg = a;
+
+	/* Cheap scalar comparisons first, then the two addresses. */
+	if (fq->id != arg->id || fq->user != arg->user)
+		return 0;
+
+	return ipv6_addr_equal(&fq->saddr, arg->src) &&
+	       ipv6_addr_equal(&fq->daddr, arg->dst);
+}
+EXPORT_SYMBOL(ip6_frag_match);
+
+/*
+ * Init callback: copy the lookup key (id, user, source and destination
+ * address) from the struct ip6_create_arg in @a into the new queue @q.
+ */
+void ip6_frag_init(struct inet_frag_queue *q, void *a)
+{
+	struct frag_queue *fq = container_of(q, struct frag_queue, q);
+	struct ip6_create_arg *arg = a;
+
+	fq->id = arg->id;
+	fq->user = arg->user;
+	ipv6_addr_copy(&fq->saddr, arg->src);
+	ipv6_addr_copy(&fq->daddr, arg->dst);
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+/* Destruction primitives. */
+
+/* Drop one reference on @fq via inet_frag_put(). */
+static __inline__ void fq_put(struct frag_queue *fq)
+{
+	inet_frag_put(&fq->q, &ip6_frags);
+}
+
+/* Kill fq entry. It is not destroyed immediately,
+ * because the caller (and possibly others) still hold references;
+ * the work is done by inet_frag_kill().
+ */
+static __inline__ void fq_kill(struct frag_queue *fq)
+{
+	inet_frag_kill(&fq->q, &ip6_frags);
+}
+
+/*
+ * Run the generic fragment evictor for @net and add the number it
+ * returns to the REASMFAILS counter of @idev.
+ */
+static void ip6_evictor(struct net *net, struct inet6_dev *idev)
+{
+	int evicted;
+
+	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags);
+	if (evicted)
+		IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS, evicted);
+}
+
+/*
+ * Expiry handler for a reassembly queue; @data is the address of the
+ * queue's struct inet_frag_queue (presumably invoked from the queue's
+ * timer -- confirm where this callback is registered).
+ *
+ * Unless the queue already completed, it is killed, the REASMTIMEOUT and
+ * REASMFAILS counters are bumped, and an ICMPv6 "time exceeded
+ * (fragment reassembly)" error is sent if the first fragment arrived.
+ * One reference on the queue is dropped before returning.
+ */
+static void ip6_frag_expire(unsigned long data)
+{
+	struct frag_queue *fq;
+	struct net_device *dev = NULL;
+	struct net *net;
+
+	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+
+	spin_lock(&fq->q.lock);
+
+	if (fq->q.last_in & INET_FRAG_COMPLETE)
+		goto out;
+
+	fq_kill(fq);
+
+	net = container_of(fq->q.net, struct net, ipv6.frags);
+	/* The device is looked up by index under RCU; it may be gone. */
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, fq->iif);
+	if (!dev)
+		goto out_rcu_unlock;
+
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+
+	/* Don't send error if the first segment did not arrive. */
+	if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments)
+		goto out_rcu_unlock;
+
+	/*
+	   But use as source device on which LAST ARRIVED
+	   segment was received. And do not use fq->dev
+	   pointer directly, the device might already have disappeared.
+	 */
+	fq->q.fragments->dev = dev;
+	icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
+out_rcu_unlock:
+	rcu_read_unlock();
+out:
+	spin_unlock(&fq->q.lock);
+	fq_put(fq);
+}
+
+/*
+ * Look up the reassembly queue for (@id, @src, @dst) with user
+ * IP6_DEFRAG_LOCAL_DELIVER through inet_frag_find().  Returns the queue,
+ * or NULL when inet_frag_find() returns none.
+ *
+ * NOTE(review): ip6_frags.lock is read-locked here and not released in
+ * this function; presumably inet_frag_find() drops it before returning
+ * -- confirm against its definition.
+ */
+static __inline__ struct frag_queue *
+fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6_addr *dst)
+{
+	struct inet_frag_queue *q;
+	struct ip6_create_arg arg;
+	unsigned int hash;
+
+	arg.id = id;
+	arg.user = IP6_DEFRAG_LOCAL_DELIVER;
+	arg.src = src;
+	arg.dst = dst;
+
+	/* The hash depends on ip6_frags.rnd, so compute it under the lock. */
+	read_lock(&ip6_frags.lock);
+	hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd);
+
+	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
+	if (q == NULL)
+		return NULL;
+
+	return container_of(q, struct frag_queue, q);
+}
+
+static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
+			   struct frag_hdr *fhdr, int nhoff)
+{	/* Queue fragment skb on fq; returns ip6_frag_reasm()'s result once all data is in, else -1. */
+	struct sk_buff *prev, *next;
+	struct net_device *dev;
+	int offset, end;
+	struct net *net = dev_net(skb_dst(skb)->dev);
+
+	if (fq->q.last_in & INET_FRAG_COMPLETE)
+		goto err;
+
+	offset = ntohs(fhdr->frag_off) & ~0x7;	/* drop the low three (non-offset) bits */
+	end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
+			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
+
+	if ((unsigned int)end > IPV6_MAXPLEN) {
+		IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+				 IPSTATS_MIB_INHDRERRORS);
+		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+				  ((u8 *)&fhdr->frag_off -
+				   skb_network_header(skb)));
+		return -1;
+	}
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		const unsigned char *nh = skb_network_header(skb);
+		skb->csum = csum_sub(skb->csum,	/* remove the headers up to the end of the fragment header */
+				     csum_partial(nh, (u8 *)(fhdr + 1) - nh,
+						  0));
+	}
+
+	/* Is this the final fragment? */
+	if (!(fhdr->frag_off & htons(IP6_MF))) {
+		/* If we already have some bits beyond end
+		 * or have different end, the segment is corrupted.
+		 */
+		if (end < fq->q.len ||
+		    ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len))
+			goto err;
+		fq->q.last_in |= INET_FRAG_LAST_IN;
+		fq->q.len = end;
+	} else {
+		/* Check if the fragment is rounded to 8 bytes.
+		 * Required by the RFC.
+		 */
+		if (end & 0x7) {
+			/* RFC2460 says always send parameter problem in
+			 * this case. -DaveM
+			 */
+			IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
+					 IPSTATS_MIB_INHDRERRORS);
+			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+					  offsetof(struct ipv6hdr, payload_len));
+			return -1;
+		}
+		if (end > fq->q.len) {
+			/* Some bits beyond end -> corruption. */
+			if (fq->q.last_in & INET_FRAG_LAST_IN)
+				goto err;
+			fq->q.len = end;
+		}
+	}
+
+	if (end == offset)	/* fragment carries no data */
+		goto err;
+
+	/* Point into the IP datagram 'data' part. */
+	if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
+		goto err;
+
+	if (pskb_trim_rcsum(skb, end - offset))	/* keep exactly this fragment's payload */
+		goto err;
+
+	/* Locate the neighbours (prev/next) of this fragment in the
+	 * offset-ordered chain.  Fast path: it sorts after the current
+	 * tail; otherwise walk the list from the head.
+	 */
+	prev = fq->q.fragments_tail;
+	if (!prev || FRAG6_CB(prev)->offset < offset) {
+		next = NULL;
+		goto found;
+	}
+	prev = NULL;
+	for(next = fq->q.fragments; next != NULL; next = next->next) {
+		if (FRAG6_CB(next)->offset >= offset)
+			break;	/* bingo! */
+		prev = next;
+	}
+
+found:
+	/* RFC5722, Section 4:
+	 *                                  When reassembling an IPv6 datagram, if
+	 *   one or more its constituent fragments is determined to be an
+	 *   overlapping fragment, the entire datagram (and any constituent
+	 *   fragments, including those not yet received) MUST be silently
+	 *   discarded.
+	 */
+
+	/* Check for overlap with preceding fragment. */
+	if (prev &&
+	    (FRAG6_CB(prev)->offset + prev->len) > offset)
+		goto discard_fq;
+
+	/* Look for overlap with succeeding segment. */
+	if (next && FRAG6_CB(next)->offset < end)
+		goto discard_fq;
+
+	FRAG6_CB(skb)->offset = offset;
+
+	/* Insert this fragment in the chain of fragments. */
+	skb->next = next;
+	if (!next)
+		fq->q.fragments_tail = skb;
+	if (prev)
+		prev->next = skb;
+	else
+		fq->q.fragments = skb;
+
+	dev = skb->dev;
+	if (dev) {
+		fq->iif = dev->ifindex;	/* remember the ifindex, not the device pointer */
+		skb->dev = NULL;
+	}
+	fq->q.stamp = skb->tstamp;
+	fq->q.meat += skb->len;		/* payload bytes collected so far */
+	atomic_add(skb->truesize, &fq->q.net->mem);
+
+	/* The first fragment.
+	 * nhoffset is obtained from the first fragment, of course.
+	 */
+	if (offset == 0) {
+		fq->nhoffset = nhoff;
+		fq->q.last_in |= INET_FRAG_FIRST_IN;
+	}
+
+	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    fq->q.meat == fq->q.len)	/* first and last seen, no gaps left */
+		return ip6_frag_reasm(fq, prev, dev);
+
+	write_lock(&ip6_frags.lock);
+	list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list);
+	write_unlock(&ip6_frags.lock);
+	return -1;	/* fragment queued, datagram still incomplete */
+
+discard_fq:
+	fq_kill(fq);
+err:
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		      IPSTATS_MIB_REASMFAILS);
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ *	Reassemble the queued fragments into a single skb (the list head).
+ *	Returns 1 on success and -1 on failure (out of memory or
+ *	oversized payload).
+ *
+ *	It is called with locked fq, and caller must check that
+ *	queue is eligible for reassembly i.e. it is not COMPLETE,
+ *	the last and the first frames arrived and all the bits are here.
+ */
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
+			  struct net_device *dev)
+{
+	struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
+	struct sk_buff *fp, *head = fq->q.fragments;
+	int    payload_len;
+	unsigned int nhoff;
+
+	fq_kill(fq);
+
+	/* Make the one we just received the head. */
+	if (prev) {
+		head = prev->next;	/* the skb that was just queued */
+		fp = skb_clone(head, GFP_ATOMIC);	/* clone takes its place in the list */
+
+		if (!fp)
+			goto out_oom;
+
+		fp->next = head->next;
+		if (!fp->next)
+			fq->q.fragments_tail = fp;
+		prev->next = fp;
+
+		skb_morph(head, fq->q.fragments);	/* head takes over the old first fragment */
+		head->next = fq->q.fragments->next;
+
+		kfree_skb(fq->q.fragments);
+		fq->q.fragments = head;
+	}
+
+	WARN_ON(head == NULL);
+	WARN_ON(FRAG6_CB(head)->offset != 0);
+
+	/* Unfragmented part is taken from the first segment. */
+	payload_len = ((head->data - skb_network_header(head)) -
+		       sizeof(struct ipv6hdr) + fq->q.len -
+		       sizeof(struct frag_hdr));
+	if (payload_len > IPV6_MAXPLEN)
+		goto out_oversize;
+
+	/* Head of list must not be cloned. */
+	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+		goto out_oom;
+
+	/* If the first fragment is fragmented itself, we split
+	 * it to two chunks: the first with data and paged part
+	 * and the second, holding only fragments. */
+	if (skb_has_frag_list(head)) {
+		struct sk_buff *clone;
+		int i, plen = 0;
+
+		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+			goto out_oom;
+		clone->next = head->next;
+		head->next = clone;
+		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+		skb_frag_list_init(head);
+		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+			plen += skb_shinfo(head)->frags[i].size;	/* bytes held in head's page frags */
+		clone->len = clone->data_len = head->data_len - plen;
+		head->data_len -= clone->len;
+		head->len -= clone->len;
+		clone->csum = 0;
+		clone->ip_summed = head->ip_summed;
+		atomic_add(clone->truesize, &fq->q.net->mem);
+	}
+
+	/* We have to remove fragment header from datagram and to relocate
+	 * header in order to calculate ICV correctly. */
+	nhoff = fq->nhoffset;
+	skb_network_header(head)[nhoff] = skb_transport_header(head)[0];	/* first byte of the fragment header overwrites the byte at nhoff */
+	memmove(head->head + sizeof(struct frag_hdr), head->head,
+		(head->data - head->head) - sizeof(struct frag_hdr));
+	head->mac_header += sizeof(struct frag_hdr);
+	head->network_header += sizeof(struct frag_hdr);
+
+	skb_shinfo(head)->frag_list = head->next;	/* remaining fragments hang off the head */
+	skb_reset_transport_header(head);
+	skb_push(head, head->data - skb_network_header(head));
+
+	for (fp=head->next; fp; fp = fp->next) {	/* fold lengths, checksum and truesize into head */
+		head->data_len += fp->len;
+		head->len += fp->len;
+		if (head->ip_summed != fp->ip_summed)
+			head->ip_summed = CHECKSUM_NONE;
+		else if (head->ip_summed == CHECKSUM_COMPLETE)
+			head->csum = csum_add(head->csum, fp->csum);
+		head->truesize += fp->truesize;
+	}
+	atomic_sub(head->truesize, &fq->q.net->mem);
+
+	head->next = NULL;
+	head->dev = dev;
+	head->tstamp = fq->q.stamp;
+	ipv6_hdr(head)->payload_len = htons(payload_len);
+	IP6CB(head)->nhoff = nhoff;
+
+	/* Yes, and fold redundant checksum back. 8) */
+	if (head->ip_summed == CHECKSUM_COMPLETE)
+		head->csum = csum_partial(skb_network_header(head),
+					  skb_network_header_len(head),
+					  head->csum);
+
+	rcu_read_lock();
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
+	rcu_read_unlock();
+	fq->q.fragments = NULL;		/* the queue no longer references the skbs */
+	fq->q.fragments_tail = NULL;
+	return 1;
+
+out_oversize:
+	if (net_ratelimit())
+		printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len);
+	goto out_fail;
+out_oom:
+	if (net_ratelimit())
+		printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
+out_fail:
+	rcu_read_lock();
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+	rcu_read_unlock();
+	return -1;
+}
+
+static int ipv6_frag_rcv(struct sk_buff *skb)
+{	/* Handler registered for IPPROTO_FRAGMENT; returns 1 when skb can be processed further, else -1. */
+	struct frag_hdr *fhdr;
+	struct frag_queue *fq;
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct net *net = dev_net(skb_dst(skb)->dev);
+
+	IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
+
+	/* Jumbo payload inhibits frag. header */
+	if (hdr->payload_len==0)
+		goto fail_hdr;
+
+	if (!pskb_may_pull(skb, (skb_transport_offset(skb) +
+				 sizeof(struct frag_hdr))))
+		goto fail_hdr;
+
+	hdr = ipv6_hdr(skb);	/* re-read after pskb_may_pull(); presumably the header may have moved */
+	fhdr = (struct frag_hdr *)skb_transport_header(skb);
+
+	if (!(fhdr->frag_off & htons(0xFFF9))) {
+		/* It is not a fragmented frame: just step over the fragment header */
+		skb->transport_header += sizeof(struct frag_hdr);
+		IP6_INC_STATS_BH(net,
+				 ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS);
+
+		IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
+		return 1;
+	}
+
+	if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh)
+		ip6_evictor(net, ip6_dst_idev(skb_dst(skb)));	/* over the memory limit: evict first */
+
+	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr);
+	if (fq != NULL) {
+		int ret;
+
+		spin_lock(&fq->q.lock);		/* ip6_frag_queue() runs with the queue locked */
+
+		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+
+		spin_unlock(&fq->q.lock);
+		fq_put(fq);
+		return ret;
+	}
+
+	IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);	/* no queue available */
+	kfree_skb(skb);
+	return -1;
+
+fail_hdr:
+	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
+	return -1;
+}
+
+static const struct inet6_protocol frag_protocol =
+{
+	.handler	=	ipv6_frag_rcv,	/* registered for IPPROTO_FRAGMENT in ipv6_frag_init() */
+	.flags		=	INET6_PROTO_NOPOLICY,
+};
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table ip6_frags_ns_ctl_table[] = {	/* per-netns knobs; init_net uses this table as is */
+	{	/* [0]: entry order must match the table[0..2] fixups in ip6_frags_ns_sysctl_register() */
+		.procname	= "ip6frag_high_thresh",
+		.data		= &init_net.ipv6.frags.high_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [1] */
+		.procname	= "ip6frag_low_thresh",
+		.data		= &init_net.ipv6.frags.low_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{	/* [2] */
+		.procname	= "ip6frag_time",
+		.data		= &init_net.ipv6.frags.timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }	/* terminator */
+};
+
+static struct ctl_table ip6_frags_ctl_table[] = {	/* knob backed by the global ip6_frags, not per netns */
+	{
+		.procname	= "ip6frag_secret_interval",
+		.data		= &ip6_frags.secret_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }	/* terminator */
+};
+
+static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
+{	/* Register the per-netns fragment sysctls for net; 0 on success, -ENOMEM on failure. */
+	struct ctl_table *table;
+	struct ctl_table_header *hdr;
+
+	table = ip6_frags_ns_ctl_table;
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL);	/* private copy for this netns */
+		if (table == NULL)
+			goto err_alloc;
+
+		table[0].data = &net->ipv6.frags.high_thresh;	/* repoint the copy at this netns' values */
+		table[1].data = &net->ipv6.frags.low_thresh;
+		table[2].data = &net->ipv6.frags.timeout;
+	}
+
+	hdr = register_net_sysctl_table(net, net_ipv6_ctl_path, table);
+	if (hdr == NULL)
+		goto err_reg;
+
+	net->ipv6.sysctl.frags_hdr = hdr;	/* kept for ip6_frags_ns_sysctl_unregister() */
+	return 0;
+
+err_reg:
+	if (!net_eq(net, &init_net))	/* only the kmemdup'ed copy may be freed */
+		kfree(table);
+err_alloc:
+	return -ENOMEM;
+}
+
+static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net)
+{	/* Undo ip6_frags_ns_sysctl_register() for net. */
+	struct ctl_table *table;
+
+	table = net->ipv6.sysctl.frags_hdr->ctl_table_arg;	/* fetch before the header is unregistered */
+	unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr);
+	if (!net_eq(net, &init_net))	/* init_net uses the static table, nothing to free */
+		kfree(table);
+}
+
+static struct ctl_table_header *ip6_ctl_header;
+
+static int ip6_frags_sysctl_register(void)
+{
+	/* Keep the header so ip6_frags_sysctl_unregister() can drop the table again. */
+	ip6_ctl_header = register_net_sysctl_rotable(net_ipv6_ctl_path, ip6_frags_ctl_table);
+	return ip6_ctl_header ? 0 : -ENOMEM;
+}
+
+static void ip6_frags_sysctl_unregister(void)
+{
+	unregister_net_sysctl_table(ip6_ctl_header);	/* header saved by ip6_frags_sysctl_register() */
+}
+#else
+static inline int ip6_frags_ns_sysctl_register(struct net *net)
+{	/* !CONFIG_SYSCTL: nothing to register, always succeeds */
+	return 0;
+}
+
+static inline void ip6_frags_ns_sysctl_unregister(struct net *net)
+{	/* !CONFIG_SYSCTL: no-op */
+}
+
+static inline int ip6_frags_sysctl_register(void)
+{	/* !CONFIG_SYSCTL: nothing to register, always succeeds */
+	return 0;
+}
+
+static inline void ip6_frags_sysctl_unregister(void)
+{	/* !CONFIG_SYSCTL: no-op */
+}
+#endif
+
+static int __net_init ipv6_frags_init_net(struct net *net)
+{	/* pernet .init: set default limits, init the netns frag state, register its sysctls. */
+	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+
+	inet_frags_init_net(&net->ipv6.frags);
+
+	return ip6_frags_ns_sysctl_register(net);	/* 0 or -ENOMEM */
+}
+
+static void __net_exit ipv6_frags_exit_net(struct net *net)
+{	/* pernet .exit: unregister the sysctls, then tear down the netns frag state. */
+	ip6_frags_ns_sysctl_unregister(net);
+	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+}
+
+static struct pernet_operations ip6_frags_ops = {	/* registered in ipv6_frag_init() */
+	.init = ipv6_frags_init_net,
+	.exit = ipv6_frags_exit_net,
+};
+
+int __init ipv6_frag_init(void)
+{	/* Register the fragment protocol handler, sysctls and pernet ops; 0 or the first error. */
+	int ret;
+
+	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+	if (ret)
+		goto out;
+
+	ret = ip6_frags_sysctl_register();
+	if (ret)
+		goto err_sysctl;
+
+	ret = register_pernet_subsys(&ip6_frags_ops);
+	if (ret)
+		goto err_pernet;
+
+	ip6_frags.hashfn = ip6_hashfn;		/* callbacks and parameters of the shared ip6_frags */
+	ip6_frags.constructor = ip6_frag_init;
+	ip6_frags.destructor = NULL;
+	ip6_frags.skb_free = NULL;
+	ip6_frags.qsize = sizeof(struct frag_queue);
+	ip6_frags.match = ip6_frag_match;
+	ip6_frags.frag_expire = ip6_frag_expire;
+	ip6_frags.secret_interval = 10 * 60 * HZ;	/* ten minutes, in jiffies */
+	inet_frags_init(&ip6_frags);
+out:
+	return ret;
+
+err_pernet:	/* unwind in reverse order of registration */
+	ip6_frags_sysctl_unregister();
+err_sysctl:
+	inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+	goto out;
+}
+
+void ipv6_frag_exit(void)
+{	/* Tear down everything ipv6_frag_init() set up. */
+	inet_frags_fini(&ip6_frags);
+	ip6_frags_sysctl_unregister();
+	unregister_pernet_subsys(&ip6_frags_ops);
+	inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
new file mode 100644
index 00000000..8e600f82
--- /dev/null
+++ b/net/ipv6/route.c
@@ -0,0 +1,2976 @@
+/*
+ *	Linux INET6 implementation
+ *	FIB front-end.
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*	Changes:
+ *
+ *	YOSHIFUJI Hideaki @USAGI
+ *		reworked default router selection.
+ *		- respect outgoing interface
+ *		- select from (probably) reachable routers (i.e.
+ *		routers in REACHABLE, STALE, DELAY or PROBE states).
+ *		- always select the same router if it is (probably)
+ *		reachable.  otherwise, round-robin the list.
+ *	Ville Nuorvala
+ *		Fixed routing subtrees.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/times.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/route.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/mroute6.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/snmp.h>
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/tcp.h>
+#include <linux/rtnetlink.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/netevent.h>
+#include <net/netlink.h>
+
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+/* Debug level: set to 3 (or more) to compile in RDBG()/RT6_TRACE() output. */
+#define RT6_DEBUG 2
+
+#if RT6_DEBUG >= 3
+#define RDBG(x) printk x
+#define RT6_TRACE(x...) printk(KERN_DEBUG x)
+#else
+#define RDBG(x)
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
+
+static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
+static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
+static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
+static struct dst_entry *ip6_negative_advice(struct dst_entry *);
+static void		ip6_dst_destroy(struct dst_entry *);
+static void		ip6_dst_ifdown(struct dst_entry *,
+				       struct net_device *dev, int how);
+static int		 ip6_dst_gc(struct dst_ops *ops);
+
+static int		ip6_pkt_discard(struct sk_buff *skb);
+static int		ip6_pkt_discard_out(struct sk_buff *skb);
+static void		ip6_link_failure(struct sk_buff *skb);
+static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+static struct rt6_info *rt6_add_route_info(struct net *net,
+					   const struct in6_addr *prefix, int prefixlen,
+					   const struct in6_addr *gwaddr, int ifindex,
+					   unsigned pref);
+static struct rt6_info *rt6_get_route_info(struct net *net,
+					   const struct in6_addr *prefix, int prefixlen,
+					   const struct in6_addr *gwaddr, int ifindex);
+#endif
+
+static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
+{	/* dst_ops.cow_metrics: switch dst to the inet_peer's metrics array; NULL if none can be used. */
+	struct rt6_info *rt = (struct rt6_info *) dst;
+	struct inet_peer *peer;
+	u32 *p = NULL;
+
+	if (!rt->rt6i_peer)
+		rt6_bind_peer(rt, 1);	/* bind a peer, asking for creation */
+
+	peer = rt->rt6i_peer;
+	if (peer) {
+		u32 *old_p = __DST_METRICS_PTR(old);
+		unsigned long prev, new;
+
+		p = peer->metrics;
+		if (inet_metrics_new(peer))	/* seed the peer's metrics from the old ones */
+			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+		new = (unsigned long) p;
+		prev = cmpxchg(&dst->_metrics, old, new);
+
+		if (prev != old) {	/* _metrics changed under us: use what is installed now */
+			p = __DST_METRICS_PTR(prev);
+			if (prev & DST_METRICS_READ_ONLY)
+				p = NULL;
+		}
+	}
+	return p;
+}
+
+static struct dst_ops ip6_dst_ops_template = {	/* dst_ops for AF_INET6 routes, wiring the ip6_* callbacks */
+	.family			=	AF_INET6,
+	.protocol		=	cpu_to_be16(ETH_P_IPV6),
+	.gc			=	ip6_dst_gc,
+	.gc_thresh		=	1024,
+	.check			=	ip6_dst_check,
+	.default_advmss		=	ip6_default_advmss,
+	.default_mtu		=	ip6_default_mtu,
+	.cow_metrics		=	ipv6_cow_metrics,
+	.destroy		=	ip6_dst_destroy,
+	.ifdown			=	ip6_dst_ifdown,
+	.negative_advice	=	ip6_negative_advice,
+	.link_failure		=	ip6_link_failure,
+	.update_pmtu		=	ip6_rt_update_pmtu,
+	.local_out		=	__ip6_local_out,
+};
+
+static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
+{	/* blackhole dsts report a default MTU of 0 */
+	return 0;
+}
+
+static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+{	/* PMTU updates are ignored for blackhole dsts */
+}
+
+static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
+					 unsigned long old)
+{	/* blackhole dsts never hand out writable metrics */
+	return NULL;
+}
+
+static struct dst_ops ip6_dst_blackhole_ops = {	/* dst_ops using the blackhole mtu/pmtu/metrics stubs */
+	.family			=	AF_INET6,
+	.protocol		=	cpu_to_be16(ETH_P_IPV6),
+	.destroy		=	ip6_dst_destroy,
+	.check			=	ip6_dst_check,
+	.default_mtu		=	ip6_blackhole_default_mtu,
+	.default_advmss		=	ip6_default_advmss,
+	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
+	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
+};
+
+static const u32 ip6_template_metrics[RTAX_MAX] = {	/* all zero except the hop limit */
+	[RTAX_HOPLIMIT - 1] = 255,
+};
+
+static struct rt6_info ip6_null_entry_template = {	/* reject route: -ENETUNREACH, packets go to ip6_pkt_discard{,_out} */
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= -1,
+		.error		= -ENETUNREACH,
+		.input		= ip6_pkt_discard,
+		.output		= ip6_pkt_discard_out,
+	},
+	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
+	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
+	.rt6i_ref	= ATOMIC_INIT(1),
+};
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+static int ip6_pkt_prohibit(struct sk_buff *skb);
+static int ip6_pkt_prohibit_out(struct sk_buff *skb);
+
+static struct rt6_info ip6_prohibit_entry_template = {	/* reject route: -EACCES, packets go to ip6_pkt_prohibit{,_out} */
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= -1,
+		.error		= -EACCES,
+		.input		= ip6_pkt_prohibit,
+		.output		= ip6_pkt_prohibit_out,
+	},
+	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
+	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
+	.rt6i_ref	= ATOMIC_INIT(1),
+};
+
+static struct rt6_info ip6_blk_hole_entry_template = {	/* reject route: -EINVAL, packets go to dst_discard */
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= -1,
+		.error		= -EINVAL,
+		.input		= dst_discard,
+		.output		= dst_discard,
+	},
+	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
+	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
+	.rt6i_ref	= ATOMIC_INIT(1),
+};
+
+#endif
+
+/* Allocate an rt6_info via dst_alloc() and zero it from rt6i_table on; NULL on failure. */
+static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
+					     struct net_device *dev,
+					     int flags)
+{
+	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
+
+	if (rt)	/* dst_alloc() can fail: do not memset through a NULL pointer */
+		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
+	return rt;
+}
+
+static void ip6_dst_destroy(struct dst_entry *dst)
+{	/* dst_ops.destroy: drop the inet6_dev and inet_peer references held by the route. */
+	struct rt6_info *rt = (struct rt6_info *)dst;
+	struct inet6_dev *idev = rt->rt6i_idev;
+	struct inet_peer *peer = rt->rt6i_peer;
+
+	if (idev != NULL) {
+		rt->rt6i_idev = NULL;	/* clear the field before dropping the reference */
+		in6_dev_put(idev);
+	}
+	if (peer) {
+		rt->rt6i_peer = NULL;
+		inet_putpeer(peer);
+	}
+}
+
+static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);	/* counter sampled into rt6i_peer_genid by rt6_bind_peer() */
+
+static u32 rt6_peer_genid(void)
+{
+	return atomic_read(&__rt6_peer_genid);	/* current value of the counter */
+}
+
+void rt6_bind_peer(struct rt6_info *rt, int create)
+{	/* Attach the inet_peer for rt's destination address to rt->rt6i_peer if none is set yet. */
+	struct inet_peer *peer;
+
+	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
+	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
+		inet_putpeer(peer);	/* a peer was already installed: drop ours */
+	else
+		rt->rt6i_peer_genid = rt6_peer_genid();	/* also reached when peer is NULL */
+}
+
+static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			   int how)
+{	/* dst_ops.ifdown: if the route's idev belongs to dev, rebind it to the loopback idev. */
+	struct rt6_info *rt = (struct rt6_info *)dst;
+	struct inet6_dev *idev = rt->rt6i_idev;
+	struct net_device *loopback_dev =
+		dev_net(dev)->loopback_dev;
+
+	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
+		struct inet6_dev *loopback_idev =
+			in6_dev_get(loopback_dev);
+		if (loopback_idev != NULL) {	/* otherwise the route keeps its old idev */
+			rt->rt6i_idev = loopback_idev;
+			in6_dev_put(idev);	/* release the reference to the old idev */
+		}
+	}
+}
+
+static __inline__ int rt6_check_expired(const struct rt6_info *rt)
+{
+	/* Only routes flagged RTF_EXPIRES can expire; all others report 0. */
+	return (rt->rt6i_flags & RTF_EXPIRES) ? time_after(jiffies, rt->rt6i_expires) : 0;
+}
+
+static inline int rt6_need_strict(const struct in6_addr *daddr)
+{
+	const int strict_types = IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK;
+	return ipv6_addr_type(daddr) & strict_types;	/* non-zero for multicast, link-local or loopback */
+}
+
+/*
+ *	Route lookup. Any table->tb6_lock is implied.
+ */
+
+static inline struct rt6_info *rt6_device_match(struct net *net,
+						    struct rt6_info *rt,
+						    const struct in6_addr *saddr,
+						    int oif,
+						    int flags)
+{	/* Pick from the list starting at rt the entry matching oif (or, without oif, saddr). */
+	struct rt6_info *local = NULL;
+	struct rt6_info *sprt;
+
+	if (!oif && ipv6_addr_any(saddr))	/* nothing to match on: keep rt */
+		goto out;
+
+	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
+		struct net_device *dev = sprt->rt6i_dev;
+
+		if (oif) {
+			if (dev->ifindex == oif)	/* exact device match wins immediately */
+				return sprt;
+			if (dev->flags & IFF_LOOPBACK) {
+				if (sprt->rt6i_idev == NULL ||
+				    sprt->rt6i_idev->dev->ifindex != oif) {
+					if (flags & RT6_LOOKUP_F_IFACE && oif)
+						continue;
+					if (local && (!oif ||
+						      local->rt6i_idev->dev->ifindex == oif))
+						continue;
+				}
+				local = sprt;	/* loopback route kept as fallback */
+			}
+		} else {
+			if (ipv6_chk_addr(net, saddr, dev,
+					  flags & RT6_LOOKUP_F_IFACE))
+				return sprt;
+		}
+	}
+
+	if (oif) {
+		if (local)
+			return local;
+
+		if (flags & RT6_LOOKUP_F_IFACE)	/* strict lookup and no match: null entry */
+			return net->ipv6.ip6_null_entry;
+	}
+out:
+	return rt;
+}
+
+#ifdef CONFIG_IPV6_ROUTER_PREF
+static void rt6_probe(struct rt6_info *rt)
+{
+	struct neighbour *neigh;
+	/*
+	 * Okay, this does not seem to be appropriate
+	 * for now, however, we need to check if it
+	 * is really so; aka Router Reachability Probing.
+	 *
+	 * Router Reachability Probe MUST be rate-limited
+	 * to no more than one per minute.
+	 */
+	rcu_read_lock();
+	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;	/* rt may be NULL */
+	if (!neigh || (neigh->nud_state & NUD_VALID))
+		goto out;
+	read_lock_bh(&neigh->lock);
+	if (!(neigh->nud_state & NUD_VALID) &&	/* re-checked under neigh->lock */
+	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
+		struct in6_addr mcaddr;
+		struct in6_addr *target;
+
+		neigh->updated = jiffies;	/* restart the probe interval */
+		read_unlock_bh(&neigh->lock);
+
+		target = (struct in6_addr *)&neigh->primary_key;
+		addrconf_addr_solict_mult(target, &mcaddr);
+		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);	/* sent after dropping neigh->lock */
+	} else {
+		read_unlock_bh(&neigh->lock);
+	}
+out:
+	rcu_read_unlock();
+}
+#else
+static inline void rt6_probe(struct rt6_info *rt)
+{	/* !CONFIG_IPV6_ROUTER_PREF: probing compiled out */
+}
+#endif
+
+/*
+ * Default Router Selection (RFC 2461 6.3.6)
+ */
+static inline int rt6_check_dev(struct rt6_info *rt, int oif)
+{
+	const struct net_device *out_dev = rt->rt6i_dev;
+	const struct inet6_dev *in6 = rt->rt6i_idev;
+
+	/* 2: no oif requested or exact device match; 1: loopback route whose idev is oif */
+	if (oif == 0 || out_dev->ifindex == oif)
+		return 2;
+	return ((out_dev->flags & IFF_LOOPBACK) && in6 && in6->dev->ifindex == oif) ? 1 : 0;
+}
+
+static inline int rt6_check_neigh(struct rt6_info *rt)
+{	/* Score the route's neighbour state: 2 = NUD_VALID, 1 = acceptable, 0 = unusable. */
+	struct neighbour *neigh;
+	int m;
+
+	rcu_read_lock();
+	neigh = dst_get_neighbour(&rt->dst);
+	if (rt->rt6i_flags & RTF_NONEXTHOP ||
+	    !(rt->rt6i_flags & RTF_GATEWAY))
+		m = 1;	/* non-gateway or nexthop-less routes are not judged by neighbour */
+	else if (neigh) {
+		read_lock_bh(&neigh->lock);
+		if (neigh->nud_state & NUD_VALID)
+			m = 2;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+		else if (neigh->nud_state & NUD_FAILED)
+			m = 0;
+#endif
+		else
+			m = 1;
+		read_unlock_bh(&neigh->lock);
+	} else
+		m = 0;	/* gateway route without a neighbour entry */
+	rcu_read_unlock();
+	return m;
+}
+
+static int rt6_score_route(struct rt6_info *rt, int oif,
+			   int strict)
+{
+	int score = rt6_check_dev(rt, oif);
+
+	/* -1 rejects the route; otherwise the device score, with the
+	 * decoded router preference or'ed in from bit 2 upward. */
+	if (score == 0 && (strict & RT6_LOOKUP_F_IFACE))
+		return -1;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	score |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+#endif
+	if (rt6_check_neigh(rt) == 0 && (strict & RT6_LOOKUP_F_REACHABLE))
+		return -1;
+	return score;
+}
+
+static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
+				   int *mpri, struct rt6_info *match)
+{	/* Return rt if it scores above *mpri (updating *mpri), else the previous match. */
+	int m;
+
+	if (rt6_check_expired(rt))
+		goto out;
+
+	m = rt6_score_route(rt, oif, strict);
+	if (m < 0)	/* route rejected by the scoring rules */
+		goto out;
+
+	if (m > *mpri) {
+		if (strict & RT6_LOOKUP_F_REACHABLE)
+			rt6_probe(match);	/* probe the candidate being displaced (may be NULL) */
+		*mpri = m;
+		match = rt;
+	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
+		rt6_probe(rt);	/* probe the route that lost */
+	}
+
+out:
+	return match;
+}
+
+static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+				     struct rt6_info *rr_head,
+				     u32 metric, int oif, int strict)
+{	/* Best-scoring route among fn's routes with the given metric, scanning from rr_head. */
+	struct rt6_info *rt, *match;
+	int mpri = -1;
+
+	match = NULL;
+	for (rt = rr_head; rt && rt->rt6i_metric == metric;	/* from rr_head onward */
+	     rt = rt->dst.rt6_next)
+		match = find_match(rt, oif, strict, &mpri, match);
+	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;	/* then from the leaf up to rr_head */
+	     rt = rt->dst.rt6_next)
+		match = find_match(rt, oif, strict, &mpri, match);
+
+	return match;
+}
+
+static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+{	/* Select a route from fn; returns the netns' ip6_null_entry when nothing matches. */
+	struct rt6_info *match, *rt0;
+	struct net *net;
+
+	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
+		  __func__, fn->leaf, oif);
+
+	rt0 = fn->rr_ptr;
+	if (!rt0)
+		fn->rr_ptr = rt0 = fn->leaf;	/* start round-robin at the first route */
+
+	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
+
+	if (!match &&
+	    (strict & RT6_LOOKUP_F_REACHABLE)) {
+		struct rt6_info *next = rt0->dst.rt6_next;
+
+		/* no entries matched; do round-robin */
+		if (!next || next->rt6i_metric != rt0->rt6i_metric)
+			next = fn->leaf;	/* wrap around to the first route */
+
+		if (next != rt0)
+			fn->rr_ptr = next;
+	}
+
+	RT6_TRACE("%s() => %p\n",
+		  __func__, match);
+
+	net = dev_net(rt0->rt6i_dev);
+	return match ? match : net->ipv6.ip6_null_entry;
+}
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
+		  const struct in6_addr *gwaddr)
+{	/* Apply a received route_info option: add, refresh or delete the matching route. 0 or -EINVAL. */
+	struct net *net = dev_net(dev);
+	struct route_info *rinfo = (struct route_info *) opt;
+	struct in6_addr prefix_buf, *prefix;
+	unsigned int pref;
+	unsigned long lifetime;
+	struct rt6_info *rt;
+
+	if (len < sizeof(struct route_info)) {
+		return -EINVAL;
+	}
+
+	/* Sanity check for prefix_len and length */
+	if (rinfo->length > 3) {
+		return -EINVAL;
+	} else if (rinfo->prefix_len > 128) {
+		return -EINVAL;
+	} else if (rinfo->prefix_len > 64) {
+		if (rinfo->length < 2) {
+			return -EINVAL;
+		}
+	} else if (rinfo->prefix_len > 0) {
+		if (rinfo->length < 1) {
+			return -EINVAL;
+		}
+	}
+
+	pref = rinfo->route_pref;
+	if (pref == ICMPV6_ROUTER_PREF_INVALID)
+		return -EINVAL;
+
+	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
+
+	if (rinfo->length == 3)
+		prefix = (struct in6_addr *)rinfo->prefix;	/* use the prefix field in place */
+	else {
+		/* this function is safe */
+		ipv6_addr_prefix(&prefix_buf,
+				 (struct in6_addr *)rinfo->prefix,
+				 rinfo->prefix_len);
+		prefix = &prefix_buf;
+	}
+
+	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
+				dev->ifindex);
+
+	if (rt && !lifetime) {	/* zero lifetime deletes an existing route */
+		ip6_del_rt(rt);
+		rt = NULL;
+	}
+
+	if (!rt && lifetime)
+		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
+					pref);
+	else if (rt)	/* existing route: refresh its preference bits */
+		rt->rt6i_flags = RTF_ROUTEINFO |
+				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+
+	if (rt) {
+		if (!addrconf_finite_timeout(lifetime)) {
+			rt->rt6i_flags &= ~RTF_EXPIRES;	/* infinite lifetime: never expires */
+		} else {
+			rt->rt6i_expires = jiffies + HZ * lifetime;
+			rt->rt6i_flags |= RTF_EXPIRES;
+		}
+		dst_release(&rt->dst);
+	}
+	return 0;
+}
+#endif
+
+#define BACKTRACK(__net, saddr)			\
+do { /* uses the caller's rt and fn, and its 'out' and 'restart' labels */ \
+	if (rt == __net->ipv6.ip6_null_entry) {	\
+		struct fib6_node *pn; \
+		while (1) { /* climb until a node with RTN_RTINFO, or give up at the root */ \
+			if (fn->fn_flags & RTN_TL_ROOT) \
+				goto out; \
+			pn = fn->parent; \
+			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
+				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
+			else \
+				fn = pn; \
+			if (fn->fn_flags & RTN_RTINFO) \
+				goto restart; \
+		} \
+	} \
+} while(0)
+
+static struct rt6_info *ip6_pol_route_lookup(struct net *net,
+					     struct fib6_table *table,
+					     struct flowi6 *fl6, int flags)
+{	/* Look fl6 up in table under tb6_lock; the result is returned after dst_use(). */
+	struct fib6_node *fn;
+	struct rt6_info *rt;
+
+	read_lock_bh(&table->tb6_lock);
+	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+restart:	/* BACKTRACK() jumps here with fn moved to another node */
+	rt = fn->leaf;
+	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+	BACKTRACK(net, &fl6->saddr);
+out:
+	dst_use(&rt->dst, jiffies);
+	read_unlock_bh(&table->tb6_lock);
+	return rt;
+
+}
+
+struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
+			    const struct in6_addr *saddr, int oif, int strict)
+{	/* Look up the route to daddr; NULL if the result carries an error. saddr may be NULL. */
+	struct flowi6 fl6 = {
+		.flowi6_oif = oif,
+		.daddr = *daddr,
+	};
+	struct dst_entry *dst;
+	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
+
+	if (saddr) {
+		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
+		flags |= RT6_LOOKUP_F_HAS_SADDR;
+	}
+
+	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
+	if (dst->error == 0)
+		return (struct rt6_info *) dst;
+
+	dst_release(dst);	/* error route: drop it and report no route */
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(rt6_lookup);
+
+/* ip6_ins_rt is called with table->tb6_lock NOT held (it is taken here).
+   It takes a new route entry; if the addition fails for any reason the
+   route is freed. In any case, if the caller does not hold a reference,
+   the route may be destroyed.
+ */
+
+static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
+{
+	int err;
+	struct fib6_table *table;
+
+	table = rt->rt6i_table;
+	write_lock_bh(&table->tb6_lock);	/* fib6_add() runs under the table write lock */
+	err = fib6_add(&table->tb6_root, rt, info);
+	write_unlock_bh(&table->tb6_lock);
+
+	return err;
+}
+
+int ip6_ins_rt(struct rt6_info *rt)
+{
+	/* Insert rt with an nl_info that only names the netns of the route's device. */
+	struct nl_info nlinfo = { .nl_net = dev_net(rt->rt6i_dev) };
+
+	return __ip6_ins_rt(rt, &nlinfo);
+}
+
+static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
+				      const struct in6_addr *saddr)
+{
+	struct rt6_info *rt;
+
+	/*
+	 *	Clone the route into a /128 RTF_CACHE entry for daddr.
+	 */
+
+	rt = ip6_rt_copy(ort);
+
+	if (rt) {
+		struct neighbour *neigh;
+		int attempts = !in_softirq();	/* one gc-and-retry, only outside softirq */
+
+		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
+			if (rt->rt6i_dst.plen != 128 &&
+			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
+				rt->rt6i_flags |= RTF_ANYCAST;
+			ipv6_addr_copy(&rt->rt6i_gateway, daddr);	/* no gateway: daddr is the next hop */
+		}
+
+		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
+		rt->rt6i_dst.plen = 128;
+		rt->rt6i_flags |= RTF_CACHE;
+		rt->dst.flags |= DST_HOST;
+
+#ifdef CONFIG_IPV6_SUBTREES
+		if (rt->rt6i_src.plen && saddr) {
+			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
+			rt->rt6i_src.plen = 128;
+		}
+#endif
+
+	retry:
+		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+		if (IS_ERR(neigh)) {
+			struct net *net = dev_net(rt->rt6i_dev);
+			int saved_rt_min_interval =
+				net->ipv6.sysctl.ip6_rt_gc_min_interval;
+			int saved_rt_elasticity =
+				net->ipv6.sysctl.ip6_rt_gc_elasticity;
+
+			if (attempts-- > 0) {
+				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;	/* temporary gc settings for this run */
+				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
+
+				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
+
+				net->ipv6.sysctl.ip6_rt_gc_elasticity =
+					saved_rt_elasticity;
+				net->ipv6.sysctl.ip6_rt_gc_min_interval =
+					saved_rt_min_interval;
+				goto retry;
+			}
+
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "ipv6: Neighbour table overflow.\n");
+			dst_free(&rt->dst);
+			return NULL;
+		}
+		dst_set_neighbour(&rt->dst, neigh);
+	}
+
+	return rt;	/* NULL when ip6_rt_copy() returned NULL */
+}
+
+static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
+{	/* Copy ort into a /128 RTF_CACHE entry for daddr that shares ort's neighbour. */
+	struct rt6_info *rt = ip6_rt_copy(ort);
+	if (rt) {
+		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
+		rt->rt6i_dst.plen = 128;
+		rt->rt6i_flags |= RTF_CACHE;
+		rt->dst.flags |= DST_HOST;
+		dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
+	}
+	return rt;	/* NULL when ip6_rt_copy() returned NULL */
+}
+
+/*
+ * Look up a route for @fl6 in @table, creating and inserting a per-destination
+ * cache entry (copy-on-write or clone) when the selected route is not
+ * already a cache or host route.
+ *
+ * @oif is passed to rt6_select(); @flags contributes RT6_LOOKUP_F_IFACE.
+ * Always returns a route with a reference held for the caller; this may be
+ * the namespace's ip6_null_entry.
+ *
+ * NOTE(review): BACKTRACK() is a macro defined outside this chunk; it
+ * presumably uses the restart/out labels and the fn variable - confirm
+ * before restructuring.
+ */
+static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
+				      struct flowi6 *fl6, int flags)
+{
+	struct fib6_node *fn;
+	struct rt6_info *rt, *nrt;
+	int strict = 0;
+	int attempts = 3;
+	int err;
+	/* require reachability on the first pass unless forwarding is enabled */
+	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+
+relookup:
+	read_lock_bh(&table->tb6_lock);
+
+restart_2:
+	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+
+restart:
+	rt = rt6_select(fn, oif, strict | reachable);
+
+	BACKTRACK(net, &fl6->saddr);
+	if (rt == net->ipv6.ip6_null_entry ||
+	    rt->rt6i_flags & RTF_CACHE)
+		goto out;
+
+	/* pin the route, then drop the table lock while copying it */
+	dst_hold(&rt->dst);
+	read_unlock_bh(&table->tb6_lock);
+
+	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
+		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
+	else if (!(rt->dst.flags & DST_HOST))
+		nrt = rt6_alloc_clone(rt, &fl6->daddr);
+	else
+		goto out2;
+
+	dst_release(&rt->dst);
+	/* fall back to the null entry when no copy could be made */
+	rt = nrt ? : net->ipv6.ip6_null_entry;
+
+	dst_hold(&rt->dst);
+	if (nrt) {
+		err = ip6_ins_rt(nrt);
+		if (!err)
+			goto out2;
+	}
+
+	/* bounded number of relookups after a failed copy or insert */
+	if (--attempts <= 0)
+		goto out2;
+
+	/*
+	 * Race condition! In the gap, when table->tb6_lock was
+	 * released someone could insert this route.  Relookup.
+	 */
+	dst_release(&rt->dst);
+	goto relookup;
+
+out:
+	/* second pass: repeat the lookup without the reachability requirement */
+	if (reachable) {
+		reachable = 0;
+		goto restart_2;
+	}
+	dst_hold(&rt->dst);
+	read_unlock_bh(&table->tb6_lock);
+out2:
+	rt->dst.lastuse = jiffies;
+	rt->dst.__use++;
+
+	return rt;
+}
+
+/*
+ * Lookup callback for received packets: runs ip6_pol_route() with the
+ * flow's input interface index as the interface argument.
+ */
+static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
+					    struct flowi6 *fl6, int flags)
+{
+	int iif = fl6->flowi6_iif;
+
+	return ip6_pol_route(net, table, iif, fl6, flags);
+}
+
+/*
+ * Route a received packet: build a flow key from the IPv6 header and the
+ * receiving device, look it up through the policy rules and attach the
+ * resulting dst to @skb.
+ */
+void ip6_route_input(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
+	int flags = RT6_LOOKUP_F_HAS_SADDR;
+	struct flowi6 fl6 = {
+		.flowi6_iif = skb->dev->ifindex,
+		.daddr = hdr->daddr,
+		.saddr = hdr->saddr,
+		.flowlabel = (* (__be32 *) hdr)&IPV6_FLOWINFO_MASK,
+		.flowi6_mark = skb->mark,
+		.flowi6_proto = hdr->nexthdr,
+	};
+
+	/* strict interface matching, except on PIM register devices */
+	if (rt6_need_strict(&hdr->daddr) && skb->dev->type != ARPHRD_PIMREG)
+		flags |= RT6_LOOKUP_F_IFACE;
+
+	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input);
+	skb_dst_set(skb, dst);
+}
+
+/*
+ * Lookup callback for locally generated packets: runs ip6_pol_route() with
+ * the flow's output interface index as the interface argument.
+ */
+static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
+					     struct flowi6 *fl6, int flags)
+{
+	int oif = fl6->flowi6_oif;
+
+	return ip6_pol_route(net, table, oif, fl6, flags);
+}
+
+/*
+ * Output route lookup for flow @fl6, optionally on behalf of socket @sk.
+ * Derives the lookup flags from the socket binding, the destination and
+ * the presence of a source address, then consults the policy rules.
+ */
+struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
+				    struct flowi6 *fl6)
+{
+	int lookup_flags = 0;
+
+	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
+		lookup_flags |= RT6_LOOKUP_F_IFACE;
+
+	if (ipv6_addr_any(&fl6->saddr)) {
+		/* no source chosen yet: honour the socket's preferences */
+		if (sk)
+			lookup_flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
+	} else {
+		lookup_flags |= RT6_LOOKUP_F_HAS_SADDR;
+	}
+
+	return fib6_rule_lookup(net, fl6, lookup_flags, ip6_pol_route_output);
+}
+
+EXPORT_SYMBOL(ip6_route_output);
+
+/*
+ * Build a discard-everything dst modelled on @dst_orig: metrics, idev,
+ * gateway, flags (minus RTF_EXPIRES) and keys are copied, input/output are
+ * set to dst_discard.  The reference on @dst_orig is always dropped.
+ * Returns the new dst or ERR_PTR(-ENOMEM).
+ */
+struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
+{
+	struct rt6_info *ort = (struct rt6_info *) dst_orig;
+	struct rt6_info *rt;
+	struct dst_entry *new;
+
+	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
+	if (!rt) {
+		dst_release(dst_orig);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* clear everything that follows the embedded dst_entry */
+	memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
+
+	new = &rt->dst;
+	new->__use = 1;
+	new->input = dst_discard;
+	new->output = dst_discard;
+
+	dst_copy_metrics(new, &ort->dst);
+
+	rt->rt6i_idev = ort->rt6i_idev;
+	if (rt->rt6i_idev)
+		in6_dev_hold(rt->rt6i_idev);
+	rt->rt6i_expires = 0;
+
+	ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
+	rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
+	rt->rt6i_metric = 0;
+
+	memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
+	memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+
+	dst_free(new);
+
+	dst_release(dst_orig);
+	return new;
+}
+
+/*
+ *	Destination cache support functions
+ */
+
+/*
+ * Validate a cached dst: it is still usable when it is attached to a FIB
+ * node whose serial number equals @cookie.  On success the route's peer
+ * generation id is refreshed (binding a peer if none is bound yet).
+ * Returns @dst when valid, NULL otherwise.
+ */
+static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	struct rt6_info *rt = (struct rt6_info *) dst;
+
+	if (!rt->rt6i_node || rt->rt6i_node->fn_sernum != cookie)
+		return NULL;
+
+	if (rt->rt6i_peer_genid != rt6_peer_genid()) {
+		if (!rt->rt6i_peer)
+			rt6_bind_peer(rt, 0);
+		rt->rt6i_peer_genid = rt6_peer_genid();
+	}
+
+	return dst;
+}
+
+/*
+ * Negative-advice hook.  An expired cache route is deleted and NULL is
+ * returned; an unexpired cache route is returned unchanged; for any other
+ * route the reference is dropped and NULL is returned.
+ */
+static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
+{
+	struct rt6_info *rt = (struct rt6_info *) dst;
+
+	if (!rt)
+		return dst;
+
+	if (!(rt->rt6i_flags & RTF_CACHE)) {
+		dst_release(dst);
+		return NULL;
+	}
+
+	if (!rt6_check_expired(rt))
+		return dst;
+
+	ip6_del_rt(rt);
+	return NULL;
+}
+
+/*
+ * Link-failure hook: report "address unreachable" for @skb, then either
+ * expire the attached cache route immediately or, for a default route that
+ * sits in the tree, set its node's serial number to -1.
+ */
+static void ip6_link_failure(struct sk_buff *skb)
+{
+	struct rt6_info *rt;
+
+	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
+
+	rt = (struct rt6_info *) skb_dst(skb);
+	if (!rt)
+		return;
+
+	if (rt->rt6i_flags&RTF_CACHE) {
+		dst_set_expires(&rt->dst, 0);
+		rt->rt6i_flags |= RTF_EXPIRES;
+		return;
+	}
+
+	if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
+		rt->rt6i_node->fn_sernum = -1;
+}
+
+/*
+ * Lower the MTU metric of a host (/128) route to @mtu.  Values below
+ * IPV6_MIN_MTU are clamped to IPV6_MIN_MTU and RTAX_FEATURE_ALLFRAG is set.
+ * Nothing happens when @mtu is not smaller than the current MTU or the
+ * route is not a host route.
+ */
+static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct rt6_info *rt6 = (struct rt6_info*)dst;
+
+	if (mtu >= dst_mtu(dst) || rt6->rt6i_dst.plen != 128)
+		return;
+
+	rt6->rt6i_flags |= RTF_MODIFIED;
+
+	if (mtu < IPV6_MIN_MTU) {
+		u32 feat = dst_metric(dst, RTAX_FEATURES);
+
+		mtu = IPV6_MIN_MTU;
+		feat |= RTAX_FEATURE_ALLFRAG;
+		dst_metric_set(dst, RTAX_FEATURES, feat);
+	}
+
+	dst_metric_set(dst, RTAX_MTU, mtu);
+}
+
+/*
+ * Default advertised MSS for @dst: the route MTU minus the IPv6 and TCP
+ * header sizes, raised to the ip6_rt_min_advmss sysctl if smaller and
+ * replaced by IPV6_MAXPLEN if it exceeds the largest non-jumbo MSS.
+ */
+static unsigned int ip6_default_advmss(const struct dst_entry *dst)
+{
+	struct net_device *dev = dst->dev;
+	unsigned int mtu = dst_mtu(dst);
+	struct net *net = dev_net(dev);
+
+	/* from here on "mtu" holds the candidate MSS */
+	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
+
+	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
+		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+
+	/*
+	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
+	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
+	 * IPV6_MAXPLEN is also valid and means: "any MSS,
+	 * rely only on pmtu discovery"
+	 */
+	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
+		mtu = IPV6_MAXPLEN;
+	return mtu;
+}
+
+/*
+ * Default MTU for @dst: the mtu6 setting of the device's IPv6 state, or
+ * IPV6_MIN_MTU when the device has none.
+ */
+static unsigned int ip6_default_mtu(const struct dst_entry *dst)
+{
+	struct inet6_dev *idev;
+	unsigned int mtu;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dst->dev);
+	if (!idev)
+		mtu = IPV6_MIN_MTU;
+	else
+		mtu = idev->cnf.mtu6;
+	rcu_read_unlock();
+
+	return mtu;
+}
+
+/*
+ * Singly linked list (chained through dst.next) of the dsts created by
+ * icmp6_dst_alloc().  The list is walked and modified only with
+ * icmp6_dst_lock held.
+ */
+static struct dst_entry *icmp6_dst_gc_list;
+static DEFINE_SPINLOCK(icmp6_dst_lock);
+
+/*
+ * Allocate a standalone dst on @dev, queued on icmp6_dst_gc_list instead
+ * of being inserted into the routing table.
+ *
+ * @neigh: neighbour to attach; a reference is taken.  When NULL, one is
+ *         looked up for @addr and a failed lookup leaves the dst without a
+ *         neighbour.
+ *
+ * Returns the dst with its reference count set to 1, or NULL when @dev has
+ * no IPv6 state or the allocation fails.
+ */
+struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
+				  struct neighbour *neigh,
+				  const struct in6_addr *addr)
+{
+	struct rt6_info *rt;
+	struct inet6_dev *idev = in6_dev_get(dev);
+	struct net *net = dev_net(dev);
+
+	if (unlikely(idev == NULL))
+		return NULL;
+
+	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
+	if (unlikely(rt == NULL)) {
+		/*
+		 * Return NULL explicitly rather than &rt->dst of a NULL rt,
+		 * which is only NULL while dst is the first member.
+		 */
+		in6_dev_put(idev);
+		return NULL;
+	}
+
+	if (neigh)
+		neigh_hold(neigh);
+	else {
+		neigh = ndisc_get_neigh(dev, addr);
+		if (IS_ERR(neigh))
+			neigh = NULL;
+	}
+
+	/* the idev reference taken above is handed over to the route */
+	rt->rt6i_idev     = idev;
+	dst_set_neighbour(&rt->dst, neigh);
+	atomic_set(&rt->dst.__refcnt, 1);
+	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
+	rt->dst.output  = ip6_output;
+
+	/* queue for icmp6_dst_gc(), which frees it once unreferenced */
+	spin_lock_bh(&icmp6_dst_lock);
+	rt->dst.next = icmp6_dst_gc_list;
+	icmp6_dst_gc_list = &rt->dst;
+	spin_unlock_bh(&icmp6_dst_lock);
+
+	fib6_force_start_gc(net);
+
+	return &rt->dst;
+}
+
+/*
+ * Walk icmp6_dst_gc_list, unlinking and freeing every entry whose
+ * reference count is zero.  Returns the number of entries left on the list.
+ */
+int icmp6_dst_gc(void)
+{
+	struct dst_entry *cur, **link;
+	int remaining = 0;
+
+	spin_lock_bh(&icmp6_dst_lock);
+
+	for (link = &icmp6_dst_gc_list; (cur = *link) != NULL; ) {
+		if (atomic_read(&cur->__refcnt)) {
+			/* still in use: keep it and step past it */
+			link = &cur->next;
+			++remaining;
+			continue;
+		}
+		*link = cur->next;
+		dst_free(cur);
+	}
+
+	spin_unlock_bh(&icmp6_dst_lock);
+
+	return remaining;
+}
+
+/*
+ * Call @func(rt, @arg) for every entry on icmp6_dst_gc_list; entries for
+ * which it returns non-zero are unlinked and freed.
+ */
+static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
+			    void *arg)
+{
+	struct dst_entry *cur, **link;
+
+	spin_lock_bh(&icmp6_dst_lock);
+
+	for (link = &icmp6_dst_gc_list; (cur = *link) != NULL; ) {
+		if (!func((struct rt6_info *) cur, arg)) {
+			link = &cur->next;
+			continue;
+		}
+		*link = cur->next;
+		dst_free(cur);
+	}
+
+	spin_unlock_bh(&icmp6_dst_lock);
+}
+
+/*
+ * Garbage-collect hook for the per-namespace IPv6 dst ops.
+ *
+ * The collection is skipped when the last run was less than
+ * ip6_rt_gc_min_interval ago and the entry count does not exceed
+ * ip6_rt_max_size.  Otherwise fib6_run_gc() is invoked with a timeout value
+ * (ip6_rt_gc_expire) that is adjusted on every call.
+ *
+ * Returns non-zero when the number of entries still exceeds
+ * ip6_rt_max_size.
+ */
+static int ip6_dst_gc(struct dst_ops *ops)
+{
+	unsigned long now = jiffies;
+	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
+	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
+	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
+	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
+	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
+	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+	int entries;
+
+	entries = dst_entries_get_fast(ops);
+	if (time_after(rt_last_gc + rt_min_interval, now) &&
+	    entries <= rt_max_size)
+		goto out;
+
+	net->ipv6.ip6_rt_gc_expire++;
+	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
+	net->ipv6.ip6_rt_last_gc = now;
+	entries = dst_entries_get_slow(ops);
+	/* below the threshold again: reset the timeout to half the sysctl value */
+	if (entries < ops->gc_thresh)
+		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
+out:
+	/* shrink the timeout by a fraction controlled by the elasticity sysctl */
+	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
+	return entries > rt_max_size;
+}
+
+/* Clean host part of a prefix. Not necessary in radix tree,
+   but results in cleaner routing tables.
+
+   Remove it only when all the things will work!
+ */
+
+/*
+ * Hop limit to use for @dst: the route's RTAX_HOPLIMIT metric when it is
+ * non-zero, otherwise the hop_limit setting of the device's IPv6 state, or
+ * of the namespace-wide "all" configuration if the device has none.
+ */
+int ip6_dst_hoplimit(struct dst_entry *dst)
+{
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	int limit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+
+	if (limit != 0)
+		return limit;
+
+	dev = dst->dev;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		limit = dev_net(dev)->ipv6.devconf_all->hop_limit;
+	else
+		limit = idev->cnf.hop_limit;
+	rcu_read_unlock();
+
+	return limit;
+}
+EXPORT_SYMBOL(ip6_dst_hoplimit);
+
+/*
+ *
+ */
+
+/*
+ * Create a route described by @cfg and insert it into the table named by
+ * cfg->fc_table.
+ *
+ * The function may modify @cfg: a zero metric becomes IP6_RT_PRIO_USER, an
+ * unspecified protocol becomes RTPROT_BOOT and fc_nlinfo.nl_net is set from
+ * the chosen device.
+ *
+ * Returns the result of __ip6_ins_rt() on the install path, or a negative
+ * errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH or a neighbour
+ * lookup error) after releasing the device, idev and route acquired so far.
+ */
+int ip6_route_add(struct fib6_config *cfg)
+{
+	int err;
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	struct rt6_info *rt = NULL;
+	struct net_device *dev = NULL;
+	struct inet6_dev *idev = NULL;
+	struct fib6_table *table;
+	int addr_type;
+
+	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
+		return -EINVAL;
+#ifndef CONFIG_IPV6_SUBTREES
+	/* source-specific routes need subtree support */
+	if (cfg->fc_src_len)
+		return -EINVAL;
+#endif
+	if (cfg->fc_ifindex) {
+		err = -ENODEV;
+		dev = dev_get_by_index(net, cfg->fc_ifindex);
+		if (!dev)
+			goto out;
+		idev = in6_dev_get(dev);
+		if (!idev)
+			goto out;
+	}
+
+	if (cfg->fc_metric == 0)
+		cfg->fc_metric = IP6_RT_PRIO_USER;
+
+	table = fib6_new_table(net, cfg->fc_table);
+	if (table == NULL) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
+
+	if (rt == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	rt->dst.obsolete = -1;
+	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
+				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
+				0;
+
+	if (cfg->fc_protocol == RTPROT_UNSPEC)
+		cfg->fc_protocol = RTPROT_BOOT;
+	rt->rt6i_protocol = cfg->fc_protocol;
+
+	addr_type = ipv6_addr_type(&cfg->fc_dst);
+
+	/* pick the input handler from the destination type and flags */
+	if (addr_type & IPV6_ADDR_MULTICAST)
+		rt->dst.input = ip6_mc_input;
+	else if (cfg->fc_flags & RTF_LOCAL)
+		rt->dst.input = ip6_input;
+	else
+		rt->dst.input = ip6_forward;
+
+	rt->dst.output = ip6_output;
+
+	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+	rt->rt6i_dst.plen = cfg->fc_dst_len;
+	if (rt->rt6i_dst.plen == 128)
+	       rt->dst.flags |= DST_HOST;
+
+#ifdef CONFIG_IPV6_SUBTREES
+	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
+	rt->rt6i_src.plen = cfg->fc_src_len;
+#endif
+
+	rt->rt6i_metric = cfg->fc_metric;
+
+	/* We cannot add true routes via loopback here,
+	   they would result in kernel looping; promote them to reject routes
+	 */
+	if ((cfg->fc_flags & RTF_REJECT) ||
+	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
+					      && !(cfg->fc_flags&RTF_LOCAL))) {
+		/* hold loopback dev/idev if we haven't done so. */
+		if (dev != net->loopback_dev) {
+			if (dev) {
+				dev_put(dev);
+				in6_dev_put(idev);
+			}
+			dev = net->loopback_dev;
+			dev_hold(dev);
+			idev = in6_dev_get(dev);
+			if (!idev) {
+				err = -ENODEV;
+				goto out;
+			}
+		}
+		rt->dst.output = ip6_pkt_discard_out;
+		rt->dst.input = ip6_pkt_discard;
+		rt->dst.error = -ENETUNREACH;
+		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+		goto install_route;
+	}
+
+	if (cfg->fc_flags & RTF_GATEWAY) {
+		const struct in6_addr *gw_addr;
+		int gwa_type;
+
+		gw_addr = &cfg->fc_gateway;
+		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
+		gwa_type = ipv6_addr_type(gw_addr);
+
+		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
+			struct rt6_info *grt;
+
+			/* IPv6 strictly inhibits using not link-local
+			   addresses as nexthop address.
+			   Otherwise, router will not able to send redirects.
+			   It is very good, but in some (rare!) circumstances
+			   (SIT, PtP, NBMA NOARP links) it is handy to allow
+			   some exceptions. --ANK
+			 */
+			err = -EINVAL;
+			if (!(gwa_type&IPV6_ADDR_UNICAST))
+				goto out;
+
+			/* the gateway itself must be reachable via an existing route */
+			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+
+			err = -EHOSTUNREACH;
+			if (grt == NULL)
+				goto out;
+			if (dev) {
+				if (dev != grt->rt6i_dev) {
+					dst_release(&grt->dst);
+					goto out;
+				}
+			} else {
+				/* no device given: inherit it from the gateway's route */
+				dev = grt->rt6i_dev;
+				idev = grt->rt6i_idev;
+				dev_hold(dev);
+				in6_dev_hold(grt->rt6i_idev);
+			}
+			/* only accept a gateway that is not itself behind a gateway */
+			if (!(grt->rt6i_flags&RTF_GATEWAY))
+				err = 0;
+			dst_release(&grt->dst);
+
+			if (err)
+				goto out;
+		}
+		err = -EINVAL;
+		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
+			goto out;
+	}
+
+	err = -ENODEV;
+	if (dev == NULL)
+		goto out;
+
+	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
+		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
+			err = -EINVAL;
+			goto out;
+		}
+		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
+		rt->rt6i_prefsrc.plen = 128;
+	} else
+		rt->rt6i_prefsrc.plen = 0;
+
+	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
+		struct neighbour *neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
+		if (IS_ERR(neigh)) {
+			err = PTR_ERR(neigh);
+			goto out;
+		}
+		dst_set_neighbour(&rt->dst, neigh);
+	}
+
+	rt->rt6i_flags = cfg->fc_flags;
+
+install_route:
+	if (cfg->fc_mx) {
+		struct nlattr *nla;
+		int remaining;
+
+		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+			int type = nla_type(nla);
+
+			if (type) {
+				/*
+				 * Reject unknown metric types and attributes
+				 * too short to hold the u32 that
+				 * nla_get_u32() reads.
+				 */
+				if (type > RTAX_MAX ||
+				    nla_len(nla) < (int)sizeof(u32)) {
+					err = -EINVAL;
+					goto out;
+				}
+
+				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
+			}
+		}
+	}
+
+	/* the dev/idev references are handed over to the route here */
+	rt->dst.dev = dev;
+	rt->rt6i_idev = idev;
+	rt->rt6i_table = table;
+
+	cfg->fc_nlinfo.nl_net = dev_net(dev);
+
+	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
+
+out:
+	if (dev)
+		dev_put(dev);
+	if (idev)
+		in6_dev_put(idev);
+	if (rt)
+		dst_free(&rt->dst);
+	return err;
+}
+
+/*
+ * Remove @rt from its table and drop the reference the caller holds on it.
+ *
+ * The caller's reference is consumed on every path, including the
+ * -ENOENT return for the namespace's null entry, so callers need no
+ * special case for that route.
+ *
+ * Returns -ENOENT for the null entry, otherwise the result of fib6_del().
+ */
+static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
+{
+	int err;
+	struct fib6_table *table;
+	struct net *net = dev_net(rt->rt6i_dev);
+
+	if (rt == net->ipv6.ip6_null_entry) {
+		/* nothing to delete, but the caller's reference must still go */
+		dst_release(&rt->dst);
+		return -ENOENT;
+	}
+
+	table = rt->rt6i_table;
+	write_lock_bh(&table->tb6_lock);
+
+	err = fib6_del(rt, info);
+	dst_release(&rt->dst);
+
+	write_unlock_bh(&table->tb6_lock);
+
+	return err;
+}
+
+/*
+ * Convenience wrapper around __ip6_del_rt(): builds a netlink info block
+ * that only carries the namespace of the route's device and passes it on.
+ */
+int ip6_del_rt(struct rt6_info *rt)
+{
+	struct nl_info nlinfo = {
+		.nl_net = dev_net(rt->rt6i_dev),
+	};
+	int ret = __ip6_del_rt(rt, &nlinfo);
+
+	return ret;
+}
+
+/*
+ * Delete the first route matching @cfg.
+ *
+ * The node for the exact destination/source prefix is located in the table
+ * named by cfg->fc_table; within it, a route matches when the optional
+ * interface index, gateway (only if RTF_GATEWAY is set in fc_flags) and
+ * metric (only if non-zero) all agree.
+ *
+ * Returns -ESRCH when the table, node or a matching route is not found,
+ * otherwise the result of __ip6_del_rt().
+ */
+static int ip6_route_del(struct fib6_config *cfg)
+{
+	struct fib6_table *table;
+	struct fib6_node *fn;
+	struct rt6_info *rt;
+	int err = -ESRCH;
+
+	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
+	if (table == NULL)
+		return err;
+
+	read_lock_bh(&table->tb6_lock);
+
+	fn = fib6_locate(&table->tb6_root,
+			 &cfg->fc_dst, cfg->fc_dst_len,
+			 &cfg->fc_src, cfg->fc_src_len);
+
+	if (fn) {
+		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+			if (cfg->fc_ifindex &&
+			    (rt->rt6i_dev == NULL ||
+			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
+				continue;
+			if (cfg->fc_flags & RTF_GATEWAY &&
+			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+				continue;
+			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
+				continue;
+			/*
+			 * Pin the match and drop the read lock;
+			 * __ip6_del_rt() takes the write lock itself and
+			 * releases the reference taken here.
+			 */
+			dst_hold(&rt->dst);
+			read_unlock_bh(&table->tb6_lock);
+
+			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+		}
+	}
+	read_unlock_bh(&table->tb6_lock);
+
+	return err;
+}
+
+/*
+ *	Handle redirects
+ */
+/*
+ * Flow key extended with the address of the router that sent a redirect.
+ * fl6 must stay the first member: __ip6_route_redirect() receives a pointer
+ * to fl6 and casts it back to struct ip6rd_flowi to reach gateway.
+ */
+struct ip6rd_flowi {
+	struct flowi6 fl6;
+	struct in6_addr gateway;
+};
+
+/*
+ * Lookup callback used to validate a redirect: find an unexpired gateway
+ * route on the flow's output interface whose gateway equals the redirect's
+ * sender (carried in the enclosing struct ip6rd_flowi).
+ *
+ * Returns the matching route, or the namespace's ip6_null_entry, with a
+ * reference held in either case.
+ *
+ * NOTE(review): BACKTRACK() is a macro defined outside this chunk; it
+ * presumably uses the restart/out labels and the fn variable - confirm
+ * before restructuring.
+ */
+static struct rt6_info *__ip6_route_redirect(struct net *net,
+					     struct fib6_table *table,
+					     struct flowi6 *fl6,
+					     int flags)
+{
+	/* fl6 is embedded at the start of a struct ip6rd_flowi by the caller */
+	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
+	struct rt6_info *rt;
+	struct fib6_node *fn;
+
+	/*
+	 * Get the "current" route for this destination and
+	 * check if the redirect has come from the appropriate router.
+	 *
+	 * RFC 2461 specifies that redirects should only be
+	 * accepted if they come from the nexthop to the target.
+	 * Due to the way the routes are chosen, this notion
+	 * is a bit fuzzy and one might need to check all possible
+	 * routes.
+	 */
+
+	read_lock_bh(&table->tb6_lock);
+	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+restart:
+	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+		/*
+		 * Current route is on-link; redirect is always invalid.
+		 *
+		 * Seems, previous statement is not true. It could
+		 * be node, which looks for us as on-link (f.e. proxy ndisc)
+		 * But then router serving it might decide, that we should
+		 * know truth 8)8) --ANK (980726).
+		 */
+		if (rt6_check_expired(rt))
+			continue;
+		if (!(rt->rt6i_flags & RTF_GATEWAY))
+			continue;
+		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
+			continue;
+		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+			continue;
+		break;
+	}
+
+	if (!rt)
+		rt = net->ipv6.ip6_null_entry;
+	BACKTRACK(net, &fl6->saddr);
+out:
+	dst_hold(&rt->dst);
+
+	read_unlock_bh(&table->tb6_lock);
+
+	return rt;
+};
+
+/*
+ * Find the route a redirect for @dest (sent by @gateway, received on @dev)
+ * applies to.  The gateway travels to __ip6_route_redirect() inside a
+ * struct ip6rd_flowi wrapped around the flow key.
+ */
+static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
+					   const struct in6_addr *src,
+					   const struct in6_addr *gateway,
+					   struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct ip6rd_flowi rdfl = {
+		.fl6 = {
+			.flowi6_oif = dev->ifindex,
+			.daddr = *dest,
+			.saddr = *src,
+		},
+		.gateway = *gateway,
+	};
+	int lookup_flags = RT6_LOOKUP_F_HAS_SADDR;
+	struct dst_entry *dst;
+
+	if (rt6_need_strict(dest))
+		lookup_flags |= RT6_LOOKUP_F_IFACE;
+
+	dst = fib6_rule_lookup(net, &rdfl.fl6, lookup_flags,
+			       __ip6_route_redirect);
+	return (struct rt6_info *)dst;
+}
+
+/*
+ * Process an accepted-looking redirect for @dest.
+ *
+ * @saddr is the address of the router that sent the redirect; @neigh is
+ * the neighbour entry of the new next hop and @lladdr its link-layer
+ * address; @on_link is non-zero when the target itself is on-link.
+ *
+ * If a gateway route through the sender exists, the neighbour entry is
+ * updated and a /128 cache route via @neigh is inserted.  A previous cache
+ * route for the destination is deleted.
+ */
+void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
+		  const struct in6_addr *saddr,
+		  struct neighbour *neigh, u8 *lladdr, int on_link)
+{
+	struct rt6_info *rt, *nrt = NULL;
+	struct netevent_redirect netevent;
+	struct net *net = dev_net(neigh->dev);
+
+	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
+
+	if (rt == net->ipv6.ip6_null_entry) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
+			       "for redirect target\n");
+		goto out;
+	}
+
+	/*
+	 *	We have finally decided to accept it.
+	 */
+
+	neigh_update(neigh, lladdr, NUD_STALE,
+		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
+		     NEIGH_UPDATE_F_OVERRIDE|
+		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
+				     NEIGH_UPDATE_F_ISROUTER))
+		     );
+
+	/*
+	 * Redirect received -> path was valid.
+	 * Look, redirects are sent only in response to data packets,
+	 * so that this nexthop apparently is reachable. --ANK
+	 */
+	dst_confirm(&rt->dst);
+
+	/* Duplicate redirect: silently ignore. */
+	if (neigh == dst_get_neighbour_raw(&rt->dst))
+		goto out;
+
+	nrt = ip6_rt_copy(rt);
+	if (nrt == NULL)
+		goto out;
+
+	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
+	if (on_link)
+		nrt->rt6i_flags &= ~RTF_GATEWAY;
+
+	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
+	nrt->rt6i_dst.plen = 128;
+	nrt->dst.flags |= DST_HOST;
+
+	/* the new next hop is the neighbour named by the redirect */
+	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
+	dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
+
+	if (ip6_ins_rt(nrt))
+		goto out;
+
+	netevent.old = &rt->dst;
+	netevent.new = &nrt->dst;
+	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
+
+	/* ip6_del_rt() drops the reference obtained from the lookup above */
+	if (rt->rt6i_flags&RTF_CACHE) {
+		ip6_del_rt(rt);
+		return;
+	}
+
+out:
+	dst_release(&rt->dst);
+}
+
+/*
+ *	Handle ICMP "packet too big" messages
+ *	i.e. Path MTU discovery
+ */
+
+/*
+ * Apply a reported path MTU @pmtu to the route towards @daddr found with
+ * interface index @ifindex (0 for "any").
+ *
+ * Expired routes are deleted and the lookup repeated.  A larger-or-equal
+ * @pmtu is ignored.  For an existing cache route the MTU metric is updated
+ * in place; otherwise a /128 copy carrying the new MTU is created and
+ * inserted.  Both get an expiry of ip6_rt_mtu_expires.
+ */
+static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
+			     struct net *net, u32 pmtu, int ifindex)
+{
+	struct rt6_info *rt, *nrt;
+	int allfrag = 0;
+again:
+	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
+	if (rt == NULL)
+		return;
+
+	if (rt6_check_expired(rt)) {
+		ip6_del_rt(rt);
+		goto again;
+	}
+
+	if (pmtu >= dst_mtu(&rt->dst))
+		goto out;
+
+	if (pmtu < IPV6_MIN_MTU) {
+		/*
+		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
+		 * MTU (1280) and a fragment header should always be included
+		 * after a node receiving Too Big message reporting PMTU is
+		 * less than the IPv6 Minimum Link MTU.
+		 */
+		pmtu = IPV6_MIN_MTU;
+		allfrag = 1;
+	}
+
+	/* New mtu received -> path was valid.
+	   They are sent only in response to data packets,
+	   so that this nexthop apparently is reachable. --ANK
+	 */
+	dst_confirm(&rt->dst);
+
+	/* Host route. If it is static, it would be better
+	   not to override it, but add new one, so that
+	   when cache entry will expire old pmtu
+	   would return automatically.
+	 */
+	if (rt->rt6i_flags & RTF_CACHE) {
+		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
+		if (allfrag) {
+			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
+			features |= RTAX_FEATURE_ALLFRAG;
+			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
+		}
+		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
+		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
+		goto out;
+	}
+
+	/* Network route.
+	   Two cases are possible:
+	   1. It is connected route. Action: COW
+	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
+	 */
+	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
+		nrt = rt6_alloc_cow(rt, daddr, saddr);
+	else
+		nrt = rt6_alloc_clone(rt, daddr);
+
+	if (nrt) {
+		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
+		if (allfrag) {
+			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
+			features |= RTAX_FEATURE_ALLFRAG;
+			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
+		}
+
+		/* According to RFC 1981, an attempt to detect a PMTU increase
+		 * must not be made within 5 minutes; the recommended timer is
+		 * 10 minutes.  The expiry of this route is set to
+		 * ip6_rt_mtu_expires; once it expires the decreased PMTU is
+		 * dropped and a larger PMTU can be detected again.
+		 */
+		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
+		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
+
+		ip6_ins_rt(nrt);
+	}
+out:
+	dst_release(&rt->dst);
+}
+
+/*
+ * Entry point for a received Packet Too Big message reporting @pmtu for
+ * the path to @daddr; @dev is the device the message arrived on.
+ */
+void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
+			struct net_device *dev, u32 pmtu)
+{
+	struct net *ns = dev_net(dev);
+
+	/*
+	 * RFC 1981 requires a node to reduce the size of the packets it
+	 * sends along the path that triggered the Packet Too Big message.
+	 * In general the interface that sent the original packet cannot be
+	 * determined, so the MTU is updated on the route that future packets
+	 * will take (lookup without an interface) ...
+	 */
+	rt6_do_pmtu_disc(daddr, saddr, ns, pmtu, 0);
+
+	/*
+	 * ... and also on the interface the message arrived on, in case the
+	 * original packet was forced out of it with SO_BINDTODEVICE or
+	 * similar.  This is the next best thing to updating every interface.
+	 */
+	rt6_do_pmtu_disc(daddr, saddr, ns, pmtu, dev->ifindex);
+}
+
+/*
+ *	Misc support functions
+ */
+
+/*
+ * Allocate a new route on the same device as @ort and copy its handlers,
+ * metrics, error, idev (taking a reference), gateway, flags (minus
+ * RTF_EXPIRES), keys, preferred source and table.  Expiry and metric start
+ * at zero.  Returns NULL when the allocation fails.
+ */
+static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
+{
+	struct net *net = dev_net(ort->rt6i_dev);
+	struct rt6_info *copy;
+
+	copy = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, ort->dst.dev, 0);
+	if (!copy)
+		return NULL;
+
+	copy->dst.input = ort->dst.input;
+	copy->dst.output = ort->dst.output;
+
+	dst_copy_metrics(&copy->dst, &ort->dst);
+	copy->dst.error = ort->dst.error;
+
+	copy->rt6i_idev = ort->rt6i_idev;
+	if (copy->rt6i_idev)
+		in6_dev_hold(copy->rt6i_idev);
+
+	copy->dst.lastuse = jiffies;
+	copy->rt6i_expires = 0;
+
+	ipv6_addr_copy(&copy->rt6i_gateway, &ort->rt6i_gateway);
+	copy->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
+	copy->rt6i_metric = 0;
+
+	memcpy(&copy->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
+	memcpy(&copy->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
+	memcpy(&copy->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
+	copy->rt6i_table = ort->rt6i_table;
+
+	return copy;
+}
+
+#ifdef CONFIG_IPV6_ROUTE_INFO
+/*
+ * Find the RTF_ROUTEINFO gateway route for @prefix/@prefixlen via @gwaddr
+ * on interface @ifindex in the RT6_TABLE_INFO table.
+ * Returns the route with a reference held, or NULL when there is none.
+ */
+static struct rt6_info *rt6_get_route_info(struct net *net,
+					   const struct in6_addr *prefix, int prefixlen,
+					   const struct in6_addr *gwaddr, int ifindex)
+{
+	const int wanted = RTF_ROUTEINFO|RTF_GATEWAY;
+	struct fib6_table *table;
+	struct fib6_node *fn;
+	struct rt6_info *rt = NULL;
+
+	table = fib6_get_table(net, RT6_TABLE_INFO);
+	if (table == NULL)
+		return NULL;
+
+	write_lock_bh(&table->tb6_lock);
+
+	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
+	if (fn) {
+		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+			if (rt->rt6i_dev->ifindex == ifindex &&
+			    (rt->rt6i_flags & wanted) == wanted &&
+			    ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) {
+				dst_hold(&rt->dst);
+				break;
+			}
+		}
+	}
+
+	write_unlock_bh(&table->tb6_lock);
+	return rt;
+}
+
+/*
+ * Add a route-information route for @prefix/@prefixlen via @gwaddr on
+ * interface @ifindex with router preference @pref, then look it up again.
+ * The result of ip6_route_add() is not checked; the return value is
+ * whatever rt6_get_route_info() finds afterwards (possibly NULL).
+ */
+static struct rt6_info *rt6_add_route_info(struct net *net,
+					   const struct in6_addr *prefix, int prefixlen,
+					   const struct in6_addr *gwaddr, int ifindex,
+					   unsigned pref)
+{
+	struct fib6_config cfg = {
+		.fc_table	= RT6_TABLE_INFO,
+		.fc_metric	= IP6_RT_PRIO_USER,
+		.fc_ifindex	= ifindex,
+		.fc_dst_len	= prefixlen,
+		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
+				  RTF_UP | RTF_PREF(pref),
+		.fc_nlinfo.pid = 0,
+		.fc_nlinfo.nlh = NULL,
+		.fc_nlinfo.nl_net = net,
+	};
+
+	/* We should treat it as a default route if prefix length is 0. */
+	if (prefixlen == 0)
+		cfg.fc_flags |= RTF_DEFAULT;
+
+	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
+	ipv6_addr_copy(&cfg.fc_dst, prefix);
+
+	ip6_route_add(&cfg);
+
+	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
+}
+#endif
+
+/*
+ * Find the autoconfigured default route (RTF_ADDRCONF and RTF_DEFAULT both
+ * set) via gateway @addr on @dev in the RT6_TABLE_DFLT table.
+ * Returns the route with a reference held, or NULL when there is none.
+ */
+struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
+{
+	const int wanted = RTF_ADDRCONF | RTF_DEFAULT;
+	struct fib6_table *table;
+	struct rt6_info *rt;
+
+	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
+	if (table == NULL)
+		return NULL;
+
+	write_lock_bh(&table->tb6_lock);
+	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+		if (dev != rt->rt6i_dev)
+			continue;
+		if ((rt->rt6i_flags & wanted) != wanted)
+			continue;
+		if (!ipv6_addr_equal(&rt->rt6i_gateway, addr))
+			continue;
+		dst_hold(&rt->dst);
+		break;
+	}
+	write_unlock_bh(&table->tb6_lock);
+
+	return rt;
+}
+
+/*
+ * Add an autoconfigured, expiring default route via @gwaddr on @dev with
+ * router preference @pref, then look it up again.  The result of
+ * ip6_route_add() is not checked; the return value is whatever
+ * rt6_get_dflt_router() finds afterwards (possibly NULL).
+ */
+struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
+				     struct net_device *dev,
+				     unsigned int pref)
+{
+	struct fib6_config route_cfg = {
+		.fc_table	= RT6_TABLE_DFLT,
+		.fc_metric	= IP6_RT_PRIO_USER,
+		.fc_ifindex	= dev->ifindex,
+		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
+				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
+		.fc_nlinfo.pid = 0,
+		.fc_nlinfo.nlh = NULL,
+		.fc_nlinfo.nl_net = dev_net(dev),
+	};
+
+	ipv6_addr_copy(&route_cfg.fc_gateway, gwaddr);
+	ip6_route_add(&route_cfg);
+
+	return rt6_get_dflt_router(gwaddr, dev);
+}
+
+/*
+ * Delete every route in the RT6_TABLE_DFLT table of @net that has
+ * RTF_DEFAULT or RTF_ADDRCONF set.
+ */
+void rt6_purge_dflt_routers(struct net *net)
+{
+	struct rt6_info *rt;
+	struct fib6_table *table;
+
+	/* NOTE: Keep consistent with rt6_get_dflt_router */
+	table = fib6_get_table(net, RT6_TABLE_DFLT);
+	if (table == NULL)
+		return;
+
+restart:
+	read_lock_bh(&table->tb6_lock);
+	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
+			/*
+			 * Pin the route and drop the read lock before
+			 * deleting (ip6_del_rt() takes the write lock), then
+			 * rescan from the head since the list has changed.
+			 */
+			dst_hold(&rt->dst);
+			read_unlock_bh(&table->tb6_lock);
+			ip6_del_rt(rt);
+			goto restart;
+		}
+	}
+	read_unlock_bh(&table->tb6_lock);
+}
+
+/*
+ * Translate an ioctl route message into a fib6_config targeting
+ * RT6_TABLE_MAIN in namespace @net.  Fields without a counterpart in
+ * @rtmsg are left zeroed.
+ */
+static void rtmsg_to_fib6_config(struct net *net,
+				 struct in6_rtmsg *rtmsg,
+				 struct fib6_config *cfg)
+{
+	memset(cfg, 0, sizeof(*cfg));
+
+	cfg->fc_nlinfo.nl_net = net;
+	cfg->fc_table = RT6_TABLE_MAIN;
+
+	/* addresses and prefix lengths */
+	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
+	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
+	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
+	cfg->fc_src_len = rtmsg->rtmsg_src_len;
+	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
+
+	/* scalar attributes */
+	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
+	cfg->fc_metric = rtmsg->rtmsg_metric;
+	cfg->fc_expires = rtmsg->rtmsg_info;
+	cfg->fc_flags = rtmsg->rtmsg_flags;
+}
+
+/*
+ * Handle the SIOCADDRT / SIOCDELRT route ioctls.
+ *
+ * Returns -EINVAL for any other command, -EPERM without CAP_NET_ADMIN,
+ * -EFAULT when the request cannot be copied from user space, otherwise the
+ * result of ip6_route_add() / ip6_route_del(), which run under the RTNL
+ * lock.
+ */
+int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct fib6_config cfg;
+	struct in6_rtmsg rtmsg;
+	int err;
+
+	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+		return -EINVAL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&rtmsg, arg, sizeof(struct in6_rtmsg)))
+		return -EFAULT;
+
+	rtmsg_to_fib6_config(net, &rtmsg, &cfg);
+
+	rtnl_lock();
+	if (cmd == SIOCADDRT)
+		err = ip6_route_add(&cfg);	/* Add a route */
+	else
+		err = ip6_route_del(&cfg);	/* Delete a route */
+	rtnl_unlock();
+
+	return err;
+}
+
+/*
+ *	Drop the packet on the floor
+ */
+
+/*
+ * Common tail of the discard/prohibit handlers: bump a statistics counter,
+ * send an ICMPv6 destination-unreachable error with @code and free @skb.
+ *
+ * @ipstats_mib_noroutes selects the counter (IPSTATS_MIB_INNOROUTES or
+ * IPSTATS_MIB_OUTNOROUTES); any other value counts nothing.
+ * Always returns 0.
+ */
+static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
+{
+	int type;
+	struct dst_entry *dst = skb_dst(skb);
+	switch (ipstats_mib_noroutes) {
+	case IPSTATS_MIB_INNOROUTES:
+		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
+		/* an unspecified destination is an address error, not a missing route */
+		if (type == IPV6_ADDR_ANY) {
+			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+				      IPSTATS_MIB_INADDRERRORS);
+			break;
+		}
+		/* FALLTHROUGH */
+	case IPSTATS_MIB_OUTNOROUTES:
+		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+			      ipstats_mib_noroutes);
+		break;
+	}
+	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Input handler of reject routes: drop with "no route", counted as input. */
+static int ip6_pkt_discard(struct sk_buff *skb)
+{
+	const u8 icmp_code = ICMPV6_NOROUTE;
+
+	return ip6_pkt_drop(skb, icmp_code, IPSTATS_MIB_INNOROUTES);
+}
+
+/*
+ * Output handler of reject routes: point the skb at the route's device,
+ * then drop with "no route", counted as output.
+ */
+static int ip6_pkt_discard_out(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	skb->dev = dst->dev;
+	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
+}
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+/* Input-side drop with "administratively prohibited", counted as input. */
+static int ip6_pkt_prohibit(struct sk_buff *skb)
+{
+	const u8 icmp_code = ICMPV6_ADM_PROHIBITED;
+
+	return ip6_pkt_drop(skb, icmp_code, IPSTATS_MIB_INNOROUTES);
+}
+
+/*
+ * Output-side counterpart: point the skb at the route's device, then drop
+ * with "administratively prohibited", counted as output.
+ */
+static int ip6_pkt_prohibit_out(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+
+	skb->dev = dst->dev;
+	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
+}
+
+#endif
+
+/*
+ *	Allocate a dst for local (unicast / anycast) address.
+ */
+
+/*
+ * Build a /128 host route for the local address @addr of @idev.
+ *
+ * The route is allocated on the namespace's loopback device, delivers to
+ * ip6_input, and is flagged RTF_ANYCAST when @anycast is non-zero and
+ * RTF_LOCAL otherwise.  Its table pointer is set to RT6_TABLE_LOCAL but the
+ * route is not inserted here.
+ *
+ * Returns the route with its reference count set to 1, ERR_PTR(-ENOMEM)
+ * when allocation fails, or the neighbour lookup error.
+ */
+struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
+				    const struct in6_addr *addr,
+				    int anycast)
+{
+	struct net *net = dev_net(idev->dev);
+	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
+					    net->loopback_dev, 0);
+	struct neighbour *neigh;
+
+	if (rt == NULL) {
+		if (net_ratelimit())
+			pr_warning("IPv6:  Maximum number of routes reached,"
+				   " consider increasing route/max_size.\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* reference for rt->rt6i_idev, assigned below */
+	in6_dev_hold(idev);
+
+	rt->dst.flags |= DST_HOST;
+	rt->dst.input = ip6_input;
+	rt->dst.output = ip6_output;
+	rt->rt6i_idev = idev;
+	rt->dst.obsolete = -1;
+
+	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
+	if (anycast)
+		rt->rt6i_flags |= RTF_ANYCAST;
+	else
+		rt->rt6i_flags |= RTF_LOCAL;
+	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+	if (IS_ERR(neigh)) {
+		dst_free(&rt->dst);
+
+		return ERR_CAST(neigh);
+	}
+	dst_set_neighbour(&rt->dst, neigh);
+
+	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
+	rt->rt6i_dst.plen = 128;
+	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
+
+	atomic_set(&rt->dst.__refcnt, 1);
+
+	return rt;
+}
+
+/*
+ * Choose the source address to use with @rt towards @daddr and store it
+ * in @saddr.
+ *
+ * A preferred source configured on the route (non-zero prefsrc length)
+ * wins and yields 0. Otherwise the choice is left to
+ * ipv6_dev_get_saddr(), called with the route's inet6 device (or NULL
+ * when the route has none); its result is returned.
+ */
+int ip6_route_get_saddr(struct net *net,
+			struct rt6_info *rt,
+			const struct in6_addr *daddr,
+			unsigned int prefs,
+			struct in6_addr *saddr)
+{
+	struct inet6_dev *idev;
+
+	if (rt->rt6i_prefsrc.plen) {
+		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
+		return 0;
+	}
+
+	idev = ip6_dst_idev((struct dst_entry*)rt);
+	return ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
+				  daddr, prefs, saddr);
+}
+
+/*
+ * remove deleted ip from prefsrc entries
+ *
+ * Argument bundle that rt6_remove_prefsrc() hands to
+ * fib6_remove_prefsrc() through fib6_clean_all().
+ */
+struct arg_dev_net_ip {
+	struct net_device *dev;	/* restrict to routes on this device; NULL matches any */
+	struct net *net;	/* namespace whose ip6_null_entry is left alone */
+	struct in6_addr *addr;	/* address compared with each route's prefsrc */
+};
+
+/*
+ * fib6_clean_all() callback: clear the preferred source of @rt when it
+ * equals the address carried in @arg (a struct arg_dev_net_ip).
+ * The namespace's null entry and routes on other devices are skipped.
+ * Always returns 0.
+ */
+static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
+{
+	const struct arg_dev_net_ip *adni = arg;
+	struct net_device *dev = adni->dev;
+
+	if (dev != NULL && (void *)rt->rt6i_dev != dev)
+		return 0;
+	if (rt == adni->net->ipv6.ip6_null_entry)
+		return 0;
+	if (!ipv6_addr_equal(adni->addr, &rt->rt6i_prefsrc.addr))
+		return 0;
+
+	/* remove prefsrc entry */
+	rt->rt6i_prefsrc.plen = 0;
+	return 0;
+}
+
+/*
+ * Walk every route of the namespace owning @ifp and drop @ifp's address
+ * as preferred source from routes on @ifp's device.
+ */
+void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
+{
+	struct net_device *dev = ifp->idev->dev;
+	struct arg_dev_net_ip adni;
+
+	adni.dev = dev;
+	adni.net = dev_net(dev);
+	adni.addr = &ifp->addr;
+
+	fib6_clean_all(adni.net, fib6_remove_prefsrc, 0, &adni);
+}
+/*
+ * Argument bundle that rt6_ifdown() hands to fib6_ifdown().
+ */
+struct arg_dev_net {
+	struct net_device *dev;	/* device going down; NULL matches every device */
+	struct net *net;	/* namespace whose ip6_null_entry is never matched */
+};
+
+/*
+ * Route-walker callback used by rt6_ifdown(). Returns -1 for a route
+ * that sits on the device named in @arg (or for any route when that
+ * device is NULL), except the namespace's null entry; returns 0 for
+ * everything else.
+ */
+static int fib6_ifdown(struct rt6_info *rt, void *arg)
+{
+	const struct arg_dev_net *adn = arg;
+
+	if (adn->dev != NULL && rt->rt6i_dev != adn->dev)
+		return 0;
+	if (rt == adn->net->ipv6.ip6_null_entry)
+		return 0;
+
+	RT6_TRACE("deleted by ifdown %p\n", rt);
+	return -1;
+}
+
+/*
+ * Run fib6_ifdown() over the FIB of @net and over the ICMPv6 dst list
+ * for @dev (all devices when @dev is NULL).
+ */
+void rt6_ifdown(struct net *net, struct net_device *dev)
+{
+	struct arg_dev_net adn;
+
+	adn.dev = dev;
+	adn.net = net;
+
+	fib6_clean_all(net, fib6_ifdown, 0, &adn);
+	icmp6_clean_all(fib6_ifdown, &adn);
+}
+/*
+ * Argument bundle that rt6_mtu_change() hands to rt6_mtu_change_route().
+ */
+struct rt6_mtu_change_arg
+{
+	struct net_device *dev;	/* device whose MTU changed */
+	unsigned mtu;		/* the new MTU */
+};
+
+/*
+ * fib6_clean_all() callback: propagate a device MTU change (described by
+ * @p_arg, a struct rt6_mtu_change_arg) into the RTAX_MTU metric of @rt.
+ * Always returns 0.
+ */
+static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
+{
+	struct rt6_mtu_change_arg *arg = p_arg;
+	struct inet6_dev *idev;
+	u32 pmtu;
+
+	/*
+	 * In IPv6, PMTU discovery is not optional, so the RTAX_MTU lock
+	 * cannot disable it. The lock is still honoured here to block
+	 * changes caused by addrconf/ndisc.
+	 */
+	idev = __in6_dev_get(arg->dev);
+	if (idev == NULL)
+		return 0;
+
+	if (rt->rt6i_dev != arg->dev)
+		return 0;
+	if (dst_metric_locked(&rt->dst, RTAX_MTU))
+		return 0;
+
+	/*
+	 * An administrative MTU increase cannot be discovered through
+	 * IPv6 PMTU discovery (RFC 1981 does not cover it), so it has to
+	 * be applied here, e.g. for jumbo frames.
+	 *
+	 * If the new MTU is not larger than the route PMTU, it becomes
+	 * the lowest MTU on the path and the route PMTU is lowered to it.
+	 * If the new MTU is larger and the route PMTU equals the device's
+	 * current mtu6 (the old MTU was the path minimum), the route PMTU
+	 * is raised. Should another node on the path have a lower MTU, a
+	 * Packet Too Big message will trigger PMTU discovery again.
+	 */
+	pmtu = dst_mtu(&rt->dst);
+	if (pmtu >= arg->mtu || pmtu == idev->cnf.mtu6)
+		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
+
+	return 0;
+}
+
+/*
+ * Apply a new device MTU to every route of @dev's namespace via
+ * rt6_mtu_change_route().
+ */
+void rt6_mtu_change(struct net_device *dev, unsigned mtu)
+{
+	struct rt6_mtu_change_arg arg;
+
+	arg.dev = dev;
+	arg.mtu = mtu;
+
+	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
+}
+/*
+ * Attribute policy handed to nlmsg_parse() by rtm_to_fib6_config() and
+ * inet6_rtm_getroute(). Attributes without an entry here (RTA_DST,
+ * RTA_SRC, RTA_PREFSRC, RTA_TABLE, ...) get their lengths handled by the
+ * callers themselves.
+ */
+static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
+	[RTA_OIF]               = { .type = NLA_U32 },
+	[RTA_IIF]		= { .type = NLA_U32 },
+	[RTA_PRIORITY]          = { .type = NLA_U32 },
+	[RTA_METRICS]           = { .type = NLA_NESTED },
+};
+
+/*
+ * Translate a route netlink request (@nlh, received in @skb) into @cfg.
+ *
+ * Returns 0 on success, the nlmsg_parse() error if the message does not
+ * parse, or -EINVAL when RTA_DST / RTA_SRC carry fewer bytes than the
+ * prefix length in the rtmsg header requires. @cfg is zeroed before it
+ * is filled; fc_mx points into the request and is not copied.
+ */
+static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
+			      struct fib6_config *cfg)
+{
+	struct nlattr *tb[RTA_MAX+1];
+	struct rtmsg *rtm;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+	if (err < 0)
+		return err;
+
+	rtm = nlmsg_data(nlh);
+	memset(cfg, 0, sizeof(*cfg));
+
+	/* Fields taken straight from the rtmsg header. */
+	cfg->fc_table = rtm->rtm_table;
+	cfg->fc_dst_len = rtm->rtm_dst_len;
+	cfg->fc_src_len = rtm->rtm_src_len;
+	cfg->fc_protocol = rtm->rtm_protocol;
+
+	cfg->fc_flags = RTF_UP;
+	switch (rtm->rtm_type) {
+	case RTN_UNREACHABLE:
+		cfg->fc_flags |= RTF_REJECT;
+		break;
+	case RTN_LOCAL:
+		cfg->fc_flags |= RTF_LOCAL;
+		break;
+	}
+
+	/* Who asked, so notifications can be attributed. */
+	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.nlh = nlh;
+	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
+
+	if (tb[RTA_GATEWAY]) {
+		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
+		cfg->fc_flags |= RTF_GATEWAY;
+	}
+
+	if (tb[RTA_DST]) {
+		/* bytes needed to hold rtm_dst_len bits */
+		int nbytes = (rtm->rtm_dst_len + 7) >> 3;
+
+		if (nla_len(tb[RTA_DST]) < nbytes)
+			return -EINVAL;
+
+		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], nbytes);
+	}
+
+	if (tb[RTA_SRC]) {
+		/* bytes needed to hold rtm_src_len bits */
+		int nbytes = (rtm->rtm_src_len + 7) >> 3;
+
+		if (nla_len(tb[RTA_SRC]) < nbytes)
+			return -EINVAL;
+
+		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], nbytes);
+	}
+
+	if (tb[RTA_PREFSRC])
+		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
+
+	if (tb[RTA_OIF])
+		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
+
+	if (tb[RTA_PRIORITY])
+		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
+
+	if (tb[RTA_METRICS]) {
+		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
+		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
+	}
+
+	/* RTA_TABLE, when present, overrides the 8-bit rtm_table field. */
+	if (tb[RTA_TABLE])
+		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
+
+	return 0;
+}
+
+/*
+ * RTM_DELROUTE handler: parse the request into a fib6_config and delete
+ * the matching route. Returns the parse error or ip6_route_del()'s result.
+ */
+static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct fib6_config cfg;
+	int err = rtm_to_fib6_config(skb, nlh, &cfg);
+
+	return err < 0 ? err : ip6_route_del(&cfg);
+}
+
+/*
+ * RTM_NEWROUTE handler: parse the request into a fib6_config and add the
+ * route. Returns the parse error or ip6_route_add()'s result.
+ */
+static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct fib6_config cfg;
+	int err = rtm_to_fib6_config(skb, nlh, &cfg);
+
+	return err < 0 ? err : ip6_route_add(&cfg);
+}
+
+/*
+ * Buffer size used by inet6_rt_notify() for one route message: the
+ * aligned rtmsg header plus room for each attribute listed below.
+ */
+static inline size_t rt6_nlmsg_size(void)
+{
+	size_t size = NLMSG_ALIGN(sizeof(struct rtmsg));
+
+	/* RTA_SRC, RTA_DST, RTA_GATEWAY, RTA_PREFSRC: 16-byte addresses */
+	size += 4 * nla_total_size(16);
+	/* RTA_TABLE, RTA_IIF, RTA_OIF, RTA_PRIORITY: 32-bit values */
+	size += 4 * nla_total_size(4);
+	/* RTA_METRICS */
+	size += RTAX_MAX * nla_total_size(4);
+	size += nla_total_size(sizeof(struct rta_cacheinfo));
+
+	return size;
+}
+/*
+ * rt6_fill_node - append one route message describing @rt to @skb
+ *
+ * @dst, @src: when non-NULL they are reported as /128 RTA_DST / RTA_SRC
+ *	instead of the route's own prefixes (RTA_SRC is only emitted with
+ *	CONFIG_IPV6_SUBTREES).
+ * @iif: when non-zero, RTA_IIF is emitted, or, with CONFIG_IPV6_MROUTE
+ *	and a multicast route destination, ip6mr_get_route() is asked to
+ *	fill in the multicast details.
+ * @type, @pid, @seq, @flags: passed to nlmsg_put() for the header.
+ * @prefix: non-zero restricts output to RTF_PREFIX_RT routes.
+ * @nowait: forwarded to ip6mr_get_route().
+ *
+ * Returns 1 when @rt is skipped by the @prefix filter, 0 when
+ * ip6mr_get_route() returned 0 with @nowait clear, -EMSGSIZE when the
+ * message could not be built (the partial message is cancelled), and
+ * otherwise the result of nlmsg_end().
+ *
+ * The NLA_PUT*() macros jump to the nla_put_failure label on their own
+ * when the attribute does not fit.
+ */
+static int rt6_fill_node(struct net *net,
+			 struct sk_buff *skb, struct rt6_info *rt,
+			 struct in6_addr *dst, struct in6_addr *src,
+			 int iif, int type, u32 pid, u32 seq,
+			 int prefix, int nowait, unsigned int flags)
+{
+	struct rtmsg *rtm;
+	struct nlmsghdr *nlh;
+	long expires;
+	u32 table;
+	struct neighbour *n;
+
+	if (prefix) {	/* user wants prefix routes only */
+		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
+			/* success since this is not a prefix route */
+			return 1;
+		}
+	}
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family = AF_INET6;
+	rtm->rtm_dst_len = rt->rt6i_dst.plen;
+	rtm->rtm_src_len = rt->rt6i_src.plen;
+	rtm->rtm_tos = 0;
+	if (rt->rt6i_table)
+		table = rt->rt6i_table->tb6_id;
+	else
+		table = RT6_TABLE_UNSPEC;
+	rtm->rtm_table = table;	/* header field; the full id goes in RTA_TABLE */
+	NLA_PUT_U32(skb, RTA_TABLE, table);
+	if (rt->rt6i_flags&RTF_REJECT)
+		rtm->rtm_type = RTN_UNREACHABLE;
+	else if (rt->rt6i_flags&RTF_LOCAL)
+		rtm->rtm_type = RTN_LOCAL;
+	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
+		rtm->rtm_type = RTN_LOCAL;
+	else
+		rtm->rtm_type = RTN_UNICAST;
+	rtm->rtm_flags = 0;
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	rtm->rtm_protocol = rt->rt6i_protocol;	/* default, overridden by the flags below */
+	if (rt->rt6i_flags&RTF_DYNAMIC)
+		rtm->rtm_protocol = RTPROT_REDIRECT;
+	else if (rt->rt6i_flags & RTF_ADDRCONF)
+		rtm->rtm_protocol = RTPROT_KERNEL;
+	else if (rt->rt6i_flags&RTF_DEFAULT)
+		rtm->rtm_protocol = RTPROT_RA;
+
+	if (rt->rt6i_flags&RTF_CACHE)
+		rtm->rtm_flags |= RTM_F_CLONED;
+
+	if (dst) {
+		NLA_PUT(skb, RTA_DST, 16, dst);
+		rtm->rtm_dst_len = 128;
+	} else if (rtm->rtm_dst_len)
+		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
+#ifdef CONFIG_IPV6_SUBTREES
+	if (src) {
+		NLA_PUT(skb, RTA_SRC, 16, src);
+		rtm->rtm_src_len = 128;
+	} else if (rtm->rtm_src_len)
+		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
+#endif
+	if (iif) {
+#ifdef CONFIG_IPV6_MROUTE
+		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+			int err = ip6mr_get_route(net, skb, rtm, nowait);
+			if (err <= 0) {
+				if (!nowait) {
+					if (err == 0)
+						return 0;	/* returns without nlmsg_end() or nlmsg_cancel() */
+					goto nla_put_failure;
+				} else {
+					if (err == -EMSGSIZE)
+						goto nla_put_failure;
+				}
+			}
+		} else
+#endif
+			NLA_PUT_U32(skb, RTA_IIF, iif);
+	} else if (dst) {
+		struct in6_addr saddr_buf;
+		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
+			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
+	}
+
+	if (rt->rt6i_prefsrc.plen) {	/* NOTE(review): can add a second RTA_PREFSRC after the branch above */
+		struct in6_addr saddr_buf;
+		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
+		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
+	}
+
+	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+		goto nla_put_failure;
+
+	rcu_read_lock();
+	n = dst_get_neighbour(&rt->dst);
+	if (n) {
+		if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
+			rcu_read_unlock();	/* drop the lock before leaving through the error label */
+			goto nla_put_failure;
+		}
+	}
+	rcu_read_unlock();
+
+	if (rt->dst.dev)
+		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
+
+	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
+
+	if (!(rt->rt6i_flags & RTF_EXPIRES))
+		expires = 0;
+	else if (rt->rt6i_expires - jiffies < INT_MAX)
+		expires = rt->rt6i_expires - jiffies;
+	else
+		expires = INT_MAX;	/* clamp the remaining lifetime */
+
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
+			       expires, rt->dst.error) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);	/* discard the partially built message */
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump callback: emit @rt as an RTM_NEWROUTE part of a multi-part reply.
+ * @p_arg is a struct rt6_rtnl_dump_arg. When the dump request carries a
+ * full rtmsg header with RTM_F_PREFIX set, only prefix routes are
+ * reported. Returns whatever rt6_fill_node() returns.
+ */
+int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+{
+	struct rt6_rtnl_dump_arg *arg = p_arg;
+	int prefix = 0;
+
+	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
+
+		if (rtm->rtm_flags & RTM_F_PREFIX)
+			prefix = 1;
+	}
+
+	return rt6_fill_node(arg->net, arg->skb, rt, NULL, NULL, 0,
+			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).pid,
+			     arg->cb->nlh->nlmsg_seq, prefix, 0, NLM_F_MULTI);
+}
+/*
+ * RTM_GETROUTE handler: resolve the route for the flow described by the
+ * request and unicast the answer back to the sender.
+ *
+ * RTA_SRC / RTA_DST (each must hold a full in6_addr), RTA_IIF and
+ * RTA_OIF are read from the request. Returns the nlmsg_parse() error,
+ * -EINVAL for a short address attribute, -ENODEV when RTA_IIF names no
+ * device in this namespace, -ENOBUFS when the reply skb cannot be
+ * allocated, a negative rt6_fill_node() error, or the result of
+ * rtnl_unicast().
+ */
+static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nlattr *tb[RTA_MAX+1];
+	struct rt6_info *rt;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
+	struct flowi6 fl6;
+	int err, iif = 0;
+
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;	/* result of the length checks below when they fail */
+	memset(&fl6, 0, sizeof(fl6));
+
+	if (tb[RTA_SRC]) {
+		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
+			goto errout;
+
+		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
+	}
+
+	if (tb[RTA_DST]) {
+		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
+			goto errout;
+
+		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
+	}
+
+	if (tb[RTA_IIF])
+		iif = nla_get_u32(tb[RTA_IIF]);
+
+	if (tb[RTA_OIF])
+		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
+
+	if (iif) {
+		struct net_device *dev;
+		dev = __dev_get_by_index(net, iif);	/* only checked for existence; dev is not used further */
+		if (!dev) {
+			err = -ENODEV;
+			goto errout;
+		}
+	}
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL) {
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	/* Reserve room for dummy headers, this skb can pass
+	   through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
+
+	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
+	skb_dst_set(skb, &rt->dst);	/* the reply skb carries the route from here on */
+
+	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
+			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
+			    nlh->nlmsg_seq, 0, 0, 0);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+errout:
+	return err;
+}
+
+/*
+ * Broadcast a route change (@event applied to @rt) to the
+ * RTNLGRP_IPV6_ROUTE group. If the message cannot be allocated or
+ * built, the error is recorded on the group through rtnl_set_sk_err().
+ */
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+{
+	struct net *net = info->nl_net;
+	struct sk_buff *skb;
+	u32 seq = 0;
+	int err;
+
+	if (info->nlh != NULL)
+		seq = info->nlh->nlmsg_seq;
+
+	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
+	if (skb == NULL) {
+		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, -ENOBUFS);
+		return;
+	}
+
+	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
+			    event, info->pid, seq, 0, 0, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+		return;
+	}
+
+	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
+		    info->nlh, gfp_any());
+}
+
+/*
+ * Netdevice notifier: when a loopback device registers, attach it (and a
+ * reference on its inet6 device) to the namespace's special route
+ * entries. Every event is answered with NOTIFY_OK.
+ */
+static int ip6_route_dev_notify(struct notifier_block *this,
+				unsigned long event, void *data)
+{
+	struct net_device *dev = data;
+	struct net *net;
+
+	if (event != NETDEV_REGISTER || !(dev->flags & IFF_LOOPBACK))
+		return NOTIFY_OK;
+
+	net = dev_net(dev);
+
+	net->ipv6.ip6_null_entry->dst.dev = dev;
+	net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	net->ipv6.ip6_prohibit_entry->dst.dev = dev;
+	net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
+	net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
+	net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
+#endif
+
+	return NOTIFY_OK;
+}
+
+/*
+ *	/proc
+ */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * NOTE(review): the /proc code that follows works on a seq_file and does
+ * not reference this struct; it looks like a leftover from an older
+ * buffer-based implementation. Confirm there are no other users before
+ * removing it.
+ */
+struct rt6_proc_arg
+{
+	char *buffer;
+	int offset;
+	int length;
+	int skip;
+	int len;
+};
+
+/*
+ * fib6_clean_all() callback behind the "ipv6_route" proc file: print one
+ * line for @rt into the seq_file passed as @p_arg. Fields, in order:
+ * destination and prefix length, source and prefix length (zeros without
+ * CONFIG_IPV6_SUBTREES), neighbour key (zeros without a neighbour),
+ * metric, refcount, use count, flags, device name. Always returns 0.
+ */
+static int rt6_info_route(struct rt6_info *rt, void *p_arg)
+{
+	struct seq_file *m = p_arg;
+	struct neighbour *neigh;
+	const char *devname;
+
+	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+
+#ifdef CONFIG_IPV6_SUBTREES
+	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+#else
+	seq_puts(m, "00000000000000000000000000000000 00 ");
+#endif
+
+	rcu_read_lock();
+	neigh = dst_get_neighbour(&rt->dst);
+	if (neigh)
+		seq_printf(m, "%pi6", neigh->primary_key);
+	else
+		seq_puts(m, "00000000000000000000000000000000");
+	rcu_read_unlock();
+
+	devname = rt->rt6i_dev ? rt->rt6i_dev->name : "";
+	seq_printf(m, " %08x %08x %08x %08x %8s\n",
+		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
+		   rt->dst.__use, rt->rt6i_flags,
+		   devname);
+	return 0;
+}
+
+/* seq_file show routine: print every route of the namespace in m->private. */
+static int ipv6_route_show(struct seq_file *m, void *v)
+{
+	fib6_clean_all((struct net *)m->private, rt6_info_route, 0, m);
+	return 0;
+}
+
+/* open() for the "ipv6_route" proc file. */
+static int ipv6_route_open(struct inode *inode, struct file *file)
+{
+	int err = single_open_net(inode, file, ipv6_route_show);
+
+	return err;
+}
+/*
+ * File operations of the "ipv6_route" proc entry created in
+ * ip6_route_net_init().
+ */
+static const struct file_operations ipv6_route_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ipv6_route_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release_net,
+};
+
+/*
+ * seq_file show routine for "rt6_stats": one line of seven hexadecimal
+ * counters - fib nodes, route nodes, allocated routes, route entries,
+ * cached routes, dst entries in use, discarded routes.
+ */
+static int rt6_stats_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = (struct net *)seq->private;
+	struct rt6_statistics *stats = net->ipv6.rt6_stats;
+
+	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
+		   stats->fib_nodes,
+		   stats->fib_route_nodes,
+		   stats->fib_rt_alloc,
+		   stats->fib_rt_entries,
+		   stats->fib_rt_cache,
+		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
+		   stats->fib_discarded_routes);
+
+	return 0;
+}
+
+/* open() for the "rt6_stats" proc file. */
+static int rt6_stats_seq_open(struct inode *inode, struct file *file)
+{
+	int err = single_open_net(inode, file, rt6_stats_seq_show);
+
+	return err;
+}
+/*
+ * File operations of the "rt6_stats" proc entry created in
+ * ip6_route_net_init().
+ */
+static const struct file_operations rt6_stats_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = rt6_stats_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+#endif	/* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * Handler for the write-only "flush" sysctl: store the written value and
+ * run a fib6 garbage-collection pass for the namespace kept in
+ * ctl->extra1.
+ *
+ * Reads are rejected with -EINVAL. A failure reported by proc_dointvec()
+ * is returned to the caller and no collection is started; previously the
+ * result was ignored, so a failed write still flushed and reported
+ * success.
+ *
+ * The delay handed to fib6_run_gc() is the value that was stored
+ * *before* this write was parsed; that existing behaviour is kept.
+ * A delay <= 0 is passed on as ~0UL.
+ */
+static
+int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
+			      void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net;
+	int delay;
+	int ret;
+
+	if (!write)
+		return -EINVAL;
+
+	net = (struct net *)ctl->extra1;
+	delay = net->ipv6.sysctl.flush_delay;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+
+	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
+	return 0;
+}
+/*
+ * Template for the per-namespace route sysctl table. The .data pointers
+ * below refer to init_net (or to ip6_dst_ops_template);
+ * ipv6_route_sysctl_init() duplicates the table and re-points entries
+ * 0..9 by index, so the order of the entries here must stay in step with
+ * that function.
+ */
+ctl_table ipv6_route_table_template[] = {
+	{
+		.procname	=	"flush",
+		.data		=	&init_net.ipv6.sysctl.flush_delay,
+		.maxlen		=	sizeof(int),
+		.mode		=	0200,
+		.proc_handler	=	ipv6_sysctl_rtcache_flush
+	},
+	{
+		.procname	=	"gc_thresh",
+		.data		=	&ip6_dst_ops_template.gc_thresh,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec,
+	},
+	{
+		.procname	=	"max_size",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec,
+	},
+	{
+		.procname	=	"gc_min_interval",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec_jiffies,
+	},
+	{
+		.procname	=	"gc_timeout",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec_jiffies,
+	},
+	{
+		.procname	=	"gc_interval",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec_jiffies,
+	},
+	{
+		.procname	=	"gc_elasticity",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec,
+	},
+	{
+		.procname	=	"mtu_expires",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec_jiffies,
+	},
+	{
+		.procname	=	"min_adv_mss",
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec,
+	},
+	{
+		.procname	=	"gc_min_interval_ms",	/* same variable as "gc_min_interval", different handler */
+		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
+		.maxlen		=	sizeof(int),
+		.mode		=	0644,
+		.proc_handler	=	proc_dointvec_ms_jiffies,
+	},
+	{ }
+};
+
+/*
+ * Duplicate ipv6_route_table_template for @net and point every entry at
+ * that namespace's own variables. Entry 0 ("flush") also gets @net in
+ * extra1 for its handler. The indices mirror the template's order.
+ * Returns the new table, or NULL if the copy could not be allocated.
+ */
+struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
+{
+	struct ctl_table *table = kmemdup(ipv6_route_table_template,
+					  sizeof(ipv6_route_table_template),
+					  GFP_KERNEL);
+
+	if (!table)
+		return NULL;
+
+	table[0].data = &net->ipv6.sysctl.flush_delay;
+	table[0].extra1 = net;
+	table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
+	table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
+	table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+	table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
+	table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
+	table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
+	table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
+	table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
+	/* "gc_min_interval_ms" shares the variable of entry 3 */
+	table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+
+	return table;
+}
+#endif
+/*
+ * Per-namespace initialisation of IPv6 routing: copy the dst_ops
+ * template, create the namespace's null (and, with
+ * CONFIG_IPV6_MULTIPLE_TABLES, prohibit and blackhole) route entries
+ * from their templates, set the sysctl defaults and create the proc
+ * files.
+ *
+ * Returns 0 on success or -ENOMEM; on failure everything allocated so
+ * far is released through the labels at the bottom, which run in
+ * reverse order of allocation and then jump back to "out".
+ */
+static int __net_init ip6_route_net_init(struct net *net)
+{
+	int ret = -ENOMEM;
+
+	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
+	       sizeof(net->ipv6.ip6_dst_ops));
+
+	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+		goto out_ip6_dst_ops;
+
+	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
+					   sizeof(*net->ipv6.ip6_null_entry),
+					   GFP_KERNEL);
+	if (!net->ipv6.ip6_null_entry)
+		goto out_ip6_dst_entries;
+	net->ipv6.ip6_null_entry->dst.path =	/* the entry is its own path */
+		(struct dst_entry *)net->ipv6.ip6_null_entry;
+	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
+			 ip6_template_metrics, true);
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
+					       sizeof(*net->ipv6.ip6_prohibit_entry),
+					       GFP_KERNEL);
+	if (!net->ipv6.ip6_prohibit_entry)
+		goto out_ip6_null_entry;
+	net->ipv6.ip6_prohibit_entry->dst.path =
+		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
+	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
+			 ip6_template_metrics, true);
+
+	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
+					       sizeof(*net->ipv6.ip6_blk_hole_entry),
+					       GFP_KERNEL);
+	if (!net->ipv6.ip6_blk_hole_entry)
+		goto out_ip6_prohibit_entry;
+	net->ipv6.ip6_blk_hole_entry->dst.path =
+		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
+	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
+			 ip6_template_metrics, true);
+#endif
+
+	net->ipv6.sysctl.flush_delay = 0;
+	net->ipv6.sysctl.ip6_rt_max_size = 4096;
+	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
+	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
+	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
+	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
+	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
+	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+
+#ifdef CONFIG_PROC_FS
+	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);	/* NOTE(review): result not checked */
+	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);	/* NOTE(review): result not checked */
+#endif
+	net->ipv6.ip6_rt_gc_expire = 30*HZ;
+
+	ret = 0;
+out:
+	return ret;
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_ip6_prohibit_entry:
+	kfree(net->ipv6.ip6_prohibit_entry);
+out_ip6_null_entry:
+	kfree(net->ipv6.ip6_null_entry);
+#endif
+out_ip6_dst_entries:
+	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
+out_ip6_dst_ops:
+	goto out;	/* ret still holds -ENOMEM */
+}
+
+/*
+ * Per-namespace teardown: remove the proc files, free the special route
+ * entries created by ip6_route_net_init() and destroy the dst counters.
+ */
+static void __net_exit ip6_route_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	proc_net_remove(net, "ipv6_route");
+	proc_net_remove(net, "rt6_stats");
+#endif
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	kfree(net->ipv6.ip6_blk_hole_entry);
+	kfree(net->ipv6.ip6_prohibit_entry);
+#endif
+	kfree(net->ipv6.ip6_null_entry);
+
+	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
+}
+/*
+ * Per-namespace init/exit hooks, registered in ip6_route_init() and
+ * unregistered in ip6_route_cleanup().
+ */
+static struct pernet_operations ip6_route_net_ops = {
+	.init = ip6_route_net_init,
+	.exit = ip6_route_net_exit,
+};
+/*
+ * Netdevice notifier block, registered as the last step of
+ * ip6_route_init().
+ */
+static struct notifier_block ip6_route_dev_notifier = {
+	.notifier_call = ip6_route_dev_notify,
+	.priority = 0,
+};
+/*
+ * Boot-time initialisation of IPv6 routing: dst slab cache, blackhole
+ * dst counters, per-namespace state, init_net's loopback references,
+ * the FIB, xfrm6, policy rules, the RTM_*ROUTE netlink handlers and the
+ * netdevice notifier, in that order.
+ *
+ * Returns 0 on success or a negative errno. On failure the labels at
+ * the bottom undo the completed steps in reverse order.
+ *
+ * NOTE(review): the unwind path neither unregisters RTM_* handlers that
+ * were already registered nor drops the in6_dev_get() references taken
+ * on init_net's loopback device - confirm whether that matters for the
+ * callers of this function.
+ */
+int __init ip6_route_init(void)
+{
+	int ret;
+
+	ret = -ENOMEM;
+	ip6_dst_ops_template.kmem_cachep =
+		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
+				  SLAB_HWCACHE_ALIGN, NULL);
+	if (!ip6_dst_ops_template.kmem_cachep)
+		goto out;
+
+	ret = dst_entries_init(&ip6_dst_blackhole_ops);
+	if (ret)
+		goto out_kmem_cache;
+
+	ret = register_pernet_subsys(&ip6_route_net_ops);
+	if (ret)
+		goto out_dst_entries;
+
+	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;	/* blackhole dsts share the slab cache */
+
+	/* Registering of the loopback is done before this portion of code,
+	 * the loopback reference in rt6_info will not be taken, do it
+	 * manually for init_net */
+	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+  #endif
+	ret = fib6_init();
+	if (ret)
+		goto out_register_subsys;
+
+	ret = xfrm6_init();
+	if (ret)
+		goto out_fib6_init;
+
+	ret = fib6_rules_init();
+	if (ret)
+		goto xfrm6_init;
+
+	ret = -ENOBUFS;	/* reported if any of the registrations below fails */
+	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
+	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
+	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
+		goto fib6_rules_init;
+
+	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
+	if (ret)
+		goto fib6_rules_init;
+
+out:
+	return ret;
+
+fib6_rules_init:	/* each label is named after the step it undoes */
+	fib6_rules_cleanup();
+xfrm6_init:
+	xfrm6_fini();
+out_fib6_init:
+	fib6_gc_cleanup();
+out_register_subsys:
+	unregister_pernet_subsys(&ip6_route_net_ops);
+out_dst_entries:
+	dst_entries_destroy(&ip6_dst_blackhole_ops);
+out_kmem_cache:
+	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
+	goto out;
+}
+/*
+ * Undo ip6_route_init(): the calls below are the same ones, in the same
+ * order, as the full unwind path of that function, preceded by the
+ * removal of the netdevice notifier.
+ */
+void ip6_route_cleanup(void)
+{
+	unregister_netdevice_notifier(&ip6_route_dev_notifier);
+	fib6_rules_cleanup();
+	xfrm6_fini();
+	fib6_gc_cleanup();
+	unregister_pernet_subsys(&ip6_route_net_ops);
+	dst_entries_destroy(&ip6_dst_blackhole_ops);
+	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
+}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
new file mode 100644
index 00000000..f56acd09
--- /dev/null
+++ b/net/ipv6/sit.c
@@ -0,0 +1,1293 @@
+/*
+ *	IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT)
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Changes:
+ * Roger Venning <r.venning@telstra.com>:	6to4 support
+ * Nate Thompson <nate@thebog.net>:		6to4 support
+ * Fred Templin <fred.l.templin@boeing.com>:	isatap support
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/dsfield.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+/*
+   This version of net/ipv6/sit.c is a clone of net/ipv4/ip_gre.c
+
+   For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+/* Number of buckets in each keyed tunnel hash table. */
+#define HASH_SIZE  16
+/*
+ * Fold an IPv4 address into a bucket index in the range 0..0xF.
+ * The argument is parenthesised so that an expression such as
+ * HASH(a ^ b) is hashed as a whole rather than being split by the cast
+ * and shift operators.
+ */
+#define HASH(addr) (((__force u32)(addr)^((__force u32)(addr)>>4))&0xF)
+
+static int ipip6_tunnel_init(struct net_device *dev);
+static void ipip6_tunnel_setup(struct net_device *dev);
+static void ipip6_dev_free(struct net_device *dev);
+
+static int sit_net_id __read_mostly;
+struct sit_net {	/* SIT state of one namespace, fetched with net_generic(net, sit_net_id) */
+	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];	/* remote and local set; bucket is HASH(remote) ^ HASH(local) */
+	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];	/* looked up by HASH(remote) */
+	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];	/* looked up by HASH(local) */
+	struct ip_tunnel __rcu *tunnels_wc[1];	/* final fallback in ipip6_tunnel_lookup() */
+	struct ip_tunnel __rcu **tunnels[4];	/* indexed by prio in __ipip6_bucket(): 2 = remote set, 1 = local set; presumably points at the tables above - confirm at its initialisation */
+
+	struct net_device *fb_tunnel_dev;	/* device whose 6rd settings ipip6_tunnel_clone_6rd() copies to other tunnels */
+};
+
+/*
+ * Locking : hash tables are protected by RCU and RTNL
+ *
+ * for_each_ip_tunnel_rcu() walks one hash chain beginning at @start.
+ * It assigns to a variable named 't', which the caller must have
+ * declared, and follows every link with rcu_dereference().
+ */
+
+#define for_each_ip_tunnel_rcu(start) \
+	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
+/*
+ * often modified stats are per cpu, other are shared (netdev->stats)
+ *
+ * One instance per possible CPU hangs off dev->tstats;
+ * ipip6_get_stats() sums them into dev->stats.
+ */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+/*
+ * Sum the per-CPU packet and byte counters of @dev over all possible
+ * CPUs, store the totals in dev->stats and return a pointer to it.
+ * The other fields of dev->stats are left as they are.
+ */
+static struct net_device_stats *ipip6_get_stats(struct net_device *dev)
+{
+	unsigned long rx_packets = 0, rx_bytes = 0;
+	unsigned long tx_packets = 0, tx_bytes = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		const struct pcpu_tstats *ts = per_cpu_ptr(dev->tstats, cpu);
+
+		rx_packets += ts->rx_packets;
+		rx_bytes   += ts->rx_bytes;
+		tx_packets += ts->tx_packets;
+		tx_bytes   += ts->tx_bytes;
+	}
+
+	dev->stats.rx_packets = rx_packets;
+	dev->stats.rx_bytes   = rx_bytes;
+	dev->stats.tx_packets = tx_packets;
+	dev->stats.tx_bytes   = tx_bytes;
+
+	return &dev->stats;
+}
+/*
+ * True when tunnel @t is up and, if both @dev and the tunnel's link are
+ * set, @dev's iflink matches that link.
+ */
+static inline bool ipip6_tunnel_usable(const struct ip_tunnel *t,
+				       const struct net_device *dev)
+{
+	if (dev && t->parms.link && dev->iflink != t->parms.link)
+		return false;
+	return (t->dev->flags & IFF_UP) != 0;
+}
+
+/*
+ * Must be invoked with rcu_read_lock
+ *
+ * Find the tunnel for a packet with outer addresses @remote / @local
+ * that arrived on @dev (may be NULL). The tables are tried from most to
+ * least specific: remote+local, remote only, local only, and finally the
+ * single wildcard slot. Only tunnels whose device is up are returned;
+ * NULL means no tunnel matched.
+ */
+static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
+		struct net_device *dev, __be32 remote, __be32 local)
+{
+	struct sit_net *sitn = net_generic(net, sit_net_id);
+	unsigned int h_remote = HASH(remote);
+	unsigned int h_local = HASH(local);
+	struct ip_tunnel *t;
+
+	for_each_ip_tunnel_rcu(sitn->tunnels_r_l[h_remote ^ h_local]) {
+		if (local != t->parms.iph.saddr)
+			continue;
+		if (remote != t->parms.iph.daddr)
+			continue;
+		if (ipip6_tunnel_usable(t, dev))
+			return t;
+	}
+
+	for_each_ip_tunnel_rcu(sitn->tunnels_r[h_remote]) {
+		if (remote != t->parms.iph.daddr)
+			continue;
+		if (ipip6_tunnel_usable(t, dev))
+			return t;
+	}
+
+	for_each_ip_tunnel_rcu(sitn->tunnels_l[h_local]) {
+		if (local != t->parms.iph.saddr)
+			continue;
+		if (ipip6_tunnel_usable(t, dev))
+			return t;
+	}
+
+	/* The wildcard slot is not filtered by link, only by IFF_UP. */
+	t = rcu_dereference(sitn->tunnels_wc[0]);
+	if (t != NULL && (t->dev->flags & IFF_UP))
+		return t;
+
+	return NULL;
+}
+
+/*
+ * Return the hash chain head that tunnels with parameters @parms belong
+ * to. The table is chosen by which of the outer addresses are set
+ * (bit 1: remote, bit 0: local) and the bucket by XOR-ing the hashes of
+ * the addresses that are set.
+ */
+static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
+		struct ip_tunnel_parm *parms)
+{
+	__be32 daddr = parms->iph.daddr;
+	__be32 saddr = parms->iph.saddr;
+	unsigned int bucket = 0;
+	int table = 0;
+
+	if (daddr) {
+		table |= 2;
+		bucket ^= HASH(daddr);
+	}
+	if (saddr) {
+		table |= 1;
+		bucket ^= HASH(saddr);
+	}
+
+	return &sitn->tunnels[table][bucket];
+}
+
+/* Chain head for an existing tunnel, derived from its own parameters. */
+static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
+		struct ip_tunnel *t)
+{
+	struct ip_tunnel_parm *parms = &t->parms;
+
+	return __ipip6_bucket(sitn, parms);
+}
+
+/*
+ * Remove @t from its hash chain, if it is on it. The chain is walked
+ * with rtnl_dereference() and the predecessor's link is republished
+ * with rcu_assign_pointer().
+ */
+static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
+{
+	struct ip_tunnel __rcu **link = ipip6_bucket(sitn, t);
+	struct ip_tunnel *cur;
+
+	while ((cur = rtnl_dereference(*link)) != NULL) {
+		if (cur == t) {
+			rcu_assign_pointer(*link, t->next);
+			return;
+		}
+		link = &cur->next;
+	}
+}
+
+/*
+ * Insert @t at the head of its hash chain. t->next is set before the
+ * head pointer is published.
+ */
+static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
+{
+	struct ip_tunnel __rcu **head = ipip6_bucket(sitn, t);
+	struct ip_tunnel *first = rtnl_dereference(*head);
+
+	rcu_assign_pointer(t->next, first);
+	rcu_assign_pointer(*head, t);
+}
+
+/*
+ * Give the tunnel behind @dev its 6rd parameters (only with
+ * CONFIG_IPV6_SIT_6RD; otherwise this does nothing). The fallback
+ * device gets the defaults - prefix 2002::/16 and no relay prefix -
+ * and every other tunnel copies the fallback device's settings.
+ */
+static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
+{
+#ifdef CONFIG_IPV6_SIT_6RD
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel *fb;
+
+	if (t->dev != sitn->fb_tunnel_dev) {
+		fb = netdev_priv(sitn->fb_tunnel_dev);
+		memcpy(&t->ip6rd, &fb->ip6rd, sizeof(t->ip6rd));
+		return;
+	}
+
+	ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
+	t->ip6rd.relay_prefix = 0;
+	t->ip6rd.prefixlen = 16;
+	t->ip6rd.relay_prefixlen = 0;
+#endif
+}
+/*
+ * Find the tunnel whose local address, remote address and link equal
+ * those in @parms, or create it.
+ *
+ * With @create == 0: returns the matching tunnel, or NULL if none.
+ * With @create != 0: returns NULL if a matching tunnel already exists;
+ * otherwise allocates and registers a new device (named parms->name, or
+ * from the "sit%d" pattern when no name is given), links its tunnel
+ * into the hash table and returns it. NULL is also returned when the
+ * allocation, tunnel init or registration fails.
+ *
+ * The chain is walked with rtnl_dereference(), so the caller is
+ * presumably expected to hold RTNL - confirm against the callers.
+ */
+static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
+		struct ip_tunnel_parm *parms, int create)
+{
+	__be32 remote = parms->iph.daddr;
+	__be32 local = parms->iph.saddr;
+	struct ip_tunnel *t, *nt;
+	struct ip_tunnel __rcu **tp;
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+	struct sit_net *sitn = net_generic(net, sit_net_id);
+
+	for (tp = __ipip6_bucket(sitn, parms);
+	    (t = rtnl_dereference(*tp)) != NULL;
+	     tp = &t->next) {
+		if (local == t->parms.iph.saddr &&
+		    remote == t->parms.iph.daddr &&
+		    parms->link == t->parms.link) {
+			if (create)
+				return NULL;	/* refuse to create a duplicate */
+			else
+				return t;
+		}
+	}
+	if (!create)
+		goto failed;
+
+	if (parms->name[0])
+		strlcpy(name, parms->name, IFNAMSIZ);
+	else
+		strcpy(name, "sit%d");
+
+	dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup);
+	if (dev == NULL)
+		return NULL;
+
+	dev_net_set(dev, net);
+
+	nt = netdev_priv(dev);
+
+	nt->parms = *parms;
+	if (ipip6_tunnel_init(dev) < 0)
+		goto failed_free;
+	ipip6_tunnel_clone_6rd(dev, sitn);
+
+	if (parms->i_flags & SIT_ISATAP)
+		dev->priv_flags |= IFF_ISATAP;
+
+	if (register_netdevice(dev) < 0)
+		goto failed_free;
+
+	strcpy(nt->parms.name, dev->name);	/* record the name the device ended up with */
+
+	dev_hold(dev);
+
+	ipip6_tunnel_link(sitn, nt);
+	return nt;
+
+failed_free:
+	ipip6_dev_free(dev);
+failed:
+	return NULL;
+}
+
+/*
+ * Walk a PRL list starting at @start using rcu_dereference().
+ * The iteration variable is not a macro argument: the caller must have a
+ * local pointer named "prl" in scope.  After a complete walk "prl" is NULL.
+ */
+#define for_each_prl_rcu(start)			\
+	for (prl = rcu_dereference(start);	\
+	     prl;				\
+	     prl = rcu_dereference(prl->next))
+
+/*
+ * Return the PRL entry of @t whose address equals @addr, or NULL when the
+ * list holds no such entry.  The list is walked with for_each_prl_rcu().
+ */
+static struct ip_tunnel_prl_entry *
+__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
+{
+	struct ip_tunnel_prl_entry *prl;
+
+	for_each_prl_rcu(t->prl) {
+		if (prl->addr == addr)
+			return prl;
+	}
+
+	return NULL;
+}
+
+/*
+ * Copy PRL entries of tunnel @t to user space.
+ *
+ * @a points to a user struct ip_tunnel_prl header; its datalen field gives
+ * the size of the buffer that follows the header and its addr field is
+ * either INADDR_ANY (dump all entries) or a single address to look up.
+ * Matching entries are written to a + 1 and the number of bytes written is
+ * stored back into a->datalen.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed and
+ * -ENOMEM if no result buffer could be allocated.
+ */
+static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
+				struct ip_tunnel_prl __user *a)
+{
+	struct ip_tunnel_prl kprl, *kp;
+	struct ip_tunnel_prl_entry *prl;
+	unsigned int cmax, c = 0, ca, len;
+	unsigned int room;	/* number of entries kp can hold */
+	int ret = 0;
+
+	if (copy_from_user(&kprl, a, sizeof(kprl)))
+		return -EFAULT;
+	cmax = kprl.datalen / sizeof(kprl);
+	if (cmax > 1 && kprl.addr != htonl(INADDR_ANY))
+		cmax = 1;
+
+	/* For simple GET or for root users,
+	 * we try harder to allocate.
+	 */
+	kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
+		kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
+		NULL;
+	room = cmax;
+
+	rcu_read_lock();
+
+	ca = t->prl_count < cmax ? t->prl_count : cmax;
+
+	if (!kp) {
+		/* We don't try hard to allocate much memory for
+		 * non-root users.
+		 * For root users, retry allocating enough memory for
+		 * the answer.
+		 */
+		kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
+		if (!kp) {
+			/*
+			 * Report the failure; falling through to the copy-out
+			 * code would turn it into a successful empty answer.
+			 */
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+		room = ca;
+	}
+
+	for_each_prl_rcu(t->prl) {
+		/*
+		 * Bound by what was actually allocated: the list may have
+		 * grown since prl_count was sampled above.
+		 */
+		if (c >= room)
+			break;
+		if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr)
+			continue;
+		kp[c].addr = prl->addr;
+		kp[c].flags = prl->flags;
+		c++;
+		if (kprl.addr != htonl(INADDR_ANY))
+			break;
+	}
+	rcu_read_unlock();
+
+	len = sizeof(*kp) * c;
+	if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen))
+		ret = -EFAULT;
+
+	kfree(kp);
+
+	return ret;
+}
+
+/*
+ * Add a PRL entry to tunnel @t, or change the flags of an existing one.
+ *
+ * @a:   address and flags supplied by the caller
+ * @chg: non-zero to update the flags of an existing entry, zero to add
+ *
+ * Returns 0 on success, -EINVAL for INADDR_ANY, -EEXIST when adding an
+ * address that is already listed, -ENXIO when changing an address that is
+ * not listed, and -ENOBUFS when the new entry cannot be allocated.
+ * Asserts that RTNL is held.
+ */
+static int
+ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
+{
+	struct ip_tunnel_prl_entry *p;
+	int err = 0;
+
+	if (a->addr == htonl(INADDR_ANY))
+		return -EINVAL;
+
+	ASSERT_RTNL();
+
+	for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
+		if (p->addr == a->addr) {
+			if (chg) {
+				p->flags = a->flags;
+				goto out;
+			}
+			err = -EEXIST;
+			goto out;
+		}
+	}
+
+	/* Not found: a change request has nothing to act on. */
+	if (chg) {
+		err = -ENXIO;
+		goto out;
+	}
+
+	p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL);
+	if (!p) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	/* Fill the entry completely before publishing it at the list head. */
+	p->next = t->prl;
+	p->addr = a->addr;
+	p->flags = a->flags;
+	t->prl_count++;
+	rcu_assign_pointer(t->prl, p);
+out:
+	return err;
+}
+
+/*
+ * RCU callback: free the PRL entry embedding @head and every entry that
+ * follows it on the list.
+ */
+static void prl_list_destroy_rcu(struct rcu_head *head)
+{
+	struct ip_tunnel_prl_entry *cur, *next;
+
+	cur = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
+	for (;;) {
+		next = rcu_dereference_protected(cur->next, 1);
+		kfree(cur);
+		if (!next)
+			break;
+		cur = next;
+	}
+}
+
+/*
+ * Remove PRL entries from tunnel @t.
+ *
+ * With @a non-NULL and a->addr != INADDR_ANY only the entry for that
+ * address is removed; -ENXIO is returned if it is not listed.  Otherwise
+ * (@a == NULL or INADDR_ANY) the whole list is dropped.  Entries are freed
+ * via RCU.  Returns 0 on success.  Asserts that RTNL is held.
+ */
+static int
+ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
+{
+	struct ip_tunnel_prl_entry *x;
+	struct ip_tunnel_prl_entry __rcu **p;
+	int err = 0;
+
+	ASSERT_RTNL();
+
+	if (a && a->addr != htonl(INADDR_ANY)) {
+		for (p = &t->prl;
+		     (x = rtnl_dereference(*p)) != NULL;
+		     p = &x->next) {
+			if (x->addr == a->addr) {
+				*p = x->next;
+				kfree_rcu(x, rcu_head);
+				t->prl_count--;
+				goto out;
+			}
+		}
+		err = -ENXIO;
+	} else {
+		x = rtnl_dereference(t->prl);
+		if (x) {
+			t->prl_count = 0;
+			/*
+			 * Unpublish the list before queueing the callback.
+			 * A reader that starts after call_rcu() is not covered
+			 * by its grace period, so it must no longer be able to
+			 * find the entries through t->prl.
+			 */
+			t->prl = NULL;
+			call_rcu(&x->rcu_head, prl_list_destroy_rcu);
+		}
+	}
+out:
+	return err;
+}
+
+/*
+ * ISATAP source check for a received packet.
+ *
+ * If the outer IPv4 source is in the tunnel's PRL, the packet is accepted
+ * and skb->ndisc_nodetype is set from the entry's PRL_DEFAULT flag.
+ * Otherwise it is accepted only when the inner IPv6 source is an ISATAP
+ * address embedding the outer IPv4 source and passes ipv6_chk_prefix()
+ * for the tunnel device.  Returns 1 to accept, 0 to reject.
+ */
+static int
+isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t)
+{
+	struct ip_tunnel_prl_entry *entry;
+	const struct in6_addr *src6;
+	int accepted = 1;
+
+	rcu_read_lock();
+	entry = __ipip6_tunnel_locate_prl(t, iph->saddr);
+	if (entry) {
+		if (!(entry->flags & PRL_DEFAULT))
+			skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT;
+		else
+			skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT;
+	} else {
+		src6 = &ipv6_hdr(skb)->saddr;
+
+		if (!ipv6_addr_is_isatap(src6) ||
+		    src6->s6_addr32[3] != iph->saddr ||
+		    !ipv6_chk_prefix(src6, t->dev))
+			accepted = 0;
+		else
+			skb->ndisc_nodetype = NDISC_NODETYPE_HOST;
+	}
+	rcu_read_unlock();
+
+	return accepted;
+}
+
+/*
+ * ndo_uninit handler.  The fallback device is removed from the wildcard
+ * slot; any other tunnel is unlinked from its hash chain and has its PRL
+ * dropped.  Finally one device reference is released.
+ */
+static void ipip6_tunnel_uninit(struct net_device *dev)
+{
+	struct sit_net *sitn = net_generic(dev_net(dev), sit_net_id);
+
+	if (dev != sitn->fb_tunnel_dev) {
+		ipip6_tunnel_unlink(sitn, netdev_priv(dev));
+		ipip6_tunnel_del_prl(netdev_priv(dev), NULL);
+	} else {
+		rcu_assign_pointer(sitn->tunnels_wc[0], NULL);
+	}
+
+	dev_put(dev);
+}
+
+
+/*
+ * ICMP error handler registered in sit_handler.
+ *
+ * Only ICMP_DEST_UNREACH (except source-route-failed, port-unreachable and
+ * frag-needed) and ICMP_TIME_EXCEEDED with code ICMP_EXC_TTL are acted on;
+ * everything else returns 0 immediately.  For the remaining errors the
+ * tunnel is looked up from the addresses of the IPv4 header at skb->data
+ * and the error is recorded in the tunnel's err_count / err_time.
+ *
+ * Returns -ENOENT when no tunnel matches or the matched tunnel has no
+ * remote address configured, 0 otherwise.
+ */
+static int ipip6_err(struct sk_buff *skb, u32 info)
+{
+
+/* All the routers (except for Linux) return only
+   8 bytes of packet payload. It means, that precise relaying of
+   ICMP in the real Internet is absolutely infeasible.
+ */
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct ip_tunnel *t;
+	int err;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return 0;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return 0;
+		case ICMP_FRAG_NEEDED:
+			/* Soft state for pmtu is maintained by IP core. */
+			return 0;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return 0;
+		break;
+	}
+
+	err = -ENOENT;
+
+	rcu_read_lock();
+	/* Lookup uses (daddr, saddr) of the IPv4 header found at skb->data. */
+	t = ipip6_tunnel_lookup(dev_net(skb->dev),
+				skb->dev,
+				iph->daddr,
+				iph->saddr);
+	if (t == NULL || t->parms.iph.daddr == 0)
+		goto out;
+
+	err = 0;
+	/* TTL 0 means the TTL is inherited; time-exceeded is not counted. */
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+		goto out;
+
+	/* Count errors arriving within IPTUNNEL_ERR_TIMEO of the last one. */
+	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
+		t->err_count++;
+	else
+		t->err_count = 1;
+	t->err_time = jiffies;
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * Propagate an ECN "congestion experienced" mark from the outer IPv4
+ * header @iph to the inner IPv6 header of @skb.
+ */
+static inline void ipip6_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
+{
+	if (!INET_ECN_is_ce(iph->tos))
+		return;
+
+	IP6_ECN_set_ce(ipv6_hdr(skb));
+}
+
+/*
+ * Receive handler registered in sit_handler.
+ *
+ * Looks up the tunnel for the outer IPv4 source/destination.  On a match
+ * the skb is re-labelled as an IPv6 packet, optionally checked by
+ * isatap_chksrc(), accounted in the per-cpu stats and handed to
+ * netif_rx().
+ *
+ * Returns 0 when the skb was consumed (delivered or freed) and 1 when no
+ * tunnel matched, in which case the skb is left untouched for the caller.
+ */
+static int ipip6_rcv(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	struct ip_tunnel *tunnel;
+
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		goto out;
+
+	iph = ip_hdr(skb);
+
+	rcu_read_lock();
+	tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
+				     iph->saddr, iph->daddr);
+	if (tunnel != NULL) {
+		struct pcpu_tstats *tstats;
+
+		secpath_reset(skb);
+		/* The old network header offset becomes the MAC header offset. */
+		skb->mac_header = skb->network_header;
+		skb_reset_network_header(skb);
+		IPCB(skb)->flags = 0;
+		skb->protocol = htons(ETH_P_IPV6);
+		skb->pkt_type = PACKET_HOST;
+
+		if ((tunnel->dev->priv_flags & IFF_ISATAP) &&
+		    !isatap_chksrc(skb, iph, tunnel)) {
+			tunnel->dev->stats.rx_errors++;
+			rcu_read_unlock();
+			kfree_skb(skb);
+			return 0;
+		}
+
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		ipip6_ecn_decapsulate(iph, skb);
+
+		netif_rx(skb);
+
+		rcu_read_unlock();
+		return 0;
+	}
+
+	/* no tunnel matched,  let upstream know, ipsec may handle it */
+	rcu_read_unlock();
+	return 1;
+out:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Returns the embedded IPv4 address if the IPv6 address
+ * comes from 6rd / 6to4 (RFC 3056) addr space.
+ *
+ * With CONFIG_IPV6_SIT_6RD the address must match the tunnel's 6rd prefix;
+ * the bits following the prefix are combined with the tunnel's relay
+ * prefix.  Without it, only the fixed 2002::/16 6to4 layout is recognised.
+ * Returns 0 when @v6dst is not in the expected address space.
+ */
+static inline
+__be32 try_6rd(const struct in6_addr *v6dst, struct ip_tunnel *tunnel)
+{
+	__be32 dst = 0;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+	if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
+			      tunnel->ip6rd.prefixlen)) {
+		unsigned int pbw0, pbi0;
+		int pbi1;
+		u32 d;
+
+		/* Word index and bit offset where the 6rd prefix ends. */
+		pbw0 = tunnel->ip6rd.prefixlen >> 5;
+		pbi0 = tunnel->ip6rd.prefixlen & 0x1f;
+
+		/*
+		 * relay_prefixlen may legitimately be 32 (the ioctl accepts
+		 * it); no IPv4 bits are embedded then, and shifting a u32 by
+		 * 32 would be undefined behaviour.
+		 */
+		if (tunnel->ip6rd.relay_prefixlen < 32)
+			d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >>
+			    tunnel->ip6rd.relay_prefixlen;
+		else
+			d = 0;
+
+		/* Remaining bits spill over into the next 32-bit word. */
+		pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen;
+		if (pbi1 > 0)
+			d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >>
+			     (32 - pbi1);
+
+		dst = tunnel->ip6rd.relay_prefix | htonl(d);
+	}
+#else
+	if (v6dst->s6_addr16[0] == htons(0x2002)) {
+		/* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */
+		memcpy(&dst, &v6dst->s6_addr16[1], 4);
+	}
+#endif
+	return dst;
+}
+
+/*
+ *	This function assumes it is being called from dev_queue_xmit()
+ *	and that skb is filled properly by that function.
+ *
+ *	ndo_start_xmit handler: encapsulate an IPv6 packet in IPv4.
+ *	The IPv4 destination is, in order of preference: derived from the
+ *	ISATAP next hop (ISATAP devices), the tunnel's configured remote,
+ *	the address embedded in a 6rd/6to4 destination, or the low 32 bits
+ *	of a v4-compatible next hop / destination.  Always returns
+ *	NETDEV_TX_OK; on error the skb is freed and tx_errors (or
+ *	tx_dropped) is bumped.
+ */
+
+static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct pcpu_tstats *tstats;
+	const struct iphdr  *tiph = &tunnel->parms.iph;
+	const struct ipv6hdr *iph6 = ipv6_hdr(skb);
+	u8     tos = tunnel->parms.iph.tos;
+	__be16 df = tiph->frag_off;
+	struct rtable *rt;     			/* Route to the other host */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *iph;			/* Our new IP header */
+	unsigned int max_headroom;		/* The extra header space needed */
+	__be32 dst = tiph->daddr;
+	struct flowi4 fl4;
+	int    mtu;
+	const struct in6_addr *addr6;
+	int addr_type;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		goto tx_error;
+
+	/* ISATAP (RFC4214) - must come before 6to4 */
+	if (dev->priv_flags & IFF_ISATAP) {
+		struct neighbour *neigh = NULL;
+
+		if (skb_dst(skb))
+			neigh = dst_get_neighbour(skb_dst(skb));
+
+		if (neigh == NULL) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "sit: nexthop == NULL\n");
+			goto tx_error;
+		}
+
+		addr6 = (const struct in6_addr*)&neigh->primary_key;
+		addr_type = ipv6_addr_type(addr6);
+
+		/* The IPv4 address is the last 32 bits of an ISATAP next hop. */
+		if ((addr_type & IPV6_ADDR_UNICAST) &&
+		     ipv6_addr_is_isatap(addr6))
+			dst = addr6->s6_addr32[3];
+		else
+			goto tx_error;
+	}
+
+	if (!dst)
+		dst = try_6rd(&iph6->daddr, tunnel);
+
+	/* Last resort: a v4-compatible next hop or destination address. */
+	if (!dst) {
+		struct neighbour *neigh = NULL;
+
+		if (skb_dst(skb))
+			neigh = dst_get_neighbour(skb_dst(skb));
+
+		if (neigh == NULL) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "sit: nexthop == NULL\n");
+			goto tx_error;
+		}
+
+		addr6 = (const struct in6_addr*)&neigh->primary_key;
+		addr_type = ipv6_addr_type(addr6);
+
+		if (addr_type == IPV6_ADDR_ANY) {
+			addr6 = &ipv6_hdr(skb)->daddr;
+			addr_type = ipv6_addr_type(addr6);
+		}
+
+		if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+			goto tx_error_icmp;
+
+		dst = addr6->s6_addr32[3];
+	}
+
+	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
+				   dst, tiph->saddr,
+				   0, 0,
+				   IPPROTO_IPV6, RT_TOS(tos),
+				   tunnel->parms.link);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+	if (rt->rt_type != RTN_UNICAST) {
+		ip_rt_put(rt);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error_icmp;
+	}
+	tdev = rt->dst.dev;
+
+	/* Routing the outer packet back into this tunnel would loop. */
+	if (tdev == dev) {
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	/* Path MTU handling only applies when DF is set on the tunnel. */
+	if (df) {
+		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+
+		if (mtu < 68) {
+			dev->stats.collisions++;
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+
+		/* Below the IPv6 minimum MTU: clear DF and allow IPv4 fragmentation. */
+		if (mtu < IPV6_MIN_MTU) {
+			mtu = IPV6_MIN_MTU;
+			df = 0;
+		}
+
+		if (tunnel->parms.iph.daddr && skb_dst(skb))
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+
+		if (skb->len > mtu) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+
+	/* Report errors recorded by ipip6_err() back to the sender. */
+	if (tunnel->err_count > 0) {
+		if (time_before(jiffies,
+				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
+			tunnel->err_count--;
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr);
+
+	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
+	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
+		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			dev->stats.tx_dropped++;
+			dev_kfree_skb(skb);
+			return NETDEV_TX_OK;
+		}
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		dev_kfree_skb(skb);
+		skb = new_skb;
+		/* The old header pointer died with the old skb. */
+		iph6 = ipv6_hdr(skb);
+	}
+
+	skb->transport_header = skb->network_header;
+	skb_push(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags = 0;
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+
+	iph 			=	ip_hdr(skb);
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPV6;
+	iph->tos		=	INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
+	iph->daddr		=	fl4.daddr;
+	iph->saddr		=	fl4.saddr;
+
+	/* A configured TTL of 0 means: inherit the inner hop limit. */
+	if ((iph->ttl = tiph->ttl) == 0)
+		iph->ttl	=	iph6->hop_limit;
+
+	nf_reset(skb);
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
+	return NETDEV_TX_OK;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev->stats.tx_errors++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Derive device properties from the tunnel parameters.
+ *
+ * If a remote address is configured, the device is marked point-to-point
+ * and the underlying device is taken from a route lookup towards it.
+ * Failing that, the device named by parms.link is used.  When an
+ * underlying device is found, hard_header_len and mtu are derived from
+ * it, with the mtu clamped to at least IPV6_MIN_MTU.  iflink is always
+ * set to parms.link.
+ */
+static void ipip6_tunnel_bind_dev(struct net_device *dev)
+{
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel;
+	const struct iphdr *iph;
+	struct flowi4 fl4;
+
+	tunnel = netdev_priv(dev);
+	iph = &tunnel->parms.iph;
+
+	if (iph->daddr) {
+		struct rtable *rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
+							  iph->daddr, iph->saddr,
+							  0, 0,
+							  IPPROTO_IPV6,
+							  RT_TOS(iph->tos),
+							  tunnel->parms.link);
+
+		if (!IS_ERR(rt)) {
+			tdev = rt->dst.dev;
+			ip_rt_put(rt);
+		}
+		dev->flags |= IFF_POINTOPOINT;
+	}
+
+	if (!tdev && tunnel->parms.link)
+		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+
+	if (tdev) {
+		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+		dev->mtu = tdev->mtu - sizeof(struct iphdr);
+		if (dev->mtu < IPV6_MIN_MTU)
+			dev->mtu = IPV6_MIN_MTU;
+	}
+	dev->iflink = tunnel->parms.link;
+}
+
+/*
+ * ndo_do_ioctl handler for sit devices.
+ *
+ * Handles tunnel get/add/change/delete (SIOC*TUNNEL), PRL get/add/
+ * delete/change (SIOC*PRL) and, with CONFIG_IPV6_SIT_6RD, 6rd parameter
+ * get/add/change/delete (SIOC*6RD).  The argument is read from and
+ * written to ifr->ifr_ifru.ifru_data.  All modifying commands require
+ * CAP_NET_ADMIN.  Returns 0 or a negative errno; unknown commands yield
+ * -EINVAL.
+ */
+static int
+ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	int err = 0;
+	struct ip_tunnel_parm p;
+	struct ip_tunnel_prl prl;
+	struct ip_tunnel *t;
+	struct net *net = dev_net(dev);
+	struct sit_net *sitn = net_generic(net, sit_net_id);
+#ifdef CONFIG_IPV6_SIT_6RD
+	struct ip_tunnel_6rd ip6rd;
+#endif
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+#ifdef CONFIG_IPV6_SIT_6RD
+	case SIOCGET6RD:
+#endif
+		t = NULL;
+		/* On the fallback device the request selects a tunnel by parameters. */
+		if (dev == sitn->fb_tunnel_dev) {
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = ipip6_tunnel_locate(net, &p, 0);
+		}
+		if (t == NULL)
+			t = netdev_priv(dev);
+
+		err = -EFAULT;
+		if (cmd == SIOCGETTUNNEL) {
+			memcpy(&p, &t->parms, sizeof(p));
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
+					 sizeof(p)))
+				goto done;
+#ifdef CONFIG_IPV6_SIT_6RD
+		} else {
+			ipv6_addr_copy(&ip6rd.prefix, &t->ip6rd.prefix);
+			ip6rd.relay_prefix = t->ip6rd.relay_prefix;
+			ip6rd.prefixlen = t->ip6rd.prefixlen;
+			ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd,
+					 sizeof(ip6rd)))
+				goto done;
+#endif
+		}
+		err = 0;
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			goto done;
+
+		/* Only a plain IPv4 header carrying IPv6 is acceptable. */
+		err = -EINVAL;
+		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 ||
+		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+			goto done;
+		/* A fixed TTL forces DF on the outer header. */
+		if (p.iph.ttl)
+			p.iph.frag_off |= htons(IP_DF);
+
+		t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
+
+		if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				/* The new endpoints already belong to another tunnel. */
+				if (t->dev != dev) {
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				/* Point-to-point-ness of an existing device cannot change. */
+				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+					err = -EINVAL;
+					break;
+				}
+				/*
+				 * Re-hash: unlink, wait via synchronize_net(),
+				 * update the addresses, then link again.
+				 */
+				t = netdev_priv(dev);
+				ipip6_tunnel_unlink(sitn, t);
+				synchronize_net();
+				t->parms.iph.saddr = p.iph.saddr;
+				t->parms.iph.daddr = p.iph.daddr;
+				memcpy(dev->dev_addr, &p.iph.saddr, 4);
+				memcpy(dev->broadcast, &p.iph.daddr, 4);
+				ipip6_tunnel_link(sitn, t);
+				netdev_state_change(dev);
+			}
+		}
+
+		if (t) {
+			err = 0;
+			if (cmd == SIOCCHGTUNNEL) {
+				t->parms.iph.ttl = p.iph.ttl;
+				t->parms.iph.tos = p.iph.tos;
+				if (t->parms.link != p.link) {
+					t->parms.link = p.link;
+					ipip6_tunnel_bind_dev(dev);
+					netdev_state_change(dev);
+				}
+			}
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+				err = -EFAULT;
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == sitn->fb_tunnel_dev) {
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+				goto done;
+			err = -ENOENT;
+			if ((t = ipip6_tunnel_locate(net, &p, 0)) == NULL)
+				goto done;
+			/* The fallback device itself cannot be deleted this way. */
+			err = -EPERM;
+			if (t == netdev_priv(sitn->fb_tunnel_dev))
+				goto done;
+			dev = t->dev;
+		}
+		unregister_netdevice(dev);
+		err = 0;
+		break;
+
+	case SIOCGETPRL:
+		err = -EINVAL;
+		if (dev == sitn->fb_tunnel_dev)
+			goto done;
+		err = -ENOENT;
+		if (!(t = netdev_priv(dev)))
+			goto done;
+		err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
+		break;
+
+	case SIOCADDPRL:
+	case SIOCDELPRL:
+	case SIOCCHGPRL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+		err = -EINVAL;
+		if (dev == sitn->fb_tunnel_dev)
+			goto done;
+		err = -EFAULT;
+		if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
+			goto done;
+		err = -ENOENT;
+		if (!(t = netdev_priv(dev)))
+			goto done;
+
+		switch (cmd) {
+		case SIOCDELPRL:
+			err = ipip6_tunnel_del_prl(t, &prl);
+			break;
+		case SIOCADDPRL:
+		case SIOCCHGPRL:
+			err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
+			break;
+		}
+		/* Note: the notification is sent even when the PRL call failed. */
+		netdev_state_change(dev);
+		break;
+
+#ifdef CONFIG_IPV6_SIT_6RD
+	case SIOCADD6RD:
+	case SIOCCHG6RD:
+	case SIOCDEL6RD:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data,
+				   sizeof(ip6rd)))
+			goto done;
+
+		t = netdev_priv(dev);
+
+		if (cmd != SIOCDEL6RD) {
+			struct in6_addr prefix;
+			__be32 relay_prefix;
+
+			/*
+			 * The 6rd prefix plus the embedded IPv4 bits must fit
+			 * into the upper 64 bits of the IPv6 address.
+			 */
+			err = -EINVAL;
+			if (ip6rd.relay_prefixlen > 32 ||
+			    ip6rd.prefixlen + (32 - ip6rd.relay_prefixlen) > 64)
+				goto done;
+
+			/* Reject prefixes with bits set beyond their length. */
+			ipv6_addr_prefix(&prefix, &ip6rd.prefix,
+					 ip6rd.prefixlen);
+			if (!ipv6_addr_equal(&prefix, &ip6rd.prefix))
+				goto done;
+			if (ip6rd.relay_prefixlen)
+				relay_prefix = ip6rd.relay_prefix &
+					       htonl(0xffffffffUL <<
+						     (32 - ip6rd.relay_prefixlen));
+			else
+				relay_prefix = 0;
+			if (relay_prefix != ip6rd.relay_prefix)
+				goto done;
+
+			ipv6_addr_copy(&t->ip6rd.prefix, &prefix);
+			t->ip6rd.relay_prefix = relay_prefix;
+			t->ip6rd.prefixlen = ip6rd.prefixlen;
+			t->ip6rd.relay_prefixlen = ip6rd.relay_prefixlen;
+		} else
+			/* Delete means: fall back to the defaults. */
+			ipip6_tunnel_clone_6rd(dev, sitn);
+
+		err = 0;
+		break;
+#endif
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+
+/*
+ * ndo_change_mtu handler.  Accepts values from IPV6_MIN_MTU up to
+ * 0xFFF8 minus the IPv4 header size; anything else yields -EINVAL.
+ */
+static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+	const size_t max_mtu = 0xFFF8 - sizeof(struct iphdr);
+
+	if (new_mtu < IPV6_MIN_MTU)
+		return -EINVAL;
+	if (new_mtu > max_mtu)
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/* Device operations installed on every sit device by ipip6_tunnel_setup(). */
+static const struct net_device_ops ipip6_netdev_ops = {
+	.ndo_uninit	= ipip6_tunnel_uninit,
+	.ndo_start_xmit	= ipip6_tunnel_xmit,
+	.ndo_do_ioctl	= ipip6_tunnel_ioctl,
+	.ndo_change_mtu	= ipip6_tunnel_change_mtu,
+	.ndo_get_stats	= ipip6_get_stats,
+};
+
+/*
+ * Device destructor: release the per-cpu statistics, then the device
+ * itself.  Also used directly on setup error paths.
+ */
+static void ipip6_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
+/*
+ * alloc_netdev() setup callback: install the sit operations and the
+ * default link-level properties of a freshly allocated device.
+ */
+static void ipip6_tunnel_setup(struct net_device *dev)
+{
+	/* Operations and teardown. */
+	dev->netdev_ops = &ipip6_netdev_ops;
+	dev->destructor = ipip6_dev_free;
+
+	/* Link-level description. */
+	dev->type = ARPHRD_SIT;
+	dev->addr_len = 4;
+	dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
+	dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr);
+	dev->iflink = 0;
+
+	/* Flags and features. */
+	dev->flags = IFF_NOARP;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX;
+}
+
+/*
+ * Initialise a regular (non-fallback) tunnel device from the parameters
+ * already stored in its private area: back-pointer, device/broadcast
+ * address, binding to the underlying device and per-cpu statistics.
+ * Returns 0, or -ENOMEM if the statistics cannot be allocated.
+ */
+static int ipip6_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+
+	/* The 4-byte "hardware" addresses are the tunnel's IPv4 endpoints. */
+	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+	ipip6_tunnel_bind_dev(dev);
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Initialise the per-namespace fallback device: fill in a default outer
+ * IPv4 header template (TTL 64), allocate per-cpu statistics, take a
+ * device reference and publish the tunnel in the wildcard slot.
+ * Returns 0, or -ENOMEM if the statistics cannot be allocated.
+ */
+static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct iphdr *iph = &tunnel->parms.iph;
+	struct net *net = dev_net(dev);
+	struct sit_net *sitn = net_generic(net, sit_net_id);
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	iph->version		= 4;
+	iph->protocol		= IPPROTO_IPV6;
+	iph->ihl		= 5;
+	iph->ttl		= 64;
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+	/* This reference is dropped again in ipip6_tunnel_uninit(). */
+	dev_hold(dev);
+	rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
+	return 0;
+}
+
+/*
+ * Receive and ICMP-error hooks; registered with xfrm4_tunnel_register()
+ * for AF_INET6 in sit_init().
+ */
+static struct xfrm_tunnel sit_handler __read_mostly = {
+	.handler	=	ipip6_rcv,
+	.err_handler	=	ipip6_err,
+	.priority	=	1,
+};
+
+/*
+ * Queue every tunnel found in hash tables 1..3 of @sitn for
+ * unregistration on @head.  Table 0 (the wildcard slot holding the
+ * fallback device) is not walked here.
+ */
+static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head)
+{
+	struct ip_tunnel *t;
+	int prio, h;
+
+	for (prio = 1; prio < 4; prio++) {
+		for (h = 0; h < HASH_SIZE; h++) {
+			for (t = rtnl_dereference(sitn->tunnels[prio][h]);
+			     t != NULL;
+			     t = rtnl_dereference(t->next))
+				unregister_netdevice_queue(t->dev, head);
+		}
+	}
+}
+
+/*
+ * Per-namespace init: wire up the four hash tables and create, initialise
+ * and register the fallback device "sit0".
+ * Returns 0 or a negative errno; on failure the device is freed again.
+ */
+static int __net_init sit_init_net(struct net *net)
+{
+	struct sit_net *sitn = net_generic(net, sit_net_id);
+	struct ip_tunnel *t;
+	int err;
+
+	sitn->tunnels[0] = sitn->tunnels_wc;
+	sitn->tunnels[1] = sitn->tunnels_l;
+	sitn->tunnels[2] = sitn->tunnels_r;
+	sitn->tunnels[3] = sitn->tunnels_r_l;
+
+	sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
+					   ipip6_tunnel_setup);
+	if (!sitn->fb_tunnel_dev) {
+		err = -ENOMEM;
+		goto err_alloc_dev;
+	}
+	dev_net_set(sitn->fb_tunnel_dev, net);
+
+	err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+	if (err)
+		goto err_dev_free;
+
+	ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
+
+	if ((err = register_netdev(sitn->fb_tunnel_dev)))
+		goto err_reg_dev;
+
+	t = netdev_priv(sitn->fb_tunnel_dev);
+
+	/* Registration may have changed the name; keep parms in sync. */
+	strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
+	return 0;
+
+err_reg_dev:
+	/* Undo the dev_hold() taken in ipip6_fb_tunnel_init(). */
+	dev_put(sitn->fb_tunnel_dev);
+err_dev_free:
+	ipip6_dev_free(sitn->fb_tunnel_dev);
+err_alloc_dev:
+	return err;
+}
+
+/*
+ * Per-namespace exit: under RTNL, queue all tunnels and the fallback
+ * device on one list and unregister them in a single batch.
+ */
+static void __net_exit sit_exit_net(struct net *net)
+{
+	struct sit_net *sitn = net_generic(net, sit_net_id);
+	LIST_HEAD(list);
+
+	rtnl_lock();
+	sit_destroy_tunnels(sitn, &list);
+	unregister_netdevice_queue(sitn->fb_tunnel_dev, &list);
+	unregister_netdevice_many(&list);
+	rtnl_unlock();
+}
+
+/* Per-namespace hooks; .id and .size describe the struct sit_net area. */
+static struct pernet_operations sit_net_ops = {
+	.init = sit_init_net,
+	.exit = sit_exit_net,
+	.id   = &sit_net_id,
+	.size = sizeof(struct sit_net),
+};
+
+/*
+ * Module exit: deregister the receive handler first, then the
+ * per-namespace operations, and finally wait for outstanding RCU
+ * callbacks.
+ */
+static void __exit sit_cleanup(void)
+{
+	xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
+
+	unregister_pernet_device(&sit_net_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+/*
+ * Module init: register the per-namespace operations, then the receive
+ * handler.  If the handler cannot be registered the per-namespace
+ * operations are unregistered again.  Returns the result of the last
+ * registration attempted.
+ */
+static int __init sit_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n");
+
+	err = register_pernet_device(&sit_net_ops);
+	if (err < 0)
+		return err;
+
+	err = xfrm4_tunnel_register(&sit_handler, AF_INET6);
+	if (err >= 0)
+		return err;
+
+	unregister_pernet_device(&sit_net_ops);
+	printk(KERN_INFO "sit init: Can't add protocol\n");
+	return err;
+}
+
+module_init(sit_init);
+module_exit(sit_cleanup);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETDEV("sit0");
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
new file mode 100644
index 00000000..14b83395
--- /dev/null
+++ b/net/ipv6/syncookies.c
@@ -0,0 +1,268 @@
+/*
+ *  IPv6 Syncookies implementation for the Linux kernel
+ *
+ *  Authors:
+ *  Glenn Griffin	<ggriffin.kernel@gmail.com>
+ *
+ *  Based on IPv4 implementation by Andi Kleen
+ *  linux/net/ipv4/syncookies.c
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/tcp.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+
+extern int sysctl_tcp_syncookies;
+extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
+
+#define COOKIEBITS 24	/* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+/*
+ * MSS values that can be encoded in a cookie; the index into this table
+ * is what gets stored.  Table must be sorted (ascending):
+ * cookie_v6_init_sequence() scans it from the top for the largest entry
+ * not exceeding the peer's MSS.
+ */
+static __u16 const msstab[] = {
+	64,
+	512,
+	536,
+	1280 - 60,
+	1480 - 60,
+	1500 - 60,
+	4460 - 60,
+	9000 - 60,
+};
+
+/*
+ * This (misnamed) value is the age of syncookie which is permitted.
+ * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1. It's a rather complicated formula (exponential
+ * backoff) to compute at runtime so it's currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+
+/*
+ * Create the child socket for a validated cookie via the address
+ * family's syn_recv_sock() hook.  On success the request is added to the
+ * listener's accept queue; on failure the request is freed.
+ * Returns the child socket or NULL.
+ */
+static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+					   struct request_sock *req,
+					   struct dst_entry *dst)
+{
+	struct sock *child;
+
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+	if (!child) {
+		reqsk_free(req);
+		return child;
+	}
+
+	inet_csk_reqsk_queue_add(sk, req, child);
+	return child;
+}
+
+/*
+ * Per-cpu work area for cookie_hash(): 16 words of input block, 5 words
+ * of digest and the SHA workspace.
+ */
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
+		      ipv6_cookie_scratch);
+
+/*
+ * Hash the connection 4-tuple and @count with secret number @c (index
+ * into syncookie_secret) and return one word of the resulting digest.
+ * Uses this CPU's scratch buffer.
+ */
+static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
+		       __be16 sport, __be16 dport, u32 count, int c)
+{
+	__u32 *tmp = __get_cpu_var(ipv6_cookie_scratch);
+
+	/*
+	 * we have 320 bits of information to hash, copy in the remaining
+	 * 192 bits required for sha_transform, from the syncookie_secret
+	 * and overwrite the digest with the secret
+	 */
+	memcpy(tmp + 10, syncookie_secret[c], 44);
+	memcpy(tmp, saddr, 16);
+	memcpy(tmp + 4, daddr, 16);
+	tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
+	tmp[9] = count;
+	/* digest at tmp + 16, input block at tmp, workspace at tmp + 16 + 5 */
+	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+	return tmp[17];
+}
+
+/*
+ * Build a cookie: a count-independent hash plus the peer's sequence
+ * number, with @count in the bits above COOKIEBITS and a count-dependent
+ * hash plus @data folded into the low COOKIEBITS bits.
+ */
+static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
+				   const struct in6_addr *daddr,
+				   __be16 sport, __be16 dport, __u32 sseq,
+				   __u32 count, __u32 data)
+{
+	__u32 base = cookie_hash(saddr, daddr, sport, dport, 0, 0);
+	__u32 payload = cookie_hash(saddr, daddr, sport, dport, count, 1) + data;
+
+	return base + sseq + (count << COOKIEBITS) + (payload & COOKIEMASK);
+}
+
+/*
+ * Reverse secure_tcp_syn_cookie().  Returns the data value stored in
+ * @cookie, or (__u32)-1 when the counter embedded in the cookie is
+ * @maxdiff or more ticks behind @count.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr,
+				  const struct in6_addr *daddr, __be16 sport,
+				  __be16 dport, __u32 sseq, __u32 count,
+				  __u32 maxdiff)
+{
+	__u32 age;
+
+	/* Strip the count-independent part. */
+	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+	/* The upper bits now hold the counter value used at creation time. */
+	age = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+	if (age < maxdiff)
+		return (cookie -
+			cookie_hash(saddr, daddr, sport, dport, count - age, 1))
+			& COOKIEMASK;
+
+	return (__u32)-1;
+}
+
+/*
+ * Generate the initial sequence number (cookie) for the SYN in @skb.
+ * *@mssp is rounded down to the nearest msstab[] entry (or the smallest
+ * entry if it is below all of them); the index of that entry is encoded
+ * in the cookie together with a minute-granularity counter.
+ */
+__u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+	const __u16 mss = *mssp;
+	int mssind;
+
+	tcp_synq_overflow(sk);
+
+	mssind = ARRAY_SIZE(msstab) - 1;
+	while (mssind && mss < msstab[mssind])
+		mssind--;
+
+	*mssp = msstab[mssind];
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+
+	return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source,
+				     th->dest, ntohl(th->seq),
+				     jiffies / (HZ * 60), mssind);
+}
+
+/*
+ * Validate @cookie against the ACK in @skb.  Returns the MSS encoded in
+ * the cookie, or 0 when the cookie is invalid or too old.
+ */
+static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct tcphdr *th = tcp_hdr(skb);
+	__u32 seq = ntohl(th->seq) - 1;
+	__u32 mssind;
+
+	mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
+				      th->source, th->dest, seq,
+				      jiffies / (HZ * 60), COUNTER_TRIES);
+	if (mssind >= ARRAY_SIZE(msstab))
+		return 0;
+
+	return msstab[mssind];
+}
+
+/*
+ * Try to complete a handshake from the syncookie carried in the ACK @skb.
+ *
+ * Returns @sk unchanged when syncookies do not apply to this packet
+ * (disabled, not a plain ACK, no recent overflow, invalid cookie or
+ * unacceptable timestamp options); NULL when the cookie was valid but a
+ * request, route or child socket could not be set up; otherwise the
+ * newly created child socket.
+ */
+struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_options_received tcp_opt;
+	u8 *hash_location;
+	struct inet_request_sock *ireq;
+	struct inet6_request_sock *ireq6;
+	struct tcp_request_sock *treq;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcphdr *th = tcp_hdr(skb);
+	/* Our SYN-ACK carried the cookie as ISN; the peer acks cookie + 1. */
+	__u32 cookie = ntohl(th->ack_seq) - 1;
+	struct sock *ret = sk;
+	struct request_sock *req;
+	int mss;
+	struct dst_entry *dst;
+	__u8 rcv_wscale;
+	bool ecn_ok = false;
+
+	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+		goto out;
+
+	if (tcp_synq_no_recent_overflow(sk) ||
+		(mss = cookie_check(skb, cookie)) == 0) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+		goto out;
+	}
+
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+
+	/* check for timestamp cookie support */
+	memset(&tcp_opt, 0, sizeof(tcp_opt));
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+
+	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+		goto out;
+
+	/* From here on every failure returns NULL rather than sk. */
+	ret = NULL;
+	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
+	if (!req)
+		goto out;
+
+	ireq = inet_rsk(req);
+	ireq6 = inet6_rsk(req);
+	treq = tcp_rsk(req);
+
+	if (security_inet_conn_request(sk, skb, req))
+		goto out_free;
+
+	req->mss = mss;
+	ireq->rmt_port = th->source;
+	ireq->loc_port = th->dest;
+	ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
+	ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
+	/* Keep the skb (with an extra reference) when its options are wanted. */
+	if (ipv6_opt_accepted(sk, skb) ||
+	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+	    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+		atomic_inc(&skb->users);
+		ireq6->pktopts = skb;
+	}
+
+	ireq6->iif = sk->sk_bound_dev_if;
+	/* So that link locals have meaning */
+	if (!sk->sk_bound_dev_if &&
+	    ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq6->iif = inet6_iif(skb);
+
+	req->expires = 0UL;
+	req->retrans = 0;
+	ireq->ecn_ok		= ecn_ok;
+	ireq->snd_wscale	= tcp_opt.snd_wscale;
+	ireq->sack_ok		= tcp_opt.sack_ok;
+	ireq->wscale_ok		= tcp_opt.wscale_ok;
+	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
+	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+	treq->rcv_isn = ntohl(th->seq) - 1;
+	treq->snt_isn = cookie;
+
+	/*
+	 * We need to lookup the dst_entry to get the correct window size.
+	 * This is taken from tcp_v6_syn_recv_sock.  Somebody please enlighten
+	 * me if there is a preferred way.
+	 */
+	{
+		struct in6_addr *final_p, final;
+		struct flowi6 fl6;
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_proto = IPPROTO_TCP;
+		ipv6_addr_copy(&fl6.daddr, &ireq6->rmt_addr);
+		final_p = fl6_update_dst(&fl6, np->opt, &final);
+		ipv6_addr_copy(&fl6.saddr, &ireq6->loc_addr);
+		fl6.flowi6_oif = sk->sk_bound_dev_if;
+		fl6.flowi6_mark = sk->sk_mark;
+		fl6.fl6_dport = inet_rsk(req)->rmt_port;
+		fl6.fl6_sport = inet_sk(sk)->inet_sport;
+		security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+		if (IS_ERR(dst))
+			goto out_free;
+	}
+
+	/* The socket's own clamp wins; otherwise use the route metric. */
+	req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
+	tcp_select_initial_window(tcp_full_space(sk), req->mss,
+				  &req->rcv_wnd, &req->window_clamp,
+				  ireq->wscale_ok, &rcv_wscale,
+				  dst_metric(dst, RTAX_INITRWND));
+
+	ireq->rcv_wscale = rcv_wscale;
+
+	/* get_cookie_sock() frees req itself when no child can be created. */
+	ret = get_cookie_sock(sk, skb, req, dst);
+out:
+	return ret;
+out_free:
+	reqsk_free(req);
+	return NULL;
+}
+
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
new file mode 100644
index 00000000..6dcf5e7d
--- /dev/null
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -0,0 +1,176 @@
+/*
+ * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem.
+ *
+ * Changes:
+ * YOSHIFUJI Hideaki @USAGI:	added icmp sysctl table.
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/slab.h>
+#include <net/ndisc.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/inet_frag.h>
+
+static struct ctl_table empty[1];
+
+static ctl_table ipv6_static_skeleton[] = {
+	{
+		.procname	= "neigh",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= empty,
+	},
+	{ }
+};
+
+static ctl_table ipv6_table_template[] = {
+	{
+		.procname	= "route",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ipv6_route_table_template
+	},
+	{
+		.procname	= "icmp",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ipv6_icmp_table_template
+	},
+	{
+		.procname	= "bindv6only",
+		.data		= &init_net.ipv6.sysctl.bindv6only,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }
+};
+
+static ctl_table ipv6_rotable[] = {
+	{
+		.procname	= "mld_max_msf",
+		.data		= &sysctl_mld_max_msf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }
+};
+
+struct ctl_path net_ipv6_ctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ipv6", },
+	{ },
+};
+EXPORT_SYMBOL_GPL(net_ipv6_ctl_path);
+
+static int __net_init ipv6_sysctl_net_init(struct net *net)
+{
+	struct ctl_table *ipv6_table;
+	struct ctl_table *ipv6_route_table;
+	struct ctl_table *ipv6_icmp_table;
+	int err;
+	/* Build this namespace's private copy of ipv6_table_template and register it. */
+	err = -ENOMEM;	/* every failure path below reports -ENOMEM */
+	ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
+			     GFP_KERNEL);
+	if (!ipv6_table)
+		goto out;
+
+	ipv6_route_table = ipv6_route_sysctl_init(net);
+	if (!ipv6_route_table)
+		goto out_ipv6_table;
+	ipv6_table[0].child = ipv6_route_table;	/* index 0 is the "route" entry */
+
+	ipv6_icmp_table = ipv6_icmp_sysctl_init(net);
+	if (!ipv6_icmp_table)
+		goto out_ipv6_route_table;
+	ipv6_table[1].child = ipv6_icmp_table;	/* index 1 is the "icmp" entry */
+
+	ipv6_table[2].data = &net->ipv6.sysctl.bindv6only;	/* "bindv6only": this net's value */
+
+	net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path,
+							   ipv6_table);
+	if (!net->ipv6.sysctl.table)
+		goto out_ipv6_icmp_table;
+
+	err = 0;
+out:
+	return err;
+	/* Error unwinding: the labels fall through, freeing in reverse order of allocation. */
+out_ipv6_icmp_table:
+	kfree(ipv6_icmp_table);
+out_ipv6_route_table:
+	kfree(ipv6_route_table);
+out_ipv6_table:
+	kfree(ipv6_table);
+	goto out;
+}
+
+static void __net_exit ipv6_sysctl_net_exit(struct net *net)
+{
+	struct ctl_table_header *hdr = net->ipv6.sysctl.table;
+	struct ctl_table *top = hdr->ctl_table_arg;
+	struct ctl_table *route = top[0].child;
+	struct ctl_table *icmp = top[1].child;
+
+	/*
+	 * The table pointers are read out above, before the header is
+	 * unregistered; the three tables are only released afterwards.
+	 */
+	unregister_net_sysctl_table(hdr);
+	kfree(top);
+	kfree(route);
+	kfree(icmp);
+}
+
+static struct pernet_operations ipv6_sysctl_net_ops = {
+	.init = ipv6_sysctl_net_init,
+	.exit = ipv6_sysctl_net_exit,
+};
+
+static struct ctl_table_header *ip6_header;
+
+/*
+ * Register ipv6_rotable under net/ipv6, then the per-namespace sysctl ops.
+ * Returns 0, -ENOMEM if the table registration fails, or the error from
+ * register_pernet_subsys() (the table is unregistered again in that case).
+ */
+int ipv6_sysctl_register(void)
+{
+	int rc;
+
+	ip6_header = register_net_sysctl_rotable(net_ipv6_ctl_path, ipv6_rotable);
+	if (!ip6_header)
+		return -ENOMEM;
+
+	rc = register_pernet_subsys(&ipv6_sysctl_net_ops);
+	if (rc)
+		unregister_net_sysctl_table(ip6_header);
+	return rc;
+}
+
+void ipv6_sysctl_unregister(void)
+{
+	unregister_net_sysctl_table(ip6_header);
+	unregister_pernet_subsys(&ipv6_sysctl_net_ops);
+}
+
+static struct ctl_table_header *ip6_base;
+
+/* Register the static "neigh" skeleton under net/ipv6; returns 0 or -ENOMEM. */
+int ipv6_static_sysctl_register(void)
+{
+	ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton);
+
+	return ip6_base ? 0 : -ENOMEM;
+}
+
+void ipv6_static_sysctl_unregister(void)
+{
+	unregister_net_sysctl_table(ip6_base);	/* NOTE(review): ip6_base came from register_sysctl_paths() - confirm this pairing */
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
new file mode 100644
index 00000000..848f9634
--- /dev/null
+++ b/net/ipv6/tcp_ipv6.c
@@ -0,0 +1,2321 @@
+/*
+ *	TCP over IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on:
+ *	linux/net/ipv4/tcp.c
+ *	linux/net/ipv4/tcp_input.c
+ *	linux/net/ipv4/tcp_output.c
+ *
+ *	Fixes:
+ *	Hideaki YOSHIFUJI	:	sin6_scope_id support
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ *	YOSHIFUJI Hideaki @USAGI:	convert /proc/net/tcp6 to seq_file.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/bottom_half.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/jiffies.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
+#include <linux/ipsec.h>
+#include <linux/times.h>
+#include <linux/slab.h>
+
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+
+#include <net/tcp.h>
+#include <net/ndisc.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet6_connection_sock.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/ip6_checksum.h>
+#include <net/inet_ecn.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+#include <net/snmp.h>
+#include <net/dsfield.h>
+#include <net/timewait_sock.h>
+#include <net/netdma.h>
+#include <net/inet_common.h>
+#include <net/secure_seq.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+static void	tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
+static void	tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+				      struct request_sock *req);
+
+static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+static void	__tcp_v6_send_check(struct sk_buff *skb,
+				    const struct in6_addr *saddr,
+				    const struct in6_addr *daddr);
+
+static const struct inet_connection_sock_af_ops ipv6_mapped;
+static const struct inet_connection_sock_af_ops ipv6_specific;
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
+static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
+#else
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+						   const struct in6_addr *addr)
+{
+	return NULL;
+}
+#endif
+
+/* Hash @sk unless it is closed; sockets on the ipv6_mapped ops use tcp_prot.hash. */
+static void tcp_v6_hash(struct sock *sk)
+{
+	if (sk->sk_state == TCP_CLOSE)
+		return;
+	if (inet_csk(sk)->icsk_af_ops != &ipv6_mapped) {
+		local_bh_disable();
+		__inet6_hash(sk, NULL);
+		local_bh_enable();
+	} else
+		tcp_prot.hash(sk);
+}
+
+static __inline__ __sum16 tcp_v6_check(int len,
+				   const struct in6_addr *saddr,
+				   const struct in6_addr *daddr,
+				   __wsum base)
+{
+	return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
+}
+
+static __u32 tcp_v6_init_sequence(struct sk_buff *skb)
+{
+	return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
+					    ipv6_hdr(skb)->saddr.s6_addr32,
+					    tcp_hdr(skb)->dest,
+					    tcp_hdr(skb)->source);
+}
+
+static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+			  int addr_len)
+{
+	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct in6_addr *saddr = NULL, *final_p, final;
+	struct rt6_info *rt;
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	int addr_type;
+	int err;
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	if (usin->sin6_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	memset(&fl6, 0, sizeof(fl6));
+
+	if (np->sndflow) {
+		fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+		IP6_ECN_flow_init(fl6.flowlabel);
+		if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+			struct ip6_flowlabel *flowlabel;
+			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+			if (flowlabel == NULL)
+				return -EINVAL;
+			ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
+			fl6_sock_release(flowlabel);
+		}
+	}
+
+	/*
+	 *	connect() to INADDR_ANY means loopback (BSD'ism).
+	 */
+
+	if(ipv6_addr_any(&usin->sin6_addr))
+		usin->sin6_addr.s6_addr[15] = 0x1;
+
+	addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+	if(addr_type & IPV6_ADDR_MULTICAST)
+		return -ENETUNREACH;
+
+	if (addr_type&IPV6_ADDR_LINKLOCAL) {
+		if (addr_len >= sizeof(struct sockaddr_in6) &&
+		    usin->sin6_scope_id) {
+			/* If interface is set while binding, indices
+			 * must coincide.
+			 */
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != usin->sin6_scope_id)
+				return -EINVAL;
+
+			sk->sk_bound_dev_if = usin->sin6_scope_id;
+		}
+
+		/* Connect to link-local address requires an interface */
+		if (!sk->sk_bound_dev_if)
+			return -EINVAL;
+	}
+
+	if (tp->rx_opt.ts_recent_stamp &&
+	    !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) {
+		tp->rx_opt.ts_recent = 0;
+		tp->rx_opt.ts_recent_stamp = 0;
+		tp->write_seq = 0;
+	}
+
+	ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
+	np->flow_label = fl6.flowlabel;
+
+	/*
+	 *	TCP over IPv4
+	 */
+
+	if (addr_type == IPV6_ADDR_MAPPED) {
+		u32 exthdrlen = icsk->icsk_ext_hdr_len;
+		struct sockaddr_in sin;
+
+		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+		if (__ipv6_only_sock(sk))
+			return -ENETUNREACH;
+
+		sin.sin_family = AF_INET;
+		sin.sin_port = usin->sin6_port;
+		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+		icsk->icsk_af_ops = &ipv6_mapped;
+		sk->sk_backlog_rcv = tcp_v4_do_rcv;
+#ifdef CONFIG_TCP_MD5SIG
+		tp->af_specific = &tcp_sock_ipv6_mapped_specific;
+#endif
+
+		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+
+		if (err) {
+			icsk->icsk_ext_hdr_len = exthdrlen;
+			icsk->icsk_af_ops = &ipv6_specific;
+			sk->sk_backlog_rcv = tcp_v6_do_rcv;
+#ifdef CONFIG_TCP_MD5SIG
+			tp->af_specific = &tcp_sock_ipv6_specific;
+#endif
+			goto failure;
+		} else {
+			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
+					       &np->rcv_saddr);
+		}
+
+		return err;
+	}
+
+	if (!ipv6_addr_any(&np->rcv_saddr))
+		saddr = &np->rcv_saddr;
+
+	fl6.flowi6_proto = IPPROTO_TCP;
+	ipv6_addr_copy(&fl6.daddr, &np->daddr);
+	ipv6_addr_copy(&fl6.saddr,
+		       (saddr ? saddr : &np->saddr));
+	fl6.flowi6_oif = sk->sk_bound_dev_if;
+	fl6.flowi6_mark = sk->sk_mark;
+	fl6.fl6_dport = usin->sin6_port;
+	fl6.fl6_sport = inet->inet_sport;
+
+	final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		goto failure;
+	}
+
+	if (saddr == NULL) {
+		saddr = &fl6.saddr;
+		ipv6_addr_copy(&np->rcv_saddr, saddr);
+	}
+
+	/* set the source address */
+	ipv6_addr_copy(&np->saddr, saddr);
+	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+	sk->sk_gso_type = SKB_GSO_TCPV6;
+	__ip6_dst_store(sk, dst, NULL, NULL);
+
+	rt = (struct rt6_info *) dst;
+	if (tcp_death_row.sysctl_tw_recycle &&
+	    !tp->rx_opt.ts_recent_stamp &&
+	    ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) {
+		struct inet_peer *peer = rt6_get_peer(rt);
+		/*
+		 * VJ's idea. We save last timestamp seen from
+		 * the destination in peer table, when entering state
+		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
+		 * when trying new connection.
+		 */
+		if (peer) {
+			inet_peer_refcheck(peer);
+			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
+				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+				tp->rx_opt.ts_recent = peer->tcp_ts;
+			}
+		}
+	}
+
+	icsk->icsk_ext_hdr_len = 0;
+	if (np->opt)
+		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+					  np->opt->opt_nflen);
+
+	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+
+	inet->inet_dport = usin->sin6_port;
+
+	tcp_set_state(sk, TCP_SYN_SENT);
+	err = inet6_hash_connect(&tcp_death_row, sk);
+	if (err)
+		goto late_failure;
+
+	if (!tp->write_seq)
+		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
+							     np->daddr.s6_addr32,
+							     inet->inet_sport,
+							     inet->inet_dport);
+
+	err = tcp_connect(sk);
+	if (err)
+		goto late_failure;
+
+	return 0;
+
+late_failure:
+	tcp_set_state(sk, TCP_CLOSE);
+	__sk_dst_reset(sk);
+failure:
+	inet->inet_dport = 0;
+	sk->sk_route_caps = 0;
+	return err;
+}
+
+static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		u8 type, u8 code, int offset, __be32 info)
+{
+	const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data;
+	const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+	struct ipv6_pinfo *np;
+	struct sock *sk;
+	int err;
+	struct tcp_sock *tp;
+	__u32 seq;
+	struct net *net = dev_net(skb->dev);
+	/* ICMPv6 error handler for TCP: find the socket from the headers at skb->data. */
+	sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr,
+			th->dest, &hdr->saddr, th->source, skb->dev->ifindex);
+
+	if (sk == NULL) {
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
+		return;
+	}
+
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		inet_twsk_put(inet_twsk(sk));
+		return;
+	}
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);	/* counted only; processing continues */
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) {
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+		goto out;
+	}
+
+	tp = tcp_sk(sk);
+	seq = ntohl(th->seq);
+	if (sk->sk_state != TCP_LISTEN &&
+	    !between(seq, tp->snd_una, tp->snd_nxt)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	np = inet6_sk(sk);
+
+	if (type == ICMPV6_PKT_TOOBIG) {
+		struct dst_entry *dst;
+
+		if (sock_owned_by_user(sk))
+			goto out;
+		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+			goto out;
+
+		/* icmp should have updated the destination cache entry */
+		dst = __sk_dst_check(sk, np->dst_cookie);
+
+		if (dst == NULL) {
+			struct inet_sock *inet = inet_sk(sk);
+			struct flowi6 fl6;
+
+			/* BUGGG_FUTURE: Again, it is not clear how
+			   to handle rthdr case. Ignore this complexity
+			   for now.
+			 */
+			memset(&fl6, 0, sizeof(fl6));
+			fl6.flowi6_proto = IPPROTO_TCP;
+			ipv6_addr_copy(&fl6.daddr, &np->daddr);
+			ipv6_addr_copy(&fl6.saddr, &np->saddr);
+			fl6.flowi6_oif = sk->sk_bound_dev_if;
+			fl6.flowi6_mark = sk->sk_mark;
+			fl6.fl6_dport = inet->inet_dport;
+			fl6.fl6_sport = inet->inet_sport;
+			security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+			dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
+			if (IS_ERR(dst)) {
+				sk->sk_err_soft = -PTR_ERR(dst);
+				goto out;
+			}
+
+		} else
+			dst_hold(dst);
+
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+			tcp_sync_mss(sk, dst_mtu(dst));
+			tcp_simple_retransmit(sk);
+		} /* else let the usual retransmit timer handle it */
+		dst_release(dst);	/* pairs with dst_hold() or the lookup above */
+		goto out;
+	}
+
+	icmpv6_err_convert(type, code, &err);
+
+	/* Might be for a request_sock */
+	switch (sk->sk_state) {
+		struct request_sock *req, **prev;
+	case TCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+
+		req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
+					   &hdr->saddr, inet6_iif(skb));
+		if (!req)
+			goto out;
+
+		/* ICMPs are not backlogged, hence we cannot get
+		 * an established socket here.
+		 */
+		WARN_ON(req->sk != NULL);
+
+		if (seq != tcp_rsk(req)->snt_isn) {
+			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:  /* Cannot happen.
+			       It can, if SYNs are crossed. --ANK */
+		if (!sock_owned_by_user(sk)) {
+			sk->sk_err = err;
+			sk->sk_error_report(sk);		/* Wake people up to see the error (see connect in sock.c) */
+
+			tcp_done(sk);
+		} else
+			sk->sk_err_soft = err;
+		goto out;
+	}
+
+	if (!sock_owned_by_user(sk) && np->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else
+		sk->sk_err_soft = err;
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+
+/*
+ * Route and transmit a SYN-ACK for @req on behalf of listener @sk.
+ * Returns 0 or a negative errno: the route lookup error, -ENOMEM when no
+ * segment could be built, or the net_xmit_eval()'d result of ip6_xmit().
+ */
+static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
+			      struct request_values *rvp)
+{
+	struct inet6_request_sock *treq = inet6_rsk(req);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_txoptions *opt = np->opt;
+	struct in6_addr *final_p, final;
+	struct sk_buff *skb;
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	int err = -ENOMEM;
+
+	memset(&fl6, 0, sizeof(fl6));	/* this also leaves fl6.flowlabel at 0 */
+	fl6.flowi6_proto = IPPROTO_TCP;
+	ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr);
+	ipv6_addr_copy(&fl6.saddr, &treq->loc_addr);
+	fl6.flowi6_oif = treq->iif;
+	fl6.flowi6_mark = sk->sk_mark;
+	fl6.fl6_dport = inet_rsk(req)->rmt_port;
+	fl6.fl6_sport = inet_rsk(req)->loc_port;
+	security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+
+	final_p = fl6_update_dst(&fl6, opt, &final);
+
+	dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
+	if (IS_ERR(dst))
+		return PTR_ERR(dst);
+
+	skb = tcp_make_synack(sk, dst, req, rvp);
+	if (skb) {
+		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
+
+		/* presumably fl6_update_dst() may rewrite daddr; put the peer back */
+		ipv6_addr_copy(&fl6.daddr, &treq->rmt_addr);
+		err = ip6_xmit(sk, skb, &fl6, opt);
+		err = net_xmit_eval(err);
+	}
+
+	/* opt is always the listener's own np->opt, so it is never freed here;
+	 * only the route reference is dropped. */
+	dst_release(dst);
+	return err;
+}
+
+static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
+			     struct request_values *rvp)
+{
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+	return tcp_v6_send_synack(sk, req, rvp);
+}
+
+static inline void syn_flood_warning(struct sk_buff *skb)
+{
+#ifdef CONFIG_SYN_COOKIES
+	if (sysctl_tcp_syncookies)
+		printk(KERN_INFO
+		       "TCPv6: Possible SYN flooding on port %d. "
+		       "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest));
+	else
+#endif
+		printk(KERN_INFO
+		       "TCPv6: Possible SYN flooding on port %d. "
+		       "Dropping request.\n", ntohs(tcp_hdr(skb)->dest));
+}
+
+static void tcp_v6_reqsk_destructor(struct request_sock *req)
+{
+	kfree_skb(inet6_rsk(req)->pktopts);
+}
+
+#ifdef CONFIG_TCP_MD5SIG
+/* Return the MD5 key stored for @addr on @sk, or NULL when none matches. */
+static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
+						   const struct in6_addr *addr)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_info *info;
+	int idx;
+
+	BUG_ON(tp == NULL);
+	info = tp->md5sig_info;
+	if (!info || !info->entries6)
+		return NULL;
+	for (idx = 0; idx < info->entries6; idx++)
+		if (ipv6_addr_equal(&info->keys6[idx].addr, addr))
+			return &info->keys6[idx].base;
+	return NULL;
+}
+
+static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk,
+						struct sock *addr_sk)
+{
+	return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr);
+}
+
+static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk,
+						      struct request_sock *req)
+{
+	return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr);
+}
+
+static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer,
+			     char *newkey, u8 newkeylen)
+{
+	/* Add key to the list */
+	struct tcp_md5sig_key *key;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp6_md5sig_key *keys;
+
+	key = tcp_v6_md5_do_lookup(sk, peer);
+	if (key) {
+		/* modify existing entry - just update that one */
+		kfree(key->key);
+		key->key = newkey;
+		key->keylen = newkeylen;
+	} else {
+		/* reallocate new list if current one is full. */
+		if (!tp->md5sig_info) {
+			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC);
+			if (!tp->md5sig_info) {
+				kfree(newkey);
+				return -ENOMEM;
+			}
+			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+		}
+		if (tp->md5sig_info->entries6 == 0 &&
+			tcp_alloc_md5sig_pool(sk) == NULL) {
+			kfree(newkey);
+			return -ENOMEM;
+		}
+		if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) {
+			keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) *
+				       (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC);
+
+			if (!keys) {
+				kfree(newkey);
+				if (tp->md5sig_info->entries6 == 0)
+					tcp_free_md5sig_pool();
+				return -ENOMEM;
+			}
+
+			if (tp->md5sig_info->entries6)
+				memmove(keys, tp->md5sig_info->keys6,
+					(sizeof (tp->md5sig_info->keys6[0]) *
+					 tp->md5sig_info->entries6));
+
+			kfree(tp->md5sig_info->keys6);
+			tp->md5sig_info->keys6 = keys;
+			tp->md5sig_info->alloced6++;
+		}
+
+		ipv6_addr_copy(&tp->md5sig_info->keys6[tp->md5sig_info->entries6].addr,
+			       peer);
+		tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.key = newkey;
+		tp->md5sig_info->keys6[tp->md5sig_info->entries6].base.keylen = newkeylen;
+
+		tp->md5sig_info->entries6++;
+	}
+	return 0;
+}
+
+static int tcp_v6_md5_add_func(struct sock *sk, struct sock *addr_sk,
+			       u8 *newkey, __u8 newkeylen)
+{
+	return tcp_v6_md5_do_add(sk, &inet6_sk(addr_sk)->daddr,
+				 newkey, newkeylen);
+}
+
+/* Drop the MD5 key stored for @peer; returns 0, or -ENOENT if there is none. */
+static int tcp_v6_md5_do_del(struct sock *sk, const struct in6_addr *peer)
+{
+	struct tcp_md5sig_info *info = tcp_sk(sk)->md5sig_info;
+	int i;
+
+	for (i = 0; i < info->entries6; i++) {
+		if (!ipv6_addr_equal(&info->keys6[i].addr, peer))
+			continue;
+
+		/* Free the key */
+		kfree(info->keys6[i].base.key);
+		info->entries6--;
+
+		if (info->entries6 == 0) {
+			/* that was the last entry: give up the whole array */
+			kfree(info->keys6);
+			info->keys6 = NULL;
+			info->alloced6 = 0;
+			tcp_free_md5sig_pool();
+		} else if (info->entries6 != i) {
+			/* shrink the database */
+			memmove(&info->keys6[i], &info->keys6[i + 1],
+				(info->entries6 - i) * sizeof(info->keys6[0]));
+		}
+		return 0;
+	}
+	return -ENOENT;
+}
+
+/* Free every IPv6 and IPv4 MD5 key held by @sk and reset both key arrays. */
+static void tcp_v6_clear_md5_list(struct sock *sk)
+{
+	struct tcp_md5sig_info *info = tcp_sk(sk)->md5sig_info;
+	int n;
+
+	if (info->entries6) {
+		for (n = 0; n < info->entries6; n++)
+			kfree(info->keys6[n].base.key);
+		info->entries6 = 0;
+		tcp_free_md5sig_pool();
+	}
+	kfree(info->keys6);
+	info->keys6 = NULL;
+	info->alloced6 = 0;
+
+	/* same teardown for the IPv4 key array (keys4) */
+	if (info->entries4) {
+		for (n = 0; n < info->entries4; n++)
+			kfree(info->keys4[n].base.key);
+		info->entries4 = 0;
+		tcp_free_md5sig_pool();
+	}
+	kfree(info->keys4);
+	info->keys4 = NULL;
+	info->alloced4 = 0;
+}
+
+static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval,
+				  int optlen)
+{
+	struct tcp_md5sig cmd;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
+	u8 *newkey;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	if (copy_from_user(&cmd, optval, sizeof(cmd)))
+		return -EFAULT;
+
+	if (sin6->sin6_family != AF_INET6)
+		return -EINVAL;
+
+	if (!cmd.tcpm_keylen) {
+		if (!tcp_sk(sk)->md5sig_info)
+			return -ENOENT;
+		if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+			return tcp_v4_md5_do_del(sk, sin6->sin6_addr.s6_addr32[3]);
+		return tcp_v6_md5_do_del(sk, &sin6->sin6_addr);
+	}
+
+	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
+		return -EINVAL;
+
+	if (!tcp_sk(sk)->md5sig_info) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct tcp_md5sig_info *p;
+
+		p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+
+		tp->md5sig_info = p;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+	}
+
+	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
+	if (!newkey)
+		return -ENOMEM;
+	if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+		return tcp_v4_md5_do_add(sk, sin6->sin6_addr.s6_addr32[3],
+					 newkey, cmd.tcpm_keylen);
+	}
+	return tcp_v6_md5_do_add(sk, &sin6->sin6_addr, newkey, cmd.tcpm_keylen);
+}
+
+static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
+					const struct in6_addr *daddr,
+					const struct in6_addr *saddr, int nbytes)
+{
+	struct tcp6_pseudohdr *bp;
+	struct scatterlist sg;
+
+	bp = &hp->md5_blk.ip6;
+	/* 1. TCP pseudo-header (RFC2460) */
+	ipv6_addr_copy(&bp->saddr, saddr);
+	ipv6_addr_copy(&bp->daddr, daddr);
+	bp->protocol = cpu_to_be32(IPPROTO_TCP);
+	bp->len = cpu_to_be32(nbytes);
+
+	sg_init_one(&sg, bp, sizeof(*bp));
+	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+}
+
+static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
+			       const struct in6_addr *daddr, struct in6_addr *saddr,
+			       struct tcphdr *th)
+{
+	struct tcp_md5sig_pool *hp;
+	struct hash_desc *desc;
+
+	hp = tcp_get_md5sig_pool();
+	if (!hp)
+		goto clear_hash_noput;
+	desc = &hp->md5_desc;
+
+	if (crypto_hash_init(desc))
+		goto clear_hash;
+	if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
+		goto clear_hash;
+	if (tcp_md5_hash_header(hp, th))
+		goto clear_hash;
+	if (tcp_md5_hash_key(hp, key))
+		goto clear_hash;
+	if (crypto_hash_final(desc, md5_hash))
+		goto clear_hash;
+
+	tcp_put_md5sig_pool();
+	return 0;
+
+clear_hash:
+	tcp_put_md5sig_pool();
+clear_hash_noput:
+	memset(md5_hash, 0, 16);
+	return 1;
+}
+
+static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
+			       struct sock *sk, struct request_sock *req,
+			       struct sk_buff *skb)
+{
+	const struct in6_addr *saddr, *daddr;
+	struct tcp_md5sig_pool *hp;
+	struct hash_desc *desc;
+	struct tcphdr *th = tcp_hdr(skb);
+
+	if (sk) {
+		saddr = &inet6_sk(sk)->saddr;
+		daddr = &inet6_sk(sk)->daddr;
+	} else if (req) {
+		saddr = &inet6_rsk(req)->loc_addr;
+		daddr = &inet6_rsk(req)->rmt_addr;
+	} else {
+		const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+		saddr = &ip6h->saddr;
+		daddr = &ip6h->daddr;
+	}
+
+	hp = tcp_get_md5sig_pool();
+	if (!hp)
+		goto clear_hash_noput;
+	desc = &hp->md5_desc;
+
+	if (crypto_hash_init(desc))
+		goto clear_hash;
+
+	if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
+		goto clear_hash;
+	if (tcp_md5_hash_header(hp, th))
+		goto clear_hash;
+	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
+		goto clear_hash;
+	if (tcp_md5_hash_key(hp, key))
+		goto clear_hash;
+	if (crypto_hash_final(desc, md5_hash))
+		goto clear_hash;
+
+	tcp_put_md5sig_pool();
+	return 0;
+
+clear_hash:
+	tcp_put_md5sig_pool();
+clear_hash_noput:
+	memset(md5_hash, 0, 16);
+	return 1;
+}
+
+static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb)
+{
+	__u8 *hash_location = NULL;
+	struct tcp_md5sig_key *hash_expected;
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct tcphdr *th = tcp_hdr(skb);
+	int genhash;
+	u8 newhash[16];
+
+	hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr);
+	hash_location = tcp_parse_md5sig_option(th);
+
+	/* We've parsed the options - do we have a hash? */
+	if (!hash_expected && !hash_location)
+		return 0;
+
+	if (hash_expected && !hash_location) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+		return 1;
+	}
+
+	if (!hash_expected && hash_location) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+		return 1;
+	}
+
+	/* check the signature */
+	genhash = tcp_v6_md5_hash_skb(newhash,
+				      hash_expected,
+				      NULL, NULL, skb);
+
+	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
+		if (net_ratelimit()) {
+			printk(KERN_INFO "MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
+			       genhash ? "failed" : "mismatch",
+			       &ip6h->saddr, ntohs(th->source),
+			       &ip6h->daddr, ntohs(th->dest));
+		}
+		return 1;
+	}
+	return 0;
+}
+#endif
+
+struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
+	.family		=	AF_INET6,
+	.obj_size	=	sizeof(struct tcp6_request_sock),
+	.rtx_syn_ack	=	tcp_v6_rtx_synack,
+	.send_ack	=	tcp_v6_reqsk_send_ack,
+	.destructor	=	tcp_v6_reqsk_destructor,
+	.send_reset	=	tcp_v6_send_reset,
+	.syn_ack_timeout = 	tcp_syn_ack_timeout,
+};
+
+#ifdef CONFIG_TCP_MD5SIG
+static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+	.md5_lookup	=	tcp_v6_reqsk_md5_lookup,
+	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
+};
+#endif
+
+static void __tcp_v6_send_check(struct sk_buff *skb,
+				const struct in6_addr *saddr, const struct in6_addr *daddr)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct tcphdr, check);
+	} else {
+		th->check = tcp_v6_check(skb->len, saddr, daddr,
+					 csum_partial(th, th->doff << 2,
+						      skb->csum));
+	}
+}
+
+static void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	__tcp_v6_send_check(skb, &np->saddr, &np->daddr);
+}
+
+static int tcp_v6_gso_send_check(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ipv6h;
+	struct tcphdr *th;
+
+	if (!pskb_may_pull(skb, sizeof(*th)))
+		return -EINVAL;
+
+	ipv6h = ipv6_hdr(skb);
+	th = tcp_hdr(skb);
+
+	th->check = 0;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	__tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr);
+	return 0;
+}
+
+static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = skb_gro_network_header(skb);
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr,
+				  skb->csum)) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		}
+
+		/* fall through */
+	case CHECKSUM_NONE:
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	return tcp_gro_receive(head, skb);
+}
+
+static int tcp6_gro_complete(struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr *th = tcp_hdr(skb);
+
+	th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb),
+				  &iph->saddr, &iph->daddr, 0);
+	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+
+	return tcp_gro_complete(skb);
+}
+
+static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
+				 u32 ts, struct tcp_md5sig_key *key, int rst)
+{
+	struct tcphdr *th = tcp_hdr(skb), *t1;
+	struct sk_buff *buff;
+	struct flowi6 fl6;
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct sock *ctl_sk = net->ipv6.tcp_sk;
+	unsigned int tot_len = sizeof(struct tcphdr);
+	struct dst_entry *dst;
+	__be32 *topt;
+	/* Build a bare ACK or RST answering @skb; tot_len grows with each option added. */
+	if (ts)
+		tot_len += TCPOLEN_TSTAMP_ALIGNED;
+#ifdef CONFIG_TCP_MD5SIG
+	if (key)
+		tot_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
+			 GFP_ATOMIC);
+	if (buff == NULL)
+		return;
+
+	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
+
+	t1 = (struct tcphdr *) skb_push(buff, tot_len);
+	skb_reset_transport_header(buff);
+
+	/* Swap the send and the receive. */
+	memset(t1, 0, sizeof(*t1));
+	t1->dest = th->source;
+	t1->source = th->dest;
+	t1->doff = tot_len / 4;
+	t1->seq = htonl(seq);
+	t1->ack_seq = htonl(ack);
+	t1->ack = !rst || !th->ack;	/* RSTs carry ACK only when the incoming segment had none */
+	t1->rst = rst;
+	t1->window = htons(win);
+
+	topt = (__be32 *)(t1 + 1);	/* options start right after the fixed header */
+
+	if (ts) {
+		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+		*topt++ = htonl(tcp_time_stamp);
+		*topt++ = htonl(ts);
+	}
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (key) {
+		*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+		tcp_v6_md5_hash_hdr((__u8 *)topt, key,
+				    &ipv6_hdr(skb)->saddr,	/* reply's destination */
+				    &ipv6_hdr(skb)->daddr, t1);	/* reply's source */
+	}
+#endif
+
+	memset(&fl6, 0, sizeof(fl6));
+	ipv6_addr_copy(&fl6.daddr, &ipv6_hdr(skb)->saddr);
+	ipv6_addr_copy(&fl6.saddr, &ipv6_hdr(skb)->daddr);
+
+	buff->ip_summed = CHECKSUM_PARTIAL;
+	buff->csum = 0;
+
+	__tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
+
+	fl6.flowi6_proto = IPPROTO_TCP;
+	fl6.flowi6_oif = inet6_iif(skb);
+	fl6.fl6_dport = t1->dest;
+	fl6.fl6_sport = t1->source;
+	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+
+	/* Pass a socket to ip6_dst_lookup_flow() even when this is an RST:
+	 * the underlying function uses it to retrieve the network
+	 * namespace.
+	 */
+	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
+	if (!IS_ERR(dst)) {
+		skb_dst_set(buff, dst);
+		ip6_xmit(ctl_sk, buff, &fl6, NULL);
+		TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+		if (rst)
+			TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+		return;
+	}
+
+	kfree_skb(buff);	/* no route: the reply is dropped */
+}
+
+/*
+ * Answer @skb with a RST.
+ *
+ * Nothing is sent in reply to a segment that is itself a RST, or to one
+ * whose destination is not unicast.  @sk may be NULL; when it is set (and
+ * CONFIG_TCP_MD5SIG is enabled) the MD5 key for the peer is looked up so
+ * the reset can be signed.
+ */
+static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *hdr = tcp_hdr(skb);
+	struct tcp_md5sig_key *md5 = NULL;
+	u32 rst_seq, rst_ack;
+
+	if (hdr->rst || !ipv6_unicast_destination(skb))
+		return;
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (sk != NULL)
+		md5 = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr);
+#endif
+
+	if (hdr->ack) {
+		/* Take our sequence number from the peer's ACK field. */
+		rst_seq = ntohl(hdr->ack_seq);
+		rst_ack = 0;
+	} else {
+		/* Acknowledge everything the offending segment occupied. */
+		rst_seq = 0;
+		rst_ack = ntohl(hdr->seq) + hdr->syn + hdr->fin + skb->len -
+			  (hdr->doff << 2);
+	}
+
+	tcp_v6_send_response(skb, rst_seq, rst_ack, 0, 0, md5, 1);
+}
+
+/*
+ * Send a bare ACK in reply to @skb.  Thin wrapper that calls
+ * tcp_v6_send_response() with the RST argument set to 0; all other
+ * arguments are passed through unchanged.
+ */
+static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts,
+			    struct tcp_md5sig_key *key)
+{
+	tcp_v6_send_response(skb, seq, ack, win, ts, key, 0);
+}
+
+/*
+ * ACK a segment on behalf of the timewait bucket @sk, using the sequence
+ * state, scaled receive window, last timestamp and MD5 key stored in the
+ * bucket.  Drops one reference on the timewait bucket before returning.
+ */
+static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	u32 scaled_win = tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale;
+
+	tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, scaled_win,
+			tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw));
+
+	inet_twsk_put(tw);
+}
+
+/*
+ * ACK an incoming segment on behalf of the pending connection request
+ * @req, using the sequence numbers, receive window and last timestamp
+ * recorded in the request.
+ *
+ * MD5 keys are stored per peer address, and the peer is the *source* of
+ * the segment being answered, so the key is looked up with the skb's
+ * saddr (the same address tcp_v6_send_reset() uses).  Looking it up with
+ * daddr, our own address, picks the wrong key or none at all.
+ */
+static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+				  struct request_sock *req)
+{
+	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent,
+			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr));
+}
+
+
+/*
+ * Find the socket that should process @skb, which arrived on listening
+ * socket @sk.
+ *
+ * Lookup order:
+ *   1. a pending connection request matching the segment: the result of
+ *      tcp_check_req() is returned;
+ *   2. an entry in the established hash: returned with bh_lock_sock()
+ *      held, unless it is in TIME_WAIT, in which case the reference is
+ *      dropped and NULL is returned;
+ *   3. otherwise @sk itself; with CONFIG_SYN_COOKIES a non-SYN segment is
+ *      first passed to cookie_v6_check(), whose result is returned.
+ */
+static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
+{
+	struct request_sock *req, **prev;
+	const struct tcphdr *th = tcp_hdr(skb);
+	struct sock *nsk;
+
+	/* Find possible connection requests. */
+	req = inet6_csk_search_req(sk, &prev, th->source,
+				   &ipv6_hdr(skb)->saddr,
+				   &ipv6_hdr(skb)->daddr, inet6_iif(skb));
+	if (req)
+		return tcp_check_req(sk, skb, req, prev);
+
+	/* No request pending: look for an already established socket. */
+	nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
+			&ipv6_hdr(skb)->saddr, th->source,
+			&ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb));
+
+	if (nsk) {
+		if (nsk->sk_state != TCP_TIME_WAIT) {
+			/* Returned locked to the caller. */
+			bh_lock_sock(nsk);
+			return nsk;
+		}
+		/* Timewait match: drop the lookup reference, discard. */
+		inet_twsk_put(inet_twsk(nsk));
+		return NULL;
+	}
+
+#ifdef CONFIG_SYN_COOKIES
+	if (!th->syn)
+		sk = cookie_v6_check(sk, skb);
+#endif
+	return sk;
+}
+
+/* FIXME: this is substantially similar to the ipv4 code.
+ * Can some kind of merge be done? -- erics
+ */
+/*
+ * Handle a connection request (SYN) received on listening socket @sk.
+ *
+ * IPv4 packets on a dual-stack socket are handed to tcp_v4_conn_request().
+ * Otherwise a request sock is allocated, the TCP options (including TCP
+ * Cookie Transactions) are parsed, an initial sequence number is chosen
+ * (possibly a SYN cookie), a SYN-ACK is sent and the request is hashed
+ * into the listener's SYN queue.
+ *
+ * The route looked up for the tw_recycle peer check is only needed inside
+ * this function (the SYN-ACK path is not handed the dst), so the reference
+ * is released on every exit path, not only on the drop path.
+ *
+ * Always returns 0 on the IPv6 path, so the caller never sends a reset.
+ */
+static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_extend_values tmp_ext;
+	struct tcp_options_received tmp_opt;
+	u8 *hash_location;
+	struct request_sock *req;
+	struct inet6_request_sock *treq;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 isn = TCP_SKB_CB(skb)->when;
+	struct dst_entry *dst = NULL;
+#ifdef CONFIG_SYN_COOKIES
+	int want_cookie = 0;
+#else
+#define want_cookie 0
+#endif
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return tcp_v4_conn_request(sk, skb);
+
+	if (!ipv6_unicast_destination(skb))
+		goto drop;
+
+	/* SYN queue full: fall back to SYN cookies if enabled, else drop. */
+	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+		if (net_ratelimit())
+			syn_flood_warning(skb);
+#ifdef CONFIG_SYN_COOKIES
+		if (sysctl_tcp_syncookies)
+			want_cookie = 1;
+		else
+#endif
+		goto drop;
+	}
+
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+		goto drop;
+
+	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
+	if (req == NULL)
+		goto drop;
+
+#ifdef CONFIG_TCP_MD5SIG
+	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
+#endif
+
+	tcp_clear_options(&tmp_opt);
+	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+	tmp_opt.user_mss = tp->rx_opt.user_mss;
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+	if (tmp_opt.cookie_plus > 0 &&
+	    tmp_opt.saw_tstamp &&
+	    !tp->rx_opt.cookie_out_never &&
+	    (sysctl_tcp_cookie_size > 0 ||
+	     (tp->cookie_values != NULL &&
+	      tp->cookie_values->cookie_desired > 0))) {
+		u8 *c;
+		u32 *d;
+		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
+		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
+
+		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
+			goto drop_and_free;
+
+		/* Secret recipe starts with IP addresses */
+		d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0];
+		*mess++ ^= *d++;
+		*mess++ ^= *d++;
+		*mess++ ^= *d++;
+		*mess++ ^= *d++;
+		d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0];
+		*mess++ ^= *d++;
+		*mess++ ^= *d++;
+		*mess++ ^= *d++;
+		*mess++ ^= *d++;
+
+		/* plus variable length Initiator Cookie */
+		c = (u8 *)mess;
+		while (l-- > 0)
+			*c++ ^= *hash_location++;
+
+#ifdef CONFIG_SYN_COOKIES
+		want_cookie = 0;	/* not our kind of cookie */
+#endif
+		tmp_ext.cookie_out_never = 0; /* false */
+		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
+	} else if (!tp->rx_opt.cookie_in_always) {
+		/* redundant indications, but ensure initialization. */
+		tmp_ext.cookie_out_never = 1; /* true */
+		tmp_ext.cookie_plus = 0;
+	} else {
+		goto drop_and_free;
+	}
+	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
+
+	if (want_cookie && !tmp_opt.saw_tstamp)
+		tcp_clear_options(&tmp_opt);
+
+	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+	tcp_openreq_init(req, &tmp_opt, skb);
+
+	treq = inet6_rsk(req);
+	ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr);
+	ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr);
+	if (!want_cookie || tmp_opt.tstamp_ok)
+		TCP_ECN_create_request(req, tcp_hdr(skb));
+
+	if (!isn) {
+		struct inet_peer *peer = NULL;
+
+		/* Keep the SYN around if the user asked for packet options. */
+		if (ipv6_opt_accepted(sk, skb) ||
+		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+			atomic_inc(&skb->users);
+			treq->pktopts = skb;
+		}
+		treq->iif = sk->sk_bound_dev_if;
+
+		/* So that link locals have meaning */
+		if (!sk->sk_bound_dev_if &&
+		    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+			treq->iif = inet6_iif(skb);
+
+		if (want_cookie) {
+			isn = cookie_v6_init_sequence(sk, skb, &req->mss);
+			req->cookie_ts = tmp_opt.tstamp_ok;
+			goto have_isn;
+		}
+
+		/* VJ's idea. We save last timestamp seen
+		 * from the destination in peer table, when entering
+		 * state TIME-WAIT, and check against it before
+		 * accepting new connection request.
+		 *
+		 * If "isn" is not zero, this request hit alive
+		 * timewait bucket, so that all the necessary checks
+		 * are made in the function processing timewait state.
+		 */
+		if (tmp_opt.saw_tstamp &&
+		    tcp_death_row.sysctl_tw_recycle &&
+		    (dst = inet6_csk_route_req(sk, req)) != NULL &&
+		    (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL &&
+		    ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6,
+				    &treq->rmt_addr)) {
+			inet_peer_refcheck(peer);
+			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
+			    (s32)(peer->tcp_ts - req->ts_recent) >
+							TCP_PAWS_WINDOW) {
+				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+				goto drop_and_release;
+			}
+		}
+		/* Kill the following clause, if you dislike this way. */
+		else if (!sysctl_tcp_syncookies &&
+			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (sysctl_max_syn_backlog >> 2)) &&
+			 (!peer || !peer->tcp_ts_stamp) &&
+			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			/* Without syncookies last quarter of
+			 * backlog is filled with destinations,
+			 * proven to be alive.
+			 * It means that we continue to communicate
+			 * to destinations, already remembered
+			 * to the moment of synflood.
+			 */
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
+				       &treq->rmt_addr, ntohs(tcp_hdr(skb)->source));
+			goto drop_and_release;
+		}
+
+		isn = tcp_v6_init_sequence(skb);
+	}
+have_isn:
+	tcp_rsk(req)->snt_isn = isn;
+
+	security_inet_conn_request(sk, skb, req);
+
+	/* A cookie request is never queued: the SYN-ACK carries all state.
+	 * Go through drop_and_release so a dst taken above is not leaked
+	 * (dst_release() accepts NULL).
+	 */
+	if (tcp_v6_send_synack(sk, req,
+			       (struct request_values *)&tmp_ext) ||
+	    want_cookie)
+		goto drop_and_release;
+
+	inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+	/* The route was only needed for the peer check above. */
+	dst_release(dst);
+	return 0;
+
+drop_and_release:
+	dst_release(dst);
+drop_and_free:
+	reqsk_free(req);
+drop:
+	return 0; /* don't send reset */
+}
+
+/*
+ * Create the child socket for connection request @req once the handshake
+ * on listening socket @sk has completed.
+ *
+ * @sk:  listening socket
+ * @skb: segment that completed the handshake
+ * @req: the connection request
+ * @dst: route to use, or NULL to look one up here
+ *
+ * For an IPv4 packet on a dual-stack listener the IPv4 code creates the
+ * child, which is then dressed up as a v4-mapped IPv6 socket.  In both
+ * paths the child's ipv6_pinfo starts as a byte copy of the parent's, so
+ * every pointer the child must not share with the listener is reset
+ * explicitly.  That includes ipv6_mc_list: leaving the copied pointer in
+ * place would let the child free the listener's multicast membership
+ * list when it is closed.
+ *
+ * Returns the new socket, or NULL on failure (LISTENDROPS, and on accept
+ * queue overflow also LISTENOVERFLOWS, are incremented).
+ */
+static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+					  struct request_sock *req,
+					  struct dst_entry *dst)
+{
+	struct inet6_request_sock *treq;
+	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+	struct tcp6_sock *newtcp6sk;
+	struct inet_sock *newinet;
+	struct tcp_sock *newtp;
+	struct sock *newsk;
+	struct ipv6_txoptions *opt;
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_md5sig_key *key;
+#endif
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		/*
+		 *	v6 mapped
+		 */
+
+		newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst);
+
+		if (newsk == NULL)
+			return NULL;
+
+		newtcp6sk = (struct tcp6_sock *)newsk;
+		inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+
+		newinet = inet_sk(newsk);
+		newnp = inet6_sk(newsk);
+		newtp = tcp_sk(newsk);
+
+		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+		ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr);
+
+		ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
+
+		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+
+		inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
+#ifdef CONFIG_TCP_MD5SIG
+		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
+#endif
+
+		/* Do not inherit the listener's lists via the memcpy above. */
+		newnp->ipv6_mc_list = NULL;
+		newnp->ipv6_ac_list = NULL;
+		newnp->ipv6_fl_list = NULL;
+		newnp->pktoptions  = NULL;
+		newnp->opt	   = NULL;
+		newnp->mcast_oif   = inet6_iif(skb);
+		newnp->mcast_hops  = ipv6_hdr(skb)->hop_limit;
+
+		/*
+		 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+		 * here, tcp_create_openreq_child now does this for us, see the comment in
+		 * that function for the gory details. -acme
+		 */
+
+		/* It is tricky place. Until this moment IPv4 tcp
+		   worked with IPv6 icsk.icsk_af_ops.
+		   Sync it now.
+		 */
+		tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
+
+		return newsk;
+	}
+
+	treq = inet6_rsk(req);
+	opt = np->opt;
+
+	if (sk_acceptq_is_full(sk))
+		goto out_overflow;
+
+	if (!dst) {
+		dst = inet6_csk_route_req(sk, req);
+		if (!dst)
+			goto out;
+	}
+
+	newsk = tcp_create_openreq_child(sk, req, skb);
+	if (newsk == NULL)
+		goto out_nonewsk;
+
+	/*
+	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
+	 * count here, tcp_create_openreq_child now does this for us, see the
+	 * comment in that function for the gory details. -acme
+	 */
+
+	newsk->sk_gso_type = SKB_GSO_TCPV6;
+	__ip6_dst_store(newsk, dst, NULL, NULL);
+
+	newtcp6sk = (struct tcp6_sock *)newsk;
+	inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+
+	newtp = tcp_sk(newsk);
+	newinet = inet_sk(newsk);
+	newnp = inet6_sk(newsk);
+
+	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+	ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr);
+	ipv6_addr_copy(&newnp->saddr, &treq->loc_addr);
+	ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr);
+	newsk->sk_bound_dev_if = treq->iif;
+
+	/* Now IPv6 options...
+
+	   First: no IPv4 options.
+	 */
+	newinet->inet_opt = NULL;
+	/* Do not inherit the listener's lists via the memcpy above. */
+	newnp->ipv6_mc_list = NULL;
+	newnp->ipv6_ac_list = NULL;
+	newnp->ipv6_fl_list = NULL;
+
+	/* Clone RX bits */
+	newnp->rxopt.all = np->rxopt.all;
+
+	/* Clone pktoptions received with SYN */
+	newnp->pktoptions = NULL;
+	if (treq->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
+		kfree_skb(treq->pktopts);
+		treq->pktopts = NULL;
+		if (newnp->pktoptions)
+			skb_set_owner_r(newnp->pktoptions, newsk);
+	}
+	newnp->opt	  = NULL;
+	newnp->mcast_oif  = inet6_iif(skb);
+	newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
+
+	/* Clone native IPv6 options from listening socket (if any)
+
+	   Yes, keeping reference count would be much more clever,
+	   but we make one more one thing there: reattach optmem
+	   to newsk.
+	 */
+	if (opt) {
+		newnp->opt = ipv6_dup_options(newsk, opt);
+		if (opt != np->opt)
+			sock_kfree_s(sk, opt, opt->tot_len);
+	}
+
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
+	if (newnp->opt)
+		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+						     newnp->opt->opt_flen);
+
+	tcp_mtup_init(newsk);
+	tcp_sync_mss(newsk, dst_mtu(dst));
+	newtp->advmss = dst_metric_advmss(dst);
+	if (tcp_sk(sk)->rx_opt.user_mss &&
+	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
+		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
+
+	tcp_initialize_rcv_mss(newsk);
+
+	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+#ifdef CONFIG_TCP_MD5SIG
+	/* Copy over the MD5 key from the original socket */
+	if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) {
+		/* We're using one, so create a matching key
+		 * on the newsk structure. If we fail to get
+		 * memory, then we end up not copying the key
+		 * across. Shucks.
+		 */
+		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
+		if (newkey != NULL)
+			tcp_v6_md5_do_add(newsk, &newnp->daddr,
+					  newkey, key->keylen);
+	}
+#endif
+
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto out;
+	}
+	__inet6_hash(newsk, NULL);
+
+	return newsk;
+
+out_overflow:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+	if (opt && opt != np->opt)
+		sock_kfree_s(sk, opt, opt->tot_len);
+	dst_release(dst);
+out:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	return NULL;
+}
+
+/*
+ * Prepare checksum state for a received segment.
+ *
+ * If the device supplied a full checksum (CHECKSUM_COMPLETE) that
+ * verifies against the pseudo-header, the skb is marked
+ * CHECKSUM_UNNECESSARY and 0 is returned.  Otherwise skb->csum is seeded
+ * with the pseudo-header sum; packets of at most 76 bytes are verified
+ * right away and the result of that check is returned, longer ones
+ * return 0 with verification left for later.
+ */
+static __sum16 tcp_v6_checksum_init(struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !tcp_v6_check(skb->len, &ip6h->saddr, &ip6h->daddr, skb->csum)) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		return 0;
+	}
+
+	skb->csum = ~csum_unfold(tcp_v6_check(skb->len, &ip6h->saddr,
+					      &ip6h->daddr, 0));
+
+	if (skb->len > 76)
+		return 0;
+
+	return __skb_checksum_complete(skb);
+}
+
+/* The socket must have its spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ *
+ * Processes one segment for @sk.  IPv4 packets are forwarded to
+ * tcp_v4_do_rcv().  The skb is always consumed here (queued by TCP or
+ * freed) and the function returns 0 on every IPv6 path; a reset is sent
+ * to the peer when the TCP state machine asks for one.
+ */
+static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct tcp_sock *tp;
+	struct sk_buff *opt_skb = NULL;
+
+	/* Imagine: socket is IPv6. IPv4 packet arrives,
+	   goes to IPv4 receive handler and backlogged.
+	   From backlog it always goes here. Kerboom...
+	   Fortunately, tcp_rcv_established and rcv_established
+	   handle them correctly, but it is not case with
+	   tcp_v6_hnd_req and tcp_v6_send_reset().   --ANK
+	 */
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return tcp_v4_do_rcv(sk, skb);
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (tcp_v6_inbound_md5_hash (sk, skb))
+		goto discard;
+#endif
+
+	if (sk_filter(sk, skb))
+		goto discard;
+
+	/*
+	 *	socket locking is here for SMP purposes as backlog rcv
+	 *	is currently called with bh processing disabled.
+	 */
+
+	/* Do Stevens' IPV6_PKTOPTIONS.
+
+	   Yes, guys, it is the only place in our code, where we
+	   may make it not affecting IPv4.
+	   The rest of code is protocol independent,
+	   and I do not like idea to uglify IPv4.
+
+	   Actually, all the idea behind IPV6_PKTOPTIONS
+	   looks not very well thought. For now we latch
+	   options, received in the last packet, enqueued
+	   by tcp. Feel free to propose better solution.
+					       --ANK (980728)
+	 */
+	/* Clone before TCP takes the skb; may be NULL on allocation failure. */
+	if (np->rxopt.all)
+		opt_skb = skb_clone(skb, GFP_ATOMIC);
+
+	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		sock_rps_save_rxhash(sk, skb->rxhash);
+		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
+			goto reset;
+		if (opt_skb)
+			goto ipv6_pktoptions;
+		return 0;
+	}
+
+	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
+		goto csum_err;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		struct sock *nsk = tcp_v6_hnd_req(sk, skb);
+		if (!nsk)
+			goto discard;
+
+		/*
+		 * Queue it on the new socket if the new socket is active,
+		 * otherwise we just shortcircuit this and continue with
+		 * the new socket..
+		 */
+		if(nsk != sk) {
+			sock_rps_save_rxhash(nsk, skb->rxhash);
+			if (tcp_child_process(sk, nsk, skb))
+				goto reset;
+			/* Options are not latched on the listener. */
+			if (opt_skb)
+				__kfree_skb(opt_skb);
+			return 0;
+		}
+	} else
+		sock_rps_save_rxhash(sk, skb->rxhash);
+
+	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len))
+		goto reset;
+	if (opt_skb)
+		goto ipv6_pktoptions;
+	return 0;
+
+reset:
+	tcp_v6_send_reset(sk, skb);
+discard:
+	if (opt_skb)
+		__kfree_skb(opt_skb);
+	kfree_skb(skb);
+	return 0;
+csum_err:
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+	goto discard;
+
+
+ipv6_pktoptions:
+	/* Do you ask, what is it?
+
+	   1. skb was enqueued by tcp.
+	   2. skb is added to tail of read queue, rather than out of order.
+	   3. socket is not in passive state.
+	   4. Finally, it really contains options, which user wants to receive.
+	 */
+	tp = tcp_sk(sk);
+	if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
+	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
+			np->mcast_oif = inet6_iif(opt_skb);
+		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
+			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+		if (ipv6_opt_accepted(sk, opt_skb)) {
+			/* Latch the clone; opt_skb now holds the previous one. */
+			skb_set_owner_r(opt_skb, sk);
+			opt_skb = xchg(&np->pktoptions, opt_skb);
+		} else {
+			/* Nothing to latch: free the clone and clear the old latch. */
+			__kfree_skb(opt_skb);
+			opt_skb = xchg(&np->pktoptions, NULL);
+		}
+	}
+
+	/* Frees either the displaced pktoptions or the unused clone. */
+	kfree_skb(opt_skb);
+	return 0;
+}
+
+/*
+ * inet6 protocol handler for IPPROTO_TCP: entry point for every received
+ * TCP-over-IPv6 packet.
+ *
+ * Validates the header and checksum, fills in the TCP control block,
+ * looks up the owning socket and then either processes the segment
+ * directly, prequeues it, or adds it to the socket backlog when the
+ * socket is owned by user context.  Segments with no socket are answered
+ * with a RST; TIME_WAIT sockets are handled in the do_time_wait section.
+ *
+ * Returns -1 if tcp_v6_do_rcv() returned non-zero, 0 otherwise.
+ */
+static int tcp_v6_rcv(struct sk_buff *skb)
+{
+	struct tcphdr *th;
+	const struct ipv6hdr *hdr;
+	struct sock *sk;
+	int ret;
+	struct net *net = dev_net(skb->dev);
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto discard_it;
+
+	/*
+	 *	Count it even if it's bad.
+	 */
+	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+
+	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+		goto discard_it;
+
+	th = tcp_hdr(skb);
+
+	/* Data offset smaller than the fixed header is malformed. */
+	if (th->doff < sizeof(struct tcphdr)/4)
+		goto bad_packet;
+	if (!pskb_may_pull(skb, th->doff*4))
+		goto discard_it;
+
+	if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb))
+		goto bad_packet;
+
+	/* Re-read the header pointers after the pskb_may_pull() calls above. */
+	th = tcp_hdr(skb);
+	hdr = ipv6_hdr(skb);
+	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+				    skb->len - th->doff*4);
+	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+	TCP_SKB_CB(skb)->when = 0;
+	TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(hdr);
+	TCP_SKB_CB(skb)->sacked = 0;
+
+	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+	if (!sk)
+		goto no_tcp_socket;
+
+process:
+	if (sk->sk_state == TCP_TIME_WAIT)
+		goto do_time_wait;
+
+	if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+		goto discard_and_relse;
+	}
+
+	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_and_relse;
+
+	if (sk_filter(sk, skb))
+		goto discard_and_relse;
+
+	skb->dev = NULL;
+
+	bh_lock_sock_nested(sk);
+	ret = 0;
+	if (!sock_owned_by_user(sk)) {
+#ifdef CONFIG_NET_DMA
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
+		if (tp->ucopy.dma_chan)
+			ret = tcp_v6_do_rcv(sk, skb);
+		else
+#endif
+		{
+			/* Process now unless the segment was prequeued. */
+			if (!tcp_prequeue(sk, skb))
+				ret = tcp_v6_do_rcv(sk, skb);
+		}
+	} else if (unlikely(sk_add_backlog(sk, skb))) {
+		/* Socket is owned by user context and the backlog add failed. */
+		bh_unlock_sock(sk);
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+		goto discard_and_relse;
+	}
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+	return ret ? -1 : 0;
+
+no_tcp_socket:
+	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+
+	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+bad_packet:
+		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+	} else {
+		tcp_v6_send_reset(NULL, skb);
+	}
+
+discard_it:
+
+	/*
+	 *	Discard frame
+	 */
+
+	kfree_skb(skb);
+	return 0;
+
+discard_and_relse:
+	sock_put(sk);
+	goto discard_it;
+
+do_time_wait:
+	/* Every exit from this section must drop the timewait reference
+	 * or hand it on (tcp_v6_timewait_ack() drops it itself).
+	 */
+	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		inet_twsk_put(inet_twsk(sk));
+		goto discard_it;
+	}
+
+	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
+		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+		inet_twsk_put(inet_twsk(sk));
+		goto discard_it;
+	}
+
+	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+	case TCP_TW_SYN:
+	{
+		struct sock *sk2;
+
+		/* New SYN on a timewait bucket: hand it to a listener if any. */
+		sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+					    &ipv6_hdr(skb)->daddr,
+					    ntohs(th->dest), inet6_iif(skb));
+		if (sk2 != NULL) {
+			struct inet_timewait_sock *tw = inet_twsk(sk);
+			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_put(tw);
+			sk = sk2;
+			goto process;
+		}
+		/* Fall through to ACK */
+	}
+	case TCP_TW_ACK:
+		tcp_v6_timewait_ack(sk, skb);
+		break;
+	case TCP_TW_RST:
+		goto no_tcp_socket;
+	case TCP_TW_SUCCESS:;
+	}
+	goto discard_it;
+}
+
+/*
+ * Return the inet_peer entry for the remote end of @sk.
+ *
+ * When the socket has a cached route whose destination equals the
+ * socket's peer address, the peer bound to that route is used (binding
+ * one first if needed) and *@release_it is set to false.  Otherwise the
+ * peer is fetched with inet_getpeer_v6() and *@release_it is set to true.
+ */
+static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk);
+
+	if (rt && ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) {
+		if (!rt->rt6i_peer)
+			rt6_bind_peer(rt, 1);
+		*release_it = false;
+		return rt->rt6i_peer;
+	}
+
+	*release_it = true;
+	return inet_getpeer_v6(&np->daddr, 1);
+}
+
+/*
+ * twsk_getpeer hook: return the inet_peer entry for the remote address
+ * of timewait bucket @sk.  AF_INET buckets are delegated to the IPv4
+ * helper.
+ */
+static void *tcp_v6_tw_get_peer(struct sock *sk)
+{
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	struct inet6_timewait_sock *tw6;
+
+	if (tw->tw_family == AF_INET)
+		return tcp_v4_tw_get_peer(sk);
+
+	tw6 = inet6_twsk(sk);
+	return inet_getpeer_v6(&tw6->tw_v6_daddr, 1);
+}
+
+/*
+ * Timewait socket operations for TCPv6 (referenced from tcpv6_prot's
+ * .twsk_prot).  Only the peer lookup is IPv6 specific; the uniqueness
+ * check and destructor are the generic TCP ones.
+ */
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+	.twsk_destructor= tcp_twsk_destructor,
+	.twsk_getpeer	= tcp_v6_tw_get_peer,
+};
+
+/*
+ * Address-family operations for native TCP over IPv6.  Installed as
+ * icsk_af_ops by tcp_v6_init_sock().
+ */
+static const struct inet_connection_sock_af_ops ipv6_specific = {
+	.queue_xmit	   = inet6_csk_xmit,
+	.send_check	   = tcp_v6_send_check,
+	.rebuild_header	   = inet6_sk_rebuild_header,
+	.conn_request	   = tcp_v6_conn_request,
+	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
+	.get_peer	   = tcp_v6_get_peer,
+	.net_header_len	   = sizeof(struct ipv6hdr),
+	.setsockopt	   = ipv6_setsockopt,
+	.getsockopt	   = ipv6_getsockopt,
+	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
+	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+	.bind_conflict	   = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ipv6_setsockopt,
+	.compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * MD5 signature operations for native IPv6 TCP sockets.  Installed as
+ * tp->af_specific by tcp_v6_init_sock().
+ */
+static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
+	.md5_lookup	=	tcp_v6_md5_lookup,
+	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
+	.md5_add	=	tcp_v6_md5_add_func,
+	.md5_parse	=	tcp_v6_parse_md5_keys,
+};
+#endif
+
+/*
+ *	TCP over IPv4 via INET6 API
+ *
+ *	Address-family operations for v4-mapped connections on an AF_INET6
+ *	socket: transmit, checksum, header rebuild and peer lookup use the
+ *	IPv4 implementations, while the socket-option and sockaddr handling
+ *	stay IPv6.  Installed by tcp_v6_syn_recv_sock() on v4-mapped
+ *	children.
+ */
+
+static const struct inet_connection_sock_af_ops ipv6_mapped = {
+	.queue_xmit	   = ip_queue_xmit,
+	.send_check	   = tcp_v4_send_check,
+	.rebuild_header	   = inet_sk_rebuild_header,
+	.conn_request	   = tcp_v6_conn_request,
+	.syn_recv_sock	   = tcp_v6_syn_recv_sock,
+	.get_peer	   = tcp_v4_get_peer,
+	.net_header_len	   = sizeof(struct iphdr),
+	.setsockopt	   = ipv6_setsockopt,
+	.getsockopt	   = ipv6_getsockopt,
+	.addr2sockaddr	   = inet6_csk_addr2sockaddr,
+	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+	.bind_conflict	   = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ipv6_setsockopt,
+	.compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+#ifdef CONFIG_TCP_MD5SIG
+/*
+ * MD5 signature operations for v4-mapped sockets: lookup and hashing use
+ * the IPv4 routines, key add/parse use the IPv6 ones.  Installed by
+ * tcp_v6_syn_recv_sock() on v4-mapped children.
+ */
+static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
+	.md5_lookup	=	tcp_v4_md5_lookup,
+	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
+	.md5_add	=	tcp_v6_md5_add_func,
+	.md5_parse	=	tcp_v6_parse_md5_keys,
+};
+#endif
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ *       sk_alloc() so need not be done here.
+ *
+ * .init hook of tcpv6_prot: sets up queues and timers, initial RTO and
+ * congestion parameters, installs the IPv6 address-family operations and
+ * default buffer sizes, and accounts the socket in
+ * tcp_sockets_allocated.  Always returns 0; a failed cookie_values
+ * allocation is tolerated and leaves tp->cookie_values NULL.
+ */
+static int tcp_v6_init_sock(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	skb_queue_head_init(&tp->out_of_order_queue);
+	tcp_init_xmit_timers(sk);
+	tcp_prequeue_init(tp);
+
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
+	tp->mdev = TCP_TIMEOUT_INIT;
+
+	/* So many TCP implementations out there (incorrectly) count the
+	 * initial SYN frame in their delayed-ACK and congestion control
+	 * algorithms that we must have the following bandaid to talk
+	 * efficiently to them.  -DaveM
+	 */
+	tp->snd_cwnd = 2;
+
+	/* See draft-stevens-tcpca-spec-01 for discussion of the
+	 * initialization of these values.
+	 */
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	tp->snd_cwnd_clamp = ~0;
+	tp->mss_cache = TCP_MSS_DEFAULT;
+
+	tp->reordering = sysctl_tcp_reordering;
+
+	sk->sk_state = TCP_CLOSE;
+
+	icsk->icsk_af_ops = &ipv6_specific;
+	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+	icsk->icsk_sync_mss = tcp_sync_mss;
+	sk->sk_write_space = sk_stream_write_space;
+	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+#ifdef CONFIG_TCP_MD5SIG
+	tp->af_specific = &tcp_sock_ipv6_specific;
+#endif
+
+	/* TCP Cookie Transactions */
+	if (sysctl_tcp_cookie_size > 0) {
+		/* Default, cookies without s_data_payload. */
+		tp->cookie_values =
+			kzalloc(sizeof(*tp->cookie_values),
+				sk->sk_allocation);
+		if (tp->cookie_values != NULL)
+			kref_init(&tp->cookie_values->kref);
+	}
+	/* Presumed zeroed, in order of appearance:
+	 *	cookie_in_always, cookie_out_never,
+	 *	s_data_constant, s_data_in, s_data_out
+	 */
+	sk->sk_sndbuf = sysctl_tcp_wmem[1];
+	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+	/* The per-cpu counter is updated with bottom halves disabled. */
+	local_bh_disable();
+	percpu_counter_inc(&tcp_sockets_allocated);
+	local_bh_enable();
+
+	return 0;
+}
+
+/*
+ * .destroy hook of tcpv6_prot: drop any MD5 keys attached to the socket,
+ * then run the generic TCP teardown followed by the IPv6 one.
+ */
+static void tcp_v6_destroy_sock(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Clean up the MD5 key list */
+	if (tp->md5sig_info != NULL)
+		tcp_v6_clear_md5_list(sk);
+#endif
+	tcp_v4_destroy_sock(sk);
+	inet6_destroy_sock(sk);
+}
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCPv6 sock list dumping. */
+/*
+ * Emit one line for a pending connection request.
+ *
+ * @seq: output seq_file
+ * @sk:  listening socket owning the request (not used by this function)
+ * @req: the request being dumped
+ * @i:   line index printed in the first column
+ * @uid: uid printed for the entry
+ *
+ * The state column is always TCP_SYN_RECV and the remaining time before
+ * the request expires is clamped at zero.
+ */
+static void get_openreq6(struct seq_file *seq,
+			 struct sock *sk, struct request_sock *req, int i, int uid)
+{
+	int ttd = req->expires - jiffies;
+	const struct in6_addr *src = &inet6_rsk(req)->loc_addr;
+	const struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
+
+	if (ttd < 0)
+		ttd = 0;
+
+	seq_printf(seq,
+		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+		   i,
+		   src->s6_addr32[0], src->s6_addr32[1],
+		   src->s6_addr32[2], src->s6_addr32[3],
+		   ntohs(inet_rsk(req)->loc_port),
+		   dest->s6_addr32[0], dest->s6_addr32[1],
+		   dest->s6_addr32[2], dest->s6_addr32[3],
+		   ntohs(inet_rsk(req)->rmt_port),
+		   TCP_SYN_RECV,
+		   0,0, /* could print option size, but that is af dependent. */
+		   1,   /* timers active (only the expire timer) */
+		   jiffies_to_clock_t(ttd),
+		   req->retrans,
+		   uid,
+		   0,  /* non standard timer */
+		   0, /* open_requests have no inode */
+		   0, req);
+}
+
+/*
+ * Emit one line for a full TCPv6 socket (listening or established).
+ *
+ * @seq: output seq_file
+ * @sp:  socket being dumped
+ * @i:   line index printed in the first column
+ *
+ * The timer code printed is 1 for a pending retransmit timer, 4 for a
+ * zero-window probe timer, 2 when sk_timer is pending and 0 when no timer
+ * is active.  For a listening socket the rx_queue column shows the accept
+ * backlog instead of unread bytes.
+ */
+static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
+{
+	const struct in6_addr *dest, *src;
+	__u16 destp, srcp;
+	int timer_active;
+	unsigned long timer_expires;
+	struct inet_sock *inet = inet_sk(sp);
+	struct tcp_sock *tp = tcp_sk(sp);
+	const struct inet_connection_sock *icsk = inet_csk(sp);
+	struct ipv6_pinfo *np = inet6_sk(sp);
+
+	dest  = &np->daddr;
+	src   = &np->rcv_saddr;
+	destp = ntohs(inet->inet_dport);
+	srcp  = ntohs(inet->inet_sport);
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		timer_active	= 1;
+		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		timer_active	= 4;
+		timer_expires	= icsk->icsk_timeout;
+	} else if (timer_pending(&sp->sk_timer)) {
+		timer_active	= 2;
+		timer_expires	= sp->sk_timer.expires;
+	} else {
+		/* No timer: report an expiry of "now", i.e. zero remaining. */
+		timer_active	= 0;
+		timer_expires = jiffies;
+	}
+
+	seq_printf(seq,
+		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %lu %lu %u %u %d\n",
+		   i,
+		   src->s6_addr32[0], src->s6_addr32[1],
+		   src->s6_addr32[2], src->s6_addr32[3], srcp,
+		   dest->s6_addr32[0], dest->s6_addr32[1],
+		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
+		   sp->sk_state,
+		   tp->write_seq-tp->snd_una,
+		   (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
+		   timer_active,
+		   jiffies_to_clock_t(timer_expires - jiffies),
+		   icsk->icsk_retransmits,
+		   sock_i_uid(sp),
+		   icsk->icsk_probes_out,
+		   sock_i_ino(sp),
+		   atomic_read(&sp->sk_refcnt), sp,
+		   jiffies_to_clock_t(icsk->icsk_rto),
+		   jiffies_to_clock_t(icsk->icsk_ack.ato),
+		   (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong,
+		   tp->snd_cwnd,
+		   tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh
+		   );
+}
+
+/*
+ * Emit one line for a timewait bucket.
+ *
+ * @seq: output seq_file
+ * @tw:  timewait bucket being dumped
+ * @i:   line index printed in the first column
+ *
+ * Queue sizes, retransmit count, uid, timeout and inode are printed as 0;
+ * the timer code is always 3 and the remaining time is clamped at zero.
+ */
+static void get_timewait6_sock(struct seq_file *seq,
+			       struct inet_timewait_sock *tw, int i)
+{
+	const struct in6_addr *dest, *src;
+	__u16 destp, srcp;
+	struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
+	int ttd = tw->tw_ttd - jiffies;
+
+	if (ttd < 0)
+		ttd = 0;
+
+	dest = &tw6->tw_v6_daddr;
+	src  = &tw6->tw_v6_rcv_saddr;
+	destp = ntohs(tw->tw_dport);
+	srcp  = ntohs(tw->tw_sport);
+
+	seq_printf(seq,
+		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+		   i,
+		   src->s6_addr32[0], src->s6_addr32[1],
+		   src->s6_addr32[2], src->s6_addr32[3], srcp,
+		   dest->s6_addr32[0], dest->s6_addr32[1],
+		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
+		   tw->tw_substate, 0, 0,
+		   3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
+		   atomic_read(&tw->tw_refcnt), tw);
+}
+
+/*
+ * seq_file .show callback for the tcp6 proc listing.  Prints the column
+ * header for the start token, otherwise dispatches on the iterator state
+ * to the matching per-entry formatter.  Always returns 0.
+ */
+static int tcp6_seq_show(struct seq_file *seq, void *v)
+{
+	struct tcp_iter_state *iter;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "  sl  "
+			 "local_address                         "
+			 "remote_address                        "
+			 "st tx_queue rx_queue tr tm->when retrnsmt"
+			 "   uid  timeout inode\n");
+		return 0;
+	}
+
+	iter = seq->private;
+
+	if (iter->state == TCP_SEQ_STATE_LISTENING ||
+	    iter->state == TCP_SEQ_STATE_ESTABLISHED)
+		get_tcp6_sock(seq, v, iter->num);
+	else if (iter->state == TCP_SEQ_STATE_OPENREQ)
+		get_openreq6(seq, iter->syn_wait_sk, v, iter->num, iter->uid);
+	else if (iter->state == TCP_SEQ_STATE_TIME_WAIT)
+		get_timewait6_sock(seq, v, iter->num);
+
+	return 0;
+}
+
+/*
+ * Descriptor passed to tcp_proc_register() for the "tcp6" listing; only
+ * the .show callback is supplied here.
+ */
+static struct tcp_seq_afinfo tcp6_seq_afinfo = {
+	.name		= "tcp6",
+	.family		= AF_INET6,
+	.seq_fops	= {
+		.owner		= THIS_MODULE,
+	},
+	.seq_ops	= {
+		.show		= tcp6_seq_show,
+	},
+};
+
+/* Register the tcp6 proc listing for @net; returns tcp_proc_register()'s result. */
+int __net_init tcp6_proc_init(struct net *net)
+{
+	return tcp_proc_register(net, &tcp6_seq_afinfo);
+}
+
+/* Remove the tcp6 proc listing from @net. */
+void tcp6_proc_exit(struct net *net)
+{
+	tcp_proc_unregister(net, &tcp6_seq_afinfo);
+}
+#endif
+
+/*
+ * Protocol descriptor for AF_INET6 TCP sockets.  Most handlers are the
+ * generic TCP ones; connect, init, destroy, backlog receive and hash are
+ * the IPv6 variants.  Accounting counters and sysctl limits are the same
+ * tcp_* objects named below, not IPv6-private copies.
+ */
+struct proto tcpv6_prot = {
+	.name			= "TCPv6",
+	.owner			= THIS_MODULE,
+	.close			= tcp_close,
+	.connect		= tcp_v6_connect,
+	.disconnect		= tcp_disconnect,
+	.accept			= inet_csk_accept,
+	.ioctl			= tcp_ioctl,
+	.init			= tcp_v6_init_sock,
+	.destroy		= tcp_v6_destroy_sock,
+	.shutdown		= tcp_shutdown,
+	.setsockopt		= tcp_setsockopt,
+	.getsockopt		= tcp_getsockopt,
+	.recvmsg		= tcp_recvmsg,
+	.sendmsg		= tcp_sendmsg,
+	.sendpage		= tcp_sendpage,
+	.backlog_rcv		= tcp_v6_do_rcv,
+	.hash			= tcp_v6_hash,
+	.unhash			= inet_unhash,
+	.get_port		= inet_csk_get_port,
+	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.sockets_allocated	= &tcp_sockets_allocated,
+	.memory_allocated	= &tcp_memory_allocated,
+	.memory_pressure	= &tcp_memory_pressure,
+	.orphan_count		= &tcp_orphan_count,
+	.sysctl_mem		= sysctl_tcp_mem,
+	.sysctl_wmem		= sysctl_tcp_wmem,
+	.sysctl_rmem		= sysctl_tcp_rmem,
+	.max_header		= MAX_TCP_HEADER,
+	.obj_size		= sizeof(struct tcp6_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
+	.twsk_prot		= &tcp6_timewait_sock_ops,
+	.rsk_prot		= &tcp6_request_sock_ops,
+	.h.hashinfo		= &tcp_hashinfo,
+	.no_autobind		= true,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt	= compat_tcp_setsockopt,
+	.compat_getsockopt	= compat_tcp_getsockopt,
+#endif
+};
+
+/*
+ * inet6 protocol hooks registered for IPPROTO_TCP by tcpv6_init():
+ * packet receive, error handling and the GSO/GRO callbacks.
+ */
+static const struct inet6_protocol tcpv6_protocol = {
+	.handler	=	tcp_v6_rcv,
+	.err_handler	=	tcp_v6_err,
+	.gso_send_check	=	tcp_v6_gso_send_check,
+	.gso_segment	=	tcp_tso_segment,
+	.gro_receive	=	tcp6_gro_receive,
+	.gro_complete	=	tcp6_gro_complete,
+	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+/*
+ * Socket-switch entry that maps SOCK_STREAM / IPPROTO_TCP sockets to
+ * tcpv6_prot and the inet6 stream socket operations.  Registered by
+ * tcpv6_init().
+ */
+static struct inet_protosw tcpv6_protosw = {
+	.type		=	SOCK_STREAM,
+	.protocol	=	IPPROTO_TCP,
+	.prot		=	&tcpv6_prot,
+	.ops		=	&inet6_stream_ops,
+	.no_check	=	0,
+	.flags		=	INET_PROTOSW_PERMANENT |
+				INET_PROTOSW_ICSK,
+};
+
+/*
+ * Per-namespace init: create the control socket stored in
+ * net->ipv6.tcp_sk (a raw PF_INET6 socket for IPPROTO_TCP).  Returns the
+ * result of inet_ctl_sock_create().
+ */
+static int __net_init tcpv6_net_init(struct net *net)
+{
+	return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
+				    SOCK_RAW, IPPROTO_TCP, net);
+}
+
+/* Per-namespace exit: destroy the control socket created by tcpv6_net_init(). */
+static void __net_exit tcpv6_net_exit(struct net *net)
+{
+	inet_ctl_sock_destroy(net->ipv6.tcp_sk);
+}
+
+/*
+ * Batched namespace exit: purge AF_INET6 timewait sockets from the TCP
+ * hash.  @net_exit_list is not used by this function.
+ */
+static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
+{
+	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6);
+}
+
+/* Per-network-namespace hooks for TCPv6, registered by tcpv6_init(). */
+static struct pernet_operations tcpv6_net_ops = {
+	.init	    = tcpv6_net_init,
+	.exit	    = tcpv6_net_exit,
+	.exit_batch = tcpv6_net_exit_batch,
+};
+
+/*
+ * Register TCP with the IPv6 stack: the inet6 protocol handler, the
+ * socket-switch entry and the per-namespace operations, in that order.
+ *
+ * On failure everything registered so far is undone in reverse order and
+ * the error is returned.  The unwind labels are ordered so that each
+ * failure point falls through only the steps that actually succeeded
+ * before it.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int __init tcpv6_init(void)
+{
+	int ret;
+
+	ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP);
+	if (ret)
+		goto out;
+
+	/* register inet6 protocol */
+	ret = inet6_register_protosw(&tcpv6_protosw);
+	if (ret)
+		goto out_tcpv6_protocol;
+
+	ret = register_pernet_subsys(&tcpv6_net_ops);
+	if (ret)
+		goto out_tcpv6_protosw;
+out:
+	return ret;
+
+out_tcpv6_protosw:
+	/* pernet registration failed: protosw and protocol are registered */
+	inet6_unregister_protosw(&tcpv6_protosw);
+out_tcpv6_protocol:
+	/* protosw registration failed: only the protocol is registered */
+	inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+	goto out;
+}
+
+/*
+ * Undo tcpv6_init(): unregister the per-namespace operations, the
+ * socket-switch entry and the inet6 protocol handler, in reverse order of
+ * registration.
+ */
+void tcpv6_exit(void)
+{
+	unregister_pernet_subsys(&tcpv6_net_ops);
+	inet6_unregister_protosw(&tcpv6_protosw);
+	inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+}
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
new file mode 100644
index 00000000..4f3cec12
--- /dev/null
+++ b/net/ipv6/tunnel6.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Authors	Mitsuru KANDA  <mk@linux-ipv6.org>
+ * 		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ */
+
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+/*
+ * Heads of the two handler chains.  xfrm6_tunnel_register() selects
+ * tunnel6_handlers for family AF_INET6 and tunnel46_handlers otherwise.
+ */
+static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+/* Held by register/deregister while a handler chain is being modified. */
+static DEFINE_MUTEX(tunnel6_mutex);
+
+/*
+ * Insert @handler into the chain chosen by @family (tunnel6_handlers for
+ * AF_INET6, tunnel46_handlers otherwise), keeping the chain sorted by
+ * ascending priority.
+ *
+ * Returns 0 on success, or -EEXIST when an entry with the same priority
+ * is already present (the chain is then left untouched).
+ */
+int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
+{
+	struct xfrm6_tunnel __rcu **link;
+	struct xfrm6_tunnel *cur;
+	int prio = handler->priority;
+	int ret = -EEXIST;
+
+	mutex_lock(&tunnel6_mutex);
+
+	if (family == AF_INET6)
+		link = &tunnel6_handlers;
+	else
+		link = &tunnel46_handlers;
+
+	/* Walk to the first entry whose priority is greater than ours. */
+	while ((cur = rcu_dereference_protected(*link,
+			lockdep_is_held(&tunnel6_mutex))) != NULL) {
+		if (cur->priority == prio)
+			goto unlock;
+		if (cur->priority > prio)
+			break;
+		link = &cur->next;
+	}
+
+	/* Fill in ->next before publishing the new entry to readers. */
+	handler->next = *link;
+	rcu_assign_pointer(*link, handler);
+	ret = 0;
+
+unlock:
+	mutex_unlock(&tunnel6_mutex);
+
+	return ret;
+}
+
+EXPORT_SYMBOL(xfrm6_tunnel_register);
+
+/*
+ * Unlink @handler from the chain chosen by @family.
+ *
+ * Returns 0 when the handler was found and removed, -ENOENT otherwise.
+ * synchronize_net() is called before returning in either case.
+ */
+int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
+{
+	struct xfrm6_tunnel __rcu **link;
+	struct xfrm6_tunnel *cur;
+	int ret = -ENOENT;
+
+	mutex_lock(&tunnel6_mutex);
+
+	if (family == AF_INET6)
+		link = &tunnel6_handlers;
+	else
+		link = &tunnel46_handlers;
+
+	while ((cur = rcu_dereference_protected(*link,
+			lockdep_is_held(&tunnel6_mutex))) != NULL) {
+		if (cur == handler) {
+			*link = handler->next;
+			ret = 0;
+			break;
+		}
+		link = &cur->next;
+	}
+
+	mutex_unlock(&tunnel6_mutex);
+
+	synchronize_net();
+
+	return ret;
+}
+
+EXPORT_SYMBOL(xfrm6_tunnel_deregister);
+
+/*
+ * Iterate over the handler chain starting at @head, binding each entry
+ * to @handler in turn.  Every link is read with rcu_dereference().
+ * The definition deliberately ends without a line continuation so that
+ * it cannot absorb whatever line follows it.
+ */
+#define for_each_tunnel_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))
+
+/*
+ * Receive hook for IPPROTO_IPV6: offer the packet to each handler on
+ * tunnel6_handlers until one returns 0.  If none does, an ICMPv6
+ * port-unreachable is sent and the skb is freed.  A packet too short for
+ * an IPv6 header is freed without the ICMPv6 reply.  Always returns 0.
+ */
+static int tunnel6_rcv(struct sk_buff *skb)
+{
+	struct xfrm6_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct ipv6hdr))) {
+		for_each_tunnel_rcu(tunnel6_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+
+		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Receive hook for IPPROTO_IPIP: same flow as tunnel6_rcv() but walks
+ * tunnel46_handlers and requires room for a struct iphdr.
+ * Always returns 0.
+ */
+static int tunnel46_rcv(struct sk_buff *skb)
+{
+	struct xfrm6_tunnel *t;
+
+	if (pskb_may_pull(skb, sizeof(struct iphdr))) {
+		for_each_tunnel_rcu(tunnel46_handlers, t) {
+			if (t->handler(skb) == 0)
+				return 0;
+		}
+
+		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Error hook shared by both protocol tables: pass the error to each
+ * handler on tunnel6_handlers, stopping at the first err_handler that
+ * returns 0.
+ */
+static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			u8 type, u8 code, int offset, __be32 info)
+{
+	struct xfrm6_tunnel *t;
+
+	for_each_tunnel_rcu(tunnel6_handlers, t) {
+		if (t->err_handler(skb, opt, type, code, offset, info) == 0)
+			break;
+	}
+}
+
+/* Protocol table registered for IPPROTO_IPV6 by tunnel6_init(). */
+static const struct inet6_protocol tunnel6_protocol = {
+	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+	.handler	= tunnel6_rcv,
+	.err_handler	= tunnel6_err,
+};
+
+/* Protocol table registered for IPPROTO_IPIP by tunnel6_init(). */
+static const struct inet6_protocol tunnel46_protocol = {
+	.flags		= INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+	.handler	= tunnel46_rcv,
+	.err_handler	= tunnel6_err,
+};
+
+/*
+ * Module init: register the IPPROTO_IPV6 and IPPROTO_IPIP protocol
+ * tables.  Returns 0 on success or -EAGAIN if either registration
+ * fails; a failure of the second one removes the first again.
+ */
+static int __init tunnel6_init(void)
+{
+	int err;
+
+	err = inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+	if (err)
+		goto fail;
+
+	err = inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+	if (err)
+		goto fail_unwind;
+
+	return 0;
+
+fail_unwind:
+	printk(KERN_ERR "tunnel6 init(): can't add protocol\n");
+	inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+	return -EAGAIN;
+
+fail:
+	printk(KERN_ERR "tunnel6 init(): can't add protocol\n");
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: remove both protocol tables (IPIP first, then IPV6) and
+ * log an error for each removal that fails.
+ */
+static void __exit tunnel6_fini(void)
+{
+	int err;
+
+	err = inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+	if (err != 0)
+		printk(KERN_ERR "tunnel6 close: can't remove protocol\n");
+
+	err = inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+	if (err != 0)
+		printk(KERN_ERR "tunnel6 close: can't remove protocol\n");
+}
+
+module_init(tunnel6_init);
+module_exit(tunnel6_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
new file mode 100644
index 00000000..0d920c58
--- /dev/null
+++ b/net/ipv6/udp.c
@@ -0,0 +1,1517 @@
+/*
+ *	UDP over IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Based on linux/ipv4/udp.c
+ *
+ *	Fixes:
+ *	Hideaki YOSHIFUJI	:	sin6_scope_id support
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ *      Kazunori MIYAZAWA @USAGI:       change process style to use ip6_append_data
+ *      YOSHIFUJI Hideaki @USAGI:	convert /proc/net/udp6 to seq_file.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/ip6_route.h>
+#include <net/raw.h>
+#include <net/tcp_states.h>
+#include <net/ip6_checksum.h>
+#include <net/xfrm.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "udp_impl.h"
+
+/*
+ * Decide whether the receive addresses of @sk and @sk2 overlap.
+ * Returns 1 when they are considered equal/conflicting, 0 otherwise.
+ * When inet6_rcv_saddr(sk2) yields NULL, @sk2 is treated as having a
+ * v4-mapped address type.
+ */
+int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
+{
+	const struct in6_addr *addr6_a = &inet6_sk(sk)->rcv_saddr;
+	const struct in6_addr *addr6_b = inet6_rcv_saddr(sk2);
+	__be32 addr4_a = sk_rcv_saddr(sk);
+	__be32 addr4_b = sk_rcv_saddr(sk2);
+	int v6only_a = ipv6_only_sock(sk);
+	int v6only_b = inet_v6_ipv6only(sk2);
+	int type_a = ipv6_addr_type(addr6_a);
+	int type_b;
+
+	if (addr6_b)
+		type_b = ipv6_addr_type(addr6_b);
+	else
+		type_b = IPV6_ADDR_MAPPED;
+
+	/* if both are mapped, treat as IPv4 */
+	if (type_a == IPV6_ADDR_MAPPED && type_b == IPV6_ADDR_MAPPED) {
+		if (v6only_b)
+			return 0;
+		if (!addr4_a || !addr4_b)
+			return 1;
+		return addr4_a == addr4_b;
+	}
+
+	/* A wildcard on either side matches, unless v6-only rules it out. */
+	if (type_b == IPV6_ADDR_ANY &&
+	    (!v6only_b || type_a != IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (type_a == IPV6_ADDR_ANY &&
+	    (!v6only_a || type_b != IPV6_ADDR_MAPPED))
+		return 1;
+
+	if (addr6_b && ipv6_addr_equal(addr6_a, addr6_b))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Hash an (address, port) pair, seeded with net_hash_mix(net).  The
+ * unspecified address hashes the single word 0, a v4-mapped address
+ * hashes only its last 32-bit word, anything else hashes all four
+ * words.  The port is XORed into the result.
+ */
+static unsigned int udp6_portaddr_hash(struct net *net,
+				       const struct in6_addr *addr6,
+				       unsigned int port)
+{
+	unsigned int seed = net_hash_mix(net);
+
+	if (ipv6_addr_any(addr6))
+		return jhash_1word(0, seed) ^ port;
+
+	if (ipv6_addr_v4mapped(addr6))
+		return jhash_1word((__force u32)addr6->s6_addr32[3], seed) ^ port;
+
+	return jhash2((__force u32 *)addr6->s6_addr32, 4, seed) ^ port;
+}
+
+
+/*
+ * Bind @sk to port @snum through udp_lib_get_port(), using
+ * ipv6_rcv_saddr_equal() as the conflict test.  The socket's
+ * udp_portaddr_hash is preset to the hash of its receive address with
+ * port 0; the hash of (in6addr_any, snum) is handed to the library call.
+ */
+int udp_v6_get_port(struct sock *sk, unsigned short snum)
+{
+	struct net *net = sock_net(sk);
+	unsigned int nulladdr_hash;
+
+	/* precompute partial secondary hash */
+	udp_sk(sk)->udp_portaddr_hash =
+		udp6_portaddr_hash(net, &inet6_sk(sk)->rcv_saddr, 0);
+
+	nulladdr_hash = udp6_portaddr_hash(net, &in6addr_any, snum);
+
+	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, nulladdr_hash);
+}
+
+/*
+ * Recompute the (receive address, local port) hash of @sk and pass it,
+ * truncated to 16 bits, to udp_lib_rehash().
+ */
+static void udp_v6_rehash(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	const struct in6_addr *bound = &inet6_sk(sk)->rcv_saddr;
+	unsigned int port = inet_sk(sk)->inet_num;
+	u16 new_hash;
+
+	new_hash = udp6_portaddr_hash(net, bound, port);
+	udp_lib_rehash(sk, new_hash);
+}
+
+/*
+ * Rate how well @sk matches an incoming packet.  Returns -1 when the
+ * socket cannot receive it (wrong namespace, port hash or family, or a
+ * mismatch on a field the socket has pinned); otherwise one point is
+ * scored for each pinned field that matches: remote port, local address,
+ * remote address and bound device.  @dport is not consulted; the local
+ * port is matched through @hnum.
+ */
+static inline int compute_score(struct sock *sk, struct net *net,
+				unsigned short hnum,
+				const struct in6_addr *saddr, __be16 sport,
+				const struct in6_addr *daddr, __be16 dport,
+				int dif)
+{
+	struct ipv6_pinfo *np;
+	struct inet_sock *inet;
+	int points = 0;
+
+	if (!net_eq(sock_net(sk), net) ||
+	    udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6)
+		return -1;
+
+	np = inet6_sk(sk);
+	inet = inet_sk(sk);
+
+	if (inet->inet_dport) {
+		if (inet->inet_dport != sport)
+			return -1;
+		points++;
+	}
+
+	if (!ipv6_addr_any(&np->rcv_saddr)) {
+		if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+			return -1;
+		points++;
+	}
+
+	if (!ipv6_addr_any(&np->daddr)) {
+		if (!ipv6_addr_equal(&np->daddr, saddr))
+			return -1;
+		points++;
+	}
+
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		points++;
+	}
+
+	return points;
+}
+
+/* Highest value compute_score2() can return: one point per optional field. */
+#define SCORE2_MAX (1 + 1 + 1)
+
+/*
+ * Variant of compute_score() for the secondary (address + port) hash:
+ * the socket's receive address must equal @daddr exactly, and points are
+ * scored only for remote port, remote address and bound device.
+ * Returns -1 when the socket does not match.
+ */
+static inline int compute_score2(struct sock *sk, struct net *net,
+				const struct in6_addr *saddr, __be16 sport,
+				const struct in6_addr *daddr, unsigned short hnum,
+				int dif)
+{
+	struct ipv6_pinfo *np;
+	struct inet_sock *inet;
+	int points = 0;
+
+	if (!net_eq(sock_net(sk), net) ||
+	    udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6)
+		return -1;
+
+	np = inet6_sk(sk);
+	inet = inet_sk(sk);
+
+	if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+		return -1;
+
+	if (inet->inet_dport) {
+		if (inet->inet_dport != sport)
+			return -1;
+		points++;
+	}
+
+	if (!ipv6_addr_any(&np->daddr)) {
+		if (!ipv6_addr_equal(&np->daddr, saddr))
+			return -1;
+		points++;
+	}
+
+	if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if != dif)
+			return -1;
+		points++;
+	}
+
+	return points;
+}
+
+
+/*
+ * Find the best-scoring socket on one secondary-hash chain.
+ *
+ * Called with rcu_read_lock() held.  Walks @hslot2, scoring every entry
+ * with compute_score2(); a score of SCORE2_MAX ends the walk early.
+ * Returns the winner with a reference taken on sk_refcnt, or NULL when
+ * nothing matched or the reference could not be taken.
+ */
+static struct sock *udp6_lib_lookup2(struct net *net,
+		const struct in6_addr *saddr, __be16 sport,
+		const struct in6_addr *daddr, unsigned int hnum, int dif,
+		struct udp_hslot *hslot2, unsigned int slot2)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	int score, badness;
+
+begin:
+	result = NULL;
+	badness = -1;
+	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+		score = compute_score2(sk, net, saddr, sport,
+				      daddr, hnum, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+			if (score == SCORE2_MAX)
+				goto exact_match;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot2)
+		goto begin;
+
+	if (result) {
+exact_match:
+		/*
+		 * Take the reference first, then score the socket again: if
+		 * the score dropped in the meantime, release it and restart.
+		 */
+		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+			result = NULL;
+		else if (unlikely(compute_score2(result, net, saddr, sport,
+				  daddr, hnum, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	return result;
+}
+
+/*
+ * Look up the socket in @udptable that best matches the given
+ * (saddr, sport) -> (daddr, dport) packet arriving on interface @dif.
+ *
+ * Takes and releases rcu_read_lock() itself.  Returns the socket with a
+ * reference held on sk_refcnt, or NULL when nothing matches.
+ */
+static struct sock *__udp6_lib_lookup(struct net *net,
+				      const struct in6_addr *saddr, __be16 sport,
+				      const struct in6_addr *daddr, __be16 dport,
+				      int dif, struct udp_table *udptable)
+{
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(dport);
+	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+	int score, badness;
+
+	rcu_read_lock();
+	/*
+	 * When the port-only chain holds more than 10 entries, try the
+	 * (address, port) hash instead: first with the exact destination
+	 * address, then with the wildcard address.  Each attempt is skipped
+	 * in favour of the port-only walk if that chain is the shorter one.
+	 */
+	if (hslot->count > 10) {
+		hash2 = udp6_portaddr_hash(net, daddr, hnum);
+		slot2 = hash2 & udptable->mask;
+		hslot2 = &udptable->hash2[slot2];
+		if (hslot->count < hslot2->count)
+			goto begin;
+
+		result = udp6_lib_lookup2(net, saddr, sport,
+					  daddr, hnum, dif,
+					  hslot2, slot2);
+		if (!result) {
+			hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
+			slot2 = hash2 & udptable->mask;
+			hslot2 = &udptable->hash2[slot2];
+			if (hslot->count < hslot2->count)
+				goto begin;
+
+			result = udp6_lib_lookup2(net, saddr, sport,
+						  &in6addr_any, hnum, dif,
+						  hslot2, slot2);
+		}
+		rcu_read_unlock();
+		return result;
+	}
+begin:
+	result = NULL;
+	badness = -1;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
+		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+		}
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
+
+	if (result) {
+		/*
+		 * Take the reference, then score again; restart the walk if
+		 * the socket no longer scores as well as it did.
+		 */
+		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
+					daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
+	return result;
+}
+
+/*
+ * Resolve the destination socket for @skb: use the socket returned by
+ * skb_steal_sock() when there is one, otherwise look it up from the
+ * packet's IPv6 addresses, the given ports and the incoming interface.
+ */
+static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
+					  __be16 sport, __be16 dport,
+					  struct udp_table *udptable)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct sock *sk = skb_steal_sock(skb);
+
+	if (unlikely(sk))
+		return sk;
+
+	return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev),
+				 &ip6h->saddr, sport,
+				 &ip6h->daddr, dport,
+				 inet6_iif(skb), udptable);
+}
+
+/*
+ * Exported lookup entry point: same as __udp6_lib_lookup() but always
+ * searches the global udp_table.
+ */
+struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+			     const struct in6_addr *daddr, __be16 dport, int dif)
+{
+	return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+
+
+/*
+ *	Receive one datagram on a UDPv6 (or UDP-Lite v6) socket.
+ *
+ *	If there is something queued we return it, otherwise
+ *	__skb_recv_datagram() decides whether to block (MSG_DONTWAIT is
+ *	added when @noblock is set).
+ *
+ *	Returns the number of bytes copied, or the full datagram payload
+ *	length when MSG_TRUNC is set in @flags, or a negative error.
+ *	MSG_ERRQUEUE and pending rxpmtu notifications are served by
+ *	ipv6_recv_error() / ipv6_recv_rxpmtu() instead.
+ */
+
+int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
+		  struct msghdr *msg, size_t len,
+		  int noblock, int flags, int *addr_len)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	unsigned int ulen;
+	int peeked;
+	int err;
+	int is_udplite = IS_UDPLITE(sk);
+	int is_udp4;
+	bool slow;
+
+	/* The reported address length is set up front, on every path. */
+	if (addr_len)
+		*addr_len=sizeof(struct sockaddr_in6);
+
+	if (flags & MSG_ERRQUEUE)
+		return ipv6_recv_error(sk, msg, len);
+
+	if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+		return ipv6_recv_rxpmtu(sk, msg, len);
+
+try_again:
+	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				  &peeked, &err);
+	if (!skb)
+		goto out;
+
+	/* Payload length: clamp the copy to it, or flag truncation. */
+	ulen = skb->len - sizeof(struct udphdr);
+	if (len > ulen)
+		len = ulen;
+	else if (len < ulen)
+		msg->msg_flags |= MSG_TRUNC;
+
+	/* The skb may be an IPv4 packet; stats, address and cmsgs differ. */
+	is_udp4 = (skb->protocol == htons(ETH_P_IP));
+
+	/*
+	 * If checksum is needed at all, try to do it while copying the
+	 * data.  If the data is truncated, or if we only want a partial
+	 * coverage checksum (UDP-Lite), do it before the copy.
+	 */
+
+	if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
+		if (udp_lib_checksum_complete(skb))
+			goto csum_copy_err;
+	}
+
+	if (skb_csum_unnecessary(skb))
+		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
+					      msg->msg_iov,len);
+	else {
+		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
+		/* -EINVAL from the copy-and-checksum path is handled as a bad checksum. */
+		if (err == -EINVAL)
+			goto csum_copy_err;
+	}
+	if (err)
+		goto out_free;
+
+	/* Count the datagram only when it was not merely peeked at. */
+	if (!peeked) {
+		if (is_udp4)
+			UDP_INC_STATS_USER(sock_net(sk),
+					UDP_MIB_INDATAGRAMS, is_udplite);
+		else
+			UDP6_INC_STATS_USER(sock_net(sk),
+					UDP_MIB_INDATAGRAMS, is_udplite);
+	}
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	/* Copy the address. */
+	if (msg->msg_name) {
+		struct sockaddr_in6 *sin6;
+
+		sin6 = (struct sockaddr_in6 *) msg->msg_name;
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = udp_hdr(skb)->source;
+		sin6->sin6_flowinfo = 0;
+		sin6->sin6_scope_id = 0;
+
+		/* IPv4 senders are reported as v4-mapped IPv6 addresses. */
+		if (is_udp4)
+			ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
+					       &sin6->sin6_addr);
+		else {
+			ipv6_addr_copy(&sin6->sin6_addr,
+				       &ipv6_hdr(skb)->saddr);
+			/* Link-local sources carry the incoming interface as scope id. */
+			if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+				sin6->sin6_scope_id = IP6CB(skb)->iif;
+		}
+
+	}
+	/* Ancillary data, if the socket asked for any. */
+	if (is_udp4) {
+		if (inet->cmsg_flags)
+			ip_cmsg_recv(msg, skb);
+	} else {
+		if (np->rxopt.all)
+			datagram_recv_ctl(sk, msg, skb);
+	}
+
+	err = len;
+	if (flags & MSG_TRUNC)
+		err = ulen;
+
+out_free:
+	skb_free_datagram_locked(sk, skb);
+out:
+	return err;
+
+csum_copy_err:
+	/* Bad checksum: drop the datagram and count it when the kill succeeds. */
+	slow = lock_sock_fast(sk);
+	if (!skb_kill_datagram(sk, skb, flags)) {
+		if (is_udp4)
+			UDP_INC_STATS_USER(sock_net(sk),
+					UDP_MIB_INERRORS, is_udplite);
+		else
+			UDP6_INC_STATS_USER(sock_net(sk),
+					UDP_MIB_INERRORS, is_udplite);
+	}
+	unlock_sock_fast(sk, slow);
+
+	if (noblock)
+		return -EAGAIN;
+
+	/* starting over for a new packet */
+	msg->msg_flags &= ~MSG_TRUNC;
+	goto try_again;
+}
+
+/*
+ * ICMPv6 error handler for UDP-style protocols.
+ *
+ * @skb->data holds an IPv6 header with a UDP header at @offset.  The
+ * socket is looked up with that header's destination address/port in the
+ * lookup's source-side arguments and its source address/port in the
+ * destination-side ones.  The converted error is stored in sk_err and
+ * reported, unless the socket has recverr off and either the conversion
+ * returned 0 or the socket is not in TCP_ESTABLISHED state.  With
+ * recverr on, the error is also queued through ipv6_icmp_error().
+ */
+void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		    u8 type, u8 code, int offset, __be32 info,
+		    struct udp_table *udptable)
+{
+	const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
+	struct udphdr *uh = (struct udphdr *)(skb->data + offset);
+	struct ipv6_pinfo *np;
+	struct sock *sk;
+	int err;
+
+	sk = __udp6_lib_lookup(dev_net(skb->dev),
+			       &ip6h->daddr, uh->dest,
+			       &ip6h->saddr, uh->source,
+			       inet6_iif(skb), udptable);
+	if (!sk)
+		return;
+
+	np = inet6_sk(sk);
+
+	/* icmpv6_err_convert() is always called: it fills in err. */
+	if (!icmpv6_err_convert(type, code, &err) && !np->recverr)
+		goto put;
+
+	if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
+		goto put;
+
+	if (np->recverr)
+		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info),
+				(u8 *)(uh + 1));
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+put:
+	/* Drop the reference taken by the lookup. */
+	sock_put(sk);
+}
+
+/* UDP error hook: __udp6_lib_err() bound to the global udp_table. */
+static __inline__ void udpv6_err(struct sk_buff *skb,
+				 struct inet6_skb_parm *opt, u8 type,
+				 u8 code, int offset, __be32 info     )
+{
+	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
+}
+
+/*
+ * Queue @skb on @sk's receive queue after policy, UDP-Lite coverage and
+ * (when a socket filter is attached) checksum checks.
+ *
+ * Returns 0 when the skb was queued.  Returns -1 when it was dropped; in
+ * that case the skb is freed here and UDP_MIB_INERRORS is incremented.
+ */
+int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int rc;
+	int is_udplite = IS_UDPLITE(sk);
+
+	/* Only sockets with a remote address set record the rx hash. */
+	if (!ipv6_addr_any(&inet6_sk(sk)->daddr))
+		sock_rps_save_rxhash(sk, skb->rxhash);
+
+	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto drop;
+
+	/*
+	 * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
+	 */
+	if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
+
+		if (up->pcrlen == 0) {          /* full coverage was set  */
+			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: partial coverage"
+				" %d while full coverage %d requested\n",
+				UDP_SKB_CB(skb)->cscov, skb->len);
+			goto drop;
+		}
+		if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
+			LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: coverage %d "
+						    "too small, need min %d\n",
+				       UDP_SKB_CB(skb)->cscov, up->pcrlen);
+			goto drop;
+		}
+	}
+
+	/* With a filter attached, the checksum is verified before queueing. */
+	if (rcu_dereference_raw(sk->sk_filter)) {
+		if (udp_lib_checksum_complete(skb))
+			goto drop;
+	}
+
+	if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) {
+		/* Note that an ENOMEM error is charged twice */
+		if (rc == -ENOMEM)
+			UDP6_INC_STATS_BH(sock_net(sk),
+					UDP_MIB_RCVBUFERRORS, is_udplite);
+		/* This path skips the sk_drops increment done at "drop". */
+		goto drop_no_sk_drops_inc;
+	}
+
+	return 0;
+drop:
+	atomic_inc(&sk->sk_drops);
+drop_no_sk_drops_inc:
+	UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ * Starting at @sk, return the next socket on the chain that should
+ * receive a multicast datagram with the given local/remote ports and
+ * addresses arriving on interface @dif, or NULL when the chain ends.
+ * A socket qualifies when it is in @net, is a PF_INET6 socket hashed on
+ * the local port, every field it has pinned (remote port, remote
+ * address, bound device, local address) matches, and inet6_mc_check()
+ * accepts the address pair.
+ */
+static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
+				      __be16 loc_port, const struct in6_addr *loc_addr,
+				      __be16 rmt_port, const struct in6_addr *rmt_addr,
+				      int dif)
+{
+	struct hlist_nulls_node *node;
+	struct sock *s = sk;
+	unsigned short num = ntohs(loc_port);
+
+	sk_nulls_for_each_from(s, node) {
+		struct inet_sock *inet;
+		struct ipv6_pinfo *np;
+
+		if (!net_eq(sock_net(s), net))
+			continue;
+
+		if (udp_sk(s)->udp_port_hash != num ||
+		    s->sk_family != PF_INET6)
+			continue;
+
+		inet = inet_sk(s);
+		np = inet6_sk(s);
+
+		if (inet->inet_dport && inet->inet_dport != rmt_port)
+			continue;
+
+		if (!ipv6_addr_any(&np->daddr) &&
+		    !ipv6_addr_equal(&np->daddr, rmt_addr))
+			continue;
+
+		if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)
+			continue;
+
+		if (!ipv6_addr_any(&np->rcv_saddr) &&
+		    !ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+			continue;
+
+		if (!inet6_mc_check(s, loc_addr, rmt_addr))
+			continue;
+
+		return s;
+	}
+
+	return NULL;
+}
+
+/*
+ * Deliver @skb to the first @count sockets in @stack.
+ *
+ * The socket at index @final receives @skb itself; every other socket
+ * receives a clone.  Passing an index that is never reached (the caller
+ * in this file uses ~0) therefore leaves @skb unconsumed.  A failed
+ * clone, a full receive queue or a full backlog is counted as a drop on
+ * that socket (sk_drops, RCVBUFERRORS and INERRORS).
+ */
+static void flush_stack(struct sock **stack, unsigned int count,
+			struct sk_buff *skb, unsigned int final)
+{
+	unsigned int i;
+	struct sock *sk;
+	struct sk_buff *skb1;
+
+	for (i = 0; i < count; i++) {
+		skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+		sk = stack[i];
+		if (skb1) {
+			if (sk_rcvqueues_full(sk, skb1)) {
+				kfree_skb(skb1);
+				goto drop;
+			}
+			bh_lock_sock(sk);
+			/* Queue directly unless a user context owns the socket. */
+			if (!sock_owned_by_user(sk))
+				udpv6_queue_rcv_skb(sk, skb1);
+			else if (sk_add_backlog(sk, skb1)) {
+				kfree_skb(skb1);
+				bh_unlock_sock(sk);
+				goto drop;
+			}
+			bh_unlock_sock(sk);
+			continue;
+		}
+		/* Reached by the gotos above, or by fall-through when skb1 is NULL. */
+drop:
+		atomic_inc(&sk->sk_drops);
+		UDP6_INC_STATS_BH(sock_net(sk),
+				UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk));
+		UDP6_INC_STATS_BH(sock_net(sk),
+				UDP_MIB_INERRORS, IS_UDPLITE(sk));
+	}
+}
+/*
+ * Deliver a multicast datagram to every matching socket on the hash
+ * slot of its destination port.  Consumes @skb and always returns 0.
+ *
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
+ */
+static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+		const struct in6_addr *saddr, const struct in6_addr *daddr,
+		struct udp_table *udptable)
+{
+	/* Matching sockets are batched in a 256-byte on-stack array. */
+	struct sock *sk, *stack[256 / sizeof(struct sock *)];
+	const struct udphdr *uh = udp_hdr(skb);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
+	int dif;
+	unsigned int i, count = 0;
+
+	spin_lock(&hslot->lock);
+	sk = sk_nulls_head(&hslot->head);
+	dif = inet6_iif(skb);
+	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
+	while (sk) {
+		stack[count++] = sk;
+		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
+				       uh->source, saddr, dif);
+		/*
+		 * Array full and more sockets to come: flush the batch now,
+		 * still under the slot lock, handing out clones only (~0
+		 * never matches an index, so skb itself is kept).
+		 */
+		if (unlikely(count == ARRAY_SIZE(stack))) {
+			if (!sk)
+				break;
+			flush_stack(stack, count, skb, ~0);
+			count = 0;
+		}
+	}
+	/*
+	 * before releasing the lock, we must take reference on sockets
+	 */
+	for (i = 0; i < count; i++)
+		sock_hold(stack[i]);
+
+	spin_unlock(&hslot->lock);
+
+	if (count) {
+		/* The last socket of the final batch gets the original skb. */
+		flush_stack(stack, count, skb, count - 1);
+
+		for (i = 0; i < count; i++)
+			sock_put(stack[i]);
+	} else {
+		kfree_skb(skb);
+	}
+	return 0;
+}
+
+/*
+ * Prepare checksum state for a received UDP / UDP-Lite datagram.
+ *
+ * Resets the coverage fields in the skb control block, lets
+ * udplite_checksum_init() adjust them for UDP-Lite, and rejects a zero
+ * checksum field.  Returns 0 on success; otherwise the error from
+ * udplite_checksum_init() or 1 for a zero checksum.
+ */
+static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh,
+				 int proto)
+{
+	int err;
+
+	UDP_SKB_CB(skb)->partial_cov = 0;
+	UDP_SKB_CB(skb)->cscov = skb->len;
+
+	if (proto == IPPROTO_UDPLITE) {
+		err = udplite_checksum_init(skb, uh);
+		if (err)
+			return err;
+	}
+
+	if (uh->check == 0) {
+		/* RFC 2460 section 8.1 says that we SHOULD log
+		   this error. Well, it is reasonable.
+		 */
+		LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n");
+		return 1;
+	}
+	/*
+	 * A CHECKSUM_COMPLETE value that folds to zero with the
+	 * pseudo-header means the checksum has already been verified.
+	 */
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
+	    !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
+			     skb->len, proto, skb->csum))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* Otherwise seed skb->csum with the pseudo-header sum for a later check. */
+	if (!skb_csum_unnecessary(skb))
+		skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+							 &ipv6_hdr(skb)->daddr,
+							 skb->len, proto, 0));
+
+	return 0;
+}
+
+/*
+ * Common receive path for UDP and UDP-Lite over IPv6.
+ *
+ * Validates the header and length, initialises checksum state, then
+ * either fans the datagram out to multicast listeners or delivers it to
+ * the single matching socket in @udptable.  With no matching socket an
+ * ICMPv6 port-unreachable is sent.  The skb is consumed on every path
+ * and the function always returns 0.
+ */
+int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+		   int proto)
+{
+	struct net *net = dev_net(skb->dev);
+	struct sock *sk;
+	struct udphdr *uh;
+	const struct in6_addr *saddr, *daddr;
+	u32 ulen = 0;
+
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		goto discard;
+
+	saddr = &ipv6_hdr(skb)->saddr;
+	daddr = &ipv6_hdr(skb)->daddr;
+	uh = udp_hdr(skb);
+
+	ulen = ntohs(uh->len);
+	if (ulen > skb->len)
+		goto short_packet;
+
+	if (proto == IPPROTO_UDP) {
+		/* UDP validates ulen. */
+
+		/* Check for jumbo payload */
+		if (ulen == 0)
+			ulen = skb->len;
+
+		if (ulen < sizeof(*uh))
+			goto short_packet;
+
+		if (ulen < skb->len) {
+			if (pskb_trim_rcsum(skb, ulen))
+				goto short_packet;
+			/* Header pointers are re-read after the trim. */
+			saddr = &ipv6_hdr(skb)->saddr;
+			daddr = &ipv6_hdr(skb)->daddr;
+			uh = udp_hdr(skb);
+		}
+	}
+
+	if (udp6_csum_init(skb, uh, proto))
+		goto discard;
+
+	/*
+	 *	Multicast receive code
+	 */
+	if (ipv6_addr_is_multicast(daddr))
+		return __udp6_lib_mcast_deliver(net, skb,
+				saddr, daddr, udptable);
+
+	/* Unicast */
+
+	/*
+	 * check socket cache ... must talk to Alan about his plans
+	 * for sock caches... i'll skip this for now.
+	 */
+	sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+
+	if (sk == NULL) {
+		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto discard;
+
+		/* Only answer with ICMPv6 when the checksum is good. */
+		if (udp_lib_checksum_complete(skb))
+			goto discard;
+		UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS,
+				proto == IPPROTO_UDPLITE);
+
+		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+		kfree_skb(skb);
+		return 0;
+	}
+
+	/* deliver */
+
+	if (sk_rcvqueues_full(sk, skb)) {
+		sock_put(sk);
+		goto discard;
+	}
+	bh_lock_sock(sk);
+	/* Queue directly unless a user context owns the socket; then backlog it. */
+	if (!sock_owned_by_user(sk))
+		udpv6_queue_rcv_skb(sk, skb);
+	else if (sk_add_backlog(sk, skb)) {
+		atomic_inc(&sk->sk_drops);
+		bh_unlock_sock(sk);
+		sock_put(sk);
+		goto discard;
+	}
+	bh_unlock_sock(sk);
+	sock_put(sk);
+	return 0;
+
+short_packet:
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
+		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
+		       saddr,
+		       ntohs(uh->source),
+		       ulen,
+		       skb->len,
+		       daddr,
+		       ntohs(uh->dest));
+
+discard:
+	UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+	kfree_skb(skb);
+	return 0;
+}
+
+/* UDP receive hook: __udp6_lib_rcv() bound to udp_table and IPPROTO_UDP. */
+static __inline__ int udpv6_rcv(struct sk_buff *skb)
+{
+	return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+}
+
+/*
+ * Discard everything queued by a corked send and leave the corked state.
+ * The caller holds the socket lock.  Frames pending as AF_INET are handed
+ * to the IPv4 flush routine; any other pending state is reset here
+ * before the IPv6 frames are flushed.
+ */
+static void udp_v6_flush_pending_frames(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+
+	if (up->pending == AF_INET) {
+		udp_flush_pending_frames(sk);
+		return;
+	}
+
+	if (!up->pending)
+		return;
+
+	up->len = 0;
+	up->pending = 0;
+	ip6_flush_pending_frames(sk);
+}
+
+/**
+ * 	udp6_hwcsum_outgoing  -  handle outgoing HW checksumming
+ * 	@sk: 	socket we are sending on
+ * 	@skb: 	sk_buff containing the filled-in UDP header
+ * 	        (checksum field must be zeroed out)
+ * 	@saddr:	source address used for the pseudo-header
+ * 	@daddr:	destination address used for the pseudo-header
+ * 	@len:	length passed to csum_ipv6_magic() for the pseudo-header
+ *
+ * 	With a single buffer on the write queue only the pseudo-header sum is
+ * 	stored and csum_start/csum_offset are set; otherwise the checksum is
+ * 	computed in software over every queued buffer.
+ */
+static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
+				 const struct in6_addr *saddr,
+				 const struct in6_addr *daddr, int len)
+{
+	unsigned int offset;
+	struct udphdr *uh = udp_hdr(skb);
+	__wsum csum = 0;
+
+	if (skb_queue_len(&sk->sk_write_queue) == 1) {
+		/* Only one fragment on the socket.  */
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0);
+	} else {
+		/*
+		 * HW-checksum won't work as there are two or more
+		 * fragments on the socket so that all csums of sk_buffs
+		 * should be together
+		 */
+		offset = skb_transport_offset(skb);
+		skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+		skb->ip_summed = CHECKSUM_NONE;
+
+		/* Note: skb is reused as the iterator; uh was captured above. */
+		skb_queue_walk(&sk->sk_write_queue, skb) {
+			csum = csum_add(csum, skb->csum);
+		}
+
+		uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP,
+					    csum);
+		/* A computed checksum of 0 is replaced by CSUM_MANGLED_0. */
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	}
+}
+
+/*
+ *	Sending
+ *
+ *	Fill in the UDP header of the first buffer on the write queue,
+ *	compute the checksum, and hand the queue to
+ *	ip6_push_pending_frames().  The corked length and pending state are
+ *	cleared on every path.  Returns 0 or a negative error; -ENOBUFS is
+ *	reported as 0 when the socket does not have recverr set.
+ */
+
+static int udp_v6_push_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct udphdr *uh;
+	struct udp_sock  *up = udp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
+	int err = 0;
+	int is_udplite = IS_UDPLITE(sk);
+	__wsum csum = 0;
+
+	/* Grab the skbuff where UDP header space exists. */
+	if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
+		goto out;
+
+	/*
+	 * Create a UDP header
+	 */
+	uh = udp_hdr(skb);
+	uh->source = fl6->fl6_sport;
+	uh->dest = fl6->fl6_dport;
+	uh->len = htons(up->len);
+	uh->check = 0;
+
+	if (is_udplite)
+		csum = udplite_csum_outgoing(sk, skb);
+	else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+		/* The helper writes uh->check itself, so skip the code below. */
+		udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr,
+				     up->len);
+		goto send;
+	} else
+		csum = udp_csum_outgoing(sk, skb);
+
+	/* add protocol-dependent pseudo-header */
+	uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
+				    up->len, fl6->flowi6_proto, csum);
+	if (uh->check == 0)
+		uh->check = CSUM_MANGLED_0;
+
+send:
+	err = ip6_push_pending_frames(sk);
+	if (err) {
+		/* -ENOBUFS is only counted, not returned, unless recverr is set. */
+		if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
+			UDP6_INC_STATS_USER(sock_net(sk),
+					    UDP_MIB_SNDBUFERRORS, is_udplite);
+			err = 0;
+		}
+	} else
+		UDP6_INC_STATS_USER(sock_net(sk),
+				    UDP_MIB_OUTDATAGRAMS, is_udplite);
+out:
+	up->len = 0;
+	up->pending = 0;
+	return err;
+}
+
+int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
+		  struct msghdr *msg, size_t len)
+{
+	struct ipv6_txoptions opt_space;
+	struct udp_sock *up = udp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
+	struct in6_addr *daddr, *final_p, final;
+	struct ipv6_txoptions *opt = NULL;
+	struct ip6_flowlabel *flowlabel = NULL;
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	int addr_len = msg->msg_namelen;
+	int ulen = len;
+	int hlimit = -1;
+	int tclass = -1;
+	int dontfrag = -1;
+	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+	int err;
+	int connected = 0;
+	int is_udplite = IS_UDPLITE(sk);
+	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+
+	/* destination address check */
+	if (sin6) {
+		if (addr_len < offsetof(struct sockaddr, sa_data))
+			return -EINVAL;
+
+		switch (sin6->sin6_family) {
+		case AF_INET6:
+			if (addr_len < SIN6_LEN_RFC2133)
+				return -EINVAL;
+			daddr = &sin6->sin6_addr;
+			break;
+		case AF_INET:
+			goto do_udp_sendmsg;
+		case AF_UNSPEC:
+			msg->msg_name = sin6 = NULL;
+			msg->msg_namelen = addr_len = 0;
+			daddr = NULL;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else if (!up->pending) {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = &np->daddr;
+	} else
+		daddr = NULL;
+
+	if (daddr) {
+		if (ipv6_addr_v4mapped(daddr)) {
+			struct sockaddr_in sin;
+			sin.sin_family = AF_INET;
+			sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport;
+			sin.sin_addr.s_addr = daddr->s6_addr32[3];
+			msg->msg_name = &sin;
+			msg->msg_namelen = sizeof(sin);
+do_udp_sendmsg:
+			if (__ipv6_only_sock(sk))
+				return -ENETUNREACH;
+			return udp_sendmsg(iocb, sk, msg, len);
+		}
+	}
+
+	if (up->pending == AF_INET)
+		return udp_sendmsg(iocb, sk, msg, len);
+
+	/* Rough check on arithmetic overflow,
+	   better check is made in ip6_append_data().
+	   */
+	if (len > INT_MAX - sizeof(struct udphdr))
+		return -EMSGSIZE;
+
+	if (up->pending) {
+		/*
+		 * There are pending frames.
+		 * The socket lock must be held while it's corked.
+		 */
+		lock_sock(sk);
+		if (likely(up->pending)) {
+			if (unlikely(up->pending != AF_INET6)) {
+				release_sock(sk);
+				return -EAFNOSUPPORT;
+			}
+			dst = NULL;
+			goto do_append_data;
+		}
+		release_sock(sk);
+	}
+	ulen += sizeof(struct udphdr);
+
+	memset(&fl6, 0, sizeof(fl6));
+
+	if (sin6) {
+		if (sin6->sin6_port == 0)
+			return -EINVAL;
+
+		fl6.fl6_dport = sin6->sin6_port;
+		daddr = &sin6->sin6_addr;
+
+		if (np->sndflow) {
+			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+				if (flowlabel == NULL)
+					return -EINVAL;
+				daddr = &flowlabel->dst;
+			}
+		}
+
+		/*
+		 * Otherwise it will be difficult to maintain
+		 * sk->sk_dst_cache.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED &&
+		    ipv6_addr_equal(daddr, &np->daddr))
+			daddr = &np->daddr;
+
+		if (addr_len >= sizeof(struct sockaddr_in6) &&
+		    sin6->sin6_scope_id &&
+		    ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL)
+			fl6.flowi6_oif = sin6->sin6_scope_id;
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+
+		fl6.fl6_dport = inet->inet_dport;
+		daddr = &np->daddr;
+		fl6.flowlabel = np->flow_label;
+		connected = 1;
+	}
+
+	if (!fl6.flowi6_oif)
+		fl6.flowi6_oif = sk->sk_bound_dev_if;
+
+	if (!fl6.flowi6_oif)
+		fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
+	fl6.flowi6_mark = sk->sk_mark;
+
+	if (msg->msg_controllen) {
+		opt = &opt_space;
+		memset(opt, 0, sizeof(struct ipv6_txoptions));
+		opt->tot_len = sizeof(*opt);
+
+		err = datagram_send_ctl(sock_net(sk), msg, &fl6, opt, &hlimit,
+					&tclass, &dontfrag);
+		if (err < 0) {
+			fl6_sock_release(flowlabel);
+			return err;
+		}
+		if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+			if (flowlabel == NULL)
+				return -EINVAL;
+		}
+		if (!(opt->opt_nflen|opt->opt_flen))
+			opt = NULL;
+		connected = 0;
+	}
+	if (opt == NULL)
+		opt = np->opt;
+	if (flowlabel)
+		opt = fl6_merge_options(&opt_space, flowlabel, opt);
+	opt = ipv6_fixup_options(&opt_space, opt);
+
+	fl6.flowi6_proto = sk->sk_protocol;
+	if (!ipv6_addr_any(daddr))
+		ipv6_addr_copy(&fl6.daddr, daddr);
+	else
+		fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+	if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
+		ipv6_addr_copy(&fl6.saddr, &np->saddr);
+	fl6.fl6_sport = inet->inet_sport;
+
+	final_p = fl6_update_dst(&fl6, opt, &final);
+	if (final_p)
+		connected = 0;
+
+	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
+		fl6.flowi6_oif = np->mcast_oif;
+		connected = 0;
+	}
+
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true);
+	if (IS_ERR(dst)) {
+		err = PTR_ERR(dst);
+		dst = NULL;
+		goto out;
+	}
+
+	if (hlimit < 0) {
+		if (ipv6_addr_is_multicast(&fl6.daddr))
+			hlimit = np->mcast_hops;
+		else
+			hlimit = np->hop_limit;
+		if (hlimit < 0)
+			hlimit = ip6_dst_hoplimit(dst);
+	}
+
+	if (tclass < 0)
+		tclass = np->tclass;
+
+	if (dontfrag < 0)
+		dontfrag = np->dontfrag;
+
+	if (msg->msg_flags&MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	lock_sock(sk);
+	if (unlikely(up->pending)) {
+		/* The socket is already corked while preparing it. */
+		/* ... which is an evident application bug. --ANK */
+		release_sock(sk);
+
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	up->pending = AF_INET6;
+
+do_append_data:
+	up->len += ulen;
+	getfrag  =  is_udplite ?  udplite_getfrag : ip_generic_getfrag;
+	err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen,
+		sizeof(struct udphdr), hlimit, tclass, opt, &fl6,
+		(struct rt6_info*)dst,
+		corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag);
+	if (err)
+		udp_v6_flush_pending_frames(sk);
+	else if (!corkreq)
+		err = udp_v6_push_pending_frames(sk);
+	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
+		up->pending = 0;
+
+	if (dst) {
+		if (connected) {
+			ip6_dst_store(sk, dst,
+				      ipv6_addr_equal(&fl6.daddr, &np->daddr) ?
+				      &np->daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+				      ipv6_addr_equal(&fl6.saddr, &np->saddr) ?
+				      &np->saddr :
+#endif
+				      NULL);
+		} else {
+			dst_release(dst);
+		}
+		dst = NULL;
+	}
+
+	if (err > 0)
+		err = np->recverr ? net_xmit_errno(err) : 0;
+	release_sock(sk);
+out:
+	dst_release(dst);
+	fl6_sock_release(flowlabel);
+	if (!err)
+		return len;
+	/*
+	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
+	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
+	 * we don't have a good statistic (IpOutDiscards but it can be too many
+	 * things).  We could add another new stat but at least for now that
+	 * seems like overkill.
+	 */
+	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		UDP6_INC_STATS_USER(sock_net(sk),
+				UDP_MIB_SNDBUFERRORS, is_udplite);
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(dst);
+	if (!(msg->msg_flags&MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+
+/*
+ * Destroy hook for UDPv6 sockets: calls udp_v6_flush_pending_frames() with
+ * the socket lock held, then hands the socket to inet6_destroy_sock().
+ */
+void udpv6_destroy_sock(struct sock *sk)
+{
+	lock_sock(sk);
+	udp_v6_flush_pending_frames(sk);
+	release_sock(sk);
+
+	inet6_destroy_sock(sk);
+}
+
+/*
+ *	Socket option code for UDP
+ */
+int udpv6_setsockopt(struct sock *sk, int level, int optname,
+		     char __user *optval, unsigned int optlen)
+{
+	/* Levels other than UDP / UDP-Lite are forwarded to the IPv6 layer. */
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return ipv6_setsockopt(sk, level, optname, optval, optlen);
+
+	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+				  udp_v6_push_pending_frames);
+}
+
+#ifdef CONFIG_COMPAT
+/* CONFIG_COMPAT variant of udpv6_setsockopt(). */
+int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, unsigned int optlen)
+{
+	/* Levels other than UDP / UDP-Lite go to the compat IPv6 handler. */
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return compat_ipv6_setsockopt(sk, level, optname, optval,
+					      optlen);
+
+	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+				  udp_v6_push_pending_frames);
+}
+#endif
+
+/* Read a socket option: UDP / UDP-Lite levels here, the rest via IPv6. */
+int udpv6_getsockopt(struct sock *sk, int level, int optname,
+		     char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return ipv6_getsockopt(sk, level, optname, optval, optlen);
+
+	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+}
+
+#ifdef CONFIG_COMPAT
+/* CONFIG_COMPAT variant of udpv6_getsockopt(). */
+int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	if (level != SOL_UDP && level != SOL_UDPLITE)
+		return compat_ipv6_getsockopt(sk, level, optname, optval,
+					      optlen);
+
+	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
+}
+#endif
+
+/*
+ * gso_send_check hook for UDPv6: stores ~csum_ipv6_magic() (computed over
+ * the IPv6 source/destination addresses, skb->len and IPPROTO_UDP) in the
+ * UDP checksum field and marks the skb CHECKSUM_PARTIAL.
+ *
+ * Returns 0, or -EINVAL when the UDP header cannot be pulled.
+ */
+static int udp6_ufo_send_check(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6;
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		return -EINVAL;
+
+	/* Header pointers are looked up only after the pull above. */
+	udph = udp_hdr(skb);
+	ip6 = ipv6_hdr(skb);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	udph->check = ~csum_ipv6_magic(&ip6->saddr, &ip6->daddr, skb->len,
+				       IPPROTO_UDP, 0);
+	return 0;
+}
+
+/*
+ * gso_segment hook for UDPv6 (UFO).
+ *
+ * Returns:
+ *   ERR_PTR(-EINVAL)  if skb->len <= gso_size, if the gso_type check on the
+ *                     skb_gso_ok() path fails, or if growing the headroom
+ *                     with pskb_expand_head() fails;
+ *   NULL              if skb_gso_ok() accepts the skb for the given features
+ *                     (only gso_segs is recomputed in that case);
+ *   otherwise the result of skb_segment() after the checksum has been
+ *   completed in software and a fragment header has been inserted.
+ */
+static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	unsigned int mss;
+	unsigned int unfrag_ip6hlen, unfrag_len;
+	struct frag_hdr *fptr;
+	u8 *mac_start, *prevhdr;
+	u8 nexthdr;
+	u8 frag_hdr_sz = sizeof(struct frag_hdr);
+	int offset;
+	__wsum csum;
+	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(skb->len <= mss))
+		goto out;
+
+	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+		/* Packet is from an untrusted source, reset gso_segs. */
+		int type = skb_shinfo(skb)->gso_type;
+
+		/* Only SKB_GSO_UDP, optionally with SKB_GSO_DODGY, is valid. */
+		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+			     !(type & (SKB_GSO_UDP))))
+			goto out;
+
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
+
+		segs = NULL;
+		goto out;
+	}
+
+	/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
+	 * do checksum of UDP packets sent as multiple IP fragments.
+	 */
+	offset = skb_checksum_start_offset(skb);
+	csum = skb_checksum(skb, offset, skb->len- offset, 0);
+	offset += skb->csum_offset;
+	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Check if there is enough headroom to insert fragment header. */
+	if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) &&
+	    pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC))
+		goto out;
+
+	/* Find the unfragmentable header and shift it left by frag_hdr_sz
+	 * bytes to insert fragment header.
+	 */
+	unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+	/* Save the old next-header value; it goes into the fragment header. */
+	nexthdr = *prevhdr;
+	*prevhdr = NEXTHDR_FRAGMENT;
+	/* Bytes to move: MAC header up to the end of the unfragmentable part. */
+	unfrag_len = skb_network_header(skb) - skb_mac_header(skb) +
+		     unfrag_ip6hlen;
+	mac_start = skb_mac_header(skb);
+	memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len);
+
+	skb->mac_header -= frag_hdr_sz;
+	skb->network_header -= frag_hdr_sz;
+
+	/* The gap opened by the move is where the fragment header lives. */
+	fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
+	fptr->nexthdr = nexthdr;
+	fptr->reserved = 0;
+	ipv6_select_ident(fptr,
+			  rt ? &rt->rt6i_dst.addr : &ipv6_hdr(skb)->daddr);
+
+	/* Fragment the skb. ipv6 header and the remaining fields of the
+	 * fragment header are updated in ipv6_gso_segment()
+	 */
+	segs = skb_segment(skb, features);
+
+out:
+	return segs;
+}
+
+/*
+ * IPv6 protocol handler table for UDP; registered for IPPROTO_UDP by
+ * udpv6_init().  Wires up receive, error and the two UFO/GSO callbacks.
+ */
+static const struct inet6_protocol udpv6_protocol = {
+	.handler	=	udpv6_rcv,
+	.err_handler	=	udpv6_err,
+	.gso_send_check =	udp6_ufo_send_check,
+	.gso_segment	=	udp6_ufo_fragment,
+	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Print one socket as a single line of the udp6 seq_file: bucket number,
+ * local and remote address:port, state, tx/rx memory in use, uid, inode,
+ * reference count, socket pointer and drop counter.  Columns that do not
+ * apply to UDP are printed as constant zeros.
+ */
+static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	struct ipv6_pinfo *np = inet6_sk(sp);
+	const struct in6_addr *src = &np->rcv_saddr;
+	const struct in6_addr *dest = &np->daddr;
+	__u16 srcp = ntohs(inet->inet_sport);
+	__u16 destp = ntohs(inet->inet_dport);
+
+	seq_printf(seq,
+		   "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
+		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
+		   bucket,
+		   src->s6_addr32[0], src->s6_addr32[1],
+		   src->s6_addr32[2], src->s6_addr32[3], srcp,
+		   dest->s6_addr32[0], dest->s6_addr32[1],
+		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
+		   sp->sk_state,
+		   sk_wmem_alloc_get(sp),
+		   sk_rmem_alloc_get(sp),
+		   0, 0L, 0,
+		   sock_i_uid(sp), 0,
+		   sock_i_ino(sp),
+		   atomic_read(&sp->sk_refcnt), sp,
+		   atomic_read(&sp->sk_drops));
+}
+
+/*
+ * seq_file ->show callback: prints the column header for SEQ_START_TOKEN,
+ * otherwise one socket line tagged with the iterator's current bucket.
+ * Always returns 0.
+ */
+int udp6_seq_show(struct seq_file *seq, void *v)
+{
+	struct udp_iter_state *iter;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			   "  sl  "
+			   "local_address                         "
+			   "remote_address                        "
+			   "st tx_queue rx_queue tr tm->when retrnsmt"
+			   "   uid  timeout inode ref pointer drops\n");
+		return 0;
+	}
+
+	iter = (struct udp_iter_state *)seq->private;
+	udp6_sock_seq_show(seq, v, iter->bucket);
+	return 0;
+}
+
+/*
+ * Descriptor passed to udp_proc_register()/udp_proc_unregister() for the
+ * "udp6" entry: AF_INET6 sockets from udp_table, shown by udp6_seq_show().
+ */
+static struct udp_seq_afinfo udp6_seq_afinfo = {
+	.name		= "udp6",
+	.family		= AF_INET6,
+	.udp_table	= &udp_table,
+	.seq_fops	= {
+		.owner	=	THIS_MODULE,
+	},
+	.seq_ops	= {
+		.show		= udp6_seq_show,
+	},
+};
+
+/* Register udp6_seq_afinfo for @net; returns udp_proc_register()'s result. */
+int __net_init udp6_proc_init(struct net *net)
+{
+	return udp_proc_register(net, &udp6_seq_afinfo);
+}
+
+/* Unregister udp6_seq_afinfo for @net (counterpart of udp6_proc_init()). */
+void udp6_proc_exit(struct net *net)
+{
+	udp_proc_unregister(net, &udp6_seq_afinfo);
+}
+#endif /* CONFIG_PROC_FS */
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Protocol operations for UDP over IPv6.  The IPv6-specific callbacks live
+ * in this file (udpv6_*, udp_v6_*); the remainder are shared udp_* / udp_lib_*
+ * helpers.  Sockets are hashed in udp_table and sized as struct udp6_sock.
+ */
+struct proto udpv6_prot = {
+	.name		   = "UDPv6",
+	.owner		   = THIS_MODULE,
+	.close		   = udp_lib_close,
+	.connect	   = ip6_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = udp_ioctl,
+	.destroy	   = udpv6_destroy_sock,
+	.setsockopt	   = udpv6_setsockopt,
+	.getsockopt	   = udpv6_getsockopt,
+	.sendmsg	   = udpv6_sendmsg,
+	.recvmsg	   = udpv6_recvmsg,
+	.backlog_rcv	   = udpv6_queue_rcv_skb,
+	.hash		   = udp_lib_hash,
+	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v6_rehash,
+	.get_port	   = udp_v6_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
+	.obj_size	   = sizeof(struct udp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
+	.h.udp_table	   = &udp_table,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_udpv6_setsockopt,
+	.compat_getsockopt = compat_udpv6_getsockopt,
+#endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+};
+
+/* Protocol switch entry tying SOCK_DGRAM / IPPROTO_UDP to udpv6_prot. */
+static struct inet_protosw udpv6_protosw = {
+	.type		= SOCK_DGRAM,
+	.protocol	= IPPROTO_UDP,
+	.prot		= &udpv6_prot,
+	.ops		= &inet6_dgram_ops,
+	.no_check	= UDP_CSUM_DEFAULT,
+	.flags		= INET_PROTOSW_PERMANENT,
+};
+
+
+/*
+ * Register the UDPv6 protocol handler and then its protosw entry.  If the
+ * protosw registration fails the protocol handler is removed again.
+ * Returns 0 on success or the error from the failing registration.
+ */
+int __init udpv6_init(void)
+{
+	int err;
+
+	err = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
+	if (err)
+		return err;
+
+	err = inet6_register_protosw(&udpv6_protosw);
+	if (err)
+		inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+
+	return err;
+}
+
+/* Undo udpv6_init() in reverse order: protosw first, then the handler. */
+void udpv6_exit(void)
+{
+	inet6_unregister_protosw(&udpv6_protosw);
+	inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+}
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
new file mode 100644
index 00000000..d7571046
--- /dev/null
+++ b/net/ipv6/udp_impl.h
@@ -0,0 +1,37 @@
+#ifndef _UDP6_IMPL_H
+#define _UDP6_IMPL_H
+#include <net/udp.h>
+#include <net/udplite.h>
+#include <net/protocol.h>
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/transp_v6.h>
+
+/*
+ * Declarations shared between the UDPv6 and UDP-Litev6 implementations.
+ */
+
+/* Receive and error paths, parameterised by socket table (and protocol). */
+extern int	__udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+			       int proto);
+extern void	__udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			       u8 type, u8 code, int offset, __be32 info,
+			       struct udp_table *udptable);
+
+extern int	udp_v6_get_port(struct sock *sk, unsigned short snum);
+
+/* Socket option handlers (and their CONFIG_COMPAT counterparts). */
+extern int	udpv6_getsockopt(struct sock *sk, int level, int optname,
+				 char __user *optval, int __user *optlen);
+extern int	udpv6_setsockopt(struct sock *sk, int level, int optname,
+				 char __user *optval, unsigned int optlen);
+#ifdef CONFIG_COMPAT
+extern int	compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
+					char __user *optval, unsigned int optlen);
+extern int	compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
+					char __user *optval, int __user *optlen);
+#endif
+
+/* struct proto callbacks used by both udpv6_prot and udplitev6_prot. */
+extern int	udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
+			      struct msghdr *msg, size_t len);
+extern int	udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
+			      struct msghdr *msg, size_t len,
+			      int noblock, int flags, int *addr_len);
+extern int	udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+extern void	udpv6_destroy_sock(struct sock *sk);
+
+#ifdef CONFIG_PROC_FS
+extern int	udp6_seq_show(struct seq_file *seq, void *v);
+#endif
+#endif	/* _UDP6_IMPL_H */
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
new file mode 100644
index 00000000..986c4de5
--- /dev/null
+++ b/net/ipv6/udplite.c
@@ -0,0 +1,132 @@
+/*
+ *  UDPLITEv6   An implementation of the UDP-Lite protocol over IPv6.
+ *              See also net/ipv4/udplite.c
+ *
+ *  Authors:    Gerrit Renker       <gerrit@erg.abdn.ac.uk>
+ *
+ *  Changes:
+ *  Fixes:
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include "udp_impl.h"
+
+/* Receive handler: __udp6_lib_rcv() on udplite_table with IPPROTO_UDPLITE. */
+static int udplitev6_rcv(struct sk_buff *skb)
+{
+	return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
+}
+
+/* Error handler: forwards all arguments to __udp6_lib_err() on udplite_table. */
+static void udplitev6_err(struct sk_buff *skb,
+			  struct inet6_skb_parm *opt,
+			  u8 type, u8 code, int offset, __be32 info)
+{
+	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+}
+
+/*
+ * IPv6 protocol handler table for UDP-Lite; registered for IPPROTO_UDPLITE
+ * by udplitev6_init().  No GSO callbacks are set here.
+ */
+static const struct inet6_protocol udplitev6_protocol = {
+	.handler	=	udplitev6_rcv,
+	.err_handler	=	udplitev6_err,
+	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+/*
+ * Protocol operations for UDP-Lite over IPv6.  Most callbacks are the
+ * udpv6_* / udp_* functions also used for plain UDP; sockets are hashed in
+ * udplite_table and initialised through udplite_sk_init().
+ */
+struct proto udplitev6_prot = {
+	.name		   = "UDPLITEv6",
+	.owner		   = THIS_MODULE,
+	.close		   = udp_lib_close,
+	.connect	   = ip6_datagram_connect,
+	.disconnect	   = udp_disconnect,
+	.ioctl		   = udp_ioctl,
+	.init		   = udplite_sk_init,
+	.destroy	   = udpv6_destroy_sock,
+	.setsockopt	   = udpv6_setsockopt,
+	.getsockopt	   = udpv6_getsockopt,
+	.sendmsg	   = udpv6_sendmsg,
+	.recvmsg	   = udpv6_recvmsg,
+	.backlog_rcv	   = udpv6_queue_rcv_skb,
+	.hash		   = udp_lib_hash,
+	.unhash		   = udp_lib_unhash,
+	.get_port	   = udp_v6_get_port,
+	.obj_size	   = sizeof(struct udp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
+	.h.udp_table	   = &udplite_table,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_udpv6_setsockopt,
+	.compat_getsockopt = compat_udpv6_getsockopt,
+#endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+};
+
+/* Protocol switch entry tying SOCK_DGRAM / IPPROTO_UDPLITE to udplitev6_prot. */
+static struct inet_protosw udplite6_protosw = {
+	.type		= SOCK_DGRAM,
+	.protocol	= IPPROTO_UDPLITE,
+	.prot		= &udplitev6_prot,
+	.ops		= &inet6_dgram_ops,
+	.no_check	= 0,
+	.flags		= INET_PROTOSW_PERMANENT,
+};
+
+/*
+ * Register the UDP-Litev6 protocol handler and then its protosw entry.  If
+ * the protosw registration fails the protocol handler is removed again.
+ * Returns 0 on success or the error from the failing registration.
+ */
+int __init udplitev6_init(void)
+{
+	int err;
+
+	err = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE);
+	if (err)
+		return err;
+
+	err = inet6_register_protosw(&udplite6_protosw);
+	if (err)
+		inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE);
+
+	return err;
+}
+
+/* Undo udplitev6_init() in reverse order: protosw first, then the handler. */
+void udplitev6_exit(void)
+{
+	inet6_unregister_protosw(&udplite6_protosw);
+	inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE);
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Descriptor for the "udplite6" entry: AF_INET6 sockets from udplite_table,
+ * rendered with the same udp6_seq_show() used for plain UDPv6.
+ */
+static struct udp_seq_afinfo udplite6_seq_afinfo = {
+	.name		= "udplite6",
+	.family		= AF_INET6,
+	.udp_table	= &udplite_table,
+	.seq_fops	= {
+		.owner	=	THIS_MODULE,
+	},
+	.seq_ops	= {
+		.show		= udp6_seq_show,
+	},
+};
+
+/* Per-namespace init: register udplite6_seq_afinfo for @net. */
+static int __net_init udplite6_proc_init_net(struct net *net)
+{
+	return udp_proc_register(net, &udplite6_seq_afinfo);
+}
+
+/* Per-namespace exit: unregister udplite6_seq_afinfo for @net. */
+static void __net_exit udplite6_proc_exit_net(struct net *net)
+{
+	udp_proc_unregister(net, &udplite6_seq_afinfo);
+}
+
+/* Per-network-namespace hooks for the udplite6 proc entry. */
+static struct pernet_operations udplite6_net_ops = {
+	.init = udplite6_proc_init_net,
+	.exit = udplite6_proc_exit_net,
+};
+
+/* Register udplite6_net_ops; returns register_pernet_subsys()'s result. */
+int __init udplite6_proc_init(void)
+{
+	return register_pernet_subsys(&udplite6_net_ops);
+}
+
+/* Unregister udplite6_net_ops (counterpart of udplite6_proc_init()). */
+void udplite6_proc_exit(void)
+{
+	unregister_pernet_subsys(&udplite6_net_ops);
+}
+#endif
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
new file mode 100644
index 00000000..f8c3cf84
--- /dev/null
+++ b/net/ipv6/xfrm6_input.c
@@ -0,0 +1,146 @@
+/*
+ * xfrm6_input.c: based on net/ipv4/xfrm4_input.c
+ *
+ * Authors:
+ *	Mitsuru KANDA @USAGI
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *	YOSHIFUJI Hideaki @USAGI
+ *		IPv6 support
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Thin wrapper around xfrm6_extract_header(); @x is not used here. */
+int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return xfrm6_extract_header(skb);
+}
+
+/*
+ * Record AF_INET6 and the offset of the IPv6 destination address in the
+ * skb's XFRM SPI control block, then hand the packet to xfrm_input() with
+ * the given next header and SPI (final argument 0).
+ */
+int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
+{
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+	return xfrm_input(skb, nexthdr, spi, 0);
+}
+EXPORT_SYMBOL(xfrm6_rcv_spi);
+
+/*
+ * Finish transport-mode input.  The protocol saved in XFRM_MODE_SKB_CB() is
+ * written back into the next-header byte at IP6CB(skb)->nhoff.
+ *
+ * Without CONFIG_NETFILTER a synchronous call (!async) returns 1 at this
+ * point.  Otherwise payload_len is refreshed, the network header is pushed
+ * back into the data area, the packet is sent through the
+ * NF_INET_PRE_ROUTING hook towards ip6_rcv_finish(), and -1 is returned.
+ */
+int xfrm6_transport_finish(struct sk_buff *skb, int async)
+{
+	skb_network_header(skb)[IP6CB(skb)->nhoff] =
+		XFRM_MODE_SKB_CB(skb)->protocol;
+
+#ifndef CONFIG_NETFILTER
+	if (!async)
+		return 1;
+#endif
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len);
+	/* Make skb->data point at the network header again. */
+	__skb_push(skb, skb->data - skb_network_header(skb));
+
+	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+		ip6_rcv_finish);
+	return -1;
+}
+
+/*
+ * Calls xfrm6_rcv_spi() with the next-header value found at
+ * IP6CB(skb)->nhoff and an SPI of 0.
+ */
+int xfrm6_rcv(struct sk_buff *skb)
+{
+	return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
+			     0);
+}
+
+EXPORT_SYMBOL(xfrm6_rcv);
+
+/*
+ * Look up an xfrm state by address pair and protocol and run its type's
+ * input handler on the packet.
+ *
+ * Up to three lookups are tried: exact (daddr, saddr), then (daddr, any),
+ * then (any, any).  A state found by one of the wildcard lookups is only
+ * used if it has XFRM_STATE_WILDRECV set.  On success the state is appended
+ * to the skb's secpath (its reference is kept) and its byte/packet counters
+ * are bumped under x->lock.
+ *
+ * Returns 1 on success, -1 when the packet should be dropped (secpath
+ * allocation failure, secpath depth limit, or no usable state).
+ */
+int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
+		     xfrm_address_t *saddr, u8 proto)
+{
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_state *x = NULL;
+	int i = 0;
+
+	/* Allocate new secpath or COW existing one. */
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		struct sec_path *sp;
+
+		sp = secpath_dup(skb->sp);
+		if (!sp) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+			goto drop;
+		}
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+
+	/* Refuse when adding one more entry would reach XFRM_MAX_DEPTH. */
+	if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+		goto drop;
+	}
+
+	for (i = 0; i < 3; i++) {
+		xfrm_address_t *dst, *src;
+
+		switch (i) {
+		case 0:
+			dst = daddr;
+			src = saddr;
+			break;
+		case 1:
+			/* lookup state with wild-card source address */
+			dst = daddr;
+			src = (xfrm_address_t *)&in6addr_any;
+			break;
+		default:
+			/* lookup state with wild-card addresses */
+			dst = (xfrm_address_t *)&in6addr_any;
+			src = (xfrm_address_t *)&in6addr_any;
+			break;
+		}
+
+		x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6);
+		if (!x)
+			continue;
+
+		spin_lock(&x->lock);
+
+		/* The state checks run under x->lock; ->input() runs unlocked. */
+		if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) &&
+		    likely(x->km.state == XFRM_STATE_VALID) &&
+		    !xfrm_state_check_expire(x)) {
+			spin_unlock(&x->lock);
+			if (x->type->input(x, skb) > 0) {
+				/* found a valid state */
+				break;
+			}
+		} else
+			spin_unlock(&x->lock);
+
+		/* Not usable: drop the lookup's reference and try the next pass. */
+		xfrm_state_put(x);
+		x = NULL;
+	}
+
+	if (!x) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
+		xfrm_audit_state_notfound_simple(skb, AF_INET6);
+		goto drop;
+	}
+
+	skb->sp->xvec[skb->sp->len++] = x;
+
+	spin_lock(&x->lock);
+
+	x->curlft.bytes += skb->len;
+	x->curlft.packets++;
+
+	spin_unlock(&x->lock);
+
+	return 1;
+
+drop:
+	return -1;
+}
+
+EXPORT_SYMBOL(xfrm6_input_addr);
diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c
new file mode 100644
index 00000000..f37cba9e
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_beet.c
@@ -0,0 +1,131 @@
+/*
+ * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6.
+ *
+ * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
+ *                    Miika Komu     <miika@iki.fi>
+ *                    Herbert Xu     <herbert@gondor.apana.org.au>
+ *                    Abhinav Pathak <abhinav.pathak@hiit.fi>
+ *                    Jeff Ahrenholz <ahrenholz@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/*
+ * Fill the IPv6 header at skb's network header from the values saved in
+ * XFRM_MODE_SKB_CB(skb): version 6, flow label, next header, DS field (tos)
+ * and hop limit (ttl).  Addresses and payload_len are left to the caller.
+ */
+static void xfrm6_beet_make_header(struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+
+	iph->version = 6;
+
+	memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
+	       sizeof(iph->flow_lbl));
+	iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol;
+
+	/* NOTE(review): presumably must follow the version/flow_lbl writes,
+	 * since the DS field shares those leading bytes -- confirm.
+	 */
+	ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos);
+	iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl;
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
+ *
+ * The network header is placed x->props.header_len (plus room for a BEET
+ * pseudo header when the control block carries a non-zero optlen) in front
+ * of skb->data, the IPv6 header is rebuilt with xfrm6_beet_make_header(),
+ * and the outer addresses are taken from x->props.saddr / x->id.daddr.
+ * Always returns 0.
+ */
+static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *top_iph;
+	struct ip_beet_phdr *ph;
+	int optlen, hdr_len;
+
+	hdr_len = 0;
+	optlen = XFRM_MODE_SKB_CB(skb)->optlen;
+	if (unlikely(optlen))
+		hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4);
+
+	skb_set_network_header(skb, -x->props.header_len - hdr_len);
+	if (x->sel.family != AF_INET6)
+		skb->network_header += IPV4_BEET_PHMAXLEN;
+	/* mac_header is pointed at the nexthdr byte of the new IPv6 header. */
+	skb->mac_header = skb->network_header +
+			  offsetof(struct ipv6hdr, nexthdr);
+	skb->transport_header = skb->network_header + sizeof(*top_iph);
+	ph = (struct ip_beet_phdr *)__skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl-hdr_len);
+
+	xfrm6_beet_make_header(skb);
+
+	top_iph = ipv6_hdr(skb);
+	if (unlikely(optlen)) {
+
+		BUG_ON(optlen < 0);
+
+		/* Fill the pseudo header and chain it in as the next header. */
+		ph->padlen = 4 - (optlen & 4);
+		ph->hdrlen = optlen / 8;
+		ph->nexthdr = top_iph->nexthdr;
+		if (ph->padlen)
+			memset(ph + 1, IPOPT_NOP, ph->padlen);
+
+		top_iph->nexthdr = IPPROTO_BEETPH;
+	}
+
+	ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr);
+	ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr);
+	return 0;
+}
+
+/*
+ * Rebuild an inner IPv6 header in front of the payload: make room for it,
+ * fill it from the mode control block, and take the addresses from the
+ * state's selector.  Returns 0, or the error from skb_cow_head().
+ */
+static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	const int hdr_size = sizeof(struct ipv6hdr);
+	struct ipv6hdr *hdr;
+	int ret;
+
+	ret = skb_cow_head(skb, hdr_size + skb->mac_len);
+	if (ret)
+		return ret;
+
+	__skb_push(skb, hdr_size);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+
+	xfrm6_beet_make_header(skb);
+
+	hdr = ipv6_hdr(skb);
+	hdr->payload_len = htons(skb->len - hdr_size);
+	ipv6_addr_copy(&hdr->daddr, (struct in6_addr *) &x->sel.daddr.a6);
+	ipv6_addr_copy(&hdr->saddr, (struct in6_addr *) &x->sel.saddr.a6);
+	return 0;
+}
+
+/* xfrm mode descriptor for BEET over IPv6; flagged as a tunnel-type mode. */
+static struct xfrm_mode xfrm6_beet_mode = {
+	.input2 = xfrm6_beet_input,
+	.input = xfrm_prepare_input,
+	.output2 = xfrm6_beet_output,
+	.output = xfrm6_prepare_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_BEET,
+	.flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+/* Module init: register the BEET mode for AF_INET6. */
+static int __init xfrm6_beet_init(void)
+{
+	return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6);
+}
+
+/* Module exit: unregister the BEET mode; a failure here triggers BUG_ON(). */
+static void __exit xfrm6_beet_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+/* Module entry points, licence and the alias for AF_INET6 / BEET mode. */
+module_init(xfrm6_beet_init);
+module_exit(xfrm6_beet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET);
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
new file mode 100644
index 00000000..63d5d493
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -0,0 +1,84 @@
+/*
+ * xfrm6_mode_ro.c - Route optimization mode for IPv6.
+ *
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * Authors:
+ *	Noriaki TAKAMIYA @USAGI
+ *	Masahide NAKAMURA @USAGI
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/stringify.h>
+#include <linux/time.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add route optimization header space.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the route optimization header.
+ *
+ * hdr_len, the number of leading bytes to move, comes from the state type's
+ * hdr_offset() callback, which also reports prevhdr.  The headers are moved
+ * x->props.header_len bytes towards the head of the buffer, skb->data is
+ * advanced past them, and x->lastused is set to the current time in seconds.
+ * Always returns 0.
+ */
+static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	/* Remember where the header is now; it is the memmove() source. */
+	iph = ipv6_hdr(skb);
+
+	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+	/* mac_header tracks prevhdr at its position after the move. */
+	skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->transport_header = skb->network_header + hdr_len;
+	__skb_pull(skb, hdr_len);
+	memmove(ipv6_hdr(skb), iph, hdr_len);
+
+	x->lastused = get_seconds();
+
+	return 0;
+}
+
+/* xfrm mode descriptor for route optimization; only an output hook is set. */
+static struct xfrm_mode xfrm6_ro_mode = {
+	.output = xfrm6_ro_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_ROUTEOPTIMIZATION,
+};
+
+/* Module init: register the route optimization mode for AF_INET6. */
+static int __init xfrm6_ro_init(void)
+{
+	return xfrm_register_mode(&xfrm6_ro_mode, AF_INET6);
+}
+
+/* Module exit: unregister the mode; a failure here triggers BUG_ON(). */
+static void __exit xfrm6_ro_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_ro_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+/* Module entry points, licence and the alias for AF_INET6 / RO mode. */
+module_init(xfrm6_ro_init);
+module_exit(xfrm6_ro_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_ROUTEOPTIMIZATION);
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
new file mode 100644
index 00000000..4e344105
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -0,0 +1,85 @@
+/*
+ * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Add encapsulation header.
+ *
+ * The IP header and mutable extension headers will be moved forward to make
+ * space for the encapsulation header.
+ *
+ * hdr_len, the number of leading bytes to move, comes from the state type's
+ * hdr_offset() callback, which also reports prevhdr.  The headers are moved
+ * x->props.header_len bytes towards the head of the buffer and skb->data is
+ * advanced past them.  Always returns 0.
+ */
+static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	u8 *prevhdr;
+	int hdr_len;
+
+	/* Remember where the header is now; it is the memmove() source. */
+	iph = ipv6_hdr(skb);
+
+	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
+	/* mac_header tracks prevhdr at its position after the move. */
+	skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->transport_header = skb->network_header + hdr_len;
+	__skb_pull(skb, hdr_len);
+	memmove(ipv6_hdr(skb), iph, hdr_len);
+	return 0;
+}
+
+/* Remove encapsulation header.
+ *
+ * The IP header will be moved over the top of the encapsulation header.
+ *
+ * On entry, skb->transport_header shall point to where the IP header should
+ * be and skb->network_header shall be set to where the IP header currently
+ * is.  skb->data shall point to the start of the payload.
+ *
+ * On return the network header sits at the old transport header position,
+ * payload_len has been recomputed, and the transport header is reset to
+ * skb->data.  Always returns 0.
+ */
+static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	/* Length of the header block that ends at skb->data. */
+	int ihl = skb->data - skb_transport_header(skb);
+
+	if (skb->transport_header != skb->network_header) {
+		memmove(skb_transport_header(skb),
+			skb_network_header(skb), ihl);
+		skb->network_header = skb->transport_header;
+	}
+	ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
+					   sizeof(struct ipv6hdr));
+	skb_reset_transport_header(skb);
+	return 0;
+}
+
+/* xfrm mode descriptor for IPv6 transport mode. */
+static struct xfrm_mode xfrm6_transport_mode = {
+	.input = xfrm6_transport_input,
+	.output = xfrm6_transport_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TRANSPORT,
+};
+
+/* Module init: register the transport mode for AF_INET6. */
+static int __init xfrm6_transport_init(void)
+{
+	return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
+}
+
+/* Module exit: unregister the mode; a failure here triggers BUG_ON(). */
+static void __exit xfrm6_transport_exit(void)
+{
+	int err;
+
+	err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
+	BUG_ON(err);
+}
+
+/* Module entry points, licence and the alias for AF_INET6 / transport mode. */
+module_init(xfrm6_transport_init);
+module_exit(xfrm6_transport_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
new file mode 100644
index 00000000..23ecd68a
--- /dev/null
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -0,0 +1,117 @@
+/*
+ * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
+ *
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/stringify.h>
+#include <net/dsfield.h>
+#include <net/dst.h>
+#include <net/inet_ecn.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+
+/* Carry a CE mark on the outer header over to the inner IPv6 header. */
+static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
+{
+	__u8 outer_ds = ipv6_get_dsfield(ipv6_hdr(skb));
+
+	if (INET_ECN_is_ce(outer_ds))
+		IP6_ECN_set_ce(ipipv6_hdr(skb));
+}
+
+/* Add encapsulation header.
+ *
+ * The top IP header will be constructed per RFC 2401.
+ */
+static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *outer;
+	int ds;
+
+	/* The outer header occupies the header_len bytes ahead of skb->data. */
+	skb_set_network_header(skb, -x->props.header_len);
+	skb->mac_header = skb->network_header +
+			  offsetof(struct ipv6hdr, nexthdr);
+	skb->transport_header = skb->network_header + sizeof(*outer);
+	outer = ipv6_hdr(skb);
+	outer->version = 6;
+	memcpy(outer->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
+	       sizeof(outer->flow_lbl));
+	outer->nexthdr = xfrm_af2proto(dst->ops->family);
+
+	/* DS field saved in the skb cb; ECN bits are cleared for NOECN. */
+	ds = XFRM_MODE_SKB_CB(skb)->tos;
+	ds = INET_ECN_encapsulate(ds, ds);
+	if (x->props.flags & XFRM_STATE_NOECN)
+		ds &= ~INET_ECN_MASK;
+	ipv6_change_dsfield(outer, 0, ds);
+	outer->hop_limit = ip6_dst_hoplimit(dst->child);
+	ipv6_addr_copy(&outer->saddr, (const struct in6_addr *)&x->props.saddr);
+	ipv6_addr_copy(&outer->daddr, (const struct in6_addr *)&x->id.daddr);
+	return 0;
+}
+
+/* Decapsulate a tunnel-mode packet whose inner protocol must be IPv6. */
+static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+
+	if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6)
+		return -EINVAL;
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		return -EINVAL;
+
+	/* The inner header is written below, so the head must be private. */
+	if (skb_cloned(skb)) {
+		err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+		if (err)
+			return err;
+	}
+
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)),
+			       ipipv6_hdr(skb));
+	if (!(x->props.flags & XFRM_STATE_NOECN))
+		ipip6_ecn_decapsulate(skb);
+
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+	return 0;
+}
+
+static struct xfrm_mode xfrm6_tunnel_mode = {
+	.input2 = xfrm6_mode_tunnel_input,	/* IPv6-specific decapsulation */
+	.input = xfrm_prepare_input,
+	.output2 = xfrm6_mode_tunnel_output,	/* called by xfrm6_prepare_output() */
+	.output = xfrm6_prepare_output,
+	.owner = THIS_MODULE,
+	.encap = XFRM_MODE_TUNNEL,
+	.flags = XFRM_MODE_FLAG_TUNNEL,
+};
+
+/* Module load: register tunnel mode with xfrm for AF_INET6. */
+static int __init xfrm6_mode_tunnel_init(void)
+{
+	return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
+}
+
+static void __exit xfrm6_mode_tunnel_exit(void)
+{
+	int err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
+
+	BUG_ON(err);	/* unregistering must not fail */
+}
+
+module_init(xfrm6_mode_tunnel_init);
+module_exit(xfrm6_mode_tunnel_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
new file mode 100644
index 00000000..49a91c5f
--- /dev/null
+++ b/net/ipv6/xfrm6_output.c
@@ -0,0 +1,109 @@
+/*
+ * xfrm6_output.c - Common IPsec encapsulation code for IPv6.
+ * Copyright (C) 2002 USAGI/WIDE Project
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/xfrm.h>
+
+int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
+			  u8 **prevhdr)
+{
+	return ip6_find_1stfragopt(skb, prevhdr);	/* @x is not used */
+}
+
+EXPORT_SYMBOL(xfrm6_find_1stfragopt);
+
+/* Reject packets larger than the dst MTU unless skb->local_df is set. */
+static int xfrm6_tunnel_check_size(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	int mtu = dst_mtu(dst);
+
+	if (mtu < IPV6_MIN_MTU)
+		mtu = IPV6_MIN_MTU;
+
+	if (skb->local_df || skb->len <= mtu)
+		return 0;
+
+	/* Too big: send ICMPV6_PKT_TOOBIG carrying the usable MTU. */
+	skb->dev = dst->dev;
+	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+	return -EMSGSIZE;
+}
+
+/* Size-check the packet, then save its header fields in the skb cb. */
+int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = xfrm6_tunnel_check_size(skb);
+
+	if (!err) {
+		XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr;
+		err = xfrm6_extract_header(skb);
+	}
+
+	return err;
+}
+
+/* Extract inner-header state, reset the IPv6 cb and run the mode's output2. */
+int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = xfrm_inner_extract_output(x, skb);
+
+	if (err)
+		return err;
+
+	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+#ifdef CONFIG_NETFILTER
+	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
+#endif
+
+	skb->local_df = 1;
+	skb->protocol = htons(ETH_P_IPV6);
+
+	return x->outer_mode->output2(x, skb);
+}
+EXPORT_SYMBOL(xfrm6_prepare_output);
+
+int xfrm6_output_finish(struct sk_buff *skb)
+{
+#ifdef CONFIG_NETFILTER
+	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
+#endif
+	/* tag the packet as IPv6 and continue in the generic xfrm output path */
+	skb->protocol = htons(ETH_P_IPV6);
+	return xfrm_output(skb);
+}
+
+/* Tunnel-mode packets over the MTU (non-GSO) or with allfrag are fragmented. */
+static int __xfrm6_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+
+	if (x && x->props.mode == XFRM_MODE_TUNNEL &&
+	    ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+	     dst_allfrag(dst)))
+		return ip6_fragment(skb, x->outer_mode->afinfo->output_finish);
+	return x->outer_mode->afinfo->output_finish(skb);
+}
+
+int xfrm6_output(struct sk_buff *skb)
+{
+	return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL,
+		       skb_dst(skb)->dev, __xfrm6_output);	/* continuation */
+}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
new file mode 100644
index 00000000..d879f7ef
--- /dev/null
+++ b/net/ipv6/xfrm6_policy.c
@@ -0,0 +1,357 @@
+/*
+ * xfrm6_policy.c: based on xfrm4_policy.c
+ *
+ * Authors:
+ *	Mitsuru KANDA @USAGI
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * 		IPv6 support
+ * 	YOSHIFUJI Hideaki
+ * 		Split up af-specific portion
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+#include <net/mip6.h>
+#endif
+
+static struct xfrm_policy_afinfo xfrm6_policy_afinfo;	/* initialized below */
+
+/* Route lookup to @daddr (from @saddr if given); @tos is unused.  Returns
+ * the dst entry or an ERR_PTR() carrying dst->error. */
+static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos,
+					  const xfrm_address_t *saddr,
+					  const xfrm_address_t *daddr)
+{
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+	int err;
+
+	memset(&fl6, 0, sizeof(fl6));
+	memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
+	if (saddr)
+		memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
+
+	dst = ip6_route_output(net, NULL, &fl6);
+	err = dst->error;
+	if (!err)
+		return dst;
+
+	dst_release(dst);
+	return ERR_PTR(err);
+}
+
+/* Pick a source address for @daddr using the device of the route to it. */
+static int xfrm6_get_saddr(struct net *net,
+			   xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+	struct net_device *dev;
+	struct dst_entry *dst = xfrm6_dst_lookup(net, 0, NULL, daddr);
+
+	if (IS_ERR(dst))
+		return -EHOSTUNREACH;
+
+	dev = ip6_dst_idev(dst)->dev;
+	ipv6_dev_get_saddr(dev_net(dev), dev,
+			   (struct in6_addr *)&daddr->a6, 0,
+			   (struct in6_addr *)&saddr->a6);
+	dst_release(dst);
+	return 0;
+}
+
+static int xfrm6_get_tos(const struct flowi *fl)
+{
+	return 0;	/* @fl is ignored: the IPv6 hook always reports 0 */
+}
+
+/* Record the fib6 node serial of an IPv6 path and the nfheader length. */
+static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+			   int nfheader_len)
+{
+	struct rt6_info *rt = (struct rt6_info *)dst;
+
+	/* rt is only dereferenced when dst really is an IPv6 route. */
+	if (dst->ops->family == AF_INET6 && rt->rt6i_node)
+		path->path_cookie = rt->rt6i_node->fn_sernum;
+
+	path->u.rt6.rt6i_nfheader_len = nfheader_len;
+	return 0;
+}
+
+/* Fill the IPv6 part of @xdst from its underlying route and bind it to @dev. */
+static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+			  const struct flowi *fl)
+{
+	struct rt6_info *rt = (struct rt6_info *)xdst->route;
+	struct rt6_info *xrt = &xdst->u.rt6;
+
+	xdst->u.dst.dev = dev;
+	dev_hold(dev);
+
+	xrt->rt6i_idev = in6_dev_get(dev);
+	if (!xrt->rt6i_idev)
+		return -ENODEV;
+
+	xrt->rt6i_peer = rt->rt6i_peer;
+	if (rt->rt6i_peer)
+		atomic_inc(&rt->rt6i_peer->refcnt);
+
+	/* Only these two route flags are carried over; the original author
+	 * noted that this part of the code needs an audit. */
+	xrt->rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL);
+	xrt->rt6i_metric = rt->rt6i_metric;
+	xrt->rt6i_node = rt->rt6i_node;
+	if (rt->rt6i_node)
+		xdst->route_cookie = rt->rt6i_node->fn_sernum;
+	xrt->rt6i_gateway = rt->rt6i_gateway;
+	xrt->rt6i_dst = rt->rt6i_dst;
+	xrt->rt6i_src = rt->rt6i_src;
+	return 0;
+}
+
+/* Fill @fl with the flow of the IPv6 packet in @skb (addresses and ports are
+ * swapped when @reverse), skipping extension headers up to the upper layer. */
+static inline void
+_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
+{
+	struct flowi6 *fl6 = &fl->u.ip6;
+	int onlyproto = 0;
+	u32 offset = skb_network_header_len(skb);	/* not u16: must not wrap */
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct ipv6_opt_hdr *exthdr;
+	const unsigned char *nh = skb_network_header(skb);
+	u8 nexthdr = nh[IP6CB(skb)->nhoff];
+
+	memset(fl6, 0, sizeof(struct flowi6));
+	fl6->flowi6_mark = skb->mark;
+	ipv6_addr_copy(&fl6->daddr, reverse ? &hdr->saddr : &hdr->daddr);
+	ipv6_addr_copy(&fl6->saddr, reverse ? &hdr->daddr : &hdr->saddr);
+
+	while (nh + offset + 1 < skb->data ||
+	       pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
+		nh = skb_network_header(skb);
+		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+		switch (nexthdr) {
+		case NEXTHDR_FRAGMENT:
+			onlyproto = 1;	/* fall through */
+		case NEXTHDR_ROUTING:
+		case NEXTHDR_HOP:
+		case NEXTHDR_DEST:
+			offset += ipv6_optlen(exthdr);
+			nexthdr = exthdr->nexthdr;
+			break;
+
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+		case IPPROTO_TCP:
+		case IPPROTO_SCTP:
+		case IPPROTO_DCCP:
+			if (!onlyproto && (nh + offset + 4 < skb->data ||
+			     pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
+				/* the pull may move skb->head: re-read it */
+				__be16 *ports = (__be16 *)
+					(skb_network_header(skb) + offset);
+				fl6->fl6_sport = ports[!!reverse];
+				fl6->fl6_dport = ports[!reverse];
+			}
+			fl6->flowi6_proto = nexthdr;
+			return;
+
+		case IPPROTO_ICMPV6:
+			if (!onlyproto && (nh + offset + 2 < skb->data ||
+			    pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
+				u8 *icmp = skb_network_header(skb) + offset;
+				fl6->fl6_icmp_type = icmp[0];
+				fl6->fl6_icmp_code = icmp[1];
+			}
+			fl6->flowi6_proto = nexthdr;
+			return;
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		case IPPROTO_MH:
+			if (!onlyproto && (nh + offset + 3 < skb->data ||
+			    pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
+				struct ip6_mh *mh = (struct ip6_mh *)
+					(skb_network_header(skb) + offset);
+				fl6->fl6_mh_type = mh->ip6mh_type;
+			}
+			fl6->flowi6_proto = nexthdr;
+			return;
+#endif
+
+		/* XXX Why are there these headers? */
+		case IPPROTO_AH:
+		case IPPROTO_ESP:
+		case IPPROTO_COMP:
+		default:
+			fl6->fl6_ipsec_spi = 0;
+			fl6->flowi6_proto = nexthdr;
+			return;
+		}
+	}
+}
+/* dst_ops.gc hook: run the afinfo collector; nonzero if above 2 * gc_thresh. */
+static inline int xfrm6_garbage_collect(struct dst_ops *ops)
+{
+	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
+
+	xfrm6_policy_afinfo.garbage_collect(net);
+	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
+}
+
+/* Forward the PMTU update to the route stored in the xfrm dst. */
+static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct dst_entry *path = ((struct xfrm_dst *)dst)->route;
+
+	path->ops->update_pmtu(path, mtu);
+}
+/* Drop the idev/peer references held by the bundle, then the common part. */
+static void xfrm6_dst_destroy(struct dst_entry *dst)
+{
+	struct rt6_info *rt6 = &((struct xfrm_dst *)dst)->u.rt6;
+
+	if (likely(rt6->rt6i_idev))
+		in6_dev_put(rt6->rt6i_idev);
+	dst_destroy_metrics_generic(dst);
+	if (likely(rt6->rt6i_peer))
+		inet_putpeer(rt6->rt6i_peer);
+	xfrm_dst_destroy((struct xfrm_dst *)dst);
+}
+/* dst_ops.ifdown hook: on unregister of @dev, re-point the bundle at loopback. */
+static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			     int unregister)
+{
+	struct xfrm_dst *xdst;
+
+	if (!unregister)
+		return;
+
+	xdst = (struct xfrm_dst *)dst;
+	if (xdst->u.rt6.rt6i_idev->dev == dev) {
+		struct inet6_dev *loopback_idev =
+			in6_dev_get(dev_net(dev)->loopback_dev);
+		BUG_ON(!loopback_idev);
+		/* swap the idev on dst, then on each child that has an xfrm state */
+		do {
+			in6_dev_put(xdst->u.rt6.rt6i_idev);
+			xdst->u.rt6.rt6i_idev = loopback_idev;
+			in6_dev_hold(loopback_idev);
+			xdst = (struct xfrm_dst *)xdst->u.dst.child;
+		} while (xdst->u.dst.xfrm);
+		/* drop the extra reference taken by in6_dev_get() above */
+		__in6_dev_put(loopback_idev);
+	}
+
+	xfrm_dst_ifdown(dst, dev);
+}
+
+static struct dst_ops xfrm6_dst_ops = {
+	.family =		AF_INET6,
+	.protocol =		cpu_to_be16(ETH_P_IPV6),
+	.gc =			xfrm6_garbage_collect,
+	.update_pmtu =		xfrm6_update_pmtu,
+	.cow_metrics =		dst_cow_metrics_generic,
+	.destroy =		xfrm6_dst_destroy,
+	.ifdown =		xfrm6_dst_ifdown,
+	.local_out =		__ip6_local_out,
+	.gc_thresh =		1024,	/* floor; recomputed in xfrm6_init() */
+};
+
+static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
+	.family =		AF_INET6,
+	.dst_ops =		&xfrm6_dst_ops,
+	.dst_lookup =		xfrm6_dst_lookup,
+	.get_saddr = 		xfrm6_get_saddr,
+	.decode_session =	_decode_session6,
+	.get_tos =		xfrm6_get_tos,	/* always 0 */
+	.init_path =		xfrm6_init_path,
+	.fill_dst =		xfrm6_fill_dst,
+	.blackhole_route =	ip6_blackhole_route,
+};
+/* Register the IPv6 policy afinfo with the xfrm core. */
+static int __init xfrm6_policy_init(void)
+{
+	return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo);
+}
+/* Undo xfrm6_policy_init(); the unregister result is not checked. */
+static void xfrm6_policy_fini(void)
+{
+	xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo);
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table xfrm6_policy_table[] = {
+	{
+		.procname       = "xfrm6_gc_thresh",
+		.data	   	= &init_net.xfrm.xfrm6_dst_ops.gc_thresh,
+		.maxlen	 	= sizeof(int),
+		.mode	   	= 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }	/* empty terminating entry */
+};
+
+static struct ctl_table_header *sysctl_hdr;	/* set in xfrm6_init(), checked in xfrm6_fini() */
+#endif
+
+/* Boot-time setup of the IPv6 xfrm policy/state glue; returns 0 or -errno. */
+int __init xfrm6_init(void)
+{
+	int ret;
+	unsigned int gc_thresh;
+
+	/*
+	 * We need a good default value for the xfrm6 gc threshold.
+	 * In ipv4 we set it to the route hash table size * 8, which
+	 * is half the size of the maximum route cache for ipv4.  It
+	 * would be good to do the same thing for v6, except the table is
+	 * constructed differently here.  Here each table for a net namespace
+	 * can have FIB6_TABLE_HASHSZ entries, so let's go with the same
+	 * computation that we used for ipv4 here.  Also, let's keep the
+	 * initial gc_thresh to a minimum of 1024, since the ipv6 route cache
+	 * defaults to that as a minimum as well.
+	 */
+	gc_thresh = FIB6_TABLE_HASHSZ * 8;
+	xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh;
+	dst_entries_init(&xfrm6_dst_ops);
+
+	ret = xfrm6_policy_init();
+	if (ret)
+		goto out_entries;
+	ret = xfrm6_state_init();
+	if (ret)
+		goto out_policy;
+
+#ifdef CONFIG_SYSCTL
+	sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv6_ctl_path,
+						xfrm6_policy_table);
+#endif
+	return 0;
+out_policy:
+	xfrm6_policy_fini();
+out_entries:
+	dst_entries_destroy(&xfrm6_dst_ops);	/* also on state init failure */
+	return ret;
+}
+
+void xfrm6_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (sysctl_hdr)
+		unregister_net_sysctl_table(sysctl_hdr);
+#endif
+	/* undo what xfrm6_init() registered and allocated */
+	xfrm6_policy_fini();
+	xfrm6_state_fini();
+	dst_entries_destroy(&xfrm6_dst_ops);
+}
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
new file mode 100644
index 00000000..248f0b2a
--- /dev/null
+++ b/net/ipv6/xfrm6_state.c
@@ -0,0 +1,196 @@
+/*
+ * xfrm6_state.c: based on xfrm4_state.c
+ *
+ * Authors:
+ *	Mitsuru KANDA @USAGI
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * 		IPv6 support
+ * 	YOSHIFUJI Hideaki @USAGI
+ * 		Split up af-specific portion
+ *
+ */
+
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dsfield.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+
+static void
+__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+	const struct flowi6 *fl6 = &fl->u.ip6;
+
+	/* Build a temporary selector that matches only the current session:
+	 * full /128 prefixes and all-ones port masks. */
+	sel->family = AF_INET6;
+	sel->proto = fl6->flowi6_proto;
+	sel->ifindex = fl6->flowi6_oif;
+	ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl6->daddr);
+	sel->prefixlen_d = 128;
+	ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl6->saddr);
+	sel->prefixlen_s = 128;
+	sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
+	sel->sport_mask = htons(0xffff);
+}
+/* Seed temp state @x from @tmpl; unset (any) addresses use @daddr/@saddr. */
+static void
+xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
+{
+	x->id = tmpl->id;
+	if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
+		memcpy(&x->id.daddr, daddr, sizeof(x->id.daddr));
+	memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
+	if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
+		memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
+	x->props.mode = tmpl->mode;
+	x->props.reqid = tmpl->reqid;
+	x->props.family = AF_INET6;
+}
+
+/* Distribution (counting) sort shared by the xfrm_state and xfrm_tmpl sorters.
+ * cmp() yields a class (the callers here use 1..maxclass-1); entries go from
+ * @src to @dst in ascending class order, input order kept within a class, and
+ * each @src slot is cleared.  Assumes n <= XFRM_MAX_DEPTH.  Always returns 0.
+ */
+static int
+__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
+{
+	int rank[XFRM_MAX_DEPTH];
+	int next[maxclass];
+	int i;
+
+	memset(next, 0, sizeof(next));
+	for (i = 0; i < n; i++) {
+		rank[i] = cmp(src[i]);
+		next[rank[i]]++;
+	}
+	/* next[k] becomes the first free dst slot for class k + 1 */
+	for (i = 2; i < maxclass; i++)
+		next[i] += next[i - 1];
+	for (i = 0; i < n; i++) {
+		dst[next[rank[i] - 1]++] = src[i];
+		src[i] = NULL;
+	}
+	return 0;
+}
+
+/*
+ * Rule for xfrm_state:
+ *
+ * rule 1: select IPsec transport except AH
+ * rule 2: select MIPv6 RO or inbound trigger (only with MIP6 configured)
+ * rule 3: select IPsec transport AH
+ * rule 4: select IPsec tunnel or BEET
+ * rule 5: others
+ */
+static int __xfrm6_state_sort_cmp(void *p)
+{
+	const struct xfrm_state *st = p;
+
+	switch (st->props.mode) {
+	case XFRM_MODE_TRANSPORT:
+		/* AH is ordered after the other transport-mode transforms */
+		return st->id.proto == IPPROTO_AH ? 3 : 1;
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+	case XFRM_MODE_IN_TRIGGER:
+		return 2;
+#endif
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_BEET:
+		return 4;
+	default:
+		break;
+	}
+	return 5;
+}
+/* state_sort hook: order @n states by the rules above (maxclass 6 = rules 1..5). */
+static int
+__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
+{
+	return __xfrm6_sort((void **)dst, (void **)src, n,
+			    __xfrm6_state_sort_cmp, 6);
+}
+
+/* Rules for xfrm_tmpl, in sort order:
+ * rule 1: IPsec transport
+ * rule 2: MIPv6 RO or inbound trigger (only with MIP6 configured)
+ * rule 3: IPsec tunnel or BEET
+ * rule 4: others
+ */
+static int __xfrm6_tmpl_sort_cmp(void *p)
+{
+	const struct xfrm_tmpl *tmpl = p;
+
+	switch (tmpl->mode) {
+	case XFRM_MODE_TRANSPORT:
+		return 1;
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+	case XFRM_MODE_IN_TRIGGER:
+		return 2;
+#endif
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_BEET:
+		return 3;
+	default:
+		return 4;
+	}
+}
+/* tmpl_sort hook: order @n templates by the rules above (maxclass 5 = rules 1..4). */
+static int
+__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
+{
+	return __xfrm6_sort((void **)dst, (void **)src, n,
+			    __xfrm6_tmpl_sort_cmp, 5);
+}
+
+/* Save fields of the packet's IPv6 header in the xfrm mode skb cb. */
+int xfrm6_extract_header(struct sk_buff *skb)
+{
+	struct xfrm_mode_skb_cb *cb = XFRM_MODE_SKB_CB(skb);
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+	cb->ihl = sizeof(*iph);
+	cb->id = 0;
+	cb->frag_off = htons(IP_DF);
+	cb->tos = ipv6_get_dsfield(iph);
+	cb->ttl = iph->hop_limit;
+	cb->optlen = 0;
+	memcpy(cb->flow_lbl, iph->flow_lbl, sizeof(cb->flow_lbl));
+	return 0;
+}
+
+static struct xfrm_state_afinfo xfrm6_state_afinfo = {
+	.family			= AF_INET6,
+	.proto			= IPPROTO_IPV6,
+	.eth_proto		= htons(ETH_P_IPV6),
+	.owner			= THIS_MODULE,
+	.init_tempsel		= __xfrm6_init_tempsel,
+	.init_temprop		= xfrm6_init_temprop,
+	.tmpl_sort		= __xfrm6_tmpl_sort,
+	.state_sort		= __xfrm6_state_sort,
+	.output			= xfrm6_output,
+	.output_finish		= xfrm6_output_finish,	/* used by __xfrm6_output() */
+	.extract_input		= xfrm6_extract_input,	/* not defined in this file */
+	.extract_output		= xfrm6_extract_output,
+	.transport_finish	= xfrm6_transport_finish,	/* not defined in this file */
+};
+/* Register the IPv6 state afinfo with the xfrm core. */
+int __init xfrm6_state_init(void)
+{
+	return xfrm_state_register_afinfo(&xfrm6_state_afinfo);
+}
+/* Undo xfrm6_state_init(); the unregister result is not checked. */
+void xfrm6_state_fini(void)
+{
+	xfrm_state_unregister_afinfo(&xfrm6_state_afinfo);
+}
+
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
new file mode 100644
index 00000000..4fe1db12
--- /dev/null
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Authors	Mitsuru KANDA  <mk@linux-ipv6.org>
+ * 		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ * Based on net/ipv4/xfrm4_tunnel.c
+ *
+ */
+#include <linux/module.h>
+#include <linux/xfrm.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ipv6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/mutex.h>
+#include <net/netns/generic.h>
+
+#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256	/* power of two: hash masks with HSIZE - 1 */
+#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
+
+#define XFRM6_TUNNEL_SPI_MIN	1		/* 0 is what lookups return for "none" */
+#define XFRM6_TUNNEL_SPI_MAX	0xffffffff	/* largest u32 value */
+
+struct xfrm6_tunnel_net {	/* per-namespace SPI tables */
+	struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE];
+	struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE];
+	u32 spi;	/* last SPI picked; the next search starts after it */
+};
+
+static int xfrm6_tunnel_net_id __read_mostly;	/* referenced by xfrm6_tunnel_net_ops.id */
+static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
+{
+	return net_generic(net, xfrm6_tunnel_net_id);	/* this module's per-net area */
+}
+
+/*
+ * xfrm_tunnel_spi things are for allocating unique id ("spi")
+ * per xfrm_address_t.
+ */
+struct xfrm6_tunnel_spi {
+	struct hlist_node	list_byaddr;	/* link in spi_byaddr[] */
+	struct hlist_node	list_byspi;	/* link in spi_byspi[] */
+	xfrm_address_t		addr;		/* address the SPI belongs to */
+	u32			spi;		/* host byte order */
+	atomic_t		refcnt;		/* users of this addr/SPI pair */
+	struct rcu_head		rcu_head;	/* for the deferred free */
+};
+
+static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock);	/* held while SPI entries are added/removed */
+
+static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly;	/* struct xfrm6_tunnel_spi cache */
+
+/* Fold the four address words down to a spi_byaddr bucket index. */
+static inline unsigned xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr)
+{
+	unsigned h = (__force u32)(addr->a6[0] ^ addr->a6[1] ^
+				   addr->a6[2] ^ addr->a6[3]);
+
+	h ^= h >> 16;
+	h ^= h >> 8;
+
+	return h & (XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1);
+}
+/* Bucket index in spi_byspi[] for @spi. */
+static inline unsigned xfrm6_tunnel_spi_hash_byspi(u32 spi)
+{
+	return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE;
+}
+
+/* Look up @saddr's SPI entry (RCU reader or spi_lock holder); NULL if none. */
+static struct xfrm6_tunnel_spi *
+__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr)
+{
+	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+	unsigned bucket = xfrm6_tunnel_spi_hash_byaddr(saddr);
+	struct xfrm6_tunnel_spi *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry_rcu(entry, pos, &xfrm6_tn->spi_byaddr[bucket],
+				 list_byaddr)
+		if (!memcmp(&entry->addr, saddr, sizeof(entry->addr)))
+			return entry;
+	return NULL;
+}
+/* Return the SPI registered for @saddr in network byte order, or 0 if none. */
+__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr)
+{
+	struct xfrm6_tunnel_spi *x6spi;
+	u32 spi = 0;
+
+	rcu_read_lock_bh();
+	x6spi = __xfrm6_tunnel_spi_lookup(net, saddr);
+	if (x6spi)
+		spi = x6spi->spi;
+	rcu_read_unlock_bh();
+	return htonl(spi);
+}
+EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup);
+
+/* Return @spi's spi_byspi bucket index if @spi is unused, or -1 if taken. */
+static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi)
+{
+	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+	int index = xfrm6_tunnel_spi_hash_byspi(spi);
+	struct xfrm6_tunnel_spi *entry;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(entry, pos, &xfrm6_tn->spi_byspi[index],
+			     list_byspi)
+		if (entry->spi == spi)
+			return -1;
+
+	return index;
+}
+
+/* Allocate an unused SPI for @saddr (caller holds xfrm6_tunnel_spi_lock).
+ * Returns it in host order, or 0 if none is free or the entry allocation
+ * fails.  SPI_MAX is the largest u32, hence the explicit stop at MAX below. */
+static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
+{
+	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+	u32 spi;
+	struct xfrm6_tunnel_spi *x6spi;
+	int index;
+
+	if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN ||
+	    xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX)
+		xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN;
+	else
+		xfrm6_tn->spi++;
+
+	for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) {
+		index = __xfrm6_tunnel_spi_check(net, spi);
+		if (index >= 0)
+			goto alloc_spi;
+		if (spi == XFRM6_TUNNEL_SPI_MAX)
+			break;
+	}
+	for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) {
+		index = __xfrm6_tunnel_spi_check(net, spi);
+		if (index >= 0)
+			goto alloc_spi;
+	}
+	return 0;
+alloc_spi:
+	xfrm6_tn->spi = spi;
+	x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC);
+	if (!x6spi)
+		return 0;	/* nothing was registered: report failure */
+	memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr));
+	x6spi->spi = spi;
+	atomic_set(&x6spi->refcnt, 1);
+	hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]);
+	index = xfrm6_tunnel_spi_hash_byaddr(saddr);
+	hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]);
+	return spi;
+}
+
+/* Take a reference on @saddr's SPI, allocating it on first use (net order). */
+__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
+{
+	struct xfrm6_tunnel_spi *x6spi;
+	u32 spi;
+
+	spin_lock_bh(&xfrm6_tunnel_spi_lock);
+	x6spi = __xfrm6_tunnel_spi_lookup(net, saddr);
+	if (!x6spi) {
+		spi = __xfrm6_tunnel_alloc_spi(net, saddr);
+	} else {
+		spi = x6spi->spi;
+		atomic_inc(&x6spi->refcnt);
+	}
+	spin_unlock_bh(&xfrm6_tunnel_spi_lock);
+	return htonl(spi);
+}
+EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi);
+/* RCU callback queued by xfrm6_tunnel_free_spi(): return the entry to its cache. */
+static void x6spi_destroy_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(xfrm6_tunnel_spi_kmem,
+			container_of(head, struct xfrm6_tunnel_spi, rcu_head));
+}
+
+/* Drop one reference on @saddr's SPI entry; the last put unhashes it and
+ * queues it for freeing through call_rcu(). */
+static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
+{
+	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
+	unsigned bucket = xfrm6_tunnel_spi_hash_byaddr(saddr);
+	struct xfrm6_tunnel_spi *x6spi;
+	struct hlist_node *pos, *n;
+
+	spin_lock_bh(&xfrm6_tunnel_spi_lock);
+	hlist_for_each_entry_safe(x6spi, pos, n, &xfrm6_tn->spi_byaddr[bucket],
+				  list_byaddr) {
+		if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) != 0)
+			continue;
+		if (!atomic_dec_and_test(&x6spi->refcnt))
+			continue;
+		hlist_del_rcu(&x6spi->list_byaddr);
+		hlist_del_rcu(&x6spi->list_byspi);
+		call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu);
+		break;
+	}
+	spin_unlock_bh(&xfrm6_tunnel_spi_lock);
+}
+/* Output adds no header of its own: skb->data is moved to the network header. */
+static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	skb_push(skb, -skb_network_offset(skb));
+	return 0;
+}
+/* Input just reports the next-header byte stored at IP6CB(skb)->nhoff. */
+static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	return skb_network_header(skb)[IP6CB(skb)->nhoff];
+}
+
+/* Receive hook: map the source address to its SPI and call xfrm6_rcv_spi(). */
+static int xfrm6_tunnel_rcv(struct sk_buff *skb)
+{
+	const xfrm_address_t *src =
+		(const xfrm_address_t *)&ipv6_hdr(skb)->saddr;
+	__be32 spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), src);
+
+	return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi);
+}
+
+static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			    u8 type, u8 code, int offset, __be32 info)
+{
+	/* xfrm6_tunnel native err handling: every branch below is a no-op, 0 is returned */
+	switch (type) {
+	case ICMPV6_DEST_UNREACH:
+		switch (code) {
+		case ICMPV6_NOROUTE:
+		case ICMPV6_ADM_PROHIBITED:
+		case ICMPV6_NOT_NEIGHBOUR:
+		case ICMPV6_ADDR_UNREACH:
+		case ICMPV6_PORT_UNREACH:
+		default:
+			break;
+		}
+		break;
+	case ICMPV6_PKT_TOOBIG:
+		break;
+	case ICMPV6_TIME_EXCEED:
+		switch (code) {
+		case ICMPV6_EXC_HOPLIMIT:
+			break;
+		case ICMPV6_EXC_FRAGTIME:
+		default:
+			break;
+		}
+		break;
+	case ICMPV6_PARAMPROB:
+		switch (code) {
+		case ICMPV6_HDR_FIELD: break;
+		case ICMPV6_UNK_NEXTHDR: break;
+		case ICMPV6_UNK_OPTION: break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* xfrm_type.init_state: validate the state and set its header length to
+ * the size of one IPv6 header. */
+static int xfrm6_tunnel_init_state(struct xfrm_state *x)
+{
+	/* only tunnel mode without an encap template is accepted */
+	if (x->props.mode != XFRM_MODE_TUNNEL || x->encap)
+		return -EINVAL;
+
+	x->props.header_len = sizeof(struct ipv6hdr);
+
+	return 0;
+}
+
+/* xfrm_type.destructor: drop this state's reference on its saddr SPI. */
+static void xfrm6_tunnel_destroy(struct xfrm_state *x)
+{
+	xfrm6_tunnel_free_spi(xs_net(x),
+			      (xfrm_address_t *)&x->props.saddr);
+}
+
+static const struct xfrm_type xfrm6_tunnel_type = {
+	.description	= "IP6IP6",
+	.owner          = THIS_MODULE,
+	.proto		= IPPROTO_IPV6,
+	.init_state	= xfrm6_tunnel_init_state,
+	.destructor	= xfrm6_tunnel_destroy,	/* releases the SPI reference */
+	.input		= xfrm6_tunnel_input,
+	.output		= xfrm6_tunnel_output,
+};
+
+static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
+	.handler	= xfrm6_tunnel_rcv,	/* registered for AF_INET6 */
+	.err_handler	= xfrm6_tunnel_err,
+	.priority	= 2,
+};
+
+static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
+	.handler	= xfrm6_tunnel_rcv,	/* registered for AF_INET */
+	.err_handler	= xfrm6_tunnel_err,
+	.priority	= 2,
+};
+
+/* Per-namespace setup: empty both SPI hash tables and reset the SPI cursor. */
+static int __net_init xfrm6_tunnel_net_init(struct net *net)
+{
+	struct xfrm6_tunnel_net *tn = xfrm6_tunnel_pernet(net);
+	unsigned int i;
+
+	tn->spi = 0;
+	for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++)
+		INIT_HLIST_HEAD(&tn->spi_byspi[i]);
+	for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&tn->spi_byaddr[i]);
+	return 0;
+}
+/* Per-namespace teardown: intentionally empty, nothing is released here. */
+static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
+{
+}
+
+static struct pernet_operations xfrm6_tunnel_net_ops = {
+	.init	= xfrm6_tunnel_net_init,
+	.exit	= xfrm6_tunnel_net_exit,
+	.id	= &xfrm6_tunnel_net_id,	/* key used by xfrm6_tunnel_pernet() */
+	.size	= sizeof(struct xfrm6_tunnel_net),
+};
+/* Module load: create the SPI cache, then register pernet ops, type and handlers. */
+static int __init xfrm6_tunnel_init(void)
+{
+	int rv;
+
+	xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi",
+						  sizeof(struct xfrm6_tunnel_spi),
+						  0, SLAB_HWCACHE_ALIGN,
+						  NULL);
+	if (!xfrm6_tunnel_spi_kmem)
+		return -ENOMEM;
+	rv = register_pernet_subsys(&xfrm6_tunnel_net_ops);
+	if (rv < 0)
+		goto out_pernet;
+	rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6);
+	if (rv < 0)
+		goto out_type;
+	rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6);
+	if (rv < 0)
+		goto out_xfrm6;
+	rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET);
+	if (rv < 0)
+		goto out_xfrm46;
+	return 0;
+	/* error unwinding: each label undoes the steps that had succeeded */
+out_xfrm46:
+	xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
+out_xfrm6:
+	xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
+out_type:
+	unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
+out_pernet:
+	kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
+	return rv;
+}
+/* Module unload: undo xfrm6_tunnel_init() in reverse order. */
+static void __exit xfrm6_tunnel_fini(void)
+{
+	xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET);
+	xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
+	xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
+	unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
+	kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
+}
+
+module_init(xfrm6_tunnel_init);
+module_exit(xfrm6_tunnel_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6);
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
new file mode 100644
index 00000000..e9ad0062
--- /dev/null
+++ b/net/ipx/Kconfig
@@ -0,0 +1,60 @@
+#
+# IPX configuration
+#
+config IPX
+	tristate "The IPX protocol"
+	select LLC
+	---help---
+	  This is support for the Novell networking protocol, IPX, commonly
+	  used for local networks of Windows machines.  You need it if you
+	  want to access Novell NetWare file or print servers using the Linux
+	  Novell client ncpfs (available from
+	  <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
+	  within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>).  In order
+	  to do the former, you'll also have to say Y to "NCP file system
+	  support", below.
+
+	  IPX is similar in scope to IP, while SPX, which runs on top of IPX,
+	  is similar to TCP.
+
+	  To turn your Linux box into a fully featured NetWare file server and
+	  IPX router, say Y here and fetch either lwared from
+	  <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
+	  mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
+	  information, read the IPX-HOWTO available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  The IPX driver would enlarge your kernel by about 16 KB. To compile
+	  this driver as a module, choose M here: the module will be called ipx.
+	  Unless you want to integrate your Linux box with a local Novell
+	  network, say N.
+
+config IPX_INTERN
+	bool "IPX: Full internal IPX network"
+	depends on IPX
+	---help---
+	  Every IPX network has an address that identifies it. Sometimes it is
+	  useful to give an IPX "network" address to your Linux box as well
+	  (for example if your box is acting as a file server for different
+	  IPX networks: it will then be accessible from everywhere using the
+	  same address). The way this is done is to create a virtual internal
+	  "network" inside your box and to assign an IPX address to this
+	  network. Say Y here if you want to do this; read the IPX-HOWTO at
+	  <http://www.tldp.org/docs.html#howto> for details.
+
+	  The full internal IPX network enables you to allocate sockets on
+	  different virtual nodes of the internal network. This is done by
+	  evaluating the field sipx_node of the socket address given to the
+	  bind call. So applications should always initialize the node field
+	  to 0 when binding a socket on the primary network. In this case the
+	  socket is assigned the default node that has been given to the
+	  kernel when the internal network was created. By enabling the full
+	  internal IPX network the cross-forwarding of packets targeted at
+	  'special' sockets to sockets listening on the primary network is
+	  disabled. This might break existing applications, especially RIP/SAP
+	  daemons. A RIP/SAP daemon that works well with the full internal net
+	  can be found on <ftp://ftp.gwdg.de/pub/linux/misc/ncpfs/>.
+
+	  If you don't know what you are doing, say N.
+
diff --git a/net/ipx/Makefile b/net/ipx/Makefile
new file mode 100644
index 00000000..4b95e3ea
--- /dev/null
+++ b/net/ipx/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux IPX layer.
+#
+
+obj-$(CONFIG_IPX) += ipx.o
+
+ipx-y			:= af_ipx.o ipx_route.o ipx_proc.o
+ipx-$(CONFIG_SYSCTL)	+= sysctl_net_ipx.o
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
new file mode 100644
index 00000000..96802266
--- /dev/null
+++ b/net/ipx/af_ipx.c
@@ -0,0 +1,2092 @@
+/*
+ *	Implements an IPX socket layer.
+ *
+ *	This code is derived from work by
+ *		Ross Biro	: 	Writing the original IP stack
+ *		Fred Van Kempen :	Tidying up the TCP/IP
+ *
+ *	Many thanks go to Keith Baker, Institute For Industrial Information
+ *	Technology Ltd, Swansea University for allowing me to work on this
+ *	in my own time even though it was in some ways related to commercial
+ *	work I am currently employed to do there.
+ *
+ *	All the material in this file is subject to the Gnu license version 2.
+ *	Neither Alan Cox nor the Swansea University Computer Society admit
+ *	liability nor provide warranty for any of this software. This material
+ *	is provided as is and at no charge.
+ *
+ *	Portions Copyright (c) 2000-2003 Conectiva, Inc. <acme@conectiva.com.br>
+ *	Neither Arnaldo Carvalho de Melo nor Conectiva, Inc. admit liability nor
+ *	provide warranty for any of this software. This material is provided
+ *	"AS-IS" and at no charge.
+ *
+ * 	Portions Copyright (c) 1995 Caldera, Inc. <greg@caldera.com>
+ *	Neither Greg Page nor Caldera, Inc. admit liability nor provide
+ *	warranty for any of this software. This material is provided
+ *	"AS-IS" and at no charge.
+ *
+ *	See net/ipx/ChangeLog.
+ */
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/init.h>
+#include <linux/ipx.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/uio.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/termios.h>
+
+#include <net/ipx.h>
+#include <net/p8022.h>
+#include <net/psnap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_SYSCTL
+extern void ipx_register_sysctl(void);
+extern void ipx_unregister_sysctl(void);
+#else
+#define ipx_register_sysctl()
+#define ipx_unregister_sysctl()
+#endif
+
+/* Configuration Variables */
+static unsigned char ipxcfg_max_hops = 16;
+static char ipxcfg_auto_select_primary;
+static char ipxcfg_auto_create_interfaces;
+int sysctl_ipx_pprop_broadcasting = 1;
+
+/* Global Variables */
+static struct datalink_proto *p8022_datalink;
+static struct datalink_proto *pEII_datalink;
+static struct datalink_proto *p8023_datalink;
+static struct datalink_proto *pSNAP_datalink;
+
+static const struct proto_ops ipx_dgram_ops;
+
+LIST_HEAD(ipx_interfaces);
+DEFINE_SPINLOCK(ipx_interfaces_lock);
+
+struct ipx_interface *ipx_primary_net;
+struct ipx_interface *ipx_internal_net;
+
+extern int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc,
+			    unsigned char *node);
+extern void ipxrtr_del_routes(struct ipx_interface *intrfc);
+extern int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
+			       struct iovec *iov, size_t len, int noblock);
+extern int ipxrtr_route_skb(struct sk_buff *skb);
+extern struct ipx_route *ipxrtr_lookup(__be32 net);
+extern int ipxrtr_ioctl(unsigned int cmd, void __user *arg);
+
+/* Return the first entry of ipx_interfaces, or NULL when the list is empty.
+ * No lock and no reference is taken here. */
+struct ipx_interface *ipx_interfaces_head(void)
+{
+	if (list_empty(&ipx_interfaces))
+		return NULL;
+
+	return list_entry(ipx_interfaces.next, struct ipx_interface, node);
+}
+
+/* Store the auto-select-primary setting.  When it is being enabled and no
+ * primary network exists yet, the head of the interface list (possibly NULL)
+ * becomes the primary network. */
+static void ipxcfg_set_auto_select(char val)
+{
+	ipxcfg_auto_select_primary = val;
+
+	if (!val || ipx_primary_net)
+		return;
+
+	ipx_primary_net = ipx_interfaces_head();
+}
+
+/* Copy the current auto-create / auto-select settings to user space.
+ * Returns 0 on success or -EFAULT when the copy to arg fails. */
+static int ipxcfg_get_config_data(struct ipx_config_data __user *arg)
+{
+	struct ipx_config_data cfg;
+
+	cfg.ipxcfg_auto_select_primary	  = ipxcfg_auto_select_primary;
+	cfg.ipxcfg_auto_create_interfaces = ipxcfg_auto_create_interfaces;
+
+	if (copy_to_user(arg, &cfg, sizeof(cfg)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Note: Sockets may not be removed _during_ an interrupt or inet_bh
+ * handler using this technique. They can be added although we do not
+ * use this facility.
+ */
+
+/* Unlink sk from the socket list of the interface it is bound to, if any. */
+static void ipx_remove_socket(struct sock *sk)
+{
+	struct ipx_interface *itf = ipx_sk(sk)->intrfc;
+
+	/* Not bound to an interface: nothing to unlink */
+	if (!itf)
+		return;
+
+	/* Pin the interface while its socket list is being modified */
+	ipxitf_hold(itf);
+	spin_lock_bh(&itf->if_sklist_lock);
+	sk_del_node_init(sk);
+	spin_unlock_bh(&itf->if_sklist_lock);
+	ipxitf_put(itf);
+}
+
+/* Detach sk from its interface, discard everything still queued on its
+ * receive queue and call sk_refcnt_debug_dec() for it. */
+static void ipx_destroy_socket(struct sock *sk)
+{
+	ipx_remove_socket(sk);
+	skb_queue_purge(&sk->sk_receive_queue);
+	sk_refcnt_debug_dec(sk);
+}
+
+/*
+ * The following code is used to support IPX Interfaces (IPXITF).  An
+ * IPX interface is defined by a physical device and a frame type.
+ */
+
+/*
+ * ipxitf_clear_primary_net has to be called with ipx_interfaces_lock held.
+ *
+ * Forget the current primary network.  With auto-selection enabled the head
+ * of the interface list (NULL if the list is empty) takes its place.
+ */
+static void ipxitf_clear_primary_net(void)
+{
+	if (ipxcfg_auto_select_primary)
+		ipx_primary_net = ipx_interfaces_head();
+	else
+		ipx_primary_net = NULL;
+}
+
+/* Find the interface bound to device dev with frame type datalink.  Takes no
+ * lock and no reference itself; returns NULL when nothing matches. */
+static struct ipx_interface *__ipxitf_find_using_phys(struct net_device *dev,
+						      __be16 datalink)
+{
+	struct ipx_interface *itf;
+
+	list_for_each_entry(itf, &ipx_interfaces, node) {
+		if (itf->if_dev != dev)
+			continue;
+		if (itf->if_dlink_type == datalink)
+			return itf;
+	}
+
+	return NULL;
+}
+
+/* Locked variant of __ipxitf_find_using_phys(): the lookup runs under
+ * ipx_interfaces_lock and a match is returned with a reference held, which
+ * the caller has to drop.  Returns NULL when nothing matches. */
+static struct ipx_interface *ipxitf_find_using_phys(struct net_device *dev,
+						    __be16 datalink)
+{
+	struct ipx_interface *i;
+
+	spin_lock_bh(&ipx_interfaces_lock);
+	i = __ipxitf_find_using_phys(dev, datalink);
+	if (i)
+		ipxitf_hold(i);
+	spin_unlock_bh(&ipx_interfaces_lock);
+	return i;
+}
+
+/*
+ * Find the interface with network number net; a net of 0 selects the primary
+ * network.  The search runs under ipx_interfaces_lock.  A match is returned
+ * with a reference held (to be dropped by the caller), otherwise NULL.
+ */
+struct ipx_interface *ipxitf_find_using_net(__be32 net)
+{
+	struct ipx_interface *found = NULL;
+	struct ipx_interface *itf;
+
+	spin_lock_bh(&ipx_interfaces_lock);
+	if (!net) {
+		found = ipx_primary_net;
+	} else {
+		list_for_each_entry(itf, &ipx_interfaces, node) {
+			if (itf->if_netnum == net) {
+				found = itf;
+				break;
+			}
+		}
+	}
+	if (found)
+		ipxitf_hold(found);
+	spin_unlock_bh(&ipx_interfaces_lock);
+
+	return found;
+}
+
+/* Sockets are bound to a particular IPX interface: record intrfc in the
+ * socket and add the socket to the interface's socket list, both under
+ * if_sklist_lock.  The interface is pinned with a temporary reference for
+ * the duration of the update. */
+static void ipxitf_insert_socket(struct ipx_interface *intrfc, struct sock *sk)
+{
+	ipxitf_hold(intrfc);
+	spin_lock_bh(&intrfc->if_sklist_lock);
+	ipx_sk(sk)->intrfc = intrfc;
+	sk_add_node(sk, &intrfc->if_sklist);
+	spin_unlock_bh(&intrfc->if_sklist_lock);
+	ipxitf_put(intrfc);
+}
+
+/* Caller must hold intrfc->if_sklist_lock.
+ *
+ * Return the socket on intrfc bound to port, or NULL if there is none.  No
+ * reference is taken on the returned socket. */
+static struct sock *__ipxitf_find_socket(struct ipx_interface *intrfc,
+					 __be16 port)
+{
+	struct sock *sk;
+	struct hlist_node *pos;
+
+	sk_for_each(sk, pos, &intrfc->if_sklist) {
+		if (ipx_sk(sk)->port == port)
+			return sk;
+	}
+
+	return NULL;
+}
+
+/* caller must hold a reference to intrfc
+ *
+ * Look up the socket bound to port on intrfc under if_sklist_lock.  A match
+ * is returned with sock_hold() applied, so the caller has to sock_put() it;
+ * NULL is returned when no socket uses that port. */
+static struct sock *ipxitf_find_socket(struct ipx_interface *intrfc,
+					__be16 port)
+{
+	struct sock *s;
+
+	spin_lock_bh(&intrfc->if_sklist_lock);
+	s = __ipxitf_find_socket(intrfc, port);
+	if (s)
+		sock_hold(s);
+	spin_unlock_bh(&intrfc->if_sklist_lock);
+
+	return s;
+}
+
+#ifdef CONFIG_IPX_INTERN
+/* Find the socket on intrfc that is bound to both the node address ipx_node
+ * and port.  The interface is pinned and its socket list locked during the
+ * search.  Returns the socket or NULL.
+ * NOTE(review): unlike ipxitf_find_socket(), no sock_hold() is done on the
+ * match, so the returned pointer is unprotected once the lock is dropped -
+ * confirm callers only test it against NULL. */
+static struct sock *ipxitf_find_internal_socket(struct ipx_interface *intrfc,
+						unsigned char *ipx_node,
+						__be16 port)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	ipxitf_hold(intrfc);
+	spin_lock_bh(&intrfc->if_sklist_lock);
+
+	sk_for_each(s, node, &intrfc->if_sklist) {
+		struct ipx_sock *ipxs = ipx_sk(s);
+
+		if (ipxs->port == port &&
+		    !memcmp(ipx_node, ipxs->node, IPX_NODE_LEN))
+			goto found;
+	}
+	s = NULL;
+found:
+	spin_unlock_bh(&intrfc->if_sklist_lock);
+	ipxitf_put(intrfc);
+	return s;
+}
+#endif
+
+/*
+ * Tear down and free an interface: delete its routes, detach every bound
+ * socket (reporting ENOLINK to it), unlink the interface from ipx_interfaces,
+ * clear the primary/internal pointers that referenced it, drop the device
+ * reference and free the structure.  The callers visible in this file invoke
+ * it with ipx_interfaces_lock held.
+ */
+static void __ipxitf_down(struct ipx_interface *intrfc)
+{
+	struct sock *s;
+	struct hlist_node *node, *t;
+
+	/* Delete all routes associated with this interface */
+	ipxrtr_del_routes(intrfc);
+
+	spin_lock_bh(&intrfc->if_sklist_lock);
+	/* error sockets */
+	sk_for_each_safe(s, node, t, &intrfc->if_sklist) {
+		struct ipx_sock *ipxs = ipx_sk(s);
+
+		s->sk_err = ENOLINK;
+		s->sk_error_report(s);
+		ipxs->intrfc = NULL;
+		ipxs->port   = 0;
+		sock_set_flag(s, SOCK_ZAPPED); /* Indicates it is no longer bound */
+		sk_del_node_init(s);
+	}
+	INIT_HLIST_HEAD(&intrfc->if_sklist);
+	spin_unlock_bh(&intrfc->if_sklist_lock);
+
+	/* remove this interface from list */
+	list_del(&intrfc->node);
+
+	/* remove this interface from *special* networks */
+	if (intrfc == ipx_primary_net)
+		ipxitf_clear_primary_net();
+	if (intrfc == ipx_internal_net)
+		ipx_internal_net = NULL;
+
+	/* internal interfaces are created without a device */
+	if (intrfc->if_dev)
+		dev_put(intrfc->if_dev);
+	kfree(intrfc);
+}
+
+/* Locked wrapper: tear down intrfc via __ipxitf_down() while holding
+ * ipx_interfaces_lock. */
+void ipxitf_down(struct ipx_interface *intrfc)
+{
+	spin_lock_bh(&ipx_interfaces_lock);
+	__ipxitf_down(intrfc);
+	spin_unlock_bh(&ipx_interfaces_lock);
+}
+
+/* Drop one reference on intrfc; when it was the last one the interface is
+ * torn down through __ipxitf_down() directly.  The callers visible in this
+ * file already hold ipx_interfaces_lock. */
+static __inline__ void __ipxitf_put(struct ipx_interface *intrfc)
+{
+	if (atomic_dec_and_test(&intrfc->refcnt))
+		__ipxitf_down(intrfc);
+}
+
+/*
+ * Netdevice notifier callback.  Only NETDEV_UP and NETDEV_DOWN events for
+ * devices in init_net are acted upon: every IPX interface on the device gains
+ * a reference on UP and loses one on DOWN (which may tear the interface
+ * down).  Always returns NOTIFY_DONE.
+ */
+static int ipxitf_device_event(struct notifier_block *notifier,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct ipx_interface *i, *tmp;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	if (event != NETDEV_DOWN && event != NETDEV_UP)
+		goto out;
+
+	spin_lock_bh(&ipx_interfaces_lock);
+	/* _safe iteration: __ipxitf_put() may unlink and free the entry */
+	list_for_each_entry_safe(i, tmp, &ipx_interfaces, node)
+		if (i->if_dev == dev) {
+			if (event == NETDEV_UP)
+				ipxitf_hold(i);
+			else
+				__ipxitf_put(i);
+		}
+	spin_unlock_bh(&ipx_interfaces_lock);
+out:
+	return NOTIFY_DONE;
+}
+
+
+/* Exit-time helper: drop one reference on every interface while holding
+ * ipx_interfaces_lock; interfaces whose count reaches zero are torn down. */
+static __exit void ipxitf_cleanup(void)
+{
+	struct ipx_interface *i, *tmp;
+
+	spin_lock_bh(&ipx_interfaces_lock);
+	/* _safe iteration: __ipxitf_put() may unlink and free the entry */
+	list_for_each_entry_safe(i, tmp, &ipx_interfaces, node)
+		__ipxitf_put(i);
+	spin_unlock_bh(&ipx_interfaces_lock);
+}
+
+/* Queue skb on the receive queue of sock; free it when queueing fails. */
+static void ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb)
+{
+	int err = sock_queue_rcv_skb(sock, skb);
+
+	if (err < 0)
+		kfree_skb(skb);
+}
+
+/*
+ * On input skb->sk is NULL. Nobody is charged for the memory.
+ */
+
+/* caller must hold a reference to intrfc */
+
+#ifdef CONFIG_IPX_INTERN
+/*
+ * CONFIG_IPX_INTERN variant.  Deliver skb to every socket on intrfc whose
+ * port matches the destination socket and whose node matches the destination
+ * node (any node for a broadcast).  With copy == 0 the skb itself is
+ * consumed: it goes to the first match, or is freed when nothing matches;
+ * later matches, and all matches when copy != 0, receive clones.  On an
+ * interface other than ipx_internal_net delivery stops after the first
+ * match.  Returns 0, or -ENOMEM when a clone could not be allocated.
+ */
+static int ipxitf_demux_socket(struct ipx_interface *intrfc,
+			       struct sk_buff *skb, int copy)
+{
+	struct ipxhdr *ipx = ipx_hdr(skb);
+	int is_broadcast = !memcmp(ipx->ipx_dest.node, ipx_broadcast_node,
+				   IPX_NODE_LEN);
+	struct sock *s;
+	struct hlist_node *node;
+	int rc;
+
+	spin_lock_bh(&intrfc->if_sklist_lock);
+
+	sk_for_each(s, node, &intrfc->if_sklist) {
+		struct ipx_sock *ipxs = ipx_sk(s);
+
+		if (ipxs->port == ipx->ipx_dest.sock &&
+		    (is_broadcast || !memcmp(ipx->ipx_dest.node,
+					     ipxs->node, IPX_NODE_LEN))) {
+			/* We found a socket to which to send */
+			struct sk_buff *skb1;
+
+			if (copy) {
+				skb1 = skb_clone(skb, GFP_ATOMIC);
+				rc = -ENOMEM;
+				if (!skb1)
+					goto out;
+			} else {
+				skb1 = skb;
+				copy = 1; /* skb may only be used once */
+			}
+			ipxitf_def_skb_handler(s, skb1);
+
+			/* On an external interface, one socket can listen */
+			if (intrfc != ipx_internal_net)
+				break;
+		}
+	}
+
+	/* skb was solely for us, and we did not make a copy, so free it. */
+	if (!copy)
+		kfree_skb(skb);
+
+	rc = 0;
+out:
+	spin_unlock_bh(&intrfc->if_sklist_lock);
+	return rc;
+}
+#else
+/*
+ * Map an NCP request or BURST packet to the socket registered for its NCP
+ * connection number on intrfc.  Returns that socket with a reference held
+ * (sock_hold), or NULL when the packet is of neither kind, the connection
+ * number is 0, or no socket has that ipx_ncp_conn.
+ * NOTE(review): bytes up to offset 9 past the IPX header are read without a
+ * length check in this function - presumably validated earlier; confirm.
+ */
+static struct sock *ncp_connection_hack(struct ipx_interface *intrfc,
+					struct ipxhdr *ipx)
+{
+	/* The packet's target is a NCP connection handler. We want to hand it
+	 * to the correct socket directly within the kernel, so that the
+	 * mars_nwe packet distribution process does not have to do it. Here we
+	 * only care about NCP and BURST packets.
+	 *
+	 * You might call this a hack, but believe me, you do not want a
+	 * complete NCP layer in the kernel, and this is VERY fast as well. */
+	struct sock *sk = NULL;
+	int connection = 0;
+	u8 *ncphdr = (u8 *)(ipx + 1);
+
+	/* connection number: high byte << 8 | low byte, taken from
+	 * type-specific offsets in the payload */
+	if (*ncphdr == 0x22 && *(ncphdr + 1) == 0x22) /* NCP request */
+		connection = (((int) *(ncphdr + 5)) << 8) | (int) *(ncphdr + 3);
+	else if (*ncphdr == 0x77 && *(ncphdr + 1) == 0x77) /* BURST packet */
+		connection = (((int) *(ncphdr + 9)) << 8) | (int) *(ncphdr + 8);
+
+	if (connection) {
+		struct hlist_node *node;
+		/* Now we have to look for a special NCP connection handling
+		 * socket. Only these sockets have ipx_ncp_conn != 0, set by
+		 * SIOCIPXNCPCONN. */
+		spin_lock_bh(&intrfc->if_sklist_lock);
+		sk_for_each(sk, node, &intrfc->if_sklist)
+			if (ipx_sk(sk)->ipx_ncp_conn == connection) {
+				sock_hold(sk);
+				goto found;
+			}
+		sk = NULL;
+	found:
+		spin_unlock_bh(&intrfc->if_sklist_lock);
+	}
+	return sk;
+}
+
+/*
+ * Variant used without CONFIG_IPX_INTERN.  Deliver skb to at most two
+ * sockets: sock1, the socket on intrfc for the destination port (or the NCP
+ * connection socket when the packet arrives on the primary net for port
+ * 0x451), and sock2, the socket on the primary net for the same port when
+ * the packet came in on another interface and targets SAP/RIP/Diagnostic.
+ * With copy == 0 the skb is consumed (delivered or freed); with copy != 0
+ * only clones are delivered.  Returns 0, or -ENOMEM when a needed clone
+ * could not be allocated.
+ */
+static int ipxitf_demux_socket(struct ipx_interface *intrfc,
+			       struct sk_buff *skb, int copy)
+{
+	struct ipxhdr *ipx = ipx_hdr(skb);
+	struct sock *sock1 = NULL, *sock2 = NULL;
+	struct sk_buff *skb1 = NULL, *skb2 = NULL;
+	int rc;
+
+	if (intrfc == ipx_primary_net && ntohs(ipx->ipx_dest.sock) == 0x451)
+		sock1 = ncp_connection_hack(intrfc, ipx);
+	if (!sock1)
+		/* No special socket found, forward the packet the normal way */
+		sock1 = ipxitf_find_socket(intrfc, ipx->ipx_dest.sock);
+
+	/*
+	 * We need to check if there is a primary net and if
+	 * this is addressed to one of the *SPECIAL* sockets because
+	 * these need to be propagated to the primary net.
+	 * The *SPECIAL* socket list contains: 0x452(SAP), 0x453(RIP) and
+	 * 0x456(Diagnostic).
+	 */
+
+	if (ipx_primary_net && intrfc != ipx_primary_net) {
+		const int dsock = ntohs(ipx->ipx_dest.sock);
+
+		if (dsock == 0x452 || dsock == 0x453 || dsock == 0x456)
+			/* The appropriate thing to do here is to dup the
+			 * packet and route to the primary net interface via
+			 * ipxitf_send; however, we'll cheat and just demux it
+			 * here. */
+			sock2 = ipxitf_find_socket(ipx_primary_net,
+							ipx->ipx_dest.sock);
+	}
+
+	/*
+	 * If there is nothing to do return. The kfree will cancel any charging.
+	 */
+	rc = 0;
+	if (!sock1 && !sock2) {
+		if (!copy)
+			kfree_skb(skb);
+		goto out;
+	}
+
+	/*
+	 * This next segment of code is a little awkward, but it sets it up
+	 * so that the appropriate number of copies of the SKB are made and
+	 * that skb1 and skb2 point to it (them) so that it (they) can be
+	 * demuxed to sock1 and/or sock2.  If we are unable to make enough
+	 * copies, we do as much as is possible.
+	 */
+
+	if (copy)
+		skb1 = skb_clone(skb, GFP_ATOMIC);
+	else
+		skb1 = skb;
+
+	rc = -ENOMEM;
+	if (!skb1)
+		goto out_put;
+
+	/* Do we need 2 SKBs? */
+	if (sock1 && sock2)
+		skb2 = skb_clone(skb1, GFP_ATOMIC);
+	else
+		skb2 = skb1;
+
+	if (sock1)
+		ipxitf_def_skb_handler(sock1, skb1);
+
+	/* second clone failed: sock1 was served, report -ENOMEM */
+	if (!skb2)
+		goto out_put;
+
+	if (sock2)
+		ipxitf_def_skb_handler(sock2, skb2);
+
+	rc = 0;
+out_put:
+	/* drop the references taken by the socket lookups above */
+	if (sock1)
+		sock_put(sock1);
+	if (sock2)
+		sock_put(sock2);
+out:
+	return rc;
+}
+#endif	/* CONFIG_IPX_INTERN */
+
+/*
+ * Make sure the IPX header sits at least intrfc->if_ipx_offset bytes past
+ * skb->head.  If it already does, skb is returned unchanged.  Otherwise a new
+ * skb with that much reserved space is allocated, skb->len bytes starting at
+ * the IPX header and the control block are copied into it, and the original
+ * is freed.  Returns the skb to use, or NULL when the allocation failed (the
+ * original has been freed in that case as well).
+ */
+static struct sk_buff *ipxitf_adjust_skbuff(struct ipx_interface *intrfc,
+					    struct sk_buff *skb)
+{
+	struct sk_buff *skb2;
+	int in_offset = (unsigned char *)ipx_hdr(skb) - skb->head;
+	int out_offset = intrfc->if_ipx_offset;
+	int len;
+
+	/* Hopefully, most cases */
+	if (in_offset >= out_offset)
+		return skb;
+
+	/* Need new SKB */
+	len  = skb->len + out_offset;
+	skb2 = alloc_skb(len, GFP_ATOMIC);
+	if (skb2) {
+		skb_reserve(skb2, out_offset);
+		skb_reset_network_header(skb2);
+		skb_reset_transport_header(skb2);
+		skb_put(skb2, skb->len);
+		memcpy(ipx_hdr(skb2), ipx_hdr(skb), skb->len);
+		memcpy(skb2->cb, skb->cb, sizeof(skb->cb));
+	}
+	kfree_skb(skb);
+	return skb2;
+}
+
+/*
+ * Send skb out through intrfc towards the IPX node address node.
+ *
+ * The caller must hold a reference to intrfc and the skb has to be unshared.
+ *
+ * The header fields kept in the skb control block (tctrl, source and
+ * destination net) are written back into the packet first.  Packets for this
+ * node are looped back to the local sockets, broadcasts on our own net are
+ * delivered locally and, if the interface has a wire, sent out too.  Routed
+ * packets get their hop count incremented and are dropped once it exceeds
+ * ipxcfg_max_hops.  In every path visible here the skb is either handed on
+ * or freed.
+ *
+ * Returns the result of ipxitf_demux_socket() for packets looped back to our
+ * own node, 0 in all other cases.
+ */
+int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node)
+{
+	struct ipxhdr *ipx = ipx_hdr(skb);
+	struct net_device *dev = intrfc->if_dev;
+	struct datalink_proto *dl = intrfc->if_dlink;
+	char dest_node[IPX_NODE_LEN];
+	int send_to_wire = 1;
+	int addr_len;
+
+	ipx->ipx_tctrl = IPX_SKB_CB(skb)->ipx_tctrl;
+	ipx->ipx_dest.net = IPX_SKB_CB(skb)->ipx_dest_net;
+	ipx->ipx_source.net = IPX_SKB_CB(skb)->ipx_source_net;
+
+	/* see if we need to include the netnum in the route list */
+	if (IPX_SKB_CB(skb)->last_hop.index >= 0) {
+		__be32 *last_hop = (__be32 *)(((u8 *) skb->data) +
+				sizeof(struct ipxhdr) +
+				IPX_SKB_CB(skb)->last_hop.index *
+				sizeof(__be32));
+		*last_hop = IPX_SKB_CB(skb)->last_hop.netnum;
+		IPX_SKB_CB(skb)->last_hop.index = -1;
+	}
+
+	/*
+	 * We need to know how many skbuffs it will take to send out this
+	 * packet to avoid unnecessary copies.
+	 */
+
+	if (!dl || !dev || dev->flags & IFF_LOOPBACK)
+		send_to_wire = 0;	/* No non looped */
+
+	/*
+	 * See if this should be demuxed to sockets on this interface
+	 *
+	 * We want to ensure the original was eaten or that we only use
+	 * up clones.
+	 */
+
+	if (ipx->ipx_dest.net == intrfc->if_netnum) {
+		/*
+		 * To our own node, loop and free the original.
+		 * The internal net will receive on all node address.
+		 */
+		if (intrfc == ipx_internal_net ||
+		    !memcmp(intrfc->if_node, node, IPX_NODE_LEN)) {
+			/* Don't charge sender */
+			skb_orphan(skb);
+
+			/* Will charge receiver */
+			return ipxitf_demux_socket(intrfc, skb, 0);
+		}
+
+		/* Broadcast, loop and possibly keep to send on. */
+		if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) {
+			if (!send_to_wire)
+				skb_orphan(skb);
+			ipxitf_demux_socket(intrfc, skb, send_to_wire);
+			if (!send_to_wire)
+				goto out;
+		}
+	}
+
+	/*
+	 * If the originating net is not equal to our net; this is routed
+	 * We are still charging the sender. Which is right - the driver
+	 * free will handle this fairly.
+	 */
+	if (ipx->ipx_source.net != intrfc->if_netnum) {
+		/*
+		 * Unshare the buffer before modifying the count in
+		 * case it's a flood or tcpdump
+		 */
+		skb = skb_unshare(skb, GFP_ATOMIC);
+		if (!skb)
+			goto out;
+		/*
+		 * skb_unshare() may have handed back a private copy; the
+		 * header pointer taken at entry would then still reference
+		 * the old, shared data.  Re-read it so the hop count is
+		 * bumped in the buffer that is actually sent.
+		 */
+		ipx = ipx_hdr(skb);
+		if (++ipx->ipx_tctrl > ipxcfg_max_hops)
+			send_to_wire = 0;
+	}
+
+	if (!send_to_wire) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	/* Determine the appropriate hardware address */
+	addr_len = dev->addr_len;
+	if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN))
+		memcpy(dest_node, dev->broadcast, addr_len);
+	else
+		memcpy(dest_node, &(node[IPX_NODE_LEN-addr_len]), addr_len);
+
+	/* Make any compensation for differing physical/data link size */
+	skb = ipxitf_adjust_skbuff(intrfc, skb);
+	if (!skb)
+		goto out;
+
+	/* set up data link and physical headers */
+	skb->dev	= dev;
+	skb->protocol	= htons(ETH_P_IPX);
+
+	/* Send it out */
+	dl->request(dl, skb, dest_node);
+out:
+	return 0;
+}
+
+/* Add a route for the interface's own network number through intrfc, with no
+ * router node (NULL).  Returns the result of ipxrtr_add_route(). */
+static int ipxitf_add_local_route(struct ipx_interface *intrfc)
+{
+	return ipxrtr_add_route(intrfc->if_netnum, intrfc, NULL);
+}
+
+static void ipxitf_discover_netnum(struct ipx_interface *intrfc,
+				   struct sk_buff *skb);
+static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb);
+
+/*
+ * Process a packet received on intrfc.  The interface is pinned for the
+ * duration.  In order: learn the interface's network number if still
+ * unknown, run packet-propagation handling for IPX_TYPE_PPROP packets, fill
+ * in zero source/destination nets with our own, route packets addressed to
+ * another network (PACKET_HOST only), deliver packets for the broadcast node
+ * or our node to local sockets, and drop everything else.
+ *
+ * Returns the error from ipxitf_pprop(), or the result of
+ * ipxrtr_route_skb() / ipxitf_demux_socket(), or 0 when the packet was
+ * simply dropped.
+ */
+static int ipxitf_rcv(struct ipx_interface *intrfc, struct sk_buff *skb)
+{
+	struct ipxhdr *ipx = ipx_hdr(skb);
+	int rc = 0;
+
+	ipxitf_hold(intrfc);
+
+	/* See if we should update our network number */
+	if (!intrfc->if_netnum) /* net number of intrfc not known yet */
+		ipxitf_discover_netnum(intrfc, skb);
+
+	IPX_SKB_CB(skb)->last_hop.index = -1;
+	if (ipx->ipx_type == IPX_TYPE_PPROP) {
+		rc = ipxitf_pprop(intrfc, skb);
+		if (rc)
+			goto out_free_skb;
+	}
+
+	/* local processing follows */
+	if (!IPX_SKB_CB(skb)->ipx_dest_net)
+		IPX_SKB_CB(skb)->ipx_dest_net = intrfc->if_netnum;
+	if (!IPX_SKB_CB(skb)->ipx_source_net)
+		IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum;
+
+	/* it doesn't make sense to route a pprop packet, there's no meaning
+	 * in the ipx_dest_net for such packets */
+	if (ipx->ipx_type != IPX_TYPE_PPROP &&
+	    intrfc->if_netnum != IPX_SKB_CB(skb)->ipx_dest_net) {
+		/* We only route point-to-point packets. */
+		if (skb->pkt_type == PACKET_HOST) {
+			/* ipx is not used past this point, so it is fine
+			 * that skb may now be a different buffer */
+			skb = skb_unshare(skb, GFP_ATOMIC);
+			if (skb)
+				rc = ipxrtr_route_skb(skb);
+			goto out_intrfc;
+		}
+
+		goto out_free_skb;
+	}
+
+	/* see if we should keep it */
+	if (!memcmp(ipx_broadcast_node, ipx->ipx_dest.node, IPX_NODE_LEN) ||
+	    !memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN)) {
+		rc = ipxitf_demux_socket(intrfc, skb, 0);
+		goto out_intrfc;
+	}
+
+	/* we couldn't pawn it off so unload it */
+out_free_skb:
+	kfree_skb(skb);
+out_intrfc:
+	ipxitf_put(intrfc);
+	return rc;
+}
+
+/*
+ * Try to learn the network number of intrfc from a received packet.  Only
+ * packets whose source and destination net are equal and non-zero are used.
+ * If no existing interface has that number it is adopted and a local route
+ * is added; otherwise a collision warning is logged and intrfc is left
+ * unchanged.
+ */
+static void ipxitf_discover_netnum(struct ipx_interface *intrfc,
+				   struct sk_buff *skb)
+{
+	const struct ipx_cb *cb = IPX_SKB_CB(skb);
+
+	/* see if this is an intra packet: source_net == dest_net */
+	if (cb->ipx_source_net == cb->ipx_dest_net && cb->ipx_source_net) {
+		struct ipx_interface *i =
+				ipxitf_find_using_net(cb->ipx_source_net);
+		/* NB: NetWare servers lie about their hop count so we
+		 * dropped the test based on it. This is the best way
+		 * to determine this is a 0 hop count packet. */
+		if (!i) {
+			intrfc->if_netnum = cb->ipx_source_net;
+			ipxitf_add_local_route(intrfc);
+		} else {
+			printk(KERN_WARNING "IPX: Network number collision "
+				"%lx\n        %s %s and %s %s\n",
+				(unsigned long) ntohl(cb->ipx_source_net),
+				ipx_device_name(i),
+				ipx_frame_name(i->if_dlink_type),
+				ipx_device_name(intrfc),
+				ipx_frame_name(intrfc->if_dlink_type));
+			/* drop the reference taken by the lookup */
+			ipxitf_put(i);
+		}
+	}
+}
+
+/**
+ * ipxitf_pprop - Process packet propagation IPX packet type 0x14, used for
+ * 		  NetBIOS broadcasts
+ * @intrfc: IPX interface receiving this packet
+ * @skb: Received packet
+ *
+ * Checks if the packet is valid: if it has more than %IPX_MAX_PPROP_HOPS hops
+ * or is smaller than an IPX header plus the room for %IPX_MAX_PPROP_HOPS hops
+ * we drop it, not even processing it locally; if it has exactly
+ * %IPX_MAX_PPROP_HOPS hops we don't broadcast it, but process it locally. See
+ * chapter 5 of Novell's "IPX RIP and SAP Router Specification", Part Number
+ * 107-000029-001.
+ *
+ * If it is valid, check if we have pprop broadcasting enabled by the user,
+ * if not, just return zero for local processing.
+ *
+ * If it is enabled check the packet and don't broadcast it if we have already
+ * seen this packet.
+ *
+ * Broadcast: send it to the interfaces that aren't on the packet visited nets
+ * array, just after the IPX header.
+ *
+ * Returns -EINVAL for invalid packets, so that the calling function drops
+ * the packet without local processing. 0 if packet is to be locally processed.
+ */
+static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb)
+{
+	struct ipxhdr *ipx = ipx_hdr(skb);
+	int i, rc = -EINVAL;
+	struct ipx_interface *ifcs;
+	char *c;
+	__be32 *l;
+
+	/* Illegal packet - too many hops or too short */
+	/* We decide to throw it away: no broadcasting, no local processing.
+	 * NetBIOS unaware implementations route them as normal packets -
+	 * tctrl <= 15, any data payload... */
+	if (IPX_SKB_CB(skb)->ipx_tctrl > IPX_MAX_PPROP_HOPS ||
+	    ntohs(ipx->ipx_pktsize) < sizeof(struct ipxhdr) +
+					IPX_MAX_PPROP_HOPS * sizeof(u32))
+		goto out;
+	/* are we broadcasting this damn thing? */
+	rc = 0;
+	if (!sysctl_ipx_pprop_broadcasting)
+		goto out;
+	/* We do not broadcast the packet on the IPX_MAX_PPROP_HOPS hop, but
+	 * we process it locally. All previous hops broadcast it and processed
+	 * it locally. */
+	if (IPX_SKB_CB(skb)->ipx_tctrl == IPX_MAX_PPROP_HOPS)
+		goto out;
+
+	/* c points at the list of already visited network numbers that
+	 * directly follows the IPX header; tctrl entries are in use */
+	c = ((u8 *) ipx) + sizeof(struct ipxhdr);
+	l = (__be32 *) c;
+
+	/* Don't broadcast packet if already seen this net */
+	for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++)
+		if (*l++ == intrfc->if_netnum)
+			goto out;
+
+	/* < IPX_MAX_PPROP_HOPS hops && input interface not in list. Save the
+	 * position where we will insert recvd netnum into list, later on,
+	 * in ipxitf_send */
+	IPX_SKB_CB(skb)->last_hop.index = i;
+	IPX_SKB_CB(skb)->last_hop.netnum = intrfc->if_netnum;
+	/* xmit on all other interfaces... */
+	spin_lock_bh(&ipx_interfaces_lock);
+	list_for_each_entry(ifcs, &ipx_interfaces, node) {
+		/* Except unconfigured interfaces */
+		if (!ifcs->if_netnum)
+			continue;
+
+		/* That aren't in the list */
+		if (ifcs == intrfc)
+			continue;
+		l = (__be32 *) c;
+		/* don't consider the last entry in the packet list,
+		 * it is our netnum, and it is not there yet */
+		for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++)
+			if (ifcs->if_netnum == *l++)
+				break;
+		/* loop ran to completion: this net has not been visited */
+		if (i == IPX_SKB_CB(skb)->ipx_tctrl) {
+			struct sk_buff *s = skb_copy(skb, GFP_ATOMIC);
+
+			if (s) {
+				IPX_SKB_CB(s)->ipx_dest_net = ifcs->if_netnum;
+				ipxrtr_route_skb(s);
+			}
+		}
+	}
+	spin_unlock_bh(&ipx_interfaces_lock);
+out:
+	return rc;
+}
+
+/* Append intrfc to ipx_interfaces under ipx_interfaces_lock.  With
+ * auto-selection enabled and no primary network set, intrfc also becomes
+ * the primary network.
+ * NOTE(review): ipx_primary_net is tested and assigned after the lock has
+ * been dropped - confirm that this is intentional. */
+static void ipxitf_insert(struct ipx_interface *intrfc)
+{
+	spin_lock_bh(&ipx_interfaces_lock);
+	list_add_tail(&intrfc->node, &ipx_interfaces);
+	spin_unlock_bh(&ipx_interfaces_lock);
+
+	if (ipxcfg_auto_select_primary && !ipx_primary_net)
+		ipx_primary_net = intrfc;
+}
+
+/*
+ * Allocate (GFP_ATOMIC) and initialise an interface structure from the given
+ * parameters.  The reference count starts at 1, the socket list is empty and
+ * the next ephemeral socket number is IPX_MIN_EPHEMERAL_SOCKET.  The memory
+ * comes from kmalloc() and if_node is not set here; the callers in this file
+ * fill it in afterwards.  Returns NULL when the allocation fails.
+ */
+static struct ipx_interface *ipxitf_alloc(struct net_device *dev, __be32 netnum,
+					  __be16 dlink_type,
+					  struct datalink_proto *dlink,
+					  unsigned char internal,
+					  int ipx_offset)
+{
+	struct ipx_interface *intrfc = kmalloc(sizeof(*intrfc), GFP_ATOMIC);
+
+	if (intrfc) {
+		intrfc->if_dev		= dev;
+		intrfc->if_netnum	= netnum;
+		intrfc->if_dlink_type 	= dlink_type;
+		intrfc->if_dlink 	= dlink;
+		intrfc->if_internal 	= internal;
+		intrfc->if_ipx_offset 	= ipx_offset;
+		intrfc->if_sknum 	= IPX_MIN_EPHEMERAL_SOCKET;
+		INIT_HLIST_HEAD(&intrfc->if_sklist);
+		atomic_set(&intrfc->refcnt, 1);
+		spin_lock_init(&intrfc->if_sklist_lock);
+	}
+
+	return intrfc;
+}
+
+/*
+ * Create the internal network described by idef; it has no device and no
+ * datalink, and becomes both the internal and the primary network.
+ *
+ * Returns the result of adding the local route on success, otherwise
+ * -EEXIST (a primary network already exists), -EADDRNOTAVAIL (network
+ * number is 0), -EADDRINUSE (network number already in use) or -EAGAIN
+ * (allocation failed).
+ */
+static int ipxitf_create_internal(struct ipx_interface_definition *idef)
+{
+	struct ipx_interface *intrfc;
+	int rc = -EEXIST;
+
+	/* Only one primary network allowed */
+	if (ipx_primary_net)
+		goto out;
+
+	/* Must have a valid network number */
+	rc = -EADDRNOTAVAIL;
+	if (!idef->ipx_network)
+		goto out;
+	intrfc = ipxitf_find_using_net(idef->ipx_network);
+	rc = -EADDRINUSE;
+	if (intrfc) {
+		ipxitf_put(intrfc);
+		goto out;
+	}
+	intrfc = ipxitf_alloc(NULL, idef->ipx_network, 0, NULL, 1, 0);
+	rc = -EAGAIN;
+	if (!intrfc)
+		goto out;
+	memcpy((char *)&(intrfc->if_node), idef->ipx_node, IPX_NODE_LEN);
+	ipx_internal_net = ipx_primary_net = intrfc;
+	/* extra reference for the work below; dropped again before return */
+	ipxitf_hold(intrfc);
+	ipxitf_insert(intrfc);
+
+	rc = ipxitf_add_local_route(intrfc);
+	ipxitf_put(intrfc);
+out:
+	return rc;
+}
+
+/* Translate an IPX_FRAME_* value into the matching ETH_P_* protocol number
+ * in network byte order.  Returns 0 for any other frame type. */
+static __be16 ipx_map_frame_type(unsigned char type)
+{
+	switch (type) {
+	case IPX_FRAME_ETHERII:
+		return htons(ETH_P_IPX);
+	case IPX_FRAME_8022:
+		return htons(ETH_P_802_2);
+	case IPX_FRAME_SNAP:
+		return htons(ETH_P_SNAP);
+	case IPX_FRAME_8023:
+		return htons(ETH_P_802_3);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Create the IPX interface described by idef, or reuse the one that already
+ * exists for the same device and frame type, and add a local route when its
+ * network number is known.  IPX_INTERNAL requests are passed on to
+ * ipxitf_create_internal().
+ *
+ * Returns 0 or the result of adding the local route on success, otherwise
+ * -EEXIST (a primary network already exists), -EADDRINUSE (network number
+ * in use), -ENODEV (no such device), -EPROTONOSUPPORT (unknown frame type or
+ * no datalink handler for it), -ENETDOWN (device not up), -EINVAL (hardware
+ * address longer than an IPX node) or -EAGAIN (allocation failed).
+ */
+static int ipxitf_create(struct ipx_interface_definition *idef)
+{
+	struct net_device *dev;
+	__be16 dlink_type = 0;
+	struct datalink_proto *datalink = NULL;
+	struct ipx_interface *intrfc;
+	int rc;
+
+	if (idef->ipx_special == IPX_INTERNAL) {
+		rc = ipxitf_create_internal(idef);
+		goto out;
+	}
+
+	rc = -EEXIST;
+	if (idef->ipx_special == IPX_PRIMARY && ipx_primary_net)
+		goto out;
+
+	intrfc = ipxitf_find_using_net(idef->ipx_network);
+	rc = -EADDRINUSE;
+	if (idef->ipx_network && intrfc) {
+		ipxitf_put(intrfc);
+		goto out;
+	}
+
+	/* network 0 looked up the primary net; only the reference matters */
+	if (intrfc)
+		ipxitf_put(intrfc);
+
+	dev = dev_get_by_name(&init_net, idef->ipx_device);
+	rc = -ENODEV;
+	if (!dev)
+		goto out;
+
+	switch (idef->ipx_dlink_type) {
+	case IPX_FRAME_TR_8022:
+		printk(KERN_WARNING "IPX frame type 802.2TR is "
+			"obsolete Use 802.2 instead.\n");
+		/* fall through */
+	case IPX_FRAME_8022:
+		dlink_type 	= htons(ETH_P_802_2);
+		datalink 	= p8022_datalink;
+		break;
+	case IPX_FRAME_ETHERII:
+		if (dev->type != ARPHRD_IEEE802) {
+			dlink_type 	= htons(ETH_P_IPX);
+			datalink 	= pEII_datalink;
+			break;
+		} else
+			printk(KERN_WARNING "IPX frame type EtherII over "
+					"token-ring is obsolete. Use SNAP "
+					"instead.\n");
+		/* fall through */
+	case IPX_FRAME_SNAP:
+		dlink_type 	= htons(ETH_P_SNAP);
+		datalink 	= pSNAP_datalink;
+		break;
+	case IPX_FRAME_8023:
+		dlink_type 	= htons(ETH_P_802_3);
+		datalink 	= p8023_datalink;
+		break;
+	case IPX_FRAME_NONE:
+	default:
+		rc = -EPROTONOSUPPORT;
+		goto out_dev;
+	}
+
+	/* The datalink pointers are file-scope globals that are set up
+	 * elsewhere; refuse a frame type whose handler is missing instead of
+	 * dereferencing a NULL pointer further down. */
+	rc = -EPROTONOSUPPORT;
+	if (!datalink)
+		goto out_dev;
+
+	rc = -ENETDOWN;
+	if (!(dev->flags & IFF_UP))
+		goto out_dev;
+
+	/* Check addresses are suitable */
+	rc = -EINVAL;
+	if (dev->addr_len > IPX_NODE_LEN)
+		goto out_dev;
+
+	intrfc = ipxitf_find_using_phys(dev, dlink_type);
+	if (intrfc) {
+		/* The existing interface owns its own device reference (it
+		 * is released when the interface is torn down); drop the one
+		 * taken by dev_get_by_name() above so it does not leak. */
+		dev_put(dev);
+	} else {
+		/* Ok now create; on success the reference held on dev is
+		 * handed over to the new interface */
+		intrfc = ipxitf_alloc(dev, idef->ipx_network, dlink_type,
+				      datalink, 0, dev->hard_header_len +
+					datalink->header_length);
+		rc = -EAGAIN;
+		if (!intrfc)
+			goto out_dev;
+		/* Setup primary if necessary */
+		if (idef->ipx_special == IPX_PRIMARY)
+			ipx_primary_net = intrfc;
+		/* An all-zero node means: derive it from the hardware
+		 * address, right-aligned in the IPX node field */
+		if (!memcmp(idef->ipx_node, "\000\000\000\000\000\000",
+			    IPX_NODE_LEN)) {
+			memset(intrfc->if_node, 0, IPX_NODE_LEN);
+			memcpy(intrfc->if_node + IPX_NODE_LEN - dev->addr_len,
+				dev->dev_addr, dev->addr_len);
+		} else
+			memcpy(intrfc->if_node, idef->ipx_node, IPX_NODE_LEN);
+		/* matches the ipxitf_put() at out_intrfc */
+		ipxitf_hold(intrfc);
+		ipxitf_insert(intrfc);
+	}
+
+	/* If the network number is known, add a route */
+	rc = 0;
+	if (!intrfc->if_netnum)
+		goto out_intrfc;
+
+	rc = ipxitf_add_local_route(intrfc);
+out_intrfc:
+	ipxitf_put(intrfc);
+	goto out;
+out_dev:
+	dev_put(dev);
+out:
+	return rc;
+}
+
+/*
+ * Remove the interface described by @idef.
+ *
+ * Runs entirely under ipx_interfaces_lock.  Returns 0 on success, -ENOENT
+ * when the internal network is requested but none is configured,
+ * -EPROTONOSUPPORT for an unmappable frame type, -ENODEV when the device
+ * name is unknown and -EINVAL when no interface matches device/frame.
+ */
+static int ipxitf_delete(struct ipx_interface_definition *idef)
+{
+	struct ipx_interface *victim;
+	struct net_device *netdev;
+	__be16 frame;
+	int rc;
+
+	spin_lock_bh(&ipx_interfaces_lock);
+
+	if (idef->ipx_special == IPX_INTERNAL) {
+		if (ipx_internal_net) {
+			__ipxitf_put(ipx_internal_net);
+			rc = 0;
+		} else {
+			rc = -ENOENT;
+		}
+		goto unlock;
+	}
+
+	frame = ipx_map_frame_type(idef->ipx_dlink_type);
+	if (!frame) {
+		rc = -EPROTONOSUPPORT;
+		goto unlock;
+	}
+
+	netdev = __dev_get_by_name(&init_net, idef->ipx_device);
+	if (!netdev) {
+		rc = -ENODEV;
+		goto unlock;
+	}
+
+	victim = __ipxitf_find_using_phys(netdev, frame);
+	if (!victim) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
+	__ipxitf_put(victim);
+	rc = 0;
+unlock:
+	spin_unlock_bh(&ipx_interfaces_lock);
+	return rc;
+}
+
+/*
+ * Create and register an interface for @dev on the fly, using the datalink
+ * that matches frame type @dlink_type (network byte order).
+ *
+ * Returns the new interface, or NULL when @dev is NULL, its hardware
+ * address is longer than IPX_NODE_LEN, the frame type is not one of the
+ * four handled below, or ipxitf_alloc() fails.
+ */
+static struct ipx_interface *ipxitf_auto_create(struct net_device *dev,
+						__be16 dlink_type)
+{
+	struct ipx_interface *intrfc = NULL;
+	struct datalink_proto *datalink;
+
+	if (!dev)
+		goto out;
+
+	/* Check addresses are suitable */
+	if (dev->addr_len > IPX_NODE_LEN)
+		goto out;
+
+	/* Pick the datalink layer for this frame type */
+	switch (ntohs(dlink_type)) {
+	case ETH_P_IPX:		datalink = pEII_datalink;	break;
+	case ETH_P_802_2:	datalink = p8022_datalink;	break;
+	case ETH_P_SNAP:	datalink = pSNAP_datalink;	break;
+	case ETH_P_802_3:	datalink = p8023_datalink;	break;
+	default:		goto out;
+	}
+
+	/* Network number 0: the real number is not known yet */
+	intrfc = ipxitf_alloc(dev, 0, dlink_type, datalink, 0,
+				dev->hard_header_len + datalink->header_length);
+
+	if (intrfc) {
+		/* Node = device address, right-aligned and zero-padded */
+		memset(intrfc->if_node, 0, IPX_NODE_LEN);
+		memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]),
+			dev->dev_addr, dev->addr_len);
+		spin_lock_init(&intrfc->if_sklist_lock);
+		atomic_set(&intrfc->refcnt, 1);
+		ipxitf_insert(intrfc);
+		/* The interface keeps a reference on the device */
+		dev_hold(dev);
+	}
+
+out:
+	return intrfc;
+}
+
+/*
+ * Interface-level ioctl handler.
+ *
+ * SIOCSIFADDR      create or (sipx_action == IPX_DLTITF) delete an interface
+ * SIOCGIFADDR      report network/node of the interface on a device
+ * SIOCAIPXITFCRT   set the "auto create interfaces" flag
+ * SIOCAIPXPRISLT   set the "auto select primary" flag
+ *
+ * Returns 0 or a negative errno; unknown commands yield -EINVAL.
+ */
+static int ipxitf_ioctl(unsigned int cmd, void __user *arg)
+{
+	int rc = -EINVAL;
+	struct ifreq ifr;
+	int val;
+
+	switch (cmd) {
+	case SIOCSIFADDR: {
+		struct sockaddr_ipx *sipx;
+		struct ipx_interface_definition f;
+
+		rc = -EFAULT;
+		if (copy_from_user(&ifr, arg, sizeof(ifr)))
+			break;
+		sipx = (struct sockaddr_ipx *)&ifr.ifr_addr;
+		rc = -EINVAL;
+		if (sipx->sipx_family != AF_IPX)
+			break;
+		/* Translate the ifreq into an interface definition */
+		f.ipx_network = sipx->sipx_network;
+		memcpy(f.ipx_device, ifr.ifr_name,
+			sizeof(f.ipx_device));
+		memcpy(f.ipx_node, sipx->sipx_node, IPX_NODE_LEN);
+		f.ipx_dlink_type = sipx->sipx_type;
+		f.ipx_special = sipx->sipx_special;
+
+		if (sipx->sipx_action == IPX_DLTITF)
+			rc = ipxitf_delete(&f);
+		else
+			rc = ipxitf_create(&f);
+		break;
+	}
+	case SIOCGIFADDR: {
+		struct sockaddr_ipx *sipx;
+		struct ipx_interface *ipxif;
+		struct net_device *dev;
+
+		rc = -EFAULT;
+		if (copy_from_user(&ifr, arg, sizeof(ifr)))
+			break;
+		sipx = (struct sockaddr_ipx *)&ifr.ifr_addr;
+		dev  = __dev_get_by_name(&init_net, ifr.ifr_name);
+		rc   = -ENODEV;
+		if (!dev)
+			break;
+		ipxif = ipxitf_find_using_phys(dev,
+					   ipx_map_frame_type(sipx->sipx_type));
+		rc = -EADDRNOTAVAIL;
+		if (!ipxif)
+			break;
+
+		sipx->sipx_family	= AF_IPX;
+		sipx->sipx_network	= ipxif->if_netnum;
+		memcpy(sipx->sipx_node, ipxif->if_node,
+			sizeof(sipx->sipx_node));
+		/*
+		 * The lookup above took a reference on ipxif; drop it on
+		 * both the success and the copy-failure path so a faulting
+		 * user buffer cannot leak interface references.
+		 */
+		rc = 0;
+		if (copy_to_user(arg, &ifr, sizeof(ifr)))
+			rc = -EFAULT;
+		ipxitf_put(ipxif);
+		break;
+	}
+	case SIOCAIPXITFCRT:
+		rc = -EFAULT;
+		if (get_user(val, (unsigned char __user *) arg))
+			break;
+		rc = 0;
+		ipxcfg_auto_create_interfaces = val;
+		break;
+	case SIOCAIPXPRISLT:
+		rc = -EFAULT;
+		if (get_user(val, (unsigned char __user *) arg))
+			break;
+		rc = 0;
+		ipxcfg_set_auto_select(val);
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ *	Checksum routine for IPX
+ */
+
+/* Note: We assume ipx_tctrl==0 and htons(length)==ipx_pktsize */
+/* This function should *not* modify the packet contents */
+
+/*
+ * @packet: start of the IPX header; read as an array of 16-bit words.
+ * @length: packet length in bytes.  Must be at least 6, otherwise the
+ *          unsigned word count below wraps around.
+ *
+ * Returns the 16-bit checksum in network byte order.
+ */
+__be16 ipx_cksum(struct ipxhdr *packet, int length)
+{
+	/*
+	 *	NOTE: sum is a net byte order quantity, which optimizes the
+	 *	loop. This only works on big and little endian machines. (I
+	 *	don't know of a machine that isn't.)
+	 */
+	/* handle the first 3 words separately; checksum should be skipped
+	 * and ipx_tctrl masked out */
+	__u16 *p = (__u16 *)packet;
+	__u32 sum = p[1] + (p[2] & (__force u16)htons(0x00ff));
+	__u32 i = (length >> 1) - 3; /* Number of remaining complete words */
+
+	/* Loop through them */
+	p += 3;
+	while (i--)
+		sum += *p++;
+
+	/* Add on the last part word if it exists (odd ipx_pktsize) */
+	if (packet->ipx_pktsize & htons(1))
+		sum += (__force u16)htons(0xff00) & *p;
+
+	/* Do final fixup: fold the upper 16 bits into the lower 16 */
+	sum = (sum & 0xffff) + (sum >> 16);
+
+	/* It's a pity there's no concept of carry in C */
+	if (sum >= 0x10000)
+		sum++;
+
+	/*
+	 * Leave 0 alone; we don't want 0xffff here.  Note that we can't get
+	 * here with 0x10000, so this check is the same as ((__u16)sum)
+	 */
+	if (sum)
+		sum = ~sum;
+
+	/* Only the low 16 bits survive the cast */
+	return (__force __be16)sum;
+}
+
+/*
+ * Map a frame type (network byte order ETH_P_* value) to a printable name.
+ * Unknown values yield "None".  The returned strings are literals and must
+ * not be modified, hence the const-qualified local.
+ */
+const char *ipx_frame_name(__be16 frame)
+{
+	const char *rc = "None";
+
+	switch (ntohs(frame)) {
+	case ETH_P_IPX:		rc = "EtherII";	break;
+	case ETH_P_802_2:	rc = "802.2";	break;
+	case ETH_P_SNAP:	rc = "SNAP";	break;
+	case ETH_P_802_3:	rc = "802.3";	break;
+	case ETH_P_TR_802_2:	rc = "802.2TR";	break;
+	default:				break;
+	}
+
+	return rc;
+}
+
+/*
+ * Printable device name for an interface: "Internal" for the internal
+ * network, the net_device name when one is attached, else "Unknown".
+ */
+const char *ipx_device_name(struct ipx_interface *intrfc)
+{
+	if (intrfc->if_internal)
+		return "Internal";
+
+	if (intrfc->if_dev)
+		return intrfc->if_dev->name;
+
+	return "Unknown";
+}
+
+/* Handling for system calls applied via the various interfaces to an IPX
+ * socket object. */
+
+/*
+ * setsockopt(): the only supported option is SOL_IPX/IPX_TYPE, which
+ * stores an int-sized packet type on the socket.
+ *
+ * Checks are made in this order: option length (-EINVAL), readability of
+ * the user value (-EFAULT), then level/option name (-ENOPROTOOPT).
+ */
+static int ipx_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int value;
+	int rc;
+
+	lock_sock(sk);
+
+	if (optlen != sizeof(int)) {
+		rc = -EINVAL;
+	} else if (get_user(value, (unsigned int __user *)optval)) {
+		rc = -EFAULT;
+	} else if (level != SOL_IPX || optname != IPX_TYPE) {
+		rc = -ENOPROTOOPT;
+	} else {
+		ipx_sk(sk)->type = value;
+		rc = 0;
+	}
+
+	release_sock(sk);
+	return rc;
+}
+
+/*
+ * getsockopt(): the only supported option is SOL_IPX/IPX_TYPE.
+ *
+ * Copies at most sizeof(int) bytes of the socket's packet type to @optval
+ * and writes the number of bytes copied back through @optlen.
+ *
+ * Returns 0, -ENOPROTOOPT for any other option, -EINVAL for a negative
+ * user-supplied length, or -EFAULT on a bad user pointer.
+ */
+static int ipx_getsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int val = 0;
+	int len;
+	int rc = -ENOPROTOOPT;
+
+	lock_sock(sk);
+	if (!(level == SOL_IPX && optname == IPX_TYPE))
+		goto out;
+
+	val = ipx_sk(sk)->type;
+
+	rc = -EFAULT;
+	if (get_user(len, optlen))
+		goto out;
+
+	/*
+	 * Reject negative lengths *before* clamping: min_t() compares as
+	 * unsigned, so a negative value would otherwise be clamped to
+	 * sizeof(int) and the sign test could never fire.
+	 */
+	rc = -EINVAL;
+	if (len < 0)
+		goto out;
+	len = min_t(unsigned int, len, sizeof(int));
+
+	rc = -EFAULT;
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		goto out;
+
+	rc = 0;
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/*
+ * Protocol descriptor handed to sk_alloc() in ipx_create() and registered
+ * in ipx_init(); obj_size makes every socket large enough for ipx_sock.
+ */
+static struct proto ipx_proto = {
+	.name	  = "IPX",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct ipx_sock),
+};
+
+/*
+ * Create an IPX socket.  Only the initial network namespace and
+ * SOCK_DGRAM sockets are supported; @protocol and @kern are not examined.
+ *
+ * Returns 0, -EAFNOSUPPORT (foreign namespace), -ESOCKTNOSUPPORT (socket
+ * type other than SOCK_DGRAM) or -ENOMEM (sk_alloc() failed).
+ */
+static int ipx_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	/*
+	 * SPX support is no longer in the kernel sources.  Anyone wanting
+	 * to resurrect it (completing it, teaching it about shared skbs,
+	 * making it fully multithreaded, ...) should start from the sources
+	 * in an early 2.5 kernel tree.
+	 */
+	if (sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	sk_refcnt_debug_inc(sk);
+	sock_init_data(sock, sk);
+	sk->sk_no_check = 1;		/* checksumming is off by default */
+	sock->ops = &ipx_dgram_ops;
+
+	return 0;
+}
+
+/*
+ * Release an IPX socket.  A socket without an attached sock is a no-op.
+ * Otherwise the sock is marked dead (notifying waiters first if it was
+ * not dead already), detached from @sock, destroyed and its reference
+ * dropped.  Always returns 0.
+ */
+static int ipx_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk) {
+		lock_sock(sk);
+
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+
+		sock->sk = NULL;
+		sk_refcnt_debug_release(sk);
+		ipx_destroy_socket(sk);
+
+		release_sock(sk);
+		sock_put(sk);
+	}
+
+	return 0;
+}
+
+/*
+ * Pick an unused socket number from the ephemeral range
+ * [IPX_MIN_EPHEMERAL_SOCKET, IPX_MAX_EPHEMERAL_SOCKET] on @intrfc.
+ *
+ * The scan starts at the interface's last handed-out number
+ * (intrfc->if_sknum) and wraps around at the top of the range.  The
+ * cursor is read and updated under if_sklist_lock so that concurrent
+ * callers see a consistent value.
+ *
+ * Returns the socket number in network byte order, or 0 when every
+ * ephemeral number on the interface is in use (callers treat 0 as
+ * failure).
+ *
+ * Caller must hold a reference to intrfc.
+ */
+static __be16 ipx_first_free_socketnum(struct ipx_interface *intrfc)
+{
+	unsigned int tries = IPX_MAX_EPHEMERAL_SOCKET -
+			     IPX_MIN_EPHEMERAL_SOCKET + 1;
+	unsigned short socketNum;
+
+	spin_lock_bh(&intrfc->if_sklist_lock);
+
+	socketNum = intrfc->if_sknum;
+	if (socketNum < IPX_MIN_EPHEMERAL_SOCKET ||
+	    socketNum > IPX_MAX_EPHEMERAL_SOCKET)
+		socketNum = IPX_MIN_EPHEMERAL_SOCKET;
+
+	while (__ipxitf_find_socket(intrfc, htons(socketNum))) {
+		/* Every candidate tried once: the range is exhausted */
+		if (!--tries) {
+			socketNum = 0;
+			break;
+		}
+		/* Wrap *at* the maximum so we never step outside the range */
+		if (socketNum >= IPX_MAX_EPHEMERAL_SOCKET)
+			socketNum = IPX_MIN_EPHEMERAL_SOCKET;
+		else
+			socketNum++;
+	}
+
+	if (socketNum)
+		intrfc->if_sknum = socketNum;
+	spin_unlock_bh(&intrfc->if_sklist_lock);
+
+	return htons(socketNum);
+}
+
+/*
+ * Bind @sock to the interface serving addr->sipx_network and to
+ * addr->sipx_port (a free ephemeral port is chosen when the port is 0).
+ * The caller is expected to hold the socket lock (see ipx_bind()).
+ *
+ * Note that @uaddr is written to when a port is auto-selected.
+ *
+ * ipxs->port is only committed once every check has passed: the
+ * connect/sendmsg/recvmsg paths in this file use "!ipxs->port" to decide
+ * whether the socket still needs auto-binding, so a failed bind must not
+ * leave a port behind on an unbound socket.
+ *
+ * Returns 0, -EINVAL (already bound, bad address length, no free port,
+ * broadcast node on the internal net), -EADDRNOTAVAIL (no interface for
+ * the network), -EACCES (privileged port without CAP_NET_ADMIN) or
+ * -EADDRINUSE.
+ */
+static int __ipx_bind(struct socket *sock,
+			struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct ipx_sock *ipxs = ipx_sk(sk);
+	struct ipx_interface *intrfc;
+	struct sockaddr_ipx *addr = (struct sockaddr_ipx *)uaddr;
+	int rc = -EINVAL;
+
+	if (!sock_flag(sk, SOCK_ZAPPED) || addr_len != sizeof(struct sockaddr_ipx))
+		goto out;
+
+	intrfc = ipxitf_find_using_net(addr->sipx_network);
+	rc = -EADDRNOTAVAIL;
+	if (!intrfc)
+		goto out;
+
+	if (!addr->sipx_port) {
+		addr->sipx_port = ipx_first_free_socketnum(intrfc);
+		rc = -EINVAL;
+		if (!addr->sipx_port)
+			goto out_put;
+	}
+
+	/* protect IPX system stuff like routing/sap */
+	rc = -EACCES;
+	if (ntohs(addr->sipx_port) < IPX_MIN_EPHEMERAL_SOCKET &&
+	    !capable(CAP_NET_ADMIN))
+		goto out_put;
+
+#ifdef CONFIG_IPX_INTERN
+	if (intrfc == ipx_internal_net) {
+		/* The source address is to be set explicitly if the
+		 * socket is to be bound on the internal network. If a
+		 * node number 0 was specified, the default is used.
+		 */
+
+		rc = -EINVAL;
+		if (!memcmp(addr->sipx_node, ipx_broadcast_node, IPX_NODE_LEN))
+			goto out_put;
+		if (!memcmp(addr->sipx_node, ipx_this_node, IPX_NODE_LEN))
+			memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN);
+		else
+			memcpy(ipxs->node, addr->sipx_node, IPX_NODE_LEN);
+
+		rc = -EADDRINUSE;
+		if (ipxitf_find_internal_socket(intrfc, ipxs->node,
+						addr->sipx_port)) {
+			SOCK_DEBUG(sk,
+				"IPX: bind failed because port %X in use.\n",
+				ntohs(addr->sipx_port));
+			goto out_put;
+		}
+	} else {
+		/* Source addresses are easy. It must be our
+		 * network:node pair for an interface routed to IPX
+		 * with the ipx routing ioctl()
+		 */
+
+		memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN);
+
+		rc = -EADDRINUSE;
+		if (ipxitf_find_socket(intrfc, addr->sipx_port)) {
+			SOCK_DEBUG(sk,
+				"IPX: bind failed because port %X in use.\n",
+				ntohs(addr->sipx_port));
+			goto out_put;
+		}
+	}
+
+#else	/* !def CONFIG_IPX_INTERN */
+
+	/* Source addresses are easy. It must be our network:node pair for
+	   an interface routed to IPX with the ipx routing ioctl() */
+
+	rc = -EADDRINUSE;
+	if (ipxitf_find_socket(intrfc, addr->sipx_port)) {
+		SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n",
+				ntohs((int)addr->sipx_port));
+		goto out_put;
+	}
+
+#endif	/* CONFIG_IPX_INTERN */
+
+	/* All checks passed: commit the port, then publish the socket */
+	ipxs->port = addr->sipx_port;
+	ipxitf_insert_socket(intrfc, sk);
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	rc = 0;
+out_put:
+	ipxitf_put(intrfc);
+out:
+	return rc;
+}
+
+/*
+ * bind() entry point: runs __ipx_bind() with the socket lock held and
+ * returns its result unchanged.
+ */
+static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	rc = __ipx_bind(sock, uaddr, addr_len);
+	release_sock(sk);
+
+	return rc;
+}
+
+/*
+ * connect(): remember the peer address on a datagram socket.
+ *
+ * The socket is first reset to the unconnected state.  An unbound socket
+ * (port 0) is auto-bound to network 0 with an automatically chosen port.
+ * The destination must either have a route or be network 0 while a
+ * primary network exists.
+ *
+ * Returns 0, -EINVAL (wrong address length), -ENETDOWN (CONFIG_IPX_INTERN
+ * only: no interface on the socket), an __ipx_bind() error, or
+ * -ENETUNREACH.
+ */
+static int ipx_connect(struct socket *sock, struct sockaddr *uaddr,
+	int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct ipx_sock *ipxs = ipx_sk(sk);
+	struct sockaddr_ipx *peer;
+	struct ipx_route *route;
+	int rc = -EINVAL;
+
+	sk->sk_state	= TCP_CLOSE;
+	sock->state	= SS_UNCONNECTED;
+
+	lock_sock(sk);
+	if (addr_len != sizeof(*peer))
+		goto unlock;
+	peer = (struct sockaddr_ipx *)uaddr;
+
+	/* Auto-bind a socket that has no local port yet */
+	if (!ipxs->port) {
+		struct sockaddr_ipx any;
+
+		any.sipx_port		= 0;
+		any.sipx_network	= 0;
+
+#ifdef CONFIG_IPX_INTERN
+		rc = -ENETDOWN;
+		if (!ipxs->intrfc)
+			goto unlock; /* Someone zonked the iface */
+		memcpy(any.sipx_node, ipxs->intrfc->if_node,
+			IPX_NODE_LEN);
+#endif	/* CONFIG_IPX_INTERN */
+
+		rc = __ipx_bind(sock, (struct sockaddr *)&any,
+				sizeof(struct sockaddr_ipx));
+		if (rc)
+			goto unlock;
+	}
+
+	/* Reachable means: routed, or network 0 with a primary net */
+	route = ipxrtr_lookup(peer->sipx_network);
+	rc = -ENETUNREACH;
+	if (!route && (peer->sipx_network || !ipx_primary_net))
+		goto unlock;
+
+	ipxs->dest_addr.net  = peer->sipx_network;
+	ipxs->dest_addr.sock = peer->sipx_port;
+	memcpy(ipxs->dest_addr.node, peer->sipx_node, IPX_NODE_LEN);
+	ipxs->type = peer->sipx_type;
+
+	if (sock->type == SOCK_DGRAM) {
+		sock->state	= SS_CONNECTED;
+		sk->sk_state	= TCP_ESTABLISHED;
+	}
+
+	if (route)
+		ipxrtr_put(route);
+	rc = 0;
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+
+/*
+ * getsockname()/getpeername(): fill @uaddr with a sockaddr_ipx.
+ *
+ * @peer != 0 reports the connected destination and fails with -ENOTCONN
+ * unless the socket is in TCP_ESTABLISHED state; @peer == 0 reports the
+ * local address (all zeroes for network/node when no interface is
+ * attached).  *uaddr_len is set to sizeof(struct sockaddr_ipx) in every
+ * case, including on error.
+ */
+static int ipx_getname(struct socket *sock, struct sockaddr *uaddr,
+			int *uaddr_len, int peer)
+{
+	struct sock *sk = sock->sk;
+	struct ipx_sock *ipxs = ipx_sk(sk);
+	struct sockaddr_ipx reply;
+	int rc = 0;
+
+	*uaddr_len = sizeof(struct sockaddr_ipx);
+
+	lock_sock(sk);
+	if (peer) {
+		struct ipx_address *dst = &ipxs->dest_addr;
+
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			rc = -ENOTCONN;
+			goto unlock;
+		}
+
+		reply.sipx_network	= dst->net;
+		reply.sipx_port		= dst->sock;
+		memcpy(reply.sipx_node, dst->node, IPX_NODE_LEN);
+	} else {
+		if (!ipxs->intrfc) {
+			reply.sipx_network = 0;
+			memset(reply.sipx_node, '\0', IPX_NODE_LEN);
+		} else {
+			reply.sipx_network = ipxs->intrfc->if_netnum;
+#ifdef CONFIG_IPX_INTERN
+			memcpy(reply.sipx_node, ipxs->node, IPX_NODE_LEN);
+#else
+			memcpy(reply.sipx_node, ipxs->intrfc->if_node,
+				IPX_NODE_LEN);
+#endif	/* CONFIG_IPX_INTERN */
+		}
+
+		reply.sipx_port = ipxs->port;
+	}
+
+	reply.sipx_family = AF_IPX;
+	reply.sipx_type	  = ipxs->type;
+	reply.sipx_zero	  = 0;
+	memcpy(uaddr, &reply, sizeof(reply));
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+/*
+ * Packet receive handler (installed as .func of the IPX packet types and
+ * passed to the 802.2/SNAP client registration in ipx_init()).
+ *
+ * Validates the IPX header and optional checksum, finds (or, when
+ * auto-creation is enabled, creates) the interface for @dev and the frame
+ * type in pt->type, and hands the skb to ipxitf_rcv().  On every "drop"
+ * path the skb is freed here.
+ */
+static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	/* NULL here for pt means the packet was looped back */
+	/* NOTE(review): pt->type is dereferenced unconditionally below, so
+	 * a NULL pt would fault here - confirm whether any caller passes
+	 * NULL. */
+	struct ipx_interface *intrfc;
+	struct ipxhdr *ipx;
+	u16 ipx_pktsize;
+	int rc = 0;
+
+	/* Only the initial network namespace is supported */
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	/* Not ours */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	/* On failure skb is NULL here, so skip the kfree_skb() at drop */
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		goto out;
+
+	if (!pskb_may_pull(skb, sizeof(struct ipxhdr)))
+		goto drop;
+
+	ipx_pktsize = ntohs(ipx_hdr(skb)->ipx_pktsize);
+
+	/* Too small or invalid header? */
+	if (ipx_pktsize < sizeof(struct ipxhdr) ||
+	    !pskb_may_pull(skb, ipx_pktsize))
+		goto drop;
+
+	/* Re-read the header pointer after the pull above */
+	ipx = ipx_hdr(skb);
+	if (ipx->ipx_checksum != IPX_NO_CHECKSUM &&
+	   ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize))
+		goto drop;
+
+	/* Stash header fields in the skb control block */
+	IPX_SKB_CB(skb)->ipx_tctrl	= ipx->ipx_tctrl;
+	IPX_SKB_CB(skb)->ipx_dest_net	= ipx->ipx_dest.net;
+	IPX_SKB_CB(skb)->ipx_source_net = ipx->ipx_source.net;
+
+	/* Determine what local ipx endpoint this is */
+	intrfc = ipxitf_find_using_phys(dev, pt->type);
+	if (!intrfc) {
+		/* Auto-create only for packets with a non-zero dest net */
+		if (ipxcfg_auto_create_interfaces &&
+		   IPX_SKB_CB(skb)->ipx_dest_net) {
+			intrfc = ipxitf_auto_create(dev, pt->type);
+			/* Take our own reference for the put below */
+			if (intrfc)
+				ipxitf_hold(intrfc);
+		}
+
+		if (!intrfc)	/* Not one of ours */
+				/* or invalid packet for auto creation */
+			goto drop;
+	}
+
+	rc = ipxitf_rcv(intrfc, skb);
+	ipxitf_put(intrfc);
+	goto out;
+drop:
+	kfree_skb(skb);
+out:
+	return rc;
+}
+
+/*
+ * sendmsg(): send one datagram of @len bytes.
+ *
+ * With an explicit destination in msg->msg_name the socket is auto-bound
+ * if it has no port yet; without one the socket must be connected and the
+ * stored destination is used.  Only MSG_DONTWAIT and MSG_CMSG_COMPAT are
+ * accepted as flags.
+ *
+ * Returns @len on success or a negative errno.
+ */
+static int ipx_sendmsg(struct kiocb *iocb, struct socket *sock,
+	struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct ipx_sock *ipxs = ipx_sk(sk);
+	struct sockaddr_ipx *usipx = (struct sockaddr_ipx *)msg->msg_name;
+	struct sockaddr_ipx local_sipx;
+	int rc = -EINVAL;
+	int flags = msg->msg_flags;
+
+	lock_sock(sk);
+	/*
+	 * No "socket not bound" check here: an unbound socket is
+	 * auto-bound below when an explicit destination is supplied.
+	 */
+	if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+		goto out;
+
+	/* Max possible packet size limited by 16 bit pktsize in header */
+	if (len >= 65535 - sizeof(struct ipxhdr))
+		goto out;
+
+	if (usipx) {
+		/* Explicit destination: auto-bind first if needed */
+		if (!ipxs->port) {
+			struct sockaddr_ipx uaddr;
+
+			uaddr.sipx_port		= 0;
+			uaddr.sipx_network	= 0;
+#ifdef CONFIG_IPX_INTERN
+			rc = -ENETDOWN;
+			if (!ipxs->intrfc)
+				goto out; /* Someone zonked the iface */
+			memcpy(uaddr.sipx_node, ipxs->intrfc->if_node,
+				IPX_NODE_LEN);
+#endif
+			rc = __ipx_bind(sock, (struct sockaddr *)&uaddr,
+					sizeof(struct sockaddr_ipx));
+			if (rc)
+				goto out;
+		}
+
+		/* The address is validated only after the auto-bind */
+		rc = -EINVAL;
+		if (msg->msg_namelen < sizeof(*usipx) ||
+		    usipx->sipx_family != AF_IPX)
+			goto out;
+	} else {
+		/* No destination given: use the connected peer */
+		rc = -ENOTCONN;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+
+		usipx = &local_sipx;
+		usipx->sipx_family 	= AF_IPX;
+		usipx->sipx_type 	= ipxs->type;
+		usipx->sipx_port 	= ipxs->dest_addr.sock;
+		usipx->sipx_network 	= ipxs->dest_addr.net;
+		memcpy(usipx->sipx_node, ipxs->dest_addr.node, IPX_NODE_LEN);
+	}
+
+	rc = ipxrtr_route_packet(sk, usipx, msg->msg_iov, len,
+				 flags & MSG_DONTWAIT);
+	/* Any non-negative result is reported as "all of len sent" */
+	if (rc >= 0)
+		rc = len;
+out:
+	release_sock(sk);
+	return rc;
+}
+
+
+/*
+ * recvmsg(): receive one datagram.
+ *
+ * An unbound socket is auto-bound first.  The payload (everything after
+ * the IPX header) is copied to the caller; if it does not fit in @size it
+ * is truncated and MSG_TRUNC is set.  When msg->msg_name is supplied it is
+ * filled with the sender's address.
+ *
+ * The socket lock is only needed for the auto-bind and state check.  It is
+ * dropped before waiting for data so that a blocking receive does not
+ * prevent other operations on the same socket (notably ipx_sendmsg(),
+ * which takes the same lock) from making progress.
+ *
+ * Returns the number of bytes copied or a negative errno.
+ */
+static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct ipx_sock *ipxs = ipx_sk(sk);
+	struct sockaddr_ipx *sipx = (struct sockaddr_ipx *)msg->msg_name;
+	struct ipxhdr *ipx = NULL;
+	struct sk_buff *skb;
+	bool locked = true;
+	int copied, rc;
+
+	lock_sock(sk);
+	/* put the autobinding in */
+	if (!ipxs->port) {
+		struct sockaddr_ipx uaddr;
+
+		uaddr.sipx_port		= 0;
+		uaddr.sipx_network 	= 0;
+
+#ifdef CONFIG_IPX_INTERN
+		rc = -ENETDOWN;
+		if (!ipxs->intrfc)
+			goto out; /* Someone zonked the iface */
+		memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, IPX_NODE_LEN);
+#endif	/* CONFIG_IPX_INTERN */
+
+		rc = __ipx_bind(sock, (struct sockaddr *)&uaddr,
+			      sizeof(struct sockaddr_ipx));
+		if (rc)
+			goto out;
+	}
+
+	rc = -ENOTCONN;
+	if (sock_flag(sk, SOCK_ZAPPED))
+		goto out;
+
+	/* Never sleep waiting for a datagram with the socket lock held */
+	release_sock(sk);
+	locked = false;
+
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+				flags & MSG_DONTWAIT, &rc);
+	if (!skb)
+		goto out;
+
+	ipx 	= ipx_hdr(skb);
+	copied 	= ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr);
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov,
+				     copied);
+	if (rc)
+		goto out_free;
+	if (skb->tstamp.tv64)
+		sk->sk_stamp = skb->tstamp;
+
+	msg->msg_namelen = sizeof(*sipx);
+
+	if (sipx) {
+		sipx->sipx_family	= AF_IPX;
+		sipx->sipx_port		= ipx->ipx_source.sock;
+		memcpy(sipx->sipx_node, ipx->ipx_source.node, IPX_NODE_LEN);
+		sipx->sipx_network	= IPX_SKB_CB(skb)->ipx_source_net;
+		sipx->sipx_type 	= ipx->ipx_type;
+		sipx->sipx_zero		= 0;
+	}
+	rc = copied;
+
+out_free:
+	skb_free_datagram(sk, skb);
+out:
+	if (locked)
+		release_sock(sk);
+	return rc;
+}
+
+
+/*
+ * Socket ioctl dispatcher.  Runs with the socket lock held for the whole
+ * call.  Route, interface-configuration and NCP-connection commands
+ * require CAP_NET_ADMIN; SIOCGIFADDR and SIOCIPXCFGDATA do not.
+ * Commands not listed here return -ENOIOCTLCMD.
+ */
+static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int rc = 0;
+	long amount = 0;
+	struct sock *sk = sock->sk;
+	void __user *argp = (void __user *)arg;
+
+	lock_sock(sk);
+	switch (cmd) {
+	case TIOCOUTQ:
+		/* Free send-buffer space, clamped at zero */
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (amount < 0)
+			amount = 0;
+		rc = put_user(amount, (int __user *)argp);
+		break;
+	case TIOCINQ: {
+		/* Payload size of the first queued packet, 0 if none */
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+		/* These two are safe on a single CPU system as only
+		 * user tasks fiddle here */
+		if (skb)
+			amount = skb->len - sizeof(struct ipxhdr);
+		rc = put_user(amount, (int __user *)argp);
+		break;
+	}
+	case SIOCADDRT:
+	case SIOCDELRT:
+		rc = -EPERM;
+		if (capable(CAP_NET_ADMIN))
+			rc = ipxrtr_ioctl(cmd, argp);
+		break;
+	case SIOCSIFADDR:
+	case SIOCAIPXITFCRT:
+	case SIOCAIPXPRISLT:
+		rc = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		/* fall through - privileged caller, share the handler */
+	case SIOCGIFADDR:
+		rc = ipxitf_ioctl(cmd, argp);
+		break;
+	case SIOCIPXCFGDATA:
+		rc = ipxcfg_get_config_data(argp);
+		break;
+	case SIOCIPXNCPCONN:
+		/*
+		 * This socket wants to take care of the NCP connection
+		 * handed to us in arg.
+		 */
+		rc = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			break;
+		rc = get_user(ipx_sk(sk)->ipx_ncp_conn,
+			      (const unsigned short __user *)argp);
+		break;
+	case SIOCGSTAMP:
+		rc = -EINVAL;
+		/* sk was already passed to lock_sock() above */
+		if (sk)
+			rc = sock_get_timestamp(sk, argp);
+		break;
+	/* Interface address ioctls that are explicitly rejected */
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+		rc = -EINVAL;
+		break;
+	default:
+		rc = -ENOIOCTLCMD;
+		break;
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * 32-bit compat ioctl entry point.
+ *
+ * The four commands below live in the SIOCPROTOPRIVATE ..
+ * SIOCPROTOPRIVATE+3 range, so the generic compat code cannot translate
+ * them; their argument layout is identical on 32 and 64 bit, so they are
+ * forwarded to ipx_ioctl() as is.  Everything else is left to the generic
+ * code by returning -ENOIOCTLCMD.
+ */
+static int ipx_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == SIOCAIPXITFCRT || cmd == SIOCAIPXPRISLT ||
+	    cmd == SIOCIPXCFGDATA || cmd == SIOCIPXNCPCONN)
+		return ipx_ioctl(sock, cmd, arg);
+
+	return -ENOIOCTLCMD;
+}
+#endif
+
+
+/*
+ * Socket family declarations
+ */
+
+/* Address-family hook: PF_IPX sockets are created by ipx_create() */
+static const struct net_proto_family ipx_family_ops = {
+	.family		= PF_IPX,
+	.create		= ipx_create,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Socket operations for IPX datagram sockets (installed by ipx_create()).
+ * Operations IPX does not implement are wired to the sock_no_*() stubs.
+ */
+static const struct proto_ops ipx_dgram_ops = {
+	.family		= PF_IPX,
+	.owner		= THIS_MODULE,
+	.release	= ipx_release,
+	.bind		= ipx_bind,
+	.connect	= ipx_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= ipx_getname,
+	.poll		= datagram_poll,
+	.ioctl		= ipx_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ipx_compat_ioctl,
+#endif
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown, /* FIXME: support shutdown */
+	.setsockopt	= ipx_setsockopt,
+	.getsockopt	= ipx_getsockopt,
+	.sendmsg	= ipx_sendmsg,
+	.recvmsg	= ipx_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+/* Receive hook for 802.3 frames; added in ipx_init() */
+static struct packet_type ipx_8023_packet_type __read_mostly = {
+	.type		= cpu_to_be16(ETH_P_802_3),
+	.func		= ipx_rcv,
+};
+
+/* Receive hook for Ethernet II (ETH_P_IPX) frames; added in ipx_init() */
+static struct packet_type ipx_dix_packet_type __read_mostly = {
+	.type		= cpu_to_be16(ETH_P_IPX),
+	.func		= ipx_rcv,
+};
+
+/* Netdevice event notifier; events are handled by ipxitf_device_event() */
+static struct notifier_block ipx_dev_notifier = {
+	.notifier_call	= ipxitf_device_event,
+};
+
+/* Ethernet II datalink client constructor/destructor, defined elsewhere */
+extern struct datalink_proto *make_EII_client(void);
+extern void destroy_EII_client(struct datalink_proto *);
+
+/* Identifiers passed to the 802.2 and SNAP client registration calls */
+static const unsigned char ipx_8022_type = 0xE0;
+static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
+/* Registration failure messages printed by ipx_init() (init-only data) */
+static const char ipx_EII_err_msg[] __initconst =
+	KERN_CRIT "IPX: Unable to register with Ethernet II\n";
+static const char ipx_8023_err_msg[] __initconst =
+	KERN_CRIT "IPX: Unable to register with 802.3\n";
+static const char ipx_llc_err_msg[] __initconst =
+	KERN_CRIT "IPX: Unable to register with 802.2\n";
+static const char ipx_snap_err_msg[] __initconst =
+	KERN_CRIT "IPX: Unable to register with SNAP\n";
+
+/*
+ * Module initialisation: register the protocol and the PF_IPX socket
+ * family, then hook up the four frame types, the netdevice notifier,
+ * sysctl and /proc.
+ *
+ * Failure to register the protocol or the socket family aborts the load
+ * (the protocol registration is undone in the latter case).  Failure to
+ * register an individual frame type is only logged; that frame type is
+ * then simply unavailable.
+ */
+static int __init ipx_init(void)
+{
+	int rc = proto_register(&ipx_proto, 1);
+
+	if (rc != 0)
+		goto out;
+
+	/*
+	 * Without the family registered no IPX socket can be created, and
+	 * ipx_proto_finito() would later unregister a family we do not own.
+	 */
+	rc = sock_register(&ipx_family_ops);
+	if (rc != 0) {
+		proto_unregister(&ipx_proto);
+		goto out;
+	}
+
+	pEII_datalink = make_EII_client();
+	if (pEII_datalink)
+		dev_add_pack(&ipx_dix_packet_type);
+	else
+		printk(ipx_EII_err_msg);
+
+	p8023_datalink = make_8023_client();
+	if (p8023_datalink)
+		dev_add_pack(&ipx_8023_packet_type);
+	else
+		printk(ipx_8023_err_msg);
+
+	p8022_datalink = register_8022_client(ipx_8022_type, ipx_rcv);
+	if (!p8022_datalink)
+		printk(ipx_llc_err_msg);
+
+	pSNAP_datalink = register_snap_client(ipx_snap_id, ipx_rcv);
+	if (!pSNAP_datalink)
+		printk(ipx_snap_err_msg);
+
+	register_netdevice_notifier(&ipx_dev_notifier);
+	ipx_register_sysctl();
+	ipx_proc_init();
+out:
+	return rc;
+}
+
+/*
+ * Module exit: tear everything down in roughly the reverse order of
+ * ipx_init().
+ *
+ * ipx_init() only adds a packet handler when the matching datalink client
+ * was created, so the handlers are removed under the same condition
+ * instead of calling dev_remove_pack() on a handler that was never added.
+ */
+static void __exit ipx_proto_finito(void)
+{
+	ipx_proc_exit();
+	ipx_unregister_sysctl();
+
+	unregister_netdevice_notifier(&ipx_dev_notifier);
+
+	ipxitf_cleanup();
+
+	if (pSNAP_datalink) {
+		unregister_snap_client(pSNAP_datalink);
+		pSNAP_datalink = NULL;
+	}
+
+	if (p8022_datalink) {
+		unregister_8022_client(p8022_datalink);
+		p8022_datalink = NULL;
+	}
+
+	if (p8023_datalink) {
+		dev_remove_pack(&ipx_8023_packet_type);
+		destroy_8023_client(p8023_datalink);
+		p8023_datalink = NULL;
+	}
+
+	if (pEII_datalink) {
+		dev_remove_pack(&ipx_dix_packet_type);
+		destroy_EII_client(pEII_datalink);
+		pEII_datalink = NULL;
+	}
+
+	proto_unregister(&ipx_proto);
+	sock_unregister(ipx_family_ops.family);
+}
+
+module_init(ipx_init);
+module_exit(ipx_proto_finito);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IPX);
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
new file mode 100644
index 00000000..26b5bfcf
--- /dev/null
+++ b/net/ipx/ipx_proc.c
@@ -0,0 +1,339 @@
+/*
+ *	IPX proc routines
+ *
+ * 	Copyright(C) Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2002
+ */
+
+#include <linux/init.h>
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/tcp_states.h>
+#include <net/ipx.h>
+
+/*
+ * seq_file iterator over ipx_interfaces.  ->start takes
+ * ipx_interfaces_lock and ->stop releases it, so the list is walked with
+ * the lock held.  The list head itself is returned as the first element;
+ * ipx_seq_interface_show() prints the column header for it.
+ */
+static void *ipx_seq_interface_start(struct seq_file *seq, loff_t *pos)
+{
+	spin_lock_bh(&ipx_interfaces_lock);
+	return seq_list_start_head(&ipx_interfaces, *pos);
+}
+
+/* Advance to the next list element (NULL at the end of the list) */
+static void *ipx_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &ipx_interfaces, pos);
+}
+
+/* Drop the lock taken in ->start */
+static void ipx_seq_interface_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_bh(&ipx_interfaces_lock);
+}
+
+/*
+ * Emit one line of the interface table: the column header when @v is the
+ * list head, otherwise network number, node address, primary flag, device
+ * name and frame type of the interface (plus its reference count when
+ * IPX_REFCNT_DEBUG is defined).  Always returns 0.
+ */
+static int ipx_seq_interface_show(struct seq_file *seq, void *v)
+{
+	struct ipx_interface *itf;
+
+	if (v == &ipx_interfaces) {
+		seq_puts(seq, "Network    Node_Address   Primary  Device     "
+			      "Frame_Type");
+#ifdef IPX_REFCNT_DEBUG
+		seq_puts(seq, "  refcnt");
+#endif
+		seq_puts(seq, "\n");
+		return 0;
+	}
+
+	itf = list_entry(v, struct ipx_interface, node);
+
+	seq_printf(seq, "%08lX   ", (unsigned long int)ntohl(itf->if_netnum));
+	seq_printf(seq, "%02X%02X%02X%02X%02X%02X   ",
+			itf->if_node[0], itf->if_node[1], itf->if_node[2],
+			itf->if_node[3], itf->if_node[4], itf->if_node[5]);
+	seq_printf(seq, "%-9s", itf == ipx_primary_net ? "Yes" : "No");
+	seq_printf(seq, "%-11s", ipx_device_name(itf));
+	seq_printf(seq, "%-9s", ipx_frame_name(itf->if_dlink_type));
+#ifdef IPX_REFCNT_DEBUG
+	seq_printf(seq, "%6d", atomic_read(&itf->refcnt));
+#endif
+	seq_puts(seq, "\n");
+
+	return 0;
+}
+
+/*
+ * seq_file iterator over ipx_routes.  ->start takes ipx_routes_lock for
+ * reading and ->stop releases it.  The list head is returned as the first
+ * element; ipx_seq_route_show() prints the column header for it.
+ */
+static void *ipx_seq_route_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_bh(&ipx_routes_lock);
+	return seq_list_start_head(&ipx_routes, *pos);
+}
+
+/* Advance to the next list element (NULL at the end of the list) */
+static void *ipx_seq_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &ipx_routes, pos);
+}
+
+/* Drop the read lock taken in ->start */
+static void ipx_seq_route_stop(struct seq_file *seq, void *v)
+{
+	read_unlock_bh(&ipx_routes_lock);
+}
+
+/*
+ * Emit one line of the routing table: the column header when @v is the
+ * list head, otherwise the destination network followed by either the
+ * router's network and node or a "directly connected" marker.
+ * Always returns 0.
+ */
+static int ipx_seq_route_show(struct seq_file *seq, void *v)
+{
+	struct ipx_route *route;
+
+	if (v == &ipx_routes) {
+		seq_puts(seq, "Network    Router_Net   Router_Node\n");
+		return 0;
+	}
+
+	route = list_entry(v, struct ipx_route, node);
+	seq_printf(seq, "%08lX   ", (unsigned long int)ntohl(route->ir_net));
+
+	if (!route->ir_routed) {
+		seq_puts(seq, "Directly     Connected\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08lX     %02X%02X%02X%02X%02X%02X\n",
+		   (long unsigned int)ntohl(route->ir_intrfc->if_netnum),
+		   route->ir_router_node[0], route->ir_router_node[1],
+		   route->ir_router_node[2], route->ir_router_node[3],
+		   route->ir_router_node[4], route->ir_router_node[5]);
+
+	return 0;
+}
+
+/*
+ * Return the @pos-th socket (0-based) counting across the socket lists of
+ * all interfaces in list order, or NULL when there are fewer sockets.
+ *
+ * When @pos is used up exactly at the end of one interface's list, the
+ * search carries on with the first socket of the next non-empty interface
+ * rather than giving up, so an index that falls on an interface boundary
+ * still resolves.
+ *
+ * NOTE(review): the socket is returned after its interface's
+ * if_sklist_lock has been dropped again, whereas ipx_seq_socket_next()
+ * unlocks that lock when it leaves an interface - confirm the intended
+ * locking protocol for the seq iterator.
+ */
+static __inline__ struct sock *ipx_get_socket_idx(loff_t pos)
+{
+	struct sock *s = NULL;
+	struct hlist_node *node;
+	struct ipx_interface *i;
+
+	list_for_each_entry(i, &ipx_interfaces, node) {
+		spin_lock_bh(&i->if_sklist_lock);
+		sk_for_each(s, node, &i->if_sklist) {
+			if (!pos)
+				break;
+			--pos;
+		}
+		spin_unlock_bh(&i->if_sklist_lock);
+
+		/*
+		 * node is non-NULL only if the walk above stopped early,
+		 * i.e. s is the socket we were asked for.
+		 */
+		if (node)
+			return s;
+	}
+
+	return NULL;
+}
+
+/*
+ * ->start of the socket table iterator.  Takes ipx_interfaces_lock, then
+ * returns SEQ_START_TOKEN for position 0 (header line) or the socket at
+ * index *pos - 1 otherwise.
+ */
+static void *ipx_seq_socket_start(struct seq_file *seq, loff_t *pos)
+{
+	loff_t l = *pos;
+
+	spin_lock_bh(&ipx_interfaces_lock);
+	return l ? ipx_get_socket_idx(--l) : SEQ_START_TOKEN;
+}
+
+/*
+ * ->next of the socket table iterator: step to the following socket,
+ * moving on to later interfaces when the current interface's list is
+ * exhausted.  Returns NULL at the end.
+ *
+ * While a socket is being returned, the if_sklist_lock of the interface
+ * it belongs to is kept locked; it is released here when the iteration
+ * leaves that interface.
+ */
+static void *ipx_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock* sk, *next;
+	struct ipx_interface *i;
+	struct ipx_sock *ipxs;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN) {
+		/*
+		 * First real element.  NOTE(review): only the interface
+		 * returned by ipx_interfaces_head() is examined here; if it
+		 * has no sockets the iteration ends even though later
+		 * interfaces may have some - confirm this is intended.
+		 */
+		sk = NULL;
+		i = ipx_interfaces_head();
+		if (!i)
+			goto out;
+		sk = sk_head(&i->if_sklist);
+		if (sk)
+			spin_lock_bh(&i->if_sklist_lock);
+		goto out;
+	}
+	sk = v;
+	next = sk_next(sk);
+	if (next) {
+		/* Still on the same interface; its lock stays held */
+		sk = next;
+		goto out;
+	}
+	/* End of this interface's list: release it and look further */
+	ipxs = ipx_sk(sk);
+	i = ipxs->intrfc;
+	spin_unlock_bh(&i->if_sklist_lock);
+	sk = NULL;
+	for (;;) {
+		if (i->node.next == &ipx_interfaces)
+			break;
+		i = list_entry(i->node.next, struct ipx_interface, node);
+		spin_lock_bh(&i->if_sklist_lock);
+		if (!hlist_empty(&i->if_sklist)) {
+			/* Leave with this interface's lock held */
+			sk = sk_head(&i->if_sklist);
+			break;
+		}
+		spin_unlock_bh(&i->if_sklist_lock);
+	}
+out:
+	return sk;
+}
+
+/*
+ * Emit one line of the socket table: the column header for
+ * SEQ_START_TOKEN, otherwise local address, remote address (or
+ * "Not_Connected"), queue usage, state and owning uid of the socket.
+ * With CONFIG_IPX_INTERN the local address also carries the node.
+ * Always returns 0.
+ */
+static int ipx_seq_socket_show(struct seq_file *seq, void *v)
+{
+	struct ipx_sock *ipxs;
+	struct sock *sk;
+
+	if (v == SEQ_START_TOKEN) {
+#ifdef CONFIG_IPX_INTERN
+		seq_puts(seq, "Local_Address               "
+			      "Remote_Address              Tx_Queue  "
+			      "Rx_Queue  State  Uid\n");
+#else
+		seq_puts(seq, "Local_Address  Remote_Address              "
+			      "Tx_Queue  Rx_Queue  State  Uid\n");
+#endif
+		return 0;
+	}
+
+	sk = v;
+	ipxs = ipx_sk(sk);
+
+	/* Local address */
+#ifdef CONFIG_IPX_INTERN
+	seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X  ",
+		   (unsigned long)ntohl(ipxs->intrfc->if_netnum),
+		   ipxs->node[0], ipxs->node[1], ipxs->node[2], ipxs->node[3],
+		   ipxs->node[4], ipxs->node[5], ntohs(ipxs->port));
+#else
+	seq_printf(seq, "%08lX:%04X  ", (unsigned long) ntohl(ipxs->intrfc->if_netnum),
+		   ntohs(ipxs->port));
+#endif	/* CONFIG_IPX_INTERN */
+
+	/* Remote address */
+	if (sk->sk_state == TCP_ESTABLISHED) {
+		seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X  ",
+			   (unsigned long)ntohl(ipxs->dest_addr.net),
+			   ipxs->dest_addr.node[0], ipxs->dest_addr.node[1],
+			   ipxs->dest_addr.node[2], ipxs->dest_addr.node[3],
+			   ipxs->dest_addr.node[4], ipxs->dest_addr.node[5],
+			   ntohs(ipxs->dest_addr.sock));
+	} else {
+		seq_printf(seq, "%-28s", "Not_Connected");
+	}
+
+	/* Queues, state, uid */
+	seq_printf(seq, "%08X  %08X  %02X     %03d\n",
+		   sk_wmem_alloc_get(sk),
+		   sk_rmem_alloc_get(sk),
+		   sk->sk_state, SOCK_INODE(sk->sk_socket)->i_uid);
+
+	return 0;
+}
+
+/* Iterator for the "interface" proc file */
+static const struct seq_operations ipx_seq_interface_ops = {
+	.start  = ipx_seq_interface_start,
+	.next   = ipx_seq_interface_next,
+	.stop   = ipx_seq_interface_stop,
+	.show   = ipx_seq_interface_show,
+};
+
+/* Iterator for the "route" proc file */
+static const struct seq_operations ipx_seq_route_ops = {
+	.start  = ipx_seq_route_start,
+	.next   = ipx_seq_route_next,
+	.stop   = ipx_seq_route_stop,
+	.show   = ipx_seq_route_show,
+};
+
+/*
+ * Iterator for the "socket" proc file.  ->stop is shared with the
+ * interface iterator: both ->start routines take ipx_interfaces_lock.
+ */
+static const struct seq_operations ipx_seq_socket_ops = {
+	.start  = ipx_seq_socket_start,
+	.next   = ipx_seq_socket_next,
+	.stop   = ipx_seq_interface_stop,
+	.show   = ipx_seq_socket_show,
+};
+
+/* open() for the "route" proc file: attach the route iterator */
+static int ipx_seq_route_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ipx_seq_route_ops);
+}
+
+/* open() for the "interface" proc file: attach the interface iterator */
+static int ipx_seq_interface_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ipx_seq_interface_ops);
+}
+
+/* open() for the "socket" proc file: attach the socket iterator */
+static int ipx_seq_socket_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ipx_seq_socket_ops);
+}
+
+/*
+ * File operations for the three proc files.  They differ only in ->open;
+ * reading, seeking and release are the generic seq_file helpers.
+ */
+static const struct file_operations ipx_seq_interface_fops = {
+	.owner		= THIS_MODULE,
+	.open           = ipx_seq_interface_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static const struct file_operations ipx_seq_route_fops = {
+	.owner		= THIS_MODULE,
+	.open           = ipx_seq_route_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static const struct file_operations ipx_seq_socket_fops = {
+	.owner		= THIS_MODULE,
+	.open           = ipx_seq_socket_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static struct proc_dir_entry *ipx_proc_dir;
+
+/*
+ * Create the "ipx" proc directory under init_net.proc_net with its three
+ * read-only entries "interface", "route" and "socket".
+ *
+ * Returns 0 on success.  On any failure everything created so far is
+ * removed again and -ENOMEM is returned.
+ */
+int __init ipx_proc_init(void)
+{
+	ipx_proc_dir = proc_mkdir("ipx", init_net.proc_net);
+	if (!ipx_proc_dir)
+		return -ENOMEM;
+
+	if (!proc_create("interface", S_IRUGO,
+			 ipx_proc_dir, &ipx_seq_interface_fops))
+		goto undo_dir;
+
+	if (!proc_create("route", S_IRUGO, ipx_proc_dir, &ipx_seq_route_fops))
+		goto undo_interface;
+
+	if (!proc_create("socket", S_IRUGO, ipx_proc_dir, &ipx_seq_socket_fops))
+		goto undo_route;
+
+	return 0;
+
+undo_route:
+	remove_proc_entry("route", ipx_proc_dir);
+undo_interface:
+	remove_proc_entry("interface", ipx_proc_dir);
+undo_dir:
+	remove_proc_entry("ipx", init_net.proc_net);
+	return -ENOMEM;
+}
+
+/* Remove the three proc entries, then the "ipx" directory itself */
+void __exit ipx_proc_exit(void)
+{
+	remove_proc_entry("interface", ipx_proc_dir);
+	remove_proc_entry("route", ipx_proc_dir);
+	remove_proc_entry("socket", ipx_proc_dir);
+	remove_proc_entry("ipx", init_net.proc_net);
+}
+
+#else /* CONFIG_PROC_FS */
+
+/* Without CONFIG_PROC_FS there is nothing to create: report success */
+int __init ipx_proc_init(void)
+{
+	return 0;
+}
+
+/* Without CONFIG_PROC_FS there is nothing to remove */
+void __exit ipx_proc_exit(void)
+{
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c
new file mode 100644
index 00000000..30f4519b
--- /dev/null
+++ b/net/ipx/ipx_route.c
@@ -0,0 +1,295 @@
+/*
+ *	Implements the IPX routing routines.
+ *	Code moved from af_ipx.c.
+ *
+ *	Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2003
+ *
+ *	See net/ipx/ChangeLog.
+ */
+
+#include <linux/list.h>
+#include <linux/route.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <net/ipx.h>
+#include <net/sock.h>
+
+/* List of known IPX routes; list walks and list updates take ipx_routes_lock */
+LIST_HEAD(ipx_routes);
+DEFINE_RWLOCK(ipx_routes_lock);
+
+/* Compared against in ipxrtr_add_route() to refuse replacing a route */
+extern struct ipx_interface *ipx_internal_net;
+
+/* Helpers implemented outside this file (each declared exactly once) */
+extern __be16 ipx_cksum(struct ipxhdr *packet, int length);
+extern struct ipx_interface *ipxitf_find_using_net(__be32 net);
+extern int ipxitf_demux_socket(struct ipx_interface *intrfc,
+			       struct sk_buff *skb, int copy);
+extern int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb,
+		       char *node);
+
+/*
+ * Find the route whose destination network is @net.
+ *
+ * Returns the route with an extra reference taken via ipxrtr_hold(), or
+ * NULL when no route matches.
+ */
+struct ipx_route *ipxrtr_lookup(__be32 net)
+{
+	struct ipx_route *rt;
+	struct ipx_route *match = NULL;
+
+	read_lock_bh(&ipx_routes_lock);
+	list_for_each_entry(rt, &ipx_routes, node) {
+		if (rt->ir_net != net)
+			continue;
+		/* Take the reference while the list lock is still held */
+		ipxrtr_hold(rt);
+		match = rt;
+		break;
+	}
+	read_unlock_bh(&ipx_routes_lock);
+
+	return match;
+}
+
+/* Store the destination network, interface and next-hop node in @rt. */
+static void ipxrtr_fill(struct ipx_route *rt, __be32 network,
+			struct ipx_interface *intrfc, unsigned char *node)
+{
+	rt->ir_net	= network;
+	rt->ir_intrfc	= intrfc;
+	if (!node) {
+		/* No router node: the network is directly connected */
+		memset(rt->ir_router_node, '\0', IPX_NODE_LEN);
+		rt->ir_routed = 0;
+	} else {
+		memcpy(rt->ir_router_node, node, IPX_NODE_LEN);
+		rt->ir_routed = 1;
+	}
+}
+
+/*
+ * Add a route to @network through @intrfc, or update the existing one.
+ * A NULL @node marks the route as directly connected, otherwise @node is
+ * the router to forward through.
+ *
+ * Returns 0 on success, -EAGAIN when no memory is available for a new
+ * route, and -EEXIST when a route already exists and @intrfc is the
+ * internal network.
+ *
+ * Caller must hold a reference to intrfc
+ */
+int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc,
+		     unsigned char *node)
+{
+	struct ipx_route *rt;
+	int rc;
+
+	/* Get a route structure; either existing or create */
+	rt = ipxrtr_lookup(network);
+	if (!rt) {
+		rt = kmalloc(sizeof(*rt), GFP_ATOMIC);
+		rc = -EAGAIN;
+		if (!rt)
+			goto out;
+
+		/* One reference stays with the list, one is dropped below */
+		atomic_set(&rt->refcnt, 1);
+		ipxrtr_hold(rt);
+
+		/*
+		 * Fill in every field before the route becomes reachable
+		 * through ipx_routes, so that a concurrent lookup never
+		 * sees uninitialised memory.
+		 */
+		ipxrtr_fill(rt, network, intrfc, node);
+
+		write_lock_bh(&ipx_routes_lock);
+		list_add(&rt->node, &ipx_routes);
+		write_unlock_bh(&ipx_routes_lock);
+	} else {
+		rc = -EEXIST;
+		if (intrfc == ipx_internal_net)
+			goto out_put;
+
+		ipxrtr_fill(rt, network, intrfc, node);
+	}
+
+	rc = 0;
+out_put:
+	ipxrtr_put(rt);
+out:
+	return rc;
+}
+
+/*
+ * Unlink every route that uses @intrfc and drop one reference on each.
+ */
+void ipxrtr_del_routes(struct ipx_interface *intrfc)
+{
+	struct ipx_route *rt, *next;
+
+	write_lock_bh(&ipx_routes_lock);
+	list_for_each_entry_safe(rt, next, &ipx_routes, node) {
+		if (rt->ir_intrfc != intrfc)
+			continue;
+
+		list_del(&rt->node);
+		ipxrtr_put(rt);
+	}
+	write_unlock_bh(&ipx_routes_lock);
+}
+
+/*
+ * Add the route described by @rd.  The outgoing interface is the one
+ * attached to the router's own network.
+ *
+ * Returns -ENETUNREACH when no interface is on that network, otherwise the
+ * result of ipxrtr_add_route().
+ */
+static int ipxrtr_create(struct ipx_route_definition *rd)
+{
+	struct ipx_interface *gw_if;
+	int err;
+
+	/* Find the appropriate interface */
+	gw_if = ipxitf_find_using_net(rd->ipx_router_network);
+	if (!gw_if)
+		return -ENETUNREACH;
+
+	err = ipxrtr_add_route(rd->ipx_network, gw_if, rd->ipx_router_node);
+	ipxitf_put(gw_if);
+
+	return err;
+}
+
+/*
+ * Delete the route to network @net.
+ *
+ * Returns 0 when the route was removed, -EPERM when the matching route is
+ * not a routed one, and -ENOENT when there is no route to @net.
+ */
+static int ipxrtr_delete(__be32 net)
+{
+	struct ipx_route *rt, *next;
+	int rc = -ENOENT;
+
+	write_lock_bh(&ipx_routes_lock);
+	list_for_each_entry_safe(rt, next, &ipx_routes, node) {
+		if (rt->ir_net != net)
+			continue;
+
+		if (!rt->ir_routed) {
+			/* Directly connected; can't lose route */
+			rc = -EPERM;
+		} else {
+			list_del(&rt->node);
+			ipxrtr_put(rt);
+			rc = 0;
+		}
+		/* Only the first match is considered */
+		break;
+	}
+	write_unlock_bh(&ipx_routes_lock);
+
+	return rc;
+}
+
+/*
+ * Forward @skb according to the destination network stored in its control
+ * block.  With no matching route the skb is freed.  Always returns 0.
+ *
+ * The skb has to be unshared, we'll end up calling ipxitf_send, that'll
+ * modify the packet
+ */
+int ipxrtr_route_skb(struct sk_buff *skb)
+{
+	struct ipx_route *rt = ipxrtr_lookup(IPX_SKB_CB(skb)->ipx_dest_net);
+	struct ipxhdr *hdr;
+	unsigned char *next_hop;
+
+	if (!rt) {	/* no known route */
+		kfree_skb(skb);
+		return 0;
+	}
+
+	/* Routed: hand the frame to the router, else to the final node */
+	hdr = ipx_hdr(skb);
+	if (rt->ir_routed)
+		next_hop = rt->ir_router_node;
+	else
+		next_hop = hdr->ipx_dest.node;
+
+	ipxitf_hold(rt->ir_intrfc);
+	ipxitf_send(rt->ir_intrfc, skb, next_hop);
+	ipxitf_put(rt->ir_intrfc);
+	ipxrtr_put(rt);
+
+	return 0;
+}
+
+/*
+ * Route an outgoing frame from a socket.
+ *
+ * @sk:      sending socket
+ * @usipx:   destination address; a zero sipx_network is overwritten with
+ *           the primary network number when a primary network exists
+ * @iov:     payload to copy into the frame
+ * @len:     payload length in bytes
+ * @noblock: passed through to sock_alloc_send_skb()
+ *
+ * Picks the outgoing interface, builds an IPX header in a new skb, copies
+ * the payload in and hands the frame to ipxitf_send().
+ *
+ * Returns the result of ipxitf_send(), -ENETUNREACH when there is no route
+ * to the destination network, the non-zero result of memcpy_fromiovec()
+ * when the payload copy fails, or whatever sock_alloc_send_skb() stored in
+ * rc when the allocation fails.
+ */
+int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
+			struct iovec *iov, size_t len, int noblock)
+{
+	struct sk_buff *skb;
+	struct ipx_sock *ipxs = ipx_sk(sk);
+	struct ipx_interface *intrfc;
+	struct ipxhdr *ipx;
+	size_t size;
+	int ipx_offset;
+	struct ipx_route *rt = NULL;
+	int rc;
+
+	/* Find the appropriate interface on which to send packet */
+	if (!usipx->sipx_network && ipx_primary_net) {
+		/* Network 0 is replaced by the primary network; no route used */
+		usipx->sipx_network = ipx_primary_net->if_netnum;
+		intrfc = ipx_primary_net;
+	} else {
+		/* rt carries a reference from here on; dropped at out_put */
+		rt = ipxrtr_lookup(usipx->sipx_network);
+		rc = -ENETUNREACH;
+		if (!rt)
+			goto out;
+		intrfc = rt->ir_intrfc;
+	}
+
+	ipxitf_hold(intrfc);
+	ipx_offset = intrfc->if_ipx_offset;
+	size = sizeof(struct ipxhdr) + len + ipx_offset;
+
+	/* On failure rc is expected to be set through the &rc argument */
+	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+	if (!skb)
+		goto out_put;
+
+	/* Leave ipx_offset bytes of headroom in front of the IPX header */
+	skb_reserve(skb, ipx_offset);
+	skb->sk = sk;
+
+	/* Fill in IPX header */
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_put(skb, sizeof(struct ipxhdr));
+	ipx = ipx_hdr(skb);
+	ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr));
+	IPX_SKB_CB(skb)->ipx_tctrl = 0;
+	ipx->ipx_type 	 = usipx->sipx_type;
+
+	IPX_SKB_CB(skb)->last_hop.index = -1;
+#ifdef CONFIG_IPX_INTERN
+	/* Source is the socket's own interface network and node */
+	IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum;
+	memcpy(ipx->ipx_source.node, ipxs->node, IPX_NODE_LEN);
+#else
+	/* rc is only scratch space here; it is reassigned before being returned */
+	rc = ntohs(ipxs->port);
+	if (rc == 0x453 || rc == 0x452) {
+		/* RIP/SAP special handling for mars_nwe */
+		IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum;
+		memcpy(ipx->ipx_source.node, intrfc->if_node, IPX_NODE_LEN);
+	} else {
+		IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum;
+		memcpy(ipx->ipx_source.node, ipxs->intrfc->if_node,
+			IPX_NODE_LEN);
+	}
+#endif	/* CONFIG_IPX_INTERN */
+	ipx->ipx_source.sock		= ipxs->port;
+	IPX_SKB_CB(skb)->ipx_dest_net	= usipx->sipx_network;
+	memcpy(ipx->ipx_dest.node, usipx->sipx_node, IPX_NODE_LEN);
+	ipx->ipx_dest.sock		= usipx->sipx_port;
+
+	/* Append the payload; on a failed copy the skb is freed right here */
+	rc = memcpy_fromiovec(skb_put(skb, len), iov, len);
+	if (rc) {
+		kfree_skb(skb);
+		goto out_put;
+	}
+
+	/* Apply checksum. Not allowed on 802.3 links. */
+	if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023))
+		ipx->ipx_checksum = htons(0xFFFF);
+	else
+		ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr));
+
+	/* Next hop is the router node for routed routes, else the final node */
+	rc = ipxitf_send(intrfc, skb, (rt && rt->ir_routed) ?
+			 rt->ir_router_node : ipx->ipx_dest.node);
+out_put:
+	ipxitf_put(intrfc);
+	if (rt)
+		ipxrtr_put(rt);
+out:
+	return rc;
+}
+
+/*
+ * Handle the SIOCADDRT and SIOCDELRT route ioctls.
+ *
+ * We use a normal struct rtentry for route handling
+ *
+ * Returns -EFAULT when @arg cannot be copied in, -EINVAL when the entry is
+ * not a gateway route, an address is not AF_IPX or @cmd is not one of the
+ * two commands above, and otherwise the result of the add or delete.
+ */
+int ipxrtr_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct rtentry rt;	/* Use these to behave like 'other' stacks */
+	struct sockaddr_ipx *gateway, *target;
+
+	if (copy_from_user(&rt, arg, sizeof(rt)))
+		return -EFAULT;
+
+	gateway = (struct sockaddr_ipx *)&rt.rt_gateway;
+	target = (struct sockaddr_ipx *)&rt.rt_dst;
+
+	/* Direct routes are fixed */
+	if (!(rt.rt_flags & RTF_GATEWAY))
+		return -EINVAL;
+	if (gateway->sipx_family != AF_IPX || target->sipx_family != AF_IPX)
+		return -EINVAL;
+
+	if (cmd == SIOCDELRT)
+		return ipxrtr_delete(target->sipx_network);
+
+	if (cmd == SIOCADDRT) {
+		struct ipx_route_definition def;
+
+		def.ipx_network		= target->sipx_network;
+		def.ipx_router_network	= gateway->sipx_network;
+		memcpy(def.ipx_router_node, gateway->sipx_node, IPX_NODE_LEN);
+		return ipxrtr_create(&def);
+	}
+
+	/* Any other command */
+	return -EINVAL;
+}
diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c
new file mode 100644
index 00000000..bd6dca00
--- /dev/null
+++ b/net/ipx/sysctl_net_ipx.c
@@ -0,0 +1,46 @@
+/* -*- linux-c -*-
+ * sysctl_net_ipx.c: sysctl interface to net IPX subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipx directory entry (empty =) ). [MS]
+ * Added /proc/sys/net/ipx/ipx_pprop_broadcasting - acme March 4, 2001
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+/* From af_ipx.c */
+extern int sysctl_ipx_pprop_broadcasting;
+
+/*
+ * sysctl entries of the IPX subsystem.  The single entry exposes the
+ * integer sysctl_ipx_pprop_broadcasting through proc_dointvec with mode
+ * 0644.  The empty element terminates the table.
+ */
+static struct ctl_table ipx_table[] = {
+	{
+		.procname	= "ipx_pprop_broadcasting",
+		.data		= &sysctl_ipx_pprop_broadcasting,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ },
+};
+
+/* Path components "net" / "ipx" under which ipx_table is registered */
+static struct ctl_path ipx_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ipx", },
+	{ }
+};
+
+/* Handle returned by register_sysctl_paths(); needed to unregister again */
+static struct ctl_table_header *ipx_table_header;
+
+/*
+ * Register ipx_table under ipx_path and remember the returned header.
+ * The result is not checked here.
+ */
+void ipx_register_sysctl(void)
+{
+	ipx_table_header = register_sysctl_paths(ipx_path, ipx_table);
+}
+
+/* Unregister the table registered by ipx_register_sysctl(). */
+void ipx_unregister_sysctl(void)
+{
+	unregister_sysctl_table(ipx_table_header);
+}
diff --git a/net/irda/Kconfig b/net/irda/Kconfig
new file mode 100644
index 00000000..c8671a7f
--- /dev/null
+++ b/net/irda/Kconfig
@@ -0,0 +1,96 @@
+#
+# IrDA protocol configuration
+#
+
+menuconfig IRDA
+	depends on NET && !S390
+	tristate "IrDA (infrared) subsystem support"
+	select CRC_CCITT
+	---help---
+	  Say Y here if you want to build support for the IrDA (TM) protocols.
+	  The Infrared Data Association (tm) specifies standards for wireless
+	  infrared communication and is supported by most laptops and PDA's.
+
+	  To use Linux support for the IrDA (tm) protocols, you will also need
+	  some user-space utilities like irattach.  For more information, see
+	  the file <file:Documentation/networking/irda.txt>.  You also want to
+	  read the IR-HOWTO, available at
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  If you want to exchange bits of data (vCal, vCard) with a PDA, you
+	  will need to install some OBEX application, such as OpenObex :
+	  <http://sourceforge.net/projects/openobex/>
+
+	  To compile this support as a module, choose M here: the module will
+	  be called irda.
+
+comment "IrDA protocols"
+	depends on IRDA
+
+source "net/irda/irlan/Kconfig"
+
+source "net/irda/irnet/Kconfig"
+
+source "net/irda/ircomm/Kconfig"
+
+config IRDA_ULTRA
+	bool "Ultra (connectionless) protocol"
+	depends on IRDA
+	help
+	  Say Y here to support the connectionless Ultra IRDA protocol.
+	  Ultra allows exchanging data over IrDA with really simple devices
+	  (watch, beacon) without the overhead of the IrDA protocol (no handshaking,
+	  no management frames, simple fixed header).
+	  Ultra is available as a special socket : socket(AF_IRDA, SOCK_DGRAM, 1);
+
+comment "IrDA options"
+	depends on IRDA
+
+config IRDA_CACHE_LAST_LSAP
+	bool "Cache last LSAP"
+	depends on IRDA
+	help
+	  Say Y here if you want IrLMP to cache the last LSAP used.  This
+	  makes sense since most frames will be sent/received on the same
+	  connection.  Enabling this option will save a hash-lookup per frame.
+
+	  If unsure, say Y.
+
+config IRDA_FAST_RR
+	bool "Fast RRs (low latency)"
+	depends on IRDA
+	---help---
+	  Say Y here if you want IrLAP to send fast RR (Receive Ready) frames
+	  when acting as a primary station.
+	  Disabling this option will make latency over IrDA very bad. Enabling
+	  this option will make the IrDA stack send more packets than strictly
+	  necessary, thus reducing your battery life (but not that much).
+
+	  Fast RR will make IrLAP send out a RR frame immediately when
+	  receiving a frame if its own transmit queue is currently empty. This
+	  will give a lot of speed improvement when receiving much data since
+	  the secondary station will not have to wait the max. turn around
+	  time (usually 500ms) before it is allowed to transmit the next time.
+	  If the transmit queue of the secondary is also empty, the primary will
+	  start backing-off before sending another RR frame, waiting longer
+	  each time until the back-off reaches the max. turn around time.
+	  This back-off increase is controlled via
+	  /proc/sys/net/irda/fast_poll_increase
+
+	  If unsure, say Y.
+
+config IRDA_DEBUG
+	bool "Debug information"
+	depends on IRDA
+	help
+	  Say Y here if you want the IrDA subsystem to write debug information
+	  to your syslog. You can change the debug level in
+	  /proc/sys/net/irda/debug .
+	  When this option is enabled, the IrDA stack also performs many extra
+	  internal verifications which will usually prevent the kernel from
+	  crashing in case of bugs.
+
+	  If unsure, say Y (since it makes it easier to find the bugs).
+
+source "drivers/net/irda/Kconfig"
+
diff --git a/net/irda/Makefile b/net/irda/Makefile
new file mode 100644
index 00000000..187f6c56
--- /dev/null
+++ b/net/irda/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Linux IrDA protocol layer.
+#
+
+obj-$(CONFIG_IRDA) += irda.o
+obj-$(CONFIG_IRLAN) += irlan/
+obj-$(CONFIG_IRNET) += irnet/
+obj-$(CONFIG_IRCOMM) += ircomm/
+
+irda-y := iriap.o iriap_event.o irlmp.o irlmp_event.o irlmp_frame.o \
+          irlap.o irlap_event.o irlap_frame.o timer.o qos.o irqueue.o \
+          irttp.o irda_device.o irias_object.o wrapper.o af_irda.o \
+	  discovery.o parameters.o irnetlink.o irmod.o
+irda-$(CONFIG_PROC_FS) += irproc.o
+irda-$(CONFIG_SYSCTL) += irsysctl.o
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
new file mode 100644
index 00000000..cc616974
--- /dev/null
+++ b/net/irda/af_irda.c
@@ -0,0 +1,2746 @@
+/*********************************************************************
+ *
+ * Filename:      af_irda.c
+ * Version:       0.9
+ * Description:   IrDA sockets implementation
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun May 31 10:12:43 1998
+ * Modified at:   Sat Dec 25 21:10:23 1999
+ * Modified by:   Dag Brattli <dag@brattli.net>
+ * Sources:       af_netroom.c, af_ax25.c, af_rose.c, af_x25.c etc.
+ *
+ *     Copyright (c) 1999 Dag Brattli <dagb@cs.uit.no>
+ *     Copyright (c) 1999-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ *     Linux-IrDA now supports four different types of IrDA sockets:
+ *
+ *     o SOCK_STREAM:    TinyTP connections with SAR disabled. The
+ *                       max SDU size is 0 for conn. of this type
+ *     o SOCK_SEQPACKET: TinyTP connections with SAR enabled. TTP may
+ *                       fragment the messages, but will preserve
+ *                       the message boundaries
+ *     o SOCK_DGRAM:     IRDAPROTO_UNITDATA: TinyTP connections with Unitdata
+ *                       (unreliable) transfers
+ *                       IRDAPROTO_ULTRA: Connectionless and unreliable data
+ *
+ ********************************************************************/
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/irda.h>
+#include <linux/poll.h>
+
+#include <asm/ioctls.h>		/* TIOCOUTQ, TIOCINQ */
+#include <asm/uaccess.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <net/irda/af_irda.h>
+
+/* Forward declaration of the socket creation handler */
+static int irda_create(struct net *net, struct socket *sock, int protocol, int kern);
+
+/* Forward declarations of the per-socket-type operation tables */
+static const struct proto_ops irda_stream_ops;
+static const struct proto_ops irda_seqpacket_ops;
+static const struct proto_ops irda_dgram_ops;
+
+/* Only built when the Ultra (connectionless) protocol is configured */
+#ifdef CONFIG_IRDA_ULTRA
+static const struct proto_ops irda_ultra_ops;
+#define ULTRA_MAX_DATA 382
+#endif /* CONFIG_IRDA_ULTRA */
+
+/* Header room reserved in front of frames handed to TinyTP */
+#define IRDA_MAX_HEADER (TTP_MAX_HEADER)
+
+/*
+ * Function irda_data_indication (instance, sap, skb)
+ *
+ *    Data arrived from TinyTP: put it on the socket receive queue.
+ *
+ * Returns 0 when the skb was queued.  Otherwise rx_flow is set to
+ * FLOW_STOP and the error from sock_queue_rcv_skb() is returned.
+ */
+static int irda_data_indication(void *instance, void *sap, struct sk_buff *skb)
+{
+	struct irda_sock *self = instance;
+	struct sock *sk = instance;
+	int rc;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	rc = sock_queue_rcv_skb(sk, skb);
+	if (!rc)
+		return 0;
+
+	IRDA_DEBUG(1, "%s(), error: no more mem!\n", __func__);
+	self->rx_flow = FLOW_STOP;
+
+	/* When we return error, TTP will need to requeue the skb */
+	return rc;
+}
+
+/*
+ * Function irda_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    The connection was closed.  Any skb passed in is freed.  If the socket
+ *    is neither dead nor already closed, it is moved to TCP_CLOSE, sending
+ *    is shut down, waiters are notified and the TSAP is closed.
+ *
+ */
+static void irda_disconnect_indication(void *instance, void *sap,
+				       LM_REASON reason, struct sk_buff *skb)
+{
+	struct irda_sock *self = instance;
+	struct sock *sk = instance;
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	/* Don't care about it, but let's not leak it */
+	if (skb)
+		dev_kfree_skb(skb);
+
+	if (sk == NULL) {
+		IRDA_DEBUG(0, "%s(%p) : BUG : sk is NULL\n",
+			   __func__, self);
+		return;
+	}
+
+	/* Prevent race conditions with irda_release() and irda_shutdown() */
+	bh_lock_sock(sk);
+
+	/* Nothing left to do on a dead or already closed socket */
+	if (sock_flag(sk, SOCK_DEAD) || sk->sk_state == TCP_CLOSE)
+		goto unlock;
+
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+
+	sk->sk_state_change(sk);
+
+	/* Close our TSAP.
+	 * If we leave it open, IrLMP put it back into the list of
+	 * unconnected LSAPs. The problem is that any incoming request
+	 * can then be matched to this socket (and it will be, because
+	 * it is at the head of the list). This would prevent any
+	 * listening socket waiting on the same TSAP to get those
+	 * requests. Some apps forget to close sockets, or hang to it
+	 * a bit too long, so we may stay in this dead state long
+	 * enough to be noticed...
+	 * Note : all socket function do check sk->sk_state, so we are
+	 * safe...
+	 * Jean II
+	 */
+	if (self->tsap) {
+		irttp_close_tsap(self->tsap);
+		self->tsap = NULL;
+	}
+
+unlock:
+	bh_unlock_sock(sk);
+
+	/* Note : once we are there, there is not much you want to do
+	 * with the socket anymore, apart from closing it.
+	 * For example, bind() and connect() won't reset sk->sk_err,
+	 * sk->sk_shutdown and sk->sk_flags to valid values...
+	 * Jean II
+	 */
+}
+
+/*
+ * Function irda_connect_confirm (instance, sap, qos, max_sdu_size, skb)
+ *
+ *    The connection has been confirmed by the remote device.  Records the
+ *    negotiated sizes and QoS, then marks the socket TCP_ESTABLISHED.
+ *
+ * The skb is always freed.  A max_sdu_size that does not fit the socket
+ * type (non-zero for SOCK_STREAM, zero for SOCK_SEQPACKET) is logged and
+ * the socket state is left unchanged.
+ */
+static void irda_connect_confirm(void *instance, void *sap,
+				 struct qos_info *qos,
+				 __u32 max_sdu_size, __u8 max_header_size,
+				 struct sk_buff *skb)
+{
+	struct irda_sock *self = instance;
+	struct sock *sk = instance;
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	/* The frame is not queued anywhere; release it on every path */
+	dev_kfree_skb(skb);
+	if (sk == NULL)
+		return;
+
+	/* How much header space do we need to reserve */
+	self->max_header_size = max_header_size;
+
+	/* IrTTP max SDU size in transmit direction */
+	self->max_sdu_size_tx = max_sdu_size;
+
+	/* Find out what the largest chunk of data that we can transmit is */
+	if (sk->sk_type == SOCK_STREAM) {
+		if (max_sdu_size != 0) {
+			IRDA_ERROR("%s: max_sdu_size must be 0\n", __func__);
+			return;
+		}
+		self->max_data_size = irttp_get_max_seg_size(self->tsap);
+	} else if (sk->sk_type == SOCK_SEQPACKET) {
+		if (max_sdu_size == 0) {
+			IRDA_ERROR("%s: max_sdu_size cannot be 0\n", __func__);
+			return;
+		}
+		self->max_data_size = max_sdu_size;
+	} else {
+		self->max_data_size = irttp_get_max_seg_size(self->tsap);
+	}
+
+	IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __func__,
+		   self->max_data_size);
+
+	memcpy(&self->qos_tx, qos, sizeof(struct qos_info));
+
+	/* We are now connected! */
+	sk->sk_state = TCP_ESTABLISHED;
+	sk->sk_state_change(sk);
+}
+
+/*
+ * Function irda_connect_indication(instance, sap, qos, max_sdu_size, userdata)
+ *
+ *    Incoming connection.  Records the negotiated sizes and QoS, queues the
+ *    skb on the socket receive queue and signals a state change.
+ *
+ * The skb is freed instead of queued when the instance is NULL or when
+ * max_sdu_size does not fit the socket type (non-zero for SOCK_STREAM,
+ * zero for SOCK_SEQPACKET).
+ */
+static void irda_connect_indication(void *instance, void *sap,
+				    struct qos_info *qos, __u32 max_sdu_size,
+				    __u8 max_header_size, struct sk_buff *skb)
+{
+	struct irda_sock *self = instance;
+	struct sock *sk = instance;
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	if (sk == NULL) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	/* How much header space do we need to reserve */
+	self->max_header_size = max_header_size;
+
+	/* IrTTP max SDU size in transmit direction */
+	self->max_sdu_size_tx = max_sdu_size;
+
+	/* Find out what the largest chunk of data that we can transmit is */
+	if (sk->sk_type == SOCK_STREAM) {
+		if (max_sdu_size != 0) {
+			IRDA_ERROR("%s: max_sdu_size must be 0\n", __func__);
+			kfree_skb(skb);
+			return;
+		}
+		self->max_data_size = irttp_get_max_seg_size(self->tsap);
+	} else if (sk->sk_type == SOCK_SEQPACKET) {
+		if (max_sdu_size == 0) {
+			IRDA_ERROR("%s: max_sdu_size cannot be 0\n", __func__);
+			kfree_skb(skb);
+			return;
+		}
+		self->max_data_size = max_sdu_size;
+	} else {
+		self->max_data_size = irttp_get_max_seg_size(self->tsap);
+	}
+
+	IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __func__,
+		   self->max_data_size);
+
+	memcpy(&self->qos_tx, qos, sizeof(struct qos_info));
+
+	/* Park the request on the receive queue and notify waiters */
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_state_change(sk);
+}
+
+/*
+ * Function irda_connect_response (handle)
+ *
+ *    Accept an incoming connection by sending a TinyTP connect response.
+ *    When no skb can be allocated the function only logs and returns.
+ *
+ */
+static void irda_connect_response(struct irda_sock *self)
+{
+	struct sk_buff *resp;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	resp = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER, GFP_ATOMIC);
+	if (!resp) {
+		IRDA_DEBUG(0, "%s() Unable to allocate sk_buff!\n",
+			   __func__);
+		return;
+	}
+
+	/* Reserve space for MUX_CONTROL and LAP header */
+	skb_reserve(resp, IRDA_MAX_HEADER);
+
+	irttp_connect_response(self->tsap, self->max_sdu_size_rx, resp);
+}
+
+/*
+ * Function irda_flow_indication (instance, sap, flow)
+ *
+ *    Used by TinyTP to tell us if it can accept more data or not.
+ *    FLOW_STOP and FLOW_START are recorded in tx_flow; FLOW_START also
+ *    wakes up anybody sleeping on the socket.  Any other value is treated
+ *    as FLOW_STOP.
+ *
+ */
+static void irda_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
+{
+	struct irda_sock *self;
+	struct sock *sk;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	self = instance;
+	sk = instance;
+	BUG_ON(sk == NULL);
+
+	switch (flow) {
+	case FLOW_STOP:
+		IRDA_DEBUG(1, "%s(), IrTTP wants us to slow down\n",
+			   __func__);
+		self->tx_flow = flow;
+		break;
+	case FLOW_START:
+		self->tx_flow = flow;
+		IRDA_DEBUG(1, "%s(), IrTTP wants us to start again\n",
+			   __func__);
+		wake_up_interruptible(sk_sleep(sk));
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown flow command!\n", __func__);
+		/*
+		 * Unknown flow command, better stop.  Store FLOW_STOP itself
+		 * rather than the unknown value, so that code comparing
+		 * tx_flow against FLOW_STOP really sees a stopped flow.
+		 */
+		self->tx_flow = FLOW_STOP;
+		break;
+	}
+}
+
+/*
+ * Function irda_getvalue_confirm (obj_id, value, priv)
+ *
+ *    Answer from the remote LM-IAS.  Closes the IAP handle, stores the
+ *    outcome in the socket and wakes up whoever waits on query_wait.
+ *
+ * On success the value is stored in ias_result and errno is cleared; the
+ * woken caller must delete the value.  On failure errno receives the IAS
+ * result code and ias_result is left untouched.
+ */
+static void irda_getvalue_confirm(int result, __u16 obj_id,
+				  struct ias_value *value, void *priv)
+{
+	struct irda_sock *self = (struct irda_sock *) priv;
+
+	if (self == NULL) {
+		IRDA_WARNING("%s: lost myself!\n", __func__);
+		return;
+	}
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	/* We probably don't need to make any more queries */
+	iriap_close(self->iriap);
+	self->iriap = NULL;
+
+	if (result == IAS_SUCCESS) {
+		/* Pass the object to the caller (so the caller must delete it) */
+		self->ias_result = value;
+		self->errno = 0;
+	} else {
+		IRDA_DEBUG(1, "%s(), IAS query failed! (%d)\n", __func__,
+			   result);
+
+		self->errno = result;	/* We really need it later */
+	}
+
+	/* Wake up any processes waiting for result */
+	wake_up_interruptible(&self->query_wait);
+}
+
+/*
+ * Function irda_selective_discovery_indication (discovery)
+ *
+ *    Selective discovery indication from IrLMP.
+ *
+ * Stores the address of the discovered node in cachedaddr and wakes up
+ * any process sleeping on query_wait.
+ */
+static void irda_selective_discovery_indication(discinfo_t *discovery,
+						DISCOVERY_MODE mode,
+						void *priv)
+{
+	struct irda_sock *self = (struct irda_sock *) priv;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	if (self == NULL) {
+		IRDA_WARNING("%s: lost myself!\n", __func__);
+		return;
+	}
+
+	/* Pass parameter to the caller */
+	self->cachedaddr = discovery->daddr;
+
+	/* Wake up process if its waiting for device to be discovered */
+	wake_up_interruptible(&self->query_wait);
+}
+
+/*
+ * Function irda_discovery_timeout (priv)
+ *
+ *    Timeout in the selective discovery process
+ *
+ * Nothing was discovered in time: clear the cached results, record
+ * -ETIME in errno and wake up the waiter.
+ */
+static void irda_discovery_timeout(u_long priv)
+{
+	struct irda_sock *self = (struct irda_sock *) priv;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	BUG_ON(self == NULL);
+
+	/* Nothing for the caller */
+	self->errno = -ETIME;
+	self->cachedaddr = 0;
+	self->cachelog = NULL;
+
+	/* Wake up process if its still waiting... */
+	wake_up_interruptible(&self->query_wait);
+}
+
+/*
+ * Function irda_open_tsap (self)
+ *
+ *    Open local Transport Service Access Point (TSAP)
+ *
+ * @tsap_sel is the selector to request and @name the label stored in the
+ * notify structure (truncated if it does not fit).
+ *
+ * Returns 0 on success, -EBUSY when the socket already has a TSAP and
+ * -ENOMEM when irttp_open_tsap() fails.
+ */
+static int irda_open_tsap(struct irda_sock *self, __u8 tsap_sel, char *name)
+{
+	notify_t notify;
+
+	if (self->tsap) {
+		IRDA_WARNING("%s: busy!\n", __func__);
+		return -EBUSY;
+	}
+
+	/* Initialize callbacks to be used by the IrDA stack */
+	irda_notify_init(&notify);
+	notify.connect_confirm       = irda_connect_confirm;
+	notify.connect_indication    = irda_connect_indication;
+	notify.disconnect_indication = irda_disconnect_indication;
+	notify.data_indication       = irda_data_indication;
+	notify.udata_indication	     = irda_data_indication;
+	notify.flow_indication       = irda_flow_indication;
+	notify.instance = self;
+
+	/*
+	 * strncpy() leaves the destination unterminated when the source
+	 * fills it completely, so copy one byte less and terminate by hand.
+	 */
+	strncpy(notify.name, name, NOTIFY_MAX_NAME - 1);
+	notify.name[NOTIFY_MAX_NAME - 1] = '\0';
+
+	self->tsap = irttp_open_tsap(tsap_sel, DEFAULT_INITIAL_CREDIT,
+				     &notify);
+	if (self->tsap == NULL) {
+		IRDA_DEBUG(0, "%s(), Unable to allocate TSAP!\n",
+			   __func__);
+		return -ENOMEM;
+	}
+	/* Remember which TSAP selector we actually got */
+	self->stsap_sel = self->tsap->stsap_sel;
+
+	return 0;
+}
+
+/*
+ * Function irda_open_lsap (self)
+ *
+ *    Open local Link Service Access Point (LSAP). Used for opening Ultra
+ *    sockets
+ */
+#ifdef CONFIG_IRDA_ULTRA
+/*
+ * Returns 0 on success, -EBUSY when the socket already has an LSAP and
+ * -ENOMEM when irlmp_open_lsap() fails.
+ */
+static int irda_open_lsap(struct irda_sock *self, int pid)
+{
+	notify_t notify;
+
+	if (self->lsap != NULL) {
+		IRDA_WARNING("%s(), busy!\n", __func__);
+		return -EBUSY;
+	}
+
+	/* Initialize callbacks to be used by the IrDA stack */
+	irda_notify_init(&notify);
+	notify.instance = self;
+	notify.udata_indication	= irda_data_indication;
+	strncpy(notify.name, "Ultra", NOTIFY_MAX_NAME);
+
+	self->lsap = irlmp_open_lsap(LSAP_CONNLESS, &notify, pid);
+	if (self->lsap != NULL)
+		return 0;
+
+	IRDA_DEBUG( 0, "%s(), Unable to allocate LSAP!\n", __func__);
+	return -ENOMEM;
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irda_find_lsap_sel (self, name)
+ *
+ *    Try to lookup LSAP selector in remote LM-IAS
+ *
+ * Basically, we start a IAP query, and then go to sleep. When the query
+ * return, irda_getvalue_confirm will wake us up, and we can examine the
+ * result of the query...
+ * Note that in some case, the query fail even before we go to sleep,
+ * creating some races...
+ *
+ * Returns 0 with self->dtsap_sel set when a selector was found, -EBUSY
+ * when a query is already running, -ENOMEM when no IAP handle could be
+ * opened, -EADDRNOTAVAIL when the remote has no usable selector for @name
+ * and -EHOSTUNREACH for signals and all other query failures.
+ */
+static int irda_find_lsap_sel(struct irda_sock *self, char *name)
+{
+	IRDA_DEBUG(2, "%s(%p, %s)\n", __func__, self, name);
+
+	if (self->iriap) {
+		IRDA_WARNING("%s(): busy with a previous query\n",
+			     __func__);
+		return -EBUSY;
+	}
+
+	self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+				 irda_getvalue_confirm);
+	if(self->iriap == NULL)
+		return -ENOMEM;
+
+	/* Treat unexpected wakeup as disconnect */
+	self->errno = -EHOSTUNREACH;
+
+	/* Query remote LM-IAS */
+	iriap_getvaluebyclass_request(self->iriap, self->saddr, self->daddr,
+				      name, "IrDA:TinyTP:LsapSel");
+
+	/* Wait for answer, if not yet finished (or failed) */
+	if (wait_event_interruptible(self->query_wait, (self->iriap==NULL)))
+		/* Treat signals as disconnect */
+		return -EHOSTUNREACH;
+
+	/* Check what happened */
+	if (self->errno)
+	{
+		/* Requested object/attribute doesn't exist */
+		if((self->errno == IAS_CLASS_UNKNOWN) ||
+		   (self->errno == IAS_ATTRIB_UNKNOWN))
+			return -EADDRNOTAVAIL;
+		else
+			return -EHOSTUNREACH;
+	}
+
+	/*
+	 * Get the remote TSAP selector.  The confirm callback stores the
+	 * value pointer as it received it, so check it before looking at
+	 * the type; a missing value is handled like "no selector".
+	 */
+	self->dtsap_sel = 0;
+	if (self->ias_result) {
+		switch (self->ias_result->type) {
+		case IAS_INTEGER:
+			IRDA_DEBUG(4, "%s() int=%d\n",
+				   __func__, self->ias_result->t.integer);
+
+			if (self->ias_result->t.integer != -1)
+				self->dtsap_sel = self->ias_result->t.integer;
+			break;
+		default:
+			IRDA_DEBUG(0, "%s(), bad type!\n", __func__);
+			break;
+		}
+		irias_delete_value(self->ias_result);
+		/* Do not keep a pointer to the value we just released */
+		self->ias_result = NULL;
+	}
+
+	if (self->dtsap_sel)
+		return 0;
+
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Function irda_discover_daddr_and_lsap_sel (self, name)
+ *
+ *    This tries to find a device with the requested service.
+ *
+ * It basically looks into the discovery log. For each address in the list,
+ * it queries the LM-IAS of the device to find out if this device offers
+ * the requested service.
+ * If there is more than one node supporting the service, we complain
+ * to the user (it should move devices around).
+ * Then, we set both the destination address and the lsap selector to point
+ * on the service on the unique device we have found.
+ *
+ * Note : this function fails if there is more than one device in range,
+ * because IrLMP doesn't disconnect the LAP when the last LSAP is closed.
+ * Moreover, we would need to wait the LAP disconnection...
+ *
+ * Returns 0 with self->daddr and self->dtsap_sel set on success,
+ * -ENETUNREACH when the discovery log is empty, -ENOTUNIQ when two devices
+ * offer the service, -EADDRNOTAVAIL when none does and -EHOSTUNREACH when
+ * an IAS query fails for any other reason.
+ */
+static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
+{
+	discinfo_t *discoveries;	/* Copy of the discovery log */
+	int	number;			/* Number of nodes in the log */
+	int	i;
+	int	err = -ENETUNREACH;
+	__u32	daddr = DEV_ADDR_ANY;	/* Address we found the service on */
+	__u8	dtsap_sel = 0x0;	/* TSAP associated with it */
+
+	IRDA_DEBUG(2, "%s(), name=%s\n", __func__, name);
+
+	/* Ask lmp for the current discovery log
+	 * Note : we have to use irlmp_get_discoveries(), as opposed
+	 * to play with the cachelog directly, because while we are
+	 * making our ias query, the log might change... */
+	discoveries = irlmp_get_discoveries(&number, self->mask.word,
+					    self->nslots);
+	/* Check if we got some results */
+	if (discoveries == NULL)
+		return -ENETUNREACH;	/* No nodes discovered */
+
+	/*
+	 * Now, check all discovered devices (if any), and connect
+	 * client only about the services that the client is
+	 * interested in...
+	 */
+	for(i = 0; i < number; i++) {
+		/* Try the address in the log */
+		self->daddr = discoveries[i].daddr;
+		self->saddr = 0x0;
+		IRDA_DEBUG(1, "%s(), trying daddr = %08x\n",
+			   __func__, self->daddr);
+
+		/* Query remote LM-IAS for this service */
+		err = irda_find_lsap_sel(self, name);
+		switch (err) {
+		case 0:
+			/* We found the requested service */
+			if(daddr != DEV_ADDR_ANY) {
+				IRDA_DEBUG(1, "%s(), discovered service ''%s'' in two different devices !!!\n",
+					   __func__, name);
+				self->daddr = DEV_ADDR_ANY;
+				kfree(discoveries);
+				return -ENOTUNIQ;
+			}
+			/* First time we found that one, save it ! */
+			daddr = self->daddr;
+			dtsap_sel = self->dtsap_sel;
+			break;
+		case -EADDRNOTAVAIL:
+			/* Requested service simply doesn't exist on this node */
+			break;
+		default:
+			/* Something bad did happen :-( */
+			IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__);
+			self->daddr = DEV_ADDR_ANY;
+			kfree(discoveries);
+			return -EHOSTUNREACH;
+			break;
+		}
+	}
+	/* Cleanup our copy of the discovery log */
+	kfree(discoveries);
+
+	/* Check out what we found */
+	if(daddr == DEV_ADDR_ANY) {
+		IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n",
+			   __func__, name);
+		self->daddr = DEV_ADDR_ANY;
+		return -EADDRNOTAVAIL;
+	}
+
+	/* Revert back to discovered device & service */
+	self->daddr = daddr;
+	self->saddr = 0x0;
+	self->dtsap_sel = dtsap_sel;
+
+	IRDA_DEBUG(1, "%s(), discovered requested service ''%s'' at address %08x\n",
+		   __func__, name, self->daddr);
+
+	return 0;
+}
+
+/*
+ * Function irda_getname (sock, uaddr, uaddr_len, peer)
+ *
+ *    Report our own socket address, or the peer's one when peer is
+ *    non-zero, as a struct sockaddr_irda.
+ *
+ *    Asking for the peer address of a socket that is not in the
+ *    TCP_ESTABLISHED state fails with -ENOTCONN and leaves uaddr and
+ *    uaddr_len untouched. Otherwise *uaddr_len is set to
+ *    sizeof(struct sockaddr_irda) and 0 is returned.
+ */
+static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
+			int *uaddr_len, int peer)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+	struct sockaddr_irda saddr;
+
+	/* A peer address is only available on a connected socket */
+	if (peer && sk->sk_state != TCP_ESTABLISHED)
+		return -ENOTCONN;
+
+	/* Clear the whole structure, fields we do not set stay zero */
+	memset(&saddr, 0, sizeof(saddr));
+	saddr.sir_family = AF_IRDA;
+	saddr.sir_lsap_sel = peer ? self->dtsap_sel : self->stsap_sel;
+	saddr.sir_addr = peer ? self->daddr : self->saddr;
+
+	IRDA_DEBUG(1, "%s(), tsap_sel = %#x\n", __func__, saddr.sir_lsap_sel);
+	IRDA_DEBUG(1, "%s(), addr = %08x\n", __func__, saddr.sir_addr);
+
+	/* uaddr_len reaches us uninitialised, so we always set it */
+	*uaddr_len = sizeof (struct sockaddr_irda);
+	memcpy(uaddr, &saddr, *uaddr_len);
+
+	return 0;
+}
+
+/*
+ * Function irda_listen (sock, backlog)
+ *
+ *    Move the socket to the listen state and record the backlog.
+ *
+ *    Returns 0 on success and -EOPNOTSUPP when the socket type is not
+ *    STREAM, SEQPACKET or DGRAM, or when the socket is already in the
+ *    TCP_LISTEN state.
+ */
+static int irda_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = -EOPNOTSUPP;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	lock_sock(sk);
+
+	switch (sk->sk_type) {
+	case SOCK_STREAM:
+	case SOCK_SEQPACKET:
+	case SOCK_DGRAM:
+		/* Only a socket that is not listening yet can be switched */
+		if (sk->sk_state != TCP_LISTEN) {
+			sk->sk_max_ack_backlog = backlog;
+			sk->sk_state           = TCP_LISTEN;
+			err = 0;
+		}
+		break;
+	default:
+		/* Unsupported socket type, err is already -EOPNOTSUPP */
+		break;
+	}
+
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * Function irda_bind (sock, uaddr, addr_len)
+ *
+ *    Used by servers to register their well known TSAP
+ *
+ *    For regular sockets this creates an IAS object named after
+ *    addr->sir_name, opens a local TSAP and publishes the TSAP selector
+ *    in LM-IAS. For Ultra sockets (CONFIG_IRDA_ULTRA) it opens a
+ *    connectionless LSAP for the requested PID and marks the socket as
+ *    connected.
+ *
+ *    Returns 0 on success, or :
+ *	o EINVAL : bad addr_len, or socket already bound
+ *	o EOPNOTSUPP : Ultra PID with the extension bit (0x80) set
+ *	o ENOMEM : IAS object allocation failed
+ *	o any negative error from irda_open_lsap() / irda_open_tsap()
+ */
+static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr;
+	struct irda_sock *self = irda_sk(sk);
+	int err;
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	if (addr_len != sizeof(struct sockaddr_irda))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	/* Refuse to bind twice. Without this check a second bind would
+	 * overwrite self->ias_obj, and the object inserted in LM-IAS by
+	 * the first bind could never be deleted again. */
+	err = -EINVAL;
+	if (self->ias_obj != NULL)
+		goto out;
+
+#ifdef CONFIG_IRDA_ULTRA
+	/* Special care for Ultra sockets */
+	if ((sk->sk_type == SOCK_DGRAM) &&
+	    (sk->sk_protocol == IRDAPROTO_ULTRA)) {
+		self->pid = addr->sir_lsap_sel;
+		err = -EOPNOTSUPP;
+		if (self->pid & 0x80) {
+			IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", __func__);
+			goto out;
+		}
+		err = irda_open_lsap(self, self->pid);
+		if (err < 0)
+			goto out;
+
+		/* Pretend we are connected */
+		sock->state = SS_CONNECTED;
+		sk->sk_state   = TCP_ESTABLISHED;
+		err = 0;
+
+		goto out;
+	}
+#endif /* CONFIG_IRDA_ULTRA */
+
+	self->ias_obj = irias_new_object(addr->sir_name, jiffies);
+	err = -ENOMEM;
+	if (self->ias_obj == NULL)
+		goto out;
+
+	err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name);
+	if (err < 0) {
+		/* The object was never inserted, drop it and forget it so
+		 * that nobody tries to delete it a second time */
+		irias_delete_object(self->ias_obj);
+		self->ias_obj = NULL;
+		goto out;
+	}
+
+	/*  Register with LM-IAS */
+	irias_add_integer_attrib(self->ias_obj, "IrDA:TinyTP:LsapSel",
+				 self->stsap_sel, IAS_KERNEL_ATTR);
+	irias_insert_object(self->ias_obj);
+
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Function irda_accept (sock, newsock, flags)
+ *
+ *    Wait for incoming connection
+ *
+ *    A new IrDA socket is first created on newsock. We then wait for a
+ *    frame on the receive queue of the listening socket, duplicate the
+ *    listening TSAP for the new socket, put the original TSAP back in
+ *    listen state and answer the connection request.
+ *
+ *    Returns 0 on success, or :
+ *	o any error returned by irda_create()
+ *	o EINVAL : sock is not unconnected, or not in listen state
+ *	o EOPNOTSUPP : socket type is not STREAM, SEQPACKET or DGRAM
+ *	o EWOULDBLOCK : nothing pending and O_NONBLOCK was requested
+ *	o the error from wait_event_interruptible() if the wait is
+ *	  interrupted
+ *	o EIO : newsock has no struct sock attached
+ *	o EPERM : the TSAP could not be duplicated
+ */
+static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *new, *self = irda_sk(sk);
+	struct sock *newsk;
+	struct sk_buff *skb;
+	int err;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
+	if (err)
+		return err;
+
+	err = -EINVAL;
+
+	lock_sock(sk);
+	if (sock->state != SS_UNCONNECTED)
+		goto out;
+
+	/* NOTE(review): sk was already dereferenced and locked above, so
+	 * this test looks redundant; kept unchanged. */
+	if ((sk = sock->sk) == NULL)
+		goto out;
+
+	err = -EOPNOTSUPP;
+	if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
+	    (sk->sk_type != SOCK_DGRAM))
+		goto out;
+
+	err = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out;
+
+	/*
+	 *	The read queue this time is holding sockets ready to use
+	 *	hooked into the SABM we saved
+	 */
+
+	/*
+	 * We can perform the accept only if there is incoming data
+	 * on the listening socket.
+	 * So, we will block the caller until we receive any data.
+	 * If the caller was waiting on select() or poll() before
+	 * calling us, the data is waiting for us ;-)
+	 * Jean II
+	 */
+	while (1) {
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (skb)
+			break;
+
+		/* Non blocking operation */
+		err = -EWOULDBLOCK;
+		if (flags & O_NONBLOCK)
+			goto out;
+
+		err = wait_event_interruptible(*(sk_sleep(sk)),
+					skb_peek(&sk->sk_receive_queue));
+		if (err)
+			goto out;
+	}
+
+	/* From here on we own skb and must free it on every exit path */
+	newsk = newsock->sk;
+	err = -EIO;
+	if (newsk == NULL) {
+		/* Don't leak the frame we just dequeued */
+		kfree_skb(skb);
+		goto out;
+	}
+
+	newsk->sk_state = TCP_ESTABLISHED;
+
+	new = irda_sk(newsk);
+
+	/* Now attach up the new socket */
+	new->tsap = irttp_dup(self->tsap, new);
+	err = -EPERM; /* value does not seem to make sense. -arnd */
+	if (!new->tsap) {
+		IRDA_DEBUG(0, "%s(), dup failed!\n", __func__);
+		kfree_skb(skb);
+		goto out;
+	}
+
+	/* Inherit addressing from the duplicated TSAP ... */
+	new->stsap_sel = new->tsap->stsap_sel;
+	new->dtsap_sel = new->tsap->dtsap_sel;
+	new->saddr = irttp_get_saddr(new->tsap);
+	new->daddr = irttp_get_daddr(new->tsap);
+
+	/* ... and the negotiated sizes and QoS from the listening socket */
+	new->max_sdu_size_tx = self->max_sdu_size_tx;
+	new->max_sdu_size_rx = self->max_sdu_size_rx;
+	new->max_data_size   = self->max_data_size;
+	new->max_header_size = self->max_header_size;
+
+	memcpy(&new->qos_tx, &self->qos_tx, sizeof(struct qos_info));
+
+	/* Clean up the original one to keep it in listen state */
+	irttp_listen(self->tsap);
+
+	kfree_skb(skb);
+	sk->sk_ack_backlog--;
+
+	newsock->state = SS_CONNECTED;
+
+	irda_connect_response(new);
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Function irda_connect (sock, uaddr, addr_len, flags)
+ *
+ *    Connect to a IrDA device
+ *
+ * The main difference with a "standard" connect is that with IrDA we need
+ * to resolve the service name into a TSAP selector (in TCP, port number
+ * doesn't have to be resolved).
+ * Because of this service name resolution, we can offer "auto-connect",
+ * where we connect to a service without specifying a destination address.
+ *
+ * Note : by consulting "errno", the user space caller may learn the cause
+ * of the failure. Most of them are visible in the function, others may come
+ * from subroutines called and are listed here :
+ *	o EBUSY : already processing a connect
+ *	o EHOSTUNREACH : bad addr->sir_addr argument
+ *	o EADDRNOTAVAIL : bad addr->sir_name argument
+ *	o ENOTUNIQ : more than one node has addr->sir_name (auto-connect)
+ *	o ENETUNREACH : no node found on the network (auto-connect)
+ *
+ * A failure to open the local TSAP is reported to the caller with the
+ * error returned by irda_open_tsap().
+ */
+static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr;
+	struct irda_sock *self = irda_sk(sk);
+	int err;
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	lock_sock(sk);
+	/* Don't allow connect for Ultra sockets */
+	err = -ESOCKTNOSUPPORT;
+	if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA))
+		goto out;
+
+	if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+		sock->state = SS_CONNECTED;
+		err = 0;
+		goto out;   /* Connect completed during a ERESTARTSYS event */
+	}
+
+	if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+		sock->state = SS_UNCONNECTED;
+		err = -ECONNREFUSED;
+		goto out;
+	}
+
+	err = -EISCONN;      /* No reconnect on a seqpacket socket */
+	if (sk->sk_state == TCP_ESTABLISHED)
+		goto out;
+
+	sk->sk_state   = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	err = -EINVAL;
+	if (addr_len != sizeof(struct sockaddr_irda))
+		goto out;
+
+	/* Check if user supplied any destination device address */
+	if ((!addr->sir_addr) || (addr->sir_addr == DEV_ADDR_ANY)) {
+		/* Try to find one suitable */
+		err = irda_discover_daddr_and_lsap_sel(self, addr->sir_name);
+		if (err) {
+			IRDA_DEBUG(0, "%s(), auto-connect failed!\n", __func__);
+			goto out;
+		}
+	} else {
+		/* Use the one provided by the user */
+		self->daddr = addr->sir_addr;
+		IRDA_DEBUG(1, "%s(), daddr = %08x\n", __func__, self->daddr);
+
+		/* If we don't have a valid service name, we assume the
+		 * user want to connect on a specific LSAP. Prevent
+		 * the use of invalid LSAPs (IrLMP 1.1 p10). Jean II */
+		if((addr->sir_name[0] != '\0') ||
+		   (addr->sir_lsap_sel >= 0x70)) {
+			/* Query remote LM-IAS using service name */
+			err = irda_find_lsap_sel(self, addr->sir_name);
+			if (err) {
+				IRDA_DEBUG(0, "%s(), connect failed!\n", __func__);
+				goto out;
+			}
+		} else {
+			/* Directly connect to the remote LSAP
+			 * specified by the sir_lsap field.
+			 * Please use with caution, in IrDA LSAPs are
+			 * dynamic and there is no "well-known" LSAP. */
+			self->dtsap_sel = addr->sir_lsap_sel;
+		}
+	}
+
+	/* Check if we have opened a local TSAP. If opening one fails we
+	 * must stop here, there is nothing to send the request on. */
+	if (!self->tsap) {
+		err = irda_open_tsap(self, LSAP_ANY, addr->sir_name);
+		if (err < 0)
+			goto out;
+	}
+
+	/* Move to connecting socket, start sending Connect Requests */
+	sock->state = SS_CONNECTING;
+	sk->sk_state   = TCP_SYN_SENT;
+
+	/* Connect to remote device */
+	err = irttp_connect_request(self->tsap, self->dtsap_sel,
+				    self->saddr, self->daddr, NULL,
+				    self->max_sdu_size_rx, NULL);
+	if (err) {
+		IRDA_DEBUG(0, "%s(), connect failed!\n", __func__);
+		goto out;
+	}
+
+	/* Now the loop */
+	err = -EINPROGRESS;
+	if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
+		goto out;
+
+	err = -ERESTARTSYS;
+	if (wait_event_interruptible(*(sk_sleep(sk)),
+				     (sk->sk_state != TCP_SYN_SENT)))
+		goto out;
+
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		sock->state = SS_UNCONNECTED;
+		/* The protocol may not provide a disconnect handler,
+		 * only call it when there is one */
+		if (sk->sk_prot->disconnect &&
+		    sk->sk_prot->disconnect(sk, flags))
+			sock->state = SS_DISCONNECTING;
+		err = sock_error(sk);
+		if (!err)
+			err = -ECONNRESET;
+		goto out;
+	}
+
+	sock->state = SS_CONNECTED;
+
+	/* At this point, IrLMP has assigned our source address */
+	self->saddr = irttp_get_saddr(self->tsap);
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Protocol descriptor for IrDA sockets. Only the name, the owning
+ * module and the size of the per-socket structure are filled in; all
+ * other members of struct proto are left at zero.
+ */
+static struct proto irda_proto = {
+	.name	  = "IRDA",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct irda_sock),
+};
+
+/*
+ * Function irda_create (net, sock, protocol, kern)
+ *
+ *    Create an IrDA socket: allocate the struct sock, select the
+ *    operations matching the socket type (and protocol for datagram
+ *    sockets), register as an IrLMP client and set the defaults.
+ *
+ *    Returns 0 on success, or :
+ *	o EAFNOSUPPORT : net is not the initial network namespace
+ *	o ESOCKTNOSUPPORT : unsupported socket type or datagram protocol
+ *	o ENOMEM : socket allocation failed
+ */
+static int irda_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	struct sock *sk;
+	struct irda_sock *self;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	if (net != &init_net)
+		return -EAFNOSUPPORT;
+
+	/* Reject unknown socket types before allocating anything :
+	 *	STREAM    : TTP connections with SAR disabled
+	 *	SEQPACKET : TTP connections with SAR enabled
+	 *	DGRAM     : TTP Unitdata or LMP Ultra transfers */
+	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET &&
+	    sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	/* Allocate networking socket */
+	sk = sk_alloc(net, PF_IRDA, GFP_ATOMIC, &irda_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	self = irda_sk(sk);
+	IRDA_DEBUG(2, "%s() : self is %p\n", __func__, self);
+
+	init_waitqueue_head(&self->query_wait);
+
+	/* Pick the operations and SAR mode for this kind of socket */
+	if (sock->type == SOCK_STREAM) {
+		sock->ops = &irda_stream_ops;
+		self->max_sdu_size_rx = TTP_SAR_DISABLE;
+	} else if (sock->type == SOCK_SEQPACKET) {
+		sock->ops = &irda_seqpacket_ops;
+		self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+	} else if (sock->type == SOCK_DGRAM) {
+		switch (protocol) {
+#ifdef CONFIG_IRDA_ULTRA
+		case IRDAPROTO_ULTRA:
+			sock->ops = &irda_ultra_ops;
+			/* Initialise now, because we may send on unbound
+			 * sockets. Jean II */
+			self->max_data_size = ULTRA_MAX_DATA - LMP_PID_HEADER;
+			self->max_header_size = IRDA_MAX_HEADER + LMP_PID_HEADER;
+			break;
+#endif /* CONFIG_IRDA_ULTRA */
+		case IRDAPROTO_UNITDATA:
+			sock->ops = &irda_dgram_ops;
+			/* We let Unitdata conn. be like seqpack conn. */
+			self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+			break;
+		default:
+			goto unsupported;
+		}
+	} else {
+		goto unsupported;
+	}
+
+	/* Initialise networking socket struct */
+	sock_init_data(sock, sk);	/* Note : set sk->sk_refcnt to 1 */
+	sk->sk_family = PF_IRDA;
+	sk->sk_protocol = protocol;
+
+	/* Register as a client with IrLMP */
+	self->ckey = irlmp_register_client(0, NULL, NULL, NULL);
+	self->mask.word = 0xffff;
+	self->rx_flow = self->tx_flow = FLOW_START;
+	self->nslots = DISCOVERY_DEFAULT_SLOTS;
+	self->daddr = DEV_ADDR_ANY;	/* Until we get connected */
+	self->saddr = 0x0;		/* so IrLMP assign us any link */
+	return 0;
+
+unsupported:
+	/* Give back the socket we allocated above */
+	sk_free(sk);
+	return -ESOCKTNOSUPPORT;
+}
+
+/*
+ * Function irda_destroy_socket (self)
+ *
+ *    Release everything the IrDA stack holds for this socket: the IrLMP
+ *    client and service registrations, the IAS object, the IAP instance,
+ *    the TSAP and, with CONFIG_IRDA_ULTRA, the connectionless LSAP.
+ *    Each pointer is reset to NULL once its resource has been released.
+ *    The struct sock itself is not freed here.
+ */
+static void irda_destroy_socket(struct irda_sock *self)
+{
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	/* Unregister with IrLMP */
+	irlmp_unregister_client(self->ckey);
+	irlmp_unregister_service(self->skey);
+
+	/* Unregister with LM-IAS */
+	if (self->ias_obj) {
+		irias_delete_object(self->ias_obj);
+		self->ias_obj = NULL;
+	}
+
+	/* Close a pending IAS query, if any */
+	if (self->iriap) {
+		iriap_close(self->iriap);
+		self->iriap = NULL;
+	}
+
+	/* Request a disconnect before closing the TSAP */
+	if (self->tsap) {
+		irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+		irttp_close_tsap(self->tsap);
+		self->tsap = NULL;
+	}
+#ifdef CONFIG_IRDA_ULTRA
+	/* Ultra sockets use a raw LSAP instead of a TSAP */
+	if (self->lsap) {
+		irlmp_close_lsap(self->lsap);
+		self->lsap = NULL;
+	}
+#endif /* CONFIG_IRDA_ULTRA */
+}
+
+/*
+ * Function irda_release (sock)
+ *
+ *    Close the socket: mark it closed and shut down for sending, wake
+ *    up whoever watches its state, tear down the IrDA side, detach the
+ *    struct sock from the struct socket, purge the receive queue and
+ *    drop our reference. Always returns 0.
+ */
+static int irda_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	/* Nothing attached, nothing to release */
+	if (sk == NULL)
+		return 0;
+
+	lock_sock(sk);
+	sk->sk_state       = TCP_CLOSE;
+	sk->sk_shutdown   |= SEND_SHUTDOWN;
+	sk->sk_state_change(sk);
+
+	/* Destroy IrDA socket */
+	irda_destroy_socket(irda_sk(sk));
+
+	sock_orphan(sk);
+	sock->sk   = NULL;
+	release_sock(sk);
+
+	/* Purge queues (see sock_init_data()) */
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	/* Destroy networking socket if we are the last reference on it,
+	 * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */
+	sock_put(sk);
+
+	/* Notes on socket locking and deallocation... - Jean II
+	 * In theory we should put pairs of sock_hold() / sock_put() to
+	 * prevent the socket to be destroyed whenever there is an
+	 * outstanding request or outstanding incoming packet or event.
+	 *
+	 * 1) This may include IAS request, both in connect and getsockopt.
+	 * Unfortunately, the situation is a bit more messy than it looks,
+	 * because we close iriap and kfree(self) above.
+	 *
+	 * 2) This may include selective discovery in getsockopt.
+	 * Same stuff as above, irlmp registration and self are gone.
+	 *
+	 * Probably 1 and 2 may not matter, because it's all triggered
+	 * by a process and the socket layer already prevent the
+	 * socket to go away while a process is holding it, through
+	 * sockfd_put() and fput()...
+	 *
+	 * 3) This may include deferred TSAP closure. In particular,
+	 * we may receive a late irda_disconnect_indication()
+	 * Fortunately, (tsap_cb *)->close_pend should protect us
+	 * from that.
+	 *
+	 * I did some testing on SMP, and it looks solid. And the socket
+	 * memory leak is now gone... - Jean II
+	 */
+
+	return 0;
+}
+
+/*
+ * Function irda_sendmsg (iocb, sock, msg, len)
+ *
+ *    Send message down to TinyTP. This function is used for both STREAM and
+ *    SEQPACK services. This is possible since it forces the client to
+ *    fragment the message if necessary
+ *
+ *    At most self->max_data_size bytes are sent per call; the return
+ *    value is the number of bytes actually handed to TinyTP, so the
+ *    caller has to send the remainder itself.
+ *
+ *    Errors :
+ *	o EINVAL : unsupported flag in msg->msg_flags
+ *	o ENOTCONN : socket is not (or no longer) in TCP_ESTABLISHED state
+ *	o ERESTARTSYS : interrupted while waiting for the flow to restart
+ *	o failures on the out_err path (send side shut down, skb
+ *	  allocation, copy from user space, irttp_data_request()) are
+ *	  passed through sk_stream_error()
+ */
+static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self;
+	struct sk_buff *skb;
+	int err = -EPIPE;	/* Reported if the send side is shut down */
+
+	IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
+
+	/* Note : socket.c set MSG_EOR on SEQPACKET sockets */
+	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT |
+			       MSG_NOSIGNAL)) {
+		return -EINVAL;
+	}
+
+	lock_sock(sk);
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN)
+		goto out_err;
+
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	self = irda_sk(sk);
+
+	/* Check if IrTTP wants us to slow down */
+
+	if (wait_event_interruptible(*(sk_sleep(sk)),
+	    (self->tx_flow != FLOW_STOP  ||  sk->sk_state != TCP_ESTABLISHED))) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* Check if we are still connected */
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	/* Check that we don't send out too big frames */
+	if (len > self->max_data_size) {
+		IRDA_DEBUG(2, "%s(), Chopping frame from %zd to %d bytes!\n",
+			   __func__, len, self->max_data_size);
+		len = self->max_data_size;
+	}
+
+	/* Room for the payload plus the headers (and 16 spare bytes) */
+	skb = sock_alloc_send_skb(sk, len + self->max_header_size + 16,
+				  msg->msg_flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		goto out_err;
+
+	skb_reserve(skb, self->max_header_size + 16);
+	skb_reset_transport_header(skb);
+	skb_put(skb, len);
+	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	if (err) {
+		kfree_skb(skb);
+		goto out_err;
+	}
+
+	/*
+	 * Just send the message to TinyTP, and let it deal with possible
+	 * errors. No need to duplicate all that here
+	 */
+	/* NOTE(review): skb is not freed here on failure, presumably
+	 * irttp_data_request() consumes it in every case - confirm. */
+	err = irttp_data_request(self->tsap, skb);
+	if (err) {
+		IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
+		goto out_err;
+	}
+
+	release_sock(sk);
+	/* Tell client how much data we actually sent */
+	return len;
+
+out_err:
+	err = sk_stream_error(sk, msg->msg_flags, err);
+out:
+	release_sock(sk);
+	return err;
+
+}
+
+/*
+ * Function irda_recvmsg_dgram (iocb, sock, msg, size, flags)
+ *
+ *    Try to receive message and copy it to user. The frame is discarded
+ *    after being read, regardless of how much the user actually read
+ *
+ *    No source address is ever reported, so msg->msg_namelen is set to
+ *    zero. If the frame is larger than size, it is truncated and
+ *    MSG_TRUNC is set in msg->msg_flags.
+ *
+ *    Returns the number of bytes copied, the error reported by
+ *    skb_recv_datagram() when no frame could be obtained, or the error
+ *    reported by skb_copy_datagram_iovec() when the copy to user space
+ *    failed.
+ */
+static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
+			      struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+	struct sk_buff *skb;
+	size_t copied;
+	int err;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* We never fill in msg_name. Say so explicitly, otherwise the
+	 * caller would use a length that nobody initialised. */
+	msg->msg_namelen = 0;
+
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+				flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return err;
+
+	skb_reset_transport_header(skb);
+	copied = skb->len;
+
+	if (copied > size) {
+		IRDA_DEBUG(2, "%s(), Received truncated frame (%zd < %zd)!\n",
+			   __func__, copied, size);
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+	/* Remember a failed copy, but still release the frame and
+	 * restart the flow below before reporting it */
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+
+	skb_free_datagram(sk, skb);
+
+	/*
+	 *  Check if we have previously stopped IrTTP and we now
+	 *  have more free space in our rx_queue. If so tell IrTTP
+	 *  to start delivering frames again before our rx_queue gets
+	 *  empty
+	 */
+	if (self->rx_flow == FLOW_STOP) {
+		if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) {
+			IRDA_DEBUG(2, "%s(), Starting IrTTP\n", __func__);
+			self->rx_flow = FLOW_START;
+			irttp_flow_request(self->tsap, FLOW_START);
+		}
+	}
+
+	if (err)
+		return err;
+
+	return copied;
+}
+
+/*
+ * Function irda_recvmsg_stream (iocb, sock, msg, size, flags)
+ *
+ *    Receive data on a stream socket. Frames are taken from the receive
+ *    queue and copied to user space until size bytes have been copied,
+ *    or the queue is empty and at least the low-water mark computed by
+ *    sock_rcvlowat() has been reached. A frame that is only partly
+ *    consumed is put back at the head of the queue.
+ *
+ *    Returns the number of bytes copied, or a negative error :
+ *	o pending socket error (sock_error())
+ *	o EINVAL : __SO_ACCEPTCON is set in sock->flags
+ *	o EOPNOTSUPP : MSG_OOB requested
+ *	o EAGAIN : queue empty and MSG_DONTWAIT requested
+ *	o ENOTCONN : queue empty and socket not in TCP_ESTABLISHED state
+ *	o sock_intr_errno() value when a signal is pending
+ *	o EFAULT : copy to user space failed before anything was copied
+ */
+static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+	int noblock = flags & MSG_DONTWAIT;
+	size_t copied = 0;
+	int target, err;
+	long timeo;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	if ((err = sock_error(sk)) < 0)
+		return err;
+
+	if (sock->flags & __SO_ACCEPTCON)
+		return -EINVAL;
+
+	err =-EOPNOTSUPP;
+	if (flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	err = 0;
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+	timeo = sock_rcvtimeo(sk, noblock);
+
+	/* No source address is reported on stream sockets */
+	msg->msg_namelen = 0;
+
+	do {
+		int chunk;
+		struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue);
+
+		if (skb == NULL) {
+			/* Queue is empty: either we have enough, or we
+			 * have to wait for more data */
+			DEFINE_WAIT(wait);
+			err = 0;
+
+			if (copied >= target)
+				break;
+
+			prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+			/*
+			 *	POSIX 1003.1g mandates this order.
+			 */
+			err = sock_error(sk);
+			if (err)
+				;
+			else if (sk->sk_shutdown & RCV_SHUTDOWN)
+				;
+			else if (noblock)
+				err = -EAGAIN;
+			else if (signal_pending(current))
+				err = sock_intr_errno(timeo);
+			else if (sk->sk_state != TCP_ESTABLISHED)
+				err = -ENOTCONN;
+			else if (skb_peek(&sk->sk_receive_queue) == NULL)
+				/* Wait process until data arrives */
+				schedule();
+
+			finish_wait(sk_sleep(sk), &wait);
+
+			/* Any error found above is returned as is, even
+			 * if some data was already copied */
+			if (err)
+				return err;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			continue;
+		}
+
+		/* Copy as much of this frame as the user still wants */
+		chunk = min_t(unsigned int, skb->len, size);
+		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+			/* Copy failed: keep the frame for a later read */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			/* copied is a size_t; the value is converted back
+			 * to a negative int by the return below */
+			if (copied == 0)
+				copied = -EFAULT;
+			break;
+		}
+		copied += chunk;
+		size -= chunk;
+
+		/* Mark read part of skb as used */
+		if (!(flags & MSG_PEEK)) {
+			skb_pull(skb, chunk);
+
+			/* put the skb back if we didn't use it up.. */
+			if (skb->len) {
+				IRDA_DEBUG(1, "%s(), back on q!\n",
+					   __func__);
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				break;
+			}
+
+			kfree_skb(skb);
+		} else {
+			IRDA_DEBUG(0, "%s() questionable!?\n", __func__);
+
+			/* put message back and return */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			break;
+		}
+	} while (size);
+
+	/*
+	 *  Check if we have previously stopped IrTTP and we now
+	 *  have more free space in our rx_queue. If so tell IrTTP
+	 *  to start delivering frames again before our rx_queue gets
+	 *  empty
+	 */
+	if (self->rx_flow == FLOW_STOP) {
+		if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) {
+			IRDA_DEBUG(2, "%s(), Starting IrTTP\n", __func__);
+			self->rx_flow = FLOW_START;
+			irttp_flow_request(self->tsap, FLOW_START);
+		}
+	}
+
+	return copied;
+}
+
+/*
+ * Function irda_sendmsg_dgram (iocb, sock, msg, len)
+ *
+ *    Send message down to TinyTP for the unreliable sequenced
+ *    packet service...
+ *
+ *    The message is truncated to self->max_data_size bytes; the return
+ *    value is the number of bytes actually sent.
+ *
+ *    Errors :
+ *	o EINVAL : unsupported flag in msg->msg_flags
+ *	o EPIPE : send side shut down (SIGPIPE is also raised)
+ *	o ENOTCONN : socket not in TCP_ESTABLISHED state
+ *	o ENOBUFS : no skb could be allocated
+ *	o error from memcpy_fromiovec() or irttp_udata_request()
+ */
+static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
+			      struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self;
+	struct sk_buff *skb;
+	int err;
+
+	IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		send_sig(SIGPIPE, current, 0);
+		err = -EPIPE;
+		goto out;
+	}
+
+	err = -ENOTCONN;
+	if (sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	self = irda_sk(sk);
+
+	/*
+	 * Check that we don't send out too big frames. This is an unreliable
+	 * service, so we have no fragmentation and no coalescence
+	 */
+	if (len > self->max_data_size) {
+		IRDA_DEBUG(0, "%s(), Warning to much data! "
+			   "Chopping frame from %zd to %d bytes!\n",
+			   __func__, len, self->max_data_size);
+		len = self->max_data_size;
+	}
+
+	skb = sock_alloc_send_skb(sk, len + self->max_header_size,
+				  msg->msg_flags & MSG_DONTWAIT, &err);
+	/* NOTE(review): this overwrites whatever error
+	 * sock_alloc_send_skb() stored in err, so every allocation
+	 * failure is reported as -ENOBUFS */
+	err = -ENOBUFS;
+	if (!skb)
+		goto out;
+
+	skb_reserve(skb, self->max_header_size);
+	skb_reset_transport_header(skb);
+
+	IRDA_DEBUG(4, "%s(), appending user data\n", __func__);
+	skb_put(skb, len);
+	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	if (err) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	/*
+	 * Just send the message to TinyTP, and let it deal with possible
+	 * errors. No need to duplicate all that here
+	 */
+	/* NOTE(review): skb is not freed here on failure, presumably
+	 * irttp_udata_request() consumes it in every case - confirm. */
+	err = irttp_udata_request(self->tsap, skb);
+	if (err) {
+		IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
+		goto out;
+	}
+
+	release_sock(sk);
+	return len;
+
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Function irda_sendmsg_ultra (iocb, sock, msg, len)
+ *
+ *    Send message down to IrLMP for the unreliable Ultra
+ *    packet service...
+ *
+ *    The destination PID is taken from msg->msg_name when an address is
+ *    supplied (sendto), otherwise the socket must have been bound and
+ *    its own LSAP is used. The message is truncated to
+ *    self->max_data_size bytes; the return value is the number of
+ *    bytes sent.
+ *
+ *    Errors :
+ *	o EINVAL : unsupported flag, short address or wrong family
+ *	o EPIPE : send side shut down (SIGPIPE is also raised)
+ *	o EOPNOTSUPP : PID with the extension bit (0x80) set
+ *	o ENOTCONN : no address given and socket not bound
+ *	o ENOBUFS : no skb could be allocated
+ *	o error from memcpy_fromiovec() or irlmp_connless_data_request()
+ */
+#ifdef CONFIG_IRDA_ULTRA
+static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
+			      struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self;
+	__u8 pid = 0;
+	int bound = 0;		/* Set when the socket's own LSAP is used */
+	struct sk_buff *skb;
+	int err;
+
+	IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
+
+	err = -EINVAL;
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	err = -EPIPE;
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		send_sig(SIGPIPE, current, 0);
+		goto out;
+	}
+
+	self = irda_sk(sk);
+
+	/* Check if an address was specified with sendto. Jean II */
+	if (msg->msg_name) {
+		struct sockaddr_irda *addr = (struct sockaddr_irda *) msg->msg_name;
+		err = -EINVAL;
+		/* Check address, extract pid. Jean II */
+		if (msg->msg_namelen < sizeof(*addr))
+			goto out;
+		if (addr->sir_family != AF_IRDA)
+			goto out;
+
+		pid = addr->sir_lsap_sel;
+		if (pid & 0x80) {
+			IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", __func__);
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+	} else {
+		/* Check that the socket is properly bound to an Ultra
+		 * port. Jean II */
+		if ((self->lsap == NULL) ||
+		    (sk->sk_state != TCP_ESTABLISHED)) {
+			IRDA_DEBUG(0, "%s(), socket not bound to Ultra PID.\n",
+				   __func__);
+			err = -ENOTCONN;
+			goto out;
+		}
+		/* Use PID from socket */
+		bound = 1;
+	}
+
+	/*
+	 * Check that we don't send out too big frames. This is an unreliable
+	 * service, so we have no fragmentation and no coalescence
+	 */
+	if (len > self->max_data_size) {
+		IRDA_DEBUG(0, "%s(), Warning to much data! "
+			   "Chopping frame from %zd to %d bytes!\n",
+			   __func__, len, self->max_data_size);
+		len = self->max_data_size;
+	}
+
+	skb = sock_alloc_send_skb(sk, len + self->max_header_size,
+				  msg->msg_flags & MSG_DONTWAIT, &err);
+	/* NOTE(review): this overwrites whatever error
+	 * sock_alloc_send_skb() stored in err, so every allocation
+	 * failure is reported as -ENOBUFS */
+	err = -ENOBUFS;
+	if (!skb)
+		goto out;
+
+	skb_reserve(skb, self->max_header_size);
+	skb_reset_transport_header(skb);
+
+	IRDA_DEBUG(4, "%s(), appending user data\n", __func__);
+	skb_put(skb, len);
+	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	if (err) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	/* An unbound socket sends without LSAP, using the PID from the
+	 * supplied address */
+	err = irlmp_connless_data_request((bound ? self->lsap : NULL),
+					  skb, pid);
+	if (err)
+		IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
+out:
+	release_sock(sk);
+	/* err is zero only on the success path */
+	return err ? : len;
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irda_shutdown (sk, how)
+ *
+ *    Shut the connection down: mark the socket closed and shut down for
+ *    sending, close any pending IAS query and the TSAP, and reset the
+ *    flow and address fields to their initial values. The how argument
+ *    is not looked at. Always returns 0.
+ */
+static int irda_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+
+	IRDA_DEBUG(1, "%s(%p)\n", __func__, self);
+
+	lock_sock(sk);
+
+	sk->sk_state       = TCP_CLOSE;
+	sk->sk_shutdown   |= SEND_SHUTDOWN;
+	sk->sk_state_change(sk);
+
+	/* Close a pending IAS query, if any */
+	if (self->iriap) {
+		iriap_close(self->iriap);
+		self->iriap = NULL;
+	}
+
+	/* Request a disconnect before closing the TSAP */
+	if (self->tsap) {
+		irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+		irttp_close_tsap(self->tsap);
+		self->tsap = NULL;
+	}
+
+	/* A few cleanup so the socket look as good as new... */
+	self->rx_flow = self->tx_flow = FLOW_START;	/* needed ??? */
+	self->daddr = DEV_ADDR_ANY;	/* Until we get re-connected */
+	self->saddr = 0x0;		/* so IrLMP assign us any link */
+
+	release_sock(sk);
+
+	return 0;
+}
+
+/*
+ * Function irda_poll (file, sock, wait)
+ *
+ *    Compute the poll mask of the socket :
+ *	o POLLERR when an error is pending
+ *	o POLLHUP when the receive side is shut down, or when a stream
+ *	  socket is in TCP_CLOSE state
+ *	o POLLIN | POLLRDNORM when the receive queue is not empty
+ *	o POLLOUT | POLLWRNORM | POLLWRBAND when the socket is writeable;
+ *	  stream and seqpacket sockets also need the tx flow to be
+ *	  started, and stream sockets need to be connected
+ */
+static unsigned int irda_poll(struct file * file, struct socket *sock,
+			      poll_table *wait)
+{
+	const unsigned int writable = POLLOUT | POLLWRNORM | POLLWRBAND;
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+	unsigned int mask = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	poll_wait(file, sk_sleep(sk), wait);
+
+	/* Exceptional events? */
+	if (sk->sk_err)
+		mask |= POLLERR;
+	if (sk->sk_shutdown & RCV_SHUTDOWN) {
+		IRDA_DEBUG(0, "%s(), POLLHUP\n", __func__);
+		mask |= POLLHUP;
+	}
+
+	/* Readable? */
+	if (!skb_queue_empty(&sk->sk_receive_queue)) {
+		IRDA_DEBUG(4, "Socket is readable\n");
+		mask |= POLLIN | POLLRDNORM;
+	}
+
+	/* Connection-based need to check for termination and startup */
+	if (sk->sk_type == SOCK_STREAM) {
+		if (sk->sk_state == TCP_CLOSE) {
+			IRDA_DEBUG(0, "%s(), POLLHUP\n", __func__);
+			mask |= POLLHUP;
+		}
+
+		if (sk->sk_state == TCP_ESTABLISHED &&
+		    self->tx_flow == FLOW_START && sock_writeable(sk))
+			mask |= writable;
+	} else if (sk->sk_type == SOCK_SEQPACKET) {
+		if (self->tx_flow == FLOW_START && sock_writeable(sk))
+			mask |= writable;
+	} else if (sk->sk_type == SOCK_DGRAM) {
+		if (sock_writeable(sk))
+			mask |= writable;
+	}
+
+	return mask;
+}
+
+/*
+ * Function irda_ioctl (sock, cmd, arg)
+ *
+ *    Handle the few ioctls implemented at socket level :
+ *	o TIOCOUTQ : free space left in the send buffer
+ *	o TIOCINQ : length of the first frame in the receive queue
+ *	o SIOCGSTAMP : timestamp, through sock_get_timestamp()
+ *    The interface address ioctls listed below return -EINVAL, and
+ *    everything else returns -ENOIOCTLCMD.
+ */
+static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	int err;
+
+	IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd);
+
+	/* Default answer for the commands that only "break" below */
+	err = -EINVAL;
+	switch (cmd) {
+	case TIOCOUTQ: {
+		long amount;
+
+		/* Never report a negative amount of free space */
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (amount < 0)
+			amount = 0;
+		err = put_user(amount, (unsigned int __user *)arg);
+		break;
+	}
+
+	case TIOCINQ: {
+		struct sk_buff *skb;
+		long amount = 0L;
+		/* These two are safe on a single CPU system as only user tasks fiddle here */
+		if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+			amount = skb->len;
+		err = put_user(amount, (unsigned int __user *)arg);
+		break;
+	}
+
+	case SIOCGSTAMP:
+		/* NOTE(review): sk is dereferenced without such a check in
+		 * the two cases above */
+		if (sk != NULL)
+			err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+		break;
+
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__);
+		err = -ENOIOCTLCMD;
+	}
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Function irda_compat_ioctl (sock, cmd, arg)
+ *
+ *    Compat ioctl entry point. Nothing is handled here: every command
+ *    is answered with -ENOIOCTLCMD.
+ */
+static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	/*
+	 * All IRDA's ioctl are standard ones.
+	 */
+	return -ENOIOCTLCMD;
+}
+#endif
+
+/*
+ * Function irda_setsockopt (sock, level, optname, optval, optlen)
+ *
+ *    Set some options for the socket
+ *
+ * Only level SOL_IRLMP is accepted (-ENOPROTOOPT otherwise). Supported
+ * options :
+ *   IRLMP_IAS_SET       - add an attribute to an IAS object, creating the
+ *                         object first if it doesn't exist yet
+ *   IRLMP_IAS_DEL       - delete a user owned attribute of an IAS object
+ *   IRLMP_MAX_SDU_SIZE  - set max_sdu_size_rx of the socket
+ *   IRLMP_HINTS_SET     - (re)register the hint bits we advertise
+ *   IRLMP_HINT_MASK_SET - set the hint mask used to filter discoveries
+ *
+ * The two IAS options expect optval to point to a struct irda_ias_set of
+ * exactly optlen bytes; the three others read an int from optval.
+ *
+ * The socket is locked for the whole operation, so every exit path taken
+ * after lock_sock() must go through the "out" label.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int irda_setsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+	struct irda_ias_set    *ias_opt;	/* Kernel copy of user request */
+	struct ias_object      *ias_obj;	/* Object in IAS */
+	struct ias_attrib *	ias_attr;	/* Attribute in IAS object */
+	int opt, free_ias = 0, err = 0;	/* free_ias : ias_obj is ours */
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	if (level != SOL_IRLMP)
+		return -ENOPROTOOPT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case IRLMP_IAS_SET:
+		/* The user want to add an attribute to an existing IAS object
+		 * (in the IAS database) or to create a new object with this
+		 * attribute.
+		 * We first query IAS to know if the object exist, and then
+		 * create the right attribute...
+		 */
+
+		if (optlen != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/* Copy query to the driver. */
+		if (copy_from_user(ias_opt, optval, optlen)) {
+			kfree(ias_opt);
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Find the object we target.
+		 * If the user gives us an empty string, we use the object
+		 * associated with this socket. This will workaround
+		 * duplicated class name - Jean II */
+		if(ias_opt->irda_class_name[0] == '\0') {
+			if(self->ias_obj == NULL) {
+				kfree(ias_opt);
+				err = -EINVAL;
+				goto out;
+			}
+			ias_obj = self->ias_obj;
+		} else
+			ias_obj = irias_find_object(ias_opt->irda_class_name);
+
+		/* Only ROOT can mess with the global IAS database.
+		 * Users can only add attributes to the object associated
+		 * with the socket they own - Jean II */
+		if((!capable(CAP_NET_ADMIN)) &&
+		   ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
+			kfree(ias_opt);
+			err = -EPERM;
+			goto out;
+		}
+
+		/* If the object doesn't exist, create it */
+		if(ias_obj == (struct ias_object *) NULL) {
+			/* Create a new object */
+			ias_obj = irias_new_object(ias_opt->irda_class_name,
+						   jiffies);
+			if (ias_obj == NULL) {
+				kfree(ias_opt);
+				err = -ENOMEM;
+				goto out;
+			}
+			/* From now on the object is ours : free it on every
+			 * error path, insert it in the database on success */
+			free_ias = 1;
+		}
+
+		/* Do we have the attribute already ? */
+		if(irias_find_attrib(ias_obj, ias_opt->irda_attrib_name)) {
+			kfree(ias_opt);
+			if (free_ias) {
+				kfree(ias_obj->name);
+				kfree(ias_obj);
+			}
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Look at the type */
+		switch(ias_opt->irda_attrib_type) {
+		case IAS_INTEGER:
+			/* Add an integer attribute */
+			irias_add_integer_attrib(
+				ias_obj,
+				ias_opt->irda_attrib_name,
+				ias_opt->attribute.irda_attrib_int,
+				IAS_USER_ATTR);
+			break;
+		case IAS_OCT_SEQ:
+			/* Check length */
+			if(ias_opt->attribute.irda_attrib_octet_seq.len >
+			   IAS_MAX_OCTET_STRING) {
+				kfree(ias_opt);
+				if (free_ias) {
+					kfree(ias_obj->name);
+					kfree(ias_obj);
+				}
+
+				err = -EINVAL;
+				goto out;
+			}
+			/* Add an octet sequence attribute */
+			irias_add_octseq_attrib(
+			      ias_obj,
+			      ias_opt->irda_attrib_name,
+			      ias_opt->attribute.irda_attrib_octet_seq.octet_seq,
+			      ias_opt->attribute.irda_attrib_octet_seq.len,
+			      IAS_USER_ATTR);
+			break;
+		case IAS_STRING:
+			/* Should check charset & co */
+			/* Check length */
+			/* The length is encoded in a __u8, and
+			 * IAS_MAX_STRING == 256, so there is no way
+			 * userspace can pass us a string too large.
+			 * Jean II */
+			/* NULL terminate the string (avoid troubles) */
+			ias_opt->attribute.irda_attrib_string.string[ias_opt->attribute.irda_attrib_string.len] = '\0';
+			/* Add a string attribute */
+			irias_add_string_attrib(
+				ias_obj,
+				ias_opt->irda_attrib_name,
+				ias_opt->attribute.irda_attrib_string.string,
+				IAS_USER_ATTR);
+			break;
+		default :
+			kfree(ias_opt);
+			if (free_ias) {
+				kfree(ias_obj->name);
+				kfree(ias_obj);
+			}
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Only insert newly allocated objects. An object that we
+		 * looked up above was not created here; inserting it in
+		 * the IAS database again would link it a second time and
+		 * corrupt the list the database is built on. */
+		if (free_ias)
+			irias_insert_object(ias_obj);
+
+		kfree(ias_opt);
+		break;
+	case IRLMP_IAS_DEL:
+		/* The user want to delete an object from our local IAS
+		 * database. We just need to query the IAS, check is the
+		 * object is not owned by the kernel and delete it.
+		 */
+
+		if (optlen != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/* Copy query to the driver. */
+		if (copy_from_user(ias_opt, optval, optlen)) {
+			kfree(ias_opt);
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Find the object we target.
+		 * If the user gives us an empty string, we use the object
+		 * associated with this socket. This will workaround
+		 * duplicated class name - Jean II */
+		if(ias_opt->irda_class_name[0] == '\0')
+			ias_obj = self->ias_obj;
+		else
+			ias_obj = irias_find_object(ias_opt->irda_class_name);
+		if(ias_obj == (struct ias_object *) NULL) {
+			kfree(ias_opt);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Only ROOT can mess with the global IAS database.
+		 * Users can only del attributes from the object associated
+		 * with the socket they own - Jean II */
+		if((!capable(CAP_NET_ADMIN)) &&
+		   ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
+			kfree(ias_opt);
+			err = -EPERM;
+			goto out;
+		}
+
+		/* Find the attribute (in the object) we target */
+		ias_attr = irias_find_attrib(ias_obj,
+					     ias_opt->irda_attrib_name);
+		if(ias_attr == (struct ias_attrib *) NULL) {
+			kfree(ias_opt);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Check is the user space own the object */
+		if(ias_attr->value->owner != IAS_USER_ATTR) {
+			IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__);
+			kfree(ias_opt);
+			err = -EPERM;
+			goto out;
+		}
+
+		/* Remove the attribute (and maybe the object) */
+		irias_delete_attrib(ias_obj, ias_attr, 1);
+		kfree(ias_opt);
+		break;
+	case IRLMP_MAX_SDU_SIZE:
+		if (optlen < sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Only possible for a seqpacket service (TTP with SAR) */
+		/* NOTE(review): the test below accepts every socket type
+		 * *except* SOCK_SEQPACKET, which looks inverted compared to
+		 * the comment above. Left as is, user space may rely on it;
+		 * confirm before changing. */
+		if (sk->sk_type != SOCK_SEQPACKET) {
+			IRDA_DEBUG(2, "%s(), setting max_sdu_size = %d\n",
+				   __func__, opt);
+			self->max_sdu_size_rx = opt;
+		} else {
+			IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n",
+				     __func__);
+			err = -ENOPROTOOPT;
+			goto out;
+		}
+		break;
+	case IRLMP_HINTS_SET:
+		if (optlen < sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* The input is really a (__u8 hints[2]), easier as an int */
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Unregister any old registration */
+		if (self->skey)
+			irlmp_unregister_service(self->skey);
+
+		self->skey = irlmp_register_service((__u16) opt);
+		break;
+	case IRLMP_HINT_MASK_SET:
+		/* As opposed to the previous case which set the hint bits
+		 * that we advertise, this one set the filter we use when
+		 * making a discovery (nodes which don't match any hint
+		 * bit in the mask are not reported).
+		 */
+		if (optlen < sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* The input is really a (__u8 hints[2]), easier as an int */
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Set the new hint mask */
+		self->mask.word = (__u16) opt;
+		/* Mask out extension bits */
+		self->mask.word &= 0x7f7f;
+		/* Check if no bits */
+		if(!self->mask.word)
+			self->mask.word = 0xFFFF;
+
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+out:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * Function irda_extract_ias_value(ias_opt, ias_value)
+ *
+ *    Fill the user space IAS structure from an internal IAS value
+ *
+ * IAS values are kept internally (struct ias_value) in a form that differs
+ * from the flat structure we exchange with user space programs (struct
+ * irda_ias_set), the latter being needed to cross the kernel boundary.
+ * This helper converts the internal value into the flat one : it copies
+ * the payload matching the value type, then the type itself.
+ *
+ * Returns 0 on success. For IAS_MISSING or any type we don't know, nothing
+ * is written to ias_opt and -EINVAL is returned.
+ */
+static int irda_extract_ias_value(struct irda_ias_set *ias_opt,
+				  struct ias_value *ias_value)
+{
+	if (ias_value->type == IAS_INTEGER) {
+		/* Integer : a plain assignment does the job */
+		ias_opt->attribute.irda_attrib_int = ias_value->t.integer;
+	} else if (ias_value->type == IAS_OCT_SEQ) {
+		/* Octet sequence : the length first, then the raw bytes */
+		ias_opt->attribute.irda_attrib_octet_seq.len = ias_value->len;
+		memcpy(ias_opt->attribute.irda_attrib_octet_seq.octet_seq,
+		       ias_value->t.oct_seq, ias_value->len);
+	} else if (ias_value->type == IAS_STRING) {
+		/* String : length and charset, then the characters */
+		ias_opt->attribute.irda_attrib_string.len = ias_value->len;
+		ias_opt->attribute.irda_attrib_string.charset = ias_value->charset;
+		memcpy(ias_opt->attribute.irda_attrib_string.string,
+		       ias_value->t.string, ias_value->len);
+		/* Always hand a NULL terminated string to the user */
+		ias_opt->attribute.irda_attrib_string.string[ias_value->len] = '\0';
+	} else {
+		/* IAS_MISSING, or a type we have never heard of */
+		return -EINVAL;
+	}
+
+	/* Payload is in place, the type can follow */
+	ias_opt->irda_attrib_type = ias_value->type;
+
+	return 0;
+}
+
+/*
+ * Function irda_getsockopt (sock, level, optname, optval, optlen)
+ *
+ *    Read some options of the socket / query IrLMP and IAS
+ *
+ * Only level SOL_IRLMP is accepted (-ENOPROTOOPT otherwise). Supported
+ * options :
+ *   IRLMP_ENUMDEVICES  - copy the current discovery log to the user
+ *   IRLMP_MAX_SDU_SIZE - return max_data_size of the socket
+ *   IRLMP_IAS_GET      - look up an attribute in our local IAS database
+ *   IRLMP_IAS_QUERY    - query an attribute in a remote IAS database
+ *                        (sleeps until the answer comes back)
+ *   IRLMP_WAITDEVICE   - sleep until a device is discovered, or until
+ *                        the timeout given by the user (in ms) expires
+ *
+ * The user supplied length (*optlen) is read up front and must not be
+ * negative. The socket is then locked for the whole operation, so every
+ * exit path taken after lock_sock() must go through the "out" label.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int irda_getsockopt(struct socket *sock, int level, int optname,
+			   char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct irda_sock *self = irda_sk(sk);
+	struct irda_device_list list;
+	struct irda_device_info *discoveries;
+	struct irda_ias_set *	ias_opt;	/* IAS get/query params */
+	struct ias_object *	ias_obj;	/* Object in IAS */
+	struct ias_attrib *	ias_attr;	/* Attribute in IAS object */
+	int daddr = DEV_ADDR_ANY;	/* Dest address for IAS queries */
+	int val = 0;
+	int len = 0;
+	int err = 0;
+	int offset, total;
+
+	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
+
+	if (level != SOL_IRLMP)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if(len < 0)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case IRLMP_ENUMDEVICES:
+
+		/* Offset to first device entry */
+		offset = sizeof(struct irda_device_list) -
+			sizeof(struct irda_device_info);
+
+		if (len < offset) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Ask lmp for the current discovery log */
+		discoveries = irlmp_get_discoveries(&list.len, self->mask.word,
+						    self->nslots);
+		/* Check if the we got some results */
+		if (discoveries == NULL) {
+			err = -EAGAIN;
+			goto out;		/* Didn't find any devices */
+		}
+
+		/* Write total list length back to client */
+		if (copy_to_user(optval, &list, offset))
+			err = -EFAULT;
+
+		/* Copy the list itself - watch for overflow */
+		if (list.len > 2048) {
+			err = -EINVAL;
+			goto bed;
+		}
+		total = offset + (list.len * sizeof(struct irda_device_info));
+		if (total > len)
+			total = len;
+		if (copy_to_user(optval+offset, discoveries, total - offset))
+			err = -EFAULT;
+
+		/* Write total number of bytes used back to client */
+		if (put_user(total, optlen))
+			err = -EFAULT;
+bed:
+		/* Free up our buffer */
+		kfree(discoveries);
+		break;
+	case IRLMP_MAX_SDU_SIZE:
+		val = self->max_data_size;
+		len = sizeof(int);
+		if (put_user(len, optlen)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		if (copy_to_user(optval, &val, len)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		break;
+	case IRLMP_IAS_GET:
+		/* The user want an object from our local IAS database.
+		 * We just need to query the IAS and return the value
+		 * that we found */
+
+		/* Check that the user has allocated the right space for us */
+		if (len != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/* Copy query to the driver. */
+		if (copy_from_user(ias_opt, optval, len)) {
+			kfree(ias_opt);
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Find the object we target.
+		 * If the user gives us an empty string, we use the object
+		 * associated with this socket. This will workaround
+		 * duplicated class name - Jean II */
+		if(ias_opt->irda_class_name[0] == '\0')
+			ias_obj = self->ias_obj;
+		else
+			ias_obj = irias_find_object(ias_opt->irda_class_name);
+		if(ias_obj == (struct ias_object *) NULL) {
+			kfree(ias_opt);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Find the attribute (in the object) we target */
+		ias_attr = irias_find_attrib(ias_obj,
+					     ias_opt->irda_attrib_name);
+		if(ias_attr == (struct ias_attrib *) NULL) {
+			kfree(ias_opt);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Translate from internal to user structure */
+		err = irda_extract_ias_value(ias_opt, ias_attr->value);
+		if(err) {
+			kfree(ias_opt);
+			goto out;
+		}
+
+		/* Copy reply to the user */
+		if (copy_to_user(optval, ias_opt,
+				 sizeof(struct irda_ias_set))) {
+			kfree(ias_opt);
+			err = -EFAULT;
+			goto out;
+		}
+		/* Note : don't need to put optlen, we checked it */
+		kfree(ias_opt);
+		break;
+	case IRLMP_IAS_QUERY:
+		/* The user want an object from a remote IAS database.
+		 * We need to use IAP to query the remote database and
+		 * then wait for the answer to come back. */
+
+		/* Check that the user has allocated the right space for us */
+		if (len != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/* Copy query to the driver. */
+		if (copy_from_user(ias_opt, optval, len)) {
+			kfree(ias_opt);
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* At this point, there are two cases...
+		 * 1) the socket is connected - that's the easy case, we
+		 *	just query the device we are connected to...
+		 * 2) the socket is not connected - the user doesn't want
+		 *	to connect and/or may not have a valid service name
+		 *	(so can't create a fake connection). In this case,
+		 *	we assume that the user pass us a valid destination
+		 *	address in the requesting structure...
+		 */
+		if(self->daddr != DEV_ADDR_ANY) {
+			/* We are connected - reuse known daddr */
+			daddr = self->daddr;
+		} else {
+			/* We are not connected, we must specify a valid
+			 * destination address */
+			daddr = ias_opt->daddr;
+			if((!daddr) || (daddr == DEV_ADDR_ANY)) {
+				kfree(ias_opt);
+				err = -EINVAL;
+				goto out;
+			}
+		}
+
+		/* Check that we can proceed with IAP */
+		if (self->iriap) {
+			IRDA_WARNING("%s: busy with a previous query\n",
+				     __func__);
+			kfree(ias_opt);
+			err = -EBUSY;
+			goto out;
+		}
+
+		self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+					 irda_getvalue_confirm);
+
+		if (self->iriap == NULL) {
+			kfree(ias_opt);
+			err = -ENOMEM;
+			goto out;
+		}
+
+		/* Treat unexpected wakeup as disconnect */
+		self->errno = -EHOSTUNREACH;
+
+		/* Query remote LM-IAS */
+		iriap_getvaluebyclass_request(self->iriap,
+					      self->saddr, daddr,
+					      ias_opt->irda_class_name,
+					      ias_opt->irda_attrib_name);
+
+		/* Wait for answer, if not yet finished (or failed) */
+		if (wait_event_interruptible(self->query_wait,
+					     (self->iriap == NULL))) {
+			/* pending request uses copy of ias_opt-content
+			 * we can free it regardless! */
+			kfree(ias_opt);
+			/* Treat signals as disconnect */
+			err = -EHOSTUNREACH;
+			goto out;
+		}
+
+		/* Check what happened */
+		if (self->errno)
+		{
+			kfree(ias_opt);
+			/* Requested object/attribute doesn't exist */
+			if((self->errno == IAS_CLASS_UNKNOWN) ||
+			   (self->errno == IAS_ATTRIB_UNKNOWN))
+				err = -EADDRNOTAVAIL;
+			else
+				err = -EHOSTUNREACH;
+
+			goto out;
+		}
+
+		/* Translate from internal to user structure */
+		err = irda_extract_ias_value(ias_opt, self->ias_result);
+		if (self->ias_result)
+			irias_delete_value(self->ias_result);
+		if (err) {
+			kfree(ias_opt);
+			goto out;
+		}
+
+		/* Copy reply to the user */
+		if (copy_to_user(optval, ias_opt,
+				 sizeof(struct irda_ias_set))) {
+			kfree(ias_opt);
+			err = -EFAULT;
+			goto out;
+		}
+		/* Note : don't need to put optlen, we checked it */
+		kfree(ias_opt);
+		break;
+	case IRLMP_WAITDEVICE:
+		/* This function is just another way of seeing life ;-)
+		 * IRLMP_ENUMDEVICES assumes that you have a static network,
+		 * and that you just want to pick one of the devices present.
+		 * On the other hand, in here we assume that no device is
+		 * present and that at some point in the future a device will
+		 * come into range. When this device arrive, we just wake
+		 * up the caller, so that he has time to connect to it before
+		 * the device goes away...
+		 * Note : once the node has been discovered for more than a
+		 * few second, it won't trigger this function, unless it
+		 * goes away and come back changes its hint bits (so we
+		 * might call it IRLMP_WAITNEWDEVICE).
+		 */
+
+		/* Check that the user is passing us an int */
+		if (len != sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
+		/* Get timeout in ms (max time we block the caller) */
+		if (get_user(val, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Tell IrLMP we want to be notified */
+		irlmp_update_client(self->ckey, self->mask.word,
+				    irda_selective_discovery_indication,
+				    NULL, (void *) self);
+
+		/* Do some discovery (and also return cached results) */
+		irlmp_discovery_request(self->nslots);
+
+		/* Wait until a node is discovered */
+		if (!self->cachedaddr) {
+			IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__);
+
+			/* Set watchdog timer to expire in <val> ms. */
+			self->errno = 0;
+			setup_timer(&self->watchdog, irda_discovery_timeout,
+					(unsigned long)self);
+			self->watchdog.expires = jiffies + (val * HZ/1000);
+			add_timer(&(self->watchdog));
+
+			/* Wait for IR-LMP to call us back */
+			__wait_event_interruptible(self->query_wait,
+			      (self->cachedaddr != 0 || self->errno == -ETIME),
+						   err);
+
+			/* If watchdog is still activated, kill it! */
+			if(timer_pending(&(self->watchdog)))
+				del_timer(&(self->watchdog));
+
+			IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__);
+
+			if (err != 0)
+				goto out;
+		}
+		else
+			IRDA_DEBUG(1, "%s(), found immediately !\n",
+				   __func__);
+
+		/* Tell IrLMP that we have been notified */
+		irlmp_update_client(self->ckey, self->mask.word,
+				    NULL, NULL, NULL);
+
+		/* Check if the we got some results */
+		if (!self->cachedaddr) {
+			/* Didn't find any devices. The socket is still
+			 * locked here, so leave through "out" instead of
+			 * returning directly. */
+			err = -EAGAIN;
+			goto out;
+		}
+		daddr = self->cachedaddr;
+		/* Cleanup */
+		self->cachedaddr = 0;
+
+		/* We return the daddr of the device that trigger the
+		 * wakeup. As irlmp pass us only the new devices, we
+		 * are sure that it's not an old device.
+		 * If the user want more details, he should query
+		 * the whole discovery log and pick one device...
+		 */
+		if (put_user(daddr, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+
+out:
+
+	release_sock(sk);
+
+	return err;
+}
+
+/* PF_IRDA family descriptor, registered by irsock_init() */
+static const struct net_proto_family irda_family_ops = {
+	.family = PF_IRDA,
+	.create = irda_create,
+	.owner  = THIS_MODULE,
+};
+
+/*
+ * Socket operations with irda_poll() as poll handler and
+ * irda_recvmsg_stream() as receive handler. Going by its name this is the
+ * table for stream sockets - TODO confirm where it gets installed.
+ */
+static const struct proto_ops irda_stream_ops = {
+	.family       = PF_IRDA,
+	.owner        = THIS_MODULE,
+	.release      = irda_release,
+	.bind         = irda_bind,
+	.connect      = irda_connect,
+	.socketpair   = sock_no_socketpair,
+	.accept       = irda_accept,
+	.getname      = irda_getname,
+	.poll         = irda_poll,
+	.ioctl        = irda_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = irda_compat_ioctl,
+#endif
+	.listen       = irda_listen,
+	.shutdown     = irda_shutdown,
+	.setsockopt   = irda_setsockopt,
+	.getsockopt   = irda_getsockopt,
+	.sendmsg      = irda_sendmsg,
+	.recvmsg      = irda_recvmsg_stream,
+	.mmap         = sock_no_mmap,
+	.sendpage     = sock_no_sendpage,
+};
+
+/*
+ * Socket operations with datagram_poll() as poll handler, irda_sendmsg()
+ * as send handler and irda_recvmsg_dgram() as receive handler. Going by
+ * its name this is the table for seqpacket sockets - TODO confirm where
+ * it gets installed.
+ */
+static const struct proto_ops irda_seqpacket_ops = {
+	.family       = PF_IRDA,
+	.owner        = THIS_MODULE,
+	.release      = irda_release,
+	.bind         = irda_bind,
+	.connect      = irda_connect,
+	.socketpair   = sock_no_socketpair,
+	.accept       = irda_accept,
+	.getname      = irda_getname,
+	.poll         = datagram_poll,
+	.ioctl        = irda_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = irda_compat_ioctl,
+#endif
+	.listen       = irda_listen,
+	.shutdown     = irda_shutdown,
+	.setsockopt   = irda_setsockopt,
+	.getsockopt   = irda_getsockopt,
+	.sendmsg      = irda_sendmsg,
+	.recvmsg      = irda_recvmsg_dgram,
+	.mmap         = sock_no_mmap,
+	.sendpage     = sock_no_sendpage,
+};
+
+/*
+ * Socket operations with datagram_poll() as poll handler,
+ * irda_sendmsg_dgram() as send handler and irda_recvmsg_dgram() as
+ * receive handler. Going by its name this is the table for datagram
+ * sockets - TODO confirm where it gets installed.
+ */
+static const struct proto_ops irda_dgram_ops = {
+	.family       = PF_IRDA,
+	.owner        = THIS_MODULE,
+	.release      = irda_release,
+	.bind         = irda_bind,
+	.connect      = irda_connect,
+	.socketpair   = sock_no_socketpair,
+	.accept       = irda_accept,
+	.getname      = irda_getname,
+	.poll         = datagram_poll,
+	.ioctl        = irda_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = irda_compat_ioctl,
+#endif
+	.listen       = irda_listen,
+	.shutdown     = irda_shutdown,
+	.setsockopt   = irda_setsockopt,
+	.getsockopt   = irda_getsockopt,
+	.sendmsg      = irda_sendmsg_dgram,
+	.recvmsg      = irda_recvmsg_dgram,
+	.mmap         = sock_no_mmap,
+	.sendpage     = sock_no_sendpage,
+};
+
+#ifdef CONFIG_IRDA_ULTRA
+/*
+ * Socket operations only built with CONFIG_IRDA_ULTRA. Unlike the other
+ * tables, connect, accept and listen are wired to the generic sock_no_*()
+ * handlers, and sending goes through irda_sendmsg_ultra().
+ */
+static const struct proto_ops irda_ultra_ops = {
+	.family       = PF_IRDA,
+	.owner        = THIS_MODULE,
+	.release      = irda_release,
+	.bind         = irda_bind,
+	.connect      = sock_no_connect,
+	.socketpair   = sock_no_socketpair,
+	.accept       = sock_no_accept,
+	.getname      = irda_getname,
+	.poll         = datagram_poll,
+	.ioctl        = irda_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = irda_compat_ioctl,
+#endif
+	.listen       = sock_no_listen,
+	.shutdown     = irda_shutdown,
+	.setsockopt   = irda_setsockopt,
+	.getsockopt   = irda_getsockopt,
+	.sendmsg      = irda_sendmsg_ultra,
+	.recvmsg      = irda_recvmsg_dgram,
+	.mmap         = sock_no_mmap,
+	.sendpage     = sock_no_sendpage,
+};
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irsock_init (void)
+ *
+ *    Initialize IrDA protocol
+ *
+ * Registers the irda_proto protocol, then the PF_IRDA socket family. If
+ * the family can't be registered, the protocol registration is rolled
+ * back, so that a failed init leaves nothing registered behind.
+ *
+ * Returns 0 on success, or the error given by proto_register() or
+ * sock_register().
+ */
+int __init irsock_init(void)
+{
+	int rc;
+
+	rc = proto_register(&irda_proto, 0);
+	if (rc != 0)
+		return rc;
+
+	rc = sock_register(&irda_family_ops);
+	if (rc != 0)
+		/* Don't leave the protocol registered on failure */
+		proto_unregister(&irda_proto);
+
+	return rc;
+}
+
+/*
+ * Function irsock_cleanup (void)
+ *
+ *    Remove IrDA protocol
+ *
+ * Undoes irsock_init() in reverse order : the PF_IRDA socket family is
+ * unregistered first, then the irda_proto protocol.
+ */
+void irsock_cleanup(void)
+{
+	sock_unregister(PF_IRDA);
+	proto_unregister(&irda_proto);
+}
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
new file mode 100644
index 00000000..36c3f037
--- /dev/null
+++ b/net/irda/discovery.c
@@ -0,0 +1,422 @@
+/*********************************************************************
+ *
+ * Filename:      discovery.c
+ * Version:       0.1
+ * Description:   Routines for handling discoveries at the IrLMP layer
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Tue Apr  6 15:33:50 1999
+ * Modified at:   Sat Oct  9 17:11:31 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Modified at:   Fri May 28  3:11 CST 1999
+ * Modified by:   Horst von Brand <vonbrand@sleipnir.valparaiso.cl>
+ *
+ *     Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+
+#include <net/irda/discovery.h>
+
+#include <asm/unaligned.h>
+
+/*
+ * Function irlmp_add_discovery (cachelog, discovery)
+ *
+ *    Insert a fresh discovery in the cachelog, after dropping the older
+ *    discoveries of the same device
+ *
+ * Note : the time at which the device was *first* discovered is carried
+ * over from the entry we replace (as long as its hint bits are the same),
+ * whereas the time of last discovery is what cleanup relies on. Clients
+ * waiting for discovery events use the former to tell a "new" device from
+ * the same old one. A binary new/old flag wouldn't do, because not all
+ * discovery events are propagated to them, and they might not always
+ * listen, so they would miss some new devices popping up...
+ * Jean II
+ */
+void irlmp_add_discovery(hashbin_t *cachelog, discovery_t *new)
+{
+	discovery_t *entry, *next;
+	unsigned long flags;
+
+	/* Unless an older entry says otherwise, first seen == last seen */
+	new->firststamp = new->timestamp;
+
+	spin_lock_irqsave(&cachelog->hb_spinlock, flags);
+
+	/*
+	 * Get rid of every entry previously discovered on the same link
+	 * with either the same daddr or the same name (info). Matching on
+	 * the name is needed because some devices (mostly PDAs) change
+	 * their device address between every discovery.
+	 */
+	for (entry = (discovery_t *) hashbin_get_first(cachelog);
+	     entry != NULL; entry = next) {
+		/* Grab the successor now, entry may be freed below */
+		next = (discovery_t *) hashbin_get_next(cachelog);
+
+		/* Not on the same link : leave it alone */
+		if (entry->data.saddr != new->data.saddr)
+			continue;
+
+		/* Same link, but neither the address nor the name match */
+		if ((entry->data.daddr != new->data.daddr) &&
+		    (strcmp(entry->data.info, new->data.info) != 0))
+			continue;
+
+		/* Older discovery of the same device : unlink it */
+		hashbin_remove_this(cachelog, (irda_queue_t *) entry);
+
+		/* Same hint bits : inherit the time of first discovery */
+		if (get_unaligned((__u16 *)entry->data.hints) ==
+		    get_unaligned((__u16 *)new->data.hints))
+			new->firststamp = entry->firststamp;
+
+		kfree(entry);
+	}
+
+	/* The log is clean, the new version can go in */
+	hashbin_insert(cachelog, (irda_queue_t *) new, new->data.daddr, NULL);
+
+	spin_unlock_irqrestore(&cachelog->hb_spinlock, flags);
+}
+
+/*
+ * Function irlmp_add_discovery_log (cachelog, log)
+ *
+ *    Merge a discovery log into the cachelog.
+ *
+ * Every entry is moved from log to cachelog through
+ * irlmp_add_discovery(), then the emptied log is deleted. A NULL log is
+ * silently ignored.
+ */
+void irlmp_add_discovery_log(hashbin_t *cachelog, hashbin_t *log)
+{
+	discovery_t *entry;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/*
+	 *  No log was once read as "IrLAP was unable to perform the
+	 *  discovery", and discovery was to be restarted with half the
+	 *  normal timeout (irlmp_start_discovery_timer(irlmp, 150)).
+	 *  Well... It means that there was nobody out there - Jean II
+	 */
+	if (log == NULL)
+		return;
+
+	/*
+	 * Locking : we are the only owner of this discovery log, so
+	 * no need to lock it.
+	 * The global log gets locked in irlmp_add_discovery().
+	 */
+	while ((entry = (discovery_t *) hashbin_remove_first(log)) != NULL)
+		irlmp_add_discovery(cachelog, entry);
+
+	/* Nothing left in there, get rid of the log itself */
+	hashbin_delete(log, (FREE_FUNC) kfree);
+}
+
+/*
+ * Function irlmp_expire_discoveries (log, saddr, force)
+ *
+ *    Walk the log and expire the discoveries made on link saddr that
+ *    have stayed too long (or all of them, if force is set)
+ *
+ * The expired entries are removed from the log and freed; a copy of
+ * their data is then passed to irlmp_discovery_expiry(), once the log
+ * spinlock has been dropped.
+ *
+ * Note : this assume that IrLAP won't change its saddr, which
+ * currently is a valid assumption...
+ */
+void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force)
+{
+	discovery_t *		entry;
+	discovery_t *		next;
+	unsigned long		flags;
+	discinfo_t *		expired = NULL;	/* Copies of what we expire */
+	int			log_size;	/* Size of the full log */
+	int			count = 0;	/* How many we expired */
+
+	IRDA_ASSERT(log != NULL, return;);
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	spin_lock_irqsave(&log->hb_spinlock, flags);
+
+	for (entry = (discovery_t *) hashbin_get_first(log);
+	     entry != NULL; entry = next) {
+		/* Grab the successor now, entry may be freed below */
+		next = (discovery_t *) hashbin_get_next(log);
+
+		/* Discovered on another link : not for us */
+		if (entry->data.saddr != saddr)
+			continue;
+
+		/* Still fresh enough, and nobody is forcing us */
+		if (!force &&
+		    !((jiffies - entry->timestamp) > DISCOVERY_EXPIRE_TIMEOUT))
+			continue;
+
+		/* Allocate the buffer lazily.
+		 * We are called a lot and most of the time there is
+		 * nothing to expire, so skipping kmalloc in the common
+		 * case saves a lot of overhead. Jean II */
+		if (expired == NULL) {
+			/* Room for the whole log, we can't need more */
+			log_size = HASHBIN_GET_SIZE(log);
+			expired = kmalloc(log_size * sizeof(struct irda_device_info), GFP_ATOMIC);
+			if (expired == NULL) {
+				spin_unlock_irqrestore(&log->hb_spinlock, flags);
+				return;
+			}
+		}
+
+		/* Keep a copy of the discovery information */
+		memcpy(&(expired[count]), &(entry->data),
+		       sizeof(discinfo_t));
+		count++;
+
+		/* Unlink the entry from the log and free it */
+		entry = hashbin_remove_this(log, (irda_queue_t *) entry);
+		kfree(entry);
+	}
+
+	/* The higher layers may call us back, so the spinlock must be
+	 * released before we notify them, or we could deadlock. From
+	 * here on we only touch our private copy, so being interrupted
+	 * is not an issue. - Jean II */
+	spin_unlock_irqrestore(&log->hb_spinlock, flags);
+
+	/* Nothing expired, nothing to report */
+	if (expired == NULL)
+		return;
+
+	/* Tell IrLMP and registered clients about it */
+	irlmp_discovery_expiry(expired, count);
+
+	/* Our private copy is no longer needed */
+	kfree(expired);
+}
+
+#if 0
+/*
+ * Function irlmp_dump_discoveries (log)
+ *
+ *    Debug helper : print daddr, saddr and nickname of every discovery
+ *    found in log (currently compiled out)
+ *
+ */
+void irlmp_dump_discoveries(hashbin_t *log)
+{
+	discovery_t *entry;
+
+	IRDA_ASSERT(log != NULL, return;);
+
+	for (entry = (discovery_t *) hashbin_get_first(log);
+	     entry != NULL;
+	     entry = (discovery_t *) hashbin_get_next(log)) {
+		IRDA_DEBUG(0, "Discovery:\n");
+		IRDA_DEBUG(0, "  daddr=%08x\n", entry->data.daddr);
+		IRDA_DEBUG(0, "  saddr=%08x\n", entry->data.saddr);
+		IRDA_DEBUG(0, "  nickname=%s\n", entry->data.info);
+	}
+}
+#endif
+
+/*
+ * Function irlmp_copy_discoveries (log, pn, mask, old_entries)
+ *
+ *    Copy the discoveries matching mask into a freshly allocated buffer
+ *
+ * This is the safe way for lmp clients to look at the discovery log.
+ * The log must not change (add/remove) while a client reads it, and a
+ * client poking directly into the hashbin is sure to get into troubles.
+ * So, with the spinlock grabbed, we copy the entries the client is
+ * interested in to a buffer of its own, and hand that copy over.
+ * Note : clients are not given the spinlock, as we would have no
+ * control on how long they hold it...
+ * Note : the copy is made of "struct irda_device_info" to save space...
+ * Note : the client must kfree() the returned buffer himself...
+ * Jean II
+ *
+ * An entry is picked when its hint bits match mask and, unless
+ * old_entries is set, when it was first discovered less than
+ * sysctl_discovery_timeout seconds ago.
+ *
+ * Returns the buffer, with the number of entries stored in *pn. If no
+ * entry matches, NULL is returned and *pn is set to 0. If the buffer
+ * can't be allocated, NULL is returned and *pn is left untouched.
+ */
+struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn,
+						__u16 mask, int old_entries)
+{
+	discovery_t *		entry;
+	unsigned long		flags;
+	discinfo_t *		copies = NULL;
+	int			max_age = (sysctl_discovery_timeout * HZ);
+	int			log_size;	/* Size of the full log */
+	int			count = 0;	/* How many we picked */
+
+	IRDA_ASSERT(pn != NULL, return NULL;);
+	IRDA_ASSERT(log != NULL, return NULL;);
+
+	/* Nobody touches the log while we copy it */
+	spin_lock_irqsave(&log->hb_spinlock, flags);
+
+	for (entry = (discovery_t *) hashbin_get_first(log);
+	     entry != NULL;
+	     entry = (discovery_t *) hashbin_get_next(log)) {
+		/* Skip the entries that don't match the discovery mask */
+		if (!(get_unaligned((__u16 *)entry->data.hints) & mask))
+			continue;
+
+		/* Skip the old ones too, unless we are asked for them */
+		if (!old_entries &&
+		    !((jiffies - entry->firststamp) < max_age))
+			continue;
+
+		/* Allocate the buffer lazily.
+		 * We are called a lot and most of the time there is
+		 * nothing to put in the buffer (we are quite picky), so
+		 * skipping kmalloc in the common case saves a lot of
+		 * overhead. Jean II */
+		if (copies == NULL) {
+			/* Room for the whole log, we can't need more */
+			log_size = HASHBIN_GET_SIZE(log);
+			copies = kmalloc(log_size * sizeof(struct irda_device_info), GFP_ATOMIC);
+			if (copies == NULL) {
+				spin_unlock_irqrestore(&log->hb_spinlock, flags);
+				return NULL;
+			}
+		}
+
+		/* Copy discovery information */
+		memcpy(&(copies[count]), &(entry->data),
+		       sizeof(discinfo_t));
+		count++;
+	}
+
+	spin_unlock_irqrestore(&log->hb_spinlock, flags);
+
+	/* Report how many devices made it into the buffer */
+	*pn = count;
+	return copies;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Return the entry found at position pos (0 based) when walking
+ * irlmp->cachelog from its first entry, or NULL if the log holds fewer
+ * entries than that.
+ */
+static inline discovery_t *discovery_seq_idx(loff_t pos)
+{
+	discovery_t *entry;
+
+	entry = (discovery_t *) hashbin_get_first(irlmp->cachelog);
+	while (entry != NULL) {
+		/* Count down, we are there when we reach zero */
+		if (pos-- == 0)
+			break;
+		entry = (discovery_t *) hashbin_get_next(irlmp->cachelog);
+	}
+
+	return entry;
+}
+
+/*
+ * seq_file start handler : grab the cachelog spinlock (given back by
+ * discovery_seq_stop()) and return the item at *pos. Position 0 is the
+ * header token, position N the (N-1)th entry of the log.
+ */
+static void *discovery_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	spin_lock_irq(&irlmp->cachelog->hb_spinlock);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return discovery_seq_idx(*pos - 1);
+}
+
+/*
+ * seq_file next handler : bump *pos and return the item following v,
+ * i.e. the first entry of the cachelog after the header token, and the
+ * next entry of the cachelog otherwise.
+ */
+static void *discovery_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return (void *) hashbin_get_first(irlmp->cachelog);
+
+	return (void *) hashbin_get_next(irlmp->cachelog);
+}
+
+/*
+ * seq_file stop handler : give back the cachelog spinlock taken in
+ * discovery_seq_start().
+ */
+static void discovery_seq_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_irq(&irlmp->cachelog->hb_spinlock);
+}
+
+/*
+ * seq_file show handler : print the log header for the start token, or
+ * the nickname, hint bytes, saddr and daddr of the discovery v points to.
+ * Always returns 0.
+ */
+static int discovery_seq_show(struct seq_file *seq, void *v)
+{
+	const discovery_t *discovery;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "IrLMP: Discovery log:\n\n");
+		return 0;
+	}
+
+	discovery = v;
+
+	seq_printf(seq, "nickname: %s, hint: 0x%02x%02x",
+		   discovery->data.info,
+		   discovery->data.hints[0],
+		   discovery->data.hints[1]);
+#if 0
+	if ( discovery->data.hints[0] & HINT_PNP)
+		seq_puts(seq, "PnP Compatible ");
+	if ( discovery->data.hints[0] & HINT_PDA)
+		seq_puts(seq, "PDA/Palmtop ");
+	if ( discovery->data.hints[0] & HINT_COMPUTER)
+		seq_puts(seq, "Computer ");
+	if ( discovery->data.hints[0] & HINT_PRINTER)
+		seq_puts(seq, "Printer ");
+	if ( discovery->data.hints[0] & HINT_MODEM)
+		seq_puts(seq, "Modem ");
+	if ( discovery->data.hints[0] & HINT_FAX)
+		seq_puts(seq, "Fax ");
+	if ( discovery->data.hints[0] & HINT_LAN)
+		seq_puts(seq, "LAN Access ");
+
+	if ( discovery->data.hints[1] & HINT_TELEPHONY)
+		seq_puts(seq, "Telephony ");
+	if ( discovery->data.hints[1] & HINT_FILE_SERVER)
+		seq_puts(seq, "File Server ");
+	if ( discovery->data.hints[1] & HINT_COMM)
+		seq_puts(seq, "IrCOMM ");
+	if ( discovery->data.hints[1] & HINT_OBEX)
+		seq_puts(seq, "IrOBEX ");
+#endif
+	seq_printf(seq,", saddr: 0x%08x, daddr: 0x%08x\n\n",
+		   discovery->data.saddr,
+		   discovery->data.daddr);
+
+	seq_putc(seq, '\n');
+
+	return 0;
+}
+
+/* Iterator handed to seq_open() by discovery_seq_open() */
+static const struct seq_operations discovery_seq_ops = {
+	.start = discovery_seq_start,
+	.next = discovery_seq_next,
+	.stop = discovery_seq_stop,
+	.show = discovery_seq_show,
+};
+
+/*
+ * Open handler of the discovery log file : attach the discovery_seq_ops
+ * iterator to file and return the result of seq_open().
+ */
+static int discovery_seq_open(struct inode *inode, struct file *file)
+{
+	/* The iterator walks irlmp->cachelog, so irlmp must be there */
+	IRDA_ASSERT(irlmp != NULL, return -EINVAL;);
+
+	return seq_open(file, &discovery_seq_ops);
+}
+
+/*
+ * File operations of the discovery log file : our own open handler, the
+ * generic seq_file helpers for everything else.
+ */
+const struct file_operations discovery_seq_fops = {
+	.owner = THIS_MODULE,
+	.open = discovery_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+#endif
diff --git a/net/irda/ircomm/Kconfig b/net/irda/ircomm/Kconfig
new file mode 100644
index 00000000..2d4c6b4a
--- /dev/null
+++ b/net/irda/ircomm/Kconfig
@@ -0,0 +1,12 @@
+config IRCOMM
+	tristate "IrCOMM protocol"
+	depends on IRDA
+	help
+	  Say Y here if you want to build support for the IrCOMM protocol.
+	  To compile it as modules, choose M here: the modules will be
+	  called ircomm and ircomm_tty.
+	  IrCOMM implements serial port emulation, and makes it possible to
+	  use all existing applications that understand TTYs over an
+	  infrared link.  Thus you should be able to use applications like PPP,
+	  minicom and others.
+
diff --git a/net/irda/ircomm/Makefile b/net/irda/ircomm/Makefile
new file mode 100644
index 00000000..ab23b5ba
--- /dev/null
+++ b/net/irda/ircomm/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux IrDA IrCOMM protocol layer.
+#
+
+# CONFIG_IRCOMM selects two composite objects: ircomm and ircomm-tty.
+obj-$(CONFIG_IRCOMM) += ircomm.o ircomm-tty.o
+
+# Objects that make up each of the two composites.
+ircomm-y := ircomm_core.o ircomm_event.o ircomm_lmp.o ircomm_ttp.o
+ircomm-tty-y := ircomm_tty.o ircomm_tty_attach.o ircomm_tty_ioctl.o ircomm_param.o
diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c
new file mode 100644
index 00000000..52079f19
--- /dev/null
+++ b/net/irda/ircomm/ircomm_core.c
@@ -0,0 +1,592 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_core.c
+ * Version:       1.0
+ * Description:   IrCOMM service interface
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Jun  6 20:37:34 1999
+ * Modified at:   Tue Dec 21 13:26:41 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irias_object.h>
+
+#include <net/irda/ircomm_event.h>
+#include <net/irda/ircomm_lmp.h>
+#include <net/irda/ircomm_ttp.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_core.h>
+
+/* Forward declarations of static helpers defined further down in this file */
+static int __ircomm_close(struct ircomm_cb *self);
+static void ircomm_control_indication(struct ircomm_cb *self,
+				      struct sk_buff *skb, int clen);
+
+#ifdef CONFIG_PROC_FS
+/* Parent proc directory for the "ircomm" entry; defined outside this file */
+extern struct proc_dir_entry *proc_irda;
+static int ircomm_seq_open(struct inode *, struct file *);
+
+/*
+ * File operations for the "ircomm" proc entry registered by ircomm_init().
+ * Only open is specific to this file; the rest are generic seq_file helpers.
+ */
+static const struct file_operations ircomm_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open           = ircomm_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Global table of open IrCOMM instances. Allocated in ircomm_init();
+ * ircomm_open() inserts instances using their line number as the lookup
+ * value and ircomm_close() removes them the same way.
+ */
+hashbin_t *ircomm = NULL;
+
+/*
+ * Function ircomm_init (void)
+ *
+ *    Module entry point. Allocates the global instance hashbin and, when
+ *    CONFIG_PROC_FS is set, registers the "ircomm" proc entry under
+ *    proc_irda.
+ *
+ *    Returns 0 on success, -ENOMEM if the hashbin cannot be allocated and
+ *    -ENODEV if the proc entry cannot be created. On the latter failure
+ *    the hashbin is released again so nothing is leaked.
+ */
+static int __init ircomm_init(void)
+{
+	ircomm = hashbin_new(HB_LOCK);
+	if (ircomm == NULL) {
+		IRDA_ERROR("%s(), can't allocate hashbin!\n", __func__);
+		return -ENOMEM;
+	}
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("ircomm", 0, proc_irda, &ircomm_proc_fops)) {
+		printk(KERN_ERR "ircomm_init: can't create /proc entry!\n");
+		/* Nothing was inserted yet, so no per-entry destructor is needed */
+		hashbin_delete(ircomm, NULL);
+		ircomm = NULL;
+		return -ENODEV;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	IRDA_MESSAGE("IrCOMM protocol (Dag Brattli)\n");
+
+	return 0;
+}
+
+/*
+ * Function ircomm_cleanup (void)
+ *
+ *    Module exit point. Removes the "ircomm" proc entry and then destroys
+ *    the global hashbin, closing every instance still stored in it through
+ *    __ircomm_close().
+ */
+static void __exit ircomm_cleanup(void)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+#ifdef CONFIG_PROC_FS
+	/*
+	 * Unregister the proc file before freeing the hashbin: its seq_file
+	 * handlers take ircomm->hb_spinlock and walk the table.
+	 */
+	remove_proc_entry("ircomm", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+	hashbin_delete(ircomm, (FREE_FUNC) __ircomm_close);
+}
+
+/*
+ * Function ircomm_open (notify, service_type, line)
+ *
+ *    Create a new IrCOMM instance for the given line. A copy of the
+ *    caller's notify callbacks is stored in the instance, an LSAP (for
+ *    3-wire raw) or a TSAP (for everything else) is opened, and the
+ *    instance is inserted into the global ircomm hashbin in state
+ *    IRCOMM_IDLE.
+ *
+ *    Returns the new instance, or NULL if the allocation or the SAP
+ *    setup fails.
+ */
+struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line)
+{
+	struct ircomm_cb *cb;
+	int err;
+
+	IRDA_DEBUG(2, "%s(), service_type=0x%02x\n", __func__ ,
+		   service_type);
+
+	IRDA_ASSERT(ircomm != NULL, return NULL;);
+
+	cb = kzalloc(sizeof(struct ircomm_cb), GFP_ATOMIC);
+	if (!cb)
+		return NULL;
+
+	cb->magic = IRCOMM_MAGIC;
+	cb->notify = *notify;
+
+	/* 3-wire raw sits directly on IrLMP, all other services on IrTTP */
+	if (!(service_type & IRCOMM_3_WIRE_RAW)) {
+		err = ircomm_open_tsap(cb);
+	} else {
+		cb->flow_status = FLOW_START;
+		err = ircomm_open_lsap(cb);
+	}
+
+	if (err < 0) {
+		kfree(cb);
+		return NULL;
+	}
+
+	cb->line = line;
+	cb->service_type = service_type;
+
+	hashbin_insert(ircomm, (irda_queue_t *) cb, line, NULL);
+
+	ircomm_next_state(cb, IRCOMM_IDLE);
+
+	return cb;
+}
+
+EXPORT_SYMBOL(ircomm_open);
+
+/*
+ * Function __ircomm_close (self)
+ *
+ *    Destroy an IrCOMM instance: issue a disconnect request through the
+ *    state machine, close whichever of the TSAP/LSAP is open and free the
+ *    instance. Does not touch the ircomm hashbin; callers take care of
+ *    that. Always returns 0.
+ */
+static int __ircomm_close(struct ircomm_cb *self)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Let the state machine drop the link, if there is one */
+	ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, NULL, NULL);
+
+	/* Release the IrTTP endpoint */
+	if (self->tsap != NULL) {
+		irttp_close_tsap(self->tsap);
+		self->tsap = NULL;
+	}
+
+	/* Release the IrLMP endpoint */
+	if (self->lsap != NULL) {
+		irlmp_close_lsap(self->lsap);
+		self->lsap = NULL;
+	}
+
+	/* Clear the magic checked by the IRCOMM_MAGIC asserts before freeing */
+	self->magic = 0;
+	kfree(self);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_close (self)
+ *
+ *    Closes and removes the specified IrCOMM instance: the entry stored
+ *    for self->line is taken out of the global ircomm hashbin and the
+ *    instance is then destroyed by __ircomm_close(), whose result is
+ *    returned.
+ *
+ *    The IRDA_ASSERT failure actions return -EIO for a NULL or wrong-magic
+ *    instance and -1 if the entry removed for this line is not self.
+ */
+int ircomm_close(struct ircomm_cb *self)
+{
+	struct ircomm_cb *entry;
+
+	IRDA_ASSERT(self != NULL, return -EIO;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EIO;);
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	entry = hashbin_remove(ircomm, self->line, NULL);
+
+	IRDA_ASSERT(entry == self, return -1;);
+
+	return __ircomm_close(self);
+}
+
+EXPORT_SYMBOL(ircomm_close);
+
+/*
+ * Function ircomm_connect_request (self, dlsap_sel, saddr, daddr, skb,
+ *                                  service_type)
+ *
+ *    Record the requested service type on the instance and feed an
+ *    IRCOMM_CONNECT_REQUEST event, carrying the peer selector and the
+ *    source/destination addresses, into the state machine.
+ *
+ *    Returns the state machine's result.
+ */
+int ircomm_connect_request(struct ircomm_cb *self, __u8 dlsap_sel,
+			   __u32 saddr, __u32 daddr, struct sk_buff *skb,
+			   __u8 service_type)
+{
+	struct ircomm_info info;
+
+	IRDA_DEBUG(2 , "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+
+	self->service_type = service_type;
+
+	/* Only the addressing fields are filled in for a connect request */
+	info.saddr = saddr;
+	info.daddr = daddr;
+	info.dlsap_sel = dlsap_sel;
+
+	return ircomm_do_event(self, IRCOMM_CONNECT_REQUEST, skb, &info);
+}
+
+EXPORT_SYMBOL(ircomm_connect_request);
+
+/*
+ * Function ircomm_connect_indication (self, skb, info)
+ *
+ *    Notify the user layer about an incoming connection by calling its
+ *    connect_indication handler with the QoS and size limits from info.
+ *    Without a registered handler the event is only logged.
+ */
+void ircomm_connect_indication(struct ircomm_cb *self, struct sk_buff *skb,
+			       struct ircomm_info *info)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (!self->notify.connect_indication) {
+		IRDA_DEBUG(0, "%s(), missing handler\n", __func__ );
+		return;
+	}
+
+	self->notify.connect_indication(self->notify.instance, self,
+					info->qos, info->max_data_size,
+					info->max_header_size, skb);
+}
+
+/*
+ * Function ircomm_connect_response (self, userdata)
+ *
+ *    The user accepts the connection: feed an IRCOMM_CONNECT_RESPONSE
+ *    event into the state machine and return its result.
+ */
+int ircomm_connect_response(struct ircomm_cb *self, struct sk_buff *userdata)
+{
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	return ircomm_do_event(self, IRCOMM_CONNECT_RESPONSE, userdata, NULL);
+}
+
+EXPORT_SYMBOL(ircomm_connect_response);
+
+/*
+ * Function ircomm_connect_confirm (self, skb, info)
+ *
+ *    Notify the user layer that the link is now connected by calling its
+ *    connect_confirm handler with the QoS and size limits from info.
+ *    Without a registered handler the event is only logged.
+ */
+void ircomm_connect_confirm(struct ircomm_cb *self, struct sk_buff *skb,
+			    struct ircomm_info *info)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	if (!self->notify.connect_confirm) {
+		IRDA_DEBUG(0, "%s(), missing handler\n", __func__ );
+		return;
+	}
+
+	self->notify.connect_confirm(self->notify.instance, self,
+				     info->qos, info->max_data_size,
+				     info->max_header_size, skb);
+}
+
+/*
+ * Function ircomm_data_request (self, skb)
+ *
+ *    Send IrCOMM data to the peer device by feeding an
+ *    IRCOMM_DATA_REQUEST event into the state machine. Returns the state
+ *    machine's result; the IRDA_ASSERT failure actions return -EFAULT.
+ */
+int ircomm_data_request(struct ircomm_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -EFAULT;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;);
+	IRDA_ASSERT(skb != NULL, return -EFAULT;);
+
+	return ircomm_do_event(self, IRCOMM_DATA_REQUEST, skb, NULL);
+}
+
+EXPORT_SYMBOL(ircomm_data_request);
+
+/*
+ * Function ircomm_data_indication (self, skb)
+ *
+ *    Data arrived: pass the frame to the user's data_indication handler.
+ *    Without a registered handler the event is only logged.
+ */
+void ircomm_data_indication(struct ircomm_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(skb->len > 0, return;);
+
+	if (!self->notify.data_indication) {
+		IRDA_DEBUG(0, "%s(), missing handler\n", __func__ );
+		return;
+	}
+
+	self->notify.data_indication(self->notify.instance, self, skb);
+}
+
+/*
+ * Function ircomm_process_data (self, skb)
+ *
+ *    Handle a received frame that may carry control channel data in front
+ *    of the user data. The first byte is taken as the control length
+ *    (clen); the control part, if any, is delivered first, then the
+ *    clen + 1 leading bytes are pulled off and whatever remains is
+ *    delivered as data.
+ */
+void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb)
+{
+	int clen;
+
+	IRDA_ASSERT(skb->len > 0, return;);
+
+	clen = skb->data[0];
+
+	/*
+	 * Input validation check: a stir4200/mcp2150 combination sometimes
+	 * produces frames whose clen exceeds the remaining packet size.
+	 * Those are illegal; dropping just that frame seems to let the
+	 * connection carry on fine.
+	 */
+	if (unlikely(skb->len < (clen + 1))) {
+		IRDA_DEBUG(2, "%s() throwing away illegal frame\n",
+			   __func__ );
+		return;
+	}
+
+	/* Control channel contents go to the user before the data */
+	if (clen > 0)
+		ircomm_control_indication(self, skb, clen);
+
+	/* Strip the length byte and the control channel from the frame */
+	skb_pull(skb, clen+1);
+
+	if (!skb->len) {
+		IRDA_DEBUG(4, "%s(), data was control info only!\n",
+			   __func__ );
+		return;
+	}
+
+	ircomm_data_indication(self, skb);
+}
+
+/*
+ * Function ircomm_control_request (self, skb)
+ *
+ *    Send control data to the peer device by feeding an
+ *    IRCOMM_CONTROL_REQUEST event into the state machine. Returns the
+ *    state machine's result; the IRDA_ASSERT failure actions return
+ *    -EFAULT.
+ */
+int ircomm_control_request(struct ircomm_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -EFAULT;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;);
+	IRDA_ASSERT(skb != NULL, return -EFAULT;);
+
+	return ircomm_do_event(self, IRCOMM_CONTROL_REQUEST, skb, NULL);
+}
+
+EXPORT_SYMBOL(ircomm_control_request);
+
+/*
+ * Function ircomm_control_indication (self, skb, clen)
+ *
+ *    Data has arrived on the control channel. A clone of the frame,
+ *    trimmed to the length byte plus clen control bytes, is handed to the
+ *    user's udata_indication handler and released afterwards. The
+ *    original skb is left untouched. Nothing is delivered if no handler
+ *    is registered or the clone cannot be allocated.
+ */
+static void ircomm_control_indication(struct ircomm_cb *self,
+				      struct sk_buff *skb, int clen)
+{
+	struct sk_buff *ctrl_skb;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* udata is used for delivering data on the control channel */
+	if (!self->notify.udata_indication) {
+		IRDA_DEBUG(0, "%s(), missing handler\n", __func__ );
+		return;
+	}
+
+	/* The skb is not ours, so work on a clone */
+	ctrl_skb = skb_clone(skb, GFP_ATOMIC);
+	if (!ctrl_skb)
+		return;
+
+	/* Cut the data channel off the control channel */
+	skb_trim(ctrl_skb, clen+1);
+
+	self->notify.udata_indication(self->notify.instance, self, ctrl_skb);
+
+	/* Drop reference count - see ircomm_tty_control_indication(). */
+	dev_kfree_skb(ctrl_skb);
+}
+
+/*
+ * Function ircomm_disconnect_request (self, userdata)
+ *
+ *    The user layer wants to disconnect the IrCOMM connection: feed an
+ *    IRCOMM_DISCONNECT_REQUEST event into the state machine and return
+ *    its result. The IRDA_ASSERT failure actions return -1.
+ */
+int ircomm_disconnect_request(struct ircomm_cb *self, struct sk_buff *userdata)
+{
+	/*
+	 * No field is set for this event, so zero the structure rather than
+	 * hand uninitialized stack contents to the state machine.
+	 */
+	struct ircomm_info info = { 0 };
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+
+	return ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, userdata,
+			       &info);
+}
+
+EXPORT_SYMBOL(ircomm_disconnect_request);
+
+/*
+ * Function ircomm_disconnect_indication (self, skb, info)
+ *
+ *    Tell the user that the link has been disconnected by calling its
+ *    disconnect_indication handler with the reason from info. Without a
+ *    registered handler the event is only logged.
+ */
+void ircomm_disconnect_indication(struct ircomm_cb *self, struct sk_buff *skb,
+				  struct ircomm_info *info)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(info != NULL, return;);
+
+	if (!self->notify.disconnect_indication) {
+		IRDA_DEBUG(0, "%s(), missing handler\n", __func__ );
+		return;
+	}
+
+	self->notify.disconnect_indication(self->notify.instance, self,
+					   info->reason, skb);
+}
+
+/*
+ * Function ircomm_flow_request (self, flow)
+ *
+ *    Forward a local flow-control request to IrTTP. Nothing is done when
+ *    the service type is exactly IRCOMM_3_WIRE_RAW.
+ */
+void ircomm_flow_request(struct ircomm_cb *self, LOCAL_FLOW flow)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+	if (self->service_type != IRCOMM_3_WIRE_RAW)
+		irttp_flow_request(self->tsap, flow);
+}
+
+EXPORT_SYMBOL(ircomm_flow_request);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file start callback: take the hashbin spinlock (released again in
+ * ircomm_seq_stop()) and walk to the instance at position *pos. Returns
+ * that instance, or NULL if the table holds fewer entries.
+ */
+static void *ircomm_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ircomm_cb *self;
+	loff_t skipped = 0;
+
+	spin_lock_irq(&ircomm->hb_spinlock);
+
+	self = (struct ircomm_cb *) hashbin_get_first(ircomm);
+	while (self != NULL && skipped != *pos) {
+		skipped++;
+		self = (struct ircomm_cb *) hashbin_get_next(ircomm);
+	}
+
+	return self;
+}
+
+/*
+ * seq_file next callback: bump the position and return the following
+ * entry of the ircomm hashbin.
+ */
+static void *ircomm_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return hashbin_get_next(ircomm);
+}
+
+/* seq_file stop callback: drop the hashbin lock taken in ircomm_seq_start() */
+static void ircomm_seq_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_irq(&ircomm->hb_spinlock);
+}
+
+/*
+ * seq_file show callback: print one line describing an IrCOMM instance,
+ * giving its device name, state, local and remote LSAP selectors and the
+ * service types set in service_type. Returns 0; the IRDA_ASSERT failure
+ * action returns -EINVAL.
+ */
+static int ircomm_seq_show(struct seq_file *seq, void *v)
+{
+	const struct ircomm_cb *self = v;
+
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EINVAL; );
+
+	/* Lines from 0x10 upwards are printed as irlpt devices */
+	if (self->line >= 0x10)
+		seq_printf(seq, "irlpt%d", self->line - 0x10);
+	else
+		seq_printf(seq, "ircomm%d", self->line);
+
+	seq_printf(seq,
+		   " state: %s, slsap_sel: %#02x, dlsap_sel: %#02x, mode:",
+		   ircomm_state[ self->state],
+		   self->slsap_sel, self->dlsap_sel);
+
+	if (self->service_type & IRCOMM_3_WIRE_RAW)
+		seq_puts(seq, " 3-wire-raw");
+	if (self->service_type & IRCOMM_3_WIRE)
+		seq_puts(seq, " 3-wire");
+	if (self->service_type & IRCOMM_9_WIRE)
+		seq_puts(seq, " 9-wire");
+	if (self->service_type & IRCOMM_CENTRONICS)
+		seq_puts(seq, " Centronics");
+	seq_putc(seq, '\n');
+
+	return 0;
+}
+
+/* seq_file iterator over the ircomm hashbin; handed to seq_open() below */
+static const struct seq_operations ircomm_seq_ops = {
+	.start  = ircomm_seq_start,
+	.next   = ircomm_seq_next,
+	.stop   = ircomm_seq_stop,
+	.show   = ircomm_seq_show,
+};
+
+/* Open handler of ircomm_proc_fops: binds ircomm_seq_ops to the file */
+static int ircomm_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ircomm_seq_ops);
+}
+#endif /* CONFIG_PROC_FS */
+
+/* Module metadata */
+MODULE_AUTHOR("Dag Brattli <dag@brattli.net>");
+MODULE_DESCRIPTION("IrCOMM protocol");
+MODULE_LICENSE("GPL");
+
+/* Module load and unload hooks */
+module_init(ircomm_init);
+module_exit(ircomm_cleanup);
diff --git a/net/irda/ircomm/ircomm_event.c b/net/irda/ircomm/ircomm_event.c
new file mode 100644
index 00000000..d78554fe
--- /dev/null
+++ b/net/irda/ircomm/ircomm_event.c
@@ -0,0 +1,250 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_event.c
+ * Version:       1.0
+ * Description:   IrCOMM layer state machine
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Jun  6 20:33:11 1999
+ * Modified at:   Sun Dec 12 13:44:32 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irias_object.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_event.h>
+
+/* Per-state event handlers; referenced by the state[] dispatch table below */
+static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event,
+			     struct sk_buff *skb, struct ircomm_info *info);
+static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event,
+			      struct sk_buff *skb, struct ircomm_info *info);
+static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event,
+			      struct sk_buff *skb, struct ircomm_info *info);
+static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event,
+			     struct sk_buff *skb, struct ircomm_info *info);
+
+/*
+ * Printable state names, indexed by self->state. The order has to match
+ * the state[] handler table below, which is indexed the same way.
+ */
+const char *const ircomm_state[] = {
+	"IRCOMM_IDLE",
+	"IRCOMM_WAITI",
+	"IRCOMM_WAITR",
+	"IRCOMM_CONN",
+};
+
+#ifdef CONFIG_IRDA_DEBUG
+/*
+ * Printable event names, indexed by the IRCOMM_EVENT value in the debug
+ * messages of this file. Only built with CONFIG_IRDA_DEBUG.
+ * NOTE(review): the order presumably mirrors the IRCOMM_EVENT enum, which
+ * is declared elsewhere - confirm when touching either.
+ */
+static const char *const ircomm_event[] = {
+	"IRCOMM_CONNECT_REQUEST",
+	"IRCOMM_CONNECT_RESPONSE",
+	"IRCOMM_TTP_CONNECT_INDICATION",
+	"IRCOMM_LMP_CONNECT_INDICATION",
+	"IRCOMM_TTP_CONNECT_CONFIRM",
+	"IRCOMM_LMP_CONNECT_CONFIRM",
+
+	"IRCOMM_LMP_DISCONNECT_INDICATION",
+	"IRCOMM_TTP_DISCONNECT_INDICATION",
+	"IRCOMM_DISCONNECT_REQUEST",
+
+	"IRCOMM_TTP_DATA_INDICATION",
+	"IRCOMM_LMP_DATA_INDICATION",
+	"IRCOMM_DATA_REQUEST",
+	"IRCOMM_CONTROL_REQUEST",
+	"IRCOMM_CONTROL_INDICATION",
+};
+#endif /* CONFIG_IRDA_DEBUG */
+
+/*
+ * State machine dispatch table: ircomm_do_event() calls the handler at
+ * index self->state. Entries are in the same order as ircomm_state[].
+ */
+static int (*state[])(struct ircomm_cb *self, IRCOMM_EVENT event,
+		      struct sk_buff *skb, struct ircomm_info *info) =
+{
+	ircomm_state_idle,
+	ircomm_state_waiti,
+	ircomm_state_waitr,
+	ircomm_state_conn,
+};
+
+/*
+ * Function ircomm_state_idle (self, event, skb, info)
+ *
+ *    IrCOMM is currently idle. A connect request moves to WAITI and is
+ *    issued to the lower layer; an incoming connect indication moves to
+ *    WAITR and is passed up to the user. Any other event yields -EINVAL.
+ */
+static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event,
+			     struct sk_buff *skb, struct ircomm_info *info)
+{
+	switch (event) {
+	case IRCOMM_CONNECT_REQUEST:
+		ircomm_next_state(self, IRCOMM_WAITI);
+		return self->issue.connect_request(self, skb, info);
+	case IRCOMM_TTP_CONNECT_INDICATION:
+	case IRCOMM_LMP_CONNECT_INDICATION:
+		ircomm_next_state(self, IRCOMM_WAITR);
+		ircomm_connect_indication(self, skb, info);
+		return 0;
+	default:
+		IRDA_DEBUG(4, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_event[event]);
+		return -EINVAL;
+	}
+}
+
+/*
+ * Function ircomm_state_waiti (self, event, skb, info)
+ *
+ *    The IrCOMM user has requested a connection to the remote device and
+ *    is awaiting confirmation. A connect confirm moves to CONN, a
+ *    disconnect indication moves back to IDLE; both are passed up to the
+ *    user. Any other event yields -EINVAL.
+ */
+static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event,
+			      struct sk_buff *skb, struct ircomm_info *info)
+{
+	switch (event) {
+	case IRCOMM_TTP_CONNECT_CONFIRM:
+	case IRCOMM_LMP_CONNECT_CONFIRM:
+		ircomm_next_state(self, IRCOMM_CONN);
+		ircomm_connect_confirm(self, skb, info);
+		return 0;
+	case IRCOMM_TTP_DISCONNECT_INDICATION:
+	case IRCOMM_LMP_DISCONNECT_INDICATION:
+		ircomm_next_state(self, IRCOMM_IDLE);
+		ircomm_disconnect_indication(self, skb, info);
+		return 0;
+	default:
+		IRDA_DEBUG(0, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_event[event]);
+		return -EINVAL;
+	}
+}
+
+/*
+ * Function ircomm_state_waitr (self, event, skb, info)
+ *
+ *    IrCOMM has received an incoming connection request and is awaiting
+ *    the user's response. A connect response moves to CONN; a disconnect
+ *    request or indication moves back to IDLE. Any other event yields
+ *    -EINVAL.
+ */
+static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event,
+			      struct sk_buff *skb, struct ircomm_info *info)
+{
+	switch (event) {
+	case IRCOMM_CONNECT_RESPONSE:
+		ircomm_next_state(self, IRCOMM_CONN);
+		return self->issue.connect_response(self, skb);
+	case IRCOMM_DISCONNECT_REQUEST:
+		ircomm_next_state(self, IRCOMM_IDLE);
+		return self->issue.disconnect_request(self, skb, info);
+	case IRCOMM_TTP_DISCONNECT_INDICATION:
+	case IRCOMM_LMP_DISCONNECT_INDICATION:
+		ircomm_next_state(self, IRCOMM_IDLE);
+		ircomm_disconnect_indication(self, skb, info);
+		return 0;
+	default:
+		IRDA_DEBUG(0, "%s(), unknown event = %s\n", __func__ ,
+			   ircomm_event[event]);
+		return -EINVAL;
+	}
+}
+
+/*
+ * Function ircomm_state_conn (self, event, skb, info)
+ *
+ *    IrCOMM is connected to the peer IrCOMM device. Data and control
+ *    requests are issued to the lower layer (the last argument of
+ *    data_request is 0 for data and skb->len for control frames),
+ *    incoming data is passed up, and a disconnect request or indication
+ *    moves back to IDLE. Any other event yields -EINVAL.
+ */
+static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event,
+			     struct sk_buff *skb, struct ircomm_info *info)
+{
+	switch (event) {
+	case IRCOMM_DATA_REQUEST:
+		return self->issue.data_request(self, skb, 0);
+	case IRCOMM_TTP_DATA_INDICATION:
+		ircomm_process_data(self, skb);
+		return 0;
+	case IRCOMM_LMP_DATA_INDICATION:
+		ircomm_data_indication(self, skb);
+		return 0;
+	case IRCOMM_CONTROL_REQUEST:
+		/* Just send a separate frame for now */
+		return self->issue.data_request(self, skb, skb->len);
+	case IRCOMM_TTP_DISCONNECT_INDICATION:
+	case IRCOMM_LMP_DISCONNECT_INDICATION:
+		ircomm_next_state(self, IRCOMM_IDLE);
+		ircomm_disconnect_indication(self, skb, info);
+		return 0;
+	case IRCOMM_DISCONNECT_REQUEST:
+		ircomm_next_state(self, IRCOMM_IDLE);
+		return self->issue.disconnect_request(self, skb, info);
+	default:
+		IRDA_DEBUG(0, "%s(), unknown event = %s\n", __func__ ,
+			   ircomm_event[event]);
+		return -EINVAL;
+	}
+}
+
+/*
+ * Function ircomm_do_event (self, event, skb, info)
+ *
+ *    Process an event: dispatch it to the handler of the instance's
+ *    current state and return that handler's result.
+ */
+int ircomm_do_event(struct ircomm_cb *self, IRCOMM_EVENT event,
+		    struct sk_buff *skb, struct ircomm_info *info)
+{
+	IRDA_DEBUG(4, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_state[self->state], ircomm_event[event]);
+
+	return state[self->state](self, event, skb, info);
+}
+
+/*
+ * Function ircomm_next_state (self, state)
+ *
+ *    Switch the instance to the given state and log the new state
+ *    together with the instance's service type.
+ */
+void ircomm_next_state(struct ircomm_cb *self, IRCOMM_STATE state)
+{
+	self->state = state;
+
+	IRDA_DEBUG(4, "%s: next state=%s, service type=%d\n", __func__ ,
+		   ircomm_state[self->state], self->service_type);
+}
diff --git a/net/irda/ircomm/ircomm_lmp.c b/net/irda/ircomm/ircomm_lmp.c
new file mode 100644
index 00000000..3b8095c7
--- /dev/null
+++ b/net/irda/ircomm/ircomm_lmp.c
@@ -0,0 +1,370 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_lmp.c
+ * Version:       1.0
+ * Description:   Interface between IrCOMM and IrLMP
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Jun  6 20:48:27 1999
+ * Modified at:   Sun Dec 12 13:44:17 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Sources:       Previous IrLPT work by Thomas Davis
+ *
+ *     Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/gfp.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irda_device.h>	/* struct irda_skb_cb */
+
+#include <net/irda/ircomm_event.h>
+#include <net/irda/ircomm_lmp.h>
+
+
+/*
+ * Function ircomm_lmp_connect_request (self, userdata, info)
+ *
+ *    Issue an IrLMP connect request on the instance's LSAP, using the
+ *    selector and addresses from info. An extra reference is taken on
+ *    userdata, if any, before it is handed to IrLMP. Returns the result
+ *    of irlmp_connect_request().
+ */
+static int ircomm_lmp_connect_request(struct ircomm_cb *self,
+				      struct sk_buff *userdata,
+				      struct ircomm_info *info)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	/* Refcount the frame before it is passed down */
+	if (userdata != NULL)
+		skb_get(userdata);
+
+	return irlmp_connect_request(self->lsap, info->dlsap_sel,
+				     info->saddr, info->daddr, NULL, userdata);
+}
+
+/*
+ * Function ircomm_lmp_connect_response (self, userdata)
+ *
+ *    Answer an incoming IrLMP connection on the instance's LSAP. With
+ *    userdata an extra reference is taken and that frame is sent;
+ *    without, an empty frame with LMP_MAX_HEADER bytes of headroom is
+ *    allocated instead.
+ *
+ *    Returns the result of irlmp_connect_response(), or -ENOMEM if the
+ *    empty frame cannot be allocated.
+ */
+static int ircomm_lmp_connect_response(struct ircomm_cb *self,
+				       struct sk_buff *userdata)
+{
+	struct sk_buff *tx_skb = userdata;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	if (tx_skb != NULL) {
+		/* The client must have left room for the headers */
+		IRDA_ASSERT(skb_headroom(userdata) >= LMP_MAX_HEADER,
+			    return -1;);
+
+		/* Refcount the frame before it is passed down */
+		skb_get(tx_skb);
+	} else {
+		tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+		if (!tx_skb)
+			return -ENOMEM;
+
+		/* Reserve space for MUX and LAP header */
+		skb_reserve(tx_skb, LMP_MAX_HEADER);
+	}
+
+	return irlmp_connect_response(self->lsap, tx_skb);
+}
+
+/*
+ * Function ircomm_lmp_disconnect_request (self, userdata, info)
+ *
+ *    Issue an IrLMP disconnect request on the instance's LSAP. With
+ *    userdata an extra reference is taken and that frame is sent;
+ *    without, an empty frame with LMP_MAX_HEADER bytes of headroom is
+ *    allocated instead. info is not used.
+ *
+ *    Returns the result of irlmp_disconnect_request(), or -ENOMEM if the
+ *    empty frame cannot be allocated.
+ */
+static int ircomm_lmp_disconnect_request(struct ircomm_cb *self,
+					 struct sk_buff *userdata,
+					 struct ircomm_info *info)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	if (userdata) {
+		/* Refcount the frame before it is passed down */
+		skb_get(userdata);
+	} else {
+		userdata = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+		if (!userdata)
+			return -ENOMEM;
+
+		/*  Reserve space for MUX and LAP header */
+		skb_reserve(userdata, LMP_MAX_HEADER);
+	}
+
+	return irlmp_disconnect_request(self->lsap, userdata);
+}
+
+/*
+ * Function ircomm_lmp_flow_control (skb)
+ *
+ *    Installed as the skb destructor by ircomm_lmp_data_request(), so it
+ *    runs when a data frame we have sent down is deallocated. We do this
+ *    to make sure we don't flood IrLAP with frames, since we are not
+ *    using the IrTTP flow control mechanism.
+ *
+ *    The owning instance is looked up through the line number stored in
+ *    the skb control block. Its count of outstanding frames is
+ *    decremented and, once it drops below 2 while the flow is stopped,
+ *    the user is told to start sending again.
+ */
+static void ircomm_lmp_flow_control(struct sk_buff *skb)
+{
+	struct irda_skb_cb *cb;
+	struct ircomm_cb *self;
+	int line;
+
+	IRDA_ASSERT(skb != NULL, return;);
+
+	cb = (struct irda_skb_cb *) skb->cb;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	line = cb->line;
+
+	self = (struct ircomm_cb *) hashbin_lock_find(ircomm, line, NULL);
+	if (!self) {
+		IRDA_DEBUG(2, "%s(), didn't find myself\n", __func__ );
+		return;
+	}
+
+	/* self is known to be non-NULL here; only the magic needs checking */
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+	self->pkt_count--;
+
+	if ((self->pkt_count < 2) && (self->flow_status == FLOW_STOP)) {
+		IRDA_DEBUG(2, "%s(), asking TTY to start again!\n", __func__ );
+		self->flow_status = FLOW_START;
+		if (self->notify.flow_indication)
+			self->notify.flow_indication(self->notify.instance,
+						     self, FLOW_START);
+	}
+}
+
+/*
+ * Function ircomm_lmp_data_request (self, skb, not_used)
+ *
+ *    Send a data frame to the peer device through IrLMP.
+ *
+ *    The frame is tagged with the instance's line number and gets
+ *    ircomm_lmp_flow_control() as destructor, so the count of outstanding
+ *    frames can be decremented when the frame is released. If more than
+ *    7 frames were already outstanding while the flow is started, the
+ *    user is asked to stop sending.
+ *
+ *    Returns the result of irlmp_data_request(). The third argument is
+ *    ignored.
+ */
+static int ircomm_lmp_data_request(struct ircomm_cb *self,
+				   struct sk_buff *skb,
+				   int not_used)
+{
+	struct irda_skb_cb *cb;
+	int ret;
+
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	cb = (struct irda_skb_cb *) skb->cb;
+
+	/* Remember the line so the destructor can look the instance up */
+	cb->line = self->line;
+
+	IRDA_DEBUG(4, "%s(), sending frame\n", __func__ );
+
+	/* Don't forget to refcount it - see ircomm_tty_do_softint() */
+	skb_get(skb);
+
+	/* Detach the previous owner before installing our own destructor */
+	skb_orphan(skb);
+	skb->destructor = ircomm_lmp_flow_control;
+
+	/* The counter is bumped for every frame, whatever the comparison yields */
+	if ((self->pkt_count++ > 7) && (self->flow_status == FLOW_START)) {
+		IRDA_DEBUG(2, "%s(), asking TTY to slow down!\n", __func__ );
+		self->flow_status = FLOW_STOP;
+		if (self->notify.flow_indication)
+			self->notify.flow_indication(self->notify.instance,
+						     self, FLOW_STOP);
+	}
+	ret = irlmp_data_request(self->lsap, skb);
+	if (ret) {
+		IRDA_ERROR("%s(), failed\n", __func__);
+		/* irlmp_data_request already free the packet */
+	}
+
+	return ret;
+}
+
+/*
+ * Function ircomm_lmp_data_indication (instance, sap, skb)
+ *
+ *    IrLMP callback for incoming data. The frame is run through the
+ *    state machine as IRCOMM_LMP_DATA_INDICATION and our reference to it
+ *    is dropped afterwards. Returns 0; the IRDA_ASSERT failure actions
+ *    return -1.
+ */
+static int ircomm_lmp_data_indication(void *instance, void *sap,
+				      struct sk_buff *skb)
+{
+	struct ircomm_cb *self = instance;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	ircomm_do_event(self, IRCOMM_LMP_DATA_INDICATION, skb, NULL);
+
+	/* Drop reference count - see ircomm_tty_data_indication(). */
+	dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_lmp_connect_confirm (instance, sap, qos, max_seg_size,
+ *                                      max_header_size, skb)
+ *
+ *    IrLMP callback: the connection has been confirmed by the peer
+ *    device. The QoS and size limits are packed into an ircomm_info and
+ *    run through the state machine as IRCOMM_LMP_CONNECT_CONFIRM; our
+ *    reference to the frame is dropped afterwards.
+ */
+static void ircomm_lmp_connect_confirm(void *instance, void *sap,
+				       struct qos_info *qos,
+				       __u32 max_seg_size,
+				       __u8 max_header_size,
+				       struct sk_buff *skb)
+{
+	struct ircomm_cb *self = instance;
+	struct ircomm_info info;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(qos != NULL, return;);
+
+	info.qos = qos;
+	info.max_header_size = max_header_size;
+	info.max_data_size = max_seg_size;
+
+	ircomm_do_event(self, IRCOMM_LMP_CONNECT_CONFIRM, skb, &info);
+
+	/* Drop reference count - see ircomm_tty_connect_confirm(). */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_lmp_connect_indication (instance, sap, qos, max_seg_size,
+ *                                         max_header_size, skb)
+ *
+ *    IrLMP callback: the peer device wants to make a connection with us.
+ *    The QoS and size limits are packed into an ircomm_info and run
+ *    through the state machine as IRCOMM_LMP_CONNECT_INDICATION; our
+ *    reference to the frame is dropped afterwards.
+ */
+static void ircomm_lmp_connect_indication(void *instance, void *sap,
+					  struct qos_info *qos,
+					  __u32 max_seg_size,
+					  __u8 max_header_size,
+					  struct sk_buff *skb)
+{
+	struct ircomm_cb *self = instance;
+	struct ircomm_info info;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(qos != NULL, return;);
+
+	info.qos = qos;
+	info.max_header_size = max_header_size;
+	info.max_data_size = max_seg_size;
+
+	ircomm_do_event(self, IRCOMM_LMP_CONNECT_INDICATION, skb, &info);
+
+	/* Drop reference count - see ircomm_tty_connect_indication(). */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_lmp_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    IrLMP callback: the peer device has closed the connection, or the
+ *    link went down for some other reason. The reason is run through the
+ *    state machine as IRCOMM_LMP_DISCONNECT_INDICATION; our reference to
+ *    the frame, if there is one, is dropped afterwards.
+ */
+static void ircomm_lmp_disconnect_indication(void *instance, void *sap,
+					     LM_REASON reason,
+					     struct sk_buff *skb)
+{
+	struct ircomm_cb *self = instance;
+	struct ircomm_info info;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+
+	info.reason = reason;
+
+	ircomm_do_event(self, IRCOMM_LMP_DISCONNECT_INDICATION, skb, &info);
+
+	/* Drop reference count - see ircomm_tty_disconnect_indication(). */
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+}
+/*
+ * Function ircomm_open_lsap (self)
+ *
+ *    Open an LSAP for the instance and route its commands through IrLMP.
+ *    This function will only be used when using "raw" services.
+ *
+ *    Registers the ircomm_lmp_* callbacks with IrLMP, records the local
+ *    LSAP selector and fills in the instance's issue call-table.
+ *
+ *    Returns 0 on success, -1 if the LSAP cannot be opened.
+ */
+int ircomm_open_lsap(struct ircomm_cb *self)
+{
+	notify_t notify;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	/* Register callbacks */
+	irda_notify_init(&notify);
+	notify.data_indication       = ircomm_lmp_data_indication;
+	notify.connect_confirm       = ircomm_lmp_connect_confirm;
+	notify.connect_indication    = ircomm_lmp_connect_indication;
+	notify.disconnect_indication = ircomm_lmp_disconnect_indication;
+	notify.instance = self;
+	strlcpy(notify.name, "IrCOMM", sizeof(notify.name));
+
+	self->lsap = irlmp_open_lsap(LSAP_ANY, &notify, 0);
+	if (!self->lsap) {
+		/* It is an LSAP that failed here, and the name needs a separator */
+		IRDA_DEBUG(0, "%s(), failed to allocate lsap\n", __func__ );
+		return -1;
+	}
+	self->slsap_sel = self->lsap->slsap_sel;
+
+	/*
+	 *  Initialize the call-table for issuing commands
+	 */
+	self->issue.data_request       = ircomm_lmp_data_request;
+	self->issue.connect_request    = ircomm_lmp_connect_request;
+	self->issue.connect_response   = ircomm_lmp_connect_response;
+	self->issue.disconnect_request = ircomm_lmp_disconnect_request;
+
+	return 0;
+}
diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c
new file mode 100644
index 00000000..8b915f3a
--- /dev/null
+++ b/net/irda/ircomm/ircomm_param.c
@@ -0,0 +1,511 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_param.c
+ * Version:       1.0
+ * Description:   Parameter handling for the IrCOMM protocol
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Mon Jun  7 10:25:11 1999
+ * Modified at:   Sun Jan 30 14:32:03 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/gfp.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/parameters.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_tty_attach.h>
+#include <net/irda/ircomm_tty.h>
+
+#include <net/irda/ircomm_param.h>
+
+static int ircomm_param_service_type(void *instance, irda_param_t *param,
+				     int get);
+static int ircomm_param_port_type(void *instance, irda_param_t *param,
+				  int get);
+static int ircomm_param_port_name(void *instance, irda_param_t *param,
+				  int get);
+static int ircomm_param_service_type(void *instance, irda_param_t *param,
+				     int get);
+static int ircomm_param_data_rate(void *instance, irda_param_t *param,
+				  int get);
+static int ircomm_param_data_format(void *instance, irda_param_t *param,
+				    int get);
+static int ircomm_param_flow_control(void *instance, irda_param_t *param,
+				     int get);
+static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get);
+static int ircomm_param_enq_ack(void *instance, irda_param_t *param, int get);
+static int ircomm_param_line_status(void *instance, irda_param_t *param,
+				    int get);
+static int ircomm_param_dte(void *instance, irda_param_t *param, int get);
+static int ircomm_param_dce(void *instance, irda_param_t *param, int get);
+static int ircomm_param_poll(void *instance, irda_param_t *param, int get);
+
+static pi_minor_info_t pi_minor_call_table_common[] = {	/* major entry 0 */
+	{ ircomm_param_service_type, PV_INT_8_BITS },
+	{ ircomm_param_port_type,    PV_INT_8_BITS },
+	{ ircomm_param_port_name,    PV_STRING }
+};
+static pi_minor_info_t pi_minor_call_table_non_raw[] = { /* major entry 1 */
+	{ ircomm_param_data_rate,    PV_INT_32_BITS | PV_BIG_ENDIAN },
+	{ ircomm_param_data_format,  PV_INT_8_BITS },
+	{ ircomm_param_flow_control, PV_INT_8_BITS },
+	{ ircomm_param_xon_xoff,     PV_INT_16_BITS },
+	{ ircomm_param_enq_ack,      PV_INT_16_BITS },
+	{ ircomm_param_line_status,  PV_INT_8_BITS }
+};
+static pi_minor_info_t pi_minor_call_table_9_wire[] = {	/* major entry 2 */
+	{ ircomm_param_dte,          PV_INT_8_BITS },
+	{ ircomm_param_dce,          PV_INT_8_BITS },
+	{ ircomm_param_poll,         PV_NO_VALUE },
+};
+/* Each entry: a minor table above plus its handler count (keep in sync!) */
+static pi_major_info_t pi_major_call_table[] = {
+	{ pi_minor_call_table_common,  3 },
+	{ pi_minor_call_table_non_raw, 6 },
+	{ pi_minor_call_table_9_wire,  3 }
+/* 	{ pi_minor_call_table_centronics }  */
+};
+/* Passed to irda_param_insert(); NOTE(review): 3/0x0f/4 meaning - confirm */
+pi_param_info_t ircomm_param_info = { pi_major_call_table, 3, 0x0f, 4 };
+
+/*
+ * Function ircomm_param_request (self, pi, flush)
+ *
+ *    Queue a parameter for the control channel
+ *    Returns bytes queued, 0 if nothing to do, or a negative value on error
+ */
+int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush)
+{
+	struct tty_struct *tty;
+	unsigned long flags;
+	struct sk_buff *skb;
+	int count;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	tty = self->tty;
+	if (!tty)
+		return 0;
+
+	/* Make sure we don't send parameters for raw mode */
+	if (self->service_type == IRCOMM_3_WIRE_RAW)
+		return 0;
+
+	spin_lock_irqsave(&self->spinlock, flags);
+
+	skb = self->ctrl_skb;
+	if (!skb) {
+		skb = alloc_skb(256, GFP_ATOMIC);
+		if (!skb) {
+			spin_unlock_irqrestore(&self->spinlock, flags);
+			return -ENOMEM;
+		}
+
+		skb_reserve(skb, self->max_header_size);
+		self->ctrl_skb = skb;
+	}
+	/*
+	 * Inserting is a little bit tricky since we don't know how much
+	 * room we will need. But this should hopefully work OK
+	 */
+	count = irda_param_insert(self, pi, skb_tail_pointer(skb),
+				  skb_tailroom(skb), &ircomm_param_info);
+	if (count < 0) {
+		IRDA_WARNING("%s(), no room for parameter!\n", __func__);
+		spin_unlock_irqrestore(&self->spinlock, flags);
+		return -1;
+	}
+	skb_put(skb, count);
+
+	/* Log before unlocking: ctrl_skb must not be touched without the lock */
+	IRDA_DEBUG(2, "%s(), skb->len=%d\n", __func__ , skb->len);
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	if (flush) {
+		/* ircomm_tty_do_softint will take care of the rest */
+		schedule_work(&self->tqueue);
+	}
+
+	return count;
+}
+
+/*
+ * Function ircomm_param_service_type (instance, param, get)
+ *
+ *    Handle service type, this function will both be called after the LM-IAS
+ *    query and when the remote device sends its initial parameters
+ *    Returns 0 on success, -1 on a failed assert or no common service type
+ */
+static int ircomm_param_service_type(void *instance, irda_param_t *param,
+				     int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+	__u8 service_type = (__u8) param->pv.i;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+	/* Get: just report the service type currently held in settings */
+	if (get) {
+		param->pv.i = self->settings.service_type;
+		return 0;
+	}
+
+	/* Find all common service types */
+	service_type &= self->service_type;
+	if (!service_type) {
+		IRDA_DEBUG(2,
+			   "%s(), No common service type to use!\n", __func__ );
+		return -1;
+	}
+	IRDA_DEBUG(0, "%s(), services in common=%02x\n", __func__ ,
+		   service_type);
+
+	/*
+	 * Pick one, in order: CENTRONICS, 9_WIRE, 3_WIRE, then 3_WIRE_RAW
+	 */
+	if (service_type & IRCOMM_CENTRONICS)
+		self->settings.service_type = IRCOMM_CENTRONICS;
+	else if (service_type & IRCOMM_9_WIRE)
+		self->settings.service_type = IRCOMM_9_WIRE;
+	else if (service_type & IRCOMM_3_WIRE)
+		self->settings.service_type = IRCOMM_3_WIRE;
+	else if (service_type & IRCOMM_3_WIRE_RAW)
+		self->settings.service_type = IRCOMM_3_WIRE_RAW;
+
+	IRDA_DEBUG(0, "%s(), resulting service type=0x%02x\n", __func__ ,
+		   self->settings.service_type);
+
+	/*
+	 * Now the line is ready for some communication. Check if we are a
+	 * server, and send over some initial parameters.
+	 * Clients do it in ircomm_tty_state_setup().
+	 * Note : we may get called from ircomm_tty_getvalue_confirm(),
+	 * therefore before we have even opened any socket. And self->client
+	 * is initialised to TRUE only later. So, we check if the link is
+	 * really initialised. - Jean II
+	 */
+	if ((self->max_header_size != IRCOMM_TTY_HDR_UNINITIALISED) &&
+	    (!self->client) &&
+	    (self->settings.service_type != IRCOMM_3_WIRE_RAW))
+	{
+		/* Init connection */
+		ircomm_tty_send_initial_parameters(self);
+		ircomm_tty_link_established(self);
+	}
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_port_type (instance, param, get)
+ *
+ *    Port type handler. A get always answers IRCOMM_SERIAL; a set stores
+ *    the received value in self->settings.port_type and logs it.
+ *    Returns 0, or -1 if an assertion on instance fails.
+ */
+static int ircomm_param_port_type(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (!get) {
+		self->settings.port_type = (__u8) param->pv.i;
+
+		IRDA_DEBUG(0, "%s(), port type=%d\n", __func__ ,
+			   self->settings.port_type);
+	} else {
+		param->pv.i = IRCOMM_SERIAL;
+	}
+	return 0;
+}
+
+/*
+ * Function ircomm_param_port_name (instance, param, get)
+ *
+ *    Exchange port name. A received name is stored with strlcpy(), so it is
+ *    truncated to fit and always NUL-terminated (strncpy() did not ensure it)
+ */
+static int ircomm_param_port_name(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (get) {
+		IRDA_DEBUG(0, "%s(), not imp!\n", __func__ );
+	} else {
+		IRDA_DEBUG(0, "%s(), port-name=%s\n", __func__ , param->pv.c);
+		strlcpy(self->settings.port_name, param->pv.c, 32);
+	}
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_data_rate (instance, param, get)
+ *
+ *    Read (get) or store (set) the data rate kept in self->settings
+ *
+ */
+static int ircomm_param_data_rate(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (!get)
+		self->settings.data_rate = param->pv.i;
+	else
+		param->pv.i = self->settings.data_rate;
+
+	IRDA_DEBUG(2, "%s(), data rate = %d\n", __func__ , param->pv.i);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_data_format (instance, param, get)
+ *
+ *    Get or set the data format byte held in self->settings.data_format
+ *
+ */
+static int ircomm_param_data_format(void *instance, irda_param_t *param,
+				    int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (!get)
+		self->settings.data_format = (__u8) param->pv.i;
+	else
+		param->pv.i = self->settings.data_format;
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_flow_control (instance, param, get)
+ *
+ *    Get or set the flow control byte held in self->settings.flow_control
+ *
+ */
+static int ircomm_param_flow_control(void *instance, irda_param_t *param,
+				     int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (!get)
+		self->settings.flow_control = (__u8) param->pv.i;
+	else
+		param->pv.i = self->settings.flow_control;
+
+	IRDA_DEBUG(1, "%s(), flow control = 0x%02x\n", __func__ , (__u8) param->pv.i);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_xon_xoff (instance, param, get)
+ *
+ *    Exchange XON/XOFF characters
+ *    (packed in pv.i: low byte = xonxoff[0], next byte = xonxoff[1])
+ */
+static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+	/* The (__u16) casts below apply to pv.i before the mask/shift */
+	if (get) {
+		param->pv.i = self->settings.xonxoff[0];
+		param->pv.i |= self->settings.xonxoff[1] << 8;
+	} else {
+		self->settings.xonxoff[0] = (__u16) param->pv.i & 0xff;
+		self->settings.xonxoff[1] = (__u16) param->pv.i >> 8;
+	}
+
+	IRDA_DEBUG(0, "%s(), XON/XOFF = 0x%02x,0x%02x\n", __func__ ,
+		   param->pv.i & 0xff, param->pv.i >> 8);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_enq_ack (instance, param, get)
+ *
+ *    Exchange ENQ/ACK characters
+ *    (packed in pv.i: low byte = enqack[0], next byte = enqack[1])
+ */
+static int ircomm_param_enq_ack(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+	/* The (__u16) casts below apply to pv.i before the mask/shift */
+	if (get) {
+		param->pv.i = self->settings.enqack[0];
+		param->pv.i |= self->settings.enqack[1] << 8;
+	} else {
+		self->settings.enqack[0] = (__u16) param->pv.i & 0xff;
+		self->settings.enqack[1] = (__u16) param->pv.i >> 8;
+	}
+
+	IRDA_DEBUG(0, "%s(), ENQ/ACK = 0x%02x,0x%02x\n", __func__ ,
+		   param->pv.i & 0xff, param->pv.i >> 8);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_line_status (instance, param, get)
+ *
+ *    Line status handler. Not implemented: it only logs a debug message
+ *    and returns 0; instance, param and get are all ignored.
+ */
+static int ircomm_param_line_status(void *instance, irda_param_t *param,
+				    int get)
+{
+	IRDA_DEBUG(2, "%s(), not impl.\n", __func__ );
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_dte (instance, param, get)
+ *
+ *    If we get here, there must be some sort of null-modem connection, and
+ *    we are probably working in server mode as well.
+ */
+static int ircomm_param_dte(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+	__u8 dte;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (get)
+		param->pv.i = self->settings.dte;
+	else {
+		dte = (__u8) param->pv.i;
+		/* Rebuild our DCE bits from scratch out of the received DTE bits */
+		self->settings.dce = 0;
+		/* DTR (and its delta) is mirrored onto DSR, RI and CD */
+		if (dte & IRCOMM_DELTA_DTR)
+			self->settings.dce |= (IRCOMM_DELTA_DSR|
+					      IRCOMM_DELTA_RI |
+					      IRCOMM_DELTA_CD);
+		if (dte & IRCOMM_DTR)
+			self->settings.dce |= (IRCOMM_DSR|
+					      IRCOMM_RI |
+					      IRCOMM_CD);
+		/* RTS (and its delta) is mirrored onto CTS */
+		if (dte & IRCOMM_DELTA_RTS)
+			self->settings.dce |= IRCOMM_DELTA_CTS;
+		if (dte & IRCOMM_RTS)
+			self->settings.dce |= IRCOMM_CTS;
+
+		/* Take appropriate actions */
+		ircomm_tty_check_modem_status(self);
+
+		/* Null modem cable emulator */
+		self->settings.null_modem = TRUE;
+	}
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_dce (instance, param, get)
+ *
+ *    Store the DCE byte from param in settings and re-check modem status.
+ *    NOTE(review): 'get' is never looked at here - confirm this is intended
+ */
+static int ircomm_param_dce(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+	__u8 dce;
+
+	IRDA_DEBUG(1, "%s(), dce = 0x%02x\n", __func__ , (__u8) param->pv.i);
+
+	dce = (__u8) param->pv.i;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	self->settings.dce = dce;
+
+	/* Check if any of the settings have changed (this only logs CTS) */
+	if (dce & 0x0f) {
+		if (dce & IRCOMM_DELTA_CTS) {
+			IRDA_DEBUG(2, "%s(), CTS\n", __func__ );
+		}
+	}
+
+	ircomm_tty_check_modem_status(self);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_param_poll (instance, param, get)
+ *
+ *    Peer polls for the line settings: on a set we queue (and flush) our
+ *    DTE parameter in reply; a get does nothing. Returns 0 in both cases.
+ */
+static int ircomm_param_poll(void *instance, irda_param_t *param, int get)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	/* Poll parameters are always of length 0 (just a signal) */
+	if (get)
+		return 0;
+	/* Respond with DTE line settings */
+	ircomm_param_request(self, IRCOMM_DTE, TRUE);
+	return 0;
+}
+
+
+
+
+
diff --git a/net/irda/ircomm/ircomm_ttp.c b/net/irda/ircomm/ircomm_ttp.c
new file mode 100644
index 00000000..6e6509f2
--- /dev/null
+++ b/net/irda/ircomm/ircomm_ttp.c
@@ -0,0 +1,368 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_ttp.c
+ * Version:       1.0
+ * Description:   Interface between IrCOMM and IrTTP
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Jun  6 20:48:27 1999
+ * Modified at:   Mon Dec 13 11:35:13 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+
+#include <net/irda/ircomm_event.h>
+#include <net/irda/ircomm_ttp.h>
+
+static int ircomm_ttp_data_indication(void *instance, void *sap,
+				      struct sk_buff *skb);
+static void ircomm_ttp_connect_confirm(void *instance, void *sap,
+				       struct qos_info *qos,
+				       __u32 max_sdu_size,
+				       __u8 max_header_size,
+				       struct sk_buff *skb);
+static void ircomm_ttp_connect_indication(void *instance, void *sap,
+					  struct qos_info *qos,
+					  __u32 max_sdu_size,
+					  __u8 max_header_size,
+					  struct sk_buff *skb);
+static void ircomm_ttp_flow_indication(void *instance, void *sap,
+				       LOCAL_FLOW cmd);
+static void ircomm_ttp_disconnect_indication(void *instance, void *sap,
+					     LM_REASON reason,
+					     struct sk_buff *skb);
+static int ircomm_ttp_data_request(struct ircomm_cb *self,
+				   struct sk_buff *skb,
+				   int clen);
+static int ircomm_ttp_connect_request(struct ircomm_cb *self,
+				      struct sk_buff *userdata,
+				      struct ircomm_info *info);
+static int ircomm_ttp_connect_response(struct ircomm_cb *self,
+				       struct sk_buff *userdata);
+static int ircomm_ttp_disconnect_request(struct ircomm_cb *self,
+					 struct sk_buff *userdata,
+					 struct ircomm_info *info);
+
+/*
+ * Function ircomm_open_tsap (self)
+ *
+ *    Open a TSAP through IrTTP and point self->issue at the IrTTP backends
+ *    Returns 0 on success, or -1 if irttp_open_tsap() gave us no TSAP.
+ */
+int ircomm_open_tsap(struct ircomm_cb *self)
+{
+	notify_t notify;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	/* Register callbacks */
+	irda_notify_init(&notify);
+	notify.data_indication       = ircomm_ttp_data_indication;
+	notify.connect_confirm       = ircomm_ttp_connect_confirm;
+	notify.connect_indication    = ircomm_ttp_connect_indication;
+	notify.flow_indication       = ircomm_ttp_flow_indication;
+	notify.disconnect_indication = ircomm_ttp_disconnect_indication;
+	notify.instance = self;
+	strlcpy(notify.name, "IrCOMM", sizeof(notify.name));
+
+	self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT,
+				     &notify);
+	if (!self->tsap) {
+		IRDA_DEBUG(0, "%s(), failed to allocate tsap\n", __func__ );
+		return -1;
+	}
+	self->slsap_sel = self->tsap->stsap_sel;
+
+	/*
+	 *  Initialize the call-table for issuing commands
+	 */
+	self->issue.data_request       = ircomm_ttp_data_request;
+	self->issue.connect_request    = ircomm_ttp_connect_request;
+	self->issue.connect_response   = ircomm_ttp_connect_response;
+	self->issue.disconnect_request = ircomm_ttp_disconnect_request;
+
+	return 0;
+}
+
+/*
+ * Function ircomm_ttp_connect_request (self, userdata, info)
+ *
+ *    Issue an IrTTP connect request on self->tsap using the selector and
+ *    addresses found in info. Returns what irttp_connect_request() returns.
+ */
+static int ircomm_ttp_connect_request(struct ircomm_cb *self,
+				      struct sk_buff *userdata,
+				      struct ircomm_info *info)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	/* Don't forget to refcount it - should be NULL anyway */
+	if (userdata != NULL)
+		skb_get(userdata);
+
+	/*
+	 * TTP_SAR_DISABLE is passed along with the optional userdata;
+	 * the IrTTP result is handed back to the caller unchanged.
+	 */
+	return irttp_connect_request(self->tsap, info->dlsap_sel,
+				     info->saddr, info->daddr, NULL,
+				     TTP_SAR_DISABLE, userdata);
+}
+
+/*
+ * Function ircomm_ttp_connect_response (self, userdata)
+ *
+ *    Answer a pending connection on self->tsap through IrTTP.
+ *    Returns what irttp_connect_response() returns.
+ */
+static int ircomm_ttp_connect_response(struct ircomm_cb *self,
+				       struct sk_buff *userdata)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	/* Don't forget to refcount it - should be NULL anyway */
+	if (userdata != NULL)
+		skb_get(userdata);
+
+	/*
+	 * TTP_SAR_DISABLE is passed along with the optional userdata;
+	 * the IrTTP result is handed back to the caller unchanged.
+	 */
+	return irttp_connect_response(self->tsap, TTP_SAR_DISABLE, userdata);
+}
+
+/*
+ * Function ircomm_ttp_data_request (self, skb, clen)
+ *
+ *    Send IrCOMM data to IrTTP layer. Currently we do not try to combine
+ *    control data with pure data, so they will be sent as separate frames.
+ *    Should not be a big problem though, since control frames are rare. But
+ *    some of them are sent after connection establishment, so this can
+ *    increase the latency a bit.
+ */
+static int ircomm_ttp_data_request(struct ircomm_cb *self,
+				   struct sk_buff *skb,
+				   int clen)
+{
+	int ret;
+
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	IRDA_DEBUG(2, "%s(), clen=%d\n", __func__ , clen);
+
+	/*
+	 * Insert clen field, currently we either send data only, or control
+	 * only frames, to make things easier and avoid queueing
+	 */
+	IRDA_ASSERT(skb_headroom(skb) >= IRCOMM_HEADER_SIZE, return -1;);
+
+	/* Don't forget to refcount it - see ircomm_tty_do_softint() */
+	skb_get(skb);
+	/* Prepend the IrCOMM header; its first byte carries clen */
+	skb_push(skb, IRCOMM_HEADER_SIZE);
+
+	skb->data[0] = clen;
+	/* Return value of irttp_data_request() is passed back to the caller */
+	ret = irttp_data_request(self->tsap, skb);
+	if (ret) {
+		IRDA_ERROR("%s(), failed\n", __func__);
+		/* irttp_data_request has already freed the packet */
+	}
+
+	return ret;
+}
+
+/*
+ * Function ircomm_ttp_data_indication (instance, sap, skb)
+ *
+ *    Incoming data
+ *    Returns 0, or -1 if one of the argument assertions fails
+ */
+static int ircomm_ttp_data_indication(void *instance, void *sap,
+				      struct sk_buff *skb)
+{
+	struct ircomm_cb *self = (struct ircomm_cb *) instance;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+	/* No ircomm_info accompanies a data event, hence the NULL */
+	ircomm_do_event(self, IRCOMM_TTP_DATA_INDICATION, skb, NULL);
+
+	/* Drop reference count - see ircomm_tty_data_indication(). */
+	dev_kfree_skb(skb);
+
+	return 0;
+}
+
+static void ircomm_ttp_connect_confirm(void *instance, void *sap,
+				       struct qos_info *qos,
+				       __u32 max_sdu_size,
+				       __u8 max_header_size,
+				       struct sk_buff *skb)
+{
+	struct ircomm_cb *self = (struct ircomm_cb *) instance;
+	struct ircomm_info info;
+	/* Registered as notify.connect_confirm in ircomm_open_tsap() */
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(qos != NULL, goto out;);
+	/* Any max_sdu_size other than TTP_SAR_DISABLE is refused */
+	if (max_sdu_size != TTP_SAR_DISABLE) {
+		IRDA_ERROR("%s(), SAR not allowed for IrCOMM!\n",
+			   __func__);
+		goto out;
+	}
+	/* Account for the IrCOMM header in both data and header sizes */
+	info.max_data_size = irttp_get_max_seg_size(self->tsap)
+		- IRCOMM_HEADER_SIZE;
+	info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE;
+	info.qos = qos;
+
+	ircomm_do_event(self, IRCOMM_TTP_CONNECT_CONFIRM, skb, &info);
+
+out:
+	/* Drop reference count - see ircomm_tty_connect_confirm(). */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_ttp_connect_indication (instance, sap, qos, max_sdu_size,
+ *                                         max_header_size, skb)
+ *
+ *    Registered as notify.connect_indication in ircomm_open_tsap(); fills
+ *    an ircomm_info and raises IRCOMM_TTP_CONNECT_INDICATION.
+ */
+static void ircomm_ttp_connect_indication(void *instance, void *sap,
+					  struct qos_info *qos,
+					  __u32 max_sdu_size,
+					  __u8 max_header_size,
+					  struct sk_buff *skb)
+{
+	struct ircomm_cb *self = (struct ircomm_cb *)instance;
+	struct ircomm_info info;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(qos != NULL, goto out;);
+	/* Any max_sdu_size other than TTP_SAR_DISABLE is refused */
+	if (max_sdu_size != TTP_SAR_DISABLE) {
+		IRDA_ERROR("%s(), SAR not allowed for IrCOMM!\n",
+			   __func__);
+		goto out;
+	}
+	/* Account for the IrCOMM header in both data and header sizes */
+	info.max_data_size = irttp_get_max_seg_size(self->tsap)
+		- IRCOMM_HEADER_SIZE;
+	info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE;
+	info.qos = qos;
+
+	ircomm_do_event(self, IRCOMM_TTP_CONNECT_INDICATION, skb, &info);
+
+out:
+	/* Drop reference count - see ircomm_tty_connect_indication(). */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_ttp_disconnect_request (self, userdata, info)
+ *
+ *    Ask IrTTP to disconnect self->tsap with priority P_NORMAL.
+ *    Returns what irttp_disconnect_request() returns.
+ */
+static int ircomm_ttp_disconnect_request(struct ircomm_cb *self,
+					 struct sk_buff *userdata,
+					 struct ircomm_info *info)
+{
+	/* Don't forget to refcount it - should be NULL anyway */
+	if (userdata != NULL)
+		skb_get(userdata);
+
+	/*
+	 * info is not used by this backend; the IrTTP result is handed
+	 * back to the caller unchanged.
+	 */
+	return irttp_disconnect_request(self->tsap, userdata, P_NORMAL);
+}
+
+/*
+ * Function ircomm_ttp_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    Registered as notify.disconnect_indication in ircomm_open_tsap().
+ *    Raises IRCOMM_TTP_DISCONNECT_INDICATION with the reason; skb may be NULL
+ */
+static void ircomm_ttp_disconnect_indication(void *instance, void *sap,
+					     LM_REASON reason,
+					     struct sk_buff *skb)
+{
+	struct ircomm_cb *self = (struct ircomm_cb *) instance;
+	struct ircomm_info info;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+	/* Only the reason is filled in; no other field of info is set here */
+	info.reason = reason;
+
+	ircomm_do_event(self, IRCOMM_TTP_DISCONNECT_INDICATION, skb, &info);
+
+	/* Drop reference count - see ircomm_tty_disconnect_indication(). */
+	if(skb)
+		dev_kfree_skb(skb);
+}
+
+/*
+ * Function ircomm_ttp_flow_indication (instance, sap, cmd)
+ *
+ *    Flow start/stop from the layer below: relay cmd to the client's
+ *    flow_indication callback when one has been registered in self->notify
+ */
+static void ircomm_ttp_flow_indication(void *instance, void *sap,
+				       LOCAL_FLOW cmd)
+{
+	struct ircomm_cb *self = (struct ircomm_cb *) instance;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;);
+	if (self->notify.flow_indication == NULL)
+		return;
+	self->notify.flow_indication(self->notify.instance, self, cmd);
+}
+
+
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
new file mode 100644
index 00000000..b3cc8b39
--- /dev/null
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -0,0 +1,1411 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_tty.c
+ * Version:       1.0
+ * Description:   IrCOMM serial TTY driver
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Jun  6 21:00:56 1999
+ * Modified at:   Wed Feb 23 00:09:02 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Sources:       serial.c and previous IrCOMM work by Takahide Higuchi
+ *
+ *     Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/termios.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>		/* for MODULE_ALIAS_CHARDEV_MAJOR */
+
+#include <asm/uaccess.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_tty_attach.h>
+#include <net/irda/ircomm_tty.h>
+
+/* Forward declarations of the static functions defined further down */
+static int  ircomm_tty_open(struct tty_struct *tty, struct file *filp);
+static void ircomm_tty_close(struct tty_struct * tty, struct file *filp);
+static int  ircomm_tty_write(struct tty_struct * tty,
+			     const unsigned char *buf, int count);
+static int  ircomm_tty_write_room(struct tty_struct *tty);
+static void ircomm_tty_throttle(struct tty_struct *tty);
+static void ircomm_tty_unthrottle(struct tty_struct *tty);
+static int  ircomm_tty_chars_in_buffer(struct tty_struct *tty);
+static void ircomm_tty_flush_buffer(struct tty_struct *tty);
+static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch);
+static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout);
+static void ircomm_tty_hangup(struct tty_struct *tty);
+static void ircomm_tty_do_softint(struct work_struct *work);
+static void ircomm_tty_shutdown(struct ircomm_tty_cb *self);
+static void ircomm_tty_stop(struct tty_struct *tty);
+
+/* Callbacks installed in the notify_t built by ircomm_tty_startup() */
+static int ircomm_tty_data_indication(void *instance, void *sap,
+				      struct sk_buff *skb);
+static int ircomm_tty_control_indication(void *instance, void *sap,
+					 struct sk_buff *skb);
+static void ircomm_tty_flow_indication(void *instance, void *sap,
+				       LOCAL_FLOW cmd);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations ircomm_tty_proc_fops;
+#endif /* CONFIG_PROC_FS */
+/* The tty driver, allocated and registered by ircomm_tty_init() */
+static struct tty_driver *driver;
+
+/* Hashbin of ircomm_tty_cb instances, looked up by tty line number */
+static hashbin_t *ircomm_tty = NULL;
+
+/*
+ * Entry points handed to the tty layer; installed on the driver by
+ * ircomm_tty_init() through tty_set_operations(). Handlers marked
+ * ircomm_tty_ioctl.c, and the ones without a definition in this file,
+ * are provided elsewhere.
+ */
+static const struct tty_operations ops = {
+	.open            = ircomm_tty_open,
+	.close           = ircomm_tty_close,
+	.write           = ircomm_tty_write,
+	.write_room      = ircomm_tty_write_room,
+	.chars_in_buffer = ircomm_tty_chars_in_buffer,
+	.flush_buffer    = ircomm_tty_flush_buffer,
+	.ioctl           = ircomm_tty_ioctl,	/* ircomm_tty_ioctl.c */
+	.tiocmget        = ircomm_tty_tiocmget,	/* ircomm_tty_ioctl.c */
+	.tiocmset        = ircomm_tty_tiocmset,	/* ircomm_tty_ioctl.c */
+	.throttle        = ircomm_tty_throttle,
+	.unthrottle      = ircomm_tty_unthrottle,
+	.send_xchar      = ircomm_tty_send_xchar,
+	.set_termios     = ircomm_tty_set_termios,
+	.stop            = ircomm_tty_stop,
+	.start           = ircomm_tty_start,
+	.hangup          = ircomm_tty_hangup,
+	.wait_until_sent = ircomm_tty_wait_until_sent,
+#ifdef CONFIG_PROC_FS
+	.proc_fops       = &ircomm_tty_proc_fops,
+#endif /* CONFIG_PROC_FS */
+};
+
+/*
+ * Function ircomm_tty_init()
+ *
+ *    Init IrCOMM TTY layer/driver
+ *
+ *    Allocates the tty driver and the instance hashbin, fills in the
+ *    driver description and registers it with the tty layer. Returns 0 on
+ *    success, -ENOMEM when an allocation fails, or the error reported by
+ *    tty_register_driver(). Everything allocated here is released again
+ *    on failure.
+ */
+static int __init ircomm_tty_init(void)
+{
+	int ret;
+
+	driver = alloc_tty_driver(IRCOMM_TTY_PORTS);
+	if (!driver)
+		return -ENOMEM;
+
+	ircomm_tty = hashbin_new(HB_LOCK);
+	if (ircomm_tty == NULL) {
+		IRDA_ERROR("%s(), can't allocate hashbin!\n", __func__);
+		ret = -ENOMEM;
+		goto err_put_driver;
+	}
+
+	driver->owner		= THIS_MODULE;
+	driver->driver_name     = "ircomm";
+	driver->name            = "ircomm";
+	driver->major           = IRCOMM_TTY_MAJOR;
+	driver->minor_start     = IRCOMM_TTY_MINOR;
+	driver->type            = TTY_DRIVER_TYPE_SERIAL;
+	driver->subtype         = SERIAL_TYPE_NORMAL;
+	driver->init_termios    = tty_std_termios;
+	driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	driver->flags           = TTY_DRIVER_REAL_RAW;
+	tty_set_operations(driver, &ops);
+
+	ret = tty_register_driver(driver);
+	if (ret) {
+		IRDA_ERROR("%s(): Couldn't register serial driver\n",
+			   __func__);
+		goto err_free_hashbin;
+	}
+	return 0;
+
+err_free_hashbin:
+	/* The hashbin is still empty, so no per-element free function */
+	hashbin_delete(ircomm_tty, NULL);
+	ircomm_tty = NULL;
+err_put_driver:
+	put_tty_driver(driver);
+	return ret;
+}
+
+/*
+ * Function __ircomm_tty_cleanup (self)
+ *
+ *    Per-instance destructor passed to hashbin_delete() by
+ *    ircomm_tty_cleanup(): shuts the instance down, invalidates its magic
+ *    number and frees it.
+ */
+static void __exit __ircomm_tty_cleanup(struct ircomm_tty_cb *self)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	ircomm_tty_shutdown(self);
+
+	/* Clear the magic so the magic assertions reject a stale pointer */
+	self->magic = 0;
+	kfree(self);
+}
+
+/*
+ * Function ircomm_tty_cleanup ()
+ *
+ *    Remove IrCOMM TTY layer/driver
+ *
+ *    Unregisters the tty driver, destroys every instance left in the
+ *    hashbin through __ircomm_tty_cleanup() and drops the driver
+ *    reference. If unregistering fails, nothing is freed.
+ */
+static void __exit ircomm_tty_cleanup(void)
+{
+	int ret;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	ret = tty_unregister_driver(driver);
+	if (ret) {
+		IRDA_ERROR("%s(), failed to unregister driver\n",
+			   __func__);
+		/* Hashbin and driver are deliberately left allocated here */
+		return;
+	}
+
+	hashbin_delete(ircomm_tty, (FREE_FUNC) __ircomm_tty_cleanup);
+	put_tty_driver(driver);
+}
+
+/*
+ * Function ircomm_tty_startup (self)
+ *
+ *    Bring an instance up on its first open: register the callbacks with
+ *    IrCOMM via ircomm_open() (unless self->ircomm already exists) and
+ *    attach the cable. Returns 0 on success or when the instance was
+ *    already initialised, -ENODEV when ircomm_open() yields nothing, the
+ *    negative result of ircomm_tty_attach_cable() on failure, and -1 when
+ *    an assertion fails.
+ */
+static int ircomm_tty_startup(struct ircomm_tty_cb *self)
+{
+	notify_t notify;
+	int ret = -ENODEV;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	/* Check if already open */
+	if (test_and_set_bit(ASYNC_B_INITIALIZED, &self->flags)) {
+		IRDA_DEBUG(2, "%s(), already open so break out!\n", __func__ );
+		return 0;
+	}
+
+	/* Register with IrCOMM */
+	irda_notify_init(&notify);
+	/* These callbacks we must handle ourselves */
+	notify.data_indication       = ircomm_tty_data_indication;
+	notify.udata_indication      = ircomm_tty_control_indication;
+	notify.flow_indication       = ircomm_tty_flow_indication;
+
+	/* Use the ircomm_tty interface for these ones */
+	notify.disconnect_indication = ircomm_tty_disconnect_indication;
+	notify.connect_confirm       = ircomm_tty_connect_confirm;
+	notify.connect_indication    = ircomm_tty_connect_indication;
+	strlcpy(notify.name, "ircomm_tty", sizeof(notify.name));
+	notify.instance = self;
+
+	/* Reuse an IrCOMM instance that is still around */
+	if (!self->ircomm) {
+		self->ircomm = ircomm_open(&notify, self->service_type,
+					   self->line);
+	}
+	if (!self->ircomm)
+		goto err;
+
+	self->slsap_sel = self->ircomm->slsap_sel;
+
+	/* Connect IrCOMM link with remote device */
+	ret = ircomm_tty_attach_cable(self);
+	if (ret < 0) {
+		IRDA_ERROR("%s(), error attaching cable!\n", __func__);
+		goto err;
+	}
+
+	return 0;
+err:
+	/* Undo the "initialised" mark so a later open can try again */
+	clear_bit(ASYNC_B_INITIALIZED, &self->flags);
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_block_til_ready (self, filp)
+ *
+ *    Sleep until the IrCOMM link is usable for this open. Returns 0 at
+ *    once for O_NONBLOCK opens or when TTY_IO_ERROR is set on the tty.
+ *    Otherwise waits on self->open_wait until the port is not closing,
+ *    the state is IRCOMM_TTY_READY and either CLOCAL is set or the peer
+ *    reports carrier. Returns 0 on success, -EAGAIN or -ERESTARTSYS when
+ *    the tty was hung up or the instance is no longer initialised, and
+ *    -ERESTARTSYS when a signal is pending.
+ */
+static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
+				      struct file *filp)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int		retval;
+	int		do_clocal = 0, extra_count = 0;
+	unsigned long	flags;
+	struct tty_struct *tty;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	tty = self->tty;
+
+	/*
+	 * If non-blocking mode is set, or the port is not enabled,
+	 * then make the check up front and then exit.
+	 */
+	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
+		/* nonblock mode is set or port is not enabled */
+		self->flags |= ASYNC_NORMAL_ACTIVE;
+		IRDA_DEBUG(1, "%s(), O_NONBLOCK requested!\n", __func__ );
+		return 0;
+	}
+
+	if (tty->termios->c_cflag & CLOCAL) {
+		IRDA_DEBUG(1, "%s(), doing CLOCAL!\n", __func__ );
+		do_clocal = 1;
+	}
+
+	/* Wait for carrier detect and the line to become
+	 * free (i.e., not in use by the callout).  While we are in
+	 * this loop, self->open_count is dropped by one, so that
+	 * ircomm_tty_close() knows when to free things.  We restore it
+	 * upon exit, either normal or abnormal.
+	 */
+
+	retval = 0;
+	add_wait_queue(&self->open_wait, &wait);
+
+	IRDA_DEBUG(2, "%s(%d):block_til_ready before block on %s open_count=%d\n",
+	      __FILE__,__LINE__, tty->driver->name, self->open_count );
+
+	/* As far as I can see, we protect open_count - Jean II */
+	spin_lock_irqsave(&self->spinlock, flags);
+	if (!tty_hung_up_p(filp)) {
+		extra_count = 1;
+		self->open_count--;
+	}
+	spin_unlock_irqrestore(&self->spinlock, flags);
+	self->blocked_open++;
+
+	while (1) {
+		if (tty->termios->c_cflag & CBAUD) {
+			/* Here, we use to lock those two guys, but
+			 * as ircomm_param_request() does it itself,
+			 * I don't see the point (and I see the deadlock).
+			 * Jean II */
+			self->settings.dte |= IRCOMM_RTS + IRCOMM_DTR;
+
+			ircomm_param_request(self, IRCOMM_DTE, TRUE);
+		}
+
+		/*
+		 * set_current_state() implies a memory barrier, so the
+		 * condition checks below cannot be reordered before the
+		 * state change. A plain store to current->state could let
+		 * a wake-up arriving in between get lost.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (tty_hung_up_p(filp) ||
+		    !test_bit(ASYNC_B_INITIALIZED, &self->flags)) {
+			retval = (self->flags & ASYNC_HUP_NOTIFY) ?
+					-EAGAIN : -ERESTARTSYS;
+			break;
+		}
+
+		/*
+		 * Check if link is ready now. Even if CLOCAL is
+		 * specified, we cannot return before the IrCOMM link is
+		 * ready
+		 */
+		if (!test_bit(ASYNC_B_CLOSING, &self->flags) &&
+		    (do_clocal || (self->settings.dce & IRCOMM_CD)) &&
+		    self->state == IRCOMM_TTY_READY)
+		{
+			break;
+		}
+
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+
+		IRDA_DEBUG(1, "%s(%d):block_til_ready blocking on %s open_count=%d\n",
+		      __FILE__,__LINE__, tty->driver->name, self->open_count );
+
+		schedule();
+	}
+
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&self->open_wait, &wait);
+
+	if (extra_count) {
+		/* ++ is not atomic, so this should be protected - Jean II */
+		spin_lock_irqsave(&self->spinlock, flags);
+		self->open_count++;
+		spin_unlock_irqrestore(&self->spinlock, flags);
+	}
+	self->blocked_open--;
+
+	IRDA_DEBUG(1, "%s(%d):block_til_ready after blocking on %s open_count=%d\n",
+	      __FILE__,__LINE__, tty->driver->name, self->open_count);
+
+	if (!retval)
+		self->flags |= ASYNC_NORMAL_ACTIVE;
+
+	return retval;
+}
+
+/*
+ * Function ircomm_tty_open (tty, filp)
+ *
+ *    This routine is called when a particular tty device is opened. This
+ *    routine is mandatory; if this routine is not filled in, the attempted
+ *    open will fail with ENODEV.
+ *
+ *    Finds the instance for tty->index in the ircomm_tty hashbin, creating
+ *    and inserting it on the first open, then counts the open, starts the
+ *    instance up and blocks until the link is ready. Returns 0 on success,
+ *    -ENODEV for a line outside IRCOMM_TTY_PORTS, -ENOMEM when the
+ *    instance cannot be allocated, -ERESTARTSYS or -EAGAIN when the port
+ *    is hung up or closing, or the error from ircomm_tty_startup() or
+ *    ircomm_tty_block_til_ready().
+ */
+static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
+{
+	struct ircomm_tty_cb *self;
+	unsigned int line;
+	unsigned long	flags;
+	int ret;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	line = tty->index;
+	if (line >= IRCOMM_TTY_PORTS)
+		return -ENODEV;
+
+	/* Check if instance already exists */
+	self = hashbin_lock_find(ircomm_tty, line, NULL);
+	if (!self) {
+		/* No, so make new instance */
+		self = kzalloc(sizeof(struct ircomm_tty_cb), GFP_KERNEL);
+		if (self == NULL) {
+			IRDA_ERROR("%s(), kmalloc failed!\n", __func__);
+			return -ENOMEM;
+		}
+
+		self->magic = IRCOMM_TTY_MAGIC;
+		self->flow = FLOW_STOP;
+
+		self->line = line;
+		INIT_WORK(&self->tqueue, ircomm_tty_do_softint);
+		/* Placeholder sizes; ircomm_tty_write() checks for the
+		 * header value to detect an unfinished setup */
+		self->max_header_size = IRCOMM_TTY_HDR_UNINITIALISED;
+		self->max_data_size = IRCOMM_TTY_DATA_UNINITIALISED;
+		self->close_delay = 5*HZ/10;
+		self->closing_wait = 30*HZ;
+
+		/* Init some important stuff */
+		init_timer(&self->watchdog_timer);
+		init_waitqueue_head(&self->open_wait);
+		init_waitqueue_head(&self->close_wait);
+		spin_lock_init(&self->spinlock);
+
+		/*
+		 * Force TTY into raw mode by default which is usually what
+		 * we want for IrCOMM and IrLPT. This way applications will
+		 * not have to twiddle with printcap etc.
+		 *
+		 * Note this is completely unsafe and doesn't work properly
+		 */
+		tty->termios->c_iflag = 0;
+		tty->termios->c_oflag = 0;
+
+		/* Insert into hash */
+		hashbin_insert(ircomm_tty, (irda_queue_t *) self, line, NULL);
+	}
+	/* ++ is not atomic, so this should be protected - Jean II */
+	spin_lock_irqsave(&self->spinlock, flags);
+	self->open_count++;
+
+	tty->driver_data = self;
+	self->tty = tty;
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	IRDA_DEBUG(1, "%s(), %s%d, count = %d\n", __func__ , tty->driver->name,
+		   self->line, self->open_count);
+
+	/* Not really used by us, but lets do it anyway */
+	self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+
+	/*
+	 * If the port is the middle of closing, bail out now
+	 */
+	if (tty_hung_up_p(filp) ||
+	    test_bit(ASYNC_B_CLOSING, &self->flags)) {
+
+		/* Hm, why are we blocking on ASYNC_CLOSING if we
+		 * do return -EAGAIN/-ERESTARTSYS below anyway?
+		 * IMHO it's either not needed in the first place
+		 * or for some reason we need to make sure the async
+		 * closing has been finished - if so, wouldn't we
+		 * probably better sleep uninterruptible?
+		 */
+
+		if (wait_event_interruptible(self->close_wait, !test_bit(ASYNC_B_CLOSING, &self->flags))) {
+			IRDA_WARNING("%s - got signal while blocking on ASYNC_CLOSING!\n",
+				     __func__);
+			return -ERESTARTSYS;
+		}
+
+#ifdef SERIAL_DO_RESTART
+		return (self->flags & ASYNC_HUP_NOTIFY) ?
+			-EAGAIN : -ERESTARTSYS;
+#else
+		return -EAGAIN;
+#endif
+	}
+
+	/* Check if this is a "normal" ircomm device, or an irlpt device */
+	if (line < 0x10) {
+		self->service_type = IRCOMM_3_WIRE | IRCOMM_9_WIRE;
+		self->settings.service_type = IRCOMM_9_WIRE; /* 9 wire as default */
+		/* Jan Kiszka -> add DSR/RI -> Conform to IrCOMM spec */
+		self->settings.dce = IRCOMM_CTS | IRCOMM_CD | IRCOMM_DSR | IRCOMM_RI; /* Default line settings */
+		IRDA_DEBUG(2, "%s(), IrCOMM device\n", __func__ );
+	} else {
+		IRDA_DEBUG(2, "%s(), IrLPT device\n", __func__ );
+		self->service_type = IRCOMM_3_WIRE_RAW;
+		self->settings.service_type = IRCOMM_3_WIRE_RAW; /* Default */
+	}
+
+	ret = ircomm_tty_startup(self);
+	if (ret)
+		return ret;
+
+	ret = ircomm_tty_block_til_ready(self, filp);
+	if (ret) {
+		IRDA_DEBUG(2,
+		      "%s(), returning after block_til_ready with %d\n", __func__ ,
+		      ret);
+
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_close (tty, filp)
+ *
+ *    This routine is called when a particular tty device is closed.
+ *
+ *    Drops one open reference. Only the last close marks the port as
+ *    closing, waits for pending output, shuts the instance down, flushes
+ *    the buffers, detaches the tty and wakes up anybody waiting on
+ *    open_wait or close_wait. Nothing is done for a hung up file.
+ */
+static void ircomm_tty_close(struct tty_struct *tty, struct file *filp)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned long flags;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	spin_lock_irqsave(&self->spinlock, flags);
+
+	if (tty_hung_up_p(filp)) {
+		spin_unlock_irqrestore(&self->spinlock, flags);
+
+		IRDA_DEBUG(0, "%s(), returning 1\n", __func__ );
+		return;
+	}
+
+	if ((tty->count == 1) && (self->open_count != 1)) {
+		/*
+		 * Uh, oh.  tty->count is 1, which means that the tty
+		 * structure will be freed.  state->count should always
+		 * be one in these conditions.  If it's greater than
+		 * one, we've got real problems, since it means the
+		 * serial port won't be shutdown.
+		 */
+		IRDA_DEBUG(0, "%s(), bad serial port count; "
+			   "tty->count is 1, state->count is %d\n", __func__ ,
+			   self->open_count);
+		self->open_count = 1;
+	}
+
+	if (--self->open_count < 0) {
+		IRDA_ERROR("%s(), bad serial port count for ttys%d: %d\n",
+			   __func__, self->line, self->open_count);
+		self->open_count = 0;
+	}
+	/* Other openers remain, so leave the port running */
+	if (self->open_count) {
+		spin_unlock_irqrestore(&self->spinlock, flags);
+
+		IRDA_DEBUG(0, "%s(), open count > 0\n", __func__ );
+		return;
+	}
+
+	/* Hum... Should be test_and_set_bit ??? - Jean II */
+	set_bit(ASYNC_B_CLOSING, &self->flags);
+
+	/* We need to unlock here (we were unlocking at the end of this
+	 * function), because tty_wait_until_sent() may schedule.
+	 * I don't know if the rest should be protected somehow,
+	 * so someone should check. - Jean II */
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	/*
+	 * Now we wait for the transmit buffer to clear; and we notify
+	 * the line discipline to only process XON/XOFF characters.
+	 */
+	tty->closing = 1;
+	if (self->closing_wait != ASYNC_CLOSING_WAIT_NONE)
+		tty_wait_until_sent(tty, self->closing_wait);
+
+	ircomm_tty_shutdown(self);
+
+	tty_driver_flush_buffer(tty);
+	tty_ldisc_flush(tty);
+
+	tty->closing = 0;
+	self->tty = NULL;
+
+	/* Give openers sleeping in block_til_ready a chance to proceed */
+	if (self->blocked_open) {
+		if (self->close_delay)
+			schedule_timeout_interruptible(self->close_delay);
+		wake_up_interruptible(&self->open_wait);
+	}
+
+	/* Closing is over; release openers waiting on close_wait */
+	self->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
+	wake_up_interruptible(&self->close_wait);
+}
+
+/*
+ * Function ircomm_tty_flush_buffer (tty)
+ *
+ *    Does not touch the buffers itself; it only queues self->tqueue so
+ *    that ircomm_tty_do_softint() deals with them.
+ */
+static void ircomm_tty_flush_buffer(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/*
+	 * Let do_softint() do this to avoid race condition with
+	 * do_softint() ;-)
+	 */
+	schedule_work(&self->tqueue);
+}
+
+/*
+ * Function ircomm_tty_do_softint (work)
+ *
+ *    We use this routine to give the write wakeup to the user at a
+ *    safe time (as fast as possible after write have completed). This
+ *    can be compared to the Tx interrupt.
+ *
+ *    Work handler for self->tqueue: sends the pending control skb, then,
+ *    unless the tty is hw_stopped, submits the pending transmit skb and
+ *    wakes up writers. Does nothing once the tty has been detached.
+ */
+static void ircomm_tty_do_softint(struct work_struct *work)
+{
+	struct ircomm_tty_cb *self =
+		container_of(work, struct ircomm_tty_cb, tqueue);
+	struct tty_struct *tty;
+	unsigned long flags;
+	struct sk_buff *skb, *ctrl_skb;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (!self || self->magic != IRCOMM_TTY_MAGIC)
+		return;
+
+	tty = self->tty;
+	if (!tty)
+		return;
+
+	/* Unlink control buffer */
+	spin_lock_irqsave(&self->spinlock, flags);
+
+	ctrl_skb = self->ctrl_skb;
+	self->ctrl_skb = NULL;
+
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	/* Flush control buffer if any */
+	if(ctrl_skb) {
+		/* Only sent while flow is started; dropped otherwise */
+		if(self->flow == FLOW_START)
+			ircomm_control_request(self->ircomm, ctrl_skb);
+		/* Drop reference count - see ircomm_ttp_data_request(). */
+		dev_kfree_skb(ctrl_skb);
+	}
+
+	/* Data stays queued in tx_skb while the hardware is stopped */
+	if (tty->hw_stopped)
+		return;
+
+	/* Unlink transmit buffer */
+	spin_lock_irqsave(&self->spinlock, flags);
+
+	skb = self->tx_skb;
+	self->tx_skb = NULL;
+
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	/* Flush transmit buffer if any */
+	if (skb) {
+		ircomm_tty_do_event(self, IRCOMM_TTY_DATA_REQUEST, skb, NULL);
+		/* Drop reference count - see ircomm_ttp_data_request(). */
+		dev_kfree_skb(skb);
+	}
+
+	/* Check if user (still) wants to be waken up */
+	tty_wakeup(tty);
+}
+
+/*
+ * Function ircomm_tty_write (tty, buf, count)
+ *
+ *    This routine is called by the kernel to write a series of characters
+ *    to the tty device. The characters may come from user space or kernel
+ *    space. This routine will return the number of characters actually
+ *    accepted for writing. This routine is mandatory.
+ *
+ *    Data is appended to self->tx_skb (allocated on demand) and sent
+ *    later by the work queue. Returns the number of bytes copied, 0 when
+ *    count is below 1, -ENOBUFS when no skb can be allocated and -1 when
+ *    an assertion fails.
+ */
+static int ircomm_tty_write(struct tty_struct *tty,
+			    const unsigned char *buf, int count)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned long flags;
+	struct sk_buff *skb;
+	int tailroom = 0;
+	int len = 0;
+	int size;
+
+	IRDA_DEBUG(2, "%s(), count=%d, hw_stopped=%d\n", __func__ , count,
+		   tty->hw_stopped);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	/* We may receive packets from the TTY even before we have finished
+	 * our setup. Not cool.
+	 * The problem is that we don't know the final header and data size
+	 * to create the proper skb, so any skb we would create would have
+	 * bogus header and data size, so need care.
+	 * We use a bogus header size to safely detect this condition.
+	 * Another problem is that hw_stopped was set to 0 way before it
+	 * should be, so we would drop this skb. It should now be fixed.
+	 * One option is to not accept data until we are properly setup.
+	 * But, I suspect that when it happens, the ppp line discipline
+	 * just "drops" the data, which might screw up connect scripts.
+	 * The second option is to create a "safe skb", with large header
+	 * and small size (see ircomm_tty_open() for values).
+	 * We just need to make sure that when the real values get filled,
+	 * we don't mess up the original "safe skb" (see tx_data_size).
+	 * Jean II */
+	if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED) {
+		IRDA_DEBUG(1, "%s() : not initialised\n", __func__);
+#ifdef IRCOMM_NO_TX_BEFORE_INIT
+		/* We didn't consume anything, TTY will retry */
+		return 0;
+#endif
+	}
+
+	if (count < 1)
+		return 0;
+
+	/* Protect our manipulation of self->tx_skb and related */
+	spin_lock_irqsave(&self->spinlock, flags);
+
+	/* Fetch current transmit buffer */
+	skb = self->tx_skb;
+
+	/*
+	 * Send out all the data we get, possibly as multiple fragmented
+	 * frames, but this will only happen if the data is larger than the
+	 * max data size. The normal case however is just the opposite, and
+	 * this function may be called multiple times, and will then actually
+	 * defragment the data and send it out as one packet as soon as
+	 * possible, but at a safer point in time
+	 */
+	while (count) {
+		size = count;
+
+		/* Adjust data size to the max data size */
+		if (size > self->max_data_size)
+			size = self->max_data_size;
+
+		/*
+		 * Do we already have a buffer ready for transmit, or do
+		 * we need to allocate a new frame
+		 */
+		if (skb) {
+			/*
+			 * Any room for more data at the end of the current
+			 * transmit buffer? Cannot use skb_tailroom, since
+			 * dev_alloc_skb gives us a larger skb than we
+			 * requested
+			 * Note : use tx_data_size, because max_data_size
+			 * may have changed and we don't want to overwrite
+			 * the skb. - Jean II
+			 */
+			if ((tailroom = (self->tx_data_size - skb->len)) > 0) {
+				/* Adjust data to tailroom */
+				if (size > tailroom)
+					size = tailroom;
+			} else {
+				/*
+				 * Current transmit frame is full, so break
+				 * out, so we can send it as soon as possible
+				 */
+				break;
+			}
+		} else {
+			/* Prepare a full sized frame */
+			skb = alloc_skb(self->max_data_size+
+					self->max_header_size,
+					GFP_ATOMIC);
+			if (!skb) {
+				spin_unlock_irqrestore(&self->spinlock, flags);
+				return -ENOBUFS;
+			}
+			/* Leave headroom for the headers added further down */
+			skb_reserve(skb, self->max_header_size);
+			self->tx_skb = skb;
+			/* Remember skb size because max_data_size may
+			 * change later on - Jean II */
+			self->tx_data_size = self->max_data_size;
+		}
+
+		/* Copy data */
+		memcpy(skb_put(skb,size), buf + len, size);
+
+		count -= size;
+		len += size;
+	}
+
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	/*
+	 * Schedule a new thread which will transmit the frame as soon
+	 * as possible, but at a safe point in time. We do this so the
+	 * "user" can give us data multiple times, as PPP does (because of
+	 * its 256 byte tx buffer). We will then defragment and send out
+	 * all this data as one single packet.
+	 */
+	schedule_work(&self->tqueue);
+
+	return len;
+}
+
+/*
+ * Function ircomm_tty_write_room (tty)
+ *
+ *    Report how many characters the driver is currently willing to queue:
+ *    nothing while the tty is hw_stopped, otherwise the space left in the
+ *    pending transmit skb, or a whole frame (max_data_size) when no skb
+ *    is pending. Returns -1 when an assertion fails.
+ */
+static int ircomm_tty_write_room(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned long flags;
+	int ret = 0;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+#ifdef IRCOMM_NO_TX_BEFORE_INIT
+	/* An uninitialised header size means the channel is not set up
+	 * yet, so refuse data for now */
+	if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED)
+		return 0;
+#endif
+
+	/* hw_stopped is the regular flow control: no room while it is set.
+	 * Jean II */
+	if (!tty->hw_stopped) {
+		spin_lock_irqsave(&self->spinlock, flags);
+		ret = self->tx_skb ? self->tx_data_size - self->tx_skb->len
+				   : self->max_data_size;
+		spin_unlock_irqrestore(&self->spinlock, flags);
+	}
+	IRDA_DEBUG(2, "%s(), ret=%d\n", __func__ , ret);
+
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_wait_until_sent (tty, timeout)
+ *
+ *    This routine waits until the device has written out all of the
+ *    characters in its transmitter FIFO.
+ *
+ *    Polls self->tx_skb, sleeping between checks, until it is gone or
+ *    empty, a signal is pending, or (for a non-zero timeout) more than
+ *    timeout jiffies have passed.
+ */
+static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned long orig_jiffies, poll_time;
+	unsigned long flags;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	orig_jiffies = jiffies;
+
+	/* Set poll time to 200 ms */
+	/* NOTE(review): a timeout of 0 presumably makes poll_time 0 as well,
+	 * so the loop below would poll without really sleeping - confirm
+	 * what callers pass */
+	poll_time = IRDA_MIN(timeout, msecs_to_jiffies(200));
+
+	spin_lock_irqsave(&self->spinlock, flags);
+	while (self->tx_skb && self->tx_skb->len) {
+		/* Never sleep with the spinlock held */
+		spin_unlock_irqrestore(&self->spinlock, flags);
+		schedule_timeout_interruptible(poll_time);
+		spin_lock_irqsave(&self->spinlock, flags);
+		if (signal_pending(current))
+			break;
+		if (timeout && time_after(jiffies, orig_jiffies + timeout))
+			break;
+	}
+	spin_unlock_irqrestore(&self->spinlock, flags);
+	current->state = TASK_RUNNING;
+}
+
+/*
+ * Function ircomm_tty_throttle (tty)
+ *
+ *    This routine notifies the tty driver that input buffers for the line
+ *    discipline are close to full, and it should somehow signal that no
+ *    more characters should be sent to the tty.
+ *
+ *    Sends the STOP character when IXOFF is set, drops RTS towards the
+ *    peer when CRTSCTS is set, and always requests FLOW_STOP from IrCOMM.
+ */
+static void ircomm_tty_throttle(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/* Software flow control? */
+	if (I_IXOFF(tty))
+		ircomm_tty_send_xchar(tty, STOP_CHAR(tty));
+
+	/* Hardware flow control? */
+	if (tty->termios->c_cflag & CRTSCTS) {
+		/* Clear RTS and flag the change in the DTE settings */
+		self->settings.dte &= ~IRCOMM_RTS;
+		self->settings.dte |= IRCOMM_DELTA_RTS;
+
+		ircomm_param_request(self, IRCOMM_DTE, TRUE);
+	}
+
+	ircomm_flow_request(self->ircomm, FLOW_STOP);
+}
+
+/*
+ * Function ircomm_tty_unthrottle (tty)
+ *
+ *    This routine notifies the tty driver that it should signal that
+ *    characters can now be sent to the tty without fear of overrunning the
+ *    input buffers of the line disciplines.
+ *
+ *    Sends the START character when IXOFF is set, raises RTS towards the
+ *    peer when CRTSCTS is set, and always requests FLOW_START from IrCOMM.
+ */
+static void ircomm_tty_unthrottle(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/* Using software flow control? */
+	if (I_IXOFF(tty)) {
+		ircomm_tty_send_xchar(tty, START_CHAR(tty));
+	}
+
+	/* Using hardware flow control? */
+	if (tty->termios->c_cflag & CRTSCTS) {
+		/* Set RTS and flag the change in the DTE settings */
+		self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS);
+
+		ircomm_param_request(self, IRCOMM_DTE, TRUE);
+		IRDA_DEBUG(1, "%s(), FLOW_START\n", __func__ );
+	}
+	ircomm_flow_request(self->ircomm, FLOW_START);
+}
+
+/*
+ * Function ircomm_tty_chars_in_buffer (tty)
+ *
+ *    Return the number of bytes sitting in the pending transmit skb, or
+ *    0 when there is none. Returns -1 when an assertion fails.
+ */
+static int ircomm_tty_chars_in_buffer(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned long flags;
+	int len;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	/* tx_skb is read under the spinlock, as everywhere else */
+	spin_lock_irqsave(&self->spinlock, flags);
+	len = self->tx_skb ? self->tx_skb->len : 0;
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	return len;
+}
+
+/*
+ * Function ircomm_tty_shutdown (self)
+ *
+ *    Undo ircomm_tty_startup(): detach the cable, stop the watchdog
+ *    timer, free the pending control and transmit skbs and close the
+ *    IrCOMM instance. Does nothing unless ASYNC_B_INITIALIZED was set;
+ *    the bit is cleared atomically here.
+ */
+static void ircomm_tty_shutdown(struct ircomm_tty_cb *self)
+{
+	unsigned long flags;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	/* Already shut down, or never started */
+	if (!test_and_clear_bit(ASYNC_B_INITIALIZED, &self->flags))
+		return;
+
+	ircomm_tty_detach_cable(self);
+
+	spin_lock_irqsave(&self->spinlock, flags);
+
+	del_timer(&self->watchdog_timer);
+
+	/* Free parameter buffer */
+	if (self->ctrl_skb) {
+		dev_kfree_skb(self->ctrl_skb);
+		self->ctrl_skb = NULL;
+	}
+
+	/* Free transmit buffer */
+	if (self->tx_skb) {
+		dev_kfree_skb(self->tx_skb);
+		self->tx_skb = NULL;
+	}
+
+	/* Close the IrCOMM instance opened by ircomm_tty_startup() */
+	if (self->ircomm) {
+		ircomm_close(self->ircomm);
+		self->ircomm = NULL;
+	}
+
+	spin_unlock_irqrestore(&self->spinlock, flags);
+}
+
+/*
+ * Function ircomm_tty_hangup (tty)
+ *
+ *    This routine notifies the tty driver that it should hangup the tty
+ *    device.
+ *
+ *    Shuts the instance down, detaches the tty, resets the open count
+ *    and wakes up anybody sleeping on open_wait.
+ */
+static void ircomm_tty_hangup(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned long	flags;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/* ircomm_tty_flush_buffer(tty); */
+	ircomm_tty_shutdown(self);
+
+	/* I guess we need to lock here - Jean II */
+	spin_lock_irqsave(&self->spinlock, flags);
+	self->flags &= ~ASYNC_NORMAL_ACTIVE;
+	self->tty = NULL;
+	self->open_count = 0;
+	spin_unlock_irqrestore(&self->spinlock, flags);
+
+	/* Let openers in block_til_ready notice the hangup */
+	wake_up_interruptible(&self->open_wait);
+}
+
+/*
+ * Function ircomm_tty_send_xchar (tty, ch)
+ *
+ *    This routine is used to send a high-priority XON/XOFF character to
+ *    the device.
+ *
+ *    Not implemented: it only emits a debug message and ch is ignored.
+ */
+static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch)
+{
+	IRDA_DEBUG(0, "%s(), not impl\n", __func__ );
+}
+
+/*
+ * Function ircomm_tty_start (tty)
+ *
+ *    This routine notifies the tty driver that it should resume sending
+ *    characters to the tty device.
+ *
+ *    Requests FLOW_START from the IrCOMM instance. The instance pointer
+ *    is validated first, the same way ircomm_tty_stop() does it.
+ */
+void ircomm_tty_start(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+	/* Do not dereference a missing or stale instance */
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	ircomm_flow_request(self->ircomm, FLOW_START);
+}
+
+/*
+ * Function ircomm_tty_stop (tty)
+ *
+ *     This routine notifies the tty driver that it should stop outputting
+ *     characters to the tty device.
+ *
+ *     Requests FLOW_STOP from the IrCOMM instance after validating the
+ *     instance pointer.
+ */
+static void ircomm_tty_stop(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	ircomm_flow_request(self->ircomm, FLOW_STOP);
+}
+
+/*
+ * Function ircomm_tty_check_modem_status (self)
+ *
+ *    Check for any changes in the DCE's line settings. This function should
+ *    be called whenever the dce parameter settings changes, to update the
+ *    flow control settings and other things
+ *
+ *    On a carrier change with ASYNC_CHECK_CD set, openers waiting on
+ *    open_wait are woken (carrier on) or the tty is hung up (carrier off).
+ *    With ASYNC_CTS_FLOW set and a tty attached, tty->hw_stopped follows
+ *    the CTS bit and transmission is restarted through the work queue
+ *    when CTS comes back.
+ */
+void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self)
+{
+	struct tty_struct *tty;
+	int status;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/* close and hangup reset self->tty, so it may be NULL here */
+	tty = self->tty;
+
+	status = self->settings.dce;
+
+	if (status & IRCOMM_DCE_DELTA_ANY) {
+		/*wake_up_interruptible(&self->delta_msr_wait);*/
+	}
+	if ((self->flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) {
+		IRDA_DEBUG(2,
+			   "%s(), ircomm%d CD now %s...\n", __func__ , self->line,
+			   (status & IRCOMM_CD) ? "on" : "off");
+
+		if (status & IRCOMM_CD) {
+			wake_up_interruptible(&self->open_wait);
+		} else {
+			IRDA_DEBUG(2,
+				   "%s(), Doing serial hangup..\n", __func__ );
+			if (tty)
+				tty_hangup(tty);
+
+			/* Hangup will remove the tty, so better break out */
+			return;
+		}
+	}
+
+	/* The CTS flow state lives in the tty: nothing to update without one */
+	if (tty && (self->flags & ASYNC_CTS_FLOW)) {
+		if (tty->hw_stopped) {
+			if (status & IRCOMM_CTS) {
+				IRDA_DEBUG(2,
+					   "%s(), CTS tx start...\n", __func__ );
+				tty->hw_stopped = 0;
+
+				/* Wake up processes blocked on open */
+				wake_up_interruptible(&self->open_wait);
+
+				/* Restart transmission of queued data */
+				schedule_work(&self->tqueue);
+				return;
+			}
+		} else {
+			if (!(status & IRCOMM_CTS)) {
+				IRDA_DEBUG(2,
+					   "%s(), CTS tx stop...\n", __func__ );
+				tty->hw_stopped = 1;
+			}
+		}
+	}
+}
+
+/*
+ * Function ircomm_tty_data_indication (instance, sap, skb)
+ *
+ *    Handle incoming data, and deliver it to the line discipline.
+ *    Data that arrives while no tty is attached is ignored (returns 0).
+ */
+static int ircomm_tty_data_indication(void *instance, void *sap,
+				      struct sk_buff *skb)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	if (!self->tty) {
+		IRDA_DEBUG(0, "%s(), no tty!\n", __func__ );
+		return 0;
+	}
+
+	/*
+	 * If we receive data when hardware is stopped then something is wrong.
+	 * We try to poll the peer's line settings to check if we are up to date.
+	 * Devices like WinCE can do this, and since they don't send any
+	 * params, we can just as well declare the hardware as running.
+	 */
+	if (self->tty->hw_stopped && (self->flow == FLOW_START)) {
+		IRDA_DEBUG(0, "%s(), polling for line settings!\n", __func__ );
+		ircomm_param_request(self, IRCOMM_POLL, TRUE);
+
+		/* We can just as well declare the hardware as running */
+		ircomm_tty_send_initial_parameters(self);
+		ircomm_tty_link_established(self);
+	}
+
+	/*
+	 * Use flip buffer functions since the code may be called from interrupt
+	 * context
+	 */
+	tty_insert_flip_string(self->tty, skb->data, skb->len);
+	tty_flip_buffer_push(self->tty);
+
+	/* No need to kfree_skb - see ircomm_ttp_data_indication() */
+
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_control_indication (instance, sap, skb)
+ *
+ *    Parse all incoming parameters (easy!). The first byte of the frame
+ *    is taken as the length of the parameter list that follows it.
+ */
+static int ircomm_tty_control_indication(void *instance, void *sap,
+					 struct sk_buff *skb)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+	int clen;	/* parameter length announced by the frame */
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	clen = skb->data[0];
+	/* Never parse more than what follows the length byte in the skb */
+	irda_param_extract_all(self, skb->data+1, IRDA_MIN(skb->len-1, clen),
+			       &ircomm_param_info);
+
+	/* No need to kfree_skb - see ircomm_control_indication() */
+
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_flow_indication (instance, sap, cmd)
+ *
+ *    Called by IrTTP when it wants us to stop or resume transmitting data.
+ *    We set tty->hw_stopped accordingly, schedule the tqueue work on start,
+ *    and remember cmd in self->flow. NOTE(review): self->tty is not checked.
+ */
+static void ircomm_tty_flow_indication(void *instance, void *sap,
+				       LOCAL_FLOW cmd)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+	struct tty_struct *tty;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	tty = self->tty;
+
+	switch (cmd) {
+	case FLOW_START:
+		IRDA_DEBUG(2, "%s(), hw start!\n", __func__ );
+		tty->hw_stopped = 0;
+
+		/* ircomm_tty_do_softint will take care of the rest */
+		schedule_work(&self->tqueue);
+		break;
+	default:  /* If we get here, something is very wrong, better stop */
+	case FLOW_STOP:
+		IRDA_DEBUG(2, "%s(), hw stopped!\n", __func__ );
+		tty->hw_stopped = 1;
+		break;
+	}
+	self->flow = cmd;
+}
+
+#ifdef CONFIG_PROC_FS
+static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
+{
+	char sep;	/* ' ' before the first name in a list, then '|' */
+
+	/* Print a human-readable status report for one instance into m */
+	seq_printf(m, "State: %s\n", ircomm_tty_state[self->state]);
+	seq_puts(m, "Service type: ");
+	if (self->service_type & IRCOMM_9_WIRE)
+		seq_puts(m, "9_WIRE");
+	else if (self->service_type & IRCOMM_3_WIRE)
+		seq_puts(m, "3_WIRE");
+	else if (self->service_type & IRCOMM_3_WIRE_RAW)
+		seq_puts(m, "3_WIRE_RAW");
+	else
+		seq_puts(m, "No common service type!");
+	seq_putc(m, '\n');	/* single line terminator for every branch above */
+
+	seq_printf(m, "Port name: %s\n", self->settings.port_name);
+
+	seq_puts(m, "DTE status:");
+	sep = ' ';
+	if (self->settings.dte & IRCOMM_RTS) {
+		seq_printf(m, "%cRTS", sep);
+		sep = '|';
+	}
+	if (self->settings.dte & IRCOMM_DTR) {
+		seq_printf(m, "%cDTR", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_puts(m, "DCE status:");
+	sep = ' ';
+	if (self->settings.dce & IRCOMM_CTS) {
+		seq_printf(m, "%cCTS", sep);
+		sep = '|';
+	}
+	if (self->settings.dce & IRCOMM_DSR) {
+		seq_printf(m, "%cDSR", sep);
+		sep = '|';
+	}
+	if (self->settings.dce & IRCOMM_CD) {
+		seq_printf(m, "%cCD", sep);
+		sep = '|';
+	}
+	if (self->settings.dce & IRCOMM_RI) {
+		seq_printf(m, "%cRI", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_puts(m, "Configuration: ");
+	if (!self->settings.null_modem)
+		seq_puts(m, "DTE <-> DCE\n");
+	else
+		seq_puts(m, "DTE <-> DTE (null modem emulation)\n");
+
+	seq_printf(m, "Data rate: %d\n", self->settings.data_rate);
+
+	seq_puts(m, "Flow control:");
+	sep = ' ';
+	if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) {
+		seq_printf(m, "%cXON_XOFF_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) {
+		seq_printf(m, "%cXON_XOFF_OUT", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) {
+		seq_printf(m, "%cRTS_CTS_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_RTS_CTS_OUT) {
+		seq_printf(m, "%cRTS_CTS_OUT", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_DSR_DTR_IN) {
+		seq_printf(m, "%cDSR_DTR_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) {
+		seq_printf(m, "%cDSR_DTR_OUT", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) {
+		seq_printf(m, "%cENQ_ACK_IN", sep);
+		sep = '|';
+	}
+	if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) {
+		seq_printf(m, "%cENQ_ACK_OUT", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_puts(m, "Flags:");
+	sep = ' ';
+	if (self->flags & ASYNC_CTS_FLOW) {
+		seq_printf(m, "%cASYNC_CTS_FLOW", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_CHECK_CD) {
+		seq_printf(m, "%cASYNC_CHECK_CD", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_INITIALIZED) {
+		seq_printf(m, "%cASYNC_INITIALIZED", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_LOW_LATENCY) {
+		seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_CLOSING) {
+		seq_printf(m, "%cASYNC_CLOSING", sep);
+		sep = '|';
+	}
+	if (self->flags & ASYNC_NORMAL_ACTIVE) {
+		seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
+		sep = '|';
+	}
+	seq_putc(m, '\n');
+
+	seq_printf(m, "Role: %s\n", self->client ? "client" : "server");
+	seq_printf(m, "Open count: %d\n", self->open_count);
+	seq_printf(m, "Max data size: %d\n", self->max_data_size);
+	seq_printf(m, "Max header size: %d\n", self->max_header_size);
+
+	if (self->tty)
+		seq_printf(m, "Hardware: %s\n",
+			       self->tty->hw_stopped ? "Stopped" : "Running");
+}
+
+static int ircomm_tty_proc_show(struct seq_file *m, void *v)
+{
+	struct ircomm_tty_cb *cb;
+	unsigned long irq_flags;
+
+	/* Walk every instance in the hashbin while holding its lock */
+	spin_lock_irqsave(&ircomm_tty->hb_spinlock, irq_flags);
+	for (cb = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty);
+	     cb != NULL;
+	     cb = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty)) {
+		/* Stop at the first entry that fails the magic check */
+		if (cb->magic != IRCOMM_TTY_MAGIC)
+			break;
+		ircomm_tty_line_info(cb, m);
+	}
+	spin_unlock_irqrestore(&ircomm_tty->hb_spinlock, irq_flags);
+	return 0;
+}
+
+static int ircomm_tty_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ircomm_tty_proc_show, NULL);	/* no private data */
+}
+
+static const struct file_operations ircomm_tty_proc_fops = {	/* seq_file based */
+	.owner		= THIS_MODULE,
+	.open		= ircomm_tty_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,	/* pairs with single_open() in .open */
+};
+#endif /* CONFIG_PROC_FS */
+
+MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
+MODULE_DESCRIPTION("IrCOMM serial TTY driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CHARDEV_MAJOR(IRCOMM_TTY_MAJOR);
+/* Module load and unload entry points */
+module_init(ircomm_tty_init);
+module_exit(ircomm_tty_cleanup);
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
new file mode 100644
index 00000000..3c175402
--- /dev/null
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -0,0 +1,997 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_tty_attach.c
+ * Version:
+ * Description:   Code for attaching the serial driver to IrCOMM
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sat Jun  5 17:42:00 1999
+ * Modified at:   Tue Jan  4 14:20:49 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/sched.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/parameters.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_event.h>
+
+#include <net/irda/ircomm_tty.h>
+#include <net/irda/ircomm_tty_attach.h>
+
+static void ircomm_tty_ias_register(struct ircomm_tty_cb *self);
+static void ircomm_tty_discovery_indication(discinfo_t *discovery,
+					    DISCOVERY_MODE mode,
+					    void *priv);
+static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id,
+					struct ias_value *value, void *priv);
+static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self,
+					    int timeout);
+static void ircomm_tty_watchdog_timer_expired(void *data);
+
+static int ircomm_tty_state_idle(struct ircomm_tty_cb *self,
+				 IRCOMM_TTY_EVENT event,
+				 struct sk_buff *skb,
+				 struct ircomm_tty_info *info);
+static int ircomm_tty_state_search(struct ircomm_tty_cb *self,
+				   IRCOMM_TTY_EVENT event,
+				   struct sk_buff *skb,
+				   struct ircomm_tty_info *info);
+static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self,
+					     IRCOMM_TTY_EVENT event,
+					     struct sk_buff *skb,
+					     struct ircomm_tty_info *info);
+static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self,
+					   IRCOMM_TTY_EVENT event,
+					   struct sk_buff *skb,
+					   struct ircomm_tty_info *info);
+static int ircomm_tty_state_setup(struct ircomm_tty_cb *self,
+				  IRCOMM_TTY_EVENT event,
+				  struct sk_buff *skb,
+				  struct ircomm_tty_info *info);
+static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
+				  IRCOMM_TTY_EVENT event,
+				  struct sk_buff *skb,
+				  struct ircomm_tty_info *info);
+
+const char *const ircomm_tty_state[] = {	/* names, indexed by self->state */
+	"IRCOMM_TTY_IDLE",
+	"IRCOMM_TTY_SEARCH",
+	"IRCOMM_TTY_QUERY_PARAMETERS",
+	"IRCOMM_TTY_QUERY_LSAP_SEL",
+	"IRCOMM_TTY_SETUP",
+	"IRCOMM_TTY_READY",
+	"*** ERROR *** ",
+};
+
+#ifdef CONFIG_IRDA_DEBUG
+static const char *const ircomm_tty_event[] = {	/* names for debug output, indexed by event */
+	"IRCOMM_TTY_ATTACH_CABLE",
+	"IRCOMM_TTY_DETACH_CABLE",
+	"IRCOMM_TTY_DATA_REQUEST",
+	"IRCOMM_TTY_DATA_INDICATION",
+	"IRCOMM_TTY_DISCOVERY_REQUEST",
+	"IRCOMM_TTY_DISCOVERY_INDICATION",
+	"IRCOMM_TTY_CONNECT_CONFIRM",
+	"IRCOMM_TTY_CONNECT_INDICATION",
+	"IRCOMM_TTY_DISCONNECT_REQUEST",
+	"IRCOMM_TTY_DISCONNECT_INDICATION",
+	"IRCOMM_TTY_WD_TIMER_EXPIRED",
+	"IRCOMM_TTY_GOT_PARAMETERS",
+	"IRCOMM_TTY_GOT_LSAPSEL",
+	"*** ERROR ****",
+};
+#endif /* CONFIG_IRDA_DEBUG */
+
+static int (*state[])(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event,
+		      struct sk_buff *skb, struct ircomm_tty_info *info) =
+{	/* handlers called by ircomm_tty_do_event(), indexed by self->state */
+	ircomm_tty_state_idle,
+	ircomm_tty_state_search,
+	ircomm_tty_state_query_parameters,
+	ircomm_tty_state_query_lsap_sel,
+	ircomm_tty_state_setup,
+	ircomm_tty_state_ready,
+};
+
+/*
+ * Function ircomm_tty_attach_cable (driver)
+ *
+ *    Start attaching the cable (IrCOMM link): mark the tty as stopped,
+ *    register with LM-IAS and feed ATTACH_CABLE to the state machine.
+ *    Returns 0 once past the assertions, also when already connected.
+ */
+int ircomm_tty_attach_cable(struct ircomm_tty_cb *self)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	/* Check if somebody has already connected to us */
+	if (ircomm_is_connected(self->ircomm)) {
+		IRDA_DEBUG(0, "%s(), already connected!\n", __func__ );
+		return 0;
+	}
+
+	/* Make sure nobody tries to write before the link is up */
+	self->tty->hw_stopped = 1;
+
+	ircomm_tty_ias_register(self);
+
+	ircomm_tty_do_event(self, IRCOMM_TTY_ATTACH_CABLE, NULL, NULL);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_detach_cable (driver)
+ *
+ *    Detach cable, or cable has been detached by peer: drop our IrLMP,
+ *    IrIAP and LM-IAS registrations and clear addresses and settings.
+ */
+void ircomm_tty_detach_cable(struct ircomm_tty_cb *self)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	del_timer(&self->watchdog_timer);
+
+	/* Remove discovery handler */
+	if (self->ckey) {
+		irlmp_unregister_client(self->ckey);
+		self->ckey = NULL;
+	}
+	/* Remove IrCOMM hint bits */
+	if (self->skey) {
+		irlmp_unregister_service(self->skey);
+		self->skey = NULL;
+	}
+
+	if (self->iriap) {
+		iriap_close(self->iriap);
+		self->iriap = NULL;
+	}
+
+	/* Remove LM-IAS object */
+	if (self->obj) {
+		irias_delete_object(self->obj);
+		self->obj = NULL;
+	}
+
+	ircomm_tty_do_event(self, IRCOMM_TTY_DETACH_CABLE, NULL, NULL);
+
+	/* Reset some values */
+	self->daddr = self->saddr = 0;
+	self->dlsap_sel = self->slsap_sel = 0;
+
+	memset(&self->settings, 0, sizeof(struct ircomm_params));
+}
+
+/*
+ * Function ircomm_tty_ias_register (self)
+ *
+ *    Register hint bits, a discovery handler and (depending on our service
+ *    type) an IrLPT or IrCOMM LM-IAS object; each only if not yet present.
+ */
+static void ircomm_tty_ias_register(struct ircomm_tty_cb *self)
+{
+	__u8 oct_seq[6];	/* packed value of the "Parameters" attribute */
+	__u16 hints;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/* Compute hint bits based on service */
+	hints = irlmp_service_to_hint(S_COMM);
+	if (self->service_type & IRCOMM_3_WIRE_RAW)
+		hints |= irlmp_service_to_hint(S_PRINTER);
+
+	/* Advertise IrCOMM hint bit in discovery */
+	if (!self->skey)
+		self->skey = irlmp_register_service(hints);
+	/* Set up a discovery handler */
+	if (!self->ckey)
+		self->ckey = irlmp_register_client(hints,
+						   ircomm_tty_discovery_indication,
+						   NULL, (void *) self);
+
+	/* If already done, no need to do it again */
+	if (self->obj)
+		return;
+
+	if (self->service_type & IRCOMM_3_WIRE_RAW) {
+		/* Register IrLPT with LM-IAS */
+		self->obj = irias_new_object("IrLPT", IAS_IRLPT_ID);
+		irias_add_integer_attrib(self->obj, "IrDA:IrLMP:LsapSel",
+					 self->slsap_sel, IAS_KERNEL_ATTR);
+	} else {
+		/* Register IrCOMM with LM-IAS */
+		self->obj = irias_new_object("IrDA:IrCOMM", IAS_IRCOMM_ID);
+		irias_add_integer_attrib(self->obj, "IrDA:TinyTP:LsapSel",
+					 self->slsap_sel, IAS_KERNEL_ATTR);
+
+		/* Code the parameters into the buffer */
+		irda_param_pack(oct_seq, "bbbbbb",
+				IRCOMM_SERVICE_TYPE, 1, self->service_type,
+				IRCOMM_PORT_TYPE,    1, IRCOMM_SERIAL);
+
+		/* Register parameters with LM-IAS */
+		irias_add_octseq_attrib(self->obj, "Parameters", oct_seq, 6,
+					IAS_KERNEL_ATTR);
+	}
+	irias_insert_object(self->obj);
+}
+
+/*
+ * Function ircomm_tty_ias_unregister (self)
+ *
+ *    Remove our IAS object while connected. Removal of the client hook
+ *    (discovery handler) is compiled out below.
+ */
+static void ircomm_tty_ias_unregister(struct ircomm_tty_cb *self)
+{
+	/* Remove LM-IAS object now so it is not reused.
+	 * IrCOMM deals very poorly with multiple incoming connections.
+	 * It should look a lot more like IrNET, and "dup" a server TSAP
+	 * to the application TSAP (based on various rules).
+	 * This is a cheap workaround allowing multiple clients to
+	 * connect to us. It will not always work.
+	 * Each IrCOMM socket has an IAS entry. Incoming connection will
+	 * pick the first one found. So, when we are fully connected,
+	 * we remove our IAS entries so that the next IAS entry is used.
+	 * We do that for *both* client and server, because a server
+	 * can also create client instances.
+	 * Jean II */
+	if (self->obj) {
+		irias_delete_object(self->obj);
+		self->obj = NULL;
+	}
+
+#if 0
+	/* Remove discovery handler.
+	 * While we are connected, we no longer need to receive
+	 * discovery events. This would be the case if there are
+	 * multiple IrLAP interfaces. Jean II */
+	if (self->ckey) {
+		irlmp_unregister_client(self->ckey);
+		self->ckey = NULL;
+	}
+#endif
+}
+
+/*
+ * Function ircomm_tty_send_initial_parameters (self)
+ *
+ *    Send initial parameters to the remote IrCOMM device, before any data.
+ *    Nothing is sent for the 3-wire raw service. Returns 0.
+ */
+int ircomm_tty_send_initial_parameters(struct ircomm_tty_cb *self)
+{
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	if (self->service_type & IRCOMM_3_WIRE_RAW)
+		return 0;
+
+	/*
+	 * Set default values, but only if the application for some reason
+	 * hasn't set them already
+	 */
+	IRDA_DEBUG(2, "%s(), data-rate = %d\n", __func__ ,
+		   self->settings.data_rate);
+	if (!self->settings.data_rate)
+		self->settings.data_rate = 9600;
+	IRDA_DEBUG(2, "%s(), data-format = %d\n", __func__ ,
+		   self->settings.data_format);
+	if (!self->settings.data_format)
+		self->settings.data_format = IRCOMM_WSIZE_8;  /* 8N1 */
+
+	IRDA_DEBUG(2, "%s(), flow-control = %d\n", __func__ ,
+		   self->settings.flow_control);
+	/*self->settings.flow_control = IRCOMM_RTS_CTS_IN|IRCOMM_RTS_CTS_OUT;*/
+
+	/* Do not set delta values for the initial parameters */
+	self->settings.dte = IRCOMM_DTR | IRCOMM_RTS;
+
+	/* Only send service type parameter when we are the client */
+	if (self->client)
+		ircomm_param_request(self, IRCOMM_SERVICE_TYPE, FALSE);
+	ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
+	ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE);
+
+	/* For a 3 wire service, we just flush the last parameter and return */
+	if (self->settings.service_type == IRCOMM_3_WIRE) {
+		ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE);
+		return 0;
+	}
+
+	/* Only 9-wire service types continue here */
+	ircomm_param_request(self, IRCOMM_FLOW_CONTROL, FALSE);
+#if 0
+	ircomm_param_request(self, IRCOMM_XON_XOFF, FALSE);
+	ircomm_param_request(self, IRCOMM_ENQ_ACK, FALSE);
+#endif
+	/* Notify peer that we are ready to receive data */
+	ircomm_param_request(self, IRCOMM_DTE, TRUE);
+
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_discovery_indication (discovery)
+ *
+ *    A remote device was discovered: report its addresses to the state
+ *    machine, which queries the remote IAS to learn which device it is
+ *    and which services it has. Passive discoveries are ignored.
+ */
+static void ircomm_tty_discovery_indication(discinfo_t *discovery,
+					    DISCOVERY_MODE mode,
+					    void *priv)
+{
+	struct ircomm_tty_cb *cb = priv;
+	struct ircomm_tty_info found;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Important note :
+	 * All passive discoveries have to be dropped.
+	 * The LSAP management of IrComm is deficient and doesn't deal
+	 * with the case of two instances connecting to each other
+	 * simultaneously (it will deadlock in LMP).
+	 * The proper fix would be to use the same technique as in IrNET,
+	 * to have one server socket and separate instances for the
+	 * connecting/connected socket.
+	 * The workaround is to drop passive discovery, which drastically
+	 * reduces the probability of this happening.
+	 * Jean II */
+	if (mode == DISCOVERY_PASSIVE)
+		return;
+
+	/* Pass the addresses of the discovered peer to the state machine */
+	found.saddr = discovery->saddr;
+	found.daddr = discovery->daddr;
+
+	ircomm_tty_do_event(cb, IRCOMM_TTY_DISCOVERY_INDICATION,
+			    NULL, &found);
+}
+
+/*
+ * Function ircomm_tty_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    Link disconnected: stop control and data transfers and tell the
+ *    state machine. Nothing is done when no tty is attached.
+ */
+void ircomm_tty_disconnect_indication(void *instance, void *sap,
+				      LM_REASON reason,
+				      struct sk_buff *skb)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	if (!self->tty)
+		return;
+
+	/* This will stop control data transfers */
+	self->flow = FLOW_STOP;
+
+	/* Stop data transfers */
+	self->tty->hw_stopped = 1;
+
+	ircomm_tty_do_event(self, IRCOMM_TTY_DISCONNECT_INDICATION, NULL,
+			    NULL);
+}
+
+/*
+ * Function ircomm_tty_getvalue_confirm (result, obj_id, value, priv)
+ *
+ *    Got result from the IAS query we made: close the IAP connection and
+ *    turn an octet sequence / integer answer into a state machine event.
+ */
+static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id,
+					struct ias_value *value,
+					void *priv)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) priv;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	/* We probably don't need to make any more queries */
+	iriap_close(self->iriap);
+	self->iriap = NULL;
+
+	/* Check if request succeeded */
+	if (result != IAS_SUCCESS) {
+		IRDA_DEBUG(4, "%s(), got NULL value!\n", __func__ );
+		return;	/* NOTE(review): value is not released here; presumably NULL on failure */
+	}
+
+	switch (value->type) {
+	case IAS_OCT_SEQ:
+		IRDA_DEBUG(2, "%s(), got octet sequence\n", __func__ );
+
+		irda_param_extract_all(self, value->t.oct_seq, value->len,
+				       &ircomm_param_info);
+
+		ircomm_tty_do_event(self, IRCOMM_TTY_GOT_PARAMETERS, NULL,
+				    NULL);
+		break;
+	case IAS_INTEGER:
+		/* Got LSAP selector */
+		IRDA_DEBUG(2, "%s(), got lsapsel = %d\n", __func__ ,
+			   value->t.integer);
+
+		if (value->t.integer == -1) {
+			IRDA_DEBUG(0, "%s(), invalid value!\n", __func__ );
+		} else
+			self->dlsap_sel = value->t.integer;
+
+		ircomm_tty_do_event(self, IRCOMM_TTY_GOT_LSAPSEL, NULL, NULL);
+		break;
+	case IAS_MISSING:
+		IRDA_DEBUG(0, "%s(), got IAS_MISSING\n", __func__ );
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), got unknown type!\n", __func__ );
+		break;
+	}
+	irias_delete_value(value);	/* the answer is released on every path past the result check */
+}
+
+/*
+ * Function ircomm_tty_connect_confirm (instance, sap, qos, max_data_size,
+ *                                      max_header_size, skb)
+ *
+ *    Connection confirmed: we are the client side of this connection
+ */
+void ircomm_tty_connect_confirm(void *instance, void *sap,
+				struct qos_info *qos,
+				__u32 max_data_size,
+				__u8 max_header_size,
+				struct sk_buff *skb)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	self->client = TRUE;
+	self->max_data_size = max_data_size;
+	self->max_header_size = max_header_size;
+	self->flow = FLOW_START;
+
+	ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_CONFIRM, NULL, NULL);
+
+	/* No need to kfree_skb - see ircomm_ttp_connect_confirm() */
+}
+
+/*
+ * Function ircomm_tty_connect_indication (instance, sap, qos, max_data_size,
+ *                                         max_header_size, skb)
+ *
+ *    We are discovered and being requested to connect by a remote device:
+ *    act as server and parse any parameters carried in the request.
+ */
+void ircomm_tty_connect_indication(void *instance, void *sap,
+				   struct qos_info *qos,
+				   __u32 max_data_size,
+				   __u8 max_header_size,
+				   struct sk_buff *skb)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance;
+	int clen;	/* parameter length announced by the frame */
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	self->client = FALSE;
+	self->max_data_size = max_data_size;
+	self->max_header_size = max_header_size;
+	self->flow = FLOW_START;
+	/* Parameters start after the length byte: at most skb->len-1 bytes */
+	clen = skb->data[0];
+	if (clen)
+		irda_param_extract_all(self, skb->data+1,
+				       IRDA_MIN(skb->len-1, clen),
+				       &ircomm_param_info);
+
+	ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_INDICATION, NULL, NULL);
+
+	/* No need to kfree_skb - see ircomm_ttp_connect_indication() */
+}
+
+/*
+ * Function ircomm_tty_link_established (self)
+ *
+ *    Called when the IrCOMM link is established; does nothing when no
+ *    tty is attached.
+ */
+void ircomm_tty_link_established(struct ircomm_tty_cb *self)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	if (!self->tty)
+		return;
+
+	del_timer(&self->watchdog_timer);
+
+	/*
+	 * The IrCOMM link is up. With hardware flow-control enabled we
+	 * must wait until the peer device (DCE) raises the CTS line;
+	 * without it (or with CTS already raised) the hardware is declared
+	 * as running right away.
+	 */
+	if ((self->flags & ASYNC_CTS_FLOW) && !(self->settings.dce & IRCOMM_CTS)) {
+		IRDA_DEBUG(0, "%s(), waiting for CTS ...\n", __func__ );
+		return;
+	}
+
+	IRDA_DEBUG(1, "%s(), starting hardware!\n", __func__ );
+	self->tty->hw_stopped = 0;
+
+	/* Wake up processes blocked on open */
+	wake_up_interruptible(&self->open_wait);
+
+	/* Hand the rest over to the work queued on self->tqueue */
+	schedule_work(&self->tqueue);
+}
+
+/*
+ * Function ircomm_tty_start_watchdog_timer (self, timeout)
+ *
+ *    Start the watchdog timer, which guards a connection attempt: when it
+ *    expires, ircomm_tty_watchdog_timer_expired() is called with self.
+ *    Callers in this file pass the timeout as a multiple of HZ (3*HZ).
+ */
+static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self,
+					    int timeout)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	irda_start_timer(&self->watchdog_timer, timeout, (void *) self,
+			 ircomm_tty_watchdog_timer_expired);
+}
+
+/*
+ * Function ircomm_tty_watchdog_timer_expired (data)
+ *
+ *    Called when the connect procedure has taken too much time; reports
+ *    IRCOMM_TTY_WD_TIMER_EXPIRED to the state machine.
+ */
+static void ircomm_tty_watchdog_timer_expired(void *data)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	ircomm_tty_do_event(self, IRCOMM_TTY_WD_TIMER_EXPIRED, NULL, NULL);
+}
+
+
+/*
+ * Function ircomm_tty_do_event (self, event, skb, info)
+ *
+ *    Process event: hand it to the handler of the current state
+ *    (state[self->state]) and return that handler's result.
+ */
+int ircomm_tty_do_event(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event,
+			struct sk_buff *skb, struct ircomm_tty_info *info)
+{
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+	return (*state[self->state])(self, event, skb, info);
+}
+
+/*
+ * Function ircomm_tty_next_state (self, state)
+ *
+ *    Switch state: only stores the new state in self->state. The sanity
+ *    checks and debug output in the body are commented out.
+ */
+static inline void ircomm_tty_next_state(struct ircomm_tty_cb *self, IRCOMM_TTY_STATE state)
+{
+	/*
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
+
+	IRDA_DEBUG(2, "%s: next state=%s, service type=%d\n", __func__ ,
+		   ircomm_tty_state[self->state], self->service_type);
+	*/
+	self->state = state;
+}
+
+/*
+ * Function ircomm_tty_state_idle (self, event, skb, info)
+ *
+ *    Just hanging around: IDLE state handler. Returns 0, -EBUSY if an
+ *    IAS query is still open, or -EINVAL for events not handled here.
+ */
+static int ircomm_tty_state_idle(struct ircomm_tty_cb *self,
+				 IRCOMM_TTY_EVENT event,
+				 struct sk_buff *skb,
+				 struct ircomm_tty_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_tty_state[self->state], ircomm_tty_event[event]);
+	switch (event) {
+	case IRCOMM_TTY_ATTACH_CABLE:
+		/* Try to discover any remote devices */
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+
+		irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS);
+		break;
+	case IRCOMM_TTY_DISCOVERY_INDICATION:
+		self->daddr = info->daddr;
+		self->saddr = info->saddr;
+
+		if (self->iriap) {
+			IRDA_WARNING("%s(), busy with a previous query\n",
+				     __func__);
+			return -EBUSY;
+		}
+
+		self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+					 ircomm_tty_getvalue_confirm);
+
+		iriap_getvaluebyclass_request(self->iriap,
+					      self->saddr, self->daddr,
+					      "IrDA:IrCOMM", "Parameters");
+
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS);
+		break;
+	case IRCOMM_TTY_CONNECT_INDICATION:
+		del_timer(&self->watchdog_timer);
+
+		/* Accept connection (the IAS object is left registered here) */
+		ircomm_connect_response(self->ircomm, NULL);
+		ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+		break;
+	case IRCOMM_TTY_WD_TIMER_EXPIRED:
+		/* Just stay idle */
+		break;
+	case IRCOMM_TTY_DETACH_CABLE:
+		ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_tty_event[event]);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_state_search (self, event, skb, info)
+ *
+ *    Trying to discover an IrCOMM device: SEARCH state handler. Returns 0,
+ *    -EBUSY if an IAS query is still open, or -EINVAL for unhandled events.
+ */
+static int ircomm_tty_state_search(struct ircomm_tty_cb *self,
+				   IRCOMM_TTY_EVENT event,
+				   struct sk_buff *skb,
+				   struct ircomm_tty_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+	switch (event) {
+	case IRCOMM_TTY_DISCOVERY_INDICATION:
+		self->daddr = info->daddr;
+		self->saddr = info->saddr;
+
+		if (self->iriap) {
+			IRDA_WARNING("%s(), busy with a previous query\n",
+				     __func__);
+			return -EBUSY;
+		}
+
+		self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+					 ircomm_tty_getvalue_confirm);
+
+		if (self->service_type == IRCOMM_3_WIRE_RAW) {
+			iriap_getvaluebyclass_request(self->iriap, self->saddr,
+						      self->daddr, "IrLPT",
+						      "IrDA:IrLMP:LsapSel");
+			ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL);
+		} else {
+			iriap_getvaluebyclass_request(self->iriap, self->saddr,
+						      self->daddr,
+						      "IrDA:IrCOMM",
+						      "Parameters");
+
+			ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS);
+		}
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		break;
+	case IRCOMM_TTY_CONNECT_INDICATION:
+		del_timer(&self->watchdog_timer);
+		ircomm_tty_ias_unregister(self);
+
+		/* Accept connection */
+		ircomm_connect_response(self->ircomm, NULL);
+		ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+		break;
+	case IRCOMM_TTY_WD_TIMER_EXPIRED:
+#if 1
+		/* Give up: stay in this state without restarting the timer */
+#else
+		/* Try to discover any remote devices */
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS);
+#endif
+		break;
+	case IRCOMM_TTY_DETACH_CABLE:
+		ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_tty_event[event]);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_state_query_parameters (self, event, skb, info)
+ *
+ *    Querying the remote LM-IAS for IrCOMM parameters. Returns 0, -EBUSY
+ *    if an IAS query is still open, or -EINVAL for unhandled events.
+ */
+static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self,
+					     IRCOMM_TTY_EVENT event,
+					     struct sk_buff *skb,
+					     struct ircomm_tty_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+	switch (event) {
+	case IRCOMM_TTY_GOT_PARAMETERS:
+		if (self->iriap) {
+			IRDA_WARNING("%s(), busy with a previous query\n",
+				     __func__);
+			return -EBUSY;
+		}
+
+		self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+					 ircomm_tty_getvalue_confirm);
+
+		iriap_getvaluebyclass_request(self->iriap, self->saddr,
+					      self->daddr, "IrDA:IrCOMM",
+					      "IrDA:TinyTP:LsapSel");
+
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL);
+		break;
+	case IRCOMM_TTY_WD_TIMER_EXPIRED:
+		/* Go back to search mode */
+		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		break;
+	case IRCOMM_TTY_CONNECT_INDICATION:
+		del_timer(&self->watchdog_timer);
+		ircomm_tty_ias_unregister(self);
+
+		/* Accept connection */
+		ircomm_connect_response(self->ircomm, NULL);
+		ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+		break;
+	case IRCOMM_TTY_DETACH_CABLE:
+		ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_tty_event[event]);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_state_query_lsap_sel (self, event, skb, info)
+ *
+ *    Query remote LM-IAS for the LSAP selector which we can connect to.
+ *    Returns 0, the connect request's result, or -EINVAL (unknown event)
+ */
+static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self,
+					   IRCOMM_TTY_EVENT event,
+					   struct sk_buff *skb,
+					   struct ircomm_tty_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+	switch (event) {
+	case IRCOMM_TTY_GOT_LSAPSEL:
+		/* Connect to remote device (we move to SETUP even if this fails) */
+		ret = ircomm_connect_request(self->ircomm, self->dlsap_sel,
+					     self->saddr, self->daddr,
+					     NULL, self->service_type);
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		ircomm_tty_next_state(self, IRCOMM_TTY_SETUP);
+		break;
+	case IRCOMM_TTY_WD_TIMER_EXPIRED:
+		/* Watchdog fired: go back to search mode and re-arm it */
+		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		break;
+	case IRCOMM_TTY_CONNECT_INDICATION:
+		del_timer(&self->watchdog_timer);
+		ircomm_tty_ias_unregister(self);
+
+		/* Accept connection */
+		ircomm_connect_response(self->ircomm, NULL);
+		ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+		break;
+	case IRCOMM_TTY_DETACH_CABLE:
+		ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_tty_event[event]);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_state_setup (self, event, skb, info)
+ *
+ *    Trying to connect: wait for our connect request to be confirmed,
+ *    for an incoming connection, or for the watchdog to fire
+ */
+static int ircomm_tty_state_setup(struct ircomm_tty_cb *self,
+				  IRCOMM_TTY_EVENT event,
+				  struct sk_buff *skb,
+				  struct ircomm_tty_info *info)
+{
+	int ret = 0;	/* becomes -EINVAL for events not handled in this state */
+
+	IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __func__ ,
+		   ircomm_tty_state[self->state], ircomm_tty_event[event]);
+
+	switch (event) {
+	case IRCOMM_TTY_CONNECT_CONFIRM:
+		del_timer(&self->watchdog_timer);
+		ircomm_tty_ias_unregister(self);
+
+		/*
+		 * Send initial parameters. This will also send out queued
+		 * parameters waiting for the connection to come up
+		 */
+		ircomm_tty_send_initial_parameters(self);
+		ircomm_tty_link_established(self);
+		ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+		break;
+	case IRCOMM_TTY_CONNECT_INDICATION:
+		del_timer(&self->watchdog_timer);
+		ircomm_tty_ias_unregister(self);
+
+		/* Accept connection */
+		ircomm_connect_response(self->ircomm, NULL);
+		ircomm_tty_next_state(self, IRCOMM_TTY_READY);
+		break;
+	case IRCOMM_TTY_WD_TIMER_EXPIRED:
+		/* Watchdog fired: go back to search mode and re-arm it */
+		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+		break;
+	case IRCOMM_TTY_DETACH_CABLE:
+		/* ircomm_disconnect_request(self->ircomm, NULL); */
+		ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_tty_event[event]);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/*
+ * Function ircomm_tty_state_ready (self, event, skb, info)
+ *
+ *    Connected state: pass data down, and leave the state when the cable
+ *    is detached or the peer disconnects
+ */
+static int ircomm_tty_state_ready(struct ircomm_tty_cb *self,
+				  IRCOMM_TTY_EVENT event,
+				  struct sk_buff *skb,
+				  struct ircomm_tty_info *info)
+{
+	switch (event) {
+	case IRCOMM_TTY_DATA_REQUEST:
+		/* The result of the data request is handed back as is */
+		return ircomm_data_request(self->ircomm, skb);
+
+	case IRCOMM_TTY_DETACH_CABLE:
+		ircomm_disconnect_request(self->ircomm, NULL);
+		ircomm_tty_next_state(self, IRCOMM_TTY_IDLE);
+		return 0;
+
+	case IRCOMM_TTY_DISCONNECT_INDICATION:
+		ircomm_tty_ias_register(self);
+		ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH);
+		ircomm_tty_start_watchdog_timer(self, 3*HZ);
+
+		if (!(self->flags & ASYNC_CHECK_CD)) {
+			IRDA_DEBUG(0, "%s(), hanging up!\n", __func__ );
+			if (self->tty)
+				tty_hangup(self->tty);
+			return 0;
+		}
+
+		/* Drop carrier */
+		self->settings.dce = IRCOMM_DELTA_CD;
+		ircomm_tty_check_modem_status(self);
+		return 0;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown event: %s\n", __func__ ,
+			   ircomm_tty_event[event]);
+		return -EINVAL;
+	}
+}
+
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
new file mode 100644
index 00000000..77c5e649
--- /dev/null
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -0,0 +1,427 @@
+/*********************************************************************
+ *
+ * Filename:      ircomm_tty_ioctl.c
+ * Version:
+ * Description:
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Thu Jun 10 14:39:09 1999
+ * Modified at:   Wed Jan  5 14:45:43 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/termios.h>
+#include <linux/tty.h>
+#include <linux/serial.h>
+
+#include <asm/uaccess.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+
+#include <net/irda/ircomm_core.h>
+#include <net/irda/ircomm_param.h>
+#include <net/irda/ircomm_tty_attach.h>
+#include <net/irda/ircomm_tty.h>
+/* c_iflag bits that ircomm_tty_set_termios() compares to detect a change */
+#define RELEVANT_IFLAG(iflag) ((iflag) & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
+
+/*
+ * Function ircomm_tty_change_speed (self)
+ *
+ *    Translate the tty's c_cflag into IrCOMM data rate, data format and
+ *    flow control settings and issue the matching parameter requests
+ */
+static void ircomm_tty_change_speed(struct ircomm_tty_cb *self)
+{
+	unsigned cflag, cval;
+	int baud;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (!self->tty || !self->tty->termios || !self->ircomm)
+		return;
+
+	cflag = self->tty->termios->c_cflag;
+
+	/* Data format: word size, stop bits and parity */
+	switch (cflag & CSIZE) {
+	case CS5: cval = IRCOMM_WSIZE_5; break;
+	case CS6: cval = IRCOMM_WSIZE_6; break;
+	case CS7: cval = IRCOMM_WSIZE_7; break;
+	case CS8: cval = IRCOMM_WSIZE_8; break;
+	default:  cval = IRCOMM_WSIZE_5; break;
+	}
+	if (cflag & CSTOPB)
+		cval |= IRCOMM_2_STOP_BIT;
+
+	if (cflag & PARENB)
+		cval |= IRCOMM_PARITY_ENABLE;
+	if (!(cflag & PARODD))	/* NOTE(review): also taken when PARENB is clear */
+		cval |= IRCOMM_PARITY_EVEN;
+
+	/* Data rate: a tty baud rate of 0 is replaced by 9600 */
+	baud = tty_get_baud_rate(self->tty);
+	if (!baud)
+		baud = 9600;	/* B0 transition handled in ircomm_tty_set_termios */
+
+	self->settings.data_rate = baud;
+	ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE);
+
+	/* CTS flow control flag and carrier-detect checking */
+	if (cflag & CRTSCTS) {
+		self->flags |= ASYNC_CTS_FLOW;
+		self->settings.flow_control |= IRCOMM_RTS_CTS_IN;
+		/* This got me. Bummer. Jean II */
+		if (self->service_type == IRCOMM_3_WIRE_RAW)
+			IRDA_WARNING("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n", __func__);
+	} else {
+		self->flags &= ~ASYNC_CTS_FLOW;
+		self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN;
+	}
+	if (cflag & CLOCAL)
+		self->flags &= ~ASYNC_CHECK_CD;
+	else
+		self->flags |= ASYNC_CHECK_CD;
+#if 0
+	/*
+	 * Set up parity check flag
+	 */
+
+	if (I_INPCK(self->tty))
+		driver->read_status_mask |= LSR_FE | LSR_PE;
+	if (I_BRKINT(driver->tty) || I_PARMRK(driver->tty))
+		driver->read_status_mask |= LSR_BI;
+
+	/*
+	 * Characters to ignore
+	 */
+	driver->ignore_status_mask = 0;
+	if (I_IGNPAR(driver->tty))
+		driver->ignore_status_mask |= LSR_PE | LSR_FE;
+
+	if (I_IGNBRK(self->tty)) {
+		self->ignore_status_mask |= LSR_BI;
+		/*
+		 * If we're ignoring parity and break indicators, ignore
+		 * overruns too. (For real raw support).
+		 */
+		if (I_IGNPAR(self->tty))
+			self->ignore_status_mask |= LSR_OE;
+	}
+#endif
+	self->settings.data_format = cval;
+
+	ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE);
+	ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE);	/* only the last request passes TRUE */
+}
+
+/*
+ * Function ircomm_tty_set_termios (tty, old_termios)
+ *
+ *    This routine allows the tty driver to be notified when device's
+ *    termios settings have changed. old_termios may be NULL: there is
+ *    then nothing to compare against, so the current settings are just
+ *    applied and the B0/CRTSCTS transition handling is skipped.
+ */
+void ircomm_tty_set_termios(struct tty_struct *tty,
+			    struct ktermios *old_termios)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned int cflag = tty->termios->c_cflag;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* No previous settings to diff against: just apply the new ones */
+	if (!old_termios) {
+		ircomm_tty_change_speed(self);
+		return;
+	}
+
+	if ((cflag == old_termios->c_cflag) &&
+	    (RELEVANT_IFLAG(tty->termios->c_iflag) ==
+	     RELEVANT_IFLAG(old_termios->c_iflag)))
+		return;	/* nothing we care about has changed */
+
+	ircomm_tty_change_speed(self);
+
+	/* Handle transition to B0 status */
+	if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD)) {
+		self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS);
+		ircomm_param_request(self, IRCOMM_DTE, TRUE);
+	}
+
+	/* Handle transition away from B0 status */
+	if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
+		self->settings.dte |= IRCOMM_DTR;
+		if (!(tty->termios->c_cflag & CRTSCTS) ||
+		    !test_bit(TTY_THROTTLED, &tty->flags))
+			self->settings.dte |= IRCOMM_RTS;
+		ircomm_param_request(self, IRCOMM_DTE, TRUE);
+	}
+
+	/* Handle turning off CRTSCTS */
+	if ((old_termios->c_cflag & CRTSCTS) &&
+	    !(tty->termios->c_cflag & CRTSCTS)) {
+		tty->hw_stopped = 0;
+		ircomm_tty_start(tty);
+	}
+}
+
+/*
+ * Function ircomm_tty_tiocmget (tty)
+ *
+ *    Report the modem lines as a TIOCM_* mask: RTS and DTR are taken from
+ *    our DTE settings, CD, RI, DSR and CTS from the DCE settings
+ */
+int ircomm_tty_tiocmget(struct tty_struct *tty)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned int lines = 0;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (tty->flags & (1 << TTY_IO_ERROR))
+		return -EIO;
+
+	lines |= (self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0;
+	lines |= (self->settings.dte & IRCOMM_DTR) ? TIOCM_DTR : 0;
+	lines |= (self->settings.dce & IRCOMM_CD)  ? TIOCM_CAR : 0;
+	lines |= (self->settings.dce & IRCOMM_RI)  ? TIOCM_RNG : 0;
+	lines |= (self->settings.dce & IRCOMM_DSR) ? TIOCM_DSR : 0;
+	lines |= (self->settings.dce & IRCOMM_CTS) ? TIOCM_CTS : 0;
+	return lines;
+}
+
+/*
+ * Function ircomm_tty_tiocmset (tty, set, clear)
+ *
+ *    Raise and/or drop RTS and DTR and issue an IRCOMM_DTE parameter request
+ */
+int ircomm_tty_tiocmset(struct tty_struct *tty,
+			unsigned int set, unsigned int clear)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	unsigned int touched = set | clear;
+	unsigned int dte;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (tty->flags & (1 << TTY_IO_ERROR))
+		return -EIO;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;);
+
+	dte = self->settings.dte;	/* work on a copy, stored back below */
+	if (set & TIOCM_RTS)
+		dte |= IRCOMM_RTS;
+	if (set & TIOCM_DTR)
+		dte |= IRCOMM_DTR;
+	if (clear & TIOCM_RTS)		/* clear is applied after set */
+		dte &= ~IRCOMM_RTS;
+	if (clear & TIOCM_DTR)
+		dte &= ~IRCOMM_DTR;
+	if (touched & TIOCM_RTS)	/* flag every line that was named */
+		dte |= IRCOMM_DELTA_RTS;
+	if (touched & TIOCM_DTR)
+		dte |= IRCOMM_DELTA_DTR;
+	self->settings.dte = dte;
+
+	ircomm_param_request(self, IRCOMM_DTE, TRUE);
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_get_serial_info (self, retinfo)
+ *
+ *    Fill a zeroed serial_struct from our state and copy it to user space.
+ *    Returns 0, or -EFAULT if retinfo is NULL or the copy fails
+ */
+static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self,
+				      struct serial_struct __user *retinfo)
+{
+	struct serial_struct out;
+
+	if (retinfo == NULL)
+		return -EFAULT;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Zero the whole struct first; all of it is copied out below */
+	memset(&out, 0, sizeof(out));
+
+	/* Fixed values, for compatibility */
+	out.type = PORT_16550A;
+	out.port = 0;
+	out.irq = 0;
+	out.xmit_fifo_size = 0;
+	out.hub6 = 0;
+	out.custom_divisor = 0;
+
+	/* Values taken from this IrCOMM tty instance */
+	out.line = self->line;
+	out.flags = self->flags;
+	out.baud_base = self->settings.data_rate;
+	out.close_delay = self->close_delay;
+	out.closing_wait = self->closing_wait;
+
+	return copy_to_user(retinfo, &out, sizeof(*retinfo)) ? -EFAULT : 0;
+}
+
+/*
+ * Function ircomm_tty_set_serial_info (self, new_info)
+ *
+ *    Currently a stub: the implementation below is compiled out with
+ *    "#if 0", so new_info is ignored and 0 is always returned
+ */
+static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self,
+				      struct serial_struct __user *new_info)
+{
+#if 0
+	struct serial_struct new_serial;
+	struct ircomm_tty_cb old_state, *state;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	if (copy_from_user(&new_serial,new_info,sizeof(new_serial)))
+		return -EFAULT;
+
+
+	state = self
+	old_state = *self;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		if ((new_serial.baud_base != state->settings.data_rate) ||
+		    (new_serial.close_delay != state->close_delay) ||
+		    ((new_serial.flags & ~ASYNC_USR_MASK) !=
+		     (self->flags & ~ASYNC_USR_MASK)))
+			return -EPERM;
+		state->flags = ((state->flags & ~ASYNC_USR_MASK) |
+				 (new_serial.flags & ASYNC_USR_MASK));
+		self->flags = ((self->flags & ~ASYNC_USR_MASK) |
+			       (new_serial.flags & ASYNC_USR_MASK));
+		/* self->custom_divisor = new_serial.custom_divisor; */
+		goto check_and_exit;
+	}
+
+	/*
+	 * OK, past this point, all the error checking has been done.
+	 * At this point, we start making changes.....
+	 */
+
+	if (self->settings.data_rate != new_serial.baud_base) {
+		self->settings.data_rate = new_serial.baud_base;
+		ircomm_param_request(self, IRCOMM_DATA_RATE, TRUE);
+	}
+
+	self->close_delay = new_serial.close_delay * HZ/100;
+	self->closing_wait = new_serial.closing_wait * HZ/100;
+	/* self->custom_divisor = new_serial.custom_divisor; */
+
+	self->flags = ((self->flags & ~ASYNC_FLAGS) |
+		       (new_serial.flags & ASYNC_FLAGS));
+	self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+
+ check_and_exit:
+
+	if (self->flags & ASYNC_INITIALIZED) {
+		if (((old_state.flags & ASYNC_SPD_MASK) !=
+		     (self->flags & ASYNC_SPD_MASK)) ||
+		    (old_driver.custom_divisor != driver->custom_divisor)) {
+			if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI)
+				driver->tty->alt_speed = 57600;
+			if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI)
+				driver->tty->alt_speed = 115200;
+			if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI)
+				driver->tty->alt_speed = 230400;
+			if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP)
+				driver->tty->alt_speed = 460800;
+			ircomm_tty_change_speed(driver);
+		}
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Function ircomm_tty_ioctl (tty, cmd, arg)
+ *
+ *    Serial ioctl entry point. Unknown commands get -ENOIOCTLCMD; most
+ *    commands fail with -EIO while the tty has TTY_IO_ERROR set
+ */
+int ircomm_tty_ioctl(struct tty_struct *tty,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data;
+	int ret = 0;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) &&
+	    (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) &&
+	    (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) {
+		if (tty->flags & (1 << TTY_IO_ERROR))
+			return -EIO;
+	}
+
+	switch (cmd) {
+	case TIOCGSERIAL:
+		ret = ircomm_tty_get_serial_info(self, (struct serial_struct __user *) arg);
+		break;
+	case TIOCSSERIAL:
+		ret = ircomm_tty_set_serial_info(self, (struct serial_struct __user *) arg);
+		break;
+	case TIOCMIWAIT:
+		IRDA_DEBUG(0, "%s(), TIOCMIWAIT, not impl!\n", __func__ );
+		break;	/* not implemented, but still reported as success */
+
+	case TIOCGICOUNT:
+		IRDA_DEBUG(0, "%s(), TIOCGICOUNT not impl!\n", __func__ );
+#if 0
+		save_flags(flags); cli();
+		cnow = driver->icount;
+		restore_flags(flags);
+		p_cuser = (struct serial_icounter_struct __user *) arg;
+		if (put_user(cnow.cts, &p_cuser->cts) ||
+		    put_user(cnow.dsr, &p_cuser->dsr) ||
+		    put_user(cnow.rng, &p_cuser->rng) ||
+		    put_user(cnow.dcd, &p_cuser->dcd) ||
+		    put_user(cnow.rx, &p_cuser->rx) ||
+		    put_user(cnow.tx, &p_cuser->tx) ||
+		    put_user(cnow.frame, &p_cuser->frame) ||
+		    put_user(cnow.overrun, &p_cuser->overrun) ||
+		    put_user(cnow.parity, &p_cuser->parity) ||
+		    put_user(cnow.brk, &p_cuser->brk) ||
+		    put_user(cnow.buf_overrun, &p_cuser->buf_overrun))
+			return -EFAULT;
+#endif
+		return 0;	/* not implemented, but still reported as success */
+	default:
+		ret = -ENOIOCTLCMD;  /* ioctls which we must ignore */
+	}
+	return ret;
+}
+
+
+
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
new file mode 100644
index 00000000..25cc2e69
--- /dev/null
+++ b/net/irda/irda_device.c
@@ -0,0 +1,324 @@
+/*********************************************************************
+ *
+ * Filename:      irda_device.c
+ * Version:       0.9
+ * Description:   Utility functions used by the device drivers
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sat Oct  9 09:22:27 1999
+ * Modified at:   Sun Jan 23 17:41:24 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2001 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/kmod.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <asm/ioctls.h>
+#include <asm/uaccess.h>
+#include <asm/dma.h>
+#include <asm/io.h>
+
+#include <net/irda/irda_device.h>
+#include <net/irda/irlap.h>
+#include <net/irda/timer.h>
+#include <net/irda/wrapper.h>
+
+static void __irda_task_delete(struct irda_task *task);
+
+static hashbin_t *dongles = NULL;	/* HB_NOLOCK bin, created in irda_device_init() */
+static hashbin_t *tasks = NULL;	/* HB_LOCK bin, created in irda_device_init() */
+
+static void irda_task_timer_expired(void *data);
+
+/* Create the "dongles" and "tasks" hashbins used by this file.
+ * Returns 0 on success, -ENOMEM if either allocation fails. */
+int __init irda_device_init( void)
+{
+	dongles = hashbin_new(HB_NOLOCK);
+	if (!dongles) {
+		IRDA_WARNING("IrDA: Can't allocate dongles hashbin!\n");
+		return -ENOMEM;
+	}
+	/* The bin is created HB_NOLOCK; its spinlock is initialised here */
+	spin_lock_init(&dongles->hb_spinlock);
+
+	tasks = hashbin_new(HB_LOCK);
+	if (tasks)
+		return 0;	/* driver init is left to the system - Jean II */
+
+	/* Second allocation failed: release the first bin again */
+	IRDA_WARNING("IrDA: Can't allocate tasks hashbin!\n");
+	hashbin_delete(dongles, NULL);
+	return -ENOMEM;
+}
+
+/* Passed to hashbin_delete(): warns about a dongle still in the bin */
+static void leftover_dongle(void *arg)
+{
+	const struct dongle_reg *left = arg;
+	IRDA_WARNING("IrDA: Dongle type %x not unregistered\n", left->type);
+}
+
+void irda_device_cleanup(void)
+{
+	FREE_FUNC drop_task = (FREE_FUNC) __irda_task_delete;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+	hashbin_delete(tasks, drop_task);	/* tasks first ... */
+	hashbin_delete(dongles, leftover_dongle); /* ... then the dongles */
+}
+
+/*
+ * Function irda_device_set_media_busy (dev, status)
+ *
+ *    Called when we have detected that another station is transmitting
+ *    in contention mode.
+ */
+void irda_device_set_media_busy(struct net_device *dev, int status)
+{
+	struct irlap_cb *lap = (struct irlap_cb *) dev->atalk_ptr;
+
+	IRDA_DEBUG(4, "%s(%s)\n", __func__, status ? "TRUE" : "FALSE");
+
+	/*
+	 * Some drivers may enable the receive interrupt before calling
+	 * irlap_open(), or disable it only after calling irlap_close().
+	 * irlap_driver_rcv() shields the IrDA stack from that, but the
+	 * driver calls the wrapper directly, and the wrapper calls us
+	 * directly, so we have to protect ourselves here.
+	 * Jean II
+	 */
+	if (lap == NULL || lap->magic != LAP_MAGIC)
+		return;
+
+	/* Medium is free again */
+	if (!status) {
+		lap->media_busy = FALSE;
+		irlap_stop_mbusy_timer(lap);
+		return;
+	}
+
+	/* Medium busy: which timeout is used depends on status == SMALL */
+	lap->media_busy = TRUE;
+	irlap_start_mbusy_timer(lap, (status == SMALL) ? SMALLBUSY_TIMEOUT :
+				     MEDIABUSY_TIMEOUT);
+	IRDA_DEBUG( 4, "Media busy!\n");
+}
+EXPORT_SYMBOL(irda_device_set_media_busy);
+
+
+/*
+ * Function irda_device_is_receiving (dev)
+ *
+ *    Ask the device driver, through its SIOCGRECEIVING ioctl, whether it
+ *    is currently receiving data. Returns the value read back, the
+ *    negative ioctl result on failure, or -1 without an ioctl handler.
+ */
+int irda_device_is_receiving(struct net_device *dev)
+{
+	struct if_irda_req query;
+	int err;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	/* Without an ioctl handler there is nobody to ask */
+	if (dev->netdev_ops->ndo_do_ioctl == NULL) {
+		IRDA_ERROR("%s: do_ioctl not impl. by device driver\n",
+			   __func__);
+		return -1;
+	}
+
+	/* The answer is read back from query.ifr_receiving */
+	err = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *) &query,
+					    SIOCGRECEIVING);
+	return (err < 0) ? err : query.ifr_receiving;
+}
+
+/* Stop the task's timer and free the task; the hashbin is not touched */
+static void __irda_task_delete(struct irda_task *task)
+{
+	del_timer(&task->timer);
+	kfree(task);
+}
+
+/* Take a task out of the "tasks" hashbin, then stop its timer and free it */
+static void irda_task_delete(struct irda_task *task)
+{
+	/* Unregister: the task's own address is passed as the hash value */
+	hashbin_remove(tasks, (long) task, NULL);
+	__irda_task_delete(task);
+}
+
+/*
+ * Function irda_task_kick (task)
+ *
+ *    Tries to execute a task possibly multiple times until the task is either
+ *    finished, or asks for a timeout. When a task is finished, we do post
+ *    processing, and notify the parent task, that is waiting for this task
+ *    to complete.
+ */
+static int irda_task_kick(struct irda_task *task)
+{
+	int finished = TRUE;
+	int count = 0;
+	int timeout;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(task != NULL, return -1;);
+	IRDA_ASSERT(task->magic == IRDA_TASK_MAGIC, return -1;);
+
+	/* Execute task until it's finished, or asks for a timeout */
+	do {
+		timeout = task->function(task);
+		if (count++ > 100) {	/* guard against a handler that never settles */
+			IRDA_ERROR("%s: error in task handler!\n",
+				   __func__);
+			irda_task_delete(task);
+			return TRUE;	/* task is gone: report it as finished */
+		}
+	} while ((timeout == 0) && (task->state != IRDA_TASK_DONE));
+
+	if (timeout < 0) {	/* the handler reported an error */
+		IRDA_ERROR("%s: Error executing task!\n", __func__);
+		irda_task_delete(task);
+		return TRUE;	/* task is gone: report it as finished */
+	}
+
+	/* Check if we are finished */
+	if (task->state == IRDA_TASK_DONE) {
+		del_timer(&task->timer);
+
+		/* Do post processing */
+		if (task->finished)
+			task->finished(task);
+
+		/* Notify parent */
+		if (task->parent) {
+			/* Check if parent is waiting for us to complete */
+			if (task->parent->state == IRDA_TASK_CHILD_WAIT) {
+				task->parent->state = IRDA_TASK_CHILD_DONE;
+
+				/* Stop timer now that we are here */
+				del_timer(&task->parent->timer);
+
+				/* Kick parent task (recursive; its result is ignored) */
+				irda_task_kick(task->parent);
+			}
+		}
+		irda_task_delete(task);
+	} else if (timeout > 0) {	/* not done yet: run again when the timer fires */
+		irda_start_timer(&task->timer, timeout, (void *) task,
+				 irda_task_timer_expired);
+		finished = FALSE;
+	} else {	/* NOTE(review): looks unreachable given the loop exit condition */
+		IRDA_DEBUG(0, "%s(), not finished, and no timeout!\n",
+			   __func__);
+		finished = FALSE;
+	}
+
+	return finished;
+}
+
+/*
+ * Function irda_task_timer_expired (data)
+ *
+ *    Timer callback for a task: run the task (again). irda_task_kick()
+ *    restarts the timer itself if the task asks for another timeout
+ */
+static void irda_task_timer_expired(void *data)
+{
+	struct irda_task *task = data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	/* Return value ignored: irda_task_kick() deals with completion,
+	 * deletion and timer restart on its own */
+	irda_task_kick(task);
+}
+
+/*
+ * Function irda_device_setup (dev)
+ *
+ *    Fill in the net_device defaults for an IrDA device. Meant to be used
+ *    the way ether_setup() is used by normal network device drivers
+ */
+static void irda_device_setup(struct net_device *dev)
+{
+	/* Link type and addressing */
+	dev->type = ARPHRD_IRDA;
+	dev->flags = IFF_NOARP;
+	dev->addr_len = LAP_ALEN;
+	memset(dev->broadcast, 0xff, LAP_ALEN);
+
+	/* Framing and queueing */
+	dev->hard_header_len = 0;
+	dev->mtu = 2048;
+	dev->tx_queue_len = 8;	/* Window size + 1 s-frame */
+}
+
+/*
+ * Function alloc_irdadev (sizeof_priv)
+ *    Allocates and sets up an IRDA device in a manner similar to
+ *    alloc_etherdev: "irda%d" naming, defaults from irda_device_setup().
+ */
+struct net_device *alloc_irdadev(int sizeof_priv)
+{
+	return alloc_netdev(sizeof_priv, "irda%d", irda_device_setup);
+}
+EXPORT_SYMBOL(alloc_irdadev);
+
+#ifdef CONFIG_ISA_DMA_API
+/*
+ * Function irda_setup_dma (channel, buffer, count, mode)
+ *
+ *    Setup the DMA channel. Commonly used by LPC FIR drivers.
+ *    Only built when CONFIG_ISA_DMA_API is defined
+ */
+void irda_setup_dma(int channel, dma_addr_t buffer, int count, int mode)
+{
+	unsigned long flags;
+
+	flags = claim_dma_lock();	/* everything below runs under the DMA lock */
+
+	disable_dma(channel);	/* channel is off while mode, address and count are set */
+	clear_dma_ff(channel);
+	set_dma_mode(channel, mode);
+	set_dma_addr(channel, buffer);
+	set_dma_count(channel, count);
+	enable_dma(channel);
+
+	release_dma_lock(flags);
+}
+EXPORT_SYMBOL(irda_setup_dma);
+#endif
diff --git a/net/irda/iriap.c b/net/irda/iriap.c
new file mode 100644
index 00000000..f876eed7
--- /dev/null
+++ b/net/irda/iriap.c
@@ -0,0 +1,1104 @@
+/*********************************************************************
+ *
+ * Filename:      iriap.c
+ * Version:       0.8
+ * Description:   Information Access Protocol (IAP)
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Thu Aug 21 00:02:07 1997
+ * Modified at:   Sat Dec 25 16:42:42 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/iriap_event.h>
+#include <net/irda/iriap.h>
+
+#ifdef CONFIG_IRDA_DEBUG
+/* FIXME: This one should go in irlmp.c */
+static const char *const ias_charset_types[] = {
+	"CS_ASCII",
+	"CS_ISO_8859_1",
+	"CS_ISO_8859_2",
+	"CS_ISO_8859_3",
+	"CS_ISO_8859_4",
+	"CS_ISO_8859_5",
+	"CS_ISO_8859_6",
+	"CS_ISO_8859_7",
+	"CS_ISO_8859_8",
+	"CS_ISO_8859_9",
+	"CS_UNICODE"
+};
+#endif	/* CONFIG_IRDA_DEBUG */
+
+static hashbin_t *iriap = NULL;
+static void *service_handle;
+
+static void __iriap_close(struct iriap_cb *self);
+static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode);
+static void iriap_disconnect_indication(void *instance, void *sap,
+					LM_REASON reason, struct sk_buff *skb);
+static void iriap_connect_indication(void *instance, void *sap,
+				     struct qos_info *qos, __u32 max_sdu_size,
+				     __u8 max_header_size,
+				     struct sk_buff *skb);
+static void iriap_connect_confirm(void *instance, void *sap,
+				  struct qos_info *qos,
+				  __u32 max_sdu_size, __u8 max_header_size,
+				  struct sk_buff *skb);
+static int iriap_data_indication(void *instance, void *sap,
+				 struct sk_buff *skb);
+
+static void iriap_watchdog_timer_expired(void *data);
+
+static inline void iriap_start_watchdog_timer(struct iriap_cb *self,
+					      int timeout)
+{
+	irda_start_timer(&self->watchdog_timer, timeout, self,
+			 iriap_watchdog_timer_expired);
+}
+
+static struct lock_class_key irias_objects_key;
+
+/*
+ * Function iriap_init (void)
+ *
+ *    Initializes the IrIAP layer, called by the module initialization code
+ *    in irmod.c. Returns 0 on success or a negative errno on failure.
+ */
+int __init iriap_init(void)
+{
+	struct ias_object *obj;
+	struct iriap_cb *server;
+	__u8 oct_seq[6];
+	__u16 hints;
+
+	/* Allocate master array */
+	iriap = hashbin_new(HB_LOCK);
+	if (!iriap)
+		return -ENOMEM;
+
+	/* Object repository - defined in irias_object.c */
+	irias_objects = hashbin_new(HB_LOCK);
+	if (!irias_objects) {
+		IRDA_WARNING("%s: Can't allocate irias_objects hashbin!\n",
+			     __func__);
+		hashbin_delete(iriap, NULL);
+		return -ENOMEM;
+	}
+
+	lockdep_set_class_and_name(&irias_objects->hb_spinlock, &irias_objects_key,
+				   "irias_objects");
+
+	/* Register some default services for IrLMP */
+	hints  = irlmp_service_to_hint(S_COMPUTER);
+	service_handle = irlmp_register_service(hints);
+
+	/* Register the Device object with LM-IAS. Failures from here on
+	 * leave the hashbins and the service for iriap_cleanup() to release */
+	obj = irias_new_object("Device", IAS_DEVICE_ID);
+	if (!obj)
+		return -ENOMEM;
+	irias_add_string_attrib(obj, "DeviceName", "Linux", IAS_KERNEL_ATTR);
+
+	oct_seq[0] = 0x01;  /* Version 1 */
+	oct_seq[1] = 0x00;  /* IAS support bits */
+	oct_seq[2] = 0x00;  /* LM-MUX support bits */
+#ifdef CONFIG_IRDA_ULTRA
+	oct_seq[2] |= 0x04; /* Connectionless Data support */
+#endif
+	irias_add_octseq_attrib(obj, "IrLMPSupport", oct_seq, 3,
+				IAS_KERNEL_ATTR);
+	irias_insert_object(obj);
+
+	/* Register server support with IrLMP to accept incoming connections */
+	server = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL);
+	if (!server) {
+		IRDA_DEBUG(0, "%s(), unable to open server\n", __func__);
+		return -ENOMEM;
+	}
+	/* Without its LSAP the server cannot be reached: report that too */
+	if (iriap_register_lsap(server, LSAP_IAS, IAS_SERVER) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Function iriap_cleanup (void)
+ *
+ *    Shuts down the IrIAP layer, called by the module cleanup code in
+ *    irmod.c
+ */
+void iriap_cleanup(void)
+{
+	irlmp_unregister_service(service_handle);
+	/* Delete both hashbins, passing the destructor for their entries */
+	hashbin_delete(iriap, (FREE_FUNC) __iriap_close);
+	hashbin_delete(irias_objects, (FREE_FUNC) __irias_delete_object);
+}
+
+/*
+ * Function iriap_open (slsap_sel, mode, priv, callback)
+ *
+ *    Opens an instance of the IrIAP layer; clients also register an LSAP
+ *    with IrLMP. Returns the new instance, or NULL on failure.
+ */
+struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv,
+			    CONFIRM_CALLBACK callback)
+{
+	struct iriap_cb *self;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	self = kzalloc(sizeof(*self), GFP_ATOMIC);
+	if (!self) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	/* Initialize instance */
+	self->magic = IAS_MAGIC;
+	self->mode = mode;
+	/* A client without its LSAP is unusable: fail before publishing it */
+	if (mode == IAS_CLIENT && iriap_register_lsap(self, slsap_sel, mode)) {
+		kfree(self);
+		return NULL;
+	}
+
+	self->confirm = callback;
+	self->priv = priv;
+
+	/* iriap_getvaluebyclass_request() will construct packets before
+	 * we connect, so this must have a sane value... Jean II */
+	self->max_header_size = LMP_MAX_HEADER;
+
+	init_timer(&self->watchdog_timer);
+	hashbin_insert(iriap, (irda_queue_t *) self, (long) self, NULL);
+
+	/* Initialize state machines */
+	iriap_next_client_state(self, S_DISCONNECT);
+	iriap_next_call_state(self, S_MAKE_CALL);
+	iriap_next_server_state(self, R_DISCONNECT);
+	iriap_next_r_connect_state(self, R_WAITING);
+
+	return self;
+}
+EXPORT_SYMBOL(iriap_open);
+
+/*
+ * Function __iriap_close (self)
+ *
+ *    Removes (deallocates) the IrIAP instance: deletes its watchdog timer,
+ *    frees a pending request_skb if there is one and frees the instance
+ */
+static void __iriap_close(struct iriap_cb *self)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	del_timer(&self->watchdog_timer);
+
+	if (self->request_skb)
+		dev_kfree_skb(self->request_skb);
+	/* Clear the magic so the IAS_MAGIC asserts reject this block later */
+	self->magic = 0;
+
+	kfree(self);
+}
+
+/*
+ * Function iriap_close (self)
+ *
+ *    Closes the IrIAP instance: closes its LSAP and frees the instance
+ */
+void iriap_close(struct iriap_cb *self)
+{
+	struct iriap_cb *entry;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	if (self->lsap) {
+		irlmp_close_lsap(self->lsap);
+		self->lsap = NULL;
+	}
+	/* Take the instance out of the master hashbin before freeing it */
+	entry = (struct iriap_cb *) hashbin_remove(iriap, (long) self, NULL);
+	IRDA_ASSERT(entry == self, return;);
+
+	__iriap_close(self);
+}
+EXPORT_SYMBOL(iriap_close);
+
+/*
+ * Open an LSAP on slsap_sel whose IrLMP callbacks are routed to self.
+ * Returns 0 with self->lsap and self->slsap_sel set, or -1 on failure.
+ */
+static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode)
+{
+	notify_t notify;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	irda_notify_init(&notify);
+	notify.instance              = self;
+	notify.data_indication       = iriap_data_indication;
+	notify.connect_indication    = iriap_connect_indication;
+	notify.connect_confirm       = iriap_connect_confirm;
+	notify.disconnect_indication = iriap_disconnect_indication;
+	strcpy(notify.name, mode == IAS_CLIENT ? "IrIAS cli" : "IrIAS srv");
+
+	self->lsap = irlmp_open_lsap(slsap_sel, &notify, 0);
+	if (!self->lsap) {
+		IRDA_ERROR("%s: Unable to allocated LSAP!\n", __func__);
+		return -1;
+	}
+	self->slsap_sel = self->lsap->slsap_sel;
+	return 0;
+}
+
+/*
+ * Function iriap_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    Got disconnect, so clean up everything associated with this connection.
+ *    A client reports IAS_DISCONNECT to its user, a server closes itself
+ */
+static void iriap_disconnect_indication(void *instance, void *sap,
+					LM_REASON reason,
+					struct sk_buff *skb)
+{
+	struct iriap_cb *self;
+
+	IRDA_DEBUG(4, "%s(), reason=%s\n", __func__, irlmp_reasons[reason]);
+
+	self = (struct iriap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	IRDA_ASSERT(iriap != NULL, return;);
+
+	del_timer(&self->watchdog_timer);
+
+	/* The frame that came with the disconnect is not needed */
+	if (skb)
+		dev_kfree_skb(skb);
+
+	if (self->mode == IAS_CLIENT) {
+		IRDA_DEBUG(4, "%s(), disconnect as client\n", __func__);
+
+		/* The client state machine sees the disconnect first */
+		iriap_do_client_event(self, IAP_LM_DISCONNECT_INDICATION,
+				      NULL);
+		/*
+		 * Inform service user that the request failed by sending
+		 * it a NULL value. Warning, the client might close us, so
+		 * remember not to use self anymore after calling confirm
+		 */
+		if (self->confirm)
+			self->confirm(IAS_DISCONNECT, 0, NULL, self->priv);
+	} else {
+		IRDA_DEBUG(4, "%s(), disconnect as server\n", __func__);
+		iriap_do_server_event(self, IAP_LM_DISCONNECT_INDICATION,
+				      NULL);
+		iriap_close(self);
+	}
+}
+
+/*
+ * Function iriap_disconnect_request (self): ask IrLMP to disconnect our LSAP
+ */
+static void iriap_disconnect_request(struct iriap_cb *self)
+{
+	struct sk_buff *tx_skb;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Empty frame: the buffer is exactly as large as the reserved header */
+	tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+	if (tx_skb == NULL) {
+		IRDA_DEBUG(0,
+			   "%s(), Could not allocate an sk_buff of length %d\n",
+			   __func__, LMP_MAX_HEADER);
+		return;
+	}
+
+	/*
+	 *  Reserve space for MUX control and LAP header
+	 */
+	skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+	irlmp_disconnect_request(self->lsap, tx_skb);
+}
+
+/*
+ * Function iriap_getvaluebyclass_request (self, saddr, daddr, name, attr)
+ *
+ *    Retrieve all values from attribute in all objects with given class
+ *    name. Returns 0 once the request is handed over, negative on error
+ */
+int iriap_getvaluebyclass_request(struct iriap_cb *self,
+				  __u32 saddr, __u32 daddr,
+				  char *name, char *attr)
+{
+	struct sk_buff *tx_skb;
+	int name_len, attr_len, skb_len;
+	__u8 *frame;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return -1;);
+
+	/* Client must supply the destination device address */
+	if (!daddr)
+		return -1;
+
+	name_len = strlen(name);	/* Up to IAS_MAX_CLASSNAME = 60 */
+	attr_len = strlen(attr);	/* Up to IAS_MAX_ATTRIBNAME = 60 */
+	if (name_len > 0xff || attr_len > 0xff)	/* one length byte each */
+		return -EINVAL;
+
+	self->daddr = daddr;
+	self->saddr = saddr;
+
+	/* Save operation, so we know what the later indication is about */
+	self->operation = GET_VALUE_BY_CLASS;
+
+	/* Give ourselves 10 secs to finish this operation */
+	iriap_start_watchdog_timer(self, 10*HZ);
+
+	skb_len = self->max_header_size+2+name_len+1+attr_len+4;
+	tx_skb = alloc_skb(skb_len, GFP_ATOMIC);
+	if (!tx_skb)
+		return -ENOMEM;
+
+	/* Reserve space for MUX and LAP header */
+	skb_reserve(tx_skb, self->max_header_size);
+	skb_put(tx_skb, 3+name_len+attr_len);
+	frame = tx_skb->data;
+
+	/* Build frame */
+	frame[0] = IAP_LST | GET_VALUE_BY_CLASS;
+	frame[1] = name_len;                       /* Insert length of name */
+	memcpy(frame+2, name, name_len);           /* Insert name */
+	frame[2+name_len] = attr_len;              /* Insert length of attr */
+	memcpy(frame+3+name_len, attr, attr_len);  /* Insert attr */
+
+	iriap_do_client_event(self, IAP_CALL_REQUEST_GVBC, tx_skb);
+
+	/* Drop reference count - see state_s_disconnect(). */
+	dev_kfree_skb(tx_skb);
+
+	return 0;
+}
+EXPORT_SYMBOL(iriap_getvaluebyclass_request);
+
+/*
+ * Function iriap_getvaluebyclass_confirm (self, skb)
+ *
+ *    Got result from GetValueByClass command. Parse the first entry of the
+ *    result list and return it to the service user through self->confirm.
+ *    A string in an unsupported charset only aborts the connection.
+ *
+ */
+static void iriap_getvaluebyclass_confirm(struct iriap_cb *self,
+					  struct sk_buff *skb)
+{
+	struct ias_value *value;
+	int charset;
+	__u32 value_len;
+	__u32 tmp_cpu32;
+	__u16 obj_id;
+	__u16 len;
+	__u8  type;
+	__u8 *fp;
+	int n;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* Start parsing behind the opcode and return code bytes */
+	fp = skb->data;
+	n = 2;
+
+	/* Get length, MSB first */
+	len = get_unaligned_be16(fp + n);
+	n += 2;
+
+	IRDA_DEBUG(4, "%s(), len=%d\n", __func__, len);
+
+	/* Get object ID, MSB first */
+	obj_id = get_unaligned_be16(fp + n);
+	n += 2;
+
+	type = fp[n++];
+	IRDA_DEBUG(4, "%s(), Value type = %d\n", __func__, type);
+
+	switch (type) {
+	case IAS_INTEGER:
+		memcpy(&tmp_cpu32, fp+n, 4); n += 4;
+		be32_to_cpus(&tmp_cpu32);
+		value = irias_new_integer_value(tmp_cpu32);
+
+		/*  Legal values restricted to 0x01-0x6f, page 15 irttp */
+		IRDA_DEBUG(4, "%s(), lsap=%d\n", __func__, value->t.integer);
+		break;
+	case IAS_STRING:
+		charset = fp[n++];
+
+		switch (charset) {
+		case CS_ASCII:
+			break;
+		/*
+		 * CS_ISO_8859_1 .. CS_ISO_8859_9 and CS_UNICODE are not
+		 * supported yet and end up here, and so does any other
+		 * value of the charset byte. Only codes inside the name
+		 * table may be used to index ias_charset_types[].
+		 */
+		default:
+			IRDA_DEBUG(0, "%s(), charset %s, not supported\n",
+				   __func__,
+				   charset < ARRAY_SIZE(ias_charset_types) ?
+				   ias_charset_types[charset] : "(unknown)");
+
+			/* Aborting, close connection! */
+			iriap_disconnect_request(self);
+			return;
+		}
+
+		/* One length byte, then the string itself */
+		value_len = fp[n++];
+		IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len);
+
+		/* Make sure the string is null-terminated */
+		if (n + value_len < skb->len)
+			fp[n + value_len] = 0x00;
+		IRDA_DEBUG(4, "Got string %s\n", fp+n);
+
+		/* Will truncate to IAS_MAX_STRING bytes */
+		value = irias_new_string_value(fp+n);
+		break;
+	case IAS_OCT_SEQ:
+		value_len = get_unaligned_be16(fp + n);
+		n += 2;
+
+		/* Will truncate to IAS_MAX_OCTET_STRING bytes */
+		value = irias_new_octseq_value(fp+n, value_len);
+		break;
+	default:
+		value = irias_new_missing_value();
+		break;
+	}
+
+	/* Finished, close connection! */
+	iriap_disconnect_request(self);
+
+	/* Warning, the client might close us, so remember not to use self
+	 * anymore after calling confirm
+	 */
+	if (self->confirm)
+		self->confirm(IAS_SUCCESS, obj_id, value, self->priv);
+	else {
+		IRDA_DEBUG(0, "%s(), missing handler!\n", __func__);
+		irias_delete_value(value);
+	}
+}
+
+/*
+ * Function iriap_getvaluebyclass_response (self, obj_id, ret_code, value)
+ *
+ *    Send answer back to remote LM-IAS: a result list with one entry that
+ *    holds obj_id and the encoded value
+ */
+static void iriap_getvaluebyclass_response(struct iriap_cb *self,
+					   __u16 obj_id,
+					   __u8 ret_code,
+					   struct ias_value *value)
+{
+	struct sk_buff *tx_skb;
+	int n;
+	__be32 tmp_be32;
+	__be16 tmp_be16;
+	__u8 *fp;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	IRDA_ASSERT(value != NULL, return;);
+	IRDA_ASSERT(value->len <= 1024, return;);
+
+	/* n is the write offset into the frame */
+	n = 0;
+
+	/*
+	 *  We must adjust the size of the response after the length of the
+	 *  value. We add 32 bytes because of the 6 bytes for the frame and
+	 *  max 5 bytes for the value coding.
+	 */
+	tx_skb = alloc_skb(value->len + self->max_header_size + 32,
+			   GFP_ATOMIC);
+	if (!tx_skb)
+		return;
+
+	/* Reserve space for MUX and LAP header */
+	skb_reserve(tx_skb, self->max_header_size);
+	skb_put(tx_skb, 6);	/* opcode, return code, list length, object id */
+
+	fp = tx_skb->data;
+
+	/* Build frame */
+	fp[n++] = GET_VALUE_BY_CLASS | IAP_LST;
+	fp[n++] = ret_code;
+
+	/* Insert list length (MSB first) */
+	tmp_be16 = htons(0x0001);
+	memcpy(fp+n, &tmp_be16, 2);  n += 2;
+
+	/* Insert object identifier ( MSB first) */
+	tmp_be16 = cpu_to_be16(obj_id);
+	memcpy(fp+n, &tmp_be16, 2); n += 2;
+
+	switch (value->type) {
+	case IAS_STRING:
+		skb_put(tx_skb, 3 + value->len);
+		fp[n++] = value->type;
+		fp[n++] = 0; /* ASCII */
+		fp[n++] = (__u8) value->len;
+		memcpy(fp+n, value->t.string, value->len); n+=value->len;
+		break;
+	case IAS_INTEGER:
+		skb_put(tx_skb, 5);
+		fp[n++] = value->type;
+
+		tmp_be32 = cpu_to_be32(value->t.integer);
+		memcpy(fp+n, &tmp_be32, 4); n += 4;
+		break;
+	case IAS_OCT_SEQ:
+		skb_put(tx_skb, 3 + value->len);
+		fp[n++] = value->type;
+
+		tmp_be16 = cpu_to_be16(value->len);
+		memcpy(fp+n, &tmp_be16, 2); n += 2;
+		memcpy(fp+n, value->t.oct_seq, value->len); n+=value->len;
+		break;
+	case IAS_MISSING:
+		IRDA_DEBUG( 3, "%s: sending IAS_MISSING\n", __func__);
+		skb_put(tx_skb, 1);
+		fp[n++] = value->type;
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), type not implemented!\n", __func__);
+		break;
+	}
+	iriap_do_r_connect_event(self, IAP_CALL_RESPONSE, tx_skb);
+
+	/* Drop reference count - see state_r_execute(). */
+	dev_kfree_skb(tx_skb);
+}
+
+/*
+ * Function iriap_getvaluebyclass_indication (self, skb)
+ *
+ *    getvaluebyclass is requested from peer LM-IAS. The name lengths come
+ *    from the peer, so they are checked even when IRDA_ASSERT is a no-op
+ */
+static void iriap_getvaluebyclass_indication(struct iriap_cb *self,
+					     struct sk_buff *skb)
+{
+	struct ias_object *obj;
+	struct ias_attrib *attrib;
+	int name_len;
+	int attr_len;
+	char name[IAS_MAX_CLASSNAME + 1];	/* name plus terminator */
+	char attr[IAS_MAX_ATTRIBNAME + 1];	/* name plus terminator */
+	__u8 *fp;
+	int n;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	fp = skb->data;
+	n = 1;
+
+	name_len = fp[n++];
+	if (name_len > IAS_MAX_CLASSNAME)	/* would overflow name[] */
+		return;
+
+	memcpy(name, fp+n, name_len); n+=name_len;
+	name[name_len] = '\0';
+
+	attr_len = fp[n++];
+	if (attr_len > IAS_MAX_ATTRIBNAME)	/* would overflow attr[] */
+		return;
+
+	memcpy(attr, fp+n, attr_len); n+=attr_len;
+	attr[attr_len] = '\0';
+
+	IRDA_DEBUG(4, "LM-IAS: Looking up %s: %s\n", name, attr);
+	obj = irias_find_object(name);
+
+	if (obj == NULL) {
+		IRDA_DEBUG(2, "LM-IAS: Object %s not found\n", name);
+		iriap_getvaluebyclass_response(self, 0x1235, IAS_CLASS_UNKNOWN,
+					       &irias_missing);
+		return;
+	}
+	IRDA_DEBUG(4, "LM-IAS: found %s, id=%d\n", obj->name, obj->id);
+
+	attrib = irias_find_attrib(obj, attr);
+	if (attrib == NULL) {
+		IRDA_DEBUG(2, "LM-IAS: Attribute %s not found\n", attr);
+		iriap_getvaluebyclass_response(self, obj->id,
+					       IAS_ATTRIB_UNKNOWN,
+					       &irias_missing);
+		return;
+	}
+
+	/* We have a match; send the value.  */
+	iriap_getvaluebyclass_response(self, obj->id, IAS_SUCCESS,
+				       attrib->value);
+}
+
+/*
+ * Function iriap_send_ack (self)
+ *
+ *    Send a one byte ack frame for self->operation. Currently not used
+ *
+ */
+void iriap_send_ack(struct iriap_cb *self)
+{
+	struct sk_buff *tx_skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Size the buffer from the same headroom that is reserved below */
+	tx_skb = alloc_skb(self->max_header_size + 1, GFP_ATOMIC);
+	if (!tx_skb)
+		return;
+
+	/* Reserve space for MUX and LAP header */
+	skb_reserve(tx_skb, self->max_header_size);
+	skb_put(tx_skb, 1);
+	frame = tx_skb->data;
+
+	/* Build frame */
+	frame[0] = IAP_LST | IAP_ACK | self->operation;
+
+	irlmp_data_request(self->lsap, tx_skb);
+}
+
+/* Connect to the peer's LSAP_IAS; failure is confirmed as IAS_DISCONNECT */
+void iriap_connect_request(struct iriap_cb *self)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	if (irlmp_connect_request(self->lsap, LSAP_IAS, self->saddr,
+				  self->daddr, NULL, NULL) >= 0)
+		return;
+
+	IRDA_DEBUG(0, "%s(), connect failed!\n", __func__);
+	/* The callback is optional, as everywhere else it is invoked */
+	if (self->confirm)
+		self->confirm(IAS_DISCONNECT, 0, NULL, self->priv);
+}
+
+/*
+ * Function iriap_connect_confirm (instance, sap, qos, max_seg_size,
+ *				   max_header_size, skb)
+ *    LSAP connection confirmed! Record the size limits passed in and feed
+ *    the event to the client state machine
+ */
+static void iriap_connect_confirm(void *instance, void *sap,
+				  struct qos_info *qos, __u32 max_seg_size,
+				  __u8 max_header_size,
+				  struct sk_buff *skb)
+{
+	struct iriap_cb *self = instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* Record the size limits passed in for this connection */
+	self->max_header_size = max_header_size;
+	self->max_data_size = max_seg_size;
+
+	/* Connected: delete the watchdog timer, then run the client FSM */
+	del_timer(&self->watchdog_timer);
+
+	iriap_do_client_event(self, IAP_LM_CONNECT_CONFIRM, skb);
+
+	/* Drop reference count - see state_s_make_call(). */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function iriap_connect_indication (instance, sap, qos, max_seg_size,
+ *				      max_header_size, skb)
+ *    Remote LM-IAS is requesting connection: serve it from a new instance
+ *    and put the listening one back into listen state
+ */
+static void iriap_connect_indication(void *instance, void *sap,
+				     struct qos_info *qos, __u32 max_seg_size,
+				     __u8 max_header_size,
+				     struct sk_buff *skb)
+{
+	struct iriap_cb *self, *new;
+
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	self = (struct iriap_cb *) instance;
+
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(self != NULL, goto out;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;);
+
+	/* Start new server */
+	new = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL);
+	if (!new) {
+		IRDA_DEBUG(0, "%s(), open failed\n", __func__);
+		goto out;
+	}
+
+	/* Now attach up the new "socket" */
+	new->lsap = irlmp_dup(self->lsap, new);
+	if (!new->lsap) {
+		IRDA_DEBUG(0, "%s(), dup failed!\n", __func__);
+		iriap_close(new);	/* unhash and free it, don't leak it */
+		goto out;
+	}
+
+	new->max_data_size = max_seg_size;
+	new->max_header_size = max_header_size;
+
+	/* Clean up the original one to keep it in listen state */
+	irlmp_listen(self->lsap);
+	iriap_do_server_event(new, IAP_LM_CONNECT_INDICATION, skb);
+
+out:
+	/* Drop reference count - see state_r_disconnect(). */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function iriap_data_indication (instance, sap, skb)
+ *
+ *    Receives data from IrLMP for the IrIAP instance passed as instance.
+ *    Drops its reference to skb before returning 0
+ */
+static int iriap_data_indication(void *instance, void *sap,
+				 struct sk_buff *skb)
+{
+	struct iriap_cb *self;
+	__u8  *frame;
+	__u8  opcode;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	self = (struct iriap_cb *) instance;
+
+	IRDA_ASSERT(skb != NULL, return 0;);
+	IRDA_ASSERT(self != NULL, goto out;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;);
+	/* NOTE(review): frame[0]/frame[1] are read without checking skb->len */
+	frame = skb->data;
+
+	if (self->mode == IAS_SERVER) {
+		/* Servers hand the frame to the r-connect state machine */
+		IRDA_DEBUG(4, "%s(), Calling server!\n", __func__);
+		iriap_do_r_connect_event(self, IAP_RECV_F_LST, skb);
+		goto out;
+	}
+	opcode = frame[0];
+	if (~opcode & IAP_LST) {
+		IRDA_WARNING("%s:, IrIAS multiframe commands or "
+			     "results is not implemented yet!\n",
+			     __func__);
+		goto out;
+	}
+
+	/* Check for ack frames since they don't contain any data */
+	if (opcode & IAP_ACK) {
+		IRDA_DEBUG(0, "%s() Got ack frame!\n", __func__);
+		goto out;
+	}
+
+	opcode &= ~IAP_LST; /* Mask away LST bit */
+
+	switch (opcode) {
+	case GET_INFO_BASE:
+		IRDA_DEBUG(0, "IrLMP GetInfoBaseDetails not implemented!\n");
+		break;
+	case GET_VALUE_BY_CLASS:
+		iriap_do_call_event(self, IAP_RECV_F_LST, NULL);
+
+		switch (frame[1]) {
+		case IAS_SUCCESS:
+			iriap_getvaluebyclass_confirm(self, skb);
+			break;
+		case IAS_CLASS_UNKNOWN:
+			IRDA_DEBUG(1, "%s(), No such class!\n", __func__);
+			/* Finished, close connection! */
+			iriap_disconnect_request(self);
+
+			/*
+			 * Warning, the client might close us, so remember
+			 * not to use self anymore after calling confirm
+			 */
+			if (self->confirm)
+				self->confirm(IAS_CLASS_UNKNOWN, 0, NULL,
+					      self->priv);
+			break;
+		case IAS_ATTRIB_UNKNOWN:
+			IRDA_DEBUG(1, "%s(), No such attribute!\n", __func__);
+			/* Finished, close connection! */
+			iriap_disconnect_request(self);
+
+			/*
+			 * Warning, the client might close us, so remember
+			 * not to use self anymore after calling confirm
+			 */
+			if (self->confirm)
+				self->confirm(IAS_ATTRIB_UNKNOWN, 0, NULL,
+					      self->priv);
+			break;
+		}
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown op-code: %02x\n", __func__,
+			   opcode);
+		break;
+	}
+
+out:
+	/* Cleanup - sub-calls will have done skb_get() as needed. */
+	dev_kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Function iriap_call_indication (self, skb)
+ *
+ *    Received call to server from peer LM-IAS; dispatch it on the
+ *    operation code found in the first byte of the frame
+ */
+void iriap_call_indication(struct iriap_cb *self, struct sk_buff *skb)
+{
+	__u8 first;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* First byte: LST flag in bit 7, operation code in the low bits */
+	first = skb->data[0];
+
+	/* Only frames with the LST bit set are handled */
+	if (!(first & 0x80)) {
+		IRDA_WARNING("%s: IrIAS multiframe commands or results "
+			     "is not implemented yet!\n", __func__);
+		return;
+	}
+
+	/* Dispatch on the operation code with the LST bit masked away */
+	switch (first & 0x7f) {
+	case GET_INFO_BASE:
+		IRDA_WARNING("%s: GetInfoBaseDetails not implemented yet!\n",
+			     __func__);
+		break;
+	case GET_VALUE_BY_CLASS:
+		iriap_getvaluebyclass_indication(self, skb);
+		break;
+	}
+	/* skb will be cleaned up in iriap_data_indication */
+}
+
+/*
+ * Function iriap_watchdog_timer_expired (data)
+ *
+ *    Query has taken too long time. For now this only sanity-checks the
+ *    instance passed as data; the abort itself is commented out
+ */
+static void iriap_watchdog_timer_expired(void *data)
+{
+	struct iriap_cb *self = (struct iriap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	/* iriap_close(self); */
+}
+
+#ifdef CONFIG_PROC_FS
+
+static const char *const ias_value_types[] = {
+	"IAS_MISSING",
+	"IAS_INTEGER",
+	"IAS_OCT_SEQ",
+	"IAS_STRING"
+};
+
+static inline struct ias_object *irias_seq_idx(loff_t pos)
+{
+	struct ias_object *cur;
+
+	/* Walk the object list until pos entries have been skipped */
+	cur = (struct ias_object *) hashbin_get_first(irias_objects);
+	while (cur && pos-- != 0)
+		cur = (struct ias_object *) hashbin_get_next(irias_objects);
+
+	/* NULL when the list holds pos entries or fewer */
+	return cur;
+}
+
+static void *irias_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	spin_lock_irq(&irias_objects->hb_spinlock);
+	/* Lock is dropped in irias_seq_stop(); position 0 is the header */
+	return *pos ? irias_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *irias_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	/* After the header token comes the first object, then the rest */
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return (void *) hashbin_get_first(irias_objects);
+	return (void *) hashbin_get_next(irias_objects);
+}
+
+static void irias_seq_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_irq(&irias_objects->hb_spinlock);
+}
+
+static int irias_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, "LM-IAS Objects:\n");
+	else {
+		struct ias_object *obj = v;
+		struct ias_attrib *attrib;
+
+		IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -EINVAL;);
+		/* One line for the object, then one entry per attribute */
+		seq_printf(seq, "name: %s, id=%d\n",
+			   obj->name, obj->id);
+
+		/* Careful for priority inversions here !
+		 * All other uses of attrib spinlock are independent of
+		 * the object spinlock, so we are safe. Jean II */
+		spin_lock(&obj->attribs->hb_spinlock);
+
+		/* List all attributes for this object */
+		for (attrib = (struct ias_attrib *) hashbin_get_first(obj->attribs);
+		     attrib != NULL;
+		     attrib = (struct ias_attrib *) hashbin_get_next(obj->attribs)) {
+			/* A bad magic ends the listing of this object */
+			IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC,
+				    goto outloop; );
+
+			seq_printf(seq, " - Attribute name: \"%s\", ",
+				   attrib->name);
+			seq_printf(seq, "value[%s]: ",
+				   ias_value_types[attrib->value->type]);
+
+			switch (attrib->value->type) {
+			case IAS_INTEGER:
+				seq_printf(seq, "%d\n",
+					   attrib->value->t.integer);
+				break;
+			case IAS_STRING:
+				seq_printf(seq, "\"%s\"\n",
+					   attrib->value->t.string);
+				break;
+			case IAS_OCT_SEQ:
+				seq_printf(seq, "octet sequence (%d bytes)\n",
+					   attrib->value->len);
+				break;
+			case IAS_MISSING:
+				seq_puts(seq, "missing\n");
+				break;
+			default:
+				seq_printf(seq, "type %d?\n",
+					   attrib->value->type);
+			}
+			seq_putc(seq, '\n');
+
+		}
+	IRDA_ASSERT_LABEL(outloop:)
+		spin_unlock(&obj->attribs->hb_spinlock);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations irias_seq_ops = {
+	.start  = irias_seq_start,
+	.next   = irias_seq_next,
+	.stop   = irias_seq_stop,
+	.show   = irias_seq_show,
+};
+
+static int irias_seq_open(struct inode *inode, struct file *file)
+{
+	IRDA_ASSERT( irias_objects != NULL, return -EINVAL;);
+	/* Attach the irias_seq_ops iterator to the opened file */
+	return seq_open(file, &irias_seq_ops);
+}
+
+const struct file_operations irias_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = irias_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release,
+};
+
+#endif /* PROC_FS */
diff --git a/net/irda/iriap_event.c b/net/irda/iriap_event.c
new file mode 100644
index 00000000..703774e2
--- /dev/null
+++ b/net/irda/iriap_event.c
@@ -0,0 +1,504 @@
+/*********************************************************************
+ *
+ * Filename:      iriap_event.c
+ * Version:       0.1
+ * Description:   IAP Finite State Machine
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Thu Aug 21 00:02:07 1997
+ * Modified at:   Wed Mar  1 11:28:34 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1997, 1999-2000 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/iriap_event.h>
+/* Client FSM handlers */
+static void state_s_disconnect   (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_connecting   (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_call         (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+/* S-Call FSM handlers */
+static void state_s_make_call    (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_calling      (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_outstanding  (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_replying     (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_s_wait_active  (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+/* Server FSM and R-Connect FSM handlers */
+static void state_r_disconnect   (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_r_call         (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_r_waiting      (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_r_wait_active  (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_r_receiving    (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_r_execute      (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+static void state_r_returning    (struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb);
+
+static void (*iriap_state[])(struct iriap_cb *self, IRIAP_EVENT event,
+			     struct sk_buff *skb) = {
+	/* Client FSM. NOTE(review): order must match IRIAP_STATE - confirm */
+	state_s_disconnect,
+	state_s_connecting,
+	state_s_call,
+
+	/* S-Call FSM (looked up through self->call_state) */
+	state_s_make_call,
+	state_s_calling,
+	state_s_outstanding,
+	state_s_replying,
+	state_s_wait_for_call,
+	state_s_wait_active,
+
+	/* Server FSM (looked up through self->server_state) */
+	state_r_disconnect,
+	state_r_call,
+
+	/* R-Connect FSM (looked up through self->r_connect_state) */
+	state_r_waiting,
+	state_r_wait_active,
+	state_r_receiving,
+	state_r_execute,
+	state_r_returning,
+};
+
+void iriap_next_client_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Record the state the client FSM is in from now on */
+	self->client_state = state;
+}
+
+void iriap_next_call_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Record the state the S-Call FSM is in from now on */
+	self->call_state = state;
+}
+
+void iriap_next_server_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Record the state the server FSM is in from now on */
+	self->server_state = state;
+}
+
+void iriap_next_r_connect_state(struct iriap_cb *self, IRIAP_STATE state)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Record the state the R-Connect FSM is in from now on */
+	self->r_connect_state = state;
+}
+
+void iriap_do_client_event(struct iriap_cb *self, IRIAP_EVENT event,
+			   struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Run the handler registered for the current client FSM state */
+	iriap_state[self->client_state](self, event, skb);
+}
+
+void iriap_do_call_event(struct iriap_cb *self, IRIAP_EVENT event,
+			 struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Run the handler registered for the current S-Call FSM state */
+	iriap_state[self->call_state](self, event, skb);
+}
+
+void iriap_do_server_event(struct iriap_cb *self, IRIAP_EVENT event,
+			   struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Run the handler registered for the current server FSM state */
+	iriap_state[self->server_state](self, event, skb);
+}
+
+void iriap_do_r_connect_event(struct iriap_cb *self, IRIAP_EVENT event,
+			      struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+	/* Run the handler registered for the current R-Connect FSM state */
+	iriap_state[self->r_connect_state](self, event, skb);
+}
+
+
+/*
+ * Function state_s_disconnect (self, event, skb)
+ *
+ *    S-Disconnect, The device has no LSAP connection to a particular
+ *    remote device. A GetValueByClass call request starts a connect.
+ */
+static void state_s_disconnect(struct iriap_cb *self, IRIAP_EVENT event,
+			       struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	switch (event) {
+	case IAP_CALL_REQUEST_GVBC:
+		/* Verify first: a failure must leave us in S-Disconnect */
+		IRDA_ASSERT(self->request_skb == NULL, return;);
+		iriap_next_client_state(self, S_CONNECTING);
+		/* Keep our own reference; see iriap_getvaluebyclass_request() */
+		skb_get(skb);
+		self->request_skb = skb;
+		iriap_connect_request(self);
+		break;
+	case IAP_LM_DISCONNECT_INDICATION:
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event);
+		break;
+	}
+}
+
+/*
+ * Function state_s_connecting (self, event, skb)
+ *
+ *    S-Connecting: a connect request is pending; wait for the
+ *    confirm (then start the call) or for a disconnect.
+ */
+static void state_s_connecting(struct iriap_cb *self, IRIAP_EVENT event,
+			       struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	switch (event) {
+	case IAP_LM_DISCONNECT_INDICATION:
+		/* Abort calls: reset both the call and the client FSM */
+		iriap_next_call_state(self, S_MAKE_CALL);
+		iriap_next_client_state(self, S_DISCONNECT);
+		break;
+	case IAP_LM_CONNECT_CONFIRM:
+		/*
+		 *  Jump to S-Call FSM with a call request, and only then
+		 *  mark the client FSM as being in S-Call
+		 */
+		iriap_do_call_event(self, IAP_CALL_REQUEST, skb);
+		iriap_next_client_state(self, S_CALL);
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event);
+		break;
+	}
+}
+
+/*
+ * Function state_s_call (self, event, skb)
+ *
+ *    S-Call, The device can process calls to a specific remote
+ *    device. Whenever the LSAP connection is disconnected, this state
+ *    catches that event and clears up
+ */
+static void state_s_call(struct iriap_cb *self, IRIAP_EVENT event,
+			 struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+
+	/* Anything but a disconnect indication is merely logged */
+	if (event != IAP_LM_DISCONNECT_INDICATION) {
+		IRDA_DEBUG(0, "state_s_call: Unknown event %d\n", event);
+		return;
+	}
+
+	/* Abort calls: rewind the call FSM to S-Make-Call and drop the
+	 * client FSM back to S-Disconnect */
+	iriap_next_call_state(self, S_MAKE_CALL);
+	iriap_next_client_state(self, S_DISCONNECT);
+}
+
+/*
+ * Function state_s_make_call (self, event, skb)
+ *
+ *    S-Make-Call: a call request sends the skb that was parked in
+ *    self->request_skb and moves the call FSM to S-Outstanding
+ */
+static void state_s_make_call(struct iriap_cb *self, IRIAP_EVENT event,
+			      struct sk_buff *skb)
+{
+	struct sk_buff *request;
+
+	IRDA_ASSERT(self != NULL, return;);
+
+	if (event != IAP_CALL_REQUEST) {
+		IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event);
+		return;
+	}
+
+	/* Already refcounted - see state_s_disconnect(). Detach the
+	 * request from the control block first, then hand it to IrLMP.
+	 * NOTE(review): assumes request_skb is non-NULL here - confirm */
+	request = self->request_skb;
+	self->request_skb = NULL;
+
+	irlmp_data_request(self->lsap, request);
+	iriap_next_call_state(self, S_OUTSTANDING);
+}
+
+/*
+ * Function state_s_calling (self, event, skb)
+ *
+ *    S-Calling, not implemented: the handler only emits a
+ *    debug message and ignores both event and skb
+ */
+static void state_s_calling(struct iriap_cb *self, IRIAP_EVENT event,
+			    struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), Not implemented\n", __func__);
+}
+
+/*
+ * Function state_s_outstanding (self, event, skb)
+ *
+ *    S-Outstanding, The device is waiting for a response to a command
+ *
+ */
+static void state_s_outstanding(struct iriap_cb *self, IRIAP_EVENT event,
+				struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+
+	if (event != IAP_RECV_F_LST) {
+		IRDA_DEBUG(0, "%s(), Unknown event %d\n", __func__, event);
+		return;
+	}
+
+	/*
+	 * IAP_RECV_F_LST moves the call FSM on to S-Wait-for-Call.
+	 * Sending an ack and LM_Idle_request() were both left
+	 * disabled in this handler, so nothing else happens.
+	 */
+	iriap_next_call_state(self, S_WAIT_FOR_CALL);
+}
+
+/*
+ * Function state_s_replying (self, event, skb)
+ *    S-Replying, The device is collecting a multiple part response.
+ *    Not implemented: the handler only emits a debug message.
+ */
+static void state_s_replying(struct iriap_cb *self, IRIAP_EVENT event,
+			     struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), Not implemented\n", __func__);
+}
+
+/*
+ * Function state_s_wait_for_call (self, event, skb)
+ *
+ *    S-Wait-for-Call, not implemented: the handler only emits a
+ *    debug message and ignores both event and skb
+ */
+static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event,
+				  struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), Not implemented\n", __func__);
+}
+
+
+/*
+ * Function state_s_wait_active (self, event, skb)
+ *
+ *    S-Wait-Active, not implemented: the handler only emits a
+ *    debug message and ignores both event and skb
+ */
+static void state_s_wait_active(struct iriap_cb *self, IRIAP_EVENT event,
+				struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), Not implemented\n", __func__);
+}
+
+/**************************************************************************
+ *
+ *  Server FSM
+ *
+ **************************************************************************/
+
+/*
+ * Function state_r_disconnect (self, event, skb)
+ *
+ *    LM-IAS server is disconnected (not processing any requests!)
+ *    A connect indication is answered and starts the R-Connect FSM.
+ */
+static void state_r_disconnect(struct iriap_cb *self, IRIAP_EVENT event,
+			       struct sk_buff *skb)
+{
+	struct sk_buff *reply;
+
+	if (event != IAP_LM_CONNECT_INDICATION) {
+		IRDA_DEBUG(0, "%s(), unknown event %d\n", __func__, event);
+		return;
+	}
+
+	/* The connect response travels in a frame with headroom only */
+	reply = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+	if (!reply) {
+		IRDA_WARNING("%s: unable to malloc!\n", __func__);
+		return;
+	}
+
+	/* Reserve space for MUX_CONTROL and LAP header */
+	skb_reserve(reply, LMP_MAX_HEADER);
+
+	irlmp_connect_response(self->lsap, reply);
+
+	/* The response has been handed over, so the server FSM can
+	 * move to R-Call (LM_Idle_request() is not issued) */
+	iriap_next_server_state(self, R_CALL);
+
+	/*
+	 *  Jump to R-Connect FSM, we skip R-Waiting since we do not
+	 *  care about LM_Idle_request()!
+	 */
+	iriap_next_r_connect_state(self, R_RECEIVING);
+}
+
+/*
+ * Function state_r_call (self, event, skb)
+ */
+static void state_r_call(struct iriap_cb *self, IRIAP_EVENT event,
+			 struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	if (event != IAP_LM_DISCONNECT_INDICATION) {
+		IRDA_DEBUG(0, "%s(), unknown event!\n", __func__);
+		return;
+	}
+
+	/*
+	 * Abort call: server FSM to R-Disconnect, R-Connect FSM to R-Waiting
+	 */
+	iriap_next_server_state(self, R_DISCONNECT);
+	iriap_next_r_connect_state(self, R_WAITING);
+}
+
+/*
+ *  R-Connect FSM
+ */
+
+/* Function state_r_waiting (self, event, skb)
+ *    R-Waiting, not implemented: only a debug message is emitted */
+static void state_r_waiting(struct iriap_cb *self, IRIAP_EVENT event,
+			    struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), Not implemented\n", __func__);
+}
+
+/* state_r_wait_active: not implemented either, debug message only */
+static void state_r_wait_active(struct iriap_cb *self, IRIAP_EVENT event,
+				struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), Not implemented\n", __func__);
+}
+
+/*
+ * Function state_r_receiving (self, event, skb)
+ *
+ *    We are receiving a command
+ *
+ */
+static void state_r_receiving(struct iriap_cb *self, IRIAP_EVENT event,
+			      struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	if (event != IAP_RECV_F_LST) {
+		IRDA_DEBUG(0, "%s(), unknown event!\n", __func__);
+		return;
+	}
+
+	/*
+	 * Enter R-Execute first, then pass the frame to the call handler
+	 */
+	iriap_next_r_connect_state(self, R_EXECUTE);
+	iriap_call_indication(self, skb);
+}
+
+/*
+ * Function state_r_execute (self, event, skb)
+ *
+ *    The server is processing the request
+ *
+ */
+static void state_r_execute(struct iriap_cb *self, IRIAP_EVENT event,
+			    struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IAS_MAGIC, return;);
+
+	if (event != IAP_CALL_RESPONSE) {
+		IRDA_DEBUG(0, "%s(), unknown event!\n", __func__);
+		return;
+	}
+
+	/*
+	 *  Since we don't implement the Waiting state, we return
+	 *  to state Receiving instead, DB.
+	 */
+	iriap_next_r_connect_state(self, R_RECEIVING);
+
+	/*
+	 * Take an extra reference before sending - see
+	 * iriap_getvaluebyclass_response().
+	 */
+	skb_get(skb);
+
+	irlmp_data_request(self->lsap, skb);
+}
+
+/*
+ * Function state_r_returning (self, event, skb)
+ *
+ *    R-Returning, not implemented: every event (IAP_RECV_F_LST
+ *    included) is only traced and otherwise ignored
+ *
+ */
+static void state_r_returning(struct iriap_cb *self, IRIAP_EVENT event,
+			      struct sk_buff *skb)
+{
+	IRDA_DEBUG(0, "%s(), event=%d\n", __func__, event);
+}
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c
new file mode 100644
index 00000000..f07ed9fd
--- /dev/null
+++ b/net/irda/irias_object.c
@@ -0,0 +1,567 @@
+/*********************************************************************
+ *
+ * Filename:      irias_object.c
+ * Version:       0.3
+ * Description:   IAS object database and functions
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Thu Oct  1 22:50:04 1998
+ * Modified at:   Wed Dec 15 11:23:16 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/module.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irias_object.h>
+
+hashbin_t *irias_objects;	/* hashbin holding the IAS objects */
+
+/*
+ *  Used when a missing value needs to be returned (type IAS_MISSING)
+ */
+struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}};
+
+
+/*
+ * Function irias_new_object (name, id)
+ *
+ *    Create a new IAS object carrying a private copy of the class name
+ *    and an empty attribute hashbin. Returns NULL on allocation failure.
+ */
+struct ias_object *irias_new_object( char *name, int id)
+{
+	struct ias_object *object;
+
+	IRDA_DEBUG( 4, "%s()\n", __func__);
+
+	object = kzalloc(sizeof(*object), GFP_ATOMIC);
+	if (!object) {
+		IRDA_WARNING("%s(), Unable to allocate object!\n",
+			     __func__);
+		return NULL;
+	}
+	object->magic = IAS_OBJECT_MAGIC;
+	object->id = id;
+
+	object->name = kstrndup(name, IAS_MAX_CLASSNAME, GFP_ATOMIC);
+	if (!object->name) {
+		IRDA_WARNING("%s(), Unable to allocate name!\n",
+			     __func__);
+		goto free_object;
+	}
+
+	/* Locking notes : the attrib spinlock has lower precedence
+	 * than the objects spinlock. Never grab the objects spinlock
+	 * while holding any attrib spinlock (risk of deadlock). Jean II */
+	object->attribs = hashbin_new(HB_LOCK);
+	if (!object->attribs) {
+		IRDA_WARNING("%s(), Unable to allocate attribs!\n",
+			     __func__);
+		kfree(object->name);
+		goto free_object;
+	}
+	return object;
+
+free_object:
+	kfree(object);
+	return NULL;
+}
+EXPORT_SYMBOL(irias_new_object);
+
+/*
+ * Function __irias_delete_attrib (attrib)
+ *
+ *    Free the given attribute together with its name and its value;
+ *    the attribute is not looked up in or removed from any hashbin here
+ */
+static void __irias_delete_attrib(struct ias_attrib *attrib)
+{
+	IRDA_ASSERT(attrib != NULL, return;);
+	IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;);
+
+	/* Invalidate the magic before the memory is given back */
+	attrib->magic = ~IAS_ATTRIB_MAGIC;
+
+	irias_delete_value(attrib->value);
+	kfree(attrib->name);
+	kfree(attrib);
+}
+
+void __irias_delete_object(struct ias_object *obj)
+{
+	IRDA_ASSERT(obj != NULL, return;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+	/* Every attribute is released through __irias_delete_attrib() */
+	hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib);
+
+	/* Invalidate the magic, then free the name and the object itself */
+	obj->magic = ~IAS_OBJECT_MAGIC;
+	kfree(obj->name);
+	kfree(obj);
+}
+
+/*
+ * Function irias_delete_object (obj)
+ *
+ *    Remove object from the irias_objects hashbin and deallocate all
+ *    attributes associated with this object and the object itself.
+ *    Returns 0, or -1 when an assert on obj fails.
+ */
+int irias_delete_object(struct ias_object *obj)
+{
+	IRDA_ASSERT(obj != NULL, return -1;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;);
+
+	/*
+	 * Unlink from the database; a miss is only reported, the
+	 * object is destroyed regardless.
+	 */
+	if (!hashbin_remove_this(irias_objects, (irda_queue_t *) obj))
+		IRDA_DEBUG( 0, "%s(), object already removed!\n",
+			    __func__);
+
+	/* Destroy */
+	__irias_delete_object(obj);
+
+	return 0;
+}
+EXPORT_SYMBOL(irias_delete_object);
+
+/*
+ * Function irias_delete_attrib (obj, attrib, cleanobject)
+ *
+ *    Remove attribute from hashbin and, if cleanobject is set and it was
+ *    the last attribute of the object, remove the object as well.
+ *
+ */
+int irias_delete_attrib(struct ias_object *obj, struct ias_attrib *attrib,
+			int cleanobject)
+{
+	struct ias_attrib *entry;
+
+	IRDA_ASSERT(obj != NULL, return -1;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;);
+	IRDA_ASSERT(attrib != NULL, return -1;);
+
+	/* Unlink the attribute from the object's hashbin */
+	entry = hashbin_remove_this(obj->attribs, (irda_queue_t *) attrib);
+	if (entry == NULL)
+		return 0; /* Already removed or non-existent */
+
+	/* Free what was unlinked */
+	__irias_delete_attrib(entry);
+
+	/* Check if the object still has some attributes, destroy it if
+	 * none. At first glance, this looks dangerous, as the kernel
+	 * references various IAS objects. However, we only use this
+	 * function on user attributes, not kernel attributes, so there is
+	 * no risk of deleting a kernel object this way. Jean II */
+	entry = (struct ias_attrib *) hashbin_get_first(obj->attribs);
+	if (entry == NULL && cleanobject)
+		irias_delete_object(obj);
+
+	return 0;
+}
+
+/*
+ * Function irias_insert_object (obj)
+ *
+ *    Insert an object into the LM-IAS database (the irias_objects
+ *    hashbin), passing hash value 0 and the object name as the key
+ */
+void irias_insert_object(struct ias_object *obj)
+{
+	IRDA_ASSERT(obj != NULL, return;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+	hashbin_insert(irias_objects, (irda_queue_t *) obj, 0, obj->name);
+}
+EXPORT_SYMBOL(irias_insert_object);
+
+/*
+ * Function irias_find_object (name)
+ *
+ *    Find object with given name in the irias_objects hashbin; the
+ *    result of hashbin_lock_find() is returned unchanged
+ */
+struct ias_object *irias_find_object(char *name)
+{
+	IRDA_ASSERT(name != NULL, return NULL;);
+
+	/* Unsafe (locking), object might change once it is returned */
+	return hashbin_lock_find(irias_objects, 0, name);
+}
+EXPORT_SYMBOL(irias_find_object);
+
+/*
+ * Function irias_find_attrib (obj, name)
+ *
+ *    Find named attribute in object. Returns NULL when there is no
+ *    such attribute (or when one of the asserts fails).
+ */
+struct ias_attrib *irias_find_attrib(struct ias_object *obj, char *name)
+{
+	IRDA_ASSERT(obj != NULL, return NULL;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return NULL;);
+	IRDA_ASSERT(name != NULL, return NULL;);
+
+	/*
+	 * Unsafe (locking), attrib might change: whatever protection
+	 * hashbin_lock_find() provides presumably covers the lookup
+	 * only, and nothing here guards the attribute once the pointer
+	 * has been handed back to the caller. A miss comes back as
+	 * NULL straight from the lookup.
+	 */
+	return hashbin_lock_find(obj->attribs, 0, name);
+}
+
+/*
+ * Function irias_add_attrib (obj, attrib, owner)
+ *
+ *    Add attribute to object: tag the attribute value with its owner
+ *    and insert the attribute into obj->attribs under its name
+ */
+static void irias_add_attrib(struct ias_object *obj, struct ias_attrib *attrib,
+			     int owner)
+{
+	IRDA_ASSERT(obj != NULL, return;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+
+	IRDA_ASSERT(attrib != NULL, return;);
+	IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;);
+
+	/* Set if attrib is owned by kernel or user space */
+	attrib->value->owner = owner;
+	/* Hash value 0 is passed; attrib->name is the lookup key */
+	hashbin_insert(obj->attribs, (irda_queue_t *) attrib, 0, attrib->name);
+}
+
+/*
+ * Function irias_object_change_attribute (obj_name, attrib_name, new_value)
+ *
+ *    Change the value of an object's attribute. Returns 0 once the
+ *    attribute owns new_value, -1 (new_value untouched) otherwise.
+ */
+int irias_object_change_attribute(char *obj_name, char *attrib_name,
+				  struct ias_value *new_value)
+{
+	struct ias_object *obj;
+	struct ias_attrib *attrib;
+	unsigned long flags;
+	int ret = -1;
+
+	/* Find object */
+	obj = hashbin_lock_find(irias_objects, 0, obj_name);
+	if (!obj) {
+		IRDA_WARNING("%s: Unable to find object: %s\n", __func__,
+			     obj_name);
+		return -1;
+	}
+
+	/* Slightly unsafe (obj might get removed under us) */
+	spin_lock_irqsave(&obj->attribs->hb_spinlock, flags);
+
+	/* Find attribute */
+	attrib = hashbin_find(obj->attribs, 0, attrib_name);
+	if (!attrib) {
+		IRDA_WARNING("%s: Unable to find attribute: %s\n",
+			     __func__, attrib_name);
+		goto unlock;
+	}
+
+	/* The type of an existing attribute is fixed */
+	if (new_value->type != attrib->value->type) {
+		IRDA_DEBUG( 0, "%s(), changing value type not allowed!\n",
+			    __func__);
+		goto unlock;
+	}
+
+	/* Swap the values while the attrib spinlock is held: the old
+	 * one is freed, the new one is linked in its place */
+	irias_delete_value(attrib->value);
+	attrib->value = new_value;
+	ret = 0;
+
+unlock:
+	spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(irias_object_change_attribute);
+
+/*
+ * Function irias_add_integer_attrib (obj, name, value, owner)
+ *
+ *    Add an integer attribute to an LM-IAS object
+ *
+ */
+void irias_add_integer_attrib(struct ias_object *obj, char *name, int value,
+			      int owner)
+{
+	struct ias_attrib *entry;
+
+	IRDA_ASSERT(obj != NULL, return;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+	IRDA_ASSERT(name != NULL, return;);
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		IRDA_WARNING("%s: Unable to allocate attribute!\n",
+			     __func__);
+		return;
+	}
+
+	/* Build name and value first, then check both together */
+	entry->magic = IAS_ATTRIB_MAGIC;
+	entry->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
+	entry->value = irias_new_integer_value(value);
+	if (entry->name && entry->value) {
+		irias_add_attrib(obj, entry, owner);
+		return;
+	}
+
+	/* One of the allocations failed: undo whatever succeeded */
+	IRDA_WARNING("%s: Unable to allocate attribute!\n",
+		     __func__);
+	if (entry->value)
+		irias_delete_value(entry->value);
+	kfree(entry->name);
+	kfree(entry);
+}
+EXPORT_SYMBOL(irias_add_integer_attrib);
+
+/*
+ * Function irias_add_octseq_attrib (obj, name, octets, len, owner)
+ *
+ *    Add a octet sequence attribute to an LM-IAS object
+ *
+ */
+
+void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets,
+			     int len, int owner)
+{
+	struct ias_attrib *entry;
+
+	IRDA_ASSERT(obj != NULL, return;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+	IRDA_ASSERT(name != NULL, return;);
+	IRDA_ASSERT(octets != NULL, return;);
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		IRDA_WARNING("%s: Unable to allocate attribute!\n",
+			     __func__);
+		return;
+	}
+
+	/* Build name and value first, then check both together */
+	entry->magic = IAS_ATTRIB_MAGIC;
+	entry->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
+	entry->value = irias_new_octseq_value( octets, len);
+	if (entry->name && entry->value) {
+		irias_add_attrib(obj, entry, owner);
+		return;
+	}
+
+	/* One of the allocations failed: undo whatever succeeded */
+	IRDA_WARNING("%s: Unable to allocate attribute!\n",
+		     __func__);
+	if (entry->value)
+		irias_delete_value(entry->value);
+	kfree(entry->name);
+	kfree(entry);
+}
+EXPORT_SYMBOL(irias_add_octseq_attrib);
+
+/*
+ * Function irias_add_string_attrib (obj, name, value, owner)
+ *
+ *    Add a string attribute to an LM-IAS object
+ *
+ */
+void irias_add_string_attrib(struct ias_object *obj, char *name, char *value,
+			     int owner)
+{
+	struct ias_attrib *entry;
+
+	IRDA_ASSERT(obj != NULL, return;);
+	IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
+	IRDA_ASSERT(name != NULL, return;);
+	IRDA_ASSERT(value != NULL, return;);
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry) {
+		IRDA_WARNING("%s: Unable to allocate attribute!\n",
+			     __func__);
+		return;
+	}
+
+	/* Build name and value first, then check both together */
+	entry->magic = IAS_ATTRIB_MAGIC;
+	entry->name = kstrndup(name, IAS_MAX_ATTRIBNAME, GFP_ATOMIC);
+	entry->value = irias_new_string_value(value);
+	if (entry->name && entry->value) {
+		irias_add_attrib(obj, entry, owner);
+		return;
+	}
+
+	/* One of the allocations failed: undo whatever succeeded */
+	IRDA_WARNING("%s: Unable to allocate attribute!\n",
+		     __func__);
+	if (entry->value)
+		irias_delete_value(entry->value);
+	kfree(entry->name);
+	kfree(entry);
+}
+EXPORT_SYMBOL(irias_add_string_attrib);
+
+/*
+ * Function irias_new_integer_value (integer)
+ *
+ *    Create new IAS integer value; returns NULL if the allocation fails
+ *
+ */
+struct ias_value *irias_new_integer_value(int integer)
+{
+	struct ias_value *val;
+
+	val = kzalloc(sizeof(*val), GFP_ATOMIC);
+	if (!val) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	val->t.integer = integer;
+	val->len = 4;
+	val->type = IAS_INTEGER;
+
+	return val;
+}
+EXPORT_SYMBOL(irias_new_integer_value);
+
+/*
+ * Function irias_new_string_value (string)
+ *
+ *    Create new IAS string value holding a private copy of string,
+ *    cut at IAS_MAX_STRING. Returns NULL if an allocation fails.
+ * Per IrLMP 1.1, 4.3.3.2, strings are up to 256 chars - Jean II
+ */
+struct ias_value *irias_new_string_value(char *string)
+{
+	struct ias_value *val;
+
+	val = kzalloc(sizeof(*val), GFP_ATOMIC);
+	if (!val) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	val->t.string = kstrndup(string, IAS_MAX_STRING, GFP_ATOMIC);
+	if (!val->t.string) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		kfree(val);
+		return NULL;
+	}
+
+	val->type = IAS_STRING;
+	val->charset = CS_ASCII;
+	val->len = strlen(val->t.string);
+
+	return val;
+}
+
+/*
+ * Function irias_new_octseq_value (octets, len)
+ *
+ *    Create new IAS octet-sequence value holding a private copy of at
+ *    most IAS_MAX_OCTET_STRING bytes. Returns NULL on allocation failure.
+ * Per IrLMP 1.1, 4.3.3.2, octet-sequence are up to 1024 bytes - Jean II
+ */
+struct ias_value *irias_new_octseq_value(__u8 *octseq , int len)
+{
+	struct ias_value *val;
+
+	/* Clamp the length to the limit before anything is allocated */
+	if (len > IAS_MAX_OCTET_STRING)
+		len = IAS_MAX_OCTET_STRING;
+
+	val = kzalloc(sizeof(*val), GFP_ATOMIC);
+	if (!val) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	val->t.oct_seq = kmemdup(octseq, len, GFP_ATOMIC);
+	if (!val->t.oct_seq) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		kfree(val);
+		return NULL;
+	}
+	val->type = IAS_OCT_SEQ;
+	val->len = len;
+	return val;
+}
+
+struct ias_value *irias_new_missing_value(void)
+{
+	struct ias_value *val;
+
+	val = kzalloc(sizeof(*val), GFP_ATOMIC);
+	if (!val) {
+		IRDA_WARNING("%s: Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	/* Only the type is set; kzalloc() left everything else zeroed */
+	val->type = IAS_MISSING;
+	return val;
+}
+
+/*
+ * Function irias_delete_value (value)
+ *
+ *    Delete IAS value, including the string or octet buffer it holds
+ *
+ */
+void irias_delete_value(struct ias_value *value)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(value != NULL, return;);
+
+	switch (value->type) {
+	case IAS_STRING:
+		/* The string buffer is owned by the value */
+		kfree(value->t.string);
+		break;
+	case IAS_OCT_SEQ:
+		/* So is the byte stream */
+		kfree(value->t.oct_seq);
+		break;
+	case IAS_MISSING: /* Fallthrough */
+	case IAS_INTEGER:
+		/* Nothing was allocated besides the value itself */
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown value type!\n", __func__);
+		break;
+	}
+	kfree(value);
+}
+EXPORT_SYMBOL(irias_delete_value);
diff --git a/net/irda/irlan/Kconfig b/net/irda/irlan/Kconfig
new file mode 100644
index 00000000..951abc2e
--- /dev/null
+++ b/net/irda/irlan/Kconfig
@@ -0,0 +1,14 @@
+config IRLAN
+	tristate "IrLAN protocol"
+	depends on IRDA
+	help
+	  Say Y here if you want to build support for the IrLAN protocol.
+	  To compile it as a module, choose M here: the module will be called
+	  irlan.  IrLAN emulates an Ethernet and makes it possible to put up
+	  a wireless LAN using infrared beams.
+
+	  The IrLAN protocol can be used to talk with infrared access points
+	  like the HP NetbeamIR, or the ESI JetEye NET.  You can also connect
+	  to another Linux machine running the IrLAN protocol for ad-hoc
+	  networking!
+
diff --git a/net/irda/irlan/Makefile b/net/irda/irlan/Makefile
new file mode 100644
index 00000000..94eefbc8
--- /dev/null
+++ b/net/irda/irlan/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux IrDA IrLAN protocol layer.
+#
+
+obj-$(CONFIG_IRLAN) += irlan.o
+
+irlan-y := irlan_common.o irlan_eth.o irlan_event.o irlan_client.o irlan_provider.o irlan_filter.o irlan_provider_event.o irlan_client_event.o
diff --git a/net/irda/irlan/irlan_client.c b/net/irda/irlan/irlan_client.c
new file mode 100644
index 00000000..7ed3af95
--- /dev/null
+++ b/net/irda/irlan/irlan_client.c
@@ -0,0 +1,576 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_client.c
+ * Version:       0.9
+ * Description:   IrDA LAN Access Protocol (IrLAN) Client
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 31 20:14:37 1997
+ * Modified at:   Tue Dec 14 15:47:02 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Sources:       skeleton.c by Donald Becker <becker@CESDIS.gsfc.nasa.gov>
+ *                slip.c by Laurence Culhane, <loz@holmes.demon.co.uk>
+ *                          Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/bitops.h>
+#include <net/arp.h>
+
+#include <asm/system.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/iriap.h>
+#include <net/irda/timer.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_event.h>
+#include <net/irda/irlan_eth.h>
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_client.h>
+
+#undef CONFIG_IRLAN_GRATUITOUS_ARP
+
+static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap,
+						    LM_REASON reason,
+						    struct sk_buff *);
+static int irlan_client_ctrl_data_indication(void *instance, void *sap,
+					     struct sk_buff *skb);
+static void irlan_client_ctrl_connect_confirm(void *instance, void *sap,
+					      struct qos_info *qos,
+					      __u32 max_sdu_size,
+					      __u8 max_header_size,
+					      struct sk_buff *);
+static void irlan_check_response_param(struct irlan_cb *self, char *param,
+				       char *value, int val_len);
+static void irlan_client_open_ctrl_tsap(struct irlan_cb *self);
+
+/*
+ * Kick timer callback. 'data' is the irlan instance the timer was started
+ * for; an idle client sitting next to an active peer-mode provider is woken
+ * up again with the addresses already stored in the instance.
+ */
+static void irlan_client_kick_timer_expired(void *data)
+{
+	struct irlan_cb *self = data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/*
+	 * If we are in peer mode, the client may not have got the discovery
+	 * indication it needs to make progress. Only a client that is still
+	 * IDLE while the provider is not IDLE needs the kick; every other
+	 * combination is left alone.
+	 */
+	if (self->provider.access_type != ACCESS_PEER)
+		return;
+	if (self->client.state != IRLAN_IDLE)
+		return;
+	if (self->provider.state == IRLAN_IDLE)
+		return;
+
+	irlan_client_wakeup(self, self->saddr, self->daddr);
+}
+
+/*
+ * Start the client kick timer with the given timeout, registering
+ * irlan_client_kick_timer_expired() as its callback and this instance as
+ * the callback data.
+ */
+static void irlan_client_start_kick_timer(struct irlan_cb *self, int timeout)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	irda_start_timer(&self->client.kick_timer, timeout, self,
+			 irlan_client_kick_timer_expired);
+}
+
+/*
+ * Function irlan_client_wakeup (self, saddr, daddr)
+ *
+ *    Wake up an idle client: store the (possibly changed) addresses, open
+ *    the control and data TSAPs, feed a discovery indication to the client
+ *    state machine and start the kick timer. Nothing is done when the
+ *    client is not idle or the provider uses direct access. When the last
+ *    disconnect was a user request only the addresses are updated.
+ *
+ */
+void irlan_client_wakeup(struct irlan_cb *self, __u32 saddr, __u32 daddr)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/*
+	 * Only an idle client next to a provider that is not in direct mode
+	 * may be woken up; in direct mode the client must stay idle.
+	 */
+	if (!(self->client.state == IRLAN_IDLE &&
+	      self->provider.access_type != ACCESS_DIRECT)) {
+		IRDA_DEBUG(0, "%s(), already awake!\n", __func__ );
+		return;
+	}
+
+	/* Addresses may have changed! */
+	self->daddr = daddr;
+	self->saddr = saddr;
+
+	if (self->disconnect_reason == LM_USER_REQUEST) {
+		IRDA_DEBUG(0, "%s(), still stopped by user\n", __func__ );
+		return;
+	}
+
+	/* Open TSAPs */
+	irlan_client_open_ctrl_tsap(self);
+	irlan_open_data_tsap(self);
+
+	irlan_do_client_event(self, IRLAN_DISCOVERY_INDICATION, NULL);
+
+	/* Start kick timer */
+	irlan_client_start_kick_timer(self, 2*HZ);
+}
+
+/*
+ * Function irlan_client_discovery_indication (discovery, mode, priv)
+ *
+ *    Remote device with IrLAN server support discovered. Passive
+ *    discoveries are dropped; otherwise the instance returned by
+ *    irlan_get_any(), if there is one, is woken up with the source and
+ *    destination addresses taken from the discovery. 'priv' is not used.
+ *
+ */
+void irlan_client_discovery_indication(discinfo_t *discovery,
+				       DISCOVERY_MODE mode,
+				       void *priv)
+{
+	struct irlan_cb *self;
+	__u32 saddr, daddr;
+
+	IRDA_DEBUG(1, "%s()\n", __func__ );
+
+	IRDA_ASSERT(discovery != NULL, return;);
+
+	/*
+	 * I didn't check it, but I bet that IrLAN suffers from the same
+	 * deficiency as IrComm and doesn't handle two instances
+	 * simultaneously connecting to each other.
+	 * Same workaround, drop passive discoveries.
+	 * Jean II */
+	if(mode == DISCOVERY_PASSIVE)
+		return;
+
+	saddr = discovery->saddr;
+	daddr = discovery->daddr;
+
+	/* Find instance; the lookup and the wakeup run under rcu_read_lock() */
+	rcu_read_lock();
+	self = irlan_get_any();
+	if (self) {
+		/* A bad magic skips the wakeup but still drops the RCU lock */
+		IRDA_ASSERT(self->magic == IRLAN_MAGIC, goto out;);
+
+		IRDA_DEBUG(1, "%s(), Found instance (%08x)!\n", __func__ ,
+		      daddr);
+
+		irlan_client_wakeup(self, saddr, daddr);
+	}
+	/*
+	 * Target of the "goto out" in the assertion above.
+	 * NOTE(review): presumably IRDA_ASSERT_LABEL() only emits the label
+	 * when assertions are compiled in - confirm in <net/irda/irda.h>.
+	 */
+IRDA_ASSERT_LABEL(out:)
+	rcu_read_unlock();
+}
+
+/*
+ * Function irlan_client_ctrl_data_indication (instance, sap, skb)
+ *
+ *    Data received on the client control channel. The frame is passed to
+ *    the client state machine as IRLAN_DATA_INDICATION, then tx_busy is
+ *    cleared and the control transmit queue is run. 'sap' is not used.
+ *    Returns 0, or -1 when an assertion on the arguments fails.
+ *
+ */
+static int irlan_client_ctrl_data_indication(void *instance, void *sap,
+					     struct sk_buff *skb)
+{
+	struct irlan_cb *self = instance;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	/* Let the state machine look at the response first */
+	irlan_do_client_event(self, IRLAN_DATA_INDICATION, skb);
+
+	/* Ready for a new command */
+	IRDA_DEBUG(2, "%s(), clearing tx_busy\n", __func__ );
+	self->client.tx_busy = FALSE;
+
+	/* Send whatever commands were queued while we were busy */
+	irlan_run_ctrl_tx_queue(self);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_ctrl_disconnect_indication (instance, sap, reason,
+ *                                                   userdata)
+ *
+ *    The client control channel was disconnected. All frames still queued
+ *    for the control channel are dropped, tx_busy is cleared and the client
+ *    state machine is told about the disconnect.
+ *
+ *    NOTE(review): 'userdata' is neither used nor freed here - confirm
+ *    whether the caller or this callback owns that buffer.
+ *
+ */
+static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap,
+						    LM_REASON reason,
+						    struct sk_buff *userdata)
+{
+	struct irlan_cb *self = instance;
+	struct tsap_cb *tsap = sap;
+	struct sk_buff *queued;
+
+	IRDA_DEBUG(4, "%s(), reason=%d\n", __func__ , reason);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+	IRDA_ASSERT(tsap != NULL, return;);
+	IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;);
+
+	IRDA_ASSERT(tsap == self->client.tsap_ctrl, return;);
+
+	/* Remove frames queued on the control channel */
+	for (;;) {
+		queued = skb_dequeue(&self->client.txq);
+		if (queued == NULL)
+			break;
+		dev_kfree_skb(queued);
+	}
+	self->client.tx_busy = FALSE;
+
+	irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL);
+}
+
+/*
+ * Function irlan_client_open_ctrl_tsap (self)
+ *
+ *    Open the IrTTP TSAP for the client control channel and register the
+ *    client control callbacks on it. Does nothing when the TSAP is already
+ *    open; when no TSAP can be obtained client.tsap_ctrl stays NULL.
+ *
+ */
+static void irlan_client_open_ctrl_tsap(struct irlan_cb *self)
+{
+	notify_t notify;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Check if already open */
+	if (self->client.tsap_ctrl != NULL)
+		return;
+
+	/* Describe ourselves and our callbacks to IrTTP */
+	irda_notify_init(&notify);
+	strlcpy(notify.name, "IrLAN ctrl (c)", sizeof(notify.name));
+	notify.instance = self;
+	notify.disconnect_indication = irlan_client_ctrl_disconnect_indication;
+	notify.connect_confirm       = irlan_client_ctrl_connect_confirm;
+	notify.data_indication       = irlan_client_ctrl_data_indication;
+
+	self->client.tsap_ctrl = irttp_open_tsap(LSAP_ANY,
+						 DEFAULT_INITIAL_CREDIT,
+						 &notify);
+	if (self->client.tsap_ctrl == NULL)
+		IRDA_DEBUG(2, "%s(), Got no tsap!\n", __func__ );
+}
+
+/*
+ * Function irlan_client_ctrl_connect_confirm (instance, sap, qos,
+ *                                             max_sdu_size,
+ *                                             max_header_size, skb)
+ *
+ *    Connection to peer IrLAN layer confirmed. The negotiated SDU and
+ *    header sizes are stored in the client and the state machine is told
+ *    that the connect completed. 'sap' and 'qos' are not used.
+ *
+ *    NOTE(review): 'skb' is neither used nor freed here - confirm whether
+ *    the caller or this callback owns that buffer.
+ *
+ */
+static void irlan_client_ctrl_connect_confirm(void *instance, void *sap,
+					      struct qos_info *qos,
+					      __u32 max_sdu_size,
+					      __u8 max_header_size,
+					      struct sk_buff *skb)
+{
+	struct irlan_cb *self = instance;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Remember the limits of the control channel */
+	self->client.max_header_size = max_header_size;
+	self->client.max_sdu_size = max_sdu_size;
+
+	/* TODO: we could set the MTU depending on the max_sdu_size */
+
+	irlan_do_client_event(self, IRLAN_CONNECT_COMPLETE, NULL);
+}
+
+/*
+ * Function print_ret_code (code)
+ *
+ *    Print return code of request to peer IrLAN layer. A code without a
+ *    known meaning is reported as unknown instead of being dropped
+ *    silently, so a rejected request always leaves a trace in the log.
+ *
+ */
+static void print_ret_code(__u8 code)
+{
+	switch(code) {
+	case 0:
+		printk(KERN_INFO "Success\n");
+		break;
+	case 1:
+		IRDA_WARNING("IrLAN: Insufficient resources\n");
+		break;
+	case 2:
+		IRDA_WARNING("IrLAN: Invalid command format\n");
+		break;
+	case 3:
+		IRDA_WARNING("IrLAN: Command not supported\n");
+		break;
+	case 4:
+		IRDA_WARNING("IrLAN: Parameter not supported\n");
+		break;
+	case 5:
+		IRDA_WARNING("IrLAN: Value not supported\n");
+		break;
+	case 6:
+		IRDA_WARNING("IrLAN: Not open\n");
+		break;
+	case 7:
+		IRDA_WARNING("IrLAN: Authentication required\n");
+		break;
+	case 8:
+		IRDA_WARNING("IrLAN: Invalid password\n");
+		break;
+	case 9:
+		IRDA_WARNING("IrLAN: Protocol error\n");
+		break;
+	case 255:
+		IRDA_WARNING("IrLAN: Asynchronous status\n");
+		break;
+	default:
+		/* The caller gives up on any non-zero code, so say which one */
+		IRDA_WARNING("IrLAN: Unknown return code %d\n", code);
+		break;
+	}
+}
+
+/*
+ * Function irlan_client_parse_response (self, skb)
+ *
+ *    Extract all parameters from received buffer, then feed them to
+ *    irlan_check_response_param for parsing.
+ *
+ *    The frame starts with a return code byte followed by a parameter
+ *    count byte. A non-zero return code is printed and ends the parsing.
+ *    The frame comes from the peer, so its length is checked before these
+ *    bytes are read and a parameter is only acted upon when the bytes it
+ *    claims to occupy lie inside the frame. The skb is not freed here.
+ */
+void irlan_client_parse_response(struct irlan_cb *self, struct sk_buff *skb)
+{
+	__u8 *frame;
+	__u8 *ptr;
+	unsigned int remaining; /* Bytes of the frame not yet consumed */
+	int count;
+	int ret;
+	__u16 val_len;
+	int i;
+	char *name;
+	char *value;
+
+	if (!skb) {
+		IRDA_ERROR("%s(), Got NULL skb!\n", __func__);
+		return;
+	}
+
+	IRDA_DEBUG(4, "%s() skb->len=%d\n", __func__ , (int) skb->len);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Nothing to look at without at least the return code */
+	if (skb->len < 1) {
+		IRDA_DEBUG(2, "%s(), empty frame!\n", __func__ );
+		return;
+	}
+	frame = skb->data;
+
+	/*
+	 *  Check return code and print it if not success
+	 */
+	if (frame[0]) {
+		print_ret_code(frame[0]);
+		return;
+	}
+
+	/* A successful response must also carry the parameter count */
+	if (skb->len < 2) {
+		IRDA_DEBUG(2, "%s(), frame too short!\n", __func__ );
+		return;
+	}
+
+	name = kmalloc(255, GFP_ATOMIC);
+	if (!name)
+		return;
+	value = kmalloc(1016, GFP_ATOMIC);
+	if (!value) {
+		kfree(name);
+		return;
+	}
+
+	/* How many parameters? */
+	count = frame[1];
+
+	IRDA_DEBUG(4, "%s(), got %d parameters\n", __func__ , count);
+
+	ptr = frame+2;
+	remaining = skb->len - 2;
+
+	/* For all parameters */
+	for (i=0; i<count;i++) {
+		/* The peer may announce more parameters than it sent */
+		if (remaining == 0) {
+			IRDA_DEBUG(2, "%s(), frame ends after %d parameters\n",
+				   __func__ , i);
+			break;
+		}
+		ret = irlan_extract_param(ptr, name, value, &val_len);
+		if (ret < 0) {
+			IRDA_DEBUG(2, "%s(), IrLAN, Error!\n", __func__ );
+			break;
+		}
+		/*
+		 * irlan_extract_param() is not told how long the frame is,
+		 * so make sure the parameter really fits before trusting
+		 * what was extracted.
+		 */
+		if ((unsigned int) ret > remaining) {
+			IRDA_DEBUG(2, "%s(), parameter runs past end of frame\n",
+				   __func__ );
+			break;
+		}
+		ptr += ret;
+		remaining -= ret;
+		irlan_check_response_param(self, name, value, val_len);
+	}
+	/* Cleanup */
+	kfree(name);
+	kfree(value);
+}
+
+/*
+ * Function irlan_param_len_ok (param, val_len, needed)
+ *
+ *    Return non-zero when the value received for 'param' holds at least
+ *    'needed' bytes, otherwise log the shortfall and return zero.
+ *
+ */
+static int irlan_param_len_ok(const char *param, int val_len, int needed)
+{
+	if (val_len >= needed)
+		return 1;
+
+	IRDA_DEBUG(2, "%s(), %s value too short (%d < %d)\n", __func__ ,
+		   param, val_len, needed);
+	return 0;
+}
+
+/*
+ * Function irlan_check_response_param (self, param, value, val_len)
+ *
+ *     Check which parameter is received and update local variables
+ *
+ *     'param' and 'value' are compared as NUL terminated strings where the
+ *     parameter is textual; binary parameters are only accepted when
+ *     'val_len' covers the bytes that are read, and the reconnect key is
+ *     only stored when it fits the key buffer. Both name and value come
+ *     from the peer. Unknown parameters are ignored.
+ *
+ */
+static void irlan_check_response_param(struct irlan_cb *self, char *param,
+				       char *value, int val_len)
+{
+	__u16 tmp_cpu; /* Temporary value in host order */
+
+	IRDA_DEBUG(4, "%s(), parm=%s\n", __func__ , param);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Media type */
+	if (strcmp(param, "MEDIA") == 0) {
+		if (strcmp(value, "802.3") == 0)
+			self->media = MEDIA_802_3;
+		else
+			self->media = MEDIA_802_5;
+		return;
+	}
+	/* Filter types are accumulated, one response parameter per type */
+	if (strcmp(param, "FILTER_TYPE") == 0) {
+		if (strcmp(value, "DIRECTED") == 0)
+			self->client.filter_type |= IRLAN_DIRECTED;
+		else if (strcmp(value, "FUNCTIONAL") == 0)
+			self->client.filter_type |= IRLAN_FUNCTIONAL;
+		else if (strcmp(value, "GROUP") == 0)
+			self->client.filter_type |= IRLAN_GROUP;
+		else if (strcmp(value, "MAC_FRAME") == 0)
+			self->client.filter_type |= IRLAN_MAC_FRAME;
+		else if (strcmp(value, "MULTICAST") == 0)
+			self->client.filter_type |= IRLAN_MULTICAST;
+		else if (strcmp(value, "BROADCAST") == 0)
+			self->client.filter_type |= IRLAN_BROADCAST;
+		else if (strcmp(value, "IPX_SOCKET") == 0)
+			self->client.filter_type |= IRLAN_IPX_SOCKET;
+		return;
+	}
+	if (strcmp(param, "ACCESS_TYPE") == 0) {
+		if (strcmp(value, "DIRECT") == 0)
+			self->client.access_type = ACCESS_DIRECT;
+		else if (strcmp(value, "PEER") == 0)
+			self->client.access_type = ACCESS_PEER;
+		else if (strcmp(value, "HOSTED") == 0)
+			self->client.access_type = ACCESS_HOSTED;
+		else {
+			IRDA_DEBUG(2, "%s(), unknown access type!\n", __func__ );
+		}
+		return;
+	}
+	/* IRLAN version */
+	if (strcmp(param, "IRLAN_VER") == 0) {
+		if (!irlan_param_len_ok(param, val_len, 2))
+			return;
+		IRDA_DEBUG(4, "IrLAN version %d.%d\n", (__u8) value[0],
+		      (__u8) value[1]);
+
+		self->version[0] = value[0];
+		self->version[1] = value[1];
+		return;
+	}
+	/* Which remote TSAP to use for data channel */
+	if (strcmp(param, "DATA_CHAN") == 0) {
+		if (!irlan_param_len_ok(param, val_len, 1))
+			return;
+		self->dtsap_sel_data = value[0];
+		IRDA_DEBUG(4, "Data TSAP = %02x\n", self->dtsap_sel_data);
+		return;
+	}
+	if (strcmp(param, "CON_ARB") == 0) {
+		if (!irlan_param_len_ok(param, val_len, 2))
+			return;
+		memcpy(&tmp_cpu, value, 2); /* Align value */
+		le16_to_cpus(&tmp_cpu);     /* Convert to host order */
+		self->client.recv_arb_val = tmp_cpu;
+		IRDA_DEBUG(2, "%s(), receive arb val=%d\n", __func__ ,
+			   self->client.recv_arb_val);
+		return;
+	}
+	if (strcmp(param, "MAX_FRAME") == 0) {
+		if (!irlan_param_len_ok(param, val_len, 2))
+			return;
+		memcpy(&tmp_cpu, value, 2); /* Align value */
+		le16_to_cpus(&tmp_cpu);     /* Convert to host order */
+		self->client.max_frame = tmp_cpu;
+		IRDA_DEBUG(4, "%s(), max frame=%d\n", __func__ ,
+			   self->client.max_frame);
+		return;
+	}
+
+	/* RECONNECT_KEY, in case the link goes down! */
+	if (strcmp(param, "RECONNECT_KEY") == 0) {
+		/*
+		 * The length is chosen by the peer; never copy more than
+		 * the key buffer can hold.
+		 * NOTE(review): assumes reconnect_key is an array member so
+		 * that sizeof() yields its capacity - confirm in
+		 * <net/irda/irlan_common.h>.
+		 */
+		if (val_len < 0 ||
+		    val_len > (int) sizeof(self->client.reconnect_key)) {
+			IRDA_WARNING("%s(), reconnect key length %d out of "
+				     "range, ignored\n", __func__, val_len);
+			return;
+		}
+		memcpy(self->client.reconnect_key, value, val_len);
+		self->client.key_len = val_len;
+		IRDA_DEBUG(4, "%s(), got reconnect key, len=%d\n", __func__ ,
+			   val_len);
+		return;
+	}
+	/* FILTER_ENTRY, have we got an ethernet address? */
+	if (strcmp(param, "FILTER_ENTRY") == 0) {
+		if (!irlan_param_len_ok(param, val_len, ETH_ALEN))
+			return;
+		IRDA_DEBUG(4, "Ethernet address = %pM\n", value);
+		memcpy(self->dev->dev_addr, value, ETH_ALEN);
+	}
+}
+
+/*
+ * Function irlan_client_get_value_confirm (result, obj_id, value, priv)
+ *
+ *    Got results from remote LM-IAS
+ *
+ *    'priv' is the irlan instance that issued the query. The IrIAP
+ *    instance used for the query is closed, then the client state machine
+ *    is told whether a provider is available: it is when the query
+ *    succeeded and returned an integer other than -1, which is stored as
+ *    the control channel TSAP selector. 'value' is released here on every
+ *    path that received one; 'obj_id' is not used.
+ *
+ */
+void irlan_client_get_value_confirm(int result, __u16 obj_id,
+				    struct ias_value *value, void *priv)
+{
+	struct irlan_cb *self;
+	IRLAN_EVENT event = IRLAN_IAS_PROVIDER_NOT_AVAIL;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(priv != NULL, return;);
+
+	self = (struct irlan_cb *) priv;
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* We probably don't need to make any more queries */
+	iriap_close(self->client.iriap);
+	self->client.iriap = NULL;
+
+	/* Check if request succeeded */
+	if (result != IAS_SUCCESS || value == NULL) {
+		IRDA_DEBUG(2, "%s(), got NULL value!\n", __func__ );
+		/* A value handed over with a failed result is still ours */
+		if (value != NULL)
+			irias_delete_value(value);
+		irlan_do_client_event(self, event, NULL);
+		return;
+	}
+
+	switch (value->type) {
+	case IAS_INTEGER:
+		self->dtsap_sel_ctrl = value->t.integer;
+
+		/* -1 means the peer has no usable control TSAP */
+		if (value->t.integer != -1)
+			event = IRLAN_IAS_PROVIDER_AVAIL;
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), unknown type!\n", __func__ );
+		break;
+	}
+
+	/*
+	 * Release the value on every path, not only for integer -1; it is
+	 * not needed by the state machine, which gets no reference to it.
+	 */
+	irias_delete_value(value);
+
+	irlan_do_client_event(self, event, NULL);
+}
diff --git a/net/irda/irlan/irlan_client_event.c b/net/irda/irlan/irlan_client_event.c
new file mode 100644
index 00000000..8d5a8ebc
--- /dev/null
+++ b/net/irda/irlan/irlan_client_event.c
@@ -0,0 +1,533 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_client_event.c
+ * Version:       0.9
+ * Description:   IrLAN client state machine
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 31 20:14:37 1997
+ * Modified at:   Sun Dec 26 21:52:24 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/timer.h>
+#include <net/irda/irmod.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_client.h>
+#include <net/irda/irlan_event.h>
+
+static int irlan_client_state_idle (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_conn (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_info (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_open (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_wait (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_arb  (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_data (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+static int irlan_client_state_sync (struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb);
+
+/*
+ * Client state handlers. irlan_do_client_event() indexes this table with
+ * self->client.state, so the position of each handler must equal the
+ * numeric value of the state it serves.
+ * NOTE(review): presumably this order mirrors the IRLAN_* state enum in
+ * <net/irda/irlan_event.h> - confirm when adding or reordering states.
+ */
+static int (*state[])(struct irlan_cb *, IRLAN_EVENT event, struct sk_buff *) =
+{
+	irlan_client_state_idle,
+	irlan_client_state_query,
+	irlan_client_state_conn,
+	irlan_client_state_info,
+	irlan_client_state_media,
+	irlan_client_state_open,
+	irlan_client_state_wait,
+	irlan_client_state_arb,
+	irlan_client_state_data,
+	irlan_client_state_close,
+	irlan_client_state_sync
+};
+
+/*
+ * Function irlan_do_client_event (self, event, skb)
+ *
+ *    Hand 'event', together with the optional 'skb', to the handler of the
+ *    client's current state. The handler's return value is discarded.
+ *
+ */
+void irlan_do_client_event(struct irlan_cb *self, IRLAN_EVENT event,
+			   struct sk_buff *skb)
+{
+	int (*handler)(struct irlan_cb *, IRLAN_EVENT, struct sk_buff *);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	handler = state[self->client.state];
+	handler(self, event, skb);
+}
+
+/*
+ * Function irlan_client_state_idle (self, event, skb)
+ *
+ *    IDLE, We are waiting for an indication that there is a provider
+ *    available.
+ *
+ *    On a discovery indication an IrIAP instance is opened and the peer's
+ *    IAS is queried for the IrLAN LSAP selector; the client then moves to
+ *    QUERY. If a query is already pending (-EBUSY) or no IrIAP instance
+ *    can be opened (-ENOMEM) the client stays in IDLE so that a later
+ *    wakeup can try again. 'skb', if any, is always released.
+ */
+static int irlan_client_state_idle(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+	switch (event) {
+	case IRLAN_DISCOVERY_INDICATION:
+		if (self->client.iriap) {
+			IRDA_WARNING("%s(), busy with a previous query\n",
+				     __func__);
+			ret = -EBUSY;
+			break;
+		}
+
+		self->client.iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+						irlan_client_get_value_confirm);
+		if (!self->client.iriap) {
+			/*
+			 * Without an IrIAP instance no confirm will ever
+			 * arrive, so do not enter QUERY.
+			 */
+			IRDA_WARNING("%s(), unable to open IrIAP instance\n",
+				     __func__);
+			ret = -ENOMEM;
+			break;
+		}
+		/* Get some values from peer IAS */
+		irlan_next_client_state(self, IRLAN_QUERY);
+		iriap_getvaluebyclass_request(self->client.iriap,
+					      self->saddr, self->daddr,
+					      "IrLAN", "IrDA:TinyTP:LsapSel");
+		break;
+	case IRLAN_WATCHDOG_TIMEOUT:
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+		break;
+	default:
+		IRDA_DEBUG(4, "%s(), Unknown event %d\n", __func__ , event);
+		break;
+	}
+	/* Every path that reaches the switch consumes the skb */
+	if (skb)
+		dev_kfree_skb(skb);
+
+	return ret;
+}
+
+/*
+ * Function irlan_client_state_query (self, event, skb)
+ *
+ *    QUERY, We have queried the remote IAS and are ready to connect
+ *    to the provider, just waiting for the confirm. When the provider is
+ *    available the control channel connect is requested and the client
+ *    moves to CONN; otherwise it falls back to IDLE.
+ *
+ */
+static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+	if (event == IRLAN_IAS_PROVIDER_AVAIL) {
+		IRDA_ASSERT(self->dtsap_sel_ctrl != 0, return -1;);
+
+		self->client.open_retries = 0;
+
+		irttp_connect_request(self->client.tsap_ctrl,
+				      self->dtsap_sel_ctrl,
+				      self->saddr, self->daddr, NULL,
+				      IRLAN_MTU, NULL);
+		irlan_next_client_state(self, IRLAN_CONN);
+	} else if (event == IRLAN_IAS_PROVIDER_NOT_AVAIL) {
+		IRDA_DEBUG(2, "%s(), IAS_PROVIDER_NOT_AVAIL\n", __func__ );
+		irlan_next_client_state(self, IRLAN_IDLE);
+
+		/* Give the client a kick! */
+		if (self->provider.access_type == ACCESS_PEER &&
+		    self->provider.state != IRLAN_IDLE)
+			irlan_client_wakeup(self, self->saddr, self->daddr);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		irlan_next_client_state(self, IRLAN_IDLE);
+	} else if (event == IRLAN_WATCHDOG_TIMEOUT) {
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_conn (self, event, skb)
+ *
+ *    CONN, We have connected to a provider but have not issued any
+ *    commands yet. Once the connect completes the provider info is
+ *    requested and the client moves to INFO.
+ *
+ */
+static int irlan_client_state_conn(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	if (event == IRLAN_CONNECT_COMPLETE) {
+		/* Send getinfo cmd */
+		irlan_get_provider_info(self);
+		irlan_next_client_state(self, IRLAN_INFO);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		irlan_next_client_state(self, IRLAN_IDLE);
+	} else if (event == IRLAN_WATCHDOG_TIMEOUT) {
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_info (self, event, skb)
+ *
+ *    INFO, We have issued a GetInfo command and are awaiting a reply.
+ *    The reply is parsed, the client moves to MEDIA and the media
+ *    characteristics are requested.
+ */
+static int irlan_client_state_info(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	if (event == IRLAN_DATA_INDICATION) {
+		IRDA_ASSERT(skb != NULL, return -1;);
+
+		irlan_client_parse_response(self, skb);
+
+		irlan_next_client_state(self, IRLAN_MEDIA);
+
+		irlan_get_media_char(self);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		irlan_next_client_state(self, IRLAN_IDLE);
+	} else if (event == IRLAN_WATCHDOG_TIMEOUT) {
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_media (self, event, skb)
+ *
+ *    MEDIA, The irlan_client has issued a GetMedia command and is awaiting a
+ *    reply. The reply is parsed, the data channel open is requested and the
+ *    client moves to OPEN.
+ *
+ */
+static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	if (event == IRLAN_DATA_INDICATION) {
+		irlan_client_parse_response(self, skb);
+		irlan_open_data_channel(self);
+		irlan_next_client_state(self, IRLAN_OPEN);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		irlan_next_client_state(self, IRLAN_IDLE);
+	} else if (event == IRLAN_WATCHDOG_TIMEOUT) {
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_open (self, event, skb)
+ *
+ *    OPEN, The irlan_client has issued a OpenData command and is awaiting a
+ *    reply
+ *
+ *    The reply is parsed and, depending on the access type reported by the
+ *    provider, the client either starts connection arbitration (peer), or
+ *    requests the data channel connect and moves to DATA (direct/hosted).
+ *    'skb', if any, is released on every path past the first assertion.
+ *
+ */
+static int irlan_client_state_open(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	/*
+	 * Only the link disconnect time is requested; every other field is
+	 * zero instead of whatever happened to be on the stack.
+	 * NOTE(review): presumably a zero field means "no preference" to
+	 * the lower layers - confirm against the QoS negotiation code.
+	 */
+	struct qos_info qos = { .link_disc_time = { .bits = 0x01 } }; /* 3 secs */
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	switch(event) {
+	case IRLAN_DATA_INDICATION:
+		irlan_client_parse_response(self, skb);
+
+		/*
+		 *  Check if we have got the remote TSAP for data
+		 *  communications. Leave through the common exit so that
+		 *  the skb is not leaked.
+		 */
+		IRDA_ASSERT(self->dtsap_sel_data != 0, ret = -1; goto out;);
+
+		/* Check which access type we are dealing with */
+		switch (self->client.access_type) {
+		case ACCESS_PEER:
+			if (self->provider.state == IRLAN_OPEN) {
+				irlan_next_client_state(self, IRLAN_ARB);
+				irlan_do_client_event(self, IRLAN_CHECK_CON_ARB,
+						      NULL);
+			} else {
+				/* Wait for the local provider to open */
+				irlan_next_client_state(self, IRLAN_WAIT);
+			}
+			break;
+		case ACCESS_DIRECT:
+		case ACCESS_HOSTED:
+			irttp_connect_request(self->tsap_data,
+					      self->dtsap_sel_data,
+					      self->saddr, self->daddr, &qos,
+					      IRLAN_MTU, NULL);
+
+			irlan_next_client_state(self, IRLAN_DATA);
+			break;
+		default:
+			IRDA_DEBUG(2, "%s(), unknown access type!\n", __func__ );
+			break;
+		}
+		break;
+	case IRLAN_LMP_DISCONNECT:
+	case IRLAN_LAP_DISCONNECT:
+		irlan_next_client_state(self, IRLAN_IDLE);
+		break;
+	case IRLAN_WATCHDOG_TIMEOUT:
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+		break;
+	}
+
+IRDA_ASSERT_LABEL(out:)
+	if (skb)
+		dev_kfree_skb(skb);
+
+	return ret;
+}
+
+/*
+ * Function irlan_client_state_wait (self, event, skb)
+ *
+ *    WAIT, The irlan_client is waiting for the local provider to enter the
+ *    provider OPEN state. When the provider signals, the client moves to
+ *    ARB and immediately checks the connection arbitration values.
+ *
+ */
+static int irlan_client_state_wait(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	if (event == IRLAN_PROVIDER_SIGNAL) {
+		irlan_next_client_state(self, IRLAN_ARB);
+		irlan_do_client_event(self, IRLAN_CHECK_CON_ARB, NULL);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		irlan_next_client_state(self, IRLAN_IDLE);
+	} else if (event == IRLAN_WATCHDOG_TIMEOUT) {
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_arb (self, event, skb)
+ *
+ *    ARB, The client compares the arbitration value received from the peer
+ *    with the one sent by the local provider. Equal values close the data
+ *    channel, a smaller received value makes this side request the data
+ *    connection, and a larger one leaves the connect to the peer.
+ *
+ */
+static int irlan_client_state_arb(struct irlan_cb *self, IRLAN_EVENT event,
+				  struct sk_buff *skb)
+{
+	/*
+	 * Only the link disconnect time is requested; every other field is
+	 * zero instead of whatever happened to be on the stack.
+	 * NOTE(review): presumably a zero field means "no preference" to
+	 * the lower layers - confirm against the QoS negotiation code.
+	 */
+	struct qos_info qos = { .link_disc_time = { .bits = 0x01 } }; /* 3 secs */
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	switch(event) {
+	case IRLAN_CHECK_CON_ARB:
+		if (self->client.recv_arb_val == self->provider.send_arb_val) {
+			irlan_next_client_state(self, IRLAN_CLOSE);
+			irlan_close_data_channel(self);
+		} else if (self->client.recv_arb_val <
+			   self->provider.send_arb_val)
+		{
+			irlan_next_client_state(self, IRLAN_DATA);
+			irttp_connect_request(self->tsap_data,
+					      self->dtsap_sel_data,
+					      self->saddr, self->daddr, &qos,
+					      IRLAN_MTU, NULL);
+		} else {
+			/* Received value is larger: the peer connects to us */
+			IRDA_DEBUG(2, "%s(), lost the battle :-(\n", __func__ );
+		}
+		break;
+	case IRLAN_DATA_CONNECT_INDICATION:
+		irlan_next_client_state(self, IRLAN_DATA);
+		break;
+	case IRLAN_LMP_DISCONNECT:
+	case IRLAN_LAP_DISCONNECT:
+		irlan_next_client_state(self, IRLAN_IDLE);
+		break;
+	case IRLAN_WATCHDOG_TIMEOUT:
+		IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __func__ );
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+		break;
+	}
+	if (skb)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_data (self, event, skb)
+ *
+ *    DATA, The data channel is connected, allowing data transfers between
+ *    the local and remote machines. Control responses are still parsed; a
+ *    disconnect sends the client back to IDLE.
+ *
+ */
+static int irlan_client_state_data(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+	if (event == IRLAN_DATA_INDICATION) {
+		irlan_client_parse_response(self, skb);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		irlan_next_client_state(self, IRLAN_IDLE);
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_close (self, event, skb)
+ *
+ *    CLOSE, no event is acted upon in this state; the skb, if any, is
+ *    released and 0 is returned.
+ *
+ */
+static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event,
+				    struct sk_buff *skb)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_client_state_sync (self, event, skb)
+ *
+ *    SYNC, no event is acted upon in this state; the skb, if any, is
+ *    released and 0 is returned.
+ *
+ */
+static int irlan_client_state_sync(struct irlan_cb *self, IRLAN_EVENT event,
+				   struct sk_buff *skb)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
new file mode 100644
index 00000000..6130f9d9
--- /dev/null
+++ b/net/irda/irlan/irlan_common.c
@@ -0,0 +1,1214 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_common.c
+ * Version:       0.9
+ * Description:   IrDA LAN Access Protocol Implementation
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 31 20:14:37 1997
+ * Modified at:   Sun Dec 26 21:53:10 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1997, 1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+
+#include <asm/system.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/iriap.h>
+#include <net/irda/timer.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_client.h>
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_eth.h>
+#include <net/irda/irlan_filter.h>
+
+
+/* extern char sysctl_devname[]; */
+
+/*
+ *  Master list of all IrLAN instances (linked through irlan_cb.dev_list)
+ */
+static LIST_HEAD(irlans);
+
+static void *ckey; /* Handle returned by irlmp_register_client() */
+static void *skey; /* Handle returned by irlmp_register_service() */
+
+/* Module parameters */
+static int eth;   /* Non-zero: name devices "eth%d", zero: "irlan%d" */
+static int access = ACCESS_PEER; /* PEER, DIRECT or HOSTED */
+
+#ifdef CONFIG_PROC_FS
+static const char *const irlan_access[] = {
+	"UNKNOWN",
+	"DIRECT",
+	"PEER",
+	"HOSTED"
+};
+
+static const char *const irlan_media[] = {
+	"UNKNOWN",
+	"802.3",
+	"802.5"
+};
+
+extern struct proc_dir_entry *proc_irda;
+
+static int irlan_seq_open(struct inode *inode, struct file *file);
+
+static const struct file_operations irlan_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = irlan_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+extern struct proc_dir_entry *proc_irda;
+#endif /* CONFIG_PROC_FS */
+
+static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr);
+static void __irlan_close(struct irlan_cb *self);
+static int __irlan_insert_param(struct sk_buff *skb, char *param, int type,
+				__u8 value_byte, __u16 value_short,
+				__u8 *value_array, __u16 value_len);
+static void irlan_open_unicast_addr(struct irlan_cb *self);
+static void irlan_get_unicast_addr(struct irlan_cb *self);
+void irlan_close_tsaps(struct irlan_cb *self);
+
+/*
+ * Function irlan_init (void)
+ *
+ *    Initialize IrLAN layer: create the /proc entry, register with IrLMP
+ *    and open the master instance. Returns 0, -ENODEV or -ENOMEM.
+ */
+static int __init irlan_init(void)
+{
+	struct irlan_cb *new;
+	__u16 hints;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+#ifdef CONFIG_PROC_FS
+	{ struct proc_dir_entry *proc;
+	proc = proc_create("irlan", 0, proc_irda, &irlan_fops);
+	if (!proc) {
+		printk(KERN_ERR "irlan_init: can't create /proc entry!\n");
+		return -ENODEV;
+	}
+	}
+#endif /* CONFIG_PROC_FS */
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+	hints = irlmp_service_to_hint(S_LAN);
+
+	/* Register with IrLMP as a client */
+	ckey = irlmp_register_client(hints, &irlan_client_discovery_indication,
+				     NULL, NULL);
+	if (!ckey)
+		goto err_ckey;
+
+	/* Register with IrLMP as a service */
+	skey = irlmp_register_service(hints);
+	if (!skey)
+		goto err_skey;
+
+	/* Start the master IrLAN instance (the only one for now) */
+	new = irlan_open(DEV_ADDR_ANY, DEV_ADDR_ANY);
+	if (!new)
+		goto err_open;
+
+	/* The master will only open its (listen) control TSAP */
+	irlan_provider_open_ctrl_tsap(new);
+
+	/* Do some fast discovery! */
+	irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS);
+
+	return 0;
+
+err_open:
+	irlmp_unregister_service(skey);
+err_skey:
+	irlmp_unregister_client(ckey);
+err_ckey:
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("irlan", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+	return -ENOMEM;
+}
+
+static void __exit irlan_cleanup(void)
+{
+	struct irlan_cb *self, *next;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	irlmp_unregister_client(ckey);
+	irlmp_unregister_service(skey);
+
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("irlan", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+	/* Cleanup any leftover network devices */
+	rtnl_lock();
+	list_for_each_entry_safe(self, next, &irlans, dev_list) {
+		__irlan_close(self);
+	}
+	rtnl_unlock();
+}
+
+/*
+ * Function irlan_open (saddr, daddr)
+ *
+ *    Open new instance of a client/provider, we should only register the
+ *    network device if this instance is meant for a particular client/provider
+ */
+static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr)
+{
+	struct net_device *dev;
+	struct irlan_cb *self;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Create network device with irlan */
+	dev = alloc_irlandev(eth ? "eth%d" : "irlan%d");
+	if (!dev)
+		return NULL;
+
+	self = netdev_priv(dev);
+	self->dev = dev;
+
+	/*
+	 *  Initialize local device structure
+	 */
+	self->magic = IRLAN_MAGIC;
+	self->saddr = saddr;
+	self->daddr = daddr;
+
+	/* Provider access can only be PEER, DIRECT, or HOSTED */
+	self->provider.access_type = access;
+	if (access == ACCESS_DIRECT) {
+		/*
+		 * Since we are emulating an IrLAN server we will have to
+		 * give ourselves an ethernet address!
+		 */
+		dev->dev_addr[0] = 0x40;
+		dev->dev_addr[1] = 0x00;
+		dev->dev_addr[2] = 0x00;
+		dev->dev_addr[3] = 0x00;
+		get_random_bytes(dev->dev_addr+4, 1);
+		get_random_bytes(dev->dev_addr+5, 1);
+	}
+
+	self->media = MEDIA_802_3;
+	self->disconnect_reason = LM_USER_REQUEST;
+	init_timer(&self->watchdog_timer);
+	init_timer(&self->client.kick_timer);
+	init_waitqueue_head(&self->open_wait);
+
+	skb_queue_head_init(&self->client.txq);
+
+	irlan_next_client_state(self, IRLAN_IDLE);
+	irlan_next_provider_state(self, IRLAN_IDLE);
+
+	if (register_netdev(dev)) {
+		IRDA_DEBUG(2, "%s(), register_netdev() failed!\n",
+			   __func__ );
+		self = NULL; /* self is dev's private area, freed below */
+		free_netdev(dev);
+	} else {
+		rtnl_lock();
+		list_add_rcu(&self->dev_list, &irlans);
+		rtnl_unlock();
+	}
+
+	return self;
+}
+/*
+ * Function __irlan_close (self)
+ *
+ *    This function closes and deallocates this IrLAN instance. The caller
+ *    must hold the RTNL lock. Be aware that other functions which call
+ *    client_close() must remove self from the irlans list first.
+ */
+static void __irlan_close(struct irlan_cb *self)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	ASSERT_RTNL();
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	del_timer_sync(&self->watchdog_timer);
+	del_timer_sync(&self->client.kick_timer);
+
+	/* Close all open connections and remove TSAPs */
+	irlan_close_tsaps(self);
+
+	if (self->client.iriap)
+		iriap_close(self->client.iriap);
+
+	/* Remove frames queued on the control channel */
+	skb_queue_purge(&self->client.txq);
+
+	/* Unregister and free self via destructor */
+	unregister_netdevice(self->dev);
+}
+
+/* Return the first listed irlan instance or NULL, for discovery wakeup */
+struct irlan_cb *irlan_get_any(void)
+{
+	struct irlan_cb *self;
+
+	list_for_each_entry_rcu(self, &irlans, dev_list) {
+		return self;
+	}
+	return NULL;
+}
+
+/*
+ * Function irlan_connect_indication (instance, sap, qos, max_sdu_size,
+ *                                    max_header_size, skb)
+ *    Here we receive the connect indication for the data channel
+ *
+ */
+static void irlan_connect_indication(void *instance, void *sap,
+				     struct qos_info *qos,
+				     __u32 max_sdu_size,
+				     __u8 max_header_size,
+				     struct sk_buff *skb)
+{
+	struct irlan_cb *self;
+	struct tsap_cb *tsap;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	self = (struct irlan_cb *) instance;
+	tsap = (struct tsap_cb *) sap;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+	IRDA_ASSERT(tsap == self->tsap_data,return;);
+
+	self->max_sdu_size = max_sdu_size;
+	self->max_header_size = max_header_size;
+
+	IRDA_DEBUG(0, "%s: We are now connected!\n", __func__);
+
+	del_timer(&self->watchdog_timer);
+
+	/* If you want to pass the skb to *both* state machines, you will
+	 * need to skb_clone() it, so that you don't free it twice.
+	 * As the state machines don't need it, get rid of it here...
+	 * Jean II */
+	if (skb)
+		dev_kfree_skb(skb);
+
+	irlan_do_provider_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL);
+	irlan_do_client_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL);
+
+	if (self->provider.access_type == ACCESS_PEER) {
+		/*
+		 * Data channel is open, so we are now allowed to
+		 * configure the remote filter
+		 */
+		irlan_get_unicast_addr(self);
+		irlan_open_unicast_addr(self);
+	}
+	/* Ready to transfer Ethernet frames (at last) */
+	netif_start_queue(self->dev);
+}
+
+static void irlan_connect_confirm(void *instance, void *sap,
+				  struct qos_info *qos,
+				  __u32 max_sdu_size,
+				  __u8 max_header_size,
+				  struct sk_buff *skb)
+{
+	struct irlan_cb *self;
+
+	self = (struct irlan_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	self->max_sdu_size = max_sdu_size;
+	self->max_header_size = max_header_size;
+
+	/* TODO: we could set the MTU depending on the max_sdu_size */
+
+	IRDA_DEBUG(0, "%s: We are now connected!\n", __func__);
+	del_timer(&self->watchdog_timer);
+
+	/*
+	 * Data channel is open, so we are now allowed to configure the remote
+	 * filter
+	 */
+	irlan_get_unicast_addr(self);
+	irlan_open_unicast_addr(self);
+
+	/* Open broadcast and multicast filter by default */
+	irlan_set_broadcast_filter(self, TRUE);
+	irlan_set_multicast_filter(self, TRUE);
+
+	/* Ready to transfer Ethernet frames */
+	netif_start_queue(self->dev);
+	self->disconnect_reason = 0; /* Clear reason */
+	wake_up_interruptible(&self->open_wait);
+}
+
+/*
+ * Function irlan_disconnect_indication (instance, sap, reason, userdata)
+ *
+ *    Callback function for the IrTTP layer. Indicates a disconnection of
+ *    the data channel; userdata (if any) is freed here
+ */
+static void irlan_disconnect_indication(void *instance,
+					void *sap, LM_REASON reason,
+					struct sk_buff *userdata)
+{
+	struct irlan_cb *self;
+	struct tsap_cb *tsap;
+
+	IRDA_DEBUG(0, "%s(), reason=%d\n", __func__ , reason);
+
+	self = (struct irlan_cb *) instance;
+	tsap = (struct tsap_cb *) sap;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+	IRDA_ASSERT(tsap != NULL, return;);
+	IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;);
+
+	IRDA_ASSERT(tsap == self->tsap_data, return;);
+
+	IRDA_DEBUG(2, "IrLAN, data channel disconnected by peer!\n");
+
+	/* Save reason so we know if we should try to reconnect or not */
+	self->disconnect_reason = reason;
+
+	switch (reason) {
+	case LM_USER_REQUEST: /* User request */
+		IRDA_DEBUG(2, "%s(), User requested\n", __func__ );
+		break;
+	case LM_LAP_DISCONNECT: /* Unexpected IrLAP disconnect */
+		IRDA_DEBUG(2, "%s(), Unexpected IrLAP disconnect\n", __func__ );
+		break;
+	case LM_CONNECT_FAILURE: /* Failed to establish IrLAP connection */
+		IRDA_DEBUG(2, "%s(), IrLAP connect failed\n", __func__ );
+		break;
+	case LM_LAP_RESET:  /* IrLAP reset */
+		IRDA_DEBUG(2, "%s(), IrLAP reset\n", __func__ );
+		break;
+	case LM_INIT_DISCONNECT:
+		IRDA_DEBUG(2, "%s(), IrLMP connect failed\n", __func__ );
+		break;
+	default:
+		IRDA_ERROR("%s(), Unknown disconnect reason\n", __func__);
+		break;
+	}
+
+	/* If you want to pass the skb to *both* state machines, you will
+	 * need to skb_clone() it, so that you don't free it twice.
+	 * As the state machines don't need it, get rid of it here...
+	 * Jean II */
+	if (userdata)
+		dev_kfree_skb(userdata);
+
+	irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL);
+	irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL);
+
+	wake_up_interruptible(&self->open_wait);
+}
+
+void irlan_open_data_tsap(struct irlan_cb *self)
+{
+	struct tsap_cb *tsap;
+	notify_t notify;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Check if already open */
+	if (self->tsap_data)
+		return;
+
+	irda_notify_init(&notify);
+
+	notify.data_indication       = irlan_eth_receive;
+	notify.udata_indication      = irlan_eth_receive;
+	notify.connect_indication    = irlan_connect_indication;
+	notify.connect_confirm       = irlan_connect_confirm;
+	notify.flow_indication       = irlan_eth_flow_indication;
+	notify.disconnect_indication = irlan_disconnect_indication;
+	notify.instance              = self;
+	strlcpy(notify.name, "IrLAN data", sizeof(notify.name));
+
+	tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, &notify);
+	if (!tsap) {
+		IRDA_DEBUG(2, "%s(), Got no tsap!\n", __func__ );
+		return;
+	}
+	self->tsap_data = tsap;
+
+	/*
+	 *  This is the data TSAP selector which we will pass to the client
+	 *  when the client ask for it.
+	 */
+	self->stsap_sel_data = self->tsap_data->stsap_sel;
+}
+
+void irlan_close_tsaps(struct irlan_cb *self)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Disconnect and close all open TSAP connections */
+	if (self->tsap_data) {
+		irttp_disconnect_request(self->tsap_data, NULL, P_NORMAL);
+		irttp_close_tsap(self->tsap_data);
+		self->tsap_data = NULL;
+	}
+	if (self->client.tsap_ctrl) {
+		irttp_disconnect_request(self->client.tsap_ctrl, NULL,
+					 P_NORMAL);
+		irttp_close_tsap(self->client.tsap_ctrl);
+		self->client.tsap_ctrl = NULL;
+	}
+	if (self->provider.tsap_ctrl) {
+		irttp_disconnect_request(self->provider.tsap_ctrl, NULL,
+					 P_NORMAL);
+		irttp_close_tsap(self->provider.tsap_ctrl);
+		self->provider.tsap_ctrl = NULL;
+	}
+	self->disconnect_reason = LM_USER_REQUEST;
+}
+
+/*
+ * Function irlan_ias_register (self, tsap_sel)
+ *
+ *    Register with LM-IAS
+ *
+ */
+void irlan_ias_register(struct irlan_cb *self, __u8 tsap_sel)
+{
+	struct ias_object *obj;
+	struct ias_value *new_value;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/*
+	 * Check if object has already been registered by a previous provider.
+	 * If that is the case, we just change the value of the attribute
+	 */
+	if (!irias_find_object("IrLAN")) {
+		obj = irias_new_object("IrLAN", IAS_IRLAN_ID);
+		irias_add_integer_attrib(obj, "IrDA:TinyTP:LsapSel", tsap_sel,
+					 IAS_KERNEL_ATTR);
+		irias_insert_object(obj);
+	} else {
+		new_value = irias_new_integer_value(tsap_sel);
+		irias_object_change_attribute("IrLAN", "IrDA:TinyTP:LsapSel",
+					      new_value);
+	}
+
+	/* Register PnP object only if not registered before */
+	if (!irias_find_object("PnP")) {
+		obj = irias_new_object("PnP", IAS_PNP_ID);
+#if 0
+		irias_add_string_attrib(obj, "Name", sysctl_devname,
+					IAS_KERNEL_ATTR);
+#else
+		irias_add_string_attrib(obj, "Name", "Linux", IAS_KERNEL_ATTR);
+#endif
+		irias_add_string_attrib(obj, "DeviceID", "HWP19F0",
+					IAS_KERNEL_ATTR);
+		irias_add_integer_attrib(obj, "CompCnt", 1, IAS_KERNEL_ATTR);
+		if (self->provider.access_type == ACCESS_PEER)
+			irias_add_string_attrib(obj, "Comp#01", "PNP8389",
+						IAS_KERNEL_ATTR);
+		else
+			irias_add_string_attrib(obj, "Comp#01", "PNP8294",
+						IAS_KERNEL_ATTR);
+
+		irias_add_string_attrib(obj, "Manufacturer",
+					"Linux-IrDA Project", IAS_KERNEL_ATTR);
+		irias_insert_object(obj);
+	}
+}
+
+/*
+ * Function irlan_run_ctrl_tx_queue (self)
+ *    Try to send the next command in the control transmit queue. Returns
+ *    -EBUSY if irda_lock() on tx_busy fails, 0 on an empty queue, -1 if
+ *    the command was dropped, else the result of irttp_data_request().
+ */
+int irlan_run_ctrl_tx_queue(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	if (irda_lock(&self->client.tx_busy) == FALSE)
+		return -EBUSY;
+
+	skb = skb_dequeue(&self->client.txq);
+	if (!skb) {
+		self->client.tx_busy = FALSE;
+		return 0;
+	}
+
+	/* Check that it's really possible to send commands */
+	if ((self->client.tsap_ctrl == NULL) ||
+	    (self->client.state == IRLAN_IDLE))
+	{
+		self->client.tx_busy = FALSE;
+		dev_kfree_skb(skb);
+		return -1;
+	}
+	IRDA_DEBUG(2, "%s(), sending ...\n", __func__ );
+
+	return irttp_data_request(self->client.tsap_ctrl, skb);
+}
+
+/*
+ * Function irlan_ctrl_data_request (self, skb)
+ *
+ *    This function makes sure that commands on the control channel are
+ *    sent in a command/response fashion: skb is queued, then a send is tried
+ */
+static void irlan_ctrl_data_request(struct irlan_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Queue command */
+	skb_queue_tail(&self->client.txq, skb);
+
+	/* Try to send command */
+	irlan_run_ctrl_tx_queue(self);
+}
+
+/*
+ * Function irlan_get_provider_info (self)
+ *
+ *    Send Get Provider Information command to peer IrLAN layer
+ *
+ */
+void irlan_get_provider_info(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER,
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	frame[0] = CMD_GET_PROVIDER_INFO;
+	frame[1] = 0x00;                 /* Zero parameters */
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_open_data_channel (self)
+ *
+ *    Send an Open Data Command to provider
+ *
+ */
+void irlan_open_data_channel(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_STRING_PARAMETER_LEN("MEDIA", "802.3") +
+			IRLAN_STRING_PARAMETER_LEN("ACCESS_TYPE", "DIRECT"),
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	/* Build frame */
+	frame[0] = CMD_OPEN_DATA_CHANNEL;
+	frame[1] = 0x02; /* Two parameters */
+
+	irlan_insert_string_param(skb, "MEDIA", "802.3");
+	irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT");
+	/* irlan_insert_string_param(skb, "MODE", "UNRELIABLE"); */
+
+/* 	self->use_udata = TRUE; */
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+void irlan_close_data_channel(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Check if the TSAP is still there */
+	if (self->client.tsap_ctrl == NULL)
+		return;
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN"),
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	/* Build frame */
+	frame[0] = CMD_CLOSE_DATA_CHAN;
+	frame[1] = 0x01; /* One parameter */
+
+	irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_open_unicast_addr (self)
+ *
+ *    Make IrLAN provider accept ethernet frames addressed to the unicast
+ *    address. NOTE(review): reserves self->max_header_size, whereas the
+ *    sibling commands reserve self->client.max_header_size -- confirm.
+ */
+static void irlan_open_unicast_addr(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "FILTER"),
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	frame[0] = CMD_FILTER_OPERATION;
+	frame[1] = 0x03;                 /* Three parameters */
+	irlan_insert_byte_param(skb, "DATA_CHAN" , self->dtsap_sel_data);
+	irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED");
+	irlan_insert_string_param(skb, "FILTER_MODE", "FILTER");
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_set_broadcast_filter (self, status)
+ *
+ *    Make IrLAN provider accept ethernet frames addressed to the broadcast
+ *    address. Be careful with the use of this one, since there may be a lot
+ *    of broadcast traffic out there. We can still function without this
+ *    one but then _we_ have to initiate all communication with other
+ *    hosts, since ARP request for this host will not be answered.
+ */
+void irlan_set_broadcast_filter(struct irlan_cb *self, int status)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "BROADCAST") +
+			/* We may waste one byte here...*/
+			IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "FILTER"),
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	frame[0] = CMD_FILTER_OPERATION;
+	frame[1] = 0x03;                 /* Three parameters */
+	irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+	irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST");
+	if (status)
+		irlan_insert_string_param(skb, "FILTER_MODE", "FILTER");
+	else
+		irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_set_multicast_filter (self, status)
+ *
+ *    Make IrLAN provider accept ethernet frames addressed to the multicast
+ *    address: FILTER_MODE is "ALL" when status is non-zero, else "NONE".
+ *
+ */
+void irlan_set_multicast_filter(struct irlan_cb *self, int status)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "MULTICAST") +
+			/* Sized for "NONE"; "ALL" wastes one byte */
+			IRLAN_STRING_PARAMETER_LEN("FILTER_MODE", "NONE"),
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	frame[0] = CMD_FILTER_OPERATION;
+	frame[1] = 0x03;                 /* Three parameters */
+	irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+	irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST");
+	if (status)
+		irlan_insert_string_param(skb, "FILTER_MODE", "ALL");
+	else
+		irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_get_unicast_addr (self)
+ *
+ *    Retrieves the unicast address from the IrLAN provider. This address
+ *    will be inserted into the devices structure, so the ethernet layer
+ *    can construct its packets.
+ *
+ */
+static void irlan_get_unicast_addr(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_BYTE_PARAMETER_LEN("DATA_CHAN") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_OPERATION",
+						   "DYNAMIC"),
+			GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	frame[0] = CMD_FILTER_OPERATION;
+	frame[1] = 0x03;                 /* Three parameters */
+	irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data);
+	irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED");
+	irlan_insert_string_param(skb, "FILTER_OPERATION", "DYNAMIC");
+
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_get_media_char (self)
+ *
+ *    Queue a Get Media Characteristics command for "802.3" media on the
+ *    client control channel
+ */
+void irlan_get_media_char(struct irlan_cb *self)
+{
+	struct sk_buff *skb;
+	__u8 *frame;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			IRLAN_STRING_PARAMETER_LEN("MEDIA", "802.3"),
+			GFP_ATOMIC);
+
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->client.max_header_size);
+	skb_put(skb, 2);
+
+	frame = skb->data;
+
+	/* Build frame */
+	frame[0] = CMD_GET_MEDIA_CHAR;
+	frame[1] = 0x01; /* One parameter */
+
+	irlan_insert_string_param(skb, "MEDIA", "802.3");
+	irlan_ctrl_data_request(self, skb);
+}
+
+/*
+ * Function irlan_insert_byte_param (skb, param, value)
+ *
+ *    Insert byte parameter into frame, returns whatever
+ *    __irlan_insert_param() returns (bytes added, 0 on failure)
+ */
+int irlan_insert_byte_param(struct sk_buff *skb, char *param, __u8 value)
+{
+	return __irlan_insert_param(skb, param, IRLAN_BYTE, value, 0, NULL, 0);
+}
+
+int irlan_insert_short_param(struct sk_buff *skb, char *param, __u16 value)
+{
+	return __irlan_insert_param(skb, param, IRLAN_SHORT, 0, value, NULL, 0);
+}
+
+/*
+ * Function irlan_insert_string_param (skb, param, string)
+ *
+ *    Insert string parameter into frame (without its NUL terminator);
+ *    __irlan_insert_param() asserts that the string is not empty
+ */
+int irlan_insert_string_param(struct sk_buff *skb, char *param, char *string)
+{
+	int string_len = strlen(string);
+
+	return __irlan_insert_param(skb, param, IRLAN_ARRAY, 0, 0, string,
+				    string_len);
+}
+
+/*
+ * Function irlan_insert_array_param (skb, name, array, array_len)
+ *
+ *    Insert array parameter of array_len bytes into frame
+ *
+ */
+int irlan_insert_array_param(struct sk_buff *skb, char *name, __u8 *array,
+			     __u16 array_len)
+{
+	return __irlan_insert_param(skb, name, IRLAN_ARRAY, 0, 0, array,
+				    array_len);
+}
+
+/*
+ * Function __irlan_insert_param (skb, param, type, value_*, value_len)
+ *
+ *    Insert parameter at end of buffer and return the number of bytes
+ *    added (0 on failure). Structure of a parameter is:
+ *    -----------------------------------------------------------------------
+ *    | Name Length[1] | Param Name[1..255] | Val Length[2] | Value[0..1016]|
+ *    -----------------------------------------------------------------------
+ */
+static int __irlan_insert_param(struct sk_buff *skb, char *param, int type,
+				__u8 value_byte, __u16 value_short,
+				__u8 *value_array, __u16 value_len)
+{
+	__u8 *frame;
+	__u8 param_len;
+	__le16 tmp_le; /* Temporary value in little endian format */
+	int n=0;
+
+	if (skb == NULL) {
+		IRDA_DEBUG(2, "%s(), Got NULL skb\n", __func__ );
+		return 0;
+	}
+
+	param_len = strlen(param); /* __u8: the length field is one byte */
+	switch (type) {
+	case IRLAN_BYTE:
+		value_len = 1;
+		break;
+	case IRLAN_SHORT:
+		value_len = 2;
+		break;
+	case IRLAN_ARRAY:
+		IRDA_ASSERT(value_array != NULL, return 0;);
+		IRDA_ASSERT(value_len > 0, return 0;);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown parameter type!\n", __func__ );
+		return 0;
+		break;
+	}
+
+	/* Insert at end of sk-buffer */
+	frame = skb_tail_pointer(skb);
+
+	/* Make space for data: name + value + 3 bytes of length fields */
+	if (skb_tailroom(skb) < (param_len+value_len+3)) {
+		IRDA_DEBUG(2, "%s(), No more space at end of skb\n", __func__ );
+		return 0;
+	}
+	skb_put(skb, param_len+value_len+3);
+
+	/* Insert parameter length */
+	frame[n++] = param_len;
+
+	/* Insert parameter */
+	memcpy(frame+n, param, param_len); n += param_len;
+
+	/* Insert value length (2 byte little endian format, LSB first) */
+	tmp_le = cpu_to_le16(value_len);
+	memcpy(frame+n, &tmp_le, 2); n += 2; /* To avoid alignment problems */
+
+	/* Insert value */
+	switch (type) {
+	case IRLAN_BYTE:
+		frame[n++] = value_byte;
+		break;
+	case IRLAN_SHORT:
+		tmp_le = cpu_to_le16(value_short);
+		memcpy(frame+n, &tmp_le, 2); n += 2;
+		break;
+	case IRLAN_ARRAY:
+		memcpy(frame+n, value_array, value_len); n+=value_len;
+		break;
+	default:
+		break;
+	}
+	IRDA_ASSERT(n == (param_len+value_len+3), return 0;);
+
+	return param_len+value_len+3;
+}
+
+/*
+ * Function irlan_extract_param (buf, name, value, len)
+ *    Extracts one name/value pair from buf into name and value (both get
+ *    NUL-terminated) and returns the bytes consumed, or a negative RSP_
+ *    code. NOTE(review): buf's remaining length is never checked here.
+ */
+int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len)
+{
+	__u8 name_len;
+	__u16 val_len;
+	int n=0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	/* get length of parameter name (1 byte) */
+	name_len = buf[n++];
+
+	if (name_len > 254) {
+		IRDA_DEBUG(2, "%s(), name_len > 254\n", __func__ );
+		return -RSP_INVALID_COMMAND_FORMAT;
+	}
+
+	/* get parameter name (name must hold name_len + 1 <= 255 bytes) */
+	memcpy(name, buf+n, name_len);
+	name[name_len] = '\0';
+	n+=name_len;
+
+	/*
+	 *  Get length of parameter value (2 bytes in little endian
+	 *  format)
+	 */
+	memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */
+	le16_to_cpus(&val_len); n+=2;
+
+	if (val_len >= 1016) {
+		IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ );
+		return -RSP_INVALID_COMMAND_FORMAT;
+	}
+	*len = val_len;
+
+	/* get parameter value (value must hold val_len + 1 <= 1016 bytes) */
+	memcpy(value, buf+n, val_len);
+	value[val_len] = '\0';
+	n+=val_len;
+
+	IRDA_DEBUG(4, "Parameter: %s ", name);
+	IRDA_DEBUG(4, "Value: %s\n", value);
+
+	return n;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Start of reading /proc entries; takes the RCU read lock, which
+ * irlan_seq_stop() releases. Returns what seq_list_start_head() yields
+ * for pos on the irlans list; irlan_seq_show() prints the header line
+ * when handed the list head itself.
+ */
+static void *irlan_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	rcu_read_lock();
+	return seq_list_start_head(&irlans, *pos);
+}
+
+/* Return entry after v, and increment pos */
+static void *irlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &irlans, pos);
+}
+
+/* End of reading /proc file */
+static void irlan_seq_stop(struct seq_file *seq, void *v)
+{
+	rcu_read_unlock();
+}
+
+
+/*
+ * Show one entry in /proc file.
+ */
+static int irlan_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == &irlans)
+		seq_puts(seq, "IrLAN instances:\n");
+	else {
+		struct irlan_cb *self = list_entry(v, struct irlan_cb, dev_list);
+
+		IRDA_ASSERT(self != NULL, return -1;);
+		IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+		seq_printf(seq,"ifname: %s,\n",
+			       self->dev->name);
+		seq_printf(seq,"client state: %s, ",
+			       irlan_state[ self->client.state]);
+		seq_printf(seq,"provider state: %s,\n",
+			       irlan_state[ self->provider.state]);
+		seq_printf(seq,"saddr: %#08x, ",
+			       self->saddr);
+		seq_printf(seq,"daddr: %#08x\n",
+			       self->daddr);
+		seq_printf(seq,"version: %d.%d,\n",
+			       self->version[1], self->version[0]);
+		seq_printf(seq,"access type: %s\n",
+			       irlan_access[self->client.access_type]);
+		seq_printf(seq,"media: %s\n",
+			       irlan_media[self->media]);
+
+		seq_printf(seq,"local filter:\n");
+		seq_printf(seq,"remote filter: ");
+		irlan_print_filter(seq, self->client.filter_type);
+		seq_printf(seq,"tx busy: %s\n",
+			       netif_queue_stopped(self->dev) ? "TRUE" : "FALSE");
+
+		seq_putc(seq,'\n');
+	}
+	return 0;
+}
+
+static const struct seq_operations irlan_seq_ops = {
+	.start = irlan_seq_start,
+	.next  = irlan_seq_next,
+	.stop  = irlan_seq_stop,
+	.show  = irlan_seq_show,
+};
+
+static int irlan_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &irlan_seq_ops);
+}
+#endif
+
+MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
+MODULE_DESCRIPTION("The Linux IrDA LAN protocol");
+MODULE_LICENSE("GPL");
+
+module_param(eth, bool, 0);
+MODULE_PARM_DESC(eth, "Name devices irlanX (0) or ethX (1)");
+module_param(access, int, 0);
+MODULE_PARM_DESC(access, "Access type DIRECT=1, PEER=2, HOSTED=3");
+
+module_init(irlan_init);
+module_exit(irlan_cleanup);
+
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
new file mode 100644
index 00000000..8ee1ff6c
--- /dev/null
+++ b/net/irda/irlan/irlan_eth.c
@@ -0,0 +1,349 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_eth.c
+ * Version:
+ * Description:
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Thu Oct 15 08:37:58 1998
+ * Modified at:   Tue Mar 21 09:06:41 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Sources:       skeleton.c by Donald Becker <becker@CESDIS.gsfc.nasa.gov>
+ *                slip.c by Laurence Culhane,   <loz@holmes.demon.co.uk>
+ *                          Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <net/arp.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_client.h>
+#include <net/irda/irlan_event.h>
+#include <net/irda/irlan_eth.h>
+
+static int  irlan_eth_open(struct net_device *dev);
+static int  irlan_eth_close(struct net_device *dev);
+static netdev_tx_t  irlan_eth_xmit(struct sk_buff *skb,
+					 struct net_device *dev);
+static void irlan_eth_set_multicast_list( struct net_device *dev);
+/* Device callbacks; attached to each netdev by irlan_eth_setup() */
+static const struct net_device_ops irlan_eth_netdev_ops = {
+	.ndo_open               = irlan_eth_open,
+	.ndo_stop               = irlan_eth_close,
+	.ndo_start_xmit    	= irlan_eth_xmit,
+	.ndo_set_multicast_list = irlan_eth_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+/*
+ * Function irlan_eth_setup (dev)
+ *
+ *    Setup callback handed to alloc_netdev() by alloc_irlandev(): applies
+ *    ether_setup(), then installs the IrLAN ops, destructor and queue length.
+ */
+static void irlan_eth_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops		= &irlan_eth_netdev_ops;
+	dev->destructor		= free_netdev;
+
+
+	/*
+	 * Historical note (superseded by the comment below): queueing used
+	 * to be left entirely to IrTTP, with a device queue length of 0,
+	 * because queueing in both places caused strange latency problems.
+	 */
+	/*
+	 * The bugs in IrTTP and IrLAN that created this latency issue
+	 * have now been fixed, and we can propagate flow control properly
+	 * to the network layer. However, this requires a minimal queue of
+	 * packets for the device.
+	 * Without flow control, the Tx Queue is 14 (ttp) + 0 (dev) = 14
+	 * With flow control, the Tx Queue is 7 (ttp) + 4 (dev) = 11
+	 * See irlan_eth_flow_indication()...
+	 * Note: the value 4 was picked arbitrarily and may need to
+	 * be adjusted.
+	 * Jean II */
+	dev->tx_queue_len = 4;
+}
+
+/*
+ * Function alloc_irlandev (name)
+ *
+ *    Allocate a net_device named @name whose private area is a struct
+ *    irlan_cb, set up by irlan_eth_setup(). Returns alloc_netdev()'s result.
+ */
+struct net_device *alloc_irlandev(const char *name)
+{
+	return alloc_netdev(sizeof(struct irlan_cb), name,
+			    irlan_eth_setup);
+}
+
+/*
+ * Function irlan_eth_open (dev)
+ *
+ *    ndo_open handler: stops the Tx queue, wakes the IrLAN client and sleeps
+ *    on open_wait. Returns the wait_event_interruptible() result.
+ */
+static int irlan_eth_open(struct net_device *dev)
+{
+	struct irlan_cb *self = netdev_priv(dev);
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Hold back transmission for now */
+	netif_stop_queue(dev); /* Wait until data link is ready */
+
+	/* We are now open, so time to do some work */
+	self->disconnect_reason = 0;
+	irlan_client_wakeup(self, self->saddr, self->daddr);
+
+	/* Stated intent: have a hardware address before returning, for DHCP
+	   clients. NOTE(review): the '!connected' test looks inverted - confirm */
+	return wait_event_interruptible(self->open_wait,
+					!self->tsap_data->connected);
+}
+
+/*
+ * Function irlan_eth_close (dev)
+ *
+ *    ndo_stop handler, usually reached via ifconfig down. Stops the Tx
+ *    queue, closes the data channel and TSAPs, sends IRLAN_LMP_DISCONNECT
+ *    to both state machines and purges the client control queue.
+ *    (A close timer is said to be started too; presumably in a callee.)
+ */
+static int irlan_eth_close(struct net_device *dev)
+{
+	struct irlan_cb *self = netdev_priv(dev);
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Stop device */
+	netif_stop_queue(dev);
+
+	irlan_close_data_channel(self);
+	irlan_close_tsaps(self);
+
+	irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL);
+	irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL);
+
+	/* Remove frames queued on the control channel */
+	skb_queue_purge(&self->client.txq);
+
+	self->client.tx_busy = 0;
+
+	return 0;
+}
+
+/*
+ * Function irlan_eth_xmit (skb, dev)
+ *
+ *    ndo_start_xmit handler: sends one Ethernet frame over the IrDA link.
+ *    The skb is always consumed, so this always returns NETDEV_TX_OK.
+ */
+static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
+					struct net_device *dev)
+{
+	struct irlan_cb *self = netdev_priv(dev);
+	int ret;
+	unsigned int len;
+
+	/* skb headroom large enough to contain all IrDA-headers? */
+	if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, self->max_header_size);
+
+		/*  We have to free the original skb anyway */
+		dev_kfree_skb(skb);
+
+		if (new_skb == NULL) {	/* realloc failed: count the lost frame */
+			dev->stats.tx_dropped++;
+			return NETDEV_TX_OK;
+		}
+		/* Use the new skb instead */
+		skb = new_skb;
+	}
+
+	dev->trans_start = jiffies;
+
+	len = skb->len;	/* saved now: skb is not touched after the request */
+	/* Now queue the packet in the transport layer */
+	if (self->use_udata)
+		ret = irttp_udata_request(self->tsap_data, skb);
+	else
+		ret = irttp_data_request(self->tsap_data, skb);
+
+	if (ret < 0) {
+		/*
+		 * IrTTPs tx queue is full, so we just have to
+		 * drop the frame! You might think that we should
+		 * just return -1 and don't deallocate the frame,
+		 * but that is dangerous since it's possible that
+		 * we have replaced the original skb with a new
+		 * one with larger headroom, and that would really
+		 * confuse do_dev_queue_xmit() in dev.c! I have
+		 * tried :-) DB
+		 */
+		/* irttp_data_request already free the packet */
+		dev->stats.tx_dropped++;
+	} else {
+		dev->stats.tx_packets++;
+		dev->stats.tx_bytes += len;
+	}
+
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Function irlan_eth_receive (instance, sap, skb)
+ *
+ *    Data-channel receive hook: passes a received Ethernet frame to the
+ *    stack with netif_rx(); NULL or runt frames are dropped. Returns 0.
+ */
+int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
+{
+	struct irlan_cb *self = instance;
+	struct net_device *dev = self->dev;
+
+	if (skb == NULL) {
+		dev->stats.rx_dropped++;
+		return 0;
+	}
+	if (skb->len < ETH_HLEN) {
+		IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n",
+			   __func__, skb->len);
+		dev->stats.rx_dropped++;
+		dev_kfree_skb(skb);
+		return 0;
+	}
+
+	/*
+	 * Adopt this frame! Important to set all these fields since they
+	 * might have been previously set by the low level IrDA network
+	 * device driver
+	 */
+	skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */
+
+	dev->stats.rx_packets++;
+	dev->stats.rx_bytes += skb->len;	/* length after eth_type_trans() */
+
+	netif_rx(skb);   /* Eat it! */
+
+	return 0;
+}
+
+/*
+ * Function irlan_eth_flow_indication (instance, sap, flow)
+ *
+ *    Do flow control between IP/Ethernet and IrLAN/IrTTP. This is done by
+ *    stopping or waking the device Tx queue.
+ *
+ * The IrDA link layer has the advantage of having flow control, and
+ * IrTTP now properly handles that. Flow controlling the higher layers
+ * prevents us from dropping Tx packets in here (up to 15% for a TCP
+ * socket, more for a UDP socket).
+ * Also, this allows us to reduce the overall transmit queue, which means
+ * less latency in case of mixed traffic.
+ * Jean II
+ */
+void irlan_eth_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
+{
+	struct irlan_cb *self;
+	struct net_device *dev;
+
+	self = (struct irlan_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	dev = self->dev;
+
+	IRDA_ASSERT(dev != NULL, return;);
+
+	IRDA_DEBUG(0, "%s() : flow %s ; running %d\n", __func__,
+		   flow == FLOW_STOP ? "FLOW_STOP" : "FLOW_START",
+		   netif_running(dev));
+
+	switch (flow) {
+	case FLOW_STOP:
+		/* IrTTP is full, stop higher layers */
+		netif_stop_queue(dev);
+		break;
+	case FLOW_START:
+	default:
+		/* Any value other than FLOW_STOP: upper layers may transmit again */
+		/* Schedule network layer */
+		netif_wake_queue(dev);
+		break;
+	}
+}
+
+/*
+ * Function irlan_eth_set_multicast_list (dev)
+ *
+ *    Push the multicast and broadcast filter settings implied by dev->flags
+ *    and the multicast list to IrLAN. No-op until the client is in IRLAN_DATA.
+ */
+#define HW_MAX_ADDRS 4 /* Must query to get it! */
+static void irlan_eth_set_multicast_list(struct net_device *dev)
+{
+	struct irlan_cb *self = netdev_priv(dev);
+
+	IRDA_DEBUG(2, "%s()\n", __func__ );
+
+	/* Check if data channel has been connected yet */
+	if (self->client.state != IRLAN_DATA) {
+		IRDA_DEBUG(1, "%s(), delaying!\n", __func__ );
+		return;
+	}
+
+	if (dev->flags & IFF_PROMISC) {
+		/* Promiscuous mode is unsupported: warn, leave multicast as is */
+		IRDA_WARNING("Promiscuous mode not implemented by IrLAN!\n");
+	}
+	else if ((dev->flags & IFF_ALLMULTI) ||
+		 netdev_mc_count(dev) > HW_MAX_ADDRS) {
+		/* All-multicast requested, or more groups than HW_MAX_ADDRS */
+		IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ );
+		/* hardware_set_filter(NULL); */
+
+		irlan_set_multicast_filter(self, TRUE);
+	}
+	else if (!netdev_mc_empty(dev)) {
+		IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ );
+		/* Walk the address list, and load the filter */
+		/* hardware_set_filter(dev->mc_list); */
+
+		irlan_set_multicast_filter(self, TRUE);
+	}
+	else {
+		IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __func__ );
+		irlan_set_multicast_filter(self, FALSE);
+	}
+
+	if (dev->flags & IFF_BROADCAST)
+		irlan_set_broadcast_filter(self, TRUE);
+	else
+		irlan_set_broadcast_filter(self, FALSE);
+}
diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c
new file mode 100644
index 00000000..43f16040
--- /dev/null
+++ b/net/irda/irlan/irlan_event.c
@@ -0,0 +1,60 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_event.c
+ * Version:
+ * Description:
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Tue Oct 20 09:10:16 1998
+ * Modified at:   Sat Oct 30 12:59:01 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <net/irda/irlan_event.h>
+
+const char * const irlan_state[] = {	/* state names, indexed by IRLAN_STATE */
+	"IRLAN_IDLE",
+	"IRLAN_QUERY",
+	"IRLAN_CONN",
+	"IRLAN_INFO",
+	"IRLAN_MEDIA",
+	"IRLAN_OPEN",
+	"IRLAN_WAIT",
+	"IRLAN_ARB",
+	"IRLAN_DATA",
+	"IRLAN_CLOSE",
+	"IRLAN_SYNC",
+};
+
+void irlan_next_client_state(struct irlan_cb *self, IRLAN_STATE state)
+{	/* Log the new state by name, then store it in self->client.state */
+	IRDA_DEBUG(2, "%s(), %s\n", __func__ , irlan_state[state]);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	self->client.state = state;
+}
+
+void irlan_next_provider_state(struct irlan_cb *self, IRLAN_STATE state)
+{	/* Log the new state by name, then store it in self->provider.state */
+	IRDA_DEBUG(2, "%s(), %s\n", __func__ , irlan_state[state]);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	self->provider.state = state;
+}
+
diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c
new file mode 100644
index 00000000..7977be7c
--- /dev/null
+++ b/net/irda/irlan/irlan_filter.c
@@ -0,0 +1,243 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_filter.c
+ * Version:
+ * Description:
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Fri Jan 29 11:16:38 1999
+ * Modified at:   Sat Oct 30 12:58:45 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_filter.h>
+
+/*
+ * Function irlan_filter_request (self, skb)
+ *
+ *    Fill in the reply @skb for a filter request from the client peer,
+ *    based on the filter settings currently held in self->provider.
+ */
+void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	if ((self->provider.filter_type == IRLAN_DIRECTED) &&
+	    (self->provider.filter_operation == DYNAMIC))
+	{
+		IRDA_DEBUG(0, "Giving peer a dynamic Ethernet address\n");
+		self->provider.mac_address[0] = 0x40;
+		self->provider.mac_address[1] = 0x00;
+		self->provider.mac_address[2] = 0x00;
+		self->provider.mac_address[3] = 0x00;
+
+		/* Use arbitration value to generate MAC address */
+		if (self->provider.access_type == ACCESS_PEER) {
+			self->provider.mac_address[4] =
+				self->provider.send_arb_val & 0xff;
+			self->provider.mac_address[5] =
+				(self->provider.send_arb_val >> 8) & 0xff;
+		} else {
+			/* Just generate something for now */
+			get_random_bytes(self->provider.mac_address+4, 1);
+			get_random_bytes(self->provider.mac_address+5, 1);
+		}
+
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x03; /* three parameters are appended below */
+		irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+		irlan_insert_short_param(skb, "MAX_ENTRY", 0x0001);
+		irlan_insert_array_param(skb, "FILTER_ENTRY",
+					 self->provider.mac_address, 6);
+		return;
+	}
+
+	if ((self->provider.filter_type == IRLAN_DIRECTED) &&
+	    (self->provider.filter_mode == FILTER))
+	{
+		IRDA_DEBUG(0, "Directed filter on\n");
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x00;
+		return;
+	}
+	if ((self->provider.filter_type == IRLAN_DIRECTED) &&
+	    (self->provider.filter_mode == NONE))
+	{
+		IRDA_DEBUG(0, "Directed filter off\n");
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x00;
+		return;
+	}
+
+	if ((self->provider.filter_type == IRLAN_BROADCAST) &&
+	    (self->provider.filter_mode == FILTER))
+	{
+		IRDA_DEBUG(0, "Broadcast filter on\n");
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x00;
+		return;
+	}
+	if ((self->provider.filter_type == IRLAN_BROADCAST) &&
+	    (self->provider.filter_mode == NONE))
+	{
+		IRDA_DEBUG(0, "Broadcast filter off\n");
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x00;
+		return;
+	}
+	if ((self->provider.filter_type == IRLAN_MULTICAST) &&
+	    (self->provider.filter_mode == FILTER))
+	{
+		IRDA_DEBUG(0, "Multicast filter on\n");
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x00;
+		return;
+	}
+	if ((self->provider.filter_type == IRLAN_MULTICAST) &&
+	    (self->provider.filter_mode == NONE))
+	{
+		IRDA_DEBUG(0, "Multicast filter off\n");
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x00;
+		return;
+	}
+	if ((self->provider.filter_type == IRLAN_MULTICAST) &&
+	    (self->provider.filter_operation == GET))
+	{
+		IRDA_DEBUG(0, "Multicast filter get\n");
+		skb->data[0] = 0x00; /* Success? */
+		skb->data[1] = 0x02; /* two parameters are appended below */
+		irlan_insert_string_param(skb, "FILTER_MODE", "NONE");
+		irlan_insert_short_param(skb, "MAX_ENTRY", 16);
+		return;
+	}
+	skb->data[0] = 0x00; /* Command not supported; NOTE(review): same code as success */
+	skb->data[1] = 0x00;
+
+	IRDA_DEBUG(0, "Not implemented!\n");
+}
+
+/*
+ * Function irlan_check_command_param (self, param, value)
+ *
+ *    Record one name/value parameter from a peer request in self->provider
+ *    (or self->use_udata for "MODE"). Unrecognised input is ignored.
+ */
+void irlan_check_command_param(struct irlan_cb *self, char *param, char *value)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	IRDA_DEBUG(4, "%s, %s\n", param, value);
+
+	/*
+	 *  Any "MODE" parameter switches to udata. This is experimental!! DB.
+	 */
+	if (strcmp(param, "MODE") == 0) {
+		IRDA_DEBUG(0, "%s()\n", __func__ );
+		self->use_udata = TRUE;
+		return;
+	}
+
+	/*
+	 *  FILTER_TYPE
+	 */
+	if (strcmp(param, "FILTER_TYPE") == 0) {
+		if (strcmp(value, "DIRECTED") == 0) {
+			self->provider.filter_type = IRLAN_DIRECTED;
+			return;
+		}
+		if (strcmp(value, "MULTICAST") == 0) {
+			self->provider.filter_type = IRLAN_MULTICAST;
+			return;
+		}
+		if (strcmp(value, "BROADCAST") == 0) {
+			self->provider.filter_type = IRLAN_BROADCAST;
+			return;
+		}
+	}
+	/*
+	 *  FILTER_MODE
+	 */
+	if (strcmp(param, "FILTER_MODE") == 0) {
+		if (strcmp(value, "ALL") == 0) {
+			self->provider.filter_mode = ALL;
+			return;
+		}
+		if (strcmp(value, "FILTER") == 0) {
+			self->provider.filter_mode = FILTER;
+			return;
+		}
+		if (strcmp(value, "NONE") == 0) {
+			self->provider.filter_mode = NONE;
+			return;
+		}
+	}
+	/*
+	 *  FILTER_OPERATION
+	 */
+	if (strcmp(param, "FILTER_OPERATION") == 0) {
+		if (strcmp(value, "DYNAMIC") == 0) {
+			self->provider.filter_operation = DYNAMIC;
+			return;
+		}
+		if (strcmp(value, "GET") == 0) {
+			self->provider.filter_operation = GET;
+			return;
+		}
+	}
+}
+
+/*
+ * Function irlan_print_filter (seq, filter_type)
+ *
+ *    Print the name of every filter bit set in @filter_type, followed by
+ *    a newline. Used by the /proc file system.
+ */
+#ifdef CONFIG_PROC_FS
+#define MASK2STR(m,s)	{ .mask = m, .str = s }
+
+void irlan_print_filter(struct seq_file *seq, int filter_type)
+{
+	static struct {
+		int mask;
+		const char *str;
+	} filter_mask2str[] = {
+		MASK2STR(IRLAN_DIRECTED,	"DIRECTED"),
+		MASK2STR(IRLAN_FUNCTIONAL,	"FUNCTIONAL"),
+		MASK2STR(IRLAN_GROUP,		"GROUP"),
+		MASK2STR(IRLAN_MAC_FRAME,	"MAC_FRAME"),
+		MASK2STR(IRLAN_MULTICAST,	"MULTICAST"),
+		MASK2STR(IRLAN_BROADCAST,	"BROADCAST"),
+		MASK2STR(IRLAN_IPX_SOCKET,	"IPX_SOCKET"),
+		MASK2STR(0,			NULL)
+	}, *p;
+
+	for (p = filter_mask2str; p->str; p++) {	/* stops at the NULL-string entry */
+		if (filter_type & p->mask)
+			seq_printf(seq, "%s ", p->str);
+	}
+	seq_putc(seq, '\n');
+}
+#undef MASK2STR
+#endif
diff --git a/net/irda/irlan/irlan_provider.c b/net/irda/irlan/irlan_provider.c
new file mode 100644
index 00000000..b8af74ab
--- /dev/null
+++ b/net/irda/irlan/irlan_provider.c
@@ -0,0 +1,417 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_provider.c
+ * Version:       0.9
+ * Description:   IrDA LAN Access Protocol Implementation
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 31 20:14:37 1997
+ * Modified at:   Sat Oct 30 12:52:10 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Sources:       skeleton.c by Donald Becker <becker@CESDIS.gsfc.nasa.gov>
+ *                slip.c by Laurence Culhane,   <loz@holmes.demon.co.uk>
+ *                          Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+
+#include <asm/system.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irttp.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/iriap.h>
+#include <net/irda/timer.h>
+
+#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_eth.h>
+#include <net/irda/irlan_event.h>
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_filter.h>
+#include <net/irda/irlan_client.h>
+
+static void irlan_provider_connect_indication(void *instance, void *sap,
+					      struct qos_info *qos,
+					      __u32 max_sdu_size,
+					      __u8 max_header_size,
+					      struct sk_buff *skb);
+
+/*
+ * Function irlan_provider_data_indication (instance, sap, skb)
+ *
+ *    Control-channel receive hook: dispatches on the command byte skb->data[0].
+ *    NOTE(review): skb->len is unchecked and skb is never freed here - confirm.
+ */
+static int irlan_provider_data_indication(void *instance, void *sap,
+					  struct sk_buff *skb)
+{
+	struct irlan_cb *self;
+	__u8 code;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	self = (struct irlan_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	code = skb->data[0];
+	switch(code) {
+	case CMD_GET_PROVIDER_INFO:
+		IRDA_DEBUG(4, "Got GET_PROVIDER_INFO command!\n");
+		irlan_do_provider_event(self, IRLAN_GET_INFO_CMD, skb);
+		break;
+
+	case CMD_GET_MEDIA_CHAR:
+		IRDA_DEBUG(4, "Got GET_MEDIA_CHAR command!\n");
+		irlan_do_provider_event(self, IRLAN_GET_MEDIA_CMD, skb);
+		break;
+	case CMD_OPEN_DATA_CHANNEL:
+		IRDA_DEBUG(4, "Got OPEN_DATA_CHANNEL command!\n");
+		irlan_do_provider_event(self, IRLAN_OPEN_DATA_CMD, skb);
+		break;
+	case CMD_FILTER_OPERATION:
+		IRDA_DEBUG(4, "Got FILTER_OPERATION command!\n");
+		irlan_do_provider_event(self, IRLAN_FILTER_CONFIG_CMD, skb);
+		break;
+	case CMD_RECONNECT_DATA_CHAN:
+		IRDA_DEBUG(2, "%s(), Got RECONNECT_DATA_CHAN command\n", __func__ );
+		IRDA_DEBUG(2, "%s(), NOT IMPLEMENTED\n", __func__ );
+		break;
+	case CMD_CLOSE_DATA_CHAN:
+		IRDA_DEBUG(2, "Got CLOSE_DATA_CHAN command!\n");
+		IRDA_DEBUG(2, "%s(), NOT IMPLEMENTED\n", __func__ );
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown command!\n", __func__ );
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Function irlan_provider_connect_indication (instance, sap, qos, ...)
+ *
+ *    A peer IrLAN client connected to our control TSAP: store the reported
+ *    sizes and feed IRLAN_CONNECT_INDICATION to the provider state machine.
+ */
+static void irlan_provider_connect_indication(void *instance, void *sap,
+					      struct qos_info *qos,
+					      __u32 max_sdu_size,
+					      __u8 max_header_size,
+					      struct sk_buff *skb)
+{
+	struct irlan_cb *self;
+	struct tsap_cb *tsap;
+
+	IRDA_DEBUG(0, "%s()\n", __func__ );
+
+	self = (struct irlan_cb *) instance;
+	tsap = (struct tsap_cb *) sap;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	IRDA_ASSERT(tsap == self->provider.tsap_ctrl,return;);
+	IRDA_ASSERT(self->provider.state == IRLAN_IDLE, return;);
+
+	self->provider.max_sdu_size = max_sdu_size;
+	self->provider.max_header_size = max_header_size;
+
+	irlan_do_provider_event(self, IRLAN_CONNECT_INDICATION, NULL);
+
+	/*
+	 * If we are in peer mode, the client may not have got the discovery
+	 * indication it needs to make progress. If the client is still in
+	 * IDLE state, we must kick it.
+	 */
+	if ((self->provider.access_type == ACCESS_PEER) &&
+	    (self->client.state == IRLAN_IDLE))
+	{
+		irlan_client_wakeup(self, self->saddr, self->daddr);
+	}
+}
+
+/*
+ * Function irlan_provider_connect_response (self, tsap)
+ *
+ *    Accept the incoming connection on @tsap by calling
+ *    irttp_connect_response() with IRLAN_MTU and a NULL last argument.
+ */
+void irlan_provider_connect_response(struct irlan_cb *self,
+				     struct tsap_cb *tsap)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	/* Just accept */
+	irttp_connect_response(tsap, IRLAN_MTU, NULL);
+}
+
+static void irlan_provider_disconnect_indication(void *instance, void *sap,
+						 LM_REASON reason,
+						 struct sk_buff *userdata)
+{	/* Control TSAP disconnected: feed IRLAN_LMP_DISCONNECT to the provider */
+	struct irlan_cb *self;
+	struct tsap_cb *tsap;
+
+	IRDA_DEBUG(4, "%s(), reason=%d\n", __func__ , reason);
+
+	self = (struct irlan_cb *) instance;
+	tsap = (struct tsap_cb *) sap;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+	IRDA_ASSERT(tsap != NULL, return;);
+	IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;);
+
+	IRDA_ASSERT(tsap == self->provider.tsap_ctrl, return;);
+
+	irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL);
+}
+
+/*
+ * Function irlan_parse_open_data_cmd (self, skb)
+ *
+ *    Parse an OPEN_DATA_CHANNEL command, then open the data TSAP regardless
+ *    of the parse result. Returns irlan_provider_parse_command()'s value.
+ */
+int irlan_parse_open_data_cmd(struct irlan_cb *self, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = irlan_provider_parse_command(self, CMD_OPEN_DATA_CHANNEL, skb);
+
+	/* Open data channel */
+	irlan_open_data_tsap(self);
+
+	return ret;
+}
+
+/*
+ * Function irlan_provider_parse_command (self, cmd, skb)
+ *
+ *    Extract each parameter from the received frame and pass it to
+ *    irlan_check_command_param(). Returns RSP_SUCCESS or a negative code.
+ *    Frames shorter than the 2-byte header are rejected; @cmd is unused.
+ */
+int irlan_provider_parse_command(struct irlan_cb *self, int cmd,
+				 struct sk_buff *skb)
+{
+	__u8 *frame;
+	__u8 *ptr;
+	int count;
+	__u16 val_len;
+	int i;
+	char *name;
+	char *value;
+	int ret = RSP_SUCCESS;
+
+	IRDA_ASSERT(skb != NULL, return -RSP_PROTOCOL_ERROR;);
+
+	IRDA_DEBUG(4, "%s(), skb->len=%d\n", __func__ , (int)skb->len);
+
+	IRDA_ASSERT(self != NULL, return -RSP_PROTOCOL_ERROR;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -RSP_PROTOCOL_ERROR;);
+
+	if (!skb || skb->len < 2)	/* need command byte + parameter count */
+		return -RSP_PROTOCOL_ERROR;
+
+	frame = skb->data;
+
+	name = kmalloc(255, GFP_ATOMIC);
+	if (!name)
+		return -RSP_INSUFFICIENT_RESOURCES;
+	value = kmalloc(1016, GFP_ATOMIC);
+	if (!value) {
+		kfree(name);
+		return -RSP_INSUFFICIENT_RESOURCES;
+	}
+
+	/* How many parameters? */
+	count = frame[1];
+
+	IRDA_DEBUG(4, "Got %d parameters\n", count);
+
+	ptr = frame+2;
+
+	/* For all parameters. NOTE(review): ptr is not bounded by skb->len here */
+	for (i=0; i<count;i++) {
+		ret = irlan_extract_param(ptr, name, value, &val_len);
+		if (ret < 0) {
+			IRDA_DEBUG(2, "%s(), IrLAN, Error!\n", __func__ );
+			break;
+		}
+		ptr+=ret;
+		ret = RSP_SUCCESS;
+		irlan_check_command_param(self, name, value);
+	}
+	/* Cleanup */
+	kfree(name);
+	kfree(value);
+
+	return ret;
+}
+
+/*
+ * Function irlan_provider_send_reply (self, command, ret_code)
+ *
+ *    Build the reply for @command and send it to the peer on the provider
+ *    control TSAP. @ret_code is currently unused by this function.
+ */
+void irlan_provider_send_reply(struct irlan_cb *self, int command,
+			       int ret_code)
+{
+	struct sk_buff *skb;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;);
+
+	skb = alloc_skb(IRLAN_MAX_HEADER + IRLAN_CMD_HEADER +
+			/* Bigger param length comes from CMD_GET_MEDIA_CHAR */
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "DIRECTED") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "BROADCAST") +
+			IRLAN_STRING_PARAMETER_LEN("FILTER_TYPE", "MULTICAST") +
+			IRLAN_STRING_PARAMETER_LEN("ACCESS_TYPE", "HOSTED"),
+			GFP_ATOMIC);
+
+	if (!skb)
+		return;
+
+	/* Reserve space for TTP, LMP, and LAP header */
+	skb_reserve(skb, self->provider.max_header_size);
+	memset(skb_put(skb, 2), 0, 2);	/* header is never sent uninitialised */
+
+	switch (command) {
+	case CMD_GET_PROVIDER_INFO:
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x02; /* 2 parameters */
+		switch (self->media) {
+		case MEDIA_802_3:
+			irlan_insert_string_param(skb, "MEDIA", "802.3");
+			break;
+		case MEDIA_802_5:
+			irlan_insert_string_param(skb, "MEDIA", "802.5");
+			break;
+		default:	/* NOTE(review): reply then carries 1 parameter, not 2 */
+			IRDA_DEBUG(2, "%s(), unknown media type!\n", __func__ );
+			break;
+		}
+		irlan_insert_short_param(skb, "IRLAN_VER", 0x0101);
+		break;
+
+	case CMD_GET_MEDIA_CHAR:
+		skb->data[0] = 0x00; /* Success */
+		skb->data[1] = 0x05; /* 5 parameters */
+		irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED");
+		irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST");
+		irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST");
+
+		switch (self->provider.access_type) {
+		case ACCESS_DIRECT:
+			irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT");
+			break;
+		case ACCESS_PEER:
+			irlan_insert_string_param(skb, "ACCESS_TYPE", "PEER");
+			break;
+		case ACCESS_HOSTED:
+			irlan_insert_string_param(skb, "ACCESS_TYPE", "HOSTED");
+			break;
+		default:	/* NOTE(review): reply then carries 4 parameters, not 5 */
+			IRDA_DEBUG(2, "%s(), Unknown access type\n", __func__ );
+			break;
+		}
+		irlan_insert_short_param(skb, "MAX_FRAME", 0x05ee);
+		break;
+	case CMD_OPEN_DATA_CHANNEL:
+		skb->data[0] = 0x00; /* Success */
+		if (self->provider.send_arb_val) {
+			skb->data[1] = 0x03; /* 3 parameters */
+			irlan_insert_short_param(skb, "CON_ARB",
+						 self->provider.send_arb_val);
+		} else
+			skb->data[1] = 0x02; /* 2 parameters */
+		irlan_insert_byte_param(skb, "DATA_CHAN", self->stsap_sel_data);
+		irlan_insert_string_param(skb, "RECONNECT_KEY", "LINUX RULES!");
+		break;
+	case CMD_FILTER_OPERATION:
+		irlan_filter_request(self, skb);
+		break;
+	default:	/* the zeroed 2-byte header is sent as the whole reply */
+		IRDA_DEBUG(2, "%s(), Unknown command!\n", __func__ );
+		break;
+	}
+
+	irttp_data_request(self->provider.tsap_ctrl, skb);
+}
+
+/*
+ * Function irlan_provider_open_ctrl_tsap (self)
+ *
+ *    Open the provider control TSAP so we can accept incoming connections and
+ *    register it with LM-IAS. Returns 0, or -1 if already open or on failure.
+ */
+int irlan_provider_open_ctrl_tsap(struct irlan_cb *self)
+{
+	struct tsap_cb *tsap;
+	notify_t notify;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+	/* Check if already open */
+	if (self->provider.tsap_ctrl)
+		return -1;
+
+	/*
+	 *  First register well known control TSAP
+	 */
+	irda_notify_init(&notify);
+	notify.data_indication       = irlan_provider_data_indication;
+	notify.connect_indication    = irlan_provider_connect_indication;
+	notify.disconnect_indication = irlan_provider_disconnect_indication;
+	notify.instance = self;
+	strlcpy(notify.name, "IrLAN ctrl (p)", sizeof(notify.name));
+
+	tsap = irttp_open_tsap(LSAP_ANY, 1, &notify);
+	if (!tsap) {
+		IRDA_DEBUG(2, "%s(), Got no tsap!\n", __func__ );
+		return -1;
+	}
+	self->provider.tsap_ctrl = tsap;
+
+	/* Register with LM-IAS */
+	irlan_ias_register(self, tsap->stsap_sel);
+
+	return 0;
+}
+
diff --git a/net/irda/irlan/irlan_provider_event.c b/net/irda/irlan/irlan_provider_event.c
new file mode 100644
index 00000000..01a9d7c9
--- /dev/null
+++ b/net/irda/irlan/irlan_provider_event.c
@@ -0,0 +1,241 @@
+/*********************************************************************
+ *
+ * Filename:      irlan_provider_event.c
+ * Version:       0.9
+ * Description:   IrLAN provider state machine
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 31 20:14:37 1997
+ * Modified at:   Sat Oct 30 12:52:41 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <net/irda/irda.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+
+#include <net/irda/irlan_provider.h>
+#include <net/irda/irlan_event.h>
+
+static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb);
+static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb);
+static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb);
+static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb);
+
+static int (*state[])(struct irlan_cb *self, IRLAN_EVENT event,
+		      struct sk_buff *skb) =
+{	/* Indexed by self->provider.state; NULL means no provider handler */
+	irlan_provider_state_idle,
+	NULL, /* Query */
+	NULL, /* Info - NOTE(review): INFO handler sits in the next slot, label may be stale */
+	irlan_provider_state_info,
+	NULL, /* Media */
+	irlan_provider_state_open,
+	NULL, /* Wait */
+	NULL, /* Arb */
+	irlan_provider_state_data,
+	NULL, /* Close */
+	NULL, /* Sync */
+};
+
+/* Pass event and skb to the handler of the current provider state. */
+void irlan_do_provider_event(struct irlan_cb *self, IRLAN_EVENT event,
+			     struct sk_buff *skb)
+{
+	IRDA_ASSERT(*state[ self->provider.state] != NULL, return;);
+	state[self->provider.state](self, event, skb);
+}
+
+/*
+ * Function irlan_provider_state_idle (self, event, skb)
+ *
+ *    IDLE, answer an incoming control connection and move to INFO; any
+ *    other event is only logged. The skb, if any, is freed here.
+ */
+static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	if (event == IRLAN_CONNECT_INDICATION) {
+		/* Respond on the control TSAP, then change state */
+		irlan_provider_connect_response(self, self->provider.tsap_ctrl);
+		irlan_next_provider_state(self, IRLAN_INFO);
+	} else {
+		IRDA_DEBUG(4, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	/* Release the buffer that came with the event */
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_provider_state_info (self, event, skb)
+ *
+ *    INFO, serve the GetInfo, GetMedia and OpenData commands of the peer.
+ */
+static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb)
+{
+	int status;
+
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	switch (event) {
+	case IRLAN_LMP_DISCONNECT:  /* FALLTHROUGH */
+	case IRLAN_LAP_DISCONNECT:
+		/* Link is gone, back to the start */
+		irlan_next_provider_state(self, IRLAN_IDLE);
+		break;
+	case IRLAN_GET_MEDIA_CMD:
+		/* Keep state */
+		irlan_provider_send_reply(self, CMD_GET_MEDIA_CHAR,
+					  RSP_SUCCESS);
+		break;
+	case IRLAN_GET_INFO_CMD:
+		/* Be sure to use 802.3 in case of peer mode */
+		if (self->provider.access_type == ACCESS_PEER) {
+			self->media = MEDIA_802_3;
+
+			/* This should get a client that is still idle going */
+			if (self->client.state == IRLAN_IDLE)
+				irlmp_discovery_request(8);
+		}
+
+		/* Keep state */
+		irlan_provider_send_reply(self, CMD_GET_PROVIDER_INFO,
+					  RSP_SUCCESS);
+		break;
+	case IRLAN_OPEN_DATA_CMD:
+		status = irlan_parse_open_data_cmd(self, skb);
+
+		/* FIXME: make use of random functions! */
+		if (self->provider.access_type == ACCESS_PEER)
+			self->provider.send_arb_val = (jiffies & 0xffff);
+
+		irlan_provider_send_reply(self, CMD_OPEN_DATA_CHANNEL, status);
+		if (status != RSP_SUCCESS)
+			break;
+
+		irlan_next_provider_state(self, IRLAN_OPEN);
+
+		/* Signal client that we are now open */
+		irlan_do_client_event(self, IRLAN_PROVIDER_SIGNAL, NULL);
+		break;
+	default:
+		IRDA_DEBUG( 0, "%s(), Unknown event %d\n", __func__ , event);
+		break;
+	}
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_provider_state_open (self, event, skb)
+ *
+ *    OPEN, the OpenData command has been answered and we wait for the
+ *    peer to connect the data channel, which moves us to DATA. Filter
+ *    commands are served meanwhile; a disconnect leads back to IDLE.
+ *
+ */
+static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+
+	if (event == IRLAN_FILTER_CONFIG_CMD) {
+		/* Keep state */
+		irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb);
+		irlan_provider_send_reply(self, CMD_FILTER_OPERATION,
+					  RSP_SUCCESS);
+	} else if (event == IRLAN_DATA_CONNECT_INDICATION) {
+		/* Change state first, then respond on the data TSAP */
+		irlan_next_provider_state(self, IRLAN_DATA);
+		irlan_provider_connect_response(self, self->tsap_data);
+	} else if (event == IRLAN_LMP_DISCONNECT ||
+		   event == IRLAN_LAP_DISCONNECT) {
+		/* Either kind of disconnect ends the session */
+		irlan_next_provider_state(self, IRLAN_IDLE);
+	} else {
+		IRDA_DEBUG(2, "%s(), Unknown event %d\n", __func__ , event);
+	}
+
+	/* Release the buffer that came with the event */
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irlan_provider_state_data (self, event, skb)
+ *
+ *    DATA, the data channel is connected, allowing data transfers between
+ *    the local and remote machines. Filter commands are still served
+ *    here, and a disconnect returns the provider to IDLE.
+ */
+static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event,
+				     struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__ );
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;);
+
+	switch (event) {
+	case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */
+	case IRLAN_LAP_DISCONNECT:
+		irlan_next_provider_state(self, IRLAN_IDLE);
+		break;
+	case IRLAN_FILTER_CONFIG_CMD:
+		irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb);
+		irlan_provider_send_reply(self, CMD_FILTER_OPERATION,
+					  RSP_SUCCESS);
+		break;
+	default:
+		IRDA_DEBUG( 0, "%s(), Unknown event %d\n", __func__ , event);
+		break;
+	}
+	if (skb != NULL)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
new file mode 100644
index 00000000..005b4244
--- /dev/null
+++ b/net/irda/irlap.c
@@ -0,0 +1,1237 @@
+/*********************************************************************
+ *
+ * Filename:      irlap.c
+ * Version:       1.0
+ * Description:   IrLAP implementation for Linux
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Mon Aug  4 20:40:53 1997
+ * Modified at:   Tue Dec 14 09:26:44 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/delay.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+#include <net/irda/irqueue.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/irlap.h>
+#include <net/irda/timer.h>
+#include <net/irda/qos.h>
+
+static hashbin_t *irlap = NULL;	/* IrLAP instances, keyed by saddr */
+int sysctl_slot_timeout = SLOT_TIMEOUT * 1000 / HZ;
+
+/* This is the delay of missed pf periods before generating an event
+ * to the application. The spec mandates 3 seconds, but in some cases
+ * that is way too long. - Jean II */
+int sysctl_warn_noreply_time = 3;
+
+extern void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb);
+static void __irlap_close(struct irlap_cb *self);
+static void irlap_init_qos_capabilities(struct irlap_cb *self,
+					struct qos_info *qos_user);
+
+#ifdef CONFIG_IRDA_DEBUG
+static const char *const lap_reasons[] = {	/* indexed by LAP_REASON */
+	"ERROR, NOT USED",
+	"LAP_DISC_INDICATION",
+	"LAP_NO_RESPONSE",
+	"LAP_RESET_INDICATION",
+	"LAP_FOUND_NONE",
+	"LAP_MEDIA_BUSY",
+	"LAP_PRIMARY_CONFLICT",
+	"ERROR, NOT USED",
+};
+#endif	/* CONFIG_IRDA_DEBUG */
+
+/* Create the table of IrLAP instances; returns 0 or -ENOMEM. */
+int __init irlap_init(void)
+{
+	/* Sanity check the frame structure sizes; the compiler may get them
+	 * wrong on some ARM configuration, check with Russell King. */
+	IRDA_ASSERT(sizeof(struct xid_frame) == 14, ;);
+	IRDA_ASSERT(sizeof(struct test_frame) == 10, ;);
+	IRDA_ASSERT(sizeof(struct ua_frame) == 10, ;);
+	IRDA_ASSERT(sizeof(struct snrm_frame) == 11, ;);
+
+	/* Allocate master array */
+	irlap = hashbin_new(HB_LOCK);
+	if (irlap != NULL)
+		return 0;
+
+	IRDA_ERROR("%s: can't allocate irlap hashbin!\n",
+		   __func__);
+	return -ENOMEM;
+}
+
+/* Delete the instance table, with __irlap_close as its free function. */
+void irlap_cleanup(void)
+{
+	IRDA_ASSERT(irlap != NULL, return;);
+	hashbin_delete(irlap, (FREE_FUNC) __irlap_close);
+}
+
+/*
+ * Function irlap_open (dev, qos, hw_name)
+ *
+ *    Create the IrLAP instance for a device, register it with IrLMP and
+ *    return it. Returns NULL if the instance cannot be allocated.
+ */
+struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos,
+			    const char *hw_name)
+{
+	struct irlap_cb *self;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* Initialize the irlap structure. */
+	self = kzalloc(sizeof(struct irlap_cb), GFP_KERNEL);
+	if (self == NULL)
+		return NULL;
+
+	self->magic = LAP_MAGIC;
+
+	/* Make a binding between the layers */
+	self->netdev = dev;
+	self->qos_dev = qos;
+	/* Copy hardware name, or leave it empty when none was given */
+	if(hw_name != NULL) {
+		strlcpy(self->hw_name, hw_name, sizeof(self->hw_name));
+	} else {
+		self->hw_name[0] = '\0';
+	}
+
+	/* FIXME: should we get our own field? */
+	dev->atalk_ptr = self;
+
+	self->state = LAP_OFFLINE;
+
+	/* Initialize transmit queues and the window list */
+	skb_queue_head_init(&self->txq);
+	skb_queue_head_init(&self->txq_ultra);
+	skb_queue_head_init(&self->wx_list);
+
+	/* My unique IrLAP device address! */
+	/* We don't want the broadcast address, nor the NULL address
+	 * (most often used to signify "invalid"), and we don't want an
+	 * address already in use (otherwise connect won't be able
+	 * to select the proper link). - Jean II */
+	do {
+		get_random_bytes(&self->saddr, sizeof(self->saddr));
+	} while ((self->saddr == 0x0) || (self->saddr == BROADCAST) ||
+		 (hashbin_lock_find(irlap, self->saddr, NULL)) );
+	/* Copy to the driver */
+	memcpy(dev->dev_addr, &self->saddr, 4);
+
+	init_timer(&self->slot_timer);
+	init_timer(&self->query_timer);
+	init_timer(&self->discovery_timer);
+	init_timer(&self->final_timer);
+	init_timer(&self->poll_timer);
+	init_timer(&self->wd_timer);
+	init_timer(&self->backoff_timer);
+	init_timer(&self->media_busy_timer);
+
+	irlap_apply_default_connection_parameters(self);
+
+	self->N3 = 3; /* # connections attempts to try before giving up */
+
+	self->state = LAP_NDM;
+
+	hashbin_insert(irlap, (irda_queue_t *) self, self->saddr, NULL);
+
+	irlmp_register_link(self, self->saddr, &self->notify);
+
+	return self;
+}
+EXPORT_SYMBOL(irlap_open);
+
+/*
+ * Function __irlap_close (self)
+ *
+ *    Final teardown of an IrLAP instance: stop its timers, flush all
+ *    of its queues and free the control block itself.
+ */
+static void __irlap_close(struct irlap_cb *self)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Stop every timer before the memory goes away */
+	del_timer(&self->media_busy_timer);
+	del_timer(&self->backoff_timer);
+	del_timer(&self->wd_timer);
+	del_timer(&self->poll_timer);
+	del_timer(&self->final_timer);
+	del_timer(&self->discovery_timer);
+	del_timer(&self->query_timer);
+	del_timer(&self->slot_timer);
+
+	irlap_flush_all_queues(self);
+
+	/* Clear the magic (the asserts check it) before freeing */
+	self->magic = 0;
+	kfree(self);
+}
+
+/*
+ * Function irlap_close (self)
+ *
+ *    Unregister an IrLAP instance from IrLMP, remove it from the
+ *    instance table and release it.
+ */
+void irlap_close(struct irlap_cb *self)
+{
+	struct irlap_cb *found;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* The LAP_DISC_INDICATION that used to be sent from here was
+	 * racy. It has been moved into irlmp_unregister_link()
+	 * itself. Jean II */
+
+	/* Kill the LAP and all LSAPs on top of it */
+	irlmp_unregister_link(self->saddr);
+	self->notify.instance = NULL;
+
+	/* Only release what was really removed from the hash */
+	found = hashbin_remove(irlap, self->saddr, NULL);
+	if (found != NULL) {
+		__irlap_close(found);
+		return;
+	}
+	IRDA_DEBUG(1, "%s(), Didn't find myself!\n", __func__);
+}
+EXPORT_SYMBOL(irlap_close);
+
+/*
+ * Function irlap_connect_indication (self, skb)
+ *
+ *    Another device is attempting to make a connection: set up our QoS
+ *    capabilities and pass the indication on to IrLMP.
+ */
+void irlap_connect_indication(struct irlap_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* No user QoS for an incoming connection! */
+	irlap_init_qos_capabilities(self, NULL);
+	irlmp_link_connect_indication(self->notify.instance, self->saddr,
+				      self->daddr, &self->qos_tx, skb);
+}
+
+/*
+ * Function irlap_connect_response (self, userdata)
+ *
+ *    Service user has accepted incoming connection; raises the
+ *    CONNECT_RESPONSE event with userdata as its skb.
+ */
+void irlap_connect_response(struct irlap_cb *self, struct sk_buff *userdata)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	irlap_do_event(self, CONNECT_RESPONSE, userdata, NULL);
+}
+
+/*
+ * Function irlap_connect_request (self, daddr, qos_user, sniff)
+ *
+ *    Request connection with the device at daddr. The request is issued
+ *    at once only in NDM state with the media not busy, otherwise it is
+ *    left pending. Sniffing is not implemented yet, sniff is ignored.
+ */
+void irlap_connect_request(struct irlap_cb *self, __u32 daddr,
+			   struct qos_info *qos_user, int sniff)
+{
+	IRDA_DEBUG(3, "%s(), daddr=0x%08x\n", __func__, daddr);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	self->daddr = daddr;
+
+	/* QoS values specified by the service user, if any, are used */
+	irlap_init_qos_capabilities(self, qos_user);
+
+	if ((self->state != LAP_NDM) || self->media_busy) {
+		/* Not possible right now, just note the request */
+		self->connect_pending = TRUE;
+		return;
+	}
+
+	irlap_do_event(self, CONNECT_REQUEST, NULL, NULL);
+}
+
+/*
+ * Function irlap_connect_confirm (self, skb)
+ *
+ *    Connection request has been accepted; pass the confirmation, with
+ *    the transmit QoS and skb, on to IrLMP.
+ */
+void irlap_connect_confirm(struct irlap_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlmp_link_connect_confirm(self->notify.instance, &self->qos_tx, skb);
+}
+
+/*
+ * Function irlap_data_indication (self, skb, unreliable)
+ *
+ *    Received data frames from IR-port, so we just pass them up to
+ *    IrLMP for further processing (unreliable is handed on unchanged)
+ *
+ */
+void irlap_data_indication(struct irlap_cb *self, struct sk_buff *skb,
+			   int unreliable)
+{
+	/* Hide LAP header (address and control fields) from IrLMP layer */
+	skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+	irlmp_link_data_indication(self->notify.instance, skb, unreliable);
+}
+
+
+/*
+ * Function irlap_data_request (self, skb, unreliable)
+ *
+ *    Queue data for transmission as an I frame, or as an UI frame when
+ *    unreliable is set. Transmission must wait until an XMIT state.
+ */
+void irlap_data_request(struct irlap_cb *self, struct sk_buff *skb,
+			int unreliable)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER),
+		    return;);
+	skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+	/*
+	 *  Must set frame format now so that the rest of the code knows
+	 *  if it is dealing with an I or an UI frame
+	 */
+	if (unreliable)
+		skb->data[1] = UI_FRAME;
+	else
+		skb->data[1] = I_FRAME;
+
+	/* Don't forget to refcount it - see irlmp_connect_request(). */
+	skb_get(skb);
+
+	/* Add at the end of the queue (keep ordering) - Jean II */
+	skb_queue_tail(&self->txq, skb);
+
+	/*
+	 *  Send the event for this frame only if we are in the right state
+	 *  FIXME: udata should be sent first! (skb_queue_head?)
+	 */
+	if ((self->state == LAP_XMIT_P) || (self->state == LAP_XMIT_S)) {
+		/* If we are not already processing the Tx queue, trigger
+		 * transmission immediately - Jean II */
+		if((skb_queue_len(&self->txq) <= 1) && (!self->local_busy))
+			irlap_do_event(self, DATA_REQUEST, skb, NULL);
+		/* Otherwise, the packets will be sent normally at the
+		 * next pf-poll - Jean II */
+	}
+}
+
+/*
+ * Function irlap_unitdata_request (self, skb)
+ *
+ *    Send Ultra data. This is data that must be sent outside any connection;
+ *    it is queued on txq_ultra as a broadcast UI frame.
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlap_unitdata_request(struct irlap_cb *self, struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER),
+	       return;);
+	skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+	skb->data[0] = CBROADCAST;
+	skb->data[1] = UI_FRAME;
+
+	/* Don't need to refcount, see irlmp_connless_data_request() */
+
+	skb_queue_tail(&self->txq_ultra, skb);
+
+	irlap_do_event(self, SEND_UI_FRAME, NULL, NULL);
+}
+#endif /*CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlap_unitdata_indication (self, skb)
+ *
+ *    Receive Ultra data. This is data that is received outside any connection;
+ *    the LAP header is removed and the frame is passed to IrLMP.
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlap_unitdata_indication(struct irlap_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* Hide LAP header (address and control fields) from IrLMP layer */
+	skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER);
+
+	irlmp_link_unitdata_indication(self->notify.instance, skb);
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlap_disconnect_request (self)
+ *
+ *    Service user asks to disconnect; left pending if not possible now
+ */
+void irlap_disconnect_request(struct irlap_cb *self)
+{
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Don't disconnect until all data frames are successfully sent */
+	if (!skb_queue_empty(&self->txq)) {
+		self->disconnect_pending = TRUE;
+		return;
+	}
+
+	/* These are the right states for disconnecting */
+	switch (self->state) {
+	case LAP_XMIT_P:        /* FALLTHROUGH */
+	case LAP_XMIT_S:        /* FALLTHROUGH */
+	case LAP_CONN:          /* FALLTHROUGH */
+	case LAP_RESET_WAIT:    /* FALLTHROUGH */
+	case LAP_RESET_CHECK:
+		irlap_do_event(self, DISCONNECT_REQUEST, NULL, NULL);
+		return;
+	default:
+		break;
+	}
+	IRDA_DEBUG(2, "%s(), disconnect pending!\n", __func__);
+	self->disconnect_pending = TRUE;
+}
+
+/*
+ * Function irlap_disconnect_indication (self, reason)
+ *
+ *    The link is disconnected: flush all queues, then either request
+ *    a reset or tell IrLMP the reason for the disconnect.
+ */
+void irlap_disconnect_indication(struct irlap_cb *self, LAP_REASON reason)
+{
+	IRDA_DEBUG(1, "%s(), reason=%s\n", __func__, lap_reasons[reason]);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_flush_all_queues(self);
+
+	if (reason == LAP_RESET_INDICATION) {
+		IRDA_DEBUG(1, "%s(), Sending reset request!\n", __func__);
+		irlap_do_event(self, RESET_REQUEST, NULL, NULL);
+		return;
+	}
+	switch (reason) {
+	case LAP_NO_RESPONSE:	   /* FALLTHROUGH */
+	case LAP_DISC_INDICATION:  /* FALLTHROUGH */
+	case LAP_FOUND_NONE:       /* FALLTHROUGH */
+	case LAP_MEDIA_BUSY:
+		irlmp_link_disconnect_indication(self->notify.instance, self,
+						 reason, NULL);
+		break;
+	default:
+		IRDA_ERROR("%s: Unknown reason %d\n", __func__, reason);
+	}
+}
+
+/*
+ * Function irlap_discovery_request (self, discovery)
+ *
+ *    Start one single discovery operation. Outside of NDM state no
+ *    discovery is started and a NULL log is confirmed right away.
+ */
+void irlap_discovery_request(struct irlap_cb *self, discovery_t *discovery)
+{
+	struct irlap_info info;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(discovery != NULL, return;);
+
+	IRDA_DEBUG(4, "%s(), nslots = %d\n", __func__, discovery->nslots);
+
+	IRDA_ASSERT((discovery->nslots == 1) || (discovery->nslots == 6) ||
+		    (discovery->nslots == 8) || (discovery->nslots == 16),
+		    return;);
+
+	if (self->state != LAP_NDM) {
+		/* Discovery is only possible in NDM mode */
+		IRDA_DEBUG(4, "%s(), discovery only possible in NDM mode\n",
+			   __func__);
+		irlap_discovery_confirm(self, NULL);
+		/* Note : in theory, if we are not in NDM, we could postpone
+		 * the discovery like we do for connection request.
+		 * In practice, it's not worth it. If the media was busy,
+		 * it's likely next time around it won't be busy. If we are
+		 * in REPLY state, we will get passive discovery info & event.
+		 * Jean II */
+		return;
+	}
+
+	/* A log still present means that the last discovery request did
+	 * not finish in time, or was aborted due to the media busy flag. */
+	if (self->discovery_log) {
+		hashbin_delete(self->discovery_log, (FREE_FUNC) kfree);
+		self->discovery_log = NULL;
+	}
+
+	/* All operations will occur at predictable time, no need to lock */
+	self->discovery_log = hashbin_new(HB_NOLOCK);
+	if (!self->discovery_log) {
+		IRDA_WARNING("%s(), Unable to allocate discovery log!\n",
+			     __func__);
+		return;
+	}
+
+	/* S is the number of slots, s the current slot */
+	info.discovery = discovery;
+	info.S = discovery->nslots;
+	info.s = 0;
+
+	self->discovery_cmd = discovery;
+
+	/* sysctl_slot_timeout bounds are checked in irsysctl.c - Jean II */
+	self->slot_timeout = sysctl_slot_timeout * HZ / 1000;
+
+	irlap_do_event(self, DISCOVERY_REQUEST, NULL, &info);
+}
+
+/*
+ * Function irlap_discovery_confirm (self, discovery_log)
+ *
+ *    Discovery is over; report the log (NULL if there is none)
+ *    directly to LMP.
+ */
+void irlap_discovery_confirm(struct irlap_cb *self, hashbin_t *discovery_log)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	IRDA_ASSERT(self->notify.instance != NULL, return;);
+
+	/*
+	 * Check for successful discovery, since we are then allowed to clear
+	 * the media busy condition (IrLAP 6.13.4 - p.94). This should allow
+	 * us to make connection attempts much faster and easier (i.e. no
+	 * collisions).
+	 * Setting media busy to false will also generate an event allowing
+	 * to process pending events in NDM state machine.
+	 * Note : the spec doesn't define what a successful discovery is.
+	 * If we want Ultra to work, it's successful even if there is
+	 * nobody discovered - Jean II
+	 */
+	if (discovery_log)
+		irda_device_set_media_busy(self->netdev, FALSE);
+
+	/* Inform IrLMP */
+	irlmp_link_discovery_confirm(self->notify.instance, discovery_log);
+}
+
+/*
+ * Function irlap_discovery_indication (self, discovery)
+ *
+ *    Somebody is trying to discover us! Back off, then tell IrLMP.
+ *
+ */
+void irlap_discovery_indication(struct irlap_cb *self, discovery_t *discovery)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(discovery != NULL, return;);
+
+	IRDA_ASSERT(self->notify.instance != NULL, return;);
+
+	/* A device is very likely to connect immediately after it performs
+	 * a successful discovery. This means that in our case, we are much
+	 * more likely to receive a connection request over the medium.
+	 * So, we back off to avoid collisions.
+	 * IrLAP spec 6.13.4 suggests 100ms...
+	 * Note : this little trick actually makes a *BIG* difference. If I set
+	 * my Linux box with discovery enabled and one Ultra frame sent every
+	 * second, my Palm has no trouble connecting to it every time !
+	 * Jean II */
+	irda_device_set_media_busy(self->netdev, SMALL);
+
+	irlmp_link_discovery_indication(self->notify.instance, discovery);
+}
+
+/*
+ * Function irlap_status_indication (self, quality_of_link)
+ *
+ *    Log an inactive or noisy link, then pass the status on to IrLMP.
+ */
+void irlap_status_indication(struct irlap_cb *self, int quality_of_link)
+{
+	/* Only these two conditions get a message */
+	if (quality_of_link == STATUS_NO_ACTIVITY) {
+		IRDA_MESSAGE("IrLAP, no activity on link!\n");
+	} else if (quality_of_link == STATUS_NOISY) {
+		IRDA_MESSAGE("IrLAP, noisy link!\n");
+	}
+
+	/* The status is passed on whatever its value */
+	irlmp_status_indication(self->notify.instance,
+				quality_of_link, LOCK_NO_CHANGE);
+}
+
+/*
+ * Function irlap_reset_indication (self)
+ *
+ *    Raise RESET_REQUEST in LAP_RESET_WAIT, RESET_RESPONSE otherwise.
+ */
+void irlap_reset_indication(struct irlap_cb *self)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, (self->state == LAP_RESET_WAIT) ?
+		       RESET_REQUEST : RESET_RESPONSE, NULL, NULL);
+}
+
+/*
+ * Function irlap_reset_confirm (void) - only emits a debug trace
+ */
+void irlap_reset_confirm(void)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__);
+}
+
+/*
+ * Function irlap_generate_rand_time_slot (S, s)
+ *
+ *    Generate a random time slot between s and S-1 where
+ *    S = Number of slots (0 -> S-1)
+ *    s = Current slot
+ */
+int irlap_generate_rand_time_slot(int S, int s)
+{
+	/* Unsigned state: the remainder below must never be negative */
+	static unsigned int rand;
+	int slot;
+
+	IRDA_ASSERT((S - s) > 0, return 0;);
+
+	rand += jiffies;
+	rand ^= (rand << 12);
+	rand ^= (rand >> 20);
+
+	slot = s + (int)(rand % (unsigned int)(S - s));
+	IRDA_ASSERT((slot >= s) && (slot < S), return 0;);
+
+	return slot;
+}
+
+/*
+ * Function irlap_update_nr_received (self, nr)
+ *
+ *    Remove all acknowledged frames in current window queue. This code is
+ *    not intuitive and you should not try to change it. If you think it
+ *    contains bugs, please mail a patch to the author instead.
+ */
+void irlap_update_nr_received(struct irlap_cb *self, int nr)
+{
+	struct sk_buff *skb;
+
+	/*
+	 * Remove all the ack-ed frames from the window queue.
+	 */
+
+	/*
+	 *  Optimize for the common case. It is most likely that the receiver
+	 *  will acknowledge all the frames we have sent! So in that case we
+	 *  delete all frames stored in window.
+	 */
+	if (nr == self->vs) {
+		while ((skb = skb_dequeue(&self->wx_list)) != NULL) {
+			dev_kfree_skb(skb);
+		}
+		/* The last acked frame is the next to send minus one.
+		 * NOTE(review): not reduced modulo 8 when nr is 0, while
+		 * the other branch keeps va in 0..7 - confirm intended. */
+		self->va = nr - 1;
+	} else {
+		/* Remove all acknowledged frames in current window */
+		while ((skb_peek(&self->wx_list) != NULL) &&
+		       (((self->va+1) % 8) != nr))
+		{
+			skb = skb_dequeue(&self->wx_list);
+			dev_kfree_skb(skb);
+
+			self->va = (self->va + 1) % 8;
+		}
+	}
+
+	/* Advance window */
+	self->window = self->window_size - skb_queue_len(&self->wx_list);
+}
+
+/*
+ * Function irlap_validate_ns_received (self, ns)
+ *
+ *    Validate the next to send (ns) field from received frame against
+ *    our next to receive value (vr).
+ *    Anything but the expected value is reported as NS_UNEXPECTED:
+ *    stations are allowed to treat invalid NS as unexpected NS
+ *    (IrLAP, Recv ... with-invalid-Ns. p. 84).
+ */
+int irlap_validate_ns_received(struct irlap_cb *self, int ns)
+{
+	int verdict = NS_UNEXPECTED;
+
+	if (ns == self->vr)
+		verdict = NS_EXPECTED;
+
+	return verdict;
+}
+/*
+ * Function irlap_validate_nr_received (self, nr)
+ *
+ *    Validate the next to receive (nr) field from received frame.
+ *    Returns NR_EXPECTED when nr equals vs, NR_UNEXPECTED when it lies
+ *    within the current window (va to vs) and NR_INVALID otherwise.
+ *
+ */
+int irlap_validate_nr_received(struct irlap_cb *self, int nr)
+{
+	int in_window;
+
+	/*  nr as expected?  */
+	if (nr == self->vs) {
+		IRDA_DEBUG(4, "%s(), expected!\n", __func__);
+		return NR_EXPECTED;
+	}
+
+	/*
+	 *  The window runs from va to vs. When the ns numbers of the
+	 *  frames in it wrap, it is made of both ends of the range.
+	 */
+	if (self->va < self->vs)
+		in_window = (nr >= self->va) && (nr <= self->vs);
+	else
+		in_window = (nr >= self->va) || (nr <= self->vs);
+
+	return in_window ? NR_UNEXPECTED : NR_INVALID;
+}
+
+/*
+ * Function irlap_initiate_connection_state (self)
+ *
+ *    Initialize the connection state parameters: sequence numbers,
+ *    window, remote busy flag and retry count.
+ */
+void irlap_initiate_connection_state(struct irlap_cb *self)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Next to receive and next to send */
+	self->vr = 0;
+	self->vs = 0;
+	/* Last frame which got acked (0 - 1) % 8 */
+	self->va = 7;
+
+	self->window = 1;
+
+	self->retry_count = 0;
+	self->remote_busy = FALSE;
+}
+
+/*
+ * Function irlap_wait_min_turn_around (self, qos)
+ *
+ *    Wait negotiated minimum turn around time, this function actually sets
+ *    the number of BOS's that must be sent before the next transmitted
+ *    frame in order to delay for the specified amount of time. This is
+ *    done to avoid using timers, and the forbidden udelay!
+ *    For speeds over 115200 bps the min turn time itself is stored in
+ *    mtt_required instead.
+ */
+void irlap_wait_min_turn_around(struct irlap_cb *self, struct qos_info *qos)
+{
+	/* Get QoS values.  */
+	__u32 baud = qos->baud_rate.value;
+	__u32 mtt = qos->min_turn_time.value;
+
+	if (baud <= 115200) {
+		/*
+		 *  Send additional BOF's for the next frame: calculate how
+		 *  many chars (XBOF's) we must send for the requested time
+		 *  period (min turn time)
+		 */
+		self->xbofs_delay = irlap_min_turn_time_in_bytes(baud, mtt);
+		return;
+	}
+
+	/* No need to calculate XBOFs for speeds over 115200 bps,
+	 * the time is stored as it is */
+	self->mtt_required = mtt;
+}
+
+/*
+ * Function irlap_flush_all_queues (self)
+ *
+ *    Free every frame still held in the two transmit queues and in
+ *    the sliding window list.
+ */
+void irlap_flush_all_queues(struct irlap_cb *self)
+{
+	struct sk_buff* skb;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Transmission queues first, sliding window buffer last */
+	for (skb = skb_dequeue(&self->txq); skb != NULL;
+	     skb = skb_dequeue(&self->txq))
+		dev_kfree_skb(skb);
+	for (skb = skb_dequeue(&self->txq_ultra); skb != NULL;
+	     skb = skb_dequeue(&self->txq_ultra))
+		dev_kfree_skb(skb);
+	for (skb = skb_dequeue(&self->wx_list); skb != NULL;
+	     skb = skb_dequeue(&self->wx_list))
+		dev_kfree_skb(skb);
+}
+
+/*
+ * Function irlap_change_speed (self, speed, now)
+ *
+ *    Change the speed of the IrDA port. With now set, an empty frame
+ *    is sent down as well to trigger the speed change.
+ */
+static void irlap_change_speed(struct irlap_cb *self, __u32 speed, int now)
+{
+	struct sk_buff *trigger;
+
+	IRDA_DEBUG(0, "%s(), setting speed to %d\n", __func__, speed);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	self->speed = speed;
+
+	/* Unless asked to change now, just piggyback speed on frames */
+	if (!now)
+		return;
+	/* Send down empty frame to trigger speed change */
+	trigger = alloc_skb(0, GFP_ATOMIC);
+	if (trigger != NULL)
+		irlap_queue_xmit(self, trigger);
+}
+
+/*
+ * Function irlap_init_qos_capabilities (self, qos_user)
+ *
+ *    Initialize QoS for this IrLAP session. What we do is to compute the
+ *    intersection of the QoS capabilities of the user, the driver and
+ *    IrLAP itself. Normally, IrLAP will not specify any values, but it can
+ *    be used to restrict certain values.
+ */
+static void irlap_init_qos_capabilities(struct irlap_cb *self,
+					struct qos_info *qos_user)
+{
+	struct qos_info *rx;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(self->netdev != NULL, return;);
+
+	rx = &self->qos_rx;
+
+	/* Start out with the maximum QoS support possible and apply the
+	 * drivers QoS capabilities to it */
+	irda_init_max_qos_capabilies(rx);
+	irda_qos_compute_intersection(rx, self->qos_dev);
+
+	/*
+	 *  The service user is only allowed to supply these four values;
+	 *  each one is checked since the user may not have set all of them.
+	 */
+	if (qos_user != NULL) {
+		IRDA_DEBUG(1, "%s(), Found user specified QoS!\n", __func__);
+
+		if (qos_user->baud_rate.bits)
+			rx->baud_rate.bits &= qos_user->baud_rate.bits;
+		if (qos_user->max_turn_time.bits)
+			rx->max_turn_time.bits &= qos_user->max_turn_time.bits;
+		if (qos_user->data_size.bits)
+			rx->data_size.bits &= qos_user->data_size.bits;
+		if (qos_user->link_disc_time.bits)
+			rx->link_disc_time.bits &= qos_user->link_disc_time.bits;
+	}
+
+	/* Use 500ms in IrLAP for now */
+	rx->max_turn_time.bits &= 0x01;
+
+	/* Set data size */
+	/*rx->data_size.bits &= 0x03;*/
+
+	irda_qos_bits_to_value(rx);
+}
+
+/*
+ * Function irlap_apply_default_connection_parameters (self)
+ *
+ *    Use the default connection and transmission parameters
+ */
+void irlap_apply_default_connection_parameters(struct irlap_cb *self)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* xbofs : Default value in NDM */
+	self->next_bofs   = 12;
+	self->bofs_count  = 12;
+
+	/* NDM Speed is 9600 */
+	irlap_change_speed(self, 9600, TRUE);
+
+	/* Set mbusy when going to NDM state */
+	irda_device_set_media_busy(self->netdev, TRUE);
+
+	/*
+	 * Pick a random connection address (bit 0 masked off) for as long
+	 * as caddr is 0x00 or 0xfe; an already valid caddr is left alone
+	 */
+	while ((self->caddr == 0x00) || (self->caddr == 0xfe)) {
+		get_random_bytes(&self->caddr, sizeof(self->caddr));
+		self->caddr &= 0xfe;
+	}
+
+	/* Use default values until connection has been negotiated */
+	self->slot_timeout = sysctl_slot_timeout;
+	self->final_timeout = FINAL_TIMEOUT;
+	self->poll_timeout = POLL_TIMEOUT;
+	self->wd_timeout = WD_TIMEOUT;
+
+	/* Set some default values, identical for the tx and rx direction */
+	self->qos_tx.baud_rate.value = 9600;
+	self->qos_rx.baud_rate.value = 9600;
+	self->qos_tx.max_turn_time.value = 0;
+	self->qos_rx.max_turn_time.value = 0;
+	self->qos_tx.min_turn_time.value = 0;
+	self->qos_rx.min_turn_time.value = 0;
+	self->qos_tx.data_size.value = 64;
+	self->qos_rx.data_size.value = 64;
+	self->qos_tx.window_size.value = 1;
+	self->qos_rx.window_size.value = 1;
+	self->qos_tx.additional_bofs.value = 12;
+	self->qos_rx.additional_bofs.value = 12;
+	self->qos_tx.link_disc_time.value = 0;
+	self->qos_rx.link_disc_time.value = 0;
+
+	irlap_flush_all_queues(self);
+
+	self->disconnect_pending = FALSE;
+	self->connect_pending = FALSE;
+}
+
+/*
+ * Function irlap_apply_connection_parameters (self, now)
+ *
+ *    Initialize IrLAP with the negotiated QoS values
+ *
+ * If 'now' is false, the speed and xbofs will be changed after the next
+ * frame is sent.
+ * If 'now' is true, the speed and xbofs is changed immediately
+ */
+void irlap_apply_connection_parameters(struct irlap_cb *self, int now)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Set the negotiated xbofs value */
+	self->next_bofs   = self->qos_tx.additional_bofs.value;
+	if (now)
+		self->bofs_count = self->next_bofs;
+
+	/* Set the negotiated link speed (may need the new xbofs value) */
+	irlap_change_speed(self, self->qos_tx.baud_rate.value, now);
+
+	self->window_size = self->qos_tx.window_size.value;
+	self->window      = self->qos_tx.window_size.value;
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+	/*
+	 *  Calculate how many bytes it is possible to transmit before the
+	 *  link must be turned around
+	 */
+	self->line_capacity =
+		irlap_max_line_capacity(self->qos_tx.baud_rate.value,
+					self->qos_tx.max_turn_time.value);
+	self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+
+	/*
+	 *  Initialize timeout values, some of the rules are listed on
+	 *  page 92 in IrLAP. qos_rx max turn time is also the N1/N2 divisor.
+	 */
+	IRDA_ASSERT(self->qos_tx.max_turn_time.value != 0, return;);
+	IRDA_ASSERT(self->qos_rx.max_turn_time.value != 0, return;);
+	/* The poll timeout applies only to the primary station.
+	 * It defines the maximum time the primary stay in XMIT mode
+	 * before timeout and turning the link around (sending a RR).
+	 * Or, this is how much we can keep the pf bit in primary mode.
+	 * Therefore, it must be lower or equal than our *OWN* max turn around.
+	 * Jean II */
+	self->poll_timeout = self->qos_tx.max_turn_time.value * HZ / 1000;
+	/* The Final timeout applies only to the primary station.
+	 * It defines the maximum time the primary wait (mostly in RECV mode)
+	 * for an answer from the secondary station before polling it again.
+	 * Therefore, it must be greater or equal than our *PARTNER*
+	 * max turn around time - Jean II */
+	self->final_timeout = self->qos_rx.max_turn_time.value * HZ / 1000;
+	/* The Watchdog Bit timeout applies only to the secondary station.
+	 * It defines the maximum time the secondary wait (mostly in RECV mode)
+	 * for poll from the primary station before getting annoyed.
+	 * Therefore, it must be greater or equal than our *PARTNER*
+	 * max turn around time - Jean II */
+	self->wd_timeout = self->final_timeout * 2;
+
+	/*
+	 * N1 and N2 are maximum retry count for *both* the final timer
+	 * and the wd timer (with a factor 2) as defined above.
+	 * After N1 retry of a timer, we give a warning to the user.
+	 * After N2 retry, we consider the link dead and disconnect it.
+	 * Jean II
+	 */
+
+	/*
+	 *  Set N1 to 0 if Link Disconnect/Threshold Time = 3 and set it to
+	 *  3 seconds otherwise. See page 71 in IrLAP for more details.
+	 *  Actually, it's not always 3 seconds, as we allow to set
+	 *  it via sysctl... Max maxtt is 500ms, and N1 need to be multiple
+	 *  of 2, so 1 second is minimum we can allow. - Jean II
+	 */
+	if (self->qos_tx.link_disc_time.value == sysctl_warn_noreply_time)
+		/*
+		 * If we set N1 to 0, it will trigger immediately, which is
+		 * not what we want. What we really want is to disable it,
+		 * Jean II
+		 */
+		self->N1 = -2; /* Disable - Need to be multiple of 2*/
+	else
+		self->N1 = sysctl_warn_noreply_time * 1000 /
+		  self->qos_rx.max_turn_time.value;
+
+	IRDA_DEBUG(4, "Setting N1 = %d\n", self->N1);
+
+	/* Set N2 to match our own disconnect time */
+	self->N2 = self->qos_tx.link_disc_time.value * 1000 /
+		self->qos_rx.max_turn_time.value;
+	IRDA_DEBUG(4, "Setting N2 = %d\n", self->N2);
+}
+
+#ifdef CONFIG_PROC_FS
+struct irlap_iter_state {
+	int id;	/* index of the current entry, printed as "irlap%d" */
+};
+
+static void *irlap_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct irlap_iter_state *st = seq->private;
+	struct irlap_cb *lap;
+
+	/* Hold the irlap hashbin lock until irlap_seq_stop() drops it */
+	spin_lock_irq(&irlap->hb_spinlock);
+
+	/* Step through the hashbin up to entry number *pos (or the end) */
+	st->id = 0;
+	lap = (struct irlap_cb *) hashbin_get_first(irlap);
+	while (lap != NULL && st->id != *pos) {
+		++st->id;
+		lap = (struct irlap_cb *) hashbin_get_next(irlap);
+	}
+
+	return lap;
+}
+
+static void *irlap_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct irlap_iter_state *st = seq->private;
+
+	st->id++;
+	(*pos)++;
+	return hashbin_get_next(irlap);
+}
+
+static void irlap_seq_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_irq(&irlap->hb_spinlock);	/* taken in irlap_seq_start() */
+}
+
+static int irlap_seq_show(struct seq_file *seq, void *v)
+{
+	const struct irlap_iter_state *iter = seq->private;
+	const struct irlap_cb *self = v;
+
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -EINVAL;);
+
+	seq_printf(seq, "irlap%d ", iter->id);
+	seq_printf(seq, "state: %s\n",
+		   irlap_state[self->state]);
+
+	seq_printf(seq, "  device name: %s, ",
+		   (self->netdev) ? self->netdev->name : "bug");
+	seq_printf(seq, "hardware name: %s\n", self->hw_name);
+
+	seq_printf(seq, "  caddr: %#02x, ", self->caddr);
+	seq_printf(seq, "saddr: %#08x, ", self->saddr);
+	seq_printf(seq, "daddr: %#08x\n", self->daddr);
+
+	seq_printf(seq, "  win size: %d, ",
+		   self->window_size);
+	seq_printf(seq, "win: %d, ", self->window);
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+	seq_printf(seq, "line capacity: %d, ",
+		   self->line_capacity);
+	seq_printf(seq, "bytes left: %d\n", self->bytes_left);
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+	seq_printf(seq, "  tx queue len: %d ",
+		   skb_queue_len(&self->txq));
+	seq_printf(seq, "win queue len: %d ",
+		   skb_queue_len(&self->wx_list));
+	seq_printf(seq, "rbusy: %s", self->remote_busy ?
+		   "TRUE" : "FALSE");
+	seq_printf(seq, " mbusy: %s\n", self->media_busy ?
+		   "TRUE" : "FALSE");
+
+	seq_printf(seq, "  retrans: %d ", self->retry_count);
+	seq_printf(seq, "vs: %d ", self->vs);
+	seq_printf(seq, "vr: %d ", self->vr);
+	seq_printf(seq, "va: %d\n", self->va);
+
+	seq_printf(seq, "  qos\tbps\tmaxtt\tdsize\twinsize\taddbofs\tmintt\tldisc\tcomp\n");
+	/* NOTE: the header names a "comp" column, but no value is printed for it */
+	seq_printf(seq, "  tx\t%d\t",
+		   self->qos_tx.baud_rate.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_tx.max_turn_time.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_tx.data_size.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_tx.window_size.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_tx.additional_bofs.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_tx.min_turn_time.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_tx.link_disc_time.value);
+	seq_printf(seq, "\n");
+	/* Same seven columns for the rx direction (no trailing tab here) */
+	seq_printf(seq, "  rx\t%d\t",
+		   self->qos_rx.baud_rate.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_rx.max_turn_time.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_rx.data_size.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_rx.window_size.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_rx.additional_bofs.value);
+	seq_printf(seq, "%d\t",
+		   self->qos_rx.min_turn_time.value);
+	seq_printf(seq, "%d\n",
+		   self->qos_rx.link_disc_time.value);
+
+	return 0;
+}
+
+static const struct seq_operations irlap_seq_ops = {
+	.start  = irlap_seq_start,	/* takes irlap->hb_spinlock */
+	.next   = irlap_seq_next,
+	.stop   = irlap_seq_stop,	/* releases irlap->hb_spinlock */
+	.show   = irlap_seq_show,
+};
+
+static int irlap_seq_open(struct inode *inode, struct file *file)
+{
+	/* Without the irlap hashbin there is nothing to iterate over */
+	if (!irlap)
+		return -EINVAL;
+	return seq_open_private(file, &irlap_seq_ops,
+				sizeof(struct irlap_iter_state));
+}
+
+const struct file_operations irlap_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = irlap_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,	/* pairs with seq_open_private() */
+};
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
new file mode 100644
index 00000000..ccd214f9
--- /dev/null
+++ b/net/irda/irlap_event.c
@@ -0,0 +1,2330 @@
+/*********************************************************************
+ *
+ * Filename:      irlap_event.c
+ * Version:       0.9
+ * Description:   IrLAP state machine implementation
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dag@brattli.net>
+ * Created at:    Sat Aug 16 00:59:29 1997
+ * Modified at:   Sat Dec 25 21:07:57 1999
+ * Modified by:   Dag Brattli <dag@brattli.net>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli <dag@brattli.net>,
+ *     Copyright (c) 1998      Thomas Davis <ratbert@radiks.net>
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap_event.h>
+
+#include <net/irda/timer.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/qos.h>
+#include <net/irda/parameters.h>
+#include <net/irda/irlmp.h>		/* irlmp_flow_indication(), ... */
+
+#include <net/irda/irda_device.h>
+
+#ifdef CONFIG_IRDA_FAST_RR
+int sysctl_fast_poll_increase = 50;	/* ms added to the fast RR timeout per step */
+#endif
+
+static int irlap_state_ndm    (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_query  (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reply  (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_conn   (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_setup  (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_xmit_p (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_pclose (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_nrm_p  (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event,
+				  struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reset  (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_nrm_s  (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_xmit_s (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_sclose (struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info);
+static int irlap_state_reset_check(struct irlap_cb *, IRLAP_EVENT event,
+				   struct sk_buff *, struct irlap_info *);
+
+#ifdef CONFIG_IRDA_DEBUG
+static const char *const irlap_event[] = {	/* indexed by IRLAP_EVENT; order presumably mirrors the enum - verify */
+	"DISCOVERY_REQUEST",
+	"CONNECT_REQUEST",
+	"CONNECT_RESPONSE",
+	"DISCONNECT_REQUEST",
+	"DATA_REQUEST",
+	"RESET_REQUEST",
+	"RESET_RESPONSE",
+	"SEND_I_CMD",
+	"SEND_UI_FRAME",
+	"RECV_DISCOVERY_XID_CMD",
+	"RECV_DISCOVERY_XID_RSP",
+	"RECV_SNRM_CMD",
+	"RECV_TEST_CMD",
+	"RECV_TEST_RSP",
+	"RECV_UA_RSP",
+	"RECV_DM_RSP",
+	"RECV_RD_RSP",
+	"RECV_I_CMD",
+	"RECV_I_RSP",
+	"RECV_UI_FRAME",
+	"RECV_FRMR_RSP",
+	"RECV_RR_CMD",
+	"RECV_RR_RSP",
+	"RECV_RNR_CMD",
+	"RECV_RNR_RSP",
+	"RECV_REJ_CMD",
+	"RECV_REJ_RSP",
+	"RECV_SREJ_CMD",
+	"RECV_SREJ_RSP",
+	"RECV_DISC_CMD",
+	"SLOT_TIMER_EXPIRED",
+	"QUERY_TIMER_EXPIRED",
+	"FINAL_TIMER_EXPIRED",
+	"POLL_TIMER_EXPIRED",
+	"DISCOVERY_TIMER_EXPIRED",
+	"WD_TIMER_EXPIRED",
+	"BACKOFF_TIMER_EXPIRED",
+	"MEDIA_BUSY_TIMER_EXPIRED",
+};
+#endif	/* CONFIG_IRDA_DEBUG */
+
+const char *const irlap_state[] = {	/* state names, indexed by self->state; same order as state[] below */
+	"LAP_NDM",
+	"LAP_QUERY",
+	"LAP_REPLY",
+	"LAP_CONN",
+	"LAP_SETUP",
+	"LAP_OFFLINE",
+	"LAP_XMIT_P",
+	"LAP_PCLOSE",
+	"LAP_NRM_P",
+	"LAP_RESET_WAIT",
+	"LAP_RESET",
+	"LAP_NRM_S",
+	"LAP_XMIT_S",
+	"LAP_SCLOSE",
+	"LAP_RESET_CHECK",
+};
+
+static int (*state[])(struct irlap_cb *self, IRLAP_EVENT event,
+		      struct sk_buff *skb, struct irlap_info *info) =
+{	/* one handler per LAP state, called as state[self->state] */
+	irlap_state_ndm,
+	irlap_state_query,
+	irlap_state_reply,
+	irlap_state_conn,
+	irlap_state_setup,
+	irlap_state_offline,
+	irlap_state_xmit_p,
+	irlap_state_pclose,
+	irlap_state_nrm_p,
+	irlap_state_reset_wait,
+	irlap_state_reset,
+	irlap_state_nrm_s,
+	irlap_state_xmit_s,
+	irlap_state_sclose,
+	irlap_state_reset_check,
+};
+
+/*
+ * Function irlap_poll_timer_expired (data)
+ *
+ *    Poll timer callback: feed POLL_TIMER_EXPIRED to the state machine of
+ *    the instance in 'data'. Normally an RR frame is then sent to the peer
+ */
+static void irlap_poll_timer_expired(void *data)
+{
+	struct irlap_cb *self = data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Calculate and set time before we will have to send back the pf bit
+ * to the peer. Use in primary. A timeout of 0 fires the event at once.
+ * Make sure that state is XMIT_P/XMIT_S when calling this function
+ * (and that nobody messed up with the state). - Jean II
+ */
+static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+#ifdef CONFIG_IRDA_FAST_RR
+	/*
+	 * Send out the RR frames faster if our own transmit queue is empty, or
+	 * if the peer is busy. The effect is a much faster conversation
+	 */
+	if (skb_queue_empty(&self->txq) || self->remote_busy) {
+		if (self->fast_RR == TRUE) {
+			/*
+			 *  Only while the fast poll timeout has not reached
+			 *  the normal poll timeout yet
+			 */
+			if (self->fast_RR_timeout < timeout) {
+				/*
+				 *  FIXME: this should be a more configurable
+				 *         function
+				 */
+				self->fast_RR_timeout +=
+					(sysctl_fast_poll_increase * HZ/1000);
+
+				/* Use this fast(er) timeout instead */
+				timeout = self->fast_RR_timeout;
+			}
+		} else {
+			self->fast_RR = TRUE;
+
+			/* Start with just 0 ms */
+			self->fast_RR_timeout = 0;
+			timeout = 0;
+		}
+	} else
+		self->fast_RR = FALSE;
+
+	IRDA_DEBUG(3, "%s(), timeout=%d (%ld)\n", __func__, timeout, jiffies);
+#endif /* CONFIG_IRDA_FAST_RR */
+
+	if (timeout == 0)
+		irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL);
+	else
+		irda_start_timer(&self->poll_timer, timeout, self,
+				 irlap_poll_timer_expired);
+}
+
+/*
+ * Function irlap_do_event (self, event, skb, info)
+ *
+ *    Rushes through the state machine without any delay. If state == XMIT
+ *    then send queued data frames.
+ */
+void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
+		    struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret;
+
+	if (!self || self->magic != LAP_MAGIC)
+		return;
+
+	IRDA_DEBUG(3, "%s(), event = %s, state = %s\n", __func__,
+		   irlap_event[event], irlap_state[self->state]);
+
+	ret = (*state[self->state])(self, event, skb, info);
+
+	/*
+	 *  Check if there are any pending events that needs to be executed
+	 */
+	switch (self->state) {
+	case LAP_XMIT_P: /* FALLTHROUGH */
+	case LAP_XMIT_S:
+		/*
+		 * We just received the pf bit and are at the beginning
+		 * of a new LAP transmit window.
+		 * Check if there are any queued data frames, and do not
+		 * try to disconnect link if we send any data frames, since
+		 * that will change the state away from XMIT
+		 */
+		IRDA_DEBUG(2, "%s() : queue len = %d\n", __func__,
+			   skb_queue_len(&self->txq));
+
+		if (!skb_queue_empty(&self->txq)) {
+			/* Prevent race conditions with irlap_data_request() */
+			self->local_busy = TRUE;
+
+			/* Theory of operation.
+			 * We send frames up to when we fill the window or
+			 * reach line capacity. Those frames will queue up
+			 * in the device queue, and the driver will slowly
+			 * send them.
+			 * After each frame that we send, we poll the higher
+			 * layer for more data. It's the right time to do
+			 * that because the link layer need to perform the mtt
+			 * and then send the first frame, so we can afford
+			 * to spend a bit of time in kernel space.
+			 * The explicit flow indication allow to minimise
+			 * buffers (== lower latency), to avoid higher layer
+			 * polling via timers (== less context switches) and
+			 * to implement a crude scheduler - Jean II */
+
+			/* Try to send away all queued data frames */
+			while ((skb = skb_dequeue(&self->txq)) != NULL) {
+				/* Send one frame */
+				ret = (*state[self->state])(self, SEND_I_CMD,
+							    skb, NULL);
+				/* Drop reference count.
+				 * It will be increased as needed in
+				 * irlap_send_data_xxx() */
+				kfree_skb(skb);
+
+				/* Poll the higher layers for one more frame */
+				irlmp_flow_indication(self->notify.instance,
+						      FLOW_START);
+
+				if (ret == -EPROTO)
+					break; /* Try again later! */
+			}
+			/* Finished transmitting */
+			self->local_busy = FALSE;
+		} else if (self->disconnect_pending) {
+			self->disconnect_pending = FALSE;
+
+			ret = (*state[self->state])(self, DISCONNECT_REQUEST,
+						    NULL, NULL);
+		}
+		break;
+/*	case LAP_NDM: */
+/*	case LAP_CONN: */
+/*	case LAP_RESET_WAIT: */
+/*	case LAP_RESET_CHECK: */
+	default:
+		break;
+	}
+}
+
+/*
+ * Function irlap_state_ndm (self, event, skb, info)
+ *
+ *    NDM (Normal Disconnected Mode) state
+ *
+ */
+static int irlap_state_ndm(struct irlap_cb *self, IRLAP_EVENT event,
+			   struct sk_buff *skb, struct irlap_info *info)
+{
+	discovery_t *discovery_rsp;
+	int ret = 0;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case CONNECT_REQUEST:
+		IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+		if (self->media_busy) {
+			/* Note : this will never happen, because we test
+			 * media busy in irlap_connect_request() and
+			 * postpone the event... - Jean II */
+			IRDA_DEBUG(0, "%s(), CONNECT_REQUEST: media busy!\n",
+				   __func__);
+
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+
+			irlap_disconnect_indication(self, LAP_MEDIA_BUSY);
+		} else {
+			irlap_send_snrm_frame(self, &self->qos_rx);
+
+			/* Start Final-bit timer */
+			irlap_start_final_timer(self, self->final_timeout);
+
+			self->retry_count = 0;
+			irlap_next_state(self, LAP_SETUP);
+		}
+		break;
+	case RECV_SNRM_CMD:
+		/* Check if the frame contains an I field */
+		if (info) {
+			self->daddr = info->daddr;
+			self->caddr = info->caddr;
+
+			irlap_next_state(self, LAP_CONN);
+
+			irlap_connect_indication(self, skb);
+		} else {
+			IRDA_DEBUG(0, "%s(), SNRM frame does not "
+				   "contain an I field!\n", __func__);
+		}
+		break;
+	case DISCOVERY_REQUEST:
+		IRDA_ASSERT(info != NULL, return -1;);
+
+		if (self->media_busy) {
+			IRDA_DEBUG(1, "%s(), DISCOVERY_REQUEST: media busy!\n",
+				   __func__);
+			/* irlap->log.condition = MEDIA_BUSY; */
+
+			/* This will make IrLMP try again */
+			irlap_discovery_confirm(self, NULL);
+			/* Note : the discovery log is not cleaned up here,
+			 * it will be done in irlap_discovery_request()
+			 * Jean II */
+			return 0;
+		}
+
+		self->S = info->S;
+		self->s = info->s;
+		irlap_send_discovery_xid_frame(self, info->S, info->s, TRUE,
+					       info->discovery);
+		self->frame_sent = FALSE;
+		self->s++;
+
+		irlap_start_slot_timer(self, self->slot_timeout);
+		irlap_next_state(self, LAP_QUERY);
+		break;
+	case RECV_DISCOVERY_XID_CMD:
+		IRDA_ASSERT(info != NULL, return -1;);
+
+		/* Assert that this is not the final slot */
+		if (info->s <= info->S) {
+			self->slot = irlap_generate_rand_time_slot(info->S,
+								   info->s);
+			if (self->slot == info->s) {
+				discovery_rsp = irlmp_get_discovery_response();
+				discovery_rsp->data.daddr = info->daddr;
+
+				irlap_send_discovery_xid_frame(self, info->S,
+							       self->slot,
+							       FALSE,
+							       discovery_rsp);
+				self->frame_sent = TRUE;
+			} else
+				self->frame_sent = FALSE;
+
+			/*
+			 * Go to reply state until end of discovery to
+			 * inhibit our own transmissions. Set the timer
+			 * to not stay forever there... Jean II
+			 */
+			irlap_start_query_timer(self, info->S, info->s);
+			irlap_next_state(self, LAP_REPLY);
+		} else {
+		/* This is the final slot. How is it possible ?
+		 * This would happen if both discoveries are just slightly
+		 * offset (if they are in sync, all packets are lost).
+		 * Most often, all the discovery requests will be received
+		 * in QUERY state (see my comment there), except for the
+		 * last frame that will come here.
+		 * The big trouble when it happens is that active discovery
+		 * doesn't happen, because nobody answers the discovery
+		 * frames of the other guy, so the log shows up empty.
+		 * What should we do ?
+		 * Not much. It's too late to answer those discovery frames,
+		 * so we just pass the info to IrLMP who will put it in the
+		 * log (and post an event).
+		 * Another cause would be devices that do discovery much
+		 * slower than us, however the latest fixes should minimise
+		 * those cases...
+		 * Jean II
+		 */
+			IRDA_DEBUG(1, "%s(), Receiving final discovery request, missed the discovery slots :-(\n", __func__);
+
+			/* Last discovery request -> in the log */
+			irlap_discovery_indication(self, info->discovery);
+		}
+		break;
+	case MEDIA_BUSY_TIMER_EXPIRED:
+		/* A bunch of events may be postponed because the media is
+		 * busy (usually immediately after we close a connection),
+		 * or while we are doing discovery (state query/reply).
+		 * In all those cases, the media busy flag will be cleared
+		 * when it's OK for us to process those postponed events.
+		 * This event is not mentioned in the state machines in the
+		 * IrLAP spec. It's because they didn't consider Ultra and
+		 * postponing connection request is optional.
+		 * Jean II */
+#ifdef CONFIG_IRDA_ULTRA
+		/* Send any pending Ultra frames if any */
+		if (!skb_queue_empty(&self->txq_ultra)) {
+			/* We don't send the frame, just post an event.
+			 * Also, previously this code was in timer.c...
+			 * Jean II */
+			ret = (*state[self->state])(self, SEND_UI_FRAME,
+						    NULL, NULL);
+		}
+#endif /* CONFIG_IRDA_ULTRA */
+		/* Check if we should try to connect.
+		 * This code was previously in irlap_do_event() */
+		if (self->connect_pending) {
+			self->connect_pending = FALSE;
+
+			/* This one *should* not pend in this state, except
+			 * if a socket try to connect and immediately
+			 * disconnect. - clear - Jean II */
+			if (self->disconnect_pending)
+				irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+			else
+				ret = (*state[self->state])(self,
+							    CONNECT_REQUEST,
+							    NULL, NULL);
+			self->disconnect_pending = FALSE;
+		}
+		/* Note : one way to test if this code works well (including
+		 * media busy and small busy) is to create a user space
+		 * application generating an Ultra packet every 3.05 sec (or
+		 * 2.95 sec) and to see how it interact with discovery.
+		 * It's fairly easy to check that no packet is lost, that the
+		 * packets are postponed during discovery and that after
+		 * discovery indication you have a 100ms "gap".
+		 * As connection request and Ultra are now processed the same
+		 * way, this avoid the tedious job of trying IrLAP connection
+		 * in all those cases...
+		 * Jean II */
+		break;
+#ifdef CONFIG_IRDA_ULTRA
+	case SEND_UI_FRAME:
+	{
+		int i;
+		/* Only allowed to repeat an operation twice */
+		for (i=0; ((i<2) && (self->media_busy == FALSE)); i++) {
+			skb = skb_dequeue(&self->txq_ultra);
+			if (skb)
+				irlap_send_ui_frame(self, skb, CBROADCAST,
+						    CMD_FRAME);
+			else
+				break;
+			/* irlap_send_ui_frame() won't increase skb reference
+			 * count, so no dev_kfree_skb() - Jean II */
+		}
+		if (i == 2) {
+			/* Force us to listen 500 ms again */
+			irda_device_set_media_busy(self->netdev, TRUE);
+		}
+		break;
+	}
+	case RECV_UI_FRAME:
+		/* Only accept broadcast frames (NOTE: info is not asserted) */
+		if (info->caddr != CBROADCAST) {
+			IRDA_DEBUG(0, "%s(), not a broadcast frame!\n",
+				   __func__);
+		} else
+			irlap_unitdata_indication(self, skb);
+		break;
+#endif /* CONFIG_IRDA_ULTRA */
+	case RECV_TEST_CMD:
+		/* Remove test frame header (NOTE: info used unchecked below) */
+		skb_pull(skb, sizeof(struct test_frame));
+
+		/*
+		 * Send response. This skb will not be sent out again, and
+		 * will only be used to send out the same info as the cmd
+		 */
+		irlap_send_test_frame(self, CBROADCAST, info->daddr, skb);
+		break;
+	case RECV_TEST_RSP:
+		IRDA_DEBUG(0, "%s() not implemented!\n", __func__);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__,
+			   irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_query (self, event, skb, info)
+ *
+ *    QUERY state, we are running our own discovery, slot by slot
+ *
+ */
+static int irlap_state_query(struct irlap_cb *self, IRLAP_EVENT event,
+			     struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case RECV_DISCOVERY_XID_RSP:
+		IRDA_ASSERT(info != NULL, return -1;);
+		IRDA_ASSERT(info->discovery != NULL, return -1;);
+
+		IRDA_DEBUG(4, "%s(), daddr=%08x\n", __func__,
+			   info->discovery->data.daddr);
+
+		if (!self->discovery_log) {
+			IRDA_WARNING("%s: discovery log is gone! "
+				     "maybe the discovery timeout has been set"
+				     " too short?\n", __func__);
+			break;
+		}
+		hashbin_insert(self->discovery_log,
+			       (irda_queue_t *) info->discovery,
+			       info->discovery->data.daddr, NULL);
+
+		/* Keep state */
+		/* irlap_next_state(self, LAP_QUERY);  */
+
+		break;
+	case RECV_DISCOVERY_XID_CMD:
+		/* Yes, it is possible to receive those frames in this mode.
+		 * Note that most often the last discovery request won't
+		 * occur here but in NDM state (see my comment there).
+		 * What should we do ?
+		 * Not much. We are currently performing our own discovery,
+		 * therefore we can't answer those frames. We don't want
+		 * to change state either. We just pass the info to
+		 * IrLMP who will put it in the log (and post an event).
+		 * Jean II
+		 */
+
+		IRDA_ASSERT(info != NULL, return -1;);
+
+		IRDA_DEBUG(1, "%s(), Receiving discovery request (s = %d) while performing discovery :-(\n", __func__, info->s);
+
+		/* Last discovery request ? */
+		if (info->s == 0xff)
+			irlap_discovery_indication(self, info->discovery);
+		break;
+	case SLOT_TIMER_EXPIRED:
+		/*
+		 * Wait a little longer if we detect an incoming frame. This
+		 * is not mentioned in the spec, but is a good thing to do,
+		 * since we want to work even with devices that violate the
+		 * timing requirements.
+		 */
+		if (irda_device_is_receiving(self->netdev) && !self->add_wait) {
+			IRDA_DEBUG(2, "%s(), device is slow to answer, "
+				   "waiting some more!\n", __func__);
+			irlap_start_slot_timer(self, msecs_to_jiffies(10));
+			self->add_wait = TRUE;
+			return ret;
+		}
+		self->add_wait = FALSE;
+
+		if (self->s < self->S) {
+			irlap_send_discovery_xid_frame(self, self->S,
+						       self->s, TRUE,
+						       self->discovery_cmd);
+			self->s++;
+			irlap_start_slot_timer(self, self->slot_timeout);
+
+			/* Keep state */
+			irlap_next_state(self, LAP_QUERY);
+		} else {
+			/* This is the final slot, marked with s = 0xff */
+			irlap_send_discovery_xid_frame(self, self->S, 0xff,
+						       TRUE,
+						       self->discovery_cmd);
+
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+
+			/*
+			 *  We are now finished with the discovery procedure,
+			 *  so now we must return the results
+			 */
+			irlap_discovery_confirm(self, self->discovery_log);
+
+			/* IrLMP should now have taken care of the log */
+			self->discovery_log = NULL;
+		}
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__,
+			   irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_reply (self, event, skb, info)
+ *
+ *    REPLY, we have received an XID discovery frame from a device and we
+ *    are waiting for the right time slot to send a response XID frame
+ *
+ */
+static int irlap_state_reply(struct irlap_cb *self, IRLAP_EVENT event,
+			     struct sk_buff *skb, struct irlap_info *info)
+{
+	discovery_t *discovery_rsp;
+	int ret=0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case QUERY_TIMER_EXPIRED:
+		IRDA_DEBUG(0, "%s(), QUERY_TIMER_EXPIRED <%ld>\n",
+			   __func__, jiffies);
+		irlap_next_state(self, LAP_NDM);
+		break;
+	case RECV_DISCOVERY_XID_CMD:
+		IRDA_ASSERT(info != NULL, return -1;);
+		/* Last frame (final slot, s == 0xff)? */
+		if (info->s == 0xff) {
+			del_timer(&self->query_timer);
+
+			/* info->log.condition = REMOTE; */
+
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+
+			irlap_discovery_indication(self, info->discovery);
+		} else {
+			/* If our slot is reached or passed, send our reply once */
+			if ((info->s >= self->slot) && (!self->frame_sent)) {
+				discovery_rsp = irlmp_get_discovery_response();
+				discovery_rsp->data.daddr = info->daddr;
+
+				irlap_send_discovery_xid_frame(self, info->S,
+							       self->slot,
+							       FALSE,
+							       discovery_rsp);
+
+				self->frame_sent = TRUE;
+			}
+			/* Readjust our timer to accommodate devices
+			 * doing faster or slower discovery than us...
+			 * Jean II */
+			irlap_start_query_timer(self, info->S, info->s);
+
+			/* Keep state */
+			/* irlap_next_state(self, LAP_REPLY); */
+		}
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __func__,
+			   event, irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_conn (self, event, skb, info)
+ *
+ *    CONN, we have received a SNRM command and are waiting for the upper
+ *    layer to accept or refuse the connection. Returns 0 when the event
+ *    is handled here, -1 otherwise.
+ */
+static int irlap_state_conn(struct irlap_cb *self, IRLAP_EVENT event,
+			    struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s(), event=%s\n", __func__, irlap_event[ event]);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case CONNECT_RESPONSE:
+		skb_pull(skb, sizeof(struct snrm_frame));
+
+		IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+		irlap_qos_negotiate(self, skb);
+
+		irlap_initiate_connection_state(self);
+
+		/*
+		 * Applying the parameters now will make sure we change speed
+		 * *after* we have sent the next frame
+		 */
+		irlap_apply_connection_parameters(self, FALSE);
+
+		/*
+		 * Sending this frame will force a speed change after it has
+		 * been sent (i.e. the frame will be sent at 9600).
+		 */
+		irlap_send_ua_response_frame(self, &self->qos_rx);
+
+#if 0
+		/*
+		 * We are allowed to send two frames, but this may increase
+		 * the connect latency, so let's not do it for now.
+		 */
+		/* This is full of good intentions, but doesn't work in
+		 * practice.
+		 * After sending the first UA response, we switch the
+		 * dongle to the negotiated speed, which is usually
+		 * different than 9600 kb/s.
+		 * From there, there are two possibilities :
+		 * 1) The other end has received the first UA response :
+		 * it will set up the connection, move to state LAP_NRM_P,
+		 * and will ignore and drop the second UA response.
+		 * Actually, it's even worse : the other side will almost
+		 * immediately send a RR that will likely collide with the
+		 * UA response (depending on negotiated turnaround).
+		 * 2) The other end has not received the first UA response,
+		 * will stay at 9600 and will never see the second UA response.
+		 * Jean II */
+		irlap_send_ua_response_frame(self, &self->qos_rx);
+#endif
+
+		/*
+		 *  The WD-timer could be set to the duration of the P-timer
+		 *  for this case, but it is recommended to use twice the
+		 *  value (note 3 IrLAP p. 60).
+		 */
+		irlap_start_wd_timer(self, self->wd_timeout);
+		irlap_next_state(self, LAP_NRM_S);
+
+		break;
+	case RECV_DISCOVERY_XID_CMD:
+		IRDA_DEBUG(3, "%s(), event RECV_DISCOVER_XID_CMD!\n",
+			   __func__);
+		irlap_next_state(self, LAP_NDM);
+
+		break;
+	case DISCONNECT_REQUEST:
+		IRDA_DEBUG(0, "%s(), Disconnect request!\n", __func__);
+		irlap_send_dm_frame(self);
+		irlap_next_state( self, LAP_NDM);
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __func__,
+			   event, irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Function irlap_state_setup (self, event, skb, info)
+ *
+ *    SETUP state, The local layer has transmitted a SNRM command frame to
+ *    a remote peer layer and is awaiting a reply. Returns 0 when the event
+ *    is handled (or a RECV_SNRM_CMD assertion fails), -1 otherwise.
+ */
+static int irlap_state_setup(struct irlap_cb *self, IRLAP_EVENT event,
+			     struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case FINAL_TIMER_EXPIRED:
+		if (self->retry_count < self->N3) {
+/*
+ *  Perform random backoff: wait a random number of time units, between half
+ *  and 1.5 times the time taken to transmit a SNRM frame (nominally 15 to
+ *  45 msecs). NOTE(review): 20 + (jiffies % 30) below actually gives a
+ *  delay of 20 to 49 msecs.
+ */
+			irlap_start_backoff_timer(self, msecs_to_jiffies(20 +
+							(jiffies % 30)));
+		} else {
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+
+			irlap_disconnect_indication(self, LAP_FOUND_NONE);
+		}
+		break;
+	case BACKOFF_TIMER_EXPIRED:
+		irlap_send_snrm_frame(self, &self->qos_rx);
+		irlap_start_final_timer(self, self->final_timeout);
+		self->retry_count++;
+		break;
+	case RECV_SNRM_CMD:
+		IRDA_DEBUG(4, "%s(), SNRM battle!\n", __func__);
+
+		IRDA_ASSERT(skb != NULL, return 0;);
+		IRDA_ASSERT(info != NULL, return 0;);
+
+		/*
+		 *  The device with the largest device address wins the battle
+		 *  (both have sent a SNRM command!)
+		 */
+		if (info &&(info->daddr > self->saddr)) {
+			del_timer(&self->final_timer);
+			irlap_initiate_connection_state(self);
+
+			IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+			skb_pull(skb, sizeof(struct snrm_frame));
+
+			irlap_qos_negotiate(self, skb);
+
+			/* Send UA frame and then change link settings */
+			irlap_apply_connection_parameters(self, FALSE);
+			irlap_send_ua_response_frame(self, &self->qos_rx);
+
+			irlap_next_state(self, LAP_NRM_S);
+			irlap_connect_confirm(self, skb);
+
+			/*
+			 *  The WD-timer could be set to the duration of the
+			 *  P-timer for this case, but it is recommended
+			 *  to use twice the value (note 3 IrLAP p. 60).
+			 */
+			irlap_start_wd_timer(self, self->wd_timeout);
+		} else {
+			/* We just ignore the other device! */
+			irlap_next_state(self, LAP_SETUP);
+		}
+		break;
+	case RECV_UA_RSP:
+		/* Stop F-timer */
+		del_timer(&self->final_timer);
+
+		/* Initiate connection state */
+		irlap_initiate_connection_state(self);
+
+		/* Negotiate connection parameters (needs skb->len > 10) */
+		IRDA_ASSERT(skb->len > 10, return -1;);
+
+		skb_pull(skb, sizeof(struct ua_frame));
+
+		IRDA_ASSERT(self->netdev != NULL, return -1;);
+
+		irlap_qos_negotiate(self, skb);
+
+		/* Set the new link setting *now* (before the rr frame) */
+		irlap_apply_connection_parameters(self, TRUE);
+		self->retry_count = 0;
+
+		/* Wait for turnaround time to give a chance to the other
+		 * device to be ready to receive us.
+		 * Note : the time to switch speed is typically larger
+		 * than the turnaround time, but as we don't have the other
+		 * side speed switch time, that's our best guess...
+		 * Jean II */
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+
+		/* This frame will actually be sent at the new speed */
+		irlap_send_rr_frame(self, CMD_FRAME);
+
+		/* The timer is set to half the normal timer to quickly
+		 * detect a failure to negotiate the new connection
+		 * parameters. IrLAP 6.11.3.2, note 3.
+		 * Note that currently we don't process this failure
+		 * properly, as we should do a quick disconnect.
+		 * Jean II */
+		irlap_start_final_timer(self, self->final_timeout/2);
+		irlap_next_state(self, LAP_NRM_P);
+
+		irlap_connect_confirm(self, skb);
+		break;
+	case RECV_DM_RSP:     /* FALLTHROUGH */
+	case RECV_DISC_CMD:
+		del_timer(&self->final_timer);
+		irlap_next_state(self, LAP_NDM);
+
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __func__,
+			   event, irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_offline (self, event, skb, info)
+ *
+ *    OFFLINE state, not used for now: every event is rejected with -1.
+ */
+static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event,
+			       struct sk_buff *skb, struct irlap_info *info)
+{
+	const int rejected = -1;	/* no event is accepted in this state */
+
+	IRDA_DEBUG(0, "%s(), Unknown event\n", __func__);
+	return rejected;
+}
+
+/*
+ * Function irlap_state_xmit_p (self, event, skb, info)
+ *
+ *    XMIT, Only the primary station has right to transmit, and we
+ *    therefore do not expect to receive any transmissions from other
+ *    stations. Returns 0, -EPROTO when no further frame should be sent
+ *    for now, or -EINVAL for an event this state does not handle.
+ */
+static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
+			      struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+
+	switch (event) {
+	case SEND_I_CMD:
+		/*
+		 *  Only send frame if send-window > 0.
+		 */
+		if ((self->window > 0) && (!self->remote_busy)) {
+			int nextfit;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+			struct sk_buff *skb_next;
+
+			/* With DYNAMIC_WINDOW, we keep the window size
+			 * maximum, and adapt on the packets we are sending.
+			 * At 115k, we can send only 2 packets of 2048 bytes
+			 * in a 500 ms turnaround. Without this option, we
+			 * would always limit the window to 2. With this
+			 * option, if we send smaller packets, we can send
+			 * up to 7 of them (always depending on QoS).
+			 * Jean II */
+
+			/* Look at the next skb. This is safe, as we are
+			 * the only consumer of the Tx queue (if we are not,
+			 * we have other problems) - Jean II */
+			skb_next = skb_peek(&self->txq);
+
+			/* Check if a subsequent skb exists and would fit in
+			 * the current window (with respect to turnaround
+			 * time).
+			 * This allows us to properly mark the current packet
+			 * with the pf bit, to avoid falling back on the
+			 * second test below, and avoid waiting the
+			 * end of the window and sending an extra RR.
+			 * Note : (skb_next != NULL) <=> (skb_queue_len() > 0)
+			 * Jean II */
+			nextfit = ((skb_next != NULL) &&
+				   ((skb_next->len + skb->len) <=
+				    self->bytes_left));
+
+			/*
+			 * The current packet may not fit ! Because of test
+			 * above, this should not happen any more !!!
+			 *  Test if we have transmitted more bytes over the
+			 *  link than it is possible to do with the current
+			 *  speed and turn-around-time.
+			 */
+			if((!nextfit) && (skb->len > self->bytes_left)) {
+				IRDA_DEBUG(0, "%s(), Not allowed to transmit"
+					   " more bytes!\n", __func__);
+				/* Requeue the skb */
+				skb_queue_head(&self->txq, skb_get(skb));
+				/*
+				 *  We should switch state to LAP_NRM_P, but
+				 *  that is not possible since we must be sure
+				 *  that we poll the other side. Since we have
+				 *  used up our time, the poll timer should
+				 *  trigger anyway now, so we just wait for it
+				 *  DB
+				 */
+				/*
+				 * Sorry, but that's not totally true. If
+				 * we send 2000B packets, we may wait another
+				 * 1000B until our turnaround expires. That's
+				 * why we need to be proactive in avoiding
+				 * coming here. - Jean II
+				 */
+				return -EPROTO;
+			}
+
+			/* Subtract space used by this skb */
+			self->bytes_left -= skb->len;
+#else	/* CONFIG_IRDA_DYNAMIC_WINDOW */
+			/* Window has been adjusted for the max packet
+			 * size, so much simpler... - Jean II */
+			nextfit = !skb_queue_empty(&self->txq);
+#endif	/* CONFIG_IRDA_DYNAMIC_WINDOW */
+			/*
+			 *  Send data with poll bit cleared only if window > 1
+			 *  and there are more frames after this one to be sent
+			 */
+			if ((self->window > 1) && (nextfit)) {
+				/* More packet to send in current window */
+				irlap_send_data_primary(self, skb);
+				irlap_next_state(self, LAP_XMIT_P);
+			} else {
+				/* Final packet of window */
+				irlap_send_data_primary_poll(self, skb);
+
+				/*
+				 * Make sure state machine does not try to send
+				 * any more frames
+				 */
+				ret = -EPROTO;
+			}
+#ifdef CONFIG_IRDA_FAST_RR
+			/* Peer may want to reply immediately */
+			self->fast_RR = FALSE;
+#endif /* CONFIG_IRDA_FAST_RR */
+		} else {
+			IRDA_DEBUG(4, "%s(), Unable to send! remote busy?\n",
+				   __func__);
+			skb_queue_head(&self->txq, skb_get(skb));
+
+			/*
+			 *  The next ret is important, because it tells
+			 *  irlap_next_state _not_ to deliver more frames
+			 */
+			ret = -EPROTO;
+		}
+		break;
+	case POLL_TIMER_EXPIRED:
+		IRDA_DEBUG(3, "%s(), POLL_TIMER_EXPIRED <%ld>\n",
+			    __func__, jiffies);
+		irlap_send_rr_frame(self, CMD_FRAME);
+		/* Return to NRM properly - Jean II  */
+		self->window = self->window_size;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+		/* Allowed to transmit a maximum number of bytes again. */
+		self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+		irlap_start_final_timer(self, self->final_timeout);
+		irlap_next_state(self, LAP_NRM_P);
+		break;
+	case DISCONNECT_REQUEST:
+		del_timer(&self->poll_timer);
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_disc_frame(self);
+		irlap_flush_all_queues(self);
+		irlap_start_final_timer(self, self->final_timeout);
+		self->retry_count = 0;
+		irlap_next_state(self, LAP_PCLOSE);
+		break;
+	case DATA_REQUEST:
+		/* Nothing to do, irlap_do_event() will send the packet
+		 * when we return... - Jean II */
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s\n",
+			   __func__, irlap_event[event]);
+
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_pclose (self, event, skb, info)
+ *
+ *    PCLOSE state: handles UA/DM responses and final timer expiry
+ */
+static int irlap_state_pclose(struct irlap_cb *self, IRLAP_EVENT event,
+			      struct sk_buff *skb, struct irlap_info *info)
+{
+	int rc = 0;
+
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case FINAL_TIMER_EXPIRED:
+		if (self->retry_count >= self->N3) {
+			/* Retries exhausted: give up on the peer */
+			irlap_apply_default_connection_parameters(self);
+
+			/* State is switched before upper layers are called */
+			irlap_next_state(self, LAP_NDM);
+			irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+			break;
+		}
+
+		/* Send the DISC frame again and stay in this state */
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_disc_frame(self);
+		irlap_start_final_timer(self, self->final_timeout);
+		self->retry_count++;
+		break;
+	case RECV_DM_RSP: /* FALLTHROUGH */
+	case RECV_UA_RSP:
+		del_timer(&self->final_timer);
+
+		/* Back to the default link parameters */
+		irlap_apply_default_connection_parameters(self);
+
+		/* State is switched before upper layers are called */
+		irlap_next_state(self, LAP_NDM);
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %d\n", __func__, event);
+
+		rc = -1;
+		break;
+	}
+	return rc;
+}
+
+/*
+ * Function irlap_state_nrm_p (self, event, skb, info)
+ *
+ *   NRM_P (Normal Response Mode as Primary). The primary station has given
+ *   a secondary station permission to transmit IrLAP response frames (by
+ *   sending a frame with the P bit set) and expects frames only from that
+ *   secondary. Returns 0 or -1 (unknown event, failed assertion), except
+ *   for RECV_RR_RSP, which returns the NR validation status.
+ */
+static int irlap_state_nrm_p(struct irlap_cb *self, IRLAP_EVENT event,
+			     struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+	int ns_status;
+	int nr_status;
+
+	switch (event) {
+	case RECV_I_RSP: /* Optimize for the common case */
+		if (unlikely(skb->len <= LAP_ADDR_HEADER + LAP_CTRL_HEADER)) {
+			/*
+			 * Input validation check: a stir4200/mcp2150
+			 * combination sometimes results in an empty i:rsp.
+			 * This makes no sense; we can just ignore the frame
+			 * and send an rr:cmd immediately. This happens before
+			 * changing nr or ns so triggers a retransmit
+			 */
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_rr_frame(self, CMD_FRAME);
+			/* Keep state */
+			break;
+		}
+		/* FIXME: must check for remote_busy below */
+#ifdef CONFIG_IRDA_FAST_RR
+		/*
+		 *  Reset the fast_RR so we can use the fast RR code with
+		 *  full speed the next time since peer may have more frames
+		 *  to transmit
+		 */
+		self->fast_RR = FALSE;
+#endif /* CONFIG_IRDA_FAST_RR */
+		IRDA_ASSERT( info != NULL, return -1;);
+
+		ns_status = irlap_validate_ns_received(self, info->ns);
+		nr_status = irlap_validate_nr_received(self, info->nr);
+
+		/*
+		 *  Check for expected I(nformation) frame
+		 */
+		if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) {
+
+			/* Update Vr (next frame for us to receive) */
+			self->vr = (self->vr + 1) % 8;
+
+			/* Update Nr received, cleanup our retry queue */
+			irlap_update_nr_received(self, info->nr);
+
+			/*
+			 *  Got expected NR, so reset the
+			 *  retry_count. This is not done by IrLAP spec,
+			 *  which is strange!
+			 */
+			self->retry_count = 0;
+			self->ack_required = TRUE;
+
+			/*  poll bit cleared?  */
+			if (!info->pf) {
+				/* Keep state, do not move this line */
+				irlap_next_state(self, LAP_NRM_P);
+
+				irlap_data_indication(self, skb, FALSE);
+			} else {
+				/* No longer waiting for pf */
+				del_timer(&self->final_timer);
+
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+
+				/* Call higher layer *before* changing state
+				 * to give them a chance to send data in the
+				 * next LAP frame.
+				 * Jean II */
+				irlap_data_indication(self, skb, FALSE);
+
+				/* XMIT states are the most dangerous state
+				 * to be in, because user requests are
+				 * processed directly and may change state.
+				 * On the other hand, in NDM_P, those
+				 * requests are queued and we will process
+				 * them when we return to irlap_do_event().
+				 * Jean II
+				 */
+				irlap_next_state(self, LAP_XMIT_P);
+
+				/* This is the last frame.
+				 * Make sure it's always called in XMIT state.
+				 * - Jean II */
+				irlap_start_poll_timer(self, self->poll_timeout);
+			}
+			break;
+
+		}
+		/* Unexpected next to send (Ns) */
+		if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED))
+		{
+			if (!info->pf) {
+				irlap_update_nr_received(self, info->nr);
+
+				/*
+				 *  Wait until the last frame before doing
+				 *  anything
+				 */
+
+				/* Keep state */
+				irlap_next_state(self, LAP_NRM_P);
+			} else {
+				IRDA_DEBUG(4,
+				       "%s(), missing or duplicate frame!\n",
+					   __func__);
+
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+				irlap_send_rr_frame(self, CMD_FRAME);
+
+				self->ack_required = FALSE;
+
+				irlap_start_final_timer(self, self->final_timeout);
+				irlap_next_state(self, LAP_NRM_P);
+			}
+			break;
+		}
+		/*
+		 *  Unexpected next to receive (Nr)
+		 */
+		if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED))
+		{
+			if (info->pf) {
+				self->vr = (self->vr + 1) % 8;
+
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+
+				/* Resend rejected frames */
+				irlap_resend_rejected_frames(self, CMD_FRAME);
+
+				self->ack_required = FALSE;
+
+				/* Make sure we account for the time
+				 * to transmit our frames. See comments
+				 * in irlap_send_data_primary_poll().
+				 * Jean II */
+				irlap_start_final_timer(self, 2 * self->final_timeout);
+
+				/* Keep state, do not move this line */
+				irlap_next_state(self, LAP_NRM_P);
+
+				irlap_data_indication(self, skb, FALSE);
+			} else {
+				/*
+				 *  Do not resend frames until the last
+				 *  frame has arrived from the other
+				 *  device. This is not documented in
+				 *  IrLAP!!
+				 */
+				self->vr = (self->vr + 1) % 8;
+
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+
+				self->ack_required = FALSE;
+
+				/* Keep state, do not move this line!*/
+				irlap_next_state(self, LAP_NRM_P);
+
+				irlap_data_indication(self, skb, FALSE);
+			}
+			break;
+		}
+		/*
+		 *  Unexpected next to send (Ns) and next to receive (Nr)
+		 *  Not documented by IrLAP!
+		 */
+		if ((ns_status == NS_UNEXPECTED) &&
+		    (nr_status == NR_UNEXPECTED))
+		{
+			IRDA_DEBUG(4, "%s(), unexpected nr and ns!\n",
+				   __func__);
+			if (info->pf) {
+				/* Resend rejected frames */
+				irlap_resend_rejected_frames(self, CMD_FRAME);
+
+				/* Give peer some time to retransmit!
+				 * But account for our own Tx. */
+				irlap_start_final_timer(self, 2 * self->final_timeout);
+
+				/* Keep state, do not move this line */
+				irlap_next_state(self, LAP_NRM_P);
+			} else {
+				/* Update Nr received */
+				/* irlap_update_nr_received( info->nr); */
+
+				self->ack_required = FALSE;
+			}
+			break;
+		}
+
+		/*
+		 *  Invalid NR or NS
+		 */
+		if ((nr_status == NR_INVALID) || (ns_status == NS_INVALID)) {
+			if (info->pf) {
+				del_timer(&self->final_timer);
+
+				irlap_next_state(self, LAP_RESET_WAIT);
+
+				irlap_disconnect_indication(self, LAP_RESET_INDICATION);
+				self->xmitflag = TRUE;
+			} else {
+				del_timer(&self->final_timer);
+
+				irlap_disconnect_indication(self, LAP_RESET_INDICATION);
+
+				self->xmitflag = FALSE;
+			}
+			break;
+		}
+		IRDA_DEBUG(1, "%s(), Not implemented!\n", __func__);
+		IRDA_DEBUG(1, "%s(), event=%s, ns_status=%d, nr_status=%d\n",
+		       __func__, irlap_event[event], ns_status, nr_status);
+		break;
+	case RECV_UI_FRAME:
+		/* Poll bit cleared? */
+		if (!info->pf) {
+			irlap_data_indication(self, skb, TRUE);
+			irlap_next_state(self, LAP_NRM_P);
+		} else {
+			del_timer(&self->final_timer);
+			irlap_data_indication(self, skb, TRUE);
+			irlap_next_state(self, LAP_XMIT_P);
+			IRDA_DEBUG(1, "%s: RECV_UI_FRAME: next state %s\n", __func__, irlap_state[self->state]);
+			irlap_start_poll_timer(self, self->poll_timeout);
+		}
+		break;
+	case RECV_RR_RSP:
+		IRDA_ASSERT(info != NULL, return -1;);
+		/*
+		 *  If you get a RR, the remote isn't busy anymore,
+		 *  no matter what the NR
+		 */
+		self->remote_busy = FALSE;
+
+		/* Stop final timer */
+		del_timer(&self->final_timer);
+
+		/*
+		 *  Nr as expected? (ret carries the NR status from here on)
+		 */
+		ret = irlap_validate_nr_received(self, info->nr);
+		if (ret == NR_EXPECTED) {
+			/* Update Nr received */
+			irlap_update_nr_received(self, info->nr);
+
+			/*
+			 *  Got expected NR, so reset the retry_count. This
+			 *  is not done by the IrLAP standard , which is
+			 *  strange! DB.
+			 */
+			self->retry_count = 0;
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+
+			irlap_next_state(self, LAP_XMIT_P);
+
+			/* Start poll timer */
+			irlap_start_poll_timer(self, self->poll_timeout);
+		} else if (ret == NR_UNEXPECTED) {
+			/*
+			 *  Unexpected nr!
+			 */
+
+			/* Update Nr received */
+			irlap_update_nr_received(self, info->nr);
+
+			IRDA_DEBUG(4, "RECV_RR_FRAME: Retrans:%d, nr=%d, va=%d, "
+			      "vs=%d, vr=%d\n",
+			      self->retry_count, info->nr, self->va,
+			      self->vs, self->vr);
+
+			/* Resend rejected frames */
+			irlap_resend_rejected_frames(self, CMD_FRAME);
+			irlap_start_final_timer(self, self->final_timeout * 2);
+
+			irlap_next_state(self, LAP_NRM_P);
+		} else if (ret == NR_INVALID) {
+			IRDA_DEBUG(1, "%s(), Received RR with "
+				   "invalid nr !\n", __func__);
+
+			irlap_next_state(self, LAP_RESET_WAIT);
+
+			irlap_disconnect_indication(self, LAP_RESET_INDICATION);
+			self->xmitflag = TRUE;
+		}
+		break;
+	case RECV_RNR_RSP:
+		IRDA_ASSERT(info != NULL, return -1;);
+
+		/* Stop final timer */
+		del_timer(&self->final_timer);
+		self->remote_busy = TRUE;
+
+		/* Update Nr received */
+		irlap_update_nr_received(self, info->nr);
+		irlap_next_state(self, LAP_XMIT_P);
+
+		/* Start poll timer */
+		irlap_start_poll_timer(self, self->poll_timeout);
+		break;
+	case RECV_FRMR_RSP:
+		del_timer(&self->final_timer);
+		self->xmitflag = TRUE;
+		irlap_next_state(self, LAP_RESET_WAIT);
+		irlap_reset_indication(self);
+		break;
+	case FINAL_TIMER_EXPIRED:
+		/*
+		 *  We are allowed to wait for additional 300 ms if
+		 *  final timer expires when we are in the middle
+		 *  of receiving a frame (page 45, IrLAP). Check that
+		 *  we only do this once for each frame.
+		 */
+		if (irda_device_is_receiving(self->netdev) && !self->add_wait) {
+			IRDA_DEBUG(1, "FINAL_TIMER_EXPIRED when receiving a "
+			      "frame! Waiting a little bit more!\n");
+			irlap_start_final_timer(self, msecs_to_jiffies(300));
+
+			/*
+			 *  Don't allow this to happen one more time in a row,
+			 *  or else we can get a pretty tight loop here if
+			 *  we only receive half a frame. DB.
+			 */
+			self->add_wait = TRUE;
+			break;
+		}
+		self->add_wait = FALSE;
+
+		/* N2 is the disconnect timer. Until we reach it, we retry */
+		if (self->retry_count < self->N2) {
+			if (skb_peek(&self->wx_list) == NULL) {
+				/* Retry sending the pf bit to the secondary */
+				IRDA_DEBUG(4, "nrm_p: resending rr");
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+				irlap_send_rr_frame(self, CMD_FRAME);
+			} else {
+				IRDA_DEBUG(4, "nrm_p: resend frames");
+				irlap_resend_rejected_frames(self, CMD_FRAME);
+			}
+
+			irlap_start_final_timer(self, self->final_timeout);
+			self->retry_count++;
+			IRDA_DEBUG(4, "irlap_state_nrm_p: FINAL_TIMER_EXPIRED:"
+				   " retry_count=%d\n", self->retry_count);
+
+			/* Early warning event. I'm using a pretty liberal
+			 * interpretation of the spec and generate an event
+			 * every time the timer is multiple of N1 (and not
+			 * only the first time). This allows applications
+			 * to know precisely if connectivity restarts...
+			 * Jean II */
+			if((self->retry_count % self->N1) == 0)
+				irlap_status_indication(self,
+							STATUS_NO_ACTIVITY);
+
+			/* Keep state */
+		} else {
+			irlap_apply_default_connection_parameters(self);
+
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+			irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+		}
+		break;
+	case RECV_REJ_RSP:
+		irlap_update_nr_received(self, info->nr);
+		if (self->remote_busy) {
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_rr_frame(self, CMD_FRAME);
+		} else
+			irlap_resend_rejected_frames(self, CMD_FRAME);
+		irlap_start_final_timer(self, 2 * self->final_timeout);
+		break;
+	case RECV_SREJ_RSP:
+		irlap_update_nr_received(self, info->nr);
+		if (self->remote_busy) {
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_rr_frame(self, CMD_FRAME);
+		} else
+			irlap_resend_rejected_frame(self, CMD_FRAME);
+		irlap_start_final_timer(self, 2 * self->final_timeout);
+		break;
+	case RECV_RD_RSP:
+		IRDA_DEBUG(1, "%s(), RECV_RD_RSP\n", __func__);
+
+		irlap_flush_all_queues(self);
+		irlap_next_state(self, LAP_XMIT_P);
+		/* Call back the LAP state machine to do a proper disconnect */
+		irlap_disconnect_request(self);
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %s\n",
+			    __func__, irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_reset_wait (self, event, skb, info)
+ *
+ *    The service user has been told about a reset condition and we are
+ *    now waiting for its reset or disconnect request. Returns 0 when the
+ *    event is handled here, -1 otherwise.
+ */
+static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event,
+				  struct sk_buff *skb, struct irlap_info *info)
+{
+	int rc = 0;
+
+	IRDA_DEBUG(3, "%s(), event = %s\n", __func__, irlap_event[event]);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case DISCONNECT_REQUEST:
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_disc_frame(self);
+		irlap_flush_all_queues(self);
+		irlap_start_final_timer(self, self->final_timeout);
+		self->retry_count = 0;
+		irlap_next_state(self, LAP_PCLOSE);
+		break;
+	case RESET_REQUEST:
+		/* A SNRM frame is only sent when xmitflag is set */
+		if (self->xmitflag) {
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_snrm_frame(self, NULL);
+		}
+
+		/* Either way, arm the final timer and move to RESET */
+		irlap_start_final_timer(self, self->final_timeout);
+		irlap_next_state(self, LAP_RESET);
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__,
+			   irlap_event[event]);
+
+		rc = -1;
+		break;
+	}
+	return rc;
+}
+
+/*
+ * Function irlap_state_reset (self, event, skb, info)
+ *
+ *    We have sent a SNRM reset command to the peer layer and are awaiting
+ *    its reply. Returns 0 when the event is handled here, -1 otherwise.
+ *    On final timer expiry the SNRM is resent until retry_count reaches N3.
+ */
+static int irlap_state_reset(struct irlap_cb *self, IRLAP_EVENT event,
+			     struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(3, "%s(), event = %s\n", __func__, irlap_event[event]);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case RECV_DISC_CMD:
+		del_timer(&self->final_timer);
+
+		irlap_apply_default_connection_parameters(self);
+
+		/* Always switch state before calling upper layers */
+		irlap_next_state(self, LAP_NDM);
+
+		irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+
+		break;
+	case RECV_UA_RSP:
+		del_timer(&self->final_timer);
+
+		/* Initiate connection state */
+		irlap_initiate_connection_state(self);
+
+		irlap_reset_confirm();
+
+		self->remote_busy = FALSE;
+
+		irlap_next_state(self, LAP_XMIT_P);
+
+		irlap_start_poll_timer(self, self->poll_timeout);
+
+		break;
+	case FINAL_TIMER_EXPIRED:
+		if (self->retry_count < self->N3) {
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+
+			IRDA_ASSERT(self->netdev != NULL, return -1;);
+			irlap_send_snrm_frame(self, self->qos_dev);
+
+			self->retry_count++; /* Experimental!! */
+
+			irlap_start_final_timer(self, self->final_timeout);
+			irlap_next_state(self, LAP_RESET);
+		} else {	/* N3 retries used up: give up on the peer */
+			irlap_apply_default_connection_parameters(self);
+
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+
+			irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+		}
+		break;
+	case RECV_SNRM_CMD:
+		/*
+		 * SNRM frame is not allowed to contain an I-field in this
+		 * state
+		 */
+		if (!info) {
+			IRDA_DEBUG(3, "%s(), RECV_SNRM_CMD\n", __func__);
+			irlap_initiate_connection_state(self);
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_ua_response_frame(self, &self->qos_rx);
+			irlap_reset_confirm();
+			irlap_start_wd_timer(self, self->wd_timeout);
+			irlap_next_state(self, LAP_NDM);
+		} else {
+			IRDA_DEBUG(0,
+				   "%s(), SNRM frame contained an I field!\n",
+				   __func__);
+		}
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %s\n",
+			   __func__, irlap_event[event]);
+
+		ret = -1;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_xmit_s (event, skb, info)
+ *
+ *   XMIT_S, The secondary station has been given the right to transmit,
+ *   and we therefore do not expect to receive any transmissions from other
+ *   stations.
+ *
+ *   Handled events:
+ *     SEND_I_CMD         - transmit skb as an I-frame if the send window is
+ *                          open and the peer is not busy; otherwise put it
+ *                          back at the head of the tx queue
+ *     DISCONNECT_REQUEST - send an RD frame, flush the queues, arm the WD
+ *                          timer and move to SCLOSE
+ *     DATA_REQUEST       - no action in this function
+ *
+ *   Returns 0 by default, -EPROTO when no further frame should be sent for
+ *   now (final frame just sent, or skb requeued) and -EINVAL for any other
+ *   event. The two IRDA_ASSERT checks are handed "return -ENODEV" and
+ *   "return -EBADR" as their failure statements.
+ */
+static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
+			      struct sk_buff *skb, struct irlap_info *info)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s(), event=%s\n", __func__, irlap_event[event]);
+
+	IRDA_ASSERT(self != NULL, return -ENODEV;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;);
+
+	switch (event) {
+	case SEND_I_CMD:
+		/*
+		 *  Send frame only if send window > 0 and the peer has not
+		 *  reported itself busy
+		 */
+		if ((self->window > 0) && (!self->remote_busy)) {
+			int nextfit; /* non-zero: another frame may follow this one */
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+			struct sk_buff *skb_next;
+
+			/*
+			 * Same deal as in irlap_state_xmit_p(), so see
+			 * the comments at that point.
+			 * We are the secondary, so there are only subtle
+			 * differences. - Jean II
+			 */
+
+			/* Check if a subsequent skb exist and would fit in
+			 * the current window (with respect to turnaround
+			 * time). - Jean II */
+			skb_next = skb_peek(&self->txq);
+			nextfit = ((skb_next != NULL) &&
+				   ((skb_next->len + skb->len) <=
+				    self->bytes_left));
+
+			/*
+			 *  Test if we have transmitted more bytes over the
+			 *  link than its possible to do with the current
+			 *  speed and turn-around-time.
+			 */
+			if((!nextfit) && (skb->len > self->bytes_left)) {
+				IRDA_DEBUG(0, "%s(), Not allowed to transmit"
+					   " more bytes!\n", __func__);
+				/* Requeue the skb at the head of txq, holding
+				 * an extra reference taken with skb_get() */
+				skb_queue_head(&self->txq, skb_get(skb));
+
+				/*
+				 *  Switch to NRM_S, this is only possible
+				 *  when we are in secondary mode, since we
+				 *  must be sure that we don't miss any RR
+				 *  frames. The window and byte budget are
+				 *  reset to their full values first.
+				 */
+				self->window = self->window_size;
+				self->bytes_left = self->line_capacity;
+				irlap_start_wd_timer(self, self->wd_timeout);
+
+				irlap_next_state(self, LAP_NRM_S);
+				/* Slight difference with primary :
+				 * here we would wait for the other side to
+				 * expire the turnaround. - Jean II */
+
+				return -EPROTO; /* Try again later */
+			}
+			/* Subtract space used by this skb */
+			self->bytes_left -= skb->len;
+#else	/* CONFIG_IRDA_DYNAMIC_WINDOW */
+			/* Window has been adjusted for the max packet
+			 * size, so much simpler... - Jean II */
+			nextfit = !skb_queue_empty(&self->txq);
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+			/*
+			 *  Send data with final bit cleared only if window > 1
+			 *  and there is more frames to be sent
+			 */
+			if ((self->window > 1) && (nextfit)) {
+				irlap_send_data_secondary(self, skb);
+				irlap_next_state(self, LAP_XMIT_S);
+			} else {
+				irlap_send_data_secondary_final(self, skb);
+				irlap_next_state(self, LAP_NRM_S);
+
+				/*
+				 * Make sure state machine does not try to send
+				 * any more frames
+				 */
+				ret = -EPROTO;
+			}
+		} else {
+			IRDA_DEBUG(2, "%s(), Unable to send!\n", __func__);
+			skb_queue_head(&self->txq, skb_get(skb)); /* requeue for later */
+			ret = -EPROTO;
+		}
+		break;
+	case DISCONNECT_REQUEST:
+		irlap_send_rd_frame(self);
+		irlap_flush_all_queues(self);
+		irlap_start_wd_timer(self, self->wd_timeout);
+		irlap_next_state(self, LAP_SCLOSE);
+		break;
+	case DATA_REQUEST:
+		/* Nothing to do, irlap_do_event() will send the packet
+		 * when we return... - Jean II */
+		break;
+	default:
+		IRDA_DEBUG(2, "%s(), Unknown event %s\n", __func__,
+			   irlap_event[event]);
+
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_nrm_s (event, skb, info)
+ *
+ *    NRM_S (Normal Response Mode as Secondary) state, in this state we are
+ *    expecting to receive frames from the primary station
+ *
+ *    Handles received I, UI, RR, SNRM, REJ, SREJ, DISC, discovery XID and
+ *    TEST commands as well as expiry of the WD timer. Returns 0 for every
+ *    handled event and -EINVAL for an event this state does not know. The
+ *    two IRDA_ASSERT checks are handed "return -1" as failure statement.
+ */
+static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
+			     struct sk_buff *skb, struct irlap_info *info)
+{
+	int ns_status;
+	int nr_status;
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s(), event=%s\n", __func__, irlap_event[event]);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	switch (event) {
+	case RECV_I_CMD: /* Optimize for the common case */
+		/* FIXME: must check for remote_busy below */
+		IRDA_DEBUG(4, "%s(), event=%s nr=%d, vs=%d, ns=%d, "
+			   "vr=%d, pf=%d\n", __func__,
+			   irlap_event[event], info->nr,
+			   self->vs, info->ns, self->vr, info->pf);
+
+		self->retry_count = 0;
+
+		ns_status = irlap_validate_ns_received(self, info->ns);
+		nr_status = irlap_validate_nr_received(self, info->nr);
+		/*
+		 *  Check for expected I(nformation) frame
+		 */
+		if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) {
+
+			/* Update Vr (next frame for us to receive) */
+			self->vr = (self->vr + 1) % 8;
+
+			/* Update Nr received */
+			irlap_update_nr_received(self, info->nr);
+
+			/*
+			 *  poll bit cleared?
+			 */
+			if (!info->pf) {
+
+				self->ack_required = TRUE;
+
+				/*
+				 *  Starting WD-timer here is optional, but
+				 *  not recommended. Note 6 IrLAP p. 83
+				 */
+#if 0
+				irda_start_timer(WD_TIMER, self->wd_timeout);
+#endif
+				/* Keep state, do not move this line */
+				irlap_next_state(self, LAP_NRM_S);
+
+				irlap_data_indication(self, skb, FALSE);
+				break;
+			} else {
+				/*
+				 *  We should wait before sending RR, and
+				 *  also before changing to XMIT_S
+				 *  state. (note 1, IrLAP p. 82)
+				 */
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+
+				/*
+				 * Give higher layers a chance to
+				 * immediately reply with some data before
+				 * we decide if we should send a RR frame
+				 * or not
+				 */
+				irlap_data_indication(self, skb, FALSE);
+
+				/* Any pending data requests?  */
+				if (!skb_queue_empty(&self->txq) &&
+				    (self->window > 0))
+				{
+					self->ack_required = TRUE;
+
+					del_timer(&self->wd_timer);
+
+					irlap_next_state(self, LAP_XMIT_S);
+				} else {
+					irlap_send_rr_frame(self, RSP_FRAME);
+					irlap_start_wd_timer(self,
+							     self->wd_timeout);
+
+					/* Keep the state */
+					irlap_next_state(self, LAP_NRM_S);
+				}
+				break;
+			}
+		}
+		/*
+		 *  Check for Unexpected next to send (Ns)
+		 */
+		if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED))
+		{
+			/* Unexpected next to send, with final bit cleared */
+			if (!info->pf) {
+				irlap_update_nr_received(self, info->nr);
+
+				irlap_start_wd_timer(self, self->wd_timeout);
+			} else {
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+				irlap_send_rr_frame(self, RSP_FRAME);
+
+				irlap_start_wd_timer(self, self->wd_timeout);
+			}
+			break;
+		}
+
+		/*
+		 *  Unexpected Next to Receive(NR) ?
+		 */
+		if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED))
+		{
+			if (info->pf) {
+				IRDA_DEBUG(4, "RECV_I_RSP: frame(s) lost\n");
+
+				self->vr = (self->vr + 1) % 8;
+
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+
+				/* Resend rejected frames */
+				irlap_resend_rejected_frames(self, RSP_FRAME);
+
+				/* Keep state, do not move this line */
+				irlap_next_state(self, LAP_NRM_S);
+
+				irlap_data_indication(self, skb, FALSE);
+				irlap_start_wd_timer(self, self->wd_timeout);
+				break;
+			}
+			/*
+			 *  This is not documented in IrLAP!! Unexpected NR
+			 *  with poll bit cleared
+			 */
+			if (!info->pf) {
+				self->vr = (self->vr + 1) % 8;
+
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+
+				/* Keep state, do not move this line */
+				irlap_next_state(self, LAP_NRM_S);
+
+				irlap_data_indication(self, skb, FALSE);
+				irlap_start_wd_timer(self, self->wd_timeout);
+			}
+			break;
+		}
+
+		/*
+		 * None of the combinations above matched. Report invalid
+		 * sequence numbers using the validation results themselves
+		 * (ret is still 0 at this point and says nothing about them).
+		 */
+		if (nr_status == NR_INVALID) {
+			IRDA_DEBUG(0, "NRM_S, NR_INVALID not implemented!\n");
+		}
+		if (ns_status == NS_INVALID) {
+			IRDA_DEBUG(0, "NRM_S, NS_INVALID not implemented!\n");
+		}
+		break;
+	case RECV_UI_FRAME:
+		/*
+		 *  poll bit cleared?
+		 */
+		if (!info->pf) {
+			irlap_data_indication(self, skb, TRUE);
+			irlap_next_state(self, LAP_NRM_S); /* Keep state */
+		} else {
+			/*
+			 *  Any pending data requests?
+			 */
+			if (!skb_queue_empty(&self->txq) &&
+			    (self->window > 0) && !self->remote_busy)
+			{
+				irlap_data_indication(self, skb, TRUE);
+
+				del_timer(&self->wd_timer);
+
+				irlap_next_state(self, LAP_XMIT_S);
+			} else {
+				irlap_data_indication(self, skb, TRUE);
+
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+
+				irlap_send_rr_frame(self, RSP_FRAME);
+				self->ack_required = FALSE;
+
+				irlap_start_wd_timer(self, self->wd_timeout);
+
+				/* Keep the state */
+				irlap_next_state(self, LAP_NRM_S);
+			}
+		}
+		break;
+	case RECV_RR_CMD:
+		self->retry_count = 0;
+
+		/*
+		 *  Nr as expected?
+		 */
+		nr_status = irlap_validate_nr_received(self, info->nr);
+		if (nr_status == NR_EXPECTED) {
+			if (!skb_queue_empty(&self->txq) &&
+			    (self->window > 0)) {
+				self->remote_busy = FALSE;
+
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+				del_timer(&self->wd_timer);
+
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+				irlap_next_state(self, LAP_XMIT_S);
+			} else {
+				self->remote_busy = FALSE;
+				/* Update Nr received */
+				irlap_update_nr_received(self, info->nr);
+				irlap_wait_min_turn_around(self, &self->qos_tx);
+				irlap_start_wd_timer(self, self->wd_timeout);
+
+				/* Note : if the link is idle (this case),
+				 * we never go in XMIT_S, so we never get a
+				 * chance to process any DISCONNECT_REQUEST.
+				 * Do it now ! - Jean II */
+				if (self->disconnect_pending) {
+					/* Disconnect */
+					irlap_send_rd_frame(self);
+					irlap_flush_all_queues(self);
+
+					irlap_next_state(self, LAP_SCLOSE);
+				} else {
+					/* Just send back pf bit */
+					irlap_send_rr_frame(self, RSP_FRAME);
+
+					irlap_next_state(self, LAP_NRM_S);
+				}
+			}
+		} else if (nr_status == NR_UNEXPECTED) {
+			self->remote_busy = FALSE;
+			irlap_update_nr_received(self, info->nr);
+			irlap_resend_rejected_frames(self, RSP_FRAME);
+
+			irlap_start_wd_timer(self, self->wd_timeout);
+
+			/* Keep state */
+			irlap_next_state(self, LAP_NRM_S);
+		} else {
+			IRDA_DEBUG(1, "%s(), invalid nr not implemented!\n",
+				   __func__);
+		}
+		break;
+	case RECV_SNRM_CMD:
+		/* SNRM frame is not allowed to contain an I-field; a NULL
+		 * info is how the frame layer reports "no I-field" */
+		if (!info) {
+			del_timer(&self->wd_timer);
+			IRDA_DEBUG(1, "%s(), received SNRM cmd\n", __func__);
+			irlap_next_state(self, LAP_RESET_CHECK);
+
+			irlap_reset_indication(self);
+		} else {
+			IRDA_DEBUG(0,
+				   "%s(), SNRM frame contained an I-field!\n",
+				   __func__);
+
+		}
+		break;
+	case RECV_REJ_CMD:
+		irlap_update_nr_received(self, info->nr);
+		if (self->remote_busy) {
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_rr_frame(self, RSP_FRAME);
+		} else
+			irlap_resend_rejected_frames(self, RSP_FRAME);
+		irlap_start_wd_timer(self, self->wd_timeout);
+		break;
+	case RECV_SREJ_CMD:
+		irlap_update_nr_received(self, info->nr);
+		if (self->remote_busy) {
+			irlap_wait_min_turn_around(self, &self->qos_tx);
+			irlap_send_rr_frame(self, RSP_FRAME);
+		} else
+			irlap_resend_rejected_frame(self, RSP_FRAME);
+		irlap_start_wd_timer(self, self->wd_timeout);
+		break;
+	case WD_TIMER_EXPIRED:
+		/*
+		 *  Wait until retry_count * n matches negotiated threshold/
+		 *  disconnect time (note 2 in IrLAP p. 82)
+		 *
+		 * Similar to irlap_state_nrm_p() -> FINAL_TIMER_EXPIRED
+		 * Note : self->wd_timeout = (self->final_timeout * 2),
+		 *   which explain why we use (self->N2 / 2) here !!!
+		 * Jean II
+		 */
+		IRDA_DEBUG(1, "%s(), retry_count = %d\n", __func__,
+			   self->retry_count);
+
+		if (self->retry_count < (self->N2 / 2)) {
+			/* No retry, just wait for primary */
+			irlap_start_wd_timer(self, self->wd_timeout);
+			self->retry_count++;
+
+			if((self->retry_count % (self->N1 / 2)) == 0)
+				irlap_status_indication(self,
+							STATUS_NO_ACTIVITY);
+		} else {
+			irlap_apply_default_connection_parameters(self);
+
+			/* Always switch state before calling upper layers */
+			irlap_next_state(self, LAP_NDM);
+			irlap_disconnect_indication(self, LAP_NO_RESPONSE);
+		}
+		break;
+	case RECV_DISC_CMD:
+		/* Always switch state before calling upper layers */
+		irlap_next_state(self, LAP_NDM);
+
+		/* Send disconnect response */
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_ua_response_frame(self, NULL);
+
+		del_timer(&self->wd_timer);
+		irlap_flush_all_queues(self);
+		/* Set default link parameters */
+		irlap_apply_default_connection_parameters(self);
+
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		break;
+	case RECV_DISCOVERY_XID_CMD:
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_rr_frame(self, RSP_FRAME);
+		self->ack_required = TRUE;
+		irlap_start_wd_timer(self, self->wd_timeout);
+		irlap_next_state(self, LAP_NRM_S);
+
+		break;
+	case RECV_TEST_CMD:
+		/* Remove test frame header (only LAP header in NRM) */
+		skb_pull(skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER);
+
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_start_wd_timer(self, self->wd_timeout);
+
+		/* Send response (info will be copied) */
+		irlap_send_test_frame(self, self->caddr, info->daddr, skb);
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __func__,
+			   event, irlap_event[event]);
+
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlap_state_sclose (self, event, skb, info)
+ *
+ *    SCLOSE state handler. A DISC command, one of the listed response
+ *    frames, or expiry of the WD timer moves the link to NDM, restores the
+ *    default connection parameters and signals a disconnect to the upper
+ *    layer. Any other event whose info carries the pf bit re-sends the RD
+ *    frame and re-arms the WD timer.
+ *
+ *    Every path past the assertions returns -1.
+ */
+static int irlap_state_sclose(struct irlap_cb *self, IRLAP_EVENT event,
+			      struct sk_buff *skb, struct irlap_info *info)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -ENODEV;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;);
+
+	switch (event) {
+	case WD_TIMER_EXPIRED:
+		/* The state change always precedes the upper layer call */
+		irlap_next_state(self, LAP_NDM);
+
+		irlap_apply_default_connection_parameters(self);
+
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		return -1;
+
+	/* IrLAP-1.1 p.82: in SCLOSE, S and I type RSP frames take us down
+	 * into the default NDM state, exactly like DM_RSP does
+	 */
+	case RECV_DM_RSP:
+	case RECV_RR_RSP:
+	case RECV_RNR_RSP:
+	case RECV_REJ_RSP:
+	case RECV_SREJ_RSP:
+	case RECV_I_RSP:
+		/* The state change always precedes the upper layer call */
+		irlap_next_state(self, LAP_NDM);
+
+		del_timer(&self->wd_timer);
+		irlap_apply_default_connection_parameters(self);
+
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		return -1;
+
+	case RECV_DISC_CMD:
+		/* The state change always precedes the upper layer call */
+		irlap_next_state(self, LAP_NDM);
+
+		/* Answer the DISC with a UA response */
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_ua_response_frame(self, NULL);
+
+		del_timer(&self->wd_timer);
+		/* Back to the default link parameters */
+		irlap_apply_default_connection_parameters(self);
+
+		irlap_disconnect_indication(self, LAP_DISC_INDICATION);
+		return -1;
+
+	default:
+		break;
+	}
+
+	/* Anything else without the pf bit is simply reported */
+	if (info == NULL || !info->pf) {
+		IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __func__,
+			   event, irlap_event[event]);
+		return -1;
+	}
+
+	/* IrLAP-1.1 p.82: in SCLOSE, basically any received frame with pf=1
+	 * shall restart the wd-timer and resend the rd:rsp; we stay in SCLOSE
+	 */
+	del_timer(&self->wd_timer);
+	irlap_wait_min_turn_around(self, &self->qos_tx);
+	irlap_send_rd_frame(self);
+	irlap_start_wd_timer(self, self->wd_timeout);
+
+	return -1;
+}
+
+/*
+ * Function irlap_state_reset_check (self, event, skb, info)
+ *
+ *    RESET_CHECK state handler. RESET_RESPONSE answers with a UA frame and
+ *    returns to NRM_S; DISCONNECT_REQUEST sends an RD frame and moves to
+ *    SCLOSE. Both return 0, any other event returns -EINVAL.
+ */
+static int irlap_state_reset_check( struct irlap_cb *self, IRLAP_EVENT event,
+				   struct sk_buff *skb,
+				   struct irlap_info *info)
+{
+	IRDA_DEBUG(1, "%s(), event=%s\n", __func__, irlap_event[event]);
+
+	IRDA_ASSERT(self != NULL, return -ENODEV;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;);
+
+	if (event == RESET_RESPONSE) {
+		irlap_send_ua_response_frame(self, &self->qos_rx);
+		irlap_initiate_connection_state(self);
+		irlap_start_wd_timer(self, WD_TIMEOUT);
+		irlap_flush_all_queues(self);
+
+		irlap_next_state(self, LAP_NRM_S);
+		return 0;
+	}
+
+	if (event == DISCONNECT_REQUEST) {
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+		irlap_send_rd_frame(self);
+		irlap_start_wd_timer(self, WD_TIMEOUT);
+		irlap_next_state(self, LAP_SCLOSE);
+		return 0;
+	}
+
+	IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __func__,
+		   event, irlap_event[event]);
+
+	return -EINVAL;
+}
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
new file mode 100644
index 00000000..8c004161
--- /dev/null
+++ b/net/irda/irlap_frame.c
@@ -0,0 +1,1434 @@
+/*********************************************************************
+ *
+ * Filename:      irlap_frame.c
+ * Version:       1.0
+ * Description:   Build and transmit IrLAP frames
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Tue Aug 19 10:27:26 1997
+ * Modified at:   Wed Jan  5 08:59:04 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/irda.h>
+#include <linux/slab.h>
+
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+#include <net/irda/irlap.h>
+#include <net/irda/wrapper.h>
+#include <net/irda/timer.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/qos.h>
+
+static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb,
+			       int command);
+
+/*
+ * Function irlap_insert_info (self, skb)
+ *
+ *    Stamp the per-packet transmit settings (minimum turnaround time,
+ *    speed and XBOF counts) into the skb control block so the device
+ *    driver knows which settings to use, then clear the one-shot values
+ *    kept in self. Inlined because it is only called from one place.
+ */
+static inline void irlap_insert_info(struct irlap_cb *self,
+				     struct sk_buff *skb)
+{
+	struct irda_skb_cb *skb_cb = (struct irda_skb_cb *) skb->cb;
+
+	/* Settings that apply to this packet */
+	skb_cb->magic = LAP_MAGIC;
+	skb_cb->mtt = self->mtt_required;
+	skb_cb->next_speed = self->speed;
+
+	/*
+	 * Negotiated BOFs count, plus the BOFs used to force the negotiated
+	 * minimum turnaround time
+	 */
+	skb_cb->xbofs = self->bofs_count;
+	skb_cb->next_xbofs = self->next_bofs;
+	skb_cb->xbofs_delay = self->xbofs_delay;
+
+	/* MTT and XBOF delay are consumed by the packet just stamped */
+	self->mtt_required = 0;
+	self->xbofs_delay = 0;
+
+	/* The next packet starts out with the pending xbofs value */
+	self->bofs_count = self->next_bofs;
+}
+
+/*
+ * Function irlap_queue_xmit (self, skb)
+ *
+ *    Thin wrapper around dev_queue_xmit() that fills in the fields common
+ *    to every outgoing IrLAP frame. In monitor mode the frame is freed
+ *    instead of being queued.
+ */
+void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb)
+{
+	/* Common setup shared by all transmitted frames */
+	skb->dev = self->netdev;
+	skb->protocol = htons(ETH_P_IRDA);
+	skb->priority = TC_PRIO_BESTEFFORT;
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+
+	irlap_insert_info(self, skb);
+
+	if (!unlikely(self->mode & IRDA_MODE_MONITOR)) {
+		dev_queue_xmit(skb);
+		return;
+	}
+
+	/* Monitor mode: nothing is transmitted */
+	IRDA_DEBUG(3, "%s(): %s is in monitor mode\n", __func__,
+		   self->netdev->name);
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Function irlap_send_snrm_frame (self, qos)
+ *
+ *    Transmits a SNRM command frame. With a non-NULL qos the frame goes to
+ *    the broadcast connection address and carries both device addresses,
+ *    the new connection address and the QoS negotiation parameters;
+ *    without qos only the two header bytes are sent to self->caddr.
+ */
+void irlap_send_snrm_frame(struct irlap_cb *self, struct qos_info *qos)
+{
+	struct snrm_frame *snrm;
+	struct sk_buff *tx_skb;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Room for the full frame plus the negotiation parameters */
+	tx_skb = alloc_skb(sizeof(struct snrm_frame) +
+			   IRLAP_NEGOCIATION_PARAMS_LEN,
+			   GFP_ATOMIC);
+	if (tx_skb == NULL)
+		return;
+
+	snrm = (struct snrm_frame *) skb_put(tx_skb, 2);
+
+	/* Connection address and control field */
+	snrm->caddr = CMD_FRAME | (qos ? CBROADCAST : self->caddr);
+	snrm->control = SNRM_CMD | PF_BIT;
+
+	/* Only a connection setup carries addresses and QoS parameters */
+	if (qos != NULL) {
+		skb_put(tx_skb, 9); /* 25 left */
+		snrm->saddr = cpu_to_le32(self->saddr);
+		snrm->daddr = cpu_to_le32(self->daddr);
+
+		snrm->ncaddr = self->caddr;
+
+		if (irlap_insert_qos_negotiation_params(self, tx_skb) < 0) {
+			dev_kfree_skb(tx_skb);
+			return;
+		}
+	}
+
+	irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_snrm_cmd (skb, info)
+ *
+ *    Received SNRM (Set Normal Response Mode) command frame. A frame too
+ *    short to hold a full snrm_frame is passed on with a NULL info; a full
+ *    frame is passed on only if its new connection address is valid and it
+ *    is addressed to us.
+ */
+static void irlap_recv_snrm_cmd(struct irlap_cb *self, struct sk_buff *skb,
+				struct irlap_info *info)
+{
+	struct snrm_frame *snrm;
+
+	if (!pskb_may_pull(skb, sizeof(struct snrm_frame))) {
+		/* Signal that this SNRM frame does not contain an I-field */
+		irlap_do_event(self, RECV_SNRM_CMD, skb, NULL);
+		return;
+	}
+
+	snrm = (struct snrm_frame *) skb->data;
+
+	/* New connection address, with the C/R bit masked off */
+	info->caddr = snrm->ncaddr & 0xFE;
+
+	/* Reject the two reserved connection addresses */
+	if ((info->caddr == 0x00) || (info->caddr == 0xfe)) {
+		IRDA_DEBUG(3, "%s(), invalid connection address!\n",
+			   __func__);
+		return;
+	}
+
+	/* The peer's source is our destination and vice versa */
+	info->daddr = le32_to_cpu(snrm->saddr);
+	info->saddr = le32_to_cpu(snrm->daddr);
+
+	/* Drop anything that is not addressed directly to us */
+	if (info->saddr != self->saddr) {
+		IRDA_DEBUG(2, "%s(), not addressed to us!\n",
+			   __func__);
+		return;
+	}
+
+	irlap_do_event(self, RECV_SNRM_CMD, skb, info);
+}
+
+/*
+ * Function irlap_send_ua_response_frame (self, qos)
+ *
+ *    Send UA (Unnumbered Acknowledgement) response frame. The QoS
+ *    negotiation parameters are appended only when qos is non-NULL.
+ */
+void irlap_send_ua_response_frame(struct irlap_cb *self, struct qos_info *qos)
+{
+	struct ua_frame *ua;
+	struct sk_buff *tx_skb;
+
+	IRDA_DEBUG(2, "%s() <%ld>\n", __func__, jiffies);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Room for the frame plus the negotiation parameters */
+	tx_skb = alloc_skb(sizeof(struct ua_frame) +
+			   IRLAP_NEGOCIATION_PARAMS_LEN,
+			   GFP_ATOMIC);
+	if (tx_skb == NULL)
+		return;
+
+	ua = (struct ua_frame *) skb_put(tx_skb, 10);
+
+	/* Header and both device addresses */
+	ua->caddr = self->caddr;
+	ua->control = UA_RSP | PF_BIT;
+	ua->saddr = cpu_to_le32(self->saddr);
+	ua->daddr = cpu_to_le32(self->daddr);
+
+	/* Append the QoS negotiation parameters if the caller gave any */
+	if (qos != NULL &&
+	    irlap_insert_qos_negotiation_params(self, tx_skb) < 0) {
+		dev_kfree_skb(tx_skb);
+		return;
+	}
+
+	irlap_queue_xmit(self, tx_skb);
+}
+
+
+/*
+ * Function irlap_send_dm_frame (self)
+ *
+ *    Send disconnected mode (DM) frame. The broadcast connection address
+ *    is used while in the NDM state, self->caddr otherwise.
+ */
+void irlap_send_dm_frame( struct irlap_cb *self)
+{
+	struct dm_frame *dm;
+	struct sk_buff *tx_skb;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	tx_skb = alloc_skb(sizeof(struct dm_frame), GFP_ATOMIC);
+	if (tx_skb == NULL)
+		return;
+
+	dm = (struct dm_frame *)skb_put(tx_skb, 2);
+
+	dm->caddr = (self->state == LAP_NDM) ? CBROADCAST : self->caddr;
+	dm->control = DM_RSP | PF_BIT;
+
+	irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_send_disc_frame (self)
+ *
+ *    Send disconnect (DISC) command frame on the current connection
+ *    address.
+ */
+void irlap_send_disc_frame(struct irlap_cb *self)
+{
+	struct disc_frame *disc;
+	struct sk_buff *tx_skb;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	tx_skb = alloc_skb(sizeof(struct disc_frame), GFP_ATOMIC);
+	if (tx_skb == NULL)
+		return;
+
+	disc = (struct disc_frame *)skb_put(tx_skb, 2);
+
+	disc->control = DISC_CMD | PF_BIT;
+	disc->caddr = CMD_FRAME | self->caddr;
+
+	irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_send_discovery_xid_frame (S, s, command)
+ *
+ *    Build and transmit a XID (eXchange station IDentifier) discovery
+ *    frame. S is the number of slots, s the slot number placed in the
+ *    frame; a non-zero command builds a command frame sent to the
+ *    broadcast address, zero builds a response to discovery->data.daddr.
+ */
+void irlap_send_discovery_xid_frame(struct irlap_cb *self, int S, __u8 s,
+				    __u8 command, discovery_t *discovery)
+{
+	struct xid_frame *xid;
+	struct sk_buff *tx_skb;
+	__u32 bcast = BROADCAST;
+	__u8 *info;
+
+	IRDA_DEBUG(4, "%s(), s=%d, S=%d, command=%d\n", __func__,
+		   s, S, command);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(discovery != NULL, return;);
+
+	tx_skb = alloc_skb(sizeof(struct xid_frame) + IRLAP_DISCOVERY_INFO_LEN,
+			   GFP_ATOMIC);
+	if (tx_skb == NULL)
+		return;
+
+	xid = (struct xid_frame *) skb_put(tx_skb, 14);
+
+	/* Commands are broadcast, responses go back to the discoverer */
+	if (command) {
+		xid->caddr = CBROADCAST | CMD_FRAME;
+		xid->control =  XID_CMD | PF_BIT;
+		xid->daddr = cpu_to_le32(bcast);
+	} else {
+		xid->caddr = CBROADCAST;
+		xid->control =  XID_RSP | PF_BIT;
+		xid->daddr = cpu_to_le32(discovery->data.daddr);
+	}
+	xid->ident = XID_FORMAT;
+	xid->saddr = cpu_to_le32(self->saddr);
+
+	/* Encode the slot count; unknown values are sent as 8 slots */
+	switch (S) {
+	case 1:
+		xid->flags = 0x00;
+		break;
+	case 6:
+		xid->flags = 0x01;
+		break;
+	case 16:
+		xid->flags = 0x03;
+		break;
+	case 8:
+	default:
+		xid->flags = 0x02;
+		break;
+	}
+
+	xid->slotnr = s;
+	xid->version = 0x00;
+
+	/*
+	 *  Discovery info goes into every response, but into commands only
+	 *  for the final slot. The second hint byte is sent only if the
+	 *  EXTENSION bit is set in the first one.
+	 */
+	if (!command || (xid->slotnr == 0xff)) {
+		int hint_len;
+		int len;
+
+		hint_len = (discovery->data.hints[0] & HINT_EXTENSION) ? 2 : 1;
+		info = skb_put(tx_skb, hint_len);
+		memcpy(info, discovery->data.hints, hint_len);
+
+		info = skb_put(tx_skb, 1);
+		info[0] = discovery->data.charset;
+
+		/* The name is cut to whatever tailroom is left */
+		len = IRDA_MIN(discovery->name_len, skb_tailroom(tx_skb));
+		info = skb_put(tx_skb, len);
+		memcpy(info, discovery->data.info, len);
+	}
+	irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_discovery_xid_rsp (skb, info)
+ *
+ *    Received a XID discovery response. The frame is dropped if it is too
+ *    short, not addressed to us (or to broadcast), or if its info field
+ *    does not hold at least the hint byte(s) and the charset byte.
+ *    Otherwise a discovery_t is allocated, filled from the frame, stored
+ *    in info->discovery and handed to irlap_do_event().
+ */
+static void irlap_recv_discovery_xid_rsp(struct irlap_cb *self,
+					 struct sk_buff *skb,
+					 struct irlap_info *info)
+{
+	struct xid_frame *xid;
+	discovery_t *discovery = NULL;
+	__u8 *discovery_info;
+	char *text;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	if (!pskb_may_pull(skb, sizeof(struct xid_frame))) {
+		IRDA_ERROR("%s: frame too short!\n", __func__);
+		return;
+	}
+
+	xid = (struct xid_frame *) skb->data;
+
+	info->daddr = le32_to_cpu(xid->saddr);
+	info->saddr = le32_to_cpu(xid->daddr);
+
+	/* Make sure frame is addressed to us */
+	if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) {
+		IRDA_DEBUG(0, "%s(), frame is not addressed to us!\n",
+			   __func__);
+		return;
+	}
+
+	discovery_info = skb_pull(skb, sizeof(struct xid_frame));
+
+	/*
+	 *  The info field must hold one hint byte and the charset byte,
+	 *  plus a second hint byte when the EXTENSION bit is set. Without
+	 *  this check the reads below would run past the received data.
+	 */
+	if ((discovery_info == NULL) || !pskb_may_pull(skb, 2) ||
+	    ((skb->data[0] & HINT_EXTENSION) && !pskb_may_pull(skb, 3))) {
+		IRDA_ERROR("%s: discovery frame too short!\n", __func__);
+		return;
+	}
+	/* pskb_may_pull() may have moved the data, so look it up again */
+	discovery_info = skb->data;
+
+	/* Zeroed, so every field not set below starts out as 0 */
+	if ((discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC)) == NULL) {
+		IRDA_WARNING("%s: kmalloc failed!\n", __func__);
+		return;
+	}
+
+	discovery->data.daddr = info->daddr;
+	discovery->data.saddr = self->saddr;
+	discovery->timestamp = jiffies;
+
+	IRDA_DEBUG(4, "%s(), daddr=%08x\n", __func__,
+		   discovery->data.daddr);
+
+	/* Get info returned from peer */
+	discovery->data.hints[0] = discovery_info[0];
+	if (discovery_info[0] & HINT_EXTENSION) {
+		IRDA_DEBUG(4, "EXTENSION\n");
+		discovery->data.hints[1] = discovery_info[1];
+		discovery->data.charset = discovery_info[2];
+		text = (char *) &discovery_info[3];
+	} else {
+		discovery->data.hints[1] = 0;
+		discovery->data.charset = discovery_info[1];
+		text = (char *) &discovery_info[2];
+	}
+	/*
+	 *  Terminate info string, should be safe since this is where the
+	 *  FCS bytes resides.
+	 *  NOTE(review): the copy below relies on data.info being larger
+	 *  than NICKNAME_MAX_LEN to stay terminated - confirm in the header.
+	 */
+	skb->data[skb->len] = '\0';
+	strncpy(discovery->data.info, text, NICKNAME_MAX_LEN);
+	discovery->name_len = strlen(discovery->data.info);
+
+	info->discovery = discovery;
+
+	irlap_do_event(self, RECV_DISCOVERY_XID_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_discovery_xid_cmd (skb, info)
+ *
+ *    Received a XID discovery command. Decodes the slot count and slot
+ *    number into info->S and info->s. Only the final frame (slot 0xff)
+ *    carries discovery info; for it a discovery_t is allocated, filled and
+ *    stored in info->discovery, for all other slots info->discovery is
+ *    NULL. Frames that are too short or not addressed to us (or to
+ *    broadcast) are dropped without raising an event.
+ */
+static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self,
+					 struct sk_buff *skb,
+					 struct irlap_info *info)
+{
+	struct xid_frame *xid;
+	discovery_t *discovery = NULL;
+	__u8 *discovery_info;
+	char *text;
+
+	if (!pskb_may_pull(skb, sizeof(struct xid_frame))) {
+		IRDA_ERROR("%s: frame too short!\n", __func__);
+		return;
+	}
+
+	xid = (struct xid_frame *) skb->data;
+
+	info->daddr = le32_to_cpu(xid->saddr);
+	info->saddr = le32_to_cpu(xid->daddr);
+
+	/* Make sure frame is addressed to us */
+	if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) {
+		IRDA_DEBUG(0, "%s(), frame is not addressed to us!\n",
+			   __func__);
+		return;
+	}
+
+	/* The two low bits of flags encode the number of slots */
+	switch (xid->flags & 0x03) {
+	case 0x00:
+		info->S = 1;
+		break;
+	case 0x01:
+		info->S = 6;
+		break;
+	case 0x02:
+		info->S = 8;
+		break;
+	case 0x03:
+		info->S = 16;
+		break;
+	default:
+		/* Error!! */
+		return;
+	}
+	info->s = xid->slotnr;
+
+	discovery_info = skb_pull(skb, sizeof(struct xid_frame));
+
+	/*
+	 *  Check if last frame
+	 */
+	if (info->s == 0xff) {
+		/* Check if things are sane at this point... */
+		if((discovery_info == NULL) ||
+		   !pskb_may_pull(skb, 3)) {
+			IRDA_ERROR("%s: discovery frame too short!\n",
+				   __func__);
+			return;
+		}
+		/* pskb_may_pull() may have moved the data, look it up again */
+		discovery_info = skb->data;
+
+		/*
+		 *  We now have some discovery info to deliver! Allocate it
+		 *  zeroed, like the response path does, so that fields not
+		 *  set below are 0 rather than whatever kmalloc() left there.
+		 *  NOTE(review): string termination still relies on data.info
+		 *  being larger than NICKNAME_MAX_LEN - confirm in the header.
+		 */
+		discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC);
+		if (!discovery) {
+			IRDA_WARNING("%s: unable to malloc!\n", __func__);
+			return;
+		}
+
+		discovery->data.daddr = info->daddr;
+		discovery->data.saddr = self->saddr;
+		discovery->timestamp = jiffies;
+
+		discovery->data.hints[0] = discovery_info[0];
+		if (discovery_info[0] & HINT_EXTENSION) {
+			discovery->data.hints[1] = discovery_info[1];
+			discovery->data.charset = discovery_info[2];
+			text = (char *) &discovery_info[3];
+		} else {
+			discovery->data.hints[1] = 0;
+			discovery->data.charset = discovery_info[1];
+			text = (char *) &discovery_info[2];
+		}
+		/*
+		 *  Terminate string, should be safe since this is where the
+		 *  FCS bytes resides.
+		 */
+		skb->data[skb->len] = '\0';
+		strncpy(discovery->data.info, text, NICKNAME_MAX_LEN);
+		discovery->name_len = strlen(discovery->data.info);
+
+		info->discovery = discovery;
+	} else
+		info->discovery = NULL;
+
+	irlap_do_event(self, RECV_DISCOVERY_XID_CMD, skb, info);
+}
+
+/*
+ * Function irlap_send_rr_frame (self, command)
+ *
+ *    Build and transmit RR (Receive Ready) frame carrying our current Vr.
+ *    A non-zero command sets the command bit in the address byte. The
+ *    poll bit is always set; this function cannot send an RR without it.
+ */
+void irlap_send_rr_frame(struct irlap_cb *self, int command)
+{
+	struct rr_frame *rr;
+	struct sk_buff *tx_skb;
+
+	tx_skb = alloc_skb(sizeof(struct rr_frame), GFP_ATOMIC);
+	if (tx_skb == NULL)
+		return;
+
+	rr = (struct rr_frame *)skb_put(tx_skb, 2);
+
+	rr->caddr = command ? (self->caddr | CMD_FRAME) : self->caddr;
+	rr->control = RR | PF_BIT | (self->vr << 5);
+
+	irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_send_rd_frame (self)
+ *
+ *    Request disconnect. Used by a secondary station to request the
+ *    disconnection of the link. Builds a two byte frame: our connection
+ *    address followed by the RD response control byte with the pf bit set.
+ */
+void irlap_send_rd_frame(struct irlap_cb *self)
+{
+	struct sk_buff *tx_skb;
+	struct rd_frame *frame;
+
+	tx_skb = alloc_skb(sizeof(struct rd_frame), GFP_ATOMIC);
+	if (!tx_skb)
+		return;
+
+	frame = (struct rd_frame *)skb_put(tx_skb, 2);
+
+	frame->caddr = self->caddr;
+	/* Goes into the control byte; writing it to caddr again would wipe
+	 * the address and leave the control byte unset */
+	frame->control = RD_RSP | PF_BIT;
+
+	irlap_queue_xmit(self, tx_skb);
+}
+
+/*
+ * Function irlap_recv_rr_frame (skb, info)
+ *
+ *    Received RR (Receive Ready) frame from peer station. Extracts Nr from
+ *    the top three bits of the control byte and raises the command or the
+ *    response event. Inlined since it is called from one single place
+ *    (irlap_driver_rcv).
+ */
+static inline void irlap_recv_rr_frame(struct irlap_cb *self,
+				       struct sk_buff *skb,
+				       struct irlap_info *info, int command)
+{
+	info->nr = skb->data[1] >> 5;
+
+	/* Command and response frames map to different events */
+	irlap_do_event(self, command ? RECV_RR_CMD : RECV_RR_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_rnr_frame (self, skb, info, command)
+ *
+ *    Handle an RNR (Receive Not Ready) frame from the peer station:
+ *    record Nr and raise the command or response flavour of the event.
+ */
+static void irlap_recv_rnr_frame(struct irlap_cb *self, struct sk_buff *skb,
+				 struct irlap_info *info, int command)
+{
+	__u8 ctrl = skb->data[1];
+
+	info->nr = ctrl >> 5;
+
+	IRDA_DEBUG(4, "%s(), nr=%d, %ld\n", __func__, info->nr, jiffies);
+
+	irlap_do_event(self, command ? RECV_RNR_CMD : RECV_RNR_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_rej_frame (self, skb, info, command)
+ *
+ *    Handle a REJ frame from the peer station: record Nr and raise the
+ *    command or response flavour of the event.
+ */
+static void irlap_recv_rej_frame(struct irlap_cb *self, struct sk_buff *skb,
+				 struct irlap_info *info, int command)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__);
+
+	/* Nr sits above bit 4 of the control byte */
+	info->nr = skb->data[1] >> 5;
+
+	irlap_do_event(self, command ? RECV_REJ_CMD : RECV_REJ_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_srej_frame (self, skb, info, command)
+ *
+ *    Handle an SREJ frame from the peer station: record Nr and raise the
+ *    command or response flavour of the event.
+ */
+static void irlap_recv_srej_frame(struct irlap_cb *self, struct sk_buff *skb,
+				  struct irlap_info *info, int command)
+{
+	IRDA_DEBUG(0, "%s()\n", __func__);
+
+	/* Nr sits above bit 4 of the control byte */
+	info->nr = skb->data[1] >> 5;
+
+	irlap_do_event(self, command ? RECV_SREJ_CMD : RECV_SREJ_RSP,
+		       skb, info);
+}
+
+/*
+ * Function irlap_recv_disc_frame (self, skb, info, command)
+ *
+ *    Called for the control value shared by DISC and RD. A command frame
+ *    is reported as RECV_DISC_CMD, a response frame as RECV_RD_RSP.
+ */
+static void irlap_recv_disc_frame(struct irlap_cb *self, struct sk_buff *skb,
+				  struct irlap_info *info, int command)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	if (!command) {
+		irlap_do_event(self, RECV_RD_RSP, skb, info);
+		return;
+	}
+
+	irlap_do_event(self, RECV_DISC_CMD, skb, info);
+}
+
+/*
+ * Function irlap_recv_ua_frame (self, skb, info)
+ *
+ *    Received UA (Unnumbered Acknowledgement) frame. No fields are parsed
+ *    here; the frame is handed to the state machine as RECV_UA_RSP.
+ *
+ */
+static inline void irlap_recv_ua_frame(struct irlap_cb *self,
+				       struct sk_buff *skb,
+				       struct irlap_info *info)
+{
+	irlap_do_event(self, RECV_UA_RSP, skb, info);
+}
+
+/*
+ * Function irlap_send_data_primary(self, skb)
+ *
+ *    Transmit one frame as the primary station, poll bit left clear.
+ *    A reliable (I) frame is stamped with Vs, parked on wx_list for
+ *    possible retransmission, and a clone of it is sent. Anything else
+ *    goes out as an unreliable UI frame. One window slot is consumed for
+ *    each frame actually sent.
+ */
+void irlap_send_data_primary(struct irlap_cb *self, struct sk_buff *skb)
+{
+	struct sk_buff *clone;
+
+	/* Unreliable data: send the buffer itself, with an extra reference */
+	if (skb->data[1] != I_FRAME) {
+		IRDA_DEBUG(4, "%s(), sending unreliable frame\n", __func__);
+		irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME);
+		self->window--;
+		return;
+	}
+
+	/* Put the send sequence number (Vs) into the control field */
+	skb->data[1] = I_FRAME | (self->vs << 1);
+
+	/*
+	 *  Keep the original on the retransmit queue. It needs its own
+	 *  reference for that, see irlap_do_event()
+	 */
+	skb_get(skb);
+	skb_queue_tail(&self->wx_list, skb);
+
+	/* What gets transmitted is a clone of the stored frame */
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (!clone)
+		return;
+
+	self->vs = (self->vs + 1) % 8;
+	self->ack_required = FALSE;
+	self->window--;
+
+	irlap_send_i_frame(self, clone, CMD_FRAME);
+}
+/*
+ * Function irlap_send_data_primary_poll (self, skb)
+ *
+ *    Send I(nformation) frame as primary with poll bit set. Stops the poll
+ *    timer, switches to LAP_NRM_P, refills the transmit window and starts
+ *    the final timer. Data that is not marked I_FRAME goes out as a UI
+ *    frame instead.
+ */
+void irlap_send_data_primary_poll(struct irlap_cb *self, struct sk_buff *skb)
+{
+	struct sk_buff *tx_skb;
+	int transmission_time;
+
+	/* Stop P timer */
+	del_timer(&self->poll_timer);
+
+	/* Is this reliable or unreliable data? */
+	if (skb->data[1] == I_FRAME) {
+
+		/*
+		 *  Insert frame sequence number (Vs) in control field before
+		 *  inserting into transmit window queue.
+		 */
+		skb->data[1] = I_FRAME | (self->vs << 1);
+
+		/*
+		 *  Insert frame in store, in case of retransmissions
+		 *  Increase skb reference count, see irlap_do_event()
+		 */
+		skb_get(skb);
+		skb_queue_tail(&self->wx_list, skb);
+
+		/* Copy buffer */
+		tx_skb = skb_clone(skb, GFP_ATOMIC);
+		if (tx_skb == NULL) {
+			/* Leaves with the poll timer stopped and the final
+			 * timer not started; the frame stays on wx_list. */
+			return;
+		}
+
+		/*
+		 *  Set poll bit if necessary. We do this to the copied
+		 *  skb, since retransmitted frames need to set or clear the
+		 *  poll bit depending on when they are sent.
+		 */
+		tx_skb->data[1] |= PF_BIT;
+
+		self->vs = (self->vs + 1) % 8;
+		self->ack_required = FALSE;
+
+		irlap_next_state(self, LAP_NRM_P);
+		irlap_send_i_frame(self, tx_skb, CMD_FRAME);
+	} else {
+		IRDA_DEBUG(4, "%s(), sending unreliable frame\n", __func__);
+
+		if (self->ack_required) {
+			/* UI frame first, then an RR frame (which has the
+			 * poll bit set) */
+			irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME);
+			irlap_next_state(self, LAP_NRM_P);
+			irlap_send_rr_frame(self, CMD_FRAME);
+			self->ack_required = FALSE;
+		} else {
+			/* The UI frame itself carries the poll bit */
+			skb->data[1] |= PF_BIT;
+			irlap_next_state(self, LAP_NRM_P);
+			irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME);
+		}
+	}
+
+	/* How much time we took for transmission of all frames.
+	 * We don't know, so let's assume we used the full window. Jean II */
+	transmission_time = self->final_timeout;
+
+	/* Reset parameter so that we can fill next window */
+	self->window = self->window_size;
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+	/* Remove what we have not used. Just do a prorata of the
+	 * bytes left in window to window capacity.
+	 * See max_line_capacities[][] in qos.c for details. Jean II */
+	transmission_time -= (self->final_timeout * self->bytes_left
+			      / self->line_capacity);
+	IRDA_DEBUG(4, "%s() adjusting transmission_time : ft=%d, bl=%d, lc=%d -> tt=%d\n", __func__, self->final_timeout, self->bytes_left, self->line_capacity, transmission_time);
+
+	/* We are allowed to transmit a maximum number of bytes again. */
+	self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+	/*
+	 * The network layer has an intermediate buffer between IrLAP
+	 * and the IrDA driver which can contain 8 frames. So, even
+	 * though IrLAP is currently sending the *last* frame of the
+	 * tx-window, the driver most likely has only just started
+	 * sending the *first* frame of the same tx-window.
+	 * I.e. we are always at the very beginning of our Tx window.
+	 * Now, we are supposed to set the final timer from the end
+	 * of our tx-window to let the other peer reply. So, we need
+	 * to add extra time to compensate for the fact that we
+	 * are really at the start of tx-window, otherwise the final timer
+	 * might expire before he can answer...
+	 * Jean II
+	 */
+	irlap_start_final_timer(self, self->final_timeout + transmission_time);
+
+	/*
+	 * The clever amongst you might ask why we do this adjustment
+	 * only here, and not in all the other cases in irlap_event.c.
+	 * In all those other cases, we only send a very short management
+	 * frame (few bytes), so the adjustment would be lost in the
+	 * noise...
+	 * The exception of course is irlap_resend_rejected_frame().
+	 * Jean II */
+}
+
+/*
+ * Function irlap_send_data_secondary_final (self, skb)
+ *
+ *    Send a frame as the secondary station with the final bit set, then
+ *    refill the transmit window and start the WD timer. A reliable frame
+ *    is stored on wx_list and a clone of it is transmitted; unreliable
+ *    data is sent as a UI frame.
+ *
+ */
+void irlap_send_data_secondary_final(struct irlap_cb *self,
+				     struct sk_buff *skb)
+{
+	struct sk_buff *clone = NULL;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	if (skb->data[1] != I_FRAME) {
+		/* Unreliable data */
+		if (!self->ack_required) {
+			/* The UI frame itself carries the final bit */
+			skb->data[1] |= PF_BIT;
+			irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME);
+		} else {
+			/* UI frame first, followed by an RR frame */
+			irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME);
+			irlap_send_rr_frame(self, RSP_FRAME);
+			self->ack_required = FALSE;
+		}
+	} else {
+		/* Reliable data: stamp Vs into the control field */
+		skb->data[1] = I_FRAME | (self->vs << 1);
+
+		/*
+		 *  Store the original for retransmission; the queue holds
+		 *  its own reference, see irlap_do_event()
+		 */
+		skb_get(skb);
+		skb_queue_tail(&self->wx_list, skb);
+
+		clone = skb_clone(skb, GFP_ATOMIC);
+		if (!clone)
+			return;
+
+		/* Only the transmitted clone gets the final bit */
+		clone->data[1] |= PF_BIT;
+
+		self->vs = (self->vs + 1) % 8;
+		self->ack_required = FALSE;
+
+		irlap_send_i_frame(self, clone, RSP_FRAME);
+	}
+
+	/* A fresh window is available from here on */
+	self->window = self->window_size;
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+	/* We are allowed to transmit a maximum number of bytes again. */
+	self->bytes_left = self->line_capacity;
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+	irlap_start_wd_timer(self, self->wd_timeout);
+}
+
+/*
+ * Function irlap_send_data_secondary (self, skb)
+ *
+ *    Send a frame as the secondary station without the final bit set.
+ *    A reliable (I) frame is stamped with Vs, stored on wx_list and sent
+ *    as a clone; anything else goes out as a UI frame. One window slot
+ *    is consumed for each frame actually sent.
+ *
+ */
+void irlap_send_data_secondary(struct irlap_cb *self, struct sk_buff *skb)
+{
+	struct sk_buff *clone = NULL;
+
+	/* Unreliable data: send the buffer itself, with an extra reference */
+	if (skb->data[1] != I_FRAME) {
+		irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME);
+		self->window--;
+		return;
+	}
+
+	/* Put the send sequence number (Vs) into the control field */
+	skb->data[1] = I_FRAME | (self->vs << 1);
+
+	/*
+	 *  Store the original for retransmission; the queue holds its own
+	 *  reference, see irlap_do_event()
+	 */
+	skb_get(skb);
+	skb_queue_tail(&self->wx_list, skb);
+
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (!clone)
+		return;
+
+	self->vs = (self->vs + 1) % 8;
+	self->ack_required = FALSE;
+	self->window--;
+
+	irlap_send_i_frame(self, clone, RSP_FRAME);
+}
+
+/*
+ * Function irlap_resend_rejected_frames (self, command)
+ *
+ *    Resend every frame still held on wx_list, i.e. frames which have not
+ *    been acknowledged. Each frame is copied, sent with the current Vr,
+ *    and only the last one carries the poll/final bit. Should be safe to
+ *    traverse the list without locking it since this function will only be
+ *    called from interrupt context (BH)
+ */
+void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
+{
+	struct sk_buff *tx_skb;
+	struct sk_buff *skb;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/*  Resend unacknowledged frame(s) */
+	skb_queue_walk(&self->wx_list, skb) {
+		irlap_wait_min_turn_around(self, &self->qos_tx);
+
+		/* We copy the skb to be retransmitted since we will have to
+		 * modify it. Cloning will confuse packet sniffers
+		 */
+		/* tx_skb = skb_clone( skb, GFP_ATOMIC); */
+		tx_skb = skb_copy(skb, GFP_ATOMIC);
+		if (!tx_skb) {
+			/* Gives up on the remaining frames as well */
+			IRDA_DEBUG(0, "%s(), unable to copy\n", __func__);
+			return;
+		}
+
+		/* Clear old Nr field + poll bit */
+		tx_skb->data[1] &= 0x0f;
+
+		/*
+		 *  Set poll bit on the last frame retransmitted
+		 */
+		if (skb_queue_is_last(&self->wx_list, skb))
+			tx_skb->data[1] |= PF_BIT; /* Set p/f bit */
+		else
+			tx_skb->data[1] &= ~PF_BIT; /* Clear p/f bit */
+
+		/* irlap_send_i_frame() inserts the address and current Vr */
+		irlap_send_i_frame(self, tx_skb, command);
+	}
+#if 0 /* Not yet */
+	/*
+	 *  We can now fill the window with additional data frames
+	 */
+	while (!skb_queue_empty(&self->txq)) {
+
+		IRDA_DEBUG(0, "%s(), sending additional frames!\n", __func__);
+		if (self->window > 0) {
+			skb = skb_dequeue( &self->txq);
+			IRDA_ASSERT(skb != NULL, return;);
+
+			/*
+			 *  If send window > 1 then send frame with pf
+			 *  bit cleared
+			 */
+			if ((self->window > 1) &&
+			    !skb_queue_empty(&self->txq)) {
+				irlap_send_data_primary(self, skb);
+			} else {
+				irlap_send_data_primary_poll(self, skb);
+			}
+			kfree_skb(skb);
+		}
+	}
+#endif
+}
+
+/*
+ * Function irlap_resend_rejected_frame (self, command)
+ *
+ *    Retransmit only the frame at the head of wx_list, with the
+ *    poll/final bit set. Does nothing if the list is empty.
+ */
+void irlap_resend_rejected_frame(struct irlap_cb *self, int command)
+{
+	struct sk_buff *head;
+	struct sk_buff *copy;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	/* Oldest unacknowledged frame, left on the queue */
+	head = skb_peek(&self->wx_list);
+	if (head == NULL)
+		return;
+
+	irlap_wait_min_turn_around(self, &self->qos_tx);
+
+	/* The frame is modified before sending, so work on a private copy.
+	 * Cloning will confuse packet sniffers
+	 */
+	copy = skb_copy(head, GFP_ATOMIC);
+	if (copy == NULL) {
+		IRDA_DEBUG(0, "%s(), unable to copy\n", __func__);
+		return;
+	}
+
+	/* Drop the old Nr field and poll bit, then set poll/final again */
+	copy->data[1] = (copy->data[1] & 0x0f) | PF_BIT;
+
+	irlap_send_i_frame(self, copy, command);
+}
+
+/*
+ * Function irlap_send_ui_frame (self, skb, caddr, command)
+ *
+ *    Construct and transmit an Unnumbered Information (UI) frame. Only the
+ *    address byte is written here; the rest of the buffer is sent as is.
+ *
+ */
+void irlap_send_ui_frame(struct irlap_cb *self, struct sk_buff *skb,
+			 __u8 caddr, int command)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* Connection address, with the C/R bit added for command frames */
+	if (command)
+		caddr |= CMD_FRAME;
+	skb->data[0] = caddr;
+
+	irlap_queue_xmit(self, skb);
+}
+
+/*
+ * Function irlap_send_i_frame (self, skb, command)
+ *
+ *    Construct and transmit an Information (I) frame: fill in the address
+ *    byte, merge our Vr into the control byte and queue the buffer.
+ */
+static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb,
+			       int command)
+{
+	/* Connection address, with the C/R bit added for command frames */
+	skb->data[0] = command ? (self->caddr | CMD_FRAME) : self->caddr;
+
+	/* Next frame we expect to receive (Vr) goes into the Nr field */
+	skb->data[1] |= (self->vr << 5);
+
+	irlap_queue_xmit(self, skb);
+}
+
+/*
+ * Function irlap_recv_i_frame (self, skb, info, command)
+ *
+ *    Parse the control byte of a received I (Information) frame into
+ *    info (nr, pf, ns) and raise the matching event. Inline, since
+ *    irlap_driver_rcv is the only caller.
+ */
+static inline void irlap_recv_i_frame(struct irlap_cb *self,
+				      struct sk_buff *skb,
+				      struct irlap_info *info, int command)
+{
+	__u8 ctrl = skb->data[1];
+
+	info->nr = ctrl >> 5;          /* Next to receive */
+	info->pf = ctrl & PF_BIT;      /* Final bit */
+	info->ns = (ctrl >> 1) & 0x07; /* Next to send */
+
+	/* Command and response frames raise different events */
+	irlap_do_event(self, command ? RECV_I_CMD : RECV_I_RSP, skb, info);
+}
+
+/*
+ * Function irlap_recv_ui_frame (self, skb, info)
+ *
+ *    Receive and parse an Unnumbered Information (UI) frame. Only the
+ *    poll/final bit is extracted before the frame is passed to the state
+ *    machine as RECV_UI_FRAME.
+ *
+ */
+static void irlap_recv_ui_frame(struct irlap_cb *self, struct sk_buff *skb,
+				struct irlap_info *info)
+{
+	IRDA_DEBUG( 4, "%s()\n", __func__);
+
+	info->pf = skb->data[1] & PF_BIT;      /* Final bit */
+
+	irlap_do_event(self, RECV_UI_FRAME, skb, info);
+}
+
+/*
+ * Function irlap_recv_frmr_frame (self, skb, info)
+ *
+ *    Received Frame Reject response. Needs at least four bytes: byte 2 is
+ *    decoded into nr/pf/ns and the low four bits of byte 3 are reported
+ *    as reject reasons. Too short frames are dropped without an event.
+ *
+ */
+static void irlap_recv_frmr_frame(struct irlap_cb *self, struct sk_buff *skb,
+				  struct irlap_info *info)
+{
+	__u8 *frame;
+	__u8 ctrl;
+	__u8 reasons;
+
+	IRDA_DEBUG(0, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(info != NULL, return;);
+
+	if (!pskb_may_pull(skb, 4)) {
+		IRDA_ERROR("%s: frame too short!\n", __func__);
+		return;
+	}
+
+	/* Only look at the data once the pull has succeeded */
+	frame = skb->data;
+	ctrl = frame[2];
+	reasons = frame[3];
+
+	info->nr = ctrl >> 5;          /* Next to receive */
+	info->pf = ctrl & PF_BIT;      /* Final bit */
+	info->ns = (ctrl >> 1) & 0x07; /* Next to send */
+
+	/* One debug message per reason bit that is set */
+	if (reasons & 0x01) {
+		IRDA_DEBUG(0, "Rejected control field is undefined or not "
+		      "implemented.\n");
+	}
+	if (reasons & 0x02) {
+		IRDA_DEBUG(0, "Rejected control field was invalid because it "
+		      "contained a non permitted I field.\n");
+	}
+	if (reasons & 0x04) {
+		IRDA_DEBUG(0, "Received I field exceeded the maximum negotiated "
+		      "for the existing connection or exceeded the maximum "
+		      "this station supports if no connection exists.\n");
+	}
+	if (reasons & 0x08) {
+		IRDA_DEBUG(0, "Rejected control field control field contained an "
+		      "invalid Nr count.\n");
+	}
+
+	irlap_do_event(self, RECV_FRMR_RSP, skb, info);
+}
+
+/*
+ * Function irlap_send_test_frame (self, caddr, daddr, cmd)
+ *
+ *    Send a test frame response. The payload of cmd is copied behind the
+ *    header; broadcast responses also carry the source and destination
+ *    device addresses. Nothing is sent if the allocation fails.
+ *
+ */
+void irlap_send_test_frame(struct irlap_cb *self, __u8 caddr, __u32 daddr,
+			   struct sk_buff *cmd)
+{
+	struct test_frame *hdr;
+	struct sk_buff *rsp;
+
+	/* Room for the largest header plus the echoed payload */
+	rsp = alloc_skb(cmd->len + sizeof(struct test_frame), GFP_ATOMIC);
+	if (rsp == NULL)
+		return;
+
+	if (caddr != CBROADCAST) {
+		/* Short header: address and control bytes only */
+		hdr = (struct test_frame *) skb_put(rsp, LAP_ADDR_HEADER + LAP_CTRL_HEADER);
+	} else {
+		/* Broadcast frames must include saddr and daddr fields */
+		hdr = (struct test_frame *)
+			skb_put(rsp, sizeof(struct test_frame));
+
+		/* Insert the swapped addresses */
+		hdr->saddr = cpu_to_le32(self->saddr);
+		hdr->daddr = cpu_to_le32(daddr);
+	}
+
+	hdr->caddr = caddr;
+	hdr->control = TEST_RSP | PF_BIT;
+
+	/* Echo the information field of the command */
+	memcpy(skb_put(rsp, cmd->len), cmd->data, cmd->len);
+
+	/* Return to sender */
+	irlap_wait_min_turn_around(self, &self->qos_tx);
+	irlap_queue_xmit(self, rsp);
+}
+
+/*
+ * Function irlap_recv_test_frame (self, skb, info, command)
+ *
+ *    Receive a test frame. For broadcast frames the device addresses are
+ *    read (swapped) into info and frames not addressed to us are ignored.
+ *    Everything else is passed on as RECV_TEST_CMD or RECV_TEST_RSP.
+ *
+ */
+static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb,
+				  struct irlap_info *info, int command)
+{
+	struct test_frame *tf;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	if (!pskb_may_pull(skb, sizeof(struct test_frame))) {
+		IRDA_ERROR("%s: frame too short!\n", __func__);
+		return;
+	}
+	tf = (struct test_frame *) skb->data;
+
+	/* Broadcast frames must carry saddr and daddr fields */
+	if (info->caddr == CBROADCAST) {
+		if (skb->len < sizeof(struct test_frame)) {
+			IRDA_DEBUG(0, "%s() test frame too short!\n",
+				   __func__);
+			return;
+		}
+
+		/* The sender's source is our destination and vice versa */
+		info->daddr = le32_to_cpu(tf->saddr);
+		info->saddr = le32_to_cpu(tf->daddr);
+
+		/* Ignore frames meant for some other device */
+		if (info->saddr != self->saddr && info->saddr != BROADCAST)
+			return;
+	}
+
+	irlap_do_event(self, command ? RECV_TEST_CMD : RECV_TEST_RSP,
+		       skb, info);
+}
+
+/*
+ * Function irlap_driver_rcv (skb, netdev, ptype, orig_dev)
+ *
+ *    Called when a frame is received. Dispatches the right receive function
+ *    for processing of the frame.
+ *
+ *    Returns 0 when the frame was handled or deliberately ignored, and -1
+ *    when it could not be processed (no IrLAP instance, shared skb could
+ *    not be cloned, or frame shorter than two bytes).
+ *
+ * Note on skb management :
+ * After calling the higher layers of the IrDA stack, we always
+ * kfree() the skb, which drops the reference count (and potentially
+ * destroys it).
+ * If a higher layer of the stack wants to keep the skb around (to put
+ * in a queue or pass it to the higher layer), it will need to use
+ * skb_get() to keep a reference on it. This is usually done at the
+ * LMP level in irlmp.c.
+ * Jean II
+ */
+int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
+		     struct packet_type *ptype, struct net_device *orig_dev)
+{
+	struct irlap_info info;
+	struct irlap_cb *self;
+	int command;
+	__u8 control;
+	int ret = -1;
+
+	/* Frames from any namespace other than init_net are dropped */
+	if (!net_eq(dev_net(dev), &init_net))
+		goto out;
+
+	/* FIXME: should we get our own field? */
+	self = (struct irlap_cb *) dev->atalk_ptr;
+
+	/* If the net device is down, then IrLAP is gone! */
+	if (!self || self->magic != LAP_MAGIC)
+		goto err;
+
+	/* We are no longer an "old" protocol, so we need to handle
+	 * shared and non linear skbs. This should never happen, so
+	 * we don't need to be clever about it. Jean II */
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+		IRDA_ERROR("%s: can't clone shared skb!\n", __func__);
+		goto err;
+	}
+
+	/* Check if frame is large enough for parsing */
+	if (!pskb_may_pull(skb, 2)) {
+		IRDA_ERROR("%s: frame too short!\n", __func__);
+		goto err;
+	}
+
+	/* Byte 0: connection address and command/response bit */
+	command    = skb->data[0] & CMD_FRAME;
+	info.caddr = skb->data[0] & CBROADCAST;
+
+	/* Byte 1: control field, split into P/F bit and the rest */
+	info.pf      = skb->data[1] &  PF_BIT;
+	info.control = skb->data[1] & ~PF_BIT; /* Mask away poll/final bit */
+
+	control = info.control;
+
+	/*  First we check if this frame has a valid connection address */
+	if ((info.caddr != self->caddr) && (info.caddr != CBROADCAST)) {
+		IRDA_DEBUG(0, "%s(), wrong connection address!\n",
+			   __func__);
+		goto out;
+	}
+	/*
+	 *  Optimize for the common case and check if the frame is an
+	 *  I(nformation) frame. Only I-frames have bit 0 set to 0
+	 */
+	if (~control & 0x01) {
+		irlap_recv_i_frame(self, skb, &info, command);
+		goto out;
+	}
+	/*
+	 *  We now check if the frame is an S(upervisory) frame. Only
+	 *  S-frames have bit 0 set to 1 and bit 1 set to 0
+	 */
+	if (~control & 0x02) {
+		/*
+		 *  Received S(upervisory) frame, check which frame type it is
+		 *  only the first nibble is of interest
+		 */
+		switch (control & 0x0f) {
+		case RR:
+			irlap_recv_rr_frame(self, skb, &info, command);
+			break;
+		case RNR:
+			irlap_recv_rnr_frame(self, skb, &info, command);
+			break;
+		case REJ:
+			irlap_recv_rej_frame(self, skb, &info, command);
+			break;
+		case SREJ:
+			irlap_recv_srej_frame(self, skb, &info, command);
+			break;
+		default:
+			IRDA_WARNING("%s: Unknown S-frame %02x received!\n",
+				__func__, info.control);
+			break;
+		}
+		goto out;
+	}
+	/*
+	 *  This must be a C(ontrol) frame
+	 */
+	switch (control) {
+	case XID_RSP:
+		irlap_recv_discovery_xid_rsp(self, skb, &info);
+		break;
+	case XID_CMD:
+		irlap_recv_discovery_xid_cmd(self, skb, &info);
+		break;
+	case SNRM_CMD:
+		irlap_recv_snrm_cmd(self, skb, &info);
+		break;
+	case DM_RSP:
+		irlap_do_event(self, RECV_DM_RSP, skb, &info);
+		break;
+	case DISC_CMD: /* And RD_RSP since they have the same value */
+		irlap_recv_disc_frame(self, skb, &info, command);
+		break;
+	case TEST_CMD:
+		irlap_recv_test_frame(self, skb, &info, command);
+		break;
+	case UA_RSP:
+		irlap_recv_ua_frame(self, skb, &info);
+		break;
+	case FRMR_RSP:
+		irlap_recv_frmr_frame(self, skb, &info);
+		break;
+	case UI_FRAME:
+		irlap_recv_ui_frame(self, skb, &info);
+		break;
+	default:
+		IRDA_WARNING("%s: Unknown frame %02x received!\n",
+				__func__, info.control);
+		break;
+	}
+out:
+	/* Handled or ignored: report success to the caller */
+	ret = 0;
+err:
+	/* Always drop our reference on the skb */
+	dev_kfree_skb(skb);
+	return ret;
+}
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
new file mode 100644
index 00000000..6115a44c
--- /dev/null
+++ b/net/irda/irlmp.c
@@ -0,0 +1,2025 @@
+/*********************************************************************
+ *
+ * Filename:      irlmp.c
+ * Version:       1.0
+ * Description:   IrDA Link Management Protocol (LMP) layer
+ * Status:        Stable.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 17 20:54:32 1997
+ * Modified at:   Wed Jan  5 11:26:03 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/timer.h>
+#include <net/irda/qos.h>
+#include <net/irda/irlap.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+
+#include <asm/unaligned.h>
+
+/* Helpers used by irlmp_open_lsap() to pick or validate a source LSAP */
+static __u8 irlmp_find_free_slsap(void);
+static int irlmp_slsap_inuse(__u8 slsap_sel);
+
+/* Master structure, allocated by irlmp_init() and freed by irlmp_cleanup() */
+struct irlmp_cb *irlmp = NULL;
+
+/* These can be altered by the sysctl interface */
+int  sysctl_discovery         = 0; /* non-zero: irlmp_init() starts the discovery timer */
+int  sysctl_discovery_timeout = 3; /* 3 seconds by default */
+int  sysctl_discovery_slots   = 6; /* 6 slots by default */
+int  sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ;
+char sysctl_devname[65]; /* set to "Linux" by irlmp_init() */
+
+/*
+ * Printable names for disconnect reasons.
+ * NOTE(review): presumably indexed by the LM_* reason codes, with the first
+ * and last entries as out-of-range placeholders -- confirm against users.
+ */
+const char *irlmp_reasons[] = {
+	"ERROR, NOT USED",
+	"LM_USER_REQUEST",
+	"LM_LAP_DISCONNECT",
+	"LM_CONNECT_FAILURE",
+	"LM_LAP_RESET",
+	"LM_INIT_DISCONNECT",
+	"ERROR, NOT USED",
+};
+
+/*
+ * Function irlmp_init (void)
+ *
+ *    Create (allocate) the main IrLMP structure and its hashbins.
+ *
+ *    Returns 0 on success or -ENOMEM if any allocation fails. On failure
+ *    everything allocated so far is released and irlmp is left NULL, so
+ *    nothing is leaked.
+ */
+int __init irlmp_init(void)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__);
+	/* Initialize the irlmp structure. */
+	irlmp = kzalloc( sizeof(struct irlmp_cb), GFP_KERNEL);
+	if (irlmp == NULL)
+		return -ENOMEM;
+
+	irlmp->magic = LMP_MAGIC;
+
+	irlmp->clients = hashbin_new(HB_LOCK);
+	irlmp->services = hashbin_new(HB_LOCK);
+	irlmp->links = hashbin_new(HB_LOCK);
+	irlmp->unconnected_lsaps = hashbin_new(HB_LOCK);
+	irlmp->cachelog = hashbin_new(HB_NOLOCK);
+
+	if ((irlmp->clients == NULL) ||
+	    (irlmp->services == NULL) ||
+	    (irlmp->links == NULL) ||
+	    (irlmp->unconnected_lsaps == NULL) ||
+	    (irlmp->cachelog == NULL))
+		goto err_free;
+
+	/* cachelog is created HB_NOLOCK; its spinlock is taken explicitly */
+	spin_lock_init(&irlmp->cachelog->hb_spinlock);
+
+	irlmp->last_lsap_sel = 0x0f; /* Reserved 0x00-0x0f */
+	strcpy(sysctl_devname, "Linux");
+
+	init_timer(&irlmp->discovery_timer);
+
+	/* Do periodic discovery (3 seconds by default), conditionally */
+	if (sysctl_discovery)
+		irlmp_start_discovery_timer(irlmp,
+					    sysctl_discovery_timeout*HZ);
+
+	return 0;
+
+err_free:
+	/*
+	 *  Some hashbins may exist while others do not; release only the
+	 *  ones that were created, then the master structure itself.
+	 */
+	if (irlmp->clients)
+		hashbin_delete(irlmp->clients, (FREE_FUNC) kfree);
+	if (irlmp->services)
+		hashbin_delete(irlmp->services, (FREE_FUNC) kfree);
+	if (irlmp->links)
+		hashbin_delete(irlmp->links, (FREE_FUNC) kfree);
+	if (irlmp->unconnected_lsaps)
+		hashbin_delete(irlmp->unconnected_lsaps, (FREE_FUNC) kfree);
+	if (irlmp->cachelog)
+		hashbin_delete(irlmp->cachelog, (FREE_FUNC) kfree);
+
+	irlmp->magic = 0;
+	kfree(irlmp);
+	irlmp = NULL;
+
+	return -ENOMEM;
+}
+
+/*
+ * Function irlmp_cleanup (void)
+ *
+ *    Remove IrLMP layer: stop the discovery timer, delete all five
+ *    hashbins (freeing their entries with kfree) and release the master
+ *    structure. Does nothing if irlmp is not set up.
+ *
+ */
+void irlmp_cleanup(void)
+{
+	/* Check for main structure */
+	IRDA_ASSERT(irlmp != NULL, return;);
+	IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;);
+
+	del_timer(&irlmp->discovery_timer);
+
+	hashbin_delete(irlmp->links, (FREE_FUNC) kfree);
+	hashbin_delete(irlmp->unconnected_lsaps, (FREE_FUNC) kfree);
+	hashbin_delete(irlmp->clients, (FREE_FUNC) kfree);
+	hashbin_delete(irlmp->services, (FREE_FUNC) kfree);
+	hashbin_delete(irlmp->cachelog, (FREE_FUNC) kfree);
+
+	/* De-allocate main structure */
+	kfree(irlmp);
+	/* Clear the global so the asserts above catch any later use */
+	irlmp = NULL;
+}
+
+/*
+ * Function irlmp_open_lsap (slsap_sel, notify, pid)
+ *
+ *   Register with IrLMP and create a local LSAP. The new LSAP starts out
+ *   disconnected on the list of unconnected LSAPs. Returns a handle to
+ *   it, or NULL if no selector is available, the requested selector is
+ *   in use, or memory could not be allocated.
+ */
+struct lsap_cb *irlmp_open_lsap(__u8 slsap_sel, notify_t *notify, __u8 pid)
+{
+	struct lsap_cb *lsap;
+
+	IRDA_ASSERT(notify != NULL, return NULL;);
+	IRDA_ASSERT(irlmp != NULL, return NULL;);
+	IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return NULL;);
+	IRDA_ASSERT(notify->instance != NULL, return NULL;);
+
+	if (slsap_sel != LSAP_ANY) {
+		/* The client wants a specific selector: it must be free */
+		if (irlmp_slsap_inuse(slsap_sel))
+			return NULL;
+	} else {
+		/* The client does not care: pick a free one */
+		slsap_sel = irlmp_find_free_slsap();
+		if (slsap_sel == 0)
+			return NULL;
+	}
+
+	/* Zeroed control block for the new LSAP */
+	lsap = kzalloc(sizeof(*lsap), GFP_ATOMIC);
+	if (!lsap) {
+		IRDA_ERROR("%s: can't allocate memory\n", __func__);
+		return NULL;
+	}
+
+	lsap->magic = LMP_LSAP_MAGIC;
+	lsap->slsap_sel = slsap_sel;
+	lsap->lsap_state = LSAP_DISCONNECTED;
+	lsap->notify = *notify;
+
+	/* Connectionless LSAPs need special treatment of the peer selector */
+	if (slsap_sel == LSAP_CONNLESS) {
+#ifdef CONFIG_IRDA_ULTRA
+		lsap->dlsap_sel = LSAP_CONNLESS;
+		lsap->pid = pid;
+#endif /* CONFIG_IRDA_ULTRA */
+	} else
+		lsap->dlsap_sel = LSAP_ANY;
+	/* lsap->connected needs no init: kzalloc() zeroed the structure */
+
+	init_timer(&lsap->watchdog_timer);
+
+	/* Not associated with any link yet */
+	hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) lsap,
+		       (long) lsap, NULL);
+
+	return lsap;
+}
+EXPORT_SYMBOL(irlmp_open_lsap);
+
+/*
+ * Function __irlmp_close_lsap (self)
+ *
+ *    Destroy an LSAP instance: invalidate its magic, stop its watchdog
+ *    timer, drop any pending connect skb and free the structure. The
+ *    caller is expected to have unlinked it from its hashbin.
+ */
+static void __irlmp_close_lsap(struct lsap_cb *self)
+{
+	struct sk_buff *pending;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+	/* Mark the instance dead before tearing it down */
+	self->magic = 0;
+	del_timer(&self->watchdog_timer); /* Important! */
+
+	pending = self->conn_skb;
+	if (pending != NULL)
+		dev_kfree_skb(pending);
+
+	kfree(self);
+}
+
+/*
+ * Function irlmp_close_lsap (self)
+ *
+ *    Close and remove LSAP. The LSAP is unlinked from its link (or from
+ *    the list of unconnected LSAPs) and then destroyed. If it cannot be
+ *    found in either place it is left alone.
+ *
+ */
+void irlmp_close_lsap(struct lsap_cb *self)
+{
+	struct lsap_cb *removed = NULL;
+	struct lap_cb *link;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+	/* An LSAP attached to a link is removed from that link first */
+	link = self->lap;
+	if (link != NULL) {
+		IRDA_ASSERT(link->magic == LMP_LAP_MAGIC, return;);
+		/* We might close a LSAP before it has completed the
+		 * connection setup. In those cases, higher layers won't
+		 * send a proper disconnect request. Harmless, except
+		 * that we will forget to close LAP... - Jean II */
+		if (self->lsap_state != LSAP_DISCONNECTED) {
+			self->lsap_state = LSAP_DISCONNECTED;
+			irlmp_do_lap_event(link, LM_LAP_DISCONNECT_REQUEST,
+					   NULL);
+		}
+		removed = hashbin_remove(link->lsaps, (long) self, NULL);
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+		link->cache.valid = FALSE;
+#endif
+	}
+	self->lap = NULL;
+
+	/* Not found on a link? Then try the unconnected lsaps */
+	if (removed == NULL) {
+		removed = hashbin_remove(irlmp->unconnected_lsaps,
+					 (long) self, NULL);
+		if (removed == NULL) {
+			IRDA_DEBUG(0,
+			     "%s(), Looks like somebody has removed me already!\n",
+				   __func__);
+			return;
+		}
+	}
+
+	__irlmp_close_lsap(self);
+}
+EXPORT_SYMBOL(irlmp_close_lsap);
+
+/*
+ * Function irlmp_register_link (irlap, saddr, notify)
+ *
+ *    Register an IrLAP layer with IrLMP. It is possible to have multiple
+ *    instances of the IrLAP layer, each connected to a different IrDA
+ *    port. On success notify is reset and its instance points at the new
+ *    link; on allocation failure notify is left untouched.
+ *
+ */
+void irlmp_register_link(struct irlap_cb *irlap, __u32 saddr, notify_t *notify)
+{
+	struct lap_cb *link;
+
+	IRDA_ASSERT(irlmp != NULL, return;);
+	IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;);
+	IRDA_ASSERT(notify != NULL, return;);
+
+	/* Allocate the control block describing this link */
+	link = kzalloc(sizeof(*link), GFP_KERNEL);
+	if (!link) {
+		IRDA_ERROR("%s: unable to kmalloc\n", __func__);
+		return;
+	}
+
+	link->magic = LMP_LAP_MAGIC;
+	link->irlap = irlap;
+	link->saddr = saddr;
+	link->daddr = DEV_ADDR_ANY;
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+	link->cache.valid = FALSE;
+#endif
+
+	/* LSAPs opened on this link are kept here */
+	link->lsaps = hashbin_new(HB_LOCK);
+	if (!link->lsaps) {
+		IRDA_WARNING("%s(), unable to kmalloc lsaps\n", __func__);
+		kfree(link);
+		return;
+	}
+
+	link->lap_state = LAP_STANDBY;
+
+	init_timer(&link->idle_timer);
+
+	/* Make the link known to IrLMP, keyed by its source address */
+	hashbin_insert(irlmp->links, (irda_queue_t *) link, link->saddr, NULL);
+
+	/*
+	 *  Only the instance is set, so IrLAP can tell us on which link
+	 *  the different events happened
+	 */
+	irda_notify_init(notify);
+	notify->instance = link;
+}
+
+/*
+ * Function irlmp_unregister_link (saddr)
+ *
+ *    IrLAP layer has been removed! Tear down the link registered under
+ *    saddr, together with its LSAPs and discoveries. Does nothing if no
+ *    such link exists.
+ *
+ */
+void irlmp_unregister_link(__u32 saddr)
+{
+	struct lap_cb *lap;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* We must remove ourselves from the hashbin *first*. This ensures
+	 * that no more LSAPs will be opened on this link and no discovery
+	 * will be triggered anymore. Jean II */
+	lap = hashbin_remove(irlmp->links, saddr, NULL);
+	if (lap == NULL)
+		return;
+
+	IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+	/* Kill all the LSAPs on this link. Jean II */
+	lap->reason = LAP_DISC_INDICATION;
+	lap->daddr = DEV_ADDR_ANY;
+	irlmp_do_lap_event(lap, LM_LAP_DISCONNECT_INDICATION, NULL);
+
+	/* Forget every device that was discovered through this link */
+	irlmp_expire_discoveries(irlmp->cachelog, lap->saddr, TRUE);
+
+	/* Final cleanup */
+	del_timer(&lap->idle_timer);
+	lap->magic = 0;
+	hashbin_delete(lap->lsaps, (FREE_FUNC) __irlmp_close_lsap);
+	kfree(lap);
+}
+
+/*
+ * Function irlmp_connect_request (handle, dlsap, userdata)
+ *
+ *    Connect with a peer LSAP. Returns 0 on success or a negative value.
+ *    userdata may be NULL; otherwise it is released here on every exit
+ */
+int irlmp_connect_request(struct lsap_cb *self, __u8 dlsap_sel,
+			  __u32 saddr, __u32 daddr,
+			  struct qos_info *qos, struct sk_buff *userdata)
+{
+	struct sk_buff *tx_skb = userdata;
+	struct lap_cb *lap;
+	struct lsap_cb *lsap;
+	int ret;
+
+	IRDA_ASSERT(self != NULL, ret = -EBADR; goto err;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, ret = -EBADR; goto err;);
+
+	IRDA_DEBUG(2,
+	      "%s(), slsap_sel=%02x, dlsap_sel=%02x, saddr=%08x, daddr=%08x\n",
+	      __func__, self->slsap_sel, dlsap_sel, saddr, daddr);
+
+	if (test_bit(0, &self->connected)) {
+		ret = -EISCONN;
+		goto err;
+	}
+
+	/* Client must supply destination device address */
+	if (!daddr) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Any userdata? If not, build an empty frame with room for headers */
+	if (tx_skb == NULL) {
+		tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+		if (!tx_skb)
+			return -ENOMEM;
+
+		skb_reserve(tx_skb, LMP_MAX_HEADER);
+	}
+
+	/* Make room for MUX control header (3 bytes) */
+	IRDA_ASSERT(skb_headroom(tx_skb) >= LMP_CONTROL_HEADER, ret = -1; goto err;);
+	skb_push(tx_skb, LMP_CONTROL_HEADER);
+
+	self->dlsap_sel = dlsap_sel;
+
+	/*
+	 * Find the link to where we should try to connect since there may
+	 * be more than one IrDA port on this machine. If the client has
+	 * passed us the saddr (and already knows which link to use), then
+	 * we use that to find the link, if not then we have to look in the
+	 * discovery log and check if any of the links has discovered a
+	 * device with the given daddr
+	 */
+	if ((!saddr) || (saddr == DEV_ADDR_ANY)) {
+		discovery_t *discovery;
+		unsigned long flags;
+
+		spin_lock_irqsave(&irlmp->cachelog->hb_spinlock, flags);
+		if (daddr != DEV_ADDR_ANY)
+			discovery = hashbin_find(irlmp->cachelog, daddr, NULL);
+		else {
+			IRDA_DEBUG(2, "%s(), no daddr\n", __func__);
+			discovery = (discovery_t *)
+				hashbin_get_first(irlmp->cachelog);
+		}
+
+		if (discovery) {
+			saddr = discovery->data.saddr;
+			daddr = discovery->data.daddr;
+		}
+		spin_unlock_irqrestore(&irlmp->cachelog->hb_spinlock, flags);
+	}
+	lap = hashbin_lock_find(irlmp->links, saddr, NULL);
+	if (lap == NULL) {
+		IRDA_DEBUG(1, "%s(), Unable to find a usable link!\n", __func__);
+		ret = -EHOSTUNREACH;
+		goto err;
+	}
+
+	/* Check if LAP is disconnected or already connected */
+	if (lap->daddr == DEV_ADDR_ANY)
+		lap->daddr = daddr;
+	else if (lap->daddr != daddr) {
+		/* Check if some LSAPs are active on this LAP */
+		if (HASHBIN_GET_SIZE(lap->lsaps) == 0) {
+			/* No active connection, but LAP hasn't been
+			 * disconnected yet (waiting for timeout in LAP).
+			 * Maybe we could give LAP a bit of help in this case.
+			 */
+			IRDA_DEBUG(0, "%s(), sorry, but I'm waiting for LAP to timeout!\n", __func__);
+			ret = -EAGAIN;
+			goto err;
+		}
+
+		/* LAP is already connected to a different node, and LAP
+		 * can only talk to one node at a time */
+		IRDA_DEBUG(0, "%s(), sorry, but link is busy!\n", __func__);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	self->lap = lap;
+
+	/*
+	 *  Remove LSAP from list of unconnected LSAPs and insert it into the
+	 *  list of connected LSAPs for the particular link
+	 */
+	lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, NULL);
+
+	IRDA_ASSERT(lsap != NULL, ret = -1; goto err;);
+	IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, ret = -1; goto err;);
+	IRDA_ASSERT(lsap->lap != NULL, ret = -1; goto err;);
+	IRDA_ASSERT(lsap->lap->magic == LMP_LAP_MAGIC, ret = -1; goto err;);
+
+	hashbin_insert(self->lap->lsaps, (irda_queue_t *) self, (long) self,
+		       NULL);
+
+	set_bit(0, &self->connected);	/* TRUE */
+
+	/*
+	 *  User supplied qos specifications?
+	 */
+	if (qos)
+		self->qos = *qos;
+
+	irlmp_do_lsap_event(self, LM_CONNECT_REQUEST, tx_skb);
+
+	/* Drop reference count - see irlap_data_request(). */
+	dev_kfree_skb(tx_skb);
+
+	return 0;
+
+err:
+	/* Cleanup: every failure, failed assertions included, drops the skb */
+	if(tx_skb)
+		dev_kfree_skb(tx_skb);
+	return ret;
+}
+EXPORT_SYMBOL(irlmp_connect_request);
+
+/*
+ * Function irlmp_connect_indication (self, skb)
+ *
+ *    Incoming connection: copy the link QoS into the LSAP and hand the
+ *    connect frame (minus the LMP control header) to the service user
+ */
+void irlmp_connect_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+	int max_header_size;
+	int max_seg_size;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(self->lap != NULL, return;);
+
+	IRDA_DEBUG(2, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n",
+		   __func__, self->slsap_sel, self->dlsap_sel);
+
+	/* Nothing here sets self->lap: it is assigned earlier, in
+	 * irlmp_link_data_indication() (case CONNECT_CMD:), and
+	 * self->dlsap_sel usually comes from irlmp_find_lsap().
+	 * Jean II */
+
+	/* The LSAP takes a copy of the QoS of the link it runs on */
+	self->qos = *self->lap->qos;
+
+	/* Sizes reported to the layer above along with the indication */
+	max_seg_size = self->lap->qos->data_size.value - LMP_HEADER;
+	max_header_size = LMP_HEADER + IRLAP_GET_HEADER_SIZE(self->lap->irlap);
+
+	/* The layer above must not see the LMP_CONTROL_HEADER */
+	skb_pull(skb, LMP_CONTROL_HEADER);
+
+	if (self->notify.connect_indication == NULL)
+		return;
+	/* Take a reference for the callback - see irlap_driver_rcv(). */
+	skb_get(skb);
+	self->notify.connect_indication(self->notify.instance, self,
+					&self->qos, max_seg_size,
+					max_header_size, skb);
+}
+
+/*
+ * Function irlmp_connect_response (handle, userdata)
+ *
+ *    Service user is accepting connection. The reference on userdata is
+ *    dropped once the event is done. Returns 0, or -1 on failed assertion
+ */
+int irlmp_connect_response(struct lsap_cb *self, struct sk_buff *userdata)
+{
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+	IRDA_ASSERT(userdata != NULL, return -1;);
+
+	/* We set the connected bit and move the lsap to the connected list
+	 * in the state machine itself. Jean II */
+
+	IRDA_DEBUG(2, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n",
+		   __func__, self->slsap_sel, self->dlsap_sel);
+
+	/* Make room for MUX control header (3 bytes) */
+	IRDA_ASSERT(skb_headroom(userdata) >= LMP_CONTROL_HEADER, return -1;);
+	skb_push(userdata, LMP_CONTROL_HEADER);
+
+	irlmp_do_lsap_event(self, LM_CONNECT_RESPONSE, userdata);
+
+	/* Drop reference count - see irlap_data_request(). */
+	dev_kfree_skb(userdata);
+
+	return 0;
+}
+EXPORT_SYMBOL(irlmp_connect_response);
+
+/*
+ * Function irlmp_connect_confirm (handle, skb)
+ *
+ *    The peer device confirmed our LSAP connection; tell the service user
+ */
+void irlmp_connect_confirm(struct lsap_cb *self, struct sk_buff *skb)
+{
+	int max_seg_size;
+	int max_header_size;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(skb != NULL, return;);
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+	IRDA_ASSERT(self->lap != NULL, return;);
+
+	/* The LSAP takes a copy of the QoS of the link it runs on */
+	self->qos = *self->lap->qos;
+
+	/* Sizes reported to the layer above along with the confirmation */
+	max_seg_size = self->lap->qos->data_size.value - LMP_HEADER;
+	max_header_size = LMP_HEADER + IRLAP_GET_HEADER_SIZE(self->lap->irlap);
+
+	IRDA_DEBUG(2, "%s(), max_header_size=%d\n",
+		   __func__, max_header_size);
+
+	/* The layer above must not see the LMP_CONTROL_HEADER */
+	skb_pull(skb, LMP_CONTROL_HEADER);
+
+	if (self->notify.connect_confirm == NULL)
+		return;
+
+	/* Take a reference for the callback - see irlap_driver_rcv() */
+	skb_get(skb);
+	self->notify.connect_confirm(self->notify.instance, self, &self->qos,
+				     max_seg_size, max_header_size, skb);
+}
+
+/*
+ * Function irlmp_dup (orig, instance)
+ *
+ *    Duplicate LSAP, can be used by servers to confirm a connection on a
+ *    new LSAP so it can keep listening on the old one. Returns the copy
+ *    (inserted in the unconnected list), or NULL on bad state / no memory
+ */
+struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance)
+{
+	struct lsap_cb *new;
+	unsigned long flags;
+
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+	/* Only allowed to duplicate unconnected LSAP's, and only LSAPs
+	 * that have received a connect indication. Jean II */
+	if ((!hashbin_find(irlmp->unconnected_lsaps, (long) orig, NULL)) ||
+	    (orig->lap == NULL)) {
+		IRDA_DEBUG(0, "%s(), invalid LSAP (wrong state)\n",
+			   __func__);
+		spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock,
+				       flags);
+		return NULL;
+	}
+
+	/* Allocate a new instance (GFP_ATOMIC: the spinlock is held) */
+	new = kmemdup(orig, sizeof(*new), GFP_ATOMIC);
+	if (!new)  {
+		IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __func__);
+		spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock,
+				       flags);
+		return NULL;
+	}
+	/* new->lap = orig->lap; => done by kmemdup() */
+	/* new->slsap_sel = orig->slsap_sel; => done by kmemdup() */
+	new->conn_skb = NULL;
+
+	spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+	/* Not everything is the same */
+	new->notify.instance = instance;
+
+	init_timer(&new->watchdog_timer);
+
+	hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) new,
+		       (long) new, NULL);
+
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+	/* Make sure that we invalidate the LSAP cache */
+	new->lap->cache.valid = FALSE;
+#endif /* CONFIG_IRDA_CACHE_LAST_LSAP */
+
+	return new;
+}
+
+/*
+ * Function irlmp_disconnect_request (handle, userdata)
+ *
+ *    The service user is requesting disconnection. The LSAP is not removed,
+ *    only moved back to the unconnected list. Returns 0, or -1 on failure
+ */
+int irlmp_disconnect_request(struct lsap_cb *self, struct sk_buff *userdata)
+{
+	struct lsap_cb *lsap;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+	IRDA_ASSERT(userdata != NULL, return -1;);
+
+	/* Already disconnected ?
+	 * There is a race condition between irlmp_disconnect_indication()
+	 * and us that might mess up the hashbins below. This fixes it.
+	 * Jean II */
+	if (! test_and_clear_bit(0, &self->connected)) {
+		IRDA_DEBUG(0, "%s(), already disconnected!\n", __func__);
+		dev_kfree_skb(userdata);
+		return -1;
+	}
+
+	skb_push(userdata, LMP_CONTROL_HEADER);
+
+	/*
+	 *  Do the event before the other stuff since we must know
+	 *  which lap layer that the frame should be transmitted on
+	 */
+	irlmp_do_lsap_event(self, LM_DISCONNECT_REQUEST, userdata);
+
+	/* Drop reference count - see irlap_data_request(). */
+	dev_kfree_skb(userdata);
+
+	/*
+	 *  Remove LSAP from list of connected LSAPs for the particular link
+	 *  and insert it into the list of unconnected LSAPs
+	 */
+	IRDA_ASSERT(self->lap != NULL, return -1;);
+	IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+	IRDA_ASSERT(self->lap->lsaps != NULL, return -1;);
+
+	lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL);
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+	self->lap->cache.valid = FALSE;
+#endif
+
+	IRDA_ASSERT(lsap != NULL, return -1;);
+	IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;);
+	IRDA_ASSERT(lsap == self, return -1;);
+
+	hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self,
+		       (long) self, NULL);
+
+	/* Reset some values */
+	self->dlsap_sel = LSAP_ANY;
+	self->lap = NULL;
+
+	return 0;
+}
+EXPORT_SYMBOL(irlmp_disconnect_request);
+
+/*
+ * Function irlmp_disconnect_indication (self, reason, skb)
+ *
+ *    LSAP is being closed: unlink it from its LAP and tell the service user
+ */
+void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason,
+				 struct sk_buff *skb)
+{
+	struct lsap_cb *lsap;
+	/* NOTE(review): reason indexes irlmp_reasons[] unchecked - confirm range */
+	IRDA_DEBUG(1, "%s(), reason=%s\n", __func__, irlmp_reasons[reason]);
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+	IRDA_DEBUG(3, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n",
+		   __func__, self->slsap_sel, self->dlsap_sel);
+
+	/* Already disconnected ?
+	 * There is a race condition between irlmp_disconnect_request()
+	 * and us that might mess up the hashbins below. This fixes it.
+	 * Jean II */
+	if (! test_and_clear_bit(0, &self->connected)) {
+		IRDA_DEBUG(0, "%s(), already disconnected!\n", __func__);
+		return;
+	}
+
+	/*
+	 *  Remove association between this LSAP and the link it used
+	 */
+	IRDA_ASSERT(self->lap != NULL, return;);
+	IRDA_ASSERT(self->lap->lsaps != NULL, return;);
+
+	lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL);
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+	self->lap->cache.valid = FALSE;
+#endif
+
+	IRDA_ASSERT(lsap != NULL, return;);
+	IRDA_ASSERT(lsap == self, return;);
+	hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) lsap,
+		       (long) lsap, NULL);
+
+	self->dlsap_sel = LSAP_ANY;
+	self->lap = NULL;
+
+	/*
+	 *  Inform service user
+	 */
+	if (self->notify.disconnect_indication) {
+		/* Don't forget to refcount it - see irlap_driver_rcv(). */
+		if(skb)
+			skb_get(skb);
+		self->notify.disconnect_indication(self->notify.instance,
+						   self, reason, skb);
+	} else {
+		IRDA_DEBUG(0, "%s(), no handler\n", __func__);
+	}
+}
+
+/*
+ * Function irlmp_do_expiry (void)
+ *
+ *    Clean up the discovery log by removing its old entries
+ *
+ * Note : kept apart from irlmp_do_discovery() so that passive discovery
+ * can be handled properly.
+ */
+void irlmp_do_expiry(void)
+{
+	struct lap_cb *lap;
+
+	/*
+	 * Only links which are *not* connected get their discoveries
+	 * expired. A connected link can't do discovery anymore and
+	 * can't refresh the log, so its part of the log is frozen to
+	 * keep the info about the device we are connected to.
+	 * This info is mandatory if we want irlmp_connect_request()
+	 * to work properly. - Jean II
+	 */
+	for (lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+	     lap != NULL;
+	     lap = (struct lap_cb *) hashbin_get_next(irlmp->links)) {
+		IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+		/* Skip every link that is not in standby */
+		if (lap->lap_state != LAP_STANDBY)
+			continue;
+
+		irlmp_expire_discoveries(irlmp->cachelog, lap->saddr,
+					 FALSE);
+	}
+}
+
+/*
+ * Function irlmp_do_discovery (nslots)
+ *
+ *    Do some discovery on all links
+ *
+ * Note : log expiry is done separately, by irlmp_do_expiry().
+ */
+void irlmp_do_discovery(int nslots)
+{
+	struct lap_cb *lap;
+	__u16 *data_hintsp;
+
+	/* Make sure the value is sane, else fall back to 8 slots */
+	if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){
+		IRDA_WARNING("%s: invalid value for number of slots!\n",
+			     __func__);
+		nslots = sysctl_discovery_slots = 8;
+	}
+
+	/* Construct new discovery info to be used by IrLAP, */
+	data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints;
+	put_unaligned(irlmp->hints.word, data_hintsp);
+
+	/*
+	 *  Set character set for device name (we use ASCII), and
+	 *  copy device name. strlcpy() always NUL-terminates, so the
+	 *  strlen() below cannot run past the end of the info buffer
+	 */
+	irlmp->discovery_cmd.data.charset = CS_ASCII;
+	strlcpy(irlmp->discovery_cmd.data.info, sysctl_devname,
+		sizeof(irlmp->discovery_cmd.data.info));
+	irlmp->discovery_cmd.name_len = strlen(irlmp->discovery_cmd.data.info);
+	irlmp->discovery_cmd.nslots = nslots;
+
+	/*
+	 * Try to send discovery packets on all links
+	 */
+	lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+	while (lap != NULL) {
+		IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+		if (lap->lap_state == LAP_STANDBY) {
+			/* Try to discover */
+			irlmp_do_lap_event(lap, LM_LAP_DISCOVERY_REQUEST,
+					   NULL);
+		}
+		lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+	}
+}
+
+/*
+ * Function irlmp_discovery_request (nslots)
+ *
+ *    Do a discovery of devices in front of the computer
+ *
+ * A caller that registered a client discovery callback gets the full
+ * content of the discovery log through that callback (normally it only
+ * receives new discoveries).
+ */
+void irlmp_discovery_request(int nslots)
+{
+	/* First hand the whole cached discovery log to the clients */
+	irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_LOG);
+
+	/* Discovery is already running, there is nothing to start */
+	if (sysctl_discovery)
+		return;
+
+	/* DISCOVERY_DEFAULT_SLOTS selects the sysctl default */
+	if (nslots == DISCOVERY_DEFAULT_SLOTS)
+		nslots = sysctl_discovery_slots;
+
+	/*
+	 * Kick off a single discovery operation. Expiry is never done
+	 * here: it runs on the discovery timer regardless of the state of
+	 * sysctl_discovery. Jean II
+	 */
+	irlmp_do_discovery(nslots);
+}
+EXPORT_SYMBOL(irlmp_discovery_request);
+
+/*
+ * Function irlmp_get_discoveries (pn, mask, slots)
+ *
+ *    Return the current discovery log
+ *
+ * When discovery is not enabled, call this function again after 1 or 2
+ * seconds (i.e. once the discovery started here has been done).
+ */
+struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots)
+{
+	/*
+	 * With discovery disabled the log is likely to be empty, so a
+	 * single discovery is triggered here: the next time the user
+	 * calls us there might be some results in the log.
+	 * Expiry is never done here, it runs on the discovery timer
+	 * regardless of the state of sysctl_discovery.
+	 * Jean II
+	 */
+	if (!sysctl_discovery) {
+		/* DISCOVERY_DEFAULT_SLOTS selects the sysctl default */
+		int slots = (nslots == DISCOVERY_DEFAULT_SLOTS) ?
+			sysctl_discovery_slots : nslots;
+
+		/* Start discovery - will complete sometime later */
+		irlmp_do_discovery(slots);
+	}
+
+	/* Return current cached discovery log */
+	return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE);
+}
+EXPORT_SYMBOL(irlmp_get_discoveries);
+
+/*
+ * Function irlmp_notify_client (client, log, mode)
+ *
+ *    Tell one client about the discovered devices
+ *
+ * Clients registered with IrLMP are :
+ *	o IrComm
+ *	o IrLAN
+ *	o Any socket (in any state - ouch, that may be a lot !)
+ * A client may have set a callback in order to be notified of
+ * partial/selective discovery based on the hints that it passed to IrLMP.
+ */
+static inline void
+irlmp_notify_client(irlmp_client_t *client,
+		    hashbin_t *log, DISCOVERY_MODE mode)
+{
+	discinfo_t *snapshot;	/* Our own copy of the discovery log */
+	int	count;		/* Number of nodes in that copy */
+	int	n;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	/* A client without discovery callback has nothing to be told */
+	if (client->disco_callback == NULL)
+		return;
+
+	/*
+	 * Locking notes :
+	 * the old code was manipulating the log directly, which was
+	 * very racy. Now, we use copy_discoveries, that protects
+	 * itself while dumping the log for us.
+	 * The overhead of the copy is compensated by the fact that
+	 * we only pass new discoveries in normal mode and don't
+	 * pass the same old entry every 3s to the caller as we used
+	 * to do (virtual function calling is expensive).
+	 * Jean II
+	 */
+
+	/*
+	 * Go through the discovered devices (if any) and tell the
+	 * client only about the services its hint mask selects.
+	 * Only new devices are reported, unless the caller explicitly
+	 * asked for a dump of the log (DISCOVERY_LOG). Jean II
+	 */
+	snapshot = irlmp_copy_discoveries(log, &count,
+					  client->hint_mask.word,
+					  (mode == DISCOVERY_LOG));
+	/* No result: no nodes discovered */
+	if (!snapshot)
+		return;
+
+	/* Feed the entries to the listener one by one */
+	for (n = 0; n < count; n++)
+		client->disco_callback(&snapshot[n], mode, client->priv);
+
+	/* The buffer is ours, release it */
+	kfree(snapshot);
+}
+
+/*
+ * Function irlmp_discovery_confirm (log, mode)
+ *
+ *    Some device(s) answered to our discovery request! Check to see which
+ *    device it is, and give indication to the client(s). Does nothing
+ *    when log is empty
+ */
+void irlmp_discovery_confirm(hashbin_t *log, DISCOVERY_MODE mode)
+{
+	irlmp_client_t *client;
+	irlmp_client_t *client_next;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(log != NULL, return;);
+
+	if (!(HASHBIN_GET_SIZE(log)))
+		return;
+
+	/* For each client - notify callback may touch client list */
+	client = (irlmp_client_t *) hashbin_get_first(irlmp->clients);
+	while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL,
+					 (void *) &client_next) ) {
+		/* Check if we should notify client */
+		irlmp_notify_client(client, log, mode);
+
+		client = client_next;
+	}
+}
+
+/*
+ * Function irlmp_discovery_expiry (expiries, number)
+ *
+ *	These devices are no longer being discovered, and therefore they
+ *	are being purged from the discovery log. Inform all clients who
+ *	have registered for this event...
+ *
+ *	Note : called exclusively from discovery.c
+ *	Note : this is no longer called under discovery spinlock, so the
+ *		client can do whatever he wants in the callback.
+ */
+void irlmp_discovery_expiry(discinfo_t *expiries, int number)
+{
+	irlmp_client_t *client;
+	irlmp_client_t *client_next;
+	int		i;
+
+	IRDA_DEBUG(3, "%s()\n", __func__);
+
+	IRDA_ASSERT(expiries != NULL, return;);
+
+	/* For each client - notify callback may touch client list */
+	client = (irlmp_client_t *) hashbin_get_first(irlmp->clients);
+	while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL,
+					 (void *) &client_next) ) {
+
+		/* Pass all entries to the listener */
+		for(i = 0; i < number; i++) {
+			/* Notify only if the client's hint mask matches */
+			if ((client->expir_callback) &&
+			    (client->hint_mask.word &
+			     get_unaligned((__u16 *)expiries[i].hints)
+			     & 0x7f7f) )
+				client->expir_callback(&(expiries[i]),
+						       EXPIRY_TIMEOUT,
+						       client->priv);
+		}
+
+		/* Next client */
+		client = client_next;
+	}
+}
+
+/*
+ * Function irlmp_get_discovery_response ()
+ *
+ *    Used by IrLAP to get the discovery info it needs when answering
+ *    discovery requests by other devices.
+ */
+discovery_t *irlmp_get_discovery_response(void)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(irlmp != NULL, return NULL;);
+
+	put_unaligned(irlmp->hints.word, (__u16 *)irlmp->discovery_rsp.data.hints);
+
+	/*
+	 *  Set character set for device name (we use ASCII), and
+	 *  copy device name. strlcpy() always NUL-terminates, so the
+	 *  strlen() below cannot run past the end of the info buffer
+	 */
+	irlmp->discovery_rsp.data.charset = CS_ASCII;
+
+	strlcpy(irlmp->discovery_rsp.data.info, sysctl_devname,
+		sizeof(irlmp->discovery_rsp.data.info));
+	irlmp->discovery_rsp.name_len = strlen(irlmp->discovery_rsp.data.info);
+
+	return &irlmp->discovery_rsp;
+}
+
+/*
+ * Function irlmp_data_request (self, userdata)
+ *
+ *    Send some data to peer device. Returns the result of
+ *    irlmp_do_lsap_event(), or -1 when an assertion fails
+ * Note on skb management :
+ * After calling the lower layers of the IrDA stack, we always
+ * kfree() the skb, which drops the reference count (and potentially
+ * destroys it).
+ * IrLMP and IrLAP may queue the packet, and in those cases will need
+ * to use skb_get() to keep it around.
+ * Jean II
+ */
+int irlmp_data_request(struct lsap_cb *self, struct sk_buff *userdata)
+{
+	int	ret;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+	/* Make room for MUX header */
+	IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;);
+	skb_push(userdata, LMP_HEADER);
+
+	ret = irlmp_do_lsap_event(self, LM_DATA_REQUEST, userdata);
+
+	/* Drop reference count - see irlap_data_request(). */
+	dev_kfree_skb(userdata);
+
+	return ret;
+}
+EXPORT_SYMBOL(irlmp_data_request);
+
+/*
+ * Function irlmp_data_indication (handle, skb)
+ *
+ *    Data came in from the LAP layer, hand it over to the layer above
+ *
+ */
+void irlmp_data_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+	/* The layer above must not see the LMP header */
+	skb_pull(skb, LMP_HEADER);
+	if (self->notify.data_indication == NULL)
+		return;
+
+	/* Take a reference for the callback - see irlap_driver_rcv(). */
+	skb_get(skb);
+	self->notify.data_indication(self->notify.instance, self, skb);
+}
+
+/*
+ * Function irlmp_udata_request (self, userdata): raise LM_UDATA_REQUEST
+ */
+int irlmp_udata_request(struct lsap_cb *self, struct sk_buff *userdata)
+{
+	int	ret;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(userdata != NULL, return -1;);
+
+	/* Make room for MUX header */
+	IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;);
+	skb_push(userdata, LMP_HEADER);
+
+	ret = irlmp_do_lsap_event(self, LM_UDATA_REQUEST, userdata);
+
+	/* Drop reference count - see irlap_data_request(). */
+	dev_kfree_skb(userdata);
+
+	return ret;
+}
+
+/*
+ * Function irlmp_udata_indication (self, skb)
+ *
+ *    Pass unreliable data (but still within the connection) up to the
+ *    service user
+ */
+void irlmp_udata_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* The layer above must not see the LMP header */
+	skb_pull(skb, LMP_HEADER);
+
+	if (self->notify.udata_indication == NULL)
+		return;
+
+	/* Take a reference for the callback - see irlap_driver_rcv(). */
+	skb_get(skb);
+	self->notify.udata_indication(self->notify.instance, self, skb);
+}
+
+/*
+ * Function irlmp_connless_data_request (self, userdata, pid)
+ */
+#ifdef CONFIG_IRDA_ULTRA
+int irlmp_connless_data_request(struct lsap_cb *self, struct sk_buff *userdata,
+				__u8 pid)
+{
+	struct sk_buff *clone_skb;
+	struct lap_cb *lap;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(userdata != NULL, return -1;);
+
+	/* Make room for MUX and PID header */
+	IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER+LMP_PID_HEADER,
+		    return -1;);
+
+	/* Insert protocol identifier: the LSAP's own pid when self is given */
+	skb_push(userdata, LMP_PID_HEADER);
+	if(self != NULL)
+	  userdata->data[0] = self->pid;
+	else
+	  userdata->data[0] = pid;
+
+	/* Connectionless sockets must use 0x70 */
+	skb_push(userdata, LMP_HEADER);
+	userdata->data[0] = userdata->data[1] = LSAP_CONNLESS;
+
+	/* Try to send connectionless packets out on all links */
+	lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+	while (lap != NULL) {
+		IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return -1;);
+
+		clone_skb = skb_clone(userdata, GFP_ATOMIC);
+		if (!clone_skb) {
+			dev_kfree_skb(userdata);
+			return -ENOMEM;
+		}
+
+		irlap_unitdata_request(lap->irlap, clone_skb);
+		/* irlap_unitdata_request() doesn't increase the refcount,
+		 * so no dev_kfree_skb() - Jean II */
+
+		lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+	}
+	dev_kfree_skb(userdata);
+
+	return 0;
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlmp_connless_data_indication (self, skb)
+ *
+ *    Unreliable data received outside of any connection (mostly used by
+ *    Ultra); pass it up through the udata_indication callback
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlmp_connless_data_indication(struct lsap_cb *self, struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* The layer above must see neither the LMP nor the PID header */
+	skb_pull(skb, LMP_HEADER+LMP_PID_HEADER);
+
+	if (self->notify.udata_indication == NULL)
+		return;
+
+	/* Take a reference for the callback - see irlap_driver_rcv(). */
+	skb_get(skb);
+	self->notify.udata_indication(self->notify.instance, self, skb);
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Propagate status indication from LAP to LSAPs (via LMP)
+ * This doesn't trigger any change of state in lap_cb, lmp_cb or lsap_cb,
+ * and the event is stateless, therefore we can bypass both state machines
+ * and send the event directly to the LSAP user.
+ * Jean II
+ */
+void irlmp_status_indication(struct lap_cb *self,
+			     LINK_STATUS link, LOCK_STATUS lock)
+{
+	struct lsap_cb *next;
+	struct lsap_cb *curr;
+
+	/* Send status_indication to all LSAPs using this link */
+	curr = (struct lsap_cb *) hashbin_get_first( self->lsaps);
+	while (NULL != hashbin_find_next(self->lsaps, (long) curr, NULL,
+					 (void *) &next) ) {
+		IRDA_ASSERT(curr->magic == LMP_LSAP_MAGIC, return;);
+		/*
+		 *  Inform service user if he has requested it
+		 */
+		if (curr->notify.status_indication != NULL)
+			curr->notify.status_indication(curr->notify.instance,
+						       link, lock);
+		else
+			IRDA_DEBUG(2, "%s(), no handler\n", __func__);
+
+		curr = next;
+	}
+}
+
+/*
+ * Receive flow control indication from LAP.
+ * LAP wants us to send it one more frame. We implement a simple round
+ * robin scheduler between the active sockets so that we get a bit of
+ * fairness. Note that the round robin is far from perfect, but it's
+ * better than nothing.
+ * We then poll the selected socket so that we can do synchronous
+ * refilling of IrLAP (which allows minimising the number of buffers).
+ * Jean II
+ */
+void irlmp_flow_indication(struct lap_cb *self, LOCAL_FLOW flow)
+{
+	struct lsap_cb *next;
+	struct lsap_cb *curr;
+	int	lsap_todo;
+
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+	IRDA_ASSERT(flow == FLOW_START, return;);
+
+	/* Get the number of lsap. That's the only safe way to know
+	 * that we have looped around... - Jean II */
+	lsap_todo = HASHBIN_GET_SIZE(self->lsaps);
+	IRDA_DEBUG(4, "%s() : %d lsaps to scan\n", __func__, lsap_todo);
+
+	/* Poll lsap in order until the queue is full or until we
+	 * tried them all.
+	 * Most often, the current LSAP will have something to send,
+	 * so we will go through this loop only once. - Jean II */
+	while((lsap_todo--) &&
+	      (IRLAP_GET_TX_QUEUE_LEN(self->irlap) < LAP_HIGH_THRESHOLD)) {
+		/* Try to find the next lsap we should poll. */
+		next = self->flow_next;
+		/* If we have no lsap, restart from first one */
+		if(next == NULL)
+			next = (struct lsap_cb *) hashbin_get_first(self->lsaps);
+		/* Verify current one and find the next one */
+		curr = hashbin_find_next(self->lsaps, (long) next, NULL,
+					 (void *) &self->flow_next);
+		/* Uh-oh... Paranoia */
+		if(curr == NULL)
+			break;
+		IRDA_DEBUG(4, "%s() : curr is %p, next was %p and is now %p, still %d to go - queue len = %d\n", __func__, curr, next, self->flow_next, lsap_todo, IRLAP_GET_TX_QUEUE_LEN(self->irlap));
+
+		/* Inform lsap user that it can send one more packet. */
+		if (curr->notify.flow_indication != NULL)
+			curr->notify.flow_indication(curr->notify.instance,
+						     curr, flow);
+		else
+			IRDA_DEBUG(1, "%s(), no handler\n", __func__);
+	}
+}
+
+#if 0
+/*
+ * Function irlmp_hint_to_service (hint)
+ *
+ *    Returns a list of all services contained in the given hint bits. This
+ *    function assumes that the hint bits have the size of two bytes only
+ */
+__u8 *irlmp_hint_to_service(__u8 *hint)
+{
+	__u8 *service;
+	int i = 0;
+
+	/*
+	 * Allocate array to store services in. 16 entries should be safe
+	 * since we currently only support 2 hint bytes
+	 */
+	service = kmalloc(16, GFP_ATOMIC);
+	if (!service) {
+		IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	if (!hint[0]) {
+		IRDA_DEBUG(1, "<None>\n");
+		kfree(service);
+		return NULL;
+	}
+	if (hint[0] & HINT_PNP)
+		IRDA_DEBUG(1, "PnP Compatible ");
+	if (hint[0] & HINT_PDA)
+		IRDA_DEBUG(1, "PDA/Palmtop ");
+	if (hint[0] & HINT_COMPUTER)
+		IRDA_DEBUG(1, "Computer ");
+	if (hint[0] & HINT_PRINTER) {
+		IRDA_DEBUG(1, "Printer ");
+		service[i++] = S_PRINTER;
+	}
+	if (hint[0] & HINT_MODEM)
+		IRDA_DEBUG(1, "Modem ");
+	if (hint[0] & HINT_FAX)
+		IRDA_DEBUG(1, "Fax ");
+	if (hint[0] & HINT_LAN) {
+		IRDA_DEBUG(1, "LAN Access ");
+		service[i++] = S_LAN;
+	}
+	/*
+	 *  Test if extension byte exists. This byte will usually be
+	 *  there, but this is not really required by the standard.
+	 *  (IrLMP p. 29)
+	 */
+	if (hint[0] & HINT_EXTENSION) {
+		if (hint[1] & HINT_TELEPHONY) {
+			IRDA_DEBUG(1, "Telephony ");
+			service[i++] = S_TELEPHONY;
+		} if (hint[1] & HINT_FILE_SERVER)
+			IRDA_DEBUG(1, "File Server ");
+
+		if (hint[1] & HINT_COMM) {
+			IRDA_DEBUG(1, "IrCOMM ");
+			service[i++] = S_COMM;
+		}
+		if (hint[1] & HINT_OBEX) {
+			IRDA_DEBUG(1, "IrOBEX ");
+			service[i++] = S_OBEX;
+		}
+	}
+	IRDA_DEBUG(1, "\n");
+
+	/* So that client can be notified about any discovery */
+	service[i++] = S_ANY;
+
+	service[i] = S_END;
+
+	return service;
+}
+#endif
+
+static const __u16 service_hint_mapping[S_END][2] = {	/* hint bytes 0, 1 */
+	{ HINT_PNP,		0 },			/* S_PNP */
+	{ HINT_PDA,		0 },			/* S_PDA */
+	{ HINT_COMPUTER,	0 },			/* S_COMPUTER */
+	{ HINT_PRINTER,		0 },			/* S_PRINTER */
+	{ HINT_MODEM,		0 },			/* S_MODEM */
+	{ HINT_FAX,		0 },			/* S_FAX */
+	{ HINT_LAN,		0 },			/* S_LAN */
+	{ HINT_EXTENSION,	HINT_TELEPHONY },	/* S_TELEPHONY */
+	{ HINT_EXTENSION,	HINT_COMM },		/* S_COMM */
+	{ HINT_EXTENSION,	HINT_OBEX },		/* S_OBEX */
+	{ 0xFF,			0xFF },			/* S_ANY */
+};
+
+/*
+ * Function irlmp_service_to_hint (service)
+ *
+ *    Look up the hint bits that stand for a service type
+ *
+ *    Returns: the 16 bit hint value built from the two mapped hint bytes
+ */
+__u16 irlmp_service_to_hint(int service)
+{
+	const __u16 *mapping = service_hint_mapping[service];
+	__u16_host_order hint;
+
+	hint.byte[0] = mapping[0];
+	hint.byte[1] = mapping[1];
+	return hint.word;
+}
+EXPORT_SYMBOL(irlmp_service_to_hint);
+
+/*
+ * Function irlmp_register_service (hints)
+ *
+ *    Register local service with IrLMP
+ *    Returns: handle for irlmp_unregister_service(), NULL if out of memory
+ */
+void *irlmp_register_service(__u16 hints)
+{
+	irlmp_service_t *entry;
+
+	IRDA_DEBUG(4, "%s(), hints = %04x\n", __func__, hints);
+
+	/* Allocate the registration record */
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+	entry->hints.word = hints;
+	hashbin_insert(irlmp->services, (irda_queue_t *) entry,
+		       (long) entry, NULL);
+
+	irlmp->hints.word |= hints;
+
+	return entry;
+}
+EXPORT_SYMBOL(irlmp_register_service);
+
+/*
+ * Function irlmp_unregister_service (handle)
+ *
+ *    Unregister service with IrLMP.
+ *
+ *    Returns: 0 on success, -1 on error
+ */
+int irlmp_unregister_service(void *handle)
+{
+	irlmp_service_t *entry;
+	unsigned long flags;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	if (handle == NULL)
+		return -1;
+
+	/* A stale or bogus handle is tolerated: it is simply not found */
+	entry = hashbin_lock_find(irlmp->services, (long) handle, NULL);
+	if (entry == NULL) {
+		IRDA_DEBUG(1, "%s(), Unknown service!\n", __func__);
+		return -1;
+	}
+
+	hashbin_remove_this(irlmp->services, (irda_queue_t *) entry);
+	kfree(entry);
+
+	/* Start from a clean slate ... */
+	irlmp->hints.word = 0;
+
+	/* ... and OR back the hints of every service still registered */
+	spin_lock_irqsave(&irlmp->services->hb_spinlock, flags);
+	for (entry = (irlmp_service_t *) hashbin_get_first(irlmp->services);
+	     entry != NULL;
+	     entry = (irlmp_service_t *) hashbin_get_next(irlmp->services)) {
+		irlmp->hints.word |= entry->hints.word;
+	}
+
+	spin_unlock_irqrestore(&irlmp->services->hb_spinlock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(irlmp_unregister_service);
+
+/*
+ * Function irlmp_register_client (hint_mask, callback1, callback2, priv)
+ *
+ *    Register a local client with IrLMP
+ *	First callback is selective discovery (based on hints)
+ *	Second callback is for selective discovery expiries
+ *
+ *    Returns: client handle (non-NULL) on success, NULL on error
+ */
+void *irlmp_register_client(__u16 hint_mask, DISCOVERY_CALLBACK1 disco_clb,
+			    DISCOVERY_CALLBACK2 expir_clb, void *priv)
+{
+	irlmp_client_t *entry;
+
+	IRDA_DEBUG(1, "%s()\n", __func__);
+	IRDA_ASSERT(irlmp != NULL, return NULL;);
+
+	/* Allocate the registration record */
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	/* Fill it in before it is inserted in the clients hashbin */
+	entry->priv = priv;
+	entry->expir_callback = expir_clb;
+	entry->disco_callback = disco_clb;
+	entry->hint_mask.word = hint_mask;
+
+	hashbin_insert(irlmp->clients, (irda_queue_t *) entry,
+		       (long) entry, NULL);
+
+	return entry;
+}
+EXPORT_SYMBOL(irlmp_register_client);
+
+/*
+ * Function irlmp_update_client (handle, hint_mask, callback1, callback2, priv)
+ *
+ *    Updates specified client (handle) with possibly new hint_mask,
+ *    callbacks and private data
+ *
+ *    Returns: 0 on success, -1 on error
+ */
+int irlmp_update_client(void *handle, __u16 hint_mask,
+			DISCOVERY_CALLBACK1 disco_clb,
+			DISCOVERY_CALLBACK2 expir_clb, void *priv)
+{
+	irlmp_client_t *client;
+
+	if (!handle)
+		return -1;
+
+	client = hashbin_lock_find(irlmp->clients, (long) handle, NULL);
+	if (!client) {
+		IRDA_DEBUG(1, "%s(), Unknown client!\n", __func__);
+		return -1;
+	}
+
+	client->hint_mask.word = hint_mask;
+	client->disco_callback = disco_clb;
+	client->expir_callback = expir_clb;
+	client->priv = priv;
+
+	return 0;
+}
+EXPORT_SYMBOL(irlmp_update_client);
+
+/*
+ * Function irlmp_unregister_client (handle)
+ *
+ *    Remove a client registration from IrLMP and free it
+ *    Returns: 0 on success, -1 on error
+ */
+int irlmp_unregister_client(void *handle)
+{
+	struct irlmp_client *entry;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	if (handle == NULL)
+		return -1;
+
+	/* A stale or bogus handle is tolerated: it is simply not found */
+	entry = hashbin_lock_find(irlmp->clients, (long) handle, NULL);
+	if (entry == NULL) {
+		IRDA_DEBUG(1, "%s(), Unknown client!\n", __func__);
+		return -1;
+	}
+
+	IRDA_DEBUG(4, "%s(), removing client!\n", __func__);
+	hashbin_remove_this(irlmp->clients, (irda_queue_t *) entry);
+	kfree(entry);
+
+	return 0;
+}
+EXPORT_SYMBOL(irlmp_unregister_client);
+
+/*
+ * Function irlmp_slsap_inuse (slsap)
+ *
+ *    Check if the given source LSAP selector is in use. Returns TRUE if it
+ *    is in use or above LSAP_MAX, FALSE if it is free.
+ * This function is clearly not very efficient. On the mitigating side, the
+ * stack makes sure that in 99% of the cases, we are called only once
+ * for each socket allocation. We could probably keep a bitmap
+ * of the allocated LSAP, but I'm not sure the complexity is worth it.
+ * Jean II
+ */
+static int irlmp_slsap_inuse(__u8 slsap_sel)
+{
+	struct lsap_cb *self;
+	struct lap_cb *lap;
+	unsigned long flags;
+
+	IRDA_ASSERT(irlmp != NULL, return TRUE;);
+	IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return TRUE;);
+	IRDA_ASSERT(slsap_sel != LSAP_ANY, return TRUE;);
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+#ifdef CONFIG_IRDA_ULTRA
+	/* Accept all bindings to the connectionless LSAP */
+	if (slsap_sel == LSAP_CONNLESS)
+		return FALSE;
+#endif /* CONFIG_IRDA_ULTRA */
+
+	/* Selectors above LSAP_MAX are not valid, report them as in use */
+	if (slsap_sel > LSAP_MAX)
+		return TRUE;
+
+	/*
+	 *  Check if slsap is already in use. To do this we have to loop over
+	 *  every IrLAP connection and check every LSAP associated with each
+	 *  connection.
+	 */
+	spin_lock_irqsave_nested(&irlmp->links->hb_spinlock, flags,
+			SINGLE_DEPTH_NESTING);
+	lap = (struct lap_cb *) hashbin_get_first(irlmp->links);
+	while (lap != NULL) {
+		IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, goto errlap;);
+
+		/* Careful for priority inversions here !
+		 * irlmp->links is never taken while another IrDA
+		 * spinlock is held, so we are safe. Jean II */
+		spin_lock(&lap->lsaps->hb_spinlock);
+
+		/* For this IrLAP, check all the LSAPs */
+		self = (struct lsap_cb *) hashbin_get_first(lap->lsaps);
+		while (self != NULL) {
+			IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC,
+				    goto errlsap;);
+
+			if ((self->slsap_sel == slsap_sel)) {
+				IRDA_DEBUG(4, "Source LSAP selector=%02x in use\n",
+					   self->slsap_sel);
+				goto errlsap;
+			}
+			self = (struct lsap_cb*) hashbin_get_next(lap->lsaps);
+		}
+		spin_unlock(&lap->lsaps->hb_spinlock);
+
+		/* Next LAP */
+		lap = (struct lap_cb *) hashbin_get_next(irlmp->links);
+	}
+	spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags);
+
+	/*
+	 * Server sockets are typically waiting for connections and
+	 * therefore reside in the unconnected list. We don't want
+	 * to give out their LSAPs for obvious reasons...
+	 * Jean II
+	 */
+	spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+	self = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps);
+	while (self != NULL) {
+		IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, goto erruncon;);
+		if ((self->slsap_sel == slsap_sel)) {
+			IRDA_DEBUG(4, "Source LSAP selector=%02x in use (unconnected)\n",
+				   self->slsap_sel);
+			goto erruncon;
+		}
+		self = (struct lsap_cb*) hashbin_get_next(irlmp->unconnected_lsaps);
+	}
+	spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+	return FALSE;
+
+	/* Error exit from within one of the two nested loops.
+	 * Make sure we release the right spinlock in the right order.
+	 * Jean II */
+errlsap:
+	spin_unlock(&lap->lsaps->hb_spinlock);
+IRDA_ASSERT_LABEL(errlap:)
+	spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags);
+	return TRUE;
+
+	/* Error exit from within the unconnected loop.
+	 * Just one spinlock to release... Jean II */
+erruncon:
+	spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+	return TRUE;
+}
+
+/*
+ * Function irlmp_find_free_slsap ()
+ *
+ *    Find a free source LSAP to use when the service user did not pick one.
+ *    Returns: the selector, or 0 on any failure (the return type is a __u8)
+ */
+static __u8 irlmp_find_free_slsap(void)
+{
+	__u8 lsap_sel;
+	int wrapped = 0;
+
+	IRDA_ASSERT(irlmp != NULL, return 0;);
+	IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return 0;);
+
+	/* Most users don't really care which LSAPs they are given,
+	 * and therefore we automatically give them a free LSAP.
+	 * This function tries to find a suitable LSAP, i.e. which is
+	 * not in use and is within the acceptable range. Jean II */
+
+	do {
+		/* Always increment the LSAP number before using it.
+		 * In theory, we could reuse the last LSAP number, as long
+		 * as it is no longer in use. Some IrDA stacks do that.
+		 * However, the previous socket may be half closed, i.e.
+		 * we closed it, we think it's no longer in use, but the
+		 * other side did not receive our close and thinks it's
+		 * active and still sends data on it.
+		 * This is similar to what is done with PIDs and TCP ports.
+		 * Also, this reduces the number of calls to irlmp_slsap_inuse()
+		 * which is an expensive function to call.
+		 * Jean II */
+		irlmp->last_lsap_sel++;
+
+		/* Check if we need to wraparound (0x70-0x7f are reserved) */
+		if (irlmp->last_lsap_sel > LSAP_MAX) {
+			/* 0x00-0x10 are also reserved for well known ports */
+			irlmp->last_lsap_sel = 0x10;
+
+			/* Make sure we terminate the loop */
+			if (wrapped++) {
+				IRDA_ERROR("%s: no more free LSAPs !\n",
+					   __func__);
+				return 0;
+			}
+		}
+
+		/* If the LSAP is in use, try the next one.
+		 * Despite the autoincrement, we need to check if the lsap
+		 * is really in use or not, first because LSAP may be
+		 * directly allocated in irlmp_open_lsap(), and also because
+		 * we may wraparound on old sockets. Jean II */
+	} while (irlmp_slsap_inuse(irlmp->last_lsap_sel));
+
+	/* Got it ! */
+	lsap_sel = irlmp->last_lsap_sel;
+	IRDA_DEBUG(4, "%s(), found free lsap_sel=%02x\n",
+		   __func__, lsap_sel);
+
+	return lsap_sel;
+}
+
+/*
+ * Function irlmp_convert_lap_reason (lap_reason)
+ *
+ *    Converts IrLAP disconnect reason codes to IrLMP disconnect reason
+ *    codes
+ *
+ */
+LM_REASON irlmp_convert_lap_reason( LAP_REASON lap_reason)
+{
+	switch (lap_reason) {
+	case LAP_DISC_INDICATION:
+		/* The peer asked for the disconnection */
+		IRDA_DEBUG( 1, "%s(), LAP_DISC_INDICATION\n", __func__);
+		return LM_USER_REQUEST;
+
+	case LAP_NO_RESPONSE:
+		/* Too many retransmissions went unanswered */
+		IRDA_DEBUG( 1, "%s(), LAP_NO_RESPONSE\n", __func__);
+		return LM_LAP_DISCONNECT;
+
+	case LAP_RESET_INDICATION:
+		IRDA_DEBUG( 1, "%s(), LAP_RESET_INDICATION\n", __func__);
+		return LM_LAP_RESET;
+
+	case LAP_FOUND_NONE:
+	case LAP_MEDIA_BUSY:
+	case LAP_PRIMARY_CONFLICT:
+		IRDA_DEBUG(1, "%s(), LAP_FOUND_NONE, LAP_MEDIA_BUSY or LAP_PRIMARY_CONFLICT\n", __func__);
+		return LM_CONNECT_FAILURE;
+
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown IrLAP disconnect reason %d!\n",
+			   __func__, lap_reason);
+		break;
+	}
+
+	/* Anything unrecognised is reported as a plain LAP disconnect */
+	return LM_LAP_DISCONNECT;
+}
+
+#ifdef CONFIG_PROC_FS
+
+struct irlmp_iter_state {
+	hashbin_t *hashbin;	/* hashbin being walked and left locked, or NULL */
+};
+
+#define LSAP_START_TOKEN	((void *)1)	/* heading of the LSAP list */
+#define LINK_START_TOKEN	((void *)2)	/* heading of the link list */
+/* Return entry *off of iter->hashbin (first if !off); *off is counted down */
+static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off)
+{
+	void *element;
+
+	spin_lock_irq(&iter->hashbin->hb_spinlock);
+	for (element = hashbin_get_first(iter->hashbin);
+	     element != NULL;
+	     element = hashbin_get_next(iter->hashbin)) {
+		if (!off || (*off)-- == 0) {
+			/* NB: hashbin left locked */
+			return element;
+		}
+	}
+	spin_unlock_irq(&iter->hashbin->hb_spinlock);
+	iter->hashbin = NULL;
+	return NULL;
+}
+
+/* seq_file start: map *pos onto a heading token or a hashbin entry */
+static void *irlmp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct irlmp_iter_state *iter = seq->private;
+	void *v;
+	loff_t off = *pos;
+
+	iter->hashbin = NULL;
+	if (off-- == 0)
+		return LSAP_START_TOKEN;
+
+	iter->hashbin = irlmp->unconnected_lsaps;
+	v = irlmp_seq_hb_idx(iter, &off);
+	if (v)
+		return v;
+
+	if (off-- == 0)
+		return LINK_START_TOKEN;
+
+	iter->hashbin = irlmp->links;
+	return irlmp_seq_hb_idx(iter, &off);
+}
+/* seq_file next: advance from v to the following token or hashbin entry */
+static void *irlmp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct irlmp_iter_state *iter = seq->private;
+
+	++*pos;
+
+	if (v == LSAP_START_TOKEN) {		/* start of list of lsaps */
+		iter->hashbin = irlmp->unconnected_lsaps;
+		v = irlmp_seq_hb_idx(iter, NULL);
+		return v ? v : LINK_START_TOKEN;
+	}
+
+	if (v == LINK_START_TOKEN) {		/* start of list of links */
+		iter->hashbin = irlmp->links;
+		return irlmp_seq_hb_idx(iter, NULL);
+	}
+
+	v = hashbin_get_next(iter->hashbin);
+
+	if (v == NULL) {			/* no more in this hash bin */
+		spin_unlock_irq(&iter->hashbin->hb_spinlock);
+
+		if (iter->hashbin == irlmp->unconnected_lsaps)
+			v =  LINK_START_TOKEN;
+
+		iter->hashbin = NULL;
+	}
+	return v;
+}
+/* seq_file stop: release the hashbin lock if an iteration left it held */
+static void irlmp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct irlmp_iter_state *iter = seq->private;
+
+	if (iter->hashbin)
+		spin_unlock_irq(&iter->hashbin->hb_spinlock);
+}
+/* seq_file show: print a heading, an unconnected LSAP, or a link + its LSAPs */
+static int irlmp_seq_show(struct seq_file *seq, void *v)
+{
+	const struct irlmp_iter_state *iter = seq->private;
+	struct lsap_cb *self = v;
+
+	if (v == LSAP_START_TOKEN)
+		seq_puts(seq, "Unconnected LSAPs:\n");
+	else if (v == LINK_START_TOKEN)
+		seq_puts(seq, "\nRegistered Link Layers:\n");
+	else if (iter->hashbin == irlmp->unconnected_lsaps) {
+		self = v;
+		IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EINVAL; );
+		seq_printf(seq, "lsap state: %s, ",
+			   irlsap_state[ self->lsap_state]);
+		seq_printf(seq,
+			   "slsap_sel: %#02x, dlsap_sel: %#02x, ",
+			   self->slsap_sel, self->dlsap_sel);
+		seq_printf(seq, "(%s)", self->notify.name);
+		seq_printf(seq, "\n");
+	} else if (iter->hashbin == irlmp->links) {
+		struct lap_cb *lap = v;
+
+		seq_printf(seq, "lap state: %s, ",
+			   irlmp_state[lap->lap_state]);
+
+		seq_printf(seq, "saddr: %#08x, daddr: %#08x, ",
+			   lap->saddr, lap->daddr);
+		seq_printf(seq, "num lsaps: %d",
+			   HASHBIN_GET_SIZE(lap->lsaps));
+		seq_printf(seq, "\n");
+
+		/* Careful for priority inversions here !
+		 * All other uses of attrib spinlock are independent of
+		 * the object spinlock, so we are safe. Jean II */
+		spin_lock(&lap->lsaps->hb_spinlock);
+
+		seq_printf(seq, "\n  Connected LSAPs:\n");
+		for (self = (struct lsap_cb *) hashbin_get_first(lap->lsaps);
+		     self != NULL;
+		     self = (struct lsap_cb *)hashbin_get_next(lap->lsaps)) {
+			IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC,
+				    goto outloop;);
+			seq_printf(seq, "  lsap state: %s, ",
+				   irlsap_state[ self->lsap_state]);
+			seq_printf(seq,
+				   "slsap_sel: %#02x, dlsap_sel: %#02x, ",
+				   self->slsap_sel, self->dlsap_sel);
+			seq_printf(seq, "(%s)", self->notify.name);
+			seq_putc(seq, '\n');
+
+		}
+	IRDA_ASSERT_LABEL(outloop:)
+		spin_unlock(&lap->lsaps->hb_spinlock);
+		seq_putc(seq, '\n');
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+/* seq_file iterator over the unconnected LSAPs and the registered links */
+static const struct seq_operations irlmp_seq_ops = {
+	.start  = irlmp_seq_start,
+	.next   = irlmp_seq_next,
+	.stop   = irlmp_seq_stop,
+	.show   = irlmp_seq_show,
+};
+/* Open: attach irlmp_seq_ops with a private struct irlmp_iter_state */
+static int irlmp_seq_open(struct inode *inode, struct file *file)
+{
+	IRDA_ASSERT(irlmp != NULL, return -EINVAL;);
+
+	return seq_open_private(file, &irlmp_seq_ops,
+			sizeof(struct irlmp_iter_state));
+}
+/* File operations built on the seq_file helpers and irlmp_seq_open() */
+const struct file_operations irlmp_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = irlmp_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+
+#endif /* PROC_FS */
diff --git a/net/irda/irlmp_event.c b/net/irda/irlmp_event.c
new file mode 100644
index 00000000..9505a7d0
--- /dev/null
+++ b/net/irda/irlmp_event.c
@@ -0,0 +1,909 @@
+/*********************************************************************
+ *
+ * Filename:      irlmp_event.c
+ * Version:       0.8
+ * Description:   An IrDA LMP event driver for Linux
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Mon Aug  4 20:40:53 1997
+ * Modified at:   Tue Dec 14 23:04:16 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/kernel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/timer.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+#include <net/irda/irlmp_event.h>
+/* Printable names of the LAP states, indexed by lap_state */
+const char *const irlmp_state[] = {
+	"LAP_STANDBY",
+	"LAP_U_CONNECT",
+	"LAP_ACTIVE",
+};
+/* Printable names of the LSAP states, indexed by lsap_state */
+const char *const irlsap_state[] = {
+	"LSAP_DISCONNECTED",
+	"LSAP_CONNECT",
+	"LSAP_CONNECT_PEND",
+	"LSAP_DATA_TRANSFER_READY",
+	"LSAP_SETUP",
+	"LSAP_SETUP_PEND",
+};
+/* Printable names of the events, indexed by IRLMP_EVENT (debug builds only) */
+#ifdef CONFIG_IRDA_DEBUG
+static const char *const irlmp_event[] = {
+	"LM_CONNECT_REQUEST",
+	"LM_CONNECT_CONFIRM",
+	"LM_CONNECT_RESPONSE",
+	"LM_CONNECT_INDICATION",
+
+	"LM_DISCONNECT_INDICATION",
+	"LM_DISCONNECT_REQUEST",
+
+	"LM_DATA_REQUEST",
+	"LM_UDATA_REQUEST",
+	"LM_DATA_INDICATION",
+	"LM_UDATA_INDICATION",
+
+	"LM_WATCHDOG_TIMEOUT",
+
+	/* IrLAP events */
+	"LM_LAP_CONNECT_REQUEST",
+	"LM_LAP_CONNECT_INDICATION",
+	"LM_LAP_CONNECT_CONFIRM",
+	"LM_LAP_DISCONNECT_INDICATION",
+	"LM_LAP_DISCONNECT_REQUEST",
+	"LM_LAP_DISCOVERY_REQUEST",
+	"LM_LAP_DISCOVERY_CONFIRM",
+	"LM_LAP_IDLE_TIMEOUT",
+};
+#endif	/* CONFIG_IRDA_DEBUG */
+
+/* Prototypes of the LAP connection control state handlers */
+static void irlmp_state_standby  (struct lap_cb *, IRLMP_EVENT,
+				  struct sk_buff *);
+static void irlmp_state_u_connect(struct lap_cb *, IRLMP_EVENT,
+				  struct sk_buff *);
+static void irlmp_state_active   (struct lap_cb *, IRLMP_EVENT,
+				  struct sk_buff *);
+
+/* Prototypes of the LSAP connection control state handlers */
+static int irlmp_state_disconnected(struct lsap_cb *, IRLMP_EVENT,
+				    struct sk_buff *);
+static int irlmp_state_connect     (struct lsap_cb *, IRLMP_EVENT,
+				    struct sk_buff *);
+static int irlmp_state_connect_pend(struct lsap_cb *, IRLMP_EVENT,
+				    struct sk_buff *);
+static int irlmp_state_dtr         (struct lsap_cb *, IRLMP_EVENT,
+				    struct sk_buff *);
+static int irlmp_state_setup       (struct lsap_cb *, IRLMP_EVENT,
+				    struct sk_buff *);
+static int irlmp_state_setup_pend  (struct lsap_cb *, IRLMP_EVENT,
+				    struct sk_buff *);
+/* LAP state handlers, indexed by lap_state (same order as irlmp_state[]) */
+static void (*lap_state[]) (struct lap_cb *, IRLMP_EVENT, struct sk_buff *) =
+{
+	irlmp_state_standby,
+	irlmp_state_u_connect,
+	irlmp_state_active,
+};
+/* LSAP state handlers, indexed by lsap_state (same order as irlsap_state[]) */
+static int (*lsap_state[])( struct lsap_cb *, IRLMP_EVENT, struct sk_buff *) =
+{
+	irlmp_state_disconnected,
+	irlmp_state_connect,
+	irlmp_state_connect_pend,
+	irlmp_state_dtr,
+	irlmp_state_setup,
+	irlmp_state_setup_pend
+};
+/* Record the new state of a LAP (the debug trace below is disabled) */
+static inline void irlmp_next_lap_state(struct lap_cb *self,
+					IRLMP_STATE state)
+{
+	/*
+	IRDA_DEBUG(4, "%s(), LMP LAP = %s\n", __func__, irlmp_state[state]);
+	*/
+	self->lap_state = state;
+}
+/* Record the new state of an LSAP (the checks below are disabled) */
+static inline void irlmp_next_lsap_state(struct lsap_cb *self,
+					 LSAP_STATE state)
+{
+	/*
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_DEBUG(4, "%s(), LMP LSAP = %s\n", __func__, irlsap_state[state]);
+	*/
+	self->lsap_state = state;
+}
+
+/* Run the handler of the LSAP's current state; returns its result, or -1 */
+int irlmp_do_lsap_event(struct lsap_cb *self, IRLMP_EVENT event,
+			struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+	IRDA_DEBUG(4, "%s(), EVENT = %s, STATE = %s\n",
+		__func__, irlmp_event[event], irlsap_state[ self->lsap_state]);
+
+	return (*lsap_state[self->lsap_state]) (self, event, skb);
+}
+
+/*
+ * Function irlmp_do_lap_event (self, event, skb)
+ *
+ *    Do IrLAP control events: run the handler of the current LAP state
+ *
+ */
+void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event,
+			struct sk_buff *skb)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+	IRDA_DEBUG(4, "%s(), EVENT = %s, STATE = %s\n", __func__,
+		   irlmp_event[event],
+		   irlmp_state[self->lap_state]);
+
+	(*lap_state[self->lap_state]) (self, event, skb);
+}
+/* Discovery timer: expire the log, run a discovery, then re-arm the timer */
+void irlmp_discovery_timer_expired(void *data)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* We always cleanup the log (active & passive discovery) */
+	irlmp_do_expiry();
+
+	irlmp_do_discovery(sysctl_discovery_slots);
+
+	/* Restart timer */
+	irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout * HZ);
+}
+/* Watchdog timer: feed LM_WATCHDOG_TIMEOUT to the LSAP passed as data */
+void irlmp_watchdog_timer_expired(void *data)
+{
+	struct lsap_cb *self = (struct lsap_cb *) data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;);
+
+	irlmp_do_lsap_event(self, LM_WATCHDOG_TIMEOUT, NULL);
+}
+/* Idle timer: feed LM_LAP_IDLE_TIMEOUT to the LAP passed as data */
+void irlmp_idle_timer_expired(void *data)
+{
+	struct lap_cb *self = (struct lap_cb *) data;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+	irlmp_do_lap_event(self, LM_LAP_IDLE_TIMEOUT, NULL);
+}
+
+/*
+ * Send an event (without any skb) to every LSAP found in the given hashbin.
+ */
+static inline void
+irlmp_do_all_lsap_event(hashbin_t *	lsap_hashbin,
+			IRLMP_EVENT	event)
+{
+	struct lsap_cb *lsap;
+	struct lsap_cb *lsap_next;
+
+	/* Note : this function uses the new hashbin_find_next()
+	 * function, instead of the old hashbin_get_next().
+	 * This makes sure that we are always pointing one lsap
+	 * ahead, so that if the current lsap is removed as the
+	 * result of sending the event, we don't care.
+	 * Also, as we store the context ourselves, if an enumeration
+	 * of the same lsap hashbin happens as the result of sending the
+	 * event, we don't care.
+	 * The only problem is if the next lsap is removed. In that case,
+	 * hashbin_find_next() will return NULL and we will abort the
+	 * enumeration. - Jean II */
+
+	/* Also : we don't accept any skb in input. We can *NOT* pass
+	 * the same skb to multiple clients safely, we would need to
+	 * skb_clone() it. - Jean II */
+
+	lsap = (struct lsap_cb *) hashbin_get_first(lsap_hashbin);
+
+	while (NULL != hashbin_find_next(lsap_hashbin,
+					 (long) lsap,
+					 NULL,
+					 (void *) &lsap_next) ) {
+		irlmp_do_lsap_event(lsap, event, NULL);
+		lsap = lsap_next;
+	}
+}
+
+/*********************************************************************
+ *
+ *    LAP connection control states
+ *
+ ********************************************************************/
+
+/*
+ * Function irlmp_state_standby (self, event, skb)
+ *
+ *    STANDBY, The IrLAP connection does not exist.
+ *
+ */
+static void irlmp_state_standby(struct lap_cb *self, IRLMP_EVENT event,
+				struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+	IRDA_ASSERT(self->irlap != NULL, return;);
+
+	switch (event) {
+	case LM_LAP_DISCOVERY_REQUEST:
+		/* irlmp_next_station_state( LMP_DISCOVER); */
+
+		irlap_discovery_request(self->irlap, &irlmp->discovery_cmd);
+		break;
+	case LM_LAP_CONNECT_INDICATION:
+		/*  It's important to switch state first, to avoid IrLMP to
+		 *  think that the link is free since IrLMP may then start
+		 *  discovery before the connection is properly set up. DB.
+		 */
+		irlmp_next_lap_state(self, LAP_ACTIVE);
+
+		/* Just accept connection TODO, this should be fixed */
+		irlap_connect_response(self->irlap, skb);
+		break;
+	case LM_LAP_CONNECT_REQUEST:
+		IRDA_DEBUG(4, "%s() LS_CONNECT_REQUEST\n", __func__);
+
+		irlmp_next_lap_state(self, LAP_U_CONNECT);
+
+		/* FIXME: need to set users requested QoS */
+		irlap_connect_request(self->irlap, self->daddr, NULL, 0);
+		break;
+	case LM_LAP_DISCONNECT_INDICATION:
+		IRDA_DEBUG(4, "%s(), Error LM_LAP_DISCONNECT_INDICATION\n",
+			   __func__);
+
+		irlmp_next_lap_state(self, LAP_STANDBY);
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s\n",
+			   __func__, irlmp_event[event]);
+		break;
+	}
+}
+
+/*
+ * Function irlmp_state_u_connect (self, event, skb)
+ *
+ *    U_CONNECT, The layer above has tried to open an LSAP connection but
+ *    since the IrLAP connection does not exist, we must first start an
+ *    IrLAP connection. We are now waiting for a response from IrLAP.
+ */
+static void irlmp_state_u_connect(struct lap_cb *self, IRLMP_EVENT event,
+				  struct sk_buff *skb)
+{
+	IRDA_DEBUG(2, "%s(), event=%s\n", __func__, irlmp_event[event]);
+
+	switch (event) {
+	case LM_LAP_CONNECT_INDICATION:
+		/*  It's important to switch state first, to avoid IrLMP to
+		 *  think that the link is free since IrLMP may then start
+		 *  discovery before the connection is properly set up. DB.
+		 */
+		irlmp_next_lap_state(self, LAP_ACTIVE);
+
+		/* Just accept connection TODO, this should be fixed */
+		irlap_connect_response(self->irlap, skb);
+
+		/* Tell LSAPs that they can start sending data */
+		irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM);
+
+		/* Note : by the time we get there (LAP retries and co),
+		 * the lsaps may already have gone. This avoids getting stuck
+		 * forever in LAP_ACTIVE state - Jean II */
+		if (HASHBIN_GET_SIZE(self->lsaps) == 0) {
+			IRDA_DEBUG(0, "%s() NO LSAPs !\n",  __func__);
+			irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT);
+		}
+		break;
+	case LM_LAP_CONNECT_REQUEST:
+		/* Already trying to connect */
+		break;
+	case LM_LAP_CONNECT_CONFIRM:
+		/* For all lsap_ce E Associated do LS_Connect_confirm */
+		irlmp_next_lap_state(self, LAP_ACTIVE);
+
+		/* Tell LSAPs that they can start sending data */
+		irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM);
+
+		/* Note : by the time we get there (LAP retries and co),
+		 * the lsaps may already have gone. This avoids getting stuck
+		 * forever in LAP_ACTIVE state - Jean II */
+		if (HASHBIN_GET_SIZE(self->lsaps) == 0) {
+			IRDA_DEBUG(0, "%s() NO LSAPs !\n",  __func__);
+			irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT);
+		}
+		break;
+	case LM_LAP_DISCONNECT_INDICATION:
+		IRDA_DEBUG(4, "%s(), LM_LAP_DISCONNECT_INDICATION\n",  __func__);
+		irlmp_next_lap_state(self, LAP_STANDBY);
+
+		/* Send disconnect event to all LSAPs using this link */
+		irlmp_do_all_lsap_event(self->lsaps,
+					LM_LAP_DISCONNECT_INDICATION);
+		break;
+	case LM_LAP_DISCONNECT_REQUEST:
+		IRDA_DEBUG(4, "%s(), LM_LAP_DISCONNECT_REQUEST\n",  __func__);
+
+		/* One of the LSAPs did timeout or was closed, if it was
+		 * the last one, try to get out of here - Jean II */
+		if (HASHBIN_GET_SIZE(self->lsaps) <= 1) {
+			irlap_disconnect_request(self->irlap);
+		}
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s\n",
+			 __func__, irlmp_event[event]);
+		break;
+	}
+}
+
+/*
+ * Function irlmp_state_active (self, event, skb)
+ *
+ *    ACTIVE, IrLAP connection is active
+ *
+ */
+static void irlmp_state_active(struct lap_cb *self, IRLMP_EVENT event,
+			       struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	switch (event) {
+	case LM_LAP_CONNECT_REQUEST:
+		IRDA_DEBUG(4, "%s(), LS_CONNECT_REQUEST\n", __func__);
+
+		/*
+		 * IrLAP may have a pending disconnect. We tried to close
+		 * IrLAP, but it was postponed because the link was
+		 * busy or we were still sending packets. As we now
+		 * need it, make sure it stays on. Jean II
+		 */
+		irlap_clear_disconnect(self->irlap);
+
+		/*
+		 *  LAP connection already active, just bounce back! Since we
+		 *  don't know which LSAP that tried to do this, we have to
+		 *  notify all LSAPs using this LAP, but that should be safe to
+		 *  do anyway.
+		 */
+		irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM);
+
+		/* Needed by connect indication */
+		irlmp_do_all_lsap_event(irlmp->unconnected_lsaps,
+					LM_LAP_CONNECT_CONFIRM);
+		/* Keep state */
+		break;
+	case LM_LAP_DISCONNECT_REQUEST:
+		/*
+		 *  Need to find out if we should close IrLAP or not. If there
+		 *  is only one LSAP connection left on this link, that LSAP
+		 *  must be the one that tries to close IrLAP. It will be
+		 *  removed later and moved to the list of unconnected LSAPs
+		 */
+		if (HASHBIN_GET_SIZE(self->lsaps) > 0) {
+			/* Timer value is checked in irsysctl - Jean II */
+			irlmp_start_idle_timer(self, sysctl_lap_keepalive_time * HZ / 1000);
+		} else {
+			/* No more connections, so close IrLAP */
+
+			/* We don't want to change state just yet, because
+			 * we want to reflect accurately the real state of
+			 * the LAP, not the state we wish it was in,
+			 * so that we don't lose LM_LAP_CONNECT_REQUEST.
+			 * In some cases, IrLAP won't close the LAP
+			 * immediately. For example, it might still be
+			 * retrying packets or waiting for the pf bit.
+			 * As the LAP always sends a DISCONNECT_INDICATION
+			 * in PCLOSE or SCLOSE, just change state on that.
+			 * Jean II */
+			irlap_disconnect_request(self->irlap);
+		}
+		break;
+	case LM_LAP_IDLE_TIMEOUT:
+		if (HASHBIN_GET_SIZE(self->lsaps) == 0) {
+			/* Same reasoning as above - keep state */
+			irlap_disconnect_request(self->irlap);
+		}
+		break;
+	case LM_LAP_DISCONNECT_INDICATION:
+		irlmp_next_lap_state(self, LAP_STANDBY);
+
+		/* In some cases, at this point our side has already closed
+		 * all lsaps, and we are waiting for the idle_timer to
+		 * expire. If another device reconnects immediately, the
+		 * idle timer will expire in the middle of the connection
+		 * initialisation, screwing up things a lot...
+		 * Therefore, we must stop the timer... */
+		irlmp_stop_idle_timer(self);
+
+		/*
+		 *  Inform all connected LSAP's using this link
+		 */
+		irlmp_do_all_lsap_event(self->lsaps,
+					LM_LAP_DISCONNECT_INDICATION);
+
+		/* Force an expiry of the discovery log.
+		 * Now that the LAP is free, the system may attempt to
+		 * connect to another device. Unfortunately, our entries
+		 * are stale. There is a small window (<3s) before the
+		 * normal discovery will run and where irlmp_connect_request()
+		 * can get the wrong info, so make sure things get
+		 * cleaned *NOW* ;-) - Jean II */
+		irlmp_do_expiry();
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s\n",
+			 __func__, irlmp_event[event]);
+		break;
+	}
+}
+
+/*********************************************************************
+ *
+ *    LSAP connection control states
+ *
+ ********************************************************************/
+
+/*
+ * Function irlmp_state_disconnected (self, event, skb)
+ *
+ *    DISCONNECTED
+ *
+ */
+static int irlmp_state_disconnected(struct lsap_cb *self, IRLMP_EVENT event,
+				    struct sk_buff *skb)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+	switch (event) {
+#ifdef CONFIG_IRDA_ULTRA
+	case LM_UDATA_INDICATION:
+		/* This is most bizarre. Those packets are  aka unreliable
+		 * connected, aka IrLPT or SOCK_DGRAM/IRDAPROTO_UNITDATA.
+		 * Why do we pass them as Ultra ??? Jean II */
+		irlmp_connless_data_indication(self, skb);
+		break;
+#endif /* CONFIG_IRDA_ULTRA */
+	case LM_CONNECT_REQUEST:
+		IRDA_DEBUG(4, "%s(), LM_CONNECT_REQUEST\n", __func__);
+
+		if (self->conn_skb) {
+			IRDA_WARNING("%s: busy with another request!\n",
+				     __func__);
+			return -EBUSY;
+		}
+		/* Don't forget to refcount it (see irlmp_connect_request()) */
+		skb_get(skb);
+		self->conn_skb = skb;
+
+		irlmp_next_lsap_state(self, LSAP_SETUP_PEND);
+
+		/* Start watchdog timer (5 secs for now) */
+		irlmp_start_watchdog_timer(self, 5*HZ);
+
+		irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL);
+		break;
+	case LM_CONNECT_INDICATION:
+		if (self->conn_skb) {
+			IRDA_WARNING("%s: busy with another request!\n",
+				     __func__);
+			return -EBUSY;
+		}
+		/* Don't forget to refcount it (see irlap_driver_rcv()) */
+		skb_get(skb);
+		self->conn_skb = skb;
+
+		irlmp_next_lsap_state(self, LSAP_CONNECT_PEND);
+
+		/* Start watchdog timer
+		 * This is not mentioned in the spec, but there is a rare
+		 * race condition that can get the socket stuck.
+		 * If we receive this event while our LAP is closing down,
+		 * the LM_LAP_CONNECT_REQUEST gets lost and we get stuck in
+		 * CONNECT_PEND state forever.
+		 * The other cause of getting stuck down there is if the
+		 * higher layer never replies to the CONNECT_INDICATION.
+		 * Anyway, it makes sense to make sure that we always have
+		 * a backup plan. 1 second is plenty (should be immediate).
+		 * Jean II */
+		irlmp_start_watchdog_timer(self, 1*HZ);
+
+		irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL);
+		break;
+	default:
+		IRDA_DEBUG(1, "%s(), Unknown event %s on LSAP %#02x\n",
+			   __func__, irlmp_event[event], self->slsap_sel);
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlmp_state_connect (self, event, skb)
+ *
+ *    CONNECT, on LM_CONNECT_RESPONSE move the LSAP from the unconnected
+ *    hashbin to its LAP's hashbin and send CONNECT_CNF to the peer.
+ */
+static int irlmp_state_connect(struct lsap_cb *self, IRLMP_EVENT event,
+				struct sk_buff *skb)
+{
+	struct lsap_cb *lsap;
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+	switch (event) {
+	case LM_CONNECT_RESPONSE:
+		/*
+		 *  Bind this LSAP to the IrLAP link where the connect was
+		 *  received
+		 */
+		lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self,
+				      NULL);
+
+		IRDA_ASSERT(lsap == self, return -1;);
+		IRDA_ASSERT(self->lap != NULL, return -1;);
+		IRDA_ASSERT(self->lap->lsaps != NULL, return -1;);
+
+		hashbin_insert(self->lap->lsaps, (irda_queue_t *) self,
+			       (long) self, NULL);
+
+		set_bit(0, &self->connected);	/* TRUE */
+
+		irlmp_send_lcf_pdu(self->lap, self->dlsap_sel,
+				   self->slsap_sel, CONNECT_CNF, skb);
+
+		del_timer(&self->watchdog_timer);
+
+		irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY);
+		break;
+	case LM_WATCHDOG_TIMEOUT:
+		/* May happen, who knows...
+		 * Jean II */
+		IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n",  __func__);
+
+		/* Disconnect, get out... - Jean II */
+		self->lap = NULL;
+		self->dlsap_sel = LSAP_ANY;
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+		break;
+	default:
+		/* LM_LAP_DISCONNECT_INDICATION : Should never happen, we
+		 * are *not* yet bound to the IrLAP link. Jean II */
+		IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n",
+			   __func__, irlmp_event[event], self->slsap_sel);
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlmp_state_connect_pend (self, event, skb)
+ *
+ *    CONNECT_PEND, waiting for LM_LAP_CONNECT_CONFIRM before handing the
+ *    stored conn_skb to irlmp_connect_indication().
+ */
+static int irlmp_state_connect_pend(struct lsap_cb *self, IRLMP_EVENT event,
+				    struct sk_buff *skb)
+{
+	struct sk_buff *tx_skb;
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+	switch (event) {
+	case LM_CONNECT_REQUEST:
+		/* Keep state */
+		break;
+	case LM_CONNECT_RESPONSE:
+		IRDA_DEBUG(0, "%s(), LM_CONNECT_RESPONSE, "
+			   "no indication issued yet\n",  __func__);
+		/* Keep state */
+		break;
+	case LM_DISCONNECT_REQUEST:
+		IRDA_DEBUG(0, "%s(), LM_DISCONNECT_REQUEST, "
+			   "not yet bound to IrLAP connection\n",  __func__);
+		/* Keep state */
+		break;
+	case LM_LAP_CONNECT_CONFIRM:
+		IRDA_DEBUG(4, "%s(), LS_CONNECT_CONFIRM\n",  __func__);
+		irlmp_next_lsap_state(self, LSAP_CONNECT);
+
+		tx_skb = self->conn_skb;
+		self->conn_skb = NULL;
+
+		irlmp_connect_indication(self, tx_skb);
+		/* Drop reference count - see irlmp_connect_indication(). */
+		dev_kfree_skb(tx_skb);
+		break;
+	case LM_WATCHDOG_TIMEOUT:
+		/* Will happen in some rare cases because of a race condition.
+		 * Just make sure we don't stay there forever...
+		 * Jean II */
+		IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n",  __func__);
+
+		/* Go back to disconnected mode, keep the socket waiting */
+		self->lap = NULL;
+		self->dlsap_sel = LSAP_ANY;
+		if(self->conn_skb)
+			dev_kfree_skb(self->conn_skb);
+		self->conn_skb = NULL;
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+		break;
+	default:
+		/* LM_LAP_DISCONNECT_INDICATION : Should never happen, we
+		 * are *not* yet bound to the IrLAP link. Jean II */
+		IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n",
+			   __func__, irlmp_event[event], self->slsap_sel);
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlmp_state_dtr (self, event, skb)
+ *
+ *    DATA_TRANSFER_READY, data frames are passed between the LSAP and
+ *    its LAP; disconnect events return the LSAP to LSAP_DISCONNECTED.
+ */
+static int irlmp_state_dtr(struct lsap_cb *self, IRLMP_EVENT event,
+			   struct sk_buff *skb)
+{
+	LM_REASON reason;
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+	IRDA_ASSERT(self->lap != NULL, return -1;);
+
+	switch (event) {
+	case LM_DATA_REQUEST: /* Optimize for the common case */
+		irlmp_send_data_pdu(self->lap, self->dlsap_sel,
+				    self->slsap_sel, FALSE, skb);
+		break;
+	case LM_DATA_INDICATION: /* Optimize for the common case */
+		irlmp_data_indication(self, skb);
+		break;
+	case LM_UDATA_REQUEST:
+		IRDA_ASSERT(skb != NULL, return -1;);
+		irlmp_send_data_pdu(self->lap, self->dlsap_sel,
+				    self->slsap_sel, TRUE, skb);
+		break;
+	case LM_UDATA_INDICATION:
+		irlmp_udata_indication(self, skb);
+		break;
+	case LM_CONNECT_REQUEST:
+		IRDA_DEBUG(0, "%s(), LM_CONNECT_REQUEST, "
+			   "error, LSAP already connected\n", __func__);
+		/* Keep state */
+		break;
+	case LM_CONNECT_RESPONSE:
+		IRDA_DEBUG(0, "%s(), LM_CONNECT_RESPONSE, "
+			   "error, LSAP already connected\n", __func__);
+		/* Keep state */
+		break;
+	case LM_DISCONNECT_REQUEST:
+		irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, self->slsap_sel,
+				   DISCONNECT, skb);
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+		/* Called only from irlmp_disconnect_request(), will
+		 * unbind from LAP over there. Jean II */
+
+		/* Try to close the LAP connection if it's still there */
+		if (self->lap) {
+			IRDA_DEBUG(4, "%s(), trying to close IrLAP\n",
+				   __func__);
+			irlmp_do_lap_event(self->lap,
+					   LM_LAP_DISCONNECT_REQUEST,
+					   NULL);
+		}
+		break;
+	case LM_LAP_DISCONNECT_INDICATION:
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		reason = irlmp_convert_lap_reason(self->lap->reason);
+
+		irlmp_disconnect_indication(self, reason, NULL);
+		break;
+	case LM_DISCONNECT_INDICATION:
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		IRDA_ASSERT(self->lap != NULL, return -1;);
+		IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+		IRDA_ASSERT(skb != NULL, return -1;);
+		IRDA_ASSERT(skb->len > 3, return -1;);
+		reason = skb->data[3];
+
+		 /* Try to close the LAP connection */
+		IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", __func__);
+		irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+
+		irlmp_disconnect_indication(self, reason, skb);
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n",
+			   __func__, irlmp_event[event], self->slsap_sel);
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlmp_state_setup (self, event, skb)
+ *
+ *    SETUP, Station Control has set up the underlying IrLAP connection.
+ *    An LSAP connection request has been transmitted to the peer
+ *    LSAP-Connection Control FSM and we are awaiting a reply.
+ */
+static int irlmp_state_setup(struct lsap_cb *self, IRLMP_EVENT event,
+			     struct sk_buff *skb)
+{
+	LM_REASON reason;
+	int ret = 0;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;);
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	switch (event) {
+	case LM_CONNECT_CONFIRM:
+		irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY);
+
+		del_timer(&self->watchdog_timer);
+
+		irlmp_connect_confirm(self, skb);
+		break;
+	case LM_DISCONNECT_INDICATION:
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		IRDA_ASSERT(self->lap != NULL, return -1;);
+		IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+		IRDA_ASSERT(skb != NULL, return -1;);
+		IRDA_ASSERT(skb->len > 3, return -1;);
+		reason = skb->data[3];
+
+		 /* Try to close the LAP connection */
+		IRDA_DEBUG(4, "%s(), trying to close IrLAP\n",  __func__);
+		irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+
+		irlmp_disconnect_indication(self, reason, skb);
+		break;
+	case LM_LAP_DISCONNECT_INDICATION:
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		del_timer(&self->watchdog_timer);
+
+		IRDA_ASSERT(self->lap != NULL, return -1;);
+		IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;);
+
+		reason = irlmp_convert_lap_reason(self->lap->reason);
+
+		irlmp_disconnect_indication(self, reason, skb);
+		break;
+	case LM_WATCHDOG_TIMEOUT:
+		IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __func__);
+
+		IRDA_ASSERT(self->lap != NULL, return -1;);
+		irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL);
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n",
+			   __func__, irlmp_event[event], self->slsap_sel);
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Function irlmp_state_setup_pend (self, event, skb)
+ *
+ *    SETUP_PEND, An LM_CONNECT_REQUEST has been received from the service
+ *    user to set up an LSAP connection. A request has been sent to the
+ *    LAP FSM to set up the underlying IrLAP connection, and we
+ *    are awaiting confirm.
+ */
+static int irlmp_state_setup_pend(struct lsap_cb *self, IRLMP_EVENT event,
+				  struct sk_buff *skb)
+{
+	struct sk_buff *tx_skb;
+	LM_REASON reason;
+	int ret = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(irlmp != NULL, return -1;);
+
+	switch (event) {
+	case LM_LAP_CONNECT_CONFIRM:
+		IRDA_ASSERT(self->conn_skb != NULL, return -1;);
+
+		tx_skb = self->conn_skb;
+		self->conn_skb = NULL;
+
+		irlmp_send_lcf_pdu(self->lap, self->dlsap_sel,
+				   self->slsap_sel, CONNECT_CMD, tx_skb);
+		/* Drop reference count - see irlap_data_request(). */
+		dev_kfree_skb(tx_skb);
+
+		irlmp_next_lsap_state(self, LSAP_SETUP);
+		break;
+	case LM_WATCHDOG_TIMEOUT:
+		IRDA_DEBUG(0, "%s() : WATCHDOG_TIMEOUT !\n",  __func__);
+
+		IRDA_ASSERT(self->lap != NULL, return -1;);
+		irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL);
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL);
+		break;
+	case LM_LAP_DISCONNECT_INDICATION: /* LS_Disconnect.indication */
+		del_timer( &self->watchdog_timer);
+
+		irlmp_next_lsap_state(self, LSAP_DISCONNECTED);
+
+		reason = irlmp_convert_lap_reason(self->lap->reason);
+
+		irlmp_disconnect_indication(self, reason, NULL);
+		break;
+	default:
+		IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n",
+			   __func__, irlmp_event[event], self->slsap_sel);
+		break;
+	}
+	return ret;
+}
diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c
new file mode 100644
index 00000000..062e63b1
--- /dev/null
+++ b/net/irda/irlmp_frame.c
@@ -0,0 +1,490 @@
+/*********************************************************************
+ *
+ * Filename:      irlmp_frame.c
+ * Version:       0.9
+ * Description:   IrLMP frame implementation
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Tue Aug 19 02:09:59 1997
+ * Modified at:   Mon Dec 13 13:41:12 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999 Dag Brattli <dagb@cs.uit.no>
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/irda/timer.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irlmp_frame.h>
+#include <net/irda/discovery.h>
+
+static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap,
+				       __u8 slsap, int status, hashbin_t *);
+
+inline void irlmp_send_data_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap,
+				int expedited, struct sk_buff *skb)
+{
+	skb->data[0] = dlsap;	/* LSAP selectors overwrite bytes 0-1 of skb */
+	skb->data[1] = slsap;
+
+	if (expedited) {	/* same request, only the last argument differs */
+		IRDA_DEBUG(4, "%s(), sending expedited data\n", __func__);
+		irlap_data_request(self->irlap, skb, TRUE);
+	} else
+		irlap_data_request(self->irlap, skb, FALSE);
+}
+
+/*
+ * Function irlmp_send_lcf_pdu (self, dlsap, slsap, opcode, skb)
+ *
+ *    Send Link Control Frame to IrLAP, header goes in skb->data[0..3]
+ */
+void irlmp_send_lcf_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap,
+			__u8 opcode, struct sk_buff *skb)
+{
+	__u8 *frame;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	frame = skb->data;
+
+	frame[0] = dlsap | CONTROL_BIT;
+	frame[1] = slsap;
+
+	frame[2] = opcode;
+
+	if (opcode == DISCONNECT)
+		frame[3] = 0x01; /* Service user request */
+	else
+		frame[3] = 0x00; /* reserved */
+
+	irlap_data_request(self->irlap, skb, FALSE);
+}
+
+/*
+ * Function irlmp_link_data_indication (self, skb, unreliable)
+ *
+ *    Used by IrLAP to pass received data frames to IrLMP layer; finds
+ *    the LSAP for the frame and dispatches control or data events to it
+ */
+void irlmp_link_data_indication(struct lap_cb *self, struct sk_buff *skb,
+				int unreliable)
+{
+	struct lsap_cb *lsap;
+	__u8   slsap_sel;   /* Source (this) LSAP address */
+	__u8   dlsap_sel;   /* Destination LSAP address */
+	__u8   *fp;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+	IRDA_ASSERT(skb->len > 2, return;);
+
+	fp = skb->data;
+
+	/*
+	 *  The next statements may be confusing, but we do this so that
+	 *  destination LSAP of received frame is source LSAP in our view
+	 */
+	slsap_sel = fp[0] & LSAP_MASK;
+	dlsap_sel = fp[1];
+
+	/*
+	 *  Check if this is an incoming connection, since we must deal with
+	 *  it in a different way than other established connections.
+	 */
+	if ((fp[0] & CONTROL_BIT) && (fp[2] == CONNECT_CMD)) {
+		IRDA_DEBUG(3, "%s(), incoming connection, "
+			   "source LSAP=%d, dest LSAP=%d\n",
+			   __func__, slsap_sel, dlsap_sel);
+
+		/* Try to find LSAP among the unconnected LSAPs */
+		lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, CONNECT_CMD,
+				       irlmp->unconnected_lsaps);
+
+		/* Maybe LSAP was already connected, so try one more time */
+		if (!lsap) {
+			IRDA_DEBUG(1, "%s(), incoming connection for LSAP already connected\n", __func__);
+			lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0,
+					       self->lsaps);
+		}
+	} else
+		lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0,
+				       self->lsaps);
+
+	if (lsap == NULL) {
+		IRDA_DEBUG(2, "IrLMP, Sorry, no LSAP for received frame!\n");
+		IRDA_DEBUG(2, "%s(), slsap_sel = %02x, dlsap_sel = %02x\n",
+			   __func__, slsap_sel, dlsap_sel);
+		if (fp[0] & CONTROL_BIT) {
+			IRDA_DEBUG(2, "%s(), received control frame %02x\n",
+				   __func__, fp[2]);
+		} else {
+			IRDA_DEBUG(2, "%s(), received data frame\n", __func__);
+		}
+		return;
+	}
+
+	/*
+	 *  Control frames go through the LSAP state machine
+	 */
+	if (fp[0] & CONTROL_BIT) {
+		switch (fp[2]) {
+		case CONNECT_CMD:
+			lsap->lap = self;
+			irlmp_do_lsap_event(lsap, LM_CONNECT_INDICATION, skb);
+			break;
+		case CONNECT_CNF:
+			irlmp_do_lsap_event(lsap, LM_CONNECT_CONFIRM, skb);
+			break;
+		case DISCONNECT:
+			IRDA_DEBUG(4, "%s(), Disconnect indication!\n",
+				   __func__);
+			irlmp_do_lsap_event(lsap, LM_DISCONNECT_INDICATION,
+					    skb);
+			break;
+		case ACCESSMODE_CMD:
+			IRDA_DEBUG(0, "Access mode cmd not implemented!\n");
+			break;
+		case ACCESSMODE_CNF:
+			IRDA_DEBUG(0, "Access mode cnf not implemented!\n");
+			break;
+		default:
+			IRDA_DEBUG(0, "%s(), Unknown control frame %02x\n",
+				   __func__, fp[2]);
+			break;
+		}
+	} else if (unreliable) {
+		/* Optimize and bypass the state machine if possible */
+		if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY)
+			irlmp_udata_indication(lsap, skb);
+		else
+			irlmp_do_lsap_event(lsap, LM_UDATA_INDICATION, skb);
+	} else {
+		/* Optimize and bypass the state machine if possible */
+		if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY)
+			irlmp_data_indication(lsap, skb);
+		else
+			irlmp_do_lsap_event(lsap, LM_DATA_INDICATION, skb);
+	}
+}
+
+/*
+ * Function irlmp_link_unitdata_indication (self, skb)
+ *
+ *    Pass a received connectionless frame to the unconnected LSAP whose
+ *    selectors and PID match; the frame is ignored if none does.
+ */
+#ifdef CONFIG_IRDA_ULTRA
+void irlmp_link_unitdata_indication(struct lap_cb *self, struct sk_buff *skb)
+{
+	struct lsap_cb *lsap;
+	__u8   slsap_sel;   /* Source (this) LSAP address */
+	__u8   dlsap_sel;   /* Destination LSAP address */
+	__u8   pid;         /* Protocol identifier */
+	__u8   *fp;
+	unsigned long flags;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+	IRDA_ASSERT(skb->len > 2, return;);
+
+	fp = skb->data;
+
+	/*
+	 *  The next statements may be confusing, but we do this so that
+	 *  destination LSAP of received frame is source LSAP in our view
+	 */
+	slsap_sel = fp[0] & LSAP_MASK;
+	dlsap_sel = fp[1];
+	pid       = fp[2];
+
+	if (pid & 0x80) {
+		IRDA_DEBUG(0, "%s(), extension in PID not supp!\n",
+			   __func__);
+		return;
+	}
+
+	/* Check if frame is addressed to the connectionless LSAP */
+	if ((slsap_sel != LSAP_CONNLESS) || (dlsap_sel != LSAP_CONNLESS)) {
+		IRDA_DEBUG(0, "%s(), dropping frame!\n", __func__);
+		return;
+	}
+
+	/* Search the connectionless LSAP */
+	spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+	lsap = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps);
+	while (lsap != NULL) {
+		/*
+		 *  Check if source LSAP and dest LSAP selectors and PID match.
+		 */
+		if ((lsap->slsap_sel == slsap_sel) &&
+		    (lsap->dlsap_sel == dlsap_sel) &&
+		    (lsap->pid == pid))
+		{
+			break;
+		}
+		lsap = (struct lsap_cb *) hashbin_get_next(irlmp->unconnected_lsaps);
+	}
+	spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags);
+
+	if (lsap)
+		irlmp_connless_data_indication(lsap, skb);
+	else {
+		IRDA_DEBUG(0, "%s(), found no matching LSAP!\n", __func__);
+	}
+}
+#endif /* CONFIG_IRDA_ULTRA */
+
+/*
+ * Function irlmp_link_disconnect_indication (lap, irlap, reason, skb)
+ *
+ *    IrLAP has disconnected; record the reason, reset the destination
+ *    address and inform the station state machine
+ */
+void irlmp_link_disconnect_indication(struct lap_cb *lap,
+				      struct irlap_cb *irlap,
+				      LAP_REASON reason,
+				      struct sk_buff *skb)
+{
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(lap != NULL, return;);
+	IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;);
+
+	lap->reason = reason;
+	lap->daddr = DEV_ADDR_ANY;
+
+	/* FIXME: must do something with the skb if any */
+
+	/*
+	 *  Inform station state machine
+	 */
+	irlmp_do_lap_event(lap, LM_LAP_DISCONNECT_INDICATION, NULL);
+}
+
+/*
+ * Function irlmp_link_connect_indication (self, saddr, daddr, qos, skb)
+ *
+ *    Incoming LAP connection! Store qos and daddr, then raise
+ *    LM_LAP_CONNECT_INDICATION on the LAP state machine
+ */
+void irlmp_link_connect_indication(struct lap_cb *self, __u32 saddr,
+				   __u32 daddr, struct qos_info *qos,
+				   struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* Remember QoS settings for this session (pointer is stored) */
+	self->qos = qos;
+
+	/* Update destination device address */
+	self->daddr = daddr;
+	IRDA_ASSERT(self->saddr == saddr, return;);
+
+	irlmp_do_lap_event(self, LM_LAP_CONNECT_INDICATION, skb);
+}
+
+/*
+ * Function irlmp_link_connect_confirm (self, qos, skb)
+ *
+ *    LAP connection confirmed! Store qos, then raise
+ *    LM_LAP_CONNECT_CONFIRM on the LAP state machine
+ */
+void irlmp_link_connect_confirm(struct lap_cb *self, struct qos_info *qos,
+				struct sk_buff *skb)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+	IRDA_ASSERT(qos != NULL, return;);
+
+	/* Don't need to use the skb for now */
+
+	/* Remember QoS settings for this session (pointer is stored) */
+	self->qos = qos;
+
+	irlmp_do_lap_event(self, LM_LAP_CONNECT_CONFIRM, NULL);
+}
+
+/*
+ * Function irlmp_link_discovery_indication (self, discovery)
+ *
+ *    Device is discovering us
+ *
+ * It's not an answer to our own discoveries, just another device trying
+ * to perform discovery, but we don't want to miss the opportunity
+ * to exploit this information, because :
+ *	o We may not actively perform discovery (just passive discovery)
+ *	o This type of discovery is much more reliable. In some cases, it
+ *	  seems that less than 50% of our discoveries get an answer, while
+ *	  we always get ~100% of these.
+ *	o Make faster discovery, statistically divide time of discovery
+ *	  events by 2 (important for the latency aspect and user feel)
+ *	o Even if we do active discovery, the other node might not
+ *	  answer our discoveries (ex: Palm). The Palm will just perform
+ *	  one active discovery and connect directly to us.
+ *
+ * However, when both devices discover each other, they might attempt to
+ * connect to each other following the discovery event, and it would create
+ * collisions on the medium (SNRM battle).
+ * The "fix" for that is to disable all connection requests in IrLAP
+ * for 100ms after a discovery indication by setting the media_busy flag.
+ * Previously, we used to postpone the event which was quite ugly. Now
+ * that IrLAP takes care of this problem, just pass the event up...
+ *
+ * Jean II
+ */
+void irlmp_link_discovery_indication(struct lap_cb *self,
+				     discovery_t *discovery)
+{
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+	/* Add to main log, cleanup */
+	irlmp_add_discovery(irlmp->cachelog, discovery);
+
+	/* Just handle it the same way as a discovery confirm,
+	 * bypass the LM_LAP state machine (see below) */
+	irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_PASSIVE);
+}
+
+/*
+ * Function irlmp_link_discovery_confirm (self, log)
+ *
+ *    Called by IrLAP with a list of discoveries after the discovery
+ *    request has been carried out. A NULL log is received if IrLAP
+ *    was unable to carry out the discovery request
+ *
+ */
+void irlmp_link_discovery_confirm(struct lap_cb *self, hashbin_t *log)
+{
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;);
+
+	/* Add to main log, cleanup */
+	irlmp_add_discovery_log(irlmp->cachelog, log);
+
+	/* Propagate event to various LSAPs registered for it.
+	 * We bypass the LM_LAP state machine because
+	 *	1) We do it regardless of the LM_LAP state
+	 *	2) It doesn't affect the LM_LAP state
+	 *	3) Faster, slimmer, simpler, ...
+	 * Jean II */
+	irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_ACTIVE);
+}
+
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+static inline void irlmp_update_cache(struct lap_cb *lap,
+				      struct lsap_cb *lsap)
+{
+	/* Mark invalid while updating so a concurrent read gets no garbage */
+	lap->cache.valid = FALSE;
+	/* Update cache entry */
+	lap->cache.dlsap_sel = lsap->dlsap_sel;
+	lap->cache.slsap_sel = lsap->slsap_sel;
+	lap->cache.lsap = lsap;
+	lap->cache.valid = TRUE;
+}
+#endif
+
+/*
+ * Function irlmp_find_lsap (self, dlsap_sel, slsap_sel, status, queue)
+ *
+ *    Find handle associated with destination and source LSAP
+ *
+ * Any IrDA connection (LSAP/TSAP) is uniquely identified by
+ * 3 parameters, the local lsap, the remote lsap and the remote address.
+ * We may initiate multiple connections to the same remote service
+ * (they will have different local lsap), a remote device may initiate
+ * multiple connections to the same local service (they will have
+ * different remote lsap), or multiple devices may connect to the same
+ * service and may use the same remote lsap (and they will have
+ * different remote address).
+ * So, where is the remote address ? Each LAP connection is made with
+ * a single remote device, so it implies a specific remote address.
+ * Jean II
+ */
+static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel,
+				       __u8 slsap_sel, int status,
+				       hashbin_t *queue)
+{
+	struct lsap_cb *lsap;
+	unsigned long flags;
+
+	/*
+	 *  Optimize for the common case. We assume that this frame
+	 *  belongs to the same connection as the last one, so check in
+	 *  cache first to avoid the linear search
+	 */
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+	if ((self->cache.valid) &&
+	    (self->cache.slsap_sel == slsap_sel) &&
+	    (self->cache.dlsap_sel == dlsap_sel))
+	{
+		return self->cache.lsap;
+	}
+#endif
+
+	spin_lock_irqsave(&queue->hb_spinlock, flags);
+
+	lsap = (struct lsap_cb *) hashbin_get_first(queue);
+	while (lsap != NULL) {
+		/*
+		 *  If this is an incoming connection, then the destination
+		 *  LSAP selector may have been specified as LSAP_ANY so that
+		 *  any client can connect. In that case we only need to check
+		 *  if the source LSAP (in our view!) matches!
+		 */
+		if ((status == CONNECT_CMD) &&
+		    (lsap->slsap_sel == slsap_sel) &&
+		    (lsap->dlsap_sel == LSAP_ANY)) {
+			/* This is where the dest lsap sel is set on incoming
+			 * lsaps */
+			lsap->dlsap_sel = dlsap_sel;
+			break;
+		}
+		/*
+		 *  Check if source LSAP and dest LSAP selectors match.
+		 */
+		if ((lsap->slsap_sel == slsap_sel) &&
+		    (lsap->dlsap_sel == dlsap_sel))
+			break;
+
+		lsap = (struct lsap_cb *) hashbin_get_next(queue);
+	}
+#ifdef CONFIG_IRDA_CACHE_LAST_LSAP
+	if(lsap)
+		irlmp_update_cache(self, lsap);
+#endif
+	spin_unlock_irqrestore(&queue->hb_spinlock, flags);
+
+	/* Return what we've found or NULL */
+	return lsap;
+}
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
new file mode 100644
index 00000000..303a68d9
--- /dev/null
+++ b/net/irda/irmod.c
@@ -0,0 +1,211 @@
+/*********************************************************************
+ *
+ * Filename:      irmod.c
+ * Version:       0.9
+ * Description:   IrDA stack main entry points
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Mon Dec 15 13:55:39 1997
+ * Modified at:   Wed Jan  5 15:12:41 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1997, 1999-2000 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2004 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+/*
+ * This file contains the main entry points of the IrDA stack.
+ * They are in this file and not af_irda.c because some developers
+ * are using the IrDA stack without the socket API (compiling out
+ * af_irda.c).
+ * Jean II
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irmod.h>		/* notify_t */
+#include <net/irda/irlap.h>		/* irlap_init */
+#include <net/irda/irlmp.h>		/* irlmp_init */
+#include <net/irda/iriap.h>		/* iriap_init */
+#include <net/irda/irttp.h>		/* irttp_init */
+#include <net/irda/irda_device.h>	/* irda_device_init */
+
+/*
+ * Module parameters ("debug", only built with CONFIG_IRDA_DEBUG)
+ */
+#ifdef CONFIG_IRDA_DEBUG
+unsigned int irda_debug = IRDA_DEBUG_LEVEL;
+module_param_named(debug, irda_debug, uint, 0);
+MODULE_PARM_DESC(debug, "IRDA debugging level");
+EXPORT_SYMBOL(irda_debug);
+#endif
+
+/* Packet type handler.
+ * Tell the kernel how IrDA packets should be handled.
+ */
+static struct packet_type irda_packet_type __read_mostly = {
+	.type	= cpu_to_be16(ETH_P_IRDA),
+	.func	= irlap_driver_rcv,	/* Packet type handler, see irlap_frame.c */
+};
+
+/*
+ * Function irda_notify_init (notify)
+ *
+ *    Used for initializing the notify structure: clears all callbacks
+ *    and the instance pointer, and sets the name to "Unknown"
+ */
+void irda_notify_init(notify_t *notify)
+{
+	notify->data_indication = NULL;
+	notify->udata_indication = NULL;
+	notify->connect_confirm = NULL;
+	notify->connect_indication = NULL;
+	notify->disconnect_indication = NULL;
+	notify->flow_indication = NULL;
+	notify->status_indication = NULL;
+	notify->instance = NULL;
+	strlcpy(notify->name, "Unknown", sizeof(notify->name));
+}
+EXPORT_SYMBOL(irda_notify_init);
+
+/*
+ * Function irda_init (void)
+ *
+ *  Protocol stack initialisation entry point. Initialise the components
+ *  of the IrDA stack; on error, tear them down again and return ret (< 0).
+ */
+static int __init irda_init(void)
+{
+	int ret = 0;
+
+	IRDA_DEBUG(0, "%s()\n", __func__);
+
+	/* Lower layer of the stack */
+	irlmp_init();
+	irlap_init();
+
+	/* Driver/dongle support */
+	irda_device_init();
+
+	/* Higher layers of the stack */
+	iriap_init();
+	irttp_init();
+	ret = irsock_init();
+	if (ret < 0)
+		goto out_err_1;
+
+	/* Add IrDA packet type (Start receiving packets) */
+	dev_add_pack(&irda_packet_type);
+
+	/* External APIs */
+#ifdef CONFIG_PROC_FS
+	irda_proc_register();
+#endif
+#ifdef CONFIG_SYSCTL
+	ret = irda_sysctl_register();
+	if (ret < 0)
+		goto out_err_2;
+#endif
+
+	ret = irda_nl_register();
+	if (ret < 0)
+		goto out_err_3;
+
+	return 0;
+
+ out_err_3:
+#ifdef CONFIG_SYSCTL
+	irda_sysctl_unregister();
+ out_err_2:
+#endif
+#ifdef CONFIG_PROC_FS
+	irda_proc_unregister();
+#endif
+
+	/* Remove IrDA packet type (stop receiving packets) */
+	dev_remove_pack(&irda_packet_type);
+
+	/* Remove higher layers */
+	irsock_cleanup();
+ out_err_1:
+	irttp_cleanup();
+	iriap_cleanup();
+
+	/* Remove lower layers */
+	irda_device_cleanup();
+	irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */
+
+	/* Remove middle layer */
+	irlmp_cleanup();
+
+
+	return ret;
+}
+
+/*
+ * Function irda_cleanup (void)
+ *
+ *  Protocol stack cleanup/removal entry point.
+ *  Clean up the components of the IrDA stack, in reverse order of irda_init()
+ */
+static void __exit irda_cleanup(void)
+{
+	/* Remove External APIs */
+	irda_nl_unregister();
+
+#ifdef CONFIG_SYSCTL
+	irda_sysctl_unregister();
+#endif
+#ifdef CONFIG_PROC_FS
+	irda_proc_unregister();
+#endif
+
+	/* Remove IrDA packet type (stop receiving packets) */
+	dev_remove_pack(&irda_packet_type);
+
+	/* Remove higher layers */
+	irsock_cleanup();
+	irttp_cleanup();
+	iriap_cleanup();
+
+	/* Remove lower layers */
+	irda_device_cleanup();
+	irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */
+
+	/* Remove middle layer */
+	irlmp_cleanup();
+}
+
+/*
+ * The IrDA stack must be initialised *before* drivers get initialised,
+ * and *before* higher protocols (IrLAN/IrCOMM/IrNET) get initialised,
+ * otherwise bad things will happen (hashbins will be NULL for example).
+ * Those modules are at module_init()/device_initcall() level.
+ *
+ * On the other hand, it needs to be initialised *after* the basic
+ * networking, the /proc/net filesystem and sysctl module. Those are
+ * currently initialised in .../init/main.c (before initcalls).
+ * Also, IrDA drivers need to be initialised *after* the random number
+ * generator (main stack and higher layer init don't need it anymore).
+ *
+ * Jean II
+ */
+subsys_initcall(irda_init);
+module_exit(irda_cleanup);
+
+MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no> & Jean Tourrilhes <jt@hpl.hp.com>");
+MODULE_DESCRIPTION("The Linux IrDA Protocol Stack");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IRDA);
diff --git a/net/irda/irnet/Kconfig b/net/irda/irnet/Kconfig
new file mode 100644
index 00000000..28c557f0
--- /dev/null
+++ b/net/irda/irnet/Kconfig
@@ -0,0 +1,13 @@
+config IRNET
+	tristate "IrNET protocol"
+	depends on IRDA && PPP
+	help
+	  Say Y here if you want to build support for the IrNET protocol.
+	  To compile it as a module, choose M here: the module will be
+	  called irnet.  IrNET is a PPP driver, so you will also need a
+	  working PPP subsystem (driver, daemon and config)...
+
+	  IrNET is an alternate way to transfer TCP/IP traffic over IrDA.  It
+	  uses synchronous PPP over a set of point to point IrDA sockets.  You
+	  can use it between Linux machines or with W2k.
+
diff --git a/net/irda/irnet/Makefile b/net/irda/irnet/Makefile
new file mode 100644
index 00000000..61c365c8
--- /dev/null
+++ b/net/irda/irnet/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux IrDA IrNET protocol layer.
+#
+
+obj-$(CONFIG_IRNET) += irnet.o
+
+irnet-y := irnet_ppp.o irnet_irda.o
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
new file mode 100644
index 00000000..979ecb24
--- /dev/null
+++ b/net/irda/irnet/irnet.h
@@ -0,0 +1,528 @@
+/*
+ *	IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ *		Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file contains definitions and declarations global to the IrNET module,
+ * all grouped in one place...
+ * This file is a *private* header, so other modules don't want to know
+ * what's in there...
+ *
+ * Note : as most part of the Linux kernel, this module is available
+ * under the GNU General Public License (GPL).
+ */
+
+#ifndef IRNET_H
+#define IRNET_H
+
+/************************** DOCUMENTATION ***************************/
+/*
+ * What is IrNET
+ * -------------
+ * IrNET is a protocol allowing to carry TCP/IP traffic between two
+ * IrDA peers in an efficient fashion. It is a thin layer, passing PPP
+ * packets to IrTTP and vice versa. It uses PPP in synchronous mode,
+ * because IrTTP offer a reliable sequenced packet service (as opposed
+ * to a byte stream). In fact, you could see IrNET as carrying TCP/IP
+ * in a IrDA socket, using PPP to provide the glue.
+ *
+ * The main difference with traditional PPP over IrCOMM is that we
+ * avoid the framing and serial emulation which are a performance
+ * bottleneck. It also allows multipoint communications in a sensible
+ * fashion.
+ *
+ * The main difference with IrLAN is that we use PPP for the link
+ * management, which is more standard, interoperable and flexible than
+ * the IrLAN protocol. For example, PPP adds authentication,
+ * encryption, compression, header compression and automated routing
+ * setup. And, as IrNET let PPP do the hard work, the implementation
+ * is much simpler than IrLAN.
+ *
+ * The Linux implementation
+ * ------------------------
+ * IrNET is written on top of the Linux-IrDA stack, and interface with
+ * the generic Linux PPP driver. Because IrNET depend on recent
+ * changes of the PPP driver interface, IrNET will work only with very
+ * recent kernel (2.3.99-pre6 and up).
+ *
+ * The present implementation offer the following features :
+ *	o simple user interface using pppd
+ *	o efficient implementation (interface directly to PPP and IrTTP)
+ *	o addressing (you can specify the name of the IrNET recipient)
+ *	o multipoint operation (limited by IrLAP specification)
+ *	o information in /proc/net/irda/irnet
+ *	o IrNET events on /dev/irnet (for user space daemon)
+ *	o IrNET daemon (irnetd) to automatically handle incoming requests
+ *	o Windows 2000 compatibility (tested, but need more work)
+ * Currently missing :
+ *	o Lots of testing (that's your job)
+ *	o Connection retries (may be too hard to do)
+ *	o Check pppd persist mode
+ *	o User space daemon (to automatically handle incoming requests)
+ *
+ * The setup is not currently the most easy, but this should get much
+ * better when everything will get integrated...
+ *
+ * Acknowledgements
+ * ----------------
+ * This module is based on :
+ *	o The PPP driver (ppp_synctty/ppp_generic) by Paul Mackerras
+ *	o The IrLAN protocol (irlan_common/XXX) by Dag Brattli
+ *	o The IrSock interface (af_irda) by Dag Brattli
+ *	o Some other bits from the kernel and my drivers...
+ * Infinite thanks to those brave souls for providing the infrastructure
+ * upon which IrNET is built.
+ *
+ * Thanks to all my colleagues in HP for helping me. In particular,
+ * thanks to Salil Pradhan and Bill Serra for W2k testing...
+ * Thanks to Luiz Magalhaes for irnetd and much testing...
+ *
+ * Thanks to Alan Cox for answering lots of my stupid questions, and
+ * to Paul Mackerras answering my questions on how to best integrate
+ * IrNET and pppd.
+ *
+ * Jean II
+ *
+ * Note on some implementations choices...
+ * ------------------------------------
+ *	1) Direct interface vs tty/socket
+ * I could have used a tty interface to hook to ppp and use the full
+ * socket API to connect to IrDA. The code would have been easier to
+ * maintain, and maybe the code would have been smaller...
+ * Instead, we hook directly to ppp_generic and to IrTTP, which make
+ * things more complicated...
+ *
+ * The first reason is flexibility : this allow us to create IrNET
+ * instances on demand (no /dev/ircommX crap) and to allow linkname
+ * specification on pppd command line...
+ *
+ * Second reason is speed optimisation. If you look closely at the
+ * transmit and receive paths, you will notice that they are "super lean"
+ * (that's why they look ugly), with no function calls and as little data
+ * copy and modification as I could...
+ *
+ *	2) irnetd in user space
+ * irnetd is implemented in user space, which is necessary to call pppd.
+ * This also give maximum benefits in term of flexibility and customability,
+ * and allow to offer the event channel, useful for other stuff like debug.
+ *
+ * On the other hand, this require a loose coordination between the
+ * present module and irnetd. One critical area is how incoming request
+ * are handled.
+ * When irnet receive an incoming request, it send an event to irnetd and
+ * drop the incoming IrNET socket.
+ * irnetd start a pppd instance, which create a new IrNET socket. This new
+ * socket is then connected in the originating node to the pppd instance.
+ * At this point, in the originating node, the first socket is closed.
+ *
+ * I admit, this is a bit messy and waste some resources. The alternative
+ * is caching incoming socket, and that's also quite messy and waste
+ * resources.
+ * We also make connection time slower. For example, on a 115 kb/s link it
+ * adds 60ms to the connection time (770 ms). However, this is slower than
+ * the time it takes to fire up pppd on my P133...
+ *
+ *
+ * History :
+ * -------
+ *
+ * v1 - 15.5.00 - Jean II
+ *	o Basic IrNET (hook to ppp_generic & IrTTP - incl. multipoint)
+ *	o control channel on /dev/irnet (set name/address)
+ *	o event channel on /dev/irnet (for user space daemon)
+ *
+ * v2 - 5.6.00 - Jean II
+ *	o Enable DROP_NOT_READY to avoid PPP timeouts & other weirdness...
+ *	o Add DISCONNECT_TO event and rename DISCONNECT_FROM.
+ *	o Set official device number allocation on /dev/irnet
+ *
+ * v3 - 30.8.00 - Jean II
+ *	o Update to latest Linux-IrDA changes :
+ *		- queue_t => irda_queue_t
+ *	o Update to ppp-2.4.0 :
+ *		- move irda_irnet_connect from PPPIOCATTACH to TIOCSETD
+ *	o Add EXPIRE event (depend on new IrDA-Linux patch)
+ *	o Switch from `hashbin_remove' to `hashbin_remove_this' to fix
+ *	  a multilink bug... (depend on new IrDA-Linux patch)
+ *	o fix a self->daddr to self->raddr in irda_irnet_connect to fix
+ *	  another multilink bug (darn !)
+ *	o Remove LINKNAME_IOCTL cruft
+ *
+ * v3b - 31.8.00 - Jean II
+ *	o Dump discovery log at event channel startup
+ *
+ * v4 - 28.9.00 - Jean II
+ *	o Fix interaction between poll/select and dump discovery log
+ *	o Add IRNET_BLOCKED_LINK event (depend on new IrDA-Linux patch)
+ *	o Add IRNET_NOANSWER_FROM event (mostly to help support)
+ *	o Release flow control in disconnect_indication
+ *	o Block packets while connecting (speed up connections)
+ *
+ * v5 - 11.01.01 - Jean II
+ *	o Init self->max_header_size, just in case...
+ *	o Set up ap->chan.hdrlen, to get zero copy on tx side working.
+ *	o avoid tx->ttp->flow->ppp->tx->... loop, by checking flow state
+ *		Thanks to Christian Gennerat for finding this bug !
+ *	---
+ *	o Declare the proper MTU/MRU that we can support
+ *		(but PPP doesn't read the MTU value :-()
+ *	o Declare hashbin HB_NOLOCK instead of HB_LOCAL to avoid
+ *		disabling and enabling irq twice
+ *
+ * v6 - 31.05.01 - Jean II
+ *	o Print source address in Found, Discovery, Expiry & Request events
+ *	o Print requested source address in /proc/net/irnet
+ *	o Change control channel input. Allow multiple commands in one line.
+ *	o Add saddr command to change ap->rsaddr (and use that in IrDA)
+ *	---
+ *	o Make the IrDA connection procedure totally asynchronous.
+ *	  Heavy rewrite of the IAS query code and the whole connection
+ *	  procedure. Now, irnet_connect() no longer need to be called from
+ *	  a process context...
+ *	o Enable IrDA connect retries in ppp_irnet_send(). The good thing
+ *	  is that IrDA connect retries are directly driven by PPP LCP
+ *	  retries (we retry for each LCP packet), so that everything
+ *	  is transparently controlled from pppd lcp-max-configure.
+ *	o Add ttp_connect flag to prevent reentry on the connect procedure
+ *	o Test and fixups to eliminate side effects of retries
+ *
+ * v7 - 22.08.01 - Jean II
+ *	o Cleanup : Change "saddr = 0x0" to "saddr = DEV_ADDR_ANY"
+ *	o Fix bug in BLOCK_WHEN_CONNECT introduced in v6 : due to the
+ *	  asynchronous IAS query, self->tsap is NULL when PPP send the
+ *	  first packet.  This was preventing "connect-delay 0" to work.
+ *	  Change the test in ppp_irnet_send() to self->ttp_connect.
+ *
+ * v8 - 1.11.01 - Jean II
+ *	o Tighten the use of self->ttp_connect and self->ttp_open to
+ *	  prevent various race conditions.
+ *	o Avoid leaking discovery log and skb
+ *	o Replace "self" with "server" in irnet_connect_indication() to
+ *	  better detect cut'n'paste error ;-)
+ *
+ * v9 - 29.11.01 - Jean II
+ *	o Fix event generation in disconnect indication that I broke in v8
+ *	  It was always generation "No-Answer" because I was testing ttp_open
+ *	  just after clearing it. *blush*.
+ *	o Use newly created irttp_listen() to fix potential crash when LAP
+ *	  destroyed before irnet module removed.
+ *
+ * v10 - 4.3.2 - Jean II
+ *	o When receiving a disconnect indication, don't reenable the
+ *	  PPP Tx queue, this will trigger a reconnect. Instead, close
+ *	  the channel, which will kill pppd...
+ *
+ * v11 - 20.3.02 - Jean II
+ *	o Oops ! v10 fix disabled IrNET retries and passive behaviour.
+ *	  Better fix in irnet_disconnect_indication() :
+ *	  - if connected, kill pppd via hangup.
+ *	  - if not connected, reenable ppp Tx, which trigger IrNET retry.
+ *
+ * v12 - 10.4.02 - Jean II
+ *	o Fix race condition in irnet_connect_indication().
+ *	  If the socket was already trying to connect, drop old connection
+ *	  and use new one only if acting as primary. See comments.
+ *
+ * v13 - 30.5.02 - Jean II
+ *	o Update module init code
+ *
+ * v14 - 20.2.03 - Jean II
+ *	o Add discovery hint bits in the control channel.
+ *	o Remove obsolete MOD_INC/DEC_USE_COUNT in favor of .owner
+ *
+ * v15 - 7.4.03 - Jean II
+ *	o Replace spin_lock_irqsave() with spin_lock_bh() so that we can
+ *	  use ppp_unit_number(). It's probably also better overall...
+ *	o Disable call to ppp_unregister_channel(), because we can't do it.
+ */
+
+/***************************** INCLUDES *****************************/
+
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tty.h>
+#include <linux/proc_fs.h>
+#include <linux/netdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/capability.h>
+#include <linux/ctype.h>	/* isspace() */
+#include <linux/string.h>	/* skip_spaces() */
+#include <asm/uaccess.h>
+#include <linux/init.h>
+
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/ppp_channel.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/iriap.h>
+#include <net/irda/irias_object.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/irttp.h>
+#include <net/irda/discovery.h>
+
+/***************************** OPTIONS *****************************/
+/*
+ * Define or undefine to compile or not some optional part of the
+ * IrNET driver...
+ * Note : the present defaults make sense, play with that at your
+ * own risk...
+ */
+/* IrDA side of the business... */
+#define DISCOVERY_NOMASK	/* To enable W2k compatibility... */
+#define ADVERTISE_HINT		/* Advertise IrLAN hint bit */
+#define ALLOW_SIMULT_CONNECT	/* This seem to work, cross fingers... */
+#define DISCOVERY_EVENTS	/* Query the discovery log to post events */
+#define INITIAL_DISCOVERY	/* Dump current discovery log as events */
+#undef STREAM_COMPAT		/* Not needed - potentially messy */
+#undef CONNECT_INDIC_KICK	/* Might mess IrDA, not needed */
+#undef FAIL_SEND_DISCONNECT	/* Might mess IrDA, not needed */
+#undef PASS_CONNECT_PACKETS	/* Not needed ? Safe */
+#undef MISSING_PPP_API		/* Stuff I wish I could do */
+
+/* PPP side of the business */
+#define BLOCK_WHEN_CONNECT	/* Block packets when connecting */
+#define CONNECT_IN_SEND		/* Retry IrDA connection procedure */
+#undef FLUSH_TO_PPP		/* Not sure about this one, let's play safe */
+#undef SECURE_DEVIRNET		/* Bah... */
+
+/****************************** DEBUG ******************************/
+
+/*
+ * This set of flags enable and disable all the various warning,
+ * error and debug message of this driver.
+ * Each section can be enabled and disabled independently
+ */
+/* In the PPP part */
+#define DEBUG_CTRL_TRACE	0	/* Control channel */
+#define DEBUG_CTRL_INFO		0	/* various info */
+#define DEBUG_CTRL_ERROR	1	/* problems */
+#define DEBUG_FS_TRACE		0	/* filesystem callbacks */
+#define DEBUG_FS_INFO		0	/* various info */
+#define DEBUG_FS_ERROR		1	/* problems */
+#define DEBUG_PPP_TRACE		0	/* PPP related functions */
+#define DEBUG_PPP_INFO		0	/* various info */
+#define DEBUG_PPP_ERROR		1	/* problems */
+#define DEBUG_MODULE_TRACE	0	/* module insertion/removal */
+#define DEBUG_MODULE_ERROR	1	/* problems */
+
+/* In the IrDA part */
+#define DEBUG_IRDA_SR_TRACE	0	/* IRDA subroutines */
+#define DEBUG_IRDA_SR_INFO	0	/* various info */
+#define DEBUG_IRDA_SR_ERROR	1	/* problems */
+#define DEBUG_IRDA_SOCK_TRACE	0	/* IRDA main socket functions */
+#define DEBUG_IRDA_SOCK_INFO	0	/* various info */
+#define DEBUG_IRDA_SOCK_ERROR	1	/* problems */
+#define DEBUG_IRDA_SERV_TRACE	0	/* The IrNET server */
+#define DEBUG_IRDA_SERV_INFO	0	/* various info */
+#define DEBUG_IRDA_SERV_ERROR	1	/* problems */
+#define DEBUG_IRDA_TCB_TRACE	0	/* IRDA IrTTP callbacks */
+#define DEBUG_IRDA_CB_INFO	0	/* various info */
+#define DEBUG_IRDA_CB_ERROR	1	/* problems */
+#define DEBUG_IRDA_OCB_TRACE	0	/* IRDA other callbacks */
+#define DEBUG_IRDA_OCB_INFO	0	/* various info */
+#define DEBUG_IRDA_OCB_ERROR	1	/* problems */
+
+#define DEBUG_ASSERT		0	/* Verify all assertions */
+
+/*
+ * These are the macros we are using to actually print the debug
+ * statements. Don't look at it, it's ugly...
+ *
+ * One of the trick is that, as the DEBUG_XXX are constant, the
+ * compiler will optimise away the if() in all cases.
+ *
+ * In every macro, "dbg" is the suffix of one of the DEBUG_XXX flags
+ * defined above (e.g. CTRL_ERROR selects DEBUG_CTRL_ERROR).
+ *
+ * Caution : each macro expands to a plain { ... } block and not to a
+ * do { ... } while(0), so "if(x) DERROR(...); else ..." without braces
+ * around the macro call does not compile.
+ * Caution : DRETURN, DABORT and DASSERT contain a "return" statement
+ * and can therefore leave the calling function.
+ */
+/* All error messages (will show up in the normal logs)
+ * Printed at KERN_INFO level, prefixed with the calling function name */
+#define DERROR(dbg, format, args...) \
+	{if(DEBUG_##dbg) \
+		printk(KERN_INFO "irnet: %s(): " format, __func__ , ##args);}
+
+/* Normal debug message (will show up in /var/log/debug)
+ * Same layout as DERROR, but printed at KERN_DEBUG level */
+#define DEBUG(dbg, format, args...) \
+	{if(DEBUG_##dbg) \
+		printk(KERN_DEBUG "irnet: %s(): " format, __func__ , ##args);}
+
+/* Entering a function (trace)
+ * The function name is directly followed by the caller's format, which
+ * usually starts with the "(args...)" part */
+#define DENTER(dbg, format, args...) \
+	{if(DEBUG_##dbg) \
+		printk(KERN_DEBUG "irnet: -> %s" format, __func__ , ##args);}
+
+/* Entering and exiting a function in one go (trace) */
+#define DPASS(dbg, format, args...) \
+	{if(DEBUG_##dbg) \
+		printk(KERN_DEBUG "irnet: <>%s" format, __func__ , ##args);}
+
+/* Exiting a function (trace) */
+#define DEXIT(dbg, format, args...) \
+	{if(DEBUG_##dbg) \
+		printk(KERN_DEBUG "irnet: <-%s()" format, __func__ , ##args);}
+
+/* Exit a function with debug
+ * Emits the DEXIT trace, then always executes "return ret" */
+#define DRETURN(ret, dbg, args...) \
+	{DEXIT(dbg, ": " args);\
+	return ret; }
+
+/* Exit a function on failed condition
+ * The condition is always tested and "return ret" always executed when
+ * it is true ; only the error message depends on the DEBUG_XXX flag */
+#define DABORT(cond, ret, dbg, args...) \
+	{if(cond) {\
+		DERROR(dbg, args);\
+		return ret; }}
+
+/* Invalid assertion, print out an error and exit...
+ * Only active when DEBUG_ASSERT is non zero : with DEBUG_ASSERT set to 0
+ * the condition is not even evaluated and the function never returns
+ * from here */
+#define DASSERT(cond, ret, dbg, args...) \
+	{if((DEBUG_ASSERT) && !(cond)) {\
+		DERROR(dbg, "Invalid assertion: " args);\
+		return ret; }}
+
+/************************ CONSTANTS & MACROS ************************/
+
+/* Paranoia */
+#define IRNET_MAGIC	0xB00754
+
+/* Number of control events in the control channel buffer... */
+#define IRNET_MAX_EVENTS	8	/* Should be more than enough... */
+
+/****************************** TYPES ******************************/
+
+/*
+ * This is the main structure where we store all the data pertaining to
+ * one instance of irnet.
+ * Note : in irnet functions, a pointer to this structure is usually called
+ * "ap" or "self". If the code is borrowed from the IrDA stack, it tends
+ * to be called "self", and if it is borrowed from the PPP driver it is
+ * "ap". Apart from that, it's exactly the same structure ;-)
+ */
+typedef struct irnet_socket
+{
+  /* ------------------- Instance management ------------------- */
+  /* We manage a linked list of IrNET socket instances */
+  irda_queue_t		q;		/* Must be first - for hashbin */
+  int			magic;		/* Paranoia */
+
+  /* --------------------- FileSystem part --------------------- */
+  /* "pppd" interact directly with us on a /dev/ file */
+  struct file *		file;		/* File descriptor of this instance */
+  /* TTY stuff - to keep "pppd" happy */
+  struct ktermios	termios;	/* Various tty flags */
+  /* Stuff for the control channel */
+  int			event_index;	/* Last read in the event log */
+
+  /* ------------------------- PPP part ------------------------- */
+  /* We interface directly to the ppp_generic driver in the kernel */
+  int			ppp_open;	/* registered with ppp_generic */
+  struct ppp_channel	chan;		/* Interface to generic ppp layer */
+
+  int			mru;		/* Max size of PPP payload */
+  u32			xaccm[8];	/* Asynchronous character map (just */
+  u32			raccm;		/* to please pppd - dummy) */
+  unsigned int		flags;		/* PPP flags (compression, ...) */
+  unsigned int		rbits;		/* Unused receive flags ??? */
+  /* Work item run by irnet_ppp_disconnect(), which finds the socket
+   * back from this member with container_of() */
+  struct work_struct disconnect_work;   /* Process context disconnection */
+  /* ------------------------ IrTTP part ------------------------ */
+  /* We create a pseudo "socket" over the IrDA transport */
+  /* Both flags below are unsigned long because bit 0 is manipulated
+   * with the atomic bit operations (see clear_bit() in irnet_irda.c) */
+  unsigned long		ttp_open;	/* Set when IrTTP is ready */
+  unsigned long		ttp_connect;	/* Set when IrTTP is connecting */
+  struct tsap_cb *	tsap;		/* IrTTP instance (the connection) */
+
+  char			rname[NICKNAME_MAX_LEN + 1];
+					/* IrDA nickname of destination */
+  __u32			rdaddr;		/* Requested peer IrDA address */
+  __u32			rsaddr;		/* Requested local IrDA address */
+  __u32			daddr;		/* actual peer IrDA address */
+  __u32			saddr;		/* my local IrDA address */
+  __u8			dtsap_sel;	/* Remote TSAP selector */
+  __u8			stsap_sel;	/* Local TSAP selector */
+
+  __u32			max_sdu_size_rx;/* Socket parameters used for IrTTP */
+  __u32			max_sdu_size_tx;
+  __u32			max_data_size;
+  __u8			max_header_size;
+  LOCAL_FLOW		tx_flow;	/* State of the Tx path in IrTTP */
+
+  /* ------------------- IrLMP and IrIAS part ------------------- */
+  /* Used for IrDA Discovery and socket name resolution */
+  void *		ckey;		/* IrLMP client handle */
+  __u16			mask;		/* Hint bits mask (filter discov.)*/
+  int			nslots;		/* Number of slots for discovery */
+
+  struct iriap_cb *	iriap;		/* Used to query remote IAS */
+  int			errno;		/* status of the IAS query */
+					/* (0 or negative error code, */
+					/* set by irnet_ias_to_tsap()) */
+
+  /* -------------------- Discovery log part -------------------- */
+  /* Used by initial discovery on the control channel
+   * and by irnet_discover_daddr_and_lsap_sel() */
+  struct irda_device_info *discoveries;	/* Copy of the discovery log */
+  int			disco_index;	/* Last read in the discovery log */
+  int			disco_number;	/* Size of the discovery log */
+
+  /* NOTE(review): the users of this mutex are not visible from this
+   * header - confirm what it protects before relying on it */
+  struct mutex		lock;
+
+} irnet_socket;
+
+/*
+ * These are the various events that we will generate on the control
+ * channel. One of these values is stored in each irnet_log entry.
+ */
+typedef enum irnet_event
+{
+  IRNET_DISCOVER,		/* New IrNET node discovered */
+  IRNET_EXPIRE,			/* IrNET node expired */
+  IRNET_CONNECT_TO,		/* IrNET socket has connected to other node */
+  IRNET_CONNECT_FROM,		/* Other node has connected to IrNET socket */
+  IRNET_REQUEST_FROM,		/* Non satisfied connection request */
+  IRNET_NOANSWER_FROM,		/* Failed connection request */
+  IRNET_BLOCKED_LINK,		/* Link (IrLAP) is blocked for > 3s */
+  IRNET_DISCONNECT_FROM,	/* IrNET socket has disconnected */
+  IRNET_DISCONNECT_TO		/* Closing IrNET socket */
+} irnet_event;
+
+/*
+ * This is the storage for an event and its arguments.
+ * Entries are filled in by irnet_post_event() (irnet_irda.c).
+ */
+typedef struct irnet_log
+{
+  irnet_event	event;		/* Which event happened */
+  int		unit;		/* PPP unit number, -1 if PPP is not open */
+  __u32		saddr;		/* Source IrDA address given with the event */
+  __u32		daddr;		/* Destination IrDA address given with it */
+  /* IrDA nickname given with the event, empty string when none */
+  char		name[NICKNAME_MAX_LEN + 1];	/* 21 + 1 */
+  __u16_host_order hints;			/* Discovery hint bits */
+} irnet_log;
+
+/*
+ * This is the storage for all events and related stuff...
+ * The log is a circular buffer of IRNET_MAX_EVENTS entries written by
+ * irnet_post_event() : older entries are overwritten when it wraps.
+ */
+typedef struct irnet_ctrl_channel
+{
+  irnet_log	log[IRNET_MAX_EVENTS];	/* Event log */
+  /* Slot that the next event will be written to ; advanced modulo
+   * IRNET_MAX_EVENTS once the event is completely written */
+  int		index;		/* Current index in log */
+  /* Taken with spin_lock_bh() while an event is being written */
+  spinlock_t	spinlock;	/* Serialize access to the event log */
+  /* Woken up (all waiters) each time an event has been posted */
+  wait_queue_head_t	rwait;	/* processes blocked on read (or poll) */
+} irnet_ctrl_channel;
+
+/**************************** PROTOTYPES ****************************/
+/*
+ * Global functions of the IrNET module
+ * Note : we list here also functions called from one file to the other.
+ */
+
+/* -------------------------- IRDA PART -------------------------- */
+extern int
+	irda_irnet_create(irnet_socket *);	/* Initialise a IrNET socket */
+extern int
+	irda_irnet_connect(irnet_socket *);	/* Try to connect over IrDA */
+extern void
+	irda_irnet_destroy(irnet_socket *);	/* Teardown  a IrNET socket */
+extern int
+	irda_irnet_init(void);		/* Initialise IrDA part of IrNET */
+extern void
+	irda_irnet_cleanup(void);	/* Teardown IrDA part of IrNET */
+
+/**************************** VARIABLES ****************************/
+
+/* Control channel stuff - allocated in irnet_irda.h */
+extern struct irnet_ctrl_channel	irnet_events;
+
+#endif /* IRNET_H */
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
new file mode 100644
index 00000000..7f17a802
--- /dev/null
+++ b/net/irda/irnet/irnet_irda.c
@@ -0,0 +1,1885 @@
+/*
+ *	IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ *		Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file implement the IRDA interface of IrNET.
+ * Basically, we sit on top of IrTTP. We set up IrTTP, IrIAS properly,
+ * and exchange frames with IrTTP.
+ */
+
+#include "irnet_irda.h"		/* Private header */
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+/*
+ * Deferred PPP disconnection.
+ *
+ * This runs as a work item because ppp_unregister_channel() has to be
+ * called from process context. The IrNET socket is recovered from its
+ * embedded disconnect_work member.
+ */
+static void irnet_ppp_disconnect(struct work_struct *work)
+{
+	irnet_socket *ap;
+
+	ap = container_of(work, irnet_socket, disconnect_work);
+	if (!ap)
+		return;
+
+	/*
+	 * Nothing to do unless PPP is still registered while IrTTP is
+	 * neither open nor in the middle of a connection attempt.
+	 */
+	if (!ap->ppp_open || ap->ttp_open || ap->ttp_connect)
+		return;
+
+	/*
+	 * We were connected: close the PPP channel, which will
+	 * kill pppd (hangup) and the rest.
+	 */
+	ppp_unregister_channel(&ap->chan);
+	ap->ppp_open = 0;
+}
+
+/************************* CONTROL CHANNEL *************************/
+/*
+ * When ppp is not active, /dev/irnet act as a control channel.
+ * Writing allow to set up the IrDA destination of the IrNET channel,
+ * and any application may be read events happening on IrNET...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Post an event to the control channel...
+ * Put the event in the log, and then wake up all processes blocked on
+ * read so they can read the log...
+ *
+ *	ap	: IrNET socket the event relates to, may be NULL. Only used
+ *		  to fetch the PPP unit number when ap->ppp_open is set.
+ *	event	: type of event
+ *	saddr	: source IrDA address stored with the event
+ *	daddr	: destination IrDA address stored with the event
+ *	name	: IrDA nickname, may be NULL (stored as an empty string).
+ *		  Names longer than NICKNAME_MAX_LEN are truncated.
+ *	hints	: discovery hint bits
+ *
+ * The log is circular : when it is full the oldest entry is overwritten.
+ */
+static void
+irnet_post_event(irnet_socket *	ap,
+		 irnet_event	event,
+		 __u32		saddr,
+		 __u32		daddr,
+		 char *		name,
+		 __u16		hints)
+{
+  int			index;		/* In the log */
+
+  DENTER(CTRL_TRACE, "(ap=0x%p, event=%d, daddr=%08x, name=``%s'')\n",
+	 ap, event, daddr, name);
+
+  /* Protect this section via spinlock.
+   * Note : as we are the only event producer, we only need to exclude
+   * ourself when touching the log, which is nice and easy.
+   */
+  spin_lock_bh(&irnet_events.spinlock);
+
+  /* Copy the event in the log */
+  index = irnet_events.index;
+  irnet_events.log[index].event = event;
+  irnet_events.log[index].daddr = daddr;
+  irnet_events.log[index].saddr = saddr;
+  /* Try to copy IrDA nickname.
+   * Bounded copy : the log slot is a fixed size array, so never write
+   * past it whatever the caller hands us (always NUL terminated). */
+  if(name)
+    strlcpy(irnet_events.log[index].name, name,
+	    sizeof(irnet_events.log[index].name));
+  else
+    irnet_events.log[index].name[0] = '\0';
+  /* Copy hints */
+  irnet_events.log[index].hints.word = hints;
+  /* Try to get ppp unit number */
+  if((ap != (irnet_socket *) NULL) && (ap->ppp_open))
+    irnet_events.log[index].unit = ppp_unit_number(&ap->chan);
+  else
+    irnet_events.log[index].unit = -1;
+
+  /* Increment the index
+   * Note that we increment the index only after the event is written,
+   * to make sure that the readers don't get garbage... */
+  irnet_events.index = (index + 1) % IRNET_MAX_EVENTS;
+
+  DEBUG(CTRL_INFO, "New event index is %d\n", irnet_events.index);
+
+  /* Spin lock end */
+  spin_unlock_bh(&irnet_events.spinlock);
+
+  /* Now : wake up everybody waiting for events... */
+  wake_up_interruptible_all(&irnet_events.rwait);
+
+  DEXIT(CTRL_TRACE, "\n");
+}
+
+/************************* IRDA SUBROUTINES *************************/
+/*
+ * These are a bunch of subroutines called from other functions
+ * down there, mostly common code or to improve readability...
+ *
+ * Note : we duplicate quite heavily some routines of af_irda.c,
+ * because our input structure (self) is quite different
+ * (struct irnet instead of struct irda_sock), which make sharing
+ * the same code impossible (at least, without templates).
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_open_tsap (self)
+ *
+ *    Open the local Transport Service Access Point (TSAP)
+ *
+ * Create an IrTTP instance for this socket and register all our IrTTP
+ * callbacks with it.
+ * Return 0 on success, -EBUSY if the socket already owns a TSAP,
+ * -ENOMEM if IrTTP could not give us a new one.
+ */
+static inline int
+irnet_open_tsap(irnet_socket *	self)
+{
+  notify_t	ttp_cb;		/* IrTTP callback table */
+
+  DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+  /* One TSAP per socket : refuse to open a second one */
+  if(self->tsap != NULL)
+    {
+      DERROR(IRDA_SR_ERROR, "Already busy !\n");
+      return -EBUSY;
+    }
+
+  /* Describe ourselves to the IrDA stack */
+  irda_notify_init(&ttp_cb);
+  strlcpy(ttp_cb.name, IRNET_NOTIFY_NAME, sizeof(ttp_cb.name));
+  ttp_cb.instance		= self;
+  /* Connection management callbacks */
+  ttp_cb.connect_confirm	= irnet_connect_confirm;
+  ttp_cb.connect_indication	= irnet_connect_indication;
+  ttp_cb.disconnect_indication	= irnet_disconnect_indication;
+  /* Data path callbacks (udata_indication is left untouched) */
+  ttp_cb.data_indication	= irnet_data_indication;
+  ttp_cb.flow_indication	= irnet_flow_indication;
+  ttp_cb.status_indication	= irnet_status_indication;
+
+  /* Ask IrTTP for an instance on any free selector */
+  self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT,
+			       &ttp_cb);
+  if(self->tsap == NULL)
+    {
+      DERROR(IRDA_SR_ERROR, "Unable to allocate TSAP !\n");
+      return -ENOMEM;
+    }
+
+  /* Keep a copy of the selector that IrTTP picked for us */
+  self->stsap_sel = self->tsap->stsap_sel;
+
+  DEXIT(IRDA_SR_TRACE, " - tsap=0x%p, sel=0x%X\n",
+	self->tsap, self->stsap_sel);
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_ias_to_tsap (self, result, value)
+ *
+ *    Examine the answer to an IAS query and extract the TSAP selector
+ *
+ * We do an IAP query to find the TSAP associated with the IrNET service.
+ * When IrIAP hands us the result of the query, this function checks the
+ * result code and the returned value, and extracts the TSAP if possible.
+ * The value (when not NULL) is always deallocated here.
+ * The status is left in self->errno (0 or a negative error code).
+ * Return the TSAP selector, or 0 when none could be extracted. The
+ * return type is unsigned, so failures must be detected via self->errno.
+ */
+static inline __u8
+irnet_ias_to_tsap(irnet_socket *	self,
+		  int			result,
+		  struct ias_value *	value)
+{
+  __u8	tsap = 0;		/* Selector found, 0 if none */
+
+  DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+  /* Assume success, then translate the IAS result code */
+  self->errno = 0;
+  if((result == IAS_CLASS_UNKNOWN) || (result == IAS_ATTRIB_UNKNOWN))
+    {
+      /* Standard errors : service not available */
+      DEBUG(IRDA_SR_INFO, "IAS object doesn't exist ! (%d)\n", result);
+      self->errno = -EADDRNOTAVAIL;
+    }
+  else if(result != IAS_SUCCESS)
+    {
+      /* Other errors, most likely IrDA stack failure */
+      DEBUG(IRDA_SR_INFO, "IAS query failed ! (%d)\n", result);
+      self->errno = -EHOSTUNREACH;
+    }
+
+  if(value == NULL)
+    {
+      /* Nothing returned to us - usually result != SUCCESS.
+       * A success without any value is a stack bug, report it */
+      if(self->errno == 0)
+	{
+	  DERROR(IRDA_SR_ERROR,
+		 "IrDA bug : result == SUCCESS && value == NULL\n");
+	  self->errno = -EHOSTUNREACH;
+	}
+    }
+  else
+    {
+      /* Only an integer value can carry a TSAP selector */
+      if(value->type == IAS_INTEGER)
+	{
+	  DEBUG(IRDA_SR_INFO, "result=%d\n", value->t.integer);
+	  if(value->t.integer == -1)
+	    {
+	      self->errno = -EADDRNOTAVAIL;
+	    }
+	  else
+	    {
+	      /* Get the remote TSAP selector */
+	      tsap = value->t.integer;
+	    }
+	}
+      else
+	{
+	  self->errno = -EADDRNOTAVAIL;
+	  DERROR(IRDA_SR_ERROR, "bad type ! (0x%X)\n", value->type);
+	}
+
+      /* We own the value : release it */
+      irias_delete_value(value);
+    }
+  DEXIT(IRDA_SR_TRACE, "\n");
+
+  /* Return the TSAP */
+  return tsap;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_find_lsap_sel (self)
+ *
+ *    Try to lookup LSAP selector in remote LM-IAS
+ *
+ * Basically, we start an IAP query and return immediately : the request
+ * is non-blocking. When the query completes, IrDA calls us back in
+ * irnet_getvalue_confirm(), where the result is examined and the
+ * connection procedure carries on.
+ *
+ * Return 0 if the query was started, -EBUSY if a previous query is still
+ * pending on this socket, -ENOMEM if no IrIAP instance could be created
+ * (in which case the connection attempt is abandoned and
+ * self->ttp_connect is cleared, as irnet_connect_tsap() does on failure).
+ */
+static inline int
+irnet_find_lsap_sel(irnet_socket *	self)
+{
+  DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+  /* This should not happen */
+  DABORT(self->iriap, -EBUSY, IRDA_SR_ERROR, "busy with a previous query.\n");
+
+  /* Create an IAP instance, will be closed in irnet_getvalue_confirm() */
+  self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+			   irnet_getvalue_confirm);
+  /* Don't hand a NULL instance to IrIAP : no callback would ever come
+   * back, so give up now and allow a later connection attempt */
+  if(self->iriap == NULL)
+    {
+      clear_bit(0, &self->ttp_connect);
+      DERROR(IRDA_SR_ERROR, "Unable to allocate IrIAP instance !\n");
+      return -ENOMEM;
+    }
+
+  /* Treat unexpected signals as disconnect */
+  self->errno = -EHOSTUNREACH;
+
+  /* Query remote LM-IAS */
+  iriap_getvaluebyclass_request(self->iriap, self->rsaddr, self->daddr,
+				IRNET_SERVICE_NAME, IRNET_IAS_VALUE);
+
+  /* The above request is non-blocking.
+   * After a while, IrDA will call us back in irnet_getvalue_confirm()
+   * We will then call irnet_ias_to_tsap() and finish the
+   * connection procedure */
+
+  DEXIT(IRDA_SR_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_tsap (self)
+ *
+ *    Open the local TTP socket and start the TTP connection
+ *
+ * Return 0 when the connection request was issued, otherwise the error
+ * of the step that failed. On failure the "connecting" flag
+ * (self->ttp_connect) is cleared.
+ */
+static inline int
+irnet_connect_tsap(irnet_socket *	self)
+{
+  int		ret;
+
+  DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+  /* First get a local TSAP (an IrTTP instance), and only when we have
+   * one ask IrTTP to connect it to the remote device */
+  ret = irnet_open_tsap(self);
+  if(ret == 0)
+    ret = irttp_connect_request(self->tsap, self->dtsap_sel,
+				self->rsaddr, self->daddr, NULL,
+				self->max_sdu_size_rx, NULL);
+
+  /* Either step failed : we are no longer connecting */
+  if(ret != 0)
+    {
+      clear_bit(0, &self->ttp_connect);
+      DERROR(IRDA_SR_ERROR, "connect aborted!\n");
+      return ret;
+    }
+
+  /* The connect request is non-blocking.
+   * After a while, the IrDA stack will either call us back in
+   * irnet_connect_confirm() or irnet_disconnect_indication()
+   * See you there ;-) */
+
+  DEXIT(IRDA_SR_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discover_next_daddr (self)
+ *
+ *    Query the IrNET TSAP of the next device in the log.
+ *
+ * Used in the TSAP discovery procedure.
+ */
+static inline int
+irnet_discover_next_daddr(irnet_socket *	self)
+{
+  /* Return : 0 if a query was submitted for the next log entry,
+   *	       1 if the log is exhausted (the new IrIAP instance stays
+   *	       open, the caller gets to close it),
+   *	       -ENOMEM if no IrIAP instance could be opened. */
+
+  /* We can't reuse the IrIAP instance in the IrIAP callback, so get rid
+   * of the last one (if any) and start from a fresh one */
+  if(self->iriap != NULL)
+    {
+      iriap_close(self->iriap);
+      self->iriap = NULL;
+    }
+  self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
+			   irnet_discovervalue_confirm);
+  if(!self->iriap)
+    return -ENOMEM;
+
+  /* Move to the next discovery - before the call to avoid races */
+  self->disco_index++;
+
+  /* Nothing left to try in the log */
+  if(self->disco_index >= self->disco_number)
+    return 1;
+
+  /* Query remote LM-IAS of that device.
+   * The request is non-blocking. After a while, IrDA will call us back
+   * in irnet_discovervalue_confirm(). We will then call
+   * irnet_ias_to_tsap() and come back here again... */
+  iriap_getvaluebyclass_request(self->iriap,
+				self->discoveries[self->disco_index].saddr,
+				self->discoveries[self->disco_index].daddr,
+				IRNET_SERVICE_NAME, IRNET_IAS_VALUE);
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discover_daddr_and_lsap_sel (self)
+ *
+ *    Try to find a device offering the requested service.
+ *
+ * Initiate a TSAP discovery procedure.
+ * It basically look into the discovery log. For each address in the list,
+ * it queries the LM-IAS of the device to find if this device offer
+ * the requested service.
+ * If there is more than one node supporting the service, we complain
+ * to the user (it should move devices around).
+ * If we find one node which have the requested TSAP, we connect to it.
+ *
+ * This function just start the whole procedure. It request the discovery
+ * log and submit the first IAS query.
+ * The bulk of the job is handled in irnet_discovervalue_confirm()
+ *
+ * Note : this procedure fails if there is more than one device in range
+ * on the same dongle, because IrLMP doesn't disconnect the LAP when the
+ * last LSAP is closed. Moreover, we would need to wait the LAP
+ * disconnection...
+ */
+static inline int
+irnet_discover_daddr_and_lsap_sel(irnet_socket *	self)
+{
+  int	ret;
+
+  /* self : IrNET socket to connect. Its discovery state (discoveries,
+   *	     disco_number, disco_index) and daddr are reset here.
+   * Return : 0 when the first IAS query has been submitted.
+   *	       On failure DRETURN is given -ENETUNREACH (presumably what
+   *	       gets returned) and the connecting flag is cleared. */
+  DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+  /* Ask lmp for the current discovery log */
+  /* Note : we keep the returned log in the socket, and we are the ones
+   * releasing it with kfree() (see the error path below) */
+  self->discoveries = irlmp_get_discoveries(&self->disco_number, self->mask,
+					    DISCOVERY_DEFAULT_SLOTS);
+
+  /* Check if the we got some results */
+  if(self->discoveries == NULL)
+    {
+      self->disco_number = -1;
+      /* Not connecting anymore */
+      clear_bit(0, &self->ttp_connect);
+      DRETURN(-ENETUNREACH, IRDA_SR_INFO, "No Cachelog...\n");
+    }
+  DEBUG(IRDA_SR_INFO, "Got the log (0x%p), size is %d\n",
+	self->discoveries, self->disco_number);
+
+  /* Start with the first discovery */
+  /* Note : -1 because irnet_discover_next_daddr() increments the index
+   * before using it */
+  self->disco_index = -1;
+  self->daddr = DEV_ADDR_ANY;
+
+  /* This will fail if the log is empty - this is non-blocking */
+  /* Note : any non zero value (log exhausted or IrIAP open failure)
+   * is reported the same way below */
+  ret = irnet_discover_next_daddr(self);
+  if(ret)
+    {
+      /* Close IAP */
+      if(self->iriap)
+	iriap_close(self->iriap);
+      self->iriap = NULL;
+
+      /* Cleanup our copy of the discovery log */
+      kfree(self->discoveries);
+      self->discoveries = NULL;
+
+      clear_bit(0, &self->ttp_connect);
+      DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n");
+    }
+
+  /* Follow me in irnet_discovervalue_confirm() */
+
+  DEXIT(IRDA_SR_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_dname_to_daddr (self)
+ *
+ *    Convert an IrDA nickname to a valid IrDA address
+ *
+ * It basically look into the discovery log until there is a match.
+ */
+static inline int
+irnet_dname_to_daddr(irnet_socket *	self)
+{
+  struct irda_device_info *log;	/* Copy of the discovery log */
+  int	count;			/* Number of nodes in the log */
+  int	idx;
+
+  DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+
+  /* Get our own copy of the current discovery log from lmp */
+  log = irlmp_get_discoveries(&count, 0xffff,
+			      DISCOVERY_DEFAULT_SLOTS);
+  /* No log at all, nothing to search */
+  if(log == NULL)
+    DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n");
+
+  /* Walk the discovered devices (if any) and stop on the first one
+   * whose nickname is the one the client asked for */
+  for(idx = 0; idx < count; idx++)
+    {
+      if(strncmp(log[idx].info, self->rname, NICKNAME_MAX_LEN) == 0)
+	break;
+    }
+
+  /* Went through the whole log without a match : no luck ! */
+  if(idx >= count)
+    {
+      DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname);
+      kfree(log);
+      return -EADDRNOTAVAIL;
+    }
+
+  /* Got it, remember the address and release our copy of the log */
+  self->daddr = log[idx].daddr;
+  DEBUG(IRDA_SR_INFO, "discovered device ``%s'' at address 0x%08x.\n",
+	self->rname, self->daddr);
+  kfree(log);
+  DEXIT(IRDA_SR_TRACE, "\n");
+  return 0;
+}
+
+
+/************************* SOCKET ROUTINES *************************/
+/*
+ * These are the main operations on IrNET sockets, basically to create
+ * and destroy IrNET sockets. These are called from the PPP part...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Create a IrNET instance : just initialise some parameters...
+ */
+int
+irda_irnet_create(irnet_socket *	self)
+{
+  /* self : IrNET socket to initialise, storage is provided by the caller.
+   * Return : always 0.
+   * Note : only the fields listed below are touched; iriap, tsap and
+   * discoveries are not set here. */
+  DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self);
+
+  self->magic = IRNET_MAGIC;	/* Paranoia */
+
+  self->ttp_open = 0;		/* Prevent higher layer from accessing IrTTP */
+  self->ttp_connect = 0;	/* Not connecting yet */
+  self->rname[0] = '\0';	/* May be set via control channel */
+  self->rdaddr = DEV_ADDR_ANY;	/* May be set via control channel */
+  self->rsaddr = DEV_ADDR_ANY;	/* May be set via control channel */
+  self->daddr = DEV_ADDR_ANY;	/* Until we get connected */
+  self->saddr = DEV_ADDR_ANY;	/* Until we get connected */
+  self->max_sdu_size_rx = TTP_SAR_UNBOUND;
+
+  /* Register as a client with IrLMP */
+  /* Note : the key we get back is what irda_irnet_destroy() hands to
+   * irlmp_unregister_client() */
+  self->ckey = irlmp_register_client(0, NULL, NULL, NULL);
+#ifdef DISCOVERY_NOMASK
+  self->mask = 0xffff;		/* For W2k compatibility */
+#else /* DISCOVERY_NOMASK */
+  self->mask = irlmp_service_to_hint(S_LAN);
+#endif /* DISCOVERY_NOMASK */
+  self->tx_flow = FLOW_START;	/* Flow control from IrTTP */
+
+  /* Work item scheduled from irnet_disconnect_indication() when an open
+   * connection goes down */
+  INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect);
+
+  DEXIT(IRDA_SOCK_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Connect to the other side :
+ *	o convert device name to an address
+ *	o find the socket number (dlsap)
+ *	o Establish the connection
+ *
+ * Note : We no longer mimic af_irda. The IAS query for finding the TSAP
+ * is done asynchronously, like the TTP connection. This allow us to
+ * call this function from any context (not only process).
+ * The downside is that following what's happening in there is tricky
+ * because it involve various functions all over the place...
+ */
+int
+irda_irnet_connect(irnet_socket *	self)
+{
+  int		err;
+
+  /* self : IrNET socket to connect. rdaddr / rname (set via the control
+   *	     channel) select how the destination is found.
+   * Return : 0 when the asynchronous connection procedure was started,
+   *	       otherwise the value given to DRETURN (presumably returned) :
+   *	       -EBUSY if a connection attempt is already in progress, or
+   *	       the error of the address lookup that failed. */
+  DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self);
+
+  /* Check if we are already trying to connect.
+   * Because irda_irnet_connect() can be called directly by pppd plus
+   * packet retries in ppp_generic and connect may take time, plus we may
+   * race with irnet_connect_indication(), we need to be careful there... */
+  if(test_and_set_bit(0, &self->ttp_connect))
+    DRETURN(-EBUSY, IRDA_SOCK_INFO, "Already connecting...\n");
+  if((self->iriap != NULL) || (self->tsap != NULL))
+    DERROR(IRDA_SOCK_ERROR, "Socket not cleaned up...\n");
+
+  /* Insert ourselves in the hashbin so that the IrNET server can find us.
+   * Notes : 4th arg is string of 32 char max and must be null terminated
+   *	     When 4th arg is used (string), 3rd arg isn't (int)
+   *	     Can't re-insert (MUST remove first) so check for that... */
+  if((irnet_server.running) && (self->q.q_next == NULL))
+    {
+      spin_lock_bh(&irnet_server.spinlock);
+      hashbin_insert(irnet_server.list, (irda_queue_t *) self, 0, self->rname);
+      spin_unlock_bh(&irnet_server.spinlock);
+      DEBUG(IRDA_SOCK_INFO, "Inserted ``%s'' in hashbin...\n", self->rname);
+    }
+
+  /* If we don't have anything (no address, no name) */
+  if((self->rdaddr == DEV_ADDR_ANY) && (self->rname[0] == '\0'))
+    {
+      /* Try to find a suitable address */
+      /* Note : on failure, the connecting flag is already cleared for us */
+      if((err = irnet_discover_daddr_and_lsap_sel(self)) != 0)
+	DRETURN(err, IRDA_SOCK_INFO, "auto-connect failed!\n");
+      /* In most cases, the call above is non-blocking */
+    }
+  else
+    {
+      /* If we have only the name (no address), try to get an address */
+      if(self->rdaddr == DEV_ADDR_ANY)
+	{
+	  if((err = irnet_dname_to_daddr(self)) != 0)
+	    {
+	      /* The lookup is synchronous and nothing else was started,
+	       * so no IrDA callback will come and clear the connecting
+	       * flag for us. Do it here, otherwise every later attempt
+	       * would see "Already connecting..." */
+	      clear_bit(0, &self->ttp_connect);
+	      DRETURN(err, IRDA_SOCK_INFO, "name connect failed!\n");
+	    }
+	}
+      else
+	/* Use the requested destination address */
+	self->daddr = self->rdaddr;
+
+      /* Query remote LM-IAS to find LSAP selector */
+      irnet_find_lsap_sel(self);
+      /* The above call is non blocking */
+    }
+
+  /* At this point, we are waiting for the IrDA stack to call us back,
+   * or we have already failed.
+   * We will finish the connection procedure in irnet_connect_tsap().
+   */
+  DEXIT(IRDA_SOCK_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irda_irnet_destroy(self)
+ *
+ *    Destroy irnet instance
+ *
+ * Note : this need to be called from a process context.
+ */
+void
+irda_irnet_destroy(irnet_socket *	self)
+{
+  /* self : IrNET socket to tear down. NULL is accepted, nothing is done
+   *	     in that case.
+   * Note : the socket structure itself is not freed here, only what
+   * hangs off it (hashbin entry, IrLMP client, IrIAP, log, TSAP). */
+  DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self);
+  if(self == NULL)
+    return;
+
+  /* Remove ourselves from hashbin (if we are queued in hashbin)
+   * Note : `irnet_server.running' protect us from calls in hashbin_delete() */
+  if((irnet_server.running) && (self->q.q_next != NULL))
+    {
+      struct irnet_socket *	entry;
+      DEBUG(IRDA_SOCK_INFO, "Removing from hash..\n");
+      spin_lock_bh(&irnet_server.spinlock);
+      entry = hashbin_remove_this(irnet_server.list, (irda_queue_t *) self);
+      /* q_next == NULL is what irda_irnet_connect() tests before
+       * inserting us again */
+      self->q.q_next = NULL;
+      spin_unlock_bh(&irnet_server.spinlock);
+      DASSERT(entry == self, , IRDA_SOCK_ERROR, "Can't remove from hash.\n");
+    }
+
+  /* If we were connected, post a message */
+  if(test_bit(0, &self->ttp_open))
+    {
+      /* Note : as the disconnect comes from ppp_generic, the unit number
+       * doesn't exist anymore when we post the event, so we need to pass
+       * NULL as the first arg... */
+      irnet_post_event(NULL, IRNET_DISCONNECT_TO,
+		       self->saddr, self->daddr, self->rname, 0);
+    }
+
+  /* Prevent various IrDA callbacks from messing up things
+   * Need to be first */
+  /* Note : "first" here means before the IrDA resources below are
+   * released; the callbacks check this flag before touching the socket */
+  clear_bit(0, &self->ttp_connect);
+
+  /* Prevent higher layer from accessing IrTTP */
+  clear_bit(0, &self->ttp_open);
+
+  /* Unregister with IrLMP */
+  irlmp_unregister_client(self->ckey);
+
+  /* Unregister with LM-IAS */
+  if(self->iriap)
+    {
+      iriap_close(self->iriap);
+      self->iriap = NULL;
+    }
+
+  /* Cleanup eventual discoveries from connection attempt or control channel */
+  if(self->discoveries != NULL)
+    {
+      /* Cleanup our copy of the discovery log */
+      kfree(self->discoveries);
+      self->discoveries = NULL;
+    }
+
+  /* Close our IrTTP connection */
+  if(self->tsap)
+    {
+      DEBUG(IRDA_SOCK_INFO, "Closing our TTP connection.\n");
+      /* Tell the peer first, then release our TSAP */
+      irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+      irttp_close_tsap(self->tsap);
+      self->tsap = NULL;
+    }
+  self->stsap_sel = 0;
+
+  DEXIT(IRDA_SOCK_TRACE, "\n");
+}
+
+
+/************************** SERVER SOCKET **************************/
+/*
+ * The IrNET service is composed of one server socket and a variable
+ * number of regular IrNET sockets. The server socket is supposed to
+ * handle incoming connections and redirect them to one IrNET sockets.
+ * It's a superset of the regular IrNET socket, but has a very distinct
+ * behaviour...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_daddr_to_dname (self)
+ *
+ *    Convert an IrDA address to a IrDA nickname
+ *
+ * It basically look into the discovery log until there is a match.
+ */
+static inline int
+irnet_daddr_to_dname(irnet_socket *	self)
+{
+  struct irda_device_info *discoveries;	/* Copy of the discovery log */
+  int	number;			/* Number of nodes in the log */
+  int	i;
+
+  /* self : socket whose daddr is looked up in the discovery log.
+   *	     On success, self->rname receives the nickname of the device.
+   * Return : 0 if the address was found, -EADDRNOTAVAIL if it is not in
+   *	       the log. DRETURN is given -ENETUNREACH (presumably returned)
+   *	       when there is no log at all. */
+  DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self);
+
+  /* Ask lmp for the current discovery log */
+  discoveries = irlmp_get_discoveries(&number, 0xffff,
+				      DISCOVERY_DEFAULT_SLOTS);
+  /* Check if the we got some results */
+  if (discoveries == NULL)
+    DRETURN(-ENETUNREACH, IRDA_SERV_INFO, "Cachelog empty...\n");
+
+  /* Now, check all discovered devices (if any) */
+  for(i = 0; i < number; i++)
+    {
+      /* Does the address match ? */
+      if(discoveries[i].daddr == self->daddr)
+	{
+	  /* Yes !!! Get it..
+	   * Note : strlcpy() always NUL terminates the destination within
+	   * the size it is given, no need to do it by hand afterwards */
+	  strlcpy(self->rname, discoveries[i].info, sizeof(self->rname));
+	  DEBUG(IRDA_SERV_INFO, "Device 0x%08x is in fact ``%s''.\n",
+		self->daddr, self->rname);
+	  kfree(discoveries);
+	  DEXIT(IRDA_SERV_TRACE, "\n");
+	  return 0;
+	}
+    }
+  /* No luck ! */
+  DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr);
+  kfree(discoveries);
+  return -EADDRNOTAVAIL;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_find_socket (self)
+ *
+ *    Find the correct IrNET socket
+ *
+ * Look into the list of IrNET sockets and finds one with the right
+ * properties...
+ */
+static inline irnet_socket *
+irnet_find_socket(irnet_socket *	self)
+{
+  irnet_socket *	new = (irnet_socket *) NULL;
+  int			err;
+
+  /* self : socket that received the request (its tsap is used to get the
+   *	     addresses); its daddr, saddr and possibly rname are updated.
+   * Return : the matching socket from irnet_server.list, or NULL.
+   * Three attempts, in order : by nickname, by address, first free one.
+   * Note : the spinlock is released before returning, so the caller gets
+   * the pointer without the list lock held. */
+  DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self);
+
+  /* Get the addresses of the requester */
+  self->daddr = irttp_get_daddr(self->tsap);
+  self->saddr = irttp_get_saddr(self->tsap);
+
+  /* Try to get the IrDA nickname of the requester */
+  /* Note : failure is not fatal, we just skip the search by name */
+  err = irnet_daddr_to_dname(self);
+
+  /* Protect access to the instance list */
+  spin_lock_bh(&irnet_server.spinlock);
+
+  /* So now, try to get an socket having specifically
+   * requested that nickname */
+  if(err == 0)
+    {
+      new = (irnet_socket *) hashbin_find(irnet_server.list,
+					  0, self->rname);
+      if(new)
+	DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches rname ``%s''.\n",
+	      new, new->rname);
+    }
+
+  /* If no name matches, try to find an socket by the destination address */
+  /* It can be either the requested destination address (set via the
+   * control channel), or the current destination address if the
+   * socket is in the middle of a connection request */
+  if(new == (irnet_socket *) NULL)
+    {
+      new = (irnet_socket *) hashbin_get_first(irnet_server.list);
+      while(new !=(irnet_socket *) NULL)
+	{
+	  /* Does it have the same address ? */
+	  if((new->rdaddr == self->daddr) || (new->daddr == self->daddr))
+	    {
+	      /* Yes !!! Get it.. */
+	      DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches daddr %#08x.\n",
+		    new, self->daddr);
+	      break;
+	    }
+	  new = (irnet_socket *) hashbin_get_next(irnet_server.list);
+	}
+    }
+
+  /* If we don't have any socket, get the first unconnected socket */
+  if(new == (irnet_socket *) NULL)
+    {
+      new = (irnet_socket *) hashbin_get_first(irnet_server.list);
+      while(new !=(irnet_socket *) NULL)
+	{
+	  /* Is it available ? */
+	  /* i.e. not connected, no destination (address or name) requested
+	   * via the control channel, and the PPP side is open */
+	  if(!(test_bit(0, &new->ttp_open)) && (new->rdaddr == DEV_ADDR_ANY) &&
+	     (new->rname[0] == '\0') && (new->ppp_open))
+	    {
+	      /* Yes !!! Get it.. */
+	      DEBUG(IRDA_SERV_INFO, "Socket 0x%p is free.\n",
+		    new);
+	      break;
+	    }
+	  new = (irnet_socket *) hashbin_get_next(irnet_server.list);
+	}
+    }
+
+  /* Spin lock end */
+  spin_unlock_bh(&irnet_server.spinlock);
+
+  DEXIT(IRDA_SERV_TRACE, " - new = 0x%p\n", new);
+  return new;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_socket (server, new, qos, max_sdu_size, max_header_size)
+ *
+ *    Connect an incoming connection to the socket
+ *
+ */
+static inline int
+irnet_connect_socket(irnet_socket *	server,
+		     irnet_socket *	new,
+		     struct qos_info *	qos,
+		     __u32		max_sdu_size,
+		     __u8		max_header_size)
+{
+  /* server : the server socket holding the incoming connection
+   * new : the IrNET socket that takes over the connection
+   * qos : not used in this function
+   * max_sdu_size, max_header_size : values recorded in the new socket
+   * Return : 0 on success. DABORT is given -1 for the case where the
+   *	       TSAP can't be duplicated (presumably what is returned). */
+  DENTER(IRDA_SERV_TRACE, "(server=0x%p, new=0x%p)\n",
+	 server, new);
+
+  /* Now attach up the new socket */
+  new->tsap = irttp_dup(server->tsap, new);
+  DABORT(new->tsap == NULL, -1, IRDA_SERV_ERROR, "dup failed!\n");
+
+  /* Set up all the relevant parameters on the new socket */
+  new->stsap_sel = new->tsap->stsap_sel;
+  new->dtsap_sel = new->tsap->dtsap_sel;
+  new->saddr = irttp_get_saddr(new->tsap);
+  new->daddr = irttp_get_daddr(new->tsap);
+
+  new->max_header_size = max_header_size;
+  new->max_sdu_size_tx = max_sdu_size;
+  new->max_data_size   = max_sdu_size;
+#ifdef STREAM_COMPAT
+  /* If we want to receive "stream sockets" */
+  if(max_sdu_size == 0)
+    new->max_data_size = irttp_get_max_seg_size(new->tsap);
+#endif /* STREAM_COMPAT */
+
+  /* Clean up the original one to keep it in listen state */
+  irttp_listen(server->tsap);
+
+  /* Send a connection response on the new socket */
+  irttp_connect_response(new->tsap, new->max_sdu_size_rx, NULL);
+
+  /* Allow PPP to send its junk over the new socket... */
+  set_bit(0, &new->ttp_open);
+
+  /* Not connecting anymore, and clean up last possible remains
+   * of connection attempts on the socket */
+  clear_bit(0, &new->ttp_connect);
+  if(new->iriap)
+    {
+      iriap_close(new->iriap);
+      new->iriap = NULL;
+    }
+  if(new->discoveries != NULL)
+    {
+      kfree(new->discoveries);
+      new->discoveries = NULL;
+    }
+
+#ifdef CONNECT_INDIC_KICK
+  /* As currently we don't block packets in ppp_irnet_send() while passive,
+   * this is not really needed...
+   * Also, not doing it give IrDA a chance to finish the setup properly
+   * before being swamped with packets... */
+  ppp_output_wakeup(&new->chan);
+#endif /* CONNECT_INDIC_KICK */
+
+  /* Notify the control channel */
+  /* Note : the name passed is the one stored in the server socket */
+  irnet_post_event(new, IRNET_CONNECT_FROM,
+		   new->saddr, new->daddr, server->rname, 0);
+
+  DEXIT(IRDA_SERV_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_disconnect_server (self, skb)
+ *
+ *    Cleanup the server socket when the incoming connection abort
+ *
+ */
+static inline void
+irnet_disconnect_server(irnet_socket *	self,
+			struct sk_buff *skb)
+{
+  /* self : the server socket that received the connection request
+   * skb : packet that came with the request, released here */
+  DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self);
+
+  /* Put the received packet in the black hole */
+  kfree_skb(skb);
+
+#ifdef FAIL_SEND_DISCONNECT
+  /* Tell the other party we don't want to be connected */
+  /* Hum... Is it the right thing to do ? And do we need to send
+   * a connect response before ? It looks ok without this... */
+  irttp_disconnect_request(self->tsap, NULL, P_NORMAL);
+#endif /* FAIL_SEND_DISCONNECT */
+
+  /* Notify the control channel (see irnet_find_socket()) */
+  /* Note : saddr, daddr and rname are the ones irnet_find_socket() has
+   * stored in the server socket for this request */
+  irnet_post_event(NULL, IRNET_REQUEST_FROM,
+		   self->saddr, self->daddr, self->rname, 0);
+
+  /* Clean up the server to keep it in listen state */
+  irttp_listen(self->tsap);
+
+  DEXIT(IRDA_SERV_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_setup_server ()
+ *
+ *    Create a IrTTP server and set it up...
+ *
+ * Register the IrLAN hint bit, create a IrTTP instance for us,
+ * set all the IrTTP callbacks and create an IrIAS entry...
+ */
+static inline int
+irnet_setup_server(void)
+{
+  __u16		hints;
+
+  /* Works on the global irnet_server.
+   * Return : always 0.
+   * NOTE(review) : the results of irda_irnet_create(), irnet_open_tsap()
+   * and irias_new_object() are not checked here. */
+  DENTER(IRDA_SERV_TRACE, "()\n");
+
+  /* Initialise the regular socket part of the server */
+  irda_irnet_create(&irnet_server.s);
+
+  /* Open a local TSAP (an IrTTP instance) for the server */
+  irnet_open_tsap(&irnet_server.s);
+
+  /* PPP part setup */
+  /* Note : the server socket has no PPP channel and no file attached */
+  irnet_server.s.ppp_open = 0;
+  irnet_server.s.chan.private = NULL;
+  irnet_server.s.file = NULL;
+
+  /* Get the hint bit corresponding to IrLAN */
+  /* Note : we overload the IrLAN hint bit. As it is only a "hint", and as
+   * we provide roughly the same functionality as IrLAN, this is ok.
+   * In fact, the situation is similar as JetSend overloading the Obex hint
+   */
+  hints = irlmp_service_to_hint(S_LAN);
+
+#ifdef ADVERTISE_HINT
+  /* Register with IrLMP as a service (advertise our hint bit) */
+  irnet_server.skey = irlmp_register_service(hints);
+#endif /* ADVERTISE_HINT */
+
+  /* Register with LM-IAS (so that people can connect to us) */
+  /* Note : the attribute published is the source TSAP selector of the
+   * server socket */
+  irnet_server.ias_obj = irias_new_object(IRNET_SERVICE_NAME, jiffies);
+  irias_add_integer_attrib(irnet_server.ias_obj, IRNET_IAS_VALUE,
+			   irnet_server.s.stsap_sel, IAS_KERNEL_ATTR);
+  irias_insert_object(irnet_server.ias_obj);
+
+#ifdef DISCOVERY_EVENTS
+  /* Tell IrLMP we want to be notified of newly discovered nodes */
+  irlmp_update_client(irnet_server.s.ckey, hints,
+		      irnet_discovery_indication, irnet_expiry_indication,
+		      (void *) &irnet_server.s);
+#endif
+
+  DEXIT(IRDA_SERV_TRACE, " - self=0x%p\n", &irnet_server.s);
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_destroy_server ()
+ *
+ *    Destroy the IrTTP server...
+ *
+ * Reverse of the previous function...
+ */
+static inline void
+irnet_destroy_server(void)
+{
+  /* Works on the global irnet_server, undoing the registrations first
+   * and the socket part last */
+  DENTER(IRDA_SERV_TRACE, "()\n");
+
+#ifdef ADVERTISE_HINT
+  /* Unregister with IrLMP */
+  irlmp_unregister_service(irnet_server.skey);
+#endif /* ADVERTISE_HINT */
+
+  /* Unregister with LM-IAS */
+  /* Note : ias_obj may be missing, so check before deleting it */
+  if(irnet_server.ias_obj)
+    irias_delete_object(irnet_server.ias_obj);
+
+  /* Cleanup the socket part */
+  irda_irnet_destroy(&irnet_server.s);
+
+  DEXIT(IRDA_SERV_TRACE, "\n");
+}
+
+
+/************************ IRDA-TTP CALLBACKS ************************/
+/*
+ * When we create a IrTTP instance, we pass to it a set of callbacks
+ * that IrTTP will call in case of various events.
+ * We take care of those events here.
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_data_indication (instance, sap, skb)
+ *
+ *    Received some data from TinyTP. Just queue it on the receive queue
+ *
+ */
+static int
+irnet_data_indication(void *	instance,
+		      void *	sap,
+		      struct sk_buff *skb)
+{
+  irnet_socket *	ap = (irnet_socket *) instance;
+  unsigned char *	p;
+  int			code = 0;
+
+  /* instance : the IrNET socket the data is for
+   * sap : not used in this function
+   * skb : the received packet
+   * Return : 0 when the packet was consumed (given to PPP or dropped),
+   *	       -ENOMEM when PPP is not ready (the packet is not released
+   *	       here in that case, see below). */
+  DENTER(IRDA_TCB_TRACE, "(self/ap=0x%p, skb=0x%p)\n",
+	 ap, skb);
+  DASSERT(skb != NULL, 0, IRDA_CB_ERROR, "skb is NULL !!!\n");
+
+  /* Check is ppp is ready to receive our packet */
+  if(!ap->ppp_open)
+    {
+      DERROR(IRDA_CB_ERROR, "PPP not ready, dropping packet...\n");
+      /* When we return error, TTP will need to requeue the skb and
+       * will stop the sender. IrTTP will stall until we send it a
+       * flow control request... */
+      return -ENOMEM;
+    }
+
+  /* strip address/control field if present */
+  /* Note : only look at the second byte if the packet really has one,
+   * a shorter packet can't carry the address/control field anyway */
+  p = skb->data;
+  if((skb->len >= 2) && (p[0] == PPP_ALLSTATIONS) && (p[1] == PPP_UI))
+    {
+      /* chop off address/control */
+      if(skb->len < 3)
+	goto err_exit;
+      p = skb_pull(skb, 2);
+    }
+
+  /* An empty packet has no protocol field for us to look at */
+  if(skb->len < 1)
+    goto err_exit;
+
+  /* decompress protocol field if compressed */
+  if(p[0] & 1)
+    {
+      /* protocol is compressed */
+      /* Put back the leading zero byte of the protocol field */
+      skb_push(skb, 1)[0] = 0;
+    }
+  else
+    if(skb->len < 2)
+      goto err_exit;
+
+  /* pass to generic ppp layer */
+  /* Note : how do I know if ppp can accept or not the packet ? This is
+   * essential if I want to manage flow control smoothly... */
+  ppp_input(&ap->chan, skb);
+
+  DEXIT(IRDA_TCB_TRACE, "\n");
+  return 0;
+
+ err_exit:
+  DERROR(IRDA_CB_ERROR, "Packet too small, dropping...\n");
+  kfree_skb(skb);
+  ppp_input_error(&ap->chan, code);
+  return 0;	/* Don't return an error code, only for flow control... */
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    Connection has been closed. Check reason to find out why
+ *
+ * Note : there are many cases where we come here :
+ *	o attempted to connect, timeout
+ *	o connected, link is broken, LAP has timeout
+ *	o connected, other side close the link
+ *	o connection request on the server not handled
+ */
+static void
+irnet_disconnect_indication(void *	instance,
+			    void *	sap,
+			    LM_REASON	reason,
+			    struct sk_buff *skb)
+{
+  irnet_socket *	self = (irnet_socket *) instance;
+  int			test_open;	/* Was ttp_open set ? */
+  int			test_connect;	/* Was ttp_connect set ? */
+
+  /* instance : the IrNET socket whose connection went away
+   * sap, reason : not used in this function
+   * skb : optional packet, released here */
+  DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self);
+  DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n");
+
+  /* Don't care about it, but let's not leak it */
+  if(skb)
+    dev_kfree_skb(skb);
+
+  /* Prevent higher layer from accessing IrTTP */
+  test_open = test_and_clear_bit(0, &self->ttp_open);
+  /* Not connecting anymore...
+   * (note : TSAP is open, so IAP callbacks are no longer pending...) */
+  test_connect = test_and_clear_bit(0, &self->ttp_connect);
+
+  /* If both self->ttp_open and self->ttp_connect were already clear, it
+   * means that we have a race condition with irda_irnet_destroy() or
+   * irnet_connect_indication(), so don't mess up tsap...
+   */
+  if(!(test_open || test_connect))
+    {
+      DERROR(IRDA_CB_ERROR, "Race condition detected...\n");
+      return;
+    }
+
+  /* If we were active, notify the control channel */
+  if(test_open)
+    irnet_post_event(self, IRNET_DISCONNECT_FROM,
+		     self->saddr, self->daddr, self->rname, 0);
+  else
+    /* If we were trying to connect, notify the control channel */
+    if((self->tsap) && (self != &irnet_server.s))
+      irnet_post_event(self, IRNET_NOANSWER_FROM,
+		       self->saddr, self->daddr, self->rname, 0);
+
+  /* Close our IrTTP connection, cleanup tsap */
+  /* Note : never done for the server socket, its tsap is left alone */
+  if((self->tsap) && (self != &irnet_server.s))
+    {
+      DEBUG(IRDA_CB_INFO, "Closing our TTP connection.\n");
+      irttp_close_tsap(self->tsap);
+      self->tsap = NULL;
+    }
+  /* Cleanup the socket in case we want to reconnect in ppp_output_wakeup() */
+  self->stsap_sel = 0;
+  self->daddr = DEV_ADDR_ANY;
+  self->tx_flow = FLOW_START;
+
+  /* Deal with the ppp instance if it's still alive */
+  if(self->ppp_open)
+    {
+      if(test_open)
+	{
+	  /* ppp_unregister_channel() wants a user context. */
+	  /* So hand it over to the work item set up in irda_irnet_create() */
+	  schedule_work(&self->disconnect_work);
+	}
+      else
+	{
+	  /* If we were trying to connect, flush (drain) ppp_generic
+	   * Tx queue (most often we have blocked it), which will
+	   * trigger an other attempt to connect. If we are passive,
+	   * this will empty the Tx queue after last try. */
+	  ppp_output_wakeup(&self->chan);
+	}
+    }
+
+  DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_confirm (instance, sap, qos, max_sdu_size,
+ *				   max_header_size, skb)
+ *
+ *    Connection has been confirmed by the remote device
+ *
+ */
+static void
+irnet_connect_confirm(void *	instance,
+		      void *	sap,
+		      struct qos_info *qos,
+		      __u32	max_sdu_size,
+		      __u8	max_header_size,
+		      struct sk_buff *skb)
+{
+  irnet_socket *	self = (irnet_socket *) instance;
+
+  /* instance : the IrNET socket that requested the connection
+   * sap : only passed along to irnet_data_indication()
+   * qos : not used in this function
+   * max_sdu_size, max_header_size : values recorded in the socket
+   * skb : packet that came with the confirm. It is ours : every path
+   *	    below either releases it or passes it on. */
+  DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self);
+
+  /* Check if socket is closing down (via irda_irnet_destroy()) */
+  if(! test_bit(0, &self->ttp_connect))
+    {
+      DERROR(IRDA_CB_ERROR, "Socket no longer connecting. Ouch !\n");
+      /* We won't use the packet, but let's not leak it */
+      kfree_skb(skb);
+      return;
+    }
+
+  /* How much header space do we need to reserve */
+  self->max_header_size = max_header_size;
+
+  /* IrTTP max SDU size in transmit direction */
+  self->max_sdu_size_tx = max_sdu_size;
+  self->max_data_size = max_sdu_size;
+#ifdef STREAM_COMPAT
+  if(max_sdu_size == 0)
+    self->max_data_size = irttp_get_max_seg_size(self->tsap);
+#endif /* STREAM_COMPAT */
+
+  /* At this point, IrLMP has assigned our source address */
+  self->saddr = irttp_get_saddr(self->tsap);
+
+  /* Allow higher layer to access IrTTP */
+  set_bit(0, &self->ttp_open);
+  clear_bit(0, &self->ttp_connect);	/* Not racy, IrDA traffic is serial */
+  /* Give a kick in the ass of ppp_generic so that he sends us some data */
+  ppp_output_wakeup(&self->chan);
+
+  /* Check size of received packet */
+  if(skb->len > 0)
+    {
+#ifdef PASS_CONNECT_PACKETS
+      DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n");
+      /* Try to pass it to PPP */
+      irnet_data_indication(instance, sap, skb);
+#else /* PASS_CONNECT_PACKETS */
+      DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n");
+      kfree_skb(skb);	/* Note : will be optimised with other kfree... */
+#endif /* PASS_CONNECT_PACKETS */
+    }
+  else
+    kfree_skb(skb);
+
+  /* Notify the control channel */
+  irnet_post_event(self, IRNET_CONNECT_TO,
+		   self->saddr, self->daddr, self->rname, 0);
+
+  DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_flow_indication (instance, sap, flow)
+ *
+ *    Used by TinyTP to tell us if it can accept more data or not
+ *
+ */
+static void
+irnet_flow_indication(void *	instance,
+		      void *	sap,
+		      LOCAL_FLOW flow)
+{
+  irnet_socket *	self = (irnet_socket *) instance;
+  LOCAL_FLOW		previous;	/* Flow state before this call */
+
+  DENTER(IRDA_TCB_TRACE, "(self=0x%p, flow=%d)\n", self, flow);
+
+  /* Remember where we come from, then record the new state */
+  previous = self->tx_flow;
+  self->tx_flow = flow;
+
+  /* Now, see what IrTTP is asking for... */
+  if(flow == FLOW_START)
+    {
+      DEBUG(IRDA_CB_INFO, "IrTTP wants us to start again\n");
+      /* PPP only needs a kick if we were really stopped */
+      if(previous != FLOW_STOP)
+	{
+	  DEBUG(IRDA_CB_INFO, "But we were already transmitting !!!\n");
+	}
+      else
+	{
+	  ppp_output_wakeup(&self->chan);
+	}
+    }
+  else if(flow == FLOW_STOP)
+    {
+      /* Nothing to do, the new state is already recorded */
+      DEBUG(IRDA_CB_INFO, "IrTTP wants us to slow down\n");
+    }
+  else
+    {
+      DEBUG(IRDA_CB_INFO, "Unknown flow command!\n");
+    }
+
+  DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_status_indication (instance, link, lock)
+ *
+ *    Link (IrLAP) status report.
+ *
+ */
+static void
+irnet_status_indication(void *	instance,
+			LINK_STATUS link,
+			LOCK_STATUS lock)
+{
+  irnet_socket *	self = (irnet_socket *) instance;
+
+  DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self);
+  DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n");
+
+  /* We can only get this event if we are connected.
+   * The only status passed on to the control channel is the link
+   * going quiet, anything else is just logged. */
+  if(link == STATUS_NO_ACTIVITY)
+    {
+      irnet_post_event(self, IRNET_BLOCKED_LINK,
+		       self->saddr, self->daddr, self->rname, 0);
+    }
+  else
+    {
+      DEBUG(IRDA_CB_INFO, "Unknown status...\n");
+    }
+
+  DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_connect_indication(instance, sap, qos, max_sdu_size,
+ *				     max_header_size, skb)
+ *
+ *    Incoming connection
+ *
+ * In theory, this function is called only on the server socket.
+ * Some other node is attempting to connect to the IrNET service, and has
+ * sent a connection request on our server socket.
+ * We just redirect the connection to the relevant IrNET socket.
+ * Every exit path hands "skb" over (disconnect, PPP or kfree_skb()).
+ * Note : between 2 irnet nodes, there can exist only one irnet connection.
+ */
+static void
+irnet_connect_indication(void *		instance,
+			 void *		sap,
+			 struct qos_info *qos,
+			 __u32		max_sdu_size,
+			 __u8		max_header_size,
+			 struct sk_buff *skb)
+{
+  irnet_socket *	server = &irnet_server.s;
+  irnet_socket *	new = (irnet_socket *) NULL;
+
+  DENTER(IRDA_TCB_TRACE, "(server=0x%p)\n", server);
+  DASSERT(instance == &irnet_server, , IRDA_CB_ERROR,
+	  "Invalid instance (0x%p) !!!\n", instance);
+  DASSERT(sap == irnet_server.s.tsap, , IRDA_CB_ERROR, "Invalid sap !!!\n");
+
+  /* Try to find the most appropriate IrNET socket */
+  new = irnet_find_socket(server);
+
+  /* After all this hard work, do we have a socket ? */
+  if(new == (irnet_socket *) NULL)
+    {
+      DEXIT(IRDA_CB_INFO, ": No socket waiting for this connection.\n");
+      irnet_disconnect_server(server, skb);
+      return;
+    }
+
+  /* Is the socket already busy ? */
+  if(test_bit(0, &new->ttp_open))
+    {
+      DEXIT(IRDA_CB_INFO, ": Socket already connected.\n");
+      irnet_disconnect_server(server, skb);
+      return;
+    }
+
+  /* The following code is a bit tricky, so it needs comments ;-)
+   */
+  /* If ttp_connect is set, the socket is trying to connect to the other
+   * end and may have sent an IrTTP connection request and is waiting for
+   * a connection response (that may never come).
+   * Now, the pain is that the socket may have opened a tsap and is
+   * waiting on it, while the other end is trying to connect to it on
+   * another tsap.
+   * Because IrNET can be peer to peer, we need to work around this.
+   * Furthermore, the way the irnetd script is implemented, the
+   * target will create a second IrNET connection back to the
+   * originator and expect the originator to bind this new connection
+   * to the original PPPD instance.
+   * And of course, if we don't use irnetd, we can have a race when
+   * both sides try to connect simultaneously, which could leave both
+   * connections half closed (yuck).
+   * Conclusions :
+   *	1) The "originator" must accept the new connection and get rid
+   *	   of the old one so that irnetd works
+   *	2) One side must deny the new connection to avoid races,
+   *	   but both sides must agree on which side it is...
+   * Most often, the originator is primary at the LAP layer.
+   * Jean II
+   */
+  /* Now, let's look at the way I wrote the test...
+   * We need to clear up the ttp_connect flag atomically to prevent
+   * irnet_disconnect_indication() from messing up the tsap we are going
+   * to close. We want to clear the ttp_connect flag only if we close the
+   * tsap, otherwise we will never close it, so we need to check for primary
+   * *before* doing the test on the flag.
+   * Without ALLOW_SIMULT_CONNECT, this test is compiled out entirely...
+   * Jean II
+   */
+
+  /* Socket already connecting ? On primary ? (constant 0 when compiled out) */
+  if(0
+#ifdef ALLOW_SIMULT_CONNECT
+     || ((irttp_is_primary(server->tsap) == 1) &&	/* primary */
+	 (test_and_clear_bit(0, &new->ttp_connect)))
+#endif /* ALLOW_SIMULT_CONNECT */
+     )
+    {
+      DERROR(IRDA_CB_ERROR, "Socket already connecting, but going to reuse it !\n");
+
+      /* Cleanup the old TSAP if necessary - IrIAP will be cleaned up later */
+      if(new->tsap != NULL)
+	{
+	  /* Close the old connection the new socket was attempting,
+	   * so that we can hook it up to the new connection.
+	   * It's now safe to do it... */
+	  irttp_close_tsap(new->tsap);
+	  new->tsap = NULL;
+	}
+    }
+  else
+    {
+      /* Three options :
+       * 1) socket was not connecting or connected : ttp_connect should be 0.
+       * 2) we don't want to connect the socket because we are secondary or
+       * ALLOW_SIMULT_CONNECT is undefined. ttp_connect should be 1.
+       * 3) we are half way in irnet_disconnect_indication(), and it's a
+       * nice race condition... Fortunately, we can detect that by checking
+       * if tsap is still alive. On the other hand, we can't be in
+       * irda_irnet_destroy() otherwise we would not have found this
+       * socket in the hashbin.
+       * Jean II */
+      if((test_bit(0, &new->ttp_connect)) || (new->tsap != NULL))
+	{
+	  /* Don't mess with this socket, somebody else is in charge... */
+	  DERROR(IRDA_CB_ERROR, "Race condition detected, socket in use, abort connect...\n");
+	  irnet_disconnect_server(server, skb);
+	  return;
+	}
+    }
+
+  /* So : at this point, we have a socket, and it is idle. Good ! */
+  irnet_connect_socket(server, new, qos, max_sdu_size, max_header_size);
+
+  /* Check size of received packet */
+  if(skb->len > 0)
+    {
+#ifdef PASS_CONNECT_PACKETS
+      DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n");
+      /* Try to pass it to PPP */
+      irnet_data_indication(new, new->tsap, skb);
+#else /* PASS_CONNECT_PACKETS */
+      DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n");
+      kfree_skb(skb);	/* Note : will be optimised with other kfree... */
+#endif /* PASS_CONNECT_PACKETS */
+    }
+  else
+    kfree_skb(skb);
+
+  DEXIT(IRDA_TCB_TRACE, "\n");
+}
+
+
+/********************** IRDA-IAS/LMP CALLBACKS **********************/
+/*
+ * These are the callbacks called by other layers of the IrDA stack,
+ * mainly LMP for discovery and IAS for name queries.
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_getvalue_confirm (result, obj_id, value, priv)
+ *
+ *    IAS reply for a known peer : extract the TSAP and connect
+ *
+ * Called with the answer to the IAS query issued to find the TSAP
+ * of the device we want to connect to ("priv" is our socket).
+ * The query handle is released, and if the reply gives us a valid
+ * TSAP the TTP connection is started on it.
+ */
+static void
+irnet_getvalue_confirm(int	result,
+		       __u16	obj_id,
+		       struct ias_value *value,
+		       void *	priv)
+{
+  irnet_socket *	ap = (irnet_socket *) priv;
+
+  DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", ap);
+  DASSERT(ap != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n");
+
+  /* ttp_connect is cleared when the socket got connected in the meantime
+   * (irnet_connect_socket()) or is going away (irda_irnet_destroy()) */
+  if(test_bit(0, &ap->ttp_connect) == 0)
+    {
+      DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n");
+      return;
+    }
+
+  /* One answer is all we need : release the IAS query handle */
+  iriap_close(ap->iriap);
+  ap->iriap = NULL;
+
+  /* Turn the IAS reply into a TSAP (presumably updates ap->errno) */
+  ap->dtsap_sel = irnet_ias_to_tsap(ap, result, value);
+
+  if(ap->errno != 0)
+    {
+      /* No usable TSAP : abort this connection attempt */
+      clear_bit(0, &ap->ttp_connect);
+      DERROR(IRDA_OCB_ERROR, "IAS connect failed ! (0x%X)\n", ap->errno);
+    }
+  else
+    {
+      DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n",
+	    ap->daddr, ap->dtsap_sel);
+
+      /* Start up TTP - non blocking */
+      irnet_connect_tsap(ap);
+      DEXIT(IRDA_OCB_TRACE, "\n");
+    }
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discovervalue_confirm (result, obj_id, value, priv)
+ *
+ *    Handle the TSAP discovery procedure state machine.
+ *    Got answer from remote LM-IAS, try next device
+ *
+ * We are doing a TSAP discovery procedure, and we got an answer to
+ * an IAS query we were doing to find the TSAP on one of the addresses
+ * in the discovery log.
+ *
+ * If we have found a valid TSAP for the first time, save it. If it's
+ * not the first time we found one, complain.
+ *
+ * If we have more addresses in the log, just initiate a new query.
+ * Note that those queries may fail (see irnet_discover_daddr_and_lsap_sel())
+ *
+ * Otherwise, wrap up the procedure (cleanup), check if we have found
+ * any device and connect to it.
+ */
+static void
+irnet_discovervalue_confirm(int		result,
+			    __u16	obj_id,
+			    struct ias_value *value,
+			    void *	priv)
+{
+  irnet_socket *	self = (irnet_socket *) priv;
+  __u8			dtsap_sel;		/* TSAP we are looking for */
+
+  DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self);
+  DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n");
+
+  /* Check if already connected (via irnet_connect_socket())
+   * or socket is closing down (via irda_irnet_destroy()) */
+  if(! test_bit(0, &self->ttp_connect))
+    {
+      DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n");
+      return;
+    }
+
+  /* Post process the IAS reply (kept local until we know it is the first) */
+  dtsap_sel = irnet_ias_to_tsap(self, result, value);
+
+  /* Have we got something ? */
+  if(self->errno == 0)
+    {
+      /* We found the requested service */
+      if(self->daddr != DEV_ADDR_ANY)
+	{
+	  DERROR(IRDA_OCB_ERROR, "More than one device in range supports IrNET...\n");
+	}
+      else
+	{
+	  /* First time we found that one, save it ! */
+	  self->daddr = self->discoveries[self->disco_index].daddr;
+	  self->dtsap_sel = dtsap_sel;
+	}
+    }
+
+  /* If no failure (-EADDRNOTAVAIL is tolerated, we just try the next node) */
+  if((self->errno == -EADDRNOTAVAIL) || (self->errno == 0))
+    {
+      int	ret;
+
+      /* Search the next node */
+      ret = irnet_discover_next_daddr(self);
+      if(!ret)
+	{
+	  /* In this case, the above request was non-blocking.
+	   * We will return here after a while... */
+	  return;
+	}
+      /* In this case, we have processed the last discovery item */
+    }
+
+  /* No more queries to be done (failure or last one) */
+
+  /* We probably don't need to make any more queries */
+  iriap_close(self->iriap);
+  self->iriap = NULL;
+
+  /* No more items : remove the log and signal termination */
+  DEBUG(IRDA_OCB_INFO, "Cleaning up log (0x%p)\n",
+	self->discoveries);
+  if(self->discoveries != NULL)
+    {
+      /* Cleanup our copy of the discovery log */
+      kfree(self->discoveries);
+      self->discoveries = NULL;
+    }
+  self->disco_number = -1;
+
+  /* Check out what we found */
+  if(self->daddr == DEV_ADDR_ANY)
+    {
+      self->daddr = DEV_ADDR_ANY;	/* (redundant : already the case) */
+      clear_bit(0, &self->ttp_connect);
+      DEXIT(IRDA_OCB_TRACE, ": cannot discover IrNET in any device !!!\n");
+      return;
+    }
+
+  /* We have a valid address - just connect */
+
+  DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n",
+	self->daddr, self->dtsap_sel);
+
+  /* Start up TTP - non blocking */
+  irnet_connect_tsap(self);
+
+  DEXIT(IRDA_OCB_TRACE, "\n");
+}
+
+#ifdef DISCOVERY_EVENTS
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_discovery_indication (discovery, mode, priv)
+ *
+ *    Got a discovery indication from IrLMP, post an event
+ *
+ * Note : IrLMP takes care of matching the hint mask for us, and also
+ * checks if it is a "new" node for us...
+ *
+ * As IrLMP filters on the IrLAN hint bit, we get both IrLAN and IrNET
+ * nodes, so it's only at connection time that we will know if the
+ * node supports IrNET, IrLAN or both. The other solution is to check
+ * in IAS the PNP ids and service name.
+ * Note : even if a node supports IrNET (or IrLAN), it's no guarantee
+ * that we will be able to connect to it, the node might already be
+ * busy...
+ *
+ * One last thing : in some cases, this function will trigger duplicate
+ * discovery events. On the other hand, we should catch all
+ * discoveries properly (i.e. not miss one). Filtering duplicates here
+ * is too messy, so we leave that to user space...
+ */
+static void
+irnet_discovery_indication(discinfo_t *		discovery,
+			   DISCOVERY_MODE	mode,
+			   void *		priv)
+{
+  irnet_socket *	server = &irnet_server.s;
+  __u16			hints;
+
+  DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", server);
+  DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR,
+	  "Invalid instance (0x%p) !!!\n", priv);
+
+  DEBUG(IRDA_OCB_INFO, "Discovered new IrNET/IrLAN node %s...\n",
+	discovery->info);
+  /* Notify the control channel (the hint bytes are read unaligned) */
+  hints = get_unaligned((__u16 *)discovery->hints);
+  irnet_post_event(NULL, IRNET_DISCOVER,
+		   discovery->saddr, discovery->daddr, discovery->info, hints);
+
+  DEXIT(IRDA_OCB_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_expiry_indication (expiry, mode, priv)
+ *
+ *    Got an expiry indication from IrLMP, post an event
+ *
+ * Note : IrLMP takes care of matching the hint mask for us, nothing
+ * is filtered here, the event is posted as is.
+ */
+static void
+irnet_expiry_indication(discinfo_t *	expiry,
+			DISCOVERY_MODE	mode,
+			void *		priv)
+{
+  irnet_socket *	server = &irnet_server.s;
+  __u16			hints;
+
+  DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", server);
+  DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR,
+	  "Invalid instance (0x%p) !!!\n", priv);
+
+  DEBUG(IRDA_OCB_INFO, "IrNET/IrLAN node %s expired...\n",
+	expiry->info);
+  /* Notify the control channel (the hint bytes are read unaligned) */
+  hints = get_unaligned((__u16 *)expiry->hints);
+  irnet_post_event(NULL, IRNET_EXPIRE,
+		   expiry->saddr, expiry->daddr, expiry->info, hints);
+
+  DEXIT(IRDA_OCB_TRACE, "\n");
+}
+#endif /* DISCOVERY_EVENTS */
+
+
+/*********************** PROC ENTRY CALLBACKS ***********************/
+/*
+ * We create an entry in the /proc filesystem, and here we take care
+ * of that...
+ */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Dump the state of the IrNET server and of every IrNET socket
+ * in the /proc file. Always return 0.
+ */
+static int
+irnet_proc_show(struct seq_file *m, void *v)
+{
+  irnet_socket *	sock;
+  char *		state;
+  int			count = 0;
+
+  /* Start with the IrNET server itself... */
+  seq_printf(m, "IrNET server - ");
+  seq_printf(m, "IrDA state: %s, ",
+	     irnet_server.running ? "running" : "dead");
+  seq_printf(m, "stsap_sel: %02x, ", irnet_server.s.stsap_sel);
+  seq_printf(m, "dtsap_sel: %02x\n", irnet_server.s.dtsap_sel);
+
+  /* A dead server has no socket worth listing */
+  if(irnet_server.running == 0)
+    return 0;
+
+  /* Walk the instance list with the list lock held */
+  spin_lock_bh(&irnet_server.spinlock);
+
+  for(sock = (irnet_socket *) hashbin_get_first(irnet_server.list);
+      sock != NULL;
+      sock = (irnet_socket *) hashbin_get_next(irnet_server.list))
+    {
+      /* Start printing info about the socket. */
+      seq_printf(m, "\nIrNET socket %d - ", count++);
+
+      /* The configuration requested through the control channel */
+      seq_printf(m, "Requested IrDA name: \"%s\", ", sock->rname);
+      seq_printf(m, "daddr: %08x, ", sock->rdaddr);
+      seq_printf(m, "saddr: %08x\n", sock->rsaddr);
+
+      /* The PPP side of the socket */
+      seq_printf(m, "	PPP state: %s",
+		 sock->ppp_open ? "registered" : "unregistered");
+      if(sock->ppp_open)
+	{
+	  seq_printf(m, ", unit: ppp%d", ppp_unit_number(&sock->chan));
+	  seq_printf(m, ", channel: %d", ppp_channel_index(&sock->chan));
+	  seq_printf(m, ", mru: %d", sock->mru);
+	  /* Maybe add sock->flags ? Later... */
+	}
+
+      /* The IrDA side : the first condition that matches wins */
+      if(sock->ttp_open)
+	state = "connected";
+      else if(sock->tsap != NULL)
+	state = "connecting";
+      else if(sock->iriap != NULL)
+	state = "searching";
+      else if(sock->ttp_connect)
+	state = "weird";
+      else
+	state = "idle";
+      seq_printf(m, "\n	IrDA state: %s, ", state);
+      seq_printf(m, "daddr: %08x, ", sock->daddr);
+      seq_printf(m, "stsap_sel: %02x, ", sock->stsap_sel);
+      seq_printf(m, "dtsap_sel: %02x\n", sock->dtsap_sel);
+    }
+
+  spin_unlock_bh(&irnet_server.spinlock);
+
+  return 0;
+}
+
+/*
+ * Open the /proc file : a single seq_file record made by irnet_proc_show()
+ */
+static int irnet_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irnet_proc_show, NULL);
+}
+
+/*
+ * File operations of the /proc file, the seq_file helpers do the job
+ */
+static const struct file_operations irnet_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= irnet_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* PROC_FS */
+
+
+/********************** CONFIGURATION/CLEANUP **********************/
+/*
+ * Initialisation and teardown of the IrDA part, called at module
+ * insertion and removal...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Prepare the IrNET layer for operation : instance list, control
+ * channel, /proc entry and server socket.
+ * Return -ENOMEM without a hashbin, else what irnet_setup_server() says.
+ */
+int __init
+irda_irnet_init(void)
+{
+  int		err;
+
+  DENTER(MODULE_TRACE, "()\n");
+
+  /* Pure paranoia - should be redundant */
+  memset(&irnet_server, 0, sizeof(irnet_server));
+
+  /* Instance list : an unlocked hashbin, guarded by our own spinlock */
+  irnet_server.list = hashbin_new(HB_NOLOCK);
+  DABORT(irnet_server.list == NULL, -ENOMEM,
+	 MODULE_ERROR, "Can't allocate hashbin!\n");
+  spin_lock_init(&irnet_server.spinlock);
+
+  /* Control channel : wait queue, event index and logging spinlock */
+  init_waitqueue_head(&irnet_events.rwait);
+  irnet_events.index = 0;
+  spin_lock_init(&irnet_events.spinlock);
+
+#ifdef CONFIG_PROC_FS
+  /* Add a /proc file for irnet infos */
+  proc_create("irnet", 0, proc_irda, &irnet_proc_fops);
+#endif /* CONFIG_PROC_FS */
+
+  /* Setup the IrNET server */
+  err = irnet_setup_server();
+
+  /* We are operational only if the server could be set up */
+  if(err == 0)
+    irnet_server.running = 1;
+
+  DEXIT(MODULE_TRACE, "\n");
+  return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Cleanup at exit : running flag, /proc file, server, then the sockets
+ */
+void __exit
+irda_irnet_cleanup(void)
+{
+  DENTER(MODULE_TRACE, "()\n");
+
+  /* We are no longer there (irnet_proc_show() stops listing sockets) */
+  irnet_server.running = 0;
+
+#ifdef CONFIG_PROC_FS
+  /* Remove our /proc file */
+  remove_proc_entry("irnet", proc_irda);
+#endif /* CONFIG_PROC_FS */
+
+  /* Remove our IrNET server from existence */
+  irnet_destroy_server();
+
+  /* Remove all IrNET sockets still present (via irda_irnet_destroy()) */
+  hashbin_delete(irnet_server.list, (FREE_FUNC) irda_irnet_destroy);
+
+  DEXIT(MODULE_TRACE, "\n");
+}
diff --git a/net/irda/irnet/irnet_irda.h b/net/irda/irnet/irnet_irda.h
new file mode 100644
index 00000000..3e408952
--- /dev/null
+++ b/net/irda/irnet/irnet_irda.h
@@ -0,0 +1,178 @@
+/*
+ *	IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ *		Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file contains all definitions and declarations necessary for the
+ * IRDA part of the IrNET module (dealing with IrTTP, IrIAS and co).
+ * This file is a private header, so other modules don't want to know
+ * what's in there...
+ */
+
+#ifndef IRNET_IRDA_H
+#define IRNET_IRDA_H
+
+/***************************** INCLUDES *****************************/
+/* Please add other headers in irnet.h */
+
+#include "irnet.h"		/* Module global include */
+
+/************************ CONSTANTS & MACROS ************************/
+
+/*
+ * Name of the service (socket name) used by IrNET
+ */
+/* IAS object name (or part of it) */
+#define IRNET_SERVICE_NAME	"IrNetv1"
+/* IAS attribute */
+#define IRNET_IAS_VALUE		"IrDA:TinyTP:LsapSel"
+/* LMP notify name for client (only for /proc/net/irda/irlmp) */
+#define IRNET_NOTIFY_NAME	"IrNET socket"
+/* LMP notify name for server (only for /proc/net/irda/irlmp) */
+#define IRNET_NOTIFY_NAME_SERV	"IrNET server"
+
+/****************************** TYPES ******************************/
+
+/*
+ * This is the main structure where we store all the data pertaining to
+ * the IrNET server (listens for connection requests) and the root
+ * of the IrNET socket list
+ */
+typedef struct irnet_root
+{
+  irnet_socket		s;		/* To pretend we are a client... */
+  /* NOTE(review): presumably has to stay first, callbacks get &irnet_server */
+  /* Generic stuff */
+  int			magic;		/* Paranoia */
+  int			running;	/* Are we operational ? */
+
+  /* Linked list of all IrNET instances opened */
+  hashbin_t *		list;
+  spinlock_t		spinlock;	/* Serialize access to the list */
+  /* Note : the way hashbin has been designed is absolutely not
+   * reentrant, beware... So, we blindly protect all with spinlock */
+
+  /* Handle for the hint bit advertised in IrLMP */
+  void *		skey;
+
+  /* Server socket part */
+  struct ias_object *	ias_obj;	/* Our service name + lsap in IAS */
+
+} irnet_root;
+
+
+/**************************** PROTOTYPES ****************************/
+
+/* ----------------------- CONTROL CHANNEL ----------------------- */
+static void
+	irnet_post_event(irnet_socket *,
+			 irnet_event,
+			 __u32,
+			 __u32,
+			 char *,
+			 __u16);
+/* ----------------------- IRDA SUBROUTINES ----------------------- */
+static inline int
+	irnet_open_tsap(irnet_socket *);
+static inline __u8
+	irnet_ias_to_tsap(irnet_socket *,
+			  int,
+			  struct ias_value *);
+static inline int
+	irnet_find_lsap_sel(irnet_socket *);
+static inline int
+	irnet_connect_tsap(irnet_socket *);
+static inline int
+	irnet_discover_next_daddr(irnet_socket *);
+static inline int
+	irnet_discover_daddr_and_lsap_sel(irnet_socket *);
+static inline int
+	irnet_dname_to_daddr(irnet_socket *);
+/* ------------------------ SERVER SOCKET ------------------------ */
+static inline int
+	irnet_daddr_to_dname(irnet_socket *);
+static inline irnet_socket *
+	irnet_find_socket(irnet_socket *);
+static inline int
+	irnet_connect_socket(irnet_socket *,
+			     irnet_socket *,
+			     struct qos_info *,
+			     __u32,
+			     __u8);
+static inline void
+	irnet_disconnect_server(irnet_socket *,
+				struct sk_buff *);
+static inline int
+	irnet_setup_server(void);
+static inline void
+	irnet_destroy_server(void);
+/* ---------------------- IRDA-TTP CALLBACKS ---------------------- */
+static int
+	irnet_data_indication(void *,		/* instance */
+			      void *,		/* sap */
+			      struct sk_buff *);
+static void
+	irnet_disconnect_indication(void *,
+				    void *,
+				    LM_REASON,
+				    struct sk_buff *);
+static void
+	irnet_connect_confirm(void *,
+			      void *,
+			      struct qos_info *,
+			      __u32,
+			      __u8,
+			      struct sk_buff *);
+static void
+	irnet_flow_indication(void *,
+			      void *,
+			      LOCAL_FLOW);
+static void
+	irnet_status_indication(void *,
+				LINK_STATUS,
+				LOCK_STATUS);
+static void
+	irnet_connect_indication(void *,
+				 void *,
+				 struct qos_info *,
+				 __u32,
+				 __u8,
+				 struct sk_buff *);
+/* -------------------- IRDA-IAS/LMP CALLBACKS -------------------- */
+static void
+	irnet_getvalue_confirm(int,
+			       __u16,
+			       struct ias_value *,
+			       void *);
+static void
+	irnet_discovervalue_confirm(int,
+				    __u16,
+				    struct ias_value *,
+				    void *);
+#ifdef DISCOVERY_EVENTS
+static void
+	irnet_discovery_indication(discinfo_t *,
+				   DISCOVERY_MODE,
+				   void *);
+static void
+	irnet_expiry_indication(discinfo_t *,
+				DISCOVERY_MODE,
+				void *);
+#endif
+
+/**************************** VARIABLES ****************************/
+
+/*
+ * The IrNET server. Listen to connection requests and co...
+ */
+static struct irnet_root	irnet_server;
+
+/* Control channel stuff (note : not static, irnet_ppp.c uses it as well) */
+struct irnet_ctrl_channel	irnet_events;
+
+/* The /proc/net/irda directory, defined elsewhere... */
+#ifdef CONFIG_PROC_FS
+extern struct proc_dir_entry *proc_irda;
+#endif /* CONFIG_PROC_FS */
+
+#endif /* IRNET_IRDA_H */
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
new file mode 100644
index 00000000..2bb2beb6
--- /dev/null
+++ b/net/irda/irnet/irnet_ppp.c
@@ -0,0 +1,1189 @@
+/*
+ *	IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ *		Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file implements the PPP interface and /dev/irnet character device.
+ * The PPP interface hooks to the ppp_generic module, handles all our
+ *	relationship to the PPP code in the kernel (and by extension to pppd),
+ *	and exchanges PPP frames with this module (send/receive).
+ * The /dev/irnet device is used primarily for 2 functions :
+ *	1) as a stub for pppd (the ppp daemon), so that we can appropriately
+ *	generate PPP sessions (we pretend we are a tty).
+ *	2) as a control channel (write commands, read events)
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "irnet_ppp.h"		/* Private header */
+/* Please put other headers in irnet.h - Thanks */
+
+/* Generic PPP callbacks : how ppp_generic hands us frames and ioctls */
+static const struct ppp_channel_ops irnet_ppp_ops = {
+	.start_xmit = ppp_irnet_send,
+	.ioctl = ppp_irnet_ioctl
+};
+
+/************************* CONTROL CHANNEL *************************/
+/*
+ * When a pppd instance is not active on /dev/irnet, it acts as a control
+ * channel.
+ * Writing allows setting up the IrDA destination of the IrNET channel,
+ * and any application may read the events happening in IrNET...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Write is used to send a command to configure a IrNET channel
+ * before it is open by pppd. The syntax is : "command argument",
+ * several commands can be given at once, separated by ','.
+ *	o name : set the requested IrDA nickname of the IrNET peer.
+ *	o addr, daddr : set the requested IrDA address of the IrNET peer.
+ *	o saddr : set the requested IrDA source address.
+ * An argument of "any", or no argument at all, resets the setting.
+ * Return count, or -ENOMEM (too long), -EFAULT or -EINVAL (bad command).
+ */
+static inline ssize_t
+irnet_ctrl_write(irnet_socket *	ap,
+		 const char __user *buf,
+		 size_t		count)
+{
+  char		command[IRNET_MAX_COMMAND];
+  char *	start;		/* Current command being processed */
+  char *	next;		/* Next command to process */
+  int		length;		/* Length of current command */
+
+  DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+
+  /* Check for overflow (we need room for the final '\0') */
+  DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM,
+	 CTRL_ERROR, "Too much data !!!\n");
+
+  /* Get the data in the driver */
+  if(copy_from_user(command, buf, count))
+    {
+      DERROR(CTRL_ERROR, "Invalid user space pointer.\n");
+      return -EFAULT;
+    }
+
+  /* Safe terminate the string */
+  command[count] = '\0';
+  DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n",
+	command, count);
+
+  /* Check every commands in the command line */
+  next = command;
+  while(next != NULL)
+    {
+      /* Look at the next command, scrap whitespaces before it */
+      start = skip_spaces(next);
+
+      /* ',' is our command separator */
+      next = strchr(start, ',');
+      if(next)
+	{
+	  *next = '\0';			/* Terminate command */
+	  length = next - start;	/* Length */
+	  next++;			/* Skip the '\0' */
+	}
+      else
+	length = strlen(start);
+
+      /* Strip out trailing whitespaces (such as the '\n' added by echo)
+       * *before* looking at the argument, so that "any" is recognised */
+      while((length > 0) && isspace(start[length - 1]))
+	length--;
+      start[length] = '\0';
+
+      DEBUG(CTRL_INFO, "Found command ``%s'' (%d).\n", start, length);
+
+      /* Check if we recognised one of the known command
+       * We can't use "switch" with strings, so hack with "continue" */
+
+      /* First command : name -> Requested IrDA nickname */
+      if(!strncmp(start, "name", 4))
+	{
+	  /* Copy the name only if is included and not "any" */
+	  if((length > 5) && (strcmp(start + 5, "any")))
+	    {
+	      DABORT(length > NICKNAME_MAX_LEN + 5,
+		     -EINVAL, CTRL_ERROR, "Invalid nickname.\n");
+
+	      /* Copy the name for later reuse */
+	      memcpy(ap->rname, start + 5, length - 5);
+	      ap->rname[length - 5] = '\0';
+	    }
+	  else
+	    ap->rname[0] = '\0';
+	  DEBUG(CTRL_INFO, "Got rname = ``%s''\n", ap->rname);
+
+	  /* Restart the loop */
+	  continue;
+	}
+
+      /* Second command : addr, daddr -> Requested IrDA destination address
+       * Also process : saddr -> Requested IrDA source address */
+      if((!strncmp(start, "addr", 4)) ||
+	 (!strncmp(start, "daddr", 5)) ||
+	 (!strncmp(start, "saddr", 5)))
+	{
+	  __u32		addr = DEV_ADDR_ANY;
+	  /* The argument starts after the keyword, which is 4 chars
+	   * long for "addr" and 5 chars long for "daddr" and "saddr" */
+	  char *	begp = skip_spaces(start + ((start[0] == 'a') ? 4 : 5));
+
+	  /* Convert the address only if is included and not "any" */
+	  if((*begp != '\0') && (strcmp(begp, "any")))
+	    {
+	      char *	endp;
+
+	      /* Convert argument to a number (last arg is the base) */
+	      addr = simple_strtoul(begp, &endp, 16);
+	      /* Has it worked ? (endp must have moved past one digit) */
+	      DABORT(endp == begp, -EINVAL,
+		     CTRL_ERROR, "Invalid address.\n");
+	    }
+	  /* Which type of address ? */
+	  if(start[0] == 's')
+	    {
+	      /* Save it */
+	      ap->rsaddr = addr;
+	      DEBUG(CTRL_INFO, "Got rsaddr = %08x\n", ap->rsaddr);
+	    }
+	  else
+	    {
+	      /* Save it */
+	      ap->rdaddr = addr;
+	      DEBUG(CTRL_INFO, "Got rdaddr = %08x\n", ap->rdaddr);
+	    }
+
+	  /* Restart the loop */
+	  continue;
+	}
+
+      /* Other possible command : connect N (number of retries) */
+
+      /* No command matched -> Failed... */
+      DABORT(1, -EINVAL, CTRL_ERROR, "Not a recognised IrNET command.\n");
+    }
+
+  /* Success : we have parsed all commands successfully */
+  return count;
+}
+
+#ifdef INITIAL_DISCOVERY
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_get_discovery_log (ap)
+ *
+ *    Fetch a private copy of the IrLMP discovery log
+ *
+ * Called on the first read of the event channel : the log (nodes with
+ * the LAN hint bit) is stored in ap->discoveries, its size in
+ * ap->disco_number (-1 when no log could be obtained).
+ */
+static void
+irnet_get_discovery_log(irnet_socket *	ap)
+{
+  /* Ask IrLMP for the current discovery log, filtered on the LAN hint */
+  ap->discoveries = irlmp_get_discoveries(&ap->disco_number,
+					  irlmp_service_to_hint(S_LAN),
+					  DISCOVERY_DEFAULT_SLOTS);
+
+  /* No log at all : flag that there is nothing to read */
+  if(!ap->discoveries)
+    ap->disco_number = -1;
+
+  DEBUG(CTRL_INFO, "Got the log (0x%p), size is %d\n",
+	ap->discoveries, ap->disco_number);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Function irnet_read_discovery_log (self, event)
+ *
+ *    Read the content on the discovery log
+ *
+ * This function dump the current content of the discovery log
+ * at the startup of the event channel.
+ * Return 1 if wrote an event on the control channel...
+ *
+ * State of the ap->disco_XXX variables :
+ * Socket creation :  discoveries = NULL ; disco_index = 0 ; disco_number = 0
+ * While reading :    discoveries = ptr  ; disco_index = X ; disco_number = Y
+ * After reading :    discoveries = NULL ; disco_index = Y ; disco_number = -1
+ */
+static inline int
+irnet_read_discovery_log(irnet_socket *	ap,
+			 char *		event)
+{
+  int		done_event = 0;
+
+  DENTER(CTRL_TRACE, "(ap=0x%p, event=0x%p)\n",
+	 ap, event);
+
+  /* Test if we have some work to do or we have already finished */
+  if(ap->disco_number == -1)
+    {
+      DEBUG(CTRL_INFO, "Already done\n");
+      return 0;
+    }
+
+  /* Test if it's the first time and therefore we need to get the log */
+  if(ap->discoveries == NULL)
+    irnet_get_discovery_log(ap);
+
+  /* Check if we have more item to dump */
+  if(ap->disco_index < ap->disco_number)
+    {
+      /* Write an event */
+      sprintf(event, "Found %08x (%s) behind %08x {hints %02X-%02X}\n",
+	      ap->discoveries[ap->disco_index].daddr,
+	      ap->discoveries[ap->disco_index].info,
+	      ap->discoveries[ap->disco_index].saddr,
+	      ap->discoveries[ap->disco_index].hints[0],
+	      ap->discoveries[ap->disco_index].hints[1]);
+      DEBUG(CTRL_INFO, "Writing discovery %d : %s\n",
+	    ap->disco_index, ap->discoveries[ap->disco_index].info);
+
+      /* We have an event */
+      done_event = 1;
+      /* Next discovery */
+      ap->disco_index++;
+    }
+
+  /* Check if we have done the last item */
+  if(ap->disco_index >= ap->disco_number)
+    {
+      /* No more items : remove the log and signal termination */
+      DEBUG(CTRL_INFO, "Cleaning up log (0x%p)\n",
+	    ap->discoveries);
+      if(ap->discoveries != NULL)
+	{
+	  /* Cleanup our copy of the discovery log */
+	  kfree(ap->discoveries);
+	  ap->discoveries = NULL;
+	}
+      ap->disco_number = -1;
+    }
+
+  return done_event;
+}
+#endif /* INITIAL_DISCOVERY */
+
+/*------------------------------------------------------------------*/
+/*
+ * Read one pending IrNET event, formatted as a single line of text.
+ */
+static inline ssize_t
+irnet_ctrl_read(irnet_socket *	ap,
+		struct file *	file,
+		char __user *	buf,
+		size_t		count)
+{
+  DECLARE_WAITQUEUE(wait, current);
+  char		event[75];	/* "Discovered..." : 53 fixed + name + NUL */
+  ssize_t	ret = 0;
+
+  DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count);
+
+  /* An event is always delivered whole : refuse buffers that may not fit */
+  DABORT(count < sizeof(event), -EOVERFLOW, CTRL_ERROR, "Buffer to small.\n");
+
+#ifdef INITIAL_DISCOVERY
+  /* Entries of the initial discovery log are delivered before live events */
+  if(irnet_read_discovery_log(ap, event))
+    {
+      /* We have an event !!! Copy it to the user */
+      if(copy_to_user(buf, event, strlen(event)))
+	{
+	  DERROR(CTRL_ERROR, "Invalid user space pointer.\n");
+	  return -EFAULT;
+	}
+
+      DEXIT(CTRL_TRACE, "\n");
+      return strlen(event);
+    }
+#endif /* INITIAL_DISCOVERY */
+
+  /* Put ourselves on the wait queue to be woken up */
+  add_wait_queue(&irnet_events.rwait, &wait);
+  for(;;)
+    {
+      /* Mark us sleeping *before* testing, so that no wake up is lost */
+      set_current_state(TASK_INTERRUPTIBLE);
+      ret = 0;
+      if(ap->event_index != irnet_events.index)
+	break;
+      ret = -EAGAIN;
+      if(file->f_flags & O_NONBLOCK)
+	break;
+      ret = -ERESTARTSYS;
+      if(signal_pending(current))
+	break;
+      /* Yield and wait to be woken up */
+      schedule();
+    }
+  __set_current_state(TASK_RUNNING);
+  remove_wait_queue(&irnet_events.rwait, &wait);
+
+  /* Did we get an event ? */
+  if(ret != 0)
+    {
+      /* No, return the error code */
+      DEXIT(CTRL_TRACE, " - ret %Zd\n", ret);
+      return ret;
+    }
+
+  /* Which event is it ? snprintf() keeps a long name from overflowing */
+  switch(irnet_events.log[ap->event_index].event)
+    {
+    case IRNET_DISCOVER:
+      snprintf(event, sizeof(event), "Discovered %08x (%s) behind %08x {hints %02X-%02X}\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].saddr,
+	      irnet_events.log[ap->event_index].hints.byte[0],
+	      irnet_events.log[ap->event_index].hints.byte[1]);
+      break;
+    case IRNET_EXPIRE:
+      snprintf(event, sizeof(event), "Expired %08x (%s) behind %08x {hints %02X-%02X}\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].saddr,
+	      irnet_events.log[ap->event_index].hints.byte[0],
+	      irnet_events.log[ap->event_index].hints.byte[1]);
+      break;
+    case IRNET_CONNECT_TO:
+      snprintf(event, sizeof(event), "Connected to %08x (%s) on ppp%d\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].unit);
+      break;
+    case IRNET_CONNECT_FROM:
+      snprintf(event, sizeof(event), "Connection from %08x (%s) on ppp%d\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].unit);
+      break;
+    case IRNET_REQUEST_FROM:
+      snprintf(event, sizeof(event), "Request from %08x (%s) behind %08x\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].saddr);
+      break;
+    case IRNET_NOANSWER_FROM:
+      snprintf(event, sizeof(event), "No-answer from %08x (%s) on ppp%d\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].unit);
+      break;
+    case IRNET_BLOCKED_LINK:
+      snprintf(event, sizeof(event), "Blocked link with %08x (%s) on ppp%d\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].unit);
+      break;
+    case IRNET_DISCONNECT_FROM:
+      snprintf(event, sizeof(event), "Disconnection from %08x (%s) on ppp%d\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name,
+	      irnet_events.log[ap->event_index].unit);
+      break;
+    case IRNET_DISCONNECT_TO:
+      snprintf(event, sizeof(event), "Disconnected to %08x (%s)\n",
+	      irnet_events.log[ap->event_index].daddr,
+	      irnet_events.log[ap->event_index].name);
+      break;
+    default:
+      snprintf(event, sizeof(event), "Bug\n");
+    }
+  /* Increment our event index (the log is a ring of IRNET_MAX_EVENTS) */
+  ap->event_index = (ap->event_index + 1) % IRNET_MAX_EVENTS;
+
+  DEBUG(CTRL_INFO, "Event is :%s", event);
+
+  /* Copy it to the user (without the trailing NUL) */
+  if(copy_to_user(buf, event, strlen(event)))
+    {
+      DERROR(CTRL_ERROR, "Invalid user space pointer.\n");
+      return -EFAULT;
+    }
+
+  DEXIT(CTRL_TRACE, "\n");
+  return strlen(event);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Poll : called when someone does a select()/poll() on /dev/irnet.
+ * Always report writable; report readable if something is pending.
+ */
+static inline unsigned int
+irnet_ctrl_poll(irnet_socket *	ap,
+		struct file *	file,
+		poll_table *	wait)
+{
+  unsigned int mask;
+
+  DENTER(CTRL_TRACE, "(ap=0x%p)\n", ap);
+
+  poll_wait(file, &irnet_events.rwait, wait);
+  mask = POLLOUT | POLLWRNORM;
+  /* If there are unread events in the global event log */
+  if(ap->event_index != irnet_events.index)
+    mask |= POLLIN | POLLRDNORM;
+#ifdef INITIAL_DISCOVERY
+  if(ap->disco_number != -1)
+    {
+      /* Test if it's the first time and therefore we need to get the log */
+      if(ap->discoveries == NULL)
+	irnet_get_discovery_log(ap);
+      /* Recheck (presumably the call above may set disco_number to -1) */
+      if(ap->disco_number != -1)
+	mask |= POLLIN | POLLRDNORM;
+    }
+#endif /* INITIAL_DISCOVERY */
+
+  DEXIT(CTRL_TRACE, " - mask=0x%X\n", mask);
+  return mask;
+}
+
+
+/*********************** FILESYSTEM CALLBACKS ***********************/
+/*
+ * Implement the usual open, read, write functions that will be called
+ * by the file system when some action is performed on /dev/irnet.
+ * Most of those actions will in fact be performed by "pppd" or
+ * the control channel, we just act as a redirector...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Open : when somebody opens /dev/irnet
+ * Allocate a new irnet instance, initialise it and hang it on the file.
+ */
+static int
+dev_irnet_open(struct inode *	inode,
+	       struct file *	file)
+{
+  struct irnet_socket *	ap;
+  int			err;
+
+  DENTER(FS_TRACE, "(file=0x%p)\n", file);
+
+#ifdef SECURE_DEVIRNET
+  /* This could (should?) be enforced by the permissions on /dev/irnet. */
+  if(!capable(CAP_NET_ADMIN))
+    return -EPERM;
+#endif /* SECURE_DEVIRNET */
+
+  /* Allocate a zeroed private structure for this IrNET instance */
+  ap = kzalloc(sizeof(*ap), GFP_KERNEL);
+  DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n");
+
+  /* initialize the irnet structure */
+  ap->file = file;
+
+  /* PPP channel setup (registered later, by the TIOCSETD ioctl) */
+  ap->ppp_open = 0;
+  ap->chan.private = ap;
+  ap->chan.ops = &irnet_ppp_ops;
+  ap->chan.mtu = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN);
+  ap->chan.hdrlen = 2 + TTP_MAX_HEADER;		/* for A/C + Max IrDA hdr */
+  /* PPP parameters (same defaults are enforced in ppp_irnet_ioctl()) */
+  ap->mru = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN);
+  ap->xaccm[0] = ~0U;
+  ap->xaccm[3] = 0x60000000U;
+  ap->raccm = ~0U;
+
+  /* Setup the IrDA part... */
+  err = irda_irnet_create(ap);
+  if(err)
+    {
+      DERROR(FS_ERROR, "Can't setup IrDA link...\n");
+      kfree(ap);
+
+      return err;
+    }
+
+  /* For the control channel */
+  ap->event_index = irnet_events.index;	/* Cancel all past events */
+
+  mutex_init(&ap->lock);
+
+  /* Put our stuff where the other file operations will find it */
+  file->private_data = ap;
+
+  DEXIT(FS_TRACE, " - ap=0x%p\n", ap);
+
+  return 0;
+}
+
+
+/*------------------------------------------------------------------*/
+/*
+ * Close : when somebody closes /dev/irnet
+ * Tear down the IrDA and PPP sides, then free the instance.
+ */
+static int
+dev_irnet_close(struct inode *	inode,
+		struct file *	file)
+{
+  irnet_socket *	ap = file->private_data;
+
+  DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
+	 file, ap);
+  DABORT(ap == NULL, 0, FS_ERROR, "ap is NULL !!!\n");
+
+  /* Detach ourselves from the file */
+  file->private_data = NULL;
+
+  /* Close IrDA stuff */
+  irda_irnet_destroy(ap);
+
+  /* Disconnect from the generic PPP layer if not already done */
+  if(ap->ppp_open)
+    {
+      DERROR(FS_ERROR, "Channel still registered - deregistering !\n");
+      ap->ppp_open = 0;
+      ppp_unregister_channel(&ap->chan);
+    }
+
+  kfree(ap);		/* Was allocated in dev_irnet_open() */
+
+  DEXIT(FS_TRACE, "\n");
+  return 0;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Write : before PPP is attached, this is the control channel input.
+ * (we receive packet from ppp_generic through ppp_irnet_send())
+ */
+static ssize_t
+dev_irnet_write(struct file *	file,
+		const char __user *buf,
+		size_t		count,
+		loff_t *	ppos)
+{
+  irnet_socket *	ap = file->private_data;
+
+  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+	file, ap, count);
+  DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
+
+  /* Once registered with ppp_generic, refuse; else it's a control command */
+  if(ap->ppp_open)
+    return -EAGAIN;
+  else
+    return irnet_ctrl_write(ap, buf, count);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Read : before PPP is attached, this returns control channel events.
+ * (pppd poll us, but ultimately reads through /dev/ppp)
+ */
+static ssize_t
+dev_irnet_read(struct file *	file,
+	       char __user *	buf,
+	       size_t		count,
+	       loff_t *		ppos)
+{
+  irnet_socket *	ap = file->private_data;
+
+  DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
+	file, ap, count);
+  DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n");
+
+  /* Once registered with ppp_generic, refuse; else read one event */
+  if(ap->ppp_open)
+    return -EAGAIN;
+  else
+    return irnet_ctrl_read(ap, file, buf, count);
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Poll : called when someone does a select()/poll() on /dev/irnet
+ */
+static unsigned int
+dev_irnet_poll(struct file *	file,
+	       poll_table *	wait)
+{
+  irnet_socket *	ap = file->private_data;
+  unsigned int		mask;
+
+  DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
+	 file, ap);
+
+  mask = POLLOUT | POLLWRNORM;	/* We always claim to be writable */
+  DABORT(ap == NULL, mask, FS_ERROR, "ap is NULL !!!\n");
+
+  /* Only the control channel has events; skip it once PPP is attached */
+  if(!ap->ppp_open)
+    mask |= irnet_ctrl_poll(ap, file, wait);
+
+  DEXIT(FS_TRACE, " - mask=0x%X\n", mask);
+  return mask;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * IOCtl : called when someone does an ioctl on /dev/irnet. This is how
+ * pppd configures and controls us while the PPP instance is active.
+ * Returns 0 on success or a negative errno (-EFAULT unless set otherwise).
+ */
+static long
+dev_irnet_ioctl(
+		struct file *	file,
+		unsigned int	cmd,
+		unsigned long	arg)
+{
+  irnet_socket *	ap = file->private_data;
+  int			err;
+  int			val;
+  void __user *argp = (void __user *)arg;
+
+  DENTER(FS_TRACE, "(file=0x%p, ap=0x%p, cmd=0x%X)\n",
+	 file, ap, cmd);
+
+  /* Basic checks... */
+  DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n");
+#ifdef SECURE_DEVIRNET
+  if(!capable(CAP_NET_ADMIN))
+    return -EPERM;
+#endif /* SECURE_DEVIRNET */
+
+  err = -EFAULT;
+  switch(cmd)
+    {
+      /* Set discipline (N_SYNC_PPP or N_PPP attach us, anything else detach) */
+    case TIOCSETD:
+      if(get_user(val, (int __user *)argp))
+	break;
+      if((val == N_SYNC_PPP) || (val == N_PPP))
+	{
+	  DEBUG(FS_INFO, "Entering PPP discipline.\n");
+	  /* PPP channel setup (ap->chan is configured in dev_irnet_open()) */
+	  if (mutex_lock_interruptible(&ap->lock))
+		  return -EINTR;
+
+	  err = ppp_register_channel(&ap->chan);
+	  if(err == 0)
+	    {
+	      /* Our ppp side is active */
+	      ap->ppp_open = 1;
+
+	      DEBUG(FS_INFO, "Trying to establish a connection.\n");
+	      /* Setup the IrDA link now - may fail... */
+	      irda_irnet_connect(ap);
+	    }
+	  else
+	    DERROR(FS_ERROR, "Can't setup PPP channel...\n");
+
+	  mutex_unlock(&ap->lock);
+	}
+      else
+	{
+	  /* In theory, should be N_TTY */
+	  DEBUG(FS_INFO, "Exiting PPP discipline.\n");
+	  /* Disconnect from the generic PPP layer */
+	  if (mutex_lock_interruptible(&ap->lock))
+		  return -EINTR;
+
+	  if(ap->ppp_open)
+	    {
+	      ap->ppp_open = 0;
+	      ppp_unregister_channel(&ap->chan);
+	    }
+	  else
+	    DERROR(FS_ERROR, "Channel not registered !\n");
+	  err = 0;
+
+	  mutex_unlock(&ap->lock);
+	}
+      break;
+
+      /* Query PPP channel and unit number (only valid once attached) */
+    case PPPIOCGCHAN:
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
+      if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan),
+						(int __user *)argp))
+	err = 0;
+
+      mutex_unlock(&ap->lock);
+      break;
+    case PPPIOCGUNIT:
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
+      if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan),
+						(int __user *)argp))
+        err = 0;
+
+      mutex_unlock(&ap->lock);
+      break;
+
+      /* All these ioctls can be passed both directly and from ppp_generic,
+       * so we just deal with them in one place...
+       */
+    case PPPIOCGFLAGS:
+    case PPPIOCSFLAGS:
+    case PPPIOCGASYNCMAP:
+    case PPPIOCSASYNCMAP:
+    case PPPIOCGRASYNCMAP:
+    case PPPIOCSRASYNCMAP:
+    case PPPIOCGXASYNCMAP:
+    case PPPIOCSXASYNCMAP:
+    case PPPIOCGMRU:
+    case PPPIOCSMRU:
+      DEBUG(FS_INFO, "Standard PPP ioctl.\n");
+      if(!capable(CAP_NET_ADMIN))
+	err = -EPERM;
+      else {
+	if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
+	err = ppp_irnet_ioctl(&ap->chan, cmd, arg);
+
+	mutex_unlock(&ap->lock);
+      }
+      break;
+
+      /* TTY IOCTLs : Pretend that we are a tty, to keep pppd happy */
+      /* Get termios (both helpers are tested for a zero, i.e. success, result) */
+    case TCGETS:
+      DEBUG(FS_INFO, "Get termios.\n");
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
+#ifndef TCGETS2
+      if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios))
+	err = 0;
+#else
+      if(!kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios))
+	err = 0;
+#endif
+
+      mutex_unlock(&ap->lock);
+      break;
+      /* Set termios */
+    case TCSETSF:
+      DEBUG(FS_INFO, "Set termios.\n");
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
+#ifndef TCGETS2
+      if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp))
+	err = 0;
+#else
+      if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp))
+	err = 0;
+#endif
+
+      mutex_unlock(&ap->lock);
+      break;
+
+      /* Set DTR/RTS */
+    case TIOCMBIS:
+    case TIOCMBIC:
+      /* Set exclusive/non-exclusive mode */
+    case TIOCEXCL:
+    case TIOCNXCL:
+      DEBUG(FS_INFO, "TTY compatibility.\n");
+      err = 0;
+      break;
+
+    case TCGETA:
+      DEBUG(FS_INFO, "TCGETA\n");
+      break;
+
+    case TCFLSH:
+      DEBUG(FS_INFO, "TCFLSH\n");
+      /* Note : this will flush buffers in PPP, so it *must* be done
+       * We should also worry that we don't accept junk here and that
+       * we get rid of our own buffers */
+#ifdef FLUSH_TO_PPP
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+      ppp_output_wakeup(&ap->chan);
+      mutex_unlock(&ap->lock);
+#endif /* FLUSH_TO_PPP */
+      err = 0;
+      break;
+
+    case FIONREAD:
+      DEBUG(FS_INFO, "FIONREAD\n");
+      val = 0;
+      if(put_user(val, (int __user *)argp))
+	break;
+      err = 0;
+      break;
+
+    default:
+      DERROR(FS_ERROR, "Unsupported ioctl (0x%X)\n", cmd);
+      err = -ENOTTY;
+    }
+
+  DEXIT(FS_TRACE, " - err = 0x%X\n", err);
+  return err;
+}
+
+/************************** PPP CALLBACKS **************************/
+/*
+ * These are the functions that the generic PPP driver in the kernel
+ * will call to communicate to us.
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Prepare the ppp frame for transmission over the IrDA socket.
+ * We make sure that the header space is enough, and we change ppp header
+ * according to flags passed by pppd. Returns the skb to send (possibly a
+ * new one), or NULL if reallocation failed (the original is then freed).
+ */
+static inline struct sk_buff *
+irnet_prepare_skb(irnet_socket *	ap,
+		  struct sk_buff *	skb)
+{
+  unsigned char *	data;
+  int			proto;		/* PPP protocol */
+  int			islcp;		/* Protocol == LCP */
+  int			needaddr;	/* Need PPP address */
+
+  DENTER(PPP_TRACE, "(ap=0x%p, skb=0x%p)\n",
+	 ap, skb);
+
+  /* Extract PPP protocol from the frame (first two bytes, high byte first) */
+  data  = skb->data;
+  proto = (data[0] << 8) + data[1];
+
+  /* LCP packets with codes between 1 (configure-request)
+   * and 7 (code-reject) must be sent as though no options
+   * have been negotiated. */
+  islcp = (proto == PPP_LCP) && (1 <= data[2]) && (data[2] <= 7);
+
+  /* compress protocol field if option enabled (drop its zero high byte) */
+  if((data[0] == 0) && (ap->flags & SC_COMP_PROT) && (!islcp))
+    skb_pull(skb,1);
+
+  /* Bytes of address/control to prepend : 2 if needed, 0 otherwise */
+  needaddr = 2*((ap->flags & SC_COMP_AC) == 0 || islcp);
+
+  /* Is the skb headroom large enough to contain all IrDA-headers? */
+  if((skb_headroom(skb) < (ap->max_header_size + needaddr)) ||
+      (skb_shared(skb)))
+    {
+      struct sk_buff *	new_skb;
+
+      DEBUG(PPP_INFO, "Reallocating skb\n");
+
+      /* Create a new skb */
+      new_skb = skb_realloc_headroom(skb, ap->max_header_size + needaddr);
+
+      /* We have to free the original skb anyway */
+      dev_kfree_skb(skb);
+
+      /* Did the realloc succeed ? */
+      DABORT(new_skb == NULL, NULL, PPP_ERROR, "Could not realloc skb\n");
+
+      /* Use the new skb instead */
+      skb = new_skb;
+    }
+
+  /* prepend address/control fields if necessary */
+  if(needaddr)
+    {
+      skb_push(skb, 2);
+      skb->data[0] = PPP_ALLSTATIONS;
+      skb->data[1] = PPP_UI;
+    }
+
+  DEXIT(PPP_TRACE, "\n");
+
+  return skb;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Send a packet to the peer over the IrTTP connection.
+ * Returns 1 if the packet was consumed (handed to IrTTP, or dropped).
+ * Returns 0 if the packet was not consumed (the skb is left untouched).
+ * If the packet was not accepted, we will call ppp_output_wakeup
+ * at some later time to reactivate flow control in ppp_generic.
+ */
+static int
+ppp_irnet_send(struct ppp_channel *	chan,
+	       struct sk_buff *		skb)
+{
+  irnet_socket *	self = (struct irnet_socket *) chan->private;
+  int			ret;
+
+  DENTER(PPP_TRACE, "(channel=0x%p, ap/self=0x%p)\n",
+	 chan, self);
+
+  /* Check if things are somewhat valid... */
+  DASSERT(self != NULL, 0, PPP_ERROR, "Self is NULL !!!\n");
+
+  /* Check if we are connected */
+  if(!(test_bit(0, &self->ttp_open)))
+    {
+#ifdef CONNECT_IN_SEND
+      /* Let's try to connect one more time... */
+      /* Note : we won't be connected after this call, but we should be
+       * ready for next packet... */
+      /* If we are already connecting, this will fail */
+      irda_irnet_connect(self);
+#endif /* CONNECT_IN_SEND */
+
+      DEBUG(PPP_INFO, "IrTTP not ready ! (%ld-%ld)\n",
+	    self->ttp_open, self->ttp_connect);
+
+      /* Note : we can either drop the packet or block the packet.
+       *
+       * Blocking the packet allows us a better connection time,
+       * because by calling ppp_output_wakeup() we can have
+       * ppp_generic resending the LCP request immediately to us,
+       * rather than waiting for one of pppd periodic transmission of
+       * LCP request.
+       *
+       * On the other hand, if we block all packets, all those periodic
+       * transmissions of pppd accumulate in ppp_generic, creating a
+       * backlog of LCP request. When we eventually connect later on,
+       * we have to transmit all this backlog before we can connect
+       * proper (if we don't timeout before).
+       *
+       * The current strategy is as follows :
+       * While we are attempting to connect, we block packets to get
+       * a better connection time.
+       * If we fail to connect, we drain the queue and start dropping packets
+       */
+#ifdef BLOCK_WHEN_CONNECT
+      /* If we are attempting to connect */
+      if(test_bit(0, &self->ttp_connect))
+	{
+	  /* Blocking packet, ppp_generic will retry later */
+	  return 0;
+	}
+#endif /* BLOCK_WHEN_CONNECT */
+
+      /* Dropping packet, pppd will retry later */
+      dev_kfree_skb(skb);
+      return 1;
+    }
+
+  /* Check if the queue can accept any packet, otherwise block */
+  if(self->tx_flow != FLOW_START)
+    DRETURN(0, PPP_INFO, "IrTTP queue full (%d skbs)...\n",
+	    skb_queue_len(&self->tsap->tx_queue));
+
+  /* Prepare ppp frame for transmission (may replace or free the skb) */
+  skb = irnet_prepare_skb(self, skb);
+  DABORT(skb == NULL, 1, PPP_ERROR, "Prepare skb for Tx failed.\n");
+
+  /* Send the packet to IrTTP */
+  ret = irttp_data_request(self->tsap, skb);
+  if(ret < 0)
+    {
+      /*
+       * > IrTTPs tx queue is full, so we just have to
+       * > drop the frame! You might think that we should
+       * > just return -1 and don't deallocate the frame,
+       * > but that is dangerous since it's possible that
+       * > we have replaced the original skb with a new
+       * > one with larger headroom, and that would really
+       * > confuse do_dev_queue_xmit() in dev.c! I have
+       * > tried :-) DB
+       * Correction : we verify the flow control above (self->tx_flow),
+       * so we come here only if IrTTP doesn't like the packet (empty,
+       * too large, IrTTP not connected). In those rare cases, it's ok
+       * to drop it, we don't want to see it here again...
+       * Jean II
+       */
+      DERROR(PPP_ERROR, "IrTTP doesn't like this packet !!! (0x%X)\n", ret);
+      /* irttp_data_request has already freed the packet */
+    }
+
+  DEXIT(PPP_TRACE, "\n");
+  return 1;	/* Packet has been consumed */
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Handle the PPP flags/asyncmap/MRU ioctls (also used by dev_irnet_ioctl()).
+ * Returns 0, -EFAULT on a bad user pointer or -ENOIOCTLCMD if unknown.
+ */
+static int
+ppp_irnet_ioctl(struct ppp_channel *	chan,
+		unsigned int		cmd,
+		unsigned long		arg)
+{
+  irnet_socket *	ap = (struct irnet_socket *) chan->private;
+  int			err;
+  int			val;
+  u32			accm[8];
+  void __user *argp = (void __user *)arg;
+
+  DENTER(PPP_TRACE, "(channel=0x%p, ap=0x%p, cmd=0x%X)\n",
+	 chan, ap, cmd);
+
+  /* Basic checks... */
+  DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n");
+
+  err = -EFAULT;
+  switch(cmd)
+    {
+      /* PPP flags (the SC_RCV_BITS part is kept apart, in rbits) */
+    case PPPIOCGFLAGS:
+      val = ap->flags | ap->rbits;
+      if(put_user(val, (int __user *) argp))
+	break;
+      err = 0;
+      break;
+    case PPPIOCSFLAGS:
+      if(get_user(val, (int __user *) argp))
+	break;
+      ap->flags = val & ~SC_RCV_BITS;
+      ap->rbits = val & SC_RCV_BITS;
+      err = 0;
+      break;
+
+      /* Async map stuff - all dummy to please pppd */
+    case PPPIOCGASYNCMAP:
+      if(put_user(ap->xaccm[0], (u32 __user *) argp))
+	break;
+      err = 0;
+      break;
+    case PPPIOCSASYNCMAP:
+      if(get_user(ap->xaccm[0], (u32 __user *) argp))
+	break;
+      err = 0;
+      break;
+    case PPPIOCGRASYNCMAP:
+      if(put_user(ap->raccm, (u32 __user *) argp))
+	break;
+      err = 0;
+      break;
+    case PPPIOCSRASYNCMAP:
+      if(get_user(ap->raccm, (u32 __user *) argp))
+	break;
+      err = 0;
+      break;
+    case PPPIOCGXASYNCMAP:
+      if(copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm)))
+	break;
+      err = 0;
+      break;
+    case PPPIOCSXASYNCMAP:
+      if(copy_from_user(accm, argp, sizeof(accm)))
+	break;
+      accm[2] &= ~0x40000000U;		/* can't escape 0x5e */
+      accm[3] |= 0x60000000U;		/* must escape 0x7d, 0x7e */
+      memcpy(ap->xaccm, accm, sizeof(ap->xaccm));
+      err = 0;
+      break;
+
+      /* Max PPP frame size */
+    case PPPIOCGMRU:
+      if(put_user(ap->mru, (int __user *) argp))
+	break;
+      err = 0;
+      break;
+    case PPPIOCSMRU:
+      if(get_user(val, (int __user *) argp))
+	break;
+      if(val < PPP_MRU)			/* PPP_MRU is the floor we accept */
+	val = PPP_MRU;
+      ap->mru = val;
+      err = 0;
+      break;
+
+    default:
+      DEBUG(PPP_INFO, "Unsupported ioctl (0x%X)\n", cmd);
+      err = -ENOIOCTLCMD;
+    }
+
+  DEXIT(PPP_TRACE, " - err = 0x%X\n", err);
+  return err;
+}
+
+/************************** INITIALISATION **************************/
+/*
+ * Module initialisation and all that jazz...
+ */
+
+/*------------------------------------------------------------------*/
+/*
+ * Hook our device callbacks in the filesystem, to connect our code
+ * to /dev/irnet. Returns the result of misc_register().
+ */
+static inline int __init
+ppp_irnet_init(void)
+{
+  int err = 0;
+
+  DENTER(MODULE_TRACE, "()\n");
+
+  /* Allocate ourselves as a minor in the misc range */
+  err = misc_register(&irnet_misc_device);
+
+  DEXIT(MODULE_TRACE, "\n");
+  return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Cleanup at exit : undo ppp_irnet_init().
+ */
+static inline void __exit
+ppp_irnet_cleanup(void)
+{
+  DENTER(MODULE_TRACE, "()\n");
+
+  /* De-allocate /dev/irnet minor in misc range */
+  misc_deregister(&irnet_misc_device);
+
+  DEXIT(MODULE_TRACE, "\n");
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Module main entry point : returns 0 or the first error met.
+ */
+static int __init
+irnet_init(void)
+{
+  int err;
+
+  /* IrDA part first, then /dev/irnet. NOTE(review): no unwind if 2nd fails */
+  err = irda_irnet_init();
+  if(!err)
+    err = ppp_irnet_init();
+  return err;
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Module exit : release what irnet_init() has set up.
+ */
+static void __exit
+irnet_cleanup(void)
+{
+  irda_irnet_cleanup();		/* IrDA part */
+  ppp_irnet_cleanup();		/* /dev/irnet misc device */
+}
+
+/*------------------------------------------------------------------*/
+/*
+ * Module magic
+ */
+module_init(irnet_init);
+module_exit(irnet_cleanup);
+MODULE_AUTHOR("Jean Tourrilhes <jt@hpl.hp.com>");
+MODULE_DESCRIPTION("IrNET : Synchronous PPP over IrDA");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CHARDEV(10, 187);
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
new file mode 100644
index 00000000..94022586
--- /dev/null
+++ b/net/irda/irnet/irnet_ppp.h
@@ -0,0 +1,119 @@
+/*
+ *	IrNET protocol module : Synchronous PPP over an IrDA socket.
+ *
+ *		Jean II - HPL `00 - <jt@hpl.hp.com>
+ *
+ * This file contains all definitions and declarations necessary for the
+ * PPP part of the IrNET module.
+ * This file is a private header, so other modules don't want to know
+ * what's in there...
+ */
+
+#ifndef IRNET_PPP_H
+#define IRNET_PPP_H
+
+/***************************** INCLUDES *****************************/
+
+#include "irnet.h"		/* Module global include */
+
+/************************ CONSTANTS & MACROS ************************/
+
+/* /dev/irnet file constants */
+#define IRNET_MAJOR	10	/* Misc range */
+#define IRNET_MINOR	187	/* Official allocation */
+
+/* IrNET control channel stuff */
+#define IRNET_MAX_COMMAND	256	/* Max length of a command line */
+
+/* PPP hardcore stuff */
+
+/* Bits in rbits (PPP flags in irnet struct) */
+#define SC_RCV_BITS	(SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP)
+
+/* Bit numbers in busy */
+#define XMIT_BUSY	0
+#define RECV_BUSY	1
+#define XMIT_WAKEUP	2
+#define XMIT_FULL	3
+
+/* Queue management */
+#define PPPSYNC_MAX_RQLEN	32	/* arbitrary */
+
+/****************************** TYPES ******************************/
+
+
+/**************************** PROTOTYPES ****************************/
+
+/* ----------------------- CONTROL CHANNEL ----------------------- */
+static inline ssize_t
+	irnet_ctrl_write(irnet_socket *,
+			 const char __user *,	/* user space buffer */
+			 size_t);
+static inline ssize_t
+	irnet_ctrl_read(irnet_socket *,
+			struct file *,
+			char __user *,		/* user space buffer */
+			size_t);
+static inline unsigned int
+	irnet_ctrl_poll(irnet_socket *,
+			struct file *,
+			poll_table *);
+/* ----------------------- CHARACTER DEVICE ----------------------- */
+static int
+	dev_irnet_open(struct inode *,	/* fs callback : open */
+		       struct file *),
+	dev_irnet_close(struct inode *,	/* fs callback : release */
+			struct file *);
+static ssize_t
+	dev_irnet_write(struct file *,
+			const char __user *,
+			size_t,
+			loff_t *),
+	dev_irnet_read(struct file *,
+		       char __user *,
+		       size_t,
+		       loff_t *);
+static unsigned int
+	dev_irnet_poll(struct file *,
+		       poll_table *);
+static long
+	dev_irnet_ioctl(struct file *,
+			unsigned int,
+			unsigned long);
+/* ------------------------ PPP INTERFACE ------------------------ */
+static inline struct sk_buff *
+	irnet_prepare_skb(irnet_socket *,
+			  struct sk_buff *);
+static int
+	ppp_irnet_send(struct ppp_channel *,
+		      struct sk_buff *);
+static int
+	ppp_irnet_ioctl(struct ppp_channel *,
+			unsigned int,
+			unsigned long);
+
+/**************************** VARIABLES ****************************/
+
+/* Filesystem callbacks for /dev/irnet (how the VFS calls us) */
+static const struct file_operations irnet_device_fops =
+{
+	.owner		= THIS_MODULE,
+	.read		= dev_irnet_read,
+	.write		= dev_irnet_write,
+	.poll		= dev_irnet_poll,
+	.unlocked_ioctl	= dev_irnet_ioctl,
+	.open		= dev_irnet_open,
+	.release	= dev_irnet_close,
+	.llseek		= noop_llseek,
+  /* Not provided : readdir, mmap, flush, fsync, fasync, lock, readv, writev */
+};
+
+/* Registration record handed to the misc major (drivers/char/misc.c) */
+static struct miscdevice irnet_misc_device =
+{
+	.minor	= IRNET_MINOR,
+	.name	= "irnet",
+	.fops	= &irnet_device_fops,
+};
+
+#endif /* IRNET_PPP_H */
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
new file mode 100644
index 00000000..6c7c4b92
--- /dev/null
+++ b/net/irda/irnetlink.c
@@ -0,0 +1,159 @@
+/*
+ * IrDA netlink layer, for stack configuration.
+ *
+ * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org>
+ *
+ * Partly based on the 802.11 netlink implementation
+ * (see net/wireless/nl80211.c) which is:
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/socket.h>
+#include <linux/irda.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/genetlink.h>
+
+
+
+static struct genl_family irda_nl_family = {
+	.id = GENL_ID_GENERATE,
+	.name = IRDA_NL_NAME,
+	.hdrsize = 0,
+	.version = IRDA_NL_VERSION,
+	.maxattr = IRDA_NL_ATTR_MAX,	/* highest attribute, as in irda_nl_policy */
+};
+
+static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info)
+{
+	char * ifname;
+
+	if (!info->attrs[IRDA_NL_ATTR_IFNAME])	/* no interface name given */
+		return NULL;
+
+	ifname = nla_data(info->attrs[IRDA_NL_ATTR_IFNAME]);
+
+	IRDA_DEBUG(5, "%s(): Looking for %s\n", __func__, ifname);
+
+	return dev_get_by_name(net, ifname);	/* callers dev_put() the result */
+}
+
+static int irda_nl_set_mode(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device * dev;
+	struct irlap_cb * irlap;
+	u32 mode;
+
+	if (!info->attrs[IRDA_NL_ATTR_MODE])	/* the new mode is mandatory */
+		return -EINVAL;
+
+	mode = nla_get_u32(info->attrs[IRDA_NL_ATTR_MODE]);
+
+	IRDA_DEBUG(5, "%s(): Switching to mode: %d\n", __func__, mode);
+
+	dev = ifname_to_netdev(&init_net, info);	/* takes a reference */
+	if (!dev)
+		return -ENODEV;
+
+	irlap = (struct irlap_cb *)dev->atalk_ptr;	/* NOTE(review): IrLAP state seems to live in atalk_ptr */
+	if (!irlap) {
+		dev_put(dev);
+		return -ENODEV;
+	}
+
+	irlap->mode = mode;
+
+	dev_put(dev);	/* drop the reference taken above */
+
+	return 0;
+}
+
+static int irda_nl_get_mode(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device * dev;
+	struct irlap_cb * irlap;
+	struct sk_buff *msg;
+	void *hdr;
+	int ret = -ENOBUFS;
+
+	dev = ifname_to_netdev(&init_net, info);	/* takes a reference */
+	if (!dev)
+		return -ENODEV;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		dev_put(dev);
+		return -ENOMEM;
+	}
+
+	irlap = (struct irlap_cb *)dev->atalk_ptr;
+	if (!irlap) {
+		ret = -ENODEV;
+		goto err_out;
+	}
+	/* Build the reply : header, then the IFNAME and MODE attributes */
+	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
+			  &irda_nl_family, 0,  IRDA_NL_CMD_GET_MODE);
+	if (hdr == NULL) {
+		ret = -EMSGSIZE;
+		goto err_out;
+	}
+
+	if (nla_put_string(msg, IRDA_NL_ATTR_IFNAME, dev->name))
+		goto err_out;
+
+	if (nla_put_u32(msg, IRDA_NL_ATTR_MODE, irlap->mode))
+		goto err_out;
+
+	genlmsg_end(msg, hdr);
+	/* The reply owns msg from here; the device reference is still ours */
+	ret = genlmsg_reply(msg, info);
+	goto out_put;
+
+ err_out:
+	nlmsg_free(msg);
+ out_put:
+	dev_put(dev);	/* was leaked on the success path */
+	return ret;
+}
+
+static const struct nla_policy irda_nl_policy[IRDA_NL_ATTR_MAX + 1] = {
+	[IRDA_NL_ATTR_IFNAME] = { .type = NLA_NUL_STRING,
+				  .len = IFNAMSIZ-1 },	/* interface name */
+	[IRDA_NL_ATTR_MODE] = { .type = NLA_U32 },	/* see irlap->mode */
+};
+
+static struct genl_ops irda_nl_ops[] = {
+	{
+		.cmd = IRDA_NL_CMD_SET_MODE,
+		.doit = irda_nl_set_mode,
+		.policy = irda_nl_policy,
+		.flags = GENL_ADMIN_PERM,	/* unlike GET_MODE below */
+	},
+	{
+		.cmd = IRDA_NL_CMD_GET_MODE,
+		.doit = irda_nl_get_mode,
+		.policy = irda_nl_policy,
+		/* can be retrieved by unprivileged users */
+	},
+
+};
+
+int irda_nl_register(void)	/* registers the family together with its ops */
+{
+	return genl_register_family_with_ops(&irda_nl_family,
+		irda_nl_ops, ARRAY_SIZE(irda_nl_ops));
+}
+
+void irda_nl_unregister(void)	/* counterpart of irda_nl_register() */
+{
+	genl_unregister_family(&irda_nl_family);
+}
diff --git a/net/irda/irproc.c b/net/irda/irproc.c
new file mode 100644
index 00000000..b9ac598e
--- /dev/null
+++ b/net/irda/irproc.c
@@ -0,0 +1,97 @@
+/*********************************************************************
+ *
+ * Filename:      irproc.c
+ * Version:       1.0
+ * Description:   Various entries in the /proc file system
+ * Status:        Experimental.
+ * Author:        Thomas Davis, <ratbert@radiks.net>
+ * Created at:    Sat Feb 21 21:33:24 1998
+ * Modified at:   Sun Nov 14 08:54:54 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-1999, Dag Brattli <dagb@cs.uit.no>
+ *     Copyright (c) 1998, Thomas Davis, <ratbert@radiks.net>,
+ *     All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     I, Thomas Davis, provide no warranty for any of this software.
+ *     This material is provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+
+extern const struct file_operations discovery_seq_fops;
+extern const struct file_operations irlap_seq_fops;
+extern const struct file_operations irlmp_seq_fops;
+extern const struct file_operations irttp_seq_fops;
+extern const struct file_operations irias_seq_fops;
+
+struct irda_entry {	/* one file to create inside the "irda" proc directory */
+	const char *name;	/* file name passed to proc_create() */
+	const struct file_operations *fops;	/* operations backing that file */
+};
+
+struct proc_dir_entry *proc_irda;	/* the "irda" proc directory; NULL while not registered */
+EXPORT_SYMBOL(proc_irda);
+
+static const struct irda_entry irda_dirs[] = {	/* files created under proc_irda */
+	{"discovery",	&discovery_seq_fops},
+	{"irttp",	&irttp_seq_fops},
+	{"irlmp",	&irlmp_seq_fops},
+	{"irlap",	&irlap_seq_fops},
+	{"irias",	&irias_seq_fops},
+};
+
+/*
+ * Function irda_proc_register (void)
+ *
+ *    Create the "irda" directory under init_net.proc_net and add one
+ *    entry per irda_dirs[] element; proc_create() failures are ignored.
+ */
+void __init irda_proc_register(void)
+{
+	const struct irda_entry *ent = irda_dirs;
+	const struct irda_entry *const end = irda_dirs + ARRAY_SIZE(irda_dirs);
+
+	proc_irda = proc_mkdir("irda", init_net.proc_net);
+	if (!proc_irda)
+		return;	/* no directory, so nothing to populate */
+
+	for (; ent != end; ent++)
+		(void) proc_create(ent->name, 0, proc_irda, ent->fops);
+}
+
+/*
+ * Function irda_proc_unregister (void)
+ *
+ *    Remove every irda_dirs[] entry and then the "irda" directory itself;
+ *    does nothing when proc_irda is NULL.
+ */
+void irda_proc_unregister(void)
+{
+	int idx;
+
+	if (proc_irda == NULL)
+		return;
+
+	for (idx = 0; idx < ARRAY_SIZE(irda_dirs); idx++)
+		remove_proc_entry(irda_dirs[idx].name, proc_irda);
+	remove_proc_entry("irda", init_net.proc_net);
+	proc_irda = NULL;	/* a second call becomes a no-op */
+}
+
+
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
new file mode 100644
index 00000000..9715e6e5
--- /dev/null
+++ b/net/irda/irqueue.c
@@ -0,0 +1,921 @@
+/*********************************************************************
+ *
+ * Filename:      irqueue.c
+ * Version:       0.3
+ * Description:   General queue implementation
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Tue Jun  9 13:29:31 1998
+ * Modified at:   Sun Dec 12 13:48:22 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Modified at:   Thu Jan  4 14:29:10 CET 2001
+ * Modified by:   Marc Zyngier <mzyngier@freesurf.fr>
+ *
+ *     Copyright (C) 1998-1999, Aage Kvalnes <aage@cs.uit.no>
+ *     Copyright (C) 1998, Dag Brattli,
+ *     All Rights Reserved.
+ *
+ *     This code is taken from the Vortex Operating System written by Aage
+ *     Kvalnes. Aage has agreed that this code can use the GPL licence,
+ *     although he does not use that licence in his own code.
+ *
+ *     This copyright does however _not_ include the ELF hash() function
+ *     which I currently don't know which licence or copyright it
+ *     has. Please inform me if you know.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+/*
+ * NOTE :
+ * There are various problems with this package :
+ *	o the hash function for ints is pathetic (but could be changed)
+ *	o locking is sometimes suspicious (especially during enumeration)
+ *	o most users have only a few elements (== overhead)
+ *	o most users never use search, so don't benefit from hashing
+ * Problem already fixed :
+ *	o not 64 bit compliant (most users do hashv = (int) self)
+ *	o hashbin_remove() is broken => use hashbin_remove_this()
+ * I think most users would be better served by a simple linked list
+ * (like include/linux/list.h) with a global spinlock per list.
+ * Jean II
+ */
+
+/*
+ * Notes on the concurrent access to hashbin and other SMP issues
+ * -------------------------------------------------------------
+ *	Hashbins are very often in the IrDA stack a global repository of
+ * information, and therefore used in a very asynchronous manner following
+ * various events (driver calls, timers, user calls...).
+ *	Therefore, very often it is highly important to consider the
+ * management of concurrent access to the hashbin and how to guarantee the
+ * consistency of the operations on it.
+ *
+ *	First, we need to define the objective of locking :
+ *		1) Protect user data (content pointed by the hashbin)
+ *		2) Protect hashbin structure itself (linked list in each bin)
+ *
+ *			     OLD LOCKING
+ *			     -----------
+ *
+ *	The previous locking strategies, HB_LOCAL and HB_GLOBAL, were
+ * both inadequate in *both* aspects.
+ *		o HB_GLOBAL was using a spinlock for each bin (local locking).
+ *		o HB_LOCAL was disabling irq on *all* CPUs, so use a single
+ *		  global semaphore.
+ *	The problems were :
+ *		A) Global irq disabling is no longer supported by the kernel
+ *		B) No protection for the hashbin struct global data
+ *			o hashbin_delete()
+ *			o hb_current
+ *		C) No protection for user data in some cases
+ *
+ *	A) HB_LOCAL use global irq disabling, so doesn't work on kernel
+ * 2.5.X. Even when it is supported (kernel 2.4.X and earlier), its
+ * performance is not satisfactory on SMP setups. Most hashbins were
+ * HB_LOCAL, so (A) definitely need fixing.
+ *	B) HB_LOCAL could be modified to fix (B). However, because HB_GLOBAL
+ * lock only the individual bins, it will never be able to lock the
+ * global data, so can't do (B).
+ *	C) Some functions return pointer to data that is still in the
+ * hashbin :
+ *		o hashbin_find()
+ *		o hashbin_get_first()
+ *		o hashbin_get_next()
+ *	As the data is still in the hashbin, it may be changed or freed
+ * while the caller is examining the data. In those cases, locking can't
+ * be done within the hashbin, but must include use of the data within
+ * the caller.
+ *	The caller can easily do this with HB_LOCAL (just disable irqs).
+ * However, this is impossible with HB_GLOBAL because the caller has no
+ * way to know the proper bin, so don't know which spinlock to use.
+ *
+ *	Quick summary : can no longer use HB_LOCAL, and HB_GLOBAL is
+ * fundamentally broken and will never work.
+ *
+ *			     NEW LOCKING
+ *			     -----------
+ *
+ *	To fix those problems, I've introduced a few changes in the
+ * hashbin locking :
+ *		1) New HB_LOCK scheme
+ *		2) hashbin->hb_spinlock
+ *		3) New hashbin usage policy
+ *
+ * HB_LOCK :
+ * -------
+ *	HB_LOCK is a locking scheme intermediate between the old HB_LOCAL
+ * and HB_GLOBAL. It uses a single spinlock to protect the whole content
+ * of the hashbin. As it is a single spinlock, it can protect the global
+ * data of the hashbin and not only the bins themselves.
+ *	HB_LOCK can only protect some of the hashbin calls, so it only locks
+ * calls that can be made 100% safe and leaves other calls unprotected.
+ *	HB_LOCK in theory is slower than HB_GLOBAL, but as the hashbin
+ * content is always small contention is not high, so it doesn't matter
+ * much. HB_LOCK is probably faster than HB_LOCAL.
+ *
+ * hashbin->hb_spinlock :
+ * --------------------
+ *	The spinlock that HB_LOCK uses is available for caller, so that
+ * the caller can protect unprotected calls (see below).
+ *	If the caller want to do entirely its own locking (HB_NOLOCK), he
+ * can do so and may use safely this spinlock.
+ *	Locking is done like this :
+ *		spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+ *	Releasing the lock :
+ *		spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+ *
+ * Safe & Protected calls :
+ * ----------------------
+ *	The following calls are safe or protected via HB_LOCK :
+ *		o hashbin_new()		-> safe
+ *		o hashbin_delete()
+ *		o hashbin_insert()
+ *		o hashbin_remove_first()
+ *		o hashbin_remove()
+ *		o hashbin_remove_this()
+ *		o HASHBIN_GET_SIZE()	-> atomic
+ *
+ *	The following calls only protect the hashbin itself :
+ *		o hashbin_lock_find()
+ *		o hashbin_find_next()
+ *
+ * Unprotected calls :
+ * -----------------
+ *	The following calls need to be protected by the caller :
+ *		o hashbin_find()
+ *		o hashbin_get_first()
+ *		o hashbin_get_next()
+ *
+ * Locking Policy :
+ * --------------
+ *	If the hashbin is used only in a single thread of execution
+ * (explicitly or implicitly), you can use HB_NOLOCK.
+ *	If the calling module already provides concurrent access protection,
+ * you may use HB_NOLOCK.
+ *
+ *	In all other cases, you need to use HB_LOCK and lock the hashbin
+ * every time before calling one of the unprotected calls. You also must
+ * use the pointer returned by the unprotected call within the locked
+ * region.
+ *
+ * Extra care for enumeration :
+ * --------------------------
+ *	hashbin_get_first() and hashbin_get_next() use the hashbin to
+ * store the current position, in hb_current.
+ *	As long as the hashbin remains locked, this is safe. If you unlock
+ * the hashbin, the current position may change if anybody else modifies
+ * or enumerates the hashbin.
+ *	Summary : do the full enumeration while locked.
+ *
+ *	Alternatively, you may use hashbin_find_next(). But, this will
+ * be slower, is more complex to use and doesn't protect the hashbin
+ * content. So, care is needed here as well.
+ *
+ * Other issues :
+ * ------------
+ *	I believe that we are overdoing it by using spin_lock_irqsave()
+ * and we should use only spin_lock_bh() or similar. But, I don't have
+ * the balls to try it out.
+ *	Don't believe that because hashbin are now (somewhat) SMP safe
+ * that the rest of the code is. Higher layers tend to be safest,
+ * but LAP and LMP would need some serious dedicated love.
+ *
+ * Jean II
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irqueue.h>
+
+/************************ QUEUE SUBROUTINES ************************/
+
+/*
+ * Hashbin: map a hash value to the index of its bin in hb_queue[]
+ */
+#define GET_HASHBIN(x) ( (x) & HASHBIN_MASK )
+
+/*
+ * Function hash (name)
+ *
+ *    Hash the NUL-terminated string 'name' with the ELF string hash
+ *    function; an empty string hashes to 0.
+ */
+static __u32 hash( const char* name)
+{
+	__u32 acc = 0;
+	__u32 top;
+
+	for (; *name; name++) {
+		acc = (acc << 4) + *name;
+		top = acc & 0xf0000000;
+		acc ^= top >> 24;	/* no-op when top is 0 */
+		acc &= ~top;
+	}
+	return acc;
+}
+
+/*
+ * Function enqueue_first (queue, element)
+ *
+ *    Link 'element' into the circular doubly-linked list headed by
+ *    '*queue' and make it the new head.
+ */
+static void enqueue_first(irda_queue_t **queue, irda_queue_t* element)
+{
+	irda_queue_t *head = *queue;
+	irda_queue_t *tail;
+
+	IRDA_DEBUG( 4, "%s()\n", __func__);
+
+	if (head == NULL) {
+		/* Empty queue: the lone element is its own neighbour. */
+		element->q_prev = element;
+		element->q_next = element;
+		*queue = element;
+		return;
+	}
+
+	/*
+	 * Non-empty queue: splice between tail (head->q_prev) and head.
+	 */
+	tail = head->q_prev;
+	element->q_next = head;
+	tail->q_next = element;
+	element->q_prev = tail;
+	head->q_prev = element;
+	*queue = element;
+}
+
+
+/*
+ * Function dequeue_first (queue)
+ *
+ *    Unlink and return the head of the circular list '*queue'
+ *    (NULL when the queue is empty).
+ *
+ */
+static irda_queue_t *dequeue_first(irda_queue_t **queue)
+{
+	irda_queue_t *head;
+
+	IRDA_DEBUG( 4, "dequeue_first()\n");
+
+	head = *queue;
+
+	/*
+	 * Nothing to unlink from an empty queue.
+	 */
+	if (head == NULL)
+		return NULL;
+
+	if (head->q_next == head) {
+		/*
+		 *  The head was the only element: the queue becomes
+		 *  empty.
+		 */
+		*queue = NULL;
+		return head;
+	}
+
+	/*
+	 * Several elements: bridge the neighbours over the head and
+	 * promote the old second element to head.  The removed
+	 * element's own q_next/q_prev are left untouched.
+	 */
+	head->q_prev->q_next = head->q_next;
+	head->q_next->q_prev = head->q_prev;
+	*queue = head->q_next;
+
+	return head;
+}
+
+/*
+ * Function dequeue_general (queue, element)
+ *
+ *    Unlink 'element' from the circular list headed by '*queue'.
+ */
+static irda_queue_t *dequeue_general(irda_queue_t **queue, irda_queue_t* element)
+{
+	irda_queue_t *ret;
+
+	IRDA_DEBUG( 4, "dequeue_general()\n");
+
+	/*
+	 * Return the element actually removed, not the old queue head
+	 */
+	ret = (*queue != NULL) ? element : NULL;
+
+	if ( *queue == NULL ) {
+		/*
+		 * Queue was empty.
+		 */
+	} else if ( (*queue)->q_next == *queue ) {
+		/*
+		 *  Queue only contained a single element (taken to be
+		 *  'element'). It will now be empty.
+		 */
+		*queue = NULL;
+
+	} else {
+		/*
+		 *  Remove specific element.
+		 */
+		element->q_prev->q_next = element->q_next;
+		element->q_next->q_prev = element->q_prev;
+		if ( (*queue) == element)
+			(*queue) = element->q_next;
+	}
+
+	/*
+	 * Return the removed entry (or NULL if queue was empty).
+	 */
+	return ret;
+}
+
+/************************ HASHBIN MANAGEMENT ************************/
+
+/*
+ * Function hashbin_new (type)
+ *
+ *    Allocate a zeroed hashbin of the given locking type.  Returns the
+ *    new hashbin, or NULL when the GFP_ATOMIC allocation fails.
+ *
+ */
+hashbin_t *hashbin_new(int type)
+{
+	hashbin_t *hb = kzalloc(sizeof(*hb), GFP_ATOMIC);
+
+	if (hb == NULL)
+		return NULL;
+
+	/*
+	 * kzalloc() already cleared every field (hb_current, hb_size,
+	 * all the bins), so only the non-zero ones need to be set.
+	 */
+	hb->magic = HB_MAGIC;
+	hb->hb_type = type;
+
+	/*
+	 * Only HB_LOCK hashbins get their spinlock initialised here,
+	 * although hashbin_lock_find() and hashbin_find_next() take
+	 * it whatever the type.
+	 */
+	if (hb->hb_type & HB_LOCK)
+		spin_lock_init(&hb->hb_spinlock);
+
+	return hb;
+}
+EXPORT_SYMBOL(hashbin_new);
+
+
+/*
+ * Function hashbin_delete (hashbin, free_func)
+ *
+ *    Destroy hashbin, the free_func can be a user supplied special routine
+ *    for deallocating this structure if it's complex. If not the user can
+ *    just supply kfree, which should take care of the job. free_func runs
+ *    with the hashbin unlocked. Returns 0, or -1 if an assertion fails.
+ */
+int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func)
+{
+	irda_queue_t* queue;
+	unsigned long flags = 0;
+	int i;
+
+	IRDA_ASSERT(hashbin != NULL, return -1;);
+	IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;);
+
+	/* Synchronize */
+	if ( hashbin->hb_type & HB_LOCK )
+		spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+
+	/*
+	 *  Free the entries in the hashbin, TODO: use hashbin_clear when
+	 *  it has been shown to work
+	 */
+	for (i = 0; i < HASHBIN_SIZE; i ++ ) {
+		while (1) {
+			queue = dequeue_first(
+				(irda_queue_t**) &hashbin->hb_queue[i]);
+			if (!queue)
+				break;
+			if (!free_func)
+				continue;
+
+			/* Never run the callback under our spinlock */
+			if ( hashbin->hb_type & HB_LOCK )
+				spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+			(*free_func)(queue);
+			if ( hashbin->hb_type & HB_LOCK )
+				spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+		}
+	}
+
+	/* Cleanup local data */
+	hashbin->hb_current = NULL;
+	hashbin->magic = ~HB_MAGIC;
+
+	/* Release lock */
+	if ( hashbin->hb_type & HB_LOCK )
+		spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+
+	/*
+	 *  Free the hashbin structure
+	 */
+	kfree(hashbin);
+
+	return 0;
+}
+EXPORT_SYMBOL(hashbin_delete);
+
+/********************* HASHBIN LIST OPERATIONS *********************/
+
+/*
+ * Function hashbin_insert (hashbin, entry, hashv, name)
+ *
+ *    Insert an entry at the head of its bin.  When 'name' is given the
+ *    key is hash(name) and the 'hashv' argument is ignored.
+ */
+void hashbin_insert(hashbin_t* hashbin, irda_queue_t* entry, long hashv,
+		    const char* name)
+{
+	unsigned long irq_flags = 0;
+	int use_lock;
+	int bin;
+
+	IRDA_DEBUG( 4, "%s()\n", __func__);
+
+	IRDA_ASSERT( hashbin != NULL, return;);
+	IRDA_ASSERT( hashbin->magic == HB_MAGIC, return;);
+
+	/*
+	 * Pick the key, then the bin it selects
+	 */
+	if (name != NULL)
+		hashv = hash(name);
+	bin = GET_HASHBIN( hashv );
+
+	/* Synchronize (HB_LOCK hashbins only) */
+	use_lock = hashbin->hb_type & HB_LOCK;
+	if (use_lock)
+		spin_lock_irqsave(&hashbin->hb_spinlock, irq_flags);
+
+	/*
+	 * Record the key, plus the name when there is one
+	 */
+	if (name != NULL)
+		strlcpy(entry->q_name, name, sizeof(entry->q_name));
+	entry->q_hash = hashv;
+
+	/*
+	 * Newest entry goes first in its bin
+	 */
+	enqueue_first( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+		       entry);
+	hashbin->hb_size++;
+
+	/* Release lock */
+	if (use_lock)
+		spin_unlock_irqrestore(&hashbin->hb_spinlock, irq_flags);
+}
+EXPORT_SYMBOL(hashbin_insert);
+
+/*
+ *  Function hashbin_remove_first (hashbin)
+ *
+ *    Unlink and return the entry hashbin_get_first() reports, or NULL
+ *    when the hashbin is empty.
+ *
+ * Note : this function no longer use hashbin_remove(), but does things
+ * similar to hashbin_remove_this(), so can be considered safe.
+ * Jean II
+ */
+void *hashbin_remove_first( hashbin_t *hashbin)
+{
+	unsigned long irq_flags = 0;
+	irda_queue_t *first = NULL;
+	int use_lock = hashbin->hb_type & HB_LOCK;
+	int bin;
+
+	/* Synchronize */
+	if (use_lock)
+		spin_lock_irqsave(&hashbin->hb_spinlock, irq_flags);
+
+	first = hashbin_get_first( hashbin);
+	if (first == NULL)
+		goto unlock;
+
+	/*
+	 * Unlink it from the bin its stored key selects...
+	 */
+	bin = GET_HASHBIN( first->q_hash );
+	dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+			 first );
+	hashbin->hb_size--;
+
+	/*
+	 * ...and clear its links, which hashbin_remove_this() checks
+	 */
+	first->q_prev = NULL;
+	first->q_next = NULL;
+
+	/*
+	 *  hashbin_get_first() left the enumeration cursor on this
+	 *  entry; do not let it dangle
+	 */
+	if (hashbin->hb_current == first)
+		hashbin->hb_current = NULL;
+
+unlock:
+	/* Release lock */
+	if (use_lock)
+		spin_unlock_irqrestore(&hashbin->hb_spinlock, irq_flags);
+
+	return first;
+}
+
+
+/*
+ *  Function hashbin_remove (hashbin, hashv, name)
+ *
+ *    Remove the first entry matching name (if given) or else hashv
+ *
+ *  The use of this function is highly discouraged, because the whole
+ *  concept behind hashbin_remove() is broken. In many cases, it's not
+ *  possible to guarantee the unicity of the index (either hashv or name),
+ *  leading to removing the WRONG entry.
+ *  The only simple safe use is :
+ *		hashbin_remove(hashbin, (int) self, NULL);
+ *  In other cases, you must think hard to guarantee unicity of the index.
+ *  Jean II
+ */
+void* hashbin_remove( hashbin_t* hashbin, long hashv, const char* name)
+{
+	int bin, found = FALSE;
+	unsigned long flags = 0;
+	irda_queue_t* entry;
+
+	IRDA_DEBUG( 4, "%s()\n", __func__);
+
+	IRDA_ASSERT( hashbin != NULL, return NULL;);
+	IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+	/*
+	 * Locate hashbin (a given name overrides the hashv argument)
+	 */
+	if ( name )
+		hashv = hash( name );
+	bin = GET_HASHBIN( hashv );
+
+	/* Synchronize */
+	if ( hashbin->hb_type & HB_LOCK ) {
+		spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+	} /* Default is no-lock  */
+
+	/*
+	 * Search for entry: one pass over the circular list of the bin
+	 */
+	entry = hashbin->hb_queue[ bin ];
+	if ( entry ) {
+		do {
+			/*
+			 * Check for key
+			 */
+			if ( entry->q_hash == hashv ) {
+				/*
+				 * Name compare too?
+				 */
+				if ( name ) {
+					if ( strcmp( entry->q_name, name) == 0)
+					{
+						found = TRUE;
+						break;
+					}
+				} else {
+					found = TRUE;
+					break;
+				}
+			}
+			entry = entry->q_next;
+		} while ( entry != hashbin->hb_queue[ bin ] );
+	}
+
+	/*
+	 * If entry was found, dequeue it (q_next/q_prev are not cleared)
+	 */
+	if ( found ) {
+		dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+				 (irda_queue_t*) entry );
+		hashbin->hb_size--;
+
+		/*
+		 *  Check if this item is the currently selected item, and in
+		 *  that case we must reset hb_current
+		 */
+		if ( entry == hashbin->hb_current)
+			hashbin->hb_current = NULL;
+	}
+
+	/* Release lock */
+	if ( hashbin->hb_type & HB_LOCK ) {
+		spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+	} /* Default is no-lock  */
+
+
+	/* Return the removed entry, or NULL when nothing matched */
+	if ( found )
+		return entry;
+	else
+		return NULL;
+
+}
+EXPORT_SYMBOL(hashbin_remove);
+
+/*
+ *  Function hashbin_remove_this (hashbin, entry)
+ *
+ *    Unlink exactly this entry; returns it, or NULL if it was not linked
+ *
+ * In some cases, the user of hashbin can't guarantee the unicity
+ * of either the hashv or name.
+ * In those cases, using the above function is guaranteed to cause troubles,
+ * so we use this one instead...
+ * And by the way, it's also faster, because we skip the search phase ;-)
+ */
+void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry)
+{
+	unsigned long irq_flags = 0;
+	irda_queue_t *removed = NULL;
+	int use_lock;
+
+	IRDA_DEBUG( 4, "%s()\n", __func__);
+
+	IRDA_ASSERT( hashbin != NULL, return NULL;);
+	IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+	IRDA_ASSERT( entry != NULL, return NULL;);
+
+	/* Synchronize */
+	use_lock = hashbin->hb_type & HB_LOCK;
+	if (use_lock)
+		spin_lock_irqsave(&hashbin->hb_spinlock, irq_flags);
+
+	/*
+	 * An entry with a cleared link is treated as not being in the
+	 * hashbin (e.g. already removed): touch nothing, report NULL
+	 */
+	if (entry->q_next != NULL && entry->q_prev != NULL) {
+		long hashv = entry->q_hash;
+		int bin = GET_HASHBIN( hashv );
+
+		/*
+		 * Unlink it from the bin its stored key selects and
+		 * clear its links so a second removal is detected
+		 */
+		dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ],
+				 entry );
+		hashbin->hb_size--;
+
+		entry->q_prev = NULL;
+		entry->q_next = NULL;
+
+		/*
+		 *  Do not leave the enumeration cursor on a removed
+		 *  entry
+		 */
+		if (hashbin->hb_current == entry)
+			hashbin->hb_current = NULL;
+
+		removed = entry;
+	}
+
+	/* Release lock */
+	if (use_lock)
+		spin_unlock_irqrestore(&hashbin->hb_spinlock, irq_flags);
+
+	return removed;
+}
+EXPORT_SYMBOL(hashbin_remove_this);
+
+/*********************** HASHBIN ENUMERATION ***********************/
+
+/*
+ * Function hashbin_find (hashbin, hashv, name)
+ *
+ *    Return the first entry matching hash(name) and 'name' itself or,
+ *    when 'name' is NULL, matching 'hashv'; NULL if none.  No locking.
+ */
+void* hashbin_find( hashbin_t* hashbin, long hashv, const char* name )
+{
+	irda_queue_t *head;
+	irda_queue_t *cur;
+	int bin;
+
+	IRDA_DEBUG( 4, "hashbin_find()\n");
+
+	IRDA_ASSERT( hashbin != NULL, return NULL;);
+	IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+	/*
+	 * Pick the key, then the bin it selects
+	 */
+	if (name != NULL)
+		hashv = hash(name);
+	bin = GET_HASHBIN( hashv );
+
+	/*
+	 * An empty bin cannot hold a match
+	 */
+	head = hashbin->hb_queue[ bin ];
+	if (head == NULL)
+		return NULL;
+
+	/*
+	 * Walk the circular list exactly once, starting at its head
+	 */
+	cur = head;
+	do {
+		/*
+		 * Same key?  Then the name (if any) has the last word
+		 */
+		if (cur->q_hash == hashv) {
+			if (name == NULL)
+				return cur;
+			if (strcmp(cur->q_name, name) == 0)
+				return cur;
+		}
+		cur = cur->q_next;
+	} while (cur != head);
+
+	return NULL;
+}
+EXPORT_SYMBOL(hashbin_find);
+
+/*
+ * Function hashbin_lock_find (hashbin, hashv, name)
+ *
+ *    hashbin_find() wrapped in the hashbin spinlock; returns its result
+ *
+ * Same, but with spinlock protection...
+ * I call it safe, but it's only safe with respect to the hashbin, not its
+ * content. - Jean II
+ */
+void* hashbin_lock_find( hashbin_t* hashbin, long hashv, const char* name )
+{
+	unsigned long flags = 0;
+	irda_queue_t* entry;
+
+	/* Synchronize (taken whatever hb_type says, unlike the other calls) */
+	spin_lock_irqsave(&hashbin->hb_spinlock, flags);
+
+	/*
+	 * Search for entry
+	 */
+	entry = (irda_queue_t* ) hashbin_find( hashbin, hashv, name );
+
+	/* Release lock: the returned entry is no longer protected after this */
+	spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
+
+	return entry;
+}
+EXPORT_SYMBOL(hashbin_lock_find);
+
+/*
+ * Function hashbin_find_next (hashbin, hashv, name, pnext)
+ *
+ *    Find an item with the given hashv or name, and its successor
+ *
+ * This function allow to do concurrent enumerations without the
+ * need to lock over the whole session, because the caller keep the
+ * context of the search. On the other hand, it might fail and return
+ * NULL if the entry is removed. - Jean II
+ */
+void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name,
+			 void ** pnext)
+{
+	unsigned long irq_flags = 0;
+	irda_queue_t *found;
+	irda_queue_t *successor = NULL;
+
+	/* Synchronize */
+	spin_lock_irqsave(&hashbin->hb_spinlock, irq_flags);
+
+	/*
+	 * Look the item up again: it may have been removed meanwhile
+	 */
+	found = (irda_queue_t* ) hashbin_find( hashbin, hashv, name );
+
+	/*
+	 * Point the enumeration cursor at it so that
+	 * hashbin_get_next() hands back its successor
+	 */
+	if (found != NULL) {
+		hashbin->hb_current = found;
+		successor = hashbin_get_next( hashbin );
+	}
+	*pnext = successor;
+
+	/* Release lock */
+	spin_unlock_irqrestore(&hashbin->hb_spinlock, irq_flags);
+
+	return found;
+}
+
+/*
+ * Function hashbin_get_first (hashbin)
+ *
+ *    Return the head of the first non-empty bin (NULL when there is
+ *    none) and remember it in hb_current; this must be called before
+ *    any calls to hashbin_get_next()!
+ */
+irda_queue_t *hashbin_get_first( hashbin_t* hashbin)
+{
+	irda_queue_t *first = NULL;
+	int bin;
+
+	IRDA_ASSERT( hashbin != NULL, return NULL;);
+	IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+	if (!hashbin)
+		return NULL;
+
+	/*
+	 *  Scan the bins in order until one has a head
+	 */
+	for (bin = 0; first == NULL && bin < HASHBIN_SIZE; bin++)
+		first = hashbin->hb_queue[ bin ];
+
+	/* The cursor only moves when something was found */
+	if (first != NULL)
+		hashbin->hb_current = first;
+
+	return first;
+}
+EXPORT_SYMBOL(hashbin_get_first);
+
+/*
+ * Function hashbin_get_next (hashbin)
+ *
+ *    Get next item in hashbin. A series of hashbin_get_next() calls must
+ *    be started by a call to hashbin_get_first(). The function returns
+ *    NULL when all items have been traversed
+ *
+ * The context of the search is stored within the hashbin, so you must
+ * protect yourself from concurrent enumerations. - Jean II
+ */
+irda_queue_t *hashbin_get_next( hashbin_t *hashbin)
+{
+	irda_queue_t *next;
+	int bin;
+
+	IRDA_ASSERT( hashbin != NULL, return NULL;);
+	IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;);
+
+	/*
+	 *  No enumeration in progress: flag it and stop
+	 */
+	if ( hashbin->hb_current == NULL) {
+		IRDA_ASSERT( hashbin->hb_current != NULL, return NULL;);
+		return NULL;
+	}
+
+	next = hashbin->hb_current->q_next;
+	bin = GET_HASHBIN( next->q_hash);
+
+	/*
+	 *  Still inside the same bin unless the circular list wrapped
+	 *  around to its head
+	 */
+	if (next != hashbin->hb_queue[ bin ])
+		goto found;
+
+	/*
+	 *  Presumably never true once masked by GET_HASHBIN(); kept as is
+	 */
+	if ( bin >= HASHBIN_SIZE)
+		return NULL;
+
+	/*
+	 *  Otherwise continue with the head of the next non-empty bin
+	 */
+	for (bin++; bin < HASHBIN_SIZE; bin++) {
+		next = hashbin->hb_queue[ bin ];
+		if (next != NULL)
+			goto found;
+	}
+	return NULL;
+
+found:
+	hashbin->hb_current = next;
+	return next;
+}
+EXPORT_SYMBOL(hashbin_get_next);
diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c
new file mode 100644
index 00000000..d0b70dad
--- /dev/null
+++ b/net/irda/irsysctl.c
@@ -0,0 +1,273 @@
+/*********************************************************************
+ *
+ * Filename:      irsysctl.c
+ * Version:       1.0
+ * Description:   Sysctl interface for IrDA
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun May 24 22:12:06 1998
+ * Modified at:   Fri Jun  4 02:50:15 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1997, 1999 Dag Brattli, All Rights Reserved.
+ *     Copyright (c) 2000-2001 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/mm.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+
+#include <net/irda/irda.h>		/* irda_debug */
+#include <net/irda/irlmp.h>
+#include <net/irda/timer.h>
+#include <net/irda/irias_object.h>
+
+extern int  sysctl_discovery;
+extern int  sysctl_discovery_slots;
+extern int  sysctl_discovery_timeout;
+extern int  sysctl_slot_timeout;
+extern int  sysctl_fast_poll_increase;
+extern char sysctl_devname[];
+extern int  sysctl_max_baud_rate;
+extern int  sysctl_min_tx_turn_time;
+extern int  sysctl_max_tx_data_size;
+extern int  sysctl_max_tx_window;
+extern int  sysctl_max_noreply_time;
+extern int  sysctl_warn_noreply_time;
+extern int  sysctl_lap_keepalive_time;
+
+extern struct irlmp_cb *irlmp;
+
+/* bounds handed to proc_dointvec_minmax via .extra1/.extra2 - Jean II */
+static int max_discovery_slots = 16;		/* ??? */
+static int min_discovery_slots = 1;
+/* IrLAP 6.13.2 says 25ms to 10+70ms - allow higher since some devices
+ * seem to require it. (from Dag's comment) */
+static int max_slot_timeout = 160;
+static int min_slot_timeout = 20;
+static int max_max_baud_rate = 16000000;	/* See qos.c - IrLAP spec */
+static int min_max_baud_rate = 2400;
+static int max_min_tx_turn_time = 10000;	/* See qos.c - IrLAP spec */
+static int min_min_tx_turn_time;		/* no initializer: starts at 0 */
+static int max_max_tx_data_size = 2048;		/* See qos.c - IrLAP spec */
+static int min_max_tx_data_size = 64;
+static int max_max_tx_window = 7;		/* See qos.c - IrLAP spec */
+static int min_max_tx_window = 1;
+static int max_max_noreply_time = 40;		/* See qos.c - IrLAP spec */
+static int min_max_noreply_time = 3;
+static int max_warn_noreply_time = 3;		/* 3s == standard */
+static int min_warn_noreply_time = 1;		/* 1s == min WD_TIMER */
+static int max_lap_keepalive_time = 10000;	/* 10s */
+static int min_lap_keepalive_time = 100;	/* NOTE(review): was "100us"; 100ms if in ms like the max - confirm */
+/* For other sysctl, I've no idea of the range. Maybe Dag could help
+ * us on that - Jean II */
+
+static int do_devname(ctl_table *table, int write,
+		      void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ias_value *val;
+	int ret = proc_dostring(table, write, buffer, lenp, ppos);
+
+	/* Only a successful write is mirrored into the IAS "Device" object. */
+	if (ret != 0 || !write)
+		return ret;
+
+	val = irias_new_string_value(sysctl_devname);
+	if (val != NULL)
+		irias_object_change_attribute("Device", "DeviceName", val);
+	return ret;
+}
+
+
+static int do_discovery(ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (err != 0)
+		return err;
+	if (!irlmp)
+		return -ENODEV;
+
+	/* Runs after reads as well: (re)apply the current setting. */
+	if (!sysctl_discovery) {
+		del_timer_sync(&irlmp->discovery_timer);
+		return 0;
+	}
+
+	irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout*HZ);
+	return 0;
+}
+
+/* One file per entry, registered under irda_path */
+static ctl_table irda_table[] = {
+	{
+		.procname	= "discovery",
+		.data		= &sysctl_discovery,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= do_discovery,
+	},
+	{
+		.procname	= "devname",
+		.data		= sysctl_devname,
+		.maxlen		= 65,	/* NOTE(review): presumably the size of sysctl_devname[] - confirm */
+		.mode		= 0644,
+		.proc_handler	= do_devname,
+	},
+#ifdef CONFIG_IRDA_DEBUG
+	{
+		.procname	= "debug",
+		.data		= &irda_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
+#ifdef CONFIG_IRDA_FAST_RR
+	{
+		.procname	= "fast_poll_increase",
+		.data		= &sysctl_fast_poll_increase,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
+	{
+		.procname	= "discovery_slots",
+		.data		= &sysctl_discovery_slots,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_discovery_slots,
+		.extra2		= &max_discovery_slots
+	},
+	{
+		.procname	= "discovery_timeout",
+		.data		= &sysctl_discovery_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec	/* no min/max bounds for this one */
+	},
+	{
+		.procname	= "slot_timeout",
+		.data		= &sysctl_slot_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_slot_timeout,
+		.extra2		= &max_slot_timeout
+	},
+	{
+		.procname	= "max_baud_rate",
+		.data		= &sysctl_max_baud_rate,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_baud_rate,
+		.extra2		= &max_max_baud_rate
+	},
+	{
+		.procname	= "min_tx_turn_time",
+		.data		= &sysctl_min_tx_turn_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_min_tx_turn_time,
+		.extra2		= &max_min_tx_turn_time
+	},
+	{
+		.procname	= "max_tx_data_size",
+		.data		= &sysctl_max_tx_data_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_tx_data_size,
+		.extra2		= &max_max_tx_data_size
+	},
+	{
+		.procname	= "max_tx_window",
+		.data		= &sysctl_max_tx_window,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_tx_window,
+		.extra2		= &max_max_tx_window
+	},
+	{
+		.procname	= "max_noreply_time",
+		.data		= &sysctl_max_noreply_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_noreply_time,
+		.extra2		= &max_max_noreply_time
+	},
+	{
+		.procname	= "warn_noreply_time",
+		.data		= &sysctl_warn_noreply_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_warn_noreply_time,
+		.extra2		= &max_warn_noreply_time
+	},
+	{
+		.procname	= "lap_keepalive_time",
+		.data		= &sysctl_lap_keepalive_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_lap_keepalive_time,
+		.extra2		= &max_lap_keepalive_time
+	},
+	{ }	/* empty entry ends the table */
+};
+
+static struct ctl_path irda_path[] = {	/* path components: "net", then "irda" */
+	{ .procname = "net", },
+	{ .procname = "irda", },
+	{ }	/* empty entry ends the path */
+};
+
+static struct ctl_table_header *irda_table_header;	/* set by irda_sysctl_register() */
+
+/*
+ * Function irda_sysctl_register (void)
+ *
+ *    Register irda_table under irda_path; returns 0 or -ENOMEM
+ *
+ */
+int __init irda_sysctl_register(void)
+{
+	struct ctl_table_header *hdr;
+
+	hdr = register_sysctl_paths(irda_path, irda_table);
+	irda_table_header = hdr;
+	return hdr ? 0 : -ENOMEM;
+}
+
+/*
+ * Function irda_sysctl_unregister (void)
+ *
+ *    Unregister the sysctl table recorded in irda_table_header
+ *
+ */
+void irda_sysctl_unregister(void)
+{
+	unregister_sysctl_table(irda_table_header);
+}
+
+
+
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
new file mode 100644
index 00000000..9d9af460
--- /dev/null
+++ b/net/irda/irttp.c
@@ -0,0 +1,1915 @@
+/*********************************************************************
+ *
+ * Filename:      irttp.c
+ * Version:       1.2
+ * Description:   Tiny Transport Protocol (TTP) implementation
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sun Aug 31 20:14:31 1997
+ * Modified at:   Wed Jan  5 11:31:27 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2003 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+#include <net/irda/parameters.h>
+#include <net/irda/irttp.h>
+
+static struct irttp_cb *irttp;	/* IrTTP layer state, allocated in irttp_init() */
+
+static void __irttp_close_tsap(struct tsap_cb *self);
+
+static int irttp_data_indication(void *instance, void *sap,
+				 struct sk_buff *skb);
+static int irttp_udata_indication(void *instance, void *sap,
+				  struct sk_buff *skb);
+static void irttp_disconnect_indication(void *instance, void *sap,
+					LM_REASON reason, struct sk_buff *);
+static void irttp_connect_indication(void *instance, void *sap,
+				     struct qos_info *qos, __u32 max_sdu_size,
+				     __u8 header_size, struct sk_buff *skb);
+static void irttp_connect_confirm(void *instance, void *sap,
+				  struct qos_info *qos, __u32 max_sdu_size,
+				  __u8 header_size, struct sk_buff *skb);
+static void irttp_run_tx_queue(struct tsap_cb *self);
+static void irttp_run_rx_queue(struct tsap_cb *self);
+
+static void irttp_flush_queues(struct tsap_cb *self);
+static void irttp_fragment_skb(struct tsap_cb *self, struct sk_buff *skb);
+static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self);
+static void irttp_todo_expired(unsigned long data);
+static int irttp_param_max_sdu_size(void *instance, irda_param_t *param,
+				    int get);
+
+static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow);
+static void irttp_status_indication(void *instance,
+				    LINK_STATUS link, LOCK_STATUS lock);
+
+/* IrTTP parameter tables: PI 0x00 has no handler, PI 0x01 is MaxSduSize */
+static pi_minor_info_t pi_minor_call_table[] = {
+	{ NULL, 0 },                                             /* 0x00 */
+	{ irttp_param_max_sdu_size, PV_INTEGER | PV_BIG_ENDIAN } /* 0x01 */
+};
+static pi_major_info_t pi_major_call_table[] = {{ pi_minor_call_table, 2 }};
+static pi_param_info_t param_info = { pi_major_call_table, 1, 0x0f, 4 };
+
+/************************ GLOBAL PROCEDURES ************************/
+
+/*
+ * Function irttp_init (void)
+ *
+ *    Initialize the IrTTP layer. Called by module initialization code.
+ *    Returns 0 on success, or -ENOMEM (leaving irttp NULL) on failure.
+ */
+int __init irttp_init(void)
+{
+	irttp = kzalloc(sizeof(struct irttp_cb), GFP_KERNEL);
+	if (irttp == NULL)
+		return -ENOMEM;
+
+	irttp->magic = TTP_MAGIC;
+
+	irttp->tsaps = hashbin_new(HB_LOCK);
+	if (!irttp->tsaps) {
+		IRDA_ERROR("%s: can't allocate IrTTP hashbin!\n", __func__);
+		kfree(irttp);
+		irttp = NULL;	/* as irttp_cleanup() does: no dangling global */
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Function irttp_cleanup (void)
+ *
+ *    Tear down the IrTTP layer: release every TSAP still in the hashbin,
+ *    then free the main structure. Called by module cleanup code.
+ */
+void irttp_cleanup(void)
+{
+	/* Sanity-check the main structure (irttp itself is not NULL-checked) */
+	IRDA_ASSERT(irttp->magic == TTP_MAGIC, return;);
+
+	/*
+	 *  Delete hashbin; __irttp_close_tsap() is passed as the free callback
+	 */
+	hashbin_delete(irttp->tsaps, (FREE_FUNC) __irttp_close_tsap);
+
+	irttp->magic = 0;
+
+	/* De-allocate main structure */
+	kfree(irttp);
+
+	irttp = NULL;
+}
+
+/*************************** SUBROUTINES ***************************/
+
+/*
+ * Function irttp_start_todo_timer (self, timeout)
+ *
+ *    (Re)arm the todo timer to expire 'timeout' jiffies from now.
+ *
+ * Made it more efficient and insensitive to race conditions - Jean II
+ */
+static inline void irttp_start_todo_timer(struct tsap_cb *self, int timeout)
+{
+	/* Set new expiry value for the timer */
+	mod_timer(&self->todo_timer, jiffies + timeout);
+}
+
+/*
+ * Function irttp_todo_expired (data)
+ *
+ *    Todo timer has expired! 'data' carries the tsap_cb pointer.
+ *
+ * One of the restrictions of the timer is that it is run only on the timer
+ * interrupt, which runs every 10ms. This means that even if you set the
+ * timer with a delay of 0, it may take up to 10ms before it's run.
+ * So, to minimise latency and keep cache fresh, we try to avoid using
+ * it as much as possible.
+ * Note : we can't use tasklets, because they can't be asynchronously
+ * killed (need user context), and we can't guarantee that here...
+ * Jean II
+ */
+static void irttp_todo_expired(unsigned long data)
+{
+	struct tsap_cb *self = (struct tsap_cb *) data;
+
+	/* Check that we still exist */
+	if (!self || self->magic != TTP_TSAP_MAGIC)
+		return;
+
+	IRDA_DEBUG(4, "%s(instance=%p)\n", __func__, self);
+
+	/* Try to make some progress, especially on Tx side - Jean II */
+	irttp_run_rx_queue(self);
+	irttp_run_tx_queue(self);
+
+	/* Check if time for disconnect */
+	if (test_bit(0, &self->disconnect_pend)) {
+		/* Only disconnect once the Tx queue has drained */
+		if (skb_queue_empty(&self->tx_queue)) {
+			/* Make sure disconnect is not pending anymore */
+			clear_bit(0, &self->disconnect_pend);	/* FALSE */
+
+			/* Note : self->disconnect_skb may be NULL */
+			irttp_disconnect_request(self, self->disconnect_skb,
+						 P_NORMAL);
+			self->disconnect_skb = NULL;
+		} else {
+			/* Tx queue not empty yet: try again in HZ/10 jiffies */
+			irttp_start_todo_timer(self, HZ/10);
+
+			/* No reason to try and close now */
+			return;
+		}
+	}
+
+	/* Check if it's closing time */
+	if (self->close_pend)
+		/* Finish cleanup */
+		irttp_close_tsap(self);
+}
+
+/*
+ * Function irttp_flush_queues (self)
+ *
+ *     Drops and frees every skb held in tx_queue, rx_queue and rx_fragments
+ */
+static void irttp_flush_queues(struct tsap_cb *self)
+{
+	struct sk_buff *pkt;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	/* Frames that were still waiting to be sent */
+	while ((pkt = skb_dequeue(&self->tx_queue)))
+		dev_kfree_skb(pkt);
+
+	/* Received frames still sitting in the rx queue */
+	while ((pkt = skb_dequeue(&self->rx_queue)))
+		dev_kfree_skb(pkt);
+
+	/* Received fragments that were never reassembled */
+	while ((pkt = skb_dequeue(&self->rx_fragments)))
+		dev_kfree_skb(pkt);
+}
+
+/*
+ * Function irttp_reassemble_skb (self)
+ *
+ *    Makes a new (contiguous) skb out of all the fragments in the fragment
+ *    queue; the fragments are freed and self->rx_sdu_size is reset to 0.
+ *    Returns NULL if the new skb cannot be allocated.
+ */
+static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self)
+{
+	struct sk_buff *skb, *frag;
+	int n = 0;  /* Running byte offset into the new skb */
+
+	IRDA_ASSERT(self != NULL, return NULL;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return NULL;);
+
+	IRDA_DEBUG(2, "%s(), self->rx_sdu_size=%d\n", __func__,
+		   self->rx_sdu_size);
+
+	skb = dev_alloc_skb(TTP_HEADER + self->rx_sdu_size);
+	if (!skb)
+		return NULL;
+
+	/*
+	 * Need to reserve space for TTP header in case this skb needs to
+	 * be requeued in case delivery fails
+	 */
+	skb_reserve(skb, TTP_HEADER);
+	skb_put(skb, self->rx_sdu_size);
+
+	/*
+	 *  Copy all fragments to the new buffer, back to back
+	 */
+	while ((frag = skb_dequeue(&self->rx_fragments)) != NULL) {
+		skb_copy_to_linear_data_offset(skb, n, frag->data, frag->len);
+		n += frag->len;
+
+		dev_kfree_skb(frag);
+	}
+
+	IRDA_DEBUG(2,
+		   "%s(), frame len=%d, rx_sdu_size=%d, rx_max_sdu_size=%d\n",
+		   __func__, n, self->rx_sdu_size, self->rx_max_sdu_size);
+	/* Note : irttp_run_rx_queue() calculates self->rx_sdu_size
+	 * by summing the size of all fragments, so we should always
+	 * have n == self->rx_sdu_size, except in cases where we
+	 * dropped the last fragment (when self->rx_sdu_size exceeds
+	 * self->rx_max_sdu_size), where n < self->rx_sdu_size.
+	 * Jean II */
+	IRDA_ASSERT(n <= self->rx_sdu_size, n = self->rx_sdu_size;);
+
+	/* Set the new length */
+	skb_trim(skb, n);
+
+	self->rx_sdu_size = 0;
+
+	return skb;
+}
+
+/*
+ * Function irttp_fragment_skb (self, skb)
+ *
+ *    Fragments a frame and queues all the fragments for transmission.
+ *    Consumes skb: queued as last fragment, or freed if allocation fails.
+ */
+static inline void irttp_fragment_skb(struct tsap_cb *self,
+				      struct sk_buff *skb)
+{
+	struct sk_buff *frag;
+	__u8 *frame;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* Split frame into a number of segments */
+	while (skb->len > self->max_seg_size) {
+		IRDA_DEBUG(2, "%s(), fragmenting ...\n", __func__);
+
+		/* Make new segment */
+		frag = alloc_skb(self->max_seg_size+self->max_header_size,
+				 GFP_ATOMIC);
+		if (!frag) {
+			dev_kfree_skb(skb);	/* don't leak the remainder */
+			return;
+		}
+
+		skb_reserve(frag, self->max_header_size);
+
+		/* Copy data from the original skb into this fragment. */
+		skb_copy_from_linear_data(skb, skb_put(frag, self->max_seg_size),
+			      self->max_seg_size);
+
+		/* Insert TTP header, with the more bit set */
+		frame = skb_push(frag, TTP_HEADER);
+		frame[0] = TTP_MORE;
+
+		/* Hide the copied data from the original skb */
+		skb_pull(skb, self->max_seg_size);
+
+		/* Queue fragment */
+		skb_queue_tail(&self->tx_queue, frag);
+	}
+	/* Queue what is left of the original skb */
+	IRDA_DEBUG(2, "%s(), queuing last segment\n", __func__);
+
+	frame = skb_push(skb, TTP_HEADER);
+	frame[0] = 0x00; /* Clear more bit */
+
+	/* Queue fragment */
+	skb_queue_tail(&self->tx_queue, skb);
+}
+
+/*
+ * Function irttp_param_max_sdu_size (instance, param, get)
+ *
+ *    MaxSduSize handler for the connect frames. With 'get' set, the TSAP's
+ *    tx_max_sdu_size is copied into param (insertion); otherwise the value
+ *    in param is stored into the TSAP (extraction).
+ */
+static int irttp_param_max_sdu_size(void *instance, irda_param_t *param,
+				    int get)
+{
+	struct tsap_cb *self = instance;
+
+	/* Each check below is handed 'return -1;' as its failure action,
+	 * so nothing is touched unless instance looks like a live TSAP */
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+	if (!get)
+		self->tx_max_sdu_size = param->pv.i;	/* extract */
+	else
+		param->pv.i = self->tx_max_sdu_size;	/* insert */
+
+	IRDA_DEBUG(1, "%s(), MaxSduSize=%d\n", __func__, param->pv.i);
+
+	return 0;
+}
+
+/*************************** CLIENT CALLS ***************************/
+/************************** LMP CALLBACKS **************************/
+/* Everything is happily mixed up. Waiting for next clean up - Jean II */
+
+/*
+ * Initialization that has to be done on new tsap instance allocation
+ * and on duplication: the lock, the todo timer and the three skb queues
+ */
+static void irttp_init_tsap(struct tsap_cb *tsap)
+{
+	spin_lock_init(&tsap->lock);
+	init_timer(&tsap->todo_timer);
+
+	skb_queue_head_init(&tsap->rx_queue);
+	skb_queue_head_init(&tsap->tx_queue);
+	skb_queue_head_init(&tsap->rx_fragments);
+}
+
+/*
+ * Function irttp_open_tsap (stsap_sel, credit, notify)
+ *
+ *    Create TSAP connection endpoint. Returns the new TSAP, or NULL on error.
+ */
+struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify)
+{
+	struct tsap_cb *self;
+	struct lsap_cb *lsap;
+	notify_t ttp_notify;
+
+	IRDA_ASSERT(irttp->magic == TTP_MAGIC, return NULL;);
+
+	/* The IrLMP spec (IrLMP 1.1 p10) says that we have the right to
+	 * use only 0x01-0x6F. Of course, we can use LSAP_ANY as well.
+	 * JeanII */
+	if((stsap_sel != LSAP_ANY) &&
+	   ((stsap_sel < 0x01) || (stsap_sel >= 0x70))) {
+		IRDA_DEBUG(0, "%s(), invalid tsap!\n", __func__);
+		return NULL;
+	}
+
+	self = kzalloc(sizeof(struct tsap_cb), GFP_ATOMIC);
+	if (self == NULL) {
+		IRDA_DEBUG(0, "%s(), unable to kmalloc!\n", __func__);
+		return NULL;
+	}
+
+	/* Initialize internal objects */
+	irttp_init_tsap(self);
+
+	/* Initialise todo timer */
+	self->todo_timer.data     = (unsigned long) self;
+	self->todo_timer.function = &irttp_todo_expired;
+
+	/* Initialize callbacks for IrLMP to use */
+	irda_notify_init(&ttp_notify);
+	ttp_notify.connect_confirm = irttp_connect_confirm;
+	ttp_notify.connect_indication = irttp_connect_indication;
+	ttp_notify.disconnect_indication = irttp_disconnect_indication;
+	ttp_notify.data_indication = irttp_data_indication;
+	ttp_notify.udata_indication = irttp_udata_indication;
+	ttp_notify.flow_indication = irttp_flow_indication;
+	if(notify->status_indication != NULL)
+		ttp_notify.status_indication = irttp_status_indication;
+	ttp_notify.instance = self;
+	strncpy(ttp_notify.name, notify->name, NOTIFY_MAX_NAME);
+
+	self->magic = TTP_TSAP_MAGIC;
+	self->connected = FALSE;
+
+	/* Create LSAP at IrLMP layer */
+	lsap = irlmp_open_lsap(stsap_sel, &ttp_notify, 0);
+	if (lsap == NULL) {
+		IRDA_WARNING("%s: unable to allocate LSAP!!\n", __func__);
+		/* Free the half-built TSAP (it is not in the hashbin yet) */
+		__irttp_close_tsap(self);
+		return NULL;
+	}
+
+	/*
+	 *  If user specified LSAP_ANY as source TSAP selector, then IrLMP
+	 *  will replace it with whatever source selector which is free, so
+	 *  the stsap_sel we have might not be valid anymore
+	 */
+	self->stsap_sel = lsap->slsap_sel;
+	IRDA_DEBUG(4, "%s(), stsap_sel=%02x\n", __func__, self->stsap_sel);
+
+	self->notify = *notify;
+	self->lsap = lsap;
+
+	hashbin_insert(irttp->tsaps, (irda_queue_t *) self, (long) self, NULL);
+
+	/* Clamp the initial credit to TTP_RX_MAX_CREDIT */
+	if (credit > TTP_RX_MAX_CREDIT)
+		self->initial_credit = TTP_RX_MAX_CREDIT;
+	else
+		self->initial_credit = credit;
+
+	return self;
+}
+EXPORT_SYMBOL(irttp_open_tsap);
+
+/*
+ * Function __irttp_close_tsap (self)
+ *
+ *    Remove an instance of a TSAP. This function only deals with the
+ *    deallocation of the TSAP (queues, timer, pending disconnect skb);
+ *    it touches neither the hashbin nor the LSAP.
+ */
+static void __irttp_close_tsap(struct tsap_cb *self)
+{
+	/* Sanity-check the instance first. */
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	irttp_flush_queues(self);
+
+	del_timer(&self->todo_timer);
+
+	/* This one won't be cleaned up if we are disconnect_pend + close_pend
+	 * and we receive a disconnect_indication */
+	if (self->disconnect_skb)
+		dev_kfree_skb(self->disconnect_skb);
+
+	self->connected = FALSE;
+	self->magic = ~TTP_TSAP_MAGIC;
+
+	kfree(self);
+}
+
+/*
+ * Function irttp_close_tsap (self)
+ *
+ *    Remove TSAP from list of all TSAPs and then deallocate all resources
+ *    associated with this TSAP
+ *
+ * Note : because we *free* the tsap structure, it is the responsibility
+ * of the caller to make sure we are called only once and to deal with
+ * possible race conditions. - Jean II
+ */
+int irttp_close_tsap(struct tsap_cb *self)
+{
+	struct tsap_cb *tsap;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+	/* Still connected: request a disconnect and defer to the todo timer */
+	if (self->connected) {
+		/* Check if disconnect is not pending */
+		if (!test_bit(0, &self->disconnect_pend)) {
+			IRDA_WARNING("%s: TSAP still connected!\n",
+				     __func__);
+			irttp_disconnect_request(self, NULL, P_NORMAL);
+		}
+		self->close_pend = TRUE;
+		irttp_start_todo_timer(self, HZ/10);
+
+		return 0; /* Will be back! */
+	}
+
+	tsap = hashbin_remove(irttp->tsaps, (long) self, NULL);
+
+	IRDA_ASSERT(tsap == self, return -1;);
+
+	/* Close corresponding LSAP */
+	if (self->lsap) {
+		irlmp_close_lsap(self->lsap);
+		self->lsap = NULL;
+	}
+
+	__irttp_close_tsap(self);
+
+	return 0;
+}
+EXPORT_SYMBOL(irttp_close_tsap);
+
+/*
+ * Function irttp_udata_request (self, skb)
+ *
+ *    Send unreliable data on this TSAP. After the entry checks, skb is
+ *    either passed to IrLMP (return 0) or freed here (see 'err' label).
+ */
+int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb)
+{
+	int ret;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* Take shortcut on zero byte packets: free it and report success */
+	if (skb->len == 0) {
+		ret = 0;
+		goto err;
+	}
+
+	/* Check that nothing bad happens */
+	if (!self->connected) {
+		IRDA_WARNING("%s(), Not connected\n", __func__);
+		ret = -ENOTCONN;
+		goto err;
+	}
+
+	if (skb->len > self->max_seg_size) {
+		IRDA_ERROR("%s(), UData is too large for IrLAP!\n", __func__);
+		ret = -EMSGSIZE;
+		goto err;
+	}
+
+	irlmp_udata_request(self->lsap, skb);
+	self->stats.tx_packets++;
+
+	return 0;
+
+err:
+	dev_kfree_skb(skb);
+	return ret;
+}
+EXPORT_SYMBOL(irttp_udata_request);
+
+
+/*
+ * Function irttp_data_request (self, skb)
+ *    Queue frame for transmission. If SAR is enabled, fragment the frame
+ *    and queue the fragments for transmission. Returns 0 or a negative
+ *    error; on every 'err' path below the skb is freed here.
+ */
+int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb)
+{
+	__u8 *frame;
+	int ret;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	IRDA_DEBUG(2, "%s() : queue len = %d\n", __func__,
+		   skb_queue_len(&self->tx_queue));
+
+	/* Take shortcut on zero byte packets */
+	if (skb->len == 0) {
+		ret = 0;
+		goto err;
+	}
+
+	/* Check that nothing bad happens */
+	if (!self->connected) {
+		IRDA_WARNING("%s: Not connected\n", __func__);
+		ret = -ENOTCONN;
+		goto err;
+	}
+
+	/*
+	 *  Check if SAR is disabled, and the frame is larger than what fits
+	 *  inside an IrLAP frame
+	 */
+	if ((self->tx_max_sdu_size == 0) && (skb->len > self->max_seg_size)) {
+		IRDA_ERROR("%s: SAR disabled, and data is too large for IrLAP!\n",
+			   __func__);
+		ret = -EMSGSIZE;
+		goto err;
+	}
+
+	/*
+	 *  Check if SAR is enabled, and the frame exceeds TxMaxSduSize
+	 */
+	if ((self->tx_max_sdu_size != 0) &&
+	    (self->tx_max_sdu_size != TTP_SAR_UNBOUND) &&
+	    (skb->len > self->tx_max_sdu_size))
+	{
+		IRDA_ERROR("%s: SAR enabled, but data is larger than TxMaxSduSize!\n",
+			   __func__);
+		ret = -EMSGSIZE;
+		goto err;
+	}
+	/*
+	 *  Check if transmit queue is full
+	 */
+	if (skb_queue_len(&self->tx_queue) >= TTP_TX_MAX_QUEUE) {
+		/*
+		 *  Give it a chance to empty itself
+		 */
+		irttp_run_tx_queue(self);
+
+		/* Drop packet. This error code should trigger the caller
+		 * to resend the data in the client code - Jean II */
+		ret = -ENOBUFS;
+		goto err;
+	}
+
+	/* Queue frame, or queue frame segments */
+	if ((self->tx_max_sdu_size == 0) || (skb->len < self->max_seg_size)) {
+		/* Queue frame; lacking headroom, free skb like other errors */
+		IRDA_ASSERT(skb_headroom(skb) >= TTP_HEADER,
+			    { ret = -1; goto err; });
+		frame = skb_push(skb, TTP_HEADER);
+		frame[0] = 0x00; /* Clear more bit */
+
+		skb_queue_tail(&self->tx_queue, skb);
+	} else {
+		/*
+		 *  Fragment the frame, this function will also queue the
+		 *  fragments, we don't care about the fact the transmit
+		 *  queue may be overfilled by all the segments for a little
+		 *  while
+		 */
+		irttp_fragment_skb(self, skb);
+	}
+
+	/* Check if we can accept more data from client */
+	if ((!self->tx_sdu_busy) &&
+	    (skb_queue_len(&self->tx_queue) > TTP_TX_HIGH_THRESHOLD)) {
+		/* Tx queue filling up, so stop client. */
+		if (self->notify.flow_indication) {
+			self->notify.flow_indication(self->notify.instance,
+						     self, FLOW_STOP);
+		}
+		/* self->tx_sdu_busy is the state of the client.
+		 * Update state after notifying client to avoid
+		 * race condition with irttp_flow_indication().
+		 * If the queue empty itself after our test but before
+		 * we set the flag, we will fix ourselves below in
+		 * irttp_run_tx_queue().
+		 * Jean II */
+		self->tx_sdu_busy = TRUE;
+	}
+
+	/* Try to make some progress */
+	irttp_run_tx_queue(self);
+
+	return 0;
+
+err:
+	dev_kfree_skb(skb);
+	return ret;
+}
+EXPORT_SYMBOL(irttp_data_request);
+
+/*
+ * Function irttp_run_tx_queue (self)
+ *
+ *    Transmit packets queued for transmission (if possible), then restart
+ *    the client (FLOW_START) if it was stopped and the queue has drained
+ */
+static void irttp_run_tx_queue(struct tsap_cb *self)
+{
+	struct sk_buff *skb;
+	unsigned long flags;
+	int n;
+
+	IRDA_DEBUG(2, "%s() : send_credit = %d, queue_len = %d\n",
+		   __func__,
+		   self->send_credit, skb_queue_len(&self->tx_queue));
+
+	/* Get exclusive access to the tx queue, otherwise don't touch it */
+	if (irda_lock(&self->tx_queue_lock) == FALSE)
+		return;
+
+	/* Try to send out frames as long as we have credits
+	 * and as long as LAP is not full. If LAP is full, it will
+	 * poll us through irttp_flow_indication() - Jean II */
+	while ((self->send_credit > 0) &&
+	       (!irlmp_lap_tx_queue_full(self->lsap)) &&
+	       (skb = skb_dequeue(&self->tx_queue)))
+	{
+		/*
+		 *  Since we can transmit and receive frames concurrently,
+		 *  the code below is a critical region and we must assure that
+		 *  nobody messes with the credits while we update them.
+		 */
+		spin_lock_irqsave(&self->lock, flags);
+
+		n = self->avail_credit;
+		self->avail_credit = 0;
+
+		/* Only room for 127 credits in frame */
+		if (n > 127) {
+			self->avail_credit = n-127;
+			n = 127;
+		}
+		self->remote_credit += n;
+		self->send_credit--;
+
+		spin_unlock_irqrestore(&self->lock, flags);
+
+		/*
+		 *  More bit must be set by the data_request() or fragment()
+		 *  functions
+		 */
+		skb->data[0] |= (n & 0x7f);
+
+		/* Detach from socket.
+		 * The current skb has a reference to the socket that sent
+		 * it (skb->sk). When we pass it to IrLMP, the skb will be
+		 * stored in IrLAP (self->wx_list). When we are within
+		 * IrLAP, we lose the notion of socket, so we should not
+		 * have a reference to a socket. So, we drop it here.
+		 *
+		 * Why does it matter ?
+		 * When the skb is freed (kfree_skb), if it is associated
+		 * with a socket, it releases buffer space on the socket
+		 * (through sock_wfree() and sock_def_write_space()).
+		 * If the socket no longer exists, we may crash. Hard.
+		 * When we close a socket, we make sure that associated packets
+		 * in IrTTP are freed. However, we have no way to cancel
+		 * the packet that we have passed to IrLAP. So, if a packet
+		 * remains in IrLAP (retry on the link or else) after we
+		 * close the socket, we are dead !
+		 * Jean II */
+		if (skb->sk != NULL) {
+			/* IrSOCK application, IrOBEX, ... */
+			skb_orphan(skb);
+		}
+			/* IrCOMM over IrTTP, IrLAN, ... */
+
+		/* Pass the skb to IrLMP - done */
+		irlmp_data_request(self->lsap, skb);
+		self->stats.tx_packets++;
+	}
+
+	/* Check if we can accept more frames from client.
+	 * We don't want to wait until the todo timer to do that, and we
+	 * can't use tasklets (grr...), so we are obliged to give control
+	 * to client. That's ok, this test will be true not too often
+	 * (max once per LAP window) and we are called from places
+	 * where we can spend a bit of time doing stuff. - Jean II */
+	if ((self->tx_sdu_busy) &&
+	    (skb_queue_len(&self->tx_queue) < TTP_TX_LOW_THRESHOLD) &&
+	    (!self->close_pend))
+	{
+		if (self->notify.flow_indication)
+			self->notify.flow_indication(self->notify.instance,
+						     self, FLOW_START);
+
+		/* self->tx_sdu_busy is the state of the client.
+		 * We don't really have a race here, but it's always safer
+		 * to update our state after the client - Jean II */
+		self->tx_sdu_busy = FALSE;
+	}
+
+	/* Reset lock */
+	self->tx_queue_lock = 0;
+}
+
+/*
+ * Function irttp_give_credit (self)
+ *
+ *    Send a dataless flowdata TTP-PDU and give available credit (at most
+ *    127 per frame) to peer TSAP. Does nothing if the skb can't be allocated.
+ */
+static inline void irttp_give_credit(struct tsap_cb *self)
+{
+	struct sk_buff *tx_skb = NULL;
+	unsigned long flags;
+	int n;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	IRDA_DEBUG(4, "%s() send=%d,avail=%d,remote=%d\n",
+		   __func__,
+		   self->send_credit, self->avail_credit, self->remote_credit);
+
+	/* Give credit to peer */
+	tx_skb = alloc_skb(TTP_MAX_HEADER, GFP_ATOMIC);
+	if (!tx_skb)
+		return;
+
+	/* Reserve space for LMP, and LAP header */
+	skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+	/*
+	 *  Since we can transmit and receive frames concurrently,
+	 *  the code below is a critical region and we must assure that
+	 *  nobody messes with the credits while we update them.
+	 */
+	spin_lock_irqsave(&self->lock, flags);
+
+	n = self->avail_credit;
+	self->avail_credit = 0;
+
+	/* Only space for 127 credits in frame; keep the rest for later */
+	if (n > 127) {
+		self->avail_credit = n - 127;
+		n = 127;
+	}
+	self->remote_credit += n;
+
+	spin_unlock_irqrestore(&self->lock, flags);
+
+	skb_put(tx_skb, 1);
+	tx_skb->data[0] = (__u8) (n & 0x7f);
+
+	irlmp_data_request(self->lsap, tx_skb);
+	self->stats.tx_packets++;
+}
+
+/*
+ * Function irttp_udata_indication (instance, sap, skb)
+ *
+ *    Received some unit-data (unreliable): count it, offer it to the
+ *    client's udata handler, and free it if nobody accepted it.
+ */
+static int irttp_udata_indication(void *instance, void *sap,
+				  struct sk_buff *skb)
+{
+	struct tsap_cb *self = instance;
+	int handled = 0;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* instance is what irttp_open_tsap() stored in ttp_notify.instance */
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+	IRDA_ASSERT(skb != NULL, return -1;);
+
+	self->stats.rx_packets++;
+
+	/* Hand the frame to the layer above when it registered a handler;
+	 * a zero return from the handler means we must not free the skb.
+	 * (Same comment as in irttp_do_data_indication()) */
+	if (self->notify.udata_indication != NULL)
+		handled = !self->notify.udata_indication(self->notify.instance,
+							 self, skb);
+
+	/* Either no handler, or handler returns an error */
+	if (!handled)
+		dev_kfree_skb(skb);
+
+	return 0;
+}
+
+/*
+ * Function irttp_data_indication (instance, sap, skb)
+ *
+ *    Receive segment from IrLMP.
+ *    NOTE(review): reads skb->data[0] without first checking skb->len.
+ */
+static int irttp_data_indication(void *instance, void *sap,
+				 struct sk_buff *skb)
+{
+	struct tsap_cb *self;
+	unsigned long flags;
+	int n;
+
+	self = (struct tsap_cb *) instance;
+
+	n = skb->data[0] & 0x7f;     /* Extract the credits */
+
+	self->stats.rx_packets++;
+
+	/*  Deal with inbound credit
+	 *  Since we can transmit and receive frames concurrently,
+	 *  the code below is a critical region and we must assure that
+	 *  nobody messes with the credits while we update them.
+	 */
+	spin_lock_irqsave(&self->lock, flags);
+	self->send_credit += n;
+	if (skb->len > 1)
+		self->remote_credit--;
+	spin_unlock_irqrestore(&self->lock, flags);
+
+	/*
+	 *  Data or dataless packet? Dataless frames contain only the
+	 *  TTP_HEADER.
+	 */
+	if (skb->len > 1) {
+		/*
+		 *  We don't remove the TTP header, since we must preserve the
+		 *  more bit, so the defragment routine knows what to do
+		 */
+		skb_queue_tail(&self->rx_queue, skb);
+	} else {
+		/* Dataless flowdata TTP-PDU */
+		dev_kfree_skb(skb);
+	}
+
+
+	/* Push data to the higher layer.
+	 * We do it synchronously because running the todo timer for each
+	 * receive packet would be too much overhead and latency.
+	 * By passing control to the higher layer, we run the risk that
+	 * it may take time or grab a lock. Most often, the higher layer
+	 * will only put packet in a queue.
+	 * Anyway, packets are only dripping through the IrDA, so we can
+	 * have time before the next packet.
+	 * Further, we are run from NET_BH, so the worst that can happen is
+	 * us missing the optimal time to send back the PF bit in LAP.
+	 * Jean II */
+	irttp_run_rx_queue(self);
+
+	/* We now give credits to peer in irttp_run_rx_queue().
+	 * We need to send credit *NOW*, otherwise we are going
+	 * to miss the next Tx window. The todo timer may take
+	 * a while before it's run... - Jean II */
+
+	/*
+	 * If the peer device has given us some credits and we didn't have
+	 * any from before, then we need to schedule the tx queue.
+	 * We need to do that because our Tx have stopped (so we may not
+	 * get any LAP flow indication) and the user may be stopped as
+	 * well. - Jean II
+	 */
+	if (self->send_credit == n) {
+		/* Restart pushing stuff to LAP */
+		irttp_run_tx_queue(self);
+		/* Note : we don't want to schedule the todo timer
+		 * because it has horrible latency. No tasklets
+		 * because the tasklet API is broken. - Jean II */
+	}
+
+	return 0;
+}
+
+/*
+ * Function irttp_status_indication (instance, link, lock)
+ *
+ *    Relay a status indication to the client's own handler, unless the
+ *    TSAP is pending close or the client registered no handler.
+ */
+static void irttp_status_indication(void *instance,
+				    LINK_STATUS link, LOCK_STATUS lock)
+{
+	struct tsap_cb *self = instance;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	/* instance is what irttp_open_tsap() stored in ttp_notify.instance */
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	/* Nothing to report once the client has closed the TSAP and gone */
+	if (self->close_pend)
+		return;
+
+	/* Inform the service user, but only if he has requested it;
+	 * otherwise just leave a debug trace */
+	if (self->notify.status_indication == NULL) {
+		IRDA_DEBUG(2, "%s(), no handler\n", __func__);
+		return;
+	}
+
+	self->notify.status_indication(self->notify.instance, link, lock);
+}
+
+/*
+ * Function irttp_flow_indication (instance, sap, flow)
+ *
+ *    Flow_indication : IrLAP tells us to send more data.
+ *
+ */
+static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow)
+{
+	struct tsap_cb *self;
+
+	self = (struct tsap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	IRDA_DEBUG(4, "%s(instance=%p)\n", __func__, self);
+
+	/* We are "polled" directly from LAP, and the LAP wants to fill
+	 * its Tx window. We want to do our best to send it data, so that
+	 * we maximise the window. On the other hand, we want to limit the
+	 * amount of work here so that LAP doesn't hang forever waiting
+	 * for packets. - Jean II */
+
+	/* Try to send some packets. Currently, LAP calls us every time
+	 * there is one free slot, so we will send only one packet.
+	 * This allows the scheduler to do its round robin - Jean II */
+	irttp_run_tx_queue(self);
+
+	/* Note regarding the interaction with higher layer.
+	 * irttp_run_tx_queue() may call the client when its queue
+	 * starts to empty, via notify.flow_indication(). Initially,
+	 * I wanted this to happen in a tasklet, to avoid client
+	 * grabbing the CPU, but we can't use tasklets safely. And timer
+	 * is definitely too slow.
+	 * This will happen only once per LAP window, and usually at
+	 * the third packet (unless window is smaller). LAP is still
+	 * doing mtt and sending first packet so it's sort of OK
+	 * to do that. Jean II */
+
+	/* If we need to send disconnect, try to do it now */
+	if(self->disconnect_pend)
+		irttp_start_todo_timer(self, 0);
+}
+
+/*
+ * Function irttp_flow_request (self, flow)
+ *
+ *    Lets the upper layers tell IrTTP to stop delivering frames (FLOW_STOP,
+ *    e.g. when their receive queues are getting full) or to start
+ *    delivering again (FLOW_START). Any other value is only logged.
+ */
+void irttp_flow_request(struct tsap_cb *self, LOCAL_FLOW flow)
+{
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	if (flow == FLOW_STOP) {
+		/* Record that the client asked us to stop delivering */
+		IRDA_DEBUG(1, "%s(), flow stop\n", __func__);
+		self->rx_sdu_busy = TRUE;
+		return;
+	}
+
+	if (flow != FLOW_START) {
+		IRDA_DEBUG(1, "%s(), Unknown flow command!\n", __func__);
+		return;
+	}
+
+	/* Client says he can accept more data again, so try to free
+	 * our queues ASAP - Jean II */
+	IRDA_DEBUG(1, "%s(), flow start\n", __func__);
+	self->rx_sdu_busy = FALSE;
+	irttp_run_rx_queue(self);
+}
+EXPORT_SYMBOL(irttp_flow_request);
+
+/*
+ * Function irttp_connect_request (self, dtsap_sel, saddr, daddr, qos, ...)
+ *
+ *    Try to connect to remote destination TSAP selector. Builds the TTP
+ *    connect header (in userdata if given) and hands it to IrLMP.
+ */
+int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel,
+			  __u32 saddr, __u32 daddr,
+			  struct qos_info *qos, __u32 max_sdu_size,
+			  struct sk_buff *userdata)
+{
+	struct sk_buff *tx_skb;
+	__u8 *frame;
+	__u8 n;
+
+	IRDA_DEBUG(4, "%s(), max_sdu_size=%d\n", __func__, max_sdu_size);
+
+	IRDA_ASSERT(self != NULL, return -EBADR;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -EBADR;);
+
+	if (self->connected) {
+		if(userdata)
+			dev_kfree_skb(userdata);
+		return -EISCONN;
+	}
+
+	/* Any userdata supplied? */
+	if (userdata == NULL) {
+		tx_skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER,
+				   GFP_ATOMIC);
+		if (!tx_skb)
+			return -ENOMEM;
+
+		/* Reserve space for MUX_CONTROL and LAP header */
+		skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER);
+	} else {
+		tx_skb = userdata;
+		/*
+		 *  Check that the client has reserved enough space for
+		 *  headers
+		 */
+		IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER,
+			{ dev_kfree_skb(userdata); return -1; } );
+	}
+
+	/* Initialize connection parameters */
+	self->connected = FALSE;
+	self->avail_credit = 0;
+	self->rx_max_sdu_size = max_sdu_size;
+	self->rx_sdu_size = 0;
+	self->rx_sdu_busy = FALSE;
+	self->dtsap_sel = dtsap_sel;
+
+	n = self->initial_credit;
+
+	self->remote_credit = 0;
+	self->send_credit = 0;
+
+	/*
+	 *  Give away max 127 credits for now; the rest stays in avail_credit
+	 */
+	if (n > 127) {
+		self->avail_credit=n-127;
+		n = 127;
+	}
+
+	self->remote_credit = n;
+
+	/* SAR enabled? */
+	if (max_sdu_size > 0) {
+		IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER),
+			{ dev_kfree_skb(tx_skb); return -1; } );
+
+		/* Insert SAR parameters (max_sdu_size is sent as 16 bits) */
+		frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER);
+
+		frame[0] = TTP_PARAMETERS | n;
+		frame[1] = 0x04; /* Length */
+		frame[2] = 0x01; /* MaxSduSize */
+		frame[3] = 0x02; /* Value length */
+
+		put_unaligned(cpu_to_be16((__u16) max_sdu_size),
+			      (__be16 *)(frame+4));
+	} else {
+		/* Insert plain TTP header */
+		frame = skb_push(tx_skb, TTP_HEADER);
+
+		/* Insert initial credit in frame */
+		frame[0] = n & 0x7f;
+	}
+
+	/* Connect with IrLMP. No QoS parameters for now */
+	return irlmp_connect_request(self->lsap, dtsap_sel, saddr, daddr, qos,
+				     tx_skb);
+}
+EXPORT_SYMBOL(irttp_connect_request);
+
+/*
+ * Function irttp_connect_confirm (instance, sap, qos, max_seg_size,
+ *                                 max_header_size, skb)
+ *
+ *    Service user confirms TSAP connection with peer.
+ *
+ *    Records the negotiated segment/header sizes and the peer's initial
+ *    credit, parses the optional parameter list carried in the connect
+ *    frame, strips the TTP header and parameters from skb and passes what
+ *    is left to the client's connect_confirm callback.
+ *
+ *    skb is consumed: it is either handed to the callback or freed here.
+ *    The frame comes from the peer, so its length is validated before any
+ *    byte of it is read; frames too short to hold the TTP header, or the
+ *    parameter length byte announced by the 0x80 flag, are dropped.
+ */
+static void irttp_connect_confirm(void *instance, void *sap,
+				  struct qos_info *qos, __u32 max_seg_size,
+				  __u8 max_header_size, struct sk_buff *skb)
+{
+	struct tsap_cb *self;
+	int parameters;
+	int ret;
+	__u8 plen;
+	__u8 n;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	self = (struct tsap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* skb->data[0] is read below, so the TTP header must be present.
+	 * Check it unconditionally and before touching any TSAP state. */
+	if (skb->len < TTP_HEADER) {
+		IRDA_WARNING("%s: frame too short for TTP header\n",
+			     __func__);
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	self->max_seg_size = max_seg_size - TTP_HEADER;
+	self->max_header_size = max_header_size + TTP_HEADER;
+
+	/*
+	 *  Check if we have got some QoS parameters back! This should be the
+	 *  negotiated QoS for the link.
+	 */
+	if (qos) {
+		IRDA_DEBUG(4, "IrTTP, Negotiated BAUD_RATE: %02x\n",
+		       qos->baud_rate.bits);
+		IRDA_DEBUG(4, "IrTTP, Negotiated BAUD_RATE: %d bps.\n",
+		       qos->baud_rate.value);
+	}
+
+	/* Low 7 bits of the header byte: credits granted by the peer */
+	n = skb->data[0] & 0x7f;
+
+	IRDA_DEBUG(4, "%s(), Initial send_credit=%d\n", __func__, n);
+
+	self->send_credit = n;
+	self->tx_max_sdu_size = 0;
+	self->connected = TRUE;
+
+	/* High bit of the header byte: a parameter list follows */
+	parameters = skb->data[0] & 0x80;
+
+	skb_pull(skb, TTP_HEADER);
+
+	if (parameters) {
+		/* The list starts with its own length byte; without it
+		 * skb->data[0] is out of bounds and skb->len-1 would wrap */
+		if (skb->len < 1) {
+			IRDA_WARNING("%s: missing parameter length\n",
+				     __func__);
+			dev_kfree_skb(skb);
+
+			/* Do not accept this connection attempt */
+			return;
+		}
+
+		plen = skb->data[0];
+
+		/* Never parse past the end of the frame, whatever plen says */
+		ret = irda_param_extract_all(self, skb->data+1,
+					     IRDA_MIN(skb->len-1, plen),
+					     &param_info);
+
+		/* Any errors in the parameter list? */
+		if (ret < 0) {
+			IRDA_WARNING("%s: error extracting parameters\n",
+				     __func__);
+			dev_kfree_skb(skb);
+
+			/* Do not accept this connection attempt */
+			return;
+		}
+		/* Remove parameters */
+		skb_pull(skb, IRDA_MIN(skb->len, plen+1));
+	}
+
+	IRDA_DEBUG(4, "%s() send=%d,avail=%d,remote=%d\n", __func__,
+	      self->send_credit, self->avail_credit, self->remote_credit);
+
+	IRDA_DEBUG(2, "%s(), MaxSduSize=%d\n", __func__,
+		   self->tx_max_sdu_size);
+
+	/* Hand the remaining user data to the client, or drop it */
+	if (self->notify.connect_confirm) {
+		self->notify.connect_confirm(self->notify.instance, self, qos,
+					     self->tx_max_sdu_size,
+					     self->max_header_size, skb);
+	} else
+		dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_connect_indication (instance, sap, qos, max_seg_size,
+ *                                    max_header_size, skb)
+ *
+ *    Some other device is connecting to this TSAP
+ *
+ *    Records the segment/header sizes, the peer's LSAP selector and its
+ *    initial credit, parses the optional parameter list carried in the
+ *    connect frame, strips the TTP header and parameters from skb and
+ *    passes what is left to the client's connect_indication callback.
+ *
+ *    skb is consumed: it is either handed to the callback or freed here.
+ *    The frame comes from the peer, so its length is validated before any
+ *    byte of it is read; frames too short to hold the TTP header, or the
+ *    parameter length byte announced by the 0x80 flag, are dropped.
+ */
+static void irttp_connect_indication(void *instance, void *sap,
+		struct qos_info *qos, __u32 max_seg_size, __u8 max_header_size,
+		struct sk_buff *skb)
+{
+	struct tsap_cb *self;
+	struct lsap_cb *lsap;
+	int parameters;
+	int ret;
+	__u8 plen;
+	__u8 n;
+
+	self = (struct tsap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+	IRDA_ASSERT(skb != NULL, return;);
+
+	/* skb->data[0] is read below, so the TTP header must be present.
+	 * Check it unconditionally and before touching any TSAP state. */
+	if (skb->len < TTP_HEADER) {
+		IRDA_WARNING("%s: frame too short for TTP header\n",
+			     __func__);
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	lsap = (struct lsap_cb *) sap;
+
+	self->max_seg_size = max_seg_size - TTP_HEADER;
+	self->max_header_size = max_header_size+TTP_HEADER;
+
+	IRDA_DEBUG(4, "%s(), TSAP sel=%02x\n", __func__, self->stsap_sel);
+
+	/* Need to update dtsap_sel if its equal to LSAP_ANY */
+	self->dtsap_sel = lsap->dlsap_sel;
+
+	/* Low 7 bits of the header byte: credits granted by the peer */
+	n = skb->data[0] & 0x7f;
+
+	self->send_credit = n;
+	self->tx_max_sdu_size = 0;
+
+	/* High bit of the header byte: a parameter list follows */
+	parameters = skb->data[0] & 0x80;
+
+	skb_pull(skb, TTP_HEADER);
+
+	if (parameters) {
+		/* The list starts with its own length byte; without it
+		 * skb->data[0] is out of bounds and skb->len-1 would wrap */
+		if (skb->len < 1) {
+			IRDA_WARNING("%s: missing parameter length\n",
+				     __func__);
+			dev_kfree_skb(skb);
+
+			/* Do not accept this connection attempt */
+			return;
+		}
+
+		plen = skb->data[0];
+
+		/* Never parse past the end of the frame, whatever plen says */
+		ret = irda_param_extract_all(self, skb->data+1,
+					     IRDA_MIN(skb->len-1, plen),
+					     &param_info);
+
+		/* Any errors in the parameter list? */
+		if (ret < 0) {
+			IRDA_WARNING("%s: error extracting parameters\n",
+				     __func__);
+			dev_kfree_skb(skb);
+
+			/* Do not accept this connection attempt */
+			return;
+		}
+
+		/* Remove parameters */
+		skb_pull(skb, IRDA_MIN(skb->len, plen+1));
+	}
+
+	/* Hand the remaining user data to the client, or drop it */
+	if (self->notify.connect_indication) {
+		self->notify.connect_indication(self->notify.instance, self,
+						qos, self->tx_max_sdu_size,
+						self->max_header_size, skb);
+	} else
+		dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_connect_response (self, max_sdu_size, userdata)
+ *
+ *    Service user is accepting the connection, just pass it down to
+ *    IrLMP!
+ *
+ *    Builds the TTP connect-response frame (initial credit, plus a
+ *    MaxSduSize parameter when max_sdu_size is non-zero) in front of the
+ *    optional userdata, marks the TSAP connected and hands the frame to
+ *    irlmp_connect_response().
+ *
+ *    Returns -ENOMEM if no frame could be allocated, -1 from the
+ *    IRDA_ASSERT() actions below when one of them triggers, and otherwise
+ *    whatever irlmp_connect_response() returns.
+ */
+int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size,
+			   struct sk_buff *userdata)
+{
+	struct sk_buff *tx_skb;
+	__u8 *frame;
+	int ret;
+	__u8 n;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+	IRDA_DEBUG(4, "%s(), Source TSAP selector=%02x\n", __func__,
+		   self->stsap_sel);
+
+	/* Any userdata supplied? */
+	if (userdata == NULL) {
+		tx_skb = alloc_skb(TTP_MAX_HEADER + TTP_SAR_HEADER,
+				   GFP_ATOMIC);
+		if (!tx_skb)
+			return -ENOMEM;
+
+		/* Reserve space for MUX_CONTROL and LAP header */
+		skb_reserve(tx_skb, TTP_MAX_HEADER + TTP_SAR_HEADER);
+	} else {
+		tx_skb = userdata;
+		/*
+		 *  Check that the client has reserved enough space for
+		 *  headers
+		 */
+		IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER,
+			{ dev_kfree_skb(userdata); return -1; } );
+	}
+
+	/* Reset the receive-side state for the new connection */
+	self->avail_credit = 0;
+	self->remote_credit = 0;
+	self->rx_max_sdu_size = max_sdu_size;
+	self->rx_sdu_size = 0;
+	self->rx_sdu_busy = FALSE;
+
+	n = self->initial_credit;
+
+	/* Frame has only space for max 127 credits (7 bits); the rest is
+	 * remembered in avail_credit */
+	if (n > 127) {
+		self->avail_credit = n - 127;
+		n = 127;
+	}
+
+	self->remote_credit = n;
+	/* NOTE(review): connected is set before the headroom assert below,
+	 * so an assert failure returns -1 with the TSAP marked connected */
+	self->connected = TRUE;
+
+	/* SAR enabled? */
+	if (max_sdu_size > 0) {
+		/* The parameter block needs TTP_SAR_HEADER extra headroom */
+		IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER),
+			{ dev_kfree_skb(tx_skb); return -1; } );
+
+		/* Insert TTP header with SAR parameters */
+		frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER);
+
+		frame[0] = TTP_PARAMETERS | n;
+		frame[1] = 0x04; /* Length */
+
+		/* The parameter is written by hand here rather than through
+		 * irda_param_insert() */
+
+		frame[2] = 0x01; /* MaxSduSize */
+		frame[3] = 0x02; /* Value length */
+
+		/* Only the low 16 bits of max_sdu_size go out, big endian */
+		put_unaligned(cpu_to_be16((__u16) max_sdu_size),
+			      (__be16 *)(frame+4));
+	} else {
+		/* Insert TTP header */
+		frame = skb_push(tx_skb, TTP_HEADER);
+
+		frame[0] = n & 0x7f;
+	}
+
+	ret = irlmp_connect_response(self->lsap, tx_skb);
+
+	return ret;
+}
+EXPORT_SYMBOL(irttp_connect_response);
+
+/*
+ * Function irttp_dup (orig, instance)
+ *
+ *    Clone a TSAP so that a server can accept a connection on the copy
+ *    while it keeps listening on the original.
+ *
+ *    Returns the new TSAP, or NULL if orig is not registered, memory is
+ *    short, or the underlying LSAP could not be duplicated.
+ */
+struct tsap_cb *irttp_dup(struct tsap_cb *orig, void *instance)
+{
+	struct tsap_cb *copy;
+	unsigned long flags;
+
+	IRDA_DEBUG(1, "%s()\n", __func__);
+
+	/* Hold the TSAP table lock while we look at and copy orig */
+	spin_lock_irqsave(&irttp->tsaps->hb_spinlock, flags);
+
+	/* orig must still be a registered TSAP */
+	if (!hashbin_find(irttp->tsaps, (long) orig, NULL)) {
+		IRDA_DEBUG(0, "%s(), unable to find TSAP\n", __func__);
+		goto out_unlock;
+	}
+
+	copy = kmalloc(sizeof(*copy), GFP_ATOMIC);
+	if (!copy) {
+		IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __func__);
+		goto out_unlock;
+	}
+
+	/* Start from a byte copy, but give the clone its own lock */
+	memcpy(copy, orig, sizeof(*copy));
+	spin_lock_init(&copy->lock);
+
+	/* Done reading orig's contents under the table lock */
+	spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags);
+
+	/* The LSAP may be gone by now if we were too slow */
+	copy->lsap = irlmp_dup(orig->lsap, copy);
+	if (!copy->lsap) {
+		IRDA_DEBUG(0, "%s(), dup failed!\n", __func__);
+		kfree(copy);
+		return NULL;
+	}
+
+	/* The clone reports to its own client instance */
+	copy->notify.instance = instance;
+
+	/* Re-initialise the internal objects of the clone */
+	irttp_init_tsap(copy);
+
+	/* No table lock taken here; the original code notes that this
+	 * insert is already locked */
+	hashbin_insert(irttp->tsaps, (irda_queue_t *) copy, (long) copy, NULL);
+
+	return copy;
+
+out_unlock:
+	spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags);
+	return NULL;
+}
+EXPORT_SYMBOL(irttp_dup);
+
+/*
+ * Function irttp_disconnect_request (self, userdata, priority)
+ *
+ *    Close this connection please! If priority is high, the queued data
+ *    segments, if any, will be deallocated first
+ *
+ *    Returns -1 when the TSAP is not connected, when a disconnect is
+ *    already pending, or when the disconnect was deferred (P_NORMAL with
+ *    data still in tx_queue); -ENOMEM when no disconnect frame could be
+ *    allocated; otherwise whatever irlmp_disconnect_request() returns.
+ *
+ *    userdata is freed on the first two -1 paths, stored in
+ *    self->disconnect_skb on the deferred path, and otherwise passed to
+ *    irlmp_disconnect_request().
+ */
+int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
+			     int priority)
+{
+	int ret;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;);
+
+	/* Already disconnected? */
+	if (!self->connected) {
+		IRDA_DEBUG(4, "%s(), already disconnected!\n", __func__);
+		if (userdata)
+			dev_kfree_skb(userdata);
+		return -1;
+	}
+
+	/* Disconnect already pending ?
+	 * We need to use an atomic operation to prevent reentry. This
+	 * function may be called from various context, like user, timer
+	 * for following a disconnect_indication() (i.e. net_bh).
+	 * Jean II */
+	if(test_and_set_bit(0, &self->disconnect_pend)) {
+		IRDA_DEBUG(0, "%s(), disconnect already pending\n",
+			   __func__);
+		if (userdata)
+			dev_kfree_skb(userdata);
+
+		/* Try to make some progress */
+		irttp_run_tx_queue(self);
+		return -1;
+	}
+
+	/*
+	 *  Check if there is still data segments in the transmit queue.
+	 *  A priority other than P_HIGH or P_NORMAL falls through to the
+	 *  immediate disconnect below without touching the queue.
+	 */
+	if (!skb_queue_empty(&self->tx_queue)) {
+		if (priority == P_HIGH) {
+			/*
+			 *  No need to send the queued data, if we are
+			 *  disconnecting right now since the data will
+			 *  not have any usable connection to be sent on
+			 */
+			IRDA_DEBUG(1, "%s(): High priority!!()\n", __func__);
+			irttp_flush_queues(self);
+		} else if (priority == P_NORMAL) {
+			/*
+			 *  Must delay disconnect until after all data segments
+			 *  have been sent and the tx_queue is empty
+			 */
+			/* We'll reuse this one later for the disconnect */
+			self->disconnect_skb = userdata;  /* May be NULL */
+
+			irttp_run_tx_queue(self);
+
+			/* disconnect_pend stays set; retry from the timer */
+			irttp_start_todo_timer(self, HZ/10);
+			return -1;
+		}
+	}
+	/* Note : we don't need to check if self->rx_queue is full and the
+	 * state of self->rx_sdu_busy because the disconnect response will
+	 * be sent at the LMP level (so even if the peer has its Tx queue
+	 * full of data). - Jean II */
+
+	IRDA_DEBUG(1, "%s(), Disconnecting ...\n", __func__);
+	self->connected = FALSE;
+
+	if (!userdata) {
+		struct sk_buff *tx_skb;
+		tx_skb = alloc_skb(LMP_MAX_HEADER, GFP_ATOMIC);
+		/* NOTE(review): this return leaves disconnect_pend set and
+		 * connected FALSE without a disconnect having been sent to
+		 * IrLMP - confirm that this is intended */
+		if (!tx_skb)
+			return -ENOMEM;
+
+		/*
+		 *  Reserve space for MUX and LAP header
+		 */
+		skb_reserve(tx_skb, LMP_MAX_HEADER);
+
+		userdata = tx_skb;
+	}
+	ret = irlmp_disconnect_request(self->lsap, userdata);
+
+	/* The disconnect is no longer pending */
+	clear_bit(0, &self->disconnect_pend);	/* FALSE */
+
+	return ret;
+}
+EXPORT_SYMBOL(irttp_disconnect_request);
+
+/*
+ * Function irttp_disconnect_indication (instance, sap, reason, skb)
+ *
+ *    The connection of this TSAP has been torn down from below.
+ *
+ *    Marks the TSAP as disconnected, then either finishes a close that
+ *    the client had already requested, or forwards the event (and skb) to
+ *    the client's disconnect_indication callback. skb is freed here
+ *    whenever it is not handed to the client.
+ */
+static void irttp_disconnect_indication(void *instance, void *sap,
+		LM_REASON reason, struct sk_buff *skb)
+{
+	struct tsap_cb *self = (struct tsap_cb *) instance;
+
+	IRDA_DEBUG(4, "%s()\n", __func__);
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;);
+
+	/* From now on the layer above must not send any more data */
+	self->connected = FALSE;
+
+	if (self->close_pend) {
+		/* The client already asked for the TSAP to be closed, so
+		 * it is most likely gone: tidy up without notifying it */
+		if (skb)
+			dev_kfree_skb(skb);
+		irttp_close_tsap(self);
+		return;
+	}
+
+	/* close_pend is clear, so the client is assumed to be alive and
+	 * still waiting to hear about the disconnect, even if it asked
+	 * for one itself */
+	if (self->notify.disconnect_indication) {
+		self->notify.disconnect_indication(self->notify.instance, self,
+						   reason, skb);
+		return;
+	}
+
+	/* Nobody to tell: just drop the frame, if any */
+	if (skb)
+		dev_kfree_skb(skb);
+}
+
+/*
+ * Function irttp_do_data_indication (self, skb)
+ *
+ *    Hand one complete SDU to the client. If the client refuses it
+ *    (non-zero return from its data_indication callback) the receive side
+ *    is marked busy and the SDU goes back to the head of rx_queue with a
+ *    fresh TTP header, so that it is retried later.
+ */
+static void irttp_do_data_indication(struct tsap_cb *self, struct sk_buff *skb)
+{
+	__u8 *hdr;
+	int refused;
+
+	/* A client that has started closing the TSAP gets nothing more */
+	if (self->close_pend) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	refused = self->notify.data_indication(self->notify.instance, self,
+					       skb);
+	if (!refused)
+		return;
+
+	/* Normally the client announces congestion through a flow request,
+	 * but it is also allowed to simply reject the SDU with an error */
+	IRDA_DEBUG(0, "%s() requeueing skb!\n", __func__);
+
+	/* Stop delivering until the client asks for more */
+	self->rx_sdu_busy = TRUE;
+
+	/* The queue holds frames with their TTP header, so put one back;
+	 * it is all zeroes, which leaves the MORE bit cleared */
+	hdr = skb_push(skb, TTP_HEADER);
+	hdr[0] = 0x00;
+
+	/* First in line for the next delivery attempt */
+	skb_queue_head(&self->rx_queue, skb);
+}
+
+/*
+ * Function irttp_run_rx_queue (self)
+ *
+ *     Deliver the frames waiting in rx_queue to the client, reassembling
+ *     fragmented SDUs on the way, then recompute how many credits are
+ *     available and send them to the peer if it is running low.
+ *
+ *     Does nothing if irda_lock() on rx_queue_lock fails, i.e. if the
+ *     queue is already being processed.
+ */
+static void irttp_run_rx_queue(struct tsap_cb *self)
+{
+	struct sk_buff *skb;
+	int more = 0;
+
+	IRDA_DEBUG(2, "%s() send=%d,avail=%d,remote=%d\n", __func__,
+		   self->send_credit, self->avail_credit, self->remote_credit);
+
+	/* Get exclusive access to the rx queue, otherwise don't touch it */
+	if (irda_lock(&self->rx_queue_lock) == FALSE)
+		return;
+
+	/*
+	 *  Reassemble all frames in receive queue and deliver them.
+	 *  The loop stops as soon as the client reports busy, which
+	 *  irttp_do_data_indication() may set while we are running.
+	 */
+	while (!self->rx_sdu_busy && (skb = skb_dequeue(&self->rx_queue))) {
+		/* This bit will tell us if it's the last fragment or not */
+		more = skb->data[0] & 0x80;
+
+		/* Remove TTP header */
+		skb_pull(skb, TTP_HEADER);
+
+		/* Add the length of the remaining data */
+		self->rx_sdu_size += skb->len;
+
+		/*
+		 * If SAR is disabled, or user has requested no reassembly
+		 * of received fragments then we just deliver them
+		 * immediately. This can be requested by clients that
+		 * implements byte streams without any message boundaries
+		 */
+		if (self->rx_max_sdu_size == TTP_SAR_DISABLE) {
+			irttp_do_data_indication(self, skb);
+			self->rx_sdu_size = 0;
+
+			continue;
+		}
+
+		/* Check if this is a fragment, and not the last fragment */
+		if (more) {
+			/*
+			 *  Queue the fragment if we still are within the
+			 *  limits of the maximum size of the rx_sdu
+			 */
+			if (self->rx_sdu_size <= self->rx_max_sdu_size) {
+				IRDA_DEBUG(4, "%s(), queueing frag\n",
+					   __func__);
+				skb_queue_tail(&self->rx_fragments, skb);
+			} else {
+				/* Free the part of the SDU that is too big */
+				dev_kfree_skb(skb);
+			}
+			/* rx_sdu_size keeps accumulating until the last
+			 * fragment of this SDU arrives */
+			continue;
+		}
+		/*
+		 *  This is the last fragment, so time to reassemble!
+		 */
+		if ((self->rx_sdu_size <= self->rx_max_sdu_size) ||
+		    (self->rx_max_sdu_size == TTP_SAR_UNBOUND))
+		{
+			/*
+			 * A little optimizing. Only queue the fragment if
+			 * there are other fragments. Since if this is the
+			 * last and only fragment, there is no need to
+			 * reassemble :-)
+			 */
+			if (!skb_queue_empty(&self->rx_fragments)) {
+				skb_queue_tail(&self->rx_fragments,
+					       skb);
+
+				skb = irttp_reassemble_skb(self);
+			}
+
+			/* Now we can deliver the reassembled skb */
+			irttp_do_data_indication(self, skb);
+		} else {
+			IRDA_DEBUG(1, "%s(), Truncated frame\n", __func__);
+
+			/* Free the part of the SDU that is too big */
+			dev_kfree_skb(skb);
+
+			/* Deliver only the valid but truncated part of SDU */
+			skb = irttp_reassemble_skb(self);
+
+			irttp_do_data_indication(self, skb);
+		}
+		/* SDU done, start counting the next one from scratch */
+		self->rx_sdu_size = 0;
+	}
+
+	/*
+	 * It's not trivial to keep track of how many credits are available
+	 * by incrementing at each packet, because delivery may fail
+	 * (irttp_do_data_indication() may requeue the frame) and because
+	 * we need to take care of fragmentation.
+	 * We want the other side to send up to initial_credit packets.
+	 * We have some frames in our queues, and we have already allowed it
+	 * to send remote_credit.
+	 * No need to spinlock, write is atomic and self correcting...
+	 * Jean II
+	 */
+	self->avail_credit = (self->initial_credit -
+			      (self->remote_credit +
+			       skb_queue_len(&self->rx_queue) +
+			       skb_queue_len(&self->rx_fragments)));
+
+	/* Is the peer running low on credits while we have some to give ? */
+	if ((self->remote_credit <= TTP_RX_MIN_CREDIT) &&
+	    (self->avail_credit > 0)) {
+		/* Send explicit credit frame */
+		irttp_give_credit(self);
+		/* Note : do *NOT* check if tx_queue is non-empty, that
+		 * will produce deadlocks. I repeat : send a credit frame
+		 * even if we have something to send in our Tx queue.
+		 * If we have credits, it means that our Tx queue is blocked.
+		 *
+		 * Let's suppose the peer can't keep up with our Tx. He will
+		 * flow control us by not sending us any credits, and we
+		 * will stop Tx and start accumulating credits here.
+		 * Up to the point where the peer will stop its Tx queue,
+		 * for lack of credits.
+		 * Let's assume the peer application is single threaded.
+		 * It will block on Tx and never consume any Rx buffer.
+		 * Deadlock. Guaranteed. - Jean II
+		 */
+	}
+
+	/* Reset lock */
+	self->rx_queue_lock = 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Per-open iteration state for the IrTTP seq_file listing */
+struct irttp_iter_state {
+	int id;		/* Index of the TSAP currently being shown */
+};
+
+/*
+ * seq_file start: take the TSAP table lock and walk to the entry at
+ * index *pos. Returns that TSAP, or NULL when the table has fewer
+ * entries. The lock is released in irttp_seq_stop().
+ */
+static void *irttp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct irttp_iter_state *state = seq->private;
+	struct tsap_cb *tsap;
+
+	/* Keep the TSAP list stable while it is being dumped */
+	spin_lock_irq(&irttp->tsaps->hb_spinlock);
+	state->id = 0;
+
+	tsap = (struct tsap_cb *) hashbin_get_first(irttp->tsaps);
+	while (tsap != NULL && state->id != *pos) {
+		++state->id;
+		tsap = (struct tsap_cb *) hashbin_get_next(irttp->tsaps);
+	}
+
+	return tsap;
+}
+
+/*
+ * seq_file next: advance both the file position and our TSAP index, and
+ * return the following entry of the TSAP table (NULL at the end).
+ */
+static void *irttp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct irttp_iter_state *state = seq->private;
+
+	(*pos)++;
+	state->id++;
+
+	return (void *) hashbin_get_next(irttp->tsaps);
+}
+
+/* seq_file stop: release the TSAP table lock taken in irttp_seq_start() */
+static void irttp_seq_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_irq(&irttp->tsaps->hb_spinlock);
+}
+
+/*
+ * seq_file show: print one TSAP (v) as a short multi-line record with
+ * its selectors, connection state, credits, packet counters, queue
+ * lengths, flow-control flags, size limits and owner name.
+ * Always returns 0.
+ */
+static int irttp_seq_show(struct seq_file *seq, void *v)
+{
+	const struct irttp_iter_state *state = seq->private;
+	const struct tsap_cb *tsap = v;
+
+	/* Identification */
+	seq_printf(seq, "TSAP %d, ", state->id);
+	seq_printf(seq, "stsap_sel: %02x, ", tsap->stsap_sel);
+	seq_printf(seq, "dtsap_sel: %02x\n", tsap->dtsap_sel);
+
+	/* Connection state and credits */
+	seq_printf(seq, "  connected: %s, ", tsap->connected? "TRUE":"FALSE");
+	seq_printf(seq, "avail credit: %d, ", tsap->avail_credit);
+	seq_printf(seq, "remote credit: %d, ", tsap->remote_credit);
+	seq_printf(seq, "send credit: %d\n", tsap->send_credit);
+
+	/* Traffic counters and queue depths */
+	seq_printf(seq, "  tx packets: %lu, ", tsap->stats.tx_packets);
+	seq_printf(seq, "rx packets: %lu, ", tsap->stats.rx_packets);
+	seq_printf(seq, "tx_queue len: %u ", skb_queue_len(&tsap->tx_queue));
+	seq_printf(seq, "rx_queue len: %u\n", skb_queue_len(&tsap->rx_queue));
+
+	/* Flow control flags */
+	seq_printf(seq, "  tx_sdu_busy: %s, ",
+		   tsap->tx_sdu_busy? "TRUE":"FALSE");
+	seq_printf(seq, "rx_sdu_busy: %s\n",
+		   tsap->rx_sdu_busy? "TRUE":"FALSE");
+
+	/* Size limits */
+	seq_printf(seq, "  max_seg_size: %u, ", tsap->max_seg_size);
+	seq_printf(seq, "tx_max_sdu_size: %u, ", tsap->tx_max_sdu_size);
+	seq_printf(seq, "rx_max_sdu_size: %u\n", tsap->rx_max_sdu_size);
+
+	/* Owner */
+	seq_printf(seq, "  Used by (%s)\n\n", tsap->notify.name);
+
+	return 0;
+}
+
+/* Iterator callbacks used by the seq_file opened in irttp_seq_open() */
+static const struct seq_operations irttp_seq_ops = {
+	.start  = irttp_seq_start,
+	.next   = irttp_seq_next,
+	.stop   = irttp_seq_stop,
+	.show   = irttp_seq_show,
+};
+
+/*
+ * Open handler: set up a seq_file driven by irttp_seq_ops, asking
+ * seq_open_private() for a private area the size of
+ * struct irttp_iter_state.
+ */
+static int irttp_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &irttp_seq_ops,
+			sizeof(struct irttp_iter_state));
+}
+
+/*
+ * File operations for the IrTTP listing. Not static, so it is visible
+ * outside this file. Release uses seq_release_private() to match the
+ * seq_open_private() call in irttp_seq_open().
+ */
+const struct file_operations irttp_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = irttp_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+
+#endif /* PROC_FS */
diff --git a/net/irda/parameters.c b/net/irda/parameters.c
new file mode 100644
index 00000000..71cd38c1
--- /dev/null
+++ b/net/irda/parameters.c
@@ -0,0 +1,591 @@
+/*********************************************************************
+ *
+ * Filename:      parameters.c
+ * Version:       1.0
+ * Description:   A more general way to handle (pi,pl,pv) parameters
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Mon Jun  7 10:25:11 1999
+ * Modified at:   Sun Jan 30 14:08:39 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <linux/types.h>
+#include <linux/module.h>
+
+#include <asm/unaligned.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/parameters.h>
+
+/*
+ * Value handlers. All share one signature: client instance, buffer
+ * positioned at the parameter, buffer length, parameter identifier, value
+ * type and the client's per-parameter callback.
+ */
+static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi,
+				PV_TYPE type, PI_HANDLER func);
+static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
+			       PV_TYPE type, PI_HANDLER func);
+static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi,
+			       PV_TYPE type, PI_HANDLER func);
+static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi,
+				 PV_TYPE type, PI_HANDLER func);
+
+static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi,
+			       PV_TYPE type, PI_HANDLER func);
+static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi,
+				PV_TYPE type, PI_HANDLER func);
+
+static int irda_param_unpack(__u8 *buf, char *fmt, ...);
+
+/*
+ * Parameter value call table. Must match PV_TYPE: the order of the
+ * entries is the order of the value types named in the comments.
+ */
+static PV_HANDLER pv_extract_table[] = {
+	irda_extract_integer, /* Handler for any length integers */
+	irda_extract_integer, /* Handler for 8  bits integers */
+	irda_extract_integer, /* Handler for 16 bits integers */
+	irda_extract_string,  /* Handler for strings */
+	irda_extract_integer, /* Handler for 32 bits integers */
+	irda_extract_octseq,  /* Handler for octet sequences */
+	irda_extract_no_value /* Handler for no value parameters */
+};
+
+/*
+ * Same layout as pv_extract_table, for building parameters. Strings and
+ * octet sequences have no insert handler (NULL entries).
+ */
+static PV_HANDLER pv_insert_table[] = {
+	irda_insert_integer, /* Handler for any length integers */
+	irda_insert_integer, /* Handler for 8  bits integers */
+	irda_insert_integer, /* Handler for 16 bits integers */
+	NULL,                /* Handler for strings */
+	irda_insert_integer, /* Handler for 32 bits integers */
+	NULL,                /* Handler for octet sequences */
+	irda_insert_no_value /* Handler for no value parameters */
+};
+
+/*
+ * Function irda_insert_no_value (self, buf, len, pi, type, func)
+ *
+ *    Inserts a parameter without a pv field (pl=0): asks the handler
+ *    about it (PV_GET), then writes the two bytes pi and pl into buf.
+ *
+ *    Returns the handler's error if it is negative, else 2 (bytes used).
+ */
+static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi,
+				PV_TYPE type, PI_HANDLER func)
+{
+	irda_param_t param;
+	int status;
+
+	param.pi = pi;
+	param.pl = 0;
+
+	/* Let the handler see (and possibly adjust) the parameter */
+	status = func(self, &param, PV_GET);
+
+	/* The header is written whatever the handler answered */
+	irda_param_pack(buf, "bb", param.pi, param.pl);
+
+	return (status < 0) ? status : 2;
+}
+
+/*
+ * Function irda_extract_no_value (self, buf, len, pi, type, func)
+ *
+ *    Extracts a parameter without a pv field (pl=0): reads pi and pl
+ *    from buf and passes them to the handler (PV_PUT).
+ *
+ *    Returns the handler's error if it is negative, else 2 (bytes used).
+ */
+static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi,
+				 PV_TYPE type, PI_HANDLER func)
+{
+	irda_param_t param;
+	int status;
+
+	/* Read the two header bytes; the handler may want to see them */
+	irda_param_unpack(buf, "bb", &param.pi, &param.pl);
+
+	status = func(self, &param, PV_PUT);
+
+	return (status < 0) ? status : 2;
+}
+
+/*
+ * Function irda_insert_integer (self, buf, len, pi, type, func)
+ *
+ *    Ask the handler for an integer value (PV_GET) and write it to buf as
+ *    pi, pl and a 1, 2 or 4 byte value, in the byte order selected by
+ *    PV_BIG_ENDIAN in type.
+ *
+ *    Returns the handler's error if it is negative, -1 if buf is too
+ *    short or the length is unsupported, else pl+2 (bytes written).
+ */
+static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi,
+			       PV_TYPE type, PI_HANDLER func)
+{
+	irda_param_t p;
+	int n = 0;	/* NOTE(review): accumulated but never used */
+	int err;
+
+	p.pi = pi;             /* In case handler needs to know */
+	p.pl = type & PV_MASK; /* The integer type codes the length as well */
+	p.pv.i = 0;            /* Clear value */
+
+	/* Call handler for this parameter */
+	err = (*func)(self, &p, PV_GET);
+	if (err < 0)
+		return err;
+
+	/*
+	 * If parameter length is still 0, then (1) this is an any length
+	 * integer, and (2) the handler function does not care which length
+	 * we choose to use, so we pick the one the gives the fewest bytes.
+	 *
+	 * NOTE(review): the comparisons are strict, so the values 0xff and
+	 * 0xffff are sent one size larger than strictly needed. Changing
+	 * that would alter the bytes on the wire - confirm before touching.
+	 */
+	if (p.pl == 0) {
+		if (p.pv.i < 0xff) {
+			IRDA_DEBUG(2, "%s(), using 1 byte\n", __func__);
+			p.pl = 1;
+		} else if (p.pv.i < 0xffff) {
+			IRDA_DEBUG(2, "%s(), using 2 bytes\n", __func__);
+			p.pl = 2;
+		} else {
+			IRDA_DEBUG(2, "%s(), using 4 bytes\n", __func__);
+			p.pl = 4; /* Default length */
+		}
+	}
+	/* Check if buffer is long enough for insertion */
+	if (len < (2+p.pl)) {
+		IRDA_WARNING("%s: buffer too short for insertion!\n",
+			     __func__);
+		return -1;
+	}
+	/* NOTE(review): the last field printed is the value (pv), although
+	 * the format string labels it "pi" */
+	IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d, pi=%d\n", __func__,
+		   p.pi, p.pl, p.pv.i);
+	switch (p.pl) {
+	case 1:
+		n += irda_param_pack(buf, "bbb", p.pi, p.pl, (__u8) p.pv.i);
+		break;
+	case 2:
+		/* Convert to wire byte order before packing */
+		if (type & PV_BIG_ENDIAN)
+			p.pv.i = cpu_to_be16((__u16) p.pv.i);
+		else
+			p.pv.i = cpu_to_le16((__u16) p.pv.i);
+		n += irda_param_pack(buf, "bbs", p.pi, p.pl, (__u16) p.pv.i);
+		break;
+	case 4:
+		/* Convert to wire byte order before packing */
+		if (type & PV_BIG_ENDIAN)
+			cpu_to_be32s(&p.pv.i);
+		else
+			cpu_to_le32s(&p.pv.i);
+		n += irda_param_pack(buf, "bbi", p.pi, p.pl, p.pv.i);
+
+		break;
+	default:
+		IRDA_WARNING("%s: length %d not supported\n",
+			     __func__, p.pl);
+		/* Skip parameter */
+		return -1;
+	}
+
+	return p.pl+2; /* Inserted pl+2 bytes */
+}
+
+/*
+ * Function irda_extract_integer (self, buf, len, pi, type, func)
+ *
+ *    Extract a possibly variable length integer from buffer, and call
+ *    handler for processing of the parameter
+ *
+ *    buf points at the parameter (buf[1] is the value length, the value
+ *    starts at buf+2).
+ *
+ *    Returns -1 if the buffer is too short for the announced length, the
+ *    handler's error if it is negative, else pl+2 (bytes consumed). The
+ *    latter is also returned, without calling the handler, when the
+ *    parameter is skipped because of an unusable length.
+ */
+static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi,
+				PV_TYPE type, PI_HANDLER func)
+{
+	irda_param_t p;
+	int n = 0;	/* NOTE(review): accumulated but never used */
+	int extract_len;	/* Real length we extract */
+	int err;
+
+	p.pi = pi;     /* In case handler needs to know */
+	p.pl = buf[1]; /* Extract length of value */
+	p.pv.i = 0;    /* Clear value */
+	extract_len = p.pl;	/* Default : extract all */
+
+	/* Check if buffer is long enough for parsing */
+	if (len < (2+p.pl)) {
+		/* NOTE(review): prints pl, while pl+2 bytes are needed */
+		IRDA_WARNING("%s: buffer too short for parsing! "
+			     "Need %d bytes, but len is only %d\n",
+			     __func__, p.pl, len);
+		return -1;
+	}
+
+	/*
+	 * Check that the integer length is what we expect it to be. If the
+	 * handler want a 16 bits integer then a 32 bits is not good enough
+	 * PV_INTEGER means that the handler is flexible.
+	 */
+	if (((type & PV_MASK) != PV_INTEGER) && ((type & PV_MASK) != p.pl)) {
+		IRDA_ERROR("%s: invalid parameter length! "
+			   "Expected %d bytes, but value had %d bytes!\n",
+			   __func__, type & PV_MASK, p.pl);
+
+		/* Most parameters are bit/byte fields or little endian,
+		 * so it's ok to only extract a subset of it (the subset
+		 * that the handler expect). This is necessary, as some
+		 * broken implementations seems to add extra undefined bits.
+		 * If the parameter is shorter than we expect or is big
+		 * endian, we can't play those tricks. Jean II */
+		if((p.pl < (type & PV_MASK)) || (type & PV_BIG_ENDIAN)) {
+			/* Skip parameter */
+			return p.pl+2;
+		} else {
+			/* Extract subset of it, fallthrough */
+			extract_len = type & PV_MASK;
+		}
+	}
+
+
+	/* Read extract_len bytes and convert them from wire byte order */
+	switch (extract_len) {
+	case 1:
+		n += irda_param_unpack(buf+2, "b", &p.pv.i);
+		break;
+	case 2:
+		n += irda_param_unpack(buf+2, "s", &p.pv.i);
+		if (type & PV_BIG_ENDIAN)
+			p.pv.i = be16_to_cpu((__u16) p.pv.i);
+		else
+			p.pv.i = le16_to_cpu((__u16) p.pv.i);
+		break;
+	case 4:
+		n += irda_param_unpack(buf+2, "i", &p.pv.i);
+		if (type & PV_BIG_ENDIAN)
+			be32_to_cpus(&p.pv.i);
+		else
+			le32_to_cpus(&p.pv.i);
+		break;
+	default:
+		IRDA_WARNING("%s: length %d not supported\n",
+			     __func__, p.pl);
+
+		/* Skip parameter */
+		return p.pl+2;
+	}
+
+	/* NOTE(review): the last field printed is the value (pv), although
+	 * the format string labels it "pi" */
+	IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d, pi=%d\n", __func__,
+		   p.pi, p.pl, p.pv.i);
+	/* Call handler for this parameter */
+	err = (*func)(self, &p, PV_PUT);
+	if (err < 0)
+		return err;
+
+	/* The full announced length is consumed, even if only a subset of
+	 * the value was extracted */
+	return p.pl+2; /* Extracted pl+2 bytes */
+}
+
+/*
+ * Function irda_extract_string (self, buf, len, pi, type, func)
+ *
+ *    Extract a string parameter and pass it to the handler (PV_PUT) as a
+ *    NUL terminated copy of at most 32 characters. The copy lives on our
+ *    stack, so the handler has to duplicate it if it wants to keep it.
+ *
+ *    buf points at the parameter (buf[1] is the value length, the value
+ *    starts at buf+2).
+ *
+ *    Returns -1 if the buffer is too short for the announced length, the
+ *    handler's error if it is negative, else the number of bytes the
+ *    parameter occupies in buf. That count uses the announced length, not
+ *    the 32 byte cap, so the next parameter is found in the right place
+ *    after an oversized string.
+ */
+static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
+			       PV_TYPE type, PI_HANDLER func)
+{
+	/* Zeroed so that the debug dump below never reads garbage */
+	char str[33] = { 0 };
+	irda_param_t p;
+	int wire_len;	/* Value length as announced in the frame */
+	int err;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	p.pi = pi;     /* In case handler needs to know */
+	wire_len = buf[1]; /* Extract length of value */
+
+	/* The handler never sees more than fits in str */
+	p.pl = wire_len;
+	if (p.pl > 32)
+		p.pl = 32;
+
+	IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__,
+		   p.pi, p.pl);
+
+	/* Check if buffer is long enough for parsing. Use the announced
+	 * length: a parameter that claims more than the buffer holds is
+	 * malformed even if its first 32 bytes are present. */
+	if (len < (2+wire_len)) {
+		IRDA_WARNING("%s: buffer too short for parsing! "
+			     "Need %d bytes, but len is only %d\n",
+			     __func__, wire_len, len);
+		return -1;
+	}
+
+	/* Should be safe to copy string like this since we have already
+	 * checked that the buffer is long enough */
+	strncpy(str, buf+2, p.pl);
+
+	IRDA_DEBUG(2, "%s(), str=0x%02x 0x%02x\n", __func__,
+		   (__u8) str[0], (__u8) str[1]);
+
+	/* Null terminate string */
+	str[p.pl] = '\0';
+
+	p.pv.c = str; /* Handler will need to take a copy */
+
+	/* Call handler for this parameter */
+	err = (*func)(self, &p, PV_PUT);
+	if (err < 0)
+		return err;
+
+	return wire_len+2; /* Consumed the whole parameter */
+}
+
+/*
+ * Function irda_extract_octseq (self, buf, len, pi, type, func)
+ *
+ *    Not implemented: the parameter is length-checked and skipped only.
+ */
+static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi,
+			       PV_TYPE type, PI_HANDLER func)
+{
+	irda_param_t p;
+
+	p.pl = buf[1]; /* Value length announced by the parameter */
+	p.pi = pi;     /* Recorded, although nothing reads it here */
+
+	if (len >= p.pl + 2) {
+		IRDA_DEBUG(0, "%s(), not impl\n", __func__);
+		return p.pl+2; /* Skipped pl+2 bytes */
+	}
+
+	IRDA_WARNING("%s: buffer too short for parsing! "
+		     "Need %d bytes, but len is only %d\n",
+		     __func__, p.pl, len);
+	return -1;
+}
+
+/*
+ * Function irda_param_pack (buf, fmt, ...)
+ *
+ *    Store one argument into buf for each character of fmt:
+ *        'b' = 8 bits byte, 's' = 16 bits short, 'i' = 32 bits integer
+ *    Returns -1 on an unknown format character, else 0; the running
+ *    byte count n is not returned.
+ */
+int irda_param_pack(__u8 *buf, char *fmt, ...)
+{
+	irda_pv_t arg;
+	va_list args;
+	char *p;
+	int n = 0;
+
+	va_start(args, fmt);
+
+	for (p = fmt; *p != '\0'; p++) {
+		switch (*p) {
+		case 'b':  /* 8 bits unsigned byte */
+			buf[n++] = (__u8)va_arg(args, int);
+			break;
+		case 's':  /* 16 bits unsigned short */
+			arg.i = (__u16)va_arg(args, int);
+			put_unaligned((__u16)arg.i, (__u16 *)(buf+n)); n+=2;
+			break;
+		case 'i':  /* 32 bits unsigned integer */
+			arg.i = va_arg(args, __u32);
+			put_unaligned(arg.i, (__u32 *)(buf+n)); n+=4;
+			break;
+#if 0
+		case 'c': /* \0 terminated string */
+			arg.c = va_arg(args, char *);
+			strcpy(buf+n, arg.c);
+			n += strlen(arg.c) + 1;
+			break;
+#endif
+		default:
+			va_end(args);
+			return -1;
+		}
+	}
+	va_end(args);
+
+	return 0;
+}
+EXPORT_SYMBOL(irda_param_pack);
+
+/* Function irda_param_unpack (buf, fmt, ...)
+ *    Reads 8 ('b'), 16 ('s') or 32 ('i') bits from buf per fmt character
+ *    into the __u32 * arguments. Returns 0, or -1 on an unknown format. */
+static int irda_param_unpack(__u8 *buf, char *fmt, ...)
+{
+	irda_pv_t arg;
+	va_list args;
+	char *p;
+	int n = 0;
+
+	va_start(args, fmt);
+
+	for (p = fmt; *p != '\0'; p++) {
+		switch (*p) {
+		case 'b':  /* 8 bits byte */
+			arg.ip = va_arg(args, __u32 *);
+			*arg.ip = buf[n++];
+			break;
+		case 's':  /* 16 bits short */
+			arg.ip = va_arg(args, __u32 *);
+			*arg.ip = get_unaligned((__u16 *)(buf+n)); n+=2;
+			break;
+		case 'i':  /* 32 bits unsigned integer */
+			arg.ip = va_arg(args, __u32 *);
+			*arg.ip = get_unaligned((__u32 *)(buf+n)); n+=4;
+			break;
+#if 0
+		case 'c':   /* \0 terminated string */
+			arg.c = va_arg(args, char *);
+			strcpy(arg.c, buf+n);
+			n += strlen(arg.c) + 1;
+			break;
+#endif
+		default:
+			va_end(args);
+			return -1;
+		}
+
+	}
+	va_end(args);
+
+	return 0;
+}
+
+/*
+ * Function irda_param_insert (self, pi, buf, len, info)
+ *
+ *    Insert parameter pi into buf via the pv_insert_table routine for its
+ *    type. Returns that routine's result, or -1 if pi has no handler.
+ */
+int irda_param_insert(void *self, __u8 pi, __u8 *buf, int len,
+		      pi_param_info_t *info)
+{
+	pi_minor_info_t *pi_minor_info;
+	__u8 pi_minor;
+	__u8 pi_major;
+	int type;
+	int ret = -1;
+	int n = 0;
+
+	IRDA_ASSERT(buf != NULL, return ret;);
+	IRDA_ASSERT(info != NULL, return ret;);
+
+	pi_minor = pi & info->pi_mask;
+	pi_major = pi >> info->pi_major_offset;
+
+	/* Check if the identifier value (pi) is valid */
+	if ((pi_major > info->len-1) ||
+	    (pi_minor > info->tables[pi_major].len-1))
+	{
+		IRDA_DEBUG(0, "%s(), no handler for parameter=0x%02x\n",
+			   __func__, pi);
+
+		/* Skip this parameter */
+		return -1;
+	}
+
+	/* Lookup the info on how to parse this parameter */
+	pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor];
+
+	/* Find expected data type for this parameter identifier (pi)*/
+	type = pi_minor_info->type;
+
+	/*  Check if handler has been implemented */
+	if (!pi_minor_info->func) {
+		IRDA_MESSAGE("%s: no handler for pi=%#x\n", __func__, pi);
+		/* Skip this parameter */
+		return -1;
+	}
+
+	/* Insert parameter value */
+	ret = (*pv_insert_table[type & PV_MASK])(self, buf+n, len, pi, type,
+						 pi_minor_info->func);
+	return ret;
+}
+EXPORT_SYMBOL(irda_param_insert);
+
+/*
+ * Function irda_param_extract (self, buf, len, info)
+ *
+ *    Parse the single parameter at the start of buf and return how many
+ *    bytes it takes. Unknown or unhandled identifiers are skipped by
+ *    returning 2 + the length byte; other results come from the extractor.
+ */
+static int irda_param_extract(void *self, __u8 *buf, int len,
+			      pi_param_info_t *info)
+{
+	pi_minor_info_t *pi_minor_info;
+	__u8 pi_minor;
+	__u8 pi_major;
+	int type;
+	int ret = -1;
+	int n = 0;
+
+	IRDA_ASSERT(buf != NULL, return ret;);
+	IRDA_ASSERT(info != NULL, return ret;);
+
+	pi_minor = buf[n] & info->pi_mask;
+	pi_major = buf[n] >> info->pi_major_offset;
+
+	/* Check if the identifier value (pi) is valid */
+	if ((pi_major > info->len-1) ||
+	    (pi_minor > info->tables[pi_major].len-1))
+	{
+		IRDA_DEBUG(0, "%s(), no handler for parameter=0x%02x\n",
+			   __func__, buf[0]);
+
+		/* Skip this parameter */
+		return 2 + buf[n + 1];  /* Continue */
+	}
+
+	/* Lookup the info on how to parse this parameter */
+	pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor];
+
+	/* Find expected data type for this parameter identifier (pi)*/
+	type = pi_minor_info->type;
+
+	IRDA_DEBUG(3, "%s(), pi=[%d,%d], type=%d\n", __func__,
+		   pi_major, pi_minor, type);
+
+	/*  Check if handler has been implemented */
+	if (!pi_minor_info->func) {
+		IRDA_MESSAGE("%s: no handler for pi=%#x\n",
+			     __func__, buf[n]);
+		/* Skip this parameter */
+		return 2 + buf[n + 1]; /* Continue */
+	}
+
+	/* Parse parameter value */
+	ret = (*pv_extract_table[type & PV_MASK])(self, buf+n, len, buf[n],
+						  type, pi_minor_info->func);
+	return ret;
+}
+
+/*
+ * Function irda_param_extract_all (self, buf, len, info)
+ *
+ *    Run irda_param_extract over buf until no more than two bytes are
+ *    left. self and info are passed through unchanged.
+ *
+ *    Returns the total number of bytes consumed, or the first negative
+ *    result reported by irda_param_extract.
+ */
+int irda_param_extract_all(void *self, __u8 *buf, int len,
+			   pi_param_info_t *info)
+{
+	int ret = -1;
+	int n;
+
+	IRDA_ASSERT(buf != NULL, return ret;);
+	IRDA_ASSERT(info != NULL, return ret;);
+
+	/*
+	 * Walk the buffer one parameter at a time. Stop once 2 bytes or
+	 * fewer remain, or as soon as a parameter fails to parse.
+	 */
+	for (n = 0; len > 2; n += ret, len -= ret) {
+		ret = irda_param_extract(self, buf+n, len, info);
+		if (ret < 0)
+			return ret;
+	}
+
+	return n;
+}
+EXPORT_SYMBOL(irda_param_extract_all);
diff --git a/net/irda/qos.c b/net/irda/qos.c
new file mode 100644
index 00000000..1b51bcf4
--- /dev/null
+++ b/net/irda/qos.c
@@ -0,0 +1,774 @@
+/*********************************************************************
+ *
+ * Filename:      qos.c
+ * Version:       1.0
+ * Description:   IrLAP QoS parameter negotiation
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Tue Sep  9 00:00:26 1997
+ * Modified at:   Sun Jan 30 14:29:16 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2001 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ *     MA 02111-1307 USA
+ *
+ ********************************************************************/
+
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/parameters.h>
+#include <net/irda/qos.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
+
+/*
+ * Maximum values of the baud rate we negotiate with the other end.
+ * Most often, you don't have to change that, because Linux-IrDA will
+ * use the maximum offered by the link layer, which usually works fine.
+ * In some very rare cases, you may want to limit it to lower speeds...
+ */
+int sysctl_max_baud_rate = 16000000;
+/*
+ * Maximum value of the lap disconnect timer we negotiate with the other end.
+ * Most often, the value below represents the best compromise, but some users
+ * may want to keep the LAP alive longer or shorter in case of link failure.
+ * Remember that the threshold time (early warning) is fixed to 3s...
+ */
+int sysctl_max_noreply_time = 12;
+/*
+ * Minimum turn time to be applied before transmitting to the peer.
+ * Nonzero values (usec) are used as lower limit to the per-connection
+ * mtt value which was announced by the other end during negotiation.
+ * Might be helpful if the peer device provides too short mtt.
+ * Default is 10us which means using the unmodified value given by the
+ * peer except if it's 0 (0 is likely a bug in the other stack).
+ */
+unsigned sysctl_min_tx_turn_time = 10;
+/*
+ * Maximum data size to be used in transmission in payload of LAP frame.
+ * There is a bit of confusion in the IrDA spec :
+ * The LAP spec defines the payload of a LAP frame (I field) to be
+ * 2048 bytes max (IrLAP 1.1, chapt 6.6.5, p40).
+ * On the other hand, the PHY mention frames of 2048 bytes max (IrPHY
+ * 1.2, chapt 5.3.2.1, p41). But, this number includes the LAP header
+ * (2 bytes), and CRC (32 bits at 4 Mb/s). So, for the I field (LAP
+ * payload), that's only 2042 bytes. Oups !
+ * My nsc-ircc hardware has troubles receiving 2048 bytes frames at 4 Mb/s,
+ * so adjust to 2042... I don't know if this bug applies only for 2048
+ * bytes frames or all negotiated frame sizes, but you can use the sysctl
+ * to play with this value anyway.
+ * Jean II */
+unsigned sysctl_max_tx_data_size = 2042;
+/*
+ * Maximum transmit window, i.e. number of LAP frames between turn-arounds.
+ * This allows overriding what the peer told us. Some peers are buggy and
+ * don't always support what they tell us.
+ * Jean II */
+unsigned sysctl_max_tx_window = 7;
+
+static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get);
+static int irlap_param_link_disconnect(void *instance, irda_param_t *parm,
+				       int get);
+static int irlap_param_max_turn_time(void *instance, irda_param_t *param,
+				     int get);
+static int irlap_param_data_size(void *instance, irda_param_t *param, int get);
+static int irlap_param_window_size(void *instance, irda_param_t *param,
+				   int get);
+static int irlap_param_additional_bofs(void *instance, irda_param_t *parm,
+				       int get);
+static int irlap_param_min_turn_time(void *instance, irda_param_t *param,
+				     int get);
+
+#ifndef CONFIG_IRDA_DYNAMIC_WINDOW
+static __u32 irlap_requested_line_capacity(struct qos_info *qos);
+#endif
+
+static __u32 min_turn_times[]  = { 10000, 5000, 1000, 500, 100, 50, 10, 0 }; /* us */
+static __u32 baud_rates[]      = { 2400, 9600, 19200, 38400, 57600, 115200, 576000,
+				   1152000, 4000000, 16000000 };           /* bps */
+static __u32 data_sizes[]      = { 64, 128, 256, 512, 1024, 2048 };        /* bytes */
+static __u32 add_bofs[]        = { 48, 24, 12, 5, 3, 2, 1, 0 };            /* bytes */
+static __u32 max_turn_times[]  = { 500, 250, 100, 50 };                    /* ms */
+static __u32 link_disc_times[] = { 3, 8, 12, 16, 20, 25, 30, 40 };         /* secs */
+
+static __u32 max_line_capacities[10][4] = {
+       /* 500 ms     250 ms  100 ms  50 ms (max turn time) */
+	{    100,      0,      0,     0 }, /*     2400 bps */
+	{    400,      0,      0,     0 }, /*     9600 bps */
+	{    800,      0,      0,     0 }, /*    19200 bps */
+	{   1600,      0,      0,     0 }, /*    38400 bps */
+	{   2360,      0,      0,     0 }, /*    57600 bps */
+	{   4800,   2400,    960,   480 }, /*   115200 bps */
+	{  28800,  11520,   5760,  2880 }, /*   576000 bps */
+	{  57600,  28800,  11520,  5760 }, /*  1152000 bps */
+	{ 200000, 100000,  40000, 20000 }, /*  4000000 bps */
+	{ 800000, 400000, 160000, 80000 }, /* 16000000 bps */
+};
+
+static pi_minor_info_t pi_minor_call_table_type_0[] = {
+	{ NULL, 0 }, /* a NULL func means no handler for that identifier */
+/* 01 */{ irlap_param_baud_rate,       PV_INTEGER | PV_LITTLE_ENDIAN },
+	{ NULL, 0 },
+	{ NULL, 0 },
+	{ NULL, 0 },
+	{ NULL, 0 },
+	{ NULL, 0 },
+	{ NULL, 0 },
+/* 08 */{ irlap_param_link_disconnect, PV_INT_8_BITS }
+};
+
+static pi_minor_info_t pi_minor_call_table_type_1[] = {
+	{ NULL, 0 },
+	{ NULL, 0 },
+/* 82 */{ irlap_param_max_turn_time,   PV_INT_8_BITS },
+/* 83 */{ irlap_param_data_size,       PV_INT_8_BITS },
+/* 84 */{ irlap_param_window_size,     PV_INT_8_BITS },
+/* 85 */{ irlap_param_additional_bofs, PV_INT_8_BITS },
+/* 86 */{ irlap_param_min_turn_time,   PV_INT_8_BITS },
+};
+
+static pi_major_info_t pi_major_call_table[] = {
+	{ pi_minor_call_table_type_0, 9 }, /* 9 entries: minor 0x00-0x08 */
+	{ pi_minor_call_table_type_1, 7 }, /* 7 entries: minor 0x00-0x06 */
+};
+/* NOTE(review): fields look like { tables, len, pi_mask, pi_major_offset } */
+static pi_param_info_t irlap_param_info = { pi_major_call_table, 2, 0x7f, 7 };
+
+/* ---------------------- LOCAL SUBROUTINES ---------------------- */
+/* Note : we start with a bunch of local subroutines.
+ * As the compiler is "one pass", this is the only way to get them to
+ * inline properly...
+ * Jean II
+ */
+/*
+ * Function value_index (value, array, size)
+ *
+ *    Index of the first entry equal to value, or size when none matches
+ */
+static inline int value_index(__u32 value, __u32 *array, int size)
+{
+	int pos = 0;
+
+	while (pos < size && array[pos] != value)
+		pos++;
+
+	return pos;
+}
+
+/*
+ * Function index_value (index, array)
+ *
+ *    Returns array[index]. No bounds check is done here, so index must
+ *    already be valid for array.
+ */
+static inline __u32 index_value(int index, __u32 *array)
+{
+	return array[index];
+}
+
+/*
+ * Function msb_index (word)
+ *
+ *    Returns the position (0-15) of the highest set bit in word. A zero
+ *    word is reported and treated as 0x1, so the result is then 0.
+ *
+ */
+static int msb_index (__u16 word)
+{
+	int index;
+
+	/* Check for buggy peers.
+	 * Note : there is a small probability that it could be us, but I
+	 * would expect driver authors to catch that pretty early and be
+	 * able to check precisely what's going on. If an end user sees
+	 * this, it's very likely the peer. - Jean II */
+	if (word == 0) {
+		IRDA_WARNING("%s(), Detected buggy peer, adjust null PV to 0x1!\n",
+			 __func__);
+		/* The only safe choice (we don't know the array size) */
+		word = 0x1;
+	}
+
+	/* Scan down from bit 15; word is non-zero here so a bit is found */
+	for (index = 15; index >= 0; index--) {
+		if (word & (1 << index))
+			break;   /* Found it! */
+	}
+
+	return index;
+}
+
+/*
+ * Function value_lower_bits (value, array, size, field)
+ *
+ *    Marks in *field each entry up to and including the first one that is
+ *    >= value and returns that entry's index (size - 1 if there is none).
+ */
+static inline int value_lower_bits(__u32 value, __u32 *array, int size, __u16 *field)
+{
+	int	idx = 0;
+	__u16	bit;
+	__u16	acc = 0x0;
+
+	/* bit is the mask for entry idx; both advance together */
+	for (bit = 0x1; idx < size; bit <<= 1, idx++) {
+		acc |= bit;
+		/* Finished ? */
+		if (array[idx] >= value)
+			break;
+	}
+	*field = acc;
+	/* Ran off the end: send back a valid index (last item) */
+	if (idx >= size)
+		return size - 1;
+	return idx;
+}
+
+/*
+ * Function value_highest_bit (value, array, size, field)
+ *
+ *    Finds the first entry <= value, sets only that entry's bit in *field
+ *    and returns its index. If no entry qualifies the index is clamped to
+ *    size - 1, but the bit has been shifted size times.
+ */
+static inline int value_highest_bit(__u32 value, __u32 *array, int size, __u16 *field)
+{
+	int	idx = 0;
+	__u16	bit = 0x1;
+
+	/* Skip entries still above value, moving the bit along with idx */
+	while (idx < size && array[idx] > value) {
+		bit <<= 1;
+		idx++;
+	}
+
+	/* Exactly one bit is reported */
+	*field = bit;
+
+	/* Send back a valid index */
+	if (idx >= size)
+		idx = size - 1;	/* Last item */
+	return idx;
+}
+
+/* -------------------------- MAIN CALLS -------------------------- */
+
+/*
+ * Function irda_qos_compute_intersection (qos, new)
+ *
+ *    AND each bit field of new into qos (new is left untouched), then
+ *    refresh the values of qos with irda_qos_bits_to_value
+ */
+void irda_qos_compute_intersection(struct qos_info *qos, struct qos_info *new)
+{
+	IRDA_ASSERT(qos != NULL, return;);
+	IRDA_ASSERT(new != NULL, return;);
+
+	/* Apply */
+	qos->baud_rate.bits       &= new->baud_rate.bits;
+	qos->window_size.bits     &= new->window_size.bits;
+	qos->min_turn_time.bits   &= new->min_turn_time.bits;
+	qos->max_turn_time.bits   &= new->max_turn_time.bits;
+	qos->data_size.bits       &= new->data_size.bits;
+	qos->link_disc_time.bits  &= new->link_disc_time.bits;
+	qos->additional_bofs.bits &= new->additional_bofs.bits;
+
+	irda_qos_bits_to_value(qos);
+}
+
+/*
+ * Function irda_init_max_qos_capabilies (qos)
+ *
+ *    Fill the bit fields of qos with the widest settings, limited for baud
+ *    rate and disconnect time by the sysctls, which are themselves rounded
+ *    to a table value here. Callers can then "and in" their own limits
+ */
+void irda_init_max_qos_capabilies(struct qos_info *qos)
+{
+	int i;
+	/*
+	 *  These are the maximum supported values as specified on pages
+	 *  39-43 in IrLAP
+	 */
+
+	/* Use sysctl to set some configurable values... */
+	/* Set configured max speed */
+	i = value_lower_bits(sysctl_max_baud_rate, baud_rates, 10,
+			     &qos->baud_rate.bits);
+	sysctl_max_baud_rate = index_value(i, baud_rates);
+
+	/* Set configured max disc time */
+	i = value_lower_bits(sysctl_max_noreply_time, link_disc_times, 8,
+			     &qos->link_disc_time.bits);
+	sysctl_max_noreply_time = index_value(i, link_disc_times);
+
+	/* LSB is first byte, MSB is second byte */
+	qos->baud_rate.bits    &= 0x03ff;
+
+	qos->window_size.bits     = 0x7f;
+	qos->min_turn_time.bits   = 0xff;
+	qos->max_turn_time.bits   = 0x0f;
+	qos->data_size.bits       = 0x3f;
+	qos->link_disc_time.bits &= 0xff;
+	qos->additional_bofs.bits = 0xff;
+}
+EXPORT_SYMBOL(irda_init_max_qos_capabilies);
+
+/*
+ * Function irlap_adjust_qos_settings (qos)
+ *
+ *     Adjust QoS settings in case some values are not possible to use because
+ *     of other settings: raise a too small min turn time, force the max turn
+ *     time to 500 ms below 115200 bps, shrink window and data size until they
+ *     fit the line capacity, then apply the sysctl tx limits.
+ */
+static void irlap_adjust_qos_settings(struct qos_info *qos)
+{
+	__u32 line_capacity;
+	int index;
+
+	IRDA_DEBUG(2, "%s()\n", __func__);
+
+	/*
+	 * Make sure the mintt is sensible.
+	 * Main culprit : Ericsson T39. - Jean II
+	 */
+	if (sysctl_min_tx_turn_time > qos->min_turn_time.value) {
+		int i;
+
+		IRDA_WARNING("%s(), Detected buggy peer, adjust mtt to %dus!\n",
+			 __func__, sysctl_min_tx_turn_time);
+
+		/* We don't really need bits, but easier this way */
+		i = value_highest_bit(sysctl_min_tx_turn_time, min_turn_times,
+				      8, &qos->min_turn_time.bits);
+		sysctl_min_tx_turn_time = index_value(i, min_turn_times);
+		qos->min_turn_time.value = sysctl_min_tx_turn_time;
+	}
+
+	/*
+	 * Not allowed to use a max turn time less than 500 ms if the baudrate
+	 * is less than 115200
+	 */
+	if ((qos->baud_rate.value < 115200) &&
+	    (qos->max_turn_time.value < 500)) {
+		IRDA_DEBUG(0,
+			   "%s(), adjusting max turn time from %d to 500 ms\n",
+			   __func__, qos->max_turn_time.value);
+		qos->max_turn_time.value = 500;
+	}
+
+	/*
+	 * The data size must be adjusted according to the baud rate and max
+	 * turn time. index is where the current size sits in data_sizes[]
+	 * (6 when not found), so the next smaller one is data_sizes[index-1]
+	 */
+	index = value_index(qos->data_size.value, data_sizes, 6);
+	line_capacity = irlap_max_line_capacity(qos->baud_rate.value,
+						qos->max_turn_time.value);
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+	while ((qos->data_size.value > line_capacity) && (index > 0)) {
+		qos->data_size.value = data_sizes[--index];
+		IRDA_DEBUG(2, "%s(), reducing data size to %d\n",
+			   __func__, qos->data_size.value);
+	}
+#else /* Use method described in section 6.6.11 of IrLAP */
+	while (irlap_requested_line_capacity(qos) > line_capacity) {
+		IRDA_ASSERT(index != 0, return;);
+
+		/* Must be able to send at least one frame */
+		if (qos->window_size.value > 1) {
+			qos->window_size.value--;
+			IRDA_DEBUG(2, "%s(), reducing window size to %d\n",
+				   __func__, qos->window_size.value);
+		} else if (index > 1) {
+			qos->data_size.value = data_sizes[--index];
+			IRDA_DEBUG(2, "%s(), reducing data size to %d\n",
+				   __func__, qos->data_size.value);
+		} else {
+			IRDA_WARNING("%s(), nothing more we can do!\n",
+				     __func__);
+			/* Nothing changes any more: leave or spin forever */
+			break;
+		}
+	}
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+	/* Fix tx data size according to user limits - Jean II */
+	if (qos->data_size.value > sysctl_max_tx_data_size)
+		/* Allow non discrete adjustement to avoid losing capacity */
+		qos->data_size.value = sysctl_max_tx_data_size;
+	/* Override Tx window if user request it. - Jean II */
+	if (qos->window_size.value > sysctl_max_tx_window)
+		qos->window_size.value = sysctl_max_tx_window;
+}
+
+/*
+ * Function irlap_qos_negotiate (self, skb)
+ *
+ *    Run the parameters in skb through the irlap_param_* handlers, then
+ *    convert qos_tx/qos_rx bits to values and adjust qos_tx. Returns the
+ *    irda_param_extract_all result; the rest runs even when it is < 0
+ */
+int irlap_qos_negotiate(struct irlap_cb *self, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = irda_param_extract_all(self, skb->data, skb->len,
+				     &irlap_param_info);
+
+	/* Convert the negotiated bits to values */
+	irda_qos_bits_to_value(&self->qos_tx);
+	irda_qos_bits_to_value(&self->qos_rx);
+
+	irlap_adjust_qos_settings(&self->qos_tx);
+
+	IRDA_DEBUG(2, "Setting BAUD_RATE to %d bps.\n",
+		   self->qos_tx.baud_rate.value);
+	IRDA_DEBUG(2, "Setting DATA_SIZE to %d bytes\n",
+		   self->qos_tx.data_size.value);
+	IRDA_DEBUG(2, "Setting WINDOW_SIZE to %d\n",
+		   self->qos_tx.window_size.value);
+	IRDA_DEBUG(2, "Setting XBOFS to %d\n",
+		   self->qos_tx.additional_bofs.value);
+	IRDA_DEBUG(2, "Setting MAX_TURN_TIME to %d ms.\n",
+		   self->qos_tx.max_turn_time.value);
+	IRDA_DEBUG(2, "Setting MIN_TURN_TIME to %d usecs.\n",
+		   self->qos_tx.min_turn_time.value);
+	IRDA_DEBUG(2, "Setting LINK_DISC to %d secs.\n",
+		   self->qos_tx.link_disc_time.value);
+	return ret;
+}
+
+/*
+ * Function irlap_insert_qos_negotiation_params (self, skb)
+ *
+ *    Insert the seven QoS negotiation parameters at the tail of skb, in
+ *    the order below. Returns 0, or the first negative insert result
+ */
+int irlap_insert_qos_negotiation_params(struct irlap_cb *self,
+					struct sk_buff *skb)
+{
+	int ret;
+
+	/* Insert data rate */
+	ret = irda_param_insert(self, PI_BAUD_RATE, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	/* Insert max turnaround time */
+	ret = irda_param_insert(self, PI_MAX_TURN_TIME, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	/* Insert data size */
+	ret = irda_param_insert(self, PI_DATA_SIZE, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	/* Insert window size */
+	ret = irda_param_insert(self, PI_WINDOW_SIZE, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	/* Insert additional BOFs */
+	ret = irda_param_insert(self, PI_ADD_BOFS, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	/* Insert minimum turnaround time */
+	ret = irda_param_insert(self, PI_MIN_TURN_TIME, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	/* Insert link disconnect/threshold time */
+	ret = irda_param_insert(self, PI_LINK_DISC, skb_tail_pointer(skb),
+				skb_tailroom(skb), &irlap_param_info);
+	if (ret < 0)
+		return ret;
+	skb_put(skb, ret);
+
+	return 0;
+}
+
+/*
+ * Function irlap_param_baud_rate (instance, param, get)
+ *
+ *    Negotiate data-rate: a get reports our rx bits, a put keeps only the
+ *    bits set on both sides and stores them for tx and rx alike
+ */
+static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+	__u16 final;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (get) {
+		param->pv.i = self->qos_rx.baud_rate.bits;
+		IRDA_DEBUG(2, "%s(), baud rate = 0x%02x\n",
+			   __func__, param->pv.i);
+		return 0;
+	}
+
+	/*
+	 *  Stations must agree on baud rate, so calculate
+	 *  intersection
+	 */
+	IRDA_DEBUG(2, "Requested BAUD_RATE: 0x%04x\n", (__u16) param->pv.i);
+	final = (__u16) param->pv.i & self->qos_rx.baud_rate.bits;
+
+	IRDA_DEBUG(2, "Final BAUD_RATE: 0x%04x\n", final);
+	self->qos_rx.baud_rate.bits = final;
+	self->qos_tx.baud_rate.bits = final;
+
+	return 0;
+}
+
+/*
+ * Function irlap_param_link_disconnect (instance, param, get)
+ *
+ *    Negotiate link disconnect/threshold time. Shared by both stations,
+ *    so a put stores the same intersection in qos_tx and qos_rx
+ */
+static int irlap_param_link_disconnect(void *instance, irda_param_t *param,
+				       int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+	__u16 final;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (get) {
+		param->pv.i = self->qos_rx.link_disc_time.bits;
+		return 0;
+	}
+
+	/*
+	 *  Stations must agree on link disconnect/threshold time, so
+	 *  only the bits set on both sides survive.
+	 */
+	IRDA_DEBUG(2, "LINK_DISC: %02x\n", (__u8) param->pv.i);
+	final = (__u8) param->pv.i & self->qos_rx.link_disc_time.bits;
+
+	IRDA_DEBUG(2, "Final LINK_DISC: %02x\n", final);
+	self->qos_rx.link_disc_time.bits = final;
+	self->qos_tx.link_disc_time.bits = final;
+	return 0;
+}
+
+/*
+ * Function irlap_param_max_turn_time (instance, param, get)
+ *
+ *    Type 1 parameter: each station keeps its own maximum turnaround time
+ */
+static int irlap_param_max_turn_time(void *instance, irda_param_t *param,
+				     int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (!get) {
+		/* Not a get: take the low 8 bits as the tx setting */
+		self->qos_tx.max_turn_time.bits = (__u8) param->pv.i;
+		return 0;
+	}
+
+	param->pv.i = self->qos_rx.max_turn_time.bits;
+	return 0;
+}
+
+/*
+ * Function irlap_param_data_size (instance, param, get)
+ *
+ *    Type 1 parameter: each station keeps its own data size
+ */
+static int irlap_param_data_size(void *instance, irda_param_t *param, int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (!get) {
+		/* Not a get: take the low 8 bits as the tx setting */
+		self->qos_tx.data_size.bits = (__u8) param->pv.i;
+		return 0;
+	}
+
+	param->pv.i = self->qos_rx.data_size.bits;
+	return 0;
+}
+
+/*
+ * Function irlap_param_window_size (instance, param, get)
+ *
+ *    Type 1 parameter: each station keeps its own window size
+ */
+static int irlap_param_window_size(void *instance, irda_param_t *param,
+				   int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (!get) {
+		/* Not a get: take the low 8 bits as the tx setting */
+		self->qos_tx.window_size.bits = (__u8) param->pv.i;
+		return 0;
+	}
+
+	param->pv.i = self->qos_rx.window_size.bits;
+	return 0;
+}
+
+/*
+ * Function irlap_param_additional_bofs (instance, param, get)
+ *    Type 1 parameter: each station keeps its own additional BOF count
+ */
+static int irlap_param_additional_bofs(void *instance, irda_param_t *param, int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (!get) {
+		/* Not a get: take the low 8 bits as the tx setting */
+		self->qos_tx.additional_bofs.bits = (__u8) param->pv.i;
+		return 0;
+	}
+
+	param->pv.i = self->qos_rx.additional_bofs.bits;
+	return 0;
+}
+
+/*
+ * Function irlap_param_min_turn_time (instance, param, get)
+ *    Type 1 parameter: each station keeps its own minimum turn around time
+ */
+static int irlap_param_min_turn_time(void *instance, irda_param_t *param,
+				     int get)
+{
+	struct irlap_cb *self = (struct irlap_cb *) instance;
+
+	IRDA_ASSERT(self != NULL, return -1;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;);
+
+	if (!get) {
+		/* Not a get: take the low 8 bits as the tx setting */
+		self->qos_tx.min_turn_time.bits = (__u8) param->pv.i;
+		return 0;
+	}
+
+	param->pv.i = self->qos_rx.min_turn_time.bits;
+	return 0;
+}
+
+/*
+ * Function irlap_max_line_capacity (speed, max_turn_time)
+ *    Maximum line capacity from the table; 0 if either value is not listed
+ */
+__u32 irlap_max_line_capacity(__u32 speed, __u32 max_turn_time)
+{
+	__u32 line_capacity;
+	int i,j;
+
+	IRDA_DEBUG(2, "%s(), speed=%d, max_turn_time=%d\n",
+		   __func__, speed, max_turn_time);
+
+	i = value_index(speed, baud_rates, 10);
+	j = value_index(max_turn_time, max_turn_times, 4);
+
+	IRDA_ASSERT(((i >=0) && (i <10)), return 0;);
+	IRDA_ASSERT(((j >=0) && (j <4)), return 0;);
+	/* value_index gives the table size on a miss; never index with it */
+	if (i >= 10 || j >= 4)
+		return 0;
+
+	line_capacity = max_line_capacities[i][j];
+	IRDA_DEBUG(2, "%s(), line capacity=%d bytes\n",
+		   __func__, line_capacity);
+
+	return line_capacity;
+}
+
+#ifndef CONFIG_IRDA_DYNAMIC_WINDOW
+static __u32 irlap_requested_line_capacity(struct qos_info *qos)
+{
+	__u32 line_capacity;
+	/* window_size frames of (data + 6 + extra BOFs), plus the min turn time */
+	line_capacity = qos->window_size.value *
+		(qos->data_size.value + 6 + qos->additional_bofs.value) +
+		irlap_min_turn_time_in_bytes(qos->baud_rate.value,
+					     qos->min_turn_time.value);
+
+	IRDA_DEBUG(2, "%s(), requested line capacity=%d\n",
+		   __func__, line_capacity);
+
+	return line_capacity;
+}
+#endif
+
+/*
+ * Turn each bit field of qos into its value: the highest set bit selects the
+ * table entry (window size is that bit's position + 1). Bits beyond the end
+ * of a table are ignored so a field can never index outside its table.
+ */
+void irda_qos_bits_to_value(struct qos_info *qos)
+{
+	int index;
+
+	IRDA_ASSERT(qos != NULL, return;);
+
+	index = msb_index(qos->baud_rate.bits & 0x03ff);       /* 10 entries */
+	qos->baud_rate.value = baud_rates[index];
+	index = msb_index(qos->data_size.bits & 0x3f);         /* 6 entries */
+	qos->data_size.value = data_sizes[index];
+	/* No table for the window size, so nothing to mask */
+	index = msb_index(qos->window_size.bits);
+	qos->window_size.value = index+1;
+	index = msb_index(qos->min_turn_time.bits & 0xff);     /* 8 entries */
+	qos->min_turn_time.value = min_turn_times[index];
+	index = msb_index(qos->max_turn_time.bits & 0x0f);     /* 4 entries */
+	qos->max_turn_time.value = max_turn_times[index];
+	index = msb_index(qos->link_disc_time.bits & 0xff);    /* 8 entries */
+	qos->link_disc_time.value = link_disc_times[index];
+	index = msb_index(qos->additional_bofs.bits & 0xff);   /* 8 entries */
+	qos->additional_bofs.value = add_bofs[index];
+}
+EXPORT_SYMBOL(irda_qos_bits_to_value);
diff --git a/net/irda/timer.c b/net/irda/timer.c
new file mode 100644
index 00000000..f418cb2a
--- /dev/null
+++ b/net/irda/timer.c
@@ -0,0 +1,232 @@
+/*********************************************************************
+ *
+ * Filename:      timer.c
+ * Version:
+ * Description:
+ * Status:        Experimental.
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Sat Aug 16 00:59:29 1997
+ * Modified at:   Wed Dec  8 12:50:34 1999
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ *
+ *     Copyright (c) 1997, 1999 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2002 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <asm/system.h>
+#include <linux/delay.h>
+
+#include <net/irda/timer.h>
+#include <net/irda/irda.h>
+#include <net/irda/irda_device.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlmp.h>
+
+extern int  sysctl_slot_timeout;
+
+static void irlap_slot_timer_expired(void* data);
+static void irlap_query_timer_expired(void* data);
+static void irlap_final_timer_expired(void* data);
+static void irlap_wd_timer_expired(void* data);
+static void irlap_backoff_timer_expired(void* data);
+static void irlap_media_busy_expired(void* data);
+
+void irlap_start_slot_timer(struct irlap_cb *self, int timeout)
+{
+	irda_start_timer(&self->slot_timer, timeout, self,
+			 irlap_slot_timer_expired);	/* handler gets self */
+}
+
+void irlap_start_query_timer(struct irlap_cb *self, int S, int s)
+{
+	int timeout;
+
+	/* Calculate when the peer discovery should end. Normally, we
+	 * get the end-of-discovery frame, so this is just in case
+	 * we miss it.
+	 * Basically, we multiply the number of remaining slots (S - s) by
+	 * our slot time, plus add some extra time to properly receive the
+	 * last discovery packet (which is longer due to extra discovery info),
+	 * to avoid interfering with incoming connection requests and
+	 * to accommodate devices that perform discovery slower than us.
+	 * Jean II */
+	timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s)
+		   + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT);
+
+	/* Set or re-set the timer. We reset the timer for each received
+	 * discovery query, which allows us to automatically adjust to
+	 * the speed of the peer discovery (faster or slower). Jean II */
+	irda_start_timer( &self->query_timer, timeout, (void *) self,
+			  irlap_query_timer_expired);
+}
+
+void irlap_start_final_timer(struct irlap_cb *self, int timeout)
+{
+	irda_start_timer(&self->final_timer, timeout, self,
+			 irlap_final_timer_expired);	/* handler gets self */
+}
+
+void irlap_start_wd_timer(struct irlap_cb *self, int timeout)
+{
+	irda_start_timer(&self->wd_timer, timeout, self,
+			 irlap_wd_timer_expired);	/* handler gets self */
+}
+
+void irlap_start_backoff_timer(struct irlap_cb *self, int timeout)
+{
+	irda_start_timer(&self->backoff_timer, timeout, self,
+			 irlap_backoff_timer_expired);	/* handler gets self */
+}
+
+void irlap_start_mbusy_timer(struct irlap_cb *self, int timeout)
+{
+	irda_start_timer(&self->media_busy_timer, timeout, self,
+			 irlap_media_busy_expired);	/* handler gets self */
+}
+
+void irlap_stop_mbusy_timer(struct irlap_cb *self)
+{
+	/* If timer is activated, kill it! */
+	del_timer(&self->media_busy_timer);
+
+	/* If we are in NDM, there are a bunch of events in LAP that
+	 * may be pending due to the media_busy condition, such as
+	 * CONNECT_REQUEST and SEND_UI_FRAME. If we don't generate
+	 * an event, they will wait forever...
+	 * Jean II */
+	if (self->state == LAP_NDM)
+		irlap_do_event(self, MEDIA_BUSY_TIMER_EXPIRED, NULL, NULL);
+}
+
+void irlmp_start_watchdog_timer(struct lsap_cb *self, int timeout)
+{
+	irda_start_timer(&self->watchdog_timer, timeout, self,
+			 irlmp_watchdog_timer_expired);	/* data is self */
+}
+
+void irlmp_start_discovery_timer(struct irlmp_cb *self, int timeout)
+{
+	irda_start_timer(&self->discovery_timer, timeout, self,
+			 irlmp_discovery_timer_expired);	/* data is self */
+}
+
+void irlmp_start_idle_timer(struct lap_cb *self, int timeout)
+{
+	irda_start_timer(&self->idle_timer, timeout, self,
+			 irlmp_idle_timer_expired);	/* data is self */
+}
+
+void irlmp_stop_idle_timer(struct lap_cb *self)
+{
+	/* Deactivate the idle timer (no event is generated here) */
+	del_timer(&self->idle_timer);
+}
+
+/*
+ * Function irlap_slot_timer_expired (data)
+ *
+ *    IrLAP slot timer has expired: pass SLOT_TIMER_EXPIRED to
+ *    irlap_do_event() for the irlap_cb handed over in data
+ */
+static void irlap_slot_timer_expired(void *data)
+{
+	struct irlap_cb *self = (struct irlap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, SLOT_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irlap_query_timer_expired (data)
+ *
+ *    IrLAP query timer has expired: pass QUERY_TIMER_EXPIRED to
+ *    irlap_do_event() for the irlap_cb handed over in data
+ */
+static void irlap_query_timer_expired(void *data)
+{
+	struct irlap_cb *self = (struct irlap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, QUERY_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irlap_final_timer_expired (data)
+ *
+ *    IrLAP final timer has expired: pass FINAL_TIMER_EXPIRED to
+ *    irlap_do_event() for the irlap_cb handed over in data
+ */
+static void irlap_final_timer_expired(void *data)
+{
+	struct irlap_cb *self = (struct irlap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, FINAL_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irlap_wd_timer_expired (data)
+ *
+ *    IrLAP watchdog (wd) timer has expired: pass WD_TIMER_EXPIRED to
+ *    irlap_do_event() for the irlap_cb handed over in data
+ */
+static void irlap_wd_timer_expired(void *data)
+{
+	struct irlap_cb *self = (struct irlap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, WD_TIMER_EXPIRED, NULL, NULL);
+}
+
+/*
+ * Function irlap_backoff_timer_expired (data)
+ *
+ *    IrLAP backoff timer has expired: pass BACKOFF_TIMER_EXPIRED to
+ *    irlap_do_event() for the irlap_cb handed over in data
+ */
+static void irlap_backoff_timer_expired(void *data)
+{
+	struct irlap_cb *self = (struct irlap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
+
+	irlap_do_event(self, BACKOFF_TIMER_EXPIRED, NULL, NULL);
+}
+
+
+/*
+ * Function irlap_media_busy_expired (data)
+ *
+ *    Media busy timer has expired: clear the media busy flag of the device
+ */
+static void irlap_media_busy_expired(void *data)
+{
+	struct irlap_cb *self = (struct irlap_cb *) data;
+
+	IRDA_ASSERT(self != NULL, return;);
+
+	irda_device_set_media_busy(self->netdev, FALSE);
+	/* Note : the LAP event will be sent in irlap_stop_mbusy_timer(),
+	 * to catch other cases where the flag is cleared (for example
+	 * after a discovery) - Jean II */
+}
diff --git a/net/irda/wrapper.c b/net/irda/wrapper.c
new file mode 100644
index 00000000..fd0995b1
--- /dev/null
+++ b/net/irda/wrapper.c
@@ -0,0 +1,492 @@
+/*********************************************************************
+ *
+ * Filename:      wrapper.c
+ * Version:       1.2
+ * Description:   IrDA SIR async wrapper layer
+ * Status:        Stable
+ * Author:        Dag Brattli <dagb@cs.uit.no>
+ * Created at:    Mon Aug  4 20:40:53 1997
+ * Modified at:   Fri Jan 28 13:21:09 2000
+ * Modified by:   Dag Brattli <dagb@cs.uit.no>
+ * Modified at:   Fri May 28  3:11 CST 1999
+ * Modified by:   Horst von Brand <vonbrand@sleipnir.valparaiso.cl>
+ *
+ *     Copyright (c) 1998-2000 Dag Brattli <dagb@cs.uit.no>,
+ *     All Rights Reserved.
+ *     Copyright (c) 2000-2002 Jean Tourrilhes <jt@hpl.hp.com>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of
+ *     the License, or (at your option) any later version.
+ *
+ *     Neither Dag Brattli nor University of Tromsø admit liability nor
+ *     provide warranty for any of this software. This material is
+ *     provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/byteorder.h>
+
+#include <net/irda/irda.h>
+#include <net/irda/wrapper.h>
+#include <net/irda/crc.h>
+#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
+#include <net/irda/irda_device.h>
+
+/************************** FRAME WRAPPING **************************/
+/*
+ * Wrap and stuff SIR frames
+ *
+ * Note : at FIR and MIR, HDLC framing is used and usually handled
+ * by the controller, so we come here only for SIR... Jean II
+ */
+
+/*
+ * Function stuff_byte (byte, buf)
+ *
+ *    Byte-stuff one single byte into the buffer pointed to by buf and
+ *    return how many bytes were written there (1 or 2). The buffer must
+ *    at all times be able to have two bytes inserted.
+ *
+ * This is in a tight loop, better inline it, so need to be prior to callers.
+ * (2000 bytes on P6 200MHz, non-inlined ~370us, inline ~170us) - Jean II
+ */
+static inline int stuff_byte(__u8 byte, __u8 *buf)
+{
+	/* Non-special value, no transparency required */
+	if (byte != BOF && byte != EOF && byte != CE) {
+		buf[0] = byte;
+		return 1;
+	}
+
+	/*
+	 * BOF, EOF and CE are inserted transparently coded: first the
+	 * link escape CE, then the original byte XORed with IRDA_TRANS
+	 * (complements bit 5).
+	 */
+	buf[0] = CE;
+	buf[1] = byte ^ IRDA_TRANS;
+	return 2;
+}
+
+/*
+ * Function async_wrap_skb (skb, tx_buff, buffsize)
+ *
+ *    Wrap skb into tx_buff as XBOFs, BOF, stuffed data, stuffed FCS and EOF
+ *    and return the number of bytes written, never more than buffsize. A
+ *    frame that does not fit is cut short; 0 means no frame fits at all.
+ */
+int async_wrap_skb(struct sk_buff *skb, __u8 *tx_buff, int buffsize)
+{
+	struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb;
+	int xbofs, i;
+	int n = 0;
+	union {
+		__u16 value;
+		__u8 bytes[2];
+	} fcs;
+
+	fcs.value = INIT_FCS;
+
+	/* Send XBOFs for the required min. turn time and for the negotiated
+	 * additional XBOFs */
+	if (cb->magic != LAP_MAGIC) {
+		/*
+		 * This will happen for all frames sent from user-space.
+		 * Nothing to worry about: use the default number of XBOFs.
+		 */
+		IRDA_DEBUG(1, "%s(), wrong magic in skb!\n", __func__);
+		xbofs = 10;
+	} else
+		xbofs = cb->xbofs + cb->xbofs_delay;
+
+	IRDA_DEBUG(4, "%s(), xbofs=%d\n", __func__, xbofs);
+
+	/* Check that we never use more than 115 + 48 xbofs */
+	if (xbofs > 163) {
+		IRDA_DEBUG(0, "%s(), too many xbofs (%d)\n", __func__, xbofs);
+		xbofs = 163;
+	}
+
+	/* Even an empty frame needs XBOFs + BOF + up to 4 FCS bytes + EOF */
+	if (buffsize < xbofs + 6) {
+		IRDA_ERROR("%s(), tx buffer too small (%d)\n",
+			   __func__, buffsize);
+		return 0;
+	}
+
+	memset(tx_buff + n, XBOF, xbofs);
+	n += xbofs;
+
+	/* Start of packet character BOF */
+	tx_buff[n++] = BOF;
+
+	/* Insert frame and calc CRC */
+	for (i=0; i < skb->len; i++) {
+		/*
+		 *  Check for the possibility of tx buffer overflow. Up to
+		 *  7 more bytes can follow from here: this byte (2 when
+		 *  stuffed), the FCS (4 when stuffed) and EOF (1).
+		 */
+		if (n > (buffsize-7)) {
+			IRDA_ERROR("%s(), tx buffer overflow (n=%d)\n",
+				   __func__, n);
+			return n;
+		}
+
+		n += stuff_byte(skb->data[i], tx_buff+n);
+		fcs.value = irda_fcs(fcs.value, skb->data[i]);
+	}
+
+	/* Insert CRC in little endian format (LSB first) */
+	fcs.value = ~fcs.value;
+#ifdef __LITTLE_ENDIAN
+	n += stuff_byte(fcs.bytes[0], tx_buff+n);
+	n += stuff_byte(fcs.bytes[1], tx_buff+n);
+#else /* ifdef __BIG_ENDIAN */
+	n += stuff_byte(fcs.bytes[1], tx_buff+n);
+	n += stuff_byte(fcs.bytes[0], tx_buff+n);
+#endif
+	tx_buff[n++] = EOF;
+
+	return n;
+}
+EXPORT_SYMBOL(async_wrap_skb);
+
+/************************* FRAME UNWRAPPING *************************/
+/*
+ * Unwrap and unstuff SIR frames
+ *
+ * Complete rewrite by Jean II :
+ * More inline, faster, more compact, more logical. Jean II
+ * (16 bytes on P6 200MHz, old 5 to 7 us, new 4 to 6 us)
+ * (24 bytes on P6 200MHz, old 9 to 10 us, new 7 to 8 us)
+ * (for reference, 115200 b/s is 1 byte every 69 us)
+ * And reduce wrapper.o by ~900B in the process ;-)
+ *
+ * Then, we have the addition of ZeroCopy, which is optional
+ * (i.e. the driver must initiate it) and improve final processing.
+ * (2005 B frame + EOF on P6 200MHz, without 30 to 50 us, with 10 to 25 us)
+ *
+ * Note : at FIR and MIR, HDLC framing is used and usually handled
+ * by the controller, so we come here only for SIR... Jean II
+ */
+
+/*
+ * We can also choose where we want to do the CRC calculation. We can
+ * do it "inline", as we receive the bytes, or "postponed", when
+ * receiving the End-Of-Frame.
+ * (16 bytes on P6 200MHz, inlined 4 to 6 us, postponed 4 to 5 us)
+ * (24 bytes on P6 200MHz, inlined 7 to 8 us, postponed 5 to 7 us)
+ * With ZeroCopy :
+ * (2005 B frame on P6 200MHz, inlined 10 to 25 us, postponed 140 to 180 us)
+ * Without ZeroCopy :
+ * (2005 B frame on P6 200MHz, inlined 30 to 50 us, postponed 150 to 180 us)
+ * (Note : numbers taken with irq disabled)
+ *
+ * From those numbers, it's not clear which is the best strategy, because
+ * we end up running through a lot of data one way or another (i.e. cache
+ * misses). I personally prefer to avoid the huge latency spike of the
+ * "postponed" solution, because it comes just at the time when we have
+ * lots of protocol processing to do and it will hurt our ability to
+ * reach low link turnaround times... Jean II
+ */
+//#define POSTPONE_RX_CRC
+
+/*
+ * Function async_bump (dev, stats, rx_buff)
+ *
+ *    Got a frame, make a copy of it, and pass it up the stack! We can try
+ *    to inline it since it's only called from async_unwrap_eof
+ */
+static inline void
+async_bump(struct net_device *dev,
+	   struct net_device_stats *stats,
+	   iobuff_t *rx_buff)
+{
+	struct sk_buff *newskb;
+	struct sk_buff *dataskb;
+	int		docopy;
+
+	/* Check if we need to copy the data to a new skb or not.
+	 * If the driver doesn't use ZeroCopy Rx, we have to do it.
+	 * With ZeroCopy Rx, the rx_buff already points to a valid
+	 * skb. But, if the frame is small, it is more efficient to
+	 * copy it to save memory (copy will be fast anyway - that's
+	 * called Rx-copy-break). Jean II */
+	docopy = ((rx_buff->skb == NULL) ||
+		  (rx_buff->len < IRDA_RX_COPY_THRESHOLD));
+
+	/* Allocate a new skb: the copy target, or the next Rx buffer */
+	newskb = dev_alloc_skb(docopy ? rx_buff->len + 1 : rx_buff->truesize);
+	if (!newskb)  {
+		stats->rx_dropped++;
+		/* We could deliver the current skb if doing ZeroCopy Rx,
+		 * but this would stall the Rx path. Better drop the
+		 * packet... Jean II */
+		return;
+	}
+
+	/* Align IP header to 20 bytes (i.e. increase skb->data)
+	 * Note this is only useful with IrLAN, as PPP has a variable
+	 * header size (2 or 1 bytes) - Jean II */
+	skb_reserve(newskb, 1);
+
+	if(docopy) {
+		/* Copy data without CRC (length already checked) */
+		skb_copy_to_linear_data(newskb, rx_buff->data,
+					rx_buff->len - 2);
+		/* Deliver this skb */
+		dataskb = newskb;
+	} else {
+		/* We are using ZeroCopy. Deliver old skb */
+		dataskb = rx_buff->skb;
+		/* And hook the new skb to the rx_buff */
+		rx_buff->skb = newskb;
+		rx_buff->head = newskb->data;	/* NOT newskb->head */
+		//printk(KERN_DEBUG "ZeroCopy : len = %d, dataskb = %p, newskb = %p\n", rx_buff->len, dataskb, newskb);
+	}
+
+	/* Set proper length on skb (without CRC) */
+	skb_put(dataskb, rx_buff->len - 2);
+
+	/* Feed it to IrLAP layer */
+	dataskb->dev = dev;
+	skb_reset_mac_header(dataskb);
+	dataskb->protocol = htons(ETH_P_IRDA);
+
+	netif_rx(dataskb);
+
+	stats->rx_packets++;
+	stats->rx_bytes += rx_buff->len;	/* still includes the 2 CRC bytes */
+
+	/* Clean up rx_buff (redundant with async_unwrap_bof() ???) */
+	rx_buff->data = rx_buff->head;
+	rx_buff->len = 0;
+}
+
+/*
+ * Function async_unwrap_bof(dev, stats, rx_buff, byte)
+ *
+ *    Handle Beginning Of Frame character received within a frame:
+ *    whatever the current state, the receive buffer is reset for a new frame
+ */
+static inline void
+async_unwrap_bof(struct net_device *dev,
+		 struct net_device_stats *stats,
+		 iobuff_t *rx_buff, __u8 byte)
+{
+	switch(rx_buff->state) {
+	case LINK_ESCAPE:
+	case INSIDE_FRAME:
+		/* Not supposed to happen, the previous frame is not
+		 * finished - Jean II */
+		IRDA_DEBUG(1, "%s(), Discarding incomplete frame\n",
+			   __func__);
+		stats->rx_errors++;
+		stats->rx_missed_errors++;
+		irda_device_set_media_busy(dev, TRUE);
+		break;
+
+	case OUTSIDE_FRAME:
+	case BEGIN_FRAME:
+	default:
+		/* We may receive multiple BOF at the start of frame */
+		break;
+	}
+
+	/* Now receiving frame */
+	rx_buff->state = BEGIN_FRAME;
+	rx_buff->in_frame = TRUE;
+
+	/* Time to initialize receive buffer */
+	rx_buff->data = rx_buff->head;
+	rx_buff->len = 0;
+	rx_buff->fcs = INIT_FCS;
+}
+
+/*
+ * Function async_unwrap_eof(dev, stats, rx_buff, byte)
+ *
+ *    Handle End Of Frame character received within a frame: check the
+ *    FCS and deliver the frame through async_bump() if it is good
+ */
+static inline void
+async_unwrap_eof(struct net_device *dev,
+		 struct net_device_stats *stats,
+		 iobuff_t *rx_buff, __u8 byte)
+{
+#ifdef POSTPONE_RX_CRC
+	int	i;
+#endif
+
+	switch(rx_buff->state) {
+	case OUTSIDE_FRAME:
+		/* Probably missed the BOF */
+		stats->rx_errors++;
+		stats->rx_missed_errors++;
+		irda_device_set_media_busy(dev, TRUE);
+		break;
+
+	case BEGIN_FRAME:
+	case LINK_ESCAPE:
+	case INSIDE_FRAME:
+	default:
+		/* Note : in the case of BEGIN_FRAME and LINK_ESCAPE,
+		 * the fcs will most likely not match and generate an
+		 * error, as expected - Jean II */
+		rx_buff->state = OUTSIDE_FRAME;
+		rx_buff->in_frame = FALSE;
+
+#ifdef POSTPONE_RX_CRC
+		/* If we haven't done the CRC as we receive bytes, we
+		 * must do it now... Jean II */
+		for(i = 0; i < rx_buff->len; i++)
+			rx_buff->fcs = irda_fcs(rx_buff->fcs,
+						rx_buff->data[i]);
+#endif
+
+		/* Test FCS and signal success if the frame is good */
+		if (rx_buff->fcs == GOOD_FCS) {
+			/* Deliver frame */
+			async_bump(dev, stats, rx_buff);
+			break;
+		} else {
+			/* Wrong CRC, discard frame!  */
+			irda_device_set_media_busy(dev, TRUE);
+
+			IRDA_DEBUG(1, "%s(), crc error\n", __func__);
+			stats->rx_errors++;
+			stats->rx_crc_errors++;
+		}
+		break;
+	}
+}
+
+/*
+ * Function async_unwrap_ce(dev, stats, rx_buff, byte)
+ *
+ *    Handle Character Escape character received within a frame: the
+ *    next byte is then de-stuffed by async_unwrap_other() (LINK_ESCAPE)
+ */
+static inline void
+async_unwrap_ce(struct net_device *dev,
+		 struct net_device_stats *stats,
+		 iobuff_t *rx_buff, __u8 byte)
+{
+	switch(rx_buff->state) {
+	case OUTSIDE_FRAME:
+		/* Activate carrier sense */
+		irda_device_set_media_busy(dev, TRUE);
+		break;
+
+	case LINK_ESCAPE:
+		/* Two CE in a row: only warn, the state is left unchanged */
+		IRDA_WARNING("%s: state not defined\n", __func__);
+		break;
+
+	case BEGIN_FRAME:
+	case INSIDE_FRAME:
+	default:
+		/* Stuffed byte coming */
+		rx_buff->state = LINK_ESCAPE;
+		break;
+	}
+}
+
+/*
+ * Function async_unwrap_other(dev, stats, rx_buff, byte)
+ *
+ *    Handle other characters received within a frame: store the byte
+ *    (de-stuffed when following a CE) and update the running FCS
+ */
+static inline void
+async_unwrap_other(struct net_device *dev,
+		   struct net_device_stats *stats,
+		   iobuff_t *rx_buff, __u8 byte)
+{
+	switch(rx_buff->state) {
+		/* This is on the critical path, cases are ordered by
+		 * probability (most frequent first) - Jean II */
+	case INSIDE_FRAME:
+		/* Must be the next byte of the frame */
+		if (rx_buff->len < rx_buff->truesize)  {
+			rx_buff->data[rx_buff->len++] = byte;
+#ifndef POSTPONE_RX_CRC
+			rx_buff->fcs = irda_fcs(rx_buff->fcs, byte);
+#endif
+		} else {
+			IRDA_DEBUG(1, "%s(), Rx buffer overflow, aborting\n",
+				   __func__);
+			rx_buff->state = OUTSIDE_FRAME;	/* NOTE(review): in_frame stays set */
+		}
+		break;
+
+	case LINK_ESCAPE:
+		/*
+		 *  Stuffed char, complement bit 5 of byte
+		 *  following CE, IrLAP p.114
+		 */
+		byte ^= IRDA_TRANS;
+		if (rx_buff->len < rx_buff->truesize)  {
+			rx_buff->data[rx_buff->len++] = byte;
+#ifndef POSTPONE_RX_CRC
+			rx_buff->fcs = irda_fcs(rx_buff->fcs, byte);
+#endif
+			rx_buff->state = INSIDE_FRAME;
+		} else {
+			IRDA_DEBUG(1, "%s(), Rx buffer overflow, aborting\n",
+				   __func__);
+			rx_buff->state = OUTSIDE_FRAME;
+		}
+		break;
+
+	case OUTSIDE_FRAME:
+		/* Activate carrier sense */
+		if(byte != XBOF)
+			irda_device_set_media_busy(dev, TRUE);
+		break;
+	/* NOTE(review): no truesize check below, unlike the cases above */
+	case BEGIN_FRAME:
+	default:
+		rx_buff->data[rx_buff->len++] = byte;
+#ifndef POSTPONE_RX_CRC
+		rx_buff->fcs = irda_fcs(rx_buff->fcs, byte);
+#endif
+		rx_buff->state = INSIDE_FRAME;
+		break;
+	}
+}
+
+/*
+ * Function async_unwrap_char (dev, stats, rx_buff, byte)
+ *
+ *    Parse and de-stuff one byte received from the IrDA-port. The three
+ *    special characters (CE, BOF, EOF) each have a dedicated handler; any
+ *    other value goes to async_unwrap_other().
+ *
+ * This is the main entry point for SIR drivers.
+ */
+void async_unwrap_char(struct net_device *dev,
+		       struct net_device_stats *stats,
+		       iobuff_t *rx_buff, __u8 byte)
+{
+	/*
+	 * Dispatch on the received value; the handlers take care of the
+	 * current rx_buff->state themselves.
+	 */
+	if (byte == CE)
+		async_unwrap_ce(dev, stats, rx_buff, byte);
+	else if (byte == BOF)
+		async_unwrap_bof(dev, stats, rx_buff, byte);
+	else if (byte == EOF)
+		async_unwrap_eof(dev, stats, rx_buff, byte);
+	else
+		async_unwrap_other(dev, stats, rx_buff, byte);
+}
+EXPORT_SYMBOL(async_unwrap_char);
+
diff --git a/net/iucv/Kconfig b/net/iucv/Kconfig
new file mode 100644
index 00000000..16ce9cd4
--- /dev/null
+++ b/net/iucv/Kconfig
@@ -0,0 +1,15 @@
+config IUCV
+	tristate "IUCV support (S390 - z/VM only)"
+	depends on S390
+	help
+	  Select this option if you want to use inter-user communication
+	  under VM or VIF. If you run on z/VM, say "Y" to enable a fast
+	  communication link between VM guests.
+
+config AFIUCV
+	tristate "AF_IUCV support (S390 - z/VM only)"
+	depends on IUCV
+	help
+	  Select this option if you want to use inter-user communication under
+	  VM or VIF sockets. If you run on z/VM, say "Y" to enable a fast
+	  communication link between VM guests.
diff --git a/net/iucv/Makefile b/net/iucv/Makefile
new file mode 100644
index 00000000..7bfdc853
--- /dev/null
+++ b/net/iucv/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for IUCV
+#
+
+obj-$(CONFIG_IUCV)	+= iucv.o
+obj-$(CONFIG_AFIUCV)	+= af_iucv.o
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
new file mode 100644
index 00000000..e2013e43
--- /dev/null
+++ b/net/iucv/af_iucv.c
@@ -0,0 +1,1795 @@
+/*
+ *  IUCV protocol stack for Linux on zSeries
+ *
+ *  Copyright IBM Corp. 2006, 2009
+ *
+ *  Author(s):	Jennifer Hunt <jenhunt@us.ibm.com>
+ *		Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ *  PM functions:
+ *		Ursula Braun <ursula.braun@de.ibm.com>
+ */
+
+#define KMSG_COMPONENT "af_iucv"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+#include <asm/ebcdic.h>
+#include <asm/cpcmd.h>
+#include <linux/kmod.h>
+
+#include <net/iucv/iucv.h>
+#include <net/iucv/af_iucv.h>
+
+#define VERSION "1.1"
+
+static char iucv_userid[80];
+
+static const struct proto_ops iucv_sock_ops;
+
+static struct proto iucv_proto = {
+	.name		= "AF_IUCV",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct iucv_sock),
+};
+
+/* special AF_IUCV IPRM messages */
+static const u8 iprm_shutdown[8] =
+	{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
+
+#define TRGCLS_SIZE	(sizeof(((struct iucv_message *)0)->class))
+
+/* macros to set/get socket control buffer at correct offset */
+#define CB_TAG(skb)	((skb)->cb)		/* iucv message tag */
+#define CB_TAG_LEN	(sizeof(((struct iucv_message *) 0)->tag))
+#define CB_TRGCLS(skb)	((skb)->cb + CB_TAG_LEN) /* iucv msg target class */
+#define CB_TRGCLS_LEN	(TRGCLS_SIZE)
+
+#define __iucv_sock_wait(sk, condition, timeo, ret)			\
+do {									\
+	DEFINE_WAIT(__wait);						\
+	long __timeo = timeo;						\
+	ret = 0;							\
+	prepare_to_wait(sk_sleep(sk), &__wait, TASK_INTERRUPTIBLE);	\
+	while (!(condition)) {						\
+		if (!__timeo) {						\
+			ret = -EAGAIN;					\
+			break;						\
+		}							\
+		if (signal_pending(current)) {				\
+			ret = sock_intr_errno(__timeo);			\
+			break;						\
+		}							\
+		release_sock(sk);					\
+		__timeo = schedule_timeout(__timeo);			\
+		lock_sock(sk);						\
+		ret = sock_error(sk);					\
+		if (ret)						\
+			break;						\
+	}								\
+	finish_wait(sk_sleep(sk), &__wait);				\
+} while (0)
+
+#define iucv_sock_wait(sk, condition, timeo)				\
+({									\
+	int __ret = 0;							\
+	if (!(condition))						\
+		__iucv_sock_wait(sk, condition, timeo, __ret);		\
+	__ret;								\
+})
+
+static void iucv_sock_kill(struct sock *sk);
+static void iucv_sock_close(struct sock *sk);
+
+/* Call Back functions */
+static void iucv_callback_rx(struct iucv_path *, struct iucv_message *);
+static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *);
+static void iucv_callback_connack(struct iucv_path *, u8 ipuser[16]);
+static int iucv_callback_connreq(struct iucv_path *, u8 ipvmid[8],
+				 u8 ipuser[16]);
+static void iucv_callback_connrej(struct iucv_path *, u8 ipuser[16]);
+static void iucv_callback_shutdown(struct iucv_path *, u8 ipuser[16]);
+
+static struct iucv_sock_list iucv_sk_list = {
+	.lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock),
+	.autobind_name = ATOMIC_INIT(0)
+};
+
+static struct iucv_handler af_iucv_handler = {
+	.path_pending	  = iucv_callback_connreq,
+	.path_complete	  = iucv_callback_connack,
+	.path_severed	  = iucv_callback_connrej,
+	.message_pending  = iucv_callback_rx,
+	.message_complete = iucv_callback_txdone,
+	.path_quiesced	  = iucv_callback_shutdown,
+};
+
+static inline void high_nmcpy(unsigned char *dst, char *src)
+{
+	memcpy(&dst[0], src, 8);	/* name goes into bytes 0..7 of dst */
+}
+
+static inline void low_nmcpy(unsigned char *dst, char *src)
+{
+	memcpy(dst + 8, src, 8);	/* name goes into bytes 8..15 of dst */
+}
+
+static int afiucv_pm_prepare(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "afiucv_pm_prepare\n");
+#endif
+	return 0;	/* nothing to do besides the optional debug message */
+}
+
+static void afiucv_pm_complete(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "afiucv_pm_complete\n");
+#endif	/* CONFIG_PM_DEBUG: the callback only logs that it was called */
+}
+
+/**
+ * afiucv_pm_freeze() - Freeze PM callback
+ * @dev:	AFIUCV dummy device
+ *
+ * Sever all established IUCV communication paths
+ */
+static int afiucv_pm_freeze(struct device *dev)
+{
+	struct iucv_sock *iucv;
+	struct sock *sk;
+	struct hlist_node *node;
+	int err = 0;
+
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "afiucv_pm_freeze\n");
+#endif
+	read_lock(&iucv_sk_list.lock);
+	sk_for_each(sk, node, &iucv_sk_list.head) {
+		iucv = iucv_sk(sk);
+		skb_queue_purge(&iucv->send_skb_q);	/* for every state */
+		skb_queue_purge(&iucv->backlog_skb_q);
+		switch (sk->sk_state) {
+		case IUCV_SEVERED:
+		case IUCV_DISCONN:
+		case IUCV_CLOSING:
+		case IUCV_CONNECTED:
+			if (iucv->path) {
+				err = iucv_path_sever(iucv->path, NULL);
+				iucv_path_free(iucv->path);
+				iucv->path = NULL;
+			}
+			break;
+		case IUCV_OPEN:
+		case IUCV_BOUND:
+		case IUCV_LISTEN:
+		case IUCV_CLOSED:
+		default:
+			break;
+		}
+	}
+	read_unlock(&iucv_sk_list.lock);
+	return err;	/* NOTE(review): only the last iucv_path_sever() result */
+}
+
+/**
+ * afiucv_pm_restore_thaw() - Thaw and restore PM callback
+ * @dev:	AFIUCV dummy device
+ *
+ * Mark every IUCV_CONNECTED socket as IUCV_DISCONN with sk_err = EPIPE
+ */
+static int afiucv_pm_restore_thaw(struct device *dev)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "afiucv_pm_restore_thaw\n");
+#endif
+	read_lock(&iucv_sk_list.lock);
+	sk_for_each(sk, node, &iucv_sk_list.head) {
+		switch (sk->sk_state) {
+		case IUCV_CONNECTED:
+			sk->sk_err = EPIPE;
+			sk->sk_state = IUCV_DISCONN;
+			sk->sk_state_change(sk);	/* run the socket's callback */
+			break;
+		case IUCV_DISCONN:
+		case IUCV_SEVERED:
+		case IUCV_CLOSING:
+		case IUCV_LISTEN:
+		case IUCV_BOUND:
+		case IUCV_OPEN:
+		default:
+			break;
+		}
+	}
+	read_unlock(&iucv_sk_list.lock);
+	return 0;
+}
+
+static const struct dev_pm_ops afiucv_pm_ops = {
+	.prepare = afiucv_pm_prepare,
+	.complete = afiucv_pm_complete,
+	.freeze = afiucv_pm_freeze,
+	.thaw = afiucv_pm_restore_thaw,
+	.restore = afiucv_pm_restore_thaw,
+};
+
+static struct device_driver af_iucv_driver = {
+	.owner = THIS_MODULE,
+	.name = "afiucv",
+	.bus  = &iucv_bus,
+	.pm   = &afiucv_pm_ops,
+};
+
+/* dummy device used as trigger for PM functions */
+static struct device *af_iucv_dev;
+
+/**
+ * iucv_msg_length() - Returns the length of an iucv message.
+ * @msg:	Pointer to struct iucv_message, MUST NOT be NULL
+ *
+ * Returns the number of data bytes carried by @msg, whether they are stored
+ * in a buffer or in the parameter list (PRMDATA).
+ *
+ * For IUCV_IPRMDATA, AF_IUCV uses the following convention to transport socket
+ * data:
+ *	PRMDATA[0..6]	socket data (max 7 bytes);
+ *	PRMDATA[7]	socket data length value (len is 0xff - PRMDATA[7])
+ *
+ * A computed length greater than 7 cannot be socket data; such PRMDATA can
+ * be used for special notifications (see iucv_sock_shutdown), and the
+ * function then returns 8.
+ *
+ * Use this function to allocate socket buffers to store iucv message data.
+ */
+static inline size_t iucv_msg_length(struct iucv_message *msg)
+{
+	size_t prm_len;
+
+	if (!(msg->flags & IUCV_IPRMDATA))
+		return msg->length;
+
+	prm_len = 0xff - msg->rmmsg[7];
+	if (prm_len > 7)
+		prm_len = 8;
+	return prm_len;
+}
+
+/**
+ * iucv_sock_in_state() - check for specific states
+ * @sk:		sock structure
+ * @state:	first iucv sk state
+ * @state2:	second iucv sk state
+ *
+ * Returns true if the socket is in either the first or the second state.
+ */
+static int iucv_sock_in_state(struct sock *sk, int state, int state2)
+{
+	return (sk->sk_state == state || sk->sk_state == state2);
+}
+
+/**
+ * iucv_below_msglim() - function to check if messages can be sent
+ * @sk:		sock structure
+ *
+ * Returns true if fewer messages are queued for sending than the message
+ * limit of the iucv path allows. A socket that is not connected has no
+ * path to take the limit from, so true is returned for it as well.
+ */
+static inline int iucv_below_msglim(struct sock *sk)
+{
+	struct iucv_sock *iucv = iucv_sk(sk);
+
+	/* iucv->path is only dereferenced once the socket is connected */
+	return sk->sk_state != IUCV_CONNECTED ||
+	       skb_queue_len(&iucv->send_skb_q) < iucv->path->msglim;
+}
+
+/**
+ * iucv_sock_wake_msglim() - Wake up threads waiting on msg limit
+ */
+static void iucv_sock_wake_msglim(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();	/* sk->sk_wq is read via rcu_dereference() */
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))	/* skip the wake-up if nobody waits */
+		wake_up_interruptible_all(&wq->wait);
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	rcu_read_unlock();
+}
+
+/* Timer handler: flag the socket with ETIMEDOUT and try to kill it */
+static void iucv_sock_timeout(unsigned long arg)
+{
+	struct sock *sk = (struct sock *)arg;	/* set via setup_timer() */
+
+	bh_lock_sock(sk);
+	sk->sk_err = ETIMEDOUT;
+	sk->sk_state_change(sk);
+	bh_unlock_sock(sk);
+
+	iucv_sock_kill(sk);	/* no-op unless zapped and orphaned */
+	sock_put(sk);	/* presumably the timer's reference - TODO confirm */
+}
+
+static void iucv_sock_clear_timer(struct sock *sk)
+{
+	sk_stop_timer(sk, &sk->sk_timer);	/* timer set up in iucv_sock_alloc() */
+}
+
+static struct sock *__iucv_get_sock_by_name(char *nm)
+{
+	struct hlist_node *node;
+	struct sock *sk;
+
+	sk_for_each(sk, node, &iucv_sk_list.head) {
+		if (memcmp(&iucv_sk(sk)->src_name, nm, 8) == 0)
+			return sk;
+	}
+	return NULL;	/* no socket in the list has this src_name */
+}
+
+static void iucv_sock_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);	/* installed as sk_destruct */
+	skb_queue_purge(&sk->sk_write_queue);
+}
+
+/* Cleanup Listen: close the queued children, then mark parent closed */
+static void iucv_sock_cleanup_listen(struct sock *parent)
+{
+	struct sock *sk;
+
+	/* Close non-accepted connections */
+	while ((sk = iucv_accept_dequeue(parent, NULL))) {
+		iucv_sock_close(sk);	/* sets SOCK_ZAPPED ... */
+		iucv_sock_kill(sk);	/* ... which iucv_sock_kill() requires */
+	}
+
+	parent->sk_state = IUCV_CLOSED;
+}
+
+/* Kill socket: drop it from the global list and release a reference,
+ * but only if it is zapped and orphaned (no struct socket attached) */
+static void iucv_sock_kill(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_ZAPPED) && !sk->sk_socket) {
+		iucv_sock_unlink(&iucv_sk_list, sk);
+		sock_set_flag(sk, SOCK_DEAD);
+		sock_put(sk);
+	}
+}
+
+/* Close an IUCV socket: sever its path and mark it SOCK_ZAPPED */
+static void iucv_sock_close(struct sock *sk)
+{
+	unsigned char user_data[16];
+	struct iucv_sock *iucv = iucv_sk(sk);
+	unsigned long timeo;
+
+	iucv_sock_clear_timer(sk);
+	lock_sock(sk);
+
+	switch (sk->sk_state) {
+	case IUCV_LISTEN:
+		iucv_sock_cleanup_listen(sk);
+		break;
+
+	case IUCV_CONNECTED:
+	case IUCV_DISCONN:
+		sk->sk_state = IUCV_CLOSING;
+		sk->sk_state_change(sk);
+
+		if (!skb_queue_empty(&iucv->send_skb_q)) {
+			if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
+				timeo = sk->sk_lingertime;
+			else
+				timeo = IUCV_DISCONN_TIMEOUT;
+			iucv_sock_wait(sk,
+					iucv_sock_in_state(sk, IUCV_CLOSED, 0),
+					timeo);	/* result is ignored */
+		}
+
+	case IUCV_CLOSING:   /* fall through from CONNECTED/DISCONN */
+		sk->sk_state = IUCV_CLOSED;
+		sk->sk_state_change(sk);
+
+		if (iucv->path) {
+			low_nmcpy(user_data, iucv->src_name);	/* bytes 8..15 */
+			high_nmcpy(user_data, iucv->dst_name);	/* bytes 0..7 */
+			ASCEBC(user_data, sizeof(user_data));
+			iucv_path_sever(iucv->path, user_data);
+			iucv_path_free(iucv->path);
+			iucv->path = NULL;
+		}
+
+		sk->sk_err = ECONNRESET;
+		sk->sk_state_change(sk);
+
+		skb_queue_purge(&iucv->send_skb_q);
+		skb_queue_purge(&iucv->backlog_skb_q);
+		break;
+
+	default:
+		/* nothing to do here */
+		break;
+	}
+
+	/* mark socket for deletion by iucv_sock_kill() */
+	sock_set_flag(sk, SOCK_ZAPPED);
+
+	release_sock(sk);
+}
+
+/*
+ * Finish initialisation of a new socket: when @parent is given, @sk
+ * inherits its socket type; otherwise nothing is changed.
+ */
+static void iucv_sock_init(struct sock *sk, struct sock *parent)
+{
+	if (!parent)
+		return;
+
+	sk->sk_type = parent->sk_type;
+}
+
+/*
+ * Allocate and initialise a new AF_IUCV socket.
+ *
+ * @sock:  struct socket to attach to, or NULL (see iucv_callback_connreq())
+ * @proto: value stored in sk_protocol
+ * @prio:  allocation flags for sk_alloc()
+ *
+ * The new socket starts in state IUCV_OPEN with empty queues, the default
+ * message limit and no IUCV path, and is linked into iucv_sk_list.
+ * Returns the socket, or NULL if the allocation failed.
+ */
+static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+{
+	struct iucv_sock *iucv;
+	struct sock *sk;
+
+	sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto);
+	if (!sk)
+		return NULL;
+
+	sock_init_data(sock, sk);
+	iucv = iucv_sk(sk);
+
+	/* queues, lists and their locks */
+	INIT_LIST_HEAD(&iucv->accept_q);
+	spin_lock_init(&iucv->accept_q_lock);
+	skb_queue_head_init(&iucv->send_skb_q);
+	INIT_LIST_HEAD(&iucv->message_q.list);
+	spin_lock_init(&iucv->message_q.lock);
+	skb_queue_head_init(&iucv->backlog_skb_q);
+
+	/* IUCV specific state */
+	iucv->send_tag = 0;
+	iucv->flags = 0;
+	iucv->msglimit = IUCV_QUEUELEN_DEFAULT;
+	iucv->path = NULL;
+	/* NOTE(review): clears 32 bytes starting at src_user_id; presumably
+	 * the user id and name fields that follow it - confirm against the
+	 * struct iucv_sock layout */
+	memset(&iucv->src_user_id , 0, 32);
+
+	/* generic socket state */
+	sk->sk_destruct = iucv_sock_destruct;
+	sk->sk_sndtimeo = IUCV_CONN_TIMEOUT;
+	sk->sk_allocation = GFP_DMA;
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	sk->sk_protocol = proto;
+	sk->sk_state	= IUCV_OPEN;
+
+	setup_timer(&sk->sk_timer, iucv_sock_timeout, (unsigned long)sk);
+
+	iucv_sock_link(&iucv_sk_list, sk);
+	return sk;
+}
+
+/*
+ * Create an IUCV socket (the .create hook of iucv_sock_family_ops).
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT for a non-zero protocol other
+ * than PF_IUCV, -ESOCKTNOSUPPORT for socket types other than SOCK_STREAM
+ * and SOCK_SEQPACKET, and -ENOMEM if the socket cannot be allocated.
+ */
+static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
+{
+	struct sock *sk;
+
+	if (protocol && protocol != PF_IUCV)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
+		return -ESOCKTNOSUPPORT;
+
+	/* currently, proto ops can handle both sk types */
+	sock->ops = &iucv_sock_ops;
+
+	sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL);
+	if (!sk)
+		return -ENOMEM;
+
+	iucv_sock_init(sk, NULL);
+
+	return 0;
+}
+
+/* Add @sk to the socket list @l while holding the list's write lock. */
+void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk)
+{
+	write_lock_bh(&l->lock);
+	sk_add_node(sk, &l->head);
+	write_unlock_bh(&l->lock);
+}
+
+/* Remove @sk from its socket list while holding the write lock of @l. */
+void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk)
+{
+	write_lock_bh(&l->lock);
+	sk_del_node_init(sk);
+	write_unlock_bh(&l->lock);
+}
+
+/*
+ * Queue the new connection @sk on the accept queue of the listening
+ * socket @parent. A reference on @sk is taken for as long as it is queued;
+ * iucv_accept_unlink() drops it again.
+ */
+void iucv_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+	struct iucv_sock *listener = iucv_sk(parent);
+	struct iucv_sock *child = iucv_sk(sk);
+	unsigned long irqflags;
+
+	sock_hold(sk);
+
+	spin_lock_irqsave(&listener->accept_q_lock, irqflags);
+	list_add_tail(&child->accept_q, &listener->accept_q);
+	spin_unlock_irqrestore(&listener->accept_q_lock, irqflags);
+
+	child->parent = parent;
+	sk_acceptq_added(parent);
+}
+
+/*
+ * Take @sk off the accept queue of its parent socket, clear its parent
+ * pointer and drop the reference taken by iucv_accept_enqueue().
+ */
+void iucv_accept_unlink(struct sock *sk)
+{
+	struct iucv_sock *child = iucv_sk(sk);
+	struct sock *parent = child->parent;
+	struct iucv_sock *listener = iucv_sk(parent);
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&listener->accept_q_lock, irqflags);
+	list_del_init(&child->accept_q);
+	spin_unlock_irqrestore(&listener->accept_q_lock, irqflags);
+
+	sk_acceptq_removed(parent);
+	child->parent = NULL;
+	sock_put(sk);
+}
+
+/*
+ * Take the next usable connection off @parent's accept queue.
+ *
+ * @parent:  listening socket
+ * @newsock: struct socket to graft the connection onto, or NULL
+ *
+ * Children in state IUCV_CLOSED are unlinked and skipped. A child is
+ * returned when it is connected, severed or disconnected, or - if @newsock
+ * is NULL - in any other non-closed state. A severed child is returned in
+ * state IUCV_DISCONN. Returns NULL if no child qualifies.
+ */
+struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock)
+{
+	struct iucv_sock *isk, *n;
+	struct sock *sk;
+
+	list_for_each_entry_safe(isk, n, &iucv_sk(parent)->accept_q, accept_q) {
+		sk = (struct sock *) isk;
+		lock_sock(sk);
+
+		/* closed in the meantime: drop it from the queue and go on */
+		if (sk->sk_state == IUCV_CLOSED) {
+			iucv_accept_unlink(sk);
+			release_sock(sk);
+			continue;
+		}
+
+		if (sk->sk_state == IUCV_CONNECTED ||
+		    sk->sk_state == IUCV_SEVERED ||
+		    sk->sk_state == IUCV_DISCONN ||	/* due to PM restore */
+		    !newsock) {
+			iucv_accept_unlink(sk);
+			if (newsock)
+				sock_graft(sk, newsock);
+
+			if (sk->sk_state == IUCV_SEVERED)
+				sk->sk_state = IUCV_DISCONN;
+
+			release_sock(sk);
+			return sk;
+		}
+
+		release_sock(sk);
+	}
+	return NULL;
+}
+
+/*
+ * Bind an unbound socket to the name given in @addr.
+ *
+ * @sock:     socket to bind
+ * @addr:     struct sockaddr_iucv carrying the application name
+ * @addr_len: number of valid bytes in @addr
+ *
+ * Returns 0 on success, -EINVAL if @addr is missing, shorter than a
+ * struct sockaddr_iucv or not of family AF_IUCV, -EBADFD if the socket is
+ * not in state IUCV_OPEN, and -EADDRINUSE if another socket already uses
+ * the requested name.
+ */
+static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
+			  int addr_len)
+{
+	struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv;
+	int err;
+
+	/*
+	 * Verify the input sockaddr. The length must be checked before
+	 * sa->siucv_name is read below, otherwise a short address makes us
+	 * use bytes the caller never supplied.
+	 */
+	if (!addr || addr_len < (int) sizeof(struct sockaddr_iucv) ||
+	    addr->sa_family != AF_IUCV)
+		return -EINVAL;
+
+	lock_sock(sk);
+	if (sk->sk_state != IUCV_OPEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	/* the name lookup and the bind must be atomic w.r.t. the list */
+	write_lock_bh(&iucv_sk_list.lock);
+
+	iucv = iucv_sk(sk);
+	if (__iucv_get_sock_by_name(sa->siucv_name)) {
+		err = -EADDRINUSE;
+		goto done_unlock;
+	}
+	if (iucv->path) {
+		err = 0;
+		goto done_unlock;
+	}
+
+	/* Bind the socket */
+	memcpy(iucv->src_name, sa->siucv_name, 8);
+
+	/* Copy the user id */
+	memcpy(iucv->src_user_id, iucv_userid, 8);
+	sk->sk_state = IUCV_BOUND;
+	err = 0;
+
+done_unlock:
+	/* Release the socket list lock */
+	write_unlock_bh(&iucv_sk_list.lock);
+done:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Automatically bind an unbound socket.
+ *
+ * The user id is taken from the first 8 bytes of the "QUERY USERID" CP
+ * command response; the source name is the next value of the autobind
+ * counter, formatted as 8 hex digits, that no socket in iucv_sk_list
+ * uses yet.
+ *
+ * Returns 0 on success or -EPROTO if the CP command reports an error.
+ */
+static int iucv_sock_autobind(struct sock *sk)
+{
+	struct iucv_sock *iucv = iucv_sk(sk);
+	char reply[80];
+	char name[12];
+	int rc = 0;
+
+	/* Set the userid and name */
+	cpcmd("QUERY USERID", reply, sizeof(reply), &rc);
+	if (unlikely(rc))
+		return -EPROTO;
+
+	memcpy(iucv->src_user_id, reply, 8);
+
+	write_lock_bh(&iucv_sk_list.lock);
+
+	/* draw counter values until one yields an unused name */
+	do {
+		sprintf(name, "%08x",
+			atomic_inc_return(&iucv_sk_list.autobind_name));
+	} while (__iucv_get_sock_by_name(name));
+
+	write_unlock_bh(&iucv_sk_list.lock);
+
+	memcpy(&iucv->src_name, name, 8);
+
+	return 0;
+}
+
+/*
+ * Connect an unconnected socket to the peer named in @addr.
+ *
+ * @sock:  socket to connect
+ * @addr:  struct sockaddr_iucv with the target user id and name
+ * @alen:  length of @addr
+ * @flags: file flags; O_NONBLOCK selects a non-blocking connect timeout
+ *
+ * An IUCV_OPEN socket is autobound first. The function then allocates an
+ * IUCV path, issues the connect and waits until the socket is either
+ * connected or disconnected.
+ *
+ * Returns 0 on success, -EINVAL for a bad address or socket type, -EBADFD
+ * if the socket is neither open nor bound, -ENOMEM if no path could be
+ * allocated, a mapped IUCV connect error (see the switch below),
+ * -ECONNREFUSED if the peer disconnected, or the error from
+ * iucv_sock_wait().
+ *
+ * Note that the state checks and the autobind run before lock_sock().
+ */
+static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr,
+			     int alen, int flags)
+{
+	struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv;
+	unsigned char user_data[16];
+	int err;
+
+	if (addr->sa_family != AF_IUCV || alen < sizeof(struct sockaddr_iucv))
+		return -EINVAL;
+
+	if (sk->sk_state != IUCV_OPEN && sk->sk_state != IUCV_BOUND)
+		return -EBADFD;
+
+	if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_SEQPACKET)
+		return -EINVAL;
+
+	if (sk->sk_state == IUCV_OPEN) {
+		err = iucv_sock_autobind(sk);
+		if (unlikely(err))
+			return err;
+	}
+
+	lock_sock(sk);
+
+	/* Set the destination information */
+	memcpy(iucv_sk(sk)->dst_user_id, sa->siucv_user_id, 8);
+	memcpy(iucv_sk(sk)->dst_name, sa->siucv_name, 8);
+
+	/* user data: destination name and our source name, in EBCDIC */
+	high_nmcpy(user_data, sa->siucv_name);
+	low_nmcpy(user_data, iucv_sk(sk)->src_name);
+	ASCEBC(user_data, sizeof(user_data));
+
+	iucv = iucv_sk(sk);
+	/* Create path. */
+	iucv->path = iucv_path_alloc(iucv->msglimit,
+				     IUCV_IPRMDATA, GFP_KERNEL);
+	if (!iucv->path) {
+		err = -ENOMEM;
+		goto done;
+	}
+	err = iucv_path_connect(iucv->path, &af_iucv_handler,
+				sa->siucv_user_id, NULL, user_data, sk);
+	if (err) {
+		iucv_path_free(iucv->path);
+		iucv->path = NULL;
+		/* translate the IUCV return code into an errno value */
+		switch (err) {
+		case 0x0b:	/* Target communicator is not logged on */
+			err = -ENETUNREACH;
+			break;
+		case 0x0d:	/* Max connections for this guest exceeded */
+		case 0x0e:	/* Max connections for target guest exceeded */
+			err = -EAGAIN;
+			break;
+		case 0x0f:	/* Missing IUCV authorization */
+			err = -EACCES;
+			break;
+		default:
+			err = -ECONNREFUSED;
+			break;
+		}
+		goto done;
+	}
+
+	/* wait for iucv_callback_connack() or iucv_callback_connrej() */
+	if (sk->sk_state != IUCV_CONNECTED) {
+		err = iucv_sock_wait(sk, iucv_sock_in_state(sk, IUCV_CONNECTED,
+							    IUCV_DISCONN),
+				     sock_sndtimeo(sk, flags & O_NONBLOCK));
+	}
+
+	if (sk->sk_state == IUCV_DISCONN) {
+		err = -ECONNREFUSED;
+	}
+
+	/* connect failed after the path was established: tear it down */
+	if (err) {
+		iucv_path_sever(iucv->path, NULL);
+		iucv_path_free(iucv->path);
+		iucv->path = NULL;
+	}
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Move a bound socket into listening state.
+ *
+ * @backlog becomes the maximum accept backlog and the current backlog
+ * count is reset. Returns 0 on success, or -EINVAL if the socket is not in
+ * state IUCV_BOUND or is neither SOCK_STREAM nor SOCK_SEQPACKET.
+ */
+static int iucv_sock_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == IUCV_BOUND &&
+	    (sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET)) {
+		sk->sk_max_ack_backlog = backlog;
+		sk->sk_ack_backlog = 0;
+		sk->sk_state = IUCV_LISTEN;
+		err = 0;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Accept a pending connection.
+ *
+ * @sock:    listening socket
+ * @newsock: socket that receives the accepted connection
+ * @flags:   file flags; O_NONBLOCK makes the call non-blocking
+ *
+ * Sleeps on the listening socket's wait queue until
+ * iucv_accept_dequeue() hands back a connection, the receive timeout
+ * runs out (-EAGAIN), the socket leaves the listen state (-EBADFD) or a
+ * signal is pending. Returns 0 on success.
+ */
+static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
+			    int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct sock *sk = sock->sk, *nsk;
+	long timeo;
+	int err = 0;
+
+	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+	if (sk->sk_state != IUCV_LISTEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+	/* Wait for an incoming connection */
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (!(nsk = iucv_accept_dequeue(sk, newsock))) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		/* the socket lock is dropped while sleeping */
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+		if (sk->sk_state != IUCV_LISTEN) {
+			err = -EBADFD;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+	}
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (err)
+		goto done;
+
+	newsock->state = SS_CONNECTED;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Report the local or the peer address of a socket.
+ *
+ * @sock: socket to query
+ * @addr: output buffer, filled in as a struct sockaddr_iucv
+ * @len:  set to sizeof(struct sockaddr_iucv)
+ * @peer: non-zero selects the destination (peer) user id and name,
+ *        zero the source (local) ones
+ *
+ * The port, addr and nodeid fields are always zeroed. Always returns 0.
+ */
+static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr,
+			     int *len, int peer)
+{
+	struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr;
+	struct iucv_sock *iucv = iucv_sk(sock->sk);
+	const void *user_id;
+	const void *name;
+
+	addr->sa_family = AF_IUCV;
+	*len = sizeof(struct sockaddr_iucv);
+
+	if (peer) {
+		user_id = iucv->dst_user_id;
+		name = &iucv->dst_name;
+	} else {
+		user_id = iucv->src_user_id;
+		name = iucv->src_name;
+	}
+	memcpy(siucv->siucv_user_id, user_id, 8);
+	memcpy(siucv->siucv_name, name, 8);
+
+	memset(&siucv->siucv_port, 0, sizeof(siucv->siucv_port));
+	memset(&siucv->siucv_addr, 0, sizeof(siucv->siucv_addr));
+	memset(siucv->siucv_nodeid, 0, sizeof(siucv->siucv_nodeid));
+
+	return 0;
+}
+
+/**
+ * iucv_send_iprm() - Send socket data in parameter list of an iucv message.
+ * @path:	IUCV path
+ * @msg:	Pointer to a struct iucv_message
+ * @skb:	The socket data to send, skb->len MUST BE <= 7
+ *
+ * The payload travels inside the 8 byte parameter list of the iucv message
+ * (IUCV_IPRMDATA) instead of a separate buffer: bytes 0 to 6 hold the
+ * socket data and the last byte (index 7) encodes the data length as
+ * 0xff - len. See also iucv_msg_length().
+ *
+ * Returns the error code from the iucv_message_send() call.
+ */
+static int iucv_send_iprm(struct iucv_path *path, struct iucv_message *msg,
+			  struct sk_buff *skb)
+{
+	u8 param[8];
+
+	memcpy(param, (void *) skb->data, skb->len);
+	param[7] = 0xff - (u8) skb->len;
+
+	return iucv_message_send(path, msg, IUCV_IPRMDATA, 0,
+				 (void *) param, 8);
+}
+
+/*
+ * Send one message on a connected socket.
+ *
+ * @iocb: unused here
+ * @sock: socket to send on
+ * @msg:  message header with data iovec, flags and optional control data
+ * @len:  number of data bytes to send
+ *
+ * The whole message is copied into a single skb and sent as one IUCV
+ * message. A SOL_IUCV / SCM_IUCV_TRGCLS control message may set the IUCV
+ * target class. Before sending, the caller may be put to sleep until the
+ * number of outstanding messages drops below the path's message limit.
+ *
+ * Returns @len on success or a negative errno value, among them:
+ * -EOPNOTSUPP for MSG_OOB or a SOCK_SEQPACKET send without MSG_EOR,
+ * -EPIPE after SEND_SHUTDOWN or on a send failure, -ENOTCONN if the socket
+ * is not connected, -EINVAL for bad control data, -EFAULT if the user data
+ * cannot be copied, -ECONNRESET if the connection went away while waiting
+ * and -EAGAIN if the peer's message limit is exceeded.
+ */
+static int iucv_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+			     struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv = iucv_sk(sk);
+	struct sk_buff *skb;
+	struct iucv_message txmsg;
+	struct cmsghdr *cmsg;
+	int cmsg_done;
+	long timeo;
+	char user_id[9];
+	char appl_id[9];
+	int err;
+	int noblock = msg->msg_flags & MSG_DONTWAIT;
+
+	err = sock_error(sk);
+	if (err)
+		return err;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/* SOCK_SEQPACKET: we do not support segmented records */
+	if (sk->sk_type == SOCK_SEQPACKET && !(msg->msg_flags & MSG_EOR))
+		return -EOPNOTSUPP;
+
+	lock_sock(sk);
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		err = -EPIPE;
+		goto out;
+	}
+
+	/* Return if the socket is not in connected state */
+	if (sk->sk_state != IUCV_CONNECTED) {
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	/* initialize defaults */
+	cmsg_done   = 0;	/* check for duplicate headers */
+	txmsg.class = 0;
+
+	/* iterate over control messages */
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg;
+		cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+		if (!CMSG_OK(msg, cmsg)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* control messages of other levels are ignored */
+		if (cmsg->cmsg_level != SOL_IUCV)
+			continue;
+
+		/* cmsg_done is a bit mask of the cmsg types seen so far */
+		if (cmsg->cmsg_type & cmsg_done) {
+			err = -EINVAL;
+			goto out;
+		}
+		cmsg_done |= cmsg->cmsg_type;
+
+		switch (cmsg->cmsg_type) {
+		case SCM_IUCV_TRGCLS:
+			if (cmsg->cmsg_len != CMSG_LEN(TRGCLS_SIZE)) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			/* set iucv message target class */
+			memcpy(&txmsg.class,
+				(void *) CMSG_DATA(cmsg), TRGCLS_SIZE);
+
+			break;
+
+		default:
+			err = -EINVAL;
+			goto out;
+			break;
+		}
+	}
+
+	/* allocate one skb for each iucv message:
+	 * this is fine for SOCK_SEQPACKET (unless we want to support
+	 * segmented records using the MSG_EOR flag), but
+	 * for SOCK_STREAM we might want to improve it in future */
+	skb = sock_alloc_send_skb(sk, len, noblock, &err);
+	if (!skb)
+		goto out;
+	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+		err = -EFAULT;
+		goto fail;
+	}
+
+	/* wait if outstanding messages for iucv path has reached */
+	timeo = sock_sndtimeo(sk, noblock);
+	err = iucv_sock_wait(sk, iucv_below_msglim(sk), timeo);
+	if (err)
+		goto fail;
+
+	/* return -ECONNRESET if the socket is no longer connected */
+	if (sk->sk_state != IUCV_CONNECTED) {
+		err = -ECONNRESET;
+		goto fail;
+	}
+
+	/* increment and save iucv message tag for msg_completion cbk */
+	txmsg.tag = iucv->send_tag++;
+	memcpy(CB_TAG(skb), &txmsg.tag, CB_TAG_LEN);
+	skb_queue_tail(&iucv->send_skb_q, skb);
+
+	/* short messages go into the parameter list if both the path and
+	 * the socket have IUCV_IPRMDATA enabled */
+	if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags)
+	      && skb->len <= 7) {
+		err = iucv_send_iprm(iucv->path, &txmsg, skb);
+
+		/* on success: there is no message_complete callback
+		 * for an IPRMDATA msg; remove skb from send queue */
+		if (err == 0) {
+			skb_unlink(skb, &iucv->send_skb_q);
+			kfree_skb(skb);
+		}
+
+		/* this error should never happen since the
+		 * IUCV_IPRMDATA path flag is set... sever path */
+		if (err == 0x15) {
+			iucv_path_sever(iucv->path, NULL);
+			skb_unlink(skb, &iucv->send_skb_q);
+			err = -EPIPE;
+			goto fail;
+		}
+	} else
+		err = iucv_message_send(iucv->path, &txmsg, 0, 0,
+					(void *) skb->data, skb->len);
+	if (err) {
+		/* return code 3 is reported as "exceeds message limit" */
+		if (err == 3) {
+			user_id[8] = 0;
+			memcpy(user_id, iucv->dst_user_id, 8);
+			appl_id[8] = 0;
+			memcpy(appl_id, iucv->dst_name, 8);
+			pr_err("Application %s on z/VM guest %s"
+				" exceeds message limit\n",
+				appl_id, user_id);
+			err = -EAGAIN;
+		} else
+			err = -EPIPE;
+		skb_unlink(skb, &iucv->send_skb_q);
+		goto fail;
+	}
+
+	release_sock(sk);
+	return len;
+
+fail:
+	kfree_skb(skb);
+out:
+	release_sock(sk);
+	return err;
+}
+
+/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's
+ *
+ * Copies the first @len bytes of @skb into newly allocated skbs of at most
+ * sk->sk_rcvbuf / 4 bytes each and appends them to the socket's backlog
+ * queue. Every fragment inherits the target class stored in the control
+ * buffer of @skb. @skb itself is neither modified nor freed.
+ *
+ * Returns 0 on success or -ENOMEM if a fragment cannot be allocated;
+ * fragments queued before the failure stay on the backlog queue.
+ *
+ * Locking: must be called with message_q.lock held
+ */
+static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len)
+{
+	struct sk_buff *frag;
+	int remaining, chunk, offset = 0;
+
+	for (remaining = len; remaining; remaining -= chunk) {
+		/* a fragment is limited to a quarter of the receive buffer */
+		chunk = min_t(int, remaining, sk->sk_rcvbuf / 4);
+
+		frag = alloc_skb(chunk, GFP_ATOMIC | GFP_DMA);
+		if (!frag)
+			return -ENOMEM;
+
+		/* copy target class to control buffer of new skb */
+		memcpy(CB_TRGCLS(frag), CB_TRGCLS(skb), CB_TRGCLS_LEN);
+
+		/* copy data fragment */
+		memcpy(frag->data, skb->data + offset, chunk);
+		offset += chunk;
+
+		skb_reset_transport_header(frag);
+		skb_reset_network_header(frag);
+		frag->len = chunk;
+
+		skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, frag);
+	}
+
+	return 0;
+}
+
+/* iucv_process_message() - Receive a single outstanding IUCV message
+ *
+ * @sk:   receiving socket
+ * @skb:  pre-allocated skb for the message data; consumed by this function
+ * @path: IUCV path the message arrived on
+ * @msg:  the pending IUCV message
+ *
+ * Receives the message data into @skb and queues it on the socket's
+ * receive queue. For SOCK_STREAM sockets a large message is first split
+ * into fragments (see iucv_fragment_skb()). If the receive queue does not
+ * take the skb, it is put at the head of the backlog queue instead. If
+ * receiving the data fails, @skb is freed and nothing is queued.
+ *
+ * Locking: must be called with message_q.lock held
+ */
+static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
+				 struct iucv_path *path,
+				 struct iucv_message *msg)
+{
+	int rc;
+	unsigned int len;
+
+	len = iucv_msg_length(msg);
+
+	/* store msg target class in the second 4 bytes of skb ctrl buffer */
+	/* Note: the first 4 bytes are reserved for msg tag */
+	memcpy(CB_TRGCLS(skb), &msg->class, CB_TRGCLS_LEN);
+
+	/* check for special IPRM messages (e.g. iucv_sock_shutdown) */
+	if ((msg->flags & IUCV_IPRMDATA) && len > 7) {
+		/* the shutdown marker is passed on as an empty skb */
+		if (memcmp(msg->rmmsg, iprm_shutdown, 8) == 0) {
+			skb->data = NULL;
+			skb->len = 0;
+		}
+	} else {
+		rc = iucv_message_receive(path, msg, msg->flags & IUCV_IPRMDATA,
+					  skb->data, len, NULL);
+		if (rc) {
+			kfree_skb(skb);
+			return;
+		}
+		/* we need to fragment iucv messages for SOCK_STREAM only;
+		 * for SOCK_SEQPACKET, it is only relevant if we support
+		 * record segmentation using MSG_EOR (see also recvmsg()) */
+		if (sk->sk_type == SOCK_STREAM &&
+		    skb->truesize >= sk->sk_rcvbuf / 4) {
+			rc = iucv_fragment_skb(sk, skb, len);
+			kfree_skb(skb);
+			skb = NULL;
+			if (rc) {
+				iucv_path_sever(path, NULL);
+				return;
+			}
+			/* continue with the first fragment */
+			skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q);
+		} else {
+			skb_reset_transport_header(skb);
+			skb_reset_network_header(skb);
+			skb->len = len;
+		}
+	}
+
+	/* receive queue refused the skb: keep it first in the backlog */
+	if (sock_queue_rcv_skb(sk, skb))
+		skb_queue_head(&iucv_sk(sk)->backlog_skb_q, skb);
+}
+
+/* iucv_process_message_q() - Process outstanding IUCV messages
+ *
+ * Walks the list of saved messages in arrival order, receives each one
+ * through iucv_process_message() and frees its list entry. Processing
+ * stops early when an skb cannot be allocated or as soon as the backlog
+ * queue is no longer empty.
+ *
+ * Locking: must be called with message_q.lock held
+ */
+static void iucv_process_message_q(struct sock *sk)
+{
+	struct iucv_sock *iucv = iucv_sk(sk);
+	struct sock_msg_q *saved, *next;
+	struct sk_buff *skb;
+
+	list_for_each_entry_safe(saved, next, &iucv->message_q.list, list) {
+		skb = alloc_skb(iucv_msg_length(&saved->msg),
+				GFP_ATOMIC | GFP_DMA);
+		if (!skb)
+			return;
+
+		iucv_process_message(sk, skb, saved->path, &saved->msg);
+		list_del(&saved->list);
+		kfree(saved);
+
+		/* the receive queue is full once data lands in the backlog */
+		if (!skb_queue_empty(&iucv->backlog_skb_q))
+			return;
+	}
+}
+
+/*
+ * Receive data from a socket.
+ *
+ * @iocb:  unused here
+ * @sock:  socket to read from
+ * @msg:   message header receiving data, flags and the target class cmsg
+ * @len:   size of the receive buffer
+ * @flags: MSG_* flags (MSG_OOB is rejected, MSG_PEEK/MSG_TRUNC honoured)
+ *
+ * Copies at most @len bytes of the next skb to user space and attaches the
+ * IUCV target class as an SCM_IUCV_TRGCLS control message. For SOCK_STREAM
+ * a partially read skb is put back on the receive queue; for
+ * SOCK_SEQPACKET surplus data is discarded and MSG_TRUNC is set. After a
+ * complete, non-peek read the backlog queue and saved messages are moved
+ * towards the receive queue.
+ *
+ * Returns the number of bytes copied (for SOCK_SEQPACKET with MSG_TRUNC in
+ * @flags: the real record length), 0 if the peer is gone and nothing is
+ * left to read or receiving was shut down, or a negative errno value.
+ */
+static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+			     struct msghdr *msg, size_t len, int flags)
+{
+	int noblock = flags & MSG_DONTWAIT;
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv = iucv_sk(sk);
+	unsigned int copied, rlen;
+	struct sk_buff *skb, *rskb, *cskb;
+	int err = 0;
+
+	/* connection is gone and every queue is drained: report EOF */
+	if ((sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_SEVERED) &&
+	    skb_queue_empty(&iucv->backlog_skb_q) &&
+	    skb_queue_empty(&sk->sk_receive_queue) &&
+	    list_empty(&iucv->message_q.list))
+		return 0;
+
+	if (flags & (MSG_OOB))
+		return -EOPNOTSUPP;
+
+	/* receive/dequeue next skb:
+	 * the function understands MSG_PEEK and, thus, does not dequeue skb */
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb) {
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			return 0;
+		return err;
+	}
+
+	rlen   = skb->len;		/* real length of skb */
+	copied = min_t(unsigned int, rlen, len);
+
+	cskb = skb;
+	if (memcpy_toiovec(msg->msg_iov, cskb->data, copied)) {
+		/* undo the dequeue so that the data is not lost */
+		if (!(flags & MSG_PEEK))
+			skb_queue_head(&sk->sk_receive_queue, skb);
+		return -EFAULT;
+	}
+
+	/* SOCK_SEQPACKET: set MSG_TRUNC if recv buf size is too small */
+	if (sk->sk_type == SOCK_SEQPACKET) {
+		if (copied < rlen)
+			msg->msg_flags |= MSG_TRUNC;
+		/* each iucv message contains a complete record */
+		msg->msg_flags |= MSG_EOR;
+	}
+
+	/* create control message to store iucv msg target class:
+	 * get the trgcls from the control buffer of the skb due to
+	 * fragmentation of original iucv message. */
+	err = put_cmsg(msg, SOL_IUCV, SCM_IUCV_TRGCLS,
+			CB_TRGCLS_LEN, CB_TRGCLS(skb));
+	if (err) {
+		if (!(flags & MSG_PEEK))
+			skb_queue_head(&sk->sk_receive_queue, skb);
+		return err;
+	}
+
+	/* Mark read part of skb as used */
+	if (!(flags & MSG_PEEK)) {
+
+		/* SOCK_STREAM: re-queue skb if it contains unreceived data */
+		if (sk->sk_type == SOCK_STREAM) {
+			skb_pull(skb, copied);
+			if (skb->len) {
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				goto done;
+			}
+		}
+
+		kfree_skb(skb);
+
+		/* Queue backlog skbs */
+		spin_lock_bh(&iucv->message_q.lock);
+		rskb = skb_dequeue(&iucv->backlog_skb_q);
+		while (rskb) {
+			if (sock_queue_rcv_skb(sk, rskb)) {
+				/* still no room: put it back and stop */
+				skb_queue_head(&iucv->backlog_skb_q,
+						rskb);
+				break;
+			} else {
+				rskb = skb_dequeue(&iucv->backlog_skb_q);
+			}
+		}
+		/* backlog drained: now fetch messages saved by callback_rx */
+		if (skb_queue_empty(&iucv->backlog_skb_q)) {
+			if (!list_empty(&iucv->message_q.list))
+				iucv_process_message_q(sk);
+		}
+		spin_unlock_bh(&iucv->message_q.lock);
+	}
+
+done:
+	/* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */
+	if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC))
+		copied = rlen;
+
+	return copied;
+}
+
+/*
+ * Poll helper for listening sockets: report the socket as readable
+ * (POLLIN | POLLRDNORM) if at least one child on @parent's accept queue is
+ * in state IUCV_CONNECTED, otherwise return 0.
+ */
+static inline unsigned int iucv_accept_poll(struct sock *parent)
+{
+	struct iucv_sock *child, *tmp;
+
+	list_for_each_entry_safe(child, tmp, &iucv_sk(parent)->accept_q,
+				 accept_q) {
+		if (((struct sock *) child)->sk_state == IUCV_CONNECTED)
+			return POLLIN | POLLRDNORM;
+	}
+
+	return 0;
+}
+
+/*
+ * Poll an IUCV socket and return the mask of pending events.
+ *
+ * A listening socket only reports whether a connection can be accepted.
+ * For all other sockets the mask is derived from the error state, the
+ * shutdown flags, the receive queue, the connection state and the
+ * available send space. If the socket is not writeable,
+ * SOCK_ASYNC_NOSPACE is set on the struct socket.
+ */
+unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
+			    poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int events = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
+	if (sk->sk_state == IUCV_LISTEN)
+		return iucv_accept_poll(sk);
+
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		events |= POLLERR;
+
+	/* a receive shutdown also counts as "readable" */
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		events |= POLLRDHUP | POLLIN | POLLRDNORM;
+	else if (!skb_queue_empty(&sk->sk_receive_queue))
+		events |= POLLIN | POLLRDNORM;
+
+	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == IUCV_CLOSED)
+		events |= POLLHUP;
+
+	if (sk->sk_state == IUCV_DISCONN || sk->sk_state == IUCV_SEVERED)
+		events |= POLLIN;
+
+	if (sock_writeable(sk))
+		events |= POLLOUT | POLLWRNORM | POLLWRBAND;
+	else
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	return events;
+}
+
+/*
+ * Shut down one or both directions of a socket.
+ *
+ * @sock: socket to shut down
+ * @how:  SHUT_RD, SHUT_WR or SHUT_RDWR; incremented by one below so that
+ *        it can be used as a RCV_SHUTDOWN / SEND_SHUTDOWN bit mask
+ *
+ * Shutting down the send direction sends the iprm_shutdown marker to the
+ * peer; shutting down the receive direction quiesces the IUCV path and
+ * purges the receive queue.
+ *
+ * Returns 0 on success, -EINVAL for an invalid @how, -ENOTCONN if the
+ * socket is not connected (including sockets that have no IUCV path yet)
+ * and -ECONNRESET if sending the shutdown marker fails with return code 2.
+ */
+static int iucv_sock_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv = iucv_sk(sk);
+	struct iucv_message txmsg;
+	int err = 0;
+
+	how++;
+
+	if ((how & ~SHUTDOWN_MASK) || !how)
+		return -EINVAL;
+
+	lock_sock(sk);
+	switch (sk->sk_state) {
+	case IUCV_DISCONN:
+	case IUCV_CLOSING:
+	case IUCV_SEVERED:
+	case IUCV_CLOSED:
+		err = -ENOTCONN;
+		goto fail;
+
+	default:
+		break;
+	}
+
+	/*
+	 * Sockets that were never connected (open, bound or listening) have
+	 * no path; both branches below would dereference a NULL path.
+	 */
+	if (!iucv->path) {
+		err = -ENOTCONN;
+		goto fail;
+	}
+
+	sk->sk_shutdown |= how;
+
+	if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) {
+		txmsg.class = 0;
+		txmsg.tag = 0;
+		err = iucv_message_send(iucv->path, &txmsg, IUCV_IPRMDATA, 0,
+					(void *) iprm_shutdown, 8);
+		/* map the IUCV return code: 2 means reset, all else unconnected */
+		if (err)
+			err = (err == 2) ? -ECONNRESET : -ENOTCONN;
+	}
+
+	if (how == RCV_SHUTDOWN || how == SHUTDOWN_MASK) {
+		err = iucv_path_quiesce(iucv->path, NULL);
+		if (err)
+			err = -ENOTCONN;
+
+		skb_queue_purge(&sk->sk_receive_queue);
+	}
+
+	/* Wake up anyone sleeping in poll */
+	sk->sk_state_change(sk);
+
+fail:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Release a socket (the .release hook of iucv_sock_ops): close it, sever
+ * and free a path that is still attached, orphan the socket and try to
+ * kill it. Always returns 0; a socket without a struct sock is ignored.
+ */
+static int iucv_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv;
+
+	if (!sk)
+		return 0;
+
+	iucv_sock_close(sk);
+
+	/* Unregister with IUCV base support */
+	iucv = iucv_sk(sk);
+	if (iucv->path) {
+		iucv_path_sever(iucv->path, NULL);
+		iucv_path_free(iucv->path);
+		iucv->path = NULL;
+	}
+
+	sock_orphan(sk);
+	iucv_sock_kill(sk);
+	return 0;
+}
+
+/* getsockopt and setsockopt */
+/*
+ * Set a SOL_IUCV socket option.
+ *
+ * SO_IPRMDATA_MSG: a non-zero value sets the IUCV_IPRMDATA flag of the
+ *                  socket, zero clears it.
+ * SO_MSGLIMIT:     sets the message limit; only accepted while the socket
+ *                  is open or bound and for values from 1 to 65535.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or an unknown
+ * option, -EINVAL for a too short @optlen or a rejected message limit and
+ * -EFAULT if the value cannot be read from user space.
+ */
+static int iucv_sock_setsockopt(struct socket *sock, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv = iucv_sk(sk);
+	int rc = 0;
+	int val;
+
+	if (level != SOL_IUCV)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *) optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+	switch (optname) {
+	case SO_IPRMDATA_MSG:
+		if (val)
+			iucv->flags |= IUCV_IPRMDATA;
+		else
+			iucv->flags &= ~IUCV_IPRMDATA;
+		break;
+	case SO_MSGLIMIT:
+		/* the limit can only be changed before connect/listen */
+		if ((sk->sk_state != IUCV_OPEN &&
+		     sk->sk_state != IUCV_BOUND) ||
+		    val < 1 || val > (u16)(~0))
+			rc = -EINVAL;
+		else
+			iucv->msglimit = val;
+		break;
+	default:
+		rc = -ENOPROTOOPT;
+		break;
+	}
+	release_sock(sk);
+
+	return rc;
+}
+
+/*
+ * Read a SOL_IUCV socket option.
+ *
+ * SO_IPRMDATA_MSG: 1 if the IUCV_IPRMDATA flag of the socket is set,
+ *                  otherwise 0.
+ * SO_MSGLIMIT:     the message limit of the path if one exists, otherwise
+ *                  the limit configured for the socket.
+ *
+ * At most sizeof(int) bytes are copied to @optval and the length actually
+ * used is written back to @optlen. Returns 0 on success, -ENOPROTOOPT for
+ * another level or an unknown option, -EINVAL for a negative length and
+ * -EFAULT if user space cannot be accessed.
+ */
+static int iucv_sock_getsockopt(struct socket *sock, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct iucv_sock *iucv = iucv_sk(sk);
+	int val, len;
+
+	if (level != SOL_IUCV)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < 0)
+		return -EINVAL;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	switch (optname) {
+	case SO_IPRMDATA_MSG:
+		val = !!(iucv->flags & IUCV_IPRMDATA);
+		break;
+	case SO_MSGLIMIT:
+		lock_sock(sk);
+		if (iucv->path != NULL)
+			val = iucv->path->msglim;	/* connected */
+		else
+			val = iucv->msglimit;		/* default */
+		release_sock(sk);
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+
+/* Callback wrappers - called from iucv base support */
+/*
+ * Handle an incoming connection request.
+ *
+ * @path:   the pending IUCV path
+ * @ipvmid: user id of the connecting guest
+ * @ipuser: 16 bytes of user data: bytes 0-7 name the target (listening)
+ *          application, bytes 8-15 the connecting application (EBCDIC)
+ *
+ * Looks for a listening socket whose source name matches the first half of
+ * @ipuser. If one is found and its accept backlog is not full, a new
+ * socket is created for the connection, the path is accepted and the new
+ * socket is queued on the listener's accept queue. Otherwise the path is
+ * severed and freed.
+ *
+ * Returns -EINVAL if no listening socket matches, so that the path is not
+ * treated as one of ours. In every other case 0 is returned - also after
+ * the path had to be severed; the value kept in err is not propagated.
+ */
+static int iucv_callback_connreq(struct iucv_path *path,
+				 u8 ipvmid[8], u8 ipuser[16])
+{
+	unsigned char user_data[16];
+	unsigned char nuser_data[16];
+	unsigned char src_name[8];
+	struct hlist_node *node;
+	struct sock *sk, *nsk;
+	struct iucv_sock *iucv, *niucv;
+	int err;
+
+	memcpy(src_name, ipuser, 8);
+	EBCASC(src_name, 8);
+	/* Find out if this path belongs to af_iucv. */
+	read_lock(&iucv_sk_list.lock);
+	iucv = NULL;
+	sk = NULL;
+	sk_for_each(sk, node, &iucv_sk_list.head)
+		if (sk->sk_state == IUCV_LISTEN &&
+		    !memcmp(&iucv_sk(sk)->src_name, src_name, 8)) {
+			/*
+			 * Found a listening socket with
+			 * src_name == ipuser[0-7].
+			 */
+			iucv = iucv_sk(sk);
+			break;
+		}
+	read_unlock(&iucv_sk_list.lock);
+	if (!iucv)
+		/* No socket found, not one of our paths. */
+		return -EINVAL;
+
+	bh_lock_sock(sk);
+
+	/* Check if parent socket is listening */
+	low_nmcpy(user_data, iucv->src_name);
+	high_nmcpy(user_data, iucv->dst_name);
+	ASCEBC(user_data, sizeof(user_data));
+	if (sk->sk_state != IUCV_LISTEN) {
+		err = iucv_path_sever(path, user_data);
+		iucv_path_free(path);
+		goto fail;
+	}
+
+	/* Check for backlog size */
+	if (sk_acceptq_is_full(sk)) {
+		err = iucv_path_sever(path, user_data);
+		iucv_path_free(path);
+		goto fail;
+	}
+
+	/* Create the new socket */
+	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+	if (!nsk) {
+		err = iucv_path_sever(path, user_data);
+		iucv_path_free(path);
+		goto fail;
+	}
+
+	niucv = iucv_sk(nsk);
+	iucv_sock_init(nsk, sk);
+
+	/* Set the new iucv_sock */
+	memcpy(niucv->dst_name, ipuser + 8, 8);
+	EBCASC(niucv->dst_name, 8);
+	memcpy(niucv->dst_user_id, ipvmid, 8);
+	memcpy(niucv->src_name, iucv->src_name, 8);
+	memcpy(niucv->src_user_id, iucv->src_user_id, 8);
+	niucv->path = path;
+
+	/* Call iucv_accept */
+	high_nmcpy(nuser_data, ipuser + 8);
+	memcpy(nuser_data + 8, niucv->src_name, 8);
+	ASCEBC(nuser_data + 8, 8);
+
+	/* set message limit for path based on msglimit of accepting socket */
+	niucv->msglimit = iucv->msglimit;
+	path->msglim = iucv->msglimit;
+	err = iucv_path_accept(path, &af_iucv_handler, nuser_data, nsk);
+	if (err) {
+		err = iucv_path_sever(path, user_data);
+		iucv_path_free(path);
+		/* NOTE(review): nsk is not flagged SOCK_ZAPPED at this point,
+		 * so iucv_sock_kill() looks like a no-op here - confirm that
+		 * the new socket is released elsewhere */
+		iucv_sock_kill(nsk);
+		goto fail;
+	}
+
+	iucv_accept_enqueue(sk, nsk);
+
+	/* Wake up accept */
+	nsk->sk_state = IUCV_CONNECTED;
+	sk->sk_data_ready(sk, 1);
+	err = 0;
+fail:
+	bh_unlock_sock(sk);
+	return 0;
+}
+
+/*
+ * Connection acknowledged by the peer: move the socket that owns @path to
+ * IUCV_CONNECTED and notify waiters (e.g. iucv_sock_connect()) through
+ * sk_state_change. @ipuser is not used.
+ */
+static void iucv_callback_connack(struct iucv_path *path, u8 ipuser[16])
+{
+	struct sock *sk = path->private;
+
+	sk->sk_state = IUCV_CONNECTED;
+	sk->sk_state_change(sk);
+}
+
+/*
+ * A message is pending on @path.
+ *
+ * If receiving has been shut down, the message is rejected. Otherwise it
+ * is received right away into a new skb, unless older messages are still
+ * waiting (saved message list or backlog queue not empty), the receive
+ * buffer would be exceeded, or no skb can be allocated. In those cases the
+ * message descriptor is saved on message_q for later processing by
+ * iucv_process_message_q(); if even that allocation fails, the message is
+ * left unhandled.
+ */
+static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg)
+{
+	struct sock *sk = path->private;
+	struct iucv_sock *iucv = iucv_sk(sk);
+	struct sk_buff *skb;
+	struct sock_msg_q *save_msg;
+	int len;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN) {
+		iucv_message_reject(path, msg);
+		return;
+	}
+
+	spin_lock(&iucv->message_q.lock);
+
+	/* keep the message order: never overtake already queued data */
+	if (!list_empty(&iucv->message_q.list) ||
+	    !skb_queue_empty(&iucv->backlog_skb_q))
+		goto save_message;
+
+	/* would the new skb exceed the receive buffer limit? */
+	len = atomic_read(&sk->sk_rmem_alloc);
+	len += iucv_msg_length(msg) + sizeof(struct sk_buff);
+	if (len > sk->sk_rcvbuf)
+		goto save_message;
+
+	skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA);
+	if (!skb)
+		goto save_message;
+
+	iucv_process_message(sk, skb, path, msg);
+	goto out_unlock;
+
+save_message:
+	save_msg = kzalloc(sizeof(struct sock_msg_q), GFP_ATOMIC | GFP_DMA);
+	if (!save_msg)
+		goto out_unlock;
+	save_msg->path = path;
+	save_msg->msg = *msg;
+
+	list_add_tail(&save_msg->list, &iucv->message_q.list);
+
+out_unlock:
+	spin_unlock(&iucv->message_q.lock);
+}
+
+/*
+ * A sent message has been completed.
+ *
+ * Searches the send queue for the skb whose saved tag (see
+ * iucv_sock_sendmsg()) equals the tag of @msg, removes and frees it and
+ * wakes up senders waiting for the message limit. If the socket is being
+ * closed and this was the last outstanding message, the socket moves to
+ * IUCV_CLOSED.
+ *
+ * The function BUGs if no matching skb is found, which includes the case
+ * of an empty send queue.
+ */
+static void iucv_callback_txdone(struct iucv_path *path,
+				 struct iucv_message *msg)
+{
+	struct sock *sk = path->private;
+	struct sk_buff *this = NULL;
+	struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q;
+	/* NOTE(review): list->next is sampled here, before list->lock is
+	 * taken below - confirm this cannot race with queue updates */
+	struct sk_buff *list_skb = list->next;
+	unsigned long flags;
+
+	if (!skb_queue_empty(list)) {
+		spin_lock_irqsave(&list->lock, flags);
+
+		/* walk the queue until we are back at the list head */
+		while (list_skb != (struct sk_buff *)list) {
+			if (!memcmp(&msg->tag, CB_TAG(list_skb), CB_TAG_LEN)) {
+				this = list_skb;
+				break;
+			}
+			list_skb = list_skb->next;
+		}
+		if (this)
+			__skb_unlink(this, list);
+
+		spin_unlock_irqrestore(&list->lock, flags);
+
+		if (this) {
+			kfree_skb(this);
+			/* wake up any process waiting for sending */
+			iucv_sock_wake_msglim(sk);
+		}
+	}
+	BUG_ON(!this);
+
+	/* lets iucv_sock_close() finish once all sends are confirmed */
+	if (sk->sk_state == IUCV_CLOSING) {
+		if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) {
+			sk->sk_state = IUCV_CLOSED;
+			sk->sk_state_change(sk);
+		}
+	}
+
+}
+
+/*
+ * The path has been rejected or severed by the peer. The socket becomes
+ * IUCV_SEVERED while its accept_q list entry is not empty and
+ * IUCV_DISCONN otherwise; waiters are notified through sk_state_change.
+ * @ipuser is not used.
+ */
+static void iucv_callback_connrej(struct iucv_path *path, u8 ipuser[16])
+{
+	struct sock *sk = path->private;
+
+	sk->sk_state = list_empty(&iucv_sk(sk)->accept_q) ? IUCV_DISCONN
+							  : IUCV_SEVERED;
+	sk->sk_state_change(sk);
+}
+
+/*
+ * Called if the other communication side shuts down its RECV direction.
+ * Unless the socket is already closed, SEND_SHUTDOWN is set to disable
+ * further sending of data and waiters are notified. @ipuser is not used.
+ */
+static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16])
+{
+	struct sock *sk = path->private;
+
+	bh_lock_sock(sk);
+	if (sk->sk_state == IUCV_CLOSED)
+		goto unlock;
+
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+	sk->sk_state_change(sk);
+unlock:
+	bh_unlock_sock(sk);
+}
+
+/*
+ * Socket operations for AF_IUCV sockets. iucv_sock_create() installs this
+ * table for both SOCK_STREAM and SOCK_SEQPACKET sockets; ioctl, mmap and
+ * socketpair are not supported and use the generic sock_no_* stubs.
+ */
+static const struct proto_ops iucv_sock_ops = {
+	.family		= PF_IUCV,
+	.owner		= THIS_MODULE,
+	.release	= iucv_sock_release,
+	.bind		= iucv_sock_bind,
+	.connect	= iucv_sock_connect,
+	.listen		= iucv_sock_listen,
+	.accept		= iucv_sock_accept,
+	.getname	= iucv_sock_getname,
+	.sendmsg	= iucv_sock_sendmsg,
+	.recvmsg	= iucv_sock_recvmsg,
+	.poll		= iucv_sock_poll,
+	.ioctl		= sock_no_ioctl,
+	.mmap		= sock_no_mmap,
+	.socketpair	= sock_no_socketpair,
+	.shutdown	= iucv_sock_shutdown,
+	.setsockopt	= iucv_sock_setsockopt,
+	.getsockopt	= iucv_sock_getsockopt,
+};
+
+/*
+ * Address family descriptor registered through sock_register() in
+ * afiucv_init(); new AF_IUCV sockets are created by iucv_sock_create().
+ */
+static const struct net_proto_family iucv_sock_family_ops = {
+	.family	= AF_IUCV,
+	.owner	= THIS_MODULE,
+	.create	= iucv_sock_create,
+};
+
+/*
+ * Module initialisation.
+ *
+ * Requires z/VM. Queries the guest's user id, then registers the IUCV
+ * handler, the protocol, the socket family, the dummy driver and the
+ * dummy device - in this order. On failure everything registered so far
+ * is unwound in reverse order.
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT when not running under z/VM or
+ * when the user id cannot be queried, -ENOMEM if the device cannot be
+ * allocated, or the error of the failing registration call.
+ */
+static int __init afiucv_init(void)
+{
+	int err;
+
+	if (!MACHINE_IS_VM) {
+		pr_err("The af_iucv module cannot be loaded"
+		       " without z/VM\n");
+		err = -EPROTONOSUPPORT;
+		goto out;
+	}
+	cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err);
+	if (unlikely(err)) {
+		WARN_ON(err);
+		err = -EPROTONOSUPPORT;
+		goto out;
+	}
+
+	err = iucv_register(&af_iucv_handler, 0);
+	if (err)
+		goto out;
+	err = proto_register(&iucv_proto, 0);
+	if (err)
+		goto out_iucv;
+	err = sock_register(&iucv_sock_family_ops);
+	if (err)
+		goto out_proto;
+	/* establish dummy device */
+	err = driver_register(&af_iucv_driver);
+	if (err)
+		goto out_sock;
+	af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+	if (!af_iucv_dev) {
+		err = -ENOMEM;
+		goto out_driver;
+	}
+	dev_set_name(af_iucv_dev, "af_iucv");
+	af_iucv_dev->bus = &iucv_bus;
+	af_iucv_dev->parent = iucv_root;
+	af_iucv_dev->release = (void (*)(struct device *))kfree;
+	af_iucv_dev->driver = &af_iucv_driver;
+	err = device_register(af_iucv_dev);
+	if (err)
+		goto out_dev;
+
+	return 0;
+
+out_dev:
+	/*
+	 * device_register() initialises the reference count even when it
+	 * fails; dropping that reference invokes ->release (kfree) and thus
+	 * frees the device instead of leaking it.
+	 */
+	put_device(af_iucv_dev);
+out_driver:
+	driver_unregister(&af_iucv_driver);
+out_sock:
+	sock_unregister(PF_IUCV);
+out_proto:
+	proto_unregister(&iucv_proto);
+out_iucv:
+	iucv_unregister(&af_iucv_handler, 0);
+out:
+	return err;
+}
+
+/*
+ * Module exit: unregister everything afiucv_init() registered, in reverse
+ * order of registration.
+ */
+static void __exit afiucv_exit(void)
+{
+	device_unregister(af_iucv_dev);
+	driver_unregister(&af_iucv_driver);
+	sock_unregister(PF_IUCV);
+	proto_unregister(&iucv_proto);
+	iucv_unregister(&af_iucv_handler, 0);
+}
+
+module_init(afiucv_init);
+module_exit(afiucv_exit);
+
+MODULE_AUTHOR("Jennifer Hunt <jenhunt@us.ibm.com>");
+MODULE_DESCRIPTION("IUCV Sockets ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_IUCV);
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
new file mode 100644
index 00000000..7f912491
--- /dev/null
+++ b/net/iucv/iucv.c
@@ -0,0 +1,2098 @@
+/*
+ * IUCV base infrastructure.
+ *
+ * Copyright IBM Corp. 2001, 2009
+ *
+ * Author(s):
+ *    Original source:
+ *	Alan Altmark (Alan_Altmark@us.ibm.com)	Sept. 2000
+ *	Xenia Tkatschow (xenia@us.ibm.com)
+ *    2Gb awareness and general cleanup:
+ *	Fritz Elfert (elfert@de.ibm.com, felfert@millenux.com)
+ *    Rewritten for af_iucv:
+ *	Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *    PM functions:
+ *	Ursula Braun (ursula.braun@de.ibm.com)
+ *
+ * Documentation used:
+ *    The original source
+ *    CP Programming Service, IBM document # SC24-5760
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define KMSG_COMPONENT "iucv"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt	/* "iucv: " message prefix */
+
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/reboot.h>
+#include <net/iucv/iucv.h>
+#include <asm/atomic.h>
+#include <asm/ebcdic.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/smp.h>
+
+/*
+ * FLAGS:
+ * All flags are defined in the field IPFLAGS1 of each function
+ * and can be found in CP Programming Services.
+ * IPSRCCLS - Indicates you have specified a source class.
+ * IPTRGCLS - Indicates you have specified a target class.
+ * IPFGPID  - Indicates you have specified a pathid.
+ * IPFGMID  - Indicates you have specified a message ID.
+ * IPNORPY  - Indicates a one-way message. No reply expected.
+ * IPALL    - Indicates that all paths are affected.
+ */
+#define IUCV_IPSRCCLS	0x01
+#define IUCV_IPTRGCLS	0x01	/* same bit value as IUCV_IPSRCCLS */
+#define IUCV_IPFGPID	0x02
+#define IUCV_IPFGMID	0x04
+#define IUCV_IPNORPY	0x10
+#define IUCV_IPALL	0x80
+
+static int iucv_bus_match(struct device *dev, struct device_driver *drv)
+{
+	return 0;	/* both arguments are ignored; always reports 0 */
+}
+
+enum iucv_pm_states {
+	IUCV_PM_INITIAL = 0,	/* also the start value of iucv_pm_state */
+	IUCV_PM_FREEZING = 1,
+	IUCV_PM_THAWING = 2,
+	IUCV_PM_RESTORING = 3,
+};
+static enum iucv_pm_states iucv_pm_state;	/* static, so zero-initialised */
+
+static int iucv_pm_prepare(struct device *);	/* static PM callbacks, */
+static void iucv_pm_complete(struct device *);	/* defined later in this file */
+static int iucv_pm_freeze(struct device *);
+static int iucv_pm_thaw(struct device *);
+static int iucv_pm_restore(struct device *);
+
+static const struct dev_pm_ops iucv_pm_ops = {	/* referenced by iucv_bus.pm */
+	.prepare = iucv_pm_prepare,
+	.complete = iucv_pm_complete,
+	.freeze = iucv_pm_freeze,
+	.thaw = iucv_pm_thaw,
+	.restore = iucv_pm_restore,
+};
+
+struct bus_type iucv_bus = {
+	.name = "iucv",
+	.match = iucv_bus_match,	/* always returns 0 */
+	.pm = &iucv_pm_ops,
+};
+EXPORT_SYMBOL(iucv_bus);	/* af_iucv sets it as the bus of its device */
+
+struct device *iucv_root;	/* af_iucv uses it as parent of its device */
+EXPORT_SYMBOL(iucv_root);
+
+static int iucv_available;	/* while 0, iucv_register() fails with -ENOSYS */
+
+/* General IUCV interrupt structure */
+struct iucv_irq_data {
+	u16 ippathid;	/* used as index into iucv_path_table */
+	u8  ipflags1;
+	u8  iptype;
+	u32 res2[8];
+};
+
+struct iucv_irq_list {
+	struct list_head list;	/* queue linkage, e.g. in iucv_task_queue */
+	struct iucv_irq_data data;	/* the queued interrupt data itself */
+};
+
+static struct iucv_irq_data *iucv_irq_data[NR_CPUS];	/* per-cpu irq buffers */
+static cpumask_t iucv_buffer_cpumask = { CPU_BITS_NONE }; /* buffer declared */
+static cpumask_t iucv_irq_cpumask = { CPU_BITS_NONE };	/* irqs allowed */
+
+/*
+ * Queue of interrupt buffers for delivery via the tasklet
+ * (fast but can't call smp_call_function).
+ */
+static LIST_HEAD(iucv_task_queue);
+
+/*
+ * The tasklet for fast delivery of iucv interrupts.
+ */
+static void iucv_tasklet_fn(unsigned long);
+static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0);
+
+/*
+ * Queue of interrupt buffers for delivery via a work queue
+ * (slower but can call smp_call_function).
+ */
+static LIST_HEAD(iucv_work_queue);
+
+/*
+ * The work element to deliver path pending interrupts.
+ */
+static void iucv_work_fn(struct work_struct *work);
+static DECLARE_WORK(iucv_work, iucv_work_fn);
+
+/*
+ * Spinlock protecting task and work queue (taken with spin_lock_irq).
+ */
+static DEFINE_SPINLOCK(iucv_queue_lock);
+
+enum iucv_command_codes {	/* first argument of iucv_call_b2f0() */
+	IUCV_QUERY = 0,
+	IUCV_RETRIEVE_BUFFER = 2,
+	IUCV_SEND = 4,
+	IUCV_RECEIVE = 5,
+	IUCV_REPLY = 6,
+	IUCV_REJECT = 8,
+	IUCV_PURGE = 9,
+	IUCV_ACCEPT = 10,
+	IUCV_CONNECT = 11,
+	IUCV_DECLARE_BUFFER = 12,
+	IUCV_QUIESCE = 13,
+	IUCV_RESUME = 14,
+	IUCV_SEVER = 15,
+	IUCV_SETMASK = 16,
+	IUCV_SETCONTROLMASK = 17,
+};
+
+/*
+ * Error messages passed as 16-byte user data to iucv_sever_pathid. They
+ * get converted to EBCDIC.
+ */
+static char iucv_error_no_listener[16] = "NO LISTENER";
+static char iucv_error_no_memory[16] = "NO MEMORY";
+static char iucv_error_pathid[16] = "INVALID PATHID";
+
+/*
+ * iucv_handler_list: List of registered handlers.
+ */
+static LIST_HEAD(iucv_handler_list);
+
+/*
+ * iucv_path_table: an array of pointers to iucv_path structures.
+ */
+static struct iucv_path **iucv_path_table;
+static unsigned long iucv_max_pathid;
+
+/*
+ * iucv_table_lock: spinlock protecting iucv_handler_list and iucv_path_table
+ */
+static DEFINE_SPINLOCK(iucv_table_lock);
+
+/*
+ * iucv_active_cpu: contains the number of the cpu executing the tasklet
+ * or the work handler. Needed for iucv_path_sever called from tasklet.
+ */
+static int iucv_active_cpu = -1;
+
+/*
+ * Mutex serializing iucv_register/iucv_unregister.
+ */
+static DEFINE_MUTEX(iucv_register_mutex);
+
+/*
+ * Counter for number of non-smp capable handlers.
+ */
+static int iucv_nonsmp_handler;
+
+/*
+ * IUCV control data structure. Used by iucv_path_accept, iucv_path_connect,
+ * iucv_path_quiesce, iucv_path_resume and iucv_path_sever/iucv_sever_pathid.
+ */
+struct iucv_cmd_control {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iprcode;	/* returned by iucv_call_b2f0() on condition code 1 */
+	u16 ipmsglim;
+	u16 res1;
+	u8  ipvmid[8];
+	u8  ipuser[16];
+	u8  iptarget[8];
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Data in parameter list iucv structure. Used by iucv_message_send,
+ * iucv_message_send2way and iucv_message_reply.
+ */
+struct iucv_cmd_dpl {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iprcode;	/* byte 3, same offset as ctrl.iprcode */
+	u32 ipmsgid;
+	u32 iptrgcls;
+	u8  iprmmsg[8];
+	u32 ipsrccls;
+	u32 ipmsgtag;
+	u32 ipbfadr2;
+	u32 ipbfln2f;
+	u32 res;
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Data in buffer iucv structure. Used by iucv_message_receive,
+ * iucv_message_reject, iucv_message_send, iucv_message_send2way
+ * and iucv_declare_cpu.
+ */
+struct iucv_cmd_db {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iprcode;	/* byte 3, same offset as ctrl.iprcode */
+	u32 ipmsgid;
+	u32 iptrgcls;
+	u32 ipbfadr1;	/* buffer address (32 bit) */
+	u32 ipbfln1f;	/* buffer length on input */
+	u32 ipsrccls;
+	u32 ipmsgtag;
+	u32 ipbfadr2;
+	u32 ipbfln2f;
+	u32 res;
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Purge message iucv structure. Used by iucv_message_purge.
+ */
+struct iucv_cmd_purge {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iprcode;	/* byte 3, same offset as ctrl.iprcode */
+	u32 ipmsgid;
+	u8  ipaudit[3];	/* read as (u32 >> 8) by iucv_message_purge */
+	u8  res1[5];
+	u32 res2;
+	u32 ipsrccls;
+	u32 ipmsgtag;
+	u32 res3[3];
+} __attribute__ ((packed,aligned(8)));
+
+/*
+ * Set mask iucv structure. Used by iucv_allow_cpu and iucv_block_cpu*.
+ */
+struct iucv_cmd_set_mask {
+	u8  ipmask;
+	u8  res1[2];
+	u8  iprcode;	/* byte 3, same offset as ctrl.iprcode */
+	u32 res2[9];
+} __attribute__ ((packed,aligned(8)));
+
+union iucv_param {
+	struct iucv_cmd_control ctrl;	/* iucv_call_b2f0 reads iprcode here */
+	struct iucv_cmd_dpl dpl;
+	struct iucv_cmd_db db;
+	struct iucv_cmd_purge purge;
+	struct iucv_cmd_set_mask set_mask;
+};
+
+/*
+ * Anchor for per-cpu IUCV command parameter block.
+ */
+static union iucv_param *iucv_param[NR_CPUS];	/* path and message calls */
+static union iucv_param *iucv_param_irq[NR_CPUS]; /* mask, buffer, sever */
+
+/**
+ * iucv_call_b2f0
+ * @command: identifier of IUCV call to CP.
+ * @parm: pointer to a union iucv_param block
+ *
+ * Calls CP to execute IUCV commands.
+ *
+ * Returns iprcode of @parm for condition code 1, else the condition code.
+ */
+static inline int iucv_call_b2f0(int command, union iucv_param *parm)
+{
+	register unsigned long reg0 asm ("0");
+	register unsigned long reg1 asm ("1");
+	int ccode;
+
+	reg0 = command;
+	reg1 = virt_to_phys(parm);
+	asm volatile(
+		"	.long 0xb2f01000\n"
+		"	ipm	%0\n"
+		"	srl	%0,28\n"
+		: "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1)
+		:  "m" (*parm) : "cc");
+	return (ccode == 1) ? parm->ctrl.iprcode : ccode;
+}
+
+/**
+ * iucv_query_maxconn
+ *
+ * Determines the maximum number of connections that may be established
+ * and stores it in iucv_max_pathid.
+ * Returns 0 on success, -ENOMEM if the parameter block cannot be
+ * allocated, or -EPERM if the IUCV query fails.
+ */
+static int iucv_query_maxconn(void)
+{
+	register unsigned long reg0 asm ("0");
+	register unsigned long reg1 asm ("1");
+	void *param;
+	int ccode;
+
+	param = kzalloc(sizeof(union iucv_param), GFP_KERNEL|GFP_DMA);
+	if (!param)
+		return -ENOMEM;
+	reg0 = IUCV_QUERY;
+	reg1 = (unsigned long) param;
+	asm volatile (
+		"	.long	0xb2f01000\n"
+		"	ipm	%0\n"
+		"	srl	%0,28\n"
+		: "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc");
+	if (ccode == 0)
+		iucv_max_pathid = reg1;	/* result comes back in register 1 */
+	kfree(param);
+	return ccode ? -EPERM : 0;
+}
+
+/**
+ * iucv_allow_cpu
+ * @data: unused
+ *
+ * Unmask every iucv interrupt class on the executing cpu.
+ */
+static void iucv_allow_cpu(void *data)
+{
+	const int this_cpu = smp_processor_id();
+	union iucv_param *mask = iucv_param_irq[this_cpu];
+
+	/*
+	 * First the interrupt mask; its ipmask bits are
+	 *	0x80 - nonpriority message pending interrupts
+	 *	0x40 - priority message pending interrupts
+	 *	0x20 - nonpriority message completion interrupts
+	 *	0x10 - priority message completion interrupts
+	 *	0x08 - IUCV control interrupts
+	 * and 0xf8 switches all of them on.
+	 */
+	memset(mask, 0, sizeof(*mask));
+	mask->set_mask.ipmask = 0xf8;
+	iucv_call_b2f0(IUCV_SETMASK, mask);
+
+	/*
+	 * Then the control mask; its ipmask bits are
+	 *	0x80 - pending connections interrupts
+	 *	0x40 - connection complete interrupts
+	 *	0x20 - connection severed interrupts
+	 *	0x10 - connection quiesced interrupts
+	 *	0x08 - connection resumed interrupts
+	 * and again all of them get enabled.
+	 */
+	memset(mask, 0, sizeof(*mask));
+	mask->set_mask.ipmask = 0xf8;
+	iucv_call_b2f0(IUCV_SETCONTROLMASK, mask);
+
+	/* Record that this cpu now takes iucv interrupts. */
+	cpumask_set_cpu(this_cpu, &iucv_irq_cpumask);
+}
+
+/**
+ * iucv_block_cpu
+ * @data: unused
+ *
+ * Mask every iucv interrupt on the executing cpu and drop the cpu
+ * from iucv_irq_cpumask.
+ */
+static void iucv_block_cpu(void *data)
+{
+	const int this_cpu = smp_processor_id();
+	union iucv_param *mask = iucv_param_irq[this_cpu];
+
+	/* An all-zero ipmask switches every iucv interrupt off. */
+	memset(mask, 0, sizeof(*mask));
+	iucv_call_b2f0(IUCV_SETMASK, mask);
+
+	/* This cpu no longer counts as taking iucv interrupts. */
+	cpumask_clear_cpu(this_cpu, &iucv_irq_cpumask);
+}
+
+/**
+ * iucv_block_cpu_almost
+ * @data: unused
+ *
+ * Leave only the connection-severed interrupt enabled on the executing
+ * cpu and drop the cpu from iucv_irq_cpumask.
+ */
+static void iucv_block_cpu_almost(void *data)
+{
+	const int this_cpu = smp_processor_id();
+	union iucv_param *mask = iucv_param_irq[this_cpu];
+
+	/* Interrupt mask: keep the iucv control interrupts (0x08) only. */
+	memset(mask, 0, sizeof(*mask));
+	mask->set_mask.ipmask = 0x08;
+	iucv_call_b2f0(IUCV_SETMASK, mask);
+	/* Control mask: keep the connection-severed interrupt (0x20) only. */
+	memset(mask, 0, sizeof(*mask));
+	mask->set_mask.ipmask = 0x20;
+	iucv_call_b2f0(IUCV_SETCONTROLMASK, mask);
+
+	/* This cpu no longer counts as taking iucv interrupts. */
+	cpumask_clear_cpu(this_cpu, &iucv_irq_cpumask);
+}
+
+/**
+ * iucv_declare_cpu
+ * @data: unused
+ *
+ * Declare an interrupt buffer on this cpu.
+ */
+static void iucv_declare_cpu(void *data)
+{
+	const int this_cpu = smp_processor_id();
+	const char *reason = "Unknown";
+	union iucv_param *req;
+	int rc;
+
+	if (cpumask_test_cpu(this_cpu, &iucv_buffer_cpumask))
+		return;
+
+	/* Pass the address of this cpu's interrupt buffer to CP. */
+	req = iucv_param_irq[this_cpu];
+	memset(req, 0, sizeof(*req));
+	req->db.ipbfadr1 = virt_to_phys(iucv_irq_data[this_cpu]);
+	rc = iucv_call_b2f0(IUCV_DECLARE_BUFFER, req);
+	if (rc) {
+		switch (rc) {
+		case 0x03:
+			reason = "Directory error";
+			break;
+		case 0x0a:
+			reason = "Invalid length";
+			break;
+		case 0x13:
+			reason = "Buffer already exists";
+			break;
+		case 0x3e:
+			reason = "Buffer overlap";
+			break;
+		case 0x5c:
+			reason = "Paging or storage error";
+			break;
+		}
+		pr_warning("Defining an interrupt buffer on CPU %i"
+			   " failed with 0x%02x (%s)\n", this_cpu, rc, reason);
+		return;
+	}
+
+	/* Remember that this cpu owns a declared iucv buffer. */
+	cpumask_set_cpu(this_cpu, &iucv_buffer_cpumask);
+
+	if (iucv_nonsmp_handler != 0 && !cpumask_empty(&iucv_irq_cpumask))
+		/* A non-smp handler exists and another cpu takes the irqs. */
+		iucv_block_cpu(NULL);
+	else
+		/* Otherwise this cpu takes iucv interrupts as well. */
+		iucv_allow_cpu(NULL);
+}
+
+/**
+ * iucv_retrieve_cpu
+ * @data: unused
+ *
+ * Retrieve interrupt buffer on this cpu.
+ */
+static void iucv_retrieve_cpu(void *data)
+{
+	const int this_cpu = smp_processor_id();
+
+	/* Nothing to do for a cpu that never declared a buffer. */
+	if (!cpumask_test_cpu(this_cpu, &iucv_buffer_cpumask))
+		return;
+
+	/* Mask the iucv interrupts first. */
+	iucv_block_cpu(NULL);
+
+	/*
+	 * Then retrieve the buffer, reusing the irq parameter block. */
+	iucv_call_b2f0(IUCV_RETRIEVE_BUFFER, iucv_param_irq[this_cpu]);
+
+	/* This cpu no longer owns a declared iucv buffer. */
+	cpumask_clear_cpu(this_cpu, &iucv_buffer_cpumask);
+}
+
+/**
+ * iucv_setmask_mp
+ *
+ * Allow iucv interrupts on every online cpu that has a declared buffer.
+ */
+static void iucv_setmask_mp(void)
+{
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if (!cpumask_test_cpu(cpu, &iucv_buffer_cpumask) ||
+		    cpumask_test_cpu(cpu, &iucv_irq_cpumask))
+			continue;
+		smp_call_function_single(cpu, iucv_allow_cpu, NULL, 1);
+	}
+	put_online_cpus();
+}
+
+/**
+ * iucv_setmask_up
+ *
+ * Allow iucv interrupts on a single cpu.
+ */
+static void iucv_setmask_up(void)
+{
+	int cpu, keep = cpumask_first(&iucv_irq_cpumask);
+	cpumask_t others;
+
+	/* Block every cpu in iucv_irq_cpumask except the first one. */
+	cpumask_copy(&others, &iucv_irq_cpumask);
+	cpumask_clear_cpu(keep, &others);
+	for_each_cpu(cpu, &others)
+		smp_call_function_single(cpu, iucv_block_cpu, NULL, 1);
+}
+
+/**
+ * iucv_enable
+ *
+ * Makes iucv ready for use: allocates the pathid table and declares an
+ * iucv interrupt buffer on every online cpu. Called by iucv_register()
+ * for the first handler. Returns 0, -ENOMEM if the table allocation
+ * fails or -EIO if no cpu could declare a buffer.
+ */
+static int iucv_enable(void)
+{
+	int cpu, rc;
+
+	get_online_cpus();
+	rc = -ENOMEM;
+	/* The table holds one struct iucv_path pointer per pathid. */
+	iucv_path_table = kcalloc(iucv_max_pathid, sizeof(*iucv_path_table),
+				  GFP_KERNEL);
+	if (!iucv_path_table)
+		goto out;
+	/* Declare per cpu buffers. */
+	rc = -EIO;
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		/* No cpu could declare an iucv buffer. */
+		goto out;
+	put_online_cpus();
+	return 0;
+out:
+	kfree(iucv_path_table);
+	iucv_path_table = NULL;
+	put_online_cpus();
+	return rc;
+}
+
+/**
+ * iucv_disable
+ *
+ * This function shuts down iucv. It disables iucv interrupts, retrieves
+ * the iucv interrupt buffer and frees the pathid table. Called after the
+ * last user unregisters its iucv handler.
+ */
+static void iucv_disable(void)
+{
+	get_online_cpus();
+	on_each_cpu(iucv_retrieve_cpu, NULL, 1);
+	kfree(iucv_path_table);
+	iucv_path_table = NULL;	/* iucv_cpu_notify tests for a NULL table */
+	put_online_cpus();
+}
+
+static int __cpuinit iucv_cpu_notify(struct notifier_block *self,
+				     unsigned long action, void *hcpu)
+{
+	cpumask_t cpumask;
+	long cpu = (long) hcpu;	/* cpu number arrives in the pointer value */
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:	/* allocate the three per-cpu blocks */
+		iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
+					GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+		if (!iucv_irq_data[cpu])
+			return notifier_from_errno(-ENOMEM);
+
+		iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
+				     GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+		if (!iucv_param[cpu]) {
+			kfree(iucv_irq_data[cpu]);
+			iucv_irq_data[cpu] = NULL;
+			return notifier_from_errno(-ENOMEM);
+		}
+		iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
+					GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+		if (!iucv_param_irq[cpu]) {
+			kfree(iucv_param[cpu]);
+			iucv_param[cpu] = NULL;
+			kfree(iucv_irq_data[cpu]);
+			iucv_irq_data[cpu] = NULL;
+			return notifier_from_errno(-ENOMEM);
+		}
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:	/* free the three per-cpu blocks again */
+		kfree(iucv_param_irq[cpu]);
+		iucv_param_irq[cpu] = NULL;
+		kfree(iucv_param[cpu]);
+		iucv_param[cpu] = NULL;
+		kfree(iucv_irq_data[cpu]);
+		iucv_irq_data[cpu] = NULL;
+		break;
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:	/* declare a buffer if iucv is enabled */
+		if (!iucv_path_table)
+			break;
+		smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:	/* take the buffer back from this cpu */
+		if (!iucv_path_table)
+			break;
+		cpumask_copy(&cpumask, &iucv_buffer_cpumask);
+		cpumask_clear_cpu(cpu, &cpumask);
+		if (cpumask_empty(&cpumask))
+			/* Can't offline last IUCV enabled cpu. */
+			return notifier_from_errno(-EINVAL);
+		smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1);
+		if (cpumask_empty(&iucv_irq_cpumask))	/* no irq cpu is left */
+			smp_call_function_single(
+				cpumask_first(&iucv_buffer_cpumask),
+				iucv_allow_cpu, NULL, 1);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata iucv_cpu_notifier = {
+	.notifier_call = iucv_cpu_notify,	/* cpu hotplug callback */
+};
+
+/**
+ * iucv_sever_pathid
+ * @pathid: path identification number.
+ * @userdata: 16-bytes of user data, may be NULL.
+ *
+ * Severs an iucv path so that its pathid becomes free. Used internally.
+ */
+static int iucv_sever_pathid(u16 pathid, u8 userdata[16])
+{
+	union iucv_param *req = iucv_param_irq[smp_processor_id()];
+	struct iucv_cmd_control *ctrl = &req->ctrl;
+
+	memset(req, 0, sizeof(*req));
+	ctrl->ippathid = pathid;
+	if (userdata)
+		memcpy(ctrl->ipuser, userdata, sizeof(ctrl->ipuser));
+	return iucv_call_b2f0(IUCV_SEVER, req);
+}
+
+/**
+ * __iucv_cleanup_queue
+ * @dummy: unused dummy argument
+ *
+ * Nop function called via smp_call_function to force work items from
+ * pending external iucv interrupts to the work queue.
+ */
+static void __iucv_cleanup_queue(void *dummy)
+{	/* intentionally empty, see iucv_cleanup_queue() */
+}
+
+/**
+ * iucv_cleanup_queue
+ *
+ * Function called after a path has been severed to find all remaining
+ * work items for the now stale pathid. The caller needs to hold the
+ * iucv_table_lock.
+ */
+static void iucv_cleanup_queue(void)
+{
+	struct iucv_irq_list *p, *n;
+
+	/*
+	 * When a path is severed, the pathid can be reused immediately
+	 * on an iucv connect or a connection pending interrupt. Remove
+	 * all entries from the task queue that refer to a stale pathid
+	 * (iucv_path_table[ix] == NULL). Only then do the iucv connect
+	 * or deliver the connection pending interrupt. To get all the
+	 * pending interrupts force them to the work queue by calling
+	 * an empty function on all cpus.
+	 */
+	smp_call_function(__iucv_cleanup_queue, NULL, 1);
+	spin_lock_irq(&iucv_queue_lock);
+	list_for_each_entry_safe(p, n, &iucv_task_queue, list) {
+		/* Remove stale work items from the task queue. */
+		if (iucv_path_table[p->data.ippathid] == NULL) {
+			list_del(&p->list);
+			kfree(p);	/* the entry is dropped undelivered */
+		}
+	}
+	spin_unlock_irq(&iucv_queue_lock);
+}
+
+/**
+ * iucv_register:
+ * @handler: address of iucv handler structure
+ * @smp: != 0 indicates that the handler can deal with out of order messages
+ *
+ * Registers a driver with IUCV. Returns 0 on success, -ENOSYS if IUCV is
+ * not available, -ENOMEM if the memory allocation for the pathid table
+ * failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus.
+ */
+int iucv_register(struct iucv_handler *handler, int smp)
+{
+	int rc = 0;
+
+	if (!iucv_available)
+		return -ENOSYS;
+	mutex_lock(&iucv_register_mutex);
+	if (!smp)
+		iucv_nonsmp_handler++;
+	if (list_empty(&iucv_handler_list)) {
+		rc = iucv_enable();
+		if (rc) {
+			if (!smp)
+				iucv_nonsmp_handler--;	/* never registered */
+			goto out_mutex;
+		}
+	} else if (!smp && iucv_nonsmp_handler == 1)
+		iucv_setmask_up();
+	INIT_LIST_HEAD(&handler->paths);
+	spin_lock_bh(&iucv_table_lock);
+	list_add_tail(&handler->list, &iucv_handler_list);
+	spin_unlock_bh(&iucv_table_lock);
+out_mutex:
+	mutex_unlock(&iucv_register_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(iucv_register);
+
+/**
+ * iucv_unregister
+ * @handler:  address of iucv handler structure
+ * @smp: != 0 indicates that the handler can deal with out of order messages
+ *
+ * Unregister driver from IUCV and sever all paths it still owns.
+ */
+void iucv_unregister(struct iucv_handler *handler, int smp)
+{
+	struct iucv_path *p, *n;
+
+	mutex_lock(&iucv_register_mutex);
+	spin_lock_bh(&iucv_table_lock);
+	/* Remove handler from the iucv_handler_list. */
+	list_del_init(&handler->list);
+	/* Sever all pathids still referring to the handler. */
+	list_for_each_entry_safe(p, n, &handler->paths, list) {
+		iucv_sever_pathid(p->pathid, NULL);	/* result is ignored */
+		iucv_path_table[p->pathid] = NULL;
+		list_del(&p->list);
+		iucv_path_free(p);
+	}
+	spin_unlock_bh(&iucv_table_lock);
+	if (!smp)
+		iucv_nonsmp_handler--;	/* mirrors iucv_register() */
+	if (list_empty(&iucv_handler_list))
+		iucv_disable();	/* that was the last handler */
+	else if (!smp && iucv_nonsmp_handler == 0)
+		iucv_setmask_mp();	/* only smp capable handlers remain */
+	mutex_unlock(&iucv_register_mutex);
+}
+EXPORT_SYMBOL(iucv_unregister);
+
+/* Reboot notifier: quiesce IUCV; the path table is NULL if IUCV is unused. */
+static int iucv_reboot_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	int i;
+
+	get_online_cpus();
+	on_each_cpu(iucv_block_cpu, NULL, 1);
+	preempt_disable();
+	for (i = 0; i < iucv_max_pathid; i++)
+		if (iucv_path_table && iucv_path_table[i])
+			iucv_sever_pathid(i, NULL);
+	preempt_enable();
+	put_online_cpus();
+	iucv_disable();
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block iucv_reboot_notifier = {
+	.notifier_call = iucv_reboot_event,	/* blocks irqs, severs paths */
+};
+
+/**
+ * iucv_path_accept
+ * @path: address of iucv path structure
+ * @handler: address of iucv handler structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ * @private: private data passed to interrupt handlers for this path
+ *
+ * Issued after a connection pending external interrupt was received, in
+ * order to complete the IUCV communication path.
+ *
+ * Returns the result of the CP IUCV call, or -EIO if no cpu has a buffer.
+ */
+int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler,
+		     u8 userdata[16], void *private)
+{
+	struct iucv_cmd_control *ctrl;
+	union iucv_param *req;
+	int rc = -EIO;
+
+	local_bh_disable();
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		goto out;
+	/* Build the ACCEPT request in this cpu's parameter block. */
+	req = iucv_param[smp_processor_id()];
+	ctrl = &req->ctrl;
+	memset(req, 0, sizeof(*req));
+	ctrl->ippathid = path->pathid;
+	ctrl->ipmsglim = path->msglim;
+	ctrl->ipflags1 = path->flags;
+	if (userdata)
+		memcpy(ctrl->ipuser, userdata, sizeof(ctrl->ipuser));
+
+	rc = iucv_call_b2f0(IUCV_ACCEPT, req);
+	if (rc == 0) {
+		path->private = private;
+		path->msglim = ctrl->ipmsglim;
+		path->flags = ctrl->ipflags1;
+	}
+out:
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_path_accept);
+
+/**
+ * iucv_path_connect
+ * @path: address of iucv path structure
+ * @handler: address of iucv handler structure
+ * @userid: 8-byte user identification
+ * @system: 8-byte target system identification
+ * @userdata: 16 bytes of data reflected to the communication partner
+ * @private: private data passed to interrupt handlers for this path
+ *
+ * This function establishes an IUCV path. Although the connect may complete
+ * successfully, you are not able to use the path until you receive an IUCV
+ * Connection Complete external interrupt.
+ *
+ * Returns the result of the CP IUCV call, or -EIO (no buffer or bad pathid).
+ */
+int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
+		      u8 userid[8], u8 system[8], u8 userdata[16],
+		      void *private)
+{
+	union iucv_param *parm;
+	int rc;
+
+	spin_lock_bh(&iucv_table_lock);
+	iucv_cleanup_queue();	/* requires iucv_table_lock to be held */
+	if (cpumask_empty(&iucv_buffer_cpumask)) {
+		rc = -EIO;
+		goto out;
+	}
+	parm = iucv_param[smp_processor_id()];
+	memset(parm, 0, sizeof(union iucv_param));
+	parm->ctrl.ipmsglim = path->msglim;
+	parm->ctrl.ipflags1 = path->flags;
+	if (userid) {	/* copied, then converted to upper-case EBCDIC */
+		memcpy(parm->ctrl.ipvmid, userid, sizeof(parm->ctrl.ipvmid));
+		ASCEBC(parm->ctrl.ipvmid, sizeof(parm->ctrl.ipvmid));
+		EBC_TOUPPER(parm->ctrl.ipvmid, sizeof(parm->ctrl.ipvmid));
+	}
+	if (system) {	/* same conversion as for userid */
+		memcpy(parm->ctrl.iptarget, system,
+		       sizeof(parm->ctrl.iptarget));
+		ASCEBC(parm->ctrl.iptarget, sizeof(parm->ctrl.iptarget));
+		EBC_TOUPPER(parm->ctrl.iptarget, sizeof(parm->ctrl.iptarget));
+	}
+	if (userdata)	/* copied as is, no conversion */
+		memcpy(parm->ctrl.ipuser, userdata, sizeof(parm->ctrl.ipuser));
+
+	rc = iucv_call_b2f0(IUCV_CONNECT, parm);
+	if (!rc) {
+		if (parm->ctrl.ippathid < iucv_max_pathid) {
+			path->pathid = parm->ctrl.ippathid;
+			path->msglim = parm->ctrl.ipmsglim;
+			path->flags = parm->ctrl.ipflags1;
+			path->handler = handler;
+			path->private = private;
+			list_add_tail(&path->list, &handler->paths);
+			iucv_path_table[path->pathid] = path;
+		} else {	/* pathid does not fit the table: give it back */
+			iucv_sever_pathid(parm->ctrl.ippathid,
+					  iucv_error_pathid);
+			rc = -EIO;
+		}
+	}
+out:
+	spin_unlock_bh(&iucv_table_lock);
+	return rc;
+}
+EXPORT_SYMBOL(iucv_path_connect);
+
+/**
+ * iucv_path_quiesce:
+ * @path: address of iucv path structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ *
+ * Temporarily suspends incoming messages on an IUCV path; the path can
+ * be reactivated later with iucv_path_resume.
+ *
+ * Returns the result from the CP IUCV call, or -EIO if no cpu has a buffer.
+ */
+int iucv_path_quiesce(struct iucv_path *path, u8 userdata[16])
+{
+	union iucv_param *req;
+	int rc = -EIO;
+
+	local_bh_disable();
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		goto out;
+
+	/* Fill this cpu's parameter block with the QUIESCE request. */
+	req = iucv_param[smp_processor_id()];
+	memset(req, 0, sizeof(*req));
+	req->ctrl.ippathid = path->pathid;
+	if (userdata)
+		memcpy(req->ctrl.ipuser, userdata, sizeof(req->ctrl.ipuser));
+	rc = iucv_call_b2f0(IUCV_QUIESCE, req);
+out:
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_path_quiesce);
+
+/**
+ * iucv_path_resume:
+ * @path: address of iucv path structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ *
+ * Resumes incoming messages on an IUCV path that was stopped with
+ * iucv_path_quiesce.
+ *
+ * Returns the result from the CP IUCV call, or -EIO if no cpu has a buffer.
+ */
+int iucv_path_resume(struct iucv_path *path, u8 userdata[16])
+{
+	union iucv_param *req;
+	int rc = -EIO;
+
+	local_bh_disable();
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		goto out;
+
+	/* Fill this cpu's parameter block with the RESUME request. */
+	req = iucv_param[smp_processor_id()];
+	memset(req, 0, sizeof(*req));
+	req->ctrl.ippathid = path->pathid;
+	if (userdata)
+		memcpy(req->ctrl.ipuser, userdata, sizeof(req->ctrl.ipuser));
+	rc = iucv_call_b2f0(IUCV_RESUME, req);
+out:
+	local_bh_enable();
+	return rc;
+}
+
+/**
+ * iucv_path_sever
+ * @path: address of iucv path structure
+ * @userdata: 16 bytes of data reflected to the communication partner
+ *
+ * This function terminates an IUCV path. iucv_table_lock is only taken
+ * when not running on iucv_active_cpu (the tasklet/work handler cpu).
+ * Returns the CP IUCV call result, or -EIO if no cpu has a buffer.
+ */
+int iucv_path_sever(struct iucv_path *path, u8 userdata[16])
+{
+	int rc;
+
+	preempt_disable();	/* keeps smp_processor_id() stable below */
+	if (cpumask_empty(&iucv_buffer_cpumask)) {
+		rc = -EIO;
+		goto out;
+	}
+	if (iucv_active_cpu != smp_processor_id())
+		spin_lock_bh(&iucv_table_lock);
+	rc = iucv_sever_pathid(path->pathid, userdata);
+	iucv_path_table[path->pathid] = NULL;	/* done even if rc != 0 */
+	list_del_init(&path->list);
+	if (iucv_active_cpu != smp_processor_id())
+		spin_unlock_bh(&iucv_table_lock);
+out:
+	preempt_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_path_sever);
+
+/**
+ * iucv_message_purge
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @srccls: source class of message
+ *
+ * Cancels a message you have sent. On success msg->audit and msg->tag
+ * are filled in from the parameter block.
+ * Returns the result from the CP IUCV call, or -EIO if no cpu has a buffer.
+ */
+int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg,
+		       u32 srccls)
+{
+	union iucv_param *parm;
+	int rc;
+
+	local_bh_disable();
+	if (cpumask_empty(&iucv_buffer_cpumask)) {
+		rc = -EIO;
+		goto out;
+	}
+	parm = iucv_param[smp_processor_id()];
+	memset(parm, 0, sizeof(union iucv_param));
+	parm->purge.ippathid = path->pathid;
+	parm->purge.ipmsgid = msg->id;
+	parm->purge.ipsrccls = srccls;
+	parm->purge.ipflags1 = IUCV_IPSRCCLS | IUCV_IPFGMID | IUCV_IPFGPID;
+	rc = iucv_call_b2f0(IUCV_PURGE, parm);
+	if (!rc) {	/* audit: u32 read at ipaudit, low byte shifted out */
+		msg->audit = (*(u32 *) &parm->purge.ipaudit) >> 8;
+		msg->tag = parm->purge.ipmsgtag;
+	}
+out:
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_message_purge);
+
+/**
+ * iucv_message_receive_iprmdata
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is received (IUCV_IPBUFLST)
+ * @buffer: address of data buffer or address of struct iucv_array
+ * @size: length of data buffer
+ * @residual: if not NULL, receives abs(size - 8)
+ *
+ * Internal function used by iucv_message_receive and __iucv_message_receive
+ * to receive RMDATA data stored in struct iucv_message. Always returns 0.
+ */
+static int iucv_message_receive_iprmdata(struct iucv_path *path,
+					 struct iucv_message *msg,
+					 u8 flags, void *buffer,
+					 size_t size, size_t *residual)
+{
+	struct iucv_array *array;
+	u8 *rmmsg;
+	size_t copy;
+
+	/*
+	 * Message is 8 bytes long and has been stored to the
+	 * message descriptor itself.
+	 */
+	if (residual)
+		*residual = abs(size - 8);
+	rmmsg = msg->rmmsg;
+	if (flags & IUCV_IPBUFLST) {
+		/* Copy to struct iucv_array. */
+		size = (size < 8) ? size : 8;	/* at most the 8 RMDATA bytes */
+		for (array = buffer; size > 0; array++) {
+			copy = min_t(size_t, size, array->length);
+			memcpy((u8 *)(addr_t) array->address,
+				rmmsg, copy);
+			rmmsg += copy;
+			size -= copy;
+		}
+	} else {
+		/* Copy to direct buffer. */
+		memcpy(buffer, rmmsg, min_t(size_t, size, 8));
+	}
+	return 0;
+}
+
+/**
+ * __iucv_message_receive
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is received (IUCV_IPBUFLST)
+ * @buffer: address of data buffer or address of struct iucv_array
+ * @size: length of data buffer
+ * @residual: if not NULL, receives the ipbfln1f value returned by CP
+ *
+ * Receives a message sent to you over an established path. RMDATA
+ * messages embedded in struct iucv_message are handled as well.
+ *
+ * Locking:	no locking
+ *
+ * Returns the result from the CP IUCV call, or -EIO if no cpu has an
+ * interrupt buffer declared.
+ */
+int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
+			   u8 flags, void *buffer, size_t size, size_t *residual)
+{
+	union iucv_param *req;
+	int rc;
+
+	if (msg->flags & IUCV_IPRMDATA)
+		return iucv_message_receive_iprmdata(path, msg, flags,
+						     buffer, size, residual);
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		return -EIO;
+
+	/* Describe the message and the target buffer to CP. */
+	req = iucv_param[smp_processor_id()];
+	memset(req, 0, sizeof(*req));
+	req->db.ippathid = path->pathid;
+	req->db.ipmsgid = msg->id;
+	req->db.iptrgcls = msg->class;
+	req->db.ipbfadr1 = (u32)(addr_t) buffer;
+	req->db.ipbfln1f = (u32) size;
+	req->db.ipflags1 = (flags | IUCV_IPFGPID |
+			    IUCV_IPFGMID | IUCV_IPTRGCLS);
+	rc = iucv_call_b2f0(IUCV_RECEIVE, req);
+	/* Return codes 0 and 5 both deliver the flags and residual count. */
+	if (rc == 0 || rc == 5) {
+		msg->flags = req->db.ipflags1;
+		if (residual)
+			*residual = req->db.ipbfln1f;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(__iucv_message_receive);
+
+/**
+ * iucv_message_receive
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is received (IUCV_IPBUFLST)
+ * @buffer: address of data buffer or address of struct iucv_array
+ * @size: length of data buffer
+ * @residual: optional result pointer, passed through to the receive helpers
+ *
+ * This function receives messages that are being sent to you over
+ * established paths. This function will deal with RMDATA messages
+ * embedded in struct iucv_message as well.
+ *
+ * Locking:	local_bh_enable/local_bh_disable
+ *
+ * Returns the result from the CP IUCV call.
+ */
+int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
+			 u8 flags, void *buffer, size_t size, size_t *residual)
+{
+	int rc;
+
+	if (msg->flags & IUCV_IPRMDATA)	/* handled without disabling bh */
+		return iucv_message_receive_iprmdata(path, msg, flags,
+						     buffer, size, residual);
+	local_bh_disable();
+	rc = __iucv_message_receive(path, msg, flags, buffer, size, residual);
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_message_receive);
+
+/**
+ * iucv_message_reject
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ *
+ * Refuse the message identified by @path->pathid, @msg->id and
+ * @msg->class. A message may be rejected between the time you are
+ * notified of it and the time you complete it.
+ *
+ * Locking:	local_bh_enable/local_bh_disable
+ *
+ * Returns -EIO if iucv_buffer_cpumask is empty, otherwise the result
+ * from the CP IUCV call.
+ */
+int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg)
+{
+	union iucv_param *parm;
+	int rc = -EIO;
+
+	local_bh_disable();
+	if (!cpumask_empty(&iucv_buffer_cpumask)) {
+		parm = iucv_param[smp_processor_id()];
+		memset(parm, 0, sizeof(union iucv_param));
+		parm->db.ipflags1 = (IUCV_IPTRGCLS | IUCV_IPFGMID |
+				     IUCV_IPFGPID);
+		parm->db.iptrgcls = msg->class;
+		parm->db.ipmsgid = msg->id;
+		parm->db.ippathid = path->pathid;
+		rc = iucv_call_b2f0(IUCV_REJECT, parm);
+	}
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_message_reject);
+
+/**
+ * iucv_message_reply
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
+ * @reply: address of reply data buffer or address of struct iucv_array
+ * @size: length of reply data buffer
+ *
+ * Answer a two-way message. The message replied to is identified by
+ * pathid, msgid and trgcls. With IUCV_IPRMDATA at most 8 bytes of
+ * @reply are copied into the parameter list itself; otherwise the
+ * address and length of @reply are passed to CP.
+ *
+ * Locking:	local_bh_enable/local_bh_disable
+ *
+ * Returns -EIO if iucv_buffer_cpumask is empty, otherwise the result
+ * from the CP IUCV call.
+ */
+int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg,
+		       u8 flags, void *reply, size_t size)
+{
+	union iucv_param *parm;
+	int rc = -EIO;
+
+	local_bh_disable();
+	if (!cpumask_empty(&iucv_buffer_cpumask)) {
+		parm = iucv_param[smp_processor_id()];
+		memset(parm, 0, sizeof(union iucv_param));
+		if (!(flags & IUCV_IPRMDATA)) {
+			/* Reply data stays in the caller's buffer. */
+			parm->db.ippathid = path->pathid;
+			parm->db.ipmsgid = msg->id;
+			parm->db.iptrgcls = msg->class;
+			parm->db.ipflags1 = flags;
+			parm->db.ipbfadr1 = (u32)(addr_t) reply;
+			parm->db.ipbfln1f = (u32) size;
+		} else {
+			/* Reply data travels inside the parameter list. */
+			parm->dpl.ippathid = path->pathid;
+			parm->dpl.ipmsgid = msg->id;
+			parm->dpl.iptrgcls = msg->class;
+			parm->dpl.ipflags1 = flags;
+			memcpy(parm->dpl.iprmmsg, reply,
+			       min_t(size_t, size, 8));
+		}
+		rc = iucv_call_b2f0(IUCV_REPLY, parm);
+	}
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_message_reply);
+
+/**
+ * __iucv_message_send
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
+ * @srccls: source class of message
+ * @buffer: address of send buffer or address of struct iucv_array
+ * @size: length of send buffer
+ *
+ * Send a one-way message: IUCV_IPNORPY is always added to the flags, so
+ * the receiver will not reply. With IUCV_IPRMDATA exactly 8 bytes are
+ * copied from @buffer into the parameter list and @size is not used.
+ * On success @msg->id is set from the parameter block.
+ *
+ * Locking:	no locking
+ *
+ * Returns -EIO if iucv_buffer_cpumask is empty, otherwise the result
+ * from the CP IUCV call.
+ */
+int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
+		      u8 flags, u32 srccls, void *buffer, size_t size)
+{
+	union iucv_param *parm;
+	int rc;
+
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		return -EIO;
+
+	parm = iucv_param[smp_processor_id()];
+	memset(parm, 0, sizeof(union iucv_param));
+	if (!(flags & IUCV_IPRMDATA)) {
+		parm->db.ippathid = path->pathid;
+		parm->db.iptrgcls = msg->class;
+		parm->db.ipsrccls = srccls;
+		parm->db.ipmsgtag = msg->tag;
+		parm->db.ipflags1 = flags | IUCV_IPNORPY;
+		parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+		parm->db.ipbfln1f = (u32) size;
+	} else {
+		/* Message of 8 bytes can be placed into the parameter list. */
+		parm->dpl.ippathid = path->pathid;
+		parm->dpl.iptrgcls = msg->class;
+		parm->dpl.ipsrccls = srccls;
+		parm->dpl.ipmsgtag = msg->tag;
+		parm->dpl.ipflags1 = flags | IUCV_IPNORPY;
+		memcpy(parm->dpl.iprmmsg, buffer, 8);
+	}
+
+	rc = iucv_call_b2f0(IUCV_SEND, parm);
+	if (rc == 0)
+		msg->id = parm->db.ipmsgid;
+	return rc;
+}
+EXPORT_SYMBOL(__iucv_message_send);
+
+/**
+ * iucv_message_send
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
+ * @srccls: source class of message
+ * @buffer: address of send buffer or address of struct iucv_array
+ * @size: length of send buffer
+ *
+ * Calls __iucv_message_send() with bottom halves disabled. The message
+ * is one-way; the receiver will not reply to it.
+ *
+ * Locking:	local_bh_enable/local_bh_disable
+ *
+ * Returns the result of __iucv_message_send().
+ */
+int iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
+		      u8 flags, u32 srccls, void *buffer, size_t size)
+{
+	int result;
+
+	local_bh_disable();
+	result = __iucv_message_send(path, msg, flags, srccls, buffer, size);
+	local_bh_enable();
+
+	return result;
+}
+EXPORT_SYMBOL(iucv_message_send);
+
+/**
+ * iucv_message_send2way
+ * @path: address of iucv path structure
+ * @msg: address of iucv msg structure
+ * @flags: how the message is sent and the reply is received
+ *	   (IUCV_IPRMDATA, IUCV_IPBUFLST, IUCV_IPPRTY, IUCV_ANSLST)
+ * @srccls: source class of message
+ * @buffer: address of send buffer or address of struct iucv_array
+ * @size: length of send buffer
+ * @answer: address of answer buffer or address of struct iucv_array
+ * @asize: size of reply buffer
+ * @residual: not accessed by this function
+ *
+ * Send a message for which the receiver is expected to reply; @answer
+ * is the buffer into which IUCV moves that reply. Only the
+ * IUCV_IPRMDATA bit of @flags is tested here; the ipflags1 field of the
+ * parameter block is filled from @path->flags. With IUCV_IPRMDATA
+ * exactly 8 bytes are copied from @buffer into the parameter list. On
+ * success @msg->id is set from the parameter block.
+ *
+ * Locking:	local_bh_enable/local_bh_disable
+ *
+ * Returns -EIO if iucv_buffer_cpumask is empty, otherwise the result
+ * from the CP IUCV call.
+ */
+int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg,
+			  u8 flags, u32 srccls, void *buffer, size_t size,
+			  void *answer, size_t asize, size_t *residual)
+{
+	union iucv_param *parm;
+	int rc = -EIO;
+
+	local_bh_disable();
+	if (cpumask_empty(&iucv_buffer_cpumask))
+		goto unlock;
+
+	parm = iucv_param[smp_processor_id()];
+	memset(parm, 0, sizeof(union iucv_param));
+	if (!(flags & IUCV_IPRMDATA)) {
+		parm->db.ippathid = path->pathid;
+		parm->db.ipflags1 = path->flags;	/* priority message */
+		parm->db.iptrgcls = msg->class;
+		parm->db.ipsrccls = srccls;
+		parm->db.ipmsgtag = msg->tag;
+		parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+		parm->db.ipbfln1f = (u32) size;
+		parm->db.ipbfadr2 = (u32)(addr_t) answer;
+		parm->db.ipbfln2f = (u32) asize;
+	} else {
+		parm->dpl.ippathid = path->pathid;
+		parm->dpl.ipflags1 = path->flags;	/* priority message */
+		parm->dpl.iptrgcls = msg->class;
+		parm->dpl.ipsrccls = srccls;
+		parm->dpl.ipmsgtag = msg->tag;
+		parm->dpl.ipbfadr2 = (u32)(addr_t) answer;
+		parm->dpl.ipbfln2f = (u32) asize;
+		memcpy(parm->dpl.iprmmsg, buffer, 8);
+	}
+	rc = iucv_call_b2f0(IUCV_SEND, parm);
+	if (rc == 0)
+		msg->id = parm->db.ipmsgid;
+unlock:
+	local_bh_enable();
+	return rc;
+}
+EXPORT_SYMBOL(iucv_message_send2way);
+
+/**
+ * iucv_path_pending
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection pending work item. Called from iucv_work_fn() while
+ * holding iucv_table_lock.
+ *
+ * A new path is allocated and offered to every registered handler that
+ * has a path_pending callback, until one returns 0. If the allocation
+ * fails or no handler accepts the path, the pathid is severed.
+ */
+struct iucv_path_pending {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iptype;
+	u16 ipmsglim;
+	u16 res1;
+	u8  ipvmid[8];
+	u8  ipuser[16];
+	u32 res3;
+	u8  ippollfg;
+	u8  res4[3];
+} __packed;
+
+static void iucv_path_pending(struct iucv_irq_data *data)
+{
+	struct iucv_path_pending *ipp = (void *) data;
+	struct iucv_handler *handler;
+	struct iucv_path *path;
+
+	BUG_ON(iucv_path_table[ipp->ippathid]);
+
+	/* New pathid, handler found. Create a new path struct. */
+	path = iucv_path_alloc(ipp->ipmsglim, ipp->ipflags1, GFP_ATOMIC);
+	if (!path) {
+		iucv_sever_pathid(ipp->ippathid, iucv_error_no_memory);
+		return;
+	}
+	path->pathid = ipp->ippathid;
+	iucv_path_table[path->pathid] = path;
+	EBCASC(ipp->ipvmid, 8);
+
+	/* Call registered handler until one is found that wants the path. */
+	list_for_each_entry(handler, &iucv_handler_list, list) {
+		if (handler->path_pending == NULL)
+			continue;
+		/*
+		 * The path is linked to the handler before the callback
+		 * runs so that path_pending may call iucv_path_sever.
+		 * A non-zero return undoes the link again.
+		 */
+		list_add(&path->list, &handler->paths);
+		path->handler = handler;
+		if (handler->path_pending(path, ipp->ipvmid, ipp->ipuser) == 0)
+			return;
+		list_del(&path->list);
+		path->handler = NULL;
+	}
+
+	/* No handler wanted the path. */
+	iucv_path_table[path->pathid] = NULL;
+	iucv_path_free(path);
+	iucv_sever_pathid(ipp->ippathid, iucv_error_no_listener);
+}
+
+/**
+ * iucv_path_complete
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection complete work item. Called from tasklet while holding
+ * iucv_table_lock.
+ *
+ * Stores the reported flags in the path and, if the owning handler has a
+ * path_complete callback, invokes it. Nothing is done when no path is
+ * registered for the pathid.
+ */
+struct iucv_path_complete {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iptype;
+	u16 ipmsglim;
+	u16 res1;
+	u8  res2[8];
+	u8  ipuser[16];
+	u32 res3;
+	u8  ippollfg;
+	u8  res4[3];
+} __packed;
+
+static void iucv_path_complete(struct iucv_irq_data *data)
+{
+	struct iucv_path_complete *ipc = (void *) data;
+	struct iucv_path *path = iucv_path_table[ipc->ippathid];
+
+	if (!path)
+		return;
+
+	path->flags = ipc->ipflags1;
+	if (!path->handler || !path->handler->path_complete)
+		return;
+	path->handler->path_complete(path, ipc->ipuser);
+}
+
+/**
+ * iucv_path_severed
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection severed work item. Called from tasklet while holding
+ * iucv_table_lock.
+ *
+ * If the handler has a path_severed callback it is invoked and nothing
+ * else is done here; otherwise the path is severed, removed from the
+ * path table and the handler's list, and freed.
+ */
+struct iucv_path_severed {
+	u16 ippathid;
+	u8  res1;
+	u8  iptype;
+	u32 res2;
+	u8  res3[8];
+	u8  ipuser[16];
+	u32 res4;
+	u8  ippollfg;
+	u8  res5[3];
+} __packed;
+
+static void iucv_path_severed(struct iucv_irq_data *data)
+{
+	struct iucv_path_severed *ips = (void *) data;
+	struct iucv_path *path = iucv_path_table[ips->ippathid];
+
+	/* Already severed */
+	if (path == NULL || path->handler == NULL)
+		return;
+
+	if (path->handler->path_severed) {
+		path->handler->path_severed(path, ips->ipuser);
+		return;
+	}
+
+	/* No callback: tear the path down here. */
+	iucv_sever_pathid(path->pathid, NULL);
+	iucv_path_table[path->pathid] = NULL;
+	list_del(&path->list);
+	iucv_path_free(path);
+}
+
+/**
+ * iucv_path_quiesced
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection quiesced work item. Called from tasklet while holding
+ * iucv_table_lock.
+ *
+ * Forwards the event to the handler's path_quiesced callback, if the
+ * path, its handler and the callback all exist.
+ */
+struct iucv_path_quiesced {
+	u16 ippathid;
+	u8  res1;
+	u8  iptype;
+	u32 res2;
+	u8  res3[8];
+	u8  ipuser[16];
+	u32 res4;
+	u8  ippollfg;
+	u8  res5[3];
+} __packed;
+
+static void iucv_path_quiesced(struct iucv_irq_data *data)
+{
+	struct iucv_path_quiesced *ipq = (void *) data;
+	struct iucv_path *path = iucv_path_table[ipq->ippathid];
+
+	if (!path || !path->handler)
+		return;
+	if (!path->handler->path_quiesced)
+		return;
+	path->handler->path_quiesced(path, ipq->ipuser);
+}
+
+/**
+ * iucv_path_resumed
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection resumed work item. Called from tasklet while holding
+ * iucv_table_lock.
+ *
+ * Forwards the event to the handler's path_resumed callback, if the
+ * path, its handler and the callback all exist.
+ */
+struct iucv_path_resumed {
+	u16 ippathid;
+	u8  res1;
+	u8  iptype;
+	u32 res2;
+	u8  res3[8];
+	u8  ipuser[16];
+	u32 res4;
+	u8  ippollfg;
+	u8  res5[3];
+} __packed;
+
+static void iucv_path_resumed(struct iucv_irq_data *data)
+{
+	struct iucv_path_resumed *ipr = (void *) data;
+	struct iucv_path *path = iucv_path_table[ipr->ippathid];
+
+	if (!path || !path->handler)
+		return;
+	if (!path->handler->path_resumed)
+		return;
+	path->handler->path_resumed(path, ipr->ipuser);
+}
+
+/**
+ * iucv_message_complete
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process message complete work item. Called from tasklet while holding
+ * iucv_table_lock.
+ *
+ * Builds a struct iucv_message on the stack from the interrupt buffer
+ * and passes it to the handler's message_complete callback, if the
+ * path, its handler and the callback all exist.
+ */
+struct iucv_message_complete {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iptype;
+	u32 ipmsgid;
+	u32 ipaudit;
+	u8  iprmmsg[8];
+	u32 ipsrccls;
+	u32 ipmsgtag;
+	u32 res;
+	u32 ipbfln2f;
+	u8  ippollfg;
+	u8  res2[3];
+} __packed;
+
+static void iucv_message_complete(struct iucv_irq_data *data)
+{
+	struct iucv_message_complete *imc = (void *) data;
+	struct iucv_path *path = iucv_path_table[imc->ippathid];
+	struct iucv_message msg;
+
+	if (!path || !path->handler || !path->handler->message_complete)
+		return;
+
+	msg.id = imc->ipmsgid;
+	msg.flags = imc->ipflags1;
+	msg.audit = imc->ipaudit;
+	msg.class = imc->ipsrccls;
+	msg.tag = imc->ipmsgtag;
+	msg.length = imc->ipbfln2f;
+	memcpy(msg.rmmsg, imc->iprmmsg, 8);
+
+	path->handler->message_complete(path, &msg);
+}
+
+/**
+ * iucv_message_pending
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process message pending work item. Called from tasklet while holding
+ * iucv_table_lock.
+ *
+ * Builds a struct iucv_message on the stack from the interrupt buffer
+ * and passes it to the handler's message_pending callback, if the path,
+ * its handler and the callback all exist. For IUCV_IPRMDATA messages the
+ * 8 data bytes are taken from the interrupt buffer and the length is
+ * set to 8; otherwise the length comes from ipbfln1f.
+ */
+struct iucv_message_pending {
+	u16 ippathid;
+	u8  ipflags1;
+	u8  iptype;
+	u32 ipmsgid;
+	u32 iptrgcls;
+	union {
+		u32 iprmmsg1_u32;
+		u8  iprmmsg1[4];
+	} ln1msg1;
+	union {
+		u32 ipbfln1f;
+		u8  iprmmsg2[4];
+	} ln1msg2;
+	u32 res1[3];
+	u32 ipbfln2f;
+	u8  ippollfg;
+	u8  res2[3];
+} __packed;
+
+static void iucv_message_pending(struct iucv_irq_data *data)
+{
+	struct iucv_message_pending *imp = (void *) data;
+	struct iucv_path *path = iucv_path_table[imp->ippathid];
+	struct iucv_message msg;
+
+	if (path && path->handler && path->handler->message_pending) {
+		msg.flags = imp->ipflags1;
+		msg.id = imp->ipmsgid;
+		msg.class = imp->iptrgcls;
+		if (imp->ipflags1 & IUCV_IPRMDATA) {
+			/*
+			 * The 8 message bytes occupy the two adjacent
+			 * 4-byte unions of the packed struct. Copy each
+			 * half from its own array so that no read runs
+			 * past the end of iprmmsg1[4].
+			 */
+			memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1,
+			       sizeof(imp->ln1msg1.iprmmsg1));
+			memcpy(msg.rmmsg + sizeof(imp->ln1msg1.iprmmsg1),
+			       imp->ln1msg2.iprmmsg2,
+			       sizeof(imp->ln1msg2.iprmmsg2));
+			msg.length = 8;
+		} else
+			msg.length = imp->ln1msg2.ipbfln1f;
+		msg.reply_size = imp->ipbfln2f;
+		path->handler->message_pending(path, &msg);
+	}
+}
+
+/**
+ * iucv_tasklet_fn:
+ *
+ * Takes over the queue of irq buffers filled by
+ * iucv_external_interrupt, dispatches each buffer to the action handler
+ * selected by its iptype and then frees the buffer. If iucv_table_lock
+ * cannot be taken immediately the tasklet reschedules itself.
+ */
+static void iucv_tasklet_fn(unsigned long ignored)
+{
+	typedef void iucv_irq_fn(struct iucv_irq_data *);
+	static iucv_irq_fn *irq_fn[] = {
+		[0x02] = iucv_path_complete,
+		[0x03] = iucv_path_severed,
+		[0x04] = iucv_path_quiesced,
+		[0x05] = iucv_path_resumed,
+		[0x06] = iucv_message_complete,
+		[0x07] = iucv_message_complete,
+		[0x08] = iucv_message_pending,
+		[0x09] = iucv_message_pending,
+	};
+	struct iucv_irq_list *entry, *next;
+	LIST_HEAD(task_queue);
+
+	/* Serialize tasklet, iucv_path_sever and iucv_path_connect. */
+	if (!spin_trylock(&iucv_table_lock)) {
+		tasklet_schedule(&iucv_tasklet);
+		return;
+	}
+	iucv_active_cpu = smp_processor_id();
+
+	/* Move all queued buffers onto the private list. */
+	spin_lock_irq(&iucv_queue_lock);
+	list_splice_init(&iucv_task_queue, &task_queue);
+	spin_unlock_irq(&iucv_queue_lock);
+
+	list_for_each_entry_safe(entry, next, &task_queue, list) {
+		list_del_init(&entry->list);
+		irq_fn[entry->data.iptype](&entry->data);
+		kfree(entry);
+	}
+
+	iucv_active_cpu = -1;
+	spin_unlock(&iucv_table_lock);
+}
+
+/**
+ * iucv_work_fn:
+ *
+ * Takes over the queue of path pending irq blocks filled by
+ * iucv_external_interrupt, calls iucv_cleanup_queue() and then passes
+ * each block to iucv_path_pending() before freeing it.
+ */
+static void iucv_work_fn(struct work_struct *work)
+{
+	struct iucv_irq_list *entry, *next;
+	LIST_HEAD(work_queue);
+
+	/* Serialize tasklet, iucv_path_sever and iucv_path_connect. */
+	spin_lock_bh(&iucv_table_lock);
+	iucv_active_cpu = smp_processor_id();
+
+	/* Move all queued blocks onto the private list. */
+	spin_lock_irq(&iucv_queue_lock);
+	list_splice_init(&iucv_work_queue, &work_queue);
+	spin_unlock_irq(&iucv_queue_lock);
+
+	iucv_cleanup_queue();
+	list_for_each_entry_safe(entry, next, &work_queue, list) {
+		list_del_init(&entry->list);
+		iucv_path_pending(&entry->data);
+		kfree(entry);
+	}
+
+	iucv_active_cpu = -1;
+	spin_unlock_bh(&iucv_table_lock);
+}
+
+/**
+ * iucv_external_interrupt
+ * @ext_int_code: irq code (not used in the function body)
+ * @param32: not used in the function body
+ * @param64: not used in the function body
+ *
+ * Handles external interrupts coming in from CP.
+ * Copies the interrupt buffer of the current cpu into a freshly
+ * allocated list entry. Path pending interrupts (iptype 0x01) are
+ * queued for iucv_work, all other types for iucv_tasklet. Interrupts
+ * for a pathid >= iucv_max_pathid are answered by severing that pathid.
+ */
+static void iucv_external_interrupt(unsigned int ext_int_code,
+				    unsigned int param32, unsigned long param64)
+{
+	struct iucv_irq_data *irq_data;
+	struct iucv_irq_list *entry;
+
+	kstat_cpu(smp_processor_id()).irqs[EXTINT_IUC]++;
+	irq_data = iucv_irq_data[smp_processor_id()];
+	if (irq_data->ippathid >= iucv_max_pathid) {
+		WARN_ON(irq_data->ippathid >= iucv_max_pathid);
+		iucv_sever_pathid(irq_data->ippathid, iucv_error_no_listener);
+		return;
+	}
+	BUG_ON(irq_data->iptype  < 0x01 || irq_data->iptype > 0x09);
+
+	entry = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC);
+	if (entry == NULL) {
+		pr_warning("iucv_external_interrupt: out of memory\n");
+		return;
+	}
+	memcpy(&entry->data, irq_data, sizeof(entry->data));
+
+	spin_lock(&iucv_queue_lock);
+	if (irq_data->iptype != 0x01) {
+		/* The other interrupts. */
+		list_add_tail(&entry->list, &iucv_task_queue);
+		tasklet_schedule(&iucv_tasklet);
+	} else {
+		/* Path pending interrupt. */
+		list_add_tail(&entry->list, &iucv_work_queue);
+		schedule_work(&iucv_work);
+	}
+	spin_unlock(&iucv_queue_lock);
+}
+
+static int iucv_pm_prepare(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_INFO "iucv_pm_prepare\n");
+#endif
+	/* Forward to the driver's prepare callback when there is one. */
+	if (!dev->driver || !dev->driver->pm || !dev->driver->pm->prepare)
+		return 0;
+	return dev->driver->pm->prepare(dev);
+}
+
+static void iucv_pm_complete(struct device *dev)
+{
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_INFO "iucv_pm_complete\n");
+#endif
+	/* Forward to the driver's complete callback when there is one. */
+	if (!dev->driver || !dev->driver->pm || !dev->driver->pm->complete)
+		return;
+	dev->driver->pm->complete(dev);
+}
+
+/**
+ * iucv_path_table_empty() - determine if iucv path table is empty
+ *
+ * Scans the first iucv_max_pathid slots of iucv_path_table.
+ *
+ * Returns 0 if there are still iucv paths defined
+ *	   1 if there are no iucv paths defined
+ */
+int iucv_path_table_empty(void)
+{
+	int pathid = 0;
+
+	while (pathid < iucv_max_pathid) {
+		if (iucv_path_table[pathid])
+			return 0;
+		pathid++;
+	}
+	return 1;
+}
+
+/**
+ * iucv_pm_freeze() - Freeze PM callback
+ * @dev:	iucv-based device
+ *
+ * On the first call (state not yet IUCV_PM_FREEZING): run
+ * iucv_block_cpu_almost on every cpu in iucv_irq_cpumask, cancel
+ * iucv_work and sever every path still waiting on iucv_work_queue.
+ * Then invoke the freeze callback of the iucv-based driver and shut
+ * down iucv if no iucv paths are established anymore.
+ *
+ * Returns the result of the driver's freeze callback, or 0 if there is
+ * none.
+ */
+static int iucv_pm_freeze(struct device *dev)
+{
+	struct iucv_irq_list *entry, *next;
+	int result = 0;
+	int cpu;
+
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "iucv_pm_freeze\n");
+#endif
+	if (iucv_pm_state != IUCV_PM_FREEZING) {
+		for_each_cpu(cpu, &iucv_irq_cpumask)
+			smp_call_function_single(cpu, iucv_block_cpu_almost,
+						 NULL, 1);
+		cancel_work_sync(&iucv_work);
+		/* Refuse the path pending requests that were not handled. */
+		list_for_each_entry_safe(entry, next, &iucv_work_queue, list) {
+			list_del_init(&entry->list);
+			iucv_sever_pathid(entry->data.ippathid,
+					  iucv_error_no_listener);
+			kfree(entry);
+		}
+	}
+	iucv_pm_state = IUCV_PM_FREEZING;
+
+	if (dev->driver && dev->driver->pm && dev->driver->pm->freeze)
+		result = dev->driver->pm->freeze(dev);
+
+	if (iucv_path_table_empty())
+		iucv_disable();
+
+	return result;
+}
+
+/**
+ * iucv_pm_thaw() - Thaw PM callback
+ * @dev:	iucv-based device
+ *
+ * make iucv ready for use again: call iucv_enable() if there is no path
+ * table, and re-enable iucv interrupts if iucv_irq_cpumask is empty;
+ * then invoke the thaw callback of the iucv-based driver.
+ *
+ * Returns the error from iucv_enable(), otherwise the result of the
+ * driver's thaw callback, or 0 if there is none.
+ */
+static int iucv_pm_thaw(struct device *dev)
+{
+	int result;
+
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "iucv_pm_thaw\n");
+#endif
+	iucv_pm_state = IUCV_PM_THAWING;
+	if (!iucv_path_table) {
+		result = iucv_enable();
+		if (result)
+			return result;
+	}
+	if (cpumask_empty(&iucv_irq_cpumask)) {
+		if (!iucv_nonsmp_handler)
+			/* enable interrupts on all cpus */
+			iucv_setmask_mp();
+		else
+			/* enable interrupts on one cpu */
+			iucv_allow_cpu(NULL);
+	}
+	if (!dev->driver || !dev->driver->pm || !dev->driver->pm->thaw)
+		return 0;
+	return dev->driver->pm->thaw(dev);
+}
+
+/**
+ * iucv_pm_restore() - Restore PM callback
+ * @dev:	iucv-based device
+ *
+ * make iucv ready for use again: if iucv_irq_cpumask is empty, query the
+ * maximum number of connections and call iucv_enable(); then invoke the
+ * restore callback of the iucv-based driver.
+ *
+ * Returns the error from iucv_query_maxconn() or iucv_enable() if either
+ * fails, otherwise the result of the driver's restore callback, or 0 if
+ * there is none.
+ */
+static int iucv_pm_restore(struct device *dev)
+{
+	int rc = 0;
+
+#ifdef CONFIG_PM_DEBUG
+	printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table);
+#endif
+	if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table)
+		pr_warning("Suspending Linux did not completely close all IUCV "
+			"connections\n");
+	iucv_pm_state = IUCV_PM_RESTORING;
+	if (cpumask_empty(&iucv_irq_cpumask)) {
+		/*
+		 * Do not continue with iucv_enable() when the query
+		 * failed; its result used to be overwritten unchecked.
+		 */
+		rc = iucv_query_maxconn();
+		if (rc)
+			goto out;
+		rc = iucv_enable();
+		if (rc)
+			goto out;
+	}
+	if (dev->driver && dev->driver->pm && dev->driver->pm->restore)
+		rc = dev->driver->pm->restore(dev);
+out:
+	return rc;
+}
+
+/**
+ * iucv_init
+ *
+ * Allocates and initializes various data structures: registers the
+ * external interrupt handler and the "iucv" root device, allocates the
+ * per-cpu interrupt and parameter buffers for every online cpu,
+ * registers the cpu hotplug and reboot notifiers, converts the error
+ * strings with ASCEBC() and registers the iucv bus.
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT if MACHINE_IS_VM is false,
+ * -ENOMEM if a buffer allocation fails, or the error of the failing
+ * registration call. On failure everything set up so far is undone and
+ * iucv_available is left at 0.
+ */
+static int __init iucv_init(void)
+{
+	int rc;
+	int cpu;
+
+	if (!MACHINE_IS_VM) {
+		rc = -EPROTONOSUPPORT;
+		goto out;
+	}
+	rc = iucv_query_maxconn();
+	if (rc)
+		goto out;
+	rc = register_external_interrupt(0x4000, iucv_external_interrupt);
+	if (rc)
+		goto out;
+	iucv_root = root_device_register("iucv");
+	if (IS_ERR(iucv_root)) {
+		rc = PTR_ERR(iucv_root);
+		goto out_int;
+	}
+
+	for_each_online_cpu(cpu) {
+		/* Note: GFP_DMA used to get memory below 2G */
+		iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
+				     GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+		if (!iucv_irq_data[cpu]) {
+			rc = -ENOMEM;
+			goto out_free;
+		}
+
+		/* Allocate parameter blocks. */
+		iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
+				  GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+		if (!iucv_param[cpu]) {
+			rc = -ENOMEM;
+			goto out_free;
+		}
+		iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
+				  GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
+		if (!iucv_param_irq[cpu]) {
+			rc = -ENOMEM;
+			goto out_free;
+		}
+
+	}
+	rc = register_hotcpu_notifier(&iucv_cpu_notifier);
+	if (rc)
+		goto out_free;
+	rc = register_reboot_notifier(&iucv_reboot_notifier);
+	if (rc)
+		goto out_cpu;
+	ASCEBC(iucv_error_no_listener, 16);
+	ASCEBC(iucv_error_no_memory, 16);
+	ASCEBC(iucv_error_pathid, 16);
+	iucv_available = 1;
+	rc = bus_register(&iucv_bus);
+	if (rc)
+		goto out_reboot;
+	return 0;
+
+out_reboot:
+	/* The buffers are freed below, so iucv must not look usable. */
+	iucv_available = 0;
+	unregister_reboot_notifier(&iucv_reboot_notifier);
+out_cpu:
+	unregister_hotcpu_notifier(&iucv_cpu_notifier);
+out_free:
+	for_each_possible_cpu(cpu) {
+		kfree(iucv_param_irq[cpu]);
+		iucv_param_irq[cpu] = NULL;
+		kfree(iucv_param[cpu]);
+		iucv_param[cpu] = NULL;
+		kfree(iucv_irq_data[cpu]);
+		iucv_irq_data[cpu] = NULL;
+	}
+	root_device_unregister(iucv_root);
+out_int:
+	unregister_external_interrupt(0x4000, iucv_external_interrupt);
+out:
+	return rc;
+}
+
+/**
+ * iucv_exit
+ *
+ * Frees everything allocated from iucv_init: the entries still queued
+ * on iucv_task_queue and iucv_work_queue, the notifiers, the per-cpu
+ * buffers, the root device, the bus and the external interrupt handler.
+ */
+static void __exit iucv_exit(void)
+{
+	struct iucv_irq_list *entry, *next;
+	int cpu;
+
+	/* Drop interrupt buffers that were queued but never processed. */
+	spin_lock_irq(&iucv_queue_lock);
+	list_for_each_entry_safe(entry, next, &iucv_task_queue, list)
+		kfree(entry);
+	list_for_each_entry_safe(entry, next, &iucv_work_queue, list)
+		kfree(entry);
+	spin_unlock_irq(&iucv_queue_lock);
+
+	unregister_reboot_notifier(&iucv_reboot_notifier);
+	unregister_hotcpu_notifier(&iucv_cpu_notifier);
+
+	for_each_possible_cpu(cpu) {
+		kfree(iucv_param_irq[cpu]);
+		iucv_param_irq[cpu] = NULL;
+		kfree(iucv_param[cpu]);
+		iucv_param[cpu] = NULL;
+		kfree(iucv_irq_data[cpu]);
+		iucv_irq_data[cpu] = NULL;
+	}
+
+	root_device_unregister(iucv_root);
+	bus_unregister(&iucv_bus);
+	unregister_external_interrupt(0x4000, iucv_external_interrupt);
+}
+
+subsys_initcall(iucv_init);
+module_exit(iucv_exit);
+
+MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert (felfert@millenux.com)");
+MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver");
+MODULE_LICENSE("GPL");
diff --git a/net/key/Makefile b/net/key/Makefile
new file mode 100644
index 00000000..85760804
--- /dev/null
+++ b/net/key/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the key AF.
+#
+
+obj-$(CONFIG_NET_KEY) += af_key.o
diff --git a/net/key/af_key.c b/net/key/af_key.c
new file mode 100644
index 00000000..8f92cf81
--- /dev/null
+++ b/net/key/af_key.c
@@ -0,0 +1,3818 @@
+/*
+ * net/key/af_key.c	An implementation of PF_KEYv2 sockets.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Maxim Giryaev	<gem@asplinux.ru>
+ *		David S. Miller	<davem@redhat.com>
+ *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *		Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ *		Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org>
+ *		Derek Atkins <derek@ihtfp.com>
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/xfrm.h>
+
+#include <net/sock.h>
+
+#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
+#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
+
+static int pfkey_net_id __read_mostly;
+struct netns_pfkey {
+	/* List of all pfkey sockets; pfkey_insert() adds to it under pfkey_mutex. */
+	struct hlist_head table;
+	atomic_t socks_nr;	/* incremented in pfkey_create(), decremented in pfkey_sock_destruct() */
+};
+static DEFINE_MUTEX(pfkey_mutex);
+
+#define DUMMY_MARK 0
+static struct xfrm_mark dummy_mark = {0, 0};
+struct pfkey_sock {
+	/* struct sock must be the first member of struct pfkey_sock */
+	struct sock	sk;
+	int		registered;	/* tested by pfkey_broadcast() for BROADCAST_REGISTERED */
+	int		promisc;	/* non-zero: pfkey_broadcast() also queues every message here */
+
+	struct {
+		uint8_t		msg_version;
+		uint32_t	msg_pid;
+		int		(*dump)(struct pfkey_sock *sk);	/* called by pfkey_do_dump(); NULL when no dump is active */
+		void		(*done)(struct pfkey_sock *sk);	/* called by pfkey_terminate_dump() */
+		union {
+			struct xfrm_policy_walk	policy;
+			struct xfrm_state_walk	state;
+		} u;
+		struct sk_buff	*skb;	/* sent by pfkey_do_dump() or freed by pfkey_terminate_dump() */
+	} dump;
+};
+
+/* Convert a struct sock pointer to the pfkey socket that embeds it. */
+static inline struct pfkey_sock *pfkey_sk(struct sock *sk)
+{
+	struct pfkey_sock *pfk = (struct pfkey_sock *)sk;
+
+	return pfk;
+}
+
+/*
+ * Return 1 while the receive memory in use is at most two thirds of the
+ * socket's receive buffer size, 0 otherwise.
+ */
+static int pfkey_can_dump(const struct sock *sk)
+{
+	return (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf) ?
+		1 : 0;
+}
+
+/*
+ * End an active dump: free a still pending dump skb, call the done
+ * callback and clear both callbacks. Does nothing if no dump is active.
+ */
+static void pfkey_terminate_dump(struct pfkey_sock *pfk)
+{
+	if (!pfk->dump.dump)
+		return;
+
+	if (pfk->dump.skb != NULL) {
+		kfree_skb(pfk->dump.skb);
+		pfk->dump.skb = NULL;
+	}
+	pfk->dump.done(pfk);
+	pfk->dump.dump = NULL;
+	pfk->dump.done = NULL;
+}
+
+/*
+ * sk_destruct callback installed by pfkey_create(): terminates a running
+ * dump, purges the receive queue and decrements the per-namespace socket
+ * counter. A socket that is not marked SOCK_DEAD is reported and the
+ * counter is left alone.
+ */
+static void pfkey_sock_destruct(struct sock *sk)
+{
+	struct netns_pfkey *net_pfkey = net_generic(sock_net(sk),
+						    pfkey_net_id);
+
+	pfkey_terminate_dump(pfkey_sk(sk));
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+		WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+
+		atomic_dec(&net_pfkey->socks_nr);
+		return;
+	}
+
+	pr_err("Attempt to release alive pfkey socket: %p\n", sk);
+}
+
+static const struct proto_ops pfkey_ops;
+
+/* Add the socket to its namespace's pfkey socket list under pfkey_mutex. */
+static void pfkey_insert(struct sock *sk)
+{
+	struct netns_pfkey *net_pfkey;
+
+	net_pfkey = net_generic(sock_net(sk), pfkey_net_id);
+
+	mutex_lock(&pfkey_mutex);
+	sk_add_node_rcu(sk, &net_pfkey->table);
+	mutex_unlock(&pfkey_mutex);
+}
+
+/* Unlink the socket from the pfkey socket list under pfkey_mutex. */
+static void pfkey_remove(struct sock *sk)
+{
+	mutex_lock(&pfkey_mutex);
+
+	sk_del_node_init_rcu(sk);
+
+	mutex_unlock(&pfkey_mutex);
+}
+
+static struct proto key_proto = {	/* passed to sk_alloc() in pfkey_create() */
+	.name	  = "KEY",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct pfkey_sock),	/* each socket is allocated as a struct pfkey_sock */
+};
+
+/*
+ * Create a PF_KEY socket.
+ *
+ * Returns -EPERM without CAP_NET_ADMIN, -ESOCKTNOSUPPORT for any type
+ * other than SOCK_RAW, -EPROTONOSUPPORT for any protocol other than
+ * PF_KEY_V2 and -ENOMEM if the sock cannot be allocated. On success the
+ * new sock is counted in socks_nr, inserted into the namespace's socket
+ * list and 0 is returned.
+ */
+static int pfkey_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
+{
+	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+	struct sock *sk;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+	if (protocol != PF_KEY_V2)
+		return -EPROTONOSUPPORT;
+
+	sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	sock->ops = &pfkey_ops;
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_KEY;
+	sk->sk_destruct = pfkey_sock_destruct;
+
+	atomic_inc(&net_pfkey->socks_nr);
+	pfkey_insert(sk);
+
+	return 0;
+}
+
+/*
+ * Release a PF_KEY socket: unlink it from the socket list, orphan it,
+ * purge its write queue, wait for an RCU grace period and drop the
+ * reference. Always returns 0.
+ */
+static int pfkey_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk == NULL)
+		return 0;
+
+	pfkey_remove(sk);
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+	skb_queue_purge(&sk->sk_write_queue);
+
+	/* The socket list is walked under rcu_read_lock() by pfkey_broadcast(). */
+	synchronize_rcu();
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ * Queue one copy of @skb on the receive queue of @sk.
+ *
+ * *@skb2 caches a copy between calls: when it is NULL a new one is
+ * obtained, either by taking another reference on @skb (if the caller
+ * holds the only one) or by cloning it with @allocation. The copy is
+ * queued, and *@skb2 reset to NULL, only if the socket's receive memory
+ * does not exceed sk_rcvbuf.
+ *
+ * Returns 0 if the message was queued, -ENOBUFS otherwise.
+ */
+static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
+			       gfp_t allocation, struct sock *sk)
+{
+	int err = -ENOBUFS;
+
+	sock_hold(sk);
+	if (*skb2 == NULL) {
+		if (atomic_read(&skb->users) == 1) {
+			atomic_inc(&skb->users);
+			*skb2 = skb;
+		} else {
+			*skb2 = skb_clone(skb, allocation);
+		}
+	}
+	if (*skb2 != NULL &&
+	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+		skb_orphan(*skb2);
+		skb_set_owner_r(*skb2, sk);
+		skb_queue_tail(&sk->sk_receive_queue, *skb2);
+		sk->sk_data_ready(sk, (*skb2)->len);
+		*skb2 = NULL;
+		err = 0;
+	}
+	sock_put(sk);
+	return err;
+}
+
+/* Send SKB to all pfkey sockets matching selected criteria.  */
+#define BROADCAST_ALL		0
+#define BROADCAST_ONE		1
+#define BROADCAST_REGISTERED	2
+#define BROADCAST_PROMISC_ONLY	4
+/*
+ * @skb is consumed (freed) by this function. Every promiscuous socket
+ * gets a copy; the other sockets of @net are selected by
+ * @broadcast_flags, and @one_sk (if not NULL) is served last, outside
+ * the RCU read-side section.
+ *
+ * @allocation is only used for the copy made for @one_sk. Copies made
+ * while walking the socket list are allocated with GFP_ATOMIC, because
+ * the walk runs under rcu_read_lock() and callers pass GFP_KERNEL.
+ *
+ * Returns -ENOMEM if @skb is NULL, the result of the delivery to
+ * @one_sk if that is set, and otherwise -ESRCH or, for
+ * BROADCAST_REGISTERED, the delivery status described in the loop.
+ */
+static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
+			   int broadcast_flags, struct sock *one_sk,
+			   struct net *net)
+{
+	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+	struct sock *sk;
+	struct hlist_node *node;
+	struct sk_buff *skb2 = NULL;
+	int err = -ESRCH;
+
+	/* XXX Do we need something like netlink_overrun?  I think
+	 * XXX PF_KEY socket apps will not mind current behavior.
+	 */
+	if (!skb)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	sk_for_each_rcu(sk, node, &net_pfkey->table) {
+		struct pfkey_sock *pfk = pfkey_sk(sk);
+		int err2;
+
+		/* Yes, it means that if you are meant to receive this
+		 * pfkey message you receive it twice as promiscuous
+		 * socket.
+		 */
+		if (pfk->promisc)
+			pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
+
+		/* the exact target will be processed later */
+		if (sk == one_sk)
+			continue;
+		if (broadcast_flags != BROADCAST_ALL) {
+			if (broadcast_flags & BROADCAST_PROMISC_ONLY)
+				continue;
+			if ((broadcast_flags & BROADCAST_REGISTERED) &&
+			    !pfk->registered)
+				continue;
+			if (broadcast_flags & BROADCAST_ONE)
+				continue;
+		}
+
+		err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
+
+		/* Error is cleared after successful sending to at least one
+		 * registered KM */
+		if ((broadcast_flags & BROADCAST_REGISTERED) && err)
+			err = err2;
+	}
+	rcu_read_unlock();
+
+	if (one_sk != NULL)
+		err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
+
+	kfree_skb(skb2);
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Run one step of the active dump of @pfk.
+ *
+ * Returns 0 without terminating the dump when the dump callback reports
+ * -ENOBUFS or when a dump skb is pending but pfkey_can_dump() says the
+ * socket cannot take it. Otherwise a pending dump skb is sent to the
+ * socket with sequence number 0 and the callback's result as errno, the
+ * dump is terminated and the callback's result is returned.
+ */
+static int pfkey_do_dump(struct pfkey_sock *pfk)
+{
+	struct sock *sk = &pfk->sk;
+	struct sadb_msg *hdr;
+	int rc;
+
+	rc = pfk->dump.dump(pfk);
+	if (rc == -ENOBUFS)
+		return 0;
+
+	if (pfk->dump.skb != NULL) {
+		if (!pfkey_can_dump(sk))
+			return 0;
+
+		hdr = (struct sadb_msg *) pfk->dump.skb->data;
+		hdr->sadb_msg_seq = 0;
+		hdr->sadb_msg_errno = rc;
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+				sk, sock_net(sk));
+		pfk->dump.skb = NULL;
+	}
+
+	pfkey_terminate_dump(pfk);
+	return rc;
+}
+
+static inline void pfkey_hdr_dup(struct sadb_msg *new,
+				 const struct sadb_msg *orig)
+{
+	*new = *orig;	/* copy the whole message header by struct assignment */
+}
+
+/*
+ * Send an error reply for the request @orig to @sk.
+ *
+ * @err is a negative errno. The reply is a bare struct sadb_msg copied
+ * from @orig with sadb_msg_errno set to the positive error value and
+ * sadb_msg_len set to the header size in 64-bit units.
+ *
+ * Returns -ENOBUFS if no skb could be allocated, 0 otherwise.
+ */
+static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk)
+{
+	struct sadb_msg *hdr;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+	if (skb == NULL)
+		return -ENOBUFS;
+
+	/* Woe be to the platform trying to support PFKEY yet
+	 * having normal errnos outside the 1-255 range, inclusive.
+	 */
+	err = -err;
+	switch (err) {
+	case ERESTARTSYS:
+	case ERESTARTNOHAND:
+	case ERESTARTNOINTR:
+		err = EINTR;
+		break;
+	default:
+		break;
+	}
+	if (err >= 512)
+		err = EINVAL;
+	BUG_ON(err <= 0 || err >= 256);
+
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	pfkey_hdr_dup(hdr, orig);
+	hdr->sadb_msg_errno = (uint8_t) err;
+	hdr->sadb_msg_len = (sizeof(struct sadb_msg) /
+			     sizeof(uint64_t));
+
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk));
+
+	return 0;
+}
+
+/*
+ * Minimum size in bytes of each known extension, indexed by extension
+ * type.  Each entry is the size of the fixed structure the extension
+ * starts with; anything that follows it (a sockaddr, key material, ...)
+ * is not covered and has to be validated separately.  The table is only
+ * ever read, hence const.
+ */
+static const u8 sadb_ext_min_len[] = {
+	[SADB_EXT_RESERVED]		= (u8) 0,
+	[SADB_EXT_SA]			= (u8) sizeof(struct sadb_sa),
+	[SADB_EXT_LIFETIME_CURRENT]	= (u8) sizeof(struct sadb_lifetime),
+	[SADB_EXT_LIFETIME_HARD]	= (u8) sizeof(struct sadb_lifetime),
+	[SADB_EXT_LIFETIME_SOFT]	= (u8) sizeof(struct sadb_lifetime),
+	[SADB_EXT_ADDRESS_SRC]		= (u8) sizeof(struct sadb_address),
+	[SADB_EXT_ADDRESS_DST]		= (u8) sizeof(struct sadb_address),
+	[SADB_EXT_ADDRESS_PROXY]	= (u8) sizeof(struct sadb_address),
+	[SADB_EXT_KEY_AUTH]		= (u8) sizeof(struct sadb_key),
+	[SADB_EXT_KEY_ENCRYPT]		= (u8) sizeof(struct sadb_key),
+	[SADB_EXT_IDENTITY_SRC]		= (u8) sizeof(struct sadb_ident),
+	[SADB_EXT_IDENTITY_DST]		= (u8) sizeof(struct sadb_ident),
+	[SADB_EXT_SENSITIVITY]		= (u8) sizeof(struct sadb_sens),
+	[SADB_EXT_PROPOSAL]		= (u8) sizeof(struct sadb_prop),
+	[SADB_EXT_SUPPORTED_AUTH]	= (u8) sizeof(struct sadb_supported),
+	[SADB_EXT_SUPPORTED_ENCRYPT]	= (u8) sizeof(struct sadb_supported),
+	[SADB_EXT_SPIRANGE]		= (u8) sizeof(struct sadb_spirange),
+	[SADB_X_EXT_KMPRIVATE]		= (u8) sizeof(struct sadb_x_kmprivate),
+	[SADB_X_EXT_POLICY]		= (u8) sizeof(struct sadb_x_policy),
+	[SADB_X_EXT_SA2]		= (u8) sizeof(struct sadb_x_sa2),
+	[SADB_X_EXT_NAT_T_TYPE]		= (u8) sizeof(struct sadb_x_nat_t_type),
+	[SADB_X_EXT_NAT_T_SPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
+	[SADB_X_EXT_NAT_T_DPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
+	[SADB_X_EXT_NAT_T_OA]		= (u8) sizeof(struct sadb_address),
+	[SADB_X_EXT_SEC_CTX]		= (u8) sizeof(struct sadb_x_sec_ctx),
+	[SADB_X_EXT_KMADDRESS]		= (u8) sizeof(struct sadb_x_kmaddress),
+};
+
+/*
+ * Verify sadb_address_{len,prefixlen} against sa_family.
+ *
+ * @p points at an address extension.  The generic extension checks only
+ * guarantee that it is at least sizeof(struct sadb_address) long, so the
+ * sockaddr expected behind that header may be missing.  The extension
+ * length is therefore checked to reach the end of sa_family before that
+ * field is read.
+ *
+ * Returns 0 when the extension length is exactly what the address family
+ * requires and the prefix length does not exceed the address width;
+ * -EINVAL otherwise, including for every family other than AF_INET and
+ * (when IPv6 is configured) AF_INET6.
+ */
+static int verify_address_len(const void *p)
+{
+	const struct sadb_address *sp = p;
+	const struct sockaddr *addr = (const struct sockaddr *)(sp + 1);
+	const struct sockaddr_in *sin;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	const struct sockaddr_in6 *sin6;
+#endif
+	int len;
+
+	/* Do not look at sa_family unless the extension really contains it. */
+	len = DIV_ROUND_UP(sizeof(*sp) + offsetof(struct sockaddr, sa_family) +
+			   sizeof(addr->sa_family), sizeof(uint64_t));
+	if (sp->sadb_address_len < len)
+		return -EINVAL;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t));
+		if (sp->sadb_address_len != len ||
+		    sp->sadb_address_prefixlen > 32)
+			return -EINVAL;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t));
+		if (sp->sadb_address_len != len ||
+		    sp->sadb_address_prefixlen > 128)
+			return -EINVAL;
+		break;
+#endif
+	default:
+		/* It is user using kernel to keep track of security
+		 * associations for another protocol, such as
+		 * OSPF/RSVP/RIPV2/MIP.  It is user's job to verify
+		 * lengths.
+		 *
+		 * XXX Actually, association/policy database is not yet
+		 * XXX able to cope with arbitrary sockaddr families.
+		 * XXX When it can, remove this -EINVAL.  -DaveM
+		 */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Size, in 64-bit words rounded up, of a security context extension:
+ * the fixed header plus the sadb_x_ctx_len bytes announced in it.
+ */
+static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx)
+{
+	const size_t bytes = sizeof(struct sadb_x_sec_ctx) +
+			     sec_ctx->sadb_x_ctx_len;
+
+	return DIV_ROUND_UP(bytes, sizeof(uint64_t));
+}
+
+/*
+ * Validate the lengths of the security context extension at @p.
+ *
+ * Returns -EINVAL when the context is longer than PAGE_SIZE or when the
+ * extension length field disagrees with the length computed from the
+ * context length; 0 otherwise.
+ */
+static inline int verify_sec_ctx_len(const void *p)
+{
+	const struct sadb_x_sec_ctx *sec_ctx = p;
+
+	if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
+		return -EINVAL;
+
+	if (sec_ctx->sadb_x_sec_len != pfkey_sec_ctx_len(sec_ctx))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Build a freshly allocated xfrm_user_sec_ctx from PF_KEY security
+ * context extension @sec_ctx: the header fields are translated and the
+ * context bytes that follow the extension header are copied behind the
+ * new structure.
+ *
+ * Returns the new object, which the caller has to kfree(), or NULL when
+ * the allocation fails.
+ */
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx)
+{
+	const int ctx_bytes = sec_ctx->sadb_x_ctx_len;
+	struct xfrm_user_sec_ctx *uctx;
+
+	uctx = kmalloc(sizeof(*uctx) + ctx_bytes, GFP_KERNEL);
+	if (uctx == NULL)
+		return NULL;
+
+	uctx->len = pfkey_sec_ctx_len(sec_ctx);
+	uctx->exttype = sec_ctx->sadb_x_sec_exttype;
+	uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
+	uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
+	uctx->ctx_len = ctx_bytes;
+	memcpy(uctx + 1, sec_ctx + 1, ctx_bytes);
+
+	return uctx;
+}
+
+/*
+ * Returns 1 when both address extensions are present, carry the same
+ * address family, and that family is AF_INET or (when IPv6 is
+ * configured) AF_INET6; returns 0 otherwise.
+ */
+static int present_and_same_family(const struct sadb_address *src,
+				   const struct sadb_address *dst)
+{
+	sa_family_t s_family, d_family;
+
+	if (src == NULL || dst == NULL)
+		return 0;
+
+	/* the sockaddr sits directly behind each extension header */
+	s_family = ((const struct sockaddr *)(src + 1))->sa_family;
+	d_family = ((const struct sockaddr *)(dst + 1))->sa_family;
+	if (s_family != d_family)
+		return 0;
+
+	switch (s_family) {
+	case AF_INET:
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+#endif
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Walk the extension headers that follow base header @hdr in @skb and
+ * record a pointer to each known one in @ext_hdrs, at index
+ * (extension type - 1).
+ *
+ * Returns 0 on success, or -EINVAL when
+ *  - fewer bytes remain than one extension header needs,
+ *  - an extension is shorter than 8 bytes, runs past the end of the
+ *    message, or has the reserved type,
+ *  - a known extension is shorter than its sadb_ext_min_len[] entry or
+ *    appears twice,
+ *  - an address or security context extension fails its own length
+ *    verification.
+ * Extensions with a type above SADB_EXT_MAX are stepped over.
+ */
+static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs)
+{
+	const char *p = (char *) hdr;
+	int len = skb->len;
+
+	len -= sizeof(*hdr);
+	p += sizeof(*hdr);
+	while (len > 0) {
+		const struct sadb_ext *ehdr = (const struct sadb_ext *) p;
+		uint16_t ext_type;
+		int ext_len;
+
+		/* A trailing fragment too short to hold an extension header
+		 * must not be dereferenced.
+		 */
+		if (len < (int) sizeof(*ehdr))
+			return -EINVAL;
+
+		ext_len  = ehdr->sadb_ext_len;
+		ext_len *= sizeof(uint64_t);
+		ext_type = ehdr->sadb_ext_type;
+		if (ext_len < sizeof(uint64_t) ||
+		    ext_len > len ||
+		    ext_type == SADB_EXT_RESERVED)
+			return -EINVAL;
+
+		if (ext_type <= SADB_EXT_MAX) {
+			int min = (int) sadb_ext_min_len[ext_type];
+			if (ext_len < min)
+				return -EINVAL;
+			if (ext_hdrs[ext_type-1] != NULL)
+				return -EINVAL;
+			if (ext_type == SADB_EXT_ADDRESS_SRC ||
+			    ext_type == SADB_EXT_ADDRESS_DST ||
+			    ext_type == SADB_EXT_ADDRESS_PROXY ||
+			    ext_type == SADB_X_EXT_NAT_T_OA) {
+				if (verify_address_len(p))
+					return -EINVAL;
+			}
+			if (ext_type == SADB_X_EXT_SEC_CTX) {
+				if (verify_sec_ctx_len(p))
+					return -EINVAL;
+			}
+			ext_hdrs[ext_type-1] = (void *) p;
+		}
+		p   += ext_len;
+		len -= ext_len;
+	}
+
+	return 0;
+}
+
+/*
+ * Translate a PF_KEY SA type into the corresponding protocol number.
+ * Returns 0 for SA types that have no mapping.
+ */
+static uint16_t
+pfkey_satype2proto(uint8_t satype)
+{
+	uint16_t proto = 0;
+
+	if (satype == SADB_SATYPE_UNSPEC)
+		proto = IPSEC_PROTO_ANY;
+	else if (satype == SADB_SATYPE_AH)
+		proto = IPPROTO_AH;
+	else if (satype == SADB_SATYPE_ESP)
+		proto = IPPROTO_ESP;
+	else if (satype == SADB_X_SATYPE_IPCOMP)
+		proto = IPPROTO_COMP;
+
+	return proto;
+}
+
+/*
+ * Translate a protocol number into the corresponding PF_KEY SA type.
+ * Returns 0 for protocols other than AH, ESP and IPcomp.
+ */
+static uint8_t
+pfkey_proto2satype(uint16_t proto)
+{
+	uint8_t satype = 0;
+
+	if (proto == IPPROTO_AH)
+		satype = SADB_SATYPE_AH;
+	else if (proto == IPPROTO_ESP)
+		satype = SADB_SATYPE_ESP;
+	else if (proto == IPPROTO_COMP)
+		satype = SADB_X_SATYPE_IPCOMP;
+
+	return satype;
+}
+
+/* BTW, this scheme means that there is no way with PFKEY2 sockets to
+ * say specifically 'just raw sockets' as we encode them as 255.
+ */
+
+/* Map IPSEC_PROTO_ANY to 0; every other protocol value passes through. */
+static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
+{
+	if (proto == IPSEC_PROTO_ANY)
+		return 0;
+
+	return proto;
+}
+
+/* Map protocol 0 to IPSEC_PROTO_ANY; every other value passes through. */
+static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
+{
+	if (proto == 0)
+		return IPSEC_PROTO_ANY;
+
+	return proto;
+}
+
+/*
+ * Size of the sockaddr structure used for @family, or 0 when the family
+ * is neither AF_INET nor (with IPv6 configured) AF_INET6.
+ */
+static inline int pfkey_sockaddr_len(sa_family_t family)
+{
+	if (family == AF_INET)
+		return sizeof(struct sockaddr_in);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (family == AF_INET6)
+		return sizeof(struct sockaddr_in6);
+#endif
+	return 0;
+}
+
+/*
+ * Copy the network address held in @sa into @xaddr.
+ * Returns the address family on success, or 0 (leaving @xaddr untouched)
+ * when the family is not handled.
+ */
+static
+int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr)
+{
+	if (sa->sa_family == AF_INET) {
+		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+
+		xaddr->a4 = sin->sin_addr.s_addr;
+		return AF_INET;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (sa->sa_family == AF_INET6) {
+		const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
+
+		memcpy(xaddr->a6, &sin6->sin6_addr, sizeof(struct in6_addr));
+		return AF_INET6;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Extract the address stored behind address extension header @addr into
+ * @xaddr.  Returns what pfkey_sockaddr_extract() returns: the address
+ * family, or 0 when it is not handled.
+ */
+static
+int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr)
+{
+	struct sockaddr *sa = (struct sockaddr *)(addr + 1);
+
+	return pfkey_sockaddr_extract(sa, xaddr);
+}
+
+/*
+ * Look up the state addressed by a parsed message: SPI from the SA
+ * extension, protocol from the SA type in @hdr, and address plus family
+ * from the destination address extension.
+ *
+ * Returns whatever xfrm_state_lookup() returns, or NULL when the SA or
+ * destination extension is missing, the SA type has no protocol mapping,
+ * or the address family is not handled.
+ */
+static struct  xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	const struct sadb_sa *sa = ext_hdrs[SADB_EXT_SA-1];
+	const struct sadb_address *dst;
+	const struct sockaddr *sockaddr;
+	xfrm_address_t *xaddr;
+	unsigned short family;
+	uint16_t proto;
+
+	if (!sa)
+		return NULL;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (!proto)
+		return NULL;
+
+	/* sadb_address_len should be checked by caller */
+	dst = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+	if (!dst)
+		return NULL;
+
+	sockaddr = (const struct sockaddr *)(dst + 1);
+	family = sockaddr->sa_family;
+	if (family == AF_INET)
+		xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)sockaddr)->sin_addr;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else if (family == AF_INET6)
+		xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)sockaddr)->sin6_addr;
+#endif
+	else
+		return NULL;
+
+	return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family);
+}
+
+/* Round (a) up to the next multiple of 8; 0 stays 0. */
+#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1)))
+
+/*
+ * Size of the sockaddr for @family padded to a multiple of 8 bytes, or 0
+ * when the family is not handled.
+ */
+static int
+pfkey_sockaddr_size(sa_family_t family)
+{
+	int raw_len = pfkey_sockaddr_len(family);
+
+	return PFKEY_ALIGN8(raw_len);
+}
+
+/*
+ * Translate an XFRM_MODE_* value into the matching IPSEC_MODE_* value.
+ * Returns -1 for modes other than transport, tunnel and BEET.
+ */
+static inline int pfkey_mode_from_xfrm(int mode)
+{
+	if (mode == XFRM_MODE_TRANSPORT)
+		return IPSEC_MODE_TRANSPORT;
+	if (mode == XFRM_MODE_TUNNEL)
+		return IPSEC_MODE_TUNNEL;
+	if (mode == XFRM_MODE_BEET)
+		return IPSEC_MODE_BEET;
+
+	return -1;
+}
+
+/*
+ * Translate an IPSEC_MODE_* value into the matching XFRM_MODE_* value.
+ * IPSEC_MODE_ANY is treated like transport mode.  Returns -1 for any
+ * other unlisted mode.
+ */
+static inline int pfkey_mode_to_xfrm(int mode)
+{
+	if (mode == IPSEC_MODE_ANY ||	/*XXX*/
+	    mode == IPSEC_MODE_TRANSPORT)
+		return XFRM_MODE_TRANSPORT;
+	if (mode == IPSEC_MODE_TUNNEL)
+		return XFRM_MODE_TUNNEL;
+	if (mode == IPSEC_MODE_BEET)
+		return XFRM_MODE_BEET;
+
+	return -1;
+}
+
+/*
+ * Fill the sockaddr at @sa from address @xaddr and @port for @family.
+ *
+ * Returns the full prefix length of the family (32 for AF_INET, 128 for
+ * AF_INET6), or 0 without touching @sa when the family is not handled.
+ */
+static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port,
+					struct sockaddr *sa,
+					unsigned short family)
+{
+	if (family == AF_INET) {
+		struct sockaddr_in *v4 = (struct sockaddr_in *)sa;
+
+		v4->sin_family = AF_INET;
+		v4->sin_port = port;
+		v4->sin_addr.s_addr = xaddr->a4;
+		memset(v4->sin_zero, 0, sizeof(v4->sin_zero));
+		return 32;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (family == AF_INET6) {
+		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sa;
+
+		v6->sin6_family = AF_INET6;
+		v6->sin6_port = port;
+		v6->sin6_flowinfo = 0;
+		ipv6_addr_copy(&v6->sin6_addr, (const struct in6_addr *)xaddr->a6);
+		v6->sin6_scope_id = 0;
+		return 128;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Build a PF_KEY message describing state @x.
+ *
+ * @add_keys: when non-zero, authentication and encryption key extensions
+ *            are included for algorithms that have a non-zero key length.
+ * @hsc:      bit mask selecting lifetime extensions: value 2 adds the hard
+ *            lifetime, value 1 the soft lifetime.  The current lifetime is
+ *            always included.
+ *
+ * Extensions are appended in this order: SA, [hard lifetime],
+ * [soft lifetime], current lifetime, source address, destination address,
+ * [proxy address], [auth key], [encrypt key], SA2, [NAT-T type, source
+ * port, destination port], [security context].  The total size is computed
+ * up front, so every condition used while sizing must match the condition
+ * used when the corresponding extension is appended.
+ *
+ * Of the base header only sadb_msg_len is set; the remaining header fields
+ * are left zeroed for the caller to fill in.
+ *
+ * Returns the new skb, ERR_PTR(-EINVAL) when the address family or the
+ * mode of @x cannot be expressed, or ERR_PTR(-ENOBUFS) when the skb cannot
+ * be allocated.
+ */
+static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
+					      int add_keys, int hsc)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	struct sadb_sa *sa;
+	struct sadb_lifetime *lifetime;
+	struct sadb_address *addr;
+	struct sadb_key *key;
+	struct sadb_x_sa2 *sa2;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
+	int ctx_size = 0;
+	int size;
+	int auth_key_size = 0;
+	int encrypt_key_size = 0;
+	int sockaddr_size;
+	struct xfrm_encap_tmpl *natt = NULL;
+	int mode;
+
+	/* address family check */
+	sockaddr_size = pfkey_sockaddr_size(x->props.family);
+	if (!sockaddr_size)
+		return ERR_PTR(-EINVAL);
+
+	/* base, SA, (lifetime (HSC),) address(SD), (address(P),)
+	   key(AE), (identity(SD),) (sensitivity)> */
+	size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) +
+		sizeof(struct sadb_lifetime) +
+		((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) +
+		((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) +
+			sizeof(struct sadb_address)*2 +
+				sockaddr_size*2 +
+					sizeof(struct sadb_x_sa2);
+
+	/* room for the security context, padded to a multiple of 8 bytes */
+	if ((xfrm_ctx = x->security)) {
+		ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+		size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+	}
+
+	/* identity & sensitivity */
+	/* room for the proxy address, added when selector and SA source differ */
+	if (xfrm_addr_cmp(&x->sel.saddr, &x->props.saddr, x->props.family))
+		size += sizeof(struct sadb_address) + sockaddr_size;
+
+	/* key sizes: bits rounded up to bytes, then padded to 8 bytes */
+	if (add_keys) {
+		if (x->aalg && x->aalg->alg_key_len) {
+			auth_key_size =
+				PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8);
+			size += sizeof(struct sadb_key) + auth_key_size;
+		}
+		if (x->ealg && x->ealg->alg_key_len) {
+			encrypt_key_size =
+				PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8);
+			size += sizeof(struct sadb_key) + encrypt_key_size;
+		}
+	}
+	if (x->encap)
+		natt = x->encap;
+
+	if (natt && natt->encap_type) {
+		size += sizeof(struct sadb_x_nat_t_type);
+		size += sizeof(struct sadb_x_nat_t_port);
+		size += sizeof(struct sadb_x_nat_t_port);
+	}
+
+	skb =  alloc_skb(size + 16, GFP_ATOMIC);
+	if (skb == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	/* call should fill header later */
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	/* Clears the whole message area (size bytes), not just the header
+	 * reserved so far; the skb was allocated with size + 16 bytes.
+	 */
+	memset(hdr, 0, size);	/* XXX do we need this ? */
+	hdr->sadb_msg_len = size / sizeof(uint64_t);
+
+	/* sa */
+	sa = (struct sadb_sa *)  skb_put(skb, sizeof(struct sadb_sa));
+	sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
+	sa->sadb_sa_exttype = SADB_EXT_SA;
+	sa->sadb_sa_spi = x->id.spi;
+	sa->sadb_sa_replay = x->props.replay_window;
+	switch (x->km.state) {
+	case XFRM_STATE_VALID:
+		sa->sadb_sa_state = x->km.dying ?
+			SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
+		break;
+	case XFRM_STATE_ACQ:
+		sa->sadb_sa_state = SADB_SASTATE_LARVAL;
+		break;
+	default:
+		sa->sadb_sa_state = SADB_SASTATE_DEAD;
+		break;
+	}
+	/* algorithm ids are looked up by name; 0 when the name is unknown */
+	sa->sadb_sa_auth = 0;
+	if (x->aalg) {
+		struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+		sa->sadb_sa_auth = a ? a->desc.sadb_alg_id : 0;
+	}
+	sa->sadb_sa_encrypt = 0;
+	BUG_ON(x->ealg && x->calg);
+	if (x->ealg) {
+		struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0);
+		sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0;
+	}
+	/* KAME compatible: sadb_sa_encrypt is overloaded with calg id */
+	if (x->calg) {
+		struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0);
+		sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0;
+	}
+
+	sa->sadb_sa_flags = 0;
+	if (x->props.flags & XFRM_STATE_NOECN)
+		sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
+	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+		sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
+	if (x->props.flags & XFRM_STATE_NOPMTUDISC)
+		sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
+
+	/* hard time */
+	if (hsc & 2) {
+		lifetime = (struct sadb_lifetime *)  skb_put(skb,
+							     sizeof(struct sadb_lifetime));
+		lifetime->sadb_lifetime_len =
+			sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+		lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
+		lifetime->sadb_lifetime_allocations =  _X2KEY(x->lft.hard_packet_limit);
+		lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit);
+		lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds;
+		lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds;
+	}
+	/* soft time */
+	if (hsc & 1) {
+		lifetime = (struct sadb_lifetime *)  skb_put(skb,
+							     sizeof(struct sadb_lifetime));
+		lifetime->sadb_lifetime_len =
+			sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+		lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
+		lifetime->sadb_lifetime_allocations =  _X2KEY(x->lft.soft_packet_limit);
+		lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit);
+		lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds;
+		lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds;
+	}
+	/* current time */
+	lifetime = (struct sadb_lifetime *)  skb_put(skb,
+						     sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
+	lifetime->sadb_lifetime_allocations = x->curlft.packets;
+	lifetime->sadb_lifetime_bytes = x->curlft.bytes;
+	lifetime->sadb_lifetime_addtime = x->curlft.add_time;
+	lifetime->sadb_lifetime_usetime = x->curlft.use_time;
+	/* src address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+	/* "if the ports are non-zero, then the sadb_address_proto field,
+	   normally zero, MUST be filled in with the transport
+	   protocol's number." - RFC2367 */
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+
+	/* pfkey_sockaddr_fill() returns 0 only for an unhandled family,
+	 * which the sockaddr_size check at the top has already excluded.
+	 */
+	addr->sadb_address_prefixlen =
+		pfkey_sockaddr_fill(&x->props.saddr, 0,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	if (!addr->sadb_address_prefixlen)
+		BUG();
+
+	/* dst address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+
+	addr->sadb_address_prefixlen =
+		pfkey_sockaddr_fill(&x->id.daddr, 0,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	if (!addr->sadb_address_prefixlen)
+		BUG();
+
+	/* proxy address: same condition as used when sizing the message */
+	if (xfrm_addr_cmp(&x->sel.saddr, &x->props.saddr,
+			  x->props.family)) {
+		addr = (struct sadb_address*) skb_put(skb,
+			sizeof(struct sadb_address)+sockaddr_size);
+		addr->sadb_address_len =
+			(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+		addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY;
+		addr->sadb_address_proto =
+			pfkey_proto_from_xfrm(x->sel.proto);
+		addr->sadb_address_prefixlen = x->sel.prefixlen_s;
+		addr->sadb_address_reserved = 0;
+
+		pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	}
+
+	/* auth key */
+	if (add_keys && auth_key_size) {
+		key = (struct sadb_key *) skb_put(skb,
+						  sizeof(struct sadb_key)+auth_key_size);
+		key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) /
+			sizeof(uint64_t);
+		key->sadb_key_exttype = SADB_EXT_KEY_AUTH;
+		key->sadb_key_bits = x->aalg->alg_key_len;
+		key->sadb_key_reserved = 0;
+		memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8);
+	}
+	/* encrypt key */
+	if (add_keys && encrypt_key_size) {
+		key = (struct sadb_key *) skb_put(skb,
+						  sizeof(struct sadb_key)+encrypt_key_size);
+		key->sadb_key_len = (sizeof(struct sadb_key) +
+				     encrypt_key_size) / sizeof(uint64_t);
+		key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
+		key->sadb_key_bits = x->ealg->alg_key_len;
+		key->sadb_key_reserved = 0;
+		memcpy(key + 1, x->ealg->alg_key,
+		       (x->ealg->alg_key_len+7)/8);
+	}
+
+	/* sa */
+	sa2 = (struct sadb_x_sa2 *)  skb_put(skb, sizeof(struct sadb_x_sa2));
+	sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t);
+	sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
+	/* a mode without a PF_KEY equivalent aborts the whole message */
+	if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) {
+		kfree_skb(skb);
+		return ERR_PTR(-EINVAL);
+	}
+	sa2->sadb_x_sa2_mode = mode;
+	sa2->sadb_x_sa2_reserved1 = 0;
+	sa2->sadb_x_sa2_reserved2 = 0;
+	sa2->sadb_x_sa2_sequence = 0;
+	sa2->sadb_x_sa2_reqid = x->props.reqid;
+
+	if (natt && natt->encap_type) {
+		struct sadb_x_nat_t_type *n_type;
+		struct sadb_x_nat_t_port *n_port;
+
+		/* type */
+		n_type = (struct sadb_x_nat_t_type*) skb_put(skb, sizeof(*n_type));
+		n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t);
+		n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE;
+		n_type->sadb_x_nat_t_type_type = natt->encap_type;
+		n_type->sadb_x_nat_t_type_reserved[0] = 0;
+		n_type->sadb_x_nat_t_type_reserved[1] = 0;
+		n_type->sadb_x_nat_t_type_reserved[2] = 0;
+
+		/* source port */
+		n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+		n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+		n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
+		n_port->sadb_x_nat_t_port_port = natt->encap_sport;
+		n_port->sadb_x_nat_t_port_reserved = 0;
+
+		/* dest port */
+		n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+		n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+		n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
+		n_port->sadb_x_nat_t_port_port = natt->encap_dport;
+		n_port->sadb_x_nat_t_port_reserved = 0;
+	}
+
+	/* security context */
+	if (xfrm_ctx) {
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+				sizeof(struct sadb_x_sec_ctx) + ctx_size);
+		sec_ctx->sadb_x_sec_len =
+		  (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
+	return skb;
+}
+
+
+/*
+ * Build the full message for state @x: keys included, hard and soft
+ * lifetimes included (hsc == 3).
+ */
+static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x)
+{
+	return __pfkey_xfrm_state2msg(x, 1, 3);
+}
+
+/*
+ * Build the message for state @x without key extensions; @hsc selects the
+ * lifetime extensions as in __pfkey_xfrm_state2msg().
+ */
+static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x,
+							  int hsc)
+{
+	struct sk_buff *msg = __pfkey_xfrm_state2msg(x, 0, hsc);
+
+	return msg;
+}
+
+/*
+ * Check that key extension @key is long enough to hold its own header
+ * plus the sadb_key_bits of key material it announces.  sadb_key_len is
+ * expressed in 64-bit words.  Returns non-zero when the key fits.
+ */
+static inline int pfkey_key_ext_fits(const struct sadb_key *key)
+{
+	unsigned int words;
+
+	words = DIV_ROUND_UP(sizeof(struct sadb_key) +
+			     DIV_ROUND_UP(key->sadb_key_bits, 8),
+			     sizeof(uint64_t));
+	return words <= key->sadb_key_len;
+}
+
+/*
+ * Create a new xfrm state from a parsed message.
+ *
+ * @net:      namespace the state is allocated in
+ * @hdr:      base header; supplies the SA type and sequence number
+ * @ext_hdrs: extension pointers as filled in by parse_exthdrs()
+ *
+ * Returns the initialised state, or an ERR_PTR():
+ *  -EINVAL       missing/inconsistent extensions, unknown SA type,
+ *                algorithm id out of range, bad key length, bad mode
+ *  -ENOBUFS      an allocation failed
+ *  -ENOSYS       the requested algorithm is not known
+ *  -EAFNOSUPPORT the source address family is not handled
+ * or whatever security_xfrm_state_alloc() / xfrm_init_state() report.
+ *
+ * Every failure after the state has been allocated sets err explicitly:
+ * err is 0 once a security context was accepted, and ERR_PTR(0) would
+ * look like a valid (NULL) state to the caller.
+ */
+static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
+						const struct sadb_msg *hdr,
+						void * const *ext_hdrs)
+{
+	struct xfrm_state *x;
+	const struct sadb_lifetime *lifetime;
+	const struct sadb_sa *sa;
+	const struct sadb_key *key;
+	const struct sadb_x_sec_ctx *sec_ctx;
+	uint16_t proto;
+	int err;
+
+
+	sa = (const struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1];
+	if (!sa ||
+	    !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return ERR_PTR(-EINVAL);
+	if (hdr->sadb_msg_satype == SADB_SATYPE_ESP &&
+	    !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1])
+		return ERR_PTR(-EINVAL);
+	if (hdr->sadb_msg_satype == SADB_SATYPE_AH &&
+	    !ext_hdrs[SADB_EXT_KEY_AUTH-1])
+		return ERR_PTR(-EINVAL);
+	/* hard and soft lifetime must be given together or not at all */
+	if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] !=
+	    !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1])
+		return ERR_PTR(-EINVAL);
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (proto == 0)
+		return ERR_PTR(-EINVAL);
+
+	/* RFC2367:
+
+   Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message.
+   SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not
+   sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state.
+   Therefore, the sadb_sa_state field of all submitted SAs MUST be
+   SADB_SASTATE_MATURE and the kernel MUST return an error if this is
+   not true.
+
+	   However, KAME setkey always uses SADB_SASTATE_LARVAL.
+	   Hence, we have to _ignore_ sadb_sa_state, which is also reasonable.
+	 */
+	if (sa->sadb_sa_auth > SADB_AALG_MAX ||
+	    (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP &&
+	     sa->sadb_sa_encrypt > SADB_X_CALG_MAX) ||
+	    sa->sadb_sa_encrypt > SADB_EALG_MAX)
+		return ERR_PTR(-EINVAL);
+
+	/* A key extension must always be large enough for the key bits it
+	 * announces, because the key material is copied out below whenever
+	 * an algorithm id is set - the NULL algorithms included.  Only the
+	 * "key must not be empty" rule is waived for the NULL algorithms.
+	 */
+	key = (const struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
+	if (key != NULL &&
+	    ((sa->sadb_sa_auth != SADB_X_AALG_NULL &&
+	      key->sadb_key_bits == 0) ||
+	     !pfkey_key_ext_fits(key)))
+		return ERR_PTR(-EINVAL);
+	key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1];
+	if (key != NULL &&
+	    ((sa->sadb_sa_encrypt != SADB_EALG_NULL &&
+	      key->sadb_key_bits == 0) ||
+	     !pfkey_key_ext_fits(key)))
+		return ERR_PTR(-EINVAL);
+
+	x = xfrm_state_alloc(net);
+	if (x == NULL)
+		return ERR_PTR(-ENOBUFS);
+
+	x->id.proto = proto;
+	x->id.spi = sa->sadb_sa_spi;
+	x->props.replay_window = sa->sadb_sa_replay;
+	if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN)
+		x->props.flags |= XFRM_STATE_NOECN;
+	if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
+		x->props.flags |= XFRM_STATE_DECAP_DSCP;
+	if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
+		x->props.flags |= XFRM_STATE_NOPMTUDISC;
+
+	lifetime = (const struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
+	if (lifetime != NULL) {
+		x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+		x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+		x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+		x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+	}
+	lifetime = (const struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1];
+	if (lifetime != NULL) {
+		x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+		x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+		x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+		x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+	}
+
+	sec_ctx = (const struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx) {
+			err = -ENOBUFS;
+			goto out;
+		}
+
+		err = security_xfrm_state_alloc(x, uctx);
+		kfree(uctx);
+
+		if (err)
+			goto out;
+	}
+
+	key = (const struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
+	if (sa->sadb_sa_auth) {
+		int keysize = 0;
+		struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth);
+		if (!a) {
+			err = -ENOSYS;
+			goto out;
+		}
+		if (key)
+			keysize = (key->sadb_key_bits + 7) / 8;
+		x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL);
+		if (!x->aalg) {
+			err = -ENOBUFS;
+			goto out;
+		}
+		strcpy(x->aalg->alg_name, a->name);
+		x->aalg->alg_key_len = 0;
+		if (key) {
+			x->aalg->alg_key_len = key->sadb_key_bits;
+			memcpy(x->aalg->alg_key, key+1, keysize);
+		}
+		x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits;
+		x->props.aalgo = sa->sadb_sa_auth;
+		/* x->algo.flags = sa->sadb_sa_flags; */
+	}
+	if (sa->sadb_sa_encrypt) {
+		/* for IPcomp, sadb_sa_encrypt carries the compression id */
+		if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) {
+			struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt);
+			if (!a) {
+				err = -ENOSYS;
+				goto out;
+			}
+			x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL);
+			if (!x->calg) {
+				err = -ENOBUFS;
+				goto out;
+			}
+			strcpy(x->calg->alg_name, a->name);
+			x->props.calgo = sa->sadb_sa_encrypt;
+		} else {
+			int keysize = 0;
+			struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt);
+			if (!a) {
+				err = -ENOSYS;
+				goto out;
+			}
+			key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1];
+			if (key)
+				keysize = (key->sadb_key_bits + 7) / 8;
+			x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL);
+			if (!x->ealg) {
+				err = -ENOBUFS;
+				goto out;
+			}
+			strcpy(x->ealg->alg_name, a->name);
+			x->ealg->alg_key_len = 0;
+			if (key) {
+				x->ealg->alg_key_len = key->sadb_key_bits;
+				memcpy(x->ealg->alg_key, key+1, keysize);
+			}
+			x->props.ealgo = sa->sadb_sa_encrypt;
+		}
+	}
+	/* x->algo.flags = sa->sadb_sa_flags; */
+
+	x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+						    &x->props.saddr);
+	if (!x->props.family) {
+		err = -EAFNOSUPPORT;
+		goto out;
+	}
+	pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1],
+				  &x->id.daddr);
+
+	if (ext_hdrs[SADB_X_EXT_SA2-1]) {
+		const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1];
+		int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode);
+		if (mode < 0) {
+			err = -EINVAL;
+			goto out;
+		}
+		x->props.mode = mode;
+		x->props.reqid = sa2->sadb_x_sa2_reqid;
+	}
+
+	if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) {
+		const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1];
+
+		/* Nobody uses this, but we try. */
+		x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr);
+		x->sel.prefixlen_s = addr->sadb_address_prefixlen;
+	}
+
+	if (!x->sel.family)
+		x->sel.family = x->props.family;
+
+	if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) {
+		const struct sadb_x_nat_t_type* n_type;
+		struct xfrm_encap_tmpl *natt;
+
+		/* Zeroed allocation: the ports stay 0 when the port
+		 * extensions are absent, and encap_oa starts out cleared.
+		 */
+		x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL);
+		if (!x->encap) {
+			err = -ENOBUFS;
+			goto out;
+		}
+
+		natt = x->encap;
+		n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1];
+		natt->encap_type = n_type->sadb_x_nat_t_type_type;
+
+		if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) {
+			const struct sadb_x_nat_t_port *n_port =
+				ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1];
+			natt->encap_sport = n_port->sadb_x_nat_t_port_port;
+		}
+		if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) {
+			const struct sadb_x_nat_t_port *n_port =
+				ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1];
+			natt->encap_dport = n_port->sadb_x_nat_t_port_port;
+		}
+	}
+
+	err = xfrm_init_state(x);
+	if (err)
+		goto out;
+
+	x->km.seq = hdr->sadb_msg_seq;
+	return x;
+
+out:
+	x->km.state = XFRM_STATE_DEAD;
+	xfrm_state_put(x);
+	return ERR_PTR(err);
+}
+
+/* Message handler that ignores all of its arguments and always returns -EOPNOTSUPP. */
+static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * SADB_GETSPI handler: find the matching acquire state, allocate an SPI
+ * for it and send the resulting SA description back to @sk.
+ *
+ * The state is looked up by the request's sequence number first (and
+ * dropped again if its destination address differs from the request),
+ * then through xfrm_find_acq().  The SPI is chosen from the range in the
+ * SPI range extension, or from 0x100..0x0fffffff when none is given.
+ *
+ * Returns 0 after the reply has been handed to pfkey_broadcast(), or
+ *  -EINVAL  source/destination address missing or of different or
+ *           unhandled family, unknown SA type, bad mode, or an SPI range
+ *           whose minimum exceeds its maximum
+ *  -ENOENT  no acquire state was found
+ * or the error reported by xfrm_alloc_spi() or by message construction.
+ */
+static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct sk_buff *resp_skb;
+	struct sadb_x_sa2 *sa2;
+	struct sadb_address *saddr, *daddr;
+	struct sadb_msg *out_hdr;
+	struct sadb_spirange *range;
+	struct xfrm_state *x = NULL;
+	int mode;
+	int err;
+	u32 min_spi, max_spi;
+	u32 reqid;
+	u8 proto;
+	unsigned short family;
+	xfrm_address_t *xsaddr = NULL, *xdaddr = NULL;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return -EINVAL;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (proto == 0)
+		return -EINVAL;
+
+	if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) {
+		mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode);
+		if (mode < 0)
+			return -EINVAL;
+		reqid = sa2->sadb_x_sa2_reqid;
+	} else {
+		mode = 0;
+		reqid = 0;
+	}
+
+	saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1];
+	daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1];
+
+	/* present_and_same_family() has already restricted the family to
+	 * the cases handled here, so both pointers get set.
+	 */
+	family = ((struct sockaddr *)(saddr + 1))->sa_family;
+	switch (family) {
+	case AF_INET:
+		xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr;
+		xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr;
+		xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr;
+		break;
+#endif
+	}
+
+	if (hdr->sadb_msg_seq) {
+		x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+		if (x && xfrm_addr_cmp(&x->id.daddr, xdaddr, family)) {
+			xfrm_state_put(x);
+			x = NULL;
+		}
+	}
+
+	if (!x)
+		x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family);
+
+	if (x == NULL)
+		return -ENOENT;
+
+	min_spi = 0x100;
+	max_spi = 0x0fffffff;
+
+	range = ext_hdrs[SADB_EXT_SPIRANGE-1];
+	if (range) {
+		min_spi = range->sadb_spirange_min;
+		max_spi = range->sadb_spirange_max;
+	}
+
+	/* An inverted range describes no SPI at all; refuse it instead of
+	 * handing it to the allocator.
+	 */
+	if (min_spi > max_spi) {
+		xfrm_state_put(x);
+		return -EINVAL;
+	}
+
+	err = xfrm_alloc_spi(x, min_spi, max_spi);
+	resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x);
+
+	if (IS_ERR(resp_skb)) {
+		xfrm_state_put(x);
+		return  PTR_ERR(resp_skb);
+	}
+
+	out_hdr = (struct sadb_msg *) resp_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = SADB_GETSPI;
+	out_hdr->sadb_msg_satype = pfkey_proto2satype(proto);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+
+	xfrm_state_put(x);
+
+	pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net);
+
+	return 0;
+}
+
+/*
+ * SADB_ACQUIRE handler.
+ *
+ * Only a bare header is accepted; anything longer yields -EOPNOTSUPP.
+ * When the message carries both a sequence number and an errno, the state
+ * found for that sequence number - if it is still in XFRM_STATE_ACQ - is
+ * switched to XFRM_STATE_ERROR and the waiters on km_waitq are woken.
+ * All of these cases return 0.
+ */
+static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct xfrm_state *x;
+
+	if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8)
+		return -EOPNOTSUPP;
+
+	if (!hdr->sadb_msg_seq || !hdr->sadb_msg_errno)
+		return 0;
+
+	x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+	if (x != NULL) {
+		spin_lock_bh(&x->lock);
+		if (x->km.state == XFRM_STATE_ACQ) {
+			x->km.state = XFRM_STATE_ERROR;
+			wake_up(&net->xfrm.km_waitq);
+		}
+		spin_unlock_bh(&x->lock);
+		xfrm_state_put(x);
+	}
+
+	return 0;
+}
+
+/*
+ * Translate an XFRM policy event into the matching SADB_X_SPD* message
+ * type.  Events without a mapping (XFRM_MSG_POLEXPIRE included) are logged
+ * and yield 0.
+ */
+static inline int event2poltype(int event)
+{
+	if (event == XFRM_MSG_DELPOLICY)
+		return SADB_X_SPDDELETE;
+	if (event == XFRM_MSG_NEWPOLICY)
+		return SADB_X_SPDADD;
+	if (event == XFRM_MSG_UPDPOLICY)
+		return SADB_X_SPDUPDATE;
+
+	pr_err("pfkey: Unknown policy event %d\n", event);
+	return 0;
+}
+
+/*
+ * Translate an XFRM SA event into the matching SADB message type.
+ * Unrecognised events are logged and yield 0.
+ */
+static inline int event2keytype(int event)
+{
+	int type = 0;
+
+	if (event == XFRM_MSG_DELSA)
+		type = SADB_DELETE;
+	else if (event == XFRM_MSG_NEWSA)
+		type = SADB_ADD;
+	else if (event == XFRM_MSG_UPDSA)
+		type = SADB_UPDATE;
+	else if (event == XFRM_MSG_EXPIRE)
+		type = SADB_EXPIRE;
+	else
+		pr_err("pfkey: Unknown SA event %d\n", event);
+
+	return type;
+}
+
+/*
+ * SA notification (ADD/UPD/DEL): serialise state @x, stamp the header with
+ * the message type derived from @c->event plus @c's seq/pid, and broadcast
+ * it to every PF_KEY socket in the state's namespace.
+ *
+ * Returns 0, or the error from pfkey_xfrm_state2msg().
+ */
+static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
+{
+	struct sk_buff *out_skb = pfkey_xfrm_state2msg(x);
+	struct sadb_msg *out_hdr;
+
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+
+	out_hdr->sadb_msg_version  = PF_KEY_V2;
+	out_hdr->sadb_msg_type     = event2keytype(c->event);
+	out_hdr->sadb_msg_satype   = pfkey_proto2satype(x->id.proto);
+	out_hdr->sadb_msg_errno    = 0;
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_seq      = c->seq;
+	out_hdr->sadb_msg_pid      = c->pid;
+
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x));
+	return 0;
+}
+
+/*
+ * SADB_ADD / SADB_UPDATE handler: build an xfrm_state from the message,
+ * insert (ADD) or update (any other type) it, audit the outcome and, on
+ * success, notify key managers with NEWSA/UPDSA.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	const bool is_add = (hdr->sadb_msg_type == SADB_ADD);
+	struct net *net = sock_net(sk);
+	struct km_event c;
+	struct xfrm_state *x;
+	int err;
+
+	x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs);
+	if (IS_ERR(x))
+		return PTR_ERR(x);
+
+	/* extra reference held across insert/notify, dropped at the end */
+	xfrm_state_hold(x);
+	err = is_add ? xfrm_state_add(x) : xfrm_state_update(x);
+
+	xfrm_audit_state_add(x, err ? 0 : 1,
+			     audit_get_loginuid(current),
+			     audit_get_sessionid(current), 0);
+
+	if (err >= 0) {
+		c.event = is_add ? XFRM_MSG_NEWSA : XFRM_MSG_UPDSA;
+		c.seq = hdr->sadb_msg_seq;
+		c.pid = hdr->sadb_msg_pid;
+		km_state_notify(x, &c);
+	} else {
+		/* insertion failed: mark dead and drop one reference */
+		x->km.state = XFRM_STATE_DEAD;
+		__xfrm_state_put(x);
+	}
+
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * SADB_DELETE handler: look up the SA named by the SA and address
+ * extensions, check permission, delete it and notify key managers.
+ * The attempt is audited whether or not it succeeds.
+ *
+ * Returns 0 on success, -EINVAL for missing/mismatched extensions, -ESRCH
+ * if no such SA exists, -EPERM if xfrm_state_kern() reports the state as
+ * kernel-owned, or the error from the security hook / xfrm_state_delete().
+ */
+static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct km_event c;
+	struct xfrm_state *x;
+	int err;
+
+	if (!ext_hdrs[SADB_EXT_SA-1])
+		return -EINVAL;
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return -EINVAL;
+
+	x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs);
+	if (!x)
+		return -ESRCH;
+
+	err = security_xfrm_state_delete(x);
+	if (err)
+		goto audit;
+
+	if (xfrm_state_kern(x)) {
+		err = -EPERM;
+		goto audit;
+	}
+
+	err = xfrm_state_delete(x);
+	if (err >= 0) {
+		c.event = XFRM_MSG_DELSA;
+		c.seq = hdr->sadb_msg_seq;
+		c.pid = hdr->sadb_msg_pid;
+		km_state_notify(x, &c);
+	}
+
+audit:
+	xfrm_audit_state_delete(x, err ? 0 : 1,
+				audit_get_loginuid(current),
+				audit_get_sessionid(current), 0);
+	xfrm_state_put(x);
+
+	return err;
+}
+
+/*
+ * SADB_GET handler: look up the SA named by the SA and address extensions
+ * and send its description back to the requesting socket only.
+ *
+ * Returns 0 on success, -EINVAL for missing/mismatched extensions, -ESRCH
+ * if no such SA exists, or the error from pfkey_xfrm_state2msg().
+ */
+static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct xfrm_state *x;
+	struct sk_buff *reply;
+	struct sadb_msg *rhdr;
+	__u8 proto;
+
+	if (!ext_hdrs[SADB_EXT_SA-1])
+		return -EINVAL;
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]))
+		return -EINVAL;
+
+	x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs);
+	if (!x)
+		return -ESRCH;
+
+	/* grab everything needed from the state before releasing it */
+	reply = pfkey_xfrm_state2msg(x);
+	proto = x->id.proto;
+	xfrm_state_put(x);
+
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	rhdr = (struct sadb_msg *) reply->data;
+	rhdr->sadb_msg_version  = hdr->sadb_msg_version;
+	rhdr->sadb_msg_type     = SADB_GET;
+	rhdr->sadb_msg_satype   = pfkey_proto2satype(proto);
+	rhdr->sadb_msg_errno    = 0;
+	rhdr->sadb_msg_reserved = 0;
+	rhdr->sadb_msg_seq      = hdr->sadb_msg_seq;
+	rhdr->sadb_msg_pid      = hdr->sadb_msg_pid;
+
+	pfkey_broadcast(reply, GFP_ATOMIC, BROADCAST_ONE, sk, net);
+	return 0;
+}
+
+/*
+ * Build the reply to a register request: a copy of @orig's header followed
+ * by an SADB_EXT_SUPPORTED_AUTH and an SADB_EXT_SUPPORTED_ENCRYPT extension
+ * listing every algorithm currently flagged as available.  An extension is
+ * omitted when its count is zero.
+ *
+ * @orig:       request header to duplicate into the reply
+ * @allocation: gfp flags for the skb allocation
+ *
+ * Returns the new skb, or NULL if the allocation fails.
+ *
+ * Each extension area is zeroed before being filled so that the reserved
+ * field of struct sadb_supported, and any slot left unused because fewer
+ * algorithms are available now than when they were counted, never carries
+ * uninitialised heap data to user space.  The fill loops are also bounded
+ * by the reserved area so that they cannot write past it if more
+ * algorithms have become available since the count was taken.
+ */
+static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig,
+					      gfp_t allocation)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	int len, auth_len, enc_len, i;
+
+	auth_len = xfrm_count_auth_supported();
+	if (auth_len) {
+		auth_len *= sizeof(struct sadb_alg);
+		auth_len += sizeof(struct sadb_supported);
+	}
+
+	enc_len = xfrm_count_enc_supported();
+	if (enc_len) {
+		enc_len *= sizeof(struct sadb_alg);
+		enc_len += sizeof(struct sadb_supported);
+	}
+
+	len = enc_len + auth_len + sizeof(struct sadb_msg);
+
+	skb = alloc_skb(len + 16, allocation);
+	if (!skb)
+		return NULL;
+
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(*hdr));
+	pfkey_hdr_dup(hdr, orig);
+	hdr->sadb_msg_errno = 0;
+	hdr->sadb_msg_len = len / sizeof(uint64_t);
+
+	if (auth_len) {
+		struct sadb_supported *sp;
+		struct sadb_alg *ap, *end;
+
+		sp = (struct sadb_supported *) skb_put(skb, auth_len);
+		memset(sp, 0, auth_len);
+		ap = (struct sadb_alg *) (sp + 1);
+		end = (struct sadb_alg *) ((u8 *) sp + auth_len);
+
+		sp->sadb_supported_len = auth_len / sizeof(uint64_t);
+		sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
+
+		for (i = 0; ap < end; i++) {
+			struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i);
+			if (!aalg)
+				break;
+			if (aalg->available)
+				*ap++ = aalg->desc;
+		}
+	}
+
+	if (enc_len) {
+		struct sadb_supported *sp;
+		struct sadb_alg *ap, *end;
+
+		sp = (struct sadb_supported *) skb_put(skb, enc_len);
+		memset(sp, 0, enc_len);
+		ap = (struct sadb_alg *) (sp + 1);
+		end = (struct sadb_alg *) ((u8 *) sp + enc_len);
+
+		sp->sadb_supported_len = enc_len / sizeof(uint64_t);
+		sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
+
+		for (i = 0; ap < end; i++) {
+			struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i);
+			if (!ealg)
+				break;
+			if (ealg->available)
+				*ap++ = ealg->desc;
+		}
+	}
+
+	return skb;
+}
+
+/*
+ * SADB_REGISTER handler: record the SA type in the socket's registration
+ * bitmask (unless it is SADB_SATYPE_UNSPEC), probe the available
+ * algorithms and broadcast the supported-algorithm list to registered
+ * sockets.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range satype, -EEXIST if the
+ * socket already registered this satype, or -ENOBUFS if the reply cannot
+ * be allocated (in which case the registration bit is rolled back).
+ */
+static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct pfkey_sock *pfk = pfkey_sk(sk);
+	const bool typed = (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC);
+	struct sk_buff *reply;
+
+	if (hdr->sadb_msg_satype > SADB_SATYPE_MAX)
+		return -EINVAL;
+
+	if (typed) {
+		if (pfk->registered & (1<<hdr->sadb_msg_satype))
+			return -EEXIST;
+		pfk->registered |= (1<<hdr->sadb_msg_satype);
+	}
+
+	xfrm_probe_algs();
+
+	reply = compose_sadb_supported(hdr, GFP_KERNEL);
+	if (reply) {
+		pfkey_broadcast(reply, GFP_KERNEL, BROADCAST_REGISTERED, sk,
+				sock_net(sk));
+		return 0;
+	}
+
+	/* undo the registration made above */
+	if (typed)
+		pfk->registered &= ~(1<<hdr->sadb_msg_satype);
+
+	return -ENOBUFS;
+}
+
+/*
+ * Echo a header-only copy of @ihdr (errno cleared, length set to one bare
+ * header) back to @sk alone.
+ *
+ * Returns the result of pfkey_broadcast(), or -ENOBUFS if no skb could be
+ * allocated.
+ */
+static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr)
+{
+	struct sadb_msg *out;
+	struct sk_buff *reply;
+
+	reply = alloc_skb(sizeof(*out) + 16, GFP_ATOMIC);
+	if (reply == NULL)
+		return -ENOBUFS;
+
+	out = (struct sadb_msg *) skb_put(reply, sizeof(*out));
+	memcpy(out, ihdr, sizeof(*out));
+	out->sadb_msg_len = sizeof(*out) / sizeof(uint64_t);
+	out->sadb_msg_errno = (uint8_t) 0;
+
+	return pfkey_broadcast(reply, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
+}
+
+/*
+ * Broadcast a header-only SADB_FLUSH notification, built from @c, to every
+ * PF_KEY socket in @c->net.
+ *
+ * Returns 0, or -ENOBUFS if no skb could be allocated.
+ *
+ * The skb data is not zeroed by alloc_skb(), so every header field must be
+ * written explicitly; sadb_msg_reserved is cleared here so that no
+ * uninitialised heap bytes are sent to listeners.
+ */
+static int key_notify_sa_flush(const struct km_event *c)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+
+	skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+	if (!skb)
+		return -ENOBUFS;
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto);
+	hdr->sadb_msg_type = SADB_FLUSH;
+	hdr->sadb_msg_seq = c->seq;
+	hdr->sadb_msg_pid = c->pid;
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_errno = (uint8_t) 0;
+	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+	hdr->sadb_msg_reserved = 0;
+
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
+
+	return 0;
+}
+
+/*
+ * SADB_FLUSH handler: flush all states of the protocol selected by the
+ * message's satype, answer the requester, and (only if both steps
+ * succeeded) notify key managers with XFRM_MSG_FLUSHSA.
+ *
+ * -ESRCH from the flush (empty table) is not reported as an error; in that
+ * case the result of sending the reply is returned and no notification is
+ * generated.
+ */
+static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct xfrm_audit audit_info;
+	struct km_event c;
+	unsigned proto;
+	int flush_err, reply_err;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (!proto)
+		return -EINVAL;
+
+	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
+	audit_info.secid = 0;
+
+	flush_err = xfrm_state_flush(net, proto, &audit_info);
+	reply_err = unicast_flush_resp(sk, hdr);
+
+	if (flush_err == -ESRCH)	/* empty table - go quietly */
+		return reply_err;
+	if (flush_err)
+		return flush_err;
+	if (reply_err)
+		return reply_err;
+
+	c.event = XFRM_MSG_FLUSHSA;
+	c.data.proto = proto;
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	c.net = net;
+	km_state_notify(NULL, &c);
+
+	return 0;
+}
+
+/*
+ * xfrm_state_walk() callback for SA dumps.  @ptr is the dumping pfkey_sock.
+ *
+ * The message for @x is built and parked in pfk->dump.skb; the message
+ * parked by the previous call, if any, is sent to the socket first.
+ * @count + 1 is used as the sequence number.
+ *
+ * Returns 0, -ENOBUFS when pfkey_can_dump() refuses, or the error from
+ * pfkey_xfrm_state2msg().
+ */
+static int dump_sa(struct xfrm_state *x, int count, void *ptr)
+{
+	struct pfkey_sock *pfk = ptr;
+	struct sock *sk = &pfk->sk;
+	struct sk_buff *msg;
+	struct sadb_msg *mhdr;
+
+	if (!pfkey_can_dump(sk))
+		return -ENOBUFS;
+
+	msg = pfkey_xfrm_state2msg(x);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	mhdr = (struct sadb_msg *) msg->data;
+	mhdr->sadb_msg_version  = pfk->dump.msg_version;
+	mhdr->sadb_msg_type     = SADB_DUMP;
+	mhdr->sadb_msg_satype   = pfkey_proto2satype(x->id.proto);
+	mhdr->sadb_msg_errno    = 0;
+	mhdr->sadb_msg_reserved = 0;
+	mhdr->sadb_msg_seq      = count + 1;
+	mhdr->sadb_msg_pid      = pfk->dump.msg_pid;
+
+	/* flush the previously parked entry before parking this one */
+	if (pfk->dump.skb)
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+				sk, sock_net(sk));
+	pfk->dump.skb = msg;
+
+	return 0;
+}
+
+/* Dump step for SAs: walk the state table, feeding each entry to dump_sa(). */
+static int pfkey_dump_sa(struct pfkey_sock *pfk)
+{
+	return xfrm_state_walk(sock_net(&pfk->sk), &pfk->dump.u.state,
+			       dump_sa, (void *) pfk);
+}
+
+/* Dump completion hook for SAs: finish the state walk stored in @pfk. */
+static void pfkey_dump_sa_done(struct pfkey_sock *pfk)
+{
+	xfrm_state_walk_done(&pfk->dump.u.state);
+}
+
+/*
+ * SADB_DUMP handler: set up the socket's dump context for an SA walk of
+ * the requested protocol and start dumping.
+ *
+ * Returns -EBUSY if a dump is already registered on this socket, -EINVAL
+ * for an unknown satype, otherwise the result of pfkey_do_dump().
+ */
+static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct pfkey_sock *pfk = pfkey_sk(sk);
+	u8 proto;
+
+	if (pfk->dump.dump)
+		return -EBUSY;
+
+	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
+	if (!proto)
+		return -EINVAL;
+
+	pfk->dump.dump = pfkey_dump_sa;
+	pfk->dump.done = pfkey_dump_sa_done;
+	pfk->dump.msg_version = hdr->sadb_msg_version;
+	pfk->dump.msg_pid = hdr->sadb_msg_pid;
+	xfrm_state_walk_init(&pfk->dump.u.state, proto);
+
+	return pfkey_do_dump(pfk);
+}
+
+/*
+ * SADB_X_PROMISC handler.
+ *
+ * A header-only message toggles the socket's promiscuous flag (satype must
+ * be 0 or 1, else -EINVAL) and is re-broadcast with errno cleared.  Any
+ * longer message is re-broadcast to all sockets as is.
+ */
+static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	const bool hdr_only =
+		(hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t)));
+	struct pfkey_sock *pfk = pfkey_sk(sk);
+	int satype = hdr->sadb_msg_satype;
+
+	if (hdr_only) {
+		if (satype != 0 && satype != 1)
+			return -EINVAL;
+		pfk->promisc = satype;
+	}
+
+	/* a private copy is needed when the header will be modified */
+	if (hdr_only && skb_cloned(skb))
+		skb = skb_copy(skb, GFP_KERNEL);
+	else
+		skb = skb_clone(skb, GFP_KERNEL);
+
+	if (hdr_only && skb)
+		((struct sadb_msg *) skb->data)->sadb_msg_errno = 0;
+
+	pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk));
+	return 0;
+}
+
+/*
+ * Policy-walk callback: @ptr points to a u32 reqid.  Returns -EEXIST as
+ * soon as one of @xp's templates uses that reqid, 0 otherwise.
+ */
+static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+	const u32 wanted = *(u32*)ptr;
+	int idx = 0;
+
+	while (idx < xp->xfrm_nr) {
+		if (xp->xfrm_vec[idx].reqid == wanted)
+			return -EEXIST;
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick a reqid that no policy in @net currently uses.
+ *
+ * Candidates come from a function-static counter that starts just above
+ * IPSEC_MANUAL_REQID_MAX and wraps back there instead of reaching 0.  Each
+ * candidate is checked with a full policy walk.  Returns the reqid, or 0
+ * once the counter has come all the way round without finding a free one.
+ */
+static u32 gen_reqid(struct net *net)
+{
+	static u32 reqid = IPSEC_MANUAL_REQID_MAX;
+	struct xfrm_policy_walk walk;
+	const u32 first = reqid;
+	int rc;
+
+	for (;;) {
+		if (++reqid == 0)
+			reqid = IPSEC_MANUAL_REQID_MAX+1;
+
+		xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN);
+		rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid);
+		xfrm_policy_walk_done(&walk);
+
+		if (rc != -EEXIST)
+			return reqid;
+		if (reqid == first)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * Append one template to @xp from the sadb_x_ipsecrequest @rq.
+ *
+ * Returns 0 on success, -ELOOP if the policy already holds XFRM_MAX_DEPTH
+ * templates, -EINVAL for an unset/unknown mode or malformed tunnel
+ * addresses, or -ENOBUFS if no reqid could be generated for a UNIQUE-level
+ * request.
+ *
+ * In tunnel mode the request is followed by a source and a destination
+ * sockaddr.  Their presence is checked against the request's own length
+ * field before they are read, so that a short request cannot make us read
+ * beyond it.
+ */
+static int
+parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
+{
+	struct net *net = xp_net(xp);
+	struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr;
+	int mode;
+
+	if (xp->xfrm_nr >= XFRM_MAX_DEPTH)
+		return -ELOOP;
+
+	if (rq->sadb_x_ipsecrequest_mode == 0)
+		return -EINVAL;
+
+	t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */
+	if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0)
+		return -EINVAL;
+	t->mode = mode;
+	if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE)
+		t->optional = 1;
+	else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) {
+		t->reqid = rq->sadb_x_ipsecrequest_reqid;
+		/* reqids above the manual range are replaced by a generated one */
+		if (t->reqid > IPSEC_MANUAL_REQID_MAX)
+			t->reqid = 0;
+		if (!t->reqid && !(t->reqid = gen_reqid(net)))
+			return -ENOBUFS;
+	}
+
+	/* addresses present only in tunnel mode */
+	if (t->mode == XFRM_MODE_TUNNEL) {
+		u8 *sa = (u8 *) (rq + 1);
+		int family, socklen;
+
+		/* the address family itself must lie inside the request */
+		if (rq->sadb_x_ipsecrequest_len <
+		    sizeof(*rq) + sizeof(sa_family_t))
+			return -EINVAL;
+
+		/* ... and so must both sockaddrs of that family */
+		socklen = pfkey_sockaddr_len(((struct sockaddr *)sa)->sa_family);
+		if (rq->sadb_x_ipsecrequest_len < sizeof(*rq) + 2 * socklen)
+			return -EINVAL;
+
+		family = pfkey_sockaddr_extract((struct sockaddr *)sa,
+						&t->saddr);
+		if (!family)
+			return -EINVAL;
+
+		if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen),
+					   &t->id.daddr) != family)
+			return -EINVAL;
+		t->encap_family = family;
+	} else
+		t->encap_family = xp->family;
+
+	/* No way to set this via kame pfkey */
+	t->allalgs = 1;
+	xp->xfrm_nr++;
+	return 0;
+}
+
+/*
+ * Parse every sadb_x_ipsecrequest that follows the sadb_x_policy header
+ * @pol and append the resulting templates to @xp.
+ *
+ * Returns 0 on success, -EINVAL if the policy extension or one of its
+ * requests has an inconsistent length, or the error from
+ * parse_ipsecrequest().
+ *
+ * Each request's self-declared length is validated before use: it must
+ * cover at least the request header (a zero length would make the loop
+ * re-parse the same request) and must not exceed what is left of the
+ * policy extension (which would walk off the end of it).
+ */
+static int
+parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
+{
+	int err;
+	int len;
+	struct sadb_x_ipsecrequest *rq = (void*)(pol+1);
+
+	/* sadb_x_policy_len counts 64-bit words and includes the header */
+	if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy))
+		return -EINVAL;
+	len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy);
+
+	/* compare as signed so that len can never be seen as huge */
+	while (len >= (int)sizeof(struct sadb_x_ipsecrequest)) {
+		if (rq->sadb_x_ipsecrequest_len < sizeof(*rq) ||
+		    len < rq->sadb_x_ipsecrequest_len)
+			return -EINVAL;
+
+		if ((err = parse_ipsecrequest(xp, rq)) < 0)
+			return err;
+		len -= rq->sadb_x_ipsecrequest_len;
+		rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len);
+	}
+	return 0;
+}
+
+/*
+ * Size, rounded up by PFKEY_ALIGN8, of the security-context extension
+ * needed for @xp; 0 when the policy has no security context.
+ */
+static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp)
+{
+	struct xfrm_sec_ctx *ctx = xp->security;
+	int len;
+
+	if (!ctx)
+		return 0;
+
+	len = sizeof(struct sadb_x_sec_ctx);
+	len += ctx->ctx_len;
+	return PFKEY_ALIGN8(len);
+}
+
+/*
+ * Upper bound on the PF_KEY message size needed to describe @xp: header,
+ * three lifetimes, two selector addresses, the policy extension with one
+ * request plus an address pair per template, and the security context.
+ */
+static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp)
+{
+	int sockaddr_size = pfkey_sockaddr_size(xp->family);
+	int tmpl_socklen = 0;
+	int n;
+
+	/* one sockaddr length per template, doubled below for src + dst */
+	for (n = 0; n < xp->xfrm_nr; n++)
+		tmpl_socklen += pfkey_sockaddr_len(xp->xfrm_vec[n].encap_family);
+
+	return sizeof(struct sadb_msg) +
+		(sizeof(struct sadb_lifetime) * 3) +
+		(sizeof(struct sadb_address) * 2) +
+		(sockaddr_size * 2) +
+		sizeof(struct sadb_x_policy) +
+		(xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) +
+		(tmpl_socklen * 2) +
+		pfkey_xfrm_policy2sec_ctx_size(xp);
+}
+
+/*
+ * Allocate (GFP_ATOMIC) an skb large enough for the PF_KEY description of
+ * @xp, plus 16 spare bytes.  Returns the skb or ERR_PTR(-ENOBUFS).
+ */
+static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(pfkey_xfrm_policy2msg_size(xp) + 16, GFP_ATOMIC);
+	if (!skb)
+		return ERR_PTR(-ENOBUFS);
+
+	return skb;
+}
+
+/*
+ * Serialise policy @xp (direction @dir) into @skb as a PF_KEY message:
+ * header, source and destination selector addresses, hard/soft/current
+ * lifetimes, the sadb_x_policy extension with one sadb_x_ipsecrequest per
+ * template, and the security context if the policy has one.
+ *
+ * Only sadb_msg_len and sadb_msg_reserved (set to the policy refcount) are
+ * filled in the header; the caller is expected to set the rest.  @skb must
+ * provide pfkey_xfrm_policy2msg_size(xp) bytes of room.
+ *
+ * Returns 0 on success or -EINVAL if a template's mode has no PF_KEY
+ * equivalent.
+ *
+ * pfkey_xfrm_policy2msg_size() reserves an address pair for every
+ * template, sized by that template's encap_family, but only tunnel-mode
+ * templates actually carry one.  For the others the reserved pair is
+ * subtracted from the message length again; the amount subtracted is
+ * computed from the template's own encap_family so that it always matches
+ * what was reserved.
+ */
+static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir)
+{
+	struct sadb_msg *hdr;
+	struct sadb_address *addr;
+	struct sadb_lifetime *lifetime;
+	struct sadb_x_policy *pol;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
+	int i;
+	int size;
+	int sockaddr_size = pfkey_sockaddr_size(xp->family);
+
+	size = pfkey_xfrm_policy2msg_size(xp);
+
+	/* call should fill header later */
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	memset(hdr, 0, size);	/* XXX do we need this ? */
+
+	/* src address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+	addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
+	addr->sadb_address_prefixlen = xp->selector.prefixlen_s;
+	addr->sadb_address_reserved = 0;
+	if (!pfkey_sockaddr_fill(&xp->selector.saddr,
+				 xp->selector.sport,
+				 (struct sockaddr *) (addr + 1),
+				 xp->family))
+		BUG();
+
+	/* dst address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+	addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto);
+	addr->sadb_address_prefixlen = xp->selector.prefixlen_d;
+	addr->sadb_address_reserved = 0;
+
+	pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport,
+			    (struct sockaddr *) (addr + 1),
+			    xp->family);
+
+	/* hard time */
+	lifetime = (struct sadb_lifetime *)  skb_put(skb,
+						     sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
+	lifetime->sadb_lifetime_allocations =  _X2KEY(xp->lft.hard_packet_limit);
+	lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit);
+	lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds;
+	lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds;
+	/* soft time */
+	lifetime = (struct sadb_lifetime *)  skb_put(skb,
+						     sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
+	lifetime->sadb_lifetime_allocations =  _X2KEY(xp->lft.soft_packet_limit);
+	lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit);
+	lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds;
+	lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds;
+	/* current time */
+	lifetime = (struct sadb_lifetime *)  skb_put(skb,
+						     sizeof(struct sadb_lifetime));
+	lifetime->sadb_lifetime_len =
+		sizeof(struct sadb_lifetime)/sizeof(uint64_t);
+	lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
+	lifetime->sadb_lifetime_allocations = xp->curlft.packets;
+	lifetime->sadb_lifetime_bytes = xp->curlft.bytes;
+	lifetime->sadb_lifetime_addtime = xp->curlft.add_time;
+	lifetime->sadb_lifetime_usetime = xp->curlft.use_time;
+
+	/* policy extension; its length grows as requests are appended below */
+	pol = (struct sadb_x_policy *)  skb_put(skb, sizeof(struct sadb_x_policy));
+	pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
+	pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+	pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD;
+	if (xp->action == XFRM_POLICY_ALLOW) {
+		if (xp->xfrm_nr)
+			pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+		else
+			pol->sadb_x_policy_type = IPSEC_POLICY_NONE;
+	}
+	pol->sadb_x_policy_dir = dir+1;
+	pol->sadb_x_policy_id = xp->index;
+	pol->sadb_x_policy_priority = xp->priority;
+
+	for (i=0; i<xp->xfrm_nr; i++) {
+		const struct xfrm_tmpl *t = xp->xfrm_vec + i;
+		struct sadb_x_ipsecrequest *rq;
+		/* same per-template length the size calculation used */
+		int socklen = pfkey_sockaddr_len(t->encap_family);
+		int req_size;
+		int mode;
+
+		req_size = sizeof(struct sadb_x_ipsecrequest);
+		if (t->mode == XFRM_MODE_TUNNEL) {
+			req_size += socklen * 2;
+		} else {
+			/* no addresses emitted: give back the reserved pair */
+			size -= 2*socklen;
+		}
+		rq = (void*)skb_put(skb, req_size);
+		pol->sadb_x_policy_len += req_size/8;
+		memset(rq, 0, sizeof(*rq));
+		rq->sadb_x_ipsecrequest_len = req_size;
+		rq->sadb_x_ipsecrequest_proto = t->id.proto;
+		if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0)
+			return -EINVAL;
+		rq->sadb_x_ipsecrequest_mode = mode;
+		/* level: REQUIRE by default, UNIQUE with a reqid, USE if optional */
+		rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE;
+		if (t->reqid)
+			rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE;
+		if (t->optional)
+			rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE;
+		rq->sadb_x_ipsecrequest_reqid = t->reqid;
+
+		if (t->mode == XFRM_MODE_TUNNEL) {
+			u8 *sa = (void *)(rq + 1);
+			pfkey_sockaddr_fill(&t->saddr, 0,
+					    (struct sockaddr *)sa,
+					    t->encap_family);
+			pfkey_sockaddr_fill(&t->id.daddr, 0,
+					    (struct sockaddr *) (sa + socklen),
+					    t->encap_family);
+		}
+	}
+
+	/* security context */
+	if ((xfrm_ctx = xp->security)) {
+		int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
+
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
+		sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
+	hdr->sadb_msg_len = size / sizeof(uint64_t);
+	hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
+
+	return 0;
+}
+
+/*
+ * Broadcast a policy notification for @xp / @dir to every PF_KEY socket in
+ * the policy's namespace.  The message type is SADB_X_SPDDELETE2 for a
+ * delete-by-id event, otherwise it is derived from @c->event.
+ *
+ * Returns 0 on success, or the error from allocating or building the
+ * message.  If building fails the freshly allocated skb is freed here, as
+ * nothing else holds a reference to it.
+ */
+static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	int err;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
+	if (err < 0) {
+		kfree_skb(out_skb);
+		return err;
+	}
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = PF_KEY_V2;
+
+	if (c->data.byid && c->event == XFRM_MSG_DELPOLICY)
+		out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
+	else
+		out_hdr->sadb_msg_type = event2poltype(c->event);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = c->seq;
+	out_hdr->sadb_msg_pid = c->pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp));
+	return 0;
+
+}
+
+/*
+ * SADB_X_SPDADD / SADB_X_SPDUPDATE handler: build an xfrm_policy from the
+ * address, policy, lifetime and (optional) security-context extensions,
+ * insert it, audit the attempt and notify key managers.
+ *
+ * Returns 0 on success; -EINVAL for missing or invalid extensions;
+ * -ENOBUFS if the policy or the user security context cannot be
+ * allocated; otherwise the error from the security hook, template parsing
+ * or xfrm_policy_insert().  On any failure after allocation the policy is
+ * marked dead and destroyed.
+ */
+static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	int err = 0;
+	struct sadb_lifetime *lifetime;
+	struct sadb_address *sa;
+	struct sadb_x_policy *pol;
+	struct xfrm_policy *xp;
+	struct km_event c;
+	struct sadb_x_sec_ctx *sec_ctx;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
+	    !ext_hdrs[SADB_X_EXT_POLICY-1])
+		return -EINVAL;
+
+	pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+	if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC)
+		return -EINVAL;
+	/* direction is 1-based on the wire; 0 is rejected here */
+	if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
+		return -EINVAL;
+
+	xp = xfrm_policy_alloc(net, GFP_KERNEL);
+	if (xp == NULL)
+		return -ENOBUFS;
+
+	/* DISCARD blocks; every other accepted type allows */
+	xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
+		      XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
+	xp->priority = pol->sadb_x_policy_priority;
+
+	/* source selector (the trailing comma joins this with the next line) */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+	xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr);
+	if (!xp->family) {
+		err = -EINVAL;
+		goto out;
+	}
+	xp->selector.family = xp->family;
+	xp->selector.prefixlen_s = sa->sadb_address_prefixlen;
+	xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	/* a non-zero port is matched exactly */
+	if (xp->selector.sport)
+		xp->selector.sport_mask = htons(0xffff);
+
+	/* destination selector (again joined by a comma operator) */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1],
+	pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr);
+	xp->selector.prefixlen_d = sa->sadb_address_prefixlen;
+
+	/* Amusing, we set this twice.  KAME apps appear to set same value
+	 * in both addresses.
+	 */
+	xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+
+	xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	if (xp->selector.dport)
+		xp->selector.dport_mask = htons(0xffff);
+
+	/* optional security context */
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx) {
+			err = -ENOBUFS;
+			goto out;
+		}
+
+		err = security_xfrm_policy_alloc(&xp->security, uctx);
+		kfree(uctx);
+
+		if (err)
+			goto out;
+	}
+
+	/* limits default to XFRM_INF unless a lifetime extension overrides them */
+	xp->lft.soft_byte_limit = XFRM_INF;
+	xp->lft.hard_byte_limit = XFRM_INF;
+	xp->lft.soft_packet_limit = XFRM_INF;
+	xp->lft.hard_packet_limit = XFRM_INF;
+	if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) {
+		xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+		xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+		xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+		xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+	}
+	if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) {
+		xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations);
+		xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes);
+		xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
+		xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
+	}
+	/* templates are only parsed for IPSEC-type policies */
+	xp->xfrm_nr = 0;
+	if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
+	    (err = parse_ipsecrequests(xp, pol)) < 0)
+		goto out;
+
+	/* last argument is non-zero for anything other than SPDUPDATE */
+	err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
+				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
+
+	xfrm_audit_policy_add(xp, err ? 0 : 1,
+			      audit_get_loginuid(current),
+			      audit_get_sessionid(current), 0);
+
+	if (err)
+		goto out;
+
+	if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
+		c.event = XFRM_MSG_UPDPOLICY;
+	else
+		c.event = XFRM_MSG_NEWPOLICY;
+
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+
+	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+	xfrm_pol_put(xp);
+	return 0;
+
+out:
+	/* failure: mark the never-inserted policy dead and destroy it */
+	xp->walk.dead = 1;
+	xfrm_policy_destroy(xp);
+	return err;
+}
+
+/*
+ * SADB_X_SPDDELETE handler: build a selector (and optional security
+ * context) from the message, look up the matching main-type policy with
+ * the delete flag set, audit the attempt and notify key managers.
+ *
+ * Returns 0 on success; -EINVAL for missing or invalid extensions;
+ * -ENOMEM if the user security context cannot be allocated; -ENOENT if no
+ * policy matches; otherwise the error from the security hook or the error
+ * reported by the lookup through its last argument.
+ */
+static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	int err;
+	struct sadb_address *sa;
+	struct sadb_x_policy *pol;
+	struct xfrm_policy *xp;
+	struct xfrm_selector sel;
+	struct km_event c;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *pol_ctx = NULL;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
+	    !ext_hdrs[SADB_X_EXT_POLICY-1])
+		return -EINVAL;
+
+	pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+	/* direction is 1-based on the wire; 0 is rejected here */
+	if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX)
+		return -EINVAL;
+
+	memset(&sel, 0, sizeof(sel));
+
+	/* source selector (the trailing comma joins this with the next line) */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
+	sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
+	sel.prefixlen_s = sa->sadb_address_prefixlen;
+	sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	/* a non-zero port is matched exactly */
+	if (sel.sport)
+		sel.sport_mask = htons(0xffff);
+
+	/* destination selector (again joined by a comma operator) */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1],
+	pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
+	sel.prefixlen_d = sa->sadb_address_prefixlen;
+	sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port;
+	if (sel.dport)
+		sel.dport_mask = htons(0xffff);
+
+	/* optional security context used to narrow the lookup */
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx)
+			return -ENOMEM;
+
+		err = security_xfrm_policy_alloc(&pol_ctx, uctx);
+		kfree(uctx);
+		if (err)
+			return err;
+	}
+
+	/* the "1" asks the lookup to delete the policy it finds */
+	xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+				   pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
+				   1, &err);
+	/* the temporary context is only needed for the lookup */
+	security_xfrm_policy_free(pol_ctx);
+	if (xp == NULL)
+		return -ENOENT;
+
+	xfrm_audit_policy_delete(xp, err ? 0 : 1,
+				 audit_get_loginuid(current),
+				 audit_get_sessionid(current), 0);
+
+	if (err)
+		goto out;
+
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	c.data.byid = 0;
+	c.event = XFRM_MSG_DELPOLICY;
+	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+
+out:
+	/* drop the reference returned by the lookup */
+	xfrm_pol_put(xp);
+	return err;
+}
+
+/*
+ * Send the description of policy @xp / @dir back to @sk only, echoing the
+ * version, type, seq and pid of the request header @hdr.
+ *
+ * Returns 0 on success, or the error from allocating or building the
+ * message.  If building fails the freshly allocated skb is freed here, as
+ * nothing else holds a reference to it.
+ */
+static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir)
+{
+	int err;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
+	if (err < 0) {
+		kfree_skb(out_skb);
+		return err;
+	}
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
+	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
+	out_hdr->sadb_msg_satype = 0;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
+	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp));
+
+	return 0;
+}
+
+#ifdef CONFIG_NET_KEY_MIGRATE
+/* Size of two back-to-back sockaddrs of @family, rounded up by PFKEY_ALIGN8. */
+static int pfkey_sockaddr_pair_size(sa_family_t family)
+{
+	int one = pfkey_sockaddr_len(family);
+
+	return PFKEY_ALIGN8(one * 2);
+}
+
+/*
+ * Extract a source/destination address pair from two back-to-back
+ * sockaddrs starting at @sa.
+ *
+ * @ext_len: number of bytes available at @sa
+ * @saddr, @daddr: receive the two addresses
+ * @family: receives the common address family
+ *
+ * Returns 0 on success, -EINVAL if @ext_len is too small for a pair of the
+ * indicated family, if the family is not recognised, or if the two
+ * sockaddrs are of different families.
+ *
+ * @ext_len is first checked to cover sa_family itself, since that field is
+ * read in order to size the pair.
+ */
+static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len,
+			       xfrm_address_t *saddr, xfrm_address_t *daddr,
+			       u16 *family)
+{
+	int af, socklen;
+
+	if (ext_len < (int)sizeof(sa->sa_family) ||
+	    ext_len < pfkey_sockaddr_pair_size(sa->sa_family))
+		return -EINVAL;
+
+	af = pfkey_sockaddr_extract(sa, saddr);
+	if (!af)
+		return -EINVAL;
+
+	/* the second sockaddr starts right after the first */
+	socklen = pfkey_sockaddr_len(af);
+	if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen),
+				   daddr) != af)
+		return -EINVAL;
+
+	*family = af;
+	return 0;
+}
+
+/*
+ * Convert one pair of consecutive sadb_x_ipsecrequests (old endpoints
+ * followed by new endpoints) starting at @rq1 into the migrate entry @m.
+ *
+ * @len: bytes remaining in the policy extension from @rq1 onwards
+ *
+ * Returns the number of bytes consumed (the sum of both request lengths)
+ * or -EINVAL if either request is malformed or the two disagree on
+ * proto, mode or reqid.
+ *
+ * Each request's length must fit in what remains and must cover its own
+ * header plus at least the address family that follows.  Only the bytes
+ * after the request header are offered to parse_sockaddr_pair(), so the
+ * address pair can never be read from beyond the request.
+ */
+static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len,
+				    struct xfrm_migrate *m)
+{
+	int err;
+	struct sadb_x_ipsecrequest *rq2;
+	int mode;
+
+	if (len <= (int)sizeof(struct sadb_x_ipsecrequest) ||
+	    len < rq1->sadb_x_ipsecrequest_len ||
+	    rq1->sadb_x_ipsecrequest_len < sizeof(*rq1) + sizeof(sa_family_t))
+		return -EINVAL;
+
+	/* old endpoints */
+	err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1),
+				  rq1->sadb_x_ipsecrequest_len - sizeof(*rq1),
+				  &m->old_saddr, &m->old_daddr,
+				  &m->old_family);
+	if (err)
+		return err;
+
+	rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len);
+	len -= rq1->sadb_x_ipsecrequest_len;
+
+	if (len <= (int)sizeof(struct sadb_x_ipsecrequest) ||
+	    len < rq2->sadb_x_ipsecrequest_len ||
+	    rq2->sadb_x_ipsecrequest_len < sizeof(*rq2) + sizeof(sa_family_t))
+		return -EINVAL;
+
+	/* new endpoints */
+	err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1),
+				  rq2->sadb_x_ipsecrequest_len - sizeof(*rq2),
+				  &m->new_saddr, &m->new_daddr,
+				  &m->new_family);
+	if (err)
+		return err;
+
+	if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto ||
+	    rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode ||
+	    rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid)
+		return -EINVAL;
+
+	m->proto = rq1->sadb_x_ipsecrequest_proto;
+	if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0)
+		return -EINVAL;
+	m->mode = mode;
+	m->reqid = rq1->sadb_x_ipsecrequest_reqid;
+
+	return ((int)(rq1->sadb_x_ipsecrequest_len +
+		      rq2->sadb_x_ipsecrequest_len));
+}
+
+/*
+ * SADB_X_MIGRATE handler: build a selector from the address extensions,
+ * convert the old/new ipsecrequest pairs that follow the policy extension
+ * into xfrm_migrate entries (at most XFRM_MAX_DEPTH) and hand them, with
+ * the optional key-manager address, to xfrm_migrate().
+ *
+ * Returns the result of xfrm_migrate(), or -EINVAL (or the parser's error)
+ * for missing or malformed extensions.
+ *
+ * The policy direction is 1-based on the wire.  A direction of 0 is
+ * rejected, as the other SPD handlers do, because subtracting 1 from it
+ * would wrap the u8 direction to 255.
+ */
+static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
+			 const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	int i, len, ret, err = -EINVAL;
+	u8 dir;
+	struct sadb_address *sa;
+	struct sadb_x_kmaddress *kma;
+	struct sadb_x_policy *pol;
+	struct sadb_x_ipsecrequest *rq;
+	struct xfrm_selector sel;
+	struct xfrm_migrate m[XFRM_MAX_DEPTH];
+	struct xfrm_kmaddress k;
+
+	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1],
+				     ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) ||
+	    !ext_hdrs[SADB_X_EXT_POLICY - 1]) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1];
+	pol = ext_hdrs[SADB_X_EXT_POLICY - 1];
+
+	if (!pol->sadb_x_policy_dir ||
+	    pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (kma) {
+		/* convert sadb_x_kmaddress to xfrm_kmaddress */
+		k.reserved = kma->sadb_x_kmaddress_reserved;
+		ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1),
+					  8*(kma->sadb_x_kmaddress_len) - sizeof(*kma),
+					  &k.local, &k.remote, &k.family);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+	}
+
+	dir = pol->sadb_x_policy_dir - 1;
+	memset(&sel, 0, sizeof(sel));
+
+	/* set source address info of selector */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1];
+	sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr);
+	sel.prefixlen_s = sa->sadb_address_prefixlen;
+	sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port;
+	if (sel.sport)
+		sel.sport_mask = htons(0xffff);
+
+	/* set destination address info of selector */
+	sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1];
+	pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr);
+	sel.prefixlen_d = sa->sadb_address_prefixlen;
+	sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto);
+	sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port;
+	if (sel.dport)
+		sel.dport_mask = htons(0xffff);
+
+	rq = (struct sadb_x_ipsecrequest *)(pol + 1);
+
+	/* extract ipsecrequests */
+	i = 0;
+	len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy);
+
+	while (len > 0 && i < XFRM_MAX_DEPTH) {
+		ret = ipsecrequests_to_migrate(rq, len, &m[i]);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		} else {
+			/* ret is the number of bytes the pair consumed */
+			rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret);
+			len -= ret;
+			i++;
+		}
+	}
+
+	/* need at least one pair, and nothing may be left unparsed */
+	if (!i || len > 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i,
+			    kma ? &k : NULL);
+
+ out:
+	return err;
+}
+#else
+/*
+ * Fallback used in the #else branch of the surrounding conditional
+ * (presumably when CONFIG_NET_KEY_MIGRATE is not set - confirm against the
+ * matching #ifdef): SADB_X_MIGRATE is always refused with -ENOPROTOOPT.
+ */
+static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
+			 const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	return -ENOPROTOOPT;
+}
+#endif
+
+
+/*
+ * Handler shared by SADB_X_SPDGET and SADB_X_SPDDELETE2: look a policy up by
+ * the id carried in the policy extension.
+ *
+ * For SADB_X_SPDDELETE2 the lookup is asked to delete the policy; the
+ * deletion is audited and, if it succeeded, announced through
+ * km_policy_notify().  Otherwise the policy is reported back to the
+ * requesting socket with key_pol_get_resp().
+ *
+ * Returns -EINVAL for a missing policy extension or a bad direction,
+ * -ENOENT when no policy was found, else the error of the delete/response.
+ */
+static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct sadb_x_policy *pol = ext_hdrs[SADB_X_EXT_POLICY-1];
+	struct xfrm_policy *xp;
+	struct km_event c;
+	unsigned int dir;
+	int do_delete;
+	int err = 0;
+
+	if (pol == NULL)
+		return -EINVAL;
+
+	dir = xfrm_policy_id2dir(pol->sadb_x_policy_id);
+	if (dir >= XFRM_POLICY_MAX)
+		return -EINVAL;
+
+	do_delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
+	xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+			      dir, pol->sadb_x_policy_id, do_delete, &err);
+	if (xp == NULL)
+		return -ENOENT;
+
+	if (!do_delete) {
+		err = key_pol_get_resp(sk, xp, hdr, dir);
+		goto out;
+	}
+
+	/* the audit record is written whether or not the delete worked */
+	xfrm_audit_policy_delete(xp, err ? 0 : 1,
+			audit_get_loginuid(current),
+			audit_get_sessionid(current), 0);
+
+	if (!err) {
+		c.seq = hdr->sadb_msg_seq;
+		c.pid = hdr->sadb_msg_pid;
+		c.data.byid = 1;
+		c.event = XFRM_MSG_DELPOLICY;
+		km_policy_notify(xp, dir, &c);
+	}
+
+out:
+	/* drop the reference taken by the lookup */
+	xfrm_pol_put(xp);
+	return err;
+}
+
+/*
+ * Policy-walk callback used by pfkey_dump_sp(): turn one policy into an
+ * SADB_X_SPDDUMP message for the dumping socket passed in @ptr.
+ *
+ * The freshly built message is not sent immediately.  It is parked in
+ * pfk->dump.skb, and the message parked by the previous invocation (if any)
+ * is sent to the socket instead.
+ *
+ * Returns 0 on success, -ENOBUFS when pfkey_can_dump() refuses, or the error
+ * from building the message.
+ */
+static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+	struct pfkey_sock *pfk = ptr;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	int err;
+
+	if (!pfkey_can_dump(&pfk->sk))
+		return -ENOBUFS;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
+	if (err < 0) {
+		/* nobody else owns the skb yet, so release it here */
+		kfree_skb(out_skb);
+		return err;
+	}
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = pfk->dump.msg_version;
+	out_hdr->sadb_msg_type = SADB_X_SPDDUMP;
+	out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = count + 1;
+	out_hdr->sadb_msg_pid = pfk->dump.msg_pid;
+
+	/* send the previously parked message, then park the new one */
+	if (pfk->dump.skb)
+		pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE,
+				&pfk->sk, sock_net(&pfk->sk));
+	pfk->dump.skb = out_skb;
+
+	return 0;
+}
+
+/*
+ * Dump callback installed by pfkey_spddump(): run the policy walk stored in
+ * pfk->dump.u.policy over the socket's network namespace, calling dump_sp()
+ * with @pfk as its cookie.  Returns whatever xfrm_policy_walk() returns.
+ */
+static int pfkey_dump_sp(struct pfkey_sock *pfk)
+{
+	return xfrm_policy_walk(sock_net(&pfk->sk), &pfk->dump.u.policy,
+				dump_sp, pfk);
+}
+
+/*
+ * Completion callback installed by pfkey_spddump(): finish the policy walk
+ * state kept in pfk->dump.u.policy.
+ */
+static void pfkey_dump_sp_done(struct pfkey_sock *pfk)
+{
+	xfrm_policy_walk_done(&pfk->dump.u.policy);
+}
+
+/*
+ * SADB_X_SPDDUMP handler: start a dump of the main-type policies on this
+ * socket.
+ *
+ * Only one dump may be active per socket; -EBUSY is returned when a dump
+ * callback is already installed.  The request's version and pid are
+ * remembered so dump_sp() can copy them into every reply.
+ */
+static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct pfkey_sock *pfk = pfkey_sk(sk);
+
+	if (pfk->dump.dump)
+		return -EBUSY;
+
+	xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN);
+	pfk->dump.done = pfkey_dump_sp_done;
+	pfk->dump.dump = pfkey_dump_sp;
+	pfk->dump.msg_pid = hdr->sadb_msg_pid;
+	pfk->dump.msg_version = hdr->sadb_msg_version;
+
+	return pfkey_do_dump(pfk);
+}
+
+/*
+ * Broadcast an SADB_X_SPDFLUSH message, consisting of a bare sadb_msg
+ * header, to all PF_KEY sockets in c->net.  Sequence number and pid are
+ * taken from the event.
+ *
+ * Returns 0, or -ENOBUFS when the skb cannot be allocated.
+ */
+static int key_notify_policy_flush(const struct km_event *c)
+{
+	struct sk_buff *skb_out;
+	struct sadb_msg *hdr;
+
+	skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+	if (!skb_out)
+		return -ENOBUFS;
+	hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
+	/*
+	 * Every header field is written explicitly: the skb data is not
+	 * zeroed here, and the message goes out to listening sockets.
+	 */
+	hdr->sadb_msg_type = SADB_X_SPDFLUSH;
+	hdr->sadb_msg_seq = c->seq;
+	hdr->sadb_msg_pid = c->pid;
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_errno = (uint8_t) 0;
+	hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC;
+	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+	hdr->sadb_msg_reserved = 0;
+	pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
+	return 0;
+
+}
+
+/*
+ * SADB_X_SPDFLUSH handler: flush all main-type policies of the socket's
+ * namespace, answer the requester via unicast_flush_resp(), and on success
+ * raise an XFRM_MSG_FLUSHPOLICY event.
+ *
+ * -ESRCH from the flush (empty table) is reported as success without a
+ * notification.  If either the flush or the unicast response fails, the
+ * flush's error code is returned and no event is raised.
+ */
+static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs)
+{
+	struct net *net = sock_net(sk);
+	struct xfrm_audit audit_info;
+	struct km_event c;
+	int flush_err, resp_err;
+
+	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
+	audit_info.secid = 0;
+
+	flush_err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
+	resp_err = unicast_flush_resp(sk, hdr);
+
+	if (flush_err == -ESRCH) /* empty table - old silent behavior */
+		return 0;
+	if (flush_err || resp_err)
+		return flush_err;
+
+	c.net = net;
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	c.event = XFRM_MSG_FLUSHPOLICY;
+	c.data.type = XFRM_POLICY_TYPE_MAIN;
+	km_policy_notify(NULL, 0, &c);
+
+	return 0;
+}
+
+/*
+ * Signature shared by all PF_KEY request handlers: the originating socket,
+ * the request skb, its base header and the parsed extension-header array.
+ */
+typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb,
+			     const struct sadb_msg *hdr, void * const *ext_hdrs);
+/*
+ * Dispatch table indexed by sadb_msg_type; pfkey_process() answers
+ * -EOPNOTSUPP for types whose entry is NULL.
+ */
+static pfkey_handler pfkey_funcs[SADB_MAX + 1] = {
+	[SADB_RESERVED]		= pfkey_reserved,
+	[SADB_GETSPI]		= pfkey_getspi,
+	[SADB_UPDATE]		= pfkey_add,
+	[SADB_ADD]		= pfkey_add,
+	[SADB_DELETE]		= pfkey_delete,
+	[SADB_GET]		= pfkey_get,
+	[SADB_ACQUIRE]		= pfkey_acquire,
+	[SADB_REGISTER]		= pfkey_register,
+	[SADB_EXPIRE]		= NULL,
+	[SADB_FLUSH]		= pfkey_flush,
+	[SADB_DUMP]		= pfkey_dump,
+	[SADB_X_PROMISC]	= pfkey_promisc,
+	[SADB_X_PCHANGE]	= NULL,
+	[SADB_X_SPDUPDATE]	= pfkey_spdadd,
+	[SADB_X_SPDADD]		= pfkey_spdadd,
+	[SADB_X_SPDDELETE]	= pfkey_spddelete,
+	[SADB_X_SPDGET]		= pfkey_spdget,
+	[SADB_X_SPDACQUIRE]	= NULL,
+	[SADB_X_SPDDUMP]	= pfkey_spddump,
+	[SADB_X_SPDFLUSH]	= pfkey_spdflush,
+	[SADB_X_SPDSETIDX]	= pfkey_spdadd,
+	[SADB_X_SPDDELETE2]	= pfkey_spdget,
+	[SADB_X_MIGRATE]	= pfkey_migrate,
+};
+
+/*
+ * Process one validated PF_KEY request.
+ *
+ * A clone of the request is first offered to promiscuous listeners, then
+ * the extension headers are parsed and the request is handed to the handler
+ * registered for its message type in pfkey_funcs[].
+ *
+ * Returns the parse error, -EOPNOTSUPP when no handler is registered, or
+ * the handler's return value.
+ */
+static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr)
+{
+	void *ext_hdrs[SADB_EXT_MAX];
+	pfkey_handler handler;
+	int err;
+
+	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
+			BROADCAST_PROMISC_ONLY, NULL, sock_net(sk));
+
+	memset(ext_hdrs, 0, sizeof(ext_hdrs));
+	err = parse_exthdrs(skb, hdr, ext_hdrs);
+	if (err)
+		return err;
+
+	handler = pfkey_funcs[hdr->sadb_msg_type];
+	if (!handler)
+		return -EOPNOTSUPP;
+
+	return handler(sk, skb, hdr, ext_hdrs);
+}
+
+/*
+ * Validate the base sadb_msg header at the start of @skb.
+ *
+ * On success returns a pointer to the header and stores 0 in *errp.
+ * On failure returns NULL and stores:
+ *   -EMSGSIZE  the skb is shorter than a header, or sadb_msg_len (in 64-bit
+ *              words) does not match the skb length or is smaller than the
+ *              header itself;
+ *   -EINVAL    wrong version, non-zero reserved field, or a message type
+ *              outside (SADB_RESERVED, SADB_MAX].
+ */
+static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp)
+{
+	struct sadb_msg *hdr;
+
+	if (skb->len < sizeof(*hdr)) {
+		*errp = -EMSGSIZE;
+		return NULL;
+	}
+
+	hdr = (struct sadb_msg *) skb->data;
+
+	if (hdr->sadb_msg_version != PF_KEY_V2 ||
+	    hdr->sadb_msg_reserved != 0 ||
+	    hdr->sadb_msg_type <= SADB_RESERVED ||
+	    hdr->sadb_msg_type > SADB_MAX) {
+		*errp = -EINVAL;
+		return NULL;
+	}
+
+	if (hdr->sadb_msg_len != (skb->len / sizeof(uint64_t)) ||
+	    hdr->sadb_msg_len < (sizeof(struct sadb_msg) / sizeof(uint64_t))) {
+		*errp = -EMSGSIZE;
+		return NULL;
+	}
+
+	*errp = 0;
+	return hdr;
+}
+
+/*
+ * Return 1 if the bit for authentication algorithm @d is set in the
+ * template's aalgos mask, 0 otherwise.  Ids that do not fit in the mask
+ * count as "not set".
+ */
+static inline int aalg_tmpl_set(const struct xfrm_tmpl *t,
+				const struct xfrm_algo_desc *d)
+{
+	unsigned int bit = d->desc.sadb_alg_id;
+
+	return bit < sizeof(t->aalgos) * 8 ? (t->aalgos >> bit) & 1 : 0;
+}
+
+/*
+ * Return 1 if the bit for encryption algorithm @d is set in the template's
+ * ealgos mask, 0 otherwise.  Ids that do not fit in the mask count as
+ * "not set".
+ */
+static inline int ealg_tmpl_set(const struct xfrm_tmpl *t,
+				const struct xfrm_algo_desc *d)
+{
+	unsigned int bit = d->desc.sadb_alg_id;
+
+	return bit < sizeof(t->ealgos) * 8 ? (t->ealgos >> bit) & 1 : 0;
+}
+
+/*
+ * Size in bytes of the proposal that dump_ah_combs() emits for @t: one
+ * sadb_prop header plus one sadb_comb per available authentication
+ * algorithm selected by the template.
+ */
+static int count_ah_combs(const struct xfrm_tmpl *t)
+{
+	const struct xfrm_algo_desc *aalg;
+	int total = sizeof(struct sadb_prop);
+	int idx;
+
+	for (idx = 0; (aalg = xfrm_aalg_get_byidx(idx)) != NULL; idx++) {
+		if (aalg_tmpl_set(t, aalg) && aalg->available)
+			total += sizeof(struct sadb_comb);
+	}
+
+	return total;
+}
+
+/*
+ * Size in bytes of the proposal that dump_esp_combs() emits for @t: one
+ * sadb_prop header plus one sadb_comb for every pairing of an available,
+ * template-selected encryption algorithm with an available,
+ * template-selected authentication algorithm.  The authentication scan
+ * starts at index 1, matching dump_esp_combs().
+ */
+static int count_esp_combs(const struct xfrm_tmpl *t)
+{
+	const struct xfrm_algo_desc *ealg, *aalg;
+	int total = sizeof(struct sadb_prop);
+	int e, a;
+
+	for (e = 0; (ealg = xfrm_ealg_get_byidx(e)) != NULL; e++) {
+		if (!ealg_tmpl_set(t, ealg) || !ealg->available)
+			continue;
+
+		for (a = 1; (aalg = xfrm_aalg_get_byidx(a)) != NULL; a++) {
+			if (aalg_tmpl_set(t, aalg) && aalg->available)
+				total += sizeof(struct sadb_comb);
+		}
+	}
+
+	return total;
+}
+
+/*
+ * Append an SADB_EXT_PROPOSAL extension for AH to @skb: a sadb_prop header
+ * followed by one sadb_comb per available authentication algorithm selected
+ * by @t.  The caller must have reserved the room computed by
+ * count_ah_combs().
+ */
+static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+{
+	const struct xfrm_algo_desc *aalg;
+	struct sadb_prop *prop;
+	struct sadb_comb *comb;
+	int idx;
+
+	prop = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop));
+	prop->sadb_prop_len = sizeof(struct sadb_prop)/8;
+	prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
+	prop->sadb_prop_replay = 32;
+	memset(prop->sadb_prop_reserved, 0, sizeof(prop->sadb_prop_reserved));
+
+	for (idx = 0; (aalg = xfrm_aalg_get_byidx(idx)) != NULL; idx++) {
+		if (!aalg_tmpl_set(t, aalg) || !aalg->available)
+			continue;
+
+		comb = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb));
+		memset(comb, 0, sizeof(*comb));
+		/* the extension length is kept in 8-byte units */
+		prop->sadb_prop_len += sizeof(struct sadb_comb)/8;
+
+		comb->sadb_comb_auth = aalg->desc.sadb_alg_id;
+		comb->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits;
+		comb->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits;
+		comb->sadb_comb_soft_usetime = 7*60*60;
+		comb->sadb_comb_hard_usetime = 8*60*60;
+		comb->sadb_comb_soft_addtime = 20*60*60;
+		comb->sadb_comb_hard_addtime = 24*60*60;
+	}
+}
+
+/*
+ * Append an SADB_EXT_PROPOSAL extension for ESP to @skb: a sadb_prop header
+ * followed by one sadb_comb for every pairing of an available,
+ * template-selected encryption algorithm with an available,
+ * template-selected authentication algorithm (scanned from index 1).  The
+ * caller must have reserved the room computed by count_esp_combs().
+ */
+static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+{
+	const struct xfrm_algo_desc *ealg, *aalg;
+	struct sadb_prop *prop;
+	struct sadb_comb *comb;
+	int e, a;
+
+	prop = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop));
+	prop->sadb_prop_len = sizeof(struct sadb_prop)/8;
+	prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
+	prop->sadb_prop_replay = 32;
+	memset(prop->sadb_prop_reserved, 0, sizeof(prop->sadb_prop_reserved));
+
+	for (e = 0; (ealg = xfrm_ealg_get_byidx(e)) != NULL; e++) {
+		if (!ealg_tmpl_set(t, ealg) || !ealg->available)
+			continue;
+
+		for (a = 1; (aalg = xfrm_aalg_get_byidx(a)) != NULL; a++) {
+			if (!aalg_tmpl_set(t, aalg) || !aalg->available)
+				continue;
+
+			comb = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb));
+			memset(comb, 0, sizeof(*comb));
+			/* the extension length is kept in 8-byte units */
+			prop->sadb_prop_len += sizeof(struct sadb_comb)/8;
+
+			comb->sadb_comb_auth = aalg->desc.sadb_alg_id;
+			comb->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits;
+			comb->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits;
+			comb->sadb_comb_encrypt = ealg->desc.sadb_alg_id;
+			comb->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits;
+			comb->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits;
+			comb->sadb_comb_soft_usetime = 7*60*60;
+			comb->sadb_comb_hard_usetime = 8*60*60;
+			comb->sadb_comb_soft_addtime = 20*60*60;
+			comb->sadb_comb_hard_addtime = 24*60*60;
+		}
+	}
+}
+
+/*
+ * Policy expiry events are not reported over PF_KEY by this function: it
+ * does nothing and always returns 0.
+ */
+static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c)
+{
+	return 0;
+}
+
+/*
+ * Broadcast an SADB_EXPIRE message for state @x to the registered PF_KEY
+ * sockets of the state's namespace.
+ *
+ * pfkey_xfrm_state2msg_expire() is called with 2 for a hard expiry
+ * (c->data.hard set) and 1 for a soft one.  Returns 0, or the error from
+ * building the message.
+ */
+static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
+{
+	int hsc = c->data.hard ? 2 : 1;
+	struct sadb_msg *out_hdr;
+	struct sk_buff *out_skb;
+
+	out_skb = pfkey_xfrm_state2msg_expire(x, hsc);
+	if (IS_ERR(out_skb))
+		return PTR_ERR(out_skb);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = PF_KEY_V2;
+	out_hdr->sadb_msg_type = SADB_EXPIRE;
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+	out_hdr->sadb_msg_reserved = 0;
+	out_hdr->sadb_msg_pid = 0;
+	out_hdr->sadb_msg_seq = 0;
+
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+	return 0;
+}
+
+/*
+ * xfrm_mgr .notify hook: translate an SA event into the matching PF_KEY
+ * notification.
+ *
+ * Nothing is done when the namespace has no PF_KEY sockets.  The namespace
+ * comes from @x when a state is given, otherwise from the event.
+ * XFRM_MSG_NEWAE is ignored, and unknown events are logged; both return 0.
+ */
+static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c)
+{
+	struct netns_pfkey *net_pfkey;
+	struct net *net;
+	int ret = 0;
+
+	net = x ? xs_net(x) : c->net;
+	net_pfkey = net_generic(net, pfkey_net_id);
+	if (!atomic_read(&net_pfkey->socks_nr))
+		return 0;
+
+	switch (c->event) {
+	case XFRM_MSG_EXPIRE:
+		ret = key_notify_sa_expire(x, c);
+		break;
+	case XFRM_MSG_DELSA:
+	case XFRM_MSG_NEWSA:
+	case XFRM_MSG_UPDSA:
+		ret = key_notify_sa(x, c);
+		break;
+	case XFRM_MSG_FLUSHSA:
+		ret = key_notify_sa_flush(c);
+		break;
+	case XFRM_MSG_NEWAE: /* not yet supported */
+		break;
+	default:
+		pr_err("pfkey: Unknown SA event %d\n", c->event);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * xfrm_mgr .notify_policy hook: translate a policy event into the matching
+ * PF_KEY notification.
+ *
+ * Only main-type policies are reported: events for a policy of another
+ * type, and flush events for another type, return 0 without sending
+ * anything.  Unknown events are logged and also return 0.
+ */
+static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+	int ret = 0;
+
+	if (xp && xp->type != XFRM_POLICY_TYPE_MAIN)
+		return 0;
+
+	switch (c->event) {
+	case XFRM_MSG_POLEXPIRE:
+		ret = key_notify_policy_expire(xp, c);
+		break;
+	case XFRM_MSG_DELPOLICY:
+	case XFRM_MSG_NEWPOLICY:
+	case XFRM_MSG_UPDPOLICY:
+		ret = key_notify_policy(xp, dir, c);
+		break;
+	case XFRM_MSG_FLUSHPOLICY:
+		if (c->data.type == XFRM_POLICY_TYPE_MAIN)
+			ret = key_notify_policy_flush(c);
+		break;
+	default:
+		pr_err("pfkey: Unknown policy event %d\n", c->event);
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Return the next value of a function-local atomic counter, skipping 0, so
+ * the result is never zero.
+ */
+static u32 get_acqseq(void)
+{
+	static atomic_t acqseq;
+	u32 seq;
+
+	for (;;) {
+		seq = atomic_inc_return(&acqseq);
+		if (seq)
+			return seq;
+	}
+}
+
+/*
+ * xfrm_mgr .acquire hook: build an SADB_ACQUIRE message for larval state @x
+ * and broadcast it to the registered PF_KEY sockets of the state's
+ * namespace.
+ *
+ * Message layout, in order: sadb_msg header, source address, destination
+ * address, policy extension, the AH or ESP proposal (for those protocols
+ * only) and, when the state carries a security context, a security-context
+ * extension.  The sequence number comes from get_acqseq() and is also
+ * stored in x->km.seq.
+ *
+ * @t:   template used to select the proposed algorithms
+ * @xp:  policy that triggered the acquire; its index and priority are
+ *       reported in the policy extension
+ * @dir: xfrm direction, encoded on the wire as dir + 1
+ *
+ * Returns the result of pfkey_broadcast(), -EINVAL for an address family
+ * pfkey_sockaddr_size() does not know, or -ENOMEM.
+ */
+static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp, int dir)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	struct sadb_address *addr;
+	struct sadb_x_policy *pol;
+	int sockaddr_size;
+	int size;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
+	int ctx_size = 0;
+
+	sockaddr_size = pfkey_sockaddr_size(x->props.family);
+	if (!sockaddr_size)
+		return -EINVAL;
+
+	size = sizeof(struct sadb_msg) +
+		(sizeof(struct sadb_address) * 2) +
+		(sockaddr_size * 2) +
+		sizeof(struct sadb_x_policy);
+
+	if (x->id.proto == IPPROTO_AH)
+		size += count_ah_combs(t);
+	else if (x->id.proto == IPPROTO_ESP)
+		size += count_esp_combs(t);
+
+	if ((xfrm_ctx = x->security)) {
+		ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+		size +=  sizeof(struct sadb_x_sec_ctx) + ctx_size;
+	}
+
+	skb =  alloc_skb(size + 16, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_type = SADB_ACQUIRE;
+	hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+	hdr->sadb_msg_len = size / sizeof(uint64_t);
+	hdr->sadb_msg_errno = 0;
+	hdr->sadb_msg_reserved = 0;
+	hdr->sadb_msg_seq = x->km.seq = get_acqseq();
+	hdr->sadb_msg_pid = 0;
+
+	/* src address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+	addr->sadb_address_prefixlen =
+		pfkey_sockaddr_fill(&x->props.saddr, 0,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	/* the family was accepted above, so a zero prefix length is a bug */
+	if (!addr->sadb_address_prefixlen)
+		BUG();
+
+	/* dst address */
+	addr = (struct sadb_address*) skb_put(skb,
+					      sizeof(struct sadb_address)+sockaddr_size);
+	addr->sadb_address_len =
+		(sizeof(struct sadb_address)+sockaddr_size)/
+			sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+	addr->sadb_address_prefixlen =
+		pfkey_sockaddr_fill(&x->id.daddr, 0,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	if (!addr->sadb_address_prefixlen)
+		BUG();
+
+	/*
+	 * Every field of the policy extension is written explicitly: the skb
+	 * data is not zeroed here, and the message goes out to listening
+	 * sockets.
+	 */
+	pol = (struct sadb_x_policy *)  skb_put(skb, sizeof(struct sadb_x_policy));
+	pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t);
+	pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+	pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+	pol->sadb_x_policy_dir = dir+1;
+	pol->sadb_x_policy_reserved = 0;
+	pol->sadb_x_policy_id = xp->index;
+	pol->sadb_x_policy_priority = xp->priority;
+
+	/* Set sadb_comb's. */
+	if (x->id.proto == IPPROTO_AH)
+		dump_ah_combs(skb, t);
+	else if (x->id.proto == IPPROTO_ESP)
+		dump_esp_combs(skb, t);
+
+	/* security context */
+	if (xfrm_ctx) {
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+				sizeof(struct sadb_x_sec_ctx) + ctx_size);
+		sec_ctx->sadb_x_sec_len =
+		  (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+}
+
+/*
+ * xfrm_mgr .compile_policy hook: turn the per-socket policy blob passed via
+ * the IP_IPSEC_POLICY / IPV6_IPSEC_POLICY socket option into an xfrm_policy.
+ *
+ * @data/@len hold a sadb_x_policy, its ipsecrequests and, optionally, a
+ * trailing sadb_x_sec_ctx.
+ *
+ * On success the new policy is returned and *dir holds the xfrm direction
+ * (sadb_x_policy_dir - 1).  On failure NULL is returned and *dir holds a
+ * negative errno.
+ */
+static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
+						u8 *data, int len, int *dir)
+{
+	struct net *net = sock_net(sk);
+	struct xfrm_policy *xp;
+	struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
+	struct sadb_x_sec_ctx *sec_ctx;
+
+	/* the option must match the socket's address family */
+	switch (sk->sk_family) {
+	case AF_INET:
+		if (opt != IP_IPSEC_POLICY) {
+			*dir = -EOPNOTSUPP;
+			return NULL;
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		if (opt != IPV6_IPSEC_POLICY) {
+			*dir = -EOPNOTSUPP;
+			return NULL;
+		}
+		break;
+#endif
+	default:
+		*dir = -EINVAL;
+		return NULL;
+	}
+
+	*dir = -EINVAL;
+
+	if (len < sizeof(struct sadb_x_policy) ||
+	    pol->sadb_x_policy_len*8 > len ||
+	    pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS ||
+	    (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND))
+		return NULL;
+
+	xp = xfrm_policy_alloc(net, GFP_ATOMIC);
+	if (xp == NULL) {
+		*dir = -ENOBUFS;
+		return NULL;
+	}
+
+	xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ?
+		      XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW);
+
+	xp->lft.soft_byte_limit = XFRM_INF;
+	xp->lft.hard_byte_limit = XFRM_INF;
+	xp->lft.soft_packet_limit = XFRM_INF;
+	xp->lft.hard_packet_limit = XFRM_INF;
+	xp->family = sk->sk_family;
+
+	xp->xfrm_nr = 0;
+	if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
+	    (*dir = parse_ipsecrequests(xp, pol)) < 0)
+		goto out;
+
+	/* security context too */
+	if (len >= (pol->sadb_x_policy_len*8 +
+	    sizeof(struct sadb_x_sec_ctx))) {
+		char *p = (char *)pol;
+		struct xfrm_user_sec_ctx *uctx;
+
+		p += pol->sadb_x_policy_len*8;
+		sec_ctx = (struct sadb_x_sec_ctx *)p;
+		/*
+		 * sadb_x_sec_len counts 8-byte units (as sadb_x_policy_len
+		 * does), so it has to be scaled before being compared with
+		 * the byte length of the buffer.
+		 */
+		if (len < pol->sadb_x_policy_len*8 +
+		    sec_ctx->sadb_x_sec_len*8) {
+			*dir = -EINVAL;
+			goto out;
+		}
+		if ((*dir = verify_sec_ctx_len(p)))
+			goto out;
+		uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+		*dir = security_xfrm_policy_alloc(&xp->security, uctx);
+		kfree(uctx);
+
+		if (*dir)
+			goto out;
+	}
+
+	*dir = pol->sadb_x_policy_dir-1;
+	return xp;
+
+out:
+	/* mark the never-inserted policy dead before destroying it */
+	xp->walk.dead = 1;
+	xfrm_policy_destroy(xp);
+	return NULL;
+}
+
+/*
+ * xfrm_mgr .new_mapping hook: broadcast an SADB_X_NAT_T_NEW_MAPPING message
+ * to the registered PF_KEY sockets of the state's namespace.
+ *
+ * Only ESP states that carry an encapsulation template are reported;
+ * anything else, or an address family pfkey_sockaddr_size() does not know,
+ * yields -EINVAL.
+ *
+ * @ipaddr: new peer address
+ * @sport:  new peer port
+ *
+ * Returns the result of pfkey_broadcast(), -EINVAL or -ENOMEM.
+ */
+static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	struct sadb_sa *sa;
+	struct sadb_address *addr;
+	struct sadb_x_nat_t_port *n_port;
+	struct xfrm_encap_tmpl *natt;
+	int sockaddr_size;
+	int addr_size;
+	int size;
+	__u8 satype;
+
+	sockaddr_size = pfkey_sockaddr_size(x->props.family);
+	if (!sockaddr_size)
+		return -EINVAL;
+
+	if (x->id.proto != IPPROTO_ESP)
+		return -EINVAL;
+	satype = SADB_SATYPE_ESP;
+
+	natt = x->encap;
+	if (!natt)
+		return -EINVAL;
+
+	/* Build an SADB_X_NAT_T_NEW_MAPPING message:
+	 *
+	 * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) |
+	 * ADDRESS_DST (new addr) | NAT_T_DPORT (new port)
+	 */
+
+	addr_size = sizeof(struct sadb_address) + sockaddr_size;
+	size = sizeof(struct sadb_msg) +
+		sizeof(struct sadb_sa) +
+		(addr_size * 2) +
+		(sizeof(struct sadb_x_nat_t_port) * 2);
+
+	skb =  alloc_skb(size + 16, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	/* HDR */
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING;
+	hdr->sadb_msg_satype = satype;
+	hdr->sadb_msg_len = size / sizeof(uint64_t);
+	hdr->sadb_msg_errno = 0;
+	hdr->sadb_msg_reserved = 0;
+	hdr->sadb_msg_seq = x->km.seq = get_acqseq();
+	hdr->sadb_msg_pid = 0;
+
+	/* SA */
+	sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa));
+	sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t);
+	sa->sadb_sa_exttype = SADB_EXT_SA;
+	sa->sadb_sa_spi = x->id.spi;
+	sa->sadb_sa_replay = 0;
+	sa->sadb_sa_state = 0;
+	sa->sadb_sa_auth = 0;
+	sa->sadb_sa_encrypt = 0;
+	sa->sadb_sa_flags = 0;
+
+	/* ADDRESS_SRC (old addr) */
+	addr = (struct sadb_address*) skb_put(skb, addr_size);
+	addr->sadb_address_len = addr_size / sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+	addr->sadb_address_prefixlen =
+		pfkey_sockaddr_fill(&x->props.saddr, 0,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	/* the family was accepted above, so a zero prefix length is a bug */
+	if (!addr->sadb_address_prefixlen)
+		BUG();
+
+	/* NAT_T_SPORT (old port) */
+	n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+	n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+	n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT;
+	n_port->sadb_x_nat_t_port_port = natt->encap_sport;
+	n_port->sadb_x_nat_t_port_reserved = 0;
+
+	/* ADDRESS_DST (new addr) */
+	addr = (struct sadb_address*) skb_put(skb, addr_size);
+	addr->sadb_address_len = addr_size / sizeof(uint64_t);
+	addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+	addr->sadb_address_proto = 0;
+	addr->sadb_address_reserved = 0;
+	addr->sadb_address_prefixlen =
+		pfkey_sockaddr_fill(ipaddr, 0,
+				    (struct sockaddr *) (addr + 1),
+				    x->props.family);
+	if (!addr->sadb_address_prefixlen)
+		BUG();
+
+	/* NAT_T_DPORT (new port) */
+	n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port));
+	n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t);
+	n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT;
+	n_port->sadb_x_nat_t_port_port = sport;
+	n_port->sadb_x_nat_t_port_reserved = 0;
+
+	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+}
+
+#ifdef CONFIG_NET_KEY_MIGRATE
+/*
+ * Append a source or destination sadb_address extension, built from
+ * selector @sel, to @skb.
+ *
+ * @sasize: size of the sockaddr that follows the extension header
+ * @type:   SADB_EXT_ADDRESS_SRC or SADB_EXT_ADDRESS_DST
+ *
+ * Returns 0, or -EINVAL for any other @type (the extension header has
+ * already been appended to the skb at that point).
+ */
+static int set_sadb_address(struct sk_buff *skb, int sasize, int type,
+			    const struct xfrm_selector *sel)
+{
+	struct sadb_address *addr;
+	struct sockaddr *sa;
+
+	addr = (struct sadb_address *)skb_put(skb, sizeof(struct sadb_address) + sasize);
+	addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8;
+	addr->sadb_address_exttype = type;
+	addr->sadb_address_proto = sel->proto;
+	addr->sadb_address_reserved = 0;
+
+	sa = (struct sockaddr *)(addr + 1);
+
+	if (type == SADB_EXT_ADDRESS_SRC) {
+		addr->sadb_address_prefixlen = sel->prefixlen_s;
+		pfkey_sockaddr_fill(&sel->saddr, 0, sa, sel->family);
+	} else if (type == SADB_EXT_ADDRESS_DST) {
+		addr->sadb_address_prefixlen = sel->prefixlen_d;
+		pfkey_sockaddr_fill(&sel->daddr, 0, sa, sel->family);
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Append an SADB_X_EXT_KMADDRESS extension to @skb: the zeroed extension
+ * header followed by the local and remote sockaddrs taken from @k.
+ *
+ * Returns 0, or -EINVAL when pfkey_sockaddr_fill() rejects an address.
+ */
+static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k)
+{
+	int family = k->family;
+	int socklen = pfkey_sockaddr_len(family);
+	int ext_size = sizeof(struct sadb_x_kmaddress) +
+		       pfkey_sockaddr_pair_size(family);
+	struct sadb_x_kmaddress *kma;
+	u8 *addrs;
+
+	kma = (struct sadb_x_kmaddress *)skb_put(skb, ext_size);
+	memset(kma, 0, ext_size);
+	kma->sadb_x_kmaddress_len = ext_size / 8;
+	kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS;
+	kma->sadb_x_kmaddress_reserved = k->reserved;
+
+	/* local address first, remote address socklen bytes later */
+	addrs = (u8 *)(kma + 1);
+	if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)addrs, family))
+		return -EINVAL;
+	if (!pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(addrs+socklen), family))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Append one sadb_x_ipsecrequest, followed by its source/destination
+ * sockaddr pair, to @skb.  The request's length field is stored in bytes.
+ *
+ * Returns 0, or -EINVAL when pfkey_sockaddr_fill() rejects an address.
+ */
+static int set_ipsecrequest(struct sk_buff *skb,
+			    uint8_t proto, uint8_t mode, int level,
+			    uint32_t reqid, uint8_t family,
+			    const xfrm_address_t *src, const xfrm_address_t *dst)
+{
+	int socklen = pfkey_sockaddr_len(family);
+	int req_size = sizeof(struct sadb_x_ipsecrequest) +
+		       pfkey_sockaddr_pair_size(family);
+	struct sadb_x_ipsecrequest *rq;
+	u8 *addrs;
+
+	rq = (struct sadb_x_ipsecrequest *)skb_put(skb, req_size);
+	memset(rq, 0, req_size);
+	rq->sadb_x_ipsecrequest_len = req_size;
+	rq->sadb_x_ipsecrequest_proto = proto;
+	rq->sadb_x_ipsecrequest_mode = mode;
+	rq->sadb_x_ipsecrequest_level = level;
+	rq->sadb_x_ipsecrequest_reqid = reqid;
+
+	/* source address first, destination address socklen bytes later */
+	addrs = (u8 *) (rq + 1);
+	if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)addrs, family))
+		return -EINVAL;
+	if (!pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(addrs + socklen), family))
+		return -EINVAL;
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_NET_KEY_MIGRATE
+/*
+ * xfrm_mgr .migrate hook: build an SADB_X_MIGRATE message and broadcast it
+ * to all PF_KEY sockets of init_net.
+ *
+ * Message layout, in order: sadb_msg header, optional KMADDRESS extension
+ * (when @k is given), selector source and destination addresses, policy
+ * extension, then an "old" and a "new" ipsecrequest for each of the
+ * @num_bundles entries of @m.
+ *
+ * Returns 0 on success (also for policy types other than
+ * XFRM_POLICY_TYPE_MAIN, which are ignored), -EINVAL for bad arguments or
+ * when an extension cannot be built, or -ENOMEM.
+ */
+static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+			      const struct xfrm_migrate *m, int num_bundles,
+			      const struct xfrm_kmaddress *k)
+{
+	int i;
+	int sasize_sel;
+	int size = 0;
+	int size_pol = 0;
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	struct sadb_x_policy *pol;
+	const struct xfrm_migrate *mp;
+
+	if (type != XFRM_POLICY_TYPE_MAIN)
+		return 0;
+
+	if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH)
+		return -EINVAL;
+
+	if (k != NULL) {
+		/* addresses for KM */
+		size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) +
+				     pfkey_sockaddr_pair_size(k->family));
+	}
+
+	/* selector */
+	sasize_sel = pfkey_sockaddr_size(sel->family);
+	if (!sasize_sel)
+		return -EINVAL;
+	size += (sizeof(struct sadb_address) + sasize_sel) * 2;
+
+	/* policy info */
+	size_pol += sizeof(struct sadb_x_policy);
+
+	/* ipsecrequests */
+	for (i = 0, mp = m; i < num_bundles; i++, mp++) {
+		/* old locator pair */
+		size_pol += sizeof(struct sadb_x_ipsecrequest) +
+			    pfkey_sockaddr_pair_size(mp->old_family);
+		/* new locator pair */
+		size_pol += sizeof(struct sadb_x_ipsecrequest) +
+			    pfkey_sockaddr_pair_size(mp->new_family);
+	}
+
+	size += sizeof(struct sadb_msg) + size_pol;
+
+	/* alloc buffer */
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	hdr = (struct sadb_msg *)skb_put(skb, sizeof(struct sadb_msg));
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_type = SADB_X_MIGRATE;
+	hdr->sadb_msg_satype = pfkey_proto2satype(m->proto);
+	hdr->sadb_msg_len = size / 8;
+	hdr->sadb_msg_errno = 0;
+	hdr->sadb_msg_reserved = 0;
+	hdr->sadb_msg_seq = 0;
+	hdr->sadb_msg_pid = 0;
+
+	/*
+	 * Addresses to be used by KM for negotiation, if ext is available.
+	 * The skb is ours until it is broadcast, so failures must free it.
+	 */
+	if (k != NULL && (set_sadb_kmaddress(skb, k) < 0))
+		goto err;
+
+	/* selector src */
+	set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel);
+
+	/* selector dst */
+	set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel);
+
+	/*
+	 * policy information; every field is written explicitly because the
+	 * skb data is not zeroed here and the message goes out to userspace
+	 */
+	pol = (struct sadb_x_policy *)skb_put(skb, sizeof(struct sadb_x_policy));
+	pol->sadb_x_policy_len = size_pol / 8;
+	pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
+	pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC;
+	pol->sadb_x_policy_dir = dir + 1;
+	pol->sadb_x_policy_reserved = 0;
+	pol->sadb_x_policy_id = 0;
+	pol->sadb_x_policy_priority = 0;
+
+	for (i = 0, mp = m; i < num_bundles; i++, mp++) {
+		/* old ipsecrequest */
+		int mode = pfkey_mode_from_xfrm(mp->mode);
+		if (mode < 0)
+			goto err;
+		if (set_ipsecrequest(skb, mp->proto, mode,
+				     (mp->reqid ?  IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE),
+				     mp->reqid, mp->old_family,
+				     &mp->old_saddr, &mp->old_daddr) < 0)
+			goto err;
+
+		/* new ipsecrequest */
+		if (set_ipsecrequest(skb, mp->proto, mode,
+				     (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE),
+				     mp->reqid, mp->new_family,
+				     &mp->new_saddr, &mp->new_daddr) < 0)
+			goto err;
+	}
+
+	/* broadcast migrate message to sockets */
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net);
+
+	return 0;
+
+err:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+#else
+/*
+ * Fallback built when CONFIG_NET_KEY_MIGRATE is not defined: migrate
+ * notifications are always refused with -ENOPROTOOPT.
+ */
+static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+			      const struct xfrm_migrate *m, int num_bundles,
+			      const struct xfrm_kmaddress *k)
+{
+	return -ENOPROTOOPT;
+}
+#endif
+
+/*
+ * sendmsg() handler of PF_KEY sockets: copy the user message into a fresh
+ * skb, validate its base header and process it with xfrm_cfg_mutex held.
+ *
+ * When processing fails after the header was validated, the error is
+ * reported back through pfkey_error(); if that succeeds the call itself
+ * counts as successful.
+ *
+ * Returns @len on success or a negative errno.
+ */
+static int pfkey_sendmsg(struct kiocb *kiocb,
+			 struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb = NULL;
+	struct sadb_msg *hdr = NULL;
+	int err;
+
+	if (msg->msg_flags & MSG_OOB) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if ((unsigned)len > sk->sk_sndbuf - 32) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (skb == NULL) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	/* sets err on every path */
+	hdr = pfkey_get_base_msg(skb, &err);
+	if (!hdr)
+		goto out;
+
+	mutex_lock(&xfrm_cfg_mutex);
+	err = pfkey_process(sk, skb, hdr);
+	mutex_unlock(&xfrm_cfg_mutex);
+
+out:
+	if (err && hdr && pfkey_error(hdr, err, sk) == 0)
+		err = 0;
+	kfree_skb(skb);
+
+	return err ? err : len;
+}
+
+/*
+ * recvmsg() handler of PF_KEY sockets: hand one queued message to the user.
+ *
+ * A message longer than @len is truncated and MSG_TRUNC is set in
+ * msg->msg_flags.  After a successful copy, a pending dump is continued
+ * once the receive queue uses at most a third of the receive buffer.
+ *
+ * Returns the number of bytes copied (or the full message length when the
+ * caller passed MSG_TRUNC), or a negative errno.
+ */
+static int pfkey_recvmsg(struct kiocb *kiocb,
+			 struct socket *sock, struct msghdr *msg, size_t len,
+			 int flags)
+{
+	struct sock *sk = sock->sk;
+	struct pfkey_sock *pfk = pfkey_sk(sk);
+	struct sk_buff *skb;
+	int copied;
+	int err = -EINVAL;
+
+	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
+		return err;
+
+	msg->msg_namelen = 0;
+	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		return err;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	skb_reset_transport_header(skb);
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (!err) {
+		sock_recv_ts_and_drops(msg, sk, skb);
+
+		err = (flags & MSG_TRUNC) ? skb->len : copied;
+
+		if (pfk->dump.dump != NULL &&
+		    3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+			pfkey_do_dump(pfk);
+	}
+
+	skb_free_datagram(sk, skb);
+	return err;
+}
+
+/*
+ * Socket operations of PF_KEY sockets.  Only release, poll, sendmsg and
+ * recvmsg are implemented; everything else is wired to the generic
+ * sock_no_*() stubs.
+ */
+static const struct proto_ops pfkey_ops = {
+	.family		=	PF_KEY,
+	.owner		=	THIS_MODULE,
+	/* Operations that make no sense on pfkey sockets. */
+	.bind		=	sock_no_bind,
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.accept		=	sock_no_accept,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.setsockopt	=	sock_no_setsockopt,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.sendpage	=	sock_no_sendpage,
+
+	/* Now the operations that really occur. */
+	.release	=	pfkey_release,
+	.poll		=	datagram_poll,
+	.sendmsg	=	pfkey_sendmsg,
+	.recvmsg	=	pfkey_recvmsg,
+};
+
+/* Protocol family descriptor for PF_KEY, passed to sock_register() at module init. */
+static const struct net_proto_family pfkey_family_ops = {
+	.family	=	PF_KEY,
+	.create	=	pfkey_create,
+	.owner	=	THIS_MODULE,
+};
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file .show callback: print the column header for the start token,
+ * otherwise one line describing the socket behind @v (pointer, refcount,
+ * receive/send memory, owner uid and inode).
+ */
+static int pfkey_seq_show(struct seq_file *f, void *v)
+{
+	struct sock *s;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(f ,"sk       RefCnt Rmem   Wmem   User   Inode\n");
+		return 0;
+	}
+
+	s = sk_entry(v);
+	seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n",
+		   s,
+		   atomic_read(&s->sk_refcnt),
+		   sk_rmem_alloc_get(s),
+		   sk_wmem_alloc_get(s),
+		   sock_i_uid(s),
+		   sock_i_ino(s));
+	return 0;
+}
+
+/*
+ * seq_file .start callback: take the RCU read lock (dropped again in
+ * pfkey_seq_stop()) and position the iterator at *ppos in the namespace's
+ * PF_KEY socket list.
+ */
+static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos)
+	__acquires(rcu)
+{
+	struct netns_pfkey *net_pfkey;
+
+	net_pfkey = net_generic(seq_file_net(f), pfkey_net_id);
+
+	rcu_read_lock();
+	return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos);
+}
+
+/*
+ * seq_file .next callback: advance the iterator to the entry after @v in
+ * the namespace's PF_KEY socket list.
+ */
+static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct netns_pfkey *net_pfkey;
+
+	net_pfkey = net_generic(seq_file_net(f), pfkey_net_id);
+
+	return seq_hlist_next_rcu(v, &net_pfkey->table, ppos);
+}
+
+/* seq_file .stop callback: drop the RCU read lock taken in pfkey_seq_start(). */
+static void pfkey_seq_stop(struct seq_file *f, void *v)
+	__releases(rcu)
+{
+	rcu_read_unlock();
+}
+
+/* seq_file iterator used for the per-namespace "pfkey" proc entry. */
+static const struct seq_operations pfkey_seq_ops = {
+	.start	= pfkey_seq_start,
+	.next	= pfkey_seq_next,
+	.stop	= pfkey_seq_stop,
+	.show	= pfkey_seq_show,
+};
+
+/*
+ * open() handler of the proc entry: set up a namespace-aware seq_file that
+ * iterates with pfkey_seq_ops.
+ */
+static int pfkey_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &pfkey_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+/* File operations of the "pfkey" proc entry created by pfkey_init_proc(). */
+static const struct file_operations pfkey_proc_ops = {
+	.open	 = pfkey_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * Create the "pfkey" proc entry for @net.
+ * Returns 0, or -ENOMEM when the entry cannot be created.
+ */
+static int __net_init pfkey_init_proc(struct net *net)
+{
+	if (!proc_net_fops_create(net, "pfkey", 0, &pfkey_proc_ops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Remove the "pfkey" proc entry of @net again. */
+static void __net_exit pfkey_exit_proc(struct net *net)
+{
+	proc_net_remove(net, "pfkey");
+}
+#else
+/* Without CONFIG_PROC_FS there is no proc entry to create; always succeeds. */
+static inline int pfkey_init_proc(struct net *net)
+{
+	return 0;
+}
+
+/* Without CONFIG_PROC_FS there is no proc entry to remove. */
+static inline void pfkey_exit_proc(struct net *net)
+{
+}
+#endif
+
+/*
+ * Key manager descriptor registered with xfrm_register_km() at module init;
+ * it routes the xfrm callbacks to the PF_KEY implementations in this file.
+ */
+static struct xfrm_mgr pfkeyv2_mgr =
+{
+	.id		= "pfkeyv2",
+	.notify		= pfkey_send_notify,
+	.acquire	= pfkey_send_acquire,
+	.compile_policy	= pfkey_compile_policy,
+	.new_mapping	= pfkey_send_new_mapping,
+	.notify_policy	= pfkey_send_policy_notify,
+	.migrate	= pfkey_send_migrate,
+};
+
+/*
+ * Per-namespace init: start with an empty socket list and a socket count
+ * of zero, then create the proc entry.  Returns the result of
+ * pfkey_init_proc().
+ */
+static int __net_init pfkey_net_init(struct net *net)
+{
+	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+	INIT_HLIST_HEAD(&net_pfkey->table);
+	atomic_set(&net_pfkey->socks_nr, 0);
+
+	return pfkey_init_proc(net);
+}
+
+/*
+ * Per-namespace teardown: remove the proc entry and assert that no PF_KEY
+ * socket is left on the namespace's list.
+ */
+static void __net_exit pfkey_net_exit(struct net *net)
+{
+	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
+
+	pfkey_exit_proc(net);
+	BUG_ON(!hlist_empty(&net_pfkey->table));
+}
+
+/*
+ * Per-namespace registration: a struct netns_pfkey is requested for every
+ * namespace via .size and located through pfkey_net_id.
+ */
+static struct pernet_operations pfkey_net_ops = {
+	.init = pfkey_net_init,
+	.exit = pfkey_net_exit,
+	.id   = &pfkey_net_id,
+	.size = sizeof(struct netns_pfkey),
+};
+
+/*
+ * Module exit: undo the registrations of ipsec_pfkey_init() in reverse
+ * order.
+ */
+static void __exit ipsec_pfkey_exit(void)
+{
+	xfrm_unregister_km(&pfkeyv2_mgr);
+	sock_unregister(PF_KEY);
+	unregister_pernet_subsys(&pfkey_net_ops);
+	proto_unregister(&key_proto);
+}
+
+/*
+ * Module init: register, in order, the protocol, the per-namespace
+ * operations, the PF_KEY socket family and the key manager.  If a step
+ * fails, the steps already completed are undone in reverse order and the
+ * error is returned.
+ */
+static int __init ipsec_pfkey_init(void)
+{
+	int err;
+
+	err = proto_register(&key_proto, 0);
+	if (err != 0)
+		return err;
+
+	err = register_pernet_subsys(&pfkey_net_ops);
+	if (err != 0)
+		goto out_unregister_key_proto;
+
+	err = sock_register(&pfkey_family_ops);
+	if (err != 0)
+		goto out_unregister_pernet;
+
+	err = xfrm_register_km(&pfkeyv2_mgr);
+	if (err != 0)
+		goto out_sock_unregister;
+
+	return 0;
+
+out_sock_unregister:
+	sock_unregister(PF_KEY);
+out_unregister_pernet:
+	unregister_pernet_subsys(&pfkey_net_ops);
+out_unregister_key_proto:
+	proto_unregister(&key_proto);
+	return err;
+}
+
+module_init(ipsec_pfkey_init);
+module_exit(ipsec_pfkey_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KEY);
diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig
new file mode 100644
index 00000000..4b1e7175
--- /dev/null
+++ b/net/l2tp/Kconfig
@@ -0,0 +1,107 @@
+#
+# Layer Two Tunneling Protocol (L2TP)
+#
+
+menuconfig L2TP
+	tristate "Layer Two Tunneling Protocol (L2TP)"
+	depends on INET
+	---help---
+	  Layer Two Tunneling Protocol
+
+	  From RFC 2661 <http://www.ietf.org/rfc/rfc2661.txt>.
+
+	  L2TP facilitates the tunneling of packets across an
+	  intervening network in a way that is as transparent as
+	  possible to both end-users and applications.
+
+	  L2TP is often used to tunnel PPP traffic over IP
+	  tunnels. One IP tunnel may carry thousands of individual PPP
+	  connections. L2TP is also used as a VPN protocol, popular
+	  with home workers to connect to their offices.
+
+	  L2TPv3 allows other protocols as well as PPP to be carried
+	  over L2TP tunnels. L2TPv3 is defined in RFC 3931
+	  <http://www.ietf.org/rfc/rfc3931.txt>.
+
+	  The kernel component handles only L2TP data packets: a
+	  userland daemon handles the L2TP control protocol (tunnel
+	  and session setup). One such daemon is OpenL2TP
+	  (http://openl2tp.org/).
+
+	  If you don't need L2TP, say N. To compile all L2TP code as
+	  modules, choose M here.
+
+config L2TP_DEBUGFS
+	tristate "L2TP debugfs support"
+	depends on L2TP && DEBUG_FS
+	help
+	  Support for l2tp directory in debugfs filesystem. This may be
+	  used to dump internal state of the l2tp drivers for problem
+	  analysis.
+
+	  If unsure, say 'Y'.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called l2tp_debugfs.
+
+config L2TP_V3
+	bool "L2TPv3 support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && L2TP
+	help
+	  Layer Two Tunneling Protocol Version 3
+
+	  From RFC 3931 <http://www.ietf.org/rfc/rfc3931.txt>.
+
+	  The Layer Two Tunneling Protocol (L2TP) provides a dynamic
+	  mechanism for tunneling Layer 2 (L2) "circuits" across a
+	  packet-oriented data network (e.g., over IP).  L2TP, as
+	  originally defined in RFC 2661, is a standard method for
+	  tunneling Point-to-Point Protocol (PPP) [RFC1661] sessions.
+	  L2TP has since been adopted for tunneling a number of other
+	  L2 protocols, including ATM, Frame Relay, HDLC and even raw
+	  ethernet frames.
+
+	  If you are connecting to L2TPv3 equipment, or you want to
+	  tunnel raw ethernet frames using L2TP, say Y here. If
+	  unsure, say N.
+
+config L2TP_IP
+	tristate "L2TP IP encapsulation for L2TPv3"
+	depends on L2TP_V3
+	help
+	  Support for L2TP-over-IP socket family.
+
+	  The L2TPv3 protocol defines two possible encapsulations for
+	  L2TP frames, namely UDP and plain IP (without UDP). This
+	  driver provides a new L2TPIP socket family with which
+	  userspace L2TPv3 daemons may create L2TP/IP tunnel sockets
+	  when UDP encapsulation is not required. When L2TP is carried
+	  in IP packets, it uses IP protocol number 115, so this
+	  protocol must be allowed through firewalls.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called l2tp_ip.
+
+config L2TP_ETH
+	tristate "L2TP ethernet pseudowire support for L2TPv3"
+	depends on L2TP_V3
+	help
+	  Support for carrying raw ethernet frames over L2TPv3.
+
+	  From RFC 4719 <http://www.ietf.org/rfc/rfc4719.txt>.
+
+	  The Layer 2 Tunneling Protocol, Version 3 (L2TPv3) can be
+	  used as a control protocol and for data encapsulation to set
+	  up Pseudowires for transporting layer 2 Packet Data Units
+	  across an IP network [RFC3931].
+
+	  This driver provides an ethernet virtual interface for each
+	  L2TP ethernet pseudowire instance. Standard Linux tools may
+	  be used to assign an IP address to the local virtual
+	  interface, or add the interface to a bridge.
+
+	  If you are using L2TPv3, you will almost certainly want to
+	  enable this option.
+
+	  To compile this driver as a module, choose M here. The module
+	  will be called l2tp_eth.
diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile
new file mode 100644
index 00000000..110e7bc2
--- /dev/null
+++ b/net/l2tp/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the L2TP.
+#
+
+obj-$(CONFIG_L2TP) += l2tp_core.o
+
+# Build l2tp as modules if L2TP is M.
+# $(subst y,$(CONFIG_L2TP),$(CONFIG_FOO)) rewrites a sub-option's "y" to
+# the value of CONFIG_L2TP, so a sub-option set to y follows the core: it
+# lands in obj-m when the core is modular. An "m" or empty value is left
+# as it is.
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_V3)) += l2tp_netlink.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_ETH)) += l2tp_eth.o
+obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_DEBUGFS)) += l2tp_debugfs.o
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
new file mode 100644
index 00000000..71c292e3
--- /dev/null
+++ b/net/l2tp/l2tp_core.c
@@ -0,0 +1,1703 @@
+/*
+ * L2TP core.
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ * This file contains some code of the original L2TPv2 pppol2tp
+ * driver, which has the following copyright:
+ *
+ * Authors:	Martijn van Oosterhout <kleptog@svana.org>
+ *		James Chapman (jchapman@katalix.com)
+ * Contributors:
+ *		Michal Ostrowski <mostrows@speakeasy.net>
+ *		Arnaldo Carvalho de Melo <acme@xconectiva.com.br>
+ *		David S. Miller (davem@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/l2tp.h>
+#include <linux/hash.h>
+#include <linux/sort.h>
+#include <linux/file.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#include <net/protocol.h>
+
+#include <asm/byteorder.h>
+#include <asm/atomic.h>
+
+#include "l2tp_core.h"
+
+#define L2TP_DRV_VERSION	"V2.0"
+
+/* L2TP header constants */
+#define L2TP_HDRFLAG_T	   0x8000
+#define L2TP_HDRFLAG_L	   0x4000
+#define L2TP_HDRFLAG_S	   0x0800
+#define L2TP_HDRFLAG_O	   0x0200
+#define L2TP_HDRFLAG_P	   0x0100
+
+#define L2TP_HDR_VER_MASK  0x000F
+#define L2TP_HDR_VER_2	   0x0002
+#define L2TP_HDR_VER_3	   0x0003
+
+/* L2TPv3 default L2-specific sublayer */
+#define L2TP_SLFLAG_S	   0x40000000
+#define L2TP_SL_SEQ_MASK   0x00ffffff
+
+/* L2TPv2 data header sizes in bytes: flags, tunnel id and session id make
+ * 6; the Ns and Nr fields add 4 more. The larger value is also the minimum
+ * length pulled by l2tp_udp_recv_core() before the header is parsed.
+ */
+#define L2TP_HDR_SIZE_SEQ		10
+#define L2TP_HDR_SIZE_NOSEQ		6
+
+/* Default trace flags */
+#define L2TP_DEFAULT_DEBUG_FLAGS	0
+
+/* Conditional trace: print only when a bit of @_type is set in @_mask.
+ * @_lvl is pasted in front of the format string as the printk level and
+ * every message is prefixed with "L2TP: ".
+ */
+#define PRINTK(_mask, _type, _lvl, _fmt, args...)			\
+	do {								\
+		if ((_mask) & (_type))					\
+			printk(_lvl "L2TP: " _fmt, ##args);		\
+	} while (0)
+
+/* Private data stored for received packets in the skb.
+ */
+struct l2tp_skb_cb {
+	u32			ns;		/* sequence number read from the header */
+	u16			has_seq;	/* non-zero when ns is valid */
+	u16			length;		/* packet length counted in the rx byte stats */
+	unsigned long		expires;	/* jiffies value after which a queued skb is dropped */
+};
+
+/* Our private area lives in skb->cb[], directly after struct inet_skb_parm.
+ * The argument is parenthesised so any pointer expression can be passed.
+ */
+#define L2TP_SKB_CB(skb)	((struct l2tp_skb_cb *) &(skb)->cb[sizeof(struct inet_skb_parm)])
+
+static atomic_t l2tp_tunnel_count;
+static atomic_t l2tp_session_count;
+
+/* per-net private data for this module */
+static unsigned int l2tp_net_id;
+struct l2tp_net {
+	/* every tunnel of this namespace; the lookup helpers walk it under RCU */
+	struct list_head l2tp_tunnel_list;
+	/* presumably serialises writers of l2tp_tunnel_list - confirm at the update sites */
+	spinlock_t l2tp_tunnel_list_lock;
+	/* sessions hashed by session_id, see l2tp_session_id_hash_2() */
+	struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2];
+	/* presumably serialises writers of l2tp_session_hlist - confirm at the update sites */
+	spinlock_t l2tp_session_hlist_lock;
+};
+
+static void l2tp_session_set_header_len(struct l2tp_session *session, int version);
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
+static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
+
+/* Return this module's private data for @net. A NULL @net trips BUG_ON(). */
+static inline struct l2tp_net *l2tp_pernet(struct net *net)
+{
+	BUG_ON(!net);
+
+	return net_generic(net, l2tp_net_id);
+}
+
+
+/* Tunnel reference counts. Incremented per session that is added to
+ * the tunnel.
+ */
+static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel)
+{
+	atomic_inc(&tunnel->ref_count);
+}
+
+/* Drop one reference; the tunnel is freed when the count reaches zero. */
+static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel)
+{
+	if (atomic_dec_and_test(&tunnel->ref_count))
+		l2tp_tunnel_free(tunnel);
+}
+
+/* With L2TP_REFCNT_DEBUG defined, every inc/dec first logs the call site
+ * and the count as it was before the operation. The macro argument is
+ * parenthesised at every use so any pointer expression can be passed.
+ */
+#ifdef L2TP_REFCNT_DEBUG
+#define l2tp_tunnel_inc_refcount(_t) do { \
+		printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&(_t)->ref_count)); \
+		l2tp_tunnel_inc_refcount_1(_t);				\
+	} while (0)
+#define l2tp_tunnel_dec_refcount(_t) do { \
+		printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&(_t)->ref_count)); \
+		l2tp_tunnel_dec_refcount_1(_t);				\
+	} while (0)
+#else
+#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t)
+#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t)
+#endif
+
+/* Session hash global list for L2TPv3.
+ * RFC3931 says the session_id SHOULD be random, but several L2TP
+ * implementations hand out incrementing session_ids, so the bucket is
+ * picked with a real hash of the session_id, not a simple bitmask.
+ */
+static inline struct hlist_head *
+l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id)
+{
+	u32 bucket = hash_32(session_id, L2TP_HASH_BITS_2);
+
+	return &pn->l2tp_session_hlist[bucket];
+}
+
+/* Lookup a session by id in the global session list of @net.
+ * Returns the matching session, or NULL when there is none.
+ * NOTE(review): the RCU read section ends before the pointer is returned
+ * and no reference is taken here - confirm how callers keep it alive.
+ */
+static struct l2tp_session *l2tp_session_find_2(struct net *net, u32 session_id)
+{
+	struct l2tp_net *pn = l2tp_pernet(net);
+	struct hlist_head *bucket = l2tp_session_id_hash_2(pn, session_id);
+	struct l2tp_session *found = NULL;
+	struct l2tp_session *pos;
+	struct hlist_node *node;
+
+	rcu_read_lock_bh();
+	hlist_for_each_entry_rcu(pos, node, bucket, global_hlist) {
+		if (pos->session_id != session_id)
+			continue;
+		found = pos;
+		break;
+	}
+	rcu_read_unlock_bh();
+
+	return found;
+}
+
+/* Session hash list (per tunnel).
+ * RFC2661 says the session_id SHOULD be random, but several L2TP
+ * implementations (Cisco and Microsoft) hand out incrementing
+ * session_ids, so the bucket is picked with a real hash of the
+ * session_id, not a simple bitmask.
+ */
+static inline struct hlist_head *
+l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id)
+{
+	u32 bucket = hash_32(session_id, L2TP_HASH_BITS);
+
+	return &tunnel->session_hlist[bucket];
+}
+
+/* Lookup a session by id.
+ * With a tunnel, only that tunnel's hash list is searched (under its
+ * hlist_lock). With @tunnel == NULL the namespace-wide list is searched
+ * instead: in L2TPv3 session_ids are unique over all tunnels and are
+ * sometimes needed before the tunnel is known.
+ * Returns the session, or NULL when no session has that id.
+ */
+struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id)
+{
+	struct l2tp_session *found = NULL;
+	struct l2tp_session *pos;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+
+	if (!tunnel)
+		return l2tp_session_find_2(net, session_id);
+
+	bucket = l2tp_session_id_hash(tunnel, session_id);
+
+	read_lock_bh(&tunnel->hlist_lock);
+	hlist_for_each_entry(pos, node, bucket, hlist) {
+		if (pos->session_id != session_id)
+			continue;
+		found = pos;
+		break;
+	}
+	read_unlock_bh(&tunnel->hlist_lock);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_find);
+
+/* Return the session at zero-based position @nth when the tunnel's hash
+ * buckets are walked in index order, or NULL when the tunnel holds fewer
+ * sessions than that. The walk runs under the tunnel's hlist_lock.
+ */
+struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth)
+{
+	struct l2tp_session *found = NULL;
+	struct l2tp_session *pos;
+	struct hlist_node *node;
+	int seen = 0;
+	int bucket;
+
+	read_lock_bh(&tunnel->hlist_lock);
+	for (bucket = 0; bucket < L2TP_HASH_SIZE; bucket++) {
+		hlist_for_each_entry(pos, node, &tunnel->session_hlist[bucket], hlist) {
+			if (++seen > nth) {
+				found = pos;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	read_unlock_bh(&tunnel->hlist_lock);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_find_nth);
+
+/* Lookup a session by interface name.
+ * Every bucket of the namespace-wide session hash is scanned, so this is
+ * very inefficient; it is only used by management interfaces.
+ * Returns the first session whose ifname matches exactly, else NULL.
+ */
+struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname)
+{
+	struct l2tp_net *pn = l2tp_pernet(net);
+	struct l2tp_session *found = NULL;
+	struct l2tp_session *pos;
+	struct hlist_node *node;
+	int bucket;
+
+	rcu_read_lock_bh();
+	for (bucket = 0; bucket < L2TP_HASH_SIZE_2; bucket++) {
+		hlist_for_each_entry_rcu(pos, node, &pn->l2tp_session_hlist[bucket], global_hlist) {
+			if (strcmp(pos->ifname, ifname) == 0) {
+				found = pos;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock_bh();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_find_by_ifname);
+
+/* Lookup a tunnel by id in the tunnel list of @net.
+ * Returns the tunnel, or NULL when no tunnel has that id.
+ */
+struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id)
+{
+	struct l2tp_net *pn = l2tp_pernet(net);
+	struct l2tp_tunnel *found = NULL;
+	struct l2tp_tunnel *pos;
+
+	rcu_read_lock_bh();
+	list_for_each_entry_rcu(pos, &pn->l2tp_tunnel_list, list) {
+		if (pos->tunnel_id != tunnel_id)
+			continue;
+		found = pos;
+		break;
+	}
+	rcu_read_unlock_bh();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_find);
+
+/* Return the tunnel at zero-based position @nth in the tunnel list of
+ * @net, or NULL when the list is shorter than that.
+ */
+struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth)
+{
+	struct l2tp_net *pn = l2tp_pernet(net);
+	struct l2tp_tunnel *found = NULL;
+	struct l2tp_tunnel *pos;
+	int seen = 0;
+
+	rcu_read_lock_bh();
+	list_for_each_entry_rcu(pos, &pn->l2tp_tunnel_list, list) {
+		if (++seen > nth) {
+			found = pos;
+			break;
+		}
+	}
+	rcu_read_unlock_bh();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth);
+
+/*****************************************************************************
+ * Receive data handling
+ *****************************************************************************/
+
+/* Queue a skb in order. We come here only if the skb has an L2TP sequence
+ * number.
+ *
+ * The skb is inserted in front of the first queued skb with a larger ns
+ * (counted as an out-of-sequence packet), or appended at the tail when no
+ * such skb exists. The whole operation runs under the reorder queue lock.
+ * NOTE(review): the plain '>' comparison does not allow for sequence
+ * number wrap-around.
+ */
+static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+	struct sk_buff *skbp;
+	struct sk_buff *tmp;
+	u32 ns = L2TP_SKB_CB(skb)->ns;
+
+	spin_lock_bh(&session->reorder_q.lock);
+	skb_queue_walk_safe(&session->reorder_q, skbp, tmp) {
+		if (L2TP_SKB_CB(skbp)->ns > ns) {
+			__skb_queue_before(&session->reorder_q, skbp, skb);
+			/* ns is a u32 (24 bits are used by L2TPv3), so it is
+			 * printed with %u; %hu would cut it to 16 bits.
+			 */
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+			       "%s: pkt %u, inserted before %u, reorder_q len=%d\n",
+			       session->name, ns, L2TP_SKB_CB(skbp)->ns,
+			       skb_queue_len(&session->reorder_q));
+			session->stats.rx_oos_packets++;
+			goto out;
+		}
+	}
+
+	__skb_queue_tail(&session->reorder_q, skb);
+
+out:
+	spin_unlock_bh(&session->reorder_q.lock);
+}
+
+/* Dequeue a single skb.
+ *
+ * Updates the tunnel and session rx counters, advances the session's
+ * expected sequence number (nr) if the skb carried one, then passes the
+ * skb to the session's recv_skb handler, or frees it when there is no
+ * handler. Finally calls the session's deref hook, if set; this pairs
+ * with the ref hook called in l2tp_recv_common().
+ */
+static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	int length = L2TP_SKB_CB(skb)->length;
+
+	/* We're about to requeue the skb, so return resources
+	 * to its current owner (a socket receive buffer).
+	 */
+	skb_orphan(skb);
+
+	tunnel->stats.rx_packets++;
+	tunnel->stats.rx_bytes += length;
+	session->stats.rx_packets++;
+	session->stats.rx_bytes += length;
+
+	if (L2TP_SKB_CB(skb)->has_seq) {
+		/* Bump our Nr: 16 bits wide for L2TPv2, 24 bits otherwise */
+		session->nr++;
+		if (tunnel->version == L2TP_HDR_VER_2)
+			session->nr &= 0xffff;
+		else
+			session->nr &= 0xffffff;
+
+		/* %u, not %hu: nr may use up to 24 bits */
+		PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+		       "%s: updated nr to %u\n", session->name, session->nr);
+	}
+
+	/* call private receive handler */
+	if (session->recv_skb != NULL)
+		(*session->recv_skb)(session, skb, length);
+	else
+		kfree_skb(skb);
+
+	if (session->deref)
+		(*session->deref)(session);
+}
+
+/* Dequeue skbs from the session's reorder_q, subject to packet order.
+ * Skbs that have been in the queue for too long are simply discarded.
+ *
+ * The queue lock is dropped while an skb is delivered. Once it has been
+ * dropped, the saved "next" cursor of skb_queue_walk_safe() can no longer
+ * be trusted: another context may have unlinked or freed that skb in the
+ * meantime. The walk is therefore restarted from the head of the queue
+ * after every delivery.
+ */
+static void l2tp_recv_dequeue(struct l2tp_session *session)
+{
+	struct sk_buff *skb;
+	struct sk_buff *tmp;
+
+	/* If the pkt at the head of the queue has the nr that we
+	 * expect to send up next, dequeue it and any other
+	 * in-sequence packets behind it.
+	 */
+start:
+	spin_lock_bh(&session->reorder_q.lock);
+	skb_queue_walk_safe(&session->reorder_q, skb, tmp) {
+		if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) {
+			session->stats.rx_seq_discards++;
+			session->stats.rx_errors++;
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+			       "%s: oos pkt %u len %d discarded (too old), "
+			       "waiting for %u, reorder_q_len=%d\n",
+			       session->name, L2TP_SKB_CB(skb)->ns,
+			       L2TP_SKB_CB(skb)->length, session->nr,
+			       skb_queue_len(&session->reorder_q));
+			__skb_unlink(skb, &session->reorder_q);
+			kfree_skb(skb);
+			if (session->deref)
+				(*session->deref)(session);
+			/* The lock was held throughout, so tmp is still valid */
+			continue;
+		}
+
+		if (L2TP_SKB_CB(skb)->has_seq) {
+			if (L2TP_SKB_CB(skb)->ns != session->nr) {
+				PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+				       "%s: holding oos pkt %u len %d, "
+				       "waiting for %u, reorder_q_len=%d\n",
+				       session->name, L2TP_SKB_CB(skb)->ns,
+				       L2TP_SKB_CB(skb)->length, session->nr,
+				       skb_queue_len(&session->reorder_q));
+				goto out;
+			}
+		}
+		__skb_unlink(skb, &session->reorder_q);
+
+		/* Process the skb. We release the queue lock while we
+		 * do so to let other contexts process the queue, then
+		 * rescan from the head because tmp may now be stale.
+		 */
+		spin_unlock_bh(&session->reorder_q.lock);
+		l2tp_recv_dequeue_skb(session, skb);
+		goto start;
+	}
+
+out:
+	spin_unlock_bh(&session->reorder_q.lock);
+}
+
+/* Verify the UDP checksum of a packet received on tunnel socket @sk.
+ * Returns 0 when the packet is acceptable: checking is disabled on the
+ * socket, the skb is marked as not needing a check, the sender supplied
+ * no checksum, or the checksum is correct. Otherwise the result of
+ * __skb_checksum_complete() is returned.
+ */
+static inline int l2tp_verify_udp_checksum(struct sock *sk,
+					   struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+	u16 ulen = ntohs(uh->len);
+	struct inet_sock *inet;
+	__wsum psum;
+
+	/* Nothing to verify in any of these cases */
+	if (sk->sk_no_check || skb_csum_unnecessary(skb) || !uh->check)
+		return 0;
+
+	/* Pseudo-header sum built from the socket's addresses.
+	 * NOTE(review): this relies on the socket addresses matching the
+	 * packet's IP header - confirm for unconnected sockets.
+	 */
+	inet = inet_sk(sk);
+	psum = csum_tcpudp_nofold(inet->inet_saddr, inet->inet_daddr, ulen,
+				  IPPROTO_UDP, 0);
+
+	/* A full packet sum came with the skb: add the pseudo-header and fold */
+	if ((skb->ip_summed == CHECKSUM_COMPLETE) &&
+	    !csum_fold(csum_add(psum, skb->csum)))
+		return 0;
+
+	/* Otherwise seed skb->csum and checksum the whole packet */
+	skb->csum = psum;
+
+	return __skb_checksum_complete(skb);
+}
+
+/* Do receive processing of L2TP data frames. We handle both L2TPv2
+ * and L2TPv3 data frames here.
+ *
+ * L2TPv2 Data Message Header
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |T|L|x|x|S|x|O|P|x|x|x|x|  Ver  |          Length (opt)         |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |           Tunnel ID           |           Session ID          |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |             Ns (opt)          |             Nr (opt)          |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |      Offset Size (opt)        |    Offset pad... (opt)
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Data frames are marked by T=0. All other fields are the same as
+ * those in L2TP control frames.
+ *
+ * L2TPv3 Data Message Header
+ *
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                      L2TP Session Header                      |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                      L2-Specific Sublayer                     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                        Tunnel Payload                      ...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 Session Header Over IP
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                           Session ID                          |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |               Cookie (optional, maximum 64 bits)...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *                                                                 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 L2-Specific Sublayer Format
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |x|S|x|x|x|x|x|x|              Sequence Number                  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Cookie value, sublayer format and offset (pad) are negotiated with
+ * the peer when the session is set up. Unlike L2TPv2, we do not need
+ * to parse the packet header to determine if optional fields are
+ * present.
+ *
+ * Caller must already have parsed the frame and determined that it is
+ * a data (not control) frame before coming here. Fields up to the
+ * session-id have already been parsed and ptr points to the data
+ * after the session-id.
+ */
+/* Parameters as used below: @ptr is the read cursor (just past the
+ * session-id), @optr the start of the L2TP header, @hdrflags the L2TPv2
+ * flag word (only consulted for L2TPv2 tunnels), @length the value that
+ * is recorded as the packet length for the rx statistics, and
+ * @payload_hook an optional callback run once the header has been
+ * stripped (a non-zero return discards the packet).
+ * The skb is always consumed: it is either queued/delivered or freed.
+ * NOTE(review): the cookie, sequence and offset fields are read through
+ * @ptr before pskb_may_pull() is called - confirm that callers have made
+ * enough of the header linear.
+ */
+void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
+		      unsigned char *ptr, unsigned char *optr, u16 hdrflags,
+		      int length, int (*payload_hook)(struct sk_buff *skb))
+{
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	int offset;
+	u32 ns, nr;
+
+	/* The ref count is increased since we now hold a pointer to
+	 * the session. Take care to decrement the refcnt when exiting
+	 * this function from now on...
+	 */
+	l2tp_session_inc_refcount(session);
+	if (session->ref)
+		(*session->ref)(session);
+
+	/* Parse and check optional cookie */
+	if (session->peer_cookie_len > 0) {
+		if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
+			PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO,
+			       "%s: cookie mismatch (%u/%u). Discarding.\n",
+			       tunnel->name, tunnel->tunnel_id, session->session_id);
+			session->stats.rx_cookie_discards++;
+			goto discard;
+		}
+		ptr += session->peer_cookie_len;
+	}
+
+	/* Handle the optional sequence numbers. Sequence numbers are
+	 * in different places for L2TPv2 and L2TPv3.
+	 *
+	 * If we are the LAC, enable/disable sequence numbers under
+	 * the control of the LNS.  If no sequence numbers present but
+	 * we were expecting them, discard frame.
+	 */
+	ns = nr = 0;
+	L2TP_SKB_CB(skb)->has_seq = 0;
+	if (tunnel->version == L2TP_HDR_VER_2) {
+		if (hdrflags & L2TP_HDRFLAG_S) {
+			ns = ntohs(*(__be16 *) ptr);
+			ptr += 2;
+			nr = ntohs(*(__be16 *) ptr);
+			ptr += 2;
+
+			/* Store L2TP info in the skb */
+			L2TP_SKB_CB(skb)->ns = ns;
+			L2TP_SKB_CB(skb)->has_seq = 1;
+
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+			       "%s: recv data ns=%u, nr=%u, session nr=%u\n",
+			       session->name, ns, nr, session->nr);
+		}
+	} else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
+		u32 l2h = ntohl(*(__be32 *) ptr);
+
+		/* S bit set: the low 24 bits carry the sequence number */
+		if (l2h & L2TP_SLFLAG_S) {
+			ns = l2h & L2TP_SL_SEQ_MASK;
+
+			/* Store L2TP info in the skb */
+			L2TP_SKB_CB(skb)->ns = ns;
+			L2TP_SKB_CB(skb)->has_seq = 1;
+
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+			       "%s: recv data ns=%u, session nr=%u\n",
+			       session->name, ns, session->nr);
+		}
+	}
+
+	/* Advance past L2-specific header, if present */
+	ptr += session->l2specific_len;
+
+	if (L2TP_SKB_CB(skb)->has_seq) {
+		/* Received a packet with sequence numbers. If we're the LAC,
+		 * check if we are sending sequence numbers and if not,
+		 * configure it so.
+		 */
+		if ((!session->lns_mode) && (!session->send_seq)) {
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_INFO,
+			       "%s: requested to enable seq numbers by LNS\n",
+			       session->name);
+			session->send_seq = -1;
+			l2tp_session_set_header_len(session, tunnel->version);
+		}
+	} else {
+		/* No sequence numbers.
+		 * If user has configured mandatory sequence numbers, discard.
+		 */
+		if (session->recv_seq) {
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_WARNING,
+			       "%s: recv data has no seq numbers when required. "
+			       "Discarding\n", session->name);
+			session->stats.rx_seq_discards++;
+			goto discard;
+		}
+
+		/* If we're the LAC and we're sending sequence numbers, the
+		 * LNS has requested that we no longer send sequence numbers.
+		 * If we're the LNS and we're sending sequence numbers, the
+		 * LAC is broken. Discard the frame.
+		 */
+		if ((!session->lns_mode) && (session->send_seq)) {
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_INFO,
+			       "%s: requested to disable seq numbers by LNS\n",
+			       session->name);
+			session->send_seq = 0;
+			l2tp_session_set_header_len(session, tunnel->version);
+		} else if (session->send_seq) {
+			PRINTK(session->debug, L2TP_MSG_SEQ, KERN_WARNING,
+			       "%s: recv data has no seq numbers when required. "
+			       "Discarding\n", session->name);
+			session->stats.rx_seq_discards++;
+			goto discard;
+		}
+	}
+
+	/* Session data offset is handled differently for L2TPv2 and
+	 * L2TPv3. For L2TPv2, there is an optional 16-bit value in
+	 * the header. For L2TPv3, the offset is negotiated using AVPs
+	 * in the session setup control protocol.
+	 */
+	if (tunnel->version == L2TP_HDR_VER_2) {
+		/* If offset bit set, skip it. */
+		if (hdrflags & L2TP_HDRFLAG_O) {
+			offset = ntohs(*(__be16 *)ptr);
+			ptr += 2 + offset;
+		}
+	} else
+		ptr += session->offset;
+
+	/* Strip everything from the start of the L2TP header up to the
+	 * payload; a packet too short for its own header is discarded.
+	 */
+	offset = ptr - optr;
+	if (!pskb_may_pull(skb, offset))
+		goto discard;
+
+	__skb_pull(skb, offset);
+
+	/* If caller wants to process the payload before we queue the
+	 * packet, do so now.
+	 */
+	if (payload_hook)
+		if ((*payload_hook)(skb))
+			goto discard;
+
+	/* Prepare skb for adding to the session's reorder_q.  Hold
+	 * packets for max reorder_timeout or 1 second if not
+	 * reordering.
+	 */
+	L2TP_SKB_CB(skb)->length = length;
+	L2TP_SKB_CB(skb)->expires = jiffies +
+		(session->reorder_timeout ? session->reorder_timeout : HZ);
+
+	/* Add packet to the session's receive queue. Reordering is done here, if
+	 * enabled. Saved L2TP protocol info is stored in skb->cb[].
+	 */
+	if (L2TP_SKB_CB(skb)->has_seq) {
+		if (session->reorder_timeout != 0) {
+			/* Packet reordering enabled. Add skb to session's
+			 * reorder queue, in order of ns.
+			 */
+			l2tp_recv_queue_skb(session, skb);
+		} else {
+			/* Packet reordering disabled. Discard out-of-sequence
+			 * packets
+			 */
+			if (L2TP_SKB_CB(skb)->ns != session->nr) {
+				session->stats.rx_seq_discards++;
+				PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+				       "%s: oos pkt %u len %d discarded, "
+				       "waiting for %u, reorder_q_len=%d\n",
+				       session->name, L2TP_SKB_CB(skb)->ns,
+				       L2TP_SKB_CB(skb)->length, session->nr,
+				       skb_queue_len(&session->reorder_q));
+				goto discard;
+			}
+			skb_queue_tail(&session->reorder_q, skb);
+		}
+	} else {
+		/* No sequence numbers. Add the skb to the tail of the
+		 * reorder queue. This ensures that it will be
+		 * delivered after all previous sequenced skbs.
+		 */
+		skb_queue_tail(&session->reorder_q, skb);
+	}
+
+	/* Try to dequeue as many skbs from reorder_q as we can. */
+	l2tp_recv_dequeue(session);
+
+	l2tp_session_dec_refcount(session);
+
+	return;
+
+discard:
+	session->stats.rx_errors++;
+	kfree_skb(skb);
+
+	if (session->deref)
+		(*session->deref)(session);
+
+	l2tp_session_dec_refcount(session);
+}
+EXPORT_SYMBOL(l2tp_recv_common);
+
+/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
+ * here. The skb is not on a list when we get here.
+ * Returns 0 if the packet was a data packet and was successfully passed on.
+ * A packet with a bad UDP checksum is freed here and 0 is returned too.
+ * Returns 1 if the packet was not a good data packet and could not be
+ * forwarded.  All such packets are passed up to userspace to deal with;
+ * the UDP header is pushed back onto the skb before returning 1.
+ *
+ * @payload_hook is handed on unchanged to l2tp_recv_common().
+ */
+static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
+			      int (*payload_hook)(struct sk_buff *skb))
+{
+	struct l2tp_session *session = NULL;
+	unsigned char *ptr, *optr;
+	u16 hdrflags;
+	u32 tunnel_id, session_id;
+	int offset;
+	u16 version;
+	int length;
+
+	if (tunnel->sock && l2tp_verify_udp_checksum(tunnel->sock, skb))
+		goto discard_bad_csum;
+
+	/* UDP always verifies the packet length. */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Short packet? */
+	if (!pskb_may_pull(skb, L2TP_HDR_SIZE_SEQ)) {
+		PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO,
+		       "%s: recv short packet (len=%d)\n", tunnel->name, skb->len);
+		goto error;
+	}
+
+	/* Trace packet contents, if enabled */
+	if (tunnel->debug & L2TP_MSG_DATA) {
+		length = min(32u, skb->len);
+		if (!pskb_may_pull(skb, length))
+			goto error;
+
+		printk(KERN_DEBUG "%s: recv: ", tunnel->name);
+
+		offset = 0;
+		do {
+			printk(" %02X", skb->data[offset]);
+		} while (++offset < length);
+
+		printk("\n");
+	}
+
+	/* Point to L2TP header. This must come after the last
+	 * pskb_may_pull() above: a pull may reallocate the skb head, which
+	 * would leave a pointer taken from skb->data earlier dangling.
+	 */
+	optr = ptr = skb->data;
+
+	/* Get L2TP header flags */
+	hdrflags = ntohs(*(__be16 *) ptr);
+
+	/* Check protocol version */
+	version = hdrflags & L2TP_HDR_VER_MASK;
+	if (version != tunnel->version) {
+		PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO,
+		       "%s: recv protocol version mismatch: got %d expected %d\n",
+		       tunnel->name, version, tunnel->version);
+		goto error;
+	}
+
+	/* Get length of L2TP packet */
+	length = skb->len;
+
+	/* If type is control packet, it is handled by userspace. */
+	if (hdrflags & L2TP_HDRFLAG_T) {
+		PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_DEBUG,
+		       "%s: recv control packet, len=%d\n", tunnel->name, length);
+		goto error;
+	}
+
+	/* Skip flags */
+	ptr += 2;
+
+	if (tunnel->version == L2TP_HDR_VER_2) {
+		/* If length is present, skip it */
+		if (hdrflags & L2TP_HDRFLAG_L)
+			ptr += 2;
+
+		/* Extract tunnel and session ID */
+		tunnel_id = ntohs(*(__be16 *) ptr);
+		ptr += 2;
+		session_id = ntohs(*(__be16 *) ptr);
+		ptr += 2;
+	} else {
+		ptr += 2;	/* skip reserved bits */
+		tunnel_id = tunnel->tunnel_id;
+		session_id = ntohl(*(__be32 *) ptr);
+		ptr += 4;
+	}
+
+	/* Find the session context */
+	session = l2tp_session_find(tunnel->l2tp_net, tunnel, session_id);
+	if (!session || !session->recv_skb) {
+		/* Not found? Pass to userspace to deal with */
+		PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_INFO,
+		       "%s: no session found (%u/%u). Passing up.\n",
+		       tunnel->name, tunnel_id, session_id);
+		goto error;
+	}
+
+	/* From here on the skb belongs to l2tp_recv_common() */
+	l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
+
+	return 0;
+
+discard_bad_csum:
+	LIMIT_NETDEBUG("%s: UDP: bad checksum\n", tunnel->name);
+	UDP_INC_STATS_USER(tunnel->l2tp_net, UDP_MIB_INERRORS, 0);
+	tunnel->stats.rx_errors++;
+	kfree_skb(skb);
+
+	return 0;
+
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
+	return 1;
+}
+
+/* UDP encapsulation receive handler. See net/ipv4/udp.c.
+ * Return codes:
+ * 0 : success.
+ * <0: error
+ * >0: skb should be passed up to userspace as UDP.
+ *
+ * This function itself only ever returns 0 or 1.
+ */
+int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+	int pass_up;
+
+	/* No tunnel on this socket: leave the skb to the normal UDP path. */
+	if (!tunnel)
+		return 1;
+
+	PRINTK(tunnel->debug, L2TP_MSG_DATA, KERN_DEBUG,
+	       "%s: received %d bytes\n", tunnel->name, skb->len);
+
+	pass_up = l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook) ? 1 : 0;
+
+	/* sock_put() pairs with the successful tunnel lookup above; it is
+	 * not called on the path where the lookup returned NULL.
+	 */
+	sock_put(sk);
+
+	return pass_up;
+}
+EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv);
+
+/************************************************************************
+ * Transmit handling
+ ***********************************************************************/
+
+/* Build an L2TPv2 data header for the session into the buffer provided.
+ *
+ * Writes the flag word, the peer's tunnel id and the peer's session id
+ * and, when the session sends sequence numbers, Ns followed by a zero Nr;
+ * session->ns is then advanced modulo 2^16.
+ *
+ * Returns the number of bytes written (6, or 10 with sequence numbers),
+ * the same unit as l2tp_build_l2tpv3_header() returns. The previous code
+ * returned a difference of __be16 pointers, i.e. a count of 16-bit words.
+ */
+static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf)
+{
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	__be16 *bufp = buf;
+	u16 flags = L2TP_HDR_VER_2;
+	u32 tunnel_id = tunnel->peer_tunnel_id;
+	u32 session_id = session->peer_session_id;
+
+	if (session->send_seq)
+		flags |= L2TP_HDRFLAG_S;
+
+	/* Setup L2TP header. */
+	*bufp++ = htons(flags);
+	*bufp++ = htons(tunnel_id);
+	*bufp++ = htons(session_id);
+	if (session->send_seq) {
+		*bufp++ = htons(session->ns);
+		*bufp++ = 0;
+		session->ns++;
+		session->ns &= 0xffff;
+		PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+		       "%s: updated ns to %u\n", session->name, session->ns);
+	}
+
+	/* Byte distance, not a count of __be16 elements */
+	return (char *) bufp - (char *) buf;
+}
+
+/* Build an L2TPv3 data header for the session into the buffer provided.
+ *
+ * Layout: for UDP encapsulation, 4 bytes of flags/version come first.
+ * Then the peer session id, the optional cookie, the optional L2-specific
+ * sublayer and any configured offset padding. With the default sublayer
+ * type and send_seq enabled, session->ns is advanced modulo 2^24.
+ *
+ * Returns the number of header bytes produced.
+ */
+static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
+{
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	char *bufp = buf;
+	char *optr = bufp;
+
+	/* Setup L2TP header. The header differs slightly for UDP and
+	 * IP encapsulations. For UDP, there is 4 bytes of flags.
+	 */
+	if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
+		u16 flags = L2TP_HDR_VER_3;
+		*((__be16 *) bufp) = htons(flags);
+		bufp += 2;
+		*((__be16 *) bufp) = 0;
+		bufp += 2;
+	}
+
+	*((__be32 *) bufp) = htonl(session->peer_session_id);
+	bufp += 4;
+	if (session->cookie_len) {
+		memcpy(bufp, &session->cookie[0], session->cookie_len);
+		bufp += session->cookie_len;
+	}
+	if (session->l2specific_len) {
+		/* The buffer is freshly pushed skb headroom. Clear the
+		 * whole sublayer first so that any part not written below
+		 * does not carry stale memory onto the wire.
+		 */
+		memset(bufp, 0, session->l2specific_len);
+		if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
+			u32 l2h = 0;
+			if (session->send_seq) {
+				l2h = 0x40000000 | session->ns;
+				session->ns++;
+				session->ns &= 0xffffff;
+				PRINTK(session->debug, L2TP_MSG_SEQ, KERN_DEBUG,
+				       "%s: updated ns to %u\n", session->name, session->ns);
+			}
+
+			*((__be32 *) bufp) = htonl(l2h);
+		}
+		bufp += session->l2specific_len;
+	}
+	if (session->offset) {
+		/* Offset padding carries no data; zero it rather than
+		 * transmit whatever was previously in the headroom.
+		 */
+		memset(bufp, 0, session->offset);
+		bufp += session->offset;
+	}
+
+	return bufp - optr;
+}
+
+/* Queue a fully built L2TP packet to IP for output and update the tunnel
+ * and session transmit counters.
+ *
+ * data_len is used only in the debug message; the byte counters use the
+ * skb length sampled on entry. Always returns 0: a negative result from
+ * ip_queue_xmit() is recorded in tx_errors and not propagated.
+ */
+static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
+			  struct flowi *fl, size_t data_len)
+{
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	/* Sampled up front; skb is not dereferenced after ip_queue_xmit() */
+	unsigned int len = skb->len;
+	int error;
+
+	/* Debug. The header builder has already advanced ns, so the
+	 * sequence number carried by this packet is ns - 1.
+	 */
+	if (session->send_seq)
+		PRINTK(session->debug, L2TP_MSG_DATA, KERN_DEBUG,
+		       "%s: send %Zd bytes, ns=%u\n", session->name,
+		       data_len, session->ns - 1);
+	else
+		PRINTK(session->debug, L2TP_MSG_DATA, KERN_DEBUG,
+		       "%s: send %Zd bytes\n", session->name, data_len);
+
+	/* Hex dump of at most the first 32 bytes that follow the UDP
+	 * header (if there is one).
+	 */
+	if (session->debug & L2TP_MSG_DATA) {
+		int i;
+		int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+		unsigned char *datap = skb->data + uhlen;
+
+		printk(KERN_DEBUG "%s: xmit:", session->name);
+		for (i = 0; i < (len - uhlen); i++) {
+			printk(" %02X", *datap++);
+			if (i == 31) {
+				printk(" ...");
+				break;
+			}
+		}
+		printk("\n");
+	}
+
+	/* Queue the packet to IP for output */
+	skb->local_df = 1;
+	error = ip_queue_xmit(skb, fl);
+
+	/* Update stats */
+	if (error >= 0) {
+		tunnel->stats.tx_packets++;
+		tunnel->stats.tx_bytes += len;
+		session->stats.tx_packets++;
+		session->stats.tx_bytes += len;
+	} else {
+		tunnel->stats.tx_errors++;
+		session->stats.tx_errors++;
+	}
+
+	return 0;
+}
+
+/* skb destructor installed by l2tp_skb_set_owner_w(). Automatically
+ * called when the skb is freed; drops the socket reference that was
+ * taken when the skb was associated with the tunnel socket.
+ */
+static void l2tp_sock_wfree(struct sk_buff *skb)
+{
+	sock_put(skb->sk);
+}
+
+/* For data skbs that we transmit, we associate with the tunnel socket
+ * but don't do accounting. A socket reference is held for as long as
+ * the skb exists; l2tp_sock_wfree() releases it.
+ */
+static inline void l2tp_skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+	sock_hold(sk);
+	skb->sk = sk;
+	skb->destructor = l2tp_sock_wfree;
+}
+
+/* Encapsulate a payload skb for the session and transmit it through the
+ * tunnel socket.
+ *
+ * If caller requires the skb to have a ppp header, the header must be
+ * inserted in the skb data before calling this function.
+ *
+ * hdr_len is the number of bytes pushed for the L2TP header, which is
+ * filled in by session->build_header().
+ *
+ * Always returns 0. The skb is freed here when headroom cannot be made
+ * or when the tunnel socket is currently owned by a user context.
+ */
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len)
+{
+	int data_len = skb->len;
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	struct sock *sk = tunnel->sock;
+	struct flowi *fl;
+	struct udphdr *uh;
+	struct inet_sock *inet;
+	__wsum csum;
+	int old_headroom;
+	int new_headroom;
+	int headroom;
+	int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+	int udp_len;
+
+	/* Check that there's enough headroom in the skb to insert IP,
+	 * UDP and L2TP headers. If not enough, expand it to
+	 * make room. Adjust truesize.
+	 */
+	headroom = NET_SKB_PAD + sizeof(struct iphdr) +
+		uhlen + hdr_len;
+	old_headroom = skb_headroom(skb);
+	if (skb_cow_head(skb, headroom)) {
+		dev_kfree_skb(skb);
+		goto abort;
+	}
+
+	new_headroom = skb_headroom(skb);
+	skb_orphan(skb);
+	skb->truesize += new_headroom - old_headroom;
+
+	/* Setup L2TP header */
+	session->build_header(session, __skb_push(skb, hdr_len));
+
+	/* Reset skb netfilter state */
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	nf_reset(skb);
+
+	/* If a user context currently owns the socket, drop the packet
+	 * instead of transmitting it.
+	 */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		dev_kfree_skb(skb);
+		goto out_unlock;
+	}
+
+	/* Get routing info from the tunnel socket */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst_clone(__sk_dst_get(sk)));
+
+	inet = inet_sk(sk);
+	fl = &inet->cork.fl;
+	switch (tunnel->encap) {
+	case L2TP_ENCAPTYPE_UDP:
+		/* Setup UDP header */
+		__skb_push(skb, sizeof(*uh));
+		skb_reset_transport_header(skb);
+		uh = udp_hdr(skb);
+		uh->source = inet->inet_sport;
+		uh->dest = inet->inet_dport;
+		udp_len = uhlen + hdr_len + data_len;
+		uh->len = htons(udp_len);
+		uh->check = 0;
+
+		/* Calculate UDP checksum if configured to do so */
+		if (sk->sk_no_check == UDP_CSUM_NOXMIT)
+			skb->ip_summed = CHECKSUM_NONE;
+		else if ((skb_dst(skb) && skb_dst(skb)->dev) &&
+			 (!(skb_dst(skb)->dev->features & NETIF_F_V4_CSUM))) {
+			/* Output device lacks NETIF_F_V4_CSUM: compute the
+			 * complete checksum here.
+			 */
+			skb->ip_summed = CHECKSUM_COMPLETE;
+			csum = skb_checksum(skb, 0, udp_len, 0);
+			uh->check = csum_tcpudp_magic(inet->inet_saddr,
+						      inet->inet_daddr,
+						      udp_len, IPPROTO_UDP, csum);
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
+		} else {
+			/* Seed the check field with the inverted
+			 * pseudo-header sum and record where the checksum
+			 * starts and where the field lives.
+			 */
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_transport_header(skb) - skb->head;
+			skb->csum_offset = offsetof(struct udphdr, check);
+			uh->check = ~csum_tcpudp_magic(inet->inet_saddr,
+						       inet->inet_daddr,
+						       udp_len, IPPROTO_UDP, 0);
+		}
+		break;
+
+	case L2TP_ENCAPTYPE_IP:
+		/* No transport header to build for IP encapsulation */
+		break;
+	}
+
+	/* Tie the skb to the tunnel socket; holds a socket reference
+	 * until the skb is freed.
+	 */
+	l2tp_skb_set_owner_w(skb, sk);
+
+	l2tp_xmit_core(session, skb, fl, data_len);
+out_unlock:
+	bh_unlock_sock(sk);
+
+abort:
+	return 0;
+}
+EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
+
+/*****************************************************************************
+ * Tunnel and session create/destroy.
+ *****************************************************************************/
+
+/* Tunnel socket destruct hook, installed as sk->sk_destruct by
+ * l2tp_tunnel_create().
+ * The tunnel context is deleted only when all session sockets have been
+ * closed.
+ *
+ * Closes any remaining sessions, undoes the hooks placed on the socket,
+ * chains to the socket's original destructor and finally drops the
+ * reference the socket held on the tunnel. Does nothing if the socket
+ * has no tunnel attached.
+ */
+static void l2tp_tunnel_destruct(struct sock *sk)
+{
+	struct l2tp_tunnel *tunnel;
+
+	tunnel = sk->sk_user_data;
+	if (tunnel == NULL)
+		goto end;
+
+	PRINTK(tunnel->debug, L2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: closing...\n", tunnel->name);
+
+	/* Close all sessions */
+	l2tp_tunnel_closeall(tunnel);
+
+	switch (tunnel->encap) {
+	case L2TP_ENCAPTYPE_UDP:
+		/* No longer an encapsulation socket. See net/ipv4/udp.c */
+		(udp_sk(sk))->encap_type = 0;
+		(udp_sk(sk))->encap_rcv = NULL;
+		break;
+	case L2TP_ENCAPTYPE_IP:
+		break;
+	}
+
+	/* Remove hooks into tunnel socket */
+	tunnel->sock = NULL;
+	sk->sk_destruct = tunnel->old_sk_destruct;
+	sk->sk_user_data = NULL;
+
+	/* Call the original destructor */
+	if (sk->sk_destruct)
+		(*sk->sk_destruct)(sk);
+
+	/* We're finished with the socket */
+	l2tp_tunnel_dec_refcount(tunnel);
+
+end:
+	return;
+}
+
+/* When the tunnel is closed, all the attached sessions need to go too.
+ *
+ * Walks every bucket of the tunnel's session hash. Each session found is
+ * unhashed and then closed with hlist_lock released; because the lock is
+ * dropped, the walk of that bucket restarts from its head afterwards.
+ */
+static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
+{
+	int hash;
+	struct hlist_node *walk;
+	struct hlist_node *tmp;
+	struct l2tp_session *session;
+
+	BUG_ON(tunnel == NULL);
+
+	PRINTK(tunnel->debug, L2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: closing all sessions...\n", tunnel->name);
+
+	write_lock_bh(&tunnel->hlist_lock);
+	for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
+again:
+		hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
+			session = hlist_entry(walk, struct l2tp_session, hlist);
+
+			PRINTK(session->debug, L2TP_MSG_CONTROL, KERN_INFO,
+			       "%s: closing session\n", session->name);
+
+			hlist_del_init(&session->hlist);
+
+			/* Since we should hold the sock lock while
+			 * doing any unbinding, we need to release the
+			 * lock we're holding before taking that lock.
+			 * Hold a reference to the sock so it doesn't
+			 * disappear as we're jumping between locks.
+			 */
+			if (session->ref != NULL)
+				(*session->ref)(session);
+
+			write_unlock_bh(&tunnel->hlist_lock);
+
+			/* Sessions of non-L2TPv2 tunnels are also on the
+			 * per-net hash; unlink and wait for RCU readers.
+			 */
+			if (tunnel->version != L2TP_HDR_VER_2) {
+				struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+
+				spin_lock_bh(&pn->l2tp_session_hlist_lock);
+				hlist_del_init_rcu(&session->global_hlist);
+				spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+				synchronize_rcu();
+			}
+
+			if (session->session_close != NULL)
+				(*session->session_close)(session);
+
+			/* Balance the ref() call made above */
+			if (session->deref != NULL)
+				(*session->deref)(session);
+
+			write_lock_bh(&tunnel->hlist_lock);
+
+			/* Now restart from the beginning of this hash
+			 * chain.  We always remove a session from the
+			 * list so we are guaranteed to make forward
+			 * progress.
+			 */
+			goto again;
+		}
+	}
+	write_unlock_bh(&tunnel->hlist_lock);
+}
+
+/* Really kill the tunnel.
+ * Come here only when all sessions have been cleared from the tunnel.
+ *
+ * The caller must have dropped the last reference and the tunnel must
+ * already be detached from its socket; both are enforced with BUG_ON.
+ * The tunnel is unlinked from the per-net list and freed after an RCU
+ * grace period has elapsed.
+ */
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
+{
+	struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+
+	BUG_ON(atomic_read(&tunnel->ref_count) != 0);
+	BUG_ON(tunnel->sock != NULL);
+
+	PRINTK(tunnel->debug, L2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: free...\n", tunnel->name);
+
+	/* Remove from tunnel list */
+	spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+	list_del_rcu(&tunnel->list);
+	spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+	synchronize_rcu();
+
+	atomic_dec(&l2tp_tunnel_count);
+	kfree(tunnel);
+}
+
+/* Create a socket for the tunnel, if one isn't set up by
+ * userspace. This is used for static tunnels where there is no
+ * managing L2TP daemon.
+ *
+ * The socket is created, bound to the local address and connected to
+ * the peer address taken from cfg. For UDP encapsulation, transmit
+ * checksums are disabled unless cfg->use_udp_checksums is set.
+ *
+ * Returns 0 with the new socket in *sockp, or a negative errno. On a
+ * failure after the socket was created, it is released and *sockp is
+ * set to NULL. A NULL cfg or an unknown encapsulation type yields
+ * -EINVAL.
+ */
+static int l2tp_tunnel_sock_create(u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct socket **sockp)
+{
+	int err = -EINVAL;
+	struct sockaddr_in udp_addr;
+	struct sockaddr_l2tpip ip_addr;
+	struct socket *sock = NULL;
+
+	/* l2tp_tunnel_create() accepts a NULL cfg, but every address used
+	 * below comes from it, so a socket cannot be built without one.
+	 */
+	if (cfg == NULL)
+		goto out;
+
+	switch (cfg->encap) {
+	case L2TP_ENCAPTYPE_UDP:
+		err = sock_create(AF_INET, SOCK_DGRAM, 0, sockp);
+		if (err < 0)
+			goto out;
+
+		sock = *sockp;
+
+		memset(&udp_addr, 0, sizeof(udp_addr));
+		udp_addr.sin_family = AF_INET;
+		udp_addr.sin_addr = cfg->local_ip;
+		udp_addr.sin_port = htons(cfg->local_udp_port);
+		err = kernel_bind(sock, (struct sockaddr *) &udp_addr, sizeof(udp_addr));
+		if (err < 0)
+			goto out;
+
+		udp_addr.sin_family = AF_INET;
+		udp_addr.sin_addr = cfg->peer_ip;
+		udp_addr.sin_port = htons(cfg->peer_udp_port);
+		err = kernel_connect(sock, (struct sockaddr *) &udp_addr, sizeof(udp_addr), 0);
+		if (err < 0)
+			goto out;
+
+		if (!cfg->use_udp_checksums)
+			sock->sk->sk_no_check = UDP_CSUM_NOXMIT;
+
+		break;
+
+	case L2TP_ENCAPTYPE_IP:
+		err = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_L2TP, sockp);
+		if (err < 0)
+			goto out;
+
+		sock = *sockp;
+
+		memset(&ip_addr, 0, sizeof(ip_addr));
+		ip_addr.l2tp_family = AF_INET;
+		ip_addr.l2tp_addr = cfg->local_ip;
+		ip_addr.l2tp_conn_id = tunnel_id;
+		err = kernel_bind(sock, (struct sockaddr *) &ip_addr, sizeof(ip_addr));
+		if (err < 0)
+			goto out;
+
+		ip_addr.l2tp_family = AF_INET;
+		ip_addr.l2tp_addr = cfg->peer_ip;
+		ip_addr.l2tp_conn_id = peer_tunnel_id;
+		err = kernel_connect(sock, (struct sockaddr *) &ip_addr, sizeof(ip_addr), 0);
+		if (err < 0)
+			goto out;
+
+		break;
+
+	default:
+		goto out;
+	}
+
+out:
+	/* Only release a socket that was actually created here */
+	if ((err < 0) && sock) {
+		sock_release(sock);
+		*sockp = NULL;
+	}
+
+	return err;
+}
+
+/* Create a tunnel context and attach it to a tunnel socket.
+ *
+ * @net:            network namespace the tunnel belongs to.
+ * @fd:             userspace socket fd, or negative to have the kernel
+ *                  create the socket from cfg.
+ * @version:        L2TP protocol version stored in the tunnel.
+ * @tunnel_id:      local tunnel id.
+ * @peer_tunnel_id: peer's tunnel id.
+ * @cfg:            optional settings; when NULL, UDP encapsulation and
+ *                  the default debug flags are used.
+ * @tunnelp:        if non-NULL, receives the tunnel pointer.
+ *
+ * Returns 0 on success or a negative errno. On failure *tunnelp is NULL,
+ * except for -EBUSY where it is the tunnel already attached to the
+ * socket (unchanged historical behaviour; callers should rely on the
+ * return value).
+ */
+int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp)
+{
+	struct l2tp_tunnel *tunnel = NULL;
+	int err;
+	struct socket *sock = NULL;
+	struct sock *sk = NULL;
+	struct l2tp_net *pn;
+	enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP;
+
+	/* Get the tunnel socket from the fd, which was opened by
+	 * the userspace L2TP daemon. If not specified, create a
+	 * kernel socket.
+	 */
+	if (fd < 0) {
+		err = l2tp_tunnel_sock_create(tunnel_id, peer_tunnel_id, cfg, &sock);
+		if (err < 0)
+			goto err;
+	} else {
+		err = -EBADF;
+		sock = sockfd_lookup(fd, &err);
+		if (!sock) {
+			/* tunnel_id is 32 bits wide: print it in full */
+			printk(KERN_ERR "tunl %u: sockfd_lookup(fd=%d) returned %d\n",
+			       tunnel_id, fd, err);
+			goto err;
+		}
+	}
+
+	sk = sock->sk;
+
+	if (cfg != NULL)
+		encap = cfg->encap;
+
+	/* Quick sanity checks */
+	switch (encap) {
+	case L2TP_ENCAPTYPE_UDP:
+		err = -EPROTONOSUPPORT;
+		if (sk->sk_protocol != IPPROTO_UDP) {
+			printk(KERN_ERR "tunl %u: fd %d wrong protocol, got %d, expected %d\n",
+			       tunnel_id, fd, sk->sk_protocol, IPPROTO_UDP);
+			goto err;
+		}
+		break;
+	case L2TP_ENCAPTYPE_IP:
+		err = -EPROTONOSUPPORT;
+		if (sk->sk_protocol != IPPROTO_L2TP) {
+			printk(KERN_ERR "tunl %u: fd %d wrong protocol, got %d, expected %d\n",
+			       tunnel_id, fd, sk->sk_protocol, IPPROTO_L2TP);
+			goto err;
+		}
+		break;
+	}
+
+	/* Check if this socket has already been prepped */
+	tunnel = (struct l2tp_tunnel *)sk->sk_user_data;
+	if (tunnel != NULL) {
+		/* This socket has already been prepped */
+		err = -EBUSY;
+		goto err;
+	}
+
+	tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL);
+	if (tunnel == NULL) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	tunnel->version = version;
+	tunnel->tunnel_id = tunnel_id;
+	tunnel->peer_tunnel_id = peer_tunnel_id;
+	tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS;
+
+	tunnel->magic = L2TP_TUNNEL_MAGIC;
+	sprintf(&tunnel->name[0], "tunl %u", tunnel_id);
+	rwlock_init(&tunnel->hlist_lock);
+
+	/* The net we belong to */
+	tunnel->l2tp_net = net;
+	pn = l2tp_pernet(net);
+
+	if (cfg != NULL)
+		tunnel->debug = cfg->debug;
+
+	/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
+	tunnel->encap = encap;
+	if (encap == L2TP_ENCAPTYPE_UDP) {
+		/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
+		udp_sk(sk)->encap_type = UDP_ENCAP_L2TPINUDP;
+		udp_sk(sk)->encap_rcv = l2tp_udp_encap_recv;
+	}
+
+	sk->sk_user_data = tunnel;
+
+	/* Hook on the tunnel socket destructor so that we can cleanup
+	 * if the tunnel socket goes away.
+	 */
+	tunnel->old_sk_destruct = sk->sk_destruct;
+	sk->sk_destruct = &l2tp_tunnel_destruct;
+	tunnel->sock = sk;
+	sk->sk_allocation = GFP_ATOMIC;
+
+	/* Add tunnel to our list */
+	INIT_LIST_HEAD(&tunnel->list);
+	atomic_inc(&l2tp_tunnel_count);
+
+	/* Bump the reference count. The tunnel context is deleted
+	 * only when this drops to zero. Must be done before list insertion
+	 */
+	l2tp_tunnel_inc_refcount(tunnel);
+	spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+	list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list);
+	spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+
+	err = 0;
+err:
+	if (tunnelp)
+		*tunnelp = tunnel;
+
+	/* If tunnel's socket was created by the kernel, it doesn't
+	 * have a file. A userspace socket only needs the lookup
+	 * reference dropped. A kernel-created socket has no other owner,
+	 * so it must be released here when setup failed after it was
+	 * created; otherwise it would be leaked.
+	 */
+	if (sock && sock->file)
+		sockfd_put(sock);
+	else if (sock && err < 0)
+		sock_release(sock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
+
+/* Handler behind the netlink TUNNEL_DELETE command.
+ *
+ * Shuts the tunnel socket down in both directions. The tunnel itself is
+ * torn down later through the normal socket close path. A socket that
+ * has no file was created by the kernel rather than by userspace, so it
+ * is also released here.
+ *
+ * Returns 0 if the tunnel has no socket, otherwise the result of the
+ * last socket operation performed.
+ */
+int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
+{
+	struct socket *sock = NULL;
+	int err;
+
+	if (tunnel->sock)
+		sock = tunnel->sock->sk_socket;
+	if (!sock)
+		return 0;
+
+	/* Force the tunnel socket to close */
+	err = inet_shutdown(sock, 2);
+
+	/* Kernel-created socket: nobody else will close it */
+	if (!sock->file)
+		err = inet_release(sock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
+
+/* Really kill the session.
+ *
+ * Called when the session reference count has reached zero (enforced
+ * with BUG_ON). If the session is still attached to a tunnel it is
+ * unlinked from the tunnel hash, and from the per-net hash for
+ * non-L2TPv2 tunnels. The socket and tunnel references taken in
+ * l2tp_session_create() are then dropped before the memory is freed.
+ */
+void l2tp_session_free(struct l2tp_session *session)
+{
+	struct l2tp_tunnel *tunnel;
+
+	BUG_ON(atomic_read(&session->ref_count) != 0);
+
+	tunnel = session->tunnel;
+	if (tunnel != NULL) {
+		BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+
+		/* Delete the session from the hash */
+		write_lock_bh(&tunnel->hlist_lock);
+		hlist_del_init(&session->hlist);
+		write_unlock_bh(&tunnel->hlist_lock);
+
+		/* Unlink from the global hash if not L2TPv2 */
+		if (tunnel->version != L2TP_HDR_VER_2) {
+			struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+
+			spin_lock_bh(&pn->l2tp_session_hlist_lock);
+			hlist_del_init_rcu(&session->global_hlist);
+			spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+			synchronize_rcu();
+		}
+
+		/* The management session (id 0) is not counted */
+		if (session->session_id != 0)
+			atomic_dec(&l2tp_session_count);
+
+		/* Balances the sock_hold() in l2tp_session_create() */
+		sock_put(tunnel->sock);
+
+		/* This will delete the tunnel context if this
+		 * is the last session on the tunnel.
+		 */
+		session->tunnel = NULL;
+		l2tp_tunnel_dec_refcount(tunnel);
+	}
+
+	kfree(session);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_free);
+
+/* Delete a session on behalf of the netlink SESSION_DELETE command or
+ * of a pseudowire module.
+ *
+ * Runs the session's close callback, if one is set, and then drops one
+ * session reference. Always returns 0.
+ */
+int l2tp_session_delete(struct l2tp_session *session)
+{
+	void (*close_fn)(struct l2tp_session *) = session->session_close;
+
+	if (close_fn)
+		close_fn(session);
+
+	l2tp_session_dec_refcount(session);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_delete);
+
+
+/* Recompute session->hdr_len. To be called whenever the session's
+ * send_seq, cookie_len or l2specific_len parameters are set.
+ *
+ * L2TPv2: 6 bytes, plus 4 when sequence numbers are sent.
+ * Otherwise: 4 bytes of session id plus cookie, L2-specific sublayer and
+ * offset, plus 4 more bytes when the tunnel uses UDP encapsulation.
+ */
+static void l2tp_session_set_header_len(struct l2tp_session *session, int version)
+{
+	int len;
+
+	if (version == L2TP_HDR_VER_2) {
+		len = session->send_seq ? 10 : 6;
+	} else {
+		len = 4 + session->cookie_len + session->l2specific_len + session->offset;
+		if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP)
+			len += 4;
+	}
+
+	session->hdr_len = len;
+}
+
+/* Allocate and register a new session on the given tunnel.
+ *
+ * priv_size extra bytes are allocated after the session structure for
+ * the caller's private data. Settings are taken from cfg when it is
+ * non-NULL; otherwise the zeroed defaults apply and the debug mask is
+ * inherited from the tunnel.
+ *
+ * On success the session holds a reference on itself, on the tunnel and
+ * on the tunnel socket. It has been added to the tunnel's session hash,
+ * and to the per-net hash unless the tunnel is L2TPv2.
+ *
+ * Returns the new session, or NULL if the allocation failed.
+ */
+struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+{
+	struct l2tp_session *session;
+
+	session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
+	if (session == NULL)
+		return NULL;
+
+	/* Identity */
+	session->magic = L2TP_SESSION_MAGIC;
+	session->tunnel = tunnel;
+	session->session_id = session_id;
+	session->peer_session_id = peer_session_id;
+	session->nr = 1;
+
+	sprintf(session->name, "sess %u/%u",
+		tunnel->tunnel_id, session->session_id);
+
+	skb_queue_head_init(&session->reorder_q);
+
+	INIT_HLIST_NODE(&session->hlist);
+	INIT_HLIST_NODE(&session->global_hlist);
+
+	/* Debug options default to the tunnel's; cfg may override */
+	session->debug = tunnel->debug;
+
+	if (cfg != NULL) {
+		session->pwtype = cfg->pw_type;
+		session->debug = cfg->debug;
+		session->mtu = cfg->mtu;
+		session->mru = cfg->mru;
+		session->send_seq = cfg->send_seq;
+		session->recv_seq = cfg->recv_seq;
+		session->lns_mode = cfg->lns_mode;
+		session->reorder_timeout = cfg->reorder_timeout;
+		session->offset = cfg->offset;
+		session->l2specific_type = cfg->l2specific_type;
+		session->l2specific_len = cfg->l2specific_len;
+		session->cookie_len = cfg->cookie_len;
+		memcpy(session->cookie, cfg->cookie, cfg->cookie_len);
+		session->peer_cookie_len = cfg->peer_cookie_len;
+		memcpy(session->peer_cookie, cfg->peer_cookie, cfg->peer_cookie_len);
+	}
+
+	/* Pick the header builder matching the tunnel's protocol version */
+	session->build_header = (tunnel->version == L2TP_HDR_VER_2) ?
+		l2tp_build_l2tpv2_header : l2tp_build_l2tpv3_header;
+
+	l2tp_session_set_header_len(session, tunnel->version);
+
+	/* Take the initial references. The session context is deleted
+	 * only when its count drops to zero.
+	 */
+	l2tp_session_inc_refcount(session);
+	l2tp_tunnel_inc_refcount(tunnel);
+
+	/* Keep the tunnel socket alive while the session exists */
+	sock_hold(tunnel->sock);
+
+	/* Publish in the tunnel's hash list */
+	write_lock_bh(&tunnel->hlist_lock);
+	hlist_add_head(&session->hlist,
+		       l2tp_session_id_hash(tunnel, session_id));
+	write_unlock_bh(&tunnel->hlist_lock);
+
+	/* Non-L2TPv2 sessions also go on the per-net session list */
+	if (tunnel->version != L2TP_HDR_VER_2) {
+		struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+
+		spin_lock_bh(&pn->l2tp_session_hlist_lock);
+		hlist_add_head_rcu(&session->global_hlist,
+				   l2tp_session_id_hash_2(pn, session_id));
+		spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+	}
+
+	/* The management session (id 0) is left out of the count */
+	if (session->session_id != 0)
+		atomic_inc(&l2tp_session_count);
+
+	return session;
+}
+EXPORT_SYMBOL_GPL(l2tp_session_create);
+
+/*****************************************************************************
+ * Init and cleanup
+ *****************************************************************************/
+
+/* Per-namespace init: set up the empty tunnel list, the empty session
+ * hash table and the locks protecting them. Always returns 0.
+ */
+static __net_init int l2tp_init_net(struct net *net)
+{
+	struct l2tp_net *pn = net_generic(net, l2tp_net_id);
+	int i;
+
+	/* Tunnel list and its lock */
+	spin_lock_init(&pn->l2tp_tunnel_list_lock);
+	INIT_LIST_HEAD(&pn->l2tp_tunnel_list);
+
+	/* Session hash buckets and their lock */
+	spin_lock_init(&pn->l2tp_session_hlist_lock);
+	for (i = 0; i < L2TP_HASH_SIZE_2; i++)
+		INIT_HLIST_HEAD(&pn->l2tp_session_hlist[i]);
+
+	return 0;
+}
+
+/* Per-network-namespace registration: l2tp_init_net() is the init
+ * callback, and sizeof(struct l2tp_net) bytes of per-net storage are
+ * requested under l2tp_net_id (looked up with net_generic()).
+ */
+static struct pernet_operations l2tp_net_ops = {
+	.init = l2tp_init_net,
+	.id   = &l2tp_net_id,
+	.size = sizeof(struct l2tp_net),
+};
+
+/* Module init: register the per-net operations and, on success, announce
+ * the driver. Returns 0 or the registration error.
+ */
+static int __init l2tp_init(void)
+{
+	int rc = register_pernet_device(&l2tp_net_ops);
+
+	if (rc)
+		return rc;
+
+	printk(KERN_INFO "L2TP core driver, %s\n", L2TP_DRV_VERSION);
+
+	return 0;
+}
+
+/* Module exit: undo the per-net registration made in l2tp_init(). */
+static void __exit l2tp_exit(void)
+{
+	unregister_pernet_device(&l2tp_net_ops);
+}
+
+module_init(l2tp_init);
+module_exit(l2tp_exit);
+
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP core");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(L2TP_DRV_VERSION);
+
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
new file mode 100644
index 00000000..a16a48e7
--- /dev/null
+++ b/net/l2tp/l2tp_core.h
@@ -0,0 +1,271 @@
+/*
+ * L2TP internal definitions.
+ *
+ * Copyright (c) 2008,2009 Katalix Systems Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _L2TP_CORE_H_
+#define _L2TP_CORE_H_
+
+/* Just some random numbers */
+#define L2TP_TUNNEL_MAGIC	0x42114DDA
+#define L2TP_SESSION_MAGIC	0x0C04EB7D
+
+/* Per tunnel, session hash table size */
+#define L2TP_HASH_BITS	4
+#define L2TP_HASH_SIZE	(1 << L2TP_HASH_BITS)
+
+/* System-wide, session hash table size */
+#define L2TP_HASH_BITS_2	8
+#define L2TP_HASH_SIZE_2	(1 << L2TP_HASH_BITS_2)
+
+/* Debug message categories for the DEBUG socket option. Each value is a
+ * single bit; the per-tunnel and per-session debug fields hold a mask of
+ * them.
+ */
+enum {
+	L2TP_MSG_DEBUG		= (1 << 0),	/* verbose debug (if
+						 * compiled in) */
+	L2TP_MSG_CONTROL	= (1 << 1),	/* userspace - kernel
+						 * interface */
+	L2TP_MSG_SEQ		= (1 << 2),	/* sequence numbers */
+	L2TP_MSG_DATA		= (1 << 3),	/* data packets */
+};
+
+struct sk_buff;
+
+/* Traffic counters. One instance is embedded in every tunnel and in
+ * every session.
+ */
+struct l2tp_stats {
+	u64			tx_packets;
+	u64			tx_bytes;
+	u64			tx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_seq_discards;
+	u64			rx_oos_packets;
+	u64			rx_errors;
+	u64			rx_cookie_discards;
+};
+
+struct l2tp_tunnel;
+
+/* Configuration for a new session, handed to l2tp_session_create(). It
+ * carries the information needed to recognise incoming packets and to
+ * build outgoing ones.
+ */
+struct l2tp_session_cfg {
+	enum l2tp_pwtype	pw_type;
+	unsigned		data_seq:2;	/* data sequencing level
+						 * 0 => none, 1 => IP only,
+						 * 2 => all
+						 */
+	unsigned		recv_seq:1;	/* expect receive packets with
+						 * sequence numbers? */
+	unsigned		send_seq:1;	/* send packets with sequence
+						 * numbers? */
+	unsigned		lns_mode:1;	/* behave as LNS? LAC enables
+						 * sequence numbers under
+						 * control of LNS. */
+	int			debug;		/* bitmask of debug message
+						 * categories */
+	u16			vlan_id;	/* VLAN pseudowire only */
+	u16			offset;		/* offset to payload */
+	u16			l2specific_len;	/* Layer 2 specific length */
+	u16			l2specific_type; /* Layer 2 specific type */
+	u8			cookie[8];	/* optional cookie */
+	int			cookie_len;	/* 0, 4 or 8 bytes */
+	u8			peer_cookie[8];	/* peer's cookie */
+	int			peer_cookie_len; /* 0, 4 or 8 bytes */
+	int			reorder_timeout; /* configured reorder timeout
+						  * (in jiffies) */
+	int			mtu;
+	int			mru;
+	char			*ifname;
+};
+
+/* Run-time state of one session within a tunnel. Allocated by
+ * l2tp_session_create() with optional private data following the
+ * structure, and freed by l2tp_session_free() when ref_count reaches
+ * zero.
+ */
+struct l2tp_session {
+	int			magic;		/* should be
+						 * L2TP_SESSION_MAGIC */
+
+	struct l2tp_tunnel	*tunnel;	/* back pointer to tunnel
+						 * context */
+	u32			session_id;
+	u32			peer_session_id;
+	u8			cookie[8];
+	int			cookie_len;
+	u8			peer_cookie[8];
+	int			peer_cookie_len;
+	u16			offset;		/* offset from end of L2TP header
+						   to beginning of data */
+	u16			l2specific_len;
+	u16			l2specific_type;
+	u16			hdr_len;	/* transmit header length, see
+						 * l2tp_session_set_header_len() */
+	u32			nr;		/* session NR state (receive) */
+	u32			ns;		/* session NS state (send) */
+	struct sk_buff_head	reorder_q;	/* receive reorder queue */
+	struct hlist_node	hlist;		/* Hash list node */
+	atomic_t		ref_count;
+
+	char			name[32];	/* for logging */
+	char			ifname[IFNAMSIZ];
+	unsigned		data_seq:2;	/* data sequencing level
+						 * 0 => none, 1 => IP only,
+						 * 2 => all
+						 */
+	unsigned		recv_seq:1;	/* expect receive packets with
+						 * sequence numbers? */
+	unsigned		send_seq:1;	/* send packets with sequence
+						 * numbers? */
+	unsigned		lns_mode:1;	/* behave as LNS? LAC enables
+						 * sequence numbers under
+						 * control of LNS. */
+	int			debug;		/* bitmask of debug message
+						 * categories */
+	int			reorder_timeout; /* configured reorder timeout
+						  * (in jiffies) */
+	int			mtu;
+	int			mru;
+	enum l2tp_pwtype	pwtype;
+	struct l2tp_stats	stats;
+	struct hlist_node	global_hlist;	/* Global hash list node */
+
+	/* Writes the transmit header for this session into buf */
+	int (*build_header)(struct l2tp_session *session, void *buf);
+	void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len);
+	/* Optional; invoked when the session or its tunnel is deleted */
+	void (*session_close)(struct l2tp_session *session);
+	/* Optional pair called around session_close in
+	 * l2tp_tunnel_closeall() */
+	void (*ref)(struct l2tp_session *session);
+	void (*deref)(struct l2tp_session *session);
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+	void (*show)(struct seq_file *m, void *priv);
+#endif
+	uint8_t			priv[0];	/* private data */
+};
+
+/* Configuration for a new tunnel, handed to l2tp_tunnel_create(). The
+ * address, port and checksum fields are consulted only when the kernel
+ * creates the tunnel socket itself.
+ */
+struct l2tp_tunnel_cfg {
+	int			debug;		/* bitmask of debug message
+						 * categories */
+	enum l2tp_encap_type	encap;
+
+	/* Used only for kernel-created sockets */
+	struct in_addr		local_ip;
+	struct in_addr		peer_ip;
+	u16			local_udp_port;
+	u16			peer_udp_port;
+	unsigned int		use_udp_checksums:1;
+};
+
+/* Describes the tunnel. It contains info to track all the associated
+ * sessions so incoming packets can be sorted out. Created by
+ * l2tp_tunnel_create() and attached to the tunnel socket through
+ * sk_user_data.
+ */
+struct l2tp_tunnel {
+	int			magic;		/* Should be L2TP_TUNNEL_MAGIC */
+	rwlock_t		hlist_lock;	/* protect session_hlist */
+	struct hlist_head	session_hlist[L2TP_HASH_SIZE];
+						/* hashed list of sessions,
+						 * hashed by id */
+	u32			tunnel_id;
+	u32			peer_tunnel_id;
+	int			version;	/* 2=>L2TPv2, 3=>L2TPv3 */
+
+	char			name[20];	/* for logging */
+	int			debug;		/* bitmask of debug message
+						 * categories */
+	enum l2tp_encap_type	encap;
+	struct l2tp_stats	stats;
+
+	struct list_head	list;		/* Keep a list of all tunnels */
+	struct net		*l2tp_net;	/* the net we belong to */
+
+	atomic_t		ref_count;
+#ifdef CONFIG_DEBUG_FS
+	void (*show)(struct seq_file *m, void *arg);
+#endif
+	/* Passed to the receive path for each incoming packet */
+	int (*recv_payload_hook)(struct sk_buff *skb);
+	/* Socket's original destructor, restored on tunnel teardown */
+	void (*old_sk_destruct)(struct sock *);
+	struct sock		*sock;		/* Parent socket */
+	int			fd;
+
+	uint8_t			priv[0];	/* private data */
+};
+
+/* Session create/delete callbacks that are registered per pseudowire
+ * type with l2tp_nl_register_ops().
+ */
+struct l2tp_nl_cmd_ops {
+	int (*session_create)(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg);
+	int (*session_delete)(struct l2tp_session *session);
+};
+
+/* Return the address of the private data area that trails the tunnel
+ * structure.
+ * NOTE(review): l2tp_tunnel_create() allocates exactly
+ * sizeof(struct l2tp_tunnel), so no private bytes exist for tunnels
+ * created there - confirm before dereferencing the result.
+ */
+static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel)
+{
+	return &tunnel->priv[0];
+}
+
+/* Return the address of the private data area that trails the session
+ * structure; its size is the priv_size given to l2tp_session_create().
+ */
+static inline void *l2tp_session_priv(struct l2tp_session *session)
+{
+	return &session->priv[0];
+}
+
+/* Look up the tunnel context attached to a tunnel socket.
+ *
+ * On success the tunnel is returned with a reference held on sk, which
+ * the caller must drop with sock_put(). Returns NULL, with no reference
+ * held, when sk is NULL or has no tunnel attached.
+ */
+static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk)
+{
+	struct l2tp_tunnel *tunnel;
+
+	if (sk == NULL)
+		return NULL;
+
+	sock_hold(sk);
+	tunnel = (struct l2tp_tunnel *)(sk->sk_user_data);
+	if (tunnel == NULL) {
+		/* Not a tunnel socket: give the reference back */
+		sock_put(sk);
+		return NULL;
+	}
+
+	BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+
+	return tunnel;
+}
+
+extern struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id);
+extern struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth);
+extern struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname);
+extern struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
+extern struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
+
+extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp);
+extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
+extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg);
+extern int l2tp_session_delete(struct l2tp_session *session);
+extern void l2tp_session_free(struct l2tp_session *session);
+extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb));
+extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
+
+extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len);
+
+extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops);
+extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
+
+/* Session reference counts. Incremented when code obtains a reference
+ * to a session.
+ */
+static inline void l2tp_session_inc_refcount_1(struct l2tp_session *session)
+{
+	atomic_inc(&session->ref_count);
+}
+
+/* Drop a session reference; the session is freed with
+ * l2tp_session_free() when the count reaches zero.
+ */
+static inline void l2tp_session_dec_refcount_1(struct l2tp_session *session)
+{
+	if (atomic_dec_and_test(&session->ref_count))
+		l2tp_session_free(session);
+}
+
+/* Public refcount entry points. With L2TP_REFCNT_DEBUG defined, every
+ * call logs the call site, the session name and the count before the
+ * change. Note that the debug variants evaluate their argument more than
+ * once, so it must be free of side effects.
+ */
+#ifdef L2TP_REFCNT_DEBUG
+#define l2tp_session_inc_refcount(_s) do { \
+		printk(KERN_DEBUG "l2tp_session_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_s)->name, atomic_read(&(_s)->ref_count)); \
+		l2tp_session_inc_refcount_1(_s);				\
+	} while (0)
+#define l2tp_session_dec_refcount(_s) do { \
+		printk(KERN_DEBUG "l2tp_session_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_s)->name, atomic_read(&(_s)->ref_count)); \
+		l2tp_session_dec_refcount_1(_s);				\
+	} while (0)
+#else
+#define l2tp_session_inc_refcount(s) l2tp_session_inc_refcount_1(s)
+#define l2tp_session_dec_refcount(s) l2tp_session_dec_refcount_1(s)
+#endif
+
+#endif /* _L2TP_CORE_H_ */
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
new file mode 100644
index 00000000..76130134
--- /dev/null
+++ b/net/l2tp/l2tp_debugfs.c
@@ -0,0 +1,341 @@
+/*
+ * L2TP subsystem debugfs
+ *
+ * Copyright (c) 2010 Katalix Systems Ltd
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/hash.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <linux/etherdevice.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "l2tp_core.h"
+
+static struct dentry *rootdir;
+static struct dentry *tunnels;
+
+struct l2tp_dfs_seq_data {
+	struct net *net;
+	int tunnel_idx;			/* current tunnel */
+	int session_idx;		/* index of session within current tunnel */
+	struct l2tp_tunnel *tunnel;
+	struct l2tp_session *session;	/* NULL means get next tunnel */
+};
+
+static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
+{
+	pd->tunnel = l2tp_tunnel_find_nth(pd->net, pd->tunnel_idx);
+	pd->tunnel_idx++;
+}
+
+static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
+{
+	pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+	pd->session_idx++;
+
+	if (pd->session == NULL) {
+		pd->session_idx = 0;
+		l2tp_dfs_next_tunnel(pd);
+	}
+
+}
+
+static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs)
+{
+	struct l2tp_dfs_seq_data *pd = SEQ_START_TOKEN;
+	loff_t pos = *offs;
+
+	if (!pos)
+		goto out;
+
+	BUG_ON(m->private == NULL);
+	pd = m->private;
+
+	if (pd->tunnel == NULL)
+		l2tp_dfs_next_tunnel(pd);
+	else
+		l2tp_dfs_next_session(pd);
+
+	/* NULL tunnel and session indicates end of list */
+	if ((pd->tunnel == NULL) && (pd->session == NULL))
+		pd = NULL;
+
+out:
+	return pd;
+}
+
+
+static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return NULL;
+}
+
+static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
+{
+	/* nothing to do */
+}
+
+/* Emit one tunnel record: ids, addresses, encap, session count, refcnts, stats */
+static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
+{
+	struct l2tp_tunnel *tunnel = v;
+	int session_count = 0;
+	int hash;
+	struct hlist_node *walk, *tmp;
+
+	read_lock_bh(&tunnel->hlist_lock);
+	for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
+		hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
+			struct l2tp_session *session;
+
+			session = hlist_entry(walk, struct l2tp_session, hlist);
+			if (session->session_id == 0)
+				continue;
+
+			session_count++;
+		}
+	}
+	read_unlock_bh(&tunnel->hlist_lock);
+
+	seq_printf(m, "\nTUNNEL %u peer %u", tunnel->tunnel_id, tunnel->peer_tunnel_id);
+	if (tunnel->sock) {
+		struct inet_sock *inet = inet_sk(tunnel->sock);
+		seq_printf(m, " from %pI4 to %pI4\n",
+			   &inet->inet_saddr, &inet->inet_daddr);
+		if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
+			seq_printf(m, " source port %hu, dest port %hu\n",
+				   ntohs(inet->inet_sport), ntohs(inet->inet_dport));
+	}
+	seq_printf(m, " L2TPv%d, %s\n", tunnel->version,
+		   tunnel->encap == L2TP_ENCAPTYPE_UDP ? "UDP" :
+		   tunnel->encap == L2TP_ENCAPTYPE_IP ? "IP" :
+		   "");
+	seq_printf(m, " %d sessions, refcnt %d/%d\n", session_count,
+		   tunnel->sock ? atomic_read(&tunnel->sock->sk_refcnt) : 0,
+		   atomic_read(&tunnel->ref_count));
+
+	seq_printf(m, " %08x tx %llu/%llu/%llu rx %llu/%llu/%llu\n",
+		   tunnel->debug,
+		   (unsigned long long)tunnel->stats.tx_packets,
+		   (unsigned long long)tunnel->stats.tx_bytes,
+		   (unsigned long long)tunnel->stats.tx_errors,
+		   (unsigned long long)tunnel->stats.rx_packets,
+		   (unsigned long long)tunnel->stats.rx_bytes,
+		   (unsigned long long)tunnel->stats.rx_errors);
+
+	if (tunnel->show != NULL)
+		tunnel->show(m, tunnel);
+}
+
+static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
+{
+	struct l2tp_session *session = v;
+
+	seq_printf(m, "  SESSION %u, peer %u, %s\n", session->session_id,
+		   session->peer_session_id,
+		   session->pwtype == L2TP_PWTYPE_ETH ? "ETH" :
+		   session->pwtype == L2TP_PWTYPE_PPP ? "PPP" :
+		   "");
+	if (session->send_seq || session->recv_seq)
+		seq_printf(m, "   nr %hu, ns %hu\n", session->nr, session->ns);
+	seq_printf(m, "   refcnt %d\n", atomic_read(&session->ref_count));
+	seq_printf(m, "   config %d/%d/%c/%c/%s/%s %08x %u\n",
+		   session->mtu, session->mru,
+		   session->recv_seq ? 'R' : '-',
+		   session->send_seq ? 'S' : '-',
+		   session->data_seq == 1 ? "IPSEQ" :
+		   session->data_seq == 2 ? "DATASEQ" : "-",
+		   session->lns_mode ? "LNS" : "LAC",
+		   session->debug,
+		   jiffies_to_msecs(session->reorder_timeout));
+	seq_printf(m, "   offset %hu l2specific %hu/%hu\n",
+		   session->offset, session->l2specific_type, session->l2specific_len);
+	if (session->cookie_len) {
+		seq_printf(m, "   cookie %02x%02x%02x%02x",
+			   session->cookie[0], session->cookie[1],
+			   session->cookie[2], session->cookie[3]);
+		if (session->cookie_len == 8)
+			seq_printf(m, "%02x%02x%02x%02x",
+				   session->cookie[4], session->cookie[5],
+				   session->cookie[6], session->cookie[7]);
+		seq_printf(m, "\n");
+	}
+	if (session->peer_cookie_len) {
+		seq_printf(m, "   peer cookie %02x%02x%02x%02x",
+			   session->peer_cookie[0], session->peer_cookie[1],
+			   session->peer_cookie[2], session->peer_cookie[3]);
+		if (session->peer_cookie_len == 8)
+			seq_printf(m, "%02x%02x%02x%02x",
+				   session->peer_cookie[4], session->peer_cookie[5],
+				   session->peer_cookie[6], session->peer_cookie[7]);
+		seq_printf(m, "\n");
+	}
+
+	seq_printf(m, "   %hu/%hu tx %llu/%llu/%llu rx %llu/%llu/%llu\n",
+		   session->nr, session->ns,
+		   (unsigned long long)session->stats.tx_packets,
+		   (unsigned long long)session->stats.tx_bytes,
+		   (unsigned long long)session->stats.tx_errors,
+		   (unsigned long long)session->stats.rx_packets,
+		   (unsigned long long)session->stats.rx_bytes,
+		   (unsigned long long)session->stats.rx_errors);
+
+	if (session->show != NULL)
+		session->show(m, session);
+}
+
+static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
+{
+	struct l2tp_dfs_seq_data *pd = v;
+
+	/* display header on line 1 */
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "TUNNEL ID, peer ID from IP to IP\n");
+		seq_puts(m, " L2TPv2/L2TPv3, UDP/IP\n");
+		seq_puts(m, " sessions session-count, refcnt refcnt/sk->refcnt\n");
+		seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+		seq_puts(m, "  SESSION ID, peer ID, PWTYPE\n");
+		seq_puts(m, "   refcnt cnt\n");
+		seq_puts(m, "   offset OFFSET l2specific TYPE/LEN\n");
+		seq_puts(m, "   [ cookie ]\n");
+		seq_puts(m, "   [ peer cookie ]\n");
+		seq_puts(m, "   config mtu/mru/rcvseq/sendseq/dataseq/lns debug reorderto\n");
+		seq_puts(m, "   nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+		goto out;
+	}
+
+	/* Show the tunnel or session context */
+	if (pd->session == NULL)
+		l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
+	else
+		l2tp_dfs_seq_session_show(m, pd->session);
+
+out:
+	return 0;
+}
+
+static const struct seq_operations l2tp_dfs_seq_ops = {
+	.start		= l2tp_dfs_seq_start,
+	.next		= l2tp_dfs_seq_next,
+	.stop		= l2tp_dfs_seq_stop,
+	.show		= l2tp_dfs_seq_show,
+};
+
+static int l2tp_dfs_seq_open(struct inode *inode, struct file *file)
+{
+	struct l2tp_dfs_seq_data *pd;
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (pd == NULL)
+		goto out;
+
+	/* Derive the network namespace from the pid opening the
+	 * file.
+	 */
+	pd->net = get_net_ns_by_pid(current->pid);
+	if (IS_ERR(pd->net)) {
+		rc = PTR_ERR(pd->net);
+		goto err_free_pd;
+	}
+
+	rc = seq_open(file, &l2tp_dfs_seq_ops);
+	if (rc)
+		goto err_free_net;
+
+	seq = file->private_data;
+	seq->private = pd;
+
+out:
+	return rc;
+
+err_free_net:
+	put_net(pd->net);
+err_free_pd:
+	kfree(pd);
+	goto out;
+}
+
+static int l2tp_dfs_seq_release(struct inode *inode, struct file *file)
+{
+	struct l2tp_dfs_seq_data *pd;
+	struct seq_file *seq;
+
+	seq = file->private_data;
+	pd = seq->private;
+	if (pd->net)
+		put_net(pd->net);
+	kfree(pd);
+	seq_release(inode, file);
+
+	return 0;
+}
+
+static const struct file_operations l2tp_dfs_fops = {
+	.owner		= THIS_MODULE,
+	.open		= l2tp_dfs_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= l2tp_dfs_seq_release,
+};
+
+static int __init l2tp_debugfs_init(void)
+{
+	int rc = 0;
+
+	rootdir = debugfs_create_dir("l2tp", NULL);
+	if (IS_ERR(rootdir)) {
+		rc = PTR_ERR(rootdir);
+		rootdir = NULL;
+		goto out;
+	}
+
+	tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
+	if (tunnels == NULL)
+		rc = -EIO;
+
+	printk(KERN_INFO "L2TP debugfs support\n");
+
+out:
+	if (rc)
+		printk(KERN_WARNING "l2tp debugfs: unable to init\n");
+
+	return rc;
+}
+
+static void __exit l2tp_debugfs_exit(void)
+{
+	debugfs_remove(tunnels);
+	debugfs_remove(rootdir);
+}
+
+module_init(l2tp_debugfs_init);
+module_exit(l2tp_debugfs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP debugfs driver");
+MODULE_VERSION("1.0");
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
new file mode 100644
index 00000000..3c55f633
--- /dev/null
+++ b/net/l2tp/l2tp_eth.c
@@ -0,0 +1,335 @@
+/*
+ * L2TPv3 ethernet pseudowire driver
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/hash.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <linux/etherdevice.h>
+#include <linux/spinlock.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "l2tp_core.h"
+
+/* Default device name. May be overridden by name specified by user */
+#define L2TP_ETH_DEV_NAME	"l2tpeth%d"
+
+/* via netdev_priv() */
+struct l2tp_eth {
+	struct net_device	*dev;
+	struct sock		*tunnel_sock;
+	struct l2tp_session	*session;
+	struct list_head	list;
+};
+
+/* via l2tp_session_priv() */
+struct l2tp_eth_sess {
+	struct net_device	*dev;
+};
+
+/* per-net private data for this module */
+static unsigned int l2tp_eth_net_id;
+struct l2tp_eth_net {
+	struct list_head l2tp_eth_dev_list;
+	spinlock_t l2tp_eth_lock;
+};
+
+static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net)
+{
+	return net_generic(net, l2tp_eth_net_id);
+}
+
+static int l2tp_eth_dev_init(struct net_device *dev)
+{
+	struct l2tp_eth *priv = netdev_priv(dev);
+
+	priv->dev = dev;
+	random_ether_addr(dev->dev_addr);
+	memset(&dev->broadcast[0], 0xff, 6);
+
+	return 0;
+}
+
+static void l2tp_eth_dev_uninit(struct net_device *dev)
+{
+	struct l2tp_eth *priv = netdev_priv(dev);
+	struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev));
+
+	spin_lock(&pn->l2tp_eth_lock);
+	list_del_init(&priv->list);
+	spin_unlock(&pn->l2tp_eth_lock);
+	dev_put(dev);
+}
+
+static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct l2tp_eth *priv = netdev_priv(dev);
+	struct l2tp_session *session = priv->session;
+	unsigned int len = skb->len;	/* read now: skb is handed off below */
+
+	l2tp_xmit_skb(session, skb, session->hdr_len);
+	dev->stats.tx_bytes += len;	/* never dereference skb after the xmit call */
+	dev->stats.tx_packets++;
+
+	return 0;
+}
+
+static struct net_device_ops l2tp_eth_netdev_ops = {
+	.ndo_init		= l2tp_eth_dev_init,
+	.ndo_uninit		= l2tp_eth_dev_uninit,
+	.ndo_start_xmit		= l2tp_eth_dev_xmit,
+};
+
+static void l2tp_eth_dev_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->netdev_ops		= &l2tp_eth_netdev_ops;
+	dev->destructor		= free_netdev;
+}
+
+static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
+{
+	struct l2tp_eth_sess *spriv = l2tp_session_priv(session);
+	struct net_device *dev = spriv->dev;
+
+	if (session->debug & L2TP_MSG_DATA) {
+		unsigned int length;
+		int offset;
+		u8 *ptr;
+
+		length = min(32u, skb->len);
+		if (!pskb_may_pull(skb, length))
+			goto error;
+		ptr = skb->data;	/* fetch only after the pull, which may move the data */
+
+		printk(KERN_DEBUG "%s: eth recv: ", session->name);
+		offset = 0;
+		do {
+			printk(" %02X", ptr[offset]);
+		} while (++offset < length);
+		printk("\n");
+	}
+
+	/* Drop frames too short to hold an ethernet header */
+	if (!pskb_may_pull(skb, ETH_HLEN))
+		goto error;
+
+	secpath_reset(skb);
+
+	/* checksums verified by L2TP */
+	skb->ip_summed = CHECKSUM_NONE;
+
+	skb_dst_drop(skb);
+	nf_reset(skb);
+
+	if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += data_len;
+	} else
+		dev->stats.rx_errors++;
+
+	return;
+
+error:
+	dev->stats.rx_errors++;
+	kfree_skb(skb);
+}
+
+static void l2tp_eth_delete(struct l2tp_session *session)
+{
+	struct l2tp_eth_sess *spriv;
+	struct net_device *dev;
+
+	if (session) {
+		spriv = l2tp_session_priv(session);
+		dev = spriv->dev;
+		if (dev) {
+			unregister_netdev(dev);
+			spriv->dev = NULL;
+			module_put(THIS_MODULE);
+		}
+	}
+}
+
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+static void l2tp_eth_show(struct seq_file *m, void *arg)
+{
+	struct l2tp_session *session = arg;
+	struct l2tp_eth_sess *spriv = l2tp_session_priv(session);
+	struct net_device *dev = spriv->dev;
+
+	seq_printf(m, "   interface %s\n", dev->name);
+}
+#endif
+
+static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+{
+	struct net_device *dev;
+	char name[IFNAMSIZ];
+	struct l2tp_tunnel *tunnel;
+	struct l2tp_session *session;
+	struct l2tp_eth *priv;
+	struct l2tp_eth_sess *spriv;
+	int rc;
+	struct l2tp_eth_net *pn;
+
+	tunnel = l2tp_tunnel_find(net, tunnel_id);
+	if (!tunnel) {
+		rc = -ENODEV;
+		goto out;
+	}
+
+	session = l2tp_session_find(net, tunnel, session_id);
+	if (session) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	if (cfg->ifname) {
+		dev = dev_get_by_name(net, cfg->ifname);
+		if (dev) {
+			dev_put(dev);
+			rc = -EEXIST;
+			goto out;
+		}
+		strlcpy(name, cfg->ifname, IFNAMSIZ);
+	} else
+		strcpy(name, L2TP_ETH_DEV_NAME);
+
+	session = l2tp_session_create(sizeof(*spriv), tunnel, session_id,
+				      peer_session_id, cfg);
+	if (!session) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	dev = alloc_netdev(sizeof(*priv), name, l2tp_eth_dev_setup);
+	if (!dev) {
+		rc = -ENOMEM;
+		goto out_del_session;
+	}
+
+	dev_net_set(dev, net);
+	if (session->mtu == 0)
+		session->mtu = dev->mtu - session->hdr_len;
+	dev->mtu = session->mtu;
+	dev->needed_headroom += session->hdr_len;
+
+	priv = netdev_priv(dev);
+	priv->dev = dev;
+	priv->session = session;
+	INIT_LIST_HEAD(&priv->list);
+	priv->tunnel_sock = tunnel->sock;
+
+	session->recv_skb = l2tp_eth_dev_recv;
+	session->session_close = l2tp_eth_delete;
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+	session->show = l2tp_eth_show;
+#endif
+	spriv = l2tp_session_priv(session);
+	spriv->dev = dev;
+
+	rc = register_netdev(dev);
+	if (rc < 0)
+		goto out_del_dev;
+
+	__module_get(THIS_MODULE);
+	/* Must be done after register_netdev() */
+	strlcpy(session->ifname, dev->name, IFNAMSIZ);
+
+	dev_hold(dev);
+	pn = l2tp_eth_pernet(dev_net(dev));
+	spin_lock(&pn->l2tp_eth_lock);
+	list_add(&priv->list, &pn->l2tp_eth_dev_list);
+	spin_unlock(&pn->l2tp_eth_lock);
+
+	return 0;
+
+out_del_dev:
+	free_netdev(dev);
+	spriv->dev = NULL;	/* session_close (l2tp_eth_delete) must not see the freed netdev */
+out_del_session:
+	l2tp_session_delete(session);
+out:
+	return rc;
+}
+
+static __net_init int l2tp_eth_init_net(struct net *net)
+{
+	struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id);
+
+	INIT_LIST_HEAD(&pn->l2tp_eth_dev_list);
+	spin_lock_init(&pn->l2tp_eth_lock);
+
+	return 0;
+}
+
+static struct pernet_operations l2tp_eth_net_ops = {
+	.init = l2tp_eth_init_net,
+	.id   = &l2tp_eth_net_id,
+	.size = sizeof(struct l2tp_eth_net),
+};
+
+
+static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = {
+	.session_create	= l2tp_eth_create,
+	.session_delete	= l2tp_session_delete,
+};
+
+
+static int __init l2tp_eth_init(void)
+{
+	int err = 0;
+
+	err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops);
+	if (err)
+		goto out;
+
+	err = register_pernet_device(&l2tp_eth_net_ops);
+	if (err)
+		goto out_unreg;
+
+	printk(KERN_INFO "L2TP ethernet pseudowire support (L2TPv3)\n");
+
+	return 0;
+
+out_unreg:
+	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
+out:
+	return err;
+}
+
+static void __exit l2tp_eth_exit(void)
+{
+	unregister_pernet_device(&l2tp_eth_net_ops);
+	l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH);
+}
+
+module_init(l2tp_eth_init);
+module_exit(l2tp_eth_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP ethernet pseudowire driver");
+MODULE_VERSION("1.0");
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
new file mode 100644
index 00000000..78bc442b
--- /dev/null
+++ b/net/l2tp/l2tp_ip.c
@@ -0,0 +1,707 @@
+/*
+ * L2TPv3 IP encapsulation support
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/icmp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/socket.h>
+#include <linux/l2tp.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/tcp_states.h>
+#include <net/protocol.h>
+#include <net/xfrm.h>
+
+#include "l2tp_core.h"
+
+struct l2tp_ip_sock {
+	/* inet_sock has to be the first member of l2tp_ip_sock */
+	struct inet_sock	inet;
+
+	__u32			conn_id;
+	__u32			peer_conn_id;
+
+	__u64			tx_packets;
+	__u64			tx_bytes;
+	__u64			tx_errors;
+	__u64			rx_packets;
+	__u64			rx_bytes;
+	__u64			rx_errors;
+};
+
+static DEFINE_RWLOCK(l2tp_ip_lock);
+static struct hlist_head l2tp_ip_table;
+static struct hlist_head l2tp_ip_bind_table;
+
+static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
+{
+	return (struct l2tp_ip_sock *)sk;
+}
+
+static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
+{
+	struct hlist_node *node;
+	struct sock *sk;
+
+	sk_for_each_bound(sk, node, &l2tp_ip_bind_table) {
+		struct inet_sock *inet = inet_sk(sk);
+		struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+
+		if (l2tp == NULL)
+			continue;
+
+		if ((l2tp->conn_id == tunnel_id) &&
+		    net_eq(sock_net(sk), net) &&
+		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+			goto found;
+	}
+
+	sk = NULL;
+found:
+	return sk;
+}
+
+static inline struct sock *l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif, u32 tunnel_id)
+{
+	struct sock *sk = __l2tp_ip_bind_lookup(net, laddr, dif, tunnel_id);
+	if (sk)
+		sock_hold(sk);
+
+	return sk;
+}
+
+/* When processing receive frames, there are two cases to
+ * consider. Data frames consist of a non-zero session-id and an
+ * optional cookie. Control frames consist of a regular L2TP header
+ * preceded by 32-bits of zeros.
+ *
+ * L2TPv3 Session Header Over IP
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                           Session ID                          |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |               Cookie (optional, maximum 64 bits)...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *                                                                 |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * L2TPv3 Control Message Header Over IP
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                      (32 bits of zeros)                       |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |T|L|x|x|S|x|x|x|x|x|x|x|  Ver  |             Length            |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                     Control Connection ID                     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |               Ns              |               Nr              |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * All control frames are passed to userspace.
+ */
+static int l2tp_ip_recv(struct sk_buff *skb)
+{
+	struct sock *sk;
+	u32 session_id;
+	u32 tunnel_id;
+	unsigned char *ptr, *optr;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel = NULL;
+	int length;
+	int offset;
+
+	if (!pskb_may_pull(skb, 4))
+		goto discard;
+
+	/* Point to L2TP header, only after the pull which may move the data */
+	optr = ptr = skb->data;
+	session_id = ntohl(*((__be32 *) ptr));
+	ptr += 4;
+
+	/* RFC3931: L2TP/IP packets have the first 4 bytes containing
+	 * the session_id. If it is 0, the packet is a L2TP control
+	 * frame and the session_id value can be discarded.
+	 */
+	if (session_id == 0) {
+		__skb_pull(skb, 4);
+		goto pass_up;
+	}
+
+	/* Ok, this is a data packet. Lookup the session. */
+	session = l2tp_session_find(&init_net, NULL, session_id);
+	if (session == NULL)
+		goto discard;
+
+	tunnel = session->tunnel;
+	if (tunnel == NULL)
+		goto discard;
+
+	/* Trace packet contents, if enabled */
+	if (tunnel->debug & L2TP_MSG_DATA) {
+		length = min(32u, skb->len);
+		if (!pskb_may_pull(skb, length))
+			goto discard;
+
+		/* The pull may have moved the data: point to L2TP header again */
+		optr = skb->data;
+		ptr = optr + 4;
+		printk(KERN_DEBUG "%s: ip recv: ", tunnel->name);
+		offset = 0;
+		do {
+			printk(" %02X", ptr[offset]);
+		} while (++offset < length);
+		printk("\n");
+	}
+
+	l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
+
+	return 0;
+
+pass_up:
+	/* Get the tunnel_id from the L2TP header */
+	if (!pskb_may_pull(skb, 12))
+		goto discard;
+
+	if ((skb->data[0] & 0xc0) != 0xc0)
+		goto discard;
+
+	tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
+	tunnel = l2tp_tunnel_find(&init_net, tunnel_id);
+	if (tunnel != NULL)
+		sk = tunnel->sock;
+	else {
+		struct iphdr *iph = (struct iphdr *) skb_network_header(skb);
+
+		read_lock_bh(&l2tp_ip_lock);
+		sk = __l2tp_ip_bind_lookup(&init_net, iph->daddr, 0, tunnel_id);
+		read_unlock_bh(&l2tp_ip_lock);
+	}
+
+	if (sk == NULL)
+		goto discard;
+
+	sock_hold(sk);
+
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_put;
+
+	nf_reset(skb);
+
+	return sk_receive_skb(sk, skb, 1);
+
+discard_put:
+	sock_put(sk);
+
+discard:
+	kfree_skb(skb);
+	return 0;
+}
+
+static int l2tp_ip_open(struct sock *sk)
+{
+	/* Prevent autobind. We don't have ports. */
+	inet_sk(sk)->inet_num = IPPROTO_L2TP;
+
+	write_lock_bh(&l2tp_ip_lock);
+	sk_add_node(sk, &l2tp_ip_table);
+	write_unlock_bh(&l2tp_ip_lock);
+
+	return 0;
+}
+
+static void l2tp_ip_close(struct sock *sk, long timeout)
+{
+	write_lock_bh(&l2tp_ip_lock);
+	hlist_del_init(&sk->sk_bind_node);
+	hlist_del_init(&sk->sk_node);
+	write_unlock_bh(&l2tp_ip_lock);
+	sk_common_release(sk);
+}
+
+static void l2tp_ip_destroy_sock(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+		kfree_skb(skb);
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr;
+	int ret;
+	int chk_addr_ret;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		return -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_l2tpip))
+		return -EINVAL;
+	if (addr->l2tp_family != AF_INET)
+		return -EINVAL;
+
+	ret = -EADDRINUSE;
+	read_lock_bh(&l2tp_ip_lock);
+	if (__l2tp_ip_bind_lookup(&init_net, addr->l2tp_addr.s_addr, sk->sk_bound_dev_if, addr->l2tp_conn_id))
+		goto out_in_use;
+
+	read_unlock_bh(&l2tp_ip_lock);
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
+		goto out;
+
+	chk_addr_ret = inet_addr_type(&init_net, addr->l2tp_addr.s_addr);
+	ret = -EADDRNOTAVAIL;
+	if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
+	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+		goto out;
+
+	inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr;
+	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+		inet->inet_saddr = 0;  /* Use device */
+	sk_dst_reset(sk);
+
+	l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id;
+
+	write_lock_bh(&l2tp_ip_lock);
+	sk_add_bind_node(sk, &l2tp_ip_bind_table);
+	sk_del_node_init(sk);
+	write_unlock_bh(&l2tp_ip_lock);
+	ret = 0;
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+out:
+	release_sock(sk);
+
+	return ret;
+
+out_in_use:
+	read_unlock_bh(&l2tp_ip_lock);
+
+	return ret;
+}
+
+static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 saddr;
+	int oif, rc;
+
+	if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
+		return -EINVAL;
+
+	if (addr_len < sizeof(*lsa))
+		return -EINVAL;
+
+	if (lsa->l2tp_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	lock_sock(sk);
+
+	sk_dst_reset(sk);
+
+	oif = sk->sk_bound_dev_if;
+	saddr = inet->inet_saddr;
+
+	rc = -EINVAL;
+	if (ipv4_is_multicast(lsa->l2tp_addr.s_addr))
+		goto out;
+
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, lsa->l2tp_addr.s_addr, saddr,
+			      RT_CONN_FLAGS(sk), oif,
+			      IPPROTO_L2TP,
+			      0, 0, sk, true);
+	if (IS_ERR(rt)) {
+		rc = PTR_ERR(rt);
+		if (rc == -ENETUNREACH)
+			IP_INC_STATS_BH(&init_net, IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	rc = -ENETUNREACH;
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		ip_rt_put(rt);
+		goto out;
+	}
+
+	l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
+
+	if (!inet->inet_saddr)
+		inet->inet_saddr = fl4->saddr;
+	if (!inet->inet_rcv_saddr)
+		inet->inet_rcv_saddr = fl4->saddr;
+	inet->inet_daddr = fl4->daddr;
+	sk->sk_state = TCP_ESTABLISHED;
+	inet->inet_id = jiffies;
+
+	sk_dst_set(sk, &rt->dst);
+
+	write_lock_bh(&l2tp_ip_lock);
+	hlist_del_init(&sk->sk_bind_node);
+	sk_add_bind_node(sk, &l2tp_ip_bind_table);
+	write_unlock_bh(&l2tp_ip_lock);
+
+	rc = 0;
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int l2tp_ip_disconnect(struct sock *sk, int flags)
+{
+	if (sock_flag(sk, SOCK_ZAPPED))
+		return 0;
+
+	return udp_disconnect(sk, flags);
+}
+
+static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
+			   int *uaddr_len, int peer)
+{
+	struct sock *sk		= sock->sk;
+	struct inet_sock *inet	= inet_sk(sk);
+	struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk);
+	struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr;
+
+	memset(lsa, 0, sizeof(*lsa));
+	lsa->l2tp_family = AF_INET;
+	if (peer) {
+		if (!inet->inet_dport)
+			return -ENOTCONN;
+		lsa->l2tp_conn_id = lsk->peer_conn_id;
+		lsa->l2tp_addr.s_addr = inet->inet_daddr;
+	} else {
+		__be32 addr = inet->inet_rcv_saddr;
+		if (!addr)
+			addr = inet->inet_saddr;
+		lsa->l2tp_conn_id = lsk->conn_id;
+		lsa->l2tp_addr.s_addr = addr;
+	}
+	*uaddr_len = sizeof(*lsa);
+	return 0;
+}
+
+static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
+{
+	int rc;
+
+	/* Charge it to the socket, dropping if the queue is full. */
+	rc = sock_queue_rcv_skb(sk, skb);
+	if (rc < 0)
+		goto drop;
+
+	return 0;
+
+drop:
+	IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS);
+	kfree_skb(skb);
+	return -1;
+}
+
+/* Userspace will call sendmsg() on the tunnel socket to send L2TP
+ * control frames.
+ */
+static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len)
+{
+	struct sk_buff *skb;
+	int rc;
+	struct l2tp_ip_sock *lsa = l2tp_ip_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = NULL;
+	struct flowi4 *fl4;
+	int connected = 0;
+	__be32 daddr;
+
+	lock_sock(sk);
+
+	rc = -ENOTCONN;
+	if (sock_flag(sk, SOCK_DEAD))
+		goto out;
+
+	/* Get and verify the address. */
+	if (msg->msg_name) {
+		struct sockaddr_l2tpip *lip = (struct sockaddr_l2tpip *) msg->msg_name;
+		rc = -EINVAL;
+		if (msg->msg_namelen < sizeof(*lip))
+			goto out;
+
+		if (lip->l2tp_family != AF_INET) {
+			rc = -EAFNOSUPPORT;
+			if (lip->l2tp_family != AF_UNSPEC)
+				goto out;
+		}
+
+		daddr = lip->l2tp_addr.s_addr;
+	} else {
+		rc = -EDESTADDRREQ;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+
+		daddr = inet->inet_daddr;
+		connected = 1;
+	}
+
+	/* Allocate a socket buffer */
+	rc = -ENOMEM;
+	skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) +
+			   4 + len, 0, GFP_KERNEL);
+	if (!skb)
+		goto error;
+
+	/* Reserve space for headers, putting IP header on 4-byte boundary. */
+	skb_reserve(skb, 2 + NET_SKB_PAD);
+	skb_reset_network_header(skb);
+	skb_reserve(skb, sizeof(struct iphdr));
+	skb_reset_transport_header(skb);
+
+	/* Insert 0 session_id */
+	*((__be32 *) skb_put(skb, 4)) = 0;
+
+	/* Copy user data into skb */
+	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (rc < 0) {
+		kfree_skb(skb);
+		goto error;
+	}
+
+	fl4 = &inet->cork.fl.u.ip4;
+	if (connected)
+		rt = (struct rtable *) __sk_dst_check(sk, 0);
+
+	if (rt == NULL) {
+		struct ip_options_rcu *inet_opt;
+
+		rcu_read_lock();
+		inet_opt = rcu_dereference(inet->inet_opt);
+
+		/* Use correct destination address if we have options. */
+		if (inet_opt && inet_opt->opt.srr)
+			daddr = inet_opt->opt.faddr;
+
+		rcu_read_unlock();
+
+		/* If this fails, retransmit mechanism of transport layer will
+		 * keep trying until route appears or the connection times
+		 * itself out.
+		 */
+		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+					   daddr, inet->inet_saddr,
+					   inet->inet_dport, inet->inet_sport,
+					   sk->sk_protocol, RT_CONN_FLAGS(sk),
+					   sk->sk_bound_dev_if);
+		if (IS_ERR(rt))
+			goto no_route;
+		sk_setup_caps(sk, &rt->dst);
+	}
+	skb_dst_set(skb, dst_clone(&rt->dst));
+
+	/* Queue the packet to IP for output */
+	rc = ip_queue_xmit(skb, &inet->cork.fl);
+
+error:
+	/* Update stats */
+	if (rc >= 0) {
+		lsa->tx_packets++;
+		lsa->tx_bytes += len;
+		rc = len;
+	} else {
+		lsa->tx_errors++;
+	}
+
+out:
+	release_sock(sk);
+	return rc;
+
+no_route:
+	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb(skb);
+	rc = -EHOSTUNREACH;
+	goto out;
+}
+
+static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+			   size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk);
+	size_t copied = 0;
+	int err = -EOPNOTSUPP;
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct sk_buff *skb;
+
+	if (flags & MSG_OOB)
+		goto out;
+
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		sin->sin_port = 0;
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+	if (inet->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+	if (flags & MSG_TRUNC)
+		copied = skb->len;
+done:
+	skb_free_datagram(sk, skb);
+out:
+	if (err) {
+		lsk->rx_errors++;
+		return err;
+	}
+
+	lsk->rx_packets++;
+	lsk->rx_bytes += copied;
+
+	return copied;
+}
+
+static struct proto l2tp_ip_prot = {
+	.name		   = "L2TP/IP",
+	.owner		   = THIS_MODULE,
+	.init		   = l2tp_ip_open,
+	.close		   = l2tp_ip_close,
+	.bind		   = l2tp_ip_bind,
+	.connect	   = l2tp_ip_connect,
+	.disconnect	   = l2tp_ip_disconnect,
+	.ioctl		   = udp_ioctl,
+	.destroy	   = l2tp_ip_destroy_sock,
+	.setsockopt	   = ip_setsockopt,
+	.getsockopt	   = ip_getsockopt,
+	.sendmsg	   = l2tp_ip_sendmsg,
+	.recvmsg	   = l2tp_ip_recvmsg,
+	.backlog_rcv	   = l2tp_ip_backlog_recv,
+	.hash		   = inet_hash,
+	.unhash		   = inet_unhash,
+	.obj_size	   = sizeof(struct l2tp_ip_sock),
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ip_setsockopt,
+	.compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+
+static const struct proto_ops l2tp_ip_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,
+	.bind		   = inet_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = sock_no_accept,
+	.getname	   = l2tp_ip_getname,
+	.poll		   = datagram_poll,
+	.ioctl		   = inet_ioctl,
+	.listen		   = sock_no_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw l2tp_ip_protosw = {
+	.type		= SOCK_DGRAM,
+	.protocol	= IPPROTO_L2TP,
+	.prot		= &l2tp_ip_prot,
+	.ops		= &l2tp_ip_ops,
+	.no_check	= 0,
+};
+
+static struct net_protocol l2tp_ip_protocol __read_mostly = {
+	.handler	= l2tp_ip_recv,
+};
+
+static int __init l2tp_ip_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "L2TP IP encapsulation support (L2TPv3)\n");
+
+	err = proto_register(&l2tp_ip_prot, 1);
+	if (err != 0)
+		goto out;
+
+	err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP);
+	if (err)
+		goto out1;
+
+	inet_register_protosw(&l2tp_ip_protosw);
+	return 0;
+
+out1:
+	proto_unregister(&l2tp_ip_prot);
+out:
+	return err;
+}
+
+static void __exit l2tp_ip_exit(void)
+{
+	inet_unregister_protosw(&l2tp_ip_protosw);
+	inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP);
+	proto_unregister(&l2tp_ip_prot);
+}
+
+module_init(l2tp_ip_init);
+module_exit(l2tp_ip_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP over IP");
+MODULE_VERSION("1.0");
+
+/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
+ * enums
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP);
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
new file mode 100644
index 00000000..93a41a09
--- /dev/null
+++ b/net/l2tp/l2tp_netlink.c
@@ -0,0 +1,841 @@
+/*
+ * L2TP netlink layer, for management
+ *
+ * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
+ *
+ * Partly based on the IrDA netlink implementation
+ * (see net/irda/irnetlink.c) which is:
+ * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org>
+ * which is in turn partly based on the wireless netlink code:
+ * Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/udp.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/socket.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <net/net_namespace.h>
+
+#include <linux/l2tp.h>
+
+#include "l2tp_core.h"
+
+
+static struct genl_family l2tp_nl_family = {	/* registered by l2tp_nl_init() */
+	.id		= GENL_ID_GENERATE,
+	.name		= L2TP_GENL_NAME,
+	.version	= L2TP_GENL_VERSION,
+	.hdrsize	= 0,
+	.maxattr	= L2TP_ATTR_MAX,
+};
+
+/* Session ops per pseudowire type, indexed by pw_type. Accessed under genl lock */
+static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX];
+
+/* Resolve the session addressed by a request: by interface name when
+ * L2TP_ATTR_IFNAME is given, else by tunnel id + session id. NULL if none.
+ */
+static struct l2tp_session *l2tp_nl_session_find(struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct nlattr **attrs = info->attrs;
+	struct l2tp_tunnel *tunnel;
+
+	if (attrs[L2TP_ATTR_IFNAME])
+		return l2tp_session_find_by_ifname(net,
+					nla_data(attrs[L2TP_ATTR_IFNAME]));
+
+	if (!attrs[L2TP_ATTR_SESSION_ID] || !attrs[L2TP_ATTR_CONN_ID])
+		return NULL;
+
+	tunnel = l2tp_tunnel_find(net, nla_get_u32(attrs[L2TP_ATTR_CONN_ID]));
+	if (!tunnel)
+		return NULL;
+
+	return l2tp_session_find(net, tunnel,
+				 nla_get_u32(attrs[L2TP_ATTR_SESSION_ID]));
+}
+
+/* L2TP_CMD_NOOP handler: build an empty NOOP message and unicast it back to
+ * the requesting socket.
+ *
+ * Returns the genlmsg_unicast() result on success, -ENOMEM if the reply skb
+ * cannot be allocated, or -EMSGSIZE if the genetlink header does not fit.
+ */
+static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
+			  &l2tp_nl_family, 0, L2TP_CMD_NOOP);
+	if (!hdr) {
+		/* genlmsg_put() reports failure with NULL, never an ERR_PTR,
+		 * so an IS_ERR() test would let a NULL header through.
+		 */
+		nlmsg_free(msg);
+		return -EMSGSIZE;
+	}
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_unicast(genl_info_net(info), msg, info->snd_pid);
+}
+
+/* L2TP_CMD_TUNNEL_CREATE handler.
+ *
+ * L2TP_ATTR_CONN_ID, L2TP_ATTR_PEER_CONN_ID, L2TP_ATTR_PROTO_VERSION and
+ * L2TP_ATTR_ENCAP_TYPE are mandatory. If L2TP_ATTR_FD is present its value
+ * is handed to l2tp_tunnel_create(); otherwise fd stays -1 and whichever
+ * of the IP address, UDP port and UDP checksum attributes are present are
+ * copied into the tunnel config instead. L2TP_ATTR_DEBUG is optional.
+ *
+ * Returns -EINVAL if a mandatory attribute is missing or the encap type is
+ * neither UDP nor IP, -EEXIST if a tunnel with this id already exists, and
+ * otherwise the result of l2tp_tunnel_create().
+ */
+static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **attrs = info->attrs;
+	struct net *net = genl_info_net(info);
+	struct l2tp_tunnel_cfg cfg = { 0, };
+	struct l2tp_tunnel *tunnel;
+	u32 tunnel_id;
+	u32 peer_tunnel_id;
+	int proto_version;
+	int fd = -1;
+
+	if (!attrs[L2TP_ATTR_CONN_ID] ||
+	    !attrs[L2TP_ATTR_PEER_CONN_ID] ||
+	    !attrs[L2TP_ATTR_PROTO_VERSION] ||
+	    !attrs[L2TP_ATTR_ENCAP_TYPE])
+		return -EINVAL;
+
+	tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]);
+	peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]);
+	proto_version = nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]);
+	cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]);
+
+	if (attrs[L2TP_ATTR_FD]) {
+		fd = nla_get_u32(attrs[L2TP_ATTR_FD]);
+	} else {
+		if (attrs[L2TP_ATTR_IP_SADDR])
+			cfg.local_ip.s_addr =
+				nla_get_be32(attrs[L2TP_ATTR_IP_SADDR]);
+		if (attrs[L2TP_ATTR_IP_DADDR])
+			cfg.peer_ip.s_addr =
+				nla_get_be32(attrs[L2TP_ATTR_IP_DADDR]);
+		if (attrs[L2TP_ATTR_UDP_SPORT])
+			cfg.local_udp_port =
+				nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]);
+		if (attrs[L2TP_ATTR_UDP_DPORT])
+			cfg.peer_udp_port =
+				nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]);
+		if (attrs[L2TP_ATTR_UDP_CSUM])
+			cfg.use_udp_checksums =
+				nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]);
+	}
+
+	if (attrs[L2TP_ATTR_DEBUG])
+		cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]);
+
+	/* Refuse to create a second tunnel with the same id. */
+	tunnel = l2tp_tunnel_find(net, tunnel_id);
+	if (tunnel != NULL)
+		return -EEXIST;
+
+	switch (cfg.encap) {
+	case L2TP_ENCAPTYPE_UDP:
+	case L2TP_ENCAPTYPE_IP:
+		return l2tp_tunnel_create(net, fd, proto_version, tunnel_id,
+					  peer_tunnel_id, &cfg, &tunnel);
+	}
+
+	/* Any other encapsulation type is rejected. */
+	return -EINVAL;
+}
+
+/* L2TP_CMD_TUNNEL_DELETE handler.
+ *
+ * Looks up the tunnel named by L2TP_ATTR_CONN_ID and asks the core to delete
+ * it; the result of l2tp_tunnel_delete() is deliberately ignored.
+ *
+ * Returns 0, -EINVAL if the attribute is missing, or -ENODEV if no such
+ * tunnel exists.
+ */
+static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *id_attr = info->attrs[L2TP_ATTR_CONN_ID];
+	struct l2tp_tunnel *tunnel;
+
+	if (!id_attr)
+		return -EINVAL;
+
+	tunnel = l2tp_tunnel_find(genl_info_net(info), nla_get_u32(id_attr));
+	if (!tunnel)
+		return -ENODEV;
+
+	(void) l2tp_tunnel_delete(tunnel);
+
+	return 0;
+}
+
+/* L2TP_CMD_TUNNEL_MODIFY handler.
+ *
+ * The only tunnel field changed here is the debug mask, taken from
+ * L2TP_ATTR_DEBUG when that attribute is present.
+ *
+ * Returns 0, -EINVAL without L2TP_ATTR_CONN_ID, -ENODEV for an unknown id.
+ */
+static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **attrs = info->attrs;
+	struct l2tp_tunnel *tunnel;
+
+	if (!attrs[L2TP_ATTR_CONN_ID])
+		return -EINVAL;
+
+	tunnel = l2tp_tunnel_find(genl_info_net(info),
+				  nla_get_u32(attrs[L2TP_ATTR_CONN_ID]));
+	if (!tunnel)
+		return -ENODEV;
+
+	if (attrs[L2TP_ATTR_DEBUG])
+		tunnel->debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]);
+
+	return 0;
+}
+
+/* Append one L2TP_CMD_TUNNEL_GET message describing @tunnel to @skb.
+ * Returns genlmsg_end()'s result, or -EMSGSIZE if @skb runs out of room.
+ */
+static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 pid, u32 seq, int flags,
+			       struct l2tp_tunnel *tunnel)
+{
+	void *hdr;
+	struct nlattr *nest;
+	struct sock *sk = NULL;
+	struct inet_sock *inet;
+
+	hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags,
+			  L2TP_CMD_TUNNEL_GET);
+	if (!hdr)	/* genlmsg_put() fails with NULL, never an ERR_PTR */
+		return -EMSGSIZE;
+
+	NLA_PUT_U8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version);
+	NLA_PUT_U32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id);
+	NLA_PUT_U32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id);
+	NLA_PUT_U32(skb, L2TP_ATTR_DEBUG, tunnel->debug);
+	NLA_PUT_U16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap);
+
+	nest = nla_nest_start(skb, L2TP_ATTR_STATS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	NLA_PUT_U64(skb, L2TP_ATTR_TX_PACKETS, tunnel->stats.tx_packets);
+	NLA_PUT_U64(skb, L2TP_ATTR_TX_BYTES, tunnel->stats.tx_bytes);
+	NLA_PUT_U64(skb, L2TP_ATTR_TX_ERRORS, tunnel->stats.tx_errors);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_PACKETS, tunnel->stats.rx_packets);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_BYTES, tunnel->stats.rx_bytes);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, tunnel->stats.rx_seq_discards);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_OOS_PACKETS, tunnel->stats.rx_oos_packets);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_ERRORS, tunnel->stats.rx_errors);
+	nla_nest_end(skb, nest);
+
+	sk = tunnel->sock;
+	if (!sk)	/* no socket: report the tunnel without address info */
+		goto out;
+	inet = inet_sk(sk);
+
+	switch (tunnel->encap) {
+	case L2TP_ENCAPTYPE_UDP:
+		NLA_PUT_U16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport));
+		NLA_PUT_U16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport));
+		NLA_PUT_U8(skb, L2TP_ATTR_UDP_CSUM, (sk->sk_no_check != UDP_CSUM_NOXMIT));
+		/* fall through: UDP tunnels report their IP addresses too */
+	case L2TP_ENCAPTYPE_IP:
+		NLA_PUT_BE32(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr);
+		NLA_PUT_BE32(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr);
+		break;
+	}
+
+out:
+	return genlmsg_end(skb, hdr);
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+/* L2TP_CMD_TUNNEL_GET (doit) handler: reply with the attributes of the
+ * tunnel named by L2TP_ATTR_CONN_ID.
+ *
+ * Returns the genlmsg_unicast() result once the reply has been built.
+ *
+ * Errors before that point:
+ *   -EINVAL  L2TP_ATTR_CONN_ID is missing
+ *   -ENODEV  no tunnel with that id exists
+ *   -ENOMEM  the reply skb cannot be allocated
+ *   < 0      whatever l2tp_nl_tunnel_send() reported; the reply is freed
+ */
+static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct l2tp_tunnel *tunnel;
+	struct sk_buff *reply;
+	int err;
+
+	if (!info->attrs[L2TP_ATTR_CONN_ID])
+		return -EINVAL;
+
+	tunnel = l2tp_tunnel_find(net,
+				  nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]));
+	if (tunnel == NULL)
+		return -ENODEV;
+
+	reply = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	/* The reply header carries NLM_F_ACK in its flags. */
+	err = l2tp_nl_tunnel_send(reply, info->snd_pid, info->snd_seq,
+				  NLM_F_ACK, tunnel);
+	if (err < 0) {
+		nlmsg_free(reply);
+		return err;
+	}
+
+	return genlmsg_unicast(net, reply, info->snd_pid);
+}
+
+/* L2TP_CMD_TUNNEL_GET (dumpit) handler.
+ *
+ * cb->args[0] holds the index of the next tunnel to report; it is read on
+ * entry and written back on exit. Returns skb->len.
+ */
+static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct l2tp_tunnel *tunnel;
+	int idx;
+
+	for (idx = cb->args[0]; ; idx++) {
+		tunnel = l2tp_tunnel_find_nth(net, idx);
+		if (!tunnel)
+			break;
+		if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).pid,
+					cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					tunnel) <= 0)
+			break;
+	}
+
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+/* L2TP_CMD_SESSION_CREATE handler: validate the request, collect the session
+ * config from its attributes and call the session_create op registered for
+ * the requested pseudowire type. Returns that op's result or a negative errno.
+ */
+static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *info)
+{
+	u32 tunnel_id = 0;
+	u32 session_id;
+	u32 peer_session_id;
+	int ret = 0;
+	struct l2tp_tunnel *tunnel;
+	struct l2tp_session *session;
+	struct l2tp_session_cfg cfg = { 0, };
+	struct net *net = genl_info_net(info);
+
+	if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+	tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+	tunnel = l2tp_tunnel_find(net, tunnel_id);
+	if (!tunnel) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!info->attrs[L2TP_ATTR_SESSION_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+	session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
+	session = l2tp_session_find(net, tunnel, session_id);
+	if (session) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) {
+		ret = -EINVAL;
+		goto out;
+	}
+	peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]);
+
+	if (!info->attrs[L2TP_ATTR_PW_TYPE]) {
+		ret = -EINVAL;
+		goto out;
+	}
+	cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]);
+	if (cfg.pw_type >= __L2TP_PWTYPE_MAX) {	/* also bounds the l2tp_nl_cmd_ops[] index */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (tunnel->version > 2) {	/* these attributes are only read for version > 2 */
+		if (info->attrs[L2TP_ATTR_OFFSET])
+			cfg.offset = nla_get_u16(info->attrs[L2TP_ATTR_OFFSET]);
+
+		if (info->attrs[L2TP_ATTR_DATA_SEQ])
+			cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
+
+		cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT;
+		if (info->attrs[L2TP_ATTR_L2SPEC_TYPE])
+			cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
+
+		cfg.l2specific_len = 4;
+		if (info->attrs[L2TP_ATTR_L2SPEC_LEN])
+			cfg.l2specific_len = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_LEN]);
+
+		if (info->attrs[L2TP_ATTR_COOKIE]) {
+			u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]);
+			if (len > 8) {
+				ret = -EINVAL;
+				goto out;
+			}
+			cfg.cookie_len = len;
+			memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len);
+		}
+		if (info->attrs[L2TP_ATTR_PEER_COOKIE]) {
+			u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]);
+			if (len > 8) {
+				ret = -EINVAL;
+				goto out;
+			}
+			cfg.peer_cookie_len = len;
+			memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len);
+		}
+		if (info->attrs[L2TP_ATTR_IFNAME])
+			cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
+
+		if (info->attrs[L2TP_ATTR_VLAN_ID])
+			cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
+	}
+
+	if (info->attrs[L2TP_ATTR_DEBUG])
+		cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
+
+	if (info->attrs[L2TP_ATTR_RECV_SEQ])
+		cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
+
+	if (info->attrs[L2TP_ATTR_SEND_SEQ])
+		cfg.send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]);
+
+	if (info->attrs[L2TP_ATTR_LNS_MODE])
+		cfg.lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]);
+
+	if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
+		cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
+
+	if (info->attrs[L2TP_ATTR_MTU])
+		cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
+
+	if (info->attrs[L2TP_ATTR_MRU])
+		cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
+
+	if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) ||
+	    (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) {
+		ret = -EPROTONOSUPPORT;
+		goto out;
+	}
+
+	/* Check that pseudowire-specific params are present */
+	switch (cfg.pw_type) {
+	case L2TP_PWTYPE_ETH_VLAN:
+		if (!info->attrs[L2TP_ATTR_VLAN_ID]) {
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case L2TP_PWTYPE_NONE:
+	case L2TP_PWTYPE_ETH:
+	case L2TP_PWTYPE_PPP:
+	case L2TP_PWTYPE_PPP_AC:
+		break;
+	case L2TP_PWTYPE_IP:
+	default:
+		ret = -EPROTONOSUPPORT;
+		goto out;	/* a plain break here would still call session_create */
+	}
+
+	ret = (*l2tp_nl_cmd_ops[cfg.pw_type]->session_create)(net, tunnel_id,
+		session_id, peer_session_id, &cfg);
+
+out:
+	return ret;
+}
+
+/* L2TP_CMD_SESSION_DELETE: run the session_delete op registered for the
+ * session's pseudowire type. Returns the op's result, 0 when no op is
+ * registered, or -ENODEV when the request matches no session. */
+static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *info)
+{
+	struct l2tp_session *session = l2tp_nl_session_find(info);
+	const struct l2tp_nl_cmd_ops *ops = NULL;
+	u16 pw_type;
+
+	if (!session)
+		return -ENODEV;
+
+	pw_type = session->pwtype;
+	if (pw_type < __L2TP_PWTYPE_MAX)
+		ops = l2tp_nl_cmd_ops[pw_type];
+	if (!ops || !ops->session_delete)
+		return 0;
+
+	return (*ops->session_delete)(session);
+}
+
+/* L2TP_CMD_SESSION_MODIFY: copy every optional attribute present in the
+ * request into the session. Returns 0, or -ENODEV if no session matches.
+ */
+static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **attrs = info->attrs;
+	struct l2tp_session *session;
+
+	session = l2tp_nl_session_find(info);
+	if (session == NULL)
+		return -ENODEV;
+
+	if (attrs[L2TP_ATTR_DEBUG])
+		session->debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]);
+
+	if (attrs[L2TP_ATTR_DATA_SEQ])
+		session->data_seq = nla_get_u8(attrs[L2TP_ATTR_DATA_SEQ]);
+
+	if (attrs[L2TP_ATTR_RECV_SEQ])
+		session->recv_seq = nla_get_u8(attrs[L2TP_ATTR_RECV_SEQ]);
+
+	if (attrs[L2TP_ATTR_SEND_SEQ])
+		session->send_seq = nla_get_u8(attrs[L2TP_ATTR_SEND_SEQ]);
+
+	if (attrs[L2TP_ATTR_LNS_MODE])
+		session->lns_mode = nla_get_u8(attrs[L2TP_ATTR_LNS_MODE]);
+
+	if (attrs[L2TP_ATTR_RECV_TIMEOUT])
+		session->reorder_timeout = nla_get_msecs(attrs[L2TP_ATTR_RECV_TIMEOUT]);
+
+	if (attrs[L2TP_ATTR_MTU])
+		session->mtu = nla_get_u16(attrs[L2TP_ATTR_MTU]);
+
+	if (attrs[L2TP_ATTR_MRU])
+		session->mru = nla_get_u16(attrs[L2TP_ATTR_MRU]);
+
+	return 0;
+}
+
+/* Append one L2TP_CMD_SESSION_GET message describing @session to @skb.
+ * Returns genlmsg_end()'s result, or -EMSGSIZE if @skb runs out of room.
+ */
+static int l2tp_nl_session_send(struct sk_buff *skb, u32 pid, u32 seq, int flags,
+				struct l2tp_session *session)
+{
+	void *hdr;
+	struct nlattr *nest;
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	struct sock *sk = tunnel->sock;	/* only examined under CONFIG_XFRM */
+
+	hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, L2TP_CMD_SESSION_GET);
+	if (!hdr)	/* genlmsg_put() fails with NULL, never an ERR_PTR */
+		return -EMSGSIZE;
+
+	NLA_PUT_U32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id);
+	NLA_PUT_U32(skb, L2TP_ATTR_SESSION_ID, session->session_id);
+	NLA_PUT_U32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id);
+	NLA_PUT_U32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id);
+	NLA_PUT_U32(skb, L2TP_ATTR_DEBUG, session->debug);
+	NLA_PUT_U16(skb, L2TP_ATTR_PW_TYPE, session->pwtype);
+	NLA_PUT_U16(skb, L2TP_ATTR_MTU, session->mtu);
+	if (session->mru)
+		NLA_PUT_U16(skb, L2TP_ATTR_MRU, session->mru);
+	if (session->ifname && session->ifname[0])
+		NLA_PUT_STRING(skb, L2TP_ATTR_IFNAME, session->ifname);
+	if (session->cookie_len)
+		NLA_PUT(skb, L2TP_ATTR_COOKIE, session->cookie_len, &session->cookie[0]);
+	if (session->peer_cookie_len)
+		NLA_PUT(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, &session->peer_cookie[0]);
+	NLA_PUT_U8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq);
+	NLA_PUT_U8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq);
+	NLA_PUT_U8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode);
+#ifdef CONFIG_XFRM
+	if ((sk) && (sk->sk_policy[0] || sk->sk_policy[1]))
+		NLA_PUT_U8(skb, L2TP_ATTR_USING_IPSEC, 1);
+#endif
+	if (session->reorder_timeout)
+		NLA_PUT_MSECS(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout);
+
+	nest = nla_nest_start(skb, L2TP_ATTR_STATS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	NLA_PUT_U64(skb, L2TP_ATTR_TX_PACKETS, session->stats.tx_packets);
+	NLA_PUT_U64(skb, L2TP_ATTR_TX_BYTES, session->stats.tx_bytes);
+	NLA_PUT_U64(skb, L2TP_ATTR_TX_ERRORS, session->stats.tx_errors);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_PACKETS, session->stats.rx_packets);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_BYTES, session->stats.rx_bytes);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, session->stats.rx_seq_discards);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_OOS_PACKETS, session->stats.rx_oos_packets);
+	NLA_PUT_U64(skb, L2TP_ATTR_RX_ERRORS, session->stats.rx_errors);
+	nla_nest_end(skb, nest);
+
+	return genlmsg_end(skb, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+/* L2TP_CMD_SESSION_GET (doit) handler: reply with the attributes of the
+ * session identified by the request (see l2tp_nl_session_find()).
+ *
+ * Returns the genlmsg_unicast() result once the reply has been built.
+ * Errors before that point:
+ *   -ENODEV  no matching session
+ *   -ENOMEM  the reply skb cannot be allocated
+ *   < 0      whatever l2tp_nl_session_send() reported; the reply is freed
+ */
+static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct l2tp_session *session = l2tp_nl_session_find(info);
+	struct sk_buff *reply;
+	int err;
+
+	if (!session)
+		return -ENODEV;
+
+	reply = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	err = l2tp_nl_session_send(reply, info->snd_pid, info->snd_seq,
+				   0, session);
+	if (err < 0) {
+		nlmsg_free(reply);
+		return err;
+	}
+
+	return genlmsg_unicast(genl_info_net(info), reply, info->snd_pid);
+}
+
+/* L2TP_CMD_SESSION_GET (dumpit) handler.
+ *
+ * Walks the sessions of each tunnel in turn. cb->args[0] and cb->args[1]
+ * hold the tunnel and session indices to continue from; they are read on
+ * entry and written back on exit. Returns skb->len.
+ */
+static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct l2tp_tunnel *tunnel;
+	struct l2tp_session *session;
+	int tunnel_idx = cb->args[0];
+	int session_idx = cb->args[1];
+
+	while ((tunnel = l2tp_tunnel_find_nth(net, tunnel_idx)) != NULL) {
+		for (;;) {
+			session = l2tp_session_find_nth(tunnel, session_idx);
+			if (session == NULL)
+				break;
+			if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).pid,
+						 cb->nlh->nlmsg_seq,
+						 NLM_F_MULTI, session) <= 0)
+				goto done;
+			session_idx++;
+		}
+
+		/* Tunnel exhausted: move on to the first session of the next. */
+		tunnel_idx++;
+		session_idx = 0;
+	}
+
+done:
+	cb->args[0] = tunnel_idx;
+	cb->args[1] = session_idx;
+
+	return skb->len;
+}
+
+static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {	/* attribute policy shared by every op in l2tp_nl_ops[] */
+	[L2TP_ATTR_NONE]		= { .type = NLA_UNSPEC, },
+	[L2TP_ATTR_PW_TYPE]		= { .type = NLA_U16, },
+	[L2TP_ATTR_ENCAP_TYPE]		= { .type = NLA_U16, },
+	[L2TP_ATTR_OFFSET]		= { .type = NLA_U16, },
+	[L2TP_ATTR_DATA_SEQ]		= { .type = NLA_U8, },
+	[L2TP_ATTR_L2SPEC_TYPE]		= { .type = NLA_U8, },
+	[L2TP_ATTR_L2SPEC_LEN]		= { .type = NLA_U8, },
+	[L2TP_ATTR_PROTO_VERSION]	= { .type = NLA_U8, },
+	[L2TP_ATTR_CONN_ID]		= { .type = NLA_U32, },
+	[L2TP_ATTR_PEER_CONN_ID]	= { .type = NLA_U32, },
+	[L2TP_ATTR_SESSION_ID]		= { .type = NLA_U32, },
+	[L2TP_ATTR_PEER_SESSION_ID]	= { .type = NLA_U32, },
+	[L2TP_ATTR_UDP_CSUM]		= { .type = NLA_U8, },
+	[L2TP_ATTR_VLAN_ID]		= { .type = NLA_U16, },
+	[L2TP_ATTR_DEBUG]		= { .type = NLA_U32, },
+	[L2TP_ATTR_RECV_SEQ]		= { .type = NLA_U8, },
+	[L2TP_ATTR_SEND_SEQ]		= { .type = NLA_U8, },
+	[L2TP_ATTR_LNS_MODE]		= { .type = NLA_U8, },
+	[L2TP_ATTR_USING_IPSEC]		= { .type = NLA_U8, },
+	[L2TP_ATTR_RECV_TIMEOUT]	= { .type = NLA_MSECS, },
+	[L2TP_ATTR_FD]			= { .type = NLA_U32, },
+	[L2TP_ATTR_IP_SADDR]		= { .type = NLA_U32, },
+	[L2TP_ATTR_IP_DADDR]		= { .type = NLA_U32, },
+	[L2TP_ATTR_UDP_SPORT]		= { .type = NLA_U16, },
+	[L2TP_ATTR_UDP_DPORT]		= { .type = NLA_U16, },
+	[L2TP_ATTR_MTU]			= { .type = NLA_U16, },
+	[L2TP_ATTR_MRU]			= { .type = NLA_U16, },
+	[L2TP_ATTR_STATS]		= { .type = NLA_NESTED, },
+	[L2TP_ATTR_IFNAME] = {
+		.type = NLA_NUL_STRING,
+		.len = IFNAMSIZ - 1,
+	},
+	[L2TP_ATTR_COOKIE] = {
+		.type = NLA_BINARY,
+		.len = 8,	/* matches the len > 8 check in l2tp_nl_cmd_session_create() */
+	},
+	[L2TP_ATTR_PEER_COOKIE] = {
+		.type = NLA_BINARY,
+		.len = 8,	/* matches the len > 8 check in l2tp_nl_cmd_session_create() */
+	},
+};
+
+static struct genl_ops l2tp_nl_ops[] = {	/* command table registered by l2tp_nl_init() */
+	{
+		.cmd = L2TP_CMD_NOOP,
+		.doit = l2tp_nl_cmd_noop,
+		.policy = l2tp_nl_policy,
+		/* no GENL_ADMIN_PERM: can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = L2TP_CMD_TUNNEL_CREATE,
+		.doit = l2tp_nl_cmd_tunnel_create,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_TUNNEL_DELETE,
+		.doit = l2tp_nl_cmd_tunnel_delete,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_TUNNEL_MODIFY,
+		.doit = l2tp_nl_cmd_tunnel_modify,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_TUNNEL_GET,
+		.doit = l2tp_nl_cmd_tunnel_get,
+		.dumpit = l2tp_nl_cmd_tunnel_dump,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_SESSION_CREATE,
+		.doit = l2tp_nl_cmd_session_create,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_SESSION_DELETE,
+		.doit = l2tp_nl_cmd_session_delete,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_SESSION_MODIFY,
+		.doit = l2tp_nl_cmd_session_modify,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = L2TP_CMD_SESSION_GET,
+		.doit = l2tp_nl_cmd_session_get,
+		.dumpit = l2tp_nl_cmd_session_dump,
+		.policy = l2tp_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+/* Register the netlink session ops for one pseudowire type.
+ *
+ * Returns 0 on success, -EINVAL if pw_type is out of range, or -EBUSY if
+ * ops are already registered for that type. Takes the genl lock.
+ */
+int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops)
+{
+	int ret = 0;
+
+	if (pw_type >= __L2TP_PWTYPE_MAX)
+		return -EINVAL;
+
+	genl_lock();
+	if (l2tp_nl_cmd_ops[pw_type])
+		ret = -EBUSY;
+	else
+		l2tp_nl_cmd_ops[pw_type] = ops;
+	genl_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(l2tp_nl_register_ops);
+
+void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type)
+{
+	if (pw_type >= __L2TP_PWTYPE_MAX)
+		return;	/* out-of-range types are ignored */
+	genl_lock();	/* l2tp_nl_cmd_ops[] is only touched under the genl lock */
+	l2tp_nl_cmd_ops[pw_type] = NULL;
+	genl_unlock();
+}
+EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
+
+/* Module init: log a banner, then register the L2TP genetlink family along
+ * with its command table. Returns the registration result.
+ */
+static int l2tp_nl_init(void)
+{
+	printk(KERN_INFO "L2TP netlink interface\n");
+
+	return genl_register_family_with_ops(&l2tp_nl_family, l2tp_nl_ops,
+					     ARRAY_SIZE(l2tp_nl_ops));
+}
+
+static void l2tp_nl_cleanup(void)
+{
+	genl_unregister_family(&l2tp_nl_family);	/* undoes the registration made in l2tp_nl_init() */
+}
+
+module_init(l2tp_nl_init);
+module_exit(l2tp_nl_cleanup);
+
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("L2TP netlink");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
+MODULE_ALIAS("net-pf-" __stringify(PF_NETLINK) "-proto-" \
+	     __stringify(NETLINK_GENERIC) "-type-" "l2tp");
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
new file mode 100644
index 00000000..13f9868e
--- /dev/null
+++ b/net/l2tp/l2tp_ppp.c
@@ -0,0 +1,1840 @@
+/*****************************************************************************
+ * Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets
+ *
+ * PPPoX    --- Generic PPP encapsulation socket family
+ * PPPoL2TP --- PPP over L2TP (RFC 2661)
+ *
+ * Version:	2.0.0
+ *
+ * Authors:	James Chapman (jchapman@katalix.com)
+ *
+ * Based on original work by Martijn van Oosterhout <kleptog@svana.org>
+ *
+ * License:
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* This driver handles only L2TP data frames; control frames are handled by a
+ * userspace application.
+ *
+ * To send data in an L2TP session, userspace opens a PPPoL2TP socket and
+ * attaches it to a bound UDP socket with local tunnel_id / session_id and
+ * peer tunnel_id / session_id set. Data can then be sent or received using
+ * regular socket sendmsg() / recvmsg() calls. Kernel parameters of the socket
+ * can be read or modified using ioctl() or [gs]etsockopt() calls.
+ *
+ * When a PPPoL2TP socket is connected with local and peer session_id values
+ * zero, the socket is treated as a special tunnel management socket.
+ *
+ * Here's example userspace code to create a socket for sending/receiving data
+ * over an L2TP session:-
+ *
+ *	struct sockaddr_pppol2tp sax;
+ *	int fd;
+ *	int session_fd;
+ *
+ *	fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP);
+ *
+ *	sax.sa_family = AF_PPPOX;
+ *	sax.sa_protocol = PX_PROTO_OL2TP;
+ *	sax.pppol2tp.fd = tunnel_fd;	// bound UDP socket
+ *	sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr;
+ *	sax.pppol2tp.addr.sin_port = addr->sin_port;
+ *	sax.pppol2tp.addr.sin_family = AF_INET;
+ *	sax.pppol2tp.s_tunnel  = tunnel_id;
+ *	sax.pppol2tp.s_session = session_id;
+ *	sax.pppol2tp.d_tunnel  = peer_tunnel_id;
+ *	sax.pppol2tp.d_session = peer_session_id;
+ *
+ *	session_fd = connect(fd, (struct sockaddr *)&sax, sizeof(sax));
+ *
+ * A pppd plugin that allows PPP traffic to be carried over L2TP using
+ * this driver is available from the OpenL2TP project at
+ * http://openl2tp.sourceforge.net.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/if_pppox.h>
+#include <linux/if_pppol2tp.h>
+#include <net/sock.h>
+#include <linux/ppp_channel.h>
+#include <linux/ppp_defs.h>
+#include <linux/if_ppp.h>
+#include <linux/file.h>
+#include <linux/hash.h>
+#include <linux/sort.h>
+#include <linux/proc_fs.h>
+#include <linux/l2tp.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/xfrm.h>
+
+#include <asm/byteorder.h>
+#include <asm/atomic.h>
+
+#include "l2tp_core.h"
+
+#define PPPOL2TP_DRV_VERSION	"V2.0"
+
+/* Space for UDP, L2TP and PPP headers */
+#define PPPOL2TP_HEADER_OVERHEAD	40
+
+#define PRINTK(_mask, _type, _lvl, _fmt, args...)			\
+	do {								\
+		if ((_mask) & (_type))					\
+			printk(_lvl "PPPOL2TP: " _fmt, ##args);		\
+	} while (0)
+
+/* Number of bytes to build transmit L2TP headers.
+ * Unfortunately the size is different depending on whether sequence numbers
+ * are enabled.
+ */
+#define PPPOL2TP_L2TP_HDR_SIZE_SEQ		10
+#define PPPOL2TP_L2TP_HDR_SIZE_NOSEQ		6
+
+/* Private data of each session. This data lives at the end of struct
+ * l2tp_session, referenced via session->priv[] (see l2tp_session_priv()).
+ */
+struct pppol2tp_session {
+	int			owner;		/* pid that opened the socket */
+
+	struct sock		*sock;		/* Pointer to the session
+						 * PPPoX socket */
+	struct sock		*tunnel_sock;	/* Pointer to the tunnel UDP
+						 * socket */
+	int			flags;		/* accessed by PPPIOCGFLAGS.
+						 * Unused. */
+};
+
+static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
+
+static const struct ppp_channel_ops pppol2tp_chan_ops = {
+	.start_xmit =  pppol2tp_xmit,
+};
+
+static const struct proto_ops pppol2tp_ops;
+
+/* Map a PPPoL2TP socket to its session (sk->sk_user_data). On success a
+ * reference is held on @sk, to be dropped by the caller with sock_put().
+ * Returns NULL, holding no reference, if @sk or its session is NULL.
+ */
+static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk)
+{
+	struct l2tp_session *sess;
+
+	if (!sk)
+		return NULL;
+
+	sock_hold(sk);
+	sess = sk->sk_user_data;
+	if (!sess) {
+		sock_put(sk);
+		return NULL;
+	}
+
+	BUG_ON(sess->magic != L2TP_SESSION_MAGIC);
+	return sess;
+}
+
+/*****************************************************************************
+ * Receive data handling
+ *****************************************************************************/
+
+static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
+{
+	/* Strip the PPP header (always FF03 with L2TP) when it is present.
+	 * Some clients omit it (in testing, Microsoft L2TP clients did, with
+	 * PPP header compression enabled), so both forms are accepted. The
+	 * bytes are compared one at a time instead of through a u16 pointer
+	 * because the field may be unaligned.
+	 * Returns 1 if two bytes cannot be pulled from the skb, otherwise 0.
+	 */
+	if (!pskb_may_pull(skb, 2))
+		return 1;
+
+	if (skb->data[0] != 0xff || skb->data[1] != 0x03)
+		return 0;
+
+	skb_pull(skb, 2);
+	return 0;
+}
+
+/* recvmsg() for the PPPoL2TP socket: -EIO while the socket is in the
+ * PPPOX_BOUND state, otherwise dequeue one datagram and copy up to @len
+ * bytes of it to the caller, flagging MSG_TRUNC if it was longer. Returns
+ * the number of bytes copied or a negative error.
+ */
+static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock,
+			    struct msghdr *msg, size_t len,
+			    int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int err = 0;
+
+	if (sk->sk_state & PPPOX_BOUND)
+		return -EIO;
+
+	msg->msg_namelen = 0;
+
+	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+				flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return err;
+
+	if (len < skb->len)
+		msg->msg_flags |= MSG_TRUNC;
+	else
+		len = skb->len;
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	if (likely(err == 0))
+		err = len;
+
+	kfree_skb(skb);
+	return err;
+}
+
+static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
+{
+	struct pppol2tp_session *ps = l2tp_session_priv(session);
+	struct sock *sk = NULL;
+
+	/* If the socket is bound, send it in to PPP's input queue. Otherwise
+	 * the frame is counted as an rx error and discarded (see below).
+	 */
+	sk = ps->sock;
+	if (sk == NULL)
+		goto no_sock;
+
+	if (sk->sk_state & PPPOX_BOUND) {
+		struct pppox_sock *po;
+		PRINTK(session->debug, PPPOL2TP_MSG_DATA, KERN_DEBUG,
+		       "%s: recv %d byte data frame, passing to ppp\n",
+		       session->name, data_len);
+
+		/* We need to forget all info related to the L2TP packet
+		 * gathered in the skb as we are going to reuse the same
+		 * skb for the inner packet.
+		 * Namely we need to:
+		 * - reset xfrm (IPSec) information as it applies to
+		 *   the outer L2TP packet and not to the inner one
+		 * - release the dst to force a route lookup on the inner
+		 *   IP packet since skb->dst currently points to the dst
+		 *   of the UDP tunnel
+		 * - reset netfilter information as it doesn't apply
+		 *   to the inner packet either
+		 */
+		secpath_reset(skb);
+		skb_dst_drop(skb);
+		nf_reset(skb);
+
+		po = pppox_sk(sk);
+		ppp_input(&po->chan, skb);
+	} else {
+		PRINTK(session->debug, PPPOL2TP_MSG_DATA, KERN_INFO,
+		       "%s: socket not bound\n", session->name);
+
+		/* Not bound. Nothing we can do, so discard. */
+		session->stats.rx_errors++;
+		kfree_skb(skb);
+	}
+
+	return;
+
+no_sock:
+	/* No session socket: the frame is freed without touching rx_errors. */
+	PRINTK(session->debug, PPPOL2TP_MSG_DATA, KERN_INFO,
+	       "%s: no socket\n", session->name);
+	kfree_skb(skb);
+}
+
+static void pppol2tp_session_sock_hold(struct l2tp_session *session)
+{
+	struct pppol2tp_session *ps = l2tp_session_priv(session);
+
+	if (ps->sock)	/* no-op when the session has no PPPoX socket */
+		sock_hold(ps->sock);
+}
+
+static void pppol2tp_session_sock_put(struct l2tp_session *session)
+{
+	struct pppol2tp_session *ps = l2tp_session_priv(session);
+
+	if (ps->sock)	/* no-op when the session has no PPPoX socket */
+		sock_put(ps->sock);
+}
+
+/************************************************************************
+ * Transmit handling
+ ***********************************************************************/
+
+/* sendmsg() for the PPPoL2TP session socket. The user's data is prefixed
+ * with a PPP header and handed to l2tp_xmit_skb() together with the
+ * session's L2TP header length. Returns total_len once the frame has been
+ * passed on, or -ENOTCONN, -EBADF, -ENOMEM or the user-copy error.
+ */
+static int pppol2tp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
+			    size_t total_len)
+{
+	static const unsigned char ppph[2] = { 0xff, 0x03 };
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int error, uhlen;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel;
+	struct pppol2tp_session *ps;
+
+	error = -ENOTCONN;
+	if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED))
+		goto error;
+
+	/* Get session and tunnel contexts */
+	error = -EBADF;
+	session = pppol2tp_sock_to_session(sk);
+	if (session == NULL)
+		goto error;
+
+	ps = l2tp_session_priv(session);
+	tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+	if (tunnel == NULL)
+		goto error_put_sess;
+
+	uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
+
+	/* Allocate a socket buffer */
+	error = -ENOMEM;
+	skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) +
+			   uhlen + session->hdr_len +
+			   sizeof(ppph) + total_len,
+			   0, GFP_KERNEL);
+	if (!skb)
+		goto error_put_sess_tun;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, NET_SKB_PAD);
+	skb_reset_network_header(skb);
+	skb_reserve(skb, sizeof(struct iphdr));
+	skb_reset_transport_header(skb);
+	skb_reserve(skb, uhlen);
+
+	/* Add PPP header */
+	skb->data[0] = ppph[0];
+	skb->data[1] = ppph[1];
+	skb_put(skb, 2);
+
+	/* Copy user data into skb, after (not over) the PPP header */
+	error = memcpy_fromiovec(skb_put(skb, total_len), m->msg_iov,
+				 total_len);
+	if (error < 0) {
+		kfree_skb(skb);
+		goto error_put_sess_tun;
+	}
+
+	l2tp_xmit_skb(session, skb, session->hdr_len);
+
+	sock_put(ps->tunnel_sock);
+	sock_put(sk);	/* drop the ref taken by pppol2tp_sock_to_session() */
+	return total_len;
+
+error_put_sess_tun:
+	sock_put(ps->tunnel_sock);
+error_put_sess:
+	sock_put(sk);
+error:
+	return error;
+}
+
+/* Transmit function called by generic PPP driver.  Sends PPP frame
+ * over PPPoL2TP socket.
+ *
+ * This is almost the same as pppol2tp_sendmsg(), but rather than
+ * being called with a msghdr from userspace, it is called with a skb
+ * from the kernel.
+ *
+ * Only headroom for the two-byte PPP header is ensured here, via
+ * skb_cow_head(); skb->truesize is adjusted by however much the
+ * headroom changed. Room for the L2TP/UDP/IP headers is presumably
+ * handled by l2tp_xmit_skb() -- NOTE(review): confirm in l2tp_core.
+ *
+ * Every path returns 1. On success the skb has been handed to
+ * l2tp_xmit_skb(); on any failure the skb is freed here.
+ * NOTE(review): an earlier version of this comment said the original
+ * skb is left unfreed on error; that does not match the code below.
+ */
+static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+	static const u8 ppph[2] = { 0xff, 0x03 };
+	struct sock *sk = (struct sock *) chan->private;
+	struct sock *sk_tun;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel;
+	struct pppol2tp_session *ps;
+	int old_headroom;
+	int new_headroom;
+
+	if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED))
+		goto abort;
+
+	/* Get session and tunnel contexts from the socket */
+	session = pppol2tp_sock_to_session(sk);
+	if (session == NULL)
+		goto abort;
+
+	ps = l2tp_session_priv(session);
+	sk_tun = ps->tunnel_sock;
+	if (sk_tun == NULL)
+		goto abort_put_sess;
+	tunnel = l2tp_sock_to_tunnel(sk_tun);
+	if (tunnel == NULL)
+		goto abort_put_sess;
+
+	/* Make sure the PPP header can be pushed, and account for any
+	 * headroom growth in the skb's truesize.
+	 */
+	old_headroom = skb_headroom(skb);
+	if (skb_cow_head(skb, sizeof(ppph)))
+		goto abort_put_sess_tun;
+
+	new_headroom = skb_headroom(skb);
+	skb->truesize += new_headroom - old_headroom;
+
+	/* Setup PPP header */
+	__skb_push(skb, sizeof(ppph));
+	skb->data[0] = ppph[0];
+	skb->data[1] = ppph[1];
+
+	l2tp_xmit_skb(session, skb, session->hdr_len);
+
+	/* Drop the tunnel-socket and session-socket references. */
+	sock_put(sk_tun);
+	sock_put(sk);
+	return 1;
+
+abort_put_sess_tun:
+	sock_put(sk_tun);
+abort_put_sess:
+	sock_put(sk);
+abort:
+	/* Free the original skb */
+	kfree_skb(skb);
+	return 1;
+}
+
+/*****************************************************************************
+ * Session (and tunnel control) socket create/destroy.
+ *****************************************************************************/
+
+/* Session close callback (installed as session->session_close by
+ * pppol2tp_connect()).
+ *
+ * Nothing is done for session_id 0 or when no PPPoX socket is attached.
+ * Otherwise, under the socket lock: a connected/bound socket is unbound
+ * and marked PPPOX_DEAD, and the receive, write and reorder queues are
+ * emptied. One socket reference is dropped per skb removed from the
+ * reorder queue.
+ */
+static void pppol2tp_session_close(struct l2tp_session *session)
+{
+	struct pppol2tp_session *priv = l2tp_session_priv(session);
+	struct sock *sk = priv->sock;
+	struct sk_buff *skb;
+
+	BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+
+	if (session->session_id == 0)
+		return;
+
+	if (sk == NULL)
+		return;
+
+	lock_sock(sk);
+
+	if (sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) {
+		pppox_unbind_sock(sk);
+		sk->sk_state = PPPOX_DEAD;
+		sk->sk_state_change(sk);
+	}
+
+	/* Discard anything still queued on the socket or the session. */
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+	for (skb = skb_dequeue(&session->reorder_q); skb != NULL;
+	     skb = skb_dequeue(&session->reorder_q)) {
+		kfree_skb(skb);
+		sock_put(sk);
+	}
+
+	release_sock(sk);
+}
+
+/* sk_destruct handler for PPPoL2TP sockets (set in pppol2tp_create()).
+ *
+ * If a session is still attached through sk_user_data, detach it and
+ * drop one session reference.
+ */
+static void pppol2tp_session_destruct(struct sock *sk)
+{
+	struct l2tp_session *session = sk->sk_user_data;
+
+	if (session == NULL)
+		return;
+
+	sk->sk_user_data = NULL;
+	BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+	l2tp_session_dec_refcount(session);
+}
+
+/* Called when the PPPoX socket (session) is closed.
+ *
+ * Returns 0 when there is no struct sock or after a normal release, and
+ * -EBADF if the socket is already flagged SOCK_DEAD.
+ *
+ * Under the socket lock the socket is unbound, marked PPPOX_DEAD and
+ * orphaned, and all of its queues (plus the session reorder queue, if a
+ * session is attached) are emptied.
+ */
+static int pppol2tp_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct l2tp_session *session;
+	int error;
+
+	if (!sk)
+		return 0;
+
+	error = -EBADF;
+	lock_sock(sk);
+	if (sock_flag(sk, SOCK_DEAD) != 0)
+		goto error;
+
+	pppox_unbind_sock(sk);
+
+	/* Signal the death of the socket. */
+	sk->sk_state = PPPOX_DEAD;
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	/* Other callers in this file pair a successful lookup with a
+	 * sock_put(sk); the matching put here is the one after the
+	 * reorder-queue loop.
+	 */
+	session = pppol2tp_sock_to_session(sk);
+
+	/* Purge any queued data */
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+	if (session != NULL) {
+		struct sk_buff *skb;
+		/* One socket reference is dropped per reorder-queue skb. */
+		while ((skb = skb_dequeue(&session->reorder_q))) {
+			kfree_skb(skb);
+			sock_put(sk);
+		}
+		sock_put(sk);
+	}
+
+	release_sock(sk);
+
+	/* This will delete the session context via
+	 * pppol2tp_session_destruct() if the socket's refcnt drops to
+	 * zero.
+	 */
+	sock_put(sk);
+
+	return 0;
+
+error:
+	release_sock(sk);
+	return error;
+}
+
+/* Protocol descriptor passed to sk_alloc() in pppol2tp_create(); each
+ * socket is allocated with room for a struct pppox_sock.
+ */
+static struct proto pppol2tp_sk_proto = {
+	.name	  = "PPPOL2TP",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct pppox_sock),
+};
+
+/* sk_backlog_rcv handler (set in pppol2tp_create()).
+ *
+ * Passes the skb to l2tp_udp_encap_recv() and frees it here when that
+ * call returns non-zero. Always reports NET_RX_SUCCESS.
+ */
+static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb)
+{
+	if (l2tp_udp_encap_recv(sk, skb) != 0)
+		kfree_skb(skb);
+
+	return NET_RX_SUCCESS;
+}
+
+/* socket() handler: allocate and initialise a new PPPoL2TP struct sock.
+ *
+ * Returns 0 on success or -ENOMEM if the sock cannot be allocated.
+ */
+static int pppol2tp_create(struct net *net, struct socket *sock)
+{
+	struct sock *sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
+
+	if (sk == NULL)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	/* struct socket side */
+	sock->state = SS_UNCONNECTED;
+	sock->ops   = &pppol2tp_ops;
+
+	/* struct sock side */
+	sk->sk_backlog_rcv = pppol2tp_backlog_recv;
+	sk->sk_protocol    = PX_PROTO_OL2TP;
+	sk->sk_family      = PF_PPPOX;
+	sk->sk_state       = PPPOX_NONE;
+	sk->sk_type        = SOCK_STREAM;
+	sk->sk_destruct    = pppol2tp_session_destruct;
+
+	return 0;
+}
+
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+/* Session show callback: print the PPP interface name of the session's
+ * PPPoX socket, when there is one.
+ */
+static void pppol2tp_show(struct seq_file *m, void *arg)
+{
+	struct l2tp_session *session = arg;
+	struct pppol2tp_session *priv = l2tp_session_priv(session);
+	struct pppox_sock *po;
+
+	if (priv == NULL)
+		return;
+
+	po = pppox_sk(priv->sock);
+	if (po == NULL)
+		return;
+
+	seq_printf(m, "   interface %s\n", ppp_dev_name(&po->chan));
+}
+#endif
+
+/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
+ *
+ * The address is either a struct sockaddr_pppol2tp (L2TPv2) or a
+ * struct sockaddr_pppol2tpv3 (L2TPv3), selected by @sockaddr_len.
+ *
+ * If both session ids are zero the socket becomes a tunnel management
+ * socket: the tunnel context is created if it does not exist and no PPP
+ * channel is registered. Otherwise the tunnel must already exist and a
+ * PPP channel is registered for the session.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EINVAL    wrong sa_protocol, unsupported address length, tunnel_id 0
+ *   -EBUSY     socket already connected
+ *   -EALREADY  socket already has a session attached
+ *   -ENOENT    tunnel not found, or it has no socket (session case)
+ *   -ENOMEM    session context could not be allocated
+ *   -EEXIST    session already has a socket, or a different tunnel socket
+ *   other      errors from l2tp_tunnel_create() or
+ *              ppp_register_net_channel()
+ */
+static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
+			    int sockaddr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr;
+	struct sockaddr_pppol2tpv3 *sp3 = (struct sockaddr_pppol2tpv3 *) uservaddr;
+	struct pppox_sock *po = pppox_sk(sk);
+	struct l2tp_session *session = NULL;
+	struct l2tp_tunnel *tunnel;
+	struct pppol2tp_session *ps;
+	struct dst_entry *dst;
+	struct l2tp_session_cfg cfg = { 0, };
+	int error = 0;
+	u32 tunnel_id, peer_tunnel_id;
+	u32 session_id, peer_session_id;
+	int ver = 2;
+	int fd;
+
+	lock_sock(sk);
+
+	error = -EINVAL;
+	if (sp->sa_protocol != PX_PROTO_OL2TP)
+		goto end;
+
+	/* Check for already bound sockets */
+	error = -EBUSY;
+	if (sk->sk_state & PPPOX_CONNECTED)
+		goto end;
+
+	/* We don't support rebinding anyway */
+	error = -EALREADY;
+	if (sk->sk_user_data)
+		goto end; /* socket is already attached */
+
+	/* Get params from socket address. Handle L2TPv2 and L2TPv3 */
+	if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) {
+		fd = sp->pppol2tp.fd;
+		tunnel_id = sp->pppol2tp.s_tunnel;
+		peer_tunnel_id = sp->pppol2tp.d_tunnel;
+		session_id = sp->pppol2tp.s_session;
+		peer_session_id = sp->pppol2tp.d_session;
+	} else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) {
+		ver = 3;
+		fd = sp3->pppol2tp.fd;
+		tunnel_id = sp3->pppol2tp.s_tunnel;
+		peer_tunnel_id = sp3->pppol2tp.d_tunnel;
+		session_id = sp3->pppol2tp.s_session;
+		peer_session_id = sp3->pppol2tp.d_session;
+	} else {
+		error = -EINVAL;
+		goto end; /* bad socket address */
+	}
+
+	/* Don't bind if tunnel_id is 0 */
+	error = -EINVAL;
+	if (tunnel_id == 0)
+		goto end;
+
+	tunnel = l2tp_tunnel_find(sock_net(sk), tunnel_id);
+
+	/* Special case: create tunnel context if session_id and
+	 * peer_session_id is 0. Otherwise look up tunnel using supplied
+	 * tunnel id.
+	 */
+	if ((session_id == 0) && (peer_session_id == 0)) {
+		if (tunnel == NULL) {
+			struct l2tp_tunnel_cfg tcfg = {
+				.encap = L2TP_ENCAPTYPE_UDP,
+				.debug = 0,
+			};
+			error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel);
+			if (error < 0)
+				goto end;
+		}
+	} else {
+		/* Error if we can't find the tunnel */
+		error = -ENOENT;
+		if (tunnel == NULL)
+			goto end;
+
+		/* Error if socket is not prepped */
+		if (tunnel->sock == NULL)
+			goto end;
+	}
+
+	if (tunnel->recv_payload_hook == NULL)
+		tunnel->recv_payload_hook = pppol2tp_recv_payload_hook;
+
+	if (tunnel->peer_tunnel_id == 0) {
+		if (ver == 2)
+			tunnel->peer_tunnel_id = sp->pppol2tp.d_tunnel;
+		else
+			tunnel->peer_tunnel_id = sp3->pppol2tp.d_tunnel;
+	}
+
+	/* Create session if it doesn't already exist. We handle the
+	 * case where a session was previously created by the netlink
+	 * interface by checking that the session doesn't already have
+	 * a socket and its tunnel socket are what we expect. If any
+	 * of those checks fail, return EEXIST to the caller.
+	 */
+	session = l2tp_session_find(sock_net(sk), tunnel, session_id);
+	if (session == NULL) {
+		/* Default MTU must allow space for UDP/L2TP/PPP
+		 * headers.
+		 */
+		cfg.mtu = cfg.mru = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+
+		/* Allocate and initialize a new session context. */
+		session = l2tp_session_create(sizeof(struct pppol2tp_session),
+					      tunnel, session_id,
+					      peer_session_id, &cfg);
+		if (session == NULL) {
+			error = -ENOMEM;
+			goto end;
+		}
+	} else {
+		ps = l2tp_session_priv(session);
+		error = -EEXIST;
+		if (ps->sock != NULL)
+			goto end;
+
+		/* consistency checks */
+		if (ps->tunnel_sock != tunnel->sock)
+			goto end;
+	}
+
+	/* Associate session with its PPPoL2TP socket */
+	ps = l2tp_session_priv(session);
+	ps->owner	     = current->pid;
+	ps->sock	     = sk;
+	ps->tunnel_sock = tunnel->sock;
+
+	session->recv_skb	= pppol2tp_recv;
+	session->session_close	= pppol2tp_session_close;
+#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE)
+	session->show		= pppol2tp_show;
+#endif
+
+	/* We need to know each time a skb is dropped from the reorder
+	 * queue.
+	 */
+	session->ref = pppol2tp_session_sock_hold;
+	session->deref = pppol2tp_session_sock_put;
+
+	/* If PMTU discovery was enabled, use the MTU that was discovered.
+	 * Use the dst we hold a reference on rather than re-reading the
+	 * socket's dst cache, which could differ from (or be cleared
+	 * after) the entry returned by sk_dst_get().
+	 */
+	dst = sk_dst_get(sk);
+	if (dst != NULL) {
+		u32 pmtu = dst_mtu(dst);
+		if (pmtu != 0)
+			session->mtu = session->mru = pmtu -
+				PPPOL2TP_HEADER_OVERHEAD;
+		dst_release(dst);
+	}
+
+	/* Special case: if source & dest session_id == 0x0000, this
+	 * socket is being created to manage the tunnel. Just set up
+	 * the internal context for use by ioctl() and sockopt()
+	 * handlers.
+	 */
+	if ((session->session_id == 0) &&
+	    (session->peer_session_id == 0)) {
+		error = 0;
+		goto out_no_ppp;
+	}
+
+	/* The only header we need to worry about is the L2TP
+	 * header. This size is different depending on whether
+	 * sequence numbers are enabled for the data channel.
+	 */
+	po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
+
+	po->chan.private = sk;
+	po->chan.ops	 = &pppol2tp_chan_ops;
+	po->chan.mtu	 = session->mtu;
+
+	error = ppp_register_net_channel(sock_net(sk), &po->chan);
+	if (error)
+		goto end;
+
+out_no_ppp:
+	/* This is how we get the session context from the socket. */
+	sk->sk_user_data = session;
+	sk->sk_state = PPPOX_CONNECTED;
+	PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: created\n", session->name);
+
+end:
+	release_sock(sk);
+
+	return error;
+}
+
+#ifdef CONFIG_L2TP_V3
+
+/* Create a session context on behalf of the netlink interface.
+ *
+ * Fills in default MTU/MRU values in @cfg when they are zero.
+ *
+ * Returns 0 on success, -ENOENT if the tunnel is unknown or has no
+ * socket, -EEXIST if the session id is already in use on that tunnel,
+ * or -ENOMEM if the session cannot be allocated.
+ */
+static int pppol2tp_session_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+{
+	struct l2tp_tunnel *tunnel = l2tp_tunnel_find(net, tunnel_id);
+	struct l2tp_session *session;
+	struct pppol2tp_session *priv;
+
+	/* The tunnel must exist and have its socket set up. */
+	if (tunnel == NULL || tunnel->sock == NULL)
+		return -ENOENT;
+
+	/* Refuse duplicate session ids. */
+	if (l2tp_session_find(net, tunnel, session_id) != NULL)
+		return -EEXIST;
+
+	/* Default MTU values. */
+	if (cfg->mtu == 0)
+		cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
+	if (cfg->mru == 0)
+		cfg->mru = cfg->mtu;
+
+	/* Allocate and initialize a new session context. */
+	session = l2tp_session_create(sizeof(struct pppol2tp_session),
+				      tunnel, session_id,
+				      peer_session_id, cfg);
+	if (session == NULL)
+		return -ENOMEM;
+
+	priv = l2tp_session_priv(session);
+	priv->tunnel_sock = tunnel->sock;
+
+	PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: created\n", session->name);
+
+	return 0;
+}
+
+/* Delete a session on behalf of the netlink interface.
+ *
+ * A session reference is dropped only when no PPPoX socket is attached.
+ * Always returns 0.
+ */
+static int pppol2tp_session_delete(struct l2tp_session *session)
+{
+	struct pppol2tp_session *priv = l2tp_session_priv(session);
+
+	if (priv->sock != NULL)
+		return 0;
+
+	l2tp_session_dec_refcount(session);
+	return 0;
+}
+
+#endif /* CONFIG_L2TP_V3 */
+
+/* getname() support.
+ *
+ * Fills @uaddr with a struct sockaddr_pppol2tp (tunnel version 2) or a
+ * struct sockaddr_pppol2tpv3 (tunnel version 3) describing the session
+ * and the tunnel socket's destination address, and stores the address
+ * length in @usockaddr_len.
+ *
+ * Returns 0 on success, -ENOTCONN if there is no sock or it is not in
+ * the PPPOX_CONNECTED state, or -EBADF if the session or tunnel context
+ * cannot be obtained. On error nothing is written to the outputs.
+ */
+static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
+			    int *usockaddr_len, int peer)
+{
+	int len = 0;
+	int error = 0;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel;
+	struct sock *sk = sock->sk;
+	struct inet_sock *inet;
+	struct pppol2tp_session *pls;
+
+	error = -ENOTCONN;
+	if (sk == NULL)
+		goto end;
+	if (sk->sk_state != PPPOX_CONNECTED)
+		goto end;
+
+	error = -EBADF;
+	session = pppol2tp_sock_to_session(sk);
+	if (session == NULL)
+		goto end;
+
+	pls = l2tp_session_priv(session);
+	tunnel = l2tp_sock_to_tunnel(pls->tunnel_sock);
+	if (tunnel == NULL)
+		goto end_put_sess;	/* error is still -EBADF */
+
+	inet = inet_sk(tunnel->sock);
+	if (tunnel->version == 2) {
+		struct sockaddr_pppol2tp sp;
+		len = sizeof(sp);
+		/* Zero first so no uninitialised bytes reach the caller. */
+		memset(&sp, 0, len);
+		sp.sa_family	= AF_PPPOX;
+		sp.sa_protocol	= PX_PROTO_OL2TP;
+		sp.pppol2tp.fd  = tunnel->fd;
+		sp.pppol2tp.pid = pls->owner;
+		sp.pppol2tp.s_tunnel = tunnel->tunnel_id;
+		sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id;
+		sp.pppol2tp.s_session = session->session_id;
+		sp.pppol2tp.d_session = session->peer_session_id;
+		sp.pppol2tp.addr.sin_family = AF_INET;
+		sp.pppol2tp.addr.sin_port = inet->inet_dport;
+		sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr;
+		memcpy(uaddr, &sp, len);
+	} else if (tunnel->version == 3) {
+		struct sockaddr_pppol2tpv3 sp;
+		len = sizeof(sp);
+		memset(&sp, 0, len);
+		sp.sa_family	= AF_PPPOX;
+		sp.sa_protocol	= PX_PROTO_OL2TP;
+		sp.pppol2tp.fd  = tunnel->fd;
+		sp.pppol2tp.pid = pls->owner;
+		sp.pppol2tp.s_tunnel = tunnel->tunnel_id;
+		sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id;
+		sp.pppol2tp.s_session = session->session_id;
+		sp.pppol2tp.d_session = session->peer_session_id;
+		sp.pppol2tp.addr.sin_family = AF_INET;
+		sp.pppol2tp.addr.sin_port = inet->inet_dport;
+		sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr;
+		memcpy(uaddr, &sp, len);
+	}
+
+	*usockaddr_len = len;
+
+	/* Success is recorded before the shared cleanup label, so the
+	 * tunnel-lookup failure above keeps its -EBADF instead of being
+	 * reported as success.
+	 */
+	error = 0;
+
+	sock_put(pls->tunnel_sock);
+end_put_sess:
+	sock_put(sk);
+
+end:
+	return error;
+}
+
+/****************************************************************************
+ * ioctl() handlers.
+ *
+ * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP
+ * sockets. However, in order to control kernel tunnel features, we allow
+ * userspace to create a special "tunnel" PPPoX socket which is used for
+ * control only.  Tunnel PPPoX sockets have session_id == 0 and simply allow
+ * the user application to issue L2TP setsockopt(), getsockopt() and ioctl()
+ * calls.
+ ****************************************************************************/
+
+static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
+				struct l2tp_stats *stats)
+{
+	dest->tx_packets = stats->tx_packets;
+	dest->tx_bytes = stats->tx_bytes;
+	dest->tx_errors = stats->tx_errors;
+	dest->rx_packets = stats->rx_packets;
+	dest->rx_bytes = stats->rx_bytes;
+	dest->rx_seq_discards = stats->rx_seq_discards;
+	dest->rx_oos_packets = stats->rx_oos_packets;
+	dest->rx_errors = stats->rx_errors;
+}
+
+/* Session ioctl helper.
+ *
+ * Handles SIOCGIFMTU/SIOCSIFMTU, PPPIOCGMRU/PPPIOCSMRU,
+ * PPPIOCGFLAGS/PPPIOCSFLAGS and PPPIOCGL2TPSTATS for @session.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EBADR   the session has no PPPoX socket attached
+ *   -ENXIO   the socket is not in the PPPOX_CONNECTED state (MTU, MRU
+ *            and stats commands)
+ *   -EFAULT  a copy to or from userspace failed
+ *   -ENOSYS  unknown command
+ */
+static int pppol2tp_session_ioctl(struct l2tp_session *session,
+				  unsigned int cmd, unsigned long arg)
+{
+	struct ifreq ifr;
+	int err = 0;
+	struct sock *sk;
+	int val = (int) arg;
+	struct pppol2tp_session *ps = l2tp_session_priv(session);
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	struct pppol2tp_ioc_stats stats;
+
+	PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_DEBUG,
+	       "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
+	       session->name, cmd, arg);
+
+	/* A session created through netlink has no socket until a PPPoX
+	 * socket connects to it, yet it can be reached here through the
+	 * tunnel ioctl's session lookup. Bail out instead of
+	 * dereferencing a NULL socket.
+	 */
+	sk = ps->sock;
+	if (sk == NULL)
+		return -EBADR;
+
+	sock_hold(sk);
+
+	switch (cmd) {
+	case SIOCGIFMTU:
+		err = -ENXIO;
+		if (!(sk->sk_state & PPPOX_CONNECTED))
+			break;
+
+		err = -EFAULT;
+		if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
+			break;
+		ifr.ifr_mtu = session->mtu;
+		if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
+			break;
+
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get mtu=%d\n", session->name, session->mtu);
+		err = 0;
+		break;
+
+	case SIOCSIFMTU:
+		err = -ENXIO;
+		if (!(sk->sk_state & PPPOX_CONNECTED))
+			break;
+
+		err = -EFAULT;
+		if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
+			break;
+
+		session->mtu = ifr.ifr_mtu;
+
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set mtu=%d\n", session->name, session->mtu);
+		err = 0;
+		break;
+
+	case PPPIOCGMRU:
+		err = -ENXIO;
+		if (!(sk->sk_state & PPPOX_CONNECTED))
+			break;
+
+		err = -EFAULT;
+		if (put_user(session->mru, (int __user *) arg))
+			break;
+
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get mru=%d\n", session->name, session->mru);
+		err = 0;
+		break;
+
+	case PPPIOCSMRU:
+		err = -ENXIO;
+		if (!(sk->sk_state & PPPOX_CONNECTED))
+			break;
+
+		err = -EFAULT;
+		if (get_user(val, (int __user *) arg))
+			break;
+
+		session->mru = val;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set mru=%d\n", session->name, session->mru);
+		err = 0;
+		break;
+
+	case PPPIOCGFLAGS:
+		err = -EFAULT;
+		if (put_user(ps->flags, (int __user *) arg))
+			break;
+
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get flags=%d\n", session->name, ps->flags);
+		err = 0;
+		break;
+
+	case PPPIOCSFLAGS:
+		err = -EFAULT;
+		if (get_user(val, (int __user *) arg))
+			break;
+		ps->flags = val;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set flags=%d\n", session->name, ps->flags);
+		err = 0;
+		break;
+
+	case PPPIOCGL2TPSTATS:
+		err = -ENXIO;
+		if (!(sk->sk_state & PPPOX_CONNECTED))
+			break;
+
+		memset(&stats, 0, sizeof(stats));
+		stats.tunnel_id = tunnel->tunnel_id;
+		stats.session_id = session->session_id;
+		pppol2tp_copy_stats(&stats, &session->stats);
+
+		/* A failed copy-out is a bad user pointer, not "no such
+		 * device": report -EFAULT like the other commands do.
+		 */
+		err = -EFAULT;
+		if (copy_to_user((void __user *) arg, &stats,
+				 sizeof(stats)))
+			break;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get L2TP stats\n", session->name);
+		err = 0;
+		break;
+
+	default:
+		err = -ENOSYS;
+		break;
+	}
+
+	sock_put(sk);
+
+	return err;
+}
+
+/* Tunnel ioctl helper.
+ *
+ * Only PPPIOCGL2TPSTATS is supported; any other command yields -ENOSYS.
+ * When the user-supplied stats structure names a non-zero session_id,
+ * the request is forwarded to pppol2tp_session_ioctl() for that session
+ * (-EBADR if it cannot be found), which lets an application read
+ * session stats through a tunnel socket.
+ *
+ * Other errors: -ENXIO if the tunnel socket's state lacks
+ * PPPOX_CONNECTED, -EFAULT if a user copy fails.
+ */
+static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct pppol2tp_ioc_stats stats;
+	struct sock *sk;
+	int err;
+
+	PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_DEBUG,
+	       "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
+	       tunnel->name, cmd, arg);
+
+	sk = tunnel->sock;
+	sock_hold(sk);
+
+	if (cmd != PPPIOCGL2TPSTATS) {
+		err = -ENOSYS;
+		goto out;
+	}
+
+	err = -ENXIO;
+	if (!(sk->sk_state & PPPOX_CONNECTED))
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(&stats, (void __user *) arg, sizeof(stats)))
+		goto out;
+
+	if (stats.session_id != 0) {
+		/* resend to session ioctl handler */
+		struct l2tp_session *session =
+			l2tp_session_find(sock_net(sk), tunnel, stats.session_id);
+
+		if (session == NULL)
+			err = -EBADR;
+		else
+			err = pppol2tp_session_ioctl(session, cmd, arg);
+		goto out;
+	}
+#ifdef CONFIG_XFRM
+	stats.using_ipsec = (sk->sk_policy[0] || sk->sk_policy[1]) ? 1 : 0;
+#endif
+	pppol2tp_copy_stats(&stats, &tunnel->stats);
+
+	/* err is still -EFAULT should this copy fail */
+	if (copy_to_user((void __user *) arg, &stats, sizeof(stats)))
+		goto out;
+
+	PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: get L2TP stats\n", tunnel->name);
+	err = 0;
+
+out:
+	sock_put(sk);
+
+	return err;
+}
+
+/* Main ioctl() handler.
+ *
+ * Dispatches to the tunnel helper when the socket's session has both
+ * session ids equal to zero, and to the session helper otherwise.
+ *
+ * Returns 0 if there is no struct sock, -EBADF for a dead socket or
+ * when the session/tunnel context cannot be obtained, -ENOTCONN when no
+ * session is attached or the socket is neither connected nor bound,
+ * else whatever the helper returns.
+ */
+static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel;
+	struct pppol2tp_session *priv;
+	int err;
+
+	if (!sk)
+		return 0;
+
+	if (sock_flag(sk, SOCK_DEAD) != 0)
+		return -EBADF;
+
+	if (sk->sk_user_data == NULL)
+		return -ENOTCONN;
+	if (!(sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)))
+		return -ENOTCONN;
+
+	/* Get session context from the socket */
+	session = pppol2tp_sock_to_session(sk);
+	if (session == NULL)
+		return -EBADF;
+
+	priv = l2tp_session_priv(session);
+	if (session->session_id != 0 || session->peer_session_id != 0) {
+		err = pppol2tp_session_ioctl(session, cmd, arg);
+	} else {
+		/* Both ids zero: this is a tunnel management socket. */
+		tunnel = l2tp_sock_to_tunnel(priv->tunnel_sock);
+		if (tunnel == NULL) {
+			err = -EBADF;
+		} else {
+			err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg);
+			sock_put(priv->tunnel_sock);
+		}
+	}
+
+	sock_put(sk);
+	return err;
+}
+
+/*****************************************************************************
+ * setsockopt() / getsockopt() support.
+ *
+ * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP
+ * sockets. In order to control kernel tunnel features, we allow userspace to
+ * create a special "tunnel" PPPoX socket which is used for control only.
+ * Tunnel PPPoX sockets have session_id == 0 and simply allow the user
+ * application to issue L2TP setsockopt(), getsockopt() and ioctl() calls.
+ *****************************************************************************/
+
+/* Tunnel setsockopt() helper.
+ *
+ * PPPOL2TP_SO_DEBUG is the only option accepted; anything else returns
+ * -ENOPROTOOPT.
+ */
+static int pppol2tp_tunnel_setsockopt(struct sock *sk,
+				      struct l2tp_tunnel *tunnel,
+				      int optname, int val)
+{
+	if (optname != PPPOL2TP_SO_DEBUG)
+		return -ENOPROTOOPT;
+
+	tunnel->debug = val;
+	PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: set debug=%x\n", tunnel->name, tunnel->debug);
+
+	return 0;
+}
+
+/* Session setsockopt helper.
+ *
+ * RECVSEQ, SENDSEQ and LNSMODE take 0 or 1 (anything else is -EINVAL)
+ * and are stored as 0 / -1. SENDSEQ also switches the PPP channel's
+ * header length between the with-sequence and without-sequence sizes.
+ * DEBUG stores the value as is; REORDERTO is converted from
+ * milliseconds to jiffies. Unknown options return -ENOPROTOOPT.
+ */
+static int pppol2tp_session_setsockopt(struct sock *sk,
+				       struct l2tp_session *session,
+				       int optname, int val)
+{
+	struct pppol2tp_session *priv = l2tp_session_priv(session);
+	struct pppox_sock *po;
+
+	/* Validate the on/off options up front. */
+	switch (optname) {
+	case PPPOL2TP_SO_RECVSEQ:
+	case PPPOL2TP_SO_SENDSEQ:
+	case PPPOL2TP_SO_LNSMODE:
+		if (val != 0 && val != 1)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+
+	switch (optname) {
+	case PPPOL2TP_SO_RECVSEQ:
+		session->recv_seq = val ? -1 : 0;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set recv_seq=%d\n", session->name, session->recv_seq);
+		break;
+
+	case PPPOL2TP_SO_SENDSEQ:
+		session->send_seq = val ? -1 : 0;
+		po = pppox_sk(priv->sock);
+		if (val)
+			po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_SEQ;
+		else
+			po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set send_seq=%d\n", session->name, session->send_seq);
+		break;
+
+	case PPPOL2TP_SO_LNSMODE:
+		session->lns_mode = val ? -1 : 0;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set lns_mode=%d\n", session->name, session->lns_mode);
+		break;
+
+	case PPPOL2TP_SO_DEBUG:
+		session->debug = val;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set debug=%x\n", session->name, session->debug);
+		break;
+
+	case PPPOL2TP_SO_REORDERTO:
+		session->reorder_timeout = msecs_to_jiffies(val);
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: set reorder_timeout=%d\n", session->name, session->reorder_timeout);
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	return 0;
+}
+
+/* Main setsockopt() entry point.
+ * Does API checks, then calls either the tunnel or session setsockopt
+ * handler, according to whether the PPPoL2TP socket is for a regular
+ * session or the special tunnel type.
+ *
+ * Returns the helper's result, or:
+ *   -EINVAL    level is not SOL_PPPOL2TP, or optlen < sizeof(int)
+ *   -EFAULT    the option value could not be read from userspace
+ *   -ENOTCONN  no session is attached to the socket
+ *   -EBADF     the session or tunnel context cannot be obtained
+ */
+static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
+			       char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel;
+	struct pppol2tp_session *ps;
+	int val;
+	int err;
+
+	/* This is a PPPoX socket, not a UDP socket: handing it to the UDP
+	 * protocol handlers would make them treat a struct pppox_sock as a
+	 * UDP sock. Reject foreign levels instead.
+	 */
+	if (level != SOL_PPPOL2TP)
+		return -EINVAL;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	err = -ENOTCONN;
+	if (sk->sk_user_data == NULL)
+		goto end;
+
+	/* Get session context from the socket */
+	err = -EBADF;
+	session = pppol2tp_sock_to_session(sk);
+	if (session == NULL)
+		goto end;
+
+	/* Special case: if session_id == 0x0000, treat as operation on tunnel
+	 */
+	ps = l2tp_session_priv(session);
+	if ((session->session_id == 0) &&
+	    (session->peer_session_id == 0)) {
+		err = -EBADF;
+		tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+		if (tunnel == NULL)
+			goto end_put_sess;
+
+		err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val);
+		sock_put(ps->tunnel_sock);
+	} else
+		err = pppol2tp_session_setsockopt(sk, session, optname, val);
+
+	/* err carries the helper's verdict (e.g. -EINVAL, -ENOPROTOOPT);
+	 * it must not be reset to 0 here.
+	 */
+
+end_put_sess:
+	sock_put(sk);
+end:
+	return err;
+}
+
+/* Tunnel getsockopt helper.
+ *
+ * PPPOL2TP_SO_DEBUG is the only option supported: the tunnel's debug
+ * mask is stored in *@val. Anything else returns -ENOPROTOOPT and
+ * leaves *@val untouched.
+ */
+static int pppol2tp_tunnel_getsockopt(struct sock *sk,
+				      struct l2tp_tunnel *tunnel,
+				      int optname, int *val)
+{
+	if (optname != PPPOL2TP_SO_DEBUG)
+		return -ENOPROTOOPT;
+
+	*val = tunnel->debug;
+	PRINTK(tunnel->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+	       "%s: get debug=%x\n", tunnel->name, tunnel->debug);
+
+	return 0;
+}
+
+/* Session getsockopt helper.
+ *
+ * Stores the requested session setting in *@val and returns 0.
+ * REORDERTO is converted from jiffies to milliseconds. Unknown options
+ * return -ENOPROTOOPT and leave *@val untouched.
+ */
+static int pppol2tp_session_getsockopt(struct sock *sk,
+				       struct l2tp_session *session,
+				       int optname, int *val)
+{
+	switch (optname) {
+	case PPPOL2TP_SO_RECVSEQ:
+		*val = session->recv_seq;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get recv_seq=%d\n", session->name, *val);
+		break;
+
+	case PPPOL2TP_SO_SENDSEQ:
+		*val = session->send_seq;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get send_seq=%d\n", session->name, *val);
+		break;
+
+	case PPPOL2TP_SO_LNSMODE:
+		*val = session->lns_mode;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get lns_mode=%d\n", session->name, *val);
+		break;
+
+	case PPPOL2TP_SO_DEBUG:
+		*val = session->debug;
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get debug=%d\n", session->name, *val);
+		break;
+
+	case PPPOL2TP_SO_REORDERTO:
+		*val = (int) jiffies_to_msecs(session->reorder_timeout);
+		PRINTK(session->debug, PPPOL2TP_MSG_CONTROL, KERN_INFO,
+		       "%s: get reorder_timeout=%d\n", session->name, *val);
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	return 0;
+}
+
+/* Main getsockopt() entry point.
+ * Does API checks, then calls either the tunnel or session getsockopt
+ * handler, according to whether the PPPoX socket is for a regular
+ * session or the special tunnel type.
+ *
+ * At most sizeof(int) bytes are written to @optval, and the length
+ * actually used is written back to @optlen.
+ *
+ * Returns 0 on success, the helper's error (e.g. -ENOPROTOOPT), or:
+ *   -EINVAL    level is not SOL_PPPOL2TP, or the supplied length is
+ *              negative
+ *   -EFAULT    a userspace access failed
+ *   -ENOTCONN  no session is attached to the socket
+ *   -EBADF     the session or tunnel context cannot be obtained
+ */
+static int pppol2tp_getsockopt(struct socket *sock, int level,
+			       int optname, char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct l2tp_session *session;
+	struct l2tp_tunnel *tunnel;
+	int val, len;
+	int err;
+	struct pppol2tp_session *ps;
+
+	/* This is a PPPoX socket, not a UDP socket: never hand it to the
+	 * UDP protocol handlers.
+	 */
+	if (level != SOL_PPPOL2TP)
+		return -EINVAL;
+
+	if (get_user(len, (int __user *) optlen))
+		return -EFAULT;
+
+	/* Must be tested before the unsigned clamp below, which would
+	 * otherwise turn a negative length into sizeof(int).
+	 */
+	if (len < 0)
+		return -EINVAL;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	err = -ENOTCONN;
+	if (sk->sk_user_data == NULL)
+		goto end;
+
+	/* Get the session context */
+	err = -EBADF;
+	session = pppol2tp_sock_to_session(sk);
+	if (session == NULL)
+		goto end;
+
+	/* Special case: if session_id == 0x0000, treat as operation on tunnel */
+	ps = l2tp_session_priv(session);
+	if ((session->session_id == 0) &&
+	    (session->peer_session_id == 0)) {
+		err = -EBADF;
+		tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+		if (tunnel == NULL)
+			goto end_put_sess;
+
+		err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
+		sock_put(ps->tunnel_sock);
+	} else
+		err = pppol2tp_session_getsockopt(sk, session, optname, &val);
+
+	/* On helper failure val was never written; copying it out would
+	 * leak uninitialised kernel stack to userspace.
+	 */
+	if (err)
+		goto end_put_sess;
+
+	err = -EFAULT;
+	if (put_user(len, (int __user *) optlen))
+		goto end_put_sess;
+
+	if (copy_to_user((void __user *) optval, &val, len))
+		goto end_put_sess;
+
+	err = 0;
+
+end_put_sess:
+	sock_put(sk);
+end:
+	return err;
+}
+
+/*****************************************************************************
+ * /proc filesystem for debug
+ * Since the original pppol2tp driver provided /proc/net/pppol2tp for
+ * L2TPv2, we dump only L2TPv2 tunnels and sessions here.
+ *****************************************************************************/
+
+static unsigned int pppol2tp_net_id;
+
+#ifdef CONFIG_PROC_FS
+
+/* Iterator state for the /proc seq_file walk over tunnels and their
+ * sessions. pppol2tp_seq_start() reads it from the seq_file's private
+ * pointer.
+ */
+struct pppol2tp_seq_data {
+	struct seq_net_private p;
+	int tunnel_idx;			/* current tunnel */
+	int session_idx;		/* index of session within current tunnel */
+	struct l2tp_tunnel *tunnel;
+	struct l2tp_session *session;	/* NULL means get next tunnel */
+};
+
+/* Advance @pd to the next tunnel whose version is below 3, skipping
+ * L2TPv3 tunnels. pd->tunnel is NULL once the list is exhausted;
+ * pd->tunnel_idx is bumped for every tunnel examined.
+ */
+static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
+{
+	do {
+		pd->tunnel = l2tp_tunnel_find_nth(net, pd->tunnel_idx);
+		pd->tunnel_idx++;
+	} while (pd->tunnel != NULL && pd->tunnel->version >= 3);
+}
+
+/* Advance @pd to the next session of the current tunnel. When the
+ * tunnel has no more sessions, reset the session index and move on to
+ * the next tunnel.
+ */
+static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
+{
+	pd->session = l2tp_session_find_nth(pd->tunnel, pd->session_idx);
+	if (pd->session != NULL) {
+		pd->session_idx++;
+		return;
+	}
+
+	pd->session_idx = 0;
+	pppol2tp_next_tunnel(net, pd);
+}
+
+/* seq_file start handler.
+ *
+ * Position 0 yields SEQ_START_TOKEN. For any later position the
+ * iterator state is advanced by one step (next tunnel if none is
+ * current, otherwise next session) and returned, or NULL is returned
+ * once both tunnel and session are NULL.
+ */
+static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs)
+{
+	struct pppol2tp_seq_data *pd;
+	struct net *net;
+
+	if (*offs == 0)
+		return SEQ_START_TOKEN;
+
+	BUG_ON(m->private == NULL);
+	pd = m->private;
+	net = seq_file_net(m);
+
+	if (pd->tunnel != NULL)
+		pppol2tp_next_session(net, pd);
+	else
+		pppol2tp_next_tunnel(net, pd);
+
+	/* NULL tunnel and session indicates end of list */
+	if (pd->tunnel == NULL && pd->session == NULL)
+		return NULL;
+
+	return pd;
+}
+
+static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return NULL;
+}
+
+static void pppol2tp_seq_stop(struct seq_file *p, void *v)
+{
+	/* nothing to do */
+}
+
+static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
+{
+	struct l2tp_tunnel *tunnel = v;
+
+	seq_printf(m, "\nTUNNEL '%s', %c %d\n",
+		   tunnel->name,
+		   (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N',
+		   atomic_read(&tunnel->ref_count) - 1);
+	seq_printf(m, " %08x %llu/%llu/%llu %llu/%llu/%llu\n",
+		   tunnel->debug,
+		   (unsigned long long)tunnel->stats.tx_packets,
+		   (unsigned long long)tunnel->stats.tx_bytes,
+		   (unsigned long long)tunnel->stats.tx_errors,
+		   (unsigned long long)tunnel->stats.rx_packets,
+		   (unsigned long long)tunnel->stats.rx_bytes,
+		   (unsigned long long)tunnel->stats.rx_errors);
+}
+
+/* Show one session; without a PPPoX socket, print state 0 and 'N'. */
+static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
+{
+	struct l2tp_session *session = v;
+	struct l2tp_tunnel *tunnel = session->tunnel;
+	struct pppol2tp_session *ps = l2tp_session_priv(session);
+	struct sock *sk = ps->sock;	/* NULL-checked before every use */
+	u32 ip = 0;
+	u16 port = 0;
+
+	if (tunnel->sock) {
+		struct inet_sock *inet = inet_sk(tunnel->sock);
+		ip = ntohl(inet->inet_saddr);
+		port = ntohs(inet->inet_sport);
+	}
+
+	seq_printf(m, "  SESSION '%s' %08X/%d %04X/%04X -> "
+		   "%04X/%04X %d %c\n",
+		   session->name, ip, port,
+		   tunnel->tunnel_id,
+		   session->session_id,
+		   tunnel->peer_tunnel_id,
+		   session->peer_session_id,
+		   sk ? sk->sk_state : 0,
+		   (sk && session == sk->sk_user_data) ? 'Y' : 'N');
+	seq_printf(m, "   %d/%d/%c/%c/%s %08x %u\n",
+		   session->mtu, session->mru,
+		   session->recv_seq ? 'R' : '-',
+		   session->send_seq ? 'S' : '-',
+		   session->lns_mode ? "LNS" : "LAC",
+		   session->debug,
+		   jiffies_to_msecs(session->reorder_timeout));
+	seq_printf(m, "   %hu/%hu %llu/%llu/%llu %llu/%llu/%llu\n",
+		   session->nr, session->ns,
+		   (unsigned long long)session->stats.tx_packets,
+		   (unsigned long long)session->stats.tx_bytes,
+		   (unsigned long long)session->stats.tx_errors,
+		   (unsigned long long)session->stats.rx_packets,
+		   (unsigned long long)session->stats.rx_bytes,
+		   (unsigned long long)session->stats.rx_errors);
+	if (sk)	/* the interface name comes from the socket's PPP channel */
+		seq_printf(m, "   interface %s\n",
+			   ppp_dev_name(&pppox_sk(sk)->chan));
+}
+
+static int pppol2tp_seq_show(struct seq_file *m, void *v)
+{
+	struct pppol2tp_seq_data *pd = v;
+
+	/* display header on line 1 */
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n");
+		seq_puts(m, "TUNNEL name, user-data-ok session-count\n");
+		seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+		seq_puts(m, "  SESSION name, addr/port src-tid/sid "
+			 "dest-tid/sid state user-data-ok\n");
+		seq_puts(m, "   mtu/mru/rcvseq/sendseq/lns debug reorderto\n");
+		seq_puts(m, "   nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
+		goto out;
+	}
+
+	/* Show the tunnel or session context.
+	 */
+	if (pd->session == NULL)
+		pppol2tp_seq_tunnel_show(m, pd->tunnel);
+	else
+		pppol2tp_seq_session_show(m, pd->session);
+
+out:
+	return 0;
+}
+
+static const struct seq_operations pppol2tp_seq_ops = {
+	.start		= pppol2tp_seq_start,
+	.next		= pppol2tp_seq_next,
+	.stop		= pppol2tp_seq_stop,
+	.show		= pppol2tp_seq_show,
+};
+
+/* Called when our /proc file is opened. We allocate data for use when
+ * iterating our tunnel / session contexts and store it in the private
+ * data of the seq_file.
+ */
+static int pppol2tp_proc_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &pppol2tp_seq_ops,
+			    sizeof(struct pppol2tp_seq_data));
+}
+
+static const struct file_operations pppol2tp_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= pppol2tp_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*****************************************************************************
+ * Network namespace
+ *****************************************************************************/
+
+/* Per-namespace init: create the "pppol2tp" proc entry for this net.
+ * Returns 0 on success or -ENOMEM if the entry could not be created.
+ */
+static __net_init int pppol2tp_init_net(struct net *net)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_net_fops_create(net, "pppol2tp", S_IRUGO,
+				     &pppol2tp_proc_fops);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static __net_exit void pppol2tp_exit_net(struct net *net)
+{
+	proc_net_remove(net, "pppol2tp");
+}
+
+static struct pernet_operations pppol2tp_net_ops = {
+	.init = pppol2tp_init_net,
+	.exit = pppol2tp_exit_net,
+	.id   = &pppol2tp_net_id,
+};
+
+/*****************************************************************************
+ * Init and cleanup
+ *****************************************************************************/
+
+static const struct proto_ops pppol2tp_ops = {
+	.family		= AF_PPPOX,
+	.owner		= THIS_MODULE,
+	.release	= pppol2tp_release,
+	.bind		= sock_no_bind,
+	.connect	= pppol2tp_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= pppol2tp_getname,
+	.poll		= datagram_poll,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= pppol2tp_setsockopt,
+	.getsockopt	= pppol2tp_getsockopt,
+	.sendmsg	= pppol2tp_sendmsg,
+	.recvmsg	= pppol2tp_recvmsg,
+	.mmap		= sock_no_mmap,
+	.ioctl		= pppox_ioctl,
+};
+
+static const struct pppox_proto pppol2tp_proto = {
+	.create		= pppol2tp_create,
+	.ioctl		= pppol2tp_ioctl
+};
+
+#ifdef CONFIG_L2TP_V3
+
+static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = {
+	.session_create	= pppol2tp_session_create,
+	.session_delete	= pppol2tp_session_delete,
+};
+
+#endif /* CONFIG_L2TP_V3 */
+
+static int __init pppol2tp_init(void)
+{
+	int err;
+
+	err = register_pernet_device(&pppol2tp_net_ops);
+	if (err)
+		goto out;
+
+	err = proto_register(&pppol2tp_sk_proto, 0);
+	if (err)
+		goto out_unregister_pppol2tp_pernet;
+
+	err = register_pppox_proto(PX_PROTO_OL2TP, &pppol2tp_proto);
+	if (err)
+		goto out_unregister_pppol2tp_proto;
+
+#ifdef CONFIG_L2TP_V3
+	err = l2tp_nl_register_ops(L2TP_PWTYPE_PPP, &pppol2tp_nl_cmd_ops);
+	if (err)
+		goto out_unregister_pppox;
+#endif
+
+	printk(KERN_INFO "PPPoL2TP kernel driver, %s\n",
+	       PPPOL2TP_DRV_VERSION);
+
+out:
+	return err;
+
+#ifdef CONFIG_L2TP_V3
+out_unregister_pppox:
+	unregister_pppox_proto(PX_PROTO_OL2TP);
+#endif
+out_unregister_pppol2tp_proto:
+	proto_unregister(&pppol2tp_sk_proto);
+out_unregister_pppol2tp_pernet:
+	unregister_pernet_device(&pppol2tp_net_ops);
+	goto out;
+}
+
+static void __exit pppol2tp_exit(void)
+{
+#ifdef CONFIG_L2TP_V3
+	l2tp_nl_unregister_ops(L2TP_PWTYPE_PPP);
+#endif
+	unregister_pppox_proto(PX_PROTO_OL2TP);
+	proto_unregister(&pppol2tp_sk_proto);
+	unregister_pernet_device(&pppol2tp_net_ops);
+}
+
+module_init(pppol2tp_init);
+module_exit(pppol2tp_exit);
+
+MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
+MODULE_DESCRIPTION("PPP over L2TP over UDP");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(PPPOL2TP_DRV_VERSION);
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 00000000..f0b5efb3
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
+#
+# LAPB Data Link Driver
+#
+
+config LAPB
+	tristate "LAPB Data Link Driver (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
+	  the lower) part of the X.25 protocol. It offers a reliable
+	  connection service to exchange data frames with one other host, and
+	  it is used to transport higher level protocols (mostly X.25 Packet
+	  Layer, the higher part of X.25, but others are possible as well).
+	  Usually, LAPB is used with specialized X.21 network cards, but Linux
+	  currently supports LAPB only over Ethernet connections. If you want
+	  to use LAPB connections over Ethernet, say Y here and to "LAPB over
+	  Ethernet driver" below. Read
+	  <file:Documentation/networking/lapb-module.txt> for technical
+	  details.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called lapb.  If unsure, say N.
diff --git a/net/lapb/Makefile b/net/lapb/Makefile
new file mode 100644
index 00000000..fff797df
--- /dev/null
+++ b/net/lapb/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux LAPB layer.
+#
+
+obj-$(CONFIG_LAPB) += lapb.o
+
+lapb-y := lapb_in.o lapb_out.o lapb_subr.o lapb_timer.o lapb_iface.o
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
new file mode 100644
index 00000000..d5d8d555
--- /dev/null
+++ b/net/lapb/lapb_iface.c
@@ -0,0 +1,450 @@
+/*
+ *	LAPB release 002
+ *
+ *	This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	LAPB 001	Jonathan Naylor	Started Coding
+ *	LAPB 002	Jonathan Naylor	New timer architecture.
+ *	2000-10-29	Henner Eisen	lapb_data_indication() return status.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <net/lapb.h>
+
+static LIST_HEAD(lapb_list);
+static DEFINE_RWLOCK(lapb_list_lock);
+
+/*
+ *	Free an allocated lapb control block.
+ */
+static void lapb_free_cb(struct lapb_cb *lapb)
+{
+	kfree(lapb);
+}
+
+static __inline__ void lapb_hold(struct lapb_cb *lapb)
+{
+	atomic_inc(&lapb->refcnt);
+}
+
+static __inline__ void lapb_put(struct lapb_cb *lapb)
+{
+	if (atomic_dec_and_test(&lapb->refcnt))
+		lapb_free_cb(lapb);
+}
+
+/*
+ *	Unlink a control block from lapb_list and drop the list's reference.
+ */
+static void __lapb_remove_cb(struct lapb_cb *lapb)
+{
+	if (lapb->node.next) {	/* skip blocks whose node.next is still NULL */
+		list_del(&lapb->node);
+		lapb_put(lapb);
+	}
+}
+
+/*
+ *	Add a control block to lapb_list; the list holds its own reference.
+ */
+static void __lapb_insert_cb(struct lapb_cb *lapb)
+{
+	list_add(&lapb->node, &lapb_list);
+	lapb_hold(lapb);
+}
+
+static struct lapb_cb *__lapb_devtostruct(struct net_device *dev)
+{
+	struct list_head *entry;
+	struct lapb_cb *lapb, *use = NULL;
+
+	list_for_each(entry, &lapb_list) {
+		lapb = list_entry(entry, struct lapb_cb, node);
+		if (lapb->dev == dev) {
+			use = lapb;
+			break;
+		}
+	}
+
+	if (use)
+		lapb_hold(use);
+
+	return use;
+}
+
+static struct lapb_cb *lapb_devtostruct(struct net_device *dev)
+{
+	struct lapb_cb *rc;
+
+	read_lock_bh(&lapb_list_lock);
+	rc = __lapb_devtostruct(dev);
+	read_unlock_bh(&lapb_list_lock);
+
+	return rc;
+}
+/*
+ *	Allocate a zeroed LAPB control block with default parameters, in
+ *	LAPB_STATE_0, holding one reference. Returns NULL on allocation failure.
+ */
+static struct lapb_cb *lapb_create_cb(void)
+{
+	struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC);
+
+	if (!lapb)
+		goto out;
+
+	skb_queue_head_init(&lapb->write_queue);
+	skb_queue_head_init(&lapb->ack_queue);
+
+	init_timer(&lapb->t1timer);
+	init_timer(&lapb->t2timer);
+
+	lapb->t1      = LAPB_DEFAULT_T1;
+	lapb->t2      = LAPB_DEFAULT_T2;
+	lapb->n2      = LAPB_DEFAULT_N2;
+	lapb->mode    = LAPB_DEFAULT_MODE;
+	lapb->window  = LAPB_DEFAULT_WINDOW;
+	lapb->state   = LAPB_STATE_0;
+	atomic_set(&lapb->refcnt, 1);
+out:
+	return lapb;
+}
+
+int lapb_register(struct net_device *dev, struct lapb_register_struct *callbacks)
+{
+	struct lapb_cb *lapb;
+	int rc = LAPB_BADTOKEN;
+
+	write_lock_bh(&lapb_list_lock);
+
+	lapb = __lapb_devtostruct(dev);
+	if (lapb) {
+		lapb_put(lapb);
+		goto out;
+	}
+
+	lapb = lapb_create_cb();
+	rc = LAPB_NOMEM;
+	if (!lapb)
+		goto out;
+
+	lapb->dev       = dev;
+	lapb->callbacks = *callbacks;
+
+	__lapb_insert_cb(lapb);
+
+	lapb_start_t1timer(lapb);
+
+	rc = LAPB_OK;
+out:
+	write_unlock_bh(&lapb_list_lock);
+	return rc;
+}
+
+/* Tear down the LAPB control block bound to dev; LAPB_BADTOKEN if none. */
+int lapb_unregister(struct net_device *dev)
+{
+	struct lapb_cb *lapb;
+	int rc = LAPB_BADTOKEN;
+
+	write_lock_bh(&lapb_list_lock);
+	lapb = __lapb_devtostruct(dev);
+	if (!lapb)
+		goto out;
+	lapb_put(lapb);	/* drop the lookup ref; list + creation refs remain */
+
+	lapb_stop_t1timer(lapb);
+	lapb_stop_t2timer(lapb);
+	lapb_clear_queues(lapb);
+
+	__lapb_remove_cb(lapb);	/* drops the list ref */
+	lapb_put(lapb);		/* drops the ref set in lapb_create_cb() */
+	rc = LAPB_OK;
+out:
+	write_unlock_bh(&lapb_list_lock);
+	return rc;
+}
+
+int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
+{
+	int rc = LAPB_BADTOKEN;
+	struct lapb_cb *lapb = lapb_devtostruct(dev);
+
+	if (!lapb)
+		goto out;
+
+	parms->t1      = lapb->t1 / HZ;
+	parms->t2      = lapb->t2 / HZ;
+	parms->n2      = lapb->n2;
+	parms->n2count = lapb->n2count;
+	parms->state   = lapb->state;
+	parms->window  = lapb->window;
+	parms->mode    = lapb->mode;
+
+	if (!timer_pending(&lapb->t1timer))
+		parms->t1timer = 0;
+	else
+		parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ;
+
+	if (!timer_pending(&lapb->t2timer))
+		parms->t2timer = 0;
+	else
+		parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ;
+
+	lapb_put(lapb);
+	rc = LAPB_OK;
+out:
+	return rc;
+}
+
+int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
+{
+	int rc = LAPB_BADTOKEN;
+	struct lapb_cb *lapb = lapb_devtostruct(dev);
+
+	if (!lapb)
+		goto out;
+
+	rc = LAPB_INVALUE;
+	if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1)
+		goto out_put;
+
+	if (lapb->state == LAPB_STATE_0) {
+		if (parms->mode & LAPB_EXTENDED) {
+			if (parms->window < 1 || parms->window > 127)
+				goto out_put;
+		} else {
+			if (parms->window < 1 || parms->window > 7)
+				goto out_put;
+		}
+		lapb->mode    = parms->mode;
+		lapb->window  = parms->window;
+	}
+
+	lapb->t1    = parms->t1 * HZ;
+	lapb->t2    = parms->t2 * HZ;
+	lapb->n2    = parms->n2;
+
+	rc = LAPB_OK;
+out_put:
+	lapb_put(lapb);
+out:
+	return rc;
+}
+
+int lapb_connect_request(struct net_device *dev)
+{
+	struct lapb_cb *lapb = lapb_devtostruct(dev);
+	int rc = LAPB_BADTOKEN;
+
+	if (!lapb)
+		goto out;
+
+	rc = LAPB_OK;
+	if (lapb->state == LAPB_STATE_1)
+		goto out_put;
+
+	rc = LAPB_CONNECTED;
+	if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4)
+		goto out_put;
+
+	lapb_establish_data_link(lapb);
+
+#if LAPB_DEBUG > 0
+	printk(KERN_DEBUG "lapb: (%p) S0 -> S1\n", lapb->dev);
+#endif
+	lapb->state = LAPB_STATE_1;
+
+	rc = LAPB_OK;
+out_put:
+	lapb_put(lapb);
+out:
+	return rc;
+}
+
+int lapb_disconnect_request(struct net_device *dev)
+{
+	struct lapb_cb *lapb = lapb_devtostruct(dev);
+	int rc = LAPB_BADTOKEN;
+
+	if (!lapb)
+		goto out;
+
+	switch (lapb->state) {
+		case LAPB_STATE_0:
+			rc = LAPB_NOTCONNECTED;
+			goto out_put;
+
+		case LAPB_STATE_1:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 TX DISC(1)\n", lapb->dev);
+#endif
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", lapb->dev);
+#endif
+			lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
+			lapb->state = LAPB_STATE_0;
+			lapb_start_t1timer(lapb);
+			rc = LAPB_NOTCONNECTED;
+			goto out_put;
+
+		case LAPB_STATE_2:
+			rc = LAPB_OK;
+			goto out_put;
+	}
+
+	lapb_clear_queues(lapb);
+	lapb->n2count = 0;
+	lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
+	lapb_start_t1timer(lapb);
+	lapb_stop_t2timer(lapb);
+	lapb->state = LAPB_STATE_2;
+
+#if LAPB_DEBUG > 1
+	printk(KERN_DEBUG "lapb: (%p) S3 DISC(1)\n", lapb->dev);
+#endif
+#if LAPB_DEBUG > 0
+	printk(KERN_DEBUG "lapb: (%p) S3 -> S2\n", lapb->dev);
+#endif
+
+	rc = LAPB_OK;
+out_put:
+	lapb_put(lapb);
+out:
+	return rc;
+}
+
+int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
+{
+	struct lapb_cb *lapb = lapb_devtostruct(dev);
+	int rc = LAPB_BADTOKEN;
+
+	if (!lapb)
+		goto out;
+
+	rc = LAPB_NOTCONNECTED;
+	if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4)
+		goto out_put;
+
+	skb_queue_tail(&lapb->write_queue, skb);
+	lapb_kick(lapb);
+	rc = LAPB_OK;
+out_put:
+	lapb_put(lapb);
+out:
+	return rc;
+}
+
+/* Hand skb to lapb_data_input() for dev; LAPB_BADTOKEN if dev is unknown. */
+int lapb_data_received(struct net_device *dev, struct sk_buff *skb)
+{
+	struct lapb_cb *lapb = lapb_devtostruct(dev);
+
+	if (lapb == NULL)
+		return LAPB_BADTOKEN;
+
+	lapb_data_input(lapb, skb);
+	lapb_put(lapb);	/* release the reference taken by the lookup */
+
+	return LAPB_OK;
+}
+
+void lapb_connect_confirmation(struct lapb_cb *lapb, int reason)
+{
+	if (lapb->callbacks.connect_confirmation)
+		lapb->callbacks.connect_confirmation(lapb->dev, reason);
+}
+
+void lapb_connect_indication(struct lapb_cb *lapb, int reason)
+{
+	if (lapb->callbacks.connect_indication)
+		lapb->callbacks.connect_indication(lapb->dev, reason);
+}
+
+void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason)
+{
+	if (lapb->callbacks.disconnect_confirmation)
+		lapb->callbacks.disconnect_confirmation(lapb->dev, reason);
+}
+
+void lapb_disconnect_indication(struct lapb_cb *lapb, int reason)
+{
+	if (lapb->callbacks.disconnect_indication)
+		lapb->callbacks.disconnect_indication(lapb->dev, reason);
+}
+
+int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb)
+{
+	if (lapb->callbacks.data_indication)
+		return lapb->callbacks.data_indication(lapb->dev, skb);
+
+	kfree_skb(skb);
+	return NET_RX_SUCCESS; /* For now; must be != NET_RX_DROP */
+}
+
+/*
+ *	Give skb to the data_transmit callback. Returns 1 if a callback is
+ *	registered (and was called), 0 otherwise.
+ */
+int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb)
+{
+	if (!lapb->callbacks.data_transmit)
+		return 0;
+	lapb->callbacks.data_transmit(lapb->dev, skb);
+	return 1;
+}
+
+EXPORT_SYMBOL(lapb_register);
+EXPORT_SYMBOL(lapb_unregister);
+EXPORT_SYMBOL(lapb_getparms);
+EXPORT_SYMBOL(lapb_setparms);
+EXPORT_SYMBOL(lapb_connect_request);
+EXPORT_SYMBOL(lapb_disconnect_request);
+EXPORT_SYMBOL(lapb_data_request);
+EXPORT_SYMBOL(lapb_data_received);
+
+static int __init lapb_init(void)
+{
+	return 0;
+}
+
+static void __exit lapb_exit(void)
+{
+	WARN_ON(!list_empty(&lapb_list));
+}
+
+MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol");
+MODULE_LICENSE("GPL");
+
+module_init(lapb_init);
+module_exit(lapb_exit);
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
new file mode 100644
index 00000000..21904a00
--- /dev/null
+++ b/net/lapb/lapb_in.c
@@ -0,0 +1,724 @@
+/*
+ *	LAPB release 002
+ *
+ *	This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	LAPB 001	Jonathan Naylor	Started Coding
+ *	LAPB 002	Jonathan Naylor	New timer architecture.
+ *	2000-10-29	Henner Eisen	lapb_data_indication() return status.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+/*
+ *	State machine for state 0, Disconnected State.
+ *	The handling of the timer(s) is in file lapb_timer.c.
+ */
+static void lapb_state0_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+				struct lapb_frame *frame)
+{
+	switch (frame->type) {
+		case LAPB_SABM:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S0 RX SABM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S0 TX DM(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			} else {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S0 -> S3\n",
+				       lapb->dev);
+#endif
+				lapb_send_control(lapb, LAPB_UA, frame->pf,
+						  LAPB_RESPONSE);
+				lapb_stop_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state     = LAPB_STATE_3;
+				lapb->condition = 0x00;
+				lapb->n2count   = 0;
+				lapb->vs        = 0;
+				lapb->vr        = 0;
+				lapb->va        = 0;
+				lapb_connect_indication(lapb, LAPB_OK);
+			}
+			break;
+
+		case LAPB_SABME:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S0 RX SABME(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S0 -> S3\n",
+				       lapb->dev);
+#endif
+				lapb_send_control(lapb, LAPB_UA, frame->pf,
+						  LAPB_RESPONSE);
+				lapb_stop_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state     = LAPB_STATE_3;
+				lapb->condition = 0x00;
+				lapb->n2count   = 0;
+				lapb->vs        = 0;
+				lapb->vr        = 0;
+				lapb->va        = 0;
+				lapb_connect_indication(lapb, LAPB_OK);
+			} else {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S0 TX DM(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			}
+			break;
+
+		case LAPB_DISC:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S0 RX DISC(%d)\n",
+			       lapb->dev, frame->pf);
+			printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb_send_control(lapb, LAPB_UA, frame->pf,
+					  LAPB_RESPONSE);
+			break;
+
+		default:
+			break;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ *	State machine for state 1, Awaiting Connection State.
+ *	The handling of the timer(s) is in file lapb_timer.c.
+ */
+static void lapb_state1_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+				struct lapb_frame *frame)
+{
+	switch (frame->type) {
+		case LAPB_SABM:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 RX SABM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			} else {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S1 TX UA(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_UA, frame->pf,
+						  LAPB_RESPONSE);
+			}
+			break;
+
+		case LAPB_SABME:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 RX SABME(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S1 TX UA(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_UA, frame->pf,
+						  LAPB_RESPONSE);
+			} else {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			}
+			break;
+
+		case LAPB_DISC:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 RX DISC(%d)\n",
+			       lapb->dev, frame->pf);
+			printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb_send_control(lapb, LAPB_DM, frame->pf,
+					  LAPB_RESPONSE);
+			break;
+
+		case LAPB_UA:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 RX UA(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (frame->pf) {
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S1 -> S3\n",
+				       lapb->dev);
+#endif
+				lapb_stop_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state     = LAPB_STATE_3;
+				lapb->condition = 0x00;
+				lapb->n2count   = 0;
+				lapb->vs        = 0;
+				lapb->vr        = 0;
+				lapb->va        = 0;
+				lapb_connect_confirmation(lapb, LAPB_OK);
+			}
+			break;
+
+		case LAPB_DM:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 RX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (frame->pf) {
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n",
+				       lapb->dev);
+#endif
+				lapb_clear_queues(lapb);
+				lapb->state = LAPB_STATE_0;
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb_disconnect_indication(lapb, LAPB_REFUSED);
+			}
+			break;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ *	State machine for state 2, Awaiting Release State: dispatches on
+ *	frame->type and always frees skb. Timers are handled in lapb_timer.c.
+ */
+static void lapb_state2_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+				struct lapb_frame *frame)
+{
+	switch (frame->type) {
+		case LAPB_SABM:
+		case LAPB_SABME:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S2 RX {SABM,SABME}(%d)\n",
+			       lapb->dev, frame->pf);
+			printk(KERN_DEBUG "lapb: (%p) S2 TX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb_send_control(lapb, LAPB_DM, frame->pf,
+					  LAPB_RESPONSE);
+			break;
+
+		case LAPB_DISC:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S2 RX DISC(%d)\n",
+			       lapb->dev, frame->pf);
+			printk(KERN_DEBUG "lapb: (%p) S2 TX UA(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb_send_control(lapb, LAPB_UA, frame->pf,
+					  LAPB_RESPONSE);
+			break;
+
+		case LAPB_UA:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S2 RX UA(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (frame->pf) {
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n",
+				       lapb->dev);
+#endif
+				lapb->state = LAPB_STATE_0;
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb_disconnect_confirmation(lapb, LAPB_OK);
+			}
+			break;
+
+		case LAPB_DM:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S2 RX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (frame->pf) {
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n",
+				       lapb->dev);
+#endif
+				lapb->state = LAPB_STATE_0;
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb_disconnect_confirmation(lapb,
+							     LAPB_NOTCONNECTED);
+			}
+			break;
+
+		case LAPB_I:
+		case LAPB_REJ:
+		case LAPB_RNR:
+		case LAPB_RR:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S2 RX {I,REJ,RNR,RR}"
+			       "(%d)\n", lapb->dev, frame->pf);
+			printk(KERN_DEBUG "lapb: (%p) S2 TX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (frame->pf)
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			break;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ *	State machine for state 3, Connected State.
+ *	The handling of the timer(s) is in file lapb_timer.c
+ */
+static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+				struct lapb_frame *frame)
+{
+	int queued = 0;
+	int modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS :
+						     LAPB_SMODULUS;
+
+	switch (frame->type) {
+		case LAPB_SABM:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX SABM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S3 TX DM(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			} else {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S3 TX UA(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_UA, frame->pf,
+						  LAPB_RESPONSE);
+				lapb_stop_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->condition = 0x00;
+				lapb->n2count   = 0;
+				lapb->vs        = 0;
+				lapb->vr        = 0;
+				lapb->va        = 0;
+				lapb_requeue_frames(lapb);
+			}
+			break;
+
+		case LAPB_SABME:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX SABME(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S3 TX UA(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_UA, frame->pf,
+						  LAPB_RESPONSE);
+				lapb_stop_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->condition = 0x00;
+				lapb->n2count   = 0;
+				lapb->vs        = 0;
+				lapb->vr        = 0;
+				lapb->va        = 0;
+				lapb_requeue_frames(lapb);
+			} else {
+#if LAPB_DEBUG > 1
+				printk(KERN_DEBUG "lapb: (%p) S3 TX DM(%d)\n",
+				       lapb->dev, frame->pf);
+#endif
+				lapb_send_control(lapb, LAPB_DM, frame->pf,
+						  LAPB_RESPONSE);
+			}
+			break;
+
+		case LAPB_DISC:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX DISC(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n",
+			       lapb->dev);
+#endif
+			lapb_clear_queues(lapb);
+			lapb_send_control(lapb, LAPB_UA, frame->pf,
+					  LAPB_RESPONSE);
+			lapb_start_t1timer(lapb);
+			lapb_stop_t2timer(lapb);
+			lapb->state = LAPB_STATE_0;
+			lapb_disconnect_indication(lapb, LAPB_OK);
+			break;
+
+		case LAPB_DM:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n",
+			       lapb->dev);
+#endif
+			lapb_clear_queues(lapb);
+			lapb->state = LAPB_STATE_0;
+			lapb_start_t1timer(lapb);
+			lapb_stop_t2timer(lapb);
+			lapb_disconnect_indication(lapb, LAPB_NOTCONNECTED);
+			break;
+
+		case LAPB_RNR:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX RNR(%d) R%d\n",
+			       lapb->dev, frame->pf, frame->nr);
+#endif
+			lapb->condition |= LAPB_PEER_RX_BUSY_CONDITION;
+			lapb_check_need_response(lapb, frame->cr, frame->pf);
+			if (lapb_validate_nr(lapb, frame->nr)) {
+				lapb_check_iframes_acked(lapb, frame->nr);
+			} else {
+				lapb->frmr_data = *frame;
+				lapb->frmr_type = LAPB_FRMR_Z;
+				lapb_transmit_frmr(lapb);
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n",
+				       lapb->dev);
+#endif
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state   = LAPB_STATE_4;
+				lapb->n2count = 0;
+			}
+			break;
+
+		case LAPB_RR:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX RR(%d) R%d\n",
+			       lapb->dev, frame->pf, frame->nr);
+#endif
+			lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION;
+			lapb_check_need_response(lapb, frame->cr, frame->pf);
+			if (lapb_validate_nr(lapb, frame->nr)) {
+				lapb_check_iframes_acked(lapb, frame->nr);
+			} else {
+				lapb->frmr_data = *frame;
+				lapb->frmr_type = LAPB_FRMR_Z;
+				lapb_transmit_frmr(lapb);
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n",
+				       lapb->dev);
+#endif
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state   = LAPB_STATE_4;
+				lapb->n2count = 0;
+			}
+			break;
+
+		case LAPB_REJ:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX REJ(%d) R%d\n",
+			       lapb->dev, frame->pf, frame->nr);
+#endif
+			lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION;
+			lapb_check_need_response(lapb, frame->cr, frame->pf);
+			if (lapb_validate_nr(lapb, frame->nr)) {
+				lapb_frames_acked(lapb, frame->nr);
+				lapb_stop_t1timer(lapb);
+				lapb->n2count = 0;
+				lapb_requeue_frames(lapb);
+			} else {
+				lapb->frmr_data = *frame;
+				lapb->frmr_type = LAPB_FRMR_Z;
+				lapb_transmit_frmr(lapb);
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n",
+				       lapb->dev);
+#endif
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state   = LAPB_STATE_4;
+				lapb->n2count = 0;
+			}
+			break;
+
+		case LAPB_I:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX I(%d) S%d R%d\n",
+			       lapb->dev, frame->pf, frame->ns, frame->nr);
+#endif
+			if (!lapb_validate_nr(lapb, frame->nr)) {
+				lapb->frmr_data = *frame;
+				lapb->frmr_type = LAPB_FRMR_Z;
+				lapb_transmit_frmr(lapb);
+#if LAPB_DEBUG > 0
+				printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n",
+				       lapb->dev);
+#endif
+				lapb_start_t1timer(lapb);
+				lapb_stop_t2timer(lapb);
+				lapb->state   = LAPB_STATE_4;
+				lapb->n2count = 0;
+				break;
+			}
+			if (lapb->condition & LAPB_PEER_RX_BUSY_CONDITION)
+				lapb_frames_acked(lapb, frame->nr);
+			else
+				lapb_check_iframes_acked(lapb, frame->nr);
+
+			if (frame->ns == lapb->vr) {
+				int cn;
+				cn = lapb_data_indication(lapb, skb);
+				queued = 1;
+				/*
+				 * If upper layer has dropped the frame, we
+				 * basically ignore any further protocol
+				 * processing. This will cause the peer
+				 * to re-transmit the frame later like
+				 * a frame lost on the wire.
+				 */
+				if (cn == NET_RX_DROP) {
+					printk(KERN_DEBUG
+					       "LAPB: rx congestion\n");
+					break;
+				}
+				lapb->vr = (lapb->vr + 1) % modulus;
+				lapb->condition &= ~LAPB_REJECT_CONDITION;
+				if (frame->pf)
+					lapb_enquiry_response(lapb);
+				else {
+					if (!(lapb->condition &
+					      LAPB_ACK_PENDING_CONDITION)) {
+						lapb->condition |= LAPB_ACK_PENDING_CONDITION;
+						lapb_start_t2timer(lapb);
+					}
+				}
+			} else {
+				if (lapb->condition & LAPB_REJECT_CONDITION) {
+					if (frame->pf)
+						lapb_enquiry_response(lapb);
+				} else {
+#if LAPB_DEBUG > 1
+					printk(KERN_DEBUG
+					       "lapb: (%p) S3 TX REJ(%d) R%d\n",
+					       lapb->dev, frame->pf, lapb->vr);
+#endif
+					lapb->condition |= LAPB_REJECT_CONDITION;
+					lapb_send_control(lapb, LAPB_REJ,
+							  frame->pf,
+							  LAPB_RESPONSE);
+					lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+				}
+			}
+			break;
+
+		case LAPB_FRMR:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX FRMR(%d) %02X "
+			       "%02X %02X %02X %02X\n", lapb->dev, frame->pf,
+			       skb->data[0], skb->data[1], skb->data[2],
+			       skb->data[3], skb->data[4]);
+#endif
+			lapb_establish_data_link(lapb);
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S3 -> S1\n",
+			       lapb->dev);
+#endif
+			lapb_requeue_frames(lapb);
+			lapb->state = LAPB_STATE_1;
+			break;
+
+		case LAPB_ILLEGAL:
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S3 RX ILLEGAL(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb->frmr_data = *frame;
+			lapb->frmr_type = LAPB_FRMR_W;
+			lapb_transmit_frmr(lapb);
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", lapb->dev);
+#endif
+			lapb_start_t1timer(lapb);
+			lapb_stop_t2timer(lapb);
+			lapb->state   = LAPB_STATE_4;
+			lapb->n2count = 0;
+			break;
+	}
+
+	if (!queued)
+		kfree_skb(skb);
+}
+
+/*
+ *	State machine for state 4, Frame Reject State.
+ *	A SABM/SABME matching the configured mode is answered with UA and resets
+ *	the link to state 3, a mismatched one gets DM, anything else is ignored.
+ *	The handling of the timer(s) is in file lapb_timer.c.
+ */
+static void lapb_state4_machine(struct lapb_cb *lapb, struct sk_buff *skb,
+				struct lapb_frame *frame)
+{
+	switch (frame->type) {
+	case LAPB_SABM:
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S4 RX SABM(%d)\n",
+		       lapb->dev, frame->pf);
+#endif
+		if (lapb->mode & LAPB_EXTENDED) {
+			/* Link runs in extended mode: answer SABM with DM. */
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S4 TX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb_send_control(lapb, LAPB_DM, frame->pf,
+					  LAPB_RESPONSE);
+			break;
+		}
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S4 TX UA(%d)\n",
+		       lapb->dev, frame->pf);
+#endif
+#if LAPB_DEBUG > 0
+		printk(KERN_DEBUG "lapb: (%p) S4 -> S3\n", lapb->dev);
+#endif
+		lapb_send_control(lapb, LAPB_UA, frame->pf, LAPB_RESPONSE);
+		lapb_stop_t1timer(lapb);
+		lapb_stop_t2timer(lapb);
+		lapb->state     = LAPB_STATE_3;
+		lapb->condition = 0x00;
+		lapb->n2count   = 0;
+		lapb->vs        = 0;
+		lapb->vr        = 0;
+		lapb->va        = 0;
+		lapb_connect_indication(lapb, LAPB_OK);
+		break;
+
+	case LAPB_SABME:
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S4 RX SABME(%d)\n",
+		       lapb->dev, frame->pf);
+#endif
+		if (!(lapb->mode & LAPB_EXTENDED)) {
+			/* Link is not in extended mode: answer SABME with DM. */
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S4 TX DM(%d)\n",
+			       lapb->dev, frame->pf);
+#endif
+			lapb_send_control(lapb, LAPB_DM, frame->pf,
+					  LAPB_RESPONSE);
+			break;
+		}
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S4 TX UA(%d)\n",
+		       lapb->dev, frame->pf);
+#endif
+#if LAPB_DEBUG > 0
+		printk(KERN_DEBUG "lapb: (%p) S4 -> S3\n", lapb->dev);
+#endif
+		lapb_send_control(lapb, LAPB_UA, frame->pf, LAPB_RESPONSE);
+		lapb_stop_t1timer(lapb);
+		lapb_stop_t2timer(lapb);
+		lapb->state     = LAPB_STATE_3;
+		lapb->condition = 0x00;
+		lapb->n2count   = 0;
+		lapb->vs        = 0;
+		lapb->vr        = 0;
+		lapb->va        = 0;
+		lapb_connect_indication(lapb, LAPB_OK);
+		break;
+	}
+
+	kfree_skb(skb);
+}
+
+/*
+ *	Process an incoming LAPB frame.
+ *
+ *	The frame header is decoded first; a frame that cannot be decoded is
+ *	freed and dropped. Otherwise the frame is handed to the handler for the
+ *	current link state, after which lapb_kick() is run so that any queued
+ *	I frames that may now be sent go out.
+ */
+void lapb_data_input(struct lapb_cb *lapb, struct sk_buff *skb)
+{
+	struct lapb_frame frame;
+
+	if (lapb_decode(lapb, skb, &frame) < 0) {
+		kfree_skb(skb);
+		return;
+	}
+
+	switch (lapb->state) {
+	case LAPB_STATE_0: lapb_state0_machine(lapb, skb, &frame); break;
+	case LAPB_STATE_1: lapb_state1_machine(lapb, skb, &frame); break;
+	case LAPB_STATE_2: lapb_state2_machine(lapb, skb, &frame); break;
+	case LAPB_STATE_3: lapb_state3_machine(lapb, skb, &frame); break;
+	case LAPB_STATE_4: lapb_state4_machine(lapb, skb, &frame); break;
+	}
+
+	lapb_kick(lapb);
+}
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
new file mode 100644
index 00000000..c75a7954
--- /dev/null
+++ b/net/lapb/lapb_out.c
@@ -0,0 +1,224 @@
+/*
+ *	LAPB release 002
+ *
+ *	This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	LAPB 001	Jonathan Naylor	Started Coding
+ *	LAPB 002	Jonathan Naylor	New timer architecture.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+/*
+ *  Prepend the I-frame control field to skb and pass it on for transmission
+ *  as a command. The field carries the current send and receive sequence
+ *  numbers (lapb->vs, lapb->vr) and the poll bit; it is two bytes long in
+ *  extended mode and one byte otherwise. A NULL skb is silently ignored.
+ */
+static void lapb_send_iframe(struct lapb_cb *lapb, struct sk_buff *skb, int poll_bit)
+{
+	unsigned char *ctl;
+
+	if (skb == NULL)
+		return;
+
+	if (lapb->mode & LAPB_EXTENDED) {
+		ctl = skb_push(skb, 2);
+
+		/* byte 0: vs shifted left by 1; byte 1: vr << 1 plus poll bit */
+		ctl[0] = LAPB_I | (lapb->vs << 1);
+		ctl[1] = (poll_bit ? LAPB_EPF : 0) | (lapb->vr << 1);
+	} else {
+		ctl = skb_push(skb, 1);
+
+		/* single byte: vr << 5, poll bit and vs << 1 combined */
+		*ctl = LAPB_I | (poll_bit ? LAPB_SPF : 0) |
+		       (lapb->vr << 5) | (lapb->vs << 1);
+	}
+
+#if LAPB_DEBUG > 1
+	printk(KERN_DEBUG "lapb: (%p) S%d TX I(%d) S%d R%d\n",
+	       lapb->dev, lapb->state, poll_bit, lapb->vs, lapb->vr);
+#endif
+
+	lapb_transmit_buffer(lapb, skb, LAPB_COMMAND);
+}
+
+/*
+ * Send as many queued I frames as the transmit window allows. Nothing is
+ * sent while the peer is flagged busy, the window is exhausted or the write
+ * queue is empty. Sending resumes at V(a) when the ack queue is empty and
+ * at V(s) otherwise. Each frame goes out as a clone while the original is
+ * moved to the ack queue; T1 is started if it is not already running.
+ */
+void lapb_kick(struct lapb_cb *lapb)
+{
+	struct sk_buff *skb, *skbn;
+	unsigned short modulus, start, end;
+
+	modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS;
+	start = !skb_peek(&lapb->ack_queue) ? lapb->va : lapb->vs;
+	end   = (lapb->va + lapb->window) % modulus;
+
+	if ((lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) ||
+	    start == end || !skb_peek(&lapb->write_queue))
+		return;
+
+	lapb->vs = start;
+
+	skb = skb_dequeue(&lapb->write_queue);
+	do {
+		/* Transmit a copy; on allocation failure put the frame back. */
+		skbn = skb_clone(skb, GFP_ATOMIC);
+		if (skbn == NULL) {
+			skb_queue_head(&lapb->write_queue, skb);
+			break;
+		}
+
+		if (skb->sk)
+			skb_set_owner_w(skbn, skb->sk);
+
+		lapb_send_iframe(lapb, skbn, LAPB_POLLOFF);
+		lapb->vs = (lapb->vs + 1) % modulus;
+
+		/* Keep the original until the peer acknowledges it. */
+		skb_queue_tail(&lapb->ack_queue, skb);
+	} while (lapb->vs != end &&
+		 (skb = skb_dequeue(&lapb->write_queue)) != NULL);
+
+	lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+
+	if (!lapb_t1timer_running(lapb))
+		lapb_start_t1timer(lapb);
+}
+
+/*
+ * Prepend the one-byte address field to skb and hand the frame to
+ * lapb_data_transmit(); the skb is freed here if that call returns zero.
+ *
+ * The address is chosen from three inputs: whether the frame is a command
+ * or a response (type), whether LAPB_MLP is set in lapb->mode (addresses
+ * C/D instead of A/B) and whether we are the DCE or the DTE (which swaps
+ * the command and response addresses).
+ *
+ * type is expected to be LAPB_COMMAND or LAPB_RESPONSE; for any other value
+ * the address byte is pushed but not written.
+ */
+void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type)
+{
+	const int is_dce = lapb->mode & LAPB_DCE;
+	unsigned char cmd_addr, rsp_addr;
+	unsigned char *ptr;
+
+	if (lapb->mode & LAPB_MLP) {
+		cmd_addr = is_dce ? LAPB_ADDR_C : LAPB_ADDR_D;
+		rsp_addr = is_dce ? LAPB_ADDR_D : LAPB_ADDR_C;
+	} else {
+		cmd_addr = is_dce ? LAPB_ADDR_A : LAPB_ADDR_B;
+		rsp_addr = is_dce ? LAPB_ADDR_B : LAPB_ADDR_A;
+	}
+
+	ptr = skb_push(skb, 1);
+	if (type == LAPB_COMMAND)
+		*ptr = cmd_addr;
+	if (type == LAPB_RESPONSE)
+		*ptr = rsp_addr;
+
+#if LAPB_DEBUG > 2
+	printk(KERN_DEBUG "lapb: (%p) S%d TX %02X %02X %02X\n",
+	       lapb->dev, lapb->state,
+	       skb->data[0], skb->data[1], skb->data[2]);
+#endif
+
+	if (!lapb_data_transmit(lapb, skb))
+		kfree_skb(skb);
+}
+
+/* Start link set-up: reset counters, send SABM(E) polled, run T1, stop T2. */
+void lapb_establish_data_link(struct lapb_cb *lapb)
+{
+	lapb->n2count   = 0;
+	lapb->condition = 0x00;
+	if (!(lapb->mode & LAPB_EXTENDED)) {
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S%d TX SABM(1)\n",
+		       lapb->dev, lapb->state);
+#endif
+		lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND);
+	} else {
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S%d TX SABME(1)\n",
+		       lapb->dev, lapb->state);
+#endif
+		lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND);
+	}
+
+	lapb_start_t1timer(lapb);
+	lapb_stop_t2timer(lapb);
+}
+
+/* Answer a poll: send an RR response with the poll/final bit set. */
+void lapb_enquiry_response(struct lapb_cb *lapb)
+{
+#if LAPB_DEBUG > 1
+	printk(KERN_DEBUG "lapb: (%p) S%d TX RR(1) R%d\n",
+	       lapb->dev, lapb->state, lapb->vr);
+#endif
+
+	lapb_send_control(lapb, LAPB_RR, LAPB_POLLON, LAPB_RESPONSE);
+	lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+}
+
+/* T2 expiry path: send an RR response (P/F clear), clear the ack-pending flag. */
+void lapb_timeout_response(struct lapb_cb *lapb)
+{
+#if LAPB_DEBUG > 1
+	printk(KERN_DEBUG "lapb: (%p) S%d TX RR(0) R%d\n",
+	       lapb->dev, lapb->state, lapb->vr);
+#endif
+	lapb_send_control(lapb, LAPB_RR, LAPB_POLLOFF, LAPB_RESPONSE);
+	lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+}
+
+void lapb_check_iframes_acked(struct lapb_cb *lapb, unsigned short nr)
+{
+	if (lapb->vs == nr) {		/* nr covers everything sent so far */
+		lapb_frames_acked(lapb, nr);
+		lapb_stop_t1timer(lapb);
+		lapb->n2count = 0;	/* retry counting starts afresh */
+	} else if (lapb->va != nr) {	/* nr covers only part of it */
+		lapb_frames_acked(lapb, nr);
+		lapb_start_t1timer(lapb);	/* restart T1 for the rest */
+	}				/* nr == va: nothing newly acked */
+}
+
+void lapb_check_need_response(struct lapb_cb *lapb, int type, int pf)
+{
+	if (type == LAPB_COMMAND && pf)	/* only a polled command is answered */
+		lapb_enquiry_response(lapb);
+}
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
new file mode 100644
index 00000000..43a2a7fb
--- /dev/null
+++ b/net/lapb/lapb_subr.c
@@ -0,0 +1,313 @@
+/*
+ *	LAPB release 002
+ *
+ *	This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	LAPB 001	Jonathan Naylor	Started Coding
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+/*
+ *	Discard every frame on both the write queue and the ack queue.
+ */
+void lapb_clear_queues(struct lapb_cb *lapb)
+{
+	skb_queue_purge(&lapb->write_queue);
+	skb_queue_purge(&lapb->ack_queue);
+}
+
+/*
+ * Release the frames on the ack queue that the peer has acknowledged,
+ * advancing V(a) by one per frame until it equals nr or the queue is
+ * empty. This replaces the boxes labelled "V(a) <- N(r)" on the SDL diagram.
+ */
+void lapb_frames_acked(struct lapb_cb *lapb, unsigned short nr)
+{
+	struct sk_buff *acked;
+	int modulus = LAPB_SMODULUS;
+
+	if (lapb->mode & LAPB_EXTENDED)
+		modulus = LAPB_EMODULUS;
+
+	while (lapb->va != nr) {
+		acked = skb_dequeue(&lapb->ack_queue);
+		if (acked == NULL)
+			break;
+
+		kfree_skb(acked);
+		lapb->va = (lapb->va + 1) % modulus;
+	}
+}
+
+/*
+ * Move every frame still awaiting acknowledgement from the ack queue back
+ * to the front of the write queue, keeping the frames in their original
+ * order, so that lapb_kick sends them again ahead of any newer data.
+ */
+void lapb_requeue_frames(struct lapb_cb *lapb)
+{
+	struct sk_buff *last = NULL, *skb;
+
+	while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) {
+		if (last != NULL)
+			skb_append(last, skb, &lapb->write_queue);
+		else
+			skb_queue_head(&lapb->write_queue, skb);
+		last = skb;
+	}
+}
+
+/*
+ *	Validate that the value of nr lies in the window from va up to and
+ *	including vs, counting modulo the sequence space. Returns 1 (true)
+ *	if it does and 0 (false) otherwise.
+ */
+int lapb_validate_nr(struct lapb_cb *lapb, unsigned short nr)
+{
+	unsigned short seq;
+	int modulus = LAPB_SMODULUS;
+
+	if (lapb->mode & LAPB_EXTENDED)
+		modulus = LAPB_EMODULUS;
+
+	for (seq = lapb->va; seq != lapb->vs; seq = (seq + 1) % modulus)
+		if (nr == seq)
+			return 1;
+
+	return nr == lapb->vs;
+}
+
+/*
+ *	This routine is the centralised routine for parsing the control
+ *	information for the different frame formats.
+ *
+ *	The address and control bytes are pulled off the front of skb and the
+ *	decoded values stored in *frame. *frame is cleared first so that fields
+ *	the frame does not supply (e.g. cr when the address is not recognised)
+ *	are zero rather than uninitialised when the state machines read them.
+ *
+ *	Returns 0 on success, -1 if the frame is too short to be decoded.
+ */
+int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb,
+		struct lapb_frame *frame)
+{
+	memset(frame, 0, sizeof(*frame));
+	frame->type = LAPB_ILLEGAL;
+
+#if LAPB_DEBUG > 2
+	printk(KERN_DEBUG "lapb: (%p) S%d RX %02X %02X %02X\n",
+	       lapb->dev, lapb->state,
+	       skb->data[0], skb->data[1], skb->data[2]);
+#endif
+
+	/* We always need to look at 2 bytes, sometimes we need
+	 * to look at 3 and those cases are handled below.
+	 */
+	if (!pskb_may_pull(skb, 2))
+		return -1;
+
+	/*
+	 * The address byte distinguishes commands from responses; which
+	 * address means which depends on LAPB_MLP and on our DCE/DTE role.
+	 */
+	if (lapb->mode & LAPB_MLP) {
+		if (lapb->mode & LAPB_DCE) {
+			if (skb->data[0] == LAPB_ADDR_D)
+				frame->cr = LAPB_COMMAND;
+			if (skb->data[0] == LAPB_ADDR_C)
+				frame->cr = LAPB_RESPONSE;
+		} else {
+			if (skb->data[0] == LAPB_ADDR_C)
+				frame->cr = LAPB_COMMAND;
+			if (skb->data[0] == LAPB_ADDR_D)
+				frame->cr = LAPB_RESPONSE;
+		}
+	} else {
+		if (lapb->mode & LAPB_DCE) {
+			if (skb->data[0] == LAPB_ADDR_B)
+				frame->cr = LAPB_COMMAND;
+			if (skb->data[0] == LAPB_ADDR_A)
+				frame->cr = LAPB_RESPONSE;
+		} else {
+			if (skb->data[0] == LAPB_ADDR_A)
+				frame->cr = LAPB_COMMAND;
+			if (skb->data[0] == LAPB_ADDR_B)
+				frame->cr = LAPB_RESPONSE;
+		}
+	}
+
+	skb_pull(skb, 1);
+
+	if (lapb->mode & LAPB_EXTENDED) {
+		if (!(skb->data[0] & LAPB_S)) {
+			if (!pskb_may_pull(skb, 2))
+				return -1;
+			/* I frame - carries NR/NS/PF */
+			frame->type       = LAPB_I;
+			frame->ns         = (skb->data[0] >> 1) & 0x7F;
+			frame->nr         = (skb->data[1] >> 1) & 0x7F;
+			frame->pf         = skb->data[1] & LAPB_EPF;
+			frame->control[0] = skb->data[0];
+			frame->control[1] = skb->data[1];
+			skb_pull(skb, 2);
+		} else if ((skb->data[0] & LAPB_U) == 1) {
+			if (!pskb_may_pull(skb, 2))
+				return -1;
+			/* S frame - take out PF/NR */
+			frame->type       = skb->data[0] & 0x0F;
+			frame->nr         = (skb->data[1] >> 1) & 0x7F;
+			frame->pf         = skb->data[1] & LAPB_EPF;
+			frame->control[0] = skb->data[0];
+			frame->control[1] = skb->data[1];
+			skb_pull(skb, 2);
+		} else if ((skb->data[0] & LAPB_U) == 3) {
+			/* U frame - take out PF */
+			frame->type       = skb->data[0] & ~LAPB_SPF;
+			frame->pf         = skb->data[0] & LAPB_SPF;
+			frame->control[0] = skb->data[0];
+			frame->control[1] = 0x00;
+			skb_pull(skb, 1);
+		}
+	} else {
+		if (!(skb->data[0] & LAPB_S)) {
+			/* I frame - carries NR/NS/PF */
+			frame->type = LAPB_I;
+			frame->ns   = (skb->data[0] >> 1) & 0x07;
+			frame->nr   = (skb->data[0] >> 5) & 0x07;
+			frame->pf   = skb->data[0] & LAPB_SPF;
+		} else if ((skb->data[0] & LAPB_U) == 1) {
+			/* S frame - take out PF/NR */
+			frame->type = skb->data[0] & 0x0F;
+			frame->nr   = (skb->data[0] >> 5) & 0x07;
+			frame->pf   = skb->data[0] & LAPB_SPF;
+		} else if ((skb->data[0] & LAPB_U) == 3) {
+			/* U frame - take out PF */
+			frame->type = skb->data[0] & ~LAPB_SPF;
+			frame->pf   = skb->data[0] & LAPB_SPF;
+		}
+
+		frame->control[0] = skb->data[0];
+
+		skb_pull(skb, 1);
+	}
+
+	return 0;
+}
+
+/*
+ *	Build and transmit a supervisory or unnumbered frame that the HDLC layer
+ *	generates itself (e.g. RR, UA). frametype is the control byte value,
+ *	poll_bit selects the P/F bit and type (command or response) is passed
+ *	on to lapb_transmit_buffer(). If no skb can be allocated the frame is
+ *	silently not sent. FRMRs are handled by lapb_transmit_frmr below.
+ */
+void lapb_send_control(struct lapb_cb *lapb, int frametype,
+		       int poll_bit, int type)
+{
+	struct sk_buff *skb;
+	unsigned char  *dptr;
+
+	skb = alloc_skb(LAPB_HEADER_LEN + 3, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, LAPB_HEADER_LEN + 1);
+
+	if (!(lapb->mode & LAPB_EXTENDED)) {
+		/* One control byte; S frames carry NR in its top bits. */
+		dptr  = skb_put(skb, 1);
+		*dptr = frametype | (poll_bit ? LAPB_SPF : 0);
+		if ((frametype & LAPB_U) == LAPB_S)
+			*dptr |= (lapb->vr << 5);
+	} else if ((frametype & LAPB_U) == LAPB_U) {
+		/* Extended mode, U frame: still a single control byte. */
+		dptr  = skb_put(skb, 1);
+		*dptr = frametype | (poll_bit ? LAPB_SPF : 0);
+	} else {
+		/* Extended mode, other frames: NR and P/F in a second byte. */
+		dptr    = skb_put(skb, 2);
+		dptr[0] = frametype;
+		dptr[1] = (lapb->vr << 1) | (poll_bit ? LAPB_EPF : 0);
+	}
+
+	lapb_transmit_buffer(lapb, skb, type);
+}
+
+/*
+ *	This routine generates FRMRs based on information previously stored in
+ *	the LAPB control block (frmr_data and frmr_type). The frame holds the
+ *	rejected control field, lapb->vs and lapb->vr with a flag telling if the
+ *	rejected frame was a response, and frmr_type. Not sent if alloc fails.
+ */
+void lapb_transmit_frmr(struct lapb_cb *lapb)
+{
+	struct sk_buff *skb;
+	unsigned char  *dptr;
+
+	skb = alloc_skb(LAPB_HEADER_LEN + 7, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, LAPB_HEADER_LEN + 1);
+
+	if (lapb->mode & LAPB_EXTENDED) {
+		dptr    = skb_put(skb, 6);
+		dptr[0] = LAPB_FRMR;
+		dptr[1] = lapb->frmr_data.control[0];
+		dptr[2] = lapb->frmr_data.control[1];
+		dptr[3] = (lapb->vs << 1) & 0xFE;
+		dptr[4] = (lapb->vr << 1) & 0xFE;
+		if (lapb->frmr_data.cr == LAPB_RESPONSE)
+			dptr[4] |= 0x01;
+		dptr[5] = lapb->frmr_type;
+
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG
+		       "lapb: (%p) S%d TX FRMR %02X %02X %02X %02X %02X\n",
+		       lapb->dev, lapb->state, skb->data[1], skb->data[2],
+		       skb->data[3], skb->data[4], skb->data[5]);
+#endif
+	} else {
+		dptr    = skb_put(skb, 4);
+		dptr[0] = LAPB_FRMR;
+		dptr[1] = lapb->frmr_data.control[0];
+		dptr[2] = ((lapb->vs << 1) & 0x0E) | ((lapb->vr << 5) & 0xE0);
+		if (lapb->frmr_data.cr == LAPB_RESPONSE)
+			dptr[2] |= 0x10;
+		dptr[3] = lapb->frmr_type;
+
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S%d TX FRMR %02X %02X %02X\n",
+		       lapb->dev, lapb->state, skb->data[1],
+		       skb->data[2], skb->data[3]);
+#endif
+	}
+
+	lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE);
+}
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
new file mode 100644
index 00000000..af6d14b4
--- /dev/null
+++ b/net/lapb/lapb_timer.c
@@ -0,0 +1,189 @@
+/*
+ *	LAPB release 002
+ *
+ *	This code REQUIRES 2.1.15 or higher/ NET3.038
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	LAPB 001	Jonathan Naylor	Started Coding
+ *	LAPB 002	Jonathan Naylor	New timer architecture.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/lapb.h>
+
+static void lapb_t1timer_expiry(unsigned long);
+static void lapb_t2timer_expiry(unsigned long);
+
+/* (Re)arm T1 so that it expires lapb->t1 jiffies from now. */
+void lapb_start_t1timer(struct lapb_cb *lapb)
+{
+	del_timer(&lapb->t1timer);
+
+	lapb->t1timer.expires  = jiffies + lapb->t1;
+	lapb->t1timer.function = lapb_t1timer_expiry;
+	lapb->t1timer.data     = (unsigned long)lapb;
+	add_timer(&lapb->t1timer);
+}
+
+/* (Re)arm T2 so that it expires lapb->t2 jiffies from now. */
+void lapb_start_t2timer(struct lapb_cb *lapb)
+{
+	del_timer(&lapb->t2timer);
+
+	lapb->t2timer.expires  = jiffies + lapb->t2;
+	lapb->t2timer.function = lapb_t2timer_expiry;
+	lapb->t2timer.data     = (unsigned long)lapb;
+	add_timer(&lapb->t2timer);
+}
+
+void lapb_stop_t1timer(struct lapb_cb *lapb)
+{
+	del_timer(&lapb->t1timer);	/* cancel T1 if it is pending */
+}
+
+void lapb_stop_t2timer(struct lapb_cb *lapb)
+{
+	del_timer(&lapb->t2timer);	/* cancel T2 if it is pending */
+}
+
+int lapb_t1timer_running(struct lapb_cb *lapb)
+{
+	return timer_pending(&lapb->t1timer);	/* true while T1 is armed */
+}
+
+static void lapb_t2timer_expiry(unsigned long param)
+{
+	struct lapb_cb *lapb = (struct lapb_cb *)param;
+
+	if (!(lapb->condition & LAPB_ACK_PENDING_CONDITION))
+		return;		/* no acknowledgement is waiting to go out */
+	lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
+	lapb_timeout_response(lapb);
+}
+
+/*
+ *	T1 expiry: retry whatever the current state is waiting on and restart T1,
+ *	or after N2 retries give up, drop to state 0 and return without T1.
+ */
+static void lapb_t1timer_expiry(unsigned long param)
+{
+	struct lapb_cb *lapb = (struct lapb_cb *)param;
+
+	switch (lapb->state) {
+
+	/*
+	 *	If we are a DCE, keep going DM .. DM .. DM
+	 */
+	case LAPB_STATE_0:
+		if (lapb->mode & LAPB_DCE)
+			lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE);
+		break;
+
+	/*
+	 *	Awaiting connection state, send SABM(E), up to N2 times.
+	 */
+	case LAPB_STATE_1:
+		if (lapb->n2count == lapb->n2) {
+			lapb_clear_queues(lapb);
+			lapb->state = LAPB_STATE_0;
+			lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", lapb->dev);
+#endif
+			return;
+		}
+		lapb->n2count++;
+		if (lapb->mode & LAPB_EXTENDED) {
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 TX SABME(1)\n", lapb->dev);
+#endif
+			lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND);
+		} else {
+#if LAPB_DEBUG > 1
+			printk(KERN_DEBUG "lapb: (%p) S1 TX SABM(1)\n", lapb->dev);
+#endif
+			lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND);
+		}
+		break;
+
+	/*
+	 *	Awaiting disconnection state, send DISC, up to N2 times.
+	 */
+	case LAPB_STATE_2:
+		if (lapb->n2count == lapb->n2) {
+			lapb_clear_queues(lapb);
+			lapb->state = LAPB_STATE_0;
+			lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT);
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", lapb->dev);
+#endif
+			return;
+		}
+		lapb->n2count++;
+#if LAPB_DEBUG > 1
+		printk(KERN_DEBUG "lapb: (%p) S2 TX DISC(1)\n", lapb->dev);
+#endif
+		lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
+		break;
+
+	/*
+	 *	Data transfer state, retransmit I frames, up to N2 times.
+	 */
+	case LAPB_STATE_3:
+		if (lapb->n2count == lapb->n2) {
+			lapb_clear_queues(lapb);
+			lapb->state = LAPB_STATE_0;
+			lapb_stop_t2timer(lapb);
+			lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", lapb->dev);
+#endif
+			return;
+		}
+		lapb->n2count++;
+		lapb_requeue_frames(lapb);
+		break;
+
+	/*
+	 *	Frame reject state, retransmit FRMR frames, up to N2 times.
+	 */
+	case LAPB_STATE_4:
+		if (lapb->n2count == lapb->n2) {
+			lapb_clear_queues(lapb);
+			lapb->state = LAPB_STATE_0;
+			lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
+#if LAPB_DEBUG > 0
+			printk(KERN_DEBUG "lapb: (%p) S4 -> S0\n", lapb->dev);
+#endif
+			return;
+		}
+		lapb->n2count++;
+		lapb_transmit_frmr(lapb);
+		break;
+	}
+
+	lapb_start_t1timer(lapb);
+}
diff --git a/net/llc/Kconfig b/net/llc/Kconfig
new file mode 100644
index 00000000..b91c6510
--- /dev/null
+++ b/net/llc/Kconfig
@@ -0,0 +1,10 @@
+config LLC
+	tristate
+	depends on NET
+
+config LLC2
+	tristate "ANSI/IEEE 802.2 LLC type 2 Support"
+	select LLC
+	help
+	  This enables connection-oriented Logical Link Control (LLC) type 2
+	  support. Select this if you want to have support for PF_LLC sockets.
diff --git a/net/llc/Makefile b/net/llc/Makefile
new file mode 100644
index 00000000..4e260cff
--- /dev/null
+++ b/net/llc/Makefile
@@ -0,0 +1,25 @@
+###########################################################################
+# Makefile for the Linux 802.2 LLC (fully-functional) layer.
+#
+# Copyright (c) 1997 by Procom Technology,Inc.
+#		2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+#
+# This program can be redistributed or modified under the terms of the 
+# GNU General Public License as published by the Free Software Foundation.
+# This program is distributed without any warranty or implied warranty
+# of merchantability or fitness for a particular purpose.
+#
+# See the GNU General Public License for more details.
+###########################################################################
+
+obj-$(CONFIG_LLC) += llc.o
+
+llc-y := llc_core.o llc_input.o llc_output.o
+
+obj-$(CONFIG_LLC2) += llc2.o
+
+llc2-y := llc_if.o llc_c_ev.o llc_c_ac.o llc_conn.o llc_c_st.o llc_pdu.o \
+	  llc_sap.o llc_s_ac.o llc_s_ev.o llc_s_st.o af_llc.o llc_station.o
+
+llc2-$(CONFIG_PROC_FS) += llc_proc.o
+llc2-$(CONFIG_SYSCTL)  += sysctl_net_llc.o
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
new file mode 100644
index 00000000..a18e6c3d
--- /dev/null
+++ b/net/llc/af_llc.c
@@ -0,0 +1,1248 @@
+/*
+ * af_llc.c - LLC User Interface SAPs
+ * Description:
+ *   Functions in this module are implementation of socket based llc
+ *   communications for the Linux operating system. Support of llc class
+ *   one and class two is provided via SOCK_DGRAM and SOCK_STREAM
+ *   respectively.
+ *
+ *   An llc2 connection is (mac + sap), only one llc2 sap connection
+ *   is allowed per mac. Though one sap may have multiple mac + sap
+ *   connections.
+ *
+ * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org>
+ *		 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/llc.h>
+#include <net/llc_sap.h>
+#include <net/llc_pdu.h>
+#include <net/llc_conn.h>
+#include <net/tcp_states.h>
+
+/* remember: uninitialized global data is zeroed because it's in .bss */
+static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
+static u16 llc_ui_sap_link_no_max[256];
+static struct sockaddr_llc llc_ui_addrnull;
+static const struct proto_ops llc_ui_ops;
+
+static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
+static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
+
+#if 0
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+/* Maybe we'll add some more in the future. */
+#define LLC_CMSG_PKTINFO	1
+
+
+/**
+ *	llc_ui_next_link_no - hand out the next link number for a sap
+ *	@sap: SAP value used to index the per-sap link number counters.
+ *
+ *	Returns the current counter value stored for @sap and advances that
+ *	counter by one for the next caller.
+ */
+static inline u16 llc_ui_next_link_no(int sap)
+{
+	const u16 link_no = llc_ui_sap_link_no_max[sap];
+
+	llc_ui_sap_link_no_max[sap] = link_no + 1;
+	return link_no;
+}
+
+/**
+ *	llc_proto_type - map an ARP hardware type to an ethernet protocol
+ *	@arphrd: ARP header type.
+ *
+ *	ARPHRD_IEEE802_TR maps to ETH_P_TR_802_2; every other hardware type
+ *	maps to ETH_P_802_2. The result is passed through htons().
+ */
+static inline __be16 llc_proto_type(u16 arphrd)
+{
+	if (arphrd == ARPHRD_IEEE802_TR)
+		return htons(ETH_P_TR_802_2);
+	return htons(ETH_P_802_2);
+}
+
+/**
+ *	llc_ui_addr_null - test whether an address structure is null
+ *	@addr: Address to test.
+ *
+ *	Compares @addr byte for byte against the llc_ui_addrnull template.
+ *	Returns 1 when the two are identical, 0 otherwise.
+ */
+static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr)
+{
+	return memcmp(addr, &llc_ui_addrnull, sizeof(*addr)) == 0;
+}
+
+/**
+ *	llc_ui_header_len - return length of llc header based on operation
+ *	@sk: Socket whose type (sk_type) selects the PDU format.
+ *	@addr: Complete sockaddr_llc structure received from the user.
+ *
+ *	TEST and XID requests (sllc_test / sllc_xid set in @addr) always use
+ *	the U-format header length. Otherwise SOCK_STREAM sockets use the
+ *	I-format header length and all other sockets the U-format length.
+ */
+static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr)
+{
+	if (addr->sllc_test || addr->sllc_xid)
+		return LLC_PDU_LEN_U;
+	return sk->sk_type == SOCK_STREAM ? LLC_PDU_LEN_I : LLC_PDU_LEN_U;
+}
+
+/**
+ *	llc_ui_send_data - send data via reliable llc2 connection
+ *	@sk: Connection the socket is using.
+ *	@skb: Data the user wishes to send.
+ *	@noblock: non-zero if the caller must not block; passed to
+ *		  sock_sndtimeo() to derive the wait timeout.
+ *
+ *	If llc_data_accept_state() is non-zero for the current state, or the
+ *	remote_busy_flag or p_flag is set, first wait in
+ *	llc_ui_wait_for_busy_core(); then hand @skb to
+ *	llc_build_and_send_pkt().
+ *	Returns 0 upon success, non-zero if action did not succeed.
+ */
+static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
+{
+	struct llc_sock* llc = llc_sk(sk);
+	int rc = 0;
+
+	if (unlikely(llc_data_accept_state(llc->state) ||
+		     llc->remote_busy_flag ||
+		     llc->p_flag)) {
+		long timeout = sock_sndtimeo(sk, noblock);
+
+		rc = llc_ui_wait_for_busy_core(sk, timeout);
+	}
+	/*
+	 * rc == 0 is the common case (no wait was needed, or the wait
+	 * succeeded), so this branch must not be hinted as unlikely().
+	 */
+	if (!rc)
+		rc = llc_build_and_send_pkt(sk, skb);
+	return rc;
+}
+
+/*
+ * Attach @sk to @sock with sock_graft(), copy the socket type into the sock
+ * and install the PF_LLC proto_ops (llc_ui_ops) on the socket.
+ */
+static void llc_ui_sk_init(struct socket *sock, struct sock *sk)
+{
+	sock_graft(sk, sock);
+	sk->sk_type	= sock->type;
+	sock->ops	= &llc_ui_ops;
+}
+
+/*
+ * Protocol descriptor for llc_sock objects: handed to llc_sk_alloc() in
+ * llc_ui_create() and registered/unregistered in llc2_init()/llc2_exit().
+ */
+static struct proto llc_proto = {
+	.name	  = "LLC",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct llc_sock),
+	.slab_flags = SLAB_DESTROY_BY_RCU,
+};
+
+/**
+ *	llc_ui_create - alloc and init a new llc_ui socket
+ *	@net: network namespace (must be the initial namespace, init_net)
+ *	@sock: Socket to initialize and attach allocated sk to.
+ *	@protocol: Unused.
+ *	@kern: Unused.
+ *
+ *	Requires CAP_NET_RAW. Only SOCK_DGRAM and SOCK_STREAM sockets are
+ *	supported.
+ *	Returns 0 upon success, -EPERM without the capability, -EAFNOSUPPORT
+ *	outside init_net, -ESOCKTNOSUPPORT for other socket types and -ENOMEM
+ *	if the sock cannot be allocated.
+ */
+static int llc_ui_create(struct net *net, struct socket *sock, int protocol,
+			 int kern)
+{
+	struct sock *sk;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	if (unlikely(sock->type != SOCK_DGRAM && sock->type != SOCK_STREAM))
+		return -ESOCKTNOSUPPORT;
+
+	sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	llc_ui_sk_init(sock, sk);
+	return 0;
+}
+
+/**
+ *	llc_ui_release - shutdown socket
+ *	@sock: Socket to release.
+ *
+ *	Sends a disconnect and, if that was accepted, waits for it for up to
+ *	the socket receive timeout; removes a bound socket from its sap;
+ *	drops the device reference and frees the sock.
+ *	Always returns 0.
+ */
+static int llc_ui_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc;
+
+	if (unlikely(!sk))
+		return 0;
+
+	/* Keep the sock alive across release_sock() below. */
+	sock_hold(sk);
+	lock_sock(sk);
+	llc = llc_sk(sk);
+	dprintk("%s: closing local(%02X) remote(%02X)\n", __func__,
+		llc->laddr.lsap, llc->daddr.lsap);
+	if (llc_send_disc(sk) == 0)
+		llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		llc_sap_remove_socket(llc->sap, sk);
+	release_sock(sk);
+	if (llc->dev != NULL)
+		dev_put(llc->dev);
+	sock_put(sk);
+	llc_sk_free(sk);
+	return 0;
+}
+
+/**
+ *	llc_ui_autoport - provide a dynamically allocated SAP number
+ *
+ *	Scans SAP values in steps of two, starting at the last value handed
+ *	out and stopping before LLC_SAP_DYN_STOP, for one that
+ *	llc_sap_find() does not know. After an unsuccessful scan the start
+ *	position is reset to LLC_SAP_DYN_START; at most LLC_SAP_DYN_TRIES
+ *	scans are made.
+ *	Returns: 0 upon failure, the SAP number otherwise.
+ */
+static int llc_ui_autoport(void)
+{
+	int attempt, port;
+
+	for (attempt = 0; attempt < LLC_SAP_DYN_TRIES; attempt++) {
+		for (port = llc_ui_sap_last_autoport;
+		     port < LLC_SAP_DYN_STOP; port += 2) {
+			struct llc_sap *sap = llc_sap_find(port);
+
+			if (sap == NULL) {
+				/* Free value: remember where to resume. */
+				llc_ui_sap_last_autoport = port + 2;
+				return port;
+			}
+			llc_sap_put(sap);
+		}
+		llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
+	}
+	return 0;
+}
+
+/**
+ *	llc_ui_autobind - automatically bind a socket to a sap
+ *	@sock: socket to bind
+ *	@addr: address to connect to
+ *
+ *	Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't
+ *	specifically used llc_ui_bind to bind to a specific address/sap.
+ *
+ *	The device is looked up into a local pointer and only stored in
+ *	llc->dev once the bind can no longer fail. On every failure path the
+ *	device reference is dropped again, so a retry on the still unbound
+ *	socket cannot overwrite (and leak) a held device.
+ *
+ *	Returns: 0 upon success, -EINVAL if the socket is already bound,
+ *	-ENODEV if no matching device exists, -EUSERS if no dynamic SAP is
+ *	free and -EBUSY if the SAP cannot be opened.
+ */
+static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
+{
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	struct net_device *dev = NULL;
+	struct llc_sap *sap;
+	int rc = -EINVAL;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		goto out;
+	rc = -ENODEV;
+	if (sk->sk_bound_dev_if) {
+		dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
+		if (dev && addr->sllc_arphrd != dev->type) {
+			dev_put(dev);
+			dev = NULL;
+		}
+	} else
+		dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
+	if (!dev)
+		goto out;
+	rc = -EUSERS;
+	llc->laddr.lsap = llc_ui_autoport();
+	if (!llc->laddr.lsap)
+		goto out;
+	rc = -EBUSY; /* some other network layer is using the sap */
+	sap = llc_sap_open(llc->laddr.lsap, NULL);
+	if (!sap)
+		goto out;
+	/* Nothing below can fail: hand the device reference to the socket. */
+	llc->dev = dev;
+	dev = NULL;
+	memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN);
+	memcpy(&llc->addr, addr, sizeof(llc->addr));
+	/* assign new connection to its SAP */
+	llc_sap_add_socket(sap, sk);
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	rc = 0;
+out:
+	/* Still set only on failure: release the unused reference. */
+	if (dev)
+		dev_put(dev);
+	return rc;
+}
+
+/**
+ *	llc_ui_bind - bind a socket to a specific address.
+ *	@sock: Socket to bind an address to.
+ *	@uaddr: Address the user wants the socket bound to.
+ *	@addrlen: Length of the uaddr structure.
+ *
+ *	Bind a socket to a specific address. For llc a user is able to bind to
+ *	a specific sap only or mac + sap.
+ *	If the user desires to bind to a specific mac + sap, it is possible to
+ *	have multiple sap connections via multiple macs.
+ *	Bind and autobind for that matter must enforce the correct sap usage
+ *	otherwise all hell will break loose.
+ *
+ *	The device is looked up into a local pointer and only stored in
+ *	llc->dev when the bind succeeds. On failure the reference taken with
+ *	dev_hold() is dropped again, so a later bind attempt on the still
+ *	unbound socket cannot overwrite (and leak) a held device.
+ *
+ *	Returns: 0 upon success, negative otherwise (-EINVAL, -EAFNOSUPPORT,
+ *	-ENODEV, -EUSERS, -EBUSY or -EADDRINUSE).
+ */
+static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
+{
+	struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	struct net_device *dev = NULL;
+	struct llc_sap *sap;
+	int rc = -EINVAL;
+
+	dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
+	if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
+		goto out;
+	rc = -EAFNOSUPPORT;
+	if (unlikely(addr->sllc_family != AF_LLC))
+		goto out;
+	rc = -ENODEV;
+	rcu_read_lock();
+	if (sk->sk_bound_dev_if) {
+		dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
+		if (dev) {
+			/* Fill in the parts of the address the user left empty. */
+			if (!addr->sllc_arphrd)
+				addr->sllc_arphrd = dev->type;
+			if (llc_mac_null(addr->sllc_mac))
+				memcpy(addr->sllc_mac, dev->dev_addr,
+				       IFHWADDRLEN);
+			if (addr->sllc_arphrd != dev->type ||
+			    !llc_mac_match(addr->sllc_mac,
+					   dev->dev_addr)) {
+				rc = -EINVAL;
+				dev = NULL;
+			}
+		}
+	} else
+		dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
+					  addr->sllc_mac);
+	/* The _rcu lookups take no reference; grab one before unlocking. */
+	if (dev)
+		dev_hold(dev);
+	rcu_read_unlock();
+	if (!dev)
+		goto out;
+	if (!addr->sllc_sap) {
+		rc = -EUSERS;
+		addr->sllc_sap = llc_ui_autoport();
+		if (!addr->sllc_sap)
+			goto out;
+	}
+	sap = llc_sap_find(addr->sllc_sap);
+	if (!sap) {
+		sap = llc_sap_open(addr->sllc_sap, NULL);
+		rc = -EBUSY; /* some other network layer is using the sap */
+		if (!sap)
+			goto out;
+	} else {
+		struct llc_addr laddr, daddr;
+		struct sock *ask;
+
+		memset(&laddr, 0, sizeof(laddr));
+		memset(&daddr, 0, sizeof(daddr));
+		/*
+		 * FIXME: check if the address is multicast,
+		 * 	  only SOCK_DGRAM can do this.
+		 */
+		memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN);
+		laddr.lsap = addr->sllc_sap;
+		rc = -EADDRINUSE; /* mac + sap clash. */
+		ask = llc_lookup_established(sap, &daddr, &laddr);
+		if (ask) {
+			sock_put(ask);
+			goto out_put;
+		}
+	}
+	/* Nothing below can fail: hand the device reference to the socket. */
+	llc->dev = dev;
+	dev = NULL;
+	llc->laddr.lsap = addr->sllc_sap;
+	memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN);
+	memcpy(&llc->addr, addr, sizeof(llc->addr));
+	/* assign new connection to its SAP */
+	llc_sap_add_socket(sap, sk);
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	rc = 0;
+out_put:
+	llc_sap_put(sap);
+out:
+	/* Still set only on failure: release the unused reference. */
+	if (dev)
+		dev_put(dev);
+	return rc;
+}
+
+/**
+ *	llc_ui_shutdown - shut down a connected llc2 socket.
+ *	@sock: Socket to shut down.
+ *	@how: Which part of the socket to shut down; only 2 (both sends and
+ *	      receives) is accepted.
+ *
+ *	Sends a disconnect on an established connection and waits for it for
+ *	up to the socket receive timeout. Shutting down only half of the
+ *	connection is not supported.
+ *	Returns: 0 upon success, -ENOTCONN if the socket is not established,
+ *	-EINVAL if @how is not 2, otherwise the error from sending or
+ *	waiting for the disconnect.
+ */
+static int llc_ui_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+		rc = -ENOTCONN;
+		goto unlock;
+	}
+	if (how != 2) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+	rc = llc_send_disc(sk);
+	if (rc == 0)
+		rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+	/* Wake up anyone sleeping in poll */
+	sk->sk_state_change(sk);
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+/**
+ *	llc_ui_connect - Connect to a remote llc2 mac + sap.
+ *	@sock: Socket which will be connected to the remote destination.
+ *	@uaddr: Remote and possibly the local address of the new connection.
+ *	@addrlen: Size of uaddr structure.
+ *	@flags: Operational flags specified by the user.
+ *
+ *	Connect to a remote llc2 mac + sap. The caller must specify the
+ *	destination mac and address to connect to. If the user hasn't previously
+ *	called bind(2) with a smac the address of the first interface of the
+ *	specified arp type will be used.
+ *	This function will autobind if user did not previously call bind.
+ *	Returns: 0 upon success, negative otherwise: -EINVAL for a wrong
+ *	@addrlen, -EAFNOSUPPORT for a wrong family or a non SOCK_STREAM
+ *	socket, -EALREADY while a connect is in progress, or the error from
+ *	autobind, connection establishment or the wait.
+ */
+static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
+			  int addrlen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
+	int rc = -EINVAL;
+
+	lock_sock(sk);
+	if (unlikely(addrlen != sizeof(*addr)))
+		goto out;
+	rc = -EAFNOSUPPORT;
+	if (unlikely(addr->sllc_family != AF_LLC))
+		goto out;
+	/* rc is still -EAFNOSUPPORT here, so that is what non-stream gets. */
+	if (unlikely(sk->sk_type != SOCK_STREAM))
+		goto out;
+	rc = -EALREADY;
+	if (unlikely(sock->state == SS_CONNECTING))
+		goto out;
+	/* bind connection to sap if user hasn't done it. */
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		/* bind to sap with null dev, exclusive */
+		rc = llc_ui_autobind(sock, addr);
+		if (rc)
+			goto out;
+	}
+	llc->daddr.lsap = addr->sllc_sap;
+	memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN);
+	sock->state = SS_CONNECTING;
+	sk->sk_state   = TCP_SYN_SENT;
+	llc->link   = llc_ui_next_link_no(llc->sap->laddr.lsap);
+	rc = llc_establish_connection(sk, llc->dev->dev_addr,
+				      addr->sllc_mac, addr->sllc_sap);
+	if (rc) {
+		dprintk("%s: llc_ui_send_conn failed :-(\n", __func__);
+		sock->state  = SS_UNCONNECTED;
+		sk->sk_state = TCP_CLOSE;
+		goto out;
+	}
+
+	if (sk->sk_state == TCP_SYN_SENT) {
+		const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+		/*
+		 * NOTE(review): rc is 0 at this point, so a zero timeout or
+		 * a wait that returns 0 leaves the function with rc == 0
+		 * while sock->state is still SS_CONNECTING - confirm this
+		 * is intended.
+		 */
+		if (!timeo || !llc_ui_wait_for_conn(sk, timeo))
+			goto out;
+
+		rc = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			goto out;
+	}
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto sock_error;
+
+	sock->state = SS_CONNECTED;
+	rc = 0;
+out:
+	release_sock(sk);
+	return rc;
+sock_error:
+	/* Report the pending socket error, or -ECONNABORTED if none. */
+	rc = sock_error(sk) ? : -ECONNABORTED;
+	sock->state = SS_UNCONNECTED;
+	goto out;
+}
+
+/**
+ *	llc_ui_listen - allow a normal socket to accept incoming connections
+ *	@sock: Socket to allow incoming connections on.
+ *	@backlog: Number of connections to queue; 0 is treated as 1.
+ *
+ *	Puts a bound, unconnected SOCK_STREAM socket into TCP_LISTEN state.
+ *	Returns 0 upon success, -EINVAL if the socket is not unconnected,
+ *	-EOPNOTSUPP if it is not SOCK_STREAM and -EAGAIN if it is not bound.
+ */
+static int llc_ui_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+	if (unlikely(sock->state != SS_UNCONNECTED)) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+	if (unlikely(sk->sk_type != SOCK_STREAM)) {
+		rc = -EOPNOTSUPP;
+		goto unlock;
+	}
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		rc = -EAGAIN;
+		goto unlock;
+	}
+	/* BSDism: a backlog of zero still allows one pending connection. */
+	sk->sk_max_ack_backlog = backlog ? backlog : 1;
+	if (sk->sk_state != TCP_LISTEN) {
+		sk->sk_ack_backlog = 0;
+		sk->sk_state = TCP_LISTEN;
+	}
+	sk->sk_socket->flags |= __SO_ACCEPTCON;
+	rc = 0;
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+/*
+ * Wait until the socket reaches TCP_CLOSE.
+ *
+ * Returns 0 when sk_wait_event() reports that sk->sk_state == TCP_CLOSE,
+ * -ERESTARTSYS when a signal is pending and -EAGAIN when @timeout has
+ * dropped to zero.
+ */
+static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
+{
+	DEFINE_WAIT(wait);
+	int rc = 0;
+
+	while (1) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE))
+			break;
+		rc = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+		rc = -EAGAIN;
+		if (!timeout)
+			break;
+		/* Neither signal nor timeout: reset rc and wait again. */
+		rc = 0;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return rc;
+}
+
+/*
+ * Wait until the socket leaves TCP_SYN_SENT, a signal is pending or
+ * @timeout has dropped to zero.
+ *
+ * Returns the value left in @timeout when the loop ends (converted from
+ * long to int); the caller in llc_ui_connect() treats 0 as "stop waiting".
+ */
+static int llc_ui_wait_for_conn(struct sock *sk, long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	while (1) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT))
+			break;
+		if (signal_pending(current) || !timeout)
+			break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return timeout;
+}
+
+/*
+ * Wait until the connection can take data again.
+ *
+ * The wait ends successfully (returns 0) when sk_wait_event() reports that
+ * either RCV_SHUTDOWN is set on the socket, or llc_data_accept_state() is
+ * zero for the current state and both remote_busy_flag and p_flag are
+ * clear. Returns -ERESTARTSYS when a signal is pending and -EAGAIN when
+ * @timeout has dropped to zero.
+ */
+static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
+{
+	DEFINE_WAIT(wait);
+	struct llc_sock *llc = llc_sk(sk);
+	int rc;
+
+	while (1) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		rc = 0;
+		if (sk_wait_event(sk, &timeout,
+				  (sk->sk_shutdown & RCV_SHUTDOWN) ||
+				  (!llc_data_accept_state(llc->state) &&
+				   !llc->remote_busy_flag &&
+				   !llc->p_flag)))
+			break;
+		rc = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+		rc = -EAGAIN;
+		if (!timeout)
+			break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	return rc;
+}
+
+/*
+ * Wait for data on the socket, checking in this order on every pass:
+ * pending socket error (returned as is), RCV_SHUTDOWN (returns 0), a zero
+ * timeout (returns -EAGAIN), a pending signal (returns
+ * sock_intr_errno(timeo)), and finally sk_wait_data(), whose non-zero
+ * result ends the wait with 0.
+ */
+static int llc_wait_data(struct sock *sk, long timeo)
+{
+	int rc;
+
+	while (1) {
+		/*
+		 * POSIX 1003.1g mandates this order.
+		 */
+		rc = sock_error(sk);
+		if (rc)
+			break;
+		rc = 0;
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+		rc = -EAGAIN;
+		if (!timeo)
+			break;
+		rc = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		rc = 0;
+		if (sk_wait_data(sk, &timeo))
+			break;
+	}
+	return rc;
+}
+
+/*
+ * Attach the ancillary data requested via cmsg_flags to @msg.
+ *
+ * For LLC_CMSG_PKTINFO this is a struct llc_pktinfo carrying the interface
+ * index of the socket's device plus the DSAP and destination MAC decoded
+ * from @skb.
+ */
+static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(skb->sk);
+
+	if (llc->cmsg_flags & LLC_CMSG_PKTINFO) {
+		struct llc_pktinfo info;
+
+		/*
+		 * The whole structure is copied to user space by put_cmsg();
+		 * clear it first so padding bytes do not leak stack contents.
+		 */
+		memset(&info, 0, sizeof(info));
+		info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex;
+		llc_pdu_decode_dsap(skb, &info.lpi_sap);
+		llc_pdu_decode_da(skb, info.lpi_mac);
+		put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info);
+	}
+}
+
+/**
+ *	llc_ui_accept - accept a new incoming connection.
+ *	@sock: Socket which connections arrive on.
+ *	@newsock: Socket to move incoming connection to.
+ *	@flags: User specified operational flags (not used here).
+ *
+ *	Accept a new incoming connection. The new connection is taken from
+ *	the sock attached to the first skb on the listening socket's receive
+ *	queue; if the queue is empty the call waits for up to the receive
+ *	timeout.
+ *	Returns 0 upon success, negative otherwise: -EOPNOTSUPP if the socket
+ *	is not SOCK_STREAM, -EINVAL if it is not listening or the dequeued
+ *	skb carries no sock, or the error from llc_wait_data().
+ */
+static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk = sock->sk, *newsk;
+	struct llc_sock *llc, *newllc;
+	struct sk_buff *skb;
+	int rc = -EOPNOTSUPP;
+
+	dprintk("%s: accepting on %02X\n", __func__,
+		llc_sk(sk)->laddr.lsap);
+	lock_sock(sk);
+	if (unlikely(sk->sk_type != SOCK_STREAM))
+		goto out;
+	rc = -EINVAL;
+	if (unlikely(sock->state != SS_UNCONNECTED ||
+		     sk->sk_state != TCP_LISTEN))
+		goto out;
+	/* wait for a connection to arrive. */
+	if (skb_queue_empty(&sk->sk_receive_queue)) {
+		rc = llc_wait_data(sk, sk->sk_rcvtimeo);
+		if (rc)
+			goto out;
+	}
+	dprintk("%s: got a new connection on %02X\n", __func__,
+		llc_sk(sk)->laddr.lsap);
+	/*
+	 * NOTE(review): llc_wait_data() also returns 0 on RCV_SHUTDOWN, in
+	 * which case the queue may still be empty and skb would be NULL
+	 * here - confirm whether that can happen on a listening socket.
+	 */
+	skb = skb_dequeue(&sk->sk_receive_queue);
+	rc = -EINVAL;
+	if (!skb->sk)
+		goto frees;
+	rc = 0;
+	newsk = skb->sk;
+	/* attach connection to a new socket. */
+	llc_ui_sk_init(newsock, newsk);
+	sock_reset_flag(newsk, SOCK_ZAPPED);
+	newsk->sk_state		= TCP_ESTABLISHED;
+	newsock->state		= SS_CONNECTED;
+	llc			= llc_sk(sk);
+	newllc			= llc_sk(newsk);
+	/* The accepted socket inherits the listener's bound address. */
+	memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr));
+	newllc->link = llc_ui_next_link_no(newllc->laddr.lsap);
+
+	/* put original socket back into a clean listen state. */
+	sk->sk_state = TCP_LISTEN;
+	sk->sk_ack_backlog--;
+	dprintk("%s: ok success on %02X, client on %02X\n", __func__,
+		llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap);
+frees:
+	kfree_skb(skb);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/**
+ *	llc_ui_recvmsg - copy received data to the socket user.
+ *	@iocb: Unused.
+ *	@sock: Socket to copy data from.
+ *	@msg: Various user space related information.
+ *	@len: Size of user buffer.
+ *	@flags: User specified flags.
+ *
+ *	Copy received data to the socket user. For non stream sockets one
+ *	packet is returned per call and, if the caller supplied a name
+ *	buffer, the source address stored in the skb control block is copied
+ *	into it. msg->msg_namelen is set to 0 whenever no address is
+ *	returned.
+ *	Returns non-negative upon success, negative otherwise.
+ */
+static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
+			  struct msghdr *msg, size_t len, int flags)
+{
+	struct sockaddr_llc *uaddr = (struct sockaddr_llc *)msg->msg_name;
+	const int nonblock = flags & MSG_DONTWAIT;
+	struct sk_buff *skb = NULL;
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	size_t copied = 0;
+	u32 peek_seq = 0;
+	u32 *seq;
+	unsigned long used;
+	unsigned int skb_len;
+	int target;	/* Read at least this many bytes */
+	long timeo;
+
+	/*
+	 * Only the copy_uaddr path below writes an address. Report an empty
+	 * name by default so that no other path leaves a length behind for
+	 * an address buffer that was never filled in.
+	 */
+	msg->msg_namelen = 0;
+
+	lock_sock(sk);
+	copied = -ENOTCONN;
+	if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN))
+		goto out;
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	/* With MSG_PEEK work on a private copy of the read position. */
+	seq = &llc->copied_seq;
+	if (flags & MSG_PEEK) {
+		peek_seq = llc->copied_seq;
+		seq = &peek_seq;
+	}
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+	copied = 0;
+
+	do {
+		u32 offset;
+
+		/*
+		 * We need to check signals first, to get correct SIGURG
+		 * handling. FIXME: Need to check this doesn't impact 1003.1g
+		 * and move it down to the bottom of the loop
+		 */
+		if (signal_pending(current)) {
+			if (copied)
+				break;
+			copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+			break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb) {
+			offset = *seq;
+			goto found_ok_skb;
+		}
+		/* Well, if we have backlog, try to process it now yet. */
+
+		if (copied >= target && !sk->sk_backlog.tail)
+			break;
+
+		if (copied) {
+			if (sk->sk_err ||
+			    sk->sk_state == TCP_CLOSE ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    (flags & MSG_PEEK))
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/*
+					 * This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					copied = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+		}
+
+		if (copied >= target) { /* Do not sleep, just process backlog. */
+			release_sock(sk);
+			lock_sock(sk);
+		} else
+			sk_wait_data(sk, &timeo);
+
+		if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "LLC(%s:%d): Application "
+						  "bug, race in MSG_PEEK.\n",
+				       current->comm, task_pid_nr(current));
+			peek_seq = llc->copied_seq;
+		}
+		continue;
+	found_ok_skb:
+		/*
+		 * Remember the length now: the skb may be freed by
+		 * sk_eat_skb() before the partial read check below.
+		 */
+		skb_len = skb->len;
+		/* Ok so how much can we use? */
+		used = skb_len - offset;
+		if (len < used)
+			used = len;
+
+		if (!(flags & MSG_TRUNC)) {
+			int rc = skb_copy_datagram_iovec(skb, offset,
+							 msg->msg_iov, used);
+			if (rc) {
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+
+		/* For non stream protocols we get one packet per recvmsg call */
+		if (sk->sk_type != SOCK_STREAM)
+			goto copy_uaddr;
+
+		if (!(flags & MSG_PEEK)) {
+			sk_eat_skb(sk, skb, 0);
+			*seq = 0;
+		}
+
+		/* Partial read */
+		if (used + offset < skb_len)
+			continue;
+	} while (len > 0);
+
+out:
+	release_sock(sk);
+	return copied;
+copy_uaddr:
+	if (uaddr != NULL && skb != NULL) {
+		memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
+		msg->msg_namelen = sizeof(*uaddr);
+	}
+	if (llc_sk(sk)->cmsg_flags)
+		llc_cmsg_rcv(msg, skb);
+
+	if (!(flags & MSG_PEEK)) {
+		sk_eat_skb(sk, skb, 0);
+		*seq = 0;
+	}
+
+	goto out;
+}
+
+/**
+ *	llc_ui_sendmsg - Transmit data provided by the socket user.
+ *	@iocb: Unused.
+ *	@sock: Socket to transmit data from.
+ *	@msg: Various user related information.
+ *	@len: Length of data to transmit.
+ *
+ *	Transmit data provided by the socket user. The destination is taken
+ *	from msg->msg_name or, when that is absent, from the address stored
+ *	on the socket. An unbound socket is autobound first. The payload is
+ *	truncated so that headers plus data fit into the device MTU.
+ *	Returns the number of payload bytes queued upon success, negative
+ *	otherwise (-EINVAL is returned, among other cases, when the device
+ *	MTU is too small to hold the headers).
+ */
+static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock,
+			  struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	struct sockaddr_llc *addr = (struct sockaddr_llc *)msg->msg_name;
+	int flags = msg->msg_flags;
+	int noblock = flags & MSG_DONTWAIT;
+	struct sk_buff *skb;
+	size_t size = 0;
+	int rc = -EINVAL, copied = 0, hdrlen;
+
+	dprintk("%s: sending from %02X to %02X\n", __func__,
+		llc->laddr.lsap, llc->daddr.lsap);
+	lock_sock(sk);
+	if (addr) {
+		if (msg->msg_namelen < sizeof(*addr))
+			goto release;
+	} else {
+		if (llc_ui_addr_null(&llc->addr))
+			goto release;
+		addr = &llc->addr;
+	}
+	/* must bind connection to sap if user hasn't done it. */
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		/* bind to sap with null dev, exclusive. */
+		rc = llc_ui_autobind(sock, addr);
+		if (rc)
+			goto release;
+	}
+	hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
+	size = hdrlen + len;
+	if (size > llc->dev->mtu)
+		size = llc->dev->mtu;
+	copied = size - hdrlen;
+	/*
+	 * An MTU smaller than the headers would give a negative payload
+	 * length, which must never reach skb_put() below.
+	 */
+	rc = -EINVAL;
+	if (copied < 0)
+		goto release;
+	/* The allocation may sleep, so drop the socket lock around it. */
+	release_sock(sk);
+	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+	lock_sock(sk);
+	if (!skb)
+		goto release;
+	skb->dev      = llc->dev;
+	skb->protocol = llc_proto_type(addr->sllc_arphrd);
+	skb_reserve(skb, hdrlen);
+	rc = memcpy_fromiovec(skb_put(skb, copied), msg->msg_iov, copied);
+	if (rc)
+		goto out;
+	if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) {
+		llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac,
+					  addr->sllc_sap);
+		goto out;
+	}
+	if (addr->sllc_test) {
+		llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac,
+					    addr->sllc_sap);
+		goto out;
+	}
+	if (addr->sllc_xid) {
+		llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac,
+					   addr->sllc_sap);
+		goto out;
+	}
+	rc = -ENOPROTOOPT;
+	if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua))
+		goto out;
+	rc = llc_ui_send_data(sk, skb, noblock);
+out:
+	if (rc) {
+		kfree_skb(skb);
+release:
+		dprintk("%s: failed sending from %02X to %02X: %d\n",
+			__func__, llc->laddr.lsap, llc->daddr.lsap, rc);
+	}
+	release_sock(sk);
+	return rc ? : copied;
+}
+
+/**
+ *	llc_ui_getname - return the address info of a socket
+ *	@sock: Socket to get address of.
+ *	@uaddr: Address structure to return information.
+ *	@uaddrlen: Length of address structure.
+ *	@peer: Does user want local or remote address information.
+ *
+ *	Return the address information of a socket.
+ *	Returns 0 upon success with @uaddr and *@uaddrlen filled in,
+ *	-EBADF if the socket is not bound, -ENOTCONN if the peer address is
+ *	requested on a socket that is not established and -EINVAL if the
+ *	socket has no sap.
+ */
+static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr,
+			  int *uaddrlen, int peer)
+{
+	struct sockaddr_llc sllc;
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	/*
+	 * Fail by default: an unbound (zapped) socket leaves *uaddrlen
+	 * untouched, so it must not be reported as success.
+	 */
+	int rc = -EBADF;
+
+	memset(&sllc, 0, sizeof(sllc));
+	lock_sock(sk);
+	if (sock_flag(sk, SOCK_ZAPPED))
+		goto out;
+	*uaddrlen = sizeof(sllc);
+	memset(uaddr, 0, *uaddrlen);
+	if (peer) {
+		rc = -ENOTCONN;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+		if(llc->dev)
+			sllc.sllc_arphrd = llc->dev->type;
+		sllc.sllc_sap = llc->daddr.lsap;
+		memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN);
+	} else {
+		rc = -EINVAL;
+		if (!llc->sap)
+			goto out;
+		sllc.sllc_sap = llc->sap->laddr.lsap;
+
+		if (llc->dev) {
+			sllc.sllc_arphrd = llc->dev->type;
+			memcpy(&sllc.sllc_mac, llc->dev->dev_addr,
+			       IFHWADDRLEN);
+		}
+	}
+	rc = 0;
+	sllc.sllc_family = AF_LLC;
+	memcpy(uaddr, &sllc, sizeof(sllc));
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/**
+ *	llc_ui_ioctl - io controls for PF_LLC
+ *	@sock: Socket to get/set info
+ *	@cmd: command
+ *	@arg: optional argument for cmd
+ *
+ *	No PF_LLC specific ioctl is implemented: every command is answered
+ *	with -ENOIOCTLCMD.
+ */
+static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
+			unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+
+/**
+ *	llc_ui_setsockopt - set various connection specific parameters.
+ *	@sock: Socket to set options on.
+ *	@level: Socket level user is requesting operations on.
+ *	@optname: Operation name.
+ *	@optval: User provided operation data.
+ *	@optlen: Length of optval.
+ *
+ *	Set various connection specific parameters. Only level SOL_LLC with
+ *	an int sized value is accepted. Each value is checked against the
+ *	matching LLC_OPT_MAX_* limit; the timer options are given in seconds
+ *	and stored multiplied by HZ.
+ *	Returns 0 upon success, -EINVAL for a wrong level, length or an out
+ *	of range value, -ENOPROTOOPT for an unknown option, or the error
+ *	from get_user().
+ */
+static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
+			     char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	unsigned int opt;
+	int rc = -EINVAL;
+
+	lock_sock(sk);
+	if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
+		goto out;
+	rc = get_user(opt, (int __user *)optval);
+	if (rc)
+		goto out;
+	/* From here on every "goto out" reports an invalid value. */
+	rc = -EINVAL;
+	switch (optname) {
+	case LLC_OPT_RETRY:
+		if (opt > LLC_OPT_MAX_RETRY)
+			goto out;
+		llc->n2 = opt;
+		break;
+	case LLC_OPT_SIZE:
+		if (opt > LLC_OPT_MAX_SIZE)
+			goto out;
+		llc->n1 = opt;
+		break;
+	case LLC_OPT_ACK_TMR_EXP:
+		if (opt > LLC_OPT_MAX_ACK_TMR_EXP)
+			goto out;
+		llc->ack_timer.expire = opt * HZ;
+		break;
+	case LLC_OPT_P_TMR_EXP:
+		if (opt > LLC_OPT_MAX_P_TMR_EXP)
+			goto out;
+		llc->pf_cycle_timer.expire = opt * HZ;
+		break;
+	case LLC_OPT_REJ_TMR_EXP:
+		if (opt > LLC_OPT_MAX_REJ_TMR_EXP)
+			goto out;
+		llc->rej_sent_timer.expire = opt * HZ;
+		break;
+	case LLC_OPT_BUSY_TMR_EXP:
+		if (opt > LLC_OPT_MAX_BUSY_TMR_EXP)
+			goto out;
+		llc->busy_state_timer.expire = opt * HZ;
+		break;
+	case LLC_OPT_TX_WIN:
+		if (opt > LLC_OPT_MAX_WIN)
+			goto out;
+		llc->k = opt;
+		break;
+	case LLC_OPT_RX_WIN:
+		if (opt > LLC_OPT_MAX_WIN)
+			goto out;
+		llc->rw = opt;
+		break;
+	case LLC_OPT_PKTINFO:
+		/* Any non-zero value enables packet info ancillary data. */
+		if (opt)
+			llc->cmsg_flags |= LLC_CMSG_PKTINFO;
+		else
+			llc->cmsg_flags &= ~LLC_CMSG_PKTINFO;
+		break;
+	default:
+		rc = -ENOPROTOOPT;
+		goto out;
+	}
+	rc = 0;
+out:
+	release_sock(sk);
+	return rc;
+}
+
+/**
+ *	llc_ui_getsockopt - get connection specific socket info
+ *	@sock: Socket to get information from.
+ *	@level: Socket level user is requesting operations on.
+ *	@optname: Operation name.
+ *	@optval: Variable to return operation data in.
+ *	@optlen: Length of optval.
+ *
+ *	Get connection specific socket information. Only level SOL_LLC and
+ *	an int sized buffer are accepted; the timer options are reported
+ *	divided by HZ.
+ *	Returns 0 upon success, -EINVAL for a wrong level or length,
+ *	-ENOPROTOOPT for an unknown option, -EFAULT if the result cannot be
+ *	copied out, or the error from get_user().
+ */
+static int llc_ui_getsockopt(struct socket *sock, int level, int optname,
+			     char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct llc_sock *llc = llc_sk(sk);
+	int val = 0, len = 0, rc;
+
+	lock_sock(sk);
+	if (unlikely(level != SOL_LLC)) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+	rc = get_user(len, optlen);
+	if (rc)
+		goto unlock;
+	if (len != sizeof(int)) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+	switch (optname) {
+	case LLC_OPT_RETRY:
+		val = llc->n2;
+		break;
+	case LLC_OPT_SIZE:
+		val = llc->n1;
+		break;
+	case LLC_OPT_ACK_TMR_EXP:
+		val = llc->ack_timer.expire / HZ;
+		break;
+	case LLC_OPT_P_TMR_EXP:
+		val = llc->pf_cycle_timer.expire / HZ;
+		break;
+	case LLC_OPT_REJ_TMR_EXP:
+		val = llc->rej_sent_timer.expire / HZ;
+		break;
+	case LLC_OPT_BUSY_TMR_EXP:
+		val = llc->busy_state_timer.expire / HZ;
+		break;
+	case LLC_OPT_TX_WIN:
+		val = llc->k;
+		break;
+	case LLC_OPT_RX_WIN:
+		val = llc->rw;
+		break;
+	case LLC_OPT_PKTINFO:
+		val = !!(llc->cmsg_flags & LLC_CMSG_PKTINFO);
+		break;
+	default:
+		rc = -ENOPROTOOPT;
+		goto unlock;
+	}
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		rc = -EFAULT;
+	else
+		rc = 0;
+unlock:
+	release_sock(sk);
+	return rc;
+}
+
+/* PF_LLC family descriptor, registered with sock_register() in llc2_init(). */
+static const struct net_proto_family llc_ui_family_ops = {
+	.family = PF_LLC,
+	.create = llc_ui_create,
+	.owner	= THIS_MODULE,
+};
+
+/*
+ * Socket operations for PF_LLC sockets, installed on each socket by
+ * llc_ui_sk_init(). socketpair, mmap and sendpage use the generic
+ * sock_no_*() handlers and poll uses datagram_poll().
+ */
+static const struct proto_ops llc_ui_ops = {
+	.family	     = PF_LLC,
+	.owner       = THIS_MODULE,
+	.release     = llc_ui_release,
+	.bind	     = llc_ui_bind,
+	.connect     = llc_ui_connect,
+	.socketpair  = sock_no_socketpair,
+	.accept      = llc_ui_accept,
+	.getname     = llc_ui_getname,
+	.poll	     = datagram_poll,
+	.ioctl       = llc_ui_ioctl,
+	.listen      = llc_ui_listen,
+	.shutdown    = llc_ui_shutdown,
+	.setsockopt  = llc_ui_setsockopt,
+	.getsockopt  = llc_ui_getsockopt,
+	.sendmsg     = llc_ui_sendmsg,
+	.recvmsg     = llc_ui_recvmsg,
+	.mmap	     = sock_no_mmap,
+	.sendpage    = sock_no_sendpage,
+};
+
+/*
+ * Messages printed by llc2_init() when one of its registration steps
+ * fails. They are passed to printk() as the format string, so they must
+ * not contain conversion specifiers.
+ */
+static const char llc_proc_err_msg[] __initconst =
+	KERN_CRIT "LLC: Unable to register the proc_fs entries\n";
+static const char llc_sysctl_err_msg[] __initconst =
+	KERN_CRIT "LLC: Unable to register the sysctl entries\n";
+static const char llc_sock_err_msg[] __initconst =
+	KERN_CRIT "LLC: Unable to register the network family\n";
+
+/*
+ * Module init: register the LLC proto, set up the offset table and the
+ * station component, then register the proc entries, the sysctl entries
+ * and the PF_LLC socket family, and finally hook the SAP and connection
+ * packet handlers. A failing step unwinds the registrations made before
+ * it and returns its error code.
+ */
+static int __init llc2_init(void)
+{
+	int rc = proto_register(&llc_proto, 0);
+
+	if (rc != 0)
+		goto out;
+
+	llc_build_offset_table();
+	/*
+	 * NOTE(review): llc_station_init() is not undone on the error paths
+	 * below, although llc2_exit() pairs it with llc_station_exit().
+	 * Confirm whether llc_station_exit() may be called from __init
+	 * context before adding it to the unwind chain.
+	 */
+	llc_station_init();
+	llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
+	rc = llc_proc_init();
+	if (rc != 0) {
+		printk(llc_proc_err_msg);
+		goto out_unregister_llc_proto;
+	}
+	rc = llc_sysctl_init();
+	if (rc) {
+		printk(llc_sysctl_err_msg);
+		goto out_proc;
+	}
+	rc = sock_register(&llc_ui_family_ops);
+	if (rc) {
+		printk(llc_sock_err_msg);
+		goto out_sysctl;
+	}
+	llc_add_pack(LLC_DEST_SAP, llc_sap_handler);
+	llc_add_pack(LLC_DEST_CONN, llc_conn_handler);
+out:
+	return rc;
+	/* Error unwinding, in reverse order of the registrations above. */
+out_sysctl:
+	llc_sysctl_exit();
+out_proc:
+	llc_proc_exit();
+out_unregister_llc_proto:
+	proto_unregister(&llc_proto);
+	goto out;
+}
+
+/*
+ * Module exit: tear down the station component, remove the SAP and
+ * connection packet handlers, unregister the PF_LLC socket family, the
+ * proc and sysctl entries and finally the LLC proto.
+ */
+static void __exit llc2_exit(void)
+{
+	llc_station_exit();
+	llc_remove_pack(LLC_DEST_SAP);
+	llc_remove_pack(LLC_DEST_CONN);
+	sock_unregister(PF_LLC);
+	llc_proc_exit();
+	llc_sysctl_exit();
+	proto_unregister(&llc_proto);
+}
+
+module_init(llc2_init);
+module_exit(llc2_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003");
+MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support");
+MODULE_ALIAS_NETPROTO(PF_LLC);
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
new file mode 100644
index 00000000..ea225bd2
--- /dev/null
+++ b/net/llc/llc_c_ac.c
@@ -0,0 +1,1444 @@
+/*
+ * llc_c_ac.c - actions performed during connection state transition.
+ *
+ * Description:
+ *   The functions in this module implement the connection component actions.
+ *   Details of the actions can be found in the IEEE 802.2 standard document.
+ *   Every action takes one connection and one event as input arguments and
+ *   returns 0 on success and a non-zero value otherwise.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/llc_conn.h>
+#include <net/llc_sap.h>
+#include <net/sock.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/llc_pdu.h>
+#include <net/llc.h>
+
+
+/* Forward declarations of static helpers in this file. */
+static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb);
+static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb);
+static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *ev);
+
+static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb);
+
+static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk,
+					       struct sk_buff *skb);
+
+static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb);
+
+/* Passed as the last argument of llc_pdu_init_as_frmr_rsp() in this file. */
+#define INCORRECT 0
+
+/**
+ *	llc_conn_ac_clear_remote_busy - leave the remote-busy condition
+ *	@sk: current connection structure
+ *	@skb: current event; its PDU supplies N(R)
+ *
+ *	Does nothing unless remote_busy_flag is set.  Otherwise clears the
+ *	flag, deletes the busy-state timer and calls
+ *	llc_conn_resend_i_pdu_as_cmd() with the N(R) read from the PDU in
+ *	@skb.  Always returns 0.
+ */
+int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct llc_pdu_sn *pdu;
+
+	if (!llc->remote_busy_flag)
+		return 0;
+
+	pdu = llc_pdu_sn_hdr(skb);
+	llc->remote_busy_flag = 0;
+	del_timer(&llc->busy_state_timer.timer);
+	llc_conn_resend_i_pdu_as_cmd(sk, LLC_I_GET_NR(pdu), 0);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_conn_ind - flag a connect indication on the event
+ *	@sk: current connection structure (not used)
+ *	@skb: current event
+ *
+ *	Sets the event's ind_prim to LLC_CONN_PRIM.  Always returns 0.
+ */
+int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->ind_prim = LLC_CONN_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_conn_confirm - flag a connect confirmation on the event
+ *	@sk: current connection structure (not used)
+ *	@skb: current event
+ *
+ *	Sets the event's cfm_prim to LLC_CONN_PRIM.  Always returns 0.
+ */
+int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->cfm_prim = LLC_CONN_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_data_confirm - flag a data confirmation on the event
+ *	@sk: current connection structure (not used)
+ *	@skb: current event
+ *
+ *	Sets the event's cfm_prim to LLC_DATA_PRIM.  Always returns 0.
+ */
+static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->cfm_prim = LLC_DATA_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_data_ind - pass the received PDU on
+ *	@sk: current connection structure
+ *	@skb: current event
+ *
+ *	Hands @skb to llc_conn_rtn_pdu().  Always returns 0.
+ */
+int llc_conn_ac_data_ind(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_rtn_pdu(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_disc_ind - flag a disconnect indication on the event
+ *	@sk: current connection structure (not used)
+ *	@skb: current event
+ *
+ *	For a PDU event the reason is derived from the PDU: a DM response or
+ *	a DISC command each map to their own reason code, anything else
+ *	leaves the reason at 0.  An ACK timer event maps to
+ *	LLC_DISC_REASON_ACK_TMR_EXP.  In those cases the reason and
+ *	LLC_DISC_PRIM are stored in the event and 0 is returned.  Any other
+ *	event type leaves the event untouched and returns -EINVAL.
+ */
+int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+	struct llc_pdu_un *pdu;
+	u8 cause = 0;
+
+	switch (ev->type) {
+	case LLC_CONN_EV_TYPE_PDU:
+		pdu = llc_pdu_un_hdr(skb);
+		if (LLC_PDU_IS_RSP(pdu) &&
+		    LLC_PDU_TYPE_IS_U(pdu) &&
+		    LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM)
+			cause = LLC_DISC_REASON_RX_DM_RSP_PDU;
+		else if (LLC_PDU_IS_CMD(pdu) &&
+			 LLC_PDU_TYPE_IS_U(pdu) &&
+			 LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC)
+			cause = LLC_DISC_REASON_RX_DISC_CMD_PDU;
+		break;
+	case LLC_CONN_EV_TYPE_ACK_TMR:
+		cause = LLC_DISC_REASON_ACK_TMR_EXP;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ev->reason   = cause;
+	ev->ind_prim = LLC_DISC_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_disc_confirm - flag a disconnect confirmation on the event
+ *	@sk: current connection structure (not used)
+ *	@skb: current event
+ *
+ *	Copies the event's status into its reason field and sets cfm_prim to
+ *	LLC_DISC_PRIM.  Always returns 0.
+ */
+int llc_conn_ac_disc_confirm(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	ev->reason   = ev->status;
+	ev->cfm_prim = LLC_DISC_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_rst_ind - flag a reset indication on the event
+ *	@sk: current connection structure
+ *	@skb: current event
+ *
+ *	A received FRMR response yields LLC_RESET_REASON_LOCAL and a received
+ *	SABME command yields LLC_RESET_REASON_REMOTE.  An ACK, P, REJ or BUSY
+ *	timer event yields LLC_RESET_REASON_LOCAL, but only once retry_count
+ *	exceeds n2.  In those cases the reason and LLC_RESET_PRIM are stored
+ *	in the event and 0 is returned; in every other case the event is left
+ *	untouched and 1 is returned.
+ */
+int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+	struct llc_sock *llc = llc_sk(sk);
+	u8 cause;
+
+	switch (ev->type) {
+	case LLC_CONN_EV_TYPE_PDU:
+		if (LLC_PDU_IS_RSP(pdu) &&
+		    LLC_PDU_TYPE_IS_U(pdu) &&
+		    LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR)
+			cause = LLC_RESET_REASON_LOCAL;
+		else if (LLC_PDU_IS_CMD(pdu) &&
+			 LLC_PDU_TYPE_IS_U(pdu) &&
+			 LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME)
+			cause = LLC_RESET_REASON_REMOTE;
+		else
+			return 1;
+		break;
+	case LLC_CONN_EV_TYPE_ACK_TMR:
+	case LLC_CONN_EV_TYPE_P_TMR:
+	case LLC_CONN_EV_TYPE_REJ_TMR:
+	case LLC_CONN_EV_TYPE_BUSY_TMR:
+		if (llc->retry_count <= llc->n2)
+			return 1;
+		cause = LLC_RESET_REASON_LOCAL;
+		break;
+	default:
+		return 1;
+	}
+
+	ev->reason   = cause;
+	ev->ind_prim = LLC_RESET_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_rst_confirm - flag a reset confirmation on the event
+ *	@sk: current connection structure (not used)
+ *	@skb: current event
+ *
+ *	Clears the event's reason and sets cfm_prim to LLC_RESET_PRIM.
+ *	Always returns 0.
+ */
+int llc_conn_ac_rst_confirm(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	ev->reason   = 0;
+	ev->cfm_prim = LLC_RESET_PRIM;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_clear_remote_busy_if_f_eq_1 - conditional remote-busy clear
+ *	@sk: current connection structure
+ *	@skb: current event
+ *
+ *	Calls llc_conn_ac_clear_remote_busy() only when the PDU in @skb is an
+ *	I-format response with its P/F bit set and the connection's ack_pf
+ *	flag is non-zero.  Always returns 0.
+ */
+int llc_conn_ac_clear_remote_busy_if_f_eq_1(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	int final_i_rsp = LLC_PDU_IS_RSP(pdu) &&
+			  LLC_PDU_TYPE_IS_I(pdu) &&
+			  LLC_I_PF_IS_1(pdu) && llc_sk(sk)->ack_pf;
+
+	if (final_i_rsp)
+		llc_conn_ac_clear_remote_busy(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2 - conditional REJ timer stop
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Deletes the rej_sent timer when data_flag equals 2.  Always returns 0.
+ */
+int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk,
+					       struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (llc->data_flag != 2)
+		return 0;
+
+	del_timer(&llc->rej_sent_timer.timer);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_disc_cmd_p_set_x - send a DISC command
+ *	@sk: current connection structure
+ *	@skb: current event
+ *
+ *	Allocates a U-format frame, initialises it as a DISC command (flag
+ *	argument 1), sends it and then runs llc_conn_ac_set_p_flag_1().
+ *	Returns 0 on success, -ENOBUFS if llc_alloc_frame() fails, or the
+ *	error from llc_mac_hdr_init(), in which case the new frame is freed.
+ */
+int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_disc_cmd(nskb, 1);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	llc_conn_ac_set_p_flag_1(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_dm_rsp_f_set_p - send a DM response echoing the P bit
+ *	@sk: current connection structure
+ *	@skb: current event; its P/F bit is copied into the response
+ *
+ *	Allocates a U-format frame, initialises it as a DM response whose F
+ *	bit is the P/F bit decoded from @skb, and sends it.  Returns 0 on
+ *	success, -ENOBUFS if llc_alloc_frame() fails, or the error from
+ *	llc_mac_hdr_init(), in which case the new frame is freed.
+ */
+int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+	u8 f_bit;
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_decode_pf_bit(skb, &f_bit);
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_dm_rsp(nskb, f_bit);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_dm_rsp_f_set_1 - send a DM response with F set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates a U-format frame, initialises it as a DM response with the
+ *	F bit set to 1, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_dm_rsp(nskb, 1);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_frmr_rsp_f_set_x - send a FRMR response
+ *	@sk: current connection structure
+ *	@skb: current event; the offending PDU
+ *
+ *	Saves the first 32 bits of the received PDU header in rx_pdu_hdr,
+ *	then builds and sends a FRMR response.  The F bit is the P/F bit of
+ *	@skb when that PDU is a command and 0 otherwise.  Returns 0 on
+ *	success, -ENOBUFS if llc_alloc_frame() fails, or the error from
+ *	llc_mac_hdr_init(), in which case the new frame is freed.
+ */
+int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb;
+	u8 f_bit;
+	int err;
+
+	llc->rx_pdu_hdr = *((u32 *)pdu);
+	if (LLC_PDU_IS_CMD(pdu))
+		llc_pdu_decode_pf_bit(skb, &f_bit);
+	else
+		f_bit = 0;
+
+	nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U,
+			       sizeof(struct llc_frmr_info));
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS,
+				 llc->vR, INCORRECT);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_resend_frmr_rsp_f_set_0 - resend a FRMR response, F = 0
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Builds a FRMR response from the PDU header previously saved in
+ *	rx_pdu_hdr, with the F bit set to 0, and sends it.  Returns 0 on
+ *	success, -ENOBUFS if llc_alloc_frame() fails, or the error from
+ *	llc_mac_hdr_init(), in which case the new frame is freed.
+ */
+int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U,
+					       sizeof(struct llc_frmr_info));
+	struct llc_pdu_sn *saved_hdr;
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	saved_hdr = (struct llc_pdu_sn *)&llc->rx_pdu_hdr;
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_frmr_rsp(nskb, saved_hdr, 0, llc->vS,
+				 llc->vR, INCORRECT);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_resend_frmr_rsp_f_set_p - resend a FRMR response, F = P
+ *	@sk: current connection structure
+ *	@skb: current event; supplies the P/F bit and the PDU header
+ *
+ *	Builds a FRMR response from the PDU in @skb, with the F bit set to
+ *	the P/F bit decoded from @skb, and sends it.  Returns 0 on success,
+ *	-ENOBUFS if llc_alloc_frame() fails, or the error from
+ *	llc_mac_hdr_init(), in which case the new frame is freed.
+ */
+int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb;
+	u8 f_bit;
+	int err;
+
+	llc_pdu_decode_pf_bit(skb, &f_bit);
+	nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U,
+			       sizeof(struct llc_frmr_info));
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_frmr_rsp(nskb, llc_pdu_sn_hdr(skb), f_bit, llc->vS,
+				 llc->vR, INCORRECT);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_i_cmd_p_set_1 - send @skb as an I command, P = 1
+ *	@sk: current connection structure
+ *	@skb: frame to be sent
+ *
+ *	Initialises @skb as an I command with the P bit set to 1 and the
+ *	current vS/vR, sends it and calls llc_conn_ac_inc_vs_by_1().  Returns
+ *	0 on success or the error from llc_mac_hdr_init(), in which case
+ *	nothing is sent.
+ */
+int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	int err;
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_I, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR);
+	err = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err))
+		return err;
+
+	llc_conn_send_pdu(sk, skb);
+	llc_conn_ac_inc_vs_by_1(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_i_cmd_p_set_0 - send @skb as an I command, P = 0
+ *	@sk: current connection structure
+ *	@skb: frame to be sent
+ *
+ *	Initialises @skb as an I command with the P bit set to 0 and the
+ *	current vS/vR, sends it and calls llc_conn_ac_inc_vs_by_1().  Returns
+ *	0 on success or the error from llc_mac_hdr_init(), in which case
+ *	nothing is sent.
+ */
+static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	int err;
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_I, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
+	err = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err))
+		return err;
+
+	llc_conn_send_pdu(sk, skb);
+	llc_conn_ac_inc_vs_by_1(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_i_xxx_x_set_0 - send @skb as an I PDU, P/F = 0
+ *	@sk: current connection structure
+ *	@skb: frame to be sent
+ *
+ *	Initialises @skb as an I command with the P bit set to 0 and the
+ *	current vS/vR; if llc_mac_hdr_init() succeeds the frame is sent and
+ *	llc_conn_ac_inc_vs_by_1() is called.  Always returns 0.
+ *
+ *	NOTE(review): unlike the sibling send_i_cmd actions, a failure of
+ *	llc_mac_hdr_init() is not reported to the caller - confirm this is
+ *	intentional.
+ */
+int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_I, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
+	if (unlikely(llc_mac_hdr_init(skb, llc->dev->dev_addr,
+				      llc->daddr.mac)))
+		return 0;
+
+	llc_conn_send_pdu(sk, skb);
+	llc_conn_ac_inc_vs_by_1(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_resend_i_xxx_x_set_0 - resend I PDUs as commands
+ *	@sk: current connection structure
+ *	@skb: current event; its PDU supplies N(R)
+ *
+ *	Calls llc_conn_resend_i_pdu_as_cmd() with the N(R) read from the PDU
+ *	in @skb and a flag argument of 0.  Always returns 0.
+ */
+int llc_conn_ac_resend_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	u8 nr = LLC_I_GET_NR(pdu);
+
+	llc_conn_resend_i_pdu_as_cmd(sk, nr, 0);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr - send RR, else resend I
+ *	@sk: current connection structure
+ *	@skb: current event; its PDU supplies N(R) for the fallback path
+ *
+ *	Tries to send an RR response with the F bit set to 0.  If the frame
+ *	cannot be allocated or its MAC header cannot be built, falls back to
+ *	llc_conn_resend_i_pdu_as_cmd() using the N(R) from @skb.  Always
+ *	returns 0.
+ */
+int llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk,
+						struct sk_buff *skb)
+{
+	u8 nr;
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	int rc = -ENOBUFS;
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+
+	if (nskb) {
+		struct llc_sap *sap = llc->sap;
+
+		llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+				    llc->daddr.lsap, LLC_PDU_RSP);
+		llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
+		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+		if (likely(!rc))
+			llc_conn_send_pdu(sk, nskb);
+		else
+			/*
+			 * Release the frame allocated here.  @skb is still
+			 * owned by the caller and is read again below, so it
+			 * must not be freed.
+			 */
+			kfree_skb(nskb);
+	}
+	if (rc) {
+		nr = LLC_I_GET_NR(pdu);
+		rc = 0;
+		llc_conn_resend_i_pdu_as_cmd(sk, nr, 0);
+	}
+	return rc;
+}
+
+/**
+ *	llc_conn_ac_resend_i_rsp_f_set_1 - resend I PDUs as responses, F = 1
+ *	@sk: current connection structure
+ *	@skb: current event; its PDU supplies N(R)
+ *
+ *	Calls llc_conn_resend_i_pdu_as_rsp() with the N(R) read from the PDU
+ *	in @skb and a flag argument of 1.  Always returns 0.
+ */
+int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	u8 nr = LLC_I_GET_NR(pdu);
+
+	llc_conn_resend_i_pdu_as_rsp(sk, nr, 1);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rej_cmd_p_set_1 - send a REJ command with P set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as a REJ command with
+ *	P = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rej_rsp_f_set_1 - send a REJ response with F set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as a REJ response with
+ *	F = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rej_rsp(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rej_xxx_x_set_0 - send a REJ PDU with P/F set to 0
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as a REJ response with
+ *	F = 0 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rej_rsp(nskb, 0, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rnr_cmd_p_set_1 - send an RNR command with P set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RNR command with
+ *	P = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rnr_rsp_f_set_1 - send an RNR response with F set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RNR response with
+ *	F = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rnr_rsp(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rnr_xxx_x_set_0 - send an RNR PDU with P/F set to 0
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RNR response with
+ *	F = 0 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_set_remote_busy - enter the remote-busy condition
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	If remote_busy_flag is not already set, sets it and (re)arms the
+ *	busy-state timer with its configured expiry.  Always returns 0.
+ */
+int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (llc->remote_busy_flag)
+		return 0;
+
+	llc->remote_busy_flag = 1;
+	mod_timer(&llc->busy_state_timer.timer,
+		  jiffies + llc->busy_state_timer.expire);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_opt_send_rnr_xxx_x_set_0 - send an RNR PDU, P/F set to 0
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RNR response with
+ *	F = 0 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rr_cmd_p_set_1 - send an RR command with P set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RR command with
+ *	P = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rr_rsp_f_set_1 - send an RR response with F set to 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RR response with
+ *	F = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_ack_rsp_f_set_1 - acknowledge with an RR response, F = 1
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RR response with
+ *	F = 1 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rr_xxx_x_set_0 - send an RR PDU with P/F set to 0
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RR response with
+ *	F = 0 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_ack_xxx_x_set_0 - acknowledge with an RR PDU, P/F = 0
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates an S-format frame, initialises it as an RR response with
+ *	F = 0 and N(R) = vR, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_set_p_flag - update the connection's p_flag
+ *	@sk: current connection structure
+ *	@value: new value of p_flag
+ *
+ *	Stores @value in p_flag.  When the flag goes from non-zero to zero
+ *	the socket's sk_state_change() callback is invoked afterwards.
+ */
+void llc_conn_set_p_flag(struct sock *sk, u8 value)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	int was_set = llc->p_flag != 0;
+
+	llc->p_flag = value;
+
+	if (was_set && !value)
+		sk->sk_state_change(sk);
+}
+
+/**
+ *	llc_conn_ac_send_sabme_cmd_p_set_x - send a SABME command
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Allocates a U-format frame, initialises it as a SABME command (flag
+ *	argument 1), sends it and sets p_flag to 1.  On a loopback device the
+ *	destination MAC is the device's own address instead of daddr.mac.
+ *	Returns 0 on success, -ENOBUFS if llc_alloc_frame() fails, or the
+ *	error from llc_mac_hdr_init(), in which case the new frame is freed.
+ */
+int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+	u8 *dmac;
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	dmac = (llc->dev->flags & IFF_LOOPBACK) ? llc->dev->dev_addr :
+						  llc->daddr.mac;
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_sabme_cmd(nskb, 1);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	llc_conn_set_p_flag(sk, 1);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_ua_rsp_f_set_p - send a UA response echoing the P bit
+ *	@sk: current connection structure
+ *	@skb: current event; its P/F bit is copied into the response
+ *
+ *	Allocates a U-format frame, binds it to the connection's device,
+ *	initialises it as a UA response whose F bit is the P/F bit decoded
+ *	from @skb, and sends it.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);
+	u8 f_bit;
+	int err;
+
+	llc_pdu_decode_pf_bit(skb, &f_bit);
+	if (!nskb)
+		return -ENOBUFS;
+
+	nskb->dev = llc->dev;
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_ua_rsp(nskb, f_bit);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/* Clears the connection's s_flag.  Always returns 0. */
+int llc_conn_ac_set_s_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+	llc_sk(sk)->s_flag = 0;
+	return 0;
+}
+
+/* Sets the connection's s_flag to 1.  Always returns 0. */
+int llc_conn_ac_set_s_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+	llc_sk(sk)->s_flag = 1;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_start_p_timer - set p_flag and arm the P/F cycle timer
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Sets p_flag to 1 and (re)arms pf_cycle_timer with its configured
+ *	expiry.  Always returns 0.
+ */
+int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc_conn_set_p_flag(sk, 1);
+	mod_timer(&llc->pf_cycle_timer.timer,
+		  jiffies + llc->pf_cycle_timer.expire);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_ack_if_needed - check if ack is needed
+ *	@sk: current connection structure
+ *	@skb: current event
+ *
+ *	Tracks received PDUs that have not been acknowledged yet.  The P/F
+ *	bit of @skb is accumulated into ack_pf and, when no acknowledgement
+ *	was pending, the current vR is remembered as the first PDU of the
+ *	batch.  Once the number of pending PDUs reaches "npta" (Number of
+ *	PDUs To Acknowledge), an RR response is sent as acknowledgement, the
+ *	pending state is cleared and llc_conn_ac_inc_npta_value() is called.
+ *	Always returns 0.
+ */
+int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	u8 pf_bit;
+	int pending;
+
+	llc_pdu_decode_pf_bit(skb, &pf_bit);
+	llc->ack_pf |= pf_bit & 1;
+	if (!llc->ack_must_be_send) {
+		llc->first_pdu_Ns = llc->vR;
+		llc->ack_must_be_send = 1;
+		llc->ack_pf = pf_bit & 1;
+	}
+
+	pending = (llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO)
+			% LLC_2_SEQ_NBR_MODULO;
+	if (pending < llc->npta)
+		return 0;
+
+	llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb);
+	llc->ack_must_be_send	= 0;
+	llc->ack_pf		= 0;
+	llc_conn_ac_inc_npta_value(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_rst_sendack_flag - resets ack_must_be_send flag
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Clears both ack_must_be_send, which indicates that there are PDUs
+ *	which have not been acknowledged yet, and ack_pf.  Always returns 0.
+ */
+int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc->ack_pf = 0;
+	llc->ack_must_be_send = 0;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_i_rsp_f_set_ackpf - acknowledge received PDUs
+ *	@sk: current connection structure
+ *	@skb: frame to be sent
+ *
+ *	Sends @skb as an I response PDU with its f-bit set to the ack_pf
+ *	flag, acknowledging all received PDUs which have not been
+ *	acknowledged yet, then calls llc_conn_ac_inc_vs_by_1().  Returns 0 on
+ *	success or the error from llc_mac_hdr_init(), in which case nothing
+ *	is sent.
+ */
+static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	int err;
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_I, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
+	err = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err))
+		return err;
+
+	llc_conn_send_pdu(sk, skb);
+	llc_conn_ac_inc_vs_by_1(sk, skb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_i_as_ack - sends an I-format PDU to acknowledge rx PDUs
+ *	@sk: current connection structure.
+ *	@skb: frame to be sent.
+ *
+ *	If an acknowledgement is pending, @skb is sent as an I response that
+ *	carries it and the pending state is cleared; this technique, which
+ *	reduces the number of separate acknowledgements, is called
+ *	piggybacking.  Otherwise @skb is sent as an I command with the P bit
+ *	set to 0.  The result of the send helper is ignored; always
+ *	returns 0.
+ */
+int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (!llc->ack_must_be_send) {
+		llc_conn_ac_send_i_cmd_p_set_0(sk, skb);
+		return 0;
+	}
+
+	llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb);
+	llc->ack_must_be_send = 0;
+	llc->ack_pf = 0;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_send_rr_rsp_f_set_ackpf - ack all rx PDUs not yet acked
+ *	@sk: current connection structure.
+ *	@skb: current event (not used).
+ *
+ *	Sends an RR response with its f-bit set to the ack_pf flag and
+ *	N(R) = vR.  ack_pf records whether a PDU has been received with the
+ *	p-bit set to one.  Returns 0 on success, -ENOBUFS if
+ *	llc_alloc_frame() fails, or the error from llc_mac_hdr_init(), in
+ *	which case the new frame is freed.
+ */
+static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk,
+					       struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);
+	int err;
+
+	if (!nskb)
+		return -ENOBUFS;
+
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, llc->sap->laddr.lsap,
+			    llc->daddr.lsap, LLC_PDU_RSP);
+	llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR);
+	err = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
+	if (unlikely(err)) {
+		kfree_skb(nskb);
+		return err;
+	}
+	llc_conn_send_pdu(sk, nskb);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_inc_npta_value - tries to make value of npta greater
+ *	@sk: current connection structure.
+ *	@skb: current event (not used).
+ *
+ *	Each call decrements "inc_cntr" until it reaches zero.  The call that
+ *	finds it at zero resets dec_step, reloads both counters with 2 and
+ *	increases "npta" by one, capped at (u8)~LLC_2_SEQ_NBR_MODULO.  A
+ *	larger "npta" means fewer acknowledgements are sent.  Always
+ *	returns 0.
+ */
+static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (llc->inc_cntr) {
+		--llc->inc_cntr;
+		return 0;
+	}
+
+	llc->dec_step = 0;
+	llc->inc_cntr = 2;
+	llc->dec_cntr = 2;
+	++llc->npta;
+	if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO)
+		llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_adjust_npta_by_rr - decreases "npta" by one
+ *	@sk: current connection structure.
+ *	@skb: current event (not used).
+ *
+ *	If connect_step or remote_busy_flag is set, only connect_step is
+ *	cleared.  Otherwise, while dec_step is zero, each call decrements
+ *	"dec_cntr"; the call that finds it at zero reloads both counters with
+ *	2 and decreases "npta" by one unless it is already zero.  Always
+ *	returns 0.
+ */
+int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (llc->connect_step || llc->remote_busy_flag) {
+		llc->connect_step = 0;
+		return 0;
+	}
+	if (llc->dec_step)
+		return 0;
+	if (llc->dec_cntr) {
+		llc->dec_cntr -= 1;
+		return 0;
+	}
+
+	llc->dec_cntr = 2;
+	llc->inc_cntr = 2;
+	if (llc->npta > 0)
+		llc->npta = llc->npta - 1;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_adjust_npta_by_rnr - decreases "npta" by one
+ *	@sk: current connection structure.
+ *	@skb: current event (not used).
+ *
+ *	Acts only while remote_busy_flag is set and dec_step is zero: each
+ *	call decrements "dec_cntr"; the call that finds it at zero reloads
+ *	both counters with 2 and decreases "npta" by one unless it is already
+ *	zero.  Always returns 0.
+ */
+int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (!llc->remote_busy_flag || llc->dec_step)
+		return 0;
+	if (llc->dec_cntr) {
+		--llc->dec_cntr;
+		return 0;
+	}
+
+	llc->dec_cntr = 2;
+	llc->inc_cntr = 2;
+	if (llc->npta > 0)
+		--llc->npta;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_dec_tx_win_size - decreases tx window size
+ *	@sk: current connection structure.
+ *	@skb: current event (not used).
+ *
+ *	Reduces the transmit window size "k" by the length of pdu_unack_q
+ *	(taken as a u8), never letting it drop below 1.  Always returns 0.
+ */
+int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	u8 outstanding = skb_queue_len(&llc->pdu_unack_q);
+
+	if (llc->k - outstanding >= 1)
+		llc->k -= outstanding;
+	else
+		llc->k = 1;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_inc_tx_win_size - tx window size is inc by 1
+ *	@sk: current connection structure.
+ *	@skb: current event (not used).
+ *
+ *	Increases the transmit window size "k" by one, capped at
+ *	(u8)~LLC_2_SEQ_NBR_MODULO.  Always returns 0.
+ */
+int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	const u8 k_max = (u8) ~LLC_2_SEQ_NBR_MODULO;
+
+	llc->k += 1;
+	if (llc->k > k_max)
+		llc->k = k_max;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_stop_all_timers - stop every connection timer
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Deletes the P/F cycle, ack, rej_sent and busy-state timers and clears
+ *	the pending-acknowledgement state (ack_must_be_send and ack_pf).
+ *	Always returns 0.
+ */
+int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	del_timer(&llc->pf_cycle_timer.timer);
+	del_timer(&llc->ack_timer.timer);
+	del_timer(&llc->rej_sent_timer.timer);
+	del_timer(&llc->busy_state_timer.timer);
+	llc->ack_must_be_send = 0;
+	llc->ack_pf = 0;
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_stop_other_timers - stop every timer except the ack timer
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Deletes the rej_sent, P/F cycle and busy-state timers and clears the
+ *	pending-acknowledgement state (ack_must_be_send and ack_pf).  The ack
+ *	timer is left untouched.  Always returns 0.
+ */
+int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	del_timer(&llc->rej_sent_timer.timer);
+	del_timer(&llc->pf_cycle_timer.timer);
+	del_timer(&llc->busy_state_timer.timer);
+	llc->ack_must_be_send = 0;
+	llc->ack_pf = 0;
+	return 0;
+}
+
+/* (Re)arms the ack timer with its configured expiry.  Always returns 0. */
+int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire);
+	return 0;
+}
+
+/* (Re)arms the rej_sent timer with its configured expiry.  Always returns 0. */
+int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	mod_timer(&llc->rej_sent_timer.timer,
+		  jiffies + llc->rej_sent_timer.expire);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_start_ack_tmr_if_not_running - arm the ack timer if idle
+ *	@sk: current connection structure
+ *	@skb: current event (not used)
+ *
+ *	Arms the ack timer with its configured expiry unless it is already
+ *	pending.  Always returns 0.
+ */
+int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk,
+					     struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (timer_pending(&llc->ack_timer.timer))
+		return 0;
+
+	mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire);
+	return 0;
+}
+
+/* Deletes the ack timer.  Always returns 0. */
+int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb)
+{
+	del_timer(&llc_sk(sk)->ack_timer.timer);
+	return 0;
+}
+
+/* Deletes the P/F cycle timer and sets p_flag to 0.  Always returns 0. */
+int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	del_timer(&llc->pf_cycle_timer.timer);
+	llc_conn_set_p_flag(sk, 0);
+	return 0;
+}
+
+/* Deletes the rej_sent timer.  Always returns 0. */
+int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb)
+{
+	del_timer(&llc_sk(sk)->rej_sent_timer.timer);
+	return 0;
+}
+
+/**
+ *	llc_conn_ac_upd_nr_received - process the N(R) of a received PDU
+ *	@sk: current connection structure
+ *	@skb: current event; its PDU supplies N(R)
+ *
+ *	Stores the received N(R) in last_nr and removes the PDUs it
+ *	acknowledges.  If any were acknowledged, or the device is a loopback
+ *	device, the retry count is reset, the ack timer is stopped and then
+ *	restarted when unacknowledged PDUs remain, and a previously failed
+ *	data request is confirmed.  Otherwise a previously failed data
+ *	request is confirmed only when the PDU's P/F bit is 1.  Always
+ *	returns 0.
+ */
+int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb)
+{
+	int acked;
+	u16 unacked = 0;
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc->last_nr = PDU_SUPV_GET_Nr(pdu);
+	acked = llc_conn_remove_acked_pdus(sk, llc->last_nr, &unacked);
+	/* On loopback we don't queue I frames in unack_pdu_q queue. */
+	if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) {
+		llc->retry_count = 0;
+		del_timer(&llc->ack_timer.timer);
+		if (llc->failed_data_req) {
+			/* already, we did not accept data from upper layer
+			 * (tx_window full or unacceptable state). Now, we
+			 * can send data and must inform to upper layer.
+			 */
+			llc->failed_data_req = 0;
+			llc_conn_ac_data_confirm(sk, skb);
+		}
+		/* PDUs are still outstanding: restart the ack timer. */
+		if (unacked)
+			mod_timer(&llc->ack_timer.timer,
+				  jiffies + llc->ack_timer.expire);
+	} else if (llc->failed_data_req) {
+		u8 f_bit;
+
+		llc_pdu_decode_pf_bit(skb, &f_bit);
+		if (f_bit == 1) {
+			llc->failed_data_req = 0;
+			llc_conn_ac_data_confirm(sk, skb);
+		}
+	}
+	return 0;
+}
+
+/*
+ * When the received PDU is a response with a non-zero P/F bit, clear the
+ * connection's P flag and stop the P timer. Always returns 0.
+ */
+int llc_conn_ac_upd_p_flag(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	u8 f_bit;
+
+	if (LLC_PDU_IS_RSP(pdu)) {
+		llc_pdu_decode_pf_bit(skb, &f_bit);
+		if (f_bit) {
+			llc_conn_set_p_flag(sk, 0);
+			llc_conn_ac_stop_p_timer(sk, skb);
+		}
+	}
+	return 0;
+}
+
+/* Set the connection's data_flag to 2. @skb is not used. Returns 0. */
+int llc_conn_ac_set_data_flag_2(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->data_flag = 2;
+	return 0;
+}
+
+/* Set the connection's data_flag to 0. @skb is not used. Returns 0. */
+int llc_conn_ac_set_data_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->data_flag = 0;
+	return 0;
+}
+
+/* Set the connection's data_flag to 1. @skb is not used. Returns 0. */
+int llc_conn_ac_set_data_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->data_flag = 1;
+	return 0;
+}
+
+/*
+ * Raise data_flag to 1, but only when it is currently 0; any other
+ * value is left untouched. Always returns 0.
+ */
+int llc_conn_ac_set_data_flag_1_if_data_flag_eq_0(struct sock *sk,
+						  struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	if (conn->data_flag == 0)
+		conn->data_flag = 1;
+	return 0;
+}
+
+/*
+ * Action wrapper: calls llc_conn_set_p_flag() with value 0.
+ * @skb is not used. Always returns 0.
+ */
+int llc_conn_ac_set_p_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_set_p_flag(sk, 0);
+	return 0;
+}
+
+/*
+ * Action wrapper: calls llc_conn_set_p_flag() with value 1.
+ * @skb is not used. Always returns 0.
+ */
+static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_set_p_flag(sk, 1);
+	return 0;
+}
+
+/* Clear the connection's remote_busy_flag. Always returns 0. */
+int llc_conn_ac_set_remote_busy_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->remote_busy_flag = 0;
+	return 0;
+}
+
+/* Set the connection's cause_flag to 0. Always returns 0. */
+int llc_conn_ac_set_cause_flag_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->cause_flag = 0;
+	return 0;
+}
+
+/* Set the connection's cause_flag to 1. Always returns 0. */
+int llc_conn_ac_set_cause_flag_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->cause_flag = 1;
+	return 0;
+}
+
+/* Reset the connection's retry_count to 0. Always returns 0. */
+int llc_conn_ac_set_retry_cnt_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->retry_count = 0;
+	return 0;
+}
+
+/* Increment the connection's retry_count by one. Always returns 0. */
+int llc_conn_ac_inc_retry_cnt_by_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->retry_count += 1;
+	return 0;
+}
+
+/* Reset the connection's vR field to 0. Always returns 0. */
+int llc_conn_ac_set_vr_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->vR = 0;
+	return 0;
+}
+
+/*
+ * Advance the connection's vR field to the value produced by
+ * PDU_GET_NEXT_Vr() for the current one. Always returns 0.
+ */
+int llc_conn_ac_inc_vr_by_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->vR = PDU_GET_NEXT_Vr(conn->vR);
+	return 0;
+}
+
+/* Reset the connection's vS field to 0. Always returns 0. */
+int llc_conn_ac_set_vs_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->vS = 0;
+	return 0;
+}
+
+/* Copy the connection's last_nr into its vS field. Always returns 0. */
+int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->vS = conn->last_nr;
+	return 0;
+}
+
+/*
+ * Advance the connection's vS field by one, wrapping at
+ * LLC_2_SEQ_NBR_MODULO. Always returns 0.
+ */
+static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->vS = (conn->vS + 1) % LLC_2_SEQ_NBR_MODULO;
+	return 0;
+}
+
+/*
+ * Shared body of the connection timer callbacks.
+ * @timeout_data: the owning struct sock pointer, passed as an integer
+ * @type: event type stored in the generated event
+ *
+ * Allocates an empty skb to carry the timer event, charges it to the
+ * socket and passes it to llc_process_tmr_ev() under bh_lock_sock().
+ * If the atomic allocation fails the expiry is dropped without notice.
+ */
+static void llc_conn_tmr_common_cb(unsigned long timeout_data, u8 type)
+{
+	struct sock *sk = (struct sock *)timeout_data;
+	struct sk_buff *ev_skb = alloc_skb(0, GFP_ATOMIC);
+	struct llc_conn_state_ev *ev;
+
+	bh_lock_sock(sk);
+	if (!ev_skb)
+		goto unlock;
+
+	ev = llc_conn_ev(ev_skb);
+	skb_set_owner_r(ev_skb, sk);
+	ev->type = type;
+	llc_process_tmr_ev(sk, ev_skb);
+unlock:
+	bh_unlock_sock(sk);
+}
+
+/* Timer callback: raises an LLC_CONN_EV_TYPE_P_TMR event on the socket. */
+void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data)
+{
+	const u8 ev_type = LLC_CONN_EV_TYPE_P_TMR;
+
+	llc_conn_tmr_common_cb(timeout_data, ev_type);
+}
+
+/* Timer callback: raises an LLC_CONN_EV_TYPE_BUSY_TMR event on the socket. */
+void llc_conn_busy_tmr_cb(unsigned long timeout_data)
+{
+	const u8 ev_type = LLC_CONN_EV_TYPE_BUSY_TMR;
+
+	llc_conn_tmr_common_cb(timeout_data, ev_type);
+}
+
+/* Timer callback: raises an LLC_CONN_EV_TYPE_ACK_TMR event on the socket. */
+void llc_conn_ack_tmr_cb(unsigned long timeout_data)
+{
+	const u8 ev_type = LLC_CONN_EV_TYPE_ACK_TMR;
+
+	llc_conn_tmr_common_cb(timeout_data, ev_type);
+}
+
+/* Timer callback: raises an LLC_CONN_EV_TYPE_REJ_TMR event on the socket. */
+void llc_conn_rej_tmr_cb(unsigned long timeout_data)
+{
+	const u8 ev_type = LLC_CONN_EV_TYPE_REJ_TMR;
+
+	llc_conn_tmr_common_cb(timeout_data, ev_type);
+}
+
+/*
+ * Save the current vS in the connection's X field, then reload vS from
+ * last_nr via llc_conn_ac_set_vs_nr(). Always returns 0.
+ */
+int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	conn->X = conn->vS;
+	llc_conn_ac_set_vs_nr(sk, skb);
+	return 0;
+}
+
+/*
+ * Reload vS from last_nr, but only when the N(R) of the received PDU
+ * lies circularly between the current vS and the saved X value.
+ * Always returns 0.
+ */
+int llc_conn_ac_upd_vs(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	struct llc_sock *conn = llc_sk(sk);
+	u8 nr = PDU_SUPV_GET_Nr(pdu);
+
+	if (!llc_circular_between(conn->vS, nr, conn->X))
+		return 0;
+	llc_conn_ac_set_vs_nr(sk, skb);
+	return 0;
+}
+
+/*
+ * Non-standard actions: these are not part of the IEEE specification and
+ * exist for our own use.
+ */
+/**
+ *	llc_conn_disc - placeholder disconnect action
+ *	@sk: closed connection (unused)
+ *	@skb: occurred event (unused)
+ *
+ *	Currently a no-op: the body does not touch the connection and
+ *	always returns 0.
+ */
+int llc_conn_disc(struct sock *sk, struct sk_buff *skb)
+{
+	/* FIXME: this thing seems to want to die */
+	return 0;
+}
+
+/**
+ *	llc_conn_reset - resets connection
+ *	@sk: connection being reset.
+ *	@skb: occurred event (unused).
+ *
+ *	Delegates to llc_sk_reset(), which is expected to stop all timers,
+ *	empty all queues and reset all flags. Always returns 0.
+ */
+int llc_conn_reset(struct sock *sk, struct sk_buff *skb)
+{
+	llc_sk_reset(sk);
+	return 0;
+}
+
+/**
+ *	llc_circular_between - tell whether b lies circularly between a and c
+ *	@a: lower bound
+ *	@b: element to test against the range from a to c
+ *	@c: upper bound
+ *
+ *	Both b and c are rebased on a with wrapping u8 arithmetic and then
+ *	compared, so the range may wrap around (for example, 0 is between
+ *	127 and 1). Returns 1 if b is between a and c, 0 otherwise.
+ */
+u8 llc_circular_between(u8 a, u8 b, u8 c)
+{
+	const u8 offset = b - a;
+	const u8 span = c - a;
+
+	return offset <= span;
+}
+
+/**
+ *	llc_process_tmr_ev - timer backend
+ *	@sk: active connection
+ *	@skb: timer event
+ *
+ *	Called from the timer callbacks. An event for a connection in the
+ *	LLC_CONN_OUT_OF_SVC state is logged and freed. Otherwise the event
+ *	goes straight to the connection state machine, unless the socket is
+ *	currently owned by a user context, in which case it is tagged as an
+ *	LLC_EVENT and put on the socket backlog.
+ */
+static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb)
+{
+	if (llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) {
+		printk(KERN_WARNING "%s: timer called on closed connection\n",
+		       __func__);
+		kfree_skb(skb);
+		return;
+	}
+
+	if (sock_owned_by_user(sk)) {
+		llc_set_backlog_type(skb, LLC_EVENT);
+		__sk_add_backlog(sk, skb);
+		return;
+	}
+
+	llc_conn_state_process(sk, skb);
+}
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
new file mode 100644
index 00000000..523fdd1c
--- /dev/null
+++ b/net/llc/llc_c_ev.c
@@ -0,0 +1,748 @@
+/*
+ * llc_c_ev.c - Connection component state transition event qualifiers
+ *
+ * A 'state' consists of a number of possible event matching functions,
+ * the actions associated with each being executed when that event is
+ * matched; a 'state machine' accepts events in a serial fashion from an
+ * event queue. Each event is passed to each successive event matching
+ * function until a match is made (the event matching function returns
+ * success, or '0') or the list of event matching functions is exhausted.
+ * If a match is made, the actions associated with the event are executed
+ * and the state is changed to that event's transition state. Before some
+ * events are recognized, even after a match has been made, a certain
+ * number of 'event qualifier' functions must also be executed. If these
+ * all execute successfully, then the event is finally executed.
+ *
+ * These event functions must return 0 for success, to show a matched
+ * event, or 1 if the event does not match. Event qualifier functions
+ * must return a 0 for success or a non-zero for failure. Each function
+ * is simply responsible for verifying one single thing and returning
+ * either a success or failure.
+ *
+ * All of the following event functions are described in the 802.2 LLC
+ * Protocol standard document, except for two functions that we added;
+ * those are explained in their own comments below.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/netdevice.h>
+#include <net/llc_conn.h>
+#include <net/llc_sap.h>
+#include <net/sock.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_pdu.h>
+
+/*
+ * Debug trace helper: with "#if 1" dprintk() expands to a KERN_DEBUG
+ * printk(); change the condition to 0 to compile the traces out.
+ */
+#if 1
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+/**
+ *	llc_util_ns_inside_rx_window - check if sequence number is in rx window
+ *	@ns: sequence number of received pdu.
+ *	@vr: sequence number which receiver expects to receive.
+ *	@rw: receive window size of receiver.
+ *
+ *	The window runs circularly from @vr to @vr + @rw - 1 (modulo
+ *	LLC_2_SEQ_NBR_MODULO). Returns 0 when @ns is inside it, 1 otherwise.
+ */
+static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw)
+{
+	const u8 window_end = (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO;
+
+	return !llc_circular_between(vr, ns, window_end);
+}
+
+/**
+ *	llc_util_nr_inside_tx_window - check if sequence number is in tx window
+ *	@sk: current connection.
+ *	@nr: N(R) of received PDU.
+ *
+ *	Checks @nr against the range spanned by the N(S) of the first
+ *	unacknowledged PDU and one past the N(S) of the last one. Loopback
+ *	devices always pass; an empty unacknowledged queue always fails.
+ *	Returns 0 for success, 1 otherwise.
+ */
+static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct llc_pdu_sn *pdu;
+	u8 first_ns, last_ns;
+
+	if (llc->dev->flags & IFF_LOOPBACK)
+		return 0;
+	if (skb_queue_empty(&llc->pdu_unack_q))
+		return 1;
+
+	pdu = llc_pdu_sn_hdr(skb_peek(&llc->pdu_unack_q));
+	first_ns = LLC_I_GET_NS(pdu);
+	pdu = llc_pdu_sn_hdr(skb_peek_tail(&llc->pdu_unack_q));
+	last_ns = LLC_I_GET_NS(pdu);
+
+	return !llc_circular_between(first_ns, nr,
+				     (last_ns + 1) % LLC_2_SEQ_NBR_MODULO);
+}
+
+/* Match an LLC_CONN_PRIM request primitive. Returns 0 on match, else 1. */
+int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->prim == LLC_CONN_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ)
+		return 0;
+	return 1;
+}
+
+/* Match an LLC_DATA_PRIM request primitive. Returns 0 on match, else 1. */
+int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->prim == LLC_DATA_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ)
+		return 0;
+	return 1;
+}
+
+/* Match an LLC_DISC_PRIM request primitive. Returns 0 on match, else 1. */
+int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->prim == LLC_DISC_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ)
+		return 0;
+	return 1;
+}
+
+/* Match an LLC_RESET_PRIM request primitive. Returns 0 on match, else 1. */
+int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->prim == LLC_RESET_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match a simple event whose prim_type is
+ * LLC_CONN_EV_LOCAL_BUSY_DETECTED. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+	    ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match a simple event whose prim_type is
+ * LLC_CONN_EV_LOCAL_BUSY_CLEARED. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+	    ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED)
+		return 0;
+	return 1;
+}
+
+/*
+ * Event matcher that never matches: it ignores both arguments and
+ * unconditionally returns 1 ("no match").
+ */
+int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb)
+{
+	return 1;
+}
+
+/* Match a U-format DISC command, any P bit. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+	    LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC)
+		return 0;
+	return 1;
+}
+
+/* Match a U-format DM response, any F bit. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+	    LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM)
+		return 0;
+	return 1;
+}
+
+/* Match a U-format FRMR response, any F bit. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+	    LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format command with P bit 0 whose N(S) equals the
+ * connection's vR, provided llc_conn_space() accepts the skb.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (llc_conn_space(sk, skb) &&
+	    LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_0(pdu) &&
+	    LLC_I_GET_NS(pdu) == llc_sk(sk)->vR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format command with P bit 1 whose N(S) equals the
+ * connection's vR, provided llc_conn_space() accepts the skb.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (llc_conn_space(sk, skb) &&
+	    LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_1(pdu) &&
+	    LLC_I_GET_NS(pdu) == llc_sk(sk)->vR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format command with P bit 0 whose N(S) differs from vR but
+ * still falls inside the receive window. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_0(pdu) && ns != vr &&
+	    !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw))
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format command with P bit 1 whose N(S) differs from vR but
+ * still falls inside the receive window. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_1(pdu) && ns != vr &&
+	    !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw))
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format command (any P bit) whose N(S) differs from vR and
+ * lies outside the receive window; a debug trace is emitted on a match.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk,
+					     struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    ns != vr &&
+	    llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw)) {
+		dprintk("%s: matched, state=%d, ns=%d, vr=%d\n",
+			__func__, llc_sk(sk)->state, ns, vr);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Match an I-format response with F bit 0 whose N(S) equals the
+ * connection's vR, provided llc_conn_space() accepts the skb.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (llc_conn_space(sk, skb) &&
+	    LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_0(pdu) &&
+	    LLC_I_GET_NS(pdu) == llc_sk(sk)->vR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format response with F bit 1 whose N(S) equals the
+ * connection's vR. Unlike the F=0 variant, no llc_conn_space() check is
+ * made here. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_1(pdu) &&
+	    LLC_I_GET_NS(pdu) == llc_sk(sk)->vR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format response (any F bit) whose N(S) equals the
+ * connection's vR, provided llc_conn_space() accepts the skb.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (llc_conn_space(sk, skb) &&
+	    LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_GET_NS(pdu) == llc_sk(sk)->vR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format response with F bit 0 whose N(S) differs from vR but
+ * still falls inside the receive window. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_0(pdu) && ns != vr &&
+	    !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw))
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format response with F bit 1 whose N(S) differs from vR but
+ * still falls inside the receive window. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    LLC_I_PF_IS_1(pdu) && ns != vr &&
+	    !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw))
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format response (any F bit) whose N(S) differs from vR but
+ * still falls inside the receive window. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr &&
+	    !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw))
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an I-format response (any F bit) whose N(S) differs from vR and
+ * lies outside the receive window; a debug trace is emitted on a match.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk,
+					     struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vr = llc_sk(sk)->vR;
+	const u8 ns = LLC_I_GET_NS(pdu);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) &&
+	    ns != vr &&
+	    llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw)) {
+		dprintk("%s: matched, state=%d, ns=%d, vr=%d\n",
+			__func__, llc_sk(sk)->state, ns, vr);
+		return 0;
+	}
+	return 1;
+}
+
+/* Match an S-format REJ command with P bit 0. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_0(pdu) &&
+	    LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format REJ command with P bit 1. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_1(pdu) &&
+	    LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format REJ response with F bit 0. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_0(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format REJ response with F bit 1. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_1(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format REJ response, any F bit. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format RNR command with P bit 0. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_0(pdu) &&
+	    LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format RNR command with P bit 1. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_1(pdu) &&
+	    LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format RNR response with F bit 0. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_0(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format RNR response with F bit 1. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_1(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format RR command with P bit 0. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_0(pdu) &&
+	    LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR)
+		return 0;
+	return 1;
+}
+
+/* Match an S-format RR command with P bit 1. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_1(pdu) &&
+	    LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an S-format RR response with F bit 0, provided llc_conn_space()
+ * accepts the skb. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (llc_conn_space(sk, skb) &&
+	    LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_0(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match an S-format RR response with F bit 1, provided llc_conn_space()
+ * accepts the skb. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (llc_conn_space(sk, skb) &&
+	    LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) &&
+	    LLC_S_PF_IS_1(pdu) &&
+	    LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR)
+		return 0;
+	return 1;
+}
+
+/* Match a U-format SABME command, any P bit. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+	    LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME)
+		return 0;
+	return 1;
+}
+
+/* Match a U-format UA response, any F bit. Returns 0 on match, else 1. */
+int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) &&
+	    LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA)
+		return 0;
+	return 1;
+}
+
+/*
+ * Match any command PDU with the P bit set: I- and S-format PDUs are
+ * tested with LLC_I_PF_IS_1(), U-format PDUs with LLC_U_PF_IS_1().
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu)) {
+		if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
+			return LLC_I_PF_IS_1(pdu) ? 0 : 1;
+		if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Match any command PDU regardless of the P bit: every I- or S-format
+ * command, and the U-format SABME and DISC commands.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_CMD(pdu)) {
+		if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
+			return 0;
+		if (LLC_PDU_TYPE_IS_U(pdu) &&
+		    (LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ||
+		     LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Match any response PDU regardless of the F bit: every I- or S-format
+ * response, and the U-format UA, DM and FRMR responses.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (LLC_PDU_IS_RSP(pdu)) {
+		if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu))
+			return 0;
+		if (LLC_PDU_TYPE_IS_U(pdu) &&
+		    (LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ||
+		     LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ||
+		     LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Match an I- or S-format command whose N(R) differs from vS and is
+ * rejected by llc_util_nr_inside_tx_window(); a debug trace is emitted
+ * on a match. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk,
+					       struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vs = llc_sk(sk)->vS;
+	const u8 nr = LLC_I_GET_NR(pdu);
+
+	if (!(LLC_PDU_IS_CMD(pdu) &&
+	      (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) &&
+	      nr != vs && llc_util_nr_inside_tx_window(sk, nr)))
+		return 1;
+
+	dprintk("%s: matched, state=%d, vs=%d, nr=%d\n",
+		__func__, llc_sk(sk)->state, vs, nr);
+	return 0;
+}
+
+/*
+ * Match an I- or S-format response whose N(R) differs from vS and is
+ * rejected by llc_util_nr_inside_tx_window(); a debug trace is emitted
+ * on a match. Returns 0 on match, else 1.
+ */
+int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk,
+					       struct sk_buff *skb)
+{
+	const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	const u8 vs = llc_sk(sk)->vS;
+	const u8 nr = LLC_I_GET_NR(pdu);
+
+	if (!(LLC_PDU_IS_RSP(pdu) &&
+	      (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) &&
+	      nr != vs && llc_util_nr_inside_tx_window(sk, nr)))
+		return 1;
+
+	dprintk("%s: matched, state=%d, vs=%d, nr=%d\n",
+		__func__, llc_sk(sk)->state, vs, nr);
+	return 0;
+}
+
+/*
+ * Catch-all event matcher: ignores both arguments and unconditionally
+ * returns 0 ("match").
+ */
+int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb)
+{
+	return 0;
+}
+
+/* Match an LLC_CONN_EV_TYPE_P_TMR event. Returns 0 on match, else 1. */
+int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_P_TMR)
+		return 0;
+	return 1;
+}
+
+/* Match an LLC_CONN_EV_TYPE_ACK_TMR event. Returns 0 on match, else 1. */
+int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR)
+		return 0;
+	return 1;
+}
+
+/* Match an LLC_CONN_EV_TYPE_REJ_TMR event. Returns 0 on match, else 1. */
+int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_REJ_TMR)
+		return 0;
+	return 1;
+}
+
+/* Match an LLC_CONN_EV_TYPE_BUSY_TMR event. Returns 0 on match, else 1. */
+int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_BUSY_TMR)
+		return 0;
+	return 1;
+}
+
+/*
+ * Event matcher that never matches: it ignores both arguments and
+ * unconditionally returns 1 ("no match").
+ */
+int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb)
+{
+	return 1;
+}
+
+/*
+ * Match a simple event whose prim_type is LLC_CONN_EV_TX_BUFF_FULL.
+ * Returns 0 on match, else 1.
+ */
+int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	if (ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+	    ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL)
+		return 0;
+	return 1;
+}
+
+/* Event qualifier functions
+ *
+ * these functions simply verify the value of a state flag associated with
+ * the connection and return either a 0 for success or a non-zero value
+ * for not-success; verify the event is the type we expect
+ */
+/* Qualifier: returns 0 when data_flag equals 1, 1 otherwise. */
+int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	if (llc_sk(sk)->data_flag == 1)
+		return 0;
+	return 1;
+}
+
+/*
+ * Qualifier: succeeds (returns 0) when data_flag is 0; otherwise the
+ * non-zero flag value itself is returned.
+ */
+int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_sock *conn = llc_sk(sk);
+
+	return conn->data_flag;
+}
+
+/* Qualifier: returns 0 when data_flag equals 2, 1 otherwise. */
+int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb)
+{
+	if (llc_sk(sk)->data_flag == 2)
+		return 0;
+	return 1;
+}
+
+/* Qualifier: returns 0 when p_flag equals 1, 1 otherwise. */
+int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	if (llc_sk(sk)->p_flag == 1)
+		return 0;
+	return 1;
+}
+
+/**
+ *	llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
+ *	@sk: current connection structure.
+ *	@skb: current event.
+ *
+ *	The frame about to be sent is the last one of the transmit window
+ *	when the number of unacknowledged PDUs plus one equals the window
+ *	size k. Used to send that frame as an I-format command with the
+ *	p-bit set to one. Returns 0 if the frame is the last frame, 1
+ *	otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	return skb_queue_len(&conn->pdu_unack_q) + 1 != conn->k;
+}
+
+/**
+ *	llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
+ *	@sk: current connection structure.
+ *	@skb: current event.
+ *
+ *	The frame about to be sent is not the last one of the transmit
+ *	window when the number of unacknowledged PDUs plus one differs from
+ *	the window size k. Returns 0 if the frame isn't the last frame, 1
+ *	otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	return skb_queue_len(&conn->pdu_unack_q) + 1 == conn->k;
+}
+
+/*
+ * Qualifier: succeeds (returns 0) when p_flag is 0; otherwise the
+ * non-zero flag value itself is returned.
+ */
+int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_sock *conn = llc_sk(sk);
+
+	return conn->p_flag;
+}
+
+/*
+ * Qualifier: returns 0 when the connection's p_flag equals the P/F bit
+ * decoded from the PDU in @skb, 1 otherwise.
+ */
+int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb)
+{
+	u8 f_bit;
+
+	llc_pdu_decode_pf_bit(skb, &f_bit);
+	if (llc_sk(sk)->p_flag == f_bit)
+		return 0;
+	return 1;
+}
+
+/*
+ * Qualifier: succeeds (returns 0) when remote_busy_flag is 0; otherwise
+ * the non-zero flag value itself is returned.
+ */
+int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_sock *conn = llc_sk(sk);
+
+	return conn->remote_busy_flag;
+}
+
+/* Qualifier: returns 0 when remote_busy_flag is non-zero, 1 when it is 0. */
+int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->remote_busy_flag == 0;
+}
+
+/* Qualifier: returns 0 while retry_count is below n2, 1 otherwise. */
+int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	return conn->retry_count >= conn->n2;
+}
+
+/* Qualifier: returns 0 once retry_count has reached n2, 1 otherwise. */
+int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *conn = llc_sk(sk);
+
+	return conn->retry_count < conn->n2;
+}
+
+/* Qualifier: returns 0 when s_flag is non-zero, 1 when it is 0. */
+int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->s_flag == 0;
+}
+
+/*
+ * Qualifier: succeeds (returns 0) when s_flag is 0; otherwise the
+ * non-zero flag value itself is returned.
+ */
+int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_sock *conn = llc_sk(sk);
+
+	return conn->s_flag;
+}
+
+/* Qualifier: returns 0 when cause_flag is non-zero, 1 when it is 0. */
+int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->cause_flag == 0;
+}
+
+/*
+ * Qualifier: succeeds (returns 0) when cause_flag is 0; otherwise the
+ * non-zero flag value itself is returned.
+ */
+int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	const struct llc_sock *conn = llc_sk(sk);
+
+	return conn->cause_flag;
+}
+
+/* Store LLC_STATUS_CONN in the event's status field; always succeeds (0). */
+int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_CONN;
+	return 0;
+}
+
+/* Store LLC_STATUS_DISC in the event's status field; always succeeds (0). */
+int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_DISC;
+	return 0;
+}
+
+/* Store LLC_STATUS_FAILED in the event's status field; always succeeds (0). */
+int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_FAILED;
+	return 0;
+}
+
+/*
+ * Store LLC_STATUS_REMOTE_BUSY in the event's status field; always
+ * succeeds (0).
+ */
+int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_REMOTE_BUSY;
+	return 0;
+}
+
+/* Store LLC_STATUS_REFUSE in the event's status field; always succeeds (0). */
+int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_REFUSE;
+	return 0;
+}
+
+/*
+ * Store LLC_STATUS_CONFLICT in the event's status field; always
+ * succeeds (0).
+ */
+int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_CONFLICT;
+	return 0;
+}
+
+/*
+ * Store LLC_STATUS_RESET_DONE in the event's status field; always
+ * succeeds (0).
+ */
+int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb)
+{
+	llc_conn_ev(skb)->status = LLC_STATUS_RESET_DONE;
+	return 0;
+}
diff --git a/net/llc/llc_c_st.c b/net/llc/llc_c_st.c
new file mode 100644
index 00000000..818a9428
--- /dev/null
+++ b/net/llc/llc_c_st.c
@@ -0,0 +1,4946 @@
+/*
+ * llc_c_st.c - This module contains state transition of connection component.
+ *
+ * Descriptions of the event functions and actions can be found in the
+ * 802.2 LLC standard, or in the "llc_c_ac.c" and "llc_c_ev.c" modules.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/types.h>
+#include <net/llc_if.h>
+#include <net/llc_sap.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+
+/* Used as .ev_qualifiers for transitions that have no event qualifiers. */
+#define NONE NULL
+
+/* COMMON CONNECTION STATE transitions
+ * Common transitions for
+ * LLC_CONN_STATE_NORMAL,
+ * LLC_CONN_STATE_BUSY,
+ * LLC_CONN_STATE_REJ,
+ * LLC_CONN_STATE_AWAIT,
+ * LLC_CONN_STATE_AWAIT_BUSY and
+ * LLC_CONN_STATE_AWAIT_REJ states
+ */
+/*
+ * LLC_CONN_EV_DISC_REQ: no qualifiers, next state LLC_CONN_STATE_D_CONN.
+ * The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_1[] = {
+	llc_conn_ac_send_disc_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_cause_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_1 = {
+	.ev	       = llc_conn_ev_disc_req,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_1,
+	.next_state    = LLC_CONN_STATE_D_CONN,
+};
+
+/*
+ * LLC_CONN_EV_RESET_REQ: no qualifiers, next state LLC_CONN_STATE_RESET.
+ * The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_2[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_cause_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_2 = {
+	.ev	       = llc_conn_ev_rst_req,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_2,
+	.next_state    = LLC_CONN_STATE_RESET,
+};
+
+/*
+ * LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X: no qualifiers, next state
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_3[] = {
+	llc_conn_ac_stop_all_timers,
+	llc_conn_ac_set_vs_0,
+	llc_conn_ac_set_vr_0,
+	llc_conn_ac_send_ua_rsp_f_set_p,
+	llc_conn_ac_rst_ind,
+	llc_conn_ac_set_p_flag_0,
+	llc_conn_ac_set_remote_busy_0,
+	llc_conn_reset,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_3 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_3,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X: no qualifiers, next state
+ * LLC_CONN_STATE_ADM.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_4[] = {
+	llc_conn_ac_stop_all_timers,
+	llc_conn_ac_send_ua_rsp_f_set_p,
+	llc_conn_ac_disc_ind,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_4 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_4,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X: no qualifiers, next state
+ * LLC_CONN_STATE_RESET.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_5[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_rst_ind,
+	llc_conn_ac_set_cause_flag_0,
+	llc_conn_reset,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_5 = {
+	.ev	       = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_5,
+	.next_state    = LLC_CONN_STATE_RESET,
+};
+
+/*
+ * LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X: no qualifiers, next state
+ * LLC_CONN_STATE_ADM.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_6[] = {
+	llc_conn_ac_disc_ind,
+	llc_conn_ac_stop_all_timers,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_6 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_6,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * LLC_CONN_EV_RX_ZZZ_CMD_Pbit_SET_X_INVAL_Nr: no qualifiers, next state
+ * LLC_CONN_STATE_ERROR.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_7a[] = {
+	llc_conn_ac_send_frmr_rsp_f_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_7a = {
+	.ev	       = llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_7a,
+	.next_state    = LLC_CONN_STATE_ERROR,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_X_INVAL_Ns: no qualifiers, next state
+ * LLC_CONN_STATE_ERROR.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_7b[] = {
+	llc_conn_ac_send_frmr_rsp_f_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_7b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_7b,
+	.next_state    = LLC_CONN_STATE_ERROR,
+};
+
+/*
+ * LLC_CONN_EV_RX_ZZZ_RSP_Fbit_SET_X_INVAL_Nr: no qualifiers, next state
+ * LLC_CONN_STATE_ERROR.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_8a[] = {
+	llc_conn_ac_send_frmr_rsp_f_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_8a = {
+	.ev	       = llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_8a,
+	.next_state    = LLC_CONN_STATE_ERROR,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_INVAL_Ns: no qualifiers, next state
+ * LLC_CONN_STATE_ERROR.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_8b[] = {
+	llc_conn_ac_send_frmr_rsp_f_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_8b = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_8b,
+	.next_state    = LLC_CONN_STATE_ERROR,
+};
+
+/*
+ * LLC_CONN_EV_RX_BAD_PDU: no qualifiers, next state LLC_CONN_STATE_ERROR.
+ * The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_8c[] = {
+	llc_conn_ac_send_frmr_rsp_f_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_8c = {
+	.ev	       = llc_conn_ev_rx_bad_pdu,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_8c,
+	.next_state    = LLC_CONN_STATE_ERROR,
+};
+
+/*
+ * LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X: no qualifiers, next state
+ * LLC_CONN_STATE_ERROR.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_common_actions_9[] = {
+	llc_conn_ac_send_frmr_rsp_f_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_9 = {
+	.ev	       = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_common_actions_9,
+	.next_state    = LLC_CONN_STATE_ERROR,
+};
+
+/*
+ * State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_1 event.
+ * Everything up to the matching #endif is compiled out by "#if 0", so
+ * none of these three objects exist in the built module.
+ */
+#if 0
+static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_10[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_common_actions_10[] = {
+	[0] = llc_conn_ac_send_frmr_rsp_f_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_stop_other_timers,
+	[3] = llc_conn_ac_set_retry_cnt_0,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_10 = {
+	.ev	       = llc_conn_ev_rx_xxx_rsp_fbit_set_1,
+	.next_state    = LLC_CONN_STATE_ERROR,
+	.ev_qualifiers = llc_common_ev_qfyrs_10,
+	.ev_actions    = llc_common_actions_10,
+};
+#endif
+
+/*
+ * LLC_CONN_EV_P_TMR_EXP, guarded by llc_conn_ev_qlfy_retry_cnt_gte_n2:
+ * next state LLC_CONN_STATE_RESET.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11a[] = {
+	llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_common_actions_11a[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_cause_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11a = {
+	.ev	       = llc_conn_ev_p_tmr_exp,
+	.ev_qualifiers = llc_common_ev_qfyrs_11a,
+	.ev_actions    = llc_common_actions_11a,
+	.next_state    = LLC_CONN_STATE_RESET,
+};
+
+/*
+ * LLC_CONN_EV_ACK_TMR_EXP, guarded by llc_conn_ev_qlfy_retry_cnt_gte_n2:
+ * next state LLC_CONN_STATE_RESET.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11b[] = {
+	llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_common_actions_11b[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_cause_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11b = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.ev_qualifiers = llc_common_ev_qfyrs_11b,
+	.ev_actions    = llc_common_actions_11b,
+	.next_state    = LLC_CONN_STATE_RESET,
+};
+
+/*
+ * LLC_CONN_EV_REJ_TMR_EXP, guarded by llc_conn_ev_qlfy_retry_cnt_gte_n2:
+ * next state LLC_CONN_STATE_RESET.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11c[] = {
+	llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_common_actions_11c[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_cause_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11c = {
+	.ev	       = llc_conn_ev_rej_tmr_exp,
+	.ev_qualifiers = llc_common_ev_qfyrs_11c,
+	.ev_actions    = llc_common_actions_11c,
+	.next_state    = LLC_CONN_STATE_RESET,
+};
+
+/*
+ * LLC_CONN_EV_BUSY_TMR_EXP, guarded by llc_conn_ev_qlfy_retry_cnt_gte_n2:
+ * next state LLC_CONN_STATE_RESET.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11d[] = {
+	llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_common_actions_11d[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_stop_other_timers,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_cause_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_common_state_trans_11d = {
+	.ev	       = llc_conn_ev_busy_tmr_exp,
+	.ev_qualifiers = llc_common_ev_qfyrs_11d,
+	.ev_actions    = llc_common_actions_11d,
+	.next_state    = LLC_CONN_STATE_RESET,
+};
+
+/*
+ * Common dummy state transition; must be the last entry of every state
+ * transition group.  It has static storage and no initializer, so it lands
+ * in .bss and all of its fields (.ev, .ev_qualifiers, .ev_actions and
+ * .next_state) are zero/NULL.
+ */
+static struct llc_conn_state_trans llc_common_state_trans_end;
+
+/* LLC_CONN_STATE_ADM transitions */
+/*
+ * LLC_CONN_EV_CONN_REQ: no qualifiers, next state LLC_CONN_STATE_SETUP.
+ * The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_adm_actions_1[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_s_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_1 = {
+	.ev	       = llc_conn_ev_conn_req,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_adm_actions_1,
+	.next_state    = LLC_CONN_STATE_SETUP,
+};
+
+/*
+ * LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X: no qualifiers, next state
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_adm_actions_2[] = {
+	llc_conn_ac_send_ua_rsp_f_set_p,
+	llc_conn_ac_set_vs_0,
+	llc_conn_ac_set_vr_0,
+	llc_conn_ac_set_retry_cnt_0,
+	llc_conn_ac_set_p_flag_0,
+	llc_conn_ac_set_remote_busy_0,
+	llc_conn_ac_conn_ind,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_2 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_adm_actions_2,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X: no qualifiers, stays in
+ * LLC_CONN_STATE_ADM.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_adm_actions_3[] = {
+	llc_conn_ac_send_dm_rsp_f_set_p,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_3 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_adm_actions_3,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_1: no qualifiers, stays in
+ * LLC_CONN_STATE_ADM.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_adm_actions_4[] = {
+	llc_conn_ac_send_dm_rsp_f_set_1,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_4 = {
+	.ev	       = llc_conn_ev_rx_xxx_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_adm_actions_4,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * LLC_CONN_EV_RX_XXX_YYY (matched by llc_conn_ev_rx_any_frame): no
+ * qualifiers, next state LLC_CONN_OUT_OF_SVC.  The action list is
+ * NULL-terminated.
+ */
+static llc_conn_action_t llc_adm_actions_5[] = {
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_adm_state_trans_5 = {
+	.ev	       = llc_conn_ev_rx_any_frame,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_adm_actions_5,
+	.next_state    = LLC_CONN_OUT_OF_SVC,
+};
+
+/*
+ * ADM state: one pointer per transition, laid out in groups by event
+ * class.  Every group is closed by &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_adm_state_transitions[] = {
+	/* Request */
+	&llc_adm_state_trans_1,
+	&llc_common_state_trans_end,
+	/* local_busy */
+	&llc_common_state_trans_end,
+	/* init_pf_cycle */
+	&llc_common_state_trans_end,
+	/* timer */
+	&llc_common_state_trans_end,
+	/* Receive frame */
+	&llc_adm_state_trans_2,
+	&llc_adm_state_trans_3,
+	&llc_adm_state_trans_4,
+	&llc_adm_state_trans_5,
+	&llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_SETUP transitions */
+/*
+ * LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X: no qualifiers, stays in
+ * LLC_CONN_STATE_SETUP.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_setup_actions_1[] = {
+	llc_conn_ac_send_ua_rsp_f_set_p,
+	llc_conn_ac_set_vs_0,
+	llc_conn_ac_set_vr_0,
+	llc_conn_ac_set_s_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_1 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_setup_actions_1,
+	.next_state    = LLC_CONN_STATE_SETUP,
+};
+
+/*
+ * LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X, guarded by llc_conn_ev_qlfy_p_flag_eq_f
+ * followed by llc_conn_ev_qlfy_set_status_conn: next state
+ * LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_2[] = {
+	llc_conn_ev_qlfy_p_flag_eq_f,
+	llc_conn_ev_qlfy_set_status_conn,
+	NULL,
+};
+
+static llc_conn_action_t llc_setup_actions_2[] = {
+	llc_conn_ac_stop_ack_timer,
+	llc_conn_ac_set_vs_0,
+	llc_conn_ac_set_vr_0,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_set_remote_busy_0,
+	llc_conn_ac_conn_confirm,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_2 = {
+	.ev	       = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+	.ev_qualifiers = llc_setup_ev_qfyrs_2,
+	.ev_actions    = llc_setup_actions_2,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_ACK_TMR_EXP, guarded by llc_conn_ev_qlfy_s_flag_eq_1
+ * followed by llc_conn_ev_qlfy_set_status_conn: next state
+ * LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_3[] = {
+	llc_conn_ev_qlfy_s_flag_eq_1,
+	llc_conn_ev_qlfy_set_status_conn,
+	NULL,
+};
+
+static llc_conn_action_t llc_setup_actions_3[] = {
+	llc_conn_ac_set_p_flag_0,
+	llc_conn_ac_set_remote_busy_0,
+	llc_conn_ac_conn_confirm,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_3 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.ev_qualifiers = llc_setup_ev_qfyrs_3,
+	.ev_actions    = llc_setup_actions_3,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X, with the single qualifier
+ * llc_conn_ev_qlfy_set_status_disc: next state LLC_CONN_STATE_ADM.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_4[] = {
+	llc_conn_ev_qlfy_set_status_disc,
+	NULL,
+};
+
+static llc_conn_action_t llc_setup_actions_4[] = {
+	llc_conn_ac_send_dm_rsp_f_set_p,
+	llc_conn_ac_stop_ack_timer,
+	llc_conn_ac_conn_confirm,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_4 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.ev_qualifiers = llc_setup_ev_qfyrs_4,
+	.ev_actions    = llc_setup_actions_4,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X, with the single qualifier
+ * llc_conn_ev_qlfy_set_status_disc: next state LLC_CONN_STATE_ADM.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_5[] = {
+	llc_conn_ev_qlfy_set_status_disc,
+	NULL,
+};
+
+static llc_conn_action_t llc_setup_actions_5[] = {
+	llc_conn_ac_stop_ack_timer,
+	llc_conn_ac_conn_confirm,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_5 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.ev_qualifiers = llc_setup_ev_qfyrs_5,
+	.ev_actions    = llc_setup_actions_5,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * LLC_CONN_EV_ACK_TMR_EXP, guarded by llc_conn_ev_qlfy_retry_cnt_lt_n2
+ * and llc_conn_ev_qlfy_s_flag_eq_0: stays in LLC_CONN_STATE_SETUP.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_7[] = {
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	llc_conn_ev_qlfy_s_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_setup_actions_7[] = {
+	llc_conn_ac_send_sabme_cmd_p_set_x,
+	llc_conn_ac_start_ack_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_7 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.ev_qualifiers = llc_setup_ev_qfyrs_7,
+	.ev_actions    = llc_setup_actions_7,
+	.next_state    = LLC_CONN_STATE_SETUP,
+};
+
+/*
+ * LLC_CONN_EV_ACK_TMR_EXP, guarded by llc_conn_ev_qlfy_retry_cnt_gte_n2
+ * and llc_conn_ev_qlfy_s_flag_eq_0, then llc_conn_ev_qlfy_set_status_failed:
+ * next state LLC_CONN_STATE_ADM.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_8[] = {
+	llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	llc_conn_ev_qlfy_s_flag_eq_0,
+	llc_conn_ev_qlfy_set_status_failed,
+	NULL,
+};
+
+static llc_conn_action_t llc_setup_actions_8[] = {
+	llc_conn_ac_conn_confirm,
+	llc_conn_disc,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_setup_state_trans_8 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.ev_qualifiers = llc_setup_ev_qfyrs_8,
+	.ev_actions    = llc_setup_actions_8,
+	.next_state    = LLC_CONN_STATE_ADM,
+};
+
+/*
+ * SETUP state: one pointer per transition, laid out in groups by event
+ * class.  Every group is closed by &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_setup_state_transitions[] = {
+	/* Request */
+	&llc_common_state_trans_end,
+	/* local busy */
+	&llc_common_state_trans_end,
+	/* init_pf_cycle */
+	&llc_common_state_trans_end,
+	/* Timer */
+	&llc_setup_state_trans_3,
+	&llc_setup_state_trans_7,
+	&llc_setup_state_trans_8,
+	&llc_common_state_trans_end,
+	/* Receive frame */
+	&llc_setup_state_trans_1,
+	&llc_setup_state_trans_2,
+	&llc_setup_state_trans_4,
+	&llc_setup_state_trans_5,
+	&llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_NORMAL transitions */
+/*
+ * LLC_CONN_EV_DATA_REQ, guarded by llc_conn_ev_qlfy_remote_busy_eq_0,
+ * llc_conn_ev_qlfy_p_flag_eq_0 and llc_conn_ev_qlfy_last_frame_eq_0:
+ * stays in LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_1[] = {
+	llc_conn_ev_qlfy_remote_busy_eq_0,
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_last_frame_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_1[] = {
+	llc_conn_ac_send_i_as_ack,
+	llc_conn_ac_start_ack_tmr_if_not_running,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_1 = {
+	.ev	       = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_normal_ev_qfyrs_1,
+	.ev_actions    = llc_normal_actions_1,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_DATA_REQ, guarded by llc_conn_ev_qlfy_remote_busy_eq_0,
+ * llc_conn_ev_qlfy_p_flag_eq_0 and llc_conn_ev_qlfy_last_frame_eq_1:
+ * stays in LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2[] = {
+	llc_conn_ev_qlfy_remote_busy_eq_0,
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_last_frame_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_2[] = {
+	llc_conn_ac_send_i_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_2 = {
+	.ev	       = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_normal_ev_qfyrs_2,
+	.ev_actions    = llc_normal_actions_2,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_DATA_REQ, guarded by llc_conn_ev_qlfy_remote_busy_eq_1
+ * followed by llc_conn_ev_qlfy_set_status_remote_busy: stays in
+ * LLC_CONN_STATE_NORMAL and has no actions.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2_1[] = {
+	llc_conn_ev_qlfy_remote_busy_eq_1,
+	llc_conn_ev_qlfy_set_status_remote_busy,
+	NULL,
+};
+
+/* empty action list: a single NULL terminator */
+static llc_conn_action_t llc_normal_actions_2_1[] = { NULL };
+
+static struct llc_conn_state_trans llc_normal_state_trans_2_1 = {
+	.ev	       = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_normal_ev_qfyrs_2_1,
+	.ev_actions    = llc_normal_actions_2_1,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_LOCAL_BUSY_DETECTED, guarded by llc_conn_ev_qlfy_p_flag_eq_0:
+ * next state LLC_CONN_STATE_BUSY.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_3[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_3[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rnr_xxx_x_set_0,
+	llc_conn_ac_set_data_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_3 = {
+	.ev	       = llc_conn_ev_local_busy_detected,
+	.ev_qualifiers = llc_normal_ev_qfyrs_3,
+	.ev_actions    = llc_normal_actions_3,
+	.next_state    = LLC_CONN_STATE_BUSY,
+};
+
+/*
+ * LLC_CONN_EV_LOCAL_BUSY_DETECTED, guarded by llc_conn_ev_qlfy_p_flag_eq_1:
+ * next state LLC_CONN_STATE_BUSY.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_4[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_4[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rnr_xxx_x_set_0,
+	llc_conn_ac_set_data_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_4 = {
+	.ev	       = llc_conn_ev_local_busy_detected,
+	.ev_qualifiers = llc_normal_ev_qfyrs_4,
+	.ev_actions    = llc_normal_actions_4,
+	.next_state    = LLC_CONN_STATE_BUSY,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns, guarded by
+ * llc_conn_ev_qlfy_p_flag_eq_0: next state LLC_CONN_STATE_REJ.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_5a[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_5a = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.ev_qualifiers = llc_normal_ev_qfyrs_5a,
+	.ev_actions    = llc_normal_actions_5a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns, guarded by
+ * llc_conn_ev_qlfy_p_flag_eq_0: next state LLC_CONN_STATE_REJ.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_5b[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_5b = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.ev_qualifiers = llc_normal_ev_qfyrs_5b,
+	.ev_actions    = llc_normal_actions_5b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns, guarded by
+ * llc_conn_ev_qlfy_p_flag_eq_1: next state LLC_CONN_STATE_REJ.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5c[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_5c[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_5c = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+	.ev_qualifiers = llc_normal_ev_qfyrs_5c,
+	.ev_actions    = llc_normal_actions_5c,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns, guarded by
+ * llc_conn_ev_qlfy_p_flag_eq_1: next state LLC_CONN_STATE_REJ.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_6a[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_start_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_6a = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.ev_qualifiers = llc_normal_ev_qfyrs_6a,
+	.ev_actions    = llc_normal_actions_6a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns, guarded by
+ * llc_conn_ev_qlfy_p_flag_eq_1: next state LLC_CONN_STATE_REJ.
+ * Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_6b[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_start_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_6b = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.ev_qualifiers = llc_normal_ev_qfyrs_6b,
+	.ev_actions    = llc_normal_actions_6b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns: no qualifiers, next state
+ * LLC_CONN_STATE_REJ.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_normal_actions_7[] = {
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_send_rej_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_start_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_7 = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_7,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_RSP_Fbit_SET_X, guarded by llc_conn_ev_qlfy_p_flag_eq_f:
+ * stays in LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ * llc_normal_actions_8 is also referenced by llc_normal_state_trans_8b.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_f,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_8[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	llc_conn_ac_send_ack_if_needed,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_8a = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_x,
+	.ev_qualifiers = llc_normal_ev_qfyrs_8a,
+	.ev_actions    = llc_normal_actions_8,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_0, guarded by llc_conn_ev_qlfy_p_flag_eq_0:
+ * stays in LLC_CONN_STATE_NORMAL.  Reuses the llc_normal_actions_8 list.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_8b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = llc_normal_ev_qfyrs_8b,
+	.ev_actions    = llc_normal_actions_8,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_RSP_Fbit_SET_0, guarded by llc_conn_ev_qlfy_p_flag_eq_1:
+ * stays in LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_9a[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_ack_if_needed,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_9a = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_0,
+	.ev_qualifiers = llc_normal_ev_qfyrs_9a,
+	.ev_actions    = llc_normal_actions_9a,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_0, guarded by llc_conn_ev_qlfy_p_flag_eq_1:
+ * stays in LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_9b[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_ack_if_needed,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_9b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = llc_normal_ev_qfyrs_9b,
+	.ev_actions    = llc_normal_actions_9b,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_I_CMD_Pbit_SET_1: no qualifiers, stays in
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_normal_actions_10[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_send_ack_rsp_f_set_1,
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_data_ind,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_10 = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_10,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0: no qualifiers, stays in
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_normal_actions_11a[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_11a = {
+	.ev	       = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_11a,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0: no qualifiers, stays in
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_normal_actions_11b[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_11b = {
+	.ev	       = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_11b,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1, guarded by llc_conn_ev_qlfy_p_flag_eq_1:
+ * stays in LLC_CONN_STATE_NORMAL.  Both lists are NULL-terminated.
+ */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_11c[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_11c[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_inc_tx_win_size,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_11c = {
+	.ev	       = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+	.ev_qualifiers = llc_normal_ev_qfyrs_11c,
+	.ev_actions    = llc_normal_actions_11c,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1: no qualifiers, stays in
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_normal_actions_12[] = {
+	llc_conn_ac_send_ack_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_adjust_npta_by_rr,
+	llc_conn_ac_rst_sendack_flag,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_12 = {
+	.ev	       = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_12,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/*
+ * LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0: no qualifiers, stays in
+ * LLC_CONN_STATE_NORMAL.  The action list is NULL-terminated.
+ */
+static llc_conn_action_t llc_normal_actions_13a[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_13a = {
+	.ev	       = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_13a,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0, qualifiers NONE -> NORMAL */
+static llc_conn_action_t llc_normal_actions_13b[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_13b = {
+	.ev	       = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_13b,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1, qualified by p_flag_eq_1 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_13c[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_13c[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_13c = {
+	.ev	       = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_13c,
+	.ev_actions    = llc_normal_actions_13c,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1, qualifiers NONE -> NORMAL */
+static llc_conn_action_t llc_normal_actions_14[] = {
+	[0] = llc_conn_ac_send_rr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_adjust_npta_by_rnr,
+	[3] = llc_conn_ac_rst_sendack_flag,
+	[4] = llc_conn_ac_set_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_14 = {
+	.ev	       = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_14,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, qualified by p_flag_eq_0 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_15a[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_p_flag,
+	[3] = llc_conn_ac_dec_tx_win_size,
+	[4] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[5] = llc_conn_ac_clear_remote_busy,
+	[6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_15a = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_15a,
+	.ev_actions    = llc_normal_actions_15a,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X, qualified by p_flag_eq_f -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_15b[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_p_flag,
+	[3] = llc_conn_ac_dec_tx_win_size,
+	[4] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[5] = llc_conn_ac_clear_remote_busy,
+	[6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_15b = {
+	.ev	       = llc_conn_ev_rx_rej_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_15b,
+	.ev_actions    = llc_normal_actions_15b,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, qualified by p_flag_eq_1 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_16a[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_dec_tx_win_size,
+	[3] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[4] = llc_conn_ac_clear_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_16a = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_16a,
+	.ev_actions    = llc_normal_actions_16a,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0, qualified by p_flag_eq_1 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_16b[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_dec_tx_win_size,
+	[3] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[4] = llc_conn_ac_clear_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_16b = {
+	.ev	       = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_16b,
+	.ev_actions    = llc_normal_actions_16b,
+};
+
+/* NORMAL: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1, qualifiers NONE -> NORMAL */
+static llc_conn_action_t llc_normal_actions_17[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_dec_tx_win_size,
+	[3] = llc_conn_ac_resend_i_rsp_f_set_1,
+	[4] = llc_conn_ac_clear_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_17 = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_normal_actions_17,
+};
+
+/* NORMAL: LLC_CONN_EV_INIT_P_F_CYCLE, qualified by p_flag_eq_0 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_18[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_18[] = {
+	[0] = llc_conn_ac_send_rr_cmd_p_set_1,
+	[1] = llc_conn_ac_start_p_timer,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_18 = {
+	.ev	       = llc_conn_ev_init_p_f_cycle,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_18,
+	.ev_actions    = llc_normal_actions_18,
+};
+
+/* NORMAL: LLC_CONN_EV_P_TMR_EXP, qualified by retry_cnt_lt_n2 -> AWAIT */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_19[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_19[] = {
+	[0] = llc_conn_ac_rst_sendack_flag,
+	[1] = llc_conn_ac_send_rr_cmd_p_set_1,
+	[2] = llc_conn_ac_rst_vs,
+	[3] = llc_conn_ac_start_p_timer,
+	[4] = llc_conn_ac_inc_retry_cnt_by_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_19 = {
+	.ev	       = llc_conn_ev_p_tmr_exp,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+	.ev_qualifiers = llc_normal_ev_qfyrs_19,
+	.ev_actions    = llc_normal_actions_19,
+};
+
+/* NORMAL: LLC_CONN_EV_ACK_TMR_EXP, qualified by p_flag_eq_0 + retry_cnt_lt_n2 -> AWAIT */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_20a[] = {
+	[0] = llc_conn_ac_rst_sendack_flag,
+	[1] = llc_conn_ac_send_rr_cmd_p_set_1,
+	[2] = llc_conn_ac_rst_vs,
+	[3] = llc_conn_ac_start_p_timer,
+	[4] = llc_conn_ac_inc_retry_cnt_by_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_20a = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+	.ev_qualifiers = llc_normal_ev_qfyrs_20a,
+	.ev_actions    = llc_normal_actions_20a,
+};
+
+/* NORMAL: LLC_CONN_EV_BUSY_TMR_EXP, qualified by p_flag_eq_0 + retry_cnt_lt_n2 -> AWAIT */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_20b[] = {
+	[0] = llc_conn_ac_rst_sendack_flag,
+	[1] = llc_conn_ac_send_rr_cmd_p_set_1,
+	[2] = llc_conn_ac_rst_vs,
+	[3] = llc_conn_ac_start_p_timer,
+	[4] = llc_conn_ac_inc_retry_cnt_by_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_20b = {
+	.ev	       = llc_conn_ev_busy_tmr_exp,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+	.ev_qualifiers = llc_normal_ev_qfyrs_20b,
+	.ev_actions    = llc_normal_actions_20b,
+};
+
+/* NORMAL: LLC_CONN_EV_TX_BUFF_FULL, qualified by p_flag_eq_0 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_21[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_normal_actions_21[] = {
+	[0] = llc_conn_ac_send_rr_cmd_p_set_1,
+	[1] = llc_conn_ac_start_p_timer,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_normal_state_trans_21 = {
+	.ev	       = llc_conn_ev_tx_buffer_full,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_normal_ev_qfyrs_21,
+	.ev_actions    = llc_normal_actions_21,
+};
+
+/*
+ * NORMAL-state table: pointers to the llc_normal_* and llc_common_* transitions,
+ * grouped by event class; each group closes with &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_normal_state_transitions[] = {
+	 [0] = &llc_normal_state_trans_1,	/* Requests */
+	 [1] = &llc_normal_state_trans_2,
+	 [2] = &llc_normal_state_trans_2_1,
+	 [3] = &llc_common_state_trans_1,
+	 [4] = &llc_common_state_trans_2,
+	 [5] = &llc_common_state_trans_end,
+	 [6] = &llc_normal_state_trans_21,
+	 [7] = &llc_normal_state_trans_3,	/* Local busy */
+	 [8] = &llc_normal_state_trans_4,
+	 [9] = &llc_common_state_trans_end,
+	[10] = &llc_normal_state_trans_18,	/* Init pf cycle */
+	[11] = &llc_common_state_trans_end,
+	[12] = &llc_common_state_trans_11a,	/* Timers */
+	[13] = &llc_common_state_trans_11b,
+	[14] = &llc_common_state_trans_11c,
+	[15] = &llc_common_state_trans_11d,
+	[16] = &llc_normal_state_trans_19,
+	[17] = &llc_normal_state_trans_20a,
+	[18] = &llc_normal_state_trans_20b,
+	[19] = &llc_common_state_trans_end,
+	[20] = &llc_normal_state_trans_8b,	/* Receive frames */
+	[21] = &llc_normal_state_trans_9b,
+	[22] = &llc_normal_state_trans_10,
+	[23] = &llc_normal_state_trans_11b,
+	[24] = &llc_normal_state_trans_11c,
+	[25] = &llc_normal_state_trans_5a,
+	[26] = &llc_normal_state_trans_5b,
+	[27] = &llc_normal_state_trans_5c,
+	[28] = &llc_normal_state_trans_6a,
+	[29] = &llc_normal_state_trans_6b,
+	[30] = &llc_normal_state_trans_7,
+	[31] = &llc_normal_state_trans_8a,
+	[32] = &llc_normal_state_trans_9a,
+	[33] = &llc_normal_state_trans_11a,
+	[34] = &llc_normal_state_trans_12,
+	[35] = &llc_normal_state_trans_13a,
+	[36] = &llc_normal_state_trans_13b,
+	[37] = &llc_normal_state_trans_13c,
+	[38] = &llc_normal_state_trans_14,
+	[39] = &llc_normal_state_trans_15a,
+	[40] = &llc_normal_state_trans_15b,
+	[41] = &llc_normal_state_trans_16a,
+	[42] = &llc_normal_state_trans_16b,
+	[43] = &llc_normal_state_trans_17,
+	[44] = &llc_common_state_trans_3,
+	[45] = &llc_common_state_trans_4,
+	[46] = &llc_common_state_trans_5,
+	[47] = &llc_common_state_trans_6,
+	[48] = &llc_common_state_trans_7a,
+	[49] = &llc_common_state_trans_7b,
+	[50] = &llc_common_state_trans_8a,
+	[51] = &llc_common_state_trans_8b,
+	[52] = &llc_common_state_trans_8c,
+	[53] = &llc_common_state_trans_9,
+	/* [54] = &llc_common_state_trans_10, */
+	[54] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_BUSY transitions */
+/* BUSY: LLC_CONN_EV_DATA_REQ, qualified by remote_busy_eq_0 + p_flag_eq_0 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_1[] = {
+	[0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_1[] = {
+	[0] = llc_conn_ac_send_i_xxx_x_set_0,
+	[1] = llc_conn_ac_start_ack_tmr_if_not_running,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_1 = {
+	.ev	       = llc_conn_ev_data_req,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_1,
+	.ev_actions    = llc_busy_actions_1,
+};
+
+/* BUSY: LLC_CONN_EV_DATA_REQ, qualified by remote_busy_eq_0 + p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2[] = {
+	[0] = llc_conn_ev_qlfy_remote_busy_eq_0,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_2[] = {
+	[0] = llc_conn_ac_send_i_xxx_x_set_0,
+	[1] = llc_conn_ac_start_ack_tmr_if_not_running,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_2 = {
+	.ev	       = llc_conn_ev_data_req,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_2,
+	.ev_actions    = llc_busy_actions_2,
+};
+
+/* BUSY: LLC_CONN_EV_DATA_REQ, qualified by remote_busy_eq_1 + set_status_remote_busy -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2_1[] = {
+	[0] = llc_conn_ev_qlfy_remote_busy_eq_1,
+	[1] = llc_conn_ev_qlfy_set_status_remote_busy,
+	[2] = NULL,
+};
+
+/* action list with a single entry; it is NULL because static storage is zeroed */
+static llc_conn_action_t llc_busy_actions_2_1[1];
+
+static struct llc_conn_state_trans llc_busy_state_trans_2_1 = {
+	.ev	       = llc_conn_ev_data_req,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_2_1,
+	.ev_actions    = llc_busy_actions_2_1,
+};
+
+/* BUSY: LLC_CONN_EV_LOCAL_BUSY_CLEARED, qualified by data_flag_eq_1 + p_flag_eq_0 -> REJ */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_3[] = {
+	[0] = llc_conn_ev_qlfy_data_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_3[] = {
+	[0] = llc_conn_ac_send_rej_xxx_x_set_0,
+	[1] = llc_conn_ac_start_rej_timer,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_3 = {
+	.ev	       = llc_conn_ev_local_busy_cleared,
+	.next_state    = LLC_CONN_STATE_REJ,
+	.ev_qualifiers = llc_busy_ev_qfyrs_3,
+	.ev_actions    = llc_busy_actions_3,
+};
+
+/* BUSY: LLC_CONN_EV_LOCAL_BUSY_CLEARED, qualified by data_flag_eq_1 + p_flag_eq_1 -> REJ */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_4[] = {
+	[0] = llc_conn_ev_qlfy_data_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_4[] = {
+	[0] = llc_conn_ac_send_rej_xxx_x_set_0,
+	[1] = llc_conn_ac_start_rej_timer,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_4 = {
+	.ev	       = llc_conn_ev_local_busy_cleared,
+	.next_state    = LLC_CONN_STATE_REJ,
+	.ev_qualifiers = llc_busy_ev_qfyrs_4,
+	.ev_actions    = llc_busy_actions_4,
+};
+
+/* BUSY: LLC_CONN_EV_LOCAL_BUSY_CLEARED, qualified by data_flag_eq_0 + p_flag_eq_0 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_5[] = {
+	[0] = llc_conn_ev_qlfy_data_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_5[] = {
+	[0] = llc_conn_ac_send_rr_xxx_x_set_0,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_5 = {
+	.ev	       = llc_conn_ev_local_busy_cleared,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_busy_ev_qfyrs_5,
+	.ev_actions    = llc_busy_actions_5,
+};
+
+/* BUSY: LLC_CONN_EV_LOCAL_BUSY_CLEARED, qualified by data_flag_eq_0 + p_flag_eq_1 -> NORMAL */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_6[] = {
+	[0] = llc_conn_ev_qlfy_data_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_6[] = {
+	[0] = llc_conn_ac_send_rr_xxx_x_set_0,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_6 = {
+	.ev	       = llc_conn_ev_local_busy_cleared,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_busy_ev_qfyrs_6,
+	.ev_actions    = llc_busy_actions_6,
+};
+
+/* BUSY: LLC_CONN_EV_LOCAL_BUSY_CLEARED, qualified by data_flag_eq_2 + p_flag_eq_0 -> REJ */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_7[] = {
+	[0] = llc_conn_ev_qlfy_data_flag_eq_2,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_7[] = {
+	[0] = llc_conn_ac_send_rr_xxx_x_set_0,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_7 = {
+	.ev	       = llc_conn_ev_local_busy_cleared,
+	.next_state    = LLC_CONN_STATE_REJ,
+	.ev_qualifiers = llc_busy_ev_qfyrs_7,
+	.ev_actions    = llc_busy_actions_7,
+};
+
+/* BUSY: LLC_CONN_EV_LOCAL_BUSY_CLEARED, qualified by data_flag_eq_2 + p_flag_eq_1 -> REJ */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_8[] = {
+	[0] = llc_conn_ev_qlfy_data_flag_eq_2,
+	[1] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_8[] = {
+	[0] = llc_conn_ac_send_rr_xxx_x_set_0,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_8 = {
+	.ev	       = llc_conn_ev_local_busy_cleared,
+	.next_state    = LLC_CONN_STATE_REJ,
+	.ev_qualifiers = llc_busy_ev_qfyrs_8,
+	.ev_actions    = llc_busy_actions_8,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_UNEXPD_Ns, qualified by p_flag_eq_f -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_9a[] = {
+	[0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[1] = llc_conn_ac_upd_p_flag,
+	[2] = llc_conn_ac_upd_nr_received,
+	[3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+	[4] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_9a = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_9a,
+	.ev_actions    = llc_busy_actions_9a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns, qualified by p_flag_eq_0 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_9b[] = {
+	[0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[1] = llc_conn_ac_upd_p_flag,
+	[2] = llc_conn_ac_upd_nr_received,
+	[3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+	[4] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_9b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_9b,
+	.ev_actions    = llc_busy_actions_9b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_10a[] = {
+	[0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_10a = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_10a,
+	.ev_actions    = llc_busy_actions_10a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_10b[] = {
+	[0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_10b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_10b,
+	.ev_actions    = llc_busy_actions_10b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_11[] = {
+	[0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_11 = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_11,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_CMD_Pbit_SET_1, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_12[] = {
+	[0] = llc_conn_ac_inc_vr_by_1,
+	[1] = llc_conn_ac_data_ind,
+	[2] = llc_conn_ac_send_rnr_rsp_f_set_1,
+	[3] = llc_conn_ac_upd_nr_received,
+	[4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+	[5] = llc_conn_ac_set_data_flag_0,
+	[6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_12 = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_12,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_RSP_Fbit_SET_X, qualified by p_flag_eq_f -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_13a[] = {
+	[0] = llc_conn_ac_inc_vr_by_1,
+	[1] = llc_conn_ac_data_ind,
+	[2] = llc_conn_ac_upd_p_flag,
+	[3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[4] = llc_conn_ac_upd_nr_received,
+	[5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+	[6] = llc_conn_ac_set_data_flag_0,
+	[7] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	[8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_13a = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_13a,
+	.ev_actions    = llc_busy_actions_13a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0, qualified by p_flag_eq_0 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_13b[] = {
+	[0] = llc_conn_ac_inc_vr_by_1,
+	[1] = llc_conn_ac_data_ind,
+	[2] = llc_conn_ac_upd_p_flag,
+	[3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[4] = llc_conn_ac_upd_nr_received,
+	[5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+	[6] = llc_conn_ac_set_data_flag_0,
+	[7] = llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	[8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_13b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_13b,
+	.ev_actions    = llc_busy_actions_13b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_RSP_Fbit_SET_0, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_14a[] = {
+	[0] = llc_conn_ac_inc_vr_by_1,
+	[1] = llc_conn_ac_data_ind,
+	[2] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[3] = llc_conn_ac_upd_nr_received,
+	[4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+	[5] = llc_conn_ac_set_data_flag_0,
+	[6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_14a = {
+	.ev	       = llc_conn_ev_rx_i_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_14a,
+	.ev_actions    = llc_busy_actions_14a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_14b[] = {
+	[0] = llc_conn_ac_inc_vr_by_1,
+	[1] = llc_conn_ac_data_ind,
+	[2] = llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	[3] = llc_conn_ac_upd_nr_received,
+	[4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2,
+	[5] = llc_conn_ac_set_data_flag_0,
+	[6] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_14b = {
+	.ev	       = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_14b,
+	.ev_actions    = llc_busy_actions_14b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_15a[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_clear_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_15a = {
+	.ev	       = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_15a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_15b[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_clear_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_15b = {
+	.ev	       = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_15b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_15c[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_15c[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_clear_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_15c = {
+	.ev	       = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_15c,
+	.ev_actions    = llc_busy_actions_15c,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_16[] = {
+	[0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_clear_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_16 = {
+	.ev	       = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_16,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_17a[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_17a = {
+	.ev	       = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_17a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_17b[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_17b = {
+	.ev	       = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_17b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_17c[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_17c[] = {
+	[0] = llc_conn_ac_upd_p_flag,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_17c = {
+	.ev	       = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_17c,
+	.ev_actions    = llc_busy_actions_17c,
+};
+
+/* BUSY: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_18[] = {
+	[0] = llc_conn_ac_send_rnr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_18 = {
+	.ev	       = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_18,
+};
+
+/* BUSY: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, qualified by p_flag_eq_0 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_19a[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_p_flag,
+	[3] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[4] = llc_conn_ac_clear_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_19a = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_19a,
+	.ev_actions    = llc_busy_actions_19a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X, qualified by p_flag_eq_f -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_19b[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_p_flag,
+	[3] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[4] = llc_conn_ac_clear_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_19b = {
+	.ev	       = llc_conn_ev_rx_rej_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_19b,
+	.ev_actions    = llc_busy_actions_19b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_20a[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[3] = llc_conn_ac_clear_remote_busy,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_20a = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_20a,
+	.ev_actions    = llc_busy_actions_20a,
+};
+
+/* BUSY: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0, qualified by p_flag_eq_1 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20b[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_1,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_20b[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[3] = llc_conn_ac_clear_remote_busy,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_20b = {
+	.ev	       = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_20b,
+	.ev_actions    = llc_busy_actions_20b,
+};
+
+/* BUSY: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1, qualifiers NONE -> BUSY */
+static llc_conn_action_t llc_busy_actions_21[] = {
+	[0] = llc_conn_ac_set_vs_nr,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_send_rnr_rsp_f_set_1,
+	[3] = llc_conn_ac_resend_i_xxx_x_set_0,
+	[4] = llc_conn_ac_clear_remote_busy,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_21 = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_busy_actions_21,
+};
+
+/* BUSY: LLC_CONN_EV_INIT_P_F_CYCLE, qualified by p_flag_eq_0 -> BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_22[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_22[] = {
+	[0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+	[1] = llc_conn_ac_start_p_timer,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_22 = {
+	.ev	       = llc_conn_ev_init_p_f_cycle,
+	.next_state    = LLC_CONN_STATE_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_22,
+	.ev_actions    = llc_busy_actions_22,
+};
+
+/* BUSY: LLC_CONN_EV_P_TMR_EXP, qualified by retry_cnt_lt_n2 -> AWAIT_BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_23[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_23[] = {
+	[0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+	[1] = llc_conn_ac_rst_vs,
+	[2] = llc_conn_ac_start_p_timer,
+	[3] = llc_conn_ac_inc_retry_cnt_by_1,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_23 = {
+	.ev	       = llc_conn_ev_p_tmr_exp,
+	.next_state    = LLC_CONN_STATE_AWAIT_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_23,
+	.ev_actions    = llc_busy_actions_23,
+};
+
+/* BUSY: LLC_CONN_EV_ACK_TMR_EXP, qualified by p_flag_eq_0 + retry_cnt_lt_n2 -> AWAIT_BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24a[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_24a[] = {
+	[0] = llc_conn_ac_send_rnr_cmd_p_set_1,
+	[1] = llc_conn_ac_start_p_timer,
+	[2] = llc_conn_ac_inc_retry_cnt_by_1,
+	[3] = llc_conn_ac_rst_vs,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_24a = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_AWAIT_BUSY,
+	.ev_qualifiers = llc_busy_ev_qfyrs_24a,
+	.ev_actions    = llc_busy_actions_24a,
+};
+
+/* BUSY state, LLC_CONN_EV_BUSY_TMR_EXP event: next state AWAIT_BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_24b[] = {
+	llc_conn_ac_send_rnr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	llc_conn_ac_rst_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_24b = {
+	.ev            = llc_conn_ev_busy_tmr_exp,
+	.ev_qualifiers = llc_busy_ev_qfyrs_24b,
+	.ev_actions    = llc_busy_actions_24b,
+	.next_state    = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* BUSY state, LLC_CONN_EV_REJ_TMR_EXP event (qfyrs_25): next state AWAIT_BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_25[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_25[] = {
+	llc_conn_ac_send_rnr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	llc_conn_ac_rst_vs,
+	llc_conn_ac_set_data_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_25 = {
+	.ev            = llc_conn_ev_rej_tmr_exp,
+	.ev_qualifiers = llc_busy_ev_qfyrs_25,
+	.ev_actions    = llc_busy_actions_25,
+	.next_state    = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* BUSY state, LLC_CONN_EV_REJ_TMR_EXP event (qfyrs_26): next state BUSY */
+static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_26[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_busy_actions_26[] = {
+	llc_conn_ac_set_data_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_busy_state_trans_26 = {
+	.ev            = llc_conn_ev_rej_tmr_exp,
+	.ev_qualifiers = llc_busy_ev_qfyrs_26,
+	.ev_actions    = llc_busy_actions_26,
+	.next_state    = LLC_CONN_STATE_BUSY,
+};
+
+/*
+ * BUSY-state transition table: one pointer per transition, arranged in
+ * labelled groups that each finish with &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_busy_state_transitions[] = {
+	 [0] = &llc_common_state_trans_1,	/* Request */
+	 [1] = &llc_common_state_trans_2,
+	 [2] = &llc_busy_state_trans_1,
+	 [3] = &llc_busy_state_trans_2,
+	 [4] = &llc_busy_state_trans_2_1,
+	 [5] = &llc_common_state_trans_end,	/* end of Request group */
+	 [6] = &llc_busy_state_trans_3,		/* Local busy */
+	 [7] = &llc_busy_state_trans_4,
+	 [8] = &llc_busy_state_trans_5,
+	 [9] = &llc_busy_state_trans_6,
+	[10] = &llc_busy_state_trans_7,
+	[11] = &llc_busy_state_trans_8,
+	[12] = &llc_common_state_trans_end,	/* end of Local busy group */
+	[13] = &llc_busy_state_trans_22,	/* Initiate PF cycle */
+	[14] = &llc_common_state_trans_end,	/* end of Initiate PF cycle group */
+	[15] = &llc_common_state_trans_11a,	/* Timer */
+	[16] = &llc_common_state_trans_11b,
+	[17] = &llc_common_state_trans_11c,
+	[18] = &llc_common_state_trans_11d,
+	[19] = &llc_busy_state_trans_23,
+	[20] = &llc_busy_state_trans_24a,
+	[21] = &llc_busy_state_trans_24b,
+	[22] = &llc_busy_state_trans_25,
+	[23] = &llc_busy_state_trans_26,
+	[24] = &llc_common_state_trans_end,	/* end of Timer group */
+	[25] = &llc_busy_state_trans_9a,	/* Receive frame */
+	[26] = &llc_busy_state_trans_9b,
+	[27] = &llc_busy_state_trans_10a,
+	[28] = &llc_busy_state_trans_10b,
+	[29] = &llc_busy_state_trans_11,
+	[30] = &llc_busy_state_trans_12,
+	[31] = &llc_busy_state_trans_13a,
+	[32] = &llc_busy_state_trans_13b,
+	[33] = &llc_busy_state_trans_14a,
+	[34] = &llc_busy_state_trans_14b,
+	[35] = &llc_busy_state_trans_15a,
+	[36] = &llc_busy_state_trans_15b,
+	[37] = &llc_busy_state_trans_15c,
+	[38] = &llc_busy_state_trans_16,
+	[39] = &llc_busy_state_trans_17a,
+	[40] = &llc_busy_state_trans_17b,
+	[41] = &llc_busy_state_trans_17c,
+	[42] = &llc_busy_state_trans_18,
+	[43] = &llc_busy_state_trans_19a,
+	[44] = &llc_busy_state_trans_19b,
+	[45] = &llc_busy_state_trans_20a,
+	[46] = &llc_busy_state_trans_20b,
+	[47] = &llc_busy_state_trans_21,
+	[48] = &llc_common_state_trans_3,
+	[49] = &llc_common_state_trans_4,
+	[50] = &llc_common_state_trans_5,
+	[51] = &llc_common_state_trans_6,
+	[52] = &llc_common_state_trans_7a,
+	[53] = &llc_common_state_trans_7b,
+	[54] = &llc_common_state_trans_8a,
+	[55] = &llc_common_state_trans_8b,
+	[56] = &llc_common_state_trans_8c,
+	[57] = &llc_common_state_trans_9,
+	/* [58] = &llc_common_state_trans_10, -- entry disabled; index 58 is reused below */
+	[58] = &llc_common_state_trans_end,	/* end of Receive frame group */
+};
+
+/* Transitions defined for LLC_CONN_STATE_REJ */
+/* REJ state, LLC_CONN_EV_DATA_REQ event (qfyrs_1): next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_1[] = {
+	llc_conn_ev_qlfy_remote_busy_eq_0,
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_1[] = {
+	llc_conn_ac_send_i_xxx_x_set_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_1 = {
+	.ev            = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_reject_ev_qfyrs_1,
+	.ev_actions    = llc_reject_actions_1,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_DATA_REQ event (qfyrs_2): next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2[] = {
+	llc_conn_ev_qlfy_remote_busy_eq_0,
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_2[] = {
+	llc_conn_ac_send_i_xxx_x_set_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_2 = {
+	.ev            = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_reject_ev_qfyrs_2,
+	.ev_actions    = llc_reject_actions_2,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_DATA_REQ event (qfyrs_2_1): next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2_1[] = {
+	llc_conn_ev_qlfy_remote_busy_eq_1,
+	llc_conn_ev_qlfy_set_status_remote_busy,
+	NULL,
+};
+
+/* empty action list: the single slot is zero-initialised, i.e. NULL */
+static llc_conn_action_t llc_reject_actions_2_1[1];
+
+static struct llc_conn_state_trans llc_reject_state_trans_2_1 = {
+	.ev            = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_reject_ev_qfyrs_2_1,
+	.ev_actions    = llc_reject_actions_2_1,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+
+/* REJ state, LLC_CONN_EV_LOCAL_BUSY_DETECTED event (qfyrs_3): next state BUSY */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_3[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_3[] = {
+	llc_conn_ac_send_rnr_xxx_x_set_0,
+	llc_conn_ac_set_data_flag_2,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_3 = {
+	.ev            = llc_conn_ev_local_busy_detected,
+	.ev_qualifiers = llc_reject_ev_qfyrs_3,
+	.ev_actions    = llc_reject_actions_3,
+	.next_state    = LLC_CONN_STATE_BUSY,
+};
+
+/* REJ state, LLC_CONN_EV_LOCAL_BUSY_DETECTED event (qfyrs_4): next state BUSY */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_4[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_4[] = {
+	llc_conn_ac_send_rnr_xxx_x_set_0,
+	llc_conn_ac_set_data_flag_2,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_4 = {
+	.ev            = llc_conn_ev_local_busy_detected,
+	.ev_qualifiers = llc_reject_ev_qfyrs_4,
+	.ev_actions    = llc_reject_actions_4,
+	.next_state    = LLC_CONN_STATE_BUSY,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event: next state REJ */
+static llc_conn_action_t llc_reject_actions_5a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_5a = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_5a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event: next state REJ */
+static llc_conn_action_t llc_reject_actions_5b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_5b = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_5b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_5c[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_5c[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_5c = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+	.ev_qualifiers = llc_reject_ev_qfyrs_5c,
+	.ev_actions    = llc_reject_actions_5c,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event: next state REJ */
+static llc_conn_action_t llc_reject_actions_6[] = {
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_6 = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_6,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event.
+ * Next state is NORMAL. */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_f,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_7a[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_send_ack_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	llc_conn_ac_stop_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_7a = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_x,
+	.ev_qualifiers = llc_reject_ev_qfyrs_7a,
+	.ev_actions    = llc_reject_actions_7a,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event (qfyrs_7b): next state NORMAL */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_7b[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_send_ack_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy_if_f_eq_1,
+	llc_conn_ac_stop_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_7b = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = llc_reject_ev_qfyrs_7b,
+	.ev_actions    = llc_reject_actions_7b,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event: next state NORMAL */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_8a[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_ack_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_stop_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_8a = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_0,
+	.ev_qualifiers = llc_reject_ev_qfyrs_8a,
+	.ev_actions    = llc_reject_actions_8a,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event (qfyrs_8b): next state NORMAL */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_8b[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_ack_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_stop_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_8b = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = llc_reject_ev_qfyrs_8b,
+	.ev_actions    = llc_reject_actions_8b,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* REJ state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event: next state NORMAL */
+static llc_conn_action_t llc_reject_actions_9[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_ack_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_stop_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_9 = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_9,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_10a[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_10a = {
+	.ev            = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_10a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_10b[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_10b = {
+	.ev            = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_10b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_10c[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_10c[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_10c = {
+	.ev            = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+	.ev_qualifiers = llc_reject_ev_qfyrs_10c,
+	.ev_actions    = llc_reject_actions_10c,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_11[] = {
+	llc_conn_ac_send_ack_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_11 = {
+	.ev            = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_11,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_12a[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_12a = {
+	.ev            = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_12a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_12b[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_12b = {
+	.ev            = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_12b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_12c[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_12c[] = {
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_12c = {
+	.ev            = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+	.ev_qualifiers = llc_reject_ev_qfyrs_12c,
+	.ev_actions    = llc_reject_actions_12c,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_13[] = {
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_13 = {
+	.ev            = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_13,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event (qfyrs_14a): next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_14a[] = {
+	llc_conn_ac_set_vs_nr,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_14a = {
+	.ev            = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.ev_qualifiers = llc_reject_ev_qfyrs_14a,
+	.ev_actions    = llc_reject_actions_14a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_f,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_14b[] = {
+	llc_conn_ac_set_vs_nr,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_p_flag,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_14b = {
+	.ev            = llc_conn_ev_rx_rej_rsp_fbit_set_x,
+	.ev_qualifiers = llc_reject_ev_qfyrs_14b,
+	.ev_actions    = llc_reject_actions_14b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event (qfyrs_15a): next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_15a[] = {
+	llc_conn_ac_set_vs_nr,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_15a = {
+	.ev            = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.ev_qualifiers = llc_reject_ev_qfyrs_15a,
+	.ev_actions    = llc_reject_actions_15a,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_15b[] = {
+	llc_conn_ac_set_vs_nr,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_15b = {
+	.ev            = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+	.ev_qualifiers = llc_reject_ev_qfyrs_15b,
+	.ev_actions    = llc_reject_actions_15b,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event: next state REJ */
+static llc_conn_action_t llc_reject_actions_16[] = {
+	llc_conn_ac_set_vs_nr,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_resend_i_rsp_f_set_1,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_16 = {
+	.ev            = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_reject_actions_16,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_INIT_P_F_CYCLE event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_17[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_17[] = {
+	llc_conn_ac_send_rr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_17 = {
+	.ev            = llc_conn_ev_init_p_f_cycle,
+	.ev_qualifiers = llc_reject_ev_qfyrs_17,
+	.ev_actions    = llc_reject_actions_17,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_REJ_TMR_EXP event: next state REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_18[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_18[] = {
+	llc_conn_ac_send_rej_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_18 = {
+	.ev            = llc_conn_ev_rej_tmr_exp,
+	.ev_qualifiers = llc_reject_ev_qfyrs_18,
+	.ev_actions    = llc_reject_actions_18,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_P_TMR_EXP event: next state AWAIT_REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_19[] = {
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_19[] = {
+	llc_conn_ac_send_rr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	llc_conn_ac_rst_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_19 = {
+	.ev            = llc_conn_ev_p_tmr_exp,
+	.ev_qualifiers = llc_reject_ev_qfyrs_19,
+	.ev_actions    = llc_reject_actions_19,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_ACK_TMR_EXP event: next state AWAIT_REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20a[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_20a[] = {
+	llc_conn_ac_send_rr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	llc_conn_ac_rst_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_20a = {
+	.ev            = llc_conn_ev_ack_tmr_exp,
+	.ev_qualifiers = llc_reject_ev_qfyrs_20a,
+	.ev_actions    = llc_reject_actions_20a,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* REJ state, LLC_CONN_EV_BUSY_TMR_EXP event: next state AWAIT_REJ */
+static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20b[] = {
+	llc_conn_ev_qlfy_p_flag_eq_0,
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_reject_actions_20b[] = {
+	llc_conn_ac_send_rr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	llc_conn_ac_rst_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_reject_state_trans_20b = {
+	.ev            = llc_conn_ev_busy_tmr_exp,
+	.ev_qualifiers = llc_reject_ev_qfyrs_20b,
+	.ev_actions    = llc_reject_actions_20b,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/*
+ * REJ-state transition table, laid out like llc_busy_state_transitions:
+ * each labelled group of entries finishes with &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_reject_state_transitions[] = {
+	 [0] = &llc_common_state_trans_1,	/* Request */
+	 [1] = &llc_common_state_trans_2,
+	 [2] = &llc_reject_state_trans_1,	/* data_req entries sit in the Request group */
+	 [3] = &llc_reject_state_trans_2,
+	 [4] = &llc_reject_state_trans_2_1,
+	 [5] = &llc_common_state_trans_end,	/* was at [2], ahead of the data_req entries */
+	 [6] = &llc_reject_state_trans_3,	/* Local busy */
+	 [7] = &llc_reject_state_trans_4,
+	 [8] = &llc_common_state_trans_end,
+	 [9] = &llc_reject_state_trans_17,	/* Initiate PF cycle */
+	[10] = &llc_common_state_trans_end,
+	[11] = &llc_common_state_trans_11a,	/* Timer */
+	[12] = &llc_common_state_trans_11b,
+	[13] = &llc_common_state_trans_11c,
+	[14] = &llc_common_state_trans_11d,
+	[15] = &llc_reject_state_trans_18,
+	[16] = &llc_reject_state_trans_19,
+	[17] = &llc_reject_state_trans_20a,
+	[18] = &llc_reject_state_trans_20b,
+	[19] = &llc_common_state_trans_end,
+	[20] = &llc_common_state_trans_3,	/* Receive frame */
+	[21] = &llc_common_state_trans_4,
+	[22] = &llc_common_state_trans_5,
+	[23] = &llc_common_state_trans_6,
+	[24] = &llc_common_state_trans_7a,
+	[25] = &llc_common_state_trans_7b,
+	[26] = &llc_common_state_trans_8a,
+	[27] = &llc_common_state_trans_8b,
+	[28] = &llc_common_state_trans_8c,
+	[29] = &llc_common_state_trans_9,
+	/* [30] = &llc_common_state_trans_10, -- entry disabled; index 30 is reused below */
+	[30] = &llc_reject_state_trans_5a,
+	[31] = &llc_reject_state_trans_5b,
+	[32] = &llc_reject_state_trans_5c,
+	[33] = &llc_reject_state_trans_6,
+	[34] = &llc_reject_state_trans_7a,
+	[35] = &llc_reject_state_trans_7b,
+	[36] = &llc_reject_state_trans_8a,
+	[37] = &llc_reject_state_trans_8b,
+	[38] = &llc_reject_state_trans_9,
+	[39] = &llc_reject_state_trans_10a,
+	[40] = &llc_reject_state_trans_10b,
+	[41] = &llc_reject_state_trans_10c,
+	[42] = &llc_reject_state_trans_11,
+	[43] = &llc_reject_state_trans_12a,
+	[44] = &llc_reject_state_trans_12b,
+	[45] = &llc_reject_state_trans_12c,
+	[46] = &llc_reject_state_trans_13,
+	[47] = &llc_reject_state_trans_14a,
+	[48] = &llc_reject_state_trans_14b,
+	[49] = &llc_reject_state_trans_15a,
+	[50] = &llc_reject_state_trans_15b,
+	[51] = &llc_reject_state_trans_16,
+	[52] = &llc_common_state_trans_end,
+};
+
+/* Transitions defined for LLC_CONN_STATE_AWAIT */
+/* AWAIT state, LLC_CONN_EV_DATA_REQ event: next state AWAIT */
+static llc_conn_ev_qfyr_t llc_await_ev_qfyrs_1_0[] = {
+	llc_conn_ev_qlfy_set_status_refuse,
+	NULL,
+};
+
+/* empty action list: the single slot is zero-initialised, i.e. NULL */
+static llc_conn_action_t llc_await_actions_1_0[1];
+
+static struct llc_conn_state_trans llc_await_state_trans_1_0 = {
+	.ev            = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_await_ev_qfyrs_1_0,
+	.ev_actions    = llc_await_actions_1_0,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT state, LLC_CONN_EV_LOCAL_BUSY_DETECTED event: next state AWAIT_BUSY */
+static llc_conn_action_t llc_await_actions_1[] = {
+	llc_conn_ac_send_rnr_xxx_x_set_0,
+	llc_conn_ac_set_data_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_1 = {
+	.ev            = llc_conn_ev_local_busy_detected,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_1,
+	.next_state    = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event: next state REJ */
+static llc_conn_action_t llc_await_actions_2[] = {
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_2 = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_2,
+	.next_state    = LLC_CONN_STATE_REJ,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event: next state AWAIT_REJ */
+static llc_conn_action_t llc_await_actions_3a[] = {
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_start_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_3a = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_3a,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event: next state AWAIT_REJ */
+static llc_conn_action_t llc_await_actions_3b[] = {
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_start_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_3b = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_3b,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event: next state AWAIT_REJ */
+static llc_conn_action_t llc_await_actions_4[] = {
+	llc_conn_ac_send_rej_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_start_rej_timer,
+	llc_conn_ac_start_p_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_4 = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_4,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event: next state NORMAL */
+static llc_conn_action_t llc_await_actions_5[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_5 = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_5,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event: next state AWAIT */
+static llc_conn_action_t llc_await_actions_6a[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_rr_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_6a = {
+	.ev            = llc_conn_ev_rx_i_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_6a,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event: next state AWAIT */
+static llc_conn_action_t llc_await_actions_6b[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_rr_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_6b = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_6b,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event: next state AWAIT */
+static llc_conn_action_t llc_await_actions_7[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_7 = {
+	.ev            = llc_conn_ev_rx_i_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_7,
+	.next_state    = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event: next state NORMAL */
+static llc_conn_action_t llc_await_actions_8a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_8a = {
+	.ev            = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_8a,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* AWAIT state, LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event: next state NORMAL */
+static llc_conn_action_t llc_await_actions_8b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_8b = {
+	.ev            = llc_conn_ev_rx_rej_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_actions_8b,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+};
+
+/* AWAIT 9a: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_actions_9a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9a = {
+	.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_9a,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 9b: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_actions_9b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9b = {
+	.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_9b,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 9c: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_actions_9c[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9c = {
+	.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_9c,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 9d: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_actions_9d[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_9d = {
+	.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_9d,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 10a: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1, next state AWAIT */
+static llc_conn_action_t llc_await_actions_10a[] = {
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_10a = {
+	.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_10a,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 10b: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1, next state AWAIT */
+static llc_conn_action_t llc_await_actions_10b[] = {
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_10b = {
+	.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_10b,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 11: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1, next state NORMAL */
+static llc_conn_action_t llc_await_actions_11[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_11 = {
+	.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_11,
+	.next_state = LLC_CONN_STATE_NORMAL,
+};
+
+/* AWAIT 12a: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_actions_12a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_12a = {
+	.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_12a,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 12b: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_actions_12b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_12b = {
+	.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_12b,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 13: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1, next state AWAIT */
+static llc_conn_action_t llc_await_actions_13[] = {
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_13 = {
+	.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_actions_13,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT 14: LLC_CONN_EV_P_TMR_EXP gated by retry_cnt_lt_n2, next state AWAIT */
+static llc_conn_ev_qfyr_t llc_await_ev_qfyrs_14[] = {
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_await_actions_14[] = {
+	llc_conn_ac_send_rr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_state_trans_14 = {
+	.ev = llc_conn_ev_p_tmr_exp,
+	.ev_qualifiers = llc_await_ev_qfyrs_14,
+	.ev_actions = llc_await_actions_14,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/*
+ * Transition table for the AWAIT state: one pointer per transition, grouped
+ * by event class; every group is closed by &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_await_state_transitions[] = {
+	 [0] = &llc_common_state_trans_1,	/* Request */
+	 [1] = &llc_common_state_trans_2,
+	 [2] = &llc_await_state_trans_1_0,
+	 [3] = &llc_common_state_trans_end,
+	 [4] = &llc_await_state_trans_1,	/* Local busy */
+	 [5] = &llc_common_state_trans_end,
+	 [6] = &llc_common_state_trans_end,	/* Initiate PF Cycle (no entries) */
+	 [7] = &llc_common_state_trans_11a,	/* Timer */
+	 [8] = &llc_common_state_trans_11b,
+	 [9] = &llc_common_state_trans_11c,
+	[10] = &llc_common_state_trans_11d,
+	[11] = &llc_await_state_trans_14,
+	[12] = &llc_common_state_trans_end,
+	[13] = &llc_common_state_trans_3,	/* Receive frame */
+	[14] = &llc_common_state_trans_4,
+	[15] = &llc_common_state_trans_5,
+	[16] = &llc_common_state_trans_6,
+	[17] = &llc_common_state_trans_7a,
+	[18] = &llc_common_state_trans_7b,
+	[19] = &llc_common_state_trans_8a,
+	[20] = &llc_common_state_trans_8b,
+	[21] = &llc_common_state_trans_8c,
+	[22] = &llc_common_state_trans_9,
+	/* [23] = &llc_common_state_trans_10, */
+	[23] = &llc_await_state_trans_2,
+	[24] = &llc_await_state_trans_3a,
+	[25] = &llc_await_state_trans_3b,
+	[26] = &llc_await_state_trans_4,
+	[27] = &llc_await_state_trans_5,
+	[28] = &llc_await_state_trans_6a,
+	[29] = &llc_await_state_trans_6b,
+	[30] = &llc_await_state_trans_7,
+	[31] = &llc_await_state_trans_8a,
+	[32] = &llc_await_state_trans_8b,
+	[33] = &llc_await_state_trans_9a,
+	[34] = &llc_await_state_trans_9b,
+	[35] = &llc_await_state_trans_9c,
+	[36] = &llc_await_state_trans_9d,
+	[37] = &llc_await_state_trans_10a,
+	[38] = &llc_await_state_trans_10b,
+	[39] = &llc_await_state_trans_11,
+	[40] = &llc_await_state_trans_12a,
+	[41] = &llc_await_state_trans_12b,
+	[42] = &llc_await_state_trans_13,
+	[43] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_AWAIT_BUSY transitions */
+/* AWAIT_BUSY 1_0: llc_conn_ev_data_req gated by set_status_refuse, stays put */
+static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1_0[] = {
+	llc_conn_ev_qlfy_set_status_refuse,
+	NULL,
+};
+
+/* single slot, no initialiser: static storage leaves the lone entry NULL */
+static llc_conn_action_t llc_await_busy_actions_1_0[1];
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_1_0 = {
+	.ev = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_await_busy_ev_qfyrs_1_0,
+	.ev_actions = llc_await_busy_actions_1_0,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 1: LLC_CONN_EV_LOCAL_BUSY_CLEARED, data_flag_eq_1 -> AWAIT_REJ */
+static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1[] = {
+	llc_conn_ev_qlfy_data_flag_eq_1,
+	NULL,
+};
+
+static llc_conn_action_t llc_await_busy_actions_1[] = {
+	llc_conn_ac_send_rej_xxx_x_set_0,
+	llc_conn_ac_start_rej_timer,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_1 = {
+	.ev = llc_conn_ev_local_busy_cleared,
+	.ev_qualifiers = llc_await_busy_ev_qfyrs_1,
+	.ev_actions = llc_await_busy_actions_1,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_BUSY 2: LLC_CONN_EV_LOCAL_BUSY_CLEARED, data_flag_eq_0 -> AWAIT */
+static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_2[] = {
+	llc_conn_ev_qlfy_data_flag_eq_0,
+	NULL,
+};
+
+static llc_conn_action_t llc_await_busy_actions_2[] = {
+	llc_conn_ac_send_rr_xxx_x_set_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_2 = {
+	.ev = llc_conn_ev_local_busy_cleared,
+	.ev_qualifiers = llc_await_busy_ev_qfyrs_2,
+	.ev_actions = llc_await_busy_actions_2,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT_BUSY 3: LLC_CONN_EV_LOCAL_BUSY_CLEARED, data_flag_eq_2 -> AWAIT_REJ */
+static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_3[] = {
+	llc_conn_ev_qlfy_data_flag_eq_2,
+	NULL,
+};
+
+static llc_conn_action_t llc_await_busy_actions_3[] = {
+	llc_conn_ac_send_rr_xxx_x_set_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_3 = {
+	.ev = llc_conn_ev_local_busy_cleared,
+	.ev_qualifiers = llc_await_busy_ev_qfyrs_3,
+	.ev_actions = llc_await_busy_actions_3,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_BUSY 4: LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns, next state BUSY */
+static llc_conn_action_t llc_await_busy_actions_4[] = {
+	llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_set_data_flag_1,
+	llc_conn_ac_clear_remote_busy,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_4 = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_4,
+	.next_state = LLC_CONN_STATE_BUSY,
+};
+
+/* AWAIT_BUSY 5a: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_5a[] = {
+	llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_5a = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_5a,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 5b: LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_5b[] = {
+	llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_5b = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_5b,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 6: LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_6[] = {
+	llc_conn_ac_send_rnr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_6 = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_6,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 7: LLC_CONN_EV_RX_I_RSP_Fbit_SET_1, next state BUSY */
+static llc_conn_action_t llc_await_busy_actions_7[] = {
+	llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_0,
+	llc_conn_ac_clear_remote_busy,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_7 = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_7,
+	.next_state = LLC_CONN_STATE_BUSY,
+};
+
+/* AWAIT_BUSY 8a: LLC_CONN_EV_RX_I_RSP_Fbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_8a[] = {
+	llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_8a = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_8a,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 8b: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_8b[] = {
+	llc_conn_ac_opt_send_rnr_xxx_x_set_0,
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_8b = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_8b,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 9: LLC_CONN_EV_RX_I_CMD_Pbit_SET_1, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_9[] = {
+	llc_conn_ac_send_rnr_rsp_f_set_1,
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_data_flag_0,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_9 = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_9,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 10a: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1, next state BUSY */
+static llc_conn_action_t llc_await_busy_actions_10a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_10a = {
+	.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_10a,
+	.next_state = LLC_CONN_STATE_BUSY,
+};
+
+/* AWAIT_BUSY 10b: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1, next state BUSY */
+static llc_conn_action_t llc_await_busy_actions_10b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_10b = {
+	.ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_10b,
+	.next_state = LLC_CONN_STATE_BUSY,
+};
+
+/* AWAIT_BUSY 11a: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_11a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11a = {
+	.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_11a,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 11b: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_11b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11b = {
+	.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_11b,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 11c: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_11c[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11c = {
+	.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_11c,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 11d: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_11d[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_11d = {
+	.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_11d,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 12a: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_12a[] = {
+	llc_conn_ac_send_rnr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_12a = {
+	.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_12a,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 12b: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_12b[] = {
+	llc_conn_ac_send_rnr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_12b = {
+	.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_12b,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 13: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1, next state BUSY */
+static llc_conn_action_t llc_await_busy_actions_13[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_13 = {
+	.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_13,
+	.next_state = LLC_CONN_STATE_BUSY,
+};
+
+/* AWAIT_BUSY 14a: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_14a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_14a = {
+	.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_14a,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 14b: LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_14b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_14b = {
+	.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_14b,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 15: LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1, stays AWAIT_BUSY */
+static llc_conn_action_t llc_await_busy_actions_15[] = {
+	llc_conn_ac_send_rnr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_set_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_15 = {
+	.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_busy_actions_15,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_BUSY 16: LLC_CONN_EV_P_TMR_EXP gated by retry_cnt_lt_n2, stays put */
+static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_16[] = {
+	llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	NULL,
+};
+
+static llc_conn_action_t llc_await_busy_actions_16[] = {
+	llc_conn_ac_send_rnr_cmd_p_set_1,
+	llc_conn_ac_start_p_timer,
+	llc_conn_ac_inc_retry_cnt_by_1,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_busy_state_trans_16 = {
+	.ev = llc_conn_ev_p_tmr_exp,
+	.ev_qualifiers = llc_await_busy_ev_qfyrs_16,
+	.ev_actions = llc_await_busy_actions_16,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/*
+ * Transition table for the AWAIT_BUSY state: one pointer per transition,
+ * grouped by event class; each group ends with &llc_common_state_trans_end.
+ */
+static struct llc_conn_state_trans *llc_await_busy_state_transitions[] = {
+	 [0] = &llc_common_state_trans_1,		/* Request */
+	 [1] = &llc_common_state_trans_2,
+	 [2] = &llc_await_busy_state_trans_1_0,
+	 [3] = &llc_common_state_trans_end,
+	 [4] = &llc_await_busy_state_trans_1,		/* Local busy */
+	 [5] = &llc_await_busy_state_trans_2,
+	 [6] = &llc_await_busy_state_trans_3,
+	 [7] = &llc_common_state_trans_end,
+	 [8] = &llc_common_state_trans_end,		/* Initiate PF cycle (no entries) */
+	 [9] = &llc_common_state_trans_11a,		/* Timer */
+	[10] = &llc_common_state_trans_11b,
+	[11] = &llc_common_state_trans_11c,
+	[12] = &llc_common_state_trans_11d,
+	[13] = &llc_await_busy_state_trans_16,
+	[14] = &llc_common_state_trans_end,
+	[15] = &llc_await_busy_state_trans_4,		/* Receive frame */
+	[16] = &llc_await_busy_state_trans_5a,
+	[17] = &llc_await_busy_state_trans_5b,
+	[18] = &llc_await_busy_state_trans_6,
+	[19] = &llc_await_busy_state_trans_7,
+	[20] = &llc_await_busy_state_trans_8a,
+	[21] = &llc_await_busy_state_trans_8b,
+	[22] = &llc_await_busy_state_trans_9,
+	[23] = &llc_await_busy_state_trans_10a,
+	[24] = &llc_await_busy_state_trans_10b,
+	[25] = &llc_await_busy_state_trans_11a,
+	[26] = &llc_await_busy_state_trans_11b,
+	[27] = &llc_await_busy_state_trans_11c,
+	[28] = &llc_await_busy_state_trans_11d,
+	[29] = &llc_await_busy_state_trans_12a,
+	[30] = &llc_await_busy_state_trans_12b,
+	[31] = &llc_await_busy_state_trans_13,
+	[32] = &llc_await_busy_state_trans_14a,
+	[33] = &llc_await_busy_state_trans_14b,
+	[34] = &llc_await_busy_state_trans_15,
+	[35] = &llc_common_state_trans_3,
+	[36] = &llc_common_state_trans_4,
+	[37] = &llc_common_state_trans_5,
+	[38] = &llc_common_state_trans_6,
+	[39] = &llc_common_state_trans_7a,
+	[40] = &llc_common_state_trans_7b,
+	[41] = &llc_common_state_trans_8a,
+	[42] = &llc_common_state_trans_8b,
+	[43] = &llc_common_state_trans_8c,
+	[44] = &llc_common_state_trans_9,
+	/* [45] = &llc_common_state_trans_10, */
+	[45] = &llc_common_state_trans_end,
+};
+
+/* ----------------- LLC_CONN_STATE_AWAIT_REJ transitions --------------- */
+/* AWAIT_REJ 1_0: llc_conn_ev_data_req gated by set_status_refuse, stays put */
+static llc_conn_ev_qfyr_t llc_await_reject_ev_qfyrs_1_0[] = {
+	llc_conn_ev_qlfy_set_status_refuse,
+	NULL,
+};
+
+/* single slot, no initialiser: static storage leaves the lone entry NULL */
+static llc_conn_action_t llc_await_reject_actions_1_0[1];
+
+static struct llc_conn_state_trans llc_await_reject_state_trans_1_0 = {
+	.ev = llc_conn_ev_data_req,
+	.ev_qualifiers = llc_await_reject_ev_qfyrs_1_0,
+	.ev_actions = llc_await_reject_actions_1_0,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 1: LLC_CONN_EV_LOCAL_BUSY_DETECTED, next state AWAIT_BUSY */
+static llc_conn_action_t llc_await_rejct_actions_1[] = {
+	llc_conn_ac_send_rnr_xxx_x_set_0,
+	llc_conn_ac_set_data_flag_2,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_1 = {
+	.ev = llc_conn_ev_local_busy_detected,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_1,
+	.next_state = LLC_CONN_STATE_AWAIT_BUSY,
+};
+
+/* AWAIT_REJ 2a: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_2a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_2a = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_2a,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 2b: LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_2b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_2b = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_2b,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 3: LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_3[] = {
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_3 = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_3,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 4: LLC_CONN_EV_RX_I_RSP_Fbit_SET_1, next state NORMAL */
+static llc_conn_action_t llc_await_rejct_actions_4[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_stop_rej_timer,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_4 = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_4,
+	.next_state = LLC_CONN_STATE_NORMAL,
+};
+
+/* AWAIT_REJ 5a: LLC_CONN_EV_RX_I_RSP_Fbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_rejct_actions_5a[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_rr_xxx_x_set_0,
+	llc_conn_ac_stop_rej_timer,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_5a = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_5a,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT_REJ 5b: LLC_CONN_EV_RX_I_CMD_Pbit_SET_0, next state AWAIT */
+static llc_conn_action_t llc_await_rejct_actions_5b[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_rr_xxx_x_set_0,
+	llc_conn_ac_stop_rej_timer,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_5b = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_5b,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT_REJ 6: LLC_CONN_EV_RX_I_CMD_Pbit_SET_1, next state AWAIT */
+static llc_conn_action_t llc_await_rejct_actions_6[] = {
+	llc_conn_ac_inc_vr_by_1,
+	llc_conn_ac_data_ind,
+	llc_conn_ac_send_rr_rsp_f_set_1,
+	llc_conn_ac_stop_rej_timer,
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_6 = {
+	.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_6,
+	.next_state = LLC_CONN_STATE_AWAIT,
+};
+
+/* AWAIT_REJ 7a: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1, next state REJ */
+static llc_conn_action_t llc_await_rejct_actions_7a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_7a = {
+	.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_7a,
+	.next_state = LLC_CONN_STATE_REJ,
+};
+
+/* AWAIT_REJ 7b: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1, next state REJ */
+static llc_conn_action_t llc_await_rejct_actions_7b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_7b = {
+	.ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_7b,
+	.next_state = LLC_CONN_STATE_REJ,
+};
+
+/* AWAIT_REJ 7c: LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns, next state REJ */
+static llc_conn_action_t llc_await_rejct_actions_7c[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_stop_p_timer,
+	llc_conn_ac_resend_i_xxx_x_set_0,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_7c = {
+	.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_7c,
+	.next_state = LLC_CONN_STATE_REJ,
+};
+
+/* AWAIT_REJ 8a: LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_8a[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8a = {
+	.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_8a,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 8b: LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_8b[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8b = {
+	.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_8b,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 8c: LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_8c[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8c = {
+	.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_8c,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* AWAIT_REJ 8d: LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0, stays AWAIT_REJ */
+static llc_conn_action_t llc_await_rejct_actions_8d[] = {
+	llc_conn_ac_upd_nr_received,
+	llc_conn_ac_upd_vs,
+	llc_conn_ac_clear_remote_busy,
+	NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_8d = {
+	.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
+	.ev_qualifiers = NONE,
+	.ev_actions = llc_await_rejct_actions_8d,
+	.next_state = LLC_CONN_STATE_AWAIT_REJ,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */
+static llc_conn_action_t llc_await_rejct_actions_9a[] = {
+	[0] = llc_conn_ac_send_rr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_vs,
+	[3] = llc_conn_ac_clear_remote_busy,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_9a = {
+	.ev	       = llc_conn_ev_rx_rr_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_rejct_actions_9a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */
+static llc_conn_action_t llc_await_rejct_actions_9b[] = {
+	[0] = llc_conn_ac_send_rr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_vs,
+	[3] = llc_conn_ac_clear_remote_busy,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_9b = {
+	.ev	       = llc_conn_ev_rx_rej_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_rejct_actions_9b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */
+static llc_conn_action_t llc_await_rejct_actions_10[] = {
+	[0] = llc_conn_ac_upd_nr_received,
+	[1] = llc_conn_ac_upd_vs,
+	[2] = llc_conn_ac_stop_p_timer,
+	[3] = llc_conn_ac_set_remote_busy,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_10 = {
+	.ev	       = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
+	.next_state    = LLC_CONN_STATE_REJ,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_rejct_actions_10,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */
+static llc_conn_action_t llc_await_rejct_actions_11a[] = {
+	[0] = llc_conn_ac_upd_nr_received,
+	[1] = llc_conn_ac_upd_vs,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_11a = {
+	.ev	       = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_rejct_actions_11a,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */
+static llc_conn_action_t llc_await_rejct_actions_11b[] = {
+	[0] = llc_conn_ac_upd_nr_received,
+	[1] = llc_conn_ac_upd_vs,
+	[2] = llc_conn_ac_set_remote_busy,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_11b = {
+	.ev	       = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_rejct_actions_11b,
+};
+
+/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */
+static llc_conn_action_t llc_await_rejct_actions_12[] = {
+	[0] = llc_conn_ac_send_rr_rsp_f_set_1,
+	[1] = llc_conn_ac_upd_nr_received,
+	[2] = llc_conn_ac_upd_vs,
+	[3] = llc_conn_ac_set_remote_busy,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_12 = {
+	.ev	       = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_await_rejct_actions_12,
+};
+
+/* State transitions for LLC_CONN_EV_P_TMR_EXP event (retry_cnt_lt_n2) */
+static llc_conn_ev_qfyr_t llc_await_rejct_ev_qfyrs_13[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_await_rejct_actions_13[] = {
+	[0] = llc_conn_ac_send_rej_cmd_p_set_1,
+	[1] = llc_conn_ac_stop_p_timer,	/* NOTE(review): stop, not start? */
+	[2] = llc_conn_ac_inc_retry_cnt_by_1,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_await_rejct_state_trans_13 = {
+	.ev	       = llc_conn_ev_p_tmr_exp,
+	.next_state    = LLC_CONN_STATE_AWAIT_REJ,
+	.ev_qualifiers = llc_await_rejct_ev_qfyrs_13,
+	.ev_actions    = llc_await_rejct_actions_13,
+};
+
+/*
+ * AWAIT_REJ state: array of pointers, one to each transition, grouped by
+ * event type (see tags); each group is followed by llc_common_state_trans_end
+ */
+static struct llc_conn_state_trans *llc_await_rejct_state_transitions[] = {
+	 [0] = &llc_await_reject_state_trans_1_0,
+	 [1] = &llc_common_state_trans_1,		/* requests */
+	 [2] = &llc_common_state_trans_2,
+	 [3] = &llc_common_state_trans_end,
+	 [4] = &llc_await_rejct_state_trans_1,		/* local busy */
+	 [5] = &llc_common_state_trans_end,
+	 [6] = &llc_common_state_trans_end,		/* Initiate PF cycle */
+	 [7] = &llc_await_rejct_state_trans_13,	/* timers */
+	 [8] = &llc_common_state_trans_11a,
+	 [9] = &llc_common_state_trans_11b,
+	[10] = &llc_common_state_trans_11c,
+	[11] = &llc_common_state_trans_11d,
+	[12] = &llc_common_state_trans_end,
+	[13] = &llc_await_rejct_state_trans_2a,	/* receive frames */
+	[14] = &llc_await_rejct_state_trans_2b,
+	[15] = &llc_await_rejct_state_trans_3,
+	[16] = &llc_await_rejct_state_trans_4,
+	[17] = &llc_await_rejct_state_trans_5a,
+	[18] = &llc_await_rejct_state_trans_5b,
+	[19] = &llc_await_rejct_state_trans_6,
+	[20] = &llc_await_rejct_state_trans_7a,
+	[21] = &llc_await_rejct_state_trans_7b,
+	[22] = &llc_await_rejct_state_trans_7c,
+	[23] = &llc_await_rejct_state_trans_8a,
+	[24] = &llc_await_rejct_state_trans_8b,
+	[25] = &llc_await_rejct_state_trans_8c,
+	[26] = &llc_await_rejct_state_trans_8d,
+	[27] = &llc_await_rejct_state_trans_9a,
+	[28] = &llc_await_rejct_state_trans_9b,
+	[29] = &llc_await_rejct_state_trans_10,
+	[30] = &llc_await_rejct_state_trans_11a,
+	[31] = &llc_await_rejct_state_trans_11b,
+	[32] = &llc_await_rejct_state_trans_12,
+	[33] = &llc_common_state_trans_3,
+	[34] = &llc_common_state_trans_4,
+	[35] = &llc_common_state_trans_5,
+	[36] = &llc_common_state_trans_6,
+	[37] = &llc_common_state_trans_7a,
+	[38] = &llc_common_state_trans_7b,
+	[39] = &llc_common_state_trans_8a,
+	[40] = &llc_common_state_trans_8b,
+	[41] = &llc_common_state_trans_8c,
+	[42] = &llc_common_state_trans_9,
+	/* [43] = &llc_common_state_trans_10, */
+	[43] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_D_CONN transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event,
+ * cause_flag = 1 */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_set_status_conflict,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_1[] = {
+	[0] = llc_conn_ac_send_dm_rsp_f_set_p,
+	[1] = llc_conn_ac_stop_ack_timer,
+	[2] = llc_conn_ac_disc_confirm,
+	[3] = llc_conn_disc,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_1 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_1,
+	.ev_actions    = llc_d_conn_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event,
+ * cause_flag = 0
+ */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1_1[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_set_status_conflict,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_1_1[] = {
+	[0] = llc_conn_ac_send_dm_rsp_f_set_p,
+	[1] = llc_conn_ac_stop_ack_timer,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_1_1 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_1_1,
+	.ev_actions    = llc_d_conn_actions_1_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[2] = llc_conn_ev_qlfy_set_status_disc,
+	[3] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_2[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_ac_disc_confirm,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_2 = {
+	.ev	       = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_2,
+	.ev_actions    = llc_d_conn_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2_1[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[2] = llc_conn_ev_qlfy_set_status_disc,
+	[3] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_2_1[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_disc,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_2_1 = {
+	.ev	       = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_2_1,
+	.ev_actions    = llc_d_conn_actions_2_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_d_conn_actions_3[] = {
+	[0] = llc_conn_ac_send_ua_rsp_f_set_p,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_3 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_D_CONN,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_d_conn_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_set_status_disc,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_4[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_ac_disc_confirm,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_4 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_4,
+	.ev_actions    = llc_d_conn_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4_1[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_set_status_disc,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_4_1[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_disc,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_4_1 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_4_1,
+	.ev_actions    = llc_d_conn_actions_4_1,
+};
+
+/*
+ * State transition for
+ * LLC_CONN_EV_DATA_CONN_REQ event
+ */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_5[] = {
+	[0] = llc_conn_ev_qlfy_set_status_refuse,
+	[1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static llc_conn_action_t llc_d_conn_actions_5[1];
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_5 = {
+	.ev	       = llc_conn_ev_data_req,
+	.next_state    = LLC_CONN_STATE_D_CONN,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_5,
+	.ev_actions    = llc_d_conn_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event (retry_cnt_lt_n2) */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_6[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_6[] = {
+	[0] = llc_conn_ac_send_disc_cmd_p_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_inc_retry_cnt_by_1,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_6 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_D_CONN,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_6,
+	.ev_actions    = llc_d_conn_actions_6,
+};
+
+/* LLC_CONN_EV_ACK_TMR_EXP event (retry_cnt_gte_n2, cause_flag_eq_1) */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_7[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	[1] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[2] = llc_conn_ev_qlfy_set_status_failed,
+	[3] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_7[] = {
+	[0] = llc_conn_ac_disc_confirm,
+	[1] = llc_conn_disc,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_7 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_7,
+	.ev_actions    = llc_d_conn_actions_7,
+};
+
+/* LLC_CONN_EV_ACK_TMR_EXP event (retry_cnt_gte_n2, cause_flag_eq_0) */
+static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_8[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	[1] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[2] = llc_conn_ev_qlfy_set_status_failed,
+	[3] = NULL,
+};
+
+static llc_conn_action_t llc_d_conn_actions_8[] = {
+	[0] = llc_conn_disc,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_d_conn_state_trans_8 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_d_conn_ev_qfyrs_8,
+	.ev_actions    = llc_d_conn_actions_8,
+};
+
+/*
+ * D_CONN state: array of pointers, one to each transition, grouped by
+ * event type (see tags); each group is followed by llc_common_state_trans_end
+ */
+static struct llc_conn_state_trans *llc_d_conn_state_transitions[] = {
+	 [0] = &llc_d_conn_state_trans_5,	/* Request */
+	 [1] = &llc_common_state_trans_end,
+	 [2] = &llc_common_state_trans_end,	/* Local busy */
+	 [3] = &llc_common_state_trans_end,	/* Initiate PF cycle */
+	 [4] = &llc_d_conn_state_trans_6,	/* Timer */
+	 [5] = &llc_d_conn_state_trans_7,
+	 [6] = &llc_d_conn_state_trans_8,
+	 [7] = &llc_common_state_trans_end,
+	 [8] = &llc_d_conn_state_trans_1,	/* Receive frame */
+	 [9] = &llc_d_conn_state_trans_1_1,
+	[10] = &llc_d_conn_state_trans_2,
+	[11] = &llc_d_conn_state_trans_2_1,
+	[12] = &llc_d_conn_state_trans_3,
+	[13] = &llc_d_conn_state_trans_4,
+	[14] = &llc_d_conn_state_trans_4_1,
+	[15] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_RESET transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_rst_actions_1[] = {
+	[0] = llc_conn_ac_set_vs_0,
+	[1] = llc_conn_ac_set_vr_0,
+	[2] = llc_conn_ac_set_s_flag_1,
+	[3] = llc_conn_ac_send_ua_rsp_f_set_p,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_1 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_rst_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[2] = llc_conn_ev_qlfy_set_status_conn,
+	[3] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_2[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_ac_set_vs_0,
+	[2] = llc_conn_ac_set_vr_0,
+	[3] = llc_conn_ac_upd_p_flag,
+	[4] = llc_conn_ac_rst_confirm,
+	[5] = llc_conn_ac_set_remote_busy_0,
+	[6] = llc_conn_reset,
+	[7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_2 = {
+	.ev	       = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_rst_ev_qfyrs_2,
+	.ev_actions    = llc_rst_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2_1[] = {
+	[0] = llc_conn_ev_qlfy_p_flag_eq_f,
+	[1] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[2] = llc_conn_ev_qlfy_set_status_rst_done,
+	[3] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_2_1[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_ac_set_vs_0,
+	[2] = llc_conn_ac_set_vr_0,
+	[3] = llc_conn_ac_upd_p_flag,
+	[4] = llc_conn_ac_rst_confirm,
+	[5] = llc_conn_ac_set_remote_busy_0,
+	[6] = llc_conn_reset,
+	[7] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_2_1 = {
+	.ev	       = llc_conn_ev_rx_ua_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_rst_ev_qfyrs_2_1,
+	.ev_actions    = llc_rst_actions_2_1,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event (s_flag_eq_1) */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_3[] = {
+	[0] = llc_conn_ev_qlfy_s_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_set_status_rst_done,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_3[] = {
+	[0] = llc_conn_ac_set_p_flag_0,
+	[1] = llc_conn_ac_set_remote_busy_0,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_3 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = llc_rst_ev_qfyrs_3,
+	.ev_actions    = llc_rst_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event,
+ * cause_flag = 1
+ */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_set_status_disc,
+	[2] = NULL,
+};
+static llc_conn_action_t llc_rst_actions_4[] = {
+	[0] = llc_conn_ac_send_dm_rsp_f_set_p,
+	[1] = llc_conn_ac_disc_ind,
+	[2] = llc_conn_ac_stop_ack_timer,
+	[3] = llc_conn_disc,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_4 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_rst_ev_qfyrs_4,
+	.ev_actions    = llc_rst_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event,
+ * cause_flag = 0
+ */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4_1[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_set_status_refuse,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_4_1[] = {
+	[0] = llc_conn_ac_send_dm_rsp_f_set_p,
+	[1] = llc_conn_ac_stop_ack_timer,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_4_1 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_rst_ev_qfyrs_4_1,
+	.ev_actions    = llc_rst_actions_4_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 1
+ */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[1] = llc_conn_ev_qlfy_set_status_disc,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_5[] = {
+	[0] = llc_conn_ac_disc_ind,
+	[1] = llc_conn_ac_stop_ack_timer,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_5 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_rst_ev_qfyrs_5,
+	.ev_actions    = llc_rst_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event,
+ * cause_flag = 0
+ */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5_1[] = {
+	[0] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[1] = llc_conn_ev_qlfy_set_status_refuse,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_5_1[] = {
+	[0] = llc_conn_ac_stop_ack_timer,
+	[1] = llc_conn_disc,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_5_1 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_rst_ev_qfyrs_5_1,
+	.ev_actions    = llc_rst_actions_5_1,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_6[] = {
+	[0] = llc_conn_ev_qlfy_set_status_refuse,
+	[1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static llc_conn_action_t llc_rst_actions_6[1];
+
+static struct llc_conn_state_trans llc_rst_state_trans_6 = {
+	.ev	       = llc_conn_ev_data_req,
+	.next_state    = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = llc_rst_ev_qfyrs_6,
+	.ev_actions    = llc_rst_actions_6,
+};
+
+/* LLC_CONN_EV_ACK_TMR_EXP event (retry_cnt_lt_n2, s_flag_eq_0) */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_7[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[1] = llc_conn_ev_qlfy_s_flag_eq_0,
+	[2] = NULL,
+};
+
+static llc_conn_action_t llc_rst_actions_7[] = {
+	[0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_inc_retry_cnt_by_1,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_7 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = llc_rst_ev_qfyrs_7,
+	.ev_actions    = llc_rst_actions_7,
+};
+
+/* LLC_CONN_EV_ACK_TMR_EXP: retry_cnt_gte_n2, s_flag_eq_0, cause_flag_eq_1 */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	[1] = llc_conn_ev_qlfy_s_flag_eq_0,
+	[2] = llc_conn_ev_qlfy_cause_flag_eq_1,
+	[3] = llc_conn_ev_qlfy_set_status_failed,
+	[4] = NULL,
+};
+static llc_conn_action_t llc_rst_actions_8[] = {
+	[0] = llc_conn_ac_disc_ind,
+	[1] = llc_conn_disc,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_8 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_rst_ev_qfyrs_8,
+	.ev_actions    = llc_rst_actions_8,
+};
+
+/* LLC_CONN_EV_ACK_TMR_EXP: retry_cnt_gte_n2, s_flag_eq_0, cause_flag_eq_0 */
+static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8_1[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	[1] = llc_conn_ev_qlfy_s_flag_eq_0,
+	[2] = llc_conn_ev_qlfy_cause_flag_eq_0,
+	[3] = llc_conn_ev_qlfy_set_status_failed,
+	[4] = NULL,
+};
+static llc_conn_action_t llc_rst_actions_8_1[] = {
+	[0] = llc_conn_ac_disc_ind,
+	[1] = llc_conn_disc,
+	[2] = NULL,
+};
+
+static struct llc_conn_state_trans llc_rst_state_trans_8_1 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = llc_rst_ev_qfyrs_8_1,
+	.ev_actions    = llc_rst_actions_8_1,
+};
+
+/*
+ * RESET state: array of pointers, one to each transition, grouped by
+ * event type (see tags); each group is followed by llc_common_state_trans_end
+ */
+static struct llc_conn_state_trans *llc_rst_state_transitions[] = {
+	 [0] = &llc_rst_state_trans_6,		/* Request */
+	 [1] = &llc_common_state_trans_end,
+	 [2] = &llc_common_state_trans_end,	/* Local busy */
+	 [3] = &llc_common_state_trans_end,	/* Initiate PF cycle */
+	 [4] = &llc_rst_state_trans_3,		/* Timer */
+	 [5] = &llc_rst_state_trans_7,
+	 [6] = &llc_rst_state_trans_8,
+	 [7] = &llc_rst_state_trans_8_1,
+	 [8] = &llc_common_state_trans_end,
+	 [9] = &llc_rst_state_trans_1,		/* Receive frame */
+	[10] = &llc_rst_state_trans_2,
+	[11] = &llc_rst_state_trans_2_1,
+	[12] = &llc_rst_state_trans_4,
+	[13] = &llc_rst_state_trans_4_1,
+	[14] = &llc_rst_state_trans_5,
+	[15] = &llc_rst_state_trans_5_1,
+	[16] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_ERROR transitions */
+/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_error_actions_1[] = {
+	[0] = llc_conn_ac_set_vs_0,
+	[1] = llc_conn_ac_set_vr_0,
+	[2] = llc_conn_ac_send_ua_rsp_f_set_p,
+	[3] = llc_conn_ac_rst_ind,
+	[4] = llc_conn_ac_set_p_flag_0,
+	[5] = llc_conn_ac_set_remote_busy_0,
+	[6] = llc_conn_ac_stop_ack_timer,
+	[7] = llc_conn_reset,
+	[8] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_1 = {
+	.ev	       = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_NORMAL,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_error_actions_1,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_error_actions_2[] = {
+	[0] = llc_conn_ac_send_ua_rsp_f_set_p,
+	[1] = llc_conn_ac_disc_ind,
+	[2] = llc_conn_ac_stop_ack_timer,
+	[3] = llc_conn_disc,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_2 = {
+	.ev	       = llc_conn_ev_rx_disc_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_error_actions_2,
+};
+
+/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */
+static llc_conn_action_t llc_error_actions_3[] = {
+	[0] = llc_conn_ac_disc_ind,
+	[1] = llc_conn_ac_stop_ack_timer,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_3 = {
+	.ev	       = llc_conn_ev_rx_dm_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_error_actions_3,
+};
+
+/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */
+static llc_conn_action_t llc_error_actions_4[] = {
+	[0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_set_retry_cnt_0,
+	[3] = llc_conn_ac_set_cause_flag_0,
+	[4] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_4 = {
+	.ev	       = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_error_actions_4,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_X event */
+static llc_conn_action_t llc_error_actions_5[] = {
+	[0] = llc_conn_ac_resend_frmr_rsp_f_set_p,
+	[1] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_5 = {
+	.ev	       = llc_conn_ev_rx_xxx_cmd_pbit_set_x,
+	.next_state    = LLC_CONN_STATE_ERROR,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_error_actions_5,
+};
+
+/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_X event */
+static struct llc_conn_state_trans llc_error_state_trans_6 = {
+	.ev	       = llc_conn_ev_rx_xxx_rsp_fbit_set_x,
+	.next_state    = LLC_CONN_STATE_ERROR,
+	.ev_qualifiers = NONE,
+	.ev_actions    = NONE,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event (retry_cnt_lt_n2) */
+static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_7[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_lt_n2,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_error_actions_7[] = {
+	[0] = llc_conn_ac_resend_frmr_rsp_f_set_0,
+	[1] = llc_conn_ac_start_ack_timer,
+	[2] = llc_conn_ac_inc_retry_cnt_by_1,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_7 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_ERROR,
+	.ev_qualifiers = llc_error_ev_qfyrs_7,
+	.ev_actions    = llc_error_actions_7,
+};
+
+/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event (retry_cnt_gte_n2) */
+static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_8[] = {
+	[0] = llc_conn_ev_qlfy_retry_cnt_gte_n2,
+	[1] = NULL,
+};
+
+static llc_conn_action_t llc_error_actions_8[] = {
+	[0] = llc_conn_ac_send_sabme_cmd_p_set_x,
+	[1] = llc_conn_ac_set_s_flag_0,
+	[2] = llc_conn_ac_start_ack_timer,
+	[3] = llc_conn_ac_set_retry_cnt_0,
+	[4] = llc_conn_ac_set_cause_flag_0,
+	[5] = NULL,
+};
+
+static struct llc_conn_state_trans llc_error_state_trans_8 = {
+	.ev	       = llc_conn_ev_ack_tmr_exp,
+	.next_state    = LLC_CONN_STATE_RESET,
+	.ev_qualifiers = llc_error_ev_qfyrs_8,
+	.ev_actions    = llc_error_actions_8,
+};
+
+/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */
+static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_9[] = {
+	[0] = llc_conn_ev_qlfy_set_status_refuse,
+	[1] = NULL,
+};
+
+/* just one member, NULL, .bss zeroes it */
+static llc_conn_action_t llc_error_actions_9[1];
+
+static struct llc_conn_state_trans llc_error_state_trans_9 = {
+	.ev	       = llc_conn_ev_data_req,
+	.next_state    = LLC_CONN_STATE_ERROR,
+	.ev_qualifiers = llc_error_ev_qfyrs_9,
+	.ev_actions    = llc_error_actions_9,
+};
+
+/*
+ * ERROR state: array of pointers, one to each transition, grouped by
+ * event type (see tags); each group is followed by llc_common_state_trans_end
+ */
+static struct llc_conn_state_trans *llc_error_state_transitions[] = {
+	 [0] = &llc_error_state_trans_9,	/* Request */
+	 [1] = &llc_common_state_trans_end,
+	 [2] = &llc_common_state_trans_end,	/* Local busy */
+	 [3] = &llc_common_state_trans_end,	/* Initiate PF cycle */
+	 [4] = &llc_error_state_trans_7,	/* Timer */
+	 [5] = &llc_error_state_trans_8,
+	 [6] = &llc_common_state_trans_end,
+	 [7] = &llc_error_state_trans_1,	/* Receive frame */
+	 [8] = &llc_error_state_trans_2,
+	 [9] = &llc_error_state_trans_3,
+	[10] = &llc_error_state_trans_4,
+	[11] = &llc_error_state_trans_5,
+	[12] = &llc_error_state_trans_6,
+	[13] = &llc_common_state_trans_end,
+};
+
+/* LLC_CONN_STATE_TEMP transitions */
+/* State transitions for LLC_CONN_EV_DISC_REQ event */
+static llc_conn_action_t llc_temp_actions_1[] = {
+	[0] = llc_conn_ac_stop_all_timers,
+	[1] = llc_conn_ac_send_disc_cmd_p_set_x,
+	[2] = llc_conn_disc,
+	[3] = NULL,
+};
+
+static struct llc_conn_state_trans llc_temp_state_trans_1 = {
+	.ev	       = llc_conn_ev_disc_req,
+	.next_state    = LLC_CONN_STATE_ADM,
+	.ev_qualifiers = NONE,
+	.ev_actions    = llc_temp_actions_1,
+};
+
+/*
+ * TEMP state: array of pointers, one to each transition, grouped by event
+ * type (see tags); each group, even if empty, ends in llc_common_state_trans_end
+ */
+static struct llc_conn_state_trans *llc_temp_state_transitions[] = {
+	[0] = &llc_temp_state_trans_1,		/* requests */
+	[1] = &llc_common_state_trans_end,
+	[2] = &llc_common_state_trans_end,	/* local busy */
+	[3] = &llc_common_state_trans_end,	/* init_pf_cycle */
+	[4] = &llc_common_state_trans_end,	/* timer */
+	[5] = &llc_common_state_trans_end,	/* receive */
+};
+
+/* Connection State Transition Table; slot index is (state constant - 1) */
+struct llc_conn_state llc_conn_state_table[NBR_CONN_STATES] = {
+	[LLC_CONN_STATE_ADM - 1] = {
+		.current_state	= LLC_CONN_STATE_ADM,
+		.transitions	= llc_adm_state_transitions,
+	},
+	[LLC_CONN_STATE_SETUP - 1] = {
+		.current_state	= LLC_CONN_STATE_SETUP,
+		.transitions	= llc_setup_state_transitions,
+	},
+	[LLC_CONN_STATE_NORMAL - 1] = {
+		.current_state	= LLC_CONN_STATE_NORMAL,
+		.transitions	= llc_normal_state_transitions,
+	},
+	[LLC_CONN_STATE_BUSY - 1] = {
+		.current_state	= LLC_CONN_STATE_BUSY,
+		.transitions	= llc_busy_state_transitions,
+	},
+	[LLC_CONN_STATE_REJ - 1] = {
+		.current_state	= LLC_CONN_STATE_REJ,
+		.transitions	= llc_reject_state_transitions,
+	},
+	[LLC_CONN_STATE_AWAIT - 1] = {
+		.current_state	= LLC_CONN_STATE_AWAIT,
+		.transitions	= llc_await_state_transitions,
+	},
+	[LLC_CONN_STATE_AWAIT_BUSY - 1] = {
+		.current_state	= LLC_CONN_STATE_AWAIT_BUSY,
+		.transitions	= llc_await_busy_state_transitions,
+	},
+	[LLC_CONN_STATE_AWAIT_REJ - 1] = {
+		.current_state	= LLC_CONN_STATE_AWAIT_REJ,
+		.transitions	= llc_await_rejct_state_transitions,
+	},
+	[LLC_CONN_STATE_D_CONN - 1] = {
+		.current_state	= LLC_CONN_STATE_D_CONN,
+		.transitions	= llc_d_conn_state_transitions,
+	},
+	[LLC_CONN_STATE_RESET - 1] = {
+		.current_state	= LLC_CONN_STATE_RESET,
+		.transitions	= llc_rst_state_transitions,
+	},
+	[LLC_CONN_STATE_ERROR - 1] = {
+		.current_state	= LLC_CONN_STATE_ERROR,
+		.transitions	= llc_error_state_transitions,
+	},
+	[LLC_CONN_STATE_TEMP - 1] = {
+		.current_state	= LLC_CONN_STATE_TEMP,
+		.transitions	= llc_temp_state_transitions,
+	},
+};
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
new file mode 100644
index 00000000..ba137a6a
--- /dev/null
+++ b/net/llc/llc_conn.c
@@ -0,0 +1,1016 @@
+/*
+ * llc_conn.c - Driver routines for connection component.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/llc_sap.h>
+#include <net/llc_conn.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/llc_pdu.h>
+
+#if 0	/* set to 1 to route dprintk() to printk(KERN_DEBUG ...) */
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+static int llc_find_offset(int state, int ev_type);
+static void llc_conn_send_pdus(struct sock *sk);
+static int llc_conn_service(struct sock *sk, struct sk_buff *skb);
+static int llc_exec_conn_trans_actions(struct sock *sk,
+				       struct llc_conn_state_trans *trans,
+				       struct sk_buff *ev);
+static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
+							struct sk_buff *skb);
+
+/* Start offset of each event category within every state's transition list */
+static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV];
+
+int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ;	/* ack_timer */
+int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ;		/* pf_cycle_timer */
+int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ;	/* rej_sent_timer */
+int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ;	/* busy_state_timer */
+
+/**
+ *	llc_conn_state_process - sends event to connection state machine
+ *	@sk: connection
+ *	@skb: occurred event
+ *
+ *	Sends an event to connection state machine. After processing event
+ *	(executing its actions and changing state), the upper layer will be
+ *	indicated or confirmed, if needed. Returns 0 for success, 1 for
+ *	failure. The socket lock has to be held before calling this function.
+ */
+int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
+{
+	int rc;
+	struct llc_sock *llc = llc_sk(skb->sk);
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	/*
+	 * We have to hold the skb, because llc_conn_service will kfree it in
+	 * the sending path and we need to look at the skb->cb, where we encode
+	 * llc_conn_state_ev.
+	 */
+	skb_get(skb);
+	ev->ind_prim = ev->cfm_prim = 0;
+	/*
+	 * Send event to state machine
+	 */
+	rc = llc_conn_service(skb->sk, skb);
+	if (unlikely(rc != 0)) {
+		printk(KERN_ERR "%s: llc_conn_service failed\n", __func__);
+		goto out_kfree_skb;
+	}
+
+	if (unlikely(!ev->ind_prim && !ev->cfm_prim)) {
+		/* indicate or confirm not required */
+		if (!skb->next)
+			goto out_kfree_skb;
+		goto out_skb_put;
+	}
+
+	if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia: both pending */
+		skb_get(skb);
+
+	switch (ev->ind_prim) {
+	case LLC_DATA_PRIM:
+		llc_save_primitive(sk, skb, LLC_DATA_PRIM);
+		if (unlikely(sock_queue_rcv_skb(sk, skb))) {
+			/*
+			 * shouldn't happen; the skb was not queued, drop it
+			 */
+			printk(KERN_ERR "%s: sock_queue_rcv_skb failed!\n",
+			       __func__);
+			kfree_skb(skb);
+		}
+		break;
+	case LLC_CONN_PRIM:
+		/*
+		 * Can't be sock_queue_rcv_skb, because we have to leave the
+		 * skb->sk pointing to the newly created struct sock in
+		 * llc_conn_handler. -acme
+		 */
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		sk->sk_state_change(sk);
+		break;
+	case LLC_DISC_PRIM:
+		sock_hold(sk);
+		if (sk->sk_type == SOCK_STREAM &&
+		    sk->sk_state == TCP_ESTABLISHED) {
+			sk->sk_shutdown       = SHUTDOWN_MASK;
+			sk->sk_socket->state  = SS_UNCONNECTED;
+			sk->sk_state          = TCP_CLOSE;
+			if (!sock_flag(sk, SOCK_DEAD)) {
+				sock_set_flag(sk, SOCK_DEAD);
+				sk->sk_state_change(sk);
+			}
+		}
+		kfree_skb(skb);
+		sock_put(sk);
+		break;
+	case LLC_RESET_PRIM:
+		/*
+		 * FIXME:
+		 * RESET is not being notified to upper layers for now
+		 */
+		printk(KERN_INFO "%s: received a reset ind!\n", __func__);
+		kfree_skb(skb);
+		break;
+	default:
+		if (ev->ind_prim) {
+			printk(KERN_INFO "%s: received unknown %d prim!\n",
+				__func__, ev->ind_prim);
+			kfree_skb(skb);
+		}
+		/* No indication */
+		break;
+	}
+
+	switch (ev->cfm_prim) {
+	case LLC_DATA_PRIM:
+		if (!llc_data_accept_state(llc->state))
+			sk->sk_write_space(sk);
+		else
+			rc = llc->failed_data_req = 1;
+		break;
+	case LLC_CONN_PRIM:
+		if (sk->sk_type == SOCK_STREAM &&
+		    sk->sk_state == TCP_SYN_SENT) {
+			if (ev->status) {
+				sk->sk_socket->state = SS_UNCONNECTED;
+				sk->sk_state         = TCP_CLOSE;
+			} else {
+				sk->sk_socket->state = SS_CONNECTED;
+				sk->sk_state         = TCP_ESTABLISHED;
+			}
+			sk->sk_state_change(sk);
+		}
+		break;
+	case LLC_DISC_PRIM:
+		sock_hold(sk);
+		if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSING) {
+			sk->sk_socket->state = SS_UNCONNECTED;
+			sk->sk_state         = TCP_CLOSE;
+			sk->sk_state_change(sk);
+		}
+		sock_put(sk);
+		break;
+	case LLC_RESET_PRIM:
+		/*
+		 * FIXME:
+		 * RESET is not being notified to upper layers for now
+		 */
+		printk(KERN_INFO "%s: received a reset conf!\n", __func__);
+		break;
+	default:
+		if (ev->cfm_prim) {
+			printk(KERN_INFO "%s: received unknown %d prim!\n",
+					__func__, ev->cfm_prim);
+			break;
+		}
+		goto out_skb_put; /* No confirmation */
+	}
+out_kfree_skb:
+	kfree_skb(skb);
+out_skb_put:
+	kfree_skb(skb);
+	return rc;
+}
+
+void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
+{
+	/* append the PDU to the write queue, then flush that queue */
+	skb_queue_tail(&sk->sk_write_queue, skb);
+	llc_conn_send_pdus(sk);
+}
+
+/**
+ *	llc_conn_rtn_pdu - sends received data pdu to upper layer
+ *	@sk: Active connection
+ *	@skb: Received data frame
+ *
+ *	Marks the event carried by @skb with an LLC_DATA_PRIM indication.
+ *	Nothing is delivered here; llc_conn_state_process() acts on the
+ *	indication once the state machine has run. @sk is not used.
+ */
+void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	ev->ind_prim = LLC_DATA_PRIM;
+}
+
+/**
+ *	llc_conn_resend_i_pdu_as_cmd - resend all unacknowledged I PDUs
+ *	@sk: active connection
+ *	@nr: NR
+ *	@first_p_bit: p_bit value of first pdu
+ *
+ *	Resend all unacknowledged I PDUs, starting with the NR; send first as
+ *	command PDU with P bit equal first_p_bit; if more than one send
+ *	subsequent as command PDUs with P bit equal zero (0).
+ */
+void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit)
+{
+	struct sk_buff *skb;
+	struct llc_pdu_sn *pdu;
+	u16 nbr_unack_pdus;
+	struct llc_sock *llc;
+	u8 howmany_resend = 0;
+
+	llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus);
+	if (!nbr_unack_pdus)
+		goto out;
+	/*
+	 * Process unack PDUs only if unack queue is not empty; remove
+	 * appropriate PDUs, fix them up, and put them on sk_write_queue.
+	 */
+	llc = llc_sk(sk);
+
+	while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) {
+		pdu = llc_pdu_sn_hdr(skb);
+		llc_pdu_set_cmd_rsp(skb, LLC_PDU_CMD);
+		llc_pdu_set_pf_bit(skb, first_p_bit);
+		skb_queue_tail(&sk->sk_write_queue, skb);
+		first_p_bit = 0;	/* only the first PDU carries it */
+		llc->vS = LLC_I_GET_NS(pdu);
+		howmany_resend++;
+	}
+	if (howmany_resend > 0)	/* vS = N(S) of last resent PDU + 1 */
+		llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
+	/* any PDUs to re-send are queued up; start sending to MAC */
+	llc_conn_send_pdus(sk);
+out:;
+}
+
+/**
+ *	llc_conn_resend_i_pdu_as_rsp - Resend all unacknowledged I PDUs
+ *	@sk: active connection.
+ *	@nr: NR
+ *	@first_f_bit: f_bit value of first pdu.
+ *
+ *	Resend all unacknowledged I PDUs, starting with the NR; send first as
+ *	response PDU with F bit equal first_f_bit; if more than one send
+ *	subsequent as response PDUs with F bit equal zero (0).
+ */
+void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit)
+{
+	struct sk_buff *skb;
+	u16 nbr_unack_pdus;
+	struct llc_sock *llc = llc_sk(sk);
+	u8 howmany_resend = 0;
+
+	llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus);
+	if (!nbr_unack_pdus)
+		goto out;
+	/*
+	 * Process unack PDUs only if unack queue is not empty; remove
+	 * appropriate PDUs, fix them up, and put them on sk_write_queue
+	 */
+	while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) {
+		struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+		llc_pdu_set_cmd_rsp(skb, LLC_PDU_RSP);
+		llc_pdu_set_pf_bit(skb, first_f_bit);
+		skb_queue_tail(&sk->sk_write_queue, skb);
+		first_f_bit = 0;	/* only the first PDU carries it */
+		llc->vS = LLC_I_GET_NS(pdu);
+		howmany_resend++;
+	}
+	if (howmany_resend > 0)	/* vS = N(S) of last resent PDU + 1 */
+		llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
+	/* any PDUs to re-send are queued up; start sending to MAC */
+	llc_conn_send_pdus(sk);
+out:;
+}
+
+/**
+ *	llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue
+ *	@sk: active connection
+ *	@nr: NR
+ *	@how_many_unacked: size of pdu_unack_q after removing acked pdus
+ *
+ *	Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns
+ *	the number of pdus that were removed from the queue.
+ */
+int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked)
+{
+	int pdu_pos, i;
+	struct sk_buff *skb;
+	struct llc_pdu_sn *pdu;
+	int nbr_acked = 0;
+	struct llc_sock *llc = llc_sk(sk);
+	int q_len = skb_queue_len(&llc->pdu_unack_q);
+
+	if (!q_len)
+		goto out;
+	skb = skb_peek(&llc->pdu_unack_q);
+	pdu = llc_pdu_sn_hdr(skb);
+
+	/* distance (mod sequence space) from the oldest queued N(S) to nr */
+	pdu_pos = ((int)LLC_2_SEQ_NBR_MODULO + (int)nr -
+			(int)LLC_I_GET_NS(pdu)) % LLC_2_SEQ_NBR_MODULO;
+
+	for (i = 0; i < pdu_pos && i < q_len; i++) {
+		skb = skb_dequeue(&llc->pdu_unack_q);
+		kfree_skb(skb);
+		nbr_acked++;
+	}
+out:
+	*how_many_unacked = skb_queue_len(&llc->pdu_unack_q);
+	return nbr_acked;
+}
+
+/**
+ *	llc_conn_send_pdus - Sends queued PDUs
+ *	@sk: active connection
+ *
+ *	Dequeues every pdu on sk_write_queue and hands it to dev_queue_xmit().
+ */
+static void llc_conn_send_pdus(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+		if (LLC_PDU_TYPE_IS_I(pdu) &&
+		    !(skb->dev->flags & IFF_LOOPBACK)) {
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
+			skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb);
+			if (!skb2)
+				break;	/* no clone: PDU stays on pdu_unack_q */
+			skb = skb2;	/* transmit the clone instead */
+		}
+		dev_queue_xmit(skb);
+	}
+}
+
+/**
+ *	llc_conn_service - finds transition and changes state of connection
+ *	@sk: connection
+ *	@skb: happened event
+ *
+ *	Finds the transition matching the event, executes its actions and, if
+ *	they succeed, changes the connection state. Returns 0 for success (also
+ *	when no transition matches), non-zero for failure or an invalid state.
+ */
+static int llc_conn_service(struct sock *sk, struct sk_buff *skb)
+{
+	int rc = 1;
+	struct llc_sock *llc = llc_sk(sk);
+	struct llc_conn_state_trans *trans;
+
+	if (llc->state < LLC_CONN_STATE_ADM || llc->state > NBR_CONN_STATES)
+		goto out;	/* state - 1 would index outside the table */
+	rc = 0;
+	trans = llc_qualify_conn_ev(sk, skb);
+	if (trans) {
+		rc = llc_exec_conn_trans_actions(sk, trans, skb);
+		if (!rc && trans->next_state != NO_STATE_CHANGE) {
+			llc->state = trans->next_state;
+			if (!llc_data_accept_state(llc->state))
+				sk->sk_state_change(sk);
+		}
+	}
+out:
+	return rc;
+}
+
+/**
+ *	llc_qualify_conn_ev - finds transition for event
+ *	@sk: connection
+ *	@skb: happened event
+ *
+ *	This function finds transition that matches with happened event.
+ *	Returns pointer to found transition on success, %NULL otherwise.
+ */
+static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
+							struct sk_buff *skb)
+{
+	struct llc_conn_state_trans **next_trans;
+	llc_conn_ev_qfyr_t *next_qualifier;
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+	struct llc_sock *llc = llc_sk(sk);
+	struct llc_conn_state *curr_state =
+					&llc_conn_state_table[llc->state - 1];
+
+	/* walk this state's transitions for the event's category until a
+	 * transition qualifies or the terminating entry (NULL ->ev) is hit
+	 */
+	for (next_trans = curr_state->transitions +
+		llc_find_offset(llc->state - 1, ev->type);
+	     (*next_trans)->ev; next_trans++) {
+		if (!((*next_trans)->ev)(sk, skb)) {
+			/* got POSSIBLE event match; the event may require
+			 * qualification based on the values of a number of
+			 * state flags; if all qualifications are met (i.e.,
+			 * if all qualifying functions return success, or 0,
+			 * then this is THE event we're looking for
+			 */
+			for (next_qualifier = (*next_trans)->ev_qualifiers;
+			     next_qualifier && *next_qualifier &&
+			     !(*next_qualifier)(sk, skb); next_qualifier++)
+				/* nothing */;
+			if (!next_qualifier || !*next_qualifier)
+				/* all qualifiers executed successfully; this is
+				 * our transition; return it so we can perform
+				 * the associated actions & change the state
+				 */
+				return *next_trans;
+		}
+	}
+	return NULL;
+}
+
+/**
+ *	llc_exec_conn_trans_actions - executes related actions
+ *	@sk: connection
+ *	@trans: transition whose actions must be performed
+ *	@skb: event
+ *
+ *	Runs every action attached to @trans. Returns 0 on success, 1 if at
+ *	least one action failed, or 2 as soon as an action returns 2.
+ */
+static int llc_exec_conn_trans_actions(struct sock *sk,
+				       struct llc_conn_state_trans *trans,
+				       struct sk_buff *skb)
+{
+	llc_conn_action_t *action = trans->ev_actions;
+	int result = 0;
+
+	if (!action)
+		return 0;
+	for (; *action; action++) {
+		int status = (*action)(sk, skb);
+
+		if (status == 2)	/* stop right away, skip the rest */
+			return 2;
+		if (status)
+			result = 1;
+	}
+	return result;
+}
+
+static inline bool llc_estab_match(const struct llc_sap *sap,
+				   const struct llc_addr *daddr,
+				   const struct llc_addr *laddr,
+				   const struct sock *sk)
+{
+	struct llc_sock *llc = llc_sk(sk);	/* @sap is not consulted */
+
+	return llc->laddr.lsap == laddr->lsap &&
+		llc->daddr.lsap == daddr->lsap &&
+		llc_mac_match(llc->laddr.mac, laddr->mac) &&
+		llc_mac_match(llc->daddr.mac, daddr->mac);
+}
+
+/**
+ *	__llc_lookup_established - Finds connection for the remote/local sap/mac
+ *	@sap: SAP
+ *	@daddr: address of remote LLC (MAC + SAP)
+ *	@laddr: address of local LLC (MAC + SAP)
+ *
+ *	Searches the SAP's local-address hash chain for a connection matching
+ *	remote mac, remote sap, local mac and local sap. Returns the connection
+ *	with its refcount raised (release with sock_put()), %NULL otherwise.
+ *	Caller has to make sure local_bh is disabled.
+ */
+static struct sock *__llc_lookup_established(struct llc_sap *sap,
+					     struct llc_addr *daddr,
+					     struct llc_addr *laddr)
+{
+	struct sock *rc;
+	struct hlist_nulls_node *node;
+	int slot = llc_sk_laddr_hashfn(sap, laddr);
+	struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot];
+
+	rcu_read_lock();
+again:
+	sk_nulls_for_each_rcu(rc, node, laddr_hb) {
+		if (llc_estab_match(sap, daddr, laddr, rc)) {
+			/* Extra checks required by SLAB_DESTROY_BY_RCU */
+			if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
+				goto again;
+			if (unlikely(llc_sk(rc)->sap != sap ||
+				     !llc_estab_match(sap, daddr, laddr, rc))) {
+				sock_put(rc);
+				continue;
+			}
+			goto found;
+		}
+	}
+	rc = NULL;
+	/*
+	 * If the nulls value found at the end of the chain is not this
+	 * slot's, the walk strayed onto another chain (an item was probably
+	 * moved while we were traversing), so the lookup must be restarted.
+	 */
+	if (unlikely(get_nulls_value(node) != slot))
+		goto again;
+found:
+	rcu_read_unlock();
+	return rc;
+}
+
+struct sock *llc_lookup_established(struct llc_sap *sap,
+				    struct llc_addr *daddr,
+				    struct llc_addr *laddr)
+{
+	struct sock *sk;
+
+	local_bh_disable();	/* __llc_lookup_established() requires it */
+	sk = __llc_lookup_established(sap, daddr, laddr);
+	local_bh_enable();
+	return sk;	/* %NULL when no connection matched */
+}
+
+static inline bool llc_listener_match(const struct llc_sap *sap,
+				      const struct llc_addr *laddr,
+				      const struct sock *sk)
+{
+	struct llc_sock *llc = llc_sk(sk);	/* @sap is not consulted */
+
+	return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN &&
+		llc->laddr.lsap == laddr->lsap &&
+		llc_mac_match(llc->laddr.mac, laddr->mac);
+}
+
+static struct sock *__llc_lookup_listener(struct llc_sap *sap,
+					  struct llc_addr *laddr)
+{
+	struct sock *rc;
+	struct hlist_nulls_node *node;
+	int slot = llc_sk_laddr_hashfn(sap, laddr);
+	struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot];
+
+	rcu_read_lock();
+again:
+	sk_nulls_for_each_rcu(rc, node, laddr_hb) {
+		if (llc_listener_match(sap, laddr, rc)) {
+			/* Extra checks required by SLAB_DESTROY_BY_RCU */
+			if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
+				goto again;
+			if (unlikely(llc_sk(rc)->sap != sap ||
+				     !llc_listener_match(sap, laddr, rc))) {
+				sock_put(rc);
+				continue;
+			}
+			goto found;
+		}
+	}
+	rc = NULL;
+	/*
+	 * If the nulls value found at the end of the chain is not this
+	 * slot's, the walk strayed onto another chain (an item was probably
+	 * moved while we were traversing), so the lookup must be restarted.
+	 */
+	if (unlikely(get_nulls_value(node) != slot))
+		goto again;
+found:
+	rcu_read_unlock();
+	return rc;
+}
+
+/**
+ *	llc_lookup_listener - Finds listener for local MAC + SAP
+ *	@sap: SAP
+ *	@laddr: address of local LLC (MAC + SAP)
+ *
+ *	Looks for a socket listening on @laddr; if there is none, repeats the
+ *	lookup with an all-zero address. Returns the listening (parent)
+ *	socket found, %NULL otherwise.
+ *	Caller has to make sure local_bh is disabled.
+ */
+static struct sock *llc_lookup_listener(struct sock_placeholder_unused);
+
+static struct sock *__llc_lookup(struct llc_sap *sap,
+				 struct llc_addr *daddr,
+				 struct llc_addr *laddr)
+{
+	struct sock *conn = __llc_lookup_established(sap, daddr, laddr);
+
+	return conn ? conn : llc_lookup_listener(sap, laddr);
+}
+
+/**
+ *	llc_data_accept_state - tells whether data can be sent in this state
+ *	@state: state of connection.
+ *
+ *	Returns 0 in the NORMAL, BUSY and REJ states, 1 in any other state.
+ */
+u8 llc_data_accept_state(u8 state)
+{
+	return !(state == LLC_CONN_STATE_NORMAL ||
+		 state == LLC_CONN_STATE_BUSY || state == LLC_CONN_STATE_REJ);
+}
+
+/**
+ *	llc_find_next_offset - counts the transitions of one category
+ *	@state: state table.
+ *	@offset: start offset.
+ *
+ *	Counts the transitions of @state that begin at @offset, stopping at
+ *	the first entry whose ->ev is NULL. Returns that count.
+ */
+static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset)
+{
+	struct llc_conn_state_trans **trans = state->transitions + offset;
+	u16 nr_trans = 0;
+
+	/* the terminating entry itself is not counted */
+	while (trans[nr_trans]->ev)
+		nr_trans++;
+	return nr_trans;
+}
+
+/**
+ *	llc_build_offset_table - builds offset table of connection
+ *
+ *	Fills llc_offset_table: for every state, records where each event
+ *	category starts in that state's transition list.
+ */
+void __init llc_build_offset_table(void)
+{
+	struct llc_conn_state *curr_state;
+	int state, ev_type, next_offset;
+
+	for (state = 0; state < NBR_CONN_STATES; state++) {
+		curr_state = &llc_conn_state_table[state];
+		next_offset = 0;
+		for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) {
+			llc_offset_table[state][ev_type] = next_offset;
+			next_offset += llc_find_next_offset(curr_state,
+							    next_offset) + 1;
+		}
+	}
+}
+
+/**
+ *	llc_find_offset - finds start offset of category of transitions
+ *	@state: state of connection
+ *	@ev_type: type of happened event
+ *
+ *	Returns the start offset of the transition category that matches
+ *	@ev_type in @state (zero-based); unmapped event types yield 0.
+ */
+static int llc_find_offset(int state, int ev_type)
+{
+	int rc = 0;
+	/* llc_offset_table[..][2] is never selected here: that category is
+	 * init_pf_cycle, for which no event type is mapped below.
+	 */
+	switch (ev_type) {
+	case LLC_CONN_EV_TYPE_PRIM:
+		rc = llc_offset_table[state][0]; break;
+	case LLC_CONN_EV_TYPE_PDU:
+		rc = llc_offset_table[state][4]; break;
+	case LLC_CONN_EV_TYPE_SIMPLE:
+		rc = llc_offset_table[state][1]; break;
+	case LLC_CONN_EV_TYPE_P_TMR:
+	case LLC_CONN_EV_TYPE_ACK_TMR:
+	case LLC_CONN_EV_TYPE_REJ_TMR:
+	case LLC_CONN_EV_TYPE_BUSY_TMR:
+		rc = llc_offset_table[state][3]; break;
+	}
+	return rc;
+}
+
+/**
+ *	llc_sap_add_socket - adds a socket to a SAP
+ *	@sap: SAP
+ *	@sk: socket
+ *
+ *	Holds @sap, then hashes @sk by local address and by device ifindex.
+ */
+void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct hlist_head *dev_hb = llc_sk_dev_hash(sap, llc->dev->ifindex);
+	struct hlist_nulls_head *laddr_hb = llc_sk_laddr_hash(sap, &llc->laddr);
+
+	llc_sap_hold(sap);	/* dropped in llc_sap_remove_socket() */
+	llc_sk(sk)->sap = sap;
+
+	spin_lock_bh(&sap->sk_lock);
+	sap->sk_count++;
+	sk_nulls_add_node_rcu(sk, laddr_hb);
+	hlist_add_head(&llc->dev_hash_node, dev_hb);
+	spin_unlock_bh(&sap->sk_lock);
+}
+
+/**
+ *	llc_sap_remove_socket - removes a socket from SAP
+ *	@sap: SAP
+ *	@sk: socket
+ *
+ *	Unhashes @sk from both hash tables of @sap under sap->sk_lock and
+ *	then drops one reference on @sap with llc_sap_put().
+ */
+void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	spin_lock_bh(&sap->sk_lock);
+	sk_nulls_del_node_init_rcu(sk);
+	hlist_del(&llc->dev_hash_node);
+	sap->sk_count--;
+	spin_unlock_bh(&sap->sk_lock);
+	llc_sap_put(sap);
+}
+
+/**
+ *	llc_conn_rcv - sends received pdus to the connection state machine
+ *	@sk: current connection structure.
+ *	@skb: received frame.
+ *
+ *	Tags the event as a PDU event and returns llc_conn_state_process().
+ */
+static int llc_conn_rcv(struct sock* sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	ev->type   = LLC_CONN_EV_TYPE_PDU;
+	ev->reason = 0;
+	return llc_conn_state_process(sk, skb);
+}
+
+static struct sock *llc_create_incoming_sock(struct sock *sk,
+					     struct net_device *dev,
+					     struct llc_addr *saddr,
+					     struct llc_addr *daddr)
+{
+	struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC,
+					  sk->sk_prot);
+	struct llc_sock *newllc, *llc = llc_sk(sk);
+
+	if (!newsk)
+		goto out;	/* allocation failed: %NULL is returned */
+	newllc = llc_sk(newsk);
+	memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr));
+	memcpy(&newllc->daddr, saddr, sizeof(newllc->daddr));
+	newllc->dev = dev;
+	dev_hold(dev);		/* reference for newllc->dev */
+	llc_sap_add_socket(llc->sap, newsk);
+	llc_sap_hold(llc->sap);	/* on top of the hold in llc_sap_add_socket() */
+out:
+	return newsk;
+}
+
+void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_addr saddr, daddr;
+	struct sock *sk;
+
+	llc_pdu_decode_sa(skb, saddr.mac);
+	llc_pdu_decode_ssap(skb, &saddr.lsap);
+	llc_pdu_decode_da(skb, daddr.mac);
+	llc_pdu_decode_dsap(skb, &daddr.lsap);
+
+	sk = __llc_lookup(sap, &saddr, &daddr);
+	if (!sk)
+		goto drop;
+
+	bh_lock_sock(sk);
+	/*
+	 * This has to be done here and not at the upper layer ->accept
+	 * method because of the way the PROCOM state machine works:
+	 * it needs to set several state variables (see, for instance,
+	 * llc_adm_actions_2 in net/llc/llc_c_st.c) and send a packet to
+	 * the originator of the new connection, and this state has to be
+	 * in the newly created struct sock private area. -acme
+	 */
+	if (unlikely(sk->sk_state == TCP_LISTEN)) {
+		struct sock *newsk = llc_create_incoming_sock(sk, skb->dev,
+							      &saddr, &daddr);
+		if (!newsk)
+			goto drop_unlock;
+		skb_set_owner_r(skb, newsk);
+	} else {
+		/*
+		 * Can't be skb_set_owner_r, this will be done at the
+		 * llc_conn_state_process function, later on, when we will use
+		 * sock_queue_rcv_skb to send it to upper layers, this is
+		 * another trick required to cope with how the PROCOM state
+		 * machine works. -acme
+		 */
+		skb->sk = sk;
+	}
+	if (!sock_owned_by_user(sk))
+		llc_conn_rcv(sk, skb);
+	else {
+		dprintk("%s: adding to backlog...\n", __func__);
+		llc_set_backlog_type(skb, LLC_PACKET);
+		if (sk_add_backlog(sk, skb))
+			goto drop_unlock;
+	}
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);	/* drop the reference taken by the lookup */
+	return;
+drop:
+	kfree_skb(skb);
+	return;
+drop_unlock:
+	kfree_skb(skb);
+	goto out;
+}
+
+#undef LLC_REFCNT_DEBUG
+#ifdef LLC_REFCNT_DEBUG
+static atomic_t llc_sock_nr;
+#endif
+
+/**
+ *	llc_backlog_rcv - Processes rx frames and expired timers.
+ *	@sk: LLC sock (p8022 connection)
+ *	@skb: queued rx frame or event
+ *
+ *	This function processes frames that were received and timers that
+ *	expired while an I pdu was being sent (refer to data_req_handler).
+ *	Frames are queued by the llc_rcv function (llc_mac.c) and timers by
+ *	the timer callback functions (llc_c_ac.c).
+ */
+static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int rc = 0;
+	struct llc_sock *llc = llc_sk(sk);
+
+	if (likely(llc_backlog_type(skb) == LLC_PACKET)) {
+		if (likely(llc->state > 1)) /* not closed */
+			rc = llc_conn_rcv(sk, skb);
+		else
+			goto out_kfree_skb;
+	} else if (llc_backlog_type(skb) == LLC_EVENT) {
+		/* timer expiration event */
+		if (likely(llc->state > 1))  /* not closed */
+			rc = llc_conn_state_process(sk, skb);
+		else
+			goto out_kfree_skb;
+	} else {
+		printk(KERN_ERR "%s: invalid skb in backlog\n", __func__);
+		goto out_kfree_skb;
+	}
+out:
+	return rc;
+out_kfree_skb:
+	kfree_skb(skb);	/* not processed: free it and return 0 */
+	goto out;
+}
+
+/**
+ *     llc_sk_init - Initializes a socket with default llc values.
+ *     @sk: socket to initialize.
+ *
+ *     Sets the ADM state, counters, the four timers and the backlog handler.
+ */
+static void llc_sk_init(struct sock* sk)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc->state    = LLC_CONN_STATE_ADM;
+	llc->inc_cntr = llc->dec_cntr = 2;
+	llc->dec_step = llc->connect_step = 1;
+
+	setup_timer(&llc->ack_timer.timer, llc_conn_ack_tmr_cb,
+			(unsigned long)sk);
+	llc->ack_timer.expire	      = sysctl_llc2_ack_timeout;
+
+	setup_timer(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb,
+			(unsigned long)sk);
+	llc->pf_cycle_timer.expire	   = sysctl_llc2_p_timeout;
+
+	setup_timer(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb,
+			(unsigned long)sk);
+	llc->rej_sent_timer.expire	   = sysctl_llc2_rej_timeout;
+
+	setup_timer(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb,
+			(unsigned long)sk);
+	llc->busy_state_timer.expire	     = sysctl_llc2_busy_timeout;
+
+	llc->n2 = 2;   /* max retransmit */
+	llc->k  = 2;   /* tx win size, adjusted dynamically */
+	llc->rw = 128; /* rx win size (opt and equal to
+			* tx_win of remote LLC) */
+	skb_queue_head_init(&llc->pdu_unack_q);
+	sk->sk_backlog_rcv = llc_backlog_rcv;
+}
+
+/**
+ *	llc_sk_alloc - Allocates LLC sock
+ *	@family: upper layer protocol family
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Allocates an LLC sock via sk_alloc(@net, @family, @priority, @prot) and
+ *	initializes it. Returns the new sock, or %NULL if allocation failed.
+ */
+struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot)
+{
+	struct sock *sk = sk_alloc(net, family, priority, prot);
+
+	if (!sk)
+		goto out;
+	llc_sk_init(sk);
+	sock_init_data(NULL, sk);
+#ifdef LLC_REFCNT_DEBUG
+	atomic_inc(&llc_sock_nr);
+	printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk,
+		__func__, atomic_read(&llc_sock_nr));
+#endif
+out:
+	return sk;
+}
+
+/**
+ *	llc_sk_free - Frees a LLC socket
+ *	@sk: socket to free
+ *
+ *	Stops timers, purges all queues and drops one reference on @sk.
+ */
+void llc_sk_free(struct sock *sk)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc->state = LLC_CONN_OUT_OF_SVC;
+	/* Stop all (possibly) running timers */
+	llc_conn_ac_stop_all_timers(sk, NULL);
+#ifdef DEBUG_LLC_CONN_ALLOC
+	printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__,
+		skb_queue_len(&llc->pdu_unack_q),
+		skb_queue_len(&sk->sk_write_queue));
+#endif
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+	skb_queue_purge(&llc->pdu_unack_q);
+#ifdef LLC_REFCNT_DEBUG
+	if (atomic_read(&sk->sk_refcnt) != 1) {
+		printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n",
+			sk, __func__, atomic_read(&sk->sk_refcnt));
+		printk(KERN_DEBUG "%d LLC sockets are still alive\n",
+			atomic_read(&llc_sock_nr));
+	} else {
+		atomic_dec(&llc_sock_nr);
+		printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk,
+			__func__, atomic_read(&llc_sock_nr));
+	}
+#endif
+	sock_put(sk);
+}
+
+/**
+ *	llc_sk_reset - resets a connection
+ *	@sk: LLC socket to reset
+ *
+ *	Stops the connection's timers, purges its write and unacked queues
+ *	and clears its flags and counters; llc->state itself is not changed.
+ */
+void llc_sk_reset(struct sock *sk)
+{
+	struct llc_sock *llc = llc_sk(sk);
+
+	llc_conn_ac_stop_all_timers(sk, NULL);
+	skb_queue_purge(&sk->sk_write_queue);
+	skb_queue_purge(&llc->pdu_unack_q);
+	llc->remote_busy_flag	= 0;
+	llc->cause_flag		= 0;
+	llc->retry_count	= 0;
+	llc_conn_set_p_flag(sk, 0);
+	llc->f_flag		= 0;
+	llc->s_flag		= 0;
+	llc->ack_pf		= 0;
+	llc->first_pdu_Ns	= 0;
+	llc->ack_must_be_send	= 0;
+	llc->dec_step		= 1;	/* same defaults as llc_sk_init() */
+	llc->inc_cntr		= 2;
+	llc->dec_cntr		= 2;
+	llc->X			= 0;
+	llc->failed_data_req	= 0 ;
+	llc->last_nr		= 0;
+}
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
new file mode 100644
index 00000000..2bb0ddff
--- /dev/null
+++ b/net/llc/llc_core.c
@@ -0,0 +1,169 @@
+/*
+ * llc_core.c - Minimum needed routines for sap handling and module init/exit
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+
+LIST_HEAD(llc_sap_list);
+DEFINE_SPINLOCK(llc_sap_list_lock);
+
+/**
+ *	llc_sap_alloc - allocates and initializes sap.
+ *
+ *	Returns a zeroed, active sap with refcount 1, or %NULL on failure.
+ */
+static struct llc_sap *llc_sap_alloc(void)
+{
+	struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC);
+	int i;
+
+	if (sap) {
+		/* sap->laddr.mac - leave as a null, it's filled by bind */
+		sap->state = LLC_SAP_STATE_ACTIVE;
+		spin_lock_init(&sap->sk_lock);
+		for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++)
+			INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i);
+		atomic_set(&sap->refcnt, 1);
+	}
+	return sap;
+}
+
+static struct llc_sap *__llc_sap_find(unsigned char sap_value)
+{
+	struct llc_sap *sap;	/* no ref taken; caller holds RCU or list lock */
+
+	list_for_each_entry_rcu(sap, &llc_sap_list, node)
+		if (sap->laddr.lsap == sap_value)
+			goto out;
+	sap = NULL;
+out:
+	return sap;
+}
+
+/**
+ *	llc_sap_find - searches a SAP in station
+ *	@sap_value: sap to be found
+ *
+ *	Searches for a sap in the sap list of the LLC's station upon the sap ID.
+ *	If the sap is found it will be refcounted and the user will have to do
+ *	a llc_sap_put after use. A sap whose refcount already dropped to zero
+ *	is not handed out. Returns the sap or %NULL if not found.
+ */
+struct llc_sap *llc_sap_find(unsigned char sap_value)
+{
+	struct llc_sap *sap;
+
+	rcu_read_lock_bh();
+	sap = __llc_sap_find(sap_value);
+	if (sap && !atomic_inc_not_zero(&sap->refcnt))
+		sap = NULL;	/* refcount hit zero: sap is going away */
+	rcu_read_unlock_bh();
+	return sap;
+}
+
+/**
+ *	llc_sap_open - open interface to the upper layers.
+ *	@lsap: SAP number.
+ *	@func: rcv func for datalink protos
+ *
+ *	Interface function to upper layer. Each one who wants to get a SAP
+ *	(for example NetBEUI) should call this function. Returns the opened
+ *	SAP, or %NULL if @lsap is already open or allocation failed.
+ */
+struct llc_sap *llc_sap_open(unsigned char lsap,
+			     int (*func)(struct sk_buff *skb,
+					 struct net_device *dev,
+					 struct packet_type *pt,
+					 struct net_device *orig_dev))
+{
+	struct llc_sap *sap = NULL;
+
+	spin_lock_bh(&llc_sap_list_lock);
+	if (__llc_sap_find(lsap)) /* SAP already exists */
+		goto out;
+	sap = llc_sap_alloc();
+	if (!sap)
+		goto out;
+	sap->laddr.lsap = lsap;
+	sap->rcv_func	= func;
+	list_add_tail_rcu(&sap->node, &llc_sap_list);
+out:
+	spin_unlock_bh(&llc_sap_list_lock);
+	return sap;
+}
+
+/**
+ *	llc_sap_close - close interface for upper layers.
+ *	@sap: SAP to be closed.
+ *
+ *	Close interface for upper layers (for example NetBEUI). Warns if
+ *	sockets are still attached, unlinks the sap from llc_sap_list, waits
+ *	with synchronize_rcu() and frees it. NOTE(review): llc_sap_find() reads
+ *	under rcu_read_lock_bh(); confirm that grace period covers it.
+ */
+void llc_sap_close(struct llc_sap *sap)
+{
+	WARN_ON(sap->sk_count);
+
+	spin_lock_bh(&llc_sap_list_lock);
+	list_del_rcu(&sap->node);
+	spin_unlock_bh(&llc_sap_list_lock);
+
+	synchronize_rcu();
+
+	kfree(sap);
+}
+
+static struct packet_type llc_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_802_2),
+	.func = llc_rcv,	/* receive handler for this packet type */
+};
+
+static struct packet_type llc_tr_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_TR_802_2),
+	.func = llc_rcv,	/* same handler as llc_packet_type */
+};
+
+static int __init llc_init(void)
+{
+	dev_add_pack(&llc_packet_type);		/* undone in llc_exit() */
+	dev_add_pack(&llc_tr_packet_type);
+	return 0;
+}
+
+static void __exit llc_exit(void)
+{
+	dev_remove_pack(&llc_packet_type);	/* added in llc_init() */
+	dev_remove_pack(&llc_tr_packet_type);
+}
+
+module_init(llc_init);
+module_exit(llc_exit);
+
+EXPORT_SYMBOL(llc_sap_list);
+EXPORT_SYMBOL(llc_sap_list_lock);
+EXPORT_SYMBOL(llc_sap_find);
+EXPORT_SYMBOL(llc_sap_open);
+EXPORT_SYMBOL(llc_sap_close);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003");
+MODULE_DESCRIPTION("LLC IEEE 802.2 core support");
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
new file mode 100644
index 00000000..25c31c0a
--- /dev/null
+++ b/net/llc/llc_if.c
@@ -0,0 +1,154 @@
+/*
+ * llc_if.c - Defines LLC interface to upper layer
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <asm/errno.h>
+#include <net/llc_if.h>
+#include <net/llc_sap.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_conn.h>
+#include <net/sock.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_st.h>
+#include <net/tcp_states.h>
+
+/**
+ *	llc_build_and_send_pkt - Connection data sending for upper layers.
+ *	@sk: connection
+ *	@skb: packet to send
+ *
+ *	This function is called when upper layer wants to send data using
+ *	connection oriented communication mode. Returns -ECONNABORTED when
+ *	the connection is in LLC_CONN_STATE_ADM, and -EBUSY (after setting
+ *	llc->failed_data_req) when llc_data_accept_state() refuses the state
+ *	or llc->p_flag is set, i.e. LLC has sent an I pdu with the p bit set
+ *	and is waiting for its response. Otherwise @skb is tagged as a DATA
+ *	request primitive and the llc_conn_state_process() result is returned.
+ */
+int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct llc_conn_state_ev *event;
+
+	if (unlikely(llc->state == LLC_CONN_STATE_ADM))
+		return -ECONNABORTED;
+
+	/* data_conn_refuse, or a poll is still waiting for its response */
+	if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) {
+		llc->failed_data_req = 1;
+		return -EBUSY;
+	}
+
+	event = llc_conn_ev(skb);
+	event->type      = LLC_CONN_EV_TYPE_PRIM;
+	event->prim      = LLC_DATA_PRIM;
+	event->prim_type = LLC_PRIM_TYPE_REQ;
+
+	/* the frame goes out through the connection's own device */
+	skb->dev = llc->dev;
+	return llc_conn_state_process(sk, skb);
+}
+
+/**
+ *	llc_establish_connection - Called by upper layer to establish a conn
+ *	@sk: connection
+ *	@lmac: local mac address
+ *	@dmac: destination mac address
+ *	@dsap: destination sap
+ *
+ *	Upper layer calls this to establish an LLC connection with a remote
+ *	machine. Returns -EISCONN if llc_lookup_established() finds a socket
+ *	in TCP_ESTABLISHED state for this address pair and -ENOMEM if no skb
+ *	can be allocated. Otherwise a CONN request primitive is handed to
+ *	llc_conn_state_process() and its result is returned.
+ */
+int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap)
+{
+	struct llc_sock *llc = llc_sk(sk);
+	struct llc_addr laddr, daddr;
+	struct llc_conn_state_ev *ev;
+	struct sock *existing;
+	struct sk_buff *skb;
+	int rc;
+
+	laddr.lsap = llc->sap->laddr.lsap;
+	daddr.lsap = dsap;
+	memcpy(daddr.mac, dmac, sizeof(daddr.mac));
+	memcpy(laddr.mac, lmac, sizeof(laddr.mac));
+	existing = llc_lookup_established(llc->sap, &daddr, &laddr);
+	if (existing) {
+		bool established = existing->sk_state == TCP_ESTABLISHED;
+
+		sock_put(existing);	/* released in either case */
+		if (established)
+			return -EISCONN;
+	}
+	sock_hold(sk);
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb) {
+		rc = -ENOMEM;
+	} else {
+		ev = llc_conn_ev(skb);
+		ev->type      = LLC_CONN_EV_TYPE_PRIM;
+		ev->prim      = LLC_CONN_PRIM;
+		ev->prim_type = LLC_PRIM_TYPE_REQ;
+		skb_set_owner_w(skb, sk);
+		rc = llc_conn_state_process(sk, skb);
+	}
+	sock_put(sk);
+	return rc;
+}
+
+/**
+ *	llc_send_disc - Called by upper layer to close a connection
+ *	@sk: connection to be closed
+ *
+ *	Upper layer calls this when it wants to close an established LLC
+ *	connection with a remote machine. This function packages a proper event
+ *	and sends it to connection component state machine. Returns 0 for
+ *	success, non-zero otherwise.
+ */
+int llc_send_disc(struct sock *sk)
+{
+	int rc = 1;	/* int: keeps a negative errno from below intact */
+	struct llc_conn_state_ev *ev;
+	struct sk_buff *skb;
+
+	sock_hold(sk);
+	if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_ESTABLISHED ||
+	    llc_sk(sk)->state == LLC_CONN_STATE_ADM ||
+	    llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC)
+		goto out;
+	/*
+	 * Postpone unassigning the connection from its SAP and returning the
+	 * connection until all ACTIONs have been completely executed
+	 */
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb)
+		goto out;
+	skb_set_owner_w(skb, sk);
+	sk->sk_state  = TCP_CLOSING;
+	ev	      = llc_conn_ev(skb);
+	ev->type      = LLC_CONN_EV_TYPE_PRIM;
+	ev->prim      = LLC_DISC_PRIM;
+	ev->prim_type = LLC_PRIM_TYPE_REQ;
+	rc = llc_conn_state_process(sk, skb);
+out:
+	sock_put(sk);
+	return rc;
+}
+
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
new file mode 100644
index 00000000..90324211
--- /dev/null
+++ b/net/llc/llc_input.c
@@ -0,0 +1,212 @@
+/*
+ * llc_input.c - Minimal input path for LLC
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/llc_sap.h>
+
+#if 0
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+/*
+ * Packet handler for the station, registerable because in the minimal
+ * LLC core that is taking shape only the very minimal subset of LLC that
+ * is needed for things like IPX, Appletalk, etc will stay, with all the
+ * rest in the llc1 and llc2 modules.
+ */
+static void (*llc_station_handler)(struct sk_buff *skb);
+
+/*
+ * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN.
+ */
+static void (*llc_type_handlers[2])(struct llc_sap *sap,
+				    struct sk_buff *skb);
+
+void llc_add_pack(int type, void (*handler)(struct llc_sap *sap,
+					    struct sk_buff *skb))
+{
+	if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) /* others ignored */
+		llc_type_handlers[type - 1] = handler;
+}
+
+void llc_remove_pack(int type)
+{
+	if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) /* others ignored */
+		llc_type_handlers[type - 1] = NULL;
+}
+
+void llc_set_station_handler(void (*handler)(struct sk_buff *skb))
+{
+	llc_station_handler = handler;	/* NULL: llc_rcv() drops such frames */
+}
+
+/**
+ *	llc_pdu_type - returns which LLC component must handle for PDU
+ *	@skb: input skb
+ *
+ *	Returns LLC_DEST_SAP, LLC_DEST_CONN or LLC_DEST_INVALID for this PDU.
+ */
+static __inline__ int llc_pdu_type(struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	/* I-PDUs and S-PDUs always go to LLC_DEST_CONN */
+	if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U)
+		return LLC_DEST_CONN;
+
+	switch (LLC_U_PDU_CMD(pdu)) {
+	/* the LLC_1 commands are handled by the SAP component */
+	case LLC_1_PDU_CMD_XID:
+	case LLC_1_PDU_CMD_UI:
+	case LLC_1_PDU_CMD_TEST:
+		return LLC_DEST_SAP;
+	/* the LLC_2 U PDUs are handled by the connection component */
+	case LLC_2_PDU_CMD_SABME:
+	case LLC_2_PDU_CMD_DISC:
+	case LLC_2_PDU_RSP_UA:
+	case LLC_2_PDU_RSP_DM:
+	case LLC_2_PDU_RSP_FRMR:
+		return LLC_DEST_CONN;
+	/* any other U PDU command is reported as invalid */
+	default:
+		return LLC_DEST_INVALID;
+	}
+}
+
+/**
+ *	llc_fixup_skb - initializes skb pointers
+ *	@skb: This argument points to incoming skb
+ *
+ *	Pulls the LLC header, whose length is derived from the PDU type in the
+ *	first control byte: 3 bytes for U PDUs, 4 bytes otherwise. ETH_P_802_2
+ *	frames are also trimmed to the length found in the ethernet header.
+ *	Returns 1 on success and 0 if the frame is too short or trimming fails.
+ */
+static inline int llc_fixup_skb(struct sk_buff *skb)
+{
+	u8 llc_len = 2;
+	struct llc_pdu_un *pdu;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*pdu))))
+		return 0;
+
+	pdu = (struct llc_pdu_un *)skb->data;
+	if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U)
+		llc_len = 1;
+	llc_len += 2;
+
+	if (unlikely(!pskb_may_pull(skb, llc_len)))
+		return 0;
+
+	skb->transport_header += llc_len;
+	skb_pull(skb, llc_len);
+	if (skb->protocol == htons(ETH_P_802_2)) {
+		__be16 pdulen = eth_hdr(skb)->h_proto;
+		s32 data_size = ntohs(pdulen) - llc_len;
+
+		if (data_size < 0 ||
+		    !pskb_may_pull(skb, data_size))
+			return 0;
+		if (unlikely(pskb_trim_rcsum(skb, data_size)))
+			return 0;
+	}
+	return 1;
+}
+
+/**
+ *	llc_rcv - 802.2 entry point from net lower layers
+ *	@skb: received pdu
+ *	@dev: device that receive pdu
+ *	@pt: packet type
+ *	@orig_dev: passed through unchanged to the SAP's rcv_func
+ *
+ *	When the system receives a 802.2 frame this function is called. A
+ *	frame with a NULL DSAP goes to the station handler; otherwise the SAP
+ *	is looked up and the frame is given to its rcv_func and/or the handler
+ *	registered with llc_add_pack() for the PDU type. Always returns 0.
+ */
+int llc_rcv(struct sk_buff *skb, struct net_device *dev,
+	    struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct llc_sap *sap;
+	struct llc_pdu_sn *pdu;
+	int dest;
+	int (*rcv)(struct sk_buff *, struct net_device *,
+		   struct packet_type *, struct net_device *);
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	/*
+	 * Frames addressed to other hosts, as seen when the interface is in
+	 * promiscuous mode, are dropped without any further analysis.
+	 */
+	if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
+		dprintk("%s: PACKET_OTHERHOST\n", __func__);
+		goto drop;
+	}
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		goto out;	/* skb is NULL here, nothing to free */
+	if (unlikely(!llc_fixup_skb(skb)))
+		goto drop;
+	pdu = llc_pdu_sn_hdr(skb);
+	if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */
+	       goto handle_station;
+	sap = llc_sap_find(pdu->dsap);
+	if (unlikely(!sap)) {/* unknown SAP */
+		dprintk("%s: llc_sap_find(%02X) failed!\n", __func__,
+			pdu->dsap);
+		goto drop;
+	}
+	/*
+	 * First the upper layer protocols that don't need the full
+	 * LLC functionality
+	 */
+	rcv = rcu_dereference(sap->rcv_func);
+	dest = llc_pdu_type(skb);
+	if (unlikely(!dest || !llc_type_handlers[dest - 1])) {
+		if (rcv)
+			rcv(skb, dev, pt, orig_dev);
+		else
+			kfree_skb(skb);
+	} else {
+		if (rcv) {
+			struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
+			if (cskb)
+				rcv(cskb, dev, pt, orig_dev);
+		}
+		llc_type_handlers[dest - 1](sap, skb);
+	}
+	llc_sap_put(sap);	/* presumably pairs with llc_sap_find() */
+out:
+	return 0;
+drop:
+	kfree_skb(skb);
+	goto out;
+handle_station:
+	if (!llc_station_handler)
+		goto drop;
+	llc_station_handler(skb);
+	goto out;
+}
+
+EXPORT_SYMBOL(llc_add_pack);
+EXPORT_SYMBOL(llc_remove_pack);
+EXPORT_SYMBOL(llc_set_station_handler);
diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
new file mode 100644
index 00000000..b38a1079
--- /dev/null
+++ b/net/llc/llc_output.c
@@ -0,0 +1,81 @@
+/*
+ * llc_output.c - LLC minimal output path
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License version 2 for more details.
+ */
+
+#include <linux/if_arp.h>
+#include <linux/if_tr.h>
+#include <linux/netdevice.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+
+/**
+ *	llc_mac_hdr_init - fills MAC header fields
+ *	@skb: Address of the frame to initialize its MAC header
+ *	@sa: The MAC source address
+ *	@da: The MAC destination address
+ *
+ *	Fills MAC header fields via dev_hard_header(). Returns 0 on a positive
+ *	result, the result itself if <= 0, -EINVAL for unsupported dev types.
+ */
+int llc_mac_hdr_init(struct sk_buff *skb,
+		     const unsigned char *sa, const unsigned char *da)
+{
+	int hdr_rc;
+
+	switch (skb->dev->type) {
+	case ARPHRD_IEEE802_TR:
+	case ARPHRD_ETHER:
+	case ARPHRD_LOOPBACK:
+		hdr_rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa,
+					 skb->len);
+		/* any positive result is reported to the caller as 0 */
+		return hdr_rc > 0 ? 0 : hdr_rc;
+	default:
+		WARN(1, "device type not supported: %d\n", skb->dev->type);
+		break;
+	}
+	return -EINVAL;
+}
+
+/**
+ *	llc_build_and_send_ui_pkt - unitdata request interface for upper layers
+ *	@sap: sap to use
+ *	@skb: packet to send
+ *	@dmac: destination mac address
+ *	@dsap: destination sap
+ *
+ *	Upper layers call this function to send data using connection-less
+ *	mode communication (UI pdu). Builds the LLC UI command header and the
+ *	MAC header on @skb and queues it with dev_queue_xmit(); returns its
+ *	result, or the llc_mac_hdr_init() error.
+ *	NOTE(review): @skb is not freed here when llc_mac_hdr_init() fails;
+ *	confirm that callers release it on error.
+ */
+int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
+			      unsigned char *dmac, unsigned char dsap)
+{
+	int err;
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap,
+			    dsap, LLC_PDU_CMD);
+	llc_pdu_init_as_ui_cmd(skb);
+	err = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);
+	if (unlikely(err))
+		return err;
+	return dev_queue_xmit(skb);
+}
+
+EXPORT_SYMBOL(llc_mac_hdr_init);
+EXPORT_SYMBOL(llc_build_and_send_ui_pkt);
diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c
new file mode 100644
index 00000000..2e6cb791
--- /dev/null
+++ b/net/llc/llc_pdu.c
@@ -0,0 +1,372 @@
+/*
+ * llc_pdu.c - access to PDU internals
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/netdevice.h>
+#include <net/llc_pdu.h>
+
+static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type);
+static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu);
+
+void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type)
+{
+	llc_pdu_un_hdr(skb)->ssap |= pdu_type;	/* OR-ed into the SSAP byte */
+}
+
+/**
+ *	llc_pdu_set_pf_bit - sets poll/final bit in LLC header
+ *	@skb: input frame that p/f bit must be set into it.
+ *	@bit_value: poll/final bit (0 or 1).
+ *
+ *	Sets or clears the poll/final bit in the LLC header, based on the PDU
+ *	type: in I or S pdus it is the lowest bit of the fourth header byte,
+ *	in U pdus the fifth bit of the third byte. Other bits are preserved.
+ */
+void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value)
+{
+	u8 pdu_type;
+	struct llc_pdu_sn *pdu;
+
+	llc_pdu_decode_pdu_type(skb, &pdu_type);
+	pdu = llc_pdu_sn_hdr(skb);
+	/* each case rewrites only the p/f bit, so it can also be cleared */
+	switch (pdu_type) {
+	case LLC_PDU_TYPE_I:
+	case LLC_PDU_TYPE_S:
+		pdu->ctrl_2 = (pdu->ctrl_2 & 0xFE) | bit_value;
+		break;
+	case LLC_PDU_TYPE_U:
+		pdu->ctrl_1 = (pdu->ctrl_1 & 0xEF) | (bit_value << 4);
+		break;
+	}
+}
+
+/**
+ *	llc_pdu_decode_pf_bit - extracts poll/final bit from LLC header
+ *	@skb: input skb that p/f bit must be extracted from it
+ *	@pf_bit: poll/final bit (0 or 1)
+ *
+ *	Stores the poll/final bit of the LLC header in @pf_bit, based on the
+ *	PDU type: in I or S pdus it is read from ctrl_2 with LLC_S_PF_BIT_MASK,
+ *	in U pdus it is bit 4 of ctrl_1 (LLC_U_PF_BIT_MASK), shifted down.
+ */
+void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit)
+{
+	struct llc_pdu_sn *hdr;
+	u8 kind;
+
+	llc_pdu_decode_pdu_type(skb, &kind);
+	hdr = llc_pdu_sn_hdr(skb);
+
+	switch (kind) {
+	case LLC_PDU_TYPE_U:
+		*pf_bit = (hdr->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4;
+		break;
+	case LLC_PDU_TYPE_I:
+	case LLC_PDU_TYPE_S:
+		*pf_bit = hdr->ctrl_2 & LLC_S_PF_BIT_MASK;
+		break;
+	}
+}
+
+/**
+ *	llc_pdu_init_as_disc_cmd - Builds DISC PDU
+ *	@skb: Address of the skb to build
+ *	@p_bit: The P bit to set in the PDU
+ *
+ *	Builds a pdu frame as a DISC command.
+ */
+void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit)
+{
+	u8 poll = ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+	u8 ctrl = LLC_PDU_TYPE_U | LLC_2_PDU_CMD_DISC | poll;
+
+	/* a U PDU keeps type, command and P bit in ctrl_1 alone */
+	llc_pdu_un_hdr(skb)->ctrl_1 = ctrl;
+}
+
+/**
+ *	llc_pdu_init_as_i_cmd - builds I pdu
+ *	@skb: Address of the skb to build
+ *	@p_bit: The P bit to set in the PDU
+ *	@ns: The sequence number of the data PDU
+ *	@nr: The seq. number of the expected I PDU from the remote
+ *
+ *	Builds a pdu frame as an I command.
+ */
+void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* first control byte: PDU type plus N(S) in bits 2..8 */
+	hdr->ctrl_1 = LLC_PDU_TYPE_I | ((ns << 1) & 0xFE);
+	/* second control byte: p/f bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (p_bit & LLC_I_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_rej_cmd - builds REJ PDU
+ *	@skb: Address of the skb to build
+ *	@p_bit: The P bit to set in the PDU
+ *	@nr: The seq. number of the expected I PDU from the remote
+ *
+ *	Builds a pdu frame as a REJ command.
+ */
+void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* bits 5..8 of the first control byte are reserved, keep them zero */
+	hdr->ctrl_1 = (LLC_PDU_TYPE_S | LLC_2_PDU_CMD_REJ) & 0x0F;
+
+	/* second control byte: P bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (p_bit & LLC_S_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_rnr_cmd - builds RNR pdu
+ *	@skb: Address of the skb to build
+ *	@p_bit: The P bit to set in the PDU
+ *	@nr: The seq. number of the expected I PDU from the remote
+ *
+ *	Builds a pdu frame as an RNR command.
+ */
+void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* bits 5..8 of the first control byte are reserved, keep them zero */
+	hdr->ctrl_1 = (LLC_PDU_TYPE_S | LLC_2_PDU_CMD_RNR) & 0x0F;
+
+	/* second control byte: P bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (p_bit & LLC_S_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_rr_cmd - Builds RR pdu
+ *	@skb: Address of the skb to build
+ *	@p_bit: The P bit to set in the PDU
+ *	@nr: The seq. number of the expected I PDU from the remote
+ *
+ *	Builds a pdu frame as an RR command.
+ */
+void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* bits 5..8 of the first control byte are reserved, keep them zero */
+	hdr->ctrl_1 = (LLC_PDU_TYPE_S | LLC_2_PDU_CMD_RR) & 0x0F;
+	/* second control byte: P bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (p_bit & LLC_S_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_sabme_cmd - builds SABME pdu
+ *	@skb: Address of the skb to build
+ *	@p_bit: The P bit to set in the PDU
+ *
+ *	Builds a pdu frame as an SABME command.
+ */
+void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit)
+{
+	u8 poll = ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+	u8 ctrl = LLC_PDU_TYPE_U | LLC_2_PDU_CMD_SABME | poll;
+
+	/* a U PDU keeps type, command and P bit in ctrl_1 alone */
+	llc_pdu_un_hdr(skb)->ctrl_1 = ctrl;
+}
+
+/**
+ *	llc_pdu_init_as_dm_rsp - builds DM response pdu
+ *	@skb: Address of the skb to build
+ *	@f_bit: The F bit to set in the PDU
+ *
+ *	Builds a pdu frame as a DM response.
+ */
+void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit)
+{
+	u8 final = ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+	u8 ctrl = LLC_PDU_TYPE_U | LLC_2_PDU_RSP_DM | final;
+
+	/* a U PDU keeps type, response and F bit in ctrl_1 alone */
+	llc_pdu_un_hdr(skb)->ctrl_1 = ctrl;
+}
+
+/**
+ *	llc_pdu_init_as_frmr_rsp - builds FRMR response PDU
+ *	@skb: Address of the frame to build
+ *	@prev_pdu: The rejected PDU frame
+ *	@f_bit: The F bit to set in the PDU
+ *	@vs: tx state var value for the data link conn at the rejecting LLC
+ *	@vr: rx state var value for the data link conn at the rejecting LLC
+ *	@vzyxw: completely described in the IEEE Std 802.2 document (Pg 55)
+ *
+ *	Builds a pdu frame as a FRMR response.
+ */
+void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu,
+			      u8 f_bit, u8 vs, u8 vr, u8 vzyxw)
+{
+	struct llc_frmr_info *frmr_info;
+	u8 prev_pf = 0;
+	u8 *ctrl;
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+
+	pdu->ctrl_1  = LLC_PDU_TYPE_U;
+	pdu->ctrl_1 |= LLC_2_PDU_RSP_FRMR;
+	pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+	/* the FRMR information field is laid out starting at ctrl_2 */
+	frmr_info = (struct llc_frmr_info *)&pdu->ctrl_2;
+	ctrl = (u8 *)&prev_pdu->ctrl_1;
+	FRMR_INFO_SET_REJ_CNTRL(frmr_info,ctrl);
+	FRMR_INFO_SET_Vs(frmr_info, vs);
+	FRMR_INFO_SET_Vr(frmr_info, vr);
+	prev_pf = llc_pdu_get_pf_bit(prev_pdu);
+	FRMR_INFO_SET_C_R_BIT(frmr_info, prev_pf); /* review: p/f used as C/R */
+	FRMR_INFO_SET_INVALID_PDU_CTRL_IND(frmr_info, vzyxw);
+	FRMR_INFO_SET_INVALID_PDU_INFO_IND(frmr_info, vzyxw);
+	FRMR_INFO_SET_PDU_INFO_2LONG_IND(frmr_info, vzyxw);
+	FRMR_INFO_SET_PDU_INVALID_Nr_IND(frmr_info, vzyxw);
+	FRMR_INFO_SET_PDU_INVALID_Ns_IND(frmr_info, vzyxw);
+	skb_put(skb, sizeof(struct llc_frmr_info));	/* info field */
+}
+
+/**
+ *	llc_pdu_init_as_rr_rsp - builds RR response pdu
+ *	@skb: Address of the skb to build
+ *	@f_bit: The F bit to set in the PDU
+ *	@nr: The seq. number of the expected data PDU from the remote
+ *
+ *	Builds a pdu frame as an RR response.
+ */
+void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* bits 5..8 of the first control byte are reserved, keep them zero */
+	hdr->ctrl_1 = (LLC_PDU_TYPE_S | LLC_2_PDU_RSP_RR) & 0x0F;
+
+	/* second control byte: F bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (f_bit & LLC_S_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_rej_rsp - builds REJ response pdu
+ *	@skb: Address of the skb to build
+ *	@f_bit: The F bit to set in the PDU
+ *	@nr: The seq. number of the expected data PDU from the remote
+ *
+ *	Builds a pdu frame as a REJ response.
+ */
+void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* bits 5..8 of the first control byte are reserved, keep them zero */
+	hdr->ctrl_1 = (LLC_PDU_TYPE_S | LLC_2_PDU_RSP_REJ) & 0x0F;
+
+	/* second control byte: F bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (f_bit & LLC_S_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_rnr_rsp - builds RNR response pdu
+ *	@skb: Address of the frame to build
+ *	@f_bit: The F bit to set in the PDU
+ *	@nr: The seq. number of the expected data PDU from the remote
+ *
+ *	Builds a pdu frame as an RNR response.
+ */
+void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr)
+{
+	struct llc_pdu_sn *hdr = llc_pdu_sn_hdr(skb);
+
+	/* bits 5..8 of the first control byte are reserved, keep them zero */
+	hdr->ctrl_1 = (LLC_PDU_TYPE_S | LLC_2_PDU_RSP_RNR) & 0x0F;
+
+	/* second control byte: F bit plus N(R) in bits 10..16 */
+	hdr->ctrl_2 = (f_bit & LLC_S_PF_BIT_MASK) |
+		      ((nr << 1) & 0xFE);
+}
+
+/**
+ *	llc_pdu_init_as_ua_rsp - builds UA response pdu
+ *	@skb: Address of the frame to build
+ *	@f_bit: The F bit to set in the PDU
+ *
+ *	Builds a pdu frame as a UA response.
+ */
+void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit)
+{
+	u8 final = ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK;
+	u8 ctrl = LLC_PDU_TYPE_U | LLC_2_PDU_RSP_UA | final;
+
+	/* a U PDU keeps type, response and F bit in ctrl_1 alone */
+	llc_pdu_un_hdr(skb)->ctrl_1 = ctrl;
+}
+
+/**
+ *	llc_pdu_decode_pdu_type - designates PDU type
+ *	@skb: input skb that type of it must be designated.
+ *	@type: type of PDU (output argument).
+ *
+ *	This function designates type of PDU (I, S or U).
+ */
+static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type)
+{
+	u8 ctrl = llc_pdu_un_hdr(skb)->ctrl_1;
+
+	/* lowest bit clear: I; LLC_PDU_TYPE_U bits all set: U; else S */
+	if (!(ctrl & 1))
+		*type = LLC_PDU_TYPE_I;
+	else if ((ctrl & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U)
+		*type = LLC_PDU_TYPE_U;
+	else
+		*type = LLC_PDU_TYPE_S;
+}
+
+/**
+ *	llc_pdu_get_pf_bit - extracts p/f bit of input PDU
+ *	@pdu: pointer to LLC header.
+ *
+ *	This function extracts p/f bit of input PDU. at first examines type of
+ *	PDU and then extracts p/f bit. Returns the p/f bit.
+ */
+static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu)
+{
+	u8 kind = LLC_PDU_TYPE_I;
+	u8 pf = 0;
+
+	if (pdu->ctrl_1 & 1) {
+		kind = LLC_PDU_TYPE_S;
+		if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U)
+			kind = LLC_PDU_TYPE_U;
+	}
+
+	switch (kind) {
+	case LLC_PDU_TYPE_U:
+		pf = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4;
+		break;
+	case LLC_PDU_TYPE_I:
+	case LLC_PDU_TYPE_S:
+		/* I and S formats keep p/f in the second control byte */
+		pf = pdu->ctrl_2 & LLC_S_PF_BIT_MASK;
+		break;
+	}
+	return pf;
+}
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
new file mode 100644
index 00000000..7af1ff2d
--- /dev/null
+++ b/net/llc/llc_proc.c
@@ -0,0 +1,276 @@
+/*
+ * proc_llc.c - proc interface for LLC
+ *
+ * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org>
+ *		 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/errno.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/llc.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_st.h>
+#include <net/llc_conn.h>
+
+static void llc_ui_format_mac(struct seq_file *seq, u8 *addr)
+{
+	seq_printf(seq, "%pM", addr);	/* %pM: kernel MAC address format */
+}
+
+static struct sock *llc_get_sk_idx(loff_t pos)
+{
+	struct llc_sap *sap;
+	struct sock *sk = NULL;
+	int i;
+
+	list_for_each_entry_rcu(sap, &llc_sap_list, node) {
+		spin_lock_bh(&sap->sk_lock);
+		for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) {
+			struct hlist_nulls_head *head = &sap->sk_laddr_hash[i];
+			struct hlist_nulls_node *node;
+
+			sk_nulls_for_each(sk, node, head) {
+				if (!pos)
+					goto found; /* keep the lock */
+				--pos;
+			}
+		}
+		spin_unlock_bh(&sap->sk_lock);
+	}
+	sk = NULL;	/* ran out of sockets before reaching pos */
+found:
+	return sk;	/* non-NULL: that SAP's sk_lock is still held */
+}
+
+static void *llc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	loff_t l = *pos;	/* position 0 is the start token */
+
+	rcu_read_lock_bh();	/* released by llc_seq_stop() */
+	return l ? llc_get_sk_idx(--l) : SEQ_START_TOKEN;
+}
+
+static struct sock *laddr_hash_next(struct llc_sap *sap, int bucket)
+{
+	struct hlist_nulls_node *node;
+	struct sock *sk = NULL;
+
+	while (++bucket < LLC_SK_LADDR_HASH_ENTRIES)	/* buckets after @bucket */
+		sk_nulls_for_each(sk, node, &sap->sk_laddr_hash[bucket])
+			goto out;	/* first socket found wins */
+
+out:
+	return sk;
+}
+
+static void *llc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock* sk, *next;
+	struct llc_sock *llc;
+	struct llc_sap *sap;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN) {
+		sk = llc_get_sk_idx(0);	/* first socket, if there is one */
+		goto out;
+	}
+	sk = v;
+	next = sk_nulls_next(sk);
+	if (next) {
+		sk = next;
+		goto out;
+	}
+	llc = llc_sk(sk);
+	sap = llc->sap;
+	sk = laddr_hash_next(sap, llc_sk_laddr_hashfn(sap, &llc->laddr));
+	if (sk)
+		goto out;
+	spin_unlock_bh(&sap->sk_lock);	/* nothing left in this SAP */
+	list_for_each_entry_continue_rcu(sap, &llc_sap_list, node) {
+		spin_lock_bh(&sap->sk_lock);
+		sk = laddr_hash_next(sap, -1);	/* -1: scan from bucket 0 */
+		if (sk)
+			break; /* keep the lock */
+		spin_unlock_bh(&sap->sk_lock);
+	}
+out:
+	return sk;
+}
+
+static void llc_seq_stop(struct seq_file *seq, void *v)
+{
+	if (v && v != SEQ_START_TOKEN) {
+		struct sock *sk = v;
+		struct llc_sock *llc = llc_sk(sk);
+		struct llc_sap *sap = llc->sap;
+
+		spin_unlock_bh(&sap->sk_lock);	/* kept held by start/next */
+	}
+	rcu_read_unlock_bh();	/* taken in llc_seq_start() */
+}
+
+static int llc_seq_socket_show(struct seq_file *seq, void *v)
+{
+	struct sock* sk;
+	struct llc_sock *llc;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "SKt Mc local_mac_sap        remote_mac_sap   "
+			      "    tx_queue rx_queue st uid link\n");
+		goto out;	/* the start token only prints the header */
+	}
+	sk = v;
+	llc = llc_sk(sk);
+
+	/* FIXME: check if the address is multicast */
+	seq_printf(seq, "%2X  %2X ", sk->sk_type, 0); /* Mc always 0 */
+
+	if (llc->dev)
+		llc_ui_format_mac(seq, llc->dev->dev_addr);
+	else {
+		u8 addr[6] = {0,0,0,0,0,0};	/* no device: all-zero MAC */
+		llc_ui_format_mac(seq, addr);
+	}
+	seq_printf(seq, "@%02X ", llc->sap->laddr.lsap);
+	llc_ui_format_mac(seq, llc->daddr.mac);
+	seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->daddr.lsap,
+		   sk_wmem_alloc_get(sk),
+		   sk_rmem_alloc_get(sk) - llc->copied_seq,
+		   sk->sk_state,
+		   sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : -1,
+		   llc->link);
+out:
+	return 0;
+}
+
+static const char *const llc_conn_state_names[] = {
+	[LLC_CONN_STATE_ADM] =        "adm",
+	[LLC_CONN_STATE_SETUP] =      "setup",
+	[LLC_CONN_STATE_NORMAL] =     "normal",
+	[LLC_CONN_STATE_BUSY] =       "busy",
+	[LLC_CONN_STATE_REJ] =        "rej",
+	[LLC_CONN_STATE_AWAIT] =      "await",
+	[LLC_CONN_STATE_AWAIT_BUSY] = "await_busy",
+	[LLC_CONN_STATE_AWAIT_REJ] =  "await_rej",
+	[LLC_CONN_STATE_D_CONN]	=     "d_conn",
+	[LLC_CONN_STATE_RESET] =      "reset",
+	[LLC_CONN_STATE_ERROR] =      "error",
+	[LLC_CONN_STATE_TEMP] =       "temp",
+};
+
+static int llc_seq_core_show(struct seq_file *seq, void *v)
+{
+	struct sock* sk;
+	struct llc_sock *llc;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "Connection list:\n"
+			      "dsap state      retr txw rxw pf ff sf df rs cs "
+			      "tack tpfc trs tbs blog busr\n");
+		goto out;	/* the start token only prints the header */
+	}
+	sk = v;
+	llc = llc_sk(sk);
+	/* one row per socket; the last six columns are 0/1 flags */
+	seq_printf(seq, " %02X  %-10s %3d  %3d %3d %2d %2d %2d %2d %2d %2d "
+			"%4d %4d %3d %3d %4d %4d\n",
+		   llc->daddr.lsap, llc_conn_state_names[llc->state],
+		   llc->retry_count, llc->k, llc->rw, llc->p_flag, llc->f_flag,
+		   llc->s_flag, llc->data_flag, llc->remote_busy_flag,
+		   llc->cause_flag, timer_pending(&llc->ack_timer.timer),
+		   timer_pending(&llc->pf_cycle_timer.timer),
+		   timer_pending(&llc->rej_sent_timer.timer),
+		   timer_pending(&llc->busy_state_timer.timer),
+		   !!sk->sk_backlog.tail, !!sock_owned_by_user(sk));
+out:
+	return 0;
+}
+
+static const struct seq_operations llc_seq_socket_ops = {
+	.start  = llc_seq_start,
+	.next   = llc_seq_next,
+	.stop   = llc_seq_stop,
+	.show   = llc_seq_socket_show,
+};
+
+static const struct seq_operations llc_seq_core_ops = {
+	.start  = llc_seq_start,
+	.next   = llc_seq_next,
+	.stop   = llc_seq_stop,
+	.show   = llc_seq_core_show,
+};
+
+static int llc_seq_socket_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &llc_seq_socket_ops);
+}
+
+static int llc_seq_core_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &llc_seq_core_ops);
+}
+
+static const struct file_operations llc_seq_socket_fops = {
+	.owner		= THIS_MODULE,
+	.open		= llc_seq_socket_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static const struct file_operations llc_seq_core_fops = {
+	.owner		= THIS_MODULE,
+	.open		= llc_seq_core_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct proc_dir_entry *llc_proc_dir;
+
+int __init llc_proc_init(void)
+{
+	int rc = -ENOMEM;	/* returned by every failure path */
+	struct proc_dir_entry *p;
+
+	llc_proc_dir = proc_mkdir("llc", init_net.proc_net);
+	if (!llc_proc_dir)
+		goto out;	/* nothing to undo yet */
+
+	p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops);
+	if (!p)
+		goto out_socket;
+
+	p = proc_create("core", S_IRUGO, llc_proc_dir, &llc_seq_core_fops);
+	if (!p)
+		goto out_core;
+
+	rc = 0;
+out:
+	return rc;
+out_core:
+	remove_proc_entry("socket", llc_proc_dir);	/* "core" failed */
+out_socket:
+	remove_proc_entry("llc", init_net.proc_net);	/* drop the dir */
+	goto out;
+}
+
+void llc_proc_exit(void)
+{
+	remove_proc_entry("socket", llc_proc_dir);
+	remove_proc_entry("core", llc_proc_dir);
+	remove_proc_entry("llc", init_net.proc_net);	/* directory last */
+}
diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
new file mode 100644
index 00000000..a94bd56b
--- /dev/null
+++ b/net/llc/llc_s_ac.c
@@ -0,0 +1,208 @@
+/*
+ * llc_s_ac.c - actions performed during sap state transition.
+ *
+ * Description :
+ *   Functions in this module are implementation of sap component actions.
+ *   Details of actions can be found in IEEE-802.2 standard document.
+ *   All functions take one sap and one event (skb) as input arguments. All
+ *   of them return 0 on success and a nonzero value otherwise.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <linux/netdevice.h>
+#include <net/llc.h>
+#include <net/llc_pdu.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_sap.h>
+
+
+/**
+ *	llc_sap_action_unitdata_ind - forward UI PDU to network layer
+ *	@sap: SAP
+ *	@skb: the event to forward
+ *
+ *	Received a UI PDU from MAC layer; hand it to llc_sap_rtn_pdu() so it
+ *	is flagged as an indication for the upper layer. Always returns 0.
+ */
+int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb)
+{
+	llc_sap_rtn_pdu(sap, skb);
+	return 0;
+}
+
+/**
+ *	llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Sends a UI PDU to the MAC layer in response to a UNITDATA REQUEST
+ *	primitive from the network layer. Verifies event is a primitive type of
+ *	event. Verify the primitive is a UNITDATA REQUEST.
+ */
+int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	int rc;
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+			    ev->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_ui_cmd(skb);
+	rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+	if (likely(!rc))
+		rc = dev_queue_xmit(skb);
+	return rc;
+}
+
+/**
+ *	llc_sap_action_send_xid_c - send XID PDU as response to XID REQ
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Send a XID command PDU to MAC layer in response to a XID REQUEST
+ *	primitive from the network layer. Verify event is a primitive type
+ *	event. Verify the primitive is a XID REQUEST.
+ */
+int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	int rc;
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+			    ev->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
+	rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+	if (likely(!rc))
+		rc = dev_queue_xmit(skb);
+	return rc;
+}
+
+/**
+ *	llc_sap_action_send_xid_r - send XID PDU resp to MAC for received XID
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Send XID response PDU to MAC in response to an earlier received XID
+ *	command PDU. Verify event is a PDU type event
+ */
+int llc_sap_action_send_xid_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+	u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap;
+	int rc = 1;
+	struct sk_buff *nskb;
+
+	llc_pdu_decode_sa(skb, mac_da);
+	llc_pdu_decode_da(skb, mac_sa);
+	llc_pdu_decode_ssap(skb, &dsap);
+	nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U,
+			       sizeof(struct llc_xid_info));
+	if (!nskb)
+		goto out;
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap,
+			    LLC_PDU_RSP);
+	llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 0);
+	rc = llc_mac_hdr_init(nskb, mac_sa, mac_da);
+	if (likely(!rc))
+		rc = dev_queue_xmit(nskb);
+out:
+	return rc;
+}
+
+/**
+ *	llc_sap_action_send_test_c - send TEST PDU to MAC in resp to TEST REQ
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Send a TEST command PDU to the MAC layer in response to a TEST REQUEST
+ *	primitive from the network layer. Verify event is a primitive type
+ *	event; verify the primitive is a TEST REQUEST.
+ */
+int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	int rc;
+
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+			    ev->daddr.lsap, LLC_PDU_CMD);
+	llc_pdu_init_as_test_cmd(skb);
+	rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+	if (likely(!rc))
+		rc = dev_queue_xmit(skb);
+	return rc;
+}
+
+int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+	u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap;
+	struct sk_buff *nskb;
+	int rc = 1;
+	u32 data_size;
+
+	llc_pdu_decode_sa(skb, mac_da);
+	llc_pdu_decode_da(skb, mac_sa);
+	llc_pdu_decode_ssap(skb, &dsap);
+
+	/* The test request command is type U (llc_len = 3) */
+	data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
+	nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);
+	if (!nskb)
+		goto out;
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap,
+			    LLC_PDU_RSP);
+	llc_pdu_init_as_test_rsp(nskb, skb);
+	rc = llc_mac_hdr_init(nskb, mac_sa, mac_da);
+	if (likely(!rc))
+		rc = dev_queue_xmit(nskb);
+out:
+	return rc;
+}
+
+/**
+ *	llc_sap_action_report_status - report data link status to layer mgmt
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Report data link status to layer management. Verify our event is the
+ *	kind we expect.
+ */
+int llc_sap_action_report_status(struct llc_sap *sap, struct sk_buff *skb)
+{
+	return 0;
+}
+
+/**
+ *	llc_sap_action_xid_ind - send XID PDU resp to net layer via XID IND
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Send a XID response PDU to the network layer via a XID INDICATION
+ *	primitive.
+ */
+int llc_sap_action_xid_ind(struct llc_sap *sap, struct sk_buff *skb)
+{
+	llc_sap_rtn_pdu(sap, skb);
+	return 0;
+}
+
+/**
+ *	llc_sap_action_test_ind - send TEST PDU to net layer via TEST IND
+ *	@sap: SAP
+ *	@skb: the event to send
+ *
+ *	Send a TEST response PDU to the network layer via a TEST INDICATION
+ *	primitive. Verify our event is a PDU type event.
+ */
+int llc_sap_action_test_ind(struct llc_sap *sap, struct sk_buff *skb)
+{
+	llc_sap_rtn_pdu(sap, skb);
+	return 0;
+}
diff --git a/net/llc/llc_s_ev.c b/net/llc/llc_s_ev.c
new file mode 100644
index 00000000..a74d2a1d
--- /dev/null
+++ b/net/llc/llc_s_ev.c
@@ -0,0 +1,115 @@
+/*
+ * llc_s_ev.c - Defines SAP component events
+ *
+ * The following event functions are the SAP component events described
+ * in the 802.2 LLC protocol standard document.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/llc_if.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_pdu.h>
+
+int llc_sap_ev_activation_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_SIMPLE &&
+	       ev->prim_type == LLC_SAP_EV_ACTIVATION_REQ ? 0 : 1;
+}
+
+int llc_sap_ev_rx_ui(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) &&
+	       LLC_PDU_TYPE_IS_U(pdu) &&
+	       LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_UI ? 0 : 1;
+}
+
+int llc_sap_ev_unitdata_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PRIM &&
+	       ev->prim == LLC_DATAUNIT_PRIM &&
+	       ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+
+}
+
+int llc_sap_ev_xid_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PRIM &&
+	       ev->prim == LLC_XID_PRIM &&
+	       ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_sap_ev_rx_xid_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) &&
+	       LLC_PDU_TYPE_IS_U(pdu) &&
+	       LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1;
+}
+
+int llc_sap_ev_rx_xid_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) &&
+	       LLC_PDU_TYPE_IS_U(pdu) &&
+	       LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1;
+}
+
+int llc_sap_ev_test_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PRIM &&
+	       ev->prim == LLC_TEST_PRIM &&
+	       ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_sap_ev_rx_test_c(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) &&
+	       LLC_PDU_TYPE_IS_U(pdu) &&
+	       LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1;
+}
+
+int llc_sap_ev_rx_test_r(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) &&
+	       LLC_PDU_TYPE_IS_U(pdu) &&
+	       LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1;
+}
+
+int llc_sap_ev_deactivation_req(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	return ev->type == LLC_SAP_EV_TYPE_SIMPLE &&
+	       ev->prim_type == LLC_SAP_EV_DEACTIVATION_REQ ? 0 : 1;
+}
diff --git a/net/llc/llc_s_st.c b/net/llc/llc_s_st.c
new file mode 100644
index 00000000..135f7d80
--- /dev/null
+++ b/net/llc/llc_s_st.c
@@ -0,0 +1,183 @@
+/*
+ * llc_s_st.c - Defines SAP component state machine transitions.
+ *
+ * The following transitions are the SAP component state machine transitions
+ * described in the 802.2 LLC protocol standard document.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/types.h>
+#include <net/llc_if.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_st.h>
+
+/* dummy last-transition indicator; common to all state transition groups
+ * last entry for this state
+ * all members are zeros, .bss zeroes it
+ */
+static struct llc_sap_state_trans llc_sap_state_trans_end;
+
+/* state LLC_SAP_STATE_INACTIVE transition for
+ * LLC_SAP_EV_ACTIVATION_REQ event
+ */
+static llc_sap_action_t llc_sap_inactive_state_actions_1[] = {
+	[0] = llc_sap_action_report_status,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_inactive_state_trans_1 = {
+	.ev =		llc_sap_ev_activation_req,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_inactive_state_actions_1,
+};
+
+/* array of pointers; one to each transition */
+static struct llc_sap_state_trans *llc_sap_inactive_state_transitions[] = {
+	[0] = &llc_sap_inactive_state_trans_1,
+	[1] = &llc_sap_state_trans_end,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_UI event */
+static llc_sap_action_t llc_sap_active_state_actions_1[] = {
+	[0] = llc_sap_action_unitdata_ind,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_1 = {
+	.ev =		llc_sap_ev_rx_ui,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_1,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_UNITDATA_REQ event */
+static llc_sap_action_t llc_sap_active_state_actions_2[] = {
+	[0] = llc_sap_action_send_ui,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_2 = {
+	.ev =		llc_sap_ev_unitdata_req,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_2,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_XID_REQ event */
+static llc_sap_action_t llc_sap_active_state_actions_3[] = {
+	[0] = llc_sap_action_send_xid_c,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_3 = {
+	.ev =		llc_sap_ev_xid_req,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_3,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_C event */
+static llc_sap_action_t llc_sap_active_state_actions_4[] = {
+	[0] = llc_sap_action_send_xid_r,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_4 = {
+	.ev =		llc_sap_ev_rx_xid_c,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_4,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_R event */
+static llc_sap_action_t llc_sap_active_state_actions_5[] = {
+	[0] = llc_sap_action_xid_ind,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_5 = {
+	.ev =		llc_sap_ev_rx_xid_r,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_5,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_TEST_REQ event */
+static llc_sap_action_t llc_sap_active_state_actions_6[] = {
+	[0] = llc_sap_action_send_test_c,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_6 = {
+	.ev =		llc_sap_ev_test_req,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_6,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_C event */
+static llc_sap_action_t llc_sap_active_state_actions_7[] = {
+	[0] = llc_sap_action_send_test_r,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_7 = {
+	.ev =		llc_sap_ev_rx_test_c,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_7
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_R event */
+static llc_sap_action_t llc_sap_active_state_actions_8[] = {
+	[0] = llc_sap_action_test_ind,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_8 = {
+	.ev =		llc_sap_ev_rx_test_r,
+	.next_state =	LLC_SAP_STATE_ACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_8,
+};
+
+/* state LLC_SAP_STATE_ACTIVE transition for
+ * LLC_SAP_EV_DEACTIVATION_REQ event
+ */
+static llc_sap_action_t llc_sap_active_state_actions_9[] = {
+	[0] = llc_sap_action_report_status,
+	[1] = NULL,
+};
+
+static struct llc_sap_state_trans llc_sap_active_state_trans_9 = {
+	.ev =		llc_sap_ev_deactivation_req,
+	.next_state =	LLC_SAP_STATE_INACTIVE,
+	.ev_actions =	llc_sap_active_state_actions_9
+};
+
+/* array of pointers; one to each transition */
+static struct llc_sap_state_trans *llc_sap_active_state_transitions[] = {
+	[0] = &llc_sap_active_state_trans_2,
+	[1] = &llc_sap_active_state_trans_1,
+	[2] = &llc_sap_active_state_trans_3,
+	[3] = &llc_sap_active_state_trans_4,
+	[4] = &llc_sap_active_state_trans_5,
+	[5] = &llc_sap_active_state_trans_6,
+	[6] = &llc_sap_active_state_trans_7,
+	[7] = &llc_sap_active_state_trans_8,
+	[8] = &llc_sap_active_state_trans_9,
+	[9] = &llc_sap_state_trans_end,
+};
+
+/* SAP state transition table */
+struct llc_sap_state llc_sap_state_table[LLC_NR_SAP_STATES] = {
+	[LLC_SAP_STATE_INACTIVE - 1] = {
+		.curr_state	= LLC_SAP_STATE_INACTIVE,
+		.transitions	= llc_sap_inactive_state_transitions,
+	},
+	[LLC_SAP_STATE_ACTIVE - 1] = {
+		.curr_state	= LLC_SAP_STATE_ACTIVE,
+		.transitions	= llc_sap_active_state_transitions,
+	},
+};
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
new file mode 100644
index 00000000..94e7fca7
--- /dev/null
+++ b/net/llc/llc_sap.c
@@ -0,0 +1,444 @@
+/*
+ * llc_sap.c - driver routines for SAP component.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+
+#include <net/llc.h>
+#include <net/llc_if.h>
+#include <net/llc_conn.h>
+#include <net/llc_pdu.h>
+#include <net/llc_sap.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_s_st.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <linux/llc.h>
+#include <linux/slab.h>
+
+static int llc_mac_header_len(unsigned short devtype)
+{
+	switch (devtype) {
+	case ARPHRD_ETHER:
+	case ARPHRD_LOOPBACK:
+		return sizeof(struct ethhdr);
+#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
+	case ARPHRD_IEEE802_TR:
+		return sizeof(struct trh_hdr);
+#endif
+	}
+	return 0;
+}
+
+/**
+ *	llc_alloc_frame - allocates sk_buff for frame
+ *	@sk: socket set as owner via skb_set_owner_w(); may be %NULL
+ *	@dev: network device this skb will be sent over
+ *	@type: pdu type to allocate
+ *	@data_size: data size to allocate
+ *
+ *	Returns the initialized skb, or %NULL when out of memory.
+ */
+struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev,
+				u8 type, u32 data_size)
+{
+	int hlen = type == LLC_PDU_TYPE_U ? 3 : 4;
+	struct sk_buff *skb;
+
+	hlen += llc_mac_header_len(dev->type);
+	skb = alloc_skb(hlen + data_size, GFP_ATOMIC);
+
+	if (skb) {
+		skb_reset_mac_header(skb);
+		skb_reserve(skb, hlen);
+		skb_reset_network_header(skb);
+		skb_reset_transport_header(skb);
+		skb->protocol = htons(ETH_P_802_2);
+		skb->dev      = dev;
+		if (sk != NULL)
+			skb_set_owner_w(skb, sk);
+	}
+	return skb;
+}
+
+void llc_save_primitive(struct sock *sk, struct sk_buff* skb, u8 prim)
+{
+	struct sockaddr_llc *addr;
+
+       /* save primitive for use by the user. */
+	addr		  = llc_ui_skb_cb(skb);
+
+	memset(addr, 0, sizeof(*addr));
+	addr->sllc_family = sk->sk_family;
+	addr->sllc_arphrd = skb->dev->type;
+	addr->sllc_test   = prim == LLC_TEST_PRIM;
+	addr->sllc_xid    = prim == LLC_XID_PRIM;
+	addr->sllc_ua     = prim == LLC_DATAUNIT_PRIM;
+	llc_pdu_decode_sa(skb, addr->sllc_mac);
+	llc_pdu_decode_ssap(skb, &addr->sllc_sap);
+}
+
+/**
+ *	llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu.
+ *	@sap: pointer to SAP
+ *	@skb: received pdu
+ */
+void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	switch (LLC_U_PDU_RSP(pdu)) {
+	case LLC_1_PDU_CMD_TEST:
+		ev->prim = LLC_TEST_PRIM;	break;
+	case LLC_1_PDU_CMD_XID:
+		ev->prim = LLC_XID_PRIM;	break;
+	case LLC_1_PDU_CMD_UI:
+		ev->prim = LLC_DATAUNIT_PRIM;	break;
+	}
+	ev->ind_cfm_flag = LLC_IND;
+}
+
+/**
+ *	llc_find_sap_trans - finds transition for event
+ *	@sap: pointer to SAP
+ *	@skb: happened event
+ *
+ *	This function finds transition that matches with happened event.
+ *	Returns the pointer to found transition on success or %NULL for
+ *	failure.
+ */
+static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap,
+						      struct sk_buff* skb)
+{
+	int i = 0;
+	struct llc_sap_state_trans *rc = NULL;
+	struct llc_sap_state_trans **next_trans;
+	struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1];
+	/*
+	 * Search thru events for this state until list exhausted or until
+	 * its obvious the event is not valid for the current state
+	 */
+	for (next_trans = curr_state->transitions; next_trans[i]->ev; i++)
+		if (!next_trans[i]->ev(sap, skb)) {
+			rc = next_trans[i]; /* got event match; return it */
+			break;
+		}
+	return rc;
+}
+
+/**
+ *	llc_exec_sap_trans_actions - execute actions related to event
+ *	@sap: pointer to SAP
+ *	@trans: pointer to transition that it's actions must be performed
+ *	@skb: happened event.
+ *
+ *	This function executes actions that is related to happened event.
+ *	Returns 0 for success and 1 for failure of at least one action.
+ */
+static int llc_exec_sap_trans_actions(struct llc_sap *sap,
+				      struct llc_sap_state_trans *trans,
+				      struct sk_buff *skb)
+{
+	int rc = 0;
+	llc_sap_action_t *next_action = trans->ev_actions;
+
+	for (; next_action && *next_action; next_action++)
+		if ((*next_action)(sap, skb))
+			rc = 1;
+	return rc;
+}
+
+/**
+ *	llc_sap_next_state - finds transition, execs actions & change SAP state
+ *	@sap: pointer to SAP
+ *	@skb: happened event
+ *
+ *	Finds the transition matching the event, executes its actions and then
+ *	changes the SAP state. Returns 0 on success, 1 on failure (including a
+ *	state outside 1..LLC_NR_SAP_STATES, which would misindex the table).
+ */
+static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb)
+{
+	int rc = 1;
+	struct llc_sap_state_trans *trans;
+
+	if (!sap->state || sap->state > LLC_NR_SAP_STATES)
+		goto out;
+	trans = llc_find_sap_trans(sap, skb);
+	if (!trans)
+		goto out;
+	/*
+	 * Got the state to which we next transition; perform the actions
+	 * associated with this transition before actually transitioning to the
+	 * next state
+	 */
+	rc = llc_exec_sap_trans_actions(sap, trans, skb);
+	if (rc)
+		goto out;
+	/*
+	 * Transition SAP to next state if all actions execute successfully
+	 */
+	sap->state = trans->next_state;
+out:
+	return rc;
+}
+
+/**
+ *	llc_sap_state_process - sends event to SAP state machine
+ *	@sap: sap to use
+ *	@skb: pointer to occurred event
+ *
+ *	After executing actions of the event, upper layer will be indicated
+ *	if needed(on receiving an UI frame). sk can be null for the
+ *	datalink_proto case.
+ */
+static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	/*
+	 * We have to hold the skb, because llc_sap_next_state
+	 * will kfree it in the sending path and we need to
+	 * look at the skb->cb, where we encode llc_sap_state_ev.
+	 */
+	skb_get(skb);
+	ev->ind_cfm_flag = 0;
+	llc_sap_next_state(sap, skb);
+	if (ev->ind_cfm_flag == LLC_IND) {
+		if (skb->sk->sk_state == TCP_LISTEN)
+			kfree_skb(skb);
+		else {
+			llc_save_primitive(skb->sk, skb, ev->prim);
+
+			/* queue skb to the user. */
+			if (sock_queue_rcv_skb(skb->sk, skb))
+				kfree_skb(skb);
+		}
+	}
+	kfree_skb(skb);
+}
+
+/**
+ *	llc_build_and_send_test_pkt - TEST interface for upper layers.
+ *	@sap: sap to use
+ *	@skb: packet to send
+ *	@dmac: destination mac address
+ *	@dsap: destination sap
+ *
+ *	This function is called when upper layer wants to send a TEST pdu.
+ *	The function is void: no status is reported to the caller.
+ */
+void llc_build_and_send_test_pkt(struct llc_sap *sap,
+				 struct sk_buff *skb, u8 *dmac, u8 dsap)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	ev->saddr.lsap = sap->laddr.lsap;
+	ev->daddr.lsap = dsap;
+	memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN);
+	memcpy(ev->daddr.mac, dmac, IFHWADDRLEN);
+
+	ev->type      = LLC_SAP_EV_TYPE_PRIM;
+	ev->prim      = LLC_TEST_PRIM;
+	ev->prim_type = LLC_PRIM_TYPE_REQ;
+	llc_sap_state_process(sap, skb);
+}
+
+/**
+ *	llc_build_and_send_xid_pkt - XID interface for upper layers
+ *	@sap: sap to use
+ *	@skb: packet to send
+ *	@dmac: destination mac address
+ *	@dsap: destination sap
+ *
+ *	This function is called when upper layer wants to send a XID pdu.
+ *	The function is void: no status is reported to the caller.
+ */
+void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb,
+				u8 *dmac, u8 dsap)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	ev->saddr.lsap = sap->laddr.lsap;
+	ev->daddr.lsap = dsap;
+	memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN);
+	memcpy(ev->daddr.mac, dmac, IFHWADDRLEN);
+
+	ev->type      = LLC_SAP_EV_TYPE_PRIM;
+	ev->prim      = LLC_XID_PRIM;
+	ev->prim_type = LLC_PRIM_TYPE_REQ;
+	llc_sap_state_process(sap, skb);
+}
+
+/**
+ *	llc_sap_rcv - sends received pdus to the sap state machine
+ *	@sap: current sap component structure.
+ *	@skb: received frame.
+ *	@sk: socket the frame is destined for; stored in skb->sk before the
+ *	     skb is run through the state machine as a PDU-type event.
+ */
+static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
+			struct sock *sk)
+{
+	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+
+	ev->type   = LLC_SAP_EV_TYPE_PDU;
+	ev->reason = 0;
+	skb->sk = sk;
+	llc_sap_state_process(sap, skb);
+}
+
+static inline bool llc_dgram_match(const struct llc_sap *sap,
+				   const struct llc_addr *laddr,
+				   const struct sock *sk)
+{
+     struct llc_sock *llc = llc_sk(sk);
+
+     return sk->sk_type == SOCK_DGRAM &&
+	  llc->laddr.lsap == laddr->lsap &&
+	  llc_mac_match(llc->laddr.mac, laddr->mac);
+}
+
+/**
+ *	llc_lookup_dgram - Finds dgram socket for the local sap/mac
+ *	@sap: SAP
+ *	@laddr: address of local LLC (MAC + SAP)
+ *
+ *	Search socket list of the SAP and finds connection using the local
+ *	mac, and local sap. Returns pointer for socket found, %NULL otherwise.
+ */
+static struct sock *llc_lookup_dgram(struct llc_sap *sap,
+				     const struct llc_addr *laddr)
+{
+	struct sock *rc;
+	struct hlist_nulls_node *node;
+	int slot = llc_sk_laddr_hashfn(sap, laddr);
+	struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot];
+
+	rcu_read_lock_bh();
+again:
+	sk_nulls_for_each_rcu(rc, node, laddr_hb) {
+		if (llc_dgram_match(sap, laddr, rc)) {
+			/* Extra checks required by SLAB_DESTROY_BY_RCU */
+			if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
+				goto again;
+			if (unlikely(llc_sk(rc)->sap != sap ||
+				     !llc_dgram_match(sap, laddr, rc))) {
+				sock_put(rc);
+				continue;
+			}
+			goto found;
+		}
+	}
+	rc = NULL;
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (unlikely(get_nulls_value(node) != slot))
+		goto again;
+found:
+	rcu_read_unlock_bh();
+	return rc;
+}
+
+static inline bool llc_mcast_match(const struct llc_sap *sap,
+				   const struct llc_addr *laddr,
+				   const struct sk_buff *skb,
+				   const struct sock *sk)
+{
+     struct llc_sock *llc = llc_sk(sk);
+
+     return sk->sk_type == SOCK_DGRAM &&
+	  llc->laddr.lsap == laddr->lsap &&
+	  llc->dev == skb->dev;
+}
+
+static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb,
+			 struct sock **stack, int count)
+{
+	struct sk_buff *skb1;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		skb1 = skb_clone(skb, GFP_ATOMIC);
+		if (!skb1) {
+			sock_put(stack[i]);
+			continue;
+		}
+
+		llc_sap_rcv(sap, skb1, stack[i]);
+		sock_put(stack[i]);
+	}
+}
+
+/**
+ *	llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets.
+ *	@sap: SAP
+ *	@laddr: address of local LLC (MAC + SAP)
+ *	@skb: received multicast frame; each matching socket gets a clone
+ *
+ *	Walks the SAP's per-device socket hash and delivers in batches.
+ */
+static void llc_sap_mcast(struct llc_sap *sap,
+			  const struct llc_addr *laddr,
+			  struct sk_buff *skb)
+{
+	int i = 0, count = 256 / sizeof(struct sock *);
+	struct sock *sk, *stack[count];
+	struct hlist_node *node;
+	struct llc_sock *llc;
+	struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex);
+
+	spin_lock_bh(&sap->sk_lock);
+	hlist_for_each_entry(llc, node, dev_hb, dev_hash_node) {
+
+		sk = &llc->sk;
+
+		if (!llc_mcast_match(sap, laddr, skb, sk))
+			continue;
+
+		/* Flush a full batch first so this socket is never dropped. */
+		if (i == count) {
+			llc_do_mcast(sap, skb, stack, i);
+			i = 0;
+		}
+		sock_hold(sk);
+		stack[i++] = sk;
+	}
+	spin_unlock_bh(&sap->sk_lock);
+
+	llc_do_mcast(sap, skb, stack, i);
+}
+
+
+void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
+{
+	struct llc_addr laddr;
+
+	llc_pdu_decode_da(skb, laddr.mac);
+	llc_pdu_decode_dsap(skb, &laddr.lsap);
+
+	if (llc_mac_multicast(laddr.mac)) {
+		llc_sap_mcast(sap, &laddr, skb);
+		kfree_skb(skb);
+	} else {
+		struct sock *sk = llc_lookup_dgram(sap, &laddr);
+		if (sk) {
+			llc_sap_rcv(sap, skb, sk);
+			sock_put(sk);
+		} else
+			kfree_skb(skb);
+	}
+}
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
new file mode 100644
index 00000000..cf4aea3b
--- /dev/null
+++ b/net/llc/llc_station.c
@@ -0,0 +1,722 @@
+/*
+ * llc_station.c - station component of LLC
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/llc.h>
+#include <net/llc_sap.h>
+#include <net/llc_conn.h>
+#include <net/llc_c_ac.h>
+#include <net/llc_s_ac.h>
+#include <net/llc_c_ev.h>
+#include <net/llc_c_st.h>
+#include <net/llc_s_ev.h>
+#include <net/llc_s_st.h>
+#include <net/llc_pdu.h>
+
+/**
+ * struct llc_station - LLC station component
+ *
+ * State of the station state machine. This file keeps a single static
+ * instance, llc_main_station.
+ *
+ * @state - current state, one of LLC_STATION_STATE_*
+ * @xid_r_count - XID response PDU counter
+ * @ack_timer - timer whose expiry feeds an ACK_TMR event to the state machine
+ * @retry_count - number of ack timer retries performed so far
+ * @maximum_retry - retry limit that @retry_count is compared against
+ * @ev_q - events entering state mach. (skb list plus the lock serialising
+ *	   event processing)
+ * @mac_pdu_q - PDUs ready to send to MAC
+ */
+struct llc_station {
+	u8			    state;
+	u8			    xid_r_count;
+	struct timer_list	    ack_timer;
+	u8			    retry_count;
+	u8			    maximum_retry;
+	struct {
+		struct sk_buff_head list;
+		spinlock_t	    lock;
+	} ev_q;
+	struct sk_buff_head	    mac_pdu_q;
+};
+
+/* Default ack timer interval, in jiffies (three seconds) */
+#define LLC_STATION_ACK_TIME (3 * HZ)
+
+/* Ack timer interval in jiffies; added to 'jiffies' whenever the station
+ * ack timer is armed. Non-static so that it can be tuned from outside this
+ * file.
+ */
+int sysctl_llc_station_ack_timeout = LLC_STATION_ACK_TIME;
+
+/* Types of events (possible values in 'ev->type') */
+#define LLC_STATION_EV_TYPE_SIMPLE	1
+#define LLC_STATION_EV_TYPE_CONDITION	2
+#define LLC_STATION_EV_TYPE_PRIM	3
+#define LLC_STATION_EV_TYPE_PDU		4       /* command/response PDU */
+#define LLC_STATION_EV_TYPE_ACK_TMR	5
+#define LLC_STATION_EV_TYPE_RPT_STATUS	6
+
+/* Events. The two ENABLE values are matched against 'ev->prim_type' of
+ * SIMPLE events by the event test functions below; the remaining values
+ * name the other events of the station state machine.
+ */
+#define LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK		1
+#define LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK		2
+#define LLC_STATION_EV_ACK_TMR_EXP_LT_RETRY_CNT_MAX_RETRY	3
+#define LLC_STATION_EV_ACK_TMR_EXP_EQ_RETRY_CNT_MAX_RETRY	4
+#define LLC_STATION_EV_RX_NULL_DSAP_XID_C			5
+#define LLC_STATION_EV_RX_NULL_DSAP_0_XID_R_XID_R_CNT_EQ	6
+#define LLC_STATION_EV_RX_NULL_DSAP_1_XID_R_XID_R_CNT_EQ	7
+#define LLC_STATION_EV_RX_NULL_DSAP_TEST_C			8
+#define LLC_STATION_EV_DISABLE_REQ				9
+
+/* Station event record. It lives in the control buffer (skb->cb) of the
+ * skb that carries the event; use llc_station_ev() to reach it.
+ *
+ * type      - one of LLC_STATION_EV_TYPE_*
+ * prim      - primitive, compared against LLC_DISABLE_PRIM for PRIM events
+ * prim_type - LLC_PRIM_TYPE_* for PRIM events; for SIMPLE events it holds
+ *	       one of the LLC_STATION_EV_ENABLE_* values
+ * reason    - cleared for received PDUs; not read in this file
+ *
+ * NOTE(review): events are queued by linking the carrying skb with
+ * skb_queue_tail(); nothing in this file uses 'node' - confirm it is still
+ * needed.
+ */
+struct llc_station_state_ev {
+	u8		 type;
+	u8		 prim;
+	u8		 prim_type;
+	u8		 reason;
+	struct list_head node; /* node in station->ev_q.list */
+};
+
+/* Returns the station event record kept in the control buffer of @skb. */
+static __inline__ struct llc_station_state_ev *
+					llc_station_ev(struct sk_buff *skb)
+{
+	void *cb = skb->cb;
+
+	return cb;
+}
+
+/* Event test function: returns 0 when @skb carries the event it tests for,
+ * nonzero otherwise.
+ */
+typedef int (*llc_station_ev_t)(struct sk_buff *skb);
+
+/* Station states. Values start at 1; the state table is indexed with
+ * (state - 1).
+ */
+#define LLC_STATION_STATE_DOWN		1	/* initial state */
+#define LLC_STATION_STATE_DUP_ADDR_CHK	2
+#define LLC_STATION_STATE_UP		3
+
+#define LLC_NBR_STATION_STATES		3	/* size of state table */
+
+/* Transition action: returns 0 on success, nonzero on failure. */
+typedef int (*llc_station_action_t)(struct sk_buff *skb);
+
+/* Station component state table structure
+ *
+ * ev         - event test for this transition; NULL marks the end of a
+ *		transition list
+ * next_state - state entered when all actions succeed
+ * ev_actions - NULL-terminated array of actions to run
+ */
+struct llc_station_state_trans {
+	llc_station_ev_t ev;
+	u8 next_state;
+	llc_station_action_t *ev_actions;
+};
+
+/* One state of the station: its number and the list of transitions that
+ * may leave it.
+ */
+struct llc_station_state {
+	u8 curr_state;
+	struct llc_station_state_trans **transitions;
+};
+
+/* The single station instance driven by this file. */
+static struct llc_station llc_main_station;
+
+/* Event test: SIMPLE event requesting enable with duplicate address check.
+ * Returns 0 when @skb carries that event, 1 otherwise.
+ */
+static int llc_stat_ev_enable_with_dup_addr_check(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_SIMPLE)
+		return 1;
+	return ev->prim_type != LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK;
+}
+
+/* Event test: SIMPLE event requesting enable without duplicate address
+ * check. Returns 0 when @skb carries that event, 1 otherwise.
+ */
+static int llc_stat_ev_enable_without_dup_addr_check(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_SIMPLE)
+		return 1;
+	return ev->prim_type != LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK;
+}
+
+/* Event test: ack timer event while the station's retry_count is still
+ * below maximum_retry. Returns 0 on match, 1 otherwise.
+ */
+static int llc_stat_ev_ack_tmr_exp_lt_retry_cnt_max_retry(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_ACK_TMR)
+		return 1;
+	return !(llc_main_station.retry_count <
+		 llc_main_station.maximum_retry);
+}
+
+/* Event test: ack timer event when the station's retry_count has reached
+ * maximum_retry. Returns 0 on match, 1 otherwise.
+ */
+static int llc_stat_ev_ack_tmr_exp_eq_retry_cnt_max_retry(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_ACK_TMR)
+		return 1;
+	return llc_main_station.retry_count !=
+	       llc_main_station.maximum_retry;
+}
+
+/* Event test: received XID command PDU (U format) addressed to the NULL
+ * DSAP. Returns 0 on match, 1 otherwise.
+ */
+static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_PDU)
+		return 1;
+	/* must be a command PDU of U type */
+	if (!LLC_PDU_IS_CMD(pdu) || !LLC_PDU_TYPE_IS_U(pdu))
+		return 1;
+	if (LLC_U_PDU_CMD(pdu) != LLC_1_PDU_CMD_XID)
+		return 1;
+	/* NULL DSAP value required */
+	return pdu->dsap ? 1 : 0;
+}
+
+/* Event test: received XID response PDU (U format) addressed to the NULL
+ * DSAP while the station's xid_r_count is 0. Returns 0 on match, 1
+ * otherwise.
+ */
+static int llc_stat_ev_rx_null_dsap_0_xid_r_xid_r_cnt_eq(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_PDU)
+		return 1;
+	/* must be a response PDU of U type */
+	if (!LLC_PDU_IS_RSP(pdu) || !LLC_PDU_TYPE_IS_U(pdu))
+		return 1;
+	if (LLC_U_PDU_RSP(pdu) != LLC_1_PDU_CMD_XID)
+		return 1;
+	/* NULL DSAP value required */
+	if (pdu->dsap)
+		return 1;
+	return llc_main_station.xid_r_count ? 1 : 0;
+}
+
+/* Event test: received XID response PDU (U format) addressed to the NULL
+ * DSAP while the station's xid_r_count is 1. Returns 0 on match, 1
+ * otherwise.
+ */
+static int llc_stat_ev_rx_null_dsap_1_xid_r_xid_r_cnt_eq(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_PDU)
+		return 1;
+	/* must be a response PDU of U type */
+	if (!LLC_PDU_IS_RSP(pdu) || !LLC_PDU_TYPE_IS_U(pdu))
+		return 1;
+	if (LLC_U_PDU_RSP(pdu) != LLC_1_PDU_CMD_XID)
+		return 1;
+	/* NULL DSAP value required */
+	if (pdu->dsap)
+		return 1;
+	return llc_main_station.xid_r_count != 1;
+}
+
+/* Event test: received TEST command PDU (U format) addressed to the NULL
+ * DSAP. Returns 0 on match, 1 otherwise.
+ */
+static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+	struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_PDU)
+		return 1;
+	/* must be a command PDU of U type */
+	if (!LLC_PDU_IS_CMD(pdu) || !LLC_PDU_TYPE_IS_U(pdu))
+		return 1;
+	if (LLC_U_PDU_CMD(pdu) != LLC_1_PDU_CMD_TEST)
+		return 1;
+	/* NULL DSAP required */
+	return pdu->dsap ? 1 : 0;
+}
+
+/* Event test: PRIM event carrying a disable request. Returns 0 on match,
+ * 1 otherwise.
+ */
+static int llc_stat_ev_disable_req(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+
+	if (ev->type != LLC_STATION_EV_TYPE_PRIM ||
+	    ev->prim != LLC_DISABLE_PRIM)
+		return 1;
+	return ev->prim_type != LLC_PRIM_TYPE_REQ;
+}
+
+/**
+ *	llc_station_send_pdu - queues PDU to send
+ *	@skb: Address of the PDU
+ *
+ *	Appends @skb to the station's MAC PDU queue and then transmits queued
+ *	PDUs in order with dev_queue_xmit(), stopping at the first one for
+ *	which it returns nonzero or when the queue is empty.
+ */
+static void llc_station_send_pdu(struct sk_buff *skb)
+{
+	struct sk_buff *pending;
+
+	skb_queue_tail(&llc_main_station.mac_pdu_q, skb);
+	for (;;) {
+		pending = skb_dequeue(&llc_main_station.mac_pdu_q);
+		if (!pending || dev_queue_xmit(pending))
+			break;
+	}
+}
+
+/* Action: (re)arm the station ack timer to expire
+ * sysctl_llc_station_ack_timeout jiffies from now. @skb is unused.
+ * Always returns 0.
+ */
+static int llc_station_ac_start_ack_timer(struct sk_buff *skb)
+{
+	mod_timer(&llc_main_station.ack_timer,
+		  jiffies + sysctl_llc_station_ack_timeout);
+	return 0;
+}
+
+/* Action: reset the station retry counter. @skb is unused. Returns 0. */
+static int llc_station_ac_set_retry_cnt_0(struct sk_buff *skb)
+{
+	llc_main_station.retry_count = 0;
+	return 0;
+}
+
+/* Action: bump the station retry counter. @skb is unused. Returns 0. */
+static int llc_station_ac_inc_retry_cnt_by_1(struct sk_buff *skb)
+{
+	llc_main_station.retry_count++;
+	return 0;
+}
+
+/* Action: reset the XID response counter. @skb is unused. Returns 0. */
+static int llc_station_ac_set_xid_r_cnt_0(struct sk_buff *skb)
+{
+	llc_main_station.xid_r_count = 0;
+	return 0;
+}
+
+/* Action: bump the XID response counter. @skb is unused. Returns 0. */
+static int llc_station_ac_inc_xid_r_cnt_by_1(struct sk_buff *skb)
+{
+	llc_main_station.xid_r_count++;
+	return 0;
+}
+
+/*
+ * Action: build an XID command PDU with NULL DSAP/SSAP, addressed to the
+ * device's own MAC address, and queue it for transmission.
+ * @skb: event that triggered the action; only skb->dev is used and the
+ *	 event itself is left untouched (its owner frees it)
+ *
+ * Returns 0 on success, nonzero if the frame could not be allocated or its
+ * MAC header could not be built.
+ */
+static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb)
+{
+	int rc = 1;
+	struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U,
+					       sizeof(struct llc_xid_info));
+
+	if (!nskb)
+		goto out;
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, 0, LLC_PDU_CMD);
+	llc_pdu_init_as_xid_cmd(nskb, LLC_XID_NULL_CLASS_2, 127);
+	rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, skb->dev->dev_addr);
+	if (unlikely(rc))
+		goto free;
+	llc_station_send_pdu(nskb);
+out:
+	return rc;
+free:
+	/* Release the frame built here, not the triggering event: freeing
+	 * @skb would leak nskb and leave the caller with a freed event.
+	 */
+	kfree_skb(nskb);
+	goto out;
+}
+
+/*
+ * Action: answer a received XID command with an XID response sent back to
+ * the source MAC address and source SAP of @skb.
+ * @skb: received PDU being answered; it is only read here (its owner
+ *	 frees it)
+ *
+ * Returns 0 on success, nonzero if the response could not be allocated or
+ * its MAC header could not be built.
+ */
+static int llc_station_ac_send_xid_r(struct sk_buff *skb)
+{
+	u8 mac_da[ETH_ALEN], dsap;
+	int rc = 1;
+	struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U,
+					       sizeof(struct llc_xid_info));
+
+	if (!nskb)
+		goto out;
+	/* The sender's address and SAP become the response's destination. */
+	llc_pdu_decode_sa(skb, mac_da);
+	llc_pdu_decode_ssap(skb, &dsap);
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
+	llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127);
+	rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da);
+	if (unlikely(rc))
+		goto free;
+	llc_station_send_pdu(nskb);
+out:
+	return rc;
+free:
+	/* Release the response built here, not the received PDU: freeing
+	 * @skb would leak nskb and leave the caller with a freed event.
+	 */
+	kfree_skb(nskb);
+	goto out;
+}
+
+/*
+ * Action: answer a received TEST command with a TEST response sent back to
+ * the source MAC address and source SAP of @skb.
+ * @skb: received PDU being answered; it is only read here (its owner
+ *	 frees it)
+ *
+ * Returns 0 on success, nonzero if the response could not be allocated or
+ * its MAC header could not be built.
+ */
+static int llc_station_ac_send_test_r(struct sk_buff *skb)
+{
+	u8 mac_da[ETH_ALEN], dsap;
+	int rc = 1;
+	u32 data_size;
+	struct sk_buff *nskb;
+
+	/* The test request command is type U (llc_len = 3) */
+	/* NOTE(review): the length comes straight from the received frame;
+	 * a value below 3 wraps data_size - confirm callers validate it.
+	 */
+	data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
+	nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);
+
+	if (!nskb)
+		goto out;
+	/* The sender's address and SAP become the response's destination. */
+	llc_pdu_decode_sa(skb, mac_da);
+	llc_pdu_decode_ssap(skb, &dsap);
+	llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
+	llc_pdu_init_as_test_rsp(nskb, skb);
+	rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da);
+	if (unlikely(rc))
+		goto free;
+	llc_station_send_pdu(nskb);
+out:
+	return rc;
+free:
+	/* Release the response built here, not the received PDU: freeing
+	 * @skb would leak nskb and leave the caller with a freed event.
+	 */
+	kfree_skb(nskb);
+	goto out;
+}
+
+/* Action: status report placeholder; does nothing and returns 0. */
+static int llc_station_ac_report_status(struct sk_buff *skb)
+{
+	return 0;
+}
+
+/* COMMON STATION STATE transitions */
+
+/* Sentinel placed last in every per-state transition list. It is never
+ * initialised explicitly, so all members are zero (.bss zeroes it); the
+ * NULL 'ev' member is what ends the search in llc_find_station_trans().
+ */
+static struct llc_station_state_trans llc_stat_state_trans_end;
+
+/* DOWN STATE transitions */
+
+/* state transition for LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK event */
+static llc_station_action_t llc_stat_down_state_actions_1[] = {
+	[0] = llc_station_ac_start_ack_timer,
+	[1] = llc_station_ac_set_retry_cnt_0,
+	[2] = llc_station_ac_set_xid_r_cnt_0,
+	[3] = llc_station_ac_send_null_dsap_xid_c,
+	[4] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_down_state_trans_1 = {
+	.ev	    = llc_stat_ev_enable_with_dup_addr_check,
+	.next_state = LLC_STATION_STATE_DUP_ADDR_CHK,
+	.ev_actions = llc_stat_down_state_actions_1,
+};
+
+/* state transition for LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK event */
+static llc_station_action_t llc_stat_down_state_actions_2[] = {
+	[0] = llc_station_ac_report_status,	/* STATION UP */
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_down_state_trans_2 = {
+	.ev	    = llc_stat_ev_enable_without_dup_addr_check,
+	.next_state = LLC_STATION_STATE_UP,
+	.ev_actions = llc_stat_down_state_actions_2,
+};
+
+/* array of pointers; one to each transition. Entries are tried in order
+ * and the first whose event test matches is taken.
+ */
+static struct llc_station_state_trans *llc_stat_dwn_state_trans[] = {
+	[0] = &llc_stat_down_state_trans_1,
+	[1] = &llc_stat_down_state_trans_2,
+	[2] = &llc_stat_state_trans_end,
+};
+
+/* UP STATE transitions */
+/* state transition for LLC_STATION_EV_DISABLE_REQ event */
+static llc_station_action_t llc_stat_up_state_actions_1[] = {
+	[0] = llc_station_ac_report_status,	/* STATION DOWN */
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_up_state_trans_1 = {
+	.ev	    = llc_stat_ev_disable_req,
+	.next_state = LLC_STATION_STATE_DOWN,
+	.ev_actions = llc_stat_up_state_actions_1,
+};
+
+/* state transition for LLC_STATION_EV_RX_NULL_DSAP_XID_C event */
+static llc_station_action_t llc_stat_up_state_actions_2[] = {
+	[0] = llc_station_ac_send_xid_r,
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_up_state_trans_2 = {
+	.ev	    = llc_stat_ev_rx_null_dsap_xid_c,
+	.next_state = LLC_STATION_STATE_UP,
+	.ev_actions = llc_stat_up_state_actions_2,
+};
+
+/* state transition for LLC_STATION_EV_RX_NULL_DSAP_TEST_C event */
+static llc_station_action_t llc_stat_up_state_actions_3[] = {
+	[0] = llc_station_ac_send_test_r,
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_up_state_trans_3 = {
+	.ev	    = llc_stat_ev_rx_null_dsap_test_c,
+	.next_state = LLC_STATION_STATE_UP,
+	.ev_actions = llc_stat_up_state_actions_3,
+};
+
+/* array of pointers; one to each transition. Entries are tried in order
+ * and the first whose event test matches is taken.
+ */
+static struct llc_station_state_trans *llc_stat_up_state_trans [] = {
+	[0] = &llc_stat_up_state_trans_1,
+	[1] = &llc_stat_up_state_trans_2,
+	[2] = &llc_stat_up_state_trans_3,
+	[3] = &llc_stat_state_trans_end,
+};
+
+/* DUP ADDR CHK STATE transitions */
+/* state transition for LLC_STATION_EV_RX_NULL_DSAP_0_XID_R_XID_R_CNT_EQ
+ * event: first XID response seen, count it and stay in this state
+ */
+static llc_station_action_t llc_stat_dupaddr_state_actions_1[] = {
+	[0] = llc_station_ac_inc_xid_r_cnt_by_1,
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_dupaddr_state_trans_1 = {
+	.ev	    = llc_stat_ev_rx_null_dsap_0_xid_r_xid_r_cnt_eq,
+	.next_state = LLC_STATION_STATE_DUP_ADDR_CHK,
+	.ev_actions = llc_stat_dupaddr_state_actions_1,
+};
+
+/* state transition for LLC_STATION_EV_RX_NULL_DSAP_1_XID_R_XID_R_CNT_EQ
+ * event: a second XID response, go back to DOWN
+ */
+static llc_station_action_t llc_stat_dupaddr_state_actions_2[] = {
+	[0] = llc_station_ac_report_status,	/* DUPLICATE ADDRESS FOUND */
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_dupaddr_state_trans_2 = {
+	.ev	    = llc_stat_ev_rx_null_dsap_1_xid_r_xid_r_cnt_eq,
+	.next_state = LLC_STATION_STATE_DOWN,
+	.ev_actions = llc_stat_dupaddr_state_actions_2,
+};
+
+/* state transition for LLC_STATION_EV_RX_NULL_DSAP_XID_C event */
+static llc_station_action_t llc_stat_dupaddr_state_actions_3[] = {
+	[0] = llc_station_ac_send_xid_r,
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_dupaddr_state_trans_3 = {
+	.ev	    = llc_stat_ev_rx_null_dsap_xid_c,
+	.next_state = LLC_STATION_STATE_DUP_ADDR_CHK,
+	.ev_actions = llc_stat_dupaddr_state_actions_3,
+};
+
+/* state transition for LLC_STATION_EV_ACK_TMR_EXP_LT_RETRY_CNT_MAX_RETRY
+ * event: retries left, restart the timer and send the XID command again
+ */
+static llc_station_action_t llc_stat_dupaddr_state_actions_4[] = {
+	[0] = llc_station_ac_start_ack_timer,
+	[1] = llc_station_ac_inc_retry_cnt_by_1,
+	[2] = llc_station_ac_set_xid_r_cnt_0,
+	[3] = llc_station_ac_send_null_dsap_xid_c,
+	[4] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_dupaddr_state_trans_4 = {
+	.ev	    = llc_stat_ev_ack_tmr_exp_lt_retry_cnt_max_retry,
+	.next_state = LLC_STATION_STATE_DUP_ADDR_CHK,
+	.ev_actions = llc_stat_dupaddr_state_actions_4,
+};
+
+/* state transition for LLC_STATION_EV_ACK_TMR_EXP_EQ_RETRY_CNT_MAX_RETRY
+ * event: retries exhausted, the station goes UP
+ */
+static llc_station_action_t llc_stat_dupaddr_state_actions_5[] = {
+	[0] = llc_station_ac_report_status,	/* STATION UP */
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_dupaddr_state_trans_5 = {
+	.ev	    = llc_stat_ev_ack_tmr_exp_eq_retry_cnt_max_retry,
+	.next_state = LLC_STATION_STATE_UP,
+	.ev_actions = llc_stat_dupaddr_state_actions_5,
+};
+
+/* state transition for LLC_STATION_EV_DISABLE_REQ event */
+static llc_station_action_t llc_stat_dupaddr_state_actions_6[] = {
+	[0] = llc_station_ac_report_status,	/* STATION DOWN */
+	[1] = NULL,
+};
+
+static struct llc_station_state_trans llc_stat_dupaddr_state_trans_6 = {
+	.ev	    = llc_stat_ev_disable_req,
+	.next_state = LLC_STATION_STATE_DOWN,
+	.ev_actions = llc_stat_dupaddr_state_actions_6,
+};
+
+/* array of pointers; one to each transition. Entries are tried in order
+ * and the first whose event test matches is taken, so the order below
+ * (request, timer, received frame) is the matching priority.
+ */
+static struct llc_station_state_trans *llc_stat_dupaddr_state_trans[] = {
+	[0] = &llc_stat_dupaddr_state_trans_6,	/* Request */
+	[1] = &llc_stat_dupaddr_state_trans_4,	/* Timer */
+	[2] = &llc_stat_dupaddr_state_trans_5,
+	[3] = &llc_stat_dupaddr_state_trans_1,	/* Receive frame */
+	[4] = &llc_stat_dupaddr_state_trans_2,
+	[5] = &llc_stat_dupaddr_state_trans_3,
+	[6] = &llc_stat_state_trans_end,
+};
+
+/* Station state table, indexed by (state - 1) because state numbers start
+ * at 1. Each entry points at the transition list searched while the
+ * station is in that state.
+ */
+static struct llc_station_state
+			llc_station_state_table[LLC_NBR_STATION_STATES] = {
+	[LLC_STATION_STATE_DOWN - 1] = {
+		.curr_state  = LLC_STATION_STATE_DOWN,
+		.transitions = llc_stat_dwn_state_trans,
+	},
+	[LLC_STATION_STATE_DUP_ADDR_CHK - 1] = {
+		.curr_state  = LLC_STATION_STATE_DUP_ADDR_CHK,
+		.transitions = llc_stat_dupaddr_state_trans,
+	},
+	[LLC_STATION_STATE_UP - 1] = {
+		.curr_state  = LLC_STATION_STATE_UP,
+		.transitions = llc_stat_up_state_trans,
+	},
+};
+
+/**
+ *	llc_exec_station_trans_actions - executes actions for transition
+ *	@trans: Address of the transition
+ *	@skb: Address of the event that caused the transition
+ *
+ *	Runs every action in the transition's NULL-terminated action list,
+ *	even after one of them has failed. Returns 0 if all actions returned
+ *	0 (or the transition has no action list), 1 otherwise.
+ */
+static u16 llc_exec_station_trans_actions(struct llc_station_state_trans *trans,
+					  struct sk_buff *skb)
+{
+	llc_station_action_t *action = trans->ev_actions;
+	u16 failed = 0;
+
+	if (!action)
+		return 0;
+
+	while (*action) {
+		if ((*action)(skb))
+			failed = 1;
+		action++;
+	}
+	return failed;
+}
+
+/**
+ *	llc_find_station_trans - finds transition for this event
+ *	@skb: Address of the event
+ *
+ *	Walks the transition list of the station's current state in order and
+ *	returns the first transition whose event test returns 0 for @skb.
+ *	The walk ends at the list's sentinel entry (NULL event test).
+ *	Returns the address of the transition if found, %NULL otherwise.
+ */
+static struct llc_station_state_trans *
+				llc_find_station_trans(struct sk_buff *skb)
+{
+	struct llc_station_state_trans **cand =
+		llc_station_state_table[llc_main_station.state - 1].transitions;
+
+	for (; (*cand)->ev; cand++)
+		if (!(*cand)->ev(skb))
+			return *cand;
+	return NULL;
+}
+
+/**
+ *	llc_station_free_ev - frees an event
+ *	@skb: Address of the event
+ *
+ *	Every event handled by the station state machine is carried in an
+ *	skb: received PDUs, and the zero-length skbs allocated for ack timer
+ *	expiry and for the initial enable request. Once an event has been
+ *	processed nothing else holds on to its skb, so it is freed here
+ *	whatever its type. Freeing only PDU events leaked the skb of every
+ *	timer and enable event.
+ */
+static void llc_station_free_ev(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+/**
+ *	llc_station_next_state - processes event and goes to the next state
+ *	@skb: Address of the event
+ *
+ *	Looks up the transition matching the event in the current state, runs
+ *	its actions and, only if all of them succeed, moves the station to
+ *	the transition's next state. The event is handed to
+ *	llc_station_free_ev() on every path.
+ *
+ *	Returns 0 when the actions succeeded or when no transition matches
+ *	the event in the current state (the event is simply dropped), and 1
+ *	when an action failed or the current state number is out of range.
+ */
+static u16 llc_station_next_state(struct sk_buff *skb)
+{
+	struct llc_station_state_trans *trans;
+	u16 rc;
+
+	if (llc_main_station.state > LLC_NBR_STATION_STATES) {
+		rc = 1;
+	} else {
+		trans = llc_find_station_trans(skb);
+		if (!trans) {
+			/* event not recognized in current state */
+			rc = 0;
+		} else {
+			/* perform the actions tied to this transition before
+			 * actually switching state, and switch only if every
+			 * action succeeded
+			 */
+			rc = llc_exec_station_trans_actions(trans, skb);
+			if (!rc)
+				llc_main_station.state = trans->next_state;
+		}
+	}
+	llc_station_free_ev(skb);
+	return rc;
+}
+
+/**
+ *	llc_station_service_events - service events in the queue
+ *
+ *	Removes events from the station event queue one at a time, in queue
+ *	order, and runs each through llc_station_next_state() until the queue
+ *	is empty. Events are never put back on the queue.
+ *	Caller must hold llc_main_station.ev_q.lock.
+ */
+static void llc_station_service_events(void)
+{
+	struct sk_buff *ev_skb;
+
+	for (;;) {
+		ev_skb = skb_dequeue(&llc_main_station.ev_q.list);
+		if (!ev_skb)
+			break;
+		llc_station_next_state(ev_skb);
+	}
+}
+
+/**
+ *	llc_station_state_process: queue event and try to process queue.
+ *	@skb: Address of the event
+ *
+ *	Queues an event (on the station event queue) for handling by the
+ *	station state machine and attempts to process any queued-up events.
+ *	Both steps run under llc_main_station.ev_q.lock with bottom halves
+ *	disabled, and the queue is drained before the lock is dropped, so
+ *	@skb has been processed by the time this returns.
+ */
+static void llc_station_state_process(struct sk_buff *skb)
+{
+	spin_lock_bh(&llc_main_station.ev_q.lock);
+	skb_queue_tail(&llc_main_station.ev_q.list, skb);
+	llc_station_service_events();
+	spin_unlock_bh(&llc_main_station.ev_q.lock);
+}
+
+/*
+ *	llc_station_ack_tmr_cb - station ack timer callback
+ *	@timeout_data: timer argument; unused
+ *
+ *	Allocates an empty skb, marks it as an ACK_TMR event and feeds it to
+ *	the station state machine. The expiry is silently dropped if the skb
+ *	cannot be allocated.
+ */
+static void llc_station_ack_tmr_cb(unsigned long timeout_data)
+{
+	struct sk_buff *ev_skb = alloc_skb(0, GFP_ATOMIC);
+
+	if (!ev_skb)
+		return;
+
+	llc_station_ev(ev_skb)->type = LLC_STATION_EV_TYPE_ACK_TMR;
+	llc_station_state_process(ev_skb);
+}
+
+/*
+ *	llc_station_rcv - send received pdu to the station state machine
+ *	@skb: received frame.
+ *
+ *	Tags the frame's event record as a PDU event, clears its reason and
+ *	sends the data unit to the station state machine. Installed as the
+ *	station handler by llc_station_init().
+ */
+static void llc_station_rcv(struct sk_buff *skb)
+{
+	struct llc_station_state_ev *ev = llc_station_ev(skb);
+
+	ev->type   = LLC_STATION_EV_TYPE_PDU;
+	ev->reason = 0;
+	llc_station_state_process(skb);
+}
+
+/*
+ *	llc_station_init - set up the station component
+ *
+ *	Initialises the station queues, lock and ack timer, registers
+ *	llc_station_rcv() as the station handler, and then runs an "enable
+ *	without duplicate address check" event through the state machine,
+ *	starting from the DOWN state.
+ *
+ *	Returns -ENOBUFS if the event skb cannot be allocated, otherwise the
+ *	result of llc_station_next_state() (0 on success).
+ */
+int __init llc_station_init(void)
+{
+	struct llc_station_state_ev *ev;
+	struct sk_buff *skb;
+
+	skb_queue_head_init(&llc_main_station.mac_pdu_q);
+	skb_queue_head_init(&llc_main_station.ev_q.list);
+	spin_lock_init(&llc_main_station.ev_q.lock);
+	setup_timer(&llc_main_station.ack_timer, llc_station_ack_tmr_cb,
+			(unsigned long)&llc_main_station);
+	llc_main_station.ack_timer.expires = jiffies +
+					     sysctl_llc_station_ack_timeout;
+
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb)
+		return -ENOBUFS;
+
+	llc_set_station_handler(llc_station_rcv);
+
+	/* Build the enable event in the skb's control buffer. */
+	ev = llc_station_ev(skb);
+	memset(ev, 0, sizeof(*ev));
+	llc_main_station.maximum_retry = 1;
+	llc_main_station.state = LLC_STATION_STATE_DOWN;
+	ev->type = LLC_STATION_EV_TYPE_SIMPLE;
+	ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK;
+
+	return llc_station_next_state(skb);
+}
+
+/* Unregisters the station handler installed by llc_station_init(). */
+void __exit llc_station_exit(void)
+{
+	llc_set_station_handler(NULL);
+}
diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c
new file mode 100644
index 00000000..e2ebe358
--- /dev/null
+++ b/net/llc/sysctl_net_llc.c
@@ -0,0 +1,103 @@
+/*
+ * sysctl_net_llc.c: sysctl interface to LLC net subsystem.
+ *
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/llc.h>
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+/* LLC2 timeouts, exposed as net.llc.llc2.timeout.{ack,busy,p,rej}.
+ *
+ * The backing variables are ints (declared in <net/llc.h>) and
+ * proc_dointvec_jiffies() handles maxlen / sizeof(int) values, so maxlen
+ * must be sizeof(int). With sizeof(long) a 64-bit kernel would read and
+ * write one int beyond each variable.
+ */
+static struct ctl_table llc2_timeout_table[] = {
+	{
+		.procname	= "ack",
+		.data		= &sysctl_llc2_ack_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler   = proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "busy",
+		.data		= &sysctl_llc2_busy_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler   = proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "p",
+		.data		= &sysctl_llc2_p_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler   = proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "rej",
+		.data		= &sysctl_llc2_rej_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler   = proc_dointvec_jiffies,
+	},
+	{ },
+};
+
+/* Station ack timeout, exposed as net.llc.station.ack_timeout.
+ *
+ * sysctl_llc_station_ack_timeout is an int and proc_dointvec_jiffies()
+ * handles maxlen / sizeof(int) values, so maxlen must be sizeof(int). With
+ * sizeof(long) a 64-bit kernel would read and write one int beyond the
+ * variable.
+ */
+static struct ctl_table llc_station_table[] = {
+	{
+		.procname	= "ack_timeout",
+		.data		= &sysctl_llc_station_ack_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler   = proc_dointvec_jiffies,
+	},
+	{ },
+};
+
+/* "timeout" directory holding the LLC2 timeout entries. */
+static struct ctl_table llc2_dir_timeout_table[] = {
+	{
+		.procname	= "timeout",
+		.mode		= 0555,
+		.child		= llc2_timeout_table,
+	},
+	{ },
+};
+
+/* Top-level directories registered below the LLC sysctl path: "llc2" and
+ * "station".
+ */
+static struct ctl_table llc_table[] = {
+	{
+		.procname	= "llc2",
+		.mode		= 0555,
+		.child		= llc2_dir_timeout_table,
+	},
+	{
+		.procname       = "station",
+		.mode           = 0555,
+		.child          = llc_station_table,
+	},
+	{ },
+};
+
+/* Path the tables are registered under: net/llc. */
+static struct ctl_path llc_path[] = {
+	{ .procname = "net", },
+	{ .procname = "llc", },
+	{ }
+};
+
+/* Registration handle; NULL while the tables are not registered. */
+static struct ctl_table_header *llc_table_header;
+
+/* Registers the LLC sysctl tables under net/llc.
+ * Returns 0 on success, -ENOMEM if registration fails.
+ */
+int __init llc_sysctl_init(void)
+{
+	llc_table_header = register_sysctl_paths(llc_path, llc_table);
+	if (!llc_table_header)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Unregisters the LLC sysctl tables if they are registered; safe to call
+ * more than once.
+ */
+void llc_sysctl_exit(void)
+{
+	if (!llc_table_header)
+		return;
+
+	unregister_sysctl_table(llc_table_header);
+	llc_table_header = NULL;
+}
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
new file mode 100644
index 00000000..f5fdfcbf
--- /dev/null
+++ b/net/mac80211/Kconfig
@@ -0,0 +1,236 @@
+config MAC80211
+	tristate "Generic IEEE 802.11 Networking Stack (mac80211)"
+	depends on CFG80211
+	select CRYPTO
+	select CRYPTO_ARC4
+	select CRYPTO_AES
+	select CRC32
+	select AVERAGE
+	---help---
+	  This option enables the hardware independent IEEE 802.11
+	  networking stack.
+
+comment "CFG80211 needs to be enabled for MAC80211"
+	depends on CFG80211=n
+
+if MAC80211 != n
+
+config MAC80211_HAS_RC
+	bool
+
+config MAC80211_RC_PID
+	bool "PID controller based rate control algorithm" if EXPERT
+	select MAC80211_HAS_RC
+	---help---
+	  This option enables a TX rate control algorithm for
+	  mac80211 that uses a PID controller to select the TX
+	  rate.
+
+config MAC80211_RC_MINSTREL
+	bool "Minstrel" if EXPERT
+	select MAC80211_HAS_RC
+	default y
+	---help---
+	  This option enables the 'minstrel' TX rate control algorithm
+
+config MAC80211_RC_MINSTREL_HT
+	bool "Minstrel 802.11n support" if EXPERT
+	depends on MAC80211_RC_MINSTREL
+	default y
+	---help---
+	  This option enables the 'minstrel_ht' TX rate control algorithm
+
+choice
+	prompt "Default rate control algorithm"
+	depends on MAC80211_HAS_RC
+	default MAC80211_RC_DEFAULT_MINSTREL
+	---help---
+	  This option selects the default rate control algorithm
+	  mac80211 will use. Note that this default can still be
+	  overridden through the ieee80211_default_rc_algo module
+	  parameter if different algorithms are available.
+
+config MAC80211_RC_DEFAULT_PID
+	bool "PID controller based rate control algorithm"
+	depends on MAC80211_RC_PID
+	---help---
+	  Select the PID controller based rate control as the
+	  default rate control algorithm. You should choose
+	  this unless you know what you are doing.
+
+config MAC80211_RC_DEFAULT_MINSTREL
+	bool "Minstrel"
+	depends on MAC80211_RC_MINSTREL
+	---help---
+	  Select Minstrel as the default rate control algorithm.
+
+
+endchoice
+
+config MAC80211_RC_DEFAULT
+	string
+	default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT
+	default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
+	default "pid" if MAC80211_RC_DEFAULT_PID
+	default ""
+
+endif
+
+comment "Some wireless drivers require a rate control algorithm"
+	depends on MAC80211 && MAC80211_HAS_RC=n
+
+config MAC80211_MESH
+	bool "Enable mac80211 mesh networking (pre-802.11s) support"
+	depends on MAC80211 && EXPERIMENTAL
+	---help---
+	 This option enables support of Draft 802.11s mesh networking.
+	 The implementation is based on Draft 2.08 of the Mesh Networking
+	 amendment.  However, no compliance with that draft is claimed or even
+	 possible, as drafts leave a number of identifiers to be defined after
+	 ratification.  For more information visit http://o11s.org/.
+
+config MAC80211_LEDS
+	bool "Enable LED triggers"
+	depends on MAC80211
+	depends on LEDS_CLASS
+	select LEDS_TRIGGERS
+	---help---
+	  This option enables a few LED triggers for different
+	  packet receive/transmit events.
+
+config MAC80211_DEBUGFS
+	bool "Export mac80211 internals in DebugFS"
+	depends on MAC80211 && DEBUG_FS
+	---help---
+	  Select this to see extensive information about
+	  the internal state of mac80211 in debugfs.
+
+	  Say N unless you know you need this.
+
+menuconfig MAC80211_DEBUG_MENU
+	bool "Select mac80211 debugging features"
+	depends on MAC80211
+	---help---
+	  This option collects various mac80211 debug settings.
+
+config MAC80211_NOINLINE
+	bool "Do not inline TX/RX handlers"
+	depends on MAC80211_DEBUG_MENU
+	---help---
+	  This option affects code generation in mac80211, when
+	  selected some functions are marked "noinline" to allow
+	  easier debugging of problems in the transmit and receive
+	  paths.
+
+	  This option increases code size a bit and inserts a lot
+	  of function calls in the code, but is otherwise safe to
+	  enable.
+
+	  If unsure, say N unless you expect to be finding problems
+	  in mac80211.
+
+config MAC80211_VERBOSE_DEBUG
+	bool "Verbose debugging output"
+	depends on MAC80211_DEBUG_MENU
+	---help---
+	  Selecting this option causes mac80211 to print out
+	  many debugging messages. It should not be selected
+	  on production systems as some of the messages are
+	  remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_HT_DEBUG
+	bool "Verbose HT debugging"
+	depends on MAC80211_DEBUG_MENU
+	---help---
+	  This option enables 802.11n High Throughput features
+	  debug tracing output.
+
+	  It should not be selected on production systems as some
+	  of the messages are remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_TKIP_DEBUG
+	bool "Verbose TKIP debugging"
+	depends on MAC80211_DEBUG_MENU
+	---help---
+	  Selecting this option causes mac80211 to print out
+	  very verbose TKIP debugging messages. It should not
+	  be selected on production systems as those messages
+	  are remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_IBSS_DEBUG
+	bool "Verbose IBSS debugging"
+	depends on MAC80211_DEBUG_MENU
+	---help---
+	  Selecting this option causes mac80211 to print out
+	  very verbose IBSS debugging messages. It should not
+	  be selected on production systems as those messages
+	  are remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_VERBOSE_PS_DEBUG
+	bool "Verbose powersave mode debugging"
+	depends on MAC80211_DEBUG_MENU
+	---help---
+	  Selecting this option causes mac80211 to print out very
+	  verbose power save mode debugging messages (when mac80211
+	  is an AP and has power saving stations.)
+	  It should not be selected on production systems as those
+	  messages are remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_VERBOSE_MPL_DEBUG
+	bool "Verbose mesh peer link debugging"
+	depends on MAC80211_DEBUG_MENU
+	depends on MAC80211_MESH
+	---help---
+	  Selecting this option causes mac80211 to print out very
+	  verbose mesh peer link debugging messages (when mac80211
+	  is taking part in a mesh network).
+	  It should not be selected on production systems as those
+	  messages are remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_VERBOSE_MHWMP_DEBUG
+	bool "Verbose mesh HWMP routing debugging"
+	depends on MAC80211_DEBUG_MENU
+	depends on MAC80211_MESH
+	---help---
+	  Selecting this option causes mac80211 to print out very
+	  verbose mesh routing (HWMP) debugging messages (when mac80211
+	  is taking part in a mesh network).
+	  It should not be selected on production systems as those
+	  messages are remotely triggerable.
+
+	  Do not select this option.
+
+config MAC80211_DEBUG_COUNTERS
+	bool "Extra statistics for TX/RX debugging"
+	depends on MAC80211_DEBUG_MENU
+	depends on MAC80211_DEBUGFS
+	---help---
+	  Selecting this option causes mac80211 to keep additional
+	  and very verbose statistics about TX and RX handler use
+	  and show them in debugfs.
+
+	  If unsure, say N.
+
+config MAC80211_DRIVER_API_TRACER
+	bool "Driver API tracer"
+	depends on MAC80211_DEBUG_MENU
+	depends on EVENT_TRACING
+	help
+	  Say Y here to make mac80211 register with the ftrace
+	  framework for the driver API -- you can then see which
+	  driver methods it is calling and which API functions
+	  drivers are calling by looking at the trace.
+
+	  If unsure, say Y.
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
new file mode 100644
index 00000000..fdb54e61
--- /dev/null
+++ b/net/mac80211/Makefile
@@ -0,0 +1,61 @@
+obj-$(CONFIG_MAC80211) += mac80211.o
+
+# mac80211 objects
+mac80211-y := \
+	main.o status.o \
+	sta_info.o \
+	wep.o \
+	wpa.o \
+	scan.o offchannel.o \
+	ht.o agg-tx.o agg-rx.o \
+	ibss.o \
+	mlme.o work.o \
+	iface.o \
+	rate.o \
+	michael.o \
+	tkip.o \
+	aes_ccm.o \
+	aes_cmac.o \
+	cfg.o \
+	rx.o \
+	spectmgmt.o \
+	tx.o \
+	key.o \
+	util.o \
+	wme.o \
+	event.o \
+	chan.o
+
+mac80211-$(CONFIG_MAC80211_LEDS) += led.o
+mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
+	debugfs.o \
+	debugfs_sta.o \
+	debugfs_netdev.o \
+	debugfs_key.o
+
+mac80211-$(CONFIG_MAC80211_MESH) += \
+	mesh.o \
+	mesh_pathtbl.o \
+	mesh_plink.o \
+	mesh_hwmp.o
+
+mac80211-$(CONFIG_PM) += pm.o
+
+mac80211-$(CONFIG_MAC80211_DRIVER_API_TRACER) += driver-trace.o
+CFLAGS_driver-trace.o := -I$(src)
+
+# objects for PID algorithm
+rc80211_pid-y := rc80211_pid_algo.o
+rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o
+
+rc80211_minstrel-y := rc80211_minstrel.o
+rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o
+
+rc80211_minstrel_ht-y := rc80211_minstrel_ht.o
+rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
+
+mac80211-$(CONFIG_MAC80211_RC_PID) += $(rc80211_pid-y)
+mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
+mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
+
+ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
new file mode 100644
index 00000000..b9b595c0
--- /dev/null
+++ b/net/mac80211/aes_ccm.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2003-2004, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+
+#include <net/mac80211.h>
+#include "key.h"
+#include "aes_ccm.h"
+
+static void aes_ccm_prepare(struct crypto_cipher *tfm, u8 *scratch, u8 *a)
+{
+	int i;
+	u8 *b_0, *aad, *b, *s_0;
+
+	b_0 = scratch + 3 * AES_BLOCK_LEN;	/* read below before any write */
+	aad = scratch + 4 * AES_BLOCK_LEN;	/* two blocks, read likewise */
+	b = scratch;				/* intermediate cipher output */
+	s_0 = scratch + AES_BLOCK_LEN;		/* written on the last line */
+
+	crypto_cipher_encrypt_one(tfm, b, b_0);
+
+	/* Extra Authenticate-only data (always two AES blocks) */
+	for (i = 0; i < AES_BLOCK_LEN; i++)
+		aad[i] ^= b[i];
+	crypto_cipher_encrypt_one(tfm, b, aad);
+
+	aad += AES_BLOCK_LEN;
+
+	for (i = 0; i < AES_BLOCK_LEN; i++)
+		aad[i] ^= b[i];
+	crypto_cipher_encrypt_one(tfm, a, aad);	/* chained result goes to a */
+
+	/* Mask out bits from auth-only-b_0 */
+	b_0[0] &= 0x07;
+
+	/* S_0 is used to encrypt T (= MIC) */
+	b_0[14] = 0;
+	b_0[15] = 0;
+	crypto_cipher_encrypt_one(tfm, s_0, b_0);
+}
+
+/* CCM-encrypt data into cdata; the CCMP_MIC_LEN-byte tag goes to mic. */
+void ieee80211_aes_ccm_encrypt(struct crypto_cipher *tfm, u8 *scratch,
+			       u8 *data, size_t data_len,
+			       u8 *cdata, u8 *mic)
+{
+	int i, j, last_len, num_blocks;
+	u8 *pos, *cpos, *b, *s_0, *e, *b_0;
+
+	b = scratch;
+	s_0 = scratch + AES_BLOCK_LEN;
+	e = scratch + 2 * AES_BLOCK_LEN;
+	b_0 = scratch + 3 * AES_BLOCK_LEN;
+
+	num_blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
+	last_len = data_len % AES_BLOCK_LEN;	/* 0 if the last block is full */
+	aes_ccm_prepare(tfm, scratch, b);	/* authentication state in b */
+
+	/* Process payload blocks */
+	pos = data;
+	cpos = cdata;
+	for (j = 1; j <= num_blocks; j++) {
+		int blen = (j == num_blocks && last_len) ?
+			last_len : AES_BLOCK_LEN;
+
+		/* Authentication followed by encryption */
+		for (i = 0; i < blen; i++)
+			b[i] ^= pos[i];
+		crypto_cipher_encrypt_one(tfm, b, b);
+
+		b_0[14] = (j >> 8) & 0xff;	/* block counter, high byte */
+		b_0[15] = j & 0xff;		/* block counter, low byte */
+		crypto_cipher_encrypt_one(tfm, e, b_0);
+		for (i = 0; i < blen; i++)
+			*cpos++ = *pos++ ^ e[i];
+	}
+
+	for (i = 0; i < CCMP_MIC_LEN; i++)
+		mic[i] = b[i] ^ s_0[i];
+}
+
+/* CCM-decrypt cdata into data; returns 0 if mic matches, -1 otherwise. */
+int ieee80211_aes_ccm_decrypt(struct crypto_cipher *tfm, u8 *scratch,
+			      u8 *cdata, size_t data_len, u8 *mic, u8 *data)
+{
+	int i, j, last_len, num_blocks;
+	u8 *pos, *cpos, *b, *s_0, *a, *b_0, diff = 0;
+
+	b = scratch;
+	s_0 = scratch + AES_BLOCK_LEN;
+	a = scratch + 2 * AES_BLOCK_LEN;
+	b_0 = scratch + 3 * AES_BLOCK_LEN;
+
+	num_blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
+	last_len = data_len % AES_BLOCK_LEN;	/* 0 if the last block is full */
+	aes_ccm_prepare(tfm, scratch, a);	/* authentication state in a */
+
+	/* Process payload blocks */
+	cpos = cdata;
+	pos = data;
+	for (j = 1; j <= num_blocks; j++) {
+		int blen = (j == num_blocks && last_len) ?
+			last_len : AES_BLOCK_LEN;
+
+		/* Decryption followed by authentication */
+		b_0[14] = (j >> 8) & 0xff;
+		b_0[15] = j & 0xff;
+		crypto_cipher_encrypt_one(tfm, b, b_0);
+		for (i = 0; i < blen; i++) {
+			*pos = *cpos++ ^ b[i];
+			a[i] ^= *pos++;
+		}
+		crypto_cipher_encrypt_one(tfm, a, a);
+	}
+
+	/* Fold all MIC bytes into diff: no early exit on the first
+	 * mismatch, so timing does not leak the matching prefix length. */
+	for (i = 0; i < CCMP_MIC_LEN; i++)
+		diff |= mic[i] ^ s_0[i] ^ a[i];
+
+	return diff ? -1 : 0;
+}
+
+/* Allocate an "aes" cipher keyed with key; may return an ERR_PTR value. */
+struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[])
+{
+	struct crypto_cipher *tfm;
+
+	tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	if (!IS_ERR(tfm))	/* NOTE(review): setkey result is not checked */
+		crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
+
+	return tfm;
+}
+
+/* Release a cipher handle; thin wrapper around crypto_free_cipher(). */
+void ieee80211_aes_key_free(struct crypto_cipher *tfm)
+{
+	crypto_free_cipher(tfm);
+}
diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h
new file mode 100644
index 00000000..6e7820ef
--- /dev/null
+++ b/net/mac80211/aes_ccm.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2003-2004, Instant802 Networks, Inc.
+ * Copyright 2006, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef AES_CCM_H
+#define AES_CCM_H
+
+#include <linux/crypto.h>
+
+#define AES_BLOCK_LEN 16
+
+struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[]);
+void ieee80211_aes_ccm_encrypt(struct crypto_cipher *tfm, u8 *scratch,
+			       u8 *data, size_t data_len,
+			       u8 *cdata, u8 *mic);
+int ieee80211_aes_ccm_decrypt(struct crypto_cipher *tfm, u8 *scratch,
+			      u8 *cdata, size_t data_len,
+			      u8 *mic, u8 *data);
+void ieee80211_aes_key_free(struct crypto_cipher *tfm);
+
+#endif /* AES_CCM_H */
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
new file mode 100644
index 00000000..d502b268
--- /dev/null
+++ b/net/mac80211/aes_cmac.c
@@ -0,0 +1,132 @@
+/*
+ * AES-128-CMAC with TLen 16 for IEEE 802.11w BIP
+ * Copyright 2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+
+#include <net/mac80211.h>
+#include "key.h"
+#include "aes_cmac.h"
+
+#define AES_BLOCK_SIZE 16
+#define AES_CMAC_KEY_LEN 16
+#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */
+#define AAD_LEN 20
+
+/* Shift the 16-byte pad left one bit; XOR 0x87 in if the top bit was set. */
+static void gf_mulx(u8 *pad)
+{
+	int i, carry;
+
+	carry = pad[0] & 0x80;		/* bit that the shift pushes out */
+	for (i = 0; i < AES_BLOCK_SIZE - 1; i++)
+		pad[i] = (pad[i] << 1) | (pad[i + 1] >> 7);
+	pad[AES_BLOCK_SIZE - 1] <<= 1;
+	if (carry)
+		pad[AES_BLOCK_SIZE - 1] ^= 0x87;
+}
+
+/* CMAC over num_elem (>= 1) buffers addr[]/len[]; CMAC_TLEN bytes to mac. */
+static void aes_128_cmac_vector(struct crypto_cipher *tfm, u8 *scratch,
+				size_t num_elem,
+				const u8 *addr[], const size_t *len, u8 *mac)
+{
+	u8 *cbc, *pad;
+	const u8 *pos, *end;
+	size_t i, e, left, total_len;
+
+	cbc = scratch;
+	pad = scratch + AES_BLOCK_SIZE;
+
+	memset(cbc, 0, AES_BLOCK_SIZE);
+
+	total_len = 0;
+	for (e = 0; e < num_elem; e++)
+		total_len += len[e];
+	left = total_len;
+	/* Advance lazily (just before each read) so addr[e] stays in range. */
+	e = 0;
+	pos = addr[0];
+	end = pos + len[0];
+
+	while (left >= AES_BLOCK_SIZE) {
+		for (i = 0; i < AES_BLOCK_SIZE; i++) {
+			while (pos >= end) {	/* a byte is still owed here */
+				e++;
+				pos = addr[e];
+				end = pos + len[e];
+			}
+			cbc[i] ^= *pos++;
+		}
+		if (left > AES_BLOCK_SIZE)
+			crypto_cipher_encrypt_one(tfm, cbc, cbc);
+		left -= AES_BLOCK_SIZE;
+	}
+
+	memset(pad, 0, AES_BLOCK_SIZE);
+	crypto_cipher_encrypt_one(tfm, pad, pad);
+	gf_mulx(pad);
+
+	if (left || total_len == 0) {
+		for (i = 0; i < left; i++) {
+			while (pos >= end) {	/* also skips empty elements */
+				e++;
+				pos = addr[e];
+				end = pos + len[e];
+			}
+			cbc[i] ^= *pos++;
+		}
+		cbc[left] ^= 0x80;	/* padding marker after the last byte */
+		gf_mulx(pad);		/* second doubling for a padded block */
+	}
+
+	for (i = 0; i < AES_BLOCK_SIZE; i++)
+		pad[i] ^= cbc[i];
+	crypto_cipher_encrypt_one(tfm, pad, pad);
+	memcpy(mac, pad, CMAC_TLEN);
+}
+
+/* mic = CMAC(aad, data without its last CMAC_TLEN bytes, CMAC_TLEN zeros) */
+void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad,
+			const u8 *data, size_t data_len, u8 *mic)
+{
+	const u8 *addr[3];
+	size_t len[3];
+	u8 zero[CMAC_TLEN];
+
+	memset(zero, 0, CMAC_TLEN);
+	addr[0] = aad;
+	len[0] = AAD_LEN;
+	addr[1] = data;
+	len[1] = data_len - CMAC_TLEN;	/* wraps if data_len < CMAC_TLEN */
+	addr[2] = zero;
+	len[2] = CMAC_TLEN;
+
+	aes_128_cmac_vector(tfm, scratch, 3, addr, len, mic);
+}
+
+/* Allocate an "aes" cipher keyed with key; may return an ERR_PTR value. */
+struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[])
+{
+	struct crypto_cipher *tfm;
+
+	tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	if (!IS_ERR(tfm))	/* NOTE(review): setkey result is not checked */
+		crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
+
+	return tfm;
+}
+
+/* Release a cipher handle; thin wrapper around crypto_free_cipher(). */
+void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm)
+{
+	crypto_free_cipher(tfm);
+}
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
new file mode 100644
index 00000000..0eb9a483
--- /dev/null
+++ b/net/mac80211/aes_cmac.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef AES_CMAC_H
+#define AES_CMAC_H
+
+#include <linux/crypto.h>
+
+struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]);
+void ieee80211_aes_cmac(struct crypto_cipher *tfm, u8 *scratch, const u8 *aad,
+			const u8 *data, size_t data_len, u8 *mic);
+void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm);
+
+#endif /* AES_CMAC_H */
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
new file mode 100644
index 00000000..1a41b142
--- /dev/null
+++ b/net/mac80211/agg-rx.c
@@ -0,0 +1,328 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/**
+ * DOC: RX A-MPDU aggregation
+ *
+ * Aggregation on the RX side requires only implementing the
+ * @ampdu_action callback that is invoked to start/stop any
+ * block-ack sessions for RX aggregation.
+ *
+ * When RX aggregation is started by the peer, the driver is
+ * notified via @ampdu_action function, with the
+ * %IEEE80211_AMPDU_RX_START action, and may reject the request
+ * in which case a negative response is sent to the peer, if it
+ * accepts it a positive response is sent.
+ *
+ * While the session is active, the device/driver are required
+ * to de-aggregate frames and pass them up one by one to mac80211,
+ * which will handle the reorder buffer.
+ *
+ * When the aggregation session is stopped again by the peer or
+ * ourselves, the driver's @ampdu_action function will be called
+ * with the action %IEEE80211_AMPDU_RX_STOP. In this case, the
+ * call must not fail.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+
+static void ieee80211_free_tid_rx(struct rcu_head *h) /* RCU callback */
+{
+	struct tid_ampdu_rx *tid_rx =
+		container_of(h, struct tid_ampdu_rx, rcu_head);
+	int i;
+
+	del_timer_sync(&tid_rx->reorder_timer);	/* before freeing its owner */
+
+	for (i = 0; i < tid_rx->buf_size; i++)	/* every reorder slot */
+		dev_kfree_skb(tid_rx->reorder_buf[i]);
+	kfree(tid_rx->reorder_buf);
+	kfree(tid_rx->reorder_time);
+	kfree(tid_rx);
+}
+
+void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+				     u16 initiator, u16 reason, bool tx)
+{
+	struct ieee80211_local *local = sta->local;
+	struct tid_ampdu_rx *tid_rx;
+
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);	/* caller locks */
+
+	tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid],
+					lockdep_is_held(&sta->ampdu_mlme.mtx));
+
+	if (!tid_rx)
+		return;
+	/* Unpublish the session; the memory is released via call_rcu(). */
+	rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], NULL);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Rx BA session stop requested for %pM tid %u\n",
+	       sta->sta.addr, tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP,
+			     &sta->sta, tid, NULL, 0))
+		printk(KERN_DEBUG "HW problem - can not stop rx "
+				"aggregation for tid %d\n", tid);
+
+	/* check if this is a self generated aggregation halt */
+	if (initiator == WLAN_BACK_RECIPIENT && tx)
+		ieee80211_send_delba(sta->sdata, sta->sta.addr,
+				     tid, 0, reason);
+
+	del_timer_sync(&tid_rx->session_timer);
+
+	call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); /* deferred free */
+}
+
+void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+				    u16 initiator, u16 reason, bool tx)
+{
+	mutex_lock(&sta->ampdu_mlme.mtx);	/* the ___ variant asserts it */
+	___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+}
+
+/*
+ * After accepting the AddBA Request we activated a timer,
+ * resetting it after each frame that arrives from the originator.
+ */
+static void sta_rx_agg_session_timer_expired(unsigned long data)
+{
+	/* data points at this TID's slot in sta->timer_to_tid[]. The value
+	 * stored there doubles as the slot index (presumably slot i holds i;
+	 * set up outside this file), so ptid - *ptid is slot 0, from which
+	 * container_of recovers the sta. */
+	u8 *ptid = (u8 *)data;
+	u8 *timer_to_id = ptid - *ptid;
+	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+					 timer_to_tid[0]);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
+#endif
+	set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired); /* flag the TID */
+	ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
+}
+
+static void sta_rx_agg_reorder_timer_expired(unsigned long data)
+{
+	u8 *ptid = (u8 *)data;		/* a slot in sta->timer_to_tid[] */
+	u8 *timer_to_id = ptid - *ptid;	/* slot 0, if slot i holds i */
+	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+			timer_to_tid[0]);
+
+	rcu_read_lock();
+	ieee80211_release_reorder_timeout(sta, *ptid);
+	rcu_read_unlock();
+}
+
+static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
+				      u8 dialog_token, u16 status, u16 policy,
+				      u16 buf_size, u16 timeout)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 capab;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer "
+		       "for addba resp frame\n", sdata->name);
+		return;		/* no response is sent in this case */
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);	/* only these first 24 bytes are cleared */
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	if (sdata->vif.type == NL80211_IFTYPE_AP ||
+	    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+	else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+		memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
+	mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
+
+	capab = (u16)(policy << 1);	/* bit 1 aggregation policy */
+	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
+	capab |= (u16)(buf_size << 6);	/* bit 15:6 max size of aggregation */
+
+	mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
+	mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
+	mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
+
+	ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_process_addba_request(struct ieee80211_local *local,
+				     struct sta_info *sta,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len)	/* len is not used here */
+{
+	struct tid_ampdu_rx *tid_agg_rx;
+	u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num, status;
+	u8 dialog_token;
+	int ret = -EOPNOTSUPP;
+
+	/* extract session parameters from addba request frame */
+	dialog_token = mgmt->u.action.u.addba_req.dialog_token;
+	timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout);
+	start_seq_num =
+		le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4;
+
+	capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab);
+	ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1;
+	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
+	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+
+	status = WLAN_STATUS_REQUEST_DECLINED;	/* unless changed below */
+
+	if (test_sta_flags(sta, WLAN_STA_BLOCK_BA)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Suspend in progress. "
+		       "Denying ADDBA request\n");
+#endif
+		goto end_no_lock;
+	}
+
+	/* sanity check for incoming parameters:
+	 * check if configuration can support the BA policy
+	 * and if buffer size does not exceed max value */
+	/* XXX: check own ht delayed BA capability?? */
+	if (((ba_policy != 1) &&
+	     (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
+	    (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
+		status = WLAN_STATUS_INVALID_QOS_PARAM;
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_DEBUG "AddBA Req with bad params from "
+				"%pM on tid %u. policy %d, buffer size %d\n",
+				mgmt->sa, tid, ba_policy,
+				buf_size);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		goto end_no_lock;
+	}
+	/* a requested buffer size of 0 is replaced by the maximum */
+	if (buf_size == 0)
+		buf_size = IEEE80211_MAX_AMPDU_BUF;
+
+	/* make sure the size doesn't exceed the maximum supported by the hw */
+	if (buf_size > local->hw.max_rx_aggregation_subframes)
+		buf_size = local->hw.max_rx_aggregation_subframes;
+
+	/* examine state machine */
+	mutex_lock(&sta->ampdu_mlme.mtx);
+
+	if (sta->ampdu_mlme.tid_rx[tid]) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_DEBUG "unexpected AddBA Req from "
+				"%pM on tid %u\n",
+				mgmt->sa, tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		goto end;
+	}
+
+	/* prepare A-MPDU MLME for Rx aggregation */
+	tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+	if (!tid_agg_rx) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_ERR "allocate rx mlme to tid %d failed\n",
+					tid);
+#endif
+		goto end;
+	}
+
+	spin_lock_init(&tid_agg_rx->reorder_lock);
+
+	/* rx timer */
+	tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired;
+	tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&tid_agg_rx->session_timer);
+
+	/* rx reorder timer */
+	tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired;
+	tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&tid_agg_rx->reorder_timer);
+
+	/* prepare reordering buffer */
+	tid_agg_rx->reorder_buf =
+		kcalloc(buf_size, sizeof(struct sk_buff *), GFP_KERNEL);
+	tid_agg_rx->reorder_time =
+		kcalloc(buf_size, sizeof(unsigned long), GFP_KERNEL);
+	if (!tid_agg_rx->reorder_buf || !tid_agg_rx->reorder_time) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_ERR "can not allocate reordering buffer "
+			       "to tid %d\n", tid);
+#endif
+		kfree(tid_agg_rx->reorder_buf);
+		kfree(tid_agg_rx->reorder_time);
+		kfree(tid_agg_rx);
+		goto end;
+	}
+
+	ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
+			       &sta->sta, tid, &start_seq_num, 0);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Rx A-MPDU request on tid %d result %d\n", tid, ret);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	if (ret) {
+		kfree(tid_agg_rx->reorder_buf);
+		kfree(tid_agg_rx->reorder_time);
+		kfree(tid_agg_rx);
+		goto end;
+	}
+
+	/* fill in the session state before it is published below */
+	tid_agg_rx->dialog_token = dialog_token;
+	tid_agg_rx->ssn = start_seq_num;
+	tid_agg_rx->head_seq_num = start_seq_num;
+	tid_agg_rx->buf_size = buf_size;
+	tid_agg_rx->timeout = timeout;
+	tid_agg_rx->stored_mpdu_num = 0;
+	status = WLAN_STATUS_SUCCESS;
+
+	/* activate it for RX */
+	rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], tid_agg_rx);
+
+	if (timeout)
+		mod_timer(&tid_agg_rx->session_timer, TU_TO_EXP_TIME(timeout));
+
+end:
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+
+end_no_lock:	/* every path ends by answering the peer with status */
+	ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
+				  dialog_token, status, 1, buf_size, timeout);
+}
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
new file mode 100644
index 00000000..b7f4f5c1
--- /dev/null
+++ b/net/mac80211/agg-tx.c
@@ -0,0 +1,842 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "wme.h"
+
+/**
+ * DOC: TX A-MPDU aggregation
+ *
+ * Aggregation on the TX side requires setting the hardware flag
+ * %IEEE80211_HW_AMPDU_AGGREGATION. The driver will then be handed
+ * packets with a flag indicating A-MPDU aggregation. The driver
+ * or device is responsible for actually aggregating the frames,
+ * as well as deciding how many and which to aggregate.
+ *
+ * When TX aggregation is started by some subsystem (usually the rate
+ * control algorithm would be appropriate) by calling the
+ * ieee80211_start_tx_ba_session() function, the driver will be
+ * notified via its @ampdu_action function, with the
+ * %IEEE80211_AMPDU_TX_START action.
+ *
+ * In response to that, the driver is later required to call the
+ * ieee80211_start_tx_ba_cb_irqsafe() function, which will really
+ * start the aggregation session after the peer has also responded.
+ * If the peer responds negatively, the session will be stopped
+ * again right away. Note that it is possible for the aggregation
+ * session to be stopped before the driver has indicated that it
+ * is done setting it up, in which case it must not indicate the
+ * setup completion.
+ *
+ * Also note that, since we also need to wait for a response from
+ * the peer, the driver is notified of the completion of the
+ * handshake by the %IEEE80211_AMPDU_TX_OPERATIONAL action to the
+ * @ampdu_action callback.
+ *
+ * Similarly, when the aggregation session is stopped by the peer
+ * or something calling ieee80211_stop_tx_ba_session(), the driver's
+ * @ampdu_action function will be called with the action
+ * %IEEE80211_AMPDU_TX_STOP. In this case, the call must not fail,
+ * and the driver must later call ieee80211_stop_tx_ba_cb_irqsafe().
+ */
+
+static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
+					 const u8 *da, u16 tid,
+					 u8 dialog_token, u16 start_seq_num,
+					 u16 agg_size, u16 timeout)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 capab;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer "
+				"for addba request frame\n", sdata->name);
+		return;		/* no request is sent in this case */
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);	/* only these first 24 bytes are cleared */
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	if (sdata->vif.type == NL80211_IFTYPE_AP ||
+	    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+	else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+		memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
+
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
+
+	mgmt->u.action.u.addba_req.dialog_token = dialog_token;
+	capab = (u16)(1 << 1);		/* bit 1 aggregation policy */
+	capab |= (u16)(tid << 2); 	/* bit 5:2 TID number */
+	capab |= (u16)(agg_size << 6);	/* bit 15:6 max size of aggregation */
+
+	mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab);
+
+	mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout);
+	mgmt->u.action.u.addba_req.start_seq_num =
+					cpu_to_le16(start_seq_num << 4);
+
+	ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_bar *bar;
+	u16 bar_control = 0;
+
+	skb = dev_alloc_skb(sizeof(*bar) + local->hw.extra_tx_headroom);
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer for "
+			"bar frame\n", sdata->name);
+		return;		/* no frame is sent in this case */
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	bar = (struct ieee80211_bar *)skb_put(skb, sizeof(*bar));
+	memset(bar, 0, sizeof(*bar));
+	bar->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+					 IEEE80211_STYPE_BACK_REQ);
+	memcpy(bar->ra, ra, ETH_ALEN);
+	memcpy(bar->ta, sdata->vif.addr, ETH_ALEN);
+	bar_control |= (u16)IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL;
+	bar_control |= (u16)IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA;
+	bar_control |= (u16)(tid << 12);	/* TID goes into bits 15:12 */
+	bar->control = cpu_to_le16(bar_control);
+	bar->start_seq_num = cpu_to_le16(ssn);	/* ssn is stored unshifted */
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
+			     struct tid_ampdu_tx *tid_tx)
+{
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);	/* caller holds both */
+	lockdep_assert_held(&sta->lock);
+	rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], tid_tx);
+}
+
+int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+				    enum ieee80211_back_parties initiator,
+				    bool tx)
+{
+	struct ieee80211_local *local = sta->local;
+	struct tid_ampdu_tx *tid_tx;
+	int ret;
+
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);	/* caller locks */
+
+	spin_lock_bh(&sta->lock);
+
+	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+	if (!tid_tx) {
+		spin_unlock_bh(&sta->lock);
+		return -ENOENT;		/* no TX session on this TID */
+	}
+
+	/* if we're already stopping ignore any new requests to stop */
+	if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+		spin_unlock_bh(&sta->lock);
+		return -EALREADY;
+	}
+
+	if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) {
+		/* not even started yet! */
+		ieee80211_assign_tid_tx(sta, tid, NULL);
+		spin_unlock_bh(&sta->lock);
+		kfree_rcu(tid_tx, rcu_head);
+		return 0;	/* the driver is never called on this path */
+	}
+
+	set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
+
+	spin_unlock_bh(&sta->lock);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Tx BA session stop requested for %pM tid %u\n",
+	       sta->sta.addr, tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	del_timer_sync(&tid_tx->addba_resp_timer);
+
+	/*
+	 * After this packets are no longer handed right through
+	 * to the driver but are put onto tid_tx->pending instead,
+	 * with locking to ensure proper access.
+	 */
+	clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
+
+	/*
+	 * There might be a few packets being processed right now (on
+	 * another CPU) that have already gotten past the aggregation
+	 * check when it was still OPERATIONAL and consequently have
+	 * IEEE80211_TX_CTL_AMPDU set. In that case, this code might
+	 * call into the driver at the same time or even before the
+	 * TX paths calls into it, which could confuse the driver.
+	 *
+	 * Wait for all currently running TX paths to finish before
+	 * telling the driver. New packets will not go through since
+	 * the aggregation session is no longer OPERATIONAL.
+	 */
+	synchronize_net();
+
+	tid_tx->stop_initiator = initiator;
+	tid_tx->tx_stop = tx;
+
+	ret = drv_ampdu_action(local, sta->sdata,
+			       IEEE80211_AMPDU_TX_STOP,
+			       &sta->sta, tid, NULL, 0);
+
+	/* HW shall not deny going back to legacy */
+	if (WARN_ON(ret)) {
+		/*
+		 * We may have pending packets get stuck in this case...
+		 * Not bothering with a workaround for now.
+		 */
+	}
+
+	return ret;	/* the driver's result, after warning if non-zero */
+}
+
+/*
+ * After sending add Block Ack request we activated a timer until
+ * add Block Ack response will arrive from the recipient.
+ * If this timer expires sta_addba_resp_timer_expired will be executed.
+ */
+static void sta_addba_resp_timer_expired(unsigned long data)
+{
+	/* data points at one slot of sta->timer_to_tid[]. The byte stored
+	 * there is taken as the TID and as the slot's own index (presumably
+	 * slot i holds i; set up outside this file), which is what lets
+	 * container_of on timer_to_tid[tid] recover the sta. */
+	u16 tid = *(u8 *)data;
+	struct sta_info *sta = container_of((void *)data,
+		struct sta_info, timer_to_tid[tid]);
+	struct tid_ampdu_tx *tid_tx;
+
+	/* check if the TID waits for addBA response */
+	rcu_read_lock();
+	tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+	if (!tid_tx ||
+	    test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) {
+		rcu_read_unlock();
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "timer expired on tid %d but we are not "
+				"(or no longer) expecting addBA response there\n",
+			tid);
+#endif
+		return;
+	}
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid);
+#endif
+
+	ieee80211_stop_tx_ba_session(&sta->sta, tid);	/* give up on it */
+	rcu_read_unlock();
+}
+
+static inline int ieee80211_ac_from_tid(int tid)
+{
+	return ieee802_1d_to_ac[tid & 7];	/* only the low 3 bits index */
+}
+
+/*
+ * When multiple aggregation sessions on multiple stations
+ * are being created/destroyed simultaneously, we need to
+ * refcount the global queue stop caused by that in order
+ * to not get into a situation where one of the aggregation
+ * setup or teardown re-enables queues before the other is
+ * ready to handle that.
+ *
+ * These two functions take care of this issue by keeping
+ * an "agg_queue_stop" refcount per queue in ieee80211_local.
+ */
+static void __acquires(agg_queue)
+ieee80211_stop_queue_agg(struct ieee80211_local *local, int tid)
+{
+	int queue = ieee80211_ac_from_tid(tid);
+
+	if (atomic_inc_return(&local->agg_queue_stop[queue]) == 1)
+		ieee80211_stop_queue_by_reason(	/* 0 -> 1: first stopper */
+			&local->hw, queue,
+			IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	__acquire(agg_queue);
+}
+
+static void __releases(agg_queue)
+ieee80211_wake_queue_agg(struct ieee80211_local *local, int tid)
+{
+	int queue = ieee80211_ac_from_tid(tid);
+
+	if (atomic_dec_return(&local->agg_queue_stop[queue]) == 0)
+		ieee80211_wake_queue_by_reason(	/* 1 -> 0: last one out */
+			&local->hw, queue,
+			IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	__release(agg_queue);
+}
+
+/*
+ * splice packets from the STA's pending to the local pending,
+ * requires a call to ieee80211_agg_splice_finish later
+ */
+static void __acquires(agg_queue)
+ieee80211_agg_splice_packets(struct ieee80211_local *local,
+			     struct tid_ampdu_tx *tid_tx, u16 tid)
+{
+	int queue = ieee80211_ac_from_tid(tid);
+	unsigned long flags;
+
+	ieee80211_stop_queue_agg(local, tid);	/* before the NULL check */
+
+	if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates"
+			  " from the pending queue\n", tid))
+		return;		/* the stop refcount stays taken */
+
+	if (!skb_queue_empty(&tid_tx->pending)) {
+		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+		/* copy over remaining packets */
+		skb_queue_splice_tail_init(&tid_tx->pending,
+					   &local->pending[queue]);
+		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+	}
+}
+
+/*
+ * Counterpart of ieee80211_agg_splice_packets(): drops the aggregation
+ * queue stop for the queue that @tid maps to.
+ */
+static void __releases(agg_queue)
+ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid)
+{
+	ieee80211_wake_queue_agg(local, tid);
+}
+
+/*
+ * Ask the driver to start a TX aggregation session on @tid of @sta and,
+ * if it agrees, arm the addBA response timer and send the AddBA request.
+ *
+ * If the driver refuses, the session is torn down again: packets queued
+ * on the tid_tx are spliced back to the local pending queue, the tid_tx
+ * is unassigned from the station and freed via kfree_rcu().
+ *
+ * NOTE(review): tid_tx is used without a NULL check, so callers are
+ * presumably expected to call this only for a TID that already has a
+ * tid_tx assigned - confirm against the caller.
+ */
+void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
+{
+	struct tid_ampdu_tx *tid_tx;
+	struct ieee80211_local *local = sta->local;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	u16 start_seq_num;
+	int ret;
+
+	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+	/*
+	 * Start queuing up packets for this aggregation session.
+	 * We're going to release them once the driver is OK with
+	 * that.
+	 */
+	clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
+
+	/*
+	 * Make sure no packets are being processed. This ensures that
+	 * we have a valid starting sequence number and that in-flight
+	 * packets have been flushed out and no packets for this TID
+	 * will go into the driver during the ampdu_action call.
+	 */
+	synchronize_net();
+
+	/*
+	 * The per-TID counter is shifted right by four to get the value
+	 * handed to the driver and put into the request (presumably the
+	 * counter is kept in sequence-control layout - TODO confirm).
+	 */
+	start_seq_num = sta->tid_seq[tid] >> 4;
+
+	ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
+			       &sta->sta, tid, &start_seq_num, 0);
+	if (ret) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA request denied - HW unavailable for"
+					" tid %d\n", tid);
+#endif
+		/*
+		 * Driver said no: hand queued frames back to the normal
+		 * pending queue and drop the session under sta->lock.
+		 */
+		spin_lock_bh(&sta->lock);
+		ieee80211_agg_splice_packets(local, tid_tx, tid);
+		ieee80211_assign_tid_tx(sta, tid, NULL);
+		ieee80211_agg_splice_finish(local, tid);
+		spin_unlock_bh(&sta->lock);
+
+		kfree_rcu(tid_tx, rcu_head);
+		return;
+	}
+
+	/* activate the timer for the recipient's addBA response */
+	mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
+#endif
+
+	/* count this attempt; the counter is updated under sta->lock */
+	spin_lock_bh(&sta->lock);
+	sta->ampdu_mlme.addba_req_num[tid]++;
+	spin_unlock_bh(&sta->lock);
+
+	/* send AddBA request */
+	ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
+				     tid_tx->dialog_token, start_seq_num,
+				     local->hw.max_tx_aggregation_subframes,
+				     tid_tx->timeout);
+}
+
+/*
+ * Request a new TX aggregation session for @tid towards @pubsta.
+ *
+ * Nothing is negotiated here: after the sanity checks a tid_ampdu_tx is
+ * allocated, marked WANT_START, stored in ampdu_mlme.tid_start_tx[] and
+ * the station's aggregation work is queued to carry on from there.
+ *
+ * Returns 0 on success, -EINVAL if aggregation cannot be used here,
+ * -EBUSY after too many addBA attempts on this TID, -EAGAIN if the TID
+ * already has a session (or one being started) and -ENOMEM if the
+ * allocation fails.
+ */
+int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
+				  u16 timeout)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	struct tid_ampdu_tx *session;
+	int err = 0;
+
+	trace_api_start_tx_ba_session(pubsta, tid);
+
+	if (WARN_ON(!local->ops->ampdu_action))
+		return -EINVAL;
+
+	if (tid >= STA_TID_NUM)
+		return -EINVAL;
+
+	if (!(local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION))
+		return -EINVAL;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Open BA session requested for %pM tid %u\n",
+	       pubsta->addr, tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	/*
+	 * The aggregation code is not prepared to handle
+	 * anything but STA/AP due to the BSSID handling.
+	 * IBSS could work in the code but isn't supported
+	 * by drivers or the standard.
+	 */
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_AP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (test_sta_flags(sta, WLAN_STA_BLOCK_BA)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA sessions blocked. "
+		       "Denying BA session request\n");
+#endif
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&sta->lock);
+
+	/* too many attempts already, the receiver does not want A-MPDU */
+	if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	/* the TID must not be in an aggregation flow already */
+	if (rcu_dereference_protected_tid_tx(sta, tid) ||
+	    sta->ampdu_mlme.tid_start_tx[tid]) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA request denied - session is not "
+				 "idle on tid %u\n", tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/* set up the A-MPDU MLME state for TX aggregation */
+	session = kzalloc(sizeof(*session), GFP_ATOMIC);
+	if (!session) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		if (net_ratelimit())
+			printk(KERN_ERR "allocate tx mlme to tid %d failed\n",
+					tid);
+#endif
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	skb_queue_head_init(&session->pending);
+	__set_bit(HT_AGG_STATE_WANT_START, &session->state);
+	session->timeout = timeout;
+
+	/* addBA response timer */
+	session->addba_resp_timer.function = sta_addba_resp_timer_expired;
+	session->addba_resp_timer.data =
+		(unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&session->addba_resp_timer);
+
+	/* hand out the next dialog token */
+	sta->ampdu_mlme.dialog_token_allocator++;
+	session->dialog_token = sta->ampdu_mlme.dialog_token_allocator;
+
+	/*
+	 * Park the session in the start array; the work item picks it
+	 * up from there and moves it to the normal array.
+	 */
+	sta->ampdu_mlme.tid_start_tx[tid] = session;
+
+	/* the rest of the flow runs from the work item */
+	ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+
+ out_unlock:
+	spin_unlock_bh(&sta->lock);
+	return err;
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_session);
+
+/*
+ * Switch the TX aggregation session on @tid of @sta to operational.
+ *
+ * The driver is told first (IEEE80211_AMPDU_TX_OPERATIONAL, with the
+ * negotiated buf_size).  Then, under sta->lock, the frames that were
+ * held back on the tid_tx pending queue are spliced to the local
+ * pending queue and the OPERATIONAL bit is set.
+ *
+ * Must be called with sta->ampdu_mlme.mtx held (asserted via lockdep).
+ */
+static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
+					 struct sta_info *sta, u16 tid)
+{
+	struct tid_ampdu_tx *tid_tx;
+
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
+	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Aggregation is on for tid %d\n", tid);
+#endif
+
+	/* the return value of the driver call is not checked here */
+	drv_ampdu_action(local, sta->sdata,
+			 IEEE80211_AMPDU_TX_OPERATIONAL,
+			 &sta->sta, tid, NULL, tid_tx->buf_size);
+
+	/*
+	 * synchronize with TX path, while splicing the TX path
+	 * should block so it won't put more packets onto pending.
+	 */
+	spin_lock_bh(&sta->lock);
+
+	ieee80211_agg_splice_packets(local, tid_tx, tid);
+	/*
+	 * Now mark as operational. This will be visible
+	 * in the TX path, and lets it go lock-free in
+	 * the common case.
+	 */
+	set_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
+	ieee80211_agg_splice_finish(local, tid);
+
+	spin_unlock_bh(&sta->lock);
+}
+
+/*
+ * Note that the driver is ready for TX aggregation on @tid towards the
+ * station with address @ra.  The DRV_READY bit is set on the session;
+ * if the addBA response was received already, the session is switched
+ * to operational right away.
+ */
+void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct tid_ampdu_tx *session;
+	struct sta_info *sta;
+
+	trace_api_start_tx_ba_cb(sdata, ra, tid);
+
+	if (tid >= STA_TID_NUM) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n",
+				tid, STA_TID_NUM);
+#endif
+		return;
+	}
+
+	mutex_lock(&local->sta_mtx);
+
+	sta = sta_info_get(sdata, ra);
+	if (!sta) {
+		mutex_unlock(&local->sta_mtx);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Could not find station: %pM\n", ra);
+#endif
+		return;
+	}
+
+	mutex_lock(&sta->ampdu_mlme.mtx);
+
+	session = rcu_dereference_protected_tid_tx(sta, tid);
+	if (WARN_ON(!session)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "addBA was not requested!\n");
+#endif
+		goto out;
+	}
+
+	/*
+	 * Warn and do nothing if the driver was marked ready before;
+	 * otherwise go operational if the response is in already.
+	 */
+	if (!WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY,
+				      &session->state)) &&
+	    test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &session->state))
+		ieee80211_agg_tx_operational(local, sta, tid);
+
+ out:
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+	mutex_unlock(&local->sta_mtx);
+}
+
+/*
+ * Deferred variant of the "TX aggregation start" callback.
+ *
+ * Nothing is handled here.  The receiver address @ra and @tid are stored
+ * in the control buffer of an empty skb, the skb is tagged with
+ * IEEE80211_SDATA_QUEUE_AGG_START and queued on the interface's
+ * skb_queue, and the interface work is scheduled to process it.
+ *
+ * If no skb can be allocated the event is dropped.
+ */
+void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
+				      const u8 *ra, u16 tid)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_ra_tid *ra_tid;
+	struct sk_buff *skb = dev_alloc_skb(0);
+
+	if (unlikely(!skb)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		/* terminate the message so it is not merged with the next */
+		if (net_ratelimit())
+			printk(KERN_WARNING "%s: Not enough memory, "
+			       "dropping start BA session\n", sdata->name);
+#endif
+		return;
+	}
+
+	/* the skb carries no data, only the RA/TID pair in its cb */
+	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
+	memcpy(&ra_tid->ra, ra, ETH_ALEN);
+	ra_tid->tid = tid;
+
+	skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_START;
+	skb_queue_tail(&sdata->skb_queue, skb);
+	ieee80211_queue_work(&local->hw, &sdata->work);
+}
+EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
+
+/*
+ * Run ___ieee80211_stop_tx_ba_session() with sta->ampdu_mlme.mtx held
+ * and hand back its return value.
+ */
+int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+				   enum ieee80211_back_parties initiator,
+				   bool tx)
+{
+	int status;
+
+	mutex_lock(&sta->ampdu_mlme.mtx);
+	status = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+
+	return status;
+}
+
+/*
+ * Request teardown of the TX aggregation session on @tid of @pubsta.
+ *
+ * Only the WANT_STOP bit is set here and the station's aggregation work
+ * is queued.  A session that is already STOPPING is left alone.
+ *
+ * Returns 0 on success (including "already stopping"), -EINVAL for a
+ * bad @tid or a driver without ampdu_action, and -ENOENT if there is no
+ * session on that TID.
+ */
+int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	struct tid_ampdu_tx *session;
+	int err = 0;
+
+	trace_api_stop_tx_ba_session(pubsta, tid);
+
+	if (!local->ops->ampdu_action)
+		return -EINVAL;
+
+	if (tid >= STA_TID_NUM)
+		return -EINVAL;
+
+	spin_lock_bh(&sta->lock);
+
+	session = rcu_dereference_protected_tid_tx(sta, tid);
+	if (!session) {
+		err = -ENOENT;
+	} else if (!test_bit(HT_AGG_STATE_STOPPING, &session->state)) {
+		/* not being stopped yet, so ask the work item to do it */
+		set_bit(HT_AGG_STATE_WANT_STOP, &session->state);
+		ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+	}
+
+	spin_unlock_bh(&sta->lock);
+	return err;
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_session);
+
+/*
+ * Finish the teardown of the TX aggregation session on @tid towards the
+ * station with address @ra.
+ *
+ * The session must exist and be in STOPPING state, otherwise the call is
+ * ignored.  A DELBA is sent if we initiated the stop and tx_stop is set.
+ * Frames still sitting on the tid_tx pending queue are spliced to the
+ * local pending queue, the tid_tx is unassigned from the station and
+ * freed via kfree_rcu().
+ *
+ * Locks are taken in the order local->sta_mtx, sta->ampdu_mlme.mtx,
+ * sta->lock.
+ */
+void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+	struct tid_ampdu_tx *tid_tx;
+
+	trace_api_stop_tx_ba_cb(sdata, ra, tid);
+
+	if (tid >= STA_TID_NUM) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Bad TID value: tid = %d (>= %d)\n",
+				tid, STA_TID_NUM);
+#endif
+		return;
+	}
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "Stopping Tx BA session for %pM tid %d\n",
+	       ra, tid);
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+
+	mutex_lock(&local->sta_mtx);
+
+	sta = sta_info_get(sdata, ra);
+	if (!sta) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "Could not find station: %pM\n", ra);
+#endif
+		goto unlock;
+	}
+
+	mutex_lock(&sta->ampdu_mlme.mtx);
+	spin_lock_bh(&sta->lock);
+	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+
+	/* only a session that is currently being stopped is handled */
+	if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n");
+#endif
+		goto unlock_sta;
+	}
+
+	/* tell the peer, but only if we initiated and a DELBA is wanted */
+	if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop)
+		ieee80211_send_delba(sta->sdata, ra, tid,
+			WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
+
+	/*
+	 * When we get here, the TX path will not be lockless any more wrt.
+	 * aggregation, since the OPERATIONAL bit has long been cleared.
+	 * Thus it will block on getting the lock, if it occurs. So if we
+	 * stop the queue now, we will not get any more packets, and any
+	 * that might be being processed will wait for us here, thereby
+	 * guaranteeing that no packets go to the tid_tx pending queue any
+	 * more.
+	 */
+
+	ieee80211_agg_splice_packets(local, tid_tx, tid);
+
+	/* future packets must not find the tid_tx struct any more */
+	ieee80211_assign_tid_tx(sta, tid, NULL);
+
+	ieee80211_agg_splice_finish(local, tid);
+
+	/* freeing is deferred through RCU (kfree_rcu) */
+	kfree_rcu(tid_tx, rcu_head);
+
+ unlock_sta:
+	spin_unlock_bh(&sta->lock);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+ unlock:
+	mutex_unlock(&local->sta_mtx);
+}
+
+/*
+ * Deferred variant of the "TX aggregation stop" callback.
+ *
+ * Nothing is handled here.  The receiver address @ra and @tid are stored
+ * in the control buffer of an empty skb, the skb is tagged with
+ * IEEE80211_SDATA_QUEUE_AGG_STOP and queued on the interface's
+ * skb_queue, and the interface work is scheduled to process it.
+ *
+ * If no skb can be allocated the event is dropped.
+ */
+void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
+				     const u8 *ra, u16 tid)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_ra_tid *ra_tid;
+	struct sk_buff *skb = dev_alloc_skb(0);
+
+	if (unlikely(!skb)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		/* terminate the message so it is not merged with the next */
+		if (net_ratelimit())
+			printk(KERN_WARNING "%s: Not enough memory, "
+			       "dropping stop BA session\n", sdata->name);
+#endif
+		return;
+	}
+
+	/* the skb carries no data, only the RA/TID pair in its cb */
+	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
+	memcpy(&ra_tid->ra, ra, ETH_ALEN);
+	ra_tid->tid = tid;
+
+	skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_STOP;
+	skb_queue_tail(&sdata->skb_queue, skb);
+	ieee80211_queue_work(&local->hw, &sdata->work);
+}
+EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
+
+
+/*
+ * Handle an AddBA response frame @mgmt received from @sta.
+ *
+ * The TID and buffer size are taken from the capability field.  The
+ * response is ignored if there is no session on that TID, if the dialog
+ * token does not match, or if the session is already being stopped.
+ * Otherwise the addBA response timer is cancelled and
+ *  - on success status the negotiated buffer size is recorded, the
+ *    RESPONSE_RECEIVED bit is set and, if the driver is ready already,
+ *    the session goes operational; the retry counter is reset;
+ *  - on any other status the session is stopped.
+ *
+ * @len is not used by this function.
+ */
+void ieee80211_process_addba_resp(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len)
+{
+	struct tid_ampdu_tx *tid_tx;
+	u16 capab, tid, buf_size;
+
+	capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
+	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
+
+	/*
+	 * A 16-bit field shifted right by six can need up to ten bits, so
+	 * the value must not be squeezed into a u8: e.g. 256 would turn
+	 * into 0 and the response would be thrown away.  Keep the full
+	 * value and limit it to what the hardware can aggregate.
+	 */
+	buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+	if (buf_size > local->hw.max_tx_aggregation_subframes)
+		buf_size = local->hw.max_tx_aggregation_subframes;
+
+	mutex_lock(&sta->ampdu_mlme.mtx);
+
+	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+	if (!tid_tx)
+		goto out;
+
+	if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid);
+#endif
+		goto out;
+	}
+
+	del_timer_sync(&tid_tx->addba_resp_timer);
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "switched off addBA timer for tid %d\n", tid);
+#endif
+
+	/*
+	 * addba_resp_timer may have fired before we got here, and
+	 * caused WANT_STOP to be set. If the stop then was already
+	 * processed further, STOPPING might be set.
+	 */
+	if (test_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state) ||
+	    test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG
+		       "got addBA resp for tid %d but we already gave up\n",
+		       tid);
+#endif
+		goto out;
+	}
+
+	if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
+			== WLAN_STATUS_SUCCESS) {
+		/*
+		 * IEEE 802.11-2007 7.3.1.14:
+		 * In an ADDBA Response frame, when the Status Code field
+		 * is set to 0, the Buffer Size subfield is set to a value
+		 * of at least 1.
+		 */
+		if (!buf_size)
+			goto out;
+
+		if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED,
+				     &tid_tx->state)) {
+			/* ignore duplicate response */
+			goto out;
+		}
+
+		tid_tx->buf_size = buf_size;
+
+		/* go operational now only if the driver is ready too */
+		if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))
+			ieee80211_agg_tx_operational(local, sta, tid);
+
+		sta->ampdu_mlme.addba_req_num[tid] = 0;
+	} else {
+		/* the peer refused, so tear the session down again */
+		___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR,
+						true);
+	}
+
+ out:
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+}
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
new file mode 100644
index 00000000..143a0064
--- /dev/null
+++ b/net/mac80211/cfg.c
@@ -0,0 +1,2149 @@
+/*
+ * mac80211 configuration hooks for cfg80211
+ *
+ * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This file is GPLv2 as found in COPYING.
+ */
+
+#include <linux/ieee80211.h>
+#include <linux/nl80211.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <linux/rcupdate.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "cfg.h"
+#include "rate.h"
+#include "mesh.h"
+
+/*
+ * cfg80211 hook: create a new virtual interface called @name of the
+ * given @type.  For monitor interfaces the supplied @flags (if any) are
+ * stored as the monitor flags.
+ *
+ * Returns the new net_device or an ERR_PTR() carrying the error from
+ * ieee80211_if_add().
+ */
+static struct net_device *ieee80211_add_iface(struct wiphy *wiphy, char *name,
+					      enum nl80211_iftype type,
+					      u32 *flags,
+					      struct vif_params *params)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct net_device *ndev;
+	int ret;
+
+	ret = ieee80211_if_add(local, name, &ndev, type, params);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* monitor flags only make sense on a monitor interface */
+	if (flags && type == NL80211_IFTYPE_MONITOR)
+		IEEE80211_DEV_TO_SUB_IF(ndev)->u.mntr_flags = *flags;
+
+	return ndev;
+}
+
+/* cfg80211 hook: remove the virtual interface behind @dev; always 0. */
+static int ieee80211_del_iface(struct wiphy *wiphy, struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	ieee80211_if_remove(sdata);
+	return 0;
+}
+
+/*
+ * cfg80211 hook: change the type of the interface behind @dev to @type
+ * and apply the 4-address setting from @params and, for monitor
+ * interfaces, the monitor @flags.
+ *
+ * Returns 0 on success, the error from ieee80211_if_change_type(), or
+ * -EBUSY when MONITOR_FLAG_COOK_FRAMES would change on a running
+ * monitor interface.
+ */
+static int ieee80211_change_iface(struct wiphy *wiphy,
+				  struct net_device *dev,
+				  enum nl80211_iftype type, u32 *flags,
+				  struct vif_params *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	int err;
+
+	err = ieee80211_if_change_type(sdata, type);
+	if (err)
+		return err;
+
+	if (params) {
+		if (type == NL80211_IFTYPE_AP_VLAN &&
+		    params->use_4addr == 0)
+			rcu_assign_pointer(sdata->u.vlan.sta, NULL);
+		else if (type == NL80211_IFTYPE_STATION &&
+			 params->use_4addr >= 0)
+			sdata->u.mgd.use_4addr = params->use_4addr;
+	}
+
+	/* everything below is about monitor flags only */
+	if (sdata->vif.type != NL80211_IFTYPE_MONITOR || !flags)
+		return 0;
+
+	if (!ieee80211_sdata_running(sdata)) {
+		/*
+		 * The interface is down, so ieee80211_do_stop and
+		 * ieee80211_do_open take care of all the bookkeeping
+		 * (cooked_mntrs, monitor and fif_* counters, hardware
+		 * reconfiguration); just remember the flags.
+		 */
+		sdata->u.mntr_flags = *flags;
+		return 0;
+	}
+
+	/*
+	 * Do not allow MONITOR_FLAG_COOK_FRAMES to change while the
+	 * interface is up.  Supporting that would need a lot of cruft
+	 * to update everything:
+	 *	cooked_mntrs, monitor and all fif_* counters
+	 *	reconfigure hardware
+	 */
+	if ((*flags ^ sdata->u.mntr_flags) & MONITOR_FLAG_COOK_FRAMES)
+		return -EBUSY;
+
+	ieee80211_adjust_monitor_flags(sdata, -1);
+	sdata->u.mntr_flags = *flags;
+	ieee80211_adjust_monitor_flags(sdata, 1);
+
+	ieee80211_configure_filter(sdata->local);
+
+	return 0;
+}
+
+/*
+ * cfg80211 hook: allocate a key from @params and link it to the
+ * interface or, when @mac_addr is given, to that station.
+ *
+ * Returns 0 on success, -ENETDOWN if the interface is not running,
+ * -EINVAL for WEP/TKIP ciphers when WEP failed to initialize, -ENOENT
+ * if the station is unknown, or the error from key allocation/linking.
+ * The key is freed again on every failure after its allocation.
+ */
+static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
+			     u8 key_idx, bool pairwise, const u8 *mac_addr,
+			     struct key_params *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_key *new_key;
+	struct sta_info *sta = NULL;
+	int err = 0;
+
+	if (!ieee80211_sdata_running(sdata))
+		return -ENETDOWN;
+
+	/* WEP and TKIP keys are refused if WEP failed to initialize */
+	if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
+	     params->cipher == WLAN_CIPHER_SUITE_TKIP ||
+	     params->cipher == WLAN_CIPHER_SUITE_WEP104) &&
+	    IS_ERR(sdata->local->wep_tx_tfm))
+		return -EINVAL;
+
+	new_key = ieee80211_key_alloc(params->cipher, key_idx,
+				      params->key_len, params->key,
+				      params->seq_len, params->seq);
+	if (IS_ERR(new_key))
+		return PTR_ERR(new_key);
+
+	if (pairwise)
+		new_key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
+
+	mutex_lock(&sdata->local->sta_mtx);
+
+	if (mac_addr) {
+		/* mesh looks the station up on this interface only */
+		if (ieee80211_vif_is_mesh(&sdata->vif))
+			sta = sta_info_get(sdata, mac_addr);
+		else
+			sta = sta_info_get_bss(sdata, mac_addr);
+
+		if (!sta)
+			err = -ENOENT;
+	}
+
+	if (!err)
+		err = ieee80211_key_link(new_key, sdata, sta);
+
+	/* the key was not taken over, so release it again */
+	if (err)
+		ieee80211_key_free(sdata->local, new_key);
+
+	mutex_unlock(&sdata->local->sta_mtx);
+
+	return err;
+}
+
+/*
+ * cfg80211 hook: free the key selected by @key_idx / @pairwise.  With
+ * @mac_addr the key is taken from that station (ptk or gtk[key_idx]),
+ * otherwise from the interface's keys[key_idx].
+ *
+ * Returns 0 on success and -ENOENT if the station or the key does not
+ * exist.  Runs with local->sta_mtx and local->key_mtx held.
+ */
+static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
+			     u8 key_idx, bool pairwise, const u8 *mac_addr)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_key *victim = NULL;
+	struct sta_info *sta;
+	int ret = -ENOENT;
+
+	mutex_lock(&local->sta_mtx);
+	mutex_lock(&local->key_mtx);
+
+	if (!mac_addr) {
+		victim = key_mtx_dereference(local, sdata->keys[key_idx]);
+	} else {
+		sta = sta_info_get_bss(sdata, mac_addr);
+		if (!sta)
+			goto out_unlock;
+
+		if (pairwise)
+			victim = key_mtx_dereference(local, sta->ptk);
+		else
+			victim = key_mtx_dereference(local,
+						     sta->gtk[key_idx]);
+	}
+
+	if (victim) {
+		__ieee80211_key_free(victim);
+		ret = 0;
+	}
+
+ out_unlock:
+	mutex_unlock(&local->key_mtx);
+	mutex_unlock(&local->sta_mtx);
+
+	return ret;
+}
+
+/*
+ * cfg80211 hook: look up a key and report it through @callback.
+ *
+ * The key is taken from the station @mac_addr (ptk, or gtk[key_idx] for
+ * key_idx < NUM_DEFAULT_KEYS) or, without @mac_addr, from the
+ * interface's keys[key_idx].  For TKIP, CCMP and AES-CMAC a six byte
+ * sequence counter is reported as well, least significant byte first.
+ *
+ * Returns 0 after the callback ran and -ENOENT if no key was found.
+ * The whole lookup and the callback run under rcu_read_lock().
+ */
+static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
+			     u8 key_idx, bool pairwise, const u8 *mac_addr,
+			     void *cookie,
+			     void (*callback)(void *cookie,
+					      struct key_params *params))
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_key *key = NULL;
+	struct sta_info *sta = NULL;
+	struct key_params params;
+	u8 seq[6] = {0};
+	u32 iv32;
+	u16 iv16;
+	int i;
+	int err = -ENOENT;
+
+	rcu_read_lock();
+
+	if (!mac_addr) {
+		key = rcu_dereference(sdata->keys[key_idx]);
+	} else {
+		sta = sta_info_get_bss(sdata, mac_addr);
+		if (!sta)
+			goto out;
+
+		if (pairwise)
+			key = rcu_dereference(sta->ptk);
+		else if (key_idx < NUM_DEFAULT_KEYS)
+			key = rcu_dereference(sta->gtk[key_idx]);
+	}
+
+	if (!key)
+		goto out;
+
+	memset(&params, 0, sizeof(params));
+
+	params.cipher = key->conf.cipher;
+	params.key = key->conf.key;
+	params.key_len = key->conf.keylen;
+
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_TKIP:
+		iv32 = key->u.tkip.tx.iv32;
+		iv16 = key->u.tkip.tx.iv16;
+
+		/* the driver's counters are asked for if the key is in hw */
+		if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
+			drv_get_tkip_seq(sdata->local,
+					 key->conf.hw_key_idx,
+					 &iv32, &iv16);
+
+		/* iv16 goes first, then iv32, each low byte first */
+		seq[0] = iv16 & 0xff;
+		seq[1] = (iv16 >> 8) & 0xff;
+		for (i = 0; i < 4; i++)
+			seq[2 + i] = (iv32 >> (8 * i)) & 0xff;
+		params.seq = seq;
+		params.seq_len = 6;
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		/* tx_pn is copied in reversed byte order */
+		for (i = 0; i < 6; i++)
+			seq[i] = key->u.ccmp.tx_pn[5 - i];
+		params.seq = seq;
+		params.seq_len = 6;
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		/* tx_pn is copied in reversed byte order */
+		for (i = 0; i < 6; i++)
+			seq[i] = key->u.aes_cmac.tx_pn[5 - i];
+		params.seq = seq;
+		params.seq_len = 6;
+		break;
+	}
+
+	callback(cookie, &params);
+	err = 0;
+
+ out:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * cfg80211 hook: pass the default key selection (@key_idx, @uni, @multi)
+ * for the interface behind @dev on to ieee80211_set_default_key().
+ * Always returns 0.
+ */
+static int ieee80211_config_default_key(struct wiphy *wiphy,
+					struct net_device *dev,
+					u8 key_idx, bool uni,
+					bool multi)
+{
+	ieee80211_set_default_key(IEEE80211_DEV_TO_SUB_IF(dev),
+				  key_idx, uni, multi);
+	return 0;
+}
+
+/*
+ * cfg80211 hook: pass the default management key index for the
+ * interface behind @dev on to ieee80211_set_default_mgmt_key().
+ * Always returns 0.
+ */
+static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy,
+					     struct net_device *dev,
+					     u8 key_idx)
+{
+	ieee80211_set_default_mgmt_key(IEEE80211_DEV_TO_SUB_IF(dev),
+				       key_idx);
+	return 0;
+}
+
+/*
+ * Fill in the rate part of @rate from the rate index @idx.  If
+ * rate->flags marks the rate as MCS, @idx is stored as the MCS index;
+ * otherwise @idx selects a bitrate entry of the band of the currently
+ * configured channel and its bitrate is stored as the legacy rate.
+ */
+static void rate_idx_to_bitrate(struct rate_info *rate, struct sta_info *sta, int idx)
+{
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_local *local;
+
+	if (rate->flags & RATE_INFO_FLAGS_MCS) {
+		rate->mcs = idx;
+		return;
+	}
+
+	local = sta->local;
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	rate->legacy = sband->bitrates[idx].bitrate;
+}
+
+/*
+ * Fill @sinfo with the statistics kept for @sta: traffic counters,
+ * inactive/connected time, signal (if the hardware reports it), last
+ * TX/RX rate, mesh peer link data (mesh interfaces only) and the BSS
+ * parameters of the station's interface.  sinfo->filled tells which
+ * members were set.
+ */
+static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct timespec uptime;
+
+	sinfo->generation = sdata->local->sta_generation;
+
+	sinfo->filled = STATION_INFO_INACTIVE_TIME |
+			STATION_INFO_RX_BYTES |
+			STATION_INFO_TX_BYTES |
+			STATION_INFO_RX_PACKETS |
+			STATION_INFO_TX_PACKETS |
+			STATION_INFO_TX_RETRIES |
+			STATION_INFO_TX_FAILED |
+			STATION_INFO_TX_BITRATE |
+			STATION_INFO_RX_BITRATE |
+			STATION_INFO_RX_DROP_MISC |
+			STATION_INFO_BSS_PARAM |
+			STATION_INFO_CONNECTED_TIME;
+
+	/* connected time: monotonic clock seconds since last_connected */
+	do_posix_clock_monotonic_gettime(&uptime);
+	sinfo->connected_time = uptime.tv_sec - sta->last_connected;
+
+	/* inactive time is reported in milliseconds since the last RX */
+	sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
+	sinfo->rx_bytes = sta->rx_bytes;
+	sinfo->tx_bytes = sta->tx_bytes;
+	sinfo->rx_packets = sta->rx_packets;
+	sinfo->tx_packets = sta->tx_packets;
+	sinfo->tx_retries = sta->tx_retry_count;
+	sinfo->tx_failed = sta->tx_retry_failed;
+	sinfo->rx_dropped_misc = sta->rx_dropped;
+
+	/* signal values are only reported if the hardware provides them */
+	if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) ||
+	    (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) {
+		sinfo->filled |= STATION_INFO_SIGNAL | STATION_INFO_SIGNAL_AVG;
+		sinfo->signal = (s8)sta->last_signal;
+		/*
+		 * The average is negated here (presumably it is kept as
+		 * a positive magnitude - TODO confirm where it is fed).
+		 */
+		sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal);
+	}
+
+	/* translate the last TX rate flags and index */
+	sinfo->txrate.flags = 0;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)
+		sinfo->txrate.flags |= RATE_INFO_FLAGS_MCS;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+		sinfo->txrate.flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_SHORT_GI)
+		sinfo->txrate.flags |= RATE_INFO_FLAGS_SHORT_GI;
+	rate_idx_to_bitrate(&sinfo->txrate, sta, sta->last_tx_rate.idx);
+
+	/* translate the last RX rate flags and index */
+	sinfo->rxrate.flags = 0;
+	if (sta->last_rx_rate_flag & RX_FLAG_HT)
+		sinfo->rxrate.flags |= RATE_INFO_FLAGS_MCS;
+	if (sta->last_rx_rate_flag & RX_FLAG_40MHZ)
+		sinfo->rxrate.flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH;
+	if (sta->last_rx_rate_flag & RX_FLAG_SHORT_GI)
+		sinfo->rxrate.flags |= RATE_INFO_FLAGS_SHORT_GI;
+	rate_idx_to_bitrate(&sinfo->rxrate, sta, sta->last_rx_rate_idx);
+
+	/* mesh peer link data is only filled in when mesh is built in */
+	if (ieee80211_vif_is_mesh(&sdata->vif)) {
+#ifdef CONFIG_MAC80211_MESH
+		sinfo->filled |= STATION_INFO_LLID |
+				 STATION_INFO_PLID |
+				 STATION_INFO_PLINK_STATE;
+
+		sinfo->llid = le16_to_cpu(sta->llid);
+		sinfo->plid = le16_to_cpu(sta->plid);
+		sinfo->plink_state = sta->plink_state;
+#endif
+	}
+
+	/* BSS parameters come from the interface, not from the station */
+	sinfo->bss_param.flags = 0;
+	if (sdata->vif.bss_conf.use_cts_prot)
+		sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT;
+	if (sdata->vif.bss_conf.use_short_preamble)
+		sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE;
+	if (sdata->vif.bss_conf.use_short_slot)
+		sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME;
+	sinfo->bss_param.dtim_period = sdata->local->hw.conf.ps_dtim_period;
+	sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int;
+}
+
+
+/*
+ * cfg80211 hook: report the station with index @idx on the interface
+ * behind @dev.  Its address is copied to @mac and its statistics to
+ * @sinfo.  Returns 0 on success and -ENOENT if there is no such station.
+ */
+static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
+				 int idx, u8 *mac, struct station_info *sinfo)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct sta_info *sta;
+	int err;
+
+	rcu_read_lock();
+
+	sta = sta_info_get_by_idx(sdata, idx);
+	if (!sta) {
+		err = -ENOENT;
+	} else {
+		err = 0;
+		memcpy(mac, sta->sta.addr, ETH_ALEN);
+		sta_set_sinfo(sta, sinfo);
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * cfg80211 hook: fetch survey entry @idx from the driver via
+ * drv_get_survey() and return its result.
+ */
+static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev,
+				 int idx, struct survey_info *survey)
+{
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+
+	return drv_get_survey(local, idx, survey);
+}
+
+/*
+ * cfg80211 hook: fill @sinfo for the station with address @mac.
+ * Returns 0 on success and -ENOENT if the station is not known.
+ */
+static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
+				 u8 *mac, struct station_info *sinfo)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct sta_info *sta;
+	int err;
+
+	rcu_read_lock();
+
+	sta = sta_info_get_bss(sdata, mac);
+	if (!sta) {
+		err = -ENOENT;
+	} else {
+		err = 0;
+		sta_set_sinfo(sta, sinfo);
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * This handles both adding a beacon and setting new beacon info.
+ *
+ * A new beacon_data block is built from @params, falling back to the
+ * current beacon for every part (head, tail, dtim period) that @params
+ * does not supply.  The new block is published with
+ * rcu_assign_pointer() and the old one freed after synchronize_rcu().
+ *
+ * Returns 0 on success, -EINVAL if the head is zero-length or if there
+ * is no current beacon and @params lacks a head or a dtim period, and
+ * -ENOMEM if the allocation fails.
+ *
+ * NOTE: the beacon interval is updated (and notified) before those
+ * checks, so it can have changed even when an error is returned.
+ */
+static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata,
+				   struct beacon_parameters *params)
+{
+	struct beacon_data *new, *old;
+	int new_head_len, new_tail_len;
+	int size;
+	int err = -EINVAL;
+
+	old = rtnl_dereference(sdata->u.ap.beacon);
+
+	/* head must not be zero-length */
+	if (params->head && !params->head_len)
+		return -EINVAL;
+
+	/*
+	 * This is a kludge. beacon interval should really be part
+	 * of the beacon information.
+	 */
+	if (params->interval &&
+	    (sdata->vif.bss_conf.beacon_int != params->interval)) {
+		sdata->vif.bss_conf.beacon_int = params->interval;
+		ieee80211_bss_info_change_notify(sdata,
+						 BSS_CHANGED_BEACON_INT);
+	}
+
+	/* Need to have a beacon head if we don't have one yet */
+	if (!params->head && !old)
+		return err;
+
+	/* sorry, no way to start beaconing without dtim period */
+	if (!params->dtim_period && !old)
+		return err;
+
+	/* new or old head? */
+	if (params->head)
+		new_head_len = params->head_len;
+	else
+		new_head_len = old->head_len;
+
+	/* new or old tail? */
+	if (params->tail || !old)
+		/* params->tail_len will be zero for !params->tail */
+		new_tail_len = params->tail_len;
+	else
+		new_tail_len = old->tail_len;
+
+	/* one allocation holds the descriptor plus head and tail data */
+	size = sizeof(*new) + new_head_len + new_tail_len;
+
+	new = kzalloc(size, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* start filling the new info now */
+
+	/* new or old dtim period? */
+	if (params->dtim_period)
+		new->dtim_period = params->dtim_period;
+	else
+		new->dtim_period = old->dtim_period;
+
+	/*
+	 * pointers go into the block we allocated,
+	 * memory is | beacon_data | head | tail |
+	 */
+	new->head = ((u8 *) new) + sizeof(*new);
+	new->tail = new->head + new_head_len;
+	new->head_len = new_head_len;
+	new->tail_len = new_tail_len;
+
+	/* copy in head */
+	if (params->head)
+		memcpy(new->head, params->head, new_head_len);
+	else
+		memcpy(new->head, old->head, new_head_len);
+
+	/* copy in optional tail */
+	if (params->tail)
+		memcpy(new->tail, params->tail, new_tail_len);
+	else
+		if (old)
+			memcpy(new->tail, old->tail, new_tail_len);
+
+	sdata->vif.bss_conf.dtim_period = new->dtim_period;
+
+	/* publish the new beacon, then wait for readers of the old one */
+	rcu_assign_pointer(sdata->u.ap.beacon, new);
+
+	synchronize_rcu();
+
+	kfree(old);
+
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED |
+						BSS_CHANGED_BEACON);
+	return 0;
+}
+
+/*
+ * cfg80211 hook: install the first beacon on the interface behind @dev.
+ * Returns -EALREADY if a beacon is set already, otherwise the result of
+ * ieee80211_config_beacon().
+ */
+static int ieee80211_add_beacon(struct wiphy *wiphy, struct net_device *dev,
+				struct beacon_parameters *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (rtnl_dereference(sdata->u.ap.beacon))
+		return -EALREADY;
+
+	return ieee80211_config_beacon(sdata, params);
+}
+
+/*
+ * cfg80211 hook: update the beacon of the interface behind @dev.
+ * Returns -ENOENT if no beacon is set yet, otherwise the result of
+ * ieee80211_config_beacon().
+ */
+static int ieee80211_set_beacon(struct wiphy *wiphy, struct net_device *dev,
+				struct beacon_parameters *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (!rtnl_dereference(sdata->u.ap.beacon))
+		return -ENOENT;
+
+	return ieee80211_config_beacon(sdata, params);
+}
+
+/*
+ * cfg80211 hook: remove the beacon of the interface behind @dev.
+ * The beacon pointer is cleared, the old data freed after
+ * synchronize_rcu() and the change notified.  Returns -ENOENT if no
+ * beacon was set, 0 otherwise.
+ */
+static int ieee80211_del_beacon(struct wiphy *wiphy, struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct beacon_data *beacon = rtnl_dereference(sdata->u.ap.beacon);
+
+	if (!beacon)
+		return -ENOENT;
+
+	/* unpublish first, free only after readers are done */
+	rcu_assign_pointer(sdata->u.ap.beacon, NULL);
+	synchronize_rcu();
+	kfree(beacon);
+
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+	return 0;
+}
+
+/*
+ * Layer 2 Update frame (802.2 Type 1 LLC XID Update response).
+ *
+ * The struct is __packed; ieee80211_send_layer2_update() fills it in
+ * place inside an skb and the values noted below are the ones it uses.
+ */
+struct iapp_layer2_update {
+	u8 da[ETH_ALEN];	/* broadcast */
+	u8 sa[ETH_ALEN];	/* STA addr */
+	__be16 len;		/* 6, big endian: size of the fields below */
+	u8 dsap;		/* 0 */
+	u8 ssap;		/* 0x01 as sent (NULL LSAP, response bit) */
+	u8 control;		/* 0xaf as sent (XID response) */
+	u8 xid_info[3];		/* 0x81, 1, 0 as sent */
+} __packed;
+
+/*
+ * Build a Layer 2 Update frame for @sta (broadcast destination, the
+ * station's address as source) and feed it into the receive path of
+ * the station's net_device via netif_rx_ni(), so that layer 2 bridge
+ * devices update their forwarding tables.  Nothing is sent if the skb
+ * allocation fails.
+ */
+static void ieee80211_send_layer2_update(struct sta_info *sta)
+{
+	struct iapp_layer2_update *frame;
+	struct net_device *ndev;
+	struct sk_buff *skb;
+
+	skb = dev_alloc_skb(sizeof(*frame));
+	if (!skb)
+		return;
+	frame = (struct iapp_layer2_update *)skb_put(skb, sizeof(*frame));
+
+	/*
+	 * 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier
+	 * (XID) Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1
+	 */
+	memset(frame->da, 0xff, ETH_ALEN);
+	memcpy(frame->sa, sta->sta.addr, ETH_ALEN);
+	frame->len = htons(6);
+	frame->dsap = 0;
+	/* NULL LSAP, CR Bit: Response */
+	frame->ssap = 0x01;
+	/*
+	 * XID response lsb.1111F101.
+	 * F=0 (no poll command; unsolicited frame)
+	 */
+	frame->control = 0xaf;
+	/* XID format identifier */
+	frame->xid_info[0] = 0x81;
+	/* LLC types/classes: Type 1 LLC */
+	frame->xid_info[1] = 1;
+	/* XID sender's receive window size (RW) */
+	frame->xid_info[2] = 0;
+
+	ndev = sta->sdata->dev;
+	skb->dev = ndev;
+	skb->protocol = eth_type_trans(skb, ndev);
+	memset(skb->cb, 0, sizeof(skb->cb));
+	netif_rx_ni(skb);
+}
+
+/*
+ * Apply the settings carried in @params to @sta.
+ *
+ * Handles, in this order: the station flags selected by
+ * params->sta_flags_mask (updated under sta->flaglock), the AID, the
+ * listen interval, the supported rates for the current operating band,
+ * the HT capabilities and, on mesh interfaces, the peer link state or
+ * peer link action.
+ *
+ * Fields of @params that are unset (zero AID, negative listen interval,
+ * NULL supported_rates / ht_capa) leave the corresponding station state
+ * untouched.
+ */
+static void sta_apply_parameters(struct ieee80211_local *local,
+				 struct sta_info *sta,
+				 struct station_parameters *params)
+{
+	unsigned long flags;
+	u32 rates;
+	int i, j;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	u32 mask, set;
+
+	/* band description of the channel we currently operate on */
+	sband = local->hw.wiphy->bands[local->oper_channel->band];
+
+	/*
+	 * For every flag named in the mask: clear it first, then set it
+	 * again only if the matching bit is present in the "set" word.
+	 */
+	spin_lock_irqsave(&sta->flaglock, flags);
+	mask = params->sta_flags_mask;
+	set = params->sta_flags_set;
+
+	if (mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
+		sta->flags &= ~WLAN_STA_AUTHORIZED;
+		if (set & BIT(NL80211_STA_FLAG_AUTHORIZED))
+			sta->flags |= WLAN_STA_AUTHORIZED;
+	}
+
+	if (mask & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE)) {
+		sta->flags &= ~WLAN_STA_SHORT_PREAMBLE;
+		if (set & BIT(NL80211_STA_FLAG_SHORT_PREAMBLE))
+			sta->flags |= WLAN_STA_SHORT_PREAMBLE;
+	}
+
+	if (mask & BIT(NL80211_STA_FLAG_WME)) {
+		sta->flags &= ~WLAN_STA_WME;
+		if (set & BIT(NL80211_STA_FLAG_WME))
+			sta->flags |= WLAN_STA_WME;
+	}
+
+	if (mask & BIT(NL80211_STA_FLAG_MFP)) {
+		sta->flags &= ~WLAN_STA_MFP;
+		if (set & BIT(NL80211_STA_FLAG_MFP))
+			sta->flags |= WLAN_STA_MFP;
+	}
+
+	if (mask & BIT(NL80211_STA_FLAG_AUTHENTICATED)) {
+		sta->flags &= ~WLAN_STA_AUTH;
+		if (set & BIT(NL80211_STA_FLAG_AUTHENTICATED))
+			sta->flags |= WLAN_STA_AUTH;
+	}
+	spin_unlock_irqrestore(&sta->flaglock, flags);
+
+	/*
+	 * cfg80211 validates this (1-2007) and allows setting the AID
+	 * only when creating a new station entry
+	 */
+	if (params->aid)
+		sta->sta.aid = params->aid;
+
+	/*
+	 * FIXME: updating the following information is racy when this
+	 *	  function is called from ieee80211_change_station().
+	 *	  However, all this information should be static so
+	 *	  maybe we should just reject attempts to change it.
+	 */
+
+	if (params->listen_interval >= 0)
+		sta->listen_interval = params->listen_interval;
+
+	if (params->supported_rates) {
+		rates = 0;
+
+		/*
+		 * Build a bitmap indexed like sband->bitrates[]: bit j is
+		 * set when some supplied rate matches bitrates[j].  The
+		 * supplied byte has its top bit masked off and is scaled
+		 * by 5 before comparing (presumably 500 kb/s units versus
+		 * 100 kb/s units - confirm against the band definition).
+		 */
+		for (i = 0; i < params->supported_rates_len; i++) {
+			int rate = (params->supported_rates[i] & 0x7f) * 5;
+			for (j = 0; j < sband->n_bitrates; j++) {
+				if (sband->bitrates[j].bitrate == rate)
+					rates |= BIT(j);
+			}
+		}
+		sta->sta.supp_rates[local->oper_channel->band] = rates;
+	}
+
+	if (params->ht_capa)
+		ieee80211_ht_cap_ie_to_sta_ht_cap(sband,
+						  params->ht_capa,
+						  &sta->sta.ht_cap);
+
+	if (ieee80211_vif_is_mesh(&sdata->vif)) {
+#ifdef CONFIG_MAC80211_MESH
+		/*
+		 * With IEEE80211_MESH_SEC_SECURED set, the requested peer
+		 * link state is stored directly (only LISTEN, ESTAB and
+		 * BLOCKED are accepted); otherwise the requested action is
+		 * carried out through the mesh_plink_*() helpers.
+		 */
+		if (sdata->u.mesh.security & IEEE80211_MESH_SEC_SECURED)
+			switch (params->plink_state) {
+			case NL80211_PLINK_LISTEN:
+			case NL80211_PLINK_ESTAB:
+			case NL80211_PLINK_BLOCKED:
+				sta->plink_state = params->plink_state;
+				break;
+			default:
+				/*  nothing  */
+				break;
+			}
+		else
+			switch (params->plink_action) {
+			case PLINK_ACTION_OPEN:
+				mesh_plink_open(sta);
+				break;
+			case PLINK_ACTION_BLOCK:
+				mesh_plink_block(sta);
+				break;
+			}
+#endif
+	}
+}
+
+/*
+ * Create a station entry for @mac.
+ *
+ * The entry is attached to params->vlan when one is given (it must be an
+ * AP or AP_VLAN interface), otherwise to @dev.  The interface's own
+ * address and multicast addresses are rejected with -EINVAL.  On AP and
+ * AP_VLAN interfaces a Layer 2 Update frame is sent once the station has
+ * been inserted.
+ *
+ * Returns 0 on success, -EINVAL, -ENOMEM, or the error reported by
+ * sta_info_insert_rcu().
+ */
+static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
+				 u8 *mac, struct station_parameters *params)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+	bool send_l2_update;
+	int ret;
+
+	if (!params->vlan) {
+		sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	} else {
+		sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
+
+		switch (sdata->vif.type) {
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_AP:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	if (compare_ether_addr(mac, sdata->vif.addr) == 0 ||
+	    is_multicast_ether_addr(mac))
+		return -EINVAL;
+
+	sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
+	if (sta == NULL)
+		return -ENOMEM;
+
+	sta->flags = WLAN_STA_AUTH | WLAN_STA_ASSOC;
+
+	sta_apply_parameters(local, sta, params);
+
+	rate_control_rate_init(sta);
+
+	/* decide before the insert whether a Layer 2 Update is wanted */
+	send_l2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+			 sdata->vif.type == NL80211_IFTYPE_AP;
+
+	/*
+	 * NOTE(review): this function never takes the RCU read lock itself
+	 * yet drops it on both paths below, so sta_info_insert_rcu() is
+	 * assumed to return with it held - confirm against its definition.
+	 */
+	ret = sta_info_insert_rcu(sta);
+	if (ret == 0 && send_l2_update)
+		ieee80211_send_layer2_update(sta);
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Remove the station @mac from @dev, or flush every station of the
+ * interface when @mac is NULL.
+ *
+ * Returns the result of sta_info_destroy_addr_bss() for a single
+ * station, 0 after a flush.
+ */
+static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev,
+				 u8 *mac)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	if (mac == NULL) {
+		sta_info_flush(local, sdata);
+		return 0;
+	}
+
+	return sta_info_destroy_addr_bss(sdata, mac);
+}
+
+/*
+ * Modify an existing station entry.
+ *
+ * When params->vlan names an interface other than the one the station
+ * currently belongs to, the station is moved there (the target must be
+ * an AP or AP_VLAN interface) and a Layer 2 Update frame is sent.  A
+ * 4-address VLAN can hold only one station.  Afterwards the remaining
+ * parameters are applied, and powersave is recalculated on station
+ * interfaces when the AUTHORIZED flag was part of the request.
+ *
+ * Returns 0 on success, -ENOENT if the station is unknown, -EINVAL for
+ * an unsuitable VLAN interface, -EBUSY if the 4-address VLAN is taken.
+ */
+static int ieee80211_change_station(struct wiphy *wiphy,
+				    struct net_device *dev,
+				    u8 *mac,
+				    struct station_parameters *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_sub_if_data *target;
+	struct sta_info *sta;
+	int err = 0;
+
+	rcu_read_lock();
+
+	sta = sta_info_get_bss(sdata, mac);
+	if (sta == NULL) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (params->vlan && params->vlan != sta->sdata->dev) {
+		target = IEEE80211_DEV_TO_SUB_IF(params->vlan);
+
+		if (target->vif.type != NL80211_IFTYPE_AP_VLAN &&
+		    target->vif.type != NL80211_IFTYPE_AP) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		if (params->vlan->ieee80211_ptr->use_4addr) {
+			/* a 4-address VLAN carries a single station */
+			if (target->u.vlan.sta) {
+				err = -EBUSY;
+				goto out_unlock;
+			}
+
+			rcu_assign_pointer(target->u.vlan.sta, sta);
+		}
+
+		sta->sdata = target;
+		ieee80211_send_layer2_update(sta);
+	}
+
+	sta_apply_parameters(local, sta, params);
+
+out_unlock:
+	rcu_read_unlock();
+
+	if (err)
+		return err;
+
+	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+	    params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED))
+		ieee80211_recalc_ps(local, -1);
+
+	return 0;
+}
+
+#ifdef CONFIG_MAC80211_MESH
+/*
+ * Add a mesh path to @dst and pin its next hop to the station
+ * @next_hop.
+ *
+ * Returns 0 on success, -ENOENT if @next_hop is not a known station,
+ * the error from mesh_path_add(), or -ENXIO if the freshly added path
+ * cannot be looked up again.
+ */
+static int ieee80211_add_mpath(struct wiphy *wiphy, struct net_device *dev,
+				 u8 *dst, u8 *next_hop)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct mesh_path *path;
+	struct sta_info *hop;
+	int ret;
+
+	rcu_read_lock();
+
+	hop = sta_info_get(sdata, next_hop);
+	if (hop == NULL) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	ret = mesh_path_add(dst, sdata);
+	if (ret)
+		goto unlock;
+
+	path = mesh_path_lookup(dst, sdata);
+	if (path == NULL) {
+		ret = -ENXIO;
+		goto unlock;
+	}
+
+	/* ret is 0 from mesh_path_add() at this point */
+	mesh_path_fix_nexthop(path, hop);
+
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Delete the mesh path to @dst, or flush all mesh paths of the
+ * interface when @dst is NULL.
+ *
+ * Returns the result of mesh_path_del() for a single path, 0 after a
+ * flush.
+ */
+static int ieee80211_del_mpath(struct wiphy *wiphy, struct net_device *dev,
+				 u8 *dst)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (dst == NULL) {
+		mesh_path_flush(sdata);
+		return 0;
+	}
+
+	return mesh_path_del(dst, sdata);
+}
+
+/*
+ * Re-point the existing mesh path to @dst at the station @next_hop.
+ *
+ * Returns 0 on success, -ENOENT if either the station or the path is
+ * unknown.
+ */
+static int ieee80211_change_mpath(struct wiphy *wiphy,
+				    struct net_device *dev,
+				    u8 *dst, u8 *next_hop)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct mesh_path *path;
+	struct sta_info *hop;
+	int ret = -ENOENT;
+
+	rcu_read_lock();
+
+	hop = sta_info_get(sdata, next_hop);
+	if (hop == NULL)
+		goto unlock;
+
+	path = mesh_path_lookup(dst, sdata);
+	if (path == NULL)
+		goto unlock;
+
+	mesh_path_fix_nexthop(path, hop);
+	ret = 0;
+
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Export the state of @mpath for reporting through cfg80211.
+ *
+ * @next_hop receives the address of the path's next hop station, or
+ * all zeroes when the path has none.  @pinfo receives the generation
+ * counter, queue length, sequence number, metric, remaining lifetime,
+ * discovery parameters and the path flags translated to their
+ * NL80211_MPATH_FLAG_* counterparts.
+ *
+ * Uses rcu_dereference(); both callers in this file hold
+ * rcu_read_lock() around the call.
+ */
+static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
+			    struct mpath_info *pinfo)
+{
+	struct sta_info *next_hop_sta = rcu_dereference(mpath->next_hop);
+
+	if (next_hop_sta)
+		memcpy(next_hop, next_hop_sta->sta.addr, ETH_ALEN);
+	else
+		memset(next_hop, 0, ETH_ALEN);
+
+	pinfo->generation = mesh_paths_generation;
+
+	pinfo->filled = MPATH_INFO_FRAME_QLEN |
+			MPATH_INFO_SN |
+			MPATH_INFO_METRIC |
+			MPATH_INFO_EXPTIME |
+			MPATH_INFO_DISCOVERY_TIMEOUT |
+			MPATH_INFO_DISCOVERY_RETRIES |
+			MPATH_INFO_FLAGS;
+
+	pinfo->frame_qlen = mpath->frame_queue.qlen;
+	pinfo->sn = mpath->sn;
+	pinfo->metric = mpath->metric;
+
+	/*
+	 * MPATH_INFO_EXPTIME is always advertised above, so always write
+	 * the field: an already expired path has no lifetime left.
+	 */
+	if (time_before(jiffies, mpath->exp_time))
+		pinfo->exptime = jiffies_to_msecs(mpath->exp_time - jiffies);
+	else
+		pinfo->exptime = 0;
+
+	pinfo->discovery_timeout =
+			jiffies_to_msecs(mpath->discovery_timeout);
+	pinfo->discovery_retries = mpath->discovery_retries;
+
+	/*
+	 * Translate the internal MESH_PATH_* bits one by one.  The result
+	 * must not be overwritten with the raw mpath->flags afterwards:
+	 * that would hand internal flag values to userspace and make the
+	 * translation pointless.
+	 */
+	pinfo->flags = 0;
+	if (mpath->flags & MESH_PATH_ACTIVE)
+		pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE;
+	if (mpath->flags & MESH_PATH_RESOLVING)
+		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING;
+	if (mpath->flags & MESH_PATH_SN_VALID)
+		pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID;
+	if (mpath->flags & MESH_PATH_FIXED)
+		pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
+	if (mpath->flags & MESH_PATH_RESOLVED)
+		pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
+}
+
+/*
+ * Look up the mesh path to @dst and report it.
+ *
+ * On success @dst is rewritten with the path's destination address and
+ * @next_hop / @pinfo are filled by mpath_set_pinfo().
+ *
+ * Returns 0 on success, -ENOENT if no such path exists.
+ */
+static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
+			       u8 *dst, u8 *next_hop, struct mpath_info *pinfo)
+
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct mesh_path *path;
+	int ret = -ENOENT;
+
+	rcu_read_lock();
+
+	path = mesh_path_lookup(dst, sdata);
+	if (path != NULL) {
+		memcpy(dst, path->dst, ETH_ALEN);
+		mpath_set_pinfo(path, next_hop, pinfo);
+		ret = 0;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Report the mesh path found at position @idx, for dump iteration.
+ *
+ * On success @dst receives the path's destination address and
+ * @next_hop / @pinfo are filled by mpath_set_pinfo().
+ *
+ * Returns 0 on success, -ENOENT when there is no path at @idx.
+ */
+static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
+				 int idx, u8 *dst, u8 *next_hop,
+				 struct mpath_info *pinfo)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct mesh_path *path;
+	int ret = -ENOENT;
+
+	rcu_read_lock();
+
+	path = mesh_path_lookup_by_idx(idx, sdata);
+	if (path != NULL) {
+		memcpy(dst, path->dst, ETH_ALEN);
+		mpath_set_pinfo(path, next_hop, pinfo);
+		ret = 0;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Copy the interface's current mesh configuration into @conf.
+ * Always returns 0.
+ */
+static int ieee80211_get_mesh_config(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct mesh_config *conf)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	const struct mesh_config *current_cfg = &sdata->u.mesh.mshcfg;
+
+	memcpy(conf, current_cfg, sizeof(*conf));
+
+	return 0;
+}
+
+/*
+ * Tell whether mesh config attribute @parm is selected in @mask.
+ * Attribute number N is represented by bit N-1 of the mask.
+ */
+static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask)
+{
+	u32 shifted = mask >> (parm - 1);
+
+	return shifted & 0x1;
+}
+
+/*
+ * Take over the mesh setup parameters in @setup into @ifmsh.
+ *
+ * The information elements are duplicated (the previous copy is
+ * freed), then the mesh ID, path selection protocol, path metric and
+ * security flags are copied.
+ *
+ * Returns 0 on success or -ENOMEM if the IEs cannot be duplicated, in
+ * which case @ifmsh is left unmodified.
+ */
+static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
+		const struct mesh_setup *setup)
+{
+	const u8 *stale_ie = ifmsh->ie;
+	u8 *ie_copy = NULL;
+
+	/* duplicate the information elements, if there are any */
+	if (setup->ie_len) {
+		ie_copy = kmemdup(setup->ie, setup->ie_len, GFP_KERNEL);
+		if (ie_copy == NULL)
+			return -ENOMEM;
+	}
+
+	/* swap in the new copy before releasing the old one */
+	ifmsh->ie_len = setup->ie_len;
+	ifmsh->ie = ie_copy;
+	kfree(stale_ie);
+
+	/* remaining setup parameters */
+	ifmsh->mesh_id_len = setup->mesh_id_len;
+	memcpy(ifmsh->mesh_id, setup->mesh_id, ifmsh->mesh_id_len);
+	ifmsh->mesh_pp_id = setup->path_sel_proto;
+	ifmsh->mesh_pm_id = setup->path_metric;
+
+	ifmsh->security = IEEE80211_MESH_SEC_NONE;
+	if (setup->is_authenticated)
+		ifmsh->security |= IEEE80211_MESH_SEC_AUTHED;
+	if (setup->is_secure)
+		ifmsh->security |= IEEE80211_MESH_SEC_SECURED;
+
+	return 0;
+}
+
+/*
+ * Update selected fields of the interface's mesh configuration.
+ *
+ * @mask selects which attributes to take from @nconf: attribute
+ * NL80211_MESHCONF_x is copied when _chg_mesh_attr() reports its bit
+ * set.  Changing the HWMP root mode additionally re-runs
+ * ieee80211_mesh_root_setup().
+ *
+ * Always returns 0.
+ */
+static int ieee80211_update_mesh_config(struct wiphy *wiphy,
+					struct net_device *dev, u32 mask,
+					const struct mesh_config *nconf)
+{
+	struct mesh_config *conf;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_if_mesh *ifmsh;
+
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	ifmsh = &sdata->u.mesh;
+
+	/* Set the config options which we are interested in setting */
+	conf = &(sdata->u.mesh.mshcfg);
+	if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask))
+		conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask))
+		conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask))
+		conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask))
+		conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks;
+	if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask))
+		conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries;
+	if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask))
+		conf->dot11MeshTTL = nconf->dot11MeshTTL;
+	/*
+	 * The element TTL has its own field; it must not be written into
+	 * dot11MeshTTL, which would clobber the frame TTL and leave the
+	 * element TTL unchanged.
+	 */
+	if (_chg_mesh_attr(NL80211_MESHCONF_ELEMENT_TTL, mask))
+		conf->element_ttl = nconf->element_ttl;
+	if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask))
+		conf->auto_open_plinks = nconf->auto_open_plinks;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask))
+		conf->dot11MeshHWMPmaxPREQretries =
+			nconf->dot11MeshHWMPmaxPREQretries;
+	if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask))
+		conf->path_refresh_time = nconf->path_refresh_time;
+	if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask))
+		conf->min_discovery_timeout = nconf->min_discovery_timeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask))
+		conf->dot11MeshHWMPactivePathTimeout =
+			nconf->dot11MeshHWMPactivePathTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask))
+		conf->dot11MeshHWMPpreqMinInterval =
+			nconf->dot11MeshHWMPpreqMinInterval;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+			   mask))
+		conf->dot11MeshHWMPnetDiameterTraversalTime =
+			nconf->dot11MeshHWMPnetDiameterTraversalTime;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ROOTMODE, mask)) {
+		conf->dot11MeshHWMPRootMode = nconf->dot11MeshHWMPRootMode;
+		/* the root mode takes effect through the root setup */
+		ieee80211_mesh_root_setup(ifmsh);
+	}
+	return 0;
+}
+
+/*
+ * Join a mesh: store @conf as the interface's mesh configuration, take
+ * over the setup parameters and start mesh operation.
+ *
+ * Returns 0 on success or the error from copy_mesh_setup(); the
+ * configuration has already been stored when that call fails.
+ */
+static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
+			       const struct mesh_config *conf,
+			       const struct mesh_setup *setup)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	int ret;
+
+	memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config));
+
+	ret = copy_mesh_setup(ifmsh, setup);
+	if (ret == 0)
+		ieee80211_start_mesh(sdata);
+
+	return ret;
+}
+
+/* Stop mesh operation on @dev.  Always returns 0. */
+static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
+{
+	ieee80211_stop_mesh(IEEE80211_DEV_TO_SUB_IF(dev));
+
+	return 0;
+}
+#endif
+
+/*
+ * Apply BSS parameter changes requested through cfg80211.
+ *
+ * Integer parameters in @params are only applied when they are >= 0,
+ * and basic_rates only when it is non-NULL.  Independently of @params,
+ * short slot time is switched on when the current channel is in the
+ * 5 GHz band and it is not yet enabled.  The accumulated BSS_CHANGED_*
+ * bits are reported through ieee80211_bss_info_change_notify().
+ *
+ * Always returns 0.
+ */
+static int ieee80211_change_bss(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct bss_parameters *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_bss_conf *bss = &sdata->vif.bss_conf;
+	u32 notify = 0;
+
+	if (params->use_cts_prot >= 0) {
+		bss->use_cts_prot = params->use_cts_prot;
+		notify |= BSS_CHANGED_ERP_CTS_PROT;
+	}
+
+	if (params->use_short_preamble >= 0) {
+		bss->use_short_preamble = params->use_short_preamble;
+		notify |= BSS_CHANGED_ERP_PREAMBLE;
+	}
+
+	/* 5 GHz defaults to short slot; an explicit request below wins */
+	if (!bss->use_short_slot &&
+	    sdata->local->hw.conf.channel->band == IEEE80211_BAND_5GHZ) {
+		bss->use_short_slot = true;
+		notify |= BSS_CHANGED_ERP_SLOT;
+	}
+
+	if (params->use_short_slot_time >= 0) {
+		bss->use_short_slot = params->use_short_slot_time;
+		notify |= BSS_CHANGED_ERP_SLOT;
+	}
+
+	if (params->basic_rates) {
+		struct ieee80211_local *local = wiphy_priv(wiphy);
+		struct ieee80211_supported_band *sband =
+			wiphy->bands[local->oper_channel->band];
+		u32 basic = 0;
+		int n, idx;
+
+		/* bitmap over sband->bitrates[] of the requested rates */
+		for (n = 0; n < params->basic_rates_len; n++) {
+			int wanted = (params->basic_rates[n] & 0x7f) * 5;
+
+			for (idx = 0; idx < sband->n_bitrates; idx++) {
+				if (sband->bitrates[idx].bitrate == wanted)
+					basic |= BIT(idx);
+			}
+		}
+
+		bss->basic_rates = basic;
+		notify |= BSS_CHANGED_BASIC_RATES;
+	}
+
+	if (params->ap_isolate >= 0) {
+		if (params->ap_isolate)
+			sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+		else
+			sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
+	}
+
+	if (params->ht_opmode >= 0) {
+		bss->ht_operation_mode = (u16) params->ht_opmode;
+		notify |= BSS_CHANGED_HT;
+	}
+
+	ieee80211_bss_info_change_notify(sdata, notify);
+
+	return 0;
+}
+
+/*
+ * Hand new TX queue parameters to the driver.
+ *
+ * Returns -EOPNOTSUPP if the driver has no conf_tx operation, -EINVAL
+ * if the driver rejects the parameters, 0 otherwise.
+ */
+static int ieee80211_set_txq_params(struct wiphy *wiphy,
+				    struct ieee80211_txq_params *params)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_tx_queue_params qparams;
+
+	if (local->ops->conf_tx == NULL)
+		return -EOPNOTSUPP;
+
+	memset(&qparams, 0, sizeof(qparams));
+	qparams.txop = params->txop;
+	qparams.cw_min = params->cwmin;
+	qparams.cw_max = params->cwmax;
+	qparams.aifs = params->aifs;
+
+	/*
+	 * Setting tx queue params disables u-apsd because it's only
+	 * called in master mode.
+	 */
+	qparams.uapsd = false;
+
+	if (drv_conf_tx(local, params->queue, &qparams) == 0)
+		return 0;
+
+	wiphy_debug(local->hw.wiphy,
+		    "failed to set TX queue parameters for queue %d\n",
+		    params->queue);
+
+	return -EINVAL;
+}
+
+/*
+ * Set the operating channel and channel type.
+ *
+ * @netdev may be NULL, in which case no interface specific state is
+ * consulted or notified.
+ *
+ * Returns -EBUSY when the channel mode is "hopping", when it is "fixed"
+ * on a different channel, or when ieee80211_set_channel_type() refuses
+ * the requested type; 0 otherwise (including the no-op case where the
+ * fixed channel and type already match and no interface was given).
+ */
+static int ieee80211_set_channel(struct wiphy *wiphy,
+				 struct net_device *netdev,
+				 struct ieee80211_channel *chan,
+				 enum nl80211_channel_type channel_type)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_sub_if_data *sdata = NULL;
+	struct ieee80211_channel *old_oper;
+	enum nl80211_channel_type old_oper_type;
+	enum nl80211_channel_type old_vif_oper_type= NL80211_CHAN_NO_HT;
+
+	if (netdev)
+		sdata = IEEE80211_DEV_TO_SUB_IF(netdev);
+
+	switch (ieee80211_get_channel_mode(local, NULL)) {
+	case CHAN_MODE_HOPPING:
+		return -EBUSY;
+	case CHAN_MODE_FIXED:
+		/* a fixed channel can only be "set" to itself */
+		if (local->oper_channel != chan)
+			return -EBUSY;
+		/* nothing to do if the type is unchanged as well */
+		if (!sdata && local->_oper_channel_type == channel_type)
+			return 0;
+		break;
+	case CHAN_MODE_UNDEFINED:
+		break;
+	}
+
+	/* snapshot the old types so changes can be detected afterwards */
+	if (sdata)
+		old_vif_oper_type = sdata->vif.bss_conf.channel_type;
+	old_oper_type = local->_oper_channel_type;
+
+	if (!ieee80211_set_channel_type(local, sdata, channel_type))
+		return -EBUSY;
+
+	old_oper = local->oper_channel;
+	local->oper_channel = chan;
+
+	/* Update driver if changes were actually made. */
+	if ((old_oper != local->oper_channel) ||
+	    (old_oper_type != local->_oper_channel_type))
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+
+	/* tell non-monitor interfaces when their channel type changed */
+	if ((sdata && sdata->vif.type != NL80211_IFTYPE_MONITOR) &&
+	    old_vif_oper_type != sdata->vif.bss_conf.channel_type)
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+/* Suspend entry point: forwards to __ieee80211_suspend(). */
+static int ieee80211_suspend(struct wiphy *wiphy,
+			     struct cfg80211_wowlan *wowlan)
+{
+	void *priv = wiphy_priv(wiphy);
+
+	return __ieee80211_suspend(priv, wowlan);
+}
+
+/* Resume entry point: forwards to __ieee80211_resume(). */
+static int ieee80211_resume(struct wiphy *wiphy)
+{
+	void *priv = wiphy_priv(wiphy);
+
+	return __ieee80211_resume(priv);
+}
+#else
+/* without power management the handlers are not provided */
+#define ieee80211_suspend NULL
+#define ieee80211_resume NULL
+#endif
+
+/*
+ * Start a scan on @dev if its interface type permits one.
+ *
+ * Station, ad-hoc, mesh point and P2P client interfaces may always
+ * scan.  A P2P GO may scan when the driver implements hw_scan;
+ * otherwise it is treated like an AP, which may only scan while no
+ * beacon is configured.  All other interface types are refused.
+ *
+ * Returns -EOPNOTSUPP when scanning is not allowed, otherwise the
+ * result of ieee80211_request_scan().
+ */
+static int ieee80211_scan(struct wiphy *wiphy,
+			  struct net_device *dev,
+			  struct cfg80211_scan_request *req)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	bool need_no_beacon = false;
+
+	switch (ieee80211_vif_type_p2p(&sdata->vif)) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	case NL80211_IFTYPE_P2P_GO:
+		/*
+		 * FIXME: implement NoA while scanning in software,
+		 * for now allow scanning only when beaconing hasn't
+		 * been configured yet (same rule as for an AP)
+		 */
+		need_no_beacon = !sdata->local->ops->hw_scan;
+		break;
+	case NL80211_IFTYPE_AP:
+		need_no_beacon = true;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (need_no_beacon && sdata->u.ap.beacon)
+		return -EOPNOTSUPP;
+
+	return ieee80211_request_scan(sdata, req);
+}
+
+/*
+ * Start a scheduled scan.  Returns -EOPNOTSUPP if the driver has no
+ * sched_scan_start operation, otherwise the result of
+ * ieee80211_request_sched_scan_start().
+ */
+static int
+ieee80211_sched_scan_start(struct wiphy *wiphy,
+			   struct net_device *dev,
+			   struct cfg80211_sched_scan_request *req)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (sdata->local->ops->sched_scan_start)
+		return ieee80211_request_sched_scan_start(sdata, req);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Stop a scheduled scan.  Returns -EOPNOTSUPP if the driver has no
+ * sched_scan_stop operation, otherwise the result of
+ * ieee80211_request_sched_scan_stop().
+ */
+static int
+ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (sdata->local->ops->sched_scan_stop)
+		return ieee80211_request_sched_scan_stop(sdata);
+
+	return -EOPNOTSUPP;
+}
+
+/* Authentication request: forwards to ieee80211_mgd_auth(). */
+static int ieee80211_auth(struct wiphy *wiphy, struct net_device *dev,
+			  struct cfg80211_auth_request *req)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	return ieee80211_mgd_auth(sdata, req);
+}
+
+/*
+ * Association request.
+ *
+ * Refused with -EBUSY while the channel mode is "hopping", or "fixed"
+ * on a channel other than the target BSS's channel.  Otherwise
+ * forwards to ieee80211_mgd_assoc().
+ */
+static int ieee80211_assoc(struct wiphy *wiphy, struct net_device *dev,
+			   struct cfg80211_assoc_request *req)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	switch (ieee80211_get_channel_mode(local, sdata)) {
+	case CHAN_MODE_HOPPING:
+		return -EBUSY;
+	case CHAN_MODE_FIXED:
+		if (local->oper_channel != req->bss->channel)
+			return -EBUSY;
+		break;
+	case CHAN_MODE_UNDEFINED:
+		break;
+	}
+
+	return ieee80211_mgd_assoc(sdata, req);
+}
+
+/* Deauthentication request: forwards to ieee80211_mgd_deauth(). */
+static int ieee80211_deauth(struct wiphy *wiphy, struct net_device *dev,
+			    struct cfg80211_deauth_request *req,
+			    void *cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	return ieee80211_mgd_deauth(sdata, req, cookie);
+}
+
+/* Disassociation request: forwards to ieee80211_mgd_disassoc(). */
+static int ieee80211_disassoc(struct wiphy *wiphy, struct net_device *dev,
+			      struct cfg80211_disassoc_request *req,
+			      void *cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	return ieee80211_mgd_disassoc(sdata, req, cookie);
+}
+
+/*
+ * Join an IBSS.
+ *
+ * Refused with -EBUSY while the channel mode is "hopping".  When the
+ * mode is "fixed" the request must itself ask for a fixed channel and
+ * that channel must be the current operating channel.  Otherwise
+ * forwards to ieee80211_ibss_join().
+ */
+static int ieee80211_join_ibss(struct wiphy *wiphy, struct net_device *dev,
+			       struct cfg80211_ibss_params *params)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	switch (ieee80211_get_channel_mode(local, sdata)) {
+	case CHAN_MODE_HOPPING:
+		return -EBUSY;
+	case CHAN_MODE_FIXED:
+		if (!params->channel_fixed ||
+		    local->oper_channel != params->channel)
+			return -EBUSY;
+		break;
+	case CHAN_MODE_UNDEFINED:
+		break;
+	}
+
+	return ieee80211_ibss_join(sdata, params);
+}
+
+/* Leave the IBSS: forwards to ieee80211_ibss_leave(). */
+static int ieee80211_leave_ibss(struct wiphy *wiphy, struct net_device *dev)
+{
+	return ieee80211_ibss_leave(IEEE80211_DEV_TO_SUB_IF(dev));
+}
+
+/*
+ * Push changed wiphy parameters to the driver.
+ *
+ * @changed is a mask of WIPHY_PARAM_* bits.  Fragmentation threshold,
+ * coverage class and RTS threshold are forwarded through their drv_*()
+ * calls in that order; the first failure is returned immediately.
+ * Retry limits are copied into the hardware configuration and applied
+ * with a single ieee80211_hw_config() call.
+ */
+static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	int ret;
+
+	if (changed & WIPHY_PARAM_FRAG_THRESHOLD) {
+		ret = drv_set_frag_threshold(local, wiphy->frag_threshold);
+		if (ret)
+			return ret;
+	}
+
+	if (changed & WIPHY_PARAM_COVERAGE_CLASS) {
+		ret = drv_set_coverage_class(local, wiphy->coverage_class);
+		if (ret)
+			return ret;
+	}
+
+	if (changed & WIPHY_PARAM_RTS_THRESHOLD) {
+		ret = drv_set_rts_threshold(local, wiphy->rts_threshold);
+		if (ret)
+			return ret;
+	}
+
+	if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) {
+		if (changed & WIPHY_PARAM_RETRY_SHORT)
+			local->hw.conf.short_frame_max_tx_count =
+				wiphy->retry_short;
+		if (changed & WIPHY_PARAM_RETRY_LONG)
+			local->hw.conf.long_frame_max_tx_count =
+				wiphy->retry_long;
+
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
+	}
+
+	return 0;
+}
+
+/*
+ * Set the user requested TX power.
+ *
+ * AUTOMATIC stores -1 as the user power level.  LIMITED and FIXED
+ * require a non-negative @mbm that is a multiple of 100 (else
+ * -EOPNOTSUPP) and store it converted with MBM_TO_DBM(); FIXED also
+ * rejects values above the current channel's max_power with -EINVAL.
+ * ieee80211_hw_config() is then called with an empty change mask.
+ */
+static int ieee80211_set_tx_power(struct wiphy *wiphy,
+				  enum nl80211_tx_power_setting type, int mbm)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_channel *chan = local->hw.conf.channel;
+
+	switch (type) {
+	case NL80211_TX_POWER_AUTOMATIC:
+		local->user_power_level = -1;
+		break;
+	case NL80211_TX_POWER_LIMITED:
+	case NL80211_TX_POWER_FIXED:
+		if (mbm < 0 || (mbm % 100))
+			return -EOPNOTSUPP;
+		/* TODO: move to cfg80211 when it knows the channel */
+		if (type == NL80211_TX_POWER_FIXED &&
+		    MBM_TO_DBM(mbm) > chan->max_power)
+			return -EINVAL;
+		local->user_power_level = MBM_TO_DBM(mbm);
+		break;
+	}
+
+	ieee80211_hw_config(local, 0);
+
+	return 0;
+}
+
+/*
+ * Report the power level from the current hardware configuration in
+ * @dbm.  Always returns 0.
+ */
+static int ieee80211_get_tx_power(struct wiphy *wiphy, int *dbm)
+{
+	const struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	*dbm = local->hw.conf.power_level;
+	return 0;
+}
+
+/*
+ * Store @addr (ETH_ALEN bytes) as the remote address of the WDS
+ * interface @dev.  Always returns 0.
+ */
+static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
+				  const u8 *addr)
+{
+	struct ieee80211_sub_if_data *wds = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	memcpy(&wds->u.wds.remote_addr, addr, ETH_ALEN);
+	return 0;
+}
+
+/* rfkill poll request: forwards to drv_rfkill_poll(). */
+static void ieee80211_rfkill_poll(struct wiphy *wiphy)
+{
+	struct ieee80211_local *hw_local = wiphy_priv(wiphy);
+
+	drv_rfkill_poll(hw_local);
+}
+
+#ifdef CONFIG_NL80211_TESTMODE
+/*
+ * Pass a testmode command to the driver.  Returns -EOPNOTSUPP if the
+ * driver has no testmode_cmd operation, otherwise the driver's result.
+ */
+static int ieee80211_testmode_cmd(struct wiphy *wiphy, void *data, int len)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	if (local->ops->testmode_cmd)
+		return local->ops->testmode_cmd(&local->hw, data, len);
+
+	return -EOPNOTSUPP;
+}
+#endif
+
+/*
+ * Record a new requested SMPS mode and, if needed, tell the AP.
+ *
+ * Must be called with sdata->u.mgd.mtx held.
+ *
+ * The request is stored first.  Nothing more happens if it equals the
+ * previous request and is not AUTOMATIC.  When not associated, or
+ * associated without HT, only ieee80211_recalc_smps() is run.
+ * Otherwise an SM PS action frame is sent to the AP; AUTOMATIC is
+ * resolved to DYNAMIC when powersave is enabled and to OFF when it is
+ * not.  If sending fails the previous request is restored.
+ *
+ * Returns 0 or the error from ieee80211_send_smps_action().
+ */
+int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
+			     enum ieee80211_smps_mode smps_mode)
+{
+	enum ieee80211_smps_mode prev_req;
+	const u8 *bssid;
+	int ret;
+
+	lockdep_assert_held(&sdata->u.mgd.mtx);
+
+	prev_req = sdata->u.mgd.req_smps;
+	sdata->u.mgd.req_smps = smps_mode;
+
+	/* unchanged explicit request: nothing to do */
+	if (smps_mode != IEEE80211_SMPS_AUTOMATIC && prev_req == smps_mode)
+		return 0;
+
+	/*
+	 * If not associated, or current association is not an HT
+	 * association, there's no need to send an action frame.
+	 */
+	if (!sdata->u.mgd.associated ||
+	    sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) {
+		mutex_lock(&sdata->local->iflist_mtx);
+		ieee80211_recalc_smps(sdata->local);
+		mutex_unlock(&sdata->local->iflist_mtx);
+		return 0;
+	}
+
+	bssid = sdata->u.mgd.associated->bssid;
+
+	/* automatic mode follows the powersave setting */
+	if (smps_mode == IEEE80211_SMPS_AUTOMATIC)
+		smps_mode = sdata->u.mgd.powersave ?
+				IEEE80211_SMPS_DYNAMIC : IEEE80211_SMPS_OFF;
+
+	/* send SM PS frame to AP */
+	ret = ieee80211_send_smps_action(sdata, smps_mode, bssid, bssid);
+	if (ret)
+		sdata->u.mgd.req_smps = prev_req;
+
+	return ret;
+}
+
+/*
+ * Enable or disable powersave and set the dynamic PS timeout.
+ *
+ * Only station interfaces on hardware with IEEE80211_HW_SUPPORTS_PS are
+ * accepted (-EOPNOTSUPP otherwise).  If neither setting changes the
+ * call is a no-op.  Otherwise the settings are stored, the SMPS
+ * request is re-evaluated, the driver is reconfigured when it supports
+ * dynamic PS, and powersave is recalculated.
+ */
+static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
+				    bool enabled, int timeout)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+
+	if (sdata->vif.type != NL80211_IFTYPE_STATION ||
+	    !(local->hw.flags & IEEE80211_HW_SUPPORTS_PS))
+		return -EOPNOTSUPP;
+
+	if (sdata->u.mgd.powersave == enabled &&
+	    local->dynamic_ps_forced_timeout == timeout)
+		return 0;
+
+	sdata->u.mgd.powersave = enabled;
+	local->dynamic_ps_forced_timeout = timeout;
+
+	/* no change, but if automatic follow powersave */
+	mutex_lock(&sdata->u.mgd.mtx);
+	__ieee80211_request_smps(sdata, sdata->u.mgd.req_smps);
+	mutex_unlock(&sdata->u.mgd.mtx);
+
+	if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+
+	ieee80211_recalc_ps(local, -1);
+
+	return 0;
+}
+
+/*
+ * Configure the connection quality monitor RSSI threshold and
+ * hysteresis.
+ *
+ * Unchanged values are a no-op.  New values are always stored.
+ * Without IEEE80211_HW_SUPPORTS_CQM_RSSI the call then succeeds only
+ * on station interfaces (-EOPNOTSUPP otherwise).  With hardware
+ * support the driver is notified right away if already associated.
+ */
+static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
+					 struct net_device *dev,
+					 s32 rssi_thold, u32 rssi_hyst)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+	if (bss_conf->cqm_rssi_thold == rssi_thold &&
+	    bss_conf->cqm_rssi_hyst == rssi_hyst)
+		return 0;
+
+	bss_conf->cqm_rssi_thold = rssi_thold;
+	bss_conf->cqm_rssi_hyst = rssi_hyst;
+
+	if (local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI) {
+		/* tell the driver upon association, unless already associated */
+		if (sdata->u.mgd.associated)
+			ieee80211_bss_info_change_notify(sdata,
+							 BSS_CHANGED_CQM);
+		return 0;
+	}
+
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/*
+ * Set the legacy bitrate mask for every band of the interface.
+ *
+ * When the hardware does its own rate control
+ * (IEEE80211_HW_HAS_RATE_CONTROL) the mask is offered to the driver
+ * first and a driver error aborts the call.  The per-band legacy masks
+ * are then stored in sdata->rc_rateidx_mask[].
+ */
+static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
+				      struct net_device *dev,
+				      const u8 *addr,
+				      const struct cfg80211_bitrate_mask *mask)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	int band;
+
+	if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) {
+		int err = drv_set_bitrate_mask(local, sdata, mask);
+
+		if (err)
+			return err;
+	}
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+		sdata->rc_rateidx_mask[band] = mask->control[band].legacy;
+
+	return 0;
+}
+
+/*
+ * Start a driver implemented remain-on-channel operation.
+ *
+ * Must be called with local->mtx held.  Only one such operation can be
+ * pending: -EBUSY is returned while a cookie is outstanding.  A random
+ * non-zero cookie is generated, reported through @cookie and recorded
+ * together with the request before the driver is called.  If the
+ * driver fails, the recorded channel and cookie are cleared again.
+ */
+static int ieee80211_remain_on_channel_hw(struct ieee80211_local *local,
+					  struct net_device *dev,
+					  struct ieee80211_channel *chan,
+					  enum nl80211_channel_type chantype,
+					  unsigned int duration, u64 *cookie)
+{
+	u32 roc_cookie;
+	int err;
+
+	lockdep_assert_held(&local->mtx);
+
+	if (local->hw_roc_cookie)
+		return -EBUSY;
+
+	/* must be nonzero */
+	roc_cookie = random32() | 1;
+	*cookie = roc_cookie;
+
+	local->hw_roc_dev = dev;
+	local->hw_roc_cookie = roc_cookie;
+	local->hw_roc_channel = chan;
+	local->hw_roc_channel_type = chantype;
+	local->hw_roc_duration = duration;
+
+	err = drv_remain_on_channel(local, chan, chantype, duration);
+	if (err == 0)
+		return 0;
+
+	/* driver refused: forget the pending operation */
+	local->hw_roc_channel = NULL;
+	local->hw_roc_cookie = 0;
+
+	return err;
+}
+
+/*
+ * Remain-on-channel request.
+ *
+ * Uses the driver's remain_on_channel operation when it has one (under
+ * local->mtx, clearing hw_roc_for_tx afterwards), otherwise falls back
+ * to ieee80211_wk_remain_on_channel().
+ */
+static int ieee80211_remain_on_channel(struct wiphy *wiphy,
+				       struct net_device *dev,
+				       struct ieee80211_channel *chan,
+				       enum nl80211_channel_type channel_type,
+				       unsigned int duration,
+				       u64 *cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	int err;
+
+	if (!local->ops->remain_on_channel)
+		return ieee80211_wk_remain_on_channel(sdata, chan,
+						      channel_type,
+						      duration, cookie);
+
+	mutex_lock(&local->mtx);
+	err = ieee80211_remain_on_channel_hw(local, dev, chan, channel_type,
+					     duration, cookie);
+	local->hw_roc_for_tx = false;
+	mutex_unlock(&local->mtx);
+
+	return err;
+}
+
+/*
+ * Cancel the pending driver implemented remain-on-channel operation.
+ *
+ * Must be called with local->mtx held.  Returns -ENOENT if @cookie
+ * does not match the pending operation, or the driver's error.  On
+ * success the recorded cookie and channel are cleared and the idle
+ * state is recalculated.
+ */
+static int ieee80211_cancel_remain_on_channel_hw(struct ieee80211_local *local,
+						 u64 cookie)
+{
+	int err;
+
+	lockdep_assert_held(&local->mtx);
+
+	if (cookie != local->hw_roc_cookie)
+		return -ENOENT;
+
+	err = drv_cancel_remain_on_channel(local);
+	if (err == 0) {
+		local->hw_roc_cookie = 0;
+		local->hw_roc_channel = NULL;
+
+		ieee80211_recalc_idle(local);
+	}
+
+	return err;
+}
+
+/*
+ * Cancel a remain-on-channel request.
+ *
+ * Uses the driver path (under local->mtx) when the driver has a
+ * cancel_remain_on_channel operation, otherwise falls back to
+ * ieee80211_wk_cancel_remain_on_channel().
+ */
+static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
+					      struct net_device *dev,
+					      u64 cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	int err;
+
+	if (!local->ops->cancel_remain_on_channel)
+		return ieee80211_wk_cancel_remain_on_channel(sdata, cookie);
+
+	mutex_lock(&local->mtx);
+	err = ieee80211_cancel_remain_on_channel_hw(local, cookie);
+	mutex_unlock(&local->mtx);
+
+	return err;
+}
+
+/*
+ * Completion handler for an off-channel TX work item.
+ *
+ * Reports the frame as not acknowledged when a wait time was requested
+ * and no status has been recorded in the work item.  Always returns
+ * WORK_DONE_DESTROY.
+ */
+static enum work_done_result
+ieee80211_offchan_tx_done(struct ieee80211_work *wk, struct sk_buff *skb)
+{
+	/*
+	 * Use the data embedded in the work struct for reporting
+	 * here so if the driver mangled the SKB before dropping
+	 * it (which is the only way we really should get here)
+	 * then we don't report mangled data.
+	 *
+	 * If there was no wait time, then by the time we get here
+	 * the driver will likely not have reported the status yet,
+	 * so in that case userspace will have to deal with it.
+	 */
+	if (!wk->offchan_tx.wait || wk->offchan_tx.status)
+		return WORK_DONE_DESTROY;
+
+	cfg80211_mgmt_tx_status(wk->sdata->dev,
+				(unsigned long) wk->offchan_tx.frame,
+				wk->ie, wk->ie_len, false, GFP_KERNEL);
+
+	return WORK_DONE_DESTROY;
+}
+
+/*
+ * cfg80211 mgmt_tx handler: send the management frame in @buf on @chan.
+ *
+ * @offchan tells whether the caller allows the frame to be sent on a
+ * channel other than the one currently in use; if it does not and the
+ * frame would have to go off-channel, -EBUSY is returned.  @wait is
+ * handed to the driver's offchannel_tx hook, used as the
+ * remain-on-channel duration (100 is substituted when it is zero), or
+ * stored in the queued work item, depending on the path taken.
+ *
+ * *@cookie is set to a value identifying the frame; it is what
+ * ieee80211_mgmt_tx_cancel_wait() later matches against.
+ *
+ * Returns 0 on success, -EBUSY, -ENOLINK (non-public action frame to a
+ * station that is not known on this interface), -EOPNOTSUPP (interface
+ * type not handled), -ENOMEM, or the error returned by the driver or by
+ * the remain-on-channel helper.
+ */
+static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev,
+			     struct ieee80211_channel *chan, bool offchan,
+			     enum nl80211_channel_type channel_type,
+			     bool channel_type_valid, unsigned int wait,
+			     const u8 *buf, size_t len, u64 *cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct sta_info *sta;
+	struct ieee80211_work *wk;
+	const struct ieee80211_mgmt *mgmt = (void *)buf;
+	u32 flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX |
+		    IEEE80211_TX_CTL_REQ_TX_STATUS;
+	bool is_offchan = false;
+
+	/* Check that we are on the requested channel for transmission */
+	if (chan != local->tmp_channel &&
+	    chan != local->oper_channel)
+		is_offchan = true;
+	if (channel_type_valid &&
+	    (channel_type != local->tmp_channel_type &&
+	     channel_type != local->_oper_channel_type))
+		is_offchan = true;
+
+	/* A HW remain-on-channel on @chan counts as being on the channel. */
+	if (chan == local->hw_roc_channel) {
+		/* TODO: check channel type? */
+		is_offchan = false;
+		flags |= IEEE80211_TX_CTL_TX_OFFCHAN;
+	}
+
+	if (is_offchan && !offchan)
+		return -EBUSY;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
+	case NL80211_IFTYPE_MESH_POINT:
+		/*
+		 * Only non-public action frames require the destination
+		 * to be a known station; the lookup result is used as a
+		 * presence check only.
+		 */
+		if (!ieee80211_is_action(mgmt->frame_control) ||
+		    mgmt->u.action.category == WLAN_CATEGORY_PUBLIC)
+			break;
+		rcu_read_lock();
+		sta = sta_info_get(sdata, mgmt->da);
+		rcu_read_unlock();
+		if (!sta)
+			return -ENOLINK;
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + len);
+	if (!skb)
+		return -ENOMEM;
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	memcpy(skb_put(skb, len), buf, len);
+
+	IEEE80211_SKB_CB(skb)->flags = flags;
+
+	skb->dev = sdata->dev;
+
+	/* The skb address doubles as the cookie handed back to the caller. */
+	*cookie = (unsigned long) skb;
+
+	if (is_offchan && local->ops->offchannel_tx) {
+		int ret;
+
+		IEEE80211_SKB_CB(skb)->band = chan->band;
+
+		mutex_lock(&local->mtx);
+
+		if (local->hw_offchan_tx_cookie) {
+			mutex_unlock(&local->mtx);
+			/*
+			 * The frame was never handed to anybody, so it
+			 * has to be released here or it is leaked.
+			 */
+			kfree_skb(skb);
+			return -EBUSY;
+		}
+
+		/* TODO: bitrate control, TX processing? */
+		ret = drv_offchannel_tx(local, skb, chan, channel_type, wait);
+
+		if (ret == 0)
+			local->hw_offchan_tx_cookie = *cookie;
+		mutex_unlock(&local->mtx);
+
+		/*
+		 * Allow driver to return 1 to indicate it wants to have the
+		 * frame transmitted with a remain_on_channel + regular TX.
+		 *
+		 * NOTE(review): for a negative return the skb is not freed
+		 * here; confirm the driver owns it in that case.
+		 */
+		if (ret != 1)
+			return ret;
+	}
+
+	if (is_offchan && local->ops->remain_on_channel) {
+		unsigned int duration;
+		int ret;
+
+		mutex_lock(&local->mtx);
+		/*
+		 * If the duration is zero, then the driver
+		 * wouldn't actually do anything. Set it to
+		 * 100 for now.
+		 *
+		 * TODO: cancel the off-channel operation
+		 *       when we get the SKB's TX status and
+		 *       the wait time was zero before.
+		 */
+		duration = 100;
+		if (wait)
+			duration = wait;
+		ret = ieee80211_remain_on_channel_hw(local, dev, chan,
+						     channel_type,
+						     duration, cookie);
+		if (ret) {
+			kfree_skb(skb);
+			mutex_unlock(&local->mtx);
+			return ret;
+		}
+
+		local->hw_roc_for_tx = true;
+		local->hw_roc_duration = wait;
+
+		/*
+		 * queue up frame for transmission after
+		 * ieee80211_ready_on_channel call
+		 */
+
+		/* modify cookie to prevent API mismatches */
+		*cookie ^= 2;
+		IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN;
+		local->hw_roc_skb = skb;
+		local->hw_roc_skb_for_status = skb;
+		mutex_unlock(&local->mtx);
+
+		return 0;
+	}
+
+	/*
+	 * Can transmit right away if the channel was the
+	 * right one and there's no wait involved... If a
+	 * wait is involved, we might otherwise not be on
+	 * the right channel for long enough!
+	 */
+	if (!is_offchan && !wait && !sdata->vif.bss_conf.idle) {
+		ieee80211_tx_skb(sdata, skb);
+		return 0;
+	}
+
+	/*
+	 * Otherwise queue an off-channel TX work item; a copy of the
+	 * frame is stored behind the work struct for status reporting.
+	 */
+	wk = kzalloc(sizeof(*wk) + len, GFP_KERNEL);
+	if (!wk) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+
+	wk->type = IEEE80211_WORK_OFFCHANNEL_TX;
+	wk->chan = chan;
+	wk->chan_type = channel_type;
+	wk->sdata = sdata;
+	wk->done = ieee80211_offchan_tx_done;
+	wk->offchan_tx.frame = skb;
+	wk->offchan_tx.wait = wait;
+	wk->ie_len = len;
+	memcpy(wk->ie, buf, len);
+
+	ieee80211_add_work(wk);
+	return 0;
+}
+
+/*
+ * cfg80211 mgmt_tx_cancel_wait handler: stop waiting on the frame
+ * identified by @cookie.
+ *
+ * Under local->mtx, the following are tried in order:
+ *  1. the driver's offchannel_tx_cancel_wait hook, when it exists and
+ *     @cookie equals local->hw_offchan_tx_cookie;
+ *  2. cancelling a HW remain-on-channel, when the driver implements
+ *     cancel_remain_on_channel (the pending roc skb is freed on success);
+ *  3. otherwise, expiring the matching IEEE80211_WORK_OFFCHANNEL_TX
+ *     work item of this interface immediately.
+ *
+ * Returns 0 on success, -ENOENT when no matching work item exists, or
+ * the error returned by the driver / remain-on-channel cancel helper.
+ */
+static int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
+					 struct net_device *dev,
+					 u64 cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_work *wk;
+	int ret = -ENOENT;
+
+	mutex_lock(&local->mtx);
+
+	if (local->ops->offchannel_tx_cancel_wait &&
+	    local->hw_offchan_tx_cookie == cookie) {
+		ret = drv_offchannel_tx_cancel_wait(local);
+
+		/* only forget the cookie once the driver has cancelled */
+		if (!ret)
+			local->hw_offchan_tx_cookie = 0;
+
+		mutex_unlock(&local->mtx);
+
+		return ret;
+	}
+
+	if (local->ops->cancel_remain_on_channel) {
+		/* undo the "^= 2" applied to the cookie by ieee80211_mgmt_tx() */
+		cookie ^= 2;
+		ret = ieee80211_cancel_remain_on_channel_hw(local, cookie);
+
+		if (ret == 0) {
+			kfree_skb(local->hw_roc_skb);
+			local->hw_roc_skb = NULL;
+			local->hw_roc_skb_for_status = NULL;
+		}
+
+		mutex_unlock(&local->mtx);
+
+		return ret;
+	}
+
+	list_for_each_entry(wk, &local->work_list, list) {
+		if (wk->sdata != sdata)
+			continue;
+
+		if (wk->type != IEEE80211_WORK_OFFCHANNEL_TX)
+			continue;
+
+		if (cookie != (unsigned long) wk->offchan_tx.frame)
+			continue;
+
+		/* make the work item time out now and kick the worker */
+		wk->timeout = jiffies;
+
+		ieee80211_queue_work(&local->hw, &local->work_work);
+		ret = 0;
+		break;
+	}
+	mutex_unlock(&local->mtx);
+
+	return ret;
+}
+
+/*
+ * cfg80211 mgmt_frame_register handler.
+ *
+ * Only probe request registrations are tracked: local->probe_req_reg is
+ * incremented when @reg is true and decremented otherwise, and the
+ * reconfig_filter work is queued.  Any other @frame_type is ignored.
+ */
+static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
+					  struct net_device *dev,
+					  u16 frame_type, bool reg)
+{
+	struct ieee80211_local *local;
+
+	if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ))
+		return;
+
+	local = wiphy_priv(wiphy);
+
+	if (!reg)
+		local->probe_req_reg--;
+	else
+		local->probe_req_reg++;
+
+	ieee80211_queue_work(&local->hw, &local->reconfig_filter);
+}
+
+/*
+ * cfg80211 set_antenna handler: forwards @tx_ant/@rx_ant to the driver.
+ * Refused with -EOPNOTSUPP while local->started is set.
+ */
+static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	if (!local->started)
+		return drv_set_antenna(local, tx_ant, rx_ant);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * cfg80211 get_antenna handler: asks the driver to fill in @tx_ant and
+ * @rx_ant and returns the driver's result.
+ */
+static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant)
+{
+	return drv_get_antenna(wiphy_priv(wiphy), tx_ant, rx_ant);
+}
+
+/*
+ * cfg80211 set_ringparam handler: passes the requested @tx and @rx ring
+ * parameters to the driver and returns the driver's result.
+ */
+static int ieee80211_set_ringparam(struct wiphy *wiphy, u32 tx, u32 rx)
+{
+	return drv_set_ringparam(wiphy_priv(wiphy), tx, rx);
+}
+
+/*
+ * cfg80211 get_ringparam handler: lets the driver fill in the current and
+ * maximum TX/RX ring parameters through the four output pointers.
+ */
+static void ieee80211_get_ringparam(struct wiphy *wiphy,
+				    u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max)
+{
+	drv_get_ringparam(wiphy_priv(wiphy), tx, tx_max, rx, rx_max);
+}
+
+/*
+ * Table of cfg80211 operations implemented by mac80211.  Each member
+ * points at the ieee80211_* handler implementing that cfg80211 callback.
+ */
+struct cfg80211_ops mac80211_config_ops = {
+	.add_virtual_intf = ieee80211_add_iface,
+	.del_virtual_intf = ieee80211_del_iface,
+	.change_virtual_intf = ieee80211_change_iface,
+	.add_key = ieee80211_add_key,
+	.del_key = ieee80211_del_key,
+	.get_key = ieee80211_get_key,
+	.set_default_key = ieee80211_config_default_key,
+	.set_default_mgmt_key = ieee80211_config_default_mgmt_key,
+	.add_beacon = ieee80211_add_beacon,
+	.set_beacon = ieee80211_set_beacon,
+	.del_beacon = ieee80211_del_beacon,
+	.add_station = ieee80211_add_station,
+	.del_station = ieee80211_del_station,
+	.change_station = ieee80211_change_station,
+	.get_station = ieee80211_get_station,
+	.dump_station = ieee80211_dump_station,
+	.dump_survey = ieee80211_dump_survey,
+	/* mesh operations are only provided when mesh support is built in */
+#ifdef CONFIG_MAC80211_MESH
+	.add_mpath = ieee80211_add_mpath,
+	.del_mpath = ieee80211_del_mpath,
+	.change_mpath = ieee80211_change_mpath,
+	.get_mpath = ieee80211_get_mpath,
+	.dump_mpath = ieee80211_dump_mpath,
+	.update_mesh_config = ieee80211_update_mesh_config,
+	.get_mesh_config = ieee80211_get_mesh_config,
+	.join_mesh = ieee80211_join_mesh,
+	.leave_mesh = ieee80211_leave_mesh,
+#endif
+	.change_bss = ieee80211_change_bss,
+	.set_txq_params = ieee80211_set_txq_params,
+	.set_channel = ieee80211_set_channel,
+	.suspend = ieee80211_suspend,
+	.resume = ieee80211_resume,
+	.scan = ieee80211_scan,
+	.sched_scan_start = ieee80211_sched_scan_start,
+	.sched_scan_stop = ieee80211_sched_scan_stop,
+	.auth = ieee80211_auth,
+	.assoc = ieee80211_assoc,
+	.deauth = ieee80211_deauth,
+	.disassoc = ieee80211_disassoc,
+	.join_ibss = ieee80211_join_ibss,
+	.leave_ibss = ieee80211_leave_ibss,
+	.set_wiphy_params = ieee80211_set_wiphy_params,
+	.set_tx_power = ieee80211_set_tx_power,
+	.get_tx_power = ieee80211_get_tx_power,
+	.set_wds_peer = ieee80211_set_wds_peer,
+	.rfkill_poll = ieee80211_rfkill_poll,
+	/* macro supplies its own member designator; no trailing comma here */
+	CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd)
+	.set_power_mgmt = ieee80211_set_power_mgmt,
+	.set_bitrate_mask = ieee80211_set_bitrate_mask,
+	.remain_on_channel = ieee80211_remain_on_channel,
+	.cancel_remain_on_channel = ieee80211_cancel_remain_on_channel,
+	.mgmt_tx = ieee80211_mgmt_tx,
+	.mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait,
+	.set_cqm_rssi_config = ieee80211_set_cqm_rssi_config,
+	.mgmt_frame_register = ieee80211_mgmt_frame_register,
+	.set_antenna = ieee80211_set_antenna,
+	.get_antenna = ieee80211_get_antenna,
+	.set_ringparam = ieee80211_set_ringparam,
+	.get_ringparam = ieee80211_get_ringparam,
+};
diff --git a/net/mac80211/cfg.h b/net/mac80211/cfg.h
new file mode 100644
index 00000000..7d7879f5
--- /dev/null
+++ b/net/mac80211/cfg.h
@@ -0,0 +1,9 @@
+/*
+ * mac80211 configuration hooks for cfg80211
+ */
+#ifndef __CFG_H
+#define __CFG_H
+
+/* cfg80211 operations table provided by mac80211 */
+extern struct cfg80211_ops mac80211_config_ops;
+
+#endif /* __CFG_H */
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
new file mode 100644
index 00000000..889c3e93
--- /dev/null
+++ b/net/mac80211/chan.c
@@ -0,0 +1,130 @@
+/*
+ * mac80211 - channel management
+ */
+
+#include <linux/nl80211.h>
+#include "ieee80211_i.h"
+
+/*
+ * Determine the channel mode implied by the interfaces on @local, not
+ * counting @ignore.  The caller must hold local->iflist_mtx.
+ *
+ * Interfaces that are not running, monitor interfaces, unassociated
+ * stations, IBSS interfaces without an SSID and APs without a beacon do
+ * not count.  The first interface that does count decides the result:
+ * CHAN_MODE_HOPPING for an IBSS without a fixed channel, CHAN_MODE_FIXED
+ * otherwise.  CHAN_MODE_UNDEFINED is returned when no interface counts.
+ */
+static enum ieee80211_chan_mode
+__ieee80211_get_channel_mode(struct ieee80211_local *local,
+			     struct ieee80211_sub_if_data *ignore)
+{
+	struct ieee80211_sub_if_data *iface;
+
+	lockdep_assert_held(&local->iflist_mtx);
+
+	list_for_each_entry(iface, &local->interfaces, list) {
+		if (iface == ignore || !ieee80211_sdata_running(iface))
+			continue;
+
+		switch (iface->vif.type) {
+		case NL80211_IFTYPE_MONITOR:
+			continue;
+		case NL80211_IFTYPE_STATION:
+			if (!iface->u.mgd.associated)
+				continue;
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			if (!iface->u.ibss.ssid_len)
+				continue;
+			if (!iface->u.ibss.fixed_channel)
+				return CHAN_MODE_HOPPING;
+			break;
+		case NL80211_IFTYPE_AP:
+			if (!iface->u.ap.beacon)
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		return CHAN_MODE_FIXED;
+	}
+
+	return CHAN_MODE_UNDEFINED;
+}
+
+/*
+ * Locked wrapper around __ieee80211_get_channel_mode(): takes
+ * local->iflist_mtx for the duration of the lookup and returns its result.
+ */
+enum ieee80211_chan_mode
+ieee80211_get_channel_mode(struct ieee80211_local *local,
+			   struct ieee80211_sub_if_data *ignore)
+{
+	enum ieee80211_chan_mode chan_mode;
+
+	mutex_lock(&local->iflist_mtx);
+	chan_mode = __ieee80211_get_channel_mode(local, ignore);
+	mutex_unlock(&local->iflist_mtx);
+
+	return chan_mode;
+}
+
+/*
+ * Try to switch @sdata (may be NULL) to channel type @chantype.
+ *
+ * The channel types of all other running interfaces are combined into a
+ * "super" channel type first.  The request is rejected (false) when that
+ * combined type is HT40+ or HT40- and @chantype is the opposite HT40
+ * variant.  Otherwise local->_oper_channel_type is updated, @sdata's
+ * bss_conf.channel_type is set to @chantype when @sdata is given, and
+ * true is returned.  local->iflist_mtx is held throughout.
+ */
+bool ieee80211_set_channel_type(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				enum nl80211_channel_type chantype)
+{
+	struct ieee80211_sub_if_data *tmp;
+	enum nl80211_channel_type superchan = NL80211_CHAN_NO_HT;
+	bool result;
+
+	mutex_lock(&local->iflist_mtx);
+
+	/* fold the channel types of all other running interfaces together */
+	list_for_each_entry(tmp, &local->interfaces, list) {
+		if (tmp == sdata)
+			continue;
+
+		if (!ieee80211_sdata_running(tmp))
+			continue;
+
+		switch (tmp->vif.bss_conf.channel_type) {
+		case NL80211_CHAN_NO_HT:
+		case NL80211_CHAN_HT20:
+			/* never replace the combined type with a smaller value */
+			if (superchan > tmp->vif.bss_conf.channel_type)
+				break;
+
+			superchan = tmp->vif.bss_conf.channel_type;
+			break;
+		case NL80211_CHAN_HT40PLUS:
+			/* two interfaces must not use opposite HT40 variants */
+			WARN_ON(superchan == NL80211_CHAN_HT40MINUS);
+			superchan = NL80211_CHAN_HT40PLUS;
+			break;
+		case NL80211_CHAN_HT40MINUS:
+			WARN_ON(superchan == NL80211_CHAN_HT40PLUS);
+			superchan = NL80211_CHAN_HT40MINUS;
+			break;
+		}
+	}
+
+	/* now check the requested type against the combined one */
+	switch (superchan) {
+	case NL80211_CHAN_NO_HT:
+	case NL80211_CHAN_HT20:
+		/*
+		 * allow any change that doesn't go to no-HT
+		 * (if it already is no-HT no change is needed)
+		 */
+		if (chantype == NL80211_CHAN_NO_HT)
+			break;
+		superchan = chantype;
+		break;
+	case NL80211_CHAN_HT40PLUS:
+	case NL80211_CHAN_HT40MINUS:
+		/* allow smaller bandwidth and same */
+		if (chantype == NL80211_CHAN_NO_HT)
+			break;
+		if (chantype == NL80211_CHAN_HT20)
+			break;
+		if (superchan == chantype)
+			break;
+		result = false;
+		goto out;
+	}
+
+	local->_oper_channel_type = superchan;
+
+	if (sdata)
+		sdata->vif.bss_conf.channel_type = chantype;
+
+	result = true;
+ out:
+	mutex_unlock(&local->iflist_mtx);
+
+	return result;
+}
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
new file mode 100644
index 00000000..186e02f7
--- /dev/null
+++ b/net/mac80211/debugfs.c
@@ -0,0 +1,526 @@
+
+/*
+ * mac80211 debugfs for wireless PHYs
+ *
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * GPLv2
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/rtnetlink.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "debugfs.h"
+
+/*
+ * Generic debugfs open handler: copies the inode's private pointer into
+ * file->private_data so the read/write handlers can retrieve it.
+ * Always returns 0.
+ */
+int mac80211_open_file_generic(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+/* size of the on-stack buffer used by mac80211_format_buffer() */
+#define DEBUGFS_FORMAT_BUFFER_SIZE 100
+
+/*
+ * Format a printf-style message into an on-stack buffer of
+ * DEBUGFS_FORMAT_BUFFER_SIZE bytes and hand it to userspace through
+ * simple_read_from_buffer().  The formatted text is bounded by the buffer
+ * size (vscnprintf).  Returns what simple_read_from_buffer() returns.
+ */
+int mac80211_format_buffer(char __user *userbuf, size_t count,
+				  loff_t *ppos, char *fmt, ...)
+{
+	va_list args;
+	char buf[DEBUGFS_FORMAT_BUFFER_SIZE];
+	int res;
+
+	va_start(args, fmt);
+	res = vscnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+
+/*
+ * Define <name>_read(): prints the given value(s) using @fmt followed by
+ * a newline.  The value expressions may refer to the variable "local",
+ * which is the struct ieee80211_local taken from file->private_data.
+ */
+#define DEBUGFS_READONLY_FILE_FN(name, fmt, value...)			\
+static ssize_t name## _read(struct file *file, char __user *userbuf,	\
+			    size_t count, loff_t *ppos)			\
+{									\
+	struct ieee80211_local *local = file->private_data;		\
+									\
+	return mac80211_format_buffer(userbuf, count, ppos, 		\
+				      fmt "\n", ##value);		\
+}
+
+/* Define <name>_ops, a read-only file_operations using <name>_read(). */
+#define DEBUGFS_READONLY_FILE_OPS(name)			\
+static const struct file_operations name## _ops = {			\
+	.read = name## _read,						\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+};
+
+/* Define both the read function and the file_operations for <name>. */
+#define DEBUGFS_READONLY_FILE(name, fmt, value...)		\
+	DEBUGFS_READONLY_FILE_FN(name, fmt, value)		\
+	DEBUGFS_READONLY_FILE_OPS(name)
+
+/*
+ * Create debugfs file <name> with mode 0400 using <name>_ops.  Expects
+ * the variables "phyd" (parent dentry) and "local" to be in scope.
+ */
+#define DEBUGFS_ADD(name)						\
+	debugfs_create_file(#name, 0400, phyd, local, &name## _ops);
+
+/* Same as DEBUGFS_ADD() but with an explicit file mode. */
+#define DEBUGFS_ADD_MODE(name, mode)					\
+	debugfs_create_file(#name, mode, phyd, local, &name## _ops);
+
+
+/* simple read-only files, each printing one value taken from "local" */
+DEBUGFS_READONLY_FILE(user_power, "%d",
+		      local->user_power_level);
+DEBUGFS_READONLY_FILE(power, "%d",
+		      local->hw.conf.power_level);
+DEBUGFS_READONLY_FILE(frequency, "%d",
+		      local->hw.conf.channel->center_freq);
+DEBUGFS_READONLY_FILE(total_ps_buffered, "%d",
+		      local->total_ps_buffered);
+/* only the low 24 bits of wep_iv are shown */
+DEBUGFS_READONLY_FILE(wep_iv, "%#08x",
+		      local->wep_iv & 0xffffff);
+/* prints "hw/driver" when no rate control algorithm is attached */
+DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s",
+	local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver");
+
+/*
+ * debugfs "tsf" read handler: queries the TSF from the driver and prints
+ * it as a 16-digit hexadecimal number followed by a newline.
+ */
+static ssize_t tsf_read(struct file *file, char __user *user_buf,
+			     size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	unsigned long long now = drv_get_tsf(local);
+
+	return mac80211_format_buffer(user_buf, count, ppos, "0x%016llx\n",
+				      now);
+}
+
+/*
+ * debugfs "tsf" write handler.
+ *
+ * Input starting with "reset" resets the TSF through the driver (when the
+ * driver implements reset_tsf); anything else is parsed as a number
+ * (base auto-detected) and written as the new TSF (when the driver
+ * implements set_tsf).  At most sizeof(buf) - 1 bytes of input are
+ * examined.
+ *
+ * Returns @count, or -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t tsf_write(struct file *file,
+			 const char __user *user_buf,
+			 size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	unsigned long long tsf;
+	char buf[100];
+	size_t len;
+
+	len = min(count, sizeof(buf) - 1);
+	if (copy_from_user(buf, user_buf, len))
+		return -EFAULT;
+	buf[len] = '\0';
+
+	if (strncmp(buf, "reset", 5) == 0) {
+		if (local->ops->reset_tsf) {
+			drv_reset_tsf(local);
+			wiphy_info(local->hw.wiphy, "debugfs reset TSF\n");
+		}
+	} else {
+		/*
+		 * The TSF is a 64-bit quantity; parse it as unsigned long
+		 * long so that the upper half is not lost where unsigned
+		 * long is only 32 bits wide.
+		 */
+		tsf = simple_strtoull(buf, NULL, 0);
+		if (local->ops->set_tsf) {
+			drv_set_tsf(local, tsf);
+			wiphy_info(local->hw.wiphy,
+				   "debugfs set TSF to %#018llx\n", tsf);
+
+		}
+	}
+
+	return count;
+}
+
+/* file operations for the readable and writable "tsf" debugfs file */
+static const struct file_operations tsf_ops = {
+	.read = tsf_read,
+	.write = tsf_write,
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
+};
+
+/*
+ * debugfs "reset" write handler: any write runs a suspend followed by a
+ * resume of the hardware under the RTNL lock.  The written data itself is
+ * not examined and the results of suspend/resume are not checked.
+ * Always returns @count.
+ */
+static ssize_t reset_write(struct file *file, const char __user *user_buf,
+			   size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+
+	rtnl_lock();
+	__ieee80211_suspend(&local->hw, NULL);
+	__ieee80211_resume(&local->hw);
+	rtnl_unlock();
+
+	return count;
+}
+
+/* file operations for the write-only "reset" debugfs file */
+static const struct file_operations reset_ops = {
+	.write = reset_write,
+	.open = mac80211_open_file_generic,
+	.llseek = noop_llseek,
+};
+
+/* debugfs "noack" read handler: prints local->wifi_wme_noack_test. */
+static ssize_t noack_read(struct file *file, char __user *user_buf,
+			  size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *priv = file->private_data;
+
+	return mac80211_format_buffer(user_buf, count, ppos, "%d\n",
+				      priv->wifi_wme_noack_test);
+}
+
+/*
+ * debugfs "noack" write handler: parses the input as a number (base
+ * auto-detected) and stores whether it is non-zero in
+ * local->wifi_wme_noack_test.  At most sizeof(input) - 1 bytes are read.
+ *
+ * Returns @count, or -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t noack_write(struct file *file,
+			   const char __user *user_buf,
+			   size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *priv = file->private_data;
+	char input[10];
+	size_t n = min(count, sizeof(input) - 1);
+
+	if (copy_from_user(input, user_buf, n))
+		return -EFAULT;
+	input[n] = '\0';
+
+	priv->wifi_wme_noack_test = simple_strtoul(input, NULL, 0) != 0;
+
+	return count;
+}
+
+/* file operations for the readable and writable "noack" debugfs file */
+static const struct file_operations noack_ops = {
+	.read = noack_read,
+	.write = noack_write,
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
+};
+
+/* debugfs "uapsd_queues" read handler: prints local->uapsd_queues in hex. */
+static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf,
+				 size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *priv = file->private_data;
+
+	return mac80211_format_buffer(user_buf, count, ppos, "0x%x\n",
+				      priv->uapsd_queues);
+}
+
+/*
+ * debugfs "uapsd_queues" write handler: parses the input strictly as an
+ * unsigned number (base auto-detected) and stores it in
+ * local->uapsd_queues.
+ *
+ * Returns @count on success, -EFAULT if the user buffer cannot be read,
+ * -EINVAL if the input is not a valid number and -ERANGE if it has bits
+ * set outside IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK.
+ */
+static ssize_t uapsd_queues_write(struct file *file,
+				  const char __user *user_buf,
+				  size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *priv = file->private_data;
+	unsigned long queues;
+	char input[10];
+	size_t n = min(count, sizeof(input) - 1);
+
+	if (copy_from_user(input, user_buf, n))
+		return -EFAULT;
+	input[n] = '\0';
+
+	if (strict_strtoul(input, 0, &queues))
+		return -EINVAL;
+
+	if (queues & ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK)
+		return -ERANGE;
+
+	priv->uapsd_queues = queues;
+
+	return count;
+}
+
+/* file operations for the readable and writable "uapsd_queues" debugfs file */
+static const struct file_operations uapsd_queues_ops = {
+	.read = uapsd_queues_read,
+	.write = uapsd_queues_write,
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
+};
+
+/*
+ * debugfs "uapsd_max_sp_len" read handler: prints
+ * local->uapsd_max_sp_len in hex.
+ */
+static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf,
+				     size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *priv = file->private_data;
+
+	return mac80211_format_buffer(user_buf, count, ppos, "0x%x\n",
+				      priv->uapsd_max_sp_len);
+}
+
+/*
+ * debugfs "uapsd_max_sp_len" write handler: parses the input strictly as
+ * an unsigned number (base auto-detected) and stores it in
+ * local->uapsd_max_sp_len.
+ *
+ * Returns @count on success, -EFAULT if the user buffer cannot be read,
+ * -EINVAL if the input is not a valid number and -ERANGE if it has bits
+ * set outside IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK.
+ */
+static ssize_t uapsd_max_sp_len_write(struct file *file,
+				      const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *priv = file->private_data;
+	unsigned long sp_len;
+	char input[10];
+	size_t n = min(count, sizeof(input) - 1);
+
+	if (copy_from_user(input, user_buf, n))
+		return -EFAULT;
+	input[n] = '\0';
+
+	if (strict_strtoul(input, 0, &sp_len))
+		return -EINVAL;
+
+	if (sp_len & ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK)
+		return -ERANGE;
+
+	priv->uapsd_max_sp_len = sp_len;
+
+	return count;
+}
+
+/* file operations for the readable and writable "uapsd_max_sp_len" file */
+static const struct file_operations uapsd_max_sp_len_ops = {
+	.read = uapsd_max_sp_len_read,
+	.write = uapsd_max_sp_len_write,
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
+};
+
+/*
+ * debugfs "channel_type" read handler: prints a textual name for
+ * local->hw.conf.channel_type.  Unknown values print "???"; note that
+ * this fallback, unlike the other strings, has no trailing newline.
+ */
+static ssize_t channel_type_read(struct file *file, char __user *user_buf,
+		       size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	const char *buf;
+
+	switch (local->hw.conf.channel_type) {
+	case NL80211_CHAN_NO_HT:
+		buf = "no ht\n";
+		break;
+	case NL80211_CHAN_HT20:
+		buf = "ht20\n";
+		break;
+	case NL80211_CHAN_HT40MINUS:
+		buf = "ht40-\n";
+		break;
+	case NL80211_CHAN_HT40PLUS:
+		buf = "ht40+\n";
+		break;
+	default:
+		buf = "???";
+		break;
+	}
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
+}
+
+/*
+ * debugfs "hwflags" read handler: prints local->hw.flags in hex on the
+ * first line, followed by one line naming each known flag that is set.
+ *
+ * The text is built in a temporary heap buffer.  scnprintf() is used so
+ * that the running offset can never move past the end of that buffer;
+ * output that does not fit is cut off.
+ *
+ * Returns the result of simple_read_from_buffer(), or -ENOMEM when the
+ * temporary buffer cannot be allocated.
+ */
+static ssize_t hwflags_read(struct file *file, char __user *user_buf,
+			    size_t count, loff_t *ppos)
+{
+	/* flag bit -> name, in the order the names are printed */
+	static const struct {
+		u32 flag;
+		const char *name;
+	} hwflag_names[] = {
+		{ IEEE80211_HW_HAS_RATE_CONTROL, "HAS_RATE_CONTROL" },
+		{ IEEE80211_HW_RX_INCLUDES_FCS, "RX_INCLUDES_FCS" },
+		{ IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING,
+		  "HOST_BCAST_PS_BUFFERING" },
+		{ IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE,
+		  "2GHZ_SHORT_SLOT_INCAPABLE" },
+		{ IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE,
+		  "2GHZ_SHORT_PREAMBLE_INCAPABLE" },
+		{ IEEE80211_HW_SIGNAL_UNSPEC, "SIGNAL_UNSPEC" },
+		{ IEEE80211_HW_SIGNAL_DBM, "SIGNAL_DBM" },
+		{ IEEE80211_HW_NEED_DTIM_PERIOD, "NEED_DTIM_PERIOD" },
+		{ IEEE80211_HW_SPECTRUM_MGMT, "SPECTRUM_MGMT" },
+		{ IEEE80211_HW_AMPDU_AGGREGATION, "AMPDU_AGGREGATION" },
+		{ IEEE80211_HW_SUPPORTS_PS, "SUPPORTS_PS" },
+		{ IEEE80211_HW_PS_NULLFUNC_STACK, "PS_NULLFUNC_STACK" },
+		{ IEEE80211_HW_SUPPORTS_DYNAMIC_PS, "SUPPORTS_DYNAMIC_PS" },
+		{ IEEE80211_HW_MFP_CAPABLE, "MFP_CAPABLE" },
+		{ IEEE80211_HW_BEACON_FILTER, "BEACON_FILTER" },
+		{ IEEE80211_HW_SUPPORTS_STATIC_SMPS, "SUPPORTS_STATIC_SMPS" },
+		{ IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS, "SUPPORTS_DYNAMIC_SMPS" },
+		{ IEEE80211_HW_SUPPORTS_UAPSD, "SUPPORTS_UAPSD" },
+		{ IEEE80211_HW_REPORTS_TX_ACK_STATUS, "REPORTS_TX_ACK_STATUS" },
+		{ IEEE80211_HW_CONNECTION_MONITOR, "CONNECTION_MONITOR" },
+		{ IEEE80211_HW_SUPPORTS_CQM_RSSI, "SUPPORTS_CQM_RSSI" },
+		{ IEEE80211_HW_SUPPORTS_PER_STA_GTK, "SUPPORTS_PER_STA_GTK" },
+		{ IEEE80211_HW_AP_LINK_PS, "AP_LINK_PS" },
+	};
+	struct ieee80211_local *local = file->private_data;
+	int mxln = 500;
+	ssize_t rv;
+	char *buf;
+	int sf = 0; /* how many written so far */
+	int i;
+
+	buf = kzalloc(mxln, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	sf += scnprintf(buf, mxln - sf, "0x%x\n", local->hw.flags);
+	for (i = 0; i < ARRAY_SIZE(hwflag_names); i++) {
+		if (!(local->hw.flags & hwflag_names[i].flag))
+			continue;
+		sf += scnprintf(buf + sf, mxln - sf, "%s\n",
+				hwflag_names[i].name);
+	}
+
+	rv = simple_read_from_buffer(user_buf, count, ppos, buf, sf);
+	kfree(buf);
+	return rv;
+}
+
+/*
+ * debugfs "queues" read handler: prints one line per hardware queue with
+ * the queue number, its stop-reason bitmap and the number of pending
+ * frames.  The snapshot is taken under queue_stop_reason_lock.
+ *
+ * The on-stack buffer provides 20 bytes per queue while a single line
+ * can be longer than that, so scnprintf() is used to keep the writes
+ * inside the buffer; output that does not fit is cut off.
+ */
+static ssize_t queues_read(struct file *file, char __user *user_buf,
+			   size_t count, loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	unsigned long flags;
+	char buf[IEEE80211_MAX_QUEUES * 20];
+	int q, res = 0;
+
+	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+	for (q = 0; q < local->hw.queues; q++)
+		res += scnprintf(buf + res, sizeof(buf) - res,
+				 "%02d: %#.8lx/%d\n", q,
+				 local->queue_stop_reasons[q],
+				 skb_queue_len(&local->pending[q]));
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, res);
+}
+
+/* file_operations for the hand-written read handlers above */
+DEBUGFS_READONLY_FILE_OPS(hwflags);
+DEBUGFS_READONLY_FILE_OPS(channel_type);
+DEBUGFS_READONLY_FILE_OPS(queues);
+
+/* statistics stuff */
+
+/*
+ * Common read helper for the driver-provided statistics files: fetches
+ * the low-level stats from the driver under the RTNL lock, lets
+ * @printvalue format one field into a small stack buffer and copies the
+ * text to userspace.  Returns the driver's error if fetching failed.
+ */
+static ssize_t format_devstat_counter(struct ieee80211_local *local,
+	char __user *userbuf,
+	size_t count, loff_t *ppos,
+	int (*printvalue)(struct ieee80211_low_level_stats *stats, char *buf,
+			  int buflen))
+{
+	struct ieee80211_low_level_stats ll_stats;
+	char text[20];
+	int err, text_len;
+
+	rtnl_lock();
+	err = drv_get_stats(local, &ll_stats);
+	rtnl_unlock();
+	if (err)
+		return err;
+
+	text_len = printvalue(&ll_stats, text, sizeof(text));
+	return simple_read_from_buffer(userbuf, count, ppos, text, text_len);
+}
+
+/*
+ * For low-level statistics field <name>, define:
+ *  - print_devstats_<name>(): formats stats-><name> as "%u\n";
+ *  - stats_<name>_read(): read handler built on format_devstat_counter();
+ *  - stats_<name>_ops: read-only file_operations using that handler.
+ */
+#define DEBUGFS_DEVSTATS_FILE(name)					\
+static int print_devstats_##name(struct ieee80211_low_level_stats *stats,\
+				 char *buf, int buflen)			\
+{									\
+	return scnprintf(buf, buflen, "%u\n", stats->name);		\
+}									\
+static ssize_t stats_ ##name## _read(struct file *file,			\
+				     char __user *userbuf,		\
+				     size_t count, loff_t *ppos)	\
+{									\
+	return format_devstat_counter(file->private_data,		\
+				      userbuf,				\
+				      count,				\
+				      ppos,				\
+				      print_devstats_##name);		\
+}									\
+									\
+static const struct file_operations stats_ ##name## _ops = {		\
+	.read = stats_ ##name## _read,					\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+};
+
+/*
+ * Create a 0400 u32 debugfs file <name> backed directly by @field.
+ * Expects the variable "statsd" (parent dentry) to be in scope.
+ */
+#define DEBUGFS_STATS_ADD(name, field)					\
+	debugfs_create_u32(#name, 0400, statsd, (u32 *) &field);
+/*
+ * Create a 0400 debugfs file <name> using stats_<name>_ops.  Expects the
+ * variables "statsd" and "local" to be in scope.
+ */
+#define DEBUGFS_DEVSTATS_ADD(name)					\
+	debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops);
+
+/* read handlers and file_operations for the driver-provided counters */
+DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount);
+DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount);
+DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount);
+DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount);
+
+/*
+ * Populate the wiphy's debugfs directory with the mac80211 per-PHY files.
+ *
+ * Does nothing when the wiphy has no debugfs directory.  Creates the
+ * "keys" directory (remembered in local->debugfs.keys), the individual
+ * control/status files and a "statistics" sub-directory with the counter
+ * files.  Apart from the "statistics" directory, the results of the
+ * debugfs_create_*() calls are not checked.
+ */
+void debugfs_hw_add(struct ieee80211_local *local)
+{
+	struct dentry *phyd = local->hw.wiphy->debugfsdir;
+	struct dentry *statsd;
+
+	if (!phyd)
+		return;
+
+	local->debugfs.keys = debugfs_create_dir("keys", phyd);
+
+	/* all files are created 0400 except "reset", which is 0200 */
+	DEBUGFS_ADD(frequency);
+	DEBUGFS_ADD(total_ps_buffered);
+	DEBUGFS_ADD(wep_iv);
+	DEBUGFS_ADD(tsf);
+	DEBUGFS_ADD(queues);
+	DEBUGFS_ADD_MODE(reset, 0200);
+	DEBUGFS_ADD(noack);
+	DEBUGFS_ADD(uapsd_queues);
+	DEBUGFS_ADD(uapsd_max_sp_len);
+	DEBUGFS_ADD(channel_type);
+	DEBUGFS_ADD(hwflags);
+	DEBUGFS_ADD(user_power);
+	DEBUGFS_ADD(power);
+
+	statsd = debugfs_create_dir("statistics", phyd);
+
+	/* if the dir failed, don't put all the other things into the root! */
+	if (!statsd)
+		return;
+
+	/* counters kept in struct ieee80211_local, exported as plain u32 */
+	DEBUGFS_STATS_ADD(transmitted_fragment_count,
+		local->dot11TransmittedFragmentCount);
+	DEBUGFS_STATS_ADD(multicast_transmitted_frame_count,
+		local->dot11MulticastTransmittedFrameCount);
+	DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount);
+	DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount);
+	DEBUGFS_STATS_ADD(multiple_retry_count,
+		local->dot11MultipleRetryCount);
+	DEBUGFS_STATS_ADD(frame_duplicate_count,
+		local->dot11FrameDuplicateCount);
+	DEBUGFS_STATS_ADD(received_fragment_count,
+		local->dot11ReceivedFragmentCount);
+	DEBUGFS_STATS_ADD(multicast_received_frame_count,
+		local->dot11MulticastReceivedFrameCount);
+	DEBUGFS_STATS_ADD(transmitted_frame_count,
+		local->dot11TransmittedFrameCount);
+	/* extra TX/RX handler counters, only with debug counters enabled */
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
+	DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop);
+	DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_unencrypted,
+		local->tx_handlers_drop_unencrypted);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_fragment,
+		local->tx_handlers_drop_fragment);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_wep,
+		local->tx_handlers_drop_wep);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc,
+		local->tx_handlers_drop_not_assoc);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port,
+		local->tx_handlers_drop_unauth_port);
+	DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop);
+	DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc,
+		local->rx_handlers_drop_nullfunc);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_defrag,
+		local->rx_handlers_drop_defrag);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_short,
+		local->rx_handlers_drop_short);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_passive_scan,
+		local->rx_handlers_drop_passive_scan);
+	DEBUGFS_STATS_ADD(tx_expand_skb_head,
+		local->tx_expand_skb_head);
+	DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned,
+		local->tx_expand_skb_head_cloned);
+	DEBUGFS_STATS_ADD(rx_expand_skb_head,
+		local->rx_expand_skb_head);
+	DEBUGFS_STATS_ADD(rx_expand_skb_head2,
+		local->rx_expand_skb_head2);
+	DEBUGFS_STATS_ADD(rx_handlers_fragments,
+		local->rx_handlers_fragments);
+	DEBUGFS_STATS_ADD(tx_status_drop,
+		local->tx_status_drop);
+#endif
+	/* counters that are fetched from the driver on every read */
+	DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount);
+	DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount);
+	DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount);
+	DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount);
+}
diff --git a/net/mac80211/debugfs.h b/net/mac80211/debugfs.h
new file mode 100644
index 00000000..7c875296
--- /dev/null
+++ b/net/mac80211/debugfs.h
@@ -0,0 +1,15 @@
+#ifndef __MAC80211_DEBUGFS_H
+#define __MAC80211_DEBUGFS_H
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+/* create the per-PHY debugfs files for @local */
+extern void debugfs_hw_add(struct ieee80211_local *local);
+/* generic open handler: stores inode->i_private in file->private_data */
+extern int mac80211_open_file_generic(struct inode *inode, struct file *file);
+/* printf-style formatting into a user buffer for debugfs read handlers */
+extern int mac80211_format_buffer(char __user *userbuf, size_t count,
+				  loff_t *ppos, char *fmt, ...);
+#else
+/* without debugfs support only debugfs_hw_add() is stubbed out */
+static inline void debugfs_hw_add(struct ieee80211_local *local)
+{
+}
+#endif
+
+#endif /* __MAC80211_DEBUGFS_H */
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
new file mode 100644
index 00000000..33c58b85
--- /dev/null
+++ b/net/mac80211/debugfs_key.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright 2003-2005	Devicescape Software, Inc.
+ * Copyright (c) 2006	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kobject.h>
+#include <linux/slab.h>
+#include "ieee80211_i.h"
+#include "key.h"
+#include "debugfs.h"
+#include "debugfs_key.h"
+
+#define KEY_READ(name, prop, format_string)				\
+static ssize_t key_##name##_read(struct file *file,			\
+				 char __user *userbuf,			\
+				 size_t count, loff_t *ppos)		\
+{									\
+	struct ieee80211_key *key = file->private_data;			\
+	return mac80211_format_buffer(userbuf, count, ppos, 		\
+				      format_string, key->prop);	\
+}
+#define KEY_READ_D(name) KEY_READ(name, name, "%d\n")
+#define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n")
+
+#define KEY_OPS(name)							\
+static const struct file_operations key_ ##name## _ops = {		\
+	.read = key_##name##_read,					\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+}
+
+#define KEY_FILE(name, format)						\
+		 KEY_READ_##format(name)				\
+		 KEY_OPS(name)
+
+#define KEY_CONF_READ(name, format_string)				\
+	KEY_READ(conf_##name, conf.name, format_string)
+#define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n")
+
+#define KEY_CONF_OPS(name)						\
+static const struct file_operations key_ ##name## _ops = {		\
+	.read = key_conf_##name##_read,					\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+}
+
+#define KEY_CONF_FILE(name, format)					\
+		 KEY_CONF_READ_##format(name)				\
+		 KEY_CONF_OPS(name)
+
+KEY_CONF_FILE(keylen, D);
+KEY_CONF_FILE(keyidx, D);
+KEY_CONF_FILE(hw_key_idx, D);
+KEY_FILE(flags, X);
+KEY_FILE(tx_rx_count, D);
+KEY_READ(ifindex, sdata->name, "%s\n");
+KEY_OPS(ifindex);
+
+static ssize_t key_algorithm_read(struct file *file, char __user *userbuf,
+				  size_t count, loff_t *ppos)
+{
+	struct ieee80211_key *key = file->private_data;
+	u32 c = key->conf.cipher;
+	char buf[15];	/* "xx-xx-xx:ddd\n" plus the NUL needs 14 bytes */
+	/* cipher suite selector: three hex bytes, then the low byte in decimal */
+	int len = scnprintf(buf, sizeof(buf), "%.2x-%.2x-%.2x:%d\n", c >> 24,
+			    (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff);
+
+	return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(algorithm);
+
+/* debugfs read: transmit IV/PN of the key, formatted according to cipher. */
+static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
+				size_t count, loff_t *ppos)
+{
+	struct ieee80211_key *key = file->private_data;
+	const u8 *pn = NULL;
+	char out[20];
+	int n = 0;
+
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		n = scnprintf(out, sizeof(out), "\n");	/* newline only */
+		break;
+	case WLAN_CIPHER_SUITE_TKIP:
+		n = scnprintf(out, sizeof(out), "%08x %04x\n",
+			      key->u.tkip.tx.iv32, key->u.tkip.tx.iv16);
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		pn = key->u.ccmp.tx_pn;
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		pn = key->u.aes_cmac.tx_pn;
+		break;
+	default:
+		return 0;	/* other ciphers: nothing to show */
+	}
+
+	/* CCMP and AES-CMAC both print their packet number as six hex bytes */
+	if (pn)
+		n = scnprintf(out, sizeof(out), "%02x%02x%02x%02x%02x%02x\n",
+			      pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]);
+	return simple_read_from_buffer(userbuf, count, ppos, out, n);
+}
+KEY_OPS(tx_spec);
+
+static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
+				size_t count, loff_t *ppos)
+{	/* debugfs read: receive IV/PN state of the key, formatted per cipher */
+	struct ieee80211_key *key = file->private_data;
+	char buf[14*NUM_RX_DATA_QUEUES+1], *p = buf;	/* p: next free byte */
+	int i, len;
+	const u8 *rpn;
+
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		len = scnprintf(buf, sizeof(buf), "\n");	/* newline only */
+		break;
+	case WLAN_CIPHER_SUITE_TKIP:	/* one "iv32 iv16" line per rx[] entry */
+		for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
+			p += scnprintf(p, sizeof(buf)+buf-p,	/* space left */
+				       "%08x %04x\n",
+				       key->u.tkip.rx[i].iv32,
+				       key->u.tkip.rx[i].iv16);
+		len = p - buf;
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:	/* one line per rx_pn[] entry */
+		for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) {	/* note the +1 */
+			rpn = key->u.ccmp.rx_pn[i];
+			p += scnprintf(p, sizeof(buf)+buf-p,
+				       "%02x%02x%02x%02x%02x%02x\n",
+				       rpn[0], rpn[1], rpn[2],
+				       rpn[3], rpn[4], rpn[5]);
+		}
+		len = p - buf;
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:	/* a single packet number */
+		rpn = key->u.aes_cmac.rx_pn;
+		p += scnprintf(p, sizeof(buf)+buf-p,
+			       "%02x%02x%02x%02x%02x%02x\n",
+			       rpn[0], rpn[1], rpn[2],
+			       rpn[3], rpn[4], rpn[5]);
+		len = p - buf;
+		break;
+	default:
+		return 0;	/* other ciphers: nothing to show */
+	}
+	return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(rx_spec);
+
+/* debugfs read: replay counter of a CCMP or AES-CMAC key. */
+static ssize_t key_replays_read(struct file *file, char __user *userbuf,
+				size_t count, loff_t *ppos)
+{
+	struct ieee80211_key *key = file->private_data;
+	u32 cipher = key->conf.cipher;
+	char text[20];
+	int n;
+
+	if (cipher == WLAN_CIPHER_SUITE_CCMP)
+		n = scnprintf(text, sizeof(text), "%u\n",
+			      key->u.ccmp.replays);
+	else if (cipher == WLAN_CIPHER_SUITE_AES_CMAC)
+		n = scnprintf(text, sizeof(text), "%u\n",
+			      key->u.aes_cmac.replays);
+	else
+		return 0;	/* no replay counter shown for other ciphers */
+
+	return simple_read_from_buffer(userbuf, count, ppos, text, n);
+}
+KEY_OPS(replays);
+
+static ssize_t key_icverrors_read(struct file *file, char __user *userbuf,
+				  size_t count, loff_t *ppos)
+{
+	struct ieee80211_key *key = file->private_data;
+	char buf[20];
+	int len;
+
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		len = scnprintf(buf, sizeof(buf), "%u\n",
+				key->u.aes_cmac.icverrors);
+		break;
+	default:
+		return 0;
+	}
+	return simple_read_from_buffer(userbuf, count, ppos, buf, len);
+}
+KEY_OPS(icverrors);
+
+static ssize_t key_key_read(struct file *file, char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	struct ieee80211_key *key = file->private_data;
+	int idx, used = 0, size = 2 * key->conf.keylen + 2; /* hex, "\n", NUL */
+	char *hex = kmalloc(size, GFP_KERNEL);
+	ssize_t ret;
+
+	if (!hex)
+		return -ENOMEM;
+
+	for (idx = 0; idx < key->conf.keylen; idx++)
+		used += scnprintf(hex + used, size - used, "%02x",
+				  key->conf.key[idx]);
+	used += scnprintf(hex + used, size - used, "\n");
+	ret = simple_read_from_buffer(userbuf, count, ppos, hex, used);
+	kfree(hex);
+	return ret;
+}
+KEY_OPS(key);
+
+#define DEBUGFS_ADD(name) \
+	debugfs_create_file(#name, 0400, key->debugfs.dir, \
+			    key, &key_##name##_ops);
+
+/* Create the debugfs directory of a key and fill in its attribute files. */
+void ieee80211_debugfs_key_add(struct ieee80211_key *key)
+{
+	static int keycount;
+	char buf[100];
+	struct sta_info *sta = key->sta;
+
+	if (!key->local->debugfs.keys)
+		return;
+
+	snprintf(buf, sizeof(buf), "%d", keycount);
+	key->debugfs.cnt = keycount;
+	keycount++;
+	key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys);
+	if (!key->debugfs.dir)
+		return;
+
+	if (sta) {
+		/* "stations" is created below "netdev:<name>", so link there */
+		snprintf(buf, sizeof(buf), "../../netdev:%s/stations/%pM",
+			 sta->sdata->name, sta->sta.addr);
+		key->debugfs.stalink =
+			debugfs_create_symlink("station", key->debugfs.dir, buf);
+	}
+
+	DEBUGFS_ADD(keylen);
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(keyidx);
+	DEBUGFS_ADD(hw_key_idx);
+	DEBUGFS_ADD(tx_rx_count);
+	DEBUGFS_ADD(algorithm);
+	DEBUGFS_ADD(tx_spec);
+	DEBUGFS_ADD(rx_spec);
+	DEBUGFS_ADD(replays);
+	DEBUGFS_ADD(icverrors);
+	DEBUGFS_ADD(key);
+	DEBUGFS_ADD(ifindex);
+}
+
+/* Tear down the whole debugfs directory of a key; a NULL key is ignored. */
+void ieee80211_debugfs_key_remove(struct ieee80211_key *key)
+{
+	if (key) {
+		debugfs_remove_recursive(key->debugfs.dir);
+		key->debugfs.dir = NULL;
+	}
+}
+
+/* Re-point the default unicast/multicast key symlinks of an interface. */
+void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata)
+{
+	char buf[50];
+	struct ieee80211_key *key;
+
+	if (!sdata->debugfs.dir)
+		return;
+
+	lockdep_assert_held(&sdata->local->key_mtx);
+
+	/* Drop any previous link first: its name would make the create fail. */
+	debugfs_remove(sdata->debugfs.default_unicast_key);
+	sdata->debugfs.default_unicast_key = NULL;
+	if (sdata->default_unicast_key) {
+		key = key_mtx_dereference(sdata->local,
+					  sdata->default_unicast_key);
+		sprintf(buf, "../keys/%d", key->debugfs.cnt);
+		sdata->debugfs.default_unicast_key =
+			debugfs_create_symlink("default_unicast_key",
+					       sdata->debugfs.dir, buf);
+	}
+
+	debugfs_remove(sdata->debugfs.default_multicast_key);
+	sdata->debugfs.default_multicast_key = NULL;
+	if (sdata->default_multicast_key) {
+		key = key_mtx_dereference(sdata->local,
+					  sdata->default_multicast_key);
+		sprintf(buf, "../keys/%d", key->debugfs.cnt);
+		sdata->debugfs.default_multicast_key =
+			debugfs_create_symlink("default_multicast_key",
+					       sdata->debugfs.dir, buf);
+	}
+}
+
+/* Refresh the default_mgmt_key link, removing the old one so create works. */
+void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata)
+{
+	char buf[50];
+	struct ieee80211_key *key;
+
+	if (!sdata->debugfs.dir)
+		return;
+
+	ieee80211_debugfs_key_remove_mgmt_default(sdata);
+	key = key_mtx_dereference(sdata->local, sdata->default_mgmt_key);
+	if (key) {
+		sprintf(buf, "../keys/%d", key->debugfs.cnt);
+		sdata->debugfs.default_mgmt_key =
+			debugfs_create_symlink("default_mgmt_key",
+					       sdata->debugfs.dir, buf);
+	}
+}
+
+/* Remove the default_mgmt_key symlink of an interface; NULL sdata is a no-op. */
+void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata)
+{
+	if (sdata) {
+		debugfs_remove(sdata->debugfs.default_mgmt_key);
+		sdata->debugfs.default_mgmt_key = NULL;
+	}
+}
+
+void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
+				   struct sta_info *sta)	/* sta is not used */
+{
+	debugfs_remove(key->debugfs.stalink);	/* drop the "station" symlink */
+	key->debugfs.stalink = NULL;
+}
diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h
new file mode 100644
index 00000000..32adc77e
--- /dev/null
+++ b/net/mac80211/debugfs_key.h
@@ -0,0 +1,33 @@
+#ifndef __MAC80211_DEBUGFS_KEY_H
+#define __MAC80211_DEBUGFS_KEY_H
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void ieee80211_debugfs_key_add(struct ieee80211_key *key);
+void ieee80211_debugfs_key_remove(struct ieee80211_key *key);
+void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_add_mgmt_default(
+	struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_remove_mgmt_default(
+	struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
+				   struct sta_info *sta);
+#else
+static inline void ieee80211_debugfs_key_add(struct ieee80211_key *key)
+{}
+static inline void ieee80211_debugfs_key_remove(struct ieee80211_key *key)
+{}
+static inline void ieee80211_debugfs_key_update_default(
+	struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_add_mgmt_default(
+	struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_remove_mgmt_default(
+	struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
+						 struct sta_info *sta)
+{}
+#endif
+
+#endif /* __MAC80211_DEBUGFS_KEY_H */
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
new file mode 100644
index 00000000..9ea7c0d0
--- /dev/null
+++ b/net/mac80211/debugfs_netdev.c
@@ -0,0 +1,564 @@
+/*
+ * Copyright (c) 2006	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/if.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <net/mac80211.h>
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+#include "debugfs.h"
+#include "debugfs_netdev.h"
+
+/* Format an attribute under dev_base_lock, then copy the text to user space. */
+static ssize_t ieee80211_if_read(
+	struct ieee80211_sub_if_data *sdata,
+	char __user *userbuf,
+	size_t count, loff_t *ppos,
+	ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
+{
+	ssize_t len = -EINVAL;	/* result if the netdev is not registered */
+	char text[70];
+
+	read_lock(&dev_base_lock);
+	if (sdata->dev->reg_state == NETREG_REGISTERED)
+		len = format(sdata, text, sizeof(text));
+	read_unlock(&dev_base_lock);
+
+	if (len < 0)
+		return len;
+	return simple_read_from_buffer(userbuf, count, ppos, text, len);
+}
+
+/*
+ * Hand a short user-space command to @write under the RTNL; the copy is
+ * bounded (-E2BIG if too long) and NUL-terminated so parsers cannot overrun.
+ */
+static ssize_t ieee80211_if_write(
+	struct ieee80211_sub_if_data *sdata,
+	const char __user *userbuf,
+	size_t count, loff_t *ppos,
+	ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int))
+{
+	char buf[64];
+	ssize_t ret = -ENODEV;	/* result if the netdev is not registered */
+
+	if (count >= sizeof(buf))
+		return -E2BIG;
+
+	if (copy_from_user(buf, userbuf, count))
+		return -EFAULT;
+	buf[count] = '\0';
+
+	rtnl_lock();
+	if (sdata->dev->reg_state == NETREG_REGISTERED)
+		ret = (*write)(sdata, buf, count);
+	rtnl_unlock();
+
+	return ret;
+}
+
+#define IEEE80211_IF_FMT(name, field, format_string)			\
+static ssize_t ieee80211_if_fmt_##name(					\
+	const struct ieee80211_sub_if_data *sdata, char *buf,		\
+	int buflen)							\
+{									\
+	return scnprintf(buf, buflen, format_string, sdata->field);	\
+}
+#define IEEE80211_IF_FMT_DEC(name, field)				\
+		IEEE80211_IF_FMT(name, field, "%d\n")
+#define IEEE80211_IF_FMT_HEX(name, field)				\
+		IEEE80211_IF_FMT(name, field, "%#x\n")
+#define IEEE80211_IF_FMT_LHEX(name, field)				\
+		IEEE80211_IF_FMT(name, field, "%#lx\n")
+#define IEEE80211_IF_FMT_SIZE(name, field)				\
+		IEEE80211_IF_FMT(name, field, "%zd\n")
+
+#define IEEE80211_IF_FMT_ATOMIC(name, field)				\
+static ssize_t ieee80211_if_fmt_##name(					\
+	const struct ieee80211_sub_if_data *sdata,			\
+	char *buf, int buflen)						\
+{									\
+	return scnprintf(buf, buflen, "%d\n", atomic_read(&sdata->field));\
+}
+
+#define IEEE80211_IF_FMT_MAC(name, field)				\
+static ssize_t ieee80211_if_fmt_##name(					\
+	const struct ieee80211_sub_if_data *sdata, char *buf,		\
+	int buflen)							\
+{									\
+	return scnprintf(buf, buflen, "%pM\n", sdata->field);		\
+}
+
+#define IEEE80211_IF_FMT_DEC_DIV_16(name, field)			\
+static ssize_t ieee80211_if_fmt_##name(					\
+	const struct ieee80211_sub_if_data *sdata,			\
+	char *buf, int buflen)						\
+{									\
+	return scnprintf(buf, buflen, "%d\n", sdata->field / 16);	\
+}
+
+#define __IEEE80211_IF_FILE(name, _write)				\
+static ssize_t ieee80211_if_read_##name(struct file *file,		\
+					char __user *userbuf,		\
+					size_t count, loff_t *ppos)	\
+{									\
+	return ieee80211_if_read(file->private_data,			\
+				 userbuf, count, ppos,			\
+				 ieee80211_if_fmt_##name);		\
+}									\
+static const struct file_operations name##_ops = {			\
+	.read = ieee80211_if_read_##name,				\
+	.write = (_write),						\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+}
+
+#define __IEEE80211_IF_FILE_W(name)					\
+static ssize_t ieee80211_if_write_##name(struct file *file,		\
+					 const char __user *userbuf,	\
+					 size_t count, loff_t *ppos)	\
+{									\
+	return ieee80211_if_write(file->private_data, userbuf, count,	\
+				  ppos, ieee80211_if_parse_##name);	\
+}									\
+__IEEE80211_IF_FILE(name, ieee80211_if_write_##name)
+
+
+#define IEEE80211_IF_FILE(name, field, format)				\
+		IEEE80211_IF_FMT_##format(name, field)			\
+		__IEEE80211_IF_FILE(name, NULL)
+
+/* common attributes */
+IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC);
+IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ],
+		  HEX);
+IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ],
+		  HEX);
+IEEE80211_IF_FILE(flags, flags, HEX);
+IEEE80211_IF_FILE(state, state, LHEX);
+IEEE80211_IF_FILE(channel_type, vif.bss_conf.channel_type, DEC);
+
+/* STA attributes */
+IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
+IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
+IEEE80211_IF_FILE(last_beacon, u.mgd.last_beacon_signal, DEC);
+IEEE80211_IF_FILE(ave_beacon, u.mgd.ave_beacon_signal, DEC_DIV_16);
+
+/* Check an SMPS request against hardware flags and vif type, then apply it. */
+static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
+			      enum ieee80211_smps_mode smps_mode)
+{
+	struct ieee80211_hw *hw = &sdata->local->hw;
+	int ret;
+
+	if (smps_mode == IEEE80211_SMPS_STATIC &&
+	    !(hw->flags & IEEE80211_HW_SUPPORTS_STATIC_SMPS))
+		return -EINVAL;
+
+	/* auto should be dynamic if in PS mode */
+	if ((smps_mode == IEEE80211_SMPS_DYNAMIC ||
+	     smps_mode == IEEE80211_SMPS_AUTOMATIC) &&
+	    !(hw->flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS))
+		return -EINVAL;
+
+	/* supported only on managed interfaces for now */
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&sdata->u.mgd.mtx);
+	ret = __ieee80211_request_smps(sdata, smps_mode);
+	mutex_unlock(&sdata->u.mgd.mtx);
+	return ret;
+}
+
+static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
+	[IEEE80211_SMPS_AUTOMATIC] = "auto",
+	[IEEE80211_SMPS_OFF] = "off",
+	[IEEE80211_SMPS_STATIC] = "static",
+	[IEEE80211_SMPS_DYNAMIC] = "dynamic",
+};
+
+/* Format requested and used SMPS mode; scnprintf keeps the length in bounds. */
+static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata,
+				     char *buf, int buflen)
+{
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;
+	return scnprintf(buf, buflen, "request: %s\nused: %s\n",
+			 smps_modes[sdata->u.mgd.req_smps],
+			 smps_modes[sdata->u.mgd.ap_smps]);
+}
+
+/* Apply the SMPS mode whose name matches the first buflen bytes of buf. */
+static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
+				       const char *buf, int buflen)
+{
+	enum ieee80211_smps_mode mode;
+	int err;
+
+	for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) {
+		if (strncmp(buf, smps_modes[mode], buflen) != 0)
+			continue;
+		err = ieee80211_set_smps(sdata, mode);
+		return err ? err : buflen;
+	}
+
+	return -EINVAL;
+}
+
+__IEEE80211_IF_FILE_W(smps);
+
+static ssize_t ieee80211_if_fmt_tkip_mic_test(
+	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+	return -EOPNOTSUPP;	/* nothing to read; the file is created 0200 */
+}
+
+/* Parse "xx:xx:xx:xx:xx:xx" into addr; 0 on success, -1 on malformed input. */
+static int hwaddr_aton(const char *txt, u8 *addr)
+{
+	int idx, hi, lo;
+
+	for (idx = 0; idx < ETH_ALEN; idx++) {
+		hi = hex_to_bin(*txt++);
+		if (hi < 0)
+			return -1;
+		lo = hex_to_bin(*txt++);
+		if (lo < 0)
+			return -1;
+		addr[idx] = (hi << 4) | lo;
+		/* a colon must follow every octet except the last one */
+		if (idx < ETH_ALEN - 1 && *txt++ != ':')
+			return -1;
+	}
+
+	return 0;
+}
+
+static ssize_t ieee80211_if_parse_tkip_mic_test(
+	struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
+{
+	struct ieee80211_local *local = sdata->local;
+	u8 addr[ETH_ALEN];
+	struct sk_buff *skb;
+	struct ieee80211_hdr *hdr;
+	__le16 fc;
+
+	/*
+	 * Send a test frame flagged for a TKIP MIC failure to the written
+	 * colon-delimited MAC address (white space may follow the address).
+	 */
+	if (buflen < 3 * ETH_ALEN - 1)	/* shortest complete address text */
+		return -EINVAL;
+	if (hwaddr_aton(buf, addr) < 0)
+		return -EINVAL;
+
+	if (!ieee80211_sdata_running(sdata))
+		return -ENOTCONN;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24 + 100);
+	if (!skb)
+		return -ENOMEM;
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	hdr = (struct ieee80211_hdr *) skb_put(skb, 24);	/* header bytes */
+	memset(hdr, 0, 24);
+	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+	switch (sdata->vif.type) {	/* address layout depends on vif type */
+	case NL80211_IFTYPE_AP:
+		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+		/* DA BSSID SA */
+		memcpy(hdr->addr1, addr, ETH_ALEN);
+		memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+		memcpy(hdr->addr3, sdata->vif.addr, ETH_ALEN);
+		break;
+	case NL80211_IFTYPE_STATION:
+		fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+		/* BSSID SA DA */
+		if (sdata->vif.bss_conf.bssid == NULL) {
+			dev_kfree_skb(skb);	/* no BSSID: give up */
+			return -ENOTCONN;
+		}
+		memcpy(hdr->addr1, sdata->vif.bss_conf.bssid, ETH_ALEN);
+		memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
+		memcpy(hdr->addr3, addr, ETH_ALEN);
+		break;
+	default:
+		dev_kfree_skb(skb);	/* other interface types unsupported */
+		return -EOPNOTSUPP;
+	}
+	hdr->frame_control = fc;
+
+	/*
+	 * Add some length to the test frame to make it look a bit more valid.
+	 * The exact contents do not matter since the recipient is required
+	 * to drop this because of the Michael MIC failure.
+	 */
+	memset(skb_put(skb, 50), 0, 50);
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_TKIP_MIC_FAILURE;
+
+	ieee80211_tx_skb(sdata, skb);
+
+	return buflen;	/* report the whole write as consumed */
+}
+
+__IEEE80211_IF_FILE_W(tkip_mic_test);
+
+/* AP attributes */
+IEEE80211_IF_FILE(num_sta_ps, u.ap.num_sta_ps, ATOMIC);
+IEEE80211_IF_FILE(dtim_count, u.ap.dtim_count, DEC);
+
+static ssize_t ieee80211_if_fmt_num_buffered_multicast(
+	const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
+{
+	return scnprintf(buf, buflen, "%u\n",
+			 skb_queue_len(&sdata->u.ap.ps_bc_buf));
+}
+__IEEE80211_IF_FILE(num_buffered_multicast, NULL);
+
+/* WDS attributes */
+IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC);
+
+#ifdef CONFIG_MAC80211_MESH
+/* Mesh stats attributes */
+IEEE80211_IF_FILE(fwded_mcast, u.mesh.mshstats.fwded_mcast, DEC);
+IEEE80211_IF_FILE(fwded_unicast, u.mesh.mshstats.fwded_unicast, DEC);
+IEEE80211_IF_FILE(fwded_frames, u.mesh.mshstats.fwded_frames, DEC);
+IEEE80211_IF_FILE(dropped_frames_ttl, u.mesh.mshstats.dropped_frames_ttl, DEC);
+IEEE80211_IF_FILE(dropped_frames_no_route,
+		u.mesh.mshstats.dropped_frames_no_route, DEC);
+IEEE80211_IF_FILE(estab_plinks, u.mesh.mshstats.estab_plinks, ATOMIC);
+
+/* Mesh parameters */
+IEEE80211_IF_FILE(dot11MeshMaxRetries,
+		u.mesh.mshcfg.dot11MeshMaxRetries, DEC);
+IEEE80211_IF_FILE(dot11MeshRetryTimeout,
+		u.mesh.mshcfg.dot11MeshRetryTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshConfirmTimeout,
+		u.mesh.mshcfg.dot11MeshConfirmTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHoldingTimeout,
+		u.mesh.mshcfg.dot11MeshHoldingTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshTTL, u.mesh.mshcfg.dot11MeshTTL, DEC);
+IEEE80211_IF_FILE(element_ttl, u.mesh.mshcfg.element_ttl, DEC);
+IEEE80211_IF_FILE(auto_open_plinks, u.mesh.mshcfg.auto_open_plinks, DEC);
+IEEE80211_IF_FILE(dot11MeshMaxPeerLinks,
+		u.mesh.mshcfg.dot11MeshMaxPeerLinks, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPactivePathTimeout,
+		u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPpreqMinInterval,
+		u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPnetDiameterTraversalTime,
+		u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPmaxPREQretries,
+		u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries, DEC);
+IEEE80211_IF_FILE(path_refresh_time,
+		u.mesh.mshcfg.path_refresh_time, DEC);
+IEEE80211_IF_FILE(min_discovery_timeout,
+		u.mesh.mshcfg.min_discovery_timeout, DEC);
+IEEE80211_IF_FILE(dot11MeshHWMPRootMode,
+		u.mesh.mshcfg.dot11MeshHWMPRootMode, DEC);
+#endif
+
+
+#define DEBUGFS_ADD(name) \
+	debugfs_create_file(#name, 0400, sdata->debugfs.dir, \
+			    sdata, &name##_ops);
+
+#define DEBUGFS_ADD_MODE(name, mode) \
+	debugfs_create_file(#name, mode, sdata->debugfs.dir, \
+			    sdata, &name##_ops);
+
+static void add_sta_files(struct ieee80211_sub_if_data *sdata)
+{
+	DEBUGFS_ADD(drop_unencrypted);
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(state);
+	DEBUGFS_ADD(channel_type);
+	DEBUGFS_ADD(rc_rateidx_mask_2ghz);
+	DEBUGFS_ADD(rc_rateidx_mask_5ghz);
+
+	DEBUGFS_ADD(bssid);
+	DEBUGFS_ADD(aid);
+	DEBUGFS_ADD(last_beacon);
+	DEBUGFS_ADD(ave_beacon);
+	DEBUGFS_ADD_MODE(smps, 0600);
+	DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+}
+
+static void add_ap_files(struct ieee80211_sub_if_data *sdata)
+{
+	DEBUGFS_ADD(drop_unencrypted);
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(state);
+	DEBUGFS_ADD(channel_type);
+	DEBUGFS_ADD(rc_rateidx_mask_2ghz);
+	DEBUGFS_ADD(rc_rateidx_mask_5ghz);
+
+	DEBUGFS_ADD(num_sta_ps);
+	DEBUGFS_ADD(dtim_count);
+	DEBUGFS_ADD(num_buffered_multicast);
+	DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
+}
+
+static void add_wds_files(struct ieee80211_sub_if_data *sdata)
+{
+	DEBUGFS_ADD(drop_unencrypted);
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(state);
+	DEBUGFS_ADD(channel_type);
+	DEBUGFS_ADD(rc_rateidx_mask_2ghz);
+	DEBUGFS_ADD(rc_rateidx_mask_5ghz);
+
+	DEBUGFS_ADD(peer);
+}
+
+static void add_vlan_files(struct ieee80211_sub_if_data *sdata)
+{
+	DEBUGFS_ADD(drop_unencrypted);
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(state);
+	DEBUGFS_ADD(channel_type);
+	DEBUGFS_ADD(rc_rateidx_mask_2ghz);
+	DEBUGFS_ADD(rc_rateidx_mask_5ghz);
+}
+
+static void add_monitor_files(struct ieee80211_sub_if_data *sdata)
+{
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(state);
+	DEBUGFS_ADD(channel_type);
+}
+
+#ifdef CONFIG_MAC80211_MESH
+
+static void add_mesh_stats(struct ieee80211_sub_if_data *sdata)
+{
+	struct dentry *dir = debugfs_create_dir("mesh_stats",
+						sdata->debugfs.dir);
+
+#define MESHSTATS_ADD(name)\
+	debugfs_create_file(#name, 0400, dir, sdata, &name##_ops);
+
+	MESHSTATS_ADD(fwded_mcast);
+	MESHSTATS_ADD(fwded_unicast);
+	MESHSTATS_ADD(fwded_frames);
+	MESHSTATS_ADD(dropped_frames_ttl);
+	MESHSTATS_ADD(dropped_frames_no_route);
+	MESHSTATS_ADD(estab_plinks);
+#undef MESHSTATS_ADD
+}
+
+static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
+{
+	struct dentry *dir = debugfs_create_dir("mesh_config",
+						sdata->debugfs.dir);
+	/* one file per mesh configuration parameter, all backed by sdata */
+#define MESHPARAMS_ADD(name) \
+	debugfs_create_file(#name, 0600, dir, sdata, &name##_ops);
+
+	MESHPARAMS_ADD(dot11MeshMaxRetries);
+	MESHPARAMS_ADD(dot11MeshRetryTimeout);
+	MESHPARAMS_ADD(dot11MeshConfirmTimeout);
+	MESHPARAMS_ADD(dot11MeshHoldingTimeout);
+	MESHPARAMS_ADD(dot11MeshTTL);
+	MESHPARAMS_ADD(element_ttl);
+	MESHPARAMS_ADD(auto_open_plinks);
+	MESHPARAMS_ADD(dot11MeshMaxPeerLinks);
+	MESHPARAMS_ADD(dot11MeshHWMPactivePathTimeout);
+	MESHPARAMS_ADD(dot11MeshHWMPpreqMinInterval);
+	MESHPARAMS_ADD(dot11MeshHWMPnetDiameterTraversalTime);
+	MESHPARAMS_ADD(dot11MeshHWMPmaxPREQretries);
+	MESHPARAMS_ADD(path_refresh_time);
+	MESHPARAMS_ADD(min_discovery_timeout);
+	MESHPARAMS_ADD(dot11MeshHWMPRootMode);
+#undef MESHPARAMS_ADD
+}
+#endif
+
+static void add_files(struct ieee80211_sub_if_data *sdata)
+{
+	if (!sdata->debugfs.dir)
+		return;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_MESH_POINT:
+#ifdef CONFIG_MAC80211_MESH
+		add_mesh_stats(sdata);
+		add_mesh_config(sdata);
+#endif
+		break;
+	case NL80211_IFTYPE_STATION:
+		add_sta_files(sdata);
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		/* XXX */
+		break;
+	case NL80211_IFTYPE_AP:
+		add_ap_files(sdata);
+		break;
+	case NL80211_IFTYPE_WDS:
+		add_wds_files(sdata);
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		add_monitor_files(sdata);
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+		add_vlan_files(sdata);
+		break;
+	default:
+		break;
+	}
+}
+
+void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
+{
+	struct dentry *phydir = sdata->local->hw.wiphy->debugfsdir;
+	char dirname[10+IFNAMSIZ];	/* holds "netdev:" plus sdata->name */
+
+	sprintf(dirname, "netdev:%s", sdata->name);
+	sdata->debugfs.dir = debugfs_create_dir(dirname, phydir);
+	if (sdata->debugfs.dir)
+		sdata->debugfs.subdir_stations =
+			debugfs_create_dir("stations", sdata->debugfs.dir);
+	add_files(sdata);	/* returns early when the directory is missing */
+}
+
+/* Remove the interface's debugfs directory and everything beneath it. */
+void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
+{
+	if (sdata->debugfs.dir) {
+		debugfs_remove_recursive(sdata->debugfs.dir);
+		sdata->debugfs.dir = NULL;
+	}
+}
+
+/* Rename the interface's debugfs directory to match the current sdata->name. */
+void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
+{
+	struct dentry *dir = sdata->debugfs.dir;
+	char buf[10 + IFNAMSIZ];
+
+	if (!dir)
+		return;
+
+	sprintf(buf, "netdev:%s", sdata->name);
+	if (debugfs_rename(dir->d_parent, dir, dir->d_parent, buf))
+		return;
+	printk(KERN_ERR "mac80211: debugfs: failed to rename debugfs "
+	       "dir to %s\n", buf);
+}
diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h
new file mode 100644
index 00000000..79025e79
--- /dev/null
+++ b/net/mac80211/debugfs_netdev.h
@@ -0,0 +1,22 @@
+/* routines exported for debugfs handling */
+
+#ifndef __IEEE80211_DEBUGFS_NETDEV_H
+#define __IEEE80211_DEBUGFS_NETDEV_H
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata);
+#else
+static inline void ieee80211_debugfs_add_netdev(
+	struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_remove_netdev(
+	struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_rename_netdev(
+	struct ieee80211_sub_if_data *sdata)
+{}
+#endif
+
+#endif /* __IEEE80211_DEBUGFS_NETDEV_H */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
new file mode 100644
index 00000000..a01d2137
--- /dev/null
+++ b/net/mac80211/debugfs_sta.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright 2003-2005	Devicescape Software, Inc.
+ * Copyright (c) 2006	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include "ieee80211_i.h"
+#include "debugfs.h"
+#include "debugfs_sta.h"
+#include "sta_info.h"
+
+/* sta attributes */
+
+#define STA_READ(name, field, format_string)				\
+static ssize_t sta_ ##name## _read(struct file *file,			\
+				   char __user *userbuf,		\
+				   size_t count, loff_t *ppos)		\
+{									\
+	struct sta_info *sta = file->private_data;			\
+	return mac80211_format_buffer(userbuf, count, ppos, 		\
+				      format_string, sta->field);	\
+}
+#define STA_READ_D(name, field) STA_READ(name, field, "%d\n")
+#define STA_READ_U(name, field) STA_READ(name, field, "%u\n")
+#define STA_READ_S(name, field) STA_READ(name, field, "%s\n")
+
+#define STA_OPS(name)							\
+static const struct file_operations sta_ ##name## _ops = {		\
+	.read = sta_##name##_read,					\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+}
+
+#define STA_OPS_RW(name)						\
+static const struct file_operations sta_ ##name## _ops = {		\
+	.read = sta_##name##_read,					\
+	.write = sta_##name##_write,					\
+	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+}
+
+#define STA_FILE(name, field, format)					\
+		STA_READ_##format(name, field)				\
+		STA_OPS(name)
+
+STA_FILE(aid, sta.aid, D);
+STA_FILE(dev, sdata->name, S);
+STA_FILE(last_signal, last_signal, D);
+
+static ssize_t sta_flags_read(struct file *file, char __user *userbuf,
+			      size_t count, loff_t *ppos)
+{
+	char buf[100];
+	struct sta_info *sta = file->private_data;
+	u32 staflags = get_sta_flags(sta);
+	int res = scnprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s",
+		staflags & WLAN_STA_AUTH ? "AUTH\n" : "",
+		staflags & WLAN_STA_ASSOC ? "ASSOC\n" : "",
+		staflags & WLAN_STA_PS_STA ? "PS (sta)\n" : "",
+		staflags & WLAN_STA_PS_DRIVER ? "PS (driver)\n" : "",
+		staflags & WLAN_STA_AUTHORIZED ? "AUTHORIZED\n" : "",
+		staflags & WLAN_STA_SHORT_PREAMBLE ? "SHORT PREAMBLE\n" : "",
+		staflags & WLAN_STA_WME ? "WME\n" : "",
+		staflags & WLAN_STA_WDS ? "WDS\n" : "",
+		staflags & WLAN_STA_MFP ? "MFP\n" : "");
+	return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+STA_OPS(flags);
+
+static ssize_t sta_num_ps_buf_frames_read(struct file *file,
+					  char __user *userbuf,
+					  size_t count, loff_t *ppos)
+{
+	struct sta_info *sta = file->private_data;
+	return mac80211_format_buffer(userbuf, count, ppos, "%u\n",
+				      skb_queue_len(&sta->ps_tx_buf));
+}
+STA_OPS(num_ps_buf_frames);
+
+static ssize_t sta_inactive_ms_read(struct file *file, char __user *userbuf,
+				    size_t count, loff_t *ppos)
+{	/* reports ms since sta->last_rx; jiffies_to_msecs() is unsigned */
+	struct sta_info *sta = file->private_data;
+	return mac80211_format_buffer(userbuf, count, ppos, "%u\n",
+				      jiffies_to_msecs(jiffies - sta->last_rx));
+}
+STA_OPS(inactive_ms);
+
+
+static ssize_t sta_connected_time_read(struct file *file, char __user *userbuf,
+					size_t count, loff_t *ppos)
+{
+	struct sta_info *sta = file->private_data;
+	struct timespec uptime;
+	struct tm result;
+	long connected_time_secs;
+	char buf[100];
+	int res;
+	do_posix_clock_monotonic_gettime(&uptime);
+	connected_time_secs = uptime.tv_sec - sta->last_connected;
+	time_to_tm(connected_time_secs, 0, &result);
+	result.tm_year -= 70;
+	result.tm_mday -= 1;
+	res = scnprintf(buf, sizeof(buf),
+		"years  - %ld\nmonths - %d\ndays   - %d\nclock  - %d:%d:%d\n\n",
+			result.tm_year, result.tm_mon, result.tm_mday,
+			result.tm_hour, result.tm_min, result.tm_sec);
+	return simple_read_from_buffer(userbuf, count, ppos, buf, res);
+}
+STA_OPS(connected_time);
+
+
+
+static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf,
+				      size_t count, loff_t *ppos)
+{
+	char buf[15*NUM_RX_DATA_QUEUES], *p = buf;
+	int i;
+	struct sta_info *sta = file->private_data;
+	for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
+		p += scnprintf(p, sizeof(buf)+buf-p, "%x ",
+			       le16_to_cpu(sta->last_seq_ctrl[i]));
+	p += scnprintf(p, sizeof(buf)+buf-p, "\n");
+	return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+STA_OPS(last_seq_ctrl);
+
+static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
+					size_t count, loff_t *ppos)
+{
+	char buf[71 + STA_TID_NUM * 40], *p = buf;
+	int i;
+	struct sta_info *sta = file->private_data;
+	struct tid_ampdu_rx *tid_rx;
+	struct tid_ampdu_tx *tid_tx;
+
+	rcu_read_lock();
+
+	p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n",
+			sta->ampdu_mlme.dialog_token_allocator + 1);
+	p += scnprintf(p, sizeof(buf) + buf - p,
+		       "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
+
+	for (i = 0; i < STA_TID_NUM; i++) {
+		tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
+		tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]);
+
+		p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
+		p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_rx);
+		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
+				tid_rx ? tid_rx->dialog_token : 0);
+		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
+				tid_rx ? tid_rx->ssn : 0);
+
+		p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_tx);
+		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
+				tid_tx ? tid_tx->dialog_token : 0);
+		p += scnprintf(p, sizeof(buf) + buf - p, "\t%03d",
+				tid_tx ? skb_queue_len(&tid_tx->pending) : 0);
+		p += scnprintf(p, sizeof(buf) + buf - p, "\n");
+	}
+	rcu_read_unlock();
+
+	return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+
+static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf,
+				    size_t count, loff_t *ppos)
+{
+	char _buf[12] = {}, *buf = _buf;	/* zeroed: short writes stay NUL-terminated */
+	struct sta_info *sta = file->private_data;
+	bool start, tx;
+	unsigned long tid;
+	int ret;
+
+	if (count > sizeof(_buf))
+		return -EINVAL;
+
+	if (copy_from_user(buf, userbuf, count))
+		return -EFAULT;
+
+	buf[sizeof(_buf) - 1] = '\0';
+
+	if (strncmp(buf, "tx ", 3) == 0) {
+		buf += 3;
+		tx = true;
+	} else if (strncmp(buf, "rx ", 3) == 0) {
+		buf += 3;
+		tx = false;
+	} else
+		return -EINVAL;
+
+	if (strncmp(buf, "start ", 6) == 0) {
+		buf += 6;
+		start = true;
+		if (!tx)	/* "rx start" is rejected; only "rx stop" is handled */
+			return -EINVAL;
+	} else if (strncmp(buf, "stop ", 5) == 0) {
+		buf += 5;
+		start = false;
+	} else
+		return -EINVAL;
+
+	tid = simple_strtoul(buf, NULL, 0);
+
+	if (tid >= STA_TID_NUM)
+		return -EINVAL;
+
+	if (tx) {
+		if (start)
+			ret = ieee80211_start_tx_ba_session(&sta->sta, tid, 5000);
+		else
+			ret = ieee80211_stop_tx_ba_session(&sta->sta, tid);
+	} else {
+		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+					       3, true);
+		ret = 0;
+	}
+
+	return ret ?: count;	/* error code if non-zero, else bytes consumed */
+}
+STA_OPS_RW(agg_status);
+
+static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
+				size_t count, loff_t *ppos)
+{
+#define PRINT_HT_CAP(_cond, _str) \
+	do { \
+		if (_cond) \
+			p += scnprintf(p, sizeof(buf)+buf-p, "\t" _str "\n"); \
+	} while (0)
+	char buf[512], *p = buf;
+	int i;
+	struct sta_info *sta = file->private_data;
+	struct ieee80211_sta_ht_cap *htc = &sta->sta.ht_cap;
+
+	p += scnprintf(p, sizeof(buf) + buf - p, "ht %ssupported\n",
+			htc->ht_supported ? "" : "not ");
+	if (htc->ht_supported) {
+		p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.4x\n", htc->cap);
+
+		PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC");
+		PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40");
+		PRINT_HT_CAP(!(htc->cap & BIT(1)), "HT20");
+
+		PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 0, "Static SM Power Save");
+		PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 1, "Dynamic SM Power Save");
+		PRINT_HT_CAP(((htc->cap >> 2) & 0x3) == 3, "SM Power Save disabled");
+
+		PRINT_HT_CAP((htc->cap & BIT(4)), "RX Greenfield");
+		PRINT_HT_CAP((htc->cap & BIT(5)), "RX HT20 SGI");
+		PRINT_HT_CAP((htc->cap & BIT(6)), "RX HT40 SGI");
+		PRINT_HT_CAP((htc->cap & BIT(7)), "TX STBC");
+
+		PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 0, "No RX STBC");
+		PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 1, "RX STBC 1-stream");
+		PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 2, "RX STBC 2-streams");
+		PRINT_HT_CAP(((htc->cap >> 8) & 0x3) == 3, "RX STBC 3-streams");
+
+		PRINT_HT_CAP((htc->cap & BIT(10)), "HT Delayed Block Ack");
+		/* BIT(11) clear: 3839-byte A-MSDUs; BIT(11) set: 7935 bytes */
+		PRINT_HT_CAP(!(htc->cap & BIT(11)), "Max AMSDU length: "
+			     "3839 bytes");
+		PRINT_HT_CAP((htc->cap & BIT(11)), "Max AMSDU length: "
+			     "7935 bytes");
+
+		/*
+		 * For beacons and probe response this would mean the BSS
+		 * does or does not allow the usage of DSSS/CCK HT40.
+		 * Otherwise it means the STA does or does not use
+		 * DSSS/CCK HT40.
+		 */
+		PRINT_HT_CAP((htc->cap & BIT(12)), "DSSS/CCK HT40");
+		PRINT_HT_CAP(!(htc->cap & BIT(12)), "No DSSS/CCK HT40");
+
+		/* BIT(13) is reserved */
+
+		PRINT_HT_CAP((htc->cap & BIT(14)), "40 MHz Intolerant");
+
+		PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection");
+
+		p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n",
+				htc->ampdu_factor, htc->ampdu_density);
+		p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:");
+
+		for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
+			p += scnprintf(p, sizeof(buf)+buf-p, " %.2x",
+					htc->mcs.rx_mask[i]);
+		p += scnprintf(p, sizeof(buf)+buf-p, "\n");
+
+		/* If not set this is meaningless */
+		if (le16_to_cpu(htc->mcs.rx_highest)) {
+			p += scnprintf(p, sizeof(buf)+buf-p,
+				       "MCS rx highest: %d Mbps\n",
+				       le16_to_cpu(htc->mcs.rx_highest));
+		}
+
+		p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n",
+				htc->mcs.tx_params);
+	}
+
+	return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+}
+STA_OPS(ht_capa);
+
+#define DEBUGFS_ADD(name) \
+	debugfs_create_file(#name, 0400, \
+		sta->debugfs.dir, sta, &sta_ ##name## _ops);
+
+#define DEBUGFS_ADD_COUNTER(name, field)				\
+	if (sizeof(sta->field) == sizeof(u32))				\
+		debugfs_create_u32(#name, 0400, sta->debugfs.dir,	\
+			(u32 *) &sta->field);				\
+	else								\
+		debugfs_create_u64(#name, 0400, sta->debugfs.dir,	\
+			(u64 *) &sta->field);
+
+void ieee80211_sta_debugfs_add(struct sta_info *sta)
+{
+	struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations;
+	char mac[3*ETH_ALEN];	/* holds the "%pM"-formatted address string */
+
+	sta->debugfs.add_has_run = true;
+
+	if (!stations_dir)
+		return;
+
+	snprintf(mac, sizeof(mac), "%pM", sta->sta.addr);
+
+	/*
+	 * This might fail due to a race condition:
+	 * When mac80211 unlinks a station, the debugfs entries
+	 * remain, but it is already possible to link a new
+	 * station with the same address which triggers adding
+	 * it to debugfs; therefore, if the old station isn't
+	 * destroyed quickly enough the old station's debugfs
+	 * dir might still be around.
+	 */
+	sta->debugfs.dir = debugfs_create_dir(mac, stations_dir);
+	if (!sta->debugfs.dir)
+		return;
+
+	DEBUGFS_ADD(flags);
+	DEBUGFS_ADD(num_ps_buf_frames);
+	DEBUGFS_ADD(inactive_ms);
+	DEBUGFS_ADD(connected_time);
+	DEBUGFS_ADD(last_seq_ctrl);
+	DEBUGFS_ADD(agg_status);
+	DEBUGFS_ADD(dev);
+	DEBUGFS_ADD(last_signal);
+	DEBUGFS_ADD(ht_capa);
+
+	DEBUGFS_ADD_COUNTER(rx_packets, rx_packets);
+	DEBUGFS_ADD_COUNTER(tx_packets, tx_packets);
+	DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes);
+	DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes);
+	DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates);
+	DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments);
+	DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped);
+	DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments);
+	DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count);
+	DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed);
+	DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count);
+	DEBUGFS_ADD_COUNTER(wep_weak_iv_count, wep_weak_iv_count);
+}
+
+void ieee80211_sta_debugfs_remove(struct sta_info *sta)
+{
+	debugfs_remove_recursive(sta->debugfs.dir);
+	sta->debugfs.dir = NULL;
+}
diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h
new file mode 100644
index 00000000..8b608903
--- /dev/null
+++ b/net/mac80211/debugfs_sta.h
@@ -0,0 +1,14 @@
+#ifndef __MAC80211_DEBUGFS_STA_H
+#define __MAC80211_DEBUGFS_STA_H
+
+#include "sta_info.h"
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+void ieee80211_sta_debugfs_add(struct sta_info *sta);
+void ieee80211_sta_debugfs_remove(struct sta_info *sta);
+#else
+static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {}
+static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {}
+#endif
+
+#endif /* __MAC80211_DEBUGFS_STA_H */
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
new file mode 100644
index 00000000..eebf7a67
--- /dev/null
+++ b/net/mac80211/driver-ops.h
@@ -0,0 +1,640 @@
+#ifndef __MAC80211_DRIVER_OPS
+#define __MAC80211_DRIVER_OPS
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-trace.h"
+
+static inline void drv_tx(struct ieee80211_local *local, struct sk_buff *skb)
+{
+	local->ops->tx(&local->hw, skb);
+}
+
+static inline int drv_start(struct ieee80211_local *local)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_start(local);
+	local->started = true;
+	smp_mb();
+	ret = local->ops->start(&local->hw);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline void drv_stop(struct ieee80211_local *local)
+{
+	might_sleep();
+
+	trace_drv_stop(local);
+	local->ops->stop(&local->hw);
+	trace_drv_return_void(local);
+
+	/* sync away all work on the tasklet before clearing started */
+	tasklet_disable(&local->tasklet);
+	tasklet_enable(&local->tasklet);
+
+	barrier();
+
+	local->started = false;
+}
+
+#ifdef CONFIG_PM
+static inline int drv_suspend(struct ieee80211_local *local,
+			      struct cfg80211_wowlan *wowlan)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_suspend(local);
+	ret = local->ops->suspend(&local->hw, wowlan);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_resume(struct ieee80211_local *local)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_resume(local);
+	ret = local->ops->resume(&local->hw);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+#endif
+
+static inline int drv_add_interface(struct ieee80211_local *local,
+				    struct ieee80211_vif *vif)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_add_interface(local, vif_to_sdata(vif));
+	ret = local->ops->add_interface(&local->hw, vif);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_change_interface(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       enum nl80211_iftype type, bool p2p)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_change_interface(local, sdata, type, p2p);
+	ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline void drv_remove_interface(struct ieee80211_local *local,
+					struct ieee80211_vif *vif)
+{
+	might_sleep();
+
+	trace_drv_remove_interface(local, vif_to_sdata(vif));
+	local->ops->remove_interface(&local->hw, vif);
+	trace_drv_return_void(local);
+}
+
+static inline int drv_config(struct ieee80211_local *local, u32 changed)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_config(local, changed);
+	ret = local->ops->config(&local->hw, changed);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline void drv_bss_info_changed(struct ieee80211_local *local,
+					struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_bss_conf *info,
+					u32 changed)
+{
+	might_sleep();
+
+	trace_drv_bss_info_changed(local, sdata, info, changed);
+	if (local->ops->bss_info_changed)
+		local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed);
+	trace_drv_return_void(local);
+}
+
+static inline u64 drv_prepare_multicast(struct ieee80211_local *local,
+					struct netdev_hw_addr_list *mc_list)
+{
+	u64 ret = 0;
+
+	trace_drv_prepare_multicast(local, mc_list->count);
+
+	if (local->ops->prepare_multicast)
+		ret = local->ops->prepare_multicast(&local->hw, mc_list);
+
+	trace_drv_return_u64(local, ret);
+
+	return ret;
+}
+
+static inline void drv_configure_filter(struct ieee80211_local *local,
+					unsigned int changed_flags,
+					unsigned int *total_flags,
+					u64 multicast)
+{
+	might_sleep();
+
+	trace_drv_configure_filter(local, changed_flags, total_flags,
+				   multicast);
+	local->ops->configure_filter(&local->hw, changed_flags, total_flags,
+				     multicast);
+	trace_drv_return_void(local);
+}
+
+static inline int drv_set_tim(struct ieee80211_local *local,
+			      struct ieee80211_sta *sta, bool set)
+{
+	int ret = 0;
+	trace_drv_set_tim(local, sta, set);
+	if (local->ops->set_tim)
+		ret = local->ops->set_tim(&local->hw, sta, set);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_set_key(struct ieee80211_local *local,
+			      enum set_key_cmd cmd,
+			      struct ieee80211_sub_if_data *sdata,
+			      struct ieee80211_sta *sta,
+			      struct ieee80211_key_conf *key)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_set_key(local, cmd, sdata, sta, key);
+	ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline void drv_update_tkip_key(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_key_conf *conf,
+				       struct sta_info *sta, u32 iv32,
+				       u16 *phase1key)
+{
+	struct ieee80211_sta *ista = NULL;
+
+	if (sta)
+		ista = &sta->sta;
+
+	trace_drv_update_tkip_key(local, sdata, conf, ista, iv32);
+	if (local->ops->update_tkip_key)
+		local->ops->update_tkip_key(&local->hw, &sdata->vif, conf,
+					    ista, iv32, phase1key);
+	trace_drv_return_void(local);
+}
+
+static inline int drv_hw_scan(struct ieee80211_local *local,
+			      struct ieee80211_sub_if_data *sdata,
+			      struct cfg80211_scan_request *req)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_hw_scan(local, sdata);
+	ret = local->ops->hw_scan(&local->hw, &sdata->vif, req);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int
+drv_sched_scan_start(struct ieee80211_local *local,
+		     struct ieee80211_sub_if_data *sdata,
+		     struct cfg80211_sched_scan_request *req,
+		     struct ieee80211_sched_scan_ies *ies)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_sched_scan_start(local, sdata);
+	ret = local->ops->sched_scan_start(&local->hw, &sdata->vif,
+					      req, ies);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline void drv_sched_scan_stop(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata)
+{
+	might_sleep();
+
+	trace_drv_sched_scan_stop(local, sdata);
+	local->ops->sched_scan_stop(&local->hw, &sdata->vif);
+	trace_drv_return_void(local);
+}
+
+static inline void drv_sw_scan_start(struct ieee80211_local *local)
+{
+	might_sleep();
+
+	trace_drv_sw_scan_start(local);
+	if (local->ops->sw_scan_start)
+		local->ops->sw_scan_start(&local->hw);
+	trace_drv_return_void(local);
+}
+
+static inline void drv_sw_scan_complete(struct ieee80211_local *local)
+{
+	might_sleep();
+
+	trace_drv_sw_scan_complete(local);
+	if (local->ops->sw_scan_complete)
+		local->ops->sw_scan_complete(&local->hw);
+	trace_drv_return_void(local);
+}
+
+static inline int drv_get_stats(struct ieee80211_local *local,
+				struct ieee80211_low_level_stats *stats)
+{
+	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	if (local->ops->get_stats)
+		ret = local->ops->get_stats(&local->hw, stats);
+	trace_drv_get_stats(local, stats, ret);
+
+	return ret;
+}
+
+static inline void drv_get_tkip_seq(struct ieee80211_local *local,
+				    u8 hw_key_idx, u32 *iv32, u16 *iv16)
+{
+	if (local->ops->get_tkip_seq)
+		local->ops->get_tkip_seq(&local->hw, hw_key_idx, iv32, iv16);
+	trace_drv_get_tkip_seq(local, hw_key_idx, iv32, iv16);
+}
+
+static inline int drv_set_frag_threshold(struct ieee80211_local *local,
+					u32 value)
+{
+	int ret = 0;
+
+	might_sleep();
+
+	trace_drv_set_frag_threshold(local, value);
+	if (local->ops->set_frag_threshold)
+		ret = local->ops->set_frag_threshold(&local->hw, value);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_set_rts_threshold(struct ieee80211_local *local,
+					u32 value)
+{
+	int ret = 0;
+
+	might_sleep();
+
+	trace_drv_set_rts_threshold(local, value);
+	if (local->ops->set_rts_threshold)
+		ret = local->ops->set_rts_threshold(&local->hw, value);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_set_coverage_class(struct ieee80211_local *local,
+					 u8 value)
+{
+	int ret = 0;
+	might_sleep();
+
+	trace_drv_set_coverage_class(local, value);
+	if (local->ops->set_coverage_class)
+		local->ops->set_coverage_class(&local->hw, value);
+	else
+		ret = -EOPNOTSUPP;
+
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline void drv_sta_notify(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  enum sta_notify_cmd cmd,
+				  struct ieee80211_sta *sta)
+{
+	trace_drv_sta_notify(local, sdata, cmd, sta);
+	if (local->ops->sta_notify)
+		local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta);
+	trace_drv_return_void(local);
+}
+
+static inline int drv_sta_add(struct ieee80211_local *local,
+			      struct ieee80211_sub_if_data *sdata,
+			      struct ieee80211_sta *sta)
+{
+	int ret = 0;
+
+	might_sleep();
+
+	trace_drv_sta_add(local, sdata, sta);
+	if (local->ops->sta_add)
+		ret = local->ops->sta_add(&local->hw, &sdata->vif, sta);
+
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline void drv_sta_remove(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct ieee80211_sta *sta)
+{
+	might_sleep();
+
+	trace_drv_sta_remove(local, sdata, sta);
+	if (local->ops->sta_remove)
+		local->ops->sta_remove(&local->hw, &sdata->vif, sta);
+
+	trace_drv_return_void(local);
+}
+
+static inline int drv_conf_tx(struct ieee80211_local *local, u16 queue,
+			      const struct ieee80211_tx_queue_params *params)
+{
+	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	trace_drv_conf_tx(local, queue, params);
+	if (local->ops->conf_tx)
+		ret = local->ops->conf_tx(&local->hw, queue, params);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline u64 drv_get_tsf(struct ieee80211_local *local)
+{
+	u64 ret = -1ULL;
+
+	might_sleep();
+
+	trace_drv_get_tsf(local);
+	if (local->ops->get_tsf)
+		ret = local->ops->get_tsf(&local->hw);
+	trace_drv_return_u64(local, ret);
+	return ret;
+}
+
+static inline void drv_set_tsf(struct ieee80211_local *local, u64 tsf)
+{
+	might_sleep();
+
+	trace_drv_set_tsf(local, tsf);
+	if (local->ops->set_tsf)
+		local->ops->set_tsf(&local->hw, tsf);
+	trace_drv_return_void(local);
+}
+
+static inline void drv_reset_tsf(struct ieee80211_local *local)
+{
+	might_sleep();
+
+	trace_drv_reset_tsf(local);
+	if (local->ops->reset_tsf)
+		local->ops->reset_tsf(&local->hw);
+	trace_drv_return_void(local);
+}
+
+static inline int drv_tx_last_beacon(struct ieee80211_local *local)
+{
+	int ret = 0; /* default when the op is unsupported, for less congestion */
+
+	might_sleep();
+
+	trace_drv_tx_last_beacon(local);
+	if (local->ops->tx_last_beacon)
+		ret = local->ops->tx_last_beacon(&local->hw);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_ampdu_action(struct ieee80211_local *local,
+				   struct ieee80211_sub_if_data *sdata,
+				   enum ieee80211_ampdu_mlme_action action,
+				   struct ieee80211_sta *sta, u16 tid,
+				   u16 *ssn, u8 buf_size)
+{
+	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, buf_size);
+
+	if (local->ops->ampdu_action)
+		ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
+					       sta, tid, ssn, buf_size);
+
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline int drv_get_survey(struct ieee80211_local *local, int idx,
+				struct survey_info *survey)
+{
+	int ret = -EOPNOTSUPP;
+
+	trace_drv_get_survey(local, idx, survey);
+
+	if (local->ops->get_survey)
+		ret = local->ops->get_survey(&local->hw, idx, survey);
+
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline void drv_rfkill_poll(struct ieee80211_local *local)
+{
+	might_sleep();
+
+	if (local->ops->rfkill_poll)
+		local->ops->rfkill_poll(&local->hw);
+}
+
+static inline void drv_flush(struct ieee80211_local *local, bool drop)
+{
+	might_sleep();
+
+	trace_drv_flush(local, drop);
+	if (local->ops->flush)
+		local->ops->flush(&local->hw, drop);
+	trace_drv_return_void(local);
+}
+
+static inline void drv_channel_switch(struct ieee80211_local *local,
+				     struct ieee80211_channel_switch *ch_switch)
+{
+	might_sleep();
+
+	trace_drv_channel_switch(local, ch_switch);
+	local->ops->channel_switch(&local->hw, ch_switch);
+	trace_drv_return_void(local);
+}
+
+
+static inline int drv_set_antenna(struct ieee80211_local *local,
+				  u32 tx_ant, u32 rx_ant)
+{
+	int ret = -EOPNOTSUPP;
+	might_sleep();
+	if (local->ops->set_antenna)
+		ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant);
+	trace_drv_set_antenna(local, tx_ant, rx_ant, ret);
+	return ret;
+}
+
+static inline int drv_get_antenna(struct ieee80211_local *local,
+				  u32 *tx_ant, u32 *rx_ant)
+{
+	int ret = -EOPNOTSUPP;
+	might_sleep();
+	if (local->ops->get_antenna)
+		ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant);
+	trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret);
+	return ret;
+}
+
+static inline int drv_remain_on_channel(struct ieee80211_local *local,
+					struct ieee80211_channel *chan,
+					enum nl80211_channel_type chantype,
+					unsigned int duration)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_remain_on_channel(local, chan, chantype, duration);
+	ret = local->ops->remain_on_channel(&local->hw, chan, chantype,
+					    duration);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_cancel_remain_on_channel(local);
+	ret = local->ops->cancel_remain_on_channel(&local->hw);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline int drv_offchannel_tx(struct ieee80211_local *local,
+				    struct sk_buff *skb,
+				    struct ieee80211_channel *chan,
+				    enum nl80211_channel_type channel_type,
+				    unsigned int wait)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_offchannel_tx(local, skb, chan, channel_type, wait);
+	ret = local->ops->offchannel_tx(&local->hw, skb, chan,
+					channel_type, wait);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline int drv_offchannel_tx_cancel_wait(struct ieee80211_local *local)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_offchannel_tx_cancel_wait(local);
+	ret = local->ops->offchannel_tx_cancel_wait(&local->hw);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline int drv_set_ringparam(struct ieee80211_local *local,
+				    u32 tx, u32 rx)
+{
+	int ret = -EOPNOTSUPP;	/* same "op not implemented" default as sibling wrappers */
+
+	might_sleep();
+
+	trace_drv_set_ringparam(local, tx, rx);
+	if (local->ops->set_ringparam)
+		ret = local->ops->set_ringparam(&local->hw, tx, rx);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+static inline void drv_get_ringparam(struct ieee80211_local *local,
+				     u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max)
+{
+	might_sleep();
+
+	trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max);
+	if (local->ops->get_ringparam)
+		local->ops->get_ringparam(&local->hw, tx, tx_max, rx, rx_max);
+	trace_drv_return_void(local);
+}
+
+static inline bool drv_tx_frames_pending(struct ieee80211_local *local)
+{
+	bool ret = false;
+
+	might_sleep();
+
+	trace_drv_tx_frames_pending(local);
+	if (local->ops->tx_frames_pending)
+		ret = local->ops->tx_frames_pending(&local->hw);
+	trace_drv_return_bool(local, ret);
+
+	return ret;
+}
+
+static inline int drv_set_bitrate_mask(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       const struct cfg80211_bitrate_mask *mask)
+{
+	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	trace_drv_set_bitrate_mask(local, sdata, mask);
+	if (local->ops->set_bitrate_mask)
+		ret = local->ops->set_bitrate_mask(&local->hw,
+						   &sdata->vif, mask);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
+#endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/driver-trace.c b/net/mac80211/driver-trace.c
new file mode 100644
index 00000000..8ed8711b
--- /dev/null
+++ b/net/mac80211/driver-trace.c
@@ -0,0 +1,9 @@
+/* bug in tracepoint.h, it should include this */
+#include <linux/module.h>
+
+/* sparse isn't too happy with all macros... */
+#ifndef __CHECKER__
+#include "driver-ops.h"
+#define CREATE_TRACE_POINTS
+#include "driver-trace.h"
+#endif
diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h
new file mode 100644
index 00000000..ed9edcbd
--- /dev/null
+++ b/net/mac80211/driver-trace.h
@@ -0,0 +1,1348 @@
+#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __MAC80211_DRIVER_TRACE
+
+#include <linux/tracepoint.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+
+#if !defined(CONFIG_MAC80211_DRIVER_API_TRACER) || defined(__CHECKER__)
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, ...) \
+static inline void trace_ ## name(proto) {}
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(...)
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(evt_class, name, proto, ...) \
+static inline void trace_ ## name(proto) {}
+#endif /* tracer disabled or __CHECKER__: trace_*() are empty inline stubs */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mac80211
+/* LOCAL_*: wiphy name of "local"; keep the literal 32 equal to MAXNAME. */
+#define MAXNAME		32
+#define LOCAL_ENTRY	__array(char, wiphy_name, 32)
+#define LOCAL_ASSIGN	strlcpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME)
+#define LOCAL_PR_FMT	"%s"
+#define LOCAL_PR_ARG	__entry->wiphy_name
+/* STA_*: address of "sta" (zeroed if NULL); pass STA_PR_ARG for the %pM. */
+#define STA_ENTRY	__array(char, sta_addr, ETH_ALEN)
+#define STA_ASSIGN	(sta ? memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : memset(__entry->sta_addr, 0, ETH_ALEN))
+#define STA_PR_FMT	" sta:%pM"
+#define STA_PR_ARG	__entry->sta_addr
+/* VIF_*: type, p2p flag, pointer and netdev name (or "<nodev>") of "sdata". */
+#define VIF_ENTRY	__field(enum nl80211_iftype, vif_type) __field(void *, sdata)	\
+			__field(bool, p2p)						\
+			__string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
+#define VIF_ASSIGN	__entry->vif_type = sdata->vif.type; __entry->sdata = sdata;	\
+			__entry->p2p = sdata->vif.p2p;					\
+			__assign_str(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
+#define VIF_PR_FMT	" vif:%s(%d%s)"
+#define VIF_PR_ARG	__get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""
+
+/*
+ * Tracing for driver callbacks.
+ */
+
+DECLARE_EVENT_CLASS(local_only_evt,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local),
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+	),
+	TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
+);
+
+DECLARE_EVENT_CLASS(local_sdata_addr_evt,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__array(char, addr, 6)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		memcpy(__entry->addr, sdata->vif.addr, 6);
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " addr:%pM",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr
+	)
+);
+
+DECLARE_EVENT_CLASS(local_u32_evt,
+	TP_PROTO(struct ieee80211_local *local, u32 value),
+	TP_ARGS(local, value),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u32, value)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->value = value;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " value:%d",
+		LOCAL_PR_ARG, __entry->value
+	)
+);
+
+DECLARE_EVENT_CLASS(local_sdata_evt,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG
+	)
+);
+
+DEFINE_EVENT(local_only_evt, drv_return_void,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_return_int,
+	TP_PROTO(struct ieee80211_local *local, int ret),
+	TP_ARGS(local, ret),
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, ret)
+	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->ret = ret;
+	),
+	TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret)
+);
+
+TRACE_EVENT(drv_return_bool,
+	TP_PROTO(struct ieee80211_local *local, bool ret),
+	TP_ARGS(local, ret),
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(bool, ret)
+	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->ret = ret;
+	),
+	TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ?
+		  "true" : "false")
+);
+
+TRACE_EVENT(drv_return_u64,
+	TP_PROTO(struct ieee80211_local *local, u64 ret),
+	TP_ARGS(local, ret),
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u64, ret)
+	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->ret = ret;
+	),
+	TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret)
+);
+
+DEFINE_EVENT(local_only_evt, drv_start,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_suspend,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_resume,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_stop,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_change_interface,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 enum nl80211_iftype type, bool p2p),
+
+	TP_ARGS(local, sdata, type, p2p),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u32, new_type)
+		__field(bool, new_p2p)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->new_type = type;
+		__entry->new_p2p = p2p;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " new type:%d%s",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
+		__entry->new_p2p ? "/p2p" : ""
+	)
+);
+
+DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata)
+);
+
+TRACE_EVENT(drv_config,
+	TP_PROTO(struct ieee80211_local *local,
+		 u32 changed),
+
+	TP_ARGS(local, changed),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u32, changed)
+		__field(u32, flags)
+		__field(int, power_level)
+		__field(int, dynamic_ps_timeout)
+		__field(int, max_sleep_period)
+		__field(u16, listen_interval)
+		__field(u8, long_frame_max_tx_count)
+		__field(u8, short_frame_max_tx_count)
+		__field(int, center_freq)
+		__field(int, channel_type)
+		__field(int, smps)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->changed = changed;
+		__entry->flags = local->hw.conf.flags;
+		__entry->power_level = local->hw.conf.power_level;
+		__entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout;
+		__entry->max_sleep_period = local->hw.conf.max_sleep_period;
+		__entry->listen_interval = local->hw.conf.listen_interval;
+		__entry->long_frame_max_tx_count = local->hw.conf.long_frame_max_tx_count;
+		__entry->short_frame_max_tx_count = local->hw.conf.short_frame_max_tx_count;
+		__entry->center_freq = local->hw.conf.channel->center_freq;
+		__entry->channel_type = local->hw.conf.channel_type;
+		__entry->smps = local->hw.conf.smps_mode;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " ch:%#x freq:%d",
+		LOCAL_PR_ARG, __entry->changed, __entry->center_freq
+	)
+);
+
+TRACE_EVENT(drv_bss_info_changed,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_bss_conf *info,
+		 u32 changed),
+
+	TP_ARGS(local, sdata, info, changed),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(bool, assoc)
+		__field(u16, aid)
+		__field(bool, cts)
+		__field(bool, shortpre)
+		__field(bool, shortslot)
+		__field(u8, dtimper)
+		__field(u16, bcnint)
+		__field(u16, assoc_cap)
+		__field(u64, timestamp)
+		__field(u32, basic_rates)
+		__field(u32, changed)
+		__field(bool, enable_beacon)
+		__field(u16, ht_operation_mode)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->changed = changed;
+		__entry->aid = info->aid;
+		__entry->assoc = info->assoc;
+		__entry->shortpre = info->use_short_preamble;
+		__entry->cts = info->use_cts_prot;
+		__entry->shortslot = info->use_short_slot;
+		__entry->dtimper = info->dtim_period;
+		__entry->bcnint = info->beacon_int;
+		__entry->assoc_cap = info->assoc_capability;
+		__entry->timestamp = info->timestamp;
+		__entry->basic_rates = info->basic_rates;
+		__entry->enable_beacon = info->enable_beacon;
+		__entry->ht_operation_mode = info->ht_operation_mode;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " changed:%#x",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed
+	)
+);
+
+TRACE_EVENT(drv_prepare_multicast,
+	TP_PROTO(struct ieee80211_local *local, int mc_count),
+
+	TP_ARGS(local, mc_count),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, mc_count)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->mc_count = mc_count;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " prepare mc (%d)",
+		LOCAL_PR_ARG, __entry->mc_count
+	)
+);
+
+TRACE_EVENT(drv_configure_filter,
+	TP_PROTO(struct ieee80211_local *local,
+		 unsigned int changed_flags,
+		 unsigned int *total_flags,
+		 u64 multicast),
+
+	TP_ARGS(local, changed_flags, total_flags, multicast),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(unsigned int, changed)
+		__field(unsigned int, total)
+		__field(u64, multicast)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->changed = changed_flags;
+		__entry->total = *total_flags;
+		__entry->multicast = multicast;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " changed:%#x total:%#x",
+		LOCAL_PR_ARG, __entry->changed, __entry->total
+	)
+);
+
+TRACE_EVENT(drv_set_tim,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sta *sta, bool set),
+	/* Records the wiphy name, the station address and the "set" flag. */
+	TP_ARGS(local, sta, set),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		STA_ENTRY
+		__field(bool, set)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		STA_ASSIGN;
+		__entry->set = set;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT STA_PR_FMT " set:%d",
+		LOCAL_PR_ARG, STA_PR_ARG, __entry->set
+	)
+);
+
+TRACE_EVENT(drv_set_key,
+	TP_PROTO(struct ieee80211_local *local,
+		 enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_sta *sta,
+		 struct ieee80211_key_conf *key),
+
+	TP_ARGS(local, cmd, sdata, sta, key),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+		__field(u32, cipher)
+		__field(u8, hw_key_idx)
+		__field(u8, flags)
+		__field(s8, keyidx)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+		__entry->cipher = key->cipher;
+		__entry->flags = key->flags;
+		__entry->keyidx = key->keyidx;
+		__entry->hw_key_idx = key->hw_key_idx;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+	)
+);
+
+TRACE_EVENT(drv_update_tkip_key,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_key_conf *conf,
+		 struct ieee80211_sta *sta, u32 iv32),
+
+	TP_ARGS(local, sdata, conf, sta, iv32),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+		__field(u32, iv32)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+		__entry->iv32 = iv32;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x",
+		LOCAL_PR_ARG,VIF_PR_ARG,STA_PR_ARG, __entry->iv32
+	)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_hw_scan,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_sched_scan_start,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_sdata_evt, drv_sched_scan_stop,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata),
+	TP_ARGS(local, sdata)
+);
+
+DEFINE_EVENT(local_only_evt, drv_sw_scan_start,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_sw_scan_complete,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_get_stats,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_low_level_stats *stats,
+		 int ret),
+
+	TP_ARGS(local, stats, ret),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, ret)
+		__field(unsigned int, ackfail)
+		__field(unsigned int, rtsfail)
+		__field(unsigned int, fcserr)
+		__field(unsigned int, rtssucc)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->ret = ret;
+		__entry->ackfail = stats->dot11ACKFailureCount;
+		__entry->rtsfail = stats->dot11RTSFailureCount;
+		__entry->fcserr = stats->dot11FCSErrorCount;
+		__entry->rtssucc = stats->dot11RTSSuccessCount;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " ret:%d",
+		LOCAL_PR_ARG, __entry->ret
+	)
+);
+
+TRACE_EVENT(drv_get_tkip_seq,
+	TP_PROTO(struct ieee80211_local *local,
+		 u8 hw_key_idx, u32 *iv32, u16 *iv16),
+
+	TP_ARGS(local, hw_key_idx, iv32, iv16),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u8, hw_key_idx)
+		__field(u32, iv32)
+		__field(u16, iv16)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->hw_key_idx = hw_key_idx;
+		__entry->iv32 = *iv32;
+		__entry->iv16 = *iv16;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT, LOCAL_PR_ARG
+	)
+);
+
+DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold,
+	TP_PROTO(struct ieee80211_local *local, u32 value),
+	TP_ARGS(local, value)
+);
+
+DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold,
+	TP_PROTO(struct ieee80211_local *local, u32 value),
+	TP_ARGS(local, value)
+);
+
+TRACE_EVENT(drv_set_coverage_class,
+	TP_PROTO(struct ieee80211_local *local, u8 value),
+
+	TP_ARGS(local, value),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u8, value)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->value = value;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " value:%d",
+		LOCAL_PR_ARG, __entry->value
+	)
+);
+
+TRACE_EVENT(drv_sta_notify,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 enum sta_notify_cmd cmd,
+		 struct ieee80211_sta *sta),
+
+	TP_ARGS(local, sdata, cmd, sta),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+		__field(u32, cmd)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+		__entry->cmd = cmd;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " cmd:%d",
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd
+	)
+);
+
+TRACE_EVENT(drv_sta_add,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_sta *sta),
+
+	TP_ARGS(local, sdata, sta),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+	)
+);
+
+TRACE_EVENT(drv_sta_remove,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_sta *sta),
+
+	TP_ARGS(local, sdata, sta),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
+	)
+);
+
+TRACE_EVENT(drv_conf_tx,
+	TP_PROTO(struct ieee80211_local *local, u16 queue,
+		 const struct ieee80211_tx_queue_params *params),
+
+	TP_ARGS(local, queue, params),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u16, queue)
+		__field(u16, txop)
+		__field(u16, cw_min)
+		__field(u16, cw_max)
+		__field(u8, aifs)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->queue = queue;
+		__entry->txop = params->txop;
+		__entry->cw_max = params->cw_max;
+		__entry->cw_min = params->cw_min;
+		__entry->aifs = params->aifs;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " queue:%d",
+		LOCAL_PR_ARG, __entry->queue
+	)
+);
+
+DEFINE_EVENT(local_only_evt, drv_get_tsf,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_set_tsf,
+	TP_PROTO(struct ieee80211_local *local, u64 tsf),
+
+	TP_ARGS(local, tsf),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u64, tsf)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->tsf = tsf;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " tsf:%llu",
+		LOCAL_PR_ARG, (unsigned long long)__entry->tsf
+	)
+);
+
+DEFINE_EVENT(local_only_evt, drv_reset_tsf,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_ampdu_action,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 enum ieee80211_ampdu_mlme_action action,
+		 struct ieee80211_sta *sta, u16 tid,
+		 u16 *ssn, u8 buf_size),
+
+	TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		STA_ENTRY
+		__field(u32, action)
+		__field(u16, tid)
+		__field(u16, ssn)
+		__field(u8, buf_size)
+		VIF_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_ASSIGN;
+		__entry->action = action;
+		__entry->tid = tid;
+		__entry->ssn = ssn ? *ssn : 0;
+		__entry->buf_size = buf_size;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d",
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action,
+		__entry->tid, __entry->buf_size
+	)
+);
+
+TRACE_EVENT(drv_get_survey,
+	TP_PROTO(struct ieee80211_local *local, int idx,
+		 struct survey_info *survey),
+
+	TP_ARGS(local, idx, survey),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, idx)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->idx = idx;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " idx:%d",
+		LOCAL_PR_ARG, __entry->idx
+	)
+);
+
+TRACE_EVENT(drv_flush,
+	TP_PROTO(struct ieee80211_local *local, bool drop),
+
+	TP_ARGS(local, drop),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(bool, drop)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->drop = drop;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " drop:%d",
+		LOCAL_PR_ARG, __entry->drop
+	)
+);
+
+TRACE_EVENT(drv_channel_switch,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_channel_switch *ch_switch),
+
+	TP_ARGS(local, ch_switch),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u64, timestamp)
+		__field(bool, block_tx)
+		__field(u16, freq)
+		__field(u8, count)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->timestamp = ch_switch->timestamp;
+		__entry->block_tx = ch_switch->block_tx;
+		__entry->freq = ch_switch->channel->center_freq;
+		__entry->count = ch_switch->count;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " new freq:%u count:%d",
+		LOCAL_PR_ARG, __entry->freq, __entry->count
+	)
+);
+
+TRACE_EVENT(drv_set_antenna,
+	TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),
+
+	TP_ARGS(local, tx_ant, rx_ant, ret),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u32, tx_ant)
+		__field(u32, rx_ant)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->tx_ant = tx_ant;
+		__entry->rx_ant = rx_ant;
+		__entry->ret = ret;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
+		LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
+	)
+);
+
+TRACE_EVENT(drv_get_antenna,
+	TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),
+
+	TP_ARGS(local, tx_ant, rx_ant, ret),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u32, tx_ant)
+		__field(u32, rx_ant)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->tx_ant = tx_ant;
+		__entry->rx_ant = rx_ant;
+		__entry->ret = ret;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
+		LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
+	)
+);
+
+TRACE_EVENT(drv_remain_on_channel,
+	TP_PROTO(struct ieee80211_local *local, struct ieee80211_channel *chan,
+		 enum nl80211_channel_type chantype, unsigned int duration),
+
+	TP_ARGS(local, chan, chantype, duration),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, center_freq)
+		__field(int, channel_type)
+		__field(unsigned int, duration)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->center_freq = chan->center_freq;
+		__entry->channel_type = chantype;
+		__entry->duration = duration;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " freq:%dMHz duration:%dms",
+		LOCAL_PR_ARG, __entry->center_freq, __entry->duration
+	)
+);
+
+DEFINE_EVENT(local_only_evt, drv_cancel_remain_on_channel,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_offchannel_tx,
+	TP_PROTO(struct ieee80211_local *local, struct sk_buff *skb,
+		 struct ieee80211_channel *chan,
+		 enum nl80211_channel_type channel_type,
+		 unsigned int wait),
+
+	TP_ARGS(local, skb, chan, channel_type, wait),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, center_freq)
+		__field(int, channel_type)
+		__field(unsigned int, wait)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->center_freq = chan->center_freq;
+		__entry->channel_type = channel_type;
+		__entry->wait = wait;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " freq:%dMHz, wait:%dms",
+		LOCAL_PR_ARG, __entry->center_freq, __entry->wait
+	)
+);
+
+TRACE_EVENT(drv_set_ringparam,
+	TP_PROTO(struct ieee80211_local *local, u32 tx, u32 rx),
+
+	TP_ARGS(local, tx, rx),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u32, tx)
+		__field(u32, rx)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->tx = tx;
+		__entry->rx = rx;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " tx:%d rx %d",
+		LOCAL_PR_ARG, __entry->tx, __entry->rx
+	)
+);
+
+TRACE_EVENT(drv_get_ringparam,
+	TP_PROTO(struct ieee80211_local *local, u32 *tx, u32 *tx_max,
+		 u32 *rx, u32 *rx_max),
+
+	TP_ARGS(local, tx, tx_max, rx, rx_max),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u32, tx)
+		__field(u32, tx_max)
+		__field(u32, rx)
+		__field(u32, rx_max)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->tx = *tx;
+		__entry->tx_max = *tx_max;
+		__entry->rx = *rx;
+		__entry->rx_max = *rx_max;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " tx:%d tx_max %d rx %d rx_max %d",
+		LOCAL_PR_ARG,
+		__entry->tx, __entry->tx_max, __entry->rx, __entry->rx_max
+	)
+);
+
+DEFINE_EVENT(local_only_evt, drv_tx_frames_pending,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, drv_offchannel_tx_cancel_wait,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(drv_set_bitrate_mask,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 const struct cfg80211_bitrate_mask *mask),
+
+	TP_ARGS(local, sdata, mask),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u32, legacy_2g)
+		__field(u32, legacy_5g)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->legacy_2g = mask->control[IEEE80211_BAND_2GHZ].legacy;
+		__entry->legacy_5g = mask->control[IEEE80211_BAND_5GHZ].legacy;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " 2G Mask:0x%x 5G Mask:0x%x",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->legacy_2g, __entry->legacy_5g
+	)
+);
+
+/*
+ * Tracing for API calls that drivers call.
+ */
+
+TRACE_EVENT(api_start_tx_ba_session,
+	TP_PROTO(struct ieee80211_sta *sta, u16 tid),
+
+	TP_ARGS(sta, tid),
+
+	TP_STRUCT__entry(
+		STA_ENTRY
+		__field(u16, tid)
+	),
+
+	TP_fast_assign(
+		STA_ASSIGN;
+		__entry->tid = tid;
+	),
+
+	TP_printk(
+		STA_PR_FMT " tid:%d",
+		STA_PR_ARG, __entry->tid
+	)
+);
+
+TRACE_EVENT(api_start_tx_ba_cb,
+	TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),
+
+	TP_ARGS(sdata, ra, tid),
+
+	TP_STRUCT__entry(
+		VIF_ENTRY
+		__array(u8, ra, ETH_ALEN)
+		__field(u16, tid)
+	),
+
+	TP_fast_assign(
+		VIF_ASSIGN;
+		memcpy(__entry->ra, ra, ETH_ALEN);
+		__entry->tid = tid;
+	),
+
+	TP_printk(
+		VIF_PR_FMT " ra:%pM tid:%d",
+		VIF_PR_ARG, __entry->ra, __entry->tid
+	)
+);
+
+TRACE_EVENT(api_stop_tx_ba_session,
+	TP_PROTO(struct ieee80211_sta *sta, u16 tid),
+
+	TP_ARGS(sta, tid),
+
+	TP_STRUCT__entry(
+		STA_ENTRY
+		__field(u16, tid)
+	),
+
+	TP_fast_assign(
+		STA_ASSIGN;
+		__entry->tid = tid;
+	),
+
+	TP_printk(
+		STA_PR_FMT " tid:%d",
+		STA_PR_ARG, __entry->tid
+	)
+);
+
+TRACE_EVENT(api_stop_tx_ba_cb,
+	TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),
+
+	TP_ARGS(sdata, ra, tid),
+
+	TP_STRUCT__entry(
+		VIF_ENTRY
+		__array(u8, ra, ETH_ALEN)
+		__field(u16, tid)
+	),
+
+	TP_fast_assign(
+		VIF_ASSIGN;
+		memcpy(__entry->ra, ra, ETH_ALEN);
+		__entry->tid = tid;
+	),
+
+	TP_printk(
+		VIF_PR_FMT " ra:%pM tid:%d",
+		VIF_PR_ARG, __entry->ra, __entry->tid
+	)
+);
+
+DEFINE_EVENT(local_only_evt, api_restart_hw,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+TRACE_EVENT(api_beacon_loss,
+	TP_PROTO(struct ieee80211_sub_if_data *sdata),
+
+	TP_ARGS(sdata),
+
+	TP_STRUCT__entry(
+		VIF_ENTRY
+	),
+
+	TP_fast_assign(
+		VIF_ASSIGN;
+	),
+
+	TP_printk(
+		VIF_PR_FMT,
+		VIF_PR_ARG
+	)
+);
+
+TRACE_EVENT(api_connection_loss,
+	TP_PROTO(struct ieee80211_sub_if_data *sdata),
+
+	TP_ARGS(sdata),
+
+	TP_STRUCT__entry(
+		VIF_ENTRY
+	),
+
+	TP_fast_assign(
+		VIF_ASSIGN;
+	),
+
+	TP_printk(
+		VIF_PR_FMT,
+		VIF_PR_ARG
+	)
+);
+
+TRACE_EVENT(api_cqm_rssi_notify,
+	TP_PROTO(struct ieee80211_sub_if_data *sdata,
+		 enum nl80211_cqm_rssi_threshold_event rssi_event),
+
+	TP_ARGS(sdata, rssi_event),
+
+	TP_STRUCT__entry(
+		VIF_ENTRY
+		__field(u32, rssi_event)
+	),
+
+	TP_fast_assign(
+		VIF_ASSIGN;
+		__entry->rssi_event = rssi_event;
+	),
+
+	TP_printk(
+		VIF_PR_FMT " event:%d",
+		VIF_PR_ARG, __entry->rssi_event
+	)
+);
+
+TRACE_EVENT(api_scan_completed,
+	TP_PROTO(struct ieee80211_local *local, bool aborted),
+
+	TP_ARGS(local, aborted),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(bool, aborted)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->aborted = aborted;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " aborted:%d",
+		LOCAL_PR_ARG, __entry->aborted
+	)
+);
+
+TRACE_EVENT(api_sched_scan_results,
+	TP_PROTO(struct ieee80211_local *local),
+
+	TP_ARGS(local),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT, LOCAL_PR_ARG
+	)
+);
+
+TRACE_EVENT(api_sched_scan_stopped,
+	TP_PROTO(struct ieee80211_local *local),
+
+	TP_ARGS(local),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT, LOCAL_PR_ARG
+	)
+);
+
+TRACE_EVENT(api_sta_block_awake,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sta *sta, bool block),
+	/* Records the wiphy name, the station address and the "block" flag. */
+	TP_ARGS(local, sta, block),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		STA_ENTRY
+		__field(bool, block)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		STA_ASSIGN;
+		__entry->block = block;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT STA_PR_FMT " block:%d",
+		LOCAL_PR_ARG, STA_PR_ARG, __entry->block
+	)
+);
+
+TRACE_EVENT(api_chswitch_done,
+	TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success),
+
+	TP_ARGS(sdata, success),
+
+	TP_STRUCT__entry(
+		VIF_ENTRY
+		__field(bool, success)
+	),
+
+	TP_fast_assign(
+		VIF_ASSIGN;
+		__entry->success = success;
+	),
+
+	TP_printk(
+		VIF_PR_FMT " success=%d",
+		VIF_PR_ARG, __entry->success
+	)
+);
+
+DEFINE_EVENT(local_only_evt, api_ready_on_channel,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+DEFINE_EVENT(local_only_evt, api_remain_on_channel_expired,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local)
+);
+
+/*
+ * Tracing for internal functions
+ * (which may also be called in response to driver calls)
+ */
+
+TRACE_EVENT(wake_queue,
+	TP_PROTO(struct ieee80211_local *local, u16 queue,
+		 enum queue_stop_reason reason),
+
+	TP_ARGS(local, queue, reason),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u16, queue)
+		__field(u32, reason)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->queue = queue;
+		__entry->reason = reason;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " queue:%d, reason:%d",
+		LOCAL_PR_ARG, __entry->queue, __entry->reason
+	)
+);
+
+TRACE_EVENT(stop_queue,
+	TP_PROTO(struct ieee80211_local *local, u16 queue,
+		 enum queue_stop_reason reason),
+
+	TP_ARGS(local, queue, reason),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u16, queue)
+		__field(u32, reason)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->queue = queue;
+		__entry->reason = reason;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " queue:%d, reason:%d",
+		LOCAL_PR_ARG, __entry->queue, __entry->reason
+	)
+);
+#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE driver-trace
+#include <trace/define_trace.h>
diff --git a/net/mac80211/event.c b/net/mac80211/event.c
new file mode 100644
index 00000000..01ae7595
--- /dev/null
+++ b/net/mac80211/event.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * mac80211 - events
+ */
+#include <net/cfg80211.h>
+#include "ieee80211_i.h"
+
+/*
+ * Report a Michael MIC failure to userspace. A caller that knows the TSC of
+ * the failing frame (from the driver, or still in the frame) should pass it.
+ * Frames whose addr1[0] has bit 0 set are reported as group-key failures.
+ */
+void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
+				     struct ieee80211_hdr *hdr, const u8 *tsc,
+				     gfp_t gfp)
+{
+	enum nl80211_key_type key_type = (hdr->addr1[0] & 0x01) ?
+		NL80211_KEYTYPE_GROUP : NL80211_KEYTYPE_PAIRWISE;
+
+	cfg80211_michael_mic_failure(sdata->dev, hdr->addr2, key_type,
+				     keyidx, tsc, gfp);
+}
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
new file mode 100644
index 00000000..591add22
--- /dev/null
+++ b/net/mac80211/ht.c
@@ -0,0 +1,318 @@
+/*
+ * HT handling
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+
+void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
+				       struct ieee80211_ht_cap *ht_cap_ie,
+				       struct ieee80211_sta_ht_cap *ht_cap)
+{
+	u8 ampdu_info, tx_mcs_set_cap;
+	int i, max_tx_streams;
+
+	BUG_ON(!ht_cap);
+
+	memset(ht_cap, 0, sizeof(*ht_cap)); /* output starts out as "no HT" */
+
+	if (!ht_cap_ie || !sband->ht_cap.ht_supported)
+		return; /* no peer HT IE or no HT on our band: keep it zeroed */
+
+	ht_cap->ht_supported = true;
+
+	/*
+	 * The bits listed in this expression should be
+	 * the same for the peer and us; if the station
+	 * advertises more than we support we can't use
+	 * those bits, so we mask them out.
+	 */
+	ht_cap->cap = le16_to_cpu(ht_cap_ie->cap_info) &
+		(sband->ht_cap.cap |
+		 ~(IEEE80211_HT_CAP_LDPC_CODING |
+		   IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+		   IEEE80211_HT_CAP_GRN_FLD |
+		   IEEE80211_HT_CAP_SGI_20 |
+		   IEEE80211_HT_CAP_SGI_40 |
+		   IEEE80211_HT_CAP_DSSSCCK40));
+	/*
+	 * The STBC bits are asymmetric -- if we don't have
+	 * TX then mask out the peer's RX and vice versa.
+	 */
+	if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_TX_STBC))
+		ht_cap->cap &= ~IEEE80211_HT_CAP_RX_STBC;
+	if (!(sband->ht_cap.cap & IEEE80211_HT_CAP_RX_STBC))
+		ht_cap->cap &= ~IEEE80211_HT_CAP_TX_STBC;
+
+	ampdu_info = ht_cap_ie->ampdu_params_info; /* peer's A-MPDU parameters */
+	ht_cap->ampdu_factor =
+		ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR;
+	ht_cap->ampdu_density =
+		(ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2;
+
+	/* own MCS TX capabilities */
+	tx_mcs_set_cap = sband->ht_cap.mcs.tx_params;
+
+	/* Copy peer MCS TX capabilities, the driver might need them. */
+	ht_cap->mcs.tx_params = ht_cap_ie->mcs.tx_params;
+
+	/* can we TX with MCS rates? */
+	if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED))
+		return; /* rx_mask stays all-zero in that case */
+
+	/* Counting from 0, therefore +1 */
+	if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF)
+		max_tx_streams =
+			((tx_mcs_set_cap & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK)
+				>> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1;
+	else
+		max_tx_streams = IEEE80211_HT_MCS_TX_MAX_STREAMS;
+
+	/*
+	 * 802.11n-2009 20.3.5 / 20.6 says:
+	 * - indices 0 to 7 and 32 are single spatial stream
+	 * - 8 to 31 are multiple spatial streams using equal modulation
+	 *   [8..15 for two streams, 16..23 for three and 24..31 for four]
+	 * - remainder are multiple spatial streams using unequal modulation
+	 */
+	for (i = 0; i < max_tx_streams; i++) /* one rx_mask byte per stream */
+		ht_cap->mcs.rx_mask[i] =
+			sband->ht_cap.mcs.rx_mask[i] & ht_cap_ie->mcs.rx_mask[i];
+
+	if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION)
+		for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE;
+		     i < IEEE80211_HT_MCS_MASK_LEN; i++)
+			ht_cap->mcs.rx_mask[i] =
+				sband->ht_cap.mcs.rx_mask[i] &
+					ht_cap_ie->mcs.rx_mask[i];
+
+	/* handle MCS rate 32 too */
+	if (sband->ht_cap.mcs.rx_mask[32/8] & ht_cap_ie->mcs.rx_mask[32/8] & 1)
+		ht_cap->mcs.rx_mask[32/8] |= 1; /* bit 0 of byte 4 */
+}
+
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx)
+{
+	int tid;
+
+	cancel_work_sync(&sta->ampdu_mlme.work); /* cancel/wait for BA work */
+
+	for (tid = 0; tid < STA_TID_NUM; tid++) { /* stop TX and RX per TID */
+		__ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR, tx);
+		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+					       WLAN_REASON_QSTA_LEAVE_QBSS, tx);
+	}
+}
+
+void ieee80211_ba_session_work(struct work_struct *work)
+{
+	struct sta_info *sta =
+		container_of(work, struct sta_info, ampdu_mlme.work);
+	struct tid_ampdu_tx *tid_tx;
+	int tid;
+
+	/*
+	 * When this flag is set, new sessions should be
+	 * blocked, and existing sessions will be torn
+	 * down by the code that set the flag, so this
+	 * need not run.
+	 */
+	if (test_sta_flags(sta, WLAN_STA_BLOCK_BA))
+		return;
+
+	mutex_lock(&sta->ampdu_mlme.mtx); /* held across the whole TID walk */
+	for (tid = 0; tid < STA_TID_NUM; tid++) {
+		/* 1) RX session whose timer expired: stop it as recipient */
+		if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired))
+			___ieee80211_stop_rx_ba_session(
+				sta, tid, WLAN_BACK_RECIPIENT,
+				WLAN_REASON_QSTA_TIMEOUT, true);
+
+		tid_tx = sta->ampdu_mlme.tid_start_tx[tid]; /* 2) pending TX start */
+		if (tid_tx) {
+			/*
+			 * Assign it over to the normal tid_tx array
+			 * where it "goes live".
+			 */
+			spin_lock_bh(&sta->lock);
+
+			sta->ampdu_mlme.tid_start_tx[tid] = NULL;
+			/* could there be a race? */
+			if (sta->ampdu_mlme.tid_tx[tid])
+				kfree(tid_tx); /* slot already taken: drop ours */
+			else
+				ieee80211_assign_tid_tx(sta, tid, tid_tx);
+			spin_unlock_bh(&sta->lock);
+
+			ieee80211_tx_ba_session_handle_start(sta, tid);
+			continue; /* a just-started session is not checked for stop */
+		}
+
+		tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+		if (tid_tx && test_and_clear_bit(HT_AGG_STATE_WANT_STOP,
+						 &tid_tx->state)) /* 3) stop request */
+			___ieee80211_stop_tx_ba_session(sta, tid,
+							WLAN_BACK_INITIATOR,
+							true);
+	}
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+}
+
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
+			  const u8 *da, u16 tid,
+			  u16 initiator, u16 reason_code)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u16 params;
+
+	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+
+	if (!skb) { /* allocation failure is only logged, no frame is sent */
+		printk(KERN_ERR "%s: failed to allocate buffer "
+					"for delba frame\n", sdata->name);
+		return;
+	}
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); /* header part */
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	if (sdata->vif.type == NL80211_IFTYPE_AP ||
+	    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+	else if (sdata->vif.type == NL80211_IFTYPE_STATION)
+		memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+	/* any other interface type keeps the all-zero BSSID from the memset */
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); /* category + delba */
+
+	mgmt->u.action.category = WLAN_CATEGORY_BACK;
+	mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA;
+	params = (u16)(initiator << 11); 	/* bit 11 initiator */
+	params |= (u16)(tid << 12); 		/* bit 15:12 TID number */
+
+	mgmt->u.action.u.delba.params = cpu_to_le16(params);
+	mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code);
+
+	ieee80211_tx_skb(sdata, skb); /* hands the skb over for transmission */
+}
+
+void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
+			     struct sta_info *sta,
+			     struct ieee80211_mgmt *mgmt, size_t len)
+{
+	u16 tid, params;
+	u16 initiator;
+
+	/* NOTE(review): @len is not checked here - presumably the caller did */
+	params = le16_to_cpu(mgmt->u.action.u.delba.params);
+	tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12;
+	initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	if (net_ratelimit())
+		printk(KERN_DEBUG "delba from %pM (%s) tid %d reason code %d\n",
+			mgmt->sa, initiator ? "initiator" : "recipient", tid,
+			le16_to_cpu(mgmt->u.action.u.delba.reason_code));
+#endif /* CONFIG_MAC80211_HT_DEBUG */
+	/* sent by the session initiator: our RX side ends, else our TX side */
+	if (initiator == WLAN_BACK_INITIATOR)
+		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0,
+					       true);
+	else
+		__ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+					       true);
+}
+
+int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
+			       enum ieee80211_smps_mode smps, const u8 *da,
+			       const u8 *bssid)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *action_frame;
+
+	/* 27 = header + category + action + smps mode */
+	skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom);
+	if (!skb)
+		return -ENOMEM; /* the only error this function returns */
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	action_frame = (void *)skb_put(skb, 27); /* NOTE: not zero-filled here */
+	memcpy(action_frame->da, da, ETH_ALEN);
+	memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN);
+	memcpy(action_frame->bssid, bssid, ETH_ALEN);
+	action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_ACTION);
+	action_frame->u.action.category = WLAN_CATEGORY_HT;
+	action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS;
+	switch (smps) {
+	case IEEE80211_SMPS_AUTOMATIC:
+	case IEEE80211_SMPS_NUM_MODES:
+		WARN_ON(1); /* not sendable: warn, then fall through to OFF */
+	case IEEE80211_SMPS_OFF:
+		action_frame->u.action.u.ht_smps.smps_control =
+				WLAN_HT_SMPS_CONTROL_DISABLED;
+		break;
+	case IEEE80211_SMPS_STATIC:
+		action_frame->u.action.u.ht_smps.smps_control =
+				WLAN_HT_SMPS_CONTROL_STATIC;
+		break;
+	case IEEE80211_SMPS_DYNAMIC:
+		action_frame->u.action.u.ht_smps.smps_control =
+				WLAN_HT_SMPS_CONTROL_DYNAMIC;
+		break;
+	}
+
+	/* we'll do more on status of this frame */
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+	ieee80211_tx_skb(sdata, skb);
+
+	return 0; /* frame was queued; TX result arrives via the status path */
+}
+
+void ieee80211_request_smps_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	sdata = container_of(work, struct ieee80211_sub_if_data,
+			     u.mgd.request_smps_work);
+	mutex_lock(&sdata->u.mgd.mtx); /* apply the driver's mode under mtx */
+	__ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode);
+	mutex_unlock(&sdata->u.mgd.mtx);
+}
+
+void ieee80211_request_smps(struct ieee80211_vif *vif,
+			    enum ieee80211_smps_mode smps_mode)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+		return; /* only station interfaces are accepted */
+
+	if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF))
+		smps_mode = IEEE80211_SMPS_AUTOMATIC; /* OFF is replaced, with a warning */
+
+	sdata->u.mgd.driver_smps_mode = smps_mode;
+	/* the mode is stored here and applied later by request_smps_work */
+	ieee80211_queue_work(&sdata->local->hw,
+			     &sdata->u.mgd.request_smps_work);
+}
+/* this might change ... don't want non-open drivers using it */
+EXPORT_SYMBOL_GPL(ieee80211_request_smps);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
new file mode 100644
index 00000000..56c24cab
--- /dev/null
+++ b/net/mac80211/ibss.c
@@ -0,0 +1,1016 @@
+/*
+ * IBSS mode implementation
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+
+#define IEEE80211_SCAN_INTERVAL (2 * HZ)
+#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
+#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
+
+#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
+#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
+
+#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
+
+
+static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_mgmt *mgmt,
+					size_t len)
+{
+	u16 auth_alg, auth_transaction;
+
+	lockdep_assert_held(&sdata->u.ibss.mtx);
+
+	if (len < 24 + 6) /* header plus 6 bytes of fixed auth fields */
+		return;
+
+	auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
+	auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
+
+	/*
+	 * IEEE 802.11 standard does not require authentication in IBSS
+	 * networks and most implementations do not seem to use it.
+	 * However, try to reply to authentication attempts if someone
+	 * has actually implemented this.
+	 */
+	if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1) /* open, step 1 */
+		ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0,
+				    sdata->u.ibss.bssid, NULL, 0, 0);
+}
+
+static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+				      const u8 *bssid, const int beacon_int,
+				      struct ieee80211_channel *chan,
+				      const u32 basic_rates,
+				      const u16 capability, u64 tsf)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	int rates, i;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u8 *pos;
+	struct ieee80211_supported_band *sband;
+	struct cfg80211_bss *bss;
+	u32 bss_change;
+	u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
+
+	lockdep_assert_held(&ifibss->mtx);
+
+	/* Reset own TSF to allow time synchronization work. */
+	drv_reset_tsf(local);
+
+	skb = ifibss->skb; /* this buffer is reused for the new probe response */
+	rcu_assign_pointer(ifibss->presp, NULL); /* unpublish the old template */
+	synchronize_rcu(); /* wait for RCU readers before rewriting the skb */
+	skb->data = skb->head;
+	skb->len = 0;
+	skb_reset_tail_pointer(skb);
+	skb_reserve(skb, sdata->local->hw.extra_tx_headroom);
+
+	if (memcmp(ifibss->bssid, bssid, ETH_ALEN)) /* BSSID is changing */
+		sta_info_flush(sdata->local, sdata);
+
+	/* if merging, indicate to driver that we leave the old IBSS */
+	if (sdata->vif.bss_conf.ibss_joined) {
+		sdata->vif.bss_conf.ibss_joined = false;
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IBSS);
+	}
+
+	memcpy(ifibss->bssid, bssid, ETH_ALEN);
+
+	sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0;
+
+	local->oper_channel = chan;
+	WARN_ON(!ieee80211_set_channel_type(local, sdata, NL80211_CHAN_NO_HT));
+	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+
+	sband = local->hw.wiphy->bands[chan->band];
+
+	/* build supported rates array */
+	pos = supp_rates;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		int rate = sband->bitrates[i].bitrate;
+		u8 basic = 0;
+		if (basic_rates & BIT(i))
+			basic = 0x80; /* top bit flags a basic rate */
+		*pos++ = basic | (u8) (rate / 5);
+	}
+
+	/* Build IBSS probe response */
+	mgmt = (void *) skb_put(skb, 24 + sizeof(mgmt->u.beacon));
+	memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_PROBE_RESP);
+	memset(mgmt->da, 0xff, ETH_ALEN); /* broadcast DA in the template */
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
+	mgmt->u.beacon.beacon_int = cpu_to_le16(beacon_int);
+	mgmt->u.beacon.timestamp = cpu_to_le64(tsf);
+	mgmt->u.beacon.capab_info = cpu_to_le16(capability);
+
+	pos = skb_put(skb, 2 + ifibss->ssid_len); /* IE: id, length, payload */
+	*pos++ = WLAN_EID_SSID;
+	*pos++ = ifibss->ssid_len;
+	memcpy(pos, ifibss->ssid, ifibss->ssid_len);
+
+	rates = sband->n_bitrates;
+	if (rates > 8)
+		rates = 8; /* at most 8 here; the rest goes to EXT_SUPP_RATES */
+	pos = skb_put(skb, 2 + rates);
+	*pos++ = WLAN_EID_SUPP_RATES;
+	*pos++ = rates;
+	memcpy(pos, supp_rates, rates);
+
+	if (sband->band == IEEE80211_BAND_2GHZ) { /* DS params only on 2.4 GHz */
+		pos = skb_put(skb, 2 + 1);
+		*pos++ = WLAN_EID_DS_PARAMS;
+		*pos++ = 1;
+		*pos++ = ieee80211_frequency_to_channel(chan->center_freq);
+	}
+
+	pos = skb_put(skb, 2 + 2);
+	*pos++ = WLAN_EID_IBSS_PARAMS;
+	*pos++ = 2;
+	/* FIX: set ATIM window based on scan results */
+	*pos++ = 0;
+	*pos++ = 0;
+
+	if (sband->n_bitrates > 8) { /* rates beyond the first eight */
+		rates = sband->n_bitrates - 8;
+		pos = skb_put(skb, 2 + rates);
+		*pos++ = WLAN_EID_EXT_SUPP_RATES;
+		*pos++ = rates;
+		memcpy(pos, &supp_rates[8], rates);
+	}
+
+	if (ifibss->ie_len) /* extra IEs stored in ifibss are copied verbatim */
+		memcpy(skb_put(skb, ifibss->ie_len),
+		       ifibss->ie, ifibss->ie_len);
+
+	if (local->hw.queues >= 4) { /* WME IE only with at least 4 HW queues */
+		pos = skb_put(skb, 9);
+		*pos++ = WLAN_EID_VENDOR_SPECIFIC;
+		*pos++ = 7; /* len */
+		*pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
+		*pos++ = 0x50;
+		*pos++ = 0xf2;
+		*pos++ = 2; /* WME */
+		*pos++ = 0; /* WME info */
+		*pos++ = 1; /* WME ver */
+		*pos++ = 0; /* U-APSD no in use */
+	}
+
+	rcu_assign_pointer(ifibss->presp, skb); /* publish the new template */
+
+	sdata->vif.bss_conf.beacon_int = beacon_int;
+	sdata->vif.bss_conf.basic_rates = basic_rates;
+	bss_change = BSS_CHANGED_BEACON_INT;
+	bss_change |= ieee80211_reset_erp_info(sdata);
+	bss_change |= BSS_CHANGED_BSSID;
+	bss_change |= BSS_CHANGED_BEACON;
+	bss_change |= BSS_CHANGED_BEACON_ENABLED;
+	bss_change |= BSS_CHANGED_BASIC_RATES;
+	bss_change |= BSS_CHANGED_IBSS;
+	sdata->vif.bss_conf.ibss_joined = true;
+	ieee80211_bss_info_change_notify(sdata, bss_change);
+
+	ieee80211_sta_def_wmm_params(sdata, sband->n_bitrates, supp_rates);
+
+	ifibss->state = IEEE80211_IBSS_MLME_JOINED;
+	mod_timer(&ifibss->timer,
+		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
+
+	bss = cfg80211_inform_bss_frame(local->hw.wiphy, local->hw.conf.channel,
+					mgmt, skb->len, 0, GFP_KERNEL);
+	cfg80211_put_bss(bss); /* only the inform matters, drop the reference */
+	cfg80211_ibss_joined(sdata->dev, ifibss->bssid, GFP_KERNEL);
+}
+
+static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_bss *bss)
+{
+	struct cfg80211_bss *cbss =
+		container_of((void *)bss, struct cfg80211_bss, priv);
+	struct ieee80211_supported_band *sband;
+	u32 basic_rates;
+	int i, j;
+	u16 beacon_int = cbss->beacon_interval;
+
+	lockdep_assert_held(&sdata->u.ibss.mtx);
+
+	if (beacon_int < 10)
+		beacon_int = 10; /* clamp very small advertised intervals */
+
+	sband = sdata->local->hw.wiphy->bands[cbss->channel->band];
+
+	basic_rates = 0; /* bitmap indexed like sband->bitrates[] */
+
+	for (i = 0; i < bss->supp_rates_len; i++) {
+		int rate = (bss->supp_rates[i] & 0x7f) * 5;
+		bool is_basic = !!(bss->supp_rates[i] & 0x80);
+
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].bitrate == rate) {
+				if (is_basic)
+					basic_rates |= BIT(j);
+				break; /* first matching bitrate wins */
+			}
+		}
+	}
+
+	__ieee80211_sta_join_ibss(sdata, cbss->bssid,
+				  beacon_int,
+				  cbss->channel,
+				  basic_rates,
+				  cbss->capability,
+				  cbss->tsf);
+}
+
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len,
+				  struct ieee80211_rx_status *rx_status,
+				  struct ieee802_11_elems *elems,
+				  bool beacon)
+{
+	struct ieee80211_local *local = sdata->local;
+	int freq;
+	struct cfg80211_bss *cbss;
+	struct ieee80211_bss *bss;
+	struct sta_info *sta;
+	struct ieee80211_channel *channel;
+	u64 beacon_timestamp, rx_timestamp;
+	u32 supp_rates = 0;
+	enum ieee80211_band band = rx_status->band;
+
+	if (elems->ds_params && elems->ds_params_len == 1) /* prefer the DS IE */
+		freq = ieee80211_channel_to_frequency(elems->ds_params[0],
+						      band);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return; /* unknown or disabled channel: ignore the frame */
+
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+	    memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
+		/* frame from our own IBSS: refresh or create the sender's entry */
+		rcu_read_lock();
+		sta = sta_info_get(sdata, mgmt->sa);
+
+		if (elems->supp_rates) {
+			supp_rates = ieee80211_sta_get_rates(local, elems,
+							     band);
+			if (sta) {
+				u32 prev_rates;
+
+				prev_rates = sta->sta.supp_rates[band];
+				/* make sure mandatory rates are always added */
+				sta->sta.supp_rates[band] = supp_rates |
+					ieee80211_mandatory_rates(local, band);
+
+				if (sta->sta.supp_rates[band] != prev_rates) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+					printk(KERN_DEBUG
+						"%s: updated supp_rates set "
+						"for %pM based on beacon"
+						"/probe_resp (0x%x -> 0x%x)\n",
+						sdata->name, sta->sta.addr,
+						prev_rates,
+						sta->sta.supp_rates[band]);
+#endif
+					rate_control_rate_init(sta);
+				}
+			} else
+				sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid,
+						mgmt->sa, supp_rates,
+						GFP_ATOMIC);
+		}
+
+		if (sta && elems->wmm_info)
+			set_sta_flags(sta, WLAN_STA_WME);
+
+		rcu_read_unlock();
+	}
+
+	bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+					channel, beacon);
+	if (!bss)
+		return;
+
+	cbss = container_of((void *)bss, struct cfg80211_bss, priv);
+
+	/* was just updated in ieee80211_bss_info_update */
+	beacon_timestamp = cbss->tsf;
+
+	/* check if we need to merge IBSS */
+
+	/* we use a fixed BSSID */
+	if (sdata->u.ibss.fixed_bssid)
+		goto put_bss;
+
+	/* not an IBSS */
+	if (!(cbss->capability & WLAN_CAPABILITY_IBSS))
+		goto put_bss;
+
+	/* different channel */
+	if (cbss->channel != local->oper_channel)
+		goto put_bss;
+
+	/* different SSID */
+	if (elems->ssid_len != sdata->u.ibss.ssid_len ||
+	    memcmp(elems->ssid, sdata->u.ibss.ssid,
+				sdata->u.ibss.ssid_len))
+		goto put_bss;
+
+	/* same BSSID */
+	if (memcmp(cbss->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0)
+		goto put_bss;
+
+	if (rx_status->flag & RX_FLAG_MACTIME_MPDU) {
+		/*
+		 * For correct IBSS merging we need mactime; since mactime is
+		 * defined as the time the first data symbol of the frame hits
+		 * the PHY, and the timestamp of the beacon is defined as "the
+		 * time that the data symbol containing the first bit of the
+		 * timestamp is transmitted to the PHY plus the transmitting
+		 * STA's delays through its local PHY from the MAC-PHY
+		 * interface to its interface with the WM" (802.11 11.1.2)
+		 * - equals the time this bit arrives at the receiver - we have
+		 * to take into account the offset between the two.
+		 *
+		 * E.g. at 1 MBit that means mactime is 192 usec earlier
+		 * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
+		 */
+		int rate;
+
+		if (rx_status->flag & RX_FLAG_HT)
+			rate = 65; /* TODO: HT rates */
+		else
+			rate = local->hw.wiphy->bands[band]->
+				bitrates[rx_status->rate_idx].bitrate;
+
+		rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
+	} else {
+		/*
+		 * second best option: get current TSF
+		 * (will return -1 if not supported)
+		 */
+		rx_timestamp = drv_get_tsf(local);
+	}
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+	printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
+	       "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
+	       mgmt->sa, mgmt->bssid,
+	       (unsigned long long)rx_timestamp,
+	       (unsigned long long)beacon_timestamp,
+	       (long long)(rx_timestamp - beacon_timestamp), /* signed for %lld */
+	       jiffies);
+#endif
+
+	if (beacon_timestamp > rx_timestamp) { /* the peer's TSF is ahead: merge */
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+		printk(KERN_DEBUG "%s: beacon TSF higher than "
+		       "local TSF - IBSS merge with BSSID %pM\n",
+		       sdata->name, mgmt->bssid);
+#endif
+		ieee80211_sta_join_ibss(sdata, bss);
+		supp_rates = ieee80211_sta_get_rates(local, elems, band);
+		ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa,
+				       supp_rates, GFP_KERNEL);
+	}
+
+ put_bss:
+	ieee80211_rx_bss_put(local, bss); /* drop the ref from bss_info_update */
+}
+
+/*
+ * Allocate and insert a station entry for an IBSS peer. The RX code also
+ * calls this when, in IBSS mode, a frame arrives from a yet-unknown station,
+ * hence it must be callable in atomic context (see @gfp). Returns NULL if full.
+ */
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
+					u8 *bssid,u8 *addr, u32 supp_rates,
+					gfp_t gfp)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+	int band = local->hw.conf.channel->band;
+
+	/*
+	 * XXX: Consider removing the least recently used entry and
+	 *	allowing the new one to be added.
+	 */
+	if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
+		if (net_ratelimit())
+			printk(KERN_DEBUG "%s: No room for a new IBSS STA entry %pM\n",
+			       sdata->name, addr);
+		return NULL;
+	}
+
+	if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH)
+		return NULL; /* no peers are added while still searching */
+
+	if (compare_ether_addr(bssid, sdata->u.ibss.bssid))
+		return NULL; /* the frame belongs to a different BSSID */
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n",
+		    addr, sdata->name);
+#endif
+
+	sta = sta_info_alloc(sdata, addr, gfp);
+	if (!sta)
+		return NULL;
+
+	sta->last_rx = jiffies;
+	set_sta_flags(sta, WLAN_STA_AUTHORIZED); /* authorized right away */
+
+	/* make sure mandatory rates are always added */
+	sta->sta.supp_rates[band] = supp_rates |
+			ieee80211_mandatory_rates(local, band);
+
+	rate_control_rate_init(sta);
+
+	/* If it fails, maybe we raced another insertion? */
+	if (sta_info_insert(sta))
+		return sta_info_get(sdata, addr); /* use the existing entry, if any */
+	return sta;
+}
+
+static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
+{
+	struct sta_info *peer;
+	int found = 0;
+
+	lockdep_assert_held(&sdata->u.ibss.mtx);
+
+	/* Return 1 as soon as one of our stations was heard recently, else 0. */
+	rcu_read_lock();
+	list_for_each_entry_rcu(peer, &sdata->local->sta_list, list) {
+		if (peer->sdata != sdata)
+			continue;
+		/* "recently" means last RX within the merge interval */
+		if (!time_after(peer->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
+				jiffies))
+			continue;
+		found = 1;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Merge check; called with state == IEEE80211_IBSS_MLME_JOINED (mutex held)
+ */
+
+static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+	lockdep_assert_held(&ifibss->mtx);
+
+	mod_timer(&ifibss->timer,
+		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
+
+	ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
+
+	if (time_before(jiffies, ifibss->last_scan_completed +
+		       IEEE80211_IBSS_MERGE_INTERVAL))
+		return; /* last scan completed less than one interval ago */
+
+	if (ieee80211_sta_active_ibss(sdata))
+		return; /* a peer was heard recently, nothing to do */
+
+	if (ifibss->fixed_channel)
+		return; /* no merge scan when the channel is fixed */
+
+	printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
+	       "IBSS networks with same SSID (merge)\n", sdata->name);
+
+	/* fixed_channel is clear here, so no channel restriction applies */
+	ieee80211_request_internal_scan(sdata, ifibss->ssid,
+					ifibss->ssid_len, NULL);
+}
+
+static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	u8 bssid[ETH_ALEN];
+	u16 capability;
+	int i;
+
+	lockdep_assert_held(&ifibss->mtx);
+
+	if (ifibss->fixed_bssid) { /* a configured BSSID is used unchanged */
+		memcpy(bssid, ifibss->bssid, ETH_ALEN);
+	} else {
+		/* Generate random, not broadcast, locally administered BSSID. Mix in
+		 * own MAC address to make sure that devices that do not have proper
+		 * random number generator get different BSSID. */
+		get_random_bytes(bssid, ETH_ALEN);
+		for (i = 0; i < ETH_ALEN; i++)
+			bssid[i] ^= sdata->vif.addr[i];
+		bssid[0] &= ~0x01; /* clear the group (multicast) bit */
+		bssid[0] |= 0x02; /* set the locally-administered bit */
+	}
+
+	printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
+	       sdata->name, bssid);
+
+	capability = WLAN_CAPABILITY_IBSS;
+
+	if (ifibss->privacy)
+		capability |= WLAN_CAPABILITY_PRIVACY;
+	else
+		sdata->drop_unencrypted = 0;
+
+	__ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int,
+				  ifibss->channel, ifibss->basic_rates,
+				  capability, 0); /* a new IBSS starts with TSF 0 */
+}
+
+/*
+ * This function is called with state == IEEE80211_IBSS_MLME_SEARCH
+ */
+
+static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	struct cfg80211_bss *cbss;
+	struct ieee80211_channel *chan = NULL;
+	const u8 *bssid = NULL;
+	int active_ibss;
+	u16 capability;
+
+	lockdep_assert_held(&ifibss->mtx);
+
+	active_ibss = ieee80211_sta_active_ibss(sdata);
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+	printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
+	       sdata->name, active_ibss);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+	if (active_ibss)
+		return; /* a peer was heard recently: nothing to search for */
+
+	capability = WLAN_CAPABILITY_IBSS;
+	if (ifibss->privacy)
+		capability |= WLAN_CAPABILITY_PRIVACY;
+	if (ifibss->fixed_bssid)
+		bssid = ifibss->bssid;
+	if (ifibss->fixed_channel)
+		chan = ifibss->channel;
+	if (!is_zero_ether_addr(ifibss->bssid)) /* any non-zero BSSID is used */
+		bssid = ifibss->bssid;
+	cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid,
+				ifibss->ssid, ifibss->ssid_len,
+				WLAN_CAPABILITY_IBSS | WLAN_CAPABILITY_PRIVACY,
+				capability);
+
+	if (cbss) { /* a matching BSS is already known: join it */
+		struct ieee80211_bss *bss;
+
+		bss = (void *)cbss->priv;
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+		printk(KERN_DEBUG "   sta_find_ibss: selected %pM current "
+		       "%pM\n", cbss->bssid, ifibss->bssid);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+		printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
+		       " based on configured SSID\n",
+		       sdata->name, cbss->bssid);
+
+		ieee80211_sta_join_ibss(sdata, bss);
+		ieee80211_rx_bss_put(local, bss);
+		return;
+	}
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+	printk(KERN_DEBUG "   did not try to join ibss\n");
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+	/* Selected IBSS not found in current scan results - try to scan */
+	if (time_after(jiffies, ifibss->last_scan_completed +
+					IEEE80211_SCAN_INTERVAL)) {
+		printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
+		       "join\n", sdata->name);
+
+		ieee80211_request_internal_scan(sdata,
+				ifibss->ssid, ifibss->ssid_len,
+				ifibss->fixed_channel ? ifibss->channel : NULL);
+	} else {
+		int interval = IEEE80211_SCAN_INTERVAL;
+
+		if (time_after(jiffies, ifibss->ibss_join_req +
+			       IEEE80211_IBSS_JOIN_TIMEOUT)) {
+			if (!(local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS)) {
+				ieee80211_sta_create_ibss(sdata);
+				return;
+			}
+			printk(KERN_DEBUG "%s: IBSS not allowed on"
+			       " %d MHz\n", sdata->name,
+			       local->hw.conf.channel->center_freq);
+
+			/* No IBSS found and none may be created here - switch to
+			 * the slow scan interval and continue scanning. */
+			interval = IEEE80211_SCAN_INTERVAL_SLOW;
+		}
+
+		mod_timer(&ifibss->timer,
+			  round_jiffies(jiffies + interval));
+	}
+}
+
+static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
+					struct sk_buff *req)
+{
+	struct ieee80211_mgmt *mgmt = (void *)req->data;
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	int tx_last_beacon, len = req->len;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *resp;
+	struct sk_buff *presp;
+	u8 *pos, *end;
+
+	lockdep_assert_held(&ifibss->mtx);
+
+	presp = rcu_dereference_protected(ifibss->presp,
+					  lockdep_is_held(&ifibss->mtx));
+
+	if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
+	    len < 24 + 2 || !presp) /* need header + 2-byte IE header, template */
+		return;
+
+	tx_last_beacon = drv_tx_last_beacon(local);
+
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+	printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
+	       " (tx_last_beacon=%d)\n",
+	       sdata->name, mgmt->sa, mgmt->da,
+	       mgmt->bssid, tx_last_beacon);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+
+	if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da))
+		return; /* group-addressed request and we did not TX the last beacon */
+
+	if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 &&
+	    memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
+		return; /* neither our BSSID nor the broadcast BSSID */
+
+	end = ((u8 *) mgmt) + len;
+	pos = mgmt->u.probe_req.variable;
+	if (pos[0] != WLAN_EID_SSID ||
+	    pos + 2 + pos[1] > end) { /* first IE must be a complete SSID IE */
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+		printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
+		       "from %pM\n",
+		       sdata->name, mgmt->sa);
+#endif
+		return;
+	}
+	if (pos[1] != 0 && /* a zero-length SSID is always answered */
+	    (pos[1] != ifibss->ssid_len ||
+	     memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) {
+		/* Ignore ProbeReq for foreign SSID */
+		return;
+	}
+
+	/* Reply with ProbeResp */
+	skb = skb_copy(presp, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	resp = (struct ieee80211_mgmt *) skb->data;
+	memcpy(resp->da, mgmt->sa, ETH_ALEN); /* address the copy to the requester */
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+	printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
+	       sdata->name, resp->da);
+#endif /* CONFIG_MAC80211_IBSS_DEBUG */
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
+					 struct ieee80211_mgmt *mgmt,
+					 size_t len,
+					 struct ieee80211_rx_status *rx_status)
+{
+	u8 *ies = mgmt->u.probe_resp.variable;
+	size_t hdr_len = ies - (u8 *) mgmt;
+	struct ieee802_11_elems elems;
+
+	/* only ProbeResp frames addressed to this interface are of interest */
+	if (memcmp(mgmt->da, sdata->vif.addr, ETH_ALEN) != 0)
+		return;
+	/* too short to carry the fixed probe response fields */
+	if (len < hdr_len)
+		return;
+
+	/* parse the IEs and feed the common BSS-info handling (not a beacon) */
+	ieee802_11_parse_elems(ies, len - hdr_len, &elems);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false);
+}
+
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len,
+				     struct ieee80211_rx_status *rx_status)
+{
+	u8 *ies = mgmt->u.beacon.variable;
+	size_t hdr_len = ies - (u8 *) mgmt;
+	struct ieee802_11_elems elems;
+
+	/* drop frames too short to hold the fixed beacon fields */
+	if (len < hdr_len)
+		return;
+
+	/* parse the IEs, then hand the beacon to the common BSS-info path */
+	ieee802_11_parse_elems(ies, len - hdr_len, &elems);
+	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
+}
+
+/* Dispatch one queued management frame to its IBSS handler, under ibss.mtx. */
+void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) skb->data;
+	struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
+	u16 stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+
+	mutex_lock(&sdata->u.ibss.mtx);
+
+	/* not ready to merge yet */
+	if (!sdata->u.ibss.ssid_len) {
+		mutex_unlock(&sdata->u.ibss.mtx);
+		return;
+	}
+
+	/* subtypes without a case below are ignored */
+	switch (stype) {
+	case IEEE80211_STYPE_PROBE_REQ:
+		ieee80211_rx_mgmt_probe_req(sdata, skb);
+		break;
+	case IEEE80211_STYPE_PROBE_RESP:
+		ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
+					     rx_status);
+		break;
+	case IEEE80211_STYPE_BEACON:
+		ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
+					 rx_status);
+		break;
+	case IEEE80211_STYPE_AUTH:
+		ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len);
+		break;
+	}
+
+	mutex_unlock(&sdata->u.ibss.mtx);
+}
+
+/*
+ * Run one step of the IBSS state machine for @sdata, holding ifibss->mtx.
+ */
+void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+	mutex_lock(&ifibss->mtx);
+
+	/*
+	 * Work can be queued (e.g. on scan completion) while no network is
+	 * configured; ssid_len is zero then and there is nothing to do.
+	 */
+	if (ifibss->ssid_len) {
+		switch (ifibss->state) {
+		case IEEE80211_IBSS_MLME_SEARCH:
+			ieee80211_sta_find_ibss(sdata);
+			break;
+		case IEEE80211_IBSS_MLME_JOINED:
+			ieee80211_sta_merge_ibss(sdata);
+			break;
+		default:
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	mutex_unlock(&ifibss->mtx);
+}
+
+/* Timer callback: queue the interface work, or note the timer if quiescing. */
+static void ieee80211_ibss_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata = (void *) data;
+	struct ieee80211_local *local = sdata->local;
+
+	if (!local->quiescing) {
+		ieee80211_queue_work(&local->hw, &sdata->work);
+		return;
+	}
+
+	/* remembered so that ieee80211_ibss_restart() re-adds the timer */
+	sdata->u.ibss.timer_running = true;
+}
+
+#ifdef CONFIG_PM
+/* Stop the IBSS timer; if it was still pending, flag it for restart. */
+void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata)
+{
+	if (!del_timer_sync(&sdata->u.ibss.timer))
+		return;
+	sdata->u.ibss.timer_running = true;
+}
+
+/* Re-add the IBSS timer if timer_running was flagged, then clear the flag. */
+void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata)
+{
+	if (!sdata->u.ibss.timer_running)
+		return;
+
+	add_timer(&sdata->u.ibss.timer);
+	sdata->u.ibss.timer_running = false;
+}
+#endif
+
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+
+	setup_timer(&ifibss->timer, ieee80211_ibss_timer,
+		    (unsigned long) sdata);	/* handed back to the callback */
+	mutex_init(&ifibss->mtx);	/* taken by IBSS rx/work/join/leave */
+}
+
+/* scan finished notification: timestamp and kick every running ad-hoc vif */
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	mutex_lock(&local->iflist_mtx);
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata) ||
+		    sdata->vif.type != NL80211_IFTYPE_ADHOC)
+			continue;
+
+		sdata->u.ibss.last_scan_completed = jiffies;
+		ieee80211_queue_work(&local->hw, &sdata->work);
+	}
+	mutex_unlock(&local->iflist_mtx);
+}
+
+/*
+ * Record the join parameters under ibss.mtx, enter SEARCH state and queue
+ * the interface work.  Returns 0, or -ENOMEM if the skb allocation fails.
+ */
+int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
+			struct cfg80211_ibss_params *params)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct sk_buff *skb;
+
+	skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom +
+			    36 /* bitrates */ +
+			    34 /* SSID */ +
+			    3  /* DS params */ +
+			    4  /* IBSS params */ +
+			    params->ie_len);
+	if (!skb)
+		return -ENOMEM;
+
+	mutex_lock(&ifibss->mtx);
+	ifibss->fixed_bssid = params->bssid != NULL;
+	if (params->bssid)
+		memcpy(ifibss->bssid, params->bssid, ETH_ALEN);
+
+	ifibss->privacy = params->privacy;
+	ifibss->basic_rates = params->basic_rates;
+	memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate,
+	       sizeof(params->mcast_rate));
+	sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+	ifibss->channel = params->channel;
+	ifibss->fixed_channel = params->channel_fixed;
+
+	/* fix ourselves to that channel now already */
+	if (params->channel_fixed) {
+		sdata->local->oper_channel = params->channel;
+		WARN_ON(!ieee80211_set_channel_type(sdata->local, sdata,
+						    NL80211_CHAN_NO_HT));
+	}
+
+	if (params->ie) {
+		ifibss->ie = kmemdup(params->ie, params->ie_len, GFP_KERNEL);
+		if (ifibss->ie)
+			ifibss->ie_len = params->ie_len;
+	}
+
+	ifibss->skb = skb;
+	ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+	ifibss->ibss_join_req = jiffies;
+
+	/* only ssid_len bytes of params->ssid are valid; do not read past */
+	memcpy(ifibss->ssid, params->ssid, params->ssid_len);
+	ifibss->ssid_len = params->ssid_len;
+
+	mutex_unlock(&ifibss->mtx);
+
+	mutex_lock(&sdata->local->mtx);
+	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
+
+	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+
+	return 0;
+}
+
+/*
+ * Leave the IBSS: unlink its cfg80211 BSS entry unless stations are still
+ * active, reset the join state, flush stations, free the beacon; returns 0.
+ */
+int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	struct cfg80211_bss *cbss;
+	struct sk_buff *skb;
+	u16 capability;
+
+	mutex_lock(&ifibss->mtx);
+
+	/* must run before bssid/ssid_len are cleared: the lookup needs both */
+	if (!ieee80211_sta_active_ibss(sdata) &&
+	    !is_zero_ether_addr(ifibss->bssid)) {
+		capability = WLAN_CAPABILITY_IBSS;
+		if (ifibss->privacy)
+			capability |= WLAN_CAPABILITY_PRIVACY;
+		cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->channel,
+					ifibss->bssid, ifibss->ssid,
+					ifibss->ssid_len, WLAN_CAPABILITY_IBSS |
+					WLAN_CAPABILITY_PRIVACY,
+					capability);
+		if (cbss) {
+			cfg80211_unlink_bss(local->hw.wiphy, cbss);
+			cfg80211_put_bss(cbss);
+		}
+	}
+
+	ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
+	memset(ifibss->bssid, 0, ETH_ALEN);
+	ifibss->ssid_len = 0;
+	sta_info_flush(local, sdata);
+
+	/* remove beacon; NULL ie so a later join cannot reuse freed memory */
+	kfree(ifibss->ie);
+	ifibss->ie = NULL;
+	ifibss->ie_len = 0;
+	skb = rcu_dereference_protected(ifibss->presp,
+					lockdep_is_held(&ifibss->mtx));
+	rcu_assign_pointer(ifibss->presp, NULL);
+	sdata->vif.bss_conf.ibss_joined = false;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED |
+						BSS_CHANGED_IBSS);
+	synchronize_rcu();
+	kfree_skb(skb);
+
+	skb_queue_purge(&sdata->skb_queue);
+	del_timer_sync(&ifibss->timer);
+
+	mutex_unlock(&ifibss->mtx);
+
+	mutex_lock(&local->mtx);
+	ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+
+	return 0;
+}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
new file mode 100644
index 00000000..3fdac77b
--- /dev/null
+++ b/net/mac80211/ieee80211_i.h
@@ -0,0 +1,1408 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211_I_H
+#define IEEE80211_I_H
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/if_ether.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/etherdevice.h>
+#include <linux/leds.h>
+#include <net/ieee80211_radiotap.h>
+#include <net/cfg80211.h>
+#include <net/mac80211.h>
+#include "key.h"
+#include "sta_info.h"
+
+struct ieee80211_local;
+
+/* Maximum number of broadcast/multicast frames to buffer when some of the
+ * associated stations are using power saving. */
+#define AP_MAX_BC_BUFFER 128
+
+/* Maximum number of frames buffered to all STAs, including multicast frames.
+ * Note: increasing this limit increases the potential memory requirement. Each
+ * frame can be up to about 2 kB long. */
+#define TOTAL_MAX_TX_BUFFER 512
+
+/* Required encryption head and tailroom */
+#define IEEE80211_ENCRYPT_HEADROOM 8
+#define IEEE80211_ENCRYPT_TAILROOM 18
+
+/* IEEE 802.11 (Ch. 9.5 Defragmentation) requires support for concurrent
+ * reception of at least three fragmented frames. This limit can be increased
+ * by changing this define, at the cost of slower frame reassembly and
+ * increased memory use (about 2 kB of RAM per entry). */
+#define IEEE80211_FRAGMENT_MAX 4
+/* jiffies value x units from now; x is multiplied by 1024 to get usecs */
+#define TU_TO_EXP_TIME(x)	(jiffies + usecs_to_jiffies((x) * 1024))
+
+#define IEEE80211_DEFAULT_UAPSD_QUEUES \
+	(IEEE80211_WMM_IE_STA_QOSINFO_AC_BK |	\
+	 IEEE80211_WMM_IE_STA_QOSINFO_AC_BE |	\
+	 IEEE80211_WMM_IE_STA_QOSINFO_AC_VI |	\
+	 IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
+
+#define IEEE80211_DEFAULT_MAX_SP_LEN		\
+	IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL
+
+struct ieee80211_fragment_entry {	/* one slot of sdata->fragments[] */
+	unsigned long first_frag_time;	/* presumably in jiffies -- confirm */
+	unsigned int seq;
+	unsigned int rx_queue;
+	unsigned int last_frag;
+	unsigned int extra_len;
+	struct sk_buff_head skb_list;
+	int ccmp; /* Whether fragments were encrypted with CCMP */
+	u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
+};
+
+
+struct ieee80211_bss {
+	/* don't want to look up all the time */
+	size_t ssid_len;
+	u8 ssid[IEEE80211_MAX_SSID_LEN];
+
+	u8 dtim_period;
+
+	bool wmm_used;
+	bool uapsd_supported;
+
+	unsigned long last_probe_resp;
+
+#ifdef CONFIG_MAC80211_MESH
+	u8 *mesh_id;	/* read through bss_mesh_id() */
+	size_t mesh_id_len;	/* read through bss_mesh_id_len() */
+	u8 *mesh_cfg;	/* read through bss_mesh_cfg() */
+#endif
+
+#define IEEE80211_MAX_SUPP_RATES 32
+	u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
+	size_t supp_rates_len;
+
+	/*
+	 * During association, we save an ERP value from a probe response so
+	 * that we can feed ERP info to the driver when the association
+	 * completes. These fields probably won't be up-to-date otherwise;
+	 * you probably don't want to use them.
+	 */
+	bool has_erp_value;
+	u8 erp_value;
+};
+
+static inline u8 *bss_mesh_cfg(struct ieee80211_bss *bss)
+{
+#ifdef CONFIG_MAC80211_MESH
+	return bss->mesh_cfg;
+#endif
+	return NULL;	/* only reached without CONFIG_MAC80211_MESH */
+}
+
+static inline u8 *bss_mesh_id(struct ieee80211_bss *bss)
+{
+#ifdef CONFIG_MAC80211_MESH
+	return bss->mesh_id;
+#endif
+	return NULL;	/* only reached without CONFIG_MAC80211_MESH */
+}
+
+static inline u8 bss_mesh_id_len(struct ieee80211_bss *bss)
+{
+#ifdef CONFIG_MAC80211_MESH
+	return bss->mesh_id_len;	/* size_t field narrowed to u8 */
+#endif
+	return 0;	/* only reached without CONFIG_MAC80211_MESH */
+}
+
+
+typedef unsigned __bitwise__ ieee80211_tx_result;
+#define TX_CONTINUE	((__force ieee80211_tx_result) 0u)
+#define TX_DROP		((__force ieee80211_tx_result) 1u)
+#define TX_QUEUED	((__force ieee80211_tx_result) 2u)
+
+#define IEEE80211_TX_FRAGMENTED		BIT(0)
+#define IEEE80211_TX_UNICAST		BIT(1)
+#define IEEE80211_TX_PS_BUFFERED	BIT(2)
+
+struct ieee80211_tx_data {
+	struct sk_buff *skb;
+	struct ieee80211_local *local;
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+	struct ieee80211_key *key;
+
+	struct ieee80211_channel *channel;
+
+	u16 ethertype;
+	unsigned int flags;
+};
+
+
+typedef unsigned __bitwise__ ieee80211_rx_result;
+#define RX_CONTINUE		((__force ieee80211_rx_result) 0u)
+#define RX_DROP_UNUSABLE	((__force ieee80211_rx_result) 1u)
+#define RX_DROP_MONITOR		((__force ieee80211_rx_result) 2u)
+#define RX_QUEUED		((__force ieee80211_rx_result) 3u)
+
+/**
+ * enum ieee80211_packet_rx_flags - packet RX flags
+ * @IEEE80211_RX_IN_SCAN: received while scanning
+ * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed
+ *	(incl. multicast frames)
+ * @IEEE80211_RX_FRAGMENTED: fragmented frame
+ * @IEEE80211_RX_AMSDU: a-MSDU packet
+ * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
+ * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering
+ *
+ * These are per-frame flags that are attached to a frame in the
+ * @rx_flags field of &struct ieee80211_rx_status.
+ */
+enum ieee80211_packet_rx_flags {
+	IEEE80211_RX_IN_SCAN			= BIT(0),
+	IEEE80211_RX_RA_MATCH			= BIT(1),
+	IEEE80211_RX_FRAGMENTED			= BIT(2),
+	IEEE80211_RX_AMSDU			= BIT(3),
+	IEEE80211_RX_MALFORMED_ACTION_FRM	= BIT(4),
+	IEEE80211_RX_DEFERRED_RELEASE		= BIT(5),
+};
+
+/**
+ * enum ieee80211_rx_flags - RX data flags
+ *
+ * @IEEE80211_RX_CMNTR: received on cooked monitor already
+ *
+ * These flags are used across handling multiple interfaces
+ * for a single frame.
+ */
+enum ieee80211_rx_flags {
+	IEEE80211_RX_CMNTR		= BIT(0),
+};
+
+struct ieee80211_rx_data {
+	struct sk_buff *skb;
+	struct ieee80211_local *local;
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+	struct ieee80211_key *key;
+
+	unsigned int flags;
+	int queue;
+	u32 tkip_iv32;
+	u16 tkip_iv16;
+};
+
+struct beacon_data {
+	u8 *head, *tail;
+	int head_len, tail_len;
+	int dtim_period;
+};
+
+struct ieee80211_if_ap {
+	struct beacon_data __rcu *beacon;
+
+	struct list_head vlans;
+
+	/* yes, this looks ugly, but guarantees that we can later use
+	 * bitmap_empty :)
+	 * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */
+	u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)];
+	struct sk_buff_head ps_bc_buf;
+	atomic_t num_sta_ps; /* number of stations in PS mode */
+	int dtim_count;
+	bool dtim_bc_mc;
+};
+
+struct ieee80211_if_wds {
+	struct sta_info *sta;
+	u8 remote_addr[ETH_ALEN];
+};
+
+struct ieee80211_if_vlan {
+	struct list_head list;
+
+	/* used for all tx if the VLAN is configured to 4-addr mode */
+	struct sta_info __rcu *sta;
+};
+
+struct mesh_stats {
+	__u32 fwded_mcast;		/* Mesh forwarded multicast frames */
+	__u32 fwded_unicast;		/* Mesh forwarded unicast frames */
+	__u32 fwded_frames;		/* Mesh total forwarded frames */
+	__u32 dropped_frames_ttl;	/* Not transmitted since mesh_ttl == 0 */
+	__u32 dropped_frames_no_route;	/* Not transmitted, no route found */
+	atomic_t estab_plinks;	/* presumably established peer links -- confirm */
+};
+
+#define PREQ_Q_F_START		0x1
+#define PREQ_Q_F_REFRESH	0x2
+struct mesh_preq_queue {
+	struct list_head list;
+	u8 dst[ETH_ALEN];
+	u8 flags;
+};
+
+enum ieee80211_work_type {
+	IEEE80211_WORK_ABORT,
+	IEEE80211_WORK_DIRECT_PROBE,
+	IEEE80211_WORK_AUTH,
+	IEEE80211_WORK_ASSOC_BEACON_WAIT,
+	IEEE80211_WORK_ASSOC,
+	IEEE80211_WORK_REMAIN_ON_CHANNEL,
+	IEEE80211_WORK_OFFCHANNEL_TX,
+};
+
+/**
+ * enum work_done_result - indicates what to do after work was done
+ *
+ * @WORK_DONE_DESTROY: This work item is no longer needed, destroy.
+ * @WORK_DONE_REQUEUE: This work item was reset to be reused, and
+ *	should be requeued.
+ */
+enum work_done_result {
+	WORK_DONE_DESTROY,
+	WORK_DONE_REQUEUE,
+};
+
+struct ieee80211_work {
+	struct list_head list;
+
+	struct rcu_head rcu_head;
+
+	struct ieee80211_sub_if_data *sdata;
+
+	/* result says whether the item is destroyed or requeued */
+	enum work_done_result (*done)(struct ieee80211_work *wk,
+				      struct sk_buff *skb);
+	struct ieee80211_channel *chan;
+	enum nl80211_channel_type chan_type;
+
+	unsigned long timeout;
+	enum ieee80211_work_type type;
+
+	u8 filter_ta[ETH_ALEN];
+
+	bool started;
+
+	union {	/* presumably selected by @type -- confirm */
+		struct {
+			int tries;
+			u16 algorithm, transaction;
+			u8 ssid[IEEE80211_MAX_SSID_LEN];
+			u8 ssid_len;
+			u8 key[WLAN_KEY_LEN_WEP104];
+			u8 key_len, key_idx;
+			bool privacy;
+		} probe_auth;
+		struct {
+			struct cfg80211_bss *bss;
+			const u8 *supp_rates;
+			const u8 *ht_information_ie;
+			enum ieee80211_smps_mode smps;
+			int tries;
+			u16 capability;
+			u8 prev_bssid[ETH_ALEN];
+			u8 ssid[IEEE80211_MAX_SSID_LEN];
+			u8 ssid_len;
+			u8 supp_rates_len;
+			bool wmm_used, use_11n, uapsd_used;
+		} assoc;
+		struct {
+			u32 duration;
+		} remain;
+		struct {
+			struct sk_buff *frame;
+			u32 wait;
+			bool status;
+		} offchan_tx;
+	};
+
+	int ie_len;	/* presumably the byte length of ie[] -- confirm */
+	/* must be last: zero-length trailing array */
+	u8 ie[0];
+};
+
+/* flags used in struct ieee80211_if_managed.flags */
+enum ieee80211_sta_flags {
+	IEEE80211_STA_BEACON_POLL	= BIT(0),
+	IEEE80211_STA_CONNECTION_POLL	= BIT(1),
+	IEEE80211_STA_CONTROL_PORT	= BIT(2),	/* BIT(3) is not assigned */
+	IEEE80211_STA_DISABLE_11N	= BIT(4),
+	IEEE80211_STA_CSA_RECEIVED	= BIT(5),
+	IEEE80211_STA_MFP_ENABLED	= BIT(6),
+	IEEE80211_STA_UAPSD_ENABLED	= BIT(7),
+	IEEE80211_STA_NULLFUNC_ACKED	= BIT(8),
+	IEEE80211_STA_RESET_SIGNAL_AVE	= BIT(9),
+};
+
+struct ieee80211_if_managed {
+	struct timer_list timer;
+	struct timer_list conn_mon_timer;
+	struct timer_list bcn_mon_timer;
+	struct timer_list chswitch_timer;
+	struct work_struct monitor_work;
+	struct work_struct chswitch_work;
+	struct work_struct beacon_connection_loss_work;
+
+	unsigned long beacon_timeout;
+	unsigned long probe_timeout;
+	int probe_send_count;
+	bool nullfunc_failed;
+
+	struct mutex mtx;
+	struct cfg80211_bss *associated;
+
+	u8 bssid[ETH_ALEN];
+
+	u16 aid;
+
+	unsigned long timers_running; /* used for quiesce/restart */
+	bool powersave; /* powersave requested for this iface */
+	bool broken_ap; /* AP is broken -- turn off powersave */
+	enum ieee80211_smps_mode req_smps, /* requested smps mode */
+				 ap_smps, /* smps mode AP thinks we're in */
+				 driver_smps_mode; /* smps mode request */
+
+	struct work_struct request_smps_work;
+
+	unsigned int flags;
+
+	bool beacon_crc_valid;
+	u32 beacon_crc;
+
+	enum {
+		IEEE80211_MFP_DISABLED,
+		IEEE80211_MFP_OPTIONAL,
+		IEEE80211_MFP_REQUIRED
+	} mfp; /* management frame protection */
+
+	int wmm_last_param_set;
+
+	u8 use_4addr;
+
+	/* Signal strength from the last Beacon frame in the current BSS. */
+	int last_beacon_signal;
+
+	/*
+	 * Weighted average of the signal strength from Beacon frames in the
+	 * current BSS. This is in units of 1/16 of the signal unit to maintain
+	 * accuracy and to speed up calculations, i.e., the value need to be
+	 * divided by 16 to get the actual value.
+	 */
+	int ave_beacon_signal;
+
+	/*
+	 * Number of Beacon frames used in ave_beacon_signal. This can be used
+	 * to avoid generating less reliable cqm events that would be based
+	 * only on couple of received frames.
+	 */
+	unsigned int count_beacon_signal;
+
+	/*
+	 * Last Beacon frame signal strength average (ave_beacon_signal / 16)
+	 * that triggered a cqm event. 0 indicates that no event has been
+	 * generated for the current association.
+	 */
+	int last_cqm_event_signal;
+};
+
+struct ieee80211_if_ibss {
+	struct timer_list timer;	/* callback: ieee80211_ibss_timer() */
+
+	struct mutex mtx;	/* taken by IBSS rx/work/join/leave paths */
+
+	unsigned long last_scan_completed;	/* jiffies at scan completion */
+
+	u32 basic_rates;	/* copied from the join parameters */
+
+	bool timer_running;	/* timer stopped for quiesce; re-add on restart */
+
+	bool fixed_bssid;	/* join parameters supplied a BSSID */
+	bool fixed_channel;	/* join parameters had channel_fixed set */
+	bool privacy;
+
+	u8 bssid[ETH_ALEN];	/* zeroed by ieee80211_ibss_leave() */
+	u8 ssid[IEEE80211_MAX_SSID_LEN];
+	u8 ssid_len, ie_len;	/* ssid_len == 0: rx/work paths do nothing */
+	u8 *ie;	/* kmemdup() copy of the join IEs; freed on leave */
+	struct ieee80211_channel *channel;
+
+	unsigned long ibss_join_req;	/* jiffies at ieee80211_ibss_join() */
+	/* probe response/beacon for IBSS */
+	struct sk_buff __rcu *presp;
+	struct sk_buff *skb;	/* allocated by ieee80211_ibss_join() */
+
+	enum {
+		IEEE80211_IBSS_MLME_SEARCH,	/* work: ieee80211_sta_find_ibss() */
+		IEEE80211_IBSS_MLME_JOINED,	/* work: ieee80211_sta_merge_ibss() */
+	} state;
+};
+
+struct ieee80211_if_mesh {
+	struct timer_list housekeeping_timer;
+	struct timer_list mesh_path_timer;
+	struct timer_list mesh_path_root_timer;
+
+	unsigned long timers_running;
+
+	unsigned long wrkq_flags;
+
+	u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
+	size_t mesh_id_len;
+	/* Active Path Selection Protocol Identifier */
+	u8 mesh_pp_id;
+	/* Active Path Selection Metric Identifier */
+	u8 mesh_pm_id;
+	/* Congestion Control Mode Identifier */
+	u8 mesh_cc_id;
+	/* Synchronization Protocol Identifier */
+	u8 mesh_sp_id;
+	/* Authentication Protocol Identifier */
+	u8 mesh_auth_id;
+	/* Local mesh Sequence Number */
+	u32 sn;
+	/* Last used PREQ ID */
+	u32 preq_id;
+	atomic_t mpaths;
+	/* Timestamp of last SN update */
+	unsigned long last_sn_update;
+	/* Timestamp of last SN sent */
+	unsigned long last_preq;
+	struct mesh_rmc *rmc;
+	spinlock_t mesh_preq_queue_lock;
+	struct mesh_preq_queue preq_queue;
+	int preq_queue_len;
+	struct mesh_stats mshstats;
+	struct mesh_config mshcfg;
+	u32 mesh_seqnum;
+	bool accepting_plinks;
+	const u8 *ie;
+	u8 ie_len;
+	enum {
+		IEEE80211_MESH_SEC_NONE = 0x0,
+		IEEE80211_MESH_SEC_AUTHED = 0x1,
+		IEEE80211_MESH_SEC_SECURED = 0x2,
+	} security;
+};
+
+#ifdef CONFIG_MAC80211_MESH
+#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name)	\
+	do { (msh)->mshstats.name++; } while (0)
+#else
+#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name) \
+	do { } while (0)
+#endif
+
+/**
+ * enum ieee80211_sub_if_data_flags - virtual interface flags
+ *
+ * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets
+ * @IEEE80211_SDATA_PROMISC: interface is promisc
+ * @IEEE80211_SDATA_OPERATING_GMODE: operating in G-only mode
+ * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between
+ *	associated stations and deliver multicast frames both
+ *	back to wireless media and to the local net stack.
+ */
+enum ieee80211_sub_if_data_flags {
+	IEEE80211_SDATA_ALLMULTI		= BIT(0),
+	IEEE80211_SDATA_PROMISC			= BIT(1),
+	IEEE80211_SDATA_OPERATING_GMODE		= BIT(2),
+	IEEE80211_SDATA_DONT_BRIDGE_PACKETS	= BIT(3),
+};
+
+/**
+ * enum ieee80211_sdata_state_bits - virtual interface state bits
+ * @SDATA_STATE_RUNNING: virtual interface is up & running; this
+ *	mirrors netif_running() but is separate for interface type
+ *	change handling while the interface is up
+ * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel
+ *	mode, so queues are stopped
+ */
+enum ieee80211_sdata_state_bits {
+	SDATA_STATE_RUNNING,
+	SDATA_STATE_OFFCHANNEL,
+};
+
+struct ieee80211_sub_if_data {
+	struct list_head list;	/* entry in local->interfaces */
+
+	struct wireless_dev wdev;
+
+	/* keys */
+	struct list_head key_list;
+
+	struct net_device *dev;
+	struct ieee80211_local *local;
+
+	unsigned int flags;	/* presumably IEEE80211_SDATA_* -- confirm */
+
+	unsigned long state;	/* presumably SDATA_STATE_* bits -- confirm */
+
+	int drop_unencrypted;
+
+	char name[IFNAMSIZ];
+
+	/*
+	 * keep track of whether the HT opmode (stored in
+	 * vif.bss_info.ht_operation_mode) is valid.
+	 */
+	bool ht_opmode_valid;
+
+	/* to detect idle changes */
+	bool old_idle;
+
+	/* Fragment table for host-based reassembly */
+	struct ieee80211_fragment_entry	fragments[IEEE80211_FRAGMENT_MAX];
+	unsigned int fragment_next;
+
+	struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+	struct ieee80211_key __rcu *default_unicast_key;
+	struct ieee80211_key __rcu *default_multicast_key;
+	struct ieee80211_key __rcu *default_mgmt_key;
+
+	u16 sequence_number;
+	__be16 control_port_protocol;
+	bool control_port_no_encrypt;
+
+	struct work_struct work;	/* queued via ieee80211_queue_work() */
+	struct sk_buff_head skb_queue;	/* purged by ieee80211_ibss_leave() */
+
+	bool arp_filter_state;
+
+	/*
+	 * AP this belongs to: self in AP mode and
+	 * corresponding AP in VLAN mode, NULL for
+	 * all others (might be needed later in IBSS)
+	 */
+	struct ieee80211_if_ap *bss;
+
+	/* bitmap of allowed (non-MCS) rate indexes for rate control */
+	u32 rc_rateidx_mask[IEEE80211_NUM_BANDS];
+
+	union {	/* presumably selected by vif.type -- confirm */
+		struct ieee80211_if_ap ap;
+		struct ieee80211_if_wds wds;
+		struct ieee80211_if_vlan vlan;
+		struct ieee80211_if_managed mgd;
+		struct ieee80211_if_ibss ibss;
+		struct ieee80211_if_mesh mesh;
+		u32 mntr_flags;
+	} u;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct {
+		struct dentry *dir;
+		struct dentry *subdir_stations;
+		struct dentry *default_unicast_key;
+		struct dentry *default_multicast_key;
+		struct dentry *default_mgmt_key;
+	} debugfs;
+#endif
+	/* must be last, dynamically sized area in this! */
+	struct ieee80211_vif vif;
+};
+
+static inline
+struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
+{
+	return container_of(p, struct ieee80211_sub_if_data, vif);
+}
+
+enum sdata_queue_type {
+	IEEE80211_SDATA_QUEUE_TYPE_FRAME	= 0,
+	IEEE80211_SDATA_QUEUE_AGG_START		= 1,
+	IEEE80211_SDATA_QUEUE_AGG_STOP		= 2,
+};
+
+enum {
+	IEEE80211_RX_MSG	= 1,
+	IEEE80211_TX_STATUS_MSG	= 2,
+};
+
+enum queue_stop_reason {
+	IEEE80211_QUEUE_STOP_REASON_DRIVER,
+	IEEE80211_QUEUE_STOP_REASON_PS,
+	IEEE80211_QUEUE_STOP_REASON_CSA,
+	IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
+	IEEE80211_QUEUE_STOP_REASON_SUSPEND,
+	IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
+};
+
+#ifdef CONFIG_MAC80211_LEDS
+struct tpt_led_trigger {
+	struct led_trigger trig;
+	char name[32];
+	const struct ieee80211_tpt_blink *blink_table;
+	unsigned int blink_table_len;
+	struct timer_list timer;
+	unsigned long prev_traffic;
+	unsigned long tx_bytes, rx_bytes;
+	unsigned int active, want;
+	bool running;
+};
+#endif
+
+/**
+ * mac80211 scan flags - currently active scan mode
+ *
+ * @SCAN_SW_SCANNING: We're currently in the process of scanning but may as
+ *	well be on the operating channel
+ * @SCAN_HW_SCANNING: The hardware is scanning for us, we have no way to
+ *	determine if we are on the operating channel or not
+ * @SCAN_COMPLETED: Set for our scan work function when the driver reported
+ *	that the scan completed.
+ * @SCAN_ABORTED: Set for our scan work function when the driver reported
+ *	a scan complete for an aborted scan.
+ */
+enum {
+	SCAN_SW_SCANNING,
+	SCAN_HW_SCANNING,
+	SCAN_COMPLETED,
+	SCAN_ABORTED,
+};
+
+/**
+ * enum mac80211_scan_state - scan state machine states
+ *
+ * @SCAN_DECISION: Main entry point to the scan state machine, this state
+ *	determines if we should keep on scanning or switch back to the
+ *	operating channel
+ * @SCAN_SET_CHANNEL: Set the next channel to be scanned
+ * @SCAN_SEND_PROBE: Send probe requests and wait for probe responses
+ * @SCAN_LEAVE_OPER_CHANNEL: Leave the operating channel, notify the AP
+ *	about us leaving the channel and stop all associated STA interfaces
+ * @SCAN_ENTER_OPER_CHANNEL: Enter the operating channel again, notify the
+ *	AP about us being back and restart all associated STA interfaces
+ */
+enum mac80211_scan_state {
+	SCAN_DECISION,
+	SCAN_SET_CHANNEL,
+	SCAN_SEND_PROBE,
+	SCAN_LEAVE_OPER_CHANNEL,
+	SCAN_ENTER_OPER_CHANNEL,
+};
+
+struct ieee80211_local {
+	/* embed the driver visible part.
+	 * don't cast (use the static inlines below), but we keep
+	 * it first anyway so they become a no-op */
+	struct ieee80211_hw hw;
+
+	const struct ieee80211_ops *ops;
+
+	/*
+	 * work stuff, potentially off-channel (in the future)
+	 */
+	struct list_head work_list;
+	struct timer_list work_timer;
+	struct work_struct work_work;
+	struct sk_buff_head work_skb_queue;
+
+	/*
+	 * private workqueue to mac80211. mac80211 makes this accessible
+	 * via ieee80211_queue_work()
+	 */
+	struct workqueue_struct *workqueue;
+
+	unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES];
+	/* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */
+	spinlock_t queue_stop_reason_lock;
+
+	int open_count;
+	int monitors, cooked_mntrs;
+	/* number of interfaces with corresponding FIF_ flags */
+	int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
+	    fif_probe_req;
+	int probe_req_reg;
+	unsigned int filter_flags; /* FIF_* */
+
+	bool wiphy_ciphers_allocated;
+
+	/* protects the aggregated multicast list and filter calls */
+	spinlock_t filter_lock;
+
+	/* used for uploading changed mc list */
+	struct work_struct reconfig_filter;
+
+	/* used to reconfigure hardware SM PS */
+	struct work_struct recalc_smps;
+
+	/* aggregated multicast list */
+	struct netdev_hw_addr_list mc_list;
+
+	bool tim_in_locked_section; /* see ieee80211_beacon_get() */
+
+	/*
+	 * suspended is true if we finished all the suspend _and_ we have
+	 * not yet come up from resume. This is to be used by mac80211
+	 * to ensure driver sanity during suspend and mac80211's own
+	 * sanity. It can eventually be used for WoW as well.
+	 */
+	bool suspended;
+
+	/*
+	 * Resuming is true while suspended, but when we're reprogramming the
+	 * hardware -- at that time it's allowed to use ieee80211_queue_work()
+	 * again even though some other parts of the stack are still suspended
+	 * and we still drop received frames to avoid waking the stack.
+	 */
+	bool resuming;
+
+	/*
+	 * quiescing is true during the suspend process _only_ to
+	 * ease timer cancelling etc.
+	 */
+	bool quiescing;
+
+	/* device is started */
+	bool started;
+
+	/* wowlan is enabled -- don't reconfig on resume */
+	bool wowlan;
+
+	int tx_headroom; /* required headroom for hardware/radiotap */
+
+	/* Tasklet and skb queue to process calls from IRQ mode. All frames
+	 * added to skb_queue will be processed, but frames in
+	 * skb_queue_unreliable may be dropped if the total length of these
+	 * queues increases over the limit. */
+#define IEEE80211_IRQSAFE_QUEUE_LIMIT 128
+	struct tasklet_struct tasklet;
+	struct sk_buff_head skb_queue;
+	struct sk_buff_head skb_queue_unreliable;
+
+	/*
+	 * Internal FIFO queue which is shared between multiple rx path
+	 * stages. Its main task is to provide a serialization mechanism,
+	 * so all rx handlers can enjoy having exclusive access to their
+	 * private data structures.
+	 */
+	struct sk_buff_head rx_skb_queue;
+	bool running_rx_handler;	/* protected by rx_skb_queue.lock */
+
+	/* Station data */
+	/*
+	 * The mutex only protects the list and counter,
+	 * reads are done in RCU.
+	 * Additionally, the lock protects the hash table,
+	 * the pending list and each BSS's TIM bitmap.
+	 */
+	struct mutex sta_mtx;
+	spinlock_t sta_lock;
+	unsigned long num_sta;
+	struct list_head sta_list, sta_pending_list;
+	struct sta_info __rcu *sta_hash[STA_HASH_SIZE];
+	struct timer_list sta_cleanup;
+	struct work_struct sta_finish_work;
+	int sta_generation;
+
+	struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
+	struct tasklet_struct tx_pending_tasklet;
+
+	atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
+
+	/* number of interfaces with corresponding IFF_ flags */
+	atomic_t iff_allmultis, iff_promiscs;
+
+	struct rate_control_ref *rate_ctrl;
+
+	struct crypto_cipher *wep_tx_tfm;
+	struct crypto_cipher *wep_rx_tfm;
+	u32 wep_iv;
+
+	/* see iface.c */
+	struct list_head interfaces;
+	struct mutex iflist_mtx;
+
+	/*
+	 * Key mutex, protects sdata's key_list and sta_info's
+	 * key pointers (write access, they're RCU.)
+	 */
+	struct mutex key_mtx;
+
+	/* mutex for scan and work locking */
+	struct mutex mtx;
+
+	/* Scanning and BSS list */
+	unsigned long scanning;
+	struct cfg80211_ssid scan_ssid;
+	struct cfg80211_scan_request *int_scan_req;
+	struct cfg80211_scan_request *scan_req, *hw_scan_req;
+	struct ieee80211_channel *scan_channel;
+	enum ieee80211_band hw_scan_band;
+	int scan_channel_idx;
+	int scan_ies_len;
+
+	bool sched_scanning;
+	struct ieee80211_sched_scan_ies sched_scan_ies;
+	struct work_struct sched_scan_stopped_work;
+
+	unsigned long leave_oper_channel_time;
+	enum mac80211_scan_state next_scan_state;
+	struct delayed_work scan_work;
+	struct ieee80211_sub_if_data *scan_sdata;
+	enum nl80211_channel_type _oper_channel_type;
+	struct ieee80211_channel *oper_channel, *csa_channel;
+
+	/* Temporary remain-on-channel for off-channel operations */
+	struct ieee80211_channel *tmp_channel;
+	enum nl80211_channel_type tmp_channel_type;
+
+	/* SNMP counters */
+	/* dot11CountersTable */
+	u32 dot11TransmittedFragmentCount;
+	u32 dot11MulticastTransmittedFrameCount;
+	u32 dot11FailedCount;
+	u32 dot11RetryCount;
+	u32 dot11MultipleRetryCount;
+	u32 dot11FrameDuplicateCount;
+	u32 dot11ReceivedFragmentCount;
+	u32 dot11MulticastReceivedFrameCount;
+	u32 dot11TransmittedFrameCount;
+
+#ifdef CONFIG_MAC80211_LEDS
+	int tx_led_counter, rx_led_counter;
+	struct led_trigger *tx_led, *rx_led, *assoc_led, *radio_led;
+	struct tpt_led_trigger *tpt_led_trigger;
+	char tx_led_name[32], rx_led_name[32],
+	     assoc_led_name[32], radio_led_name[32];
+#endif
+
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
+	/* TX/RX handler statistics */
+	unsigned int tx_handlers_drop;
+	unsigned int tx_handlers_queued;
+	unsigned int tx_handlers_drop_unencrypted;
+	unsigned int tx_handlers_drop_fragment;
+	unsigned int tx_handlers_drop_wep;
+	unsigned int tx_handlers_drop_not_assoc;
+	unsigned int tx_handlers_drop_unauth_port;
+	unsigned int rx_handlers_drop;
+	unsigned int rx_handlers_queued;
+	unsigned int rx_handlers_drop_nullfunc;
+	unsigned int rx_handlers_drop_defrag;
+	unsigned int rx_handlers_drop_short;
+	unsigned int rx_handlers_drop_passive_scan;
+	unsigned int tx_expand_skb_head;
+	unsigned int tx_expand_skb_head_cloned;
+	unsigned int rx_expand_skb_head;
+	unsigned int rx_expand_skb_head2;
+	unsigned int rx_handlers_fragments;
+	unsigned int tx_status_drop;
+#define I802_DEBUG_INC(c) (c)++
+#else /* CONFIG_MAC80211_DEBUG_COUNTERS */
+#define I802_DEBUG_INC(c) do { } while (0)
+#endif /* CONFIG_MAC80211_DEBUG_COUNTERS */
+
+
+	int total_ps_buffered; /* total number of all buffered unicast and
+				* multicast packets for power saving stations
+				*/
+	int wifi_wme_noack_test;
+	unsigned int wmm_acm; /* bit field of ACM bits (BIT(802.1D tag)) */
+
+	/*
+	 * Bitmask of enabled u-apsd queues,
+	 * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association
+	 * to take effect.
+	 */
+	unsigned int uapsd_queues;
+
+	/*
+	 * Maximum number of buffered frames AP can deliver during a
+	 * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar.
+	 * Needs a new association to take effect.
+	 */
+	unsigned int uapsd_max_sp_len;
+
+	bool pspolling;
+	bool offchannel_ps_enabled;
+	/*
+	 * PS can only be enabled when we have exactly one managed
+	 * interface (and monitors) in PS, this then points there.
+	 */
+	struct ieee80211_sub_if_data *ps_sdata;
+	struct work_struct dynamic_ps_enable_work;
+	struct work_struct dynamic_ps_disable_work;
+	struct timer_list dynamic_ps_timer;
+	struct notifier_block network_latency_notifier;
+	struct notifier_block ifa_notifier;
+
+	/*
+	 * The dynamic ps timeout configured from user space via WEXT -
+	 * this will override whatever chosen by mac80211 internally.
+	 */
+	int dynamic_ps_forced_timeout;
+	int dynamic_ps_user_timeout;
+	bool disable_dynamic_ps;
+
+	int user_power_level; /* in dBm */
+	int power_constr_level; /* in dBm */
+
+	enum ieee80211_smps_mode smps_mode;
+
+	struct work_struct restart_work;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct local_debugfsdentries {
+		struct dentry *rcdir;
+		struct dentry *keys;
+	} debugfs;
+#endif
+
+	struct ieee80211_channel *hw_roc_channel;
+	struct net_device *hw_roc_dev;
+	struct sk_buff *hw_roc_skb, *hw_roc_skb_for_status;
+	struct work_struct hw_roc_start, hw_roc_done;
+	enum nl80211_channel_type hw_roc_channel_type;
+	unsigned int hw_roc_duration;
+	u32 hw_roc_cookie;
+	bool hw_roc_for_tx;
+	unsigned long hw_offchan_tx_cookie;
+
+	/* dummy netdev for use w/ NAPI */
+	struct net_device napi_dev;
+
+	struct napi_struct napi;
+};
+
+static inline struct ieee80211_sub_if_data *
+IEEE80211_DEV_TO_SUB_IF(struct net_device *dev)
+{
+	return netdev_priv(dev); /* @dev's private data, typed as sub_if_data */
+}
+
+/* An 802.11n RA/TID combination: a receiver address plus a traffic ID */
+struct ieee80211_ra_tid {
+	u8 ra[ETH_ALEN];	/* receiver address (RA) */
+	u16 tid;		/* traffic identifier (TID) */
+};
+
+/* Parsed Information Elements (cf. the ieee802_11_parse_elems*() prototypes) */
+struct ieee802_11_elems {
+	u8 *ie_start;
+	size_t total_len;
+
+	/* pointers to the individual IEs */
+	u8 *ssid;
+	u8 *supp_rates;
+	u8 *fh_params;
+	u8 *ds_params;
+	u8 *cf_params;
+	struct ieee80211_tim_ie *tim;
+	u8 *ibss_params;
+	u8 *challenge;
+	u8 *wpa;
+	u8 *rsn;
+	u8 *erp_info;
+	u8 *ext_supp_rates;
+	u8 *wmm_info;
+	u8 *wmm_param;
+	struct ieee80211_ht_cap *ht_cap_elem;
+	struct ieee80211_ht_info *ht_info_elem;
+	struct ieee80211_meshconf_ie *mesh_config;
+	u8 *mesh_id;
+	u8 *peer_link;
+	u8 *preq;
+	u8 *prep;
+	u8 *perr;
+	struct ieee80211_rann_ie *rann;
+	u8 *ch_switch_elem;
+	u8 *country_elem;
+	u8 *pwr_constr_elem;
+	u8 *quiet_elem; 	/* first quiet element */
+	u8 *timeout_int;
+
+	/* lengths of the IEs above, respectively */
+	u8 ssid_len;
+	u8 supp_rates_len;
+	u8 fh_params_len;
+	u8 ds_params_len;
+	u8 cf_params_len;
+	u8 tim_len;
+	u8 ibss_params_len;
+	u8 challenge_len;
+	u8 wpa_len;
+	u8 rsn_len;
+	u8 erp_info_len;
+	u8 ext_supp_rates_len;
+	u8 wmm_info_len;
+	u8 wmm_param_len;
+	u8 mesh_id_len;
+	u8 peer_link_len;
+	u8 preq_len;
+	u8 prep_len;
+	u8 perr_len;
+	u8 ch_switch_elem_len;
+	u8 country_elem_len;
+	u8 pwr_constr_elem_len;
+	u8 quiet_elem_len;
+	u8 num_of_quiet_elem;	/* can be more than one */
+	u8 timeout_int_len;
+};
+
+static inline struct ieee80211_local *hw_to_local(
+	struct ieee80211_hw *hw)	/* must be the 'hw' member of a local */
+{
+	return container_of(hw, struct ieee80211_local, hw);
+}
+
+static inline struct ieee80211_hw *local_to_hw(
+	struct ieee80211_local *local)
+{
+	return &local->hw;	/* the hw structure embedded in @local */
+}
+
+/* Nonzero if @raddr equals @addr or if @raddr is the broadcast address. */
+static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
+{
+	return compare_ether_addr(raddr, addr) == 0 ||
+	       is_broadcast_ether_addr(raddr);
+}
+
+
+int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
+void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
+void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+				      u32 changed);
+void ieee80211_configure_filter(struct ieee80211_local *local);
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
+
+/* STA code */
+void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
+int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
+		       struct cfg80211_auth_request *req);
+int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
+			struct cfg80211_assoc_request *req);
+int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
+			 struct cfg80211_deauth_request *req,
+			 void *cookie);
+int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
+			   struct cfg80211_disassoc_request *req,
+			   void *cookie);
+void ieee80211_send_pspoll(struct ieee80211_local *local,
+			   struct ieee80211_sub_if_data *sdata);
+void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency);
+int ieee80211_max_network_latency(struct notifier_block *nb,
+				  unsigned long data, void *dummy);
+int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
+				      struct ieee80211_channel_sw_ie *sw_elem,
+				      struct ieee80211_bss *bss,
+				      u64 timestamp);
+void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				  struct sk_buff *skb);
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
+
+/* IBSS code */
+void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
+void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
+struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
+					u8 *bssid, u8 *addr, u32 supp_rates,
+					gfp_t gfp);
+int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
+			struct cfg80211_ibss_params *params);
+int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb);
+
+/* mesh code */
+void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb);
+
+/* scan/BSS handling */
+void ieee80211_scan_work(struct work_struct *work);
+int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
+				    const u8 *ssid, u8 ssid_len,
+				    struct ieee80211_channel *chan);
+int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
+			   struct cfg80211_scan_request *req);
+void ieee80211_scan_cancel(struct ieee80211_local *local);
+ieee80211_rx_result
+ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
+
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
+struct ieee80211_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+			  struct ieee80211_rx_status *rx_status,
+			  struct ieee80211_mgmt *mgmt,
+			  size_t len,
+			  struct ieee802_11_elems *elems,
+			  struct ieee80211_channel *channel,
+			  bool beacon);
+struct ieee80211_bss *
+ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len);
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+			  struct ieee80211_bss *bss);
+
+/* scheduled scan handling */
+int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
+				       struct cfg80211_sched_scan_request *req);
+int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sched_scan_stopped_work(struct work_struct *work);
+
+/* off-channel helpers */
+bool ieee80211_cfg_on_oper_channel(struct ieee80211_local *local);
+void ieee80211_offchannel_enable_all_ps(struct ieee80211_local *local,
+					bool tell_ap);
+void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local,
+				    bool offchannel_ps_enable);
+void ieee80211_offchannel_return(struct ieee80211_local *local,
+				 bool enable_beaconing,
+				 bool offchannel_ps_disable);
+void ieee80211_hw_roc_setup(struct ieee80211_local *local);
+
+/* interface handling */
+int ieee80211_iface_init(void);
+void ieee80211_iface_exit(void);
+int ieee80211_if_add(struct ieee80211_local *local, const char *name,
+		     struct net_device **new_dev, enum nl80211_iftype type,
+		     struct vif_params *params);
+int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
+			     enum nl80211_iftype type);
+void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
+void ieee80211_remove_interfaces(struct ieee80211_local *local);
+u32 __ieee80211_recalc_idle(struct ieee80211_local *local);
+void ieee80211_recalc_idle(struct ieee80211_local *local);
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+				    const int offset);
+
+static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
+{
+	return test_bit(SDATA_STATE_RUNNING, &sdata->state); /* RUNNING set? */
+}
+
+/* tx handling */
+void ieee80211_clear_tx_pending(struct ieee80211_local *local);
+void ieee80211_tx_pending(unsigned long data);
+netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
+					 struct net_device *dev);
+netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
+				       struct net_device *dev);
+
+/*
+ * radiotap header for status frames (packed layout, tx_flags little-endian)
+ */
+struct ieee80211_tx_status_rtap_hdr {
+	struct ieee80211_radiotap_header hdr;
+	u8 rate;
+	u8 padding_for_rate;	/* presumably aligns tx_flags - TODO confirm */
+	__le16 tx_flags;
+	u8 data_retries;
+} __packed;
+
+
+/* HT */
+void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
+				       struct ieee80211_ht_cap *ht_cap_ie,
+				       struct ieee80211_sta_ht_cap *ht_cap);
+void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn);
+void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
+			  const u8 *da, u16 tid,
+			  u16 initiator, u16 reason_code);
+int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
+			       enum ieee80211_smps_mode smps, const u8 *da,
+			       const u8 *bssid);
+void ieee80211_request_smps_work(struct work_struct *work);
+
+void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+				     u16 initiator, u16 reason, bool stop);
+void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+				    u16 initiator, u16 reason, bool stop);
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx);
+void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
+			     struct sta_info *sta,
+			     struct ieee80211_mgmt *mgmt, size_t len);
+void ieee80211_process_addba_resp(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len);
+void ieee80211_process_addba_request(struct ieee80211_local *local,
+				     struct sta_info *sta,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len);
+
+int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+				   enum ieee80211_back_parties initiator,
+				   bool tx);
+int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+				    enum ieee80211_back_parties initiator,
+				    bool tx);
+void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid);
+void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid);
+void ieee80211_ba_session_work(struct work_struct *work);
+void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
+
+/* Spectrum management */
+void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_mgmt *mgmt,
+				       size_t len);
+
+/* Suspend/resume and hw reconfiguration */
+int ieee80211_reconfig(struct ieee80211_local *local);
+void ieee80211_stop_device(struct ieee80211_local *local);
+
+#ifdef CONFIG_PM
+int __ieee80211_suspend(struct ieee80211_hw *hw,
+			struct cfg80211_wowlan *wowlan);
+
+static inline int __ieee80211_resume(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	/* warn if a hardware scan is still flagged as running at resume */
+	WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
+	     "%s: resume with hardware scan still in progress\n",
+	     wiphy_name(hw->wiphy));
+	return ieee80211_reconfig(local);
+}
+#else /* !CONFIG_PM */
+static inline int __ieee80211_suspend(struct ieee80211_hw *hw,
+				      struct cfg80211_wowlan *wowlan)
+{
+	return 0; /* stub: reports success without doing anything */
+}
+
+static inline int __ieee80211_resume(struct ieee80211_hw *hw)
+{
+	return 0; /* stub: reports success without doing anything */
+}
+#endif /* CONFIG_PM */
+
+/* utility functions/constants */
+extern void *mac80211_wiphy_privid; /* for wiphy privid */
+u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+			enum nl80211_iftype type);
+int ieee80211_frame_duration(struct ieee80211_local *local, size_t len,
+			     int rate, int erp, int short_preamble);
+void mac80211_ev_michael_mic_failure(struct ieee80211_sub_if_data *sdata, int keyidx,
+				     struct ieee80211_hdr *hdr, const u8 *tsc,
+				     gfp_t gfp);
+void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata);
+void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
+void ieee802_11_parse_elems(u8 *start, size_t len,
+			    struct ieee802_11_elems *elems);
+u32 ieee802_11_parse_elems_crc(u8 *start, size_t len,
+			       struct ieee802_11_elems *elems,
+			       u64 filter, u32 crc);
+u32 ieee80211_mandatory_rates(struct ieee80211_local *local,
+			      enum ieee80211_band band);
+
+void ieee80211_dynamic_ps_enable_work(struct work_struct *work);
+void ieee80211_dynamic_ps_disable_work(struct work_struct *work);
+void ieee80211_dynamic_ps_timer(unsigned long data);
+void ieee80211_send_nullfunc(struct ieee80211_local *local,
+			     struct ieee80211_sub_if_data *sdata,
+			     int powersave);
+void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
+			     struct ieee80211_hdr *hdr);
+void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
+			     struct ieee80211_hdr *hdr, bool ack);
+void ieee80211_beacon_connection_loss_work(struct work_struct *work);
+
+void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
+				     enum queue_stop_reason reason);
+void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
+				     enum queue_stop_reason reason);
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+				    enum queue_stop_reason reason);
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+				    enum queue_stop_reason reason);
+void ieee80211_add_pending_skb(struct ieee80211_local *local,
+			       struct sk_buff *skb);
+int ieee80211_add_pending_skbs(struct ieee80211_local *local,
+			       struct sk_buff_head *skbs);
+int ieee80211_add_pending_skbs_fn(struct ieee80211_local *local,
+				  struct sk_buff_head *skbs,
+				  void (*fn)(void *data), void *data);
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+			 u16 transaction, u16 auth_alg,
+			 u8 *extra, size_t extra_len, const u8 *bssid,
+			 const u8 *key, u8 key_len, u8 key_idx);
+int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
+			     const u8 *ie, size_t ie_len,
+			     enum ieee80211_band band, u32 rate_mask,
+			     u8 channel);
+struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
+					  u8 *dst,
+					  const u8 *ssid, size_t ssid_len,
+					  const u8 *ie, size_t ie_len);
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+			      const u8 *ssid, size_t ssid_len,
+			      const u8 *ie, size_t ie_len);
+
+void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
+				  const size_t supp_rates_len,
+				  const u8 *supp_rates);
+u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+			    struct ieee802_11_elems *elems,
+			    enum ieee80211_band band);
+int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
+			     enum ieee80211_smps_mode smps_mode);
+void ieee80211_recalc_smps(struct ieee80211_local *local);
+
+size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
+			  const u8 *ids, int n_ids, size_t offset);
+size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset);
+
+/* internal work items */
+void ieee80211_work_init(struct ieee80211_local *local);
+void ieee80211_add_work(struct ieee80211_work *wk);
+void free_work(struct ieee80211_work *wk);
+void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata);
+ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata,
+					   struct sk_buff *skb);
+int ieee80211_wk_remain_on_channel(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_channel *chan,
+				   enum nl80211_channel_type channel_type,
+				   unsigned int duration, u64 *cookie);
+int ieee80211_wk_cancel_remain_on_channel(
+	struct ieee80211_sub_if_data *sdata, u64 cookie);
+
+/* channel management */
+enum ieee80211_chan_mode {	/* result of ieee80211_get_channel_mode() */
+	CHAN_MODE_UNDEFINED,
+	CHAN_MODE_HOPPING,
+	CHAN_MODE_FIXED,
+};
+
+enum ieee80211_chan_mode
+ieee80211_get_channel_mode(struct ieee80211_local *local,
+			   struct ieee80211_sub_if_data *ignore);
+bool ieee80211_set_channel_type(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				enum nl80211_channel_type chantype);
+
+#ifdef CONFIG_MAC80211_NOINLINE
+#define debug_noinline noinline	/* expands to noinline */
+#else /* !CONFIG_MAC80211_NOINLINE */
+#define debug_noinline		/* expands to nothing */
+#endif /* CONFIG_MAC80211_NOINLINE */
+
+#endif /* IEEE80211_I_H */
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
new file mode 100644
index 00000000..65f3764c
--- /dev/null
+++ b/net/mac80211/iface.c
@@ -0,0 +1,1417 @@
+/*
+ * Interface handling (except master interface)
+ *
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/mac80211.h>
+#include <net/ieee80211_radiotap.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "debugfs_netdev.h"
+#include "mesh.h"
+#include "led.h"
+#include "driver-ops.h"
+#include "wme.h"
+#include "rate.h"
+
+/**
+ * DOC: Interface list locking
+ *
+ * The interface list in each struct ieee80211_local is protected
+ * three-fold:
+ *
+ * (1) modifications may only be done under the RTNL
+ * (2) modifications and readers are protected against each other by
+ *     the iflist_mtx.
+ * (3) modifications are done in an RCU manner so atomic readers
+ *     can traverse the list in RCU-safe blocks.
+ *
+ * As a consequence, reads (traversals) of the list can be protected
+ * by either the RTNL, the iflist_mtx or RCU.
+ */
+
+
+static int ieee80211_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	int max_mtu = IEEE80211_MAX_DATA_LEN - 24 - 6;
+
+	/* mesh point interfaces lose another 5 bytes (mesh header length) */
+	if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+		max_mtu -= 5;
+
+	/* FIX: what would be proper limits for MTU?
+	 * This interface uses 802.3 frames. */
+	if (new_mtu < 256 || new_mtu > max_mtu)
+		return -EINVAL;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	printk(KERN_DEBUG "%s: setting MTU %d\n", dev->name, new_mtu);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+static int ieee80211_change_mac(struct net_device *dev, void *addr)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct sockaddr *new_addr = addr;
+	int err;
+
+	/* refused while running; on success the vif gets the address too */
+	if (ieee80211_sdata_running(sdata))
+		return -EBUSY;
+
+	err = eth_mac_addr(dev, new_addr);
+	if (err)
+		return err;
+	memcpy(sdata->vif.addr, new_addr->sa_data, ETH_ALEN);
+	return 0;
+}
+
+static inline int identical_mac_addr_allowed(int type1, int type2)
+{
+	if (type1 == NL80211_IFTYPE_MONITOR || type2 == NL80211_IFTYPE_MONITOR)
+		return 1;
+	if (type1 == NL80211_IFTYPE_AP)		/* AP + WDS, AP + AP_VLAN */
+		return type2 == NL80211_IFTYPE_WDS ||
+		       type2 == NL80211_IFTYPE_AP_VLAN;
+	if (type1 == NL80211_IFTYPE_WDS)	/* WDS + WDS, WDS + AP */
+		return type2 == NL80211_IFTYPE_WDS || type2 == NL80211_IFTYPE_AP;
+	/* what remains: AP_VLAN + AP, AP_VLAN + AP_VLAN */
+	return type1 == NL80211_IFTYPE_AP_VLAN &&
+	       (type2 == NL80211_IFTYPE_AP || type2 == NL80211_IFTYPE_AP_VLAN);
+}
+
+static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
+					    enum nl80211_iftype iftype)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct net_device *dev = sdata->dev;
+	struct ieee80211_sub_if_data *other;
+
+	ASSERT_RTNL();
+
+	/* the RTNL is held (asserted above), so the list can be walked */
+	list_for_each_entry(other, &local->interfaces, list) {
+		enum nl80211_iftype other_type = other->vif.type;
+
+		/* only other interfaces that are running can conflict */
+		if (other->dev == dev || !ieee80211_sdata_running(other))
+			continue;
+
+		/*
+		 * Only one IBSS interface may be up at any time, since
+		 * beacon distribution cannot work properly if two of them
+		 * are in the same IBSS.
+		 *
+		 * Lifting this would mean forbidding the same SSID on
+		 * different IBSS interfaces of the same hardware, and then
+		 * still having to adopt two different TSF timers...
+		 */
+		if (iftype == NL80211_IFTYPE_ADHOC &&
+		    other_type == NL80211_IFTYPE_ADHOC)
+			return -EBUSY;
+
+		/*
+		 * Everything below only concerns interfaces that share
+		 * our MAC address.
+		 */
+		if (compare_ether_addr(dev->dev_addr, other->dev->dev_addr))
+			continue;
+
+		/*
+		 * check whether the two types may use the same address
+		 */
+		if (!identical_mac_addr_allowed(iftype, other_type))
+			return -ENOTUNIQ;
+
+		/*
+		 * can only add VLANs to enabled APs
+		 */
+		if (iftype == NL80211_IFTYPE_AP_VLAN &&
+		    other_type == NL80211_IFTYPE_AP)
+			sdata->bss = &other->u.ap;
+	}
+
+	return 0;
+}
+
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+				    const int offset)
+{
+	struct ieee80211_local *local = sdata->local;
+	u32 mntr = sdata->u.mntr_flags;
+
+	/* Add @offset to each local->fif_* counter whose monitor flag is
+	 * set on @sdata; the CONTROL flag feeds two counters. */
+	if (mntr & MONITOR_FLAG_FCSFAIL)
+		local->fif_fcsfail += offset;
+	if (mntr & MONITOR_FLAG_PLCPFAIL)
+		local->fif_plcpfail += offset;
+	if (mntr & MONITOR_FLAG_CONTROL) {
+		local->fif_control += offset;
+		local->fif_pspoll += offset;
+	}
+	if (mntr & MONITOR_FLAG_OTHER_BSS)
+		local->fif_other_bss += offset;
+}
+
+/*
+ * Open @dev (@coming_up gates drv_add_interface() and open_count++). NOTE:
+ * this must NOT return an error on interface type changes that have been
+ * pre-checked, so most checks should be in ieee80211_check_concurrent_iface.
+ */
+static int ieee80211_do_open(struct net_device *dev, bool coming_up)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+	u32 changed = 0;
+	int res;
+	u32 hw_reconf_flags = 0;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_WDS:
+		if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
+			return -ENOLINK;
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+		if (!sdata->bss)
+			return -ENOLINK;
+		list_add(&sdata->u.vlan.list, &sdata->bss->vlans);
+		break;
+	case NL80211_IFTYPE_AP:
+		sdata->bss = &sdata->u.ap;
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_MONITOR:
+	case NL80211_IFTYPE_ADHOC:
+		/* no special treatment */
+		break;
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case NUM_NL80211_IFTYPES:
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_P2P_GO:
+		/* cannot happen */
+		WARN_ON(1);
+		break;
+	}
+
+	if (local->open_count == 0) {
+		res = drv_start(local);
+		if (res)
+			goto err_del_bss;
+		if (local->ops->napi_poll)
+			napi_enable(&local->napi);
+		/* we're brought up, everything changes */
+		hw_reconf_flags = ~0;
+		ieee80211_led_radio(local, true);
+		ieee80211_mod_tpt_led_trig(local,
+					   IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
+	}
+
+	/*
+	 * Copy the hopefully now-present MAC address to
+	 * this interface, if it has the special null one.
+	 */
+	if (is_zero_ether_addr(dev->dev_addr)) {
+		memcpy(dev->dev_addr,
+		       local->hw.wiphy->perm_addr,
+		       ETH_ALEN);
+		memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
+
+		if (!is_valid_ether_addr(dev->dev_addr)) {
+			/* unwind the vlan list entry and bss pointer too */
+			res = -EADDRNOTAVAIL;
+			goto err_stop;
+		}
+	}
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		/* no need to tell driver */
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+			local->cooked_mntrs++;
+			break;
+		}
+
+		/* must be before the call to ieee80211_configure_filter */
+		local->monitors++;
+		if (local->monitors == 1) {
+			local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
+			hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
+		}
+
+		ieee80211_adjust_monitor_flags(sdata, 1);
+		ieee80211_configure_filter(local);
+
+		netif_carrier_on(dev);
+		break;
+	default:
+		if (coming_up) {
+			res = drv_add_interface(local, &sdata->vif);
+			if (res)
+				goto err_stop;
+		}
+
+		if (sdata->vif.type == NL80211_IFTYPE_AP) {
+			local->fif_pspoll++;
+			local->fif_probe_req++;
+
+			ieee80211_configure_filter(local);
+		} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+			local->fif_probe_req++;
+		}
+
+		changed |= ieee80211_reset_erp_info(sdata);
+		ieee80211_bss_info_change_notify(sdata, changed);
+
+		if (sdata->vif.type == NL80211_IFTYPE_STATION)
+			netif_carrier_off(dev);
+		else
+			netif_carrier_on(dev);
+	}
+
+	set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+	if (sdata->vif.type == NL80211_IFTYPE_WDS) {
+		/* Create STA entry for the WDS peer */
+		sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
+				     GFP_KERNEL);
+		if (!sta) {
+			res = -ENOMEM;
+			goto err_del_interface;
+		}
+
+		/* no locking required since STA is not live yet */
+		sta->flags |= WLAN_STA_AUTHORIZED;
+
+		res = sta_info_insert(sta);
+		if (res) {
+			/* STA has been freed */
+			goto err_del_interface;
+		}
+
+		rate_control_rate_init(sta);
+	}
+
+	/*
+	 * set_multicast_list will be invoked by the networking core
+	 * which will check whether any increments here were done in
+	 * error and sync them down to the hardware as filter flags.
+	 */
+	if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+		atomic_inc(&local->iff_allmultis);
+
+	if (sdata->flags & IEEE80211_SDATA_PROMISC)
+		atomic_inc(&local->iff_promiscs);
+
+	mutex_lock(&local->mtx);
+	hw_reconf_flags |= __ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+
+	if (coming_up)
+		local->open_count++;
+
+	if (hw_reconf_flags) {
+		ieee80211_hw_config(local, hw_reconf_flags);
+		/*
+		 * set default queue parameters so drivers don't
+		 * need to initialise the hardware if the hardware
+		 * doesn't start up with sane defaults
+		 */
+		ieee80211_set_wmm_default(sdata);
+	}
+
+	ieee80211_recalc_ps(local, -1);
+
+	netif_tx_start_all_queues(dev);
+
+	return 0;
+ err_del_interface:
+	drv_remove_interface(local, &sdata->vif);
+ err_stop:
+	if (!local->open_count)
+		drv_stop(local);
+ err_del_bss:
+	sdata->bss = NULL;
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		list_del(&sdata->u.vlan.list);
+	clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+	return res;
+}
+
+static int ieee80211_open(struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	const u8 *mac = dev->dev_addr;
+	int ret;
+
+	/* an all-zero address is let through; any other one must be valid */
+	if (!(is_zero_ether_addr(mac) || is_valid_ether_addr(mac)))
+		return -EADDRNOTAVAIL;
+
+	/* refuse to come up if that conflicts with a running interface */
+	ret = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
+	if (ret)
+		return ret;
+	return ieee80211_do_open(dev, true);
+}
+
+static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
+			      bool going_down)
+{
+	struct ieee80211_local *local = sdata->local;
+	unsigned long flags;
+	struct sk_buff *skb, *tmp;
+	u32 hw_reconf_flags = 0;
+	int i;
+	enum nl80211_channel_type orig_ct;
+
+	clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+
+	if (local->scan_sdata == sdata)
+		ieee80211_scan_cancel(local);
+
+	/*
+	 * Stop TX on this interface first.
+	 */
+	netif_tx_stop_all_queues(sdata->dev);
+
+	/*
+	 * Purge work for this interface.
+	 */
+	ieee80211_work_purge(sdata);
+
+	/*
+	 * Remove all stations associated with this interface.
+	 *
+	 * This must be done before calling ops->remove_interface()
+	 * because otherwise we can later invoke ops->sta_notify()
+	 * whenever the STAs are removed, and that invalidates driver
+	 * assumptions about always getting a vif pointer that is valid
+	 * (because if we remove a STA after ops->remove_interface()
+	 * the driver will have removed the vif info already!)
+	 *
+	 * This is relevant only in AP, WDS and mesh modes, since in
+	 * all other modes we've already removed all stations when
+	 * disconnecting etc.
+	 */
+	sta_info_flush(local, sdata);
+
+	/*
+	 * Don't count this interface for promisc/allmulti while it
+	 * is down. dev_mc_unsync() will invoke set_multicast_list
+	 * on the master interface which will sync these down to the
+	 * hardware as filter flags.
+	 */
+	if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
+		atomic_dec(&local->iff_allmultis);
+
+	if (sdata->flags & IEEE80211_SDATA_PROMISC)
+		atomic_dec(&local->iff_promiscs);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
+		local->fif_pspoll--;
+		local->fif_probe_req--;
+	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+		local->fif_probe_req--;
+	}
+
+	netif_addr_lock_bh(sdata->dev);
+	spin_lock_bh(&local->filter_lock);
+	__hw_addr_unsync(&local->mc_list, &sdata->dev->mc,
+			 sdata->dev->addr_len);
+	spin_unlock_bh(&local->filter_lock);
+	netif_addr_unlock_bh(sdata->dev);
+
+	ieee80211_configure_filter(local);
+
+	del_timer_sync(&local->dynamic_ps_timer);
+	cancel_work_sync(&local->dynamic_ps_enable_work);
+
+	/* APs need special treatment */
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
+		struct ieee80211_sub_if_data *vlan, *tmpsdata;
+		struct beacon_data *old_beacon =
+			rtnl_dereference(sdata->u.ap.beacon);
+
+		/* sdata_running will return false, so this will disable */
+		ieee80211_bss_info_change_notify(sdata,
+						 BSS_CHANGED_BEACON_ENABLED);
+
+		/* remove beacon */
+		rcu_assign_pointer(sdata->u.ap.beacon, NULL);
+		synchronize_rcu();
+		kfree(old_beacon);
+
+		/* free all potentially still buffered bcast frames */
+		while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
+			local->total_ps_buffered--;
+			dev_kfree_skb(skb);
+		}
+
+		/* down all dependent devices, that is VLANs */
+		list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
+					 u.vlan.list)
+			dev_close(vlan->dev);
+		WARN_ON(!list_empty(&sdata->u.ap.vlans));
+	}
+
+	if (going_down)
+		local->open_count--;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		list_del(&sdata->u.vlan.list);
+		/* no need to tell driver */
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) {
+			local->cooked_mntrs--;
+			break;
+		}
+
+		local->monitors--;
+		if (local->monitors == 0) {
+			local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR;
+			hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
+		}
+
+		ieee80211_adjust_monitor_flags(sdata, -1);
+		ieee80211_configure_filter(local);
+		break;
+	default:
+		mutex_lock(&local->mtx);
+		if (local->hw_roc_dev == sdata->dev &&
+		    local->hw_roc_channel) {
+			/* ignore return value since this is racy */
+			drv_cancel_remain_on_channel(local);
+			ieee80211_queue_work(&local->hw, &local->hw_roc_done);
+		}
+		mutex_unlock(&local->mtx);
+
+		flush_work(&local->hw_roc_start);
+		flush_work(&local->hw_roc_done);
+
+		flush_work(&sdata->work);
+		/*
+		 * When we get here, the interface is marked down.
+		 * Call synchronize_rcu() to wait for the RX path
+		 * should it be using the interface and enqueuing
+		 * frames at this very time on another CPU.
+		 */
+		synchronize_rcu();
+		skb_queue_purge(&sdata->skb_queue);
+
+		/*
+		 * Disable beaconing here for mesh only, AP and IBSS
+		 * are already taken care of.
+		 */
+		if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+			ieee80211_bss_info_change_notify(sdata,
+				BSS_CHANGED_BEACON_ENABLED);
+
+		/*
+		 * Free all remaining keys, there shouldn't be any,
+		 * except maybe group keys in AP more or WDS?
+		 */
+		ieee80211_free_keys(sdata);
+
+		if (going_down)
+			drv_remove_interface(local, &sdata->vif);
+	}
+
+	sdata->bss = NULL;
+
+	mutex_lock(&local->mtx);
+	hw_reconf_flags |= __ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+
+	ieee80211_recalc_ps(local, -1);
+
+	if (local->open_count == 0) {
+		if (local->ops->napi_poll)
+			napi_disable(&local->napi);
+		ieee80211_clear_tx_pending(local);
+		ieee80211_stop_device(local);
+
+		/* no reconfiguring after stop! */
+		hw_reconf_flags = 0;
+	}
+
+	/* Re-calculate channel-type, in case there are multiple vifs
+	 * on different channel types.
+	 */
+	orig_ct = local->_oper_channel_type;
+	ieee80211_set_channel_type(local, NULL, NL80211_CHAN_NO_HT);
+
+	/* do after stop to avoid reconfiguring when we stop anyway */
+	if (hw_reconf_flags || (orig_ct != local->_oper_channel_type))
+		ieee80211_hw_config(local, hw_reconf_flags);
+
+	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+		skb_queue_walk_safe(&local->pending[i], skb, tmp) {
+			struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+			if (info->control.vif == &sdata->vif) {
+				__skb_unlink(skb, &local->pending[i]);
+				dev_kfree_skb_irq(skb);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+static int ieee80211_stop(struct net_device *dev)
+{
+	/* ndo_stop handler: stop the interface with going_down set. */
+	ieee80211_do_stop(IEEE80211_DEV_TO_SUB_IF(dev), true);
+
+	/* always reports success */
+	return 0;
+}
+
+static void ieee80211_set_multicast_list(struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	bool want_allmulti = dev->flags & IFF_ALLMULTI;
+	bool want_promisc = dev->flags & IFF_PROMISC;
+	bool have_allmulti = sdata->flags & IEEE80211_SDATA_ALLMULTI;
+	bool have_promisc = sdata->flags & IEEE80211_SDATA_PROMISC;
+
+	/* on a change, adjust the per-hw counter and flip the sdata flag */
+	if (want_allmulti != have_allmulti) {
+		if (want_allmulti)
+			atomic_inc(&local->iff_allmultis);
+		else
+			atomic_dec(&local->iff_allmultis);
+		sdata->flags ^= IEEE80211_SDATA_ALLMULTI;
+	}
+
+	if (want_promisc != have_promisc) {
+		if (want_promisc)
+			atomic_inc(&local->iff_promiscs);
+		else
+			atomic_dec(&local->iff_promiscs);
+		sdata->flags ^= IEEE80211_SDATA_PROMISC;
+	}
+	/* sync dev->mc into local->mc_list, then queue the filter update */
+	spin_lock_bh(&local->filter_lock);
+	__hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len);
+	spin_unlock_bh(&local->filter_lock);
+	ieee80211_queue_work(&local->hw, &local->reconfig_filter);
+}
+
+/*
+ * Release the type-dependent state of an interface. Used as ndo_uninit
+ * and, by the code below, before the interface type changes.
+ */
+static void ieee80211_teardown_sdata(struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	int flushed, slot;
+
+	/* keys and debugfs entries */
+	ieee80211_free_keys(sdata);
+	ieee80211_debugfs_remove_netdev(sdata);
+
+	/* empty every fragment queue and restart at slot 0 */
+	for (slot = 0; slot < IEEE80211_FRAGMENT_MAX; slot++)
+		__skb_queue_purge(&sdata->fragments[slot].skb_list);
+	sdata->fragment_next = 0;
+
+	if (ieee80211_vif_is_mesh(&sdata->vif))
+		mesh_rmc_free(sdata);
+
+	/* warn if the flush reports a non-zero result */
+	flushed = sta_info_flush(local, sdata);
+	WARN_ON(flushed);
+}
+
+static u16 ieee80211_netdev_select_queue(struct net_device *dev,
+					 struct sk_buff *skb)
+{	/* ndo_select_queue of ieee80211_dataif_ops: defer to the helper */
+	return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
+}
+
+static const struct net_device_ops ieee80211_dataif_ops = {	/* all but monitor */
+	.ndo_open		= ieee80211_open,
+	.ndo_stop		= ieee80211_stop,
+	.ndo_uninit		= ieee80211_teardown_sdata,
+	.ndo_start_xmit		= ieee80211_subif_start_xmit,
+	.ndo_set_multicast_list = ieee80211_set_multicast_list,
+	.ndo_change_mtu 	= ieee80211_change_mtu,
+	.ndo_set_mac_address 	= ieee80211_change_mac,
+	.ndo_select_queue	= ieee80211_netdev_select_queue,
+};
+
+static u16 ieee80211_monitor_select_queue(struct net_device *dev,
+					  struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_hdr *hdr;
+	struct ieee80211_radiotap_header *rtap = (void *)skb->data;
+	u8 *p;
+
+	/* ndo_select_queue for monitor vifs; queue 0 if hw has < 4 queues */
+	if (local->hw.queues < 4)
+		return 0;
+
+	if (skb->len < 4 ||
+	    skb->len < le16_to_cpu(rtap->it_len) + 2 /* frame control */)
+		return 0; /* doesn't matter, frame will be dropped */
+
+	hdr = (void *)((u8 *)skb->data + le16_to_cpu(rtap->it_len));
+	if (!ieee80211_is_data(hdr->frame_control)) {
+		skb->priority = 7;
+		return ieee802_1d_to_ac[skb->priority];
+	}
+	if (!ieee80211_is_data_qos(hdr->frame_control)) {
+		skb->priority = 0;
+		return ieee802_1d_to_ac[skb->priority];
+	}
+	p = ieee80211_get_qos_ctl(hdr);
+	if (p >= (u8 *)skb->data + skb->len)
+		return 0; /* QoS control lies beyond the frame: don't read it */
+	skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK;
+
+	return ieee80211_downgrade_queue(local, skb);
+}
+
+static const struct net_device_ops ieee80211_monitorif_ops = {	/* monitor only */
+	.ndo_open		= ieee80211_open,
+	.ndo_stop		= ieee80211_stop,
+	.ndo_uninit		= ieee80211_teardown_sdata,
+	.ndo_start_xmit		= ieee80211_monitor_start_xmit,
+	.ndo_set_multicast_list = ieee80211_set_multicast_list,
+	.ndo_change_mtu 	= ieee80211_change_mtu,
+	.ndo_set_mac_address 	= eth_mac_addr,
+	.ndo_select_queue	= ieee80211_monitor_select_queue,
+};
+
+static void ieee80211_if_setup(struct net_device *dev)
+{
+	ether_setup(dev);	/* generic setup first, then our overrides */
+	dev->destructor = free_netdev;
+	dev->netdev_ops = &ieee80211_dataif_ops;	/* non-monitor ops */
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;	/* clear TX skb sharing */
+}
+
+static void ieee80211_iface_work(struct work_struct *work)
+{	/* sdata->work handler: drain sdata->skb_queue, then per-type work */
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data, work);
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct sta_info *sta;
+	struct ieee80211_ra_tid *ra_tid;
+
+	if (!ieee80211_sdata_running(sdata))
+		return;
+
+	if (local->scanning)
+		return;	/* nothing is processed while local->scanning is set */
+
+	/*
+	 * ieee80211_queue_work() should have picked up most cases,
+	 * here we'll pick the rest.
+	 */
+	if (WARN(local->suspended,
+		 "interface work scheduled while going to suspend\n"))
+		return;
+
+	/* first process frames */
+	while ((skb = skb_dequeue(&sdata->skb_queue))) {
+		struct ieee80211_mgmt *mgmt = (void *)skb->data;
+
+		if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_START) {
+			ra_tid = (void *)&skb->cb;	/* RA/TID live in skb->cb */
+			ieee80211_start_tx_ba_cb(&sdata->vif, ra_tid->ra,
+						 ra_tid->tid);
+		} else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_STOP) {
+			ra_tid = (void *)&skb->cb;
+			ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra,
+						ra_tid->tid);
+		} else if (ieee80211_is_action(mgmt->frame_control) &&
+			   mgmt->u.action.category == WLAN_CATEGORY_BACK) {
+			int len = skb->len;
+
+			mutex_lock(&local->sta_mtx);
+			sta = sta_info_get_bss(sdata, mgmt->sa);
+			if (sta) {
+				switch (mgmt->u.action.u.addba_req.action_code) {
+				case WLAN_ACTION_ADDBA_REQ:
+					ieee80211_process_addba_request(
+							local, sta, mgmt, len);
+					break;
+				case WLAN_ACTION_ADDBA_RESP:
+					ieee80211_process_addba_resp(local, sta,
+								     mgmt, len);
+					break;
+				case WLAN_ACTION_DELBA:
+					ieee80211_process_delba(sdata, sta,
+								mgmt, len);
+					break;
+				default:
+					WARN_ON(1);
+					break;
+				}
+			}
+			mutex_unlock(&local->sta_mtx);
+		} else if (ieee80211_is_data_qos(mgmt->frame_control)) {
+			struct ieee80211_hdr *hdr = (void *)mgmt;
+			/*
+			 * So the frame isn't mgmt, but frame_control
+			 * is at the right place anyway, of course, so
+			 * the if statement is correct.
+			 *
+			 * Warn if we have other data frame types here,
+			 * they must not get here.
+			 */
+			WARN_ON(hdr->frame_control &
+					cpu_to_le16(IEEE80211_STYPE_NULLFUNC));
+			WARN_ON(!(hdr->seq_ctrl &
+					cpu_to_le16(IEEE80211_SCTL_FRAG)));
+			/*
+			 * This was a fragment of a frame, received while
+			 * a block-ack session was active. That cannot be
+			 * right, so terminate the session.
+			 */
+			mutex_lock(&local->sta_mtx);
+			sta = sta_info_get_bss(sdata, mgmt->sa);
+			if (sta) {
+				u16 tid = *ieee80211_get_qos_ctl(hdr) &
+						IEEE80211_QOS_CTL_TID_MASK;
+
+				__ieee80211_stop_rx_ba_session(
+					sta, tid, WLAN_BACK_RECIPIENT,
+					WLAN_REASON_QSTA_REQUIRE_SETUP,
+					true);
+			}
+			mutex_unlock(&local->sta_mtx);
+		} else switch (sdata->vif.type) {
+		case NL80211_IFTYPE_STATION:
+			ieee80211_sta_rx_queued_mgmt(sdata, skb);
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			ieee80211_ibss_rx_queued_mgmt(sdata, skb);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			if (!ieee80211_vif_is_mesh(&sdata->vif))
+				break;
+			ieee80211_mesh_rx_queued_mgmt(sdata, skb);
+			break;
+		default:
+			WARN(1, "frame for unexpected interface type");
+			break;
+		}
+
+		kfree_skb(skb);	/* every dequeued frame is freed here */
+	}
+
+	/* then other type-dependent work */
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		ieee80211_sta_work(sdata);
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		ieee80211_ibss_work(sdata);
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		if (!ieee80211_vif_is_mesh(&sdata->vif))
+			break;
+		ieee80211_mesh_work(sdata);
+		break;
+	default:
+		break;
+	}
+}
+
+
+/*
+ * (Re)initialise the type-dependent part of @sdata for interface @type.
+ */
+static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
+				  enum nl80211_iftype type)
+{
+	/* clear type-dependent union */
+	memset(&sdata->u, 0, sizeof(sdata->u));
+
+	/* and set some type-dependent values */
+	sdata->vif.type = type;
+	sdata->vif.p2p = false;
+	sdata->dev->netdev_ops = &ieee80211_dataif_ops;
+	sdata->wdev.iftype = type;	/* keeps the P2P type, unlike vif.type */
+
+	sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
+	sdata->control_port_no_encrypt = false;
+
+	/* only monitor differs */
+	sdata->dev->type = ARPHRD_ETHER;
+
+	skb_queue_head_init(&sdata->skb_queue);
+	INIT_WORK(&sdata->work, ieee80211_iface_work);
+
+	switch (type) {
+	case NL80211_IFTYPE_P2P_GO:
+		type = NL80211_IFTYPE_AP;	/* P2P GO is an AP with vif.p2p */
+		sdata->vif.type = type;
+		sdata->vif.p2p = true;
+		/* fall through */
+	case NL80211_IFTYPE_AP:
+		skb_queue_head_init(&sdata->u.ap.ps_bc_buf);
+		INIT_LIST_HEAD(&sdata->u.ap.vlans);
+		break;
+	case NL80211_IFTYPE_P2P_CLIENT:
+		type = NL80211_IFTYPE_STATION;	/* P2P client: station + p2p */
+		sdata->vif.type = type;
+		sdata->vif.p2p = true;
+		/* fall through */
+	case NL80211_IFTYPE_STATION:
+		ieee80211_sta_setup_sdata(sdata);
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		ieee80211_ibss_setup_sdata(sdata);
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		if (ieee80211_vif_is_mesh(&sdata->vif))
+			ieee80211_mesh_init_sdata(sdata);
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
+		sdata->dev->netdev_ops = &ieee80211_monitorif_ops;
+		sdata->u.mntr_flags = MONITOR_FLAG_CONTROL |
+				      MONITOR_FLAG_OTHER_BSS;
+		break;
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_AP_VLAN:
+		break;	/* no extra type-specific setup here */
+	case NL80211_IFTYPE_UNSPECIFIED:
+	case NUM_NL80211_IFTYPES:
+		BUG();	/* not valid interface types */
+		break;
+	}
+
+	ieee80211_debugfs_add_netdev(sdata);
+}
+
+static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
+					   enum nl80211_iftype type)
+{	/* change the type of a running interface via the driver */
+	struct ieee80211_local *local = sdata->local;
+	int ret, err;
+	enum nl80211_iftype internal_type = type;
+	bool p2p = false;
+
+	ASSERT_RTNL();
+
+	if (!local->ops->change_interface)
+		return -EBUSY;	/* driver has no change_interface op */
+
+	switch (sdata->vif.type) {	/* the type being left */
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+		/*
+		 * Could maybe also all others here?
+		 * Just not sure how that interacts
+		 * with the RX/config path e.g. for
+		 * mesh.
+		 */
+		break;
+	default:
+		return -EBUSY;
+	}
+
+	switch (type) {	/* the type being entered */
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+		/*
+		 * Could probably support everything
+		 * but WDS here (WDS do_open can fail
+		 * under memory pressure, which this
+		 * code isn't prepared to handle).
+		 */
+		break;
+	case NL80211_IFTYPE_P2P_CLIENT:
+		p2p = true;
+		internal_type = NL80211_IFTYPE_STATION;
+		break;
+	case NL80211_IFTYPE_P2P_GO:
+		p2p = true;
+		internal_type = NL80211_IFTYPE_AP;
+		break;
+	default:
+		return -EBUSY;
+	}
+
+	ret = ieee80211_check_concurrent_iface(sdata, internal_type);
+	if (ret)
+		return ret;
+
+	ieee80211_do_stop(sdata, false);	/* stop without going_down */
+
+	ieee80211_teardown_sdata(sdata->dev);
+
+	ret = drv_change_interface(local, sdata, internal_type, p2p);
+	if (ret)
+		type = sdata->vif.type;	/* driver refused: set up old type */
+
+	ieee80211_setup_sdata(sdata, type);
+
+	err = ieee80211_do_open(sdata->dev, false);
+	WARN(err, "type change: do_open returned %d", err);
+
+	return ret;	/* status of drv_change_interface(), not of do_open */
+}
+
+int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
+			     enum nl80211_iftype type)
+{	/* switch @sdata to @type, whether or not it is running */
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (type == ieee80211_vif_type_p2p(&sdata->vif))
+		return 0;	/* already of the requested type */
+
+	/* Setting ad-hoc mode on non-IBSS channel is not supported. */
+	if (sdata->local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS &&
+	    type == NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
+
+	if (ieee80211_sdata_running(sdata)) {
+		ret = ieee80211_runtime_change_iftype(sdata, type);
+		if (ret)
+			return ret;
+	} else {
+		/* Purge and reset type-dependent state. */
+		ieee80211_teardown_sdata(sdata->dev);
+		ieee80211_setup_sdata(sdata, type);
+	}
+
+	/* reset some values that shouldn't be kept across type changes */
+	sdata->vif.bss_conf.basic_rates =
+		ieee80211_mandatory_rates(sdata->local,
+			sdata->local->hw.conf.channel->band);
+	sdata->drop_unencrypted = 0;
+	if (type == NL80211_IFTYPE_STATION)
+		sdata->u.mgd.use_4addr = false;
+
+	return 0;
+}
+
+static void ieee80211_assign_perm_addr(struct ieee80211_local *local,
+				       struct net_device *dev,
+				       enum nl80211_iftype type)
+{	/* choose dev->perm_addr for a new interface of @type */
+	struct ieee80211_sub_if_data *sdata;
+	u64 mask, start, addr, val, inc;
+	u8 *m;
+	u8 tmp_addr[ETH_ALEN];
+	int i;
+
+	/* default ... something at least */
+	memcpy(dev->perm_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
+
+	if (is_zero_ether_addr(local->hw.wiphy->addr_mask) &&
+	    local->hw.wiphy->n_addresses <= 1)
+		return;	/* no mask and no address list: keep the default */
+
+
+	mutex_lock(&local->iflist_mtx);
+
+	switch (type) {
+	case NL80211_IFTYPE_MONITOR:
+		/* doesn't matter */
+		break;
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_AP_VLAN:
+		/* match up with an AP interface */
+		list_for_each_entry(sdata, &local->interfaces, list) {
+			if (sdata->vif.type != NL80211_IFTYPE_AP)
+				continue;
+			memcpy(dev->perm_addr, sdata->vif.addr, ETH_ALEN);
+			break;	/* the first AP on the list wins */
+		}
+		/* keep default if no AP interface present */
+		break;
+	default:
+		/* assign a new address if possible -- try n_addresses first */
+		for (i = 0; i < local->hw.wiphy->n_addresses; i++) {
+			bool used = false;
+
+			list_for_each_entry(sdata, &local->interfaces, list) {
+				if (memcmp(local->hw.wiphy->addresses[i].addr,
+					   sdata->vif.addr, ETH_ALEN) == 0) {
+					used = true;
+					break;
+				}
+			}
+
+			if (!used) {
+				memcpy(dev->perm_addr,
+				       local->hw.wiphy->addresses[i].addr,
+				       ETH_ALEN);
+				break;	/* leaves only the for loop */
+			}
+		}
+
+		/* then try the mask; runs even if an address was picked above */
+		if (is_zero_ether_addr(local->hw.wiphy->addr_mask))
+			break;
+
+		m = local->hw.wiphy->addr_mask;
+		mask =	((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+			((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+			((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+		if (__ffs64(mask) + hweight64(mask) != fls64(mask)) {
+			/* not a contiguous mask ... not handled now! */
+			printk(KERN_DEBUG "not contiguous\n");
+			break;
+		}
+
+		m = local->hw.wiphy->perm_addr;
+		start = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
+			((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
+			((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);
+
+		inc = 1ULL<<__ffs64(mask);	/* step: lowest set mask bit */
+		val = (start & mask);
+		addr = (start & ~mask) | (val & mask);
+		do {
+			bool used = false;
+
+			tmp_addr[5] = addr >> 0*8;
+			tmp_addr[4] = addr >> 1*8;
+			tmp_addr[3] = addr >> 2*8;
+			tmp_addr[2] = addr >> 3*8;
+			tmp_addr[1] = addr >> 4*8;
+			tmp_addr[0] = addr >> 5*8;
+
+			val += inc;	/* pre-compute the next candidate */
+
+			list_for_each_entry(sdata, &local->interfaces, list) {
+				if (memcmp(tmp_addr, sdata->vif.addr,
+							ETH_ALEN) == 0) {
+					used = true;
+					break;
+				}
+			}
+
+			if (!used) {
+				memcpy(dev->perm_addr, tmp_addr, ETH_ALEN);
+				break;
+			}
+			addr = (start & ~mask) | (val & mask);
+		} while (addr != start);	/* stop once back at the start */
+
+		break;
+	}
+
+	mutex_unlock(&local->iflist_mtx);
+}
+
+int ieee80211_if_add(struct ieee80211_local *local, const char *name,
+		     struct net_device **new_dev, enum nl80211_iftype type,
+		     struct vif_params *params)
+{	/* allocate, initialise and register a new interface netdev */
+	struct net_device *ndev;
+	struct ieee80211_sub_if_data *sdata = NULL;
+	int ret, i;
+
+	ASSERT_RTNL();
+
+	ndev = alloc_netdev_mq(sizeof(*sdata) + local->hw.vif_data_size,
+			       name, ieee80211_if_setup, local->hw.queues);
+	if (!ndev)
+		return -ENOMEM;
+	dev_net_set(ndev, wiphy_net(local->hw.wiphy));
+
+	ndev->needed_headroom = local->tx_headroom +
+				4*6 /* four MAC addresses */
+				+ 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */
+				+ 6 /* mesh */
+				+ 8 /* rfc1042/bridge tunnel */
+				- ETH_HLEN /* ethernet hard_header_len */
+				+ IEEE80211_ENCRYPT_HEADROOM;
+	ndev->needed_tailroom = IEEE80211_ENCRYPT_TAILROOM;
+
+	ret = dev_alloc_name(ndev, ndev->name);
+	if (ret < 0)
+		goto fail;
+
+	ieee80211_assign_perm_addr(local, ndev, type);
+	memcpy(ndev->dev_addr, ndev->perm_addr, ETH_ALEN);
+	SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy));
+
+	/* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */
+	sdata = netdev_priv(ndev);
+	ndev->ieee80211_ptr = &sdata->wdev;
+	memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN);
+	memcpy(sdata->name, ndev->name, IFNAMSIZ);
+
+	/* initialise type-independent data */
+	sdata->wdev.wiphy = local->hw.wiphy;
+	sdata->local = local;
+	sdata->dev = ndev;
+#ifdef CONFIG_INET
+	sdata->arp_filter_state = true;
+#endif
+
+	for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++)
+		skb_queue_head_init(&sdata->fragments[i].skb_list);
+
+	INIT_LIST_HEAD(&sdata->key_list);
+
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+		struct ieee80211_supported_band *sband;
+		sband = local->hw.wiphy->bands[i];
+		sdata->rc_rateidx_mask[i] =	/* one bit per bitrate, or 0 */
+			sband ? (1 << sband->n_bitrates) - 1 : 0;
+	}
+
+	/* setup type-dependent data */
+	ieee80211_setup_sdata(sdata, type);
+
+	if (params) {
+		ndev->ieee80211_ptr->use_4addr = params->use_4addr;
+		if (type == NL80211_IFTYPE_STATION)
+			sdata->u.mgd.use_4addr = params->use_4addr;
+	}
+
+	ret = register_netdevice(ndev);
+	if (ret)
+		goto fail;
+
+	mutex_lock(&local->iflist_mtx);
+	list_add_tail_rcu(&sdata->list, &local->interfaces);
+	mutex_unlock(&local->iflist_mtx);
+
+	if (new_dev)
+		*new_dev = ndev;	/* optional out-parameter */
+
+	return 0;
+
+ fail:
+	free_netdev(ndev);	/* reached only before a successful register */
+	return ret;
+}
+
+void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	ASSERT_RTNL();
+	mutex_lock(&local->iflist_mtx);
+	list_del_rcu(&sdata->list);	/* unlink from the interface list */
+	mutex_unlock(&local->iflist_mtx);
+	synchronize_rcu();	/* grace period before unregistering */
+	unregister_netdevice(sdata->dev);
+}
+
+/*
+ * Remove all interfaces. May only be called at hardware unregistration
+ * time because it uses plain list_del(), not RCU-safe list removal.
+ */
+void ieee80211_remove_interfaces(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *sdata, *tmp;
+	LIST_HEAD(unreg_list);	/* collects netdevs for one batched unregister */
+
+	ASSERT_RTNL();
+
+	mutex_lock(&local->iflist_mtx);
+	list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) {
+		list_del(&sdata->list);
+
+		unregister_netdevice_queue(sdata->dev, &unreg_list);
+	}
+	mutex_unlock(&local->iflist_mtx);
+	unregister_netdevice_many(&unreg_list);
+	list_del(&unreg_list);	/* unhook the on-stack list head */
+}
+
+static u32 ieee80211_idle_off(struct ieee80211_local *local,
+			      const char *reason)
+{
+	/* returns the changed-config bit, or 0 if the hw was not idle */
+	if ((local->hw.conf.flags & IEEE80211_CONF_IDLE) == 0)
+		return 0;
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason);
+#endif
+	local->hw.conf.flags &= ~IEEE80211_CONF_IDLE;
+
+	return IEEE80211_CONF_CHANGE_IDLE;
+}
+
+static u32 ieee80211_idle_on(struct ieee80211_local *local)
+{
+	/* returns the changed-config bit, or 0 if the hw was already idle */
+	if ((local->hw.conf.flags & IEEE80211_CONF_IDLE) != 0)
+		return 0;
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "device now idle\n");
+#endif
+	/* flush through the driver before the idle flag is set */
+	drv_flush(local, false);
+	local->hw.conf.flags |= IEEE80211_CONF_IDLE;
+
+	return IEEE80211_CONF_CHANGE_IDLE;
+}
+
+u32 __ieee80211_recalc_idle(struct ieee80211_local *local)
+{	/* recompute per-vif and hw idle state; returns config change bits */
+	struct ieee80211_sub_if_data *sdata;
+	int count = 0;	/* running interfaces that count as in use */
+	bool working = false, scanning = false, hw_roc = false;
+	struct ieee80211_work *wk;
+	unsigned int led_trig_start = 0, led_trig_stop = 0;
+
+#ifdef CONFIG_PROVE_LOCKING
+	WARN_ON(debug_locks && !lockdep_rtnl_is_held() &&
+		!lockdep_is_held(&local->iflist_mtx));
+#endif
+	lockdep_assert_held(&local->mtx);
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata)) {
+			sdata->vif.bss_conf.idle = true;
+			continue;
+		}
+
+		sdata->old_idle = sdata->vif.bss_conf.idle;
+
+		/* do not count disabled managed interfaces */
+		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+		    !sdata->u.mgd.associated) {
+			sdata->vif.bss_conf.idle = true;
+			continue;
+		}
+		/* do not count unused IBSS interfaces */
+		if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+		    !sdata->u.ibss.ssid_len) {
+			sdata->vif.bss_conf.idle = true;
+			continue;
+		}
+		/* count everything else */
+		count++;
+	}
+
+	list_for_each_entry(wk, &local->work_list, list) {
+		working = true;
+		wk->sdata->vif.bss_conf.idle = false;	/* work pending on vif */
+	}
+
+	if (local->scan_sdata) {
+		scanning = true;
+		local->scan_sdata->vif.bss_conf.idle = false;
+	}
+
+	if (local->hw_roc_channel)
+		hw_roc = true;
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (sdata->old_idle == sdata->vif.bss_conf.idle)
+			continue;	/* unchanged, nothing to report */
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE);
+	}
+
+	if (working || scanning || hw_roc)
+		led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_WORK;
+	else
+		led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_WORK;
+
+	if (count)
+		led_trig_start |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED;
+	else
+		led_trig_stop |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED;
+
+	ieee80211_mod_tpt_led_trig(local, led_trig_start, led_trig_stop);
+
+	if (hw_roc)
+		return ieee80211_idle_off(local, "hw remain-on-channel");
+	if (working)
+		return ieee80211_idle_off(local, "working");
+	if (scanning)
+		return ieee80211_idle_off(local, "scanning");
+	if (!count)
+		return ieee80211_idle_on(local);
+	else
+		return ieee80211_idle_off(local, "in use");
+
+	return 0;	/* not reached: both branches above return */
+}
+
+void ieee80211_recalc_idle(struct ieee80211_local *local)
+{
+	u32 changed;	/* config change bits reported by the helper */
+
+	mutex_lock(&local->iflist_mtx);
+	changed = __ieee80211_recalc_idle(local);	/* asserts local->mtx */
+	mutex_unlock(&local->iflist_mtx);
+	if (changed != 0)
+		ieee80211_hw_config(local, changed);
+}
+
+static int netdev_notify(struct notifier_block *nb,
+			 unsigned long state,
+			 void *ndev)
+{
+	struct net_device *dev = ndev;
+	struct ieee80211_sub_if_data *sdata;
+
+	/* only interface renames are of interest */
+	if (state != NETDEV_CHANGENAME)
+		return 0;
+
+	/* ... and only for netdevs whose wiphy carries mac80211's privid */
+	if (!dev->ieee80211_ptr || !dev->ieee80211_ptr->wiphy ||
+	    dev->ieee80211_ptr->wiphy->privid != mac80211_wiphy_privid)
+		return 0;
+
+	/* refresh the cached name and rename the debugfs entry */
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	memcpy(sdata->name, dev->name, IFNAMSIZ);
+	ieee80211_debugfs_rename_netdev(sdata);
+
+	return 0;
+}
+
+static struct notifier_block mac80211_netdev_notifier = {
+	.notifier_call = netdev_notify,	/* acts on NETDEV_CHANGENAME only */
+};
+
+int ieee80211_iface_init(void)
+{	/* register the netdev notifier; returns the registration result */
+	return register_netdevice_notifier(&mac80211_netdev_notifier);
+}
+
+void ieee80211_iface_exit(void)
+{	/* counterpart of ieee80211_iface_init() */
+	unregister_netdevice_notifier(&mac80211_netdev_notifier);
+}
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
new file mode 100644
index 00000000..f825e2f0
--- /dev/null
+++ b/net/mac80211/key.c
@@ -0,0 +1,535 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007-2008	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "debugfs_key.h"
+#include "aes_ccm.h"
+#include "aes_cmac.h"
+
+
+/**
+ * DOC: Key handling basics
+ *
+ * Key handling in mac80211 is done based on per-interface (sub_if_data)
+ * keys and per-station keys. Since each station belongs to an interface,
+ * each station key also belongs to that interface.
+ *
+ * Hardware acceleration is done on a best-effort basis for algorithms
+ * that are implemented in software: for each key the hardware is asked
+ * to enable that key for offloading, but if it cannot do that the key is
+ * simply kept for software encryption (unless it is for an algorithm
+ * that isn't implemented in software).
+ * There is currently no way of knowing whether a key is handled in SW
+ * or HW except by looking into debugfs.
+ *
+ * All key management is internally protected by a mutex. Within all
+ * other parts of mac80211, key references are, just as STA structure
+ * references, protected by RCU. Note, however, that some things are
+ * unprotected, namely the key->sta dereferences within the hardware
+ * acceleration functions. This means that sta_info_destroy() must
+ * remove the key which waits for an RCU grace period.
+ */
+
+static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };	/* logged for keys without a STA */
+
+static void assert_key_lock(struct ieee80211_local *local)
+{	/* lockdep check that local->key_mtx is held */
+	lockdep_assert_held(&local->key_mtx);
+}
+
+static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
+{
+	/* a key without a station has no ieee80211_sta to hand out */
+	if (!key->sta)
+		return NULL;
+	return &key->sta->sta;
+}
+
+static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
+{	/* try to upload @key to the hardware; 0 also if kept in software */
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sta *sta;
+	int ret;
+
+	might_sleep();
+
+	if (!key->local->ops->set_key)
+		goto out_unsupported;	/* driver has no set_key op */
+
+	assert_key_lock(key->local);
+
+	sta = get_sta_for_key(key);
+
+	/*
+	 * If this is a per-STA GTK, check if it
+	 * is supported; if not, return.
+	 */
+	if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) &&
+	    !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK))
+		goto out_unsupported;
+
+	sdata = key->sdata;
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+		/*
+		 * The driver doesn't know anything about VLAN interfaces.
+		 * Hence, don't send GTKs for VLAN interfaces to the driver.
+		 */
+		if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE))
+			goto out_unsupported;
+		sdata = container_of(sdata->bss,	/* the owning AP's sdata */
+				     struct ieee80211_sub_if_data,
+				     u.ap);
+	}
+
+	ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf);
+
+	if (!ret) {
+		key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE;
+		return 0;
+	}
+
+	if (ret != -ENOSPC && ret != -EOPNOTSUPP)	/* those two stay silent */
+		wiphy_err(key->local->hw.wiphy,
+			  "failed to set key (%d, %pM) to hardware (%d)\n",
+			  key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+
+ out_unsupported:
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+	case WLAN_CIPHER_SUITE_TKIP:
+	case WLAN_CIPHER_SUITE_CCMP:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		/* all of these we can do in software */
+		return 0;
+	default:
+		return -EINVAL;	/* no software fallback for this cipher */
+	}
+}
+
+static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
+{	/* remove @key from the hardware if it had been uploaded */
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sta *sta;
+	int ret;
+
+	might_sleep();
+
+	if (!key || !key->local->ops->set_key)
+		return;	/* NULL key or no set_key op: nothing to do */
+
+	assert_key_lock(key->local);
+
+	if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+		return;
+
+	sta = get_sta_for_key(key);
+	sdata = key->sdata;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,	/* the owning AP's sdata */
+				     struct ieee80211_sub_if_data,
+				     u.ap);
+
+	ret = drv_set_key(key->local, DISABLE_KEY, sdata,
+			  sta, &key->conf);
+
+	if (ret)
+		wiphy_err(key->local->hw.wiphy,
+			  "failed to remove key (%d, %pM) from hardware (%d)\n",
+			  key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+
+	key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;	/* even on failure */
+}
+
+void ieee80211_key_removed(struct ieee80211_key_conf *key_conf)
+{
+	struct ieee80211_key *key = container_of(key_conf,
+						 struct ieee80211_key, conf);
+
+	/* may block in synchronize_rcu(); the key mutex must be held */
+	might_sleep();
+	assert_key_lock(key->local);
+
+	key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+
+	/*
+	 * Flush the TX path so that nothing tries to use this key
+	 * once we have returned. Until then, drivers must still be
+	 * prepared to handle the key.
+	 */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ieee80211_key_removed);
+
+static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
+					int idx, bool uni, bool multi)
+{
+	struct ieee80211_key *selected = NULL;
+
+	assert_key_lock(sdata->local);
+
+	/* any idx outside the default-key range (e.g. -1) selects "none" */
+	if (0 <= idx && idx < NUM_DEFAULT_KEYS)
+		selected = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+	if (uni)
+		rcu_assign_pointer(sdata->default_unicast_key, selected);
+	if (multi)
+		rcu_assign_pointer(sdata->default_multicast_key, selected);
+
+	ieee80211_debugfs_key_update_default(sdata);
+}
+
+void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx,
+			       bool uni, bool multi)
+{	/* key_mtx-taking wrapper around __ieee80211_set_default_key() */
+	mutex_lock(&sdata->local->key_mtx);
+	__ieee80211_set_default_key(sdata, idx, uni, multi);
+	mutex_unlock(&sdata->local->key_mtx);
+}
+
+static void
+__ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx)
+{
+	struct ieee80211_key *sel = NULL;
+
+	assert_key_lock(sdata->local);
+
+	/* management key slots follow the default keys; others select "none" */
+	if (NUM_DEFAULT_KEYS <= idx &&
+	    idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+		sel = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+	rcu_assign_pointer(sdata->default_mgmt_key, sel);
+
+	ieee80211_debugfs_key_update_default(sdata);
+}
+
+void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
+				    int idx)
+{	/* key_mtx-taking wrapper around __ieee80211_set_default_mgmt_key() */
+	mutex_lock(&sdata->local->key_mtx);
+	__ieee80211_set_default_mgmt_key(sdata, idx);
+	mutex_unlock(&sdata->local->key_mtx);
+}
+
+
+static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
+				    struct sta_info *sta,
+				    bool pairwise,
+				    struct ieee80211_key *old,
+				    struct ieee80211_key *new)
+{	/* swap @old for @new in the key pointers and sdata->key_list */
+	int idx;
+	bool defunikey, defmultikey, defmgmtkey;
+
+	if (new)
+		list_add(&new->list, &sdata->key_list);
+
+	if (sta && pairwise) {
+		rcu_assign_pointer(sta->ptk, new);
+	} else if (sta) {
+		if (old)
+			idx = old->conf.keyidx;
+		else
+			idx = new->conf.keyidx;	/* assumes old or new is set — TODO confirm */
+		rcu_assign_pointer(sta->gtk[idx], new);
+	} else {
+		WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
+
+		if (old)
+			idx = old->conf.keyidx;
+		else
+			idx = new->conf.keyidx;	/* assumes old or new is set — TODO confirm */
+
+		defunikey = old &&	/* was @old one of the current defaults? */
+			old == key_mtx_dereference(sdata->local,
+						sdata->default_unicast_key);
+		defmultikey = old &&
+			old == key_mtx_dereference(sdata->local,
+						sdata->default_multicast_key);
+		defmgmtkey = old &&
+			old == key_mtx_dereference(sdata->local,
+						sdata->default_mgmt_key);
+
+		if (defunikey && !new)	/* no replacement: clear the default */
+			__ieee80211_set_default_key(sdata, -1, true, false);
+		if (defmultikey && !new)
+			__ieee80211_set_default_key(sdata, -1, false, true);
+		if (defmgmtkey && !new)
+			__ieee80211_set_default_mgmt_key(sdata, -1);
+
+		rcu_assign_pointer(sdata->keys[idx], new);
+		if (defunikey && new)	/* replacement inherits the default role */
+			__ieee80211_set_default_key(sdata, new->conf.keyidx,
+						    true, false);
+		if (defmultikey && new)
+			__ieee80211_set_default_key(sdata, new->conf.keyidx,
+						    false, true);
+		if (defmgmtkey && new)
+			__ieee80211_set_default_mgmt_key(sdata,
+							 new->conf.keyidx);
+	}
+
+	if (old)
+		list_del(&old->list);
+}
+/* Allocate a key, set per-cipher lengths/counters; ERR_PTR() on failure. */
+struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
+					  const u8 *key_data,
+					  size_t seq_len, const u8 *seq)
+{
+	struct ieee80211_key *key;
+	int i, j, err;
+
+	BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS);
+	/* NOTE(review): seq_len is never read; seq[0..5] is used if seq != NULL. */
+	key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL);
+	if (!key)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Default to software encryption; we'll later upload the
+	 * key to the hardware if possible.
+	 */
+	key->conf.flags = 0;
+	key->flags = 0;
+
+	key->conf.cipher = cipher;
+	key->conf.keyidx = idx;
+	key->conf.keylen = key_len;
+	switch (cipher) {	/* other ciphers keep the zeroed defaults */
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		key->conf.iv_len = WEP_IV_LEN;
+		key->conf.icv_len = WEP_ICV_LEN;
+		break;
+	case WLAN_CIPHER_SUITE_TKIP:
+		key->conf.iv_len = TKIP_IV_LEN;
+		key->conf.icv_len = TKIP_ICV_LEN;
+		if (seq) {	/* seed every RX queue with the same counter */
+			for (i = 0; i < NUM_RX_DATA_QUEUES; i++) {
+				key->u.tkip.rx[i].iv32 =
+					get_unaligned_le32(&seq[2]);
+				key->u.tkip.rx[i].iv16 =
+					get_unaligned_le16(seq);
+			}
+		}
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		key->conf.iv_len = CCMP_HDR_LEN;
+		key->conf.icv_len = CCMP_MIC_LEN;
+		if (seq) {	/* rx_pn stores seq in reversed byte order */
+			for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++)
+				for (j = 0; j < CCMP_PN_LEN; j++)
+					key->u.ccmp.rx_pn[i][j] =
+						seq[CCMP_PN_LEN - j - 1];
+		}
+		/*
+		 * Initialize AES key state here as an optimization so that
+		 * it does not need to be initialized for every packet.
+		 */
+		key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data);
+		if (IS_ERR(key->u.ccmp.tfm)) {
+			err = PTR_ERR(key->u.ccmp.tfm);
+			kfree(key);
+			return ERR_PTR(err);
+		}
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		key->conf.iv_len = 0;
+		key->conf.icv_len = sizeof(struct ieee80211_mmie);
+		if (seq)	/* rx_pn stores seq in reversed byte order */
+			for (j = 0; j < 6; j++)
+				key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
+		/*
+		 * Initialize AES key state here as an optimization so that
+		 * it does not need to be initialized for every packet.
+		 */
+		key->u.aes_cmac.tfm =
+			ieee80211_aes_cmac_key_setup(key_data);
+		if (IS_ERR(key->u.aes_cmac.tfm)) {
+			err = PTR_ERR(key->u.aes_cmac.tfm);
+			kfree(key);
+			return ERR_PTR(err);
+		}
+		break;
+	}
+	memcpy(key->conf.key, key_data, key_len);
+	INIT_LIST_HEAD(&key->list);
+
+	return key;
+}
+/* Tear down @key after an RCU grace period and free it; NULL is a no-op. */
+static void __ieee80211_key_destroy(struct ieee80211_key *key)
+{
+	if (!key)
+		return;
+
+	/*
+	 * Synchronize so the TX path can no longer be using
+	 * this key before we free/remove it.
+	 */
+	synchronize_rcu();
+
+	if (key->local)
+		ieee80211_key_disable_hw_accel(key);
+
+	if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP)
+		ieee80211_aes_key_free(key->u.ccmp.tfm);
+	if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC)
+		ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm);
+	if (key->local)
+		ieee80211_debugfs_key_remove(key);
+	/* conf.key holds raw key material: zero the object before freeing. */
+	kzfree(key);
+}
+/* Install @key on @sdata (and @sta if given); returns the hw accel result. */
+int ieee80211_key_link(struct ieee80211_key *key,
+		       struct ieee80211_sub_if_data *sdata,
+		       struct sta_info *sta)
+{
+	struct ieee80211_key *old_key;
+	int idx, ret;
+	bool pairwise;
+
+	BUG_ON(!sdata);
+	BUG_ON(!key);
+
+	pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
+	idx = key->conf.keyidx;
+	key->local = sdata->local;
+	key->sdata = sdata;
+	key->sta = sta;
+
+	if (sta) {
+		/*
+		 * some hardware cannot handle TKIP with QoS, so
+		 * we indicate whether QoS could be in use.
+		 */
+		if (test_sta_flags(sta, WLAN_STA_WME))
+			key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA;
+	} else {
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+			struct sta_info *ap;
+
+			/*
+			 * We're getting a sta pointer in, so must be under
+			 * appropriate locking for sta_info_get().
+			 */
+
+			/* same here, the AP could be using QoS */
+			ap = sta_info_get(key->sdata, key->sdata->u.mgd.bssid);
+			if (ap) {
+				if (test_sta_flags(ap, WLAN_STA_WME))
+					key->conf.flags |=
+						IEEE80211_KEY_FLAG_WMM_STA;
+			}
+		}
+	}
+
+	mutex_lock(&sdata->local->key_mtx);
+	/* Look up the key currently occupying the slot that @key will take. */
+	if (sta && pairwise)
+		old_key = key_mtx_dereference(sdata->local, sta->ptk);
+	else if (sta)
+		old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]);
+	else
+		old_key = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+	/* Publish the new key first, then destroy the old one (waits for RCU). */
+	__ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
+	__ieee80211_key_destroy(old_key);
+
+	ieee80211_debugfs_key_add(key);
+
+	ret = ieee80211_key_enable_hw_accel(key);	/* key stays linked on error */
+
+	mutex_unlock(&sdata->local->key_mtx);
+
+	return ret;
+}
+/* Unlink @key if it was linked, then destroy it; callers here hold key_mtx. */
+void __ieee80211_key_free(struct ieee80211_key *key)
+{
+	if (!key)
+		return;
+
+	/*
+	 * Replace key with nothingness if it was ever used.
+	 */
+	if (key->sdata)	/* sdata is only set by ieee80211_key_link() */
+		__ieee80211_key_replace(key->sdata, key->sta,
+				key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+				key, NULL);
+	__ieee80211_key_destroy(key);
+}
+/* Locked wrapper: holds local->key_mtx around __ieee80211_key_free(). */
+void ieee80211_key_free(struct ieee80211_local *local,
+			struct ieee80211_key *key)
+{
+	mutex_lock(&local->key_mtx);
+	__ieee80211_key_free(key);
+	mutex_unlock(&local->key_mtx);
+}
+/* Try hw acceleration for every key on @sdata; needs RTNL, running sdata. */
+void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_key *key;
+
+	ASSERT_RTNL();
+
+	if (WARN_ON(!ieee80211_sdata_running(sdata)))
+		return;
+
+	mutex_lock(&sdata->local->key_mtx);
+
+	list_for_each_entry(key, &sdata->key_list, list)
+		ieee80211_key_enable_hw_accel(key);	/* result is ignored */
+
+	mutex_unlock(&sdata->local->key_mtx);
+}
+/* Disable hw acceleration for every key on @sdata; needs RTNL. */
+void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_key *key;
+
+	ASSERT_RTNL();
+
+	mutex_lock(&sdata->local->key_mtx);
+
+	list_for_each_entry(key, &sdata->key_list, list)
+		ieee80211_key_disable_hw_accel(key);
+
+	mutex_unlock(&sdata->local->key_mtx);
+}
+/* Free every key on sdata->key_list under key_mtx. */
+void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_key *key, *tmp;
+
+	mutex_lock(&sdata->local->key_mtx);
+
+	ieee80211_debugfs_key_remove_mgmt_default(sdata);
+	/* _safe iteration: __ieee80211_key_free() unlinks and frees each entry. */
+	list_for_each_entry_safe(key, tmp, &sdata->key_list, list)
+		__ieee80211_key_free(key);
+
+	ieee80211_debugfs_key_update_default(sdata);
+
+	mutex_unlock(&sdata->local->key_mtx);
+}
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
new file mode 100644
index 00000000..d801d535
--- /dev/null
+++ b/net/mac80211/key.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211_KEY_H
+#define IEEE80211_KEY_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/crypto.h>
+#include <linux/rcupdate.h>
+#include <net/mac80211.h>
+
+#define NUM_DEFAULT_KEYS 4	/* default data key indices: 0..3 */
+#define NUM_DEFAULT_MGMT_KEYS 2	/* default mgmt key indices: 4..5 */
+
+#define WEP_IV_LEN		4
+#define WEP_ICV_LEN		4
+#define ALG_CCMP_KEY_LEN	16
+#define CCMP_HDR_LEN		8
+#define CCMP_MIC_LEN		8
+#define CCMP_TK_LEN		16
+#define CCMP_PN_LEN		6	/* bytes per rx_pn counter */
+#define TKIP_IV_LEN		8
+#define TKIP_ICV_LEN		4
+
+#define NUM_RX_DATA_QUEUES	17	/* sizes the per-key RX counter arrays */
+
+struct ieee80211_local;
+struct ieee80211_sub_if_data;
+struct sta_info;
+
+/**
+ * enum ieee80211_internal_key_flags - internal key flags
+ *
+ * @KEY_FLAG_UPLOADED_TO_HARDWARE: Indicates that this key is present
+ *	in the hardware for TX crypto hardware acceleration.
+ */
+enum ieee80211_internal_key_flags {
+	KEY_FLAG_UPLOADED_TO_HARDWARE	= BIT(0), /* presumably in key->flags; confirm */
+};
+/* Value of tkip_ctx.state; NOTE(review): presumably phase 1 key progress. */
+enum ieee80211_internal_tkip_state {
+	TKIP_STATE_NOT_INIT,
+	TKIP_STATE_PHASE1_DONE,
+	TKIP_STATE_PHASE1_HW_UPLOADED,
+};
+/* TKIP counter state; one for TX and one per RX queue in ieee80211_key. */
+struct tkip_ctx {
+	u32 iv32;	/* seeded from seq[2..5] (little endian) at key alloc */
+	u16 iv16;	/* seeded from seq[0..1] (little endian) at key alloc */
+	u16 p1k[5];	/* NOTE(review): presumably the phase 1 key; confirm */
+	enum ieee80211_internal_tkip_state state;
+};
+/* Per-key state: owner pointers, per-cipher counters and the key config. */
+struct ieee80211_key {
+	struct ieee80211_local *local;	/* NULL until ieee80211_key_link() */
+	struct ieee80211_sub_if_data *sdata;	/* NULL until linked */
+	struct sta_info *sta;	/* station the key belongs to, may be NULL */
+
+	/* for sdata list */
+	struct list_head list;
+
+	/* protected by key mutex */
+	unsigned int flags;
+
+	union {	/* which member is valid depends on conf.cipher */
+		struct {
+			/* last used TSC */
+			struct tkip_ctx tx;
+
+			/* last received RSC */
+			struct tkip_ctx rx[NUM_RX_DATA_QUEUES];
+		} tkip;
+		struct {
+			u8 tx_pn[6];
+			/*
+			 * Last received packet number. The first
+			 * NUM_RX_DATA_QUEUES counters are used with Data
+			 * frames and the last counter is used with Robust
+			 * Management frames.
+			 */
+			u8 rx_pn[NUM_RX_DATA_QUEUES + 1][6];
+			struct crypto_cipher *tfm;
+			u32 replays; /* dot11RSNAStatsCCMPReplays */
+			/* scratch buffers for virt_to_page() (crypto API) */
+#ifndef AES_BLOCK_LEN
+#define AES_BLOCK_LEN 16
+#endif
+			u8 tx_crypto_buf[6 * AES_BLOCK_LEN];
+			u8 rx_crypto_buf[6 * AES_BLOCK_LEN];
+		} ccmp;
+		struct {
+			u8 tx_pn[6];
+			u8 rx_pn[6];
+			struct crypto_cipher *tfm;
+			u32 replays; /* dot11RSNAStatsCMACReplays */
+			u32 icverrors; /* dot11RSNAStatsCMACICVErrors */
+			/* scratch buffers for virt_to_page() (crypto API) */
+			u8 tx_crypto_buf[2 * AES_BLOCK_LEN];
+			u8 rx_crypto_buf[2 * AES_BLOCK_LEN];
+		} aes_cmac;
+	} u;
+
+	/* number of times this key has been used */
+	int tx_rx_count;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct {
+		struct dentry *stalink;
+		struct dentry *dir;
+		int cnt;
+	} debugfs;
+#endif
+
+	/*
+	 * key config, must be last because it contains key
+	 * material as variable length member
+	 */
+	struct ieee80211_key_conf conf;
+};
+
+struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
+					  const u8 *key_data,
+					  size_t seq_len, const u8 *seq);
+/*
+ * Insert a key into data structures (sdata, sta if necessary)
+ * to make it used, free old key.
+ */
+int __must_check ieee80211_key_link(struct ieee80211_key *key,
+				    struct ieee80211_sub_if_data *sdata,
+				    struct sta_info *sta);
+void __ieee80211_key_free(struct ieee80211_key *key);
+void ieee80211_key_free(struct ieee80211_local *local,
+			struct ieee80211_key *key);
+void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx,
+			       bool uni, bool multi);
+void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
+				    int idx);
+void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata);
+void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata);
+void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata);
+/* Read an RCU-protected key pointer; lockdep checks local->key_mtx is held. */
+#define key_mtx_dereference(local, ref) \
+	rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx)))
+
+#endif /* IEEE80211_KEY_H */
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
new file mode 100644
index 00000000..14590332
--- /dev/null
+++ b/net/mac80211/led.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* just for IFNAMSIZ */
+#include <linux/if.h>
+#include <linux/slab.h>
+#include "led.h"
+/* Toggle the RX LED: OFF when the old counter is even, FULL when odd. */
+void ieee80211_led_rx(struct ieee80211_local *local)
+{
+	if (unlikely(!local->rx_led))	/* no RX trigger registered */
+		return;
+	if (local->rx_led_counter++ % 2 == 0)
+		led_trigger_event(local->rx_led, LED_OFF);
+	else
+		led_trigger_event(local->rx_led, LED_FULL);
+}
+/* Step the TX LED counter by +1 or -1; the LED is FULL while it is odd. */
+/* q is 1 if a packet was enqueued, 0 if it has been transmitted */
+void ieee80211_led_tx(struct ieee80211_local *local, int q)
+{
+	if (unlikely(!local->tx_led))
+		return;
+	/* not sure how this is supposed to work ... */
+	local->tx_led_counter += 2*q-1;	/* q == 1 adds 1, q == 0 subtracts 1 */
+	if (local->tx_led_counter % 2 == 0)
+		led_trigger_event(local->tx_led, LED_OFF);
+	else
+		led_trigger_event(local->tx_led, LED_FULL);
+}
+/* Drive the assoc LED: FULL when @associated, OFF otherwise. */
+void ieee80211_led_assoc(struct ieee80211_local *local, bool associated)
+{
+	if (unlikely(!local->assoc_led))	/* no assoc trigger registered */
+		return;
+	if (associated)
+		led_trigger_event(local->assoc_led, LED_FULL);
+	else
+		led_trigger_event(local->assoc_led, LED_OFF);
+}
+/* Drive the radio LED: FULL when @enabled, OFF otherwise. */
+void ieee80211_led_radio(struct ieee80211_local *local, bool enabled)
+{
+	if (unlikely(!local->radio_led))	/* no radio trigger registered */
+		return;
+	if (enabled)
+		led_trigger_event(local->radio_led, LED_FULL);
+	else
+		led_trigger_event(local->radio_led, LED_OFF);
+}
+/* Build the trigger names: wiphy name plus rx, tx, assoc or radio suffix. */
+void ieee80211_led_names(struct ieee80211_local *local)
+{
+	snprintf(local->rx_led_name, sizeof(local->rx_led_name),
+		 "%srx", wiphy_name(local->hw.wiphy));
+	snprintf(local->tx_led_name, sizeof(local->tx_led_name),
+		 "%stx", wiphy_name(local->hw.wiphy));
+	snprintf(local->assoc_led_name, sizeof(local->assoc_led_name),
+		 "%sassoc", wiphy_name(local->hw.wiphy));
+	snprintf(local->radio_led_name, sizeof(local->radio_led_name),
+		 "%sradio", wiphy_name(local->hw.wiphy));
+}
+/* Allocate and register the LED triggers; a failed trigger is left NULL. */
+void ieee80211_led_init(struct ieee80211_local *local)
+{
+	local->rx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+	if (local->rx_led) {
+		local->rx_led->name = local->rx_led_name;
+		if (led_trigger_register(local->rx_led)) {
+			kfree(local->rx_led);
+			local->rx_led = NULL;
+		}
+	}
+
+	local->tx_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+	if (local->tx_led) {
+		local->tx_led->name = local->tx_led_name;
+		if (led_trigger_register(local->tx_led)) {
+			kfree(local->tx_led);
+			local->tx_led = NULL;
+		}
+	}
+
+	local->assoc_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+	if (local->assoc_led) {
+		local->assoc_led->name = local->assoc_led_name;
+		if (led_trigger_register(local->assoc_led)) {
+			kfree(local->assoc_led);
+			local->assoc_led = NULL;
+		}
+	}
+
+	local->radio_led = kzalloc(sizeof(struct led_trigger), GFP_KERNEL);
+	if (local->radio_led) {
+		local->radio_led->name = local->radio_led_name;
+		if (led_trigger_register(local->radio_led)) {
+			kfree(local->radio_led);
+			local->radio_led = NULL;
+		}
+	}
+	/* tpt trigger is allocated by __ieee80211_create_tpt_led_trigger(). */
+	if (local->tpt_led_trigger) {
+		if (led_trigger_register(&local->tpt_led_trigger->trig)) {
+			kfree(local->tpt_led_trigger);
+			local->tpt_led_trigger = NULL;
+		}
+	}
+}
+/* Unregister and free every LED trigger that is still set on @local. */
+void ieee80211_led_exit(struct ieee80211_local *local)
+{
+	if (local->radio_led) {
+		led_trigger_unregister(local->radio_led);
+		kfree(local->radio_led);	/* pointer is not reset to NULL */
+	}
+	if (local->assoc_led) {
+		led_trigger_unregister(local->assoc_led);
+		kfree(local->assoc_led);
+	}
+	if (local->tx_led) {
+		led_trigger_unregister(local->tx_led);
+		kfree(local->tx_led);
+	}
+	if (local->rx_led) {
+		led_trigger_unregister(local->rx_led);
+		kfree(local->rx_led);
+	}
+
+	if (local->tpt_led_trigger) {
+		led_trigger_unregister(&local->tpt_led_trigger->trig);
+		kfree(local->tpt_led_trigger);
+	}
+}
+/* Return the radio LED trigger name buffer stored in the local of @hw. */
+char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	return local->radio_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_radio_led_name);
+/* Return the assoc LED trigger name buffer stored in the local of @hw. */
+char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	return local->assoc_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_assoc_led_name);
+/* Return the TX LED trigger name buffer stored in the local of @hw. */
+char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	return local->tx_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_tx_led_name);
+/* Return the RX LED trigger name buffer stored in the local of @hw. */
+char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	return local->rx_led_name;
+}
+EXPORT_SYMBOL(__ieee80211_get_rx_led_name);
+/* Return tx+rx bytes counted since the last call, in 128-byte units. */
+static unsigned long tpt_trig_traffic(struct ieee80211_local *local,
+				      struct tpt_led_trigger *tpt_trig)
+{
+	unsigned long traffic, delta;
+
+	traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes;
+
+	delta = traffic - tpt_trig->prev_traffic;
+	tpt_trig->prev_traffic = traffic;	/* next call measures from here */
+	return DIV_ROUND_UP(delta, 1024 / 8);	/* rounded up; @local is unused */
+}
+/* Timer callback: re-arm about 1s ahead and set blink times from the table. */
+static void tpt_trig_timer(unsigned long data)
+{
+	struct ieee80211_local *local = (void *)data;
+	struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+	struct led_classdev *led_cdev;
+	unsigned long on, off, tpt;
+	int i;
+
+	if (!tpt_trig->running)	/* stopped: do not re-arm */
+		return;
+
+	mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ));
+
+	tpt = tpt_trig_traffic(local, tpt_trig);
+
+	/* default to just solid on */
+	on = 1;
+	off = 0;
+	/* Scan from the last entry down; the first matching entry wins. */
+	for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) {
+		if (tpt_trig->blink_table[i].throughput < 0 ||
+		    tpt > tpt_trig->blink_table[i].throughput) {
+			off = tpt_trig->blink_table[i].blink_time / 2;
+			on = tpt_trig->blink_table[i].blink_time - off;
+			break;
+		}
+	}
+	/* Apply the chosen on/off times to every LED bound to this trigger. */
+	read_lock(&tpt_trig->trig.leddev_list_lock);
+	list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
+		led_blink_set(led_cdev, &on, &off);
+	read_unlock(&tpt_trig->trig.leddev_list_lock);
+}
+/* Allocate the throughput trigger for @hw; returns its name or NULL. */
+char *__ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw,
+				unsigned int flags,
+				const struct ieee80211_tpt_blink *blink_table,
+				unsigned int blink_table_len)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct tpt_led_trigger *tpt_trig;
+
+	if (WARN_ON(local->tpt_led_trigger))	/* only one per local */
+		return NULL;
+
+	tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL);
+	if (!tpt_trig)
+		return NULL;
+
+	snprintf(tpt_trig->name, sizeof(tpt_trig->name),
+		 "%stpt", wiphy_name(local->hw.wiphy));
+
+	tpt_trig->trig.name = tpt_trig->name;
+	/* The table pointer is stored, not copied; it must stay valid. */
+	tpt_trig->blink_table = blink_table;
+	tpt_trig->blink_table_len = blink_table_len;
+	tpt_trig->want = flags;
+
+	setup_timer(&tpt_trig->timer, tpt_trig_timer, (unsigned long)local);
+
+	local->tpt_led_trigger = tpt_trig;
+
+	return tpt_trig->name;
+}
+EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger);
+/* Start the throughput trigger unless it is already running. */
+static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local)
+{
+	struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+
+	if (tpt_trig->running)
+		return;
+
+	/* reset traffic */
+	tpt_trig_traffic(local, tpt_trig);
+	tpt_trig->running = true;
+	/* Run one cycle now; tpt_trig_timer() also arms the timer itself. */
+	tpt_trig_timer((unsigned long)local);
+	mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ));
+}
+/* Stop the throughput trigger and switch all of its LEDs off. */
+static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local)
+{
+	struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+	struct led_classdev *led_cdev;
+
+	if (!tpt_trig->running)
+		return;
+	/* Clear running first so the timer callback returns without re-arming. */
+	tpt_trig->running = false;
+	del_timer_sync(&tpt_trig->timer);
+
+	read_lock(&tpt_trig->trig.leddev_list_lock);
+	list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
+		led_brightness_set(led_cdev, LED_OFF);
+	read_unlock(&tpt_trig->trig.leddev_list_lock);
+}
+/* Update the active-type mask, then start or stop the throughput trigger. */
+void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
+				unsigned int types_on, unsigned int types_off)
+{
+	struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger;
+	bool allowed;
+
+	WARN_ON(types_on & types_off);	/* a type cannot be both on and off */
+
+	if (!tpt_trig)
+		return;
+
+	tpt_trig->active &= ~types_off;
+	tpt_trig->active |= types_on;
+
+	/*
+	 * Regardless of wanted state, we shouldn't blink when
+	 * the radio is disabled -- this can happen due to some
+	 * code ordering issues with __ieee80211_recalc_idle()
+	 * being called before the radio is started.
+	 */
+	allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO;
+	/* Run only if the radio is on and a wanted type is active. */
+	if (!allowed || !(tpt_trig->active & tpt_trig->want))
+		ieee80211_stop_tpt_led_trig(local);
+	else
+		ieee80211_start_tpt_led_trig(local);
+}
diff --git a/net/mac80211/led.h b/net/mac80211/led.h
new file mode 100644
index 00000000..e0275d9b
--- /dev/null
+++ b/net/mac80211/led.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/leds.h>
+#include "ieee80211_i.h"
+
+#ifdef CONFIG_MAC80211_LEDS	/* real implementations live in led.c */
+void ieee80211_led_rx(struct ieee80211_local *local);
+void ieee80211_led_tx(struct ieee80211_local *local, int q);
+void ieee80211_led_assoc(struct ieee80211_local *local,
+			 bool associated);
+void ieee80211_led_radio(struct ieee80211_local *local,
+			 bool enabled);
+void ieee80211_led_names(struct ieee80211_local *local);
+void ieee80211_led_init(struct ieee80211_local *local);
+void ieee80211_led_exit(struct ieee80211_local *local);
+void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
+				unsigned int types_on, unsigned int types_off);
+#else /* !CONFIG_MAC80211_LEDS: empty inline stubs */
+static inline void ieee80211_led_rx(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_tx(struct ieee80211_local *local, int q)
+{
+}
+static inline void ieee80211_led_assoc(struct ieee80211_local *local,
+				       bool associated)
+{
+}
+static inline void ieee80211_led_radio(struct ieee80211_local *local,
+				       bool enabled)
+{
+}
+static inline void ieee80211_led_names(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_init(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_led_exit(struct ieee80211_local *local)
+{
+}
+static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
+					      unsigned int types_on,
+					      unsigned int types_off)
+{
+}
+#endif /* CONFIG_MAC80211_LEDS */
+/* Add @bytes to the throughput trigger TX count, for data frames only. */
+static inline void
+ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes)
+{
+#ifdef CONFIG_MAC80211_LEDS
+	if (local->tpt_led_trigger && ieee80211_is_data(fc))
+		local->tpt_led_trigger->tx_bytes += bytes;
+#endif
+}
+/* Add @bytes to the throughput trigger RX count, for data frames only. */
+static inline void
+ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes)
+{
+#ifdef CONFIG_MAC80211_LEDS
+	if (local->tpt_led_trigger && ieee80211_is_data(fc))
+		local->tpt_led_trigger->rx_bytes += bytes;
+#endif
+}
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
new file mode 100644
index 00000000..1e36fb33
--- /dev/null
+++ b/net/mac80211/main.c
@@ -0,0 +1,1102 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/mac80211.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitmap.h>
+#include <linux/pm_qos_params.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <net/cfg80211.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "mesh.h"
+#include "wep.h"
+#include "led.h"
+#include "cfg.h"
+#include "debugfs.h"
+
+static struct lock_class_key ieee80211_rx_skb_queue_class;
+/* Build the FIF_* filter flags from @local state and hand them to the driver. */
+void ieee80211_configure_filter(struct ieee80211_local *local)
+{
+	u64 mc;
+	unsigned int changed_flags;
+	unsigned int new_flags = 0;
+
+	if (atomic_read(&local->iff_promiscs))
+		new_flags |= FIF_PROMISC_IN_BSS;
+
+	if (atomic_read(&local->iff_allmultis))
+		new_flags |= FIF_ALLMULTI;
+
+	if (local->monitors || local->scanning)
+		new_flags |= FIF_BCN_PRBRESP_PROMISC;
+
+	if (local->fif_probe_req || local->probe_req_reg)
+		new_flags |= FIF_PROBE_REQ;
+
+	if (local->fif_fcsfail)
+		new_flags |= FIF_FCSFAIL;
+
+	if (local->fif_plcpfail)
+		new_flags |= FIF_PLCPFAIL;
+
+	if (local->fif_control)
+		new_flags |= FIF_CONTROL;
+
+	if (local->fif_other_bss)
+		new_flags |= FIF_OTHER_BSS;
+
+	if (local->fif_pspoll)
+		new_flags |= FIF_PSPOLL;
+
+	spin_lock_bh(&local->filter_lock);
+	changed_flags = local->filter_flags ^ new_flags;
+
+	mc = drv_prepare_multicast(local, &local->mc_list);
+	spin_unlock_bh(&local->filter_lock);
+
+	/* be a bit nasty: set bit 31 and warn below if the driver keeps it */
+	new_flags |= (1U<<31);	/* unsigned: 1<<31 overflows a signed int */
+
+	drv_configure_filter(local, changed_flags, &new_flags, mc);
+
+	WARN_ON(new_flags & (1U<<31));
+
+	local->filter_flags = new_flags & ~(1U<<31);
+}
+/* Work handler: re-run ieee80211_configure_filter() for the owning local. */
+static void ieee80211_reconfig_filter(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, reconfig_filter);
+
+	ieee80211_configure_filter(local);
+}
+
+/*
+ * Returns true if we are logically configured to be on
+ * the operating channel AND the hardware-conf is currently
+ * configured on the operating channel.  Compares channel-type
+ * as well.
+ */
+bool ieee80211_cfg_on_oper_channel(struct ieee80211_local *local)
+{
+	struct ieee80211_channel *chan;
+	enum nl80211_channel_type channel_type;
+
+	/* This logic needs to match logic in ieee80211_hw_config */
+	if (local->scan_channel) {
+		chan = local->scan_channel;
+		/* If scanning on oper channel, use whatever channel-type
+		 * is currently in use.
+		 */
+		if (chan == local->oper_channel)
+			channel_type = local->_oper_channel_type;
+		else
+			channel_type = NL80211_CHAN_NO_HT;
+	} else if (local->tmp_channel) {
+		chan = local->tmp_channel;
+		channel_type = local->tmp_channel_type;
+	} else {
+		chan = local->oper_channel;
+		channel_type = local->_oper_channel_type;
+	}
+
+	if (chan != local->oper_channel ||
+	    channel_type != local->_oper_channel_type)
+		return false;
+
+	/* Check current hardware-config against oper_channel. */
+	if ((local->oper_channel != local->hw.conf.channel) ||
+	    (local->_oper_channel_type != local->hw.conf.channel_type))
+		return false;
+
+	return true;
+}
+/* Recompute channel, SMPS and power in hw.conf; call the driver if changed. */
+int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
+{
+	struct ieee80211_channel *chan, *scan_chan;
+	int ret = 0;
+	int power;
+	enum nl80211_channel_type channel_type;
+	u32 offchannel_flag;
+
+	might_sleep();
+
+	scan_chan = local->scan_channel;
+
+	/* If this off-channel logic ever changes, ieee80211_cfg_on_oper_channel
+	 * may need to change as well.
+	 */
+	offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
+	if (scan_chan) {
+		chan = scan_chan;
+		/* If scanning on oper channel, use whatever channel-type
+		 * is currently in use.
+		 */
+		if (chan == local->oper_channel)
+			channel_type = local->_oper_channel_type;
+		else
+			channel_type = NL80211_CHAN_NO_HT;
+	} else if (local->tmp_channel) {
+		chan = scan_chan = local->tmp_channel;	/* scan_chan unused later */
+		channel_type = local->tmp_channel_type;
+	} else {
+		chan = local->oper_channel;
+		channel_type = local->_oper_channel_type;
+	}
+
+	if (chan != local->oper_channel ||
+	    channel_type != local->_oper_channel_type)
+		local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
+	else
+		local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL;
+	/* Non-zero from here on only if the OFFCHANNEL bit just toggled. */
+	offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
+
+	if (offchannel_flag || chan != local->hw.conf.channel ||
+	    channel_type != local->hw.conf.channel_type) {
+		local->hw.conf.channel = chan;
+		local->hw.conf.channel_type = channel_type;
+		changed |= IEEE80211_CONF_CHANGE_CHANNEL;
+	}
+
+	if (!conf_is_ht(&local->hw.conf)) {
+		/*
+		 * mac80211.h documents that this is only valid
+		 * when the channel is set to an HT type, and
+		 * that otherwise STATIC is used.
+		 */
+		local->hw.conf.smps_mode = IEEE80211_SMPS_STATIC;
+	} else if (local->hw.conf.smps_mode != local->smps_mode) {
+		local->hw.conf.smps_mode = local->smps_mode;
+		changed |= IEEE80211_CONF_CHANGE_SMPS;
+	}
+	/* While scanning, the power constraint is not applied. */
+	if ((local->scanning & SCAN_SW_SCANNING) ||
+	    (local->scanning & SCAN_HW_SCANNING))
+		power = chan->max_power;
+	else
+		power = local->power_constr_level ?
+			(chan->max_power - local->power_constr_level) :
+			chan->max_power;
+
+	if (local->user_power_level >= 0)	/* negative: no user limit */
+		power = min(power, local->user_power_level);
+
+	if (local->hw.conf.power_level != power) {
+		changed |= IEEE80211_CONF_CHANGE_POWER;
+		local->hw.conf.power_level = power;
+	}
+
+	if (changed && local->open_count) {
+		ret = drv_config(local, changed);
+		/*
+		 * Goal:
+		 * HW reconfiguration should never fail, the driver has told
+		 * us what it can support so it should live up to that promise.
+		 *
+		 * Current status:
+		 * rfkill is not integrated with mac80211 and a
+		 * configuration command can thus fail if hardware rfkill
+		 * is enabled
+		 *
+		 * FIXME: integrate rfkill with mac80211 and then add this
+		 * WARN_ON() back
+		 *
+		 */
+		/* WARN_ON(ret); */
+	}
+
+	return ret;
+}
+/* Refresh bssid/enable_beacon in bss_conf and pass @changed to the driver. */
+void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
+				      u32 changed)
+{
+	struct ieee80211_local *local = sdata->local;
+	static const u8 zero[ETH_ALEN] = { 0 };
+
+	if (!changed)
+		return;
+	/* Select the BSSID pointer according to the interface type. */
+	if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+		/*
+		 * While not associated, claim a BSSID of all-zeroes
+		 * so that drivers don't do any weird things with the
+		 * BSSID at that time.
+		 */
+		if (sdata->vif.bss_conf.assoc)
+			sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid;
+		else
+			sdata->vif.bss_conf.bssid = zero;
+	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+		sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid;
+	else if (sdata->vif.type == NL80211_IFTYPE_AP)
+		sdata->vif.bss_conf.bssid = sdata->vif.addr;
+	else if (sdata->vif.type == NL80211_IFTYPE_WDS)
+		sdata->vif.bss_conf.bssid = NULL;
+	else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+		sdata->vif.bss_conf.bssid = zero;
+	} else {
+		WARN_ON(1);
+		return;
+	}
+	/* Only AP, ADHOC, WDS and mesh interfaces may carry beacon changes. */
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_MESH_POINT:
+		break;
+	default:
+		/* do not warn to simplify caller in scan.c */
+		changed &= ~BSS_CHANGED_BEACON_ENABLED;
+		if (WARN_ON(changed & BSS_CHANGED_BEACON))
+			return;
+		break;
+	}
+
+	if (changed & BSS_CHANGED_BEACON_ENABLED) {
+		if (local->quiescing || !ieee80211_sdata_running(sdata) ||
+		    test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) {
+			sdata->vif.bss_conf.enable_beacon = false;
+		} else {
+			/*
+			 * Beacon should be enabled, but AP mode must
+			 * check whether there is a beacon configured.
+			 */
+			switch (sdata->vif.type) {
+			case NL80211_IFTYPE_AP:
+				sdata->vif.bss_conf.enable_beacon =
+					!!sdata->u.ap.beacon;
+				break;
+			case NL80211_IFTYPE_ADHOC:
+				sdata->vif.bss_conf.enable_beacon =
+					!!sdata->u.ibss.presp;
+				break;
+#ifdef CONFIG_MAC80211_MESH
+			case NL80211_IFTYPE_MESH_POINT:
+				sdata->vif.bss_conf.enable_beacon =
+					!!sdata->u.mesh.mesh_id_len;
+				break;
+#endif
+			default:
+				/* not reached */
+				WARN_ON(1);
+				break;
+			}
+		}
+	}
+
+	drv_bss_info_changed(local, sdata, &sdata->vif.bss_conf, changed);
+}
+/* Clear the three ERP settings in bss_conf; return the BSS_CHANGED_* bits. */
+u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata)
+{
+	sdata->vif.bss_conf.use_cts_prot = false;
+	sdata->vif.bss_conf.use_short_preamble = false;
+	sdata->vif.bss_conf.use_short_slot = false;
+	return BSS_CHANGED_ERP_CTS_PROT |
+	       BSS_CHANGED_ERP_PREAMBLE |
+	       BSS_CHANGED_ERP_SLOT;
+}
+/* Tasklet: drain both skb queues, dispatching each skb by its pkt_type. */
+static void ieee80211_tasklet_handler(unsigned long data)
+{
+	struct ieee80211_local *local = (struct ieee80211_local *) data;
+	struct sk_buff *skb;
+	/* skb_queue is tried first on every pass, then skb_queue_unreliable. */
+	while ((skb = skb_dequeue(&local->skb_queue)) ||
+	       (skb = skb_dequeue(&local->skb_queue_unreliable))) {
+		switch (skb->pkt_type) {
+		case IEEE80211_RX_MSG:
+			/* Clear skb->pkt_type in order to not confuse kernel
+			 * netstack. */
+			skb->pkt_type = 0;
+			ieee80211_rx(local_to_hw(local), skb);
+			break;
+		case IEEE80211_TX_STATUS_MSG:
+			skb->pkt_type = 0;
+			ieee80211_tx_status(local_to_hw(local), skb);
+			break;
+		default:
+			WARN(1, "mac80211: Packet is of unknown type %d\n",
+			     skb->pkt_type);
+			dev_kfree_skb(skb);	/* unknown type: drop the skb */
+			break;
+		}
+	}
+}
+
+/*
+ * Work handler for local->restart_work (scheduled by ieee80211_restart_hw()).
+ * Flushes local->workqueue, warns if a hardware or scheduled scan is still
+ * flagged as running, then cancels scanning and reconfigures the device
+ * with the RTNL held.
+ */
+static void ieee80211_restart_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, restart_work);
+
+	/* wait for scan work to complete */
+	flush_workqueue(local->workqueue);
+
+	mutex_lock(&local->mtx);
+	WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) ||
+	     local->sched_scanning,
+		"%s called with hardware scan in progress\n", __func__);
+	mutex_unlock(&local->mtx);
+
+	rtnl_lock();
+	ieee80211_scan_cancel(local);
+	ieee80211_reconfig(local);
+	rtnl_unlock();
+}
+
+/*
+ * Exported entry point to request a hardware restart. Logs the request,
+ * stops all queues with the SUSPEND stop reason and schedules
+ * local->restart_work, which performs the actual reconfiguration.
+ */
+void ieee80211_restart_hw(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	trace_api_restart_hw(local);
+
+	wiphy_info(hw->wiphy,
+		   "Hardware restart was requested\n");
+
+	/* use this reason, ieee80211_reconfig will unblock it */
+	ieee80211_stop_queues_by_reason(hw,
+		IEEE80211_QUEUE_STOP_REASON_SUSPEND);
+
+	schedule_work(&local->restart_work);
+}
+EXPORT_SYMBOL(ieee80211_restart_hw);
+
+/*
+ * Work handler for local->recalc_smps: calls ieee80211_recalc_smps() with
+ * local->iflist_mtx held.
+ */
+static void ieee80211_recalc_smps_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, recalc_smps);
+
+	mutex_lock(&local->iflist_mtx);
+	ieee80211_recalc_smps(local);
+	mutex_unlock(&local->iflist_mtx);
+}
+
+#ifdef CONFIG_INET
+/*
+ * inetaddr notifier callback (registered in ieee80211_register_hw()).
+ *
+ * For a running station interface that belongs to this wiphy, copies the
+ * interface's IPv4 address list into bss_conf->arp_addr_list and, when the
+ * interface is associated, notifies the driver with BSS_CHANGED_ARP_FILTER.
+ *
+ * @nb: notifier block embedded in struct ieee80211_local (ifa_notifier)
+ * @data: notifier event, not used here
+ * @arg: the struct in_ifaddr the notification is about
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int ieee80211_ifa_changed(struct notifier_block *nb,
+				 unsigned long data, void *arg)
+{
+	struct in_ifaddr *ifa = arg;
+	struct ieee80211_local *local =
+		container_of(nb, struct ieee80211_local,
+			     ifa_notifier);
+	struct net_device *ndev = ifa->ifa_dev->dev;
+	struct wireless_dev *wdev = ndev->ieee80211_ptr;
+	struct in_device *idev;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_bss_conf *bss_conf;
+	struct ieee80211_if_managed *ifmgd;
+	int c = 0;
+
+	/* Make sure it's our interface that got changed */
+	if (!wdev)
+		return NOTIFY_DONE;
+
+	if (wdev->wiphy != local->hw.wiphy)
+		return NOTIFY_DONE;
+
+	sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
+	bss_conf = &sdata->vif.bss_conf;
+
+	if (!ieee80211_sdata_running(sdata))
+		return NOTIFY_DONE;
+
+	/* ARP filtering is only supported in managed mode */
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return NOTIFY_DONE;
+
+	idev = __in_dev_get_rtnl(sdata->dev);
+	if (!idev)
+		return NOTIFY_DONE;
+
+	ifmgd = &sdata->u.mgd;
+	mutex_lock(&ifmgd->mtx);
+
+	/* Copy the addresses to the bss_conf list */
+	ifa = idev->ifa_list;
+	while (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN && ifa) {
+		bss_conf->arp_addr_list[c] = ifa->ifa_address;
+		ifa = ifa->ifa_next;
+		c++;
+	}
+
+	/* If not all addresses fit the list, disable filtering */
+	if (ifa) {
+		/* ifa is still non-NULL only if the loop stopped on the limit */
+		sdata->arp_filter_state = false;
+		c = 0;
+	} else {
+		sdata->arp_filter_state = true;
+	}
+	bss_conf->arp_addr_cnt = c;
+
+	/* Configure driver only if associated */
+	if (ifmgd->associated) {
+		bss_conf->arp_filter_enabled = sdata->arp_filter_state;
+		ieee80211_bss_info_change_notify(sdata,
+						 BSS_CHANGED_ARP_FILTER);
+	}
+
+	mutex_unlock(&ifmgd->mtx);
+
+	return NOTIFY_DONE;
+}
+#endif
+
+/*
+ * NAPI poll callback (installed by netif_napi_add() in
+ * ieee80211_register_hw()): forwards the poll request and @budget to the
+ * driver's napi_poll operation and returns its result.
+ */
+static int ieee80211_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct ieee80211_local *local =
+		container_of(napi, struct ieee80211_local, napi);
+
+	return local->ops->napi_poll(&local->hw, budget);
+}
+
+/* Exported wrapper: schedules the NAPI context embedded in @hw's local data. */
+void ieee80211_napi_schedule(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	napi_schedule(&local->napi);
+}
+EXPORT_SYMBOL(ieee80211_napi_schedule);
+
+/* Exported wrapper: completes the NAPI context embedded in @hw's local data. */
+void ieee80211_napi_complete(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	napi_complete(&local->napi);
+}
+EXPORT_SYMBOL(ieee80211_napi_complete);
+
+/*
+ * Default management frame subtype masks, indexed by interface type and
+ * installed as wiphy->mgmt_stypes in ieee80211_alloc_hw(). Each bit is
+ * selected with BIT(subtype >> 4).
+ *
+ * There isn't a lot of sense in it, but you can transmit anything you like
+ * (.tx is 0xffff for every listed interface type); .rx lists the subtypes
+ * enabled for reception.
+ */
+static const struct ieee80211_txrx_stypes
+ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
+	[NL80211_IFTYPE_ADHOC] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_STATION] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+	},
+	[NL80211_IFTYPE_AP] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+			BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+			BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_AP_VLAN] = {
+		/* copy AP */
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+			BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+			BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_P2P_CLIENT] = {
+		/* same masks as NL80211_IFTYPE_STATION */
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+	},
+	[NL80211_IFTYPE_P2P_GO] = {
+		/* same masks as NL80211_IFTYPE_AP */
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+			BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+			BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_MESH_POINT] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4),
+	},
+};
+
+/*
+ * Allocate a wiphy together with mac80211's struct ieee80211_local and
+ * @priv_data_len bytes of driver private data, and initialise the software
+ * state (defaults, locks, queues, work items, tasklets).
+ *
+ * @priv_data_len: size of the driver's private area
+ * @ops: driver callbacks; tx, start, stop, config, add_interface,
+ *	remove_interface and configure_filter are mandatory (BUG_ON otherwise)
+ *
+ * Returns the new struct ieee80211_hw, or NULL if wiphy_new() fails.
+ */
+struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
+					const struct ieee80211_ops *ops)
+{
+	struct ieee80211_local *local;
+	int priv_size, i;
+	struct wiphy *wiphy;
+
+	/* Ensure 32-byte alignment of our private data and hw private data.
+	 * We use the wiphy priv data for both our ieee80211_local and for
+	 * the driver's private data
+	 *
+	 * In memory it'll be like this:
+	 *
+	 * +-------------------------+
+	 * | struct wiphy	    |
+	 * +-------------------------+
+	 * | struct ieee80211_local  |
+	 * +-------------------------+
+	 * | driver's private data   |
+	 * +-------------------------+
+	 *
+	 */
+	/* NOTE(review): priv_size is an int while priv_data_len is a size_t */
+	priv_size = ALIGN(sizeof(*local), NETDEV_ALIGN) + priv_data_len;
+
+	wiphy = wiphy_new(&mac80211_config_ops, priv_size);
+
+	if (!wiphy)
+		return NULL;
+
+	wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes;
+
+	wiphy->privid = mac80211_wiphy_privid;
+
+	wiphy->flags |= WIPHY_FLAG_NETNS_OK |
+			WIPHY_FLAG_4ADDR_AP |
+			WIPHY_FLAG_4ADDR_STATION;
+
+	/* IBSS RSN is advertised only when the driver has no set_key op */
+	if (!ops->set_key)
+		wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
+
+	wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
+
+	local = wiphy_priv(wiphy);
+
+	local->hw.wiphy = wiphy;
+
+	/* driver private data starts right after the aligned ieee80211_local */
+	local->hw.priv = (char *)local + ALIGN(sizeof(*local), NETDEV_ALIGN);
+
+	/* mandatory driver callbacks */
+	BUG_ON(!ops->tx);
+	BUG_ON(!ops->start);
+	BUG_ON(!ops->stop);
+	BUG_ON(!ops->config);
+	BUG_ON(!ops->add_interface);
+	BUG_ON(!ops->remove_interface);
+	BUG_ON(!ops->configure_filter);
+	local->ops = ops;
+
+	/* set up some defaults */
+	local->hw.queues = 1;
+	local->hw.max_rates = 1;
+	local->hw.max_report_rates = 0;
+	local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
+	local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
+	local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
+	local->user_power_level = -1;
+	local->uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES;
+	local->uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN;
+
+	INIT_LIST_HEAD(&local->interfaces);
+
+	__hw_addr_init(&local->mc_list);
+
+	mutex_init(&local->iflist_mtx);
+	mutex_init(&local->mtx);
+
+	mutex_init(&local->key_mtx);
+	spin_lock_init(&local->filter_lock);
+	spin_lock_init(&local->queue_stop_reason_lock);
+
+	/*
+	 * The rx_skb_queue is only accessed from tasklets,
+	 * but other SKB queues are used from within IRQ
+	 * context. Therefore, this one needs a different
+	 * locking class so our direct, non-irq-safe use of
+	 * the queue's lock doesn't throw lockdep warnings.
+	 */
+	skb_queue_head_init_class(&local->rx_skb_queue,
+				  &ieee80211_rx_skb_queue_class);
+
+	INIT_DELAYED_WORK(&local->scan_work, ieee80211_scan_work);
+
+	ieee80211_work_init(local);
+
+	INIT_WORK(&local->restart_work, ieee80211_restart_work);
+
+	INIT_WORK(&local->reconfig_filter, ieee80211_reconfig_filter);
+	INIT_WORK(&local->recalc_smps, ieee80211_recalc_smps_work);
+	local->smps_mode = IEEE80211_SMPS_OFF;
+
+	INIT_WORK(&local->dynamic_ps_enable_work,
+		  ieee80211_dynamic_ps_enable_work);
+	INIT_WORK(&local->dynamic_ps_disable_work,
+		  ieee80211_dynamic_ps_disable_work);
+	setup_timer(&local->dynamic_ps_timer,
+		    ieee80211_dynamic_ps_timer, (unsigned long) local);
+
+	INIT_WORK(&local->sched_scan_stopped_work,
+		  ieee80211_sched_scan_stopped_work);
+
+	sta_info_init(local);
+
+	/* per-queue pending frame lists and aggregation stop counters */
+	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
+		skb_queue_head_init(&local->pending[i]);
+		atomic_set(&local->agg_queue_stop[i], 0);
+	}
+	tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
+		     (unsigned long)local);
+
+	tasklet_init(&local->tasklet,
+		     ieee80211_tasklet_handler,
+		     (unsigned long) local);
+
+	/* the two queues drained by ieee80211_tasklet_handler() */
+	skb_queue_head_init(&local->skb_queue);
+	skb_queue_head_init(&local->skb_queue_unreliable);
+
+	/* init dummy netdev for use w/ NAPI */
+	init_dummy_netdev(&local->napi_dev);
+
+	ieee80211_led_names(local);
+
+	ieee80211_hw_roc_setup(local);
+
+	return local_to_hw(local);
+}
+EXPORT_SYMBOL(ieee80211_alloc_hw);
+
+/*
+ * Register a device previously allocated with ieee80211_alloc_hw().
+ *
+ * Validates the driver-provided capabilities, completes the wiphy setup
+ * (default channel, interface modes, scan IE length, cipher suites),
+ * registers the wiphy, creates the workqueue, initialises WEP, LEDs and
+ * rate control, adds a default station interface if supported and
+ * registers the PM QoS / inetaddr notifiers and the NAPI context.
+ *
+ * Returns 0 on success or a negative errno. On failure everything set up
+ * here is released again; the caller still has to call ieee80211_free_hw().
+ */
+int ieee80211_register_hw(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	int result, i;
+	enum ieee80211_band band;
+	int channels, max_bitrates;
+	bool supp_ht;
+	static const u32 cipher_suites[] = {
+		/* keep WEP first, it may be removed below */
+		WLAN_CIPHER_SUITE_WEP40,
+		WLAN_CIPHER_SUITE_WEP104,
+		WLAN_CIPHER_SUITE_TKIP,
+		WLAN_CIPHER_SUITE_CCMP,
+
+		/* keep last -- depends on hw flags! */
+		WLAN_CIPHER_SUITE_AES_CMAC
+	};
+
+	/* nothing has been allocated yet, a plain return is fine here */
+	if ((hw->wiphy->wowlan.flags || hw->wiphy->wowlan.n_patterns)
+#ifdef CONFIG_PM
+	    && (!local->ops->suspend || !local->ops->resume)
+#endif
+	    )
+		return -EINVAL;
+
+	if (hw->max_report_rates == 0)
+		hw->max_report_rates = hw->max_rates;
+
+	/*
+	 * generic code guarantees at least one band,
+	 * set this very early because much code assumes
+	 * that hw.conf.channel is assigned
+	 */
+	channels = 0;
+	max_bitrates = 0;
+	supp_ht = false;
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		struct ieee80211_supported_band *sband;
+
+		sband = local->hw.wiphy->bands[band];
+		if (!sband)
+			continue;
+		if (!local->oper_channel) {
+			/* init channel we're on */
+			local->hw.conf.channel =
+			local->oper_channel = &sband->channels[0];
+			local->hw.conf.channel_type = NL80211_CHAN_NO_HT;
+		}
+		channels += sband->n_channels;
+
+		if (max_bitrates < sband->n_bitrates)
+			max_bitrates = sband->n_bitrates;
+		supp_ht = supp_ht || sband->ht_cap.ht_supported;
+	}
+
+	/* internal scan request with room for one pointer per channel */
+	local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) +
+				      sizeof(void *) * channels, GFP_KERNEL);
+	if (!local->int_scan_req)
+		return -ENOMEM;
+
+	/*
+	 * From here on int_scan_req is allocated, so every failure has to
+	 * leave through the error labels at the end of the function.
+	 */
+
+	/* if low-level driver supports AP, we also support VLAN */
+	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) {
+		hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN);
+		hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_AP_VLAN);
+	}
+
+	/* mac80211 always supports monitor */
+	hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
+	hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR);
+
+	/*
+	 * mac80211 doesn't support more than 1 channel, and also not more
+	 * than one IBSS interface
+	 */
+	for (i = 0; i < hw->wiphy->n_iface_combinations; i++) {
+		const struct ieee80211_iface_combination *c;
+		int j;
+
+		c = &hw->wiphy->iface_combinations[i];
+
+		if (c->num_different_channels > 1) {
+			result = -EINVAL;
+			goto fail_wiphy_register;
+		}
+
+		for (j = 0; j < c->n_limits; j++) {
+			if ((c->limits[j].types & BIT(NL80211_IFTYPE_ADHOC)) &&
+			    c->limits[j].max > 1) {
+				result = -EINVAL;
+				goto fail_wiphy_register;
+			}
+		}
+	}
+
+#ifndef CONFIG_MAC80211_MESH
+	/* mesh depends on Kconfig, but drivers should set it if they want */
+	local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT);
+#endif
+
+	/* if the underlying driver supports mesh, mac80211 will (at least)
+	 * provide routing of mesh authentication frames to userspace */
+	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_MESH_POINT))
+		local->hw.wiphy->flags |= WIPHY_FLAG_MESH_AUTH;
+
+	/* mac80211 supports control port protocol changing */
+	local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL;
+
+	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+		local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
+	else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
+		local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;
+
+	WARN((local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)
+	     && (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK),
+	     "U-APSD not supported with HW_PS_NULLFUNC_STACK\n");
+
+	/*
+	 * Calculate scan IE length -- we need this to alloc
+	 * memory and to subtract from the driver limit. It
+	 * includes the DS Params, (extended) supported rates, and HT
+	 * information -- SSID is the driver's responsibility.
+	 */
+	local->scan_ies_len = 4 + max_bitrates /* (ext) supp rates */ +
+		3 /* DS Params */;
+	if (supp_ht)
+		local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap);
+
+	if (!local->ops->hw_scan) {
+		/* For hw_scan, driver needs to set these up. */
+		local->hw.wiphy->max_scan_ssids = 4;
+		local->hw.wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN;
+	}
+
+	/*
+	 * If the driver supports any scan IEs, then assume the
+	 * limit includes the IEs mac80211 will add, otherwise
+	 * leave it at zero and let the driver sort it out; we
+	 * still pass our IEs to the driver but userspace will
+	 * not be allowed to in that case.
+	 */
+	if (local->hw.wiphy->max_scan_ie_len)
+		local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len;
+
+	/* Set up cipher suites unless driver already did */
+	if (!local->hw.wiphy->cipher_suites) {
+		local->hw.wiphy->cipher_suites = cipher_suites;
+		local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
+		if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE))
+			local->hw.wiphy->n_cipher_suites--;
+	}
+	if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) {
+		if (local->hw.wiphy->cipher_suites == cipher_suites) {
+			/* skip the two WEP entries at the start of the table */
+			local->hw.wiphy->cipher_suites += 2;
+			local->hw.wiphy->n_cipher_suites -= 2;
+		} else {
+			u32 *suites;
+			int r, w = 0;
+
+			/* Filter out WEP */
+
+			suites = kmemdup(
+				local->hw.wiphy->cipher_suites,
+				sizeof(u32) * local->hw.wiphy->n_cipher_suites,
+				GFP_KERNEL);
+			if (!suites) {
+				result = -ENOMEM;
+				goto fail_wiphy_register;
+			}
+			for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) {
+				u32 suite = local->hw.wiphy->cipher_suites[r];
+				if (suite == WLAN_CIPHER_SUITE_WEP40 ||
+				    suite == WLAN_CIPHER_SUITE_WEP104)
+					continue;
+				suites[w++] = suite;
+			}
+			local->hw.wiphy->cipher_suites = suites;
+			local->hw.wiphy->n_cipher_suites = w;
+			local->wiphy_ciphers_allocated = true;
+		}
+	}
+
+	if (!local->ops->remain_on_channel)
+		local->hw.wiphy->max_remain_on_channel_duration = 5000;
+
+	if (local->ops->sched_scan_start)
+		local->hw.wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN;
+
+	result = wiphy_register(local->hw.wiphy);
+	if (result < 0)
+		goto fail_wiphy_register;
+
+	/*
+	 * We use the number of queues for feature tests (QoS, HT) internally
+	 * so restrict them appropriately.
+	 */
+	if (hw->queues > IEEE80211_MAX_QUEUES)
+		hw->queues = IEEE80211_MAX_QUEUES;
+
+	local->workqueue =
+		alloc_ordered_workqueue(wiphy_name(local->hw.wiphy), 0);
+	if (!local->workqueue) {
+		result = -ENOMEM;
+		goto fail_workqueue;
+	}
+
+	/*
+	 * The hardware needs headroom for sending the frame,
+	 * and we need some headroom for passing the frame to monitor
+	 * interfaces, but never both at the same time.
+	 */
+#ifndef __CHECKER__
+	BUILD_BUG_ON(IEEE80211_TX_STATUS_HEADROOM !=
+			sizeof(struct ieee80211_tx_status_rtap_hdr));
+#endif
+	local->tx_headroom = max_t(unsigned int , local->hw.extra_tx_headroom,
+				   sizeof(struct ieee80211_tx_status_rtap_hdr));
+
+	debugfs_hw_add(local);
+
+	/*
+	 * if the driver doesn't specify a max listen interval we
+	 * use 5 which should be a safe default
+	 */
+	if (local->hw.max_listen_interval == 0)
+		local->hw.max_listen_interval = 5;
+
+	local->hw.conf.listen_interval = local->hw.max_listen_interval;
+
+	local->dynamic_ps_forced_timeout = -1;
+
+	/* a WEP initialisation failure is logged but is not fatal */
+	result = ieee80211_wep_init(local);
+	if (result < 0)
+		wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
+			    result);
+
+	ieee80211_led_init(local);
+
+	rtnl_lock();
+
+	result = ieee80211_init_rate_ctrl_alg(local,
+					      hw->rate_control_algorithm);
+	if (result < 0) {
+		wiphy_debug(local->hw.wiphy,
+			    "Failed to initialize rate control algorithm\n");
+		goto fail_rate;
+	}
+
+	/* add one default STA interface if supported */
+	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION)) {
+		result = ieee80211_if_add(local, "wlan%d", NULL,
+					  NL80211_IFTYPE_STATION, NULL);
+		if (result)
+			wiphy_warn(local->hw.wiphy,
+				   "Failed to add default virtual iface\n");
+	}
+
+	rtnl_unlock();
+
+	local->network_latency_notifier.notifier_call =
+		ieee80211_max_network_latency;
+	result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY,
+				     &local->network_latency_notifier);
+	if (result) {
+		/* the error labels below expect the RTNL to be held */
+		rtnl_lock();
+		goto fail_pm_qos;
+	}
+
+#ifdef CONFIG_INET
+	local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
+	result = register_inetaddr_notifier(&local->ifa_notifier);
+	if (result)
+		goto fail_ifa;
+#endif
+
+	netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll,
+			local->hw.napi_weight);
+
+	return 0;
+
+#ifdef CONFIG_INET
+ fail_ifa:
+	pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
+			       &local->network_latency_notifier);
+	rtnl_lock();
+#endif
+ fail_pm_qos:
+	ieee80211_led_exit(local);
+	ieee80211_remove_interfaces(local);
+ fail_rate:
+	rtnl_unlock();
+	ieee80211_wep_free(local);
+	sta_info_stop(local);
+	destroy_workqueue(local->workqueue);
+ fail_workqueue:
+	wiphy_unregister(local->hw.wiphy);
+ fail_wiphy_register:
+	/*
+	 * Drop the filtered cipher list and forget about it, so that the
+	 * ieee80211_free_hw() call that follows a failed registration does
+	 * not free it a second time.
+	 */
+	if (local->wiphy_ciphers_allocated) {
+		kfree(local->hw.wiphy->cipher_suites);
+		local->hw.wiphy->cipher_suites = NULL;
+		local->hw.wiphy->n_cipher_suites = 0;
+		local->wiphy_ciphers_allocated = false;
+	}
+	kfree(local->int_scan_req);
+	local->int_scan_req = NULL;
+	return result;
+}
+EXPORT_SYMBOL(ieee80211_register_hw);
+
+/*
+ * Undo ieee80211_register_hw(): kills the tasklets, removes the notifiers
+ * and all interfaces, stops timers and work items, purges the frame queues
+ * and unregisters the wiphy. The memory itself is released separately by
+ * ieee80211_free_hw().
+ */
+void ieee80211_unregister_hw(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	tasklet_kill(&local->tx_pending_tasklet);
+	tasklet_kill(&local->tasklet);
+
+	pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
+			       &local->network_latency_notifier);
+#ifdef CONFIG_INET
+	unregister_inetaddr_notifier(&local->ifa_notifier);
+#endif
+
+	rtnl_lock();
+
+	/*
+	 * At this point, interface list manipulations are fine
+	 * because the driver cannot be handing us frames any
+	 * more and the tasklet is killed.
+	 */
+	ieee80211_remove_interfaces(local);
+
+	rtnl_unlock();
+
+	/*
+	 * Now all work items will be gone, but the
+	 * timer might still be armed, so delete it
+	 */
+	del_timer_sync(&local->work_timer);
+
+	cancel_work_sync(&local->restart_work);
+	cancel_work_sync(&local->reconfig_filter);
+
+	ieee80211_clear_tx_pending(local);
+	sta_info_stop(local);
+	rate_control_deinitialize(local);
+
+	/* frames still queued for the tasklet are reported, then dropped */
+	if (skb_queue_len(&local->skb_queue) ||
+	    skb_queue_len(&local->skb_queue_unreliable))
+		wiphy_warn(local->hw.wiphy, "skb_queue not empty\n");
+	skb_queue_purge(&local->skb_queue);
+	skb_queue_purge(&local->skb_queue_unreliable);
+	skb_queue_purge(&local->rx_skb_queue);
+
+	destroy_workqueue(local->workqueue);
+	wiphy_unregister(local->hw.wiphy);
+	ieee80211_wep_free(local);
+	ieee80211_led_exit(local);
+	kfree(local->int_scan_req);
+}
+EXPORT_SYMBOL(ieee80211_unregister_hw);
+
+/*
+ * Release a device obtained from ieee80211_alloc_hw(): destroys the
+ * iflist and local mutexes, frees the cipher suite list if mac80211
+ * allocated it during registration, and frees the wiphy.
+ */
+void ieee80211_free_hw(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct wiphy *wiphy = local->hw.wiphy;
+
+	mutex_destroy(&local->iflist_mtx);
+	mutex_destroy(&local->mtx);
+
+	/* only a list allocated by ieee80211_register_hw() is ours to free */
+	if (local->wiphy_ciphers_allocated)
+		kfree(wiphy->cipher_suites);
+
+	wiphy_free(wiphy);
+}
+EXPORT_SYMBOL(ieee80211_free_hw);
+
+/*
+ * Subsystem init: registers the minstrel, minstrel_ht and PID rate control
+ * algorithms and then the interface code. If a step fails, the steps that
+ * already succeeded are undone in reverse order and the error is returned.
+ */
+static int __init ieee80211_init(void)
+{
+	struct sk_buff *skb;
+	int ret;
+
+	/* struct ieee80211_tx_info must fit into skb->cb (compile-time check) */
+	BUILD_BUG_ON(sizeof(struct ieee80211_tx_info) > sizeof(skb->cb));
+	BUILD_BUG_ON(offsetof(struct ieee80211_tx_info, driver_data) +
+		     IEEE80211_TX_INFO_DRIVER_DATA_SIZE > sizeof(skb->cb));
+
+	ret = rc80211_minstrel_init();
+	if (ret)
+		return ret;
+
+	ret = rc80211_minstrel_ht_init();
+	if (ret)
+		goto err_minstrel;
+
+	ret = rc80211_pid_init();
+	if (ret)
+		goto err_pid;
+
+	ret = ieee80211_iface_init();
+	if (ret)
+		goto err_netdev;
+
+	return 0;
+	/* each label undoes the step before the one that failed */
+ err_netdev:
+	rc80211_pid_exit();
+ err_pid:
+	rc80211_minstrel_ht_exit();
+ err_minstrel:
+	rc80211_minstrel_exit();
+
+	return ret;
+}
+
+/*
+ * Module exit: unregisters the rate control algorithms, stops the mesh code
+ * if ieee80211s_init() ever ran (mesh_allocated), tears down the interface
+ * code and finally waits for outstanding RCU callbacks.
+ */
+static void __exit ieee80211_exit(void)
+{
+	rc80211_pid_exit();
+	rc80211_minstrel_ht_exit();
+	rc80211_minstrel_exit();
+
+	if (mesh_allocated)
+		ieee80211s_stop();
+
+	ieee80211_iface_exit();
+
+	rcu_barrier();
+}
+
+
+/* ieee80211_init() runs as a subsys initcall, ieee80211_exit() on unload */
+subsys_initcall(ieee80211_init);
+module_exit(ieee80211_exit);
+
+MODULE_DESCRIPTION("IEEE 802.11 subsystem");
+MODULE_LICENSE("GPL");
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
new file mode 100644
index 00000000..29e9980c
--- /dev/null
+++ b/net/mac80211/mesh.c
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Authors:    Luis Carlos Cobo <luisca@cozybit.com>
+ * 	       Javier Cardona <javier@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+#include "mesh.h"
+
+/* inactivity limit handed to ieee80211_sta_expire() by mesh housekeeping */
+#define IEEE80211_MESH_PEER_INACTIVITY_LIMIT (1800 * HZ)
+/* delay after which mesh housekeeping re-arms its timer */
+#define IEEE80211_MESH_HOUSEKEEPING_INTERVAL (60 * HZ)
+#define IEEE80211_MESH_RANN_INTERVAL	     (1 * HZ)
+
+/* bits of the capability field in the mesh configuration element */
+#define MESHCONF_CAPAB_ACCEPT_PLINKS 0x01
+#define MESHCONF_CAPAB_FORWARDING    0x08
+
+/*
+ * Bit numbers in ifmsh->timers_running; a bit is set when the matching
+ * timer fires while the device is quiescing.
+ */
+#define TMR_RUNNING_HK	0
+#define TMR_RUNNING_MP	1
+#define TMR_RUNNING_MPR	2
+
+/* set to 1 by ieee80211s_init() */
+int mesh_allocated;
+/* slab cache for struct rmc_entry, created in ieee80211s_init() */
+static struct kmem_cache *rm_cache;
+
+/*
+ * Set up the mesh-wide state: initialises the mesh path table, marks mesh
+ * as allocated and creates the slab cache used for RMC entries.
+ */
+void ieee80211s_init(void)
+{
+	mesh_pathtbl_init();
+	mesh_allocated = 1;
+	/* NOTE(review): the result of kmem_cache_create() is not checked */
+	rm_cache = kmem_cache_create("mesh_rmc", sizeof(struct rmc_entry),
+				     0, 0, NULL);
+}
+
+/*
+ * Counterpart of ieee80211s_init(): unregisters the mesh path table and
+ * destroys the RMC entry cache.
+ */
+void ieee80211s_stop(void)
+{
+	mesh_pathtbl_unregister();
+	kmem_cache_destroy(rm_cache);
+}
+
+/*
+ * Housekeeping timer callback; @data is the mesh interface's
+ * struct ieee80211_sub_if_data pointer. Flags housekeeping work and queues
+ * sdata->work. While the device is quiescing the work is not queued; the
+ * timer is only recorded as running in ifmsh->timers_running.
+ */
+static void ieee80211_mesh_housekeeping_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata = (void *) data;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+	set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
+
+	if (local->quiescing) {
+		set_bit(TMR_RUNNING_HK, &ifmsh->timers_running);
+		return;
+	}
+
+	ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+/**
+ * mesh_matches_local - check if the config of a mesh point matches ours
+ *
+ * @ie: information elements of a management frame from the mesh peer
+ * @sdata: local mesh subif
+ *
+ * This function checks if the mesh configuration of a mesh point matches the
+ * local mesh configuration, i.e. if both nodes belong to the same mesh network.
+ * A frame that carries no mesh configuration element never matches.
+ */
+bool mesh_matches_local(struct ieee802_11_elems *ie, struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+	/*
+	 * The elements come from a received frame; do not dereference the
+	 * mesh configuration element if the peer did not send one.
+	 */
+	if (!ie->mesh_config)
+		return false;
+
+	/*
+	 * As support for each feature is added, check for matching
+	 * - On mesh config capabilities
+	 *   - Power Save Support En
+	 *   - Sync support enabled
+	 *   - Sync support active
+	 *   - Sync support required from peer
+	 *   - MDA enabled
+	 * - Power management control on fc
+	 */
+	if (ifmsh->mesh_id_len == ie->mesh_id_len &&
+		memcmp(ifmsh->mesh_id, ie->mesh_id, ie->mesh_id_len) == 0 &&
+		(ifmsh->mesh_pp_id == ie->mesh_config->meshconf_psel) &&
+		(ifmsh->mesh_pm_id == ie->mesh_config->meshconf_pmetric) &&
+		(ifmsh->mesh_cc_id == ie->mesh_config->meshconf_congest) &&
+		(ifmsh->mesh_sp_id == ie->mesh_config->meshconf_synch) &&
+		(ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth))
+		return true;
+
+	return false;
+}
+
+/**
+ * mesh_peer_accepts_plinks - check if an mp is willing to establish peer links
+ *
+ * @ie: information elements of a management frame from the mesh peer
+ *
+ * Returns true if the ACCEPT_PLINKS bit is set in the capability field of
+ * the peer's mesh configuration element.
+ */
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie)
+{
+	return !!(ie->mesh_config->meshconf_cap & MESHCONF_CAPAB_ACCEPT_PLINKS);
+}
+
+/**
+ * mesh_accept_plinks_update: update accepting_plink in local mesh beacons
+ *
+ * @sdata: mesh interface in which mesh beacons are going to be updated
+ *
+ * If the current peer link availability differs from the value stored in
+ * accepting_plinks, housekeeping work is flagged and queued right away.
+ */
+void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata)
+{
+	bool free_plinks;
+
+	/* In case mesh_plink_free_count > 0 and mesh_plinktbl_capacity == 0,
+	 * the mesh interface might be able to establish plinks with peers that
+	 * are already on the table but are not on PLINK_ESTAB state. However,
+	 * in general the mesh interface is not accepting peer link requests
+	 * from new peers, and that must be reflected in the beacon
+	 */
+	free_plinks = mesh_plink_availables(sdata);
+
+	/* call the timer callback directly to schedule housekeeping now */
+	if (free_plinks != sdata->u.mesh.accepting_plinks)
+		ieee80211_mesh_housekeeping_timer((unsigned long) sdata);
+}
+
+/*
+ * Allocate the recent multicast cache (RMC) of a mesh interface and
+ * initialise its (empty) buckets.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+int mesh_rmc_init(struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_rmc *rmc;
+	int bucket;
+
+	rmc = kmalloc(sizeof(*rmc), GFP_KERNEL);
+	sdata->u.mesh.rmc = rmc;
+	if (!rmc)
+		return -ENOMEM;
+
+	rmc->idx_mask = RMC_BUCKETS - 1;
+	for (bucket = 0; bucket < RMC_BUCKETS; bucket++)
+		INIT_LIST_HEAD(&rmc->bucket[bucket].list);
+
+	return 0;
+}
+
+/*
+ * Free the recent multicast cache of a mesh interface together with all
+ * entries still linked into its buckets. Does nothing if no cache is
+ * allocated; the interface's rmc pointer is reset to NULL afterwards.
+ */
+void mesh_rmc_free(struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_rmc *rmc = sdata->u.mesh.rmc;
+	struct rmc_entry *entry, *next;
+	int bucket;
+
+	if (!rmc)
+		return;
+
+	for (bucket = 0; bucket < RMC_BUCKETS; bucket++) {
+		struct list_head *head = &rmc->bucket[bucket].list;
+
+		list_for_each_entry_safe(entry, next, head, list) {
+			list_del(&entry->list);
+			kmem_cache_free(rm_cache, entry);
+		}
+	}
+
+	kfree(rmc);
+	sdata->u.mesh.rmc = NULL;
+}
+
+/**
+ * mesh_rmc_check - Check frame in recent multicast cache and add if absent.
+ *
+ * @sa:		source address
+ * @mesh_hdr:	mesh_header
+ * @sdata:	mesh interface whose cache is consulted
+ *
+ * Returns: 0 if the frame is not in the cache, nonzero (-1) otherwise.
+ * 0 is also returned when no new cache entry could be allocated.
+ *
+ * Checks using the source address and the mesh sequence number if we have
+ * received this frame lately. If the frame is not in the cache, it is added to
+ * it.
+ */
+int mesh_rmc_check(u8 *sa, struct ieee80211s_hdr *mesh_hdr,
+		   struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_rmc *rmc = sdata->u.mesh.rmc;
+	u32 seqnum = 0;
+	int entries = 0;
+	u8 idx;
+	struct rmc_entry *p, *n;
+
+	/* Don't care about endianness since only match matters */
+	memcpy(&seqnum, &mesh_hdr->seqnum, sizeof(mesh_hdr->seqnum));
+	/* the bucket is selected by the low bits of the sequence number */
+	idx = le32_to_cpu(mesh_hdr->seqnum) & rmc->idx_mask;
+	list_for_each_entry_safe(p, n, &rmc->bucket[idx].list, list) {
+		++entries;
+		/* drop expired entries and cap the bucket length */
+		if (time_after(jiffies, p->exp_time) ||
+				(entries == RMC_QUEUE_MAX_LEN)) {
+			list_del(&p->list);
+			kmem_cache_free(rm_cache, p);
+			--entries;
+		} else if ((seqnum == p->seqnum) &&
+			   (memcmp(sa, p->sa, ETH_ALEN) == 0))
+			return -1;
+	}
+
+	/* not seen recently: remember it */
+	p = kmem_cache_alloc(rm_cache, GFP_ATOMIC);
+	if (!p) {
+		printk(KERN_DEBUG "o11s: could not allocate RMC entry\n");
+		return 0;
+	}
+	p->seqnum = seqnum;
+	p->exp_time = jiffies + RMC_TIMEOUT;
+	memcpy(p->sa, sa, ETH_ALEN);
+	list_add(&p->list, &rmc->bucket[idx].list);
+	return 0;
+}
+
+/*
+ * Append the mesh management information elements to @skb: supported
+ * rates, extended supported rates (if more than eight rates), DS parameter
+ * set (2 GHz band only), mesh ID, mesh configuration and finally the
+ * extra IEs stored in sdata->u.mesh.ie, if any.
+ *
+ * Only the last copy checks skb_tailroom(); the other skb_put() calls
+ * presumably rely on the caller having sized the skb -- TODO confirm.
+ */
+void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_supported_band *sband;
+	u8 *pos;
+	int len, i, rate;
+	u8 neighbors;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	/* at most eight rates go into the Supported Rates element */
+	len = sband->n_bitrates;
+	if (len > 8)
+		len = 8;
+	pos = skb_put(skb, len + 2);
+	*pos++ = WLAN_EID_SUPP_RATES;
+	*pos++ = len;
+	for (i = 0; i < len; i++) {
+		rate = sband->bitrates[i].bitrate;
+		*pos++ = (u8) (rate / 5);
+	}
+
+	/* the remaining rates go into Extended Supported Rates */
+	if (sband->n_bitrates > len) {
+		pos = skb_put(skb, sband->n_bitrates - len + 2);
+		*pos++ = WLAN_EID_EXT_SUPP_RATES;
+		*pos++ = sband->n_bitrates - len;
+		for (i = len; i < sband->n_bitrates; i++) {
+			rate = sband->bitrates[i].bitrate;
+			*pos++ = (u8) (rate / 5);
+		}
+	}
+
+	if (sband->band == IEEE80211_BAND_2GHZ) {
+		pos = skb_put(skb, 2 + 1);
+		*pos++ = WLAN_EID_DS_PARAMS;
+		*pos++ = 1;
+		*pos++ = ieee80211_frequency_to_channel(local->hw.conf.channel->center_freq);
+	}
+
+	pos = skb_put(skb, 2 + sdata->u.mesh.mesh_id_len);
+	*pos++ = WLAN_EID_MESH_ID;
+	*pos++ = sdata->u.mesh.mesh_id_len;
+	if (sdata->u.mesh.mesh_id_len)
+		memcpy(pos, sdata->u.mesh.mesh_id, sdata->u.mesh.mesh_id_len);
+
+	pos = skb_put(skb, 2 + sizeof(struct ieee80211_meshconf_ie));
+	*pos++ = WLAN_EID_MESH_CONFIG;
+	*pos++ = sizeof(struct ieee80211_meshconf_ie);
+
+	/* Active path selection protocol ID */
+	*pos++ = sdata->u.mesh.mesh_pp_id;
+
+	/* Active path selection metric ID   */
+	*pos++ = sdata->u.mesh.mesh_pm_id;
+
+	/* Congestion control mode identifier */
+	*pos++ = sdata->u.mesh.mesh_cc_id;
+
+	/* Synchronization protocol identifier */
+	*pos++ = sdata->u.mesh.mesh_sp_id;
+
+	/* Authentication Protocol identifier */
+	*pos++ = sdata->u.mesh.mesh_auth_id;
+
+	/* Mesh Formation Info - number of neighbors */
+	neighbors = atomic_read(&sdata->u.mesh.mshstats.estab_plinks);
+	/* Number of neighbor mesh STAs or 15 whichever is smaller */
+	neighbors = (neighbors > 15) ? 15 : neighbors;
+	*pos++ = neighbors << 1;
+
+	/* Mesh capability */
+	sdata->u.mesh.accepting_plinks = mesh_plink_availables(sdata);
+	*pos = MESHCONF_CAPAB_FORWARDING;
+	*pos++ |= sdata->u.mesh.accepting_plinks ?
+	    MESHCONF_CAPAB_ACCEPT_PLINKS : 0x00;
+	/*
+	 * NOTE(review): this is the eighth byte written into the element
+	 * body, while skb_put() above reserved
+	 * sizeof(struct ieee80211_meshconf_ie) bytes -- confirm that the
+	 * struct really is eight bytes long.
+	 */
+	*pos++ = 0x00;
+
+	/* extra IEs are appended only if they fit into the tailroom */
+	if (sdata->u.mesh.ie) {
+		int len = sdata->u.mesh.ie_len;
+		const u8 *data = sdata->u.mesh.ie;
+		if (skb_tailroom(skb) > len)
+			memcpy(skb_put(skb, len), data, len);
+	}
+}
+
+
+/*
+ * Mesh path timer callback; @data is the mesh interface's
+ * struct ieee80211_sub_if_data pointer. Queues sdata->work, or, while the
+ * device is quiescing, only records the timer as running.
+ */
+static void ieee80211_mesh_path_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata =
+		(struct ieee80211_sub_if_data *) data;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_local *local = sdata->local;
+
+	if (local->quiescing) {
+		set_bit(TMR_RUNNING_MP, &ifmsh->timers_running);
+		return;
+	}
+
+	ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+/*
+ * Mesh root path timer callback; @data is the mesh interface's
+ * struct ieee80211_sub_if_data pointer. Flags root work and queues
+ * sdata->work, or, while the device is quiescing, only records the timer
+ * as running.
+ */
+static void ieee80211_mesh_path_root_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata =
+		(struct ieee80211_sub_if_data *) data;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_local *local = sdata->local;
+
+	set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+
+	if (local->quiescing) {
+		set_bit(TMR_RUNNING_MPR, &ifmsh->timers_running);
+		return;
+	}
+
+	ieee80211_queue_work(&local->hw, &sdata->work);
+}
+
+/*
+ * Apply the configured HWMP root mode: with root mode enabled the root
+ * work flag is set; otherwise the flag is cleared and the root timer is
+ * stopped.
+ */
+void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh)
+{
+	if (!ifmsh->mshcfg.dot11MeshHWMPRootMode) {
+		clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+		/* stop running timer */
+		del_timer_sync(&ifmsh->mesh_path_root_timer);
+		return;
+	}
+
+	set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags);
+}
+
+/**
+ * ieee80211_fill_mesh_addresses - fill addresses of a locally originated mesh frame
+ * @hdr:	802.11 frame header
+ * @fc:		frame control field
+ * @meshda:	destination address in the mesh
+ * @meshsa:	source address in the mesh.  Same as TA, as frame is
+ *              locally originated.
+ *
+ * Return the length of the 802.11 header (does not include a mesh control
+ * header): 24 for a multicast destination, 30 otherwise.
+ */
+int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc,
+				  const u8 *meshda, const u8 *meshsa)
+{
+	if (!is_multicast_ether_addr(meshda)) {
+		/* four addresses: RA TA DA SA */
+		*fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS |
+				   IEEE80211_FCTL_TODS);
+		memset(hdr->addr1, 0, ETH_ALEN);   /* RA is resolved later */
+		memcpy(hdr->addr2, meshsa, ETH_ALEN);
+		memcpy(hdr->addr3, meshda, ETH_ALEN);
+		memcpy(hdr->addr4, meshsa, ETH_ALEN);
+		return 30;
+	}
+
+	/* three addresses: DA TA SA */
+	*fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+	memcpy(hdr->addr1, meshda, ETH_ALEN);
+	memcpy(hdr->addr2, meshsa, ETH_ALEN);
+	memcpy(hdr->addr3, meshsa, ETH_ALEN);
+	return 24;
+}
+
+/**
+ * ieee80211_new_mesh_header - create a new mesh header
+ * @meshhdr:    uninitialized mesh header
+ * @sdata:	mesh interface to be used
+ * @addr4or5:   1st address in the ae header, which may correspond to address 4
+ *              (if addr6 is NULL) or address 5 (if addr6 is present). It may
+ *              be NULL.
+ * @addr6:	2nd address in the ae header, which corresponds to addr6 of the
+ *              mesh frame
+ *
+ * Return the header length.
+ */
+int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr,
+		struct ieee80211_sub_if_data *sdata, char *addr4or5,
+		char *addr6)
+{
+	int ae_bytes = 0;
+
+	BUG_ON(!addr4or5 && addr6);
+	memset(meshhdr, 0, sizeof(*meshhdr));
+	meshhdr->ttl = sdata->u.mesh.mshcfg.dot11MeshTTL;
+	put_unaligned(cpu_to_le32(sdata->u.mesh.mesh_seqnum), &meshhdr->seqnum);
+	sdata->u.mesh.mesh_seqnum++;
+	if (addr4or5) {
+		/* one extension address means A4, two mean A5 + A6 */
+		ae_bytes = addr6 ? 2 * ETH_ALEN : ETH_ALEN;
+		meshhdr->flags |= addr6 ? MESH_FLAGS_AE_A5_A6 :
+					  MESH_FLAGS_AE_A4;
+		memcpy(meshhdr->eaddr1, addr4or5, ETH_ALEN);
+		if (addr6)
+			memcpy(meshhdr->eaddr2, addr6, ETH_ALEN);
+	}
+	return 6 + ae_bytes;
+}
+
+static void ieee80211_mesh_housekeeping(struct ieee80211_sub_if_data *sdata,
+			   struct ieee80211_if_mesh *ifmsh)
+{
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	printk(KERN_DEBUG "%s: running mesh housekeeping\n",
+	       sdata->name);
+#endif
+
+	/* expire stale stations and mesh paths */
+	ieee80211_sta_expire(sdata, IEEE80211_MESH_PEER_INACTIVITY_LIMIT);
+	mesh_path_expire(sdata);
+
+	/* notify a beacon change when plink availability no longer matches */
+	if (mesh_plink_availables(sdata) != sdata->u.mesh.accepting_plinks)
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+
+	/* re-arm for the next housekeeping interval */
+	mod_timer(&ifmsh->housekeeping_timer,
+		  round_jiffies(jiffies + IEEE80211_MESH_HOUSEKEEPING_INTERVAL));
+}
+
+static void ieee80211_mesh_rootpath(struct ieee80211_sub_if_data *sdata)
+{
+	/* transmit a root frame, then re-arm the root timer for the next one */
+	mesh_path_tx_root_frame(sdata);
+
+	mod_timer(&sdata->u.mesh.mesh_path_root_timer,
+		  round_jiffies(jiffies + IEEE80211_MESH_RANN_INTERVAL));
+}
+
+#ifdef CONFIG_PM
+void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *mesh = &sdata->u.mesh;
+
+	/* a still-pending timer is flagged so that restart re-adds it;
+	 * atomic bitops because the timer handlers set these bits as well */
+	if (del_timer_sync(&mesh->housekeeping_timer))
+		set_bit(TMR_RUNNING_HK, &mesh->timers_running);
+	if (del_timer_sync(&mesh->mesh_path_timer))
+		set_bit(TMR_RUNNING_MP, &mesh->timers_running);
+	if (del_timer_sync(&mesh->mesh_path_root_timer))
+		set_bit(TMR_RUNNING_MPR, &mesh->timers_running);
+}
+
+void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata)
+{
+	/* re-add each timer whose "running" flag is set, clearing the flag,
+	 * then re-apply the root mode setup */
+	if (test_and_clear_bit(TMR_RUNNING_HK, &sdata->u.mesh.timers_running))
+		add_timer(&sdata->u.mesh.housekeeping_timer);
+	if (test_and_clear_bit(TMR_RUNNING_MP, &sdata->u.mesh.timers_running))
+		add_timer(&sdata->u.mesh.mesh_path_timer);
+	if (test_and_clear_bit(TMR_RUNNING_MPR, &sdata->u.mesh.timers_running))
+		add_timer(&sdata->u.mesh.mesh_path_root_timer);
+	ieee80211_mesh_root_setup(&sdata->u.mesh);
+}
+#endif
+
+void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_local *local = sdata->local;
+
+	local->fif_other_bss++;
+	/* mesh ifaces must set allmulti to forward mcast traffic */
+	atomic_inc(&local->iff_allmultis);
+	ieee80211_configure_filter(local);
+	/* reset the mesh IDs, flag housekeeping work and queue the work item */
+	ifmsh->mesh_cc_id = 0;	/* Disabled */
+	ifmsh->mesh_sp_id = 0;	/* Neighbor Offset */
+	ifmsh->mesh_auth_id = 0;	/* Disabled */
+	set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
+	ieee80211_mesh_root_setup(ifmsh);
+	ieee80211_queue_work(&local->hw, &sdata->work);
+	sdata->vif.bss_conf.beacon_int = MESH_DEFAULT_BEACON_INTERVAL;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON |
+						BSS_CHANGED_BEACON_ENABLED |
+						BSS_CHANGED_BEACON_INT);
+}
+
+void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+	ifmsh->mesh_id_len = 0;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+	sta_info_flush(local, NULL);
+
+	/* the path timer queues sdata->work too, so stop it with the others */
+	del_timer_sync(&ifmsh->housekeeping_timer);
+	del_timer_sync(&ifmsh->mesh_path_root_timer);
+	del_timer_sync(&ifmsh->mesh_path_timer);
+	/*
+	 * If a timer fired while we waited for it, it will have requeued
+	 * the work. The work may then run again, but is not expected to
+	 * rearm the timers because the interface is no longer running.
+	 */
+	cancel_work_sync(&sdata->work);
+
+	local->fif_other_bss--;
+	atomic_dec(&local->iff_allmultis);
+	ieee80211_configure_filter(local);
+}
+
+static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
+					u16 stype,
+					struct ieee80211_mgmt *mgmt,
+					size_t len,
+					struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee802_11_elems elems;
+	struct ieee80211_channel *channel;
+	u32 supp_rates = 0;
+	size_t baselen;
+	int freq;
+	enum ieee80211_band band = rx_status->band;
+
+	/* ignore ProbeResp to foreign address */
+	if (stype == IEEE80211_STYPE_PROBE_RESP &&
+	    compare_ether_addr(mgmt->da, sdata->vif.addr))
+		return;
+	/* drop frames too short to reach the variable part */
+	baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+	if (baselen > len)
+		return;
+
+	ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
+			       &elems);
+
+	/* ignore beacons from secure mesh peers if our security is off */
+	if (elems.rsn_len && sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE)
+		return;
+	/* frequency from a 1-byte DS params element, else from the RX status */
+	if (elems.ds_params && elems.ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems.ds_params[0], band);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(local->hw.wiphy, freq);
+	/* drop frames on unknown or disabled channels */
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return;
+	/* neighbour update only if mesh ID + config match (mesh_matches_local) */
+	if (elems.mesh_id && elems.mesh_config &&
+	    mesh_matches_local(&elems, sdata)) {
+		supp_rates = ieee80211_sta_get_rates(local, &elems, band);
+		mesh_neighbour_update(mgmt->sa, supp_rates, sdata, &elems);
+	}
+}
+
+static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
+					  struct ieee80211_mgmt *mgmt,
+					  size_t len,
+					  struct ieee80211_rx_status *rx_status)
+{
+	/*
+	 * Dispatch on the action category; frames of any other
+	 * category are silently ignored.
+	 */
+	if (mgmt->u.action.category == WLAN_CATEGORY_MESH_ACTION)
+		mesh_rx_plink_frame(sdata, mgmt, len, rx_status);
+	else if (mgmt->u.action.category == WLAN_CATEGORY_MESH_PATH_SEL)
+		mesh_rx_path_sel_frame(sdata, mgmt, len);
+}
+
+void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) skb->data;
+	struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
+	u16 stype;
+
+	stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+	switch (stype) {
+	case IEEE80211_STYPE_BEACON:
+	case IEEE80211_STYPE_PROBE_RESP:
+		ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len,
+					    rx_status);
+		break;
+	case IEEE80211_STYPE_ACTION:
+		ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status);
+		break;
+	default:
+		/* other management subtypes are not handled here */
+		break;
+	}
+}
+
+void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+
+	if (ifmsh->preq_queue_len &&
+	    time_after(jiffies,
+		       ifmsh->last_preq + msecs_to_jiffies(ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval)))
+		mesh_path_start_discovery(sdata);
+	/* deferred tasks: each runs only if its flag was set, clearing it */
+	if (test_and_clear_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags))
+		mesh_mpath_table_grow();
+
+	if (test_and_clear_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags))
+		mesh_mpp_table_grow();
+
+	if (test_and_clear_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags))
+		ieee80211_mesh_housekeeping(sdata, ifmsh);
+
+	if (test_and_clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags))
+		ieee80211_mesh_rootpath(sdata);
+}
+
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *iface;
+
+	rcu_read_lock();	/* the interface list is walked under RCU */
+	list_for_each_entry_rcu(iface, &local->interfaces, list)
+		if (ieee80211_vif_is_mesh(&iface->vif))
+			ieee80211_queue_work(&local->hw, &iface->work);
+	rcu_read_unlock();
+}
+
+void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	unsigned long cookie = (unsigned long) sdata;
+
+	/* all three mesh timers get the sub-interface as their argument */
+	setup_timer(&ifmsh->housekeeping_timer,
+		    ieee80211_mesh_housekeeping_timer, cookie);
+	setup_timer(&ifmsh->mesh_path_timer,
+		    ieee80211_mesh_path_timer, cookie);
+	setup_timer(&ifmsh->mesh_path_root_timer,
+		    ieee80211_mesh_path_root_timer, cookie);
+
+	ifmsh->accepting_plinks = true;
+	ifmsh->preq_id = 0;
+	ifmsh->sn = 0;
+	atomic_set(&ifmsh->mpaths, 0);
+	mesh_rmc_init(sdata);
+	ifmsh->last_preq = jiffies;
+
+	/* Allocate all mesh structures when creating the first mesh interface. */
+	if (!mesh_allocated)
+		ieee80211s_init();
+	INIT_LIST_HEAD(&ifmsh->preq_queue.list);
+	spin_lock_init(&ifmsh->mesh_preq_queue_lock);
+}
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
new file mode 100644
index 00000000..249e7333
--- /dev/null
+++ b/net/mac80211/mesh.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Authors:    Luis Carlos Cobo <luisca@cozybit.com>
+ *             Javier Cardona <javier@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211S_H
+#define IEEE80211S_H
+
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <asm/unaligned.h>
+#include "ieee80211_i.h"
+
+
+/* Data structures */
+
+/**
+ * enum mesh_path_flags - mac80211 mesh path flags
+ *
+ *
+ *
+ * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding
+ * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path
+ * @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence
+ * 	number
+ * @MESH_PATH_FIXED: the mesh path has been manually set and should not be
+ * 	modified
+ * @MESH_PATH_RESOLVED: the mesh path has been resolved
+ *
+ * MESH_PATH_RESOLVED is used by the mesh path timer to
+ * decide when to stop or cancel the mesh path discovery.
+ */
+enum mesh_path_flags {
+	MESH_PATH_ACTIVE =	BIT(0),
+	MESH_PATH_RESOLVING =	BIT(1),
+	MESH_PATH_SN_VALID =	BIT(2),
+	MESH_PATH_FIXED	=	BIT(3),
+	MESH_PATH_RESOLVED =	BIT(4),
+};
+
+/**
+ * enum mesh_deferred_task_flags - mac80211 mesh deferred tasks
+ *
+ *
+ *
+ * @MESH_WORK_HOUSEKEEPING: run the periodic mesh housekeeping tasks
+ * @MESH_WORK_GROW_MPATH_TABLE: the mesh path table is full and needs
+ * to grow.
+ * @MESH_WORK_GROW_MPP_TABLE: the mesh portals table is full and needs to
+ * grow
+ * @MESH_WORK_ROOT: the mesh root station needs to send a frame
+ */
+enum mesh_deferred_task_flags {
+	MESH_WORK_HOUSEKEEPING,
+	MESH_WORK_GROW_MPATH_TABLE,
+	MESH_WORK_GROW_MPP_TABLE,
+	MESH_WORK_ROOT,
+};
+
+/**
+ * struct mesh_path - mac80211 mesh path structure
+ * @dst: mesh path destination mac address
+ * @mpp: mac address used for MPP or MAP
+ * @sdata: mesh subif
+ * @next_hop: mesh neighbor to which frames for this destination will be
+ * 	forwarded
+ * @timer: mesh path discovery timer
+ * @frame_queue: pending queue for frames sent to this destination while the
+ * 	path is unresolved
+ * @rcu: RCU head (presumably for deferred freeing of the path; confirm)
+ * @sn: target sequence number
+ * @metric: current metric to this destination
+ * @hop_count: hops to destination
+ * @exp_time: in jiffies, when the path will expire or when it expired
+ * @discovery_timeout: timeout (lapse in jiffies) used for the last discovery
+ * 	retry
+ * @discovery_retries: number of discovery retries
+ * @flags: mesh path flags, as specified on &enum mesh_path_flags
+ * @state_lock: mesh path state lock
+ *
+ * The combination of dst and sdata is unique in the mesh path table. Since the
+ * next_hop STA is only protected by RCU as well, deleting the STA must also
+ * remove/substitute the mesh_path structure and wait until that is no longer
+ * reachable before destroying the STA completely.
+ */
+struct mesh_path {
+	u8 dst[ETH_ALEN];
+	u8 mpp[ETH_ALEN];	/* used for MPP or MAP */
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info __rcu *next_hop;
+	struct timer_list timer;
+	struct sk_buff_head frame_queue;
+	struct rcu_head rcu;
+	u32 sn;
+	u32 metric;
+	u8 hop_count;
+	unsigned long exp_time;
+	u32 discovery_timeout;
+	u8 discovery_retries;
+	enum mesh_path_flags flags;
+	spinlock_t state_lock;
+};
+
+/**
+ * struct mesh_table
+ *
+ * @hash_buckets: array of hash buckets of the table
+ * @hashwlock: array of locks to protect write operations, one per bucket
+ * @hash_mask: 2^size_order - 1, used to compute hash idx
+ * @hash_rnd: random value used for hash computations
+ * @entries: number of entries in the table
+ * @free_node: function to free nodes of the table
+ * @copy_node: function to copy nodes of the table
+ * @size_order: determines size of the table, there will be 2^size_order hash
+ *	buckets
+ * @mean_chain_len: maximum average length for the hash buckets' list, if it is
+ *	reached, the table will grow
+ * @rcu_head: RCU head to free the table
+ */
+struct mesh_table {
+	/* Number of buckets will be 2^N */
+	struct hlist_head *hash_buckets;
+	spinlock_t *hashwlock;		/* One per bucket, for add/del */
+	unsigned int hash_mask;		/* (2^size_order) - 1 */
+	__u32 hash_rnd;			/* Used for hash generation */
+	atomic_t entries;		/* Up to MAX_MESH_NEIGHBOURS */
+	void (*free_node) (struct hlist_node *p, bool free_leafs);
+	int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl);
+	int size_order;
+	int mean_chain_len;
+
+	struct rcu_head rcu_head;
+};
+
+/* Recent multicast cache */
+/* RMC_BUCKETS must be a power of 2, maximum 256 */
+#define RMC_BUCKETS		256
+#define RMC_QUEUE_MAX_LEN	4
+#define RMC_TIMEOUT		(3 * HZ)
+
+/**
+ * struct rmc_entry - entry in the Recent Multicast Cache
+ *
+ * @seqnum: mesh sequence number of the frame
+ * @exp_time: expiration time of the entry, in jiffies
+ * @sa: source address of the frame
+ *
+ * The Recent Multicast Cache keeps track of the latest multicast frames that
+ * have been received by a mesh interface and discards received multicast frames
+ * that are found in the cache.
+ */
+struct rmc_entry {
+	struct list_head list;
+	u32 seqnum;
+	unsigned long exp_time;
+	u8 sa[ETH_ALEN];
+};
+
+struct mesh_rmc {
+	struct rmc_entry bucket[RMC_BUCKETS];
+	u32 idx_mask;
+};
+
+
+#define MESH_DEFAULT_BEACON_INTERVAL		1000 	/* in 1024 us units */
+
+#define MESH_PATH_EXPIRE (600 * HZ)
+
+/* Default maximum number of plinks per interface */
+#define MESH_MAX_PLINKS		256
+
+/* Maximum number of paths per interface */
+#define MESH_MAX_MPATHS		1024
+
+/* Pending ANA approval */
+#define MESH_PATH_SEL_ACTION	0
+
+/* PERR reason codes */
+#define PEER_RCODE_UNSPECIFIED  11
+#define PERR_RCODE_NO_ROUTE     12
+#define PERR_RCODE_DEST_UNREACH 13
+
+/* Public interfaces */
+/* Various */
+int ieee80211_fill_mesh_addresses(struct ieee80211_hdr *hdr, __le16 *fc,
+				  const u8 *da, const u8 *sa);
+int ieee80211_new_mesh_header(struct ieee80211s_hdr *meshhdr,
+		struct ieee80211_sub_if_data *sdata, char *addr4or5,
+		char *addr6);
+int mesh_rmc_check(u8 *addr, struct ieee80211s_hdr *mesh_hdr,
+		struct ieee80211_sub_if_data *sdata);
+bool mesh_matches_local(struct ieee802_11_elems *ie,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_ids_set_default(struct ieee80211_if_mesh *mesh);
+void mesh_mgmt_ies_add(struct sk_buff *skb,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
+int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
+void ieee80211s_init(void);
+void ieee80211s_update_metric(struct ieee80211_local *local,
+		struct sta_info *stainfo, struct sk_buff *skb);
+void ieee80211s_stop(void);
+void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
+void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
+void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh);
+
+/* Mesh paths */
+int mesh_nexthop_lookup(struct sk_buff *skb,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mesh_path_lookup(u8 *dst,
+		struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mpp_path_lookup(u8 *dst,
+				  struct ieee80211_sub_if_data *sdata);
+int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata);
+struct mesh_path *mesh_path_lookup_by_idx(int idx,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop);
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata);
+void mesh_path_flush(struct ieee80211_sub_if_data *sdata);
+void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
+		struct ieee80211_mgmt *mgmt, size_t len);
+int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata);
+/* Mesh plinks */
+void mesh_neighbour_update(u8 *hw_addr, u32 rates,
+		struct ieee80211_sub_if_data *sdata,
+		struct ieee802_11_elems *ie);
+bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie);
+void mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata);
+void mesh_plink_broken(struct sta_info *sta);
+void mesh_plink_deactivate(struct sta_info *sta);
+int mesh_plink_open(struct sta_info *sta);
+void mesh_plink_block(struct sta_info *sta);
+void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
+			 struct ieee80211_mgmt *mgmt, size_t len,
+			 struct ieee80211_rx_status *rx_status);
+
+/* Private interfaces */
+/* Mesh tables */
+void mesh_mpath_table_grow(void);
+void mesh_mpp_table_grow(void);
+/* Mesh paths */
+int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, __le16 target_rcode,
+		       const u8 *ra, struct ieee80211_sub_if_data *sdata);
+void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta);
+void mesh_path_flush_pending(struct mesh_path *mpath);
+void mesh_path_tx_pending(struct mesh_path *mpath);
+int mesh_pathtbl_init(void);
+void mesh_pathtbl_unregister(void);
+int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata);
+void mesh_path_timer(unsigned long data);
+void mesh_path_flush_by_nexthop(struct sta_info *sta);
+void mesh_path_discard_frame(struct sk_buff *skb,
+		struct ieee80211_sub_if_data *sdata);
+void mesh_path_quiesce(struct ieee80211_sub_if_data *sdata);
+void mesh_path_restart(struct ieee80211_sub_if_data *sdata);
+void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata);
+
+extern int mesh_paths_generation;
+
+#ifdef CONFIG_MAC80211_MESH
+extern int mesh_allocated;
+
+static inline int mesh_plink_free_count(struct ieee80211_sub_if_data *sdata)
+{
+	return sdata->u.mesh.mshcfg.dot11MeshMaxPeerLinks -
+	       atomic_read(&sdata->u.mesh.mshstats.estab_plinks);
+}
+
+static inline bool mesh_plink_availables(struct ieee80211_sub_if_data *sdata)
+{
+	return mesh_plink_free_count(sdata) > 0 &&
+	       (long)(MESH_MAX_PLINKS - sdata->local->num_sta) > 0;
+}
+
+static inline void mesh_path_activate(struct mesh_path *mpath)
+{
+	mpath->flags |= MESH_PATH_ACTIVE | MESH_PATH_RESOLVED;
+}
+
+static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
+{
+	return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP;
+}
+
+void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
+
+void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata);
+void mesh_plink_quiesce(struct sta_info *sta);
+void mesh_plink_restart(struct sta_info *sta);
+#else
+#define mesh_allocated	0
+static inline void
+ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
+static inline void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_mesh_restart(struct ieee80211_sub_if_data *sdata)
+{}
+static inline void mesh_plink_quiesce(struct sta_info *sta) {}
+static inline void mesh_plink_restart(struct sta_info *sta) {}
+static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
+{ return false; }
+#endif
+
+#endif /* IEEE80211S_H */
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
new file mode 100644
index 00000000..2b180530
--- /dev/null
+++ b/net/mac80211/mesh_hwmp.c
@@ -0,0 +1,1016 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author:     Luis Carlos Cobo <luisca@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include "mesh.h"
+
+#ifdef CONFIG_MAC80211_VERBOSE_MHWMP_DEBUG
+#define mhwmp_dbg(fmt, args...)   printk(KERN_DEBUG "Mesh HWMP: " fmt, ##args)
+#else
+#define mhwmp_dbg(fmt, args...)   do { (void)(0); } while (0)
+#endif
+
+#define TEST_FRAME_LEN	8192
+#define MAX_METRIC	0xffffffff
+#define ARITH_SHIFT	8
+
+/* Number of frames buffered per destination for unresolved destinations */
+#define MESH_FRAME_QUEUE_LEN	10
+#define MAX_PREQ_QUEUE_LEN	64
+
+/* Destination only */
+#define MP_F_DO	0x1
+/* Reply and forward */
+#define MP_F_RF	0x2
+/* Unknown Sequence Number */
+#define MP_F_USN    0x01
+/* Reason code Present */
+#define MP_F_RCODE  0x02
+
+static void mesh_queue_preq(struct mesh_path *, u8);
+
+static inline u32 u32_field_get(u8 *preq_elem, int offset, bool ae)
+{
+	/* with the address extension flag (ae) set the field is read
+	 * 6 bytes further into the element */
+	return get_unaligned_le32(preq_elem + offset + (ae ? 6 : 0));
+}
+
+/* Read the unaligned little-endian u16 at @offset of the element; when @ae
+ * is set the field is taken 6 bytes further on. */
+static inline u16 u16_field_get(u8 *preq_elem, int offset, bool ae)
+{
+	return get_unaligned_le16(preq_elem + offset + (ae ? 6 : 0));
+}
+
+/* HWMP IE processing macros: plain expressions, no trailing ';' */
+#define AE_F			(1<<6)
+#define AE_F_SET(x)		(*(x) & AE_F)
+#define PREQ_IE_FLAGS(x)	(*(x))
+#define PREQ_IE_HOPCOUNT(x)	(*((x) + 1))
+#define PREQ_IE_TTL(x)		(*((x) + 2))
+#define PREQ_IE_PREQ_ID(x)	u32_field_get(x, 3, 0)
+#define PREQ_IE_ORIG_ADDR(x)	((x) + 7)
+#define PREQ_IE_ORIG_SN(x)	u32_field_get(x, 13, 0)
+#define PREQ_IE_LIFETIME(x)	u32_field_get(x, 17, AE_F_SET(x))
+#define PREQ_IE_METRIC(x)	u32_field_get(x, 21, AE_F_SET(x))
+#define PREQ_IE_TARGET_F(x)	(*(AE_F_SET(x) ? (x) + 32 : (x) + 26))
+#define PREQ_IE_TARGET_ADDR(x)	(AE_F_SET(x) ? (x) + 33 : (x) + 27)
+#define PREQ_IE_TARGET_SN(x)	u32_field_get(x, 33, AE_F_SET(x))
+
+
+#define PREP_IE_FLAGS(x)	PREQ_IE_FLAGS(x)
+#define PREP_IE_HOPCOUNT(x)	PREQ_IE_HOPCOUNT(x)
+#define PREP_IE_TTL(x)		PREQ_IE_TTL(x)
+#define PREP_IE_ORIG_ADDR(x)	((x) + 3)
+#define PREP_IE_ORIG_SN(x)	u32_field_get(x, 9, 0)
+#define PREP_IE_LIFETIME(x)	u32_field_get(x, 13, AE_F_SET(x))
+#define PREP_IE_METRIC(x)	u32_field_get(x, 17, AE_F_SET(x))
+#define PREP_IE_TARGET_ADDR(x)	(AE_F_SET(x) ? (x) + 27 : (x) + 21)
+#define PREP_IE_TARGET_SN(x)	u32_field_get(x, 27, AE_F_SET(x))
+
+#define PERR_IE_TTL(x)		(*(x))
+#define PERR_IE_TARGET_FLAGS(x)	(*((x) + 2))
+#define PERR_IE_TARGET_ADDR(x)	((x) + 3)
+#define PERR_IE_TARGET_SN(x)	u32_field_get(x, 9, 0)
+#define PERR_IE_TARGET_RCODE(x)	u16_field_get(x, 13, 0)
+
+#define MSEC_TO_TU(x) ((x) * 1000 / 1024)	/* ms to 1024 us time units */
+#define SN_GT(x, y) ((s32)((y) - (x)) < 0)	/* wrap-safe for 32-bit SNs */
+#define SN_LT(x, y) ((s32)((x) - (y)) < 0)
+
+#define net_traversal_jiffies(s) \
+	msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime)
+#define default_lifetime(s) \
+	MSEC_TO_TU(s->u.mesh.mshcfg.dot11MeshHWMPactivePathTimeout)
+#define min_preq_int_jiff(s) \
+	(msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPpreqMinInterval))
+#define max_preq_retries(s) (s->u.mesh.mshcfg.dot11MeshHWMPmaxPREQretries)
+#define disc_timeout_jiff(s) \
+	msecs_to_jiffies((s)->u.mesh.mshcfg.min_discovery_timeout) /* of @s */
+
+enum mpath_frame_type {
+	MPATH_PREQ = 0,
+	MPATH_PREP,
+	MPATH_PERR,
+	MPATH_RANN
+};
+
+static const u8 broadcast_addr[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+/* Build and send one PREQ, PREP or RANN path selection action frame.
+ * Returns 0, -ENOMEM without an skb, -ENOTSUPP for any other action. */
+static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags,
+		u8 *orig_addr, __le32 orig_sn, u8 target_flags, u8 *target,
+		__le32 target_sn, const u8 *da, u8 hop_count, u8 ttl,
+		__le32 lifetime, __le32 metric, __le32 preq_id,
+		struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
+	struct ieee80211_mgmt *mgmt;
+	u8 *pos;
+	int ie_len;
+
+	if (!skb)
+		return -ENOMEM;
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	/* 25 is the size of the common mgmt part (24) plus the size of the
+	 * common action part (1) */
+	mgmt = (struct ieee80211_mgmt *)
+		skb_put(skb, 25 + sizeof(mgmt->u.action.u.mesh_action));
+	memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.mesh_action));
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	/* BSSID == SA */
+	memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+	mgmt->u.action.category = WLAN_CATEGORY_MESH_PATH_SEL;
+	mgmt->u.action.u.mesh_action.action_code = MESH_PATH_SEL_ACTION;
+
+	switch (action) {
+	case MPATH_PREQ:
+		mhwmp_dbg("sending PREQ to %pM\n", target);
+		ie_len = 37;
+		pos = skb_put(skb, 2 + ie_len);
+		*pos++ = WLAN_EID_PREQ;
+		break;
+	case MPATH_PREP:
+		mhwmp_dbg("sending PREP to %pM\n", target);
+		ie_len = 31;
+		pos = skb_put(skb, 2 + ie_len);
+		*pos++ = WLAN_EID_PREP;
+		break;
+	case MPATH_RANN:
+		mhwmp_dbg("sending RANN from %pM\n", orig_addr);
+		ie_len = sizeof(struct ieee80211_rann_ie);
+		pos = skb_put(skb, 2 + ie_len);
+		*pos++ = WLAN_EID_RANN;
+		break;
+	default:
+		kfree_skb(skb);
+		return -ENOTSUPP;
+	}
+	*pos++ = ie_len;
+	*pos++ = flags;
+	*pos++ = hop_count;
+	*pos++ = ttl;
+	if (action == MPATH_PREQ) {
+		memcpy(pos, &preq_id, 4);
+		pos += 4;
+	}
+	memcpy(pos, orig_addr, ETH_ALEN);
+	pos += ETH_ALEN;
+	memcpy(pos, &orig_sn, 4);
+	pos += 4;
+	if (action != MPATH_RANN) {
+		memcpy(pos, &lifetime, 4);
+		pos += 4;
+	}
+	memcpy(pos, &metric, 4);
+	pos += 4;
+	if (action == MPATH_PREQ) {
+		/* destination count */
+		*pos++ = 1;
+		*pos++ = target_flags;
+	}
+	if (action != MPATH_RANN) {
+		memcpy(pos, target, ETH_ALEN);
+		pos += ETH_ALEN;
+		memcpy(pos, &target_sn, 4);
+	}
+
+	ieee80211_tx_skb(sdata, skb);
+	return 0;
+}
+
+/**
+ * mesh_path_error_tx - Sends a PERR mesh management frame
+ * @ttl: time to live, copied into the PERR element
+ * @target: broken destination
+ * @target_sn: SN of the broken destination
+ * @target_rcode: reason code for this PERR
+ * @ra: node this frame is addressed to
+ * @sdata: local mesh subif the frame is sent from
+ *
+ * Returns 0 on success, -ENOMEM if no skb could be allocated.
+ */
+int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn,
+		       __le16 target_rcode, const u8 *ra,
+		       struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
+	struct ieee80211_mgmt *mgmt;
+	u8 *pos;
+	int ie_len;
+
+	if (!skb)
+		return -ENOMEM;
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	/* 25 is the size of the common mgmt part (24) plus the size of the
+	 * common action part (1) */
+	mgmt = (struct ieee80211_mgmt *)
+		skb_put(skb, 25 + sizeof(mgmt->u.action.u.mesh_action));
+	memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.mesh_action));
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+
+	memcpy(mgmt->da, ra, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	/* BSSID is left zeroed, wildcard value */
+	mgmt->u.action.category = WLAN_CATEGORY_MESH_PATH_SEL;
+	mgmt->u.action.u.mesh_action.action_code = MESH_PATH_SEL_ACTION;
+	ie_len = 15;
+	pos = skb_put(skb, 2 + ie_len);
+	*pos++ = WLAN_EID_PERR;
+	*pos++ = ie_len;
+	/* ttl */
+	*pos++ = ttl;
+	/* number of destinations */
+	*pos++ = 1;
+	/* flags: bit 1 (MP_F_USN) is set when the sequence number is unknown,
+	 * bit 2 (MP_F_RCODE) is set when a reason code is present */
+	*pos = 0;
+	if (!target_sn)
+		*pos |= MP_F_USN;
+	if (target_rcode)
+		*pos |= MP_F_RCODE;
+	pos++;
+	memcpy(pos, target, ETH_ALEN);
+	pos += ETH_ALEN;
+	memcpy(pos, &target_sn, 4);
+	pos += 4;
+	memcpy(pos, &target_rcode, 2);
+
+	ieee80211_tx_skb(sdata, skb);
+	return 0;
+}
+
+void ieee80211s_update_metric(struct ieee80211_local *local,
+		struct sta_info *stainfo, struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	int nack_weight = 0;
+
+	if (!ieee80211_is_data(hdr->frame_control))
+		return;
+
+	if (!(IEEE80211_SKB_CB(skb)->flags & IEEE80211_TX_STAT_ACK))
+		nack_weight = 20;
+
+	/* moving average, scaled to 100: 80% history plus 20 per unacked frame */
+	stainfo->fail_avg = (80 * stainfo->fail_avg + 5) / 100 + nack_weight;
+	if (stainfo->fail_avg > 95)
+		mesh_plink_broken(stainfo);
+}
+
+/* Airtime link metric towards @sta: test frame TX time scaled by the expected
+ * retransmissions; MAX_METRIC if fail_avg >= 100 or the last rate is MCS. */
+static u32 airtime_link_metric_get(struct ieee80211_local *local,
+				   struct sta_info *sta)
+{
+	struct ieee80211_supported_band *sband;
+	/* This should be adjusted for each device */
+	int device_constant = 1 << ARITH_SHIFT;
+	int test_frame_len = TEST_FRAME_LEN << ARITH_SHIFT;
+	int s_unit = 1 << ARITH_SHIFT;
+	int rate, err;
+	u32 tx_time, estimated_retx;
+	u64 result;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	if (sta->fail_avg >= 100)
+		return MAX_METRIC;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)
+		return MAX_METRIC;
+
+	err = (sta->fail_avg << ARITH_SHIFT) / 100;
+	/* bitrate is in units of 100 Kbps, while we need rate in units of
+	 * 1Mbps. This will be corrected on tx_time computation. */
+	rate = sband->bitrates[sta->last_tx_rate.idx].bitrate;
+	tx_time = (device_constant + 10 * test_frame_len / rate);
+	estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err));
+	/* widen first: the u32 product can overflow at low rates */
+	result = ((u64)tx_time * estimated_retx) >> (2 * ARITH_SHIFT);
+	return (u32)result;
+}
+
+/**
+ * hwmp_route_info_get - Update routing info to originator and transmitter
+ *
+ * @sdata: local mesh subif
+ * @mgmt: mesh management frame
+ * @hwmp_ie: hwmp information element (PREP or PREQ)
+ *
+ * This function updates the path routing information to the originator and the
+ * transmitter of a HWMP PREQ or PREP frame.
+ *
+ * Returns: metric to frame originator or 0 if the frame should not be further
+ * processed
+ *
+ * Notes: this function is the only place (besides user-provided info) where
+ * path routing information is updated.
+ */
+static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
+			    struct ieee80211_mgmt *mgmt,
+			    u8 *hwmp_ie, enum mpath_frame_type action)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct mesh_path *mpath;
+	struct sta_info *sta;
+	bool fresh_info;
+	u8 *orig_addr, *ta;
+	u32 orig_sn, orig_metric;
+	unsigned long orig_lifetime, exp_time;
+	u32 last_hop_metric, new_metric;
+	bool process = true;
+
+	/* One RCU read-side section covers the whole function; every return
+	 * path below leaves it.
+	 */
+	rcu_read_lock();
+	sta = sta_info_get(sdata, mgmt->sa);
+	if (!sta) {
+		/* The transmitter is not a station we know: ignore the frame */
+		rcu_read_unlock();
+		return 0;
+	}
+
+	last_hop_metric = airtime_link_metric_get(local, sta);
+	/* Update and check originator routing info */
+	fresh_info = true;
+
+	switch (action) {
+	case MPATH_PREQ:
+		orig_addr = PREQ_IE_ORIG_ADDR(hwmp_ie);
+		orig_sn = PREQ_IE_ORIG_SN(hwmp_ie);
+		orig_lifetime = PREQ_IE_LIFETIME(hwmp_ie);
+		orig_metric = PREQ_IE_METRIC(hwmp_ie);
+		break;
+	case MPATH_PREP:
+		/* Originator here refers to the MP that was the destination in
+		 * the Path Request. The draft refers to that MP as the
+		 * destination address, even though usually it is the origin of
+		 * the PREP frame. We divert from the nomenclature in the draft
+		 * so that we can easily use a single function to gather path
+		 * information from both PREQ and PREP frames.
+		 */
+		orig_addr = PREP_IE_ORIG_ADDR(hwmp_ie);
+		orig_sn = PREP_IE_ORIG_SN(hwmp_ie);
+		orig_lifetime = PREP_IE_LIFETIME(hwmp_ie);
+		orig_metric = PREP_IE_METRIC(hwmp_ie);
+		break;
+	default:
+		rcu_read_unlock();
+		return 0;
+	}
+	/* Metric to the originator through this transmitter; if the u32 sum
+	 * wrapped around, saturate it at MAX_METRIC.
+	 */
+	new_metric = orig_metric + last_hop_metric;
+	if (new_metric < orig_metric)
+		new_metric = MAX_METRIC;
+	exp_time = TU_TO_EXP_TIME(orig_lifetime);
+
+	if (memcmp(orig_addr, sdata->vif.addr, ETH_ALEN) == 0) {
+		/* This MP is the originator, we are not interested in this
+		 * frame, except for updating transmitter's path info.
+		 */
+		process = false;
+		fresh_info = false;
+	} else {
+		mpath = mesh_path_lookup(orig_addr, sdata);
+		if (mpath) {
+			/* state_lock stays held until one of the unlocks in
+			 * the fresh_info test below.
+			 */
+			spin_lock_bh(&mpath->state_lock);
+			if (mpath->flags & MESH_PATH_FIXED)
+				fresh_info = false;
+			else if ((mpath->flags & MESH_PATH_ACTIVE) &&
+			    (mpath->flags & MESH_PATH_SN_VALID)) {
+				/* What we already hold is newer, or equally
+				 * new with a metric at least as good: keep it
+				 * and do not process the frame any further.
+				 */
+				if (SN_GT(mpath->sn, orig_sn) ||
+				    (mpath->sn == orig_sn &&
+				     new_metric >= mpath->metric)) {
+					process = false;
+					fresh_info = false;
+				}
+			}
+		} else {
+			/* No path to the originator yet: try to create one and
+			 * look it up again; give up if it still is not there.
+			 */
+			mesh_path_add(orig_addr, sdata);
+			mpath = mesh_path_lookup(orig_addr, sdata);
+			if (!mpath) {
+				rcu_read_unlock();
+				return 0;
+			}
+			spin_lock_bh(&mpath->state_lock);
+		}
+
+		if (fresh_info) {
+			mesh_path_assign_nexthop(mpath, sta);
+			mpath->flags |= MESH_PATH_SN_VALID;
+			mpath->metric = new_metric;
+			mpath->sn = orig_sn;
+			/* Only ever extend the expiry time, never shorten it */
+			mpath->exp_time = time_after(mpath->exp_time, exp_time)
+					  ?  mpath->exp_time : exp_time;
+			mesh_path_activate(mpath);
+			spin_unlock_bh(&mpath->state_lock);
+			/* Called after state_lock has been released */
+			mesh_path_tx_pending(mpath);
+			/* draft says preq_id should be saved too, but there
+			 * does not seem to be any use for it, skipping by now
+			 */
+		} else
+			spin_unlock_bh(&mpath->state_lock);
+	}
+
+	/* Update and check transmitter routing info */
+	ta = mgmt->sa;
+	if (memcmp(orig_addr, ta, ETH_ALEN) == 0)
+		/* Transmitter is the originator: already handled above */
+		fresh_info = false;
+	else {
+		fresh_info = true;
+
+		mpath = mesh_path_lookup(ta, sdata);
+		if (mpath) {
+			spin_lock_bh(&mpath->state_lock);
+			/* Keep a fixed path, and keep an active path whose
+			 * metric is better than the direct link metric.
+			 */
+			if ((mpath->flags & MESH_PATH_FIXED) ||
+				((mpath->flags & MESH_PATH_ACTIVE) &&
+					(last_hop_metric > mpath->metric)))
+				fresh_info = false;
+		} else {
+			mesh_path_add(ta, sdata);
+			mpath = mesh_path_lookup(ta, sdata);
+			if (!mpath) {
+				rcu_read_unlock();
+				return 0;
+			}
+			spin_lock_bh(&mpath->state_lock);
+		}
+
+		if (fresh_info) {
+			mesh_path_assign_nexthop(mpath, sta);
+			/* The frame carries no sequence number for the
+			 * transmitter, so the stored one is marked invalid.
+			 */
+			mpath->flags &= ~MESH_PATH_SN_VALID;
+			mpath->metric = last_hop_metric;
+			mpath->exp_time = time_after(mpath->exp_time, exp_time)
+					  ?  mpath->exp_time : exp_time;
+			mesh_path_activate(mpath);
+			spin_unlock_bh(&mpath->state_lock);
+			mesh_path_tx_pending(mpath);
+		} else
+			spin_unlock_bh(&mpath->state_lock);
+	}
+
+	rcu_read_unlock();
+
+	/* 0 tells the caller not to process the frame any further */
+	return process ? new_metric : 0;
+}
+
+/*
+ * hwmp_preq_frame_process - reply to and/or forward a received PREQ
+ *
+ * @sdata: local mesh subif
+ * @mgmt: management frame that carried the PREQ; its source address is the
+ *	receiver of a PREP sent in reply
+ * @preq_elem: PREQ information element
+ * @metric: metric supplied by the caller; it is copied unchanged into the
+ *	metric field of a forwarded PREQ
+ *
+ * A PREP is sent when this interface is the PREQ target, or when an active
+ * path to the target with a valid, not older sequence number is known and
+ * the MP_F_DO target flag is clear. The PREQ is re-broadcast unless this
+ * interface is the target, or an intermediate reply was sent and MP_F_RF is
+ * clear.
+ *
+ * The metric reported in the PREP (0 when we are the target, the stored path
+ * metric for an intermediate reply) is kept in its own variable so that an
+ * intermediate reply with MP_F_RF set does not overwrite the metric that is
+ * forwarded in the PREQ.
+ */
+static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_mgmt *mgmt,
+				    u8 *preq_elem, u32 metric)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct mesh_path *mpath;
+	u8 *target_addr, *orig_addr;
+	u8 target_flags, ttl;
+	u32 orig_sn, target_sn, lifetime;
+	u32 target_metric = 0;
+	bool reply = false;
+	bool forward = true;
+
+	/* Update target SN, if present */
+	target_addr = PREQ_IE_TARGET_ADDR(preq_elem);
+	orig_addr = PREQ_IE_ORIG_ADDR(preq_elem);
+	target_sn = PREQ_IE_TARGET_SN(preq_elem);
+	orig_sn = PREQ_IE_ORIG_SN(preq_elem);
+	target_flags = PREQ_IE_TARGET_F(preq_elem);
+
+	mhwmp_dbg("received PREQ from %pM\n", orig_addr);
+
+	if (memcmp(target_addr, sdata->vif.addr, ETH_ALEN) == 0) {
+		mhwmp_dbg("PREQ is for us\n");
+		forward = false;
+		reply = true;
+		target_metric = 0;
+		/* Bump our own sequence number at most once per network
+		 * traversal time; the second test covers a jiffies value
+		 * that lies before last_sn_update.
+		 */
+		if (time_after(jiffies, ifmsh->last_sn_update +
+					net_traversal_jiffies(sdata)) ||
+		    time_before(jiffies, ifmsh->last_sn_update)) {
+			target_sn = ++ifmsh->sn;
+			ifmsh->last_sn_update = jiffies;
+		}
+	} else {
+		rcu_read_lock();
+		mpath = mesh_path_lookup(target_addr, sdata);
+		if (mpath) {
+			if ((!(mpath->flags & MESH_PATH_SN_VALID)) ||
+					SN_LT(mpath->sn, target_sn)) {
+				/* The PREQ knows a newer target SN than we do */
+				mpath->sn = target_sn;
+				mpath->flags |= MESH_PATH_SN_VALID;
+			} else if ((!(target_flags & MP_F_DO)) &&
+					(mpath->flags & MESH_PATH_ACTIVE)) {
+				/* Intermediate reply on behalf of the target */
+				reply = true;
+				target_metric = mpath->metric;
+				target_sn = mpath->sn;
+				if (target_flags & MP_F_RF)
+					target_flags |= MP_F_DO;
+				else
+					forward = false;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	if (reply) {
+		lifetime = PREQ_IE_LIFETIME(preq_elem);
+		ttl = ifmsh->mshcfg.element_ttl;
+		if (ttl != 0) {
+			mhwmp_dbg("replying to the PREQ\n");
+			mesh_path_sel_frame_tx(MPATH_PREP, 0, target_addr,
+				cpu_to_le32(target_sn), 0, orig_addr,
+				cpu_to_le32(orig_sn), mgmt->sa, 0, ttl,
+				cpu_to_le32(lifetime),
+				cpu_to_le32(target_metric),
+				0, sdata);
+		} else
+			ifmsh->mshstats.dropped_frames_ttl++;
+	}
+
+	if (forward) {
+		u32 preq_id;
+		u8 hopcount, flags;
+
+		ttl = PREQ_IE_TTL(preq_elem);
+		lifetime = PREQ_IE_LIFETIME(preq_elem);
+		if (ttl <= 1) {
+			ifmsh->mshstats.dropped_frames_ttl++;
+			return;
+		}
+		mhwmp_dbg("forwarding the PREQ from %pM\n", orig_addr);
+		--ttl;
+		flags = PREQ_IE_FLAGS(preq_elem);
+		preq_id = PREQ_IE_PREQ_ID(preq_elem);
+		hopcount = PREQ_IE_HOPCOUNT(preq_elem) + 1;
+		mesh_path_sel_frame_tx(MPATH_PREQ, flags, orig_addr,
+				cpu_to_le32(orig_sn), target_flags, target_addr,
+				cpu_to_le32(target_sn), broadcast_addr,
+				hopcount, ttl, cpu_to_le32(lifetime),
+				cpu_to_le32(metric), cpu_to_le32(preq_id),
+				sdata);
+		ifmsh->mshstats.fwded_mcast++;
+		ifmsh->mshstats.fwded_frames++;
+	}
+}
+
+
+/*
+ * Fetch mpath->next_hop for a caller that holds mpath->state_lock; the lock
+ * is what the lockdep condition handed to rcu_dereference_protected() checks.
+ */
+static inline struct sta_info *
+next_hop_deref_protected(struct mesh_path *mpath)
+{
+	struct sta_info *hop;
+
+	hop = rcu_dereference_protected(mpath->next_hop,
+					lockdep_is_held(&mpath->state_lock));
+	return hop;
+}
+
+
+/*
+ * hwmp_prep_frame_process - forward a received PREP towards its destination
+ *
+ * @sdata: local mesh subif
+ * @mgmt: management frame that carried the PREP (not referenced here)
+ * @prep_elem: PREP information element
+ * @metric: metric value written into the forwarded PREP
+ *
+ * Nothing is sent when the PREP is addressed to this interface or when its
+ * TTL is used up. Otherwise the PREP is sent to the next hop of the active
+ * path to its target; without such a path the frame is counted in
+ * dropped_frames_no_route.
+ */
+static void hwmp_prep_frame_process(struct ieee80211_sub_if_data *sdata,
+				    struct ieee80211_mgmt *mgmt,
+				    u8 *prep_elem, u32 metric)
+{
+	struct mesh_path *mpath;
+	u8 *target_addr, *orig_addr;
+	u8 ttl, hopcount, flags;
+	u8 next_hop[ETH_ALEN];
+	u32 target_sn, orig_sn, lifetime;
+
+	mhwmp_dbg("received PREP from %pM\n", PREP_IE_ORIG_ADDR(prep_elem));
+
+	/* Note that we divert from the draft nomenclature and denominate
+	 * destination to what the draft refers to as originator. So in this
+	 * function destination refers to the final destination of the PREP,
+	 * which corresponds with the originator of the PREQ which this PREP
+	 * replies
+	 */
+	target_addr = PREP_IE_TARGET_ADDR(prep_elem);
+	if (memcmp(target_addr, sdata->vif.addr, ETH_ALEN) == 0)
+		/* destination, no forwarding required */
+		return;
+
+	ttl = PREP_IE_TTL(prep_elem);
+	if (ttl <= 1) {
+		sdata->u.mesh.mshstats.dropped_frames_ttl++;
+		return;
+	}
+
+	rcu_read_lock();
+	mpath = mesh_path_lookup(target_addr, sdata);
+	if (mpath)
+		spin_lock_bh(&mpath->state_lock);
+	else
+		goto fail;
+	if (!(mpath->flags & MESH_PATH_ACTIVE)) {
+		spin_unlock_bh(&mpath->state_lock);
+		goto fail;
+	}
+	/* Copy the next hop address while state_lock is held so that it can
+	 * be used after the lock has been dropped.
+	 */
+	memcpy(next_hop, next_hop_deref_protected(mpath)->sta.addr, ETH_ALEN);
+	spin_unlock_bh(&mpath->state_lock);
+	--ttl;
+	flags = PREP_IE_FLAGS(prep_elem);
+	lifetime = PREP_IE_LIFETIME(prep_elem);
+	hopcount = PREP_IE_HOPCOUNT(prep_elem) + 1;
+	orig_addr = PREP_IE_ORIG_ADDR(prep_elem);
+	target_sn = PREP_IE_TARGET_SN(prep_elem);
+	orig_sn = PREP_IE_ORIG_SN(prep_elem);
+
+	mesh_path_sel_frame_tx(MPATH_PREP, flags, orig_addr,
+		cpu_to_le32(orig_sn), 0, target_addr,
+		cpu_to_le32(target_sn), next_hop, hopcount,
+		ttl, cpu_to_le32(lifetime), cpu_to_le32(metric),
+		0, sdata);
+	rcu_read_unlock();
+
+	sdata->u.mesh.mshstats.fwded_unicast++;
+	sdata->u.mesh.mshstats.fwded_frames++;
+	return;
+
+fail:
+	/* No path, or no active path, to the PREP destination */
+	rcu_read_unlock();
+	sdata->u.mesh.mshstats.dropped_frames_no_route++;
+}
+
+/*
+ * hwmp_perr_frame_process - handle a received path error (PERR)
+ *
+ * @sdata: local mesh subif
+ * @mgmt: management frame that carried the PERR; its source address is
+ *	compared with the next hop of the affected path
+ * @perr_elem: PERR information element
+ *
+ * If the path to the reported target is active, goes through the PERR
+ * transmitter, and the reported sequence number is newer than the stored one
+ * (or no valid one is stored), the path is deactivated and the PERR is
+ * broadcast again with a decremented TTL.
+ */
+static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata,
+			     struct ieee80211_mgmt *mgmt, u8 *perr_elem)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct mesh_path *mpath;
+	u8 ttl;
+	u8 *ta, *target_addr;
+	u32 target_sn;
+	u16 target_rcode;
+
+	ta = mgmt->sa;
+	ttl = PERR_IE_TTL(perr_elem);
+	if (ttl <= 1) {
+		ifmsh->mshstats.dropped_frames_ttl++;
+		return;
+	}
+	ttl--;
+	target_addr = PERR_IE_TARGET_ADDR(perr_elem);
+	target_sn = PERR_IE_TARGET_SN(perr_elem);
+	target_rcode = PERR_IE_TARGET_RCODE(perr_elem);
+
+	rcu_read_lock();
+	mpath = mesh_path_lookup(target_addr, sdata);
+	if (mpath) {
+		spin_lock_bh(&mpath->state_lock);
+		if (mpath->flags & MESH_PATH_ACTIVE &&
+		    memcmp(ta, next_hop_deref_protected(mpath)->sta.addr,
+							ETH_ALEN) == 0 &&
+		    (!(mpath->flags & MESH_PATH_SN_VALID) ||
+		    SN_GT(target_sn, mpath->sn))) {
+			mpath->flags &= ~MESH_PATH_ACTIVE;
+			mpath->sn = target_sn;
+			/* Drop state_lock before transmitting the PERR */
+			spin_unlock_bh(&mpath->state_lock);
+			mesh_path_error_tx(ttl, target_addr, cpu_to_le32(target_sn),
+					   cpu_to_le16(target_rcode),
+					   broadcast_addr, sdata);
+		} else
+			spin_unlock_bh(&mpath->state_lock);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * hwmp_rann_frame_process - handle a received root announcement (RANN)
+ *
+ * @sdata: local mesh subif
+ * @mgmt: management frame that carried the RANN (not referenced here)
+ * @rann: RANN information element
+ *
+ * If no path to the announcing address exists yet, one is created and a
+ * PREQ for it is queued. The RANN is re-broadcast, with decremented TTL,
+ * incremented hop count and the stored path metric added to its metric,
+ * only when its sequence number is newer than the one stored for the path.
+ *
+ * Sequence numbers are compared with SN_LT(), as the PREQ handling in this
+ * file does, instead of a plain '<' that gives the wrong answer once the
+ * 32-bit counter has wrapped around.
+ *
+ * NOTE(review): rann_seq and rann_metric are read without byte-order
+ * conversion - confirm against the field types of struct ieee80211_rann_ie.
+ */
+static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_mgmt *mgmt,
+				struct ieee80211_rann_ie *rann)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct mesh_path *mpath;
+	u8 ttl, flags, hopcount;
+	u8 *orig_addr;
+	u32 orig_sn, metric;
+
+	ttl = rann->rann_ttl;
+	if (ttl <= 1) {
+		ifmsh->mshstats.dropped_frames_ttl++;
+		return;
+	}
+	ttl--;
+	flags = rann->rann_flags;
+	orig_addr = rann->rann_addr;
+	orig_sn = rann->rann_seq;
+	hopcount = rann->rann_hopcount;
+	hopcount++;
+	metric = rann->rann_metric;
+	mhwmp_dbg("received RANN from %pM\n", orig_addr);
+
+	rcu_read_lock();
+	mpath = mesh_path_lookup(orig_addr, sdata);
+	if (!mpath) {
+		/* Unknown address: create the path and start resolving it */
+		mesh_path_add(orig_addr, sdata);
+		mpath = mesh_path_lookup(orig_addr, sdata);
+		if (!mpath) {
+			rcu_read_unlock();
+			sdata->u.mesh.mshstats.dropped_frames_no_route++;
+			return;
+		}
+		mesh_queue_preq(mpath,
+				PREQ_Q_F_START | PREQ_Q_F_REFRESH);
+	}
+	if (SN_LT(mpath->sn, orig_sn)) {
+		mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr,
+				       cpu_to_le32(orig_sn),
+				       0, NULL, 0, broadcast_addr,
+				       hopcount, ttl, 0,
+				       cpu_to_le32(metric + mpath->metric),
+				       0, sdata);
+		mpath->sn = orig_sn;
+	}
+	rcu_read_unlock();
+}
+
+
+/*
+ * mesh_rx_path_sel_frame - dispatch the elements of a path selection frame
+ *
+ * @sdata: local mesh subif
+ * @mgmt: received mesh action frame
+ * @len: length of @mgmt in bytes
+ *
+ * Parses the information elements that follow the mesh action header and
+ * passes PREQ, PREP, PERR and RANN elements, in that order, to their
+ * handlers. A PREQ, PREP or PERR whose length is not the single supported
+ * one ends processing of the whole frame.
+ */
+void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata,
+			    struct ieee80211_mgmt *mgmt,
+			    size_t len)
+{
+	u8 *ies = mgmt->u.action.u.mesh_action.variable;
+	struct ieee802_11_elems elems;
+	size_t hdr_len;
+	u32 path_metric;
+
+	/* need action_code */
+	if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+		return;
+
+	hdr_len = ies - (u8 *) mgmt;
+	ieee802_11_parse_elems(ies, len - hdr_len, &elems);
+
+	if (elems.preq) {
+		/* Right now we support just 1 destination and no AE */
+		if (elems.preq_len != 37)
+			return;
+
+		/* A zero metric means: do not process the PREQ further */
+		path_metric = hwmp_route_info_get(sdata, mgmt, elems.preq,
+						  MPATH_PREQ);
+		if (path_metric != 0)
+			hwmp_preq_frame_process(sdata, mgmt, elems.preq,
+						path_metric);
+	}
+
+	if (elems.prep) {
+		/* Right now we support no AE */
+		if (elems.prep_len != 31)
+			return;
+
+		path_metric = hwmp_route_info_get(sdata, mgmt, elems.prep,
+						  MPATH_PREP);
+		if (path_metric != 0)
+			hwmp_prep_frame_process(sdata, mgmt, elems.prep,
+						path_metric);
+	}
+
+	if (elems.perr) {
+		/* Right now we support only one destination per PERR */
+		if (elems.perr_len != 15)
+			return;
+
+		hwmp_perr_frame_process(sdata, mgmt, elems.perr);
+	}
+
+	if (elems.rann)
+		hwmp_rann_frame_process(sdata, mgmt, elems.rann);
+}
+
+/**
+ * mesh_queue_preq - queue a PREQ to a given destination
+ *
+ * @mpath: mesh path to discover
+ * @flags: special attributes of the PREQ to be sent
+ *
+ * Appends a request for @mpath->dst to the interface's PREQ queue and then
+ * either schedules the interface work right away or arms mesh_path_timer so
+ * that the minimum interval between PREQs is respected. The request is
+ * dropped silently (apart from a debug message) when the node cannot be
+ * allocated or the queue already holds MAX_PREQ_QUEUE_LEN entries.
+ *
+ * Locking: the function must be called from within a rcu read lock block.
+ * mesh_preq_queue_lock is taken with bottom halves disabled, like in
+ * mesh_path_start_discovery(): this function is also called from
+ * mesh_path_timer(), a timer callback, so a caller running with bottom
+ * halves enabled must not be interruptible by it while holding the lock.
+ */
+static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
+{
+	struct ieee80211_sub_if_data *sdata = mpath->sdata;
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct mesh_preq_queue *preq_node;
+
+	preq_node = kmalloc(sizeof(struct mesh_preq_queue), GFP_ATOMIC);
+	if (!preq_node) {
+		mhwmp_dbg("could not allocate PREQ node\n");
+		return;
+	}
+
+	spin_lock_bh(&ifmsh->mesh_preq_queue_lock);
+	if (ifmsh->preq_queue_len == MAX_PREQ_QUEUE_LEN) {
+		spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+		kfree(preq_node);
+		if (printk_ratelimit())
+			mhwmp_dbg("PREQ node queue full\n");
+		return;
+	}
+
+	memcpy(preq_node->dst, mpath->dst, ETH_ALEN);
+	preq_node->flags = flags;
+
+	list_add_tail(&preq_node->list, &ifmsh->preq_queue.list);
+	++ifmsh->preq_queue_len;
+	spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+
+	if (time_after(jiffies, ifmsh->last_preq + min_preq_int_jiff(sdata)))
+		/* Minimum PREQ interval already elapsed: send right away */
+		ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+
+	else if (time_before(jiffies, ifmsh->last_preq)) {
+		/* avoid long wait if did not send preqs for a long time
+		 * and jiffies wrapped around
+		 */
+		ifmsh->last_preq = jiffies - min_preq_int_jiff(sdata) - 1;
+		ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+	} else
+		/* Too early: wake up once the minimum interval is over */
+		mod_timer(&ifmsh->mesh_path_timer, ifmsh->last_preq +
+						min_preq_int_jiff(sdata));
+}
+
+/**
+ * mesh_path_start_discovery - launch a path discovery from the PREQ queue
+ *
+ * @sdata: local mesh subif
+ *
+ * Takes the first entry off the PREQ queue, updates the resolving state of
+ * the matching path and broadcasts a PREQ for it. Returns without doing
+ * anything when the queue is empty or the minimum interval since the last
+ * PREQ has not elapsed. The dequeued entry is always freed.
+ */
+void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct mesh_preq_queue *preq_node;
+	struct mesh_path *mpath;
+	u8 ttl, target_flags;
+	u32 lifetime;
+
+	spin_lock_bh(&ifmsh->mesh_preq_queue_lock);
+	if (!ifmsh->preq_queue_len ||
+		time_before(jiffies, ifmsh->last_preq +
+				min_preq_int_jiff(sdata))) {
+		spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+		return;
+	}
+
+	preq_node = list_first_entry(&ifmsh->preq_queue.list,
+			struct mesh_preq_queue, list);
+	list_del(&preq_node->list);
+	--ifmsh->preq_queue_len;
+	spin_unlock_bh(&ifmsh->mesh_preq_queue_lock);
+
+	rcu_read_lock();
+	mpath = mesh_path_lookup(preq_node->dst, sdata);
+	if (!mpath)
+		/* The path is no longer in the table */
+		goto enddiscovery;
+
+	spin_lock_bh(&mpath->state_lock);
+	if (preq_node->flags & PREQ_Q_F_START) {
+		if (mpath->flags & MESH_PATH_RESOLVING) {
+			/* A discovery for this path is already running */
+			spin_unlock_bh(&mpath->state_lock);
+			goto enddiscovery;
+		} else {
+			/* Start a new discovery with fresh retry state */
+			mpath->flags &= ~MESH_PATH_RESOLVED;
+			mpath->flags |= MESH_PATH_RESOLVING;
+			mpath->discovery_retries = 0;
+			mpath->discovery_timeout = disc_timeout_jiff(sdata);
+		}
+	} else if (!(mpath->flags & MESH_PATH_RESOLVING) ||
+			mpath->flags & MESH_PATH_RESOLVED) {
+		/* Request without PREQ_Q_F_START for a path that is not
+		 * being resolved any more, or is resolved already: nothing
+		 * left to send.
+		 */
+		mpath->flags &= ~MESH_PATH_RESOLVING;
+		spin_unlock_bh(&mpath->state_lock);
+		goto enddiscovery;
+	}
+
+	ifmsh->last_preq = jiffies;
+
+	/* Bump our sequence number at most once per network traversal time */
+	if (time_after(jiffies, ifmsh->last_sn_update +
+				net_traversal_jiffies(sdata)) ||
+	    time_before(jiffies, ifmsh->last_sn_update)) {
+		++ifmsh->sn;
+		sdata->u.mesh.last_sn_update = jiffies;
+	}
+	lifetime = default_lifetime(sdata);
+	ttl = sdata->u.mesh.mshcfg.element_ttl;
+	if (ttl == 0) {
+		sdata->u.mesh.mshstats.dropped_frames_ttl++;
+		spin_unlock_bh(&mpath->state_lock);
+		goto enddiscovery;
+	}
+
+	if (preq_node->flags & PREQ_Q_F_REFRESH)
+		target_flags = MP_F_DO;
+	else
+		target_flags = MP_F_RF;
+
+	/* state_lock is released before the frame is built and sent */
+	spin_unlock_bh(&mpath->state_lock);
+	mesh_path_sel_frame_tx(MPATH_PREQ, 0, sdata->vif.addr,
+			cpu_to_le32(ifmsh->sn), target_flags, mpath->dst,
+			cpu_to_le32(mpath->sn), broadcast_addr, 0,
+			ttl, cpu_to_le32(lifetime), 0,
+			cpu_to_le32(ifmsh->preq_id++), sdata);
+	/* mesh_path_timer() fires when the discovery timeout has passed */
+	mod_timer(&mpath->timer, jiffies + mpath->discovery_timeout);
+
+enddiscovery:
+	rcu_read_unlock();
+	kfree(preq_node);
+}
+
+/**
+ * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame
+ *
+ * @skb: 802.11 frame to be sent
+ * @sdata: network subif the frame will be sent through
+ *
+ * Returns: 0 if the next hop was found. Nonzero otherwise. If no next hop is
+ * found, the function will start a path discovery and queue the frame so it is
+ * sent when the path is resolved. This means the caller must not free the skb
+ * in this case.
+ *
+ * NOTE(review): two nonzero returns below do not queue @skb: -ENOSPC when no
+ * path entry could be created, and -ENOENT for an active path whose next hop
+ * pointer is NULL. Confirm how callers treat the skb in those cases.
+ */
+int mesh_nexthop_lookup(struct sk_buff *skb,
+			struct ieee80211_sub_if_data *sdata)
+{
+	struct sk_buff *skb_to_free = NULL;
+	struct mesh_path *mpath;
+	struct sta_info *next_hop;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	u8 *target_addr = hdr->addr3;
+	int err = 0;
+
+	rcu_read_lock();
+	mpath = mesh_path_lookup(target_addr, sdata);
+
+	if (!mpath) {
+		/* Unknown destination: create a path entry and retry */
+		mesh_path_add(target_addr, sdata);
+		mpath = mesh_path_lookup(target_addr, sdata);
+		if (!mpath) {
+			sdata->u.mesh.mshstats.dropped_frames_no_route++;
+			err = -ENOSPC;
+			goto endlookup;
+		}
+	}
+
+	if (mpath->flags & MESH_PATH_ACTIVE) {
+		/* Refresh a path that is within path_refresh_time of its
+		 * expiry, but only for frames whose addr4 is our own address
+		 * and only if the path is neither being resolved nor fixed.
+		 */
+		if (time_after(jiffies,
+			       mpath->exp_time -
+			       msecs_to_jiffies(sdata->u.mesh.mshcfg.path_refresh_time)) &&
+		    !memcmp(sdata->vif.addr, hdr->addr4, ETH_ALEN) &&
+		    !(mpath->flags & MESH_PATH_RESOLVING) &&
+		    !(mpath->flags & MESH_PATH_FIXED)) {
+			mesh_queue_preq(mpath,
+					PREQ_Q_F_START | PREQ_Q_F_REFRESH);
+		}
+		next_hop = rcu_dereference(mpath->next_hop);
+		if (next_hop)
+			memcpy(hdr->addr1, next_hop->sta.addr, ETH_ALEN);
+		else
+			err = -ENOENT;
+	} else {
+		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		if (!(mpath->flags & MESH_PATH_RESOLVING)) {
+			/* Start discovery only if it is not running yet */
+			mesh_queue_preq(mpath, PREQ_Q_F_START);
+		}
+
+		/* Queue is full: take out the oldest frame to make room */
+		if (skb_queue_len(&mpath->frame_queue) >=
+				MESH_FRAME_QUEUE_LEN)
+			skb_to_free = skb_dequeue(&mpath->frame_queue);
+
+		info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+		skb_queue_tail(&mpath->frame_queue, skb);
+		if (skb_to_free)
+			mesh_path_discard_frame(skb_to_free, sdata);
+		err = -ENOENT;
+	}
+
+endlookup:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * mesh_path_timer - per-path discovery timer callback
+ *
+ * @data: the struct mesh_path the timer belongs to, cast to unsigned long
+ *
+ * Does nothing while the device is quiescing. Otherwise, under the path's
+ * state_lock: a path that is resolved, or not being resolved, gets both
+ * resolving flags cleared; an unresolved path is retried with a doubled
+ * timeout until max_preq_retries() is reached, after which its flags are
+ * reset, it is marked expired and its pending frames are flushed.
+ */
+void mesh_path_timer(unsigned long data)
+{
+	struct mesh_path *path = (void *) data;
+	struct ieee80211_sub_if_data *sdata = path->sdata;
+
+	if (sdata->local->quiescing)
+		return;
+
+	spin_lock_bh(&path->state_lock);
+	if (!(path->flags & MESH_PATH_RESOLVED) &&
+	    (path->flags & MESH_PATH_RESOLVING)) {
+		if (path->discovery_retries < max_preq_retries(sdata)) {
+			/* Still unresolved: try again and back off */
+			++path->discovery_retries;
+			path->discovery_timeout *= 2;
+			mesh_queue_preq(path, 0);
+		} else {
+			/* Out of retries: give up on this path */
+			path->flags = 0;
+			path->exp_time = jiffies;
+			mesh_path_flush_pending(path);
+		}
+	} else {
+		path->flags &= ~(MESH_PATH_RESOLVING | MESH_PATH_RESOLVED);
+	}
+	spin_unlock_bh(&path->state_lock);
+}
+
+/*
+ * mesh_path_tx_root_frame - broadcast a RANN for this interface
+ *
+ * @sdata: local mesh subif
+ *
+ * Increments the interface sequence number and sends a RANN carrying it,
+ * with hop count 0, metric 0 and the configured element TTL.
+ */
+void
+mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	u8 ttl = ifmsh->mshcfg.element_ttl;
+	__le32 sn = cpu_to_le32(++ifmsh->sn);
+
+	mesh_path_sel_frame_tx(MPATH_RANN, 0, sdata->vif.addr, sn,
+			       0, NULL, 0, broadcast_addr,
+			       0, ttl, 0, 0, 0, sdata);
+}
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
new file mode 100644
index 00000000..0d2faacc
--- /dev/null
+++ b/net/mac80211/mesh_pathtbl.c
@@ -0,0 +1,848 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author:     Luis Carlos Cobo <luisca@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "mesh.h"
+
+/* There will be initially 2^INIT_PATHS_SIZE_ORDER buckets */
+#define INIT_PATHS_SIZE_ORDER	2
+
+/* Keep the mean chain length below this constant */
+#define MEAN_CHAIN_LEN		2
+
+/*
+ * True when @mpath is active, is not a fixed path, and jiffies has moved
+ * past its exp_time. The argument is parenthesised so that any pointer
+ * expression can be passed; it is evaluated up to three times, so it must
+ * not have side effects.
+ */
+#define MPATH_EXPIRED(mpath) (((mpath)->flags & MESH_PATH_ACTIVE) && \
+				time_after(jiffies, (mpath)->exp_time) && \
+				!((mpath)->flags & MESH_PATH_FIXED))
+
+/* Hash table entry: links one mesh_path into a bucket of a mesh_table */
+struct mpath_node {
+	/* Linkage in the bucket's hlist */
+	struct hlist_node list;
+	struct rcu_head rcu;
+	/* This indirection allows two different tables to point to the same
+	 * mesh_path structure, useful when resizing
+	 */
+	struct mesh_path *mpath;
+};
+
+/* The two path tables; both pointers are replaced via rcu_assign_pointer()
+ * when the corresponding table is grown.
+ */
+static struct mesh_table __rcu *mesh_paths;
+static struct mesh_table __rcu *mpp_paths; /* Store paths for MPP&MAP */
+
+/* Incremented each time a path is added to mesh_paths */
+int mesh_paths_generation;
+
+/* This lock will have the grow table function as writer and add / delete nodes
+ * as readers. When reading the table (i.e. doing lookups) we are well protected
+ * by RCU
+ */
+static DEFINE_RWLOCK(pathtbl_resize_lock);
+
+
+/*
+ * Fetch the current mesh_paths table for a caller that holds
+ * pathtbl_resize_lock (that is the lockdep condition checked here).
+ */
+static inline struct mesh_table *resize_dereference_mesh_paths(void)
+{
+	struct mesh_table *tbl;
+
+	tbl = rcu_dereference_protected(mesh_paths,
+		lockdep_is_held(&pathtbl_resize_lock));
+	return tbl;
+}
+
+/*
+ * Fetch the current mpp_paths table for a caller that holds
+ * pathtbl_resize_lock (that is the lockdep condition checked here).
+ */
+static inline struct mesh_table *resize_dereference_mpp_paths(void)
+{
+	struct mesh_table *tbl;
+
+	tbl = rcu_dereference_protected(mpp_paths,
+		lockdep_is_held(&pathtbl_resize_lock));
+	return tbl;
+}
+
+/*
+ * Iterate over every node of every bucket of a mesh table.
+ *
+ * tbl:  table to walk
+ * p:    struct hlist_node cursor
+ * node: struct mpath_node cursor, valid inside the loop body
+ * i:    integer bucket index
+ *
+ * This expands to two nested for loops, so a "break" in the body only
+ * leaves the current bucket.
+ *
+ * CAREFUL -- "tbl" must not be an expression,
+ * in particular not an rcu_dereference(), since
+ * it's used twice. So it is illegal to do
+ *	for_each_mesh_entry(rcu_dereference(...), ...)
+ */
+#define for_each_mesh_entry(tbl, p, node, i) \
+	for (i = 0; i <= tbl->hash_mask; i++) \
+		hlist_for_each_entry_rcu(node, p, &tbl->hash_buckets[i], list)
+
+
+/*
+ * mesh_table_alloc - allocate an empty table with 2^size_order buckets
+ *
+ * @size_order: log2 of the number of hash buckets
+ *
+ * Allocates the table, its zeroed bucket array and one spinlock per bucket
+ * with GFP_ATOMIC, and picks a random hash seed. Only size_order, hash_mask,
+ * entries, hash_rnd and the locks are initialised here; the remaining fields
+ * are left for the caller to set.
+ *
+ * Returns: the new table, or NULL if any allocation failed.
+ */
+static struct mesh_table *mesh_table_alloc(int size_order)
+{
+	struct mesh_table *tbl;
+	int nbuckets = 1 << size_order;
+	int idx;
+
+	tbl = kmalloc(sizeof(struct mesh_table), GFP_ATOMIC);
+	if (!tbl)
+		return NULL;
+
+	tbl->hash_buckets = kzalloc(sizeof(struct hlist_head) * nbuckets,
+				    GFP_ATOMIC);
+	if (!tbl->hash_buckets)
+		goto err_free_tbl;
+
+	tbl->hashwlock = kmalloc(sizeof(spinlock_t) * nbuckets, GFP_ATOMIC);
+	if (!tbl->hashwlock)
+		goto err_free_buckets;
+
+	tbl->size_order = size_order;
+	tbl->hash_mask = nbuckets - 1;
+	atomic_set(&tbl->entries, 0);
+	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+	for (idx = 0; idx < nbuckets; idx++)
+		spin_lock_init(&tbl->hashwlock[idx]);
+
+	return tbl;
+
+err_free_buckets:
+	kfree(tbl->hash_buckets);
+err_free_tbl:
+	kfree(tbl);
+	return NULL;
+}
+
+/*
+ * Free a table's lock array, bucket array and the table itself. The nodes
+ * linked into the buckets are not touched.
+ */
+static void __mesh_table_free(struct mesh_table *tbl)
+{
+	kfree(tbl->hashwlock);
+	kfree(tbl->hash_buckets);
+	kfree(tbl);
+}
+
+/*
+ * mesh_table_free - release every node of a table, then the table itself
+ *
+ * @tbl: table to free
+ * @free_leafs: passed through to the table's free_node() callback
+ *
+ * Each bucket is emptied under its own hashwlock; the entry counter is
+ * decremented once per node handed to free_node().
+ */
+static void mesh_table_free(struct mesh_table *tbl, bool free_leafs)
+{
+	struct hlist_head *buckets = tbl->hash_buckets;
+	struct hlist_node *cur, *next;
+	int idx;
+
+	for (idx = 0; idx <= tbl->hash_mask; idx++) {
+		spinlock_t *lock = &tbl->hashwlock[idx];
+
+		spin_lock_bh(lock);
+		hlist_for_each_safe(cur, next, &buckets[idx]) {
+			tbl->free_node(cur, free_leafs);
+			atomic_dec(&tbl->entries);
+		}
+		spin_unlock_bh(lock);
+	}
+
+	__mesh_table_free(tbl);
+}
+
+/*
+ * mesh_table_grow - copy all nodes of @oldtbl into the larger @newtbl
+ *
+ * @oldtbl: table currently in use
+ * @newtbl: freshly allocated table that receives copies of the nodes
+ *
+ * Returns: 0 on success, -EAGAIN if @oldtbl is not yet filled up to
+ * mean_chain_len entries per bucket, -ENOMEM if copying a node failed (the
+ * nodes already copied into @newtbl are freed again, @newtbl itself is not).
+ */
+static int mesh_table_grow(struct mesh_table *oldtbl,
+			   struct mesh_table *newtbl)
+{
+	struct hlist_head *oldhash;
+	struct hlist_node *p, *q;
+	int i;
+
+	if (atomic_read(&oldtbl->entries)
+			< oldtbl->mean_chain_len * (oldtbl->hash_mask + 1))
+		return -EAGAIN;
+
+	/* The new table inherits the callbacks and the fill parameters */
+	newtbl->free_node = oldtbl->free_node;
+	newtbl->mean_chain_len = oldtbl->mean_chain_len;
+	newtbl->copy_node = oldtbl->copy_node;
+	atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries));
+
+	oldhash = oldtbl->hash_buckets;
+	for (i = 0; i <= oldtbl->hash_mask; i++)
+		hlist_for_each(p, &oldhash[i])
+			if (oldtbl->copy_node(p, newtbl) < 0)
+				goto errcopy;
+
+	return 0;
+
+errcopy:
+	/* Undo the partial copy: free the new nodes but not their leafs */
+	for (i = 0; i <= newtbl->hash_mask; i++) {
+		hlist_for_each_safe(p, q, &newtbl->hash_buckets[i])
+			oldtbl->free_node(p, 0);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * mesh_table_hash - compute the bucket index of an address in a table
+ *
+ * @addr: hardware address (ETH_ALEN bytes)
+ * @sdata: subif the path belongs to
+ * @tbl: table whose seed and mask are used
+ *
+ * Returns: bucket index in the range 0..tbl->hash_mask.
+ */
+static u32 mesh_table_hash(u8 *addr, struct ieee80211_sub_if_data *sdata,
+			   struct mesh_table *tbl)
+{
+	u32 addr_tail;
+
+	/* Use last four bytes of hw addr and interface index as hash index.
+	 * The bytes are copied out instead of being read through a cast to
+	 * u32 *, because addr + 2 is not guaranteed to be suitably aligned
+	 * for a 32-bit load; the resulting value is the same.
+	 */
+	memcpy(&addr_tail, addr + 2, sizeof(addr_tail));
+
+	return jhash_2words(addr_tail, sdata->dev->ifindex, tbl->hash_rnd)
+		& tbl->hash_mask;
+}
+
+
+/**
+ *
+ * mesh_path_assign_nexthop - update mesh path next hop
+ *
+ * @mpath: mesh path to update
+ * @sta: next hop to assign
+ *
+ * Besides publishing the new next hop, rewrites addr1 of every frame that is
+ * waiting in the path's frame queue to the address of @sta.
+ *
+ * Locking: mpath->state_lock must be held when calling this function
+ */
+void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta)
+{
+	struct sk_buff *skb;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff_head tmpq;
+	unsigned long flags;
+
+	rcu_assign_pointer(mpath->next_hop, sta);
+
+	__skb_queue_head_init(&tmpq);
+
+	/* The queue lock is held across the whole drain-and-refill below */
+	spin_lock_irqsave(&mpath->frame_queue.lock, flags);
+
+	/* Move every queued frame to a temporary queue, patching its
+	 * receiver address on the way; the frame order is preserved.
+	 */
+	while ((skb = __skb_dequeue(&mpath->frame_queue)) != NULL) {
+		hdr = (struct ieee80211_hdr *) skb->data;
+		memcpy(hdr->addr1, sta->sta.addr, ETH_ALEN);
+		__skb_queue_tail(&tmpq, skb);
+	}
+
+	/* Put the patched frames back onto the path's queue */
+	skb_queue_splice(&tmpq, &mpath->frame_queue);
+	spin_unlock_irqrestore(&mpath->frame_queue.lock, flags);
+}
+
+
+/*
+ * mpath_lookup - find the path to @dst on @sdata in one path table
+ *
+ * @tbl: table to search (already dereferenced by the caller)
+ * @dst: hardware address (ETH_ALEN length) of destination
+ * @sdata: local subif
+ *
+ * Shared implementation of mesh_path_lookup() and mpp_path_lookup(). A path
+ * that is found to be expired has its MESH_PATH_ACTIVE flag cleared before
+ * it is returned; the expiry test is repeated under state_lock because the
+ * first, unlocked test may race with an update of the path.
+ *
+ * Returns: pointer to the mesh path structure, or NULL if not found
+ *
+ * Locking: must be called within a read rcu section.
+ */
+static struct mesh_path *mpath_lookup(struct mesh_table *tbl, u8 *dst,
+				      struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_path *mpath;
+	struct hlist_node *n;
+	struct hlist_head *bucket;
+	struct mpath_node *node;
+
+	bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)];
+	hlist_for_each_entry_rcu(node, n, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata &&
+		    memcmp(dst, mpath->dst, ETH_ALEN) == 0) {
+			if (MPATH_EXPIRED(mpath)) {
+				spin_lock_bh(&mpath->state_lock);
+				if (MPATH_EXPIRED(mpath))
+					mpath->flags &= ~MESH_PATH_ACTIVE;
+				spin_unlock_bh(&mpath->state_lock);
+			}
+			return mpath;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * mesh_path_lookup - look up a path in the mesh path table
+ * @dst: hardware address (ETH_ALEN length) of destination
+ * @sdata: local subif
+ *
+ * Returns: pointer to the mesh path structure, or NULL if not found
+ *
+ * Locking: must be called within a read rcu section.
+ */
+struct mesh_path *mesh_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata)
+{
+	return mpath_lookup(rcu_dereference(mesh_paths), dst, sdata);
+}
+
+/**
+ * mpp_path_lookup - look up a path in the MPP path table
+ * @dst: hardware address (ETH_ALEN length) of destination
+ * @sdata: local subif
+ *
+ * Returns: pointer to the mesh path structure, or NULL if not found
+ *
+ * Locking: must be called within a read rcu section.
+ */
+struct mesh_path *mpp_path_lookup(u8 *dst, struct ieee80211_sub_if_data *sdata)
+{
+	return mpath_lookup(rcu_dereference(mpp_paths), dst, sdata);
+}
+
+
+/**
+ * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index
+ * @idx: index
+ * @sdata: local subif, or NULL for all entries
+ *
+ * The index counts, in table walk order, only the entries that belong to
+ * @sdata (or all entries when @sdata is NULL).
+ *
+ * Returns: pointer to the mesh path structure, or NULL if not found.
+ *
+ * Locking: must be called within a read rcu section.
+ */
+struct mesh_path *mesh_path_lookup_by_idx(int idx, struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_table *tbl = rcu_dereference(mesh_paths);
+	struct mpath_node *node;
+	struct hlist_node *p;
+	int i;
+	int j = 0;
+
+	for_each_mesh_entry(tbl, p, node, i) {
+		/* Entries of other interfaces do not count towards idx */
+		if (sdata && node->mpath->sdata != sdata)
+			continue;
+		if (j++ == idx) {
+			/* Deactivate an expired path before handing it out;
+			 * the test is repeated under state_lock.
+			 */
+			if (MPATH_EXPIRED(node->mpath)) {
+				spin_lock_bh(&node->mpath->state_lock);
+				if (MPATH_EXPIRED(node->mpath))
+					node->mpath->flags &= ~MESH_PATH_ACTIVE;
+				spin_unlock_bh(&node->mpath->state_lock);
+			}
+			return node->mpath;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * mesh_path_add - allocate and add a new path to the mesh path table
+ * @dst: destination address of the path (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 on success, -ENOTSUPP if @dst is our own or a multicast
+ * address, -ENOSPC if the interface already has MESH_MAX_MPATHS paths,
+ * -ENOMEM if an allocation failed, -EEXIST if a path to @dst already exists
+ * on @sdata.
+ *
+ * State: the initial state of the new path is set to 0
+ */
+int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_local *local = sdata->local;
+	struct mesh_table *tbl;
+	struct mesh_path *mpath, *new_mpath;
+	struct mpath_node *node, *new_node;
+	struct hlist_head *bucket;
+	struct hlist_node *n;
+	int grow = 0;
+	int err = 0;
+	u32 hash_idx;
+
+	if (memcmp(dst, sdata->vif.addr, ETH_ALEN) == 0)
+		/* never add ourselves as neighbours */
+		return -ENOTSUPP;
+
+	if (is_multicast_ether_addr(dst))
+		return -ENOTSUPP;
+
+	/* Reserve a slot in the per-interface path count; it is given back
+	 * at err_path_alloc if anything below fails.
+	 */
+	if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0)
+		return -ENOSPC;
+
+	err = -ENOMEM;
+	new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC);
+	if (!new_mpath)
+		goto err_path_alloc;
+
+	new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
+	if (!new_node)
+		goto err_node_alloc;
+
+	/* Reader side of the resize lock: keeps the table from being
+	 * replaced while the node is inserted.
+	 */
+	read_lock_bh(&pathtbl_resize_lock);
+	memcpy(new_mpath->dst, dst, ETH_ALEN);
+	new_mpath->sdata = sdata;
+	new_mpath->flags = 0;
+	skb_queue_head_init(&new_mpath->frame_queue);
+	new_node->mpath = new_mpath;
+	new_mpath->timer.data = (unsigned long) new_mpath;
+	new_mpath->timer.function = mesh_path_timer;
+	new_mpath->exp_time = jiffies;
+	spin_lock_init(&new_mpath->state_lock);
+	init_timer(&new_mpath->timer);
+
+	tbl = resize_dereference_mesh_paths();
+
+	hash_idx = mesh_table_hash(dst, sdata, tbl);
+	bucket = &tbl->hash_buckets[hash_idx];
+
+	spin_lock_bh(&tbl->hashwlock[hash_idx]);
+
+	/* Refuse to add a second path to the same destination */
+	err = -EEXIST;
+	hlist_for_each_entry(node, n, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata && memcmp(dst, mpath->dst, ETH_ALEN) == 0)
+			goto err_exists;
+	}
+
+	hlist_add_head_rcu(&new_node->list, bucket);
+	/* Ask for a bigger table once the mean chain length is reached */
+	if (atomic_inc_return(&tbl->entries) >=
+	    tbl->mean_chain_len * (tbl->hash_mask + 1))
+		grow = 1;
+
+	mesh_paths_generation++;
+
+	spin_unlock_bh(&tbl->hashwlock[hash_idx]);
+	read_unlock_bh(&pathtbl_resize_lock);
+	if (grow) {
+		/* The table is grown later, from the interface work */
+		set_bit(MESH_WORK_GROW_MPATH_TABLE,  &ifmsh->wrkq_flags);
+		ieee80211_queue_work(&local->hw, &sdata->work);
+	}
+	return 0;
+
+err_exists:
+	spin_unlock_bh(&tbl->hashwlock[hash_idx]);
+	read_unlock_bh(&pathtbl_resize_lock);
+	kfree(new_node);
+err_node_alloc:
+	kfree(new_mpath);
+err_path_alloc:
+	atomic_dec(&sdata->u.mesh.mpaths);
+	return err;
+}
+
+/*
+ * RCU callback used when a table has been replaced by a grown one: frees the
+ * old table and its nodes, but not the paths the nodes point to.
+ */
+static void mesh_table_free_rcu(struct rcu_head *rcu)
+{
+	mesh_table_free(container_of(rcu, struct mesh_table, rcu_head), false);
+}
+
+/*
+ * mesh_mpath_table_grow - replace mesh_paths by a table twice as large
+ *
+ * Runs under the write side of pathtbl_resize_lock. If the new table cannot
+ * be allocated or filled, the current table stays in place; otherwise the
+ * new table is published and the old one is freed after an RCU grace period.
+ */
+void mesh_mpath_table_grow(void)
+{
+	struct mesh_table *cur, *bigger;
+
+	write_lock_bh(&pathtbl_resize_lock);
+	cur = resize_dereference_mesh_paths();
+	bigger = mesh_table_alloc(cur->size_order + 1);
+	if (bigger) {
+		if (mesh_table_grow(cur, bigger) < 0) {
+			__mesh_table_free(bigger);
+		} else {
+			rcu_assign_pointer(mesh_paths, bigger);
+			call_rcu(&cur->rcu_head, mesh_table_free_rcu);
+		}
+	}
+	write_unlock_bh(&pathtbl_resize_lock);
+}
+
+/*
+ * mesh_mpp_table_grow - replace mpp_paths by a table twice as large
+ *
+ * Runs under the write side of pathtbl_resize_lock. If the new table cannot
+ * be allocated or filled, the current table stays in place; otherwise the
+ * new table is published and the old one is freed after an RCU grace period.
+ */
+void mesh_mpp_table_grow(void)
+{
+	struct mesh_table *cur, *bigger;
+
+	write_lock_bh(&pathtbl_resize_lock);
+	cur = resize_dereference_mpp_paths();
+	bigger = mesh_table_alloc(cur->size_order + 1);
+	if (bigger) {
+		if (mesh_table_grow(cur, bigger) < 0) {
+			__mesh_table_free(bigger);
+		} else {
+			rcu_assign_pointer(mpp_paths, bigger);
+			call_rcu(&cur->rcu_head, mesh_table_free_rcu);
+		}
+	}
+	write_unlock_bh(&pathtbl_resize_lock);
+}
+
+/*
+ * mpp_path_add - allocate and add a new entry to the MPP path table
+ * @dst: destination address of the path (ETH_ALEN length)
+ * @mpp: address stored in the entry's mpp field (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Unlike mesh_path_add(), this neither sets up the path timer nor counts
+ * the entry in the interface's mpaths counter.
+ *
+ * Returns: 0 on success, -ENOTSUPP if @dst is our own or a multicast
+ * address, -ENOMEM if an allocation failed, -EEXIST if an entry for @dst
+ * already exists on @sdata.
+ */
+int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
+	struct ieee80211_local *local = sdata->local;
+	struct mesh_table *tbl;
+	struct mesh_path *mpath, *new_mpath;
+	struct mpath_node *node, *new_node;
+	struct hlist_head *bucket;
+	struct hlist_node *n;
+	int grow = 0;
+	int err = 0;
+	u32 hash_idx;
+
+	if (memcmp(dst, sdata->vif.addr, ETH_ALEN) == 0)
+		/* never add ourselves as neighbours */
+		return -ENOTSUPP;
+
+	if (is_multicast_ether_addr(dst))
+		return -ENOTSUPP;
+
+	err = -ENOMEM;
+	new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC);
+	if (!new_mpath)
+		goto err_path_alloc;
+
+	new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC);
+	if (!new_node)
+		goto err_node_alloc;
+
+	/* Reader side of the resize lock: keeps the table from being
+	 * replaced while the node is inserted.
+	 */
+	read_lock_bh(&pathtbl_resize_lock);
+	memcpy(new_mpath->dst, dst, ETH_ALEN);
+	memcpy(new_mpath->mpp, mpp, ETH_ALEN);
+	new_mpath->sdata = sdata;
+	new_mpath->flags = 0;
+	skb_queue_head_init(&new_mpath->frame_queue);
+	new_node->mpath = new_mpath;
+	new_mpath->exp_time = jiffies;
+	spin_lock_init(&new_mpath->state_lock);
+
+	tbl = resize_dereference_mpp_paths();
+
+	hash_idx = mesh_table_hash(dst, sdata, tbl);
+	bucket = &tbl->hash_buckets[hash_idx];
+
+	spin_lock_bh(&tbl->hashwlock[hash_idx]);
+
+	/* Refuse to add a second entry for the same destination */
+	err = -EEXIST;
+	hlist_for_each_entry(node, n, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata && memcmp(dst, mpath->dst, ETH_ALEN) == 0)
+			goto err_exists;
+	}
+
+	hlist_add_head_rcu(&new_node->list, bucket);
+	/* Ask for a bigger table once the mean chain length is reached */
+	if (atomic_inc_return(&tbl->entries) >=
+	    tbl->mean_chain_len * (tbl->hash_mask + 1))
+		grow = 1;
+
+	spin_unlock_bh(&tbl->hashwlock[hash_idx]);
+	read_unlock_bh(&pathtbl_resize_lock);
+	if (grow) {
+		/* The table is grown later, from the interface work */
+		set_bit(MESH_WORK_GROW_MPP_TABLE,  &ifmsh->wrkq_flags);
+		ieee80211_queue_work(&local->hw, &sdata->work);
+	}
+	return 0;
+
+err_exists:
+	spin_unlock_bh(&tbl->hashwlock[hash_idx]);
+	read_unlock_bh(&pathtbl_resize_lock);
+	kfree(new_node);
+err_node_alloc:
+	kfree(new_mpath);
+err_path_alloc:
+	return err;
+}
+
+
+/**
+ * mesh_plink_broken - deactivates paths and sends perr when a link breaks
+ *
+ * @sta: broken peer link; active non-fixed paths through it get deactivated
+ *
+ * This function must be called from the rate control algorithm if enough
+ * delivery errors suggest that a peer link is no longer usable.
+ */
+void mesh_plink_broken(struct sta_info *sta)
+{
+	struct mesh_table *tbl;
+	static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+	struct mesh_path *mpath;
+	struct mpath_node *node;
+	struct hlist_node *p;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	int i;
+
+	rcu_read_lock();
+	tbl = rcu_dereference(mesh_paths);
+	for_each_mesh_entry(tbl, p, node, i) {
+		mpath = node->mpath;
+		spin_lock_bh(&mpath->state_lock);
+		if (rcu_dereference(mpath->next_hop) == sta &&
+		    mpath->flags & MESH_PATH_ACTIVE &&
+		    !(mpath->flags & MESH_PATH_FIXED)) {
+			mpath->flags &= ~MESH_PATH_ACTIVE;
+			++mpath->sn;
+			spin_unlock_bh(&mpath->state_lock);
+			mesh_path_error_tx(sdata->u.mesh.mshcfg.element_ttl,
+					mpath->dst, cpu_to_le32(mpath->sn),
+					cpu_to_le16(PERR_RCODE_DEST_UNREACH),
+					bcast, sdata);
+		} else	/* path not affected: only release its lock */
+			spin_unlock_bh(&mpath->state_lock);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * mesh_path_flush_by_nexthop - Deletes mesh paths if their next hop matches
+ *
+ * @sta: mesh peer to match
+ *
+ * RCU notes: this function is called when a mesh plink transitions from
+ * PLINK_ESTAB to any other state, since PLINK_ESTAB state is the only one that
+ * allows path creation. This will happen before the sta can be freed (because
+ * sta_info_destroy() calls this) so any reader in a rcu read block will be
+ * protected against the plink disappearing.
+ */
+void mesh_path_flush_by_nexthop(struct sta_info *sta)
+{
+	struct mesh_table *paths;
+	struct mpath_node *entry;
+	struct hlist_node *pos;
+	int bkt;
+
+	rcu_read_lock();
+	paths = rcu_dereference(mesh_paths);
+	for_each_mesh_entry(paths, pos, entry, bkt) {
+		/* only paths routed through @sta are of interest */
+		if (rcu_dereference(entry->mpath->next_hop) != sta)
+			continue;
+		mesh_path_del(entry->mpath->dst, entry->mpath->sdata);
+	}
+	rcu_read_unlock();
+}
+
+void mesh_path_flush(struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_table *paths;
+	struct mpath_node *entry;
+	struct hlist_node *pos;
+	int bkt;
+
+	rcu_read_lock();
+	paths = rcu_dereference(mesh_paths);
+	for_each_mesh_entry(paths, pos, entry, bkt) {
+		/* the table is walked in full; other interfaces are skipped */
+		if (entry->mpath->sdata != sdata)
+			continue;
+		mesh_path_del(entry->mpath->dst, entry->mpath->sdata);
+	}
+	rcu_read_unlock();
+}
+
+static void mesh_path_node_reclaim(struct rcu_head *rp)
+{
+	struct mpath_node *node = container_of(rp, struct mpath_node, rcu);
+	struct mesh_path *mpath = node->mpath;
+
+	del_timer_sync(&mpath->timer);
+	atomic_dec(&mpath->sdata->u.mesh.mpaths);
+	kfree(mpath);
+	kfree(node);
+}
+
+/**
+ * mesh_path_del - delete a mesh path from the table
+ *
+ * @addr: dst address (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 if successful, -ENXIO if no such path is in the table
+ */
+int mesh_path_del(u8 *addr, struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_table *tbl;
+	struct mesh_path *mpath;
+	struct mpath_node *node;
+	struct hlist_head *bucket;
+	struct hlist_node *n;
+	int hash_idx;
+	int err = 0;
+
+	read_lock_bh(&pathtbl_resize_lock);
+	tbl = resize_dereference_mesh_paths();
+	hash_idx = mesh_table_hash(addr, sdata, tbl);
+	bucket = &tbl->hash_buckets[hash_idx];
+	/* unlink the matching node while holding the lock of its bucket */
+	spin_lock_bh(&tbl->hashwlock[hash_idx]);
+	hlist_for_each_entry(node, n, bucket, list) {
+		mpath = node->mpath;
+		if (mpath->sdata == sdata &&
+		    memcmp(addr, mpath->dst, ETH_ALEN) == 0) {
+			spin_lock_bh(&mpath->state_lock);
+			mpath->flags |= MESH_PATH_RESOLVING;
+			hlist_del_rcu(&node->list);
+			call_rcu(&node->rcu, mesh_path_node_reclaim);
+			atomic_dec(&tbl->entries);
+			spin_unlock_bh(&mpath->state_lock);
+			goto enddel;
+		}
+	}
+	/* the loop ran to its end: nothing matched */
+	err = -ENXIO;
+enddel:
+	mesh_paths_generation++;	/* also bumped when nothing was found */
+	spin_unlock_bh(&tbl->hashwlock[hash_idx]);
+	read_unlock_bh(&pathtbl_resize_lock);
+	return err;
+}
+
+/**
+ * mesh_path_tx_pending - sends pending frames in a mesh path queue
+ *
+ * @mpath: mesh path to activate
+ *
+ * Locking: the state_lock of the mpath structure must NOT be held when calling
+ * this function.
+ */
+void mesh_path_tx_pending(struct mesh_path *mpath)
+{
+	if (!(mpath->flags & MESH_PATH_ACTIVE))
+		return;
+	ieee80211_add_pending_skbs(mpath->sdata->local, &mpath->frame_queue);
+}
+
+/**
+ * mesh_path_discard_frame - discard a frame whose path could not be resolved
+ *
+ * @skb: frame to discard
+ * @sdata: network subif the frame was to be sent through
+ *
+ * If the frame was being forwarded from another MP, a PERR frame will be sent
+ * to the precursor.  The precursor's address (i.e. the previous hop) was saved
+ * in addr1 of the frame-to-be-forwarded, and would only be overwritten once
+ * the destination is successfully resolved.
+ *
+ * Locking: the function must be called within a rcu_read_lock region
+ */
+void mesh_path_discard_frame(struct sk_buff *skb,
+			     struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct mesh_path *mpath;
+	u32 sn = 0;
+
+	if (memcmp(hdr->addr4, sdata->vif.addr, ETH_ALEN) != 0) {
+		u8 *ra, *da;
+		/* NOTE(review): PERR target below is skb->data, not da - confirm */
+		da = hdr->addr3;
+		ra = hdr->addr1;
+		mpath = mesh_path_lookup(da, sdata);
+		if (mpath)
+			sn = ++mpath->sn;
+		mesh_path_error_tx(sdata->u.mesh.mshcfg.element_ttl, skb->data,
+				   cpu_to_le32(sn),
+				   cpu_to_le16(PERR_RCODE_NO_ROUTE), ra, sdata);
+	}
+	/* the frame is freed and counted as dropped in either case */
+	kfree_skb(skb);
+	sdata->u.mesh.mshstats.dropped_frames_no_route++;
+}
+
+/**
+ * mesh_path_flush_pending - free the pending queue of a mesh path
+ *
+ * @mpath: mesh path whose queue has to be freed
+ *
+ * Locking: the function must be called within a rcu_read_lock region
+ */
+void mesh_path_flush_pending(struct mesh_path *mpath)
+{
+	struct sk_buff *skb;
+
+	/* drain unconditionally: a dequeued skb must never be left unfreed */
+	while ((skb = skb_dequeue(&mpath->frame_queue)) != NULL)
+		mesh_path_discard_frame(skb, mpath->sdata);
+}
+
+/**
+ * mesh_path_fix_nexthop - force a specific next hop for a mesh path
+ *
+ * @mpath: the mesh path to modify
+ * @next_hop: the next hop to force
+ *
+ * Locking: takes mpath->state_lock itself, so the caller must NOT hold it
+ */
+void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop)
+{
+	spin_lock_bh(&mpath->state_lock);
+	mesh_path_assign_nexthop(mpath, next_hop);
+	mpath->sn = 0xffff;
+	mpath->metric = 0;
+	mpath->hop_count = 0;
+	mpath->exp_time = 0;
+	mpath->flags |= MESH_PATH_FIXED;
+	mesh_path_activate(mpath);
+	spin_unlock_bh(&mpath->state_lock);
+	mesh_path_tx_pending(mpath);	/* must run without state_lock held */
+}
+
+static void mesh_path_node_free(struct hlist_node *p, bool free_leafs)
+{
+	struct mpath_node *node = hlist_entry(p, struct mpath_node, list);
+	struct mesh_path *mpath = node->mpath;
+
+	hlist_del_rcu(p);
+	if (free_leafs) {
+		del_timer_sync(&mpath->timer);
+		kfree(mpath);
+	}
+	kfree(node);
+}
+
+/* Hash the mesh_path behind @p into @newtbl via a freshly allocated node. */
+static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl)
+{
+	struct mpath_node *src = hlist_entry(p, struct mpath_node, list);
+	struct mpath_node *dup;
+	u32 idx;
+
+	dup = kmalloc(sizeof(*dup), GFP_ATOMIC);
+	if (!dup)
+		return -ENOMEM;
+
+	/* the mesh_path is shared; only the list node is duplicated */
+	dup->mpath = src->mpath;
+	idx = mesh_table_hash(src->mpath->dst, src->mpath->sdata, newtbl);
+	hlist_add_head(&dup->list, &newtbl->hash_buckets[idx]);
+
+	return 0;
+}
+
+int mesh_pathtbl_init(void)
+{
+	struct mesh_table *paths, *mpps;
+
+	paths = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+	if (paths == NULL)
+		return -ENOMEM;
+	paths->free_node = &mesh_path_node_free;
+	paths->copy_node = &mesh_path_node_copy;
+	paths->mean_chain_len = MEAN_CHAIN_LEN;
+	/* second table, set up the same way; undo the first one on failure */
+	mpps = mesh_table_alloc(INIT_PATHS_SIZE_ORDER);
+	if (mpps == NULL) {
+		mesh_table_free(paths, true);
+		return -ENOMEM;
+	}
+	mpps->free_node = &mesh_path_node_free;
+	mpps->copy_node = &mesh_path_node_copy;
+	mpps->mean_chain_len = MEAN_CHAIN_LEN;
+
+	/* init time: the pointers can be published without locking */
+	RCU_INIT_POINTER(mesh_paths, paths);
+	RCU_INIT_POINTER(mpp_paths, mpps);
+
+	return 0;
+}
+
+void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
+{
+	struct mesh_table *tbl;
+	struct mesh_path *mpath;
+	struct mpath_node *node;
+	struct hlist_node *p;
+	bool expired;
+	int i;
+
+	rcu_read_lock();
+	tbl = rcu_dereference(mesh_paths);
+	for_each_mesh_entry(tbl, p, node, i) {
+		mpath = node->mpath;
+		if (mpath->sdata != sdata)
+			continue;
+		spin_lock_bh(&mpath->state_lock);
+		expired = !(mpath->flags & MESH_PATH_RESOLVING) &&
+			  !(mpath->flags & MESH_PATH_FIXED) &&
+			  time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE);
+		spin_unlock_bh(&mpath->state_lock);
+		if (expired)
+			mesh_path_del(mpath->dst, mpath->sdata);
+	}
+	rcu_read_unlock();
+}
+
+void mesh_pathtbl_unregister(void)
+{
+	/* exit path, so no locking: release both the path and the MPP table */
+	mesh_table_free(rcu_dereference_raw(mesh_paths), true);
+	mesh_table_free(rcu_dereference_raw(mpp_paths), true);
+}
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
new file mode 100644
index 00000000..f4adc091
--- /dev/null
+++ b/net/mac80211/mesh_plink.c
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2008, 2009 open80211s Ltd.
+ * Author:     Luis Carlos Cobo <luisca@cozybit.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+#include "mesh.h"
+
+#ifdef CONFIG_MAC80211_VERBOSE_MPL_DEBUG
+#define mpl_dbg(fmt, args...)	printk(KERN_DEBUG fmt, ##args)
+#else
+#define mpl_dbg(fmt, args...)	do { (void)(0); } while (0)
+#endif
+/* link id offsets in the peer link IE body, after the 4-byte protocol id */
+#define PLINK_GET_LLID(p) (p + 4)
+#define PLINK_GET_PLID(p) (p + 6)
+/* re-arm the plink timer of sta s to expire t milliseconds from now */
+#define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \
+				jiffies + HZ * t / 1000))
+
+/* Peer link cancel reasons, all subject to ANA approval */
+#define MESH_LINK_CANCELLED			2
+#define MESH_MAX_NEIGHBORS			3
+#define MESH_CAPABILITY_POLICY_VIOLATION	4
+#define MESH_CLOSE_RCVD				5
+#define MESH_MAX_RETRIES			6
+#define MESH_CONFIRM_TIMEOUT			7
+#define MESH_SECURITY_ROLE_NEGOTIATION_DIFFERS	8
+#define MESH_SECURITY_AUTHENTICATION_IMPOSSIBLE	9
+#define MESH_SECURITY_FAILED_VERIFICATION	10
+/* shorthands for the mesh configuration values of interface s */
+#define dot11MeshMaxRetries(s) (s->u.mesh.mshcfg.dot11MeshMaxRetries)
+#define dot11MeshRetryTimeout(s) (s->u.mesh.mshcfg.dot11MeshRetryTimeout)
+#define dot11MeshConfirmTimeout(s) (s->u.mesh.mshcfg.dot11MeshConfirmTimeout)
+#define dot11MeshHoldingTimeout(s) (s->u.mesh.mshcfg.dot11MeshHoldingTimeout)
+#define dot11MeshMaxPeerLinks(s) (s->u.mesh.mshcfg.dot11MeshMaxPeerLinks)
+
+enum plink_frame_type {
+	PLINK_OPEN = 1,		/* values are written as the frame's action code */
+	PLINK_CONFIRM,
+	PLINK_CLOSE
+};
+
+enum plink_event {
+	PLINK_UNDEFINED,	/* no event derived from the frame */
+	OPN_ACPT,		/* open frame accepted */
+	OPN_RJCT,		/* open frame not matching the local mesh */
+	OPN_IGNR,		/* open frame ignored */
+	CNF_ACPT,		/* confirm frame accepted */
+	CNF_RJCT,		/* confirm frame not matching the local mesh */
+	CNF_IGNR,		/* confirm frame ignored */
+	CLS_ACPT,		/* close frame accepted */
+	CLS_IGNR		/* close frame ignored */
+};
+
+static inline
+void mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata)
+{
+	atomic_inc(&sdata->u.mesh.mshstats.estab_plinks);
+	mesh_accept_plinks_update(sdata);	/* react to the new count */
+}
+
+static inline
+void mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata)
+{
+	atomic_dec(&sdata->u.mesh.mshstats.estab_plinks);
+	mesh_accept_plinks_update(sdata);	/* react to the new count */
+}
+
+/**
+ * mesh_plink_fsm_restart - restart a mesh peer link finite state machine
+ *
+ * @sta: mesh peer link to restart; back to LISTEN, ids/reason/retries zeroed
+ *
+ * Locking: this function must be called holding sta->lock
+ */
+static inline void mesh_plink_fsm_restart(struct sta_info *sta)
+{
+	sta->plink_state = NL80211_PLINK_LISTEN;
+	sta->llid = sta->plid = sta->reason = 0;
+	sta->plink_retries = 0;
+}
+
+/*
+ * NOTE: Thin wrapper around sta_info_alloc() (see its lifecycle notes): caps
+ *       the station count, marks the peer authed/authorized, sets its rates.
+ */
+static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata,
+					 u8 *hw_addr, u32 rates)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	if (local->num_sta >= MESH_MAX_PLINKS)
+		return NULL;
+
+	sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL);
+	if (!sta)
+		return NULL;
+
+	sta->flags = WLAN_STA_AUTHORIZED | WLAN_STA_AUTH;
+	sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
+	rate_control_rate_init(sta);
+
+	return sta;
+}
+
+/**
+ * __mesh_plink_deactivate - deactivate mesh peer link
+ *
+ * @sta: mesh peer link to deactivate
+ *
+ * All mesh paths with this peer as next hop will be flushed. Returns true
+ * if the link was in ESTAB state, i.e. the estab count was decremented.
+ * Locking: the caller must hold sta->lock
+ */
+static bool __mesh_plink_deactivate(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	bool deactivated = false;
+
+	if (sta->plink_state == NL80211_PLINK_ESTAB) {
+		mesh_plink_dec_estab_count(sdata);
+		deactivated = true;
+	}
+	sta->plink_state = NL80211_PLINK_BLOCKED;
+	mesh_path_flush_by_nexthop(sta);
+
+	return deactivated;
+}
+
+/**
+ * mesh_plink_deactivate - deactivate mesh peer link
+ *
+ * @sta: mesh peer link to deactivate
+ *
+ * All mesh paths with this peer as next hop will be flushed; takes sta->lock
+ */
+void mesh_plink_deactivate(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	bool deactivated;
+
+	spin_lock_bh(&sta->lock);
+	deactivated = __mesh_plink_deactivate(sta);
+	spin_unlock_bh(&sta->lock);
+	/* the beacon change is signalled only after sta->lock is dropped */
+	if (deactivated)
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+}
+
+static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
+		enum plink_frame_type action, u8 *da, __le16 llid, __le16 plid,
+		__le16 reason) {
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 +
+			sdata->u.mesh.ie_len);
+	struct ieee80211_mgmt *mgmt;
+	bool include_plid = false;
+	static const u8 meshpeeringproto[] = { 0x00, 0x0F, 0xAC, 0x2A };
+	u8 *pos;
+	int ie_len;
+
+	if (!skb)	/* no buffer: report a real errno, not a bare -1 */
+		return -ENOMEM;
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	/* 25 is the size of the common mgmt part (24) plus the size of the
+	 * common action part (1)
+	 */
+	mgmt = (struct ieee80211_mgmt *)
+		skb_put(skb, 25 + sizeof(mgmt->u.action.u.plink_action));
+	memset(mgmt, 0, 25 + sizeof(mgmt->u.action.u.plink_action));
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+	memcpy(mgmt->da, da, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+	mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION;
+	mgmt->u.action.u.plink_action.action_code = action;
+	/* aux carries the reason on close and is zero otherwise */
+	if (action == PLINK_CLOSE)
+		mgmt->u.action.u.plink_action.aux = reason;
+	else {
+		mgmt->u.action.u.plink_action.aux = cpu_to_le16(0x0);
+		if (action == PLINK_CONFIRM) {
+			pos = skb_put(skb, 4);
+			/* two-byte status code followed by two-byte AID */
+			memset(pos, 0, 2);
+			memcpy(pos + 2, &plid, 2);
+		}
+		mesh_mgmt_ies_add(skb, sdata);
+	}
+
+	/* Add Peer Link Management element; its length depends on the type */
+	switch (action) {
+	case PLINK_OPEN:
+		ie_len = 6;
+		break;
+	case PLINK_CONFIRM:
+		ie_len = 8;
+		include_plid = true;
+		break;
+	case PLINK_CLOSE:
+	default:
+		if (!plid)
+			ie_len = 8;
+		else {
+			ie_len = 10;
+			include_plid = true;
+		}
+		break;
+	}
+	/* IE layout: id, len, 4-byte protocol id, llid [, plid] [, reason] */
+	pos = skb_put(skb, 2 + ie_len);
+	*pos++ = WLAN_EID_PEER_LINK;
+	*pos++ = ie_len;
+	memcpy(pos, meshpeeringproto, sizeof(meshpeeringproto));
+	pos += 4;
+	memcpy(pos, &llid, 2);
+	if (include_plid) {
+		pos += 2;
+		memcpy(pos, &plid, 2);
+	}
+	if (action == PLINK_CLOSE) {
+		pos += 2;
+		memcpy(pos, &reason, 2);
+	}
+
+	ieee80211_tx_skb(sdata, skb);
+	return 0;
+}
+
+void mesh_neighbour_update(u8 *hw_addr, u32 rates,
+		struct ieee80211_sub_if_data *sdata,
+		struct ieee802_11_elems *elems)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	rcu_read_lock();
+
+	sta = sta_info_get(sdata, hw_addr);
+	if (!sta) {
+		rcu_read_unlock();
+		/* With authenticated security userspace is only notified of
+		 * the candidate; otherwise the peer is allocated right here */
+		if (sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED)
+			cfg80211_notify_new_peer_candidate(sdata->dev, hw_addr,
+					elems->ie_start, elems->total_len,
+					GFP_KERNEL);
+		else
+			sta = mesh_plink_alloc(sdata, hw_addr, rates);
+		if (!sta)	/* alloc failed or left to userspace */
+			return;
+		if (sta_info_insert_rcu(sta)) {
+			rcu_read_unlock();
+			return;
+		}
+	}
+	/* NOTE(review): code assumes sta_info_insert_rcu() returns RCU-locked */
+	sta->last_rx = jiffies;
+	sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
+	if (mesh_peer_accepts_plinks(elems) &&
+			sta->plink_state == NL80211_PLINK_LISTEN &&
+			sdata->u.mesh.accepting_plinks &&
+			sdata->u.mesh.mshcfg.auto_open_plinks)
+		mesh_plink_open(sta);
+
+	rcu_read_unlock();
+}
+
+static void mesh_plink_timer(unsigned long data)
+{
+	struct sta_info *sta;
+	__le16 llid, plid, reason;
+	struct ieee80211_sub_if_data *sdata;
+
+	/*
+	 * This STA is valid because sta_info_destroy() will
+	 * del_timer_sync() this timer after having made sure
+	 * it cannot be readded (by deleting the plink.)
+	 */
+	sta = (struct sta_info *) data;
+	/* while quiescing, only record that the timer fired and do nothing */
+	if (sta->sdata->local->quiescing) {
+		sta->plink_timer_was_running = true;
+		return;
+	}
+
+	spin_lock_bh(&sta->lock);
+	if (sta->ignore_plink_timer) {
+		sta->ignore_plink_timer = false;
+		spin_unlock_bh(&sta->lock);
+		return;
+	}
+	mpl_dbg("Mesh plink timer for %pM fired on state %d\n",
+		sta->sta.addr, sta->plink_state);
+	reason = 0;
+	llid = sta->llid;
+	plid = sta->plid;
+	sdata = sta->sdata;
+	/* every case below releases sta->lock before it sends a frame */
+	switch (sta->plink_state) {
+	case NL80211_PLINK_OPN_RCVD:
+	case NL80211_PLINK_OPN_SNT:
+		/* retry timer */
+		if (sta->plink_retries < dot11MeshMaxRetries(sdata)) {
+			u32 rand;
+			mpl_dbg("Mesh plink for %pM (retry, timeout): %d %d\n",
+				sta->sta.addr, sta->plink_retries,
+				sta->plink_timeout);
+			get_random_bytes(&rand, sizeof(u32));
+			sta->plink_timeout = sta->plink_timeout +
+					     rand % sta->plink_timeout;
+			++sta->plink_retries;
+			mod_plink_timer(sta, sta->plink_timeout);
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->sta.addr, llid,
+					    0, 0);
+			break;
+		}
+		reason = cpu_to_le16(MESH_MAX_RETRIES);
+		/* fall through on else */
+	case NL80211_PLINK_CNF_RCVD:
+		/* confirm timer */
+		if (!reason)
+			reason = cpu_to_le16(MESH_CONFIRM_TIMEOUT);
+		sta->plink_state = NL80211_PLINK_HOLDING;
+		mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
+		spin_unlock_bh(&sta->lock);
+		mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid, plid,
+				    reason);
+		break;
+	case NL80211_PLINK_HOLDING:
+		/* holding timer */
+		del_timer(&sta->plink_timer);
+		mesh_plink_fsm_restart(sta);
+		spin_unlock_bh(&sta->lock);
+		break;
+	default:
+		spin_unlock_bh(&sta->lock);
+		break;
+	}
+}
+
+#ifdef CONFIG_PM	/* park and re-arm the plink timer around quiesce */
+void mesh_plink_quiesce(struct sta_info *sta)
+{
+	if (del_timer_sync(&sta->plink_timer))	/* timer was still pending */
+		sta->plink_timer_was_running = true;
+}
+
+void mesh_plink_restart(struct sta_info *sta)
+{
+	if (sta->plink_timer_was_running) {
+		add_timer(&sta->plink_timer);
+		sta->plink_timer_was_running = false;
+	}
+}
+#endif /* CONFIG_PM */
+
+static inline void mesh_plink_timer_set(struct sta_info *sta, int timeout)
+{
+	sta->plink_timeout = timeout;
+	sta->plink_timer.function = mesh_plink_timer;
+	sta->plink_timer.data = (unsigned long) sta;
+	sta->plink_timer.expires = jiffies + (HZ * timeout / 1000);
+	add_timer(&sta->plink_timer);
+}
+
+int mesh_plink_open(struct sta_info *sta)
+{
+	__le16 llid;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+	if (!test_sta_flags(sta, WLAN_STA_AUTH))
+		return -EPERM;
+
+	spin_lock_bh(&sta->lock);
+	if (sta->plink_state != NL80211_PLINK_LISTEN) {
+		/* link in use: bail out before touching its llid or state */
+		spin_unlock_bh(&sta->lock);
+		return -EBUSY;
+	}
+	get_random_bytes(&llid, 2);
+	sta->llid = llid;
+	sta->plink_state = NL80211_PLINK_OPN_SNT;
+	mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata));
+	spin_unlock_bh(&sta->lock);
+	mpl_dbg("Mesh plink: starting establishment with %pM\n",
+		sta->sta.addr);
+	return mesh_plink_frame_tx(sdata, PLINK_OPEN,
+				   sta->sta.addr, llid, 0, 0);
+}
+
+void mesh_plink_block(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	bool deactivated;
+
+	spin_lock_bh(&sta->lock);
+	deactivated = __mesh_plink_deactivate(sta);
+	sta->plink_state = NL80211_PLINK_BLOCKED;
+	spin_unlock_bh(&sta->lock);
+	/* (__mesh_plink_deactivate() had already set the state to BLOCKED) */
+	if (deactivated)
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+}
+
+/* Process a received peer link action frame: run the sender's plink FSM. */
+void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt,
+			 size_t len, struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee802_11_elems elems;
+	struct sta_info *sta;
+	enum plink_event event;
+	enum plink_frame_type ftype;
+	size_t baselen;
+	bool deactivated, matches_local = true;
+	u8 ie_len;
+	u8 *baseaddr;
+	__le16 plid, llid, reason;
+#ifdef CONFIG_MAC80211_VERBOSE_MPL_DEBUG
+	static const char *mplstates[] = {
+		[NL80211_PLINK_LISTEN] = "LISTEN",
+		[NL80211_PLINK_OPN_SNT] = "OPN-SNT",
+		[NL80211_PLINK_OPN_RCVD] = "OPN-RCVD",
+		[NL80211_PLINK_CNF_RCVD] = "CNF_RCVD",
+		[NL80211_PLINK_ESTAB] = "ESTAB",
+		[NL80211_PLINK_HOLDING] = "HOLDING",
+		[NL80211_PLINK_BLOCKED] = "BLOCKED"
+	};
+#endif
+
+	/* need action_code, aux */
+	if (len < IEEE80211_MIN_ACTION_SIZE + 3)
+		return;
+
+	if (is_multicast_ether_addr(mgmt->da)) {
+		mpl_dbg("Mesh plink: ignore frame from multicast address");
+		return;
+	}
+
+	baseaddr = mgmt->u.action.u.plink_action.variable;
+	baselen = (u8 *) mgmt->u.action.u.plink_action.variable - (u8 *) mgmt;
+	if (mgmt->u.action.u.plink_action.action_code == PLINK_CONFIRM) {
+		baseaddr += 4;
+		baselen += 4;
+	}
+	ieee802_11_parse_elems(baseaddr, len - baselen, &elems);
+	if (!elems.peer_link) {
+		mpl_dbg("Mesh plink: missing necessary peer link ie\n");
+		return;
+	}
+	if (elems.rsn_len &&
+			sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) {
+		mpl_dbg("Mesh plink: can't establish link with secure peer\n");
+		return;
+	}
+
+	ftype = mgmt->u.action.u.plink_action.action_code;
+	ie_len = elems.peer_link_len;
+	if ((ftype == PLINK_OPEN && ie_len != 6) ||
+	    (ftype == PLINK_CONFIRM && ie_len != 8) ||
+	    (ftype == PLINK_CLOSE && ie_len != 8 && ie_len != 10)) {
+		mpl_dbg("Mesh plink: incorrect plink ie length %d %d\n",
+		    ftype, ie_len);
+		return;
+	}
+
+	if (ftype != PLINK_CLOSE && (!elems.mesh_id || !elems.mesh_config)) {
+		mpl_dbg("Mesh plink: missing necessary ie\n");
+		return;
+	}
+	/* Note the lines below are correct, the llid in the frame is the plid
+	 * from the point of view of this host.
+	 */
+	memcpy(&plid, PLINK_GET_LLID(elems.peer_link), 2);
+	if (ftype == PLINK_CONFIRM || (ftype == PLINK_CLOSE && ie_len == 10))
+		memcpy(&llid, PLINK_GET_PLID(elems.peer_link), 2);
+
+	rcu_read_lock();
+
+	sta = sta_info_get(sdata, mgmt->sa);
+	if (!sta && ftype != PLINK_OPEN) {
+		mpl_dbg("Mesh plink: cls or cnf from unknown peer\n");
+		rcu_read_unlock();
+		return;
+	}
+
+	if (sta && !test_sta_flags(sta, WLAN_STA_AUTH)) {
+		mpl_dbg("Mesh plink: Action frame from non-authed peer\n");
+		rcu_read_unlock();
+		return;
+	}
+
+	if (sta && sta->plink_state == NL80211_PLINK_BLOCKED) {
+		rcu_read_unlock();
+		return;
+	}
+
+	/* Now we will figure out the appropriate event... */
+	event = PLINK_UNDEFINED;
+	if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) {
+		matches_local = false;
+		switch (ftype) {
+		case PLINK_OPEN:
+			event = OPN_RJCT;
+			break;
+		case PLINK_CONFIRM:
+			event = CNF_RJCT;
+			break;
+		case PLINK_CLOSE:
+			/* avoid warning */
+			break;
+		}
+	}
+
+	if (!sta && !matches_local) {
+		rcu_read_unlock();
+		reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
+		llid = 0;
+		mesh_plink_frame_tx(sdata, PLINK_CLOSE, mgmt->sa, llid,
+				    plid, reason);
+		return;
+	} else if (!sta) {
+		/* ftype == PLINK_OPEN */
+		u32 rates;
+
+		rcu_read_unlock();
+
+		if (!mesh_plink_free_count(sdata)) {
+			mpl_dbg("Mesh plink error: no more free plinks\n");
+			return;
+		}
+
+		rates = ieee80211_sta_get_rates(local, &elems, rx_status->band);
+		sta = mesh_plink_alloc(sdata, mgmt->sa, rates);
+		if (!sta) {
+			mpl_dbg("Mesh plink error: plink table full\n");
+			return;
+		}
+		if (sta_info_insert_rcu(sta)) {
+			rcu_read_unlock();
+			return;
+		}
+		event = OPN_ACPT;
+		spin_lock_bh(&sta->lock);
+	} else if (matches_local) {
+		spin_lock_bh(&sta->lock);
+		switch (ftype) {
+		case PLINK_OPEN:
+			if (!mesh_plink_free_count(sdata) ||
+			    (sta->plid && sta->plid != plid))
+				event = OPN_IGNR;
+			else
+				event = OPN_ACPT;
+			break;
+		case PLINK_CONFIRM:
+			if (!mesh_plink_free_count(sdata) ||
+			    (sta->llid != llid || sta->plid != plid))
+				event = CNF_IGNR;
+			else
+				event = CNF_ACPT;
+			break;
+		case PLINK_CLOSE:
+			if (sta->plink_state == NL80211_PLINK_ESTAB)
+				/* Do not check for llid or plid. This does not
+				 * follow the standard but since multiple plinks
+				 * per sta are not supported, it is necessary in
+				 * order to avoid a livelock when MP A sees an
+				 * establish peer link to MP B but MP B does not
+				 * see it. This can be caused by a timeout in
+				 * B's peer link establishment or B being
+				 * restarted.
+				 */
+				event = CLS_ACPT;
+			else if (sta->plid != plid)
+				event = CLS_IGNR;
+			else if (ie_len == 10 && sta->llid != llid)
+				event = CLS_IGNR;
+			else
+				event = CLS_ACPT;
+			break;
+		default:
+			mpl_dbg("Mesh plink: unknown frame subtype\n");
+			spin_unlock_bh(&sta->lock);
+			rcu_read_unlock();
+			return;
+		}
+	} else {
+		spin_lock_bh(&sta->lock);
+	}
+
+	mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n",
+		mgmt->sa, mplstates[sta->plink_state],
+		le16_to_cpu(sta->llid), le16_to_cpu(sta->plid),
+		event);
+	reason = 0;
+	switch (sta->plink_state) {
+		/* spin_unlock as soon as state is updated at each case */
+	case NL80211_PLINK_LISTEN:
+		switch (event) {
+		case CLS_ACPT:
+			mesh_plink_fsm_restart(sta);
+			spin_unlock_bh(&sta->lock);
+			break;
+		case OPN_ACPT:
+			sta->plink_state = NL80211_PLINK_OPN_RCVD;
+			sta->plid = plid;
+			get_random_bytes(&llid, 2);
+			sta->llid = llid;
+			mesh_plink_timer_set(sta, dot11MeshRetryTimeout(sdata));
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_OPEN, sta->sta.addr, llid,
+					    0, 0);
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr,
+					    llid, plid, 0);
+			break;
+		default:
+			spin_unlock_bh(&sta->lock);
+			break;
+		}
+		break;
+
+	case NL80211_PLINK_OPN_SNT:
+		switch (event) {
+		case OPN_RJCT:
+		case CNF_RJCT:
+			reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
+		case CLS_ACPT:	/* RJCT cases fall through with reason set */
+			if (!reason)
+				reason = cpu_to_le16(MESH_CLOSE_RCVD);
+			sta->reason = reason;
+			sta->plink_state = NL80211_PLINK_HOLDING;
+			if (!mod_plink_timer(sta,
+					     dot11MeshHoldingTimeout(sdata)))
+				sta->ignore_plink_timer = true;
+
+			llid = sta->llid;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
+					    plid, reason);
+			break;
+		case OPN_ACPT:
+			/* retry timer is left untouched */
+			sta->plink_state = NL80211_PLINK_OPN_RCVD;
+			sta->plid = plid;
+			llid = sta->llid;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
+					    plid, 0);
+			break;
+		case CNF_ACPT:
+			sta->plink_state = NL80211_PLINK_CNF_RCVD;
+			if (!mod_plink_timer(sta,
+					     dot11MeshConfirmTimeout(sdata)))
+				sta->ignore_plink_timer = true;
+
+			spin_unlock_bh(&sta->lock);
+			break;
+		default:
+			spin_unlock_bh(&sta->lock);
+			break;
+		}
+		break;
+
+	case NL80211_PLINK_OPN_RCVD:
+		switch (event) {
+		case OPN_RJCT:
+		case CNF_RJCT:
+			reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
+		case CLS_ACPT:	/* RJCT cases fall through with reason set */
+			if (!reason)
+				reason = cpu_to_le16(MESH_CLOSE_RCVD);
+			sta->reason = reason;
+			sta->plink_state = NL80211_PLINK_HOLDING;
+			if (!mod_plink_timer(sta,
+					     dot11MeshHoldingTimeout(sdata)))
+				sta->ignore_plink_timer = true;
+
+			llid = sta->llid;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
+					    plid, reason);
+			break;
+		case OPN_ACPT:
+			llid = sta->llid;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
+					    plid, 0);
+			break;
+		case CNF_ACPT:
+			del_timer(&sta->plink_timer);
+			sta->plink_state = NL80211_PLINK_ESTAB;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_inc_estab_count(sdata);
+			ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+			mpl_dbg("Mesh plink with %pM ESTABLISHED\n",
+				sta->sta.addr);
+			break;
+		default:
+			spin_unlock_bh(&sta->lock);
+			break;
+		}
+		break;
+
+	case NL80211_PLINK_CNF_RCVD:
+		switch (event) {
+		case OPN_RJCT:
+		case CNF_RJCT:
+			reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
+		case CLS_ACPT:	/* RJCT cases fall through with reason set */
+			if (!reason)
+				reason = cpu_to_le16(MESH_CLOSE_RCVD);
+			sta->reason = reason;
+			sta->plink_state = NL80211_PLINK_HOLDING;
+			if (!mod_plink_timer(sta,
+					     dot11MeshHoldingTimeout(sdata)))
+				sta->ignore_plink_timer = true;
+
+			llid = sta->llid;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
+					    plid, reason);
+			break;
+		case OPN_ACPT:
+			del_timer(&sta->plink_timer);
+			sta->plink_state = NL80211_PLINK_ESTAB;
+			llid = sta->llid;	/* an open frame carries no llid */
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_inc_estab_count(sdata);
+			ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+			mpl_dbg("Mesh plink with %pM ESTABLISHED\n", sta->sta.addr);
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
+					    plid, 0);
+			break;
+		default:
+			spin_unlock_bh(&sta->lock);
+			break;
+		}
+		break;
+
+	case NL80211_PLINK_ESTAB:
+		switch (event) {
+		case CLS_ACPT:
+			reason = cpu_to_le16(MESH_CLOSE_RCVD);
+			sta->reason = reason;
+			deactivated = __mesh_plink_deactivate(sta);
+			sta->plink_state = NL80211_PLINK_HOLDING;
+			llid = sta->llid;
+			mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
+			spin_unlock_bh(&sta->lock);
+			if (deactivated)
+				ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
+					    plid, reason);
+			break;
+		case OPN_ACPT:
+			llid = sta->llid;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
+					    plid, 0);
+			break;
+		default:
+			spin_unlock_bh(&sta->lock);
+			break;
+		}
+		break;
+	case NL80211_PLINK_HOLDING:
+		switch (event) {
+		case CLS_ACPT:
+			if (del_timer(&sta->plink_timer))
+				sta->ignore_plink_timer = 1;
+			mesh_plink_fsm_restart(sta);
+			spin_unlock_bh(&sta->lock);
+			break;
+		case OPN_ACPT:
+		case CNF_ACPT:
+		case OPN_RJCT:
+		case CNF_RJCT:
+			llid = sta->llid;
+			reason = sta->reason;
+			spin_unlock_bh(&sta->lock);
+			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr,
+					    llid, plid, reason);
+			break;
+		default:
+			spin_unlock_bh(&sta->lock);
+		}
+		break;
+	default:
+		/* should not get here, PLINK_BLOCKED is dealt with at the
+		 * beginning of the function
+		 */
+		spin_unlock_bh(&sta->lock);
+		break;
+	}
+
+	rcu_read_unlock();
+}
diff --git a/net/mac80211/michael.c b/net/mac80211/michael.c
new file mode 100644
index 00000000..408649bd
--- /dev/null
+++ b/net/mac80211/michael.c
@@ -0,0 +1,86 @@
+/*
+ * Michael MIC implementation - optimized for TKIP MIC operations
+ * Copyright 2002-2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/ieee80211.h>
+#include <asm/unaligned.h>
+
+#include "michael.h"
+
+/*
+ * One round of the Michael block function: fold the 32-bit word @val
+ * into the (l, r) state held in @mctx.
+ */
+static void michael_block(struct michael_mic_ctx *mctx, u32 val)
+{
+	u32 l = mctx->l ^ val;
+	u32 r = mctx->r;
+
+	r ^= rol32(l, 17);
+	l += r;
+	/* exchange the two bytes inside each 16-bit half of l */
+	r ^= ((l & 0xff00ff00) >> 8) | ((l & 0x00ff00ff) << 8);
+	l += r;
+	r ^= rol32(l, 3);
+	l += r;
+	r ^= ror32(l, 2);
+	l += r;
+
+	mctx->l = l;
+	mctx->r = r;
+}
+
+/*
+ * Start a Michael MIC computation: load the key into the (l, r) state and
+ * feed in the pseudo header built from the frame's addresses and priority.
+ *
+ * @mctx: state to initialise
+ * @key:  MIC key; its first 8 bytes are read as two little-endian words
+ * @hdr:  802.11 header supplying DA, SA and, for QoS data frames, the TID
+ */
+static void michael_mic_hdr(struct michael_mic_ctx *mctx, const u8 *key,
+			    struct ieee80211_hdr *hdr)
+{
+	u8 *da, *sa, tid;
+
+	da = ieee80211_get_DA(hdr);
+	sa = ieee80211_get_SA(hdr);
+	if (ieee80211_is_data_qos(hdr->frame_control))
+		tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	else
+		tid = 0;
+
+	mctx->l = get_unaligned_le32(key);
+	mctx->r = get_unaligned_le32(key + 4);
+
+	/*
+	 * A pseudo header (DA, SA, Priority, 0, 0, 0) is used in Michael MIC
+	 * calculation, but it is _not_ transmitted
+	 */
+	michael_block(mctx, get_unaligned_le32(da));
+	/*
+	 * Widen to u32 before shifting: a 16-bit value would otherwise be
+	 * promoted to signed int and could be shifted into the sign bit.
+	 */
+	michael_block(mctx, get_unaligned_le16(&da[4]) |
+			    ((u32)get_unaligned_le16(sa) << 16));
+	michael_block(mctx, get_unaligned_le32(&sa[2]));
+	michael_block(mctx, tid);
+}
+
+/*
+ * Compute the Michael MIC over the pseudo header derived from @hdr
+ * followed by @data_len bytes at @data, and store the 8-byte result
+ * (little-endian l, then r) at @mic.
+ */
+void michael_mic(const u8 *key, struct ieee80211_hdr *hdr,
+		 const u8 *data, size_t data_len, u8 *mic)
+{
+	struct michael_mic_ctx mctx;
+	const size_t nwords = data_len / 4;
+	size_t rem = data_len % 4;
+	const u8 *tail = data + nwords * 4;
+	const u8 *p;
+	u32 last;
+
+	michael_mic_hdr(&mctx, key, hdr);
+
+	/* every complete 32-bit word of the payload */
+	for (p = data; p != tail; p += 4)
+		michael_block(&mctx, get_unaligned_le32(p));
+
+	/*
+	 * Final word: the 0..3 leftover bytes in little-endian order with
+	 * the 0x5a marker placed directly above them; an all-zero word
+	 * follows as the remaining padding.
+	 */
+	last = 0x5a;
+	for (; rem > 0; rem--)
+		last = (last << 8) | tail[rem - 1];
+
+	michael_block(&mctx, last);
+	michael_block(&mctx, 0);
+
+	put_unaligned_le32(mctx.l, mic);
+	put_unaligned_le32(mctx.r, mic + 4);
+}
diff --git a/net/mac80211/michael.h b/net/mac80211/michael.h
new file mode 100644
index 00000000..3b848dad
--- /dev/null
+++ b/net/mac80211/michael.h
@@ -0,0 +1,24 @@
+/*
+ * Michael MIC implementation - optimized for TKIP MIC operations
+ * Copyright 2002-2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MICHAEL_H
+#define MICHAEL_H
+
+#include <linux/types.h>
+
+#define MICHAEL_MIC_LEN 8
+
+/* Running state of a Michael MIC computation: two 32-bit halves. */
+struct michael_mic_ctx {
+	u32 l;
+	u32 r;
+};
+
+/* Only used through a pointer here, so a forward declaration suffices. */
+struct ieee80211_hdr;
+
+/*
+ * Compute the MIC for @data_len bytes at @data belonging to the frame
+ * described by @hdr, keyed with @key, and write MICHAEL_MIC_LEN bytes
+ * to @mic.
+ */
+void michael_mic(const u8 *key, struct ieee80211_hdr *hdr,
+		 const u8 *data, size_t data_len, u8 *mic);
+
+#endif /* MICHAEL_H */
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
new file mode 100644
index 00000000..1563250a
--- /dev/null
+++ b/net/mac80211/mlme.c
@@ -0,0 +1,2671 @@
+/*
+ * BSS client mode implementation
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/pm_qos_params.h>
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "led.h"
+
+/* Module parameter (mode 0644): nullfunc tx attempts before giving up. */
+static int max_nullfunc_tries = 2;
+module_param(max_nullfunc_tries, int, 0644);
+MODULE_PARM_DESC(max_nullfunc_tries,
+		 "Maximum nullfunc tx tries before disconnecting (reason 4).");
+
+/* Module parameter (mode 0644): probe attempts before giving up. */
+static int max_probe_tries = 5;
+module_param(max_probe_tries, int, 0644);
+MODULE_PARM_DESC(max_probe_tries,
+		 "Maximum probe tries before disconnecting (reason 4).");
+
+/*
+ * Beacon loss timeout is calculated as N frames times the
+ * advertised beacon interval.  This may need to be somewhat
+ * higher than what hardware might detect to account for
+ * delays in the host processing frames. But since we also
+ * probe on beacon miss before declaring the connection lost
+ * default to what we want.
+ */
+#define IEEE80211_BEACON_LOSS_COUNT	7
+
+/*
+ * Time the connection can be idle before we probe
+ * it to see if we can still talk to the AP.
+ */
+#define IEEE80211_CONNECTION_IDLE_TIME	(30 * HZ)
+/*
+ * Time we wait for a probe response after sending
+ * a probe request because of beacon loss or for
+ * checking the connection still works.
+ */
+static int probe_wait_ms = 500;
+module_param(probe_wait_ms, int, 0644);
+MODULE_PARM_DESC(probe_wait_ms,
+		 "Maximum time(ms) to wait for probe response"
+		 " before disconnecting (reason 4).");
+
+/*
+ * Weight given to the latest Beacon frame when calculating average signal
+ * strength for Beacon frames received in the current BSS. This must be
+ * between 1 and 15.
+ */
+#define IEEE80211_SIGNAL_AVE_WEIGHT	3
+
+/*
+ * How many Beacon frames need to have been used in average signal strength
+ * before starting to indicate signal change events.
+ */
+#define IEEE80211_SIGNAL_AVE_MIN_COUNT	4
+
+/* Bit numbers used with ifmgd->timers_running */
+#define TMR_RUNNING_TIMER	0
+#define TMR_RUNNING_CHANSW	1
+
+/*
+ * All cfg80211 functions have to be called outside a locked
+ * section so that they can acquire a lock themselves... This
+ * is much simpler than queuing up things in cfg80211, but we
+ * do need some indirection for that here.
+ */
+/* Follow-up cfg80211 call that the caller of an RX handler must make. */
+enum rx_mgmt_action {
+	/* no action required */
+	RX_MGMT_NONE,
+
+	/* caller must call cfg80211_send_deauth() */
+	RX_MGMT_CFG80211_DEAUTH,
+
+	/* caller must call cfg80211_send_disassoc() */
+	RX_MGMT_CFG80211_DISASSOC,
+};
+
+/* utils */
+
+/* Lockdep assertion that the caller holds the managed-mode mutex ifmgd->mtx. */
+static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd)
+{
+	lockdep_assert_held(&ifmgd->mtx);
+}
+
+/*
+ * Several work items (and connection probing) share ifmgd->timer.
+ * Re-arm it only when it is idle or when @timeout lies before the
+ * expiry that is already programmed, so an earlier request is never
+ * pushed back. Consequently every timeout after the first must be
+ * re-requested through this function; the work run from the timer
+ * does that. Must be called with ifmgd->mtx held.
+ */
+static void run_again(struct ieee80211_if_managed *ifmgd,
+			     unsigned long timeout)
+{
+	ASSERT_MGD_MTX(ifmgd);
+
+	/* already pending and due no later than requested: leave it alone */
+	if (timer_pending(&ifmgd->timer) &&
+	    !time_before(timeout, ifmgd->timer.expires))
+		return;
+
+	mod_timer(&ifmgd->timer, timeout);
+}
+
+/*
+ * Push the beacon monitor timer out by the interface's beacon_timeout.
+ * Nothing is armed when the hardware advertises IEEE80211_HW_BEACON_FILTER.
+ */
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	unsigned long deadline;
+
+	if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER)
+		return;
+
+	deadline = round_jiffies_up(jiffies + ifmgd->beacon_timeout);
+	mod_timer(&ifmgd->bcn_mon_timer, deadline);
+}
+
+/*
+ * Restart the connection idle timer (IEEE80211_CONNECTION_IDLE_TIME from
+ * now) and clear the probe counter. No-op while not associated or when
+ * the hardware advertises IEEE80211_HW_CONNECTION_MONITOR.
+ */
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	unsigned long deadline;
+
+	if (unlikely(!ifmgd->associated))
+		return;
+
+	if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+		return;
+
+	deadline = round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME);
+	mod_timer(&ifmgd->conn_mon_timer, deadline);
+
+	ifmgd->probe_send_count = 0;
+}
+
+/* Convert a contention window exponent into the window size 2^ecw - 1. */
+static int ecw2cw(int ecw)
+{
+	int cw = 1 << ecw;
+
+	return cw - 1;
+}
+
+/*
+ * ieee80211_enable_ht should be called only after the operating band
+ * has been determined as ht configuration depends on the hw's
+ * HT abilities for a specific band.
+ *
+ * Picks the channel type (no HT, HT20 or HT40+/-) from the AP's HT
+ * information @hti and capability flags @ap_ht_cap_flags, applies it to
+ * the hardware configuration and stores the HT operation mode.
+ *
+ * Returns BSS_CHANGED_HT when the HT enable state, the operation mode or
+ * the channel type differs from what was stored before, otherwise 0.
+ */
+static u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
+			       struct ieee80211_ht_info *hti,
+			       const u8 *bssid, u16 ap_ht_cap_flags)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_supported_band *sband;
+	struct sta_info *sta;
+	u32 changed = 0;
+	int hti_cfreq;
+	u16 ht_opmode;
+	bool enable_ht = true;
+	enum nl80211_channel_type prev_chantype;
+	enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	/* remembered so a channel type change can be detected below */
+	prev_chantype = sdata->vif.bss_conf.channel_type;
+
+	/* HT is not supported */
+	if (!sband->ht_cap.ht_supported)
+		enable_ht = false;
+
+	if (enable_ht) {
+		hti_cfreq = ieee80211_channel_to_frequency(hti->control_chan,
+							   sband->band);
+		/* check that channel matches the right operating channel */
+		if (local->hw.conf.channel->center_freq != hti_cfreq) {
+			/* Some APs mess this up, evidently.
+			 * Netgear WNDR3700 sometimes reports 4 higher than
+			 * the actual channel, for instance.
+			 */
+			printk(KERN_DEBUG
+			       "%s: Wrong control channel in association"
+			       " response: configured center-freq: %d"
+			       " hti-cfreq: %d  hti->control_chan: %d"
+			       " band: %d.  Disabling HT.\n",
+			       sdata->name,
+			       local->hw.conf.channel->center_freq,
+			       hti_cfreq, hti->control_chan,
+			       sband->band);
+			enable_ht = false;
+		}
+	}
+
+	if (enable_ht) {
+		/* HT20 unless AP, local band and channel flags all allow HT40 */
+		channel_type = NL80211_CHAN_HT20;
+
+		if (!(ap_ht_cap_flags & IEEE80211_HT_CAP_40MHZ_INTOLERANT) &&
+		    (sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) &&
+		    (hti->ht_param & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) {
+			switch(hti->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+			case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+				if (!(local->hw.conf.channel->flags &
+				    IEEE80211_CHAN_NO_HT40PLUS))
+					channel_type = NL80211_CHAN_HT40PLUS;
+				break;
+			case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+				if (!(local->hw.conf.channel->flags &
+				    IEEE80211_CHAN_NO_HT40MINUS))
+					channel_type = NL80211_CHAN_HT40MINUS;
+				break;
+			}
+		}
+	}
+
+	/* keep the type recorded for a temporary channel in step */
+	if (local->tmp_channel)
+		local->tmp_channel_type = channel_type;
+
+	if (!ieee80211_set_channel_type(local, sdata, channel_type)) {
+		/* can only fail due to HT40+/- mismatch */
+		channel_type = NL80211_CHAN_HT20;
+		WARN_ON(!ieee80211_set_channel_type(local, sdata, channel_type));
+	}
+
+	/* channel_type change automatically detected */
+	ieee80211_hw_config(local, 0);
+
+	if (prev_chantype != channel_type) {
+		/* tell rate control about the new type for the @bssid station */
+		rcu_read_lock();
+		sta = sta_info_get(sdata, bssid);
+		if (sta)
+			rate_control_rate_update(local, sband, sta,
+						 IEEE80211_RC_HT_CHANGED,
+						 channel_type);
+		rcu_read_unlock();
+	}
+
+	ht_opmode = le16_to_cpu(hti->operation_mode);
+
+	/* if bss configuration changed store the new one */
+	if (sdata->ht_opmode_valid != enable_ht ||
+	    sdata->vif.bss_conf.ht_operation_mode != ht_opmode ||
+	    prev_chantype != channel_type) {
+		changed |= BSS_CHANGED_HT;
+		sdata->vif.bss_conf.ht_operation_mode = ht_opmode;
+		sdata->ht_opmode_valid = enable_ht;
+	}
+
+	return changed;
+}
+
+/* frame sending functions */
+
+/*
+ * Build a deauthentication or disassociation frame (selected by @stype)
+ * addressed to @bssid carrying @reason, report it to cfg80211 and then
+ * either transmit it (@send_frame) or free it.
+ *
+ * A non-NULL @cookie selects the __cfg80211_send_*() variants of the
+ * notification calls.
+ */
+static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
+					   const u8 *bssid, u16 stype, u16 reason,
+					   void *cookie, bool send_frame)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+	u8 *frame;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt));
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for "
+		       "deauth/disassoc frame\n", sdata->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	/* first 24 bytes: zeroed, then frame control and addresses set */
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, bssid, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, bssid, ETH_ALEN);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
+
+	/* two more bytes for the reason code */
+	skb_put(skb, 2);
+	/* u.deauth.reason_code == u.disassoc.reason_code */
+	mgmt->u.deauth.reason_code = cpu_to_le16(reason);
+
+	frame = (u8 *)mgmt;
+	if (stype == IEEE80211_STYPE_DEAUTH) {
+		if (cookie)
+			__cfg80211_send_deauth(sdata->dev, frame, skb->len);
+		else
+			cfg80211_send_deauth(sdata->dev, frame, skb->len);
+	} else {
+		if (cookie)
+			__cfg80211_send_disassoc(sdata->dev, frame, skb->len);
+		else
+			cfg80211_send_disassoc(sdata->dev, frame, skb->len);
+	}
+
+	if (!(ifmgd->flags & IEEE80211_STA_MFP_ENABLED))
+		IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
+	if (!send_frame) {
+		kfree_skb(skb);
+		return;
+	}
+
+	ieee80211_tx_skb(sdata, skb);
+}
+
+/*
+ * Obtain a PS-Poll frame for @sdata, set the power management bit in its
+ * frame control field and transmit it unencrypted. Silently returns if
+ * no frame could be obtained.
+ */
+void ieee80211_send_pspoll(struct ieee80211_local *local,
+			   struct ieee80211_sub_if_data *sdata)
+{
+	struct sk_buff *skb = ieee80211_pspoll_get(&local->hw, &sdata->vif);
+	struct ieee80211_pspoll *frame;
+
+	if (!skb)
+		return;
+
+	frame = (struct ieee80211_pspoll *) skb->data;
+	frame->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+/*
+ * Obtain a nullfunc frame for @sdata and transmit it unencrypted. When
+ * @powersave is non-zero the power management bit is set in the frame
+ * control field. Silently returns if no frame could be obtained.
+ */
+void ieee80211_send_nullfunc(struct ieee80211_local *local,
+			     struct ieee80211_sub_if_data *sdata,
+			     int powersave)
+{
+	struct sk_buff *skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif);
+	struct ieee80211_hdr_3addr *hdr;
+
+	if (!skb)
+		return;
+
+	if (powersave) {
+		hdr = (struct ieee80211_hdr_3addr *) skb->data;
+		hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
+	}
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+/*
+ * Build and transmit an unencrypted 30-byte nullfunc data frame with both
+ * the ToDS and FromDS bits set: addr1/addr3 carry the BSSID and
+ * addr2/addr4 the interface's own address. Only valid on station
+ * interfaces (warns and returns otherwise).
+ */
+static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
+					  struct ieee80211_sub_if_data *sdata)
+{
+	const u8 *bssid = sdata->u.mgd.bssid;
+	const u8 *own_addr = sdata->vif.addr;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb;
+
+	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+		return;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for 4addr "
+		       "nullfunc frame\n", sdata->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	hdr = (struct ieee80211_hdr *) skb_put(skb, 30);
+	memset(hdr, 0, 30);
+	hdr->frame_control =
+		cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
+			    IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+	memcpy(hdr->addr1, bssid, ETH_ALEN);
+	memcpy(hdr->addr2, own_addr, ETH_ALEN);
+	memcpy(hdr->addr3, bssid, ETH_ALEN);
+	memcpy(hdr->addr4, own_addr, ETH_ALEN);
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+/* spectrum management related things */
+/*
+ * Work item completing a channel switch: make csa_channel the operating
+ * channel, reconfigure the hardware when the driver has no channel_switch
+ * callback, and wake the queues stopped for the CSA. The CSA-received
+ * flag is cleared whether or not the interface is still associated.
+ */
+static void ieee80211_chswitch_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work);
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_local *local = sdata->local;
+
+	if (!ieee80211_sdata_running(sdata))
+		return;
+
+	mutex_lock(&ifmgd->mtx);
+
+	if (ifmgd->associated) {
+		local->oper_channel = local->csa_channel;
+
+		/* call "hw_config" only if doing sw channel switch */
+		if (!local->ops->channel_switch)
+			ieee80211_hw_config(local,
+					    IEEE80211_CONF_CHANGE_CHANNEL);
+
+		/* XXX: shouldn't really modify cfg80211-owned data! */
+		ifmgd->associated->channel = local->oper_channel;
+
+		ieee80211_wake_queues_by_reason(&local->hw,
+						IEEE80211_QUEUE_STOP_REASON_CSA);
+	}
+
+	ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED;
+	mutex_unlock(&ifmgd->mtx);
+}
+
+/*
+ * Exported completion hook for a channel switch on @vif: traces the
+ * result and queues the channel switch work.
+ */
+void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+
+	trace_api_chswitch_done(sdata, success);
+
+	/*
+	 * On failure stay on the old channel by pointing csa_channel back
+	 * at the current operating channel. This situation is not handled
+	 * well yet; possibly the association should simply be dropped.
+	 */
+	if (!success)
+		local->csa_channel = local->oper_channel;
+
+	ieee80211_queue_work(&local->hw, &sdata->u.mgd.chswitch_work);
+}
+EXPORT_SYMBOL(ieee80211_chswitch_done);
+
+/*
+ * Timer callback for a pending channel switch; @data is the
+ * struct ieee80211_sub_if_data pointer. Queues the channel switch work,
+ * or, while the device is quiescing, only sets TMR_RUNNING_CHANSW in
+ * timers_running.
+ */
+static void ieee80211_chswitch_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata = (void *) data;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_local *local = sdata->local;
+
+	if (!local->quiescing) {
+		ieee80211_queue_work(&local->hw, &ifmgd->chswitch_work);
+		return;
+	}
+
+	set_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running);
+}
+
+/*
+ * Act on a channel switch announcement element received for @bss.
+ *
+ * @sdata:     managed interface; ifmgd->mtx must be held (asserted)
+ * @sw_elem:   the channel switch element (new channel, mode, count)
+ * @bss:       mac80211 BSS data, embedded as priv in a struct cfg80211_bss
+ * @timestamp: passed on to the driver's channel switch callback
+ *
+ * The announcement is ignored when not associated, while scanning, when
+ * a CSA is already being processed, or when the new channel is unknown
+ * or disabled.
+ */
+void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
+				      struct ieee80211_channel_sw_ie *sw_elem,
+				      struct ieee80211_bss *bss,
+				      u64 timestamp)
+{
+	struct cfg80211_bss *cbss =
+		container_of((void *)bss, struct cfg80211_bss, priv);
+	struct ieee80211_channel *new_ch;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num,
+						      cbss->channel->band);
+
+	ASSERT_MGD_MTX(ifmgd);
+
+	if (!ifmgd->associated)
+		return;
+
+	if (sdata->local->scanning)
+		return;
+
+	/* Disregard subsequent beacons if we are already running a timer
+	   processing a CSA */
+
+	if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED)
+		return;
+
+	new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq);
+	if (!new_ch || new_ch->flags & IEEE80211_CHAN_DISABLED)
+		return;
+
+	sdata->local->csa_channel = new_ch;
+
+	if (sdata->local->ops->channel_switch) {
+		/* use driver's channel switch callback */
+		struct ieee80211_channel_switch ch_switch;
+		memset(&ch_switch, 0, sizeof(ch_switch));
+		ch_switch.timestamp = timestamp;
+		/* non-zero mode: stop transmitting until the switch is done */
+		if (sw_elem->mode) {
+			ch_switch.block_tx = true;
+			ieee80211_stop_queues_by_reason(&sdata->local->hw,
+					IEEE80211_QUEUE_STOP_REASON_CSA);
+		}
+		ch_switch.channel = new_ch;
+		ch_switch.count = sw_elem->count;
+		ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED;
+		drv_channel_switch(sdata->local, &ch_switch);
+		return;
+	}
+
+	/* channel switch handled in software */
+	if (sw_elem->count <= 1) {
+		/* switch is due now: run the work without arming the timer */
+		ieee80211_queue_work(&sdata->local->hw, &ifmgd->chswitch_work);
+	} else {
+		if (sw_elem->mode)
+			ieee80211_stop_queues_by_reason(&sdata->local->hw,
+					IEEE80211_QUEUE_STOP_REASON_CSA);
+		ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED;
+		/*
+		 * NOTE(review): count * beacon_interval is handed to
+		 * msecs_to_jiffies() unconverted - presumably the beacon
+		 * interval (in TU) is being approximated as ms; confirm.
+		 */
+		mod_timer(&ifmgd->chswitch_timer,
+			  jiffies +
+			  msecs_to_jiffies(sw_elem->count *
+					   cbss->beacon_interval));
+	}
+}
+
+/*
+ * Apply a power constraint element: when the BSS advertises spectrum
+ * management and the one-octet constraint is both within the channel's
+ * max_power and different from the stored level, store it and
+ * reconfigure the hardware.
+ */
+static void ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata,
+					u16 capab_info, u8 *pwr_constr_elem,
+					u8 pwr_constr_elem_len)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_conf *conf = &local->hw.conf;
+	u8 level;
+
+	if (!(capab_info & WLAN_CAPABILITY_SPECTRUM_MGMT))
+		return;
+
+	/* Power constraint IE length should be 1 octet */
+	if (pwr_constr_elem_len != 1)
+		return;
+
+	level = *pwr_constr_elem;
+	if (level > conf->channel->max_power)
+		return;
+	if (level == local->power_constr_level)
+		return;
+
+	local->power_constr_level = level;
+	ieee80211_hw_config(local, 0);
+}
+
+/*
+ * Exported: re-enable dynamic powersave for @vif by restoring the
+ * user-configured dynamic PS timeout. Warns unless @vif is a station
+ * interface on hardware that supports PS but not dynamic PS itself.
+ */
+void ieee80211_enable_dyn_ps(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+
+	WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION ||
+		!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) ||
+		(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS));
+
+	local->disable_dynamic_ps = false;
+	local->hw.conf.dynamic_ps_timeout = local->dynamic_ps_user_timeout;
+}
+EXPORT_SYMBOL(ieee80211_enable_dyn_ps);
+
+/*
+ * Exported: disable dynamic powersave for @vif. The dynamic PS timeout is
+ * zeroed, the dynamic PS timer is stopped and the PS enable work is
+ * queued right away. Warns unless @vif is a station interface on
+ * hardware that supports PS but not dynamic PS itself.
+ */
+void ieee80211_disable_dyn_ps(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+
+	WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION ||
+		!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) ||
+		(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS));
+
+	local->disable_dynamic_ps = true;
+	local->hw.conf.dynamic_ps_timeout = 0;
+	del_timer_sync(&local->dynamic_ps_timer);
+	ieee80211_queue_work(&local->hw, &local->dynamic_ps_enable_work);
+}
+EXPORT_SYMBOL(ieee80211_disable_dyn_ps);
+
+/* powersave */
+/*
+ * Enter powersave for @sdata, either deferred through the dynamic PS
+ * timer or immediately by setting IEEE80211_CONF_PS.
+ */
+static void ieee80211_enable_ps(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_conf *conf = &local->hw.conf;
+
+	/*
+	 * If we are scanning right now then the parameters will
+	 * take effect when scan finishes.
+	 */
+	if (local->scanning)
+		return;
+
+	/* software dynamic PS: only arm the timer for now */
+	if (conf->dynamic_ps_timeout > 0 &&
+	    !(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)) {
+		mod_timer(&local->dynamic_ps_timer, jiffies +
+			  msecs_to_jiffies(conf->dynamic_ps_timeout));
+		return;
+	}
+
+	if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
+		ieee80211_send_nullfunc(local, sdata, 1);
+
+		/* with ack status reporting, PS is not enabled here */
+		if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+			return;
+	}
+
+	conf->flags |= IEEE80211_CONF_PS;
+	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+}
+
+/*
+ * Bring the hardware PS state in line with local->ps_sdata: enable PS
+ * for that interface when one is set; otherwise, if PS is currently on,
+ * turn it off and stop the dynamic PS timer and enable work.
+ */
+static void ieee80211_change_ps(struct ieee80211_local *local)
+{
+	struct ieee80211_conf *conf = &local->hw.conf;
+
+	if (local->ps_sdata) {
+		ieee80211_enable_ps(local, local->ps_sdata);
+		return;
+	}
+
+	if (!(conf->flags & IEEE80211_CONF_PS))
+		return;
+
+	conf->flags &= ~IEEE80211_CONF_PS;
+	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+	del_timer_sync(&local->dynamic_ps_timer);
+	cancel_work_sync(&local->dynamic_ps_enable_work);
+}
+
+/*
+ * Decide whether @sdata may enter powersave right now. All of these must
+ * hold: powersave is switched on, the AP is not marked broken, we are
+ * associated and the BSS has beacon IEs, no beacon/connection poll is in
+ * progress, and the station entry for the BSSID is authorized.
+ */
+static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+	struct sta_info *ap;
+	bool authorized = false;
+
+	if (!mgd->powersave || mgd->broken_ap)
+		return false;
+
+	if (!mgd->associated || !mgd->associated->beacon_ies)
+		return false;
+
+	if (mgd->flags & (IEEE80211_STA_BEACON_POLL |
+			  IEEE80211_STA_CONNECTION_POLL))
+		return false;
+
+	/* a missing station entry counts as not authorized */
+	rcu_read_lock();
+	ap = sta_info_get(sdata, mgd->bssid);
+	if (ap)
+		authorized = !!(get_sta_flags(ap) & WLAN_STA_AUTHORIZED);
+	rcu_read_unlock();
+
+	return authorized;
+}
+
+/*
+ * Re-evaluate which interface, if any, may use powersave and apply the
+ * result through ieee80211_change_ps().
+ *
+ * @latency: tolerated network latency in usec; a negative value means
+ *	"look it up via pm_qos_request(PM_QOS_NETWORK_LATENCY)".
+ *
+ * Powersave is only considered when exactly one running station
+ * interface exists, no running AP interface was seen before the scan of
+ * the interface list stopped, the work list is empty and
+ * ieee80211_powersave_allowed() agrees.
+ *
+ * need to hold RTNL or interface lock
+ */
+void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
+{
+	struct ieee80211_sub_if_data *sdata, *found = NULL;
+	int count = 0;
+	int timeout;
+
+	if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS)) {
+		local->ps_sdata = NULL;
+		return;
+	}
+
+	/* pending work items: no powersave for now */
+	if (!list_empty(&local->work_list)) {
+		local->ps_sdata = NULL;
+		goto change;
+	}
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+		if (sdata->vif.type == NL80211_IFTYPE_AP) {
+			/* If an AP vif is found, then disable PS
+			 * by setting the count to zero thereby setting
+			 * ps_sdata to NULL.
+			 */
+			count = 0;
+			break;
+		}
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
+			continue;
+		found = sdata;
+		count++;
+	}
+
+	if (count == 1 && ieee80211_powersave_allowed(found)) {
+		struct ieee80211_conf *conf = &local->hw.conf;
+		s32 beaconint_us;
+
+		if (latency < 0)
+			latency = pm_qos_request(PM_QOS_NETWORK_LATENCY);
+
+		beaconint_us = ieee80211_tu_to_usec(
+					found->vif.bss_conf.beacon_int);
+
+		/* a negative forced timeout means "derive it from latency" */
+		timeout = local->dynamic_ps_forced_timeout;
+		if (timeout < 0) {
+			/*
+			 * Go to full PSM if the user configures a very low
+			 * latency requirement.
+			 * The 2000 second value is there for compatibility
+			 * until the PM_QOS_NETWORK_LATENCY is configured
+			 * with real values.
+			 */
+			if (latency > (1900 * USEC_PER_MSEC) &&
+			    latency != (2000 * USEC_PER_SEC))
+				timeout = 0;
+			else
+				timeout = 100;
+		}
+		local->dynamic_ps_user_timeout = timeout;
+		if (!local->disable_dynamic_ps)
+			conf->dynamic_ps_timeout =
+				local->dynamic_ps_user_timeout;
+
+		/* one beacon interval already exceeds the latency budget */
+		if (beaconint_us > latency) {
+			local->ps_sdata = NULL;
+		} else {
+			struct ieee80211_bss *bss;
+			int maxslp = 1;
+			u8 dtimper;
+
+			bss = (void *)found->u.mgd.associated->priv;
+			dtimper = bss->dtim_period;
+
+			/* If the TIM IE is invalid, pretend the value is 1 */
+			if (!dtimper)
+				dtimper = 1;
+			else if (dtimper > 1)
+				/*
+				 * NOTE(review): beaconint_us is not checked
+				 * for zero before this division - presumably
+				 * beacon_int is never 0 while associated;
+				 * confirm.
+				 */
+				maxslp = min_t(int, dtimper,
+						    latency / beaconint_us);
+
+			local->hw.conf.max_sleep_period = maxslp;
+			local->hw.conf.ps_dtim_period = dtimper;
+			local->ps_sdata = found;
+		}
+	} else {
+		local->ps_sdata = NULL;
+	}
+
+ change:
+	ieee80211_change_ps(local);
+}
+
+/*
+ * Work item leaving powersave: clear IEEE80211_CONF_PS if it is set,
+ * reconfigure the hardware and wake the queues that were stopped with
+ * IEEE80211_QUEUE_STOP_REASON_PS.
+ */
+void ieee80211_dynamic_ps_disable_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local,
+			     dynamic_ps_disable_work);
+	struct ieee80211_conf *conf = &local->hw.conf;
+
+	if (conf->flags & IEEE80211_CONF_PS) {
+		conf->flags &= ~IEEE80211_CONF_PS;
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+	}
+
+	ieee80211_wake_queues_by_reason(&local->hw,
+					IEEE80211_QUEUE_STOP_REASON_PS);
+}
+
+/*
+ * Work item that tries to put the interface in local->ps_sdata into
+ * powersave.
+ *
+ * Returns early when there is no PS interface or PS is already on, and
+ * re-arms the dynamic PS timer when any queue is stopped. With
+ * IEEE80211_HW_PS_NULLFUNC_STACK and no acked nullfunc yet, a nullfunc
+ * frame with the PM bit is sent first (or the timer is re-armed if the
+ * driver still has frames pending). IEEE80211_CONF_PS is set once the
+ * nullfunc has been acked, or at once on hardware that does not combine
+ * ack status reporting with the nullfunc stack.
+ */
+void ieee80211_dynamic_ps_enable_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local,
+			     dynamic_ps_enable_work);
+	struct ieee80211_sub_if_data *sdata = local->ps_sdata;
+	struct ieee80211_if_managed *ifmgd;
+	unsigned long flags;
+	int q;
+
+	/* can only happen when PS was just disabled anyway */
+	if (!sdata)
+		return;
+
+	/* derive the member pointer only once sdata is known to be valid */
+	ifmgd = &sdata->u.mgd;
+
+	if (local->hw.conf.flags & IEEE80211_CONF_PS)
+		return;
+
+	/*
+	 * transmission can be stopped by others which leads to
+	 * dynamic_ps_timer expiry. Postpone the ps timer if it
+	 * is not the actual idle state.
+	 */
+	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+	for (q = 0; q < local->hw.queues; q++) {
+		if (local->queue_stop_reasons[q]) {
+			spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+					       flags);
+			mod_timer(&local->dynamic_ps_timer, jiffies +
+				  msecs_to_jiffies(
+				  local->hw.conf.dynamic_ps_timeout));
+			return;
+		}
+	}
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+	if ((local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) &&
+	    (!(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED))) {
+		netif_tx_stop_all_queues(sdata->dev);
+
+		if (drv_tx_frames_pending(local))
+			mod_timer(&local->dynamic_ps_timer, jiffies +
+				  msecs_to_jiffies(
+				  local->hw.conf.dynamic_ps_timeout));
+		else {
+			ieee80211_send_nullfunc(local, sdata, 1);
+			/* Flush to get the tx status of nullfunc frame */
+			drv_flush(local, false);
+		}
+	}
+
+	if (!((local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) &&
+	      (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)) ||
+	    (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) {
+		ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
+		local->hw.conf.flags |= IEEE80211_CONF_PS;
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+	}
+
+	netif_tx_wake_all_queues(sdata->dev);
+}
+
+/*
+ * Dynamic PS timer callback; @data is the struct ieee80211_local pointer.
+ * Queues the PS enable work unless the device is quiescing or suspended.
+ */
+void ieee80211_dynamic_ps_timer(unsigned long data)
+{
+	struct ieee80211_local *local = (struct ieee80211_local *) data;
+
+	if (!local->quiescing && !local->suspended)
+		ieee80211_queue_work(&local->hw,
+				     &local->dynamic_ps_enable_work);
+}
+
+/* MLME */
+/*
+ * Parse a WMM parameter element and program the per-AC TX queue
+ * parameters into the driver through drv_conf_tx().
+ *
+ * @wmm_param:     start of the element data, may be NULL
+ * @wmm_param_len: number of bytes available at @wmm_param
+ *
+ * Nothing is done when the driver has no conf_tx callback, the hardware
+ * has fewer than 4 queues, the element is missing, shorter than 8 bytes
+ * or not version 1, or its parameter set count equals the one applied
+ * last. Otherwise the BSS is marked QoS-enabled and BSS_CHANGED_QOS is
+ * notified after all records have been processed.
+ */
+static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
+				     struct ieee80211_sub_if_data *sdata,
+				     u8 *wmm_param, size_t wmm_param_len)
+{
+	struct ieee80211_tx_queue_params params;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	size_t left;
+	int count;
+	u8 *pos, uapsd_queues = 0;
+
+	if (!local->ops->conf_tx)
+		return;
+
+	if (local->hw.queues < 4)
+		return;
+
+	if (!wmm_param)
+		return;
+
+	if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1)
+		return;
+
+	if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED)
+		uapsd_queues = local->uapsd_queues;
+
+	/* low nibble of byte 6 identifies the parameter set; skip repeats */
+	count = wmm_param[6] & 0x0f;
+	if (count == ifmgd->wmm_last_param_set)
+		return;
+	ifmgd->wmm_last_param_set = count;
+
+	/* 4-byte per-AC records follow the first 8 bytes */
+	pos = wmm_param + 8;
+	left = wmm_param_len - 8;
+
+	memset(&params, 0, sizeof(params));
+
+	/* the ACM bitmap is rebuilt from scratch below */
+	local->wmm_acm = 0;
+	for (; left >= 4; left -= 4, pos += 4) {
+		int aci = (pos[0] >> 5) & 0x03;
+		int acm = (pos[0] >> 4) & 0x01;
+		bool uapsd = false;
+		int queue;
+
+		/* map the AC index to a hardware queue number (0..3) */
+		switch (aci) {
+		case 1: /* AC_BK */
+			queue = 3;
+			if (acm)
+				local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
+			if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK)
+				uapsd = true;
+			break;
+		case 2: /* AC_VI */
+			queue = 1;
+			if (acm)
+				local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
+			if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI)
+				uapsd = true;
+			break;
+		case 3: /* AC_VO */
+			queue = 0;
+			if (acm)
+				local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
+			if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
+				uapsd = true;
+			break;
+		case 0: /* AC_BE */
+		default:
+			queue = 2;
+			if (acm)
+				local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
+			if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE)
+				uapsd = true;
+			break;
+		}
+
+		/* byte 0: AIFS; byte 1: ECWmax/ECWmin nibbles; bytes 2-3: TXOP */
+		params.aifs = pos[0] & 0x0f;
+		params.cw_max = ecw2cw((pos[1] & 0xf0) >> 4);
+		params.cw_min = ecw2cw(pos[1] & 0x0f);
+		params.txop = get_unaligned_le16(pos + 2);
+		params.uapsd = uapsd;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+		wiphy_debug(local->hw.wiphy,
+			    "WMM queue=%d aci=%d acm=%d aifs=%d "
+			    "cWmin=%d cWmax=%d txop=%d uapsd=%d\n",
+			    queue, aci, acm,
+			    params.aifs, params.cw_min, params.cw_max,
+			    params.txop, params.uapsd);
+#endif
+		/* a driver failure is only logged; later records still apply */
+		if (drv_conf_tx(local, queue, &params))
+			wiphy_debug(local->hw.wiphy,
+				    "failed to set TX queue parameters for queue %d\n",
+				    queue);
+	}
+
+	/* enable WMM or activate new settings */
+	sdata->vif.bss_conf.qos = true;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+}
+
+/*
+ * Derive CTS protection, short preamble and short slot settings from the
+ * capability field @capab and, when @erp_valid, the ERP byte @erp; store
+ * whatever differs in the interface's bss_conf.
+ *
+ * Returns the BSS_CHANGED_ERP_* bits for the settings that changed.
+ */
+static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
+					   u16 capab, bool erp_valid, u8 erp)
+{
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	bool cts_prot = false;
+	bool short_preamble;
+	bool short_slot;
+	u32 changed = 0;
+
+	if (erp_valid) {
+		cts_prot = !!(erp & WLAN_ERP_USE_PROTECTION);
+		short_preamble = !(erp & WLAN_ERP_BARKER_PREAMBLE);
+	} else {
+		/* without ERP info: no protection, preamble from capab */
+		short_preamble = (capab & WLAN_CAPABILITY_SHORT_PREAMBLE) != 0;
+	}
+
+	/* the 5 GHz band always uses short slot time */
+	if (sdata->local->hw.conf.channel->band == IEEE80211_BAND_5GHZ)
+		short_slot = true;
+	else
+		short_slot = (capab & WLAN_CAPABILITY_SHORT_SLOT_TIME) != 0;
+
+	if (bss_conf->use_cts_prot != cts_prot) {
+		bss_conf->use_cts_prot = cts_prot;
+		changed |= BSS_CHANGED_ERP_CTS_PROT;
+	}
+
+	if (bss_conf->use_short_preamble != short_preamble) {
+		bss_conf->use_short_preamble = short_preamble;
+		changed |= BSS_CHANGED_ERP_PREAMBLE;
+	}
+
+	if (bss_conf->use_short_slot != short_slot) {
+		bss_conf->use_short_slot = short_slot;
+		changed |= BSS_CHANGED_ERP_SLOT;
+	}
+
+	return changed;
+}
+
+/*
+ * Record @cbss as the associated BSS of @sdata and push the resulting
+ * BSS configuration to the driver.
+ *
+ * @bss_info_changed: BSS_CHANGED_* bits the caller has already gathered;
+ *	further bits are added here before a single
+ *	ieee80211_bss_info_change_notify() call.
+ *
+ * Afterwards powersave and SMPS are recalculated, the TX queues are
+ * started and the carrier is switched on.
+ */
+static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
+				     struct cfg80211_bss *cbss,
+				     u32 bss_info_changed)
+{
+	struct ieee80211_bss *bss = (void *)cbss->priv;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+	bss_info_changed |= BSS_CHANGED_ASSOC;
+	/* set timing information */
+	bss_conf->beacon_int = cbss->beacon_interval;
+	bss_conf->timestamp = cbss->tsf;
+
+	bss_info_changed |= BSS_CHANGED_BEACON_INT;
+	bss_info_changed |= ieee80211_handle_bss_capability(sdata,
+		cbss->capability, bss->has_erp_value, bss->erp_value);
+
+	/* beacon loss timeout: IEEE80211_BEACON_LOSS_COUNT intervals, in jiffies */
+	sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec(
+		IEEE80211_BEACON_LOSS_COUNT * bss_conf->beacon_int));
+
+	sdata->u.mgd.associated = cbss;
+	memcpy(sdata->u.mgd.bssid, cbss->bssid, ETH_ALEN);
+
+	sdata->u.mgd.flags |= IEEE80211_STA_RESET_SIGNAL_AVE;
+
+	/* just to be sure */
+	sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL |
+				IEEE80211_STA_BEACON_POLL);
+
+	ieee80211_led_assoc(local, 1);
+
+	/* the DTIM period is only handed to hardware that asks for it */
+	if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD)
+		bss_conf->dtim_period = bss->dtim_period;
+	else
+		bss_conf->dtim_period = 0;
+
+	bss_conf->assoc = 1;
+	/*
+	 * For now just always ask the driver to update the basic rateset
+	 * when we have associated, we aren't checking whether it actually
+	 * changed or not.
+	 */
+	bss_info_changed |= BSS_CHANGED_BASIC_RATES;
+
+	/* And the BSSID changed - we're associated now */
+	bss_info_changed |= BSS_CHANGED_BSSID;
+
+	/* Tell the driver to monitor connection quality (if supported) */
+	if ((local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI) &&
+	    bss_conf->cqm_rssi_thold)
+		bss_info_changed |= BSS_CHANGED_CQM;
+
+	/* Enable ARP filtering */
+	if (bss_conf->arp_filter_enabled != sdata->arp_filter_state) {
+		bss_conf->arp_filter_enabled = sdata->arp_filter_state;
+		bss_info_changed |= BSS_CHANGED_ARP_FILTER;
+	}
+
+	ieee80211_bss_info_change_notify(sdata, bss_info_changed);
+
+	/* PS and SMPS recalculation is done under iflist_mtx */
+	mutex_lock(&local->iflist_mtx);
+	ieee80211_recalc_ps(local, -1);
+	ieee80211_recalc_smps(local);
+	mutex_unlock(&local->iflist_mtx);
+
+	netif_tx_start_all_queues(sdata->dev);
+	netif_carrier_on(sdata->dev);
+}
+
+/*
+ * Tear down the current association of @sdata.
+ *
+ * The managed-interface mutex is asserted via ASSERT_MGD_MTX(); if the
+ * interface is not associated the function warns and returns without
+ * touching anything.
+ *
+ * @remove_sta: if true, destroy the station entry of the old BSSID after
+ *	the driver has been notified of the change.
+ * @tx: forwarded unchanged to ieee80211_sta_tear_down_BA_sessions();
+ *	presumably selects whether teardown frames are transmitted --
+ *	confirm against that helper.
+ */
+static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
+				   bool remove_sta, bool tx)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+	u32 changed = 0, config_changed = 0;
+	u8 bssid[ETH_ALEN];
+
+	ASSERT_MGD_MTX(ifmgd);
+
+	if (WARN_ON(!ifmgd->associated))
+		return;
+
+	/* keep a private copy: ->associated is cleared right below */
+	memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN);
+
+	ifmgd->associated = NULL;
+	memset(ifmgd->bssid, 0, ETH_ALEN);
+
+	/*
+	 * we need to commit the associated = NULL change because the
+	 * scan code uses that to determine whether this iface should
+	 * go to/wake up from powersave or not -- and could otherwise
+	 * wake the queues erroneously.
+	 */
+	smp_mb();
+
+	/*
+	 * Thus, we can only afterwards stop the queues -- to account
+	 * for the case where another CPU is finishing a scan at this
+	 * time -- we don't want the scan code to enable queues.
+	 */
+
+	netif_tx_stop_all_queues(sdata->dev);
+	netif_carrier_off(sdata->dev);
+
+	/* block new block-ack sessions and tear down the existing ones */
+	mutex_lock(&local->sta_mtx);
+	sta = sta_info_get(sdata, bssid);
+	if (sta) {
+		set_sta_flags(sta, WLAN_STA_BLOCK_BA);
+		ieee80211_sta_tear_down_BA_sessions(sta, tx);
+	}
+	mutex_unlock(&local->sta_mtx);
+
+	changed |= ieee80211_reset_erp_info(sdata);
+
+	ieee80211_led_assoc(local, 0);
+	changed |= BSS_CHANGED_ASSOC;
+	sdata->vif.bss_conf.assoc = false;
+
+	ieee80211_set_wmm_default(sdata);
+
+	/* channel(_type) changes are handled by ieee80211_hw_config */
+	WARN_ON(!ieee80211_set_channel_type(local, sdata, NL80211_CHAN_NO_HT));
+
+	/* on the next assoc, re-program HT parameters */
+	sdata->ht_opmode_valid = false;
+
+	local->power_constr_level = 0;
+
+	/* stop dynamic powersave before clearing the PS configuration */
+	del_timer_sync(&local->dynamic_ps_timer);
+	cancel_work_sync(&local->dynamic_ps_enable_work);
+
+	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+		local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+		config_changed |= IEEE80211_CONF_CHANGE_PS;
+	}
+	local->ps_sdata = NULL;
+
+	ieee80211_hw_config(local, config_changed);
+
+	/* Disable ARP filtering */
+	if (sdata->vif.bss_conf.arp_filter_enabled) {
+		sdata->vif.bss_conf.arp_filter_enabled = false;
+		changed |= BSS_CHANGED_ARP_FILTER;
+	}
+
+	/* The BSSID (not really interesting) and HT changed */
+	changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT;
+	ieee80211_bss_info_change_notify(sdata, changed);
+
+	if (remove_sta)
+		sta_info_destroy_addr(sdata, bssid);
+
+	/* stop the per-interface managed-mode timers */
+	del_timer_sync(&sdata->u.mgd.conn_mon_timer);
+	del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
+	del_timer_sync(&sdata->u.mgd.timer);
+	del_timer_sync(&sdata->u.mgd.chswitch_timer);
+}
+
+/*
+ * Note the reception of a frame from the AP for connection monitoring.
+ */
+void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
+			     struct ieee80211_hdr *hdr)
+{
+	/*
+	 * A unicast frame from the AP shows that the link works in both
+	 * directions, so the connection monitor can be pushed back.
+	 * Multicast frames (which includes beacons) deliberately do not
+	 * count: the timer has to fire during data idle periods so that
+	 * the periodic probe request to our AP gets sent.
+	 */
+	if (!is_multicast_ether_addr(hdr->addr1))
+		ieee80211_sta_reset_conn_monitor(sdata);
+}
+
+/*
+ * End a pending beacon/connection poll: clear both poll flags,
+ * recalculate powersave and, unless the hardware monitors the
+ * connection itself, re-arm the beacon and connection monitors.
+ */
+static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+	struct ieee80211_local *local = sdata->local;
+
+	/* nothing to do unless a poll is actually in progress */
+	if ((mgd->flags & (IEEE80211_STA_CONNECTION_POLL |
+			   IEEE80211_STA_BEACON_POLL)) == 0)
+		return;
+
+	mgd->flags &= ~(IEEE80211_STA_BEACON_POLL |
+			IEEE80211_STA_CONNECTION_POLL);
+
+	mutex_lock(&local->iflist_mtx);
+	ieee80211_recalc_ps(local, -1);
+	mutex_unlock(&local->iflist_mtx);
+
+	if (!(local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)) {
+		/*
+		 * A probe response arrived, but that says nothing about
+		 * beacons or data still to come -- so start both monitor
+		 * timers over, just in case.
+		 */
+		ieee80211_sta_reset_beacon_monitor(sdata);
+
+		mod_timer(&mgd->conn_mon_timer,
+			  round_jiffies_up(jiffies +
+					   IEEE80211_CONNECTION_IDLE_TIME));
+	}
+}
+
+/*
+ * Feed the TX status of a data frame into connection monitoring.
+ * An ACKed frame resets the connection monitor; the status of a
+ * nullfunc frame sent while probing is recorded and the interface work
+ * is queued to evaluate it.
+ */
+void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
+			     struct ieee80211_hdr *hdr, bool ack)
+{
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+
+	if (!ieee80211_is_data(hdr->frame_control))
+		return;
+
+	if (ack)
+		ieee80211_sta_reset_conn_monitor(sdata);
+
+	/* only nullfunc frames sent during an active probe matter below */
+	if (!ieee80211_is_nullfunc(hdr->frame_control) ||
+	    !(mgd->probe_send_count > 0))
+		return;
+
+	if (ack)
+		mgd->probe_send_count = 0;
+	else
+		mgd->nullfunc_failed = true;
+
+	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+}
+
+/*
+ * Send one probe to the associated AP and arm the probe timeout.
+ *
+ * Hardware that reports TX ACK status gets a nullfunc frame, everything
+ * else a probe request carrying the SSID of the associated BSS.  The
+ * caller must make sure the interface is associated: ifmgd->associated
+ * is dereferenced unconditionally.
+ */
+static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	const u8 *ssid;
+	const u8 *ssid_data = NULL;
+	size_t ssid_len = 0;
+	u8 *dst = ifmgd->associated->bssid;
+	u8 unicast_limit = max(1, max_probe_tries - 3);
+
+	/*
+	 * Try sending broadcast probe requests for the last three
+	 * probe requests after the first ones failed since some
+	 * buggy APs only support broadcast probe requests.
+	 */
+	if (ifmgd->probe_send_count >= unicast_limit)
+		dst = NULL;
+
+	/*
+	 * When the hardware reports an accurate Tx ACK status, it's
+	 * better to send a nullfunc frame instead of a probe request,
+	 * as it will kick us off the AP quickly if we aren't associated
+	 * anymore. The timeout will be reset if the frame is ACKed by
+	 * the AP.
+	 */
+	if (sdata->local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+		ifmgd->nullfunc_failed = false;
+		ieee80211_send_nullfunc(sdata->local, sdata, 0);
+	} else {
+		/*
+		 * The BSS entry may lack an SSID element, in which case
+		 * the lookup yields NULL.  Warn once and probe with a
+		 * zero-length SSID rather than dereferencing NULL.
+		 */
+		ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID);
+		if (!WARN_ON_ONCE(!ssid)) {
+			ssid_data = ssid + 2;
+			ssid_len = ssid[1];
+		}
+		ieee80211_send_probe_req(sdata, dst, ssid_data, ssid_len,
+					 NULL, 0);
+	}
+
+	ifmgd->probe_send_count++;
+	ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms);
+	run_again(ifmgd, ifmgd->probe_timeout);
+}
+
+/*
+ * Start probing the AP after a suspected loss of beacons or connection.
+ *
+ * @beacon: true when triggered by beacon loss (records
+ *	IEEE80211_STA_BEACON_POLL), false otherwise (records
+ *	IEEE80211_STA_CONNECTION_POLL).
+ *
+ * Nothing happens while the interface is not running, a scan is in
+ * progress, local->tmp_channel is set, or the interface is not
+ * associated.  If a poll is already in progress only the flag is
+ * recorded; otherwise powersave is recalculated and the first probe is
+ * sent.  Takes ifmgd->mtx itself.
+ */
+static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata,
+				   bool beacon)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	bool already = false;
+
+	if (!ieee80211_sdata_running(sdata))
+		return;
+
+	if (sdata->local->scanning)
+		return;
+
+	if (sdata->local->tmp_channel)
+		return;
+
+	mutex_lock(&ifmgd->mtx);
+
+	if (!ifmgd->associated)
+		goto out;
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	if (beacon && net_ratelimit())
+		printk(KERN_DEBUG "%s: detected beacon loss from AP "
+		       "- sending probe request\n", sdata->name);
+#endif
+
+	/*
+	 * The driver/our work has already reported this event or the
+	 * connection monitoring has kicked in and we have already sent
+	 * a probe request. Or maybe the AP died and the driver keeps
+	 * reporting until we disassociate...
+	 *
+	 * In either case we have to ignore the current call to this
+	 * function (except for setting the correct probe reason bit)
+	 * because otherwise we would reset the timer every time and
+	 * never check whether we received a probe response!
+	 */
+	if (ifmgd->flags & (IEEE80211_STA_BEACON_POLL |
+			    IEEE80211_STA_CONNECTION_POLL))
+		already = true;
+
+	if (beacon)
+		ifmgd->flags |= IEEE80211_STA_BEACON_POLL;
+	else
+		ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL;
+
+	if (already)
+		goto out;
+
+	/* the poll flags changed, so powersave has to be re-evaluated */
+	mutex_lock(&sdata->local->iflist_mtx);
+	ieee80211_recalc_ps(sdata->local, -1);
+	mutex_unlock(&sdata->local->iflist_mtx);
+
+	/* first probe of a new poll cycle */
+	ifmgd->probe_send_count = 0;
+	ieee80211_mgd_probe_ap_send(sdata);
+ out:
+	mutex_unlock(&ifmgd->mtx);
+}
+
+/*
+ * ieee80211_ap_probereq_get - build a probe request for the associated AP
+ * @hw: hardware the interface belongs to (not used here)
+ * @vif: interface, must be of type NL80211_IFTYPE_STATION
+ *
+ * Returns the skb produced by ieee80211_build_probe_req() for the
+ * associated BSSID and its SSID, or NULL if @vif is not a station
+ * interface (which also warns) or is not associated.  The
+ * managed-interface mutex is asserted via ASSERT_MGD_MTX().
+ */
+struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
+					  struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct sk_buff *skb;
+	const u8 *ssid;
+	const u8 *ssid_data = NULL;
+	size_t ssid_len = 0;
+
+	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
+		return NULL;
+
+	ASSERT_MGD_MTX(ifmgd);
+
+	if (!ifmgd->associated)
+		return NULL;
+
+	/*
+	 * The BSS entry may lack an SSID element, in which case the
+	 * lookup yields NULL.  Warn once and build the request with a
+	 * zero-length SSID rather than dereferencing NULL.
+	 */
+	ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID);
+	if (!WARN_ON_ONCE(!ssid)) {
+		ssid_data = ssid + 2;
+		ssid_len = ssid[1];
+	}
+
+	skb = ieee80211_build_probe_req(sdata, ifmgd->associated->bssid,
+					ssid_data, ssid_len, NULL, 0);
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_ap_probereq_get);
+
+/*
+ * Handle a connection loss: if still associated, disassociate locally,
+ * recalculate the idle state and call ieee80211_send_deauth_disassoc()
+ * for the old BSSID.  Takes ifmgd->mtx and local->mtx itself.
+ */
+static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+	u8 ap_addr[ETH_ALEN];
+	bool was_associated;
+
+	mutex_lock(&mgd->mtx);
+	was_associated = mgd->associated != NULL;
+	if (was_associated) {
+		memcpy(ap_addr, mgd->associated->bssid, ETH_ALEN);
+
+		printk(KERN_DEBUG "%s: Connection to AP %pM lost.\n",
+		       sdata->name, ap_addr);
+
+		ieee80211_set_disassoc(sdata, true, true);
+	}
+	mutex_unlock(&mgd->mtx);
+
+	if (!was_associated)
+		return;
+
+	mutex_lock(&local->mtx);
+	ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+
+	/*
+	 * cfg80211 requires this to happen without the lock held,
+	 * which is not a problem here.
+	 */
+	ieee80211_send_deauth_disassoc(sdata, ap_addr,
+				       IEEE80211_STYPE_DEAUTH,
+				       WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
+				       NULL, true);
+}
+
+/*
+ * Work handler shared by beacon loss and connection loss reports.
+ * Hardware flagged IEEE80211_HW_CONNECTION_MONITOR is treated as having
+ * lost the connection outright; for everything else the AP is probed.
+ */
+void ieee80211_beacon_connection_loss_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	sdata = container_of(work, struct ieee80211_sub_if_data,
+			     u.mgd.beacon_connection_loss_work);
+
+	if (!(sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)) {
+		ieee80211_mgd_probe_ap(sdata, true);
+		return;
+	}
+
+	__ieee80211_connection_loss(sdata);
+}
+
+/*
+ * ieee80211_beacon_loss - report beacon loss for @vif
+ *
+ * Traces the call and queues the beacon/connection loss work.  Warns if
+ * the hardware is flagged IEEE80211_HW_CONNECTION_MONITOR.
+ */
+void ieee80211_beacon_loss(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+
+	trace_api_beacon_loss(sdata);
+
+	WARN_ON(local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR);
+	ieee80211_queue_work(&local->hw,
+			     &sdata->u.mgd.beacon_connection_loss_work);
+}
+EXPORT_SYMBOL(ieee80211_beacon_loss);
+
+/*
+ * ieee80211_connection_loss - report connection loss for @vif
+ *
+ * Traces the call and queues the beacon/connection loss work.  Warns if
+ * the hardware is not flagged IEEE80211_HW_CONNECTION_MONITOR.
+ */
+void ieee80211_connection_loss(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+
+	trace_api_connection_loss(sdata);
+
+	WARN_ON(!(local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR));
+	ieee80211_queue_work(&local->hw,
+			     &sdata->u.mgd.beacon_connection_loss_work);
+}
+EXPORT_SYMBOL(ieee80211_connection_loss);
+
+
+/*
+ * Handle a received Deauthentication frame.
+ *
+ * Frames shorter than the 24-byte management header plus the 2-byte
+ * reason code are ignored.  A frame arriving while not associated warns
+ * and is ignored as well, matching the disassociation handler, instead
+ * of dereferencing a NULL ifmgd->associated.  Otherwise the association
+ * is torn down and the idle state recalculated.
+ *
+ * Returns RX_MGMT_CFG80211_DEAUTH when the frame was acted upon,
+ * RX_MGMT_NONE otherwise.
+ */
+static enum rx_mgmt_action __must_check
+ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
+			 struct ieee80211_mgmt *mgmt, size_t len)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	const u8 *bssid;
+	u16 reason_code;
+
+	if (len < 24 + 2)
+		return RX_MGMT_NONE;
+
+	ASSERT_MGD_MTX(ifmgd);
+
+	if (WARN_ON(!ifmgd->associated))
+		return RX_MGMT_NONE;
+
+	bssid = ifmgd->associated->bssid;
+
+	reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
+
+	/* log before disassociating: that clears ifmgd->associated */
+	printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n",
+			sdata->name, bssid, reason_code);
+
+	ieee80211_set_disassoc(sdata, true, false);
+	mutex_lock(&sdata->local->mtx);
+	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
+
+	return RX_MGMT_CFG80211_DEAUTH;
+}
+
+
+/*
+ * Handle a received Disassociation frame.
+ *
+ * Frames shorter than the 24-byte management header plus the 2-byte
+ * reason code are ignored, as are frames arriving while not associated
+ * or whose source address differs from the associated BSSID (both of
+ * which also warn).  Otherwise the association is torn down and the
+ * idle state recalculated.
+ *
+ * Returns RX_MGMT_CFG80211_DISASSOC when the frame was acted upon,
+ * RX_MGMT_NONE otherwise.
+ */
+static enum rx_mgmt_action __must_check
+ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
+			   struct ieee80211_mgmt *mgmt, size_t len)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	u16 reason_code;
+
+	if (len < 24 + 2)
+		return RX_MGMT_NONE;
+
+	ASSERT_MGD_MTX(ifmgd);
+
+	if (WARN_ON(!ifmgd->associated))
+		return RX_MGMT_NONE;
+
+	/* the frame must come from the AP we are associated with */
+	if (WARN_ON(memcmp(ifmgd->associated->bssid, mgmt->sa, ETH_ALEN)))
+		return RX_MGMT_NONE;
+
+	reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
+
+	printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n",
+			sdata->name, mgmt->sa, reason_code);
+
+	ieee80211_set_disassoc(sdata, true, false);
+	mutex_lock(&sdata->local->mtx);
+	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
+	return RX_MGMT_CFG80211_DISASSOC;
+}
+
+
+/*
+ * Process a successful (Re)Association Response for the work item @wk.
+ *
+ * Sanitises the AID, parses the response's information elements,
+ * creates and inserts a station entry for the AP with the advertised
+ * rates and HT capabilities, applies the WMM and HT settings and finally
+ * marks the interface associated via ieee80211_set_associated().
+ *
+ * @mgmt/@len: the received response frame and its length.
+ *
+ * Returns true on success; false if the response carries no Supported
+ * Rates element or the station entry could not be allocated or inserted.
+ */
+static bool ieee80211_assoc_success(struct ieee80211_work *wk,
+				    struct ieee80211_mgmt *mgmt, size_t len)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_supported_band *sband;
+	struct sta_info *sta;
+	struct cfg80211_bss *cbss = wk->assoc.bss;
+	u8 *pos;
+	u32 rates, basic_rates;
+	u16 capab_info, aid;
+	struct ieee802_11_elems elems;
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	u32 changed = 0;
+	int i, j, err;
+	bool have_higher_than_11mbit = false;
+	u16 ap_ht_cap_flags;
+
+	/* AssocResp and ReassocResp have identical structure */
+
+	aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
+	capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+
+	/* bits 15:14 are expected to be set; complain if not, then strip them */
+	if ((aid & (BIT(15) | BIT(14))) != (BIT(15) | BIT(14)))
+		printk(KERN_DEBUG
+		       "%s: invalid AID value 0x%x; bits 15:14 not set\n",
+		       sdata->name, aid);
+	aid &= ~(BIT(15) | BIT(14));
+
+	ifmgd->broken_ap = false;
+
+	/* an out-of-range AID is replaced by 0 and the AP marked broken */
+	if (aid == 0 || aid > IEEE80211_MAX_AID) {
+		printk(KERN_DEBUG
+		       "%s: invalid AID value %d (out of range), turn off PS\n",
+		       sdata->name, aid);
+		aid = 0;
+		ifmgd->broken_ap = true;
+	}
+
+	pos = mgmt->u.assoc_resp.variable;
+	ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems);
+
+	if (!elems.supp_rates) {
+		printk(KERN_DEBUG "%s: no SuppRates element in AssocResp\n",
+		       sdata->name);
+		return false;
+	}
+
+	ifmgd->aid = aid;
+
+	sta = sta_info_alloc(sdata, cbss->bssid, GFP_KERNEL);
+	if (!sta) {
+		printk(KERN_DEBUG "%s: failed to alloc STA entry for"
+		       " the AP\n", sdata->name);
+		return false;
+	}
+
+	set_sta_flags(sta, WLAN_STA_AUTH | WLAN_STA_ASSOC |
+			   WLAN_STA_ASSOC_AP);
+	/* without port control the station is authorized right away */
+	if (!(ifmgd->flags & IEEE80211_STA_CONTROL_PORT))
+		set_sta_flags(sta, WLAN_STA_AUTHORIZED);
+
+	rates = 0;
+	basic_rates = 0;
+	sband = local->hw.wiphy->bands[wk->chan->band];
+
+	/*
+	 * Map every advertised rate onto the band's bitrate table: the low
+	 * seven bits times five are compared against ->bitrate, the top bit
+	 * marks a basic rate.
+	 */
+	for (i = 0; i < elems.supp_rates_len; i++) {
+		int rate = (elems.supp_rates[i] & 0x7f) * 5;
+		bool is_basic = !!(elems.supp_rates[i] & 0x80);
+
+		if (rate > 110)
+			have_higher_than_11mbit = true;
+
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].bitrate == rate) {
+				rates |= BIT(j);
+				if (is_basic)
+					basic_rates |= BIT(j);
+				break;
+			}
+		}
+	}
+
+	/* same mapping for the Extended Supported Rates element */
+	for (i = 0; i < elems.ext_supp_rates_len; i++) {
+		int rate = (elems.ext_supp_rates[i] & 0x7f) * 5;
+		bool is_basic = !!(elems.ext_supp_rates[i] & 0x80);
+
+		if (rate > 110)
+			have_higher_than_11mbit = true;
+
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].bitrate == rate) {
+				rates |= BIT(j);
+				if (is_basic)
+					basic_rates |= BIT(j);
+				break;
+			}
+		}
+	}
+
+	sta->sta.supp_rates[wk->chan->band] = rates;
+	sdata->vif.bss_conf.basic_rates = basic_rates;
+
+	/* cf. IEEE 802.11 9.2.12 */
+	if (wk->chan->band == IEEE80211_BAND_2GHZ &&
+	    have_higher_than_11mbit)
+		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+	else
+		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
+	if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_11N))
+		ieee80211_ht_cap_ie_to_sta_ht_cap(sband,
+				elems.ht_cap_elem, &sta->sta.ht_cap);
+
+	/* saved now because sta is not touched again after insertion */
+	ap_ht_cap_flags = sta->sta.ht_cap.cap;
+
+	rate_control_rate_init(sta);
+
+	if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED)
+		set_sta_flags(sta, WLAN_STA_MFP);
+
+	if (elems.wmm_param)
+		set_sta_flags(sta, WLAN_STA_WME);
+
+	/*
+	 * NOTE(review): the pointer is dropped whatever the result --
+	 * presumably sta_info_insert() takes over the entry even on
+	 * failure; confirm against that helper.
+	 */
+	err = sta_info_insert(sta);
+	sta = NULL;
+	if (err) {
+		printk(KERN_DEBUG "%s: failed to insert STA entry for"
+		       " the AP (error %d)\n", sdata->name, err);
+		return false;
+	}
+
+	/*
+	 * Always handle WMM once after association regardless
+	 * of the first value the AP uses. Setting -1 here has
+	 * that effect because the AP values is an unsigned
+	 * 4-bit value.
+	 */
+	ifmgd->wmm_last_param_set = -1;
+
+	if (elems.wmm_param)
+		ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
+					 elems.wmm_param_len);
+	else
+		ieee80211_set_wmm_default(sdata);
+
+	local->oper_channel = wk->chan;
+
+	/* HT is only enabled together with WMM and at least four queues */
+	if (elems.ht_info_elem && elems.wmm_param &&
+	    (sdata->local->hw.queues >= 4) &&
+	    !(ifmgd->flags & IEEE80211_STA_DISABLE_11N))
+		changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem,
+					       cbss->bssid, ap_ht_cap_flags);
+
+	/* set AID and assoc capability,
+	 * ieee80211_set_associated() will tell the driver */
+	bss_conf->aid = aid;
+	bss_conf->assoc_capability = capab_info;
+	ieee80211_set_associated(sdata, cbss, changed);
+
+	/*
+	 * If we're using 4-addr mode, let the AP know that we're
+	 * doing so, so that it can create the STA VLAN on its side
+	 */
+	if (ifmgd->use_4addr)
+		ieee80211_send_4addr_nullfunc(local, sdata);
+
+	/*
+	 * Start timer to probe the connection to the AP now.
+	 * Also start the timer that will detect beacon loss.
+	 */
+	ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
+	ieee80211_sta_reset_beacon_monitor(sdata);
+
+	return true;
+}
+
+
+/*
+ * Feed a received beacon or probe response into the BSS list and react
+ * to it if it concerns the BSS we are associated with.
+ *
+ * @mgmt/@len: the frame and its length
+ * @rx_status: receive status (band, frequency, mactime)
+ * @elems: the frame's already parsed information elements
+ * @beacon: true for a beacon, false for a probe response
+ *
+ * Frames on unknown or disabled channels are dropped.  While
+ * associated, powersave is recalculated if the associated BSS had no
+ * DTIM period before this frame, and a channel switch announcement
+ * from the associated BSSID is passed on for processing.
+ */
+static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
+				  struct ieee80211_mgmt *mgmt,
+				  size_t len,
+				  struct ieee80211_rx_status *rx_status,
+				  struct ieee802_11_elems *elems,
+				  bool beacon)
+{
+	struct ieee80211_local *local = sdata->local;
+	int freq;
+	struct ieee80211_bss *bss;
+	struct ieee80211_channel *channel;
+	bool need_ps = false;
+
+	if (sdata->u.mgd.associated) {
+		bss = (void *)sdata->u.mgd.associated->priv;
+		/* not previously set so we may need to recalc */
+		need_ps = !bss->dtim_period;
+	}
+
+	/* prefer the channel named in the frame over the one it arrived on */
+	if (elems->ds_params && elems->ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems->ds_params[0],
+						      rx_status->band);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return;
+
+	bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
+					channel, beacon);
+	if (bss)
+		ieee80211_rx_bss_put(local, bss);
+
+	if (!sdata->u.mgd.associated)
+		return;
+
+	if (need_ps) {
+		mutex_lock(&local->iflist_mtx);
+		ieee80211_recalc_ps(local, -1);
+		mutex_unlock(&local->iflist_mtx);
+	}
+
+	if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) &&
+	    (memcmp(mgmt->bssid, sdata->u.mgd.associated->bssid,
+							ETH_ALEN) == 0)) {
+		struct ieee80211_channel_sw_ie *sw_elem =
+			(struct ieee80211_channel_sw_ie *)elems->ch_switch_elem;
+		/*
+		 * Hand over the associated BSS rather than the entry
+		 * returned by the update above: that one may be NULL and
+		 * its reference has already been put.
+		 */
+		ieee80211_sta_process_chanswitch(sdata, sw_elem,
+				(void *)sdata->u.mgd.associated->priv,
+				rx_status->mactime);
+	}
+}
+
+
+/*
+ * Handle a probe response addressed to this interface: update the BSS
+ * information from it and, if it comes from the associated AP, end any
+ * pending AP probe.
+ */
+static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
+					 struct sk_buff *skb)
+{
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+	struct ieee80211_mgmt *frame = (void *)skb->data;
+	struct ieee80211_rx_status *status = (void *) skb->cb;
+	struct ieee802_11_elems parsed;
+	size_t frame_len = skb->len;
+	size_t hdr_len;
+
+	ASSERT_MGD_MTX(mgd);
+
+	/* responses meant for somebody else are of no interest */
+	if (memcmp(frame->da, sdata->vif.addr, ETH_ALEN) != 0)
+		return;
+
+	/* the fixed part of the frame has to be complete */
+	hdr_len = (u8 *) frame->u.probe_resp.variable - (u8 *) frame;
+	if (hdr_len > frame_len)
+		return;
+
+	ieee802_11_parse_elems(frame->u.probe_resp.variable,
+			       frame_len - hdr_len, &parsed);
+
+	ieee80211_rx_bss_info(sdata, frame, frame_len, status, &parsed, false);
+
+	if (!mgd->associated)
+		return;
+
+	if (memcmp(frame->bssid, mgd->associated->bssid, ETH_ALEN) != 0)
+		return;
+
+	ieee80211_reset_ap_probe(sdata);
+}
+
+/*
+ * This is the canonical list of information elements we care about,
+ * expressed as a bitmask with one bit per element ID.  The filter code
+ * also gives us all changes to the Microsoft OUI (00:50:F2) vendor IE,
+ * which is used for WMM and which we need to track.
+ *
+ * We implement beacon filtering in software since that means we can
+ * avoid processing the frame here and in cfg80211, and userspace
+ * will not be able to tell whether the hardware supports it or not.
+ *
+ * XXX: This list needs to be dynamic -- userspace needs to be able to
+ *	add the items it requires. It also needs to be able to tell us to
+ *	look out for other vendor IEs.
+ */
+static const u64 care_about_ies =
+	(1ULL << WLAN_EID_COUNTRY) |
+	(1ULL << WLAN_EID_ERP_INFO) |
+	(1ULL << WLAN_EID_CHANNEL_SWITCH) |
+	(1ULL << WLAN_EID_PWR_CONSTRAINT) |
+	(1ULL << WLAN_EID_HT_CAPABILITY) |
+	(1ULL << WLAN_EID_HT_INFORMATION);
+
+/*
+ * Handle a beacon received while in managed mode.
+ *
+ * Only beacons from the currently associated BSS that were received on
+ * the configured channel are processed.  For those the function updates
+ * the running signal average (raising CQM RSSI events in software when
+ * the hardware does not support them), cancels a pending beacon poll,
+ * re-arms the beacon loss monitor and reacts to a directed TIM when the
+ * stack handles powersave nullfunc frames.  The remaining element
+ * processing (ERP, HT, power constraint) is skipped when the CRC over
+ * the elements of interest equals that of the previous beacon.
+ */
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len,
+				     struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	size_t baselen;
+	struct ieee802_11_elems elems;
+	struct ieee80211_local *local = sdata->local;
+	u32 changed = 0;
+	bool erp_valid, directed_tim = false;
+	u8 erp_value = 0;
+	u32 ncrc;
+	u8 *bssid;
+
+	ASSERT_MGD_MTX(ifmgd);
+
+	/* Process beacon from the current BSS */
+	baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
+	if (baselen > len)
+		return;
+
+	/* ignore beacons that were not received on the configured channel */
+	if (rx_status->freq != local->hw.conf.channel->center_freq)
+		return;
+
+	/*
+	 * We might have received a number of frames, among them a
+	 * disassoc frame and a beacon...
+	 */
+	if (!ifmgd->associated)
+		return;
+
+	bssid = ifmgd->associated->bssid;
+
+	/*
+	 * And in theory even frames from a different AP we were just
+	 * associated to a split-second ago!
+	 */
+	if (memcmp(bssid, mgmt->bssid, ETH_ALEN) != 0)
+		return;
+
+	/* Track average RSSI from the Beacon frames of the current AP */
+	ifmgd->last_beacon_signal = rx_status->signal;
+	if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
+		/* first beacon after a reset: seed the average with it */
+		ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
+		ifmgd->ave_beacon_signal = rx_status->signal * 16;
+		ifmgd->last_cqm_event_signal = 0;
+		ifmgd->count_beacon_signal = 1;
+	} else {
+		/* weighted running average, kept scaled by 16 */
+		ifmgd->ave_beacon_signal =
+			(IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 +
+			 (16 - IEEE80211_SIGNAL_AVE_WEIGHT) *
+			 ifmgd->ave_beacon_signal) / 16;
+		ifmgd->count_beacon_signal++;
+	}
+	/*
+	 * Software CQM: once enough beacons were averaged and the hardware
+	 * does not support CQM RSSI itself, report threshold crossings.
+	 * A new event is only raised when no event was sent yet or the
+	 * signal moved past the last reported value by the hysteresis.
+	 */
+	if (bss_conf->cqm_rssi_thold &&
+	    ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT &&
+	    !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) {
+		int sig = ifmgd->ave_beacon_signal / 16;
+		int last_event = ifmgd->last_cqm_event_signal;
+		int thold = bss_conf->cqm_rssi_thold;
+		int hyst = bss_conf->cqm_rssi_hyst;
+		if (sig < thold &&
+		    (last_event == 0 || sig < last_event - hyst)) {
+			ifmgd->last_cqm_event_signal = sig;
+			ieee80211_cqm_rssi_notify(
+				&sdata->vif,
+				NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW,
+				GFP_KERNEL);
+		} else if (sig > thold &&
+			   (last_event == 0 || sig > last_event + hyst)) {
+			ifmgd->last_cqm_event_signal = sig;
+			ieee80211_cqm_rssi_notify(
+				&sdata->vif,
+				NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH,
+				GFP_KERNEL);
+		}
+	}
+
+	/* a beacon from our AP answers a pending beacon poll */
+	if (ifmgd->flags & IEEE80211_STA_BEACON_POLL) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "%s: cancelling probereq poll due "
+			       "to a received beacon\n", sdata->name);
+		}
+#endif
+		ifmgd->flags &= ~IEEE80211_STA_BEACON_POLL;
+		mutex_lock(&local->iflist_mtx);
+		ieee80211_recalc_ps(local, -1);
+		mutex_unlock(&local->iflist_mtx);
+	}
+
+	/*
+	 * Push the beacon loss detection into the future since
+	 * we are processing a beacon from the AP just now.
+	 */
+	ieee80211_sta_reset_beacon_monitor(sdata);
+
+	/*
+	 * CRC over four bytes starting at beacon_int plus the elements
+	 * selected by care_about_ies; used to spot unchanged beacons.
+	 */
+	ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
+	ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable,
+					  len - baselen, &elems,
+					  care_about_ies, ncrc);
+
+	if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)
+		directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len,
+						   ifmgd->aid);
+
+	if (ncrc != ifmgd->beacon_crc || !ifmgd->beacon_crc_valid) {
+		ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems,
+				      true);
+
+		ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
+					 elems.wmm_param_len);
+	}
+
+	/* the TIM is handled for every beacon, changed or not */
+	if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
+		if (directed_tim) {
+			if (local->hw.conf.dynamic_ps_timeout > 0) {
+				local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+				ieee80211_hw_config(local,
+						    IEEE80211_CONF_CHANGE_PS);
+				ieee80211_send_nullfunc(local, sdata, 0);
+			} else {
+				local->pspolling = true;
+
+				/*
+				 * Here is assumed that the driver will be
+				 * able to send ps-poll frame and receive a
+				 * response even though power save mode is
+				 * enabled, but some drivers might require
+				 * to disable power save here. This needs
+				 * to be investigated.
+				 */
+				ieee80211_send_pspoll(local, sdata);
+			}
+		}
+	}
+
+	/* nothing we care about changed since the last beacon */
+	if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid)
+		return;
+	ifmgd->beacon_crc = ncrc;
+	ifmgd->beacon_crc_valid = true;
+
+	if (elems.erp_info && elems.erp_info_len >= 1) {
+		erp_valid = true;
+		erp_value = elems.erp_info[0];
+	} else {
+		erp_valid = false;
+	}
+	changed |= ieee80211_handle_bss_capability(sdata,
+			le16_to_cpu(mgmt->u.beacon.capab_info),
+			erp_valid, erp_value);
+
+
+	if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
+	    !(ifmgd->flags & IEEE80211_STA_DISABLE_11N)) {
+		struct sta_info *sta;
+		struct ieee80211_supported_band *sband;
+		u16 ap_ht_cap_flags;
+
+		rcu_read_lock();
+
+		sta = sta_info_get(sdata, bssid);
+		/*
+		 * NOTE(review): this early return skips the change
+		 * notification at the end although bss_conf may already
+		 * have been updated above -- confirm this is intended.
+		 */
+		if (WARN_ON(!sta)) {
+			rcu_read_unlock();
+			return;
+		}
+
+		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+		ieee80211_ht_cap_ie_to_sta_ht_cap(sband,
+				elems.ht_cap_elem, &sta->sta.ht_cap);
+
+		/* copy out what is needed before leaving the RCU section */
+		ap_ht_cap_flags = sta->sta.ht_cap.cap;
+
+		rcu_read_unlock();
+
+		changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem,
+					       bssid, ap_ht_cap_flags);
+	}
+
+	/* Note: country IE parsing is done for us by cfg80211 */
+	if (elems.country_elem) {
+		/* TODO: IBSS also needs this */
+		/*
+		 * NOTE(review): capab_info is read through u.probe_resp
+		 * here while the rest of this function uses u.beacon --
+		 * presumably both layouts agree; confirm.
+		 */
+		if (elems.pwr_constr_elem)
+			ieee80211_handle_pwr_constr(sdata,
+				le16_to_cpu(mgmt->u.probe_resp.capab_info),
+				elems.pwr_constr_elem,
+				elems.pwr_constr_elem_len);
+	}
+
+	ieee80211_bss_info_change_notify(sdata, changed);
+}
+
+/*
+ * Process a management frame that was queued for this managed interface.
+ *
+ * Frames whose BSSID matches the associated BSS are dispatched by
+ * subtype under ifmgd->mtx; a resulting deauthentication or
+ * disassociation is reported to cfg80211 only after the mutex has been
+ * released.  For all other frames, a sufficiently long Deauthentication
+ * frame removes a matching pending association work item (if any) and
+ * is then reported to cfg80211.
+ */
+void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				  struct sk_buff *skb)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_rx_status *rx_status;
+	struct ieee80211_mgmt *mgmt;
+	enum rx_mgmt_action rma = RX_MGMT_NONE;
+	u16 fc;
+
+	rx_status = (struct ieee80211_rx_status *) skb->cb;
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+	fc = le16_to_cpu(mgmt->frame_control);
+
+	mutex_lock(&ifmgd->mtx);
+
+	/* frames from the AP we are associated with */
+	if (ifmgd->associated &&
+	    memcmp(ifmgd->associated->bssid, mgmt->bssid, ETH_ALEN) == 0) {
+		switch (fc & IEEE80211_FCTL_STYPE) {
+		case IEEE80211_STYPE_BEACON:
+			ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
+						 rx_status);
+			break;
+		case IEEE80211_STYPE_PROBE_RESP:
+			ieee80211_rx_mgmt_probe_resp(sdata, skb);
+			break;
+		case IEEE80211_STYPE_DEAUTH:
+			rma = ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len);
+			break;
+		case IEEE80211_STYPE_DISASSOC:
+			rma = ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len);
+			break;
+		case IEEE80211_STYPE_ACTION:
+			/*
+			 * NOTE(review): no length check is visible here
+			 * before the action fields are read -- presumably
+			 * done before the frame was queued; confirm.
+			 */
+			switch (mgmt->u.action.category) {
+			case WLAN_CATEGORY_SPECTRUM_MGMT:
+				ieee80211_sta_process_chanswitch(sdata,
+						&mgmt->u.action.u.chan_switch.sw_elem,
+						(void *)ifmgd->associated->priv,
+						rx_status->mactime);
+				break;
+			}
+		}
+		mutex_unlock(&ifmgd->mtx);
+
+		/* cfg80211 is told with the mutex already released */
+		switch (rma) {
+		case RX_MGMT_NONE:
+			/* no action */
+			break;
+		case RX_MGMT_CFG80211_DEAUTH:
+			cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len);
+			break;
+		case RX_MGMT_CFG80211_DISASSOC:
+			cfg80211_send_disassoc(sdata->dev, (u8 *)mgmt, skb->len);
+			break;
+		default:
+			WARN(1, "unexpected: %d", rma);
+		}
+		return;
+	}
+
+	mutex_unlock(&ifmgd->mtx);
+
+	/* not (or no longer) associated with the sender */
+	if (skb->len >= 24 + 2 /* mgmt + deauth reason */ &&
+	    (fc & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_DEAUTH) {
+		struct ieee80211_local *local = sdata->local;
+		struct ieee80211_work *wk;
+
+		/* look for an association attempt with the sender */
+		mutex_lock(&local->mtx);
+		list_for_each_entry(wk, &local->work_list, list) {
+			if (wk->sdata != sdata)
+				continue;
+
+			if (wk->type != IEEE80211_WORK_ASSOC &&
+			    wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT)
+				continue;
+
+			if (memcmp(mgmt->bssid, wk->filter_ta, ETH_ALEN))
+				continue;
+			if (memcmp(mgmt->sa, wk->filter_ta, ETH_ALEN))
+				continue;
+
+			/*
+			 * Printing the message only here means we can't
+			 * spuriously print it, but it also means that it
+			 * won't be printed when the frame comes in before
+			 * we even tried to associate or in similar cases.
+			 *
+			 * Ultimately, I suspect cfg80211 should print the
+			 * messages instead.
+			 */
+			printk(KERN_DEBUG
+			       "%s: deauthenticated from %pM (Reason: %u)\n",
+			       sdata->name, mgmt->bssid,
+			       le16_to_cpu(mgmt->u.deauth.reason_code));
+
+			/* the loop stops right after the removal */
+			list_del_rcu(&wk->list);
+			free_work(wk);
+			break;
+		}
+		mutex_unlock(&local->mtx);
+
+		/* reported whether or not a work item matched */
+		cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len);
+	}
+}
+
+/*
+ * Timer callback for the managed interface: queue the interface work,
+ * or -- while local->quiescing is set -- only record in timers_running
+ * that the timer fired.
+ */
+static void ieee80211_sta_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_local *local;
+
+	sdata = (struct ieee80211_sub_if_data *) data;
+	local = sdata->local;
+
+	if (!local->quiescing) {
+		ieee80211_queue_work(&local->hw, &sdata->work);
+		return;
+	}
+
+	set_bit(TMR_RUNNING_TIMER, &sdata->u.mgd.timers_running);
+}
+
+/*
+ * Give up on the AP @bssid after probing it failed.
+ *
+ * Expects ifmgd->mtx to be held by the caller: it is unlocked and
+ * re-taken below.  Clears the poll flags, disassociates locally,
+ * recalculates the idle state and calls
+ * ieee80211_send_deauth_disassoc() with reason
+ * WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY.  Because the mutex is dropped
+ * in between, callers must not rely on managed-mode state they read
+ * before the call.
+ */
+static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata,
+					  u8 *bssid)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+	ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL |
+			  IEEE80211_STA_BEACON_POLL);
+
+	ieee80211_set_disassoc(sdata, true, true);
+	/* drop the mutex for the steps that must run without it */
+	mutex_unlock(&ifmgd->mtx);
+	mutex_lock(&local->mtx);
+	ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+	/*
+	 * must be outside lock due to cfg80211,
+	 * but that's not a problem.
+	 */
+	ieee80211_send_deauth_disassoc(sdata, bssid,
+			IEEE80211_STYPE_DEAUTH,
+			WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
+			NULL, true);
+	/* hand the mutex back to the caller in the state it expects */
+	mutex_lock(&ifmgd->mtx);
+}
+
+/*
+ * Managed-mode part of the interface work: advance the AP probing state
+ * while a beacon or connection poll is pending on an associated
+ * interface.
+ *
+ * Depending on the probe state this ends the poll (probe_send_count is
+ * zero, i.e. the nullfunc probe was ACKed), re-arms the timer until the
+ * probe timeout expires, sends another probe, or declares the
+ * connection lost once the retry limit (max_nullfunc_tries with TX ACK
+ * status reporting, max_probe_tries otherwise) is reached.
+ */
+void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+	/* then process the rest of the work */
+	mutex_lock(&ifmgd->mtx);
+
+	if (ifmgd->flags & (IEEE80211_STA_BEACON_POLL |
+			    IEEE80211_STA_CONNECTION_POLL) &&
+	    ifmgd->associated) {
+		u8 bssid[ETH_ALEN];
+		int max_tries;
+
+		/* private copy: the association may be torn down below */
+		memcpy(bssid, ifmgd->associated->bssid, ETH_ALEN);
+
+		if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS)
+			max_tries = max_nullfunc_tries;
+		else
+			max_tries = max_probe_tries;
+
+		/* ACK received for nullfunc probing frame */
+		if (!ifmgd->probe_send_count)
+			ieee80211_reset_ap_probe(sdata);
+		else if (ifmgd->nullfunc_failed) {
+			/* the nullfunc probe was not ACKed: retry or give up */
+			if (ifmgd->probe_send_count < max_tries) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+				wiphy_debug(local->hw.wiphy,
+					    "%s: No ack for nullfunc frame to"
+					    " AP %pM, try %d/%i\n",
+					    sdata->name, bssid,
+					    ifmgd->probe_send_count, max_tries);
+#endif
+				ieee80211_mgd_probe_ap_send(sdata);
+			} else {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+				wiphy_debug(local->hw.wiphy,
+					    "%s: No ack for nullfunc frame to"
+					    " AP %pM, disconnecting.\n",
+					    sdata->name, bssid);
+#endif
+				ieee80211_sta_connection_lost(sdata, bssid);
+			}
+		} else if (time_is_after_jiffies(ifmgd->probe_timeout))
+			/* probe still outstanding: check again at the timeout */
+			run_again(ifmgd, ifmgd->probe_timeout);
+		else if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+			/* timeout expired without any TX status for the nullfunc */
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+			wiphy_debug(local->hw.wiphy,
+				    "%s: Failed to send nullfunc to AP %pM"
+				    " after %dms, disconnecting.\n",
+				    sdata->name,
+				    bssid, probe_wait_ms);
+#endif
+			ieee80211_sta_connection_lost(sdata, bssid);
+		} else if (ifmgd->probe_send_count < max_tries) {
+			/* probe request timed out, tries left: send another */
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+			wiphy_debug(local->hw.wiphy,
+				    "%s: No probe response from AP %pM"
+				    " after %dms, try %d/%i\n",
+				    sdata->name,
+				    bssid, probe_wait_ms,
+				    ifmgd->probe_send_count, max_tries);
+#endif
+			ieee80211_mgd_probe_ap_send(sdata);
+		} else {
+			/*
+			 * We actually lost the connection ... or did we?
+			 * Let's make sure!
+			 */
+			wiphy_debug(local->hw.wiphy,
+				    "%s: No probe response from AP %pM"
+				    " after %dms, disconnecting.\n",
+				    sdata->name,
+				    bssid, probe_wait_ms);
+
+			ieee80211_sta_connection_lost(sdata, bssid);
+		}
+	}
+
+	mutex_unlock(&ifmgd->mtx);
+}
+
+/*
+ * Beacon monitor timer callback. @data carries the sub-interface pointer.
+ * Queues beacon_connection_loss_work unless local->quiescing is set.
+ */
+static void ieee80211_sta_bcn_mon_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata = (void *) data;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+	if (!sdata->local->quiescing)
+		ieee80211_queue_work(&sdata->local->hw,
+				     &ifmgd->beacon_connection_loss_work);
+}
+
+/*
+ * Connection monitor timer callback. @data carries the sub-interface
+ * pointer. Queues monitor_work unless local->quiescing is set.
+ */
+static void ieee80211_sta_conn_mon_timer(unsigned long data)
+{
+	struct ieee80211_sub_if_data *sdata = (void *) data;
+
+	if (sdata->local->quiescing)
+		return;
+
+	ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.monitor_work);
+}
+
+/*
+ * Work item behind the connection monitor: probe the AP once
+ * (second argument to ieee80211_mgd_probe_ap() is false).
+ */
+static void ieee80211_sta_monitor_work(struct work_struct *work)
+{
+	struct ieee80211_if_managed *ifmgd =
+		container_of(work, struct ieee80211_if_managed, monitor_work);
+	struct ieee80211_sub_if_data *sdata =
+		container_of(ifmgd, struct ieee80211_sub_if_data, u.mgd);
+
+	ieee80211_mgd_probe_ap(sdata, false);
+}
+
+/*
+ * Restart monitoring on a station interface: clear any pending poll
+ * flags, then queue one connection probe and the regular interface work.
+ * Interfaces of any other type are left untouched.
+ */
+static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_hw *hw = &sdata->local->hw;
+
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return;
+
+	sdata->u.mgd.flags &= ~(IEEE80211_STA_BEACON_POLL |
+				IEEE80211_STA_CONNECTION_POLL);
+
+	/* probe the connection once ... */
+	ieee80211_queue_work(hw, &sdata->u.mgd.monitor_work);
+	/* ... and run the ordinary interface work as well */
+	ieee80211_queue_work(hw, &sdata->work);
+}
+
+#ifdef CONFIG_PM
+/*
+ * Stop all managed-mode work items and timers of @sdata.
+ *
+ * For the main timer and the channel-switch timer, a bit is recorded in
+ * ifmgd->timers_running when the timer was still pending, so that
+ * ieee80211_sta_restart() can re-add it. The monitor timers are simply
+ * deleted.
+ */
+void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+	/*
+	 * we need to use atomic bitops for the running bits
+	 * only because both timers might fire at the same
+	 * time -- the code here is properly synchronised.
+	 */
+
+	cancel_work_sync(&ifmgd->request_smps_work);
+
+	cancel_work_sync(&ifmgd->beacon_connection_loss_work);
+	/* del_timer_sync() returns non-zero if the timer was pending */
+	if (del_timer_sync(&ifmgd->timer))
+		set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running);
+
+	cancel_work_sync(&ifmgd->chswitch_work);
+	if (del_timer_sync(&ifmgd->chswitch_timer))
+		set_bit(TMR_RUNNING_CHANSW, &ifmgd->timers_running);
+
+	cancel_work_sync(&ifmgd->monitor_work);
+	/* these will just be re-established on connection */
+	del_timer_sync(&ifmgd->conn_mon_timer);
+	del_timer_sync(&ifmgd->bcn_mon_timer);
+}
+
+/*
+ * Counterpart of ieee80211_sta_quiesce(): if still associated, re-add the
+ * timers recorded in timers_running, reset the beacon monitor and queue
+ * the station work again. Does nothing when not associated.
+ */
+void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+	unsigned long *running = &mgd->timers_running;
+
+	if (mgd->associated) {
+		if (test_and_clear_bit(TMR_RUNNING_TIMER, running))
+			add_timer(&mgd->timer);
+
+		if (test_and_clear_bit(TMR_RUNNING_CHANSW, running))
+			add_timer(&mgd->chswitch_timer);
+
+		ieee80211_sta_reset_beacon_monitor(sdata);
+		ieee80211_restart_sta_timer(sdata);
+	}
+}
+#endif
+
+/* interface setup */
+/*
+ * Initialise the managed-mode part of @sdata: work items, timers (each
+ * carrying @sdata as callback argument), flags, the mutex and the
+ * requested SMPS mode, which is AUTOMATIC only when the hardware
+ * advertises dynamic SMPS support.
+ */
+void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *mgd = &sdata->u.mgd;
+	unsigned long timer_arg = (unsigned long) sdata;
+
+	INIT_WORK(&mgd->monitor_work, ieee80211_sta_monitor_work);
+	INIT_WORK(&mgd->chswitch_work, ieee80211_chswitch_work);
+	INIT_WORK(&mgd->beacon_connection_loss_work,
+		  ieee80211_beacon_connection_loss_work);
+	INIT_WORK(&mgd->request_smps_work, ieee80211_request_smps_work);
+
+	setup_timer(&mgd->timer, ieee80211_sta_timer, timer_arg);
+	setup_timer(&mgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer,
+		    timer_arg);
+	setup_timer(&mgd->conn_mon_timer, ieee80211_sta_conn_mon_timer,
+		    timer_arg);
+	setup_timer(&mgd->chswitch_timer, ieee80211_chswitch_timer,
+		    timer_arg);
+
+	mgd->flags = 0;
+
+	mutex_init(&mgd->mtx);
+
+	mgd->req_smps =
+		(sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_SMPS) ?
+		IEEE80211_SMPS_AUTOMATIC : IEEE80211_SMPS_OFF;
+}
+
+/*
+ * Scan finished notification.
+ *
+ * Walks every interface of @local under RCU and restarts its station
+ * monitoring; ieee80211_restart_sta_timer() ignores non-station
+ * interfaces.
+ */
+void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
+{
+	/* pure list cursor; no initial value is needed */
+	struct ieee80211_sub_if_data *sdata;
+
+	/* Restart STA timers */
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list)
+		ieee80211_restart_sta_timer(sdata);
+	rcu_read_unlock();
+}
+
+/*
+ * Notifier callback for network latency changes.
+ *
+ * @nb is embedded in struct ieee80211_local as network_latency_notifier;
+ * @data is passed on to ieee80211_recalc_ps() as an s32 latency value.
+ * @dummy is unused. Always returns 0.
+ */
+int ieee80211_max_network_latency(struct notifier_block *nb,
+				  unsigned long data, void *dummy)
+{
+	struct ieee80211_local *local;
+	s32 latency_usec;
+
+	latency_usec = (s32) data;
+	local = container_of(nb, struct ieee80211_local,
+			     network_latency_notifier);
+
+	/* powersave recalculation runs under the interface list mutex */
+	mutex_lock(&local->iflist_mtx);
+	ieee80211_recalc_ps(local, latency_usec);
+	mutex_unlock(&local->iflist_mtx);
+
+	return 0;
+}
+
+/* config hooks */
+/*
+ * Completion callback for direct-probe/auth work items.
+ *
+ * No @skb means the work timed out: report an auth timeout. An AUTH work
+ * item hands the received frame to cfg80211. Otherwise the frame is
+ * processed as a probe response and the item is turned into an AUTH item
+ * and requeued.
+ */
+static enum work_done_result
+ieee80211_probe_auth_done(struct ieee80211_work *wk,
+			  struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+
+	if (!skb) {
+		cfg80211_send_auth_timeout(sdata->dev, wk->filter_ta);
+		return WORK_DONE_DESTROY;
+	}
+
+	if (wk->type != IEEE80211_WORK_AUTH) {
+		/* probe response: process it, then continue with auth */
+		mutex_lock(&sdata->u.mgd.mtx);
+		ieee80211_rx_mgmt_probe_resp(sdata, skb);
+		mutex_unlock(&sdata->u.mgd.mtx);
+
+		wk->type = IEEE80211_WORK_AUTH;
+		wk->probe_auth.tries = 0;
+		return WORK_DONE_REQUEUE;
+	}
+
+	cfg80211_send_rx_auth(sdata->dev, skb->data, skb->len);
+	return WORK_DONE_DESTROY;
+}
+
+/*
+ * Start authentication with the BSS described by @req.
+ *
+ * Builds an ieee80211_work item (direct probe first when no probe
+ * response IEs are cached for the BSS, otherwise auth) and queues it.
+ *
+ * Returns 0 on success or for a pure local state change, -EOPNOTSUPP for
+ * an unsupported auth type (or shared key without a WEP transform),
+ * -ENOMEM on allocation failure, and -EINVAL when the BSS has no SSID
+ * element or its length exceeds IEEE80211_MAX_SSID_LEN.
+ */
+int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
+		       struct cfg80211_auth_request *req)
+{
+	const u8 *ssid;
+	struct ieee80211_work *wk;
+	u16 auth_alg;
+
+	if (req->local_state_change)
+		return 0; /* no need to update mac80211 state */
+
+	switch (req->auth_type) {
+	case NL80211_AUTHTYPE_OPEN_SYSTEM:
+		auth_alg = WLAN_AUTH_OPEN;
+		break;
+	case NL80211_AUTHTYPE_SHARED_KEY:
+		if (IS_ERR(sdata->local->wep_tx_tfm))
+			return -EOPNOTSUPP;
+		auth_alg = WLAN_AUTH_SHARED_KEY;
+		break;
+	case NL80211_AUTHTYPE_FT:
+		auth_alg = WLAN_AUTH_FT;
+		break;
+	case NL80211_AUTHTYPE_NETWORK_EAP:
+		auth_alg = WLAN_AUTH_LEAP;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* the extra IEs are stored right behind the work structure */
+	wk = kzalloc(sizeof(*wk) + req->ie_len, GFP_KERNEL);
+	if (!wk)
+		return -ENOMEM;
+
+	/*
+	 * The SSID element may be missing, and its length byte comes from
+	 * the air: validate both before copying into the fixed-size buffer.
+	 */
+	ssid = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID);
+	if (!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN) {
+		kfree(wk);
+		return -EINVAL;
+	}
+	memcpy(wk->probe_auth.ssid, ssid + 2, ssid[1]);
+	wk->probe_auth.ssid_len = ssid[1];
+
+	memcpy(wk->filter_ta, req->bss->bssid, ETH_ALEN);
+
+	if (req->ie && req->ie_len) {
+		memcpy(wk->ie, req->ie, req->ie_len);
+		wk->ie_len = req->ie_len;
+	}
+
+	if (req->key && req->key_len) {
+		wk->probe_auth.key_len = req->key_len;
+		wk->probe_auth.key_idx = req->key_idx;
+		memcpy(wk->probe_auth.key, req->key, req->key_len);
+	}
+
+	wk->probe_auth.algorithm = auth_alg;
+	wk->probe_auth.privacy = req->bss->capability & WLAN_CAPABILITY_PRIVACY;
+
+	/* if we already have a probe, don't probe again */
+	if (req->bss->proberesp_ies)
+		wk->type = IEEE80211_WORK_AUTH;
+	else
+		wk->type = IEEE80211_WORK_DIRECT_PROBE;
+	wk->chan = req->bss->channel;
+	wk->chan_type = NL80211_CHAN_NO_HT;
+	wk->sdata = sdata;
+	wk->done = ieee80211_probe_auth_done;
+
+	ieee80211_add_work(wk);
+	return 0;
+}
+
+/*
+ * Completion callback for association work items.
+ *
+ * No @skb means a timeout. In the ASSOC_BEACON_WAIT state the frame is
+ * parsed as BSS info and the item is requeued as a normal ASSOC item.
+ * Otherwise the frame is an association response: on success status the
+ * association is set up (an internal failure is reported as a timeout),
+ * and the frame is handed to cfg80211.
+ */
+static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk,
+						  struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct mutex *mtx = &sdata->u.mgd.mtx;
+	struct ieee80211_rx_status *rx_status;
+	struct ieee802_11_elems elems;
+	struct ieee80211_mgmt *mgmt;
+
+	if (!skb) {
+		cfg80211_send_assoc_timeout(sdata->dev, wk->filter_ta);
+		return WORK_DONE_DESTROY;
+	}
+
+	if (wk->type == IEEE80211_WORK_ASSOC_BEACON_WAIT) {
+		mutex_lock(mtx);
+		rx_status = (void *) skb->cb;
+		/* elements start 24 + 12 bytes into the frame */
+		ieee802_11_parse_elems(skb->data + 24 + 12,
+				       skb->len - 24 - 12, &elems);
+		ieee80211_rx_bss_info(sdata, (void *)skb->data, skb->len,
+				      rx_status, &elems, true);
+		mutex_unlock(mtx);
+
+		/* not really done yet */
+		wk->type = IEEE80211_WORK_ASSOC;
+		return WORK_DONE_REQUEUE;
+	}
+
+	mgmt = (void *)skb->data;
+
+	if (le16_to_cpu(mgmt->u.assoc_resp.status_code) ==
+	    WLAN_STATUS_SUCCESS) {
+		bool ok;
+
+		mutex_lock(mtx);
+		ok = ieee80211_assoc_success(wk, mgmt, skb->len);
+		mutex_unlock(mtx);
+
+		if (!ok) {
+			/* oops -- internal error -- send timeout for now */
+			cfg80211_send_assoc_timeout(sdata->dev,
+						    wk->filter_ta);
+			return WORK_DONE_DESTROY;
+		}
+	}
+
+	cfg80211_send_rx_assoc(sdata->dev, skb->data, skb->len);
+	return WORK_DONE_DESTROY;
+}
+
+/*
+ * Start (re)association with the BSS described by @req.
+ *
+ * If already associated, the request is only accepted as a
+ * reassociation from the current BSS (req->prev_bssid matches); the old
+ * association state is then cleared first. A work item is built from the
+ * request and BSS data and queued.
+ *
+ * Returns 0 on success, -EALREADY when associated and the request is not
+ * a matching reassociation, -ENOMEM on allocation failure, and -EINVAL
+ * when the BSS has no SSID element or its length exceeds
+ * IEEE80211_MAX_SSID_LEN.
+ */
+int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
+			struct cfg80211_assoc_request *req)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_bss *bss = (void *)req->bss->priv;
+	struct ieee80211_work *wk;
+	const u8 *ssid;
+	int i;
+
+	mutex_lock(&ifmgd->mtx);
+	if (ifmgd->associated) {
+		if (!req->prev_bssid ||
+		    memcmp(req->prev_bssid, ifmgd->associated->bssid,
+			   ETH_ALEN)) {
+			/*
+			 * We are already associated and the request was not a
+			 * reassociation request from the current BSS, so
+			 * reject it.
+			 */
+			mutex_unlock(&ifmgd->mtx);
+			return -EALREADY;
+		}
+
+		/* Trying to reassociate - clear previous association state */
+		ieee80211_set_disassoc(sdata, true, false);
+	}
+	mutex_unlock(&ifmgd->mtx);
+
+	/* the extra IEs are stored right behind the work structure */
+	wk = kzalloc(sizeof(*wk) + req->ie_len, GFP_KERNEL);
+	if (!wk)
+		return -ENOMEM;
+
+	/*
+	 * The SSID element may be missing, and its length byte comes from
+	 * the air: validate both before copying into the fixed-size buffer,
+	 * and before any ifmgd state is modified for this request.
+	 */
+	ssid = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID);
+	if (!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN) {
+		kfree(wk);
+		return -EINVAL;
+	}
+	memcpy(wk->assoc.ssid, ssid + 2, ssid[1]);
+	wk->assoc.ssid_len = ssid[1];
+
+	ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N;
+	ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
+
+	ifmgd->beacon_crc_valid = false;
+
+	/* WEP/TKIP as pairwise cipher rules out HT, see below */
+	for (i = 0; i < req->crypto.n_ciphers_pairwise; i++)
+		if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 ||
+		    req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP ||
+		    req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104)
+			ifmgd->flags |= IEEE80211_STA_DISABLE_11N;
+
+
+	if (req->ie && req->ie_len) {
+		memcpy(wk->ie, req->ie, req->ie_len);
+		wk->ie_len = req->ie_len;
+	} else
+		wk->ie_len = 0;
+
+	wk->assoc.bss = req->bss;
+
+	memcpy(wk->filter_ta, req->bss->bssid, ETH_ALEN);
+
+	/* new association always uses requested smps mode */
+	if (ifmgd->req_smps == IEEE80211_SMPS_AUTOMATIC) {
+		if (ifmgd->powersave)
+			ifmgd->ap_smps = IEEE80211_SMPS_DYNAMIC;
+		else
+			ifmgd->ap_smps = IEEE80211_SMPS_OFF;
+	} else
+		ifmgd->ap_smps = ifmgd->req_smps;
+
+	wk->assoc.smps = ifmgd->ap_smps;
+	/*
+	 * IEEE802.11n does not allow TKIP/WEP as pairwise ciphers in HT mode.
+	 * We still associate in non-HT mode (11a/b/g) if any one of these
+	 * ciphers is configured as pairwise.
+	 * We can set this to true for non-11n hardware, that'll be checked
+	 * separately along with the peer capabilities.
+	 */
+	wk->assoc.use_11n = !(ifmgd->flags & IEEE80211_STA_DISABLE_11N);
+	wk->assoc.capability = req->bss->capability;
+	wk->assoc.wmm_used = bss->wmm_used;
+	wk->assoc.supp_rates = bss->supp_rates;
+	wk->assoc.supp_rates_len = bss->supp_rates_len;
+	wk->assoc.ht_information_ie =
+		ieee80211_bss_get_ie(req->bss, WLAN_EID_HT_INFORMATION);
+
+	/* U-APSD needs WMM, AP support and hardware support */
+	if (bss->wmm_used && bss->uapsd_supported &&
+	    (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) {
+		wk->assoc.uapsd_used = true;
+		ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED;
+	} else {
+		wk->assoc.uapsd_used = false;
+		ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED;
+	}
+
+	if (req->prev_bssid)
+		memcpy(wk->assoc.prev_bssid, req->prev_bssid, ETH_ALEN);
+
+	wk->chan = req->bss->channel;
+	wk->chan_type = NL80211_CHAN_NO_HT;
+	wk->sdata = sdata;
+	wk->done = ieee80211_assoc_done;
+	/* wait for a beacon first if the DTIM period is needed but unknown */
+	if (!bss->dtim_period &&
+	    sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD)
+		wk->type = IEEE80211_WORK_ASSOC_BEACON_WAIT;
+	else
+		wk->type = IEEE80211_WORK_ASSOC;
+
+	if (req->use_mfp) {
+		ifmgd->mfp = IEEE80211_MFP_REQUIRED;
+		ifmgd->flags |= IEEE80211_STA_MFP_ENABLED;
+	} else {
+		ifmgd->mfp = IEEE80211_MFP_DISABLED;
+		ifmgd->flags &= ~IEEE80211_STA_MFP_ENABLED;
+	}
+
+	if (req->crypto.control_port)
+		ifmgd->flags |= IEEE80211_STA_CONTROL_PORT;
+	else
+		ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT;
+
+	sdata->control_port_protocol = req->crypto.control_port_ethertype;
+	sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
+
+	ieee80211_add_work(wk);
+	return 0;
+}
+
+/*
+ * Deauthenticate from req->bss on local request.
+ *
+ * If req->bss is the associated BSS, the association is torn down first.
+ * Otherwise a pending probe/auth/assoc work item for that BSSID, if any,
+ * is removed; if that item was still in the direct-probe state the
+ * authentication is reported as cancelled and no frame is sent.
+ * In all other cases a deauth frame is handed to
+ * ieee80211_send_deauth_disassoc() and the idle state is recalculated.
+ *
+ * Always returns 0.
+ */
+int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
+			 struct cfg80211_deauth_request *req,
+			 void *cookie)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_work *wk;
+	u8 bssid[ETH_ALEN];
+	bool assoc_bss = false;
+
+	mutex_lock(&ifmgd->mtx);
+
+	/* work on a private copy of the BSSID from here on */
+	memcpy(bssid, req->bss->bssid, ETH_ALEN);
+	if (ifmgd->associated == req->bss) {
+		ieee80211_set_disassoc(sdata, false, true);
+		mutex_unlock(&ifmgd->mtx);
+		assoc_bss = true;
+	} else {
+		bool not_auth_yet = false;
+
+		mutex_unlock(&ifmgd->mtx);
+
+		/* look for a pending connection attempt to this BSSID */
+		mutex_lock(&local->mtx);
+		list_for_each_entry(wk, &local->work_list, list) {
+			if (wk->sdata != sdata)
+				continue;
+
+			if (wk->type != IEEE80211_WORK_DIRECT_PROBE &&
+			    wk->type != IEEE80211_WORK_AUTH &&
+			    wk->type != IEEE80211_WORK_ASSOC &&
+			    wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT)
+				continue;
+
+			if (memcmp(req->bss->bssid, wk->filter_ta, ETH_ALEN))
+				continue;
+
+			not_auth_yet = wk->type == IEEE80211_WORK_DIRECT_PROBE;
+			/* the break right after makes the plain iterator safe */
+			list_del_rcu(&wk->list);
+			free_work(wk);
+			break;
+		}
+		mutex_unlock(&local->mtx);
+
+		/*
+		 * If somebody requests authentication and we haven't
+		 * sent out an auth frame yet there's no need to send
+		 * out a deauth frame either. If the state was PROBE,
+		 * then this is the case. If it's AUTH we have sent a
+		 * frame, and if it's IDLE we have completed the auth
+		 * process already.
+		 */
+		if (not_auth_yet) {
+			__cfg80211_auth_canceled(sdata->dev, bssid);
+			return 0;
+		}
+	}
+
+	printk(KERN_DEBUG "%s: deauthenticating from %pM by local choice (reason=%d)\n",
+	       sdata->name, bssid, req->reason_code);
+
+	ieee80211_send_deauth_disassoc(sdata, bssid, IEEE80211_STYPE_DEAUTH,
+				       req->reason_code, cookie,
+				       !req->local_state_change);
+	/* the station entry is only destroyed when we were associated */
+	if (assoc_bss)
+		sta_info_destroy_addr(sdata, bssid);
+
+	mutex_lock(&sdata->local->mtx);
+	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
+
+	return 0;
+}
+
+/*
+ * Disassociate from req->bss on local request.
+ *
+ * Returns -ENOLINK when req->bss is not the currently associated BSS,
+ * otherwise tears down the association, hands a disassoc frame to
+ * ieee80211_send_deauth_disassoc(), destroys the station entry,
+ * recalculates the idle state and returns 0.
+ */
+int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
+			   struct cfg80211_disassoc_request *req,
+			   void *cookie)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	u8 bssid[ETH_ALEN];
+
+	mutex_lock(&ifmgd->mtx);
+
+	/*
+	 * cfg80211 should catch this ... but it's racy since
+	 * we can receive a disassoc frame, process it, hand it
+	 * to cfg80211 while that's in a locked section already
+	 * trying to tell us that the user wants to disconnect.
+	 */
+	if (ifmgd->associated != req->bss) {
+		mutex_unlock(&ifmgd->mtx);
+		return -ENOLINK;
+	}
+
+	printk(KERN_DEBUG "%s: disassociating from %pM by local choice (reason=%d)\n",
+	       sdata->name, req->bss->bssid, req->reason_code);
+
+	/* take a private BSSID copy before the association is cleared */
+	memcpy(bssid, req->bss->bssid, ETH_ALEN);
+	ieee80211_set_disassoc(sdata, false, true);
+
+	mutex_unlock(&ifmgd->mtx);
+
+	/* use the private copy for both calls, as ieee80211_mgd_deauth() does */
+	ieee80211_send_deauth_disassoc(sdata, bssid,
+			IEEE80211_STYPE_DISASSOC, req->reason_code,
+			cookie, !req->local_state_change);
+	sta_info_destroy_addr(sdata, bssid);
+
+	mutex_lock(&sdata->local->mtx);
+	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
+
+	return 0;
+}
+
+/*
+ * Driver-facing helper: trace the CQM RSSI event for @vif and forward it
+ * to cfg80211 with the allocation flags @gfp.
+ */
+void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
+			       enum nl80211_cqm_rssi_threshold_event rssi_event,
+			       gfp_t gfp)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	sdata = vif_to_sdata(vif);
+	trace_api_cqm_rssi_notify(sdata, rssi_event);
+	cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp);
+}
+EXPORT_SYMBOL(ieee80211_cqm_rssi_notify);
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
new file mode 100644
index 00000000..c55eb9d8
--- /dev/null
+++ b/net/mac80211/offchannel.c
@@ -0,0 +1,299 @@
+/*
+ * Off-channel operation helpers
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-trace.h"
+
+/*
+ * Tell our hardware to disable PS.
+ * Optionally inform AP that we will go to sleep so that it will buffer
+ * the frames while we are doing off-channel work.  This is optional
+ * because we *may* be doing work on-operating channel, and want our
+ * hardware unconditionally awake, but still let the AP send us normal frames.
+ *
+ * @sdata:   station interface to act on
+ * @tell_ap: send a nullfunc frame with the powersave bit set when needed
+ *
+ * local->offchannel_ps_enabled records whether IEEE80211_CONF_PS was set
+ * (and has been cleared here), for ieee80211_offchannel_ps_disable().
+ */
+static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata,
+					   bool tell_ap)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+	local->offchannel_ps_enabled = false;
+
+	/* FIXME: what to do when local->pspolling is true? */
+
+	/* stop the dynamic PS and monitor timers for the time being */
+	del_timer_sync(&local->dynamic_ps_timer);
+	del_timer_sync(&ifmgd->bcn_mon_timer);
+	del_timer_sync(&ifmgd->conn_mon_timer);
+
+	cancel_work_sync(&local->dynamic_ps_enable_work);
+
+	/* take the hardware out of powersave, remembering that it was on */
+	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+		local->offchannel_ps_enabled = true;
+		local->hw.conf.flags &= ~IEEE80211_CONF_PS;
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+	}
+
+	if (tell_ap && (!local->offchannel_ps_enabled ||
+			!(local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK)))
+		/*
+		 * If power save was enabled, no need to send a nullfunc
+		 * frame because AP knows that we are sleeping. But if the
+		 * hardware is creating the nullfunc frame for power save
+		 * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not
+		 * enabled) and power save was enabled, the firmware just
+		 * sent a null frame with power save disabled. So we need
+		 * to send a new nullfunc frame to inform the AP that we
+		 * are again sleeping.
+		 */
+		ieee80211_send_nullfunc(local, sdata, 1);
+}
+
+/*
+ * Inform AP that we are awake again, unless power save is enabled.
+ *
+ * Counterpart of ieee80211_offchannel_ps_enable(): depending on the
+ * powersave state it sends a nullfunc frame with the powersave bit
+ * cleared, re-enables IEEE80211_CONF_PS in hardware, or restarts the
+ * dynamic PS timer. The beacon and connection monitors are always reset.
+ */
+static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+
+	/* no interface uses powersave: just announce that we are awake */
+	if (!local->ps_sdata)
+		ieee80211_send_nullfunc(local, sdata, 0);
+	else if (local->offchannel_ps_enabled) {
+		/*
+		 * In !IEEE80211_HW_PS_NULLFUNC_STACK case the hardware
+		 * will send a nullfunc frame with the powersave bit set
+		 * even though the AP already knows that we are sleeping.
+		 * This could be avoided by sending a null frame with power
+		 * save bit disabled before enabling the power save, but
+		 * this doesn't gain anything.
+		 *
+		 * When IEEE80211_HW_PS_NULLFUNC_STACK is enabled, no need
+		 * to send a nullfunc frame because AP already knows that
+		 * we are sleeping, let's just enable power save mode in
+		 * hardware.
+		 */
+		/* TODO:  Only set hardware if CONF_PS changed?
+		 * TODO:  Should we set offchannel_ps_enabled to false?
+		 */
+		local->hw.conf.flags |= IEEE80211_CONF_PS;
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+	} else if (local->hw.conf.dynamic_ps_timeout > 0) {
+		/*
+		 * If IEEE80211_CONF_PS was not set and the dynamic_ps_timer
+		 * had been running before leaving the operating channel,
+		 * restart the timer now and send a nullfunc frame to inform
+		 * the AP that we are awake.
+		 */
+		ieee80211_send_nullfunc(local, sdata, 0);
+		mod_timer(&local->dynamic_ps_timer, jiffies +
+			  msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
+	}
+
+	/* the monitor timers were deleted when going off-channel */
+	ieee80211_sta_reset_beacon_monitor(sdata);
+	ieee80211_sta_reset_conn_monitor(sdata);
+}
+
+/*
+ * Prepare all running interfaces of @local for leaving the operating
+ * channel: mark non-monitor interfaces off-channel, notify beaconing
+ * interface types about the beacon state, stop TX queues and, when
+ * @offchannel_ps_enable is set, put associated stations into powersave
+ * towards the AP.
+ */
+void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local,
+				    bool offchannel_ps_enable)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	mutex_lock(&local->iflist_mtx);
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		bool is_monitor;
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		is_monitor = sdata->vif.type == NL80211_IFTYPE_MONITOR;
+		if (!is_monitor)
+			set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+
+		/* beaconing interface types get a beacon-enabled update */
+		switch (sdata->vif.type) {
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_MESH_POINT:
+			ieee80211_bss_info_change_notify(
+				sdata, BSS_CHANGED_BEACON_ENABLED);
+			break;
+		default:
+			break;
+		}
+
+		if (is_monitor)
+			continue;
+
+		netif_tx_stop_all_queues(sdata->dev);
+
+		/* tell the AP we are leaving, for associated stations only */
+		if (!offchannel_ps_enable)
+			continue;
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
+			continue;
+		if (sdata->u.mgd.associated)
+			ieee80211_offchannel_ps_enable(sdata, true);
+	}
+	mutex_unlock(&local->iflist_mtx);
+}
+
+/*
+ * Run ieee80211_offchannel_ps_enable() on every running, associated
+ * station interface of @local, passing @tell_ap through.
+ */
+void ieee80211_offchannel_enable_all_ps(struct ieee80211_local *local,
+					bool tell_ap)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	mutex_lock(&local->iflist_mtx);
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
+			continue;
+		if (!sdata->u.mgd.associated)
+			continue;
+
+		ieee80211_offchannel_ps_enable(sdata, tell_ap);
+	}
+	mutex_unlock(&local->iflist_mtx);
+}
+
+/*
+ * Undo ieee80211_offchannel_stop_vifs() for all running interfaces.
+ *
+ * @enable_beaconing:      send a beacon-enabled update to beaconing
+ *                         interface types
+ * @offchannel_ps_disable: tell the AP that associated stations are back
+ */
+void ieee80211_offchannel_return(struct ieee80211_local *local,
+				 bool enable_beaconing,
+				 bool offchannel_ps_disable)
+{
+	struct ieee80211_sub_if_data *sdata;
+
+	mutex_lock(&local->iflist_mtx);
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		/* Tell AP we're back */
+		if (offchannel_ps_disable &&
+		    sdata->vif.type == NL80211_IFTYPE_STATION &&
+		    sdata->u.mgd.associated)
+			ieee80211_offchannel_ps_disable(sdata);
+
+		if (sdata->vif.type != NL80211_IFTYPE_MONITOR) {
+			clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+			/*
+			 * Waking here may restart queues the driver itself
+			 * has stopped. That is unlikely, because the driver
+			 * has seen few or no new packets while off-channel,
+			 * and harmless: at worst one extra packet per queue
+			 * sits in mac80211 instead of the interface qdisc.
+			 */
+			netif_tx_wake_all_queues(sdata->dev);
+		}
+
+		/* Check to see if we should re-enable beaconing */
+		if (!enable_beaconing)
+			continue;
+
+		switch (sdata->vif.type) {
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_MESH_POINT:
+			ieee80211_bss_info_change_notify(
+				sdata, BSS_CHANGED_BEACON_ENABLED);
+			break;
+		default:
+			break;
+		}
+	}
+	mutex_unlock(&local->iflist_mtx);
+}
+
+/*
+ * Work item queued by ieee80211_ready_on_channel().
+ *
+ * Under local->mtx: if a remain-on-channel request is still active,
+ * recalculate the idle state and then either transmit the frame stored
+ * for this period or tell cfg80211 that the channel is ready.
+ */
+static void ieee80211_hw_roc_start(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, hw_roc_start);
+
+	mutex_lock(&local->mtx);
+
+	/* no request active any more */
+	if (!local->hw_roc_channel)
+		goto out_unlock;
+
+	ieee80211_recalc_idle(local);
+
+	if (!local->hw_roc_skb) {
+		cfg80211_ready_on_channel(local->hw_roc_dev,
+					  local->hw_roc_cookie,
+					  local->hw_roc_channel,
+					  local->hw_roc_channel_type,
+					  local->hw_roc_duration,
+					  GFP_KERNEL);
+	} else {
+		struct ieee80211_sub_if_data *sdata;
+
+		sdata = IEEE80211_DEV_TO_SUB_IF(local->hw_roc_dev);
+		ieee80211_tx_skb(sdata, local->hw_roc_skb);
+		local->hw_roc_skb = NULL;
+	}
+
+ out_unlock:
+	mutex_unlock(&local->mtx);
+}
+
+/*
+ * Driver-facing entry point: trace the event and defer the handling to
+ * the hw_roc_start work item.
+ */
+void ieee80211_ready_on_channel(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local;
+
+	local = hw_to_local(hw);
+	trace_api_ready_on_channel(local);
+	ieee80211_queue_work(hw, &local->hw_roc_start);
+}
+EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel);
+
+/*
+ * Work item queued by ieee80211_remain_on_channel_expired().
+ *
+ * Under local->mtx: if a remain-on-channel request is active, report a
+ * frame that is still stored as not transmitted and free it, report the
+ * expiry to cfg80211 unless the period existed only for a TX, clear the
+ * request state and recalculate the idle state.
+ */
+static void ieee80211_hw_roc_done(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, hw_roc_done);
+	struct sk_buff *pending;
+
+	mutex_lock(&local->mtx);
+
+	if (!local->hw_roc_channel)
+		goto out_unlock;
+
+	/* was never transmitted */
+	pending = local->hw_roc_skb;
+	if (pending) {
+		/* the status cookie is the ROC cookie with bit 1 flipped */
+		cfg80211_mgmt_tx_status(local->hw_roc_dev,
+					local->hw_roc_cookie ^ 2,
+					pending->data, pending->len, false,
+					GFP_KERNEL);
+
+		kfree_skb(pending);
+		local->hw_roc_skb = NULL;
+		local->hw_roc_skb_for_status = NULL;
+	}
+
+	if (!local->hw_roc_for_tx)
+		cfg80211_remain_on_channel_expired(local->hw_roc_dev,
+						   local->hw_roc_cookie,
+						   local->hw_roc_channel,
+						   local->hw_roc_channel_type,
+						   GFP_KERNEL);
+
+	local->hw_roc_channel = NULL;
+	local->hw_roc_cookie = 0;
+
+	ieee80211_recalc_idle(local);
+
+ out_unlock:
+	mutex_unlock(&local->mtx);
+}
+
+/*
+ * Driver-facing entry point: trace the event and defer the handling to
+ * the hw_roc_done work item.
+ */
+void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local;
+
+	local = hw_to_local(hw);
+	trace_api_remain_on_channel_expired(local);
+	ieee80211_queue_work(hw, &local->hw_roc_done);
+}
+EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired);
+
+/* Initialise the two remain-on-channel work items of @local. */
+void ieee80211_hw_roc_setup(struct ieee80211_local *local)
+{
+	INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done);
+	INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start);
+}
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
new file mode 100644
index 00000000..730778a2
--- /dev/null
+++ b/net/mac80211/pm.c
@@ -0,0 +1,130 @@
+#include <net/mac80211.h>
+#include <net/rtnetlink.h>
+
+#include "ieee80211_i.h"
+#include "mesh.h"
+#include "driver-ops.h"
+#include "led.h"
+
+/*
+ * Suspend the mac80211 device @hw.
+ *
+ * Cancels scanning, blocks and tears down aggregation sessions, stops
+ * and flushes the TX queues, then sets local->quiescing so that timers
+ * stop queueing work, and flushes the workqueue. With WoWLAN (@wowlan
+ * given and the device open) the rest is left to drv_suspend();
+ * otherwise keys are disabled, stations and interfaces are removed from
+ * the driver and the hardware is stopped.
+ *
+ * Returns 0 on success or the error returned by drv_suspend().
+ *
+ * NOTE(review): on a drv_suspend() error this returns with the queues
+ * still stopped for IEEE80211_QUEUE_STOP_REASON_SUSPEND and
+ * WLAN_STA_BLOCK_BA still set; nothing visible here undoes that --
+ * confirm whether the caller recovers from it.
+ */
+int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+
+	ieee80211_scan_cancel(local);
+
+	/* block new BA sessions and tear down the existing ones */
+	if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+		mutex_lock(&local->sta_mtx);
+		list_for_each_entry(sta, &local->sta_list, list) {
+			set_sta_flags(sta, WLAN_STA_BLOCK_BA);
+			ieee80211_sta_tear_down_BA_sessions(sta, true);
+		}
+		mutex_unlock(&local->sta_mtx);
+	}
+
+	ieee80211_stop_queues_by_reason(hw,
+			IEEE80211_QUEUE_STOP_REASON_SUSPEND);
+
+	/* flush out all packets */
+	synchronize_net();
+
+	drv_flush(local, false);
+
+	local->quiescing = true;
+	/* make quiescing visible to timers everywhere */
+	mb();
+
+	flush_workqueue(local->workqueue);
+
+	/* Don't try to run timers while suspended. */
+	del_timer_sync(&local->sta_cleanup);
+
+	 /*
+	 * Note that this particular timer doesn't need to be
+	 * restarted at resume.
+	 */
+	cancel_work_sync(&local->dynamic_ps_enable_work);
+	del_timer_sync(&local->dynamic_ps_timer);
+
+	/* WoWLAN is only used when requested and the device is open */
+	local->wowlan = wowlan && local->open_count;
+	if (local->wowlan) {
+		int err = drv_suspend(local, wowlan);
+		if (err) {
+			local->quiescing = false;
+			return err;
+		}
+		/* the driver stays configured: skip the teardown below */
+		goto suspend;
+	}
+
+	/* disable keys */
+	list_for_each_entry(sdata, &local->interfaces, list)
+		ieee80211_disable_keys(sdata);
+
+	/* tear down aggregation sessions and remove STAs */
+	mutex_lock(&local->sta_mtx);
+	list_for_each_entry(sta, &local->sta_list, list) {
+		if (sta->uploaded) {
+			sdata = sta->sdata;
+			/* for AP_VLAN, use the sdata that embeds the bss (u.ap) */
+			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+				sdata = container_of(sdata->bss,
+					     struct ieee80211_sub_if_data,
+					     u.ap);
+
+			drv_sta_remove(local, sdata, &sta->sta);
+		}
+
+		mesh_plink_quiesce(sta);
+	}
+	mutex_unlock(&local->sta_mtx);
+
+	/* remove all interfaces */
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		cancel_work_sync(&sdata->work);
+
+		/* per-type quiescing of timers and work items */
+		switch(sdata->vif.type) {
+		case NL80211_IFTYPE_STATION:
+			ieee80211_sta_quiesce(sdata);
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			ieee80211_ibss_quiesce(sdata);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			ieee80211_mesh_quiesce(sdata);
+			break;
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_MONITOR:
+			/* don't tell driver about this */
+			continue;
+		default:
+			break;
+		}
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		/* disable beaconing */
+		ieee80211_bss_info_change_notify(sdata,
+			BSS_CHANGED_BEACON_ENABLED);
+
+		drv_remove_interface(local, &sdata->vif);
+	}
+
+	/* stop hardware - this must stop RX */
+	if (local->open_count)
+		ieee80211_stop_device(local);
+
+ suspend:
+	local->suspended = true;
+	/* need suspended to be visible before quiescing is false */
+	barrier();
+	local->quiescing = false;
+
+	return 0;
+}
+
+/*
+ * __ieee80211_resume() is a static inline which just calls
+ * ieee80211_reconfig(), which is also needed for hardware
+ * hang/firmware failure/etc. recovery.
+ */
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
new file mode 100644
index 00000000..816590b0
--- /dev/null
+++ b/net/mac80211/rate.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include "rate.h"
+#include "ieee80211_i.h"
+#include "debugfs.h"
+
+struct rate_control_alg {	/* one registered rate control algorithm */
+	struct list_head list;		/* entry in the rate_ctrl_algs list */
+	struct rate_control_ops *ops;	/* ops handed to ..._register() */
+};
+
+static LIST_HEAD(rate_ctrl_algs);
+static DEFINE_MUTEX(rate_ctrl_mutex);
+
+static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT;
+module_param(ieee80211_default_rc_algo, charp, 0644);
+MODULE_PARM_DESC(ieee80211_default_rc_algo,
+		 "Default rate control algorithm for mac80211 to use");
+
+/* Register @ops as an available rate control algorithm; 0 or -errno. */
+int ieee80211_rate_control_register(struct rate_control_ops *ops)
+{
+	struct rate_control_alg *entry;
+	int ret = 0;
+
+	if (!ops->name)
+		return -EINVAL;
+
+	mutex_lock(&rate_ctrl_mutex);
+	list_for_each_entry(entry, &rate_ctrl_algs, list) {
+		if (strcmp(entry->ops->name, ops->name) == 0) {
+			/* the same name must not be registered twice */
+			WARN_ON(1);
+			ret = -EALREADY;
+			goto unlock;
+		}
+	}
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	entry->ops = ops;
+	list_add_tail(&entry->list, &rate_ctrl_algs);
+unlock:
+	mutex_unlock(&rate_ctrl_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ieee80211_rate_control_register);
+
+void ieee80211_rate_control_unregister(struct rate_control_ops *ops)
+{
+	struct rate_control_alg *entry;
+
+	mutex_lock(&rate_ctrl_mutex);
+	list_for_each_entry(entry, &rate_ctrl_algs, list) {
+		if (entry->ops != ops)
+			continue;	/* not the registration for @ops */
+		list_del(&entry->list);	/* found it: unlink, then free */
+		kfree(entry);
+		break;
+	}
+	mutex_unlock(&rate_ctrl_mutex);
+}
+EXPORT_SYMBOL(ieee80211_rate_control_unregister);
+
+/* Find a registered algorithm by name and pin its module; NULL if none. */
+static struct rate_control_ops *
+ieee80211_try_rate_control_ops_get(const char *name)
+{
+	struct rate_control_alg *entry;
+	struct rate_control_ops *found = NULL;
+
+	if (!name)
+		return NULL;
+	mutex_lock(&rate_ctrl_mutex);
+	list_for_each_entry(entry, &rate_ctrl_algs, list) {
+		if (strcmp(entry->ops->name, name) != 0 ||
+		    !try_module_get(entry->ops->module))
+			continue;
+		found = entry->ops;
+		break;
+	}
+	mutex_unlock(&rate_ctrl_mutex);
+	return found;
+}
+
+/* Resolve @name (NULL means the default) to registered ops, or NULL. */
+static struct rate_control_ops *
+ieee80211_rate_control_ops_get(const char *name)
+{
+	struct rate_control_ops *ops;
+	const char *alg_name;
+
+	kparam_block_sysfs_write(ieee80211_default_rc_algo);
+	if (!name)
+		alg_name = ieee80211_default_rc_algo;
+	else
+		alg_name = name;
+
+	ops = ieee80211_try_rate_control_ops_get(alg_name);
+	if (!ops) {
+		/* not registered (yet): try loading it as a module, retry */
+		request_module("rc80211_%s", alg_name);
+		ops = ieee80211_try_rate_control_ops_get(alg_name);
+	}
+	if (!ops && name)
+		/* try default if specific alg requested but not found */
+		ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo);
+
+	/* last resort: the compile-time default, if one is configured */
+	if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT))
+		ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT);
+	kparam_unblock_sysfs_write(ieee80211_default_rc_algo);
+
+	return ops;
+}
+
+static void ieee80211_rate_control_ops_put(struct rate_control_ops *ops)
+{
+	module_put(ops->module);	/* pairs with try_module_get() at lookup */
+}
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+/* debugfs read handler: copy out the algorithm name, without a newline */
+static ssize_t rcname_read(struct file *file, char __user *userbuf,
+			   size_t count, loff_t *ppos)
+{
+	struct rate_control_ref *rc = file->private_data;
+	const char *nm = rc->ops->name;
+
+	return simple_read_from_buffer(userbuf, count, ppos, nm, strlen(nm));
+}
+
+static const struct file_operations rcname_ops = {	/* for the rc/name file */
+	.read = rcname_read,
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
+};
+#endif
+
+/* Create a rate control instance running algorithm @name; NULL on failure. */
+static struct rate_control_ref *rate_control_alloc(const char *name,
+					    struct ieee80211_local *local)
+{
+	struct dentry *debugfsdir = NULL;
+	struct rate_control_ref *ref;
+
+	ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL);
+	if (!ref)
+		return NULL;
+	kref_init(&ref->kref);
+	ref->local = local;
+	ref->ops = ieee80211_rate_control_ops_get(name);
+	if (!ref->ops)
+		goto fail_ops;
+#ifdef CONFIG_MAC80211_DEBUGFS
+	debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir);
+	local->debugfs.rcdir = debugfsdir;
+	debugfs_create_file("name", 0400, debugfsdir, ref, &rcname_ops);
+#endif
+	ref->priv = ref->ops->alloc(&local->hw, debugfsdir);
+	if (ref->priv)
+		return ref;
+#ifdef CONFIG_MAC80211_DEBUGFS
+	/* the "name" file still points at ref, which is freed below */
+	debugfs_remove_recursive(debugfsdir);
+	local->debugfs.rcdir = NULL;
+#endif
+	ieee80211_rate_control_ops_put(ref->ops);
+fail_ops:
+	kfree(ref);
+	return NULL;
+}
+
+static void rate_control_release(struct kref *kref)	/* kref release callback */
+{
+	struct rate_control_ref *ctrl_ref;
+
+	ctrl_ref = container_of(kref, struct rate_control_ref, kref);
+	ctrl_ref->ops->free(ctrl_ref->priv);	/* algorithm-private state */
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	debugfs_remove_recursive(ctrl_ref->local->debugfs.rcdir);
+	ctrl_ref->local->debugfs.rcdir = NULL;
+#endif
+
+	ieee80211_rate_control_ops_put(ctrl_ref->ops);	/* drop module ref */
+	kfree(ctrl_ref);
+}
+
+/* True for frames sent without ACK and for anything that is not a data frame */
+static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc)
+{
+	struct sk_buff *skb = txrc->skb;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+	if (info->flags & IEEE80211_TX_CTL_NO_ACK)
+		return true;
+	return !ieee80211_is_data(hdr->frame_control);
+}
+
+/* Move *idx up to the next higher rate index set in @basic_rates, if there
+ * is one. *idx is kept when it is negative or already a basic rate, when no
+ * basic rates are known, or when no higher basic rate exists in @sband. */
+static void rc_send_low_broadcast(s8 *idx, u32 basic_rates,
+				  struct ieee80211_supported_band *sband)
+{
+	u8 i;
+
+	if (basic_rates == 0 || *idx < 0)
+		return; /* basic rates unknown or no rate chosen: accept */
+	if (basic_rates & (1 << *idx))
+		return; /* selected rate is a basic rate */
+
+	/* valid indexes are 0 .. n_bitrates - 1, so stop before n_bitrates */
+	for (i = *idx + 1; i < sband->n_bitrates; i++) {
+		if (basic_rates & (1 << i)) {
+			*idx = i;
+			return;
+		}
+	}
+}
+
+bool rate_control_send_low(struct ieee80211_sta *sta,
+			   void *priv_sta,
+			   struct ieee80211_tx_rate_control *txrc)
+{	/* returns true when rates[0] was filled in here, false otherwise */
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+	struct ieee80211_supported_band *sband = txrc->sband;
+	int mcast_rate;	/* 1-based rate index; values <= 0 are ignored */
+
+	if (!sta || !priv_sta || rc_no_data_or_no_ack(txrc)) {
+		info->control.rates[0].idx = rate_lowest_index(txrc->sband, sta);
+		info->control.rates[0].count =
+			(info->flags & IEEE80211_TX_CTL_NO_ACK) ?
+			1 : txrc->hw->max_rate_tries;	/* no-ACK: single try */
+		if (!sta && txrc->bss) {
+			mcast_rate = txrc->bss_conf->mcast_rate[sband->band];
+			if (mcast_rate > 0) {
+				info->control.rates[0].idx = mcast_rate - 1;
+				return true;
+			}
+
+			rc_send_low_broadcast(&info->control.rates[0].idx,
+					      txrc->bss_conf->basic_rates,
+					      sband);
+		}
+		return true;
+	}
+	return false;	/* the calling algorithm has to pick the rate */
+}
+EXPORT_SYMBOL(rate_control_send_low);
+
+/*
+ * Force rate->idx onto a rate that @mask permits: the closest index at or
+ * below the current one if any bit there is set, otherwise the closest
+ * index above it that is still below @n_bitrates.
+ */
+static void rate_idx_match_mask(struct ieee80211_tx_rate *rate,
+				int n_bitrates, u32 mask)
+{
+	int idx;
+
+	/* Prefer the selected rate itself, then progressively lower ones. */
+	for (idx = rate->idx; idx >= 0; idx--)
+		if (mask & (1 << idx))
+			goto found;
+
+	/* Nothing at or below the selection is allowed; search upwards. */
+	for (idx = rate->idx + 1; idx < n_bitrates; idx++)
+		if (mask & (1 << idx))
+			goto found;
+
+	/*
+	 * No suitable rate exists. Sane TX rate mask configurations should
+	 * not get here; if supported rates and TX rate mask were configured
+	 * in an incompatible way anyway, let the frame go out at whatever
+	 * rate the rate control selected.
+	 */
+	return;
+
+found:
+	rate->idx = idx;
+}
+
+void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
+			   struct sta_info *sta,
+			   struct ieee80211_tx_rate_control *txrc)
+{
+	struct rate_control_ref *ref = sdata->local->rate_ctrl;
+	void *priv_sta = NULL;
+	struct ieee80211_sta *ista = NULL;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+	int i;
+	u32 mask;
+
+	if (sta) {	/* @sta may be NULL; the algorithm then gets NULLs */
+		ista = &sta->sta;
+		priv_sta = sta->rate_ctrl_priv;
+	}
+
+	for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {	/* mark all unused */
+		info->control.rates[i].idx = -1;
+		info->control.rates[i].flags = 0;
+		info->control.rates[i].count = 0;
+	}
+
+	if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+		return;	/* every slot stays unused (idx -1) in this case */
+
+	ref->ops->get_rate(ref->priv, ista, priv_sta, txrc);
+
+	/*
+	 * Try to enforce the rateidx mask the user wanted. Skip this if the
+	 * default mask (allow all rates) is used to save some processing for
+	 * the common case.
+	 */
+	mask = sdata->rc_rateidx_mask[info->band];
+	if (mask != (1 << txrc->sband->n_bitrates) - 1) {
+		if (sta) {
+			/* Filter out rates that the STA does not support */
+			mask &= sta->sta.supp_rates[info->band];
+		}
+		/*
+		 * Make sure the rate index selected for each TX rate is
+		 * included in the configured mask and change the rate indexes
+		 * if needed.
+		 */
+		for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+			/* Stop at the first unused slot */
+			if (info->control.rates[i].idx < 0)
+				break;
+			/* Rate masking supports only legacy rates for now */
+			if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS)
+				continue;
+			rate_idx_match_mask(&info->control.rates[i],
+					    txrc->sband->n_bitrates, mask);
+		}
+	}
+
+	BUG_ON(info->control.rates[0].idx < 0);	/* slot 0 must be filled in */
+}
+
+struct rate_control_ref *rate_control_get(struct rate_control_ref *ref)
+{
+	kref_get(&ref->kref);	/* take one more reference on @ref */
+	return ref;		/* handed back for the caller's convenience */
+}
+
+void rate_control_put(struct rate_control_ref *ref)
+{
+	kref_put(&ref->kref, rate_control_release);	/* last put frees @ref */
+}
+
+int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
+				 const char *name)
+{
+	struct rate_control_ref *ref, *old;
+
+	ASSERT_RTNL();	/* callers are expected to hold the RTNL */
+
+	if (local->open_count)	/* refused while open_count is non-zero */
+		return -EBUSY;
+
+	if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) {
+		if (WARN_ON(!local->ops->set_rts_threshold))
+			return -EINVAL;
+		return 0;	/* no algorithm is attached in this case */
+	}
+
+	ref = rate_control_alloc(name, local);
+	if (!ref) {
+		wiphy_warn(local->hw.wiphy,
+			   "Failed to select rate control algorithm\n");
+		return -ENOENT;	/* also covers allocation failures */
+	}
+
+	old = local->rate_ctrl;
+	local->rate_ctrl = ref;
+	if (old) {	/* replacing an algorithm: drop it, flush stations */
+		rate_control_put(old);
+		sta_info_flush(local, NULL);
+	}
+
+	wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n",
+		    ref->ops->name);
+
+	return 0;
+}
+
+/*
+ * Detach @local's rate control instance, if any, and drop its reference.
+ */
+void rate_control_deinitialize(struct ieee80211_local *local)
+{
+	struct rate_control_ref *old = local->rate_ctrl;
+
+	if (old) {
+		local->rate_ctrl = NULL;
+		rate_control_put(old);
+	}
+}
+
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
new file mode 100644
index 00000000..168427b0
--- /dev/null
+++ b/net/mac80211/rate.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef IEEE80211_RATE_H
+#define IEEE80211_RATE_H
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+
+struct rate_control_ref {
+	struct ieee80211_local *local;	/* hardware this instance belongs to */
+	struct rate_control_ops *ops;	/* callbacks of the chosen algorithm */
+	void *priv;			/* cookie returned by ops->alloc() */
+	struct kref kref;		/* see rate_control_get()/_put() */
+};
+
+void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
+			   struct sta_info *sta,
+			   struct ieee80211_tx_rate_control *txrc);
+struct rate_control_ref *rate_control_get(struct rate_control_ref *ref);
+void rate_control_put(struct rate_control_ref *ref);
+
+static inline void rate_control_tx_status(struct ieee80211_local *local,
+					  struct ieee80211_supported_band *sband,
+					  struct sta_info *sta,
+					  struct sk_buff *skb)
+{
+	struct rate_control_ref *ref = local->rate_ctrl;
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;	/* @sta must not be NULL */
+
+	if (!ref)	/* no rate control algorithm attached to @local */
+		return;
+
+	ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb);
+}
+
+
+static inline void rate_control_rate_init(struct sta_info *sta)
+{
+	struct ieee80211_local *local = sta->sdata->local;
+	struct rate_control_ref *ref = sta->rate_ctrl;	/* the station's ref */
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;
+	struct ieee80211_supported_band *sband;
+
+	if (!ref)	/* station has no rate control attached */
+		return;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	ref->ops->rate_init(ref->priv, sband, ista, priv_sta);
+}
+
+static inline void rate_control_rate_update(struct ieee80211_local *local,
+				    struct ieee80211_supported_band *sband,
+				    struct sta_info *sta, u32 changed,
+				    enum nl80211_channel_type oper_chan_type)
+{
+	struct rate_control_ref *ref = local->rate_ctrl;
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;	/* @sta must not be NULL */
+
+	if (ref && ref->ops->rate_update)	/* rate_update hook is optional */
+		ref->ops->rate_update(ref->priv, sband, ista,
+				      priv_sta, changed, oper_chan_type);
+}
+
+static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
+					   struct ieee80211_sta *sta,
+					   gfp_t gfp)
+{
+	return ref->ops->alloc_sta(ref->priv, sta, gfp);	/* @ref != NULL */
+}
+
+static inline void rate_control_free_sta(struct sta_info *sta)
+{
+	struct rate_control_ref *ref = sta->rate_ctrl;	/* not NULL-checked */
+	struct ieee80211_sta *ista = &sta->sta;
+	void *priv_sta = sta->rate_ctrl_priv;
+
+	ref->ops->free_sta(ref->priv, ista, priv_sta);
+}
+
+static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
+{
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct rate_control_ref *ref = sta->rate_ctrl;
+	if (ref && sta->debugfs.dir && ref->ops->add_sta_debugfs)	/* optional */
+		ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv,
+					  sta->debugfs.dir);
+#endif	/* compiles to an empty function without debugfs support */
+}
+
+static inline void rate_control_remove_sta_debugfs(struct sta_info *sta)
+{
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct rate_control_ref *ref = sta->rate_ctrl;
+	if (ref && ref->ops->remove_sta_debugfs)	/* hook is optional */
+		ref->ops->remove_sta_debugfs(ref->priv, sta->rate_ctrl_priv);
+#endif	/* compiles to an empty function without debugfs support */
+}
+
+/* Select and attach a rate control algorithm. If `name' is NULL, or cannot
+ * be found, the default algorithm is tried instead. */
+int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
+				 const char *name);
+void rate_control_deinitialize(struct ieee80211_local *local);
+
+
+/* Rate control algorithms: init/exit hooks, no-op stubs when not configured */
+#ifdef CONFIG_MAC80211_RC_PID
+extern int rc80211_pid_init(void);
+extern void rc80211_pid_exit(void);
+#else /* !CONFIG_MAC80211_RC_PID */
+static inline int rc80211_pid_init(void)
+{
+	return 0;
+}
+static inline void rc80211_pid_exit(void)
+{
+}
+#endif /* CONFIG_MAC80211_RC_PID */
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL
+extern int rc80211_minstrel_init(void);
+extern void rc80211_minstrel_exit(void);
+#else /* !CONFIG_MAC80211_RC_MINSTREL */
+static inline int rc80211_minstrel_init(void)
+{
+	return 0;
+}
+static inline void rc80211_minstrel_exit(void)
+{
+}
+#endif /* CONFIG_MAC80211_RC_MINSTREL */
+
+#ifdef CONFIG_MAC80211_RC_MINSTREL_HT
+extern int rc80211_minstrel_ht_init(void);
+extern void rc80211_minstrel_ht_exit(void);
+#else /* !CONFIG_MAC80211_RC_MINSTREL_HT */
+static inline int rc80211_minstrel_ht_init(void)
+{
+	return 0;
+}
+static inline void rc80211_minstrel_ht_exit(void)
+{
+}
+#endif /* CONFIG_MAC80211_RC_MINSTREL_HT */
+
+
+#endif /* IEEE80211_RATE_H */
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
new file mode 100644
index 00000000..8adac673
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on minstrel.c:
+ *   Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz>
+ *   Sponsored by Indranet Technologies Ltd
+ *
+ * Based on sample.c:
+ *   Copyright (c) 2005 John Bicket
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer,
+ *      without modification.
+ *   2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *      similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *      redistribution must be conditioned upon including a substantially
+ *      similar Disclaimer requirement for further binary redistribution.
+ *   3. Neither the names of the above-listed copyright holders nor the names
+ *      of any contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *   Alternatively, this software may be distributed under the terms of the
+ *   GNU General Public License ("GPL") version 2 as published by the Free
+ *   Software Foundation.
+ *
+ *   NO WARRANTY
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ *   AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ *   THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ *   OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ *   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGES.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "rc80211_minstrel.h"
+
+#define SAMPLE_COLUMNS	10	/* entries per row of the flat sample_table */
+#define SAMPLE_TBL(_mi, _idx, _col) \
+		(_mi)->sample_table[((_idx) * SAMPLE_COLUMNS) + (_col)]
+
+/* Map a mac80211 rate index to its slot in mi->r[]; negative if not found. */
+static inline int
+rix_to_ndx(struct minstrel_sta_info *mi, int rix)
+{
+	int ndx = rix;
+
+	while (ndx >= 0 && mi->r[ndx].rix != rix)
+		ndx--;
+	return ndx;
+}
+
+static void
+minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
+{
+	u32 max_tp = 0, index_max_tp = 0, index_max_tp2 = 0;
+	u32 max_prob = 0, index_max_prob = 0;
+	u32 usecs;
+	u32 p;
+	int i;
+
+	mi->stats_update = jiffies;
+	for (i = 0; i < mi->n_rates; i++) {	/* pass 1: fold in new counts */
+		struct minstrel_rate *mr = &mi->r[i];
+
+		usecs = mr->perfect_tx_time;
+		if (!usecs)
+			usecs = 1000000;	/* keeps the division below safe */
+
+		/* To avoid rounding issues, probabilities scale from 0 (0%)
+		 * to 18000 (100%) */
+		if (mr->attempts) {
+			p = (mr->success * 18000) / mr->attempts;
+			mr->succ_hist += mr->success;
+			mr->att_hist += mr->attempts;
+			mr->cur_prob = p;
+			p = ((p * (100 - mp->ewma_level)) + (mr->probability *
+				mp->ewma_level)) / 100;	/* ewma_level% history */
+			mr->probability = p;
+			mr->cur_tp = p * (1000000 / usecs);
+		}
+
+		mr->last_success = mr->success;
+		mr->last_attempts = mr->attempts;
+		mr->success = 0;	/* start a fresh measurement interval */
+		mr->attempts = 0;
+
+		/* Sample less often below the 10% chance of success.
+		 * Sample less often above the 95% chance of success. */
+		if ((mr->probability > 17100) || (mr->probability < 1800)) {
+			mr->adjusted_retry_count = mr->retry_count >> 1;
+			if (mr->adjusted_retry_count > 2)
+				mr->adjusted_retry_count = 2;
+			mr->sample_limit = 4;
+		} else {
+			mr->sample_limit = -1;	/* -1: no sampling limit */
+			mr->adjusted_retry_count = mr->retry_count;
+		}
+		if (!mr->adjusted_retry_count)
+			mr->adjusted_retry_count = 2;
+	}
+
+	for (i = 0; i < mi->n_rates; i++) {	/* pass 2: best tp and prob */
+		struct minstrel_rate *mr = &mi->r[i];
+		if (max_tp < mr->cur_tp) {
+			index_max_tp = i;
+			max_tp = mr->cur_tp;
+		}
+		if (max_prob < mr->probability) {
+			index_max_prob = i;
+			max_prob = mr->probability;
+		}
+	}
+
+	max_tp = 0;	/* reused to find the second-best throughput */
+	for (i = 0; i < mi->n_rates; i++) {
+		struct minstrel_rate *mr = &mi->r[i];
+
+		if (i == index_max_tp)
+			continue;
+
+		if (max_tp < mr->cur_tp) {
+			index_max_tp2 = i;
+			max_tp = mr->cur_tp;
+		}
+	}
+	mi->max_tp_rate = index_max_tp;
+	mi->max_tp_rate2 = index_max_tp2;
+	mi->max_prob_rate = index_max_prob;
+}
+
+static void
+minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband,
+                   struct ieee80211_sta *sta, void *priv_sta,
+		   struct sk_buff *skb)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_tx_rate *ar = info->status.rates;
+	int i, ndx;
+	int success;
+
+	success = !!(info->flags & IEEE80211_TX_STAT_ACK);	/* 0 or 1 */
+
+	for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+		if (ar[i].idx < 0)	/* first unused entry ends the chain */
+			break;
+
+		ndx = rix_to_ndx(mi, ar[i].idx);
+		if (ndx < 0)	/* rate is not in this station's table */
+			continue;
+
+		mi->r[ndx].attempts += ar[i].count;
+
+		/* NOTE(review): never true when every slot is in use */
+		if ((i != IEEE80211_TX_MAX_RATES - 1) && (ar[i + 1].idx < 0))
+			mi->r[ndx].success += success;
+	}
+
+	if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && (i >= 0))
+		mi->sample_count++;	/* note: i >= 0 always holds here */
+
+	if (mi->sample_deferred > 0)
+		mi->sample_deferred--;
+}
+
+
+static inline unsigned int
+minstrel_get_retry_count(struct minstrel_rate *mr,
+                         struct ieee80211_tx_info *info)
+{
+	unsigned int retry = mr->adjusted_retry_count;	/* default result */
+
+	if (info->control.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS)
+		retry = max(2U, min(mr->retry_count_rtscts, retry));	/* >= 2 */
+	else if (info->control.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)
+		retry = max(2U, min(mr->retry_count_cts, retry));	/* >= 2 */
+	return retry;
+}
+
+
+/* Return the next rate to sample and step the cursor through the table. */
+static int
+minstrel_get_next_sample(struct minstrel_sta_info *mi)
+{
+	unsigned int next = SAMPLE_TBL(mi, mi->sample_idx, mi->sample_column);
+
+	if ((int) ++mi->sample_idx <= mi->n_rates - 2)
+		return next;
+	/* ran off the end of this column: start at the top of the next one */
+	mi->sample_idx = 0;
+	if (++mi->sample_column >= SAMPLE_COLUMNS)
+		mi->sample_column = 0;
+	return next;
+}
+
+static void
+minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
+		  void *priv_sta, struct ieee80211_tx_rate_control *txrc)
+{
+	struct sk_buff *skb = txrc->skb;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct minstrel_sta_info *mi = priv_sta;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_tx_rate *ar = info->control.rates;
+	unsigned int ndx, sample_ndx = 0;
+	bool mrr;
+	bool sample_slower = false;
+	bool sample = false;
+	int i, delta;
+	int mrr_ndx[3];	/* indexes into mi->r[] for ar[1] .. ar[3] */
+	int sample_rate;
+
+	if (rate_control_send_low(sta, priv_sta, txrc))
+		return;	/* rates[0] was already filled in by the helper */
+
+	mrr = mp->has_mrr && !txrc->rts && !txrc->bss_conf->use_cts_prot;
+
+	if (time_after(jiffies, mi->stats_update + (mp->update_interval *
+			HZ) / 1000))	/* update_interval is in milliseconds */
+		minstrel_update_stats(mp, mi);
+
+	ndx = mi->max_tp_rate;	/* default choice: best throughput rate */
+
+	if (mrr)
+		sample_rate = mp->lookaround_rate_mrr;
+	else
+		sample_rate = mp->lookaround_rate;
+
+	mi->packet_count++;
+	delta = (mi->packet_count * sample_rate / 100) -
+			(mi->sample_count + mi->sample_deferred / 2);
+
+	/* delta > 0: sampling required */
+	if ((delta > 0) && (mrr || !mi->prev_sample)) {
+		struct minstrel_rate *msr;
+		if (mi->packet_count >= 10000) {	/* restart the counters */
+			mi->sample_deferred = 0;
+			mi->sample_count = 0;
+			mi->packet_count = 0;
+		} else if (delta > mi->n_rates * 2) {
+			/* With multi-rate retry, not every planned sample
+			 * attempt actually gets used, due to the way the retry
+			 * chain is set up - [max_tp,sample,prob,lowest] for
+			 * sample_rate < max_tp.
+			 *
+			 * If there's too much sampling backlog and the link
+			 * starts getting worse, minstrel would start bursting
+			 * out lots of sampling frames, which would result
+			 * in a large throughput loss. */
+			mi->sample_count += (delta - mi->n_rates * 2);
+		}
+
+		sample_ndx = minstrel_get_next_sample(mi);
+		msr = &mi->r[sample_ndx];
+		sample = true;
+		sample_slower = mrr && (msr->perfect_tx_time >
+			mi->r[ndx].perfect_tx_time);
+
+		if (!sample_slower) {
+			if (msr->sample_limit != 0) {	/* 0: limit used up */
+				ndx = sample_ndx;
+				mi->sample_count++;
+				if (msr->sample_limit > 0)	/* < 0: no limit */
+					msr->sample_limit--;
+			} else {
+				sample = false;
+			}
+		} else {
+			/* Only use IEEE80211_TX_CTL_RATE_CTRL_PROBE to mark
+			 * packets that have the sampling rate deferred to the
+			 * second MRR stage. Increase the sample counter only
+			 * if the deferred sample rate was actually used.
+			 * Use the sample_deferred counter to make sure that
+			 * the sampling is not done in large bursts */
+			info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+			mi->sample_deferred++;
+		}
+	}
+	mi->prev_sample = sample;
+
+	/* If we're not using MRR and the sampling rate already
+	 * has a probability of >95%, we shouldn't be attempting
+	 * to use it, as this only wastes precious airtime */
+	if (!mrr && sample && (mi->r[ndx].probability > 17100))
+		ndx = mi->max_tp_rate;
+
+	ar[0].idx = mi->r[ndx].rix;
+	ar[0].count = minstrel_get_retry_count(&mi->r[ndx], info);
+
+	if (!mrr) {	/* two-entry chain: chosen rate, then lowest rate */
+		if (!sample)
+			ar[0].count = mp->max_retry;
+		ar[1].idx = mi->lowest_rix;
+		ar[1].count = mp->max_retry;
+		return;
+	}
+
+	/* MRR setup */
+	if (sample) {
+		if (sample_slower)
+			mrr_ndx[0] = sample_ndx;	/* deferred sample */
+		else
+			mrr_ndx[0] = mi->max_tp_rate;	/* ar[0] is the sample */
+	} else {
+		mrr_ndx[0] = mi->max_tp_rate2;
+	}
+	mrr_ndx[1] = mi->max_prob_rate;
+	mrr_ndx[2] = 0;	/* last resort: first entry of mi->r[] */
+	for (i = 1; i < 4; i++) {	/* fills ar[1] .. ar[3] */
+		ar[i].idx = mi->r[mrr_ndx[i - 1]].rix;
+		ar[i].count = mi->r[mrr_ndx[i - 1]].adjusted_retry_count;
+	}
+}
+
+
+static void
+calc_rate_durations(struct minstrel_sta_info *mi, struct ieee80211_local *local,
+                    struct minstrel_rate *d, struct ieee80211_rate *rate)
+{	/* fills d->perfect_tx_time and d->ack_time; @mi is not used */
+	int erp = !!(rate->flags & IEEE80211_RATE_ERP_G);
+
+	d->perfect_tx_time = ieee80211_frame_duration(local, 1200,
+			rate->bitrate, erp, 1);	/* length argument: 1200 */
+	d->ack_time = ieee80211_frame_duration(local, 10,
+			rate->bitrate, erp, 1);	/* length argument: 10 */
+}
+
+/* Fill each sample table column with a random ordering of the rate indexes
+ * 1 .. n_rates - 1; index 0, the slowest rate, is never sampled. */
+static void
+init_sample_table(struct minstrel_sta_info *mi)
+{
+	unsigned int i, col, new_idx, n_srates;
+	u8 rnd[8];
+
+	mi->sample_column = 0;
+	mi->sample_idx = 0;
+	memset(mi->sample_table, 0, SAMPLE_COLUMNS * mi->n_rates);
+
+	/* nothing to sample; also keeps n_rates - 1 from wrapping around */
+	if (mi->n_rates < 2)
+		return;
+	n_srates = mi->n_rates - 1;
+	for (col = 0; col < SAMPLE_COLUMNS; col++) {
+		for (i = 0; i < n_srates; i++) {
+			get_random_bytes(rnd, sizeof(rnd));
+			new_idx = (i + rnd[i & 7]) % n_srates;
+			/* 0 marks a free slot: probe forward until one is hit */
+			while (SAMPLE_TBL(mi, new_idx, col) != 0)
+				new_idx = (new_idx + 1) % n_srates;
+			SAMPLE_TBL(mi, new_idx, col) = i + 1;
+		}
+	}
+}
+
+static void
+minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
+               struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_local *local = hw_to_local(mp->hw);
+	struct ieee80211_rate *ctl_rate;
+	unsigned int i, n = 0;	/* n counts the rates @sta supports */
+	unsigned int t_slot = 9; /* FIXME: get real slot time */
+
+	mi->lowest_rix = rate_lowest_index(sband, sta);
+	ctl_rate = &sband->bitrates[mi->lowest_rix];
+	mi->sp_ack_dur = ieee80211_frame_duration(local, 10, ctl_rate->bitrate,
+				!!(ctl_rate->flags & IEEE80211_RATE_ERP_G), 1);
+
+	for (i = 0; i < sband->n_bitrates; i++) {
+		struct minstrel_rate *mr = &mi->r[n];
+		unsigned int tx_time = 0, tx_time_cts = 0, tx_time_rtscts = 0;
+		unsigned int tx_time_single;
+		unsigned int cw = mp->cw_min;
+
+		if (!rate_supported(sta, sband->band, i))
+			continue;	/* unsupported rates get no mi->r[] slot */
+		n++;
+		memset(mr, 0, sizeof(*mr));
+
+		mr->rix = i;
+		mr->bitrate = sband->bitrates[i].bitrate / 5;
+		calc_rate_durations(mi, local, mr,
+				&sband->bitrates[i]);
+
+		/* calculate maximum number of retransmissions before
+		 * fallback (based on maximum segment size) */
+		mr->sample_limit = -1;
+		mr->retry_count = 1;
+		mr->retry_count_cts = 1;
+		mr->retry_count_rtscts = 1;
+		tx_time = mr->perfect_tx_time + mi->sp_ack_dur;
+		do {
+			/* add one retransmission */
+			tx_time_single = mr->ack_time + mr->perfect_tx_time;
+
+			/* contention window */
+			tx_time_single += (t_slot * cw) >> 1;
+			cw = min((cw << 1) | 1, mp->cw_max);	/* grow, capped */
+
+			tx_time += tx_time_single;
+			tx_time_cts += tx_time_single + mi->sp_ack_dur;
+			tx_time_rtscts += tx_time_single + 2 * mi->sp_ack_dur;
+			if ((tx_time_cts < mp->segment_size) &&
+				(mr->retry_count_cts < mp->max_retry))
+				mr->retry_count_cts++;
+			if ((tx_time_rtscts < mp->segment_size) &&
+				(mr->retry_count_rtscts < mp->max_retry))
+				mr->retry_count_rtscts++;
+		} while ((tx_time < mp->segment_size) &&
+				(++mr->retry_count < mp->max_retry));
+		mr->adjusted_retry_count = mr->retry_count;
+	}
+
+	for (i = n; i < sband->n_bitrates; i++) {	/* leftover slots */
+		struct minstrel_rate *mr = &mi->r[i];
+		mr->rix = -1;	/* -1 never matches a rate index lookup */
+	}
+
+	mi->n_rates = n;
+	mi->stats_update = jiffies;
+
+	init_sample_table(mi);
+}
+
+static void *
+minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
+{
+	struct ieee80211_supported_band *sband;
+	struct minstrel_sta_info *mi;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_hw *hw = mp->hw;
+	int max_rates = 0;	/* largest n_bitrates over all bands */
+	int i;
+
+	mi = kzalloc(sizeof(struct minstrel_sta_info), gfp);
+	if (!mi)
+		return NULL;
+
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+		sband = hw->wiphy->bands[i];
+		if (sband && sband->n_bitrates > max_rates)
+			max_rates = sband->n_bitrates;
+	}
+
+	mi->r = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp);
+	if (!mi->r)
+		goto error;
+
+	/* one byte per entry, SAMPLE_COLUMNS entries for every rate */
+	mi->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp);
+	if (!mi->sample_table)
+		goto error1;
+
+	mi->stats_update = jiffies;
+	return mi;	/* released again by minstrel_free_sta() */
+error1:
+	kfree(mi->r);
+error:
+	kfree(mi);
+	return NULL;
+}
+
+static void
+minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_sta_info *mi = priv_sta;	/* from minstrel_alloc_sta() */
+
+	kfree(mi->sample_table);
+	kfree(mi->r);
+	kfree(mi);
+}
+
+static void *
+minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
+{
+	struct minstrel_priv *mp;
+
+	mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
+	if (!mp)
+		return NULL;
+
+	/* contention window settings
+	 * Just an approximation. Using the per-queue values would complicate
+	 * the calculations and is probably unnecessary */
+	mp->cw_min = 15;
+	mp->cw_max = 1023;
+
+	/* number of packets (in %) to use for sampling other rates
+	 * sample less often for non-mrr packets, because the overhead
+	 * is much higher than with mrr */
+	mp->lookaround_rate = 5;
+	mp->lookaround_rate_mrr = 10;
+
+	/* moving average weight for EWMA */
+	mp->ewma_level = 75;	/* percent given to the previous average */
+
+	/* maximum time that the hw is allowed to stay in one MRR segment */
+	mp->segment_size = 6000;
+
+	if (hw->max_rate_tries > 0)
+		mp->max_retry = hw->max_rate_tries;
+	else
+		/* safe default, does not necessarily have to match hw properties */
+		mp->max_retry = 7;
+
+	if (hw->max_rates >= 4)	/* minstrel_get_rate() fills four entries */
+		mp->has_mrr = true;
+
+	mp->hw = hw;
+	mp->update_interval = 100;	/* milliseconds between stats updates */
+
+	return mp;
+}
+
+static void
+minstrel_free(void *priv)
+{
+	kfree(priv);	/* the minstrel_priv from minstrel_alloc() */
+}
+
+struct rate_control_ops mac80211_minstrel = {
+	.name = "minstrel",	/* name used for lookup at registration */
+	.tx_status = minstrel_tx_status,
+	.get_rate = minstrel_get_rate,
+	.rate_init = minstrel_rate_init,
+	.alloc = minstrel_alloc,
+	.free = minstrel_free,
+	.alloc_sta = minstrel_alloc_sta,
+	.free_sta = minstrel_free_sta,
+#ifdef CONFIG_MAC80211_DEBUGFS
+	.add_sta_debugfs = minstrel_add_sta_debugfs,
+	.remove_sta_debugfs = minstrel_remove_sta_debugfs,
+#endif	/* no .rate_update hook is provided */
+};
+
+int __init
+rc80211_minstrel_init(void)
+{
+	return ieee80211_rate_control_register(&mac80211_minstrel);	/* 0 or -errno */
+}
+
+void
+rc80211_minstrel_exit(void)
+{
+	ieee80211_rate_control_unregister(&mac80211_minstrel);	/* undo init */
+}
+
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
new file mode 100644
index 00000000..0f5a8337
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RC_MINSTREL_H
+#define __RC_MINSTREL_H
+
+/*
+ * Per-bitrate state for one station.  minstrel_sta_info.r points to an
+ * array of these; the debugfs code walks n_rates entries of it.
+ */
+struct minstrel_rate {
+	/* debugfs prints bitrate / 2, followed by ".5" when the low bit is set */
+	int bitrate;
+	/* presumably the index of this rate in the band's rate table - TODO confirm */
+	int rix;
+
+	unsigned int perfect_tx_time;
+	unsigned int ack_time;
+
+	int sample_limit;
+	unsigned int retry_count;
+	unsigned int retry_count_cts;
+	unsigned int retry_count_rtscts;
+	unsigned int adjusted_retry_count;
+
+	u32 success;
+	u32 attempts;
+	/* shown by debugfs as "this succ/attempt": last_success(last_attempts) */
+	u32 last_attempts;
+	u32 last_success;
+
+	/*
+	 * parts per thousand
+	 * NOTE(review): minstrel_stats_open() divides both values by 18 and
+	 * then prints value / 10 "." value % 10, which suggests the stored
+	 * scale is 18000 == 100% - confirm against rc80211_minstrel.c.
+	 */
+	u32 cur_prob;
+	u32 probability;
+
+	/* per-rate throughput */
+	u32 cur_tp;
+
+	/* running totals, shown by debugfs in the "success"/"attempts" columns */
+	u64 succ_hist;
+	u64 att_hist;
+};
+
+/*
+ * Per-station state of the legacy minstrel algorithm.  This is the
+ * priv_sta object handed to the debugfs hooks, and minstrel_ht embeds one
+ * (msp->legacy) for stations it falls back to legacy handling for.
+ */
+struct minstrel_sta_info {
+	unsigned long stats_update;
+	unsigned int sp_ack_dur;
+	unsigned int rate_avg;
+
+	unsigned int lowest_rix;
+
+	/* indexes into r[]; debugfs marks these rows with 'T', 't' and 'P' */
+	unsigned int max_tp_rate;
+	unsigned int max_tp_rate2;
+	unsigned int max_prob_rate;
+	/*
+	 * debugfs reports packet_count - sample_count as "ideal" and
+	 * sample_count as "lookaround"
+	 */
+	unsigned int packet_count;
+	unsigned int sample_count;
+	int sample_deferred;
+
+	unsigned int sample_idx;
+	unsigned int sample_column;
+
+	/* number of valid entries in r[] */
+	int n_rates;
+	struct minstrel_rate *r;
+	bool prev_sample;
+
+	/* sampling table */
+	u8 *sample_table;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	/* dentry of the "rc_stats" file created by minstrel_add_sta_debugfs() */
+	struct dentry *dbg_stats;
+#endif
+};
+
+/*
+ * Per-hardware minstrel configuration, allocated and given its defaults by
+ * minstrel_alloc() and shared by the legacy and HT variants.
+ */
+struct minstrel_priv {
+	struct ieee80211_hw *hw;
+	/* set when hw->max_rates >= 4 */
+	bool has_mrr;
+	/* approximate contention window bounds (defaults 15 / 1023) */
+	unsigned int cw_min;
+	unsigned int cw_max;
+	/* hw->max_rate_tries when that is > 0, otherwise 7 */
+	unsigned int max_retry;
+	/* moving average weight for EWMA (default 75) */
+	unsigned int ewma_level;
+	/* maximum time the hw may stay in one MRR segment (default 6000) */
+	unsigned int segment_size;
+	/*
+	 * default 100; minstrel_ht converts it to jiffies with "* HZ / 1000",
+	 * i.e. it is treated as milliseconds there
+	 */
+	unsigned int update_interval;
+	/* percentage of packets used for sampling, without / with MRR */
+	unsigned int lookaround_rate;
+	unsigned int lookaround_rate_mrr;
+};
+
+/*
+ * Text snapshot attached to an open "rc_stats" file (file->private_data):
+ * len bytes of formatted output stored in the trailing buf[].
+ */
+struct minstrel_debugfs_info {
+	size_t len;
+	char buf[];
+};
+
+/* ops table of the legacy minstrel algorithm, defined in rc80211_minstrel.c */
+extern struct rate_control_ops mac80211_minstrel;
+/* create / remove the per-station "rc_stats" debugfs file */
+void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
+void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
+
+/* debugfs: file operations backing "rc_stats" (rc80211_minstrel_debugfs.c) */
+int minstrel_stats_open(struct inode *inode, struct file *file);
+ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos);
+int minstrel_stats_release(struct inode *inode, struct file *file);
+
+#endif
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
new file mode 100644
index 00000000..a290ad23
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on minstrel.c:
+ *   Copyright (C) 2005-2007 Derek Smithies <derek@indranet.co.nz>
+ *   Sponsored by Indranet Technologies Ltd
+ *
+ * Based on sample.c:
+ *   Copyright (c) 2005 John Bicket
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer,
+ *      without modification.
+ *   2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *      similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *      redistribution must be conditioned upon including a substantially
+ *      similar Disclaimer requirement for further binary redistribution.
+ *   3. Neither the names of the above-listed copyright holders nor the names
+ *      of any contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ *   Alternatively, this software may be distributed under the terms of the
+ *   GNU General Public License ("GPL") version 2 as published by the Free
+ *   Software Foundation.
+ *
+ *   NO WARRANTY
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ *   AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ *   THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ *   OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ *   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ *   THE POSSIBILITY OF SUCH DAMAGES.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "rc80211_minstrel.h"
+
+/*
+ * open() handler of the per-station "rc_stats" debugfs file.
+ *
+ * Formats one table row per rate of the station stored in
+ * inode->i_private, plus a packet count summary, into a freshly allocated
+ * minstrel_debugfs_info and hangs it off file->private_data so that
+ * minstrel_stats_read() can serve it and minstrel_stats_release() can free
+ * it.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
+ */
+int
+minstrel_stats_open(struct inode *inode, struct file *file)
+{
+	struct minstrel_sta_info *sta_info = inode->i_private;
+	struct minstrel_debugfs_info *snap;
+	unsigned int idx;
+	char *out;
+
+	/* header plus 4096 bytes of text; the sprintf() calls are unbounded */
+	snap = kmalloc(sizeof(*snap) + 4096, GFP_KERNEL);
+	if (snap == NULL)
+		return -ENOMEM;
+
+	file->private_data = snap;
+	out = snap->buf;
+	out += sprintf(out, "rate     throughput  ewma prob   this prob  "
+			"this succ/attempt   success    attempts\n");
+
+	for (idx = 0; idx < sta_info->n_rates; idx++) {
+		struct minstrel_rate *rate = &sta_info->r[idx];
+		/* scale the stored values down to tenths for "%u.%1u" */
+		unsigned int tp = rate->cur_tp / ((18000 << 10) / 96);
+		unsigned int prob = rate->cur_prob / 18;
+		unsigned int eprob = rate->probability / 18;
+
+		/* three marker columns: best tp, second best tp, best prob */
+		*out++ = (idx == sta_info->max_tp_rate) ? 'T' : ' ';
+		*out++ = (idx == sta_info->max_tp_rate2) ? 't' : ' ';
+		*out++ = (idx == sta_info->max_prob_rate) ? 'P' : ' ';
+
+		out += sprintf(out, "%3u%s", rate->bitrate / 2,
+				(rate->bitrate & 1 ? ".5" : "  "));
+
+		out += sprintf(out, "  %6u.%1u   %6u.%1u   %6u.%1u        "
+				"%3u(%3u)   %8llu    %8llu\n",
+				tp / 10, tp % 10,
+				eprob / 10, eprob % 10,
+				prob / 10, prob % 10,
+				rate->last_success,
+				rate->last_attempts,
+				(unsigned long long)rate->succ_hist,
+				(unsigned long long)rate->att_hist);
+	}
+
+	out += sprintf(out, "\nTotal packet count::    ideal %d      "
+			"lookaround %d\n\n",
+			sta_info->packet_count - sta_info->sample_count,
+			sta_info->sample_count);
+	snap->len = out - snap->buf;
+
+	return 0;
+}
+
+/*
+ * read() handler of "rc_stats": copy from the snapshot built at open time.
+ * Returns the result of simple_read_from_buffer().
+ */
+ssize_t
+minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
+{
+	struct minstrel_debugfs_info *snap = file->private_data;
+
+	return simple_read_from_buffer(buf, len, ppos, snap->buf, snap->len);
+}
+
+/*
+ * release() handler of "rc_stats": free the snapshot that
+ * minstrel_stats_open() stored in file->private_data.  Always returns 0.
+ */
+int
+minstrel_stats_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+/* File operations of the per-station "rc_stats" file; there is no write hook. */
+static const struct file_operations minstrel_stat_fops = {
+	.owner = THIS_MODULE,
+	.open = minstrel_stats_open,
+	.read = minstrel_stats_read,
+	.release = minstrel_stats_release,
+	.llseek = default_llseek,
+};
+
+/*
+ * Create the read-only (S_IRUGO) "rc_stats" file for a station inside dir.
+ * The station state is passed as the file's private data, which
+ * minstrel_stats_open() reads back from inode->i_private.  The resulting
+ * dentry is kept in mi->dbg_stats for minstrel_remove_sta_debugfs().
+ * priv is not used.
+ */
+void
+minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+
+	mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi,
+			&minstrel_stat_fops);
+}
+
+/*
+ * Remove the "rc_stats" file whose dentry minstrel_add_sta_debugfs() saved
+ * in mi->dbg_stats.  priv is not used.
+ */
+void
+minstrel_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+	struct minstrel_sta_info *mi = priv_sta;
+
+	debugfs_remove(mi->dbg_stats);
+}
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
new file mode 100644
index 00000000..333b5118
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -0,0 +1,881 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "rc80211_minstrel.h"
+#include "rc80211_minstrel_ht.h"
+
+/* Packet size, in bytes, assumed for all duration estimates */
+#define AVG_PKT_SIZE	1200
+/* Number of columns in sample_table */
+#define SAMPLE_COLUMNS	10
+/* Weight (in percent) of the previous average in minstrel_ewma() */
+#define EWMA_LEVEL		75
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS (AVG_PKT_SIZE << 3)
+
+/* Number of symbols for a packet with (bps) bits per symbol */
+#define MCS_NSYMS(bps) ((MCS_NBITS + (bps) - 1) / (bps))
+
+/*
+ * Transmission time for a packet containing (syms) symbols.
+ * Every argument is parenthesized so that expression arguments (for
+ * example "a || b" as sgi) cannot be re-associated by the ?: operator.
+ */
+#define MCS_SYMBOL_TIME(sgi, syms)					\
+	((sgi) ?							\
+	  (((syms) * 18 + 4) / 5) :	/* syms * 3.6 us */		\
+	  ((syms) << 2)			/* syms * 4 us */		\
+	)
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps)))
+
+/*
+ * MCS rate information for an MCS group: stream count, the SGI / 40 MHz
+ * tx rate flags, and the per-rate duration for the eight rates of the
+ * group (bits per symbol per stream differ between 20 and 40 MHz).
+ */
+#define MCS_GROUP(_streams, _sgi, _ht40) {				\
+	.streams = (_streams),						\
+	.flags =							\
+		((_sgi) ? IEEE80211_TX_RC_SHORT_GI : 0) |		\
+		((_ht40) ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0),		\
+	.duration = {							\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 54 : 26),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 108 : 52),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 162 : 78),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 216 : 104),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 324 : 156),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 432 : 208),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 486 : 234),	\
+		MCS_DURATION(_streams, _sgi, (_ht40) ? 540 : 260)	\
+	}								\
+}
+
+/*
+ * To enable sufficiently targeted rate sampling, MCS rates are divided into
+ * groups, based on the number of streams and flags (HT40, SGI) that they
+ * use.
+ *
+ * The table is laid out as four (SGI, HT40) sections, each listing the
+ * stream counts in ascending order; 3-stream entries are compiled in only
+ * when MINSTREL_MAX_STREAMS >= 3.  minstrel_ht_update_caps() has a
+ * BUILD_BUG_ON that ties the array size to
+ * MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS.
+ */
+const struct mcs_group minstrel_mcs_groups[] = {
+	MCS_GROUP(1, 0, 0),
+	MCS_GROUP(2, 0, 0),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 0, 0),
+#endif
+
+	MCS_GROUP(1, 1, 0),
+	MCS_GROUP(2, 1, 0),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 1, 0),
+#endif
+
+	MCS_GROUP(1, 0, 1),
+	MCS_GROUP(2, 0, 1),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 0, 1),
+#endif
+
+	MCS_GROUP(1, 1, 1),
+	MCS_GROUP(2, 1, 1),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 1, 1),
+#endif
+};
+
+/*
+ * Sampling order, filled once by init_sample_table(): each column holds
+ * the rate offsets 0..MCS_GROUP_RATES-1 in a randomized order.
+ */
+static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES];
+
+/*
+ * Exponentially weighted moving average: blend the previous average with
+ * a new sample.  weight is the percentage (0..100) kept from the old
+ * value; the remainder is taken from the new one.  Integer division
+ * truncates the result.
+ */
+static int
+minstrel_ewma(int old, int new, int weight)
+{
+	int fresh_part = new * (100 - weight);
+	int history_part = old * weight;
+
+	return (fresh_part + history_part) / 100;
+}
+
+/*
+ * Map a mac80211 tx rate to its index in minstrel_mcs_groups[].
+ *
+ * The stream count is derived from the MCS index (MCS_GROUP_RATES rates
+ * per stream) and must match together with the SGI / 40 MHz flags.
+ * If no group matches, a warning is raised and group 0 is returned.
+ */
+static int
+minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate)
+{
+	const u32 mask = IEEE80211_TX_RC_SHORT_GI | IEEE80211_TX_RC_40_MHZ_WIDTH;
+	const u32 wanted_flags = rate->flags & mask;
+	const int streams = (rate->idx / MCS_GROUP_RATES) + 1;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) {
+		const struct mcs_group *grp = &minstrel_mcs_groups[i];
+
+		if (grp->streams == streams && grp->flags == wanted_flags)
+			return i;
+	}
+
+	WARN_ON(1);
+	return 0;
+}
+
+/*
+ * Return the statistics entry for a global rate index, where
+ * index = group * MCS_GROUP_RATES + offset within the group.
+ */
+static inline struct minstrel_rate_stats *
+minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index)
+{
+	int grp = index / MCS_GROUP_RATES;
+	int offset = index % MCS_GROUP_RATES;
+
+	return &mi->groups[grp].rates[offset];
+}
+
+
+/*
+ * Fold the counters of the finished sampling interval into a rate's
+ * long-term statistics and reset them for the next interval.
+ *
+ * With no attempts in the interval only sample_skipped is incremented.
+ * Otherwise cur_prob becomes success/attempts (MINSTREL_FRAC scaled) and
+ * probability is either seeded with it (first data for this rate) or
+ * blended in through the EWMA.  mp is not used.
+ */
+static void
+minstrel_calc_rate_ewma(struct minstrel_priv *mp, struct minstrel_rate_stats *mr)
+{
+	const unsigned int tried = mr->attempts;
+	const unsigned int acked = mr->success;
+
+	if (likely(tried == 0)) {
+		mr->sample_skipped++;
+	} else {
+		mr->cur_prob = MINSTREL_FRAC(acked, tried);
+		mr->probability = mr->att_hist ?
+			minstrel_ewma(mr->probability, mr->cur_prob, EWMA_LEVEL) :
+			mr->cur_prob;
+		mr->att_hist += tried;
+		mr->succ_hist += acked;
+		mr->sample_skipped = 0;
+	}
+
+	/* remember this interval's counters, then start a fresh interval */
+	mr->last_attempts = tried;
+	mr->last_success = acked;
+	mr->attempts = 0;
+	mr->success = 0;
+}
+
+/*
+ * Estimate the throughput of one rate and store it in cur_tp.
+ *
+ * The per-packet airtime is the group's duration for this rate plus the
+ * fixed overhead spread over the average A-MPDU length; the packet rate
+ * derived from it is scaled by the delivery probability.  Rates with a
+ * probability below 10% are given a throughput of 0.  mp is not used.
+ */
+static void
+minstrel_ht_calc_tp(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+                    int group, int rate)
+{
+	struct minstrel_rate_stats *mr = &mi->groups[group].rates[rate];
+	unsigned int usecs;
+
+	if (mr->probability >= MINSTREL_FRAC(1, 10)) {
+		usecs = mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len);
+		usecs += minstrel_mcs_groups[group].duration[rate];
+		mr->cur_tp = MINSTREL_TRUNC((1000000 / usecs) * mr->probability);
+	} else {
+		mr->cur_tp = 0;
+	}
+}
+
+/*
+ * Update rate statistics and select new primary rates
+ *
+ * Rules for rate selection:
+ *  - max_prob_rate must use only one stream, as a tradeoff between delivery
+ *    probability and throughput during strong fluctuations
+ *  - as long as the max prob rate has a probability of more than 3/4, pick
+ *    higher throughput rates, even if the probability is a bit lower
+ *
+ * Works in two passes: the first refreshes every supported rate and picks
+ * the per-group best rates, the second picks the station-wide best rates
+ * from the per-group winners.  Also resets the sampling budget and stamps
+ * mi->stats_update with the current jiffies.
+ */
+static void
+minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+	struct minstrel_mcs_group_data *mg;
+	struct minstrel_rate_stats *mr;
+	int cur_prob, cur_prob_tp, cur_tp, cur_tp2;
+	int group, i, index;
+
+	/* fold this interval's average A-MPDU length into the EWMA */
+	if (mi->ampdu_packets > 0) {
+		mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len,
+			MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL);
+		mi->ampdu_len = 0;
+		mi->ampdu_packets = 0;
+	}
+
+	mi->sample_slow = 0;
+	mi->sample_count = 0;
+	mi->max_tp_rate = 0;
+	mi->max_tp_rate2 = 0;
+	mi->max_prob_rate = 0;
+
+	/* pass 1: per-group statistics and per-group best rates */
+	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
+		cur_prob = 0;
+		cur_prob_tp = 0;
+		cur_tp = 0;
+		cur_tp2 = 0;
+
+		mg = &mi->groups[group];
+		if (!mg->supported)
+			continue;
+
+		mg->max_tp_rate = 0;
+		mg->max_tp_rate2 = 0;
+		mg->max_prob_rate = 0;
+		/* counts supported groups here; scaled after the loop */
+		mi->sample_count++;
+
+		for (i = 0; i < MCS_GROUP_RATES; i++) {
+			if (!(mg->supported & BIT(i)))
+				continue;
+
+			mr = &mg->rates[i];
+			/* force the retry counts to be recomputed on next use */
+			mr->retry_updated = false;
+			index = MCS_GROUP_RATES * group + i;
+			minstrel_calc_rate_ewma(mp, mr);
+			minstrel_ht_calc_tp(mp, mi, group, i);
+
+			if (!mr->cur_tp)
+				continue;
+
+			/* ignore the lowest rate of each single-stream group */
+			if (!i && minstrel_mcs_groups[group].streams == 1)
+				continue;
+
+			if ((mr->cur_tp > cur_prob_tp && mr->probability >
+			     MINSTREL_FRAC(3, 4)) || mr->probability > cur_prob) {
+				mg->max_prob_rate = index;
+				cur_prob = mr->probability;
+				cur_prob_tp = mr->cur_tp;
+			}
+
+			/*
+			 * New best throughput: after the swap, index and mr
+			 * refer to the previous best so that it can still be
+			 * considered for second place below.
+			 */
+			if (mr->cur_tp > cur_tp) {
+				swap(index, mg->max_tp_rate);
+				cur_tp = mr->cur_tp;
+				mr = minstrel_get_ratestats(mi, index);
+			}
+
+			/* only indexes below the current best are candidates */
+			if (index >= mg->max_tp_rate)
+				continue;
+
+			if (mr->cur_tp > cur_tp2) {
+				mg->max_tp_rate2 = index;
+				cur_tp2 = mr->cur_tp;
+			}
+		}
+	}
+
+	/*
+	 * try to sample up to half of the available rates during each interval
+	 * (sample_count holds the number of supported groups at this point,
+	 * and 4 is half of MCS_GROUP_RATES)
+	 */
+	mi->sample_count *= 4;
+
+	/* pass 2: station-wide best rates among the per-group winners */
+	cur_prob = 0;
+	cur_prob_tp = 0;
+	cur_tp = 0;
+	cur_tp2 = 0;
+	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
+		mg = &mi->groups[group];
+		if (!mg->supported)
+			continue;
+
+		/* max_prob_rate is only taken from single-stream groups */
+		mr = minstrel_get_ratestats(mi, mg->max_prob_rate);
+		if (cur_prob_tp < mr->cur_tp &&
+		    minstrel_mcs_groups[group].streams == 1) {
+			mi->max_prob_rate = mg->max_prob_rate;
+			/* note: cur_prob is not read again in this pass */
+			cur_prob = mr->cur_prob;
+			cur_prob_tp = mr->cur_tp;
+		}
+
+		mr = minstrel_get_ratestats(mi, mg->max_tp_rate);
+		if (cur_tp < mr->cur_tp) {
+			mi->max_tp_rate = mg->max_tp_rate;
+			cur_tp = mr->cur_tp;
+		}
+
+		mr = minstrel_get_ratestats(mi, mg->max_tp_rate2);
+		if (cur_tp2 < mr->cur_tp) {
+			mi->max_tp_rate2 = mg->max_tp_rate2;
+			cur_tp2 = mr->cur_tp;
+		}
+	}
+
+	mi->stats_update = jiffies;
+}
+
+/*
+ * A tx status entry is usable when it was tried at least once, has a
+ * non-negative rate index and describes an MCS rate.
+ */
+static bool
+minstrel_ht_txstat_valid(struct ieee80211_tx_rate *rate)
+{
+	return rate->count &&
+	       rate->idx >= 0 &&
+	       (rate->flags & IEEE80211_TX_RC_MCS);
+}
+
+/*
+ * Advance the sampling cursor: move on to the next supported group
+ * (wrapping around the group table) and step that group's position in
+ * sample_table, moving to the next column - and back to column 0 after
+ * the last one - once all rates of the current column have been used.
+ */
+static void
+minstrel_next_sample_idx(struct minstrel_ht_sta *mi)
+{
+	struct minstrel_mcs_group_data *mg;
+
+	do {
+		mi->sample_group++;
+		mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
+		mg = &mi->groups[mi->sample_group];
+	} while (!mg->supported);
+
+	mg->index++;
+	if (mg->index < MCS_GROUP_RATES)
+		return;
+
+	/* column exhausted: restart at its top in the next column */
+	mg->index = 0;
+	mg->column++;
+	if (mg->column >= ARRAY_SIZE(sample_table))
+		mg->column = 0;
+}
+
+/*
+ * Replace *idx with the best rate of a lower-numbered group.
+ *
+ * Walks the group table downwards from the group *idx belongs to and
+ * stops at the first supported group that does not use more streams than
+ * the original one.  *idx then becomes that group's max_tp_rate (primary)
+ * or max_tp_rate2 (!primary).  *idx is left untouched if no such group
+ * exists.
+ */
+static void
+minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx,
+			bool primary)
+{
+	const int orig_group = *idx / MCS_GROUP_RATES;
+	int group;
+
+	for (group = orig_group - 1; group >= 0; group--) {
+		if (!mi->groups[group].supported)
+			continue;
+
+		if (minstrel_mcs_groups[group].streams >
+		    minstrel_mcs_groups[orig_group].streams)
+			continue;
+
+		*idx = primary ? mi->groups[group].max_tp_rate :
+				 mi->groups[group].max_tp_rate2;
+		return;
+	}
+}
+
+/*
+ * Start a TX block-ack session (timeout argument 5000) for the TID of
+ * this frame if none exists yet.
+ *
+ * Nothing is done for frames that are not QoS data, for ETH_P_PAE frames,
+ * for TIDs that already have tid_tx state, or for frames queued on
+ * IEEE80211_AC_VO.  mp is not used.
+ */
+static void
+minstrel_aggr_check(struct minstrel_priv *mp, struct ieee80211_sta *pubsta, struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr;
+	struct sta_info *sta;
+	u16 tid;
+
+	hdr = (struct ieee80211_hdr *) skb->data;
+	if (unlikely(!ieee80211_is_data_qos(hdr->frame_control)))
+		return;
+
+	if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE)))
+		return;
+
+	/* the QoS control field may only be read once the frame is QoS data */
+	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	sta = container_of(pubsta, struct sta_info, sta);
+	if (likely(sta->ampdu_mlme.tid_tx[tid]))
+		return;
+
+	if (skb_get_queue_mapping(skb) != IEEE80211_AC_VO)
+		ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+}
+
+/*
+ * .tx_status hook: account the outcome of a transmitted frame.
+ *
+ * Legacy (non-HT) stations are forwarded to mac80211_minstrel.tx_status
+ * with the embedded legacy state.  For HT stations the function
+ *  - ignores aggregated subframes that carry no status of their own,
+ *  - treats a non-aggregated frame as an A-MPDU of length 1,
+ *  - refills the sampling trigger when it has run out,
+ *  - adds attempts to every rate that was tried and the acked frame count
+ *    to the last rate that was tried,
+ *  - downgrades max_tp_rate / max_tp_rate2 when one of them fell below a
+ *    20% success ratio after more than 30 attempts,
+ *  - refreshes the statistics (and checks whether aggregation should be
+ *    started) once half of mp->update_interval has passed.
+ */
+static void
+minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
+                      struct ieee80211_sta *sta, void *priv_sta,
+                      struct sk_buff *skb)
+{
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_tx_rate *ar = info->status.rates;
+	struct minstrel_rate_stats *rate, *rate2;
+	struct minstrel_priv *mp = priv;
+	bool last = false;
+	int group;
+	int i = 0;
+
+	if (!msp->is_ht)
+		return mac80211_minstrel.tx_status(priv, sband, sta, &msp->legacy, skb);
+
+	/* This packet was aggregated but doesn't carry status info */
+	if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
+	    !(info->flags & IEEE80211_TX_STAT_AMPDU))
+		return;
+
+	/* report a single frame as an A-MPDU of length 1 */
+	if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) {
+		info->status.ampdu_ack_len =
+			(info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0);
+		info->status.ampdu_len = 1;
+	}
+
+	mi->ampdu_packets++;
+	mi->ampdu_len += info->status.ampdu_len;
+
+	/* arm the next round of sampling while budget is left */
+	if (!mi->sample_wait && !mi->sample_tries && mi->sample_count > 0) {
+		mi->sample_wait = 16 + 2 * MINSTREL_TRUNC(mi->avg_ampdu_len);
+		mi->sample_tries = 2;
+		mi->sample_count--;
+	}
+
+	if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
+		mi->sample_packets += info->status.ampdu_len;
+
+	/*
+	 * Walk the rate entries that were used.  "last" is computed first so
+	 * that ar[i + 1] is never read past the end of the array.
+	 */
+	for (i = 0; !last; i++) {
+		last = (i == IEEE80211_TX_MAX_RATES - 1) ||
+		       !minstrel_ht_txstat_valid(&ar[i + 1]);
+
+		if (!minstrel_ht_txstat_valid(&ar[i]))
+			break;
+
+		group = minstrel_ht_get_group_idx(&ar[i]);
+		/* offset inside the group; was a bare "% 8" */
+		rate = &mi->groups[group].rates[ar[i].idx % MCS_GROUP_RATES];
+
+		/* successes are credited to the final rate only */
+		if (last)
+			rate->success += info->status.ampdu_ack_len;
+
+		rate->attempts += ar[i].count * info->status.ampdu_len;
+	}
+
+	/*
+	 * check for sudden death of spatial multiplexing,
+	 * downgrade to a lower number of streams if necessary.
+	 */
+	rate = minstrel_get_ratestats(mi, mi->max_tp_rate);
+	if (rate->attempts > 30 &&
+	    MINSTREL_FRAC(rate->success, rate->attempts) <
+	    MINSTREL_FRAC(20, 100))
+		minstrel_downgrade_rate(mi, &mi->max_tp_rate, true);
+
+	rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate2);
+	if (rate2->attempts > 30 &&
+	    MINSTREL_FRAC(rate2->success, rate2->attempts) <
+	    MINSTREL_FRAC(20, 100))
+		minstrel_downgrade_rate(mi, &mi->max_tp_rate2, false);
+
+	if (time_after(jiffies, mi->stats_update + (mp->update_interval / 2 * HZ) / 1000)) {
+		minstrel_ht_update_stats(mp, mi);
+		minstrel_aggr_check(mp, sta, skb);
+	}
+}
+
+/*
+ * Compute how often a rate may be retried (retry_count, and
+ * retry_count_rtscts for the RTS/CTS case) so that the total estimated
+ * airtime stays below mp->segment_size.
+ *
+ * Rates with a probability below 10% get a single try and return early;
+ * retry_updated is not set in that case.  All other rates start at two
+ * tries and gain one per loop iteration while the accumulated time
+ * (contention + overhead + data for each try) still fits; retry_count is
+ * additionally capped by mp->max_retry.
+ */
+static void
+minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+                         int index)
+{
+	struct minstrel_rate_stats *mr;
+	const struct mcs_group *group;
+	unsigned int tx_time, tx_time_rtscts, tx_time_data;
+	unsigned int cw = mp->cw_min;
+	unsigned int ctime = 0;
+	unsigned int t_slot = 9; /* FIXME */
+	unsigned int ampdu_len = MINSTREL_TRUNC(mi->avg_ampdu_len);
+
+	mr = minstrel_get_ratestats(mi, index);
+	if (mr->probability < MINSTREL_FRAC(1, 10)) {
+		mr->retry_count = 1;
+		mr->retry_count_rtscts = 1;
+		return;
+	}
+
+	mr->retry_count = 2;
+	mr->retry_count_rtscts = 2;
+	mr->retry_updated = true;
+
+	/* data airtime of one try: per-packet duration times A-MPDU length */
+	group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+	tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len;
+
+	/* Contention time for first 2 tries */
+	/* (each step takes half the window in slots, then grows the window) */
+	ctime = (t_slot * cw) >> 1;
+	cw = min((cw << 1) | 1, mp->cw_max);
+	ctime += (t_slot * cw) >> 1;
+	cw = min((cw << 1) | 1, mp->cw_max);
+
+	/* Total TX time for data and Contention after first 2 tries */
+	tx_time = ctime + 2 * (mi->overhead + tx_time_data);
+	tx_time_rtscts = ctime + 2 * (mi->overhead_rtscts + tx_time_data);
+
+	/* See how many more tries we can fit inside segment size */
+	do {
+		/* Contention time for this try */
+		ctime = (t_slot * cw) >> 1;
+		cw = min((cw << 1) | 1, mp->cw_max);
+
+		/* Total TX time after this try */
+		tx_time += ctime + mi->overhead + tx_time_data;
+		tx_time_rtscts += ctime + mi->overhead_rtscts + tx_time_data;
+
+		if (tx_time_rtscts < mp->segment_size)
+			mr->retry_count_rtscts++;
+	} while ((tx_time < mp->segment_size) &&
+	         (++mr->retry_count < mp->max_retry));
+}
+
+
+/*
+ * Fill one entry of the tx rate list for the given global rate index.
+ *
+ * The try count is 1 for a sampling attempt, 2 for rates below 20%
+ * probability, and otherwise the cached retry count (RTS/CTS variant if
+ * rtscts is set); the cached counts are recomputed first when they are
+ * stale.  The flags combine the MCS flag, the group's flags and
+ * optionally RTS/CTS.  txrc is not used.
+ */
+static void
+minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+                     struct ieee80211_tx_rate *rate, int index,
+                     struct ieee80211_tx_rate_control *txrc,
+                     bool sample, bool rtscts)
+{
+	const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+	struct minstrel_rate_stats *mr = minstrel_get_ratestats(mi, index);
+	unsigned int tries;
+	u32 flags;
+
+	if (!mr->retry_updated)
+		minstrel_calc_retransmit(mp, mi, index);
+
+	if (sample)
+		tries = 1;
+	else if (mr->probability < MINSTREL_FRAC(20, 100))
+		tries = 2;
+	else
+		tries = rtscts ? mr->retry_count_rtscts : mr->retry_count;
+
+	flags = IEEE80211_TX_RC_MCS | group->flags;
+	if (rtscts)
+		flags |= IEEE80211_TX_RC_USE_RTS_CTS;
+
+	rate->count = tries;
+	rate->flags = flags;
+	/* MCS index: offset in the group plus 8 rates per extra stream */
+	rate->idx = index % MCS_GROUP_RATES + (group->streams - 1) * MCS_GROUP_RATES;
+}
+
+/* Look up the precomputed packet duration for a global rate index. */
+static inline int
+minstrel_get_duration(int index)
+{
+	int grp = index / MCS_GROUP_RATES;
+	int offset = index % MCS_GROUP_RATES;
+
+	return minstrel_mcs_groups[grp].duration[offset];
+}
+
+/*
+ * Decide whether the next frame should probe another rate.
+ *
+ * Returns the global index of the rate to sample, or -1 when no sampling
+ * should take place.  While sample_wait is non-zero it is only counted
+ * down.  Otherwise one sample try is consumed, the candidate is read from
+ * sample_table at the current group's cursor and the cursor is advanced -
+ * even if the candidate is rejected afterwards.
+ */
+static int
+minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+	struct minstrel_mcs_group_data *mg;
+	struct minstrel_rate_stats *mr;
+	int candidate;
+
+	if (mi->sample_wait > 0) {
+		mi->sample_wait--;
+		return -1;
+	}
+
+	if (!mi->sample_tries)
+		return -1;
+	mi->sample_tries--;
+
+	mg = &mi->groups[mi->sample_group];
+	candidate = sample_table[mg->column][mg->index];
+	mr = &mg->rates[candidate];
+	candidate += mi->sample_group * MCS_GROUP_RATES;
+	minstrel_next_sample_idx(mi);
+
+	/*
+	 * When not using MRR, do not sample if the probability is already
+	 * higher than 95% to avoid wasting airtime
+	 */
+	if (!mp->has_mrr && (mr->probability > MINSTREL_FRAC(95, 100)))
+		return -1;
+
+	/* rates that are not slower than the current best are always fine */
+	if (minstrel_get_duration(candidate) <=
+	    minstrel_get_duration(mi->max_tp_rate))
+		return candidate;
+
+	/*
+	 * Make sure that lower rates get sampled only occasionally,
+	 * if the link is working perfectly.
+	 */
+	if (mr->sample_skipped < 20)
+		return -1;
+
+	if (mi->sample_slow++ > 2)
+		return -1;
+
+	return candidate;
+}
+
+/*
+ * .get_rate hook: build the tx rate list for the frame in txrc->skb.
+ *
+ * Returns early when rate_control_send_low() has handled the frame, and
+ * forwards legacy (non-HT) stations to mac80211_minstrel.get_rate.  For
+ * HT stations the first entry is either a sampling rate (the frame is
+ * then flagged as a probe) or max_tp_rate; the remaining entries depend
+ * on how many rates the hardware accepts (mp->hw->max_rates).  The entry
+ * after the last one used is terminated with count 0 / idx -1.
+ */
+static void
+minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
+                     struct ieee80211_tx_rate_control *txrc)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+	/* NOTE(review): uses status.rates on the tx path; presumably this
+	 * aliases the control rate array - confirm in struct ieee80211_tx_info */
+	struct ieee80211_tx_rate *ar = info->status.rates;
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct minstrel_priv *mp = priv;
+	int sample_idx;
+	bool sample = false;
+
+	if (rate_control_send_low(sta, priv_sta, txrc))
+		return;
+
+	if (!msp->is_ht)
+		return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc);
+
+	/* per-station flags (STBC / LDPC) set up in minstrel_ht_update_caps() */
+	info->flags |= mi->tx_flags;
+	sample_idx = minstrel_get_sample_rate(mp, mi);
+	if (sample_idx >= 0) {
+		sample = true;
+		minstrel_ht_set_rate(mp, mi, &ar[0], sample_idx,
+			txrc, true, false);
+		info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+	} else {
+		minstrel_ht_set_rate(mp, mi, &ar[0], mi->max_tp_rate,
+			txrc, false, false);
+	}
+
+	if (mp->hw->max_rates >= 3) {
+		/*
+		 * At least 3 tx rates supported, use
+		 * sample_rate -> max_tp_rate -> max_prob_rate for sampling and
+		 * max_tp_rate -> max_tp_rate2 -> max_prob_rate by default.
+		 */
+		if (sample_idx >= 0)
+			minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate,
+				txrc, false, false);
+		else
+			minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate2,
+				txrc, false, true);
+
+		/* the fallback rate uses RTS/CTS unless this is a sampling frame */
+		minstrel_ht_set_rate(mp, mi, &ar[2], mi->max_prob_rate,
+				     txrc, false, !sample);
+
+		ar[3].count = 0;
+		ar[3].idx = -1;
+	} else if (mp->hw->max_rates == 2) {
+		/*
+		 * Only 2 tx rates supported, use
+		 * sample_rate -> max_prob_rate for sampling and
+		 * max_tp_rate -> max_prob_rate by default.
+		 */
+		minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_prob_rate,
+				     txrc, false, !sample);
+
+		ar[2].count = 0;
+		ar[2].idx = -1;
+	} else {
+		/* Not using MRR, only use the first rate */
+		ar[1].count = 0;
+		ar[1].idx = -1;
+	}
+
+	mi->total_packets++;
+
+	/* wraparound */
+	/* (restart both packet counters once total_packets hits all-ones) */
+	if (mi->total_packets == ~0) {
+		mi->total_packets = 0;
+		mi->sample_packets = 0;
+	}
+}
+
+/*
+ * (Re)initialize the per-station state from the station's HT capabilities
+ * and the operating channel type.
+ *
+ * Stations without HT support, or for which no MCS group ends up being
+ * supported, are switched to legacy mode: the embedded legacy state is
+ * cleared, pointed at the preallocated ratelist / sample_table buffers
+ * and handed to mac80211_minstrel.rate_init.  For HT stations all
+ * statistics are reset (the HT state is zeroed) and the set of supported
+ * rates per MCS group is derived from the capability bits and the RX MCS
+ * mask.
+ */
+static void
+minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
+                        struct ieee80211_sta *sta, void *priv_sta,
+			enum nl80211_channel_type oper_chan_type)
+{
+	struct minstrel_priv *mp = priv;
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
+	struct ieee80211_local *local = hw_to_local(mp->hw);
+	u16 sta_cap = sta->ht_cap.cap;
+	int n_supported = 0;
+	int ack_dur;
+	int stbc;
+	int i;
+
+	/* fall back to the old minstrel for legacy stations */
+	if (!sta->ht_cap.ht_supported)
+		goto use_legacy;
+
+	/* the group table must cover every (streams, SGI/HT40) combination */
+	BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) !=
+		MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS);
+
+	msp->is_ht = true;
+	memset(mi, 0, sizeof(*mi));
+	mi->stats_update = jiffies;
+
+	/* fixed per-frame overhead, without and with an RTS/CTS exchange */
+	ack_dur = ieee80211_frame_duration(local, 10, 60, 1, 1);
+	mi->overhead = ieee80211_frame_duration(local, 0, 60, 1, 1) + ack_dur;
+	mi->overhead_rtscts = mi->overhead + 2 * ack_dur;
+
+	/* start from an average A-MPDU length of 1.0 */
+	mi->avg_ampdu_len = MINSTREL_FRAC(1, 1);
+
+	/* When using MRR, sample more on the first attempt, without delay */
+	if (mp->has_mrr) {
+		mi->sample_count = 16;
+		mi->sample_wait = 0;
+	} else {
+		mi->sample_count = 8;
+		mi->sample_wait = 8;
+	}
+	mi->sample_tries = 4;
+
+	/* move the station's RX STBC capability field into the tx flags */
+	stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >>
+		IEEE80211_HT_CAP_RX_STBC_SHIFT;
+	mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
+
+	if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING)
+		mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+
+	/* not on an HT40 channel: treat the station as 20 MHz only */
+	if (oper_chan_type != NL80211_CHAN_HT40MINUS &&
+	    oper_chan_type != NL80211_CHAN_HT40PLUS)
+		sta_cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+
+	for (i = 0; i < ARRAY_SIZE(mi->groups); i++) {
+		/* capability bits the station needs for this group */
+		u16 req = 0;
+
+		mi->groups[i].supported = 0;
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) {
+			if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+				req |= IEEE80211_HT_CAP_SGI_40;
+			else
+				req |= IEEE80211_HT_CAP_SGI_20;
+		}
+
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+			req |= IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+
+		if ((sta_cap & req) != req)
+			continue;
+
+		/* one rx_mask byte per stream count: a bit per rate of the group */
+		mi->groups[i].supported =
+			mcs->rx_mask[minstrel_mcs_groups[i].streams - 1];
+
+		if (mi->groups[i].supported)
+			n_supported++;
+	}
+
+	if (!n_supported)
+		goto use_legacy;
+
+	return;
+
+use_legacy:
+	msp->is_ht = false;
+	memset(&msp->legacy, 0, sizeof(msp->legacy));
+	msp->legacy.r = msp->ratelist;
+	msp->legacy.sample_table = msp->sample_table;
+	return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy);
+}
+
+/*
+ * .rate_init hook: initialize the station state using the channel type
+ * currently configured on the hardware (mp->hw->conf.channel_type).
+ */
+static void
+minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband,
+                      struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_priv *mp = priv;
+
+	minstrel_ht_update_caps(priv, sband, sta, priv_sta, mp->hw->conf.channel_type);
+}
+
+/*
+ * .rate_update hook: redo the capability setup for the channel type given
+ * by the caller.  The "changed" mask is not looked at.
+ */
+static void
+minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband,
+                        struct ieee80211_sta *sta, void *priv_sta,
+                        u32 changed, enum nl80211_channel_type oper_chan_type)
+{
+	minstrel_ht_update_caps(priv, sband, sta, priv_sta, oper_chan_type);
+}
+
+/*
+ * .alloc_sta hook: allocate the per-station private data.
+ *
+ * Besides the minstrel_ht_sta_priv object itself, two buffers used by the
+ * legacy fallback are allocated up front, sized for the band with the
+ * most bitrates: the rate list and the sampling table.
+ *
+ * Returns the new object, or NULL if any allocation fails (everything
+ * allocated so far is released again).
+ */
+static void *
+minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
+{
+	struct ieee80211_supported_band *sband;
+	struct minstrel_ht_sta_priv *msp;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_hw *hw = mp->hw;
+	int max_rates = 0;
+	int i;
+
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+		sband = hw->wiphy->bands[i];
+		if (sband && sband->n_bitrates > max_rates)
+			max_rates = sband->n_bitrates;
+	}
+
+	/*
+	 * Size the allocation from the pointer's own type.  The previous
+	 * sizeof(struct minstrel_ht_sta) covered only the embedded ->ht
+	 * member, not the whole minstrel_ht_sta_priv (which also holds
+	 * ->legacy, ->ratelist, ->sample_table and ->is_ht), so writes to
+	 * the remaining members could land outside the allocation.
+	 */
+	msp = kzalloc(sizeof(*msp), gfp);
+	if (!msp)
+		return NULL;
+
+	msp->ratelist = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp);
+	if (!msp->ratelist)
+		goto error;
+
+	msp->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp);
+	if (!msp->sample_table)
+		goto error1;
+
+	return msp;
+
+error1:
+	kfree(msp->ratelist);
+error:
+	kfree(msp);
+	return NULL;
+}
+
+/*
+ * .free_sta hook: release the per-station object together with the two
+ * buffers allocated alongside it in minstrel_ht_alloc_sta().
+ */
+static void
+minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+
+	kfree(msp->sample_table);
+	kfree(msp->ratelist);
+	kfree(msp);
+}
+
+/*
+ * .alloc hook: minstrel_ht shares the per-hardware state with legacy
+ * minstrel, so this simply delegates to mac80211_minstrel.alloc.
+ */
+static void *
+minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
+{
+	return mac80211_minstrel.alloc(hw, debugfsdir);
+}
+
+/* .free hook: delegate to mac80211_minstrel.free, mirroring minstrel_ht_alloc(). */
+static void
+minstrel_ht_free(void *priv)
+{
+	mac80211_minstrel.free(priv);
+}
+
+/*
+ * Rate control operations for the "minstrel_ht" algorithm, registered by
+ * rc80211_minstrel_ht_init().  The per-station debugfs hooks are only set
+ * when CONFIG_MAC80211_DEBUGFS is enabled.
+ */
+static struct rate_control_ops mac80211_minstrel_ht = {
+	.name = "minstrel_ht",
+	.tx_status = minstrel_ht_tx_status,
+	.get_rate = minstrel_ht_get_rate,
+	.rate_init = minstrel_ht_rate_init,
+	.rate_update = minstrel_ht_rate_update,
+	.alloc_sta = minstrel_ht_alloc_sta,
+	.free_sta = minstrel_ht_free_sta,
+	.alloc = minstrel_ht_alloc,
+	.free = minstrel_ht_free,
+#ifdef CONFIG_MAC80211_DEBUGFS
+	.add_sta_debugfs = minstrel_ht_add_sta_debugfs,
+	.remove_sta_debugfs = minstrel_ht_remove_sta_debugfs,
+#endif
+};
+
+
+/*
+ * Fill sample_table so that every column holds a random permutation of
+ * the rate offsets 0..MCS_GROUP_RATES-1.
+ *
+ * All slots start out as 0xff ("free").  Each rate i is dropped at a
+ * random starting slot and moved forward, wrapping around, to the first
+ * slot that is still free.  A column has exactly as many slots as there
+ * are rates, so the search always terminates.
+ */
+static void
+init_sample_table(void)
+{
+	int col, i, new_idx;
+	u8 rnd[MCS_GROUP_RATES];
+
+	memset(sample_table, 0xff, sizeof(sample_table));
+	for (col = 0; col < SAMPLE_COLUMNS; col++) {
+		/*
+		 * One random byte per rate: a single fetch per column is
+		 * enough.  Refilling the whole array for every rate used
+		 * only one of the fresh bytes each time.
+		 */
+		get_random_bytes(rnd, sizeof(rnd));
+
+		for (i = 0; i < MCS_GROUP_RATES; i++) {
+			new_idx = (i + rnd[i]) % MCS_GROUP_RATES;
+
+			while (sample_table[col][new_idx] != 0xff)
+				new_idx = (new_idx + 1) % MCS_GROUP_RATES;
+
+			sample_table[col][new_idx] = i;
+		}
+	}
+}
+
+/*
+ * Build the shared sampling table, then register the minstrel_ht ops
+ * table.  Returns whatever ieee80211_rate_control_register() returns.
+ */
+int __init
+rc80211_minstrel_ht_init(void)
+{
+	init_sample_table();
+	return ieee80211_rate_control_register(&mac80211_minstrel_ht);
+}
+
+/* Unregister the minstrel_ht ops table; counterpart of rc80211_minstrel_ht_init(). */
+void
+rc80211_minstrel_ht_exit(void)
+{
+	ieee80211_rate_control_unregister(&mac80211_minstrel_ht);
+}
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
new file mode 100644
index 00000000..462d2b22
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RC_MINSTREL_HT_H
+#define __RC_MINSTREL_HT_H
+
+/*
+ * The number of streams can be changed to 2 to reduce code
+ * size and memory footprint.
+ */
+#define MINSTREL_MAX_STREAMS	3
+#define MINSTREL_STREAM_GROUPS	4	/* groups[] has MAX_STREAMS * STREAM_GROUPS entries */
+
+/* scaled fraction values: fixed point with MINSTREL_SCALE fraction bits */
+#define MINSTREL_SCALE	16
+#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / (div))
+#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
+
+#define MCS_GROUP_RATES	8	/* rates per MCS group: sizes duration[] and rates[] */
+
+struct mcs_group {
+	u32 flags;		/* IEEE80211_TX_RC_* bits, e.g. 40 MHz width, short GI */
+	unsigned int streams;	/* debugfs shows MCS (streams - 1) * MCS_GROUP_RATES + rate */
+	unsigned int duration[MCS_GROUP_RATES];	/* per-rate duration; unit not visible here */
+};
+
+extern const struct mcs_group minstrel_mcs_groups[];
+
+struct minstrel_rate_stats {
+	/* current / last sampling period attempts/success counters */
+	unsigned int attempts, last_attempts;
+	unsigned int success, last_success;
+
+	/* total attempts/success counters (debugfs "attempts" / "success") */
+	u64 att_hist, succ_hist;
+
+	/* current throughput (debugfs prints cur_tp / 100 with one decimal) */
+	unsigned int cur_tp;
+
+	/* delivery probabilities, MINSTREL_SCALE fixed point: current, EWMA */
+	unsigned int cur_prob, probability;
+
+	/* maximum retry counts */
+	unsigned int retry_count;
+	unsigned int retry_count_rtscts;
+
+	bool retry_updated;
+	u8 sample_skipped;
+};
+
+struct minstrel_mcs_group_data {
+	u8 index;	/* NOTE(review): presumably a sample table row - confirm in the .c */
+	u8 column;	/* NOTE(review): presumably a sample table column - confirm */
+
+	/* bitfield of supported MCS rates of this group (bit j = rate j) */
+	u8 supported;
+
+	/* selected primary rates */
+	unsigned int max_tp_rate;
+	unsigned int max_tp_rate2;
+	unsigned int max_prob_rate;
+
+	/* MCS rate statistics, one entry per rate of the group */
+	struct minstrel_rate_stats rates[MCS_GROUP_RATES];
+};
+
+struct minstrel_ht_sta {
+	/* ampdu length (average, per sampling interval) */
+	unsigned int ampdu_len;
+	unsigned int ampdu_packets;
+
+	/* ampdu length (EWMA, MINSTREL_SCALE fixed point) */
+	unsigned int avg_ampdu_len;
+
+	/* best throughput rate (flat index: group * MCS_GROUP_RATES + rate) */
+	unsigned int max_tp_rate;
+
+	/* second best throughput rate (same flat index) */
+	unsigned int max_tp_rate2;
+
+	/* best probability rate (same flat index) */
+	unsigned int max_prob_rate;
+
+	/* time of last status update */
+	unsigned long stats_update;
+
+	/* overhead time in usec for each frame */
+	unsigned int overhead;
+	unsigned int overhead_rtscts;
+
+	unsigned int total_packets;
+	unsigned int sample_packets;	/* shown as "lookaround" in debugfs */
+
+	/* tx flags to add for frames for this sta */
+	u32 tx_flags;
+
+	u8 sample_wait;
+	u8 sample_tries;
+	u8 sample_count;
+	u8 sample_slow;
+
+	/* current MCS group to be sampled */
+	u8 sample_group;
+
+	/* MCS rate group info and statistics */
+	struct minstrel_mcs_group_data groups[MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS];
+};
+
+struct minstrel_ht_sta_priv {
+	union {	/* is_ht below tells which member is in use */
+		struct minstrel_ht_sta ht;
+		struct minstrel_sta_info legacy;
+	};
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct dentry *dbg_stats;	/* the station's "rc_stats" debugfs file */
+#endif
+	void *ratelist;		/* kfree()d in minstrel_ht_free_sta() */
+	void *sample_table;	/* kfree()d in minstrel_ht_free_sta() */
+	bool is_ht;	/* false: debugfs falls back to the legacy statistics */
+};
+
+void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
+void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta);
+
+#endif
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
new file mode 100644
index 00000000..cefcb5d2
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rc80211_minstrel.h"
+#include "rc80211_minstrel_ht.h"
+
+/* Open rc_stats: render all rate statistics into a buffer for the reader. */
+static int minstrel_ht_stats_open(struct inode *inode, struct file *file)
+{
+	struct minstrel_ht_sta_priv *msp = inode->i_private;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct minstrel_debugfs_info *ms;
+	unsigned int i, j, tp, prob, eprob;
+	char *p;
+	int ret;
+
+	if (!msp->is_ht) {
+		inode->i_private = &msp->legacy;
+		ret = minstrel_stats_open(inode, file);
+		inode->i_private = msp;
+		return ret;
+	}
+
+	/* Up to 96 rows (groups * rates) of roughly 90 characters are written
+	 * with unbounded sprintf(); that can exceed 8192 bytes, hence 16384.
+	 */
+	ms = kmalloc(sizeof(*ms) + 16384, GFP_KERNEL);
+	if (!ms)
+		return -ENOMEM;
+
+	file->private_data = ms;
+	p = ms->buf;
+	p += sprintf(p, "type      rate     throughput  ewma prob   this prob  "
+			"this succ/attempt   success    attempts\n");
+	for (i = 0; i < MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS; i++) {
+		char htmode = '2';
+		char gimode = 'L';
+
+		if (!mi->groups[i].supported)
+			continue;
+
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+			htmode = '4';
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI)
+			gimode = 'S';
+
+		for (j = 0; j < MCS_GROUP_RATES; j++) {
+			struct minstrel_rate_stats *mr = &mi->groups[i].rates[j];
+			int idx = i * MCS_GROUP_RATES + j;
+
+			if (!(mi->groups[i].supported & BIT(j)))
+				continue;
+
+			p += sprintf(p, "HT%c0/%cGI ", htmode, gimode);
+			/* T/t/P mark the best, second-best and best-probability rate */
+			*(p++) = (idx == mi->max_tp_rate) ? 'T' : ' ';
+			*(p++) = (idx == mi->max_tp_rate2) ? 't' : ' ';
+			*(p++) = (idx == mi->max_prob_rate) ? 'P' : ' ';
+			p += sprintf(p, "MCS%-2u", (minstrel_mcs_groups[i].streams - 1) *
+					MCS_GROUP_RATES + j);
+
+			tp = mr->cur_tp / 10;
+			prob = MINSTREL_TRUNC(mr->cur_prob * 1000);
+			eprob = MINSTREL_TRUNC(mr->probability * 1000);
+			p += sprintf(p, "  %6u.%1u   %6u.%1u   %6u.%1u        "
+					"%3u(%3u)   %8llu    %8llu\n",
+					tp / 10, tp % 10, eprob / 10, eprob % 10,
+					prob / 10, prob % 10,
+					mr->last_success, mr->last_attempts,
+					(unsigned long long)mr->succ_hist,
+					(unsigned long long)mr->att_hist);
+		}
+	}
+	p += sprintf(p, "\nTotal packet count::    ideal %d      "
+			"lookaround %d\n",
+			max(0, (int) mi->total_packets - (int) mi->sample_packets),
+			mi->sample_packets);
+	p += sprintf(p, "Average A-MPDU length: %d.%d\n",
+		MINSTREL_TRUNC(mi->avg_ampdu_len),
+		MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
+	ms->len = p - ms->buf;
+
+	return nonseekable_open(inode, file);
+}
+
+static const struct file_operations minstrel_ht_stat_fops = {
+	.owner = THIS_MODULE,
+	.open = minstrel_ht_stats_open,	/* formats the whole report up front */
+	.read = minstrel_stats_read,
+	.llseek = no_llseek,
+	.release = minstrel_stats_release,
+};
+
+/* Create the read-only "rc_stats" debugfs file for one station. */
+void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
+{
+	struct minstrel_ht_sta_priv *sta_priv = priv_sta;
+
+	sta_priv->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir,
+			sta_priv, &minstrel_ht_stat_fops);
+}
+
+/* Remove the debugfs entry recorded in the station's dbg_stats. */
+void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+	struct minstrel_ht_sta_priv *sta_priv = priv_sta;
+
+	debugfs_remove(sta_priv->dbg_stats);
+}
diff --git a/net/mac80211/rc80211_pid.h b/net/mac80211/rc80211_pid.h
new file mode 100644
index 00000000..19111c7b
--- /dev/null
+++ b/net/mac80211/rc80211_pid.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de>
+ * Copyright 2007, Stefano Brivio <stefano.brivio@polimi.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef RC80211_PID_H
+#define RC80211_PID_H
+
+/* Default sampling period (ms) for measuring the failed frames percentage. */
+#define RC_PID_INTERVAL			125
+
+/* Default exponential averaging shift (I part); SMOOTHING = 1 << shift. */
+#define RC_PID_SMOOTHING_SHIFT		3
+#define RC_PID_SMOOTHING		(1 << RC_PID_SMOOTHING_SHIFT)
+
+/* Default sharpening factor and duration (used for D part of PID controller) */
+#define RC_PID_SHARPENING_FACTOR	0
+#define RC_PID_SHARPENING_DURATION	0
+
+/* Fixed point arithmetic shifting amount (fraction bits of the error terms). */
+#define RC_PID_ARITH_SHIFT		8
+
+/* Default proportional PID component coefficient. */
+#define RC_PID_COEFF_P			15
+/* Default integral PID component coefficient. */
+#define RC_PID_COEFF_I			9
+/* Default derivative PID component coefficient. */
+#define RC_PID_COEFF_D			15
+
+/* Target failed frames rate for the PID controller. NB: This effectively gives
+ * maximum failed frames percentage we're willing to accept. If the wireless
+ * link quality is good, the controller will fail to adjust failed frames
+ * percentage to the target. This is intentional.
+ */
+#define RC_PID_TARGET_PF		14
+
+/* Default step by which the per-rate diffs are normalized on every sample. */
+#define RC_PID_NORM_OFFSET		3
+
+/* Non-zero: start with every rate diff at 0 instead of i * norm_offset. */
+#define RC_PID_FAST_START		0
+
+/* Right shift of a signed value that rounds toward zero; evaluates x twice. */
+#define RC_PID_DO_ARITH_RIGHT_SHIFT(x, y) \
+	((x) < 0 ? -((-(x)) >> (y)) : (x) >> (y))
+
+enum rc_pid_event_type {
+	RC_PID_EVENT_TYPE_TX_STATUS,	/* data: flags + tx_status */
+	RC_PID_EVENT_TYPE_RATE_CHANGE,	/* data: index + rate */
+	RC_PID_EVENT_TYPE_TX_RATE,	/* data: index + rate */
+	RC_PID_EVENT_TYPE_PF_SAMPLE,	/* data: pf_sample + the three error terms */
+};
+
+union rc_pid_event_data {
+	/* RC_PID_EVENT_TYPE_TX_STATUS */
+	struct {
+		u32 flags;	/* copy of the tx info's flags field */
+		struct ieee80211_tx_info tx_status;
+	};
+	/* RC_PID_EVENT_TYPE_RATE_CHANGE */
+	/* RC_PID_EVENT_TYPE_TX_RATE */
+	struct {
+		int index;	/* rate index */
+		int rate;	/* bitrate value of that rate */
+	};
+	/* RC_PID_EVENT_TYPE_PF_SAMPLE */
+	struct {
+		s32 pf_sample;	/* failed frames percentage of the sample */
+		s32 prop_err;	/* proportional error term */
+		s32 int_err;	/* integral error term */
+		s32 der_err;	/* derivative error term */
+	};
+};
+
+struct rc_pid_event {
+	/* The time when the event occurred (jiffies) */
+	unsigned long timestamp;
+
+	/* Event ID number, taken from the buffer's ev_count */
+	unsigned int id;
+
+	/* Type of event; selects the valid member of data */
+	enum rc_pid_event_type type;
+
+	/* type specific data */
+	union rc_pid_event_data data;
+};
+
+/* Size of the event ring buffer, in events. */
+#define RC_PID_EVENT_RING_SIZE 32
+
+struct rc_pid_event_buffer {
+	/* Counter that generates event IDs */
+	unsigned int ev_count;
+
+	/* Ring buffer of events */
+	struct rc_pid_event ring[RC_PID_EVENT_RING_SIZE];
+
+	/* Index of the ring entry that the next event will overwrite */
+	unsigned int next_entry;
+
+	/* Lock that guards against concurrent access to this buffer struct */
+	spinlock_t lock;
+
+	/* Wait queue for poll/select and blocking I/O; woken on every event */
+	wait_queue_head_t waitqueue;
+};
+
+struct rc_pid_events_file_info {
+	/* The event buffer we read */
+	struct rc_pid_event_buffer *events;
+
+	/* The entry we should read next */
+	unsigned int next_entry;
+};
+
+/**
+ * struct rc_pid_debugfs_entries - debugfs files of the tunable parameters
+ *
+ * One dentry per algorithm parameter that is tunable via debugfs.
+ * @target: target percentage for failed frames
+ * @sampling_period: error sampling interval in milliseconds
+ * @coeff_p: absolute value of the proportional coefficient
+ * @coeff_i: absolute value of the integral coefficient
+ * @coeff_d: absolute value of the derivative coefficient
+ * @smoothing_shift: absolute value of the integral smoothing factor (i.e.
+ *	amount of smoothing introduced by the exponential moving average)
+ * @sharpen_factor: absolute value of the derivative sharpening factor (i.e.
+ *	amount of emphasis given to the derivative term after low activity
+ *	events)
+ * @sharpen_duration: duration of the sharpening effect after the detected low
+ *	activity event, relative to sampling_period
+ * @norm_offset: amount of normalization periodically performed on the learnt
+ *	rate behaviour values (lower means we should trust more what we learnt
+ *	about behaviour of rates, higher means we should trust more the natural
+ *	ordering of rates)
+ */
+struct rc_pid_debugfs_entries {
+	struct dentry *target;
+	struct dentry *sampling_period;
+	struct dentry *coeff_p;
+	struct dentry *coeff_i;
+	struct dentry *coeff_d;
+	struct dentry *smoothing_shift;
+	struct dentry *sharpen_factor;
+	struct dentry *sharpen_duration;
+	struct dentry *norm_offset;
+};
+
+void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf,
+				      struct ieee80211_tx_info *stat);
+
+void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf,
+					       int index, int rate);
+
+void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf,
+					   int index, int rate);
+
+void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf,
+					     s32 pf_sample, s32 prop_err,
+					     s32 int_err, s32 der_err);
+
+void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
+					     struct dentry *dir);
+
+void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta);
+
+struct rc_pid_sta_info {
+	unsigned long last_change;
+	unsigned long last_sample;	/* jiffies of the last PID sample */
+
+	u32 tx_num_failed;	/* weighted failure count of the current period */
+	u32 tx_num_xmit;	/* weighted transmit count of the current period */
+
+	int txrate_idx;		/* rate index currently advised to mac80211 */
+
+	/* Average failed frames percentage error (i.e. actual vs. target
+	 * percentage), scaled by RC_PID_SMOOTHING. This value is computed
+	 * using an exponential weighted average technique:
+	 *
+	 *           (RC_PID_SMOOTHING - 1) * err_avg_old + err
+	 * err_avg = ------------------------------------------
+	 *                       RC_PID_SMOOTHING
+	 *
+	 * where err_avg is the new approximation, err_avg_old the previous one
+	 * and err is the error w.r.t. the current failed frames percentage
+	 * sample. Note that the bigger RC_PID_SMOOTHING the more weight is
+	 * given to the previous estimate, resulting in smoother behavior (i.e.
+	 * corresponding to a longer integration window).
+	 *
+	 * For computation, we actually don't use the above formula, but this
+	 * one:
+	 *
+	 * err_avg_scaled = err_avg_old_scaled - err_avg_old + err
+	 *
+	 * where:
+	 * 	err_avg_scaled = err_avg * RC_PID_SMOOTHING
+	 * 	err_avg_old_scaled = err_avg_old * RC_PID_SMOOTHING
+	 *
+	 * This avoids floating point numbers and the err_avg_old value can
+	 * easily be obtained by shifting err_avg_old_scaled right by
+	 * RC_PID_SMOOTHING_SHIFT.
+	 */
+	s32 err_avg_sc;
+
+	/* Last failed frames percentage sample. */
+	u32 last_pf;
+
+	/* Remaining samples to sharpen; decremented on each sample until 0. */
+	u8 sharp_cnt;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	/* Event buffer */
+	struct rc_pid_event_buffer events;
+
+	/* Events debugfs file entry */
+	struct dentry *events_entry;
+#endif
+};
+
+/* Algorithm parameters. We keep them on a per-algorithm approach, so they can
+ * be tuned individually for each interface.
+ */
+struct rc_pid_rateinfo {
+
+	/* rinfo[sorted position].index is an index into the band's bitrates[]. */
+	int index;
+
+	/* rinfo[bitrates[] index].rev_index is that rate's sorted position. */
+	int rev_index;
+
+	/* Did we do any measurement on this rate? */
+	bool valid;
+
+	/* Comparison with the lowest rate. */
+	int diff;
+};
+
+struct rc_pid_info {
+
+	/* The failed frames percentage target. */
+	unsigned int target;
+
+	/* Failed frames sampling period in milliseconds. */
+	unsigned int sampling_period;
+
+	/* P, I and D coefficients. */
+	int coeff_p;
+	int coeff_i;
+	int coeff_d;
+
+	/* Exponential averaging shift. */
+	unsigned int smoothing_shift;
+
+	/* Sharpening factor and duration. */
+	unsigned int sharpen_factor;
+	unsigned int sharpen_duration;
+
+	/* Normalization offset. */
+	unsigned int norm_offset;
+
+	/* Rates information; sized for the band with the most bitrates. */
+	struct rc_pid_rateinfo *rinfo;
+
+	/* Index of the last used rate. */
+	int oldrate;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	/* Debugfs entries created for the parameters above. */
+	struct rc_pid_debugfs_entries dentries;
+#endif
+};
+
+#endif /* RC80211_PID_H */
diff --git a/net/mac80211/rc80211_pid_algo.c b/net/mac80211/rc80211_pid_algo.c
new file mode 100644
index 00000000..aeda6546
--- /dev/null
+++ b/net/mac80211/rc80211_pid_algo.c
@@ -0,0 +1,477 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de>
+ * Copyright 2007-2008, Stefano Brivio <stefano.brivio@polimi.it>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "mesh.h"
+#include "rc80211_pid.h"
+
+
+/* This is an implementation of a TX rate control algorithm that uses a PID
+ * controller. Given a target failed frames rate, the controller decides about
+ * TX rate changes to meet the target failed frames rate.
+ *
+ * The controller basically computes the following:
+ *
+ * adj = CP * err + CI * err_avg + CD * (err - last_err) * (1 + sharpening)
+ *
+ * where
+ * 	adj	adjustment value that is used to switch TX rate (see below)
+ * 	err	current error: target vs. current failed frames percentage
+ * 	last_err	last error
+ * 	err_avg	average (i.e. poor man's integral) of recent errors
+ *	sharpening	non-zero when fast response is needed (i.e. right after
+ *			association or no frames sent for a long time), heading
+ * 			to zero over time
+ * 	CP	Proportional coefficient
+ * 	CI	Integral coefficient
+ * 	CD	Derivative coefficient
+ *
+ * CP, CI, CD are subject to careful tuning.
+ *
+ * The integral component uses an exponential moving average approach instead of
+ * an actual sliding window. The advantage is that we don't need to keep an
+ * array of the last N error values and computation is easier.
+ *
+ * Once we have the adj value, we map it to a rate by means of a learning
+ * algorithm. This algorithm keeps the state of the failed frames percentage
+ * difference between rates. The behaviour of the lowest available rate is kept
+ * as a reference value, and every time we switch between two rates, we compute
+ * the difference between the failed frames each rate exhibited. By doing so,
+ * we compare behaviours which different rates exhibited in adjacent timeslices,
+ * thus the comparison is minimally affected by external conditions. This
+ * difference gets propagated to the whole set of measurements, so that the
+ * reference is always the same. Periodically, we normalize this set so that
+ * recent events weigh the most. By comparing the adj value with this set, we
+ * avoid pejorative switches to lower rates and allow for switches to higher
+ * rates if they behaved well.
+ *
+ * Note that for the computations we use a fixed-point representation to avoid
+ * floating point arithmetic. Hence, all values are shifted left by
+ * RC_PID_ARITH_SHIFT.
+ */
+
+
+/* Adjust the rate by adj sorted positions while ensuring that we won't switch
+ * to a lower rate if it exhibited a worse failed frames behaviour, and choose
+ * the highest rate whose failed frames behaviour is not worse than that of the
+ * original target. The result written to txrate_idx is a supported rate. */
+static void rate_control_pid_adjust_rate(struct ieee80211_supported_band *sband,
+					 struct ieee80211_sta *sta,
+					 struct rc_pid_sta_info *spinfo, int adj,
+					 struct rc_pid_rateinfo *rinfo)
+{
+	int cur_sorted, new_sorted, probe, tmp, n_bitrates, band;
+	int cur = spinfo->txrate_idx;
+
+	band = sband->band;
+	n_bitrates = sband->n_bitrates;
+
+	/* Map passed arguments to sorted values. */
+	cur_sorted = rinfo[cur].rev_index;
+	new_sorted = cur_sorted + adj;
+
+	/* Check limits. */
+	/* NOTE(review): clamps to the sorted position of band rate 0 / n - 1,
+	 * not to 0 / n - 1 themselves - same only for a sorted band; confirm. */
+	if (new_sorted < 0)
+		new_sorted = rinfo[0].rev_index;
+	else if (new_sorted >= n_bitrates)
+		new_sorted = rinfo[n_bitrates - 1].rev_index;
+	tmp = new_sorted;
+	if (adj < 0) {
+		/* Ensure that the rate decrease isn't disadvantageous. */
+		for (probe = cur_sorted; probe >= new_sorted; probe--)
+			if (rinfo[probe].diff <= rinfo[cur_sorted].diff &&
+			    rate_supported(sta, band, rinfo[probe].index))
+				tmp = probe;
+	} else {
+		/* Look for rate increase with zero (or below) cost. */
+		for (probe = new_sorted + 1; probe < n_bitrates; probe++)
+			if (rinfo[probe].diff <= rinfo[new_sorted].diff &&
+			    rate_supported(sta, band, rinfo[probe].index))
+				tmp = probe;
+	}
+
+	/* Fit the rate found to the nearest supported rate. */
+	do {
+		if (rate_supported(sta, band, rinfo[tmp].index)) {
+			spinfo->txrate_idx = rinfo[tmp].index;
+			break;
+		}
+		if (adj < 0)
+			tmp--;
+		else
+			tmp++;
+	} while (tmp < n_bitrates && tmp >= 0);
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	rate_control_pid_event_rate_change(&spinfo->events,
+		spinfo->txrate_idx,
+		sband->bitrates[spinfo->txrate_idx].bitrate);
+#endif
+}
+
+/* Nudge diff[0] toward zero and each later diff toward just above its
+ * predecessor, by one norm_offset step per call. */
+static void rate_control_pid_normalize(struct rc_pid_info *pinfo, int l)
+{
+	struct rc_pid_rateinfo *rates = pinfo->rinfo;
+	int k, step = pinfo->norm_offset;
+
+	if (rates[0].diff > step)
+		rates[0].diff -= step;
+	else if (rates[0].diff < -step)
+		rates[0].diff += step;
+	for (k = 1; k < l; k++)
+		if (rates[k].diff > rates[k - 1].diff + step)
+			rates[k].diff -= step;
+		else if (rates[k].diff <= rates[k - 1].diff)
+			rates[k].diff += step;
+}
+
+static void rate_control_pid_sample(struct rc_pid_info *pinfo,
+				    struct ieee80211_supported_band *sband,
+				    struct ieee80211_sta *sta,
+				    struct rc_pid_sta_info *spinfo)
+{
+	struct rc_pid_rateinfo *rinfo = pinfo->rinfo;
+	u32 pf;
+	s32 err_avg;
+	u32 err_prop;
+	u32 err_int;
+	u32 err_der;
+	int adj, i, j, tmp;
+	unsigned long period;
+
+	/* In case nothing happened during the previous control interval, turn
+	 * the sharpening factor on. */
+	period = msecs_to_jiffies(pinfo->sampling_period);
+	if (jiffies - spinfo->last_sample > 2 * period)
+		spinfo->sharp_cnt = pinfo->sharpen_duration;
+
+	spinfo->last_sample = jiffies;
+
+	/* This should never happen, but in case, we assume the old sample is
+	 * still a good measurement and copy it. */
+	if (unlikely(spinfo->tx_num_xmit == 0))
+		pf = spinfo->last_pf;
+	else
+		pf = spinfo->tx_num_failed * 100 / spinfo->tx_num_xmit;
+
+	spinfo->tx_num_xmit = 0;
+	spinfo->tx_num_failed = 0;
+
+	/* If we just switched rate, update the rate behaviour info. */
+	if (pinfo->oldrate != spinfo->txrate_idx) {
+
+		i = rinfo[pinfo->oldrate].rev_index;
+		j = rinfo[spinfo->txrate_idx].rev_index;
+
+		/* NOTE(review): pf is an unscaled percentage, so this shift looks like it zeroes tmp - confirm */
+		tmp = (pf - spinfo->last_pf);
+		tmp = RC_PID_DO_ARITH_RIGHT_SHIFT(tmp, RC_PID_ARITH_SHIFT);
+
+		rinfo[j].diff = rinfo[i].diff + tmp;
+		pinfo->oldrate = spinfo->txrate_idx;
+	}
+	rate_control_pid_normalize(pinfo, sband->n_bitrates);
+
+	/* Compute the proportional, integral and derivative errors. */
+	err_prop = (pinfo->target - pf) << RC_PID_ARITH_SHIFT;
+
+	err_avg = spinfo->err_avg_sc >> pinfo->smoothing_shift;
+	spinfo->err_avg_sc = spinfo->err_avg_sc - err_avg + err_prop;
+	err_int = spinfo->err_avg_sc >> pinfo->smoothing_shift;
+
+	err_der = (pf - spinfo->last_pf) *
+		  (1 + pinfo->sharpen_factor * spinfo->sharp_cnt);
+	spinfo->last_pf = pf;
+	if (spinfo->sharp_cnt)
+			spinfo->sharp_cnt--;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	rate_control_pid_event_pf_sample(&spinfo->events, pf, err_prop, err_int,
+					 err_der);
+#endif
+
+	/* Compute the controller output: weighted sum of the three errors. */
+	adj = (err_prop * pinfo->coeff_p + err_int * pinfo->coeff_i
+	      + err_der * pinfo->coeff_d);
+	adj = RC_PID_DO_ARITH_RIGHT_SHIFT(adj, 2 * RC_PID_ARITH_SHIFT);
+
+	/* Change rate; adj is a signed step count handed to adjust_rate. */
+	if (adj)
+		rate_control_pid_adjust_rate(sband, sta, spinfo, adj, rinfo);
+}
+
+/*
+ * TX status hook: account the frame in the station's failure statistics
+ * and run the PID sampler once a sampling period has elapsed.
+ */
+static void rate_control_pid_tx_status(void *priv, struct ieee80211_supported_band *sband,
+				       struct ieee80211_sta *sta, void *priv_sta,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct rc_pid_sta_info *spinfo = priv_sta;
+	struct rc_pid_info *pinfo = priv;
+	bool acked, retried;
+
+	/* Only frames sent at the rate we currently advise are counted. */
+	if (!spinfo || info->status.rates[0].idx != spinfo->txrate_idx)
+		return;
+
+	spinfo->tx_num_xmit++;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	rate_control_pid_event_tx_status(&spinfo->events, info);
+#endif
+
+	/*
+	 * Weighting: an unacked frame counts as two failures out of two
+	 * transmissions, an acked-after-retry frame as one out of two.
+	 */
+	acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
+	retried = info->status.rates[0].count > 1;
+	if (!acked || retried) {
+		spinfo->tx_num_failed += acked ? 1 : 2;
+		spinfo->tx_num_xmit++;
+	}
+
+	/* Update PID controller state once a sampling period has passed. */
+	if (time_after(jiffies, spinfo->last_sample +
+				msecs_to_jiffies(pinfo->sampling_period)))
+		rate_control_pid_sample(pinfo, sband, sta, spinfo);
+}
+
+/*
+ * get_rate hook: fill in the retry count and rate index of rates[0].
+ */
+static void
+rate_control_pid_get_rate(void *priv, struct ieee80211_sta *sta,
+			  void *priv_sta,
+			  struct ieee80211_tx_rate_control *txrc)
+{
+	struct rc_pid_sta_info *spinfo = priv_sta;
+	struct ieee80211_supported_band *sband = txrc->sband;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+	int last = sband->n_bitrates - 1;
+	int rateidx;
+
+	/* Frames flagged txrc->rts use the long retry limit, others the short. */
+	info->control.rates[0].count = txrc->rts ?
+		txrc->hw->conf.long_frame_max_tx_count :
+		txrc->hw->conf.short_frame_max_tx_count;
+
+	/* Send management frames and NO_ACK data using lowest rate. */
+	if (rate_control_send_low(sta, priv_sta, txrc))
+		return;
+
+	/* Never hand out an index beyond the band's rate table. */
+	rateidx = spinfo->txrate_idx;
+	if (rateidx > last)
+		rateidx = last;
+	info->control.rates[0].idx = rateidx;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	rate_control_pid_event_tx_rate(&spinfo->events,
+		rateidx, sband->bitrates[rateidx].bitrate);
+#endif
+}
+
+static void
+rate_control_pid_rate_init(void *priv, struct ieee80211_supported_band *sband,
+			   struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct rc_pid_sta_info *spinfo = priv_sta;
+	struct rc_pid_info *pinfo = priv;
+	struct rc_pid_rateinfo *rinfo = pinfo->rinfo;
+	int i, j, tmp;
+	bool s;
+
+	/* TODO: This routine should consider using RSSI from previous packets
+	 * as we need to have IEEE 802.1X auth succeed immediately after assoc..
+	 * Until that method is implemented, we will use the lowest supported
+	 * rate as a workaround. */
+
+	/* Sort the rates by bitrate. Start from the identity mapping, then
+	 * swap adjacent out-of-order entries (optimized for almost-sorted
+	 * CCK+OFDM rates) while keeping the reverse mapping in step. */
+	for (i = 0; i < sband->n_bitrates; i++) {
+		rinfo[i].index = i;
+		rinfo[i].rev_index = i;
+		if (RC_PID_FAST_START)
+			rinfo[i].diff = 0;
+		else
+			rinfo[i].diff = i * pinfo->norm_offset;
+	}
+	for (i = 1; i < sband->n_bitrates; i++) {
+		s = 0;	/* set when this pass swapped anything */
+		for (j = 0; j < sband->n_bitrates - i; j++)
+			if (unlikely(sband->bitrates[rinfo[j].index].bitrate >
+				     sband->bitrates[rinfo[j + 1].index].bitrate)) {
+				tmp = rinfo[j].index;
+				rinfo[j].index = rinfo[j + 1].index;
+				rinfo[j + 1].index = tmp;
+				rinfo[rinfo[j].index].rev_index = j;
+				rinfo[rinfo[j + 1].index].rev_index = j + 1;
+				s = 1;
+			}
+		if (!s)	/* a pass without swaps: already sorted */
+			break;
+	}
+
+	spinfo->txrate_idx = rate_lowest_index(sband, sta);
+}
+
+/* Allocate per-hw PID state: default tunables plus the shared rate table. */
+static void *rate_control_pid_alloc(struct ieee80211_hw *hw,
+				    struct dentry *debugfsdir)
+{
+	struct rc_pid_info *pinfo;
+	struct rc_pid_rateinfo *rinfo;
+	struct ieee80211_supported_band *sband;
+	int i, max_rates = 0;
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct rc_pid_debugfs_entries *de;
+#endif
+
+	pinfo = kmalloc(sizeof(*pinfo), GFP_ATOMIC);
+	if (!pinfo)
+		return NULL;
+
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {	/* largest rate set wins */
+		sband = hw->wiphy->bands[i];
+		if (sband && sband->n_bitrates > max_rates)
+			max_rates = sband->n_bitrates;
+	}
+
+	/* kcalloc: overflow-checked size, and no uninitialized entries */
+	rinfo = kcalloc(max_rates, sizeof(*rinfo), GFP_ATOMIC);
+	if (!rinfo) {
+		kfree(pinfo);
+		return NULL;
+	}
+
+	pinfo->target = RC_PID_TARGET_PF;
+	pinfo->sampling_period = RC_PID_INTERVAL;
+	pinfo->coeff_p = RC_PID_COEFF_P;
+	pinfo->coeff_i = RC_PID_COEFF_I;
+	pinfo->coeff_d = RC_PID_COEFF_D;
+	pinfo->smoothing_shift = RC_PID_SMOOTHING_SHIFT;
+	pinfo->sharpen_factor = RC_PID_SHARPENING_FACTOR;
+	pinfo->sharpen_duration = RC_PID_SHARPENING_DURATION;
+	pinfo->norm_offset = RC_PID_NORM_OFFSET;
+	pinfo->rinfo = rinfo;
+	pinfo->oldrate = 0;
+#ifdef CONFIG_MAC80211_DEBUGFS
+	de = &pinfo->dentries;
+	de->target = debugfs_create_u32("target_pf", S_IRUSR | S_IWUSR,
+					debugfsdir, &pinfo->target);
+	de->sampling_period = debugfs_create_u32("sampling_period",
+						 S_IRUSR | S_IWUSR, debugfsdir,
+						 &pinfo->sampling_period);
+	de->coeff_p = debugfs_create_u32("coeff_p", S_IRUSR | S_IWUSR,
+					 debugfsdir, (u32 *)&pinfo->coeff_p);
+	de->coeff_i = debugfs_create_u32("coeff_i", S_IRUSR | S_IWUSR,
+					 debugfsdir, (u32 *)&pinfo->coeff_i);
+	de->coeff_d = debugfs_create_u32("coeff_d", S_IRUSR | S_IWUSR,
+					 debugfsdir, (u32 *)&pinfo->coeff_d);
+	de->smoothing_shift = debugfs_create_u32("smoothing_shift",
+						 S_IRUSR | S_IWUSR, debugfsdir,
+						 &pinfo->smoothing_shift);
+	de->sharpen_factor = debugfs_create_u32("sharpen_factor",
+					       S_IRUSR | S_IWUSR, debugfsdir,
+					       &pinfo->sharpen_factor);
+	de->sharpen_duration = debugfs_create_u32("sharpen_duration",
+						  S_IRUSR | S_IWUSR, debugfsdir,
+						  &pinfo->sharpen_duration);
+	de->norm_offset = debugfs_create_u32("norm_offset",
+					     S_IRUSR | S_IWUSR, debugfsdir,
+					     &pinfo->norm_offset);
+#endif
+
+	return pinfo;
+}
+
+static void rate_control_pid_free(void *priv)
+{
+	struct rc_pid_info *pinfo = priv;
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct rc_pid_debugfs_entries *entries = &pinfo->dentries;
+
+	debugfs_remove(entries->target);	/* one file per tunable */
+	debugfs_remove(entries->sampling_period);
+	debugfs_remove(entries->coeff_p);
+	debugfs_remove(entries->coeff_i);
+	debugfs_remove(entries->coeff_d);
+	debugfs_remove(entries->smoothing_shift);
+	debugfs_remove(entries->sharpen_factor);
+	debugfs_remove(entries->sharpen_duration);
+	debugfs_remove(entries->norm_offset);
+#endif
+
+	kfree(pinfo->rinfo);	/* rate table first, then its owner */
+	kfree(pinfo);
+}
+
+/* Allocate zeroed per-station state; returns NULL if the allocation fails. */
+static void *rate_control_pid_alloc_sta(void *priv, struct ieee80211_sta *sta,
+					gfp_t gfp)
+{
+	struct rc_pid_sta_info *sta_state = kzalloc(sizeof(*sta_state), gfp);
+
+	if (!sta_state)
+		return NULL;
+
+	sta_state->last_sample = jiffies;	/* first sampling period starts now */
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	spin_lock_init(&sta_state->events.lock);
+	init_waitqueue_head(&sta_state->events.waitqueue);
+#endif
+
+	return sta_state;
+}
+
+static void rate_control_pid_free_sta(void *priv, struct ieee80211_sta *sta,
+				      void *priv_sta)
+{
+	kfree(priv_sta);	/* presumably the object from rate_control_pid_alloc_sta() - confirm */
+}
+
+static struct rate_control_ops mac80211_rcpid = {
+	.name = "pid",
+	.alloc = rate_control_pid_alloc,		/* per-hw state */
+	.free = rate_control_pid_free,
+	.alloc_sta = rate_control_pid_alloc_sta,	/* per-station state */
+	.free_sta = rate_control_pid_free_sta,
+	.rate_init = rate_control_pid_rate_init,
+	.get_rate = rate_control_pid_get_rate,
+	.tx_status = rate_control_pid_tx_status,
+#ifdef CONFIG_MAC80211_DEBUGFS
+	.add_sta_debugfs = rate_control_pid_add_sta_debugfs,
+	.remove_sta_debugfs = rate_control_pid_remove_sta_debugfs,
+#endif
+};
+
+int __init rc80211_pid_init(void)
+{
+	return ieee80211_rate_control_register(&mac80211_rcpid);	/* registers the "pid" ops */
+}
+
+void rc80211_pid_exit(void)
+{
+	ieee80211_rate_control_unregister(&mac80211_rcpid);	/* unregisters the "pid" ops */
+}
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
new file mode 100644
index 00000000..4851e9e2
--- /dev/null
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2007, Mattias Nissler <mattias.nissler@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/poll.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <net/mac80211.h>
+#include "rate.h"
+
+#include "rc80211_pid.h"
+
+/*
+ * Append one event to the ring in @buf and wake up everybody sleeping on
+ * the buffer's wait queue.
+ *
+ * The slot at buf->next_entry is overwritten and the write position is
+ * advanced modulo RC_PID_EVENT_RING_SIZE. The event is stamped with the
+ * current jiffies value and the next running event id, and receives a copy
+ * of *@data. All ring updates happen under buf->lock with interrupts
+ * disabled; the wake-up is issued after the lock has been dropped.
+ */
+static void rate_control_pid_event(struct rc_pid_event_buffer *buf,
+				   enum rc_pid_event_type type,
+				   union rc_pid_event_data *data)
+{
+	unsigned long irqflags;
+	struct rc_pid_event *slot;
+
+	spin_lock_irqsave(&buf->lock, irqflags);
+
+	/* Claim the current slot and move the write position on. */
+	slot = &buf->ring[buf->next_entry];
+	buf->next_entry = (buf->next_entry + 1) % RC_PID_EVENT_RING_SIZE;
+
+	slot->timestamp = jiffies;
+	slot->id = buf->ev_count++;
+	slot->type = type;
+	slot->data = *data;
+
+	spin_unlock_irqrestore(&buf->lock, irqflags);
+
+	wake_up_all(&buf->waitqueue);
+}
+
+/*
+ * Log a RC_PID_EVENT_TYPE_TX_STATUS event into @buf. The event carries the
+ * flags of @stat plus a full copy of the tx info structure itself.
+ */
+void rate_control_pid_event_tx_status(struct rc_pid_event_buffer *buf,
+				      struct ieee80211_tx_info *stat)
+{
+	union rc_pid_event_data payload;
+
+	payload.flags = stat->flags;
+	memcpy(&payload.tx_status, stat, sizeof(*stat));
+
+	rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_STATUS, &payload);
+}
+
+/*
+ * Log a RC_PID_EVENT_TYPE_RATE_CHANGE event carrying @index and @rate
+ * into @buf.
+ */
+void rate_control_pid_event_rate_change(struct rc_pid_event_buffer *buf,
+					int index, int rate)
+{
+	union rc_pid_event_data payload;
+
+	payload.index = index;
+	payload.rate = rate;
+
+	rate_control_pid_event(buf, RC_PID_EVENT_TYPE_RATE_CHANGE, &payload);
+}
+
+/*
+ * Log a RC_PID_EVENT_TYPE_TX_RATE event carrying @index and @rate
+ * into @buf.
+ */
+void rate_control_pid_event_tx_rate(struct rc_pid_event_buffer *buf,
+				    int index, int rate)
+{
+	union rc_pid_event_data payload;
+
+	payload.index = index;
+	payload.rate = rate;
+
+	rate_control_pid_event(buf, RC_PID_EVENT_TYPE_TX_RATE, &payload);
+}
+
+/*
+ * Log a RC_PID_EVENT_TYPE_PF_SAMPLE event into @buf, carrying the sample
+ * value @pf_sample together with the three error terms @prop_err,
+ * @int_err and @der_err.
+ */
+void rate_control_pid_event_pf_sample(struct rc_pid_event_buffer *buf,
+				      s32 pf_sample, s32 prop_err,
+				      s32 int_err, s32 der_err)
+{
+	union rc_pid_event_data payload;
+
+	payload.pf_sample = pf_sample;
+	payload.prop_err = prop_err;
+	payload.int_err = int_err;
+	payload.der_err = der_err;
+
+	rate_control_pid_event(buf, RC_PID_EVENT_TYPE_PF_SAMPLE, &payload);
+}
+
+/*
+ * open() handler for the events file: allocate a per-open reader state
+ * that points at the station's event buffer (taken from inode->i_private)
+ * and starts reading at the buffer's current write position, so only
+ * events logged after the open are seen.
+ *
+ * Returns 0 on success or -ENOMEM if the reader state cannot be allocated.
+ */
+static int rate_control_pid_events_open(struct inode *inode, struct file *file)
+{
+	struct rc_pid_sta_info *sinfo = inode->i_private;
+	struct rc_pid_events_file_info *reader;
+	unsigned long irqflags;
+
+	reader = kmalloc(sizeof(*reader), GFP_KERNEL);
+	if (!reader)
+		return -ENOMEM;
+
+	reader->events = &sinfo->events;
+
+	/* Snapshot the write position under the buffer lock. */
+	spin_lock_irqsave(&sinfo->events.lock, irqflags);
+	reader->next_entry = sinfo->events.next_entry;
+	spin_unlock_irqrestore(&sinfo->events.lock, irqflags);
+
+	file->private_data = reader;
+
+	return 0;
+}
+
+/*
+ * release() handler for the events file: free the per-open reader state
+ * that rate_control_pid_events_open() stored in file->private_data.
+ * Always returns 0.
+ */
+static int rate_control_pid_events_release(struct inode *inode,
+					   struct file *file)
+{
+	kfree(file->private_data);
+
+	return 0;
+}
+
+/*
+ * poll() handler for the events file.
+ *
+ * Registers the caller on the event buffer's wait queue (which is woken
+ * whenever a new event is logged) and reports the file as readable only
+ * when the buffer's write position differs from this reader's position,
+ * i.e. when a subsequent read would not have to wait. Reporting readable
+ * unconditionally would make poll()/select() return immediately even with
+ * nothing to read.
+ *
+ * The positions are compared without taking the buffer lock, in the same
+ * way the read handler checks for pending data.
+ *
+ * Returns POLLIN | POLLRDNORM if an event is pending, 0 otherwise.
+ */
+static unsigned int rate_control_pid_events_poll(struct file *file,
+						 poll_table *wait)
+{
+	struct rc_pid_events_file_info *file_info = file->private_data;
+	struct rc_pid_event_buffer *events = file_info->events;
+
+	poll_wait(file, &events->waitqueue, wait);
+
+	if (events->next_entry != file_info->next_entry)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+/*
+ * Size of the on-stack formatting buffer. Large enough for the longest
+ * line produced below ("<id> <timestamp> pf_sample <4 x s32>\n") even with
+ * a 64-bit timestamp, so a line is only ever truncated when the caller's
+ * own buffer is too small.
+ */
+#define RC_PID_PRINT_BUF_SIZE 128
+
+/*
+ * read() handler for the events file: format exactly one event as a text
+ * line and copy it to userspace.
+ *
+ * @file:   open file; private_data holds the per-open reader state
+ * @buf:    userspace destination buffer
+ * @length: size of @buf
+ * @offset: unused
+ *
+ * If no event is pending, the call fails with -EAGAIN for O_NONBLOCK files
+ * and otherwise sleeps interruptibly until a new event has been logged.
+ *
+ * Returns the number of bytes copied (0 for a zero-length read, which does
+ * not consume an event), -EAGAIN as described above, the error returned by
+ * the interrupted wait, or -EFAULT if the copy to userspace fails.
+ *
+ * Formatting uses scnprintf(), which returns the number of characters
+ * actually stored. With snprintf() the running offset could grow beyond
+ * the remaining space, making "length - p" wrap around and letting
+ * copy_to_user() copy more than @length bytes, possibly from beyond the
+ * end of the stack buffer.
+ */
+static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf,
+					    size_t length, loff_t *offset)
+{
+	struct rc_pid_events_file_info *file_info = file->private_data;
+	struct rc_pid_event_buffer *events = file_info->events;
+	struct rc_pid_event *ev;
+	char pb[RC_PID_PRINT_BUF_SIZE];
+	int ret;
+	int p;
+	unsigned long status;
+
+	/* Nothing can be delivered; don't consume an event for it. */
+	if (length == 0)
+		return 0;
+
+	/* Check if there is something to read. */
+	if (events->next_entry == file_info->next_entry) {
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		/* Wait */
+		ret = wait_event_interruptible(events->waitqueue,
+				events->next_entry != file_info->next_entry);
+
+		if (ret)
+			return ret;
+	}
+
+	/* Write out one event per call. I don't care whether it's a little
+	 * inefficient, this is debugging code anyway. */
+	spin_lock_irqsave(&events->lock, status);
+
+	/* Get an event */
+	ev = &(events->ring[file_info->next_entry]);
+	file_info->next_entry = (file_info->next_entry + 1) %
+				RC_PID_EVENT_RING_SIZE;
+
+	/* Print information about the event. Note that userspace needs to
+	 * provide large enough buffers, otherwise the line is truncated. */
+	length = length < RC_PID_PRINT_BUF_SIZE ?
+		 length : RC_PID_PRINT_BUF_SIZE;
+	p = scnprintf(pb, length, "%u %lu ", ev->id, ev->timestamp);
+	switch (ev->type) {
+	case RC_PID_EVENT_TYPE_TX_STATUS:
+		p += scnprintf(pb + p, length - p, "tx_status %u %u",
+			       !(ev->data.flags & IEEE80211_TX_STAT_ACK),
+			       ev->data.tx_status.status.rates[0].idx);
+		break;
+	case RC_PID_EVENT_TYPE_RATE_CHANGE:
+		p += scnprintf(pb + p, length - p, "rate_change %d %d",
+			       ev->data.index, ev->data.rate);
+		break;
+	case RC_PID_EVENT_TYPE_TX_RATE:
+		p += scnprintf(pb + p, length - p, "tx_rate %d %d",
+			       ev->data.index, ev->data.rate);
+		break;
+	case RC_PID_EVENT_TYPE_PF_SAMPLE:
+		p += scnprintf(pb + p, length - p,
+			       "pf_sample %d %d %d %d",
+			       ev->data.pf_sample, ev->data.prop_err,
+			       ev->data.int_err, ev->data.der_err);
+		break;
+	}
+	p += scnprintf(pb + p, length - p, "\n");
+
+	spin_unlock_irqrestore(&events->lock, status);
+
+	/* p never exceeds length here, so this stays inside pb and buf. */
+	if (copy_to_user(buf, pb, p))
+		return -EFAULT;
+
+	return p;
+}
+
+#undef RC_PID_PRINT_BUF_SIZE
+
+/*
+ * File operations of the per-station "rc_pid_events" debugfs file created
+ * by rate_control_pid_add_sta_debugfs(). Seeking is a no-op.
+ */
+static const struct file_operations rc_pid_fop_events = {
+	.owner = THIS_MODULE,
+	.read = rate_control_pid_events_read,
+	.poll = rate_control_pid_events_poll,
+	.open = rate_control_pid_events_open,
+	.release = rate_control_pid_events_release,
+	.llseek = noop_llseek,
+};
+
+/*
+ * Create the world-readable (S_IRUGO) "rc_pid_events" file below @dir for
+ * the station state @priv_sta and remember the resulting dentry in the
+ * station state so it can be removed later. The station state is passed
+ * as the file's data pointer; the open handler picks it up from
+ * inode->i_private. @priv is not used.
+ */
+void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
+					     struct dentry *dir)
+{
+	struct rc_pid_sta_info *spinfo = priv_sta;
+
+	spinfo->events_entry = debugfs_create_file("rc_pid_events", S_IRUGO,
+						   dir, spinfo,
+						   &rc_pid_fop_events);
+}
+
+/*
+ * Remove the debugfs entry stored in the station state @priv_sta by
+ * rate_control_pid_add_sta_debugfs(). @priv is not used.
+ */
+void rate_control_pid_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+	struct rc_pid_sta_info *spinfo = priv_sta;
+
+	debugfs_remove(spinfo->events_entry);
+}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
new file mode 100644
index 00000000..41000650
--- /dev/null
+++ b/net/mac80211/rx.c
@@ -0,0 +1,2965 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/rcupdate.h>
+#include <net/mac80211.h>
+#include <net/ieee80211_radiotap.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "led.h"
+#include "mesh.h"
+#include "wep.h"
+#include "wpa.h"
+#include "tkip.h"
+#include "wme.h"
+
+/*
+ * monitor mode reception
+ *
+ * Strip the information that is only useful for monitoring from @skb.
+ * When the hardware delivers frames including the FCS
+ * (IEEE80211_HW_RX_INCLUDES_FCS), the trailing FCS_LEN bytes are trimmed.
+ *
+ * Returns @skb, or NULL if the frame was not longer than the FCS it is
+ * supposed to contain; in that case a warning is issued and the skb is
+ * freed.
+ */
+static struct sk_buff *remove_monitor_info(struct ieee80211_local *local,
+					   struct sk_buff *skb)
+{
+	/* Nothing to strip if the hardware does not pass the FCS up. */
+	if (!(local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))
+		return skb;
+
+	if (unlikely(skb->len <= FCS_LEN)) {
+		/* driver bug */
+		WARN_ON(1);
+		dev_kfree_skb(skb);
+		return NULL;
+	}
+
+	__pskb_trim(skb, skb->len - FCS_LEN);
+
+	return skb;
+}
+
+/*
+ * Decide whether a received frame must be kept away from the regular RX
+ * path. Returns 1 for frames flagged with a failed FCS or PLCP CRC, for
+ * frames shorter than 16 bytes plus @present_fcs_len, and for control
+ * frames other than PS-Poll and block ack request; returns 0 otherwise.
+ */
+static inline int should_drop_frame(struct sk_buff *skb,
+				    int present_fcs_len)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	__le16 fc;
+
+	if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC))
+		return 1;
+
+	if (unlikely(skb->len < 16 + present_fcs_len))
+		return 1;
+
+	fc = hdr->frame_control;
+	if (!ieee80211_is_ctl(fc))
+		return 0;
+
+	/* Of the control frames only PS-Poll and BAR are let through. */
+	return !(ieee80211_is_pspoll(fc) || ieee80211_is_back_req(fc));
+}
+
+/*
+ * Compute the length in bytes of the radiotap header that
+ * ieee80211_add_rx_radiotap_header() builds for a frame with the given
+ * hardware capabilities (@local) and RX status (@status).
+ */
+static int
+ieee80211_rx_radiotap_len(struct ieee80211_local *local,
+			  struct ieee80211_rx_status *status)
+{
+	/* fixed part: radiotap header plus the always present fields */
+	int len = sizeof(struct ieee80211_radiotap_header) + 9;
+
+	if (status->flag & RX_FLAG_MACTIME_MPDU)
+		len += 8;	/* TSFT */
+
+	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+		len += 1;	/* dBm antenna signal */
+
+	/* padding for RX_FLAGS if necessary: round up to an even length */
+	len += len & 1;
+
+	if (status->flag & RX_FLAG_HT)
+		len += 3;	/* HT info */
+
+	return len;
+}
+
+/*
+ * ieee80211_add_rx_radiotap_header - add radiotap header
+ *
+ * add a radiotap header containing all the fields which the hardware provided.
+ *
+ * @local:    device state; hw.flags selects the FCS flag and the dBm
+ *            signal field
+ * @skb:      received frame; must have at least @rtap_len bytes of headroom
+ *            since the header is pushed in front of the current data
+ * @rate:     legacy rate of the frame, or NULL if unknown
+ * @rtap_len: total header length; the fields written below add up to what
+ *            ieee80211_rx_radiotap_len() computes for the same
+ *            @local / RX status
+ *
+ * The header area is zeroed first, then the fields are written in order
+ * and the matching it_present bits are set for the optional ones.
+ */
+static void
+ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
+				 struct sk_buff *skb,
+				 struct ieee80211_rate *rate,
+				 int rtap_len)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_radiotap_header *rthdr;
+	unsigned char *pos;
+	u16 rx_flags = 0;
+
+	rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len);
+	memset(rthdr, 0, rtap_len);
+
+	/* radiotap header, set always present flags */
+	rthdr->it_present =
+		cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) |
+			    (1 << IEEE80211_RADIOTAP_CHANNEL) |
+			    (1 << IEEE80211_RADIOTAP_ANTENNA) |
+			    (1 << IEEE80211_RADIOTAP_RX_FLAGS));
+	rthdr->it_len = cpu_to_le16(rtap_len);
+
+	/* field data starts right behind the fixed radiotap header */
+	pos = (unsigned char *)(rthdr+1);
+
+	/* the order of the following fields is important */
+
+	/* IEEE80211_RADIOTAP_TSFT */
+	if (status->flag & RX_FLAG_MACTIME_MPDU) {
+		put_unaligned_le64(status->mactime, pos);
+		rthdr->it_present |=
+			cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT);
+		pos += 8;
+	}
+
+	/* IEEE80211_RADIOTAP_FLAGS */
+	if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
+		*pos |= IEEE80211_RADIOTAP_F_FCS;
+	if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC))
+		*pos |= IEEE80211_RADIOTAP_F_BADFCS;
+	if (status->flag & RX_FLAG_SHORTPRE)
+		*pos |= IEEE80211_RADIOTAP_F_SHORTPRE;
+	pos++;
+
+	/* IEEE80211_RADIOTAP_RATE */
+	if (!rate || status->flag & RX_FLAG_HT) {
+		/*
+		 * Without rate information don't add it. If we have,
+		 * MCS information is a separate field in radiotap,
+		 * added below. The byte here is needed as padding
+		 * for the channel though, so initialise it to 0.
+		 */
+		*pos = 0;
+	} else {
+		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
+		*pos = rate->bitrate / 5;
+	}
+	pos++;
+
+	/* IEEE80211_RADIOTAP_CHANNEL: 16-bit frequency, then 16-bit flags */
+	put_unaligned_le16(status->freq, pos);
+	pos += 2;
+	if (status->band == IEEE80211_BAND_5GHZ)
+		put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ,
+				   pos);
+	else if (status->flag & RX_FLAG_HT)
+		put_unaligned_le16(IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ,
+				   pos);
+	else if (rate && rate->flags & IEEE80211_RATE_ERP_G)
+		put_unaligned_le16(IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ,
+				   pos);
+	else if (rate)
+		put_unaligned_le16(IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ,
+				   pos);
+	else
+		put_unaligned_le16(IEEE80211_CHAN_2GHZ, pos);
+	pos += 2;
+
+	/* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */
+	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
+		*pos = status->signal;
+		rthdr->it_present |=
+			cpu_to_le32(1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
+		pos++;
+	}
+
+	/* IEEE80211_RADIOTAP_LOCK_QUALITY is missing */
+
+	/* IEEE80211_RADIOTAP_ANTENNA */
+	*pos = status->antenna;
+	pos++;
+
+	/* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */
+
+	/* IEEE80211_RADIOTAP_RX_FLAGS */
+	/* ensure 2 byte alignment for the 2 byte field as required */
+	if ((pos - (u8 *)rthdr) & 1)
+		pos++;
+	if (status->flag & RX_FLAG_FAILED_PLCP_CRC)
+		rx_flags |= IEEE80211_RADIOTAP_F_RX_BADPLCP;
+	put_unaligned_le16(rx_flags, pos);
+	pos += 2;
+
+	/* IEEE80211_RADIOTAP_MCS: "known" bitmap, flags, MCS index */
+	if (status->flag & RX_FLAG_HT) {
+		rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS);
+		*pos++ = IEEE80211_RADIOTAP_MCS_HAVE_MCS |
+			 IEEE80211_RADIOTAP_MCS_HAVE_GI |
+			 IEEE80211_RADIOTAP_MCS_HAVE_BW;
+		*pos = 0;
+		if (status->flag & RX_FLAG_SHORT_GI)
+			*pos |= IEEE80211_RADIOTAP_MCS_SGI;
+		if (status->flag & RX_FLAG_40MHZ)
+			*pos |= IEEE80211_RADIOTAP_MCS_BW_40;
+		pos++;
+		*pos++ = status->rate_idx;
+	}
+}
+
+/*
+ * This function copies a received frame to all monitor interfaces and
+ * returns a cleaned-up SKB that no longer includes the FCS nor the
+ * radiotap header the driver might have added.
+ *
+ * NOTE(review): the clean-up visible here (remove_monitor_info()) only
+ * trims the FCS; the remark about a driver-added radiotap header looks
+ * like a leftover -- confirm.
+ *
+ * @local:   device state
+ * @origskb: received frame; ownership passes to this function
+ * @rate:    legacy rate of the frame or NULL, used for the radiotap header
+ *
+ * Returns the frame to be processed by the regular RX path, or NULL if
+ * there is nothing left to process (the frame was dropped, or it was a
+ * frame the regular path must not see and was consumed by the monitor
+ * interfaces only).
+ */
+static struct sk_buff *
+ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
+		     struct ieee80211_rate *rate)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb);
+	struct ieee80211_sub_if_data *sdata;
+	int needed_headroom = 0;
+	struct sk_buff *skb, *skb2;
+	struct net_device *prev_dev = NULL;
+	int present_fcs_len = 0;
+
+	/*
+	 * First, we may need to make a copy of the skb because
+	 *  (1) we need to modify it for radiotap (if not present), and
+	 *  (2) the other RX handlers will modify the skb we got.
+	 *
+	 * We don't need to, of course, if we aren't going to return
+	 * the SKB because it has a bad FCS/PLCP checksum.
+	 */
+
+	/* room for the radiotap header based on driver features */
+	needed_headroom = ieee80211_rx_radiotap_len(local, status);
+
+	if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
+		present_fcs_len = FCS_LEN;
+
+	/* make sure hdr->frame_control is on the linear part */
+	if (!pskb_may_pull(origskb, 2)) {
+		dev_kfree_skb(origskb);
+		return NULL;
+	}
+
+	/* fast path: no monitor interfaces, just filter and strip the FCS */
+	if (!local->monitors) {
+		if (should_drop_frame(origskb, present_fcs_len)) {
+			dev_kfree_skb(origskb);
+			return NULL;
+		}
+
+		return remove_monitor_info(local, origskb);
+	}
+
+	if (should_drop_frame(origskb, present_fcs_len)) {
+		/*
+		 * The regular RX path won't get this frame, so the original
+		 * skb itself can be given to the monitor interfaces.
+		 */
+		/* only need to expand headroom if necessary */
+		skb = origskb;
+		origskb = NULL;
+
+		/*
+		 * This shouldn't trigger often because most devices have an
+		 * RX header they pull before we get here, and that should
+		 * be big enough for our radiotap information. We should
+		 * probably export the length to drivers so that we can have
+		 * them allocate enough headroom to start with.
+		 */
+		if (skb_headroom(skb) < needed_headroom &&
+		    pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) {
+			dev_kfree_skb(skb);
+			return NULL;
+		}
+	} else {
+		/*
+		 * Need to make a copy and possibly remove radiotap header
+		 * and FCS from the original.
+		 */
+		skb = skb_copy_expand(origskb, needed_headroom, 0, GFP_ATOMIC);
+
+		origskb = remove_monitor_info(local, origskb);
+
+		/* copy failed: skip monitoring, still process the original */
+		if (!skb)
+			return origskb;
+	}
+
+	/* prepend radiotap information */
+	ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom);
+
+	skb_reset_mac_header(skb);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb->pkt_type = PACKET_OTHERHOST;
+	skb->protocol = htons(ETH_P_802_2);
+
+	/*
+	 * Deliver to every running, non-"cooked" monitor interface. Delivery
+	 * is delayed by one interface so that all but the last one receive a
+	 * clone while the last one receives skb itself.
+	 */
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+			continue;
+
+		if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)
+			continue;
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		if (prev_dev) {
+			skb2 = skb_clone(skb, GFP_ATOMIC);
+			if (skb2) {
+				skb2->dev = prev_dev;
+				netif_receive_skb(skb2);
+			}
+		}
+
+		prev_dev = sdata->dev;
+		sdata->dev->stats.rx_packets++;
+		sdata->dev->stats.rx_bytes += skb->len;
+	}
+
+	/* hand skb to the last matching interface, or free it if none */
+	if (prev_dev) {
+		skb->dev = prev_dev;
+		netif_receive_skb(skb);
+	} else
+		dev_kfree_skb(skb);
+
+	return origskb;
+}
+
+
+/*
+ * Derive the RX queue and the skb priority of a frame from its QoS
+ * control field.
+ *
+ * For QoS data frames the TID is taken from the QoS control field and
+ * IEEE80211_RX_AMSDU is set in the RX status flags if the field announces
+ * an A-MSDU. All other frames are assigned the last RX queue
+ * (NUM_RX_DATA_QUEUES - 1).
+ */
+static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	/*
+	 * Default for frames without a qos control field.
+	 *
+	 * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
+	 *
+	 *	Sequence numbers for management frames, QoS data
+	 *	frames with a broadcast/multicast address in the
+	 *	Address 1 field, and all non-QoS data frames sent
+	 *	by QoS STAs are assigned using an additional single
+	 *	modulo-4096 counter, [...]
+	 *
+	 * We also use that counter for non-QoS STAs.
+	 */
+	int tid = NUM_RX_DATA_QUEUES - 1;
+
+	/* does the frame have a qos control field? */
+	if (ieee80211_is_data_qos(hdr->frame_control)) {
+		u8 qos = *ieee80211_get_qos_ctl(hdr);
+
+		tid = qos & IEEE80211_QOS_CTL_TID_MASK;
+		if (qos & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)
+			status->rx_flags |= IEEE80211_RX_AMSDU;
+	}
+
+	rx->queue = tid;
+
+	/* Set skb->priority to 1d tag if highest order bit of TID is not set.
+	 * For now, set skb->priority to 0 for other cases. */
+	if (tid > 7)
+		skb->priority = 0;
+	else
+		skb->priority = tid;
+}
+
+/**
+ * DOC: Packet alignment
+ *
+ * Drivers always need to pass packets that are aligned to two-byte boundaries
+ * to the stack.
+ *
+ * Additionally, drivers should, if possible, align the payload data in a way
+ * that guarantees that the contained IP header is aligned to a four-byte
+ * boundary. In the case of regular frames, this simply means aligning the
+ * payload to a four-byte boundary (because either the IP header is directly
+ * contained, or IV/RFC1042 headers that have a length divisible by four are
+ * in front of it).  If the payload data is not properly aligned and the
+ * architecture doesn't support efficient unaligned operations, mac80211
+ * will align the data.
+ *
+ * With A-MSDU frames, however, the payload data address must yield two modulo
+ * four because there are 14-byte 802.3 headers within the A-MSDU frames that
+ * push the IP header further back to a multiple of four again. Thankfully, the
+ * specs were sane enough this time around to require padding each A-MSDU
+ * subframe to a length that is a multiple of four.
+ *
+ * Padding like Atheros hardware adds which is between the 802.11 header and
+ * the payload is not supported, the driver is required to move the 802.11
+ * header to be directly in front of the payload in that case.
+ */
+static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx)
+{
+	/*
+	 * Only checked in verbose debug builds: warn (once) if the frame
+	 * data does not start on a two-byte boundary.
+	 */
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	WARN_ONCE((unsigned long)rx->skb->data & 1,
+		  "unaligned packet at 0x%p\n", rx->skb->data);
+#endif
+}
+
+
+/* rx handlers */
+
+/*
+ * RX handler: divert frames received while scanning to the scan code.
+ *
+ * Frames that are not marked IEEE80211_RX_IN_SCAN while no scheduled scan
+ * is running simply continue through the handler chain. Otherwise the
+ * frame is passed to ieee80211_scan_rx() as long as a hardware, software
+ * or scheduled scan is still active, and dropped as unusable if the scan
+ * has ended in the meantime.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+	struct ieee80211_local *local = rx->local;
+
+	if (likely(!(status->rx_flags & IEEE80211_RX_IN_SCAN) &&
+		   !local->sched_scanning))
+		return RX_CONTINUE;
+
+	if (!test_bit(SCAN_HW_SCANNING, &local->scanning) &&
+	    !test_bit(SCAN_SW_SCANNING, &local->scanning) &&
+	    !local->sched_scanning) {
+		/* scanning finished during invoking of handlers */
+		I802_DEBUG_INC(local->rx_handlers_drop_passive_scan);
+		return RX_DROP_UNUSABLE;
+	}
+
+	return ieee80211_scan_rx(rx->sdata, rx->skb);
+}
+
+
+/*
+ * Returns non-zero if @skb holds a robust management frame whose first
+ * address is not a multicast address. Frames shorter than 24 bytes never
+ * qualify.
+ */
+static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+	if (skb->len < 24)
+		return 0;
+
+	if (is_multicast_ether_addr(hdr->addr1))
+		return 0;
+
+	return ieee80211_is_robust_mgmt_frame(hdr);
+}
+
+
+/*
+ * Returns non-zero if @skb holds a robust management frame whose first
+ * address is a multicast address. Frames shorter than 24 bytes never
+ * qualify.
+ */
+static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+	if (skb->len < 24)
+		return 0;
+
+	if (!is_multicast_ether_addr(hdr->addr1))
+		return 0;
+
+	return ieee80211_is_robust_mgmt_frame(hdr);
+}
+
+
+/*
+ * Get the BIP key index from MMIE; return -1 if this is not a BIP frame.
+ *
+ * A frame qualifies if it is long enough to hold a 24-byte header plus an
+ * MMIE, is addressed to a multicast destination, is a robust management
+ * frame, and its last sizeof(struct ieee80211_mmie) bytes carry a
+ * well-formed MMIE (matching element id and length).
+ */
+static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) skb->data;
+	struct ieee80211_mmie *mmie;
+
+	if (skb->len < 24 + sizeof(*mmie))
+		return -1;
+
+	if (!is_multicast_ether_addr(mgmt->da))
+		return -1;
+
+	/* not a robust management frame */
+	if (!ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *) mgmt))
+		return -1;
+
+	/* The MMIE, if present, sits at the very end of the frame. */
+	mmie = (struct ieee80211_mmie *)
+		(skb->data + skb->len - sizeof(*mmie));
+
+	if (mmie->element_id == WLAN_EID_MMIE &&
+	    mmie->length == sizeof(*mmie) - 2)
+		return le16_to_cpu(mmie->key_id);
+
+	return -1;
+}
+
+
+/*
+ * Frame filter applied to frames received on mesh interfaces (called from
+ * ieee80211_rx_h_check()).
+ *
+ * Returns RX_CONTINUE for frames that may be processed further and
+ * RX_DROP_MONITOR for everything else:
+ *  - data frames with an unexpected ToDS/FromDS/4-address combination or
+ *    carrying our own address in addr3 (multicast) / addr4 (unicast),
+ *  - frames from stations without an established peer link, unless they
+ *    are mesh / self-protected action frames, probe requests/responses,
+ *    beacons or authentication frames,
+ *  - multicast data frames for which mesh_rmc_check() returns non-zero.
+ */
+static ieee80211_rx_result
+ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	unsigned int hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	char *dev_addr = rx->sdata->vif.addr;
+
+	if (ieee80211_is_data(hdr->frame_control)) {
+		if (is_multicast_ether_addr(hdr->addr1)) {
+			if (ieee80211_has_tods(hdr->frame_control) ||
+				!ieee80211_has_fromds(hdr->frame_control))
+				return RX_DROP_MONITOR;
+			if (memcmp(hdr->addr3, dev_addr, ETH_ALEN) == 0)
+				return RX_DROP_MONITOR;
+		} else {
+			if (!ieee80211_has_a4(hdr->frame_control))
+				return RX_DROP_MONITOR;
+			if (memcmp(hdr->addr4, dev_addr, ETH_ALEN) == 0)
+				return RX_DROP_MONITOR;
+		}
+	}
+
+	/* If there is not an established peer link and this is not a peer link
+	 * establishment frame, beacon or probe, drop the frame.
+	 */
+
+	if (!rx->sta || sta_plink_state(rx->sta) != NL80211_PLINK_ESTAB) {
+		struct ieee80211_mgmt *mgmt;
+
+		if (!ieee80211_is_mgmt(hdr->frame_control))
+			return RX_DROP_MONITOR;
+
+		if (ieee80211_is_action(hdr->frame_control)) {
+			u8 category;
+			mgmt = (struct ieee80211_mgmt *)hdr;
+			category = mgmt->u.action.category;
+			if (category != WLAN_CATEGORY_MESH_ACTION &&
+				category != WLAN_CATEGORY_SELF_PROTECTED)
+				return RX_DROP_MONITOR;
+			return RX_CONTINUE;
+		}
+
+		if (ieee80211_is_probe_req(hdr->frame_control) ||
+		    ieee80211_is_probe_resp(hdr->frame_control) ||
+		    ieee80211_is_beacon(hdr->frame_control) ||
+		    ieee80211_is_auth(hdr->frame_control))
+			return RX_CONTINUE;
+
+		return RX_DROP_MONITOR;
+
+	}
+
+/* the mesh header is located directly behind the 802.11 header */
+#define msh_h_get(h, l) ((struct ieee80211s_hdr *) ((u8 *)h + l))
+
+	if (ieee80211_is_data(hdr->frame_control) &&
+	    is_multicast_ether_addr(hdr->addr1) &&
+	    mesh_rmc_check(hdr->addr3, msh_h_get(hdr, hdrlen), rx->sdata))
+		return RX_DROP_MONITOR;
+#undef msh_h_get
+
+	return RX_CONTINUE;
+}
+
+/* 802.11 sequence numbers are 12 bits wide and wrap around at 4096. */
+#define SEQ_MODULO 0x1000
+#define SEQ_MASK   0xfff
+
+/*
+ * Returns non-zero if @sq1 precedes @sq2 in modulo-4096 sequence space,
+ * i.e. if the wrapped distance from @sq2 up to @sq1 is more than half the
+ * sequence space.
+ */
+static inline int seq_less(u16 sq1, u16 sq2)
+{
+	u16 distance = (sq1 - sq2) & SEQ_MASK;
+
+	return distance > (SEQ_MODULO >> 1);
+}
+
+/* Returns the sequence number following @sq, wrapping at SEQ_MODULO. */
+static inline u16 seq_inc(u16 sq)
+{
+	u16 next = (sq + 1) & SEQ_MASK;
+
+	return next;
+}
+
+/* Returns the distance from @sq2 up to @sq1 in modulo-4096 space. */
+static inline u16 seq_sub(u16 sq1, u16 sq2)
+{
+	u16 distance = (sq1 - sq2) & SEQ_MASK;
+
+	return distance;
+}
+
+
+/*
+ * Release the reorder buffer slot @index and advance the head sequence
+ * number by one.
+ *
+ * If the slot holds a frame, it is removed from the reorder buffer, marked
+ * with IEEE80211_RX_DEFERRED_RELEASE and appended to local->rx_skb_queue.
+ * The head sequence number is incremented whether or not the slot was
+ * occupied.
+ *
+ * The caller must hold tid_agg_rx->reorder_lock.
+ */
+static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw,
+					    struct tid_ampdu_rx *tid_agg_rx,
+					    int index)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sk_buff *skb = tid_agg_rx->reorder_buf[index];
+
+	lockdep_assert_held(&tid_agg_rx->reorder_lock);
+
+	if (skb) {
+		struct ieee80211_rx_status *status;
+
+		/* release the frame from the reorder ring buffer */
+		tid_agg_rx->stored_mpdu_num--;
+		tid_agg_rx->reorder_buf[index] = NULL;
+
+		status = IEEE80211_SKB_RXCB(skb);
+		status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE;
+		skb_queue_tail(&local->rx_skb_queue, skb);
+	}
+
+	tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num);
+}
+
+/*
+ * Release every reorder buffer slot in front of @head_seq_num, one at a
+ * time, until the session's head sequence number has caught up with it.
+ *
+ * The caller must hold tid_agg_rx->reorder_lock.
+ */
+static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw,
+					     struct tid_ampdu_rx *tid_agg_rx,
+					     u16 head_seq_num)
+{
+	lockdep_assert_held(&tid_agg_rx->reorder_lock);
+
+	while (seq_less(tid_agg_rx->head_seq_num, head_seq_num)) {
+		/* ring slot that belongs to the current head */
+		int slot = seq_sub(tid_agg_rx->head_seq_num,
+				   tid_agg_rx->ssn) % tid_agg_rx->buf_size;
+
+		ieee80211_release_reorder_frame(hw, tid_agg_rx, slot);
+	}
+}
+
+/*
+ * Timeout (in jiffies) for skb's that are waiting in the RX reorder buffer. If
+ * the skb was added to the buffer longer than this time ago, the earlier
+ * frames that have not yet been received are assumed to be lost and the skb
+ * can be released for processing. This may also release other skb's from the
+ * reorder buffer if there are no additional gaps between the frames.
+ *
+ * Callers must hold tid_agg_rx->reorder_lock.
+ */
+#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)
+
+/*
+ * Release whatever can be released from the reorder buffer and re-arm or
+ * stop the reorder timer.
+ *
+ * If the slot at the head of the window is occupied, frames are released
+ * in order up to the next missing frame. If it is empty but frames are
+ * stored further back, stored frames are released for as long as the first
+ * frame behind each gap has been waiting longer than
+ * HT_RX_REORDER_BUF_TIMEOUT; the head is moved past the skipped slots.
+ *
+ * Afterwards the timer is set to expire just after the timeout of the
+ * next stored frame, or deleted if the buffer is empty.
+ */
+static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw,
+					  struct tid_ampdu_rx *tid_agg_rx)
+{
+	int index, j;
+
+	lockdep_assert_held(&tid_agg_rx->reorder_lock);
+
+	/* release the buffer until next missing frame */
+	index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
+						tid_agg_rx->buf_size;
+	if (!tid_agg_rx->reorder_buf[index] &&
+	    tid_agg_rx->stored_mpdu_num) {
+		/*
+		 * No buffers ready to be released, but check whether any
+		 * frames in the reorder buffer have timed out.
+		 */
+		/* counts the empty slots since the last release (head incl.) */
+		int skipped = 1;
+		for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
+		     j = (j + 1) % tid_agg_rx->buf_size) {
+			if (!tid_agg_rx->reorder_buf[j]) {
+				skipped++;
+				continue;
+			}
+			/*
+			 * First frame behind a gap that has not timed out
+			 * yet: stop here and arm the timer for this frame
+			 * (j is used by the code at set_release_timer).
+			 */
+			if (skipped &&
+			    !time_after(jiffies, tid_agg_rx->reorder_time[j] +
+					HT_RX_REORDER_BUF_TIMEOUT))
+				goto set_release_timer;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+			if (net_ratelimit())
+				wiphy_debug(hw->wiphy,
+					    "release an RX reorder frame due to timeout on earlier frames\n");
+#endif
+			ieee80211_release_reorder_frame(hw, tid_agg_rx, j);
+
+			/*
+			 * Increment the head seq# also for the skipped slots.
+			 */
+			tid_agg_rx->head_seq_num =
+				(tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
+			skipped = 0;
+		}
+	} else while (tid_agg_rx->reorder_buf[index]) {
+		/* in-order frames: release and follow the advancing head */
+		ieee80211_release_reorder_frame(hw, tid_agg_rx, index);
+		index =	seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
+							tid_agg_rx->buf_size;
+	}
+
+	if (tid_agg_rx->stored_mpdu_num) {
+		j = index = seq_sub(tid_agg_rx->head_seq_num,
+				    tid_agg_rx->ssn) % tid_agg_rx->buf_size;
+
+		/*
+		 * Find the first occupied slot starting from the head.
+		 *
+		 * NOTE(review): index is an int, so for index == 0 the bound
+		 * below is presumably negative (assuming buf_size promotes
+		 * to int) and never equals j; the loop then terminates only
+		 * through the break, relying on stored_mpdu_num != 0
+		 * implying an occupied slot -- confirm.
+		 */
+		for (; j != (index - 1) % tid_agg_rx->buf_size;
+		     j = (j + 1) % tid_agg_rx->buf_size) {
+			if (tid_agg_rx->reorder_buf[j])
+				break;
+		}
+
+ set_release_timer:
+
+		/* expire one jiffy after the timeout of the frame in slot j */
+		mod_timer(&tid_agg_rx->reorder_timer,
+			  tid_agg_rx->reorder_time[j] + 1 +
+			  HT_RX_REORDER_BUF_TIMEOUT);
+	} else {
+		del_timer(&tid_agg_rx->reorder_timer);
+	}
+}
+
+/*
+ * As this function belongs to the RX path it must be under
+ * rcu_read_lock protection. It returns false if the frame
+ * can be processed immediately, true if it was consumed.
+ *
+ * "Consumed" covers three cases: the frame was stored in the reorder
+ * buffer, or it was freed because its sequence number lies before the
+ * current window, or it was freed because its slot is already occupied
+ * (duplicate).
+ *
+ * The whole operation runs under tid_agg_rx->reorder_lock, which is taken
+ * and released here.
+ */
+static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
+					     struct tid_ampdu_rx *tid_agg_rx,
+					     struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	u16 sc = le16_to_cpu(hdr->seq_ctrl);
+	u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
+	u16 head_seq_num, buf_size;
+	int index;
+	bool ret = true;
+
+	spin_lock(&tid_agg_rx->reorder_lock);
+
+	buf_size = tid_agg_rx->buf_size;
+	head_seq_num = tid_agg_rx->head_seq_num;
+
+	/* frame with out of date sequence number */
+	if (seq_less(mpdu_seq_num, head_seq_num)) {
+		dev_kfree_skb(skb);
+		goto out;
+	}
+
+	/*
+	 * If frame the sequence number exceeds our buffering window
+	 * size release some previous frames to make room for this one.
+	 */
+	if (!seq_less(mpdu_seq_num, head_seq_num + buf_size)) {
+		/* new head: this frame becomes the last one in the window */
+		head_seq_num = seq_inc(seq_sub(mpdu_seq_num, buf_size));
+		/* release stored frames up to new head to stack */
+		ieee80211_release_reorder_frames(hw, tid_agg_rx, head_seq_num);
+	}
+
+	/* Now the new frame is always in the range of the reordering buffer */
+
+	index = seq_sub(mpdu_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size;
+
+	/* check if we already stored this frame */
+	if (tid_agg_rx->reorder_buf[index]) {
+		dev_kfree_skb(skb);
+		goto out;
+	}
+
+	/*
+	 * If the current MPDU is in the right order and nothing else
+	 * is stored we can process it directly, no need to buffer it.
+	 * If it is first but there's something stored, we may be able
+	 * to release frames after this one.
+	 */
+	if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
+	    tid_agg_rx->stored_mpdu_num == 0) {
+		tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num);
+		ret = false;
+		goto out;
+	}
+
+	/* put the frame in the reordering buffer */
+	tid_agg_rx->reorder_buf[index] = skb;
+	tid_agg_rx->reorder_time[index] = jiffies;
+	tid_agg_rx->stored_mpdu_num++;
+	/* the new frame may have closed a gap at the head of the window */
+	ieee80211_sta_reorder_release(hw, tid_agg_rx);
+
+ out:
+	spin_unlock(&tid_agg_rx->reorder_lock);
+	return ret;
+}
+
+/*
+ * Reorder MPDUs from A-MPDUs, keeping them on a buffer.
+ *
+ * Nothing is returned; the frame in rx->skb always ends up in one of
+ * these places:
+ *  - local->rx_skb_queue, if it does not take part in reordering (not a
+ *    QoS data frame, unknown station, no RX aggregation session for the
+ *    TID, QoS null frame) or if it can be processed right away,
+ *  - the interface's skb_queue for the sdata work, if it is fragmented,
+ *  - the reorder buffer (or it is freed) via
+ *    ieee80211_sta_manage_reorder_buf().
+ */
+static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_hw *hw = &local->hw;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct sta_info *sta = rx->sta;
+	struct tid_ampdu_rx *tid_agg_rx;
+	u16 sc;
+	int tid;
+
+	if (!ieee80211_is_data_qos(hdr->frame_control))
+		goto dont_reorder;
+
+	/*
+	 * filter the QoS data rx stream according to
+	 * STA/TID and check if this STA/TID is on aggregation
+	 */
+
+	if (!sta)
+		goto dont_reorder;
+
+	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+
+	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+	if (!tid_agg_rx)
+		goto dont_reorder;
+
+	/* qos null data frames are excluded */
+	if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
+		goto dont_reorder;
+
+	/* new, potentially un-ordered, ampdu frame - process it */
+
+	/* reset session timer */
+	if (tid_agg_rx->timeout)
+		mod_timer(&tid_agg_rx->session_timer,
+			  TU_TO_EXP_TIME(tid_agg_rx->timeout));
+
+	/* if this mpdu is fragmented - terminate rx aggregation session */
+	sc = le16_to_cpu(hdr->seq_ctrl);
+	if (sc & IEEE80211_SCTL_FRAG) {
+		/* defer to the interface work via its skb queue */
+		skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+		skb_queue_tail(&rx->sdata->skb_queue, skb);
+		ieee80211_queue_work(&local->hw, &rx->sdata->work);
+		return;
+	}
+
+	/*
+	 * No locking needed -- we will only ever process one
+	 * RX packet at a time, and thus own tid_agg_rx. All
+	 * other code manipulating it needs to (and does) make
+	 * sure that we cannot get to it any more before doing
+	 * anything with it.
+	 *
+	 * (The reorder buffer itself is protected by reorder_lock, which
+	 * ieee80211_sta_manage_reorder_buf() takes on its own.)
+	 */
+	if (ieee80211_sta_manage_reorder_buf(hw, tid_agg_rx, skb))
+		return;
+
+ dont_reorder:
+	skb_queue_tail(&local->rx_skb_queue, skb);
+}
+
+/*
+ * RX handler: basic sanity checks on a received frame.
+ *
+ * Returns RX_DROP_UNUSABLE for duplicate retransmissions, RX_DROP_MONITOR
+ * for frames shorter than 16 bytes and for data / PS-Poll frames from
+ * stations that are not associated (except on ad-hoc and WDS interfaces),
+ * and RX_CONTINUE otherwise. On mesh interfaces the final decision is
+ * delegated to ieee80211_rx_mesh_check().
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	/* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */
+	if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) {
+		/*
+		 * A retry carrying the same sequence control value as the
+		 * last frame seen on this queue is a duplicate; any other
+		 * frame updates the remembered value.
+		 */
+		if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
+			     rx->sta->last_seq_ctrl[rx->queue] ==
+			     hdr->seq_ctrl)) {
+			/* only count duplicates that were addressed to us */
+			if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
+				rx->local->dot11FrameDuplicateCount++;
+				rx->sta->num_duplicates++;
+			}
+			return RX_DROP_UNUSABLE;
+		} else
+			rx->sta->last_seq_ctrl[rx->queue] = hdr->seq_ctrl;
+	}
+
+	if (unlikely(rx->skb->len < 16)) {
+		I802_DEBUG_INC(rx->local->rx_handlers_drop_short);
+		return RX_DROP_MONITOR;
+	}
+
+	/* Drop disallowed frame classes based on STA auth/assoc state;
+	 * IEEE 802.11, Chap 5.5.
+	 *
+	 * mac80211 filters only based on association state, i.e. it drops
+	 * Class 3 frames from not associated stations. hostapd sends
+	 * deauth/disassoc frames when needed. In addition, hostapd is
+	 * responsible for filtering on both auth and assoc states.
+	 */
+
+	if (ieee80211_vif_is_mesh(&rx->sdata->vif))
+		return ieee80211_rx_mesh_check(rx);
+
+	if (unlikely((ieee80211_is_data(hdr->frame_control) ||
+		      ieee80211_is_pspoll(hdr->frame_control)) &&
+		     rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+		     rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
+		     (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC))))
+		return RX_DROP_MONITOR;
+
+	return RX_CONTINUE;
+}
+
+
+/*
+ * RX handler: pick the key that applies to a received frame and, when the
+ * frame is protected and the driver has not reported it as decrypted with
+ * the IV stripped, run the matching software decrypt routine.
+ *
+ * Frames without IEEE80211_RX_RA_MATCH are passed on untouched.  For all
+ * other frames rx->key is first reset to NULL and then set to the
+ * selected key, if any.  Returns RX_CONTINUE when no software decryption
+ * is attempted, RX_DROP_MONITOR when no key could be selected or the BIP
+ * key index is out of range, RX_DROP_UNUSABLE for too-short frames,
+ * linearization failure or an unhandled cipher, and otherwise the result
+ * of the cipher-specific decrypt routine.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	int keyidx;
+	int hdrlen;
+	ieee80211_rx_result result = RX_DROP_UNUSABLE;
+	struct ieee80211_key *sta_ptk = NULL;
+	int mmie_keyidx = -1;
+	__le16 fc;
+
+	/*
+	 * Key selection 101
+	 *
+	 * There are four types of keys:
+	 *  - GTK (group keys)
+	 *  - IGTK (group keys for management frames)
+	 *  - PTK (pairwise keys)
+	 *  - STK (station-to-station pairwise keys)
+	 *
+	 * When selecting a key, we have to distinguish between multicast
+	 * (including broadcast) and unicast frames, the latter can only
+	 * use PTKs and STKs while the former always use GTKs and IGTKs.
+	 * Unless, of course, actual WEP keys ("pre-RSNA") are used, then
+	 * unicast frames can also use key indices like GTKs. Hence, if we
+	 * don't have a PTK/STK we check the key index for a WEP key.
+	 *
+	 * Note that in a regular BSS, multicast frames are sent by the
+	 * AP only, associated stations unicast the frame to the AP first
+	 * which then multicasts it on their behalf.
+	 *
+	 * There is also a slight problem in IBSS mode: GTKs are negotiated
+	 * with each station, that is something we don't currently handle.
+	 * The spec seems to expect that one negotiates the same key with
+	 * every station but there's no such requirement; VLANs could be
+	 * possible.
+	 */
+
+	/*
+	 * No point in finding a key and decrypting if the frame is neither
+	 * addressed to us nor a multicast frame.
+	 */
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+		return RX_CONTINUE;
+
+	/* start without a key */
+	rx->key = NULL;
+
+	if (rx->sta)
+		sta_ptk = rcu_dereference(rx->sta->ptk);
+
+	fc = hdr->frame_control;
+
+	/* an MMIE key index is only looked for in unprotected frames */
+	if (!ieee80211_has_protected(fc))
+		mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
+
+	if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
+		/* unicast frame from a station we hold a pairwise key for */
+		rx->key = sta_ptk;
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
+		/* Skip decryption if the frame is not protected. */
+		if (!ieee80211_has_protected(fc))
+			return RX_CONTINUE;
+	} else if (mmie_keyidx >= 0) {
+		/* Broadcast/multicast robust management frame / BIP */
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
+
+		if (mmie_keyidx < NUM_DEFAULT_KEYS ||
+		    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+		/* prefer a per-station key, fall back to the interface key */
+		if (rx->sta)
+			rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
+		if (!rx->key)
+			rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
+	} else if (!ieee80211_has_protected(fc)) {
+		/*
+		 * The frame was not protected, so skip decryption. However, we
+		 * need to set rx->key if there is a key that could have been
+		 * used so that the frame may be dropped if encryption would
+		 * have been expected.
+		 */
+		struct ieee80211_key *key = NULL;
+		struct ieee80211_sub_if_data *sdata = rx->sdata;
+		int i;
+
+		if (ieee80211_is_mgmt(fc) &&
+		    is_multicast_ether_addr(hdr->addr1) &&
+		    (key = rcu_dereference(rx->sdata->default_mgmt_key)))
+			rx->key = key;
+		else {
+			/* first configured per-station group key, if any */
+			if (rx->sta) {
+				for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+					key = rcu_dereference(rx->sta->gtk[i]);
+					if (key)
+						break;
+				}
+			}
+			/* otherwise the first configured interface key */
+			if (!key) {
+				for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
+					key = rcu_dereference(sdata->keys[i]);
+					if (key)
+						break;
+				}
+			}
+			if (key)
+				rx->key = key;
+		}
+		return RX_CONTINUE;
+	} else {
+		u8 keyid;
+		/*
+		 * The device doesn't give us the IV so we won't be
+		 * able to look up the key. That's ok though, we
+		 * don't need to decrypt the frame, we just won't
+		 * be able to keep statistics accurate.
+		 * Except for key threshold notifications, should
+		 * we somehow allow the driver to tell us which key
+		 * the hardware used if this flag is set?
+		 */
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
+
+		hdrlen = ieee80211_hdrlen(fc);
+
+		if (rx->skb->len < 8 + hdrlen)
+			return RX_DROP_UNUSABLE; /* TODO: count this? */
+
+		/*
+		 * no need to call ieee80211_wep_get_keyidx,
+		 * it verifies a bunch of things we've done already
+		 */
+		/* key index is the top two bits of the byte at hdrlen + 3 */
+		skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
+		keyidx = keyid >> 6;
+
+		/* check per-station GTK first, if multicast packet */
+		if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
+			rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
+
+		/* if not found, try default key */
+		if (!rx->key) {
+			rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+
+			/*
+			 * RSNA-protected unicast frames should always be
+			 * sent with pairwise or station-to-station keys,
+			 * but for WEP we allow using a key index as well.
+			 */
+			if (rx->key &&
+			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
+			    !is_multicast_ether_addr(hdr->addr1))
+				rx->key = NULL;
+		}
+	}
+
+	if (rx->key) {
+		rx->key->tx_rx_count++;
+		/* TODO: add threshold stuff again */
+	} else {
+		return RX_DROP_MONITOR;
+	}
+
+	if (skb_linearize(rx->skb))
+		return RX_DROP_UNUSABLE;
+	/* the hdr variable is invalid now! */
+
+	/* dispatch to the software decrypt routine for the key's cipher */
+	switch (rx->key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		/* Check for weak IVs if possible */
+		if (rx->sta && ieee80211_is_data(fc) &&
+		    (!(status->flag & RX_FLAG_IV_STRIPPED) ||
+		     !(status->flag & RX_FLAG_DECRYPTED)) &&
+		    ieee80211_wep_is_weak_iv(rx->skb, rx->key))
+			rx->sta->wep_weak_iv_count++;
+
+		result = ieee80211_crypto_wep_decrypt(rx);
+		break;
+	case WLAN_CIPHER_SUITE_TKIP:
+		result = ieee80211_crypto_tkip_decrypt(rx);
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		result = ieee80211_crypto_ccmp_decrypt(rx);
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		result = ieee80211_crypto_aes_cmac_decrypt(rx);
+		break;
+	default:
+		/*
+		 * We can reach here only with HW-only algorithms
+		 * but why didn't it decrypt the frame?!
+		 */
+		return RX_DROP_UNUSABLE;
+	}
+
+	/* either the frame has been decrypted or will be dropped */
+	status->flag |= RX_FLAG_DECRYPTED;
+
+	return result;
+}
+
+/*
+ * RX handler: while a PS-Poll exchange is in progress (local->pspolling),
+ * look at the More Data bit of data frames carrying the FromDS bit.  If
+ * it is set another PS-Poll is sent, otherwise polling is marked as
+ * finished.  The frame itself always continues down the RX path.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data;
+	__le16 fc;
+
+	if (!local->pspolling)
+		return RX_CONTINUE;
+
+	fc = hdr->frame_control;
+
+	/* only data frames from the AP (FromDS set) are of interest */
+	if (!ieee80211_has_fromds(fc) || !ieee80211_is_data(fc))
+		return RX_CONTINUE;
+
+	if (ieee80211_has_moredata(fc)) {
+		/* more data bit is set, request the next frame from the AP */
+		ieee80211_send_pspoll(local, rx->sdata);
+	} else {
+		/* AP has no more frames buffered for us */
+		local->pspolling = false;
+	}
+
+	return RX_CONTINUE;
+}
+
+/*
+ * Mark @sta as being in power save mode: bump the BSS count of sleeping
+ * stations, set WLAN_STA_PS_STA and, unless the hardware has
+ * IEEE80211_HW_AP_LINK_PS set, notify the driver with STA_NOTIFY_SLEEP.
+ */
+static void ap_sta_ps_start(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	bool hw_link_ps = !!(local->hw.flags & IEEE80211_HW_AP_LINK_PS);
+
+	atomic_inc(&sdata->bss->num_sta_ps);
+	set_sta_flags(sta, WLAN_STA_PS_STA);
+
+	if (!hw_link_ps)
+		drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta);
+
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	printk(KERN_DEBUG "%s: STA %pM aid %d enters power save mode\n",
+	       sdata->name, sta->sta.addr, sta->sta.aid);
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+}
+
+/*
+ * Handle @sta leaving power save mode: drop the BSS count of sleeping
+ * stations and deliver the wakeup, unless WLAN_STA_PS_DRIVER is still
+ * set on the station, in which case nothing further is done here.
+ */
+static void ap_sta_ps_end(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+	atomic_dec(&sdata->bss->num_sta_ps);
+
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	printk(KERN_DEBUG "%s: STA %pM aid %d exits power save mode\n",
+	       sdata->name, sta->sta.addr, sta->sta.aid);
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+
+	if (!test_sta_flags(sta, WLAN_STA_PS_DRIVER)) {
+		ieee80211_sta_ps_deliver_wakeup(sta);
+		return;
+	}
+
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	printk(KERN_DEBUG "%s: STA %pM aid %d driver-ps-blocked\n",
+	       sdata->name, sta->sta.addr, sta->sta.aid);
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+}
+
+/*
+ * ieee80211_sta_ps_transition - report a station power save transition
+ * @sta: station whose state changed
+ * @start: true when the station enters power save, false when it leaves
+ *
+ * Warns if the hardware does not have IEEE80211_HW_AP_LINK_PS set.
+ * Returns 0 on success or -EINVAL if the station is already in the
+ * requested state.
+ */
+int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start)
+{
+	struct sta_info *sta_inf = container_of(sta, struct sta_info, sta);
+	bool sleeping;
+
+	WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS));
+
+	/* Don't let the same PS state be set twice */
+	sleeping = test_sta_flags(sta_inf, WLAN_STA_PS_STA);
+	if (start == sleeping)
+		return -EINVAL;
+
+	if (!start)
+		ap_sta_ps_end(sta_inf);
+	else
+		ap_sta_ps_start(sta_inf);
+
+	return 0;
+}
+EXPORT_SYMBOL(ieee80211_sta_ps_transition);
+
+/*
+ * RX handler: per-station bookkeeping for a received frame.
+ *
+ * Does nothing when the sender is unknown.  Otherwise it refreshes
+ * last_rx and the last data rate, and for frames with
+ * IEEE80211_RX_RA_MATCH updates the fragment/byte counters and signal
+ * statistics.  On AP and AP_VLAN interfaces without
+ * IEEE80211_HW_AP_LINK_PS it also tracks the station's power save state
+ * from the PM bit.  (QoS-)nullfunc frames are consumed here: either
+ * freed and reported as RX_QUEUED, or handed to the monitor path for
+ * 4-addr frames from stations not yet moved to a 4-addr VLAN.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
+{
+	struct sta_info *sta = rx->sta;
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+	if (!sta)
+		return RX_CONTINUE;
+
+	/*
+	 * Update last_rx only for IBSS packets which are for the current
+	 * BSSID to avoid keeping the current IBSS network alive in cases
+	 * where other STAs start using different BSSID.
+	 */
+	if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+		u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
+						NL80211_IFTYPE_ADHOC);
+		if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0) {
+			sta->last_rx = jiffies;
+			if (ieee80211_is_data(hdr->frame_control)) {
+				sta->last_rx_rate_idx = status->rate_idx;
+				sta->last_rx_rate_flag = status->flag;
+			}
+		}
+	} else if (!is_multicast_ether_addr(hdr->addr1)) {
+		/*
+		 * Mesh beacons will update last_rx if they are found to
+		 * match the current local configuration when processed.
+		 */
+		sta->last_rx = jiffies;
+		if (ieee80211_is_data(hdr->frame_control)) {
+			sta->last_rx_rate_idx = status->rate_idx;
+			sta->last_rx_rate_flag = status->flag;
+		}
+	}
+
+	/* everything below only applies to frames addressed to us */
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+		return RX_CONTINUE;
+
+	if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
+		ieee80211_sta_rx_notify(rx->sdata, hdr);
+
+	sta->rx_fragments++;
+	sta->rx_bytes += rx->skb->len;
+	sta->last_signal = status->signal;
+	/* the signal value is negated before being fed to the average */
+	ewma_add(&sta->avg_signal, -status->signal);
+
+	/*
+	 * Change STA power saving mode only at the end of a frame
+	 * exchange sequence.
+	 */
+	if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) &&
+	    !ieee80211_has_morefrags(hdr->frame_control) &&
+	    !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
+	    (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
+	     rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) {
+		if (test_sta_flags(sta, WLAN_STA_PS_STA)) {
+			/*
+			 * Ignore doze->wake transitions that are
+			 * indicated by non-data frames, the standard
+			 * is unclear here, but for example going to
+			 * PS mode and then scanning would cause a
+			 * doze->wake transition for the probe request,
+			 * and that is clearly undesirable.
+			 */
+			if (ieee80211_is_data(hdr->frame_control) &&
+			    !ieee80211_has_pm(hdr->frame_control))
+				ap_sta_ps_end(sta);
+		} else {
+			if (ieee80211_has_pm(hdr->frame_control))
+				ap_sta_ps_start(sta);
+		}
+	}
+
+	/*
+	 * Drop (qos-)data::nullfunc frames silently, since they
+	 * are used only to control station power saving mode.
+	 */
+	if (ieee80211_is_nullfunc(hdr->frame_control) ||
+	    ieee80211_is_qos_nullfunc(hdr->frame_control)) {
+		I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc);
+
+		/*
+		 * If we receive a 4-addr nullfunc frame from a STA
+		 * that was not moved to a 4-addr STA vlan yet, drop
+		 * the frame to the monitor interface, to make sure
+		 * that hostapd sees it
+		 */
+		if (ieee80211_has_a4(hdr->frame_control) &&
+		    (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
+		     (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+		      !rx->sdata->u.vlan.sta)))
+			return RX_DROP_MONITOR;
+		/*
+		 * Update counter and free packet here to avoid
+		 * counting this as a dropped packet.
+		 */
+		sta->rx_packets++;
+		dev_kfree_skb(rx->skb);
+		return RX_QUEUED;
+	}
+
+	return RX_CONTINUE;
+} /* ieee80211_rx_h_sta_process */
+
+/*
+ * Start a new reassembly in the next slot of the interface's fragment
+ * cache (used round-robin), discarding whatever was still pending in
+ * that slot.  The skb at *@skb is queued as the first fragment and *@skb
+ * is set to NULL.  Returns the initialized cache entry.
+ */
+static inline struct ieee80211_fragment_entry *
+ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
+			 unsigned int frag, unsigned int seq, int rx_queue,
+			 struct sk_buff **skb)
+{
+	int idx = sdata->fragment_next;
+	struct ieee80211_fragment_entry *entry = &sdata->fragments[idx];
+
+	/* advance the round-robin cursor, wrapping at the cache size */
+	sdata->fragment_next = idx + 1;
+	if (sdata->fragment_next >= IEEE80211_FRAGMENT_MAX)
+		sdata->fragment_next = 0;
+
+	if (!skb_queue_empty(&entry->skb_list)) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+		struct ieee80211_hdr *hdr =
+			(struct ieee80211_hdr *) entry->skb_list.next->data;
+		printk(KERN_DEBUG "%s: RX reassembly removed oldest "
+		       "fragment entry (idx=%d age=%lu seq=%d last_frag=%d "
+		       "addr1=%pM addr2=%pM\n",
+		       sdata->name, idx,
+		       jiffies - entry->first_frag_time, entry->seq,
+		       entry->last_frag, hdr->addr1, hdr->addr2);
+#endif
+		/* evict the stale, incomplete frame occupying this slot */
+		__skb_queue_purge(&entry->skb_list);
+	}
+
+	/* no need for locking */
+	__skb_queue_tail(&entry->skb_list, *skb);
+	*skb = NULL;
+
+	entry->first_frag_time = jiffies;
+	entry->seq = seq;
+	entry->rx_queue = rx_queue;
+	entry->last_frag = frag;
+	entry->ccmp = 0;
+	entry->extra_len = 0;
+
+	return entry;
+}
+
+/*
+ * Look up the pending reassembly that fragment number @frag of sequence
+ * @seq on @rx_queue belongs to.  The cache is walked backwards starting
+ * from the most recently added slot.  A slot matches when it expects
+ * exactly this fragment next and its first fragment has the same frame
+ * type, addr1 and addr2 as @hdr.  A matching entry older than two
+ * seconds is purged and skipped.  Returns the entry or NULL.
+ */
+static inline struct ieee80211_fragment_entry *
+ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
+			  unsigned int frag, unsigned int seq,
+			  int rx_queue, struct ieee80211_hdr *hdr)
+{
+	int slot = sdata->fragment_next;
+	int remaining;
+
+	for (remaining = IEEE80211_FRAGMENT_MAX; remaining > 0; remaining--) {
+		struct ieee80211_fragment_entry *entry;
+		struct ieee80211_hdr *first_hdr;
+
+		/* step back one slot, wrapping around at the start */
+		slot = (slot > 0) ? slot - 1 : IEEE80211_FRAGMENT_MAX - 1;
+		entry = &sdata->fragments[slot];
+
+		if (skb_queue_empty(&entry->skb_list))
+			continue;
+		if (entry->seq != seq || entry->rx_queue != rx_queue)
+			continue;
+		if (entry->last_frag + 1 != frag)
+			continue;
+
+		first_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data;
+
+		/*
+		 * Frame type and both addresses must be equal, otherwise
+		 * keep looking at the remaining slots.
+		 */
+		if ((hdr->frame_control ^ first_hdr->frame_control) &
+		    cpu_to_le16(IEEE80211_FCTL_FTYPE))
+			continue;
+		if (compare_ether_addr(hdr->addr1, first_hdr->addr1) != 0)
+			continue;
+		if (compare_ether_addr(hdr->addr2, first_hdr->addr2) != 0)
+			continue;
+
+		/* give up on reassemblies that have been pending too long */
+		if (time_after(jiffies, entry->first_frag_time + 2 * HZ)) {
+			__skb_queue_purge(&entry->skb_list);
+			continue;
+		}
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * RX handler: reassemble fragmented frames.
+ *
+ * Unfragmented frames, frames shorter than 24 bytes and multicast frames
+ * skip reassembly and are only accounted.  A first fragment opens a new
+ * entry in the interface's fragment cache; later fragments are appended
+ * to the matching entry (RX_QUEUED) until the one without the More
+ * Fragments bit arrives, at which point all payloads are copied behind
+ * the first fragment and the result continues down the RX path with
+ * IEEE80211_RX_FRAGMENTED set.  For CCMP the PN of each fragment must
+ * directly follow the previous one, otherwise the fragment is dropped.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr;
+	u16 sc;
+	__le16 fc;
+	unsigned int frag, seq;
+	struct ieee80211_fragment_entry *entry;
+	struct sk_buff *skb;
+	struct ieee80211_rx_status *status;
+
+	hdr = (struct ieee80211_hdr *)rx->skb->data;
+	fc = hdr->frame_control;
+	sc = le16_to_cpu(hdr->seq_ctrl);
+	frag = sc & IEEE80211_SCTL_FRAG;
+
+	if (likely((!ieee80211_has_morefrags(fc) && frag == 0) ||
+		   (rx->skb)->len < 24 ||
+		   is_multicast_ether_addr(hdr->addr1))) {
+		/* not fragmented */
+		goto out;
+	}
+	I802_DEBUG_INC(rx->local->rx_handlers_fragments);
+
+	if (skb_linearize(rx->skb))
+		return RX_DROP_UNUSABLE;
+
+	/*
+	 *  skb_linearize() might change the skb->data and
+	 *  previously cached variables (in this case, hdr) need to
+	 *  be refreshed with the new data.
+	 */
+	hdr = (struct ieee80211_hdr *)rx->skb->data;
+	seq = (sc & IEEE80211_SCTL_SEQ) >> 4;
+
+	if (frag == 0) {
+		/* This is the first fragment of a new frame. */
+		entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
+						 rx->queue, &(rx->skb));
+		if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP &&
+		    ieee80211_has_protected(fc)) {
+			int queue = ieee80211_is_mgmt(fc) ?
+				NUM_RX_DATA_QUEUES : rx->queue;
+			/* Store CCMP PN so that we can verify that the next
+			 * fragment has a sequential PN value. */
+			entry->ccmp = 1;
+			memcpy(entry->last_pn,
+			       rx->key->u.ccmp.rx_pn[queue],
+			       CCMP_PN_LEN);
+		}
+		return RX_QUEUED;
+	}
+
+	/* This is a fragment for a frame that should already be pending in
+	 * fragment cache. Add this fragment to the end of the pending entry.
+	 */
+	entry = ieee80211_reassemble_find(rx->sdata, frag, seq, rx->queue, hdr);
+	if (!entry) {
+		I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
+		return RX_DROP_MONITOR;
+	}
+
+	/* Verify that MPDUs within one MSDU have sequential PN values.
+	 * (IEEE 802.11i, 8.3.3.4.5) */
+	if (entry->ccmp) {
+		int i;
+		u8 pn[CCMP_PN_LEN], *rpn;
+		int queue;
+		if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP)
+			return RX_DROP_UNUSABLE;
+		/* expected PN = stored PN + 1, carrying across the bytes */
+		memcpy(pn, entry->last_pn, CCMP_PN_LEN);
+		for (i = CCMP_PN_LEN - 1; i >= 0; i--) {
+			pn[i]++;
+			if (pn[i])
+				break;
+		}
+		queue = ieee80211_is_mgmt(fc) ?
+			NUM_RX_DATA_QUEUES : rx->queue;
+		rpn = rx->key->u.ccmp.rx_pn[queue];
+		if (memcmp(pn, rpn, CCMP_PN_LEN))
+			return RX_DROP_UNUSABLE;
+		memcpy(entry->last_pn, pn, CCMP_PN_LEN);
+	}
+
+	/* only the first fragment keeps its 802.11 header */
+	skb_pull(rx->skb, ieee80211_hdrlen(fc));
+	__skb_queue_tail(&entry->skb_list, rx->skb);
+	entry->last_frag = frag;
+	entry->extra_len += rx->skb->len;
+	if (ieee80211_has_morefrags(fc)) {
+		rx->skb = NULL;
+		return RX_QUEUED;
+	}
+
+	/* last fragment: glue all payloads behind the first fragment */
+	rx->skb = __skb_dequeue(&entry->skb_list);
+	if (skb_tailroom(rx->skb) < entry->extra_len) {
+		I802_DEBUG_INC(rx->local->rx_expand_skb_head2);
+		if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len,
+					      GFP_ATOMIC))) {
+			I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
+			__skb_queue_purge(&entry->skb_list);
+			return RX_DROP_UNUSABLE;
+		}
+	}
+	while ((skb = __skb_dequeue(&entry->skb_list))) {
+		memcpy(skb_put(rx->skb, skb->len), skb->data, skb->len);
+		dev_kfree_skb(skb);
+	}
+
+	/* Complete frame has been reassembled - process it now */
+	status = IEEE80211_SKB_RXCB(rx->skb);
+	status->rx_flags |= IEEE80211_RX_FRAGMENTED;
+
+	/*
+	 * hdr still points into the last fragment, which was freed in the
+	 * loop above.  Re-point it at the reassembled frame (the first
+	 * fragment, whose header is intact) before it is used below.
+	 */
+	hdr = (struct ieee80211_hdr *)rx->skb->data;
+
+ out:
+	if (rx->sta)
+		rx->sta->rx_packets++;
+	if (is_multicast_ether_addr(hdr->addr1))
+		rx->local->dot11MulticastReceivedFrameCount++;
+	else
+		ieee80211_led_rx(rx->local);
+	return RX_CONTINUE;
+}
+
+/*
+ * RX handler: consume PS-Poll frames addressed to us from a known
+ * station.  On AP/AP_VLAN interfaces the poll response is delivered
+ * right away, or WLAN_STA_PSPOLL is set on the station if
+ * WLAN_STA_PS_DRIVER is currently set.  On any other interface type the
+ * frame is dropped.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	__le16 fc = hdr->frame_control;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	if (likely(!rx->sta || !ieee80211_is_pspoll(fc) ||
+		   !(status->rx_flags & IEEE80211_RX_RA_MATCH)))
+		return RX_CONTINUE;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+		break;
+	default:
+		return RX_DROP_UNUSABLE;
+	}
+
+	if (test_sta_flags(rx->sta, WLAN_STA_PS_DRIVER))
+		set_sta_flags(rx->sta, WLAN_STA_PSPOLL);
+	else
+		ieee80211_sta_ps_deliver_poll_response(rx->sta);
+
+	/*
+	 * The PS-Poll skb is freed here rather than returning RX_DROP,
+	 * which would count it as a dropped frame.
+	 */
+	dev_kfree_skb(rx->skb);
+
+	return RX_QUEUED;
+}
+
+/*
+ * RX handler: turn a QoS data frame into a plain data frame by removing
+ * the QoS control field from the header and clearing the QoS subtype bit
+ * in the frame control field.  Other frames pass through unchanged.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_remove_qos_control(struct ieee80211_rx_data *rx)
+{
+	u8 *start = rx->skb->data;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)start;
+	unsigned int hdrlen;
+
+	if (!ieee80211_is_data_qos(hdr->frame_control))
+		return RX_CONTINUE;
+
+	/*
+	 * Shift the header (minus its trailing QoS control field) towards
+	 * the payload, then drop the now unused leading bytes.
+	 */
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	memmove(start + IEEE80211_QOS_CTL_LEN, start,
+		hdrlen - IEEE80211_QOS_CTL_LEN);
+	hdr = (struct ieee80211_hdr *)skb_pull(rx->skb, IEEE80211_QOS_CTL_LEN);
+
+	/* change frame type to non QOS */
+	hdr->frame_control &= ~cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+
+	return RX_CONTINUE;
+}
+
+/*
+ * Returns 0 if the sending station is known and has WLAN_STA_AUTHORIZED
+ * set, -EACCES otherwise.
+ */
+static int
+ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx)
+{
+	if (likely(rx->sta && test_sta_flags(rx->sta, WLAN_STA_AUTHORIZED)))
+		return 0;
+
+	return -EACCES;
+}
+
+/*
+ * Decide whether an unprotected data frame has to be rejected.
+ *
+ * Returns -EACCES for a data frame (other than nullfunc) that has no
+ * Protected bit, was not decrypted by hardware, and arrives while a key
+ * is selected or the interface has drop_unencrypted set.  Returns 0 in
+ * every other case.
+ */
+static int
+ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	/*
+	 * Frames the hardware has already decrypted are fine even though
+	 * they look unencrypted by now.
+	 */
+	if (status->flag & RX_FLAG_DECRYPTED)
+		return 0;
+
+	if (ieee80211_has_protected(fc))
+		return 0;
+
+	/* only real data frames are subject to the check */
+	if (ieee80211_is_nullfunc(fc) || !ieee80211_is_data(fc))
+		return 0;
+
+	/* no key and no policy asking for encryption: let it through */
+	if (!rx->key && !rx->sdata->drop_unencrypted)
+		return 0;
+
+	return -EACCES;
+}
+
+/*
+ * Report an unprotected deauthentication or disassociation frame to
+ * cfg80211.  Frames of any other type are ignored.
+ */
+static void
+ieee80211_report_unprot_mgmt(struct ieee80211_rx_data *rx, __le16 fc)
+{
+	if (ieee80211_is_deauth(fc))
+		cfg80211_send_unprot_deauth(rx->sdata->dev,
+					    rx->skb->data,
+					    rx->skb->len);
+	else if (ieee80211_is_disassoc(fc))
+		cfg80211_send_unprot_disassoc(rx->sdata->dev,
+					      rx->skb->data,
+					      rx->skb->len);
+}
+
+/*
+ * Decide whether an unprotected management frame has to be rejected.
+ * The checks only apply to stations with WLAN_STA_MFP set and to frames
+ * the hardware has not already decrypted.
+ *
+ * Returns -EACCES when the frame must be dropped, 0 otherwise.
+ */
+static int
+ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+	__le16 fc = hdr->frame_control;
+
+	/*
+	 * Frames the hardware has already decrypted are fine even though
+	 * they look unencrypted by now.
+	 */
+	if (status->flag & RX_FLAG_DECRYPTED)
+		return 0;
+
+	if (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_MFP))
+		return 0;
+
+	/* unicast robust management frame that should have been protected */
+	if (unlikely(!ieee80211_has_protected(fc) &&
+		     ieee80211_is_unicast_robust_mgmt_frame(rx->skb) &&
+		     rx->key)) {
+		ieee80211_report_unprot_mgmt(rx, fc);
+		return -EACCES;
+	}
+
+	/* BIP does not use Protected field, so need to check MMIE */
+	if (unlikely(ieee80211_is_multicast_robust_mgmt_frame(rx->skb) &&
+		     ieee80211_get_mmie_keyidx(rx->skb) < 0)) {
+		ieee80211_report_unprot_mgmt(rx, fc);
+		return -EACCES;
+	}
+
+	/*
+	 * When using MFP, Action frames are not allowed prior to
+	 * having configured keys.
+	 */
+	if (unlikely(ieee80211_is_action(fc) && !rx->key &&
+		     ieee80211_is_robust_mgmt_frame(hdr)))
+		return -EACCES;
+
+	return 0;
+}
+
+/*
+ * Convert the 802.11 data frame in rx->skb to an 802.3 frame, after
+ * rejecting 3-addr/4-addr combinations the interface does not accept.
+ *
+ * *@port_control is set to true when the converted frame carries the
+ * interface's control port protocol, false otherwise.  Returns 0 on
+ * success and a negative value when the frame must not be processed.
+ */
+static int
+__ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	bool four_addr = !!ieee80211_has_a4(hdr->frame_control);
+	bool mcast = is_multicast_ether_addr(hdr->addr1);
+	bool only_port_control = false;
+	struct ethhdr *ehdr;
+	int ret;
+
+	*port_control = false;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		/* 4-addr frames need a VLAN with a station assigned ... */
+		if (four_addr && !sdata->u.vlan.sta)
+			return -1;
+		/* ... and such a VLAN does not take multicast frames */
+		if (mcast && sdata->u.vlan.sta)
+			return -1;
+		break;
+	case NL80211_IFTYPE_STATION:
+		if (!!sdata->u.mgd.use_4addr != four_addr) {
+			if (!sdata->u.mgd.use_4addr)
+				return -1;
+			/*
+			 * 3-addr frame on a 4-addr interface: only let
+			 * control port frames through.
+			 */
+			only_port_control = true;
+		}
+		break;
+	default:
+		break;
+	}
+
+	ret = ieee80211_data_to_8023(rx->skb, sdata->vif.addr, sdata->vif.type);
+	if (ret < 0)
+		return ret;
+
+	ehdr = (struct ethhdr *) rx->skb->data;
+	if (ehdr->h_proto == sdata->control_port_protocol) {
+		*port_control = true;
+		return 0;
+	}
+
+	return only_port_control ? -1 : 0;
+}
+
+/*
+ * Check whether a received frame may be passed on.
+ *
+ * Requires that rx->skb is a frame with ethernet header.  Returns true
+ * if the frame is allowed, false if it must be dropped.
+ */
+static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
+{
+	static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
+		= { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
+	struct ethhdr *eth = (struct ethhdr *) rx->skb->data;
+
+	/*
+	 * EAPOL (control port) frames sent to us or to the PAE group
+	 * address are always let through, encrypted or not.
+	 */
+	if (eth->h_proto == rx->sdata->control_port_protocol) {
+		if (!compare_ether_addr(eth->h_dest, rx->sdata->vif.addr) ||
+		    !compare_ether_addr(eth->h_dest, pae_group_addr))
+			return true;
+	}
+
+	/* everything else needs an authorized port ... */
+	if (ieee80211_802_1x_port_control(rx))
+		return false;
+
+	/* ... and must not violate the encryption policy */
+	return !ieee80211_drop_unencrypted(rx, fc);
+}
+
+/*
+ * Hand a received 802.3 frame to the local network stack and/or back to
+ * the wireless medium.
+ *
+ * requires that rx->skb is a frame with ethernet header
+ *
+ * On AP/AP_VLAN interfaces with bridging enabled, multicast frames are
+ * copied so that one copy goes up the stack and one is retransmitted,
+ * while unicast frames for a station found via sta_info_get() are only
+ * retransmitted.  In all cases rx->skb is consumed.
+ */
+static void
+ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct net_device *dev = sdata->dev;
+	struct sk_buff *skb, *xmit_skb;
+	struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
+	struct sta_info *dsta;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	/* skb goes to the local stack, xmit_skb back onto the air */
+	skb = rx->skb;
+	xmit_skb = NULL;
+
+	if ((sdata->vif.type == NL80211_IFTYPE_AP ||
+	     sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
+	    !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
+	    (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
+	    (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
+		if (is_multicast_ether_addr(ehdr->h_dest)) {
+			/*
+			 * send multicast frames both to higher layers in
+			 * local net stack and back to the wireless medium
+			 */
+			xmit_skb = skb_copy(skb, GFP_ATOMIC);
+			if (!xmit_skb && net_ratelimit())
+				printk(KERN_DEBUG "%s: failed to clone "
+				       "multicast frame\n", dev->name);
+		} else {
+			/* skb->data starts with the destination address */
+			dsta = sta_info_get(sdata, skb->data);
+			if (dsta) {
+				/*
+				 * The destination station is associated to
+				 * this AP (in this VLAN), so send the frame
+				 * directly to it and do not pass it to local
+				 * net stack.
+				 */
+				xmit_skb = skb;
+				skb = NULL;
+			}
+		}
+	}
+
+	if (skb) {
+		int align __maybe_unused;
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+		/*
+		 * 'align' will only take the values 0 or 2 here
+		 * since all frames are required to be aligned
+		 * to 2-byte boundaries when being passed to
+		 * mac80211. That also explains the __skb_push()
+		 * below.
+		 */
+		align = ((unsigned long)(skb->data + sizeof(struct ethhdr))) & 3;
+		if (align) {
+			if (WARN_ON(skb_headroom(skb) < 3)) {
+				dev_kfree_skb(skb);
+				skb = NULL;
+			} else {
+				/*
+				 * Move the linear data down by 'align' bytes
+				 * so that what follows the ethernet header
+				 * ends up 4-byte aligned.
+				 */
+				u8 *data = skb->data;
+				size_t len = skb_headlen(skb);
+				skb->data -= align;
+				memmove(skb->data, data, len);
+				skb_set_tail_pointer(skb, len);
+			}
+		}
+#endif
+
+		if (skb) {
+			/* deliver to local stack */
+			skb->protocol = eth_type_trans(skb, dev);
+			/* clear the control buffer before handing the skb up */
+			memset(skb->cb, 0, sizeof(skb->cb));
+			netif_receive_skb(skb);
+		}
+	}
+
+	if (xmit_skb) {
+		/* send to wireless media */
+		xmit_skb->protocol = htons(ETH_P_802_3);
+		skb_reset_network_header(xmit_skb);
+		skb_reset_mac_header(xmit_skb);
+		dev_queue_xmit(xmit_skb);
+	}
+}
+
+/*
+ * RX handler: split an A-MSDU (frames flagged IEEE80211_RX_AMSDU) into
+ * its 802.3 subframes and deliver every subframe that passes
+ * ieee80211_frame_allowed(); the others are freed.  Non-data frames and
+ * data frames that are not A-MSDUs continue down the RX path.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct net_device *dev = sdata->dev;
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	__le16 fc = hdr->frame_control;
+	struct sk_buff_head subframes;
+	struct sk_buff *sub;
+
+	if (unlikely(!ieee80211_is_data(fc)))
+		return RX_CONTINUE;
+
+	if (unlikely(!ieee80211_is_data_present(fc)))
+		return RX_DROP_MONITOR;
+
+	if (!(status->rx_flags & IEEE80211_RX_AMSDU))
+		return RX_CONTINUE;
+
+	/* 4-addr frames on an AP VLAN need a station assigned to it */
+	if (ieee80211_has_a4(fc) &&
+	    sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+	    !sdata->u.vlan.sta)
+		return RX_DROP_UNUSABLE;
+
+	/* multicast A-MSDUs are not accepted on 4-addr links */
+	if (is_multicast_ether_addr(hdr->addr1)) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+		    sdata->u.vlan.sta)
+			return RX_DROP_UNUSABLE;
+		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+		    sdata->u.mgd.use_4addr)
+			return RX_DROP_UNUSABLE;
+	}
+
+	skb->dev = dev;
+	__skb_queue_head_init(&subframes);
+
+	if (skb_linearize(skb))
+		return RX_DROP_UNUSABLE;
+
+	ieee80211_amsdu_to_8023s(skb, &subframes, dev->dev_addr,
+				 sdata->vif.type,
+				 rx->local->hw.extra_tx_headroom, true);
+
+	while ((sub = __skb_dequeue(&subframes)) != NULL) {
+		rx->skb = sub;
+
+		if (!ieee80211_frame_allowed(rx, fc)) {
+			dev_kfree_skb(rx->skb);
+			continue;
+		}
+
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += rx->skb->len;
+
+		ieee80211_deliver_skb(rx);
+	}
+
+	return RX_QUEUED;
+}
+
+#ifdef CONFIG_MAC80211_MESH
+/*
+ * RX handler (mesh only): learn proxy paths from Address Extension
+ * headers and forward data frames that are not destined to this node.
+ *
+ * A frame whose addr3 is our own address (unicast) has reached its
+ * destination and continues down the RX path.  Otherwise the mesh TTL is
+ * decremented and, for frames addressed to us with TTL left, a copy is
+ * queued for transmission.  The original frame then continues only if
+ * it is multicast or the interface is promiscuous.
+ */
+static ieee80211_rx_result
+ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr;
+	struct ieee80211s_hdr *mesh_hdr;
+	unsigned int hdrlen;
+	struct sk_buff *skb = rx->skb, *fwd_skb;
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+	hdr = (struct ieee80211_hdr *) skb->data;
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	/* the mesh header directly follows the 802.11 header */
+	mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+
+	if (!ieee80211_is_data(hdr->frame_control))
+		return RX_CONTINUE;
+
+	if (!mesh_hdr->ttl)
+		/* illegal frame */
+		return RX_DROP_MONITOR;
+
+	if (mesh_hdr->flags & MESH_FLAGS_AE) {
+		struct mesh_path *mppath;
+		char *proxied_addr;
+		char *mpp_addr;
+
+		if (is_multicast_ether_addr(hdr->addr1)) {
+			mpp_addr = hdr->addr3;
+			proxied_addr = mesh_hdr->eaddr1;
+		} else {
+			mpp_addr = hdr->addr4;
+			proxied_addr = mesh_hdr->eaddr2;
+		}
+
+		/* create or refresh the proxy path for the proxied address */
+		rcu_read_lock();
+		mppath = mpp_path_lookup(proxied_addr, sdata);
+		if (!mppath) {
+			mpp_path_add(proxied_addr, mpp_addr, sdata);
+		} else {
+			spin_lock_bh(&mppath->state_lock);
+			if (compare_ether_addr(mppath->mpp, mpp_addr) != 0)
+				memcpy(mppath->mpp, mpp_addr, ETH_ALEN);
+			spin_unlock_bh(&mppath->state_lock);
+		}
+		rcu_read_unlock();
+	}
+
+	/* Frame has reached destination.  Don't forward */
+	if (!is_multicast_ether_addr(hdr->addr1) &&
+	    compare_ether_addr(sdata->vif.addr, hdr->addr3) == 0)
+		return RX_CONTINUE;
+
+	mesh_hdr->ttl--;
+
+	if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
+		if (!mesh_hdr->ttl)
+			IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh,
+						     dropped_frames_ttl);
+		else {
+			struct ieee80211_hdr *fwd_hdr;
+			struct ieee80211_tx_info *info;
+
+			fwd_skb = skb_copy(skb, GFP_ATOMIC);
+
+			if (!fwd_skb && net_ratelimit())
+				printk(KERN_DEBUG "%s: failed to clone mesh frame\n",
+						   sdata->name);
+			if (!fwd_skb)
+				goto out;
+
+			fwd_hdr =  (struct ieee80211_hdr *) fwd_skb->data;
+			memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN);
+			info = IEEE80211_SKB_CB(fwd_skb);
+			memset(info, 0, sizeof(*info));
+			info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+			info->control.vif = &rx->sdata->vif;
+			/*
+			 * Queue mapping and QoS header belong to the copy
+			 * that is transmitted, not to the received frame,
+			 * which may still be delivered locally.
+			 */
+			skb_set_queue_mapping(fwd_skb,
+				ieee80211_select_queue(rx->sdata, fwd_skb));
+			ieee80211_set_qos_hdr(local, fwd_skb);
+			if (is_multicast_ether_addr(fwd_hdr->addr1))
+				IEEE80211_IFSTA_MESH_CTR_INC(&sdata->u.mesh,
+								fwded_mcast);
+			else {
+				int err;
+				/*
+				 * Save TA to addr1 to send TA a path error if a
+				 * suitable next hop is not found
+				 */
+				memcpy(fwd_hdr->addr1, fwd_hdr->addr2,
+						ETH_ALEN);
+				err = mesh_nexthop_lookup(fwd_skb, sdata);
+				/* Failed to immediately resolve next hop:
+				 * fwded frame was dropped or will be added
+				 * later to the pending skb queue.  */
+				if (err)
+					return RX_DROP_MONITOR;
+
+				IEEE80211_IFSTA_MESH_CTR_INC(&sdata->u.mesh,
+								fwded_unicast);
+			}
+			IEEE80211_IFSTA_MESH_CTR_INC(&sdata->u.mesh,
+						     fwded_frames);
+			ieee80211_add_pending_skb(local, fwd_skb);
+		}
+	}
+
+ out:
+	if (is_multicast_ether_addr(hdr->addr1) ||
+	    sdata->dev->flags & IFF_PROMISC)
+		return RX_CONTINUE;
+	else
+		return RX_DROP_MONITOR;
+}
+#endif
+
+/*
+ * RX handler: final processing of data frames.
+ *
+ * Converts the frame to 802.3, applies the port control / encryption
+ * policy, updates the netdev RX statistics, re-arms the dynamic power
+ * save timer for unicast traffic and delivers the frame.  Control port
+ * frames received on an AP VLAN are delivered on the owning AP
+ * interface instead.  Non-data frames continue down the RX path.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_local *local = rx->local;
+	struct net_device *dev = sdata->dev;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	__le16 fc = hdr->frame_control;
+	struct ethhdr *ehdr;
+	bool port_control;
+
+	if (unlikely(!ieee80211_is_data(fc)))
+		return RX_CONTINUE;
+
+	if (unlikely(!ieee80211_is_data_present(fc)))
+		return RX_DROP_MONITOR;
+
+	/*
+	 * 4-addr frames on an AP interface go to the cooked monitor
+	 * interface, so that a 4-addr station can be detected and moved
+	 * into a separate VLAN.
+	 */
+	if (ieee80211_has_a4(fc) && sdata->vif.type == NL80211_IFTYPE_AP)
+		return RX_DROP_MONITOR;
+
+	if (unlikely(__ieee80211_data_to_8023(rx, &port_control)))
+		return RX_DROP_UNUSABLE;
+
+	if (!ieee80211_frame_allowed(rx, fc))
+		return RX_DROP_MONITOR;
+
+	/* control port frames from a VLAN are handled by the AP interface */
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+	    unlikely(port_control) && sdata->bss) {
+		sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+				     u.ap);
+		dev = sdata->dev;
+		rx->sdata = sdata;
+	}
+
+	rx->skb->dev = dev;
+
+	dev->stats.rx_packets++;
+	dev->stats.rx_bytes += rx->skb->len;
+
+	/* unicast traffic postpones dynamic power save, unless scanning */
+	ehdr = (struct ethhdr *)rx->skb->data;
+	if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 &&
+	    !is_multicast_ether_addr(ehdr->h_dest) &&
+	    !local->scanning &&
+	    !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
+		mod_timer(&local->dynamic_ps_timer, jiffies +
+			  msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
+
+	ieee80211_deliver_skb(rx);
+
+	return RX_QUEUED;
+}
+
+/*
+ * RX handler for control frames.
+ *
+ * Non-control frames are passed on with RX_CONTINUE. The only control
+ * frame acted upon is the BlockAck Request: frames buffered in the
+ * matching RX aggregation session are released up to the BAR's starting
+ * sequence number, the BAR itself is freed and RX_QUEUED is returned.
+ * Every other control frame (and any BAR that cannot be processed)
+ * results in RX_DROP_MONITOR.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_hw *hw = &local->hw;
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_bar *bar = (struct ieee80211_bar *)skb->data;
+	struct tid_ampdu_rx *tid_agg_rx;
+	u16 start_seq_num;
+	u16 tid;
+
+	if (likely(!ieee80211_is_ctl(bar->frame_control)))
+		return RX_CONTINUE;
+
+	if (ieee80211_is_back_req(bar->frame_control)) {
+		struct {
+			__le16 control, start_seq_num;
+		} __packed bar_data;
+
+		/* an aggregation session can only exist for a known station */
+		if (!rx->sta)
+			return RX_DROP_MONITOR;
+
+		/*
+		 * Copy the two BAR fields out of the skb; skb_copy_bits()
+		 * fails if the frame is too short to contain them.
+		 */
+		if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control),
+				  &bar_data, sizeof(bar_data)))
+			return RX_DROP_MONITOR;
+
+		/* the TID is taken from the top four bits of the control field */
+		tid = le16_to_cpu(bar_data.control) >> 12;
+
+		tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
+		if (!tid_agg_rx)
+			return RX_DROP_MONITOR;
+
+		/* the low four bits are shifted out to get the sequence number */
+		start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
+
+		/* reset session timer */
+		if (tid_agg_rx->timeout)
+			mod_timer(&tid_agg_rx->session_timer,
+				  TU_TO_EXP_TIME(tid_agg_rx->timeout));
+
+		spin_lock(&tid_agg_rx->reorder_lock);
+		/* release stored frames up to start of BAR */
+		ieee80211_release_reorder_frames(hw, tid_agg_rx, start_seq_num);
+		spin_unlock(&tid_agg_rx->reorder_lock);
+
+		/* the BAR has been consumed here, so it is freed here */
+		kfree_skb(skb);
+		return RX_QUEUED;
+	}
+
+	/*
+	 * After this point, we only want management frames,
+	 * so we can drop all remaining control frames to
+	 * cooked monitor interfaces.
+	 */
+	return RX_DROP_MONITOR;
+}
+
+/*
+ * Answer an SA Query request with an SA Query response.
+ *
+ * The request is silently ignored unless it is addressed to our own
+ * unicast address, was sent by (and carries the BSSID of) the BSSID stored
+ * in sdata->u.mgd, and is long enough to hold the SA Query body. The
+ * response echoes the request's transaction identifier.
+ */
+static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
+					   struct ieee80211_mgmt *mgmt,
+					   size_t len)
+{
+	struct ieee80211_local *local = sdata->local;
+	const u8 *own_addr = sdata->vif.addr;
+	const u8 *ap_addr = sdata->u.mgd.bssid;
+	struct ieee80211_mgmt *reply;
+	struct sk_buff *out;
+
+	/* Not to own unicast address */
+	if (compare_ether_addr(mgmt->da, own_addr))
+		return;
+
+	/* Not from the current AP or not associated yet. */
+	if (compare_ether_addr(mgmt->sa, ap_addr))
+		return;
+	if (compare_ether_addr(mgmt->bssid, ap_addr))
+		return;
+
+	/* Too short SA Query request frame */
+	if (len < 24 + 1 + sizeof(reply->u.action.u.sa_query))
+		return;
+
+	out = dev_alloc_skb(sizeof(*reply) + local->hw.extra_tx_headroom);
+	if (!out)
+		return;
+	skb_reserve(out, local->hw.extra_tx_headroom);
+
+	/* 24-byte management header */
+	reply = (struct ieee80211_mgmt *) skb_put(out, 24);
+	memset(reply, 0, 24);
+	reply->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					   IEEE80211_STYPE_ACTION);
+	memcpy(reply->da, mgmt->sa, ETH_ALEN);
+	memcpy(reply->sa, own_addr, ETH_ALEN);
+	memcpy(reply->bssid, ap_addr, ETH_ALEN);
+
+	/* category byte followed by the SA Query body */
+	skb_put(out, 1 + sizeof(reply->u.action.u.sa_query));
+	reply->u.action.category = WLAN_CATEGORY_SA_QUERY;
+	reply->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE;
+	memcpy(reply->u.action.u.sa_query.trans_id,
+	       mgmt->u.action.u.sa_query.trans_id,
+	       WLAN_SA_QUERY_TR_ID_LEN);
+
+	ieee80211_tx_skb(sdata, out);
+}
+
+/*
+ * Gate in front of the management frame handlers.
+ *
+ * From here on, look only at management frames: data and control frames
+ * are already handled, and unknown (reserved) frames are useless. Frames
+ * shorter than 24 bytes, non-management frames and frames without
+ * IEEE80211_RX_RA_MATCH go to the cooked monitor only; frames rejected by
+ * ieee80211_drop_unencrypted_mgmt() are dropped as unusable.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) skb->data;
+
+	/* evaluated left to right, so the header is only read if long enough */
+	if (skb->len < 24 ||
+	    !ieee80211_is_mgmt(mgmt->frame_control) ||
+	    !(status->rx_flags & IEEE80211_RX_RA_MATCH))
+		return RX_DROP_MONITOR;
+
+	return ieee80211_drop_unencrypted_mgmt(rx) ? RX_DROP_UNUSABLE
+						   : RX_CONTINUE;
+}
+
+/*
+ * RX handler for action frames.
+ *
+ * Non-action frames are passed on with RX_CONTINUE. Action frames are
+ * validated per category and then take one of four exits:
+ *
+ *  - handled: processed right here, the skb is freed, RX_QUEUED;
+ *  - queue:   put on sdata->skb_queue for the interface work, RX_QUEUED;
+ *  - invalid: flagged IEEE80211_RX_MALFORMED_ACTION_FRM and passed on so a
+ *             later handler can return it to the sender, RX_CONTINUE;
+ *  - not handled by the kernel: passed on unflagged, RX_CONTINUE.
+ *
+ * Frames that are too short, that come from an unknown station (unless
+ * they are public action frames) or that lack IEEE80211_RX_RA_MATCH are
+ * dropped with RX_DROP_UNUSABLE.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+	int len = rx->skb->len;
+
+	if (!ieee80211_is_action(mgmt->frame_control))
+		return RX_CONTINUE;
+
+	/* drop too small frames */
+	if (len < IEEE80211_MIN_ACTION_SIZE)
+		return RX_DROP_UNUSABLE;
+
+	if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
+		return RX_DROP_UNUSABLE;
+
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+		return RX_DROP_UNUSABLE;
+
+	switch (mgmt->u.action.category) {
+	case WLAN_CATEGORY_BACK:
+		/*
+		 * The aggregation code is not prepared to handle
+		 * anything but STA/AP due to the BSSID handling;
+		 * IBSS could work in the code but isn't supported
+		 * by drivers or the standard.
+		 */
+		if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+		    sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+		    sdata->vif.type != NL80211_IFTYPE_AP)
+			break;
+
+		/* verify action_code is present */
+		if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+			break;
+
+		/* each known action code must carry its complete body */
+		switch (mgmt->u.action.u.addba_req.action_code) {
+		case WLAN_ACTION_ADDBA_REQ:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.addba_req)))
+				goto invalid;
+			break;
+		case WLAN_ACTION_ADDBA_RESP:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.addba_resp)))
+				goto invalid;
+			break;
+		case WLAN_ACTION_DELBA:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.delba)))
+				goto invalid;
+			break;
+		default:
+			goto invalid;
+		}
+
+		goto queue;
+	case WLAN_CATEGORY_SPECTRUM_MGMT:
+		if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
+			break;
+
+		/* spectrum management is only handled on station interfaces */
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
+			break;
+
+		/* verify action_code is present */
+		if (len < IEEE80211_MIN_ACTION_SIZE + 1)
+			break;
+
+		switch (mgmt->u.action.u.measurement.action_code) {
+		case WLAN_ACTION_SPCT_MSR_REQ:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.measurement)))
+				break;
+			ieee80211_process_measurement_req(sdata, mgmt, len);
+			goto handled;
+		case WLAN_ACTION_SPCT_CHL_SWITCH:
+			if (len < (IEEE80211_MIN_ACTION_SIZE +
+				   sizeof(mgmt->u.action.u.chan_switch)))
+				break;
+
+			/*
+			 * The interface type was already verified to be
+			 * STATION for the whole category above, so only
+			 * the BSSID remains to be checked here.
+			 */
+			if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN))
+				break;
+
+			goto queue;
+		}
+		break;
+	case WLAN_CATEGORY_SA_QUERY:
+		if (len < (IEEE80211_MIN_ACTION_SIZE +
+			   sizeof(mgmt->u.action.u.sa_query)))
+			break;
+
+		switch (mgmt->u.action.u.sa_query.action) {
+		case WLAN_ACTION_SA_QUERY_REQUEST:
+			if (sdata->vif.type != NL80211_IFTYPE_STATION)
+				break;
+			ieee80211_process_sa_query_req(sdata, mgmt, len);
+			goto handled;
+		}
+		break;
+	case WLAN_CATEGORY_MESH_ACTION:
+		if (!ieee80211_vif_is_mesh(&sdata->vif))
+			break;
+		goto queue;
+	case WLAN_CATEGORY_MESH_PATH_SEL:
+		/*
+		 * Check the interface type first, as the MESH_ACTION case
+		 * does: path selection frames are only meaningful on a mesh
+		 * interface, and the path selection protocol must not be
+		 * queried on any other interface type.
+		 */
+		if (!ieee80211_vif_is_mesh(&sdata->vif) ||
+		    !mesh_path_sel_is_hwmp(sdata))
+			break;
+		goto queue;
+	}
+
+	/* not handled by the kernel; later handlers decide what to do */
+	return RX_CONTINUE;
+
+ invalid:
+	status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
+	/* will return in the next handlers */
+	return RX_CONTINUE;
+
+ handled:
+	if (rx->sta)
+		rx->sta->rx_packets++;
+	dev_kfree_skb(rx->skb);
+	return RX_QUEUED;
+
+ queue:
+	/* ownership of the skb passes to the interface's work queue */
+	rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+	skb_queue_tail(&sdata->skb_queue, rx->skb);
+	ieee80211_queue_work(&local->hw, &sdata->work);
+	if (rx->sta)
+		rx->sta->rx_packets++;
+	return RX_QUEUED;
+}
+
+/*
+ * Offer a management frame the kernel did not handle to userspace.
+ *
+ * Known-bad action frames are skipped so that the next handler can return
+ * them. Everything else is passed to cfg80211_rx_mgmt(); this includes
+ * returned frames, so userspace can register for those to learn whether
+ * frames it transmitted were processed or returned. If cfg80211_rx_mgmt()
+ * reports the frame as taken, it is freed and RX_QUEUED is returned,
+ * otherwise processing continues.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+	if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
+		return RX_CONTINUE;
+
+	if (!cfg80211_rx_mgmt(rx->sdata->dev, status->freq,
+			      skb->data, skb->len, GFP_ATOMIC))
+		return RX_CONTINUE;
+
+	/* userspace took the frame */
+	if (rx->sta)
+		rx->sta->rx_packets++;
+	dev_kfree_skb(skb);
+
+	return RX_QUEUED;
+}
+
+/*
+ * Return an action frame that nobody handled to its sender.
+ *
+ * Non-action frames are passed on with RX_CONTINUE. On AP and AP_VLAN
+ * interfaces well-formed frames are left to the cooked monitor
+ * (RX_DROP_MONITOR). Frames that already carry the 0x80 "rejected" bit
+ * are dropped. Otherwise a copy of the frame with the 0x80 bit set and the
+ * addresses swapped is transmitted, and the original is freed
+ * (RX_QUEUED).
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+	struct sk_buff *nskb;
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	if (!ieee80211_is_action(mgmt->frame_control))
+		return RX_CONTINUE;
+
+	/*
+	 * For AP mode, hostapd is responsible for handling any action
+	 * frames that we didn't handle, including returning unknown
+	 * ones. For all other modes we will return them to the sender,
+	 * setting the 0x80 bit in the action category, as required by
+	 * 802.11-2007 7.3.1.11.
+	 * Newer versions of hostapd shall also use the management frame
+	 * registration mechanisms, but older ones still use cooked
+	 * monitor interfaces so push all frames there.
+	 */
+	if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
+	    (sdata->vif.type == NL80211_IFTYPE_AP ||
+	     sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
+		return RX_DROP_MONITOR;
+
+	/* do not return rejected action frames */
+	if (mgmt->u.action.category & 0x80)
+		return RX_DROP_UNUSABLE;
+
+	/* the copy gets the driver's TX headroom since it will be sent out */
+	nskb = skb_copy_expand(rx->skb, local->hw.extra_tx_headroom, 0,
+			       GFP_ATOMIC);
+	if (nskb) {
+		struct ieee80211_mgmt *nmgmt = (void *)nskb->data;
+
+		/* mark as rejected and send it back to where it came from */
+		nmgmt->u.action.category |= 0x80;
+		memcpy(nmgmt->da, nmgmt->sa, ETH_ALEN);
+		memcpy(nmgmt->sa, rx->sdata->vif.addr, ETH_ALEN);
+
+		/* the control block was copied from the RX skb; clear it */
+		memset(nskb->cb, 0, sizeof(nskb->cb));
+
+		ieee80211_tx_skb(rx->sdata, nskb);
+	}
+	/* the received frame is freed whether or not the copy succeeded */
+	dev_kfree_skb(rx->skb);
+	return RX_QUEUED;
+}
+
+/*
+ * Final RX handler for management frames.
+ *
+ * The frame is first offered to ieee80211_work_rx_mgmt(). If that does
+ * not take it, the frame is filtered by interface type and subtype and,
+ * when accepted, queued on sdata->skb_queue for the interface work
+ * (RX_QUEUED). Everything filtered out returns RX_DROP_MONITOR.
+ */
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	ieee80211_rx_result rxs;
+	struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
+	__le16 stype;
+
+	rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb);
+	if (rxs != RX_CONTINUE)
+		return rxs;
+
+	/* kept little-endian; the case labels below are converted instead */
+	stype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE);
+
+	/* only mesh, IBSS and station interfaces process frames here */
+	if (!ieee80211_vif_is_mesh(&sdata->vif) &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+	    sdata->vif.type != NL80211_IFTYPE_STATION)
+		return RX_DROP_MONITOR;
+
+	switch (stype) {
+	case cpu_to_le16(IEEE80211_STYPE_BEACON):
+	case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP):
+		/* process for all: mesh, mlme, ibss */
+		break;
+	case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
+	case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
+		/* multicast is only accepted if it is the broadcast address */
+		if (is_multicast_ether_addr(mgmt->da) &&
+		    !is_broadcast_ether_addr(mgmt->da))
+			return RX_DROP_MONITOR;
+
+		/* process only for station */
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
+			return RX_DROP_MONITOR;
+		break;
+	case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ):
+	case cpu_to_le16(IEEE80211_STYPE_AUTH):
+		/* process only for ibss */
+		if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
+			return RX_DROP_MONITOR;
+		break;
+	default:
+		return RX_DROP_MONITOR;
+	}
+
+	/* queue up frame and kick off work to process it */
+	rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+	skb_queue_tail(&sdata->skb_queue, rx->skb);
+	ieee80211_queue_work(&rx->local->hw, &sdata->work);
+	if (rx->sta)
+		rx->sta->rx_packets++;
+
+	return RX_QUEUED;
+}
+
+/*
+ * Deliver a frame to every running cooked monitor interface.
+ *
+ * A minimal radiotap header (flags, optional rate, channel) is prepended
+ * to the frame. All matching interfaces but the last receive a clone; the
+ * last one receives the skb itself. The skb is always consumed: if the
+ * frame was already shown to cooked monitors, if headroom cannot be made,
+ * or if no cooked monitor interface exists, it is freed.
+ *
+ * @rate may be NULL, in which case no rate is reported.
+ *
+ * TODO: use IEEE80211_RX_FRAGMENTED
+ */
+static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
+					struct ieee80211_rate *rate)
+{
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_local *local = rx->local;
+	/* fixed-layout radiotap header carrying only the fields set below */
+	struct ieee80211_rtap_hdr {
+		struct ieee80211_radiotap_header hdr;
+		u8 flags;
+		u8 rate_or_pad;
+		__le16 chan_freq;
+		__le16 chan_flags;
+	} __packed *rthdr;
+	struct sk_buff *skb = rx->skb, *skb2;
+	struct net_device *prev_dev = NULL;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+	/*
+	 * If cooked monitor has been processed already, then
+	 * don't do it again. If not, set the flag.
+	 */
+	if (rx->flags & IEEE80211_RX_CMNTR)
+		goto out_free_skb;
+	rx->flags |= IEEE80211_RX_CMNTR;
+
+	/* make room for the radiotap header if the skb lacks headroom */
+	if (skb_headroom(skb) < sizeof(*rthdr) &&
+	    pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC))
+		goto out_free_skb;
+
+	rthdr = (void *)skb_push(skb, sizeof(*rthdr));
+	memset(rthdr, 0, sizeof(*rthdr));
+	rthdr->hdr.it_len = cpu_to_le16(sizeof(*rthdr));
+	rthdr->hdr.it_present =
+		cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) |
+			    (1 << IEEE80211_RADIOTAP_CHANNEL));
+
+	/* the rate is only advertised as present when one was supplied */
+	if (rate) {
+		rthdr->rate_or_pad = rate->bitrate / 5;
+		rthdr->hdr.it_present |=
+			cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
+	}
+	rthdr->chan_freq = cpu_to_le16(status->freq);
+
+	if (status->band == IEEE80211_BAND_5GHZ)
+		rthdr->chan_flags = cpu_to_le16(IEEE80211_CHAN_OFDM |
+						IEEE80211_CHAN_5GHZ);
+	else
+		rthdr->chan_flags = cpu_to_le16(IEEE80211_CHAN_DYN |
+						IEEE80211_CHAN_2GHZ);
+
+	skb_set_mac_header(skb, 0);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb->pkt_type = PACKET_OTHERHOST;
+	skb->protocol = htons(ETH_P_802_2);
+
+	/*
+	 * Delivery always lags one interface behind: a clone goes to the
+	 * previously found interface, so the last one found can be given
+	 * the original skb after the loop without an extra copy.
+	 */
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
+		    !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
+			continue;
+
+		if (prev_dev) {
+			skb2 = skb_clone(skb, GFP_ATOMIC);
+			if (skb2) {
+				skb2->dev = prev_dev;
+				netif_receive_skb(skb2);
+			}
+		}
+
+		prev_dev = sdata->dev;
+		sdata->dev->stats.rx_packets++;
+		sdata->dev->stats.rx_bytes += skb->len;
+	}
+
+	if (prev_dev) {
+		skb->dev = prev_dev;
+		netif_receive_skb(skb);
+		return;
+	}
+
+ out_free_skb:
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Act on the final verdict of the RX handler chain for rx->skb:
+ *
+ *  RX_QUEUED        - only counted, some handler already took the skb;
+ *  RX_DROP_UNUSABLE - counted as dropped and freed;
+ *  RX_DROP_MONITOR  - counted as dropped, then given to cooked monitors;
+ *  RX_CONTINUE      - given to cooked monitors.
+ */
+static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
+					 ieee80211_rx_result res)
+{
+	struct ieee80211_rx_status *status;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *rate = NULL;
+
+	if (res == RX_QUEUED) {
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+		return;
+	}
+
+	if (res == RX_DROP_UNUSABLE || res == RX_DROP_MONITOR) {
+		/* both kinds of drop are accounted identically */
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+		if (rx->sta)
+			rx->sta->rx_dropped++;
+	} else if (res != RX_CONTINUE) {
+		/* any other value is ignored */
+		return;
+	}
+
+	if (res == RX_DROP_UNUSABLE) {
+		dev_kfree_skb(rx->skb);
+		return;
+	}
+
+	/* RX_DROP_MONITOR and RX_CONTINUE end up on the cooked monitors */
+	status = IEEE80211_SKB_RXCB((rx->skb));
+	sband = rx->local->hw.wiphy->bands[status->band];
+	if (!(status->flag & RX_FLAG_HT))
+		rate = &sband->bitrates[status->rate_idx];
+
+	ieee80211_rx_cooked_monitor(rx, rate);
+}
+
+/*
+ * Run the main chain of RX handlers over every frame queued on
+ * rx->local->rx_skb_queue.
+ *
+ * Only one context drains the queue at a time: if running_rx_handler is
+ * already set, this call returns immediately and leaves the queued frames
+ * to the context that is draining. The queue lock is dropped while the
+ * handlers run for a frame and re-taken before the next dequeue.
+ *
+ * Each handler returns RX_CONTINUE to pass the frame on to the next
+ * handler; any other result ends the chain for that frame and is acted
+ * upon by ieee80211_rx_handlers_result().
+ */
+static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx)
+{
+	ieee80211_rx_result res = RX_DROP_MONITOR;
+	struct sk_buff *skb;
+
+/*
+ * Deliberately no trailing semicolon, so that an invocation followed by
+ * ';' is exactly one statement and is safe in an unbraced if/else.
+ */
+#define CALL_RXH(rxh)			\
+	do {				\
+		res = rxh(rx);		\
+		if (res != RX_CONTINUE)	\
+			goto rxh_next;  \
+	} while (0)
+
+	spin_lock(&rx->local->rx_skb_queue.lock);
+	if (rx->local->running_rx_handler)
+		goto unlock;
+
+	rx->local->running_rx_handler = true;
+
+	while ((skb = __skb_dequeue(&rx->local->rx_skb_queue))) {
+		spin_unlock(&rx->local->rx_skb_queue.lock);
+
+		/*
+		 * all the other fields are valid across frames
+		 * that belong to an aMPDU since they are on the
+		 * same TID from the same station
+		 */
+		rx->skb = skb;
+
+		CALL_RXH(ieee80211_rx_h_decrypt);
+		CALL_RXH(ieee80211_rx_h_check_more_data);
+		CALL_RXH(ieee80211_rx_h_sta_process);
+		CALL_RXH(ieee80211_rx_h_defragment);
+		CALL_RXH(ieee80211_rx_h_ps_poll);
+		CALL_RXH(ieee80211_rx_h_michael_mic_verify);
+		/* must be after MMIC verify so header is counted in MPDU mic */
+		CALL_RXH(ieee80211_rx_h_remove_qos_control);
+		CALL_RXH(ieee80211_rx_h_amsdu);
+#ifdef CONFIG_MAC80211_MESH
+		if (ieee80211_vif_is_mesh(&rx->sdata->vif))
+			CALL_RXH(ieee80211_rx_h_mesh_fwding);
+#endif
+		CALL_RXH(ieee80211_rx_h_data);
+		CALL_RXH(ieee80211_rx_h_ctrl);
+		CALL_RXH(ieee80211_rx_h_mgmt_check);
+		CALL_RXH(ieee80211_rx_h_action);
+		CALL_RXH(ieee80211_rx_h_userspace_mgmt);
+		CALL_RXH(ieee80211_rx_h_action_return);
+		CALL_RXH(ieee80211_rx_h_mgmt);
+
+ rxh_next:
+		ieee80211_rx_handlers_result(rx, res);
+		spin_lock(&rx->local->rx_skb_queue.lock);
+	}
+
+	rx->local->running_rx_handler = false;
+
+ unlock:
+	spin_unlock(&rx->local->rx_skb_queue.lock);
+
+#undef CALL_RXH
+}
+
+/*
+ * Entry point of the RX handler chain for a single frame.
+ *
+ * The passive-scan and basic check handlers run directly on rx->skb. If
+ * either of them ends processing, the result is acted upon right away.
+ * Otherwise the frame goes through ieee80211_rx_reorder_ampdu() and the
+ * remaining handlers are run by ieee80211_rx_handlers().
+ */
+static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
+{
+	ieee80211_rx_result res = RX_DROP_MONITOR;
+
+/*
+ * Deliberately no trailing semicolon, so that an invocation followed by
+ * ';' is exactly one statement.
+ */
+#define CALL_RXH(rxh)			\
+	do {				\
+		res = rxh(rx);		\
+		if (res != RX_CONTINUE)	\
+			goto rxh_next;  \
+	} while (0)
+
+	CALL_RXH(ieee80211_rx_h_passive_scan);
+	CALL_RXH(ieee80211_rx_h_check);
+
+	ieee80211_rx_reorder_ampdu(rx);
+
+	ieee80211_rx_handlers(rx);
+	return;
+
+ rxh_next:
+	ieee80211_rx_handlers_result(rx, res);
+
+#undef CALL_RXH
+}
+
+/*
+ * Release frames from the RX reorder buffer of the given station and TID
+ * and push them through the RX handlers. Nothing is done if the station
+ * has no RX aggregation session for that TID.
+ *
+ * This function makes calls into the RX path, therefore
+ * it has to be invoked under RCU read lock.
+ */
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
+{
+	/*
+	 * No skb is set here: ieee80211_rx_handlers() takes its frames from
+	 * the local rx_skb_queue and fills in rx.skb itself.
+	 */
+	struct ieee80211_rx_data rx = {
+		.sta = sta,
+		.sdata = sta->sdata,
+		.local = sta->local,
+		.queue = tid,
+		.flags = 0,
+	};
+	struct tid_ampdu_rx *tid_agg_rx;
+
+	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+	if (!tid_agg_rx)
+		return;
+
+	spin_lock(&tid_agg_rx->reorder_lock);
+	ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx);
+	spin_unlock(&tid_agg_rx->reorder_lock);
+
+	/* run the handlers only after the reorder lock has been dropped */
+	ieee80211_rx_handlers(&rx);
+}
+
+/* main receive path */
+
+/*
+ * Decide whether a received frame is to be processed on rx->sdata.
+ *
+ * Returns 1 if the RX handlers should run for this interface and 0 if the
+ * frame is not meant for it. When a frame is accepted although it is not
+ * addressed to the interface (promiscuous mode, frames seen while
+ * scanning, foreign beacons in AP mode), IEEE80211_RX_RA_MATCH is cleared
+ * in the RX status. In IBSS mode a station entry may be created for a
+ * not yet known sender and stored in rx->sta.
+ */
+static int prepare_for_handlers(struct ieee80211_rx_data *rx,
+				struct ieee80211_hdr *hdr)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
+	int multicast = is_multicast_ether_addr(hdr->addr1);
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		/* frames without a BSSID are only accepted in 4-addr mode */
+		if (!bssid && !sdata->u.mgd.use_4addr)
+			return 0;
+		/* unicast to someone else: promiscuous, non-4-addr only */
+		if (!multicast &&
+		    compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) {
+			if (!(sdata->dev->flags & IFF_PROMISC) ||
+			    sdata->u.mgd.use_4addr)
+				return 0;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+		}
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		if (!bssid)
+			return 0;
+		/* beacons are always accepted, whatever their BSSID */
+		if (ieee80211_is_beacon(hdr->frame_control)) {
+			return 1;
+		}
+		else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
+			/* foreign BSSID: only of interest while scanning */
+			if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
+				return 0;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+		} else if (!multicast &&
+			   compare_ether_addr(sdata->vif.addr,
+					      hdr->addr1) != 0) {
+			if (!(sdata->dev->flags & IFF_PROMISC))
+				return 0;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+		} else if (!rx->sta) {
+			/* frame for us from an unknown peer: add a station */
+			int rate_idx;
+			if (status->flag & RX_FLAG_HT)
+				rate_idx = 0; /* TODO: HT rates */
+			else
+				rate_idx = status->rate_idx;
+			rx->sta = ieee80211_ibss_add_sta(sdata, bssid,
+					hdr->addr2, BIT(rate_idx), GFP_ATOMIC);
+		}
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		if (!multicast &&
+		    compare_ether_addr(sdata->vif.addr,
+				       hdr->addr1) != 0) {
+			if (!(sdata->dev->flags & IFF_PROMISC))
+				return 0;
+
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+		}
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_AP:
+		if (!bssid) {
+			/* without a BSSID the frame must be addressed to us */
+			if (compare_ether_addr(sdata->vif.addr,
+					       hdr->addr1))
+				return 0;
+		} else if (!ieee80211_bssid_match(bssid,
+					sdata->vif.addr)) {
+			/* foreign BSSID: keep only beacons and scan results */
+			if (!(status->rx_flags & IEEE80211_RX_IN_SCAN) &&
+			    !ieee80211_is_beacon(hdr->frame_control))
+				return 0;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
+		}
+		break;
+	case NL80211_IFTYPE_WDS:
+		/* only data frames without a BSSID, sent by the WDS peer */
+		if (bssid || !ieee80211_is_data(hdr->frame_control))
+			return 0;
+		if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2))
+			return 0;
+		break;
+	default:
+		/* should never get here */
+		WARN_ON(1);
+		break;
+	}
+
+	return 1;
+}
+
+/*
+ * Run the RX handlers for skb on the interface/station set up in rx,
+ * provided prepare_for_handlers() accepts the frame for that interface.
+ *
+ * With consume set the skb itself is handed to the handlers; otherwise
+ * they work on a private copy and the caller keeps its skb.
+ *
+ * The return value tells whether the skb was destined for RX processing,
+ * which, if consume is true, is equivalent to whether or not the skb was
+ * consumed. Note that a failed copy still returns true.
+ */
+static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
+					    struct sk_buff *skb, bool consume)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct sk_buff *copy;
+
+	rx->skb = skb;
+	/* assume a receiver address match; the filter below may clear it */
+	status->rx_flags |= IEEE80211_RX_RA_MATCH;
+
+	if (!prepare_for_handlers(rx, (struct ieee80211_hdr *)skb->data))
+		return false;
+
+	if (consume) {
+		ieee80211_invoke_rx_handlers(rx);
+		return true;
+	}
+
+	copy = skb_copy(skb, GFP_ATOMIC);
+	if (!copy) {
+		if (net_ratelimit())
+			wiphy_debug(rx->local->hw.wiphy,
+				"failed to copy skb for %s\n",
+				rx->sdata->name);
+		return true;
+	}
+
+	rx->skb = copy;
+	ieee80211_invoke_rx_handlers(rx);
+	return true;
+}
+
+/*
+ * This is the actual Rx frames handler. As it belongs to the Rx path it
+ * must be called with rcu_read_lock protection.
+ *
+ * The frame is dispatched to every station entry (data frames) or every
+ * eligible interface it may be meant for. All recipients but the last
+ * work on a copy; the last one is given the skb itself. If nobody takes
+ * the skb it is freed here, so the skb is always consumed.
+ */
+static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
+					 struct sk_buff *skb)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_hdr *hdr;
+	__le16 fc;
+	struct ieee80211_rx_data rx;
+	struct ieee80211_sub_if_data *prev;
+	struct sta_info *sta, *tmp, *prev_sta;
+	int err = 0;
+
+	fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
+	memset(&rx, 0, sizeof(rx));
+	rx.skb = skb;
+	rx.local = local;
+
+	if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
+		local->dot11ReceivedFragmentCount++;
+
+	if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) ||
+		     test_bit(SCAN_SW_SCANNING, &local->scanning)))
+		status->rx_flags |= IEEE80211_RX_IN_SCAN;
+
+	/*
+	 * Management frames are linearized completely; for all other
+	 * frames only the 802.11 header has to be in the linear area.
+	 */
+	if (ieee80211_is_mgmt(fc))
+		err = skb_linearize(skb);
+	else
+		err = !pskb_may_pull(skb, ieee80211_hdrlen(fc));
+
+	if (err) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	/* taken only now, after the skb data may have been reallocated */
+	hdr = (struct ieee80211_hdr *)skb->data;
+	ieee80211_parse_qos(&rx);
+	ieee80211_verify_alignment(&rx);
+
+	if (ieee80211_is_data(fc)) {
+		prev_sta = NULL;
+
+		/*
+		 * Walk all station entries for the transmitter address.
+		 * Processing lags one entry behind so that the last entry
+		 * can consume the skb instead of a copy.
+		 */
+		for_each_sta_info(local, hdr->addr2, sta, tmp) {
+			if (!prev_sta) {
+				prev_sta = sta;
+				continue;
+			}
+
+			rx.sta = prev_sta;
+			rx.sdata = prev_sta->sdata;
+			ieee80211_prepare_and_rx_handle(&rx, skb, false);
+
+			prev_sta = sta;
+		}
+
+		if (prev_sta) {
+			rx.sta = prev_sta;
+			rx.sdata = prev_sta->sdata;
+
+			if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+				return;
+			/* a station was found but did not take the frame */
+			goto out;
+		}
+	}
+
+	prev = NULL;
+
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		/* monitor and AP_VLAN interfaces are not served from here */
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+		    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			continue;
+
+		/*
+		 * frame is destined for this interface, but if it's
+		 * not also for the previous one we handle that after
+		 * the loop to avoid copying the SKB once too much
+		 */
+
+		if (!prev) {
+			prev = sdata;
+			continue;
+		}
+
+		rx.sta = sta_info_get_bss(prev, hdr->addr2);
+		rx.sdata = prev;
+		ieee80211_prepare_and_rx_handle(&rx, skb, false);
+
+		prev = sdata;
+	}
+
+	if (prev) {
+		rx.sta = sta_info_get_bss(prev, hdr->addr2);
+		rx.sdata = prev;
+
+		if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+			return;
+	}
+
+ out:
+	/* nobody consumed the frame */
+	dev_kfree_skb(skb);
+}
+
+/*
+ * This is the receive path handler. It is called by a low level driver when an
+ * 802.11 MPDU is received from the hardware.
+ *
+ * The RX status stored in the skb's control block is validated (band and
+ * rate index), the frame is passed through ieee80211_rx_monitor() and, if
+ * that hands it back, on to __ieee80211_rx_handle_packet(). The skb is
+ * consumed in all cases. A warning is issued if the function is not
+ * called from softirq context.
+ */
+void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_rate *rate = NULL;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+
+	WARN_ON_ONCE(softirq_count() == 0);
+
+	/* the band is used as an array index below, so validate it first */
+	if (WARN_ON(status->band < 0 ||
+		    status->band >= IEEE80211_NUM_BANDS))
+		goto drop;
+
+	sband = local->hw.wiphy->bands[status->band];
+	if (WARN_ON(!sband))
+		goto drop;
+
+	/*
+	 * If we're suspending, it is possible although not too likely
+	 * that we'd be receiving frames after having already partially
+	 * quiesced the stack. We can't process such frames then since
+	 * that might, for example, cause stations to be added or other
+	 * driver callbacks be invoked.
+	 */
+	if (unlikely(local->quiescing || local->suspended))
+		goto drop;
+
+	/*
+	 * The same happens when we're not even started,
+	 * but that's worth a warning.
+	 */
+	if (WARN_ON(!local->started))
+		goto drop;
+
+	if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
+		/*
+		 * Validate the rate, unless a PLCP error means that
+		 * we probably can't have a valid rate here anyway.
+		 */
+
+		if (status->flag & RX_FLAG_HT) {
+			/*
+			 * rate_idx is MCS index, which can be [0-76]
+			 * as documented on:
+			 *
+			 * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
+			 *
+			 * Anything else would be some sort of driver or
+			 * hardware error. The driver should catch hardware
+			 * errors.
+			 */
+			if (WARN((status->rate_idx < 0 ||
+				 status->rate_idx > 76),
+				 "Rate marked as an HT rate but passed "
+				 "status->rate_idx is not "
+				 "an MCS index [0-76]: %d (0x%02x)\n",
+				 status->rate_idx,
+				 status->rate_idx))
+				goto drop;
+		} else {
+			/* legacy rate: must index into the band's rate table */
+			if (WARN_ON(status->rate_idx < 0 ||
+				    status->rate_idx >= sband->n_bitrates))
+				goto drop;
+			rate = &sband->bitrates[status->rate_idx];
+		}
+	}
+
+	/* internal flags start out clean for every received frame */
+	status->rx_flags = 0;
+
+	/*
+	 * key references and virtual interfaces are protected using RCU
+	 * and this requires that we are in a read-side RCU section during
+	 * receive processing
+	 */
+	rcu_read_lock();
+
+	/*
+	 * Frames with failed FCS/PLCP checksum are not returned,
+	 * all other frames are returned without radiotap header
+	 * if it was previously present.
+	 * Also, frames with less than 16 bytes are dropped.
+	 */
+	skb = ieee80211_rx_monitor(local, skb, rate);
+	if (!skb) {
+		rcu_read_unlock();
+		return;
+	}
+
+	ieee80211_tpt_led_trig_rx(local,
+			((struct ieee80211_hdr *)skb->data)->frame_control,
+			skb->len);
+	__ieee80211_rx_handle_packet(hw, skb);
+
+	rcu_read_unlock();
+
+	return;
+ drop:
+	/* only reached before rcu_read_lock() has been taken */
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_rx);
+
+/*
+ * Variant of the rx handler that can be called from hard irq context:
+ * the skb is tagged as an RX message, posted on the local skb queue and
+ * the tasklet is scheduled.
+ */
+void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct ieee80211_local *lcl;
+
+	/* struct ieee80211_rx_status must fit into the skb control block */
+	BUILD_BUG_ON(sizeof(struct ieee80211_rx_status) > sizeof(skb->cb));
+
+	lcl = hw_to_local(hw);
+
+	skb->pkt_type = IEEE80211_RX_MSG;
+	skb_queue_tail(&lcl->skb_queue, skb);
+	tasklet_schedule(&lcl->tasklet);
+}
+EXPORT_SYMBOL(ieee80211_rx_irqsafe);
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
new file mode 100644
index 00000000..669d2e32
--- /dev/null
+++ b/net/mac80211/scan.c
@@ -0,0 +1,977 @@
+/*
+ * Scanning implementation
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/pm_qos_params.h>
+#include <net/sch_generic.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "mesh.h"
+
+#define IEEE80211_PROBE_DELAY (HZ / 33)
+#define IEEE80211_CHANNEL_TIME (HZ / 33)
+#define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 8)
+
+/* Look up a BSS in cfg80211 by BSSID/SSID on the channel at @freq and return
+ * its mac80211 private area, or NULL when cfg80211_get_bss() finds nothing. */
+struct ieee80211_bss *
+ieee80211_rx_bss_get(struct ieee80211_local *local, u8 *bssid, int freq,
+		     u8 *ssid, u8 ssid_len)
+{
+	struct wiphy *wiphy = local->hw.wiphy;
+	struct ieee80211_channel *chan = ieee80211_get_channel(wiphy, freq);
+	struct cfg80211_bss *found;
+
+	found = cfg80211_get_bss(wiphy, chan, bssid, ssid, ssid_len, 0, 0);
+	return found ? (void *)found->priv : NULL;
+}
+
+/* free_priv hook: release the mesh ID and mesh config of a BSS entry. */
+static void ieee80211_rx_bss_free(struct cfg80211_bss *cbss)
+{
+	struct ieee80211_bss *entry = (struct ieee80211_bss *)cbss->priv;
+	kfree(bss_mesh_cfg(entry));
+	kfree(bss_mesh_id(entry));
+}
+
+/* Pass the cfg80211_bss embedding @bss to cfg80211_put_bss(); NULL is a no-op. */
+void ieee80211_rx_bss_put(struct ieee80211_local *local,
+			  struct ieee80211_bss *bss)
+{
+	if (bss)
+		cfg80211_put_bss(container_of((void *)bss,
+					      struct cfg80211_bss, priv));
+}
+
+/* True if a valid WMM info element, or failing that a valid WMM parameter
+ * element, has IEEE80211_WMM_IE_AP_QOSINFO_UAPSD set in its QoS info octet. */
+static bool is_uapsd_supported(struct ieee802_11_elems *elems)
+{
+	const u8 *ie;
+
+	if (elems->wmm_info && elems->wmm_info_len == 7 &&
+	    elems->wmm_info[5] == 1)
+		ie = elems->wmm_info;
+	else if (elems->wmm_param && elems->wmm_param_len == 24 &&
+		 elems->wmm_param[5] == 1)
+		ie = elems->wmm_param;
+	else
+		return false;	/* no valid wmm information or parameter element */
+	return ie[6] & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD;
+}
+
+/* Hand a received beacon/probe response to cfg80211 and refresh mac80211's
+ * private BSS data from the parsed elements.  Returns that private data, or
+ * NULL if cfg80211_inform_bss_frame() returned no entry. */
+struct ieee80211_bss *
+ieee80211_bss_info_update(struct ieee80211_local *local,
+			  struct ieee80211_rx_status *rx_status,
+			  struct ieee80211_mgmt *mgmt, size_t len,
+			  struct ieee802_11_elems *elems,
+			  struct ieee80211_channel *channel, bool beacon)
+{
+	struct cfg80211_bss *cbss;
+	struct ieee80211_bss *bss;
+	int clen, srlen;
+	s32 signal = 0;
+
+	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
+		signal = rx_status->signal * 100;
+	else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
+		signal = (rx_status->signal * 100) / local->hw.max_signal;
+
+	cbss = cfg80211_inform_bss_frame(local->hw.wiphy, channel,
+					 mgmt, len, signal, GFP_ATOMIC);
+
+	if (!cbss)
+		return NULL;
+
+	cbss->free_priv = ieee80211_rx_bss_free;
+	bss = (void *)cbss->priv;
+
+	/* save the ERP value so that it is available at association time */
+	if (elems->erp_info && elems->erp_info_len >= 1) {
+		bss->erp_value = elems->erp_info[0];
+		bss->has_erp_value = 1;
+	}
+
+	/* a TIM IE too short for struct ieee80211_tim_ie counts as invalid */
+	if (elems->tim && elems->tim_len >= sizeof(struct ieee80211_tim_ie))
+		bss->dtim_period =
+			((struct ieee80211_tim_ie *)elems->tim)->dtim_period;
+
+	/* If the beacon had no TIM IE, or it was invalid, use 1 */
+	if (beacon && !bss->dtim_period)
+		bss->dtim_period = 1;
+
+	/* replace old supported rates if we get new values */
+	srlen = 0;
+	if (elems->supp_rates) {
+		clen = IEEE80211_MAX_SUPP_RATES;
+		if (clen > elems->supp_rates_len)
+			clen = elems->supp_rates_len;
+		memcpy(bss->supp_rates, elems->supp_rates, clen);
+		srlen += clen;
+	}
+	if (elems->ext_supp_rates) {
+		clen = IEEE80211_MAX_SUPP_RATES - srlen;
+		if (clen > elems->ext_supp_rates_len)
+			clen = elems->ext_supp_rates_len;
+		memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen);
+		srlen += clen;
+	}
+	if (srlen)
+		bss->supp_rates_len = srlen;
+
+	bss->wmm_used = elems->wmm_param || elems->wmm_info;
+	bss->uapsd_supported = is_uapsd_supported(elems);
+
+	if (!beacon)
+		bss->last_probe_resp = jiffies;
+
+	return bss;
+}
+
+ieee80211_rx_result
+ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
+{
+	struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_mgmt *mgmt;
+	struct ieee80211_bss *bss;
+	u8 *elements;
+	struct ieee80211_channel *channel;
+	size_t baselen;
+	int freq;
+	__le16 fc;
+	bool presp, beacon = false;
+	struct ieee802_11_elems elems;
+	/* need at least the two frame control bytes */
+	if (skb->len < 2)
+		return RX_DROP_UNUSABLE;
+
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+	fc = mgmt->frame_control;
+	/* control frames are not handled by the scan code */
+	if (ieee80211_is_ctl(fc))
+		return RX_CONTINUE;
+
+	if (skb->len < 24)
+		return RX_CONTINUE;
+
+	presp = ieee80211_is_probe_resp(fc);
+	if (presp) {
+		/* ignore ProbeResp to foreign address */
+		if (memcmp(mgmt->da, sdata->vif.addr, ETH_ALEN))
+			return RX_DROP_MONITOR;
+
+		presp = true;
+		elements = mgmt->u.probe_resp.variable;
+		baselen = offsetof(struct ieee80211_mgmt, u.probe_resp.variable);
+	} else {
+		beacon = ieee80211_is_beacon(fc);
+		baselen = offsetof(struct ieee80211_mgmt, u.beacon.variable);
+		elements = mgmt->u.beacon.variable;
+	}
+	/* anything but a probe response or a beacon is not for us */
+	if (!presp && !beacon)
+		return RX_CONTINUE;
+	/* the fixed fields in front of the elements must be present */
+	if (baselen > skb->len)
+		return RX_DROP_MONITOR;
+
+	ieee802_11_parse_elems(elements, skb->len - baselen, &elems);
+	/* prefer the channel named by the DS params IE over the rx frequency */
+	if (elems.ds_params && elems.ds_params_len == 1)
+		freq = ieee80211_channel_to_frequency(elems.ds_params[0],
+						      rx_status->band);
+	else
+		freq = rx_status->freq;
+
+	channel = ieee80211_get_channel(sdata->local->hw.wiphy, freq);
+
+	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
+		return RX_DROP_MONITOR;
+	/* record the BSS; the entry handed back is put again right away */
+	bss = ieee80211_bss_info_update(sdata->local, rx_status,
+					mgmt, skb->len, &elems,
+					channel, beacon);
+	if (bss)
+		ieee80211_rx_bss_put(sdata->local, bss);
+
+	/* If we are on-operating-channel, and this packet is for the
+	 * current channel, pass the pkt on up the stack so that
+	 * the rest of the stack can make use of it.
+	 */
+	if (ieee80211_cfg_on_oper_channel(sdata->local)
+	    && (channel == sdata->local->oper_channel))
+		return RX_CONTINUE;
+	/* otherwise the frame is consumed here */
+	dev_kfree_skb(skb);
+	return RX_QUEUED;
+}
+
+/*
+ * Fill local->hw_scan_req with the requested channels of the next band that
+ * has any, and build its IEs with ieee80211_build_preq_ies() for that band.
+ * Returns false once every band has been handed out (no more work).
+ */
+static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
+{
+	struct cfg80211_scan_request *user_req = local->scan_req;
+	struct cfg80211_scan_request *hw_req = local->hw_scan_req;
+	enum ieee80211_band band;
+	int idx, count;
+
+	do {
+		if (local->hw_scan_band == IEEE80211_NUM_BANDS)
+			return false;
+
+		/* collect the requested channels that belong to this band */
+		band = local->hw_scan_band++;
+		count = 0;
+		for (idx = 0; idx < user_req->n_channels; idx++)
+			if (user_req->channels[idx]->band == band)
+				hw_req->channels[count++] =
+						user_req->channels[idx];
+	} while (!count);
+
+	hw_req->n_channels = count;
+	hw_req->ie_len = ieee80211_build_preq_ies(local, (u8 *)hw_req->ie,
+						  user_req->ie,
+						  user_req->ie_len, band,
+						  (u32) -1, 0);
+
+	return true;
+}
+
+static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
+				       bool was_hw_scan)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	bool on_oper_chan;
+	bool enable_beacons = false;
+	/* finish or abort the current scan; the caller holds local->mtx */
+	lockdep_assert_held(&local->mtx);
+
+	/*
+	 * It's ok to abort a not-yet-running scan (that
+	 * we have one at all will be verified by checking
+	 * local->scan_req next), but not to complete it
+	 * successfully.
+	 */
+	if (WARN_ON(!local->scanning && !aborted))
+		aborted = true;
+
+	if (WARN_ON(!local->scan_req))
+		return;
+	/* a hw scan runs band by band: hand the next band to the driver */
+	if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) {
+		int rc = drv_hw_scan(local, local->scan_sdata, local->hw_scan_req);
+		if (rc == 0)
+			return;
+	}
+	/* no more work for the driver: tear the scan state down */
+	kfree(local->hw_scan_req);
+	local->hw_scan_req = NULL;
+	/* internal scan requests are not reported to cfg80211 */
+	if (local->scan_req != local->int_scan_req)
+		cfg80211_scan_done(local->scan_req, aborted);
+	local->scan_req = NULL;
+	local->scan_sdata = NULL;
+
+	local->scanning = 0;
+	local->scan_channel = NULL;
+
+	on_oper_chan = ieee80211_cfg_on_oper_channel(local);
+
+	if (was_hw_scan || !on_oper_chan)
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+	else
+		/* Set power back to normal operating levels. */
+		ieee80211_hw_config(local, 0);
+
+	if (!was_hw_scan) {
+		bool on_oper_chan2;
+		ieee80211_configure_filter(local);
+		drv_sw_scan_complete(local);
+		on_oper_chan2 = ieee80211_cfg_on_oper_channel(local);
+		/* We should always be on-channel at this point. */
+		WARN_ON(!on_oper_chan2);
+		if (on_oper_chan2 && (on_oper_chan != on_oper_chan2))
+			enable_beacons = true;
+
+		ieee80211_offchannel_return(local, enable_beacons, true);
+	}
+
+	ieee80211_recalc_idle(local);
+	/* notify the MLME, IBSS and mesh code, then kick the work queue */
+	ieee80211_mlme_notify_scan_completed(local);
+	ieee80211_ibss_notify_scan_completed(local);
+	ieee80211_mesh_notify_scan_completed(local);
+	ieee80211_queue_work(&local->hw, &local->work_work);
+}
+
+void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	trace_api_scan_completed(local, aborted);
+	/* flag the result for ieee80211_scan_work(), queued with no delay */
+	set_bit(SCAN_COMPLETED, &local->scanning);
+	if (aborted)
+		set_bit(SCAN_ABORTED, &local->scanning);
+	ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+}
+EXPORT_SYMBOL(ieee80211_scan_completed);
+
+static int ieee80211_start_sw_scan(struct ieee80211_local *local)
+{
+	/*
+	 * Hardware/driver doesn't support hw_scan, so use software
+	 * scanning instead. First send a nullfunc frame with power save
+	 * bit on so that AP will buffer the frames for us while we are not
+	 * listening, then send probe requests to each channel and wait for
+	 * the responses. After all channels are scanned, tune back to the
+	 * original channel and send a nullfunc frame with power save bit
+	 * off to trigger the AP to send us all the buffered frames.
+	 *
+	 * Note that while SCAN_SW_SCANNING is set in local->scanning,
+	 * everything else but nullfunc frames and probe requests will be
+	 * dropped in ieee80211_tx_h_check_assoc().
+	 */
+	drv_sw_scan_start(local);
+	/* restart the scan state machine at the first requested channel */
+	local->leave_oper_channel_time = 0;
+	local->next_scan_state = SCAN_DECISION;
+	local->scan_channel_idx = 0;
+
+	/* We always want to use off-channel PS, even if we
+	 * are not really leaving oper-channel.  Don't
+	 * tell the AP though, as long as we are on-channel.
+	 */
+	ieee80211_offchannel_enable_all_ps(local, false);
+
+	ieee80211_configure_filter(local);
+
+	/* We need to set power level at maximum rate for scanning. */
+	ieee80211_hw_config(local, 0);
+	/* ieee80211_scan_work() runs the first state after the delay */
+	ieee80211_queue_delayed_work(&local->hw,
+				     &local->scan_work,
+				     IEEE80211_CHANNEL_TIME);
+
+	return 0;
+}
+
+/* Start scan @req on @sdata, or park it while work is pending (mtx held). */
+static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
+				  struct cfg80211_scan_request *req)
+{
+	struct ieee80211_local *local = sdata->local;
+	int rc;
+
+	lockdep_assert_held(&local->mtx);
+
+	if (local->scan_req)
+		return -EBUSY;
+
+	if (!list_empty(&local->work_list)) {
+		/* wait for the work to finish/time out */
+		local->scan_req = req;
+		local->scan_sdata = sdata;
+		return 0;
+	}
+
+	if (local->ops->hw_scan) {
+		u8 *ies;
+
+		local->hw_scan_req = kmalloc(
+				sizeof(*local->hw_scan_req) +
+				req->n_channels * sizeof(req->channels[0]) +
+				2 + IEEE80211_MAX_SSID_LEN + local->scan_ies_len +
+				req->ie_len, GFP_KERNEL);
+		if (!local->hw_scan_req)
+			return -ENOMEM;
+
+		local->hw_scan_req->ssids = req->ssids;
+		local->hw_scan_req->n_ssids = req->n_ssids;
+		ies = (u8 *)local->hw_scan_req +
+			sizeof(*local->hw_scan_req) +
+			req->n_channels * sizeof(req->channels[0]);
+		local->hw_scan_req->ie = ies;
+
+		local->hw_scan_band = 0;
+
+		/*
+		 * After allocating local->hw_scan_req, we must
+		 * go through until ieee80211_prep_hw_scan(), so
+		 * anything that might make this function return
+		 * early must not be placed after this
+		 * allocation.
+		 */
+	}
+
+	local->scan_req = req;
+	local->scan_sdata = sdata;
+	/* record which kind of scan is now running */
+	if (local->ops->hw_scan)
+		__set_bit(SCAN_HW_SCANNING, &local->scanning);
+	else
+		__set_bit(SCAN_SW_SCANNING, &local->scanning);
+
+	ieee80211_recalc_idle(local);
+	/* hand the request to the driver or start the software scan */
+	if (local->ops->hw_scan) {
+		WARN_ON(!ieee80211_prep_hw_scan(local));
+		rc = drv_hw_scan(local, sdata, local->hw_scan_req);
+	} else
+		rc = ieee80211_start_sw_scan(local);
+	/* undo everything if the scan could not be started */
+	if (rc) {
+		kfree(local->hw_scan_req);
+		local->hw_scan_req = NULL;
+		local->scanning = 0;
+
+		ieee80211_recalc_idle(local);
+
+		local->scan_req = NULL;
+		local->scan_sdata = NULL;
+	}
+
+	return rc;
+}
+
+/* Estimated time, in jiffies, needed to scan @chan.
+ * TODO: channel switching also consumes quite some time,
+ * add that delay as well to get a better estimation
+ */
+static unsigned long
+ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
+{
+	return (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) ?
+		IEEE80211_PASSIVE_CHANNEL_TIME :
+		IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
+}
+
+static void ieee80211_scan_state_decision(struct ieee80211_local *local,
+					  unsigned long *next_delay)
+{
+	bool associated = false;
+	bool tx_empty = true;
+	bool bad_latency;
+	bool listen_int_exceeded;
+	unsigned long min_beacon_int = 0;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_channel *next_chan;
+
+	/*
+	 * check if at least one STA interface is associated,
+	 * check if at least one STA interface has pending tx frames
+	 * and grab the lowest used beacon interval
+	 */
+	mutex_lock(&local->iflist_mtx);
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+			if (sdata->u.mgd.associated) {
+				associated = true;
+
+				if (sdata->vif.bss_conf.beacon_int <
+				    min_beacon_int || min_beacon_int == 0)
+					min_beacon_int =
+						sdata->vif.bss_conf.beacon_int;
+
+				if (!qdisc_all_tx_empty(sdata->dev)) {
+					tx_empty = false;
+					break;
+				}
+			}
+		}
+	}
+	mutex_unlock(&local->iflist_mtx);
+	/* SCAN_DECISION: pick the next state for the channel scanned next */
+	next_chan = local->scan_req->channels[local->scan_channel_idx];
+
+	if (ieee80211_cfg_on_oper_channel(local)) {
+		/* We're currently on operating channel. */
+		if (next_chan == local->oper_channel)
+			/* We don't need to move off of operating channel. */
+			local->next_scan_state = SCAN_SET_CHANNEL;
+		else
+			/*
+			 * We do need to leave operating channel, as next
+			 * scan is somewhere else.
+			 */
+			local->next_scan_state = SCAN_LEAVE_OPER_CHANNEL;
+	} else {
+		/*
+		 * we're currently scanning a different channel, let's
+		 * see if we can scan another channel without interfering
+		 * with the current traffic situation.
+		 *
+		 * Since we don't know if the AP has pending frames for us
+		 * we can only check for our tx queues and use the current
+		 * pm_qos requirements for rx. Hence, if no tx traffic occurs
+		 * at all we will scan as many channels in a row as the pm_qos
+		 * latency allows us to. Additionally we also check for the
+		 * currently negotiated listen interval to prevent losing
+		 * frames unnecessarily.
+		 *
+		 * Otherwise switch back to the operating channel.
+		 */
+
+		bad_latency = time_after(jiffies +
+				ieee80211_scan_get_channel_time(next_chan),
+				local->leave_oper_channel_time +
+				usecs_to_jiffies(pm_qos_request(PM_QOS_NETWORK_LATENCY)));
+
+		listen_int_exceeded = time_after(jiffies +
+				ieee80211_scan_get_channel_time(next_chan),
+				local->leave_oper_channel_time +
+				usecs_to_jiffies(min_beacon_int * 1024) *
+				local->hw.conf.listen_interval);
+
+		if (associated && ( !tx_empty || bad_latency ||
+		    listen_int_exceeded))
+			local->next_scan_state = SCAN_ENTER_OPER_CHANNEL;
+		else
+			local->next_scan_state = SCAN_SET_CHANNEL;
+	}
+	/* the chosen state is to be run without delay */
+	*next_delay = 0;
+}
+
+/*
+ * SCAN_LEAVE_OPER_CHANNEL: stop the vifs and flush the driver before the
+ * scan moves off the operating channel, then continue with SCAN_SET_CHANNEL.
+ */
+static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local,
+						    unsigned long *next_delay)
+{
+	/* PS will already be in off-channel mode,
+	 * we do that once at the beginning of scanning.
+	 */
+	ieee80211_offchannel_stop_vifs(local, false);
+
+	/*
+	 * What if the nullfunc frames didn't arrive?
+	 * Without a driver flush op, wait HZ / 10 before going on.
+	 */
+	drv_flush(local, false);
+	*next_delay = local->ops->flush ? 0 : HZ / 10;
+
+	/* remember when we left, then advance to the next channel */
+	local->leave_oper_channel_time = jiffies;
+	local->next_scan_state = SCAN_SET_CHANNEL;
+}
+
+static void ieee80211_scan_state_enter_oper_channel(struct ieee80211_local *local,
+						    unsigned long *next_delay)
+{
+	/* switch back to the operating channel */
+	local->scan_channel = NULL;
+	if (!ieee80211_cfg_on_oper_channel(local))
+		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+
+	/*
+	 * Re-enable vifs and beaconing.  Leave PS in the
+	 * off-channel state; it is put back on-channel
+	 * once, at the end of scanning.
+	 */
+	ieee80211_offchannel_return(local, true, false);
+	/* wait HZ / 5 here, then decide about the next channel */
+	*next_delay = HZ / 5;
+	local->next_scan_state = SCAN_DECISION;
+}
+
+/* SCAN_SET_CHANNEL: tune to the next channel of the scan request and choose
+ * the follow-up state: SCAN_SEND_PROBE for an active scan, SCAN_DECISION
+ * after a passive dwell or, with the delay untouched, for a skipped channel.
+ */
+static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
+					     unsigned long *next_delay)
+{
+	struct ieee80211_channel *chan;
+	bool skip_chan = false;
+
+	chan = local->scan_req->channels[local->scan_channel_idx];
+	local->scan_channel = chan;
+
+	/* Only call hw-config if we really need to change channels. */
+	if (chan != local->hw.conf.channel &&
+	    ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL))
+		skip_chan = true;
+
+	/* advance state machine to next channel/band */
+	local->scan_channel_idx++;
+
+	/* a skipped channel and a passive scan both go back to the decision */
+	local->next_scan_state = SCAN_DECISION;
+	if (skip_chan)
+		return;
+
+	/*
+	 * Probe delay is used to update the NAV, cf. 11.1.3.2.2
+	 * (which unfortunately doesn't say _why_ step a) is done,
+	 * but it waits for the probe delay or until a frame is
+	 * received - and the received frame would update the NAV).
+	 * For now, we do not support waiting until a frame is
+	 * received.
+	 *
+	 * In any case, it is not necessary for a passive scan.
+	 */
+	if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN ||
+	    !local->scan_req->n_ssids) {
+		*next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
+		return;
+	}
+
+	/* active scan, send probes */
+	*next_delay = IEEE80211_PROBE_DELAY;
+	local->next_scan_state = SCAN_SEND_PROBE;
+}
+
+/* SCAN_SEND_PROBE: send one probe request per SSID of the scan request,
+ * then wait IEEE80211_CHANNEL_TIME before returning to SCAN_DECISION. */
+static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
+					    unsigned long *next_delay)
+{
+	struct cfg80211_scan_request *req = local->scan_req;
+	int idx = 0;
+
+	while (idx < req->n_ssids) {
+		ieee80211_send_probe_req(local->scan_sdata, NULL,
+					 req->ssids[idx].ssid,
+					 req->ssids[idx].ssid_len,
+					 req->ie, req->ie_len);
+		idx++;
+	}
+
+	/* After sending probe requests, wait for responses on the channel. */
+	*next_delay = IEEE80211_CHANNEL_TIME;
+	local->next_scan_state = SCAN_DECISION;
+}
+
+void ieee80211_scan_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, scan_work.work);
+	struct ieee80211_sub_if_data *sdata;
+	unsigned long next_delay = 0;
+	bool aborted, hw_scan;
+
+	mutex_lock(&local->mtx);
+
+	sdata = local->scan_sdata;
+	/* ieee80211_scan_completed() was called: finish the scan */
+	if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) {
+		aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning);
+		goto out_complete;
+	}
+
+	if (!sdata || !local->scan_req)
+		goto out;
+	/* a request parked by __ieee80211_start_scan() is started now */
+	if (local->scan_req && !local->scanning) {
+		struct cfg80211_scan_request *req = local->scan_req;
+		int rc;
+
+		local->scan_req = NULL;
+		local->scan_sdata = NULL;
+
+		rc = __ieee80211_start_scan(sdata, req);
+		if (rc) {
+			/* need to complete scan in cfg80211 */
+			local->scan_req = req;
+			aborted = true;
+			goto out_complete;
+		} else
+			goto out;
+	}
+
+	/*
+	 * Avoid re-scheduling when the sdata is going away.
+	 */
+	if (!ieee80211_sdata_running(sdata)) {
+		aborted = true;
+		goto out_complete;
+	}
+
+	/*
+	 * as long as no delay is required advance immediately
+	 * without scheduling a new work
+	 */
+	do {
+		if (!ieee80211_sdata_running(sdata)) {
+			aborted = true;
+			goto out_complete;
+		}
+
+		switch (local->next_scan_state) {
+		case SCAN_DECISION:
+			/* if no more bands/channels left, complete scan */
+			if (local->scan_channel_idx >= local->scan_req->n_channels) {
+				aborted = false;
+				goto out_complete;
+			}
+			ieee80211_scan_state_decision(local, &next_delay);
+			break;
+		case SCAN_SET_CHANNEL:
+			ieee80211_scan_state_set_channel(local, &next_delay);
+			break;
+		case SCAN_SEND_PROBE:
+			ieee80211_scan_state_send_probe(local, &next_delay);
+			break;
+		case SCAN_LEAVE_OPER_CHANNEL:
+			ieee80211_scan_state_leave_oper_channel(local, &next_delay);
+			break;
+		case SCAN_ENTER_OPER_CHANNEL:
+			ieee80211_scan_state_enter_oper_channel(local, &next_delay);
+			break;
+		}
+	} while (next_delay == 0);
+	/* the last state asked for a delay: run again once it has passed */
+	ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay);
+	goto out;
+
+out_complete:
+	hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
+	__ieee80211_scan_completed(&local->hw, aborted, hw_scan);
+out:
+	mutex_unlock(&local->mtx);
+}
+
+/* Start the scan @req on @sdata, taking local->mtx around the attempt. */
+int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
+			   struct cfg80211_scan_request *req)
+{
+	int rc;
+
+	mutex_lock(&sdata->local->mtx);
+	rc = __ieee80211_start_scan(sdata, req);
+	mutex_unlock(&sdata->local->mtx);
+	return rc;
+}
+
+/* Scan for @ssid with the preallocated internal request, on @chan if given,
+ * else on every channel of every band.  Returns -EBUSY if a scan request is
+ * already pending, -EINVAL if @ssid_len exceeds IEEE80211_MAX_SSID_LEN. */
+int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
+				    const u8 *ssid, u8 ssid_len,
+				    struct ieee80211_channel *chan)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct cfg80211_scan_request *req;
+	enum ieee80211_band band;
+	int ret = -EBUSY, i, nchan = 0;
+
+	/* @ssid holds only @ssid_len bytes; never copy more than that */
+	if (ssid_len > IEEE80211_MAX_SSID_LEN)
+		return -EINVAL;
+
+	mutex_lock(&local->mtx);
+	if (local->scan_req)	/* busy scanning */
+		goto unlock;
+
+	/* fill internal scan request */
+	req = local->int_scan_req;
+	if (!chan) {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+			struct ieee80211_supported_band *sband =
+				local->hw.wiphy->bands[band];
+
+			for (i = 0; sband && i < sband->n_channels; i++)
+				req->channels[nchan++] = &sband->channels[i];
+		}
+		req->n_channels = nchan;
+	} else {
+		req->channels[0] = chan;
+		req->n_channels = 1;
+	}
+
+	req->ssids = &local->scan_ssid;
+	req->n_ssids = 1;
+	memcpy(req->ssids[0].ssid, ssid, ssid_len);
+	req->ssids[0].ssid_len = ssid_len;
+
+	ret = __ieee80211_start_scan(sdata, req);
+ unlock:
+	mutex_unlock(&local->mtx);
+	return ret;
+}
+
+/*
+ * Only call this function when a scan can't be queued -- under RTNL.
+ */
+void ieee80211_scan_cancel(struct ieee80211_local *local)
+{
+	bool abortscan;
+
+	/*
+	 * We only cancel a software scan, or a deferred scan that was not
+	 * yet really started (see __ieee80211_start_scan()).
+	 *
+	 * Regarding hardware scan:
+	 * - we cannot call __ieee80211_scan_completed(): when the
+	 *   SCAN_HW_SCANNING bit is set, that function changes
+	 *   local->hw_scan_req to operate on the next band, which races
+	 *   with the driver, which can use local->hw_scan_req
+	 *
+	 * - we cannot cancel scan_work since the driver can schedule it
+	 *   via ieee80211_scan_completed(..., true) to finish the scan
+	 *
+	 * Hence the low level driver is responsible for canceling HW scan.
+	 */
+
+	mutex_lock(&local->mtx);
+	abortscan = local->scan_req && !test_bit(SCAN_HW_SCANNING, &local->scanning);
+	if (abortscan) {
+		/*
+		 * The scan is canceled, but stop work from being pending.
+		 *
+		 * If the work is currently running, it must be blocked on
+		 * the mutex, but we'll set scan_sdata = NULL and it'll
+		 * simply exit once it acquires the mutex.
+		 */
+		cancel_delayed_work(&local->scan_work);
+		/* and clean up */
+		__ieee80211_scan_completed(&local->hw, true, false);
+	}
+	mutex_unlock(&local->mtx);
+}
+
+/*
+ * Start a scheduled scan: fill one IE buffer per band with
+ * ieee80211_build_preq_ies() and hand them, with @req, to the driver.
+ * Returns 0 on success, -EBUSY if one is already running, -ENOTSUPP without
+ * a driver op, -ENOMEM, else the error from drv_sched_scan_start().
+ */
+int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
+				       struct cfg80211_sched_scan_request *req)
+{
+	struct ieee80211_local *local = sdata->local;
+	int band, ret;
+
+	mutex_lock(&local->mtx);
+	ret = -EBUSY;
+	if (local->sched_scanning)
+		goto out;
+
+	ret = -ENOTSUPP;
+	if (!local->ops->sched_scan_start)
+		goto out;
+
+	ret = -ENOMEM;
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		u8 *buf = kzalloc(2 + IEEE80211_MAX_SSID_LEN +
+				  local->scan_ies_len + req->ie_len,
+				  GFP_KERNEL);
+
+		local->sched_scan_ies.ie[band] = buf;
+		if (!buf)
+			goto out_free;
+
+		local->sched_scan_ies.len[band] =
+			ieee80211_build_preq_ies(local, buf, req->ie,
+						 req->ie_len, band,
+						 (u32) -1, 0);
+	}
+
+	ret = drv_sched_scan_start(local, sdata, req, &local->sched_scan_ies);
+	if (!ret) {
+		local->sched_scanning = true;
+		goto out;
+	}
+
+out_free:
+	while (band > 0)
+		kfree(local->sched_scan_ies.ie[--band]);
+out:
+	mutex_unlock(&local->mtx);
+	return ret;
+}
+
+/* Stop a running scheduled scan: free the per-band IE buffers and call the
+ * driver.  Returns -ENOTSUPP if the driver has no sched_scan_stop op, else 0
+ * (also when no scheduled scan was running). */
+int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	int band, ret = -ENOTSUPP;
+
+	mutex_lock(&local->mtx);
+	if (!local->ops->sched_scan_stop)
+		goto out;
+
+	ret = 0;
+	if (local->sched_scanning) {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+			kfree(local->sched_scan_ies.ie[band]);
+
+		drv_sched_scan_stop(local, sdata);
+		local->sched_scanning = false;
+	}
+out:
+	mutex_unlock(&local->mtx);
+	return ret;
+}
+
+/* Exported to drivers: trace the call, then pass the scheduled scan
+ * results notification on to cfg80211 for this hardware's wiphy. */
+void ieee80211_sched_scan_results(struct ieee80211_hw *hw)
+{
+	trace_api_sched_scan_results(hw_to_local(hw));
+
+	cfg80211_sched_scan_results(hw->wiphy);
+}
+EXPORT_SYMBOL(ieee80211_sched_scan_results);
+
+/* Work item queued by ieee80211_sched_scan_stopped(): free the scheduled
+ * scan IE buffers, clear sched_scanning and notify cfg80211 (outside mtx). */
+void ieee80211_sched_scan_stopped_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local,
+			     sched_scan_stopped_work);
+	bool was_running;
+	int band;
+
+	mutex_lock(&local->mtx);
+	was_running = local->sched_scanning;
+	if (was_running) {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+			kfree(local->sched_scan_ies.ie[band]);
+		local->sched_scanning = false;
+	}
+	mutex_unlock(&local->mtx);
+
+	/* nothing to report if no scheduled scan was running */
+	if (was_running)
+		cfg80211_sched_scan_stopped(local->hw.wiphy);
+}
+
+void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	trace_api_sched_scan_stopped(local);
+	/* cleanup happens in ieee80211_sched_scan_stopped_work() */
+	ieee80211_queue_work(&local->hw, &local->sched_scan_stopped_work);
+}
+EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
new file mode 100644
index 00000000..7733f66e
--- /dev/null
+++ b/net/mac80211/spectmgmt.c
@@ -0,0 +1,86 @@
+/*
+ * spectrum management
+ *
+ * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007  Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2007-2008, Intel Corporation
+ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ieee80211.h>
+#include <net/cfg80211.h>
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+#include "wme.h"
+
+static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_msrment_ie *request_ie,
+					const u8 *da, const u8 *bssid,
+					u8 dialog_token)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *msr_report;
+	/* build and send a measurement report that refuses @request_ie */
+	skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom +
+				sizeof(struct ieee80211_msrment_ie));
+
+	if (!skb) {
+		printk(KERN_ERR "%s: failed to allocate buffer for "
+				"measurement report frame\n", sdata->name);
+		return;
+	}
+	/* zeroed 24 byte header: addresses plus an action frame control */
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	msr_report = (struct ieee80211_mgmt *)skb_put(skb, 24);
+	memset(msr_report, 0, 24);
+	memcpy(msr_report->da, da, ETH_ALEN);
+	memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(msr_report->bssid, bssid, ETH_ALEN);
+	msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						IEEE80211_STYPE_ACTION);
+	/* action body: spectrum management measurement report */
+	skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement));
+	msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
+	msr_report->u.action.u.measurement.action_code =
+				WLAN_ACTION_SPCT_MSR_RPRT;
+	msr_report->u.action.u.measurement.dialog_token = dialog_token;
+
+	msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
+	msr_report->u.action.u.measurement.length =
+			sizeof(struct ieee80211_msrment_ie);
+	/* echo the request's token and type, with the mode set to refused */
+	memset(&msr_report->u.action.u.measurement.msr_elem, 0,
+		sizeof(struct ieee80211_msrment_ie));
+	msr_report->u.action.u.measurement.msr_elem.token = request_ie->token;
+	msr_report->u.action.u.measurement.msr_elem.mode |=
+			IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;
+	msr_report->u.action.u.measurement.msr_elem.type = request_ie->type;
+
+	ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
+				       struct ieee80211_mgmt *mgmt,
+				       size_t len)
+{
+	/*
+	 * Ignoring a measurement request is a spec violation.
+	 * Mandatory measurements must be reported; optional
+	 * measurements might be refused or reported incapable.
+	 * For now just refuse every request (@len is not used).
+	 * TODO: Answer basic measurement as unmeasured
+	 */
+	ieee80211_send_refuse_measurement_request(sdata,
+			&mgmt->u.action.u.measurement.msr_elem,
+			mgmt->sa, mgmt->bssid,
+			mgmt->u.action.u.measurement.dialog_token);
+}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
new file mode 100644
index 00000000..3ff633e8
--- /dev/null
+++ b/net/mac80211/sta_info.c
@@ -0,0 +1,1005 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/timer.h>
+#include <linux/rtnetlink.h>
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "sta_info.h"
+#include "debugfs_sta.h"
+#include "mesh.h"
+
+/**
+ * DOC: STA information lifetime rules
+ *
+ * STA info structures (&struct sta_info) are managed in a hash table
+ * for faster lookup and a list for iteration. They are managed using
+ * RCU, i.e. access to the list and hash table is protected by RCU.
+ *
+ * Upon allocating a STA info structure with sta_info_alloc(), the caller
+ * owns that structure. It must then insert it into the hash table using
+ * either sta_info_insert() or sta_info_insert_rcu(); only in the latter
+ * case (which acquires an rcu read section but must not be called from
+ * within one) will the pointer still be valid after the call. Note that
+ * the caller may not do much with the STA info before inserting it, in
+ * particular, it may not start any mesh peer link management or add
+ * encryption keys.
+ *
+ * When the insertion fails (sta_info_insert() returns non-zero), the
+ * structure will have been freed by sta_info_insert()!
+ *
+ * Station entries are added by mac80211 when you establish a link with a
+ * peer. This means different things for the different types of interfaces
+ * we support. For a regular station this means we add the AP sta when we
+ * receive an association response from the AP. For IBSS this occurs when
+ * we get to know about a peer on the same IBSS. For WDS we add the sta for
+ * the peer immediately upon device open. When using AP mode we add stations
+ * for each respective station upon request from userspace through nl80211.
+ *
+ * In order to remove a STA info structure, various sta_info_destroy_*()
+ * calls are available.
+ *
+ * There is no concept of ownership on a STA entry, each structure is
+ * owned by the global hash table/list until it is removed. All users of
+ * the structure need to be RCU protected so that the structure won't be
+ * freed before they are done using it.
+ */
+
+/* Unlink @sta from its hash chain; caller must hold local->sta_lock */
+static int sta_info_hash_del(struct ieee80211_local *local,
+			     struct sta_info *sta)
+{
+	struct sta_info *s;
+
+	s = rcu_dereference_protected(local->sta_hash[STA_HASH(sta->sta.addr)],
+				      lockdep_is_held(&local->sta_lock));
+	if (!s)
+		return -ENOENT;	/* bucket is empty */
+	if (s == sta) {		/* @sta heads the chain: repoint the bucket */
+		rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)],
+				   s->hnext);
+		return 0;
+	}
+
+	while (rcu_access_pointer(s->hnext) &&	/* find @sta's predecessor */
+	       rcu_access_pointer(s->hnext) != sta)
+		s = rcu_dereference_protected(s->hnext,
+					lockdep_is_held(&local->sta_lock));
+	if (rcu_access_pointer(s->hnext)) {
+		rcu_assign_pointer(s->hnext, sta->hnext);
+		return 0;
+	}
+
+	return -ENOENT;	/* @sta is not on this chain */
+}
+
+/* Find the STA with @addr that sits on exactly @sdata; protected by RCU */
+struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
+			      const u8 *addr)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	for (sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
+					 rcu_read_lock_held() ||
+					 lockdep_is_held(&local->sta_lock) ||
+					 lockdep_is_held(&local->sta_mtx));
+	     sta;
+	     sta = rcu_dereference_check(sta->hnext,
+					 rcu_read_lock_held() ||
+					 lockdep_is_held(&local->sta_lock) ||
+					 lockdep_is_held(&local->sta_mtx))) {
+		if (sta->sdata == sdata &&
+		    memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
+			break;
+	}
+	return sta;
+}
+
+/*
+ * Like sta_info_get(), but also matches a station whose interface
+ * shares the same (non-NULL) BSS as @sdata, e.g. one of its VLANs.
+ */
+struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
+				  const u8 *addr)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+
+	for (sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
+					 rcu_read_lock_held() ||
+					 lockdep_is_held(&local->sta_lock) ||
+					 lockdep_is_held(&local->sta_mtx));
+	     sta;
+	     sta = rcu_dereference_check(sta->hnext,
+					 rcu_read_lock_held() ||
+					 lockdep_is_held(&local->sta_lock) ||
+					 lockdep_is_held(&local->sta_mtx))) {
+		if ((sta->sdata == sdata ||
+		     (sta->sdata->bss && sta->sdata->bss == sdata->bss)) &&
+		    memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
+			break;
+	}
+	return sta;
+}
+
+struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
+				     int idx)
+{
+	struct sta_info *entry;
+	int skipped = 0;
+
+	/*
+	 * Pick the idx-th (zero-based) list entry that belongs to @sdata.
+	 */
+	list_for_each_entry_rcu(entry, &sdata->local->sta_list, list) {
+		if (entry->sdata != sdata)
+			continue;
+		if (skipped >= idx)
+			return entry;
+		skipped++;
+	}
+
+	return NULL;
+}
+
+/**
+ * __sta_info_free - internal STA free helper
+ *
+ * @local: pointer to the global information
+ * @sta: STA info to free; must not be used after this call
+ *
+ * Drops the rate control state/reference (if any) and frees @sta; must
+ * undo everything sta_info_alloc() does before sta_info_insert().
+ */
+static void __sta_info_free(struct ieee80211_local *local,
+			    struct sta_info *sta)
+{
+	if (sta->rate_ctrl) {
+		rate_control_free_sta(sta);
+		rate_control_put(sta->rate_ctrl);
+	}
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+
+	kfree(sta);
+}
+
+/* Push @sta onto the head of its hash chain; caller holds local->sta_lock */
+static void sta_info_hash_add(struct ieee80211_local *local,
+			      struct sta_info *sta)
+{
+	sta->hnext = local->sta_hash[STA_HASH(sta->sta.addr)];
+	rcu_assign_pointer(local->sta_hash[STA_HASH(sta->sta.addr)], sta);
+}
+
+static void sta_unblock(struct work_struct *wk)
+{
+	struct sta_info *sta;
+
+	sta = container_of(wk, struct sta_info, drv_unblock_wk);
+
+	if (sta->dead)	/* set by __sta_info_destroy(); nothing to deliver */
+		return;
+
+	if (!test_sta_flags(sta, WLAN_STA_PS_STA))	/* not in power-save */
+		ieee80211_sta_ps_deliver_wakeup(sta);
+	else if (test_and_clear_sta_flags(sta, WLAN_STA_PSPOLL)) {
+		clear_sta_flags(sta, WLAN_STA_PS_DRIVER);
+		ieee80211_sta_ps_deliver_poll_response(sta); /* deferred poll */
+	} else
+		clear_sta_flags(sta, WLAN_STA_PS_DRIVER);
+}
+
+static int sta_prepare_rate_control(struct ieee80211_local *local,
+				    struct sta_info *sta, gfp_t gfp)
+{
+	if (local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)
+		return 0;
+
+	sta->rate_ctrl = rate_control_get(local->rate_ctrl);
+	sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl,
+						     &sta->sta, gfp);
+	if (sta->rate_ctrl_priv)
+		return 0;
+
+	/* no per-station state: undo rate_control_get() and give up */
+	rate_control_put(sta->rate_ctrl);
+	return -ENOMEM;
+}
+
+struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
+				u8 *addr, gfp_t gfp)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+	struct timespec uptime;
+	int i;
+
+	sta = kzalloc(sizeof(*sta) + local->hw.sta_data_size, gfp);
+	if (!sta)
+		return NULL;	/* out of memory */
+
+	spin_lock_init(&sta->lock);
+	spin_lock_init(&sta->flaglock);
+	INIT_WORK(&sta->drv_unblock_wk, sta_unblock);
+	INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
+	mutex_init(&sta->ampdu_mlme.mtx);
+
+	memcpy(sta->sta.addr, addr, ETH_ALEN);
+	sta->local = local;
+	sta->sdata = sdata;
+	sta->last_rx = jiffies;
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	sta->last_connected = uptime.tv_sec;	/* monotonic seconds */
+	ewma_init(&sta->avg_signal, 1024, 8);
+
+	if (sta_prepare_rate_control(local, sta, gfp)) {
+		kfree(sta);	/* helper already dropped its reference */
+		return NULL;
+	}
+
+	for (i = 0; i < STA_TID_NUM; i++) {
+		/*
+		 * timer_to_tid must be initialized with identity mapping
+		 * to enable session_timer's data differentiation. See
+		 * sta_rx_agg_session_timer_expired for usage.
+		 */
+		sta->timer_to_tid[i] = i;
+	}
+	skb_queue_head_init(&sta->ps_tx_buf);
+	skb_queue_head_init(&sta->tx_filtered);
+
+	for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
+		sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+
+#ifdef CONFIG_MAC80211_MESH
+	sta->plink_state = NL80211_PLINK_LISTEN;
+	init_timer(&sta->plink_timer);
+#endif
+
+	return sta;	/* caller owns @sta until it is inserted */
+}
+
+static int sta_info_finish_insert(struct sta_info *sta, bool async)
+{
+	struct ieee80211_local *local = sta->local;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct station_info sinfo;
+	unsigned long flags;
+	int err = 0;
+
+	lockdep_assert_held(&local->sta_mtx);
+
+	/* notify driver; AP_VLAN stations are added on the owning AP vif */
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data,
+				     u.ap);
+	err = drv_sta_add(local, sdata, &sta->sta);
+	if (err) {
+		if (!async)
+			return err;	/* sync insert: caller frees @sta */
+		printk(KERN_DEBUG "%s: failed to add IBSS STA %pM to driver (%d)"
+				  " - keeping it anyway.\n",
+		       sdata->name, sta->sta.addr, err);
+	} else {
+		sta->uploaded = true;
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+		if (async)
+			wiphy_debug(local->hw.wiphy,
+				    "Finished adding IBSS STA %pM\n",
+				    sta->sta.addr);
+#endif
+	}
+
+	sdata = sta->sdata;	/* back to the station's own interface */
+
+	if (!async) {	/* async STAs were hashed/counted at insert time */
+		local->num_sta++;
+		local->sta_generation++;
+		smp_mb();
+
+		/* make the station visible */
+		spin_lock_irqsave(&local->sta_lock, flags);
+		sta_info_hash_add(local, sta);
+		spin_unlock_irqrestore(&local->sta_lock, flags);
+	}
+
+	/* sta_list is walked under RCU, so publish with the RCU primitive */
+	list_add_rcu(&sta->list, &local->sta_list);
+
+	ieee80211_sta_debugfs_add(sta);
+	rate_control_add_sta_debugfs(sta);
+
+	memset(&sinfo, 0, sizeof(sinfo));
+	sinfo.filled = 0;
+	sinfo.generation = local->sta_generation;
+	cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+
+	return 0;
+}
+
+static void sta_info_finish_pending(struct ieee80211_local *local)
+{
+	struct sta_info *sta;
+	unsigned long flags;
+
+	spin_lock_irqsave(&local->sta_lock, flags);
+	while (!list_empty(&local->sta_pending_list)) {
+		sta = list_first_entry(&local->sta_pending_list,
+				       struct sta_info, list);
+		list_del(&sta->list);	/* off the pending list */
+		spin_unlock_irqrestore(&local->sta_lock, flags);
+
+		sta_info_finish_insert(sta, true); /* async: never fails */
+
+		spin_lock_irqsave(&local->sta_lock, flags);
+	}
+	spin_unlock_irqrestore(&local->sta_lock, flags);
+}
+
+static void sta_info_finish_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, sta_finish_work);
+
+	mutex_lock(&local->sta_mtx);	/* finish_insert asserts it is held */
+	sta_info_finish_pending(local);
+	mutex_unlock(&local->sta_mtx);
+}
+
+int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
+{
+	struct ieee80211_local *local = sta->local;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	unsigned long flags;
+	int err = 0;
+
+	/*
+	 * Can't be a WARN_ON because it can be triggered through a race:
+	 * something inserts a STA (on one CPU) without holding the RTNL
+	 * and another CPU turns off the net device.
+	 */
+	if (unlikely(!ieee80211_sdata_running(sdata))) {
+		err = -ENETDOWN;
+		rcu_read_lock();	/* all exits hold rcu_read_lock */
+		goto out_free;
+	}
+
+	if (WARN_ON(compare_ether_addr(sta->sta.addr, sdata->vif.addr) == 0 ||
+		    is_multicast_ether_addr(sta->sta.addr))) {
+		err = -EINVAL;
+		rcu_read_lock();
+		goto out_free;
+	}
+
+	/*
+	 * In ad-hoc mode, we sometimes need to insert stations
+	 * from tasklet context from the RX path. To avoid races,
+	 * always do so in that case -- see the comment below.
+	 */
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+		spin_lock_irqsave(&local->sta_lock, flags);
+		/* check if STA exists already */
+		if (sta_info_get_bss(sdata, sta->sta.addr)) {
+			spin_unlock_irqrestore(&local->sta_lock, flags);
+			rcu_read_lock();
+			err = -EEXIST;
+			goto out_free;
+		}
+
+		local->num_sta++;
+		local->sta_generation++;
+		smp_mb();
+		sta_info_hash_add(local, sta);
+
+		list_add_tail(&sta->list, &local->sta_pending_list);
+
+		rcu_read_lock();
+		spin_unlock_irqrestore(&local->sta_lock, flags);
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+		wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n",
+			    sta->sta.addr);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+
+		ieee80211_queue_work(&local->hw, &local->sta_finish_work);
+
+		return 0;	/* driver upload happens in sta_finish_work */
+	}
+
+	/*
+	 * At first glance, this will look racy, because the code
+	 * below this point, which inserts a station with sleeping,
+	 * unlocks the sta_lock between checking existence in the
+	 * hash table and inserting into it.
+	 *
+	 * However, it is not racy against itself because it keeps
+	 * the mutex locked. It still seems to race against the
+	 * above code that atomically inserts the station... That,
+	 * however, is not true because the above code can only
+	 * be invoked for IBSS interfaces, and the below code will
+	 * not be -- and the two do not race against each other as
+	 * the hash table also keys off the interface.
+	 */
+
+	might_sleep();
+
+	mutex_lock(&local->sta_mtx);
+
+	spin_lock_irqsave(&local->sta_lock, flags);
+	/* check if STA exists already */
+	if (sta_info_get_bss(sdata, sta->sta.addr)) {
+		spin_unlock_irqrestore(&local->sta_lock, flags);
+		mutex_unlock(&local->sta_mtx);
+		rcu_read_lock();
+		err = -EEXIST;
+		goto out_free;
+	}
+
+	spin_unlock_irqrestore(&local->sta_lock, flags);
+
+	err = sta_info_finish_insert(sta, false);
+	if (err) {
+		mutex_unlock(&local->sta_mtx);
+		rcu_read_lock();
+		goto out_free;
+	}
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+
+	/* move reference to rcu-protected */
+	rcu_read_lock();
+	mutex_unlock(&local->sta_mtx);
+
+	if (ieee80211_vif_is_mesh(&sdata->vif))
+		mesh_accept_plinks_update(sdata);
+
+	return 0;
+ out_free:
+	BUG_ON(!err);
+	__sta_info_free(local, sta);	/* @sta is gone on any error */
+	return err;
+}
+
+int sta_info_insert(struct sta_info *sta)
+{
+	int err = sta_info_insert_rcu(sta);
+
+	rcu_read_unlock();	/* taken by sta_info_insert_rcu() */
+
+	return err;
+}
+
+static inline void __bss_tim_set(struct ieee80211_if_ap *bss, u16 aid)
+{
+	u8 mask = 1 << (aid % 8);
+
+	/* The octet/bit layout is mandated by the IEEE specifications,
+	 * so this must not be converted to use __set_bit(). */
+	bss->tim[aid / 8] |= mask;
+}
+
+static inline void __bss_tim_clear(struct ieee80211_if_ap *bss, u16 aid)
+{
+	u8 mask = 1 << (aid % 8);
+
+	/* The octet/bit layout is mandated by the IEEE specifications,
+	 * so this must not be converted to use __clear_bit(). */
+	bss->tim[aid / 8] &= ~mask;
+}
+
+static void __sta_info_set_tim_bit(struct ieee80211_if_ap *bss,
+				   struct sta_info *sta)
+{
+	BUG_ON(!bss);
+
+	/* set the bit in our bitmap, then notify a driver with set_tim */
+	__bss_tim_set(bss, sta->sta.aid);
+	if (!sta->local->ops->set_tim)
+		return;
+	sta->local->tim_in_locked_section = true;
+	drv_set_tim(sta->local, &sta->sta, true);
+	sta->local->tim_in_locked_section = false;
+}
+
+void sta_info_set_tim_bit(struct sta_info *sta)
+{
+	struct ieee80211_local *local = sta->local;
+	unsigned long irqflags;
+
+	BUG_ON(!sta->sdata->bss);
+	spin_lock_irqsave(&local->sta_lock, irqflags);	/* serialise TIM */
+	__sta_info_set_tim_bit(sta->sdata->bss, sta);
+	spin_unlock_irqrestore(&local->sta_lock, irqflags);
+}
+
+static void __sta_info_clear_tim_bit(struct ieee80211_if_ap *bss,
+				     struct sta_info *sta)
+{
+	BUG_ON(!bss);
+
+	/* clear the bit in our bitmap, then notify a driver with set_tim */
+	__bss_tim_clear(bss, sta->sta.aid);
+	if (!sta->local->ops->set_tim)
+		return;
+	sta->local->tim_in_locked_section = true;
+	drv_set_tim(sta->local, &sta->sta, false);
+	sta->local->tim_in_locked_section = false;
+}
+
+void sta_info_clear_tim_bit(struct sta_info *sta)
+{
+	struct ieee80211_local *local = sta->local;
+	unsigned long irqflags;
+
+	BUG_ON(!sta->sdata->bss);
+	spin_lock_irqsave(&local->sta_lock, irqflags);	/* serialise TIM */
+	__sta_info_clear_tim_bit(sta->sdata->bss, sta);
+	spin_unlock_irqrestore(&local->sta_lock, irqflags);
+}
+
+static int sta_info_buffer_expired(struct sta_info *sta,
+				   struct sk_buff *skb)
+{
+	int timeout;
+
+	if (!skb)
+		return 0;
+
+	/* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec,
+	 * rounded down to whole seconds, but never less than
+	 * STA_TX_BUFFER_EXPIRE jiffies. */
+	timeout = (sta->listen_interval *
+		   sta->sdata->vif.bss_conf.beacon_int *
+		   32 / 15625) * HZ;
+	if (timeout < STA_TX_BUFFER_EXPIRE)
+		timeout = STA_TX_BUFFER_EXPIRE;
+	return time_after(jiffies,
+			  IEEE80211_SKB_CB(skb)->control.jiffies + timeout);
+}
+
+
+static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
+					     struct sta_info *sta)
+{
+	unsigned long flags;
+	struct sk_buff *skb;
+
+	if (skb_queue_empty(&sta->ps_tx_buf))
+		return false;	/* nothing buffered for this station */
+
+	for (;;) {	/* drop expired frames from the head of the queue */
+		spin_lock_irqsave(&sta->ps_tx_buf.lock, flags);
+		skb = skb_peek(&sta->ps_tx_buf);
+		if (sta_info_buffer_expired(sta, skb))
+			skb = __skb_dequeue(&sta->ps_tx_buf);
+		else
+			skb = NULL;	/* head not expired (or none): stop */
+		spin_unlock_irqrestore(&sta->ps_tx_buf.lock, flags);
+
+		if (!skb)
+			break;
+
+		local->total_ps_buffered--;
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+		printk(KERN_DEBUG "Buffered frame expired (STA %pM)\n",
+		       sta->sta.addr);
+#endif
+		dev_kfree_skb(skb);
+		/* last frame gone and driver holds none: clear the TIM bit */
+		if (skb_queue_empty(&sta->ps_tx_buf) &&
+		    !test_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF))
+			sta_info_clear_tim_bit(sta);
+	}
+
+	return true;	/* queue was non-empty on entry */
+}
+
+static int __must_check __sta_info_destroy(struct sta_info *sta)
+{
+	struct ieee80211_local *local;
+	struct ieee80211_sub_if_data *sdata;
+	struct sk_buff *skb;
+	unsigned long flags;
+	int ret, i;
+
+	might_sleep();
+
+	if (!sta)
+		return -ENOENT;	/* the caller's lookup found nothing */
+
+	local = sta->local;
+	sdata = sta->sdata;
+
+	/*
+	 * Before removing the station from the driver and
+	 * rate control, it might still start new aggregation
+	 * sessions -- block that to make sure the tear-down
+	 * will be sufficient.
+	 */
+	set_sta_flags(sta, WLAN_STA_BLOCK_BA);
+	ieee80211_sta_tear_down_BA_sessions(sta, true);
+
+	spin_lock_irqsave(&local->sta_lock, flags);
+	ret = sta_info_hash_del(local, sta);
+	/* maybe the pending list; _rcu as readers may still be on @sta */
+	if (!ret)
+		list_del_rcu(&sta->list);
+	spin_unlock_irqrestore(&local->sta_lock, flags);
+	if (ret)
+		return ret;	/* not hashed: nothing was removed */
+
+	mutex_lock(&local->key_mtx);
+	for (i = 0; i < NUM_DEFAULT_KEYS; i++)
+		__ieee80211_key_free(key_mtx_dereference(local, sta->gtk[i]));
+	if (sta->ptk)
+		__ieee80211_key_free(key_mtx_dereference(local, sta->ptk));
+	mutex_unlock(&local->key_mtx);
+
+	sta->dead = true;	/* makes sta_unblock() a no-op */
+
+	if (test_and_clear_sta_flags(sta,
+				WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) {
+		BUG_ON(!sdata->bss);
+
+		atomic_dec(&sdata->bss->num_sta_ps);
+		sta_info_clear_tim_bit(sta);
+	}
+
+	local->num_sta--;
+	local->sta_generation++;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		rcu_assign_pointer(sdata->u.vlan.sta, NULL);
+
+	if (sta->uploaded) {
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(sdata->bss,
+					     struct ieee80211_sub_if_data,
+					     u.ap);
+		drv_sta_remove(local, sdata, &sta->sta);
+		sdata = sta->sdata;
+	}
+
+	/*
+	 * At this point, after we wait for an RCU grace period,
+	 * neither mac80211 nor the driver can reference this
+	 * sta struct any more except by still existing timers
+	 * associated with this station that we clean up below.
+	 */
+	synchronize_rcu();
+
+#ifdef CONFIG_MAC80211_MESH
+	if (ieee80211_vif_is_mesh(&sdata->vif))
+		mesh_accept_plinks_update(sdata);
+#endif
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+	cancel_work_sync(&sta->drv_unblock_wk);
+
+	cfg80211_del_sta(sdata->dev, sta->sta.addr, GFP_KERNEL);
+
+	rate_control_remove_sta_debugfs(sta);
+	ieee80211_sta_debugfs_remove(sta);
+
+#ifdef CONFIG_MAC80211_MESH
+	if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
+		mesh_plink_deactivate(sta);
+		del_timer_sync(&sta->plink_timer);
+	}
+#endif
+
+	while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) {
+		local->total_ps_buffered--;
+		dev_kfree_skb_any(skb);
+	}
+
+	while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL)
+		dev_kfree_skb_any(skb);
+
+	__sta_info_free(local, sta);
+
+	return 0;
+}
+
+int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+	struct ieee80211_local *local = sdata->local;
+	int err;
+
+	/* a failed lookup yields NULL, which __sta_info_destroy() rejects */
+	mutex_lock(&local->sta_mtx);
+	err = __sta_info_destroy(sta_info_get(sdata, addr));
+	mutex_unlock(&local->sta_mtx);
+
+	return err;
+}
+
+int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
+			      const u8 *addr)
+{
+	struct ieee80211_local *local = sdata->local;
+	int err;
+
+	/* a failed lookup yields NULL, which __sta_info_destroy() rejects */
+	mutex_lock(&local->sta_mtx);
+	err = __sta_info_destroy(sta_info_get_bss(sdata, addr));
+	mutex_unlock(&local->sta_mtx);
+
+	return err;
+}
+
+static void sta_info_cleanup(unsigned long data)
+{
+	struct ieee80211_local *local = (struct ieee80211_local *) data;
+	struct sta_info *sta;
+	bool rearm = false;
+
+	/* expire stale PS frames on every station; note if any had frames */
+	rcu_read_lock();
+	list_for_each_entry_rcu(sta, &local->sta_list, list)
+		rearm |= sta_info_cleanup_expire_buffered(local, sta);
+	rcu_read_unlock();
+
+	/*
+	 * Re-arm only if a station had buffered frames and we're not quiescing.
+	 */
+	if (local->quiescing || !rearm)
+		return;
+
+	mod_timer(&local->sta_cleanup,
+		  round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
+}
+
+void sta_info_init(struct ieee80211_local *local)
+{
+	spin_lock_init(&local->sta_lock);
+	mutex_init(&local->sta_mtx);
+	INIT_LIST_HEAD(&local->sta_list);
+	INIT_LIST_HEAD(&local->sta_pending_list);
+	INIT_WORK(&local->sta_finish_work, sta_info_finish_work);
+	/* the cleanup timer is only set up here, not started */
+	setup_timer(&local->sta_cleanup, sta_info_cleanup,
+		    (unsigned long)local);
+}
+
+void sta_info_stop(struct ieee80211_local *local)
+{
+	del_timer_sync(&local->sta_cleanup);	/* handler re-arms itself */
+	sta_info_flush(local, NULL);	/* NULL sdata matches all STAs */
+}
+
+/**
+ * sta_info_flush - flush matching STA entries from the STA table
+ *
+ * Returns the number of removed STA entries.
+ *
+ * @local: local interface data
+ * @sdata: matching rule for the net device (sta->dev) or %NULL to match all STAs
+ */
+int sta_info_flush(struct ieee80211_local *local,
+		   struct ieee80211_sub_if_data *sdata)
+{
+	struct sta_info *sta, *tmp;
+	int ret = 0;
+
+	might_sleep();
+
+	mutex_lock(&local->sta_mtx);
+	sta_info_finish_pending(local);	/* move pending STAs to sta_list */
+	list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
+		if (sdata && sdata != sta->sdata)
+			continue;
+		if (!WARN_ON(__sta_info_destroy(sta)))
+			ret++;	/* count only entries actually removed */
+	}
+	mutex_unlock(&local->sta_mtx);
+
+	return ret;
+}
+
+void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
+			  unsigned long exp_time)
+{
+	struct sta_info *sta, *tmp;
+
+	mutex_lock(&sdata->local->sta_mtx);
+	list_for_each_entry_safe(sta, tmp, &sdata->local->sta_list, list)
+		if (sta->sdata == sdata &&	/* leave other vifs' STAs alone */
+		    time_after(jiffies, sta->last_rx + exp_time)) {
+#ifdef CONFIG_MAC80211_IBSS_DEBUG
+			printk(KERN_DEBUG "%s: expiring inactive STA %pM\n",
+			       sdata->name, sta->sta.addr);
+#endif
+			WARN_ON(__sta_info_destroy(sta));
+		}
+	mutex_unlock(&sdata->local->sta_mtx);
+}
+
+struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
+					       const u8 *addr,
+					       const u8 *localaddr)
+{
+	struct sta_info *sta, *nxt;
+
+	/*
+	 * If localaddr is NULL any interface matches, so the first entry
+	 * the iteration yields is used ("random", i.e. first in list).
+	 */
+	for_each_sta_info(hw_to_local(hw), addr, sta, nxt) {
+		if (localaddr &&
+		    compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0)
+			continue;
+		if (!sta->uploaded)
+			return NULL;	/* matched, but not uploaded yet */
+		return &sta->sta;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
+
+struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
+					 const u8 *addr)
+{
+	struct sta_info *sta;
+
+	if (!vif)
+		return NULL;
+
+	/*
+	 * Only stations already uploaded to the driver are reported.
+	 */
+	sta = sta_info_get_bss(vif_to_sdata(vif), addr);
+	if (!sta || !sta->uploaded)
+		return NULL;
+
+	return &sta->sta;
+}
+EXPORT_SYMBOL(ieee80211_find_sta);
+
+static void clear_sta_ps_flags(void *_sta)
+{
+	struct sta_info *sta = _sta;	/* opaque callback argument */
+
+	clear_sta_flags(sta, WLAN_STA_PS_DRIVER | WLAN_STA_PS_STA);
+}
+
+/* powersave support code */
+void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	int sent, buffered;
+
+	clear_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF);
+	if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
+		drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
+
+	if (!skb_queue_empty(&sta->ps_tx_buf))
+		sta_info_clear_tim_bit(sta);
+
+	/* Send all buffered frames (filtered first, then PS) to the station */
+	sent = ieee80211_add_pending_skbs(local, &sta->tx_filtered);
+	buffered = ieee80211_add_pending_skbs_fn(local, &sta->ps_tx_buf,
+						 clear_sta_ps_flags, sta);
+	sent += buffered;	/* total = filtered + PS-buffered frames */
+	local->total_ps_buffered -= buffered;
+
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	printk(KERN_DEBUG "%s: STA %pM aid %d sending %d filtered/%d PS frames "
+	       "since STA not sleeping anymore\n", sdata->name,
+	       sta->sta.addr, sta->sta.aid, sent - buffered, buffered);
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+}
+
+void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	int no_pending_pkts;
+
+	skb = skb_dequeue(&sta->tx_filtered);	/* filtered frames go first */
+	if (!skb) {
+		skb = skb_dequeue(&sta->ps_tx_buf);
+		if (skb)
+			local->total_ps_buffered--;
+	}
+	no_pending_pkts = skb_queue_empty(&sta->tx_filtered) &&
+		skb_queue_empty(&sta->ps_tx_buf);
+
+	if (skb) {
+		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+		struct ieee80211_hdr *hdr =
+			(struct ieee80211_hdr *) skb->data;
+
+		/*
+		 * Tell TX path to send this frame even though the STA may
+		 * still remain in PS mode after this frame exchange.
+		 */
+		info->flags |= IEEE80211_TX_CTL_PSPOLL_RESPONSE;
+
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+		printk(KERN_DEBUG "STA %pM aid %d: PS Poll (entries after %d)\n",
+		       sta->sta.addr, sta->sta.aid,
+		       skb_queue_len(&sta->ps_tx_buf));
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+
+		/* Use MoreData flag to indicate whether there are more
+		 * buffered frames for this STA */
+		if (no_pending_pkts)
+			hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA);
+		else
+			hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+
+		ieee80211_add_pending_skb(local, skb);
+
+		if (no_pending_pkts)	/* both queues are empty now */
+			sta_info_clear_tim_bit(sta);
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	} else {
+		/*
+		 * FIXME: This can be the result of a race condition between
+		 *	  us expiring a frame and the station polling for it.
+		 *	  Should we send it a null-func frame indicating we
+		 *	  have nothing buffered for it?
+		 */
+		printk(KERN_DEBUG "%s: STA %pM sent PS Poll even "
+		       "though there are no buffered frames for it\n",
+		       sdata->name, sta->sta.addr);
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+	}
+}
+
+void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
+			       struct ieee80211_sta *pubsta, bool block)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+
+	trace_api_sta_block_awake(sta->local, pubsta, block);
+	/* blocking is immediate; unblocking is deferred to sta_unblock() */
+	if (block)
+		set_sta_flags(sta, WLAN_STA_PS_DRIVER);
+	else if (test_sta_flags(sta, WLAN_STA_PS_DRIVER))
+		ieee80211_queue_work(hw, &sta->drv_unblock_wk);
+}
+EXPORT_SYMBOL(ieee80211_sta_block_awake);
+
+void ieee80211_sta_set_tim(struct ieee80211_sta *pubsta)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+
+	set_sta_flags(sta, WLAN_STA_PS_DRIVER_BUF);	/* frames in driver */
+	sta_info_set_tim_bit(sta);
+}
+EXPORT_SYMBOL(ieee80211_sta_set_tim);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
new file mode 100644
index 00000000..c6ae8718
--- /dev/null
+++ b/net/mac80211/sta_info.h
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2002-2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef STA_INFO_H
+#define STA_INFO_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <linux/workqueue.h>
+#include <linux/average.h>
+#include "key.h"
+
+/**
+ * enum ieee80211_sta_info_flags - Station flags
+ *
+ * These flags are used with &struct sta_info's @flags member.
+ *
+ * @WLAN_STA_AUTH: Station is authenticated.
+ * @WLAN_STA_ASSOC: Station is associated.
+ * @WLAN_STA_PS_STA: Station is in power-save mode
+ * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic.
+ *	This bit is always checked so needs to be enabled for all stations
+ *	when virtual port control is not in use.
+ * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble
+ *	frames.
+ * @WLAN_STA_ASSOC_AP: We're associated to that station, it is an AP.
+ * @WLAN_STA_WME: Station is a QoS-STA.
+ * @WLAN_STA_WDS: Station is one of our WDS peers.
+ * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the
+ *	IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next
+ *	frame to this station is transmitted.
+ * @WLAN_STA_MFP: Management frame protection is used with this STA.
+ * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX)
+ *	during suspend/resume and station removal.
+ * @WLAN_STA_PS_DRIVER: driver requires keeping this station in
+ *	power-save mode logically to flush frames that might still
+ *	be in the queues
+ * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping
+ *	station in power-save mode, reply when the driver unblocks.
+ * @WLAN_STA_PS_DRIVER_BUF: Station has frames pending in driver internal
+ *	buffers. Automatically cleared on station wake-up.
+ */
+enum ieee80211_sta_info_flags {
+	WLAN_STA_AUTH		= 1<<0,
+	WLAN_STA_ASSOC		= 1<<1,
+	WLAN_STA_PS_STA		= 1<<2,
+	WLAN_STA_AUTHORIZED	= 1<<3,
+	WLAN_STA_SHORT_PREAMBLE	= 1<<4,
+	WLAN_STA_ASSOC_AP	= 1<<5,
+	WLAN_STA_WME		= 1<<6,
+	WLAN_STA_WDS		= 1<<7,
+	WLAN_STA_CLEAR_PS_FILT	= 1<<9,	/* 1<<8 is not used */
+	WLAN_STA_MFP		= 1<<10,
+	WLAN_STA_BLOCK_BA	= 1<<11,
+	WLAN_STA_PS_DRIVER	= 1<<12,
+	WLAN_STA_PSPOLL		= 1<<13,
+	WLAN_STA_PS_DRIVER_BUF	= 1<<14,
+};
+
+#define STA_TID_NUM 16
+#define ADDBA_RESP_INTERVAL HZ
+#define HT_AGG_MAX_RETRIES		0x3
+
+#define HT_AGG_STATE_DRV_READY		0
+#define HT_AGG_STATE_RESPONSE_RECEIVED	1
+#define HT_AGG_STATE_OPERATIONAL	2
+#define HT_AGG_STATE_STOPPING		3
+#define HT_AGG_STATE_WANT_START		4
+#define HT_AGG_STATE_WANT_STOP		5
+
+/**
+ * struct tid_ampdu_tx - TID aggregation information (Tx).
+ *
+ * @rcu_head: rcu head for freeing structure
+ * @addba_resp_timer: timer for peer's response to addba request
+ * @pending: pending frames queue -- use sta's spinlock to protect
+ * @dialog_token: dialog token for aggregation session
+ * @timeout: session timeout value to be filled in ADDBA requests
+ * @state: session state flags, see the HT_AGG_STATE_* definitions above
+ * @stop_initiator: initiator of a session stop
+ * @tx_stop: TX DelBA frame when stopping
+ * @buf_size: reorder buffer size at receiver
+ *
+ * This structure's lifetime is managed by RCU, assignments to
+ * the array holding it must hold the aggregation mutex.
+ *
+ * The TX path can access it under RCU lock-free if, and
+ * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL
+ * set. Otherwise, the TX path must also acquire the spinlock
+ * and re-check the state, see comments in the tx code
+ * touching it.
+ */
+struct tid_ampdu_tx {
+	struct rcu_head rcu_head;
+	struct timer_list addba_resp_timer;
+	struct sk_buff_head pending;
+	unsigned long state;
+	u16 timeout;
+	u8 dialog_token;
+	u8 stop_initiator;
+	bool tx_stop;
+	u8 buf_size;
+};
+
+/**
+ * struct tid_ampdu_rx - TID aggregation information (Rx).
+ *
+ * @reorder_buf: buffer to reorder incoming aggregated MPDUs
+ * @reorder_time: jiffies when skb was added
+ * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
+ * @reorder_timer: releases expired frames from the reorder buffer.
+ * @head_seq_num: head sequence number in reordering buffer.
+ * @stored_mpdu_num: number of MPDUs in reordering buffer
+ * @ssn: Starting Sequence Number expected to be aggregated.
+ * @buf_size: buffer size for incoming A-MPDUs
+ * @timeout: reset timer value (in TUs).
+ * @dialog_token: dialog token for aggregation session
+ * @rcu_head: RCU head used for freeing this struct
+ * @reorder_lock: serializes access to reorder buffer, see below.
+ *
+ * This structure's lifetime is managed by RCU, assignments to
+ * the array holding it must hold the aggregation mutex.
+ *
+ * The @reorder_lock is used to protect the members of this
+ * struct, except for @timeout, @buf_size and @dialog_token,
+ * which are constant across the lifetime of the struct (the
+ * dialog token being used only for debugging).
+ */
+struct tid_ampdu_rx {
+	struct rcu_head rcu_head;
+	spinlock_t reorder_lock;
+	struct sk_buff **reorder_buf;
+	unsigned long *reorder_time;
+	struct timer_list session_timer;
+	struct timer_list reorder_timer;
+	u16 head_seq_num;
+	u16 stored_mpdu_num;
+	u16 ssn;
+	u16 buf_size;		/* constant for the struct's lifetime */
+	u16 timeout;		/* in TUs, constant for the struct's lifetime */
+	u8 dialog_token;	/* constant, used only for debugging */
+};
+
+/**
+ * struct sta_ampdu_mlme - STA aggregation information.
+ *
+ * @tid_rx: aggregation info for Rx per TID -- RCU protected
+ * @tid_tx: aggregation info for Tx per TID -- RCU protected
+ * @tid_start_tx: sessions where start was requested
+ * @addba_req_num: number of times addBA request has been sent.
+ * @dialog_token_allocator: dialog token enumerator for each new session
+ * @work: work struct for starting/stopping aggregation
+ * @tid_rx_timer_expired: bitmap indicating on which TIDs the
+ *	RX timer expired until the work for it runs
+ * @mtx: mutex to protect all TX data (except non-NULL assignments
+ *	to tid_tx[idx], which are protected by the sta spinlock)
+ */
+struct sta_ampdu_mlme {
+	struct mutex mtx;
+	/* rx */
+	struct tid_ampdu_rx __rcu *tid_rx[STA_TID_NUM];
+	unsigned long tid_rx_timer_expired[BITS_TO_LONGS(STA_TID_NUM)];
+	/* tx */
+	struct work_struct work;
+	struct tid_ampdu_tx __rcu *tid_tx[STA_TID_NUM];
+	struct tid_ampdu_tx *tid_start_tx[STA_TID_NUM];
+	u8 addba_req_num[STA_TID_NUM];
+	u8 dialog_token_allocator;
+};
+
+
+/**
+ * struct sta_info - STA information
+ *
+ * This structure collects information about a station that
+ * mac80211 is communicating with.
+ *
+ * @list: global linked list entry
+ * @hnext: hash table linked list pointer
+ * @local: pointer to the global information
+ * @sdata: virtual interface this station belongs to
+ * @ptk: peer key negotiated with this station, if any
+ * @gtk: group keys negotiated with this station, if any
+ * @rate_ctrl: rate control algorithm reference
+ * @rate_ctrl_priv: rate control private per-STA pointer
+ * @last_tx_rate: rate used for last transmit, to report to userspace as
+ *	"the" transmit rate
+ * @last_rx_rate_idx: rx status rate index of the last data packet
+ * @last_rx_rate_flag: rx status flag of the last data packet
+ * @lock: used for locking all fields that require locking, see the
+ *	comments on the individual fields in the structure below.
+ * @flaglock: spinlock for flags accesses
+ * @drv_unblock_wk: used for driver PS unblocking
+ * @listen_interval: listen interval of this station, when we're acting as AP
+ * @flags: STA flags, see &enum ieee80211_sta_info_flags
+ * @ps_tx_buf: buffer of frames to transmit to this station
+ *	when it leaves power saving state
+ * @tx_filtered: buffer of frames we already tried to transmit
+ *	but were filtered by hardware due to STA having entered
+ *	power saving state
+ * @rx_packets: Number of MSDUs received from this STA
+ * @rx_bytes: Number of bytes received from this STA
+ * @wep_weak_iv_count: number of weak WEP IVs received from this station
+ * @last_rx: time (in jiffies) when last frame was received from this STA
+ * @last_connected: time (in seconds) when a station got connected
+ * @num_duplicates: number of duplicate frames received from this STA
+ * @rx_fragments: number of received MPDUs
+ * @rx_dropped: number of dropped MPDUs from this STA
+ * @last_signal: signal of last received frame from this STA
+ * @avg_signal: moving average of signal of received frames from this STA
+ * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue)
+ * @tx_filtered_count: number of frames the hardware filtered for this STA
+ * @tx_retry_failed: number of frames that failed retry
+ * @tx_retry_count: total number of retries for frames to this STA
+ * @fail_avg: moving percentage of failed MSDUs
+ * @tx_packets: number of MSDUs transmitted to this STA
+ * @tx_bytes: number of bytes transmitted to this STA
+ * @tx_fragments: number of transmitted MPDUs
+ * @tid_seq: per-TID sequence numbers for sending to this STA
+ * @ampdu_mlme: A-MPDU state machine state
+ * @timer_to_tid: identity mapping to ID timers
+ * @llid: Local link ID
+ * @plid: Peer link ID
+ * @reason: Cancel reason on PLINK_HOLDING state
+ * @plink_retries: Retries in establishment
+ * @ignore_plink_timer: ignore the peer-link timer (used internally)
+ * @plink_state: peer link state
+ * @plink_timeout: timeout of peer link
+ * @plink_timer: peer link watch timer
+ * @plink_timer_was_running: used by suspend/resume to restore timers
+ * @debugfs: debug filesystem info
+ * @sta: station information we share with the driver
+ * @dead: set to true when sta is unlinked
+ * @uploaded: set to true when sta is uploaded to the driver
+ * @lost_packets: number of consecutive lost packets
+ */
+struct sta_info {
+	/* General information, mostly static */
+	struct list_head list;
+	struct sta_info __rcu *hnext;
+	struct ieee80211_local *local;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+	struct ieee80211_key __rcu *ptk;
+	struct rate_control_ref *rate_ctrl;
+	void *rate_ctrl_priv;
+	spinlock_t lock;
+	spinlock_t flaglock;
+
+	struct work_struct drv_unblock_wk;
+
+	u16 listen_interval;
+
+	bool dead;
+
+	bool uploaded;
+
+	/*
+	 * frequently updated, locked with own spinlock (flaglock),
+	 * use the accessors defined below
+	 */
+	u32 flags;
+
+	/*
+	 * STA powersave frame queues, no more than the internal
+	 * locking required.
+	 */
+	struct sk_buff_head ps_tx_buf;
+	struct sk_buff_head tx_filtered;
+
+	/* Updated from RX path only, no locking requirements */
+	unsigned long rx_packets, rx_bytes;
+	unsigned long wep_weak_iv_count;
+	unsigned long last_rx;
+	long last_connected;
+	unsigned long num_duplicates;
+	unsigned long rx_fragments;
+	unsigned long rx_dropped;
+	int last_signal;
+	struct ewma avg_signal;
+	__le16 last_seq_ctrl[NUM_RX_DATA_QUEUES];
+
+	/* Updated from TX status path only, no locking requirements */
+	unsigned long tx_filtered_count;
+	unsigned long tx_retry_failed, tx_retry_count;
+	/* moving percentage of failed MSDUs */
+	unsigned int fail_avg;
+
+	/* Updated from TX path only, no locking requirements */
+	unsigned long tx_packets;
+	unsigned long tx_bytes;
+	unsigned long tx_fragments;
+	struct ieee80211_tx_rate last_tx_rate;
+	int last_rx_rate_idx;
+	int last_rx_rate_flag;
+	u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];
+
+	/*
+	 * Aggregation information, locked with lock.
+	 */
+	struct sta_ampdu_mlme ampdu_mlme;
+	u8 timer_to_tid[STA_TID_NUM];
+
+#ifdef CONFIG_MAC80211_MESH
+	/*
+	 * Mesh peer link attributes
+	 * TODO: move to a sub-structure that is referenced with pointer?
+	 */
+	__le16 llid;
+	__le16 plid;
+	__le16 reason;
+	u8 plink_retries;
+	bool ignore_plink_timer;
+	bool plink_timer_was_running;
+	enum nl80211_plink_state plink_state;
+	u32 plink_timeout;
+	struct timer_list plink_timer;
+#endif
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct sta_info_debugfsdentries {
+		struct dentry *dir;
+		bool add_has_run;
+	} debugfs;
+#endif
+
+	unsigned int lost_packets;
+
+	/* keep last! */
+	struct ieee80211_sta sta;
+};
+
+static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta)
+{
+#ifdef CONFIG_MAC80211_MESH
+	return sta->plink_state;
+#endif
+	return NL80211_PLINK_LISTEN;	/* reached only without mesh support */
+}
+
+static inline void set_sta_flags(struct sta_info *sta, const u32 flags)
+{
+	unsigned long irq_state;	/* IRQ state saved by the lock */
+
+	spin_lock_irqsave(&sta->flaglock, irq_state);
+	sta->flags |= flags;		/* turn the requested bits on */
+	spin_unlock_irqrestore(&sta->flaglock, irq_state);
+}
+
+static inline void clear_sta_flags(struct sta_info *sta, const u32 flags)
+{
+	unsigned long irq_state;	/* IRQ state saved by the lock */
+
+	spin_lock_irqsave(&sta->flaglock, irq_state);
+	sta->flags &= ~flags;		/* turn the requested bits off */
+	spin_unlock_irqrestore(&sta->flaglock, irq_state);
+}
+
+/* Return the subset of @flags currently set on @sta (zero if none). */
+static inline u32 test_sta_flags(struct sta_info *sta, const u32 flags)
+{
+	unsigned long irq_state;
+	u32 masked;
+
+	spin_lock_irqsave(&sta->flaglock, irq_state);
+	masked = sta->flags & flags;
+	spin_unlock_irqrestore(&sta->flaglock, irq_state);
+	return masked;
+}
+
+/* Clear @flags on @sta and return which of them were set beforehand. */
+static inline u32 test_and_clear_sta_flags(struct sta_info *sta,
+					   const u32 flags)
+{
+	unsigned long irq_state;
+	u32 was_set;
+
+	spin_lock_irqsave(&sta->flaglock, irq_state);
+	was_set = sta->flags & flags;
+	sta->flags &= ~flags;
+	spin_unlock_irqrestore(&sta->flaglock, irq_state);
+	return was_set;
+}
+
+/* Return a snapshot of all of @sta's flags, read under flaglock. */
+static inline u32 get_sta_flags(struct sta_info *sta)
+{
+	unsigned long irq_state;
+	u32 snapshot;
+
+	spin_lock_irqsave(&sta->flaglock, irq_state);
+	snapshot = sta->flags;
+	spin_unlock_irqrestore(&sta->flaglock, irq_state);
+	return snapshot;
+}
+
+void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
+			     struct tid_ampdu_tx *tid_tx);
+/* Fetch tid_tx[@tid]; lockdep expects sta->lock or ampdu_mlme.mtx held. */
+static inline struct tid_ampdu_tx *
+rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid)
+{
+	return rcu_dereference_protected(sta->ampdu_mlme.tid_tx[tid],
+					 lockdep_is_held(&sta->lock) ||
+					 lockdep_is_held(&sta->ampdu_mlme.mtx));
+}
+
+#define STA_HASH_SIZE 256
+#define STA_HASH(sta) (sta[5])
+
+
+/* Maximum number of frames to buffer per power saving station */
+#define STA_MAX_TX_BUFFER 128
+
+/* Minimum buffered frame expiry time. If STA uses listen interval that is
+ * smaller than this value, the minimum value here is used instead. */
+#define STA_TX_BUFFER_EXPIRE (10 * HZ)
+
+/* How often station data is cleaned up (e.g., expiration of buffered frames)
+ */
+#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)
+
+/*
+ * Get a STA info, must be under RCU read lock.
+ */
+struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
+			      const u8 *addr);
+
+struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
+				  const u8 *addr);
+
+static inline
+void for_each_sta_info_type_check(struct ieee80211_local *local,
+				  const u8 *addr,
+				  struct sta_info *sta,
+				  struct sta_info *nxt)
+{ /* empty on purpose: only type-checks for_each_sta_info() arguments */
+}
+
+#define for_each_sta_info(local, _addr, _sta, nxt) 			\
+	for (	/* initialise loop */					\
+		_sta = rcu_dereference(local->sta_hash[STA_HASH(_addr)]),\
+		nxt = _sta ? rcu_dereference(_sta->hnext) : NULL;	\
+		/* typecheck */						\
+		for_each_sta_info_type_check(local, (_addr), _sta, nxt),\
+		/* continue condition */				\
+		_sta;							\
+		/* advance loop */					\
+		_sta = nxt,						\
+		nxt = _sta ? rcu_dereference(_sta->hnext) : NULL	\
+	     )								\
+	/* compare address and run code only if it matches */		\
+	if (memcmp(_sta->sta.addr, (_addr), ETH_ALEN) == 0)
+
+/*
+ * Get STA info by index, BROKEN!
+ */
+struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
+				     int idx);
+/*
+ * Create a new STA info, caller owns returned structure
+ * until sta_info_insert().
+ */
+struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
+				u8 *addr, gfp_t gfp);
+/*
+ * Insert STA info into hash table/list, returns zero or
+ * -EEXIST (if the same MAC address is already present).
+ *
+ * Calling the non-rcu version makes the caller relinquish
+ * the structure; the _rcu version calls rcu_read_lock() and
+ * must be called without it held.
+ */
+int sta_info_insert(struct sta_info *sta);
+int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU);
+int sta_info_insert_atomic(struct sta_info *sta);
+
+int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata,
+			  const u8 *addr);
+int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
+			      const u8 *addr);
+
+void sta_info_set_tim_bit(struct sta_info *sta);
+void sta_info_clear_tim_bit(struct sta_info *sta);
+
+void sta_info_init(struct ieee80211_local *local);
+void sta_info_stop(struct ieee80211_local *local);
+int sta_info_flush(struct ieee80211_local *local,
+		   struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
+			  unsigned long exp_time);
+
+void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
+void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
+
+#endif /* STA_INFO_H */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
new file mode 100644
index 00000000..04cdbaf1
--- /dev/null
+++ b/net/mac80211/status.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2008-2010	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "rate.h"
+#include "mesh.h"
+#include "led.h"
+
+
+void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	int queued;
+
+	skb->pkt_type = IEEE80211_TX_STATUS_MSG;
+	skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ?
+		       &local->skb_queue : &local->skb_queue_unreliable, skb);
+	/* over the limit: drop the oldest statuses that nobody requested */
+	queued = skb_queue_len(&local->skb_queue) +
+		 skb_queue_len(&local->skb_queue_unreliable);
+	while (queued > IEEE80211_IRQSAFE_QUEUE_LIMIT &&
+	       (skb = skb_dequeue(&local->skb_queue_unreliable))) {
+		dev_kfree_skb_irq(skb);
+		queued--;
+		I802_DEBUG_INC(local->tx_status_drop);
+	}
+	tasklet_schedule(&local->tasklet);
+}
+EXPORT_SYMBOL(ieee80211_tx_status_irqsafe);
+
+static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
+					    struct sta_info *sta,
+					    struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+	/*
+	 * This skb 'survived' a round-trip through the driver, and
+	 * hopefully the driver didn't mangle it too badly. However,
+	 * we can definitely not rely on the control information
+	 * being correct. Clear it so we don't get junk there, and
+	 * indicate that it needs new processing, but must not be
+	 * modified/encrypted again.
+	 */
+	memset(&info->control, 0, sizeof(info->control));
+
+	info->control.jiffies = jiffies;
+	info->control.vif = &sta->sdata->vif;
+	info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
+		       IEEE80211_TX_INTFL_RETRANSMISSION;
+	info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
+
+	sta->tx_filtered_count++;
+
+	/*
+	 * Clear the TX filter mask for this STA when sending the next
+	 * packet. If the STA went to power save mode, this will happen
+	 * when it wakes up for the next time.
+	 */
+	set_sta_flags(sta, WLAN_STA_CLEAR_PS_FILT);
+
+	/*
+	 * This code races in the following way:
+	 *
+	 *  (1) STA sends frame indicating it will go to sleep and does so
+	 *  (2) hardware/firmware adds STA to filter list, passes frame up
+	 *  (3) hardware/firmware processes TX fifo and suppresses a frame
+	 *  (4) we get TX status before having processed the frame and
+	 *	knowing that the STA has gone to sleep.
+	 *
+	 * This is actually quite unlikely even when both those events are
+	 * processed from interrupts coming in quickly after one another or
+	 * even at the same time because we queue both TX status events and
+	 * RX frames to be processed by a tasklet and process them in the
+	 * same order that they were received or TX status last. Hence, there
+	 * is no race as long as the frame RX is processed before the next TX
+	 * status, which drivers can ensure, see below.
+	 *
+	 * Note that this can only happen if the hardware or firmware can
+	 * actually add STAs to the filter list, if this is done by the
+	 * driver in response to set_tim() (which will only reduce the race
+	 * this whole filtering tries to solve, not completely solve it)
+	 * this situation cannot happen.
+	 *
+	 * To completely solve this race drivers need to make sure that they
+	 *  (a) don't mix the irq-safe/not irq-safe TX status/RX processing
+	 *	functions and
+	 *  (b) always process RX events before TX status events if ordering
+	 *      can be unknown, for example with different interrupt status
+	 *	bits.
+	 *  (c) if PS mode transitions are manual (i.e. the flag
+	 *      %IEEE80211_HW_AP_LINK_PS is set), always process PS state
+	 *      changes before calling TX status events if ordering can be
+	 *	unknown.
+	 */
+	if (test_sta_flags(sta, WLAN_STA_PS_STA) &&
+	    skb_queue_len(&sta->tx_filtered) < STA_MAX_TX_BUFFER) {
+		skb_queue_tail(&sta->tx_filtered, skb);
+		return;	/* kept on tx_filtered while the STA is in PS */
+	}
+
+	if (!test_sta_flags(sta, WLAN_STA_PS_STA) &&
+	    !(info->flags & IEEE80211_TX_INTFL_RETRIED)) {
+		/* Software retry the packet once */
+		info->flags |= IEEE80211_TX_INTFL_RETRIED;
+		ieee80211_add_pending_skb(local, skb);
+		return;	/* requeued for one software retry */
+	}
+
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+	if (net_ratelimit())
+		wiphy_debug(local->hw.wiphy,
+			    "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n",
+			    skb_queue_len(&sta->tx_filtered),
+			    !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
+#endif
+	dev_kfree_skb(skb);	/* neither buffered nor retried: drop */
+}
+
+static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *) skb->data;
+	struct ieee80211_local *local = sta->local;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+
+	/* only HT SMPS action frames on a station interface are of interest */
+	if (!ieee80211_is_action(mgmt->frame_control) ||
+	    sdata->vif.type != NL80211_IFTYPE_STATION ||
+	    mgmt->u.action.category != WLAN_CATEGORY_HT ||
+	    mgmt->u.action.u.ht_smps.action != WLAN_HT_ACTION_SMPS)
+		return;
+
+	/*
+	 * This update looks racy, but isn't -- if we come here we've
+	 * definitely got a station that we're talking to, and on a managed
+	 * interface that can only be the AP. And the only other place
+	 * updating this variable is before we're associated.
+	 */
+	switch (mgmt->u.action.u.ht_smps.smps_control) {
+	case WLAN_HT_SMPS_CONTROL_DYNAMIC:
+		sdata->u.mgd.ap_smps = IEEE80211_SMPS_DYNAMIC;
+		break;
+	case WLAN_HT_SMPS_CONTROL_STATIC:
+		sdata->u.mgd.ap_smps = IEEE80211_SMPS_STATIC;
+		break;
+	case WLAN_HT_SMPS_CONTROL_DISABLED:
+	default: /* shouldn't happen since we don't send that */
+		sdata->u.mgd.ap_smps = IEEE80211_SMPS_OFF;
+		break;
+	}
+	ieee80211_queue_work(&local->hw, &local->recalc_smps);
+}
+
+/*
+ * Use a static threshold for now, best value to be determined
+ * by testing ...
+ * Should it depend on:
+ *  - on # of retransmissions
+ *  - current throughput (higher value for higher tpt)?
+ */
+#define STA_LOST_PKT_THRESHOLD	50
+
+/* Handle the TX status of @skb: update station statistics and SNMP counters,
+ * then pass the frame to monitor interfaces. @skb is consumed on all paths. */
+void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct sk_buff *skb2;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	u16 frag, type;
+	__le16 fc;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_tx_status_rtap_hdr *rthdr;
+	struct ieee80211_sub_if_data *sdata;
+	struct net_device *prev_dev = NULL;
+	struct sta_info *sta, *tmp;
+	int retry_count = -1, rates_idx = -1, i;
+	bool send_to_cooked, acked;
+
+	for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+		if (info->status.rates[i].idx < 0) {
+			break;
+		} else if (i >= hw->max_report_rates) {
+			/* the HW cannot have attempted that rate */
+			info->status.rates[i].idx = -1;
+			info->status.rates[i].count = 0;
+			break;
+		}
+
+		retry_count += info->status.rates[i].count;
+	}
+	rates_idx = i - 1;
+
+	if (retry_count < 0)
+		retry_count = 0;
+
+	rcu_read_lock();
+
+	sband = local->hw.wiphy->bands[info->band];
+	fc = hdr->frame_control;
+
+	for_each_sta_info(local, hdr->addr1, sta, tmp) {
+		/* skip wrong virtual interface */
+		if (memcmp(hdr->addr2, sta->sdata->vif.addr, ETH_ALEN))
+			continue;
+
+		acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
+		if (!acked && test_sta_flags(sta, WLAN_STA_PS_STA)) {
+			/*
+			 * The STA is in power save mode, so assume
+			 * that this TX packet failed because of that.
+			 */
+			ieee80211_handle_filtered_frame(local, sta, skb);
+			rcu_read_unlock();
+			return;
+		}
+
+		if ((local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) &&
+		    (rates_idx != -1))
+			sta->last_tx_rate = info->status.rates[rates_idx];
+
+		if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) &&
+		    (ieee80211_is_data_qos(fc))) {
+			u16 tid, ssn;
+			u8 *qc;
+
+			qc = ieee80211_get_qos_ctl(hdr);
+			tid = qc[0] & 0xf;
+			ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10)
+						& IEEE80211_SCTL_SEQ);
+			ieee80211_send_bar(sta->sdata, hdr->addr1,
+					   tid, ssn);
+		}
+
+		if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) {
+			ieee80211_handle_filtered_frame(local, sta, skb);
+			rcu_read_unlock();
+			return;
+		} else {
+			if (!acked)
+				sta->tx_retry_failed++;
+			sta->tx_retry_count += retry_count;
+		}
+
+		rate_control_tx_status(local, sband, sta, skb);
+		if (ieee80211_vif_is_mesh(&sta->sdata->vif))
+			ieee80211s_update_metric(local, sta, skb);
+
+		if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
+			ieee80211_frame_acked(sta, skb);
+
+		if ((sta->sdata->vif.type == NL80211_IFTYPE_STATION) &&
+		    (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS))
+			ieee80211_sta_tx_notify(sta->sdata, (void *) skb->data, acked);
+
+		if (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) {
+			if (info->flags & IEEE80211_TX_STAT_ACK) {
+				if (sta->lost_packets)
+					sta->lost_packets = 0;
+			} else if (++sta->lost_packets >= STA_LOST_PKT_THRESHOLD) {
+				cfg80211_cqm_pktloss_notify(sta->sdata->dev,
+							    sta->sta.addr,
+							    sta->lost_packets,
+							    GFP_ATOMIC);
+				sta->lost_packets = 0;
+			}
+		}
+	}
+
+	rcu_read_unlock();
+
+	ieee80211_led_tx(local, 0);
+
+	/* SNMP counters
+	 * Fragments are passed to low-level drivers as separate skbs, so these
+	 * are actually fragments, not frames. Update frame counters only for
+	 * the first fragment of the frame. */
+
+	frag = le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_FRAG;
+	type = le16_to_cpu(hdr->frame_control) & IEEE80211_FCTL_FTYPE;
+
+	if (info->flags & IEEE80211_TX_STAT_ACK) {
+		if (frag == 0) {
+			local->dot11TransmittedFrameCount++;
+			if (is_multicast_ether_addr(hdr->addr1))
+				local->dot11MulticastTransmittedFrameCount++;
+			if (retry_count > 0)
+				local->dot11RetryCount++;
+			if (retry_count > 1)
+				local->dot11MultipleRetryCount++;
+		}
+
+		/* This counter shall be incremented for an acknowledged MPDU
+		 * with an individual address in the address 1 field or an MPDU
+		 * with a multicast address in the address 1 field of type Data
+		 * or Management. */
+		if (!is_multicast_ether_addr(hdr->addr1) ||
+		    type == IEEE80211_FTYPE_DATA ||
+		    type == IEEE80211_FTYPE_MGMT)
+			local->dot11TransmittedFragmentCount++;
+	} else {
+		if (frag == 0)
+			local->dot11FailedCount++;
+	}
+
+	if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) &&
+	    (local->hw.flags & IEEE80211_HW_REPORTS_TX_ACK_STATUS) &&
+	    !(info->flags & IEEE80211_TX_CTL_INJECTED) &&
+	    local->ps_sdata && !(local->scanning)) {
+		if (info->flags & IEEE80211_TX_STAT_ACK) {
+			local->ps_sdata->u.mgd.flags |=
+					IEEE80211_STA_NULLFUNC_ACKED;
+		} else
+			mod_timer(&local->dynamic_ps_timer, jiffies +
+					msecs_to_jiffies(10));
+	}
+
+	if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) {
+		struct ieee80211_work *wk;
+		u64 cookie = (unsigned long)skb;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(wk, &local->work_list, list) {
+			if (wk->type != IEEE80211_WORK_OFFCHANNEL_TX)
+				continue;
+			if (wk->offchan_tx.frame != skb)
+				continue;
+			wk->offchan_tx.status = true;
+			break;
+		}
+		rcu_read_unlock();
+		if (local->hw_roc_skb_for_status == skb) {
+			cookie = local->hw_roc_cookie ^ 2;
+			local->hw_roc_skb_for_status = NULL;
+		}
+
+		if (cookie == local->hw_offchan_tx_cookie)
+			local->hw_offchan_tx_cookie = 0;
+
+		cfg80211_mgmt_tx_status(
+			skb->dev, cookie, skb->data, skb->len,
+			!!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC);
+	}
+
+	/* this was a transmitted frame, but now we want to reuse it */
+	skb_orphan(skb);
+
+	/* Need to make a copy before skb->cb gets cleared */
+	send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) ||
+			(type != IEEE80211_FTYPE_DATA);
+
+	/*
+	 * This is a bit racy but we can avoid a lot of work
+	 * with this test...
+	 */
+	if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) {
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	/* send frame to monitor interfaces now */
+
+	if (skb_headroom(skb) < sizeof(*rthdr)) {
+		printk(KERN_ERR "ieee80211_tx_status: headroom too small\n");
+		dev_kfree_skb(skb);
+		return;
+	}
+
+	rthdr = (struct ieee80211_tx_status_rtap_hdr *)
+				skb_push(skb, sizeof(*rthdr));
+
+	memset(rthdr, 0, sizeof(*rthdr));
+	rthdr->hdr.it_len = cpu_to_le16(sizeof(*rthdr));
+	rthdr->hdr.it_present =
+		cpu_to_le32((1 << IEEE80211_RADIOTAP_TX_FLAGS) |
+			    (1 << IEEE80211_RADIOTAP_DATA_RETRIES) |
+			    (1 << IEEE80211_RADIOTAP_RATE));
+
+	if (!(info->flags & IEEE80211_TX_STAT_ACK) &&
+	    !is_multicast_ether_addr(hdr->addr1))
+		rthdr->tx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_TX_FAIL);
+
+	/*
+	 * XXX: Once radiotap gets the bitmap reset thing the vendor
+	 *	extensions proposal contains, we can actually report
+	 *	the whole set of tries we did.
+	 */
+	/* report CTS-to-self and RTS/CTS protection independently */
+	if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT)
+		rthdr->tx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_TX_CTS);
+	if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS)
+		rthdr->tx_flags |= cpu_to_le16(IEEE80211_RADIOTAP_F_TX_RTS);
+	if (info->status.rates[0].idx >= 0 &&
+	    !(info->status.rates[0].flags & IEEE80211_TX_RC_MCS))
+		rthdr->rate = sband->bitrates[
+				info->status.rates[0].idx].bitrate / 5;
+
+	/* for now report the total retry_count */
+	rthdr->data_retries = retry_count;
+
+	/* XXX: is this sufficient for BPF? */
+	skb_set_mac_header(skb, 0);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb->pkt_type = PACKET_OTHERHOST;
+	skb->protocol = htons(ETH_P_802_2);
+	memset(skb->cb, 0, sizeof(skb->cb));
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+			if (!ieee80211_sdata_running(sdata))
+				continue;
+
+			if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) &&
+			    !send_to_cooked)
+				continue;
+
+			if (prev_dev) {
+				skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2) {
+					skb2->dev = prev_dev;
+					netif_rx(skb2);
+				}
+			}
+
+			prev_dev = sdata->dev;
+		}
+	}
+	if (prev_dev) {
+		skb->dev = prev_dev;
+		netif_rx(skb);
+		skb = NULL;
+	}
+	rcu_read_unlock();
+	dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_tx_status);
+
+void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr,
+				    num_packets, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(ieee80211_report_low_ack);
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
new file mode 100644
index 00000000..757e4eb2
--- /dev/null
+++ b/net/mac80211/tkip.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <asm/unaligned.h>
+
+#include <net/mac80211.h>
+#include "driver-ops.h"
+#include "key.h"
+#include "tkip.h"
+#include "wep.h"
+
+#define PHASE1_LOOP_COUNT 8
+
+/*
+ * 2-byte by 2-byte subset of the full AES S-box table; second part of this
+ * table is identical to first part but byte-swapped
+ */
+static const u16 tkip_sbox[256] =
+{
+	0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
+	0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
+	0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
+	0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
+	0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
+	0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
+	0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
+	0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
+	0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
+	0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
+	0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
+	0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
+	0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
+	0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
+	0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
+	0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
+	0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
+	0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
+	0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
+	0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
+	0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
+	0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
+	0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
+	0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
+	0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
+	0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
+	0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
+	0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
+	0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
+	0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
+	0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
+	0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
+};
+
+static u16 tkipS(u16 val)
+{
+	return swab16(tkip_sbox[val >> 8]) ^ tkip_sbox[val & 0xff];
+}
+
+static u8 *write_tkip_iv(u8 *pos, u16 iv16)
+{
+	pos[0] = iv16 >> 8;			/* high byte of iv16 */
+	pos[1] = ((iv16 >> 8) | 0x20) & 0x7f;	/* bit 5 set, bit 7 cleared */
+	pos[2] = iv16 & 0xFF;			/* low byte of iv16 */
+	return pos + 3;
+}
+
+/*
+ * P1K := Phase1(TA, TK, TSC); the result is stored in ctx->p1k
+ * TA = transmitter address (48 bits)
+ * TK = dot11DefaultKeyValue or dot11KeyMappingValue (128 bits)
+ * TSC = TKIP sequence counter (48 bits, only 32 msb bits used)
+ * P1K: 80 bits (five 16-bit words); ctx->state becomes PHASE1_DONE
+ */
+static void tkip_mixing_phase1(const u8 *tk, struct tkip_ctx *ctx,
+			       const u8 *ta, u32 tsc_IV32)
+{
+	int i, j;
+	u16 *p1k = ctx->p1k;
+
+	p1k[0] = tsc_IV32 & 0xFFFF;
+	p1k[1] = tsc_IV32 >> 16;
+	p1k[2] = get_unaligned_le16(ta + 0);
+	p1k[3] = get_unaligned_le16(ta + 2);
+	p1k[4] = get_unaligned_le16(ta + 4);
+
+	for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
+		j = 2 * (i & 1);
+		p1k[0] += tkipS(p1k[4] ^ get_unaligned_le16(tk + 0 + j));
+		p1k[1] += tkipS(p1k[0] ^ get_unaligned_le16(tk + 4 + j));
+		p1k[2] += tkipS(p1k[1] ^ get_unaligned_le16(tk + 8 + j));
+		p1k[3] += tkipS(p1k[2] ^ get_unaligned_le16(tk + 12 + j));
+		p1k[4] += tkipS(p1k[3] ^ get_unaligned_le16(tk + 0 + j)) + i;
+	}
+	ctx->state = TKIP_STATE_PHASE1_DONE;
+}
+
+static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx,
+			       u16 tsc_IV16, u8 *rc4key)
+{
+	u16 ppk[6];	/* p1k[0..4], then p1k[4] + tsc_IV16 */
+	const u16 *p1k = ctx->p1k;
+	int i;
+
+	ppk[0] = p1k[0];
+	ppk[1] = p1k[1];
+	ppk[2] = p1k[2];
+	ppk[3] = p1k[3];
+	ppk[4] = p1k[4];
+	ppk[5] = p1k[4] + tsc_IV16;
+
+	ppk[0] += tkipS(ppk[5] ^ get_unaligned_le16(tk + 0));
+	ppk[1] += tkipS(ppk[0] ^ get_unaligned_le16(tk + 2));
+	ppk[2] += tkipS(ppk[1] ^ get_unaligned_le16(tk + 4));
+	ppk[3] += tkipS(ppk[2] ^ get_unaligned_le16(tk + 6));
+	ppk[4] += tkipS(ppk[3] ^ get_unaligned_le16(tk + 8));
+	ppk[5] += tkipS(ppk[4] ^ get_unaligned_le16(tk + 10));
+	ppk[0] += ror16(ppk[5] ^ get_unaligned_le16(tk + 12), 1);
+	ppk[1] += ror16(ppk[0] ^ get_unaligned_le16(tk + 14), 1);
+	ppk[2] += ror16(ppk[1], 1);
+	ppk[3] += ror16(ppk[2], 1);
+	ppk[4] += ror16(ppk[3], 1);
+	ppk[5] += ror16(ppk[4], 1);
+
+	rc4key = write_tkip_iv(rc4key, tsc_IV16);	/* key bytes 0..2 */
+	*rc4key++ = ((ppk[5] ^ get_unaligned_le16(tk)) >> 1) & 0xFF;
+
+	for (i = 0; i < 6; i++)	/* ppk[] becomes key bytes 4..15 */
+		put_unaligned_le16(ppk[i], rc4key + 2 * i);
+}
+
+/* Add TKIP IV and Ext. IV at @pos. The first three IV octets are derived from
+ * @iv16; the Ext. IV carries @key's TX iv32. Returns pointer to the octet
+ * following the IVs (i.e., beginning of the packet payload). */
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key, u16 iv16)
+{
+	pos = write_tkip_iv(pos, iv16);
+	*pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */;
+	put_unaligned_le32(key->u.tkip.tx.iv32, pos);
+	return pos + 4;
+}
+
+void ieee80211_get_tkip_key(struct ieee80211_key_conf *keyconf,
+			struct sk_buff *skb, enum ieee80211_tkip_key_type type,
+			u8 *outkey)
+{
+	struct ieee80211_key *key = (struct ieee80211_key *)
+			container_of(keyconf, struct ieee80211_key, conf);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	u8 *data;
+	const u8 *tk;
+	struct tkip_ctx *ctx;
+	u16 iv16;
+	u32 iv32;
+
+	data = (u8 *)hdr + ieee80211_hdrlen(hdr->frame_control);
+	iv16 = data[2] | (data[0] << 8);
+	iv32 = get_unaligned_le32(&data[4]);
+
+	tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+	ctx = &key->u.tkip.tx;
+
+#ifdef CONFIG_MAC80211_TKIP_DEBUG
+	printk(KERN_DEBUG "TKIP encrypt: iv16 = 0x%04x, iv32 = 0x%08x\n",
+			iv16, iv32);
+
+	if (iv32 != ctx->iv32) {
+		printk(KERN_DEBUG "skb: iv32 = 0x%08x key: iv32 = 0x%08x\n",
+			iv32, ctx->iv32);
+		printk(KERN_DEBUG "Wrap around of iv16 in the middle of a "
+			"fragmented packet\n");
+	}
+#endif
+
+	/* Recompute the p1k only when the iv16 in the packet has wrapped to 0
+	 * (which, for fragmented packets, may be after iv16 in the key wrapped)
+	 * or when phase 1 has never been run for this TX context. */
+	if (iv16 == 0 || ctx->state == TKIP_STATE_NOT_INIT)
+		tkip_mixing_phase1(tk, ctx, hdr->addr2, iv32);
+
+	if (type == IEEE80211_TKIP_P1_KEY) {
+		memcpy(outkey, ctx->p1k, sizeof(u16) * 5);
+		return;
+	}
+
+	tkip_mixing_phase2(tk, ctx, iv16, outkey);
+}
+EXPORT_SYMBOL(ieee80211_get_tkip_key);
+
+/*
+ * Encrypt packet payload with TKIP using @key. @pos is a pointer to the
+ * beginning of the buffer containing payload. This payload must include
+ * the IV/Ext.IV and space for (tailroom) four octets for ICV.
+ * @payload_len is the length of payload (_not_ including IV/ICV length).
+ * @ta is the transmitter address.
+ */
+int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
+				struct ieee80211_key *key,
+				u8 *pos, size_t payload_len, u8 *ta)
+{
+	u8 rc4key[16];
+	struct tkip_ctx *ctx = &key->u.tkip.tx;
+	const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+
+	/* Calculate per-packet key */
+	if (ctx->iv16 == 0 || ctx->state == TKIP_STATE_NOT_INIT)
+		tkip_mixing_phase1(tk, ctx, ta, ctx->iv32);
+
+	tkip_mixing_phase2(tk, ctx, ctx->iv16, rc4key);
+
+	return ieee80211_wep_encrypt_data(tfm, rc4key, 16, pos, payload_len);
+}
+
+/* Decrypt packet payload with TKIP using @key. @payload is a pointer to the
+ * beginning of the buffer holding IV, Ext. IV, real data, Michael MIC, ICV;
+ * @payload_len is the length of all of these together. @out_iv32 and
+ * @out_iv16 are written only when TKIP_DECRYPT_OK is returned.  */
+int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
+				struct ieee80211_key *key,
+				u8 *payload, size_t payload_len, u8 *ta,
+				u8 *ra, int only_iv, int queue,
+				u32 *out_iv32, u16 *out_iv16)
+{
+	u32 iv32;
+	u32 iv16;
+	u8 rc4key[16], keyid, *pos = payload;
+	int res;
+	const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+
+	if (payload_len < 12)
+		return -1;
+
+	iv16 = (pos[0] << 8) | pos[2];
+	keyid = pos[3];
+	iv32 = get_unaligned_le32(pos + 4);
+	pos += 8;
+#ifdef CONFIG_MAC80211_TKIP_DEBUG
+	{
+		int i;
+		printk(KERN_DEBUG "TKIP decrypt: data(len=%zd)", payload_len);
+		for (i = 0; i < payload_len; i++)
+			printk(" %02x", payload[i]);
+		printk("\n");
+		printk(KERN_DEBUG "TKIP decrypt: iv16=%04x iv32=%08x\n",
+		       iv16, iv32);
+	}
+#endif
+
+	if (!(keyid & (1 << 5)))
+		return TKIP_DECRYPT_NO_EXT_IV;
+
+	if ((keyid >> 6) != key->conf.keyidx)
+		return TKIP_DECRYPT_INVALID_KEYIDX;
+
+	if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT &&
+	    (iv32 < key->u.tkip.rx[queue].iv32 ||
+	     (iv32 == key->u.tkip.rx[queue].iv32 &&
+	      iv16 <= key->u.tkip.rx[queue].iv16))) {
+#ifdef CONFIG_MAC80211_TKIP_DEBUG
+		printk(KERN_DEBUG "TKIP replay detected for RX frame from "
+		       "%pM (RX IV (%04x,%02x) <= prev. IV (%04x,%02x)\n",
+		       ta,
+		       iv32, iv16, key->u.tkip.rx[queue].iv32,
+		       key->u.tkip.rx[queue].iv16);
+#endif
+		return TKIP_DECRYPT_REPLAY;
+	}
+
+	if (only_iv) {
+		res = TKIP_DECRYPT_OK;
+		key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+		goto done;
+	}
+
+	if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT ||
+	    key->u.tkip.rx[queue].iv32 != iv32) {
+		/* IV16 wrapped around - perform TKIP phase 1 */
+		tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32);
+#ifdef CONFIG_MAC80211_TKIP_DEBUG
+		{
+			int i;
+			u8 key_offset = NL80211_TKIP_DATA_OFFSET_ENCR_KEY;
+			printk(KERN_DEBUG "TKIP decrypt: Phase1 TA=%pM"
+			       " TK=", ta);
+			for (i = 0; i < 16; i++)
+				printk("%02x ",
+				       key->conf.key[key_offset + i]);
+			printk("\n");
+			printk(KERN_DEBUG "TKIP decrypt: P1K=");
+			for (i = 0; i < 5; i++)
+				printk("%04x ", key->u.tkip.rx[queue].p1k[i]);
+			printk("\n");
+		}
+#endif
+	}
+	if (key->local->ops->update_tkip_key &&
+	    key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
+	    key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) {
+		struct ieee80211_sub_if_data *sdata = key->sdata;
+
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			sdata = container_of(key->sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+		drv_update_tkip_key(key->local, sdata, &key->conf, key->sta,
+				iv32, key->u.tkip.rx[queue].p1k);
+		key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+	}
+
+	tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key);
+#ifdef CONFIG_MAC80211_TKIP_DEBUG
+	{
+		int i;
+		printk(KERN_DEBUG "TKIP decrypt: Phase2 rc4key=");
+		for (i = 0; i < 16; i++)
+			printk("%02x ", rc4key[i]);
+		printk("\n");
+	}
+#endif
+
+	res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12);
+ done:
+	if (res == TKIP_DECRYPT_OK) {
+		/*
+		 * Record previously received IV, will be copied into the
+		 * key information after MIC verification. It is possible
+		 * that we don't catch replays of fragments but that's ok
+		 * because the Michael MIC verification will then fail.
+		 */
+		*out_iv32 = iv32;
+		*out_iv16 = iv16;
+	}
+
+	return res;
+}
diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h
new file mode 100644
index 00000000..1cab9c86
--- /dev/null
+++ b/net/mac80211/tkip.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef TKIP_H
+#define TKIP_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include "key.h"
+
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key, u16 iv16);
+
+int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
+				 struct ieee80211_key *key,
+				 u8 *pos, size_t payload_len, u8 *ta);
+enum {
+	TKIP_DECRYPT_OK = 0,
+	TKIP_DECRYPT_NO_EXT_IV = -1,		/* Ext. IV bit not set in frame */
+	TKIP_DECRYPT_INVALID_KEYIDX = -2,	/* key index differs from key's */
+	TKIP_DECRYPT_REPLAY = -3,		/* IV not newer than last seen */
+};
+int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
+				struct ieee80211_key *key,
+				u8 *payload, size_t payload_len, u8 *ta,
+				u8 *ra, int only_iv, int queue,
+				u32 *out_iv32, u16 *out_iv16);
+
+#endif /* TKIP_H */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
new file mode 100644
index 00000000..da878c14
--- /dev/null
+++ b/net/mac80211/tx.c
@@ -0,0 +1,2560 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *
+ * Transmit and frame generation functions.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/bitmap.h>
+#include <linux/rcupdate.h>
+#include <net/net_namespace.h>
+#include <net/ieee80211_radiotap.h>
+#include <net/cfg80211.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "led.h"
+#include "mesh.h"
+#include "wep.h"
+#include "wpa.h"
+#include "wme.h"
+#include "rate.h"
+
+/* misc utils */
+
+static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, int group_addr,
+				 int next_frag_len)
+{
+	int rate, mrate, erp, dur, i;
+	struct ieee80211_rate *txrate;
+	struct ieee80211_local *local = tx->local;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_hdr *hdr;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
+	/* assume HW handles this */
+	if (info->control.rates[0].flags & IEEE80211_TX_RC_MCS)
+		return 0;
+
+	/* a non-MCS rate must have a valid index */
+	if (WARN_ON_ONCE(info->control.rates[0].idx < 0))
+		return 0;
+
+	sband = local->hw.wiphy->bands[tx->channel->band];
+	txrate = &sband->bitrates[info->control.rates[0].idx];
+
+	erp = txrate->flags & IEEE80211_RATE_ERP_G;
+
+	/*
+	 * data and mgmt (except PS Poll):
+	 * - during CFP: 32768
+	 * - during contention period:
+	 *   if addr1 is group address: 0
+	 *   if more fragments = 0 and addr1 is individual address: time to
+	 *      transmit one ACK plus SIFS
+	 *   if more fragments = 1 and addr1 is individual address: time to
+	 *      transmit next fragment plus 2 x ACK plus 3 x SIFS
+	 *
+	 * IEEE 802.11, 9.6:
+	 * - control response frame (CTS or ACK) shall be transmitted using the
+	 *   same rate as the immediately previous frame in the frame exchange
+	 *   sequence, if this rate belongs to the PHY mandatory rates, or else
+	 *   at the highest possible rate belonging to the PHY rates in the
+	 *   BSSBasicRateSet
+	 */
+	hdr = (struct ieee80211_hdr *)tx->skb->data;
+	if (ieee80211_is_ctl(hdr->frame_control)) {
+		/* TODO: These control frames are not currently sent by
+		 * mac80211, but should they be implemented, this function
+		 * needs to be updated to support duration field calculation.
+		 *
+		 * RTS: time needed to transmit pending data/mgmt frame plus
+		 *    one CTS frame plus one ACK frame plus 3 x SIFS
+		 * CTS: duration of immediately previous RTS minus time
+		 *    required to transmit CTS and its SIFS
+		 * ACK: 0 if immediately previous directed data/mgmt had
+		 *    more=0, with more=1 duration in ACK frame is duration
+		 *    from previous frame minus time needed to transmit ACK
+		 *    and its SIFS
+		 * PS Poll: BIT(15) | BIT(14) | aid
+		 */
+		return 0;
+	}
+
+	/* data/mgmt */
+	if (0 /* FIX: data/mgmt during CFP */)
+		return cpu_to_le16(32768);
+
+	if (group_addr) /* Group address as the destination - no ACK */
+		return 0;
+
+	/* Individual destination address:
+	 * IEEE 802.11, Ch. 9.6 (after IEEE 802.11g changes)
+	 * CTS and ACK frames shall be transmitted using the highest rate in
+	 * basic rate set that is less than or equal to the rate of the
+	 * immediately previous frame and that is using the same modulation
+	 * (CCK or OFDM). If no basic rate set matches with these requirements,
+	 * the highest mandatory rate of the PHY that is less than or equal to
+	 * the rate of the previous frame is used.
+	 * Mandatory rates for IEEE 802.11g PHY: 1, 2, 5.5, 11, 6, 12, 24 Mbps
+	 */
+	rate = -1;
+	/* use lowest available if everything fails */
+	mrate = sband->bitrates[0].bitrate;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		struct ieee80211_rate *r = &sband->bitrates[i];
+
+		if (r->bitrate > txrate->bitrate)
+			break;
+
+		if (tx->sdata->vif.bss_conf.basic_rates & BIT(i))
+			rate = r->bitrate;
+
+		switch (sband->band) {
+		case IEEE80211_BAND_2GHZ: {
+			u32 flag;
+			if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+				flag = IEEE80211_RATE_MANDATORY_G;
+			else
+				flag = IEEE80211_RATE_MANDATORY_B;
+			if (r->flags & flag)
+				mrate = r->bitrate;
+			break;
+		}
+		case IEEE80211_BAND_5GHZ:
+			if (r->flags & IEEE80211_RATE_MANDATORY_A)
+				mrate = r->bitrate;
+			break;
+		case IEEE80211_NUM_BANDS:
+			WARN_ON(1);
+			break;
+		}
+	}
+	if (rate == -1) {
+		/* No matching basic rate found; use highest suitable mandatory
+		 * PHY rate */
+		rate = mrate;
+	}
+
+	/* Time needed to transmit ACK
+	 * (10 bytes + 4-byte FCS = 112 bits) plus SIFS; rounded up
+	 * to closest integer */
+
+	dur = ieee80211_frame_duration(local, 10, rate, erp,
+				tx->sdata->vif.bss_conf.use_short_preamble);
+
+	if (next_frag_len) {
+		/* Frame is fragmented: duration increases with time needed to
+		 * transmit next fragment plus ACK and 2 x SIFS. */
+		dur *= 2; /* ACK + SIFS */
+		/* next fragment */
+		dur += ieee80211_frame_duration(local, next_frag_len,
+				txrate->bitrate, erp,
+				tx->sdata->vif.bss_conf.use_short_preamble);
+	}
+
+	return cpu_to_le16(dur);
+}
+
+static inline int is_ieee80211_device(struct ieee80211_local *local,
+				      struct net_device *dev)
+{
+	return wdev_priv(dev->ieee80211_ptr) == local;
+}
+
+/* tx handlers */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_local *local = tx->local;
+	struct ieee80211_if_managed *ifmgd;
+
+	/* driver doesn't support power save */
+	if (!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS))
+		return TX_CONTINUE;
+
+	/* hardware does dynamic power save */
+	if (local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS)
+		return TX_CONTINUE;
+
+	/* dynamic power save disabled */
+	if (local->hw.conf.dynamic_ps_timeout <= 0)
+		return TX_CONTINUE;
+
+	/* we are scanning, don't enable power save */
+	if (local->scanning)
+		return TX_CONTINUE;
+
+	if (!local->ps_sdata)
+		return TX_CONTINUE;
+
+	/* No point if we're going to suspend */
+	if (local->quiescing)
+		return TX_CONTINUE;
+
+	/* dynamic ps is supported only in managed mode */
+	if (tx->sdata->vif.type != NL80211_IFTYPE_STATION)
+		return TX_CONTINUE;
+
+	ifmgd = &tx->sdata->u.mgd;
+
+	/*
+	 * Don't wakeup from power save if u-apsd is enabled, voip ac has
+	 * u-apsd enabled and the frame is in voip class. This effectively
+	 * means that even if all access categories have u-apsd enabled, in
+	 * practice u-apsd is only used with the voip ac. This is a
+	 * workaround for the case when received voip class packets do not
+	 * have correct qos tag for some reason, due to the network or the
+	 * peer application.
+	 *
+	 * Note: local->uapsd_queues access is racy here. If the value is
+	 * changed via debugfs, user needs to reassociate manually to have
+	 * everything in sync.
+	 */
+	if ((ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED)
+	    && (local->uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
+	    && skb_get_queue_mapping(tx->skb) == 0)
+		return TX_CONTINUE;
+
+	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
+		ieee80211_stop_queues_by_reason(&local->hw,
+						IEEE80211_QUEUE_STOP_REASON_PS);
+		ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
+		ieee80211_queue_work(&local->hw,
+				     &local->dynamic_ps_disable_work);
+	}
+
+	/* Don't restart the timer if we're not associated */
+	if (!ifmgd->associated)
+		return TX_CONTINUE;
+
+	mod_timer(&local->dynamic_ps_timer, jiffies +
+		  msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
+
+	return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
+{
+
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	u32 sta_flags;
+
+	if (unlikely(info->flags & IEEE80211_TX_CTL_INJECTED))
+		return TX_CONTINUE;
+
+	if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) &&
+	    test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) &&
+	    !ieee80211_is_probe_req(hdr->frame_control) &&
+	    !ieee80211_is_nullfunc(hdr->frame_control))
+		/*
+		 * When software scanning, only nullfunc frames (to notify
+		 * the sleep state to the AP) and probe requests (for the
+		 * active scan) are allowed; all other frames should not be
+		 * sent and we should not get here, but if we do
+		 * nonetheless, drop them to avoid sending them
+		 * off-channel. See the link below and
+		 * ieee80211_start_scan() for more.
+		 *
+		 * http://article.gmane.org/gmane.linux.kernel.wireless.general/30089
+		 */
+		return TX_DROP;
+
+	if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
+		return TX_CONTINUE;
+
+	if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
+		return TX_CONTINUE;
+
+	if (tx->flags & IEEE80211_TX_PS_BUFFERED)
+		return TX_CONTINUE;
+
+	sta_flags = tx->sta ? get_sta_flags(tx->sta) : 0;
+
+	if (likely(tx->flags & IEEE80211_TX_UNICAST)) {
+		if (unlikely(!(sta_flags & WLAN_STA_ASSOC) &&
+			     tx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+			     ieee80211_is_data(hdr->frame_control))) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+			printk(KERN_DEBUG "%s: dropped data frame to not "
+			       "associated station %pM\n",
+			       tx->sdata->name, hdr->addr1);
+#endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
+			I802_DEBUG_INC(tx->local->tx_handlers_drop_not_assoc);
+			return TX_DROP;
+		}
+	} else {
+		if (unlikely(ieee80211_is_data(hdr->frame_control) &&
+			     tx->local->num_sta == 0 &&
+			     tx->sdata->vif.type != NL80211_IFTYPE_ADHOC)) {
+			/*
+			 * No associated STAs - no need to send multicast
+			 * frames.
+			 */
+			return TX_DROP;
+		}
+		return TX_CONTINUE;
+	}
+
+	return TX_CONTINUE;
+}
+
+/* Called whenever the AP is about to exceed the maximum limit of buffered
+ * frames for power saving STAs. This should not really happen often during
+ * normal operation, so dropping the oldest buffered packet from each queue is
+ * OK to make room. local->total_ps_buffered is recounted from the queues. */
+static void purge_old_ps_buffers(struct ieee80211_local *local)
+{
+	int total = 0, purged = 0;
+	struct sk_buff *skb;
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+
+	/*
+	 * virtual interfaces are protected by RCU
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		struct ieee80211_if_ap *ap;
+		if (sdata->vif.type != NL80211_IFTYPE_AP)
+			continue;
+		ap = &sdata->u.ap;
+		skb = skb_dequeue(&ap->ps_bc_buf);
+		if (skb) {
+			purged++;
+			dev_kfree_skb(skb);
+		}
+		total += skb_queue_len(&ap->ps_bc_buf);
+	}
+
+	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+		skb = skb_dequeue(&sta->ps_tx_buf);
+		if (skb) {
+			purged++;
+			dev_kfree_skb(skb);
+		}
+		total += skb_queue_len(&sta->ps_tx_buf);
+	}
+
+	rcu_read_unlock();
+
+	local->total_ps_buffered = total;
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n",
+		    purged);
+#endif
+}
+
+static ieee80211_tx_result
+ieee80211_tx_h_multicast_ps_buf(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+
+	/*
+	 * broadcast/multicast frame
+	 *
+	 * If any of the associated stations is in power save mode,
+	 * the frame is buffered to be sent after DTIM beacon frame.
+	 * This is done either by the hardware or us.
+	 */
+
+	/* powersaving STAs only in AP/VLAN mode */
+	if (!tx->sdata->bss)
+		return TX_CONTINUE;
+
+	/* no buffering for ordered frames */
+	if (ieee80211_has_order(hdr->frame_control))
+		return TX_CONTINUE;
+
+	/* no stations in PS mode */
+	if (!atomic_read(&tx->sdata->bss->num_sta_ps))
+		return TX_CONTINUE;
+
+	info->flags |= IEEE80211_TX_CTL_SEND_AFTER_DTIM;
+
+	/* device releases frame after DTIM beacon */
+	if (!(tx->local->hw.flags & IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING))
+		return TX_CONTINUE;
+
+	/* buffered in mac80211: make room first if the global limit is hit */
+	if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
+		purge_old_ps_buffers(tx->local);
+
+	if (skb_queue_len(&tx->sdata->bss->ps_bc_buf) >= AP_MAX_BC_BUFFER) {
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+		if (net_ratelimit())
+			printk(KERN_DEBUG "%s: BC TX buffer full - dropping the oldest frame\n",
+			       tx->sdata->name);
+#endif
+		dev_kfree_skb(skb_dequeue(&tx->sdata->bss->ps_bc_buf));
+	} else
+		tx->local->total_ps_buffered++;
+
+	skb_queue_tail(&tx->sdata->bss->ps_bc_buf, tx->skb);
+
+	return TX_QUEUED;
+}
+
+static int ieee80211_use_mfp(__le16 fc, struct sta_info *sta,
+			     struct sk_buff *skb)
+{
+	/* not a management frame: MFP is not used */
+	if (!ieee80211_is_mgmt(fc))
+		return 0;
+
+	/* the peer must be known and have WLAN_STA_MFP set */
+	if (!sta || !test_sta_flags(sta, WLAN_STA_MFP))
+		return 0;
+
+	/* finally, the frame itself must be a robust management frame */
+	return ieee80211_is_robust_mgmt_frame((struct ieee80211_hdr *)
+					      skb->data) ? 1 : 0;
+}
+
+static ieee80211_tx_result
+ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
+{
+	struct sta_info *sta = tx->sta;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_local *local = tx->local;
+	u32 staflags;
+
+	if (unlikely(!sta ||
+		     ieee80211_is_probe_resp(hdr->frame_control) ||
+		     ieee80211_is_auth(hdr->frame_control) ||
+		     ieee80211_is_assoc_resp(hdr->frame_control) ||
+		     ieee80211_is_reassoc_resp(hdr->frame_control)))
+		return TX_CONTINUE;
+
+	staflags = get_sta_flags(sta);
+
+	if (unlikely((staflags & (WLAN_STA_PS_STA | WLAN_STA_PS_DRIVER)) &&
+		     !(info->flags & IEEE80211_TX_CTL_PSPOLL_RESPONSE))) {
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+		printk(KERN_DEBUG "STA %pM aid %d: PS buffer (entries "
+		       "before %d)\n",
+		       sta->sta.addr, sta->sta.aid,
+		       skb_queue_len(&sta->ps_tx_buf));
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+		if (tx->local->total_ps_buffered >= TOTAL_MAX_TX_BUFFER)
+			purge_old_ps_buffers(tx->local);
+		if (skb_queue_len(&sta->ps_tx_buf) >= STA_MAX_TX_BUFFER) {
+			struct sk_buff *old = skb_dequeue(&sta->ps_tx_buf);
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+			if (net_ratelimit()) {
+				printk(KERN_DEBUG "%s: STA %pM TX "
+				       "buffer full - dropping oldest frame\n",
+				       tx->sdata->name, sta->sta.addr);
+			}
+#endif
+			dev_kfree_skb(old);
+		} else
+			tx->local->total_ps_buffered++;
+
+		/*
+		 * Queue frame to be sent after STA wakes up/polls. Set the TIM
+		 * bit only when the queue was empty, and not if the driver is
+		 * blocking wakeup or poll response transmissions anyway.
+		 */
+		if (skb_queue_empty(&sta->ps_tx_buf) &&
+		    !(staflags & WLAN_STA_PS_DRIVER))
+			sta_info_set_tim_bit(sta);
+
+		info->control.jiffies = jiffies;
+		info->control.vif = &tx->sdata->vif;
+		info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+		skb_queue_tail(&sta->ps_tx_buf, tx->skb);
+
+		if (!timer_pending(&local->sta_cleanup))
+			mod_timer(&local->sta_cleanup,
+				  round_jiffies(jiffies +
+						STA_INFO_CLEANUP_INTERVAL));
+
+		return TX_QUEUED;
+	}
+#ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
+	else if (unlikely(staflags & WLAN_STA_PS_STA)) {
+		printk(KERN_DEBUG "%s: STA %pM in PS mode, but pspoll "
+		       "set -> send frame\n", tx->sdata->name,
+		       sta->sta.addr);
+	}
+#endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
+
+	return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
+{
+	/* IEEE80211_TX_PS_BUFFERED set: skip the PS buffering handlers */
+	if (unlikely(tx->flags & IEEE80211_TX_PS_BUFFERED))
+		return TX_CONTINUE;
+
+	return (tx->flags & IEEE80211_TX_UNICAST) ?
+		ieee80211_tx_h_unicast_ps_buf(tx) :
+		ieee80211_tx_h_multicast_ps_buf(tx);
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
+	/* control port frames are flagged DONT_ENCRYPT if so configured */
+	if (unlikely(tx->skb->protocol == tx->sdata->control_port_protocol &&
+		     tx->sdata->control_port_no_encrypt))
+		info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_key *key = NULL;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+
+	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
+		tx->key = NULL;
+	else if (tx->sta && (key = rcu_dereference(tx->sta->ptk)))
+		tx->key = key;
+	else if (ieee80211_is_mgmt(hdr->frame_control) &&
+		 is_multicast_ether_addr(hdr->addr1) &&
+		 ieee80211_is_robust_mgmt_frame(hdr) &&
+		 (key = rcu_dereference(tx->sdata->default_mgmt_key)))
+		tx->key = key;
+	else if (is_multicast_ether_addr(hdr->addr1) &&
+		 (key = rcu_dereference(tx->sdata->default_multicast_key)))
+		tx->key = key;
+	else if (!is_multicast_ether_addr(hdr->addr1) &&
+		 (key = rcu_dereference(tx->sdata->default_unicast_key)))
+		tx->key = key;
+	else if (tx->sdata->drop_unencrypted &&
+		 (tx->skb->protocol != tx->sdata->control_port_protocol) &&
+		 !(info->flags & IEEE80211_TX_CTL_INJECTED) &&
+		 (!ieee80211_is_robust_mgmt_frame(hdr) ||
+		  (ieee80211_is_action(hdr->frame_control) &&
+		   tx->sta && test_sta_flags(tx->sta, WLAN_STA_MFP)))) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop_unencrypted);
+		return TX_DROP;
+	} else
+		tx->key = NULL;
+
+	if (tx->key) {
+		bool skip_hw = false;
+
+		tx->key->tx_rx_count++;
+		/* TODO: add threshold stuff again */
+
+		switch (tx->key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_WEP40:
+		case WLAN_CIPHER_SUITE_WEP104:
+			if (ieee80211_is_auth(hdr->frame_control))
+				break;
+		case WLAN_CIPHER_SUITE_TKIP:	/* non-auth WEP falls through */
+			if (!ieee80211_is_data_present(hdr->frame_control))
+				tx->key = NULL;
+			break;
+		case WLAN_CIPHER_SUITE_CCMP:
+			if (!ieee80211_is_data_present(hdr->frame_control) &&
+			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
+					       tx->skb))
+				tx->key = NULL;
+			else
+				skip_hw = (tx->key->conf.flags &
+					   IEEE80211_KEY_FLAG_SW_MGMT) &&
+					ieee80211_is_mgmt(hdr->frame_control);
+			break;
+		case WLAN_CIPHER_SUITE_AES_CMAC:
+			if (!ieee80211_is_mgmt(hdr->frame_control))
+				tx->key = NULL;
+			break;
+		}
+
+		if (!skip_hw && tx->key &&
+		    tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
+			info->control.hw_key = &tx->key->conf;
+	}
+
+	return TX_CONTINUE;
+}
+
+/*
+ * TX handler: ask the rate control algorithm for the rates of this frame
+ * and post-process info->control.rates[] afterwards.
+ *
+ * Fills a struct ieee80211_tx_rate_control from @tx, calls
+ * rate_control_get_rate(), then selects the RTS/CTS rate index and sets the
+ * RTS/CTS, short-preamble and CTS-protection flags on every non-MCS rate
+ * entry.
+ *
+ * Returns TX_DROP if the "no usable bitrate" warning below fires or if the
+ * first rate entry is still invalid after rate control ran, TX_CONTINUE
+ * otherwise.
+ */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	struct ieee80211_hdr *hdr = (void *)tx->skb->data;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *rate;
+	int i;
+	u32 len;
+	bool inval = false, rts = false, short_preamble = false;
+	struct ieee80211_tx_rate_control txrc;
+	u32 sta_flags;
+
+	memset(&txrc, 0, sizeof(txrc));
+
+	sband = tx->local->hw.wiphy->bands[tx->channel->band];
+
+	/*
+	 * length compared against the RTS threshold below: frame plus FCS,
+	 * capped at the fragmentation threshold
+	 */
+	len = min_t(u32, tx->skb->len + FCS_LEN,
+			 tx->local->hw.wiphy->frag_threshold);
+
+	/* set up the tx rate control struct we give the RC algo */
+	txrc.hw = local_to_hw(tx->local);
+	txrc.sband = sband;
+	txrc.bss_conf = &tx->sdata->vif.bss_conf;
+	txrc.skb = tx->skb;
+	txrc.reported_rate.idx = -1;
+	txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[tx->channel->band];
+	/* a mask with every rate of the band set means "no upper limit" */
+	if (txrc.rate_idx_mask == (1 << sband->n_bitrates) - 1)
+		txrc.max_rate_idx = -1;
+	else
+		txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1;
+	txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP ||
+		    tx->sdata->vif.type == NL80211_IFTYPE_ADHOC);
+
+	/* set up RTS protection if desired */
+	if (len > tx->local->hw.wiphy->rts_threshold) {
+		txrc.rts = rts = true;
+	}
+
+	/*
+	 * Use short preamble if the BSS can handle it, but not for
+	 * management frames unless we know the receiver can handle
+	 * that -- the management frame might be to a station that
+	 * just wants a probe response.
+	 */
+	if (tx->sdata->vif.bss_conf.use_short_preamble &&
+	    (ieee80211_is_data(hdr->frame_control) ||
+	     (tx->sta && test_sta_flags(tx->sta, WLAN_STA_SHORT_PREAMBLE))))
+		txrc.short_preamble = short_preamble = true;
+
+	/* sta_flags is 0 without a station, so the ASSOC test below fails */
+	sta_flags = tx->sta ? get_sta_flags(tx->sta) : 0;
+
+	/*
+	 * Lets not bother rate control if we're associated and cannot
+	 * talk to the sta. This should not happen.
+	 */
+	if (WARN(test_bit(SCAN_SW_SCANNING, &tx->local->scanning) &&
+		 (sta_flags & WLAN_STA_ASSOC) &&
+		 !rate_usable_index_exists(sband, &tx->sta->sta),
+		 "%s: Dropped data frame as no usable bitrate found while "
+		 "scanning and associated. Target station: "
+		 "%pM on %d GHz band\n",
+		 tx->sdata->name, hdr->addr1,
+		 tx->channel->band ? 5 : 2))
+		return TX_DROP;
+
+	/*
+	 * If we're associated with the sta at this point we know we can at
+	 * least send the frame at the lowest bit rate.
+	 */
+	rate_control_get_rate(tx->sdata, tx->sta, &txrc);
+
+	if (unlikely(info->control.rates[0].idx < 0))
+		return TX_DROP;
+
+	/*
+	 * If rate control reported no rate, fall back to the first selected
+	 * one; in that case it is stored on the station for data frames only.
+	 */
+	if (txrc.reported_rate.idx < 0) {
+		txrc.reported_rate = info->control.rates[0];
+		if (tx->sta && ieee80211_is_data(hdr->frame_control))
+			tx->sta->last_tx_rate = txrc.reported_rate;
+	} else if (tx->sta)
+		tx->sta->last_tx_rate = txrc.reported_rate;
+
+	/* force a count of at least one on the first rate */
+	if (unlikely(!info->control.rates[0].count))
+		info->control.rates[0].count = 1;
+
+	/*
+	 * with IEEE80211_TX_CTL_NO_ACK set a count above one is unexpected:
+	 * warn once and clamp it to one
+	 */
+	if (WARN_ON_ONCE((info->control.rates[0].count > 1) &&
+			 (info->flags & IEEE80211_TX_CTL_NO_ACK)))
+		info->control.rates[0].count = 1;
+
+	if (is_multicast_ether_addr(hdr->addr1)) {
+		/*
+		 * XXX: verify the rate is in the basic rateset
+		 */
+		return TX_CONTINUE;
+	}
+
+	/*
+	 * set up the RTS/CTS rate as the fastest basic rate
+	 * that is not faster than the data rate
+	 *
+	 * XXX: Should this check all retry rates?
+	 */
+	if (!(info->control.rates[0].flags & IEEE80211_TX_RC_MCS)) {
+		/*
+		 * NOTE(review): baserate starts at index 0, so index 0 is
+		 * used when no basic rate qualifies -- confirm intended.
+		 */
+		s8 baserate = 0;
+
+		rate = &sband->bitrates[info->control.rates[0].idx];
+
+		for (i = 0; i < sband->n_bitrates; i++) {
+			/* must be a basic rate */
+			if (!(tx->sdata->vif.bss_conf.basic_rates & BIT(i)))
+				continue;
+			/* must not be faster than the data rate */
+			if (sband->bitrates[i].bitrate > rate->bitrate)
+				continue;
+			/* maximum */
+			if (sband->bitrates[baserate].bitrate <
+			     sband->bitrates[i].bitrate)
+				baserate = i;
+		}
+
+		info->control.rts_cts_rate_idx = baserate;
+	}
+
+	for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+		/*
+		 * make sure there's no valid rate following
+		 * an invalid one, just in case drivers don't
+		 * take the API seriously to stop at -1.
+		 */
+		if (inval) {
+			info->control.rates[i].idx = -1;
+			continue;
+		}
+		if (info->control.rates[i].idx < 0) {
+			inval = true;
+			continue;
+		}
+
+		/*
+		 * For now assume MCS is already set up correctly, this
+		 * needs to be fixed.
+		 */
+		if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS) {
+			WARN_ON(info->control.rates[i].idx > 76);
+			continue;
+		}
+
+		/* set up RTS protection if desired */
+		if (rts)
+			info->control.rates[i].flags |=
+				IEEE80211_TX_RC_USE_RTS_CTS;
+
+		/* RC is busted */
+		if (WARN_ON_ONCE(info->control.rates[i].idx >=
+				 sband->n_bitrates)) {
+			info->control.rates[i].idx = -1;
+			continue;
+		}
+
+		rate = &sband->bitrates[info->control.rates[i].idx];
+
+		/* set up short preamble */
+		if (short_preamble &&
+		    rate->flags & IEEE80211_RATE_SHORT_PREAMBLE)
+			info->control.rates[i].flags |=
+				IEEE80211_TX_RC_USE_SHORT_PREAMBLE;
+
+		/* set up G protection */
+		if (!rts && tx->sdata->vif.bss_conf.use_cts_prot &&
+		    rate->flags & IEEE80211_RATE_ERP_G)
+			info->control.rates[i].flags |=
+				IEEE80211_TX_RC_USE_CTS_PROTECT;
+	}
+
+	return TX_CONTINUE;
+}
+
+/*
+ * TX handler: assign the 802.11 sequence number.
+ *
+ * Frames sent on a monitor vif, control frames and frames whose header is
+ * shorter than 24 bytes are left untouched. Everything that is not QoS data
+ * takes its number from the per-interface counter and is additionally
+ * flagged IEEE80211_TX_CTL_ASSIGN_SEQ; QoS data frames with a known station
+ * use that station's per-TID counter.
+ *
+ * Always returns TX_CONTINUE.
+ */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+	__le16 fc = hdr->frame_control;
+	u16 *tid_seq;
+
+	/*
+	 * Packet injection (monitor vif) may want to control the sequence
+	 * number itself; control frames and headers shorter than 24 bytes
+	 * are not given one either.
+	 */
+	if (unlikely(info->control.vif->type == NL80211_IFTYPE_MONITOR) ||
+	    unlikely(ieee80211_is_ctl(fc)) ||
+	    ieee80211_hdrlen(fc) < 24)
+		return TX_CONTINUE;
+
+	/* anything but QoS data is numbered from the interface-wide counter */
+	if (!ieee80211_is_data_qos(fc)) {
+		struct ieee80211_sub_if_data *sdata = tx->sdata;
+
+		/* the driver is asked to assign a sequence number ... */
+		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
+		/* ... but for pure STA mode without beacons ours will do */
+		hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number += 0x10;
+		return TX_CONTINUE;
+	}
+
+	/*
+	 * QoS data without a station should only happen for injected
+	 * frames; management frames were handled above because they are
+	 * not QoS data.
+	 */
+	if (!tx->sta)
+		return TX_CONTINUE;
+
+	/* per-STA counter selected by the TID from the QoS control field */
+	tid_seq = &tx->sta->tid_seq[*ieee80211_get_qos_ctl(hdr) &
+				    IEEE80211_QOS_CTL_TID_MASK];
+
+	hdr->seq_ctrl = cpu_to_le16(*tid_seq);
+
+	/* step the counter by 0x10, keeping only the IEEE80211_SCTL_SEQ bits */
+	*tid_seq = (*tid_seq + 0x10) & IEEE80211_SCTL_SEQ;
+
+	return TX_CONTINUE;
+}
+
+/*
+ * Split @skb into a chain of fragments linked through skb->next.
+ *
+ * @skb stays the first fragment and keeps the first
+ * (@frag_threshold - @hdrlen - FCS_LEN) payload bytes. Every following
+ * fragment is a newly allocated skb that receives a copy of the control
+ * block, queue mapping, priority, device and 802.11 header of @skb plus its
+ * slice of the payload.
+ *
+ * Returns 0 on success, -EINVAL if @skb is shorter than one full fragment,
+ * -ENOMEM if an allocation failed. On -ENOMEM the fragments allocated so far
+ * stay chained to @skb; freeing the chain is left to the caller.
+ */
+static int ieee80211_fragment(struct ieee80211_local *local,
+			      struct sk_buff *skb, int hdrlen,
+			      int frag_threshold)
+{
+	struct sk_buff *tail = skb, *tmp;
+	int per_fragm = frag_threshold - hdrlen - FCS_LEN;
+	int pos = hdrlen + per_fragm;
+	int rem = skb->len - hdrlen - per_fragm;
+
+	if (WARN_ON(rem < 0))
+		return -EINVAL;
+
+	while (rem) {
+		int fraglen = per_fragm;
+
+		/* the last fragment carries whatever is left */
+		if (fraglen > rem)
+			fraglen = rem;
+		rem -= fraglen;
+		tmp = dev_alloc_skb(local->tx_headroom +
+				    frag_threshold +
+				    IEEE80211_ENCRYPT_HEADROOM +
+				    IEEE80211_ENCRYPT_TAILROOM);
+		if (!tmp)
+			return -ENOMEM;
+		tail->next = tmp;
+		tail = tmp;
+		skb_reserve(tmp, local->tx_headroom +
+				 IEEE80211_ENCRYPT_HEADROOM);
+		/* copy control information */
+		memcpy(tmp->cb, skb->cb, sizeof(tmp->cb));
+		skb_copy_queue_mapping(tmp, skb);
+		tmp->priority = skb->priority;
+		tmp->dev = skb->dev;
+
+		/* copy header and data */
+		memcpy(skb_put(tmp, hdrlen), skb->data, hdrlen);
+		memcpy(skb_put(tmp, fraglen), skb->data + pos, fraglen);
+
+		pos += fraglen;
+	}
+
+	/*
+	 * Shorten the first fragment with skb_trim() rather than by writing
+	 * skb->len directly, so that the tail pointer moves together with
+	 * the length and later tailroom calculations (e.g. for encryption)
+	 * see the trimmed size.
+	 */
+	skb_trim(skb, hdrlen + per_fragm);
+	return 0;
+}
+
+/*
+ * TX handler: software fragmentation.
+ *
+ * Does nothing unless IEEE80211_TX_FRAGMENTED is set in tx->flags. Otherwise
+ * the frame is split by ieee80211_fragment() and the more-fragments bit,
+ * duration and fragment number of every fragment are filled in.
+ *
+ * Returns TX_DROP for an A-MPDU frame, for a frame that does not exceed the
+ * fragmentation threshold, or when splitting fails; TX_CONTINUE otherwise.
+ */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
+{
+	const __le16 morefrags = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
+	struct sk_buff *first = tx->skb, *frag;
+	struct ieee80211_hdr *hdr = (void *)first->data;
+	int threshold = tx->local->hw.wiphy->frag_threshold;
+	int fragnum;
+
+	if (!(tx->flags & IEEE80211_TX_FRAGMENTED))
+		return TX_CONTINUE;
+
+	/*
+	 * A fragmented A-MPDU frame should already have been ruled out in
+	 * ieee80211_tx_prepare; warn and drop it here as an extra safeguard
+	 * since it may cause Tx stop.
+	 */
+	if (WARN_ON(IEEE80211_SKB_CB(first)->flags & IEEE80211_TX_CTL_AMPDU))
+		return TX_DROP;
+
+	/* internal error, why is TX_FRAGMENTED set? */
+	if (WARN_ON(first->len + FCS_LEN <= threshold))
+		return TX_DROP;
+
+	/*
+	 * Allocate all fragments and chain them to first->next, with the
+	 * original skb acting as the first fragment. Fragments are taken off
+	 * this list as they are transmitted; if the low-level driver rejects
+	 * one, the skb is still accepted but stored away as pending.
+	 */
+	if (ieee80211_fragment(tx->local, first,
+			       ieee80211_hdrlen(hdr->frame_control), threshold))
+		return TX_DROP;
+
+	/* update duration/seq/flags of fragments */
+	for (frag = first, fragnum = 0; frag; frag = frag->next, fragnum++) {
+		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(frag);
+		int next_len = 0;
+
+		hdr = (void *)frag->data;
+
+		if (!frag->next) {
+			/* last fragment */
+			hdr->frame_control &= ~morefrags;
+		} else {
+			hdr->frame_control |= morefrags;
+			next_len = frag->next->len;
+			/*
+			 * No multi-rate retries for fragmented frames, that
+			 * would completely throw off the NAV at other STAs.
+			 */
+			BUILD_BUG_ON(IEEE80211_TX_MAX_RATES != 5);
+			info->control.rates[1].idx = -1;
+			info->control.rates[2].idx = -1;
+			info->control.rates[3].idx = -1;
+			info->control.rates[4].idx = -1;
+			info->flags &= ~IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+		}
+
+		hdr->duration_id = ieee80211_duration(tx, 0, next_len);
+		hdr->seq_ctrl |= cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
+	}
+
+	return TX_CONTINUE;
+}
+
+/*
+ * TX handler: per-station transmit statistics.
+ *
+ * Counts one packet for tx->sta and, for every skb in the chain starting at
+ * tx->skb, one fragment plus its length in bytes. Frames without a station
+ * are not counted. Always returns TX_CONTINUE.
+ */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
+{
+	struct sta_info *sta = tx->sta;
+	struct sk_buff *frag = tx->skb;
+
+	if (!sta)
+		return TX_CONTINUE;
+
+	sta->tx_packets++;
+
+	/* walk the fragment chain linked through skb->next */
+	do {
+		sta->tx_fragments++;
+		sta->tx_bytes += frag->len;
+		frag = frag->next;
+	} while (frag);
+
+	return TX_CONTINUE;
+}
+
+/*
+ * TX handler: dispatch to the encryption routine of the selected key.
+ *
+ * Without a key the frame passes through unchanged. For WEP, TKIP, CCMP and
+ * AES-CMAC the result of the matching ieee80211_crypto_*_encrypt() routine
+ * is returned. Any other cipher is accepted only if a hardware key was set
+ * in the tx info; otherwise TX_DROP is returned.
+ */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx)
+{
+	if (!tx->key)
+		return TX_CONTINUE;
+
+	switch (tx->key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		return ieee80211_crypto_wep_encrypt(tx);
+	case WLAN_CIPHER_SUITE_TKIP:
+		return ieee80211_crypto_tkip_encrypt(tx);
+	case WLAN_CIPHER_SUITE_CCMP:
+		return ieee80211_crypto_ccmp_encrypt(tx);
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		return ieee80211_crypto_aes_cmac_encrypt(tx);
+	default:
+		break;
+	}
+
+	/* handle hw-only algorithm: without a hardware key we must drop */
+	if (!IEEE80211_SKB_CB(tx->skb)->control.hw_key)
+		return TX_DROP;
+
+	ieee80211_tx_set_protected(tx);
+	return TX_CONTINUE;
+}
+
+/*
+ * TX handler: fill in the duration field of every skb in the chain.
+ *
+ * The duration is computed by ieee80211_duration() from whether addr1 is a
+ * group address and from the length of the following fragment (0 for the
+ * last one). The walk stops at the first PS-Poll frame, whose field must be
+ * left alone. Always returns TX_CONTINUE.
+ */
+static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_calculate_duration(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *frag = tx->skb;
+
+	do {
+		struct ieee80211_hdr *hdr = (void *) frag->data;
+		struct sk_buff *following = frag->next;
+		bool group_addr;
+
+		/* must not overwrite AID */
+		if (unlikely(ieee80211_is_pspoll(hdr->frame_control)))
+			break;
+
+		group_addr = is_multicast_ether_addr(hdr->addr1);
+		hdr->duration_id =
+			ieee80211_duration(tx, group_addr,
+					   following ? following->len : 0);
+
+		frag = following;
+	} while (frag);
+
+	return TX_CONTINUE;
+}
+
+/* actual transmit path */
+
+/*
+ * Interpret and strip the radiotap header of a frame injected on a monitor
+ * interface -- only called for monitor mode interfaces.
+ *
+ * The radiotap header must sit at the start of @skb. Its FLAGS field may
+ * request FCS removal, encryption and (software) fragmentation; all other
+ * fields are ignored. Radiotap arguments are little-endian, so care is
+ * needed with endian-ness when adding fields.
+ *
+ * Returns true with the radiotap header pulled off @skb, or false if the
+ * header could not be parsed or the frame is too short for a claimed FCS.
+ */
+static bool __ieee80211_parse_tx_radiotap(struct ieee80211_tx_data *tx,
+					  struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_radiotap_iterator iterator;
+	/* packet is fragmented in HW if we have a non-NULL driver callback */
+	const bool hw_frag = tx->local->ops->set_frag_threshold != NULL;
+	int ret;
+
+	ret = ieee80211_radiotap_iterator_init(&iterator,
+			(struct ieee80211_radiotap_header *) skb->data,
+			skb->len, NULL);
+
+	/* defaults for injected frames; the FLAGS field may override them */
+	info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	tx->flags &= ~IEEE80211_TX_FRAGMENTED;
+
+	/*
+	 * Visit every radiotap entry that is present;
+	 * ieee80211_radiotap_iterator_next returns -ENOENT when there are
+	 * no more entries, or -EINVAL on error.
+	 */
+	while (!ret) {
+		u8 rt_flags;
+
+		ret = ieee80211_radiotap_iterator_next(&iterator);
+		if (ret)
+			break;
+
+		/*
+		 * FLAGS is the only argument we can use. When parsing new
+		 * fields here, please update
+		 * Documentation/networking/mac80211-injection.txt and take
+		 * care with multibyte types: iterator.this_arg is not
+		 * aligned, so use get_unaligned((type *)iterator.this_arg)
+		 * to dereference it safely on all arches.
+		 */
+		if (iterator.this_arg_index != IEEE80211_RADIOTAP_FLAGS)
+			continue;
+
+		rt_flags = *iterator.this_arg;
+
+		if (rt_flags & IEEE80211_RADIOTAP_F_FCS) {
+			/*
+			 * The skb we were handed has the 32-bit FCS CRC at
+			 * the end; snip it off because it will be recomputed
+			 * and added on transmission.
+			 */
+			if (skb->len < (iterator._max_length + FCS_LEN))
+				return false;
+
+			skb_trim(skb, skb->len - FCS_LEN);
+		}
+		if (rt_flags & IEEE80211_RADIOTAP_F_WEP)
+			info->flags &= ~IEEE80211_TX_INTFL_DONT_ENCRYPT;
+		if ((rt_flags & IEEE80211_RADIOTAP_F_FRAG) && !hw_frag)
+			tx->flags |= IEEE80211_TX_FRAGMENTED;
+	}
+
+	/* anything but "ran out of fields" is a parse failure */
+	if (ret != -ENOENT)
+		return false;
+
+	/*
+	 * remove the radiotap header
+	 * iterator->_max_length was sanity-checked against
+	 * skb->len by iterator init
+	 */
+	skb_pull(skb, iterator._max_length);
+
+	return true;
+}
+
+/*
+ * Decide how a frame is handled with respect to the TX aggregation session
+ * @tid_tx of TID @tid on tx->sta.
+ *
+ * Sets IEEE80211_TX_CTL_AMPDU in @info when the session is operational.
+ * When the session is neither operational nor waiting to start, the session
+ * pointer is re-read under tx->sta->lock and the frame may be put on the
+ * session's pending queue.
+ *
+ * Returns true if @skb was queued on tid_tx->pending (the visible caller
+ * then returns TX_QUEUED), false if the frame continues down the TX path.
+ */
+static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
+				  struct sk_buff *skb,
+				  struct ieee80211_tx_info *info,
+				  struct tid_ampdu_tx *tid_tx,
+				  int tid)
+{
+	bool queued = false;
+
+	if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+		info->flags |= IEEE80211_TX_CTL_AMPDU;
+	} else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) {
+		/*
+		 * nothing -- this aggregation session is being started
+		 * but that might still fail with the driver
+		 */
+	} else {
+		spin_lock(&tx->sta->lock);
+		/*
+		 * Need to re-check now, because we may get here
+		 *
+		 *  1) in the window during which the setup is actually
+		 *     already done, but not marked yet because not all
+		 *     packets are spliced over to the driver pending
+		 *     queue yet -- if this happened we acquire the lock
+		 *     either before or after the splice happens, but
+		 *     need to recheck which of these cases happened.
+		 *
+		 *  2) during session teardown, if the OPERATIONAL bit
+		 *     was cleared due to the teardown but the pointer
+		 *     hasn't been assigned NULL yet (or we loaded it
+		 *     before it was assigned) -- in this case it may
+		 *     now be NULL which means we should just let the
+		 *     packet pass through because splicing the frames
+		 *     back is already done.
+		 */
+		tid_tx = rcu_dereference_protected_tid_tx(tx->sta, tid);
+
+		if (!tid_tx) {
+			/* do nothing, let packet pass through */
+		} else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+			info->flags |= IEEE80211_TX_CTL_AMPDU;
+		} else {
+			/*
+			 * park the frame on the session's pending queue,
+			 * recording its vif and flagging it as still
+			 * needing TX processing
+			 */
+			queued = true;
+			info->control.vif = &tx->sdata->vif;
+			info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+			__skb_queue_tail(&tid_tx->pending, skb);
+		}
+		spin_unlock(&tx->sta->lock);
+	}
+
+	return queued;
+}
+
+/*
+ * initialises @tx
+ *
+ * Zeroes @tx and fills it from @sdata and @skb, strips an injection radiotap
+ * header if one is flagged in the tx info, looks up the destination station,
+ * hands QoS data frames to the aggregation code where applicable and sets up
+ * the unicast, no-ack, fragmentation and PS-filter flags.
+ *
+ * Returns TX_DROP if the frame must be dropped (the skb is not freed here),
+ * TX_QUEUED if it was put on an aggregation pending queue, TX_CONTINUE
+ * otherwise.
+ *
+ * Uses rcu_dereference(), so presumably must run under rcu_read_lock(); the
+ * visible caller ieee80211_tx() takes it.
+ */
+static ieee80211_tx_result
+ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
+		     struct ieee80211_tx_data *tx,
+		     struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_hdr *hdr;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	int hdrlen, tid;
+	u8 *qc;
+
+	memset(tx, 0, sizeof(*tx));
+	tx->skb = skb;
+	tx->local = local;
+	tx->sdata = sdata;
+	tx->channel = local->hw.conf.channel;
+	/*
+	 * Set this flag (used below to indicate "automatic fragmentation"),
+	 * it will be cleared/left by radiotap as desired.
+	 * Only valid when fragmentation is done by the stack.
+	 */
+	if (!local->ops->set_frag_threshold)
+		tx->flags |= IEEE80211_TX_FRAGMENTED;
+
+	/* process and remove the injection radiotap header */
+	if (unlikely(info->flags & IEEE80211_TX_INTFL_HAS_RADIOTAP)) {
+		if (!__ieee80211_parse_tx_radiotap(tx, skb))
+			return TX_DROP;
+
+		/*
+		 * __ieee80211_parse_tx_radiotap has now removed
+		 * the radiotap header that was present and pre-filled
+		 * 'tx' with tx control information.
+		 */
+		info->flags &= ~IEEE80211_TX_INTFL_HAS_RADIOTAP;
+	}
+
+	/*
+	 * If this flag is set to true anywhere, and we get here,
+	 * we are doing the needed processing, so remove the flag
+	 * now.
+	 */
+	info->flags &= ~IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+
+	/* only now does skb->data point at the 802.11 header */
+	hdr = (struct ieee80211_hdr *) skb->data;
+
+	/*
+	 * Station lookup: the VLAN's own station first, the BSS-wide lookup
+	 * for injected and control-port frames, and finally the plain lookup
+	 * by addr1 on this interface if nothing was found yet.
+	 */
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+		tx->sta = rcu_dereference(sdata->u.vlan.sta);
+		if (!tx->sta && sdata->dev->ieee80211_ptr->use_4addr)
+			return TX_DROP;
+	} else if (info->flags & IEEE80211_TX_CTL_INJECTED ||
+		   tx->sdata->control_port_protocol == tx->skb->protocol) {
+		tx->sta = sta_info_get_bss(sdata, hdr->addr1);
+	}
+	if (!tx->sta)
+		tx->sta = sta_info_get(sdata, hdr->addr1);
+
+	if (tx->sta && ieee80211_is_data_qos(hdr->frame_control) &&
+	    (local->hw.flags & IEEE80211_HW_AMPDU_AGGREGATION)) {
+		struct tid_ampdu_tx *tid_tx;
+
+		qc = ieee80211_get_qos_ctl(hdr);
+		tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+
+		tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]);
+		if (tid_tx) {
+			bool queued;
+
+			queued = ieee80211_tx_prep_agg(tx, skb, info,
+						       tid_tx, tid);
+
+			if (unlikely(queued))
+				return TX_QUEUED;
+		}
+	}
+
+	/* multicast frames are never ACKed; unicast ones unless testing */
+	if (is_multicast_ether_addr(hdr->addr1)) {
+		tx->flags &= ~IEEE80211_TX_UNICAST;
+		info->flags |= IEEE80211_TX_CTL_NO_ACK;
+	} else {
+		tx->flags |= IEEE80211_TX_UNICAST;
+		if (unlikely(local->wifi_wme_noack_test))
+			info->flags |= IEEE80211_TX_CTL_NO_ACK;
+		else
+			info->flags &= ~IEEE80211_TX_CTL_NO_ACK;
+	}
+
+	/*
+	 * keep the fragmentation flag only for unicast, non-A-MPDU frames
+	 * that exceed the fragmentation threshold
+	 */
+	if (tx->flags & IEEE80211_TX_FRAGMENTED) {
+		if ((tx->flags & IEEE80211_TX_UNICAST) &&
+		    skb->len + FCS_LEN > local->hw.wiphy->frag_threshold &&
+		    !(info->flags & IEEE80211_TX_CTL_AMPDU))
+			tx->flags |= IEEE80211_TX_FRAGMENTED;
+		else
+			tx->flags &= ~IEEE80211_TX_FRAGMENTED;
+	}
+
+	if (!tx->sta)
+		info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
+	else if (test_and_clear_sta_flags(tx->sta, WLAN_STA_CLEAR_PS_FILT))
+		info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT;
+
+	/*
+	 * if the frame is long enough, take the two bytes that follow an
+	 * rfc1042_header-sized prefix after the 802.11 header as ethertype
+	 */
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	if (skb->len > hdrlen + sizeof(rfc1042_header) + 2) {
+		u8 *pos = &skb->data[hdrlen + sizeof(rfc1042_header)];
+		tx->ethertype = (pos[0] << 8) | pos[1];
+	}
+	info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT;
+
+	return TX_CONTINUE;
+}
+
+/*
+ * Hand the skb chain *@skbp to the driver, one skb at a time.
+ *
+ * Before each skb the state of its queue is checked under
+ * local->queue_stop_reason_lock; if the queue is stopped (or, for
+ * !@txpending, already has frames pending) the rest of the chain is moved
+ * to local->pending[] instead. *@skbp is advanced past every skb that was
+ * passed to drv_tx().
+ *
+ * Returns false if the frame couldn't be transmitted but was queued instead.
+ */
+static bool __ieee80211_tx(struct ieee80211_local *local, struct sk_buff **skbp,
+			   struct sta_info *sta, bool txpending)
+{
+	struct sk_buff *skb = *skbp, *next;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_sub_if_data *sdata;
+	unsigned long flags;
+	int len;
+	bool fragm = false;
+
+	while (skb) {
+		int q = skb_get_queue_mapping(skb);
+		__le16 fc;
+
+		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+		if (local->queue_stop_reasons[q] ||
+		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
+			/*
+			 * Since queue is stopped, queue up frames for later
+			 * transmission from the tx-pending tasklet when the
+			 * queue is woken again.
+			 */
+
+			do {
+				next = skb->next;
+				skb->next = NULL;
+				/*
+				 * NB: If txpending is true, next must already
+				 * be NULL since we must've gone through this
+				 * loop before already; therefore we can just
+				 * queue the frame to the head without worrying
+				 * about reordering of fragments.
+				 */
+				if (unlikely(txpending))
+					__skb_queue_head(&local->pending[q],
+							 skb);
+				else
+					__skb_queue_tail(&local->pending[q],
+							 skb);
+			} while ((skb = next));
+
+			spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+					       flags);
+			return false;
+		}
+		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+		info = IEEE80211_SKB_CB(skb);
+
+		/* only the first skb handed to the driver keeps these flags */
+		if (fragm)
+			info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT |
+					 IEEE80211_TX_CTL_FIRST_FRAGMENT);
+
+		/*
+		 * read next and len before drv_tx(); presumably the skb must
+		 * not be touched once the driver has it
+		 */
+		next = skb->next;
+		len = skb->len;
+
+		if (next)
+			info->flags |= IEEE80211_TX_CTL_MORE_FRAMES;
+
+		sdata = vif_to_sdata(info->control.vif);
+
+		/*
+		 * adjust the vif passed to the driver: none for monitor
+		 * interfaces, the vif of the interface owning sdata->bss
+		 * for AP VLANs
+		 */
+		switch (sdata->vif.type) {
+		case NL80211_IFTYPE_MONITOR:
+			info->control.vif = NULL;
+			break;
+		case NL80211_IFTYPE_AP_VLAN:
+			info->control.vif = &container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap)->vif;
+			break;
+		default:
+			/* keep */
+			break;
+		}
+
+		/* the station is only passed on if it is marked uploaded */
+		if (sta && sta->uploaded)
+			info->control.sta = &sta->sta;
+		else
+			info->control.sta = NULL;
+
+		fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
+		drv_tx(local, skb);
+
+		ieee80211_tpt_led_trig_tx(local, fc, len);
+		*skbp = skb = next;
+		ieee80211_led_tx(local, 1);
+		fragm = true;
+	}
+
+	return true;
+}
+
+/*
+ * Invoke TX handlers, return 0 on success and non-zero if the
+ * frame was dropped or queued.
+ *
+ * The handlers run in the fixed order below and the first one that does not
+ * return TX_CONTINUE ends the sequence. On TX_DROP the whole skb chain
+ * starting at tx->skb is freed here; on TX_QUEUED the skb is left alone.
+ */
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb = tx->skb;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	ieee80211_tx_result res = TX_DROP;
+
+/* run one handler and leave the sequence unless it says TX_CONTINUE */
+#define CALL_TXH(txh) \
+	do {				\
+		res = txh(tx);		\
+		if (res != TX_CONTINUE)	\
+			goto txh_done;	\
+	} while (0)
+
+	CALL_TXH(ieee80211_tx_h_dynamic_ps);
+	CALL_TXH(ieee80211_tx_h_check_assoc);
+	CALL_TXH(ieee80211_tx_h_ps_buf);
+	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
+	CALL_TXH(ieee80211_tx_h_select_key);
+	if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
+		CALL_TXH(ieee80211_tx_h_rate_ctrl);
+
+	/*
+	 * frames flagged as retransmission skip the remaining handlers;
+	 * res is TX_CONTINUE at this point, so 0 is returned
+	 */
+	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION))
+		goto txh_done;
+
+	CALL_TXH(ieee80211_tx_h_michael_mic_add);
+	CALL_TXH(ieee80211_tx_h_sequence);
+	CALL_TXH(ieee80211_tx_h_fragment);
+	/* handlers after fragment must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_stats);
+	CALL_TXH(ieee80211_tx_h_encrypt);
+	if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
+		CALL_TXH(ieee80211_tx_h_calculate_duration);
+#undef CALL_TXH
+
+ txh_done:
+	if (unlikely(res == TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		/* free every skb linked through skb->next, not just the first */
+		while (skb) {
+			struct sk_buff *next;
+
+			next = skb->next;
+			dev_kfree_skb(skb);
+			skb = next;
+		}
+		return -1;
+	} else if (unlikely(res == TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare @skb for transmission on @sdata, run the TX handlers and pass the
+ * result to __ieee80211_tx(). Frames shorter than 10 bytes and frames that
+ * ieee80211_tx_prepare() asks to drop are freed here.
+ *
+ * Returns false if the frame couldn't be transmitted but was queued instead.
+ */
+static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
+			 struct sk_buff *skb, bool txpending)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result prep;
+	bool sent = true;
+
+	if (unlikely(skb->len < 10)) {
+		dev_kfree_skb(skb);
+		return true;
+	}
+
+	rcu_read_lock();
+
+	/* initialises tx */
+	prep = ieee80211_tx_prepare(sdata, &tx, skb);
+
+	if (likely(prep != TX_DROP && prep != TX_QUEUED)) {
+		tx.channel = local->hw.conf.channel;
+		IEEE80211_SKB_CB(skb)->band = tx.channel->band;
+
+		if (invoke_tx_handlers(&tx) == 0)
+			sent = __ieee80211_tx(local, &tx.skb, tx.sta,
+					      txpending);
+	} else if (prep == TX_DROP) {
+		/* a queued frame is owned by the pending queue, a dropped one by us */
+		dev_kfree_skb(skb);
+	}
+
+	rcu_read_unlock();
+	return sent;
+}
+
+/* device xmit handlers */
+
+/*
+ * Make sure @skb has @head_need extra bytes of headroom and, if
+ * @may_encrypt, at least IEEE80211_ENCRYPT_TAILROOM bytes of tailroom.
+ *
+ * The skb is reallocated with pskb_expand_head() when it is cloned or when
+ * room is missing; it is orphaned first if it has to grow.
+ *
+ * Returns 0 on success (including when nothing had to be done) or -ENOMEM
+ * if the reallocation failed.
+ */
+static int ieee80211_skb_resize(struct ieee80211_local *local,
+				struct sk_buff *skb,
+				int head_need, bool may_encrypt)
+{
+	int tail_need = 0;
+	bool grow;
+
+	/*
+	 * This could be optimised, devices that do full hardware
+	 * crypto (including TKIP MMIC) need no tailroom... But we
+	 * have no drivers for such devices currently.
+	 */
+	if (may_encrypt)
+		tail_need = max_t(int,
+				  IEEE80211_ENCRYPT_TAILROOM - skb_tailroom(skb),
+				  0);
+
+	grow = head_need || tail_need;
+
+	/* Sorry. Can't account for this any more */
+	if (grow)
+		skb_orphan(skb);
+
+	if (skb_cloned(skb)) {
+		I802_DEBUG_INC(local->tx_expand_skb_head_cloned);
+	} else {
+		if (!grow)
+			return 0;
+		I802_DEBUG_INC(local->tx_expand_skb_head);
+	}
+
+	if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) {
+		wiphy_debug(local->hw.wiphy,
+			    "failed to reallocate TX buffer\n");
+		return -ENOMEM;
+	}
+
+	/* update truesize too */
+	skb->truesize += head_need + tail_need;
+
+	return 0;
+}
+
+/*
+ * For a frame injected on monitor interface @sdata (radiotap header still in
+ * front), return the interface the frame should be sent on.
+ *
+ * We process outgoing injected frames that have a local address we handle
+ * as though they are our own frames: the running interface (other than
+ * monitor, AP_VLAN and WDS) whose address equals addr2 is returned, or
+ * @sdata itself if there is none or the 802.11 header is not completely
+ * contained in @skb.
+ *
+ * This isn't entirely correct, the local MAC address is not necessarily
+ * enough to find the interface to use; for that proper VLAN/WDS support we
+ * will need a different mechanism.
+ *
+ * Walks local->interfaces with the RCU list iterator, so the caller must
+ * hold rcu_read_lock().
+ */
+static struct ieee80211_sub_if_data *
+ieee80211_xmit_injection_sdata(struct ieee80211_local *local,
+			       struct ieee80211_sub_if_data *sdata,
+			       struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *tmp_sdata;
+	struct ieee80211_hdr *hdr;
+	u16 len_rthdr = ieee80211_get_radiotap_len(skb->data);
+
+	/* frame_control must lie inside the frame before it may be read */
+	if (unlikely(skb->len < len_rthdr + sizeof(hdr->frame_control)))
+		return sdata;
+
+	hdr = (struct ieee80211_hdr *)(skb->data + len_rthdr);
+
+	/* check the header is complete in the frame */
+	if (unlikely(skb->len <
+		     len_rthdr + ieee80211_hdrlen(hdr->frame_control)))
+		return sdata;
+
+	list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(tmp_sdata))
+			continue;
+		if (tmp_sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+		    tmp_sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+		    tmp_sdata->vif.type == NL80211_IFTYPE_WDS)
+			continue;
+		if (compare_ether_addr(tmp_sdata->vif.addr, hdr->addr2) == 0)
+			return tmp_sdata;
+	}
+
+	return sdata;
+}
+
+/*
+ * Common transmit path below the netdev xmit handlers.
+ *
+ * Marks frames coming from a monitor interface as injected (with radiotap
+ * header) and redirects them to the matching local interface, makes sure the
+ * skb has enough head- and tailroom, resolves the mesh next hop where
+ * needed, sets the QoS header and passes the frame to ieee80211_tx().
+ *
+ * The skb is consumed in every case: it is freed if it cannot be resized,
+ * left with the mesh code if mesh_nexthop_lookup() reports it queued, and
+ * handed to ieee80211_tx() otherwise.
+ */
+static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
+			   struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr;
+	int headroom;
+	bool may_encrypt;
+
+	rcu_read_lock();
+
+	if (unlikely(sdata->vif.type == NL80211_IFTYPE_MONITOR)) {
+		info->flags |= IEEE80211_TX_CTL_INJECTED |
+			       IEEE80211_TX_INTFL_HAS_RADIOTAP;
+
+		sdata = ieee80211_xmit_injection_sdata(local, sdata, skb);
+	}
+
+	may_encrypt = !(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT);
+
+	/* headroom still missing for the driver and, if needed, encryption */
+	headroom = local->tx_headroom;
+	if (may_encrypt)
+		headroom += IEEE80211_ENCRYPT_HEADROOM;
+	headroom -= skb_headroom(skb);
+	headroom = max_t(int, 0, headroom);
+
+	if (ieee80211_skb_resize(local, skb, headroom, may_encrypt)) {
+		dev_kfree_skb(skb);
+		rcu_read_unlock();
+		return;
+	}
+
+	/* the resize may have reallocated the data, so take the pointer now */
+	hdr = (struct ieee80211_hdr *) skb->data;
+	info->control.vif = &sdata->vif;
+
+	if (ieee80211_vif_is_mesh(&sdata->vif) &&
+	    ieee80211_is_data(hdr->frame_control) &&
+	    !is_multicast_ether_addr(hdr->addr1)) {
+		if (mesh_nexthop_lookup(skb, sdata)) {
+			/* skb queued: don't free */
+			rcu_read_unlock();
+			return;
+		}
+	}
+
+	ieee80211_set_qos_hdr(local, skb);
+	ieee80211_tx(sdata, skb, false);
+	rcu_read_unlock();
+}
+
+/*
+ * Transmit handler for monitor interfaces: validate the radiotap header of
+ * an injected frame and pass the frame, radiotap header included, on to
+ * ieee80211_xmit().
+ *
+ * Always returns NETDEV_TX_OK; frames that fail validation are freed.
+ */
+netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
+					 struct net_device *dev)
+{
+	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	const struct ieee80211_radiotap_header *rthdr =
+		(const void *)skb->data;
+	const u32 no_tx_flags = IEEE80211_CHAN_NO_IBSS |
+				IEEE80211_CHAN_RADAR |
+				IEEE80211_CHAN_PASSIVE_SCAN;
+	u16 rt_len;
+
+	/*
+	 * Frame injection is not allowed if beaconing is not allowed
+	 * or if we need radar detection. Beaconing is usually not allowed
+	 * when the mode or operation (Adhoc, AP, Mesh) does not support DFS.
+	 * Passive scan is also used in world regulatory domains where
+	 * your country is not known and as such it should be treated as
+	 * NO TX unless the channel is explicitly allowed, in which case
+	 * your current regulatory domain would not have the passive scan
+	 * flag.
+	 *
+	 * Since AP mode uses monitor interfaces to inject/TX management
+	 * frames we can make AP mode the exception to this rule once it
+	 * supports radar detection, as its implementation can deal with
+	 * radar detection by itself. We can do that later by adding a
+	 * monitor flag to interfaces used for AP support.
+	 */
+	if (local->hw.conf.channel->flags & no_tx_flags)
+		goto drop;
+
+	/*
+	 * The fixed part of the radiotap header must be present, and only
+	 * version 0 is a header we can trust to find the length from.
+	 */
+	if (unlikely(skb->len < sizeof(*rthdr) || rthdr->it_version))
+		goto drop;
+
+	/* the skb must contain all of the claimed radiotap header */
+	rt_len = ieee80211_get_radiotap_len(skb->data);
+	if (unlikely(skb->len < rt_len))
+		goto drop;
+
+	/*
+	 * Fix up the pointers, accounting for the radiotap header still
+	 * being in there. We are being given a precooked IEEE80211 header,
+	 * so no normal processing is needed. Network and transport header
+	 * are simply fixed to the end of the radiotap area since we have no
+	 * better information and at this point nobody cares.
+	 */
+	skb_set_mac_header(skb, rt_len);
+	skb_set_network_header(skb, rt_len);
+	skb_set_transport_header(skb, rt_len);
+
+	memset(info, 0, sizeof(*info));
+	info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+	/* pass the radiotap header up to xmit */
+	ieee80211_xmit(IEEE80211_DEV_TO_SUB_IF(dev), skb);
+	return NETDEV_TX_OK;
+
+drop:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK; /* meaning, we dealt with the skb */
+}
+
+/**
+ * ieee80211_subif_start_xmit - netif start_xmit function for Ethernet-type
+ * subinterfaces (wlan#, WDS, and VLAN interfaces)
+ * @skb: packet to be sent
+ * @dev: incoming interface
+ *
+ * Returns: always NETDEV_TX_OK; the skb is consumed in every case (either
+ * passed on to ieee80211_xmit() or freed here when it has to be dropped).
+ *
+ * This function takes in an Ethernet header and encapsulates it with suitable
+ * IEEE 802.11 header based on which interface the packet is coming in. The
+ * encapsulated packet will then be passed to ieee80211_xmit() for
+ * transmission (through low-level driver).
+ */
+netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
+				    struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_info *info;
+	int ret = NETDEV_TX_BUSY, head_need;
+	u16 ethertype, hdrlen,  meshhdrlen = 0;
+	__le16 fc;
+	struct ieee80211_hdr hdr;
+	struct ieee80211s_hdr mesh_hdr __maybe_unused;
+	struct mesh_path __maybe_unused *mppath = NULL;
+	const u8 *encaps_data;
+	int encaps_len, skip_header_bytes;
+	int nh_pos, h_pos;
+	struct sta_info *sta = NULL;
+	u32 sta_flags = 0;
+	struct sk_buff *tmp_skb;
+
+	if (unlikely(skb->len < ETH_HLEN)) {
+		ret = NETDEV_TX_OK;
+		goto fail;
+	}
+
+	/* convert Ethernet header to proper 802.11 header (based on
+	 * operation mode) */
+	ethertype = (skb->data[12] << 8) | skb->data[13];
+	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		rcu_read_lock();
+		sta = rcu_dereference(sdata->u.vlan.sta);
+		if (sta) {
+			fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+			/* RA TA DA SA */
+			memcpy(hdr.addr1, sta->sta.addr, ETH_ALEN);
+			memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+			memcpy(hdr.addr3, skb->data, ETH_ALEN);
+			memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+			hdrlen = 30;
+			sta_flags = get_sta_flags(sta);
+		}
+		rcu_read_unlock();
+		if (sta)
+			break;
+		/* fall through */
+	case NL80211_IFTYPE_AP:
+		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+		/* DA BSSID SA */
+		memcpy(hdr.addr1, skb->data, ETH_ALEN);
+		memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+		memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
+		hdrlen = 24;
+		break;
+	case NL80211_IFTYPE_WDS:
+		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+		/* RA TA DA SA */
+		memcpy(hdr.addr1, sdata->u.wds.remote_addr, ETH_ALEN);
+		memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+		memcpy(hdr.addr3, skb->data, ETH_ALEN);
+		memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+		hdrlen = 30;
+		break;
+#ifdef CONFIG_MAC80211_MESH
+	case NL80211_IFTYPE_MESH_POINT:
+		if (!sdata->u.mesh.mshcfg.dot11MeshTTL) {
+			/* Do not send frames with mesh_ttl == 0 */
+			sdata->u.mesh.mshstats.dropped_frames_ttl++;
+			ret = NETDEV_TX_OK;
+			goto fail;
+		}
+		rcu_read_lock();
+		if (!is_multicast_ether_addr(skb->data))
+			mppath = mpp_path_lookup(skb->data, sdata);
+
+		/*
+		 * Use address extension if it is a packet from
+		 * another interface or if we know the destination
+		 * is being proxied by a portal (i.e. portal address
+		 * differs from proxied address)
+		 */
+		if (compare_ether_addr(sdata->vif.addr,
+				       skb->data + ETH_ALEN) == 0 &&
+		    !(mppath && compare_ether_addr(mppath->mpp, skb->data))) {
+			hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc,
+					skb->data, skb->data + ETH_ALEN);
+			rcu_read_unlock();
+			meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr,
+					sdata, NULL, NULL);
+		} else {
+			int is_mesh_mcast = 1;
+			const u8 *mesh_da;
+
+			if (is_multicast_ether_addr(skb->data))
+				/* DA TA mSA AE:SA */
+				mesh_da = skb->data;
+			else {
+				static const u8 bcast[ETH_ALEN] =
+					{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+				if (mppath) {
+					/* RA TA mDA mSA AE:DA SA */
+					mesh_da = mppath->mpp;
+					is_mesh_mcast = 0;
+				} else {
+					/* DA TA mSA AE:SA */
+					mesh_da = bcast;
+				}
+			}
+			hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc,
+					mesh_da, sdata->vif.addr);
+			rcu_read_unlock();
+			if (is_mesh_mcast)
+				meshhdrlen =
+					ieee80211_new_mesh_header(&mesh_hdr,
+							sdata,
+							skb->data + ETH_ALEN,
+							NULL);
+			else
+				meshhdrlen =
+					ieee80211_new_mesh_header(&mesh_hdr,
+							sdata,
+							skb->data,
+							skb->data + ETH_ALEN);
+
+		}
+		break;
+#endif
+	case NL80211_IFTYPE_STATION:
+		memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
+		if (sdata->u.mgd.use_4addr &&
+		    cpu_to_be16(ethertype) != sdata->control_port_protocol) {
+			fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
+			/* RA TA DA SA */
+			memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
+			memcpy(hdr.addr3, skb->data, ETH_ALEN);
+			memcpy(hdr.addr4, skb->data + ETH_ALEN, ETH_ALEN);
+			hdrlen = 30;
+		} else {
+			fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+			/* BSSID SA DA */
+			memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+			memcpy(hdr.addr3, skb->data, ETH_ALEN);
+			hdrlen = 24;
+		}
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		/* DA SA BSSID */
+		memcpy(hdr.addr1, skb->data, ETH_ALEN);
+		memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+		memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN);
+		hdrlen = 24;
+		break;
+	default:
+		ret = NETDEV_TX_OK;
+		goto fail;
+	}
+
+	/*
+	 * There's no need to try to look up the destination
+	 * if it is a multicast address (which can only happen
+	 * in AP mode)
+	 */
+	if (!is_multicast_ether_addr(hdr.addr1)) {
+		rcu_read_lock();
+		sta = sta_info_get(sdata, hdr.addr1);
+		if (sta)
+			sta_flags = get_sta_flags(sta);
+		rcu_read_unlock();
+	}
+
+	/* receiver and we are QoS enabled, use a QoS type frame */
+	if ((sta_flags & WLAN_STA_WME) && local->hw.queues >= 4) {
+		fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+		hdrlen += 2;
+	}
+
+	/*
+	 * Drop unicast frames to unauthorised stations unless they are
+	 * EAPOL frames from the local station.
+	 */
+	if (!ieee80211_vif_is_mesh(&sdata->vif) &&
+		unlikely(!is_multicast_ether_addr(hdr.addr1) &&
+		      !(sta_flags & WLAN_STA_AUTHORIZED) &&
+		      !(cpu_to_be16(ethertype) == sdata->control_port_protocol &&
+		       compare_ether_addr(sdata->vif.addr,
+					  skb->data + ETH_ALEN) == 0))) {
+#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
+		if (net_ratelimit())
+			printk(KERN_DEBUG "%s: dropped frame to %pM"
+			       " (unauthorized port)\n", dev->name,
+			       hdr.addr1);
+#endif
+
+		I802_DEBUG_INC(local->tx_handlers_drop_unauth_port);
+
+		ret = NETDEV_TX_OK;
+		goto fail;
+	}
+
+	/*
+	 * If the skb is shared we need to obtain our own copy.
+	 */
+	if (skb_shared(skb)) {
+		tmp_skb = skb;
+		skb = skb_clone(skb, GFP_ATOMIC);
+		kfree_skb(tmp_skb);
+
+		if (!skb) {
+			ret = NETDEV_TX_OK;
+			goto fail;
+		}
+	}
+
+	hdr.frame_control = fc;
+	hdr.duration_id = 0;
+	hdr.seq_ctrl = 0;
+
+	skip_header_bytes = ETH_HLEN;
+	if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) {
+		encaps_data = bridge_tunnel_header;
+		encaps_len = sizeof(bridge_tunnel_header);
+		skip_header_bytes -= 2;
+	} else if (ethertype >= 0x600) {
+		encaps_data = rfc1042_header;
+		encaps_len = sizeof(rfc1042_header);
+		skip_header_bytes -= 2;
+	} else {
+		encaps_data = NULL;
+		encaps_len = 0;
+	}
+
+	nh_pos = skb_network_header(skb) - skb->data;
+	h_pos = skb_transport_header(skb) - skb->data;
+
+	skb_pull(skb, skip_header_bytes);
+	nh_pos -= skip_header_bytes;
+	h_pos -= skip_header_bytes;
+
+	head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb);
+
+	/*
+	 * So we need to modify the skb header and hence need a copy of
+	 * that. The head_need variable above doesn't, so far, include
+	 * the needed header space that we don't need right away. If we
+	 * can, then we don't reallocate right now but only after the
+	 * frame arrives at the master device (if it does...)
+	 *
+	 * If we cannot, however, then we will reallocate to include all
+	 * the ever needed space. Also, if we need to reallocate it anyway,
+	 * make it big enough for everything we may ever need.
+	 */
+	if (head_need > 0 || skb_cloned(skb)) {
+		head_need += IEEE80211_ENCRYPT_HEADROOM;
+		head_need += local->tx_headroom;
+		head_need = max_t(int, 0, head_need);
+		if (ieee80211_skb_resize(local, skb, head_need, true)) {
+			ret = NETDEV_TX_OK; /* skb already pulled: drop it */
+			goto fail;
+		}
+	}
+
+	if (encaps_data) {
+		memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len);
+		nh_pos += encaps_len;
+		h_pos += encaps_len;
+	}
+
+#ifdef CONFIG_MAC80211_MESH
+	if (meshhdrlen > 0) {
+		memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen);
+		nh_pos += meshhdrlen;
+		h_pos += meshhdrlen;
+	}
+#endif
+
+	if (ieee80211_is_data_qos(fc)) {
+		__le16 *qos_control;
+
+		qos_control = (__le16*) skb_push(skb, 2);
+		memcpy(skb_push(skb, hdrlen - 2), &hdr, hdrlen - 2);
+		/*
+		 * Maybe we could actually set some fields here, for now just
+		 * initialise to zero to indicate no special operation.
+		 */
+		*qos_control = 0;
+	} else
+		memcpy(skb_push(skb, hdrlen), &hdr, hdrlen);
+
+	nh_pos += hdrlen;
+	h_pos += hdrlen;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	/* Update skb pointers to various headers since this modified frame
+	 * is going to go through Linux networking code that may potentially
+	 * need things like pointer to IP header. */
+	skb_set_mac_header(skb, 0);
+	skb_set_network_header(skb, nh_pos);
+	skb_set_transport_header(skb, h_pos);
+
+	info = IEEE80211_SKB_CB(skb);
+	memset(info, 0, sizeof(*info));
+
+	dev->trans_start = jiffies;
+	ieee80211_xmit(sdata, skb);
+
+	return NETDEV_TX_OK;
+
+ fail:
+	if (ret == NETDEV_TX_OK)
+		dev_kfree_skb(skb);
+
+	return ret;
+}
+
+
+/* Purge every per-queue pending list.  May not be called in a context
+ * where it is possible that packets could come in again. */
+void ieee80211_clear_tx_pending(struct ieee80211_local *local)
+{
+	int queue = 0;
+
+	while (queue < local->hw.queues) {
+		skb_queue_purge(&local->pending[queue]);
+		queue++;
+	}
+}
+
+/*
+ * Send one frame taken off a pending queue.
+ *
+ * Returns false if the frame couldn't be transmitted but was queued instead,
+ * which in this case means re-queued -- take as an indication to stop sending
+ * more pending frames.
+ */
+static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
+				     struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb);
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_hdr *hdr80211;
+	struct sta_info *peer;
+
+	sdata = vif_to_sdata(txi->control.vif);
+
+	/* flagged as still needing TX processing: go through ieee80211_tx() */
+	if (txi->flags & IEEE80211_TX_INTFL_NEED_TXPROCESSING)
+		return ieee80211_tx(sdata, skb, true);
+
+	/* otherwise look up the station for addr1 and transmit directly */
+	hdr80211 = (struct ieee80211_hdr *)skb->data;
+	peer = sta_info_get(sdata, hdr80211->addr1);
+
+	return __ieee80211_tx(local, &skb, peer, true);
+}
+
+/*
+ * Transmit all pending packets. Called from tasklet, @data is the local.
+ */
+void ieee80211_tx_pending(unsigned long data)
+{
+	struct ieee80211_local *local = (struct ieee80211_local *)data;
+	struct ieee80211_sub_if_data *sdata;
+	unsigned long flags;
+	int i; /* hardware queue index */
+	bool txok;
+
+	rcu_read_lock();
+
+	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+	for (i = 0; i < local->hw.queues; i++) {
+		/*
+		 * If queue is stopped by something other than due to pending
+		 * frames, or we have no pending frames, proceed to next queue.
+		 */
+		if (local->queue_stop_reasons[i] ||
+		    skb_queue_empty(&local->pending[i]))
+			continue;
+
+		while (!skb_queue_empty(&local->pending[i])) {
+			struct sk_buff *skb = __skb_dequeue(&local->pending[i]);
+			struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+			if (WARN_ON(!info->control.vif)) {
+				kfree_skb(skb); /* no vif recorded: drop */
+				continue;
+			}
+
+			spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+						flags); /* send unlocked */
+
+			txok = ieee80211_tx_pending_skb(local, skb);
+			spin_lock_irqsave(&local->queue_stop_reason_lock,
+					  flags);
+			if (!txok) /* frame was re-queued */
+				break;
+		}
+
+		if (skb_queue_empty(&local->pending[i])) /* drained */
+			list_for_each_entry_rcu(sdata, &local->interfaces, list)
+				netif_wake_subqueue(sdata->dev, i);
+	}
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+	rcu_read_unlock();
+}
+
+/* functions for drivers to get certain frames */
+
+/*
+ * Append the TIM element to the beacon in @skb: step the DTIM count kept in
+ * @bss, then write the element header, bitmap control and partial bitmap.
+ */
+static void ieee80211_beacon_add_tim(struct ieee80211_if_ap *bss,
+				     struct sk_buff *skb,
+				     struct beacon_data *beacon)
+{
+	int first, last, idx, bc_pending, any_set = 0;
+	u8 *elem;
+
+	/* Generate bitmap for TIM only if there are any STAs in power save
+	 * mode; bitmap_empty() is used in the hope that it is faster than
+	 * checking byte-for-byte. */
+	if (atomic_read(&bss->num_sta_ps) > 0)
+		any_set = !bitmap_empty((unsigned long*)bss->tim,
+					IEEE80211_MAX_AID+1);
+
+	/* step the DTIM count down, restarting from the period at zero */
+	bss->dtim_count = bss->dtim_count ? bss->dtim_count - 1 :
+					    beacon->dtim_period - 1;
+
+	/* minimal element: 2 header octets plus the 4 octets announced */
+	elem = (u8 *) skb_put(skb, 6);
+	elem[0] = WLAN_EID_TIM;
+	elem[1] = 4;
+	elem[2] = bss->dtim_count;
+	elem[3] = beacon->dtim_period;
+
+	/* flag buffered bc/mc frames only when the DTIM count reached zero */
+	bc_pending = bss->dtim_count == 0 && !skb_queue_empty(&bss->ps_bc_buf);
+	bss->dtim_bc_mc = bc_pending;
+
+	if (!any_set) {
+		elem[4] = bc_pending;	/* Bitmap control */
+		elem[5] = 0;		/* Part Virt Bitmap */
+		return;
+	}
+
+	/* Find largest even number N1 so that bits numbered 1 through
+	 * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits
+	 * (N2 + 1) x 8 through 2007 are 0. */
+	first = 0;
+	for (idx = 0; idx < IEEE80211_MAX_TIM_LEN; idx++) {
+		if (bss->tim[idx]) {
+			first = idx & 0xfe;
+			break;
+		}
+	}
+	last = first;
+	for (idx = IEEE80211_MAX_TIM_LEN - 1; idx >= first; idx--) {
+		if (bss->tim[idx]) {
+			last = idx;
+			break;
+		}
+	}
+
+	/* Bitmap control, then Part Virt Bitmap octets first..last */
+	elem[4] = first | bc_pending;
+	memcpy(elem + 5, bss->tim + first, last - first + 1);
+	elem[1] = last - first + 4;
+	skb_put(skb, last - first);
+}
+
+struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
+					 struct ieee80211_vif *vif,
+					 u16 *tim_offset, u16 *tim_length)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sk_buff *skb = NULL; /* stays NULL on every early exit */
+	struct ieee80211_tx_info *info;
+	struct ieee80211_sub_if_data *sdata = NULL;
+	struct ieee80211_if_ap *ap = NULL;
+	struct beacon_data *beacon;
+	struct ieee80211_supported_band *sband;
+	enum ieee80211_band band = local->hw.conf.channel->band;
+	struct ieee80211_tx_rate_control txrc;
+
+	sband = local->hw.wiphy->bands[band];
+
+	rcu_read_lock();
+
+	sdata = vif_to_sdata(vif);
+
+	if (!ieee80211_sdata_running(sdata))
+		goto out; /* *tim_offset / *tim_length are not written here */
+
+	if (tim_offset)
+		*tim_offset = 0;
+	if (tim_length)
+		*tim_length = 0;
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
+		ap = &sdata->u.ap;
+		beacon = rcu_dereference(ap->beacon);
+		if (beacon) {
+			/*
+			 * room for headroom, beacon head and tail, plus 256
+			 * bytes budgeted for the TIM element
+			 */
+			skb = dev_alloc_skb(local->tx_headroom +
+					    beacon->head_len +
+					    beacon->tail_len + 256);
+			if (!skb)
+				goto out;
+
+			skb_reserve(skb, local->tx_headroom);
+			memcpy(skb_put(skb, beacon->head_len), beacon->head,
+			       beacon->head_len);
+
+			/*
+			 * Not very nice, but we want to allow the driver to call
+			 * ieee80211_beacon_get() as a response to the set_tim()
+			 * callback. That, however, is already invoked under the
+			 * sta_lock to guarantee consistent and race-free update
+			 * of the tim bitmap in mac80211 and the driver.
+			 */
+			if (local->tim_in_locked_section) {
+				ieee80211_beacon_add_tim(ap, skb, beacon);
+			} else {
+				unsigned long flags;
+
+				spin_lock_irqsave(&local->sta_lock, flags);
+				ieee80211_beacon_add_tim(ap, skb, beacon);
+				spin_unlock_irqrestore(&local->sta_lock, flags);
+			}
+
+			if (tim_offset) /* the TIM starts right after the head */
+				*tim_offset = beacon->head_len;
+			if (tim_length) /* bytes added by the TIM element */
+				*tim_length = skb->len - beacon->head_len;
+
+			if (beacon->tail)
+				memcpy(skb_put(skb, beacon->tail_len),
+				       beacon->tail, beacon->tail_len);
+		} else
+			goto out;
+	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+		struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+		struct ieee80211_hdr *hdr;
+		struct sk_buff *presp = rcu_dereference(ifibss->presp);
+
+		if (!presp)
+			goto out;
+
+		skb = skb_copy(presp, GFP_ATOMIC); /* beacon is a copy of presp */
+		if (!skb)
+			goto out;
+
+		hdr = (struct ieee80211_hdr *) skb->data;
+		hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						 IEEE80211_STYPE_BEACON);
+	} else if (ieee80211_vif_is_mesh(&sdata->vif)) {
+		struct ieee80211_mgmt *mgmt;
+		u8 *pos;
+
+#ifdef CONFIG_MAC80211_MESH
+		if (!sdata->u.mesh.mesh_id_len)
+			goto out;
+#endif
+
+		/* headroom, a fixed 400 bytes and the mesh IE length */
+		skb = dev_alloc_skb(local->tx_headroom + 400 +
+				sdata->u.mesh.ie_len);
+		if (!skb)
+			goto out;
+
+		skb_reserve(skb, local->hw.extra_tx_headroom);
+		mgmt = (struct ieee80211_mgmt *)
+			skb_put(skb, 24 + sizeof(mgmt->u.beacon));
+		memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
+		mgmt->frame_control =
+		    cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON);
+		memset(mgmt->da, 0xff, ETH_ALEN);
+		memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+		memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+		mgmt->u.beacon.beacon_int =
+			cpu_to_le16(sdata->vif.bss_conf.beacon_int);
+		mgmt->u.beacon.capab_info = 0x0; /* 0x0 for MPs */
+
+		pos = skb_put(skb, 2);
+		*pos++ = WLAN_EID_SSID;
+		*pos++ = 0x0; /* zero-length SSID element */
+
+		mesh_mgmt_ies_add(skb, sdata);
+	} else {
+		WARN_ON(1); /* no beacon is built for other interface types */
+		goto out;
+	}
+
+	info = IEEE80211_SKB_CB(skb);
+
+	info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	info->flags |= IEEE80211_TX_CTL_NO_ACK;
+	info->band = band;
+
+	memset(&txrc, 0, sizeof(txrc));
+	txrc.hw = hw;
+	txrc.sband = sband;
+	txrc.bss_conf = &sdata->vif.bss_conf;
+	txrc.skb = skb;
+	txrc.reported_rate.idx = -1;
+	txrc.rate_idx_mask = sdata->rc_rateidx_mask[band];
+	if (txrc.rate_idx_mask == (1 << sband->n_bitrates) - 1)
+		txrc.max_rate_idx = -1; /* mask has every bitrate bit set */
+	else
+		txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1;
+	txrc.bss = true;
+	rate_control_get_rate(sdata, NULL, &txrc);
+
+	info->control.vif = vif;
+
+	info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT |
+			IEEE80211_TX_CTL_ASSIGN_SEQ |
+			IEEE80211_TX_CTL_FIRST_FRAGMENT;
+ out:
+	rcu_read_unlock();
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_tim);
+
+/*
+ * Build a PS-Poll frame for station interface @vif.  Returns the new skb,
+ * or NULL if @vif is not of station type or the allocation fails.
+ */
+struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
+				     struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_local *local;
+	struct ieee80211_pspoll *frame;
+	struct sk_buff *skb;
+
+	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+		return NULL;
+
+	sdata = vif_to_sdata(vif);
+	local = sdata->local;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*frame));
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for "
+		       "pspoll template\n", sdata->name);
+		return NULL;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	frame = (struct ieee80211_pspoll *) skb_put(skb, sizeof(*frame));
+	memset(frame, 0, sizeof(*frame));
+	frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+					   IEEE80211_STYPE_PSPOLL);
+
+	/* aid in PS-Poll has its two MSBs each set to 1 */
+	frame->aid = cpu_to_le16(sdata->u.mgd.aid | 1 << 15 | 1 << 14);
+	memcpy(frame->bssid, sdata->u.mgd.bssid, ETH_ALEN);
+	memcpy(frame->ta, vif->addr, ETH_ALEN);
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_pspoll_get);
+
+/* Build a nullfunc data frame (ToDS set) to the BSSID of station @vif.
+ * Returns the new skb, or NULL if @vif is not a station interface or the
+ * allocation fails. */
+struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_local *local;
+	struct ieee80211_hdr_3addr *frame;
+	struct sk_buff *skb;
+
+	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+		return NULL;
+
+	sdata = vif_to_sdata(vif);
+	local = sdata->local;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*frame));
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
+		       "template\n", sdata->name);
+		return NULL;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	frame = (struct ieee80211_hdr_3addr *) skb_put(skb, sizeof(*frame));
+	memset(frame, 0, sizeof(*frame));
+	frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
+					   IEEE80211_STYPE_NULLFUNC |
+					   IEEE80211_FCTL_TODS);
+	memcpy(frame->addr1, sdata->u.mgd.bssid, ETH_ALEN);
+	memcpy(frame->addr2, vif->addr, ETH_ALEN);
+	memcpy(frame->addr3, sdata->u.mgd.bssid, ETH_ALEN);
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_nullfunc_get);
+
+/*
+ * Build a probe request for @vif: a 3-address management header with
+ * broadcast addr1/addr3, an SSID element and finally the @ie_len bytes at
+ * @ie.  @ssid may be NULL, in which case room for @ssid_len bytes is still
+ * reserved but not filled in.  Returns the new skb, or NULL if the
+ * allocation fails.
+ */
+struct sk_buff *ieee80211_probereq_get(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       const u8 *ssid, size_t ssid_len,
+				       const u8 *ie, size_t ie_len)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_hdr_3addr *hdr;
+	struct sk_buff *skb;
+	u8 *elem;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*hdr) +
+			    2 + ssid_len + ie_len);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
+		       "request template\n", sdata->name);
+		return NULL;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	hdr = (struct ieee80211_hdr_3addr *) skb_put(skb, sizeof(*hdr));
+	memset(hdr, 0, sizeof(*hdr));
+	hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					 IEEE80211_STYPE_PROBE_REQ);
+	memset(hdr->addr1, 0xff, ETH_ALEN);
+	memcpy(hdr->addr2, vif->addr, ETH_ALEN);
+	memset(hdr->addr3, 0xff, ETH_ALEN);
+
+	/* SSID element: id, length, then the SSID itself if one was given */
+	elem = skb_put(skb, 2 + ssid_len);
+	elem[0] = WLAN_EID_SSID;
+	elem[1] = ssid_len;
+	if (ssid)
+		memcpy(elem + 2, ssid, ssid_len);
+
+	/* caller-supplied IEs go last */
+	if (ie)
+		memcpy(skb_put(skb, ie_len), ie, ie_len);
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_probereq_get);
+
+/* Fill @rts with an RTS frame: RA and TA are addr1 and addr2 of @frame. */
+void ieee80211_rts_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		       const void *frame, size_t frame_len,
+		       const struct ieee80211_tx_info *frame_txctl,
+		       struct ieee80211_rts *rts)
+{
+	const struct ieee80211_hdr *protected_hdr = frame;
+	__le16 fc = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS);
+
+	rts->frame_control = fc;
+	rts->duration = ieee80211_rts_duration(hw, vif, frame_len, frame_txctl);
+	memcpy(rts->ra, protected_hdr->addr1, sizeof(rts->ra));
+	memcpy(rts->ta, protected_hdr->addr2, sizeof(rts->ta));
+}
+EXPORT_SYMBOL(ieee80211_rts_get);
+
+void ieee80211_ctstoself_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			     const void *frame, size_t frame_len,
+			     const struct ieee80211_tx_info *frame_txctl,
+			     struct ieee80211_cts *cts)
+{
+	const struct ieee80211_hdr *protected_hdr = frame;
+	__le16 fc = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS);
+
+	cts->frame_control = fc; /* CTS addressed to addr1 of @frame */
+	cts->duration = ieee80211_ctstoself_duration(hw, vif,
+						     frame_len, frame_txctl);
+	memcpy(cts->ra, protected_hdr->addr1, sizeof(cts->ra));
+}
+EXPORT_SYMBOL(ieee80211_ctstoself_get);
+
+struct sk_buff *
+ieee80211_get_buffered_bc(struct ieee80211_hw *hw,
+			  struct ieee80211_vif *vif)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct sk_buff *skb = NULL; /* NULL is returned on every "out" path */
+	struct ieee80211_tx_data tx;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_if_ap *bss = NULL;
+	struct beacon_data *beacon;
+	struct ieee80211_tx_info *info;
+
+	sdata = vif_to_sdata(vif);
+	bss = &sdata->u.ap;
+
+	rcu_read_lock();
+	beacon = rcu_dereference(bss->beacon);
+
+	if (sdata->vif.type != NL80211_IFTYPE_AP || !beacon || !beacon->head)
+		goto out; /* only AP interfaces with a beacon head set */
+
+	if (bss->dtim_count != 0 || !bss->dtim_bc_mc)
+		goto out; /* send buffered bc/mc only after DTIM beacon */
+
+	while (1) {
+		skb = skb_dequeue(&bss->ps_bc_buf);
+		if (!skb)
+			goto out; /* nothing (left) buffered */
+		local->total_ps_buffered--;
+
+		if (!skb_queue_empty(&bss->ps_bc_buf) && skb->len >= 2) {
+			struct ieee80211_hdr *hdr =
+				(struct ieee80211_hdr *) skb->data;
+			/* more buffered multicast/broadcast frames ==> set
+			 * MoreData flag in IEEE 802.11 header to inform PS
+			 * STAs */
+			hdr->frame_control |=
+				cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+		}
+
+		if (!ieee80211_tx_prepare(sdata, &tx, skb))
+			break; /* zero return: use this frame */
+		dev_kfree_skb_any(skb); /* otherwise drop it, try the next */
+	}
+
+	info = IEEE80211_SKB_CB(skb);
+
+	tx.flags |= IEEE80211_TX_PS_BUFFERED;
+	tx.channel = local->hw.conf.channel;
+	info->band = tx.channel->band;
+
+	if (invoke_tx_handlers(&tx))
+		skb = NULL; /* non-zero from the handlers: return nothing */
+ out:
+	rcu_read_unlock();
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_get_buffered_bc);
+
+void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
+{
+	/* Send all internal mgmt frames on VO. Accordingly set TID to 7. */
+	skb->priority = 7;
+	skb_set_queue_mapping(skb, IEEE80211_AC_VO);
+
+	/* every header offset is set to the very start of the frame */
+	skb_set_transport_header(skb, 0);
+	skb_set_network_header(skb, 0);
+	skb_set_mac_header(skb, 0);
+
+	/*
+	 * The other path calling ieee80211_xmit is from the tasklet; we can
+	 * handle concurrent transmissions but must not come into tx with bhs on.
+	 */
+	local_bh_disable();
+	ieee80211_xmit(sdata, skb);
+	local_bh_enable();
+}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
new file mode 100644
index 00000000..11d9d49f
--- /dev/null
+++ b/net/mac80211/util.c
@@ -0,0 +1,1443 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * utilities for mac80211
+ */
+
+#include <net/mac80211.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/bitmap.h>
+#include <linux/crc32.h>
+#include <net/net_namespace.h>
+#include <net/cfg80211.h>
+#include <net/rtnetlink.h>
+
+#include "ieee80211_i.h"
+#include "driver-ops.h"
+#include "rate.h"
+#include "mesh.h"
+#include "wme.h"
+#include "led.h"
+#include "wep.h"
+
+/* privid (its own address) for wiphys to tell whether they belong to us */
+void *mac80211_wiphy_privid = &mac80211_wiphy_privid;
+
+struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy)
+{
+	struct ieee80211_local *priv;
+
+	BUG_ON(!wiphy);		/* a NULL wiphy is treated as a bug */
+	priv = wiphy_priv(wiphy);	/* private area is the ieee80211_local */
+	return &priv->hw;
+}
+EXPORT_SYMBOL(wiphy_to_ieee80211_hw);
+
+/*
+ * Pick the BSSID out of an 802.11 header of @len bytes, as seen by an
+ * interface of @type.  Returns a pointer into @hdr, or NULL if the frame
+ * is too short or is of a kind no BSSID is extracted from.
+ */
+u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
+			enum nl80211_iftype type)
+{
+	const __le16 fctl = hdr->frame_control;
+	const bool short_hdr = len < 24;
+
+	/* drop ACK/CTS frames and incorrect hdr len (ctrl) */
+	if (len < 16)
+		return NULL;
+
+	if (ieee80211_is_mgmt(fctl)) {
+		/* drop incorrect hdr len (mgmt) */
+		return short_hdr ? NULL : hdr->addr3;
+	}
+
+	if (ieee80211_is_data(fctl)) {
+		/* incorrect hdr len (data), or a 4-address frame */
+		if (short_hdr || ieee80211_has_a4(fctl))
+			return NULL;
+		if (ieee80211_has_tods(fctl))
+			return hdr->addr1;
+		return ieee80211_has_fromds(fctl) ? hdr->addr2 : hdr->addr3;
+	}
+
+	if (!ieee80211_is_ctl(fctl))
+		return NULL;
+
+	if (ieee80211_is_pspoll(fctl))
+		return hdr->addr1;
+	if (!ieee80211_is_back_req(fctl))
+		return NULL;
+
+	/* BACK request: the address to use depends on the interface type */
+	switch (type) {
+	case NL80211_IFTYPE_STATION:
+		return hdr->addr2;
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+		return hdr->addr1;
+	default:
+		return NULL;
+	}
+}
+
+void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *frag = tx->skb;
+
+	do {	/* tx->skb first, then every skb linked through ->next */
+		((struct ieee80211_hdr *) frag->data)->frame_control |=
+			cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+		frag = frag->next;
+	} while (frag);
+}
+
+int ieee80211_frame_duration(struct ieee80211_local *local, size_t len,
+			     int rate, int erp, int short_preamble)
+{
+	int usecs;
+
+	/* calculate duration (in microseconds, rounded up to next higher
+	 * integer if it includes a fractional microsecond) to send frame of
+	 * len bytes (does not include FCS) at the given rate. Duration will
+	 * also include SIFS.
+	 *
+	 * rate is in 100 kbps, so dividend is multiplied by 10 in the
+	 * DIV_ROUND_UP() operations.
+	 */
+
+	if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ && !erp) {
+		/*
+		 * 802.11b or 802.11g with 802.11b compatibility:
+		 * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime +
+		 * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0.
+		 *
+		 * 802.11 (DS): 15.3.3, 802.11b: 18.3.4
+		 * aSIFSTime = 10 usec
+		 * aPreambleLength = 144 usec or 72 usec with short preamble
+		 * aPLCPHeaderLength = 48 usec or 24 usec with short preamble
+		 */
+		usecs = 10; /* aSIFSTime = 10 usec */
+		usecs += short_preamble ? (72 + 24) : (144 + 48);
+		usecs += DIV_ROUND_UP(8 * (len + 4) * 10, rate);
+		return usecs;
+	}
+
+	/*
+	 * OFDM:
+	 *
+	 * N_DBPS = DATARATE x 4
+	 * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS)
+	 *	(16 = SIGNAL time, 6 = tail bits)
+	 * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext
+	 *
+	 * T_SYM = 4 usec
+	 * 802.11a - 17.5.2: aSIFSTime = 16 usec
+	 * 802.11g - 19.8.4: aSIFSTime = 10 usec +
+	 *	signal ext = 6 usec
+	 */
+	usecs = 16; /* SIFS + signal ext */
+	usecs += 16; /* 17.3.2.3: T_PREAMBLE = 16 usec */
+	usecs += 4; /* 17.3.2.3: T_SIGNAL = 4 usec */
+	usecs += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10,
+				  4 * rate); /* T_SYM x N_SYM */
+
+	return usecs;
+}
+
+/*
+ * Exported duration function for driver use: time to send @frame_len
+ * bytes at @rate, as computed by ieee80211_frame_duration().
+ */
+__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
+					struct ieee80211_vif *vif,
+					size_t frame_len,
+					struct ieee80211_rate *rate)
+{
+	struct ieee80211_sub_if_data *sdata;
+	bool short_pre = false;
+	int erp_rate = 0;
+	u16 usecs;
+
+	if (vif) {
+		sdata = vif_to_sdata(vif);
+		short_pre = sdata->vif.bss_conf.use_short_preamble;
+		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+			erp_rate = rate->flags & IEEE80211_RATE_ERP_G;
+	}
+
+	usecs = ieee80211_frame_duration(hw_to_local(hw), frame_len,
+					 rate->bitrate, erp_rate, short_pre);
+	return cpu_to_le16(usecs);
+}
+EXPORT_SYMBOL(ieee80211_generic_frame_duration);
+
+/*
+ * Duration for an RTS protecting a @frame_len byte frame: CTS + frame + ACK,
+ * each timed at the RTS/CTS rate selected in @frame_txctl.
+ */
+__le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
+			      struct ieee80211_vif *vif, size_t frame_len,
+			      const struct ieee80211_tx_info *frame_txctl)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_rate *rate;
+	bool short_pre = false;
+	int erp_rate = 0;
+	u16 usecs;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
+
+	if (vif) {
+		sdata = vif_to_sdata(vif);
+		short_pre = sdata->vif.bss_conf.use_short_preamble;
+		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+			erp_rate = rate->flags & IEEE80211_RATE_ERP_G;
+	}
+
+	/* CTS duration */
+	usecs = ieee80211_frame_duration(local, 10, rate->bitrate,
+					 erp_rate, short_pre);
+	/* Data frame duration */
+	usecs += ieee80211_frame_duration(local, frame_len, rate->bitrate,
+					  erp_rate, short_pre);
+	/* ACK duration */
+	usecs += ieee80211_frame_duration(local, 10, rate->bitrate,
+					  erp_rate, short_pre);
+
+	return cpu_to_le16(usecs);
+}
+EXPORT_SYMBOL(ieee80211_rts_duration);
+
+/* Duration for a CTS-to-self protecting a @frame_len byte frame: the frame
+ * itself plus, unless IEEE80211_TX_CTL_NO_ACK is set in @frame_txctl, its
+ * ACK; both are timed at the RTS/CTS rate selected in @frame_txctl. */
+__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
+				    struct ieee80211_vif *vif,
+				    size_t frame_len,
+				    const struct ieee80211_tx_info *frame_txctl)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_rate *rate;
+	bool short_pre = false;
+	int erp_rate = 0;
+	u16 usecs;
+
+	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	rate = &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
+
+	if (vif) {
+		sdata = vif_to_sdata(vif);
+		short_pre = sdata->vif.bss_conf.use_short_preamble;
+		if (sdata->flags & IEEE80211_SDATA_OPERATING_GMODE)
+			erp_rate = rate->flags & IEEE80211_RATE_ERP_G;
+	}
+
+	/* Data frame duration */
+	usecs = ieee80211_frame_duration(local, frame_len, rate->bitrate,
+					 erp_rate, short_pre);
+	if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK)) {
+		/* ACK duration */
+		usecs += ieee80211_frame_duration(local, 10, rate->bitrate,
+						  erp_rate, short_pre);
+	}
+
+	return cpu_to_le16(usecs);
+}
+EXPORT_SYMBOL(ieee80211_ctstoself_duration);
+
+/* Clear @reason for @queue and restart it once no stop reason is left. */
+static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
+				   enum queue_stop_reason reason)
+{
+	struct ieee80211_local *priv = hw_to_local(hw);
+	struct ieee80211_sub_if_data *iface;
+
+	trace_wake_queue(priv, queue, reason);
+
+	if (WARN_ON(queue >= hw->queues))
+		return;
+
+	__clear_bit(reason, &priv->queue_stop_reasons[queue]);
+	if (priv->queue_stop_reasons[queue])
+		return;		/* someone still has this queue stopped */
+
+	/* frames still pending: schedule the tx_pending tasklet instead */
+	if (!skb_queue_empty(&priv->pending[queue])) {
+		tasklet_schedule(&priv->tx_pending_tasklet);
+		return;
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(iface, &priv->interfaces, list)
+		if (!test_bit(SDATA_STATE_OFFCHANNEL, &iface->state))
+			netif_wake_subqueue(iface->dev, queue);
+	rcu_read_unlock();
+}
+
+void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
+				    enum queue_stop_reason reason)
+{
+	spinlock_t *lock = &hw_to_local(hw)->queue_stop_reason_lock;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(lock, irqflags);	/* guards the stop reasons */
+	__ieee80211_wake_queue(hw, queue, reason);
+	spin_unlock_irqrestore(lock, irqflags);
+}
+
+void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue)
+{	/* exported entry point: clears only the DRIVER stop reason */
+	ieee80211_wake_queue_by_reason(hw, queue,
+				       IEEE80211_QUEUE_STOP_REASON_DRIVER);
+}
+EXPORT_SYMBOL(ieee80211_wake_queue);
+
+static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
+				   enum queue_stop_reason reason)
+{
+	struct ieee80211_sub_if_data *iface;
+	struct ieee80211_local *priv = hw_to_local(hw);
+
+	trace_stop_queue(priv, queue, reason);
+	if (WARN_ON(queue >= hw->queues))
+		return;
+
+	/* remember why the queue is stopped ... */
+	__set_bit(reason, &priv->queue_stop_reasons[queue]);
+	/* ... and stop the matching subqueue of every interface's netdev */
+	rcu_read_lock();
+	list_for_each_entry_rcu(iface, &priv->interfaces, list)
+		netif_stop_subqueue(iface->dev, queue);
+	rcu_read_unlock();
+}
+
+void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
+				    enum queue_stop_reason reason)
+{
+	spinlock_t *lock = &hw_to_local(hw)->queue_stop_reason_lock;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(lock, irqflags);	/* guards the stop reasons */
+	__ieee80211_stop_queue(hw, queue, reason);
+	spin_unlock_irqrestore(lock, irqflags);
+}
+
+void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue)
+{	/* exported entry point: sets only the DRIVER stop reason */
+	ieee80211_stop_queue_by_reason(hw, queue,
+				       IEEE80211_QUEUE_STOP_REASON_DRIVER);
+}
+EXPORT_SYMBOL(ieee80211_stop_queue);
+
+void ieee80211_add_pending_skb(struct ieee80211_local *local,
+			       struct sk_buff *skb)
+{
+	struct ieee80211_hw *hw = &local->hw;
+	const int q = skb_get_queue_mapping(skb);
+	unsigned long irqflags;
+
+	if (WARN_ON(!IEEE80211_SKB_CB(skb)->control.vif)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* the queue carries the SKB_ADD reason while the frame is appended */
+	spin_lock_irqsave(&local->queue_stop_reason_lock, irqflags);
+	__ieee80211_stop_queue(hw, q, IEEE80211_QUEUE_STOP_REASON_SKB_ADD);
+	__skb_queue_tail(&local->pending[q], skb);
+	__ieee80211_wake_queue(hw, q, IEEE80211_QUEUE_STOP_REASON_SKB_ADD);
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, irqflags);
+}
+
+int ieee80211_add_pending_skbs_fn(struct ieee80211_local *local,
+				  struct sk_buff_head *skbs,
+				  void (*fn)(void *data), void *data)
+{
+	struct ieee80211_hw *hw = &local->hw;
+	struct sk_buff *skb;
+	unsigned long flags;
+	int q, queued = 0;
+
+	spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+	/* keep every queue stopped while the pending lists are being filled */
+	for (q = 0; q < hw->queues; q++)
+		__ieee80211_stop_queue(hw, q,
+				       IEEE80211_QUEUE_STOP_REASON_SKB_ADD);
+
+	/* move the frames over; ones without an interface are dropped */
+	for (skb = skb_dequeue(skbs); skb; skb = skb_dequeue(skbs)) {
+		if (WARN_ON(!IEEE80211_SKB_CB(skb)->control.vif)) {
+			kfree_skb(skb);
+			continue;
+		}
+		__skb_queue_tail(&local->pending[skb_get_queue_mapping(skb)],
+				 skb);
+		queued++;
+	}
+
+	/* optional callback, run with the lock held and the queues stopped */
+	if (fn)
+		fn(data);
+
+	for (q = 0; q < hw->queues; q++)
+		__ieee80211_wake_queue(hw, q,
+				       IEEE80211_QUEUE_STOP_REASON_SKB_ADD);
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+
+	return queued;
+}
+
+int ieee80211_add_pending_skbs(struct ieee80211_local *local,
+			       struct sk_buff_head *skbs)
+{	/* ieee80211_add_pending_skbs_fn() without a callback */
+	return ieee80211_add_pending_skbs_fn(local, skbs, NULL, NULL);
+}
+
+void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
+				    enum queue_stop_reason reason)
+{
+	spinlock_t *lock = &hw_to_local(hw)->queue_stop_reason_lock;
+	unsigned long irqflags;
+	int q = 0;
+
+	spin_lock_irqsave(lock, irqflags);
+	/* all hardware queues are handled within one critical section */
+	while (q < hw->queues)
+		__ieee80211_stop_queue(hw, q++, reason);
+
+	spin_unlock_irqrestore(lock, irqflags);
+}
+
+void ieee80211_stop_queues(struct ieee80211_hw *hw)
+{	/* exported entry point: DRIVER stop reason on every queue */
+	ieee80211_stop_queues_by_reason(hw,
+					IEEE80211_QUEUE_STOP_REASON_DRIVER);
+}
+EXPORT_SYMBOL(ieee80211_stop_queues);
+
+int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	unsigned long irqflags;
+	int stopped = 1;	/* bad queue index: report it as stopped */
+
+	if (!WARN_ON(queue >= hw->queues)) {
+		spin_lock_irqsave(&local->queue_stop_reason_lock, irqflags);
+		stopped = local->queue_stop_reasons[queue] != 0;
+		spin_unlock_irqrestore(&local->queue_stop_reason_lock,
+				       irqflags);
+	}
+	return stopped;
+}
+EXPORT_SYMBOL(ieee80211_queue_stopped);
+
+void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
+				     enum queue_stop_reason reason)
+{
+	spinlock_t *lock = &hw_to_local(hw)->queue_stop_reason_lock;
+	unsigned long irqflags;
+	int q = 0;
+
+	spin_lock_irqsave(lock, irqflags);
+	/* all hardware queues are handled within one critical section */
+	while (q < hw->queues)
+		__ieee80211_wake_queue(hw, q++, reason);
+
+	spin_unlock_irqrestore(lock, irqflags);
+}
+
+void ieee80211_wake_queues(struct ieee80211_hw *hw)
+{	/* exported entry point: clears the DRIVER reason on every queue */
+	ieee80211_wake_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_DRIVER);
+}
+EXPORT_SYMBOL(ieee80211_wake_queues);
+
+void ieee80211_iterate_active_interfaces(
+	struct ieee80211_hw *hw,
+	void (*iterator)(void *data, u8 *mac,
+			 struct ieee80211_vif *vif),
+	void *data)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_sub_if_data *sdata;
+
+	/* the interface list is walked with iflist_mtx (a mutex) held */
+	mutex_lock(&local->iflist_mtx);
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		/* monitor and AP_VLAN interfaces are never reported */
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+		    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			continue;
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		iterator(data, sdata->vif.addr, &sdata->vif);
+	}
+
+	mutex_unlock(&local->iflist_mtx);
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces);
+
+void ieee80211_iterate_active_interfaces_atomic(
+	struct ieee80211_hw *hw,
+	void (*iterator)(void *data, u8 *mac,
+			 struct ieee80211_vif *vif),
+	void *data)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+	struct ieee80211_sub_if_data *sdata;
+
+	/* RCU read side only: no mutex is taken by this variant */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		/* monitor and AP_VLAN interfaces are never reported */
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+		    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			continue;
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		iterator(data, sdata->vif.addr, &sdata->vif);
+	}
+
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);
+
+/*
+ * Work must not be queued while suspended, except during resume.
+ * Hitting the WARN below means that either the driver's suspend
+ * handling or mac80211 itself queued work that was not cleared
+ * during mac80211's suspend cycle. Returns false when the work
+ * item must not be queued, true otherwise.
+ */
+static bool ieee80211_can_queue_work(struct ieee80211_local *local)
+{
+	bool going_down = local->suspended && !local->resuming;
+
+	WARN(going_down, "queueing ieee80211 work while going to suspend\n");
+
+	return !going_down;
+}
+
+void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	/* while going to suspend the work item is not queued at all;
+	 * ieee80211_can_queue_work() has already warned in that case */
+	if (ieee80211_can_queue_work(local))
+		queue_work(local->workqueue, work);
+}
+EXPORT_SYMBOL(ieee80211_queue_work);
+
+void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
+				  struct delayed_work *dwork,
+				  unsigned long delay)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	/* while going to suspend the work item is not queued at all;
+	 * ieee80211_can_queue_work() has already warned in that case */
+	if (ieee80211_can_queue_work(local))
+		queue_delayed_work(local->workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(ieee80211_queue_delayed_work);
+
+void ieee802_11_parse_elems(u8 *start, size_t len,
+			    struct ieee802_11_elems *elems)
+{	/* filter 0: no CRC is calculated, the returned value is dropped */
+	ieee802_11_parse_elems_crc(start, len, elems, 0, 0);
+}
+
+u32 ieee802_11_parse_elems_crc(u8 *start, size_t len,
+			       struct ieee802_11_elems *elems,
+			       u64 filter, u32 crc)
+{
+	size_t left = len;
+	u8 *pos = start;
+	bool calc_crc = filter != 0;	/* CRC only when a filter is given */
+	/* note each known IE in *elems; fold the filtered ones into crc */
+	memset(elems, 0, sizeof(*elems));
+	elems->ie_start = start;
+	elems->total_len = len;
+
+	while (left >= 2) {	/* need at least the id and length bytes */
+		u8 id, elen;
+
+		id = *pos++;
+		elen = *pos++;
+		left -= 2;
+
+		if (elen > left)	/* element overruns the buffer: stop */
+			break;
+		/* only ids below 64 can be selected by the 64-bit filter */
+		if (calc_crc && id < 64 && (filter & (1ULL << id)))
+			crc = crc32_be(crc, pos - 2, elen + 2);
+
+		switch (id) {
+		case WLAN_EID_SSID:
+			elems->ssid = pos;
+			elems->ssid_len = elen;
+			break;
+		case WLAN_EID_SUPP_RATES:
+			elems->supp_rates = pos;
+			elems->supp_rates_len = elen;
+			break;
+		case WLAN_EID_FH_PARAMS:
+			elems->fh_params = pos;
+			elems->fh_params_len = elen;
+			break;
+		case WLAN_EID_DS_PARAMS:
+			elems->ds_params = pos;
+			elems->ds_params_len = elen;
+			break;
+		case WLAN_EID_CF_PARAMS:
+			elems->cf_params = pos;
+			elems->cf_params_len = elen;
+			break;
+		case WLAN_EID_TIM:
+			if (elen >= sizeof(struct ieee80211_tim_ie)) {
+				elems->tim = (void *)pos;
+				elems->tim_len = elen;
+			}
+			break;
+		case WLAN_EID_IBSS_PARAMS:
+			elems->ibss_params = pos;
+			elems->ibss_params_len = elen;
+			break;
+		case WLAN_EID_CHALLENGE:
+			elems->challenge = pos;
+			elems->challenge_len = elen;
+			break;
+		case WLAN_EID_VENDOR_SPECIFIC:
+			if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 &&
+			    pos[2] == 0xf2) {
+				/* Microsoft OUI (00:50:F2) */
+
+				if (calc_crc)	/* regardless of the filter bits */
+					crc = crc32_be(crc, pos - 2, elen + 2);
+
+				if (pos[3] == 1) {
+					/* OUI Type 1 - WPA IE */
+					elems->wpa = pos;
+					elems->wpa_len = elen;
+				} else if (elen >= 5 && pos[3] == 2) {
+					/* OUI Type 2 - WMM IE */
+					if (pos[4] == 0) {
+						elems->wmm_info = pos;
+						elems->wmm_info_len = elen;
+					} else if (pos[4] == 1) {
+						elems->wmm_param = pos;
+						elems->wmm_param_len = elen;
+					}
+				}
+			}
+			break;
+		case WLAN_EID_RSN:
+			elems->rsn = pos;
+			elems->rsn_len = elen;
+			break;
+		case WLAN_EID_ERP_INFO:
+			elems->erp_info = pos;
+			elems->erp_info_len = elen;
+			break;
+		case WLAN_EID_EXT_SUPP_RATES:
+			elems->ext_supp_rates = pos;
+			elems->ext_supp_rates_len = elen;
+			break;
+		case WLAN_EID_HT_CAPABILITY:
+			if (elen >= sizeof(struct ieee80211_ht_cap))
+				elems->ht_cap_elem = (void *)pos;
+			break;
+		case WLAN_EID_HT_INFORMATION:
+			if (elen >= sizeof(struct ieee80211_ht_info))
+				elems->ht_info_elem = (void *)pos;
+			break;
+		case WLAN_EID_MESH_ID:
+			elems->mesh_id = pos;
+			elems->mesh_id_len = elen;
+			break;
+		case WLAN_EID_MESH_CONFIG:
+			if (elen >= sizeof(struct ieee80211_meshconf_ie))
+				elems->mesh_config = (void *)pos;
+			break;
+		case WLAN_EID_PEER_LINK:
+			elems->peer_link = pos;
+			elems->peer_link_len = elen;
+			break;
+		case WLAN_EID_PREQ:
+			elems->preq = pos;
+			elems->preq_len = elen;
+			break;
+		case WLAN_EID_PREP:
+			elems->prep = pos;
+			elems->prep_len = elen;
+			break;
+		case WLAN_EID_PERR:
+			elems->perr = pos;
+			elems->perr_len = elen;
+			break;
+		case WLAN_EID_RANN:
+			if (elen >= sizeof(struct ieee80211_rann_ie))
+				elems->rann = (void *)pos;
+			break;
+		case WLAN_EID_CHANNEL_SWITCH:
+			elems->ch_switch_elem = pos;
+			elems->ch_switch_elem_len = elen;
+			break;
+		case WLAN_EID_QUIET:
+			if (!elems->quiet_elem) {	/* keep the first one */
+				elems->quiet_elem = pos;
+				elems->quiet_elem_len = elen;
+			}
+			elems->num_of_quiet_elem++;	/* but count them all */
+			break;
+		case WLAN_EID_COUNTRY:
+			elems->country_elem = pos;
+			elems->country_elem_len = elen;
+			break;
+		case WLAN_EID_PWR_CONSTRAINT:
+			elems->pwr_constr_elem = pos;
+			elems->pwr_constr_elem_len = elen;
+			break;
+		case WLAN_EID_TIMEOUT_INTERVAL:
+			elems->timeout_int = pos;
+			elems->timeout_int_len = elen;
+			break;
+		default:
+			break;
+		}
+
+		left -= elen;	/* step over the element's data */
+		pos += elen;
+	}
+
+	return crc;	/* the value passed in if nothing was folded in */
+}
+
+void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_tx_queue_params qparam;
+	const int aCWmax = 1023;
+	int aCWmin, queue;
+	bool use_11b;
+
+	/*
+	 * Nothing can be programmed if the driver does not
+	 * implement the conf_tx operation.
+	 */
+	if (!local->ops->conf_tx)
+		return;
+
+	memset(&qparam, 0, sizeof(qparam));
+
+	/* 11b values apply on 2.4 GHz unless the interface runs in G mode */
+	use_11b = local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+		  !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE);
+
+	/*
+	 * Set defaults according to 802.11-2007 Table 7-37.
+	 * aCWmin depends only on the 11b decision above, so it
+	 * is the same for every queue handled by the loop.
+	 */
+	aCWmin = use_11b ? 31 : 15;
+
+	for (queue = 0; queue < local_to_hw(local)->queues; queue++) {
+		switch (queue) {
+		case 0: /* AC_VO */
+			qparam.cw_min = (aCWmin + 1) / 4 - 1;
+			qparam.cw_max = (aCWmin + 1) / 2 - 1;
+			qparam.txop = use_11b ? 3264/32 : 1504/32;
+			qparam.aifs = 2;
+			break;
+		case 1: /* AC_VI */
+			qparam.cw_min = (aCWmin + 1) / 2 - 1;
+			qparam.cw_max = aCWmin;
+			qparam.txop = use_11b ? 6016/32 : 3008/32;
+			qparam.aifs = 2;
+			break;
+		case 3: /* AC_BK */
+			qparam.cw_min = aCWmin;
+			qparam.cw_max = aCWmax;
+			qparam.txop = 0;
+			qparam.aifs = 7;
+			break;
+		case 2: /* AC_BE */
+		default: /* never happens but let's not leave undefined */
+			qparam.cw_min = aCWmin;
+			qparam.cw_max = aCWmax;
+			qparam.txop = 0;
+			qparam.aifs = 3;
+			break;
+		}
+
+		qparam.uapsd = false;
+		drv_conf_tx(local, queue, &qparam);
+	}
+
+	/*
+	 * With the TX queues back at their defaults, also reset the QoS
+	 * setting: off for station interfaces, on for every other type.
+	 * Monitor interfaces are left untouched.
+	 */
+	if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
+		return;
+
+	sdata->vif.bss_conf.qos = sdata->vif.type != NL80211_IFTYPE_STATION;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+}
+
+void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
+				  const size_t supp_rates_len,
+				  const u8 *supp_rates)
+{
+	struct ieee80211_local *local = sdata->local;
+	bool above_11mbit = false;	/* any listed rate above 11 Mbit/s? */
+	size_t i;			/* same type as supp_rates_len */
+
+	/* cf. IEEE 802.11 9.2.12; the scan ends at the first such rate */
+	for (i = 0; !above_11mbit && i < supp_rates_len; i++)
+		above_11mbit = (supp_rates[i] & 0x7f) * 5 > 110;
+
+	if (above_11mbit && local->hw.conf.channel->band == IEEE80211_BAND_2GHZ)
+		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
+	else
+		sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
+
+	/* reprogram the default WMM parameters for the mode chosen above */
+	ieee80211_set_wmm_default(sdata);
+}
+
+u32 ieee80211_mandatory_rates(struct ieee80211_local *local,
+			      enum ieee80211_band band)
+{
+	struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+	enum ieee80211_rate_flags wanted;
+	u32 rates = 0;
+	int idx;
+
+	/*
+	 * A band the hardware does not have triggers a warning; the
+	 * band of the currently configured channel is used instead.
+	 */
+	if (WARN_ON(!sband))
+		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+
+	/* the flag is picked by the requested band, even after the fallback */
+	wanted = (band == IEEE80211_BAND_2GHZ) ? IEEE80211_RATE_MANDATORY_B :
+						 IEEE80211_RATE_MANDATORY_A;
+
+	/* bit i of the result corresponds to sband->bitrates[i] */
+	for (idx = 0; idx < sband->n_bitrates; idx++) {
+		if (sband->bitrates[idx].flags & wanted)
+			rates |= BIT(idx);
+	}
+
+	return rates;
+}
+
+void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
+			 u16 transaction, u16 auth_alg,
+			 u8 *extra, size_t extra_len, const u8 *bssid,
+			 const u8 *key, u8 key_len, u8 key_idx)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	int err;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(*mgmt) + 6 + extra_len);
+	if (!skb) {	/* allocation failure: log it and send nothing */
+		printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
+		       "frame\n", sdata->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	/* 24 + 6: header, then (presumably) the three 16-bit auth fields */
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
+	memset(mgmt, 0, 24 + 6);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_AUTH);
+	memcpy(mgmt->da, bssid, ETH_ALEN);	/* DA and BSSID are both @bssid */
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, bssid, ETH_ALEN);
+	mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg);
+	mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
+	mgmt->u.auth.status_code = cpu_to_le16(0);
+	if (extra)	/* optional caller-supplied bytes go after the fields */
+		memcpy(skb_put(skb, extra_len), extra, extra_len);
+	/* shared-key transaction 3 is WEP-encrypted here with @key */
+	if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) {
+		mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+		err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx);
+		WARN_ON(err);	/* the frame is still sent after a failure */
+	}
+	/* the DONT_ENCRYPT flag is set for every auth frame sent from here */
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
+			     const u8 *ie, size_t ie_len,
+			     enum ieee80211_band band, u32 rate_mask,
+			     u8 channel)
+{
+	struct ieee80211_supported_band *sband;
+	u8 *pos;
+	size_t offset = 0, noffset;	/* progress through the custom IEs */
+	int supp_rates_len, i;
+	u8 rates[32];	/* NOTE(review): assumes n_bitrates <= 32 -- confirm */
+	int num_rates;
+	int ext_rates_len;
+	/* writes the IEs to @buffer and returns the number of bytes written */
+	sband = local->hw.wiphy->bands[band];
+
+	pos = buffer;
+
+	num_rates = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if ((BIT(i) & rate_mask) == 0)
+			continue; /* skip rate */
+		rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5);
+	}
+
+	supp_rates_len = min_t(int, num_rates, 8);	/* first 8 rates */
+
+	*pos++ = WLAN_EID_SUPP_RATES;
+	*pos++ = supp_rates_len;
+	memcpy(pos, rates, supp_rates_len);
+	pos += supp_rates_len;
+
+	/* insert "request information" if in custom IEs */
+	if (ie && ie_len) {
+		static const u8 before_extrates[] = {
+			WLAN_EID_SSID,
+			WLAN_EID_SUPP_RATES,
+			WLAN_EID_REQUEST,
+		};
+		noffset = ieee80211_ie_split(ie, ie_len,
+					     before_extrates,
+					     ARRAY_SIZE(before_extrates),
+					     offset);
+		memcpy(pos, ie + offset, noffset - offset);
+		pos += noffset - offset;
+		offset = noffset;
+	}
+	/* rates beyond the first eight go into the extended rates element */
+	ext_rates_len = num_rates - supp_rates_len;
+	if (ext_rates_len > 0) {
+		*pos++ = WLAN_EID_EXT_SUPP_RATES;
+		*pos++ = ext_rates_len;
+		memcpy(pos, rates + supp_rates_len, ext_rates_len);
+		pos += ext_rates_len;
+	}
+	/* DS parameter set only on 2.4 GHz and only for a nonzero @channel */
+	if (channel && sband->band == IEEE80211_BAND_2GHZ) {
+		*pos++ = WLAN_EID_DS_PARAMS;
+		*pos++ = 1;
+		*pos++ = channel;
+	}
+
+	/* insert custom IEs that go before HT */
+	if (ie && ie_len) {
+		static const u8 before_ht[] = {
+			WLAN_EID_SSID,
+			WLAN_EID_SUPP_RATES,
+			WLAN_EID_REQUEST,
+			WLAN_EID_EXT_SUPP_RATES,
+			WLAN_EID_DS_PARAMS,
+			WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+		};
+		noffset = ieee80211_ie_split(ie, ie_len,
+					     before_ht, ARRAY_SIZE(before_ht),
+					     offset);
+		memcpy(pos, ie + offset, noffset - offset);
+		pos += noffset - offset;
+		offset = noffset;
+	}
+
+	if (sband->ht_cap.ht_supported) {
+		u16 cap = sband->ht_cap.cap;
+		__le16 tmp;
+
+		*pos++ = WLAN_EID_HT_CAPABILITY;
+		*pos++ = sizeof(struct ieee80211_ht_cap);
+		memset(pos, 0, sizeof(struct ieee80211_ht_cap));
+		tmp = cpu_to_le16(cap);
+		memcpy(pos, &tmp, sizeof(u16));
+		pos += sizeof(u16);
+		*pos++ = sband->ht_cap.ampdu_factor |
+			 (sband->ht_cap.ampdu_density <<
+				IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT);
+		memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs));
+		pos += sizeof(sband->ht_cap.mcs);
+		pos += 2 + 4 + 1; /* ext info, BF cap, antsel (left zeroed) */
+	}
+
+	/*
+	 * If adding more here, adjust code in main.c
+	 * that calculates local->scan_ies_len.
+	 */
+
+	/* add any remaining custom IEs */
+	if (ie && ie_len) {
+		noffset = ie_len;
+		memcpy(pos, ie + offset, noffset - offset);
+		pos += noffset - offset;
+	}
+
+	return pos - buffer;
+}
+
+struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
+					  u8 *dst,
+					  const u8 *ssid, size_t ssid_len,
+					  const u8 *ie, size_t ie_len)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+	size_t ies_len;
+	u8 *ies;
+	u8 chan;
+
+	/* FIXME: come up with a proper value */
+	ies = kmalloc(200 + ie_len, GFP_KERNEL);
+	if (!ies) {
+		printk(KERN_DEBUG "%s: failed to allocate temporary IE "
+		       "buffer\n", sdata->name);
+		return NULL;
+	}
+
+	/* the IEs are built for the currently configured channel */
+	chan = ieee80211_frequency_to_channel(
+		local->hw.conf.channel->center_freq);
+	ies_len = ieee80211_build_preq_ies(local, ies, ie, ie_len,
+					   local->hw.conf.channel->band,
+					   sdata->rc_rateidx_mask
+					   [local->hw.conf.channel->band],
+					   chan);
+
+	skb = ieee80211_probereq_get(&local->hw, &sdata->vif,
+				     ssid, ssid_len,
+				     ies, ies_len);
+	if (skb) {
+		/* an explicit destination replaces both DA and BSSID */
+		if (dst) {
+			mgmt = (struct ieee80211_mgmt *) skb->data;
+			memcpy(mgmt->da, dst, ETH_ALEN);
+			memcpy(mgmt->bssid, dst, ETH_ALEN);
+		}
+
+		IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	}
+
+	/* the scratch buffer is released on both outcomes */
+	kfree(ies);
+
+	return skb;
+}
+
+void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
+			      const u8 *ssid, size_t ssid_len,
+			      const u8 *ie, size_t ie_len)
+{
+	struct sk_buff *probe = ieee80211_build_probe_req(sdata, dst, ssid,
+							  ssid_len, ie, ie_len);
+
+	if (probe)	/* NULL: the request could not be built */
+		ieee80211_tx_skb(sdata, probe);
+}
+
+u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
+			    struct ieee802_11_elems *elems,
+			    enum ieee80211_band band)
+{
+	struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
+	const int n_listed = elems->supp_rates_len + elems->ext_supp_rates_len;
+	u32 matched = 0;
+	size_t n_own;
+	int pos, own;
+
+	/* unknown band: complain and fall back to the current channel's */
+	if (WARN_ON(!sband))
+		sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	n_own = sband->n_bitrates;
+
+	/*
+	 * The basic rates come first, then the extended ones.  Bit j of
+	 * the result is set when our rate j is listed (bit 7 masked off).
+	 */
+	for (pos = 0; pos < n_listed; pos++) {
+		u8 raw = 0;
+		int wanted;
+
+		if (pos < elems->supp_rates_len)
+			raw = elems->supp_rates[pos];
+		else if (elems->ext_supp_rates)
+			raw = elems->ext_supp_rates
+				[pos - elems->supp_rates_len];
+
+		wanted = 5 * (raw & 0x7f);
+		for (own = 0; own < n_own; own++)
+			if (sband->bitrates[own].bitrate == wanted)
+				matched |= BIT(own);
+	}
+	return matched;
+}
+
+void ieee80211_stop_device(struct ieee80211_local *local)
+{
+	ieee80211_led_radio(local, false);
+	ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO);
+	/* pending work is cancelled/flushed before the driver is stopped */
+	cancel_work_sync(&local->reconfig_filter);
+
+	flush_workqueue(local->workqueue);
+	drv_stop(local);
+}
+
+int ieee80211_reconfig(struct ieee80211_local *local)
+{
+	struct ieee80211_hw *hw = &local->hw;
+	struct ieee80211_sub_if_data *sdata;
+	struct sta_info *sta;
+	int res;
+	/* returns 0 or the error reported by drv_resume()/drv_start() */
+#ifdef CONFIG_PM
+	if (local->suspended)
+		local->resuming = true;
+
+	if (local->wowlan) {
+		local->wowlan = false;
+		res = drv_resume(local);
+		if (res < 0) {
+			local->resuming = false;
+			return res;
+		}
+		if (res == 0)	/* no regular reset requested */
+			goto wake_up;
+		WARN_ON(res > 1);
+		/*
+		 * res is 1, which means the driver requested
+		 * to go through a regular reset on wakeup.
+		 */
+	}
+#endif
+
+	/* restart hardware */
+	if (local->open_count) {
+		/*
+		 * Upon resume hardware can sometimes be goofy due to
+		 * various platform / driver / bus issues, so restarting
+		 * the device may at times not work immediately. Propagate
+		 * the error.
+		 */
+		res = drv_start(local);
+		if (res) {
+			WARN(local->suspended, "Hardware became unavailable "
+			     "upon resume. This could be a software issue "
+			     "prior to suspend or a hardware issue.\n");
+			return res; /* NOTE(review): ->resuming stays set */
+		}
+
+		ieee80211_led_radio(local, true);
+		ieee80211_mod_tpt_led_trig(local,
+					   IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
+	}
+
+	/* add interfaces; NOTE(review): the result in res is never checked */
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
+		    sdata->vif.type != NL80211_IFTYPE_MONITOR &&
+		    ieee80211_sdata_running(sdata))
+			res = drv_add_interface(local, &sdata->vif);
+	}
+
+	/* add STAs back */
+	mutex_lock(&local->sta_mtx);
+	list_for_each_entry(sta, &local->sta_list, list) {
+		if (sta->uploaded) {
+			sdata = sta->sdata;
+			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+				sdata = container_of(sdata->bss,
+					     struct ieee80211_sub_if_data,
+					     u.ap);
+
+			WARN_ON(drv_sta_add(local, sdata, &sta->sta));
+		}
+	}
+	mutex_unlock(&local->sta_mtx);
+
+	/* setup fragmentation threshold */
+	drv_set_frag_threshold(local, hw->wiphy->frag_threshold);
+
+	/* setup RTS threshold */
+	drv_set_rts_threshold(local, hw->wiphy->rts_threshold);
+
+	/* reconfigure hardware */
+	ieee80211_hw_config(local, ~0);
+
+	ieee80211_configure_filter(local);
+
+	/* Finally also reconfigure all the BSS information */
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		u32 changed;
+
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+
+		/* common change flags for all interface types */
+		changed = BSS_CHANGED_ERP_CTS_PROT |
+			  BSS_CHANGED_ERP_PREAMBLE |
+			  BSS_CHANGED_ERP_SLOT |
+			  BSS_CHANGED_HT |
+			  BSS_CHANGED_BASIC_RATES |
+			  BSS_CHANGED_BEACON_INT |
+			  BSS_CHANGED_BSSID |
+			  BSS_CHANGED_CQM |
+			  BSS_CHANGED_QOS;
+
+		switch (sdata->vif.type) {
+		case NL80211_IFTYPE_STATION:
+			changed |= BSS_CHANGED_ASSOC;
+			mutex_lock(&sdata->u.mgd.mtx);
+			ieee80211_bss_info_change_notify(sdata, changed);
+			mutex_unlock(&sdata->u.mgd.mtx);
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			changed |= BSS_CHANGED_IBSS;
+			/* fall through */
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_MESH_POINT:
+			changed |= BSS_CHANGED_BEACON |
+				   BSS_CHANGED_BEACON_ENABLED;
+			ieee80211_bss_info_change_notify(sdata, changed);
+			break;
+		case NL80211_IFTYPE_WDS:
+			break;
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_MONITOR:
+			/* ignore virtual */
+			break;
+		case NL80211_IFTYPE_UNSPECIFIED:
+		case NUM_NL80211_IFTYPES:
+		case NL80211_IFTYPE_P2P_CLIENT:
+		case NL80211_IFTYPE_P2P_GO:
+			WARN_ON(1);	/* not expected on the interface list */
+			break;
+		}
+	}
+
+	/* add back keys */
+	list_for_each_entry(sdata, &local->interfaces, list)
+		if (ieee80211_sdata_running(sdata))
+			ieee80211_enable_keys(sdata);
+
+ wake_up:
+	/*
+	 * Clear the WLAN_STA_BLOCK_BA flag so new aggregation
+	 * sessions can be established after a resume.
+	 *
+	 * Also tear down aggregation sessions since reconfiguring
+	 * them in a hardware restart scenario is not easily done
+	 * right now, and the hardware will have lost information
+	 * about the sessions, but we and the AP still think they
+	 * are active. This is really a workaround though.
+	 */
+	if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
+		mutex_lock(&local->sta_mtx);
+
+		list_for_each_entry(sta, &local->sta_list, list) {
+			ieee80211_sta_tear_down_BA_sessions(sta, true);
+			clear_sta_flags(sta, WLAN_STA_BLOCK_BA);
+		}
+
+		mutex_unlock(&local->sta_mtx);
+	}
+
+	ieee80211_wake_queues_by_reason(hw,
+			IEEE80211_QUEUE_STOP_REASON_SUSPEND);
+
+	/*
+	 * If this is for hw restart things are still running.
+	 * We may want to change that later, however.
+	 */
+	if (!local->suspended)
+		return 0;
+
+#ifdef CONFIG_PM
+	/* first set suspended false, then resuming */
+	local->suspended = false;
+	mb();	/* keeps the two flag updates in the order stated above */
+	local->resuming = false;
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		switch(sdata->vif.type) {
+		case NL80211_IFTYPE_STATION:
+			ieee80211_sta_restart(sdata);
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			ieee80211_ibss_restart(sdata);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			ieee80211_mesh_restart(sdata);
+			break;
+		default:
+			break;
+		}
+	}
+
+	mod_timer(&local->sta_cleanup, jiffies + 1);
+
+	mutex_lock(&local->sta_mtx);
+	list_for_each_entry(sta, &local->sta_list, list)
+		mesh_plink_restart(sta);
+	mutex_unlock(&local->sta_mtx);
+#else
+	WARN_ON(1);	/* presumably unreachable without CONFIG_PM */
+#endif
+	return 0;
+}
+
+static int check_mgd_smps(struct ieee80211_if_managed *ifmgd,
+			  enum ieee80211_smps_mode *smps_mode)
+{
+	/* unassociated interfaces do not count and leave *smps_mode alone */
+	if (!ifmgd->associated)
+		return 0;
+
+	/* the requested mode is used as is, except that AUTOMATIC is
+	 * resolved here: DYNAMIC with powersave on, OFF otherwise */
+	if (ifmgd->ap_smps != IEEE80211_SMPS_AUTOMATIC)
+		*smps_mode = ifmgd->ap_smps;
+	else if (ifmgd->powersave)
+		*smps_mode = IEEE80211_SMPS_DYNAMIC;
+	else
+		*smps_mode = IEEE80211_SMPS_OFF;
+
+	return 1;
+}
+
+/* must hold iflist_mtx */
+void ieee80211_recalc_smps(struct ieee80211_local *local)
+{
+	struct ieee80211_sub_if_data *sdata;
+	enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF;
+	bool non_station = false;
+	int associated = 0;
+
+	lockdep_assert_held(&local->iflist_mtx);
+
+	/*
+	 * Multiple interfaces are handled crudely for now.  The first
+	 * running non-station interface ends the scan and causes the
+	 * mode found so far to be programmed unconditionally.  With
+	 * more than one associated station interface the mode is OFF;
+	 * picking the best common mode instead (e.g. dynamic when one
+	 * wants static and the other dynamic) is a possible improvement.
+	 */
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+		if (sdata->vif.type != NL80211_IFTYPE_STATION) {
+			non_station = true;
+			break;
+		}
+
+		associated += check_mgd_smps(&sdata->u.mgd, &smps_mode);
+		if (associated > 1) {
+			smps_mode = IEEE80211_SMPS_OFF;
+			break;
+		}
+	}
+
+	/* unchanged mode: nothing to do, unless a non-station was seen */
+	if (!non_station && smps_mode == local->smps_mode)
+		return;
+
+	local->smps_mode = smps_mode;
+	/* changed flag is auto-detected for this */
+	ieee80211_hw_config(local, 0);
+}
+
+static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id)
+{
+	int left = n_ids;	/* entries of @ids not yet looked at */
+
+	while (left-- > 0)
+		if (*ids++ == id)
+			return true;
+	return false;
+}
+
+/**
+ * ieee80211_ie_split - find the split point in an ordered IE buffer
+ *
+ * @ies: the IE buffer
+ * @ielen: the length of the IE buffer
+ * @ids: element IDs that may appear before the split point
+ * @n_ids: number of entries in @ids
+ * @offset: position in @ies at which to start looking
+ *
+ * Starting at @offset, whole elements are skipped for as long as
+ * their ID is listed in @ids. The position of the first element
+ * that is not listed is the split point. The buffer is only read.
+ *
+ * The caller must guarantee that the IE buffer is well-formed;
+ * element lengths are trusted and not checked against @ielen.
+ *
+ * Nothing is reordered: if the elements in the buffer are not in
+ * the expected order, the part before the split point simply ends
+ * early and the output built from it is not ordered either.
+ *
+ * Returns the offset at which the next part of the buffer starts.
+ * This is @ielen when everything from @offset to the end of the
+ * buffer belongs before the split, i.e. when the whole remainder
+ * of the buffer should be used.
+ */
+size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
+			  const u8 *ids, int n_ids, size_t offset)
+{
+	size_t pos;
+
+	for (pos = offset; pos < ielen; pos += 2 + ies[pos + 1])
+		if (!ieee80211_id_in_list(ids, n_ids, ies[pos]))
+			break;
+	return pos;
+}
+
+size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset)
+{
+	size_t pos;
+
+	for (pos = offset; pos < ielen; pos += 2 + ies[pos + 1])
+		if (ies[pos] == WLAN_EID_VENDOR_SPECIFIC)
+			break;	/* first vendor element, or @ielen if none */
+	return pos;
+}
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
new file mode 100644
index 00000000..a1c6bfd5
--- /dev/null
+++ b/net/mac80211/wep.c
@@ -0,0 +1,342 @@
+/*
+ * Software WEP encryption implementation
+ * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/compiler.h>
+#include <linux/crc32.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "wep.h"
+
+
+int ieee80211_wep_init(struct ieee80211_local *local)
+{
+	/* start WEP IV from a random value, then set up the TX/RX arc4 tfms */
+	get_random_bytes(&local->wep_iv, WEP_IV_LEN);
+
+	local->wep_tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(local->wep_tx_tfm)) {
+		local->wep_rx_tfm = ERR_PTR(-EINVAL); /* keep IS_ERR() true */
+		return PTR_ERR(local->wep_tx_tfm);
+	}
+
+	local->wep_rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(local->wep_rx_tfm)) {
+		crypto_free_cipher(local->wep_tx_tfm);
+		local->wep_tx_tfm = ERR_PTR(-EINVAL); /* freed: mark unusable */
+		return PTR_ERR(local->wep_rx_tfm);
+	}
+
+	return 0;	/* both tfms allocated */
+}
+
+void ieee80211_wep_free(struct ieee80211_local *local)
+{
+	if (!IS_ERR(local->wep_tx_tfm)) /* ERR_PTR slot: nothing to free */
+		crypto_free_cipher(local->wep_tx_tfm);
+	if (!IS_ERR(local->wep_rx_tfm)) /* ERR_PTR slot: nothing to free */
+		crypto_free_cipher(local->wep_rx_tfm);
+}
+
+static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen)
+{
+	/*
+	 * FMS attack on the RC4 key schedule: IVs shaped like
+	 * (KeyByte + 3, 0xff, N) are known to speed up key recovery,
+	 * so report them; the first IV byte lives in bits 16-23 here.
+	 */
+	u8 first = (iv >> 16) & 0xff;
+
+	if (((iv >> 8) & 0xff) != 0xff)
+		return false;
+
+	return first >= 3 && first < 3 + keylen;
+}
+
+
+static void ieee80211_wep_get_iv(struct ieee80211_local *local,
+				 int keylen, int keyidx, u8 *iv)
+{
+	/* advance the counter; jump past an IV flagged as weak */
+	local->wep_iv++;
+	if (ieee80211_wep_weak_iv(local->wep_iv, keylen))
+		local->wep_iv += 0x0100;
+	if (iv) {
+		/* 24-bit IV, most significant byte first, then key index */
+		iv[0] = (local->wep_iv >> 16) & 0xff;
+		iv[1] = (local->wep_iv >> 8) & 0xff;
+		iv[2] = local->wep_iv & 0xff;
+		iv[3] = keyidx << 6;
+	}
+}
+
+
+static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local,
+				struct sk_buff *skb,
+				int keylen, int keyidx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	unsigned int hdrlen;
+	u8 *newhdr;
+
+	hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+
+	if (WARN_ON(skb_tailroom(skb) < WEP_ICV_LEN ||
+		    skb_headroom(skb) < WEP_IV_LEN))
+		return NULL;	/* no room for the IV and/or the ICV */
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	newhdr = skb_push(skb, WEP_IV_LEN);
+	memmove(newhdr, newhdr + WEP_IV_LEN, hdrlen); /* hdr to new front */
+	ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen);
+	return newhdr + hdrlen;	/* address of the IV just written */
+}
+
+
+static void ieee80211_wep_remove_iv(struct ieee80211_local *local,
+				    struct sk_buff *skb,
+				    struct ieee80211_key *key)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	unsigned int hdrlen;	/* note: local and key are not used below */
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	memmove(skb->data + WEP_IV_LEN, skb->data, hdrlen); /* cover the IV */
+	skb_pull(skb, WEP_IV_LEN);	/* drop the stale leading bytes */
+}
+
+
+/* WEP-encrypt data in place with rc4key: data = RC4(data | CRC32(data)).
+ * data needs WEP_ICV_LEN bytes of tailroom beyond data_len for the ICV; the
+ * IV is _not_ added here. Returns 0, or -1 if tfm is an error pointer. */
+int ieee80211_wep_encrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+			       size_t klen, u8 *data, size_t data_len)
+{
+	__le32 icv;
+	size_t i;	/* same type as data_len: no signed/unsigned compare */
+
+	if (IS_ERR(tfm))
+		return -1;
+
+	icv = cpu_to_le32(~crc32_le(~0, data, data_len));
+	put_unaligned(icv, (__le32 *)(data + data_len));
+
+	crypto_cipher_setkey(tfm, rc4key, klen);
+	for (i = 0; i < data_len + WEP_ICV_LEN; i++)
+		crypto_cipher_encrypt_one(tfm, data + i, data + i);
+
+	return 0;
+}
+
+
+/* Perform WEP encryption on given skb. 4 bytes of extra space (IV) at the
+ * beginning of the buffer and 4 bytes of extra space (ICV) at the end of
+ * the buffer will be added. Both IV and ICV will be transmitted, so the
+ * payload length increases by 8 bytes. Returns 0 on success, -1 on failure.
+ *
+ * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
+ */
+int ieee80211_wep_encrypt(struct ieee80211_local *local,
+			  struct sk_buff *skb,
+			  const u8 *key, int keylen, int keyidx)
+{
+	u8 *iv;
+	size_t len;
+	u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
+
+	iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx);
+	if (!iv)
+		return -1;
+
+	len = skb->len - (iv + WEP_IV_LEN - skb->data);
+
+	/* Prepend 24-bit IV to RC4 key */
+	memcpy(rc4key, iv, 3);
+
+	/* Copy the secret part; NOTE(review): assumes keylen fits rc4key */
+	memcpy(rc4key + 3, key, keylen);
+
+	/* Add room for ICV */
+	skb_put(skb, WEP_ICV_LEN);
+
+	return ieee80211_wep_encrypt_data(local->wep_tx_tfm, rc4key, keylen + 3,
+					  iv + WEP_IV_LEN, len);
+}
+
+
+/* Perform WEP decryption using given key. data buffer includes encrypted
+ * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV.
+ * Return 0 on success, -1 on ICV mismatch or if tfm is an error pointer. */
+int ieee80211_wep_decrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+			       size_t klen, u8 *data, size_t data_len)
+{
+	__le32 crc;
+	size_t i;	/* same type as data_len: no signed/unsigned compare */
+
+	if (IS_ERR(tfm))
+		return -1;
+
+	crypto_cipher_setkey(tfm, rc4key, klen);
+	for (i = 0; i < data_len + WEP_ICV_LEN; i++)
+		crypto_cipher_decrypt_one(tfm, data + i, data + i);
+
+	crc = cpu_to_le32(~crc32_le(~0, data, data_len));
+	if (memcmp(&crc, data + data_len, WEP_ICV_LEN) != 0)
+		/* ICV mismatch */
+		return -1;
+
+	return 0;
+}
+
+
+/* Perform WEP decryption on given skb. Buffer includes whole WEP part of
+ * the frame: IV (4 bytes), encrypted payload (including SNAP header),
+ * ICV (4 bytes). skb->len includes both IV and ICV.
+ *
+ * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
+ * failure. Once decryption has been attempted (even if the ICV mismatched),
+ * IV and ICV are removed: the header is shifted and skb length is reduced.
+ */
+static int ieee80211_wep_decrypt(struct ieee80211_local *local,
+				 struct sk_buff *skb,
+				 struct ieee80211_key *key)
+{
+	u32 klen;
+	u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
+	u8 keyidx;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	unsigned int hdrlen;
+	size_t len;
+	int ret = 0;
+
+	if (!ieee80211_has_protected(hdr->frame_control))
+		return -1;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	if (skb->len < hdrlen + WEP_IV_LEN + WEP_ICV_LEN)
+		return -1;
+
+	len = skb->len - hdrlen - WEP_IV_LEN - WEP_ICV_LEN;
+
+	keyidx = skb->data[hdrlen + 3] >> 6;	/* top 2 bits of 4th IV byte */
+
+	if (!key || keyidx != key->conf.keyidx)
+		return -1;
+
+	klen = 3 + key->conf.keylen;
+
+	/* Prepend 24-bit IV to RC4 key */
+	memcpy(rc4key, skb->data + hdrlen, 3);
+
+	/* Copy rest of the WEP key (the secret part) */
+	memcpy(rc4key + 3, key->conf.key, key->conf.keylen);
+
+	if (ieee80211_wep_decrypt_data(local->wep_rx_tfm, rc4key, klen,
+				       skb->data + hdrlen + WEP_IV_LEN,
+				       len))
+		ret = -1;
+
+	/* Trim ICV */
+	skb_trim(skb, skb->len - WEP_ICV_LEN);
+
+	/* Remove IV */
+	memmove(skb->data + WEP_IV_LEN, skb->data, hdrlen);
+	skb_pull(skb, WEP_IV_LEN);
+
+	return ret;
+}
+
+
+bool ieee80211_wep_is_weak_iv(struct sk_buff *skb, struct ieee80211_key *key)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	unsigned int hdrlen;
+	u8 *ivpos;
+	u32 iv;
+
+	if (!ieee80211_has_protected(hdr->frame_control))
+		return false;
+	/* NOTE(review): skb->len is not checked before the IV bytes are read */
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	ivpos = skb->data + hdrlen;	/* IV directly follows the header */
+	iv = (ivpos[0] << 16) | (ivpos[1] << 8) | ivpos[2];
+
+	return ieee80211_wep_weak_iv(iv, key->conf.keylen);
+}
+
+ieee80211_rx_result
+ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	__le16 fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
+
+	if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc))
+		return RX_CONTINUE;	/* WEP only applies to data/auth */
+
+	if (!(status->flag & RX_FLAG_DECRYPTED)) {
+		if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key))
+			return RX_DROP_UNUSABLE;
+	} else if (!(status->flag & RX_FLAG_IV_STRIPPED)) {
+		if (!pskb_may_pull(skb, ieee80211_hdrlen(fc) + WEP_IV_LEN))
+			return RX_DROP_UNUSABLE; /* too short to hold an IV */
+		ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key);
+		skb_trim(rx->skb, rx->skb->len - WEP_ICV_LEN); /* remove ICV */
+	}
+
+	return RX_CONTINUE;
+}
+
+static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+	if (!info->control.hw_key) {	/* no hw key: encrypt in software */
+		if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key,
+					  tx->key->conf.keylen,
+					  tx->key->conf.keyidx))
+			return -1;
+	} else if (info->control.hw_key->flags &
+			IEEE80211_KEY_FLAG_GENERATE_IV) { /* only add the IV */
+		if (!ieee80211_wep_add_iv(tx->local, skb,
+					  tx->key->conf.keylen,
+					  tx->key->conf.keyidx))
+			return -1;
+	}
+
+	return 0;	/* 0 on success, -1 if the skb could not be prepared */
+}
+
+ieee80211_tx_result
+ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+
+	ieee80211_tx_set_protected(tx);
+
+	skb = tx->skb;
+	do {	/* handle tx->skb and every skb chained to it via ->next */
+		if (wep_encrypt_skb(tx, skb) < 0) {
+			I802_DEBUG_INC(tx->local->tx_handlers_drop_wep);
+			return TX_DROP;	/* one failure drops the whole TX */
+		}
+	} while ((skb = skb->next));
+
+	return TX_CONTINUE;
+}
diff --git a/net/mac80211/wep.h b/net/mac80211/wep.h
new file mode 100644
index 00000000..01e54840
--- /dev/null
+++ b/net/mac80211/wep.h
@@ -0,0 +1,35 @@
+/*
+ * Software WEP encryption implementation
+ * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright 2003, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef WEP_H
+#define WEP_H
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include "ieee80211_i.h"
+#include "key.h"
+
+int ieee80211_wep_init(struct ieee80211_local *local);
+void ieee80211_wep_free(struct ieee80211_local *local);
+int ieee80211_wep_encrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+				size_t klen, u8 *data, size_t data_len);
+int ieee80211_wep_encrypt(struct ieee80211_local *local,
+			  struct sk_buff *skb,
+			  const u8 *key, int keylen, int keyidx);
+int ieee80211_wep_decrypt_data(struct crypto_cipher *tfm, u8 *rc4key,
+			       size_t klen, u8 *data, size_t data_len);
+bool ieee80211_wep_is_weak_iv(struct sk_buff *skb, struct ieee80211_key *key);
+
+ieee80211_rx_result
+ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx);
+ieee80211_tx_result
+ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx);
+
+#endif /* WEP_H */
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
new file mode 100644
index 00000000..28bc084d
--- /dev/null
+++ b/net/mac80211/wme.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2004, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "wme.h"
+
+/* Default mapping from 802.1d tag (the array index) to access
+ * category, used by the classifier with the default queue setup.
+ */
+const int ieee802_1d_to_ac[8] = {
+	IEEE80211_AC_BE,	/* tag 0 */
+	IEEE80211_AC_BK,	/* tag 1 */
+	IEEE80211_AC_BK,	/* tag 2 */
+	IEEE80211_AC_BE,	/* tag 3 */
+	IEEE80211_AC_VI,	/* tag 4 */
+	IEEE80211_AC_VI,	/* tag 5 */
+	IEEE80211_AC_VO,	/* tag 6 */
+	IEEE80211_AC_VO		/* tag 7 */
+};
+
+static int wme_downgrade_ac(struct sk_buff *skb)
+{
+	switch (skb->priority) {	/* step down exactly one AC */
+	case 6:
+	case 7:
+		skb->priority = 5; /* VO -> VI */
+		return 0;
+	case 4:
+	case 5:
+		skb->priority = 3; /* VI -> BE */
+		return 0;
+	case 0:
+	case 3:
+		skb->priority = 2; /* BE -> BK */
+		return 0;
+	default:	/* 1 and 2 already map to BK: nothing lower exists */
+		return -1;
+	}
+}
+
+
+/* Pick the TX queue (AC) for skb and set skb->priority accordingly. */
+u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
+			   struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta = NULL;
+	const u8 *ra = NULL;
+	bool qos = false;
+
+	if (local->hw.queues < 4 || skb->len < 6) {
+		skb->priority = 0; /* required for correct WPA/11i MIC */
+		return min_t(u16, local->hw.queues - 1, IEEE80211_AC_BE);
+	}
+
+	rcu_read_lock();
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP_VLAN:
+		sta = rcu_dereference(sdata->u.vlan.sta);
+		if (sta) {
+			qos = get_sta_flags(sta) & WLAN_STA_WME;
+			break;
+		} /* no station bound to the VLAN: fall through to AP case */
+	case NL80211_IFTYPE_AP:
+		ra = skb->data;
+		break;
+	case NL80211_IFTYPE_WDS:
+		ra = sdata->u.wds.remote_addr;
+		break;
+#ifdef CONFIG_MAC80211_MESH
+	case NL80211_IFTYPE_MESH_POINT:
+		/*
+		 * XXX: This is clearly broken ... but already was before,
+		 * because ieee80211_fill_mesh_addresses() would clear A1
+		 * except for multicast addresses.
+		 */
+		break;
+#endif
+	case NL80211_IFTYPE_STATION:
+		ra = sdata->u.mgd.bssid;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		ra = skb->data;
+		break;
+	default:
+		break;
+	}
+
+	if (!sta && ra && !is_multicast_ether_addr(ra)) {
+		sta = sta_info_get(sdata, ra);
+		if (sta)
+			qos = get_sta_flags(sta) & WLAN_STA_WME;
+	}
+	rcu_read_unlock();
+
+	if (!qos) {
+		skb->priority = 0; /* required for correct WPA/11i MIC */
+		return IEEE80211_AC_BE;
+	}
+
+	/* use the data classifier to determine what 802.1d tag the
+	 * data frame has */
+	skb->priority = cfg80211_classify8021d(skb);
+
+	return ieee80211_downgrade_queue(local, skb);
+}
+
+u16 ieee80211_downgrade_queue(struct ieee80211_local *local,
+			      struct sk_buff *skb)
+{
+	/* in case we are a client, step down while ACM is set for this AC */
+	while (unlikely(local->wmm_acm & BIT(skb->priority))) {
+		if (wme_downgrade_ac(skb)) {
+			/*
+			 * This should not really happen. The AP has marked all
+			 * lower ACs to require admission control which is not
+			 * a reasonable configuration. Allow the frame to be
+			 * transmitted using AC_BK as a workaround.
+			 */
+			break;
+		}
+	}
+
+	/* look up which queue to use for frames with this 1d tag */
+	return ieee802_1d_to_ac[skb->priority];
+}
+
+void ieee80211_set_qos_hdr(struct ieee80211_local *local, struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	u8 ack_policy = 0;
+	u8 *qc;
+
+	/* only QoS data frames carry the 2-byte QoS control field */
+	if (!ieee80211_is_data_qos(hdr->frame_control))
+		return;
+
+	if (unlikely(local->wifi_wme_noack_test))
+		ack_policy = QOS_CONTROL_ACK_POLICY_NOACK <<
+				QOS_CONTROL_ACK_POLICY_SHIFT;
+
+	/* first byte: ack policy + TID taken from skb->priority; second: 0 */
+	qc = ieee80211_get_qos_ctl(hdr);
+	qc[0] = ack_policy | (skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK);
+	qc[1] = 0;
+}
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
new file mode 100644
index 00000000..6053b1c9
--- /dev/null
+++ b/net/mac80211/wme.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _WME_H
+#define _WME_H
+
+#include <linux/netdevice.h>
+#include "ieee80211_i.h"
+
+#define QOS_CONTROL_ACK_POLICY_NORMAL 0
+#define QOS_CONTROL_ACK_POLICY_NOACK 1
+
+#define QOS_CONTROL_ACK_POLICY_SHIFT 5
+
+extern const int ieee802_1d_to_ac[8];
+
+u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata,
+			   struct sk_buff *skb);
+void ieee80211_set_qos_hdr(struct ieee80211_local *local, struct sk_buff *skb);
+u16 ieee80211_downgrade_queue(struct ieee80211_local *local,
+                              struct sk_buff *skb);
+
+
+#endif /* _WME_H */
diff --git a/net/mac80211/work.c b/net/mac80211/work.c
new file mode 100644
index 00000000..52b758db
--- /dev/null
+++ b/net/mac80211/work.c
@@ -0,0 +1,1276 @@
+/*
+ * mac80211 work implementation
+ *
+ * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
+ * Copyright 2004, Instant802 Networks, Inc.
+ * Copyright 2005, Devicescape Software, Inc.
+ * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/if_ether.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/crc32.h>
+#include <linux/slab.h>
+#include <net/mac80211.h>
+#include <asm/unaligned.h>
+
+#include "ieee80211_i.h"
+#include "rate.h"
+
+#define IEEE80211_AUTH_TIMEOUT (HZ / 5)
+#define IEEE80211_AUTH_MAX_TRIES 3
+#define IEEE80211_ASSOC_TIMEOUT (HZ / 5)
+#define IEEE80211_ASSOC_MAX_TRIES 3
+
+enum work_action {
+	WORK_ACT_MISMATCH,	/* frame is not for this type of work item */
+	WORK_ACT_NONE,		/* nothing further to report for now */
+	WORK_ACT_TIMEOUT,	/* tries or wait time are used up */
+	WORK_ACT_DONE,		/* exchange ended, e.g. auth was denied */
+};
+
+
+/* utils */
+static inline void ASSERT_WORK_MTX(struct ieee80211_local *local)
+{
+	lockdep_assert_held(&local->mtx);	/* caller must hold local->mtx */
+}
+
+/*
+ * We can have multiple work items (and connection probing)
+ * scheduling this timer, but we need to take care to only
+ * reschedule it when it should fire _earlier_ than it was
+ * asked for before, or if it's not pending right now. This
+ * function ensures that. Note that it then is required to
+ * run this function for all timeouts after the first one
+ * has happened -- the work that runs from this timer will
+ * do that.
+ */
+static void run_again(struct ieee80211_local *local,
+		      unsigned long timeout)
+{
+	ASSERT_WORK_MTX(local);
+	if (timer_pending(&local->work_timer) &&
+	    !time_before(timeout, local->work_timer.expires))
+		return;	/* already armed to fire no later than requested */
+	mod_timer(&local->work_timer, timeout);
+}
+
+void free_work(struct ieee80211_work *wk)
+{
+	kfree_rcu(wk, rcu_head);	/* freed after an RCU grace period */
+}
+
+static int ieee80211_compatible_rates(const u8 *supp_rates, int supp_rates_len,
+				      struct ieee80211_supported_band *sband,
+				      u32 *rates)
+{
+	int i, j, count = 0;	/* count: number of bits set in *rates */
+
+	*rates = 0;
+	for (i = 0; i < supp_rates_len; i++) {
+		int rate = (supp_rates[i] & 0x7F) * 5; /* in .bitrate units */
+
+		for (j = 0; j < sband->n_bitrates; j++)
+			if (sband->bitrates[j].bitrate == rate) {
+				/* a rate the AP lists twice must count once */
+				count += !(*rates & BIT(j));
+				*rates |= BIT(j);
+				break;
+			}
+	}
+	return count;
+}
+
+/* frame sending functions */
+
+static void ieee80211_add_ht_ie(struct sk_buff *skb, const u8 *ht_info_ie,
+				struct ieee80211_supported_band *sband,
+				struct ieee80211_channel *channel,
+				enum ieee80211_smps_mode smps)
+{
+	struct ieee80211_ht_info *ht_info;
+	u8 *pos;
+	u32 flags = channel->flags;
+	u16 cap = sband->ht_cap.cap;
+	__le16 tmp;
+
+	if (!sband->ht_cap.ht_supported)
+		return;
+
+	if (!ht_info_ie)
+		return;
+
+	if (ht_info_ie[1] < sizeof(struct ieee80211_ht_info))
+		return;	/* element body too short to be an HT info IE */
+
+	ht_info = (struct ieee80211_ht_info *)(ht_info_ie + 2);
+
+	/* determine capability flags */
+
+	switch (ht_info->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+	case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+		if (flags & IEEE80211_CHAN_NO_HT40PLUS) {
+			cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+			cap &= ~IEEE80211_HT_CAP_SGI_40;
+		}
+		break;
+	case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+		if (flags & IEEE80211_CHAN_NO_HT40MINUS) {
+			cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+			cap &= ~IEEE80211_HT_CAP_SGI_40;
+		}
+		break;
+	}
+
+	/* set SM PS mode properly */
+	cap &= ~IEEE80211_HT_CAP_SM_PS;
+	switch (smps) {
+	case IEEE80211_SMPS_AUTOMATIC:
+	case IEEE80211_SMPS_NUM_MODES:
+		WARN_ON(1);	/* fall through: handled like SMPS_OFF */
+	case IEEE80211_SMPS_OFF:
+		cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
+			IEEE80211_HT_CAP_SM_PS_SHIFT;
+		break;
+	case IEEE80211_SMPS_STATIC:
+		cap |= WLAN_HT_CAP_SM_PS_STATIC <<
+			IEEE80211_HT_CAP_SM_PS_SHIFT;
+		break;
+	case IEEE80211_SMPS_DYNAMIC:
+		cap |= WLAN_HT_CAP_SM_PS_DYNAMIC <<
+			IEEE80211_HT_CAP_SM_PS_SHIFT;
+		break;
+	}
+
+	/* reserve and fill IE */
+
+	pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2);
+	*pos++ = WLAN_EID_HT_CAPABILITY;
+	*pos++ = sizeof(struct ieee80211_ht_cap);
+	memset(pos, 0, sizeof(struct ieee80211_ht_cap));
+
+	/* capability flags */
+	tmp = cpu_to_le16(cap);
+	memcpy(pos, &tmp, sizeof(u16));
+	pos += sizeof(u16);
+
+	/* AMPDU parameters */
+	*pos++ = sband->ht_cap.ampdu_factor |
+		 (sband->ht_cap.ampdu_density <<
+			IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT);
+
+	/* MCS set */
+	memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs));
+	pos += sizeof(sband->ht_cap.mcs);
+
+	/* extended capabilities (left zero by the memset above) */
+	pos += sizeof(__le16);
+
+	/* BF capabilities (left zero by the memset above) */
+	pos += sizeof(__le32);
+
+	/* antenna selection (left zero by the memset above) */
+	pos += sizeof(u8);
+}
+
+static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
+				 struct ieee80211_work *wk)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ieee80211_mgmt *mgmt;
+	u8 *pos, qos_info;
+	size_t offset = 0, noffset;
+	int i, count, rates_len, supp_rates_len;
+	u16 capab;
+	struct ieee80211_supported_band *sband;
+	u32 rates = 0;
+
+	sband = local->hw.wiphy->bands[wk->chan->band];
+
+	if (wk->assoc.supp_rates_len) {
+		/*
+		 * Get all rates supported by the device and the AP as
+		 * some APs don't like getting a superset of their rates
+		 * in the association request (e.g. D-Link DAP 1353 in
+		 * b-only mode)...
+		 */
+		rates_len = ieee80211_compatible_rates(wk->assoc.supp_rates,
+						       wk->assoc.supp_rates_len,
+						       sband, &rates);
+	} else {
+		/*
+		 * In case the AP did not provide any supported rates
+		 * information before association, we send information
+		 * element(s) with all rates that we support.
+		 */
+		rates = ~0;
+		rates_len = sband->n_bitrates;
+	}
+
+	skb = alloc_skb(local->hw.extra_tx_headroom +
+			sizeof(*mgmt) + /* bit too much but doesn't matter */
+			2 + wk->assoc.ssid_len + /* SSID */
+			4 + rates_len + /* (extended) rates */
+			4 + /* power capability */
+			2 + 2 * sband->n_channels + /* supported channels */
+			2 + sizeof(struct ieee80211_ht_cap) + /* HT */
+			wk->ie_len + /* extra IEs */
+			9, /* WMM */
+			GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
+		       "frame\n", sdata->name);
+		return;
+	}
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	capab = WLAN_CAPABILITY_ESS;
+
+	if (sband->band == IEEE80211_BAND_2GHZ) {
+		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
+			capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME;
+		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_PREAMBLE_INCAPABLE))
+			capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
+	}
+
+	if (wk->assoc.capability & WLAN_CAPABILITY_PRIVACY)
+		capab |= WLAN_CAPABILITY_PRIVACY;
+
+	if ((wk->assoc.capability & WLAN_CAPABILITY_SPECTRUM_MGMT) &&
+	    (local->hw.flags & IEEE80211_HW_SPECTRUM_MGMT))
+		capab |= WLAN_CAPABILITY_SPECTRUM_MGMT;
+
+	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
+	memset(mgmt, 0, 24);
+	memcpy(mgmt->da, wk->filter_ta, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, wk->filter_ta, ETH_ALEN);
+
+	if (!is_zero_ether_addr(wk->assoc.prev_bssid)) {
+		skb_put(skb, 10); /* capab + listen interval + current AP */
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_REASSOC_REQ);
+		mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
+		mgmt->u.reassoc_req.listen_interval =
+				cpu_to_le16(local->hw.conf.listen_interval);
+		memcpy(mgmt->u.reassoc_req.current_ap, wk->assoc.prev_bssid,
+		       ETH_ALEN);
+	} else {
+		skb_put(skb, 4); /* capab + listen interval */
+		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+						  IEEE80211_STYPE_ASSOC_REQ);
+		mgmt->u.assoc_req.capab_info = cpu_to_le16(capab);
+		mgmt->u.assoc_req.listen_interval =
+				cpu_to_le16(local->hw.conf.listen_interval);
+	}
+
+	/* SSID */
+	pos = skb_put(skb, 2 + wk->assoc.ssid_len);
+	*pos++ = WLAN_EID_SSID;
+	*pos++ = wk->assoc.ssid_len;
+	memcpy(pos, wk->assoc.ssid, wk->assoc.ssid_len);
+
+	/* add all rates which were marked to be used above */
+	supp_rates_len = rates_len;
+	if (supp_rates_len > 8)
+		supp_rates_len = 8;
+
+	pos = skb_put(skb, supp_rates_len + 2);
+	*pos++ = WLAN_EID_SUPP_RATES;
+	*pos++ = supp_rates_len;
+
+	count = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if (BIT(i) & rates) {
+			int rate = sband->bitrates[i].bitrate;
+			*pos++ = (u8) (rate / 5);
+			if (++count == 8)
+				break;
+		}
+	}
+
+	if (rates_len > count) { /* leftovers go into the extended rates IE */
+		pos = skb_put(skb, rates_len - count + 2);
+		*pos++ = WLAN_EID_EXT_SUPP_RATES;
+		*pos++ = rates_len - count;
+
+		for (i++; i < sband->n_bitrates; i++) {
+			if (BIT(i) & rates) {
+				int rate = sband->bitrates[i].bitrate;
+				*pos++ = (u8) (rate / 5);
+			}
+		}
+	}
+
+	if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) {
+		/* 1. power capabilities */
+		pos = skb_put(skb, 4);
+		*pos++ = WLAN_EID_PWR_CAPABILITY;
+		*pos++ = 2;
+		*pos++ = 0; /* min tx power */
+		*pos++ = wk->chan->max_power; /* max tx power */
+
+		/* 2. supported channels */
+		/* TODO: get this in reg domain format */
+		pos = skb_put(skb, 2 * sband->n_channels + 2);
+		*pos++ = WLAN_EID_SUPPORTED_CHANNELS;
+		*pos++ = 2 * sband->n_channels;
+		for (i = 0; i < sband->n_channels; i++) {
+			*pos++ = ieee80211_frequency_to_channel(
+					sband->channels[i].center_freq);
+			*pos++ = 1; /* one channel in the subband*/
+		}
+	}
+
+	/* if present, add any custom IEs that go before HT */
+	if (wk->ie_len && wk->ie) {
+		static const u8 before_ht[] = {
+			WLAN_EID_SSID,
+			WLAN_EID_SUPP_RATES,
+			WLAN_EID_EXT_SUPP_RATES,
+			WLAN_EID_PWR_CAPABILITY,
+			WLAN_EID_SUPPORTED_CHANNELS,
+			WLAN_EID_RSN,
+			WLAN_EID_QOS_CAPA,
+			WLAN_EID_RRM_ENABLED_CAPABILITIES,
+			WLAN_EID_MOBILITY_DOMAIN,
+			WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
+		};
+		noffset = ieee80211_ie_split(wk->ie, wk->ie_len,
+					     before_ht, ARRAY_SIZE(before_ht),
+					     offset);
+		pos = skb_put(skb, noffset - offset);
+		memcpy(pos, wk->ie + offset, noffset - offset);
+		offset = noffset;	/* later copies resume from here */
+	}
+
+	if (wk->assoc.use_11n && wk->assoc.wmm_used &&
+	    local->hw.queues >= 4)
+		ieee80211_add_ht_ie(skb, wk->assoc.ht_information_ie,
+				    sband, wk->chan, wk->assoc.smps);
+
+	/* if present, add any custom non-vendor IEs that go after HT */
+	if (wk->ie_len && wk->ie) {
+		noffset = ieee80211_ie_split_vendor(wk->ie, wk->ie_len,
+						    offset);
+		pos = skb_put(skb, noffset - offset);
+		memcpy(pos, wk->ie + offset, noffset - offset);
+		offset = noffset;
+	}
+
+	if (wk->assoc.wmm_used && local->hw.queues >= 4) {
+		if (wk->assoc.uapsd_used) {
+			qos_info = local->uapsd_queues;
+			qos_info |= (local->uapsd_max_sp_len <<
+				     IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT);
+		} else {
+			qos_info = 0;
+		}
+
+		pos = skb_put(skb, 9);
+		*pos++ = WLAN_EID_VENDOR_SPECIFIC;
+		*pos++ = 7; /* len */
+		*pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
+		*pos++ = 0x50;
+		*pos++ = 0xf2;
+		*pos++ = 2; /* WME */
+		*pos++ = 0; /* WME info */
+		*pos++ = 1; /* WME ver */
+		*pos++ = qos_info;
+	}
+
+	/* add any remaining custom (i.e. vendor specific here) IEs */
+	if (wk->ie_len && wk->ie) {
+		noffset = wk->ie_len;
+		pos = skb_put(skb, noffset - offset);
+		memcpy(pos, wk->ie + offset, noffset - offset);
+	}
+
+	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	ieee80211_tx_skb(sdata, skb);
+}
+
+static void ieee80211_remove_auth_bss(struct ieee80211_local *local,
+				      struct ieee80211_work *wk)
+{
+	struct cfg80211_bss *cbss;
+	u16 capa_val = WLAN_CAPABILITY_ESS;
+
+	if (wk->probe_auth.privacy)
+		capa_val |= WLAN_CAPABILITY_PRIVACY;
+
+	cbss = cfg80211_get_bss(local->hw.wiphy, wk->chan, wk->filter_ta,
+				wk->probe_auth.ssid, wk->probe_auth.ssid_len,
+				WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_PRIVACY,
+				capa_val);
+	if (!cbss)
+		return;	/* no matching BSS entry: nothing to unlink */
+
+	cfg80211_unlink_bss(local->hw.wiphy, cbss);
+	cfg80211_put_bss(cbss);	/* drop the reference taken by get_bss */
+}
+
+static enum work_action __must_check
+ieee80211_direct_probe(struct ieee80211_work *wk)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_local *local = sdata->local;
+
+	wk->probe_auth.tries++;
+	if (wk->probe_auth.tries > IEEE80211_AUTH_MAX_TRIES) {
+		printk(KERN_DEBUG "%s: direct probe to %pM timed out\n",
+		       sdata->name, wk->filter_ta);
+
+		/*
+		 * Most likely AP is not in the range so remove the
+		 * bss struct for that AP.
+		 */
+		ieee80211_remove_auth_bss(local, wk);
+
+		return WORK_ACT_TIMEOUT;	/* all tries used up */
+	}
+
+	printk(KERN_DEBUG "%s: direct probe to %pM (try %d/%i)\n",
+	       sdata->name, wk->filter_ta, wk->probe_auth.tries,
+	       IEEE80211_AUTH_MAX_TRIES);
+
+	/*
+	 * Direct probe is sent to broadcast address as some APs
+	 * will not answer to direct packet in unassociated state.
+	 */
+	ieee80211_send_probe_req(sdata, NULL, wk->probe_auth.ssid,
+				 wk->probe_auth.ssid_len, NULL, 0);
+
+	wk->timeout = jiffies + IEEE80211_AUTH_TIMEOUT;
+	run_again(local, wk->timeout);	/* arm the timer for this try */
+
+	return WORK_ACT_NONE;
+}
+
+
+static enum work_action __must_check
+ieee80211_authenticate(struct ieee80211_work *wk)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_local *local = sdata->local;
+
+	wk->probe_auth.tries++;
+	if (wk->probe_auth.tries > IEEE80211_AUTH_MAX_TRIES) {
+		printk(KERN_DEBUG "%s: authentication with %pM"
+		       " timed out\n", sdata->name, wk->filter_ta);
+
+		/*
+		 * Most likely AP is not in the range so remove the
+		 * bss struct for that AP.
+		 */
+		ieee80211_remove_auth_bss(local, wk);
+
+		return WORK_ACT_TIMEOUT;	/* all tries used up */
+	}
+
+	printk(KERN_DEBUG "%s: authenticate with %pM (try %d)\n",
+	       sdata->name, wk->filter_ta, wk->probe_auth.tries);
+
+	ieee80211_send_auth(sdata, 1, wk->probe_auth.algorithm, wk->ie,
+			    wk->ie_len, wk->filter_ta, NULL, 0, 0);
+	wk->probe_auth.transaction = 2;	/* transaction expected in reply */
+
+	wk->timeout = jiffies + IEEE80211_AUTH_TIMEOUT;
+	run_again(local, wk->timeout);	/* arm the timer for this try */
+
+	return WORK_ACT_NONE;
+}
+
+static enum work_action __must_check
+ieee80211_associate(struct ieee80211_work *wk)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_local *local = sdata->local;
+
+	wk->assoc.tries++;
+	if (wk->assoc.tries > IEEE80211_ASSOC_MAX_TRIES) {
+		printk(KERN_DEBUG "%s: association with %pM"
+		       " timed out\n",
+		       sdata->name, wk->filter_ta);
+
+		/*
+		 * Most likely AP is not in the range so remove the
+		 * bss struct for that AP.
+		 */
+		if (wk->assoc.bss)
+			cfg80211_unlink_bss(local->hw.wiphy, wk->assoc.bss);
+
+		return WORK_ACT_TIMEOUT;	/* all tries used up */
+	}
+
+	printk(KERN_DEBUG "%s: associate with %pM (try %d)\n",
+	       sdata->name, wk->filter_ta, wk->assoc.tries);
+	ieee80211_send_assoc(sdata, wk);
+
+	wk->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT;
+	run_again(local, wk->timeout);	/* arm the timer for this try */
+
+	return WORK_ACT_NONE;
+}
+
+static enum work_action __must_check
+ieee80211_remain_on_channel_timeout(struct ieee80211_work *wk)
+{
+	/* not the first run: report the timeout */
+	if (wk->started)
+		return WORK_ACT_TIMEOUT;
+
+	/*
+	 * First run: the generic code will have switched to the right
+	 * channel etc., so just arm the timeout and notify cfg80211.
+	 */
+	wk->timeout = jiffies + msecs_to_jiffies(wk->remain.duration);
+
+	cfg80211_ready_on_channel(wk->sdata->dev, (unsigned long) wk,
+				  wk->chan, wk->chan_type,
+				  wk->remain.duration, GFP_KERNEL);
+
+	return WORK_ACT_NONE;
+}
+
+static enum work_action __must_check
+ieee80211_offchannel_tx(struct ieee80211_work *wk)
+{
+	if (!wk->started) {	/* first run: transmit and start waiting */
+		wk->timeout = jiffies + msecs_to_jiffies(wk->offchan_tx.wait);
+
+		/*
+		 * After this, offchan_tx.frame remains but now is no
+		 * longer a valid pointer -- we still need it as the
+		 * cookie for canceling this work/status matching.
+		 */
+		ieee80211_tx_skb(wk->sdata, wk->offchan_tx.frame);
+
+		return WORK_ACT_NONE;
+	}
+
+	return WORK_ACT_TIMEOUT;	/* any later run reports the timeout */
+}
+
+static enum work_action __must_check
+ieee80211_assoc_beacon_wait(struct ieee80211_work *wk)
+{
+	if (wk->started)	/* any run after the first reports the timeout */
+		return WORK_ACT_TIMEOUT;
+
+	/*
+	 * Wait up to one beacon interval ...
+	 * should this be more if we miss one?
+	 */
+	printk(KERN_DEBUG "%s: waiting for beacon from %pM\n",
+	       wk->sdata->name, wk->filter_ta);
+	wk->timeout = TU_TO_EXP_TIME(wk->assoc.bss->beacon_interval);
+	return WORK_ACT_NONE;
+}
+
+/* Answer a shared-key challenge with auth frame 3, then expect frame 4. */
+static void ieee80211_auth_challenge(struct ieee80211_work *wk,
+				     struct ieee80211_mgmt *mgmt,
+				     size_t len)
+{
+	u8 *ies = mgmt->u.auth.variable;
+	struct ieee802_11_elems elems;
+
+	ieee802_11_parse_elems(ies, len - (ies - (u8 *) mgmt), &elems);
+	if (!elems.challenge)
+		return;
+	/* -2/+2: presumably to send the 2-byte element header along */
+	ieee80211_send_auth(wk->sdata, 3, wk->probe_auth.algorithm,
+			    elems.challenge - 2, elems.challenge_len + 2,
+			    wk->filter_ta, wk->probe_auth.key,
+			    wk->probe_auth.key_len, wk->probe_auth.key_idx);
+	wk->probe_auth.transaction = 4;
+}
+
+/* Handle an Authentication frame for @wk; WORK_ACT_DONE ends the item. */
+static enum work_action __must_check
+ieee80211_rx_mgmt_auth(struct ieee80211_work *wk,
+		       struct ieee80211_mgmt *mgmt, size_t len)
+{
+	u16 alg, transaction, status;
+
+	if (wk->type != IEEE80211_WORK_AUTH)
+		return WORK_ACT_MISMATCH;
+	if (len < 24 + 6)
+		return WORK_ACT_NONE;
+
+	alg = le16_to_cpu(mgmt->u.auth.auth_alg);
+	transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
+	status = le16_to_cpu(mgmt->u.auth.status_code);
+
+	/* not the algorithm or step we are waiting for: ignore it */
+	if (alg != wk->probe_auth.algorithm ||
+	    transaction != wk->probe_auth.transaction)
+		return WORK_ACT_NONE;
+
+	if (status != WLAN_STATUS_SUCCESS) {
+		printk(KERN_DEBUG "%s: %pM denied authentication (status %d)\n",
+		       wk->sdata->name, mgmt->sa, status);
+		return WORK_ACT_DONE;
+	}
+
+	switch (wk->probe_auth.algorithm) {
+	case WLAN_AUTH_SHARED_KEY:
+		if (wk->probe_auth.transaction == 4)
+			break;
+		/* challenge step: reply and wait for another frame */
+		ieee80211_auth_challenge(wk, mgmt, len);
+		return WORK_ACT_NONE;
+	case WLAN_AUTH_OPEN:
+	case WLAN_AUTH_LEAP:
+	case WLAN_AUTH_FT:
+		break;
+	default:
+		WARN_ON(1);
+		return WORK_ACT_NONE;
+	}
+
+	printk(KERN_DEBUG "%s: authenticated\n", wk->sdata->name);
+	return WORK_ACT_DONE;
+}
+
+/*
+ * Handle an (Re)AssocResp for @wk; both frame types share one layout.
+ * A temporary rejection with comeback time only re-arms the timeout.
+ */
+static enum work_action __must_check
+ieee80211_rx_mgmt_assoc_resp(struct ieee80211_work *wk,
+			     struct ieee80211_mgmt *mgmt, size_t len,
+			     bool reassoc)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_local *local = sdata->local;
+	u16 capab_info, status_code, aid;
+	struct ieee802_11_elems elems;
+	u8 *ies = mgmt->u.assoc_resp.variable;
+
+	if (wk->type != IEEE80211_WORK_ASSOC)
+		return WORK_ACT_MISMATCH;
+
+	if (len < 24 + 6)
+		return WORK_ACT_NONE;
+
+	capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
+	status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code);
+	aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
+
+	/* the two top AID bits are masked off for the log line only */
+	printk(KERN_DEBUG "%s: RX %sssocResp from %pM (capab=0x%x "
+	       "status=%d aid=%d)\n",
+	       sdata->name, reassoc ? "Rea" : "A", mgmt->sa,
+	       capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14))));
+
+	ieee802_11_parse_elems(ies, len - (ies - (u8 *) mgmt), &elems);
+
+	if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY &&
+	    elems.timeout_int && elems.timeout_int_len == 5 &&
+	    elems.timeout_int[0] == WLAN_TIMEOUT_ASSOC_COMEBACK) {
+		u32 tu = get_unaligned_le32(elems.timeout_int + 1);
+		u32 ms = tu * 1024 / 1000;
+
+		printk(KERN_DEBUG "%s: %pM rejected association temporarily; "
+		       "comeback duration %u TU (%u ms)\n",
+		       sdata->name, mgmt->sa, tu, ms);
+		wk->timeout = jiffies + msecs_to_jiffies(ms);
+		/* NOTE(review): ms vs. a jiffies constant -- confirm units */
+		if (ms > IEEE80211_ASSOC_TIMEOUT)
+			run_again(local, wk->timeout);
+		return WORK_ACT_NONE;
+	}
+
+	if (status_code == WLAN_STATUS_SUCCESS)
+		printk(KERN_DEBUG "%s: associated\n", sdata->name);
+	else
+		printk(KERN_DEBUG "%s: %pM denied association (code=%d)\n",
+		       sdata->name, mgmt->sa, status_code);
+
+	return WORK_ACT_DONE;
+}
+
+/* A probe response of sane length completes a DIRECT_PROBE work item. */
+static enum work_action __must_check
+ieee80211_rx_mgmt_probe_resp(struct ieee80211_work *wk,
+			     struct ieee80211_mgmt *mgmt, size_t len,
+			     struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_local *local = sdata->local;
+	size_t fixed_len = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
+
+	ASSERT_WORK_MTX(local);
+
+	if (wk->type != IEEE80211_WORK_DIRECT_PROBE)
+		return WORK_ACT_MISMATCH;
+
+	/* too short for the fixed probe response fields: ignore it */
+	if (len < 24 + 12)
+		return WORK_ACT_NONE;
+	if (fixed_len > len)
+		return WORK_ACT_NONE;
+
+	printk(KERN_DEBUG "%s: direct probe responded\n", sdata->name);
+	return WORK_ACT_DONE;
+}
+
+/* Any long-enough beacon completes an ASSOC_BEACON_WAIT work item. */
+static enum work_action __must_check
+ieee80211_rx_mgmt_beacon(struct ieee80211_work *wk,
+			 struct ieee80211_mgmt *mgmt, size_t len)
+{
+	struct ieee80211_local *local = wk->sdata->local;
+
+	ASSERT_WORK_MTX(local);
+
+	if (wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT)
+		return WORK_ACT_MISMATCH;
+
+	if (len < 24 + 12)
+		return WORK_ACT_NONE;
+
+	printk(KERN_DEBUG "%s: beacon received\n", wk->sdata->name);
+	return WORK_ACT_DONE;
+}
+
+/*
+ * Hand one queued management frame to the work item it belongs to and, if
+ * that completes the item, run its done() callback. Always frees @skb.
+ */
+static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
+					  struct sk_buff *skb)
+{
+	struct ieee80211_rx_status *rx_status;
+	struct ieee80211_mgmt *mgmt;
+	struct ieee80211_work *wk;
+	enum work_action rma = WORK_ACT_NONE;
+	u16 fc;
+
+	rx_status = (struct ieee80211_rx_status *) skb->cb;
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+	fc = le16_to_cpu(mgmt->frame_control);
+
+	mutex_lock(&local->mtx);
+
+	list_for_each_entry(wk, &local->work_list, list) {
+		const u8 *bssid = NULL;
+
+		switch (wk->type) {
+		case IEEE80211_WORK_DIRECT_PROBE:
+		case IEEE80211_WORK_AUTH:
+		case IEEE80211_WORK_ASSOC:
+		case IEEE80211_WORK_ASSOC_BEACON_WAIT:
+			bssid = wk->filter_ta;
+			break;
+		default:
+			continue;
+		}
+
+		/* Before queuing, we already verified mgmt->sa,
+		 * so this is needed just for matching. */
+		if (compare_ether_addr(bssid, mgmt->bssid))
+			continue;
+
+		switch (fc & IEEE80211_FCTL_STYPE) {
+		case IEEE80211_STYPE_BEACON:
+			rma = ieee80211_rx_mgmt_beacon(wk, mgmt, skb->len);
+			break;
+		case IEEE80211_STYPE_PROBE_RESP:
+			rma = ieee80211_rx_mgmt_probe_resp(wk, mgmt, skb->len,
+							   rx_status);
+			break;
+		case IEEE80211_STYPE_AUTH:
+			rma = ieee80211_rx_mgmt_auth(wk, mgmt, skb->len);
+			break;
+		case IEEE80211_STYPE_ASSOC_RESP:
+			rma = ieee80211_rx_mgmt_assoc_resp(wk, mgmt,
+							   skb->len, false);
+			break;
+		case IEEE80211_STYPE_REASSOC_RESP:
+			rma = ieee80211_rx_mgmt_assoc_resp(wk, mgmt,
+							   skb->len, true);
+			break;
+		default:
+			WARN_ON(1);
+			rma = WORK_ACT_NONE;
+		}
+
+		/* We've either received an unexpected frame, or we have
+		 * multiple work items and need to match the frame to the
+		 * right one. */
+		if (rma == WORK_ACT_MISMATCH)
+			continue;
+
+		/*
+		 * We've processed this frame for that work, so it can't
+		 * belong to another work struct.
+		 * NB: stopping here also keeps 'wk' paired with 'rma'!
+		 */
+		break;
+	}
+
+	switch (rma) {
+	case WORK_ACT_MISMATCH:
+		/* ignore this unmatched frame */
+		break;
+	case WORK_ACT_NONE:
+		break;
+	case WORK_ACT_DONE:
+		list_del_rcu(&wk->list);	/* done() runs unlocked below */
+		break;
+	default:
+		WARN(1, "unexpected: %d", rma);
+	}
+
+	mutex_unlock(&local->mtx);
+
+	if (rma != WORK_ACT_DONE)
+		goto out;
+
+	switch (wk->done(wk, skb)) {
+	case WORK_DONE_DESTROY:
+		free_work(wk);
+		break;
+	case WORK_DONE_REQUEUE:
+		synchronize_rcu();
+		wk->started = false; /* restart */
+		mutex_lock(&local->mtx);
+		list_add_tail(&wk->list, &local->work_list);
+		mutex_unlock(&local->mtx);
+	}
+
+ out:
+	kfree_skb(skb);
+}
+
+/* Can work that needs @wk_ct run while the channel type is @oper_ct? */
+static bool ieee80211_work_ct_coexists(enum nl80211_channel_type wk_ct,
+				       enum nl80211_channel_type oper_ct)
+{
+	switch (wk_ct) {
+	case NL80211_CHAN_NO_HT:
+		return true;
+	case NL80211_CHAN_HT20:
+		return oper_ct != NL80211_CHAN_NO_HT; /* any HT type will do */
+	case NL80211_CHAN_HT40MINUS:
+	case NL80211_CHAN_HT40PLUS:
+		/* HT40 has to match exactly */
+		return oper_ct == wk_ct;
+	}
+	WARN_ON(1); /* shouldn't get here */
+	return false;
+}
+
+/* Pick the channel type to use when work and operation share a channel. */
+static enum nl80211_channel_type
+ieee80211_calc_ct(enum nl80211_channel_type wk_ct,
+		  enum nl80211_channel_type oper_ct)
+{
+	switch (wk_ct) {
+	case NL80211_CHAN_NO_HT:
+		return oper_ct;
+	case NL80211_CHAN_HT20:
+		/* keep whatever HT type is configured, else settle for HT20 */
+		return oper_ct == NL80211_CHAN_NO_HT ? wk_ct : oper_ct;
+	case NL80211_CHAN_HT40MINUS:
+	case NL80211_CHAN_HT40PLUS:
+		return wk_ct;
+	}
+	WARN_ON(1); /* shouldn't get here */
+	return wk_ct;
+}
+
+
+/* Timer callback (@data is the ieee80211_local): kick the work handler
+ * unless local->quiescing is set. */
+static void ieee80211_work_timer(unsigned long data)
+{
+	struct ieee80211_local *local = (void *) data;
+
+	if (!local->quiescing)
+		ieee80211_queue_work(&local->hw, &local->work_work);
+}
+
+/*
+ * Work handler: dispatches the queued management frames first, then
+ * starts or times out the items on local->work_list under local->mtx,
+ * and finally calls done() for, and frees, every item that timed out.
+ */
+static void ieee80211_work_work(struct work_struct *work)
+{
+	struct ieee80211_local *local =
+		container_of(work, struct ieee80211_local, work_work);
+	struct sk_buff *skb;
+	struct ieee80211_work *wk, *tmp;
+	LIST_HEAD(free_work);
+	enum work_action rma;
+	bool remain_off_channel = false;
+
+	if (local->scanning)
+		return;
+
+	/* ieee80211_queue_work() should have picked up most cases,
+	 * here we'll pick the rest. */
+	if (WARN(local->suspended, "work scheduled while going to suspend\n"))
+		return;
+
+	/* first process frames to avoid timing out while a frame is pending */
+	while ((skb = skb_dequeue(&local->work_skb_queue)))
+		ieee80211_work_rx_queued_mgmt(local, skb);
+
+	mutex_lock(&local->mtx);
+
+	ieee80211_recalc_idle(local);
+
+	list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
+		bool started = wk->started;
+
+		/* mark work as started if it's on the current off-channel */
+		if (!started && local->tmp_channel &&
+		    wk->chan == local->tmp_channel &&
+		    wk->chan_type == local->tmp_channel_type) {
+			started = true;
+			wk->timeout = jiffies;
+		}
+
+		if (!started && !local->tmp_channel) {
+			bool on_oper_chan;
+			bool tmp_chan_changed = false;
+			bool on_oper_chan2;
+			enum nl80211_channel_type wk_ct;
+			on_oper_chan = ieee80211_cfg_on_oper_channel(local);
+
+			/* Work with existing channel type if possible. */
+			wk_ct = wk->chan_type;
+			if (wk->chan == local->hw.conf.channel)
+				wk_ct = ieee80211_calc_ct(wk->chan_type,
+						local->hw.conf.channel_type);
+
+			/* NOTE(review): tmp_channel was just tested NULL for
+			 * this branch, so this looks unreachable -- confirm. */
+			if (local->tmp_channel)
+				if ((local->tmp_channel != wk->chan) ||
+				    (local->tmp_channel_type != wk_ct))
+					tmp_chan_changed = true;
+
+			local->tmp_channel = wk->chan;
+			local->tmp_channel_type = wk_ct;
+			/* Leave the station vifs in awake mode if they
+			 * happen to be on the same channel as
+			 * the requested channel. */
+			on_oper_chan2 = ieee80211_cfg_on_oper_channel(local);
+			if (on_oper_chan != on_oper_chan2) {
+				if (on_oper_chan2) {
+					/* going off oper channel, PS too */
+					ieee80211_offchannel_stop_vifs(local,
+								       true);
+					ieee80211_hw_config(local, 0);
+				} else {
+					/* going on channel, but leave PS
+					 * off-channel. */
+					ieee80211_hw_config(local, 0);
+					ieee80211_offchannel_return(local,
+								    true,
+								    false);
+				}
+			} else if (tmp_chan_changed)
+				/* Still off-channel, but on some other
+				 * channel, so update hardware.
+				 * PS should already be off-channel.
+				 */
+				ieee80211_hw_config(local, 0);
+
+			started = true;
+			wk->timeout = jiffies;
+		}
+
+		/* don't try to work with items that aren't started */
+		if (!started)
+			continue;
+
+		if (time_is_after_jiffies(wk->timeout)) {
+			/* This work item isn't supposed to be worked on
+			 * right now, but take care to adjust the timer
+			 * properly. */
+			run_again(local, wk->timeout);
+			continue;
+		}
+
+		switch (wk->type) {
+		default:
+			WARN_ON(1);
+			/* nothing */
+			rma = WORK_ACT_NONE;
+			break;
+		case IEEE80211_WORK_ABORT:
+			rma = WORK_ACT_TIMEOUT;
+			break;
+		case IEEE80211_WORK_DIRECT_PROBE:
+			rma = ieee80211_direct_probe(wk);
+			break;
+		case IEEE80211_WORK_AUTH:
+			rma = ieee80211_authenticate(wk);
+			break;
+		case IEEE80211_WORK_ASSOC:
+			rma = ieee80211_associate(wk);
+			break;
+		case IEEE80211_WORK_REMAIN_ON_CHANNEL:
+			rma = ieee80211_remain_on_channel_timeout(wk);
+			break;
+		case IEEE80211_WORK_OFFCHANNEL_TX:
+			rma = ieee80211_offchannel_tx(wk);
+			break;
+		case IEEE80211_WORK_ASSOC_BEACON_WAIT:
+			rma = ieee80211_assoc_beacon_wait(wk);
+			break;
+		}
+
+		wk->started = started;
+
+		switch (rma) {
+		case WORK_ACT_NONE:
+			/* might have changed the timeout */
+			run_again(local, wk->timeout);
+			break;
+		case WORK_ACT_TIMEOUT:
+			list_del_rcu(&wk->list);
+			synchronize_rcu();
+			list_add(&wk->list, &free_work);
+			break;
+		default:
+			WARN(1, "unexpected: %d", rma);
+		}
+	}
+
+	list_for_each_entry(wk, &local->work_list, list) {
+		if (!wk->started)
+			continue;
+		if (wk->chan != local->tmp_channel)
+			continue;
+		if (!ieee80211_work_ct_coexists(wk->chan_type,
+						local->tmp_channel_type))
+			continue;
+		remain_off_channel = true;
+	}
+
+	if (!remain_off_channel && local->tmp_channel) {
+		local->tmp_channel = NULL;
+		/* If tmp_channel wasn't the operating channel, then
+		 * we need to go back on-channel.
+		 * NOTE:  If we can ever be here while scanning, or if the
+		 * hw_config() channel config logic changes, then we may
+		 * need to do a more thorough check to see if we still need
+		 * to do a hardware config.  Currently, we cannot be here
+		 * while scanning, however. */
+		if (!ieee80211_cfg_on_oper_channel(local))
+			ieee80211_hw_config(local, 0);
+
+		/* At the least, we need to disable offchannel_ps,
+		 * so just go ahead and run the entire offchannel
+		 * return logic here.  We *could* skip enabling
+		 * beaconing if we were already on-oper-channel
+		 * as a future optimization.
+		 */
+		ieee80211_offchannel_return(local, true, true);
+
+		/* give connection some time to breathe */
+		run_again(local, jiffies + HZ/2);
+	}
+
+	if (list_empty(&local->work_list) && local->scan_req &&
+	    !local->scanning)
+		ieee80211_queue_delayed_work(&local->hw,
+					     &local->scan_work,
+					     round_jiffies_relative(0));
+
+	ieee80211_recalc_idle(local);
+
+	mutex_unlock(&local->mtx);
+
+	list_for_each_entry_safe(wk, tmp, &free_work, list) {
+		wk->done(wk, NULL);
+		list_del(&wk->list);
+		kfree(wk);
+	}
+}
+
+/*
+ * Queue @wk on its interface's work list and schedule the work handler.
+ * Incomplete items (no channel, sdata or done callback) and items for an
+ * interface that is not running trigger a warning and are not queued.
+ * A rejected @wk is left untouched; nothing here frees it.
+ */
+void ieee80211_add_work(struct ieee80211_work *wk)
+{
+	struct ieee80211_local *local;
+
+	if (WARN_ON(!wk->chan) ||
+	    WARN_ON(!wk->sdata) ||
+	    WARN_ON(!wk->done) ||
+	    WARN_ON(!ieee80211_sdata_running(wk->sdata)))
+		return;
+
+	local = wk->sdata->local;
+	wk->started = false;
+
+	mutex_lock(&local->mtx);
+	list_add_tail(&wk->list, &local->work_list);
+	mutex_unlock(&local->mtx);
+
+	ieee80211_queue_work(&local->hw, &local->work_work);
+}
+
+/* One-time setup of @local's work list, frame queue, handler and timer. */
+void ieee80211_work_init(struct ieee80211_local *local)
+{
+	INIT_LIST_HEAD(&local->work_list);
+	skb_queue_head_init(&local->work_skb_queue);
+	INIT_WORK(&local->work_work, ieee80211_work_work);
+	setup_timer(&local->work_timer, ieee80211_work_timer, (unsigned long)local);
+}
+
+/* Abort all of @sdata's work items via the work handler and warn if any
+ * of them is still queued afterwards. */
+void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_work *wk;
+	bool cleanup = false;
+
+	mutex_lock(&local->mtx);
+	list_for_each_entry(wk, &local->work_list, list) {
+		if (wk->sdata == sdata) {
+			wk->type = IEEE80211_WORK_ABORT;
+			wk->started = true;
+			wk->timeout = jiffies;
+			cleanup = true;
+		}
+	}
+	mutex_unlock(&local->mtx);
+
+	/* run cleanups etc. */
+	if (cleanup)
+		ieee80211_work_work(&local->work_work);
+
+	mutex_lock(&local->mtx);
+	list_for_each_entry(wk, &local->work_list, list) {
+		if (WARN_ON(wk->sdata == sdata))
+			break;
+	}
+	mutex_unlock(&local->mtx);
+}
+
+/* Queue frames that match one of @sdata's work items for the handler. */
+ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata,
+					   struct sk_buff *skb)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct ieee80211_work *wk;
+
+	if (skb->len < 24)
+		return RX_DROP_MONITOR;
+	mgmt = (struct ieee80211_mgmt *) skb->data;
+
+	switch (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) {
+	case IEEE80211_STYPE_AUTH:
+	case IEEE80211_STYPE_PROBE_RESP:
+	case IEEE80211_STYPE_ASSOC_RESP:
+	case IEEE80211_STYPE_REASSOC_RESP:
+	case IEEE80211_STYPE_BEACON:
+		break;
+	default:
+		return RX_CONTINUE;
+	}
+
+	list_for_each_entry_rcu(wk, &local->work_list, list) {
+		if (sdata != wk->sdata ||
+		    compare_ether_addr(wk->filter_ta, mgmt->sa) ||
+		    compare_ether_addr(wk->filter_ta, mgmt->bssid))
+			continue;
+
+		skb_queue_tail(&local->work_skb_queue, skb);
+		ieee80211_queue_work(&local->hw, &local->work_work);
+		return RX_QUEUED;
+	}
+
+	return RX_CONTINUE;
+}
+
+/* done() callback: report the end of a remain-on-channel period. */
+static enum work_done_result ieee80211_remain_done(struct ieee80211_work *wk,
+						   struct sk_buff *skb)
+{
+	/* We are done serving the remain-on-channel command; @wk's
+	 * address is passed as the cookie. @skb is not used. */
+	cfg80211_remain_on_channel_expired(wk->sdata->dev, (unsigned long) wk,
+					   wk->chan, wk->chan_type,
+					   GFP_KERNEL);
+
+	return WORK_DONE_DESTROY;
+}
+
+/* Queue a remain-on-channel item for @sdata and hand back its cookie;
+ * returns 0, or -ENOMEM if the item cannot be allocated. */
+int ieee80211_wk_remain_on_channel(struct ieee80211_sub_if_data *sdata,
+				   struct ieee80211_channel *chan,
+				   enum nl80211_channel_type channel_type,
+				   unsigned int duration, u64 *cookie)
+{
+	struct ieee80211_work *wk = kzalloc(sizeof(*wk), GFP_KERNEL);
+
+	if (!wk)
+		return -ENOMEM;
+
+	wk->type = IEEE80211_WORK_REMAIN_ON_CHANNEL;
+	wk->sdata = sdata;
+	wk->chan = chan;
+	wk->chan_type = channel_type;
+	wk->remain.duration = duration;
+	wk->done = ieee80211_remain_done;
+
+	/* the item's address doubles as the cookie */
+	*cookie = (unsigned long) wk;
+	ieee80211_add_work(wk);
+
+	return 0;
+}
+
+/* Expire the work item identified by @cookie now; -ENOENT if not queued. */
+int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
+					  u64 cookie)
+{
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_work *wk, *tmp;
+	int ret = -ENOENT;
+
+	mutex_lock(&local->mtx);
+	list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
+		if ((unsigned long) wk != cookie)
+			continue;
+		wk->timeout = jiffies;
+		ret = 0;
+		break;
+	}
+	mutex_unlock(&local->mtx);
+
+	/* let the work handler notice the expired timeout */
+	if (!ret)
+		ieee80211_queue_work(&local->hw, &local->work_work);
+
+	return ret;
+}
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
new file mode 100644
index 00000000..aa1c40ab
--- /dev/null
+++ b/net/mac80211/wpa.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ * Copyright 2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/compiler.h>
+#include <linux/ieee80211.h>
+#include <linux/gfp.h>
+#include <asm/unaligned.h>
+#include <net/mac80211.h>
+
+#include "ieee80211_i.h"
+#include "michael.h"
+#include "tkip.h"
+#include "aes_ccm.h"
+#include "aes_cmac.h"
+#include "wpa.h"
+
+/* TX handler: append the Michael MIC to TKIP data frames unless the
+ * hardware generates it; a MIC-failure test frame gets a corrupted MIC. */
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb = tx->skb;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	bool mic_test = info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE;
+	u8 *data, *key, *mic;
+	size_t data_len;
+	unsigned int hdrlen;
+	int tail = MICHAEL_MIC_LEN;
+
+	if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
+	    skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control))
+		return TX_CONTINUE;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	if (skb->len < hdrlen)
+		return TX_DROP;
+
+	/* Need to use software crypto for the test */
+	if (unlikely(mic_test))
+		info->control.hw_key = NULL;
+
+	/* hwaccel - with no need for SW-generated MMIC */
+	if (info->control.hw_key &&
+	    !(tx->flags & IEEE80211_TX_FRAGMENTED) &&
+	    !(tx->key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC))
+		return TX_CONTINUE;
+
+	/* software crypto also needs room for the ICV */
+	if (!info->control.hw_key)
+		tail += TKIP_ICV_LEN;
+
+	if (WARN_ON(skb_tailroom(skb) < tail ||
+		    skb_headroom(skb) < TKIP_IV_LEN))
+		return TX_DROP;
+
+	/* measure the payload before skb_put() grows the frame */
+	data = skb->data + hdrlen;
+	data_len = skb->len - hdrlen;
+	key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY];
+	mic = skb_put(skb, MICHAEL_MIC_LEN);
+	michael_mic(key, hdr, data, data_len, mic);
+	if (unlikely(mic_test))
+		mic[0]++;
+
+	return TX_CONTINUE;
+}
+
+
+/*
+ * RX handler: verify and strip the Michael MIC of TKIP data frames, store
+ * the frame's IV for replay detection and report MIC failures.
+ */
+ieee80211_rx_result
+ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
+{
+	u8 *data, *key = NULL;
+	size_t data_len;
+	unsigned int hdrlen;
+	u8 mic[MICHAEL_MIC_LEN];
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	int queue = rx->queue;
+
+	/* otherwise, TKIP is vulnerable to TID 0 vs. non-QoS replays */
+	if (rx->queue == NUM_RX_DATA_QUEUES - 1)
+		queue = 0;
+
+	/* it makes no sense to check for MIC errors on anything other
+	 * than data frames. */
+	if (!ieee80211_is_data_present(hdr->frame_control))
+		return RX_CONTINUE;
+
+	/*
+	 * No way to verify the MIC if the hardware stripped it or
+	 * the IV with the key index. In this case we have to rely solely
+	 * on the driver to set RX_FLAG_MMIC_ERROR in the event of a
+	 * MIC failure report.
+	 */
+	if (status->flag & (RX_FLAG_MMIC_STRIPPED | RX_FLAG_IV_STRIPPED)) {
+		if (status->flag & RX_FLAG_MMIC_ERROR)
+			goto mic_fail;
+
+		if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key)
+			goto update_iv;
+
+		return RX_CONTINUE;
+	}
+
+	/*
+	 * Some hardware seems to generate Michael MIC failure reports even
+	 * though the frame was not encrypted with TKIP and therefore has no
+	 * MIC. Ignore the flag then to avoid triggering countermeasures.
+	 */
+	if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
+	    !(status->flag & RX_FLAG_DECRYPTED))
+		return RX_CONTINUE;
+
+	if (rx->sdata->vif.type == NL80211_IFTYPE_AP && rx->key->conf.keyidx) {
+		/* APs with pairwise keys should never receive Michael MIC
+		 * errors for non-zero keyidx because these are reserved for
+		 * group keys and only the AP is sending real multicast
+		 * frames in the BSS.
+		 */
+		return RX_DROP_UNUSABLE;
+	}
+
+	if (status->flag & RX_FLAG_MMIC_ERROR)
+		goto mic_fail;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	if (skb->len < hdrlen + MICHAEL_MIC_LEN)
+		return RX_DROP_UNUSABLE;
+
+	data = skb->data + hdrlen;
+	data_len = skb->len - hdrlen - MICHAEL_MIC_LEN;
+	key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY];
+	michael_mic(key, hdr, data, data_len, mic);
+	if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0)
+		goto mic_fail;
+
+	/* remove Michael MIC from payload */
+	skb_trim(skb, skb->len - MICHAEL_MIC_LEN);
+
+update_iv:
+	/* update IV in key information to be able to detect replays */
+	rx->key->u.tkip.rx[queue].iv32 = rx->tkip_iv32;
+	rx->key->u.tkip.rx[queue].iv16 = rx->tkip_iv16;
+
+	return RX_CONTINUE;
+
+mic_fail:
+	/* In some cases the key can be unset - e.g. a multicast packet, in
+	 * a driver that supports HW encryption. Send up the key idx only if
+	 * the key is set.
+	 */
+	mac80211_ev_michael_mic_failure(rx->sdata,
+					rx->key ? rx->key->conf.keyidx : -1,
+					(void *) skb->data, NULL, GFP_ATOMIC);
+	return RX_DROP_UNUSABLE;
+}
+
+
+/*
+ * Insert the TKIP IV into one frame and, when there is no hardware key,
+ * encrypt the payload in software. Returns -1 if the skb lacks room,
+ * else 0 or the result of ieee80211_tkip_encrypt_data().
+ */
+static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_key *key = tx->key;
+	unsigned int hdrlen;
+	int len, tail;
+	u8 *pos;
+
+	/* hwaccel - with no need for software-generated IV */
+	if (info->control.hw_key &&
+	    !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+		return 0;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	len = skb->len - hdrlen;
+	tail = info->control.hw_key ? 0 : TKIP_ICV_LEN;
+
+	if (WARN_ON(skb_tailroom(skb) < tail ||
+		    skb_headroom(skb) < TKIP_IV_LEN))
+		return -1;
+
+	/* open a gap for the IV between the 802.11 header and the payload */
+	pos = skb_push(skb, TKIP_IV_LEN);
+	memmove(pos, pos + TKIP_IV_LEN, hdrlen);
+	pos += hdrlen;
+
+	/* Increase IV for the frame */
+	if (++key->u.tkip.tx.iv16 == 0)
+		key->u.tkip.tx.iv32++;
+
+	pos = ieee80211_tkip_add_iv(pos, key, key->u.tkip.tx.iv16);
+
+	/* hwaccel - with software IV */
+	if (info->control.hw_key)
+		return 0;
+
+	/* Add room for ICV */
+	skb_put(skb, TKIP_ICV_LEN);
+
+	hdr = (struct ieee80211_hdr *) skb->data;
+	return ieee80211_tkip_encrypt_data(tx->local->wep_tx_tfm,
+					   key, pos, len, hdr->addr2);
+}
+
+
+/* Apply TKIP to every frame chained from tx->skb; a failure means drop. */
+ieee80211_tx_result
+ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *frag = tx->skb;
+
+	ieee80211_tx_set_protected(tx);
+	while (tkip_encrypt_skb(tx, frag) >= 0) {
+		frag = frag->next;
+		if (!frag)
+			return TX_CONTINUE;
+	}
+
+	return TX_DROP;
+}
+
+/* RX handler: let TKIP check the IV of a data frame (decrypting it unless
+ * the hardware did), then strip the ICV and IV from the frame. */
+ieee80211_rx_result
+ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data;
+	int hdrlen, res, hwaccel = 0;
+	struct ieee80211_key *key = rx->key;
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	int queue = rx->queue;
+
+	/* otherwise, TKIP is vulnerable to TID 0 vs. non-QoS replays */
+	if (rx->queue == NUM_RX_DATA_QUEUES - 1)
+		queue = 0;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	if (!ieee80211_is_data(hdr->frame_control))
+		return RX_CONTINUE;
+
+	/* compare, don't subtract: a frame shorter than its header must
+	 * not wrap the unsigned length around and slip past this check */
+	if (!rx->sta || skb->len < hdrlen + 12)
+		return RX_DROP_UNUSABLE;
+
+	/* Let TKIP code verify IV, but skip decryption.
+	 * In the case where hardware checks the IV as well,
+	 * we don't even get here, see ieee80211_rx_h_decrypt() */
+	if (status->flag & RX_FLAG_DECRYPTED)
+		hwaccel = 1;
+
+	res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm,
+					  key, skb->data + hdrlen,
+					  skb->len - hdrlen, rx->sta->sta.addr,
+					  hdr->addr1, hwaccel, queue,
+					  &rx->tkip_iv32,
+					  &rx->tkip_iv16);
+	if (res != TKIP_DECRYPT_OK)
+		return RX_DROP_UNUSABLE;
+
+	/* Trim ICV */
+	skb_trim(skb, skb->len - TKIP_ICV_LEN);
+	/* Remove IV */
+	memmove(skb->data + TKIP_IV_LEN, skb->data, hdrlen);
+	skb_pull(skb, TKIP_IV_LEN);
+
+	return RX_CONTINUE;
+}
+
+
+/* Build the CCM B_0 block and AAD for @skb in @scratch (B_0 at offset
+ * 3 * AES_BLOCK_LEN, AAD at 4 * AES_BLOCK_LEN), using packet number @pn. */
+static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *scratch,
+				int encrypted)
+{
+	__le16 mask_fc;
+	int a4_included, mgmt;
+	u8 qos_tid;
+	u8 *b_0, *aad;
+	u16 data_len, len_a;
+	unsigned int hdrlen;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+
+	b_0 = scratch + 3 * AES_BLOCK_LEN;
+	aad = scratch + 4 * AES_BLOCK_LEN;
+
+	/* Mask FC: zero subtype b4 b5 b6 (if not mgmt),
+	 * Retry, PwrMgt, MoreData; set Protected */
+	mgmt = ieee80211_is_mgmt(hdr->frame_control);
+	mask_fc = hdr->frame_control;
+	mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY |
+				IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA);
+	if (!mgmt)
+		mask_fc &= ~cpu_to_le16(0x0070);
+	mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	len_a = hdrlen - 2;
+	a4_included = ieee80211_has_a4(hdr->frame_control);
+
+	if (ieee80211_is_data_qos(hdr->frame_control))
+		qos_tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	else
+		qos_tid = 0;
+
+	data_len = skb->len - hdrlen - CCMP_HDR_LEN;
+	if (encrypted)
+		data_len -= CCMP_MIC_LEN;
+
+	/* First block, b_0 */
+	b_0[0] = 0x59; /* flags: Adata: 1, M: 011, L: 001 */
+	/* Nonce: Nonce Flags | A2 | PN
+	 * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7)
+	 */
+	b_0[1] = qos_tid | (mgmt << 4);
+	memcpy(&b_0[2], hdr->addr2, ETH_ALEN);
+	memcpy(&b_0[8], pn, CCMP_PN_LEN);
+	/* l(m) */
+	put_unaligned_be16(data_len, &b_0[14]);
+
+	/* AAD (extra authenticate-only data) / masked 802.11 header
+	 * FC | A1 | A2 | A3 | SC | [A4] | [QC] */
+	put_unaligned_be16(len_a, &aad[0]);
+	put_unaligned(mask_fc, (__le16 *)&aad[2]);
+	memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN);
+
+	/* Mask Seq#, leave Frag# */
+	aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f;
+	aad[23] = 0;
+
+	if (a4_included) {
+		memcpy(&aad[24], hdr->addr4, ETH_ALEN);
+		aad[30] = qos_tid;
+		aad[31] = 0;
+	} else {
+		memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN);
+		aad[24] = qos_tid;
+	}
+}
+
+
+/* Fill the 8-byte CCMP header at @hdr from packet number @pn and @key_id. */
+static inline void ccmp_pn2hdr(u8 *hdr, u8 *pn, int key_id)
+{
+	*hdr++ = pn[5];
+	*hdr++ = pn[4];
+	*hdr++ = 0;
+	*hdr++ = 0x20 | (key_id << 6);
+	*hdr++ = pn[3];
+	*hdr++ = pn[2];
+	*hdr++ = pn[1];
+	*hdr = pn[0];
+}
+
+/* Extract the 6-byte packet number from the CCMP header at @hdr. */
+static inline void ccmp_hdr2pn(u8 *pn, u8 *hdr)
+{
+	*pn++ = hdr[7];
+	*pn++ = hdr[6];
+	*pn++ = hdr[5];
+	*pn++ = hdr[4];
+	*pn++ = hdr[1];
+	*pn = hdr[0];
+}
+
+/* Insert the CCMP header (with the incremented PN) into one frame and, if
+ * there is no hardware key, encrypt it in software. Returns 0, or -1. */
+static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_key *key = tx->key;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	int hdrlen, len, tail;
+	u8 *pos, *pn;
+	int i;
+
+	if (info->control.hw_key &&
+	    !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
+		/* hwaccel has no need for preallocated room for CCMP
+		 * header or MIC fields */
+		return 0;
+	}
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	len = skb->len - hdrlen;
+
+	if (info->control.hw_key)
+		tail = 0;
+	else
+		tail = CCMP_MIC_LEN;
+
+	if (WARN_ON(skb_tailroom(skb) < tail ||
+		    skb_headroom(skb) < CCMP_HDR_LEN))
+		return -1;
+
+	pos = skb_push(skb, CCMP_HDR_LEN);
+	memmove(pos, pos + CCMP_HDR_LEN, hdrlen);
+	hdr = (struct ieee80211_hdr *) pos;
+	pos += hdrlen;
+
+	/* PN = PN + 1, carrying from the last byte towards the first */
+	pn = key->u.ccmp.tx_pn;
+
+	for (i = CCMP_PN_LEN - 1; i >= 0; i--) {
+		pn[i]++;
+		if (pn[i])
+			break;
+	}
+
+	ccmp_pn2hdr(pos, pn, key->conf.keyidx);
+
+	/* hwaccel - with software CCMP header */
+	if (info->control.hw_key)
+		return 0;
+
+	pos += CCMP_HDR_LEN;
+	ccmp_special_blocks(skb, pn, key->u.ccmp.tx_crypto_buf, 0);
+	ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, key->u.ccmp.tx_crypto_buf, pos, len,
+				  pos, skb_put(skb, CCMP_MIC_LEN));
+
+	return 0;
+}
+
+
+/* Apply CCMP to every frame chained from tx->skb; a failure means drop. */
+ieee80211_tx_result
+ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *frag = tx->skb;
+
+	ieee80211_tx_set_protected(tx);
+	while (ccmp_encrypt_skb(tx, frag) >= 0) {
+		frag = frag->next;
+		if (!frag)
+			return TX_CONTINUE;
+	}
+
+	return TX_DROP;
+}
+
+/* RX handler: replay-check, decrypt (unless done in hardware), strip CCMP. */
+ieee80211_rx_result
+ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	int hdrlen;
+	struct ieee80211_key *key = rx->key;
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	u8 pn[CCMP_PN_LEN];
+	int data_len;
+	int queue;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+	if (!ieee80211_is_data(hdr->frame_control) &&
+	    !ieee80211_is_robust_mgmt_frame(hdr))
+		return RX_CONTINUE;
+
+	data_len = skb->len - hdrlen - CCMP_HDR_LEN - CCMP_MIC_LEN;
+	if (!rx->sta || data_len < 0)
+		return RX_DROP_UNUSABLE;
+
+	ccmp_hdr2pn(pn, skb->data + hdrlen);
+
+	queue = ieee80211_is_mgmt(hdr->frame_control) ?
+		NUM_RX_DATA_QUEUES : rx->queue;
+
+	if (memcmp(pn, key->u.ccmp.rx_pn[queue], CCMP_PN_LEN) <= 0) {
+		key->u.ccmp.replays++;	/* PN did not increase */
+		return RX_DROP_UNUSABLE;
+	}
+
+	if (!(status->flag & RX_FLAG_DECRYPTED)) {
+		/* hardware didn't decrypt/verify MIC */
+		ccmp_special_blocks(skb, pn, key->u.ccmp.rx_crypto_buf, 1);
+
+		if (ieee80211_aes_ccm_decrypt(
+			    key->u.ccmp.tfm, key->u.ccmp.rx_crypto_buf,
+			    skb->data + hdrlen + CCMP_HDR_LEN, data_len,
+			    skb->data + skb->len - CCMP_MIC_LEN,
+			    skb->data + hdrlen + CCMP_HDR_LEN))
+			return RX_DROP_UNUSABLE;
+	}
+
+	memcpy(key->u.ccmp.rx_pn[queue], pn, CCMP_PN_LEN);
+
+	/* Remove CCMP header and MIC */
+	skb_trim(skb, skb->len - CCMP_MIC_LEN);
+	memmove(skb->data + CCMP_HDR_LEN, skb->data, hdrlen);
+	skb_pull(skb, CCMP_HDR_LEN);
+
+	return RX_CONTINUE;
+}
+
+/* Build the 20-byte BIP AAD for @skb: FC(masked) || A1 || A2 || A3 */
+static void bip_aad(struct sk_buff *skb, u8 *aad)
+{
+	const u8 *frame = skb->data;
+
+	/* FC type/subtype */
+	aad[0] = frame[0];
+	/* Mask FC Retry, PwrMgt, MoreData flags to zero */
+	aad[1] = frame[1] & ~(BIT(4) | BIT(5) | BIT(6));
+	/* A1 || A2 || A3 */
+	memcpy(aad + 2, frame + 4, 3 * ETH_ALEN);
+}
+
+/* Copy the 6-byte IPN from @s to @d with the byte order reversed
+ * (d[0] = s[5] ... d[5] = s[0]). */
+static inline void bip_ipn_swap(u8 *d, const u8 *s)
+{
+	int i;
+
+	for (i = 0; i < 6; i++)
+		d[i] = s[5 - i];
+}
+
+
+/* TX handler: append an MMIE carrying the next IPN and the AES-128-CMAC
+ * MIC to the frame, unless a hardware key is in use for it. */
+ieee80211_tx_result
+ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb = tx->skb;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_key *key = tx->key;
+	struct ieee80211_mmie *mmie;
+	u8 *pn, aad[20];
+	int i;
+
+	if (info->control.hw_key)
+		return TX_CONTINUE;
+
+	if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
+		return TX_DROP;
+
+	mmie = (struct ieee80211_mmie *) skb_put(skb, sizeof(*mmie));
+	mmie->element_id = WLAN_EID_MMIE;
+	mmie->length = sizeof(*mmie) - 2;
+	mmie->key_id = cpu_to_le16(key->conf.keyidx);
+
+	/* PN = PN + 1 */
+	pn = key->u.aes_cmac.tx_pn;
+
+	for (i = sizeof(key->u.aes_cmac.tx_pn) - 1; i >= 0; i--) {
+		pn[i]++;
+		if (pn[i])
+			break;
+	}
+	bip_ipn_swap(mmie->sequence_number, pn);
+
+	bip_aad(skb, aad);
+
+	/* MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64) */
+	ieee80211_aes_cmac(key->u.aes_cmac.tfm, key->u.aes_cmac.tx_crypto_buf,
+			   aad, skb->data + 24, skb->len - 24, mmie->mic);
+
+	return TX_CONTINUE;
+}
+
+/* RX handler: replay-check the MMIE's IPN, verify its MIC, strip the MMIE. */
+ieee80211_rx_result
+ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_key *key = rx->key;
+	struct ieee80211_mmie *mmie;
+	u8 aad[20], mic[8], ipn[6];
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+
+	if (!ieee80211_is_mgmt(hdr->frame_control))
+		return RX_CONTINUE;
+
+	if (skb->len < 24 + sizeof(*mmie))
+		return RX_DROP_UNUSABLE;
+
+	mmie = (struct ieee80211_mmie *)
+		(skb->data + skb->len - sizeof(*mmie));
+	if (mmie->element_id != WLAN_EID_MMIE ||
+	    mmie->length != sizeof(*mmie) - 2)
+		return RX_DROP_UNUSABLE; /* Invalid MMIE */
+
+	bip_ipn_swap(ipn, mmie->sequence_number);
+
+	if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
+		key->u.aes_cmac.replays++;	/* IPN did not increase */
+		return RX_DROP_UNUSABLE;
+	}
+
+	if (!(status->flag & RX_FLAG_DECRYPTED)) {
+		/* hardware didn't decrypt/verify MIC */
+		bip_aad(skb, aad);
+		ieee80211_aes_cmac(key->u.aes_cmac.tfm,
+				   key->u.aes_cmac.rx_crypto_buf, aad,
+				   skb->data + 24, skb->len - 24, mic);
+		if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) {
+			key->u.aes_cmac.icverrors++;
+			return RX_DROP_UNUSABLE;
+		}
+	}
+
+	memcpy(key->u.aes_cmac.rx_pn, ipn, 6);
+
+	/* Remove MMIE */
+	skb_trim(skb, skb->len - sizeof(*mmie));
+
+	return RX_CONTINUE;
+}
diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h
new file mode 100644
index 00000000..baba0608
--- /dev/null
+++ b/net/mac80211/wpa.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2002-2004, Instant802 Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef WPA_H
+#define WPA_H
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include "ieee80211_i.h"
+/* Michael MIC: TX add / RX verify */
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx);
+/* TKIP: TX encrypt / RX decrypt */
+ieee80211_tx_result
+ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx);
+/* CCMP: TX encrypt / RX decrypt */
+ieee80211_tx_result
+ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx);
+/* AES-128-CMAC (BIP): MMIE add on TX / MMIE check and strip on RX */
+ieee80211_tx_result
+ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx);
+ieee80211_rx_result
+ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx);
+
+#endif /* WPA_H */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 00000000..5bd5c612
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,1130 @@
+menu "Core Netfilter Configuration"
+	depends on NET && INET && NETFILTER
+
+config NETFILTER_NETLINK
+	tristate
+
+config NETFILTER_NETLINK_QUEUE
+	tristate "Netfilter NFQUEUE over NFNETLINK interface"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for queueing packets via NFNETLINK.
+	  
+config NETFILTER_NETLINK_LOG
+	tristate "Netfilter LOG over NFNETLINK interface"
+	default m if NETFILTER_ADVANCED=n
+	select NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for logging packets via NFNETLINK.
+
+	  This obsoletes the existing ipt_ULOG and ebt_ulog mechanisms,
+	  and is also scheduled to replace the old syslog-based ipt_LOG
+	  and ip6t_LOG modules.
+
+config NF_CONNTRACK
+	tristate "Netfilter connection tracking support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections.
+
+	  This is required to do Masquerading or other kinds of Network
+	  Address Translation.  It can also be used to enhance packet
+	  filtering (see `Connection state match support' below).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if NF_CONNTRACK
+
+config NF_CONNTRACK_MARK
+	bool  'Connection mark tracking support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables support for connection marks, used by the
+	  `CONNMARK' target and `connmark' match. Similar to the mark value
+	  of packets, but this mark value is kept in the conntrack session
+	  instead of the individual packets.
+
+config NF_CONNTRACK_SECMARK
+	bool  'Connection tracking security mark support'
+	depends on NETWORK_SECMARK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option enables security markings to be applied to
+	  connections.  Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
+config NF_CONNTRACK_ZONES
+	bool  'Connection tracking zones'
+	depends on NETFILTER_ADVANCED
+	depends on NETFILTER_XT_TARGET_CT
+	help
+	  This option enables support for connection tracking zones.
+	  Normally, each connection needs to have a unique system wide
+	  identity. Connection tracking zones allow to have multiple
+	  connections using the same identity, as long as they are
+	  contained in different zones.
+
+	  If unsure, say `N'.
+
+config NF_CONNTRACK_EVENTS
+	bool "Connection tracking events"
+	depends on NETFILTER_ADVANCED
+	help
+	  If this option is enabled, the connection tracking code will
+	  provide a notifier chain that can be used by other kernel code
+	  to get notified about changes in the connection tracking state.
+
+	  If unsure, say `N'.
+
+config NF_CONNTRACK_TIMESTAMP
+	bool  'Connection tracking timestamping'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables support for connection tracking timestamping.
+	  This allows you to store the flow start-time and to obtain
+	  the flow-stop time (once it has been destroyed) via Connection
+	  tracking events.
+
+	  If unsure, say `N'.
+
+config NF_CT_PROTO_DCCP
+	tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_ADVANCED
+	default IP_DCCP
+	help
+	  With this option enabled, the layer 3 independent connection
+	  tracking code will be able to do state tracking on DCCP connections.
+
+	  If unsure, say 'N'.
+
+config NF_CT_PROTO_GRE
+	tristate
+
+config NF_CT_PROTO_SCTP
+	tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_ADVANCED
+	default IP_SCTP
+	help
+	  With this option enabled, the layer 3 independent connection
+	  tracking code will be able to do state tracking on SCTP connections.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NF_CT_PROTO_UDPLITE
+	tristate 'UDP-Lite protocol connection tracking support'
+	depends on NETFILTER_ADVANCED
+	help
+	  With this option enabled, the layer 3 independent connection
+	  tracking code will be able to do state tracking on UDP-Lite
+	  connections.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_AMANDA
+	tristate "Amanda backup protocol support"
+	depends on NETFILTER_ADVANCED
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	help
+	  If you are running the Amanda backup package <http://www.amanda.org/>
+	  on this machine or machines that will be MASQUERADED through this
+	  machine, then you may want to enable this feature.  This allows the
+	  connection tracking and natting code to allow the sub-channels that
+	  Amanda requires for communication of the backup data, messages and
+	  index.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_FTP
+	tristate "FTP protocol support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Tracking FTP connections is problematic: special helpers are
+	  required for tracking them, and doing masquerading and other forms
+	  of Network Address Translation on them.
+
+	  This is FTP support on Layer 3 independent connection tracking.
+	  Layer 3 independent connection tracking is an experimental scheme
+	  which generalizes ip_conntrack to support other layer 3 protocols.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_H323
+	tristate "H.323 protocol support"
+	depends on (IPV6 || IPV6=n)
+	depends on NETFILTER_ADVANCED
+	help
+	  H.323 is a VoIP signalling protocol from ITU-T. As one of the most
+	  important VoIP protocols, it is widely used by voice hardware and
+	  software including voice gateways, IP phones, Netmeeting, OpenPhone,
+	  Gnomemeeting, etc.
+
+	  With this module you can support H.323 on a connection tracking/NAT
+	  firewall.
+
+	  This module supports RAS, Fast Start, H.245 Tunnelling, Call
+	  Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+	  whiteboard, file transfer, etc. For more information, please
+	  visit http://nath323.sourceforge.net/.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_IRC
+	tristate "IRC protocol support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  There is a commonly-used extension to IRC called
+	  Direct Client-to-Client Protocol (DCC).  This enables users to send
+	  files to each other, and also chat to each other without the need
+	  of a server.  DCC Sending is used anywhere you send files over IRC,
+	  and DCC Chat is most commonly used by Eggdrop bots.  If you are
+	  using NAT, this extension will enable you to send files and initiate
+	  chats.  Note that you do NOT need this extension to get files or
+	  have others initiate chats, or everything else in IRC.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_BROADCAST
+	tristate
+
+config NF_CONNTRACK_NETBIOS_NS
+	tristate "NetBIOS name service protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_BROADCAST
+	help
+	  NetBIOS name service requests are sent as broadcast messages from an
+	  unprivileged port and responded to with unicast messages to the
+	  same port. This makes them hard to firewall properly because connection
+	  tracking doesn't deal with broadcasts. This helper tracks locally
+	  originating NetBIOS name service requests and the corresponding
+	  responses. It relies on correct IP address configuration, specifically
+	  netmask and broadcast address. When properly configured, the output
+	  of "ip address show" should look similar to this:
+
+	  $ ip -4 address show eth0
+	  4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+	      inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_SNMP
+	tristate "SNMP service protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_BROADCAST
+	help
+	  SNMP service requests are sent as broadcast messages from an
+	  unprivileged port and responded to with unicast messages to the
+	  same port. This makes them hard to firewall properly because connection
+	  tracking doesn't deal with broadcasts. This helper tracks locally
+	  originating SNMP service requests and the corresponding
+	  responses. It relies on correct IP address configuration, specifically
+	  netmask and broadcast address.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_PPTP
+	tristate "PPtP protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CT_PROTO_GRE
+	help
+	  This module adds support for PPTP (Point to Point Tunnelling
+	  Protocol, RFC2637) connection tracking and NAT.
+
+	  If you are running PPTP sessions over a stateful firewall or NAT
+	  box, you may want to enable this feature.
+
+	  Please note that not all PPTP modes of operation are supported yet.
+	  Specifically these limitations exist:
+	    - Blindly assumes that control connections are always established
+	      in PNS->PAC direction. This is a violation of RFC2637.
+	    - Only supports a single call within each session
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_SANE
+	tristate "SANE protocol support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on NETFILTER_ADVANCED
+	help
+	  SANE is a protocol for remote access to scanners as implemented
+	  by the 'saned' daemon. Like FTP, it uses separate control and
+	  data connections.
+
+	  With this module you can support SANE on a connection tracking
+	  firewall.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_SIP
+	tristate "SIP protocol support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  SIP is an application-layer control protocol that can establish,
+	  modify, and terminate multimedia sessions (conferences) such as
+	  Internet telephony calls. With the nf_conntrack_sip and
+	  the nf_nat_sip modules you can support the protocol on a connection
+	  tracking/NATing firewall.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CONNTRACK_TFTP
+	tristate "TFTP protocol support"
+	depends on NETFILTER_ADVANCED
+	help
+	  TFTP connection tracking helper, this is required depending
+	  on how restrictive your ruleset is.
+	  If you are using a tftp client behind -j SNAT or -j MASQUERADE
+	  you will need this.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_CT_NETLINK
+	tristate 'Connection tracking netlink interface'
+	select NETFILTER_NETLINK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option enables support for a netlink-based userspace interface.
+
+endif # NF_CONNTRACK
+
+# transparent proxy support
+config NETFILTER_TPROXY
+	tristate "Transparent proxying support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on IP_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables transparent proxying support, that is,
+	  support for handling non-locally bound IPv4 TCP and UDP sockets.
+	  For it to work you will have to configure certain iptables rules
+	  and use policy routing. For more information on how to set it up
+	  see Documentation/networking/tproxy.txt.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XTABLES
+	tristate "Netfilter Xtables support (required for ip_tables)"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This is required if you intend to use any of ip_tables,
+	  ip6_tables or arp_tables.
+
+if NETFILTER_XTABLES
+
+comment "Xtables combined modules"
+
+config NETFILTER_XT_MARK
+	tristate 'nfmark target and match support'
+	default m if NETFILTER_ADVANCED=n
+	---help---
+	This option adds the "MARK" target and "mark" match.
+
+	Netfilter mark matching allows you to match packets based on the
+	"nfmark" value in the packet.
+	The target allows you to create rules in the "mangle" table which alter
+	the netfilter mark (nfmark) field associated with the packet.
+
+	Prior to routing, the nfmark can influence the routing method (see
+	"Use netfilter MARK value as routing key") and can also be used by
+	other subsystems to change their behavior.
+
+config NETFILTER_XT_CONNMARK
+	tristate 'ctmark target and match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_MARK
+	---help---
+	This option adds the "CONNMARK" target and "connmark" match.
+
+	Netfilter allows you to store a mark value per connection (a.k.a.
+	ctmark), similarly to the packet mark (nfmark). Using this
+	target and match, you can set and match on this mark.
+
+config NETFILTER_XT_SET
+	tristate 'set target and match support'
+	depends on IP_SET
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds the "SET" target and "set" match.
+
+	  Using this target and match, you can add/delete and match
+	  elements in the sets created by ipset(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# alphabetically ordered list of targets
+
+comment "Xtables targets"
+
+config NETFILTER_XT_TARGET_AUDIT
+	tristate "AUDIT target support"
+	depends on AUDIT
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a 'AUDIT' target, which can be used to create
+	  audit records for packets dropped/accepted.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+	  table.
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum.  This is particularly useful,
+	  if you need to work around old applications such as dhcp clients,
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_CLASSIFY
+	tristate '"CLASSIFY" target support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `CLASSIFY' target, which enables the user to set
+	  the priority of a packet. Some qdiscs can use this value for
+	  classification, among these are:
+
+  	  atm, cbq, dsmark, pfifo_fast, htb, prio
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_CONNMARK
+	tristate  '"CONNMARK" target support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_CONNMARK
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_TARGET_CONNSECMARK
+	tristate '"CONNSECMARK" target support'
+	depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The CONNSECMARK target copies security markings from packets
+	  to connections, and restores security markings from connections
+	  to packets (if the packets are not already marked).  This would
+	  normally be used in conjunction with the SECMARK target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_CT
+	tristate '"CT" target support'
+	depends on NF_CONNTRACK
+	depends on IP_NF_RAW || IP6_NF_RAW
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `CT' target, which allows you to specify initial
+	  connection tracking parameters like events to be delivered and
+	  the helper to be used.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_DSCP
+	tristate '"DSCP" and "TOS" target support'
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `DSCP' target, which allows you to manipulate
+	  the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+	  The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+	  It also adds the "TOS" target, which allows you to create rules in
+	  the "mangle" table which alter the Type Of Service field of an IPv4
+	  or the Priority field of an IPv6 packet, prior to routing.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_HL
+	tristate '"HL" hoplimit target support'
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
+	targets, which enable the user to change the
+	hoplimit/time-to-live value of the IP header.
+
+	While it is safe to decrement the hoplimit/TTL value, the
+	modules also allow to increment and set the hoplimit value of
+	the header to arbitrary values. This is EXTREMELY DANGEROUS
+	since you can easily create immortal packets that loop
+	forever on the network.
+
+config NETFILTER_XT_TARGET_IDLETIMER
+	tristate  "IDLETIMER target support"
+	depends on NETFILTER_ADVANCED
+	help
+
+	  This option adds the `IDLETIMER' target.  Each matching packet
+	  resets the timer associated with label specified when the rule is
+	  added.  When the timer expires, it triggers a sysfs notification.
+	  The remaining time for expiration can be read via sysfs.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_LED
+	tristate '"LED" target support'
+	depends on LEDS_CLASS && LEDS_TRIGGERS
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `LED' target, which allows you to blink LEDs in
+	  response to particular packets passing through your machine.
+
+	  This can be used to turn a spare LED into a network activity LED,
+	  which only flashes in response to FTP transfers, for example.  Or
+	  you could have an LED which lights up for a minute or two every time
+	  somebody connects to your machine via SSH.
+
+	  You will need support for the "led" class to make this work.
+
+	  To create an LED trigger for incoming SSH traffic:
+	    iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000
+
+	  Then attach the new trigger to an LED on your system:
+	    echo netfilter-ssh > /sys/class/leds/<ledname>/trigger
+
+	  For more information on the LEDs available on your system, see
+	  Documentation/leds-class.txt
+
+config NETFILTER_XT_TARGET_MARK
+	tristate '"MARK" target support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_MARK
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+
+config NETFILTER_XT_TARGET_NFLOG
+	tristate '"NFLOG" target support'
+	default m if NETFILTER_ADVANCED=n
+	select NETFILTER_NETLINK_LOG
+	help
+	  This option enables the NFLOG target, which allows to LOG
+	  messages through nfnetlink_log.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_NFQUEUE
+	tristate '"NFQUEUE" target Support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_NETLINK_QUEUE
+	help
+	  This target replaced the old obsolete QUEUE target.
+
+	  As opposed to QUEUE, it supports 65535 different queues,
+	  not just one.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_NOTRACK
+	tristate  '"NOTRACK" target support'
+	depends on IP_NF_RAW || IP6_NF_RAW
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	help
+	  The NOTRACK target allows a rule to select which packets
+	  should *not* enter the conntrack/NAT subsystem, with all
+	  the consequences (no ICMP error tracking, no protocol
+	  helpers for the selected packets).
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_TARGET_RATEEST
+	tristate '"RATEEST" target support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `RATEEST' target, which allows to measure
+	  rates similar to TC estimators. The `rateest' match can be
+	  used to match on the measured rates.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_TEE
+	tristate '"TEE" - packet cloning to alternate destination'
+	depends on NETFILTER_ADVANCED
+	depends on (IPV6 || IPV6=n)
+	depends on !NF_CONNTRACK || NF_CONNTRACK
+	---help---
+	This option adds a "TEE" target with which a packet can be cloned and
+	this clone be rerouted to another nexthop.
+
+config NETFILTER_XT_TARGET_TPROXY
+	tristate '"TPROXY" target support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_TPROXY
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+	help
+	  This option adds a `TPROXY' target, which is somewhat similar to
+	  REDIRECT.  It can only be used in the mangle table and is useful
+	  to redirect traffic to a transparent proxy.  It does _not_ depend
+	  on Netfilter connection tracking and NAT, unlike REDIRECT.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_TRACE
+	tristate  '"TRACE" target support'
+	depends on IP_NF_RAW || IP6_NF_RAW
+	depends on NETFILTER_ADVANCED
+	help
+	  The TRACE target allows you to mark packets so that the kernel
+	  will log every rule which matches the packets as those traverse
+	  the tables, chains, rules.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_TARGET_SECMARK
+	tristate '"SECMARK" target support'
+	depends on NETWORK_SECMARK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The SECMARK target allows security marking of network
+	  packets, for use with security subsystems.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_TCPMSS
+	tristate '"TCPMSS" target support'
+	depends on (IPV6 || IPV6=n)
+	default m if NETFILTER_ADVANCED=n
+	---help---
+	  This option adds a `TCPMSS' target, which allows you to alter the
+	  MSS value of TCP SYN packets, to control the maximum size for that
+	  connection (usually limiting it to your outgoing interface's MTU
+	  minus 40).
+
+	  This is used to overcome criminally braindead ISPs or servers which
+	  block ICMP Fragmentation Needed packets.  The symptoms of this
+	  problem are that everything works fine from your Linux
+	  firewall/router, but machines behind it can never exchange large
+	  packets:
+	        1) Web browsers connect, then hang with no data received.
+	        2) Small mail works fine, but large emails hang.
+	        3) ssh works fine, but scp hangs after initial handshaking.
+
+	  Workaround: activate this option and add a rule to your firewall
+	  configuration like:
+
+	  iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
+	                 -j TCPMSS --clamp-mss-to-pmtu
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_TARGET_TCPOPTSTRIP
+	tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a "TCPOPTSTRIP" target, which allows you to strip
+	  TCP options from TCP packets.
+
+# alphabetically ordered list of matches
+
+comment "Xtables matches"
+
+config NETFILTER_XT_MATCH_ADDRTYPE
+	tristate '"addrtype" address type match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option allows you to match what routing thinks of an address,
+	  eg. UNICAST, LOCAL, BROADCAST, ...
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CLUSTER
+	tristate '"cluster" match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option allows you to build work-load-sharing clusters of
+	  network servers/stateful firewalls without having a dedicated
+	  load-balancing router/server/switch. Basically, this match returns
+	  true when the packet must be handled by this cluster node. Thus,
+	  all nodes see all packets and this match decides which node handles
+	  what packets. The work-load sharing algorithm is based on source
+	  address hashing.
+
+	  If you say Y or M here, try `iptables -m cluster --help` for
+	  more information.
+
+config NETFILTER_XT_MATCH_COMMENT
+	tristate  '"comment" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `comment' dummy-match, which allows you to put
+	  comments in your iptables ruleset.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNBYTES
+	tristate  '"connbytes" per-connection counter match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `connbytes' match, which allows you to match the
+	  number of bytes and/or packets for each direction within a connection.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_CONNLIMIT
+	tristate '"connlimit" match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This match allows you to match against the number of parallel
+	  connections to a server per client IP address (or address block).
+
+config NETFILTER_XT_MATCH_CONNMARK
+	tristate  '"connmark" connection mark match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_CONNMARK
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_MATCH_CONNTRACK
+	tristate '"conntrack" connection tracking match support'
+	depends on NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This is a general conntrack match module, a superset of the state match.
+
+	  It allows matching on additional conntrack information, which is
+	  useful in complex configurations, such as NAT gateways with multiple
+	  internet links or tunnels.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_CPU
+	tristate '"cpu" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  CPU matching allows you to match packets based on the CPU
+	  currently handling the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_DCCP
+	tristate '"dccp" protocol match support'
+	depends on NETFILTER_ADVANCED
+	default IP_DCCP
+	help
+	  With this option enabled, you will be able to use the iptables
+	  `dccp' match in order to match on DCCP source/destination ports
+	  and DCCP flags.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_DEVGROUP
+	tristate '"devgroup" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `devgroup' match, which allows you to match on the
+	  device group a network device is assigned to.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_DSCP
+	tristate '"dscp" and "tos" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `DSCP' match, which allows you to match against
+	  the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+	  The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+	  It will also add a "tos" match, which allows you to match packets
+	  based on the Type Of Service fields of the IPv4 packet (which share
+	  the same bits as DSCP).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_ESP
+	tristate '"esp" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This match extension allows you to match a range of SPIs
+	  inside ESP header of IPSec packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_HASHLIMIT
+	tristate '"hashlimit" match support'
+	depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `hashlimit' match.
+
+	  As opposed to `limit', this match dynamically creates a hash table
+	  of limit buckets, based on your selection of source/destination
+	  addresses and/or ports.
+
+	  It enables you to express policies like `10kpps for any given
+	  destination address' or `500pps from any given source address'
+	  with a single rule.
+
+config NETFILTER_XT_MATCH_HELPER
+	tristate '"helper" match support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	help
+	  Helper matching allows you to match packets in dynamic connections
+	  tracked by a conntrack-helper, ie. ip_conntrack_ftp
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config NETFILTER_XT_MATCH_HL
+	tristate '"hl" hoplimit/TTL match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	HL matching allows you to match packets based on the hoplimit
+	in the IPv6 header, or the time-to-live field in the IPv4
+	header of the packet.
+
+config NETFILTER_XT_MATCH_IPRANGE
+	tristate '"iprange" address range match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	This option adds a "iprange" match, which allows you to match based on
+	an IP address range. (Normal iptables only matches on single addresses
+	with an optional mask.)
+
+	If unsure, say M.
+
+config NETFILTER_XT_MATCH_IPVS
+	tristate '"ipvs" match support'
+	depends on IP_VS
+	depends on NETFILTER_ADVANCED
+	depends on NF_CONNTRACK
+	help
+	  This option allows you to match against IPVS properties of a packet.
+
+	  If unsure, say N.
+
+config NETFILTER_XT_MATCH_LENGTH
+	tristate '"length" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option allows you to match the length of a packet against a
+	  specific value or range of values.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_LIMIT
+	tristate '"limit" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  limit matching allows you to control the rate at which a rule can be
+	  matched: mainly useful in combination with the LOG target ("LOG
+	  target support", below) and to avoid some Denial of Service attacks.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_MAC
+	tristate '"mac" address match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  MAC matching allows you to match packets based on the source
+	  Ethernet address of the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_MARK
+	tristate '"mark" match support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_MARK
+	---help---
+	This is a backwards-compat option for the user's convenience
+	(e.g. when running oldconfig). It selects
+	CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+
+config NETFILTER_XT_MATCH_MULTIPORT
+	tristate '"multiport" Multiple port match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  Multiport matching allows you to match TCP or UDP packets based on
+	  a series of source or destination ports: normally a rule can only
+	  match a single range of ports.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_OSF
+	tristate '"osf" Passive OS fingerprint match'
+	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+	help
+	  This option selects the Passive OS Fingerprinting match module
+	  that allows you to passively match the remote operating system by
+	  analyzing incoming TCP SYN packets.
+
+	  Rules and loading software can be downloaded from
+	  http://www.ioremap.net/projects/osf
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_OWNER
+	tristate '"owner" match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	Socket owner matching allows you to match locally-generated packets
+	based on who created the socket: the user or group. It is also
+	possible to check whether a socket actually exists.
+
+	Conflicts with '"quota, tag, owner" match and stats support'
+
+config NETFILTER_XT_MATCH_POLICY
+	tristate 'IPsec "policy" match support'
+	depends on XFRM
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Policy matching allows you to match packets based on the
+	  IPsec policy that was used during decapsulation/will
+	  be used during encapsulation.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_PHYSDEV
+	tristate '"physdev" match support'
+	depends on BRIDGE && BRIDGE_NETFILTER
+	depends on NETFILTER_ADVANCED
+	help
+	  Physdev packet matching matches against the physical bridge ports
+	  the IP packet arrived on or will leave by.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_PKTTYPE
+	tristate '"pkttype" packet type match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  Packet type matching allows you to match a packet by
+	  its "class", eg. BROADCAST, MULTICAST, ...
+
+	  Typical usage:
+	  iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_QTAGUID
+	bool '"quota, tag, owner" match and stats support'
+        depends on NETFILTER_XT_MATCH_SOCKET
+	depends on NETFILTER_XT_MATCH_OWNER=n
+	help
+	  This option replaces the `owner' match. In addition to matching
+	  on uid, it keeps stats based on a tag assigned to a socket.
+	  The full tag is comprised of a UID and an accounting tag.
+	  The tags are assignable to sockets from user space (e.g. a download
+	  manager can assign the socket to another UID for accounting).
+	  Stats and control are done via /proc/net/xt_qtaguid/.
+	  It replaces owner as it takes the same arguments, but should
+	  really be recognized by the iptables tool.
+
+	  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA
+	tristate '"quota" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `quota' match, which allows you to match on a
+	  byte counter.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA2
+	tristate '"quota2" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `quota2' match, which allows you to match on a
+	  byte counter correctly and not per CPU.
+	  It allows naming the quotas.
+	  This is based on http://xtables-addons.git.sourceforge.net
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA2_LOG
+	bool '"quota2" Netfilter LOG support'
+	depends on NETFILTER_XT_MATCH_QUOTA2
+	depends on IP_NF_TARGET_ULOG=n    # not yes, not module, just no
+	default n
+	help
+	  This option allows `quota2' to log ONCE when a quota limit
+	  is passed. It logs via NETLINK using the NETLINK_NFLOG family.
+	  It logs similarly to how ipt_ULOG would without data.
+
+	  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_RATEEST
+	tristate '"rateest" match support'
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_TARGET_RATEEST
+	help
+	  This option adds a `rateest' match, which allows you to match on the
+	  rate estimated by the RATEEST target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_REALM
+	tristate  '"realm" match support'
+	depends on NETFILTER_ADVANCED
+	select IP_ROUTE_CLASSID
+	help
+	  This option adds a `realm' match, which allows you to use the realm
+	  key from the routing subsystem inside iptables.
+
+	  This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option 
+	  in tc world.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_RECENT
+	tristate '"recent" match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	This match is used for creating one or many lists of recently
+	used addresses and then matching against that/those list(s).
+
+	Short options are available by using 'iptables -m recent -h'
+	Official Website: <http://snowman.net/projects/ipt_recent/>
+
+config NETFILTER_XT_MATCH_SCTP
+	tristate  '"sctp" protocol match support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_ADVANCED
+	default IP_SCTP
+	help
+	  With this option enabled, you will be able to use the 
+	  `sctp' match in order to match on SCTP source/destination ports
+	  and SCTP chunk types.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_SOCKET
+	tristate '"socket" match support (EXPERIMENTAL)'
+	depends on EXPERIMENTAL
+	depends on NETFILTER_TPROXY
+	depends on NETFILTER_XTABLES
+	depends on NETFILTER_ADVANCED
+	depends on !NF_CONNTRACK || NF_CONNTRACK
+	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+	help
+	  This option adds a `socket' match, which can be used to match
+	  packets for which a TCP or UDP socket lookup finds a valid socket.
+	  It can be used in combination with the MARK target and policy
+	  routing to implement full featured non-locally bound sockets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_STATE
+	tristate '"state" match support'
+	depends on NF_CONNTRACK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Connection state matching allows you to match packets based on their
+	  relationship to a tracked connection (ie. previous packets).  This
+	  is a powerful tool for packet classification.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_STATISTIC
+	tristate '"statistic" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `statistic' match, which allows you to match
+	  on packets periodically or randomly with a given percentage.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_STRING
+	tristate  '"string" match support'
+	depends on NETFILTER_ADVANCED
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	select TEXTSEARCH_BM
+	select TEXTSEARCH_FSM
+	help
+	  This option adds a `string' match, which allows you to look for
+	  pattern matchings in packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_TCPMSS
+	tristate '"tcpmss" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `tcpmss' match, which allows you to examine the
+	  MSS value of TCP SYN packets, which control the maximum packet size
+	  for that connection.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NETFILTER_XT_MATCH_TIME
+	tristate '"time" match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a "time" match, which allows you to match based on
+	  the packet arrival time (at the machine which netfilter is running
+	  on) or departure time/date (for locally generated packets).
+
+	  If you say Y here, try `iptables -m time --help` for
+	  more information.
+
+	  If you want to compile it as a module, say M here.
+	  If unsure, say N.
+
+config NETFILTER_XT_MATCH_U32
+	tristate '"u32" match support'
+	depends on NETFILTER_ADVANCED
+	---help---
+	  u32 allows you to extract quantities of up to 4 bytes from a packet,
+	  AND them with specified masks, shift them by specified amounts and
+	  test whether the results are in any of a set of specified ranges.
+	  The specification of what to extract is general enough to skip over
+	  headers with lengths stored in the packet, as in IP or TCP header
+	  lengths.
+
+	  Details and examples are in the kernel module source.
+
+endif # NETFILTER_XTABLES
+
+endmenu
+
+source "net/netfilter/ipset/Kconfig"
+
+source "net/netfilter/ipvs/Kconfig"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
new file mode 100644
index 00000000..6d917176
--- /dev/null
+++ b/net/netfilter/Makefile
@@ -0,0 +1,117 @@
+netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+
+nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
+
+obj-$(CONFIG_NETFILTER) = netfilter.o
+
+obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
+
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
+
+# additional protocol connection tracking (DCCP, GRE, SCTP, UDP-Lite)
+obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
+
+# netlink interface for nf_conntrack
+obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
+
+# connection tracking helpers
+nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o
+
+obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
+obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
+obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
+obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
+obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
+obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
+obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
+obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
+obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
+
+# transparent proxy support
+obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o
+
+# generic X tables 
+obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
+
+# combos
+obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
+obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
+obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
+
+# targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
+
+# matches
+obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+
+# ipset
+obj-$(CONFIG_IP_SET) += ipset/
+
+# IPVS
+obj-$(CONFIG_IP_VS) += ipvs/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
new file mode 100644
index 00000000..899b71c0
--- /dev/null
+++ b/net/netfilter/core.c
@@ -0,0 +1,291 @@
+/* netfilter.c: look after the filters for various protocols.
+ * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
+ *
+ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
+ * way.
+ *
+ * Rusty Russell (C)2000 -- This code is GPL.
+ */
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+static DEFINE_MUTEX(afinfo_mutex);
+
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+EXPORT_SYMBOL(nf_afinfo);
+
+int nf_register_afinfo(const struct nf_afinfo *afinfo)
+{
+	int err;
+
+	err = mutex_lock_interruptible(&afinfo_mutex);
+	if (err < 0)
+		return err;
+	rcu_assign_pointer(nf_afinfo[afinfo->family], afinfo);
+	mutex_unlock(&afinfo_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_register_afinfo);
+
+void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
+{
+	mutex_lock(&afinfo_mutex);
+	rcu_assign_pointer(nf_afinfo[afinfo->family], NULL);
+	mutex_unlock(&afinfo_mutex);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
+
+struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
+EXPORT_SYMBOL(nf_hooks);
+static DEFINE_MUTEX(nf_hook_mutex);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	struct nf_hook_ops *elem;
+	int err;
+
+	err = mutex_lock_interruptible(&nf_hook_mutex);
+	if (err < 0)
+		return err;
+	list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+		if (reg->priority < elem->priority)
+			break;
+	}
+	list_add_rcu(&reg->list, elem->list.prev);
+	mutex_unlock(&nf_hook_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	mutex_lock(&nf_hook_mutex);
+	list_del_rcu(&reg->list);
+	mutex_unlock(&nf_hook_mutex);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(nf_unregister_hook);
+
+int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = nf_register_hook(&reg[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		nf_unregister_hooks(reg, i);
+	return err;
+}
+EXPORT_SYMBOL(nf_register_hooks);
+
+void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+	while (n-- > 0)
+		nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_hooks);
+
+unsigned int nf_iterate(struct list_head *head,
+			struct sk_buff *skb,
+			unsigned int hook,
+			const struct net_device *indev,
+			const struct net_device *outdev,
+			struct list_head **i,
+			int (*okfn)(struct sk_buff *),
+			int hook_thresh)
+{
+	unsigned int verdict;
+
+	/*
+	 * The caller must not block between calls to this
+	 * function because of risk of continuing from deleted element.
+	 */
+	list_for_each_continue_rcu(*i, head) {
+		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
+
+		if (hook_thresh > elem->priority)
+			continue;
+
+		/* Optimization: we don't need to hold module
+		   reference here, since function can't sleep. --RR */
+repeat:
+		verdict = elem->hook(hook, skb, indev, outdev, okfn);
+		if (verdict != NF_ACCEPT) {
+#ifdef CONFIG_NETFILTER_DEBUG
+			if (unlikely((verdict & NF_VERDICT_MASK)
+							> NF_MAX_VERDICT)) {
+				NFDEBUG("Evil return from %p(%u).\n",
+					elem->hook, hook);
+				continue;
+			}
+#endif
+			if (verdict != NF_REPEAT)
+				return verdict;
+			goto repeat;
+		}
+	}
+	return NF_ACCEPT;
+}
+
+
+/* Returns 1 if okfn() needs to be executed by the caller; for NF_DROP,
+ * NF_DROP_GETERR(verdict) or -EPERM if that is 0; 0 otherwise. */
+int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
+		 struct net_device *indev,
+		 struct net_device *outdev,
+		 int (*okfn)(struct sk_buff *),
+		 int hook_thresh)
+{
+	struct list_head *elem;
+	unsigned int verdict;
+	int ret = 0;
+
+	/* We may already have this, but read-locks nest anyway */
+	rcu_read_lock();
+
+	elem = &nf_hooks[pf][hook];
+next_hook:
+	verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
+			     outdev, &elem, okfn, hook_thresh);
+	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
+		ret = 1;
+	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
+		kfree_skb(skb);
+		ret = NF_DROP_GETERR(verdict);
+		if (ret == 0)
+			ret = -EPERM;
+	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+		ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
+			       verdict >> NF_VERDICT_QBITS);
+		if (ret < 0) {
+			if (ret == -ECANCELED)
+				goto next_hook;
+			if (ret == -ESRCH &&
+			   (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
+				goto next_hook;
+			kfree_skb(skb);
+		}
+		ret = 0;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_hook_slow);
+
+
+int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
+{
+	if (writable_len > skb->len)
+		return 0;
+
+	/* Not exclusive use of packet?  Must copy. */
+	if (!skb_cloned(skb)) {
+		if (writable_len <= skb_headlen(skb))
+			return 1;
+	} else if (skb_clone_writable(skb, writable_len))
+		return 1;
+
+	if (writable_len <= skb_headlen(skb))
+		writable_len = 0;
+	else
+		writable_len -= skb_headlen(skb);
+
+	return !!__pskb_pull_tail(skb, writable_len);
+}
+EXPORT_SYMBOL(skb_make_writable);
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+/* This does not belong here, but locally generated errors need it if connection
+   tracking in use: without this, connection may not be in hash table, and hence
+   manufactured ICMP or RST packets will not be associated with it. */
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly;
+EXPORT_SYMBOL(ip_ct_attach);
+
+void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+{
+	void (*attach)(struct sk_buff *, struct sk_buff *);
+
+	if (skb->nfct) {
+		rcu_read_lock();
+		attach = rcu_dereference(ip_ct_attach);
+		if (attach)
+			attach(new, skb);
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL(nf_ct_attach);
+
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
+EXPORT_SYMBOL(nf_ct_destroy);
+
+void nf_conntrack_destroy(struct nf_conntrack *nfct)
+{
+	void (*destroy)(struct nf_conntrack *);
+
+	rcu_read_lock();
+	destroy = rcu_dereference(nf_ct_destroy);
+	BUG_ON(destroy == NULL);
+	destroy(nfct);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_conntrack_destroy);
+#endif /* CONFIG_NF_CONNTRACK */
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_net_netfilter;
+EXPORT_SYMBOL(proc_net_netfilter);
+#endif
+
+void __init netfilter_init(void)
+{
+	int i, h;
+	for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
+		for (h = 0; h < NF_MAX_HOOKS; h++)
+			INIT_LIST_HEAD(&nf_hooks[i][h]);
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net);
+	if (!proc_net_netfilter)
+		panic("cannot create netfilter proc entry");
+#endif
+
+	if (netfilter_queue_init() < 0)
+		panic("cannot initialize nf_queue");
+	if (netfilter_log_init() < 0)
+		panic("cannot initialize nf_log");
+}
+
+#ifdef CONFIG_SYSCTL
+struct ctl_path nf_net_netfilter_sysctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "netfilter", },
+	{ }
+};
+EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path);
+#endif /* CONFIG_SYSCTL */
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
new file mode 100644
index 00000000..2c5b348e
--- /dev/null
+++ b/net/netfilter/ipset/Kconfig
@@ -0,0 +1,122 @@
+menuconfig IP_SET
+	tristate "IP set support"
+	depends on INET && NETFILTER
+	depends on NETFILTER_NETLINK
+	help
+	  This option adds IP set support to the kernel.
+	  In order to define and use the sets, you need the userspace utility
+	  ipset(8). You can use the sets in netfilter via the "set" match
+	  and "SET" target.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+if IP_SET
+
+config IP_SET_MAX
+	int "Maximum number of IP sets"
+	default 256
+	range 2 65534
+	depends on IP_SET
+	help
+	  You can define here the default value of the maximum number
+	  of IP sets for the kernel.
+
+	  The value can be overridden by the 'max_sets' module
+	  parameter of the 'ip_set' module.
+
+config IP_SET_BITMAP_IP
+	tristate "bitmap:ip set support"
+	depends on IP_SET
+	help
+	  This option adds the bitmap:ip set type support, by which one
+	  can store IPv4 addresses (or network addresses) from a range.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_BITMAP_IPMAC
+	tristate "bitmap:ip,mac set support"
+	depends on IP_SET
+	help
+	  This option adds the bitmap:ip,mac set type support, by which one
+	  can store IPv4 address and (source) MAC address pairs from a range.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_BITMAP_PORT
+	tristate "bitmap:port set support"
+	depends on IP_SET
+	help
+	  This option adds the bitmap:port set type support, by which one
+	  can store TCP/UDP port numbers from a range.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_HASH_IP
+	tristate "hash:ip set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:ip set type support, by which one
+	  can store arbitrary IPv4 or IPv6 addresses (or network addresses)
+	  in a set.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_HASH_IPPORT
+	tristate "hash:ip,port set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:ip,port set type support, by which one
+	  can store IPv4/IPv6 address and protocol/port pairs.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_HASH_IPPORTIP
+	tristate "hash:ip,port,ip set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:ip,port,ip set type support, by which
+	  one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+	  address triples in a set.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_HASH_IPPORTNET
+	tristate "hash:ip,port,net set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:ip,port,net set type support, by which
+	  one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6
+	  network address/prefix triples in a set.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_HASH_NET
+	tristate "hash:net set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:net set type support, by which
+	  one can store IPv4/IPv6 network address/prefix elements in a set.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_HASH_NETPORT
+	tristate "hash:net,port set support"
+	depends on IP_SET
+	help
+	  This option adds the hash:net,port set type support, by which
+	  one can store IPv4/IPv6 network address/prefix and
+	  protocol/port pairs as elements in a set.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_SET_LIST_SET
+	tristate "list:set set support"
+	depends on IP_SET
+	help
+	  This option adds the list:set set type support. In this
+	  kind of set one can store the name of other sets and it forms
+	  an ordered union of the member sets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+endif # IP_SET
diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile
new file mode 100644
index 00000000..5adbdab6
--- /dev/null
+++ b/net/netfilter/ipset/Makefile
@@ -0,0 +1,24 @@
+#
+# Makefile for the ipset modules
+#
+
+ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o
+
+# ipset core
+obj-$(CONFIG_IP_SET) += ip_set.o
+
+# bitmap types
+obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o
+obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o
+obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o
+
+# hash types
+obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o
+obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o
+obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o
+obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o
+
+# list types
+obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
new file mode 100644
index 00000000..ba2d1660
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -0,0 +1,586 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ *                         Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#define IP_SET_BITMAP_TIMEOUT
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:ip type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:ip");
+
+/* Type structure */
+struct bitmap_ip {
+	void *members;		/* bitmap, or unsigned long[] with timeout */
+	u32 first_ip;		/* host byte order, included in range */
+	u32 last_ip;		/* host byte order, included in range */
+	u32 elements;		/* maximum number of elements in the set */
+	u32 hosts;		/* addresses covered by one element */
+	size_t memsize;		/* size of members in bytes */
+	u8 netmask;		/* prefix length of one element */
+	u32 timeout;		/* default timeout or IPSET_NO_TIMEOUT */
+	struct timer_list gc;	/* garbage collection timer */
+};
+
+/* Base variant */
+
+static inline u32
+ip_to_id(const struct bitmap_ip *m, u32 ip)
+{
+	return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+}
+
+static int
+bitmap_ip_test(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_ip *map = set->data;
+	u16 id = *(u16 *)value;
+
+	return !!test_bit(id, map->members);
+}
+
+static int
+bitmap_ip_add(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ip *map = set->data;
+	u16 id = *(u16 *)value;
+
+	if (test_and_set_bit(id, map->members))
+		return -IPSET_ERR_EXIST;
+
+	return 0;
+}
+
+static int
+bitmap_ip_del(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ip *map = set->data;
+	u16 id = *(u16 *)value;
+
+	if (!test_and_clear_bit(id, map->members))
+		return -IPSET_ERR_EXIST;
+
+	return 0;
+}
+
+static int
+bitmap_ip_list(const struct ip_set *set,
+	       struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct bitmap_ip *map = set->data;
+	struct nlattr *atd, *nested;
+	u32 id, first = cb->args[2];
+
+	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!atd)
+		return -EMSGSIZE;
+	for (; cb->args[2] < map->elements; cb->args[2]++) {
+		id = cb->args[2];
+		if (!test_bit(id, map->members))
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (id == first) {
+				nla_nest_cancel(skb, atd);
+				return -EMSGSIZE;
+			} else
+				goto nla_put_failure;
+		}
+		NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+				htonl(map->first_ip + id * map->hosts));
+		ipset_nest_end(skb, nested);
+	}
+	ipset_nest_end(skb, atd);
+	/* Set listing finished */
+	cb->args[2] = 0;
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, atd);
+	if (unlikely(id == first)) {
+		cb->args[2] = 0;
+		return -EMSGSIZE;
+	}
+	return 0;
+}
+
+/* Timeout variant */
+
+static int
+bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_ip *map = set->data;
+	const unsigned long *members = map->members;
+	u16 id = *(u16 *)value;
+
+	return ip_set_timeout_test(members[id]);
+}
+
+static int
+bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ip *map = set->data;
+	unsigned long *members = map->members;
+	u16 id = *(u16 *)value;
+
+	if (ip_set_timeout_test(members[id]))
+		return -IPSET_ERR_EXIST;
+
+	members[id] = ip_set_timeout_set(timeout);
+
+	return 0;
+}
+
+static int
+bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ip *map = set->data;
+	unsigned long *members = map->members;
+	u16 id = *(u16 *)value;
+	int ret = -IPSET_ERR_EXIST;	/* unless ip_set_timeout_test() passes */
+
+	if (ip_set_timeout_test(members[id]))
+		ret = 0;
+
+	members[id] = IPSET_ELEM_UNSET;	/* slot is cleared in either case */
+	return ret;
+}
+
+static int
+bitmap_ip_tlist(const struct ip_set *set,
+		struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct bitmap_ip *map = set->data;
+	struct nlattr *adt, *nested;
+	u32 id, first = cb->args[2];
+	const unsigned long *members = map->members;
+
+	adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!adt)
+		return -EMSGSIZE;
+	for (; cb->args[2] < map->elements; cb->args[2]++) {
+		id = cb->args[2];
+		if (!ip_set_timeout_test(members[id]))
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (id == first) {
+				nla_nest_cancel(skb, adt);
+				return -EMSGSIZE;
+			} else
+				goto nla_put_failure;
+		}
+		NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+				htonl(map->first_ip + id * map->hosts));
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+			      htonl(ip_set_timeout_get(members[id])));
+		ipset_nest_end(skb, nested);
+	}
+	ipset_nest_end(skb, adt);
+
+	/* Set listing finished */
+	cb->args[2] = 0;
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, adt);
+	if (unlikely(id == first)) {
+		cb->args[2] = 0;
+		return -EMSGSIZE;
+	}
+	return 0;
+}
+
+static int
+bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	struct bitmap_ip *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 ip = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+	u16 id;		/* the adt functions read *value as a u16 */
+
+	if (ip < map->first_ip || ip > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	/* Passing a u32 here hands big-endian hosts its upper half (0) */
+	id = ip_to_id(map, ip);
+	return adtfn(set, &id, map->timeout);
+}
+
+static int
+bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
+	       enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	struct bitmap_ip *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 timeout = map->timeout;
+	u32 ip, ip_to;
+	u16 id;		/* the adt functions read *value as a u16 */
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+
+	if (ip < map->first_ip || ip > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST) {
+		id = ip_to_id(map, ip);
+		return adtfn(set, &id, timeout);
+	}
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+
+	/* Swapping or CIDR masking may have moved ip below first_ip, so
+	 * both ends are checked here: a start outside the map would make
+	 * ip_to_id() underflow and yield an id outside the members array. */
+	if (ip < map->first_ip || ip_to > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	for (; !before(ip_to, ip); ip += map->hosts) {
+		id = ip_to_id(map, ip);
+		ret = adtfn(set, &id, timeout);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static void
+bitmap_ip_destroy(struct ip_set *set)
+{
+	struct bitmap_ip *map = set->data;
+
+	if (with_timeout(map->timeout))
+		del_timer_sync(&map->gc);
+
+	ip_set_free(map->members);
+	kfree(map);
+
+	set->data = NULL;
+}
+
+static void
+bitmap_ip_flush(struct ip_set *set)
+{
+	struct bitmap_ip *map = set->data;
+
+	memset(map->members, 0, map->memsize);
+}
+
+static int
+bitmap_ip_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct bitmap_ip *map = set->data;
+	struct nlattr *nested;
+
+	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!nested)
+		goto nla_put_failure;
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+	if (map->netmask != 32)
+		NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask);
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+		      htonl(sizeof(*map) + map->memsize));
+	if (with_timeout(map->timeout))
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+	ipset_nest_end(skb, nested);
+
+	return 0;
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static bool
+bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_ip *x = a->data;
+	const struct bitmap_ip *y = b->data;
+
+	return x->first_ip == y->first_ip &&
+	       x->last_ip == y->last_ip &&
+	       x->netmask == y->netmask &&
+	       x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_ip = {
+	.kadt	= bitmap_ip_kadt,
+	.uadt	= bitmap_ip_uadt,
+	.adt	= {
+		[IPSET_ADD] = bitmap_ip_add,
+		[IPSET_DEL] = bitmap_ip_del,
+		[IPSET_TEST] = bitmap_ip_test,
+	},
+	.destroy = bitmap_ip_destroy,
+	.flush	= bitmap_ip_flush,
+	.head	= bitmap_ip_head,
+	.list	= bitmap_ip_list,
+	.same_set = bitmap_ip_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tip = {
+	.kadt	= bitmap_ip_kadt,
+	.uadt	= bitmap_ip_uadt,
+	.adt	= {
+		[IPSET_ADD] = bitmap_ip_tadd,
+		[IPSET_DEL] = bitmap_ip_tdel,
+		[IPSET_TEST] = bitmap_ip_ttest,
+	},
+	.destroy = bitmap_ip_destroy,
+	.flush	= bitmap_ip_flush,
+	.head	= bitmap_ip_head,
+	.list	= bitmap_ip_tlist,
+	.same_set = bitmap_ip_same_set,
+};
+
+static void
+bitmap_ip_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct bitmap_ip *map = set->data;
+	unsigned long *table = map->members;
+	u32 id;
+
+	/* Timer callback (ul_set is the set stored in gc.data): clear expired
+	 * slots; readers may run in parallel, add/del is locked out */
+	read_lock_bh(&set->lock);
+	for (id = 0; id < map->elements; id++)
+		if (ip_set_timeout_expired(table[id]))
+			table[id] = IPSET_ELEM_UNSET;
+	read_unlock_bh(&set->lock);
+
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);	/* re-arm for the next pass */
+}
+
+static void
+bitmap_ip_gc_init(struct ip_set *set)
+{
+	struct bitmap_ip *map = set->data;
+
+	init_timer(&map->gc);
+	map->gc.data = (unsigned long) set;
+	map->gc.function = bitmap_ip_gc;
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/* Create bitmap:ip type of sets */
+
+static bool
+init_map_ip(struct ip_set *set, struct bitmap_ip *map,
+	    u32 first_ip, u32 last_ip,
+	    u32 elements, u32 hosts, u8 netmask)
+{
+	map->members = ip_set_alloc(map->memsize);
+	if (!map->members)
+		return false;
+	map->first_ip = first_ip;
+	map->last_ip = last_ip;
+	map->elements = elements;
+	map->hosts = hosts;
+	map->netmask = netmask;
+	map->timeout = IPSET_NO_TIMEOUT;
+
+	set->data = map;
+	set->family = AF_INET;
+
+	return true;
+}
+
+static int
+bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct bitmap_ip *map;
+	u32 first_ip, last_ip, hosts, elements;
+	u8 netmask = 32;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+		if (ret)
+			return ret;
+		if (first_ip > last_ip)
+			swap(first_ip, last_ip);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr >= 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		last_ip = first_ip | ~ip_set_hostmask(cidr);
+	} else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_NETMASK]) {
+		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+		if (netmask > 32)
+			return -IPSET_ERR_INVALID_NETMASK;
+
+		first_ip &= ip_set_hostmask(netmask);
+		last_ip |= ~ip_set_hostmask(netmask);
+	}
+
+	if (netmask == 32) {
+		hosts = 1;
+		elements = last_ip - first_ip + 1;
+	} else {
+		u8 mask_bits;
+		u32 mask;
+
+		mask = range_to_mask(first_ip, last_ip, &mask_bits);
+
+		if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) ||
+		    netmask <= mask_bits)
+			return -IPSET_ERR_BITMAP_RANGE;
+
+		pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
+		hosts = 2 << (32 - netmask - 1);
+		elements = 2 << (netmask - mask_bits - 1);
+	}
+	/* elements is a u32: with netmask 32 the count for the range
+	 * 0.0.0.0-255.255.255.255 wraps to zero and would slip under the
+	 * size limit, so a zero count is rejected as well. */
+	if (!elements || elements > IPSET_BITMAP_MAX_RANGE + 1)
+		return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+	pr_debug("hosts %u, elements %u\n", hosts, elements);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	/* With timeout: one unsigned long per element, else a plain bitmap */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		map->memsize = elements * sizeof(unsigned long);
+
+		if (!init_map_ip(set, map, first_ip, last_ip,
+				 elements, hosts, netmask)) {
+			kfree(map);
+			return -ENOMEM;
+		}
+
+		map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		set->variant = &bitmap_tip;
+
+		bitmap_ip_gc_init(set);
+	} else {
+		map->memsize = bitmap_bytes(0, elements - 1);
+
+		if (!init_map_ip(set, map, first_ip, last_ip,
+				 elements, hosts, netmask)) {
+			kfree(map);
+			return -ENOMEM;
+		}
+
+		set->variant = &bitmap_ip;
+	}
+	return 0;
+}
+
+static struct ip_set_type bitmap_ip_type __read_mostly = {
+	.name		= "bitmap:ip",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP,
+	.dimension	= IPSET_DIM_ONE,
+	.family		= AF_INET,
+	.revision	= 0,
+	.create		= bitmap_ip_create,
+	.create_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_NETMASK]	= { .type = NLA_U8  },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+bitmap_ip_init(void)
+{
+	return ip_set_type_register(&bitmap_ip_type);
+}
+
+static void __exit
+bitmap_ip_fini(void)
+{
+	ip_set_type_unregister(&bitmap_ip_type);
+}
+
+module_init(bitmap_ip_init);
+module_exit(bitmap_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
new file mode 100644
index 00000000..a274300b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -0,0 +1,655 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ *                         Patrick Schaaf <bof@bof.de>
+ *			   Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:ip,mac type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:ip,mac");
+
+enum {
+	MAC_EMPTY,		/* element is not set */
+	MAC_FILLED,		/* element is set with MAC */
+	MAC_UNSET,		/* element is set, without MAC */
+};
+
+/* Type structure */
+struct bitmap_ipmac {
+	void *members;		/* the set members */
+	u32 first_ip;		/* host byte order, included in range */
+	u32 last_ip;		/* host byte order, included in range */
+	u32 timeout;		/* timeout value */
+	struct timer_list gc;	/* garbage collector */
+	size_t dsize;		/* size of element */
+};
+
+/* ADT structure for generic function args */
+struct ipmac {
+	u32 id;			/* id in array */
+	unsigned char *ether;	/* ethernet address */
+};
+
+/* Member element without and with timeout */
+
+struct ipmac_elem {
+	unsigned char ether[ETH_ALEN];
+	unsigned char match;
+} __attribute__ ((aligned));
+
+struct ipmac_telem {
+	unsigned char ether[ETH_ALEN];
+	unsigned char match;
+	unsigned long timeout;
+} __attribute__ ((aligned));
+
+static inline void *
+bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id)
+{
+	return (void *)((char *)map->members + id * map->dsize);
+}
+
+static inline bool
+bitmap_timeout(const struct bitmap_ipmac *map, u32 id)
+{
+	const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
+
+	return ip_set_timeout_test(elem->timeout);
+}
+
+static inline bool
+bitmap_expired(const struct bitmap_ipmac *map, u32 id)
+{
+	const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
+
+	return ip_set_timeout_expired(elem->timeout);
+}
+
+static inline int
+bitmap_ipmac_exist(const struct ipmac_telem *elem)
+{
+	return elem->match == MAC_UNSET ||
+	       (elem->match == MAC_FILLED &&
+		!ip_set_timeout_expired(elem->timeout));
+}
+
+/* Base variant */
+
+static int
+bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_ipmac *map = set->data;
+	const struct ipmac *data = value;
+	const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+	switch (elem->match) {
+	case MAC_UNSET:
+		/* Trigger kernel to fill out the ethernet address */
+		return -EAGAIN;
+	case MAC_FILLED:
+		return data->ether == NULL ||
+		       compare_ether_addr(data->ether, elem->ether) == 0;
+	}
+	return 0;
+}
+
+static int
+bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ipmac *map = set->data;
+	const struct ipmac *data = value;
+	struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+	switch (elem->match) {
+	case MAC_UNSET:
+		if (!data->ether)
+			/* Already added without ethernet address */
+			return -IPSET_ERR_EXIST;
+		/* Fill the MAC address */
+		memcpy(elem->ether, data->ether, ETH_ALEN);
+		elem->match = MAC_FILLED;
+		break;
+	case MAC_FILLED:
+		return -IPSET_ERR_EXIST;
+	case MAC_EMPTY:
+		if (data->ether) {
+			memcpy(elem->ether, data->ether, ETH_ALEN);
+			elem->match = MAC_FILLED;
+		} else
+			elem->match = MAC_UNSET;
+	}
+
+	return 0;
+}
+
+static int
+bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ipmac *map = set->data;
+	const struct ipmac *data = value;
+	struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+	if (elem->match == MAC_EMPTY)
+		return -IPSET_ERR_EXIST;
+
+	elem->match = MAC_EMPTY;
+
+	return 0;
+}
+
+static int
+bitmap_ipmac_list(const struct ip_set *set,
+		  struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct bitmap_ipmac *map = set->data;
+	const struct ipmac_elem *elem;
+	struct nlattr *atd, *nested;
+	u32 id, first = cb->args[2];
+	u32 last = map->last_ip - map->first_ip;
+
+	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!atd)
+		return -EMSGSIZE;
+	for (; cb->args[2] <= last; cb->args[2]++) {
+		id = cb->args[2];
+		elem = bitmap_ipmac_elem(map, id);
+		if (elem->match == MAC_EMPTY)
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (id == first) {
+				nla_nest_cancel(skb, atd);
+				return -EMSGSIZE;
+			} else
+				goto nla_put_failure;
+		}
+		NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP,
+				htonl(map->first_ip + id));
+		if (elem->match == MAC_FILLED)
+			NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN,
+				elem->ether);
+		ipset_nest_end(skb, nested);
+	}
+	ipset_nest_end(skb, atd);
+	/* Set listing finished */
+	cb->args[2] = 0;
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, atd);
+	if (unlikely(id == first)) {
+		cb->args[2] = 0;
+		return -EMSGSIZE;
+	}
+	return 0;
+}
+
+/* Timeout variant */
+
+static int
+bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_ipmac *map = set->data;
+	const struct ipmac *data = value;
+	const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
+
+	switch (elem->match) {
+	case MAC_UNSET:
+		/* Trigger kernel to fill out the ethernet address */
+		return -EAGAIN;
+	case MAC_FILLED:
+		return (data->ether == NULL ||
+			compare_ether_addr(data->ether, elem->ether) == 0) &&
+		       !bitmap_expired(map, data->id);
+	}
+	return 0;
+}
+
+static int
+bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ipmac *map = set->data;
+	const struct ipmac *data = value;
+	struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+
+	switch (elem->match) {
+	case MAC_UNSET:
+		if (!data->ether)
+			/* Already added without ethernet address */
+			return -IPSET_ERR_EXIST;
+		/* Fill the MAC address and activate the timer */
+		memcpy(elem->ether, data->ether, ETH_ALEN);
+		elem->match = MAC_FILLED;
+		if (timeout == map->timeout)
+			/* Timeout was not specified, get stored one */
+			timeout = elem->timeout;
+		elem->timeout = ip_set_timeout_set(timeout);
+		break;
+	case MAC_FILLED:
+		if (!bitmap_expired(map, data->id))
+			return -IPSET_ERR_EXIST;
+		/* Fall through */
+	case MAC_EMPTY:
+		if (data->ether) {
+			memcpy(elem->ether, data->ether, ETH_ALEN);
+			elem->match = MAC_FILLED;
+		} else
+			elem->match = MAC_UNSET;
+		/* If MAC is unset yet, we store plain timeout value
+		 * because the timer is not activated yet
+		 * and we can reuse it later when MAC is filled out,
+		 * possibly by the kernel */
+		elem->timeout = data->ether ? ip_set_timeout_set(timeout)
+					    : timeout;
+		break;
+	}
+
+	return 0;
+}
+
+static int
+bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_ipmac *map = set->data;
+	const struct ipmac *data = value;
+	struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+
+	/* MAC_UNSET entries hold a plain timeout, not an expiry time */
+	if (!bitmap_ipmac_exist(elem))
+		return -IPSET_ERR_EXIST;
+
+	elem->match = MAC_EMPTY;
+	return 0;
+}
+
+static int
+bitmap_ipmac_tlist(const struct ip_set *set,
+		   struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct bitmap_ipmac *map = set->data;
+	const struct ipmac_telem *elem;
+	struct nlattr *atd, *nested;
+	u32 id, first = cb->args[2];
+	u32 timeout, last = map->last_ip - map->first_ip;
+
+	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!atd)
+		return -EMSGSIZE;
+	for (; cb->args[2] <= last; cb->args[2]++) {
+		id = cb->args[2];
+		elem = bitmap_ipmac_elem(map, id);
+		if (!bitmap_ipmac_exist(elem))
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (id != first)
+				goto nla_put_failure;
+			nla_nest_cancel(skb, atd);
+			return -EMSGSIZE;
+		}
+		NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id));
+		if (elem->match == MAC_FILLED)
+			NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether);
+		timeout = elem->match == MAC_UNSET ? elem->timeout
+				: ip_set_timeout_get(elem->timeout);
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout));
+		ipset_nest_end(skb, nested);
+	}
+	ipset_nest_end(skb, atd);
+	/* Set listing finished */
+	cb->args[2] = 0;
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, atd);
+	if (unlikely(id == first)) {	/* not even the first entry fitted */
+		cb->args[2] = 0;
+		return -EMSGSIZE;
+	}
+	return 0;
+}
+
+static int
+bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	struct bitmap_ipmac *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct ipmac data;
+
+	/* MAC can be src only */
+	if (!(flags & IPSET_DIM_TWO_SRC))
+		return 0;
+
+	data.id = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
+	if (data.id < map->first_ip || data.id > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	/* Backward compatibility: we don't check the second flag */
+	if (skb_mac_header(skb) < skb->head ||
+	    (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+		return -EINVAL;
+
+	data.id -= map->first_ip;
+	data.ether = eth_hdr(skb)->h_source;
+
+	return adtfn(set, &data, map->timeout);
+}
+
+static int
+bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct bitmap_ipmac *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct ipmac data = { .ether = NULL };
+	u32 timeout = map->timeout;
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id);
+	if (ret)
+		return ret;
+
+	if (data.id < map->first_ip || data.id > map->last_ip)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	if (tb[IPSET_ATTR_ETHER]) {
+		/* NLA_BINARY only caps the length: insist on a full MAC */
+		if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
+			return -IPSET_ERR_PROTOCOL;
+		data.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+	}
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	data.id -= map->first_ip;
+	ret = adtfn(set, &data, timeout);
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static void
+bitmap_ipmac_destroy(struct ip_set *set)
+{
+	struct bitmap_ipmac *map = set->data;
+
+	if (with_timeout(map->timeout))
+		del_timer_sync(&map->gc);
+
+	ip_set_free(map->members);
+	kfree(map);
+
+	set->data = NULL;
+}
+
+static void
+bitmap_ipmac_flush(struct ip_set *set)
+{
+	struct bitmap_ipmac *map = set->data;
+
+	memset(map->members, 0,
+	       (map->last_ip - map->first_ip + 1) * map->dsize);
+}
+
+static int
+bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct bitmap_ipmac *map = set->data;
+	struct nlattr *nested;
+
+	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!nested)
+		goto nla_put_failure;
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+		      htonl(sizeof(*map)
+			    + (map->last_ip - map->first_ip + 1) * map->dsize));
+	if (with_timeout(map->timeout))
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+	ipset_nest_end(skb, nested);
+
+	return 0;
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static bool
+bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_ipmac *x = a->data;
+	const struct bitmap_ipmac *y = b->data;
+
+	return x->first_ip == y->first_ip &&
+	       x->last_ip == y->last_ip &&
+	       x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_ipmac = {
+	.kadt	= bitmap_ipmac_kadt,
+	.uadt	= bitmap_ipmac_uadt,
+	.adt	= {
+		[IPSET_ADD] = bitmap_ipmac_add,
+		[IPSET_DEL] = bitmap_ipmac_del,
+		[IPSET_TEST] = bitmap_ipmac_test,
+	},
+	.destroy = bitmap_ipmac_destroy,
+	.flush	= bitmap_ipmac_flush,
+	.head	= bitmap_ipmac_head,
+	.list	= bitmap_ipmac_list,
+	.same_set = bitmap_ipmac_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tipmac = {
+	.kadt	= bitmap_ipmac_kadt,
+	.uadt	= bitmap_ipmac_uadt,
+	.adt	= {
+		[IPSET_ADD] = bitmap_ipmac_tadd,
+		[IPSET_DEL] = bitmap_ipmac_tdel,
+		[IPSET_TEST] = bitmap_ipmac_ttest,
+	},
+	.destroy = bitmap_ipmac_destroy,
+	.flush	= bitmap_ipmac_flush,
+	.head	= bitmap_ipmac_head,
+	.list	= bitmap_ipmac_tlist,
+	.same_set = bitmap_ipmac_same_set,
+};
+
+static void
+bitmap_ipmac_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct bitmap_ipmac *map = set->data;
+	struct ipmac_telem *elem;
+	u32 id, last = map->last_ip - map->first_ip;
+
+	/* Runs parallel with readers, add/del is locked out. Only
+	 * MAC_FILLED entries are reaped: MAC_UNSET ones keep a plain timeout */
+	read_lock_bh(&set->lock);
+	for (id = 0; id <= last; id++) {
+		elem = bitmap_ipmac_elem(map, id);
+		if (elem->match == MAC_FILLED &&
+		    ip_set_timeout_expired(elem->timeout))
+			elem->match = MAC_EMPTY;
+	}
+	read_unlock_bh(&set->lock);
+
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);	/* re-arm for the next pass */
+}
+
+static void
+bitmap_ipmac_gc_init(struct ip_set *set)
+{
+	struct bitmap_ipmac *map = set->data;
+
+	init_timer(&map->gc);
+	map->gc.data = (unsigned long) set;
+	map->gc.function = bitmap_ipmac_gc;
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/* Create bitmap:ip,mac type of sets */
+
+static bool
+init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
+	       u32 first_ip, u32 last_ip)
+{
+	map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize);
+	if (!map->members)
+		return false;
+	map->first_ip = first_ip;
+	map->last_ip = last_ip;
+	map->timeout = IPSET_NO_TIMEOUT;
+
+	set->data = map;
+	set->family = AF_INET;
+
+	return true;
+}
+
+static int
+bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
+		    u32 flags)
+{
+	u32 first_ip, last_ip, elements;
+	struct bitmap_ipmac *map;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip);
+		if (ret)
+			return ret;
+		if (first_ip > last_ip) {
+			u32 tmp = first_ip;
+
+			first_ip = last_ip;
+			last_ip = tmp;
+		}
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr >= 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		last_ip = first_ip | ~ip_set_hostmask(cidr);
+	} else
+		return -IPSET_ERR_PROTOCOL;
+
+	elements = last_ip - first_ip + 1;
+
+	/* elements is u32: a full 2^32 range wraps to 0, reject that too */
+	if (!elements || elements > IPSET_BITMAP_MAX_RANGE + 1)
+		return -IPSET_ERR_BITMAP_RANGE_SIZE;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		map->dsize = sizeof(struct ipmac_telem);
+
+		if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+			kfree(map);
+			return -ENOMEM;
+		}
+
+		map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = &bitmap_tipmac;
+
+		bitmap_ipmac_gc_init(set);
+	} else {
+		map->dsize = sizeof(struct ipmac_elem);
+
+		if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+			kfree(map);
+			return -ENOMEM;
+		}
+		set->variant = &bitmap_ipmac;
+	}
+	return 0;
+}
+
+static struct ip_set_type bitmap_ipmac_type = {
+	.name		= "bitmap:ip,mac",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_MAC,
+	.dimension	= IPSET_DIM_TWO,
+	.family		= AF_INET,
+	.revision	= 0,
+	.create		= bitmap_ipmac_create,
+	.create_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_ETHER]	= { .type = NLA_BINARY, .len  = ETH_ALEN },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+bitmap_ipmac_init(void)
+{
+	return ip_set_type_register(&bitmap_ipmac_type);
+}
+
+static void __exit
+bitmap_ipmac_fini(void)
+{
+	ip_set_type_unregister(&bitmap_ipmac_type);
+}
+
+module_init(bitmap_ipmac_init);
+module_exit(bitmap_ipmac_fini);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
new file mode 100644
index 00000000..6b38eb8f
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -0,0 +1,514 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the bitmap:port type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_bitmap.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#define IP_SET_BITMAP_TIMEOUT
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("bitmap:port type of IP sets");
+MODULE_ALIAS("ip_set_bitmap:port");
+
+/* Type structure */
+struct bitmap_port {
+	void *members;		/* bit array, or unsigned long per port with timeouts */
+	u16 first_port;		/* host byte order, included in range */
+	u16 last_port;		/* host byte order, included in range */
+	size_t memsize;		/* size of the members area in bytes */
+	u32 timeout;		/* default timeout, IPSET_NO_TIMEOUT if unused */
+	struct timer_list gc;	/* garbage collection timer (timeout variant) */
+};
+
+/* Base variant */
+
+/* Report (1 or 0) whether the bit for the port offset in *value is set */
+static int
+bitmap_port_test(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_port *map = set->data;
+
+	return test_bit(*(const u16 *)value, map->members) ? 1 : 0;
+}
+
+/* Set the bit for the port offset in *value; -IPSET_ERR_EXIST if already set */
+static int
+bitmap_port_add(struct ip_set *set, void *value, u32 timeout)
+{
+	const u16 *offset = value;
+	struct bitmap_port *map = set->data;
+	int was_set;
+
+	was_set = test_and_set_bit(*offset, map->members);
+	return was_set ? -IPSET_ERR_EXIST : 0;
+}
+
+/* Clear the bit for the port offset in *value; -IPSET_ERR_EXIST if not set */
+static int
+bitmap_port_del(struct ip_set *set, void *value, u32 timeout)
+{
+	const u16 *offset = value;
+	struct bitmap_port *map = set->data;
+	int was_set;
+
+	was_set = test_and_clear_bit(*offset, map->members);
+	return was_set ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Dump the set ports into skb, resuming at offset cb->args[2] */
+static int
+bitmap_port_list(const struct ip_set *set,
+		 struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct bitmap_port *map = set->data;
+	const u16 last = map->last_port - map->first_port;
+	const u16 first = cb->args[2];
+	struct nlattr *adt, *nested;
+	u16 id;
+
+	adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!adt)
+		return -EMSGSIZE;
+	for (; cb->args[2] <= last; cb->args[2]++) {
+		id = cb->args[2];
+		if (!test_bit(id, map->members))
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (id != first)
+				goto nla_put_failure;
+			/* Nothing dumped in this round: cancel the whole nest */
+			nla_nest_cancel(skb, adt);
+			return -EMSGSIZE;
+		}
+		NLA_PUT_NET16(skb, IPSET_ATTR_PORT,
+			      htons(map->first_port + id));
+		ipset_nest_end(skb, nested);
+	}
+	ipset_nest_end(skb, adt);
+	cb->args[2] = 0;	/* Set listing finished */
+	return 0;
+
+nla_put_failure:
+	/* Close the dump here; cb->args[2] stays at the unsent element */
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, adt);
+	if (likely(id != first))
+		return 0;
+	cb->args[2] = 0;
+	return -EMSGSIZE;
+}
+
+/* Timeout variant */
+
+/* Timeout variant: membership is what ip_set_timeout_test() says of the slot */
+static int
+bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+	const struct bitmap_port *map = set->data;
+	const unsigned long *slot = map->members;
+
+	return ip_set_timeout_test(slot[*(const u16 *)value]);
+}
+
+/* Timeout variant: store the timeout for the port offset in *value, or
+ * fail with -IPSET_ERR_EXIST if ip_set_timeout_test() accepts the slot */
+static int
+bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_port *map = set->data;
+	unsigned long *slot = map->members;
+
+	slot += *(u16 *)value;
+	if (ip_set_timeout_test(*slot))
+		return -IPSET_ERR_EXIST;
+	*slot = ip_set_timeout_set(timeout);
+	return 0;
+}
+
+/* Timeout variant: unset the slot unconditionally; the result only tells
+ * whether ip_set_timeout_test() accepted the old value */
+static int
+bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+	struct bitmap_port *map = set->data;
+	unsigned long *slot = map->members;
+	int ret;
+
+	slot += *(u16 *)value;
+	ret = ip_set_timeout_test(*slot) ? 0 : -IPSET_ERR_EXIST;
+	*slot = IPSET_ELEM_UNSET;
+	return ret;
+}
+
+/* Dump ports and their timeout values into skb, resuming at cb->args[2] */
+static int
+bitmap_port_tlist(const struct ip_set *set,
+		  struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct bitmap_port *map = set->data;
+	const unsigned long *members = map->members;
+	const u16 last = map->last_port - map->first_port;
+	const u16 first = cb->args[2];
+	struct nlattr *adt, *nested;
+	u16 id;
+
+	adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!adt)
+		return -EMSGSIZE;
+	for (; cb->args[2] <= last; cb->args[2]++) {
+		id = cb->args[2];
+		if (!ip_set_timeout_test(members[id]))
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (id != first)
+				goto nla_put_failure;
+			/* Nothing dumped in this round: cancel the whole nest */
+			nla_nest_cancel(skb, adt);
+			return -EMSGSIZE;
+		}
+		NLA_PUT_NET16(skb, IPSET_ATTR_PORT,
+			      htons(map->first_port + id));
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+			      htonl(ip_set_timeout_get(members[id])));
+		ipset_nest_end(skb, nested);
+	}
+	ipset_nest_end(skb, adt);
+
+	cb->args[2] = 0;	/* Set listing finished */
+	return 0;
+
+nla_put_failure:
+	/* Close the dump here; cb->args[2] stays at the unsent element */
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, adt);
+	if (likely(id != first))
+		return 0;
+	cb->args[2] = 0;
+	return -EMSGSIZE;
+}
+
+/* Kernel-side add/del/test: get the port via ip_set_get_ip_port(), check it
+ * against the range of the set and pass its offset to the adt function */
+static int
+bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
+		 enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	struct bitmap_port *map = set->data;
+	__be16 be_port;
+	u16 port, offset;
+
+	if (!ip_set_get_ip_port(skb, pf, flags & IPSET_DIM_ONE_SRC, &be_port))
+		return -EINVAL;
+
+	port = ntohs(be_port);
+	if (port < map->first_port || port > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	offset = port - map->first_port;
+
+	return set->variant->adt[adt](set, &offset, map->timeout);
+}
+
+/* Userspace add/del/test of a port or, for add/del, a range of ports.
+ * A PORT_TO below PORT is accepted: the two values are swapped */
+static int
+bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
+		 enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	struct bitmap_port *map = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 timeout = map->timeout;
+	u32 port;	/* wraparound */
+	u16 id, port_to;
+	int ret;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	if (port < map->first_port || port > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		/* A per-element timeout needs a set created with timeout */
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST) {
+		/* Testing takes a single port only, PORT_TO is ignored */
+		id = port - map->first_port;
+		return adtfn(set, &id, timeout);
+	}
+
+	port_to = port;
+	if (tb[IPSET_ATTR_PORT_TO])
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to) {
+		swap(port, port_to);
+		if (port < map->first_port)
+			return -IPSET_ERR_BITMAP_RANGE;
+	}
+	if (port_to > map->last_port)
+		return -IPSET_ERR_BITMAP_RANGE;
+
+	for (; port <= port_to; port++) {
+		id = port - map->first_port;
+		ret = adtfn(set, &id, timeout);
+		/* An existence error is not fatal when ip_set_eexist() says so */
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+	}
+	return 0;
+}
+
+static void
+bitmap_port_destroy(struct ip_set *set)
+{
+	struct bitmap_port *map = set->data;
+	/* Stop the gc timer (timeout variant only) before freeing what it reads */
+	if (with_timeout(map->timeout))
+		del_timer_sync(&map->gc);
+	/* members may come from kmalloc or vmalloc, so use ip_set_free() */
+	ip_set_free(map->members);
+	kfree(map);
+
+	set->data = NULL;
+}
+
+static void
+bitmap_port_flush(struct ip_set *set)
+{
+	struct bitmap_port *map = set->data;
+
+	memset(map->members, 0, map->memsize);
+}
+
+static int
+bitmap_port_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct bitmap_port *map = set->data;
+	struct nlattr *nested;
+
+	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!nested)
+		goto nla_put_failure;
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port));
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+		      htonl(sizeof(*map) + map->memsize));
+	if (with_timeout(map->timeout))
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+	ipset_nest_end(skb, nested);
+
+	return 0;
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/* Two bitmap:port sets are the same if the range and the timeout agree */
+static bool
+bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct bitmap_port *x = a->data, *y = b->data;
+
+	if (x->first_port != y->first_port || x->last_port != y->last_port)
+		return false;
+	return x->timeout == y->timeout;
+}
+
+static const struct ip_set_type_variant bitmap_port = {
+	.kadt	= bitmap_port_kadt,
+	.uadt	= bitmap_port_uadt,
+	.adt	= {
+		[IPSET_ADD] = bitmap_port_add,
+		[IPSET_DEL] = bitmap_port_del,
+		[IPSET_TEST] = bitmap_port_test,
+	},
+	.destroy = bitmap_port_destroy,
+	.flush	= bitmap_port_flush,
+	.head	= bitmap_port_head,
+	.list	= bitmap_port_list,
+	.same_set = bitmap_port_same_set,
+};
+
+static const struct ip_set_type_variant bitmap_tport = {
+	.kadt	= bitmap_port_kadt,
+	.uadt	= bitmap_port_uadt,
+	.adt	= {
+		[IPSET_ADD] = bitmap_port_tadd,
+		[IPSET_DEL] = bitmap_port_tdel,
+		[IPSET_TEST] = bitmap_port_ttest,
+	},
+	.destroy = bitmap_port_destroy,
+	.flush	= bitmap_port_flush,
+	.head	= bitmap_port_head,
+	.list	= bitmap_port_tlist,
+	.same_set = bitmap_port_same_set,
+};
+
+/* Timer callback: unset the expired elements and re-arm the gc timer */
+static void
+bitmap_port_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct bitmap_port *map = set->data;
+	unsigned long *slot = map->members;
+	unsigned long *end = slot + (u16)(map->last_port - map->first_port);
+
+	/* We run parallel with other readers (test element)
+	 * but adding/deleting new entries is locked out */
+	read_lock_bh(&set->lock);
+	for (; slot <= end; slot++)
+		if (ip_set_timeout_expired(*slot))
+			*slot = IPSET_ELEM_UNSET;
+	read_unlock_bh(&set->lock);
+
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+static void
+bitmap_port_gc_init(struct ip_set *set)
+{
+	struct bitmap_port *map = set->data;
+
+	init_timer(&map->gc);
+	map->gc.data = (unsigned long) set;
+	map->gc.function = bitmap_port_gc;
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/* Create bitmap:port type of sets */
+
+/* Allocate map->members (map->memsize bytes) and fill out the common fields */
+static bool
+init_map_port(struct ip_set *set, struct bitmap_port *map,
+	      u16 first_port, u16 last_port)
+{
+	map->members = ip_set_alloc(map->memsize);
+	if (map->members == NULL)
+		return false;
+
+	map->first_port = first_port;
+	map->last_port = last_port;
+	map->timeout = IPSET_NO_TIMEOUT;
+	set->data = map;
+	set->family = AF_UNSPEC;
+	return true;
+}
+
+/* Create a bitmap:port set from the PORT, PORT_TO and optional TIMEOUT
+ * attributes; with TIMEOUT the timeout variant is set up and gc started */
+static int
+bitmap_port_create(struct ip_set *set, struct nlattr *tb[],
+		 u32 flags)
+{
+	struct bitmap_port *map;
+	u16 first_port, last_port;
+
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
+	last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	/* Accept the range given in reverse order */
+	if (first_port > last_port)
+		swap(first_port, last_port);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	if (!tb[IPSET_ATTR_TIMEOUT]) {
+		/* Plain variant: members is used as a bit array */
+		map->memsize = bitmap_bytes(0, last_port - first_port);
+		pr_debug("memsize: %zu\n", map->memsize);
+		if (!init_map_port(set, map, first_port, last_port))
+			goto nomem;
+		set->variant = &bitmap_port;
+		return 0;
+	}
+
+	/* Timeout variant: one unsigned long per port */
+	map->memsize = (last_port - first_port + 1)
+		       * sizeof(unsigned long);
+	if (!init_map_port(set, map, first_port, last_port))
+		goto nomem;
+
+	/* init_map_port() reset the timeout, so set it only now */
+	map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	set->variant = &bitmap_tport;
+	bitmap_port_gc_init(set);
+	return 0;
+
+nomem:
+	kfree(map);
+	return -ENOMEM;
+}
+
+static struct ip_set_type bitmap_port_type = {
+	.name		= "bitmap:port",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_PORT,
+	.dimension	= IPSET_DIM_ONE,
+	.family		= AF_UNSPEC,
+	.revision	= 0,
+	.create		= bitmap_port_create,
+	.create_policy	= {
+		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+bitmap_port_init(void)
+{
+	return ip_set_type_register(&bitmap_port_type);
+}
+
+static void __exit
+bitmap_port_fini(void)
+{
+	ip_set_type_unregister(&bitmap_port_type);
+}
+
+module_init(bitmap_port_init);
+module_exit(bitmap_port_fini);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
new file mode 100644
index 00000000..42aa64b6
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -0,0 +1,1708 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ *                         Patrick Schaaf <bof@bof.de>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module for IP set management */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/rculist.h>
+#include <linux/version.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+static LIST_HEAD(ip_set_type_list);		/* all registered set types */
+static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */
+static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */
+
+static struct ip_set **ip_set_list;		/* all individual sets */
+static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
+
+#define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0)
+
+static unsigned int max_sets;
+
+module_param(max_sets, int, 0600);
+MODULE_PARM_DESC(max_sets, "maximal number of sets");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("core IP set support");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
+
+/*
+ * The set types are implemented in modules and registered set types
+ * can be found in ip_set_type_list. Adding/deleting types is
+ * serialized by ip_set_type_mutex.
+ */
+
+static inline void
+ip_set_type_lock(void)
+{
+	mutex_lock(&ip_set_type_mutex);
+}
+
+static inline void
+ip_set_type_unlock(void)
+{
+	mutex_unlock(&ip_set_type_mutex);
+}
+
+/* Register and deregister settype */
+
+/* Find a registered type by name and revision; AF_UNSPEC matches any family */
+static struct ip_set_type *
+find_set_type(const char *name, u8 family, u8 revision)
+{
+	struct ip_set_type *type;
+
+	list_for_each_entry_rcu(type, &ip_set_type_list, list)
+		if (type->revision == revision && STREQ(type->name, name) &&
+		    (type->family == AF_UNSPEC || type->family == family))
+			return type;
+	return NULL;
+}
+
+/* Drop the nfnl mutex, try to load the type module, take the mutex again */
+static int
+try_to_load_type(const char *name)
+{
+	int err;
+
+	nfnl_unlock();
+	pr_debug("try to load ip_set_%s\n", name);
+	err = request_module("ip_set_%s", name);
+	if (err < 0)
+		pr_warning("Can't find ip_set type %s\n", name);
+	nfnl_lock();
+	return err < 0 ? -IPSET_ERR_FIND_TYPE : -EAGAIN;
+}
+
+/* Find a set type and reference it */
+static int
+find_set_type_get(const char *name, u8 family, u8 revision,
+		  struct ip_set_type **found)
+{
+	struct ip_set_type *type;
+	int err;
+
+	rcu_read_lock();
+	*found = find_set_type(name, family, revision);
+	if (*found) {
+		err = !try_module_get((*found)->me) ? -EFAULT : 0;
+		goto unlock;
+	}
+	/* Make sure the type is loaded but we don't support the revision */
+	list_for_each_entry_rcu(type, &ip_set_type_list, list)
+		if (STREQ(type->name, name)) {
+			err = -IPSET_ERR_FIND_TYPE;
+			goto unlock;
+		}
+	rcu_read_unlock();
+
+	return try_to_load_type(name);
+
+unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+/* Find a given set type by name and family.
+ * If we succeeded, the supported minimal and maximum revisions are
+ * filled out.
+ */
+static int
+find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max)
+{
+	struct ip_set_type *type;
+	bool found = false;
+
+	*min = 255; *max = 0;
+	rcu_read_lock();
+	list_for_each_entry_rcu(type, &ip_set_type_list, list)
+		if (STREQ(type->name, name) &&
+		    (type->family == family || type->family == AF_UNSPEC)) {
+			found = true;
+			if (type->revision < *min)
+				*min = type->revision;
+			if (type->revision > *max)
+				*max = type->revision;
+		}
+	rcu_read_unlock();
+	if (found)
+		return 0;
+
+	return try_to_load_type(name);
+}
+
+#define family_name(f)	((f) == AF_INET ? "inet" : \
+			 (f) == AF_INET6 ? "inet6" : "any")
+
+/* Register a set type structure. The type is identified by
+ * the unique triple of name, family and revision.
+ */
+int
+ip_set_type_register(struct ip_set_type *type)
+{
+	int ret = 0;
+
+	if (type->protocol != IPSET_PROTOCOL) {
+		pr_warning("ip_set type %s, family %s, revision %u uses "
+			   "wrong protocol version %u (want %u)\n",
+			   type->name, family_name(type->family),
+			   type->revision, type->protocol, IPSET_PROTOCOL);
+		return -EINVAL;
+	}
+
+	ip_set_type_lock();
+	if (find_set_type(type->name, type->family, type->revision)) {
+		/* Duplicate! */
+		pr_warning("ip_set type %s, family %s, revision %u "
+			   "already registered!\n", type->name,
+			   family_name(type->family), type->revision);
+		ret = -EINVAL;
+		goto unlock;
+	}
+	list_add_rcu(&type->list, &ip_set_type_list);
+	pr_debug("type %s, family %s, revision %u registered.\n",
+		 type->name, family_name(type->family), type->revision);
+unlock:
+	ip_set_type_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_type_register);
+
+/* Unregister a set type. There's a small race with ip_set_create */
+void
+ip_set_type_unregister(struct ip_set_type *type)
+{
+	ip_set_type_lock();
+	if (!find_set_type(type->name, type->family, type->revision)) {
+		pr_warning("ip_set type %s, family %s, revision %u "
+			   "not registered\n", type->name,
+			   family_name(type->family), type->revision);
+		goto unlock;
+	}
+	list_del_rcu(&type->list);
+	pr_debug("type %s, family %s, revision %u unregistered.\n",
+		 type->name, family_name(type->family), type->revision);
+unlock:
+	ip_set_type_unlock();
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ip_set_type_unregister);
+
+/* Utility functions */
+/* Allocate zeroed memory: kmalloc is tried first for sizes below
+ * KMALLOC_MAX_SIZE, vmalloc is the fallback. Free with ip_set_free(). */
+void *
+ip_set_alloc(size_t size)
+{
+	void *members;
+
+	if (size < KMALLOC_MAX_SIZE) {
+		members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+		if (members) {
+			pr_debug("%p: allocated with kmalloc\n", members);
+			return members;
+		}
+	}
+
+	members = vzalloc(size);
+	if (members)
+		pr_debug("%p: allocated with vmalloc\n", members);
+	return members;
+}
+EXPORT_SYMBOL_GPL(ip_set_alloc);
+
+void
+ip_set_free(void *members)
+{
+	pr_debug("%p: free with %s\n", members,
+		 is_vmalloc_addr(members) ? "vfree" : "kfree");
+	if (is_vmalloc_addr(members))
+		vfree(members);
+	else
+		kfree(members);
+}
+EXPORT_SYMBOL_GPL(ip_set_free);
+
+static inline bool
+flag_nested(const struct nlattr *nla)
+{
+	return nla->nla_type & NLA_F_NESTED;
+}
+
+static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
+	[IPSET_ATTR_IPADDR_IPV4]	= { .type = NLA_U32 },
+	[IPSET_ATTR_IPADDR_IPV6]	= { .type = NLA_BINARY,
+					    .len = sizeof(struct in6_addr) },
+};
+
+int
+ip_set_get_ipaddr4(struct nlattr *nla,  __be32 *ipaddr)
+{
+	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+	if (unlikely(!flag_nested(nla)))
+		return -IPSET_ERR_PROTOCOL;
+	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
+		return -IPSET_ERR_PROTOCOL;
+
+	*ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
+
+int
+ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
+{
+	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+
+	if (unlikely(!flag_nested(nla)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy))
+		return -IPSET_ERR_PROTOCOL;
+	if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
+		return -IPSET_ERR_PROTOCOL;
+
+	memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
+		sizeof(struct in6_addr));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+
+/*
+ * Creating/destroying/renaming/swapping affect the existence and
+ * the properties of a set. All of these can be executed from userspace
+ * only and serialized by the nfnl mutex indirectly from nfnetlink.
+ *
+ * Sets are identified by their index in ip_set_list and the index
+ * is used by the external references (set/SET netfilter modules).
+ *
+ * The set behind an index may change by swapping only, from userspace.
+ */
+
+static inline void
+__ip_set_get(ip_set_id_t index)
+{
+	write_lock_bh(&ip_set_ref_lock);
+	ip_set_list[index]->ref++;
+	write_unlock_bh(&ip_set_ref_lock);
+}
+
+static inline void
+__ip_set_put(ip_set_id_t index)
+{
+	write_lock_bh(&ip_set_ref_lock);
+	BUG_ON(ip_set_list[index]->ref == 0);
+	ip_set_list[index]->ref--;
+	write_unlock_bh(&ip_set_ref_lock);
+}
+
+/*
+ * Add, del and test set entries from kernel.
+ *
+ * The set behind the index must exist and must be referenced
+ * so it can't be destroyed (or changed) under our foot.
+ */
+
+int
+ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+	    u8 family, u8 dim, u8 flags)
+{
+	struct ip_set *set = ip_set_list[index];
+	int ret = 0;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (dim < set->type->dimension ||
+	    !(family == set->family || set->family == AF_UNSPEC))
+		return 0;
+
+	read_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, IPSET_TEST, family, dim, flags);
+	read_unlock_bh(&set->lock);
+
+	if (ret == -EAGAIN) {
+		/* Type requests element to be completed */
+		pr_debug("element must be competed, ADD is triggered\n");
+		write_lock_bh(&set->lock);
+		set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+		write_unlock_bh(&set->lock);
+		ret = 1;
+	}
+
+	/* Convert error codes to nomatch */
+	return (ret < 0 ? 0 : ret);
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+	   u8 family, u8 dim, u8 flags)
+{
+	struct ip_set *set = ip_set_list[index];
+	int ret;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (dim < set->type->dimension ||
+	    !(family == set->family || set->family == AF_UNSPEC))
+		return 0;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, IPSET_ADD, family, dim, flags);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+	   u8 family, u8 dim, u8 flags)
+{
+	struct ip_set *set = ip_set_list[index];
+	int ret = 0;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (dim < set->type->dimension ||
+	    !(family == set->family || set->family == AF_UNSPEC))
+		return 0;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, IPSET_DEL, family, dim, flags);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet. Returns the index
+ * or IPSET_INVALID_ID; *set is written only if the set is found.
+ */
+ip_set_id_t
+ip_set_get_byname(const char *name, struct ip_set **set)
+{
+	ip_set_id_t i;
+	struct ip_set *s;
+
+	for (i = 0; i < ip_set_max; i++) {
+		s = ip_set_list[i];
+		if (s != NULL && STREQ(s->name, name)) {
+			/* ip_set_create() refuses clashing names: stop here */
+			__ip_set_get(i);
+			*set = s;
+			return i;
+		}
+	}
+	return IPSET_INVALID_ID;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * If the given index refers to a valid set, decrement its
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ */
+void
+ip_set_put_byindex(ip_set_id_t index)
+{
+	if (ip_set_list[index] != NULL)
+		__ip_set_put(index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ */
+const char *
+ip_set_name_byindex(ip_set_id_t index)
+{
+	const struct ip_set *set = ip_set_list[index];
+
+	BUG_ON(set == NULL);
+	BUG_ON(set->ref == 0);
+
+	/* Referenced, so it's safe */
+	return set->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to call by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get(const char *name)
+{
+	struct ip_set *s;
+	ip_set_id_t index;
+
+	nfnl_lock();
+	index = ip_set_get_byname(name, &s);
+	nfnl_unlock();
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to, does not go away under our feet. Returns the index,
+ * or IPSET_INVALID_ID if it is out of range or no set is behind it.
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(ip_set_id_t index)
+{
+	/* ip_set_list is only scanned below ip_set_max, which is no index */
+	if (index >= ip_set_max)
+		return IPSET_INVALID_ID;
+
+	nfnl_lock();
+	if (ip_set_list[index])
+		__ip_set_get(index);
+	else
+		index = IPSET_INVALID_ID;
+	nfnl_unlock();
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given index refers to a valid set, decrement its
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ * The nfnl mutex is used in the function.
+ */
+void
+ip_set_nfnl_put(ip_set_id_t index)
+{
+	nfnl_lock();
+	ip_set_put_byindex(index);
+	nfnl_unlock();
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
+
+/*
+ * Communication protocol with userspace over netlink.
+ *
+ * The commands are serialized by the nfnl mutex.
+ */
+
+static inline bool
+protocol_failed(const struct nlattr * const tb[])
+{
+	return !tb[IPSET_ATTR_PROTOCOL] ||
+	       nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL;
+}
+
+static inline u32
+flag_exist(const struct nlmsghdr *nlh)
+{
+	return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST;
+}
+
+static struct nlmsghdr *
+start_msg(struct sk_buff *skb, u32 pid, u32 seq, unsigned int flags,
+	  enum ipset_cmd cmd)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+
+	nlh = nlmsg_put(skb, pid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
+			sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		return NULL;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	return nlh;
+}
+
+/* Create a set */
+
+static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1},
+	[IPSET_ATTR_REVISION]	= { .type = NLA_U8 },
+	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
+	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
+};
+
+/* Return the index of the set called name, IPSET_INVALID_ID if none */
+static ip_set_id_t
+find_set_id(const char *name)
+{
+	ip_set_id_t i;
+
+	for (i = 0; i < ip_set_max; i++)
+		if (ip_set_list[i] != NULL &&
+		    STREQ(ip_set_list[i]->name, name))
+			return i;
+
+	return IPSET_INVALID_ID;
+}
+
+static inline struct ip_set *
+find_set(const char *name)
+{
+	ip_set_id_t index = find_set_id(name);
+
+	return index == IPSET_INVALID_ID ? NULL : ip_set_list[index];
+}
+
+/* Find the first free slot for a new set while checking name clashes.
+ * On a clash -EEXIST is returned and *set points to the existing set. */
+static int
+find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
+{
+	ip_set_id_t i;
+
+	*index = IPSET_INVALID_ID;
+	for (i = 0; i < ip_set_max; i++) {
+		struct ip_set *s = ip_set_list[i];
+
+		if (s != NULL && STREQ(name, s->name)) {
+			*set = s;	/* Name clash */
+			return -EEXIST;
+		}
+		if (s == NULL && *index == IPSET_INVALID_ID)
+			*index = i;
+	}
+	/* No free slot remained? */
+	return *index == IPSET_INVALID_ID ? -IPSET_ERR_MAX_SETS : 0;
+}
+
+static int
+ip_set_create(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	struct ip_set *set, *clash = NULL;
+	ip_set_id_t index = IPSET_INVALID_ID;
+	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+	const char *name, *typename;
+	u8 family, revision;
+	u32 flags = flag_exist(nlh);
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_TYPENAME] == NULL ||
+		     attr[IPSET_ATTR_REVISION] == NULL ||
+		     attr[IPSET_ATTR_FAMILY] == NULL ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA]))))
+		return -IPSET_ERR_PROTOCOL;
+
+	name = nla_data(attr[IPSET_ATTR_SETNAME]);
+	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+	revision = nla_get_u8(attr[IPSET_ATTR_REVISION]);
+	pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
+		 name, typename, family_name(family), revision);
+
+	/*
+	 * First, and without any locks, allocate and initialize
+	 * a normal base set structure.
+	 */
+	set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+	if (!set)
+		return -ENOMEM;
+	rwlock_init(&set->lock);
+	strlcpy(set->name, name, IPSET_MAXNAMELEN);
+	set->family = family;
+
+	/*
+	 * Next, check that we know the type, and take
+	 * a reference on the type, to make sure it stays available
+	 * while constructing our new set.
+	 *
+	 * After referencing the type, we try to create the type
+	 * specific part of the set without holding any locks.
+	 */
+	ret = find_set_type_get(typename, family, revision, &(set->type));
+	if (ret)
+		goto out;
+
+	/*
+	 * Without holding any locks, create private part.
+	 */
+	if (attr[IPSET_ATTR_DATA] &&
+	    nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+			     set->type->create_policy)) {
+	    	ret = -IPSET_ERR_PROTOCOL;
+	    	goto put_out;
+	}
+
+	ret = set->type->create(set, tb, flags);
+	if (ret != 0)
+		goto put_out;
+
+	/* BTW, ret==0 here. */
+
+	/*
+	 * Here, we have a valid, constructed set and we are protected
+	 * by the nfnl mutex. Find the first free index in ip_set_list
+	 * and check clashing.
+	 */
+	if ((ret = find_free_id(set->name, &index, &clash)) != 0) {
+		/* If this is the same set and requested, ignore error */
+		if (ret == -EEXIST &&
+		    (flags & IPSET_FLAG_EXIST) &&
+		    STREQ(set->type->name, clash->type->name) &&
+		    set->type->family == clash->type->family &&
+		    set->type->revision == clash->type->revision &&
+		    set->variant->same_set(set, clash))
+			ret = 0;
+		goto cleanup;
+	}
+
+	/*
+	 * Finally! Add our shiny new set to the list, and be done.
+	 */
+	pr_debug("create: '%s' created with index %u!\n", set->name, index);
+	ip_set_list[index] = set;
+
+	return ret;
+
+cleanup:
+	set->variant->destroy(set);
+put_out:
+	module_put(set->type->me);
+out:
+	kfree(set);
+	return ret;
+}
+
+/* Destroy sets */
+
+static const struct nla_policy
+ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+};
+
+/* Remove the set stored at @index from ip_set_list and free it,
+ * dropping the module reference held on its type.
+ */
+static void
+ip_set_destroy_set(ip_set_id_t index)
+{
+	struct ip_set *doomed;
+
+	doomed = ip_set_list[index];
+	pr_debug("set: %s\n", doomed->name);
+
+	/* Clear the slot before tearing the set down */
+	ip_set_list[index] = NULL;
+
+	/* The type specific destructor must run with no lock held */
+	doomed->variant->destroy(doomed);
+	module_put(doomed->type->me);
+	kfree(doomed);
+}
+
+/*
+ * IPSET_CMD_DESTROY handler: destroy the set named by IPSET_ATTR_SETNAME,
+ * or every existing set when that attribute is absent.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL when the protocol check
+ * fails, -ENOENT when the named set is not found and -IPSET_ERR_BUSY
+ * when a set to be destroyed has a non-zero reference count.
+ */
+static int
+ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
+	       const struct nlmsghdr *nlh,
+	       const struct nlattr * const attr[])
+{
+	ip_set_id_t i;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr)))
+		return -IPSET_ERR_PROTOCOL;
+
+	/* Commands are serialized and references are
+	 * protected by the ip_set_ref_lock.
+	 * External systems (i.e. xt_set) must call
+	 * ip_set_put|get_nfnl_* functions, that way we
+	 * can safely check references here.
+	 *
+	 * list:set timer can only decrement the reference
+	 * counter, so if it's already zero, we can proceed
+	 * without holding the lock.
+	 */
+	read_lock_bh(&ip_set_ref_lock);
+	if (!attr[IPSET_ATTR_SETNAME]) {
+		/* Destroy all: first make sure no set is referenced, so
+		 * that either every set is destroyed or none is.
+		 */
+		for (i = 0; i < ip_set_max; i++) {
+			if (ip_set_list[i] != NULL && ip_set_list[i]->ref) {
+				ret = -IPSET_ERR_BUSY;
+				goto out;
+			}
+		}
+		/* The lock is dropped before the sets are destroyed */
+		read_unlock_bh(&ip_set_ref_lock);
+		for (i = 0; i < ip_set_max; i++) {
+			if (ip_set_list[i] != NULL)
+				ip_set_destroy_set(i);
+		}
+	} else {
+		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+		if (i == IPSET_INVALID_ID) {
+			ret = -ENOENT;
+			goto out;
+		} else if (ip_set_list[i]->ref) {
+			ret = -IPSET_ERR_BUSY;
+			goto out;
+		}
+		read_unlock_bh(&ip_set_ref_lock);
+
+		ip_set_destroy_set(i);
+	}
+	return 0;
+out:
+	/* Error exit: the reference lock is still held here */
+	read_unlock_bh(&ip_set_ref_lock);
+	return ret;
+}
+
+/* Flush sets */
+
+/* Empty a single set by calling the flush method of its variant while
+ * holding the set's lock for writing.
+ */
+static void
+ip_set_flush_set(struct ip_set *set)
+{
+	pr_debug("set: %s\n", set->name);
+
+	/* Write lock held for the whole flush */
+	write_lock_bh(&set->lock);
+	set->variant->flush(set);
+	write_unlock_bh(&set->lock);
+}
+
+/*
+ * IPSET_CMD_FLUSH handler: flush the set named by IPSET_ATTR_SETNAME, or
+ * all existing sets when no name is supplied.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL when the protocol check
+ * fails and -ENOENT when the named set does not exist.
+ */
+static int
+ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	ip_set_id_t id;
+
+	if (unlikely(protocol_failed(attr)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (attr[IPSET_ATTR_SETNAME]) {
+		/* A name was given: flush only that set */
+		id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+		if (id == IPSET_INVALID_ID)
+			return -ENOENT;
+
+		ip_set_flush_set(ip_set_list[id]);
+		return 0;
+	}
+
+	/* No name: walk the whole list and flush whatever exists */
+	for (id = 0; id < ip_set_max; id++) {
+		if (ip_set_list[id] == NULL)
+			continue;
+		ip_set_flush_set(ip_set_list[id]);
+	}
+
+	return 0;
+}
+
+/* Rename a set */
+
+/* Attribute policy for commands operating on two set names (rename and
+ * swap): both names are NUL-terminated strings with a length limit of
+ * IPSET_MAXNAMELEN - 1.
+ */
+static const struct nla_policy
+ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+	[IPSET_ATTR_SETNAME2]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+};
+
+/*
+ * IPSET_CMD_RENAME handler: give the set named by IPSET_ATTR_SETNAME the
+ * name carried in IPSET_ATTR_SETNAME2.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL when the protocol check fails
+ * or a name attribute is missing, -ENOENT when the set is not found,
+ * -IPSET_ERR_REFERENCED when the set has a non-zero reference count and
+ * -IPSET_ERR_EXIST_SETNAME2 when an existing set already has the new name.
+ */
+static int
+ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	const char *name2;
+	ip_set_id_t i;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	read_lock_bh(&ip_set_ref_lock);
+	if (set->ref != 0) {
+		ret = -IPSET_ERR_REFERENCED;
+		goto out;
+	}
+
+	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
+	/* The new name must not be in use by any existing set */
+	for (i = 0; i < ip_set_max; i++) {
+		if (ip_set_list[i] != NULL &&
+		    STREQ(ip_set_list[i]->name, name2)) {
+			ret = -IPSET_ERR_EXIST_SETNAME2;
+			goto out;
+		}
+	}
+	/* NOTE(review): strncpy() only leaves set->name NUL-terminated if
+	 * name2 is shorter than IPSET_MAXNAMELEN; presumably guaranteed by
+	 * ip_set_setname2_policy being applied before this handler runs -
+	 * confirm against the nfnetlink dispatcher.
+	 */
+	strncpy(set->name, name2, IPSET_MAXNAMELEN);
+
+out:
+	read_unlock_bh(&ip_set_ref_lock);
+	return ret;
+}
+
+/* Swap two sets so that name/index points to the other.
+ * References and set names are also swapped.
+ *
+ * The commands are serialized by the nfnl mutex and references are
+ * protected by the ip_set_ref_lock. The kernel interfaces
+ * do not hold the mutex but the pointer settings are atomic
+ * so the ip_set_list always contains valid pointers to the sets.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL when the protocol check fails
+ * or a name attribute is missing, -ENOENT when the first set is not
+ * found, -IPSET_ERR_EXIST_SETNAME2 when the second set is not found and
+ * -IPSET_ERR_TYPE_MISMATCH when the type features or families differ.
+ */
+
+static int
+ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *from, *to;
+	ip_set_id_t from_id, to_id;
+	char from_name[IPSET_MAXNAMELEN];
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (from_id == IPSET_INVALID_ID)
+		return -ENOENT;
+
+	to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2]));
+	if (to_id == IPSET_INVALID_ID)
+		return -IPSET_ERR_EXIST_SETNAME2;
+
+	from = ip_set_list[from_id];
+	to = ip_set_list[to_id];
+
+	/* Features must not change.
+	 * Not an artificial restriction anymore, as we must prevent
+	 * possible loops created by swapping in setlist type of sets. */
+	if (!(from->type->features == to->type->features &&
+	      from->type->family == to->type->family))
+		return -IPSET_ERR_TYPE_MISMATCH;
+
+	/* Exchange the names through a temporary buffer; this happens
+	 * before ip_set_ref_lock is taken.
+	 */
+	strncpy(from_name, from->name, IPSET_MAXNAMELEN);
+	strncpy(from->name, to->name, IPSET_MAXNAMELEN);
+	strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+
+	/* Reference counts and list slots are exchanged under the write lock */
+	write_lock_bh(&ip_set_ref_lock);
+	swap(from->ref, to->ref);
+	ip_set_list[from_id] = to;
+	ip_set_list[to_id] = from;
+	write_unlock_bh(&ip_set_ref_lock);
+
+	return 0;
+}
+
+/* List/save set data */
+
+/* Dump mode, kept in cb->args[0] between dump callback invocations */
+#define DUMP_INIT	0L	/* dump_init() has not run yet */
+#define DUMP_ALL	1L	/* all sets, first pass: sets without IPSET_DUMP_LAST */
+#define DUMP_ONE	2L	/* only the set whose index is in cb->args[1] */
+#define DUMP_LAST	3L	/* all sets, second pass: sets with IPSET_DUMP_LAST */
+
+/* Dump completion callback: if cb->args[2] is still non-zero, release
+ * the set whose index is stored in cb->args[1]. Always returns 0.
+ */
+static int
+ip_set_dump_done(struct netlink_callback *cb)
+{
+	/* Nothing pending, nothing to release */
+	if (!cb->args[2])
+		return 0;
+
+	pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
+	ip_set_put_byindex((ip_set_id_t) cb->args[1]);
+
+	return 0;
+}
+
+/* Debug aid: print type and length of every attribute that follows the
+ * nfgenmsg header in the given netlink message.
+ */
+static inline void
+dump_attrs(struct nlmsghdr *nlh)
+{
+	const struct nlattr *nla;
+	int remaining;
+
+	pr_debug("dump nlmsg\n");
+	nlmsg_for_each_attr(nla, nlh, sizeof(struct nfgenmsg), remaining)
+		pr_debug("type: %u, len %u\n", nla_type(nla), nla->nla_len);
+}
+
+/*
+ * Initialise the dump state from the original request:
+ *
+ * cb->args[0] : DUMP_ONE when a set name was given, DUMP_ALL otherwise
+ * cb->args[1] : index of the named set (DUMP_ONE only)
+ * cb->args[..]: type specific
+ *
+ * Returns 0 on success or -ENOENT when the named set does not exist.
+ */
+static int
+dump_init(struct netlink_callback *cb)
+{
+	struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+	struct nlmsghdr *req = nlmsg_hdr(cb->skb);
+	int hdr_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+	struct nlattr *first = (void *)req + hdr_len;
+	ip_set_id_t id;
+
+	/* Second pass, so parser can't fail */
+	nla_parse(cda, IPSET_ATTR_CMD_MAX,
+		  first, req->nlmsg_len - hdr_len, ip_set_setname_policy);
+
+	if (cda[IPSET_ATTR_SETNAME]) {
+		id = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME]));
+		if (id == IPSET_INVALID_ID)
+			return -ENOENT;
+
+		cb->args[0] = DUMP_ONE;
+		cb->args[1] = id;
+	} else {
+		cb->args[0] = DUMP_ALL;
+	}
+
+	return 0;
+}
+
+/*
+ * Netlink dump callback for IPSET_CMD_LIST/IPSET_CMD_SAVE.
+ *
+ * Each invocation writes at most one IPSET_CMD_LIST message, for one
+ * set, into skb. State is carried between invocations in cb->args:
+ *   args[0]  dump mode (DUMP_INIT, DUMP_ALL, DUMP_ONE or DUMP_LAST)
+ *   args[1]  index of the set being (or next to be) dumped
+ *   args[2]  zero when the listing of a set starts; non-zero means the
+ *            set at args[1] is only partially listed and still
+ *            referenced (presumably maintained by the type's list()
+ *            method - not visible here)
+ *
+ * Returns skb->len, or a negative error code.
+ */
+static int
+ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	ip_set_id_t index = IPSET_INVALID_ID, max;
+	struct ip_set *set = NULL;
+	struct nlmsghdr *nlh = NULL;
+	unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0;
+	int ret = 0;
+
+	if (cb->args[0] == DUMP_INIT) {
+		ret = dump_init(cb);
+		if (ret < 0) {
+			nlh = nlmsg_hdr(cb->skb);
+			/* We have to create and send the error message
+			 * manually :-( */
+			if (nlh->nlmsg_flags & NLM_F_ACK)
+				netlink_ack(cb->skb, nlh, ret);
+			return ret;
+		}
+	}
+
+	/* Also taken once a DUMP_ONE dump has finished: args[1] is then
+	 * IPSET_INVALID_ID (see below).
+	 */
+	if (cb->args[1] >= ip_set_max)
+		goto out;
+
+	max = cb->args[0] == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+dump_last:
+	pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]);
+	for (; cb->args[1] < max; cb->args[1]++) {
+		index = (ip_set_id_t) cb->args[1];
+		set = ip_set_list[index];
+		if (set == NULL) {
+			if (cb->args[0] == DUMP_ONE) {
+				ret = -ENOENT;
+				goto out;
+			}
+			continue;
+		}
+		/* When dumping all sets, we must dump "sorted"
+		 * so that lists (unions of sets) are dumped last.
+		 */
+		if (cb->args[0] != DUMP_ONE &&
+		    ((cb->args[0] == DUMP_ALL) ==
+		     !!(set->type->features & IPSET_DUMP_LAST)))
+			continue;
+		pr_debug("List set: %s\n", set->name);
+		if (!cb->args[2]) {
+			/* Start listing: make sure set won't be destroyed */
+			pr_debug("reference set\n");
+			__ip_set_get(index);
+		}
+		nlh = start_msg(skb, NETLINK_CB(cb->skb).pid,
+				cb->nlh->nlmsg_seq, flags,
+				IPSET_CMD_LIST);
+		if (!nlh) {
+			ret = -EMSGSIZE;
+			goto release_refcount;
+		}
+		/* The NLA_PUT_* macros jump to nla_put_failure on error */
+		NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+		NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name);
+		switch (cb->args[2]) {
+		case 0:
+			/* Core header data */
+			NLA_PUT_STRING(skb, IPSET_ATTR_TYPENAME,
+				       set->type->name);
+			NLA_PUT_U8(skb, IPSET_ATTR_FAMILY,
+				   set->family);
+			NLA_PUT_U8(skb, IPSET_ATTR_REVISION,
+				   set->type->revision);
+			ret = set->variant->head(set, skb);
+			if (ret < 0)
+				goto release_refcount;
+			/* Fall through and add elements */
+		default:
+			read_lock_bh(&set->lock);
+			ret = set->variant->list(set, skb, cb);
+			read_unlock_bh(&set->lock);
+			if (!cb->args[2]) {
+				/* Set is done, proceed with next one */
+				if (cb->args[0] == DUMP_ONE)
+					cb->args[1] = IPSET_INVALID_ID;
+				else
+					cb->args[1]++;
+			}
+			/* One set (or one part of it) per invocation */
+			goto release_refcount;
+		}
+	}
+	/* If we dump all sets, continue with dumping last ones */
+	if (cb->args[0] == DUMP_ALL) {
+		cb->args[0] = DUMP_LAST;
+		cb->args[1] = 0;
+		goto dump_last;
+	}
+	goto out;
+
+nla_put_failure:
+	ret = -EFAULT;
+release_refcount:
+	/* If there was an error or set is done, release set */
+	if (ret || !cb->args[2]) {
+		pr_debug("release set %s\n", ip_set_list[index]->name);
+		ip_set_put_byindex(index);
+	}
+out:
+	/* Finalise the message only if one was started */
+	if (nlh) {
+		nlmsg_end(skb, nlh);
+		pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len);
+		dump_attrs(nlh);
+	}
+
+	return ret < 0 ? ret : skb->len;
+}
+
+/* IPSET_CMD_LIST and IPSET_CMD_SAVE handler: after the protocol check,
+ * start a netlink dump driven by ip_set_dump_start() and finished by
+ * ip_set_dump_done().
+ */
+static int
+ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	if (likely(!protocol_failed(attr)))
+		return netlink_dump_start(ctnl, skb, nlh,
+					  ip_set_dump_start,
+					  ip_set_dump_done);
+
+	return -IPSET_ERR_PROTOCOL;
+}
+
+/* Add, del and test */
+
+/* Attribute policy for the add, del and test commands: a set name, an
+ * optional 32-bit line number, and nested element data given either as
+ * a single IPSET_ATTR_DATA or as a batch inside IPSET_ATTR_ADT.
+ */
+static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_SETNAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+	[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	[IPSET_ATTR_DATA]	= { .type = NLA_NESTED },
+	[IPSET_ATTR_ADT]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Apply one add/del request (@adt) described by the parsed attributes
+ * @tb to @set.
+ *
+ * When the type's uadt() returns -EAGAIN and the variant provides a
+ * resize() method, the set is resized and the request retried for as
+ * long as resize() succeeds.
+ *
+ * Returns 0 on success, or when the error is -IPSET_ERR_EXIST and
+ * IPSET_FLAG_EXIST is set in @flags. If uadt() reported a line number
+ * and @use_lineno is true, an NLMSG_ERROR reply carrying a copy of the
+ * request with IPSET_ATTR_LINENO overwritten by that line number is sent
+ * and -EINTR is returned (-ENOMEM if the reply cannot be allocated).
+ * Otherwise the error code is returned as is.
+ */
+static int
+call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
+	struct nlattr *tb[], enum ipset_adt adt,
+	u32 flags, bool use_lineno)
+{
+	int ret, retried = 0;
+	u32 lineno = 0;
+	bool eexist = flags & IPSET_FLAG_EXIST;
+
+	/* The write lock is dropped before resize() is called */
+	do {
+		write_lock_bh(&set->lock);
+		ret = set->variant->uadt(set, tb, adt, &lineno, flags);
+		write_unlock_bh(&set->lock);
+	} while (ret == -EAGAIN &&
+		 set->variant->resize &&
+		 (ret = set->variant->resize(set, retried++)) == 0);
+
+	if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
+		return 0;
+	if (lineno && use_lineno) {
+		/* Error in restore/batch mode: send back lineno */
+		struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb);
+		struct sk_buff *skb2;
+		struct nlmsgerr *errmsg;
+		size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
+		int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+		struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+		struct nlattr *cmdattr;
+		u32 *errline;
+
+		skb2 = nlmsg_new(payload, GFP_KERNEL);
+		if (skb2 == NULL)
+			return -ENOMEM;
+		rep = __nlmsg_put(skb2, NETLINK_CB(skb).pid,
+				  nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+		errmsg = nlmsg_data(rep);
+		errmsg->error = ret;
+		/* Embed the whole original request in the error message */
+		memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+		cmdattr = (void *)&errmsg->msg + min_len;
+
+		/* Re-parse the copy to locate its IPSET_ATTR_LINENO */
+		nla_parse(cda, IPSET_ATTR_CMD_MAX,
+			  cmdattr, nlh->nlmsg_len - min_len,
+			  ip_set_adt_policy);
+
+		errline = nla_data(cda[IPSET_ATTR_LINENO]);
+
+		/* Patch the failing line number into the copied request */
+		*errline = lineno;
+
+		netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+		/* Signal netlink not to send its ACK/errmsg.  */
+		return -EINTR;
+	}
+
+	return ret;
+}
+
+/*
+ * IPSET_CMD_ADD handler: add one element (IPSET_ATTR_DATA) or a batch of
+ * elements (IPSET_ATTR_ADT, each nested entry being an IPSET_ATTR_DATA)
+ * to the named set.
+ *
+ * The request must carry exactly one of IPSET_ATTR_DATA and
+ * IPSET_ATTR_ADT, flagged as nested; a batch must also carry
+ * IPSET_ATTR_LINENO.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on a malformed request,
+ * -ENOENT when the set does not exist, otherwise the result of
+ * call_ad(). A batch stops at the first failing element.
+ */
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	const struct nlattr *nla;
+	u32 flags = flag_exist(nlh);
+	bool use_lineno;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !((attr[IPSET_ATTR_DATA] != NULL) ^
+		       (attr[IPSET_ATTR_ADT] != NULL)) ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
+		     (attr[IPSET_ATTR_ADT] != NULL &&
+		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+		       attr[IPSET_ATTR_LINENO] == NULL))))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	use_lineno = !!attr[IPSET_ATTR_LINENO];
+	if (attr[IPSET_ATTR_DATA]) {
+		/* Single element */
+		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+				     attr[IPSET_ATTR_DATA],
+				     set->type->adt_policy))
+			return -IPSET_ERR_PROTOCOL;
+		ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+			      use_lineno);
+	} else {
+		int nla_rem;
+
+		/* Batch: every nested entry is one element to add */
+		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+			/* Forget the attributes of the previous entry */
+			memset(tb, 0, sizeof(tb));
+			if (nla_type(nla) != IPSET_ATTR_DATA ||
+			    !flag_nested(nla) ||
+			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+					     set->type->adt_policy))
+				return -IPSET_ERR_PROTOCOL;
+			ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+				      flags, use_lineno);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+/*
+ * IPSET_CMD_DEL handler: delete one element (IPSET_ATTR_DATA) or a batch
+ * of elements (IPSET_ATTR_ADT, each nested entry being an IPSET_ATTR_DATA)
+ * from the named set.
+ *
+ * The request must carry exactly one of IPSET_ATTR_DATA and
+ * IPSET_ATTR_ADT, flagged as nested; a batch must also carry
+ * IPSET_ATTR_LINENO.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on a malformed request,
+ * -ENOENT when the set does not exist, otherwise the result of
+ * call_ad(). A batch stops at the first failing element.
+ */
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	const struct nlattr *nla;
+	u32 flags = flag_exist(nlh);
+	bool use_lineno;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !((attr[IPSET_ATTR_DATA] != NULL) ^
+		       (attr[IPSET_ATTR_ADT] != NULL)) ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
+		     (attr[IPSET_ATTR_ADT] != NULL &&
+		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+		       attr[IPSET_ATTR_LINENO] == NULL))))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	use_lineno = !!attr[IPSET_ATTR_LINENO];
+	if (attr[IPSET_ATTR_DATA]) {
+		/* Single element */
+		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+				     attr[IPSET_ATTR_DATA],
+				     set->type->adt_policy))
+			return -IPSET_ERR_PROTOCOL;
+		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+			      use_lineno);
+	} else {
+		int nla_rem;
+
+		/* Batch: every nested entry is one element to delete */
+		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+			/* Clear the WHOLE table: sizeof(*tb) would wipe
+			 * only the first pointer and let attributes of
+			 * the previous entry leak into this one.
+			 */
+			memset(tb, 0, sizeof(tb));
+			if (nla_type(nla) != IPSET_ATTR_DATA ||
+			    !flag_nested(nla) ||
+			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+					     set->type->adt_policy))
+				return -IPSET_ERR_PROTOCOL;
+			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+				      flags, use_lineno);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+/*
+ * IPSET_CMD_TEST handler: check whether the element described by
+ * IPSET_ATTR_DATA is in the named set.
+ *
+ * Returns 0 when the type's uadt() reports a positive result,
+ * -IPSET_ERR_EXIST when it reports zero, -IPSET_ERR_PROTOCOL on a
+ * malformed request, -ENOENT when the set does not exist, or the
+ * negative error code returned by uadt().
+ */
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	struct ip_set *set;
+	int res;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_DATA] == NULL ||
+		     !flag_nested(attr[IPSET_ATTR_DATA])))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+			     set->type->adt_policy))
+		return -IPSET_ERR_PROTOCOL;
+
+	/* Testing only reads the set */
+	read_lock_bh(&set->lock);
+	res = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0);
+	read_unlock_bh(&set->lock);
+
+	/* Userspace can't trigger element to be re-added, so -EAGAIN is
+	 * handled like a positive result.
+	 */
+	if (res == -EAGAIN || res > 0)
+		return 0;
+	if (res < 0)
+		return res;
+
+	return -IPSET_ERR_EXIST;
+}
+
+/* Get header data of a set */
+
+/*
+ * IPSET_CMD_HEADER handler: reply with the name, type name, family and
+ * type revision of the named set.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on a malformed request,
+ * -ENOENT when the set does not exist, -ENOMEM when the reply cannot be
+ * allocated, -EMSGSIZE when it cannot be filled in, or the negative
+ * error code returned by netlink_unicast().
+ */
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	const struct ip_set *set;
+	struct sk_buff *reply;
+	struct nlmsghdr *reply_nlh;
+	ip_set_id_t id;
+	int err;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (id == IPSET_INVALID_ID)
+		return -ENOENT;
+	set = ip_set_list[id];
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	reply_nlh = start_msg(reply, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			      IPSET_CMD_HEADER);
+	if (!reply_nlh)
+		goto nlmsg_failure;
+	/* The NLA_PUT_* macros jump to nla_put_failure on error */
+	NLA_PUT_U8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	NLA_PUT_STRING(reply, IPSET_ATTR_SETNAME, set->name);
+	NLA_PUT_STRING(reply, IPSET_ATTR_TYPENAME, set->type->name);
+	NLA_PUT_U8(reply, IPSET_ATTR_FAMILY, set->family);
+	NLA_PUT_U8(reply, IPSET_ATTR_REVISION, set->type->revision);
+	nlmsg_end(reply, reply_nlh);
+
+	err = netlink_unicast(ctnl, reply, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return err < 0 ? err : 0;
+
+nla_put_failure:
+	nlmsg_cancel(reply, reply_nlh);
+nlmsg_failure:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Get type data */
+
+/* Attribute policy for IPSET_CMD_TYPE: a NUL-terminated type name with
+ * a length limit of IPSET_MAXNAMELEN - 1 and an 8-bit family.
+ */
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
+};
+
+/*
+ * IPSET_CMD_TYPE handler: reply with the minimum and maximum revision
+ * found by find_set_type_minmax() for the given type name and family.
+ *
+ * Returns 0 on success, -IPSET_ERR_PROTOCOL on a malformed request, the
+ * error of find_set_type_minmax(), -ENOMEM when the reply cannot be
+ * allocated, -EMSGSIZE when it cannot be filled in, or the negative
+ * error code returned by netlink_unicast().
+ */
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct sk_buff *reply;
+	struct nlmsghdr *reply_nlh;
+	const char *typename;
+	u8 family, rev_min, rev_max;
+	int err;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_TYPENAME] == NULL ||
+		     attr[IPSET_ATTR_FAMILY] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+	err = find_set_type_minmax(typename, family, &rev_min, &rev_max);
+	if (err)
+		return err;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	reply_nlh = start_msg(reply, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			      IPSET_CMD_TYPE);
+	if (!reply_nlh)
+		goto nlmsg_failure;
+	/* The NLA_PUT_* macros jump to nla_put_failure on error */
+	NLA_PUT_U8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	NLA_PUT_STRING(reply, IPSET_ATTR_TYPENAME, typename);
+	NLA_PUT_U8(reply, IPSET_ATTR_FAMILY, family);
+	NLA_PUT_U8(reply, IPSET_ATTR_REVISION, rev_max);
+	NLA_PUT_U8(reply, IPSET_ATTR_REVISION_MIN, rev_min);
+	nlmsg_end(reply, reply_nlh);
+
+	pr_debug("Send TYPE, nlmsg_len: %u\n", reply_nlh->nlmsg_len);
+	err = netlink_unicast(ctnl, reply, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return err < 0 ? err : 0;
+
+nla_put_failure:
+	nlmsg_cancel(reply, reply_nlh);
+nlmsg_failure:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+/* Attribute policy for IPSET_CMD_PROTOCOL: only the 8-bit protocol
+ * version attribute is described.
+ */
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+};
+
+/*
+ * IPSET_CMD_PROTOCOL handler: reply with the protocol version
+ * (IPSET_PROTOCOL) implemented by this module.
+ *
+ * Only the presence of IPSET_ATTR_PROTOCOL is required; its value is not
+ * compared here. Returns 0 on success, -IPSET_ERR_PROTOCOL when the
+ * attribute is missing, -ENOMEM when the reply cannot be allocated,
+ * -EMSGSIZE when it cannot be filled in, or the negative error code
+ * returned by netlink_unicast().
+ */
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh,
+		const struct nlattr * const attr[])
+{
+	struct sk_buff *reply;
+	struct nlmsghdr *reply_nlh;
+	int err;
+
+	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (reply == NULL)
+		return -ENOMEM;
+
+	reply_nlh = start_msg(reply, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			      IPSET_CMD_PROTOCOL);
+	if (!reply_nlh)
+		goto nlmsg_failure;
+	/* NLA_PUT_U8 jumps to nla_put_failure on error */
+	NLA_PUT_U8(reply, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	nlmsg_end(reply, reply_nlh);
+
+	err = netlink_unicast(ctnl, reply, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	return err < 0 ? err : 0;
+
+nla_put_failure:
+	nlmsg_cancel(reply, reply_nlh);
+nlmsg_failure:
+	kfree_skb(reply);
+	return -EMSGSIZE;
+}
+
+/* Dispatch table: for every IPSET_CMD_* message type, the handler to
+ * call and the attribute policy describing its attributes. LIST and
+ * SAVE share the ip_set_dump handler.
+ */
+static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
+	[IPSET_CMD_CREATE]	= {
+		.call		= ip_set_create,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_create_policy,
+	},
+	[IPSET_CMD_DESTROY]	= {
+		.call		= ip_set_destroy,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname_policy,
+	},
+	[IPSET_CMD_FLUSH]	= {
+		.call		= ip_set_flush,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname_policy,
+	},
+	[IPSET_CMD_RENAME]	= {
+		.call		= ip_set_rename,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname2_policy,
+	},
+	[IPSET_CMD_SWAP]	= {
+		.call		= ip_set_swap,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname2_policy,
+	},
+	[IPSET_CMD_LIST]	= {
+		.call		= ip_set_dump,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname_policy,
+	},
+	[IPSET_CMD_SAVE]	= {
+		.call		= ip_set_dump,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname_policy,
+	},
+	[IPSET_CMD_ADD]	= {
+		.call		= ip_set_uadd,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_adt_policy,
+	},
+	[IPSET_CMD_DEL]	= {
+		.call		= ip_set_udel,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_adt_policy,
+	},
+	[IPSET_CMD_TEST]	= {
+		.call		= ip_set_utest,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_adt_policy,
+	},
+	[IPSET_CMD_HEADER]	= {
+		.call		= ip_set_header,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_setname_policy,
+	},
+	[IPSET_CMD_TYPE]	= {
+		.call		= ip_set_type,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_type_policy,
+	},
+	[IPSET_CMD_PROTOCOL]	= {
+		.call		= ip_set_protocol,
+		.attr_count	= IPSET_ATTR_CMD_MAX,
+		.policy		= ip_set_protocol_policy,
+	},
+};
+
+/* Subsystem description registered with nfnetlink in ip_set_init() and
+ * unregistered in ip_set_fini(); it ties NFNL_SUBSYS_IPSET to the
+ * command dispatch table.
+ */
+static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
+	.name		= "ip_set",
+	.subsys_id	= NFNL_SUBSYS_IPSET,
+	.cb_count	= IPSET_MSG_MAX,
+	.cb		= ip_set_netlink_subsys_cb,
+};
+
+/* Interface to iptables/ip6tables */
+
+/*
+ * getsockopt(SO_IP_SET) handler used by the iptables/ip6tables side.
+ *
+ * @user points to a request of *@len bytes whose first word is the
+ * operation code; the request is copied in, handled and copied back:
+ *   IP_SET_OP_VERSION     - report IPSET_PROTOCOL
+ *   IP_SET_OP_GET_BYNAME  - look up the index of a set by name
+ *   IP_SET_OP_GET_BYINDEX - look up the name of a set by index
+ * Operations with a code below IP_SET_OP_VERSION must carry the
+ * matching protocol version.
+ *
+ * Returns 0 on success, -EPERM without CAP_NET_ADMIN, -EBADF for a
+ * foreign option, -EINVAL for a wrong request size or index, -ENOMEM
+ * when the bounce buffer cannot be allocated, -EFAULT when user memory
+ * cannot be read or written, -EPROTO on a version mismatch and -EBADMSG
+ * for an unknown operation.
+ */
+static int
+ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
+{
+	unsigned *op;
+	void *data;
+	int copylen = *len, ret = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (optval != SO_IP_SET)
+		return -EBADF;
+	/* Negative lengths must be rejected explicitly: the comparison
+	 * against sizeof() is done in unsigned arithmetic and would let
+	 * them through.
+	 */
+	if (copylen < 0 || copylen < sizeof(unsigned))
+		return -EINVAL;
+
+	data = vmalloc(copylen);
+	if (!data)
+		return -ENOMEM;
+	if (copy_from_user(data, user, copylen) != 0) {
+		ret = -EFAULT;
+		goto done;
+	}
+	op = (unsigned *) data;
+
+	if (*op < IP_SET_OP_VERSION) {
+		/* Check the version at the beginning of operations */
+		struct ip_set_req_version *req_version = data;
+
+		/* The version field must lie inside the copied request */
+		if (copylen < sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		if (req_version->version != IPSET_PROTOCOL) {
+			ret = -EPROTO;
+			goto done;
+		}
+	}
+
+	switch (*op) {
+	case IP_SET_OP_VERSION: {
+		struct ip_set_req_version *req_version = data;
+
+		if (copylen != sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+
+		req_version->version = IPSET_PROTOCOL;
+		/* copy_to_user() returns the number of bytes NOT copied */
+		if (copy_to_user(user, req_version,
+				 sizeof(struct ip_set_req_version)))
+			ret = -EFAULT;
+		goto done;
+	}
+	case IP_SET_OP_GET_BYNAME: {
+		struct ip_set_req_get_set *req_get = data;
+
+		if (copylen != sizeof(struct ip_set_req_get_set)) {
+			ret = -EINVAL;
+			goto done;
+		}
+		/* Never trust userspace to terminate the name */
+		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
+		nfnl_lock();
+		req_get->set.index = find_set_id(req_get->set.name);
+		nfnl_unlock();
+		goto copy;
+	}
+	case IP_SET_OP_GET_BYINDEX: {
+		struct ip_set_req_get_set *req_get = data;
+
+		if (copylen != sizeof(struct ip_set_req_get_set) ||
+		    req_get->set.index >= ip_set_max) {
+			ret = -EINVAL;
+			goto done;
+		}
+		nfnl_lock();
+		/* An empty slot is reported as an empty name */
+		strncpy(req_get->set.name,
+			ip_set_list[req_get->set.index]
+				? ip_set_list[req_get->set.index]->name : "",
+			IPSET_MAXNAMELEN);
+		nfnl_unlock();
+		goto copy;
+	}
+	default:
+		ret = -EBADMSG;
+		goto done;
+	}	/* end of switch(op) */
+
+copy:
+	/* A partial copy is a failure, not a success */
+	if (copy_to_user(user, data, copylen))
+		ret = -EFAULT;
+
+done:
+	vfree(data);
+	return ret;
+}
+
+/* Socket option registration: getsockopt requests for SO_IP_SET on
+ * PF_INET sockets are handed to ip_set_sockfn_get(). No set handler is
+ * provided.
+ * NOTE(review): get_optmax looks like an exclusive upper bound - confirm
+ * against the nf_sockopt implementation.
+ */
+static struct nf_sockopt_ops so_set __read_mostly = {
+	.pf		= PF_INET,
+	.get_optmin	= SO_IP_SET,
+	.get_optmax	= SO_IP_SET + 1,
+	.get		= &ip_set_sockfn_get,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Module initialisation: size and allocate ip_set_list, then register
+ * the nfnetlink subsystem and the SO_IP_SET socket option. Anything set
+ * up earlier is undone when a later step fails.
+ *
+ * Returns 0 on success, -ENOMEM when the list cannot be allocated, or
+ * the error code of the failing registration.
+ */
+static int __init
+ip_set_init(void)
+{
+	int err;
+
+	/* A non-zero max_sets overrides the default, capped so that
+	 * IPSET_INVALID_ID is never a valid index.
+	 */
+	if (max_sets)
+		ip_set_max = max_sets;
+	if (ip_set_max >= IPSET_INVALID_ID)
+		ip_set_max = IPSET_INVALID_ID - 1;
+
+	ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max,
+			      GFP_KERNEL);
+	if (!ip_set_list) {
+		pr_err("ip_set: Unable to create ip_set_list\n");
+		return -ENOMEM;
+	}
+
+	err = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+	if (err != 0) {
+		pr_err("ip_set: cannot register with nfnetlink.\n");
+		goto free_list;
+	}
+
+	err = nf_register_sockopt(&so_set);
+	if (err != 0) {
+		pr_err("SO_SET registry failed: %d\n", err);
+		goto unregister_subsys;
+	}
+
+	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
+	return 0;
+
+unregister_subsys:
+	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+free_list:
+	kfree(ip_set_list);
+	return err;
+}
+
+/* Module exit: unregister the socket option and the nfnetlink
+ * subsystem, then free ip_set_list.
+ */
+static void __exit
+ip_set_fini(void)
+{
+	/* There can't be any existing set */
+	nf_unregister_sockopt(&so_set);
+	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+
+	kfree(ip_set_list);
+	pr_debug("these are the famous last words\n");
+}
+
+module_init(ip_set_init);
+module_exit(ip_set_fini);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
new file mode 100644
index 00000000..757143b2
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -0,0 +1,155 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Get Layer-4 data from the packets */
+
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/sctp.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/ipset/ip_set_getport.h>
+
+/* We must handle non-linear skbs */
+/*
+ * Extract the layer-4 "port" from the packet, coping with non-linear
+ * skbs by going through skb_header_pointer().
+ *
+ * For TCP, SCTP, UDP and UDP-Lite *port receives the source (@src true)
+ * or destination port. For ICMP and ICMPv6 it receives type << 8 | code
+ * in network byte order. For any other protocol *port is left untouched.
+ *
+ * Returns false when the transport header cannot be read; otherwise
+ * stores @protocol in *proto and returns true.
+ */
+static bool
+get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
+	 bool src, __be16 *port, u8 *proto)
+{
+	switch (protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr buf;
+		const struct tcphdr *tcp;
+
+		tcp = skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+		if (!tcp)
+			return false;
+
+		if (src)
+			*port = tcp->source;
+		else
+			*port = tcp->dest;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		sctp_sctphdr_t buf;
+		const sctp_sctphdr_t *sctp;
+
+		sctp = skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+		if (!sctp)
+			return false;
+
+		if (src)
+			*port = sctp->source;
+		else
+			*port = sctp->dest;
+		break;
+	}
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE: {
+		struct udphdr buf;
+		const struct udphdr *udp;
+
+		udp = skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+		if (!udp)
+			return false;
+
+		if (src)
+			*port = udp->source;
+		else
+			*port = udp->dest;
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr buf;
+		const struct icmphdr *icmp;
+
+		icmp = skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+		if (!icmp)
+			return false;
+
+		/* Type in the high byte, code in the low byte */
+		*port = (__force __be16)htons((icmp->type << 8) | icmp->code);
+		break;
+	}
+	case IPPROTO_ICMPV6: {
+		struct icmp6hdr buf;
+		const struct icmp6hdr *icmp6;
+
+		icmp6 = skb_header_pointer(skb, protooff, sizeof(buf), &buf);
+		if (!icmp6)
+			return false;
+
+		/* Type in the high byte, code in the low byte */
+		*port = (__force __be16)
+			htons((icmp6->icmp6_type << 8) | icmp6->icmp6_code);
+		break;
+	}
+	default:
+		/* No port notion: only the protocol is reported */
+		break;
+	}
+
+	*proto = protocol;
+	return true;
+}
+
+/* Get the layer-4 protocol and port of an IPv4 packet.
+ *
+ * Returns false for protocol 0, for fragments with a non-zero fragment
+ * offset, and when get_port() cannot read the transport header.
+ */
+bool
+ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
+		    __be16 *port, u8 *proto)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	unsigned int l4off = ip_hdrlen(skb);
+	int l4proto = iph->protocol;
+
+	/* See comments at tcp_match in ip_tables.c */
+	if (l4proto <= 0)
+		return false;
+	if (ntohs(iph->frag_off) & IP_OFFSET)
+		return false;
+
+	return get_port(skb, l4proto, l4off, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip4_port);
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+/* Get the layer-4 protocol and port of an IPv6 packet, skipping the
+ * extension headers first.
+ *
+ * Returns false when ipv6_skip_exthdr() fails or get_port() cannot read
+ * the transport header.
+ */
+bool
+ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
+		    __be16 *port, u8 *proto)
+{
+	u8 l4proto = ipv6_hdr(skb)->nexthdr;
+	int l4off;
+
+	l4off = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &l4proto);
+	if (l4off < 0)
+		return false;
+
+	return get_port(skb, l4proto, l4off, src, port, proto);
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
+#endif
+
+/* Get the port of a TCP or UDP packet of family @pf (AF_INET or
+ * AF_INET6).
+ *
+ * Returns true only when the port could be extracted and the protocol
+ * is TCP or UDP; any other family or protocol yields false.
+ */
+bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+	bool found;
+	u8 proto;
+
+	if (pf == AF_INET)
+		found = ip_set_get_ip4_port(skb, src, port, &proto);
+	else if (pf == AF_INET6)
+		found = ip_set_get_ip6_port(skb, src, port, &proto);
+	else
+		return false;
+
+	if (!found)
+		return false;
+
+	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
new file mode 100644
index 00000000..43bcce20
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -0,0 +1,464 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip");
+
+/* Type specific function prefix */
+#define TYPE		hash_ip
+
+static bool
+hash_ip_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ip4_same_set	hash_ip_same_set
+#define hash_ip6_same_set	hash_ip_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ip4_elem {
+	__be32 ip;	/* IPv4 address (big endian); 0 marks an empty slot */
+};
+
+/* Member elements with timeout support */
+struct hash_ip4_telem {
+	__be32 ip;		/* first member, as in struct hash_ip4_elem */
+	unsigned long timeout;	/* read back through ip_set_timeout_get() */
+};
+
+/* Two IPv4 elements are equal when their addresses are identical */
+static inline bool hash_ip4_data_equal(const struct hash_ip4_elem *ip1,
+				       const struct hash_ip4_elem *ip2)
+{
+	return ip2->ip == ip1->ip;
+}
+
+/* A slot is empty when its address is zero */
+static inline bool hash_ip4_data_isnull(const struct hash_ip4_elem *elem)
+{
+	return !elem->ip;
+}
+
+static inline void
+hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src)
+{
+	*dst = *src;	/* the address is the sole member of the element */
+}
+
+/* Mark a slot empty. Zero valued IP addresses cannot be stored,
+ * which leaves the zero address free to serve as that marker. */
+static inline void hash_ip4_data_zero_out(struct hash_ip4_elem *elem)
+{
+	elem->ip = 0;
+}
+
+/* Dump the address into skb; true means a NLA_PUT_* macro bailed out */
+static inline bool hash_ip4_data_list(struct sk_buff *skb,
+				      const struct hash_ip4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	return false;
+nla_put_failure:
+	return true;
+}
+
+/* Dump a timeout-capable element: address plus the timeout value.
+ * @data really points at a struct hash_ip4_telem. Returns false on
+ * success, true when a NLA_PUT_* macro jumped to nla_put_failure. */
+static bool
+hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data)
+{
+	const struct hash_ip4_telem *e = (const struct hash_ip4_telem *)data;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, e->ip);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return false;
+nla_put_failure:
+	return true;
+}
+
+#define IP_SET_HASH_WITH_NETMASK
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/del/test: take the address from the packet, apply the
+ * set netmask and run the requested operation; -EINVAL for a zero result. */
+static int
+hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	__be32 addr;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &addr);
+	addr &= ip_set_netmask(map->netmask);
+	if (!addr)
+		return -EINVAL;
+	return set->variant->adt[adt](set, &addr, map->timeout);
+}
+
+/* Userspace add/del/test of one address, a from-to range or a CIDR block.
+ * Every address handed to adtfn() is aligned to the netmask of the set. */
+static int
+hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	u32 ip, ip_to, hosts, timeout = h->timeout;
+	__be32 nip;
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+	if (ret)
+		return ret;
+	ip &= ip_set_hostmask(h->netmask);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST) {
+		nip = htonl(ip);
+		if (nip == 0)
+			return -IPSET_ERR_HASH_ELEM;
+		return adtfn(set, &nip, timeout);
+	}
+
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+
+	/* After a swap ip holds the raw IP_TO value: align the start again */
+	ip &= ip_set_hostmask(h->netmask);
+	hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
+	for (; !before(ip_to, ip); ip += hosts) {
+		nip = htonl(ip);
+		if (nip == 0)
+			return -IPSET_ERR_HASH_ELEM;
+		ret = adtfn(set, &nip, timeout);
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+/* Two hash:ip sets match when maxelem, timeout and netmask all agree */
+static bool
+hash_ip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *ha = a->data, *hb = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	if (ha->maxelem != hb->maxelem || ha->timeout != hb->timeout)
+		return false;
+	return ha->netmask == hb->netmask;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ip6_elem {
+	union nf_inet_addr ip;	/* IPv6 address; all-zero marks an empty slot */
+};
+
+struct hash_ip6_telem {
+	union nf_inet_addr ip;	/* first member, as in struct hash_ip6_elem */
+	unsigned long timeout;	/* read back through ip_set_timeout_get() */
+};
+
+/* Two IPv6 elements are equal when ipv6_addr_cmp() finds no difference */
+static inline bool hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
+				       const struct hash_ip6_elem *ip2)
+{
+	return !ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6);
+}
+
+/* A slot is empty when ipv6_addr_any() accepts its address */
+static inline bool hash_ip6_data_isnull(const struct hash_ip6_elem *elem)
+{
+	return ipv6_addr_any(&elem->ip.in6);
+}
+
+static inline void
+hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src)
+{
+	dst->ip.in6 = src->ip.in6;	/* plain struct copy of the address */
+}
+
+/* Mark a slot empty by storing the all-zero address */
+static inline void hash_ip6_data_zero_out(struct hash_ip6_elem *elem)
+{
+	ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0);
+}
+
+static inline void ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	const __be32 *mask = ip_set_netmask6(prefix);	/* four mask words */
+	int i;
+
+	for (i = 0; i < 4; i++)
+		ip->ip6[i] &= mask[i];	/* AND each 32 bit word in place */
+}
+
+/* Dump the IPv6 address into skb; true means a NLA_PUT_* macro bailed out */
+static bool
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	return false;
+nla_put_failure:
+	return true;
+}
+
+/* Dump address and timeout of an element that is really a
+ * struct hash_ip6_telem; true means a NLA_PUT_* macro bailed out. */
+static bool
+hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data)
+{
+	const struct hash_ip6_telem *tdata = (const struct hash_ip6_telem *)data;
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &tdata->ip);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+	return false;
+nla_put_failure:
+	return true;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/del/test: take the address from the packet, apply the
+ * set netmask and run the requested operation; -EINVAL for a zero result. */
+static int
+hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	union nf_inet_addr addr;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &addr.in6);
+	ip6_netmask(&addr, map->netmask);
+	if (ipv6_addr_any(&addr.in6))
+		return -EINVAL;
+	return set->variant->adt[adt](set, &addr, map->timeout);
+}
+
+static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
+	[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+	[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+};	/* NOTE(review): no user of this table is visible in this file - confirm */
+
+/* Userspace add/del/test of a single IPv6 address. Range and CIDR
+ * attributes are refused with -IPSET_ERR_PROTOCOL; the result of the
+ * operation is turned into 0 when ip_set_eexist() accepts it. */
+static int
+hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	union nf_inet_addr addr;
+	u32 timeout = map->timeout;
+	int err;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	err = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &addr);
+	if (err)
+		return err;
+
+	ip6_netmask(&addr, map->netmask);
+	if (ipv6_addr_any(&addr.in6))
+		return -IPSET_ERR_HASH_ELEM;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(map->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	err = set->variant->adt[adt](set, &addr, timeout);
+	return ip_set_eexist(err, flags) ? 0 : err;
+}
+
+/* Create hash:ip type of sets: check the create attributes, allocate the
+ * set data plus bucket array and select the plain or timeout variant. */
+static int
+hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 netmask, hbits;
+	struct ip_set_hash *h;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+	netmask = set->family == AF_INET ? 32 : 128;	/* used when no NETMASK attribute */
+	pr_debug("Create set %s with family %s\n",
+		 set->name, set->family == AF_INET ? "inet" : "inet6");
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)	/* too small: raise, do not reject */
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	if (tb[IPSET_ATTR_NETMASK]) {
+		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+		if ((set->family == AF_INET && netmask > 32) ||
+		    (set->family == AF_INET6 && netmask > 128) ||
+		    netmask == 0)
+			return -IPSET_ERR_INVALID_NETMASK;
+	}
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	h->netmask = netmask;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;	/* replaced below when a timeout is given */
+	/* NOTE(review): no upper bound on hashsize here - can this size overflow? */
+	hbits = htable_bits(hashsize);
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);	/* nothing else has been allocated yet */
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+
+	set->data = h;
+	/* A timeout attribute selects the timeout variant and calls its gc_init() */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_ip4_tvariant : &hash_ip6_tvariant;
+
+		if (set->family == AF_INET)
+			hash_ip4_gc_init(set);
+		else
+			hash_ip6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_ip4_variant : &hash_ip6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_ip_type __read_mostly = {
+	.name		= "hash:ip",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP,
+	.dimension	= IPSET_DIM_ONE,
+	.family		= AF_UNSPEC,	/* hash_ip_create() takes AF_INET and AF_INET6 */
+	.revision	= 0,
+	.create		= hash_ip_create,
+	.create_policy	= {	/* netlink attribute types for set creation */
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },	/* not read by hash_ip_create() */
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },	/* not read by hash_ip_create() */
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_NETMASK]	= { .type = NLA_U8  },
+	},
+	.adt_policy	= {	/* netlink attribute types for add/del/test */
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the hash:ip set type */
+static int __init hash_ip_init(void)
+{
+	return ip_set_type_register(&hash_ip_type);
+}
+
+/* Module unload: unregister the hash:ip set type */
+static void __exit hash_ip_fini(void)
+{
+	ip_set_type_unregister(&hash_ip_type);
+}
+
+module_init(hash_ip_init);
+module_exit(hash_ip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
new file mode 100644
index 00000000..14281b6b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -0,0 +1,530 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port");
+
+/* Type specific function prefix */
+#define TYPE		hash_ipport
+
+static bool
+hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipport4_same_set	hash_ipport_same_set
+#define hash_ipport6_same_set	hash_ipport_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipport4_elem {
+	__be32 ip;	/* IPv4 address, big endian */
+	__be16 port;	/* port, big endian */
+	u8 proto;	/* protocol number; 0 marks an empty slot */
+	u8 padding;	/* neither compared nor copied by the data helpers */
+};
+
+/* Member elements with timeout support */
+struct hash_ipport4_telem {
+	__be32 ip;		/* leading members repeat struct hash_ipport4_elem: */
+	__be16 port;		/* data_tlist() casts an elem pointer to this type */
+	u8 proto;
+	u8 padding;
+	unsigned long timeout;	/* read back through ip_set_timeout_get() */
+};
+
+/* Elements are equal when address, port and protocol all match */
+static inline bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
+					   const struct hash_ipport4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip || ip1->port != ip2->port)
+		return false;
+	return ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem)
+{
+	return !elem->proto;	/* protocol 0 is the empty slot marker */
+}
+
+/* Copy address, port and protocol; the padding byte of dst is left alone */
+static inline void hash_ipport4_data_copy(struct hash_ipport4_elem *dst,
+					  const struct hash_ipport4_elem *src)
+{
+	dst->proto = src->proto;
+	dst->port = src->port;
+	dst->ip = src->ip;
+}
+
+/* Mark a slot empty: protocol 0 is what data_isnull() tests for */
+static inline void hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem)
+{
+	elem->proto = 0;
+}
+
+/* Dump address, port and protocol; true means a NLA_PUT_* macro bailed out */
+static bool hash_ipport4_data_list(struct sk_buff *skb,
+				   const struct hash_ipport4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Dump address, port, protocol and timeout of an element that is really
+ * a struct hash_ipport4_telem; true means a NLA_PUT_* macro bailed out. */
+static bool hash_ipport4_data_tlist(struct sk_buff *skb,
+				    const struct hash_ipport4_elem *data)
+{
+	const struct hash_ipport4_telem *e =
+		(const struct hash_ipport4_telem *)data;
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, e->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, e->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, e->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/del/test: build the element from the packet's port,
+ * protocol and address; -EINVAL when the port lookup fails. */
+static int
+hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_ipport4_elem elem = { };
+	bool port_src = flags & IPSET_DIM_TWO_SRC;
+
+	if (!ip_set_get_ip4_port(skb, port_src, &elem.port, &elem.proto))
+		return -EINVAL;
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &elem.ip);
+
+	return set->variant->adt[adt](set, &elem, map->timeout);
+}
+
+static int
+hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipport4_elem data = { };
+	u32 ip, ip_to, p, port, port_to;
+	u32 timeout = h->timeout;
+	bool with_ports = false;
+	int ret;
+	/* Userspace add/del/test: one element, or each ip x port pair of a range */
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+
+		if (data.proto == 0)	/* 0 is reserved as the empty slot marker */
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+	/* Neither a port protocol nor ICMP: the port is stored as 0 */
+	if (!(with_ports || data.proto == IPPROTO_ICMP))
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+	/* A test, or a request without any range attribute, is a single element */
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+	      tb[IPSET_ATTR_PORT_TO])) {
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip = ntohl(data.ip);	/* the range is walked in host byte order */
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)	/* accept a reversed range */
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (cidr > 32)
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+	/* PORT_TO is honoured only when with_ports is set */
+	port_to = port = ntohs(data.port);
+	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	for (; !before(ip_to, ip); ip++)
+		for (p = port; p <= port_to; p++) {
+			data.ip = htonl(ip);
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;
+			else
+				ret = 0;	/* ip_set_eexist() accepted the error */
+		}
+	return ret;
+}
+
+/* Two hash:ip,port sets match when maxelem and timeout agree */
+static bool hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *ha = a->data, *hb = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	if (ha->maxelem != hb->maxelem)
+		return false;
+	return ha->timeout == hb->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipport6_elem {
+	union nf_inet_addr ip;	/* IPv6 address */
+	__be16 port;		/* port, big endian */
+	u8 proto;		/* protocol number; 0 marks an empty slot */
+	u8 padding;		/* not compared by hash_ipport6_data_equal() */
+};
+
+struct hash_ipport6_telem {
+	union nf_inet_addr ip;	/* leading members repeat struct hash_ipport6_elem: */
+	__be16 port;		/* data_tlist() casts an elem pointer to this type */
+	u8 proto;
+	u8 padding;
+	unsigned long timeout;	/* read back through ip_set_timeout_get() */
+};
+
+/* Elements are equal when address, port and protocol all match */
+static inline bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
+					   const struct hash_ipport6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) != 0)
+		return false;
+	return ip1->port == ip2->port && ip1->proto == ip2->proto;
+}
+
+static inline bool
+hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem)
+{
+	return !elem->proto;	/* protocol 0 is the empty slot marker */
+}
+
+/* Copy the whole element, padding byte included */
+static inline void hash_ipport6_data_copy(struct hash_ipport6_elem *dst,
+					  const struct hash_ipport6_elem *src)
+{
+	*dst = *src;
+}
+
+/* Mark a slot empty: protocol 0 is what data_isnull() tests for */
+static inline void hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem)
+{
+	elem->proto = 0;
+}
+
+/* Dump address, port and protocol; true means a NLA_PUT_* macro bailed out */
+static bool hash_ipport6_data_list(struct sk_buff *skb,
+				   const struct hash_ipport6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return false;
+
+nla_put_failure:
+	return true;
+}
+
+/* Dump address, port, protocol and timeout of an element that is really
+ * a struct hash_ipport6_telem; true means a NLA_PUT_* macro bailed out. */
+static bool hash_ipport6_data_tlist(struct sk_buff *skb,
+				    const struct hash_ipport6_elem *data)
+{
+	const struct hash_ipport6_telem *tdata =
+		(const struct hash_ipport6_telem *)data;
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &tdata->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, tdata->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+	return false;
+nla_put_failure:
+	return true;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+/* Kernel side add/del/test: build the element from the packet's port,
+ * protocol and address; -EINVAL when the port lookup fails. */
+static int
+hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		  enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_ipport6_elem elem = { };
+	bool port_src = flags & IPSET_DIM_TWO_SRC;
+
+	if (!ip_set_get_ip6_port(skb, port_src, &elem.port, &elem.proto))
+		return -EINVAL;
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &elem.ip.in6);
+
+	return set->variant->adt[adt](set, &elem, map->timeout);
+}
+
+static int
+hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
+		  enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_ipport6_elem data = { };
+	u32 port, port_to;
+	u32 timeout = h->timeout;
+	bool with_ports = false;
+	int ret;
+	/* Userspace add/del/test: one element or a port range; no address ranges */
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     tb[IPSET_ATTR_IP_TO] ||
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+
+		if (data.proto == 0)	/* 0 is reserved as the empty slot marker */
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+	/* Neither a port protocol nor ICMPv6: the port is stored as 0 */
+	if (!(with_ports || data.proto == IPPROTO_ICMPV6))
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+	/* A test, a portless protocol or a missing PORT_TO is a single element */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(data.port);	/* the range is walked in host byte order */
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)	/* accept a reversed range */
+		swap(port, port_to);
+
+	for (; port <= port_to; port++) {
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;	/* ip_set_eexist() accepted the error */
+	}
+	return ret;
+}
+
+/* Create hash:ip,port type of sets: check the create attributes, allocate
+ * the set data plus bucket array and select the plain or timeout variant. */
+static int
+hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct ip_set_hash *h;
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)	/* too small: raise, do not reject */
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;	/* replaced below when a timeout is given */
+	/* NOTE(review): no upper bound on hashsize here - can this size overflow? */
+	hbits = htable_bits(hashsize);
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);	/* nothing else has been allocated yet */
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+
+	set->data = h;
+	/* A timeout attribute selects the timeout variant and calls its gc_init() */
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_ipport4_tvariant : &hash_ipport6_tvariant;
+
+		if (set->family == AF_INET)
+			hash_ipport4_gc_init(set);
+		else
+			hash_ipport6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_ipport4_variant : &hash_ipport6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_ipport_type __read_mostly = {
+	.name		= "hash:ip,port",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
+	.dimension	= IPSET_DIM_TWO,
+	.family		= AF_UNSPEC,	/* hash_ipport_create() takes AF_INET and AF_INET6 */
+	.revision	= 1,
+	.create		= hash_ipport_create,
+	.create_policy	= {	/* netlink attribute types for set creation */
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },	/* not read by hash_ipport_create() */
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },	/* not read by hash_ipport_create() */
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },	/* not read by hash_ipport_create() */
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {	/* netlink attribute types for add/del/test */
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the hash:ip,port set type */
+static int __init hash_ipport_init(void)
+{
+	return ip_set_type_register(&hash_ipport_type);
+}
+
+/* Module unload: unregister the hash:ip,port set type */
+static void __exit hash_ipport_fini(void)
+{
+	ip_set_type_unregister(&hash_ipport_type);
+}
+
+module_init(hash_ipport_init);
+module_exit(hash_ipport_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
new file mode 100644
index 00000000..401c8a25
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -0,0 +1,548 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port,ip type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port,ip");
+
+/* Type specific function prefix */
+#define TYPE		hash_ipportip
+
+static bool
+hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipportip4_same_set	hash_ipportip_same_set
+#define hash_ipportip6_same_set	hash_ipportip_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipportip4_elem {
+	__be32 ip;	/* first address, big-endian */
+	__be32 ip2;	/* second address, big-endian */
+	__be16 port;	/* port, big-endian */
+	u8 proto;	/* protocol; 0 is what data_isnull() tests and data_zero_out() writes */
+	u8 padding;	/* explicit pad byte; not part of the equality test */
+};
+
+/* Member elements with timeout support */
+struct hash_ipportip4_telem {	/* same leading fields as hash_ipportip4_elem; data_tlist() casts between them */
+	__be32 ip;
+	__be32 ip2;
+	__be16 port;
+	u8 proto;
+	u8 padding;
+	unsigned long timeout;	/* read through ip_set_timeout_get() when the element is listed */
+};
+
+static inline bool	/* true when both addresses, the port and the protocol match */
+hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
+			  const struct hash_ipportip4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip || ip1->ip2 != ip2->ip2)
+		return false;
+	/* addresses agree; equality now depends on port and protocol only */
+	return ip1->port == ip2->port && ip1->proto == ip2->proto;
+}
+
+static inline bool	/* an element whose protocol field is zero counts as null */
+hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem)
+{
+	return !elem->proto;
+}
+
+static inline void	/* byte-wise copy of one non-timeout element */
+hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst,
+			 const struct hash_ipportip4_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportip4_elem));
+}
+
+static inline void	/* make the element satisfy data_isnull(); other fields are left as they are */
+hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem)
+{
+	elem->proto = 0;
+}
+
+static bool	/* emit one element's attributes into skb; 0 on success, 1 on failure */
+hash_ipportip4_data_list(struct sk_buff *skb,
+		       const struct hash_ipportip4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+static bool	/* like data_list(), plus the element's timeout; 0 on success, 1 on failure */
+hash_ipportip4_data_tlist(struct sk_buff *skb,
+			const struct hash_ipportip4_elem *data)
+{
+	const struct hash_ipportip4_telem *tdata =
+		(const struct hash_ipportip4_telem *)data;	/* caller passes a telem through the elem pointer type */
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);	/* same field as tdata->proto: the layouts share their prefix */
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int	/* run the requested adt operation on fields taken from a packet; pf and dim are unused */
+hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportip4_elem data = { };
+
+	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &data.port, &data.proto))
+		return -EINVAL;	/* no port/protocol could be obtained from the skb */
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);	/* flag bit presumably selects source vs. destination - confirm */
+	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+
+	return adtfn(set, &data, h->timeout);	/* packet path always uses the set's own timeout */
+}
+
+static int	/* add/del/test driven by request attributes; may expand an IP range x port range */
+hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportip4_elem data = { };
+	u32 ip, ip_to, p, port, port_to;
+	u32 timeout = h->timeout;	/* set default, overridden by IPSET_ATTR_TIMEOUT below */
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+
+		if (data.proto == 0)	/* 0 is reserved: it is the "null element" marker */
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || data.proto == IPPROTO_ICMP))
+		data.port = 0;	/* port field is meaningless for this protocol */
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+	      tb[IPSET_ATTR_PORT_TO])) {	/* single element: no range expansion */
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip = ntohl(data.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)	/* /0 would make the before() loop below a silent no-op */
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+
+	port_to = port = ntohs(data.port);
+	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	for (; !before(ip_to, ip); ip++)	/* before() rather than <= so ip++ past ip_to still ends the loop */
+		for (p = port; p <= port_to; p++) {
+			data.ip = htonl(ip);
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;	/* first error not excused by ip_set_eexist() aborts the range */
+			else
+				ret = 0;
+		}
+	return ret;
+}
+
+static bool	/* compares the creation parameters of two sets */
+hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *lhs = a->data, *rhs = b->data;
+
+	/* htable_bits is deliberately not compared: a resize changes it */
+	if (lhs->maxelem != rhs->maxelem)
+		return false;
+	return lhs->timeout == rhs->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipportip6_elem {	/* IPv6 member element without timeout */
+	union nf_inet_addr ip;	/* first address; accessed through .in6 in this file */
+	union nf_inet_addr ip2;	/* second address; accessed through .in6 in this file */
+	__be16 port;	/* port, big-endian */
+	u8 proto;	/* protocol; 0 is what data_isnull() tests and data_zero_out() writes */
+	u8 padding;	/* explicit pad byte; not part of the equality test */
+};
+
+struct hash_ipportip6_telem {	/* same leading fields as hash_ipportip6_elem; data_tlist() casts between them */
+	union nf_inet_addr ip;
+	union nf_inet_addr ip2;
+	__be16 port;
+	u8 proto;
+	u8 padding;
+	unsigned long timeout;	/* read through ip_set_timeout_get() when the element is listed */
+};
+
+static inline bool	/* true when both addresses, the port and the protocol match */
+hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
+			  const struct hash_ipportip6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) ||
+	    ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6))
+		return false;
+	return ip1->port == ip2->port && ip1->proto == ip2->proto;
+}
+
+static inline bool	/* an element whose protocol field is zero counts as null */
+hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem)
+{
+	return !elem->proto;
+}
+
+static inline void	/* byte-wise copy of one non-timeout element */
+hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst,
+			 const struct hash_ipportip6_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportip6_elem));
+}
+
+static inline void	/* make the element satisfy data_isnull(); other fields are left as they are */
+hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem)
+{
+	elem->proto = 0;
+}
+
+static bool	/* emit one element's attributes into skb; 0 on success, 1 on failure */
+hash_ipportip6_data_list(struct sk_buff *skb,
+			 const struct hash_ipportip6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+static bool	/* like data_list(), plus the element's timeout; 0 on success, 1 on failure */
+hash_ipportip6_data_tlist(struct sk_buff *skb,
+			  const struct hash_ipportip6_elem *data)
+{
+	const struct hash_ipportip6_telem *e =
+		(const struct hash_ipportip6_telem *)data;	/* caller passes a telem through the elem pointer type */
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);	/* data and e alias the same object */
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int	/* run the requested adt operation on fields taken from a packet; pf and dim are unused */
+hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		    enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportip6_elem data = { };
+
+	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &data.port, &data.proto))
+		return -EINVAL;	/* no port/protocol could be obtained from the skb */
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);	/* flag bit presumably selects source vs. destination - confirm */
+	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+
+	return adtfn(set, &data, h->timeout);	/* packet path always uses the set's own timeout */
+}
+
+static int	/* add/del/test driven by request attributes; only a port range may be expanded for IPv6 */
+hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
+		    enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportip6_elem data = { };
+	u32 port, port_to;
+	u32 timeout = h->timeout;	/* set default, overridden by IPSET_ATTR_TIMEOUT below */
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     tb[IPSET_ATTR_IP_TO] ||	/* address ranges are rejected for IPv6 */
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+
+		if (data.proto == 0)	/* 0 is reserved: it is the "null element" marker */
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || data.proto == IPPROTO_ICMPV6))
+		data.port = 0;	/* port field is meaningless for this protocol */
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {	/* single element */
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(data.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	for (; port <= port_to; port++) {	/* u32 counter, so port_to == 65535 cannot wrap */
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;	/* first error not excused by ip_set_eexist() aborts the range */
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* Create hash:ip,port,ip type of sets */
+
+static int	/* allocate and initialise the private data of a new hash:ip,port,ip set; flags is unused */
+hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct ip_set_hash *h;
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)	/* clamp small requests up to the minimum */
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));	/* per-set random initval */
+	h->timeout = IPSET_NO_TIMEOUT;	/* replaced below when IPSET_ATTR_TIMEOUT is present */
+
+	hbits = htable_bits(hashsize);
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));	/* table header followed by the buckets */
+	if (!h->table) {
+		kfree(h);	/* release the header allocated above */
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant;	/* timeout-aware variants */
+
+		if (set->family == AF_INET)
+			hash_ipportip4_gc_init(set);	/* called only for sets created with a timeout */
+		else
+			hash_ipportip6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_ipportip4_variant : &hash_ipportip6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_ipportip_type __read_mostly = {	/* descriptor handed to ip_set_type_register() */
+	.name		= "hash:ip,port,ip",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+	.dimension	= IPSET_DIM_THREE,
+	.family		= AF_UNSPEC,	/* create accepts AF_INET and AF_INET6 and picks the variant */
+	.revision	= 1,
+	.create		= hash_ipportip_create,
+	.create_policy	= {	/* attribute types for the create request */
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {	/* attribute types for element (adt) requests */
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init	/* module init hook: registers the hash:ip,port,ip set type */
+hash_ipportip_init(void)
+{
+	return ip_set_type_register(&hash_ipportip_type);	/* registration result is returned unchanged */
+}
+
+static void __exit	/* module exit hook: undoes hash_ipportip_init() */
+hash_ipportip_fini(void)
+{
+	ip_set_type_unregister(&hash_ipportip_type);
+}
+
+module_init(hash_ipportip_init);
+module_exit(hash_ipportip_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
new file mode 100644
index 00000000..565a7c5b
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -0,0 +1,616 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:ip,port,net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:ip,port,net type of IP sets");
+MODULE_ALIAS("ip_set_hash:ip,port,net");
+
+/* Type specific function prefix */
+#define TYPE		hash_ipportnet
+
+static bool
+hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_ipportnet4_same_set	hash_ipportnet_same_set
+#define hash_ipportnet6_same_set	hash_ipportnet_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_ipportnet4_elem {
+	__be32 ip;	/* host address, big-endian */
+	__be32 ip2;	/* network address, big-endian; masked with ip_set_netmask(cidr) before use */
+	__be16 port;	/* port, big-endian */
+	u8 cidr;	/* prefix length applied to ip2 */
+	u8 proto;	/* protocol; 0 is what data_isnull() tests and data_zero_out() writes */
+};
+
+/* Member elements with timeout support */
+struct hash_ipportnet4_telem {	/* same leading fields as hash_ipportnet4_elem; data_tlist() casts between them */
+	__be32 ip;
+	__be32 ip2;
+	__be16 port;
+	u8 cidr;
+	u8 proto;
+	unsigned long timeout;	/* read through ip_set_timeout_get() when the element is listed */
+};
+
+static inline bool	/* true when addresses, prefix length, port and protocol all match */
+hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
+			   const struct hash_ipportnet4_elem *ip2)
+{
+	if (ip1->ip != ip2->ip || ip1->ip2 != ip2->ip2)
+		return false;
+	return ip1->cidr == ip2->cidr &&
+	       ip1->port == ip2->port &&
+	       ip1->proto == ip2->proto;
+}
+
+static inline bool	/* an element whose protocol field is zero counts as null */
+hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem)
+{
+	return !elem->proto;
+}
+
+static inline void	/* byte-wise copy of one non-timeout element */
+hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst,
+			  const struct hash_ipportnet4_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportnet4_elem));
+}
+
+static inline void	/* re-key the element for prefix length cidr: mask ip2 and record the length */
+hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
+{
+	elem->ip2 &= ip_set_netmask(cidr);	/* cidr is not range-checked here; callers must pass a valid value */
+	elem->cidr = cidr;
+}
+
+static inline void	/* make the element satisfy data_isnull(); other fields are left as they are */
+hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem)
+{
+	elem->proto = 0;
+}
+
+static bool	/* emit one element's attributes into skb; 0 on success, 1 on failure */
+hash_ipportnet4_data_list(struct sk_buff *skb,
+			  const struct hash_ipportnet4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);	/* CIDR2: the prefix belongs to ip2, not ip */
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+static bool	/* like data_list(), plus the element's timeout; 0 on success, 1 on failure */
+hash_ipportnet4_data_tlist(struct sk_buff *skb,
+			   const struct hash_ipportnet4_elem *data)
+{
+	const struct hash_ipportnet4_telem *tdata =
+		(const struct hash_ipportnet4_telem *)data;	/* caller passes a telem through the elem pointer type */
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);	/* data and tdata alias the same object */
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int	/* run the requested adt operation on fields taken from a packet; pf and dim are unused */
+hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		     enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportnet4_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK	/* falls back to HOST_MASK when nets[0].cidr is 0 */
+	};
+
+	if (data.cidr == 0)	/* cannot trigger: the initializer above never yields 0 */
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;	/* tests start from the full host address */
+
+	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &data.port, &data.proto))
+		return -EINVAL;	/* no port/protocol could be obtained from the skb */
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip);
+	ip4addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2);
+	data.ip2 &= ip_set_netmask(data.cidr);
+
+	return adtfn(set, &data, h->timeout);	/* packet path always uses the set's own timeout */
+}
+
+static int	/* add/del/test driven by request attributes; may expand an IP range x port range */
+hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
+		     enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportnet4_elem data = { .cidr = HOST_MASK };	/* host prefix unless CIDR2 is given */
+	u32 ip, ip_to, p, port, port_to;
+	u32 timeout = h->timeout;	/* set default, overridden by IPSET_ATTR_TIMEOUT below */
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR2])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+	if (!data.cidr || data.cidr > HOST_MASK)	/* u8 attribute may carry up to 255; only 1..32 is a valid prefix */
+		return -IPSET_ERR_INVALID_CIDR;
+
+	data.ip2 &= ip_set_netmask(data.cidr);
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+
+		if (data.proto == 0)	/* 0 is reserved: it is the "null element" marker */
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || data.proto == IPPROTO_ICMP))
+		data.port = 0;	/* port field is meaningless for this protocol */
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST ||
+	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
+	      tb[IPSET_ATTR_PORT_TO])) {	/* single element: no range expansion */
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	ip = ntohl(data.ip);
+	if (tb[IPSET_ATTR_IP_TO]) {
+		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
+		if (ret)
+			return ret;
+		if (ip > ip_to)
+			swap(ip, ip_to);
+	} else if (tb[IPSET_ATTR_CIDR]) {
+		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+
+		if (!cidr || cidr > 32)	/* /0 would make the before() loop below a silent no-op */
+			return -IPSET_ERR_INVALID_CIDR;
+		ip &= ip_set_hostmask(cidr);
+		ip_to = ip | ~ip_set_hostmask(cidr);
+	} else
+		ip_to = ip;
+
+	port_to = port = ntohs(data.port);
+	if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
+		port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+		if (port > port_to)
+			swap(port, port_to);
+	}
+
+	for (; !before(ip_to, ip); ip++)	/* before() rather than <= so ip++ past ip_to still ends the loop */
+		for (p = port; p <= port_to; p++) {
+			data.ip = htonl(ip);
+			data.port = htons(p);
+			ret = adtfn(set, &data, timeout);
+
+			if (ret && !ip_set_eexist(ret, flags))
+				return ret;	/* first error not excused by ip_set_eexist() aborts the range */
+			else
+				ret = 0;
+		}
+	return ret;
+}
+
+static bool	/* compares the creation parameters of two sets */
+hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *lhs = a->data, *rhs = b->data;
+
+	/* htable_bits is deliberately not compared: a resize changes it */
+	if (lhs->maxelem != rhs->maxelem)
+		return false;
+	return lhs->timeout == rhs->timeout;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_ipportnet6_elem {	/* IPv6 member element without timeout */
+	union nf_inet_addr ip;	/* host address */
+	union nf_inet_addr ip2;	/* network address; masked with ip6_netmask(cidr) before use */
+	__be16 port;	/* port, big-endian */
+	u8 cidr;	/* prefix length applied to ip2 */
+	u8 proto;	/* protocol; 0 is what data_isnull() tests and data_zero_out() writes */
+};
+
+struct hash_ipportnet6_telem {	/* same leading fields as hash_ipportnet6_elem; data_tlist() casts between them */
+	union nf_inet_addr ip;
+	union nf_inet_addr ip2;
+	__be16 port;
+	u8 cidr;
+	u8 proto;
+	unsigned long timeout;	/* read through ip_set_timeout_get() when the element is listed */
+};
+
+static inline bool	/* true when addresses, prefix length, port and protocol all match */
+hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
+			   const struct hash_ipportnet6_elem *ip2)
+{
+	if (ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) ||
+	    ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6))
+		return false;
+	return ip1->cidr == ip2->cidr && ip1->port == ip2->port &&
+	       ip1->proto == ip2->proto;
+}
+
+static inline bool	/* an element whose protocol field is zero counts as null */
+hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem)
+{
+	return !elem->proto;
+}
+
+static inline void	/* byte-wise copy of one non-timeout element */
+hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst,
+			  const struct hash_ipportnet6_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_ipportnet6_elem));
+}
+
+static inline void	/* make the element satisfy data_isnull(); other fields are left as they are */
+hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem)
+{
+	elem->proto = 0;
+}
+
+static inline void	/* AND the address in place with the mask for the given prefix length */
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	ip->ip6[0] &= ip_set_netmask6(prefix)[0];	/* one 32-bit word at a time */
+	ip->ip6[1] &= ip_set_netmask6(prefix)[1];
+	ip->ip6[2] &= ip_set_netmask6(prefix)[2];
+	ip->ip6[3] &= ip_set_netmask6(prefix)[3];	/* prefix is not range-checked here; callers must validate it */
+}
+
+static inline void	/* re-key the element for prefix length cidr: mask ip2 and record the length */
+hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
+{
+	ip6_netmask(&elem->ip2, cidr);
+	elem->cidr = cidr;
+}
+
+static bool	/* emit one element's attributes into skb; 0 on success, 1 on failure */
+hash_ipportnet6_data_list(struct sk_buff *skb,
+			  const struct hash_ipportnet6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);	/* CIDR2: the prefix belongs to ip2, not ip */
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+static bool	/* like data_list(), plus the element's timeout; 0 on success, 1 on failure */
+hash_ipportnet6_data_tlist(struct sk_buff *skb,
+			   const struct hash_ipportnet6_elem *data)
+{
+	const struct hash_ipportnet6_telem *e =
+		(const struct hash_ipportnet6_telem *)data;	/* caller passes a telem through the elem pointer type */
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2);	/* data and e alias the same object */
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return 0;
+
+nla_put_failure:	/* presumably the jump target inside the NLA_PUT_* macros - confirm */
+	return 1;
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int	/* run the requested adt operation on fields taken from a packet; pf and dim are unused */
+hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		     enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportnet6_elem data = {
+		.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK	/* falls back to HOST_MASK when nets[0].cidr is 0 */
+	};
+
+	if (data.cidr == 0)	/* cannot trigger: the initializer above never yields 0 */
+		return -EINVAL;
+	if (adt == IPSET_TEST)
+		data.cidr = HOST_MASK;	/* tests start from the full host address */
+
+	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &data.port, &data.proto))
+		return -EINVAL;	/* no port/protocol could be obtained from the skb */
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
+	ip6addrptr(skb, flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
+	ip6_netmask(&data.ip2, data.cidr);
+
+	return adtfn(set, &data, h->timeout);	/* packet path always uses the set's own timeout */
+}
+
+static int	/* add/del/test driven by request attributes; only a port range may be expanded for IPv6 */
+hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
+		     enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];	/* handler for the requested operation */
+	struct hash_ipportnet6_elem data = { .cidr = HOST_MASK };	/* host prefix unless CIDR2 is given */
+	u32 port, port_to;
+	u32 timeout = h->timeout;	/* set default, overridden by IPSET_ATTR_TIMEOUT below */
+	bool with_ports = false;
+	int ret;
+
+	if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     tb[IPSET_ATTR_IP_TO] ||	/* address ranges are rejected for IPv6 */
+		     tb[IPSET_ATTR_CIDR]))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR2])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
+
+	if (!data.cidr || data.cidr > HOST_MASK)	/* u8 attribute may carry up to 255; only 1..128 is a valid prefix */
+		return -IPSET_ERR_INVALID_CIDR;
+
+	ip6_netmask(&data.ip2, data.cidr);
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+
+		if (data.proto == 0)	/* 0 is reserved: it is the "null element" marker */
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+
+	if (!(with_ports || data.proto == IPPROTO_ICMPV6))
+		data.port = 0;	/* port field is meaningless for this protocol */
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {	/* single element */
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+
+	port = ntohs(data.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	for (; port <= port_to; port++) {	/* u32 counter, so port_to == 65535 cannot wrap */
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;	/* first error not excused by ip_set_eexist() aborts the range */
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* Create hash:ip,port,net type of sets */
+
+static int	/* allocate and initialise the private data of a new hash:ip,port,net set; flags is unused */
+hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct ip_set_hash *h;
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)	/* clamp small requests up to the minimum */
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+	h = kzalloc(sizeof(*h)
+		    + sizeof(struct ip_set_hash_nets)
+		      * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);	/* header plus 32 or 128 ip_set_hash_nets slots */
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));	/* per-set random initval */
+	h->timeout = IPSET_NO_TIMEOUT;	/* replaced below when IPSET_ATTR_TIMEOUT is present */
+
+	hbits = htable_bits(hashsize);
+	h->table = ip_set_alloc(
+			sizeof(struct htable)
+			+ jhash_size(hbits) * sizeof(struct hbucket));	/* table header followed by the buckets */
+	if (!h->table) {
+		kfree(h);	/* release the header allocated above */
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+
+		set->variant = set->family == AF_INET
+			? &hash_ipportnet4_tvariant
+			: &hash_ipportnet6_tvariant;	/* timeout-aware variants */
+
+		if (set->family == AF_INET)
+			hash_ipportnet4_gc_init(set);	/* called only for sets created with a timeout */
+		else
+			hash_ipportnet6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_ipportnet4_variant : &hash_ipportnet6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_ipportnet_type __read_mostly = {	/* descriptor handed to ip_set_type_register() */
+	.name		= "hash:ip,port,net",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
+	.dimension	= IPSET_DIM_THREE,
+	.family		= AF_UNSPEC,	/* create accepts AF_INET and AF_INET6 and picks the variant */
+	.revision	= 1,
+	.create		= hash_ipportnet_create,
+	.create_policy	= {	/* attribute types for the create request */
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {	/* attribute types for element (adt) requests */
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP_TO]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_IP2]	= { .type = NLA_NESTED },
+		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },	/* prefix for IPSET_ATTR_IP (range expansion in uadt) */
+		[IPSET_ATTR_CIDR2]	= { .type = NLA_U8 },	/* prefix for IPSET_ATTR_IP2 (stored in the element) */
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init	/* module init hook: registers the hash:ip,port,net set type */
+hash_ipportnet_init(void)
+{
+	return ip_set_type_register(&hash_ipportnet_type);	/* registration result is returned unchanged */
+}
+
+static void __exit	/* module exit hook: undoes hash_ipportnet_init() */
+hash_ipportnet_fini(void)
+{
+	ip_set_type_unregister(&hash_ipportnet_type);
+}
+
+module_init(hash_ipportnet_init);
+module_exit(hash_ipportnet_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
new file mode 100644
index 00000000..2aeeabcd
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -0,0 +1,462 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net type of IP sets");
+MODULE_ALIAS("ip_set_hash:net");
+
+/* Type specific function prefix */
+#define TYPE		hash_net
+
+static bool
+hash_net_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_net4_same_set	hash_net_same_set
+#define hash_net6_same_set	hash_net_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_net4_elem {
+	__be32 ip;	/* network address; the adt paths mask it to cidr bits */
+	u16 padding0;	/* explicit filler, not touched by the helpers below */
+	u8 padding1;	/* explicit filler, not touched by the helpers below */
+	u8 cidr;	/* prefix length; 0 marks a null element (data_isnull) */
+};
+
+/* Member elements with timeout support */
+struct hash_net4_telem {
+	__be32 ip;	/* the first four members repeat struct hash_net4_elem, */
+	u16 padding0;	/* which is what lets hash_net4_data_tlist() cast an */
+	u8 padding1;	/* elem pointer to this type */
+	u8 cidr;
+	unsigned long timeout;	/* fed to ip_set_timeout_get() when listing */
+};
+
+static inline bool
+hash_net4_data_equal(const struct hash_net4_elem *ip1,
+		    const struct hash_net4_elem *ip2)
+{
+	return ip1->cidr == ip2->cidr && ip1->ip == ip2->ip;	/* prefix and address must both match */
+}
+
+static inline bool
+hash_net4_data_isnull(const struct hash_net4_elem *elem)
+{
+	return !elem->cidr;	/* a zero prefix length is the null marker */
+}
+
+static inline void
+hash_net4_data_copy(struct hash_net4_elem *dst,
+		    const struct hash_net4_elem *src)
+{
+	dst->cidr = src->cidr;	/* only ip and cidr are copied; */
+	dst->ip = src->ip;	/* the padding members of dst stay as they are */
+}
+
+static inline void
+hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;	/* record the prefix length ... */
+	elem->ip &= ip_set_netmask(cidr);	/* ... and AND the address with its mask */
+}
+
+/* Zero CIDR values cannot be stored */
+static inline void
+hash_net4_data_zero_out(struct hash_net4_elem *elem)
+{
+	elem->cidr = 0;	/* makes hash_net4_data_isnull() report true for elem */
+}
+
+static bool
+hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	return 0;	/* false: address and prefix length were both emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+static bool
+hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data)
+{
+	const struct hash_net4_telem *tdata =
+		(const struct hash_net4_telem *)data;	/* view with the timeout member */
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, tdata->cidr);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+
+	return 0;	/* false: all three attributes were emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+#define IP_SET_HASH_WITH_NETS
+
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_net4_elem e = { .cidr = HOST_MASK };
+
+	/*
+	 * A test always uses the full host mask.  Add/del use
+	 * nets[0].cidr when it is set and the host mask otherwise,
+	 * so the prefix length can never be zero at this point.
+	 */
+	if (adt != IPSET_TEST && map->nets[0].cidr)
+		e.cidr = map->nets[0].cidr;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &e.ip);
+	e.ip &= ip_set_netmask(e.cidr);
+
+	return set->variant->adt[adt](set, &e, map->timeout);
+}
+
+static int
+hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
+	       enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_net4_elem data = { .cidr = HOST_MASK };
+	u32 timeout = h->timeout;
+	int ret;
+
+	/* Netlink-attribute driven add/del/test of one IPv4 network.
+	 * IPSET_ATTR_IP is mandatory; CIDR, TIMEOUT and LINENO are optional. */
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+	/* Only 1..HOST_MASK is a valid prefix; the raw u8 may be up to 255 */
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+	data.ip &= ip_set_netmask(data.cidr);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+	ret = adtfn(set, &data, timeout);
+	/* ip_set_eexist() decides whether this result is reported as success */
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+static bool
+hash_net_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *first = a->data, *second = b->data;
+
+	/* htable_bits is deliberately not compared: resizing changes it */
+	if (first->timeout != second->timeout)
+		return false;
+	return first->maxelem == second->maxelem;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_net6_elem {
+	union nf_inet_addr ip;	/* network address; the adt paths mask it with ip6_netmask() */
+	u16 padding0;	/* explicit filler, not touched by the helpers below */
+	u8 padding1;	/* explicit filler, not touched by the helpers below */
+	u8 cidr;	/* prefix length; 0 marks a null element (data_isnull) */
+};
+
+struct hash_net6_telem {
+	union nf_inet_addr ip;	/* the first four members repeat struct hash_net6_elem, */
+	u16 padding0;	/* which is what lets hash_net6_data_tlist() cast an */
+	u8 padding1;	/* elem pointer to this type */
+	u8 cidr;
+	unsigned long timeout;	/* fed to ip_set_timeout_get() when listing */
+};
+
+static inline bool
+hash_net6_data_equal(const struct hash_net6_elem *ip1,
+		     const struct hash_net6_elem *ip2)
+{
+	return ip1->cidr == ip2->cidr &&	/* prefix length first, then the address */
+	       !ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6);
+}
+
+static inline bool
+hash_net6_data_isnull(const struct hash_net6_elem *elem)
+{
+	return !elem->cidr;	/* a zero prefix length is the null marker */
+}
+
+static inline void
+hash_net6_data_copy(struct hash_net6_elem *dst,
+		    const struct hash_net6_elem *src)
+{
+	dst->cidr = src->cidr;	/* only cidr and the address are copied */
+	ipv6_addr_copy(&dst->ip.in6, &src->ip.in6);
+}
+
+static inline void
+hash_net6_data_zero_out(struct hash_net6_elem *elem)
+{
+	elem->cidr = 0;	/* makes hash_net6_data_isnull() report true for elem */
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	const __be32 *mask = ip_set_netmask6(prefix);	/* four 32-bit mask words */
+	int w;
+	for (w = 0; w < 4; w++)
+		ip->ip6[w] &= mask[w];
+}
+
+static inline void
+hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;	/* record the prefix length ... */
+	ip6_netmask(&elem->ip, cidr);	/* ... and mask the address accordingly */
+}
+
+static bool
+hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	return 0;	/* false: address and prefix length were both emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+static bool
+hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data)
+{
+	const struct hash_net6_telem *e =
+		(const struct hash_net6_telem *)data;	/* view with the timeout member */
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, e->cidr);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return 0;	/* false: all three attributes were emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
+	       enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_net6_elem e = { .cidr = HOST_MASK };
+
+	/*
+	 * A test always uses the full host mask.  Add/del use
+	 * nets[0].cidr when it is set and the host mask otherwise,
+	 * so the prefix length can never be zero at this point.
+	 */
+	if (adt != IPSET_TEST && map->nets[0].cidr)
+		e.cidr = map->nets[0].cidr;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	ip6_netmask(&e.ip, e.cidr);
+
+	return set->variant->adt[adt](set, &e, map->timeout);
+}
+
+static int
+hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
+	       enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_net6_elem data = { .cidr = HOST_MASK };
+	u32 timeout = h->timeout;
+	int ret;
+
+	/* Netlink-attribute driven add/del/test of one IPv6 network.
+	 * IPSET_ATTR_IP is mandatory; CIDR, TIMEOUT and LINENO are optional. */
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+	/* Only 1..HOST_MASK is a valid prefix; the raw u8 may be up to 255 */
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+	ip6_netmask(&data.ip, data.cidr);
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+	ret = adtfn(set, &data, timeout);
+	/* ip_set_eexist() decides whether this result is reported as success */
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Create hash:net type of sets: read the optional HASHSIZE, MAXELEM and
+ * TIMEOUT attributes, allocate the set data and select the variant. */
+static int
+hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	struct ip_set_hash *h;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+	/* Optional sizing attributes override the defaults */
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+	/* The header is followed by 32 (IPv4) or 128 (IPv6) nets[] entries */
+	h = kzalloc(sizeof(*h)
+		    + sizeof(struct ip_set_hash_nets)
+		      * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;
+	hbits = htable_bits(hashsize);
+	/* hashsize is only bounded from below: refuse bucket counts or
+	 * allocation sizes that would overflow.  h->table stays NULL (from
+	 * kzalloc) in that case, so the check below returns -ENOMEM. */
+	if (hbits < 32 && jhash_size(hbits) <=
+	    (~(size_t)0 - sizeof(struct htable)) / sizeof(struct hbucket))
+		h->table = ip_set_alloc(sizeof(struct htable) +
+			jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		set->variant = set->family == AF_INET
+			? &hash_net4_tvariant : &hash_net6_tvariant;
+		if (set->family == AF_INET)
+			hash_net4_gc_init(set);
+		else
+			hash_net6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_net4_variant : &hash_net6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_net_type __read_mostly = {	/* registered at module init */
+	.name		= "hash:net",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP,
+	.dimension	= IPSET_DIM_ONE,
+	.family		= AF_UNSPEC,	/* hash_net_create() accepts AF_INET and AF_INET6 */
+	.revision	= 0,
+	.create		= hash_net_create,
+	.create_policy	= {	/* attribute types for hash_net_create() */
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {	/* NOTE(review): the uadt functions read IPSET_ATTR_LINENO, which has no entry here - confirm */
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+hash_net_init(void)
+{
+	return ip_set_type_register(&hash_net_type);	/* status becomes the module init result */
+}
+
+static void __exit
+hash_net_fini(void)
+{
+	ip_set_type_unregister(&hash_net_type);	/* undo the registration done at init */
+}
+
+module_init(hash_net_init);
+module_exit(hash_net_fini);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
new file mode 100644
index 00000000..e50d9bb8
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -0,0 +1,566 @@
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the hash:net,port type */
+
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/ipset/pfxlen.h>
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_getport.h>
+#include <linux/netfilter/ipset/ip_set_hash.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("hash:net,port type of IP sets");
+MODULE_ALIAS("ip_set_hash:net,port");
+
+/* Type specific function prefix */
+#define TYPE		hash_netport
+
+static bool
+hash_netport_same_set(const struct ip_set *a, const struct ip_set *b);
+
+#define hash_netport4_same_set	hash_netport_same_set
+#define hash_netport6_same_set	hash_netport_same_set
+
+/* The type variant functions: IPv4 */
+
+/* Member elements without timeout */
+struct hash_netport4_elem {
+	__be32 ip;	/* network address; the adt paths mask it to cidr bits */
+	__be16 port;	/* network byte order (set via nla_get_be16()/htons()) */
+	u8 proto;	/* protocol number; 0 marks a null element (data_isnull) */
+	u8 cidr;	/* prefix length */
+};
+
+/* Member elements with timeout support */
+struct hash_netport4_telem {
+	__be32 ip;	/* the first four members repeat struct hash_netport4_elem, */
+	__be16 port;	/* which is what lets hash_netport4_data_tlist() cast an */
+	u8 proto;	/* elem pointer to this type */
+	u8 cidr;
+	unsigned long timeout;	/* fed to ip_set_timeout_get() when listing */
+};
+
+static inline bool
+hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
+			 const struct hash_netport4_elem *ip2)
+{
+	if (ip1->cidr != ip2->cidr || ip1->proto != ip2->proto)
+		return false;	/* the byte-sized members already differ */
+
+	return ip1->port == ip2->port && ip1->ip == ip2->ip;
+}
+
+static inline bool
+hash_netport4_data_isnull(const struct hash_netport4_elem *elem)
+{
+	return !elem->proto;	/* protocol 0 is the null marker for this type */
+}
+
+static inline void
+hash_netport4_data_copy(struct hash_netport4_elem *dst,
+			const struct hash_netport4_elem *src)
+{
+	dst->cidr = src->cidr;	/* member-wise copy of all four fields */
+	dst->proto = src->proto;
+	dst->port = src->port;
+	dst->ip = src->ip;
+}
+
+static inline void
+hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;	/* record the prefix length ... */
+	elem->ip &= ip_set_netmask(cidr);	/* ... and AND the address with its mask */
+}
+
+static inline void
+hash_netport4_data_zero_out(struct hash_netport4_elem *elem)
+{
+	elem->proto = 0;	/* makes hash_netport4_data_isnull() report true for elem */
+}
+
+static bool
+hash_netport4_data_list(struct sk_buff *skb,
+			const struct hash_netport4_elem *data)
+{
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return 0;	/* false: all four attributes were emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+static bool
+hash_netport4_data_tlist(struct sk_buff *skb,
+			 const struct hash_netport4_elem *data)
+{
+	const struct hash_netport4_telem *tdata =
+		(const struct hash_netport4_telem *)data;	/* view with the timeout member */
+
+	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);	/* data and tdata alias the same object */
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(tdata->timeout)));
+
+	return 0;	/* false: all five attributes were emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+#define PF		4
+#define HOST_MASK	32
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
+		   enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_netport4_elem e = { .cidr = HOST_MASK };
+
+	/*
+	 * A test always uses the full host mask.  Add/del use
+	 * nets[0].cidr when it is set and the host mask otherwise,
+	 * so the prefix length can never be zero at this point.
+	 */
+	if (adt != IPSET_TEST && map->nets[0].cidr)
+		e.cidr = map->nets[0].cidr;
+	/* Port and protocol are taken from the packet; bail out if that fails */
+	if (!ip_set_get_ip4_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip4addrptr(skb, flags & IPSET_DIM_ONE_SRC, &e.ip);
+	e.ip &= ip_set_netmask(e.cidr);
+
+	return set->variant->adt[adt](set, &e, map->timeout);
+}
+
+static int
+hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
+		   enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netport4_elem data = { .cidr = HOST_MASK };
+	u32 port, port_to;
+	u32 timeout = h->timeout;
+	bool with_ports = false;
+	int ret;
+
+	/* Netlink-attribute driven add/del/test of IPv4 net,port elements.
+	 * IP, PORT and PROTO are mandatory; the other attributes are optional. */
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+	/* Only 1..HOST_MASK is a valid prefix; the raw u8 may be up to 255 */
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+	data.ip &= ip_set_netmask(data.cidr);
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+		if (data.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+	/* The port value is kept only for port-carrying protocols and ICMP */
+	if (!(with_ports || data.proto == IPPROTO_ICMP))
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+	/* Single element: a test, a portless protocol, or no PORT_TO given */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+	/* Port range: apply the operation to every port, lowest first */
+	port = ntohs(data.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	for (; port <= port_to; port++) {
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+static bool
+hash_netport_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct ip_set_hash *first = a->data, *second = b->data;
+
+	/* htable_bits is deliberately not compared: resizing changes it */
+	if (first->timeout != second->timeout)
+		return false;
+	return first->maxelem == second->maxelem;
+}
+
+/* The type variant functions: IPv6 */
+
+struct hash_netport6_elem {
+	union nf_inet_addr ip;	/* network address; the adt paths mask it with ip6_netmask() */
+	__be16 port;	/* network byte order (set via nla_get_be16()/htons()) */
+	u8 proto;	/* protocol number; 0 marks a null element (data_isnull) */
+	u8 cidr;	/* prefix length */
+};
+
+struct hash_netport6_telem {
+	union nf_inet_addr ip;	/* the first four members repeat struct hash_netport6_elem, */
+	__be16 port;	/* which is what lets hash_netport6_data_tlist() cast an */
+	u8 proto;	/* elem pointer to this type */
+	u8 cidr;
+	unsigned long timeout;	/* fed to ip_set_timeout_get() when listing */
+};
+
+static inline bool
+hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
+			 const struct hash_netport6_elem *ip2)
+{
+	if (ip1->cidr != ip2->cidr || ip1->proto != ip2->proto)
+		return false;	/* the byte-sized members already differ */
+	return ip1->port == ip2->port &&
+	       !ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6);
+}
+
+static inline bool
+hash_netport6_data_isnull(const struct hash_netport6_elem *elem)
+{
+	return !elem->proto;	/* protocol 0 is the null marker for this type */
+}
+
+static inline void
+hash_netport6_data_copy(struct hash_netport6_elem *dst,
+			const struct hash_netport6_elem *src)
+{
+	memcpy(dst, src, sizeof(struct hash_netport6_elem));	/* whole element in one go */
+}
+
+static inline void
+hash_netport6_data_zero_out(struct hash_netport6_elem *elem)
+{
+	elem->proto = 0;	/* makes hash_netport6_data_isnull() report true for elem */
+}
+
+static inline void
+ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+{
+	const __be32 *mask = ip_set_netmask6(prefix);	/* four 32-bit mask words */
+	int w;
+	for (w = 0; w < 4; w++)
+		ip->ip6[w] &= mask[w];
+}
+
+static inline void
+hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
+{
+	elem->cidr = cidr;	/* record the prefix length ... */
+	ip6_netmask(&elem->ip, cidr);	/* ... and mask the address accordingly */
+}
+
+static bool
+hash_netport6_data_list(struct sk_buff *skb,
+			const struct hash_netport6_elem *data)
+{
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	return 0;	/* false: all four attributes were emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+static bool
+hash_netport6_data_tlist(struct sk_buff *skb,
+			 const struct hash_netport6_elem *data)
+{
+	const struct hash_netport6_telem *e =
+		(const struct hash_netport6_telem *)data;	/* view with the timeout member */
+
+	NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip);
+	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port);	/* data and e alias the same object */
+	NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr);
+	NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto);
+	NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+		      htonl(ip_set_timeout_get(e->timeout)));
+	return 0;	/* false: all five attributes were emitted */
+	/* The NLA_PUT* macros are expected to jump here when they fail */
+nla_put_failure:
+	return 1;	/* true: reached only through the failure label */
+}
+
+#undef PF
+#undef HOST_MASK
+
+#define PF		6
+#define HOST_MASK	128
+#include <linux/netfilter/ipset/ip_set_ahash.h>
+
+static int
+hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
+		   enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	const struct ip_set_hash *map = set->data;
+	struct hash_netport6_elem e = { .cidr = HOST_MASK };
+
+	/*
+	 * A test always uses the full host mask.  Add/del use
+	 * nets[0].cidr when it is set and the host mask otherwise,
+	 * so the prefix length can never be zero at this point.
+	 */
+	if (adt != IPSET_TEST && map->nets[0].cidr)
+		e.cidr = map->nets[0].cidr;
+	/* Port and protocol are taken from the packet; bail out if that fails */
+	if (!ip_set_get_ip6_port(skb, flags & IPSET_DIM_TWO_SRC,
+				 &e.port, &e.proto))
+		return -EINVAL;
+
+	ip6addrptr(skb, flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+	ip6_netmask(&e.ip, e.cidr);
+
+	return set->variant->adt[adt](set, &e, map->timeout);
+}
+
+static int
+hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
+		   enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	const struct ip_set_hash *h = set->data;
+	ipset_adtfn adtfn = set->variant->adt[adt];
+	struct hash_netport6_elem data = { .cidr = HOST_MASK };
+	u32 port, port_to;
+	u32 timeout = h->timeout;
+	bool with_ports = false;
+	int ret;
+
+	/* Netlink-attribute driven add/del/test of IPv6 net,port elements.
+	 * IP, PORT and PROTO are mandatory; the other attributes are optional. */
+	if (unlikely(!tb[IPSET_ATTR_IP] ||
+		     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+	if (ret)
+		return ret;
+	/* Only 1..HOST_MASK is a valid prefix; the raw u8 may be up to 255 */
+	if (tb[IPSET_ATTR_CIDR])
+		data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+	if (!data.cidr || data.cidr > HOST_MASK)
+		return -IPSET_ERR_INVALID_CIDR;
+	ip6_netmask(&data.ip, data.cidr);
+
+	if (tb[IPSET_ATTR_PORT])
+		data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+	else
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_PROTO]) {
+		data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+		with_ports = ip_set_proto_with_ports(data.proto);
+		if (data.proto == 0)
+			return -IPSET_ERR_INVALID_PROTO;
+	} else
+		return -IPSET_ERR_MISSING_PROTO;
+	/* The port value is kept only for port-carrying protocols and ICMPv6 */
+	if (!(with_ports || data.proto == IPPROTO_ICMPV6))
+		data.port = 0;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout(h->timeout))
+			return -IPSET_ERR_TIMEOUT;
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+	/* Single element: a test, a portless protocol, or no PORT_TO given */
+	if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
+		ret = adtfn(set, &data, timeout);
+		return ip_set_eexist(ret, flags) ? 0 : ret;
+	}
+	/* Port range: apply the operation to every port, lowest first */
+	port = ntohs(data.port);
+	port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
+	if (port > port_to)
+		swap(port, port_to);
+
+	for (; port <= port_to; port++) {
+		data.port = htons(port);
+		ret = adtfn(set, &data, timeout);
+		if (ret && !ip_set_eexist(ret, flags))
+			return ret;
+		else
+			ret = 0;
+	}
+	return ret;
+}
+
+/* Create hash:net,port type of sets: read the optional HASHSIZE, MAXELEM and
+ * TIMEOUT attributes, allocate the set data and select the variant. */
+static int
+hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	struct ip_set_hash *h;
+	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+	u8 hbits;
+
+	if (!(set->family == AF_INET || set->family == AF_INET6))
+		return -IPSET_ERR_INVALID_FAMILY;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+	/* Optional sizing attributes override the defaults */
+	if (tb[IPSET_ATTR_HASHSIZE]) {
+		hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+		if (hashsize < IPSET_MIMINAL_HASHSIZE)
+			hashsize = IPSET_MIMINAL_HASHSIZE;
+	}
+
+	if (tb[IPSET_ATTR_MAXELEM])
+		maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+	/* The header is followed by 32 (IPv4) or 128 (IPv6) nets[] entries */
+	h = kzalloc(sizeof(*h)
+		    + sizeof(struct ip_set_hash_nets)
+		      * (set->family == AF_INET ? 32 : 128), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->maxelem = maxelem;
+	get_random_bytes(&h->initval, sizeof(h->initval));
+	h->timeout = IPSET_NO_TIMEOUT;
+	hbits = htable_bits(hashsize);
+	/* hashsize is only bounded from below: refuse bucket counts or
+	 * allocation sizes that would overflow.  h->table stays NULL (from
+	 * kzalloc) in that case, so the check below returns -ENOMEM. */
+	if (hbits < 32 && jhash_size(hbits) <=
+	    (~(size_t)0 - sizeof(struct htable)) / sizeof(struct hbucket))
+		h->table = ip_set_alloc(sizeof(struct htable) +
+			jhash_size(hbits) * sizeof(struct hbucket));
+	if (!h->table) {
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->table->htable_bits = hbits;
+	set->data = h;
+
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+		set->variant = set->family == AF_INET
+			? &hash_netport4_tvariant : &hash_netport6_tvariant;
+		if (set->family == AF_INET)
+			hash_netport4_gc_init(set);
+		else
+			hash_netport6_gc_init(set);
+	} else {
+		set->variant = set->family == AF_INET
+			? &hash_netport4_variant : &hash_netport6_variant;
+	}
+
+	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+		 set->name, jhash_size(h->table->htable_bits),
+		 h->table->htable_bits, h->maxelem, set->data, h->table);
+
+	return 0;
+}
+
+static struct ip_set_type hash_netport_type __read_mostly = {	/* registered at module init */
+	.name		= "hash:net,port",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
+	.dimension	= IPSET_DIM_TWO,
+	.family		= AF_UNSPEC,	/* hash_netport_create() accepts AF_INET and AF_INET6 */
+	.revision	= 1,
+	.create		= hash_netport_create,
+	.create_policy	= {	/* attribute types for hash_netport_create() */
+		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_MAXELEM]	= { .type = NLA_U32 },
+		[IPSET_ATTR_PROBES]	= { .type = NLA_U8 },
+		[IPSET_ATTR_RESIZE]	= { .type = NLA_U8  },
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },	/* not read by hash_netport_create() */
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {	/* attribute types read by the uadt functions */
+		[IPSET_ATTR_IP]		= { .type = NLA_NESTED },
+		[IPSET_ATTR_PORT]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PORT_TO]	= { .type = NLA_U16 },
+		[IPSET_ATTR_PROTO]	= { .type = NLA_U8 },
+		[IPSET_ATTR_CIDR]	= { .type = NLA_U8 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init
+hash_netport_init(void)
+{
+	return ip_set_type_register(&hash_netport_type);	/* status becomes the module init result */
+}
+
+static void __exit
+hash_netport_fini(void)
+{
+	ip_set_type_unregister(&hash_netport_type);	/* undo the registration done at init */
+}
+
+module_init(hash_netport_init);
+module_exit(hash_netport_fini);
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
new file mode 100644
index 00000000..e9159e99
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -0,0 +1,577 @@
+/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module implementing an IP set type: the list:set type */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+
+#include <linux/netfilter/ipset/ip_set.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set_list.h>
+
+/*
+ * Module metadata.  NOTE(review): the "ip_set_list:set" alias is presumably
+ * what lets the module be loaded on demand by set type name - confirm
+ * against the ipset core.
+ */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("list:set type of IP sets");
+MODULE_ALIAS("ip_set_list:set");
+
+/*
+ * Member elements without and with timeout.
+ *
+ * Both layouts start with the id, and the code in this file reads the id
+ * of a struct set_telem slot through a struct set_elem pointer.  A slot
+ * whose id is IPSET_INVALID_ID is unused.
+ */
+struct set_elem {
+	ip_set_id_t id;
+};
+
+/* Element of a list created with a timeout: the member id plus its expiry. */
+struct set_telem {
+	ip_set_id_t id;
+	unsigned long timeout;	/* as returned by ip_set_timeout_set() */
+};
+
+/*
+ * Type structure
+ *
+ * The members array is allocated together with the structure and holds
+ * 'size' slots of 'dsize' bytes each (struct set_elem or struct set_telem),
+ * so it must be indexed with list_set_elem()/list_set_telem() and never
+ * as members[i].
+ */
+struct list_set {
+	size_t dsize;		/* element size */
+	u32 size;		/* size of set list array */
+	u32 timeout;		/* timeout value, IPSET_NO_TIMEOUT if unused */
+	struct timer_list gc;	/* garbage collection */
+	struct set_elem members[0]; /* the set members */
+};
+
+/* Return slot @id of the member array; slots are map->dsize bytes apart. */
+static inline struct set_elem *
+list_set_elem(const struct list_set *map, u32 id)
+{
+	void *slot = (void *)map->members + id * map->dsize;
+
+	return slot;
+}
+
+/* Same as list_set_elem(), but typed for lists that store a timeout. */
+static inline struct set_telem *
+list_set_telem(const struct list_set *map, u32 id)
+{
+	void *slot = (void *)map->members + id * map->dsize;
+
+	return slot;
+}
+
+/* Result of ip_set_timeout_test() on the timeout stored in slot @id. */
+static inline bool
+list_set_timeout(const struct list_set *map, u32 id)
+{
+	return ip_set_timeout_test(list_set_telem(map, id)->timeout);
+}
+
+/* Result of ip_set_timeout_expired() on the timeout stored in slot @id. */
+static inline bool
+list_set_expired(const struct list_set *map, u32 id)
+{
+	return ip_set_timeout_expired(list_set_telem(map, id)->timeout);
+}
+
+/* Set list without and with timeout */
+
+/*
+ * Kernel-side add/del/test: pass the skb to the member sets in list order.
+ *
+ * Expired members are skipped.  Reaching an unused slot ends the walk with
+ * 0.  A test returns the first positive result of a member; add and del
+ * return 0 as soon as a member reports 0.  If the whole array is walked
+ * without such a result, -EINVAL is returned.
+ */
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+	      enum ipset_adt adt, u8 pf, u8 dim, u8 flags)
+{
+	struct list_set *map = set->data;
+	const bool timed = with_timeout(map->timeout);
+	u32 pos;
+
+	for (pos = 0; pos < map->size; pos++) {
+		const struct set_elem *member = list_set_elem(map, pos);
+		int res;
+
+		if (member->id == IPSET_INVALID_ID)
+			return 0;
+		if (timed && list_set_expired(map, pos))
+			continue;
+
+		if (adt == IPSET_TEST) {
+			res = ip_set_test(member->id, skb, pf, dim, flags);
+			if (res > 0)
+				return res;
+		} else if (adt == IPSET_ADD) {
+			res = ip_set_add(member->id, skb, pf, dim, flags);
+			if (res == 0)
+				return res;
+		} else if (adt == IPSET_DEL) {
+			res = ip_set_del(member->id, skb, pf, dim, flags);
+			if (res == 0)
+				return res;
+		}
+	}
+	return -EINVAL;
+}
+
+/*
+ * True when the slot right after @i holds @id and that entry has not
+ * expired; false otherwise, including when @i is the last slot.
+ */
+static bool
+next_id_eq(const struct list_set *map, u32 i, ip_set_id_t id)
+{
+	const u32 next = i + 1;
+
+	if (next >= map->size)
+		return false;
+	if (list_set_elem(map, next)->id != id)
+		return false;
+
+	return !(with_timeout(map->timeout) && list_set_expired(map, next));
+}
+
+/*
+ * Store @id in slot @i and carry the previous content of each slot over
+ * to the next one.  The walk stops after IPSET_INVALID_ID has been written
+ * into a slot, or at the end of the array (the carried id is then lost).
+ */
+static void
+list_elem_add(struct list_set *map, u32 i, ip_set_id_t id)
+{
+	u32 pos;
+
+	for (pos = i; pos < map->size; pos++) {
+		struct set_elem *slot = list_set_elem(map, pos);
+		ip_set_id_t carried = slot->id;
+
+		slot->id = id;
+		id = carried;
+		if (slot->id == IPSET_INVALID_ID)
+			break;
+	}
+}
+
+/*
+ * Timeout variant of list_elem_add(): the id and the timeout are stored
+ * in slot @i and the previous pair of each slot is carried to the next.
+ */
+static void
+list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id,
+	       unsigned long timeout)
+{
+	u32 pos;
+
+	for (pos = i; pos < map->size; pos++) {
+		struct set_telem *slot = list_set_telem(map, pos);
+		ip_set_id_t carried_id = slot->id;
+		unsigned long carried_timeout = slot->timeout;
+
+		slot->id = id;
+		slot->timeout = timeout;
+		id = carried_id;
+		timeout = carried_timeout;
+		if (slot->id == IPSET_INVALID_ID)
+			break;
+	}
+}
+
+/*
+ * Insert member @id at slot @i, moving the entries from @i onwards one
+ * slot towards the end of the array.
+ *
+ * @timeout is converted with ip_set_timeout_set() before it is stored and
+ * is only used when the list was created with a timeout.  The reference
+ * the caller holds on @id is kept by the list.  Always returns 0.
+ */
+static int
+list_set_add(struct list_set *map, u32 i, ip_set_id_t id,
+	     unsigned long timeout)
+{
+	const struct set_elem *e = list_set_elem(map, i);
+
+	if (e->id != IPSET_INVALID_ID) {
+		const struct set_elem *last =
+			list_set_elem(map, map->size - 1);
+
+		/*
+		 * Slot i is in use, so the insert moves entries.  Used slots
+		 * are contiguous from the start of the array, hence a used
+		 * last slot means the list is full: its entry is replaced
+		 * (i is the last slot, e.g. add new,before,last) or pushed
+		 * off the end.  Drop the list's reference on it either way,
+		 * otherwise that set could never be destroyed.
+		 */
+		if (last->id != IPSET_INVALID_ID)
+			ip_set_put_byindex(last->id);
+	}
+	if (with_timeout(map->timeout))
+		list_elem_tadd(map, i, id, ip_set_timeout_set(timeout));
+	else
+		list_elem_add(map, i, id);
+
+	return 0;
+}
+
+/*
+ * Remove the entry in slot @i: drop the reference held on its set and move
+ * the following entries one slot towards the start of the array.
+ * Always returns 0.
+ */
+static int
+list_set_del(struct list_set *map, u32 i)
+{
+	struct set_elem *a = list_set_elem(map, i), *b;
+
+	ip_set_put_byindex(a->id);
+
+	for (; i < map->size - 1; i++) {
+		b = list_set_elem(map, i + 1);
+		a->id = b->id;
+		if (with_timeout(map->timeout))
+			((struct set_telem *)a)->timeout =
+				((struct set_telem *)b)->timeout;
+		a = b;
+		/* An unused slot was just copied down: nothing left to move */
+		if (a->id == IPSET_INVALID_ID)
+			break;
+	}
+	/* Last element */
+	a->id = IPSET_INVALID_ID;
+	return 0;
+}
+
+/*
+ * Userspace add/del/test of a member set, driven by netlink attributes.
+ *
+ * IPSET_ATTR_NAME names the member.  IPSET_ATTR_NAMEREF optionally names a
+ * reference member; the operation then applies to the position right
+ * before it (IPSET_FLAG_BEFORE set in IPSET_ATTR_CADT_FLAGS) or right
+ * after it.  The local 'before' is > 0 for "before", < 0 for "after" and
+ * 0 when no reference member was given.
+ *
+ * References are taken on both named sets.  The one on the reference
+ * member is always dropped again; the one on the member itself is kept
+ * only by a successful add.
+ *
+ * Returns the result of the operation (for a test: non-zero on match),
+ * or a negative error.  Results for which ip_set_eexist() is true are
+ * reported as 0.
+ */
+static int
+list_set_uadt(struct ip_set *set, struct nlattr *tb[],
+	      enum ipset_adt adt, u32 *lineno, u32 flags)
+{
+	struct list_set *map = set->data;
+	bool with_timeout = with_timeout(map->timeout);
+	int before = 0;
+	u32 timeout = map->timeout;
+	ip_set_id_t id, refid = IPSET_INVALID_ID;
+	const struct set_elem *elem;
+	struct ip_set *s;
+	u32 i;
+	int ret = 0;
+
+	if (unlikely(!tb[IPSET_ATTR_NAME] ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+		return -IPSET_ERR_PROTOCOL;
+
+	if (tb[IPSET_ATTR_LINENO])
+		*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
+
+	id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
+	if (id == IPSET_INVALID_ID)
+		return -IPSET_ERR_NAME;
+	/* "Loop detection" */
+	/* A set whose type has IPSET_TYPE_NAME (as list:set does) is refused */
+	if (s->type->features & IPSET_TYPE_NAME) {
+		ret = -IPSET_ERR_LOOP;
+		goto finish;
+	}
+
+	if (tb[IPSET_ATTR_CADT_FLAGS]) {
+		u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+		before = f & IPSET_FLAG_BEFORE;
+	}
+
+	/* "before" makes no sense without a reference member */
+	if (before && !tb[IPSET_ATTR_NAMEREF]) {
+		ret = -IPSET_ERR_BEFORE;
+		goto finish;
+	}
+
+	if (tb[IPSET_ATTR_NAMEREF]) {
+		refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
+					  &s);
+		if (refid == IPSET_INVALID_ID) {
+			ret = -IPSET_ERR_NAMEREF;
+			goto finish;
+		}
+		/* Reference member without the BEFORE flag means "after" */
+		if (!before)
+			before = -1;
+	}
+	if (tb[IPSET_ATTR_TIMEOUT]) {
+		if (!with_timeout) {
+			ret = -IPSET_ERR_TIMEOUT;
+			goto finish;
+		}
+		timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+	}
+
+	switch (adt) {
+	case IPSET_TEST:
+		/* Walk until a match sets ret or the used slots end */
+		for (i = 0; i < map->size && !ret; i++) {
+			elem = list_set_elem(map, i);
+			if (elem->id == IPSET_INVALID_ID ||
+			    (before != 0 && i + 1 >= map->size))
+				break;
+			else if (with_timeout && list_set_expired(map, i))
+				continue;
+			else if (before > 0 && elem->id == id)
+				ret = next_id_eq(map, i, refid);
+			else if (before < 0 && elem->id == refid)
+				ret = next_id_eq(map, i, id);
+			else if (before == 0 && elem->id == id)
+				ret = 1;
+		}
+		break;
+	case IPSET_ADD:
+		/* First pass: refuse a member that is already listed */
+		for (i = 0; i < map->size && !ret; i++) {
+			elem = list_set_elem(map, i);
+			if (elem->id == id &&
+			    !(with_timeout && list_set_expired(map, i)))
+				ret = -IPSET_ERR_EXIST;
+		}
+		if (ret == -IPSET_ERR_EXIST)
+			break;
+		/*
+		 * Second pass: find the insert position, i.e. the first
+		 * unused slot (no reference member) or the slot of the
+		 * reference member.
+		 */
+		ret = -IPSET_ERR_LIST_FULL;
+		for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+			elem = list_set_elem(map, i);
+			if (elem->id == IPSET_INVALID_ID)
+				ret = before != 0 ? -IPSET_ERR_REF_EXIST
+					: list_set_add(map, i, id, timeout);
+			else if (elem->id != refid)
+				continue;
+			else if (with_timeout && list_set_expired(map, i))
+				ret = -IPSET_ERR_REF_EXIST;
+			else if (before)
+				ret = list_set_add(map, i, id, timeout);
+			else if (i + 1 < map->size)
+				ret = list_set_add(map, i + 1, id, timeout);
+		}
+		break;
+	case IPSET_DEL:
+		ret = -IPSET_ERR_EXIST;
+		for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) {
+			elem = list_set_elem(map, i);
+			if (elem->id == IPSET_INVALID_ID) {
+				ret = before != 0 ? -IPSET_ERR_REF_EXIST
+						  : -IPSET_ERR_EXIST;
+				break;
+			} else if (with_timeout && list_set_expired(map, i))
+				continue;
+			else if (elem->id == id &&
+				 (before == 0 ||
+				  (before > 0 &&
+				   next_id_eq(map, i, refid))))
+				ret = list_set_del(map, i);
+			else if (before < 0 &&
+				 elem->id == refid &&
+				 next_id_eq(map, i, id))
+				ret = list_set_del(map, i + 1);
+		}
+		break;
+	default:
+		break;
+	}
+
+finish:
+	if (refid != IPSET_INVALID_ID)
+		ip_set_put_byindex(refid);
+	/* Only a successful add keeps the reference taken on id above */
+	if (adt != IPSET_ADD || ret)
+		ip_set_put_byindex(id);
+
+	return ip_set_eexist(ret, flags) ? 0 : ret;
+}
+
+/* Empty the list: drop the reference on every member and free its slot. */
+static void
+list_set_flush(struct ip_set *set)
+{
+	struct list_set *map = set->data;
+	u32 pos;
+
+	for (pos = 0; pos < map->size; pos++) {
+		struct set_elem *slot = list_set_elem(map, pos);
+
+		if (slot->id == IPSET_INVALID_ID)
+			continue;
+		ip_set_put_byindex(slot->id);
+		slot->id = IPSET_INVALID_ID;
+	}
+}
+
+/*
+ * Destroy the set data: stop the garbage collector (if the list has a
+ * timeout), release the references on all members and free the map.
+ */
+static void
+list_set_destroy(struct ip_set *set)
+{
+	struct list_set *map = set->data;
+
+	/* The gc timer handler uses the map, so stop it before freeing */
+	if (with_timeout(map->timeout))
+		del_timer_sync(&map->gc);
+	list_set_flush(set);
+	kfree(map);
+
+	set->data = NULL;
+}
+
+/*
+ * Put the set header (size, timeout if any, reference count and memory
+ * size) into @skb as a nested IPSET_ATTR_DATA attribute.
+ *
+ * Returns 0 on success or -EMSGSIZE when the attributes do not fit; the
+ * NLA_PUT_* macros jump to nla_put_failure in that case.
+ */
+static int
+list_set_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct list_set *map = set->data;
+	struct nlattr *nested;
+
+	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!nested)
+		goto nla_put_failure;
+	NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size));
+	if (with_timeout(map->timeout))
+		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
+	/* NOTE(review): the "- 1" presumably hides a reference held while
+	 * listing - confirm against the ipset core. */
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
+		      htonl(sizeof(*map) + map->size * map->dsize));
+	ipset_nest_end(skb, nested);
+
+	return 0;
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump the members into @skb as nested attributes under IPSET_ATTR_ADT.
+ *
+ * cb->args[2] is the index of the next slot to dump.  When an entry does
+ * not fit, the function returns 0 with cb->args[2] left at that entry, so
+ * a later call continues from there; cb->args[2] is reset to 0 once the
+ * end of the used slots is reached.  -EMSGSIZE is returned when not even
+ * the first entry of this call fits.
+ */
+static int
+list_set_list(const struct ip_set *set,
+	      struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct list_set *map = set->data;
+	struct nlattr *atd, *nested;
+	u32 i, first = cb->args[2];
+	const struct set_elem *e;
+
+	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+	if (!atd)
+		return -EMSGSIZE;
+	for (; cb->args[2] < map->size; cb->args[2]++) {
+		i = cb->args[2];
+		e = list_set_elem(map, i);
+		/* First unused slot: everything has been dumped */
+		if (e->id == IPSET_INVALID_ID)
+			goto finish;
+		if (with_timeout(map->timeout) && list_set_expired(map, i))
+			continue;
+		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+		if (!nested) {
+			if (i == first) {
+				nla_nest_cancel(skb, atd);
+				return -EMSGSIZE;
+			} else
+				goto nla_put_failure;
+		}
+		NLA_PUT_STRING(skb, IPSET_ATTR_NAME,
+			       ip_set_name_byindex(e->id));
+		if (with_timeout(map->timeout)) {
+			const struct set_telem *te =
+				(const struct set_telem *) e;
+			NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT,
+				      htonl(ip_set_timeout_get(te->timeout)));
+		}
+		ipset_nest_end(skb, nested);
+	}
+finish:
+	ipset_nest_end(skb, atd);
+	/* Set listing finished */
+	cb->args[2] = 0;
+	return 0;
+
+nla_put_failure:
+	/* Drop the partially written entry, keep the complete ones */
+	nla_nest_cancel(skb, nested);
+	ipset_nest_end(skb, atd);
+	if (unlikely(i == first)) {
+		cb->args[2] = 0;
+		return -EMSGSIZE;
+	}
+	return 0;
+}
+
+/* Two list:set sets are "the same" when size and timeout are both equal. */
+static bool
+list_set_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct list_set *x = a->data;
+	const struct list_set *y = b->data;
+
+	if (x->size != y->size)
+		return false;
+
+	return x->timeout == y->timeout;
+}
+
+/*
+ * Operations of a list:set set; list_set_create() installs this table as
+ * set->variant for lists with and without timeout alike.
+ */
+static const struct ip_set_type_variant list_set = {
+	.kadt	= list_set_kadt,
+	.uadt	= list_set_uadt,
+	.destroy = list_set_destroy,
+	.flush	= list_set_flush,
+	.head	= list_set_head,
+	.list	= list_set_list,
+	.same_set = list_set_same_set,
+};
+
+/*
+ * Garbage collector, run from the map->gc timer: removes the expired
+ * entries under the set's write lock, then re-arms the timer.
+ *
+ * @ul_set: the struct ip_set pointer stored in gc.data by
+ *	    list_set_gc_init()
+ */
+static void
+list_set_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct list_set *map = set->data;
+	struct set_telem *e;
+	u32 i = 0;
+
+	write_lock_bh(&set->lock);
+	while (i < map->size) {
+		e = list_set_telem(map, i);
+		if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) {
+			/*
+			 * list_set_del() moves the following entries down,
+			 * so slot i now holds an entry that has not been
+			 * checked yet: look at the same slot again instead
+			 * of skipping it until the next gc run.
+			 */
+			list_set_del(map, i);
+		} else {
+			i++;
+		}
+	}
+	write_unlock_bh(&set->lock);
+
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/*
+ * Set up and start the garbage collector timer of @set; the handler is
+ * list_set_gc() and gets the set pointer as its argument.
+ */
+static void
+list_set_gc_init(struct ip_set *set)
+{
+	struct list_set *map = set->data;
+	struct timer_list *gc = &map->gc;
+
+	init_timer(gc);
+	gc->function = list_set_gc;
+	gc->data = (unsigned long) set;
+	gc->expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(gc);
+}
+
+/* Create list:set type of sets */
+
+/*
+ * Allocate and initialize the data of a list:set set.
+ *
+ * @set:     set that gets the new map as set->data
+ * @size:    number of slots
+ * @dsize:   size of one slot in bytes
+ * @timeout: default timeout, IPSET_NO_TIMEOUT for lists without timeout
+ *
+ * All slots start out unused (IPSET_INVALID_ID).  Returns false when the
+ * map cannot be allocated, which the caller reports as -ENOMEM.
+ */
+static bool
+init_list_set(struct ip_set *set, u32 size, size_t dsize,
+	      unsigned long timeout)
+{
+	struct list_set *map;
+	struct set_elem *e;
+	u32 i;
+
+	/*
+	 * size comes from userspace: refuse values for which the byte count
+	 * below would wrap around size_t (possible where size_t is 32 bits
+	 * wide), as the loop further down would then write beyond a too
+	 * small allocation.
+	 */
+	if (dsize && size > (~(size_t)0 - sizeof(*map)) / dsize)
+		return false;
+
+	map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL);
+	if (!map)
+		return false;
+
+	map->size = size;
+	map->dsize = dsize;
+	map->timeout = timeout;
+	set->data = map;
+
+	for (i = 0; i < size; i++) {
+		e = list_set_elem(map, i);
+		e->id = IPSET_INVALID_ID;
+	}
+
+	return true;
+}
+
+/*
+ * Create callback of the list:set type.
+ *
+ * IPSET_ATTR_SIZE (optional) gives the number of slots; values below
+ * IP_SET_LIST_MIN_SIZE are raised to it.  When IPSET_ATTR_TIMEOUT is
+ * present the list stores a timeout per member and the garbage collector
+ * is started.
+ *
+ * Returns 0, -IPSET_ERR_PROTOCOL for attributes that fail the byte order
+ * check, or -ENOMEM.
+ */
+static int
+list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+	const bool timed = tb[IPSET_ATTR_TIMEOUT] != NULL;
+	u32 size;
+	bool ok;
+
+	if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
+		     !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+		return -IPSET_ERR_PROTOCOL;
+
+	size = tb[IPSET_ATTR_SIZE] ? ip_set_get_h32(tb[IPSET_ATTR_SIZE])
+				   : IP_SET_LIST_DEFAULT_SIZE;
+	if (size < IP_SET_LIST_MIN_SIZE)
+		size = IP_SET_LIST_MIN_SIZE;
+
+	if (timed)
+		ok = init_list_set(set, size, sizeof(struct set_telem),
+				   ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]));
+	else
+		ok = init_list_set(set, size, sizeof(struct set_elem),
+				   IPSET_NO_TIMEOUT);
+	if (!ok)
+		return -ENOMEM;
+
+	if (timed)
+		list_set_gc_init(set);
+	set->variant = &list_set;
+	return 0;
+}
+
+/*
+ * Type descriptor of "list:set" sets, registered by list_set_init().
+ *
+ * The IPSET_TYPE_NAME feature is what list_set_uadt() checks to refuse
+ * sets of this kind as members.  create_policy and adt_policy list the
+ * netlink attributes accepted by list_set_create() and list_set_uadt().
+ */
+static struct ip_set_type list_set_type __read_mostly = {
+	.name		= "list:set",
+	.protocol	= IPSET_PROTOCOL,
+	.features	= IPSET_TYPE_NAME | IPSET_DUMP_LAST,
+	.dimension	= IPSET_DIM_ONE,
+	.family		= AF_UNSPEC,
+	.revision	= 0,
+	.create		= list_set_create,
+	.create_policy	= {
+		[IPSET_ATTR_SIZE]	= { .type = NLA_U32 },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+	},
+	.adt_policy	= {
+		[IPSET_ATTR_NAME]	= { .type = NLA_STRING,
+					    .len = IPSET_MAXNAMELEN },
+		[IPSET_ATTR_NAMEREF]	= { .type = NLA_STRING,
+					    .len = IPSET_MAXNAMELEN },
+		[IPSET_ATTR_TIMEOUT]	= { .type = NLA_U32 },
+		[IPSET_ATTR_LINENO]	= { .type = NLA_U32 },
+		[IPSET_ATTR_CADT_FLAGS]	= { .type = NLA_U32 },
+	},
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the list:set type; returns the core's result. */
+static int __init
+list_set_init(void)
+{
+	return ip_set_type_register(&list_set_type);
+}
+
+/* Module exit: unregister the list:set type again. */
+static void __exit
+list_set_fini(void)
+{
+	ip_set_type_unregister(&list_set_type);
+}
+
+/* Hook the register/unregister functions into module load and unload. */
+module_init(list_set_init);
+module_exit(list_set_fini);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
new file mode 100644
index 00000000..23f8c816
--- /dev/null
+++ b/net/netfilter/ipset/pfxlen.c
@@ -0,0 +1,291 @@
+#include <linux/netfilter/ipset/pfxlen.h>
+
+/*
+ * Prefixlen maps for fast conversions, by Jan Engelhardt.
+ */
+
+/* One table entry: the four 32-bit words converted to network byte order. */
+#define E(a, b, c, d) \
+	{.ip6 = { \
+		__constant_htonl(a), __constant_htonl(b), \
+		__constant_htonl(c), __constant_htonl(d), \
+	} }
+
+/*
+ * Netmasks in network byte order, indexed by prefix length (0..128):
+ * entry n has the n most significant bits set.
+ *
+ * This table works for both IPv4 and IPv6;
+ * just use ip_set_netmask_map[prefixlength].ip for IPv4.
+ */
+const union nf_inet_addr ip_set_netmask_map[] = {
+	E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_netmask_map);
+
+/*
+ * Redefine the entry macro for the second table: the words are stored as
+ * written, without byte order conversion, and only cast to __be32.
+ */
+#undef  E
+#define E(a, b, c, d) 						\
+	{.ip6 = { (__force __be32) a, (__force __be32) b,	\
+		  (__force __be32) c, (__force __be32) d,	\
+	} }
+
+/*
+ * The same masks as above, indexed by prefix length (0..128), but with
+ * the 32-bit words kept in host byte order.
+ *
+ * This table works for both IPv4 and IPv6;
+ * just use ip_set_hostmask_map[prefixlength].ip for IPv4.
+ */
+const union nf_inet_addr ip_set_hostmask_map[] = {
+	E(0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0x80000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xC0000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xE0000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xF0000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xF8000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFC000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFE000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFF000000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFF800000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE),
+	E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF),
+};
+EXPORT_SYMBOL_GPL(ip_set_hostmask_map);
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
new file mode 100644
index 00000000..70bd1d07
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,267 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+	tristate "IP virtual server support"
+	depends on NET && INET && NETFILTER
+	depends on (NF_CONNTRACK || NF_CONNTRACK=n)
+	---help---
+	  IP Virtual Server support will let you build a high-performance
+	  virtual server based on cluster of two or more real servers. This
+	  option must be enabled for at least one of the clustered computers
+	  that will take care of intercepting incoming connections to a
+	  single IP address and scheduling them to real servers.
+
+	  Three request dispatching techniques are implemented, they are
+	  virtual server via NAT, virtual server via tunneling and virtual
+	  server via direct routing. The several scheduling algorithms can
+	  be used to choose which server the connection is directed to,
+	  thus load balancing can be achieved among the servers.  For more
+	  information and its administration program, please visit the
+	  following URL: <http://www.linuxvirtualserver.org/>.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config	IP_VS_IPV6
+	bool "IPv6 support for IPVS"
+	depends on IPV6 = y || IP_VS = IPV6
+	---help---
+	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
+
+	  See http://www.mindbasket.com/ipvs for more information.
+
+	  Say N if unsure.
+
+config	IP_VS_DEBUG
+	bool "IP virtual server debugging"
+	---help---
+	  Say Y here if you want to get additional messages useful in
+	  debugging the IP virtual server code. You can change the debug
+	  level in /proc/sys/net/ipv4/vs/debug_level
+
+config	IP_VS_TAB_BITS
+	int "IPVS connection table size (the Nth power of 2)"
+	range 8 20
+	default 12
+	---help---
+	  The IPVS connection hash table uses the chaining scheme to handle
+	  hash collisions. Using a big IPVS connection hash table will greatly
+	  reduce conflicts when there are hundreds of thousands of connections
+	  in the hash table.
+
+	  Note the table size must be a power of 2. The table size will be
+	  2 raised to the power of the number you enter. The number must be
+	  from 8 to 20; the default is 12, which means the table size is
+	  4096. Don't choose too small a number, otherwise you will lose
+	  performance. You can adapt the table size yourself, according
+	  to your virtual server application. It is good to set the table size
+	  not far less than the number of connections per second multiplying
+	  average lasting time of connection in the table.  For example, your
+	  virtual server gets 200 connections per second, the connection lasts
+	  for 200 seconds in average in the connection table, the table size
+	  should be not far less than 200x200, it is good to set the table
+	  size 32768 (2**15).
+
+	  Another note that each connection occupies 128 bytes effectively and
+	  each hash entry uses 8 bytes, so you can estimate how much memory is
+	  needed for your box.
+
+	  You can override this number by setting the conn_tab_bits module
+	  parameter or by appending ip_vs.conn_tab_bits=? to the kernel
+	  command line if IPVS was compiled built-in.
+
+comment "IPVS transport protocol load balancing support"
+
+config	IP_VS_PROTO_TCP
+	bool "TCP load balancing support"
+	---help---
+	  This option enables support for load balancing TCP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_UDP
+	bool "UDP load balancing support"
+	---help---
+	  This option enables support for load balancing UDP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_AH_ESP
+	def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
+
+config	IP_VS_PROTO_ESP
+	bool "ESP load balancing support"
+	---help---
+	  This option enables support for load balancing ESP (Encapsulation
+	  Security Payload) transport protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_AH
+	bool "AH load balancing support"
+	---help---
+	  This option enables support for load balancing AH (Authentication
+	  Header) transport protocol. Say Y if unsure.
+
+config  IP_VS_PROTO_SCTP
+	bool "SCTP load balancing support"
+	select LIBCRC32C
+	---help---
+	  This option enables support for load balancing SCTP transport
+	  protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config	IP_VS_RR
+	tristate "round-robin scheduling"
+	---help---
+	  The round-robin scheduling algorithm simply directs network
+	  connections to different real servers in a round-robin manner.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+ 
+config	IP_VS_WRR
+	tristate "weighted round-robin scheduling"
+	select GCD
+	---help---
+	  The weighted round-robin scheduling algorithm directs network
+	  connections to different real servers based on server weights
+	  in a round-robin manner. Servers with higher weights receive
+	  new connections before those with lower weights, servers with
+	  higher weights get more connections than those with lower
+	  weights, and servers with equal weights get equal connections.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_LC
+        tristate "least-connection scheduling"
+	---help---
+	  The least-connection scheduling algorithm directs network
+	  connections to the server with the least number of active 
+	  connections.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_WLC
+        tristate "weighted least-connection scheduling"
+	---help---
+	  The weighted least-connection scheduling algorithm directs network
+	  connections to the server with the least active connections
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_LBLC
+	tristate "locality-based least-connection scheduling"
+	---help---
+	  The locality-based least-connection scheduling algorithm is for
+	  destination IP load balancing. It is usually used in cache cluster.
+	  This algorithm usually directs packet destined for an IP address to
+	  its server if the server is alive and under load. If the server is
+	  overloaded (its active connection numbers is larger than its weight)
+	  and there is a server in its half load, then allocate the weighted
+	  least-connection server to this IP address.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config  IP_VS_LBLCR
+	tristate "locality-based least-connection with replication scheduling"
+	---help---
+	  The locality-based least-connection with replication scheduling
+	  algorithm is also for destination IP load balancing. It is 
+	  usually used in cache cluster. It differs from the LBLC scheduling
+	  as follows: the load balancer maintains mappings from a target
+	  to a set of server nodes that can serve the target. Requests for
+	  a target are assigned to the least-connection node in the target's
+	  server set. If all the nodes in the server set are overloaded,
+	  it picks up a least-connection node in the cluster and adds it
+	  to the server set for the target. If the server set has not been
+	  modified for the specified time, the most loaded node is removed
+	  from the server set, in order to avoid high degree of replication.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_DH
+	tristate "destination hashing scheduling"
+	---help---
+	  The destination hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their destination IP addresses.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_SH
+	tristate "source hashing scheduling"
+	---help---
+	  The source hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their source IP addresses.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_SED
+	tristate "shortest expected delay scheduling"
+	---help---
+	  The shortest expected delay scheduling algorithm assigns network
+	  connections to the server with the shortest expected delay. The 
+	  expected delay that the job will experience is (Ci + 1) / Ui if 
+	  sent to the ith server, in which Ci is the number of connections
+	  on the ith server and Ui is the fixed service rate (weight)
+	  of the ith server.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_NQ
+	tristate "never queue scheduling"
+	---help---
+	  The never queue scheduling algorithm adopts a two-speed model.
+	  When there is an idle server available, the job will be sent to
+	  the idle server, instead of waiting for a fast one. When there
+	  is no idle server available, the job will be sent to the server
+	  that minimize its expected delay (The Shortest Expected Delay
+	  scheduling algorithm).
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+comment 'IPVS application helper'
+
+config	IP_VS_FTP
+  	tristate "FTP protocol helper"
+        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+	select IP_VS_NFCT
+	---help---
+	  FTP is a protocol that transfers IP address and/or port number in
+	  the payload. In the virtual server via Network Address Translation,
+	  the IP address and port number of real servers cannot be sent to
+	  clients in ftp connections directly, so FTP protocol helper is
+	  required for tracking the connection and mangling it back to that of
+	  virtual service.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_NFCT
+	bool "Netfilter connection tracking"
+	depends on NF_CONNTRACK
+	---help---
+	  The Netfilter connection tracking support allows the IPVS
+	  connection state to be exported to the Netfilter framework
+	  for filtering purposes.
+
+config	IP_VS_PE_SIP
+	tristate "SIP persistence engine"
+        depends on IP_VS_PROTO_UDP
+	depends on NF_CONNTRACK_SIP
+	---help---
+	  Allow persistence based on the SIP Call-ID
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 00000000..34ee602d
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,40 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
+ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
+		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
+		ip_vs_est.o ip_vs_proto.o ip_vs_pe.o			   \
+		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 00000000..059af312
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,601 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ *		IP_MASQ_APP application masquerading module
+ *
+ * Author:	Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+static DEFINE_MUTEX(__ip_vs_app_mutex);
+
+/*
+ *	Get an ip_vs_app object: returns try_module_get(app->module)
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+	return try_module_get(app->module);
+}
+
+/* Put an ip_vs_app object: module_put() on app->module */
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+	module_put(app->module);
+}
+
+
+/*
+ *	Allocate/initialize app incarnation and register it in proto apps.
+ *	Returns 0 on success; on error the new incarnation is freed again.
+ */
+static int
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+		  __u16 port)
+{
+	struct ip_vs_protocol *pp;
+	struct ip_vs_app *inc;
+	int ret;
+
+	if (!(pp = ip_vs_proto_get(proto)))
+		return -EPROTONOSUPPORT;
+
+	/* register_app is called below, so both hooks must be present */
+	if (!pp->register_app || !pp->unregister_app)
+		return -EOPNOTSUPP;
+
+	inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
+	if (!inc)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&inc->p_list);
+	INIT_LIST_HEAD(&inc->incs_list);
+	inc->app = app;
+	inc->port = htons(port);
+	atomic_set(&inc->usecnt, 0);
+
+	if (app->timeouts) {
+		inc->timeout_table = ip_vs_create_timeout_table(app->timeouts,
+							app->timeouts_size);
+		if (!inc->timeout_table) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ret = pp->register_app(net, inc);
+	if (ret)
+		goto out;
+
+	list_add(&inc->a_list, &app->incs_list);
+	IP_VS_DBG(9, "%s App %s:%u registered\n",
+		  pp->name, inc->name, ntohs(inc->port));
+	return 0;
+
+  out:
+	kfree(inc->timeout_table);
+	kfree(inc);
+	return ret;
+}
+
+
+/*
+ *	Release app incarnation: unregister it from its protocol,
+ *	unlink it from the owning app's list and free it.
+ */
+static void
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
+{
+	struct ip_vs_protocol *pp = ip_vs_proto_get(inc->protocol);
+
+	/* no handler for this protocol: the incarnation is left untouched */
+	if (!pp)
+		return;
+	if (pp->unregister_app)
+		pp->unregister_app(net, inc);
+
+	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+		  pp->name, inc->name, ntohs(inc->port));
+
+	list_del(&inc->a_list);
+	kfree(inc->timeout_table);
+	kfree(inc);
+}
+
+
+/*
+ *	Get reference to app inc (only called from softirq); 1 on success
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+	int got;
+
+	atomic_inc(&inc->usecnt);
+	got = ip_vs_app_get(inc->app);
+	if (unlikely(got != 1))
+		atomic_dec(&inc->usecnt);
+	return got;
+}
+
+
+/*
+ *	Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+	ip_vs_app_put(inc->app);	/* module_put() on the app's module */
+	atomic_dec(&inc->usecnt);	/* pairs with ip_vs_app_inc_get() */
+}
+
+
+/*
+ *	Register an application incarnation in protocol applications:
+ *	creates an incarnation of app for proto/port under
+ *	__ip_vs_app_mutex and returns ip_vs_app_inc_new()'s result.
+ */
+int
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+		       __u16 port)
+{
+	int err;
+
+	mutex_lock(&__ip_vs_app_mutex);
+	err = ip_vs_app_inc_new(net, app, proto, port);
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	return err;
+}
+
+
+/*
+ *	ip_vs_app registration routine: links app into this netns' app_list
+ */
+int register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	/* increase the module use count (dropped in unregister_ip_vs_app) */
+	ip_vs_use_count_inc();
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	list_add(&app->a_list, &ipvs->app_list);
+
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	return 0;
+}
+
+
+/*
+ *	ip_vs_app unregistration routine: releases every incarnation of
+ *	app, then unlinks app itself.
+ *	We are sure there are no app incarnations attached to services
+ */
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+	struct ip_vs_app *cur, *tmp;
+
+	mutex_lock(&__ip_vs_app_mutex);
+
+	/* _safe variant: ip_vs_app_inc_release() frees the current entry */
+	list_for_each_entry_safe(cur, tmp, &app->incs_list, a_list)
+		ip_vs_app_inc_release(net, cur);
+	list_del(&app->a_list);
+
+	mutex_unlock(&__ip_vs_app_mutex);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+
+/*
+ *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+		   struct ip_vs_protocol *pp)
+{
+	return pp->app_conn_bind(cp);	/* hook is not NULL-checked here */
+}
+
+
+/*
+ *	Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+	struct ip_vs_app *inc = cp->app;
+
+	if (!inc)
+		return;
+	/* optional per-app callbacks run before the reference is dropped */
+	if (inc->unbind_conn)
+		inc->unbind_conn(inc, cp);
+	if (inc->done_conn)
+		inc->done_conn(inc, cp);
+	ip_vs_app_inc_put(inc);
+	cp->app = NULL;
+}
+
+
+/*
+ *	Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	const __u32 orig_seq = ntohl(th->seq);
+
+	if (!vseq->delta && !vseq->previous_delta)
+		return;
+
+	/*
+	 *	Packets after the most recent resized pkt seq get the
+	 *	delta offset, all earlier ones get previous_delta.
+	 */
+	if (after(orig_seq, vseq->init_seq)) {
+		th->seq = htonl(orig_seq + vseq->delta);
+		IP_VS_DBG(9, "%s(): added delta (%d) to seq\n",
+			  __func__, vseq->delta);
+	} else {
+		th->seq = htonl(orig_seq + vseq->previous_delta);
+		IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n",
+			  __func__, vseq->previous_delta);
+	}
+}
+
+
+/*
+ *	Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	const __u32 orig_ack = ntohl(th->ack_seq);
+
+	/* both deltas zero: nothing to adjust */
+	if (!vseq->delta && !vseq->previous_delta)
+		return;
+
+	/*
+	 * ack_seq is the number of the octet expected to be received
+	 * next, so it is compared with init_seq+delta: acks after the
+	 * most recent resized pkt have delta subtracted, earlier ones
+	 * have previous_delta subtracted.
+	 */
+	if (after(orig_ack, vseq->init_seq + vseq->delta)) {
+		th->ack_seq = htonl(orig_ack - vseq->delta);
+		IP_VS_DBG(9, "%s(): subtracted delta "
+			  "(%d) from ack_seq\n", __func__, vseq->delta);
+	} else {
+		th->ack_seq = htonl(orig_ack - vseq->previous_delta);
+		IP_VS_DBG(9, "%s(): subtracted "
+			  "previous_delta (%d) from ack_seq\n",
+			  __func__, vseq->previous_delta);
+	}
+}
+
+
+/*
+ *	Updates ip_vs_seq if pkt has been resized (no-op when flag is already
+ *	set and seq is not after init_seq).  Assumes proto==IPPROTO_TCP, diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+				 unsigned flag, __u32 seq, int diff)
+{
+	/* spinlock is to keep updating cp->flags atomic */
+	spin_lock(&cp->lock);
+	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+		vseq->previous_delta = vseq->delta;	/* offset for earlier pkts */
+		vseq->delta += diff;
+		vseq->init_seq = seq;
+		cp->flags |= flag;
+	}
+	spin_unlock(&cp->lock);
+}
+
+/*
+ *	TCP output path: make the TCP header writable, apply the recorded
+ *	seq/ack adjustments, call the app's pkt_out hook and record any
+ *	size change it reports.  Returns 0 if the header cannot be made
+ *	writable or the hook returns 0, otherwise 1.
+ */
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+				  struct ip_vs_app *app)
+{
+	int diff = 0;	/* stays 0 unless the hook reports a size change */
+	const unsigned int tcp_offset = ip_hdrlen(skb);
+	struct tcphdr *th;
+	__u32 seq;
+
+	if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+		return 0;
+
+	th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+	/* Remember seq number in case this pkt gets resized */
+	seq = ntohl(th->seq);
+
+	/* Fix seq stuff if flagged as so. */
+	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+		vs_fix_seq(&cp->out_seq, th);
+	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+		vs_fix_ack_seq(&cp->in_seq, th);
+
+	/* Call private output hook function */
+	if (app->pkt_out == NULL)
+		return 1;
+
+	if (!app->pkt_out(app, cp, skb, &diff))
+		return 0;
+
+	/*
+	 *	Update ip_vs seq stuff if len has changed.
+	 */
+	if (diff != 0)
+		vs_seq_update(cp, &cp->out_seq,
+			      IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+	return 1;
+}
+
+/*
+ *	Output pkt hook, called by the ipvs packet handler (assumes the
+ *	caller already checked cp!=NULL).  Runs the bound ip_vs_app's
+ *	pkt_out function, via app_tcp_pkt_out() for TCP.  Returns 1 when
+ *	there is nothing to do, otherwise the result of the TCP helper or
+ *	of the hook; false means it can't handle the packet (oom).
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_app *app = cp->app;
+
+	/*
+	 *	Nothing to do unless an application module is bound to
+	 *	this ip_vs_conn.
+	 */
+	if (app == NULL)
+		return 1;
+
+	/* TCP is complicated: seq/ack fixups wrap the hook */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_out(cp, skb, app);
+
+	/* other protocols: call the private output hook, if there is one */
+	if (app->pkt_out == NULL)
+		return 1;
+
+	return app->pkt_out(app, cp, skb, NULL);
+}
+
+
+/*
+ *	TCP input path: make the TCP header writable, apply the recorded
+ *	seq/ack adjustments, call the app's pkt_in hook and record any
+ *	size change it reports.  Returns 0 if the header cannot be made
+ *	writable or the hook returns 0, otherwise 1.
+ */
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+				 struct ip_vs_app *app)
+{
+	int diff = 0;	/* stays 0 unless the hook reports a size change */
+	const unsigned int tcp_offset = ip_hdrlen(skb);
+	struct tcphdr *th;
+	__u32 seq;
+
+	if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+		return 0;
+
+	th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+	/* Remember seq number in case this pkt gets resized */
+	seq = ntohl(th->seq);
+
+	/* Fix seq stuff if flagged as so. */
+	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+		vs_fix_seq(&cp->in_seq, th);
+	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+		vs_fix_ack_seq(&cp->out_seq, th);
+
+	/* Call private input hook function */
+	if (app->pkt_in == NULL)
+		return 1;
+
+	if (!app->pkt_in(app, cp, skb, &diff))
+		return 0;
+
+	/*
+	 *	Update ip_vs seq stuff if len has changed.
+	 */
+	if (diff != 0)
+		vs_seq_update(cp, &cp->in_seq,
+			      IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+	return 1;
+}
+
+/*
+ *	Input pkt hook, called by the ipvs packet handler (assumes the
+ *	caller already checked cp!=NULL).  Runs the bound ip_vs_app's
+ *	pkt_in function, via app_tcp_pkt_in() for TCP.  Returns 1 when
+ *	there is nothing to do, otherwise the result of the TCP helper or
+ *	of the hook; false means it can't handle the packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_app *app = cp->app;
+
+	/*
+	 *	Nothing to do unless an application module is bound to
+	 *	this ip_vs_conn.
+	 */
+	if (app == NULL)
+		return 1;
+
+	/* TCP is complicated: seq/ack fixups wrap the hook */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_in(cp, skb, app);
+
+	/* other protocols: call the private input hook, if there is one */
+	if (app->pkt_in == NULL)
+		return 1;
+
+	return app->pkt_in(app, cp, skb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ *	/proc/net/ip_vs_app entry function
+ */
+
+/* Return the pos'th (0-based) incarnation across all apps, or NULL */
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
+{
+	struct ip_vs_app *app, *inc;
+	loff_t skip = pos;
+
+	list_for_each_entry(app, &ipvs->app_list, a_list) {
+		list_for_each_entry(inc, &app->incs_list, a_list)
+			if (skip-- == 0)
+				return inc;
+	}
+	return NULL;
+}
+
+/* seq start: takes __ip_vs_app_mutex (dropped in ip_vs_app_seq_stop) */
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	mutex_lock(&__ip_vs_app_mutex);
+	return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
+}
+
+/* seq_file next: advance to the following incarnation, across apps */
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
+	struct ip_vs_app *inc = v, *app;
+	struct list_head *e;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_app_idx(ipvs, 0);
+
+	/* more incarnations of the same application? */
+	app = inc->app;
+	e = inc->a_list.next;
+	if (e != &app->incs_list)
+		return list_entry(e, struct ip_vs_app, a_list);
+
+	/* otherwise the first incarnation of the next non-empty application */
+	for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
+		app = list_entry(e, struct ip_vs_app, a_list);
+		if (!list_empty(&app->incs_list))
+			return list_entry(app->incs_list.next,
+					  struct ip_vs_app, a_list);
+	}
+	return NULL;
+}
+
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+	mutex_unlock(&__ip_vs_app_mutex);	/* taken in ip_vs_app_seq_start() */
+}
+
+/* seq_file show: header line for the start token, else one incarnation */
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+	const struct ip_vs_app *inc = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "prot port    usecnt name\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%-3s  %-7u %-6d %-17s\n",
+		   ip_vs_proto_name(inc->protocol), ntohs(inc->port),
+		   atomic_read(&inc->usecnt), inc->name);
+	return 0;
+}
+
+static const struct seq_operations ip_vs_app_seq_ops = {
+	.start = ip_vs_app_seq_start,
+	.next  = ip_vs_app_seq_next,
+	.stop  = ip_vs_app_seq_stop,
+	.show  = ip_vs_app_seq_show,
+};
+
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ip_vs_app_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = ip_vs_app_open,
+	.read	 = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+#endif
+
+/* Per-netns init: empty app list plus the "ip_vs_app" proc net entry */
+int __net_init __ip_vs_app_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net_ipvs(net)->app_list);
+	/* the result of creating the proc entry is not checked */
+	proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
+	return 0;
+}
+
+void __net_exit __ip_vs_app_cleanup(struct net *net)
+{
+	proc_net_remove(net, "ip_vs_app");
+}
+
+int __init ip_vs_app_init(void)
+{
+	return 0;
+}
+
+
+void ip_vs_app_cleanup(void)
+{
+}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
new file mode 100644
index 00000000..782db275
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1326 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Many code here is taken from IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>		/* for proc_net_* */
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+/* Fallback when the build configuration does not provide a table size. */
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS	12
+#endif
+
+/*
+ * log2 of the connection hash table size.  Defaults to the compile-time
+ * value and can be overridden with the read-only (0444) module parameter
+ * "conn_tab_bits".
+ */
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/*
+ * Number of hash buckets (1 << ip_vs_conn_tab_bits) and the matching
+ * index mask (size - 1); both are computed in ip_vs_conn_init().
+ */
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
+
+/*
+ *  Connection hash table: for input and output packets lookups of IPVS.
+ *  Allocated with vmalloc() in ip_vs_conn_init().
+ */
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
+
+/*  SLAB cache from which struct ip_vs_conn objects are allocated */
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
+
+/*
+ *  Number of connections created with IP_VS_CONN_F_NO_CPORT that have not
+ *  yet been given a client port; lets ip_vs_conn_in_get() skip its second
+ *  (cport == 0) lookup when the count is zero.
+ */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random seed for the connection hash, filled in by ip_vs_conn_init() */
+static unsigned int ip_vs_conn_rnd __read_mostly;
+
+/*
+ *  Fine locking granularity for big connection hash table:
+ *  the table is protected by an array of CT_LOCKARRAY_SIZE (32) rwlocks,
+ *  and a bucket's lock is selected by (hash & CT_LOCKARRAY_MASK).
+ */
+#define CT_LOCKARRAY_BITS  5
+#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
+
+/* One rwlock per array slot, aligned to SMP_CACHE_BYTES. */
+struct ip_vs_aligned_lock
+{
+	rwlock_t	l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table; initialised in ip_vs_conn_init() */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+/* Pick the rwlock that guards the hash bucket identified by @key. */
+static inline rwlock_t *ct_lock_for(unsigned key)
+{
+	return &__ip_vs_conntbl_lock_array[key & CT_LOCKARRAY_MASK].l;
+}
+
+/* Variants that leave bottom halves alone. */
+static inline void ct_read_lock(unsigned key)
+{
+	read_lock(ct_lock_for(key));
+}
+
+static inline void ct_read_unlock(unsigned key)
+{
+	read_unlock(ct_lock_for(key));
+}
+
+static inline void ct_write_lock(unsigned key)
+{
+	write_lock(ct_lock_for(key));
+}
+
+static inline void ct_write_unlock(unsigned key)
+{
+	write_unlock(ct_lock_for(key));
+}
+
+/* Variants built on the _bh lock primitives. */
+static inline void ct_read_lock_bh(unsigned key)
+{
+	read_lock_bh(ct_lock_for(key));
+}
+
+static inline void ct_read_unlock_bh(unsigned key)
+{
+	read_unlock_bh(ct_lock_for(key));
+}
+
+static inline void ct_write_lock_bh(unsigned key)
+{
+	write_lock_bh(ct_lock_for(key));
+}
+
+static inline void ct_write_unlock_bh(unsigned key)
+{
+	write_unlock_bh(ct_lock_for(key));
+}
+
+
+/*
+ *	Compute the hash table index for a (netns, af, proto, addr, port)
+ *	tuple.  The address contributes one 32-bit word (the IPv4 address,
+ *	or a jhash of the 16 IPv6 address bytes), which is mixed with port
+ *	and protocol, XORed with bits of the netns pointer and masked to
+ *	the table size.
+ */
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
+				       const union nf_inet_addr *addr,
+				       __be16 port)
+{
+	u32 addr_word;
+	u32 mixed;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_word = jhash(addr, 16, ip_vs_conn_rnd);
+	else
+#endif
+		addr_word = (__force u32)addr->ip;
+
+	mixed = jhash_3words(addr_word, (__force u32)port, proto,
+			     ip_vs_conn_rnd);
+
+	return (mixed ^ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+}
+
+/*
+ *	Hash a lookup key.  When the key carries persistence-engine data
+ *	and the engine provides hashkey_raw(), that hash is used (masked to
+ *	the table size).  Otherwise the key is hashed on caddr/cport, or on
+ *	vaddr/vport when @inverse is set.
+ */
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+					     bool inverse)
+{
+	if (p->pe_data && p->pe->hashkey_raw) {
+		unsigned int raw = p->pe->hashkey_raw(p, ip_vs_conn_rnd,
+						      inverse);
+
+		return raw & ip_vs_conn_tab_mask;
+	}
+
+	if (unlikely(inverse))
+		return ip_vs_conn_hashkey(p->net, p->af, p->protocol,
+					  p->vaddr, p->vport);
+
+	return ip_vs_conn_hashkey(p->net, p->af, p->protocol,
+				  p->caddr, p->cport);
+}
+
+/*
+ *	Hash an existing connection: build a temporary lookup key from the
+ *	connection's netns, af, protocol and client address/port (the
+ *	virtual address/port slots are passed as NULL/0), attach its
+ *	persistence-engine data if it has an engine, and hash that key in
+ *	the non-inverse direction.
+ */
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn_param p;
+
+	ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+			      &cp->caddr, cp->cport, NULL, 0, &p);
+
+	/* let ip_vs_conn_hashkey_param() consider the engine's hashkey_raw() */
+	if (cp->pe) {
+		p.pe = cp->pe;
+		p.pe_data = cp->pe_data;
+		p.pe_data_len = cp->pe_data_len;
+	}
+
+	return ip_vs_conn_hashkey_param(&p, false);
+}
+
+/*
+ *	Insert a connection into ip_vs_conn_tab.
+ *	Returns 1 when the entry was added (the table then holds its own
+ *	reference), 0 when it is a one-packet connection or already hashed.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+	unsigned bucket;
+	int hashed = 0;
+
+	/* one-packet connections are never entered into the table */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return 0;
+
+	/* Hash by protocol, client address and port */
+	bucket = ip_vs_conn_hashkey_conn(cp);
+
+	ct_write_lock(bucket);
+	spin_lock(&cp->lock);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		pr_err("%s(): request for already hashed, called from %pF\n",
+		       __func__, __builtin_return_address(0));
+	} else {
+		hlist_add_head(&cp->c_list, &ip_vs_conn_tab[bucket]);
+		cp->flags |= IP_VS_CONN_F_HASHED;
+		atomic_inc(&cp->refcnt);
+		hashed = 1;
+	}
+
+	spin_unlock(&cp->lock);
+	ct_write_unlock(bucket);
+
+	return hashed;
+}
+
+
+/*
+ *	Remove a connection from ip_vs_conn_tab and drop the reference the
+ *	table held on it.
+ *	Returns 1 when the entry was hashed and has been removed, else 0.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+	unsigned bucket = ip_vs_conn_hashkey_conn(cp);
+	int removed = 0;
+
+	ct_write_lock(bucket);
+	spin_lock(&cp->lock);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		hlist_del(&cp->c_list);
+		cp->flags &= ~IP_VS_CONN_F_HASHED;
+		atomic_dec(&cp->refcnt);
+		removed = 1;
+	}
+
+	spin_unlock(&cp->lock);
+	ct_write_unlock(bucket);
+
+	return removed;
+}
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from OUTside-to-INside.
+ *	p->caddr, p->cport: pkt source address (foreign host)
+ *	p->vaddr, p->vport: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+	struct ip_vs_conn *cp, *found = NULL;
+	struct hlist_node *pos;
+	unsigned bucket = ip_vs_conn_hashkey_param(p, false);
+
+	ct_read_lock(bucket);
+
+	hlist_for_each_entry(cp, pos, &ip_vs_conn_tab[bucket], c_list) {
+		if (cp->af != p->af)
+			continue;
+		if (p->cport != cp->cport || p->vport != cp->vport)
+			continue;
+		if (!ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) ||
+		    !ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr))
+			continue;
+		/*
+		 * A key with cport == 0 may only match entries flagged
+		 * NO_CPORT, and a key with a real cport only entries
+		 * without that flag.
+		 */
+		if (!((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))))
+			continue;
+		if (p->protocol != cp->protocol ||
+		    !ip_vs_conn_net_eq(cp, p->net))
+			continue;
+
+		/* HIT: take a reference before the bucket lock is dropped */
+		atomic_inc(&cp->refcnt);
+		found = cp;
+		break;
+	}
+
+	ct_read_unlock(bucket);
+
+	return found;
+}
+
+/*
+ *	Look up the connection matching @p.  If the exact lookup misses and
+ *	there is at least one connection still waiting for its client port
+ *	(ip_vs_conn_no_cport_cnt != 0), retry with the client port zeroed.
+ *	Returns the connection with its refcnt raised, or NULL.
+ */
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+	struct ip_vs_conn *cp;
+
+	cp = __ip_vs_conn_in_get(p);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+		/* work on a copy so the caller's key is left untouched */
+		struct ip_vs_conn_param cport_zero_p = *p;
+		cport_zero_p.cport = 0;
+		cp = __ip_vs_conn_in_get(&cport_zero_p);
+	}
+
+	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      cp ? "hit" : "not hit");
+
+	return cp;
+}
+
+/*
+ *	Build a lookup key from a packet: read the two 16-bit ports found
+ *	at @proto_off and combine them with the addresses in @iph.  With
+ *	@inverse set, source and destination are swapped.
+ *	Returns 0 on success, 1 when the ports cannot be read from the skb.
+ */
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+			    const struct ip_vs_iphdr *iph,
+			    unsigned int proto_off, int inverse,
+			    struct ip_vs_conn_param *p)
+{
+	__be16 port_buf[2], *ports;
+	struct net *net = skb_net(skb);
+
+	ports = skb_header_pointer(skb, proto_off, sizeof(port_buf), port_buf);
+	if (!ports)
+		return 1;
+
+	if (unlikely(inverse)) {
+		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+				      ports[1], &iph->saddr, ports[0], p);
+	} else {
+		ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+				      ports[0], &iph->daddr, ports[1], p);
+	}
+
+	return 0;
+}
+
+/*
+ *	Packet-based wrapper around ip_vs_conn_in_get(): derives the lookup
+ *	key from @skb/@iph and returns NULL if the ports cannot be read.
+ */
+struct ip_vs_conn *
+ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
+			const struct ip_vs_iphdr *iph,
+			unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn_param p;
+
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+		return NULL;
+
+	return ip_vs_conn_in_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
+
+/*
+ * Get reference to connection template.
+ *
+ * Walks the bucket selected by hashing @p (non-inverse) and returns the
+ * first entry in the same netns that matches, with its refcnt raised, or
+ * NULL.  When @p carries persistence-engine data and the engine provides
+ * ct_match(), only that callback decides; otherwise the entry must be a
+ * template with equal af, addresses, ports and protocol.
+ */
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp;
+	struct hlist_node *n;
+
+	hash = ip_vs_conn_hashkey_param(p, false);
+
+	ct_read_lock(hash);
+
+	hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
+		if (!ip_vs_conn_net_eq(cp, p->net))
+			continue;
+		/* engine-specific matching replaces the tuple comparison */
+		if (p->pe_data && p->pe->ct_match) {
+			if (p->pe == cp->pe && p->pe->ct_match(p, cp))
+				goto out;
+			continue;
+		}
+
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+		    /* protocol should only be IPPROTO_IP if
+		     * p->vaddr is a fwmark */
+		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+				     p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
+		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
+		    p->protocol == cp->protocol)
+			goto out;
+	}
+	/* loop ran to completion: nothing matched */
+	cp = NULL;
+
+  out:
+	/* take the reference while the bucket lock is still held */
+	if (cp)
+		atomic_inc(&cp->refcnt);
+	ct_read_unlock(hash);
+
+	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      cp ? "hit" : "not hit");
+
+	return cp;
+}
+
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ *	p->caddr, p->cport: pkt source address (inside host)
+ *	p->vaddr, p->vport: pkt dest address (foreign host)
+ *
+ * The key is hashed in the inverse direction (on p->vaddr/p->vport) and
+ * compared against each entry's client and destination address/port.
+ * Returns the entry with its refcnt raised, or NULL. */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp, *ret=NULL;
+	struct hlist_node *n;
+
+	/*
+	 *	Check for "full" addressed entries
+	 */
+	hash = ip_vs_conn_hashkey_param(p, true);
+
+	ct_read_lock(hash);
+
+	hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
+		/* pkt dest must equal the entry's client side, and pkt
+		 * source the entry's destination (real server) side */
+		if (cp->af == p->af &&
+		    p->vport == cp->cport && p->cport == cp->dport &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+		    p->protocol == cp->protocol &&
+		    ip_vs_conn_net_eq(cp, p->net)) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ret = cp;
+			break;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      ret ? "hit" : "not hit");
+
+	return ret;
+}
+
+/*
+ *	Packet-based wrapper around ip_vs_conn_out_get(): derives the lookup
+ *	key from @skb/@iph and returns NULL if the ports cannot be read.
+ */
+struct ip_vs_conn *
+ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
+			 const struct ip_vs_iphdr *iph,
+			 unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn_param p;
+
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+		return NULL;
+
+	return ip_vs_conn_out_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+
+/*
+ *      Release a connection reference and re-arm its timer: a one-packet
+ *      connection is scheduled to expire right away, any other one after
+ *      its current timeout.
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+	unsigned long delay;
+
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		delay = 0;
+	else
+		delay = cp->timeout;
+
+	mod_timer(&cp->timer, jiffies+delay);
+
+	__ip_vs_conn_put(cp);
+}
+
+
+/*
+ *	Give a no_client_port connection its client port number.
+ *	The entry is taken out of the table, updated, and hashed again;
+ *	nothing happens if it was not hashed in the first place.
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+{
+	if (!ip_vs_conn_unhash(cp))
+		return;
+
+	spin_lock(&cp->lock);
+	if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+		atomic_dec(&ip_vs_conn_no_cport_cnt);
+		cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+		cp->cport = cport;
+	}
+	spin_unlock(&cp->lock);
+
+	/* re-insert so the entry is hashed on its new cport */
+	ip_vs_conn_hash(cp);
+}
+
+
+/*
+ *	Bind a connection entry with the corresponding packet_xmit.
+ *	Called by ip_vs_conn_new.
+ *
+ *	Selects the IPv4 transmit routine from the connection's forwarding
+ *	method.  There is no default case: for any other method value
+ *	cp->packet_xmit is left as it was.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit;
+		break;
+	}
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *	IPv6 counterpart of ip_vs_bind_xmit(): same mapping from forwarding
+ *	method to transmit routine, using the _v6 variants (the local-node
+ *	method shares ip_vs_null_xmit).  Unknown methods leave
+ *	cp->packet_xmit untouched.
+ */
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit_v6;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit_v6;
+		break;
+	}
+}
+#endif
+
+
+/* Sum of a destination's active and inactive connection counters. */
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+	int active = atomic_read(&dest->activeconns);
+	int inactive = atomic_read(&dest->inactconns);
+
+	return active + inactive;
+}
+
+/*
+ *	Bind a connection entry with a virtual service destination
+ *	Called just after a new connection entry is created.
+ *
+ *	Takes a reference on @dest, merges the destination's connection
+ *	flags into cp->flags, bumps the matching per-destination counter
+ *	and marks the destination overloaded once its upper threshold is
+ *	reached.  A NULL @dest is a no-op.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+	unsigned int conn_flags;
+
+	/* if dest is NULL, then return directly */
+	if (!dest)
+		return;
+
+	/* Increase the refcnt counter of the dest */
+	atomic_inc(&dest->refcnt);
+
+	conn_flags = atomic_read(&dest->conn_flags);
+	/* the one-packet flag is only inherited by UDP connections */
+	if (cp->protocol != IPPROTO_UDP)
+		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
+	/* Bind with the destination and its corresponding transmitter */
+	if (cp->flags & IP_VS_CONN_F_SYNC) {
+		/* if the connection is not template and is created
+		 * by sync, preserve the activity flag.
+		 */
+		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+		/* connections inherit forwarding method from dest */
+		cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+	}
+	cp->flags |= conn_flags;
+	cp->dest = dest;
+
+	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
+
+	/* Update the connection counters */
+	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+		/* It is a normal connection, so increase the inactive
+		   connection counter because it is in TCP SYNRECV
+		   state (inactive) or other protocol inactive state.
+		   Only a synced connection that is not flagged inactive
+		   is counted as active. */
+		if ((cp->flags & IP_VS_CONN_F_SYNC) &&
+		    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
+			atomic_inc(&dest->activeconns);
+		else
+			atomic_inc(&dest->inactconns);
+	} else {
+		/* It is a persistent connection/template, so increase
+		   the persistent connection counter */
+		atomic_inc(&dest->persistconns);
+	}
+
+	/* an upper threshold of 0 means no overload limit */
+	if (dest->u_threshold != 0 &&
+	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+		dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * For a connection that has no destination yet, look one up from the
+ * connection's addresses, ports, protocol and fwmark and bind to it.
+ * Returns the destination found (possibly NULL); returns NULL without
+ * looking anything up when @cp is NULL or already bound.
+ */
+struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest;
+
+	if (!cp || cp->dest)
+		return NULL;
+
+	dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
+			       cp->dport, &cp->vaddr, cp->vport,
+			       cp->protocol, cp->fwmark);
+	ip_vs_bind_dest(cp, dest);
+
+	return dest;
+}
+
+
+/*
+ *	Unbind a connection entry with its VS destination
+ *	Called by the ip_vs_conn_expire function.
+ *
+ *	Reverses the counter update done at bind time, re-evaluates the
+ *	destination's overload flag and drops the reference taken on the
+ *	destination.  cp->dest itself is not cleared here.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest = cp->dest;
+
+	/* never bound: nothing to undo */
+	if (!dest)
+		return;
+
+	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		      "dest->refcnt:%d\n",
+		      ip_vs_proto_name(cp->protocol),
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
+		      ip_vs_fwd_tag(cp), cp->state,
+		      cp->flags, atomic_read(&cp->refcnt),
+		      atomic_read(&dest->refcnt));
+
+	/* Update the connection counters */
+	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+		/* It is a normal connection, so decrease the inactconns
+		   or activeconns counter */
+		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+			atomic_dec(&dest->inactconns);
+		} else {
+			atomic_dec(&dest->activeconns);
+		}
+	} else {
+		/* It is a persistent connection/template, so decrease
+		   the persistent connection counter */
+		atomic_dec(&dest->persistconns);
+	}
+
+	/*
+	 * Clear the overload flag once the load has fallen far enough:
+	 * below the lower threshold if one is set, else below 3/4 of the
+	 * upper threshold, else (no thresholds at all) unconditionally.
+	 */
+	if (dest->l_threshold != 0) {
+		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	} else if (dest->u_threshold != 0) {
+		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	} else {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	}
+
+	/*
+	 * Simply decrease the refcnt of the dest, because the
+	 * dest will be either in service's destination list
+	 * or in the trash.
+	 */
+	atomic_dec(&dest->refcnt);
+}
+
+/*
+ * Returns non-zero when templates bound to @dest should be treated as
+ * expired: the expire_quiescent_template sysctl is enabled and the
+ * destination's weight is 0.  Always 0 when built without CONFIG_SYSCTL.
+ */
+static int expire_quiescent_template(struct netns_ipvs *ipvs,
+				     struct ip_vs_dest *dest)
+{
+#ifdef CONFIG_SYSCTL
+	return ipvs->sysctl_expire_quiescent_template &&
+		(atomic_read(&dest->weight) == 0);
+#else
+	return 0;
+#endif
+}
+
+/*
+ *	Check whether the destination of a connection template is usable.
+ *	Returns 1 if it is.  Otherwise the template is invalidated (rehashed
+ *	with dport/vport 0xffff and cport 0, unless already done), one
+ *	reference on it is dropped without restarting its timer, and 0 is
+ *	returned.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+	struct ip_vs_dest *dest = ct->dest;
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
+
+	/* Destination present, available and not quiescent-expired: fine. */
+	if (dest != NULL &&
+	    (dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+	    !expire_quiescent_template(ipvs, dest))
+		return 1;
+
+	IP_VS_DBG_BUF(9, "check_template: dest not available for "
+		      "protocol %s s:%s:%d v:%s:%d "
+		      "-> d:%s:%d\n",
+		      ip_vs_proto_name(ct->protocol),
+		      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+		      ntohs(ct->cport),
+		      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+		      ntohs(ct->vport),
+		      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
+		      ntohs(ct->dport));
+
+	/* vport == 0xffff marks a template that is already invalidated */
+	if (ct->vport != htons(0xffff) && ip_vs_conn_unhash(ct)) {
+		ct->dport = htons(0xffff);
+		ct->vport = htons(0xffff);
+		ct->cport = 0;
+		ip_vs_conn_hash(ct);
+	}
+
+	/* Drop the template reference; its timer is not restarted. */
+	atomic_dec(&ct->refcnt);
+
+	return 0;
+}
+
+/*
+ *	Timer handler for a connection entry (installed with setup_timer()
+ *	in ip_vs_conn_new()); @data is the struct ip_vs_conn pointer.
+ *
+ *	The connection is unhashed and freed when this handler holds the
+ *	only reference.  If it still controls other connections, is not in
+ *	the table, or is referenced elsewhere, it is kept and the timer is
+ *	re-armed through ip_vs_conn_put() using the 60*HZ timeout set here.
+ */
+static void ip_vs_conn_expire(unsigned long data)
+{
+	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+
+	/* retry interval used if the entry cannot be released now */
+	cp->timeout = 60*HZ;
+
+	/*
+	 *	hey, I'm using it
+	 */
+	atomic_inc(&cp->refcnt);
+
+	/*
+	 *	do I control anybody?
+	 */
+	if (atomic_read(&cp->n_control))
+		goto expire_later;
+
+	/*
+	 *	unhash it if it is hashed in the conn table
+	 *	(one-packet connections are never hashed, so a failed
+	 *	unhash is expected for them)
+	 */
+	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
+		goto expire_later;
+
+	/*
+	 *	refcnt==1 implies I'm the only one referrer
+	 */
+	if (likely(atomic_read(&cp->refcnt) == 1)) {
+		/* delete the timer if it is activated by other users */
+		if (timer_pending(&cp->timer))
+			del_timer(&cp->timer);
+
+		/* does anybody control me? */
+		if (cp->control)
+			ip_vs_control_del(cp);
+
+		if (cp->flags & IP_VS_CONN_F_NFCT) {
+			/* Do not access conntracks during subsys cleanup
+			 * because nf_conntrack_find_get can not be used after
+			 * conntrack cleanup for the net.
+			 *
+			 * The conntrack must therefore only be dropped
+			 * behind the ipvs->enable check below; an
+			 * unconditional call ahead of the check would
+			 * defeat it.
+			 */
+			smp_rmb();
+			if (ipvs->enable)
+				ip_vs_conn_drop_conntrack(cp);
+		}
+
+		ip_vs_pe_put(cp->pe);
+		kfree(cp->pe_data);
+		if (unlikely(cp->app != NULL))
+			ip_vs_unbind_app(cp);
+		ip_vs_unbind_dest(cp);
+		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		atomic_dec(&ipvs->conn_count);
+
+		kmem_cache_free(ip_vs_conn_cachep, cp);
+		return;
+	}
+
+	/* hash it back to the table */
+	ip_vs_conn_hash(cp);
+
+  expire_later:
+	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
+		  atomic_read(&cp->refcnt)-1,
+		  atomic_read(&cp->n_control));
+
+	/* drops the reference taken above and re-arms the timer */
+	ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Make a connection's timer fire as soon as possible.  Only acts when
+ * del_timer() reports that a pending timer was removed; a connection
+ * whose timer is not pending is left alone.
+ */
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+	if (del_timer(&cp->timer))
+		mod_timer(&cp->timer, jiffies);
+}
+
+
+/*
+ *	Create a new connection entry and hash it into the ip_vs_conn_tab
+ *
+ *	@p:      lookup key supplying netns, af, protocol, client and
+ *	         virtual address/port and optional persistence-engine data
+ *	@daddr:  destination address
+ *	@dport:  destination port
+ *	@flags:  initial IP_VS_CONN_F_* flags
+ *	@dest:   destination to bind to (may be NULL, see ip_vs_bind_dest)
+ *	@fwmark: firewall mark stored in the entry
+ *
+ *	Allocates with GFP_ATOMIC.  Returns the new entry holding one
+ *	reference for the caller (ip_vs_conn_hash() adds the table's own),
+ *	or NULL if the allocation fails.
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
+	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
+	       struct ip_vs_dest *dest, __u32 fwmark)
+{
+	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(p->net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+							   p->protocol);
+
+	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+	if (cp == NULL) {
+		IP_VS_ERR_RL("%s(): no memory\n", __func__);
+		return NULL;
+	}
+
+	INIT_HLIST_NODE(&cp->c_list);
+	/* the timer is prepared here but only armed by ip_vs_conn_put() */
+	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+	ip_vs_conn_net_set(cp, p->net);
+	cp->af		   = p->af;
+	cp->protocol	   = p->protocol;
+	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+	cp->cport	   = p->cport;
+	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+	cp->vport	   = p->vport;
+	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
+	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+			&cp->daddr, daddr);
+	cp->dport          = dport;
+	cp->flags	   = flags;
+	cp->fwmark         = fwmark;
+	/* only templates keep the persistence engine and its data; the
+	 * pe_data pointer is taken over from @p, not copied */
+	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+		ip_vs_pe_get(p->pe);
+		cp->pe = p->pe;
+		cp->pe_data = p->pe_data;
+		cp->pe_data_len = p->pe_data_len;
+	}
+	spin_lock_init(&cp->lock);
+
+	/*
+	 * Set the entry is referenced by the current thread before hashing
+	 * it in the table, so that other thread run ip_vs_random_dropentry
+	 * but cannot drop this entry.
+	 */
+	atomic_set(&cp->refcnt, 1);
+
+	atomic_set(&cp->n_control, 0);
+	atomic_set(&cp->in_pkts, 0);
+
+	atomic_inc(&ipvs->conn_count);
+	if (flags & IP_VS_CONN_F_NO_CPORT)
+		atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+	/* Bind the connection with a destination server */
+	ip_vs_bind_dest(cp, dest);
+
+	/* Set its state and timeout */
+	cp->state = 0;
+	cp->timeout = 3*HZ;
+
+	/* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+	if (p->af == AF_INET6)
+		ip_vs_bind_xmit_v6(cp);
+	else
+#endif
+		ip_vs_bind_xmit(cp);
+
+	/* bind an application helper only if the protocol has any */
+	if (unlikely(pd && atomic_read(&pd->appcnt)))
+		ip_vs_bind_app(cp, pd->pp);
+
+	/*
+	 * Allow conntrack to be preserved. By default, conntrack
+	 * is created and destroyed for every packet.
+	 * Sometimes keeping conntrack can be useful for
+	 * IP_VS_CONN_F_ONE_PACKET too.
+	 */
+
+	if (ip_vs_conntrack_enabled(ipvs))
+		cp->flags |= IP_VS_CONN_F_NFCT;
+
+	/* Hash it in the ip_vs_conn_tab finally */
+	ip_vs_conn_hash(cp);
+
+	return cp;
+}
+
+/*
+ *	/proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+/* Private per-open state of the connection table seq_file iterators. */
+struct ip_vs_iter_state {
+	struct seq_net_private	p;	/* state expected by seq_open_net() */
+	struct hlist_head	*l;	/* bucket whose read lock is held, or NULL */
+};
+
+/*
+ * Return the @pos-th connection (0-based, counted across all buckets in
+ * table order) or NULL if the table holds fewer entries.
+ *
+ * On success the function returns with the entry's bucket still
+ * read-locked (bh variant) and records that bucket in iter->l so that
+ * the ->next/->stop callbacks can release it.  On failure no lock is
+ * held.
+ */
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_node *n;
+
+	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+		ct_read_lock_bh(idx);
+		hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
+			if (pos-- == 0) {
+				/* keep the lock; remember which one */
+				iter->l = &ip_vs_conn_tab[idx];
+				return cp;
+			}
+		}
+		ct_read_unlock_bh(idx);
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file ->start callback.  Position 0 is the header line
+ * (SEQ_START_TOKEN); position N > 0 maps to connection N - 1.
+ * iter->l is reset first because no bucket lock is held yet.
+ */
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ip_vs_iter_state *iter = seq->private;
+
+	iter->l = NULL;
+	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next callback: advance to the entry after @v.
+ *
+ * Stays within the currently locked bucket while it has more entries;
+ * otherwise releases that bucket's lock and scans forward for the next
+ * non-empty bucket, returning its first entry with the bucket locked.
+ * Returns NULL (with no lock held and iter->l cleared) at end of table.
+ */
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_conn *cp = v;
+	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_node *e;
+	struct hlist_head *l = iter->l;
+	int idx;
+
+	++*pos;
+	/* after the header comes the very first connection */
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_conn_array(seq, 0);
+
+	/* more on same hash chain? */
+	if ((e = cp->c_list.next))
+		return hlist_entry(e, struct ip_vs_conn, c_list);
+
+	/* bucket index is the offset of the bucket head in the table */
+	idx = l - ip_vs_conn_tab;
+	ct_read_unlock_bh(idx);
+
+	while (++idx < ip_vs_conn_tab_size) {
+		ct_read_lock_bh(idx);
+		/* the loop body returns on the first entry, if there is one */
+		hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) {
+			iter->l = &ip_vs_conn_tab[idx];
+			return cp;
+		}
+		ct_read_unlock_bh(idx);
+	}
+	iter->l = NULL;
+	return NULL;
+}
+
+/*
+ * seq_file ->stop callback: release the bucket lock left held by
+ * ->start/->next, if any (iter->l is NULL when none is held).
+ */
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+{
+	struct ip_vs_iter_state *iter = seq->private;
+	struct hlist_head *l = iter->l;
+
+	if (l)
+		ct_read_unlock_bh(l - ip_vs_conn_tab);
+}
+
+/*
+ * seq_file ->show callback for "ip_vs_conn": prints the header for the
+ * start token, otherwise one line per connection.  Connections that
+ * belong to a different netns than the reader produce no output.
+ */
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
+	else {
+		const struct ip_vs_conn *cp = v;
+		struct net *net = seq_file_net(seq);
+		/* room for " <name> <data>" plus the terminating NUL */
+		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+		size_t len = 0;
+
+		/* the table is shared by all namespaces; filter here */
+		if (!ip_vs_conn_net_eq(cp, net))
+			return 0;
+		if (cp->pe_data) {
+			/* build " <engine name> <engine data>" */
+			pe_data[0] = ' ';
+			len = strlen(cp->pe->name);
+			memcpy(pe_data + 1, cp->pe->name, len);
+			pe_data[len + 1] = ' ';
+			len += 2;
+			len += cp->pe->show_pe_data(cp, pe_data + len);
+		}
+		/* empty string when the connection has no engine data */
+		pe_data[len] = '\0';
+
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+				"%pI6 %04X %-11s %7lu%s\n",
+				ip_vs_proto_name(cp->protocol),
+				&cp->caddr.in6, ntohs(cp->cport),
+				&cp->vaddr.in6, ntohs(cp->vport),
+				&cp->daddr.in6, ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				(cp->timer.expires-jiffies)/HZ, pe_data);
+		else
+#endif
+			seq_printf(seq,
+				"%-3s %08X %04X %08X %04X"
+				" %08X %04X %-11s %7lu%s\n",
+				ip_vs_proto_name(cp->protocol),
+				ntohl(cp->caddr.ip), ntohs(cp->cport),
+				ntohl(cp->vaddr.ip), ntohs(cp->vport),
+				ntohl(cp->daddr.ip), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				(cp->timer.expires-jiffies)/HZ, pe_data);
+	}
+	return 0;
+}
+
+/* Iterator callbacks used by ip_vs_conn_open() for the "ip_vs_conn" entry. */
+static const struct seq_operations ip_vs_conn_seq_ops = {
+	.start = ip_vs_conn_seq_start,
+	.next  = ip_vs_conn_seq_next,
+	.stop  = ip_vs_conn_seq_stop,
+	.show  = ip_vs_conn_seq_show,
+};
+
+/*
+ * ->open handler for "ip_vs_conn": netns-aware seq_file whose private
+ * area is a struct ip_vs_iter_state.
+ */
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+	int err;
+
+	err = seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+			   sizeof(struct ip_vs_iter_state));
+	return err;
+}
+
+/* File operations registered for "ip_vs_conn" by __ip_vs_conn_init(). */
+static const struct file_operations ip_vs_conn_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_conn_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/* Label for the "Origin" column: "SYNC" if the SYNC flag is set, else "LOCAL". */
+static const char *ip_vs_origin_name(unsigned flags)
+{
+	return (flags & IP_VS_CONN_F_SYNC) ? "SYNC" : "LOCAL";
+}
+
+/*
+ * seq_file ->show callback for "ip_vs_conn_sync": same layout as
+ * ip_vs_conn_seq_show() but with an Origin column (see
+ * ip_vs_origin_name()) instead of the persistence-engine columns.
+ * Connections from another netns than the reader produce no output.
+ */
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
+	else {
+		const struct ip_vs_conn *cp = v;
+		struct net *net = seq_file_net(seq);
+
+		/* the table is shared by all namespaces; filter here */
+		if (!ip_vs_conn_net_eq(cp, net))
+			return 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				&cp->caddr.in6, ntohs(cp->cport),
+				&cp->vaddr.in6, ntohs(cp->vport),
+				&cp->daddr.in6, ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_origin_name(cp->flags),
+				(cp->timer.expires-jiffies)/HZ);
+		else
+#endif
+			seq_printf(seq,
+				"%-3s %08X %04X %08X %04X "
+				"%08X %04X %-11s %-6s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				ntohl(cp->caddr.ip), ntohs(cp->cport),
+				ntohl(cp->vaddr.ip), ntohs(cp->vport),
+				ntohl(cp->daddr.ip), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_origin_name(cp->flags),
+				(cp->timer.expires-jiffies)/HZ);
+	}
+	return 0;
+}
+
+/*
+ * Iterator callbacks for "ip_vs_conn_sync": shares start/next/stop with
+ * ip_vs_conn_seq_ops and differs only in the ->show callback.
+ */
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+	.start = ip_vs_conn_seq_start,
+	.next  = ip_vs_conn_seq_next,
+	.stop  = ip_vs_conn_seq_stop,
+	.show  = ip_vs_conn_sync_seq_show,
+};
+
+/*
+ * ->open handler for "ip_vs_conn_sync": netns-aware seq_file whose
+ * private area is a struct ip_vs_iter_state.
+ */
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+	int err;
+
+	err = seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+			   sizeof(struct ip_vs_iter_state));
+	return err;
+}
+
+/* File operations registered for "ip_vs_conn_sync" by __ip_vs_conn_init(). */
+static const struct file_operations ip_vs_conn_sync_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_conn_sync_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+#endif
+
+
+/*
+ *      Decide whether a connection entry should be dropped by the random
+ *      drop pass.  Returns 1 to drop, 0 to keep.
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+	/*
+	 * Per-packet-count drop rates and countdown counters.  The rate
+	 * array needs tuning for real environments.  The counters are
+	 * static and updated without any locking.
+	 */
+	static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+	static char todrop_counter[9] = {0};
+	int pkts;
+
+	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+	   This will leave enough time for normal connection to get
+	   through. */
+	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+		return 0;
+
+	/* Only entries with 0..8 incoming packets are candidates. */
+	pkts = atomic_read(&cp->in_pkts);
+	if (pkts < 0 || pkts > 8)
+		return 0;
+
+	/* A rate of zero means "never drop". */
+	if (todrop_rate[pkts] == 0)
+		return 0;
+
+	/* Count down; drop only when the counter has run out. */
+	todrop_counter[pkts]--;
+	if (todrop_counter[pkts] > 0)
+		return 0;
+
+	todrop_counter[pkts] = todrop_rate[pkts];
+	return 1;
+}
+
+/*
+ * Called from keventd and must protect itself from softirqs.
+ *
+ * Picks ip_vs_conn_tab_size/32 random buckets and, for the connections
+ * of @net found there, forces early expiry of:
+ *   - TCP entries in SYN_RECV or SYNACK state,
+ *   - TCP ESTABLISHED entries and all non-TCP entries selected by
+ *     todrop_entry().
+ * Templates are never dropped directly, but the controlling entry of a
+ * dropped connection is expired along with it.
+ */
+void ip_vs_random_dropentry(struct net *net)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+
+	/*
+	 * Randomly scan 1/32 of the whole table every second
+	 */
+	for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+		unsigned hash = net_random() & ip_vs_conn_tab_mask;
+		struct hlist_node *n;
+
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock_bh(hash);
+
+		hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
+			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+				/* connection template */
+				continue;
+			if (!ip_vs_conn_net_eq(cp, net))
+				continue;
+			if (cp->protocol == IPPROTO_TCP) {
+				/* "break" falls through to the drop code
+				 * below, "continue" keeps the entry */
+				switch(cp->state) {
+				case IP_VS_TCP_S_SYN_RECV:
+				case IP_VS_TCP_S_SYNACK:
+					break;
+
+				case IP_VS_TCP_S_ESTABLISHED:
+					if (todrop_entry(cp))
+						break;
+					continue;
+
+				default:
+					continue;
+				}
+			} else {
+				if (!todrop_entry(cp))
+					continue;
+			}
+
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (cp->control) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(cp->control);
+			}
+		}
+		ct_write_unlock_bh(hash);
+	}
+}
+
+
+/*
+ *      Flush all the connection entries of @net in the ip_vs_conn_tab.
+ *
+ *      Every matching entry (and its controlling entry, if any) is asked
+ *      to expire immediately; the whole table is rescanned, yielding the
+ *      CPU in between, until the namespace's connection count reaches
+ *      zero.
+ */
+static void ip_vs_conn_flush(struct net *net)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+flush_again:
+	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+		struct hlist_node *n;
+
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock_bh(idx);
+
+		hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
+			if (!ip_vs_conn_net_eq(cp, net))
+				continue;
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (cp->control) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(cp->control);
+			}
+		}
+		ct_write_unlock_bh(idx);
+	}
+
+	/* the counter may still be non-zero, because some conn entries
+	   may be run by a slow timer handler or be unhashed but still
+	   referred */
+	if (atomic_read(&ipvs->conn_count) != 0) {
+		schedule();
+		goto flush_again;
+	}
+}
+/*
+ * per netns init and exit
+ */
+/*
+ * Per-netns setup: zero the connection counter and create the
+ * "ip_vs_conn" and "ip_vs_conn_sync" proc entries.  Always returns 0;
+ * the results of the proc registrations are not checked.
+ */
+int __net_init __ip_vs_conn_init(struct net *net)
+{
+	atomic_set(&net_ipvs(net)->conn_count, 0);
+
+	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+
+	return 0;
+}
+
+/*
+ * Per-netns teardown: expire every connection of @net (blocks until the
+ * namespace's connection count is zero), then remove both proc entries.
+ */
+void __net_exit __ip_vs_conn_cleanup(struct net *net)
+{
+	/* flush all the connection entries first */
+	ip_vs_conn_flush(net);
+	proc_net_remove(net, "ip_vs_conn");
+	proc_net_remove(net, "ip_vs_conn_sync");
+}
+
+/*
+ * Module-wide initialisation of the connection table.
+ *
+ * Sizes the hash table from the conn_tab_bits module parameter,
+ * allocates the table and the struct ip_vs_conn slab cache, initialises
+ * all bucket heads and table locks and seeds the hash function.
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+int __init ip_vs_conn_init(void)
+{
+	int idx;
+
+	/*
+	 * conn_tab_bits comes from the module command line.  A negative or
+	 * too large value would make the shift below undefined (or the
+	 * allocation absurdly big), so fall back to the compile-time
+	 * default when it is outside the range the build configuration
+	 * allows.
+	 */
+	if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) {
+		pr_info("conn_tab_bits not in [8, 20]. Using default value\n");
+		ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+	}
+
+	/* Compute size and mask */
+	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
+
+	/*
+	 * Allocate the connection hash table and initialize its list heads
+	 */
+	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
+	if (!ip_vs_conn_tab)
+		return -ENOMEM;
+
+	/* Allocate ip_vs_conn slab cache */
+	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+					      sizeof(struct ip_vs_conn), 0,
+					      SLAB_HWCACHE_ALIGN, NULL);
+	if (!ip_vs_conn_cachep) {
+		vfree(ip_vs_conn_tab);
+		return -ENOMEM;
+	}
+
+	/* report the size of what was really allocated: hlist heads */
+	pr_info("Connection hash table configured "
+		"(size=%d, memory=%ldKbytes)\n",
+		ip_vs_conn_tab_size,
+		(long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
+	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+		  sizeof(struct ip_vs_conn));
+
+	for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+		INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
+
+	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
+		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+	}
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+	return 0;
+}
+
+/*
+ * Module-wide teardown: destroys the connection slab cache and frees the
+ * hash table allocated by ip_vs_conn_init().
+ */
+void ip_vs_conn_cleanup(void)
+{
+	/* Release the empty cache */
+	kmem_cache_destroy(ip_vs_conn_cachep);
+	vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
new file mode 100644
index 00000000..0787bed0
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,2050 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ *	Paul `Rusty' Russell		properly handle non-linear skbs
+ *	Harald Welte			don't use nfcache
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sctp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>
+#include <net/ip6_checksum.h>
+#include <net/netns/generic.h>		/* net_generic() */
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
+#endif
+
+#include <net/ip_vs.h>
+
+
+/* Symbols made available to other modules (schedulers, helpers, ...). */
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+
+/*
+ * NOTE(review): presumably the per-netns generic id used to look up the
+ * IPVS state of a namespace - confirm against the pernet registration.
+ */
+int ip_vs_net_id __read_mostly;
+#ifdef IP_VS_GENERIC_NETNS
+EXPORT_SYMBOL(ip_vs_net_id);
+#endif
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
+
+/*
+ * ID used in ICMP lookups.
+ * Both macros parenthesise their argument so that any pointer expression
+ * (e.g. "base + 1") can be passed safely.
+ */
+#define icmp_id(icmph)          (((icmph)->un).echo.id)
+#define icmpv6_id(icmph)        (((icmph)->icmp6_dataun).u_echo.identifier)
+
+/*
+ * Return a printable name for IP protocol number @proto.
+ *
+ * Known protocols map to constant strings.  Any other value is formatted
+ * as "IP_<number>" into a function-static buffer, so that result is
+ * overwritten by the next call taking the default branch and is not safe
+ * against concurrent callers.
+ */
+const char *ip_vs_proto_name(unsigned proto)
+{
+	static char buf[20];
+
+	switch (proto) {
+	case IPPROTO_IP:
+		return "IP";
+	case IPPROTO_UDP:
+		return "UDP";
+	case IPPROTO_TCP:
+		return "TCP";
+	case IPPROTO_SCTP:
+		return "SCTP";
+	case IPPROTO_ICMP:
+		return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+	case IPPROTO_ICMPV6:
+		return "ICMPv6";
+#endif
+	default:
+		/* proto is unsigned: use %u and bound the write to buf */
+		snprintf(buf, sizeof(buf), "IP_%u", proto);
+		return buf;
+	}
+}
+
+/*
+ * Initialise the @rows list heads of @table, walking from the last
+ * bucket down to the first.  Nothing is done when @rows is <= 0.
+ */
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+	int row;
+
+	for (row = rows - 1; row >= 0; row--)
+		INIT_LIST_HEAD(&table[row]);
+}
+
+/*
+ * Account one inbound packet (and skb->len bytes) on the per-cpu
+ * counters of the connection's destination, of that destination's
+ * service and of the namespace-wide totals.  Nothing is counted when
+ * the connection has no destination or the destination is not flagged
+ * IP_VS_DEST_F_AVAILABLE.
+ */
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+	struct ip_vs_cpu_stats *counters;
+
+	if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE))
+		return;
+
+	/* destination */
+	counters = this_cpu_ptr(dest->stats.cpustats);
+	counters->ustats.inpkts++;
+	u64_stats_update_begin(&counters->syncp);
+	counters->ustats.inbytes += skb->len;
+	u64_stats_update_end(&counters->syncp);
+
+	/* service owning the destination */
+	counters = this_cpu_ptr(dest->svc->stats.cpustats);
+	counters->ustats.inpkts++;
+	u64_stats_update_begin(&counters->syncp);
+	counters->ustats.inbytes += skb->len;
+	u64_stats_update_end(&counters->syncp);
+
+	/* namespace totals */
+	counters = this_cpu_ptr(ipvs->tot_stats.cpustats);
+	counters->ustats.inpkts++;
+	u64_stats_update_begin(&counters->syncp);
+	counters->ustats.inbytes += skb->len;
+	u64_stats_update_end(&counters->syncp);
+}
+
+
+/*
+ * Account one outbound packet (and skb->len bytes) on the per-cpu
+ * counters of the connection's destination, of that destination's
+ * service and of the namespace-wide totals.  Nothing is counted when
+ * the connection has no destination or the destination is not flagged
+ * IP_VS_DEST_F_AVAILABLE.
+ */
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+	struct ip_vs_cpu_stats *counters;
+
+	if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE))
+		return;
+
+	/* destination */
+	counters = this_cpu_ptr(dest->stats.cpustats);
+	counters->ustats.outpkts++;
+	u64_stats_update_begin(&counters->syncp);
+	counters->ustats.outbytes += skb->len;
+	u64_stats_update_end(&counters->syncp);
+
+	/* service owning the destination */
+	counters = this_cpu_ptr(dest->svc->stats.cpustats);
+	counters->ustats.outpkts++;
+	u64_stats_update_begin(&counters->syncp);
+	counters->ustats.outbytes += skb->len;
+	u64_stats_update_end(&counters->syncp);
+
+	/* namespace totals */
+	counters = this_cpu_ptr(ipvs->tot_stats.cpustats);
+	counters->ustats.outpkts++;
+	u64_stats_update_begin(&counters->syncp);
+	counters->ustats.outbytes += skb->len;
+	u64_stats_update_end(&counters->syncp);
+}
+
+
+/*
+ * Count one new connection on the per-cpu counters of cp->dest, of
+ * @svc and of the totals of the namespace @svc belongs to.
+ * cp->dest is dereferenced unconditionally.
+ */
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	struct ip_vs_cpu_stats *counters;
+
+	counters = this_cpu_ptr(cp->dest->stats.cpustats);
+	counters->ustats.conns += 1;
+
+	counters = this_cpu_ptr(svc->stats.cpustats);
+	counters->ustats.conns += 1;
+
+	counters = this_cpu_ptr(ipvs->tot_stats.cpustats);
+	counters->ustats.conns += 1;
+}
+
+
+/*
+ * Run the protocol's state_transition handler for @cp, if the protocol
+ * provides one.  Returns the handler's result, or 0 when there is no
+ * handler.
+ */
+static inline int
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+		const struct sk_buff *skb,
+		struct ip_vs_proto_data *pd)
+{
+	if (likely(pd->pp->state_transition))
+		return pd->pp->state_transition(cp, direction, skb, pd);
+
+	return 0;
+}
+
+/*
+ * Fill @p for a persistence-template lookup on @svc and attach the
+ * service's persistence engine to it.  When the engine provides a
+ * fill_param hook its return value is passed through; otherwise 0 is
+ * returned.
+ */
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->net, svc->af, protocol,
+			      caddr, cport, vaddr, vport, p);
+
+	p->pe = svc->pe;
+	if (!p->pe || !p->pe->fill_param)
+		return 0;
+
+	return p->pe->fill_param(p, skb);
+}
+
+/*
+ *  IPVS persistent scheduling function
+ *  It creates a connection entry according to its template if exists,
+ *  or selects a server and creates a connection entry plus a template.
+ *  Locking: we are svc user (svc->refcnt), so we hold all dests too
+ *  Protocols supported: TCP, UDP
+ *
+ *  @svc:      persistent virtual service the packet matched
+ *  @skb:      packet being scheduled
+ *  @src_port: source port of the packet (network byte order)
+ *  @dst_port: destination port of the packet (network byte order)
+ *  @ignored:  written only on failure: -1 when filling the template
+ *             parameters or creating a connection fails, 0 when the
+ *             scheduler finds no destination
+ *
+ *  Returns the new connection, or NULL on failure.
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+		    struct sk_buff *skb,
+		    __be16 src_port, __be16 dst_port, int *ignored)
+{
+	struct ip_vs_conn *cp = NULL;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *ct;
+	__be16 dport = 0;		/* destination port to forward */
+	unsigned int flags;
+	struct ip_vs_conn_param param;
+	/* fwmark wrapped as an address, used as vaddr of fwmark templates */
+	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+	union nf_inet_addr snet;	/* source network of the client,
+					   after masking */
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	/* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+	else
+#endif
+		snet.ip = iph.saddr.ip & svc->netmask;
+
+	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+		      "mnet %s\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
+		      IP_VS_DBG_ADDR(svc->af, &snet));
+
+	/*
+	 * As far as we know, FTP is a very complicated network protocol, and
+	 * it uses control connection and data connections. For active FTP,
+	 * FTP server initialize data connection to the client, its source port
+	 * is often 20. For passive FTP, FTP server tells the clients the port
+	 * that it passively listens to,  and the client issues the data
+	 * connection. In the tunneling or direct routing mode, the load
+	 * balancer is on the client-to-server half of connection, the port
+	 * number is unknown to the load balancer. So, a conn template like
+	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+	 * is created for other persistent services.
+	 */
+	{
+		int protocol = iph.protocol;
+		const union nf_inet_addr *vaddr = &iph.daddr;
+		__be16 vport = 0;
+
+		if (dst_port == svc->port) {
+			/* non-FTP template:
+			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+			 * FTP template:
+			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
+			 */
+			if (svc->port != FTPPORT)
+				vport = dst_port;
+		} else {
+			/* Note: persistent fwmark-based services and
+			 * persistent port zero service are handled here.
+			 * fwmark template:
+			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+			 * port zero template:
+			 * <protocol,caddr,0,vaddr,0,daddr,0>
+			 */
+			if (svc->fwmark) {
+				protocol = IPPROTO_IP;
+				vaddr = &fwmark;
+			}
+		}
+		/* return *ignored = -1 so NF_DROP can be used */
+		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+						  vaddr, vport, &param) < 0) {
+			*ignored = -1;
+			return NULL;
+		}
+	}
+
+	/* Check if a template already exists */
+	ct = ip_vs_ct_in_get(&param);
+	if (!ct || !ip_vs_check_template(ct)) {
+		/*
+		 * No template found or the dest of the connection
+		 * template is not available.
+		 * return *ignored=0 i.e. ICMP and NF_DROP
+		 */
+		dest = svc->scheduler->schedule(svc, skb);
+		if (!dest) {
+			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			/* no template will take ownership of pe_data */
+			kfree(param.pe_data);
+			*ignored = 0;
+			return NULL;
+		}
+
+		/* only non-FTP templates on the service port carry a dport */
+		if (dst_port == svc->port && svc->port != FTPPORT)
+			dport = dest->port;
+
+		/* Create a template
+		 * This adds param.pe_data to the template,
+		 * and thus param.pe_data will be destroyed
+		 * when the template expires */
+		ct = ip_vs_conn_new(&param, &dest->addr, dport,
+				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
+		if (ct == NULL) {
+			kfree(param.pe_data);
+			*ignored = -1;
+			return NULL;
+		}
+
+		ct->timeout = svc->timeout;
+	} else {
+		/* set destination with the found template */
+		dest = ct->dest;
+		/* existing template is reused: our pe_data copy is not needed */
+		kfree(param.pe_data);
+	}
+
+	/* forward to the dest's own port when the packet hit the service port */
+	dport = dst_port;
+	if (dport == svc->port && dest->port)
+		dport = dest->port;
+
+	/* one-packet scheduling applies to UDP only */
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph.protocol == IPPROTO_UDP)?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
+	/*
+	 *    Create a new connection according to the template
+	 */
+	ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
+			      src_port, &iph.daddr, dst_port, &param);
+
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
+	if (cp == NULL) {
+		ip_vs_conn_put(ct);
+		*ignored = -1;
+		return NULL;
+	}
+
+	/*
+	 *    Add its control
+	 */
+	ip_vs_control_add(cp, ct);
+	/* release our own reference to the template */
+	ip_vs_conn_put(ct);
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  IPVS main scheduling function
+ *  It selects a server according to the virtual service, and
+ *  creates a connection entry.
+ *  Protocols supported: TCP, UDP
+ *
+ *  @svc:     virtual service the packet matched
+ *  @skb:     packet being scheduled
+ *  @pd:      per-netns protocol data; only pd->pp is used here
+ *  @ignored: out parameter, see below
+ *
+ *  Returns the new connection entry, or NULL when nothing was scheduled.
+ *
+ *  Usage of *ignored
+ *
+ * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
+ *       svc/scheduler decides that this packet should be accepted with
+ *       NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 :   scheduler can not find destination, so try bypass or
+ *       return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 :  scheduler tried to schedule but fatal error occurred, eg.
+ *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ *       failure such as missing Call-ID, ENOMEM on skb_linearize
+ *       or pe_data. In this case we should return NF_DROP without
+ *       any attempts to send ICMP with ip_vs_leave.
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+	       struct ip_vs_proto_data *pd, int *ignored)
+{
+	struct ip_vs_protocol *pp = pd->pp;
+	struct ip_vs_conn *cp = NULL;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest;
+	__be16 _ports[2], *pptr;
+	unsigned int flags;
+
+	/* until proven otherwise the packet is simply not ours to schedule */
+	*ignored = 1;
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+	/* pptr[0] is the source port, pptr[1] the destination port */
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	/*
+	 * FTPDATA needs this check when using local real server.
+	 * Never schedule Active FTPDATA connections from real server.
+	 * For LVS-NAT they must be already created. For other methods
+	 * with persistence the connection is created on SYN+ACK.
+	 */
+	if (pptr[0] == FTPDATA) {
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling FTPDATA");
+		return NULL;
+	}
+
+	/*
+	 *    Do not schedule replies from local real server.
+	 */
+	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+	    (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling reply for existing connection");
+		/* drop the reference obtained by the lookup above */
+		__ip_vs_conn_put(cp);
+		return NULL;
+	}
+
+	/*
+	 *    Persistent service
+	 */
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+
+	/* from here on a NULL return means "no destination" unless overridden */
+	*ignored = 0;
+
+	/*
+	 *    Non-persistent service
+	 */
+	if (!svc->fwmark && pptr[1] != svc->port) {
+		if (!svc->port)
+			pr_err("Schedule: port zero only supported "
+			       "in persistent services, "
+			       "check your ipvs configuration\n");
+		return NULL;
+	}
+
+	dest = svc->scheduler->schedule(svc, skb);
+	if (dest == NULL) {
+		IP_VS_DBG(1, "Schedule: no dest found.\n");
+		return NULL;
+	}
+
+	/* one-packet scheduling applies to UDP only */
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph.protocol == IPPROTO_UDP)?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
+	/*
+	 *    Create a connection entry.
+	 */
+	{
+		struct ip_vs_conn_param p;
+
+		ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
+				      &iph.saddr, pptr[0], &iph.daddr, pptr[1],
+				      &p);
+		/* a dest port of zero means "keep the packet's own port" */
+		cp = ip_vs_conn_new(&p, &dest->addr,
+				    dest->port ? dest->port : pptr[1],
+				    flags, dest, skb->mark);
+		if (!cp) {
+			*ignored = -1;
+			return NULL;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+		      ip_vs_fwd_tag(cp),
+		      IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
+		      IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
+		      cp->flags, atomic_read(&cp->refcnt));
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  Pass or drop the packet.
+ *  Called by ip_vs_in, when the virtual service is available but
+ *  no destination is available for a new connection.
+ *
+ *  @svc: service the packet matched; this function releases the caller's
+ *        reference with ip_vs_service_put() on every path
+ *  @skb: the packet
+ *  @pd:  per-netns protocol data
+ *
+ *  Returns a netfilter verdict: the result of the bypass transmit,
+ *  NF_ACCEPT for non-FTP ports on a virtual FTP service, otherwise
+ *  NF_DROP (after sending a port-unreachable ICMP where applicable).
+ *
+ *  Everything needed from @svc after the put (address family, netns) is
+ *  copied into locals first, so @svc is never dereferenced once our
+ *  reference has been dropped.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+		struct ip_vs_proto_data *pd)
+{
+	__be16 _ports[2], *pptr;
+	struct ip_vs_iphdr iph;
+	const int af = svc->af;	/* cached: used after ip_vs_service_put() */
+#ifdef CONFIG_SYSCTL
+	struct net *net;
+	struct netns_ipvs *ipvs;
+	int unicast;
+#endif
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		ip_vs_service_put(svc);
+		return NF_DROP;
+	}
+
+#ifdef CONFIG_SYSCTL
+	net = skb_net(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+	else
+#endif
+		unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
+
+	/* if it is fwmark-based service, the cache_bypass sysctl is up
+	   and the destination is a non-local unicast, then create
+	   a cache_bypass connection entry */
+	ipvs = net_ipvs(net);
+	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+		int ret;
+		struct ip_vs_conn *cp;
+		/* cached: svc is released before the connection is built */
+		struct net *svc_net = svc->net;
+		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				      iph.protocol == IPPROTO_UDP)?
+				      IP_VS_CONN_F_ONE_PACKET : 0;
+		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
+
+		ip_vs_service_put(svc);
+
+		/* create a new connection entry */
+		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc_net, af, iph.protocol,
+					      &iph.saddr, pptr[0],
+					      &iph.daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL, skb->mark);
+			if (!cp)
+				return NF_DROP;
+		}
+
+		/* statistics */
+		ip_vs_in_stats(cp, skb);
+
+		/* set state */
+		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+
+		/* transmit the first SYN packet */
+		ret = cp->packet_xmit(skb, cp, pd->pp);
+		/* do not touch skb anymore */
+
+		atomic_inc(&cp->in_pkts);
+		ip_vs_conn_put(cp);
+		return ret;
+	}
+#endif
+
+	/*
+	 * When the virtual ftp service is presented, packets destined
+	 * for other services on the VIP may get here (except services
+	 * listed in the ipvs table), pass the packets, because it is
+	 * not ipvs job to decide to drop the packets.
+	 */
+	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
+		ip_vs_service_put(svc);
+		return NF_ACCEPT;
+	}
+
+	ip_vs_service_put(svc);
+
+	/*
+	 * Notify the client that the destination is unreachable, and
+	 * release the socket buffer.
+	 * Since it is in IP layer, the TCP socket is not actually
+	 * created, the TCP RST packet cannot be sent, instead that
+	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+	 */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (!skb->dev) {
+			struct net *dst_net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = dst_net->loopback_dev;
+		}
+		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+	} else
+#endif
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	return NF_DROP;
+}
+
+/*
+ * Accessors for per-netns IPVS tunables.  With CONFIG_SYSCTL they read
+ * the value stored in struct netns_ipvs; without it every tunable reads
+ * as 0.
+ */
+#ifdef CONFIG_SYSCTL
+
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+	return net_ipvs(skb_net(skb))->sysctl_snat_reroute;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+	return net_ipvs(net)->sysctl_nat_icmp_send;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+	return ipvs->sysctl_expire_nodest_conn;
+}
+
+#else
+
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+	return 0;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+	return 0;
+}
+
+#endif
+
+/*
+ * Checksum the packet data from @offset to the end of @skb and return
+ * the folded 16-bit result.
+ */
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+	__wsum sum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+	return csum_fold(sum);
+}
+
+/*
+ * Map a netfilter hook number to the defragmentation user id IPVS uses
+ * on that hook: LOCAL_IN and FORWARD have dedicated ids, every other
+ * hook maps to IP_DEFRAG_VS_OUT.
+ */
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+	switch (hooknum) {
+	case NF_INET_LOCAL_IN:
+		return IP_DEFRAG_VS_IN;
+	case NF_INET_FORWARD:
+		return IP_DEFRAG_VS_FWD;
+	default:
+		return IP_DEFRAG_VS_OUT;
+	}
+}
+
+/*
+ * Hand @skb to ip_defrag() on behalf of @user.  When ip_defrag()
+ * returns 0 the IPv4 header checksum is recomputed; a non-zero result
+ * is returned to the caller unchanged.
+ */
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	int rc = ip_defrag(skb, user);
+
+	if (rc)
+		return rc;
+
+	ip_send_check(ip_hdr(skb));
+	return 0;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 counterpart of ip_vs_gather_frags().  Currently a stub: no
+ * reassembly is attempted and 0 is always returned.
+ */
+static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
+{
+	/* TODO IPv6: Find out what to do here for IPv6 */
+	return 0;
+}
+#endif
+
+/*
+ * Re-route @skb after its addresses have been rewritten.
+ *
+ * IPv6: re-route only when the snat_reroute tunable is set.
+ * IPv4: re-route when the tunable is set or the current route carries
+ * RTCF_LOCAL.
+ *
+ * Returns 1 when a re-route was attempted and failed, 0 otherwise.
+ */
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
+{
+	int reroute = sysctl_snat_reroute(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (reroute && ip6_route_me_harder(skb) != 0)
+			return 1;
+		return 0;
+	}
+#endif
+	if (!reroute && !(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+		return 0;
+
+	if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Rewrite an IPv4 ICMP error packet, and the IP header (plus ports)
+ * embedded in it, for NAT.
+ *
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ *
+ * For inout=1 the outer source and embedded destination become the
+ * virtual address/port of @cp; for inout=0 the outer destination and
+ * embedded source become the real server address/port.  Both IP header
+ * checksums and the ICMP checksum are recomputed.
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    struct ip_vs_conn *cp, int inout)
+{
+	struct iphdr *iph	 = ip_hdr(skb);
+	unsigned int icmp_offset = iph->ihl*4;
+	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
+						      icmp_offset);
+	/* the embedded (offending) IP header follows the ICMP header */
+	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
+
+	if (inout) {
+		iph->saddr = cp->vaddr.ip;
+		ip_send_check(iph);
+		ciph->daddr = cp->vaddr.ip;
+		ip_send_check(ciph);
+	} else {
+		iph->daddr = cp->daddr.ip;
+		ip_send_check(iph);
+		ciph->saddr = cp->daddr.ip;
+		ip_send_check(ciph);
+	}
+
+	/* the TCP/UDP/SCTP port */
+	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
+	    IPPROTO_SCTP == ciph->protocol) {
+		/* ports[0] = embedded source port, ports[1] = embedded dest */
+		__be16 *ports = (void *)ciph + ciph->ihl*4;
+
+		if (inout)
+			ports[1] = cp->vport;
+		else
+			ports[0] = cp->dport;
+	}
+
+	/* And finally the ICMP checksum */
+	icmph->checksum = 0;
+	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (inout)
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered outgoing ICMP");
+	else
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered incoming ICMP");
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 variant of ip_vs_nat_icmp(): rewrite the outer IPv6 header and
+ * the IPv6 header (plus ports) embedded in the ICMPv6 error.
+ * - inout: non-zero rewrites towards the virtual address/port of @cp,
+ *   zero rewrites towards the real server address/port.
+ * The embedded header is assumed to sit directly after the fixed-size
+ * outer IPv6 header and ICMPv6 header (no extension headers are walked).
+ * The ICMPv6 checksum is left for later completion (CHECKSUM_PARTIAL).
+ */
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    struct ip_vs_conn *cp, int inout)
+{
+	struct ipv6hdr *iph	 = ipv6_hdr(skb);
+	unsigned int icmp_offset = sizeof(struct ipv6hdr);
+	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) +
+						      icmp_offset);
+	/* the embedded (offending) IPv6 header follows the ICMPv6 header */
+	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1);
+
+	if (inout) {
+		iph->saddr = cp->vaddr.in6;
+		ciph->daddr = cp->vaddr.in6;
+	} else {
+		iph->daddr = cp->daddr.in6;
+		ciph->saddr = cp->daddr.in6;
+	}
+
+	/* the TCP/UDP/SCTP port */
+	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
+	    IPPROTO_SCTP == ciph->nexthdr) {
+		/* ports[0] = embedded source port, ports[1] = embedded dest */
+		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
+
+		if (inout)
+			ports[1] = cp->vport;
+		else
+			ports[0] = cp->dport;
+	}
+
+	/* And finally the ICMP checksum */
+	/* seed with the pseudo-header sum and record where to finish it */
+	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+					      skb->len - icmp_offset,
+					      IPPROTO_ICMPV6, 0);
+	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	if (inout)
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered outgoing ICMPv6");
+	else
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered incoming ICMPv6");
+}
+#endif
+
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host.
+ *
+ * @af:       address family of the packet
+ * @skb:      the ICMP packet
+ * @snet:     source address, used only for the debug message
+ * @protocol: protocol of the embedded (offending) packet
+ * @cp:       connection the embedded packet belongs to; the reference
+ *            is released here on every path
+ * @pp:       protocol handler, used for debug output
+ * @offset:   offset of the embedded transport header
+ * @ihl:      length of the outer IP header
+ *
+ * Returns NF_ACCEPT when the packet was rewritten and may continue,
+ * NF_DROP on checksum failure, when the packet cannot be made writable
+ * or when re-routing fails.
+ */
+static int handle_response_icmp(int af, struct sk_buff *skb,
+				union nf_inet_addr *snet,
+				__u8 protocol, struct ip_vs_conn *cp,
+				struct ip_vs_protocol *pp,
+				unsigned int offset, unsigned int ihl)
+{
+	unsigned int verdict = NF_DROP;
+
+	/* only logged: processing continues even for non-zero fwd methods */
+	if (IP_VS_FWD_METHOD(cp) != 0) {
+		pr_err("shouldn't reach here, because the box is on the "
+		       "half connection in the tun/dr module.\n");
+	}
+
+	/* Ensure the checksum is correct */
+	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+			      IP_VS_DBG_ADDR(af, snet));
+		goto out;
+	}
+
+	/* the two embedded port fields get rewritten too */
+	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+	    IPPROTO_SCTP == protocol)
+		offset += 2 * sizeof(__u16);
+	if (!skb_make_writable(skb, offset))
+		goto out;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+	else
+#endif
+		ip_vs_nat_icmp(skb, pp, cp, 1);
+
+	if (ip_vs_route_me_harder(af, skb))
+		goto out;
+
+	/* do the statistics and put it back */
+	ip_vs_out_stats(cp, skb);
+
+	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
+	verdict = NF_ACCEPT;
+
+out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
+/*
+ *	Handle ICMP messages in the inside-to-outside direction (outgoing).
+ *	Find any that might be relevant, check against existing connections.
+ *	Currently handles error types - unreachable, quench, ttl exceeded.
+ *
+ *	@skb:     the ICMP packet
+ *	@related: set to 1 on entry and cleared to 0 only when the ICMP
+ *	          type is not one of the handled error types
+ *	@hooknum: netfilter hook, selects the defragmentation user
+ *
+ *	Returns a netfilter verdict: NF_STOLEN when fragment gathering
+ *	returns non-zero, NF_DROP when the ICMP header cannot be read,
+ *	NF_ACCEPT when the packet is not for us, otherwise the verdict of
+ *	handle_response_icmp().
+ */
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+			  unsigned int hooknum)
+{
+	struct iphdr *iph;
+	struct icmphdr	_icmph, *ic;
+	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, ihl;
+	union nf_inet_addr snet;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+			return NF_STOLEN;
+	}
+
+	/* re-read the header: it is fetched only after the defrag step */
+	iph = ip_hdr(skb);
+	offset = ihl = iph->ihl * 4;
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
+		  ic->type, ntohs(icmp_id(ic)),
+		  &iph->saddr, &iph->daddr);
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->type != ICMP_DEST_UNREACH) &&
+	    (ic->type != ICMP_SOURCE_QUENCH) &&
+	    (ic->type != ICMP_TIME_EXCEEDED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->protocol);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+		     pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking outgoing ICMP for");
+
+	/* offset now points at the embedded transport header */
+	offset += cih->ihl * 4;
+
+	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	snet.ip = iph->saddr;
+	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+				    pp, offset, ihl);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 variant of ip_vs_out_icmp(): handle outgoing ICMPv6 errors
+ * (destination unreachable, packet too big, time exceeded) that relate
+ * to an existing connection.
+ *
+ * @related is set to 1 on entry and cleared to 0 only when the ICMPv6
+ * type is not one of the handled error types.  Header offsets assume
+ * fixed-size IPv6 headers (no extension headers are walked).
+ *
+ * Returns a netfilter verdict: NF_STOLEN when fragment gathering
+ * returns non-zero, NF_DROP when the ICMPv6 header cannot be read,
+ * NF_ACCEPT when the packet is not for us, otherwise the verdict of
+ * handle_response_icmp().
+ */
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+			     unsigned int hooknum)
+{
+	struct ipv6hdr *iph;
+	struct icmp6hdr	_icmph, *ic;
+	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
+					   within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset;
+	union nf_inet_addr snet;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
+			return NF_STOLEN;
+	}
+
+	iph = ipv6_hdr(skb);
+	offset = sizeof(struct ipv6hdr);
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  &iph->saddr, &iph->daddr);
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->nexthdr);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking outgoing ICMPv6 for");
+
+	/* offset now points at the embedded transport header */
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	ipv6_addr_copy(&snet.in6, &iph->saddr);
+	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
+				    pp, offset, sizeof(struct ipv6hdr));
+}
+#endif
+
+/*
+ * Check whether the first SCTP chunk of the packet is an ABORT chunk.
+ * @nh_len is the length of the network header.  Returns 1 for ABORT,
+ * 0 otherwise (including when the chunk header cannot be read).
+ */
+static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
+{
+	sctp_chunkhdr_t _chunk;
+	sctp_chunkhdr_t *chunk;
+
+	chunk = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
+				   sizeof(_chunk), &_chunk);
+
+	return (chunk != NULL && chunk->type == SCTP_CID_ABORT) ? 1 : 0;
+}
+
+/*
+ * Return the RST flag of the TCP header found @nh_len bytes into the
+ * packet, or 0 when the TCP header cannot be read.
+ */
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+	struct tcphdr _hdr;
+	struct tcphdr *hdr;
+
+	hdr = skb_header_pointer(skb, nh_len, sizeof(_hdr), &_hdr);
+	if (!hdr)
+		return 0;
+
+	return hdr->rst;
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ *
+ * @af:  address family of the packet
+ * @skb: the response packet
+ * @pd:  per-netns protocol data of the packet's protocol
+ * @cp:  connection the packet belongs to; the reference is released
+ *       here on every path
+ * @ihl: network header length that must be writable
+ *
+ * Returns NF_ACCEPT after a successful rewrite.  On failure the skb is
+ * freed here and NF_STOLEN is returned, so the caller must not touch
+ * it again.
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		struct ip_vs_conn *cp, int ihl)
+{
+	struct ip_vs_protocol *pp = pd->pp;
+
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+
+	if (!skb_make_writable(skb, ihl))
+		goto drop;
+
+	/* mangle the packet */
+	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+		goto drop;
+
+	/* the reply must appear to come from the virtual address */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+	else
+#endif
+	{
+		ip_hdr(skb)->saddr = cp->vaddr.ip;
+		ip_send_check(ip_hdr(skb));
+	}
+
+	/*
+	 * nf_iterate does not expect change in the skb->dst->dev.
+	 * It looks like it is not fatal to enable this code for hooks
+	 * where our handlers are at the end of the chain list and
+	 * when all next handlers use skb->dst->dev and not outdev.
+	 * It will definitely route properly the inout NAT traffic
+	 * when multiple paths are used.
+	 */
+
+	/* For policy routing, packets originating from this
+	 * machine itself may be routed differently to packets
+	 * passing through.  We want this packet to be routed as
+	 * if it came from this machine itself.  So re-compute
+	 * the routing information.
+	 */
+	if (ip_vs_route_me_harder(af, skb))
+		goto drop;
+
+	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+
+	ip_vs_out_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
+	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
+	ip_vs_conn_put(cp);
+
+	LeaveFunction(11);
+	return NF_ACCEPT;
+
+drop:
+	ip_vs_conn_put(cp);
+	/* skb is consumed here, hence NF_STOLEN rather than NF_DROP */
+	kfree_skb(skb);
+	LeaveFunction(11);
+	return NF_STOLEN;
+}
+
+/*
+ *	Check if outgoing packet belongs to the established ip_vs_conn.
+ *
+ *	Shared worker behind the ip_vs_reply{4,6}() and
+ *	ip_vs_local_reply{4,6}() hooks; @af selects IPv4 or IPv6 parsing.
+ *
+ *	Returns a netfilter verdict:
+ *	  - NF_ACCEPT if the skb already carries ipvs_property, has no dst,
+ *	    IPVS is disabled in its netns, no protocol data is found for
+ *	    the protocol, or no connection entry matches;
+ *	  - whatever ip_vs_out_icmp{,_v6}() returned for a related ICMP;
+ *	  - NF_STOLEN if fragment gathering returned non-zero;
+ *	  - the result of handle_response() when a connection is found;
+ *	  - NF_DROP after a port-unreachable ICMP error has been sent for
+ *	    an unmatched TCP/UDP/SCTP packet coming from a real service.
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+	struct net *net = NULL;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
+	struct ip_vs_conn *cp;
+
+	EnterFunction(11);
+
+	/* Already marked as IPVS request or reply? */
+	if (skb->ipvs_property)
+		return NF_ACCEPT;
+
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
+
+	if (unlikely(!skb_dst(skb)))
+		return NF_ACCEPT;
+
+	/* ipvs enabled in this netns ? */
+	net = skb_net(skb);
+	if (!net_ipvs(net)->enable)
+		return NF_ACCEPT;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related;
+			int verdict = ip_vs_out_icmp_v6(skb, &related,
+							hooknum);
+
+			if (related)
+				return verdict;
+			/* not related: re-read the header and go on */
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+			int related;
+			int verdict = ip_vs_out_icmp(skb, &related, hooknum);
+
+			if (related)
+				return verdict;
+			/* not related: re-read the header and go on */
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+
+	pd = ip_vs_proto_data_get(net, iph.protocol);
+	if (unlikely(!pd))
+		return NF_ACCEPT;
+	pp = pd->pp;
+
+	/* reassemble IP fragments */
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+			if (ip_vs_gather_frags_v6(skb,
+						  ip_vs_defrag_user(hooknum)))
+				return NF_STOLEN;
+		}
+
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+	} else
+#endif
+		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
+			     !pp->dont_defrag)) {
+			if (ip_vs_gather_frags(skb,
+					       ip_vs_defrag_user(hooknum)))
+				return NF_STOLEN;
+
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
+
+	if (likely(cp))
+		return handle_response(af, skb, pd, cp, iph.len);
+	/*
+	 * No entry.  If the nat_icmp_send sysctl is on and the packet is
+	 * TCP/UDP/SCTP, look at its source port (first of the two ports
+	 * read below) to see whether it comes from a real service.
+	 */
+	if (sysctl_nat_icmp_send(net) &&
+	    (pp->protocol == IPPROTO_TCP ||
+	     pp->protocol == IPPROTO_UDP ||
+	     pp->protocol == IPPROTO_SCTP)) {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, iph.len,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			return NF_ACCEPT;	/* Not for me */
+		if (ip_vs_lookup_real_service(net, af, iph.protocol,
+					      &iph.saddr,
+					      pptr[0])) {
+			/*
+			 * Notify the real server: there is no
+			 * existing entry if it is not RST
+			 * packet or not TCP packet.
+			 */
+			if ((iph.protocol != IPPROTO_TCP &&
+			     iph.protocol != IPPROTO_SCTP)
+			     || ((iph.protocol == IPPROTO_TCP
+				  && !is_tcp_reset(skb, iph.len))
+				 || (iph.protocol == IPPROTO_SCTP
+					&& !is_sctp_abort(skb,
+						iph.len)))) {
+#ifdef CONFIG_IP_VS_IPV6
+				if (af == AF_INET6) {
+					/*
+					 * NOTE(review): this 'net' shadows
+					 * the function-level one and is
+					 * derived from the dst device.
+					 */
+					struct net *net =
+						dev_net(skb_dst(skb)->dev);
+
+					if (!skb->dev)
+						skb->dev = net->loopback_dev;
+					icmpv6_send(skb,
+						    ICMPV6_DEST_UNREACH,
+						    ICMPV6_PORT_UNREACH,
+						    0);
+				} else
+#endif
+					icmp_send(skb,
+						  ICMP_DEST_UNREACH,
+						  ICMP_PORT_UNREACH, 0);
+				return NF_DROP;
+			}
+		}
+	}
+	IP_VS_DBG_PKT(12, af, pp, skb, 0,
+		      "ip_vs_out: packet continues traversal as normal");
+	return NF_ACCEPT;
+}
+
+/*
+ *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *	used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ *
+ *	Thin AF_INET adapter around ip_vs_out(); the in, out and okfn
+ *	hook arguments are not used.  Returns ip_vs_out()'s verdict.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ *	NF_INET_LOCAL_OUT hook for AF_INET, used only for VS/NAT.
+ *	Checks whether a locally generated packet is a reply for an
+ *	established ip_vs_conn by handing it to ip_vs_out().
+ *	The in, out and okfn hook arguments are not used.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/*
+	 * Run the worker with bottom halves disabled: LOCAL_OUT
+	 * workaround kept until all places are fixed.
+	 */
+	local_bh_disable();
+	ret = ip_vs_out(hooknum, skb, AF_INET);
+	local_bh_enable();
+
+	return ret;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *	used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ *
+ *	Thin AF_INET6 adapter around ip_vs_out(); the in, out and okfn
+ *	hook arguments are not used.  Returns ip_vs_out()'s verdict.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	NF_INET_LOCAL_OUT hook for AF_INET6, used only for VS/NAT.
+ *	Checks whether a locally generated packet is a reply for an
+ *	established ip_vs_conn by handing it to ip_vs_out().
+ *	The in, out and okfn hook arguments are not used.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/*
+	 * Run the worker with bottom halves disabled: LOCAL_OUT
+	 * workaround kept until all places are fixed.
+	 */
+	local_bh_disable();
+	ret = ip_vs_out(hooknum, skb, AF_INET6);
+	local_bh_enable();
+
+	return ret;
+}
+
+#endif
+
+/*
+ *	Handle ICMP messages in the outside-to-inside direction (incoming).
+ *	Find any that might be relevant, check against existing connections,
+ *	forward to the right destination host if relevant.
+ *	Currently handles error types - unreachable, quench, ttl exceeded.
+ *
+ *	@related is set to 0 (and NF_ACCEPT returned) when the ICMP type is
+ *	not one of the handled errors; otherwise it is left at 1 and the
+ *	return value is the verdict for the skb:
+ *	  - NF_STOLEN if fragment gathering returned non-zero,
+ *	  - NF_DROP if the ICMP header cannot be read or the checksum fails,
+ *	  - NF_ACCEPT if the embedded packet matches no IPVS connection,
+ *	  - otherwise whatever ip_vs_icmp_xmit() returns.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+	struct net *net = NULL;
+	struct iphdr *iph;
+	struct icmphdr	_icmph, *ic;
+	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
+	unsigned int offset, ihl, verdict;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+			return NF_STOLEN;
+	}
+
+	iph = ip_hdr(skb);
+	offset = ihl = iph->ihl * 4;
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
+		  ic->type, ntohs(icmp_id(ic)),
+		  &iph->saddr, &iph->daddr);
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->type != ICMP_DEST_UNREACH) &&
+	    (ic->type != ICMP_SOURCE_QUENCH) &&
+	    (ic->type != ICMP_TIME_EXCEEDED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	net = skb_net(skb);
+
+	pd = ip_vs_proto_data_get(net, cih->protocol);
+	if (!pd)
+		return NF_ACCEPT;
+	pp = pd->pp;
+
+	/* Is the embedded protocol header present? */
+	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+		     pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking incoming ICMP for");
+
+	/* step over the embedded IP header to its transport header */
+	offset += cih->ihl * 4;
+
+	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	verdict = NF_DROP;
+
+	/* Ensure the checksum is correct */
+	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
+			  &iph->saddr);
+		goto out;
+	}
+
+	/* do the statistics and put it back */
+	ip_vs_in_stats(cp, skb);
+	/*
+	 * Extend the offset over the embedded source/destination ports.
+	 * SCTP carries its ports in the same place as TCP and UDP and is
+	 * handled the same way in ip_vs_in_icmp_v6(), so include it here.
+	 */
+	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+	    IPPROTO_SCTP == cih->protocol)
+		offset += 2 * sizeof(__u16);
+	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum);
+
+out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *	IPv6 counterpart of ip_vs_in_icmp(): handle incoming ICMPv6 errors
+ *	(destination unreachable, packet too big, time exceeded) that embed
+ *	a packet of an existing IPVS connection.
+ *
+ *	@related is set to 0 (and NF_ACCEPT returned) for any other ICMPv6
+ *	type; otherwise it is left at 1 and the return value is the verdict:
+ *	NF_STOLEN if fragment gathering returned non-zero, NF_DROP if the
+ *	ICMPv6 header cannot be read, NF_ACCEPT if nothing matches, else the
+ *	result of ip_vs_icmp_xmit_v6().  No checksum verification is done
+ *	in this function.
+ */
+static int
+ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+	struct net *net = NULL;
+	struct ipv6hdr *iph;
+	struct icmp6hdr	_icmph, *ic;
+	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
+					   within the ICMP */
+	struct ip_vs_iphdr ciph;
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
+	unsigned int offset, verdict;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
+			return NF_STOLEN;
+	}
+
+	iph = ipv6_hdr(skb);
+	offset = sizeof(struct ipv6hdr);	/* fixed header only */
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
+		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
+		  &iph->saddr, &iph->daddr);
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	net = skb_net(skb);
+	pd = ip_vs_proto_data_get(net, cih->nexthdr);
+	if (!pd)
+		return NF_ACCEPT;
+	pp = pd->pp;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking incoming ICMPv6 for");
+
+	/* step over the embedded (fixed-size) IPv6 header */
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	/* do the statistics and put it back */
+	ip_vs_in_stats(cp, skb);
+	/* extend the offset over the embedded source/destination ports */
+	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
+	    IPPROTO_SCTP == cih->nexthdr)
+		offset += 2 * sizeof(__u16);
+	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum);
+
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+#endif
+
+
+/*
+ *	Check if it's for virtual services, look it up,
+ *	and send it on its way...
+ *
+ *	Shared worker behind the ip_vs_remote_request{4,6}() and
+ *	ip_vs_local_request{4,6}() hooks; @af selects IPv4 or IPv6 parsing.
+ *
+ *	Returns a netfilter verdict: NF_ACCEPT when the packet is not for
+ *	IPVS (see the early returns below), the verdict of
+ *	ip_vs_in_icmp{,_v6}() for related ICMP errors, the verdict stored
+ *	by pp->conn_schedule() when it returns 0, NF_DROP when the
+ *	connection's destination is flagged unavailable, otherwise the
+ *	return value of cp->packet_xmit().
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+	struct net *net;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
+	struct ip_vs_conn *cp;
+	int ret, restart, pkts;
+	struct netns_ipvs *ipvs;
+
+	/* Already marked as IPVS request or reply? */
+	if (skb->ipvs_property)
+		return NF_ACCEPT;
+
+	/*
+	 *	Big tappo:
+	 *	- remote client: only PACKET_HOST
+	 *	- route: used for struct net when skb->dev is unset
+	 */
+	if (unlikely((skb->pkt_type != PACKET_HOST &&
+		      hooknum != NF_INET_LOCAL_OUT) ||
+		     !skb_dst(skb))) {
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+			      " ignored in hook %u\n",
+			      skb->pkt_type, iph.protocol,
+			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
+		return NF_ACCEPT;
+	}
+	/* ipvs enabled in this netns ? */
+	net = skb_net(skb);
+	if (!net_ipvs(net)->enable)
+		return NF_ACCEPT;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related;
+			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+			int related;
+			int verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+
+	/* Protocol supported? */
+	pd = ip_vs_proto_data_get(net, iph.protocol);
+	if (unlikely(!pd))
+		return NF_ACCEPT;
+	pp = pd->pp;
+	/*
+	 * Check if the packet belongs to an existing connection entry
+	 */
+	cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
+
+	if (unlikely(!cp)) {
+		int v;
+
+		/* a zero return means the verdict in v is final */
+		if (!pp->conn_schedule(af, skb, pd, &v, &cp))
+			return v;
+	}
+
+	if (unlikely(!cp)) {
+		/* sorry, all this trouble for a no-hit :) */
+		IP_VS_DBG_PKT(12, af, pp, skb, 0,
+			      "ip_vs_in: packet continues traversal as normal");
+		return NF_ACCEPT;
+	}
+
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
+	ipvs = net_ipvs(net);
+	/* Check the server status */
+	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		/* the destination server is not available */
+
+		if (sysctl_expire_nodest_conn(ipvs)) {
+			/* try to expire the connection immediately */
+			ip_vs_conn_expire_now(cp);
+		}
+		/* don't restart its timer, and silently
+		   drop the packet. */
+		__ip_vs_conn_put(cp);
+		return NF_DROP;
+	}
+
+	ip_vs_in_stats(cp, skb);
+	/* NOTE(review): 'restart' is assigned but not read in this function */
+	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+	if (cp->packet_xmit)
+		ret = cp->packet_xmit(skb, cp, pp);
+		/* do not touch skb anymore */
+	else {
+		IP_VS_DBG_RL("warning: packet_xmit is null");
+		ret = NF_ACCEPT;
+	}
+
+	/* Increase its packet counter and check if it is needed
+	 * to be synchronized
+	 *
+	 * Sync connection if it is about to close to
+	 * encourage the standby servers to update the connections timeout
+	 *
+	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
+	 */
+
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		pkts = sysctl_sync_threshold(ipvs);
+	else
+		pkts = atomic_add_return(1, &cp->in_pkts);
+
+	/*
+	 * SCTP on a sync master: sync periodically while established, or
+	 * on a state change into one of the closing states listed below.
+	 */
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
+	    cp->protocol == IPPROTO_SCTP) {
+		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
+			(pkts % sysctl_sync_period(ipvs)
+			 == sysctl_sync_threshold(ipvs))) ||
+				(cp->old_state != cp->state &&
+				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
+				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
+				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
+			ip_vs_sync_conn(net, cp);
+			goto out;
+		}
+	}
+
+	/* Keep this block last: TCP and others with pp->num_states <= 1 */
+	else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
+	    (((cp->protocol != IPPROTO_TCP ||
+	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+	      (pkts % sysctl_sync_period(ipvs)
+	       == sysctl_sync_threshold(ipvs))) ||
+	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
+	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
+	       (cp->state == IP_VS_TCP_S_CLOSE) ||
+	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
+	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
+		ip_vs_sync_conn(net, cp);
+out:
+	/* remember the state so the next packet can detect a transition */
+	cp->old_state = cp->state;
+
+	ip_vs_conn_put(cp);
+	return ret;
+}
+
+/*
+ *	AF_INET handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ *
+ *	Thin adapter around ip_vs_in(); the in, out and okfn hook
+ *	arguments are not used.  Returns ip_vs_in()'s verdict.
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *	NF_INET_LOCAL_OUT hook for AF_INET.
+ *	Schedules and forwards packets from local clients by handing
+ *	them to ip_vs_in().  The in, out and okfn hook arguments are
+ *	not used.
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/*
+	 * Run the worker with bottom halves disabled: LOCAL_OUT
+	 * workaround kept until all places are fixed.
+	 */
+	local_bh_disable();
+	ret = ip_vs_in(hooknum, skb, AF_INET);
+	local_bh_enable();
+
+	return ret;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ *
+ *	Thin adapter around ip_vs_in(); the in, out and okfn hook
+ *	arguments are not used.  Returns ip_vs_in()'s verdict.
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	NF_INET_LOCAL_OUT hook for AF_INET6.
+ *	Schedules and forwards packets from local clients by handing
+ *	them to ip_vs_in().  The in, out and okfn hook arguments are
+ *	not used.
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+
+	/*
+	 * Run the worker with bottom halves disabled: LOCAL_OUT
+	 * workaround kept until all places are fixed.
+	 */
+	local_bh_disable();
+	ret = ip_vs_in(hooknum, skb, AF_INET6);
+	local_bh_enable();
+
+	return ret;
+}
+
+#endif
+
+
+/*
+ *	NF_INET_FORWARD hook that catches ICMP related to incoming IPVS
+ *	connections and destined for 0.0.0.0/0.
+ *	With a fwmark-based virtual service (e.g. a transparent cache
+ *	cluster) TCP packets can be marked and routed to ip_vs_in, but
+ *	ICMP destined for 0.0.0.0/0 cannot easily be marked and sent to
+ *	ip_vs_in_icmp.  So it is picked up here and passed to
+ *	ip_vs_in_icmp directly.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	int related;
+
+	/* Only ICMP matters, and only when ipvs is enabled in this netns */
+	if (ip_hdr(skb)->protocol != IPPROTO_ICMP ||
+	    !net_ipvs(skb_net(skb))->enable)
+		return NF_ACCEPT;
+
+	/* the 'related' result is not needed here, only the verdict */
+	return ip_vs_in_icmp(skb, &related, hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *	NF_INET_FORWARD hook for IPv6: passes ICMPv6 packets to
+ *	ip_vs_in_icmp_v6() when ipvs is enabled in the packet's netns,
+ *	and accepts everything else untouched.
+ */
+static unsigned int
+ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in, const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	int related;
+
+	/* Only ICMPv6 matters, and only when ipvs is enabled in this netns */
+	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6 ||
+	    !net_ipvs(skb_net(skb))->enable)
+		return NF_ACCEPT;
+
+	/* the 'related' result is not needed here, only the verdict */
+	return ip_vs_in_icmp_v6(skb, &related, hooknum);
+}
+#endif
+
+
+/*
+ *	Netfilter hooks registered by ip_vs_init().  The IPv4 entries come
+ *	first; the IPv6 entries mirror them one for one and are only built
+ *	with CONFIG_IP_VS_IPV6.  Every IPv6 handler must be registered on
+ *	PF_INET6: a handler registered on the wrong family would be fed
+ *	packets of the other protocol.
+ */
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC - 2,
+	},
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_remote_request4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC - 1,
+	},
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST + 1,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST + 2,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
+#ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC - 2,
+	},
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT(change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_remote_request6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC - 1,
+	},
+	/* Before ip_vs_in, change source only for VS/NAT.
+	 * This is the IPv6 handler, so it belongs on PF_INET6 (it used to
+	 * be registered on PF_INET, i.e. on IPv4 LOCAL_OUT traffic). */
+	{
+		.hook		= ip_vs_local_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST + 1,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST + 2,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp_v6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
+#endif
+};
+/*
+ *	Initialize IP Virtual Server netns mem.
+ *
+ *	Per-netns .init callback of ipvs_core_ops.  Fetches the
+ *	struct netns_ipvs reserved for @net, publishes it through
+ *	net->ipvs and brings up the estimator, control, protocol, app,
+ *	conn and sync sub-systems in that order.
+ *
+ *	Returns 0 on success.  On any failure the sub-systems that were
+ *	already set up are cleaned in reverse order, net->ipvs is cleared
+ *	again and -ENOMEM is returned.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+	struct netns_ipvs *ipvs;
+
+	ipvs = net_generic(net, ip_vs_net_id);
+	if (ipvs == NULL) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+	/* Hold the beast until a service is registered */
+	ipvs->enable = 0;
+	ipvs->net = net;
+	/* Counters used for creating unique names */
+	ipvs->gen = atomic_read(&ipvs_netns_cnt);
+	atomic_inc(&ipvs_netns_cnt);
+	net->ipvs = ipvs;
+
+	if (__ip_vs_estimator_init(net) < 0)
+		goto estimator_fail;
+
+	if (__ip_vs_control_init(net) < 0)
+		goto control_fail;
+
+	if (__ip_vs_protocol_init(net) < 0)
+		goto protocol_fail;
+
+	if (__ip_vs_app_init(net) < 0)
+		goto app_fail;
+
+	if (__ip_vs_conn_init(net) < 0)
+		goto conn_fail;
+
+	if (__ip_vs_sync_init(net) < 0)
+		goto sync_fail;
+
+	printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+			 sizeof(struct netns_ipvs), ipvs->gen);
+	return 0;
+/*
+ * Error handling
+ */
+
+sync_fail:
+	__ip_vs_conn_cleanup(net);
+conn_fail:
+	__ip_vs_app_cleanup(net);
+app_fail:
+	__ip_vs_protocol_cleanup(net);
+protocol_fail:
+	__ip_vs_control_cleanup(net);
+control_fail:
+	__ip_vs_estimator_cleanup(net);
+estimator_fail:
+	/*
+	 * The cleanup helpers above still need net->ipvs, so only drop
+	 * it here: nothing may keep reaching the torn-down state.
+	 */
+	net->ipvs = NULL;
+	return -ENOMEM;
+}
+
+/*
+ *	Per-netns .exit callback of ipvs_core_ops.  Flushes the services
+ *	of @net first, then cleans the conn, app, protocol, control and
+ *	estimator sub-systems, i.e. the reverse of the order used by
+ *	__ip_vs_init().  Sync state is not handled here; see
+ *	__ip_vs_dev_cleanup().
+ */
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+	__ip_vs_service_cleanup(net);	/* ip_vs_flush() with locks */
+	__ip_vs_conn_cleanup(net);
+	__ip_vs_app_cleanup(net);
+	__ip_vs_protocol_cleanup(net);
+	__ip_vs_control_cleanup(net);
+	__ip_vs_estimator_cleanup(net);
+	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+}
+
+/*
+ *	Per-netns .exit callback of ipvs_core_dev_ops.  Clears the enable
+ *	flag that the packet hooks test before doing any work, then cleans
+ *	up the sync sub-system of @net.
+ */
+static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+{
+	EnterFunction(2);
+	net_ipvs(net)->enable = 0;	/* Disable packet reception */
+	smp_wmb();	/* order the store above before the cleanup's stores */
+	__ip_vs_sync_cleanup(net);
+	LeaveFunction(2);
+}
+
+/*
+ * Per-netns core state: .id/.size describe the struct netns_ipvs that
+ * __ip_vs_init() looks up with net_generic(net, ip_vs_net_id).
+ */
+static struct pernet_operations ipvs_core_ops = {
+	.init = __ip_vs_init,
+	.exit = __ip_vs_cleanup,
+	.id   = &ip_vs_net_id,
+	.size = sizeof(struct netns_ipvs),
+};
+
+/*
+ * Registered with register_pernet_device() in ip_vs_init(); exit-only,
+ * used to disable packet reception and stop sync for a netns.
+ */
+static struct pernet_operations ipvs_core_dev_ops = {
+	.exit = __ip_vs_dev_cleanup,
+};
+
+/*
+ *	Initialize IP Virtual Server
+ *
+ *	Module entry point.  Sets up, in order: estimator, control,
+ *	protocol, application helper, connection table, sync data, the
+ *	per-netns operations (subsys, then device) and finally the
+ *	netfilter hooks.  Returns the (non-negative) result of
+ *	nf_register_hooks() on success; on failure everything set up so
+ *	far is undone in reverse order and the negative error of the
+ *	failing step is returned.
+ */
+static int __init ip_vs_init(void)
+{
+	int ret;
+
+	ip_vs_estimator_init();
+	ret = ip_vs_control_init();
+	if (ret < 0) {
+		pr_err("can't setup control.\n");
+		goto cleanup_estimator;
+	}
+
+	/* return value not checked here */
+	ip_vs_protocol_init();
+
+	ret = ip_vs_app_init();
+	if (ret < 0) {
+		pr_err("can't setup application helper.\n");
+		goto cleanup_protocol;
+	}
+
+	ret = ip_vs_conn_init();
+	if (ret < 0) {
+		pr_err("can't setup connection table.\n");
+		goto cleanup_app;
+	}
+
+	ret = ip_vs_sync_init();
+	if (ret < 0) {
+		pr_err("can't setup sync data.\n");
+		goto cleanup_conn;
+	}
+
+	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */
+	if (ret < 0)
+		goto cleanup_sync;
+
+	ret = register_pernet_device(&ipvs_core_dev_ops);
+	if (ret < 0)
+		goto cleanup_sub;
+
+	/* hooks go last so that packets only arrive once all is set up */
+	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	if (ret < 0) {
+		pr_err("can't register hooks.\n");
+		goto cleanup_dev;
+	}
+
+	pr_info("ipvs loaded.\n");
+
+	return ret;
+
+	/* unwind: each label undoes the step before the one that failed */
+cleanup_dev:
+	unregister_pernet_device(&ipvs_core_dev_ops);
+cleanup_sub:
+	unregister_pernet_subsys(&ipvs_core_ops);
+cleanup_sync:
+	ip_vs_sync_cleanup();
+  cleanup_conn:
+	ip_vs_conn_cleanup();
+  cleanup_app:
+	ip_vs_app_cleanup();
+  cleanup_protocol:
+	ip_vs_protocol_cleanup();
+	/* control was set up before protocol, so it is undone here too */
+	ip_vs_control_cleanup();
+  cleanup_estimator:
+	ip_vs_estimator_cleanup();
+	return ret;
+}
+
+/*
+ *	Module exit point: undoes ip_vs_init() in reverse order, starting
+ *	with the netfilter hooks so that no packet handler runs while the
+ *	rest is being torn down.
+ */
+static void __exit ip_vs_cleanup(void)
+{
+	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	unregister_pernet_device(&ipvs_core_dev_ops);
+	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
+	ip_vs_sync_cleanup();
+	ip_vs_conn_cleanup();
+	ip_vs_app_cleanup();
+	ip_vs_protocol_cleanup();
+	ip_vs_control_cleanup();
+	ip_vs_estimator_cleanup();
+	pr_info("ipvs unloaded.\n");
+}
+
+/* module entry/exit registration and licence tag */
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
new file mode 100644
index 00000000..a178cb34
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -0,0 +1,3778 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
+#include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Mutex for IPVS sockopts.  A sleeping lock is used because
+ * [gs]etsockopt may sleep.
+ */
+static DEFINE_MUTEX(__ip_vs_mutex);
+
+/* read-write lock for the service table */
+static DEFINE_RWLOCK(__ip_vs_svc_lock);
+
+/* sysctl variables */
+
+#ifdef CONFIG_IP_VS_DEBUG
+/* current debug level; only built with CONFIG_IP_VS_DEBUG */
+static int sysctl_ip_vs_debug_level = 0;
+
+/* Accessor returning the current IPVS debug level. */
+int ip_vs_get_debug_level(void)
+{
+	return sysctl_ip_vs_debug_level;
+}
+#endif
+
+
+/*  Forward declarations */
+static void __ip_vs_del_service(struct ip_vs_service *svc);
+
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way?
+ *
+ * Returns 1 if the route looked up for @addr in @net goes out through
+ * a loopback device, 0 otherwise.  The route returned by
+ * ip6_route_output() is referenced, so the reference is dropped again
+ * before returning.
+ */
+static int __ip_vs_addr_is_local_v6(struct net *net,
+				    const struct in6_addr *addr)
+{
+	struct dst_entry *dst;
+	struct rt6_info *rt;
+	struct flowi6 fl6 = {
+		.daddr = *addr,
+	};
+	int is_local;
+
+	dst = ip6_route_output(net, NULL, &fl6);
+	rt = (struct rt6_info *)dst;
+	is_local = rt && rt->rt6i_dev &&
+		   (rt->rt6i_dev->flags & IFF_LOOPBACK);
+
+	/* release the lookup's reference; dst_release() accepts NULL */
+	dst_release(dst);
+
+	return is_local;
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+/*
+ *	update_defense_level is called from keventd and from sysctl,
+ *	so it needs to protect itself from softirqs
+ *
+ *	Re-evaluates the three defense strategies of @ipvs (drop_entry,
+ *	drop_packet, secure_tcp) against the currently available memory.
+ *	For each sysctl the modes handled below are:
+ *	  0 - strategy off,
+ *	  1 - automatic, currently inactive (moves to 2 when memory is low),
+ *	  2 - automatic, currently active (moves back to 1 otherwise),
+ *	  3 - strategy always on.
+ *	"Memory is low" means free + buffer pages < sysctl_amemthresh.
+ */
+static void update_defense_level(struct netns_ipvs *ipvs)
+{
+	struct sysinfo i;
+	/*
+	 * NOTE(review): function-static, so this is a single value shared
+	 * by every netns_ipvs passed in - confirm that is intended.
+	 */
+	static int old_secure_tcp = 0;
+	int availmem;
+	int nomem;
+	int to_change = -1;	/* >= 0 once secure_tcp toggles on or off */
+
+	/* we only count free and buffered memory (in pages) */
+	si_meminfo(&i);
+	availmem = i.freeram + i.bufferram;
+	/* however in linux 2.5 the i.bufferram is total page cache size,
+	   we need adjust it */
+	/* si_swapinfo(&i); */
+	/* availmem = availmem - (i.totalswap - i.freeswap); */
+
+	nomem = (availmem < ipvs->sysctl_amemthresh);
+
+	local_bh_disable();
+
+	/* drop_entry */
+	spin_lock(&ipvs->dropentry_lock);
+	switch (ipvs->sysctl_drop_entry) {
+	case 0:
+		atomic_set(&ipvs->dropentry, 0);
+		break;
+	case 1:
+		if (nomem) {
+			atomic_set(&ipvs->dropentry, 1);
+			ipvs->sysctl_drop_entry = 2;
+		} else {
+			atomic_set(&ipvs->dropentry, 0);
+		}
+		break;
+	case 2:
+		if (nomem) {
+			atomic_set(&ipvs->dropentry, 1);
+		} else {
+			atomic_set(&ipvs->dropentry, 0);
+			ipvs->sysctl_drop_entry = 1;
+		};
+		break;
+	case 3:
+		atomic_set(&ipvs->dropentry, 1);
+		break;
+	}
+	spin_unlock(&ipvs->dropentry_lock);
+
+	/* drop_packet */
+	spin_lock(&ipvs->droppacket_lock);
+	switch (ipvs->sysctl_drop_packet) {
+	case 0:
+		ipvs->drop_rate = 0;
+		break;
+	case 1:
+		if (nomem) {
+			/* divisor is > 0: nomem means availmem < amemthresh */
+			ipvs->drop_rate = ipvs->drop_counter
+				= ipvs->sysctl_amemthresh /
+				(ipvs->sysctl_amemthresh-availmem);
+			ipvs->sysctl_drop_packet = 2;
+		} else {
+			ipvs->drop_rate = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			ipvs->drop_rate = ipvs->drop_counter
+				= ipvs->sysctl_amemthresh /
+				(ipvs->sysctl_amemthresh-availmem);
+		} else {
+			ipvs->drop_rate = 0;
+			ipvs->sysctl_drop_packet = 1;
+		}
+		break;
+	case 3:
+		ipvs->drop_rate = ipvs->sysctl_am_droprate;
+		break;
+	}
+	spin_unlock(&ipvs->droppacket_lock);
+
+	/* secure_tcp */
+	spin_lock(&ipvs->securetcp_lock);
+	switch (ipvs->sysctl_secure_tcp) {
+	case 0:
+		if (old_secure_tcp >= 2)
+			to_change = 0;
+		break;
+	case 1:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+			ipvs->sysctl_secure_tcp = 2;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+			ipvs->sysctl_secure_tcp = 1;
+		}
+		break;
+	case 3:
+		if (old_secure_tcp < 2)
+			to_change = 1;
+		break;
+	}
+	old_secure_tcp = ipvs->sysctl_secure_tcp;
+	/* only touch the protocol timeouts when the mode really toggled */
+	if (to_change >= 0)
+		ip_vs_protocol_timeout_change(ipvs,
+					      ipvs->sysctl_secure_tcp > 1);
+	spin_unlock(&ipvs->securetcp_lock);
+
+	local_bh_enable();
+}
+
+
+/*
+ *	Period of the delayed work that re-checks the defense level
+ */
+#define DEFENSE_TIMER_PERIOD	1*HZ
+
+/*
+ *	Delayed-work callback: refresh the defense level of the owning
+ *	netns_ipvs, drop random entries while the dropentry flag is set,
+ *	then re-arm itself for the next period.
+ */
+static void defense_work_handler(struct work_struct *work)
+{
+	struct netns_ipvs *ipvs;
+
+	ipvs = container_of(work, struct netns_ipvs, defense_work.work);
+	update_defense_level(ipvs);
+
+	if (atomic_read(&ipvs->dropentry) != 0)
+		ip_vs_random_dropentry(ipvs->net);
+
+	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+}
+#endif
+
+/*
+ *	Take a reference on this module.
+ *	Returns the result of try_module_get(): non-zero on success,
+ *	0 if the reference could not be taken.
+ */
+int
+ip_vs_use_count_inc(void)
+{
+	return try_module_get(THIS_MODULE);
+}
+
+/*
+ *	Drop a module reference taken with ip_vs_use_count_inc().
+ */
+void
+ip_vs_use_count_dec(void)
+{
+	module_put(THIS_MODULE);
+}
+
+
+/*
+ *	Hash table: for virtual service lookups
+ *	Both tables below have IP_VS_SVC_TAB_SIZE (1 << 8 = 256) buckets;
+ *	IP_VS_SVC_TAB_MASK reduces a hash value to a bucket index.
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+
+/*
+ *	Compute the ip_vs_svc_table bucket for a virtual service.
+ *	The address (all four words XOR-ed together for AF_INET6) is mixed
+ *	with the netns pointer, the protocol and both halves of the port,
+ *	and the result is masked down to a table index.
+ */
+static inline unsigned
+ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
+		  const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned porth = ntohs(port);
+	__be32 addr_fold;
+	unsigned hash;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
+			    addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		addr_fold = addr->ip;
+
+	/* fold the netns pointer in so namespaces spread over buckets */
+	addr_fold ^= ((size_t)net >> 8);
+
+	hash = proto ^ ntohl(addr_fold);
+	hash ^= porth >> IP_VS_SVC_TAB_BITS;
+	hash ^= porth;
+
+	return hash & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Compute the ip_vs_svc_fwm_table bucket for a firewall mark:
+ *	the mark XOR-ed with the shifted netns pointer, masked to a
+ *	table index.
+ */
+static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+{
+	size_t net_bits = (size_t)net >> 8;
+
+	return (net_bits ^ fwmark) & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Insert a service into one of the two service tables: services
+ *	with a fwmark go into ip_vs_svc_fwm_table, all others into
+ *	ip_vs_svc_table keyed by <netns,proto,addr,port>.
+ *	Should be called with locked tables.
+ *	Returns 1 on success, 0 (after logging) if already hashed.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+	unsigned bucket;
+
+	if (svc->flags & IP_VS_SVC_F_HASHED) {
+		pr_err("%s(): request for already hashed, called from %pF\n",
+		       __func__, __builtin_return_address(0));
+		return 0;
+	}
+
+	if (svc->fwmark != 0) {
+		/* fwmark service: keyed by the mark only */
+		bucket = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
+		list_add(&svc->f_list, &ip_vs_svc_fwm_table[bucket]);
+	} else {
+		/* addressed service: keyed by <netns,protocol,addr,port> */
+		bucket = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+					   &svc->addr, svc->port);
+		list_add(&svc->s_list, &ip_vs_svc_table[bucket]);
+	}
+
+	/* the table now holds a reference on the service */
+	svc->flags |= IP_VS_SVC_F_HASHED;
+	atomic_inc(&svc->refcnt);
+
+	return 1;
+}
+
+
+/*
+ *	Remove a service from whichever service table it was hashed into
+ *	and drop the reference the table held.
+ *	Should be called with locked tables.
+ *
+ *	Returns 1 on success, 0 (after logging an error) when the service
+ *	was not flagged IP_VS_SVC_F_HASHED.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+	if ((svc->flags & IP_VS_SVC_F_HASHED) == 0) {
+		pr_err("%s(): request for unhash flagged, called from %pF\n",
+		       __func__, __builtin_return_address(0));
+		return 0;
+	}
+
+	/* s_list is used without a fwmark, f_list with one */
+	list_del(svc->fwmark == 0 ? &svc->s_list : &svc->f_list);
+
+	svc->flags &= ~IP_VS_SVC_F_HASHED;
+	atomic_dec(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Find a service by {netns, af, proto, addr, port} in ip_vs_svc_table.
+ *	Returns the first matching entry, or NULL.  No counter is touched.
+ */
+static inline struct ip_vs_service *
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+		     const union nf_inet_addr *vaddr, __be16 vport)
+{
+	struct list_head *bucket;
+	struct ip_vs_service *svc;
+
+	bucket = &ip_vs_svc_table[ip_vs_svc_hashkey(net, af, protocol,
+						    vaddr, vport)];
+
+	list_for_each_entry(svc, bucket, s_list) {
+		if (svc->af != af)
+			continue;
+		if (!ip_vs_addr_equal(af, &svc->addr, vaddr))
+			continue;
+		if (svc->port != vport)
+			continue;
+		if (svc->protocol != protocol)
+			continue;
+		if (!net_eq(svc->net, net))
+			continue;
+
+		/* HIT */
+		return svc;
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Find a service by {netns, af, fwmark} in ip_vs_svc_fwm_table.
+ *	Returns the first matching entry, or NULL.
+ */
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
+{
+	struct list_head *bucket;
+	struct ip_vs_service *svc;
+
+	bucket = &ip_vs_svc_fwm_table[ip_vs_svc_fwm_hashkey(net, fwmark)];
+
+	list_for_each_entry(svc, bucket, f_list) {
+		if (svc->fwmark != fwmark)
+			continue;
+		if (svc->af != af)
+			continue;
+		if (!net_eq(svc->net, net))
+			continue;
+
+		/* HIT */
+		return svc;
+	}
+
+	return NULL;
+}
+
+/*
+ *	Look up a virtual service and take a use count on it.
+ *
+ *	Lookup order, all under the read side of __ip_vs_svc_lock:
+ *	  1. the fwmark table, when fwmark is non-zero;
+ *	  2. the <protocol,addr,port> table;
+ *	  3. for TCP with FTP services configured and vport being FTPDATA
+ *	     or >= PROT_SOCK: the same address on FTPPORT;
+ *	  4. when port-zero services exist: the same address on port 0.
+ *
+ *	Returns the service with svc->usecnt incremented, or NULL.
+ */
+struct ip_vs_service *
+ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
+		  const union nf_inet_addr *vaddr, __be16 vport)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_service *svc = NULL;
+
+	read_lock(&__ip_vs_svc_lock);
+
+	/* the fwmark table takes precedence */
+	if (fwmark)
+		svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+
+	if (!svc) {
+		/* "full" addressed entries */
+		svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
+
+		/*
+		 * The packet might belong to an FTP data connection,
+		 * so try the ftp service entry.
+		 */
+		if (!svc &&
+		    protocol == IPPROTO_TCP &&
+		    atomic_read(&ipvs->ftpsvc_counter) &&
+		    (vport == FTPDATA || ntohs(vport) >= PROT_SOCK))
+			svc = __ip_vs_service_find(net, af, protocol, vaddr,
+						   FTPPORT);
+
+		/* finally the catch-all port (port zero) */
+		if (!svc && atomic_read(&ipvs->nullsvc_counter))
+			svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
+	}
+
+	if (svc)
+		atomic_inc(&svc->usecnt);
+	read_unlock(&__ip_vs_svc_lock);
+
+	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+		      fwmark, ip_vs_proto_name(protocol),
+		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+		      svc ? "hit" : "not hit");
+
+	return svc;
+}
+
+
+/*
+ *	Attach a destination to a service: the service gains one reference
+ *	and the destination records it in dest->svc.
+ */
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	/* the dest holds a reference for as long as dest->svc points here */
+	atomic_inc(&svc->refcnt);
+	dest->svc = svc;
+}
+
+/*
+ *	Detach a destination from its service and drop the reference taken
+ *	at bind time.  When that was the last reference the service (and
+ *	its per-cpu stats) is freed here.
+ */
+static void
+__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+{
+	struct ip_vs_service *bound = dest->svc;
+
+	dest->svc = NULL;
+
+	if (!atomic_dec_and_test(&bound->refcnt))
+		return;
+
+	IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+		      bound->fwmark,
+		      IP_VS_DBG_ADDR(bound->af, &bound->addr),
+		      ntohs(bound->port), atomic_read(&bound->usecnt));
+	free_percpu(bound->stats.cpustats);
+	kfree(bound);
+}
+
+
+/*
+ *	Compute the rs_table bucket for a real service <af, addr, port>.
+ *	IPv6 addresses (when built in) are XOR-folded to one word first.
+ */
+static inline unsigned ip_vs_rs_hashkey(int af,
+					    const union nf_inet_addr *addr,
+					    __be16 port)
+{
+	unsigned hport = ntohs(port);
+	__be32 folded;
+	unsigned key;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	key = ntohl(folded) ^ (hport >> IP_VS_RTAB_BITS) ^ hport;
+	return key & IP_VS_RTAB_MASK;
+}
+
+/*
+ *	Hashes ip_vs_dest into rs_table; the bucket is derived from the
+ *	dest's <af,addr,port>.
+ *	should be called with locked tables.
+ *
+ *	Returns 1 when the dest was added, 0 when its d_list was already
+ *	linked somewhere.
+ */
+static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+{
+	struct list_head *bucket;
+
+	/* a non-empty d_list means the dest is already hashed */
+	if (!list_empty(&dest->d_list))
+		return 0;
+
+	bucket = &ipvs->rs_table[ip_vs_rs_hashkey(dest->af, &dest->addr,
+						  dest->port)];
+	list_add(&dest->d_list, bucket);
+
+	return 1;
+}
+
+/*
+ *	UNhashes ip_vs_dest from rs_table (no-op if it is not hashed).
+ *	should be called with locked tables.
+ *	Always returns 1.
+ */
+static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+	if (list_empty(&dest->d_list))
+		return 1;
+
+	list_del(&dest->d_list);
+	/* re-init so a later list_empty() check sees "not hashed" */
+	INIT_LIST_HEAD(&dest->d_list);
+
+	return 1;
+}
+
+/*
+ *	Lookup real service by <proto,addr,port> in the real service table.
+ *
+ *	An entry matches when family, address and port are equal and either
+ *	its protocol equals @protocol or it has a non-zero vfwmark.  The
+ *	first match is returned (NULL if none); the table is walked under
+ *	the read side of ipvs->rs_lock and no reference is taken.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
+			  const union nf_inet_addr *daddr,
+			  __be16 dport)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_dest *dest;
+	struct ip_vs_dest *found = NULL;
+	unsigned hash;
+
+	hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+	read_lock(&ipvs->rs_lock);
+	list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
+		if (dest->af != af)
+			continue;
+		if (!ip_vs_addr_equal(af, &dest->addr, daddr))
+			continue;
+		if (dest->port != dport)
+			continue;
+		if (dest->protocol != protocol && !dest->vfwmark)
+			continue;
+
+		/* HIT */
+		found = dest;
+		break;
+	}
+	read_unlock(&ipvs->rs_lock);
+
+	return found;
+}
+
+/*
+ *	Lookup destination by {addr,port} in the given service.
+ *	Walks svc->destinations and returns the first entry whose family
+ *	matches the service and whose address and port match, else NULL.
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		  __be16 dport)
+{
+	struct ip_vs_dest *dest;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->af != svc->af)
+			continue;
+		if (!ip_vs_addr_equal(svc->af, &dest->addr, daddr))
+			continue;
+		if (dest->port != dport)
+			continue;
+
+		/* HIT */
+		return dest;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Created to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ *
+ * ip_vs_lookup_real_service() looked promising, but
+ * does not seem to work as expected.
+ *
+ * On success the returned dest has its refcnt incremented; the
+ * service use count taken for the lookup is released before return.
+ */
+struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
+				   const union nf_inet_addr *daddr,
+				   __be16 dport,
+				   const union nf_inet_addr *vaddr,
+				   __be16 vport, __u16 protocol, __u32 fwmark)
+{
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *found;
+
+	svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
+	if (svc == NULL)
+		return NULL;
+
+	found = ip_vs_lookup_dest(svc, daddr, dport);
+	if (found != NULL)
+		atomic_inc(&found->refcnt);
+
+	ip_vs_service_put(svc);
+	return found;
+}
+
+/*
+ *  Lookup dest by {svc,addr,port} in the destination trash.
+ *  The destination trash is used to hold the destinations that are removed
+ *  from the service table but are still referenced by some conn entries.
+ *  The reason to add the destination trash is when the dest is temporary
+ *  down (either by administrator or by monitor program), the dest can be
+ *  picked back from the trash, the remaining connections to the dest can
+ *  continue, and the counting information of the dest is also useful for
+ *  scheduling.
+ *
+ *  Returns the matching dest (still linked in the trash list) or NULL.
+ *  As a side effect, every non-matching entry visited before a hit whose
+ *  refcnt is 1 is unlinked from the trash and freed.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		     __be16 dport)
+{
+	struct ip_vs_dest *dest, *nxt;
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+	/*
+	 * Find the destination in trash
+	 */
+	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+		/*
+		 * Format the address with the dest's own family: the trash
+		 * is per-netns, so entries of a family different from
+		 * svc->af can show up here.
+		 */
+		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+			      "dest->refcnt=%d\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		if (dest->af == svc->af &&
+		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
+		    dest->port == dport &&
+		    dest->vfwmark == svc->fwmark &&
+		    dest->protocol == svc->protocol &&
+		    (svc->fwmark ||
+		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
+		      dest->vport == svc->port))) {
+			/* HIT */
+			return dest;
+		}
+
+		/*
+		 * Try to purge the destination from trash if not referenced
+		 * (a refcnt of 1 is the reference held by the trash itself)
+		 */
+		if (atomic_read(&dest->refcnt) == 1) {
+			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
+				      "from trash\n",
+				      dest->vfwmark,
+				      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+				      ntohs(dest->port));
+			list_del(&dest->n_list);
+			ip_vs_dst_reset(dest);
+			__ip_vs_unbind_svc(dest);
+			free_percpu(dest->stats.cpustats);
+			kfree(dest);
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *  Clean up all the destinations in the trash
+ *  Called by the ip_vs_control_cleanup()
+ *
+ *  When ip_vs_control_cleanup() is activated by ipvs module exit,
+ *  the service tables must have been flushed and all the connections
+ *  are expired, and the refcnt of each destination in the trash must
+ *  be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_dest *victim, *next;
+
+	list_for_each_entry_safe(victim, next, &ipvs->dest_trash, n_list) {
+		list_del(&victim->n_list);
+		ip_vs_dst_reset(victim);
+		/* may free the service if this was its last reference */
+		__ip_vs_unbind_svc(victim);
+		free_percpu(victim->stats.cpustats);
+		kfree(victim);
+	}
+}
+
+/*
+ *	Export statistics to the user-visible structure.
+ *
+ *	Each counter is reported relative to its zero point
+ *	(ustats.c - ustats0.c); the remaining fields are filled in by
+ *	ip_vs_read_estimator().  Everything happens under src->lock with
+ *	bottom halves disabled.
+ */
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+/* NOTE: this macro is not #undef'ed and stays defined after the function */
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
+
+	spin_lock_bh(&src->lock);
+
+	IP_VS_SHOW_STATS_COUNTER(conns);
+	IP_VS_SHOW_STATS_COUNTER(inpkts);
+	IP_VS_SHOW_STATS_COUNTER(outpkts);
+	IP_VS_SHOW_STATS_COUNTER(inbytes);
+	IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+	ip_vs_read_estimator(dst, src);
+
+	spin_unlock_bh(&src->lock);
+}
+
+/*
+ *	"Zero" statistics without touching the live counters: the current
+ *	value of each counter is saved in ustats0 as the new zero point, and
+ *	ip_vs_zero_estimator() is called for the estimator state.
+ *	Runs under stats->lock with bottom halves disabled.
+ */
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+	spin_lock_bh(&stats->lock);
+
+	/* get current counters as zero point, rates are zeroed */
+
+/* NOTE: this macro is not #undef'ed and stays defined after the function */
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
+
+	IP_VS_ZERO_STATS_COUNTER(conns);
+	IP_VS_ZERO_STATS_COUNTER(inpkts);
+	IP_VS_ZERO_STATS_COUNTER(outpkts);
+	IP_VS_ZERO_STATS_COUNTER(inbytes);
+	IP_VS_ZERO_STATS_COUNTER(outbytes);
+
+	ip_vs_zero_estimator(stats);
+
+	spin_unlock_bh(&stats->lock);
+}
+
+/*
+ *	Update a destination in the given service
+ *
+ *	@svc:	service the destination belongs (or will belong) to
+ *	@dest:	destination to configure
+ *	@udest:	requested settings (weight, conn_flags, thresholds)
+ *	@add:	non-zero when the dest is being added to @svc: its estimator
+ *		is started and it is linked into svc->destinations
+ *
+ *	Takes ipvs->rs_lock, dest->dst_lock and __ip_vs_svc_lock internally,
+ *	one at a time.
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+		    struct ip_vs_dest_user_kern *udest, int add)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	int conn_flags;
+
+	/* set the weight and the flags */
+	atomic_set(&dest->weight, udest->weight);
+	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+	conn_flags |= IP_VS_CONN_F_INACTIVE;
+
+	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
+		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+	} else {
+		/*
+		 *    Put the real service in rs_table if not present.
+		 *    For now only for NAT!
+		 */
+		write_lock_bh(&ipvs->rs_lock);
+		ip_vs_rs_hash(ipvs, dest);
+		write_unlock_bh(&ipvs->rs_lock);
+	}
+	atomic_set(&dest->conn_flags, conn_flags);
+
+	/*
+	 * bind the service; a dest still bound to a different service is
+	 * re-bound to this one and has its statistics zeroed
+	 */
+	if (!dest->svc) {
+		__ip_vs_bind_svc(dest, svc);
+	} else {
+		if (dest->svc != svc) {
+			__ip_vs_unbind_svc(dest);
+			ip_vs_zero_stats(&dest->stats);
+			__ip_vs_bind_svc(dest, svc);
+		}
+	}
+
+	/* set the dest status flags */
+	dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+	/* no upper threshold, or a raised one, clears the overload state */
+	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	dest->u_threshold = udest->u_threshold;
+	dest->l_threshold = udest->l_threshold;
+
+	/* reset the destination's cached dst under its lock */
+	spin_lock_bh(&dest->dst_lock);
+	ip_vs_dst_reset(dest);
+	spin_unlock_bh(&dest->dst_lock);
+
+	if (add)
+		ip_vs_start_estimator(svc->net, &dest->stats);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	if (add) {
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+	}
+
+	/* call the update_service, because server weight may be changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+
+/*
+ *	Create a destination for the given service
+ *
+ *	The real server address is validated first (-EINVAL when rejected),
+ *	then a zeroed ip_vs_dest plus its per-cpu stats are allocated
+ *	(-ENOMEM on failure), initialised from @svc and @udest, and handed to
+ *	__ip_vs_update_dest() with add=1.  On success *dest_p is set and 0 is
+ *	returned.
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
+	       struct ip_vs_dest **dest_p)
+{
+	struct ip_vs_dest *new_dest;
+	unsigned addr_type;
+
+	EnterFunction(2);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6) {
+		int bad_scope;
+
+		addr_type = ipv6_addr_type(&udest->addr.in6);
+		/* non-unicast or link-local is only accepted if local */
+		bad_scope = !(addr_type & IPV6_ADDR_UNICAST) ||
+			    (addr_type & IPV6_ADDR_LINKLOCAL);
+		if (bad_scope &&
+		    !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
+			return -EINVAL;
+	} else
+#endif
+	{
+		addr_type = inet_addr_type(svc->net, udest->addr.ip);
+		if (!(addr_type == RTN_LOCAL || addr_type == RTN_UNICAST))
+			return -EINVAL;
+	}
+
+	new_dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
+	if (!new_dest) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+
+	new_dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+	if (!new_dest->stats.cpustats) {
+		pr_err("%s() alloc_percpu failed\n", __func__);
+		kfree(new_dest);
+		return -ENOMEM;
+	}
+
+	/* identity of the virtual service this dest was created for */
+	new_dest->af = svc->af;
+	new_dest->protocol = svc->protocol;
+	new_dest->vaddr = svc->addr;
+	new_dest->vport = svc->port;
+	new_dest->vfwmark = svc->fwmark;
+
+	/* address of the real server itself */
+	ip_vs_addr_copy(svc->af, &new_dest->addr, &udest->addr);
+	new_dest->port = udest->port;
+
+	atomic_set(&new_dest->activeconns, 0);
+	atomic_set(&new_dest->inactconns, 0);
+	atomic_set(&new_dest->persistconns, 0);
+	atomic_set(&new_dest->refcnt, 1);
+
+	INIT_LIST_HEAD(&new_dest->d_list);
+	spin_lock_init(&new_dest->dst_lock);
+	spin_lock_init(&new_dest->stats.lock);
+
+	__ip_vs_update_dest(svc, new_dest, udest, 1);
+
+	*dest_p = new_dest;
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+/*
+ *	Add a destination into an existing service
+ *
+ *	Returns -ERANGE for a negative weight or l_threshold > u_threshold,
+ *	-EEXIST when the service already has this {addr,port}, otherwise 0
+ *	or the error from ip_vs_new_dest().  A matching dest found in the
+ *	trash is revived instead of allocating a new one.
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	union nf_inet_addr daddr;
+	__be16 dport = udest->port;
+	struct ip_vs_dest *dest;
+	int ret;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		pr_err("%s(): server weight less than zero\n", __func__);
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		pr_err("%s(): lower threshold is higher than upper threshold\n",
+			__func__);
+		return -ERANGE;
+	}
+
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+	/* refuse duplicates within the service */
+	if (ip_vs_lookup_dest(svc, &daddr, dport) != NULL) {
+		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
+		return -EEXIST;
+	}
+
+	/* maybe the dest is parked in the trash for this same service */
+	dest = ip_vs_trash_get_dest(svc, &daddr, dport);
+	if (dest == NULL) {
+		/* nothing to recycle: allocate and initialize a new dest */
+		ret = ip_vs_new_dest(svc, udest, &dest);
+	} else {
+		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+			      "dest->refcnt=%d, service %u/%s:%u\n",
+			      IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
+			      atomic_read(&dest->refcnt),
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+			      ntohs(dest->vport));
+
+		/* take it out of the trash and put it back in service */
+		list_del(&dest->n_list);
+		__ip_vs_update_dest(svc, dest, udest, 1);
+		ret = 0;
+	}
+
+	LeaveFunction(2);
+	return ret;
+}
+
+
+/*
+ *	Edit a destination in the given service
+ *
+ *	Returns -ERANGE for a negative weight or l_threshold > u_threshold,
+ *	-ENOENT when the service has no such {addr,port}, 0 otherwise.
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	union nf_inet_addr daddr;
+	__be16 dport = udest->port;
+	struct ip_vs_dest *target;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		pr_err("%s(): server weight less than zero\n", __func__);
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		pr_err("%s(): lower threshold is higher than upper threshold\n",
+			__func__);
+		return -ERANGE;
+	}
+
+	ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
+
+	/* the dest must already be part of this service */
+	target = ip_vs_lookup_dest(svc, &daddr, dport);
+	if (!target) {
+		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
+		return -ENOENT;
+	}
+
+	__ip_vs_update_dest(svc, target, udest, 0);
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+/*
+ *	Delete a destination (must be already unlinked from the service)
+ *
+ *	Stops the dest's estimator and unhashes it from rs_table, then drops
+ *	one reference.  If that was the last one the dest is freed (and the
+ *	service refcnt is decremented, without freeing the service);
+ *	otherwise the dest is moved to the per-netns trash list, which takes
+ *	a reference of its own.
+ */
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_stop_estimator(net, &dest->stats);
+
+	/*
+	 *  Remove it from the d-linked list with the real services.
+	 */
+	write_lock_bh(&ipvs->rs_lock);
+	ip_vs_rs_unhash(dest);
+	write_unlock_bh(&ipvs->rs_lock);
+
+	/*
+	 *  Decrease the refcnt of the dest, and free the dest
+	 *  if nobody refers to it (refcnt=0). Otherwise, throw
+	 *  the destination into the trash.
+	 */
+	if (atomic_dec_and_test(&dest->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port));
+		ip_vs_dst_reset(dest);
+		/* simply decrease svc->refcnt here, let the caller check
+		   and release the service if nobody refers to it.
+		   Only user context can release destination and service,
+		   and only one user context can update virtual service at a
+		   time, so the operation here is OK */
+		atomic_dec(&dest->svc->refcnt);
+		free_percpu(dest->stats.cpustats);
+		kfree(dest);
+	} else {
+		IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
+			      "dest->refcnt=%d\n",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		list_add(&dest->n_list, &ipvs->dest_trash);
+		/* restores the count dropped above: the trash's reference */
+		atomic_inc(&dest->refcnt);
+	}
+}
+
+
+/*
+ *	Unlink a destination from the given service
+ *
+ *	Clears IP_VS_DEST_F_AVAILABLE, removes the dest from the service's
+ *	destination list and, when @svcupd is non-zero and the scheduler
+ *	provides update_service, notifies the scheduler.
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+				struct ip_vs_dest *dest,
+				int svcupd)
+{
+	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+	/* take it off svc->destinations */
+	list_del(&dest->n_list);
+	svc->num_dests--;
+
+	if (!svcupd)
+		return;
+
+	/* let the scheduler know the destination set changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+}
+
+
+/*
+ *	Delete a destination server in the given service
+ *
+ *	Returns -ENOENT when the service has no such {addr,port}, else 0.
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+	struct ip_vs_dest *victim;
+
+	EnterFunction(2);
+
+	victim = ip_vs_lookup_dest(svc, &udest->addr, udest->port);
+	if (!victim) {
+		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
+		return -ENOENT;
+	}
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away. */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	/* Unlink dest from the service (and notify the scheduler) */
+	__ip_vs_unlink_dest(svc, victim, 1);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	/* Free the destination or move it to the trash */
+	__ip_vs_del_dest(svc->net, victim);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Add a service into the service hash table
+ *
+ *	@net:	network namespace the service belongs to
+ *	@u:	requested service parameters
+ *	@svc_p:	receives the new service on success (untouched on failure)
+ *
+ *	Returns 0 on success, or a negative errno: -ENOENT when the scheduler
+ *	or persistence engine cannot be found, -EINVAL for an out-of-range
+ *	IPv6 netmask, -ENOMEM when an allocation fails, or the error from
+ *	ip_vs_bind_scheduler().  On failure everything acquired here
+ *	(module use count, scheduler, pe, memory) is released again.
+ */
+static int
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
+		  struct ip_vs_service **svc_p)
+{
+	int ret = 0;
+	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_pe *pe = NULL;
+	struct ip_vs_service *svc = NULL;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	/* Lookup the scheduler by 'u->sched_name' */
+	sched = ip_vs_scheduler_get(u->sched_name);
+	if (sched == NULL) {
+		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+		ret = -ENOENT;
+		goto out_err;
+	}
+
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_getbyname(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out_err;
+		}
+	}
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
+		ret = -EINVAL;
+		goto out_err;
+	}
+#endif
+
+	svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
+	if (svc == NULL) {
+		IP_VS_DBG(1, "%s(): no memory\n", __func__);
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+	if (!svc->stats.cpustats) {
+		pr_err("%s() alloc_percpu failed\n", __func__);
+		/*
+		 * ret is still 0 here; without an error code the caller
+		 * would see success although svc is freed below and
+		 * *svc_p is never set.
+		 */
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* nobody uses or references the new service yet */
+	atomic_set(&svc->usecnt, 0);
+	atomic_set(&svc->refcnt, 0);
+
+	svc->af = u->af;
+	svc->protocol = u->protocol;
+	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
+	svc->port = u->port;
+	svc->fwmark = u->fwmark;
+	svc->flags = u->flags;
+	svc->timeout = u->timeout * HZ;
+	svc->netmask = u->netmask;
+	svc->net = net;
+
+	INIT_LIST_HEAD(&svc->destinations);
+	rwlock_init(&svc->sched_lock);
+	spin_lock_init(&svc->stats.lock);
+
+	/* Bind the scheduler */
+	ret = ip_vs_bind_scheduler(svc, sched);
+	if (ret)
+		goto out_err;
+	/* the reference is owned by svc now; don't put it in out_err */
+	sched = NULL;
+
+	/* Bind the ct retriever */
+	ip_vs_bind_pe(svc, pe);
+	pe = NULL;
+
+	/* Update the virtual service counters */
+	if (svc->port == FTPPORT)
+		atomic_inc(&ipvs->ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_inc(&ipvs->nullsvc_counter);
+
+	ip_vs_start_estimator(net, &svc->stats);
+
+	/* Count only IPv4 services for old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ipvs->num_services++;
+
+	/* Hash the service into the service table */
+	write_lock_bh(&__ip_vs_svc_lock);
+	ip_vs_svc_hash(svc);
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	*svc_p = svc;
+	/* Now there is a service - full throttle */
+	ipvs->enable = 1;
+	return 0;
+
+
+ out_err:
+	if (svc != NULL) {
+		ip_vs_unbind_scheduler(svc);
+		if (svc->inc) {
+			local_bh_disable();
+			ip_vs_app_inc_put(svc->inc);
+			local_bh_enable();
+		}
+		if (svc->stats.cpustats)
+			free_percpu(svc->stats.cpustats);
+		kfree(svc);
+	}
+	ip_vs_scheduler_put(sched);
+	ip_vs_pe_put(pe);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+/*
+ *	Edit a service and bind it with a new scheduler
+ *
+ *	Returns 0 on success, -ENOENT when the scheduler or persistence
+ *	engine named in @u cannot be found, -EINVAL for an out-of-range IPv6
+ *	netmask, or the error from unbinding/binding the scheduler.
+ *
+ *	Reference handling: old_sched/old_pe always name the objects whose
+ *	reference must be dropped at "out" - the freshly looked-up ones on
+ *	failure, the previously bound ones after a successful switch.
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
+{
+	struct ip_vs_scheduler *sched, *old_sched;
+	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
+	int ret = 0;
+
+	/*
+	 * Lookup the scheduler, by 'u->sched_name'
+	 */
+	sched = ip_vs_scheduler_get(u->sched_name);
+	if (sched == NULL) {
+		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+		return -ENOENT;
+	}
+	/* until the switch succeeds, the new scheduler is the one to put */
+	old_sched = sched;
+
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_getbyname(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out;
+		}
+		old_pe = pe;
+	}
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
+		ret = -EINVAL;
+		goto out;
+	}
+#endif
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 * Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	/*
+	 * Set the flags and timeout value
+	 * (IP_VS_SVC_F_HASHED is forced on when the flags are replaced)
+	 */
+	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+	svc->timeout = u->timeout * HZ;
+	svc->netmask = u->netmask;
+
+	old_sched = svc->scheduler;
+	if (sched != old_sched) {
+		/*
+		 * Unbind the old scheduler
+		 */
+		if ((ret = ip_vs_unbind_scheduler(svc))) {
+			/* switch failed: drop the new scheduler instead */
+			old_sched = sched;
+			goto out_unlock;
+		}
+
+		/*
+		 * Bind the new scheduler
+		 */
+		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
+			/*
+			 * If ip_vs_bind_scheduler fails, restore the old
+			 * scheduler.
+			 * The main reason of failure is out of memory.
+			 *
+			 * The question is if the old scheduler can be
+			 * restored all the time. TODO: if it cannot be
+			 * restored some time, we must delete the service,
+			 * otherwise the system may crash.
+			 */
+			ip_vs_bind_scheduler(svc, old_sched);
+			old_sched = sched;
+			goto out_unlock;
+		}
+	}
+
+	/* swap the persistence engine; the previous one is put at "out" */
+	old_pe = svc->pe;
+	if (pe != old_pe) {
+		ip_vs_unbind_pe(svc);
+		ip_vs_bind_pe(svc, pe);
+	}
+
+  out_unlock:
+	write_unlock_bh(&__ip_vs_svc_lock);
+  out:
+	ip_vs_scheduler_put(old_sched);
+	ip_vs_pe_put(old_pe);
+	return ret;
+}
+
+
+/*
+ *	Delete a service from the service list
+ *	- The service must be unlinked, unlocked and not referenced!
+ *	- We are called under _bh lock
+ *
+ *	Tears down, in order: the IPv4 service count, the estimator, the
+ *	scheduler, the persistence engine, the app incarnation, every
+ *	destination, and the FTP/port-zero counters.  The service memory is
+ *	freed only if its refcnt is 0 afterwards; the module use count is
+ *	dropped in any case.
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest, *nxt;
+	struct ip_vs_scheduler *old_sched;
+	struct ip_vs_pe *old_pe;
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+	/* NOTE(review): logged unconditionally at info level */
+	pr_info("%s: enter\n", __func__);
+
+	/* Count only IPv4 services for old get/setsockopt interface */
+	if (svc->af == AF_INET)
+		ipvs->num_services--;
+
+	ip_vs_stop_estimator(svc->net, &svc->stats);
+
+	/* Unbind scheduler */
+	old_sched = svc->scheduler;
+	ip_vs_unbind_scheduler(svc);
+	ip_vs_scheduler_put(old_sched);
+
+	/* Unbind persistence engine */
+	old_pe = svc->pe;
+	ip_vs_unbind_pe(svc);
+	ip_vs_pe_put(old_pe);
+
+	/* Unbind app inc */
+	if (svc->inc) {
+		ip_vs_app_inc_put(svc->inc);
+		svc->inc = NULL;
+	}
+
+	/*
+	 *    Unlink the whole destination list
+	 *    (svcupd=0: the scheduler is already unbound, so no notification)
+	 */
+	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+		__ip_vs_unlink_dest(svc, dest, 0);
+		__ip_vs_del_dest(svc->net, dest);
+	}
+
+	/*
+	 *    Update the virtual service counters
+	 */
+	if (svc->port == FTPPORT)
+		atomic_dec(&ipvs->ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_dec(&ipvs->nullsvc_counter);
+
+	/*
+	 *    Free the service if nobody refers to it
+	 *    (destinations parked in the trash still hold references)
+	 */
+	if (atomic_read(&svc->refcnt) == 0) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
+		free_percpu(svc->stats.cpustats);
+		kfree(svc);
+	}
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+/*
+ * Unhash a service, wait for its users to drain and tear it down.
+ * The whole sequence runs under the write side of __ip_vs_svc_lock
+ * with bottom halves disabled.
+ */
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
+{
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* no new lookup can find the service once it is off the table */
+	ip_vs_svc_unhash(svc);
+
+	/* Wait until all the svc users go away. */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	__ip_vs_del_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *	Delete a service from the service list.
+ *	Returns -EEXIST when @svc is NULL, 0 otherwise.
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (!svc)
+		return -EEXIST;
+
+	ip_vs_unlink_service(svc);
+	return 0;
+}
+
+
+/*
+ *	Flush all the virtual services that belong to @net.
+ *	Both service tables are global, so entries of other namespaces are
+ *	skipped.  Always returns 0.
+ */
+static int ip_vs_flush(struct net *net)
+{
+	struct ip_vs_service *cur, *next;
+	int bucket;
+
+	/* first the table hashed by <netns,protocol,addr,port> ... */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry_safe(cur, next, &ip_vs_svc_table[bucket],
+					 s_list) {
+			if (!net_eq(cur->net, net))
+				continue;
+			ip_vs_unlink_service(cur);
+		}
+	}
+
+	/* ... then the table hashed by fwmark */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry_safe(cur, next,
+					 &ip_vs_svc_fwm_table[bucket],
+					 f_list) {
+			if (!net_eq(cur->net, net))
+				continue;
+			ip_vs_unlink_service(cur);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *	Delete all services of a netns from the service tables.
+ *	Called by __ip_vs_cleanup()
+ */
+void __ip_vs_service_cleanup(struct net *net)
+{
+	EnterFunction(2);
+
+	/* the flush runs with __ip_vs_mutex held */
+	mutex_lock(&__ip_vs_mutex);
+	ip_vs_flush(net);
+	mutex_unlock(&__ip_vs_mutex);
+
+	LeaveFunction(2);
+}
+/*
+ * Release the dst held by dst_cache if it goes through @dev.
+ * The check and the reset are done under dest->dst_lock.
+ */
+static inline void
+__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
+{
+	spin_lock_bh(&dest->dst_lock);
+
+	if (dest->dst_cache != NULL && dest->dst_cache->dev == dev) {
+		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
+			      dev->name,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		ip_vs_dst_reset(dest);
+	}
+
+	spin_unlock_bh(&dest->dst_lock);
+}
+/*
+ * Netdev event receiver
+ * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference
+ * to a device that is being unregistered, it must be released.
+ *
+ * Every destination of every service in the device's netns, plus every
+ * destination in that netns' trash, gets its cached dst reset if it uses
+ * the device.  Always returns NOTIFY_DONE.
+ */
+static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	unsigned int idx;
+
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+	EnterFunction(2);
+
+	mutex_lock(&__ip_vs_mutex);
+
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		/* services hashed by <protocol,addr,port> */
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			list_for_each_entry(dest, &svc->destinations, n_list)
+				__ip_vs_dev_reset(dest, dev);
+		}
+
+		/* services hashed by fwmark */
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			list_for_each_entry(dest, &svc->destinations, n_list)
+				__ip_vs_dev_reset(dest, dev);
+		}
+	}
+
+	/* destinations parked in the trash may hold a dst as well */
+	list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list)
+		__ip_vs_dev_reset(dest, dev);
+
+	mutex_unlock(&__ip_vs_mutex);
+
+	LeaveFunction(2);
+	return NOTIFY_DONE;
+}
+
+/*
+ *	Zero the counters of one service and of all its destinations.
+ *	Runs under the write side of __ip_vs_svc_lock; always returns 0.
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *rs;
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	list_for_each_entry(rs, &svc->destinations, n_list)
+		ip_vs_zero_stats(&rs->stats);
+
+	ip_vs_zero_stats(&svc->stats);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+	return 0;
+}
+
+/*
+ *	Zero the counters of every service that belongs to @net, then the
+ *	namespace-wide totals.  Always returns 0.
+ */
+static int ip_vs_zero_all(struct net *net)
+{
+	struct ip_vs_service *svc;
+	int bucket;
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket],
+				    f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ *	sysctl handler for the defense-mode integers.
+ *	After proc_dointvec() has stored a changed value, anything outside
+ *	0..3 is rolled back to the previous value; an accepted change
+ *	triggers update_defense_level() for the caller's netns.
+ *	Returns the result of proc_dointvec().
+ */
+static int
+proc_do_defense_mode(ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = current->nsproxy->net_ns;
+	int *valp = table->data;
+	int old = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	/* nothing to do for reads or when the value did not change */
+	if (!write || *valp == old)
+		return rc;
+
+	if (*valp < 0 || *valp > 3)
+		*valp = old;	/* Restore the correct value */
+	else
+		update_defense_level(net_ipvs(net));
+
+	return rc;
+}
+
+/*
+ *	sysctl handler for the two-integer sync threshold.
+ *	A write is undone unless both values are non-negative and the
+ *	first is strictly smaller than the second.
+ *	Returns the result of proc_dointvec().
+ */
+static int
+proc_do_sync_threshold(ctl_table *table, int write,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int saved[2];
+	int rc;
+
+	/* snapshot both values so an invalid write can be rolled back */
+	memcpy(saved, valp, sizeof(saved));
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!write)
+		return rc;
+
+	if (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])
+		memcpy(valp, saved, sizeof(saved));
+
+	return rc;
+}
+
+/*
+ *	sysctl handler for the "sync_version" entry.  A write that changes
+ *	the value to something other than 0 or 1 is undone.  A valid
+ *	change calls ip_vs_sync_switch_mode() for the writing task's
+ *	network namespace.
+ *	Returns whatever proc_dointvec() returned.
+ */
+static int
+proc_do_sync_mode(ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	const int prev = *valp;
+	const int rc = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (!write || *valp == prev)
+		return rc;
+
+	if (*valp < 0 || *valp > 1) {
+		/* Out of range: put the previous value back */
+		*valp = prev;
+		return rc;
+	}
+
+	/* The value held *before* this write is what gets handed over */
+	ip_vs_sync_switch_mode(current->nsproxy->net_ns, prev);
+
+	return rc;
+}
+
+/*
+ *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ *	Do not reorder or insert entries without making the matching
+ *	change to the netns init in __ip_vs_control_init().
+ *
+ *	NOTE(review): most entries carry no .data pointer here; presumably
+ *	it is filled in per namespace by that init code - confirm there.
+ */
+
+static struct ctl_table vs_vars[] = {
+	{
+		.procname	= "amemthresh",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "am_droprate",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "drop_entry",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_defense_mode,
+	},
+	{
+		.procname	= "drop_packet",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_defense_mode,
+	},
+#ifdef CONFIG_IP_VS_NFCT
+	{
+		.procname	= "conntrack",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.procname	= "secure_tcp",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_defense_mode,
+	},
+	{
+		.procname	= "snat_reroute",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "sync_version",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_sync_mode,
+	},
+	{
+		.procname	= "cache_bypass",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expire_nodest_conn",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "expire_quiescent_template",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sync_threshold",
+		.maxlen		=
+			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+		.mode		= 0644,
+		.proc_handler	= proc_do_sync_threshold,
+	},
+	{
+		.procname	= "nat_icmp_send",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#ifdef CONFIG_IP_VS_DEBUG
+	{
+		.procname	= "debug_level",
+		.data		= &sysctl_ip_vs_debug_level,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
+#if 0	/* compiled out: per-state timeout entries */
+	{
+		.procname	= "timeout_established",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_synsent",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_synrecv",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_finwait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_timewait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_close",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_closewait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_lastack",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_listen",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_synack",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_udp",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "timeout_icmp",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+#endif
+	{ }	/* empty entry terminates the table */
+};
+
+/*
+ *	sysctl path components "net", "ipv4", "vs", ended by an empty
+ *	entry.  Exported (GPL only) so other modules can reference it.
+ */
+const struct ctl_path net_vs_ctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "ipv4", },
+	{ .procname = "vs", },
+	{ }
+};
+EXPORT_SYMBOL_GPL(net_vs_ctl_path);
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+/* Private iterator state of the service-listing seq file */
+struct ip_vs_iter {
+	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
+	struct list_head *table;	/* table the current service is in */
+	int bucket;			/* bucket index within that table */
+};
+
+/*
+ *	Map the forwarding-method bits of a connection flag word to the
+ *	short name printed in the PROCfs rule table (which is kept just
+ *	for backward compatibility).  Any method other than local node,
+ *	tunnel or direct route is reported as "Masq".
+ */
+static inline const char *ip_vs_fwd_name(unsigned flags)
+{
+	const unsigned method = flags & IP_VS_CONN_F_FWD_MASK;
+
+	if (method == IP_VS_CONN_F_LOCALNODE)
+		return "Local";
+	if (method == IP_VS_CONN_F_TUNNEL)
+		return "Tunnel";
+	if (method == IP_VS_CONN_F_DROUTE)
+		return "Route";
+
+	return "Masq";
+}
+
+
+/*
+ *	Return the pos'th (0-based) service of the seq file's network
+ *	namespace, counting through the protocol-hashed table first and
+ *	the fwmark table after it.  The table and bucket the service was
+ *	found in are stored in the iterator.  Returns NULL when there is
+ *	no such service.
+ */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+	struct ip_vs_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct ip_vs_service *svc;
+	int bucket;
+
+	/* look in hash by protocol */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			/* services of other namespaces are not counted */
+			if (!net_eq(svc->net, net))
+				continue;
+			if (pos-- == 0) {
+				it->table = ip_vs_svc_table;
+				it->bucket = bucket;
+				return svc;
+			}
+		}
+	}
+
+	/* keep looking in fwmark */
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
+			if (!net_eq(svc->net, net))
+				continue;
+			if (pos-- == 0) {
+				it->table = ip_vs_svc_fwm_table;
+				it->bucket = bucket;
+				return svc;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ *	seq_file start callback: take the service read lock (dropped again
+ *	in the stop callback) and position the iterator.  Position 0 is
+ *	the header token, position N > 0 is service number N - 1.
+ */
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+__acquires(__ip_vs_svc_lock)
+{
+	read_lock_bh(&__ip_vs_svc_lock);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return ip_vs_info_array(seq, *pos - 1);
+}
+
+
+/*
+ *	seq_file next callback: advance *pos and return the service that
+ *	follows @v, or NULL at the end.  After the header token the first
+ *	service is looked up with ip_vs_info_array().  Otherwise the walk
+ *	continues in the current bucket, then in later buckets of the same
+ *	table, and moves from the protocol table to the fwmark table once
+ *	the former is exhausted.
+ *
+ *	NOTE(review): unlike ip_vs_info_array(), this path does not compare
+ *	svc->net with the seq file's namespace - confirm that is intended.
+ */
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct list_head *e;
+	struct ip_vs_iter *iter;
+	struct ip_vs_service *svc;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_info_array(seq,0);
+
+	svc = v;
+	iter = seq->private;
+
+	if (iter->table == ip_vs_svc_table) {
+		/* next service in table hashed by protocol */
+		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
+			return list_entry(e, struct ip_vs_service, s_list);
+
+
+		/* bucket exhausted: return the first entry of a later bucket */
+		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
+					    s_list) {
+				return svc;
+			}
+		}
+
+		/* protocol table done; -1 so the pre-increment starts at 0 */
+		iter->table = ip_vs_svc_fwm_table;
+		iter->bucket = -1;
+		goto scan_fwmark;
+	}
+
+	/* next service in hashed by fwmark */
+	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
+		return list_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
+				    f_list)
+			return svc;
+	}
+
+	return NULL;
+}
+
+/* seq_file stop callback: drop the read lock taken in the start callback */
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+__releases(__ip_vs_svc_lock)
+{
+	read_unlock_bh(&__ip_vs_svc_lock);
+}
+
+
+/*
+ *	seq_file show callback.  For the start token it prints the version
+ *	banner and the two column-header lines.  For a service it prints
+ *	one line describing the service (protocol/address/port or fwmark,
+ *	scheduler name, optional persistence data) followed by one
+ *	"  -> ..." line for each of its destinations.  Always returns 0.
+ */
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			"IP Virtual Server version %d.%d.%d (size=%d)\n",
+			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+		seq_puts(seq,
+			 "Prot LocalAddress:Port Scheduler Flags\n");
+		seq_puts(seq,
+			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+	} else {
+		const struct ip_vs_service *svc = v;
+		const struct ip_vs_iter *iter = seq->private;
+		const struct ip_vs_dest *dest;
+
+		/* the iterator records which table the service came from */
+		if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+			/* note: the IPv6 line has no one-packet ("ops") field */
+			if (svc->af == AF_INET6)
+				seq_printf(seq, "%s  [%pI6]:%04X %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   &svc->addr.in6,
+					   ntohs(svc->port),
+					   svc->scheduler->name);
+			else
+#endif
+				seq_printf(seq, "%s  %08X:%04X %s %s ",
+					   ip_vs_proto_name(svc->protocol),
+					   ntohl(svc->addr.ip),
+					   ntohs(svc->port),
+					   svc->scheduler->name,
+					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+		} else {
+			seq_printf(seq, "FWM  %08X %s %s",
+				   svc->fwmark, svc->scheduler->name,
+				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+		}
+
+		/* finish the service line, with persistence data if enabled */
+		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+			seq_printf(seq, "persistent %d %08X\n",
+				svc->timeout,
+				ntohl(svc->netmask));
+		else
+			seq_putc(seq, '\n');
+
+		list_for_each_entry(dest, &svc->destinations, n_list) {
+#ifdef CONFIG_IP_VS_IPV6
+			if (dest->af == AF_INET6)
+				seq_printf(seq,
+					   "  -> [%pI6]:%04X"
+					   "      %-7s %-6d %-10d %-10d\n",
+					   &dest->addr.in6,
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+			else
+#endif
+				seq_printf(seq,
+					   "  -> %08X:%04X      "
+					   "%-7s %-6d %-10d %-10d\n",
+					   ntohl(dest->addr.ip),
+					   ntohs(dest->port),
+					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+					   atomic_read(&dest->weight),
+					   atomic_read(&dest->activeconns),
+					   atomic_read(&dest->inactconns));
+
+		}
+	}
+	return 0;
+}
+
+/* seq_file callbacks of the service listing (see ip_vs_info_open()) */
+static const struct seq_operations ip_vs_info_seq_ops = {
+	.start = ip_vs_info_seq_start,
+	.next  = ip_vs_info_seq_next,
+	.stop  = ip_vs_info_seq_stop,
+	.show  = ip_vs_info_seq_show,
+};
+
+/*
+ *	open() handler of the service listing: sets up a netns-aware
+ *	seq file with a struct ip_vs_iter as its private data.
+ */
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ip_vs_info_seq_ops,
+			sizeof(struct ip_vs_iter));
+}
+
+/* File operations of the service listing; release matches seq_open_net() */
+static const struct file_operations ip_vs_info_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_info_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ *	Print the namespace-wide totals as two small hexadecimal tables:
+ *	absolute counters first, estimated rates second.  The values are
+ *	taken from one ip_vs_copy_stats() snapshot.  Always returns 0.
+ */
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+	struct netns_ipvs *ipvs = net_ipvs(seq_file_single_net(seq));
+	struct ip_vs_stats_user snap;
+
+	/* column widths: 8 8 8 16 16 */
+	seq_puts(seq,
+		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
+	seq_puts(seq,
+		 "   Conns  Packets  Packets            Bytes            Bytes\n");
+
+	ip_vs_copy_stats(&snap, &ipvs->tot_stats);
+
+	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n",
+		   snap.conns, snap.inpkts, snap.outpkts,
+		   (unsigned long long) snap.inbytes,
+		   (unsigned long long) snap.outbytes);
+
+	/* same column widths for the rate table */
+	seq_puts(seq,
+		 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
+	seq_printf(seq, "%8X %8X %8X %16X %16X\n",
+		   snap.cps, snap.inpps, snap.outpps,
+		   snap.inbps, snap.outbps);
+
+	return 0;
+}
+
+/* open() handler: single-shot, netns-aware seq file over ip_vs_stats_show() */
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, ip_vs_stats_show);
+}
+
+/* File operations of the totals file; release matches single_open_net() */
+static const struct file_operations ip_vs_stats_fops = {
+	.owner = THIS_MODULE,
+	.open = ip_vs_stats_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release_net,
+};
+
+/*
+ *	Print one line of counters per possible CPU, then a "~" line with
+ *	the summed totals, then the estimated rates.  The 64-bit per-CPU
+ *	byte counters are read inside a u64_stats fetch/retry loop; the
+ *	totals and the estimator are read under tot_stats->lock.
+ *	Always returns 0.
+ */
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq_file_single_net(seq);
+	struct ip_vs_stats *tot = &net_ipvs(net)->tot_stats;
+	struct ip_vs_cpu_stats *percpu = tot->cpustats;
+	struct ip_vs_stats_user est;
+	int cpu;
+
+	/* column widths after the CPU column: 8 8 8 16 16 */
+	seq_puts(seq,
+		 "       Total Incoming Outgoing         Incoming         Outgoing\n");
+	seq_puts(seq,
+		 "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
+
+	for_each_possible_cpu(cpu) {
+		struct ip_vs_cpu_stats *cs = per_cpu_ptr(percpu, cpu);
+		__u64 bytes_in, bytes_out;
+		unsigned int seqno;
+
+		/* retry until both byte counters were read consistently */
+		do {
+			seqno = u64_stats_fetch_begin_bh(&cs->syncp);
+			bytes_in = cs->ustats.inbytes;
+			bytes_out = cs->ustats.outbytes;
+		} while (u64_stats_fetch_retry_bh(&cs->syncp, seqno));
+
+		seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
+			   cpu, cs->ustats.conns, cs->ustats.inpkts,
+			   cs->ustats.outpkts, bytes_in, bytes_out);
+	}
+
+	spin_lock_bh(&tot->lock);
+
+	seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
+		   tot->ustats.conns, tot->ustats.inpkts,
+		   tot->ustats.outpkts,
+		   (unsigned long long) tot->ustats.inbytes,
+		   (unsigned long long) tot->ustats.outbytes);
+
+	ip_vs_read_estimator(&est, tot);
+
+	spin_unlock_bh(&tot->lock);
+
+	/* rate table, printed after the lock has been dropped */
+	seq_puts(seq,
+		 "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
+	seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
+		   est.cps, est.inpps, est.outpps,
+		   est.inbps, est.outbps);
+
+	return 0;
+}
+
+/* open() handler: single-shot, netns-aware seq file over the per-CPU view */
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+/* File operations of the per-CPU statistics file */
+static const struct file_operations ip_vs_stats_percpu_fops = {
+	.owner = THIS_MODULE,
+	.open = ip_vs_stats_percpu_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release_net,
+};
+#endif
+
+/*
+ *	Set timeout values for tcp tcpfin udp in the timeout_table.
+ *
+ *	@net: network namespace whose protocol data is updated
+ *	@u:   timeouts in seconds; a value of 0 leaves that timeout alone
+ *
+ *	Every value is converted to jiffies by multiplying with HZ, so
+ *	negative values and values above 0x7FFFFFFF / HZ (which would
+ *	overflow the signed multiplication) are rejected before anything
+ *	is stored.  Only protocols that are compiled in are checked and
+ *	updated.
+ *
+ *	Returns 0 on success, -EINVAL for an out-of-range timeout.
+ */
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+	struct ip_vs_proto_data *pd;
+#endif
+
+	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+		  u->tcp_timeout,
+		  u->tcp_fin_timeout,
+		  u->udp_timeout);
+
+	/* Validate everything first so a bad request changes nothing */
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout < 0 || u->tcp_timeout > (0x7FFFFFFF / HZ) ||
+	    u->tcp_fin_timeout < 0 ||
+	    u->tcp_fin_timeout > (0x7FFFFFFF / HZ))
+		return -EINVAL;
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout < 0 || u->udp_timeout > (0x7FFFFFFF / HZ))
+		return -EINVAL;
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout) {
+		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
+			= u->tcp_timeout * HZ;
+	}
+
+	if (u->tcp_fin_timeout) {
+		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
+			= u->tcp_fin_timeout * HZ;
+	}
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout) {
+		pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+		pd->timeout_table[IP_VS_UDP_S_NORMAL]
+			= u->udp_timeout * HZ;
+	}
+#endif
+	return 0;
+}
+
+
+/* Index of a set command within set_arglen[] */
+#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
+/* Argument sizes of the individual set commands */
+#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
+				 sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
+/* Size of the on-stack argument buffer in do_ip_vs_set_ctl() */
+#define MAX_ARG_LEN		SVCDEST_ARG_LEN
+
+/*
+ *	Exact argument length that do_ip_vs_set_ctl() requires for each
+ *	set command; slots not listed here are zero.
+ *	NOTE(review): the element type is unsigned char, so every length
+ *	is assumed to fit in 255 - confirm if the user structs grow.
+ */
+static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
+	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
+};
+
+/*
+ *	Convert the old sockopt service description into the extended
+ *	in-kernel form.  The old interface only knows IPv4, so the
+ *	address family is fixed to AF_INET; fields that exist only in the
+ *	extended struct are left zeroed.
+ */
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+				  struct ip_vs_service_user *usvc_compat)
+{
+	memset(usvc, 0, sizeof(*usvc));
+
+	/* how the service is identified */
+	usvc->af = AF_INET;
+	usvc->fwmark = usvc_compat->fwmark;
+	usvc->protocol = usvc_compat->protocol;
+	usvc->addr.ip = usvc_compat->addr;
+	usvc->port = usvc_compat->port;
+
+	/* how the service behaves */
+	usvc->flags = usvc_compat->flags;
+	usvc->timeout = usvc_compat->timeout;
+	usvc->netmask = usvc_compat->netmask;
+
+	/* Borrow the name storage of the compat struct, no deep copy */
+	usvc->sched_name = usvc_compat->sched_name;
+}
+
+/*
+ *	Convert the old sockopt destination description into the extended
+ *	in-kernel form; fields that exist only in the extended struct are
+ *	left zeroed.
+ */
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+				   struct ip_vs_dest_user *udest_compat)
+{
+	memset(udest, 0, sizeof(*udest));
+
+	/* where the real server is */
+	udest->addr.ip = udest_compat->addr;
+	udest->port = udest_compat->port;
+
+	/* how traffic is sent to it */
+	udest->conn_flags = udest_compat->conn_flags;
+	udest->weight = udest_compat->weight;
+	udest->l_threshold = udest_compat->l_threshold;
+	udest->u_threshold = udest_compat->u_threshold;
+}
+
+/*
+ *	setsockopt() handler of the old IPVS control interface.
+ *
+ *	@sk:   socket the option is set on; selects the network namespace
+ *	@cmd:  one of the IP_VS_SO_SET_* commands
+ *	@user: user-space argument buffer
+ *	@len:  length of @user; must equal set_arglen[] for @cmd
+ *
+ *	The caller needs CAP_NET_ADMIN.  The argument is copied to a
+ *	zeroed stack buffer, the module use count is raised and
+ *	__ip_vs_mutex is held while the command runs.
+ *
+ *	Strings supplied by user space (the sync daemon interface name,
+ *	the scheduler name for add/edit) must be NUL-terminated inside
+ *	their fixed-size fields, otherwise -EINVAL is returned.
+ *
+ *	Returns 0 on success or a negative errno.
+ */
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	struct net *net = sock_net(sk);
+	int ret;
+	unsigned char arg[MAX_ARG_LEN];
+	struct ip_vs_service_user *usvc_compat;
+	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
+	struct ip_vs_dest_user *udest_compat;
+	struct ip_vs_dest_user_kern udest;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+		return -EINVAL;
+	/* len is unsigned, so only the upper bound can be violated */
+	if (len > MAX_ARG_LEN)
+		return -EINVAL;
+	if (len != set_arglen[SET_CMDID(cmd)]) {
+		pr_err("set_ctl: len %u != %u\n",
+		       len, set_arglen[SET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	/*
+	 * Commands shorter than MAX_ARG_LEN leave the tail of arg
+	 * unwritten, yet it is parsed as a destination below.  Start from
+	 * zeroes so that no stale stack bytes are ever interpreted.
+	 */
+	memset(arg, 0, sizeof(arg));
+	if (copy_from_user(arg, user, len) != 0)
+		return -EFAULT;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
+		ret = -ERESTARTSYS;
+		goto out_dec;
+	}
+
+	if (cmd == IP_VS_SO_SET_FLUSH) {
+		/* Flush the virtual service */
+		ret = ip_vs_flush(net);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+		/* Set timeout values for (tcp tcpfin udp) */
+		ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+
+		/* The interface name is used as a C string later on */
+		if (strnlen(dm->mcast_ifn, sizeof(dm->mcast_ifn)) ==
+		    sizeof(dm->mcast_ifn)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+					dm->syncid);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+		ret = stop_sync_thread(net, dm->state);
+		goto out_unlock;
+	}
+
+	usvc_compat = (struct ip_vs_service_user *)arg;
+	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+	/* We only use the new structs internally, so copy userspace compat
+	 * structs to extended internal versions */
+	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+	ip_vs_copy_udest_compat(&udest, udest_compat);
+
+	if (cmd == IP_VS_SO_SET_ZERO) {
+		/* if no service address is set, zero counters in all */
+		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
+			ret = ip_vs_zero_all(net);
+			goto out_unlock;
+		}
+	}
+
+	/* The scheduler name is looked up as a C string for add/edit */
+	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
+	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
+	    IP_VS_SCHEDNAME_MAXLEN) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
+	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
+	    usvc.protocol != IPPROTO_SCTP) {
+		/* bounded %s: the name may be unterminated for other cmds */
+		pr_err("set_ctl: invalid protocol: %d %pI4:%d %.*s\n",
+		       usvc.protocol, &usvc.addr.ip,
+		       ntohs(usvc.port),
+		       (int)IP_VS_SCHEDNAME_MAXLEN, usvc.sched_name);
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
+	else
+		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+
+	/* Everything except "add" needs an existing, matching service */
+	if (cmd != IP_VS_SO_SET_ADD
+	    && (svc == NULL || svc->protocol != usvc.protocol)) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	switch (cmd) {
+	case IP_VS_SO_SET_ADD:
+		if (svc != NULL)
+			ret = -EEXIST;
+		else
+			ret = ip_vs_add_service(net, &usvc, &svc);
+		break;
+	case IP_VS_SO_SET_EDIT:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IP_VS_SO_SET_DEL:
+		ret = ip_vs_del_service(svc);
+		if (!ret)
+			goto out_unlock;
+		break;
+	case IP_VS_SO_SET_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	case IP_VS_SO_SET_ADDDEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IP_VS_SO_SET_EDITDEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IP_VS_SO_SET_DELDEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+  out_unlock:
+	mutex_unlock(&__ip_vs_mutex);
+  out_dec:
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+/*
+ *	Fill the old-interface service entry @dst from the in-kernel
+ *	service @src.  The timeout is converted from jiffies to seconds
+ *	and the statistics are copied with ip_vs_copy_stats().
+ */
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+	/* identity of the service */
+	dst->fwmark = src->fwmark;
+	dst->protocol = src->protocol;
+	dst->addr = src->addr.ip;
+	dst->port = src->port;
+
+	/* configuration */
+	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+	dst->flags = src->flags;
+	dst->netmask = src->netmask;
+	dst->timeout = src->timeout / HZ;
+
+	/* current state */
+	dst->num_dests = src->num_dests;
+	ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+/*
+ *	Copy at most get->num_services IPv4 services of @net into the
+ *	user-space table uptr->entrytable[], protocol-hashed services
+ *	first and fwmark services after them.  Services of another
+ *	address family or namespace are skipped.
+ *	Returns 0, or -EFAULT when a copy to user space fails.
+ */
+static inline int
+__ip_vs_get_service_entries(struct net *net,
+			    const struct ip_vs_get_services *get,
+			    struct ip_vs_get_services __user *uptr)
+{
+	struct ip_vs_service_entry ent;
+	struct ip_vs_service *svc;
+	int bucket;
+	int filled = 0;
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (!(svc->af == AF_INET && net_eq(svc->net, net)))
+				continue;
+
+			/* user table is full: stop quietly */
+			if (filled >= get->num_services)
+				return 0;
+
+			memset(&ent, 0, sizeof(ent));
+			ip_vs_copy_service(&ent, svc);
+			if (copy_to_user(&uptr->entrytable[filled],
+					 &ent, sizeof(ent)))
+				return -EFAULT;
+			filled++;
+		}
+	}
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
+			/* Only expose IPv4 entries to old interface */
+			if (!(svc->af == AF_INET && net_eq(svc->net, net)))
+				continue;
+
+			if (filled >= get->num_services)
+				return 0;
+
+			memset(&ent, 0, sizeof(ent));
+			ip_vs_copy_service(&ent, svc);
+			if (copy_to_user(&uptr->entrytable[filled],
+					 &ent, sizeof(ent)))
+				return -EFAULT;
+			filled++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *	Copy at most get->num_dests destinations of the IPv4 service
+ *	described by @get (fwmark, or protocol/address/port) into the
+ *	user-space table uptr->entrytable[].
+ *
+ *	The entry is cleared before it is filled so that bytes not set by
+ *	the assignments below (such as struct padding) never carry kernel
+ *	stack contents to user space.
+ *
+ *	Returns 0 on success, -ESRCH when the service does not exist, or
+ *	-EFAULT when a copy to user space fails.
+ */
+static inline int
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
+			 struct ip_vs_get_dests __user *uptr)
+{
+	struct ip_vs_service *svc;
+	union nf_inet_addr addr = { .ip = get->addr };
+	int ret = 0;
+
+	if (get->fwmark)
+		svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
+	else
+		svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
+					   get->port);
+
+	if (svc) {
+		int count = 0;
+		struct ip_vs_dest *dest;
+		struct ip_vs_dest_entry entry;
+
+		/* do not leak uninitialised stack bytes through the copy */
+		memset(&entry, 0, sizeof(entry));
+
+		list_for_each_entry(dest, &svc->destinations, n_list) {
+			if (count >= get->num_dests)
+				break;
+
+			entry.addr = dest->addr.ip;
+			entry.port = dest->port;
+			entry.conn_flags = atomic_read(&dest->conn_flags);
+			entry.weight = atomic_read(&dest->weight);
+			entry.u_threshold = dest->u_threshold;
+			entry.l_threshold = dest->l_threshold;
+			entry.activeconns = atomic_read(&dest->activeconns);
+			entry.inactconns = atomic_read(&dest->inactconns);
+			entry.persistconns = atomic_read(&dest->persistconns);
+			ip_vs_copy_stats(&entry.stats, &dest->stats);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				break;
+			}
+			count++;
+		}
+	} else
+		ret = -ESRCH;
+	return ret;
+}
+
+/*
+ *	Report the current tcp, tcpfin and udp timeouts of @net in
+ *	seconds.
+ *
+ *	@u is cleared first: when TCP or UDP support is compiled out the
+ *	corresponding fields are not assigned below, and callers copy the
+ *	whole struct to user space.
+ */
+static inline void
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+	struct ip_vs_proto_data *pd;
+#endif
+
+	/* never hand uninitialised fields back to the caller */
+	memset(u, 0, sizeof(*u));
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+	u->udp_timeout =
+			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+/* Index of a get command within get_arglen[] */
+#define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
+/* Size of the fixed request header of the individual get commands */
+#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
+
+/*
+ *	Minimum user buffer length per get command; do_ip_vs_get_ctl()
+ *	also copies exactly this many bytes of the request into its
+ *	on-stack buffer.  Slots not listed here are zero.
+ */
+static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
+	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
+};
+
+/*
+ *	getsockopt() handler of the old IPVS control interface.
+ *
+ *	@sk:   socket the option is read from; selects the namespace
+ *	@cmd:  one of the IP_VS_SO_GET_* commands
+ *	@user: user buffer, holding the request and receiving the reply
+ *	@len:  in: size of @user; rewritten only by IP_VS_SO_GET_VERSION
+ *
+ *	The caller needs CAP_NET_ADMIN and *len must be at least
+ *	get_arglen[] for @cmd.  That many bytes of the request are copied
+ *	to a stack buffer, then the command is served with __ip_vs_mutex
+ *	held.
+ *
+ *	Returns 0 on success or a negative errno.
+ */
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	unsigned char arg[128];
+	int ret = 0;
+	unsigned int copylen;
+	struct net *net = sock_net(sk);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	BUG_ON(!net);
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+		return -EINVAL;
+
+	if (*len < get_arglen[GET_CMDID(cmd)]) {
+		pr_err("get_ctl: len %u < %u\n",
+		       *len, get_arglen[GET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	/* never copy more of the request than arg[] can hold */
+	copylen = get_arglen[GET_CMDID(cmd)];
+	if (copylen > 128)
+		return -EINVAL;
+
+	if (copy_from_user(arg, user, copylen) != 0)
+		return -EFAULT;
+
+	if (mutex_lock_interruptible(&__ip_vs_mutex))
+		return -ERESTARTSYS;
+
+	switch (cmd) {
+	case IP_VS_SO_GET_VERSION:
+	{
+		char buf[64];
+
+		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		/* report the string length including the trailing NUL */
+		*len = strlen(buf)+1;
+	}
+	break;
+
+	case IP_VS_SO_GET_INFO:
+	{
+		struct ip_vs_getinfo info;
+		info.version = IP_VS_VERSION_CODE;
+		info.size = ip_vs_conn_tab_size;
+		info.num_services = ipvs->num_services;
+		if (copy_to_user(user, &info, sizeof(info)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	case IP_VS_SO_GET_SERVICES:
+	{
+		struct ip_vs_get_services *get;
+		int size;
+
+		/* user buffer must be header + num_services entries exactly */
+		get = (struct ip_vs_get_services *)arg;
+		size = sizeof(*get) +
+			sizeof(struct ip_vs_service_entry) * get->num_services;
+		if (*len != size) {
+			pr_err("length: %u != %u\n", *len, size);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __ip_vs_get_service_entries(net, get, user);
+	}
+	break;
+
+	case IP_VS_SO_GET_SERVICE:
+	{
+		struct ip_vs_service_entry *entry;
+		struct ip_vs_service *svc;
+		union nf_inet_addr addr;
+
+		/* the request entry is reused as the reply buffer */
+		entry = (struct ip_vs_service_entry *)arg;
+		addr.ip = entry->addr;
+		if (entry->fwmark)
+			svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
+		else
+			svc = __ip_vs_service_find(net, AF_INET,
+						   entry->protocol, &addr,
+						   entry->port);
+		if (svc) {
+			ip_vs_copy_service(entry, svc);
+			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+				ret = -EFAULT;
+		} else
+			ret = -ESRCH;
+	}
+	break;
+
+	case IP_VS_SO_GET_DESTS:
+	{
+		struct ip_vs_get_dests *get;
+		int size;
+
+		/* user buffer must be header + num_dests entries exactly */
+		get = (struct ip_vs_get_dests *)arg;
+		size = sizeof(*get) +
+			sizeof(struct ip_vs_dest_entry) * get->num_dests;
+		if (*len != size) {
+			pr_err("length: %u != %u\n", *len, size);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __ip_vs_get_dest_entries(net, get, user);
+	}
+	break;
+
+	case IP_VS_SO_GET_TIMEOUT:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(net, &t);
+		if (copy_to_user(user, &t, sizeof(t)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	case IP_VS_SO_GET_DAEMON:
+	{
+		/* d[0] describes the master daemon, d[1] the backup daemon */
+		struct ip_vs_daemon_user d[2];
+
+		memset(&d, 0, sizeof(d));
+		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
+			d[0].state = IP_VS_STATE_MASTER;
+			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+				sizeof(d[0].mcast_ifn));
+			d[0].syncid = ipvs->master_syncid;
+		}
+		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
+			d[1].state = IP_VS_STATE_BACKUP;
+			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+				sizeof(d[1].mcast_ifn));
+			d[1].syncid = ipvs->backup_syncid;
+		}
+		if (copy_to_user(user, &d, sizeof(d)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	default:
+		ret = -EINVAL;
+	}
+
+  out:
+	mutex_unlock(&__ip_vs_mutex);
+	return ret;
+}
+
+
+/*
+ *	Socket option registration for PF_INET: routes the IP_VS_SO_SET_*
+ *	and IP_VS_SO_GET_* option ranges to the two handlers above.  The
+ *	*_optmax values are one past the highest command number.
+ */
+static struct nf_sockopt_ops ip_vs_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= IP_VS_BASE_CTL,
+	.set_optmax	= IP_VS_SO_SET_MAX+1,
+	.set		= do_ip_vs_set_ctl,
+	.get_optmin	= IP_VS_BASE_CTL,
+	.get_optmax	= IP_VS_SO_GET_MAX+1,
+	.get		= do_ip_vs_get_ctl,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Generic Netlink interface
+ */
+
+/*
+ * IPVS genetlink family
+ *
+ * .maxattr is the highest attribute index of the first-level command
+ * attributes, so it must be IPVS_CMD_ATTR_MAX - the same bound that
+ * sizes ip_vs_cmd_policy[] - and not the number of commands.
+ */
+static struct genl_family ip_vs_genl_family = {
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= IPVS_GENL_NAME,
+	.version	= IPVS_GENL_VERSION,
+	.maxattr	= IPVS_CMD_ATTR_MAX,
+	.netnsok        = true,         /* Make ipvsadm to work on netns */
+};
+
+/*
+ * Policy used for first-level command attributes: three nested
+ * containers (service, destination, daemon) and three 32-bit timeouts.
+ */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
+};
+
+/*
+ * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON
+ *
+ * For NLA_NUL_STRING .len is the maximum string length without the
+ * terminating NUL, while IP_VS_IFNAME_MAXLEN is a buffer size that
+ * includes it; hence the "- 1".
+ */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_IFNAME_MAXLEN - 1 },
+	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
+};
+
+/*
+ * Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE
+ *
+ * For NLA_NUL_STRING .len is the maximum string length without the
+ * terminating NUL, while IP_VS_SCHEDNAME_MAXLEN is a buffer size that
+ * includes it; hence the "- 1" on the scheduler name.
+ */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
+	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_PENAME_MAXLEN },
+	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
+					    .len = sizeof(struct ip_vs_flags) },
+	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/*
+ * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST:
+ * a binary address of at most sizeof(union nf_inet_addr) bytes, a
+ * 16-bit port, 32-bit scalars and a nested statistics container.
+ */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/*
+ * Append a statistics nest of type @container_type to @skb.
+ *
+ * A user-format snapshot of @stats is taken with ip_vs_copy_stats() and
+ * its counters and rates are emitted as IPVS_STATS_ATTR_* attributes.
+ *
+ * Returns 0 on success.  If the message runs out of room the partially
+ * built nest is cancelled and -EMSGSIZE is returned.
+ */
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+				 struct ip_vs_stats *stats)
+{
+	struct ip_vs_stats_user snap;
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, container_type);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	ip_vs_copy_stats(&snap, stats);
+
+	/* Each NLA_PUT_*() bails out to nla_put_failure when skb is full */
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, snap.conns);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, snap.inpkts);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, snap.outpkts);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, snap.inbytes);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, snap.outbytes);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, snap.cps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, snap.inpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, snap.outpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, snap.inbps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, snap.outbps);
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Append an IPVS_CMD_ATTR_SERVICE nest describing @svc to @skb.
+ *
+ * A service is identified either by its firewall mark or by the
+ * protocol/address/port triple; only the identification that applies is
+ * emitted.  Scheduler name, optional persistence-engine name, flags,
+ * timeout (in seconds), netmask and a nested statistics block follow.
+ *
+ * Returns 0 on success, or -EMSGSIZE after cancelling the nest when the
+ * message has no room left.
+ */
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct ip_vs_flags uflags = { .flags = svc->flags,
+				      .mask = ~0 };
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
+
+	if (!svc->fwmark) {
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
+		NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+	} else {
+		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+	}
+
+	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	if (svc->pe != NULL) {
+		NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
+	}
+	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(uflags), &uflags);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats) != 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Emit one multipart IPVS_CMD_NEW_SERVICE message describing @svc as
+ * part of the dump tracked by @cb.
+ *
+ * Returns the value of genlmsg_end() on success, or -EMSGSIZE when the
+ * header or the service attributes do not fit (the message is cancelled).
+ */
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc,
+				   struct netlink_callback *cb)
+{
+	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
+				    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+				    NLM_F_MULTI, IPVS_CMD_NEW_SERVICE);
+
+	if (msg_hdr == NULL)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_service(skb, svc) < 0) {
+		genlmsg_cancel(skb, msg_hdr);
+		return -EMSGSIZE;
+	}
+
+	return genlmsg_end(skb, msg_hdr);
+}
+
+/*
+ * dumpit handler for IPVS_CMD_GET_SERVICE.
+ *
+ * Walks the address-keyed and then the fwmark-keyed service tables under
+ * __ip_vs_mutex and emits every service belonging to the requesting
+ * socket's network namespace.  cb->args[0] holds the count of table
+ * entries already visited so that a later call resumes where this one
+ * stopped; an entry that did not fit is not counted and is retried.
+ *
+ * Always returns skb->len.
+ */
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	struct net *net = skb_sknet(skb);
+	int first = cb->args[0];
+	struct ip_vs_service *svc;
+	int seen = 0;
+	int bucket;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[bucket], s_list) {
+			seen++;
+			if (seen <= first)
+				continue;
+			if (!net_eq(svc->net, net))
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				seen--;
+				goto done;
+			}
+		}
+	}
+
+	for (bucket = 0; bucket < IP_VS_SVC_TAB_SIZE; bucket++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[bucket], f_list) {
+			seen++;
+			if (seen <= first)
+				continue;
+			if (!net_eq(svc->net, net))
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				seen--;
+				goto done;
+			}
+		}
+	}
+
+done:
+	mutex_unlock(&__ip_vs_mutex);
+	cb->args[0] = seen;
+
+	return skb->len;
+}
+
+/*
+ * Decode the nested service attribute @nla into @usvc and look the
+ * service up in @net.
+ *
+ * The address family is mandatory, together with either a firewall mark
+ * or the protocol/address/port triple.  When @full_entry is non-zero the
+ * scheduler name, flags, timeout and netmask must be present as well
+ * (the persistence-engine name stays optional).  The lookup result, which
+ * may be NULL, is stored in *@ret_svc once the identifying fields have
+ * been accepted.
+ *
+ * Returns 0 on success, -EINVAL for a missing/malformed attribute, or
+ * -EAFNOSUPPORT for an address family this build does not handle.
+ */
+static int ip_vs_genl_parse_service(struct net *net,
+				    struct ip_vs_service_user_kern *usvc,
+				    struct nlattr *nla, int full_entry,
+				    struct ip_vs_service **ret_svc)
+{
+	struct nlattr *tb[IPVS_SVC_ATTR_MAX + 1];
+	struct ip_vs_service *found;
+	struct ip_vs_flags uflags;
+	int has_triple;
+
+	/* Parse mandatory identifying service fields first */
+	if (nla == NULL)
+		return -EINVAL;
+	if (nla_parse_nested(tb, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+		return -EINVAL;
+
+	if (!tb[IPVS_SVC_ATTR_AF])
+		return -EINVAL;
+
+	has_triple = tb[IPVS_SVC_ATTR_PORT] &&
+		     tb[IPVS_SVC_ATTR_PROTOCOL] &&
+		     tb[IPVS_SVC_ATTR_ADDR];
+	if (!tb[IPVS_SVC_ATTR_FWMARK] && !has_triple)
+		return -EINVAL;
+
+	memset(usvc, 0, sizeof(*usvc));
+
+	usvc->af = nla_get_u16(tb[IPVS_SVC_ATTR_AF]);
+#ifdef CONFIG_IP_VS_IPV6
+	if (usvc->af != AF_INET && usvc->af != AF_INET6)
+#else
+	if (usvc->af != AF_INET)
+#endif
+		return -EAFNOSUPPORT;
+
+	if (tb[IPVS_SVC_ATTR_FWMARK]) {
+		usvc->protocol = IPPROTO_TCP;
+		usvc->fwmark = nla_get_u32(tb[IPVS_SVC_ATTR_FWMARK]);
+	} else {
+		usvc->protocol = nla_get_u16(tb[IPVS_SVC_ATTR_PROTOCOL]);
+		nla_memcpy(&usvc->addr, tb[IPVS_SVC_ATTR_ADDR],
+			   sizeof(usvc->addr));
+		usvc->port = nla_get_u16(tb[IPVS_SVC_ATTR_PORT]);
+		usvc->fwmark = 0;
+	}
+
+	/* A non-zero mark selects the fwmark table, otherwise the triple */
+	if (usvc->fwmark)
+		found = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
+	else
+		found = __ip_vs_service_find(net, usvc->af, usvc->protocol,
+					     &usvc->addr, usvc->port);
+	*ret_svc = found;
+
+	if (!full_entry)
+		return 0;
+
+	/* A full entry was requested, check for the additional fields */
+	if (!tb[IPVS_SVC_ATTR_SCHED_NAME] || !tb[IPVS_SVC_ATTR_FLAGS] ||
+	    !tb[IPVS_SVC_ATTR_TIMEOUT] || !tb[IPVS_SVC_ATTR_NETMASK])
+		return -EINVAL;
+
+	nla_memcpy(&uflags, tb[IPVS_SVC_ATTR_FLAGS], sizeof(uflags));
+
+	/* prefill flags from service if it already exists */
+	if (found != NULL)
+		usvc->flags = found->flags;
+
+	/* only the bits selected by the userland mask are replaced */
+	usvc->flags = (usvc->flags & ~uflags.mask) |
+		      (uflags.flags & uflags.mask);
+	usvc->sched_name = nla_data(tb[IPVS_SVC_ATTR_SCHED_NAME]);
+	if (tb[IPVS_SVC_ATTR_PE_NAME])
+		usvc->pe_name = nla_data(tb[IPVS_SVC_ATTR_PE_NAME]);
+	else
+		usvc->pe_name = NULL;
+	usvc->timeout = nla_get_u32(tb[IPVS_SVC_ATTR_TIMEOUT]);
+	usvc->netmask = nla_get_u32(tb[IPVS_SVC_ATTR_NETMASK]);
+
+	return 0;
+}
+
+/*
+ * Resolve the service identified by the nested attribute @nla in @net.
+ *
+ * Only the identifying fields are required.  Returns the service, NULL
+ * when no such service exists, or an ERR_PTR() carrying the parse error.
+ */
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+						     struct nlattr *nla)
+{
+	struct ip_vs_service_user_kern scratch;
+	struct ip_vs_service *svc;
+	int err;
+
+	err = ip_vs_genl_parse_service(net, &scratch, nla, 0, &svc);
+	if (err)
+		return ERR_PTR(err);
+
+	return svc;
+}
+
+/*
+ * Append an IPVS_CMD_ATTR_DEST nest describing @dest to @skb: address,
+ * port, forwarding method, weight, thresholds, connection counters and
+ * a nested statistics block.
+ *
+ * Returns 0 on success, or -EMSGSIZE after cancelling the nest when the
+ * message has no room left.
+ */
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+	struct nlattr *nest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
+	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
+
+	/* only the forwarding-method bits of conn_flags are exported */
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+		    atomic_read(&dest->activeconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+		    atomic_read(&dest->inactconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+		    atomic_read(&dest->persistconns));
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats) != 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Emit one multipart IPVS_CMD_NEW_DEST message describing @dest as part
+ * of the dump tracked by @cb.
+ *
+ * Returns the value of genlmsg_end() on success, or -EMSGSIZE when the
+ * header or the attributes do not fit (the message is cancelled).
+ */
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+				struct netlink_callback *cb)
+{
+	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
+				    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
+				    NLM_F_MULTI, IPVS_CMD_NEW_DEST);
+
+	if (msg_hdr == NULL)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_dest(skb, dest) < 0) {
+		genlmsg_cancel(skb, msg_hdr);
+		return -EMSGSIZE;
+	}
+
+	return genlmsg_end(skb, msg_hdr);
+}
+
+/*
+ * dumpit handler for IPVS_CMD_GET_DEST.
+ *
+ * Re-parses the original request to find the service whose destinations
+ * are wanted, then emits them under __ip_vs_mutex.  cb->args[0] holds
+ * the number of destinations already emitted so that a later call
+ * resumes after them; it is left untouched when the request cannot be
+ * parsed or the service is not found.
+ *
+ * Always returns skb->len.
+ */
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct nlattr *tb[IPVS_CMD_ATTR_MAX + 1];
+	struct net *net = skb_sknet(skb);
+	int first = cb->args[0];
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	int seen = 0;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	/* Try to find the service for which to dump destinations */
+	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, tb,
+			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy) != 0)
+		goto unlock;
+
+	svc = ip_vs_genl_find_service(net, tb[IPVS_CMD_ATTR_SERVICE]);
+	if (svc == NULL || IS_ERR(svc))
+		goto unlock;
+
+	/* Dump the destinations */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		seen++;
+		if (seen <= first)
+			continue;
+		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+			/* did not fit: retry this one on the next call */
+			seen--;
+			break;
+		}
+	}
+
+	cb->args[0] = seen;
+
+unlock:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+/*
+ * Decode the nested destination attribute @nla into @udest.
+ *
+ * Address and port are always required.  When @full_entry is non-zero
+ * the forwarding method, weight and both thresholds must be present too.
+ *
+ * Returns 0 on success or -EINVAL for a missing/malformed attribute.
+ */
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
+				 struct nlattr *nla, int full_entry)
+{
+	struct nlattr *tb[IPVS_DEST_ATTR_MAX + 1];
+
+	/* Parse mandatory identifying destination fields first */
+	if (nla == NULL)
+		return -EINVAL;
+	if (nla_parse_nested(tb, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+		return -EINVAL;
+
+	if (!tb[IPVS_DEST_ATTR_ADDR] || !tb[IPVS_DEST_ATTR_PORT])
+		return -EINVAL;
+
+	memset(udest, 0, sizeof(*udest));
+
+	nla_memcpy(&udest->addr, tb[IPVS_DEST_ATTR_ADDR], sizeof(udest->addr));
+	udest->port = nla_get_u16(tb[IPVS_DEST_ATTR_PORT]);
+
+	if (!full_entry)
+		return 0;
+
+	/* A full entry was requested, check for the additional fields */
+	if (!tb[IPVS_DEST_ATTR_FWD_METHOD] || !tb[IPVS_DEST_ATTR_WEIGHT] ||
+	    !tb[IPVS_DEST_ATTR_U_THRESH] || !tb[IPVS_DEST_ATTR_L_THRESH])
+		return -EINVAL;
+
+	udest->conn_flags = nla_get_u32(tb[IPVS_DEST_ATTR_FWD_METHOD]) &
+			    IP_VS_CONN_F_FWD_MASK;
+	udest->weight = nla_get_u32(tb[IPVS_DEST_ATTR_WEIGHT]);
+	udest->u_threshold = nla_get_u32(tb[IPVS_DEST_ATTR_U_THRESH]);
+	udest->l_threshold = nla_get_u32(tb[IPVS_DEST_ATTR_L_THRESH]);
+
+	return 0;
+}
+
+/*
+ * Append an IPVS_CMD_ATTR_DAEMON nest to @skb describing one sync daemon.
+ *
+ * @state and @syncid are plain host-order values (callers pass
+ * IP_VS_STATE_MASTER/IP_VS_STATE_BACKUP and the configured sync id) and
+ * are emitted with NLA_PUT_U32(), so they are typed __u32 rather than
+ * __be32.
+ *
+ * Returns 0 on success, or -EMSGSIZE after cancelling the nest when the
+ * message has no room left.
+ */
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+				  const char *mcast_ifn, __u32 syncid)
+{
+	struct nlattr *nl_daemon;
+
+	nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+	if (!nl_daemon)
+		return -EMSGSIZE;
+
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
+	NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
+
+	nla_nest_end(skb, nl_daemon);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_daemon);
+	return -EMSGSIZE;
+}
+
+/*
+ * Emit one multipart IPVS_CMD_NEW_DAEMON message for the sync daemon
+ * described by @state, @mcast_ifn and @syncid (host-order values, see
+ * ip_vs_genl_fill_daemon()).
+ *
+ * Returns the value of genlmsg_end() on success, or -EMSGSIZE when the
+ * header or the attributes do not fit (the message is cancelled).
+ */
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+				  const char *mcast_ifn, __u32 syncid,
+				  struct netlink_callback *cb)
+{
+	void *hdr;
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DAEMON);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * dumpit handler for IPVS_CMD_GET_DAEMON.
+ *
+ * Reports the master and/or backup sync daemon of the requesting
+ * socket's namespace, depending on ipvs->sync_state.  cb->args[0] and
+ * cb->args[1] record that the master, respectively backup, entry has
+ * already been emitted so it is not repeated on a later call.
+ *
+ * Always returns skb->len.
+ */
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct netns_ipvs *ipvs = net_ipvs(skb_sknet(skb));
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (cb->args[0] == 0 && (ipvs->sync_state & IP_VS_STATE_MASTER)) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+					   ipvs->master_mcast_ifn,
+					   ipvs->master_syncid, cb) < 0)
+			goto unlock;
+
+		cb->args[0] = 1;
+	}
+
+	if (cb->args[1] == 0 && (ipvs->sync_state & IP_VS_STATE_BACKUP)) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+					   ipvs->backup_mcast_ifn,
+					   ipvs->backup_syncid, cb) < 0)
+			goto unlock;
+
+		cb->args[1] = 1;
+	}
+
+unlock:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+/*
+ * Start a sync daemon in @net from parsed IPVS_DAEMON_ATTR_* attributes.
+ *
+ * State, multicast interface name and sync id are all required.
+ * Returns -EINVAL if one is missing, else the start_sync_thread() result.
+ */
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
+{
+	struct nlattr *state = attrs[IPVS_DAEMON_ATTR_STATE];
+	struct nlattr *ifname = attrs[IPVS_DAEMON_ATTR_MCAST_IFN];
+	struct nlattr *syncid = attrs[IPVS_DAEMON_ATTR_SYNC_ID];
+
+	if (!state || !ifname || !syncid)
+		return -EINVAL;
+
+	return start_sync_thread(net, nla_get_u32(state), nla_data(ifname),
+				 nla_get_u32(syncid));
+}
+
+/*
+ * Stop the sync daemon of @net selected by IPVS_DAEMON_ATTR_STATE.
+ *
+ * Returns -EINVAL if the state attribute is missing, else the
+ * stop_sync_thread() result.
+ */
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
+{
+	struct nlattr *state = attrs[IPVS_DAEMON_ATTR_STATE];
+
+	if (state == NULL)
+		return -EINVAL;
+
+	return stop_sync_thread(net, nla_get_u32(state));
+}
+
+/*
+ * Apply the timeout attributes of an IPVS_CMD_SET_CONFIG request.
+ *
+ * Starts from the current timeouts of @net and overrides only those
+ * for which an attribute was supplied.  Returns the ip_vs_set_timeout()
+ * result.
+ */
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
+{
+	struct ip_vs_timeout_user tmo;
+	struct nlattr *attr;
+
+	__ip_vs_get_timeouts(net, &tmo);
+
+	attr = attrs[IPVS_CMD_ATTR_TIMEOUT_TCP];
+	if (attr != NULL)
+		tmo.tcp_timeout = nla_get_u32(attr);
+
+	attr = attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN];
+	if (attr != NULL)
+		tmo.tcp_fin_timeout = nla_get_u32(attr);
+
+	attr = attrs[IPVS_CMD_ATTR_TIMEOUT_UDP];
+	if (attr != NULL)
+		tmo.udp_timeout = nla_get_u32(attr);
+
+	return ip_vs_set_timeout(net, &tmo);
+}
+
+/*
+ * doit handler shared by all state-changing IPVS Generic Netlink commands.
+ *
+ * Runs entirely under __ip_vs_mutex.  FLUSH, SET_CONFIG, NEW/DEL_DAEMON
+ * and a ZERO request without a service attribute are handled up front.
+ * Every other command needs IPVS_CMD_ATTR_SERVICE, and the *_DEST
+ * commands additionally need IPVS_CMD_ATTR_DEST.
+ *
+ * Returns 0 or a negative errno: an attribute parsing error, -ESRCH when
+ * the referenced service does not exist, -EEXIST when NEW_SERVICE names
+ * an existing service, -EINVAL for an unhandled command, or whatever the
+ * invoked ip_vs_*() helper returns.
+ */
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ip_vs_service *svc = NULL;
+	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_dest_user_kern udest;
+	int ret = 0, cmd;
+	int need_full_svc = 0, need_full_dest = 0;
+	struct net *net;
+
+	net = skb_sknet(skb);
+	cmd = info->genlhdr->cmd;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (cmd == IPVS_CMD_FLUSH) {
+		ret = ip_vs_flush(net);
+		goto out;
+	} else if (cmd == IPVS_CMD_SET_CONFIG) {
+		ret = ip_vs_genl_set_config(net, info->attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
+		   cmd == IPVS_CMD_DEL_DAEMON) {
+
+		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+				     info->attrs[IPVS_CMD_ATTR_DAEMON],
+				     ip_vs_daemon_policy)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (cmd == IPVS_CMD_NEW_DAEMON)
+			ret = ip_vs_genl_new_daemon(net, daemon_attrs);
+		else
+			ret = ip_vs_genl_del_daemon(net, daemon_attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_ZERO &&
+		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+		/* ZERO without a service means: zero everything */
+		ret = ip_vs_zero_all(net);
+		goto out;
+	}
+
+	/* All following commands require a service argument, so check if we
+	 * received a valid one. We need a full service specification when
+	 * adding / editing a service. Only identifying members otherwise. */
+	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+		need_full_svc = 1;
+
+	ret = ip_vs_genl_parse_service(net, &usvc,
+				       info->attrs[IPVS_CMD_ATTR_SERVICE],
+				       need_full_svc, &svc);
+	if (ret)
+		goto out;
+
+	/* Unless we're adding a new service, the service must already exist */
+	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* Destination commands require a valid destination argument. For
+	 * adding / editing a destination, we need a full destination
+	 * specification. */
+	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+	    cmd == IPVS_CMD_DEL_DEST) {
+		if (cmd != IPVS_CMD_DEL_DEST)
+			need_full_dest = 1;
+
+		ret = ip_vs_genl_parse_dest(&udest,
+					    info->attrs[IPVS_CMD_ATTR_DEST],
+					    need_full_dest);
+		if (ret)
+			goto out;
+	}
+
+	switch (cmd) {
+	case IPVS_CMD_NEW_SERVICE:
+		if (svc == NULL)
+			ret = ip_vs_add_service(net, &usvc, &svc);
+		else
+			ret = -EEXIST;
+		break;
+	case IPVS_CMD_SET_SERVICE:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IPVS_CMD_DEL_SERVICE:
+		ret = ip_vs_del_service(svc);
+		/* do not use svc, it can be freed */
+		break;
+	case IPVS_CMD_NEW_DEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IPVS_CMD_SET_DEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IPVS_CMD_DEL_DEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	case IPVS_CMD_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+out:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+/*
+ * doit handler for the query commands IPVS_CMD_GET_SERVICE,
+ * IPVS_CMD_GET_INFO and IPVS_CMD_GET_CONFIG.
+ *
+ * The answer is built in a freshly allocated message and sent back with
+ * the matching NEW_SERVICE / SET_INFO / SET_CONFIG command code.  The
+ * reply is assembled and sent under __ip_vs_mutex.
+ *
+ * Returns the genlmsg_reply() result on success.  Errors: -EINVAL for
+ * any other command, -ENOMEM if the reply cannot be allocated, the
+ * parse error or -ESRCH if the requested service cannot be resolved,
+ * and -EMSGSIZE if the reply does not fit.  The reply message is freed
+ * on every error path after allocation.
+ */
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	void *reply;
+	int ret, cmd, reply_cmd;
+	struct net *net;
+
+	net = skb_sknet(skb);
+	cmd = info->genlhdr->cmd;
+
+	if (cmd == IPVS_CMD_GET_SERVICE)
+		reply_cmd = IPVS_CMD_NEW_SERVICE;
+	else if (cmd == IPVS_CMD_GET_INFO)
+		reply_cmd = IPVS_CMD_SET_INFO;
+	else if (cmd == IPVS_CMD_GET_CONFIG)
+		reply_cmd = IPVS_CMD_SET_CONFIG;
+	else {
+		pr_err("unknown Generic Netlink command\n");
+		return -EINVAL;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+	if (reply == NULL)
+		goto nla_put_failure;
+
+	switch (cmd) {
+	case IPVS_CMD_GET_SERVICE:
+	{
+		struct ip_vs_service *svc;
+
+		svc = ip_vs_genl_find_service(net,
+					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
+		if (IS_ERR(svc)) {
+			ret = PTR_ERR(svc);
+			goto out_err;
+		} else if (svc) {
+			ret = ip_vs_genl_fill_service(msg, svc);
+			if (ret)
+				goto nla_put_failure;
+		} else {
+			ret = -ESRCH;
+			goto out_err;
+		}
+
+		break;
+	}
+
+	case IPVS_CMD_GET_CONFIG:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(net, &t);
+		/* only timeouts of compiled-in protocols are reported */
+#ifdef CONFIG_IP_VS_PROTO_TCP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+			    t.tcp_fin_timeout);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+#endif
+
+		break;
+	}
+
+	case IPVS_CMD_GET_INFO:
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+			    ip_vs_conn_tab_size);
+		break;
+	}
+
+	genlmsg_end(msg, reply);
+	ret = genlmsg_reply(msg, info);
+	goto out;
+
+nla_put_failure:
+	pr_err("not enough space in Netlink message\n");
+	ret = -EMSGSIZE;
+
+out_err:
+	nlmsg_free(msg);
+out:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+
+/*
+ * Command table of the IPVS Generic Netlink family.
+ *
+ * Every command requires GENL_ADMIN_PERM.  Modifying commands are routed
+ * to ip_vs_genl_set_cmd(), single-shot queries to ip_vs_genl_get_cmd(),
+ * and list requests to the respective ip_vs_genl_dump_*() callback.
+ * Commands that read top-level attributes are validated against
+ * ip_vs_cmd_policy.
+ */
+static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+	{
+		.cmd	= IPVS_CMD_NEW_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		/* single lookup via doit, full listing via dumpit */
+		.cmd	= IPVS_CMD_GET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+		.dumpit	= ip_vs_genl_dump_services,
+		.policy	= ip_vs_cmd_policy,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		/* destinations can only be listed, there is no doit */
+		.cmd	= IPVS_CMD_GET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.dumpit	= ip_vs_genl_dump_dests,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_daemons,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_INFO,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_ZERO,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_FLUSH,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+};
+
+/*
+ * Register the IPVS Generic Netlink family together with all commands
+ * in ip_vs_genl_ops.  Returns the genl_register_family_with_ops() result.
+ */
+static int __init ip_vs_genl_register(void)
+{
+	return genl_register_family_with_ops(&ip_vs_genl_family,
+		ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
+}
+
+/* Unregister the IPVS Generic Netlink family. */
+static void ip_vs_genl_unregister(void)
+{
+	genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+/*
+ * Per-netns init/exit functions.
+ */
+#ifdef CONFIG_SYSCTL
+/*
+ * Set up the per-namespace sysctl state of IPVS for @net.
+ *
+ * Initialises the defense locks and counters, binds each sysctl table
+ * entry to the matching field of this namespace's netns_ipvs, registers
+ * the table, starts the rate estimator for the total statistics and
+ * schedules the periodic defense work.
+ *
+ * init_net uses the static vs_vars table directly; every other
+ * namespace works on its own kmemdup() copy.
+ *
+ * Returns 0 on success, or -ENOMEM when the table cannot be duplicated
+ * or registered (a duplicated table is freed again in the latter case).
+ */
+int __net_init __ip_vs_control_init_sysctl(struct net *net)
+{
+	int idx;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ctl_table *tbl;
+
+	atomic_set(&ipvs->dropentry, 0);
+	spin_lock_init(&ipvs->dropentry_lock);
+	spin_lock_init(&ipvs->droppacket_lock);
+	spin_lock_init(&ipvs->securetcp_lock);
+
+	if (!net_eq(net, &init_net)) {
+		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+		if (tbl == NULL)
+			return -ENOMEM;
+	} else
+		tbl = vs_vars;
+	/* Initialize sysctl defaults */
+	/*
+	 * NOTE(review): entries are addressed purely by position, so the
+	 * idx++ sequence below presumably has to follow the entry order
+	 * of vs_vars (not visible here) -- verify when touching either.
+	 */
+	idx = 0;
+	ipvs->sysctl_amemthresh = 1024;
+	tbl[idx++].data = &ipvs->sysctl_amemthresh;
+	ipvs->sysctl_am_droprate = 10;
+	tbl[idx++].data = &ipvs->sysctl_am_droprate;
+	tbl[idx++].data = &ipvs->sysctl_drop_entry;
+	tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+	tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+	ipvs->sysctl_snat_reroute = 1;
+	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+	ipvs->sysctl_sync_ver = 1;
+	tbl[idx++].data = &ipvs->sysctl_sync_ver;
+	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+	/* sync_threshold is a two-element array: set data and maxlen */
+	tbl[idx].data = &ipvs->sysctl_sync_threshold;
+	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+
+
+	ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
+						     tbl);
+	if (ipvs->sysctl_hdr == NULL) {
+		/* only the duplicated table is ours to free */
+		if (!net_eq(net, &init_net))
+			kfree(tbl);
+		return -ENOMEM;
+	}
+	ip_vs_start_estimator(net, &ipvs->tot_stats);
+	ipvs->sysctl_tbl = tbl;
+	/* Schedule defense work */
+	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+
+	return 0;
+}
+
+/*
+ * Tear down the per-namespace sysctl state created by
+ * __ip_vs_control_init_sysctl(): stop the defense work, unregister the
+ * sysctl table and release the table copy owned by this namespace.
+ *
+ * This runs on the namespace exit path (it is called from
+ * __ip_vs_control_cleanup()), so it must be annotated __net_exit, not
+ * __net_init: init-section code may already have been discarded by the
+ * time a namespace or the module goes away.
+ */
+void __net_exit __ip_vs_control_cleanup_sysctl(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	cancel_delayed_work_sync(&ipvs->defense_work);
+	cancel_work_sync(&ipvs->defense_work.work);
+	unregister_net_sysctl_table(ipvs->sysctl_hdr);
+
+	/*
+	 * init_net uses the static vs_vars table; every other namespace
+	 * got a kmemdup() copy at init time which must be freed here.
+	 */
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->sysctl_tbl);
+}
+
+#else
+
+/*
+ * Without CONFIG_SYSCTL there is no per-namespace sysctl state.  The
+ * cleanup stub is called from the namespace exit path, hence __net_exit.
+ */
+int __net_init __ip_vs_control_init_sysctl(struct net *net) { return 0; }
+void __net_exit __ip_vs_control_cleanup_sysctl(struct net *net) { }
+
+#endif
+
+/*
+ * Netdevice notifier delivering device events to ip_vs_dst_event().
+ * Registered in ip_vs_control_init(), removed in ip_vs_control_cleanup().
+ */
+static struct notifier_block ip_vs_dst_notifier = {
+	.notifier_call = ip_vs_dst_event,
+};
+
+/*
+ * Per-namespace initialisation of the IPVS control plane for @net.
+ *
+ * Sets up the real-server hash table, the destination trash list and
+ * the service counters, allocates the per-cpu total statistics, creates
+ * the /proc/net entries and finally initialises the sysctl state.
+ *
+ * Returns 0 on success or -ENOMEM.  On failure everything created here
+ * is undone again, including the /proc/net entries.
+ */
+int __net_init __ip_vs_control_init(struct net *net)
+{
+	int idx;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
+
+	/* Initialize rs_table */
+	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+
+	INIT_LIST_HEAD(&ipvs->dest_trash);
+	atomic_set(&ipvs->ftpsvc_counter, 0);
+	atomic_set(&ipvs->nullsvc_counter, 0);
+
+	/* procfs stats */
+	ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+	if (!ipvs->tot_stats.cpustats) {
+		pr_err("%s(): alloc_percpu.\n", __func__);
+		return -ENOMEM;
+	}
+	spin_lock_init(&ipvs->tot_stats.lock);
+
+	proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
+	proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
+	proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
+			     &ip_vs_stats_percpu_fops);
+
+	if (__ip_vs_control_init_sysctl(net))
+		goto err;
+
+	return 0;
+
+err:
+	/*
+	 * Remove the proc entries again (same order as
+	 * __ip_vs_control_cleanup()); otherwise they would stay
+	 * registered for a namespace whose setup has failed.
+	 */
+	proc_net_remove(net, "ip_vs_stats_percpu");
+	proc_net_remove(net, "ip_vs_stats");
+	proc_net_remove(net, "ip_vs");
+	free_percpu(ipvs->tot_stats.cpustats);
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace teardown of the IPVS control plane for @net: empties
+ * the destination trash, stops the estimator for the total statistics,
+ * tears down the sysctl state, removes the /proc/net entries and frees
+ * the per-cpu statistics.
+ *
+ * NOTE(review): the estimator is stopped here unconditionally, while
+ * the only visible ip_vs_start_estimator() call for tot_stats sits in
+ * the CONFIG_SYSCTL variant of __ip_vs_control_init_sysctl() -- confirm
+ * that stopping a never-started estimator is safe without CONFIG_SYSCTL.
+ */
+void __net_exit __ip_vs_control_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_trash_cleanup(net);
+	ip_vs_stop_estimator(net, &ipvs->tot_stats);
+	__ip_vs_control_cleanup_sysctl(net);
+	proc_net_remove(net, "ip_vs_stats_percpu");
+	proc_net_remove(net, "ip_vs_stats");
+	proc_net_remove(net, "ip_vs");
+	free_percpu(ipvs->tot_stats.cpustats);
+}
+
+/*
+ * Global (not per-namespace) initialisation of the IPVS control plane.
+ *
+ * Initialises the two service hash tables and registers, in this order,
+ * the sockopt interface, the Generic Netlink family and the netdevice
+ * notifier.  A failing step unwinds the earlier ones in reverse order.
+ *
+ * Returns 0 on success or the error code of the failing registration.
+ */
+int __init ip_vs_control_init(void)
+{
+	int idx;
+	int ret;
+
+	EnterFunction(2);
+
+	/* Initialize ip_vs_svc_table and ip_vs_svc_fwm_table */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+
+	smp_wmb();	/* Do we really need it now ? */
+
+	ret = nf_register_sockopt(&ip_vs_sockopts);
+	if (ret) {
+		pr_err("cannot register sockopt.\n");
+		goto err_sock;
+	}
+
+	ret = ip_vs_genl_register();
+	if (ret) {
+		pr_err("cannot register Generic Netlink interface.\n");
+		goto err_genl;
+	}
+
+	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+	if (ret < 0)
+		goto err_notf;
+
+	LeaveFunction(2);
+	return 0;
+
+	/* unwind in reverse order of registration */
+err_notf:
+	ip_vs_genl_unregister();
+err_genl:
+	nf_unregister_sockopt(&ip_vs_sockopts);
+err_sock:
+	return ret;
+}
+
+
+/*
+ * Global teardown of the IPVS control plane: undoes the registrations
+ * made by ip_vs_control_init() in reverse order.
+ */
+void ip_vs_control_cleanup(void)
+{
+	EnterFunction(2);
+	unregister_netdevice_notifier(&ip_vs_dst_notifier);
+	ip_vs_genl_unregister();
+	nf_unregister_sockopt(&ip_vs_sockopts);
+	LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
new file mode 100644
index 00000000..95fd0d14
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,271 @@
+/*
+ * IPVS:        Destination Hashing scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              Inspired by the consistent hashing scheduler patch from
+ *              Thomas Proell <proellt@gmx.de>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[dest_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) OR (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS DH bucket: one slot of the per-service hash table.
+ *      A slot is either NULL or points to a destination on which
+ *      ip_vs_dh_assign() has taken a reference.
+ */
+struct ip_vs_dh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     Geometry of the IPVS DH hash table: 2^IP_VS_DH_TAB_BITS buckets
+ *     per service.  The bit count comes from CONFIG_IP_VS_DH_TAB_BITS
+ *     and falls back to 8 (256 buckets) when that is not configured.
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS        8
+#endif
+#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ *	Map an address to a bucket index of the DH table.
+ *
+ *	An IPv4 address is used as is; with IPv6 support the four 32-bit
+ *	words of the address are XOR-folded first.  The host-order value
+ *	is multiplied by 2654435761 and masked down to the table size.
+ */
+static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 folded;
+	unsigned long product;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	product = ntohl(folded) * 2654435761UL;
+	return product & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ *      Look up the destination cached in the bucket that @addr hashes
+ *      to.  The result is NULL when that bucket is empty.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
+	     const union nf_inet_addr *addr)
+{
+	unsigned slot = ip_vs_dh_hashkey(af, addr);
+
+	return tbl[slot].dest;
+}
+
+
+/*
+ *      Fill every bucket of @tbl from the destination list of @svc.
+ *
+ *      Destinations are handed out round-robin in list order, wrapping
+ *      around at the list head, and a reference is taken on a
+ *      destination for each bucket it is stored in.  With an empty
+ *      destination list all buckets are set to NULL.  Always returns 0.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+	struct list_head *head = &svc->destinations;
+	struct list_head *pos = head;
+	struct ip_vs_dest *dest;
+	int slot;
+
+	for (slot = 0; slot < IP_VS_DH_TAB_SIZE; slot++) {
+		if (list_empty(pos)) {
+			tbl[slot].dest = NULL;
+			continue;
+		}
+
+		/* the list head carries no destination: step over it */
+		if (pos == head)
+			pos = pos->next;
+
+		dest = list_entry(pos, struct ip_vs_dest, n_list);
+		atomic_inc(&dest->refcnt);
+		tbl[slot].dest = dest;
+
+		pos = pos->next;
+	}
+
+	return 0;
+}
+
+
+/*
+ *      Empty every bucket of @tbl, dropping the reference held on each
+ *      destination that was stored there.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+	int slot;
+
+	for (slot = 0; slot < IP_VS_DH_TAB_SIZE; slot++) {
+		struct ip_vs_dh_bucket *bucket = &tbl[slot];
+
+		if (bucket->dest == NULL)
+			continue;
+
+		atomic_dec(&bucket->dest->refcnt);
+		bucket->dest = NULL;
+	}
+}
+
+
+/*
+ *      Scheduler init hook: allocate the DH table for @svc, publish it
+ *      in svc->sched_data and populate it from the current destinations.
+ *      Returns 0, or -ENOMEM when the table cannot be allocated.
+ */
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *buckets;
+
+	/* allocate the DH table for this service */
+	buckets = kmalloc(sizeof(*buckets) * IP_VS_DH_TAB_SIZE, GFP_ATOMIC);
+	if (!buckets) {
+		pr_err("%s(): no memory\n", __func__);
+		return -ENOMEM;
+	}
+
+	svc->sched_data = buckets;
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	/* hand the destinations of the service out to the buckets */
+	ip_vs_dh_assign(buckets, svc);
+
+	return 0;
+}
+
+
+/*
+ *      Scheduler release hook: drop all destination references held by
+ *      the DH table of @svc and free the table.  Always returns 0.
+ */
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *buckets = svc->sched_data;
+
+	/* release the references before the table goes away */
+	ip_vs_dh_flush(buckets);
+
+	kfree(buckets);
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	return 0;
+}
+
+
+/*
+ *      Scheduler update hook: rebuild the DH table of @svc from its
+ *      current destination list.  Always returns 0.
+ */
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *buckets = svc->sched_data;
+
+	/* drop the old assignment ... */
+	ip_vs_dh_flush(buckets);
+
+	/* ... and redo it for the updated service */
+	ip_vs_dh_assign(buckets, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ *      Returns the raw masked flag value: non-zero when overloaded,
+ *      0 otherwise.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Destination hashing scheduling
+ *
+ *      Picks the destination cached in the bucket selected by the
+ *      packet's destination address.  Returns NULL when that bucket is
+ *      empty or its destination is unavailable, has a weight <= 0 or
+ *      is overloaded.
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dh_bucket *buckets;
+	struct ip_vs_dest *dest;
+	struct ip_vs_iphdr iph;
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	buckets = svc->sched_data;
+	dest = ip_vs_dh_get(svc->af, buckets, &iph.daddr);
+
+	if (dest == NULL)
+		return NULL;
+	if (!(dest->flags & IP_VS_DEST_F_AVAILABLE))
+		return NULL;
+	if (atomic_read(&dest->weight) <= 0)
+		return NULL;
+	if (is_overloaded(dest))
+		return NULL;
+
+	IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS DH Scheduler structure
+ *
+ *      Descriptor registered with the IPVS core under the name "dh";
+ *      it wires the init/done/update hooks and the schedule routine
+ *      defined above.
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+	.name =			"dh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+	.init_service =		ip_vs_dh_init_svc,
+	.done_service =		ip_vs_dh_done_svc,
+	.update_service =	ip_vs_dh_update_svc,
+	.schedule =		ip_vs_dh_schedule,
+};
+
+
+/* Module init: register the DH scheduler; returns the registration result. */
+static int __init ip_vs_dh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+/* Module exit: unregister the DH scheduler. */
+static void __exit ip_vs_dh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+/* Module entry/exit hooks and license declaration */
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
new file mode 100644
index 00000000..508cce98
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,218 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
+ *              Affected data: est_list and est_lock.
+ *              estimation_timer() runs with timer per netns.
+ *              get_stats() does the per cpu summing.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+  This code is to estimate rate in a shorter interval (such as 8
+  seconds) for virtual services and real servers. To measure the rate
+  over a long interval, it is easy to implement a user level daemon which
+  periodically reads those statistical counters and computes the rate.
+
+  Currently, the measurement is activated by slow timer handler. Hope
+  this measurement will not introduce too much load.
+
+  We measure rate during the last 8 seconds every 2 seconds:
+
+    avgrate = avgrate*(1-W) + rate*W
+
+    where W = 2^(-2)
+
+  NOTES.
+
+  * The stored value for average bps is scaled by 2^5, so that maximal
+    rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
+
+  * A lot of code is taken from net/sched/estimator.c
+ */
+
+/*
+ * Sum the per-cpu counters into "sum": index 0 overwrites, others add.
+ * Byte counters are read inside the u64_stats fetch/retry loop.
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+				 struct ip_vs_cpu_stats *stats)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+		unsigned int start;
+		__u64 inbytes, outbytes;
+		if (i) {
+			sum->conns += s->ustats.conns;
+			sum->inpkts += s->ustats.inpkts;
+			sum->outpkts += s->ustats.outpkts;
+			do {
+				start = u64_stats_fetch_begin(&s->syncp);
+				inbytes = s->ustats.inbytes;
+				outbytes = s->ustats.outbytes;
+			} while (u64_stats_fetch_retry(&s->syncp, start));
+			sum->inbytes += inbytes;
+			sum->outbytes += outbytes;
+		} else {
+			sum->conns = s->ustats.conns;
+			sum->inpkts = s->ustats.inpkts;
+			sum->outpkts = s->ustats.outpkts;
+			do {
+				start = u64_stats_fetch_begin(&s->syncp);
+				sum->inbytes = s->ustats.inbytes;
+				sum->outbytes = s->ustats.outbytes;
+			} while (u64_stats_fetch_retry(&s->syncp, start));
+		}
+	}
+}
+
+/* Timer handler, arg is the netns: update every estimator's averages. */
+static void estimation_timer(unsigned long arg)
+{
+	struct ip_vs_estimator *e;
+	struct ip_vs_stats *s;
+	u32 n_conns;
+	u32 n_inpkts, n_outpkts;
+	u64 n_inbytes, n_outbytes;
+	u32 rate;
+	struct net *net = (struct net *)arg;
+	struct netns_ipvs *ipvs;
+
+	ipvs = net_ipvs(net);
+	spin_lock(&ipvs->est_lock);
+	list_for_each_entry(e, &ipvs->est_list, list) {
+		s = container_of(e, struct ip_vs_stats, est);
+
+		spin_lock(&s->lock);
+		ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
+		n_conns = s->ustats.conns;
+		n_inpkts = s->ustats.inpkts;
+		n_outpkts = s->ustats.outpkts;
+		n_inbytes = s->ustats.inbytes;
+		n_outbytes = s->ustats.outbytes;
+
+		/* scaled by 2^10, but divided by 2 seconds: hence << 9 */
+		rate = (n_conns - e->last_conns) << 9;
+		e->last_conns = n_conns;
+		e->cps += ((long)rate - (long)e->cps) >> 2;
+
+		rate = (n_inpkts - e->last_inpkts) << 9;
+		e->last_inpkts = n_inpkts;
+		e->inpps += ((long)rate - (long)e->inpps) >> 2;
+
+		rate = (n_outpkts - e->last_outpkts) << 9;
+		e->last_outpkts = n_outpkts;
+		e->outpps += ((long)rate - (long)e->outpps) >> 2;
+		/* byte rates: scaled by 2^5, divided by 2 seconds: << 4 */
+		rate = (n_inbytes - e->last_inbytes) << 4;
+		e->last_inbytes = n_inbytes;
+		e->inbps += ((long)rate - (long)e->inbps) >> 2;
+
+		rate = (n_outbytes - e->last_outbytes) << 4;
+		e->last_outbytes = n_outbytes;
+		e->outbps += ((long)rate - (long)e->outbps) >> 2;
+		spin_unlock(&s->lock);
+	}
+	spin_unlock(&ipvs->est_lock);
+	mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
+}
+
+/* Link the estimator embedded in "stats" onto this netns' est_list. */
+void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+	struct list_head *node = &stats->est.list;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	INIT_LIST_HEAD(node);
+	spin_lock_bh(&ipvs->est_lock);
+	list_add(node, &ipvs->est_list);
+	spin_unlock_bh(&ipvs->est_lock);
+}
+
+/* Unlink the estimator of "stats" from est_list under est_lock. */
+void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	spin_lock_bh(&ipvs->est_lock);
+	list_del(&stats->est.list);
+	spin_unlock_bh(&ipvs->est_lock);
+}
+
+/* Reset the estimator; the caller must hold the stats->lock lock. */
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_stats_user *cur = &stats->ustats;
+	struct ip_vs_estimator *e = &stats->est;
+
+	/* the next rate delta starts from the current totals */
+	e->last_conns = cur->conns;
+	e->last_inpkts = cur->inpkts;
+	e->last_outpkts = cur->outpkts;
+	e->last_inbytes = cur->inbytes;
+	e->last_outbytes = cur->outbytes;
+
+	/* forget the averaged rates */
+	e->cps = e->inpps = e->outpps = 0;
+	e->inbps = e->outbps = 0;
+}
+
+/* Get decoded rates: undo the 2^10 (cps/pps) and 2^5 (bps) scaling */
+void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
+			  struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *e = &stats->est;
+
+	dst->cps = (e->cps + 0x1FF) >> 10;
+	dst->inpps = (e->inpps + 0x1FF) >> 10;
+	dst->outpps = (e->outpps + 0x1FF) >> 10;
+	dst->inbps = (e->inbps + 0xF) >> 5;
+	dst->outbps = (e->outbps + 0xF) >> 5;
+}
+
+int __net_init __ip_vs_estimator_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	INIT_LIST_HEAD(&ipvs->est_list);
+	spin_lock_init(&ipvs->est_lock);
+	setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+	mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);	/* first run in 2s */
+	return 0;
+}
+
+void __net_exit __ip_vs_estimator_cleanup(struct net *net)
+{
+	del_timer_sync(&net_ipvs(net)->est_timer); /* stop the netns timer */
+}
+/* Module-wide hooks are empty: all estimator state is per netns. */
+int __init ip_vs_estimator_init(void)
+{
+	return 0;
+}
+
+void ip_vs_estimator_cleanup(void)
+{
+}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
new file mode 100644
index 00000000..af63553f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -0,0 +1,480 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ *		IP_MASQ_FTP ftp masquerading module
+ *
+ * Version:	@(#)ip_masq_ftp.c 0.04   02/05/96
+ *
+ * Author:	Wouter Gadeyne
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/gfp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+
+/*
+ * Ports (up to IP_VS_APP_MAX_PORTS) handled by the helper, settable as a
+ * module parameter.  Slot 0 defaults to 21; slots left at 0 are unused.
+ */
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, NULL, 0);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/*	Marker: cp->app_data points here after a PASV command was seen */
+static int ip_vs_ftp_pasv;
+
+/* init_conn hook of the ftp application; always returns 0. */
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	/* We use connection tracking for the command connection */
+	cp->flags |= IP_VS_CONN_F_NFCT;
+	return 0;
+}
+
+/* done_conn hook: nothing to release per connection. */
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp" found in
+ * [data, data_limit), started with the "pattern" (plen bytes, compared
+ * case-insensitively) and terminated with the "term" character.
+ * <addr,port> is in network order; *start and *end delimit the numbers.
+ * Returns 1 on success, 0 if the pattern does not match, and -1 if the
+ * data is a partial match, lacks the terminator or is malformed.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+				  const char *pattern, size_t plen, char term,
+				  __be32 *addr, __be16 *port,
+				  char **start, char **end)
+{
+	unsigned char p[6];
+	int i = 0;
+
+	/* an inverted range would defeat the unsigned comparison below */
+	if (data > data_limit)
+		return 0;
+	if (data_limit - data < plen)	/* too short: partial match? */
+		return strnicmp(data, pattern, data_limit - data) ? 0 : -1;
+	if (strnicmp(data, pattern, plen) != 0)
+		return 0;
+	*start = data + plen;
+
+	/* test the bound first: *data_limit itself must never be read */
+	for (data = *start; data != data_limit && *data != term; data++)
+		;
+	if (data == data_limit)
+		return -1;
+	*end = data;
+
+	memset(p, 0, sizeof(p));
+	for (data = *start; data != *end; data++) {
+		if (*data >= '0' && *data <= '9') {
+			p[i] = p[i]*10 + *data - '0';
+		} else if (*data == ',' && i < 5) {
+			i++;
+		} else {
+			/* unexpected character */
+			return -1;
+		}
+	}
+
+	if (i != 5)
+		return -1;
+
+	*addr = get_unaligned((__be32 *)p);
+	*port = get_unaligned((__be16 *)(p + 4));
+	return 1;
+}
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port.  Mark the current connection entry as a control channel
+ * of the new entry. All this work is just so that the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			 struct sk_buff *skb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_limit;
+	char *start, *end;
+	union nf_inet_addr from;
+	__be16 port;
+	struct ip_vs_conn *n_cp;
+	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+	unsigned buf_len;
+	int ret = 0;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	if (cp->app_data == &ip_vs_ftp_pasv) {
+		iph = ip_hdr(skb);
+		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+		data = (char *)th + (th->doff << 2);
+		data_limit = skb_tail_pointer(skb);
+
+		if (ip_vs_ftp_get_addrport(data, data_limit,
+					   SERVER_STRING,
+					   sizeof(SERVER_STRING)-1, ')',
+					   &from.ip, &port,
+					   &start, &end) != 1)
+			return 1;
+
+		IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+			  &from.ip, ntohs(port), &cp->caddr.ip, 0);
+
+		/*
+		 * Now update or create a connection entry for it
+		 */
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+					      iph->protocol, &from, port,
+					      &cp->caddr, 0, &p);
+			n_cp = ip_vs_conn_out_get(&p);
+		}
+		if (!n_cp) {
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+					      AF_INET, IPPROTO_TCP, &cp->caddr,
+					      0, &cp->vaddr, port, &p);
+			n_cp = ip_vs_conn_new(&p, &from, port,
+					      IP_VS_CONN_F_NO_CPORT |
+					      IP_VS_CONN_F_NFCT,
+					      cp->dest, skb->mark);
+			if (!n_cp)
+				return 0;
+
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
+
+		/*
+		 * Replace the old passive address with the new one
+		 */
+		from.ip = n_cp->vaddr.ip;
+		port = n_cp->vport;
+		snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
+			 ((unsigned char *)&from.ip)[0],
+			 ((unsigned char *)&from.ip)[1],
+			 ((unsigned char *)&from.ip)[2],
+			 ((unsigned char *)&from.ip)[3],
+			 ntohs(port) >> 8,
+			 ntohs(port) & 0xFF);
+
+		buf_len = strlen(buf);
+
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+			/* If mangling fails this function will return 0
+			 * which will cause the packet to be dropped.
+			 * Mangling can only fail under memory pressure,
+			 * hopefully it will succeed on the retransmitted
+			 * packet.
+			 */
+			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						       start-data, end-start,
+						       buf, buf_len);
+			if (ret) {
+				ip_vs_nfct_expect_related(skb, ct, n_cp,
+							  IPPROTO_TCP, 0, 0);
+				if (skb->ip_summed == CHECKSUM_COMPLETE)
+					skb->ip_summed = CHECKSUM_UNNECESSARY;
+				/* csum is updated */
+				ret = 1;
+			}
+		}
+
+		/*
+		 * Not setting 'diff' is intentional, otherwise the sequence
+		 * would be adjusted twice.
+		 */
+
+		net = skb_net(skb);
+		cp->app_data = NULL;
+		ip_vs_tcp_conn_listen(net, n_cp);
+		ip_vs_conn_put(n_cp);
+		return ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ *      "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			struct sk_buff *skb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_start, *data_limit;
+	char *start, *end;
+	union nf_inet_addr to;
+	__be16 port;
+	struct ip_vs_conn *n_cp;
+	struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
+	/* no diff required for incoming packets */
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	/*
+	 * Detecting whether it is passive
+	 */
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Since there may be OPTIONS in the TCP packet and the HLEN is
+	   the length of the header in 32-bit multiples, it is accurate
+	   to calculate data address by th+HLEN*4 */
+	data = data_start = (char *)th + (th->doff << 2);
+	data_limit = skb_tail_pointer(skb);
+
+	while (data <= data_limit - 6) {
+		if (strnicmp(data, "PASV\r\n", 6) == 0) {
+			/* Passive mode on */
+			IP_VS_DBG(7, "got PASV at %td of %td\n",
+				  data - data_start,
+				  data_limit - data_start);
+			cp->app_data = &ip_vs_ftp_pasv;
+			return 1;
+		}
+		data++;
+	}
+
+	/*
+	 * To support virtual FTP server, the scenario is as follows:
+	 *       FTP client ----> Load Balancer ----> FTP server
+	 * First detect the port number in the application data,
+	 * then create a new connection entry for the coming data
+	 * connection.
+	 */
+	if (ip_vs_ftp_get_addrport(data_start, data_limit,
+				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+				   '\r', &to.ip, &port,
+				   &start, &end) != 1)
+		return 1;
+
+	IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+
+	/* Passive mode off */
+	cp->app_data = NULL;
+
+	/*
+	 * Now update or create a connection entry for it
+	 */
+	IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
+		  ip_vs_proto_name(iph->protocol),
+		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+				      iph->protocol, &to, port, &cp->vaddr,
+				      htons(ntohs(cp->vport)-1), &p);
+		n_cp = ip_vs_conn_in_get(&p);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(&p, &cp->daddr,
+					      htons(ntohs(cp->dport)-1),
+					      IP_VS_CONN_F_NFCT, cp->dest,
+					      skb->mark);
+			if (!n_cp)
+				return 0;
+
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
+	}
+
+	/*
+	 *	Move tunnel to listen state
+	 */
+	net = skb_net(skb);
+	ip_vs_tcp_conn_listen(net, n_cp);
+	ip_vs_conn_put(n_cp);
+
+	return 1;
+}
+
+/* The "ftp" application helper; copied per netns by __ip_vs_ftp_init() */
+static struct ip_vs_app ip_vs_ftp = {
+	.name =		"ftp",
+	.type =		IP_VS_APP_TYPE_FTP,
+	.protocol =	IPPROTO_TCP,
+	.module =	THIS_MODULE,
+	.incs_list =	LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+	.init_conn =	ip_vs_ftp_init_conn,
+	.done_conn =	ip_vs_ftp_done_conn,
+	.bind_conn =	NULL,
+	.unbind_conn =	NULL,
+	.pkt_out =	ip_vs_ftp_out,
+	.pkt_in =	ip_vs_ftp_in,
+};
+
+/*
+ *	per netns init: register a private copy of ip_vs_ftp on each port
+ */
+static int __net_init __ip_vs_ftp_init(struct net *net)
+{
+	int i, ret;
+	struct ip_vs_app *app;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL);
+	if (!app)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&app->a_list);
+	INIT_LIST_HEAD(&app->incs_list);
+	ipvs->ftp_app = app;
+
+	ret = register_ip_vs_app(net, app);
+	if (ret)
+		goto err_exit;
+
+	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+		if (!ports[i])	/* unused slot */
+			continue;
+		ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
+		if (ret)
+			goto err_unreg;
+		pr_info("%s: loaded support on port[%d] = %d\n",
+			app->name, i, ports[i]);
+	}
+	return 0;
+
+err_unreg:
+	unregister_ip_vs_app(net, app);
+err_exit:
+	kfree(ipvs->ftp_app);
+	return ret;
+}
+/*
+ *	netns exit: unregister and free this netns' copy of the application
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	unregister_ip_vs_app(net, ipvs->ftp_app);
+	kfree(ipvs->ftp_app);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+	.init = __ip_vs_ftp_init,	/* registers the per-netns app copy */
+	.exit = __ip_vs_ftp_exit,	/* unregisters and frees it */
+};
+
+/* Module load: register the per-netns init/exit operations. */
+int __init ip_vs_ftp_init(void)
+{
+	int rv = register_pernet_subsys(&ip_vs_ftp_ops);
+
+	return rv;
+}
+
+/*
+ *	ip_vs_ftp finish: unregister the per-netns operations.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+	unregister_pernet_subsys(&ip_vs_ftp_ops);
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
new file mode 100644
index 00000000..87e40ea7
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -0,0 +1,625 @@
+/*
+ * IPVS:        Locality-Based Least-Connection scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Martin Hamilton         :    fixed the terrible locking bugs
+ *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
+ *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
+ *     Wensong Zhang           :    added doing full expiration check to
+ *                                   collect stale entries of 24+ hours when
+ *                                   no partial expire check in a half hour
+ *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
+ *                                   to avoid the possible race between timer
+ *                                   handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ *       if cachenode[dest_ip] is null then
+ *               n, cachenode[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- cachenode[dest_ip];
+ *               if (n is dead) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                 n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ *       return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblc entries,
+ *    when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+#define DEFAULT_EXPIRATION	(24*60*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+
+
+/*
+ *     for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS      10
+#endif
+#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ *      An IPVS lblc entry caches the real server last chosen for one
+ *      destination IP address
+ */
+struct ip_vs_lblc_entry {
+	struct list_head        list;
+	int			af;		/* address family */
+	union nf_inet_addr      addr;           /* destination IP address */
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+	unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblc hash table, one per service (kept in svc->sched_data)
+ */
+struct ip_vs_lblc_table {
+	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
+	atomic_t                entries;        /* number of entries */
+	int                     max_size;       /* entry count that triggers GC */
+	struct timer_list       periodic_timer; /* collect stale entries */
+	int                     rover;          /* rover for expire check */
+	int                     counter;        /* timer runs without expire */
+};
+
+
+/*
+ *      IPVS LBLC sysctl table (template; .data is filled in per netns)
+ */
+#ifdef CONFIG_SYSCTL
+static ctl_table vs_vars_table[] = {
+	{
+		.procname	= "lblc_expiration",
+		.data		= NULL,	/* set in __ip_vs_lblc_init() */
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#endif
+
+/* Unlink "en" from its bucket, drop its dest reference and free it. */
+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+{
+	struct ip_vs_dest *dest = en->dest;
+
+	list_del(&en->list);
+	/* dest is not freed: its service or the trash list still refers to it */
+	atomic_dec(&dest->refcnt);
+	kfree(en);
+}
+
+/*
+ *	Returns hash value for IPVS LBLC entry: the address (IPv6 words
+ *	XOR-folded to 32 bits) times 2654435761, masked to the table size
+ */
+static inline unsigned
+ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ *	Hash an entry in the ip_vs_lblc_table.
+ *	Adds it to its bucket and bumps the entry count; returns nothing.
+ */
+static void
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+	unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr);
+
+	list_add(&en->list, &tbl->bucket[hash]);
+	atomic_inc(&tbl->entries);
+}
+
+
+/*
+ *  Look up the entry cached for "addr" in its hash bucket; NULL when there
+ *  is none.  Called under read lock
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
+	       const union nf_inet_addr *addr)
+{
+	struct list_head *head = &tbl->bucket[ip_vs_lblc_hashkey(af, addr)];
+	struct ip_vs_lblc_entry *en;
+
+	list_for_each_entry(en, head, list) {
+		if (ip_vs_addr_equal(af, &en->addr, addr))
+			return en;
+	}
+	return NULL;
+}
+
+
+/*
+ * Point the cache entry for "daddr" at "dest", allocating and hashing a new
+ * entry if needed (NULL when out of memory). Called under write lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblc_entry *en;
+
+	en = ip_vs_lblc_get(dest->af, tbl, daddr);
+	if (en) {
+		if (en->dest != dest) {
+			atomic_dec(&en->dest->refcnt);
+			atomic_inc(&dest->refcnt);
+			en->dest = dest;
+		}
+		return en;
+	}
+
+	en = kmalloc(sizeof(*en), GFP_ATOMIC);
+	if (!en) {
+		pr_err("%s(): no memory\n", __func__);
+		return NULL;
+	}
+
+	en->af = dest->af;
+	ip_vs_addr_copy(dest->af, &en->addr, daddr);
+	en->lastuse = jiffies;
+	atomic_inc(&dest->refcnt);
+	en->dest = dest;
+	ip_vs_lblc_hash(tbl, en);
+	return en;
+}
+
+
+/*
+ *      Free every entry in every bucket of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+{
+	struct list_head *head = tbl->bucket;
+	struct list_head *last = head + IP_VS_LBLC_TAB_SIZE;
+	struct ip_vs_lblc_entry *cur, *tmp;
+
+	for (; head != last; head++)
+		list_for_each_entry_safe(cur, tmp, head, list) {
+			ip_vs_lblc_free(cur);
+			atomic_dec(&tbl->entries);
+		}
+}
+
+static int sysctl_lblc_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	return ipvs->sysctl_lblc_expiration;	/* per-netns tunable */
+#else	/* !CONFIG_SYSCTL: fixed default */
+	return DEFAULT_EXPIRATION;
+#endif
+}
+
+/* Full sweep: free each entry idle for at least the lblc expiration time. */
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct ip_vs_lblc_entry *en, *nxt;
+	unsigned long now = jiffies;
+	int i, j;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			unsigned long expires;
+			expires = en->lastuse + sysctl_lblc_expiration(svc);
+			if (!time_before(now, expires)) {
+				ip_vs_lblc_free(en);
+				atomic_dec(&tbl->entries);
+			}
+		}
+		write_unlock(&svc->sched_lock);
+	}
+	tbl->rover = j;
+}
+
+
+/*
+ *      Periodic timer handler for the IPVS lblc table.
+ *      It collects stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need a more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	int goal;
+	int i, j;
+	struct ip_vs_lblc_entry *en, *nxt;
+
+	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+		/* do full expiration check */
+		ip_vs_lblc_full_check(svc);
+		tbl->counter = 1;
+		goto out;
+	}
+
+	if (atomic_read(&tbl->entries) <= tbl->max_size) {
+		tbl->counter++;
+		goto out;
+	}
+	/* target: 4/3 of the excess, capped at max_size/2 */
+	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+	if (goal > tbl->max_size/2)
+		goal = tbl->max_size/2;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+				continue;
+
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+			goal--;
+		}
+		write_unlock(&svc->sched_lock);
+		if (goal <= 0)
+			break;
+	}
+	tbl->rover = j;
+
+  out:
+	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+/*
+ *      Scheduler init_service hook: allocate and initialise the per-service
+ *      LBLC table and start its garbage-collection timer.
+ *      Returns 0 on success, -ENOMEM if the table cannot be allocated.
+ */
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblc_table *tbl;
+
+	/* Allocate the ip_vs_lblc_table for this service */
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+	if (tbl == NULL) {
+		pr_err("%s(): no memory\n", __func__);
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+		  "current service\n", sizeof(*tbl));
+
+	/* kmalloc() does not zero the table, so set every field explicitly */
+	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		INIT_LIST_HEAD(&tbl->bucket[i]);
+	}
+	atomic_set(&tbl->entries, 0);
+	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+
+	/* Hook periodic timer for garbage collection */
+	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+	return 0;
+}
+
+/* done_service hook: stop the timer, then free all entries and the table */
+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+	/* remove periodic timer */
+	del_timer_sync(&tbl->periodic_timer);
+
+	/* got to clean up table entries here */
+	ip_vs_lblc_flush(tbl);
+
+	/* release the table itself (sizeof below does not dereference it) */
+	kfree(tbl);
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+		  sizeof(*tbl));
+
+	return 0;
+}
+
+/* Weighted least-connection pick over svc->destinations; NULL if none fits */
+static inline struct ip_vs_dest *
+__ip_vs_lblc_schedule(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	/*
+	 * We use the following formula to estimate the load:
+	 *                (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *                h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		if (atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_dest_conn_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = ip_vs_dest_conn_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(least->af, &least->addr),
+		      ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ *   Returns 1 when dest has more active connections than its weight and
+ *   some server of the service runs below half of its weight, else 0.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *d;
+
+	/* still within its weight: not overloaded */
+	if (atomic_read(&dest->activeconns) <= atomic_read(&dest->weight))
+		return 0;
+
+	list_for_each_entry(d, &svc->destinations, n_list) {
+		if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ *    Locality-Based (weighted) Least-Connection scheduling: reuse the
+ *    server cached for the destination address, else pick and cache one
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblc_entry *en;
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
+	en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/*
+		 * If the destination is not available, i.e. it's in the trash,
+		 * we must ignore it, as it may be removed from under our feet,
+		 * if someone drops our reference count. Our caller only makes
+		 * sure that destinations, that are not in the trash, are not
+		 * moved to the trash, while we are scheduling. But anyone can
+		 * free up entries from the trash at any time.
+		 */
+
+		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
+			dest = en->dest;
+	}
+	read_unlock(&svc->sched_lock);
+
+	/* If the destination has a weight and is not overloaded, use it */
+	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+		goto out;
+
+	/* No cache entry or it is invalid, time to schedule */
+	dest = __ip_vs_lblc_schedule(svc);
+	if (!dest) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblc_new(tbl, &iph.daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
+	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS LBLC Scheduler structure (no update_service hook is set)
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
+{
+	.name =			"lblc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+	.init_service =		ip_vs_lblc_init_svc,
+	.done_service =		ip_vs_lblc_done_svc,
+	.schedule =		ip_vs_lblc_schedule,
+};
+
+/*
+ *  per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	/* init_net uses the static table, other netns get a private copy */
+	if (net_eq(net, &init_net)) {
+		ipvs->lblc_ctl_table = vs_vars_table;
+	} else {
+		ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+						sizeof(vs_vars_table),
+						GFP_KERNEL);
+		if (ipvs->lblc_ctl_table == NULL)
+			return -ENOMEM;
+	}
+	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
+	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+	ipvs->lblc_ctl_header =
+		register_net_sysctl_table(net, net_vs_ctl_path,
+					  ipvs->lblc_ctl_table);
+	if (ipvs->lblc_ctl_header)
+		return 0;
+
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->lblc_ctl_table);
+	return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+
+	if (!net_eq(net, &init_net))	/* init_net used the static table */
+		kfree(ipvs->lblc_ctl_table);
+}
+
+#else
+/* Without CONFIG_SYSCTL there is no per-netns state to set up. */
+static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblc_ops = {
+	.init = __ip_vs_lblc_init,	/* per-netns sysctl setup */
+	.exit = __ip_vs_lblc_exit,	/* per-netns sysctl teardown */
+};
+
+static int __init ip_vs_lblc_init(void)
+{
+	int ret = register_pernet_subsys(&ip_vs_lblc_ops);
+
+	if (ret)
+		return ret;
+
+	/* undo the pernet registration if the scheduler cannot register */
+	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+	if (ret)
+		unregister_pernet_subsys(&ip_vs_lblc_ops);
+	return ret;
+}
+/* Module unload: unregister the scheduler, then the pernet operations. */
+static void __exit ip_vs_lblc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+	unregister_pernet_subsys(&ip_vs_lblc_ops);
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
new file mode 100644
index 00000000..90f618ab
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,821 @@
+/*
+ * IPVS:        Locality-Based Least-Connection with Replication scheduler
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Julian Anastasov        :    Added the missing (dest->weight>0)
+ *                                  condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ *       if serverSet[dest_ip] is null then
+ *               n, serverSet[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- {least-conn (alive) node in serverSet[dest_ip]};
+ *               if (n is null) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                   n <- {weighted least-conn node};
+ *                   add n to serverSet[dest_ip];
+ *               if |serverSet[dest_ip]| > 1 AND
+ *                   now - serverSet[dest_ip].lastMod > T then
+ *                   m <- {most conn node in serverSet[dest_ip]};
+ *                   remove m from serverSet[dest_ip];
+ *       if serverSet[dest_ip] changed then
+ *               serverSet[dest_ip].lastMod <- now;
+ *
+ *       return n;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblcr entries,
+ *    when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+#define DEFAULT_EXPIRATION	(24*60*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+
+/*
+ *     for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
+#endif
+#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS destination set structure and operations
+ */
+struct ip_vs_dest_set_elem {
+	struct list_head	list;          /* list link */
+	struct ip_vs_dest       *dest;          /* destination server */
+};
+
+struct ip_vs_dest_set {
+	atomic_t                size;           /* set size */
+	unsigned long           lastmod;        /* last modified time */
+	struct list_head	list;           /* destination list */
+	rwlock_t	        lock;           /* lock for this list */
+};
+
+
+static struct ip_vs_dest_set_elem *
+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_set_elem *e;
+
+	list_for_each_entry(e, &set->list, list) {
+		if (e->dest == dest)
+			/* already existed */
+			return NULL;
+	}
+
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
+	if (e == NULL) {
+		pr_err("%s(): no memory\n", __func__);
+		return NULL;
+	}
+
+	atomic_inc(&dest->refcnt);
+	e->dest = dest;
+
+	list_add(&e->list, &set->list);
+	atomic_inc(&set->size);
+
+	set->lastmod = jiffies;
+	return e;
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_set_elem *e;
+
+	list_for_each_entry(e, &set->list, list) {
+		if (e->dest == dest) {
+			/* HIT */
+			atomic_dec(&set->size);
+			set->lastmod = jiffies;
+			atomic_dec(&e->dest->refcnt);
+			list_del(&e->list);
+			kfree(e);
+			break;
+		}
+	}
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+	struct ip_vs_dest_set_elem *e, *ep;
+
+	write_lock(&set->lock);
+	list_for_each_entry_safe(e, ep, &set->list, list) {
+		/*
+		 * We don't kfree dest because it is referred either
+		 * by its service or by the trash dest list.
+		 */
+		atomic_dec(&e->dest->refcnt);
+		list_del(&e->list);
+		kfree(e);
+	}
+	write_unlock(&set->lock);
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+	register struct ip_vs_dest_set_elem *e;
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	if (set == NULL)
+		return NULL;
+
+	/* select the first destination server, whose weight > 0 */
+	list_for_each_entry(e, &set->list, list) {
+		least = e->dest;
+		if (least->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		if ((atomic_read(&least->weight) > 0)
+		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+			loh = ip_vs_dest_conn_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/* find the destination with the weighted least load */
+  nextstage:
+	list_for_each_entry(e, &set->list, list) {
+		dest = e->dest;
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = ip_vs_dest_conn_overhead(dest);
+		if ((loh * atomic_read(&dest->weight) >
+		     doh * atomic_read(&least->weight))
+		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      __func__,
+		      IP_VS_DBG_ADDR(least->af, &least->addr),
+		      ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+	return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+	register struct ip_vs_dest_set_elem *e;
+	struct ip_vs_dest *dest, *most;
+	int moh, doh;
+
+	if (set == NULL)
+		return NULL;
+
+	/* select the first destination server, whose weight > 0 */
+	list_for_each_entry(e, &set->list, list) {
+		most = e->dest;
+		if (atomic_read(&most->weight) > 0) {
+			moh = ip_vs_dest_conn_overhead(most);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/* find the destination with the weighted most load */
+  nextstage:
+	list_for_each_entry(e, &set->list, list) {
+		dest = e->dest;
+		doh = ip_vs_dest_conn_overhead(dest);
+		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+		if ((moh * atomic_read(&dest->weight) <
+		     doh * atomic_read(&most->weight))
+		    && (atomic_read(&dest->weight) > 0)) {
+			most = dest;
+			moh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      __func__,
+		      IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
+		      atomic_read(&most->activeconns),
+		      atomic_read(&most->refcnt),
+		      atomic_read(&most->weight), moh);
+	return most;
+}
+
+
+/*
+ *      IPVS lblcr entry represents an association between destination
+ *      IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+	struct list_head        list;
+	int			af;		/* address family */
+	union nf_inet_addr      addr;           /* destination IP address */
+	struct ip_vs_dest_set   set;            /* destination server set */
+	unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
+	atomic_t                entries;        /* number of entries */
+	int                     max_size;       /* maximum size of entries */
+	struct timer_list       periodic_timer; /* collect stale entries */
+	int                     rover;          /* rover for expire check */
+	int                     counter;        /* counter for no expire */
+};
+
+
+#ifdef CONFIG_SYSCTL
+/*
+ *      IPVS LBLCR sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+	{
+		.procname	= "lblcr_expiration",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#endif
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+	list_del(&en->list);
+	ip_vs_dest_set_eraseall(&en->set);
+	kfree(en);
+}
+
+
+/*
+ *	Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned
+ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ *	Hash an entry in the ip_vs_lblcr_table and count it in tbl->entries.
+ *	Cannot fail and returns nothing.
+ */
+static void
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+	unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
+
+	list_add(&en->list, &tbl->bucket[hash]);
+	atomic_inc(&tbl->entries);
+}
+
+
+/*
+ *  Get ip_vs_lblcr_entry associated with supplied parameters. Called with
+ *  svc->sched_lock held, for reading or writing.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
+		const union nf_inet_addr *addr)
+{
+	unsigned hash = ip_vs_lblcr_hashkey(af, addr);
+	struct ip_vs_lblcr_entry *en;
+
+	list_for_each_entry(en, &tbl->bucket[hash], list)
+		if (ip_vs_addr_equal(af, &en->addr, addr))
+			return en;
+
+	return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a set of servers. Called under write lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
+		struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblcr_entry *en;
+
+	en = ip_vs_lblcr_get(dest->af, tbl, daddr);
+	if (!en) {
+		en = kmalloc(sizeof(*en), GFP_ATOMIC);
+		if (!en) {
+			pr_err("%s(): no memory\n", __func__);
+			return NULL;
+		}
+
+		en->af = dest->af;
+		ip_vs_addr_copy(dest->af, &en->addr, daddr);
+		en->lastuse = jiffies;
+
+		/* initialize its dest set */
+		atomic_set(&(en->set.size), 0);
+		INIT_LIST_HEAD(&en->set.list);
+		rwlock_init(&en->set.lock);
+
+		ip_vs_lblcr_hash(tbl, en);
+	}
+
+	write_lock(&en->set.lock);
+	ip_vs_dest_set_insert(&en->set, dest);
+	write_unlock(&en->set.lock);
+
+	return en;
+}
+
+
+/*
+ *      Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+{
+	int i;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	/* No locking required, only called during cleanup. */
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+			ip_vs_lblcr_free(en);
+		}
+	}
+}
+
+static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	return ipvs->sysctl_lblcr_expiration;
+#else
+	return DEFAULT_EXPIRATION;
+#endif
+}
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	int i, j;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_after(en->lastuse +
+				       sysctl_lblcr_expiration(svc), now))
+				continue;
+
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&svc->sched_lock);
+	}
+	tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblcr table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+	unsigned long now = jiffies;
+	int goal;
+	int i, j;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+		/* do full expiration check */
+		ip_vs_lblcr_full_check(svc);
+		tbl->counter = 1;
+		goto out;
+	}
+
+	if (atomic_read(&tbl->entries) <= tbl->max_size) {
+		tbl->counter++;
+		goto out;
+	}
+
+	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+	if (goal > tbl->max_size/2)
+		goal = tbl->max_size/2;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+		write_lock(&svc->sched_lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+				continue;
+
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+			goal--;
+		}
+		write_unlock(&svc->sched_lock);
+		if (goal <= 0)
+			break;
+	}
+	tbl->rover = j;
+
+  out:
+	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+/*
+ *      Per-service setup: allocate and fully initialise the LBLCR table and
+ *      arm the GC timer.  Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblcr_table *tbl;
+
+	/* Allocate the ip_vs_lblcr_table for this service */
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
+	if (tbl == NULL) {
+		pr_err("%s(): no memory\n", __func__);
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+		  "current service\n", sizeof(*tbl));
+
+	/* Initialize the hash buckets */
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		INIT_LIST_HEAD(&tbl->bucket[i]);
+	}
+	/* kmalloc() does not zero memory: the entry count must start at 0 */
+	atomic_set(&tbl->entries, 0);
+	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+
+	/* Hook periodic timer for garbage collection */
+	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+	return 0;
+}
+
+
+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+	/* remove periodic timer */
+	del_timer_sync(&tbl->periodic_timer);
+
+	/* got to clean up table entries here */
+	ip_vs_lblcr_flush(tbl);
+
+	/* release the table itself */
+	kfree(tbl);
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+		  sizeof(*tbl));
+
+	return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	/*
+	 * We use the following formula to estimate the load:
+	 *                (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *                h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		if (atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_dest_conn_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = ip_vs_dest_conn_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(least->af, &least->addr),
+		      ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ *   If this destination server is overloaded and there is a less loaded
+ *   server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+		struct ip_vs_dest *d;
+
+		list_for_each_entry(d, &svc->destinations, n_list) {
+			if (atomic_read(&d->activeconns)*2
+			    < atomic_read(&d->weight)) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+
+/*
+ *    Locality-Based Least-Connection with Replication scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblcr_entry *en;
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
+	en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/* Get the least loaded destination */
+		read_lock(&en->set.lock);
+		dest = ip_vs_dest_set_min(&en->set);
+		read_unlock(&en->set.lock);
+
+		/* More than one destination + enough time passed by, cleanup */
+		if (atomic_read(&en->set.size) > 1 &&
+				time_after(jiffies, en->set.lastmod +
+				sysctl_lblcr_expiration(svc))) {
+			struct ip_vs_dest *m;
+
+			write_lock(&en->set.lock);
+			m = ip_vs_dest_set_max(&en->set);
+			if (m)
+				ip_vs_dest_set_erase(&en->set, m);
+			write_unlock(&en->set.lock);
+		}
+
+		/* If the destination is not overloaded, use it */
+		if (dest && !is_overloaded(dest, svc)) {
+			read_unlock(&svc->sched_lock);
+			goto out;
+		}
+
+		/* The cache entry is invalid, time to schedule */
+		dest = __ip_vs_lblcr_schedule(svc);
+		if (!dest) {
+			ip_vs_scheduler_err(svc, "no destination available");
+			read_unlock(&svc->sched_lock);
+			return NULL;
+		}
+
+		/* Update our cache entry */
+		write_lock(&en->set.lock);
+		ip_vs_dest_set_insert(&en->set, dest);
+		write_unlock(&en->set.lock);
+	}
+	read_unlock(&svc->sched_lock);
+
+	if (dest)
+		goto out;
+
+	/* No cache entry, time to schedule */
+	dest = __ip_vs_lblcr_schedule(svc);
+	if (!dest) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblcr_new(tbl, &iph.daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
+	IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr),
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+	.name =			"lblcr",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+	.init_service =		ip_vs_lblcr_init_svc,
+	.done_service =		ip_vs_lblcr_done_svc,
+	.schedule =		ip_vs_lblcr_schedule,
+};
+
+/*
+ *  per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!net_eq(net, &init_net)) {
+		ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+						sizeof(vs_vars_table),
+						GFP_KERNEL);
+		if (ipvs->lblcr_ctl_table == NULL)
+			return -ENOMEM;
+	} else
+		ipvs->lblcr_ctl_table = vs_vars_table;
+	ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
+	ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+	ipvs->lblcr_ctl_header =
+		register_net_sysctl_table(net, net_vs_ctl_path,
+					  ipvs->lblcr_ctl_table);
+	if (!ipvs->lblcr_ctl_header) {
+		if (!net_eq(net, &init_net))
+			kfree(ipvs->lblcr_ctl_table);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->lblcr_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+	.init = __ip_vs_lblcr_init,
+	.exit = __ip_vs_lblcr_exit,
+};
+
+static int __init ip_vs_lblcr_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+	if (ret)
+		return ret;
+
+	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+	if (ret)
+		unregister_pernet_subsys(&ip_vs_lblcr_ops);
+	return ret;
+}
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+	unregister_pernet_subsys(&ip_vs_lblcr_ops);
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
new file mode 100644
index 00000000..f391819c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -0,0 +1,91 @@
+/*
+ * IPVS:        Least-Connection Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     added the ip_vs_lc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *	Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least = NULL;
+	unsigned int loh = 0, doh;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * Simply select the server with the least number of
+	 *        (activeconns<<5) + inactconns
+	 * Except whose weight is equal to zero.
+	 * If the weight is equal to zero, it means that the server is
+	 * quiesced, the existing connections to the server still get
+	 * served, but no new connection is assigned to the server.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+		    atomic_read(&dest->weight) == 0)
+			continue;
+		doh = ip_vs_dest_conn_overhead(dest);
+		if (!least || doh < loh) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	if (!least)
+		ip_vs_scheduler_err(svc, "no destination available");
+	else
+		IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
+			      "inactconns %d\n",
+			      IP_VS_DBG_ADDR(svc->af, &least->addr),
+			      ntohs(least->port),
+			      atomic_read(&least->activeconns),
+			      atomic_read(&least->inactconns));
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+	.name =			"lc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+	.schedule =		ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 00000000..f454c80d
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,294 @@
+/*
+ * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>		Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>	Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state
+
+/*
+ * Alter the reply tuple of a still-unconfirmed conntrack for NAT conn cp.
+ */
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple new_tuple;
+
+	if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+	    nf_ct_is_dying(ct))
+		return;
+
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return;
+
+	/* Alter reply only in original direction */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return;
+
+	/* The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	/* This will also take care of UDP and other protocols. */
+	if (outin) {		/* reply source becomes the real server */
+		new_tuple.src.u3 = cp->daddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.src.u.tcp.port = cp->dport;
+	} else {	/* reply destination becomes the virtual IP */
+		new_tuple.dst.u3 = cp->vaddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.dst.u.tcp.port = cp->vport;
+	}
+	IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		  "ctinfo=%d, old reply=" FMT_TUPLE
+		  ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+		  __func__, ct, ct->status, ctinfo,
+		  ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+		  ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+	nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+	struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_tuple *orig, new_reply;
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
+	struct net *net = nf_ct_net(ct);
+
+	if (exp->tuple.src.l3num != PF_INET)
+		return;
+
+	/*
+	 * We assume that no NF locks are held before this callback.
+	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+	 * expectations even if they use wildcard values, now we provide the
+	 * actual values from the newly created original conntrack direction.
+	 * The conntrack is confirmed when packet reaches IPVS hooks.
+	 */
+
+	/* RS->CLIENT */
+	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
+			      &orig->src.u3, orig->src.u.tcp.port,
+			      &orig->dst.u3, orig->dst.u.tcp.port, &p);
+	cp = ip_vs_conn_out_get(&p);
+	if (cp) {
+		/* Change reply CLIENT->RS to CLIENT->VS */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.dst.u3 = cp->vaddr;
+		new_reply.dst.u.tcp.port = cp->vport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+			  ", inout cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	/* CLIENT->VS */
+	cp = ip_vs_conn_in_get(&p);
+	if (cp) {
+		/* Change reply VS->CLIENT to RS->CLIENT */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.src.u3 = cp->daddr;
+		new_reply.src.u.tcp.port = cp->dport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		  " - unknown expect\n",
+		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	return;
+
+alter:
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+		nf_conntrack_alter_reply(ct, &new_reply);
+	ip_vs_conn_put(cp);
+	return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+			       struct ip_vs_conn *cp, u_int8_t proto,
+			       const __be16 port, int from_rs)
+{
+	struct nf_conntrack_expect *exp;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return;
+
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			from_rs ? &cp->daddr : &cp->caddr,
+			from_rs ? &cp->caddr : &cp->vaddr,
+			proto, port ? &port : NULL,
+			from_rs ? &cp->cport : &cp->vport);
+
+	exp->expectfn = ip_vs_nfct_expect_callback;
+
+	IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+		__func__, ct, ARG_TUPLE(&exp->tuple));
+	nf_ct_expect_related(exp);
+	nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct nf_conntrack_tuple tuple;
+
+	if (!cp->cport)
+		return;
+
+	tuple = (struct nf_conntrack_tuple) {
+		.dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+	tuple.src.u3 = cp->caddr;
+	tuple.src.u.all = cp->cport;
+	tuple.src.l3num = cp->af;
+	tuple.dst.u3 = cp->vaddr;
+	tuple.dst.u.all = cp->vport;
+
+	IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+		" for conn " FMT_CONN "\n",
+		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+	h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+				  &tuple);
+	if (h) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* Show what happens instead of calling nf_ct_kill() */
+		if (del_timer(&ct->timeout)) {
+			IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+				FMT_TUPLE "\n",
+				__func__, ct, ARG_TUPLE(&tuple));
+			if (ct->timeout.function)
+				ct->timeout.function(ct->timeout.data);
+		} else {
+			IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+				FMT_TUPLE "\n",
+				__func__, ct, ARG_TUPLE(&tuple));
+		}
+		nf_ct_put(ct);
+	} else {
+		IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+			__func__, ARG_TUPLE(&tuple));
+	}
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
new file mode 100644
index 00000000..984d9c13
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -0,0 +1,140 @@
+/*
+ * IPVS:        Never Queue scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimizes its expected delay (the Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline unsigned int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * Only the number of active connections enters the cost; it is
+	 * bumped by one before being returned.
+	 */
+	unsigned int active = atomic_read(&dest->activeconns);
+
+	return active + 1;
+}
+
+
+/*
+ *	Never Queue scheduling
+ *
+ *	Walks the destinations of @svc, skipping overloaded ones and those
+ *	with a zero weight.  The first destination found without any active
+ *	connection is returned at once; otherwise the destination with the
+ *	smallest overhead/weight ratio is chosen.  Returns NULL (after
+ *	reporting through ip_vs_scheduler_err()) when no destination is
+ *	eligible.
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least = NULL;
+	unsigned int loh = 0, doh;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *	(server expected overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+		    !atomic_read(&dest->weight))
+			continue;
+
+		doh = ip_vs_nq_dest_overhead(dest);
+
+		/* return the server directly if it is idle */
+		if (atomic_read(&dest->activeconns) == 0) {
+			least = dest;
+			loh = doh;
+			goto out;
+		}
+
+		/* cross-multiplied form of loh/least->weight > doh/dest->weight */
+		if (!least ||
+		    (loh * atomic_read(&dest->weight) >
+		     doh * atomic_read(&least->weight))) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	if (!least) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+  out:
+	IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/* Scheduler descriptor for "nq"; handed to (un)register_ip_vs_scheduler() */
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+	.name =			"nq",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+	.schedule =		ip_vs_nq_schedule,
+};
+
+
+/* Module entry point: register the NQ scheduler and return the result */
+static int __init ip_vs_nq_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+/* Module exit point: unregister the NQ scheduler (result is ignored) */
+static void __exit ip_vs_nq_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 00000000..5cf859cc
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,140 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock protecting the pe list */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/*
+ * Bind a service with a pe
+ *
+ * Only stores @pe in svc->pe; no reference counting is done here.
+ */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+	svc->pe = pe;
+}
+
+/*
+ * Unbind a service from its pe
+ *
+ * Only clears svc->pe; the pe itself is not released here.
+ */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+	svc->pe = NULL;
+}
+
+/*
+ * Get pe in the pe list by name
+ *
+ * On a hit the pe is returned with a reference held on its module (when
+ * it has one); NULL is returned when no pe of that name is registered.
+ */
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
+{
+	struct ip_vs_pe *cur, *found = NULL;
+
+	IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
+		  pe_name);
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	list_for_each_entry(cur, &ip_vs_pe, n_list) {
+		/* Pin the module first; a pe being deleted is skipped */
+		if (cur->module && !try_module_get(cur->module))
+			continue;
+
+		if (!strcmp(pe_name, cur->name)) {
+			/* HIT: the module reference stays with the caller */
+			found = cur;
+			break;
+		}
+
+		/* Not the one we look for, hand the reference back */
+		if (cur->module)
+			module_put(cur->module);
+	}
+
+	spin_unlock_bh(&ip_vs_pe_lock);
+	return found;
+}
+
+/*
+ * Lookup pe and try to load it if it doesn't exist
+ *
+ * When the first search misses, the module "ip_vs_pe_<name>" is requested
+ * and the search is repeated once.
+ */
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
+{
+	struct ip_vs_pe *pe = __ip_vs_pe_getbyname(name);
+
+	if (pe)
+		return pe;
+
+	/* Not registered yet: load the module and search again */
+	request_module("ip_vs_pe_%s", name);
+	return __ip_vs_pe_getbyname(name);
+}
+
+/*
+ * Register a pe in the pe list
+ *
+ * Returns 0 on success, or -EINVAL when @pe is already linked into a list
+ * or a pe with the same name is already registered.
+ */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	struct ip_vs_pe *cur;
+	int linked, duplicate = 0;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	linked = !list_empty(&pe->n_list);
+	if (!linked) {
+		/* The name must be unique within the pe list */
+		list_for_each_entry(cur, &ip_vs_pe, n_list) {
+			if (strcmp(cur->name, pe->name) == 0) {
+				duplicate = 1;
+				break;
+			}
+		}
+		/* Add it into the d-linked pe list */
+		if (!duplicate)
+			list_add(&pe->n_list, &ip_vs_pe);
+	}
+
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	if (linked || duplicate) {
+		/* undo the use count taken above before reporting */
+		ip_vs_use_count_dec();
+		if (linked)
+			pr_err("%s(): [%s] pe already linked\n",
+			       __func__, pe->name);
+		else
+			pr_err("%s(): [%s] pe already existed "
+			       "in the system\n", __func__, pe->name);
+		return -EINVAL;
+	}
+
+	pr_info("[%s] pe registered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/*
+ * Unregister a pe from the pe list
+ *
+ * Returns 0 on success or -EINVAL when @pe is not linked into a list.
+ */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	int linked;
+
+	spin_lock_bh(&ip_vs_pe_lock);
+	linked = !list_empty(&pe->n_list);
+	/* Remove it from the d-linked pe list */
+	if (linked)
+		list_del(&pe->n_list);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	if (!linked) {
+		pr_err("%s(): [%s] pe is not in the list. failed\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] pe unregistered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 00000000..13d607ae
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,171 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+/*
+ * Copy at most 64 bytes of a Call-ID (which need not be NUL-terminated)
+ * into @buf at offset *@idx, terminate it and advance *@idx past the
+ * terminator so that the next user of the buffer appends after it.
+ *
+ * Returns a pointer to the start of the copied string, or "" when the
+ * buffer has no room left (in which case *@idx is left untouched).
+ */
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	char *start;
+	size_t room, len = callid_len;
+
+	/* buf_len - *idx - 1 would wrap around once the buffer is full */
+	if (*idx < 0 || (size_t)*idx >= buf_len)
+		return "";
+
+	start = buf + *idx;
+	room = buf_len - *idx - 1;	/* keep one byte for the NUL */
+	if (len > 64)
+		len = 64;
+	if (len > room)
+		len = room;
+
+	memcpy(start, callid, len);
+	start[len] = '\0';
+	*idx += len + 1;
+
+	/* hand back the start of the string, not the advanced offset */
+	return start;
+}
+
+/*
+ * Convenience wrapper around ip_vs_dbg_callid() that operates on
+ * ip_vs_dbg_buf / ip_vs_dbg_idx, which must be in scope at the point of
+ * use (presumably provided by IP_VS_DBG_BUF() -- TODO confirm).
+ */
+#define IP_VS_DEBUG_CALLID(callid, len)					\
+	ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),		\
+			 callid, len, &ip_vs_dbg_idx)
+#endif
+
+/*
+ * Locate the Call-ID header value in a SIP message.
+ *
+ * @dptr:     start of the SIP payload
+ * @dataoff:  offset handed to ct_sip_get_header() as the search start
+ * @datalen:  length of the payload
+ * @matchoff: set to the offset of the Call-ID value
+ * @matchlen: set to the length of the Call-ID value
+ *
+ * Returns 0 when a usable Call-ID was found and -EINVAL otherwise: no
+ * Call-ID header at all, an empty value, a value longer than
+ * IP_VS_PEDATA_MAXLEN, or a value that is not followed by CR or LF
+ * inside the payload.
+ */
+static int get_callid(const char *dptr, unsigned int dataoff,
+		      unsigned int datalen,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	/* Find callid */
+	while (1) {
+		int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+					    SIP_HDR_CALL_ID, matchoff,
+					    matchlen);
+		if (ret > 0)
+			break;
+		/*
+		 * Zero means no Call-ID header was found, so matchoff and
+		 * matchlen carry nothing usable: this must not be reported
+		 * as success to the caller.
+		 */
+		if (!ret)
+			return -EINVAL;
+		/*
+		 * NOTE(review): on a negative return this relies on
+		 * ct_sip_get_header() having written *matchoff -- confirm.
+		 */
+		dataoff += *matchoff;
+	}
+
+	/* Empty callid is useless */
+	if (!*matchlen)
+		return -EINVAL;
+
+	/* Too large is useless */
+	if (*matchlen > IP_VS_PEDATA_MAXLEN)
+		return -EINVAL;
+
+	/* SIP headers are always followed by a line terminator */
+	if (*matchoff + *matchlen == datalen)
+		return -EINVAL;
+
+	/* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+	 * RFC 3261 allows only CRLF, we support both. */
+	if (*(dptr + *matchoff + *matchlen) != '\r' &&
+	    *(dptr + *matchoff + *matchlen) != '\n')
+		return -EINVAL;
+
+	IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+		      IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+		      *matchlen);
+	return 0;
+}
+
+/*
+ * Fill the persistence data of @p with the SIP Call-ID found in @skb.
+ *
+ * Only UDP packets that carry a payload are considered.  The skb is
+ * linearized so that the payload can be scanned as one flat buffer.
+ *
+ * Returns 0 on success with p->pe_data pointing to a freshly allocated
+ * copy of the Call-ID (p->pe_data_len bytes, not NUL-terminated), a
+ * negative value from skb_linearize() if that fails, -ENOMEM when the
+ * copy cannot be allocated, and -EINVAL in every other case.
+ */
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen, matchoff, matchlen;
+	const char *dptr;
+	int retc;
+
+	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+
+	/* No Data ? */
+	dataoff = iph.len + sizeof(struct udphdr);
+	if (dataoff >= skb->len)
+		return -EINVAL;
+
+	if ((retc=skb_linearize(skb)) < 0)
+		return retc;
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+
+	/*
+	 * dptr and datalen already describe the payload only, so the
+	 * search has to start at offset 0 of dptr; passing dataoff again
+	 * would skip the first dataoff bytes of the SIP message.
+	 */
+	if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure
+	 */
+	p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+
+	p->pe_data_len = matchlen;
+
+	return 0;
+}
+
+/*
+ * Tell whether connection template @ct matches the parameters in @p:
+ * same address family, client address, virtual address and port,
+ * protocol, the template flag set, and identical pe_data (the Call-ID).
+ */
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+				  struct ip_vs_conn *ct)
+
+{
+	bool same_endpoints, ret;
+
+	same_endpoints =
+		ct->af == p->af &&
+		ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+		/* protocol should only be IPPROTO_IP if
+		 * d_addr is a fwmark */
+		ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+				 p->vaddr, &ct->vaddr) &&
+		ct->vport == p->vport &&
+		ct->flags & IP_VS_CONN_F_TEMPLATE &&
+		ct->protocol == p->protocol;
+
+	/* the Call-ID is only compared once everything else agrees */
+	ret = same_endpoints &&
+	      ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+	      memcmp(ct->pe_data, p->pe_data, p->pe_data_len) == 0;
+
+	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      ret ? "hit" : "not hit");
+
+	return ret;
+}
+
+/*
+ * Hash key for a SIP connection template: jhash() over the pe_data bytes
+ * of @p, seeded with @initval.  @inverse is not used.
+ */
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval, bool inverse)
+{
+	return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+/*
+ * Copy the pe_data of @cp into @buf and return the number of bytes
+ * copied.  No terminator is appended and no bound is checked here; the
+ * caller has to supply a buffer of at least cp->pe_data_len bytes.
+ */
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	memcpy(buf, cp->pe_data, cp->pe_data_len);
+	return cp->pe_data_len;
+}
+
+/* Persistence engine descriptor for "sip"; see (un)register_ip_vs_pe() */
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+	.name =			"sip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+	.fill_param =		ip_vs_sip_fill_param,
+	.ct_match =		ip_vs_sip_ct_match,
+	.hashkey_raw =		ip_vs_sip_hashkey_raw,
+	.show_pe_data =		ip_vs_sip_show_pe_data,
+};
+
+/* Module entry point: register the SIP pe and return the result */
+static int __init ip_vs_sip_init(void)
+{
+	return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+/* Module exit point: unregister the SIP pe (result is ignored) */
+static void __exit ip_vs_sip_cleanup(void)
+{
+	unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
new file mode 100644
index 00000000..eb860285
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -0,0 +1,395 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ *	register an ipvs protocol
+ *
+ *	Pushes @pp onto the head of its hash chain in ip_vs_proto_table
+ *	and calls its init hook when one is set.  Always returns 0.
+ */
+static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	struct ip_vs_protocol **bucket =
+		&ip_vs_proto_table[IP_VS_PROTO_HASH(pp->protocol)];
+
+	pp->next = *bucket;
+	*bucket = pp;
+
+	if (pp->init)
+		pp->init(pp);
+
+	return 0;
+}
+
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) || \
+    defined(CONFIG_IP_VS_PROTO_SCTP) || defined(CONFIG_IP_VS_PROTO_AH) || \
+    defined(CONFIG_IP_VS_PROTO_ESP)
+/*
+ *	register an ipvs protocol's netns related data
+ *
+ *	Allocates a zeroed ip_vs_proto_data for @pp, links it at the head
+ *	of its hash chain in the per-netns proto_data_table and calls the
+ *	protocol's init_netns hook when one is set.
+ *
+ *	Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+	struct ip_vs_proto_data *pd =
+			kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+
+	if (!pd) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+	pd->pp = pp;	/* For speed issues */
+	pd->next = ipvs->proto_data_table[hash];
+	ipvs->proto_data_table[hash] = pd;
+	atomic_set(&pd->appcnt, 0);	/* Init app counter */
+
+	/* any result of the init_netns hook is not looked at here */
+	if (pp->init_netns != NULL)
+		pp->init_netns(net, pd);
+
+	return 0;
+}
+#endif
+
+/*
+ *	unregister an ipvs protocol
+ *
+ *	Unlinks @pp from its hash chain and calls its exit hook when one
+ *	is set.  Returns 0 on success, -ESRCH when @pp is not in the table.
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	struct ip_vs_protocol **link =
+		&ip_vs_proto_table[IP_VS_PROTO_HASH(pp->protocol)];
+
+	/* walk the chain until the link that points at pp */
+	while (*link && *link != pp)
+		link = &(*link)->next;
+
+	if (!*link)
+		return -ESRCH;
+
+	*link = pp->next;
+	if (pp->exit)
+		pp->exit(pp);
+
+	return 0;
+}
+
+/*
+ *	unregister an ipvs protocol's netns data
+ *
+ *	Unlinks @pd from the per-netns hash chain, calls the protocol's
+ *	exit_netns hook when one is set and frees @pd.  Returns 0 on
+ *	success, -ESRCH when @pd is not in the table.
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data **link =
+		&ipvs->proto_data_table[IP_VS_PROTO_HASH(pd->pp->protocol)];
+
+	/* walk the chain until the link that points at pd */
+	while (*link && *link != pd)
+		link = &(*link)->next;
+
+	if (!*link)
+		return -ESRCH;
+
+	*link = pd->next;
+	if (pd->pp->exit_netns)
+		pd->pp->exit_netns(net, pd);
+	kfree(pd);
+
+	return 0;
+}
+
+/*
+ *	get ip_vs_protocol object by its proto.
+ *
+ *	Returns NULL when no protocol with that number is registered.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+	struct ip_vs_protocol *pp = ip_vs_proto_table[IP_VS_PROTO_HASH(proto)];
+
+	/* follow the hash chain until a match or its end */
+	while (pp && pp->protocol != proto)
+		pp = pp->next;
+
+	return pp;
+}
+EXPORT_SYMBOL(ip_vs_proto_get);
+
+/*
+ *	get ip_vs_protocol object data by netns and proto
+ *
+ *	Returns NULL when @ipvs holds no data for that protocol number.
+ */
+struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+	struct ip_vs_proto_data *pd =
+		ipvs->proto_data_table[IP_VS_PROTO_HASH(proto)];
+
+	/* follow the hash chain until a match or its end */
+	while (pd && pd->pp->protocol != proto)
+		pd = pd->next;
+
+	return pd;
+}
+
+/* Same as __ipvs_proto_data_get(), starting from a struct net */
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+	return __ipvs_proto_data_get(net_ipvs(net), proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
+
+/*
+ *	Propagate event for state change to all protocols
+ *
+ *	Calls the timeout_change hook, where set, of every protocol that
+ *	has data registered in @ipvs.
+ */
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
+{
+	struct ip_vs_proto_data *pd;
+	int bucket;
+
+	for (bucket = 0; bucket < IP_VS_PROTO_TAB_SIZE; bucket++) {
+		pd = ipvs->proto_data_table[bucket];
+		while (pd) {
+			if (pd->pp->timeout_change)
+				pd->pp->timeout_change(pd, flags);
+			pd = pd->next;
+		}
+	}
+}
+
+
+/*
+ * Duplicate a timeout table: returns a kmemdup() copy of the first @size
+ * bytes of @table (GFP_ATOMIC), or whatever kmemdup() returns on failure.
+ */
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+	return kmemdup(table, size, GFP_ATOMIC);
+}
+
+
+/*
+ *	Set timeout value for state specified by name
+ *
+ *	@names holds @num state names, index-aligned with @table; @to is
+ *	multiplied by HZ before being stored.
+ *
+ *	Returns 0 on success, -EINVAL when @table or @name is NULL or @to
+ *	is zero, and -ENOENT when no state is called @name.
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, const char *const *names,
+			const char *name, int to)
+{
+	int idx;
+
+	if (!table || !name || !to)
+		return -EINVAL;
+
+	for (idx = 0; idx < num; idx++) {
+		if (strcmp(names[idx], name) == 0) {
+			table[idx] = to * HZ;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+
+/*
+ * Name of @state for protocol @proto, as given by the protocol's
+ * state_name hook.  Without a registered protocol or hook the result is
+ * "NONE" for IPPROTO_IP and "ERR!" for anything else.
+ */
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+	if (pp && pp->state_name)
+		return pp->state_name(state);
+
+	if (proto == IPPROTO_IP)
+		return "NONE";
+
+	return "ERR!";
+}
+
+
+/*
+ * Log a one-line description of the IPv4 packet found at @offset in
+ * @skb: source and destination address, plus the two 16-bit ports that
+ * follow the IP header unless the packet is a non-first fragment.
+ * "TRUNCATED" is printed when a header cannot be read from the skb.
+ */
+static void
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+			     const struct sk_buff *skb,
+			     int offset,
+			     const char *msg)
+{
+	char buf[128];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "TRUNCATED");
+	else if (ih->frag_off & htons(IP_OFFSET))
+		sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
+	else {
+		__be16 _ports[2], *pptr;
+
+		/* ports sit right after the IP header (ihl 32-bit words) */
+		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			sprintf(buf, "TRUNCATED %pI4->%pI4",
+				&ih->saddr, &ih->daddr);
+		else
+			sprintf(buf, "%pI4:%u->%pI4:%u",
+				&ih->saddr, ntohs(pptr[0]),
+				&ih->daddr, ntohs(pptr[1]));
+	}
+
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ * IPv6 counterpart of ip_vs_tcpudp_debug_packet_v4().  The ports are
+ * read directly behind the fixed IPv6 header; a packet whose nexthdr is
+ * IPPROTO_FRAGMENT is reported as "frag" without ports.
+ */
+static void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+			     const struct sk_buff *skb,
+			     int offset,
+			     const char *msg)
+{
+	char buf[192];
+	struct ipv6hdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "TRUNCATED");
+	else if (ih->nexthdr == IPPROTO_FRAGMENT)
+		sprintf(buf, "%pI6->%pI6 frag",	&ih->saddr, &ih->daddr);
+	else {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			sprintf(buf, "TRUNCATED %pI6->%pI6",
+				&ih->saddr, &ih->daddr);
+		else
+			sprintf(buf, "%pI6:%u->%pI6:%u",
+				&ih->saddr, ntohs(pptr[0]),
+				&ih->daddr, ntohs(pptr[1]));
+	}
+
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+#endif
+
+
+/*
+ * Log a description of the packet at @offset in @skb, dispatching on
+ * @af: AF_INET6 goes to the IPv6 helper when CONFIG_IP_VS_IPV6 is set,
+ * everything else to the IPv4 helper.
+ */
+void
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
+			  const struct sk_buff *skb,
+			  int offset,
+			  const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+	else
+#endif
+		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+/*
+ * per network name-space init
+ *
+ * Registers the per-netns data of every configured protocol.  As soon as
+ * one registration fails the remaining ones are skipped, everything
+ * registered so far is removed again and the error is returned, so the
+ * caller never ends up with a partially filled protocol table.
+ *
+ * Returns 0 on success or the negative error of the failing registration.
+ */
+int __net_init __ip_vs_protocol_init(struct net *net)
+{
+	int ret = 0;
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (!ret)
+		ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (!ret)
+		ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	if (!ret)
+		ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+	if (!ret)
+		ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+	if (!ret)
+		ret = register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+#endif
+
+	/*
+	 * Undo the registrations that succeeded before the failure.
+	 * NOTE(review): relies on __ip_vs_protocol_cleanup() being declared
+	 * in <net/ip_vs.h>, since it is defined further down -- confirm.
+	 */
+	if (ret < 0)
+		__ip_vs_protocol_cleanup(net);
+
+	return ret;
+}
+
+/*
+ * per network name-space cleanup: unregister every proto data entry that
+ * is still linked in the netns' proto_data_table.
+ */
+void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd;
+	int i;
+
+	/* unregister all the ipvs proto data for this netns */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		/* each call unlinks the chain head, so re-read it every time */
+		while ((pd = ipvs->proto_data_table[i]) != NULL)
+			unregister_ip_vs_proto_netns(net, pd);
+	}
+}
+
+/*
+ * Register every configured protocol and log the list of their names.
+ * Always returns 0.
+ */
+int __init ip_vs_protocol_init(void)
+{
+	char protocols[64];
+	/* register p and append ", <name>" to the protocols string */
+#define REGISTER_PROTOCOL(p)			\
+	do {					\
+		register_ip_vs_protocol(p);	\
+		strcat(protocols, ", ");	\
+		strcat(protocols, (p)->name);	\
+	} while (0)
+
+	protocols[0] = '\0';
+	/* keeps &protocols[2] a valid empty string if nothing is appended */
+	protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+	/* start at [2] to skip the leading ", " separator */
+	pr_info("Registered protocols (%s)\n", &protocols[2]);
+
+	return 0;
+}
+
+
+/* Unregister every protocol that is still linked in ip_vs_proto_table */
+void ip_vs_protocol_cleanup(void)
+{
+	struct ip_vs_protocol *pp;
+	int i;
+
+	/* unregister all the ipvs protocols */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		/* each call unlinks the chain head, so re-read it every time */
+		while ((pp = ip_vs_proto_table[i]) != NULL)
+			unregister_ip_vs_protocol(pp);
+	}
+}
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 00000000..5b8eb8b1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,166 @@
+/*
+ * ip_vs_proto_ah_esp.c:	AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
+ *		Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+	__u8		icookie[8];
+	__u8		rcookie[8];
+	__u8		np;
+	__u8		version;
+	__u8		xchgtype;
+	__u8		flags;
+	__u32		msgid;
+	__u32		length;
+};
+
+*/
+
+#define PORT_ISAKMP	500
+
+/*
+ * Fill @p with the parameters of the UDP port 500 (ISAKMP) connection
+ * between the two addresses of @iph; source and destination are swapped
+ * when @inverse is set.
+ */
+static void
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+			     const struct ip_vs_iphdr *iph, int inverse,
+			     struct ip_vs_conn_param *p)
+{
+	if (unlikely(inverse)) {
+		ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+				      &iph->daddr, htons(PORT_ISAKMP),
+				      &iph->saddr, htons(PORT_ISAKMP), p);
+		return;
+	}
+
+	ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+			      &iph->saddr, htons(PORT_ISAKMP),
+			      &iph->daddr, htons(PORT_ISAKMP), p);
+}
+
+/*
+ * Find the connection an AH/ESP packet belongs to via ip_vs_conn_in_get(),
+ * using the ISAKMP (UDP port 500) parameters built from @iph.
+ * @proto_off is not used.  Returns NULL when no such connection exists.
+ */
+static struct ip_vs_conn *
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
+		   const struct ip_vs_iphdr *iph, unsigned int proto_off,
+		   int inverse)
+{
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
+	struct net *net = skb_net(skb);
+
+	ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+	cp = ip_vs_conn_in_get(&p);
+	if (!cp) {
+		/*
+		 * We are not sure if the packet is from our
+		 * service, so our conn_schedule hook should return NF_ACCEPT
+		 */
+		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+			      "%s%s %s->%s\n",
+			      inverse ? "ICMP+" : "",
+			      ip_vs_proto_get(iph->protocol)->name,
+			      IP_VS_DBG_ADDR(af, &iph->saddr),
+			      IP_VS_DBG_ADDR(af, &iph->daddr));
+	}
+
+	return cp;
+}
+
+
+/*
+ * Same lookup as ah_esp_conn_in_get(), but through ip_vs_conn_out_get().
+ * @proto_off is not used.  Returns NULL when no such connection exists.
+ */
+static struct ip_vs_conn *
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+		    const struct ip_vs_iphdr *iph,
+		    unsigned int proto_off,
+		    int inverse)
+{
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
+	struct net *net = skb_net(skb);
+
+	ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+	cp = ip_vs_conn_out_get(&p);
+	if (!cp) {
+		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+			      "%s%s %s->%s\n",
+			      inverse ? "ICMP+" : "",
+			      ip_vs_proto_get(iph->protocol)->name,
+			      IP_VS_DBG_ADDR(af, &iph->saddr),
+			      IP_VS_DBG_ADDR(af, &iph->daddr));
+	}
+
+	return cp;
+}
+
+
+/*
+ * conn_schedule hook for AH and ESP: never creates a connection.  It
+ * sets *@verdict to NF_ACCEPT and returns 0; @cpp is left untouched.
+ */
+static int
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		     int *verdict, struct ip_vs_conn **cpp)
+{
+	/*
+	 * AH/ESP is only related traffic. Pass the packet to IP stack.
+	 */
+	*verdict = NF_ACCEPT;
+	return 0;
+}
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+/*
+ * Protocol descriptor for AH.  Only the scheduling, connection lookup
+ * and debug hooks are provided; every other hook is left NULL.
+ */
+struct ip_vs_protocol ip_vs_protocol_ah = {
+	.name =			"AH",
+	.protocol =		IPPROTO_AH,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			NULL,
+	.exit =			NULL,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+/*
+ * Protocol descriptor for ESP.  Only the scheduling, connection lookup
+ * and debug hooks are provided; every other hook is left NULL.
+ */
+struct ip_vs_protocol ip_vs_protocol_esp = {
+	.name =			"ESP",
+	.protocol =		IPPROTO_ESP,
+	.num_states =		1,
+	.dont_defrag =		1,
+	.init =			NULL,
+	.exit =			NULL,
+	.conn_schedule =	ah_esp_conn_schedule,
+	.conn_in_get =		ah_esp_conn_in_get,
+	.conn_out_get =		ah_esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+};
+#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
new file mode 100644
index 00000000..d12ed53e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -0,0 +1,1136 @@
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/sctp/checksum.h>
+#include <net/ip_vs.h>
+
+/*
+ * Schedule a connection for an INIT chunk aimed at a virtual service.
+ * Returns 0 with *verdict always set, or 1 (then *cpp may hold a new entry).
+ */
+static int
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		   int *verdict, struct ip_vs_conn **cpp)
+{
+	struct net *net;
+	struct ip_vs_service *svc;
+	sctp_chunkhdr_t _schunkh, *sch;
+	sctp_sctphdr_t *sh, _sctph;
+	struct ip_vs_iphdr iph;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+	sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
+	if (sh == NULL) {
+		/* Truncated packet: a 0 return must come with a verdict. */
+		*verdict = NF_DROP;
+		return 0;
+	}
+	sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t),
+				 sizeof(_schunkh), &_schunkh);
+	if (sch == NULL) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+	net = skb_net(skb);
+	if ((sch->type == SCTP_CID_INIT) &&
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+				     &iph.daddr, sh->dest))) {
+		int ignored;
+
+		if (ip_vs_todrop(net_ipvs(net))) {
+			/* We seem to be very loaded: drop this packet. */
+			ip_vs_service_put(svc);
+			*verdict = NF_DROP;
+			return 0;
+		}
+		/* Pick a real server and create the connection entry. */
+		*cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pd);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
+			return 0;
+		}
+		ip_vs_service_put(svc);
+	}
+	/* NF_ACCEPT */
+	return 1;
+}
+
+/* SNAT: put cp->vport in the source port field, then redo the checksum. */
+static int
+sctp_snat_handler(struct sk_buff *skb,
+		  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct sk_buff *frag;
+	sctp_sctphdr_t *hdr;
+	unsigned int off;
+	__be32 crc;
+
+	/* The SCTP header follows a fixed IPv6 header or the IPv4 header. */
+#ifdef CONFIG_IP_VS_IPV6
+	off = (cp->af == AF_INET6) ? sizeof(struct ipv6hdr) : ip_hdrlen(skb);
+#else
+	off = ip_hdrlen(skb);
+#endif
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, off + sizeof(*hdr)))
+		return 0;
+
+	/* Connections bound to an application helper get their checksum
+	 * verified (when the protocol has a checker) before the helper runs.
+	 */
+	if (unlikely(cp->app != NULL)) {
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+		if (!ip_vs_app_pkt_out(cp, skb))
+			return 0;
+	}
+
+	hdr = (void *) skb_network_header(skb) + off;
+	hdr->source = cp->vport;
+
+	/* Checksum the linear part first, then every skb of the frag list. */
+	crc = sctp_start_cksum((u8 *) hdr, skb_headlen(skb) - off);
+	skb_walk_frags(skb, frag)
+		crc = sctp_update_cksum((u8 *) frag->data, skb_headlen(frag),
+					crc);
+	hdr->checksum = sctp_end_cksum(crc);
+
+	return 1;
+}
+
+/* DNAT: put cp->dport in the destination port field, redo the checksum. */
+static int
+sctp_dnat_handler(struct sk_buff *skb,
+		  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct sk_buff *frag;
+	sctp_sctphdr_t *hdr;
+	unsigned int off;
+	__be32 crc;
+
+	/* The SCTP header follows a fixed IPv6 header or the IPv4 header. */
+#ifdef CONFIG_IP_VS_IPV6
+	off = (cp->af == AF_INET6) ? sizeof(struct ipv6hdr) : ip_hdrlen(skb);
+#else
+	off = ip_hdrlen(skb);
+#endif
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, off + sizeof(*hdr)))
+		return 0;
+
+	/* Connections bound to an application helper get their checksum
+	 * verified (when the protocol has a checker) before the helper runs.
+	 */
+	if (unlikely(cp->app != NULL)) {
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+		if (!ip_vs_app_pkt_in(cp, skb))
+			return 0;
+	}
+
+	hdr = (void *) skb_network_header(skb) + off;
+	hdr->dest = cp->dport;
+
+	/* Checksum the linear part first, then every skb of the frag list. */
+	crc = sctp_start_cksum((u8 *) hdr, skb_headlen(skb) - off);
+	skb_walk_frags(skb, frag)
+		crc = sctp_update_cksum((u8 *) frag->data, skb_headlen(frag),
+					crc);
+	hdr->checksum = sctp_end_cksum(crc);
+
+	return 1;
+}
+
+/* Verify the SCTP checksum of @skb: returns 1 if it matches, else 0. */
+static int
+sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	unsigned int sctphoff;
+	struct sctphdr *sh, _sctph;
+	struct sk_buff *iter;
+	__le32 cmp, val;
+	__u32 tmp;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		sctphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		sctphoff = ip_hdrlen(skb);
+
+	sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph);
+	/* The sum is computed in place: the header must be in linear data */
+	if (sh == NULL || skb_headlen(skb) < sctphoff + sizeof(_sctph))
+		return 0;
+	cmp = sh->checksum;
+
+	/* sh already points sctphoff bytes in, so only the rest is left */
+	tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb) - sctphoff);
+	skb_walk_frags(skb, iter)
+		tmp = sctp_update_cksum((__u8 *) iter->data,
+					skb_headlen(iter), tmp);
+	val = sctp_end_cksum(tmp);
+
+	if (val != cmp) {
+		/* CRC failure, dump it. */
+		IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+				"Failed checksum for");
+		return 0;
+	}
+	return 1;
+}
+
+struct ipvs_sctp_nextstate {	/* one cell of sctp_states_table */
+	int next_state;		/* an IP_VS_SCTP_S_* value */
+};
+enum ipvs_sctp_event_t {	/* second index of sctp_states_table */
+	IP_VS_SCTP_EVE_DATA_CLI,	/* each _CLI event is followed by its */
+	IP_VS_SCTP_EVE_DATA_SER,	/* _SER twin, so _SER == _CLI + 1 */
+	IP_VS_SCTP_EVE_INIT_CLI,
+	IP_VS_SCTP_EVE_INIT_SER,
+	IP_VS_SCTP_EVE_INIT_ACK_CLI,
+	IP_VS_SCTP_EVE_INIT_ACK_SER,
+	IP_VS_SCTP_EVE_COOKIE_ECHO_CLI,
+	IP_VS_SCTP_EVE_COOKIE_ECHO_SER,
+	IP_VS_SCTP_EVE_COOKIE_ACK_CLI,
+	IP_VS_SCTP_EVE_COOKIE_ACK_SER,
+	IP_VS_SCTP_EVE_ABORT_CLI,
+	IP_VS_SCTP_EVE__ABORT_SER,	/* (sic) double underscore */
+	IP_VS_SCTP_EVE_SHUT_CLI,
+	IP_VS_SCTP_EVE_SHUT_SER,
+	IP_VS_SCTP_EVE_SHUT_ACK_CLI,
+	IP_VS_SCTP_EVE_SHUT_ACK_SER,
+	IP_VS_SCTP_EVE_SHUT_COM_CLI,	/* SHUTDOWN COMPLETE */
+	IP_VS_SCTP_EVE_SHUT_COM_SER,
+	IP_VS_SCTP_EVE_LAST		/* number of events, not an event */
+};
+
+/*
+ * Chunk type -> client-side event (the _CLI flavour only). Chunk types
+ * that are not listed get entry 0, IP_VS_SCTP_EVE_DATA_CLI. The chunk type
+ * field is 8 bits wide, so 256 slots are needed for chunk type 255 to stay
+ * within the array.
+ * NOTE(review): the lookup is outside this view; confirm it indexes by type.
+ */
+static enum ipvs_sctp_event_t sctp_events[256] = {
+	[SCTP_CID_INIT]			= IP_VS_SCTP_EVE_INIT_CLI,
+	[SCTP_CID_INIT_ACK]		= IP_VS_SCTP_EVE_INIT_ACK_CLI,
+	[SCTP_CID_ABORT]		= IP_VS_SCTP_EVE_ABORT_CLI,
+	[SCTP_CID_SHUTDOWN]		= IP_VS_SCTP_EVE_SHUT_CLI,
+	[SCTP_CID_SHUTDOWN_ACK]		= IP_VS_SCTP_EVE_SHUT_ACK_CLI,
+	[SCTP_CID_COOKIE_ECHO]		= IP_VS_SCTP_EVE_COOKIE_ECHO_CLI,
+	[SCTP_CID_COOKIE_ACK]		= IP_VS_SCTP_EVE_COOKIE_ACK_CLI,
+	[SCTP_CID_SHUTDOWN_COMPLETE]	= IP_VS_SCTP_EVE_SHUT_COM_CLI,
+};
+
+static struct ipvs_sctp_nextstate	/* [current state][event] -> next state */
+ sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = {
+	/*
+	 * STATE : IP_VS_SCTP_S_NONE
+	 */
+	/* next state */ /* event */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ },
+	 },
+	/*
+	 * STATE : IP_VS_SCTP_S_INIT_CLI
+	 * Client sent INIT and is waiting for reply from server (in ECHO_WAIT)
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_INIT_SER
+	 * Server sent INIT and is waiting for INIT ACK from the client
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_INIT_ACK_CLI
+	 * Client sent INIT ACK and is waiting for ECHO from the server
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK has been resent by the client, let us stay in
+	  * the same state
+	  */
+	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 /*
+	  * INIT_ACK sent by the server, close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * ECHO by client, it should not happen, close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 /*
+	  * ECHO by server, this is what we are expecting, move to ECHO_SER
+	  */
+	 {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, it should not happen, close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 /*
+	  * Unexpected COOKIE ACK from server, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_INIT_ACK_SER
+	 * Server sent INIT ACK and is waiting for ECHO from the client
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * Unexpected INIT_ACK by the client, let us close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 /*
+	  * INIT_ACK resent by the server, let us stay in the same state
+	  */
+	 {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent the ECHO, this is what we are expecting,
+	  * move to ECHO_CLI
+	  */
+	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 /*
+	  * ECHO received from the server, Not sure what to do,
+	  * let us close it
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, let us stay in the same state
+	  */
+	 {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 /*
+	  * COOKIE ACK from server, hmm... this should not happen, let's close
+	  * the connection.
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_ECHO_CLI
+	 * Client sent ECHO and is waiting for COOKIE ACK from the server
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK has been sent by the client, let us close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 /*
+	  * INIT_ACK sent by the server, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client resent the ECHO, let us stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 /*
+	  * ECHO received from the server, Not sure what to do,
+	  * let us close it
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, this should not happen, let's close the
+	  * connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 /*
+	  * COOKIE ACK from server, this is what we are awaiting, let's move
+	  * to ESTABLISHED.
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_ECHO_SER
+	 * Server sent ECHO and is waiting for COOKIE ACK from the client
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK sent by the client, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 /*
+	  * INIT_ACK has been sent by the server, let us close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent the ECHO, not sure what to do, let's close the
+	  * connection.
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 /*
+	  * ECHO resent by the server, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, this is what we are expecting, let's move
+	  * to ESTABLISHED.
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 /*
+	  * COOKIE ACK from server, this should not happen, let's close the
+	  * connection.
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_ESTABLISHED
+	 * Association established
+	 */
+	{{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK from either side, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
+	  * peer and peer shall move to the ESTABLISHED. If it doesn't handle
+	  * it will send ERROR chunk. So, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, not sure what to do, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 /*
+	  * SHUTDOWN from the client, move to SHUTDOWN_CLI
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 /*
+	  * SHUTDOWN from the server, move to SHUTDOWN_SER
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 /*
+	  * client sent SHUTDOWN_ACK, this should not happen, let's close
+	  * the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_SHUT_CLI
+	 * SHUTDOWN sent from the client, waiting for SHUT ACK from the server
+	 */
+	/*
+	 * We received the data chunk, keep the state unchanged. I assume
+	 * that data chunks can still be received by both the peers in
+	 * SHUTDOWN state
+	 */
+
+	{{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK from either side, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
+	  * peer and peer shall move to the ESTABLISHED. If it doesn't handle
+	  * it will send ERROR chunk. Either way, move back to ESTABLISHED
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, not sure what to do, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 /*
+	  * SHUTDOWN resent from the client, stay in SHUTDOWN_CLI
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 /*
+	  * SHUTDOWN from the server, move to SHUTDOWN_SER
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 /*
+	  * client sent SHUTDOWN_ACK, this should not happen, let's close
+	  * the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 /*
+	  * Server sent SHUTDOWN ACK, this is what we are expecting, let's move
+	  * to SHUTDOWN_ACK_SER
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 /*
+	  * SHUTDOWN COM from client, this should not happen, let's close the
+	  * connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_SHUT_SER
+	 * SHUTDOWN sent from the server, waiting for SHUTDOWN ACK from client
+	 */
+	/*
+	 * We received the data chunk, keep the state unchanged. I assume
+	 * that data chunks can still be received by both the peers in
+	 * SHUTDOWN state
+	 */
+
+	{{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK from either side, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
+	  * peer and peer shall move to the ESTABLISHED. If it doesn't handle
+	  * it will send ERROR chunk. Either way, move back to ESTABLISHED
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, not sure what to do, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 /*
+	  * SHUTDOWN sent from the client, move to SHUTDOWN_CLI
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 /*
+	  * SHUTDOWN resent from the server, stay in SHUTDOWN_SER
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 /*
+	  * client sent SHUTDOWN_ACK, this is what we are expecting, let's
+	  * move to SHUT_ACK_CLI
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 /*
+	  * Server sent SHUTDOWN ACK, this should not happen, let's close the
+	  * connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 /*
+	  * SHUTDOWN COM from client, this should not happen, let's close the
+	  * connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+
+	/*
+	 * State : IP_VS_SCTP_S_SHUT_ACK_CLI
+	 * SHUTDOWN ACK from the client, awaiting SHUTDOWN COM from server
+	 */
+	/*
+	 * We received the data chunk, keep the state unchanged. I assume
+	 * that data chunks can still be received by both the peers in
+	 * SHUTDOWN state
+	 */
+
+	{{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK from either side, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
+	  * peer and peer shall move to the ESTABLISHED. If it doesn't handle
+	  * it will send ERROR chunk. Either way, move back to ESTABLISHED
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, not sure what to do, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 /*
+	  * SHUTDOWN sent from the client, move to SHUTDOWN_CLI
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 /*
+	  * SHUTDOWN sent from the server, move to SHUTDOWN_SER
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 /*
+	  * client resent SHUTDOWN_ACK, let's stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 /*
+	  * Server sent SHUTDOWN ACK, this should not happen, let's close the
+	  * connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 /*
+	  * SHUTDOWN COM from client, this should not happen, let's close the
+	  * connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 /*
+	  * SHUTDOWN COMPLETE from server, this is what we are expecting.
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+
+	/*
+	 * State : IP_VS_SCTP_S_SHUT_ACK_SER
+	 * SHUTDOWN ACK from the server, awaiting SHUTDOWN COM from client
+	 */
+	/*
+	 * We received the data chunk, keep the state unchanged. I assume
+	 * that data chunks can still be received by both the peers in
+	 * SHUTDOWN state
+	 */
+
+	{{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 /*
+	  * We have got an INIT from client. From the spec: "Upon receipt of
+	  * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with
+	  * an INIT ACK using the same parameters it sent in its original
+	  * INIT chunk (including its Initiate Tag, unchanged)".
+	  */
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 /*
+	  * INIT_ACK from either side, Unexpected INIT ACK, spec says,
+	  * "If an INIT ACK is received by an endpoint in any state other
+	  * than the COOKIE-WAIT state, the endpoint should discard the
+	  * INIT ACK chunk". Stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 /*
+	  * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the
+	  * peer and peer shall move to the ESTABLISHED. If it doesn't handle
+	  * it will send ERROR chunk. Either way, move back to ESTABLISHED
+	  */
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 /*
+	  * COOKIE ACK from client, not sure what to do, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 /*
+	  * SHUTDOWN sent from the client, move to SHUTDOWN_CLI
+	  */
+	 {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 /*
+	  * SHUTDOWN sent from the server, move to SHUTDOWN_SER
+	  */
+	 {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 /*
+	  * client sent SHUTDOWN_ACK, this should not happen, let's close
+	  * the connection.
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 /*
+	  * Server resent SHUTDOWN ACK, stay in the same state
+	  */
+	 {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 /*
+	  * SHUTDOWN COM from client, this is what we are expecting, let's
+	  * close the connection
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 /*
+	  * SHUTDOWN COMPLETE from server, this should not happen.
+	  */
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 },
+	/*
+	 * State : IP_VS_SCTP_S_CLOSED
+	 */
+	{{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ },
+	 {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ },
+	 {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ },
+	 {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }
+	 }
+};
+
+/*
+ *      Timeout table[state]
+ */
+/*
+ * Built-in default timeouts, indexed by IP_VS_SCTP_S_* state and given as
+ * multiples of HZ.  __ip_vs_sctp_init() hands this table to
+ * ip_vs_create_timeout_table() to build the per-netns table, and
+ * set_sctp_state() reads it directly when no protocol data is available.
+ */
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+	[IP_VS_SCTP_S_NONE]         =     2 * HZ,
+	[IP_VS_SCTP_S_INIT_CLI]     =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_INIT_SER]     =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_INIT_ACK_CLI] =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_INIT_ACK_SER] =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_ECHO_CLI]     =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_ECHO_SER]     =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_ESTABLISHED]  =    15 * 60 * HZ,
+	[IP_VS_SCTP_S_SHUT_CLI]     =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_SHUT_SER]     =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_SHUT_ACK_CLI] =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_SHUT_ACK_SER] =     1 * 60 * HZ,
+	[IP_VS_SCTP_S_CLOSED]       =    10 * HZ,
+	[IP_VS_SCTP_S_LAST]         =     2 * HZ,
+};
+
+/*
+ * Printable names for the IP_VS_SCTP_S_* states, looked up by
+ * sctp_state_name().  The pointers themselves are constant as well, which
+ * matches tcp_state_name_table.
+ */
+static const char *const sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = {
+	[IP_VS_SCTP_S_NONE]         =    "NONE",
+	[IP_VS_SCTP_S_INIT_CLI]     =    "INIT_CLI",
+	[IP_VS_SCTP_S_INIT_SER]     =    "INIT_SER",
+	[IP_VS_SCTP_S_INIT_ACK_CLI] =    "INIT_ACK_CLI",
+	[IP_VS_SCTP_S_INIT_ACK_SER] =    "INIT_ACK_SER",
+	[IP_VS_SCTP_S_ECHO_CLI]     =    "COOKIE_ECHO_CLI",
+	[IP_VS_SCTP_S_ECHO_SER]     =    "COOKIE_ECHO_SER",
+	[IP_VS_SCTP_S_ESTABLISHED]  =    "ESTABLISHED",
+	[IP_VS_SCTP_S_SHUT_CLI]     =    "SHUTDOWN_CLI",
+	[IP_VS_SCTP_S_SHUT_SER]     =    "SHUTDOWN_SER",
+	[IP_VS_SCTP_S_SHUT_ACK_CLI] =    "SHUTDOWN_ACK_CLI",
+	[IP_VS_SCTP_S_SHUT_ACK_SER] =    "SHUTDOWN_ACK_SER",
+	[IP_VS_SCTP_S_CLOSED]       =    "CLOSED",
+	[IP_VS_SCTP_S_LAST]         =    "BUG!"
+};
+
+
+/*
+ * Return the printable name of an SCTP connection state.
+ *
+ * States at or beyond IP_VS_SCTP_S_LAST yield "ERR!"; a state without an
+ * entry in sctp_state_name_table yields "?".
+ */
+static const char *sctp_state_name(int state)
+{
+	const char *name;
+
+	if (state >= IP_VS_SCTP_S_LAST)
+		return "ERR!";
+
+	name = sctp_state_name_table[state];
+	return name ? name : "?";
+}
+
+/*
+ * Advance the IPVS state of connection @cp for one SCTP packet.
+ *
+ * The event is taken from the type of the first chunk, or from an ABORT
+ * chunk bundled directly behind a leading COOKIE ECHO / COOKIE ACK, and is
+ * shifted to the "server" variant when @direction is IP_VS_DIR_OUTPUT.
+ * The next state is looked up in sctp_states_table.  When the state
+ * changes, the destination's active/inactive connection counters are
+ * moved according to whether the new state is ESTABLISHED.  Finally the
+ * connection state and timeout are updated.
+ *
+ * Returns 0 if the first chunk header cannot be read from @skb,
+ * 1 otherwise.  sctp_state_transition() calls this with cp->lock held.
+ */
+static inline int
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+		int direction, const struct sk_buff *skb)
+{
+	sctp_chunkhdr_t _sctpch, *sch;
+	unsigned char chunk_type;
+	int event, next_state;
+	int ihl, cofs;
+
+#ifdef CONFIG_IP_VS_IPV6
+	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+	ihl = ip_hdrlen(skb);
+#endif
+
+	/* Offset of the first chunk: right behind the SCTP common header */
+	cofs = ihl + sizeof(sctp_sctphdr_t);
+	sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
+	if (sch == NULL)
+		return 0;
+
+	chunk_type = sch->type;
+	/*
+	 * Section 3: Multiple chunks can be bundled into one SCTP packet
+	 * up to the MTU size, except for the INIT, INIT ACK, and
+	 * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with
+	 * any other chunk in a packet.
+	 *
+	 * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control
+	 * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be
+	 * bundled with an ABORT, but they MUST be placed before the ABORT
+	 * in the SCTP packet or they will be ignored by the receiver.
+	 */
+	if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
+	    (sch->type == SCTP_CID_COOKIE_ACK)) {
+		/*
+		 * The chunk length is carried in network byte order and
+		 * does not include the padding that aligns the next chunk
+		 * to a 4-byte boundary.
+		 */
+		unsigned int clen = ntohs(sch->length);
+
+		/* A length shorter than the chunk header itself is bogus */
+		if (clen >= sizeof(_sctpch)) {
+			sch = skb_header_pointer(skb,
+						 cofs + ((clen + 3) & ~3U),
+						 sizeof(_sctpch), &_sctpch);
+			if (sch && sch->type == SCTP_CID_ABORT)
+				chunk_type = sch->type;
+		}
+	}
+
+	event = sctp_events[chunk_type];
+
+	/*
+	 *  If the direction is IP_VS_DIR_OUTPUT, this event is from server
+	 */
+	if (direction == IP_VS_DIR_OUTPUT)
+		event++;
+	/*
+	 * get next state
+	 */
+	next_state = sctp_states_table[cp->state][event].next_state;
+
+	if (next_state != cp->state) {
+		struct ip_vs_dest *dest = cp->dest;
+
+		IP_VS_DBG_BUF(8, "%s %s  %s:%d->"
+				"%s:%d state: %s->%s conn->refcnt:%d\n",
+				pd->pp->name,
+				((direction == IP_VS_DIR_OUTPUT) ?
+				 "output " : "input "),
+				IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+				ntohs(cp->dport),
+				IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				ntohs(cp->cport),
+				sctp_state_name(cp->state),
+				sctp_state_name(next_state),
+				atomic_read(&cp->refcnt));
+		if (dest) {
+			/* Keep the per-destination counters in step */
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				(next_state != IP_VS_SCTP_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				   (next_state == IP_VS_SCTP_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+	}
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = next_state];
+	else	/* What to do ? */
+		cp->timeout = sctp_timeouts[cp->state = next_state];
+
+	return 1;
+}
+
+/*
+ * state_transition callback for SCTP: run set_sctp_state() with the
+ * connection lock held and pass its result back to the caller.
+ */
+static int
+sctp_state_transition(struct ip_vs_conn *cp, int direction,
+		const struct sk_buff *skb, struct ip_vs_proto_data *pd)
+{
+	int handled;
+
+	spin_lock(&cp->lock);
+	handled = set_sctp_state(pd, cp, direction, skb);
+	spin_unlock(&cp->lock);
+
+	return handled;
+}
+
+/*
+ * Hash a port (kept in its stored byte order) into an index of the SCTP
+ * application table.
+ */
+static inline __u16 sctp_app_hashkey(__be16 port)
+{
+	const u16 raw = (__force u16)port;
+
+	return ((raw >> SCTP_APP_TAB_BITS) ^ raw) & SCTP_APP_TAB_MASK;
+}
+
+/*
+ * Insert application incarnation @inc into the per-netns SCTP application
+ * hash table and bump the protocol's application counter.
+ *
+ * Returns 0 on success, -EEXIST if an incarnation with the same port is
+ * already present.
+ */
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+	const __be16 port = inc->port;
+	struct list_head *bucket = &ipvs->sctp_apps[sctp_app_hashkey(port)];
+	struct ip_vs_app *cur;
+	int ret = 0;
+
+	spin_lock_bh(&ipvs->sctp_app_lock);
+	list_for_each_entry(cur, bucket, p_list) {
+		if (cur->port == port) {
+			ret = -EEXIST;
+			break;
+		}
+	}
+	if (!ret) {
+		list_add(&inc->p_list, bucket);
+		atomic_inc(&pd->appcnt);
+	}
+	spin_unlock_bh(&ipvs->sctp_app_lock);
+
+	return ret;
+}
+
+/*
+ * Remove application incarnation @inc from the SCTP application table and
+ * drop the protocol's application counter, both under sctp_app_lock.
+ */
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	spin_lock_bh(&ipvs->sctp_app_lock);
+
+	atomic_dec(&pd->appcnt);
+	list_del(&inc->p_list);
+
+	spin_unlock_bh(&ipvs->sctp_app_lock);
+}
+
+/*
+ * Bind connection @cp to the application incarnation registered for its
+ * virtual port, if there is one.  Only NAT (masquerading) connections are
+ * bound.
+ *
+ * Returns the result of the incarnation's init_conn() hook when it has
+ * one, otherwise 0 (also when nothing was bound).
+ */
+static int sctp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	struct ip_vs_app *inc, *app = NULL;
+	int hash;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = sctp_app_hashkey(cp->vport);
+
+	spin_lock(&ipvs->sctp_app_lock);
+	list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
+		if (inc->port != cp->vport)
+			continue;
+		/* Stop at the first port match, even if taking it fails */
+		if (likely(ip_vs_app_inc_get(inc)))
+			app = inc;
+		break;
+	}
+	spin_unlock(&ipvs->sctp_app_lock);
+
+	if (!app)
+		return 0;
+
+	IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+			"%s:%u to app %s on port %u\n",
+			__func__,
+			IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+			ntohs(cp->cport),
+			IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+			ntohs(cp->vport),
+			app->name, ntohs(app->port));
+
+	cp->app = app;
+	if (!app->init_conn)
+		return 0;
+
+	return app->init_conn(app, cp);
+}
+
+/* ---------------------------------------------
+ *   timeouts is netns related now.
+ * ---------------------------------------------
+ */
+/*
+ * Per-netns SCTP setup: initialise the application hash table and its
+ * lock, and create this namespace's timeout table from sctp_timeouts.
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs;
+
+	ipvs = net_ipvs(net);
+	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->sctp_app_lock);
+
+	pd->timeout_table =
+		ip_vs_create_timeout_table((int *)sctp_timeouts,
+					   sizeof(sctp_timeouts));
+}
+
+/*
+ * Per-netns SCTP teardown: free the timeout table that
+ * __ip_vs_sctp_init() stored in @pd.  @net is unused.
+ */
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+	int *table = pd->timeout_table;
+
+	kfree(table);
+}
+
+/*
+ * SCTP protocol descriptor: metadata and callbacks for IPPROTO_SCTP.
+ * There are no global init/exit hooks and no timeout_change hook; the
+ * per-netns hooks set up and free the timeout table.  Packet debugging
+ * uses the shared ip_vs_tcpudp_debug_packet helper.
+ */
+struct ip_vs_protocol ip_vs_protocol_sctp = {
+	.name		= "SCTP",
+	.protocol	= IPPROTO_SCTP,
+	.num_states	= IP_VS_SCTP_S_LAST,
+	.dont_defrag	= 0,
+	.init		= NULL,
+	.exit		= NULL,
+	.init_netns	= __ip_vs_sctp_init,
+	.exit_netns	= __ip_vs_sctp_exit,
+	.register_app	= sctp_register_app,
+	.unregister_app = sctp_unregister_app,
+	.conn_schedule	= sctp_conn_schedule,
+	.conn_in_get	= ip_vs_conn_in_get_proto,
+	.conn_out_get	= ip_vs_conn_out_get_proto,
+	.snat_handler	= sctp_snat_handler,
+	.dnat_handler	= sctp_dnat_handler,
+	.csum_check	= sctp_csum_check,
+	.state_name	= sctp_state_name,
+	.state_transition = sctp_state_transition,
+	.app_conn_bind	= sctp_app_conn_bind,
+	.debug_packet	= ip_vs_tcpudp_debug_packet,
+	.timeout_change	= NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 00000000..c0cc341b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,721 @@
+/*
+ * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
+ *              tcp_timeouts table has copy per netns in a hash table per
+ *              protocol ip_vs_proto_data and is handled by netns
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Try to schedule a new connection for a TCP packet.
+ *
+ * Only packets with SYN set that match a virtual service are scheduled.
+ * Returns 1 when processing should continue (with *cpp set if a
+ * connection was created) and 0 when *verdict holds the final netfilter
+ * verdict for the packet.
+ */
+static int
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	struct ip_vs_iphdr iph;
+	struct tcphdr _tcph, *th;
+	struct ip_vs_service *svc;
+	struct net *net;
+	int ignored;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
+	if (!th) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+	net = skb_net(skb);
+
+	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+	if (!th->syn)
+		return 1;	/* NF_ACCEPT */
+
+	svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+				&iph.daddr, th->dest);
+	if (!svc)
+		return 1;	/* NF_ACCEPT */
+
+	if (ip_vs_todrop(net_ipvs(net))) {
+		/*
+		 * It seems that we are very loaded.
+		 * We have to drop this packet :(
+		 */
+		ip_vs_service_put(svc);
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	/*
+	 * Let the virtual server select a real server for the
+	 * incoming connection, and create a connection entry.
+	 */
+	*cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+	if (*cpp || ignored > 0) {
+		ip_vs_service_put(svc);
+		return 1;	/* NF_ACCEPT */
+	}
+
+	if (ignored == 0) {
+		/* No service reference is dropped here on this path */
+		*verdict = ip_vs_leave(svc, skb, pd);
+	} else {
+		ip_vs_service_put(svc);
+		*verdict = NF_DROP;
+	}
+	return 0;
+}
+
+
+/*
+ * Incrementally update a complete TCP checksum after only an address and
+ * a port were replaced (old -> new), without touching the payload.
+ */
+static inline void
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldport, __be16 newport)
+{
+	/* Account for the port change first, on the inverted checksum */
+	__wsum sum = ip_vs_check_diff2(oldport, newport,
+				       ~csum_unfold(tcph->check));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		tcph->check = csum_fold(ip_vs_check_diff16(oldip->ip6,
+							   newip->ip6, sum));
+		return;
+	}
+#endif
+	tcph->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, sum));
+}
+
+
+/*
+ * Incrementally update the checksum field of a CHECKSUM_PARTIAL packet
+ * after an address and the TCP length were replaced (old -> new).
+ */
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldlen, __be16 newlen)
+{
+	/* Account for the length change first */
+	__wsum sum = ip_vs_check_diff2(oldlen, newlen,
+				       csum_unfold(tcph->check));
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		tcph->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6,
+							    newip->ip6, sum));
+		return;
+	}
+#endif
+	tcph->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, sum));
+}
+
+
+/*
+ * Source-NAT handler: set the TCP source port of @skb to the virtual port
+ * (cp->vport) and bring the TCP checksum in line with the daddr -> vaddr
+ * and port change.
+ *
+ * If an application helper is bound (cp->app), the checksum is verified
+ * through pp->csum_check (when set) and ip_vs_app_pkt_out() is called
+ * before the header is modified.
+ *
+ * Returns 1 on success; 0 if the skb cannot be made writable, the
+ * checksum check fails or the application helper returns 0.
+ */
+static int
+tcp_snat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct tcphdr *tcph;
+	unsigned int tcphoff;
+	int oldlen;
+	int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
+	/* TCP length before any mangling; used by the partial csum update */
+	oldlen = skb->len - tcphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		int ret;
+
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/* Call application helper if needed */
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
+	}
+
+	/* Fetch the header pointer only now, after all calls above */
+	tcph = (void *)skb_network_header(skb) + tcphoff;
+	tcph->source = cp->vport;
+
+	/* Adjust TCP checksums */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+					htons(oldlen),
+					htons(skb->len - tcphoff));
+	} else if (!payload_csum) {
+		/* Only port and addr are changed, do fast csum update */
+		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+				     cp->dport, cp->vport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		tcph->check = 0;
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+						      &cp->caddr.in6,
+						      skb->len - tcphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+							cp->caddr.ip,
+							skb->len - tcphoff,
+							cp->protocol,
+							skb->csum);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+			  pp->name, tcph->check,
+			  (char*)&(tcph->check) - (char*)tcph);
+	}
+	return 1;
+}
+
+
+/*
+ * Destination-NAT handler: set the TCP destination port of @skb to the
+ * real server port (cp->dport) and bring the TCP checksum in line with
+ * the vaddr -> daddr and port change.
+ *
+ * If an application helper is bound (cp->app), the checksum is verified
+ * through pp->csum_check (when set) and ip_vs_app_pkt_in() is called
+ * before the header is modified.
+ *
+ * Returns 1 on success; 0 if the skb cannot be made writable, the
+ * checksum check fails or the application helper returns 0.
+ */
+static int
+tcp_dnat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct tcphdr *tcph;
+	unsigned int tcphoff;
+	int oldlen;
+	int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
+	/* TCP length before any mangling; used by the partial csum update */
+	oldlen = skb->len - tcphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		int ret;
+
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/*
+		 *	Attempt ip_vs_app call.
+		 *	It will fix ip_vs_conn and iph ack_seq stuff
+		 */
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
+	}
+
+	/* Fetch the header pointer only now, after all calls above */
+	tcph = (void *)skb_network_header(skb) + tcphoff;
+	tcph->dest = cp->dport;
+
+	/*
+	 *	Adjust TCP checksums
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+					htons(oldlen),
+					htons(skb->len - tcphoff));
+	} else if (!payload_csum) {
+		/* Only port and addr are changed, do fast csum update */
+		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+				     cp->vport, cp->dport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		tcph->check = 0;
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+						      &cp->daddr.in6,
+						      skb->len - tcphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+							cp->daddr.ip,
+							skb->len - tcphoff,
+							cp->protocol,
+							skb->csum);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return 1;
+}
+
+
+/*
+ * Verify the TCP checksum of @skb.
+ *
+ * For CHECKSUM_NONE the checksum over the TCP segment is computed first;
+ * for CHECKSUM_COMPLETE the value already in skb->csum is used.  Any
+ * other ip_summed value is accepted without verification.
+ *
+ * Returns 1 if the checksum is good or needs no check, 0 on mismatch.
+ */
+static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		tcphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		tcphoff = ip_hdrlen(skb);
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_NONE:
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+		/* fall through: verify the sum that was just computed */
+	case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+		if (af == AF_INET6) {
+			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					    &ipv6_hdr(skb)->daddr,
+					    skb->len - tcphoff,
+					    ipv6_hdr(skb)->nexthdr,
+					    skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
+		} else
+#endif
+			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					      ip_hdr(skb)->daddr,
+					      skb->len - tcphoff,
+					      ip_hdr(skb)->protocol,
+					      skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
+		break;
+	default:
+		/* No need to checksum. */
+		break;
+	}
+
+	return 1;
+}
+
+
+/*
+ * Row offsets into the TCP state tables.  Each direction owns four
+ * consecutive rows (syn, fin, ack, rst); set_tcp_state() adds the index
+ * from tcp_state_idx() to one of these offsets.
+ */
+#define TCP_DIR_INPUT		0
+#define TCP_DIR_OUTPUT		4
+#define TCP_DIR_INPUT_ONLY	8
+
+/* Maps an IP_VS_DIR_* direction to its row offset. */
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
+	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
+	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
+	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ *	Timeout table[state]
+ */
+/*
+ * Built-in defaults, indexed by IP_VS_TCP_S_* state and given as
+ * multiples of HZ.  __ip_vs_tcp_init() copies them into the per-netns
+ * table; they are also the fallback when no protocol data is available.
+ */
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	2*HZ,
+	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
+	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
+	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
+	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
+	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
+	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
+	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
+	[IP_VS_TCP_S_LAST]		=	2*HZ,
+};
+
+/* Printable names for the IP_VS_TCP_S_* states, used by tcp_state_name(). */
+static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	"NONE",
+	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
+	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
+	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
+	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
+	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
+	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
+	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
+	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
+	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
+	[IP_VS_TCP_S_LAST]		=	"BUG!",
+};
+
+/* Short aliases that keep the state transition tables below readable. */
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+/* One table row: the next state for each current state. */
+struct tcp_states_t {
+	int next_state[IP_VS_TCP_S_LAST];
+};
+
+/*
+ * Return the printable name of a TCP connection state: "ERR!" for states
+ * at or beyond IP_VS_TCP_S_LAST, "?" for a state without a table entry.
+ */
+static const char * tcp_state_name(int state)
+{
+	const char *name;
+
+	if (state >= IP_VS_TCP_S_LAST)
+		return "ERR!";
+
+	name = tcp_state_name_table[state];
+	if (name)
+		return name;
+	return "?";
+}
+
+/*
+ * Default TCP state transition table.
+ *
+ * Rows are grouped per direction (INPUT, OUTPUT, INPUT-ONLY; see the
+ * TCP_DIR_* offsets), four rows each in the order syn, fin, ack, rst.
+ * Columns are the current state; each entry is the next state.
+ * Installed by __ip_vs_tcp_init() and by tcp_timeout_change() when bit 0
+ * of its flags argument is clear.
+ */
+static struct tcp_states_t tcp_states [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+/*
+ * Alternative TCP state transition table with the same layout as
+ * tcp_states.  tcp_timeout_change() installs it when bit 0 of its flags
+ * argument ("secure_tcp") is set.
+ */
+static struct tcp_states_t tcp_states_dos [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+/*
+ * timeout_change callback: pick the TCP state table for @pd.  Bit 0 of
+ * @flags (secure_tcp) selects tcp_states_dos, otherwise tcp_states.
+ */
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
+{
+	/*
+	** FIXME: change secure_tcp to independent sysctl var
+	** or make it per-service or per-app because it is valid
+	** for most if not for all of the applications. Something
+	** like "capabilities" (flags) for each object.
+	*/
+	if (flags & 1)		/* secure_tcp */
+		pd->tcp_state_table = tcp_states_dos;
+	else
+		pd->tcp_state_table = tcp_states;
+}
+
+/*
+ * Classify a TCP header into a state table row: 0 = syn, 1 = fin,
+ * 2 = ack, 3 = rst.  RST takes precedence over SYN, SYN over FIN and FIN
+ * over ACK.  Returns -1 if none of these flags is set.
+ */
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+	int idx;
+
+	if (th->rst)
+		idx = 3;
+	else if (th->syn)
+		idx = 0;
+	else if (th->fin)
+		idx = 1;
+	else if (th->ack)
+		idx = 2;
+	else
+		idx = -1;
+
+	return idx;
+}
+
+/*
+ * Advance the IPVS state of connection @cp for one TCP segment.
+ *
+ * The row of the state table is chosen from @direction (switched to the
+ * INPUT-ONLY rows while IP_VS_CONN_F_NOOUTPUT is set and the packet is not
+ * an output packet) plus the flag class from tcp_state_idx(); the column
+ * is the current state.  A segment with none of SYN/FIN/ACK/RST set moves
+ * the connection to IP_VS_TCP_S_CLOSE.  When the state changes, the
+ * destination's active/inactive counters are moved according to whether
+ * the new state is ESTABLISHED.  Finally state and timeout are updated.
+ *
+ * tcp_state_transition() calls this with cp->lock held.
+ *
+ * NOTE(review): pd is dereferenced (pd->tcp_state_table, pd->pp->name)
+ * before the likely(pd) test at the end, so the NULL fallback there looks
+ * unreachable without a crash - confirm whether pd can be NULL here.
+ */
+static inline void
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+	      int direction, struct tcphdr *th)
+{
+	int state_idx;
+	int new_state = IP_VS_TCP_S_CLOSE;
+	int state_off = tcp_state_off[direction];
+
+	/*
+	 *    Update state offset to INPUT_ONLY if necessary
+	 *    or delete NO_OUTPUT flag if output packet detected
+	 */
+	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+		if (state_off == TCP_DIR_OUTPUT)
+			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+		else
+			state_off = TCP_DIR_INPUT_ONLY;
+	}
+
+	/* No recognised flag: keep the default new_state (CLOSE) */
+	if ((state_idx = tcp_state_idx(th)) < 0) {
+		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+		goto tcp_state_out;
+	}
+
+	new_state =
+		pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+  tcp_state_out:
+	if (new_state != cp->state) {
+		struct ip_vs_dest *dest = cp->dest;
+
+		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+			      "%s:%d state: %s->%s conn->refcnt:%d\n",
+			      pd->pp->name,
+			      ((state_off == TCP_DIR_OUTPUT) ?
+			       "output " : "input "),
+			      th->syn ? 'S' : '.',
+			      th->fin ? 'F' : '.',
+			      th->ack ? 'A' : '.',
+			      th->rst ? 'R' : '.',
+			      IP_VS_DBG_ADDR(cp->af, &cp->daddr),
+			      ntohs(cp->dport),
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+			      ntohs(cp->cport),
+			      tcp_state_name(cp->state),
+			      tcp_state_name(new_state),
+			      atomic_read(&cp->refcnt));
+
+		if (dest) {
+			/* Keep the per-destination counters in step */
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+	}
+
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = new_state];
+	else	/* What to do ? */
+		cp->timeout = tcp_timeouts[cp->state = new_state];
+}
+
+/*
+ *	Handle state transitions
+ */
+static int
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_proto_data *pd)
+{
+	struct tcphdr _tcph, *th;
+	int ihl;
+
+	/* Locate the TCP header behind the IPv4 or (fixed) IPv6 header */
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET)
+		ihl = ip_hdrlen(skb);
+	else
+		ihl = sizeof(struct ipv6hdr);
+#else
+	ihl = ip_hdrlen(skb);
+#endif
+
+	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
+	if (!th)
+		return 0;
+
+	/* The state update runs under the connection lock */
+	spin_lock(&cp->lock);
+	set_tcp_state(pd, cp, direction, th);
+	spin_unlock(&cp->lock);
+
+	return 1;
+}
+
+/*
+ * Hash a port (kept in its stored byte order) into an index of the TCP
+ * application table.
+ */
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+	const u16 raw = (__force u16)port;
+
+	return ((raw >> TCP_APP_TAB_BITS) ^ raw) & TCP_APP_TAB_MASK;
+}
+
+
+/*
+ * Insert application incarnation @inc into the per-netns TCP application
+ * hash table and bump the protocol's application counter.
+ *
+ * Returns 0 on success, -EEXIST if an incarnation with the same port is
+ * already present.
+ */
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+	const __be16 port = inc->port;
+	struct list_head *bucket = &ipvs->tcp_apps[tcp_app_hashkey(port)];
+	struct ip_vs_app *cur;
+	int ret = 0;
+
+	spin_lock_bh(&ipvs->tcp_app_lock);
+	list_for_each_entry(cur, bucket, p_list) {
+		if (cur->port == port) {
+			ret = -EEXIST;
+			break;
+		}
+	}
+	if (!ret) {
+		list_add(&inc->p_list, bucket);
+		atomic_inc(&pd->appcnt);
+	}
+	spin_unlock_bh(&ipvs->tcp_app_lock);
+
+	return ret;
+}
+
+
+/*
+ * Remove application incarnation @inc from the TCP application table and
+ * drop the protocol's application counter, both under tcp_app_lock.
+ */
+static void
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	spin_lock_bh(&ipvs->tcp_app_lock);
+
+	atomic_dec(&pd->appcnt);
+	list_del(&inc->p_list);
+
+	spin_unlock_bh(&ipvs->tcp_app_lock);
+}
+
+
+/*
+ * Bind connection @cp to the application incarnation registered for its
+ * virtual port, if there is one.  Only NAT (masquerading) connections are
+ * bound.
+ *
+ * Returns the result of the incarnation's init_conn() hook when it has
+ * one, otherwise 0 (also when nothing was bound).
+ */
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	struct ip_vs_app *inc, *app = NULL;
+	int hash;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = tcp_app_hashkey(cp->vport);
+
+	spin_lock(&ipvs->tcp_app_lock);
+	list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
+		if (inc->port != cp->vport)
+			continue;
+		/* Stop at the first port match, even if taking it fails */
+		if (likely(ip_vs_app_inc_get(inc)))
+			app = inc;
+		break;
+	}
+	spin_unlock(&ipvs->tcp_app_lock);
+
+	if (!app)
+		return 0;
+
+	IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+		      "%s:%u to app %s on port %u\n",
+		      __func__,
+		      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+		      ntohs(cp->cport),
+		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+		      ntohs(cp->vport),
+		      app->name, ntohs(app->port));
+
+	cp->app = app;
+	if (!app->init_conn)
+		return 0;
+
+	return app->init_conn(app, cp);
+}
+
+
+/*
+ *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+{
+	struct ip_vs_proto_data *pd;
+
+	pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+	spin_lock(&cp->lock);
+	cp->state = IP_VS_TCP_S_LISTEN;
+	/* Prefer the per-netns timeout, fall back to the built-in default */
+	if (pd)
+		cp->timeout = pd->timeout_table[IP_VS_TCP_S_LISTEN];
+	else
+		cp->timeout = tcp_timeouts[IP_VS_TCP_S_LISTEN];
+	spin_unlock(&cp->lock);
+}
+
+/* ---------------------------------------------
+ *   timeouts is netns related now.
+ * ---------------------------------------------
+ */
+/*
+ * Per-netns TCP setup: initialise the application hash table and its
+ * lock, create this namespace's timeout table from tcp_timeouts and
+ * select the default state table.
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs;
+
+	ipvs = net_ipvs(net);
+	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->tcp_app_lock);
+
+	pd->timeout_table =
+		ip_vs_create_timeout_table((int *)tcp_timeouts,
+					   sizeof(tcp_timeouts));
+	pd->tcp_state_table = tcp_states;
+}
+
+/*
+ * Per-netns TCP teardown: free the timeout table that
+ * __ip_vs_tcp_init() stored in @pd.  @net is unused.
+ */
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+	int *table = pd->timeout_table;
+
+	kfree(table);
+}
+
+
+/*
+ * TCP protocol descriptor: metadata and callbacks for IPPROTO_TCP.
+ * There are no global init/exit hooks; the per-netns hooks set up and
+ * free the timeout table.  timeout_change switches between the normal
+ * and the secure_tcp state table.
+ */
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+	.name =			"TCP",
+	.protocol =		IPPROTO_TCP,
+	.num_states =		IP_VS_TCP_S_LAST,
+	.dont_defrag =		0,
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__ip_vs_tcp_init,
+	.exit_netns =		__ip_vs_tcp_exit,
+	.register_app =		tcp_register_app,
+	.unregister_app =	tcp_unregister_app,
+	.conn_schedule =	tcp_conn_schedule,
+	.conn_in_get =		ip_vs_conn_in_get_proto,
+	.conn_out_get =		ip_vs_conn_out_get_proto,
+	.snat_handler =		tcp_snat_handler,
+	.dnat_handler =		tcp_dnat_handler,
+	.csum_check =		tcp_csum_check,
+	.state_name =		tcp_state_name,
+	.state_transition =	tcp_state_transition,
+	.app_conn_bind =	tcp_app_conn_bind,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	tcp_timeout_change,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 00000000..f1282cbe
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,509 @@
+/*
+ * ip_vs_proto_udp.c:	UDP load balancing support for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *              Network name space (netns) aware.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
+
+#include <net/ip_vs.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+
+/*
+ * Schedule a new UDP connection for @skb.
+ *
+ * Looks up the virtual service matching the packet (address family,
+ * skb->mark, protocol, destination address and UDP destination port).
+ * If one exists, ip_vs_schedule() is asked to create a connection entry,
+ * which is handed back through @cpp.
+ *
+ * Returns 0 when the caller must stop and use *@verdict (UDP header not
+ * readable, overload drop, or scheduling produced no connection), and 1
+ * otherwise, including when no service matches.
+ */
+static int
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	struct net *net;
+	struct ip_vs_service *svc;
+	struct udphdr _udph, *uh;
+	struct ip_vs_iphdr iph;
+
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	/* _udph is a stack copy used if the header is not linear in the skb */
+	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
+	if (uh == NULL) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+	net = skb_net(skb);
+	svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+				&iph.daddr, uh->dest);
+	if (svc) {
+		int ignored;
+
+		if (ip_vs_todrop(net_ipvs(net))) {
+			/*
+			 * It seems that we are very loaded.
+			 * We have to drop this packet :(
+			 */
+			ip_vs_service_put(svc);
+			*verdict = NF_DROP;
+			return 0;
+		}
+
+		/*
+		 * Let the virtual server select a real server for the
+		 * incoming connection, and create a connection entry.
+		 */
+		*cpp = ip_vs_schedule(svc, skb, pd, &ignored);
+		if (!*cpp && ignored <= 0) {
+			/*
+			 * NOTE(review): the ip_vs_leave() path does not call
+			 * ip_vs_service_put() here; presumably ip_vs_leave()
+			 * releases svc itself - confirm.
+			 */
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pd);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
+			return 0;
+		}
+		ip_vs_service_put(svc);
+	}
+	/* NF_ACCEPT */
+	return 1;
+}
+
+
+/*
+ * Incrementally patch the UDP checksum in @uhdr for an address change
+ * (@oldip -> @newip) and a port change (@oldport -> @newport) without
+ * re-summing the payload.  The 16-byte address diff is used for AF_INET6
+ * (when CONFIG_IP_VS_IPV6 is set), the 4-byte one otherwise.
+ * A folded result of 0 is stored as CSUM_MANGLED_0.
+ */
+static inline void
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		uhdr->check =
+			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(uhdr->check))));
+	else
+#endif
+		uhdr->check =
+			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+					 ip_vs_check_diff2(oldport, newport,
+						~csum_unfold(uhdr->check))));
+	if (!uhdr->check)
+		uhdr->check = CSUM_MANGLED_0;
+}
+
+/*
+ * Incremental update used by the NAT handlers when skb->ip_summed is
+ * CHECKSUM_PARTIAL: fold the address change (@oldip -> @newip) and the
+ * length change (@oldlen -> @newlen) into uhdr->check.
+ * Here the stored value is the complement of the folded sum and the
+ * incoming check value is unfolded without being inverted.
+ */
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+		     const union nf_inet_addr *oldip,
+		     const union nf_inet_addr *newip,
+		     __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		uhdr->check =
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+					 ip_vs_check_diff2(oldlen, newlen,
+						csum_unfold(uhdr->check))));
+	else
+#endif
+	/* this assignment is the body of the "else" above when IPv6 is on */
+	uhdr->check =
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+				ip_vs_check_diff2(oldlen, newlen,
+						csum_unfold(uhdr->check))));
+}
+
+
+/*
+ * Source-NAT handler for a UDP packet of connection @cp: rewrites the
+ * UDP source port to cp->vport and fixes up the UDP checksum.
+ *
+ * If an application helper is bound (cp->app), the checksum is verified
+ * through pp->csum_check and the helper is run via ip_vs_app_pkt_out()
+ * before the header is modified.
+ *
+ * Returns 1 on success; 0 if the skb cannot be made writable, the
+ * checksum check fails or the helper returns 0.
+ */
+static int
+udp_snat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct udphdr *udph;
+	unsigned int udphoff;
+	int oldlen;
+	int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
+	/* UDP length before any helper changes the packet */
+	oldlen = skb->len - udphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		int ret;
+
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/*
+		 *	Call application helper if needed
+		 */
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
+	}
+
+	/* header pointer is computed only now, after the skb may have changed */
+	udph = (void *)skb_network_header(skb) + udphoff;
+	udph->source = cp->vport;
+
+	/*
+	 *	Adjust UDP checksums
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+					htons(oldlen),
+					htons(skb->len - udphoff));
+	} else if (!payload_csum && (udph->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+				     cp->dport, cp->vport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		udph->check = 0;
+		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+						      &cp->caddr.in6,
+						      skb->len - udphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+							cp->caddr.ip,
+							skb->len - udphoff,
+							cp->protocol,
+							skb->csum);
+		/* a computed 0 is stored as CSUM_MANGLED_0 */
+		if (udph->check == 0)
+			udph->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+			  pp->name, udph->check,
+			  (char*)&(udph->check) - (char*)udph);
+	}
+	return 1;
+}
+
+
+/*
+ * Destination-NAT handler for a UDP packet of connection @cp: rewrites
+ * the UDP destination port to cp->dport and fixes up the UDP checksum.
+ *
+ * If an application helper is bound (cp->app), the checksum is verified
+ * through pp->csum_check and the helper is run via ip_vs_app_pkt_in()
+ * before the header is modified.
+ *
+ * Returns 1 on success; 0 if the skb cannot be made writable, the
+ * checksum check fails or the helper returns 0.
+ */
+static int
+udp_dnat_handler(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct udphdr *udph;
+	unsigned int udphoff;
+	int oldlen;
+	int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
+	/* UDP length before any helper changes the packet */
+	oldlen = skb->len - udphoff;
+
+	/* csum_check requires unshared skb */
+	if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		int ret;
+
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+			return 0;
+
+		/*
+		 *	Attempt ip_vs_app call.
+		 *	It will fix ip_vs_conn
+		 */
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
+	}
+
+	/* header pointer is computed only now, after the skb may have changed */
+	udph = (void *)skb_network_header(skb) + udphoff;
+	udph->dest = cp->dport;
+
+	/*
+	 *	Adjust UDP checksums
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+					htons(oldlen),
+					htons(skb->len - udphoff));
+	} else if (!payload_csum && (udph->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+				     cp->vport, cp->dport);
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		udph->check = 0;
+		skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+		if (cp->af == AF_INET6)
+			udph->check = csum_ipv6_magic(&cp->caddr.in6,
+						      &cp->daddr.in6,
+						      skb->len - udphoff,
+						      cp->protocol, skb->csum);
+		else
+#endif
+			udph->check = csum_tcpudp_magic(cp->caddr.ip,
+							cp->daddr.ip,
+							skb->len - udphoff,
+							cp->protocol,
+							skb->csum);
+		/* a computed 0 is stored as CSUM_MANGLED_0 */
+		if (udph->check == 0)
+			udph->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return 1;
+}
+
+
+/*
+ * Verify the UDP checksum of @skb.
+ *
+ * Returns 1 when the checksum is acceptable or nothing is verified here
+ * (uh->check is 0, or skb->ip_summed is neither CHECKSUM_NONE nor
+ * CHECKSUM_COMPLETE); returns 0 when the UDP header cannot be read or
+ * the pseudo-header checksum does not come out as zero.
+ */
+static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	struct udphdr _udph, *uh;
+	unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		udphoff = sizeof(struct ipv6hdr);
+	else
+#endif
+		udphoff = ip_hdrlen(skb);
+
+	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+	if (uh == NULL)
+		return 0;
+
+	if (uh->check != 0) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_NONE:
+			skb->csum = skb_checksum(skb, udphoff,
+						 skb->len - udphoff, 0);
+			/* fall through: verify the sum just computed */
+		case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+			if (af == AF_INET6) {
+				if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						    &ipv6_hdr(skb)->daddr,
+						    skb->len - udphoff,
+						    ipv6_hdr(skb)->nexthdr,
+						    skb->csum)) {
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+							 "Failed checksum for");
+					return 0;
+				}
+			} else
+#endif
+				if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+						      ip_hdr(skb)->daddr,
+						      skb->len - udphoff,
+						      ip_hdr(skb)->protocol,
+						      skb->csum)) {
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+							 "Failed checksum for");
+					return 0;
+				}
+			break;
+		default:
+			/* No need to checksum. */
+			break;
+		}
+	}
+	return 1;
+}
+
+/* Fold a network-order port into a bucket index for the udp_apps[] table. */
+static inline __u16 udp_app_hashkey(__be16 port)
+{
+	u16 raw = (__force u16)port;
+
+	raw ^= raw >> UDP_APP_TAB_BITS;
+	return raw & UDP_APP_TAB_MASK;
+}
+
+
+/*
+ * Add application incarnation @inc to the per-netns UDP application
+ * table, keyed by its port.  Returns 0 on success or -EEXIST when an
+ * application is already registered on that port.
+ */
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+	__be16 port = inc->port;
+	__u16 hash = udp_app_hashkey(port);
+	struct ip_vs_app *cur;
+	int ret = 0;
+
+	spin_lock_bh(&ipvs->udp_app_lock);
+
+	/* refuse a second application on the same port */
+	list_for_each_entry(cur, &ipvs->udp_apps[hash], p_list) {
+		if (cur->port == port) {
+			ret = -EEXIST;
+			break;
+		}
+	}
+
+	if (!ret) {
+		list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+		atomic_inc(&pd->appcnt);
+	}
+
+	spin_unlock_bh(&ipvs->udp_app_lock);
+	return ret;
+}
+
+
+/*
+ * Remove application incarnation @inc from the per-netns UDP application
+ * table and decrement the UDP application counter, both under
+ * udp_app_lock.
+ *
+ * NOTE(review): pd is used without a NULL check here, while
+ * udp_state_transition() treats a missing pd as possible - confirm.
+ */
+static void
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	spin_lock_bh(&ipvs->udp_app_lock);
+	atomic_dec(&pd->appcnt);
+	list_del(&inc->p_list);
+	spin_unlock_bh(&ipvs->udp_app_lock);
+}
+
+
+/*
+ * Bind connection @cp to the UDP application registered on its virtual
+ * port, if any.  Only done when the forwarding method is
+ * IP_VS_CONN_F_MASQ.
+ *
+ * Returns 0 when nothing is bound; otherwise the result of the
+ * application's init_conn() hook (0 if it has none).
+ */
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+	int hash;
+	struct ip_vs_app *inc;
+	int result = 0;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = udp_app_hashkey(cp->vport);
+
+	spin_lock(&ipvs->udp_app_lock);
+	list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
+		if (inc->port == cp->vport) {
+			/* give up (leaving cp unbound) if the get fails */
+			if (unlikely(!ip_vs_app_inc_get(inc)))
+				break;
+			/* lock is dropped before init_conn() is called */
+			spin_unlock(&ipvs->udp_app_lock);
+
+			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+				      "%s:%u to app %s on port %u\n",
+				      __func__,
+				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+				      ntohs(cp->cport),
+				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+				      ntohs(cp->vport),
+				      inc->name, ntohs(inc->port));
+
+			cp->app = inc;
+			if (inc->init_conn)
+				result = inc->init_conn(inc, cp);
+			goto out;
+		}
+	}
+	spin_unlock(&ipvs->udp_app_lock);
+
+  out:
+	return result;
+}
+
+
+/*
+ * Default UDP timeouts, indexed by UDP state and expressed in multiples
+ * of HZ.  __udp_init() creates each namespace's timeout table from this
+ * template.
+ */
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
+	[IP_VS_UDP_S_LAST]		=	2*HZ,
+};
+
+/* Printable names for the UDP states, used by udp_state_name(). */
+static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+	[IP_VS_UDP_S_NORMAL]		=	"UDP",
+	[IP_VS_UDP_S_LAST]		=	"BUG!",
+};
+
+/*
+ * Return a printable name for UDP state @state.
+ *
+ * Any value outside [0, IP_VS_UDP_S_LAST) yields "ERR!"; negative values
+ * are rejected as well so they can never index the name table.  A valid
+ * state with no table entry yields "?".
+ */
+static const char * udp_state_name(int state)
+{
+	if (state < 0 || state >= IP_VS_UDP_S_LAST)
+		return "ERR!";
+	return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+/*
+ * UDP has a single working state: every packet just reloads the
+ * connection timeout from the per-netns table.  Returns 1 on success,
+ * 0 (after logging) when no protocol data was supplied.
+ */
+static int
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_proto_data *pd)
+{
+	if (likely(pd)) {
+		cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+		return 1;
+	}
+
+	pr_err("UDP no ns data\n");
+	return 0;
+}
+
+/*
+ * Per-netns UDP setup: initialises the udp_apps hash table and its lock
+ * in the namespace's IPVS data and gives @pd its own timeout table
+ * created from udp_timeouts[].
+ *
+ * NOTE(review): the result of ip_vs_create_timeout_table() is stored
+ * unchecked; udp_state_transition() indexes pd->timeout_table without
+ * testing it - confirm the helper cannot return NULL.
+ */
+static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->udp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+							sizeof(udp_timeouts));
+}
+
+/* Per-netns UDP teardown: free the timeout table set up by __udp_init. */
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+	kfree(pd->timeout_table);
+}
+
+
+/*
+ * UDP protocol descriptor: wires the UDP-specific handlers in this file
+ * into the generic ip_vs_protocol interface.  There is no global
+ * init/exit hook and no timeout_change hook; setup and teardown are done
+ * per network namespace.
+ */
+struct ip_vs_protocol ip_vs_protocol_udp = {
+	.name =			"UDP",
+	.protocol =		IPPROTO_UDP,
+	.num_states =		IP_VS_UDP_S_LAST,
+	.dont_defrag =		0,
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__udp_init,
+	.exit_netns =		__udp_exit,
+	.conn_schedule =	udp_conn_schedule,
+	.conn_in_get =		ip_vs_conn_in_get_proto,
+	.conn_out_get =		ip_vs_conn_out_get_proto,
+	.snat_handler =		udp_snat_handler,
+	.dnat_handler =		udp_dnat_handler,
+	.csum_check =		udp_csum_check,
+	.state_transition =	udp_state_transition,
+	.state_name =		udp_state_name,
+	.register_app =		udp_register_app,
+	.unregister_app =	udp_unregister_app,
+	.app_conn_bind =	udp_app_conn_bind,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
new file mode 100644
index 00000000..c49b388d
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,113 @@
+/*
+ * IPVS:        Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ *     Wensong Zhang            :     changed the ip_vs_rr_schedule to return dest
+ *     Julian Anastasov         :     fixed the NULL pointer access bug in debugging
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_rr_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Scheduler init hook: start the round-robin cursor (svc->sched_data) at
+ * the head of the service's destination list.  Always returns 0.
+ */
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+
+/*
+ * Scheduler update hook: reset the round-robin cursor to the head of the
+ * destination list, so it no longer refers to whatever entry it pointed
+ * at before the update.  Always returns 0.
+ */
+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ *
+ * svc->sched_data holds the list position of the previous pick (or the
+ * list head after init/update).  Starting with the entry after it, the
+ * circular destination list is walked at most once and the first
+ * destination that is not flagged IP_VS_DEST_F_OVERLOAD and has a
+ * weight > 0 is returned; its position becomes the new cursor.
+ * Returns NULL, after reporting through ip_vs_scheduler_err(), when a
+ * full lap finds no usable destination.
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct list_head *p, *q;
+	struct ip_vs_dest *dest;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* the cursor is read and updated under sched_lock */
+	write_lock(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	p = p->next;
+	q = p;
+	do {
+		/* skip list head */
+		if (q == &svc->destinations) {
+			q = q->next;
+			continue;
+		}
+
+		dest = list_entry(q, struct ip_vs_dest, n_list);
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0)
+			/* HIT */
+			goto out;
+		q = q->next;
+	} while (q != p);
+	write_unlock(&svc->sched_lock);
+	ip_vs_scheduler_err(svc, "no destination available");
+	return NULL;
+
+  out:
+	/* remember where we stopped so the next call starts after it */
+	svc->sched_data = q;
+	write_unlock(&svc->sched_lock);
+	IP_VS_DBG_BUF(6, "RR: server %s:%u "
+		      "activeconns %d refcnt %d weight %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->activeconns),
+		      atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+	return dest;
+}
+
+
+/*
+ * Round-robin scheduler descriptor.  It has init, update and schedule
+ * hooks but no done_service hook.
+ */
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+	.name =			"rr",			/* name */
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
+	.init_service =		ip_vs_rr_init_svc,
+	.update_service =	ip_vs_rr_update_svc,
+	.schedule =		ip_vs_rr_schedule,
+};
+
+/* Module load: register the "rr" scheduler; returns the registration result. */
+static int __init ip_vs_rr_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+/* Module unload: unregister the "rr" scheduler. */
+static void __exit ip_vs_rr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
new file mode 100644
index 00000000..08dbdd5b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -0,0 +1,260 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(ip_vs_scheduler_err);
+/*
+ *  IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* lock protecting the scheduler list (ip_vs_schedulers) */
+static DEFINE_SPINLOCK(ip_vs_sched_lock);
+
+
+/*
+ *  Bind a service with a scheduler
+ *
+ *  Runs the scheduler's init_service hook (if it has one) and records
+ *  the scheduler in svc->scheduler only when that hook succeeded.
+ *
+ *  Returns 0 on success, or the error returned by init_service, in
+ *  which case svc->scheduler is left untouched.
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+			 struct ip_vs_scheduler *scheduler)
+{
+	int ret;
+
+	if (scheduler->init_service) {
+		ret = scheduler->init_service(svc);
+		if (ret) {
+			pr_err("%s(): init error\n", __func__);
+			return ret;
+		}
+	}
+
+	/*
+	 * Publish the binding only after a successful init.  Were it set
+	 * earlier, a failed init would leave svc->scheduler pointing at
+	 * the scheduler, and a later ip_vs_unbind_scheduler() would run
+	 * done_service on scheduler data that was never set up.
+	 */
+	svc->scheduler = scheduler;
+
+	return 0;
+}
+
+
+/*
+ *  Detach a service from its scheduler.
+ *
+ *  Nothing happens (and 0 is returned) when no scheduler is bound.
+ *  Otherwise the scheduler's done_service hook, if present, is run;
+ *  if it reports a failure the binding is kept and -EINVAL is returned.
+ */
+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+{
+	struct ip_vs_scheduler *bound = svc->scheduler;
+
+	if (bound) {
+		if (bound->done_service && bound->done_service(svc)) {
+			pr_err("%s(): done error\n", __func__);
+			return -EINVAL;
+		}
+		svc->scheduler = NULL;
+	}
+
+	return 0;
+}
+
+
+/*
+ *  Get scheduler in the scheduler list by name
+ *
+ *  Walks ip_vs_schedulers under ip_vs_sched_lock.  For entries that have
+ *  a module, a module reference is taken before the name is compared and
+ *  dropped again on mismatch, so a returned scheduler still holds that
+ *  reference.  Returns NULL when no usable scheduler has this name.
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched;
+
+	IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
+
+	spin_lock_bh(&ip_vs_sched_lock);
+
+	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+		/*
+		 * Test and get the modules atomically
+		 */
+		if (sched->module && !try_module_get(sched->module)) {
+			/*
+			 * This scheduler is just deleted
+			 */
+			continue;
+		}
+		if (strcmp(sched_name, sched->name)==0) {
+			/* HIT */
+			spin_unlock_bh(&ip_vs_sched_lock);
+			return sched;
+		}
+		/* not the one we want: give the module reference back */
+		if (sched->module)
+			module_put(sched->module);
+	}
+
+	spin_unlock_bh(&ip_vs_sched_lock);
+	return NULL;
+}
+
+
+/*
+ *  Find a scheduler by name, requesting module "ip_vs_<name>" and
+ *  retrying once when the first lookup finds nothing.
+ *  Returns NULL if the scheduler is still not found.
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+	struct ip_vs_scheduler *found = ip_vs_sched_getbyname(sched_name);
+
+	if (found != NULL)
+		return found;
+
+	/* not registered yet: ask for the module, then look once more */
+	request_module("ip_vs_%s", sched_name);
+	return ip_vs_sched_getbyname(sched_name);
+}
+
+/*
+ * Drop the module reference held for @scheduler.  A NULL scheduler or
+ * one without a module is ignored.
+ */
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+	if (!scheduler || !scheduler->module)
+		return;
+
+	module_put(scheduler->module);
+}
+
+/*
+ * Common error output helper for schedulers
+ *
+ * Emits @msg through IP_VS_ERR_RL, prefixed with the scheduler name and
+ * an identification of @svc: its firewall mark when one is set,
+ * otherwise protocol, address and port (IPv6 form for AF_INET6 when
+ * CONFIG_IP_VS_IPV6 is enabled).
+ *
+ * svc->scheduler is dereferenced unconditionally, so the service must
+ * have a scheduler bound when this is called.
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+	if (svc->fwmark) {
+		IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+			     svc->scheduler->name, svc->fwmark,
+			     svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+	} else if (svc->af == AF_INET6) {
+		IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n",
+			     svc->scheduler->name,
+			     ip_vs_proto_name(svc->protocol),
+			     &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+	} else {
+		IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+			     svc->scheduler->name,
+			     ip_vs_proto_name(svc->protocol),
+			     &svc->addr.ip, ntohs(svc->port), msg);
+	}
+}
+
+/*
+ *  Register a scheduler in the scheduler list
+ *
+ *  Returns 0 on success.  Returns -EINVAL when @scheduler or its name is
+ *  NULL, when its list node is already linked, or when a scheduler with
+ *  the same name is already registered.  The use count taken with
+ *  ip_vs_use_count_inc() is dropped again on the two late failure paths.
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	struct ip_vs_scheduler *sched;
+
+	if (!scheduler) {
+		pr_err("%s(): NULL arg\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!scheduler->name) {
+		pr_err("%s(): NULL scheduler_name\n", __func__);
+		return -EINVAL;
+	}
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	spin_lock_bh(&ip_vs_sched_lock);
+
+	/* a non-empty node means this descriptor is already on a list */
+	if (!list_empty(&scheduler->n_list)) {
+		spin_unlock_bh(&ip_vs_sched_lock);
+		ip_vs_use_count_dec();
+		pr_err("%s(): [%s] scheduler already linked\n",
+		       __func__, scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *  Make sure that the scheduler with this name doesn't exist
+	 *  in the scheduler list.
+	 */
+	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+		if (strcmp(scheduler->name, sched->name) == 0) {
+			spin_unlock_bh(&ip_vs_sched_lock);
+			ip_vs_use_count_dec();
+			pr_err("%s(): [%s] scheduler already existed "
+			       "in the system\n", __func__, scheduler->name);
+			return -EINVAL;
+		}
+	}
+	/*
+	 *	Add it into the d-linked scheduler list
+	 */
+	list_add(&scheduler->n_list, &ip_vs_schedulers);
+	spin_unlock_bh(&ip_vs_sched_lock);
+
+	pr_info("[%s] scheduler registered.\n", scheduler->name);
+
+	return 0;
+}
+
+
+/*
+ *  Unregister a scheduler from the scheduler list
+ *
+ *  Returns 0 on success, -EINVAL when @scheduler is NULL or its list
+ *  node is empty (i.e. it is not linked into a list).
+ *
+ *  NOTE(review): list_del() is used rather than list_del_init(), so the
+ *  node would not look empty afterwards; re-registering the same
+ *  descriptor without re-initialising n_list would presumably be
+ *  rejected as "already linked" - confirm.
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	if (!scheduler) {
+		pr_err("%s(): NULL arg\n", __func__);
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&ip_vs_sched_lock);
+	if (list_empty(&scheduler->n_list)) {
+		spin_unlock_bh(&ip_vs_sched_lock);
+		pr_err("%s(): [%s] scheduler is not in the list. failed\n",
+		       __func__, scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Remove it from the d-linked scheduler list
+	 */
+	list_del(&scheduler->n_list);
+	spin_unlock_bh(&ip_vs_sched_lock);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] scheduler unregistered.\n", scheduler->name);
+
+	return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
new file mode 100644
index 00000000..89ead246
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,141 @@
+/*
+ * IPVS:        Shortest Expected Delay scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Cost of sending one more job to @dest: its active connection count
+ * plus one for the incoming job (the "Ci + 1" term described in the
+ * comment at the top of this file).
+ */
+static inline unsigned int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We only use the active connection number in the cost
+	 * calculation here.
+	 */
+	return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Shortest Expected Delay scheduling
+ *
+ *	Returns the destination with the smallest (overhead / weight)
+ *	among those not flagged IP_VS_DEST_F_OVERLOAD, or NULL (after
+ *	reporting through ip_vs_scheduler_err()) when no destination is
+ *	both not overloaded and of weight > 0.
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *	(server expected overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	/* first pass: find the first usable destination as the baseline */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_sed_dest_overhead(least);
+			goto nextstage;
+		}
+	}
+	ip_vs_scheduler_err(svc, "no destination available");
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	/* continues after the baseline entry; "dest" still points at it */
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_sed_dest_overhead(dest);
+		/* a weight of 0 makes the left side 0, so it never wins */
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "SED: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ * SED scheduler descriptor.  Only the schedule hook is provided; there
+ * are no init, done or update hooks.
+ */
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+	.name =			"sed",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+	.schedule =		ip_vs_sed_schedule,
+};
+
+
+/* Module load: register the "sed" scheduler; returns the registration result. */
+static int __init ip_vs_sed_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+/* Module unload: unregister the "sed" scheduler. */
+static void __exit ip_vs_sed_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
new file mode 100644
index 00000000..b5e2556c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,269 @@
+/*
+ * IPVS:        Source Hashing scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[src_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) or (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS SH bucket
+ *
+ *      One slot of the per-service source-hash table.  A slot with a
+ *      non-NULL dest accounts for one count in that dest's refcnt
+ *      (taken in ip_vs_sh_assign, dropped in ip_vs_sh_flush).
+ */
+struct ip_vs_sh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS SH entry hash table
+ *
+ *     The table has 2^IP_VS_SH_TAB_BITS buckets (8 bits unless
+ *     CONFIG_IP_VS_SH_TAB_BITS is already defined); IP_VS_SH_TAB_MASK
+ *     reduces a hash value to a bucket index.
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS        8
+#endif
+#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
+
+
+/*
+ *	Map an address to a bucket index of the SH table.
+ *	IPv4 uses the address itself; IPv6 (when CONFIG_IP_VS_IPV6 is set)
+ *	first XORs its four 32-bit words together.  The host-order value is
+ *	multiplied by 2654435761 and masked to the table size.
+ */
+static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 folded;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+	else
+#endif
+		folded = addr->ip;
+
+	return (ntohl(folded) * 2654435761UL) & IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ *      Look up the destination cached in the bucket that @addr hashes
+ *      to.  The result is NULL for an empty bucket.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
+	     const union nf_inet_addr *addr)
+{
+	unsigned slot = ip_vs_sh_hashkey(af, addr);
+
+	return tbl[slot].dest;
+}
+
+
+/*
+ *      Fill every bucket of @tbl from the service's destination list,
+ *      cycling through the list as often as needed.  Each bucket that
+ *      receives a destination bumps that destination's refcnt.  With an
+ *      empty list all buckets are set to NULL.  Always returns 0.
+ */
+static int
+ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+{
+	struct list_head *pos = &svc->destinations;
+	struct ip_vs_sh_bucket *slot;
+	struct ip_vs_dest *dest;
+
+	for (slot = tbl; slot < tbl + IP_VS_SH_TAB_SIZE; slot++) {
+		if (list_empty(pos)) {
+			slot->dest = NULL;
+			continue;
+		}
+
+		/* step over the list head when the walk wraps around */
+		if (pos == &svc->destinations)
+			pos = pos->next;
+
+		dest = list_entry(pos, struct ip_vs_dest, n_list);
+		atomic_inc(&dest->refcnt);
+		slot->dest = dest;
+
+		pos = pos->next;
+	}
+
+	return 0;
+}
+
+
+/*
+ *      Empty every bucket of @tbl, dropping the destination refcnt that
+ *      each occupied bucket was holding.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+{
+	struct ip_vs_sh_bucket *slot = tbl;
+	struct ip_vs_sh_bucket *end = tbl + IP_VS_SH_TAB_SIZE;
+
+	for (; slot != end; slot++) {
+		if (!slot->dest)
+			continue;
+
+		atomic_dec(&slot->dest->refcnt);
+		slot->dest = NULL;
+	}
+}
+
+
+/*
+ * Scheduler init hook: allocate the SH bucket table for @svc, store it
+ * in svc->sched_data and populate it from the current destinations.
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated.
+ */
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+	const size_t tbl_bytes =
+		sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE;
+	struct ip_vs_sh_bucket *tbl = kmalloc(tbl_bytes, GFP_ATOMIC);
+
+	if (!tbl) {
+		pr_err("%s(): no memory\n", __func__);
+		return -ENOMEM;
+	}
+
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  tbl_bytes);
+
+	/* spread the service's destinations over the new table */
+	ip_vs_sh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ * Scheduler done hook: release the references held by the bucket table
+ * of @svc, then free the table itself.  Always returns 0.
+ */
+static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+	/* drop per-bucket destination references before freeing */
+	ip_vs_sh_flush(tbl);
+	kfree(tbl);
+
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+	return 0;
+}
+
+
+/*
+ * Scheduler update hook: rebuild the bucket table of @svc in place by
+ * flushing it and assigning the buckets again from the current
+ * destination list.  Always returns 0.
+ */
+static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_sh_flush(tbl);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_sh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ *
+ *      Returns the masked flag value, i.e. non-zero (not necessarily 1)
+ *      when overloaded and 0 otherwise.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Source Hashing scheduling
+ *
+ *      Picks the destination cached in the bucket that the packet's
+ *      source address hashes to.  Returns NULL (after reporting through
+ *      ip_vs_scheduler_err()) when the bucket is empty or its
+ *      destination is not available, has weight <= 0 or is overloaded;
+ *      no other bucket is tried.
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_sh_bucket *tbl;
+	struct ip_vs_iphdr iph;
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
+	dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
+	if (!dest
+	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+	    || atomic_read(&dest->weight) <= 0
+	    || is_overloaded(dest)) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr),
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS SH Scheduler structure
+ *
+ *      Provides all four hooks: the bucket table is allocated in
+ *      init_service, rebuilt in update_service and freed in
+ *      done_service.
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+	.name =			"sh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list	 =		LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+	.init_service =		ip_vs_sh_init_svc,
+	.done_service =		ip_vs_sh_done_svc,
+	.update_service =	ip_vs_sh_update_svc,
+	.schedule =		ip_vs_sh_schedule,
+};
+
+
+/* Module load: register the "sh" scheduler; returns the registration result. */
+static int __init ip_vs_sh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+/* Module unload: unregister the "sh" scheduler. */
+static void __exit ip_vs_sh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
new file mode 100644
index 00000000..e292e5bd
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,1700 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version 1,   is capable of handling both version 0 and 1 messages.
+ *              Version 0 is the plain old format.
+ *              Note Version 0 receivers will just drop Ver 1 messages.
+ *              Version 1 is capable of handling IPv6, Persistence data,
+ *              time-outs, and firewall marks.
+ *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions  Message: is a complete datagram
+ *              Sync_conn: is a part of a Message
+ *              Param Data is an option to a Sync_conn.
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync:  sync connection info from master load balancer to backups
+ *              through multicast
+ *
+ * Changes:
+ *	Alexandre Cassen	:	Added master & backup support at a time.
+ *	Alexandre Cassen	:	Added SyncID support for incoming sync
+ *					messages filtering.
+ *	Justin Ossevoort	:	Fix endian problem on sync message size.
+ *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
+ *					Persistence support, fwmark and time-out.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h>                 /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT  8848          /* multicast port */
+
+#define SYNC_PROTO_VER  1		/* Protocol version in header */
+
+/*
+ *	IPVS sync connection entry (fixed part, IPv4 addresses only)
+ *	Version 0, i.e. original version.
+ */
+struct ip_vs_sync_conn_v0 {
+	__u8			reserved;	/* written as zero by the sender */
+
+	/* Protocol, addresses and port numbers */
+	__u8			protocol;       /* Which protocol (TCP/UDP) */
+	__be16			cport;		/* client port */
+	__be16                  vport;		/* virtual port */
+	__be16                  dport;		/* destination port */
+	__be32                  caddr;          /* client address */
+	__be32                  vaddr;          /* virtual address */
+	__be32                  daddr;          /* destination address */
+
+	/* Flags and state transition */
+	__be16                  flags;          /* status flags */
+	__be16                  state;          /* state info */
+
+	/* The sequence options (ip_vs_sync_conn_options) start here */
+};
+
+struct ip_vs_sync_conn_options {	/* optional seq. data after an entry */
+	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
+	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+};
+
+/*
+     Sync Connection format (sync_conn)
+
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |    Type       |    Protocol   | Ver.  |        Size           |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                             Flags                             |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |            State              |         cport                 |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |            vport              |         dport                 |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                             fwmark                            |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                             timeout  (in sec.)                |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                              ...                              |
+      |                        IP-Addresses  (v4 or v6)               |
+      |                              ...                              |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  Optional Parameters.
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      | Param. Type    | Param. Length |   Param. data                |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
+      |                              ...                              |
+      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                               | Param Type    | Param. Length |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                           Param  data                         |
+      |         Last Param data should be padded for 32 bit alignment |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ *  Type 0 (STYPE_F_INET6 clear), IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+	__u8			type;		/* 0 for this IPv4 layout */
+	__u8			protocol;	/* Which protocol (TCP/UDP) */
+	__be16			ver_size;	/* Version: 4 msb, size: low 12 bits */
+	/* Flags and state transition */
+	__be32			flags;		/* status flags */
+	__be16			state;		/* state info 	*/
+	/* Protocol, addresses and port numbers */
+	__be16			cport;		/* client port */
+	__be16			vport;		/* virtual port */
+	__be16			dport;		/* destination port */
+	__be32			fwmark;		/* Firewall mark from skb */
+	__be32			timeout;	/* cp timeout (in sec.) */
+	__be32			caddr;		/* client address */
+	__be32			vaddr;		/* virtual address */
+	__be32			daddr;		/* destination address */
+	/* The sequence options start here */
+	/* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * IPv6 sync connection format: type has the STYPE_F_INET6 bit set
+ */
+struct ip_vs_sync_v6 {
+	__u8			type;		/* STYPE_F_INET6 for this layout */
+	__u8			protocol;	/* Which protocol (TCP/UDP) */
+	__be16			ver_size;	/* Version: 4 msb, size: low 12 bits */
+	/* Flags and state transition */
+	__be32			flags;		/* status flags */
+	__be16			state;		/* state info 	*/
+	/* Protocol, addresses and port numbers */
+	__be16			cport;		/* client port */
+	__be16			vport;		/* virtual port */
+	__be16			dport;		/* destination port */
+	__be32			fwmark;		/* Firewall mark from skb */
+	__be32			timeout;	/* cp timeout (in sec.) */
+	struct in6_addr		caddr;		/* client address */
+	struct in6_addr		vaddr;		/* virtual address */
+	struct in6_addr		daddr;		/* destination address */
+	/* The sequence options start here */
+	/* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {		/* one Version 1 entry; type selects the view */
+	struct ip_vs_sync_v4	v4;
+	struct ip_vs_sync_v6	v6;
+};
+
+/* Bits in the Type field of ip_vs_sync_v4 / ip_vs_sync_v6 */
+#define STYPE_INET6		0		/* bit number */
+#define STYPE_F_INET6		(1 << STYPE_INET6)	/* entry uses the v6 layout */
+
+#define SVER_SHIFT		12		/* Shift to get version */
+#define SVER_MASK		0x0fff		/* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA	1		/* Param. Type: seq. options */
+#define IPVS_OPT_PE_DATA	2		/* Param. Type: PE data */
+#define IPVS_OPT_PE_NAME	3		/* Param. Type: PE name */
+#define IPVS_OPT_PARAM		7		/* flag bit no.: unknown type may be skipped */
+
+#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))	/* "seen" bit */
+#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))	/* "seen" bit */
+#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))	/* "seen" bit */
+#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))	/* masked off Type */
+
+struct ip_vs_sync_thread_data {	/* presumably per sync thread - TODO confirm */
+	struct net *net;
+	struct socket *sock;
+	char *buf;
+};
+
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
+#define FULL_CONN_SIZE  \
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+  The master multicasts messages (Datagrams) to the backup load balancers
+  in the following format.
+
+ Version 1:
+  Note, first byte should be Zero, so ver 0 receivers will drop the packet.
+
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |      0        |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    Version    |    Reserved, set to Zero      |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (1)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                            .                                  |
+      ~                            .                                  ~
+      |                            .                                  |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (n)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                    IPVS Sync Connection (1)                   |
+*/
+
+#define SYNC_MESG_HEADER_LEN	4
+#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
+	__u8                    nr_conns;	/* entries in this message */
+	__u8                    syncid;		/* sender's master_syncid */
+	__u16                   size;		/* header plus entries, in bytes */
+
+	/* ip_vs_sync_conn entries start here */
+};
+
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+	__u8			reserved;	/* must be zero */
+	__u8			syncid;		/* sender's master_syncid */
+	__u16			size;		/* header plus entries, in bytes */
+	__u8			nr_conns;	/* entries in this message */
+	__s8			version;	/* SYNC_PROTO_VER  */
+	__u16			spare;		/* zero; checked on receive */
+	/* ip_vs_sync_conn entries start here */
+};
+
+struct ip_vs_sync_buff {
+	struct list_head        list;		/* link in ipvs->sync_queue */
+	unsigned long           firstuse;	/* jiffies when created */
+
+	/* pointers for the message data */
+	struct ip_vs_sync_mesg  *mesg;		/* start of message (header) */
+	unsigned char           *head;		/* where the next entry goes */
+	unsigned char           *end;		/* mesg + send_mesg_maxlen */
+};
+
+/* multicast addr: IP_VS_SYNC_GROUP and IP_VS_SYNC_PORT in network order */
+static struct sockaddr_in mcast_addr = {
+	.sin_family		= AF_INET,
+	.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT),
+	.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
+};
+
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order (no, source) to aligned host order (ho, dest)
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+	ho->init_seq       = get_unaligned_be32(&no->init_seq);
+	ho->delta          = get_unaligned_be32(&no->delta);
+	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From aligned host order (ho, source) to unaligned network order (no, dest)
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+	put_unaligned_be32(ho->init_seq, &no->init_seq);
+	put_unaligned_be32(ho->delta, &no->delta);
+	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
+/* Detach and return the first queued sync buffer; NULL when queue is empty. */
+static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *first = NULL;
+
+	spin_lock_bh(&ipvs->sync_lock);
+	if (!list_empty(&ipvs->sync_queue)) {
+		/* unlink the head entry while sync_lock is held */
+		first = list_entry(ipvs->sync_queue.next,
+				   struct ip_vs_sync_buff,
+				   list);
+		list_del(&first->list);
+	}
+	spin_unlock_bh(&ipvs->sync_lock);
+
+	return first;
+}
+
+/*
+ * Allocate an empty sync buffer for the Version 1 proto; NULL on failure.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *sb = kmalloc(sizeof(*sb), GFP_ATOMIC);
+	struct ip_vs_sync_mesg *hdr;
+
+	if (!sb)
+		return NULL;
+	hdr = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	if (!hdr) {
+		kfree(sb);
+		return NULL;
+	}
+	hdr->reserved = 0;	/* old nr_conns slot, must be zero now */
+	hdr->version = SYNC_PROTO_VER;
+	hdr->syncid = ipvs->master_syncid;
+	hdr->size = sizeof(*hdr);	/* header only, no conns yet */
+	hdr->nr_conns = 0;
+	hdr->spare = 0;
+	sb->mesg = hdr;
+	sb->head = (unsigned char *)hdr + sizeof(*hdr);
+	sb->end = (unsigned char *)hdr + ipvs->send_mesg_maxlen;
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+	kfree(sb->mesg);	/* the message area first ... */
+	kfree(sb);		/* ... then the descriptor pointing at it */
+}
+
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *curr = ipvs->sync_buff;
+	/* Queue the current buffer while master, otherwise just free it */
+	spin_lock(&ipvs->sync_lock);
+	if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
+		ip_vs_sync_buff_release(curr);
+	else
+		list_add_tail(&curr->list, &ipvs->sync_queue);
+	spin_unlock(&ipvs->sync_lock);
+}
+
+/*
+ *	Detach and return the current sync buffer if it has existed for at
+ *	least the specified time (or that time is zero); otherwise NULL.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
+{
+	struct ip_vs_sync_buff *taken = NULL;
+
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (ipvs->sync_buff &&
+	    time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
+		/* the caller now owns it; sync_buff is left empty */
+		taken = ipvs->sync_buff;
+		ipvs->sync_buff = NULL;
+	}
+	spin_unlock_bh(&ipvs->sync_buff_lock);
+	return taken;
+}
+
+/*
+ * Switch mode from sending version 0 or 1
+ *  - the pending sync_buff is queued (if it holds data) or freed
+ */
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_buff *sb;
+
+	/* sync_buff may only be inspected under sync_buff_lock */
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
+		goto unlock;
+	sb = ipvs->sync_buff;
+	if (mode == sysctl_sync_ver(ipvs) || !sb)
+		goto unlock;
+	/* Detach first: a queued or freed buffer must not stay current */
+	ipvs->sync_buff = NULL;
+	spin_lock_bh(&ipvs->sync_lock);
+	/* Empty buffer or no longer master: free descriptor and message */
+	if (sb->mesg->size > sizeof(struct ip_vs_sync_mesg) &&
+	    (ipvs->sync_state & IP_VS_STATE_MASTER))
+		list_add_tail(&sb->list, &ipvs->sync_queue);
+	else
+		ip_vs_sync_buff_release(sb);
+	spin_unlock_bh(&ipvs->sync_lock);
+unlock:
+	spin_unlock_bh(&ipvs->sync_buff_lock);
+}
+
+/*
+ * Allocate an empty sync buffer for the Version 0 proto; NULL on failure.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *sb = kmalloc(sizeof(*sb), GFP_ATOMIC);
+	struct ip_vs_sync_mesg_v0 *hdr;
+
+	if (!sb)
+		return NULL;
+	hdr = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	if (!hdr) {
+		kfree(sb);
+		return NULL;
+	}
+	/* the buffer starts out as a bare v0 header with no conns */
+	hdr->nr_conns = 0;
+	hdr->syncid = ipvs->master_syncid;
+	hdr->size = sizeof(*hdr);
+	sb->mesg = (struct ip_vs_sync_mesg *)hdr;
+	sb->head = (unsigned char *)hdr + sizeof(*hdr);
+	sb->end = (unsigned char *)hdr + ipvs->send_mesg_maxlen;
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+/*
+ *      Version 0 , could be switched in by sys_ctl.
+ *      Appends one IPv4 ip_vs_conn entry to the current sync_buff.
+ */
+void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_mesg_v0 *hdr;
+	struct ip_vs_sync_conn_v0 *entry;
+	struct ip_vs_sync_buff *sb;
+	int len;
+
+	if (unlikely(cp->af != AF_INET))
+		return;
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return;
+
+	spin_lock(&ipvs->sync_buff_lock);
+	sb = ipvs->sync_buff;
+	if (!sb) {
+		sb = ip_vs_sync_buff_create_v0(ipvs);
+		ipvs->sync_buff = sb;
+		if (!sb) {
+			spin_unlock(&ipvs->sync_buff_lock);
+			pr_err("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+	}
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+		len = FULL_CONN_SIZE;
+	else
+		len = SIMPLE_CONN_SIZE;
+	hdr = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+	entry = (struct ip_vs_sync_conn_v0 *)sb->head;
+
+	/* copy members */
+	entry->reserved = 0;
+	entry->protocol = cp->protocol;
+	entry->cport = cp->cport;
+	entry->vport = cp->vport;
+	entry->dport = cp->dport;
+	entry->caddr = cp->caddr.ip;
+	entry->vaddr = cp->vaddr.ip;
+	entry->daddr = cp->daddr.ip;
+	entry->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+	entry->state = htons(cp->state);
+	/* sequence options, when present, directly follow the fixed part */
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+		memcpy(&entry[1], &cp->in_seq,
+		       sizeof(struct ip_vs_sync_conn_options));
+	hdr->nr_conns++;
+	hdr->size += len;
+	sb->head += len;
+
+	/* check if there is a space for next one */
+	if (sb->head + FULL_CONN_SIZE > sb->end) {
+		sb_queue_tail(ipvs);
+		ipvs->sync_buff = NULL;
+	}
+	spin_unlock(&ipvs->sync_buff_lock);
+
+	/* synchronize its controller if it has */
+	if (cp->control)
+		ip_vs_sync_conn(net, cp->control);
+}
+
+/*
+ *      Add an ip_vs_conn (and then its controlling connections) to the
+ *      current sync_buff as Version 1 sync_conn entries. Called by ip_vs_in.
+ *      Hands over to ip_vs_sync_conn_v0() when sysctl_sync_ver() is 0.
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_mesg *m;
+	union ip_vs_sync_conn *s;
+	__u8 *p;
+	unsigned int len, pe_name_len, pad;
+
+	/* Handle old version of the protocol */
+	if (sysctl_sync_ver(ipvs) == 0) {
+		ip_vs_sync_conn_v0(net, cp);
+		return;
+	}
+	/* Do not sync ONE PACKET */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		goto control;
+sloop:
+	/* Sanity checks */
+	pe_name_len = 0;
+	if (cp->pe_data_len) {
+		if (!cp->pe_data || !cp->dest) {
+			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+			return;
+		}
+		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+	}
+
+	spin_lock(&ipvs->sync_buff_lock);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		len = sizeof(struct ip_vs_sync_v6);
+	else
+#endif
+		len = sizeof(struct ip_vs_sync_v4);
+
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+		len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+	if (cp->pe_data_len)
+		len += cp->pe_data_len + 2;	/* + Param hdr field */
+	if (pe_name_len)
+		len += pe_name_len + 2;
+
+	/* check if there is a space for this one  */
+	pad = 0;
+	if (ipvs->sync_buff) {
+		pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
+		if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
+			sb_queue_tail(ipvs);
+			ipvs->sync_buff = NULL;
+			pad = 0;
+		}
+	}
+
+	if (!ipvs->sync_buff) {
+		ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
+		if (!ipvs->sync_buff) {
+			spin_unlock(&ipvs->sync_buff_lock);
+			pr_err("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+	}
+
+	m = ipvs->sync_buff->mesg;
+	p = ipvs->sync_buff->head;
+	ipvs->sync_buff->head += pad + len;
+	m->size += pad + len;
+	/* Add ev. padding from prev. sync_conn */
+	while (pad--)
+		*(p++) = 0;
+
+	s = (union ip_vs_sync_conn *)p;
+
+	/* Set message type  & copy members */
+	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
+	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->v4.state = htons(cp->state);
+	s->v4.protocol = cp->protocol;
+	s->v4.cport = cp->cport;
+	s->v4.vport = cp->vport;
+	s->v4.dport = cp->dport;
+	s->v4.fwmark = htonl(cp->fwmark);
+	s->v4.timeout = htonl(cp->timeout / HZ);
+	m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6) {
+		p += sizeof(struct ip_vs_sync_v6);
+		ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
+		ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
+		ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+	} else
+#endif
+	{
+		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
+		s->v4.caddr = cp->caddr.ip;
+		s->v4.vaddr = cp->vaddr.ip;
+		s->v4.daddr = cp->daddr.ip;
+	}
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		*(p++) = IPVS_OPT_SEQ_DATA;
+		*(p++) = sizeof(struct ip_vs_sync_conn_options);
+		hton_seq(&cp->in_seq, (struct ip_vs_seq *)p);	/* conn -> msg */
+		p += sizeof(struct ip_vs_seq);
+		hton_seq(&cp->out_seq, (struct ip_vs_seq *)p);	/* conn -> msg */
+		p += sizeof(struct ip_vs_seq);
+	}
+	/* Handle pe data */
+	if (cp->pe_data_len && cp->pe_data) {
+		*(p++) = IPVS_OPT_PE_DATA;
+		*(p++) = cp->pe_data_len;
+		memcpy(p, cp->pe_data, cp->pe_data_len);
+		p += cp->pe_data_len;
+		if (pe_name_len) {
+			/* Add PE_NAME */
+			*(p++) = IPVS_OPT_PE_NAME;
+			*(p++) = pe_name_len;
+			memcpy(p, cp->pe->name, pe_name_len);
+			p += pe_name_len;
+		}
+	}
+
+	spin_unlock(&ipvs->sync_buff_lock);
+
+control:
+	/* synchronize its controller if it has */
+	cp = cp->control;
+	if (!cp)
+		return;
+	/*
+	 * Reduce sync rate for templates
+	 * i.e only increment in_pkts for Templates.
+	 */
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+		int pkts = atomic_add_return(1, &cp->in_pkts);
+
+		if (pkts % sysctl_sync_period(ipvs) != 1)
+			return;
+	}
+	goto sloop;
+}
+
+/*
+ *  fill_param used by version 1. Returns 0 on success, 1 when the PE
+ *  name is missing or unknown, -ENOMEM when pe_data cannot be copied.
+ */
+static inline int
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+			   struct ip_vs_conn_param *p,
+			   __u8 *pe_data, unsigned int pe_data_len,
+			   __u8 *pe_name, unsigned int pe_name_len)
+{
+	char name[IP_VS_PENAME_MAXLEN+1];
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+				      (const union nf_inet_addr *)&sc->v6.caddr,
+				      sc->v6.cport,
+				      (const union nf_inet_addr *)&sc->v6.vaddr,
+				      sc->v6.vport, p);
+	else
+#endif
+		ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+				      (const union nf_inet_addr *)&sc->v4.caddr,
+				      sc->v4.cport,
+				      (const union nf_inet_addr *)&sc->v4.vaddr,
+				      sc->v4.vport, p);
+	/* Without pe data there is nothing more to set up */
+	if (!pe_data_len)
+		return 0;
+	if (!pe_name_len) {
+		IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+		return 1;
+	}
+	/* pe_name has no terminator: copy and terminate before the lookup */
+	memcpy(name, pe_name, pe_name_len);
+	name[pe_name_len] = 0;
+	p->pe = __ip_vs_pe_getbyname(name);
+	if (!p->pe) {
+		IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", name);
+		return 1;
+	}
+
+	p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
+	if (!p->pe_data) {
+		if (p->pe->module)
+			module_put(p->pe->module);
+		return -ENOMEM;
+	}
+	p->pe_data_len = pe_data_len;
+	return 0;
+}
+
+/*
+ *  Connection Add / Update.
+ *  Common for version 0 and 1 reception of backup sync_conns.
+ *  Param: param identifies the connection; its pe_data is freed here
+ *         unless a new connection is created from it. timeout is in sec.
+ */
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+			    unsigned int flags, unsigned int state,
+			    unsigned int protocol, unsigned int type,
+			    const union nf_inet_addr *daddr, __be16 dport,
+			    unsigned long timeout, __u32 fwmark,
+			    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE))
+		cp = ip_vs_conn_in_get(param);
+	else
+		cp = ip_vs_ct_in_get(param);
+
+	if (cp && param->pe_data) 	/* Free pe_data */
+		kfree(param->pe_data);
+	if (!cp) {
+		/*
+		 * Find the appropriate destination for the connection.
+		 * If it is not found the connection will remain unbound
+		 * but still handled.
+		 */
+		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+				       param->vport, protocol, fwmark);
+
+		/*  Set the appropriate activity flag */
+		if (protocol == IPPROTO_TCP) {
+			if (state != IP_VS_TCP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		} else if (protocol == IPPROTO_SCTP) {
+			if (state != IP_VS_SCTP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+		if (!cp) {
+			if (param->pe_data)
+				kfree(param->pe_data);
+			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+			return;
+		}
+	} else if (!cp->dest) {
+		dest = ip_vs_try_bind_dest(cp);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+		(cp->state != state)) {
+		/* update active/inactive flag for the connection */
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			(state != IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags |= IP_VS_CONN_F_INACTIVE;
+		} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			(state == IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_inc(&dest->activeconns);
+			atomic_dec(&dest->inactconns);
+			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+		(cp->state != state)) {
+		dest = cp->dest;	/* SCTP: only active -> inactive handled */
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+		(state != IP_VS_SCTP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags |= IP_VS_CONN_F_INACTIVE;	/* match counters */
+		}
+	}
+
+	if (opt)
+		memcpy(&cp->in_seq, opt, sizeof(*opt));
+	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+	cp->state = state;
+	cp->old_state = cp->state;
+	/*
+	 * For Ver 0 messages style
+	 *  - Not possible to recover the right timeout for templates
+	 *  - can not find the right fwmark
+	 *    virtual service. If needed, we can do it for
+	 *    non-fwmark persistent services.
+	 * Ver 1 messages style.
+	 *  - No problem.
+	 */
+	if (timeout) {
+		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+		cp->timeout = timeout*HZ;
+	} else {
+		struct ip_vs_proto_data *pd;
+
+		pd = ip_vs_proto_data_get(net, protocol);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+			cp->timeout = pd->timeout_table[state];
+		else
+			cp->timeout = (3*60*HZ);
+	}
+	ip_vs_conn_put(cp);
+}
+
+/*
+ *  Process received multicast message for Version 0 (IPv4 entries only)
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+				     const size_t buflen)
+{
+	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+	struct ip_vs_sync_conn_v0 *s;
+	struct ip_vs_sync_conn_options *opt;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn_param param;
+	char *p;
+	int i;
+
+	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);	/* 1st entry */
+	for (i=0; i<m->nr_conns; i++) {
+		unsigned flags, state;
+
+		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
+			return;
+		}
+		s = (struct ip_vs_sync_conn_v0 *) p;
+		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+		flags &= ~IP_VS_CONN_F_HASHED;
+		if (flags & IP_VS_CONN_F_SEQ_MASK) {
+			opt = (struct ip_vs_sync_conn_options *)&s[1];
+			p += FULL_CONN_SIZE;	/* entry carries seq. options */
+			if (p > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
+				return;
+			}
+		} else {
+			opt = NULL;
+			p += SIMPLE_CONN_SIZE;
+		}
+
+		state = ntohs(s->state);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+			pp = ip_vs_proto_get(s->protocol);
+			if (!pp) {
+				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
+					s->protocol);
+				continue;	/* skip this entry only */
+			}
+			if (state >= pp->num_states) {
+				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
+					pp->name, state);
+				continue;	/* skip this entry only */
+			}
+		} else {
+			/* protocol in templates is not used for state/timeout */
+			if (state > 0) {
+				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
+					state);
+				state = 0;
+			}
+		}
+
+		ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+				      (const union nf_inet_addr *)&s->caddr,
+				      s->cport,
+				      (const union nf_inet_addr *)&s->vaddr,
+				      s->vport, &param);
+
+		/* Send timeout and fwmark as Zero: Version 0 carries neither */
+		ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+				(union nf_inet_addr *)&s->daddr, s->dport,
+				0, 0, opt);
+	}
+}
+
+/*
+ * Handle the seq. data option: check length and duplicates, then convert
+ * both sequence structs to host order in *opt. Returns 0 or -EINVAL.
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+				    __u32 *opt_flags,
+				    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_sync_conn_options *wire;
+
+	if (plen != sizeof(*wire)) {
+		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+		return -EINVAL;
+	}
+	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+		return -EINVAL;
+	}
+	wire = (struct ip_vs_sync_conn_options *)p;
+	ntoh_seq(&wire->in_seq, &opt->in_seq);
+	ntoh_seq(&wire->out_seq, &opt->out_seq);
+	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
+	return 0;
+}
+
+/* Record a data option (pointer and length); reject oversize or duplicate. */
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+			  __u8 **data, unsigned int maxlen,
+			  __u32 *opt_flags, __u32 flag)
+{
+	if (plen > maxlen) {
+		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+		return -EINVAL;
+	} else if (*opt_flags & flag) {
+		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+		return -EINVAL;
+	}
+	*opt_flags |= flag;
+	*data = p;
+	*data_len = plen;
+	return 0;
+}
+/*
+ *   Process a Version 1 sync. connection: returns 0, <0 malformed, >0 skipped
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+	struct ip_vs_sync_conn_options opt;
+	union  ip_vs_sync_conn *s;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn_param param;
+	__u32 flags;
+	unsigned int af, state, pe_data_len=0, pe_name_len=0;
+	__u8 *pe_data=NULL, *pe_name=NULL;
+	__u32 opt_flags=0;
+	int retc=0;
+
+	s = (union ip_vs_sync_conn *) p;
+
+	if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+		af = AF_INET6;
+		p += sizeof(struct ip_vs_sync_v6);
+#else
+		IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+		retc = 10;
+		goto out;
+#endif
+	} else if (!s->v4.type) {
+		af = AF_INET;
+		p += sizeof(struct ip_vs_sync_v4);
+	} else {
+		return -10;	/* type bits other than STYPE_F_INET6 are set */
+	}
+	if (p > msg_end)
+		return -20;	/* entry shorter than its fixed part */
+
+	/* Process optional params check Type & Len. */
+	while (p < msg_end) {
+		int ptype;
+		int plen;
+
+		if (p+2 > msg_end)
+			return -30;	/* no room for the Type/Length pair */
+		ptype = *(p++);
+		plen  = *(p++);
+
+		if (!plen || ((p + plen) > msg_end))
+			return -40;	/* empty or overrunning param data */
+		/* Handle seq option  p = param data */
+		switch (ptype & ~IPVS_OPT_F_PARAM) {
+		case IPVS_OPT_SEQ_DATA:
+			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+				return -50;
+			break;
+
+		case IPVS_OPT_PE_DATA:
+			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+					   IP_VS_PEDATA_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_DATA))
+				return -60;
+			break;
+
+		case IPVS_OPT_PE_NAME:
+			if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+					   IP_VS_PENAME_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_NAME))
+				return -70;
+			break;
+
+		default:
+			/* Unknown type: skipped only if IPVS_OPT_F_PARAM is set */
+			if (!(ptype & IPVS_OPT_F_PARAM)) {
+				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+					  ptype & ~IPVS_OPT_F_PARAM);
+				retc = 20;
+				goto out;
+			}
+		}
+		p += plen;  /* Next option */
+	}
+
+	/* Get flags and Mask off unsupported */
+	flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+	flags |= IP_VS_CONN_F_SYNC;
+	state = ntohs(s->v4.state);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+		pp = ip_vs_proto_get(s->v4.protocol);
+		if (!pp) {
+			IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+				s->v4.protocol);
+			retc = 30;
+			goto out;
+		}
+		if (state >= pp->num_states) {
+			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+				pp->name, state);
+			retc = 40;
+			goto out;
+		}
+	} else {
+		/* protocol in templates is not used for state/timeout */
+		if (state > 0) {
+			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+				state);
+			state = 0;
+		}
+	}
+	if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+				       pe_data_len, pe_name, pe_name_len)) {
+		retc = 50;	/* any non-zero result, -ENOMEM included */
+		goto out;
+	}
+	/* If only IPv4, just silent skip IPv6 */
+	if (af == AF_INET)
+		ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+				);
+#ifdef CONFIG_IP_VS_IPV6
+	else
+		ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+				);
+#endif
+	return 0;
+	/* Error exit: positive codes, logged as a single dropped msg */
+out:
+	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+	return retc;
+
+}
+/*
+ *      Process received multicast message and create the corresponding
+ *      ip_vs_conn entries.
+ *      Handles Version 0 & 1
+ *
+ *      net:    network namespace the message belongs to
+ *      buffer: start of the received message; the size field of its header
+ *              is converted to host byte order in place
+ *      buflen: number of bytes received
+ *
+ *      A message that fails a sanity check is dropped.  For version 1
+ *      messages processing stops at the first bad entry; the entries in
+ *      front of it have already been passed to ip_vs_proc_sync_conn().
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+				  const size_t buflen)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+	__u8 *p, *msg_end;
+	int i, nr_conns;
+
+	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+		IP_VS_DBG(2, "BACKUP, message header too short\n");
+		return;
+	}
+	/* Convert size back to host byte order */
+	m2->size = ntohs(m2->size);
+
+	if (buflen != m2->size) {
+		IP_VS_DBG(2, "BACKUP, bogus message size\n");
+		return;
+	}
+	/* SyncID sanity check */
+	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+		return;
+	}
+	/* Anything without a clean version 1 header is an old type of message */
+	if (m2->version != SYNC_PROTO_VER || m2->reserved != 0 ||
+	    m2->spare != 0) {
+		ip_vs_process_message_v0(net, buffer, buflen);
+		return;
+	}
+
+	/* Handle version 1 message: nr_conns entries follow the header */
+	msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+	nr_conns = m2->nr_conns;
+
+	for (i = 0; i < nr_conns; i++) {
+		union ip_vs_sync_conn *s;
+		unsigned int size;
+		int retc;
+
+		p = msg_end;
+		/* the fixed v4 part must fit before ver_size may be read */
+		if (p + sizeof(s->v4) > buffer + buflen) {
+			IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
+			return;
+		}
+		s = (union ip_vs_sync_conn *)p;
+		size = ntohs(s->v4.ver_size) & SVER_MASK;
+		msg_end = p + size;
+		/* Basic sanity checks */
+		if (msg_end > buffer + buflen) {
+			IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
+			return;
+		}
+		if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+			IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+				      ntohs(s->v4.ver_size) >> SVER_SHIFT);
+			return;
+		}
+		/* Process a single sync_conn; only negative codes are fatal */
+		retc = ip_vs_proc_sync_conn(net, p, msg_end);
+		if (retc < 0) {
+			IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+				     retc);
+			return;
+		}
+		/* Make sure we have 32 bit alignment */
+		msg_end = p + ((size + 3) & ~3);
+	}
+}
+
+
+/*
+ *      Switch loopback of outgoing multicasts on a sending socket on
+ *      (loop != 0) or off (loop == 0)
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+	lock_sock(sk);
+	inet_sk(sk)->mc_loop = !!loop;
+	release_sock(sk);
+}
+
+/*
+ *      Set the TTL used for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+	lock_sock(sk);
+	inet_sk(sk)->mc_ttl = ttl;
+	release_sock(sk);
+}
+
+/*
+ *      Specify the default interface for outgoing multicasts.
+ *      Returns 0 on success, -ENODEV if ifname is unknown in the socket's
+ *      namespace, or -EINVAL if the socket is bound to another device.
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+	struct net_device *dev = __dev_get_by_name(sock_net(sk), ifname);
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* a device-bound socket may only send through that device */
+	if (sk->sk_bound_dev_if != 0 && sk->sk_bound_dev_if != dev->ifindex)
+		return -EINVAL;
+
+	lock_sock(sk);
+	inet_sk(sk)->mc_index = dev->ifindex;
+	/*  inet->mc_addr  = 0; */
+	release_sock(sk);
+
+	return 0;
+}
+
+
+/*
+ *	Derive the maximum sync message length from the MTU of the
+ *	multicast interface configured for sync_state.
+ *	Returns 0, or -ENODEV if that interface cannot be found.
+ *	Any other sync_state value is accepted and changes nothing.
+ */
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct net_device *dev;
+	int num;
+
+	switch (sync_state) {
+	case IP_VS_STATE_MASTER:
+		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+		if (dev == NULL)
+			return -ENODEV;
+
+		/* number of simple conn entries that fit into one packet */
+		num = (dev->mtu - sizeof(struct iphdr) -
+		       sizeof(struct udphdr) -
+		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+		IP_VS_DBG(7, "setting the maximum length of sync sending "
+			  "message %d.\n", ipvs->send_mesg_maxlen);
+		break;
+
+	case IP_VS_STATE_BACKUP:
+		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+		if (dev == NULL)
+			return -ENODEV;
+
+		/* everything behind the IP and UDP headers is payload */
+		ipvs->recv_mesg_maxlen = dev->mtu -
+			sizeof(struct iphdr) - sizeof(struct udphdr);
+		IP_VS_DBG(7, "setting the maximum length of sync receiving "
+			  "message %d.\n", ipvs->recv_mesg_maxlen);
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+
+/*
+ *      Join a multicast group on the interface named ifname.
+ *      The group is given as a class D multicast address (224.0.0.0/4)
+ *      in the in_addr structure passed in as a parameter.
+ *      Returns -ENODEV for an unknown interface, -EINVAL if the socket is
+ *      bound to another device, otherwise the result of ip_mc_join_group().
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+	struct ip_mreqn mreq;
+	struct net_device *dev;
+	int err;
+
+	dev = __dev_get_by_name(sock_net(sk), ifname);
+	if (dev == NULL)
+		return -ENODEV;
+	if (sk->sk_bound_dev_if != 0 && sk->sk_bound_dev_if != dev->ifindex)
+		return -EINVAL;
+
+	memset(&mreq, 0, sizeof(mreq));
+	mreq.imr_multiaddr = *addr;
+	mreq.imr_ifindex = dev->ifindex;
+
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+
+	return err;
+}
+
+
+/*
+ *      Bind sock to the address that inet_select_addr() picks for the
+ *      interface named ifname (port 0).
+ *      Returns -ENODEV for an unknown interface, otherwise the result of
+ *      the bind operation.  A missing address is only logged.
+ */
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+	struct sockaddr_in sin;
+	struct net_device *dev;
+	__be32 addr;
+
+	dev = __dev_get_by_name(sock_net(sock->sk), ifname);
+	if (dev == NULL)
+		return -ENODEV;
+
+	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	if (!addr)
+		pr_err("You probably need to specify IP address on "
+		       "multicast interface.\n");
+
+	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
+		  ifname, &addr);
+
+	/* Now bind the socket with the address of multicast interface */
+	sin.sin_family = AF_INET;
+	sin.sin_port = 0;
+	sin.sin_addr.s_addr = addr;
+
+	return sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+}
+
+/*
+ *      Set up the UDP socket used for sending multicast sync messages.
+ *      Returns the socket, or an ERR_PTR() carrying the failing step's code.
+ */
+static struct socket *make_send_sock(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct socket *sock;
+	struct sock *sk;
+	int result;
+
+	/* First create a socket, then move it to the right name space */
+	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (result < 0) {
+		pr_err("Error during creation of socket; terminating\n");
+		return ERR_PTR(result);
+	}
+	sk = sock->sk;
+
+	/*
+	 * Kernel sockets that are a part of a namespace, should not
+	 * hold a reference to a namespace in order to allow to stop it.
+	 * After sk_change_net should be released using sk_release_kernel.
+	 */
+	sk_change_net(sk, net);
+
+	result = set_mcast_if(sk, ipvs->master_mcast_ifn);
+	if (result < 0) {
+		pr_err("Error setting outbound mcast interface\n");
+		goto error;
+	}
+
+	/* no loopback of our own messages, TTL of 1 */
+	set_mcast_loop(sk, 0);
+	set_mcast_ttl(sk, 1);
+
+	result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+	if (result < 0) {
+		pr_err("Error binding address of the mcast interface\n");
+		goto error;
+	}
+
+	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
+				    sizeof(struct sockaddr), 0);
+	if (result < 0) {
+		pr_err("Error connecting to the multicast addr\n");
+		goto error;
+	}
+
+	return sock;
+
+error:
+	sk_release_kernel(sk);
+	return ERR_PTR(result);
+}
+
+
+/*
+ *      Set up the UDP socket used for receiving multicast sync messages.
+ *      Returns the socket, or an ERR_PTR() carrying the failing step's code.
+ */
+static struct socket *make_receive_sock(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct socket *sock;
+	struct sock *sk;
+	int result;
+
+	/* First create a socket */
+	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (result < 0) {
+		pr_err("Error during creation of socket; terminating\n");
+		return ERR_PTR(result);
+	}
+	sk = sock->sk;
+
+	/*
+	 * Kernel sockets that are a part of a namespace, should not
+	 * hold a reference to a namespace in order to allow to stop it.
+	 * After sk_change_net should be released using sk_release_kernel.
+	 */
+	sk_change_net(sk, net);
+	/* it is equivalent to the REUSEADDR option in user-space */
+	sk->sk_reuse = 1;
+
+	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
+				 sizeof(struct sockaddr));
+	if (result < 0) {
+		pr_err("Error binding to the multicast addr\n");
+		goto error;
+	}
+
+	/* join the multicast group */
+	result = join_mcast_group(sk,
+				  (struct in_addr *) &mcast_addr.sin_addr,
+				  ipvs->backup_mcast_ifn);
+	if (result < 0) {
+		pr_err("Error joining to the multicast group\n");
+		goto error;
+	}
+
+	return sock;
+
+error:
+	sk_release_kernel(sk);
+	return ERR_PTR(result);
+}
+
+
+/*
+ *      Send length bytes from buffer over sock without blocking
+ *      (MSG_DONTWAIT|MSG_NOSIGNAL).  Returns what kernel_sendmsg() returns.
+ */
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+	struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+	struct kvec iov = {
+		.iov_base = (void *)buffer,
+		.iov_len  = length,
+	};
+	int sent;
+
+	EnterFunction(7);
+	sent = kernel_sendmsg(sock, &msg, &iov, 1, length);
+	LeaveFunction(7);
+
+	return sent;
+}
+
+/*
+ *      Send one sync message.  msg->size is converted to network byte
+ *      order in place; a short or failed send is only logged.
+ */
+static void
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+	/* remember the host-order length before it is converted below */
+	const int msize = msg->size;
+
+	/* Put size in network byte order */
+	msg->size = htons(msg->size);
+
+	if (ip_vs_send_async(sock, (char *)msg, msize) == msize)
+		return;
+
+	pr_err("ip_vs_send_async error\n");
+}
+
+/*
+ *      Receive one message from sock into buffer (at most buflen bytes).
+ *      Returns the number of bytes received, or -1 if kernel_recvmsg()
+ *      reported an error.
+ */
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+	struct msghdr		msg = {NULL,};
+	struct kvec		iov;
+	int			len;
+
+	EnterFunction(7);
+
+	/* Receive a packet */
+	iov.iov_base     = buffer;
+	iov.iov_len      = (size_t)buflen;
+
+	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+	if (len < 0)
+		len = -1;	/* the error code is collapsed to -1, as before */
+
+	/* single exit, so that EnterFunction/LeaveFunction stay paired */
+	LeaveFunction(7);
+	return len;
+}
+
+
+/*
+ *      Body of the master sync thread: sends the queued sync buffers and
+ *      the current one, then sleeps for up to HZ jiffies, until asked to
+ *      stop.  On exit it drops the remaining buffers and frees the socket
+ *      and the thread data.
+ */
+static int sync_thread_master(void *data)
+{
+	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+	struct ip_vs_sync_buff *sb;
+
+	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
+		"syncid = %d\n",
+		ipvs->master_mcast_ifn, ipvs->master_syncid);
+
+	for (;;) {
+		if (kthread_should_stop())
+			break;
+
+		/* send everything that is waiting in the queue */
+		for (sb = sb_dequeue(ipvs); sb != NULL;
+		     sb = sb_dequeue(ipvs)) {
+			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		/* check if entries stay in ipvs->sync_buff for 2 seconds */
+		sb = get_curr_sync_buff(ipvs, 2 * HZ);
+		if (sb != NULL) {
+			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		schedule_timeout_interruptible(HZ);
+	}
+
+	/* clean up the sync_buff queue */
+	for (sb = sb_dequeue(ipvs); sb != NULL; sb = sb_dequeue(ipvs))
+		ip_vs_sync_buff_release(sb);
+
+	/* clean up the current sync_buff */
+	sb = get_curr_sync_buff(ipvs, 0);
+	if (sb != NULL)
+		ip_vs_sync_buff_release(sb);
+
+	/* release the sending multicast socket */
+	sk_release_kernel(tinfo->sock->sk);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+/*
+ *      Body of the backup sync thread: waits for messages on the receive
+ *      socket and feeds each one to ip_vs_process_message() until asked to
+ *      stop.  On exit it frees the socket, the buffer and the thread data.
+ */
+static int sync_thread_backup(void *data)
+{
+	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+	struct sock *sk = tinfo->sock->sk;
+	int len;
+
+	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
+		"syncid = %d\n",
+		ipvs->backup_mcast_ifn, ipvs->backup_syncid);
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(*sk_sleep(sk),
+			 !skb_queue_empty(&sk->sk_receive_queue)
+			 || kthread_should_stop());
+
+		/* do we have data now? */
+		for (;;) {
+			if (skb_queue_empty(&sk->sk_receive_queue))
+				break;
+
+			len = ip_vs_receive(tinfo->sock, tinfo->buf,
+					    ipvs->recv_mesg_maxlen);
+			if (len <= 0) {
+				pr_err("receiving message error\n");
+				break;
+			}
+
+			/* disable bottom half, because it accesses the data
+			   shared by softirq while getting/creating conns */
+			local_bh_disable();
+			ip_vs_process_message(tinfo->net, tinfo->buf, len);
+			local_bh_enable();
+		}
+	}
+
+	/* release the receiving multicast socket */
+	sk_release_kernel(sk);
+	kfree(tinfo->buf);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+/*
+ *      Start the sync thread for the given state.
+ *
+ *      net:       network namespace the thread works for
+ *      state:     IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
+ *      mcast_ifn: name of the multicast interface (copied into ipvs)
+ *      syncid:    sync ID stored for the chosen state
+ *
+ *      Returns 0 on success, -EEXIST if a thread for this state already
+ *      exists, -EINVAL for an unknown state, -ENOMEM if an allocation
+ *      fails, or the error reported by the socket setup,
+ *      set_sync_mesg_maxlen() or kthread_run().
+ *      On success the socket, the receive buffer and the thread data are
+ *      handed over to the thread function, which frees them when it stops.
+ */
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+{
+	struct ip_vs_sync_thread_data *tinfo;
+	struct task_struct **realtask, *task;
+	struct socket *sock;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	char *name, *buf = NULL;
+	int (*threadfn)(void *data);
+	int result;
+
+	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+		  sizeof(struct ip_vs_sync_conn_v0));
+
+	if (state == IP_VS_STATE_MASTER) {
+		if (ipvs->master_thread)
+			return -EEXIST;
+
+		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->master_mcast_ifn));
+		ipvs->master_syncid = syncid;
+		realtask = &ipvs->master_thread;
+		name = "ipvs_master:%d";
+		threadfn = sync_thread_master;
+		sock = make_send_sock(net);
+	} else if (state == IP_VS_STATE_BACKUP) {
+		if (ipvs->backup_thread)
+			return -EEXIST;
+
+		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->backup_mcast_ifn));
+		ipvs->backup_syncid = syncid;
+		realtask = &ipvs->backup_thread;
+		name = "ipvs_backup:%d";
+		threadfn = sync_thread_backup;
+		sock = make_receive_sock(net);
+	} else {
+		return -EINVAL;
+	}
+
+	if (IS_ERR(sock)) {
+		result = PTR_ERR(sock);
+		goto out;
+	}
+
+	/*
+	 * Do not start a thread with an unset message length: the interface
+	 * lookup in set_sync_mesg_maxlen() can fail with -ENODEV.
+	 */
+	result = set_sync_mesg_maxlen(net, state);
+	if (result < 0)
+		goto outsocket;
+
+	/* the only failures left before kthread_run() are allocations */
+	result = -ENOMEM;
+	if (state == IP_VS_STATE_BACKUP) {
+		buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
+		if (!buf)
+			goto outsocket;
+	}
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		goto outbuf;
+
+	tinfo->net = net;
+	tinfo->sock = sock;
+	tinfo->buf = buf;
+
+	task = kthread_run(threadfn, tinfo, name, ipvs->gen);
+	if (IS_ERR(task)) {
+		result = PTR_ERR(task);
+		goto outtinfo;
+	}
+
+	/* mark as active */
+	*realtask = task;
+	ipvs->sync_state |= state;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	return 0;
+
+outtinfo:
+	kfree(tinfo);
+outbuf:
+	kfree(buf);
+outsocket:
+	sk_release_kernel(sock->sk);
+out:
+	return result;
+}
+
+
+/*
+ *      Stop the sync thread of the given state.
+ *
+ *      net:   network namespace the thread works for
+ *      state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
+ *
+ *      Returns the result of kthread_stop(), -ESRCH if no thread of that
+ *      state exists, or -EINVAL for an unknown state.  The module use
+ *      count is dropped only when a thread was really stopped.
+ */
+int stop_sync_thread(struct net *net, int state)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	int retc;
+
+	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+
+	if (state == IP_VS_STATE_MASTER) {
+		if (!ipvs->master_thread)
+			return -ESRCH;
+
+		pr_info("stopping master sync thread %d ...\n",
+			task_pid_nr(ipvs->master_thread));
+
+		/*
+		 * The lock synchronizes with sb_queue_tail(), so that we don't
+		 * add sync buffers to the queue, when we are already in
+		 * progress of stopping the master sync daemon.
+		 */
+
+		spin_lock_bh(&ipvs->sync_lock);
+		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+		spin_unlock_bh(&ipvs->sync_lock);
+		retc = kthread_stop(ipvs->master_thread);
+		ipvs->master_thread = NULL;
+	} else if (state == IP_VS_STATE_BACKUP) {
+		if (!ipvs->backup_thread)
+			return -ESRCH;
+
+		pr_info("stopping backup sync thread %d ...\n",
+			task_pid_nr(ipvs->backup_thread));
+
+		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+		retc = kthread_stop(ipvs->backup_thread);
+		ipvs->backup_thread = NULL;
+	} else {
+		/*
+		 * Nothing was stopped, so the use count taken by
+		 * start_sync_thread() must not be dropped here.
+		 */
+		return -EINVAL;
+	}
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return retc;
+}
+
+/*
+ * Per-netns setup of the sync state: the buffer queue, both locks and the
+ * multicast address (IP_VS_SYNC_GROUP, port IP_VS_SYNC_PORT).  Returns 0.
+ */
+int __net_init __ip_vs_sync_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	spin_lock_init(&ipvs->sync_lock);
+	spin_lock_init(&ipvs->sync_buff_lock);
+	INIT_LIST_HEAD(&ipvs->sync_queue);
+
+	ipvs->sync_mcast_addr.sin_family = AF_INET;
+	ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
+	ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+
+	return 0;
+}
+
+/*
+ * Per-netns teardown: stop the master and the backup sync thread.
+ * -ESRCH (no such thread) is not treated as a failure.
+ */
+void __ip_vs_sync_cleanup(struct net *net)
+{
+	int err;
+
+	err = stop_sync_thread(net, IP_VS_STATE_MASTER);
+	if (err != 0 && err != -ESRCH)
+		pr_err("Failed to stop Master Daemon\n");
+
+	err = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+	if (err != 0 && err != -ESRCH)
+		pr_err("Failed to stop Backup Daemon\n");
+}
+
+int __init ip_vs_sync_init(void)
+{
+	return 0;	/* no-op: nothing is initialised here */
+}
+
+void ip_vs_sync_cleanup(void)
+{	/* no-op: empty body */
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
new file mode 100644
index 00000000..bc1bfc48
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -0,0 +1,113 @@
+/*
+ * IPVS:        Weighted Least-Connection Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
+ *     Wensong Zhang            :     changed to use the inactconns in scheduling
+ *     Wensong Zhang            :     changed some cosmetics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *	Weighted Least Connection scheduling
+ *
+ *	Returns the destination of svc with the lowest overhead/weight ratio
+ *	among those that are not overloaded, or NULL (after reporting through
+ *	ip_vs_scheduler_err()) if no destination with a positive weight is
+ *	available.  On a tie the destination found first is kept.
+ *	The skb argument is not used.
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *		  (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	/* the first usable destination is the initial candidate */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_dest_conn_overhead(least);
+			goto nextstage;
+		}
+	}
+	ip_vs_scheduler_err(svc, "no destination available");
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_dest_conn_overhead(dest);
+		/*
+		 * Multiply in 64 bits: a 32 bit product of overhead and
+		 * weight can wrap around and select the wrong destination.
+		 */
+		if ((u64)loh * atomic_read(&dest->weight) >
+		    (u64)doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+/* Scheduler descriptor for "wlc"; the only callback it sets is .schedule */
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+	.name =			"wlc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+	.schedule =		ip_vs_wlc_schedule,
+};
+
+/*
+ * Module entry points: ip_vs_wlc_init() registers ip_vs_wlc_scheduler and
+ * returns the result of the registration, ip_vs_wlc_cleanup() unregisters it.
+ */
+static int __init ip_vs_wlc_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 00000000..1ef41f50
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,232 @@
+/*
+ * IPVS:        Weighted Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
+ *     Wensong Zhang            :     changed some cosmetics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
+ *     Julian Anastasov         :     fixed the bug of returning destination
+ *                                    with weight 0 when all weights are zero
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/gcd.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ *
+ * One instance is allocated per service in ip_vs_wrr_init_svc() and kept
+ * in svc->sched_data; ip_vs_wrr_schedule() advances cl and cw, while
+ * ip_vs_wrr_update_svc() recomputes mw and di.
+ */
+struct ip_vs_wrr_mark {
+	struct list_head *cl;	/* current list head */
+	int cw;			/* current weight */
+	int mw;			/* maximum weight */
+	int di;			/* decreasing interval (gcd of the positive weights) */
+};
+
+
+/*
+ *    Get the greatest common divisor of all positive destination weights
+ *    of the service; 1 if there is no positive weight.
+ */
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int divisor = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int w = atomic_read(&dest->weight);
+
+		/* quiesced destinations do not take part */
+		if (w <= 0)
+			continue;
+
+		if (divisor > 0)
+			divisor = gcd(w, divisor);
+		else
+			divisor = w;
+	}
+
+	if (divisor == 0)
+		return 1;
+	return divisor;
+}
+
+
+/*
+ *    Return the largest destination weight of the service,
+ *    or 0 if no destination has a positive weight.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int heaviest = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		int w = atomic_read(&dest->weight);
+
+		if (heaviest < w)
+			heaviest = w;
+	}
+
+	return heaviest;
+}
+
+
+/*
+ *    Attach a freshly initialised WRR mark to the service.
+ *    Returns 0, or -ENOMEM if the mark cannot be allocated.
+ */
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+	/* Allocate the mark variable for WRR scheduling */
+	struct ip_vs_wrr_mark *mark = kmalloc(sizeof(*mark), GFP_ATOMIC);
+
+	if (!mark) {
+		pr_err("%s(): no memory\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* start at the list head with no current weight selected yet */
+	mark->cl = &svc->destinations;
+	mark->cw = 0;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+
+	svc->sched_data = mark;
+	return 0;
+}
+
+
+/*
+ *    Release the WRR mark attached to the service.  Always returns 0.
+ */
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+	kfree(mark);
+	return 0;
+}
+
+
+/*
+ *    Refresh the WRR mark of the service: restart at the list head and
+ *    recompute the maximum weight and the gcd.  Always returns 0.
+ */
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+	const int max_weight = ip_vs_wrr_max_weight(svc);
+	const int step = ip_vs_wrr_gcd_weight(svc);
+
+	mark->cl = &svc->destinations;
+	mark->mw = max_weight;
+	mark->di = step;
+
+	/* a current weight above the new maximum is no longer valid */
+	if (mark->cw > max_weight)
+		mark->cw = 0;
+
+	return 0;
+}
+
+
+/*
+ *    Weighted Round-Robin Scheduling
+ *
+ *    Walks the destination list of svc circularly, continuing behind the
+ *    position saved in mark->cl, and returns the first destination that is
+ *    not flagged IP_VS_DEST_F_OVERLOAD and whose weight is at least the
+ *    current threshold mark->cw.  Each time the walk starts over from the
+ *    list head, mark->cw is lowered by mark->di and set back to mark->mw
+ *    once it drops to zero or below.
+ *
+ *    Returns NULL (after ip_vs_scheduler_err()) if the list is empty, if
+ *    mark->mw is zero, or if the walk is back at its starting position with
+ *    mark->cw == mark->di.  The skb argument is not used.
+ *    The whole walk runs under write_lock(&svc->sched_lock).
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+	struct list_head *p;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * This loop will always terminate, because mark->cw in (0, max_weight]
+	 * and at least one server has its weight equal to max_weight.
+	 *
+	 * NOTE(review): this relies on mark->mw and mark->di matching the
+	 * current weights; in this file they are only recomputed in
+	 * ip_vs_wrr_init_svc() and ip_vs_wrr_update_svc() -- confirm that
+	 * every weight change goes through update_service.
+	 */
+	write_lock(&svc->sched_lock);
+	p = mark->cl;	/* starting position, used to detect a full pass */
+	while (1) {
+		if (mark->cl == &svc->destinations) {
+			/* it is at the head of the destination list */
+
+			if (mark->cl == mark->cl->next) {
+				/* no dest entry */
+				ip_vs_scheduler_err(svc,
+					"no destination available: "
+					"no destinations present");
+				dest = NULL;
+				goto out;
+			}
+
+			mark->cl = svc->destinations.next;
+			mark->cw -= mark->di;	/* new pass: lower the threshold */
+			if (mark->cw <= 0) {
+				mark->cw = mark->mw;
+				/*
+				 * Still zero, which means no available servers.
+				 */
+				if (mark->cw == 0) {
+					mark->cl = &svc->destinations;
+					ip_vs_scheduler_err(svc,
+						"no destination available");
+					dest = NULL;
+					goto out;
+				}
+			}
+		} else
+			mark->cl = mark->cl->next;
+
+		if (mark->cl != &svc->destinations) {
+			/* not at the head of the list */
+			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    atomic_read(&dest->weight) >= mark->cw) {
+				/* got it */
+				break;
+			}
+		}
+
+		if (mark->cl == p && mark->cw == mark->di) {
+			/* back to the start, and no dest is found.
+			   It is only possible when all dests are OVERLOADED */
+			dest = NULL;
+			ip_vs_scheduler_err(svc,
+				"no destination available: "
+				"all destinations are overloaded");
+			goto out;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+		      "activeconns %d refcnt %d weight %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
+		      atomic_read(&dest->activeconns),
+		      atomic_read(&dest->refcnt),
+		      atomic_read(&dest->weight));
+
+  out:
+	write_unlock(&svc->sched_lock);
+	return dest;
+}
+
+/* Scheduler descriptor for "wrr": per-service init/done/update plus schedule */
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+	.name =			"wrr",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+	.init_service =		ip_vs_wrr_init_svc,
+	.done_service =		ip_vs_wrr_done_svc,
+	.update_service =	ip_vs_wrr_update_svc,
+	.schedule =		ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{	/* module init: register the scheduler, pass its result on */
+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{	/* module exit: unregister the scheduler again */
+	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
new file mode 100644
index 00000000..ee319a43
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -0,0 +1,1370 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ *   LOCAL_OUT (local clients); the ICMP transmitters can also be called
+ *   from FORWARD
+ * - not all connections have a destination server, for example,
+ *   connections on a backup server when fwmark is used
+ * - bypass connections use the daddr from the packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>                  /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+enum {	/* rt_mode bits taken by __ip_vs_get_out_rt() and its _v6 twin */
+	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
+	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
+	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
+				      * local; without this bit a local
+				      * route is refused unless the skb's
+				      * original route is local as well
+				      */
+};
+
+/*
+ *      Destination cache to speed up outgoing route lookup
+ *
+ *      __ip_vs_dst_set() installs @dst, together with the @rtos and
+ *      @dst_cookie it was looked up with, as the cached route of @dest
+ *      and releases the entry that was cached before.  The callers in
+ *      this file invoke it with dest->dst_lock held.
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
+{
+	struct dst_entry *prev = dest->dst_cache;
+
+	dest->dst_cookie = dst_cookie;
+	dest->dst_rtos = rtos;
+	dest->dst_cache = dst;
+
+	/* the cache no longer points at the previous entry */
+	dst_release(prev);
+}
+
+/*
+ * Return the route cached in @dest with an extra reference taken, or
+ * NULL if nothing is cached or the cached entry failed revalidation.
+ *
+ * dst->ops->check() is only consulted when the entry is marked obsolete
+ * or was cached for a different @rtos; if it then returns NULL the
+ * entry is removed from the cache and released.
+ */
+static inline struct dst_entry *
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+{
+	struct dst_entry *cached = dest->dst_cache;
+	int suspect;
+
+	if (cached == NULL)
+		return NULL;
+
+	suspect = cached->obsolete || rtos != dest->dst_rtos;
+	if (suspect && !cached->ops->check(cached, dest->dst_cookie)) {
+		dest->dst_cache = NULL;
+		dst_release(cached);
+		return NULL;
+	}
+
+	dst_hold(cached);
+	return cached;
+}
+
+/*
+ * Get route to destination or remote server.
+ *
+ * With @dest set, the route cached in @dest is reused if
+ * __ip_vs_dst_check() accepts it; otherwise a fresh route to
+ * dest->addr.ip is looked up and cached.  Both happen under
+ * dest->dst_lock.  Without @dest, a route to @daddr is looked up and
+ * nothing is cached.
+ *
+ * @rt_mode is a mask of IP_VS_RT_MODE_* bits naming the acceptable
+ * kinds of route.  If @ret_saddr is not NULL it receives the source
+ * address that goes with the route.
+ *
+ * Returns the route with a reference held for the caller, or NULL when
+ * the lookup fails or the route is rejected (the reference is dropped
+ * here in that case).
+ */
+static struct rtable *
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+		   __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct rtable *rt;			/* Route to the other host */
+	struct rtable *ort;			/* Original route */
+	int local;				/* rt has RTCF_LOCAL set */
+
+	if (dest) {
+		spin_lock(&dest->dst_lock);
+		if (!(rt = (struct rtable *)
+		      __ip_vs_dst_check(dest, rtos))) {
+			struct flowi4 fl4;
+
+			memset(&fl4, 0, sizeof(fl4));
+			fl4.daddr = dest->addr.ip;
+			fl4.flowi4_tos = rtos;
+			rt = ip_route_output_key(net, &fl4);
+			if (IS_ERR(rt)) {
+				spin_unlock(&dest->dst_lock);
+				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
+					     &dest->addr.ip);
+				return NULL;
+			}
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
+			dest->dst_saddr.ip = fl4.saddr;
+			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
+				  "rtos=%X\n",
+				  &dest->addr.ip, &dest->dst_saddr.ip,
+				  atomic_read(&rt->dst.__refcnt), rtos);
+		}
+		daddr = dest->addr.ip;	/* only used by the messages below */
+		if (ret_saddr)
+			*ret_saddr = dest->dst_saddr.ip;
+		spin_unlock(&dest->dst_lock);
+	} else {
+		struct flowi4 fl4;
+
+		memset(&fl4, 0, sizeof(fl4));
+		fl4.daddr = daddr;
+		fl4.flowi4_tos = rtos;
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt)) {
+			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
+				     &daddr);
+			return NULL;
+		}
+		if (ret_saddr)
+			*ret_saddr = fl4.saddr;
+	}
+
+	local = rt->rt_flags & RTCF_LOCAL;
+	if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+	      rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+			     (rt->rt_flags & RTCF_LOCAL) ?
+			     "local":"non-local", &daddr);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
+	    !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+			     "requires NAT method, dest: %pI4\n",
+			     &ip_hdr(skb)->daddr, &daddr);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+			     "to non-local address, dest: %pI4\n",
+			     &ip_hdr(skb)->saddr, &daddr);
+		ip_rt_put(rt);
+		return NULL;
+	}
+
+	return rt;
+}
+
+/*
+ * Reroute packet to local IPv4 stack after DNAT.
+ *
+ * If the skb currently carries an input route, ip_route_input() is run
+ * again with the addresses now found in the IP header, and the
+ * previously attached dst reference is dropped once that succeeds.
+ * Otherwise an output route is looked up; it replaces the skb's route
+ * only if it is flagged RTCF_LOCAL.
+ *
+ * Returns 1 on success, 0 if no suitable route could be attached (the
+ * skb keeps its old route in that case).
+ */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = rt->dst.dev;
+	struct net *net = dev_net(dev);
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (rt_is_input_route(rt)) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
+			return 0;
+		refdst_drop(orefdst);
+	} else {
+		struct flowi4 fl4 = {
+			.daddr = iph->daddr,
+			.saddr = iph->saddr,
+			.flowi4_tos = RT_TOS(iph->tos),
+			.flowi4_mark = skb->mark,
+		};
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt))
+			return 0;
+		if (!(rt->rt_flags & RTCF_LOCAL)) {
+			ip_rt_put(rt);
+			return 0;
+		}
+		/* Drop old route and attach the new, local one. */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	}
+	return 1;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * Return 1 if @rt has a device and that device carries IFF_LOOPBACK,
+ * otherwise 0.  This is what the IPv6 code in this file uses as its
+ * notion of a "local" route.
+ */
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+	struct net_device *dev = rt->rt6i_dev;
+
+	return dev != NULL && (dev->flags & IFF_LOOPBACK) != 0;
+}
+
+/*
+ * Look up an IPv6 output route to @daddr in @net.
+ *
+ * If @ret_saddr is NULL the route is returned as soon as the lookup
+ * succeeds; note that @do_xfrm is not acted upon in that case.
+ * Otherwise a source address is chosen with ipv6_dev_get_saddr() when
+ * the lookup left fl6.saddr unspecified, the route is passed through
+ * xfrm_lookup() if @do_xfrm is set, and the source address is copied
+ * to @ret_saddr.
+ *
+ * Returns the dst entry, or NULL after logging a rate-limited debug
+ * message.
+ */
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct flowi6 fl6 = {
+		.daddr = *daddr,
+	};
+	struct dst_entry *route;
+
+	route = ip6_route_output(net, NULL, &fl6);
+	if (route->error)
+		goto fail;
+
+	if (ret_saddr == NULL)
+		return route;
+
+	if (ipv6_addr_any(&fl6.saddr)) {
+		if (ipv6_dev_get_saddr(net, ip6_dst_idev(route)->dev,
+				       &fl6.daddr, 0, &fl6.saddr) < 0)
+			goto fail;
+	}
+
+	if (do_xfrm) {
+		route = xfrm_lookup(net, route, flowi6_to_flowi(&fl6),
+				    NULL, 0);
+		if (IS_ERR(route)) {
+			/* nothing left for the error path to release */
+			route = NULL;
+			goto fail;
+		}
+	}
+
+	ipv6_addr_copy(ret_saddr, &fl6.saddr);
+	return route;
+
+fail:
+	dst_release(route);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ *
+ * IPv6 counterpart of __ip_vs_get_out_rt().  With @dest set, the route
+ * cached in @dest is used when __ip_vs_dst_check() accepts it;
+ * otherwise __ip_vs_route_output_v6() looks one up for dest->addr.in6
+ * and it is cached together with a cookie taken from
+ * rt->rt6i_node->fn_sernum (0 if the route has no node).  Both happen
+ * under dest->dst_lock.  Without @dest, a route to @daddr is looked up
+ * and nothing is cached.
+ *
+ * @ret_saddr, if not NULL, receives the source address for the route.
+ * @do_xfrm is passed on to __ip_vs_route_output_v6().  @rt_mode is a
+ * mask of IP_VS_RT_MODE_* bits naming the acceptable kinds of route.
+ *
+ * Returns the route with a reference held for the caller, or NULL if
+ * the lookup failed or the route was rejected.
+ */
+static struct rt6_info *
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
+		      int do_xfrm, int rt_mode)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct rt6_info *rt;			/* Route to the other host */
+	struct rt6_info *ort;			/* Original route */
+	struct dst_entry *dst;
+	int local;				/* rt goes via a loopback dev */
+
+	if (dest) {
+		spin_lock(&dest->dst_lock);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
+		if (!rt) {
+			u32 cookie;
+
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr.in6,
+						      do_xfrm);
+			if (!dst) {
+				spin_unlock(&dest->dst_lock);
+				return NULL;
+			}
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr.in6,
+				  atomic_read(&rt->dst.__refcnt));
+		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr.in6);
+		spin_unlock(&dest->dst_lock);
+	} else {
+		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+		if (!dst)
+			return NULL;
+		rt = (struct rt6_info *) dst;
+	}
+
+	local = __ip_vs_is_local_route6(rt);
+	if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
+	      rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+			     local ? "local":"non-local", daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
+	    !((ort = (struct rt6_info *) skb_dst(skb)) &&
+	      __ip_vs_is_local_route6(ort))) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+			     "requires NAT method, dest: %pI6\n",
+			     &ipv6_hdr(skb)->daddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+		     ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+				    IPV6_ADDR_LOOPBACK)) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+			     "to non-local address, dest: %pI6\n",
+			     &ipv6_hdr(skb)->saddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+
+	return rt;
+}
+#endif
+
+
+/*
+ *	Empty dest->dst_cache and release the route it held (if any).
+ *	Meant to be called before a dest is removed.
+ */
+void
+ip_vs_dst_reset(struct ip_vs_dest *dest)
+{
+	struct dst_entry *stale = dest->dst_cache;
+
+	dest->dst_cache = NULL;
+	dst_release(stale);
+}
+
+/*
+ * Last step shared by the tunnel transmitters.  Marks the skb as an
+ * IPVS packet, asks ip_vs_confirm_conntrack() for a verdict when the
+ * connection has IP_VS_CONN_F_NFCT set and, if the verdict is
+ * NF_ACCEPT, calls nf_reset() and skb_forward_csum() on the skb.
+ * Evaluates to the verdict; the actual transmit is left to the caller.
+ */
+#define IP_VS_XMIT_TUNNEL(skb, cp)				\
+({								\
+	int __verdict = NF_ACCEPT;				\
+								\
+	(skb)->ipvs_property = 1;				\
+	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\
+		__verdict = ip_vs_confirm_conntrack(skb, cp);	\
+	if (__verdict == NF_ACCEPT) {				\
+		nf_reset(skb);					\
+		skb_forward_csum(skb);				\
+	}							\
+	__verdict;						\
+})
+
+/*
+ * Last step of the NAT transmitters.  Marks the skb as an IPVS packet
+ * and either updates conntrack (connection has IP_VS_CONN_F_NFCT) or
+ * marks the skb as not tracked.
+ *
+ * NOTE: contains a hidden "return NF_ACCEPT" that leaves the calling
+ * function when @local is non-zero.  Otherwise the skb is passed to
+ * the NF_INET_LOCAL_OUT hook of family @pf with dst_output() as the
+ * continuation, and execution continues after the macro.
+ */
+#define IP_VS_XMIT_NAT(pf, skb, cp, local)		\
+do {							\
+	(skb)->ipvs_property = 1;			\
+	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))	\
+		ip_vs_update_conntrack(skb, cp, 1);	\
+	else						\
+		ip_vs_notrack(skb);			\
+	if (local) {					\
+		return NF_ACCEPT;			\
+	}						\
+	skb_forward_csum(skb);				\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
+		skb_dst(skb)->dev, dst_output);		\
+} while (0)
+
+/*
+ * Last step of the non-NAT transmitters.  Marks the skb as an IPVS
+ * packet and, unless the connection has IP_VS_CONN_F_NFCT set, marks
+ * it as not tracked.
+ *
+ * NOTE: contains a hidden "return NF_ACCEPT" that leaves the calling
+ * function when @local is non-zero.  Otherwise the skb is passed to
+ * the NF_INET_LOCAL_OUT hook of family @pf with dst_output() as the
+ * continuation, and execution continues after the macro.
+ */
+#define IP_VS_XMIT(pf, skb, cp, local)			\
+do {							\
+	(skb)->ipvs_property = 1;			\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) {	\
+		ip_vs_notrack(skb);			\
+	}						\
+	if (local) {					\
+		return NF_ACCEPT;			\
+	}						\
+	skb_forward_csum(skb);				\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
+		skb_dst(skb)->dev, dst_output);		\
+} while (0)
+
+
+/*
+ *      NULL transmitter (do nothing except return NF_ACCEPT)
+ *
+ *      IP_VS_XMIT() is invoked with local == 1, so the "return
+ *      NF_ACCEPT" embedded in that macro is what ends this function;
+ *      that is why no explicit return statement follows it.
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp)
+{
+	/* we do not touch skb and do not need pskb ptr */
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+}
+
+
+/*
+ *      Bypass transmitter
+ *      Let packets bypass the destination when the destination is not
+ *      available, it may be only used in transparent cache cluster.
+ *
+ *      The packet is routed to the daddr of its own IP header and only
+ *      a non-local route is accepted (IP_VS_RT_MODE_NON_LOCAL).  Every
+ *      path returns NF_STOLEN, i.e. the skb is never handed back to
+ *      the caller.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = ip_hdr(skb);
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
+				      IP_VS_RT_MODE_NON_LOCAL, NULL)))
+		goto tx_error_icmp;
+
+	/* MTU checking: only DF packets that are not GSO are refused */
+	mtu = dst_mtu(&rt->dst);
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+		ip_rt_put(rt);
+		return NF_STOLEN;
+	}
+	ip_send_check(ip_hdr(skb));
+
+	/* drop old route; the skb now owns our reference on rt */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+ tx_error_icmp:
+	dst_link_failure(skb);
+ tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *      Bypass transmitter, IPv6 flavour: forward the packet to the
+ *      daddr found in its own IPv6 header, over a non-local route only.
+ *      Every path returns NF_STOLEN, i.e. the skb is never handed back
+ *      to the caller.
+ */
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp)
+{
+	struct ipv6hdr  *ip6h = ipv6_hdr(skb);
+	struct rt6_info *rt;			/* Route to the other host */
+	int    mtu;
+
+	EnterFunction(10);
+
+	rt = __ip_vs_get_out_rt_v6(skb, NULL, &ip6h->daddr, NULL, 0,
+				   IP_VS_RT_MODE_NON_LOCAL);
+	if (rt == NULL)
+		goto tx_error_icmp;
+
+	/* Too big for the route and not GSO: report it and drop */
+	mtu = dst_mtu(&rt->dst);
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		/* icmpv6_send() is given a device even if skb has none */
+		if (skb->dev == NULL)
+			skb->dev = dev_net(skb_dst(skb)->dev)->loopback_dev;
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		dst_release(&rt->dst);
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error;
+	}
+
+	/* Make sure the skb is not shared before its route is replaced */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb)) {
+		dst_release(&rt->dst);
+		return NF_STOLEN;
+	}
+
+	/* swap the old route for the one just looked up */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+ tx_error_icmp:
+	dst_link_failure(skb);
+ tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+/*
+ *      NAT transmitter (only for outside-to-inside nat forwarding)
+ *      Not used for related ICMP
+ *
+ *      Looks up a route to the real server (local, non-local and
+ *      redirected routes are all acceptable), lets the protocol's
+ *      dnat_handler mangle the packet, rewrites the destination address
+ *      to cp->daddr.ip and sends the packet on.
+ *
+ *      Returns NF_ACCEPT (through the return embedded in
+ *      IP_VS_XMIT_NAT()) when the real server is local, NF_STOLEN in
+ *      every other case, including all error paths, where the skb is
+ *      freed here.
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	       struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;		/* Route to the other host */
+	int mtu;
+	struct iphdr *iph = ip_hdr(skb);
+	int local;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos),
+				      IP_VS_RT_MODE_LOCAL |
+					IP_VS_RT_MODE_NON_LOCAL |
+					IP_VS_RT_MODE_RDR, NULL)))
+		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+					 "ip_vs_nat_xmit(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(cp->daddr.ip) &&
+	    rt_is_input_route(skb_rtable(skb))) {
+		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+				 "ip_vs_nat_xmit(): frag needed for");
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error_put;
+	ip_hdr(skb)->daddr = cp->daddr.ip;
+	ip_send_check(ip_hdr(skb));
+
+	if (!local) {
+		/* drop old route; the skb takes over our reference on rt */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
+	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	/* returns NF_ACCEPT from here when local is set */
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+  tx_error_put:
+	/* error while we still hold the reference on rt */
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *      NAT transmitter, IPv6 flavour.
+ *
+ *      Looks up a route to the real server (local, non-local and
+ *      redirected routes are all acceptable), lets the protocol's
+ *      dnat_handler mangle the packet, rewrites the destination address
+ *      to cp->daddr.in6 and sends the packet on.
+ *
+ *      Returns NF_ACCEPT (through the return embedded in
+ *      IP_VS_XMIT_NAT()) when the route is local, NF_STOLEN in every
+ *      other case, including all error paths, where the skb is freed
+ *      here.
+ */
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;		/* Route to the other host */
+	int mtu;
+	int local;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
+				       sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, (IP_VS_RT_MODE_LOCAL |
+					     IP_VS_RT_MODE_NON_LOCAL |
+					     IP_VS_RT_MODE_RDR))))
+		goto tx_error_icmp;
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+					 "ip_vs_nat_xmit_v6(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): frag needed for");
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/*
+	 * mangle the packet; rt is still owned by us at this point, so a
+	 * failing handler must go through tx_error_put or the route
+	 * reference is leaked
+	 */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error_put;
+	ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
+
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
+	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	/* returns NF_ACCEPT from here when local is set */
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+	return NF_STOLEN;
+tx_error_put:
+	/* error while we still hold the reference on rt */
+	dst_release(&rt->dst);
+	goto tx_error;
+}
+#endif
+
+
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet, its
+ *   destination will be set to cp->daddr. Most code of this function
+ *   is taken from ipip.c.
+ *
+ *   It is used in VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, process
+ *   the request and return the response packets directly to the client
+ *   without passing the load balancer. This can greatly increase the
+ *   scalability of virtual server.
+ *
+ *   Used for ANY protocol
+ *
+ *   Returns NF_ACCEPT (through the return embedded in IP_VS_XMIT())
+ *   when the route to the real server is local, so that the packet is
+ *   not encapsulated at all.  Returns NF_STOLEN in every other case,
+ *   including all error paths.
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	__be32 saddr;				/* Source for tunnel */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *old_iph = ip_hdr(skb);
+	u8     tos = old_iph->tos;
+	/*
+	 * Copy only DF to the outer header.  The outer packet is a whole
+	 * datagram, so it must not inherit the fragment offset or the MF
+	 * bit of the inner packet.
+	 */
+	__be16 df = old_iph->frag_off & htons(IP_DF);
+	struct iphdr  *iph;			/* Our new IP header */
+	unsigned int max_headroom;		/* The extra header space needed */
+	int    mtu;
+	int ret;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
+						   IP_VS_RT_MODE_NON_LOCAL,
+						   &saddr)))
+		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		/* local real server: no tunnel, returns NF_ACCEPT here */
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
+
+	tdev = rt->dst.dev;
+
+	/* room left for the inner packet once our header is added */
+	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+	if (mtu < 68) {
+		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+		goto tx_error_put;
+	}
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+
+	if ((old_iph->frag_off & htons(IP_DF) &&
+	    mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("%s(): no memory\n", __func__);
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		/* the header lives in the new buffer now */
+		old_iph = ip_hdr(skb);
+	}
+
+	/* the old IP header becomes the payload of the new one */
+	skb->transport_header = skb->network_header;
+
+	/* fix old IP header checksum */
+	ip_send_check(old_iph);
+
+	skb_push(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route; the skb takes over our reference on rt */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+	iph			=	ip_hdr(skb);
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPIP;
+	iph->tos		=	tos;
+	iph->daddr		=	cp->daddr.ip;
+	iph->saddr		=	saddr;
+	iph->ttl		=	old_iph->ttl;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+tx_error_put:
+	/* error while we still hold the reference on rt */
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *   IP Tunneling transmitter, IPv6 flavour: prepend a new IPv6 header
+ *   (nexthdr IPPROTO_IPV6) addressed to cp->daddr.in6 and send the
+ *   result with ip6_local_out().
+ *
+ *   Returns NF_ACCEPT (through the return embedded in IP_VS_XMIT())
+ *   when the route to the real server is local, so that the packet is
+ *   not encapsulated at all.  Returns NF_STOLEN in every other case,
+ *   including all error paths.
+ */
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		     struct ip_vs_protocol *pp)
+{
+	struct ipv6hdr  *inner = ipv6_hdr(skb);	/* Header we received */
+	struct ipv6hdr  *outer;			/* Header we prepend */
+	struct rt6_info *rt;			/* Route to the other host */
+	struct in6_addr saddr;			/* Source for tunnel */
+	struct net_device *out_dev;		/* Device to other host */
+	unsigned int need;			/* Headroom we must have */
+	int    mtu;
+	int verdict;
+
+	EnterFunction(10);
+
+	rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, &saddr, 1,
+				   IP_VS_RT_MODE_LOCAL |
+				   IP_VS_RT_MODE_NON_LOCAL);
+	if (rt == NULL)
+		goto tx_error_icmp;
+
+	if (__ip_vs_is_local_route6(rt)) {
+		/* local real server: no tunnel, returns NF_ACCEPT here */
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
+
+	out_dev = rt->dst.dev;
+
+	/* room left for the inner packet once our header is added */
+	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+	if (mtu < IPV6_MIN_MTU) {
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
+		goto tx_error_put;
+	}
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+
+	if (mtu < ntohs(inner->payload_len) + sizeof(struct ipv6hdr) &&
+	    !skb_is_gso(skb)) {
+		/* icmpv6_send() is given a device even if skb has none */
+		if (skb->dev == NULL)
+			skb->dev = dev_net(skb_dst(skb)->dev)->loopback_dev;
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/*
+	 * Work on a private copy with enough headroom if the skb is
+	 * cloned or shared, or if the new header does not fit in front.
+	 */
+	need = LL_RESERVED_SPACE(out_dev) + sizeof(struct ipv6hdr);
+
+	if (skb_headroom(skb) < need ||
+	    skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *copy = skb_realloc_headroom(skb, need);
+
+		if (copy == NULL) {
+			dst_release(&rt->dst);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("%s(): no memory\n", __func__);
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = copy;
+		/* the header lives in the new buffer now */
+		inner = ipv6_hdr(skb);
+	}
+
+	/* the old IPv6 header becomes the payload of the new one */
+	skb->transport_header = skb->network_header;
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* swap the old route for the one to the real server */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/*
+	 *	Fill in the outer IPv6 header.
+	 */
+	outer			=	ipv6_hdr(skb);
+	outer->version		=	6;
+	outer->nexthdr		=	IPPROTO_IPV6;
+	outer->payload_len	=	inner->payload_len;
+	be16_add_cpu(&outer->payload_len, sizeof(*inner));
+	outer->priority		=	inner->priority;
+	memset(&outer->flow_lbl, 0, sizeof(outer->flow_lbl));
+	ipv6_addr_copy(&outer->daddr, &cp->daddr.in6);
+	ipv6_addr_copy(&outer->saddr, &saddr);
+	outer->hop_limit	=	inner->hop_limit;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	verdict = IP_VS_XMIT_TUNNEL(skb, cp);
+	switch (verdict) {
+	case NF_ACCEPT:
+		ip6_local_out(skb);
+		break;
+	case NF_DROP:
+		kfree_skb(skb);
+		break;
+	default:
+		break;
+	}
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+tx_error_put:
+	dst_release(&rt->dst);
+	goto tx_error;
+}
+#endif
+
+
+/*
+ *      Direct Routing transmitter
+ *      Used for ANY protocol
+ *
+ *      Routes the packet to cp->daddr.ip without rewriting its
+ *      addresses.  If that route is local, IP_VS_XMIT() is invoked with
+ *      local == 1 and its embedded return hands NF_ACCEPT back to the
+ *      caller; in every other case the function returns NF_STOLEN.
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	      struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = ip_hdr(skb);
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos),
+				      IP_VS_RT_MODE_LOCAL |
+					IP_VS_RT_MODE_NON_LOCAL, NULL)))
+		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);	/* returns NF_ACCEPT */
+	}
+
+	/* MTU checking: only DF packets that are not GSO are refused */
+	mtu = dst_mtu(&rt->dst);
+	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
+	    !skb_is_gso(skb)) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+		ip_rt_put(rt);
+		return NF_STOLEN;
+	}
+	ip_send_check(ip_hdr(skb));
+
+	/* drop old route; the skb now owns our reference on rt */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+/*
+ *      Direct Routing transmitter, IPv6 flavour.
+ *
+ *      Routes the packet to cp->daddr.in6 without rewriting its
+ *      addresses.  If that route is local, IP_VS_XMIT() is invoked with
+ *      local == 1 and its embedded return hands NF_ACCEPT back to the
+ *      caller; in every other case the function returns NF_STOLEN.
+ */
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		 struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;			/* Route to the other host */
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, (IP_VS_RT_MODE_LOCAL |
+					     IP_VS_RT_MODE_NON_LOCAL))))
+		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		/* local real server: returns NF_ACCEPT from here */
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
+
+	/*
+	 * MTU checking.  GSO packets are exempt, as in the other
+	 * transmitters of this file: skb->len of a GSO skb is not the
+	 * size of what finally goes on the wire.
+	 */
+	mtu = dst_mtu(&rt->dst);
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		dst_release(&rt->dst);
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error;
+	}
+
+	/* Make sure the skb is not shared before its route is replaced */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route; the skb takes over our reference on rt */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *	ICMP packet transmitter
+ *	called by the ip_vs_in_icmp
+ *
+ *	For connections that do not use the masquerading (NAT) forwarding
+ *	method the packet is handed to cp->packet_xmit() unchanged and
+ *	that function's verdict is returned (NF_ACCEPT if the connection
+ *	has no transmitter).  For NAT connections the packet is mangled
+ *	with ip_vs_nat_icmp() and sent from here.
+ *
+ *	@offset:  number of bytes that must be writable before mangling
+ *	@hooknum: netfilter hook we are called from; from
+ *		  NF_INET_FORWARD only non-local routes are accepted
+ *
+ *	In the NAT case the result is NF_ACCEPT (through the return
+ *	embedded in IP_VS_XMIT_NAT()) when the real server is local and
+ *	NF_STOLEN otherwise, including all error paths.
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset, unsigned int hooknum)
+{
+	struct rtable	*rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+	int local;
+	int rt_mode;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	/* LOCALNODE from FORWARD hook is not supported */
+	rt_mode = (hooknum != NF_INET_FORWARD) ?
+		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(ip_hdr(skb)->tos),
+				      rt_mode, NULL)))
+		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI4\n",
+				  __func__, &cp->daddr.ip);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(cp->daddr.ip) &&
+	    rt_is_input_route(skb_rtable(skb))) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI4\n",
+			  __func__, &cp->daddr.ip);
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	ip_vs_nat_icmp(skb, pp, cp, 0);
+
+	if (!local) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	/* returns NF_ACCEPT from here when local is set */
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+
+	rc = NF_STOLEN;
+	goto out;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+  out:
+	LeaveFunction(10);
+	return rc;
+  tx_error_put:
+	/* error while we still hold the reference on rt */
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset, unsigned int hooknum)
+{
+	struct rt6_info	*rt;	/* Route to the other host */
+	int mtu;	/* dst_mtu() of the chosen route */
+	int rc;		/* netfilter verdict handed back to the caller */
+	int local;	/* result of __ip_vs_is_local_route6(rt) */
+	int rt_mode;	/* IP_VS_RT_MODE_* flags allowed for the lookup */
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	/* LOCALNODE from FORWARD hook is not supported */
+	rt_mode = (hooknum != NF_INET_FORWARD) ?
+		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, rt_mode)))
+		goto tx_error_icmp;
+
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI6\n",
+				  __func__, &cp->daddr.in6);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI6\n",
+			  __func__, &cp->daddr.in6);
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+
+	rc = NF_STOLEN;
+	goto out;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;	/* skb was freed just above */
+out:
+	LeaveFunction(10);
+	return rc;
+tx_error_put:
+	dst_release(&rt->dst);	/* drop the route reference, then free the skb */
+	goto tx_error;
+}
+#endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
new file mode 100644
index 00000000..5178c691
--- /dev/null
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -0,0 +1,136 @@
+/* Accounting handling for netfilter. */
+
+/*
+ * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+static int nf_ct_acct __read_mostly;	/* copied into net->ct.sysctl_acct by nf_conntrack_acct_init() */
+
+module_param_named(acct, nf_ct_acct, bool, 0644);
+MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table acct_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_acct",
+		.data		= &init_net.ct.sysctl_acct,	/* repointed on the per-netns copy */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}	/* empty final entry */
+};
+#endif /* CONFIG_SYSCTL */
+
+unsigned int
+seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
+{
+	struct nf_conn_counter *acct;	/* indexed by direction below */
+
+	acct = nf_conn_acct_find(ct);
+	if (!acct)
+		return 0;	/* nothing found for this conntrack: print nothing */
+
+	return seq_printf(s, "packets=%llu bytes=%llu ",
+			  (unsigned long long)acct[dir].packets,
+			  (unsigned long long)acct[dir].bytes);
+}
+EXPORT_SYMBOL_GPL(seq_print_acct);
+
+static struct nf_ct_ext_type acct_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]),	/* one counter per direction */
+	.align	= __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]),
+	.id	= NF_CT_EXT_ACCT,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
+			GFP_KERNEL);	/* private copy of the template for this netns */
+	if (!table)
+		goto out;
+
+	table[0].data = &net->ct.sysctl_acct;	/* point the copy at this netns' flag */
+
+	net->ct.acct_sysctl_header = register_net_sysctl_table(net,
+			nf_net_netfilter_sysctl_path, table);
+	if (!net->ct.acct_sysctl_header) {
+		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
+		goto out_register;
+	}
+	return 0;
+
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;	/* returned for both allocation and registration failure */
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.acct_sysctl_header->ctl_table_arg;	/* read before the header is unregistered */
+	unregister_net_sysctl_table(net->ct.acct_sysctl_header);
+	kfree(table);	/* the kmemdup() copy made at init time */
+}
+#else
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	return 0;	/* !CONFIG_SYSCTL: nothing to register */
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{	/* !CONFIG_SYSCTL: nothing to tear down */
+}
+#endif
+
+int nf_conntrack_acct_init(struct net *net)
+{
+	int ret;
+
+	net->ct.sysctl_acct = nf_ct_acct;	/* seed from the "acct" module parameter */
+
+	if (net_eq(net, &init_net)) {	/* extension type is registered only for init_net */
+		ret = nf_ct_extend_register(&acct_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
+			goto out_extend_register;
+		}
+	}
+
+	ret = nf_conntrack_acct_init_sysctl(net);
+	if (ret < 0)
+		goto out_sysctl;
+
+	return 0;
+
+out_sysctl:
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&acct_extend);	/* undo the registration above */
+out_extend_register:
+	return ret;
+}
+
+void nf_conntrack_acct_fini(struct net *net)
+{
+	nf_conntrack_acct_fini_sysctl(net);	/* reverse order of nf_conntrack_acct_init() */
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&acct_extend);
+}
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
new file mode 100644
index 00000000..13fd2c55
--- /dev/null
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -0,0 +1,237 @@
+/* Amanda extension for IP connection tracking
+ *
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on HW's ip_conntrack_irc.c as well as other modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/textsearch.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+#include <linux/gfp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_amanda.h>
+
+static unsigned int master_timeout __read_mostly = 300;	/* multiplied by HZ in amanda_help() */
+static char *ts_algo = "kmp";	/* algorithm name passed to textsearch_prepare() */
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda connection tracking module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_amanda");
+MODULE_ALIAS_NFCT_HELPER("amanda");
+
+module_param(master_timeout, uint, 0600);
+MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
+module_param(ts_algo, charp, 0400);
+MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)");
+
+unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb,
+				   enum ip_conntrack_info ctinfo,
+				   unsigned int matchoff,
+				   unsigned int matchlen,
+				   struct nf_conntrack_expect *exp)
+				   __read_mostly;	/* read with rcu_dereference() in amanda_help() */
+EXPORT_SYMBOL_GPL(nf_nat_amanda_hook);
+
+enum amanda_strings {	/* indexes into search[] */
+	SEARCH_CONNECT,
+	SEARCH_NEWLINE,
+	SEARCH_DATA,	/* first keyword of the amanda_help() scan loop */
+	SEARCH_MESG,
+	SEARCH_INDEX,	/* last keyword of the amanda_help() scan loop */
+};
+
+static struct {
+	const char		*string;	/* pattern text */
+	size_t			len;		/* pattern length passed to textsearch_prepare() */
+	struct ts_config	*ts;		/* set by nf_conntrack_amanda_init() */
+} search[] __read_mostly = {
+	[SEARCH_CONNECT] = {
+		.string	= "CONNECT ",
+		.len	= 8,
+	},
+	[SEARCH_NEWLINE] = {
+		.string	= "\n",
+		.len	= 1,
+	},
+	[SEARCH_DATA] = {
+		.string	= "DATA ",
+		.len	= 5,
+	},
+	[SEARCH_MESG] = {
+		.string	= "MESG ",
+		.len	= 5,
+	},
+	[SEARCH_INDEX] = {
+		.string = "INDEX ",
+		.len	= 6,
+	},
+};
+
+static int amanda_help(struct sk_buff *skb,
+		       unsigned int protoff,
+		       struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo)
+{
+	struct ts_state ts;
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple *tuple;
+	unsigned int dataoff, start, stop, off, i;
+	char pbuf[sizeof("65535")], *tmp;	/* room for 5 digits plus NUL */
+	u_int16_t len;
+	__be16 port;
+	int ret = NF_ACCEPT;
+	typeof(nf_nat_amanda_hook) nf_nat_amanda;
+
+	/* Only look at packets from the Amanda server */
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	/* increase the UDP timeout of the master connection as replies from
+	 * Amanda clients to the server can be quite delayed */
+	nf_ct_refresh(ct, skb, master_timeout * HZ);
+
+	/* No data? */
+	dataoff = protoff + sizeof(struct udphdr);
+	if (dataoff >= skb->len) {
+		if (net_ratelimit())
+			printk(KERN_ERR "amanda_help: skblen = %u\n", skb->len);
+		return NF_ACCEPT;
+	}
+
+	memset(&ts, 0, sizeof(ts));
+	start = skb_find_text(skb, dataoff, skb->len,
+			      search[SEARCH_CONNECT].ts, &ts);
+	if (start == UINT_MAX)	/* no "CONNECT " in the payload */
+		goto out;
+	start += dataoff + search[SEARCH_CONNECT].len;	/* add search base, skip the keyword */
+
+	memset(&ts, 0, sizeof(ts));
+	stop = skb_find_text(skb, start, skb->len,
+			     search[SEARCH_NEWLINE].ts, &ts);
+	if (stop == UINT_MAX)	/* no newline after the keyword */
+		goto out;
+	stop += start;
+
+	for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
+		memset(&ts, 0, sizeof(ts));
+		off = skb_find_text(skb, start, stop, search[i].ts, &ts);
+		if (off == UINT_MAX)	/* this keyword is absent: try the next one */
+			continue;
+		off += start + search[i].len;
+
+		len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off);	/* at most 5 bytes */
+		if (skb_copy_bits(skb, off, pbuf, len))
+			break;
+		pbuf[len] = '\0';
+
+		port = htons(simple_strtoul(pbuf, &tmp, 10));
+		len = tmp - pbuf;	/* number of characters consumed by the parse */
+		if (port == 0 || len > 5)
+			break;
+
+		exp = nf_ct_expect_alloc(ct);
+		if (exp == NULL) {
+			ret = NF_DROP;
+			goto out;
+		}
+		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+				  nf_ct_l3num(ct),
+				  &tuple->src.u3, &tuple->dst.u3,
+				  IPPROTO_TCP, NULL, &port);
+
+		nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook);
+		if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
+			ret = nf_nat_amanda(skb, ctinfo, off - dataoff,
+					    len, exp);
+		else if (nf_ct_expect_related(exp) != 0)
+			ret = NF_DROP;
+		nf_ct_expect_put(exp);	/* drop the reference taken by nf_ct_expect_alloc() */
+	}
+
+out:
+	return ret;
+}
+
+static const struct nf_conntrack_expect_policy amanda_exp_policy = {
+	.max_expected		= 3,	/* NOTE(review): equals the DATA/MESG/INDEX keyword count -- confirm intent */
+	.timeout		= 180,	/* presumably seconds -- TODO confirm */
+};
+
+static struct nf_conntrack_helper amanda_helper[2] __read_mostly = {
+	{	/* [0]: AF_INET variant */
+		.name			= "amanda",
+		.me			= THIS_MODULE,
+		.help			= amanda_help,
+		.tuple.src.l3num	= AF_INET,
+		.tuple.src.u.udp.port	= cpu_to_be16(10080),
+		.tuple.dst.protonum	= IPPROTO_UDP,
+		.expect_policy		= &amanda_exp_policy,
+	},
+	{	/* [1]: AF_INET6 variant, otherwise identical */
+		.name			= "amanda",
+		.me			= THIS_MODULE,
+		.help			= amanda_help,
+		.tuple.src.l3num	= AF_INET6,
+		.tuple.src.u.udp.port	= cpu_to_be16(10080),
+		.tuple.dst.protonum	= IPPROTO_UDP,
+		.expect_policy		= &amanda_exp_policy,
+	},
+};
+
+static void __exit nf_conntrack_amanda_fini(void)
+{
+	int i;
+
+	nf_conntrack_helper_unregister(&amanda_helper[0]);
+	nf_conntrack_helper_unregister(&amanda_helper[1]);
+	for (i = 0; i < ARRAY_SIZE(search); i++)	/* release every prepared pattern */
+		textsearch_destroy(search[i].ts);
+}
+
+static int __init nf_conntrack_amanda_init(void)
+{
+	int ret, i;
+
+	for (i = 0; i < ARRAY_SIZE(search); i++) {
+		search[i].ts = textsearch_prepare(ts_algo, search[i].string,
+						  search[i].len,
+						  GFP_KERNEL, TS_AUTOLOAD);
+		if (IS_ERR(search[i].ts)) {
+			ret = PTR_ERR(search[i].ts);
+			goto err1;	/* i indexes the failed entry; err1 frees those before it */
+		}
+	}
+	ret = nf_conntrack_helper_register(&amanda_helper[0]);
+	if (ret < 0)
+		goto err1;	/* i == ARRAY_SIZE(search) here, so all patterns are freed */
+	ret = nf_conntrack_helper_register(&amanda_helper[1]);
+	if (ret < 0)
+		goto err2;
+	return 0;
+
+err2:
+	nf_conntrack_helper_unregister(&amanda_helper[0]);
+err1:
+	while (--i >= 0)
+		textsearch_destroy(search[i].ts);
+
+	return ret;
+}
+
+module_init(nf_conntrack_amanda_init);
+module_exit(nf_conntrack_amanda_fini);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 00000000..4e99cca6
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ *      broadcast connection tracking helper
+ *
+ *      (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+				unsigned int protoff,
+				struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned int timeout)
+{
+	struct nf_conntrack_expect *exp;
+	struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt = skb_rtable(skb);
+	struct in_device *in_dev;
+	struct nf_conn_help *help = nfct_help(ct);
+	__be32 mask = 0;	/* stays 0 unless a matching broadcast address is found */
+
+	/* we're only interested in locally generated packets */
+	if (skb->sk == NULL)
+		goto out;
+	if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+		goto out;
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		goto out;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
+	if (in_dev != NULL) {
+		for_primary_ifa(in_dev) {
+			if (ifa->ifa_broadcast == iph->daddr) {
+				mask = ifa->ifa_mask;	/* netmask of the address whose broadcast matched */
+				break;
+			}
+		} endfor_ifa(in_dev);
+	}
+	rcu_read_unlock();
+
+	if (mask == 0)
+		goto out;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		goto out;
+
+	exp->tuple                = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+	exp->mask.src.u3.ip       = mask;
+	exp->mask.src.u.udp.port  = htons(0xFFFF);
+
+	exp->expectfn             = NULL;
+	exp->flags                = NF_CT_EXPECT_PERMANENT;
+	exp->class		  = NF_CT_EXPECT_CLASS_DEFAULT;
+	exp->helper               = NULL;
+
+	nf_ct_expect_related(exp);	/* return value is not checked */
+	nf_ct_expect_put(exp);
+
+	nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+	return NF_ACCEPT;	/* every path accepts the packet */
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
new file mode 100644
index 00000000..f7af8b86
--- /dev/null
+++ b/net/netfilter/nf_conntrack_core.c
@@ -0,0 +1,1592 @@
+/* Connection state tracking for netfilter.  This is separated from,
+   but required by, the NAT layer; it can also be used by an iptables
+   extension. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/rculist_nulls.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+
+#define NF_CONNTRACK_VERSION	"0.5.0"
+
+int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
+				      enum nf_nat_manip_type manip,
+				      const struct nlattr *attr) __read_mostly;
+EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
+
+DEFINE_SPINLOCK(nf_conntrack_lock);	/* taken around hash/list modifications in this file */
+EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+
+unsigned int nf_conntrack_htable_size __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
+
+unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
+
+DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
+
+unsigned int nf_conntrack_hash_rnd __read_mostly;	/* mixed into the seed in hash_conntrack_raw() */
+
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
+{
+	unsigned int n;	/* number of u32 words handed to jhash2() */
+
+	/* The direction must be ignored, so we hash everything up to the
+	 * destination ports (which is a multiple of 4) and treat the last
+	 * three bytes manually.
+	 */
+	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
+	return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+		      (((__force __u16)tuple->dst.u.all << 16) |
+		      tuple->dst.protonum));	/* port and protocol go into the seed */
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+	return (u32)(((u64)size * hash) >> 32);	/* top 32 bits of the 64-bit product */
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+	return __hash_bucket(hash, net->ct.htable_size);	/* bucket within this netns' table */
+}
+
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+				  u16 zone, unsigned int size)
+{
+	return __hash_bucket(hash_conntrack_raw(tuple, zone), size);	/* raw hash, then scale to size */
+}
+
+static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
+				       const struct nf_conntrack_tuple *tuple)
+{
+	return __hash_conntrack(tuple, zone, net->ct.htable_size);	/* uses this netns' table size */
+}
+
+bool
+nf_ct_get_tuple(const struct sk_buff *skb,
+		unsigned int nhoff,
+		unsigned int dataoff,
+		u_int16_t l3num,
+		u_int8_t protonum,
+		struct nf_conntrack_tuple *tuple,
+		const struct nf_conntrack_l3proto *l3proto,
+		const struct nf_conntrack_l4proto *l4proto)
+{
+	/* start from an all-zero tuple */
+	memset(tuple, 0, sizeof *tuple);
+	tuple->src.l3num = l3num;
+	if (!l3proto->pkt_to_tuple(skb, nhoff, tuple))
+		return false;	/* the l3 handler returned 0 */
+
+	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+	tuple->dst.protonum = protonum;
+	/* the l4 handler's result is the overall result */
+	return l4proto->pkt_to_tuple(skb, dataoff, tuple);
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
+
+bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
+		       u_int16_t l3num, struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_l3proto *l3proto;
+	struct nf_conntrack_l4proto *l4proto;
+	unsigned int protoff;	/* filled in by l3proto->get_l4proto() */
+	u_int8_t protonum;	/* filled in by l3proto->get_l4proto() */
+	int ret;
+
+	rcu_read_lock();	/* held across both protocol lookups and their use */
+
+	l3proto = __nf_ct_l3proto_find(l3num);
+	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
+	if (ret != NF_ACCEPT) {
+		rcu_read_unlock();
+		return false;
+	}
+
+	l4proto = __nf_ct_l4proto_find(l3num, protonum);
+
+	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
+			      l3proto, l4proto);
+
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
+
+bool
+nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
+		   const struct nf_conntrack_tuple *orig,
+		   const struct nf_conntrack_l3proto *l3proto,
+		   const struct nf_conntrack_l4proto *l4proto)
+{
+	/* start from an all-zero tuple */
+	memset(inverse, 0, sizeof *inverse);
+	inverse->src.l3num = orig->src.l3num;
+	if (!l3proto->invert_tuple(inverse, orig))
+		return false;	/* the l3 handler returned 0 */
+
+	inverse->dst.protonum = orig->dst.protonum;
+	inverse->dst.dir = !orig->dst.dir;
+	/* the l4 handler's result is the overall result */
+	return l4proto->invert_tuple(inverse, orig);
+}
+EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
+
+static void
+clean_from_lists(struct nf_conn *ct)
+{
+	pr_debug("clean_from_lists(%p)\n", ct);
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);	/* unlink both directions */
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+
+	/* Destroy all pending expectations */
+	nf_ct_remove_expectations(ct);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+	struct nf_conn *ct = (struct nf_conn *)nfct;
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_l4proto *l4proto;
+
+	pr_debug("destroy_conntrack(%p)\n", ct);
+	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
+	NF_CT_ASSERT(!timer_pending(&ct->timeout));
+
+	/* To make sure we don't get any weird locking issues here:
+	 * destroy_conntrack() MUST NOT be called with a write lock
+	 * to nf_conntrack_lock!!! -HW */
+	rcu_read_lock();
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	if (l4proto && l4proto->destroy)
+		l4proto->destroy(ct);	/* optional per-protocol destroy callback */
+
+	rcu_read_unlock();
+
+	spin_lock_bh(&nf_conntrack_lock);
+	/* Expectations will have been removed in clean_from_lists,
+	 * except TFTP can create an expectation on the first packet,
+	 * before connection is in the list, so we need to clean here,
+	 * too. */
+	nf_ct_remove_expectations(ct);
+
+	/* We overload first tuple to link into unconfirmed list. */
+	if (!nf_ct_is_confirmed(ct)) {
+		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	}
+
+	NF_CT_STAT_INC(net, delete);
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	if (ct->master)
+		nf_ct_put(ct->master);	/* drop the reference on the master conntrack */
+
+	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+	nf_conntrack_free(ct);
+}
+
+void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+
+	nf_ct_helper_destroy(ct);	/* called before nf_conntrack_lock is taken */
+	spin_lock_bh(&nf_conntrack_lock);
+	/* Inside lock so preempt is disabled on module removal path.
+	 * Otherwise we can get spurious warnings. */
+	NF_CT_STAT_INC(net, delete_list);
+	clean_from_lists(ct);
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;	/* timer argument is the conntrack pointer */
+	struct net *net = nf_ct_net(ct);
+
+	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+		/* bad luck, let's retry again */
+		ct->timeout.expires = jiffies +
+			(random32() % net->ct.sysctl_events_retry_timeout);
+		add_timer(&ct->timeout);
+		return;
+	}
+	/* we've got the event delivered, now it's dying */
+	set_bit(IPS_DYING_BIT, &ct->status);
+	spin_lock(&nf_conntrack_lock);
+	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);	/* leave the dying list */
+	spin_unlock(&nf_conntrack_lock);
+	nf_ct_put(ct);
+}
+
+void nf_ct_insert_dying_list(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+
+	/* add this conntrack to the dying list */
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+			     &net->ct.dying);
+	spin_unlock_bh(&nf_conntrack_lock);
+	/* set a new timer to retry event delivery */
+	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+	ct->timeout.expires = jiffies +
+		(random32() % net->ct.sysctl_events_retry_timeout);	/* random delay below the sysctl value */
+	add_timer(&ct->timeout);
+}
+EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct nf_conn *ct = (void *)ul_conntrack;	/* timer argument is the conntrack pointer */
+	struct nf_conn_tstamp *tstamp;
+
+	tstamp = nf_conn_tstamp_find(ct);
+	if (tstamp && tstamp->stop == 0)
+		tstamp->stop = ktime_to_ns(ktime_get_real());	/* record stop time once */
+
+	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+		/* destroy event was not delivered */
+		nf_ct_delete_from_lists(ct);
+		nf_ct_insert_dying_list(ct);	/* death_by_event() retries delivery later */
+		return;
+	}
+	set_bit(IPS_DYING_BIT, &ct->status);
+	nf_ct_delete_from_lists(ct);
+	nf_ct_put(ct);
+}
+
+/*
+ * Warning :
+ * - Caller must take a reference on returned object
+ *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
+ * OR
+ * - Caller must lock nf_conntrack_lock before calling this function
+ */
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple, u32 hash)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct hlist_nulls_node *n;
+	unsigned int bucket = hash_bucket(hash, net);	/* scale the raw hash to a bucket index */
+
+	/* Disable BHs the entire time since we normally need to disable them
+	 * at least once for the stats anyway.
+	 */
+	local_bh_disable();
+begin:
+	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
+		if (nf_ct_tuple_equal(tuple, &h->tuple) &&
+		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
+			NF_CT_STAT_INC(net, found);
+			local_bh_enable();
+			return h;	/* no reference is taken here */
+		}
+		NF_CT_STAT_INC(net, searched);
+	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(n) != bucket) {
+		NF_CT_STAT_INC(net, search_restart);
+		goto begin;
+	}
+	local_bh_enable();
+
+	return NULL;
+}
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+		    const struct nf_conntrack_tuple *tuple)
+{
+	u32 raw = hash_conntrack_raw(tuple, zone);	/* unscaled hash of tuple + zone */
+	return ____nf_conntrack_find(net, zone, tuple, raw);
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_find);
+
+/* Find a connection corresponding to a tuple. */
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+			const struct nf_conntrack_tuple *tuple, u32 hash)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+
+	rcu_read_lock();
+begin:
+	h = ____nf_conntrack_find(net, zone, tuple, hash);
+	if (h) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (unlikely(nf_ct_is_dying(ct) ||
+			     !atomic_inc_not_zero(&ct->ct_general.use)))
+			h = NULL;	/* dying, or refcount already zero: report not found */
+		else {
+			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) ||
+				     nf_ct_zone(ct) != zone)) {
+				nf_ct_put(ct);
+				goto begin;	/* no longer matches after taking the reference: retry */
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return h;
+}
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	u32 raw = hash_conntrack_raw(tuple, zone);	/* unscaled hash of tuple + zone */
+	return __nf_conntrack_find_get(net, zone, tuple, raw);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
+
+static void __nf_conntrack_hash_insert(struct nf_conn *ct,
+				       unsigned int hash,
+				       unsigned int repl_hash)
+{
+	struct net *net = nf_ct_net(ct);
+
+	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+			   &net->ct.hash[hash]);	/* original direction */
+	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+			   &net->ct.hash[repl_hash]);	/* reply direction */
+}
+
+void nf_conntrack_hash_insert(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	unsigned int hash, repl_hash;	/* bucket indexes for the two directions */
+	u16 zone;
+
+	zone = nf_ct_zone(ct);
+	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	__nf_conntrack_hash_insert(ct, hash, repl_hash);	/* no locking is done in this function */
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
+
+/* Confirm a connection given skb; places it in hash table */
+int
+__nf_conntrack_confirm(struct sk_buff *skb)
+{
+	unsigned int hash, repl_hash;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct nf_conn_help *help;
+	struct nf_conn_tstamp *tstamp;
+	struct hlist_nulls_node *n;
+	enum ip_conntrack_info ctinfo;
+	struct net *net;
+	u16 zone;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	net = nf_ct_net(ct);
+
+	/* ipt_REJECT uses nf_conntrack_attach to attach related
+	   ICMP/TCP RST packets in other direction.  Actual packet
+	   which created connection will be IP_CT_NEW or for an
+	   expected connection, IP_CT_RELATED. */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	zone = nf_ct_zone(ct);
+	/* reuse the hash saved before */
+	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+	hash = hash_bucket(hash, net);	/* scale the saved raw hash to a bucket index */
+	repl_hash = hash_conntrack(net, zone,
+				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	/* We're not in hash table, and we refuse to set up related
+	   connections for unconfirmed conns.  But packet copies and
+	   REJECT will give spurious warnings here. */
+	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+	/* No external references means no one else could have
+	   confirmed us. */
+	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+	pr_debug("Confirming conntrack %p\n", ct);
+
+	spin_lock_bh(&nf_conntrack_lock);
+
+	/* We have to check the DYING flag inside the lock to prevent
+	   a race against nf_ct_get_next_corpse() possibly called from
+	   user context, else we insert an already 'dead' hash, blocking
+	   further use of that particular connection -JM */
+
+	if (unlikely(nf_ct_is_dying(ct))) {
+		spin_unlock_bh(&nf_conntrack_lock);
+		return NF_ACCEPT;
+	}
+
+	/* See if there's one in the list already, including reverse:
+	   NAT could have grabbed it without realizing, since we're
+	   not in the hash.  If there is, we lost race. */
+	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				      &h->tuple) &&
+		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+			goto out;
+	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+				      &h->tuple) &&
+		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
+			goto out;
+
+	/* Remove from unconfirmed list */
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+
+	/* Timer relative to confirmation time, not original
+	   setting time, otherwise we'd get timer wrap in
+	   weird delay cases. */
+	ct->timeout.expires += jiffies;
+	add_timer(&ct->timeout);
+	atomic_inc(&ct->ct_general.use);	/* NOTE(review): presumably the reference held while hashed -- confirm */
+	ct->status |= IPS_CONFIRMED;
+
+	/* set conntrack timestamp, if enabled. */
+	tstamp = nf_conn_tstamp_find(ct);
+	if (tstamp) {
+		if (skb->tstamp.tv64 == 0)
+			__net_timestamp((struct sk_buff *)skb);
+
+		tstamp->start = ktime_to_ns(skb->tstamp);
+	}
+	/* Since the lookup is lockless, hash insertion must be done after
+	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
+	 * guarantee that no other CPU can find the conntrack before the above
+	 * stores are visible.
+	 */
+	__nf_conntrack_hash_insert(ct, hash, repl_hash);
+	NF_CT_STAT_INC(net, insert);
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	help = nfct_help(ct);
+	if (help && help->helper)
+		nf_conntrack_event_cache(IPCT_HELPER, ct);
+
+	nf_conntrack_event_cache(master_ct(ct) ?
+				 IPCT_RELATED : IPCT_NEW, ct);
+	return NF_ACCEPT;
+
+out:
+	NF_CT_STAT_INC(net, insert_failed);
+	spin_unlock_bh(&nf_conntrack_lock);
+	return NF_DROP;	/* an equal tuple in the same zone is already hashed */
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
+
+/* Returns true if a connection other than ignored_conntrack already
+   corresponds to the tuple (required for NAT). */
+int
+nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
+			 const struct nf_conn *ignored_conntrack)
+{
+	struct net *net = nf_ct_net(ignored_conntrack);
+	struct nf_conntrack_tuple_hash *h;
+	struct hlist_nulls_node *n;
+	struct nf_conn *ct;
+	u16 zone = nf_ct_zone(ignored_conntrack);
+	unsigned int hash = hash_conntrack(net, zone, tuple);
+
+	/* Disable BHs the entire time since we need to disable them at
+	 * least once for the stats anyway.
+	 */
+	rcu_read_lock_bh();
+	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (ct != ignored_conntrack &&
+		    nf_ct_tuple_equal(tuple, &h->tuple) &&
+		    nf_ct_zone(ct) == zone) {
+			NF_CT_STAT_INC(net, found);
+			rcu_read_unlock_bh();
+			return 1;
+		}
+		NF_CT_STAT_INC(net, searched);
+	}
+	rcu_read_unlock_bh();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
+
+#define NF_CT_EVICTION_RANGE	8
+
+/* There's a small race here where we may free a just-assured
+   connection.  Too bad: we're in trouble anyway. */
+static noinline int early_drop(struct net *net, unsigned int hash)
+{
+	/* Use oldest entry, which is roughly LRU */
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct = NULL, *tmp;
+	struct hlist_nulls_node *n;
+	unsigned int i, cnt = 0;
+	int dropped = 0;
+
+	rcu_read_lock();
+	for (i = 0; i < net->ct.htable_size; i++) {
+		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
+					 hnnode) {
+			tmp = nf_ct_tuplehash_to_ctrack(h);
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+				ct = tmp;
+			cnt++;
+		}
+
+		if (ct != NULL) {
+			if (likely(!nf_ct_is_dying(ct) &&
+				   atomic_inc_not_zero(&ct->ct_general.use)))
+				break;
+			else
+				ct = NULL;
+		}
+
+		if (cnt >= NF_CT_EVICTION_RANGE)
+			break;
+
+		hash = (hash + 1) % net->ct.htable_size;
+	}
+	rcu_read_unlock();
+
+	if (!ct)
+		return dropped;
+
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
+		dropped = 1;
+		NF_CT_STAT_INC_ATOMIC(net, early_drop);
+	}
+	nf_ct_put(ct);
+	return dropped;
+}
+
+void init_nf_conntrack_hash_rnd(void)
+{
+	unsigned int rand;
+
+	/*
+	 * Why not initialize nf_conntrack_rnd in a "init()" function ?
+	 * Because there isn't enough entropy when system initializing,
+	 * and we initialize it as late as possible.
+	 */
+	do {
+		get_random_bytes(&rand, sizeof(rand));
+	} while (!rand);
+	cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+}
+
+/* Allocate a conntrack for the orig/repl tuple pair and account it in
+ * net->ct.count; 'hash' is stashed in the entry for reuse when confirming.
+ * Returns the entry with its refcount set to 1, or ERR_PTR(-ENOMEM) when the
+ * table is full (and early_drop() freed nothing) or an allocation fails. */
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+		     const struct nf_conntrack_tuple *orig,
+		     const struct nf_conntrack_tuple *repl,
+		     gfp_t gfp, u32 hash)
+{
+	struct nf_conn *ct;
+
+	if (unlikely(!nf_conntrack_hash_rnd)) {
+		init_nf_conntrack_hash_rnd();
+		/* recompute the hash as nf_conntrack_hash_rnd is initialized */
+		hash = hash_conntrack_raw(orig, zone);
+	}
+
+	/* We don't want any race condition at early drop stage */
+	atomic_inc(&net->ct.count);
+
+	if (nf_conntrack_max &&
+	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+		if (!early_drop(net, hash_bucket(hash, net))) {
+			atomic_dec(&net->ct.count);
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "nf_conntrack: table full, dropping"
+				       " packet.\n");
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	/* Do not use kmem_cache_zalloc(), as this cache uses
+	 * SLAB_DESTROY_BY_RCU. */
+	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+	if (ct == NULL) {
+		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
+		atomic_dec(&net->ct.count);
+		return ERR_PTR(-ENOMEM);
+	}
+	/* Leave ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next and
+	 * ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. */
+	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
+	       offsetof(struct nf_conn, proto) -
+	       offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+	spin_lock_init(&ct->lock);
+	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
+	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+	/* save hash for reusing when confirming */
+	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
+	/* Don't set timer yet: wait for confirmation */
+	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
+	write_pnet(&ct->ct_net, net);
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	if (zone) {
+		struct nf_conntrack_zone *nf_ct_zone;
+
+		nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC);
+		if (!nf_ct_zone)
+			goto out_free;
+		nf_ct_zone->id = zone;
+	}
+#endif
+	/* changes to lookup keys must be done before setting refcnt to 1 */
+	smp_wmb();
+	atomic_set(&ct->ct_general.use, 1);
+	return ct;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+out_free:
+	/* undo the atomic_inc() above, as the other error paths do */
+	atomic_dec(&net->ct.count);
+	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+	return ERR_PTR(-ENOMEM);
+#endif
+}
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+				   const struct nf_conntrack_tuple *orig,
+				   const struct nf_conntrack_tuple *repl,
+				   gfp_t gfp)
+{
+	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
+
+void nf_conntrack_free(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+
+	nf_ct_ext_destroy(ct);
+	atomic_dec(&net->ct.count);
+	nf_ct_ext_free(ct);
+	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_free);
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+   failed due to stress.  Otherwise it really is unclassifiable. */
+static struct nf_conntrack_tuple_hash *
+init_conntrack(struct net *net, struct nf_conn *tmpl,
+	       const struct nf_conntrack_tuple *tuple,
+	       struct nf_conntrack_l3proto *l3proto,
+	       struct nf_conntrack_l4proto *l4proto,
+	       struct sk_buff *skb,
+	       unsigned int dataoff, u32 hash)
+{
+	struct nf_conn *ct;
+	struct nf_conn_help *help;
+	struct nf_conntrack_tuple repl_tuple;
+	struct nf_conntrack_ecache *ecache;
+	struct nf_conntrack_expect *exp;
+	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+
+	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+		pr_debug("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+				  hash);
+	if (IS_ERR(ct)) {
+		pr_debug("Can't allocate conntrack.\n");
+		return (struct nf_conntrack_tuple_hash *)ct;
+	}
+
+	if (!l4proto->new(ct, skb, dataoff)) {
+		nf_conntrack_free(ct);
+		pr_debug("init conntrack: can't track with proto module\n");
+		return NULL;
+	}
+
+	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+
+	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
+	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+				 ecache ? ecache->expmask : 0,
+			     GFP_ATOMIC);
+
+	spin_lock_bh(&nf_conntrack_lock);
+	exp = nf_ct_find_expectation(net, zone, tuple);
+	if (exp) {
+		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+			 ct, exp);
+		/* Welcome, Mr. Bond.  We've been expecting you... */
+		__set_bit(IPS_EXPECTED_BIT, &ct->status);
+		ct->master = exp->master;
+		if (exp->helper) {
+			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+			if (help)
+				rcu_assign_pointer(help->helper, exp->helper);
+		}
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+		ct->mark = exp->master->mark;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+		ct->secmark = exp->master->secmark;
+#endif
+		nf_conntrack_get(&ct->master->ct_general);
+		NF_CT_STAT_INC(net, expect_new);
+	} else {
+		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
+		NF_CT_STAT_INC(net, new);
+	}
+
+	/* Overload tuple linked list to put us in unconfirmed list. */
+	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+		       &net->ct.unconfirmed);
+
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	if (exp) {
+		if (exp->expectfn)
+			exp->expectfn(ct, exp);
+		nf_ct_expect_put(exp);
+	}
+
+	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
+}
+
+/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+static inline struct nf_conn *
+resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
+		  struct sk_buff *skb,
+		  unsigned int dataoff,
+		  u_int16_t l3num,
+		  u_int8_t protonum,
+		  struct nf_conntrack_l3proto *l3proto,
+		  struct nf_conntrack_l4proto *l4proto,
+		  int *set_reply,
+		  enum ip_conntrack_info *ctinfo)
+{
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+	u32 hash;
+
+	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
+			     dataoff, l3num, protonum, &tuple, l3proto,
+			     l4proto)) {
+		pr_debug("resolve_normal_ct: Can't get tuple\n");
+		return NULL;
+	}
+
+	/* look for tuple match */
+	hash = hash_conntrack_raw(&tuple, zone);
+	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
+	if (!h) {
+		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+				   skb, dataoff, hash);
+		if (!h)
+			return NULL;
+		if (IS_ERR(h))
+			return (void *)h;
+	}
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	/* It exists; we have (non-exclusive) reference. */
+	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
+		*ctinfo = IP_CT_ESTABLISHED_REPLY;
+		/* Please set reply bit if this packet OK */
+		*set_reply = 1;
+	} else {
+		/* Once we've had two way comms, always ESTABLISHED. */
+		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+			*ctinfo = IP_CT_ESTABLISHED;
+		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
+			pr_debug("nf_conntrack_in: related packet for %p\n",
+				 ct);
+			*ctinfo = IP_CT_RELATED;
+		} else {
+			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+			*ctinfo = IP_CT_NEW;
+		}
+		*set_reply = 0;
+	}
+	skb->nfct = &ct->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return ct;
+}
+
+unsigned int
+nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
+		struct sk_buff *skb)
+{
+	struct nf_conn *ct, *tmpl = NULL;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conntrack_l3proto *l3proto;
+	struct nf_conntrack_l4proto *l4proto;
+	unsigned int dataoff;
+	u_int8_t protonum;
+	int set_reply = 0;
+	int ret;
+
+	if (skb->nfct) {
+		/* Previously seen (loopback or untracked)?  Ignore. */
+		tmpl = (struct nf_conn *)skb->nfct;
+		if (!nf_ct_is_template(tmpl)) {
+			NF_CT_STAT_INC_ATOMIC(net, ignore);
+			return NF_ACCEPT;
+		}
+		skb->nfct = NULL;
+	}
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	l3proto = __nf_ct_l3proto_find(pf);
+	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
+				   &dataoff, &protonum);
+	if (ret <= 0) {
+		pr_debug("not prepared to track yet or error occurred\n");
+		NF_CT_STAT_INC_ATOMIC(net, error);
+		NF_CT_STAT_INC_ATOMIC(net, invalid);
+		ret = -ret;
+		goto out;
+	}
+
+	l4proto = __nf_ct_l4proto_find(pf, protonum);
+
+	/* It may be a special packet, error, unclean...
+	 * The inverse of the return code tells the netfilter
+	 * core what to do with the packet. */
+	if (l4proto->error != NULL) {
+		ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
+				     pf, hooknum);
+		if (ret <= 0) {
+			NF_CT_STAT_INC_ATOMIC(net, error);
+			NF_CT_STAT_INC_ATOMIC(net, invalid);
+			ret = -ret;
+			goto out;
+		}
+		/* ICMP[v6] protocol trackers may assign one conntrack. */
+		if (skb->nfct)
+			goto out;
+	}
+
+	ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
+			       l3proto, l4proto, &set_reply, &ctinfo);
+	if (!ct) {
+		/* Not valid part of a connection */
+		NF_CT_STAT_INC_ATOMIC(net, invalid);
+		ret = NF_ACCEPT;
+		goto out;
+	}
+
+	if (IS_ERR(ct)) {
+		/* Too stressed to deal. */
+		NF_CT_STAT_INC_ATOMIC(net, drop);
+		ret = NF_DROP;
+		goto out;
+	}
+
+	NF_CT_ASSERT(skb->nfct);
+
+	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
+	if (ret <= 0) {
+		/* Invalid: inverse of the return code tells
+		 * the netfilter core what to do */
+		pr_debug("nf_conntrack_in: Can't track with proto module\n");
+		nf_conntrack_put(skb->nfct);
+		skb->nfct = NULL;
+		NF_CT_STAT_INC_ATOMIC(net, invalid);
+		if (ret == -NF_DROP)
+			NF_CT_STAT_INC_ATOMIC(net, drop);
+		ret = -ret;
+		goto out;
+	}
+
+	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+		nf_conntrack_event_cache(IPCT_REPLY, ct);
+out:
+	if (tmpl) {
+		/* Special case: we have to repeat this hook, assign the
+		 * template again to this packet. We assume that this packet
+		 * has no conntrack assigned. This is used by nf_ct_tcp. */
+		if (ret == NF_REPEAT)
+			skb->nfct = (struct nf_conntrack *)tmpl;
+		else
+			nf_ct_put(tmpl);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_in);
+
+bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
+			  const struct nf_conntrack_tuple *orig)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = nf_ct_invert_tuple(inverse, orig,
+				 __nf_ct_l3proto_find(orig->src.l3num),
+				 __nf_ct_l4proto_find(orig->src.l3num,
+						      orig->dst.protonum));
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
+
+/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
+   implicitly racy: see __nf_conntrack_confirm */
+void nf_conntrack_alter_reply(struct nf_conn *ct,
+			      const struct nf_conntrack_tuple *newreply)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+
+	/* Should be unconfirmed, so not in hash table yet */
+	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+
+	pr_debug("Altering reply tuple of %p to ", ct);
+	nf_ct_dump_tuple(newreply);
+
+	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+	if (ct->master || (help && !hlist_empty(&help->expectations)))
+		return;
+
+	rcu_read_lock();
+	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
+
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __nf_ct_refresh_acct(struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  const struct sk_buff *skb,
+			  unsigned long extra_jiffies,
+			  int do_acct)
+{
+	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
+	NF_CT_ASSERT(skb);
+
+	/* Only update if this is not a fixed timeout */
+	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+		goto acct;
+
+	/* If not in hash table, timer will not be active yet */
+	if (!nf_ct_is_confirmed(ct)) {
+		ct->timeout.expires = extra_jiffies;
+	} else {
+		unsigned long newtime = jiffies + extra_jiffies;
+
+		/* Only update the timeout if the new timeout is at least
+		   HZ jiffies from the old timeout. mod_timer_pending() avoids
+		   re-arming a timer already deleted (ct may be dying). */
+		if (newtime - ct->timeout.expires >= HZ)
+			mod_timer_pending(&ct->timeout, newtime);
+	}
+
+acct:
+	if (do_acct) {
+		struct nf_conn_counter *acct;
+
+		acct = nf_conn_acct_find(ct);
+		if (acct) {
+			spin_lock_bh(&ct->lock);
+			acct[CTINFO2DIR(ctinfo)].packets++;
+			acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
+			spin_unlock_bh(&ct->lock);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
+
+bool __nf_ct_kill_acct(struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       const struct sk_buff *skb,
+		       int do_acct)
+{
+	if (do_acct) {
+		struct nf_conn_counter *acct;
+
+		acct = nf_conn_acct_find(ct);
+		if (acct) {
+			spin_lock_bh(&ct->lock);
+			acct[CTINFO2DIR(ctinfo)].packets++;
+			acct[CTINFO2DIR(ctinfo)].bytes +=
+				skb->len - skb_network_offset(skb);
+			spin_unlock_bh(&ct->lock);
+		}
+	}
+
+	if (del_timer(&ct->timeout)) {
+		ct->timeout.function((unsigned long)ct);
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
+	.len	= sizeof(struct nf_conntrack_zone),
+	.align	= __alignof__(struct nf_conntrack_zone),
+	.id	= NF_CT_EXT_ZONE,
+};
+#endif
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/mutex.h>
+
+/* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
+			       const struct nf_conntrack_tuple *tuple)
+{
+	NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
+	NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
+
+const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
+	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
+	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
+};
+EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
+
+int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
+			       struct nf_conntrack_tuple *t)
+{
+	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
+		return -EINVAL;
+
+	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
+	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
+
+int nf_ct_port_nlattr_tuple_size(void)
+{
+	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
+#endif
+
+/* Used by ipt_REJECT and ip6t_REJECT. */
+static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	/* This ICMP is in reverse direction to the packet which caused it */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+		ctinfo = IP_CT_RELATED_REPLY;
+	else
+		ctinfo = IP_CT_RELATED;
+
+	/* Attach to new skbuff, and increment count */
+	nskb->nfct = &ct->ct_general;
+	nskb->nfctinfo = ctinfo;
+	nf_conntrack_get(nskb->nfct);
+}
+
+/* Bring out ya dead! */
+static struct nf_conn *
+get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
+		void *data, unsigned int *bucket)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	for (; *bucket < net->ct.htable_size; (*bucket)++) {
+		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (iter(ct, data))
+				goto found;
+		}
+	}
+	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (iter(ct, data))
+			set_bit(IPS_DYING_BIT, &ct->status);
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+	return NULL;
+found:
+	atomic_inc(&ct->ct_general.use);
+	spin_unlock_bh(&nf_conntrack_lock);
+	return ct;
+}
+
+void nf_ct_iterate_cleanup(struct net *net,
+			   int (*iter)(struct nf_conn *i, void *data),
+			   void *data)
+{
+	struct nf_conn *ct;
+	unsigned int bucket = 0;
+
+	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
+		/* Time to push up daisies... */
+		if (del_timer(&ct->timeout))
+			death_by_timeout((unsigned long)ct);
+		/* ... else the timer will get him soon. */
+
+		nf_ct_put(ct);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
+
+struct __nf_ct_flush_report {
+	u32 pid;
+	int report;
+};
+
+static int kill_report(struct nf_conn *i, void *data)
+{
+	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
+	struct nf_conn_tstamp *tstamp;
+
+	tstamp = nf_conn_tstamp_find(i);
+	if (tstamp && tstamp->stop == 0)
+		tstamp->stop = ktime_to_ns(ktime_get_real());
+
+	/* If we fail to deliver the event, death_by_timeout() will retry */
+	if (nf_conntrack_event_report(IPCT_DESTROY, i,
+				      fr->pid, fr->report) < 0)
+		return 1;
+
+	/* Avoid the delivery of the destroy event in death_by_timeout(). */
+	set_bit(IPS_DYING_BIT, &i->status);
+	return 1;
+}
+
+static int kill_all(struct nf_conn *i, void *data)
+{
+	return 1;
+}
+
+void nf_ct_free_hashtable(void *hash, unsigned int size)
+{
+	if (is_vmalloc_addr(hash))
+		vfree(hash);
+	else
+		free_pages((unsigned long)hash,
+			   get_order(sizeof(struct hlist_head) * size));
+}
+EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
+
+void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
+{
+	struct __nf_ct_flush_report fr = {
+		.pid 	= pid,
+		.report = report,
+	};
+	nf_ct_iterate_cleanup(net, kill_report, &fr);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
+
+static void nf_ct_release_dying_list(struct net *net)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* never fails to remove them, no listeners at this point */
+		nf_ct_kill(ct);
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+
+static int untrack_refs(void)
+{
+	int cnt = 0, cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+
+		cnt += atomic_read(&ct->ct_general.use) - 1;
+	}
+	return cnt;
+}
+
+static void nf_conntrack_cleanup_init_net(void)
+{
+	while (untrack_refs() > 0)
+		schedule();
+
+	nf_conntrack_helper_fini();
+	nf_conntrack_proto_fini();
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	nf_ct_extend_unregister(&nf_ct_zone_extend);
+#endif
+}
+
+static void nf_conntrack_cleanup_net(struct net *net)
+{
+ i_see_dead_people:
+	nf_ct_iterate_cleanup(net, kill_all, NULL);
+	nf_ct_release_dying_list(net);
+	if (atomic_read(&net->ct.count) != 0) {
+		schedule();
+		goto i_see_dead_people;
+	}
+
+	nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+	nf_conntrack_ecache_fini(net);
+	nf_conntrack_tstamp_fini(net);
+	nf_conntrack_acct_fini(net);
+	nf_conntrack_expect_fini(net);
+	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+	kfree(net->ct.slabname);
+	free_percpu(net->ct.stat);
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void nf_conntrack_cleanup(struct net *net)
+{
+	if (net_eq(net, &init_net))
+		rcu_assign_pointer(ip_ct_attach, NULL);
+
+	/* This makes sure all current packets have passed through
+	   netfilter framework.  Roll on, two-stage module
+	   delete... */
+	synchronize_net();
+
+	nf_conntrack_cleanup_net(net);
+
+	if (net_eq(net, &init_net)) {
+		rcu_assign_pointer(nf_ct_destroy, NULL);
+		nf_conntrack_cleanup_init_net();
+	}
+}
+
+void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
+{
+	struct hlist_nulls_head *hash;
+	unsigned int nr_slots, i;
+	size_t sz;
+
+	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
+	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
+	sz = nr_slots * sizeof(struct hlist_nulls_head);
+	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+					get_order(sz));
+	if (!hash) {
+		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
+		hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+				 PAGE_KERNEL);
+	}
+
+	if (hash && nulls)
+		for (i = 0; i < nr_slots; i++)
+			INIT_HLIST_NULLS_HEAD(&hash[i], i);
+
+	return hash;
+}
+EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
+
+int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
+{
+	int i, bucket;
+	unsigned int hashsize, old_size;
+	struct hlist_nulls_head *hash, *old_hash;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+
+	if (current->nsproxy->net_ns != &init_net)
+		return -EOPNOTSUPP;
+
+	/* On boot, we can set this without any fancy locking. */
+	if (!nf_conntrack_htable_size)
+		return param_set_uint(val, kp);
+
+	hashsize = simple_strtoul(val, NULL, 0);
+	if (!hashsize)
+		return -EINVAL;
+
+	hash = nf_ct_alloc_hashtable(&hashsize, 1);
+	if (!hash)
+		return -ENOMEM;
+
+	/* Lookups in the old hash might happen in parallel, which means we
+	 * might get false negatives during connection lookup. New connections
+	 * created because of a false negative won't make it into the hash
+	 * though, since that requires taking the lock.
+	 */
+	spin_lock_bh(&nf_conntrack_lock);
+	for (i = 0; i < init_net.ct.htable_size; i++) {
+		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
+			h = hlist_nulls_entry(init_net.ct.hash[i].first,
+					struct nf_conntrack_tuple_hash, hnnode);
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			hlist_nulls_del_rcu(&h->hnnode);
+			bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
+						  hashsize);
+			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
+		}
+	}
+	old_size = init_net.ct.htable_size;
+	old_hash = init_net.ct.hash;
+
+	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
+	init_net.ct.hash = hash;
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	nf_ct_free_hashtable(old_hash, old_size);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
+
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+		  &nf_conntrack_htable_size, 0600);
+
+void nf_ct_untracked_status_or(unsigned long bits)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(nf_conntrack_untracked, cpu).status |= bits;
+}
+EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
+
+static int nf_conntrack_init_init_net(void)
+{
+	int max_factor = 8;
+	int ret, cpu;
+
+	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
+	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
+	if (!nf_conntrack_htable_size) {
+		nf_conntrack_htable_size
+			= (((totalram_pages << PAGE_SHIFT) / 16384)
+			   / sizeof(struct hlist_head));
+		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
+			nf_conntrack_htable_size = 16384;
+		if (nf_conntrack_htable_size < 32)
+			nf_conntrack_htable_size = 32;
+
+		/* Use a max. factor of four by default to get the same max as
+		 * with the old struct list_heads. When a table size is given
+		 * we use the old value of 8 to avoid reducing the max.
+		 * entries. */
+		max_factor = 4;
+	}
+	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+
+	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
+	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
+	       nf_conntrack_max);
+
+	ret = nf_conntrack_proto_init();
+	if (ret < 0)
+		goto err_proto;
+
+	ret = nf_conntrack_helper_init();
+	if (ret < 0)
+		goto err_helper;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	ret = nf_ct_extend_register(&nf_ct_zone_extend);
+	if (ret < 0)
+		goto err_extend;
+#endif
+	/* Set up fake conntrack: to never be deleted, not in any hashes */
+	for_each_possible_cpu(cpu) {
+		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+		write_pnet(&ct->ct_net, &init_net);
+		atomic_set(&ct->ct_general.use, 1);
+	}
+	/*  - and make it look like a confirmed connection */
+	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+	return 0;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+err_extend:
+	nf_conntrack_helper_fini();
+#endif
+err_helper:
+	nf_conntrack_proto_fini();
+err_proto:
+	return ret;
+}
+
+/*
+ * We need to use special "null" values, not used in hash table
+ */
+#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
+#define DYING_NULLS_VAL		((1<<30)+1)
+
+static int nf_conntrack_init_net(struct net *net)
+{
+	int ret;
+
+	atomic_set(&net->ct.count, 0);
+	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
+	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
+	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+	if (!net->ct.stat) {
+		ret = -ENOMEM;
+		goto err_stat;
+	}
+
+	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+	if (!net->ct.slabname) {
+		ret = -ENOMEM;
+		goto err_slabname;
+	}
+
+	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
+							sizeof(struct nf_conn), 0,
+							SLAB_DESTROY_BY_RCU, NULL);
+	if (!net->ct.nf_conntrack_cachep) {
+		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+		ret = -ENOMEM;
+		goto err_cache;
+	}
+
+	net->ct.htable_size = nf_conntrack_htable_size;
+	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
+	if (!net->ct.hash) {
+		ret = -ENOMEM;
+		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+		goto err_hash;
+	}
+	ret = nf_conntrack_expect_init(net);
+	if (ret < 0)
+		goto err_expect;
+	ret = nf_conntrack_acct_init(net);
+	if (ret < 0)
+		goto err_acct;
+	ret = nf_conntrack_tstamp_init(net);
+	if (ret < 0)
+		goto err_tstamp;
+	ret = nf_conntrack_ecache_init(net);
+	if (ret < 0)
+		goto err_ecache;
+
+	return 0;
+
+err_ecache:
+	nf_conntrack_tstamp_fini(net);
+err_tstamp:
+	nf_conntrack_acct_fini(net);
+err_acct:
+	nf_conntrack_expect_fini(net);
+err_expect:
+	nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+err_hash:
+	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
+	kfree(net->ct.slabname);
+err_slabname:
+	free_percpu(net->ct.stat);
+err_stat:
+	return ret;
+}
+
+s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
+			enum ip_conntrack_dir dir,
+			u32 seq);
+EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
+
+int nf_conntrack_init(struct net *net)
+{
+	int ret;
+
+	if (net_eq(net, &init_net)) {
+		ret = nf_conntrack_init_init_net();
+		if (ret < 0)
+			goto out_init_net;
+	}
+	ret = nf_conntrack_init_net(net);
+	if (ret < 0)
+		goto out_net;
+
+	if (net_eq(net, &init_net)) {
+		/* For use by REJECT target */
+		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
+		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
+
+		/* How to get NAT offsets */
+		rcu_assign_pointer(nf_ct_nat_offset, NULL);
+	}
+	return 0;
+
+out_net:
+	if (net_eq(net, &init_net))
+		nf_conntrack_cleanup_init_net();
+out_init_net:
+	return ret;
+}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
new file mode 100644
index 00000000..63a1b915
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -0,0 +1,266 @@
+/* Event cache for netfilter. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static DEFINE_MUTEX(nf_ct_ecache_mutex);
+
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
+
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
+EXPORT_SYMBOL_GPL(nf_expect_event_cb);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs
+ *
+ * Returns silently when no conntrack event notifier is registered or when
+ * @ct has no event cache extension.  The cached event bits are always
+ * swapped out to zero; they are only handed to the notifier if @ct is
+ * confirmed, not dying, and at least one bit was set. */
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
+{
+	unsigned long events;
+	struct nf_ct_event_notifier *notify;
+	struct nf_conntrack_ecache *e;
+
+	rcu_read_lock();
+	notify = rcu_dereference(nf_conntrack_event_cb);
+	if (notify == NULL)
+		goto out_unlock;
+
+	e = nf_ct_ecache_find(ct);
+	if (e == NULL)
+		goto out_unlock;
+
+	/* Fetch the pending bits and reset the cache in a single step. */
+	events = xchg(&e->cache, 0);
+
+	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) {
+		struct nf_ct_event item = {
+			.ct	= ct,
+			.pid	= 0,
+			.report	= 0
+		};
+		int ret;
+		/* We make a copy of the missed event cache without taking
+		 * the lock, thus we may send missed events twice. However,
+		 * this does not harm and it happens very rarely. */
+		unsigned long missed = e->missed;
+
+		/* None of the pending bits is selected by e->ctmask. */
+		if (!((events | missed) & e->ctmask))
+			goto out_unlock;
+
+		ret = notify->fcn(events | missed, &item);
+		if (unlikely(ret < 0 || missed)) {
+			/* Failure: remember the bits just attempted.
+			 * Success: clear the missed bits that were sent. */
+			spin_lock_bh(&ct->lock);
+			if (ret < 0)
+				e->missed |= events;
+			else
+				e->missed &= ~missed;
+			spin_unlock_bh(&ct->lock);
+		} 
+	}
+
+out_unlock:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+
+/*
+ * Install @new as the (single) conntrack event notifier.
+ * Returns 0 on success or -EBUSY when a notifier is already installed.
+ */
+int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
+{
+	struct nf_ct_event_notifier *cur;
+	int ret = -EBUSY;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+	cur = rcu_dereference_protected(nf_conntrack_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	if (cur == NULL) {
+		rcu_assign_pointer(nf_conntrack_event_cb, new);
+		ret = 0;
+	}
+	mutex_unlock(&nf_ct_ecache_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
+
+/*
+ * Remove the conntrack event notifier.  @new must be the notifier that is
+ * currently installed; anything else trips the BUG_ON.
+ */
+void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
+{
+	struct nf_ct_event_notifier *cur;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+
+	cur = rcu_dereference_protected(nf_conntrack_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	BUG_ON(cur != new);
+	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
+
+	mutex_unlock(&nf_ct_ecache_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
+
+/*
+ * Install @new as the (single) expectation event notifier.
+ * Returns 0 on success or -EBUSY when a notifier is already installed.
+ */
+int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
+{
+	struct nf_exp_event_notifier *cur;
+	int ret = -EBUSY;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+	cur = rcu_dereference_protected(nf_expect_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	if (cur == NULL) {
+		rcu_assign_pointer(nf_expect_event_cb, new);
+		ret = 0;
+	}
+	mutex_unlock(&nf_ct_ecache_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
+
+/*
+ * Remove the expectation event notifier.  @new must be the notifier that
+ * is currently installed; anything else trips the BUG_ON.
+ */
+void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
+{
+	struct nf_exp_event_notifier *cur;
+
+	mutex_lock(&nf_ct_ecache_mutex);
+
+	cur = rcu_dereference_protected(nf_expect_event_cb,
+					lockdep_is_held(&nf_ct_ecache_mutex));
+	BUG_ON(cur != new);
+	rcu_assign_pointer(nf_expect_event_cb, NULL);
+
+	mutex_unlock(&nf_ct_ecache_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+
+/* Initial values copied into each namespace by nf_conntrack_ecache_init(). */
+#define NF_CT_EVENTS_DEFAULT 1
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+/* Kept in jiffies (15 seconds); the sysctl uses proc_dointvec_jiffies. */
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Template sysctl table.  nf_conntrack_event_init_sysctl() duplicates it
+ * for each namespace and repoints both .data members at that namespace's
+ * fields, so the init_net addresses below are only the template values.
+ */
+static struct ctl_table event_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_events",
+		.data		= &init_net.ct.sysctl_events,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "nf_conntrack_events_retry_timeout",
+		.data		= &init_net.ct.sysctl_events_retry_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{}
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Conntrack extension descriptor for the event cache: size and alignment
+ * of struct nf_conntrack_ecache under the NF_CT_EXT_ECACHE id.  It is
+ * registered and unregistered together with the initial namespace.
+ */
+static struct nf_ct_ext_type event_extend __read_mostly = {
+	.len	= sizeof(struct nf_conntrack_ecache),
+	.align	= __alignof__(struct nf_conntrack_ecache),
+	.id	= NF_CT_EXT_ECACHE,
+};
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the event sysctls for @net using a private copy of the template
+ * table whose .data pointers refer to @net's own fields.
+ * Returns 0 on success, -ENOMEM if the copy or the registration fails.
+ */
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
+		      GFP_KERNEL);
+	if (tbl == NULL)
+		return -ENOMEM;
+
+	tbl[0].data = &net->ct.sysctl_events;
+	tbl[1].data = &net->ct.sysctl_events_retry_timeout;
+
+	net->ct.event_sysctl_header =
+		register_net_sysctl_table(net,
+					  nf_net_netfilter_sysctl_path, tbl);
+	if (net->ct.event_sysctl_header != NULL)
+		return 0;
+
+	printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+	kfree(tbl);
+	return -ENOMEM;
+}
+
+/*
+ * Unregister @net's event sysctls and free the table copy that
+ * nf_conntrack_event_init_sysctl() allocated.
+ */
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	/* Fetch the table pointer first; the header is unregistered next. */
+	table = net->ct.event_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.event_sysctl_header);
+	kfree(table);
+}
+#else
+/* Without CONFIG_SYSCTL there is nothing to register: always succeeds. */
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+/* Without CONFIG_SYSCTL there is nothing to tear down. */
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Per-namespace setup of the event cache: seed the sysctl values from the
+ * module defaults, register the event extension (initial namespace only)
+ * and create the sysctl entries.  Returns 0 or a negative error, undoing
+ * the extension registration if the sysctl step fails.
+ */
+int nf_conntrack_ecache_init(struct net *net)
+{
+	const int first_net = net_eq(net, &init_net);
+	int ret;
+
+	net->ct.sysctl_events = nf_ct_events;
+	net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
+
+	if (first_net) {
+		ret = nf_ct_extend_register(&event_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_ct_event: Unable to register "
+					"event extension.\n");
+			return ret;
+		}
+	}
+
+	ret = nf_conntrack_event_init_sysctl(net);
+	if (ret >= 0)
+		return 0;
+
+	if (first_net)
+		nf_ct_extend_unregister(&event_extend);
+	return ret;
+}
+
+/*
+ * Undo nf_conntrack_ecache_init(): remove @net's sysctls and, for the
+ * initial namespace, unregister the event extension.
+ */
+void nf_conntrack_ecache_fini(struct net *net)
+{
+	nf_conntrack_event_fini_sysctl(net);
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&event_extend);
+}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
new file mode 100644
index 00000000..cd1e8e09
--- /dev/null
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -0,0 +1,677 @@
+/* Expectation handling for nf_conntrack. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <net/net_namespace.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/* Number of buckets in each namespace's expectation hash table; settable
+ * through the expect_hashsize module parameter, otherwise derived in
+ * nf_conntrack_expect_init(). */
+unsigned int nf_ct_expect_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+
+/* Per-namespace cap on expect_count, set to 4 * nf_ct_expect_hsize. */
+unsigned int nf_ct_expect_max __read_mostly;
+
+/* Slab cache for struct nf_conntrack_expect, created with init_net. */
+static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+
+/* NF_CT_EXPECT_USERSPACE expectations whose master has no help extension
+ * are linked here (see nf_ct_expect_insert()). */
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
+/* nf_conntrack_expect helper functions */
+
+/*
+ * Detach @exp from the expectation hash and from the list it is linked on,
+ * report an IPEXP_DESTROY event and drop one reference.
+ *
+ * The expectation timer must not be pending (asserted below).  @pid and
+ * @report are passed through unchanged to the event report.
+ */
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+				u32 pid, int report)
+{
+	struct nf_conn_help *master_help = nfct_help(exp->master);
+	struct net *net = nf_ct_exp_net(exp);
+
+	NF_CT_ASSERT(!timer_pending(&exp->timeout));
+
+	hlist_del_rcu(&exp->hnode);
+	net->ct.expect_count--;
+
+	hlist_del(&exp->lnode);
+	/* Only expectations without NF_CT_EXPECT_USERSPACE adjust the
+	 * master's per-class counter; master_help is not touched for the
+	 * userspace ones. */
+	if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+		master_help->expecting[exp->class]--;
+
+	/* Report while we still hold the reference dropped just below. */
+	nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
+	nf_ct_expect_put(exp);
+
+	NF_CT_STAT_INC(net, expect_delete);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
+
+/*
+ * Timer callback for an expectation that was not matched in time: unlink
+ * it under nf_conntrack_lock, then drop one more reference outside the
+ * lock.  @ul_expect is the expectation pointer passed as the timer data.
+ */
+static void nf_ct_expectation_timed_out(unsigned long ul_expect)
+{
+	struct nf_conntrack_expect *expired;
+
+	expired = (struct nf_conntrack_expect *)ul_expect;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	nf_ct_unlink_expect(expired);
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	nf_ct_expect_put(expired);
+}
+
+/*
+ * Map the destination part of @tuple (address words, protocol numbers and
+ * destination port) to a bucket index below nf_ct_expect_hsize.
+ */
+static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+{
+	unsigned int seed, hash;
+
+	/* Make sure nf_conntrack_hash_rnd has been set up before use. */
+	if (unlikely(!nf_conntrack_hash_rnd))
+		init_nf_conntrack_hash_rnd();
+
+	seed = (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
+		(__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd;
+	hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), seed);
+
+	/* Scale the 32-bit hash down to the table size without a modulo. */
+	return ((u64)hash * nf_ct_expect_hsize) >> 32;
+}
+
+/*
+ * Look up the first expectation in @net whose masked tuple matches @tuple
+ * and whose master belongs to @zone.  No reference is taken; the bucket is
+ * walked with the RCU list iterator.  Returns NULL when nothing matches.
+ */
+struct nf_conntrack_expect *
+__nf_ct_expect_find(struct net *net, u16 zone,
+		    const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *node;
+	unsigned int bucket;
+
+	/* Skip the hash computation when the namespace has no entries. */
+	if (net->ct.expect_count == 0)
+		return NULL;
+
+	bucket = nf_ct_expect_dst_hash(tuple);
+	hlist_for_each_entry_rcu(exp, node, &net->ct.expect_hash[bucket], hnode) {
+		if (!nf_ct_tuple_mask_cmp(tuple, &exp->tuple, &exp->mask))
+			continue;
+		if (nf_ct_zone(exp->master) != zone)
+			continue;
+		return exp;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
+
+/* Find an expectation corresponding to a tuple and take a reference on it.
+ * Returns NULL if there is no match or the match is already being freed. */
+struct nf_conntrack_expect *
+nf_ct_expect_find_get(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *found;
+
+	rcu_read_lock();
+	found = __nf_ct_expect_find(net, zone, tuple);
+	if (found != NULL) {
+		/* Use count already hit zero: treat it as not found. */
+		if (!atomic_inc_not_zero(&found->use))
+			found = NULL;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
+
+/* If an expectation for this connection is found, it gets deleted from
+ * the global list and is then returned.
+ *
+ * Entries flagged NF_CT_EXPECT_INACTIVE are ignored.  A permanent
+ * expectation stays in the table and is returned with an extra reference.
+ * Any other expectation is only returned if its timer could be stopped
+ * here; it is then unlinked before being returned.  Returns NULL if
+ * nothing usable is found. */
+struct nf_conntrack_expect *
+nf_ct_find_expectation(struct net *net, u16 zone,
+		       const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_expect *i, *exp = NULL;
+	struct hlist_node *n;
+	unsigned int h;
+
+	if (!net->ct.expect_count)
+		return NULL;
+
+	h = nf_ct_expect_dst_hash(tuple);
+	hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
+		if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
+		    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+		    nf_ct_zone(i->master) == zone) {
+			exp = i;
+			break;
+		}
+	}
+	if (!exp)
+		return NULL;
+
+	/* If master is not in hash table yet (ie. packet hasn't left
+	   this machine yet), how can other end know about expected?
+	   Hence these are not the droids you are looking for (if
+	   master ct never got confirmed, we'd hold a reference to it
+	   and weird things would happen to future packets). */
+	if (!nf_ct_is_confirmed(exp->master))
+		return NULL;
+
+	if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+		atomic_inc(&exp->use);
+		return exp;
+	} else if (del_timer(&exp->timeout)) {
+		/* Timer stopped here, so its callback will not unlink it. */
+		nf_ct_unlink_expect(exp);
+		return exp;
+	}
+
+	/* Timer was not pending: its callback handles the entry instead. */
+	return NULL;
+}
+
+/* Delete all expectations owned by this conntrack.  Entries whose timer is
+ * no longer pending are left to the timer callback. */
+void nf_ct_remove_expectations(struct nf_conn *ct)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *node, *tmp;
+
+	/* Optimization: most connection never expect any others. */
+	if (help == NULL)
+		return;
+
+	hlist_for_each_entry_safe(exp, node, tmp, &help->expectations, lnode) {
+		if (!del_timer(&exp->timeout))
+			continue;
+		nf_ct_unlink_expect(exp);
+		nf_ct_expect_put(exp);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
+
+/* Would two expected things clash?
+ *
+ * The two tuples are compared under the intersection of both masks; if
+ * they are equal on that common part, the expectations clash. */
+static inline int expect_clash(const struct nf_conntrack_expect *a,
+			       const struct nf_conntrack_expect *b)
+{
+	struct nf_conntrack_tuple_mask common;
+	int word;
+
+	for (word = 0; word < NF_CT_TUPLE_L3SIZE; word++)
+		common.src.u3.all[word] =
+			a->mask.src.u3.all[word] & b->mask.src.u3.all[word];
+	common.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
+
+	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &common);
+}
+
+/* Are @a and @b the same expectation?  They must share master, class,
+ * tuple, mask and zone.  Returns 1 if so, 0 otherwise. */
+static inline int expect_matches(const struct nf_conntrack_expect *a,
+				 const struct nf_conntrack_expect *b)
+{
+	if (a->master != b->master || a->class != b->class)
+		return 0;
+	if (!nf_ct_tuple_equal(&a->tuple, &b->tuple))
+		return 0;
+	if (!nf_ct_tuple_mask_equal(&a->mask, &b->mask))
+		return 0;
+	return nf_ct_zone(a->master) == nf_ct_zone(b->master);
+}
+
+/* Generally a bad idea to call this: could have matched already.
+ *
+ * Withdraws @exp if its timer is still pending; otherwise does nothing. */
+void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
+{
+	int timer_was_pending;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	timer_was_pending = del_timer(&exp->timeout);
+	if (timer_was_pending) {
+		nf_ct_unlink_expect(exp);
+		nf_ct_expect_put(exp);
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
+
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself.
+ *
+ * Allocates (GFP_ATOMIC) an expectation owned by @me with a use count of
+ * one.  Only ->master and ->use are set here.  Returns NULL on failure. */
+struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
+{
+	struct nf_conntrack_expect *exp;
+
+	exp = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
+	if (exp != NULL) {
+		exp->master = me;
+		atomic_set(&exp->use, 1);
+	}
+	return exp;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);
+
+/*
+ * Fill in the tuple and mask of @exp.
+ *
+ * @saddr and @src may be NULL, which yields an all-zero value and an
+ * all-zero mask for the source address or source port respectively.
+ * @daddr and @dst are dereferenced unconditionally.  Flags, expectfn and
+ * helper are reset.
+ */
+void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
+		       u_int8_t family,
+		       const union nf_inet_addr *saddr,
+		       const union nf_inet_addr *daddr,
+		       u_int8_t proto, const __be16 *src, const __be16 *dst)
+{
+	/* Address bytes in use: 4 for AF_INET, 16 for anything else. */
+	const int len = (family == AF_INET) ? 4 : 16;
+
+	exp->flags = 0;
+	exp->class = class;
+	exp->expectfn = NULL;
+	exp->helper = NULL;
+	exp->tuple.src.l3num = family;
+	exp->tuple.dst.protonum = proto;
+
+	/* Source address: match exactly if given, otherwise match any. */
+	if (!saddr) {
+		memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
+		memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
+	} else {
+		memcpy(&exp->tuple.src.u3, saddr, len);
+		/* address needs to be cleared for nf_ct_tuple_equal */
+		if (sizeof(exp->tuple.src.u3) > len)
+			memset((void *)&exp->tuple.src.u3 + len, 0x00,
+			       sizeof(exp->tuple.src.u3) - len);
+		memset(&exp->mask.src.u3, 0xFF, len);
+		if (sizeof(exp->mask.src.u3) > len)
+			memset((void *)&exp->mask.src.u3 + len, 0x00,
+			       sizeof(exp->mask.src.u3) - len);
+	}
+
+	/* Source port: same rule as for the source address. */
+	if (!src) {
+		exp->tuple.src.u.all = 0;
+		exp->mask.src.u.all = 0;
+	} else {
+		exp->tuple.src.u.all = *src;
+		exp->mask.src.u.all = htons(0xFFFF);
+	}
+
+	/* The destination is always fully specified. */
+	memcpy(&exp->tuple.dst.u3, daddr, len);
+	/* address needs to be cleared for nf_ct_tuple_equal */
+	if (sizeof(exp->tuple.dst.u3) > len)
+		memset((void *)&exp->tuple.dst.u3 + len, 0x00,
+		       sizeof(exp->tuple.dst.u3) - len);
+
+	exp->tuple.dst.u.all = *dst;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_init);
+
+/* RCU callback queued by nf_ct_expect_put(): hand the expectation that
+ * embeds @head back to the slab cache. */
+static void nf_ct_expect_free_rcu(struct rcu_head *head)
+{
+	struct nf_conntrack_expect *exp;
+
+	exp = container_of(head, struct nf_conntrack_expect, rcu);
+	kmem_cache_free(nf_ct_expect_cachep, exp);
+}
+
+/* Drop one reference on @exp.  When the count reaches zero the memory is
+ * released through call_rcu(), i.e. only after a grace period. */
+void nf_ct_expect_put(struct nf_conntrack_expect *exp)
+{
+	if (atomic_dec_and_test(&exp->use))
+		call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_put);
+
+/*
+ * Link @exp into the expectation hash of its namespace and onto its
+ * owner's list, then arm its timeout timer.  The lockdep annotations
+ * below expect nf_conntrack_lock to be held.
+ */
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+{
+	struct nf_conn_help *master_help = nfct_help(exp->master);
+	struct net *net = nf_ct_exp_net(exp);
+	const struct nf_conntrack_expect_policy *p;
+	unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
+
+	/* two references : one for hash insert, one for the timer */
+	atomic_add(2, &exp->use);
+
+	/* Owner list: the master's help extension if it has one, else the
+	 * global list for userspace-created expectations. */
+	if (master_help) {
+		hlist_add_head(&exp->lnode, &master_help->expectations);
+		master_help->expecting[exp->class]++;
+	} else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+		hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
+
+	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
+	net->ct.expect_count++;
+
+	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
+		    (unsigned long)exp);
+	/* The expiry is only computed here when there is a help extension.
+	 * NOTE(review): without one, timeout.expires is presumably preset
+	 * by the caller - confirm. */
+	if (master_help) {
+		p = &rcu_dereference_protected(
+				master_help->helper,
+				lockdep_is_held(&nf_conntrack_lock)
+				)->expect_policy[exp->class];
+		exp->timeout.expires = jiffies + p->timeout * HZ;
+	}
+	add_timer(&exp->timeout);
+
+	NF_CT_STAT_INC(net, expect_create);
+}
+
+/* Race with expectations being used means we could have none to find; OK.
+ *
+ * Drops the last entry on @master's expectation list that has the same
+ * class as @new.  New entries are added at the list head, so the last
+ * match is the oldest one. */
+static void evict_oldest_expect(struct nf_conn *master,
+				struct nf_conntrack_expect *new)
+{
+	struct nf_conn_help *help = nfct_help(master);
+	struct nf_conntrack_expect *cur, *victim = NULL;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(cur, node, &help->expectations, lnode) {
+		if (cur->class != new->class)
+			continue;
+		victim = cur;
+	}
+
+	if (victim == NULL)
+		return;
+	if (!del_timer(&victim->timeout))
+		return;
+
+	nf_ct_unlink_expect(victim);
+	nf_ct_expect_put(victim);
+}
+
+/*
+ * Restart the timeout of expectation @i using the helper policy of its
+ * class.  Returns 1 if the timer was re-armed, 0 if it was no longer
+ * pending (the expectation is on its way out).
+ *
+ * NOTE(review): master_help is dereferenced without a NULL check, while
+ * nf_ct_expect_insert() allows expectations whose master has no help
+ * extension - confirm such entries can never reach this function.
+ */
+static inline int refresh_timer(struct nf_conntrack_expect *i)
+{
+	struct nf_conn_help *master_help = nfct_help(i->master);
+	const struct nf_conntrack_expect_policy *p;
+
+	if (!del_timer(&i->timeout))
+		return 0;
+
+	p = &rcu_dereference_protected(
+		master_help->helper,
+		lockdep_is_held(&nf_conntrack_lock)
+		)->expect_policy[i->class];
+	i->timeout.expires = jiffies + p->timeout * HZ;
+	add_timer(&i->timeout);
+	return 1;
+}
+
+/*
+ * Decide whether @expect may be inserted.
+ *
+ * Returns 1 if it should be inserted, 0 if an identical expectation
+ * already exists and had its timer refreshed, or a negative error:
+ * -ESHUTDOWN (kernel-side expectation without a helper), -EBUSY (clashes
+ * with an existing one) or -EMFILE (a per-class or global limit is hit).
+ */
+static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
+{
+	const struct nf_conntrack_expect_policy *policy;
+	struct nf_conn *master = expect->master;
+	struct nf_conn_help *master_help = nfct_help(master);
+	struct net *net = nf_ct_exp_net(expect);
+	struct nf_conntrack_expect *cur;
+	struct hlist_node *node;
+	unsigned int bucket;
+
+	/* Don't allow expectations created from kernel-space with no helper */
+	if (!(expect->flags & NF_CT_EXPECT_USERSPACE)) {
+		if (!master_help || !master_help->helper)
+			return -ESHUTDOWN;
+	}
+
+	bucket = nf_ct_expect_dst_hash(&expect->tuple);
+	hlist_for_each_entry(cur, node, &net->ct.expect_hash[bucket], hnode) {
+		if (expect_matches(cur, expect)) {
+			/* Refresh timer: if it's dying, ignore.. */
+			if (refresh_timer(cur))
+				return 0;
+			continue;
+		}
+		if (expect_clash(cur, expect))
+			return -EBUSY;
+	}
+
+	/* Will be over limit? */
+	if (master_help) {
+		policy = &rcu_dereference_protected(
+			master_help->helper,
+			lockdep_is_held(&nf_conntrack_lock)
+			)->expect_policy[expect->class];
+		if (policy->max_expected &&
+		    master_help->expecting[expect->class] >= policy->max_expected) {
+			/* Try to make room, then re-check the counter. */
+			evict_oldest_expect(master, expect);
+			if (master_help->expecting[expect->class]
+						>= policy->max_expected)
+				return -EMFILE;
+		}
+	}
+
+	if (net->ct.expect_count < nf_ct_expect_max)
+		return 1;
+
+	if (net_ratelimit())
+		printk(KERN_WARNING
+		       "nf_conntrack: expectation table full\n");
+	return -EMFILE;
+}
+
+/*
+ * Register @expect.  The admission check and the insertion run under
+ * nf_conntrack_lock; the IPEXP_NEW event is reported after the lock is
+ * dropped and only if the expectation was actually inserted.
+ * Returns 0 on success (including when an existing identical expectation
+ * was refreshed instead) or a negative error from the check.
+ */
+int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
+				u32 pid, int report)
+{
+	int ret;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	ret = __nf_ct_expect_check(expect);
+	if (ret > 0)
+		nf_ct_expect_insert(expect);
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	if (ret <= 0)
+		return ret;
+
+	nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
+
+/* Remove every expectation on the global userspace list whose timer is
+ * still pending; the others are left to their timer callback. */
+void nf_ct_remove_userspace_expectations(void)
+{
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *node, *tmp;
+
+	hlist_for_each_entry_safe(exp, node, tmp,
+				  &nf_ct_userspace_expect_list, lnode) {
+		if (!del_timer(&exp->timeout))
+			continue;
+		nf_ct_unlink_expect(exp);
+		nf_ct_expect_put(exp);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
+#ifdef CONFIG_PROC_FS
+/* Per-open iterator state for the nf_conntrack_expect seq_file. */
+struct ct_expect_iter_state {
+	struct seq_net_private p;
+	unsigned int bucket;	/* hash bucket currently being walked */
+};
+
+/* Return the first node of the first non-empty bucket, leaving st->bucket
+ * at that bucket.  Returns NULL if the whole table is empty. */
+static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
+{
+	struct net *net = seq_file_net(seq);
+	struct ct_expect_iter_state *st = seq->private;
+	struct hlist_node *first;
+
+	st->bucket = 0;
+	while (st->bucket < nf_ct_expect_hsize) {
+		first = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+		if (first != NULL)
+			return first;
+		st->bucket++;
+	}
+	return NULL;
+}
+
+/* Return the node following @head, moving on to later buckets when the
+ * current chain ends.  Returns NULL after the last bucket. */
+static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
+					     struct hlist_node *head)
+{
+	struct net *net = seq_file_net(seq);
+	struct ct_expect_iter_state *st = seq->private;
+	struct hlist_node *node;
+
+	node = rcu_dereference(hlist_next_rcu(head));
+	while (node == NULL) {
+		st->bucket++;
+		if (st->bucket >= nf_ct_expect_hsize)
+			return NULL;
+		node = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+	}
+	return node;
+}
+
+/* Return the node at position @pos (0-based) of the whole table, or NULL
+ * if the table holds fewer entries than that. */
+static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node;
+
+	/* Step forward until @pos is used up or the table runs out; in the
+	 * latter case node is NULL, which is exactly what is returned. */
+	for (node = ct_expect_get_first(seq); node != NULL && pos != 0; pos--)
+		node = ct_expect_get_next(seq, node);
+
+	return node;
+}
+
+/* seq_file ->start: enter the RCU read side (left again in exp_seq_stop)
+ * and position the iterator at entry *pos. */
+static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return ct_expect_get_idx(seq, *pos);
+}
+
+/* seq_file ->next: bump the position and return the node after @v. */
+static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return ct_expect_get_next(seq, v);
+}
+
+/* seq_file ->stop: leave the RCU read side entered in exp_seq_start(). */
+static void exp_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show: print one expectation.
+ *
+ * Output order: remaining timeout in seconds (or "- " when no timer
+ * function is set), layer 3/4 protocol numbers, the tuple, the flag names
+ * and, if the master has a helper, the helper name and the policy name.
+ *
+ * The master's help extension may be absent: nf_ct_expect_insert() puts
+ * NF_CT_EXPECT_USERSPACE expectations without one into the very hash
+ * that is walked here.  It is therefore checked before the helper is
+ * looked up instead of being dereferenced blindly.
+ */
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+	struct nf_conntrack_expect *expect;
+	struct nf_conntrack_helper *helper = NULL;
+	struct nf_conn_help *help;
+	struct hlist_node *n = v;
+	char *delim = "";
+
+	expect = hlist_entry(n, struct nf_conntrack_expect, hnode);
+
+	if (expect->timeout.function)
+		seq_printf(s, "%ld ", timer_pending(&expect->timeout)
+			   ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
+	else
+		seq_printf(s, "- ");
+	seq_printf(s, "l3proto = %u proto=%u ",
+		   expect->tuple.src.l3num,
+		   expect->tuple.dst.protonum);
+	print_tuple(s, &expect->tuple,
+		    __nf_ct_l3proto_find(expect->tuple.src.l3num),
+		    __nf_ct_l4proto_find(expect->tuple.src.l3num,
+				       expect->tuple.dst.protonum));
+
+	/* Flag names are comma separated. */
+	if (expect->flags & NF_CT_EXPECT_PERMANENT) {
+		seq_printf(s, "PERMANENT");
+		delim = ",";
+	}
+	if (expect->flags & NF_CT_EXPECT_INACTIVE) {
+		seq_printf(s, "%sINACTIVE", delim);
+		delim = ",";
+	}
+	if (expect->flags & NF_CT_EXPECT_USERSPACE)
+		seq_printf(s, "%sUSERSPACE", delim);
+
+	/* No help extension means no helper to print. */
+	help = nfct_help(expect->master);
+	if (help)
+		helper = rcu_dereference(help->helper);
+	if (helper) {
+		seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
+		if (helper->expect_policy[expect->class].name)
+			seq_printf(s, "/%s",
+				   helper->expect_policy[expect->class].name);
+	}
+
+	return seq_putc(s, '\n');
+}
+
+/* Iterator callbacks behind the nf_conntrack_expect proc file. */
+static const struct seq_operations exp_seq_ops = {
+	.start = exp_seq_start,
+	.next = exp_seq_next,
+	.stop = exp_seq_stop,
+	.show = exp_seq_show
+};
+
+/* ->open: set up a namespace-aware seq_file with room for the iterator
+ * state (struct ct_expect_iter_state) as its private data. */
+static int exp_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &exp_seq_ops,
+			sizeof(struct ct_expect_iter_state));
+}
+
+/* File operations of the nf_conntrack_expect proc entry: exp_open() plus
+ * the generic seq_file read, seek and release helpers. */
+static const struct file_operations exp_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = exp_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+#endif /* CONFIG_PROC_FS */
+
+/* Create the nf_conntrack_expect proc entry (mode 0440) for @net.
+ * Returns 0 on success or when CONFIG_PROC_FS is off, else -ENOMEM. */
+static int exp_proc_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	if (proc_net_fops_create(net, "nf_conntrack_expect", 0440,
+				 &exp_file_ops) == NULL)
+		return -ENOMEM;
+#endif /* CONFIG_PROC_FS */
+	return 0;
+}
+
+/* Remove the proc entry created by exp_proc_init(); a no-op without
+ * CONFIG_PROC_FS. */
+static void exp_proc_remove(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	proc_net_remove(net, "nf_conntrack_expect");
+#endif /* CONFIG_PROC_FS */
+}
+
+/* Expose nf_ct_expect_hsize as the module parameter "expect_hashsize"
+ * (permission 0400). */
+module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
+
+/*
+ * Per-namespace setup of expectation tracking.
+ *
+ * With the initial namespace the global table size, the global limit and
+ * the slab cache are set up as well.  Every namespace gets its own hash
+ * table and proc entry.  Returns 0 on success or a negative error, with
+ * everything set up by this call released again.
+ */
+int nf_conntrack_expect_init(struct net *net)
+{
+	const int first_net = net_eq(net, &init_net);
+	int err;
+
+	if (first_net) {
+		/* No size given as parameter: derive it, but at least 1. */
+		if (nf_ct_expect_hsize == 0) {
+			nf_ct_expect_hsize = net->ct.htable_size / 256;
+			if (nf_ct_expect_hsize == 0)
+				nf_ct_expect_hsize = 1;
+		}
+		nf_ct_expect_max = nf_ct_expect_hsize * 4;
+	}
+
+	net->ct.expect_count = 0;
+	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
+	if (net->ct.expect_hash == NULL)
+		return -ENOMEM;
+
+	if (first_net) {
+		nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+					sizeof(struct nf_conntrack_expect),
+					0, 0, NULL);
+		if (!nf_ct_expect_cachep) {
+			err = -ENOMEM;
+			goto free_hash;
+		}
+	}
+
+	err = exp_proc_init(net);
+	if (err < 0)
+		goto destroy_cache;
+
+	return 0;
+
+destroy_cache:
+	if (first_net)
+		kmem_cache_destroy(nf_ct_expect_cachep);
+free_hash:
+	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+	return err;
+}
+
+/*
+ * Undo nf_conntrack_expect_init() for @net: remove the proc entry, destroy
+ * the slab cache (initial namespace only) and free the hash table.
+ */
+void nf_conntrack_expect_fini(struct net *net)
+{
+	exp_proc_remove(net);
+	if (net_eq(net, &init_net)) {
+		rcu_barrier(); /* Wait for call_rcu() before destroy */
+		kmem_cache_destroy(nf_ct_expect_cachep);
+	}
+	nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
+}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
new file mode 100644
index 00000000..05ecdc28
--- /dev/null
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -0,0 +1,189 @@
+/* Structure dynamic extension infrastructure
+ * Copyright (C) 2004 Rusty Russell IBM Corporation
+ * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org>
+ * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+/* Registered extension types indexed by id.  Readers use RCU; writers
+ * hold nf_ct_ext_type_mutex. */
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
+static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+
+/*
+ * Call the ->destroy() callback of every extension type that is present in
+ * @ct's extension area.  Types that are no longer registered, or that have
+ * no destroy callback, are skipped.
+ */
+void __nf_ct_ext_destroy(struct nf_conn *ct)
+{
+	unsigned int i;
+	struct nf_ct_ext_type *t;
+	struct nf_ct_ext *ext = ct->ext;
+
+	for (i = 0; i < NF_CT_EXT_NUM; i++) {
+		if (!__nf_ct_ext_exist(ext, i))
+			continue;
+
+		rcu_read_lock();
+		t = rcu_dereference(nf_ct_ext_types[i]);
+
+		/* Here the nf_ct_ext_type might have been unregistered.
+		 * I.e., it is responsible for cleaning up its private
+		 * area in all conntracks when it is unregistered.
+		 */
+		if (t && t->destroy)
+			t->destroy(ct);
+		rcu_read_unlock();
+	}
+}
+EXPORT_SYMBOL(__nf_ct_ext_destroy);
+
+/*
+ * Allocate a fresh, zeroed extension area holding extension @id and store
+ * it in *@ext.  The type for @id must be registered (BUG otherwise).
+ * Returns a pointer to the extension's data inside the area, or NULL if
+ * the allocation fails.
+ */
+static void *
+nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
+{
+	unsigned int off, len;
+	struct nf_ct_ext_type *t;
+	size_t alloc_size;
+
+	/* Copy what is needed from the type while under RCU protection. */
+	rcu_read_lock();
+	t = rcu_dereference(nf_ct_ext_types[id]);
+	BUG_ON(t == NULL);
+	off = ALIGN(sizeof(struct nf_ct_ext), t->align);
+	len = off + t->len;
+	alloc_size = t->alloc_size;
+	rcu_read_unlock();
+
+	/* The allocation uses the type's alloc_size, which may exceed the
+	 * length recorded as in use below (see update_alloc_size()). */
+	*ext = kzalloc(alloc_size, gfp);
+	if (!*ext)
+		return NULL;
+
+	(*ext)->offset[id] = off;
+	(*ext)->len = len;
+
+	return (void *)(*ext) + off;
+}
+
+/*
+ * Add extension @id to @ct and return a pointer to its zeroed data.
+ *
+ * Returns NULL if @ct already has this extension or if memory cannot be
+ * obtained.  The type for @id must be registered (BUG otherwise).  If the
+ * area has to move, each present extension's ->move() callback is invoked
+ * and the old area is released with kfree_rcu().
+ */
+void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
+{
+	struct nf_ct_ext *old, *new;
+	int i, newlen, newoff;
+	struct nf_ct_ext_type *t;
+
+	/* Conntrack must not be confirmed to avoid races on reallocation. */
+	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+
+	/* First extension on this conntrack: allocate a new area. */
+	old = ct->ext;
+	if (!old)
+		return nf_ct_ext_create(&ct->ext, id, gfp);
+
+	if (__nf_ct_ext_exist(old, id))
+		return NULL;
+
+	rcu_read_lock();
+	t = rcu_dereference(nf_ct_ext_types[id]);
+	BUG_ON(t == NULL);
+
+	/* The new extension goes behind the data currently in use. */
+	newoff = ALIGN(old->len, t->align);
+	newlen = newoff + t->len;
+	rcu_read_unlock();
+
+	new = __krealloc(old, newlen, gfp);
+	if (!new)
+		return NULL;
+
+	if (new != old) {
+		/* The area moved: give each present extension the chance to
+		 * fix itself up, then retire the old area after a grace
+		 * period. */
+		for (i = 0; i < NF_CT_EXT_NUM; i++) {
+			if (!__nf_ct_ext_exist(old, i))
+				continue;
+
+			rcu_read_lock();
+			t = rcu_dereference(nf_ct_ext_types[i]);
+			if (t && t->move)
+				t->move((void *)new + new->offset[i],
+					(void *)old + old->offset[i]);
+			rcu_read_unlock();
+		}
+		kfree_rcu(old, rcu);
+		ct->ext = new;
+	}
+
+	new->offset[id] = newoff;
+	new->len = newlen;
+	memset((void *)new + newoff, 0, newlen - newoff);
+	return (void *)new + newoff;
+}
+EXPORT_SYMBOL(__nf_ct_ext_add);
+
+/*
+ * Recompute ->alloc_size of the registered extension types after @type was
+ * registered or unregistered.  Called with nf_ct_ext_type_mutex held (see
+ * the lockdep annotations).
+ *
+ * A type's alloc_size is its own aligned size plus the aligned size of
+ * every other registered type that has NF_CT_EXT_F_PREALLOC set.  If @type
+ * itself lacks that flag, only the slot for its own id is recomputed.
+ */
+static void update_alloc_size(struct nf_ct_ext_type *type)
+{
+	int i, j;
+	struct nf_ct_ext_type *t1, *t2;
+	enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1;
+
+	/* unnecessary to update all types */
+	if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) {
+		min = type->id;
+		max = type->id;
+	}
+
+	/* This assumes that extended areas in conntrack for the types
+	   whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
+	for (i = min; i <= max; i++) {
+		t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+				lockdep_is_held(&nf_ct_ext_type_mutex));
+		if (!t1)
+			continue;
+
+		/* Start from the space needed by t1 alone ... */
+		t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
+				 t1->len;
+		/* ... and add room for every other preallocating type. */
+		for (j = 0; j < NF_CT_EXT_NUM; j++) {
+			t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+				lockdep_is_held(&nf_ct_ext_type_mutex));
+			if (t2 == NULL || t2 == t1 ||
+			    (t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
+				continue;
+
+			t1->alloc_size = ALIGN(t1->alloc_size, t2->align)
+					 + t2->len;
+		}
+	}
+}
+
+/* This MUST be called in process context.
+ *
+ * Registers extension type @type under its id.  Returns 0 on success or
+ * -EBUSY if that id is already taken. */
+int nf_ct_extend_register(struct nf_ct_ext_type *type)
+{
+	int ret = -EBUSY;
+
+	mutex_lock(&nf_ct_ext_type_mutex);
+	if (!nf_ct_ext_types[type->id]) {
+		/* This ensures that nf_ct_ext_create() can allocate enough
+		   area before updating alloc_size */
+		type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align)
+				   + type->len;
+		rcu_assign_pointer(nf_ct_ext_types[type->id], type);
+		update_alloc_size(type);
+		ret = 0;
+	}
+	mutex_unlock(&nf_ct_ext_type_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+
+/* This MUST be called in process context.
+ *
+ * Clears the slot of @type's id, recomputes the allocation sizes of the
+ * remaining types and waits for outstanding RCU callbacks to finish. */
+void nf_ct_extend_unregister(struct nf_ct_ext_type *type)
+{
+	mutex_lock(&nf_ct_ext_type_mutex);
+	rcu_assign_pointer(nf_ct_ext_types[type->id], NULL);
+	update_alloc_size(type);
+	mutex_unlock(&nf_ct_ext_type_mutex);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
new file mode 100644
index 00000000..6f5801ea
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -0,0 +1,589 @@
+/* FTP extension for connection tracking. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/ipv6.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp connection tracking helper");
+MODULE_ALIAS("ip_conntrack_ftp");
+MODULE_ALIAS_NFCT_HELPER("ftp");
+
+/* This is slow, but it's simple. --RR */
+/* Scratch buffer handed to skb_header_pointer() by help() for the TCP
+   payload; allocated in nf_conntrack_ftp_init() and freed in
+   nf_conntrack_ftp_fini(). */
+static char *ftp_buffer;
+
+/* Taken (BH-disabling) by help() for the whole time it works on the
+   payload and the per-connection FTP state. */
+static DEFINE_SPINLOCK(nf_ftp_lock);
+
+/* Control-connection ports to register helpers for.  When none are given
+   on the module command line, nf_conntrack_ftp_init() uses FTP_PORT. */
+#define MAX_PORTS 8
+static u_int16_t ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+
+/* When zero, help() refuses to create an expectation for an announced
+   address that differs from the source address of the connection. */
+static int loose;
+module_param(loose, bool, 0600);
+
+/* Optional NAT hook.  When it is set and the connection has a NAT status
+   bit, help() calls it instead of nf_ct_expect_related() and uses its
+   return value as the verdict. */
+unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,
+				enum ip_conntrack_info ctinfo,
+				enum nf_ct_ftp_type type,
+				unsigned int matchoff,
+				unsigned int matchlen,
+				struct nf_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
+
+/* Address/port parsers referenced from the search table below. */
+static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
+			     char);
+
+/*
+ * Per-direction list of the strings that introduce an address/port.
+ * help() feeds each entry to find_pattern(), which
+ *   - compares the start of the payload with 'pattern' ('plen' bytes,
+ *     case-insensitively),
+ *   - advances to the first 'skip' character after it,
+ *   - and passes what follows to 'getnum' together with 'term'.
+ * 'ftptype' is what help() hands to the NAT hook for a match.
+ */
+static struct ftp_search {
+	const char *pattern;
+	size_t plen;
+	char skip;
+	char term;
+	enum nf_ct_ftp_type ftptype;
+	int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
+} search[IP_CT_DIR_MAX][2] = {
+	[IP_CT_DIR_ORIGINAL] = {
+		{
+			.pattern	= "PORT",
+			.plen		= sizeof("PORT") - 1,
+			.skip		= ' ',
+			.term		= '\r',
+			.ftptype	= NF_CT_FTP_PORT,
+			.getnum		= try_rfc959,
+		},
+		{
+			.pattern	= "EPRT",
+			.plen		= sizeof("EPRT") - 1,
+			.skip		= ' ',
+			.term		= '\r',
+			.ftptype	= NF_CT_FTP_EPRT,
+			.getnum		= try_eprt,
+		},
+	},
+	[IP_CT_DIR_REPLY] = {
+		{
+			.pattern	= "227 ",
+			.plen		= sizeof("227 ") - 1,
+			.skip		= '(',
+			.term		= ')',
+			.ftptype	= NF_CT_FTP_PASV,
+			.getnum		= try_rfc959,
+		},
+		{
+			.pattern	= "229 ",
+			.plen		= sizeof("229 ") - 1,
+			.skip		= '(',
+			.term		= ')',
+			.ftptype	= NF_CT_FTP_EPSV,
+			.getnum		= try_epsv_response,
+		},
+	},
+};
+
+/* Parse an IPv6 address delimited by 'term' out of at most dlen (capped
+   at 0xffff) bytes of src into dst, using in6_pton().  Returns the
+   distance from src to the end pointer reported by in6_pton(), or 0 when
+   in6_pton() does not return a positive value. */
+static int
+get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term)
+{
+	const char *stop;
+
+	if (in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term,
+		     &stop) <= 0)
+		return 0;
+
+	return (int)(stop - src);
+}
+
+/* Parse decimal numbers separated by 'sep' from data[0..dlen) into
+   array[0..array_size).  Returns the offset of the 'term' character if it
+   is met while the last array slot is being filled, 0 on any other
+   non-digit character or when the data runs out first. */
+static int try_number(const char *data, size_t dlen, u_int32_t array[],
+		      int array_size, char sep, char term)
+{
+	u_int32_t nums = 0, pos = 0;
+
+	memset(array, 0, sizeof(array[0])*array_size);
+
+	while (pos < dlen && nums < array_size) {
+		const char c = data[pos];
+
+		if (c >= '0' && c <= '9') {
+			array[nums] = array[nums]*10 + c - '0';
+		} else if (c == sep) {
+			nums++;
+		} else if (c == term && nums == array_size - 1) {
+			/* Terminator while on the last number: finished. */
+			return pos;
+		} else {
+			pr_debug("Char %u (got %u nums) `%u' unexpected\n",
+				 pos, nums, c);
+			return 0;
+		}
+		pos++;
+	}
+
+	pr_debug("Failed to fill %u numbers separated by %c\n",
+		 array_size, sep);
+	return 0;
+}
+
+/* Parse "h1,h2,h3,h4,p1,p2" (e.g. 192,168,1,1,5,6) into cmd->u3.ip and
+   cmd->u.tcp.port.  Returns the length reported by try_number(), or 0 (and
+   leaves cmd untouched) when the numbers could not be parsed. */
+static int try_rfc959(const char *data, size_t dlen,
+		      struct nf_conntrack_man *cmd, char term)
+{
+	u_int32_t n[6];
+	int consumed = try_number(data, dlen, n, 6, ',', term);
+
+	if (consumed != 0) {
+		cmd->u3.ip = htonl((n[0] << 24) | (n[1] << 16) |
+				   (n[2] << 8) | n[3]);
+		cmd->u.tcp.port = htons((n[4] << 8) | n[5]);
+	}
+
+	return consumed;
+}
+
+/* Read a decimal port number from data[start..dlen) up to 'delim' and
+   store it, in network byte order, in *port.  Returns the offset just
+   past the delimiter, or 0 if the number is zero/empty, a character that
+   is neither a digit nor the delimiter is met, or the data ends first. */
+static int get_port(const char *data, int start, size_t dlen, char delim,
+		    __be16 *port)
+{
+	u_int16_t value = 0;
+	int pos;
+
+	for (pos = start; pos < dlen; pos++) {
+		const char c = data[pos];
+
+		if (c == delim) {
+			/* End of the number: zero is not a usable port */
+			if (value == 0)
+				return 0;
+			*port = htons(value);
+			pr_debug("get_port: return %d\n", value);
+			return pos + 1;
+		}
+
+		if (c < '0' || c > '9') {
+			/* Some other crap */
+			pr_debug("get_port: invalid char.\n");
+			return 0;
+		}
+
+		value = value*10 + c - '0';
+	}
+
+	return 0;
+}
+
+/* Parse an EPRT argument such as |1|132.235.1.2|6275| or |2|3ffe::1|6275|
+   into cmd (address and TCP port).  Returns what get_port() returns for
+   the trailing port field, or 0 if any earlier part is malformed. */
+static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
+		    char term)
+{
+	char delim, proto;
+	int addrlen;
+
+	/* Needs at least: delimiter, protocol digit, delimiter, something */
+	if (dlen <= 3) {
+		pr_debug("EPRT: too short\n");
+		return 0;
+	}
+
+	delim = data[0];
+	proto = data[1];
+	if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
+		pr_debug("try_eprt: invalid delimitter.\n");
+		return 0;
+	}
+
+	/* "1" announces IPv4 and "2" IPv6; it has to agree with the
+	   address family already recorded in cmd */
+	if ((cmd->l3num == PF_INET && proto != '1') ||
+	    (cmd->l3num == PF_INET6 && proto != '2')) {
+		pr_debug("EPRT: invalid protocol number.\n");
+		return 0;
+	}
+
+	pr_debug("EPRT: Got %c%c%c\n", delim, proto, delim);
+
+	if (proto != '1') {
+		/* IPv6 address */
+		addrlen = get_ipv6_addr(data + 3, dlen - 3,
+					(struct in6_addr *)cmd->u3.ip6, delim);
+	} else {
+		/* Dotted-quad IPv4 address */
+		u_int32_t octet[4];
+
+		addrlen = try_number(data + 3, dlen - 3, octet, 4, '.', delim);
+		if (addrlen != 0)
+			cmd->u3.ip = htonl((octet[0] << 24) | (octet[1] << 16)
+					   | (octet[2] << 8) | octet[3]);
+	}
+
+	if (addrlen == 0)
+		return 0;
+	pr_debug("EPRT: Got IP address!\n");
+
+	/* The port starts after the initial "|1|", the address and the
+	   delimiter that follows the address */
+	return get_port(data, 3 + addrlen + 1, dlen, delim, &cmd->u.tcp.port);
+}
+
+/* Parse an EPSV reply argument such as |||6446| : three identical
+   delimiters followed by the port.  Returns what get_port() returns, or 0
+   if the three leading delimiters are not as expected. */
+static int try_epsv_response(const char *data, size_t dlen,
+			     struct nf_conntrack_man *cmd, char term)
+{
+	char delim;
+
+	if (dlen <= 3)
+		return 0;
+
+	delim = data[0];
+	if (isdigit(delim) || delim < 33 || delim > 126)
+		return 0;
+	if (data[1] != delim || data[2] != delim)
+		return 0;
+
+	return get_port(data, 3, dlen, delim, &cmd->u.tcp.port);
+}
+
+/*
+ * Look for 'pattern' at the start of data, skip ahead to the 'skip'
+ * character and let 'getnum' parse what follows.
+ * On a match *numoff is the offset of the parsed text and *numlen the
+ * length returned by getnum.
+ * Return 1 for match, 0 for accept (no match), -1 for partial.
+ */
+static int find_pattern(const char *data, size_t dlen,
+			const char *pattern, size_t plen,
+			char skip, char term,
+			unsigned int *numoff,
+			unsigned int *numlen,
+			struct nf_conntrack_man *cmd,
+			int (*getnum)(const char *, size_t,
+				      struct nf_conntrack_man *, char))
+{
+	size_t pos;
+
+	pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
+	if (dlen == 0)
+		return 0;
+
+	/* Short packet: at best it is a prefix of the pattern */
+	if (dlen <= plen)
+		return strnicmp(data, pattern, dlen) == 0 ? -1 : 0;
+
+	if (strnicmp(data, pattern, plen) != 0)
+		return 0;
+
+	pr_debug("Pattern matches!\n");
+
+	/* The constant string is there; walk to the 'skip' character and
+	   report a partial match if the data ends before it shows up */
+	pos = plen;
+	while (data[pos] != skip) {
+		if (pos == dlen - 1)
+			return -1;
+		pos++;
+	}
+
+	/* Step over the 'skip' character itself */
+	pos++;
+
+	pr_debug("Skipped up to `%c'!\n", skip);
+
+	*numoff = pos;
+	*numlen = getnum(data + pos, dlen - pos, cmd, term);
+	if (*numlen == 0)
+		return -1;
+
+	pr_debug("Match succeeded!\n");
+	return 1;
+}
+
+/* Return 1 if seq is one of the remembered "just after a \n" sequence
+   numbers for this direction, 0 otherwise. */
+static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir)
+{
+	unsigned int idx = 0;
+
+	while (idx < info->seq_aft_nl_num[dir]) {
+		if (info->seq_aft_nl[dir][idx] == seq)
+			return 1;
+		idx++;
+	}
+
+	return 0;
+}
+
+/* Remember nl_seq as a sequence number that follows a newline.  Nothing
+   is done if it is already stored; a free slot is used if there is one;
+   otherwise the older of the two stored values is replaced, but only if
+   nl_seq is newer than it. */
+static void update_nl_seq(struct nf_conn *ct, u32 nl_seq,
+			  struct nf_ct_ftp_master *info, int dir,
+			  struct sk_buff *skb)
+{
+	unsigned int idx, victim;
+
+	/* Exact match already recorded: we're done. */
+	for (idx = 0; idx < info->seq_aft_nl_num[dir]; idx++)
+		if (info->seq_aft_nl[dir][idx] == nl_seq)
+			return;
+
+	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
+		info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
+		return;
+	}
+
+	/* Table full: pick the older of the two entries */
+	victim = before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1])
+		 ? 0 : 1;
+
+	if (after(nl_seq, info->seq_aft_nl[dir][victim]))
+		info->seq_aft_nl[dir][victim] = nl_seq;
+}
+
+/*
+ * Conntrack helper callback for FTP control connections.
+ *
+ * Fetches the TCP payload with skb_header_pointer() (ftp_buffer is the
+ * scratch buffer) and, provided the segment starts at a sequence number
+ * remembered as "just after a newline", runs the search[] patterns of the
+ * packet's direction over it.  A full match yields an address and port
+ * for which an expectation is set up, either directly or through
+ * nf_nat_ftp_hook when the connection is NATed.
+ *
+ * Returns NF_ACCEPT, or NF_DROP on a partial pattern match or when the
+ * expectation cannot be allocated or registered; with NAT the verdict is
+ * whatever the hook returns.  All payload work is done under nf_ftp_lock.
+ */
+static int help(struct sk_buff *skb,
+		unsigned int protoff,
+		struct nf_conn *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff, datalen;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	const char *fb_ptr;
+	int ret;
+	u32 seq;
+	int dir = CTINFO2DIR(ctinfo);
+	unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+	struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info;
+	struct nf_conntrack_expect *exp;
+	union nf_inet_addr *daddr;
+	struct nf_conntrack_man cmd = {};
+	unsigned int i;
+	int found = 0, ends_in_nl;
+	typeof(nf_nat_ftp_hook) nf_nat_ftp;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED &&
+	    ctinfo != IP_CT_ESTABLISHED_REPLY) {
+		pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
+		return NF_ACCEPT;
+	}
+
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return NF_ACCEPT;
+
+	dataoff = protoff + th->doff * 4;
+	/* No data? */
+	if (dataoff >= skb->len) {
+		pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
+			 skb->len);
+		return NF_ACCEPT;
+	}
+	datalen = skb->len - dataoff;
+
+	spin_lock_bh(&nf_ftp_lock);
+	fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
+	BUG_ON(fb_ptr == NULL);
+
+	/* seq is the sequence number right after this segment's payload */
+	ends_in_nl = (fb_ptr[datalen - 1] == '\n');
+	seq = ntohl(th->seq) + datalen;
+
+	/* Look up to see if we're just after a \n. */
+	if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+		/* Now if this ends in \n, update ftp info. */
+		pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
+			 ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
+			 ct_ftp_info->seq_aft_nl[dir][0],
+			 ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
+			 ct_ftp_info->seq_aft_nl[dir][1]);
+		ret = NF_ACCEPT;
+		goto out_update_nl;
+	}
+
+	/* Initialize IP/IPv6 addr to expected address (it's not mentioned
+	   in EPSV responses) */
+	cmd.l3num = nf_ct_l3num(ct);
+	memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
+	       sizeof(cmd.u3.all));
+
+	/* Stop at the first pattern that gives a match (1) or a partial
+	   match (-1); i then still indexes that pattern below. */
+	for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
+		found = find_pattern(fb_ptr, datalen,
+				     search[dir][i].pattern,
+				     search[dir][i].plen,
+				     search[dir][i].skip,
+				     search[dir][i].term,
+				     &matchoff, &matchlen,
+				     &cmd,
+				     search[dir][i].getnum);
+		if (found) break;
+	}
+	if (found == -1) {
+		/* We don't usually drop packets.  After all, this is
+		   connection tracking, not packet filtering.
+		   However, it is necessary for accurate tracking in
+		   this case. */
+		pr_debug("conntrack_ftp: partial %s %u+%u\n",
+			 search[dir][i].pattern,  ntohl(th->seq), datalen);
+		ret = NF_DROP;
+		goto out;
+	} else if (found == 0) { /* No match */
+		ret = NF_ACCEPT;
+		goto out_update_nl;
+	}
+
+	pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
+		 matchlen, fb_ptr + matchoff,
+		 matchlen, ntohl(th->seq) + matchoff);
+
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL) {
+		ret = NF_DROP;
+		goto out;
+	}
+
+	/* We refer to the reverse direction ("!dir") tuples here,
+	 * because we're expecting something in the other direction.
+	 * Doesn't matter unless NAT is happening.  */
+	daddr = &ct->tuplehash[!dir].tuple.dst.u3;
+
+	/* Update the ftp info */
+	if ((cmd.l3num == nf_ct_l3num(ct)) &&
+	    memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
+		     sizeof(cmd.u3.all))) {
+		/* Enrico Scholz's passive FTP to partially RNAT'd ftp
+		   server: it really wants us to connect to a
+		   different IP address.  Simply don't record it for
+		   NAT. */
+		if (cmd.l3num == PF_INET) {
+			pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n",
+				 &cmd.u3.ip,
+				 &ct->tuplehash[dir].tuple.src.u3.ip);
+		} else {
+			pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n",
+				 cmd.u3.ip6,
+				 ct->tuplehash[dir].tuple.src.u3.ip6);
+		}
+
+		/* Thanks to Cristiano Lincoln Mattos
+		   <lincoln@cesar.org.br> for reporting this potential
+		   problem (DMZ machines opening holes to internal
+		   networks, or the packet filter itself). */
+		if (!loose) {
+			ret = NF_ACCEPT;
+			goto out_put_expect;
+		}
+		daddr = &cmd.u3;
+	}
+
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
+			  &ct->tuplehash[!dir].tuple.src.u3, daddr,
+			  IPPROTO_TCP, NULL, &cmd.u.tcp.port);
+
+	/* Now, NAT might want to mangle the packet, and register the
+	 * (possibly changed) expectation itself. */
+	nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
+	if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
+		ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
+				 matchoff, matchlen, exp);
+	else {
+		/* Can't expect this?  Best to drop packet now. */
+		if (nf_ct_expect_related(exp) != 0)
+			ret = NF_DROP;
+		else
+			ret = NF_ACCEPT;
+	}
+
+	/* Drops the reference taken by nf_ct_expect_alloc() above */
+out_put_expect:
+	nf_ct_expect_put(exp);
+
+out_update_nl:
+	/* Now if this ends in \n, update ftp info.  Seq may have been
+	 * adjusted by NAT code. */
+	if (ends_in_nl)
+		update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
+ out:
+	spin_unlock_bh(&nf_ftp_lock);
+	return ret;
+}
+
+/* One helper per configured port and address family: index [i][0] is
+   set up for PF_INET and [i][1] for PF_INET6 by nf_conntrack_ftp_init().
+   ftp_names[][] provides the storage for each helper's name ("ftp" or
+   "ftp-<port>"). */
+static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly;
+static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")] __read_mostly;
+
+/* Expectation policy shared by all FTP helpers. */
+static const struct nf_conntrack_expect_policy ftp_exp_policy = {
+	.max_expected	= 1,
+	.timeout	= 5 * 60,
+};
+
+/* Unregister every helper whose .me is set and free the payload buffer.
+   Don't make this __exit: it is also called from the __init function. */
+static void nf_conntrack_ftp_fini(void)
+{
+	int port_idx, fam;
+
+	for (port_idx = 0; port_idx < ports_c; port_idx++) {
+		for (fam = 0; fam < 2; fam++) {
+			struct nf_conntrack_helper *h = &ftp[port_idx][fam];
+
+			if (h->me != NULL) {
+				pr_debug("nf_ct_ftp: unregistering helper for pf: %d "
+					 "port: %d\n",
+					 h->tuple.src.l3num, ports[port_idx]);
+				nf_conntrack_helper_unregister(h);
+			}
+		}
+	}
+
+	kfree(ftp_buffer);
+}
+
+/*
+ * Module init: allocate the payload scratch buffer and register one
+ * helper per configured port and address family (IPv4 and IPv6).
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
+ * error returned by nf_conntrack_helper_register().  On a registration
+ * failure the helpers registered so far are unregistered and the buffer
+ * is freed through nf_conntrack_ftp_fini().
+ */
+static int __init nf_conntrack_ftp_init(void)
+{
+	int i, j, ret;
+	char *tmpname;
+
+	ftp_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!ftp_buffer)
+		return -ENOMEM;
+
+	/* No ports given on the command line: track the default one */
+	if (ports_c == 0)
+		ports[ports_c++] = FTP_PORT;
+
+	/* FIXME should be configurable whether IPv4 and IPv6 FTP connections
+		 are tracked or not - YK */
+	for (i = 0; i < ports_c; i++) {
+		ftp[i][0].tuple.src.l3num = PF_INET;
+		ftp[i][1].tuple.src.l3num = PF_INET6;
+		for (j = 0; j < 2; j++) {
+			ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
+			ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
+			ftp[i][j].expect_policy = &ftp_exp_policy;
+			ftp[i][j].me = THIS_MODULE;
+			ftp[i][j].help = help;
+			tmpname = &ftp_names[i][j][0];
+			if (ports[i] == FTP_PORT)
+				sprintf(tmpname, "ftp");
+			else
+				sprintf(tmpname, "ftp-%d", ports[i]);
+			ftp[i][j].name = tmpname;
+
+			pr_debug("nf_ct_ftp: registering helper for pf: %d "
+				 "port: %d\n",
+				 ftp[i][j].tuple.src.l3num, ports[i]);
+			ret = nf_conntrack_helper_register(&ftp[i][j]);
+			if (ret) {
+				printk(KERN_ERR "nf_ct_ftp: failed to register"
+				       " helper for pf: %d port: %d\n",
+					ftp[i][j].tuple.src.l3num, ports[i]);
+				/* This helper never got registered.  Clear
+				   .me so that nf_conntrack_ftp_fini(), which
+				   unregisters every entry with a non-NULL
+				   .me, does not try to unregister it. */
+				ftp[i][j].me = NULL;
+				nf_conntrack_ftp_fini();
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+module_init(nf_conntrack_ftp_init);
+module_exit(nf_conntrack_ftp_fini);
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
new file mode 100644
index 00000000..bcd5ed6b
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -0,0 +1,888 @@
+/****************************************************************************
+ * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323
+ * 			      	     conntrack/NAT module.
+ *
+ * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * See ip_conntrack_helper_h323_asn1.h for details.
+ *
+ ****************************************************************************/
+
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#else
+#include <stdio.h>
+#endif
+#include <linux/netfilter/nf_conntrack_h323_asn1.h>
+
+/* Trace Flag: build with H323_TRACE != 0 to print every decoded field.
+ * With tracing off, IFTHEN() and PRINT() expand to nothing and FNAME()
+ * drops the field name, matching the conditional 'name' member of
+ * field_t. */
+#ifndef H323_TRACE
+#define H323_TRACE 0
+#endif
+
+#if H323_TRACE
+#define TAB_SIZE 4
+#define IFTHEN(cond, act) if(cond){act;}
+#ifdef __KERNEL__
+#define PRINT printk
+#else
+#define PRINT printf
+#endif
+#define FNAME(name) name,
+#else
+#define IFTHEN(cond, act)
+#define PRINT(fmt, args...)
+#define FNAME(name)
+#endif
+
+/* ASN.1 Types.  The value is the index of the type's decoder in the
+ * Decoders[] vector, so types that share a decoder share a value. */
+#define NUL 0
+#define BOOL 1
+#define OID 2
+#define INT 3
+#define ENUM 4
+#define BITSTR 5
+#define NUMSTR 6
+#define NUMDGT 6
+#define TBCDSTR 6
+#define OCTSTR 7
+#define PRTSTR 7
+#define IA5STR 7
+#define GENSTR 7
+#define BMPSTR 8
+#define SEQ 9
+#define SET 9
+#define SEQOF 10
+#define SETOF 10
+#define CHOICE 11
+
+/* Constraint Types (field_t.sz).  Values 1-8 are not named: the decoders
+ * treat them as the number of bits to read. */
+#define FIXD 0
+/* #define BITS 1-8 */
+#define BYTE 9
+#define WORD 10
+#define CONS 11
+#define SEMI 12
+#define UNCO 13
+
+/* ASN.1 Type Attributes (field_t.attr), tested as bit flags */
+#define SKIP 0
+#define STOP 1
+#define DECODE 2
+#define EXT 4
+#define OPEN 8
+#define OPT 16
+
+
+/* ASN.1 Field Structure: one entry of the static tables that describe
+ * the H.323 message layout to the decoders. */
+typedef struct field_t {
+#if H323_TRACE
+	char *name;		/* only present (and printed) when tracing */
+#endif
+	unsigned char type;	/* ASN.1 type, index into Decoders[] */
+	unsigned char sz;	/* constraint type, or a bit count (1-8) */
+	unsigned char lb;	/* lower bound; for SEQ/CHOICE the number of
+				   root components/alternatives */
+	unsigned char ub;	/* upper bound; for SEQ/CHOICE the number of
+				   entries known to the table */
+	unsigned short attr;	/* SKIP/STOP/DECODE/EXT/OPEN/OPT flags */
+	unsigned short offset;	/* where to store the result, relative to
+				   the caller's base pointer */
+	const struct field_t *fields;	/* sub-fields of constructed types */
+} field_t;
+
+/* Bit Stream */
+typedef struct {
+	unsigned char *buf;	/* decoded offsets are stored relative to it */
+	unsigned char *beg;	/* NOTE(review): not used by the decoders
+				   visible here - confirm its purpose */
+	unsigned char *end;	/* limit tested by CHECK_BOUND() */
+	unsigned char *cur;	/* current byte */
+	unsigned int bit;	/* bit position in *cur, 0 = most significant */
+} bitstr_t;
+
+/* Tool Functions.
+ * The macros below expand to a bare 'if' statement: use them only as a
+ * full statement of their own (never as the unbraced body of an if/else).
+ * CHECK_BOUND() returns H323_ERROR_BOUND from the calling function when
+ * cur + n lies beyond end; none of the get_*() helpers checks bounds. */
+#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;}
+#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;}
+#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;}
+#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND)
+static unsigned int get_len(bitstr_t *bs);
+static unsigned int get_bit(bitstr_t *bs);
+static unsigned int get_bits(bitstr_t *bs, unsigned int b);
+static unsigned int get_bitmap(bitstr_t *bs, unsigned int b);
+static unsigned int get_uint(bitstr_t *bs, int b);
+/* Decoder Functions */
+/* All decoders share one signature: bit stream, field description, base
+ * of the output structure (NULL when nothing is to be stored) and the
+ * nesting level used for trace indentation. */
+static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level);
+static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level);
+
+/* Decoder Functions Vector, indexed by field_t.type; the order must
+ * match the numbering of the ASN.1 type constants (NUL ... CHOICE). */
+typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int);
+static const decoder_t Decoders[] = {
+	decode_nul,
+	decode_bool,
+	decode_oid,
+	decode_int,
+	decode_enum,
+	decode_bitstr,
+	decode_numstr,
+	decode_octstr,
+	decode_bmpstr,
+	decode_seq,
+	decode_seqof,
+	decode_choice,
+};
+
+/****************************************************************************
+ * H.323 Types
+ ****************************************************************************/
+#include "nf_conntrack_h323_types.c"
+
+/****************************************************************************
+ * Functions
+ ****************************************************************************/
+/* Read a length determinant of one or two bytes and advance past it.
+ * Assume bs is aligned && v < 16384; the caller checks the bounds. */
+static unsigned int get_len(bitstr_t *bs)
+{
+	unsigned int first = *bs->cur++;
+
+	/* Top bit clear: the byte is the length itself */
+	if (!(first & 0x80))
+		return first;
+
+	/* Otherwise its low 6 bits are the high part of a 14-bit length */
+	return ((first & 0x3f) << 8) + *bs->cur++;
+}
+
+/****************************************************************************/
+/* Return the bit at the current position (non-zero if set, still in its
+ * place within the byte) and advance by one bit.  No bounds check. */
+static unsigned int get_bit(bitstr_t *bs)
+{
+	const unsigned int mask = 0x80 >> bs->bit;
+	unsigned int b = *bs->cur & mask;
+
+	INC_BIT(bs);
+
+	return b;
+}
+
+/****************************************************************************/
+/* Read the next b bits as a right-aligned number and advance past them.
+ * Assume b <= 8.  The current byte is always dereferenced, and the next
+ * one too when the bits cross a byte boundary; no bounds check is done. */
+static unsigned int get_bits(bitstr_t *bs, unsigned int b)
+{
+	unsigned int span = b + bs->bit;	/* end bit, from start of *cur */
+	unsigned int val = (*bs->cur) & (0xffU >> bs->bit);
+
+	if (span > 8) {
+		/* Spills into the next byte */
+		bs->cur++;
+		val = ((val << 8) + *bs->cur) >> (16 - span);
+		bs->bit = span - 8;
+	} else if (span == 8) {
+		/* Ends exactly on the byte boundary */
+		bs->cur++;
+		bs->bit = 0;
+	} else {
+		val >>= 8 - span;
+		bs->bit = span;
+	}
+
+	return val;
+}
+
+/****************************************************************************/
+/*
+ * Read a b-bit bitmap starting at the current bit position and return it
+ * left-aligned in a 32-bit word (the first bit read ends up in bit 31).
+ * Returns 0 without touching the stream when b is 0.
+ * Assume b <= 32.  No bounds check is done here; only the bytes that
+ * actually hold the b bits are dereferenced, i.e.
+ * (bs->bit + b + 7) >> 3 bytes starting at bs->cur.
+ */
+static unsigned int get_bitmap(bitstr_t *bs, unsigned int b)
+{
+	unsigned int v, l, shift, bytes;
+
+	if (!b)
+		return 0;
+
+	l = bs->bit + b;
+
+	if (l < 8) {
+		v = (unsigned int)(*bs->cur) << (bs->bit + 24);
+		bs->bit = l;
+	} else if (l == 8) {
+		v = (unsigned int)(*bs->cur++) << (bs->bit + 24);
+		bs->bit = 0;
+	} else {
+		/* Whole bytes first, most significant byte on top */
+		for (bytes = l >> 3, shift = 24, v = 0; bytes;
+		     bytes--, shift -= 8)
+			v |= (unsigned int)(*bs->cur++) << shift;
+
+		if (l < 32) {
+			/* Only touch the trailing byte when the bitmap
+			   really extends into it.  When l is a multiple
+			   of 8 that byte lies past the bitmap (and maybe
+			   past the buffer), and its bits would be masked
+			   off below anyway. */
+			if (l & 0x7)
+				v |= (unsigned int)(*bs->cur) << shift;
+			v <<= bs->bit;
+		} else if (l > 32) {
+			v <<= bs->bit;
+			v |= (*bs->cur) >> (8 - bs->bit);
+		}
+
+		bs->bit = l & 0x7;
+	}
+
+	/* Keep the top b bits only */
+	v &= 0xffffffff << (32 - b);
+
+	return v;
+}
+
+/****************************************************************************
+ * Read a big-endian unsigned integer of b bytes (1 to 4) and advance past
+ * it; any other b yields 0 and leaves the stream untouched.
+ * Assume bs is aligned and sizeof(unsigned int) == 4
+ ****************************************************************************/
+static unsigned int get_uint(bitstr_t *bs, int b)
+{
+	unsigned int v = 0;
+	int i;
+
+	if (b < 1 || b > 4)
+		return 0;
+
+	for (i = 0; i < b; i++)
+		v = (v << 8) | *bs->cur++;
+
+	return v;
+}
+
+/****************************************************************************/
+/* NULL type: nothing is read from the stream; only the field name is
+ * traced.  Always returns H323_ERROR_NONE. */
+static int decode_nul(bitstr_t *bs, const struct field_t *f,
+                      char *base, int level)
+{
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/* BOOLEAN: skip one bit without reading its value.  Returns
+ * H323_ERROR_BOUND if the cursor ends up past the end of the stream,
+ * H323_ERROR_NONE otherwise. */
+static int decode_bool(bitstr_t *bs, const struct field_t *f,
+                       char *base, int level)
+{
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	INC_BIT(bs);
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/* OBJECT IDENTIFIER: byte-align, read the one-byte length and skip that
+ * many bytes.  Returns H323_ERROR_BOUND if the length byte or the
+ * contents do not fit in the stream, H323_ERROR_NONE otherwise. */
+static int decode_oid(bitstr_t *bs, const struct field_t *f,
+                      char *base, int level)
+{
+	int len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	BYTE_ALIGN(bs);
+	CHECK_BOUND(bs, 1);	/* the length byte itself */
+	len = *bs->cur++;
+	bs->cur += len;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * INTEGER: skip the value, whose encoding depends on the constraint type
+ * in f->sz.  Only the CONS form can be stored: when base is set and the
+ * field is marked DECODE, the value plus f->lb is written as an unsigned
+ * int at base + f->offset.
+ *
+ * Returns H323_ERROR_NONE, or H323_ERROR_BOUND if the encoding does not
+ * fit in the stream.
+ */
+static int decode_int(bitstr_t *bs, const struct field_t *f,
+                      char *base, int level)
+{
+	unsigned int len;
+
+	PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		bs->cur++;
+		break;
+	case WORD:		/* 257 <= Range <= 64K */
+		BYTE_ALIGN(bs);
+		bs->cur += 2;
+		break;
+	case CONS:		/* 64K < Range < 4G */
+		/* get_bits() reads the current byte, and the next one
+		   too when the two length bits cross a byte boundary */
+		CHECK_BOUND(bs, bs->bit + 2 > 8 ? 2 : 1);
+		len = get_bits(bs, 2) + 1;
+		BYTE_ALIGN(bs);
+		if (base && (f->attr & DECODE)) {	/* timeToLive */
+			unsigned int v;
+
+			/* get_uint() dereferences len bytes and leaves
+			   the cursor just behind them, so the cursor
+			   must not be advanced a second time */
+			CHECK_BOUND(bs, len);
+			v = get_uint(bs, len) + f->lb;
+			PRINT(" = %u", v);
+			*((unsigned int *)(base + f->offset)) = v;
+		} else {
+			bs->cur += len;
+		}
+		break;
+	case UNCO:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		bs->cur += len;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		INC_BITS(bs, f->sz);
+		break;
+	}
+
+	PRINT("\n");
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * ENUMERATED: skip the value.  For an extensible type the extension bit
+ * is read first; if it is set 7 bits are skipped, otherwise f->sz bits.
+ *
+ * Returns H323_ERROR_NONE, or H323_ERROR_BOUND if the extension bit or
+ * the value lies beyond the end of the stream.
+ */
+static int decode_enum(bitstr_t *bs, const struct field_t *f,
+                       char *base, int level)
+{
+	unsigned int extended = 0;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	if (f->attr & EXT) {
+		/* get_bit() dereferences the current byte */
+		CHECK_BOUND(bs, 1);
+		extended = get_bit(bs);
+	}
+
+	if (extended) {
+		INC_BITS(bs, 7);
+	} else {
+		INC_BITS(bs, f->sz);
+	}
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/* BIT STRING: byte-align, work out the length in bits from the constraint
+ * type in f->sz and skip the string.  Returns H323_ERROR_BOUND if a
+ * length field or the string does not fit in the stream, H323_ERROR_NONE
+ * otherwise. */
+static int decode_bitstr(bitstr_t *bs, const struct field_t *f,
+                         char *base, int level)
+{
+	unsigned int len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	BYTE_ALIGN(bs);
+	switch (f->sz) {
+	case FIXD:		/* fixed length > 16 */
+		len = f->lb;
+		break;
+	case WORD:		/* 2-byte length */
+		CHECK_BOUND(bs, 2);
+		len = (*bs->cur++) << 8;
+		len += (*bs->cur++) + f->lb;
+		break;
+	case SEMI:
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		break;
+	default:
+		len = 0;
+		break;
+	}
+
+	/* len counts bits: whole bytes move cur, the rest goes into bit */
+	bs->cur += len >> 3;
+	bs->bit = len & 7;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * NumericString and friends: read the f->sz-bit length (offset by f->lb),
+ * byte-align and skip the string at 4 bits per character.
+ *
+ * Returns H323_ERROR_NONE, or H323_ERROR_BOUND if the length field or the
+ * string lies beyond the end of the stream.
+ */
+static int decode_numstr(bitstr_t *bs, const struct field_t *f,
+                         char *base, int level)
+{
+	unsigned int len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* 2 <= Range <= 255 */
+	/* get_bits() reads the current byte, and the next one too when the
+	   length bits cross a byte boundary */
+	CHECK_BOUND(bs, bs->bit + f->sz > 8 ? 2 : 1);
+	len = get_bits(bs, f->sz) + f->lb;
+
+	BYTE_ALIGN(bs);
+	INC_BITS(bs, (len << 2));
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * OCTET STRING and the other byte-oriented strings: work out the length
+ * in bytes from the constraint type in f->sz and skip the string.
+ * For a fixed-size string longer than 2 bytes that is marked DECODE (with
+ * base set), the offset of the string within the buffer (cur - buf) is
+ * stored as an unsigned int at base + f->offset.
+ *
+ * Returns H323_ERROR_NONE, or H323_ERROR_BOUND if a length field or the
+ * string lies beyond the end of the stream.
+ */
+static int decode_octstr(bitstr_t *bs, const struct field_t *f,
+                         char *base, int level)
+{
+	unsigned int len;
+
+	PRINT("%*.s%s", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case FIXD:		/* Range == 1 */
+		if (f->lb > 2) {
+			BYTE_ALIGN(bs);
+			if (base && (f->attr & DECODE)) {
+				/* The IP Address */
+				IFTHEN(f->lb == 4,
+				       PRINT(" = %d.%d.%d.%d:%d",
+					     bs->cur[0], bs->cur[1],
+					     bs->cur[2], bs->cur[3],
+					     bs->cur[4] * 256 + bs->cur[5]));
+				*((unsigned int *)(base + f->offset)) =
+				    bs->cur - bs->buf;
+			}
+		}
+		len = f->lb;
+		break;
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		len = (*bs->cur++) + f->lb;
+		break;
+	case SEMI:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs) + f->lb;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		/* get_bits() reads the current byte, and the next one
+		   too when the length bits cross a byte boundary */
+		CHECK_BOUND(bs, bs->bit + f->sz > 8 ? 2 : 1);
+		len = get_bits(bs, f->sz) + f->lb;
+		BYTE_ALIGN(bs);
+		break;
+	}
+
+	bs->cur += len;
+
+	PRINT("\n");
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * BMPString: read the character count (one byte, or f->sz bits, offset
+ * by f->lb) and skip the string at two bytes per character.
+ *
+ * Returns H323_ERROR_NONE, or H323_ERROR_BOUND if the length field or the
+ * string lies beyond the end of the stream.
+ */
+static int decode_bmpstr(bitstr_t *bs, const struct field_t *f,
+                         char *base, int level)
+{
+	unsigned int len;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	switch (f->sz) {
+	case BYTE:		/* Range == 256 */
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		len = (*bs->cur++) + f->lb;
+		break;
+	default:		/* 2 <= Range <= 255 */
+		/* get_bits() reads the current byte, and the next one
+		   too when the length bits cross a byte boundary */
+		CHECK_BOUND(bs, bs->bit + f->sz > 8 ? 2 : 1);
+		len = get_bits(bs, f->sz) + f->lb;
+		BYTE_ALIGN(bs);
+		break;
+	}
+
+	bs->cur += len << 1;
+
+	CHECK_BOUND(bs, 0);
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * SEQUENCE / SET.
+ *
+ * Layout handled here: optional extension bit, an f->sz-bit presence
+ * bitmap for the optional root components, the f->lb root components and,
+ * if the extension bit was set, an extension bitmap followed by the
+ * extension additions, each wrapped in a length-prefixed open type.
+ * Additions beyond f->ub (not known to the table) are skipped.
+ *
+ * When base is set and the field is marked DECODE, the presence bitmap
+ * (root bits, then extension bits) is stored as an unsigned int at the
+ * start of the output structure and the components are decoded into it.
+ *
+ * Returns H323_ERROR_NONE, H323_ERROR_STOP when a component marked STOP
+ * is reached, H323_ERROR_BOUND when the data runs past the end of the
+ * stream, H323_ERROR_RANGE when the extension bitmap has more than 32
+ * bits, or the error of a component decoder.
+ */
+static int decode_seq(bitstr_t *bs, const struct field_t *f,
+                      char *base, int level)
+{
+	unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len;
+	int err;
+	const struct field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Extensible? */
+	if (f->attr & EXT) {
+		/* get_bit() dereferences the current byte */
+		CHECK_BOUND(bs, 1);
+		ext = get_bit(bs);
+	} else {
+		ext = 0;
+	}
+
+	/* Get fields bitmap */
+	if (f->sz) {
+		/* All bytes holding the bitmap must be inside the stream */
+		CHECK_BOUND(bs, (bs->bit + f->sz + 7) >> 3);
+	}
+	bmp = get_bitmap(bs, f->sz);
+	if (base)
+		*(unsigned int *)base = bmp;
+
+	/* Decode the root components */
+	for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) {
+		if (son->attr & STOP) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			return H323_ERROR_STOP;
+		}
+
+		if (son->attr & OPT) {	/* Optional component */
+			if (!((0x80000000U >> (opt++)) & bmp))	/* Not exist */
+				continue;
+		}
+
+		/* Decode */
+		if (son->attr & OPEN) {	/* Open field */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			if (!base || !(son->attr & DECODE)) {
+				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+				      " ", son->name);
+				bs->cur += len;
+				continue;
+			}
+			beg = bs->cur;
+
+			/* Decode */
+			if ((err = (Decoders[son->type]) (bs, son, base,
+							  level + 1)) <
+			    H323_ERROR_NONE)
+				return err;
+
+			/* Resume right behind the open field, whatever
+			   the decoder consumed */
+			bs->cur = beg + len;
+			bs->bit = 0;
+		} else if ((err = (Decoders[son->type]) (bs, son, base,
+							 level + 1)) <
+			   H323_ERROR_NONE)
+			return err;
+	}
+
+	/* No extension? */
+	if (!ext)
+		return H323_ERROR_NONE;
+
+	/* Get the extension bitmap */
+	/* get_bits() reads the current byte, and the next one too when
+	   the 7 length bits cross a byte boundary */
+	CHECK_BOUND(bs, bs->bit + 7 > 8 ? 2 : 1);
+	bmp2_len = get_bits(bs, 7) + 1;
+	/* get_bitmap() and the 32-bit presence masks below cannot cope
+	   with more than 32 bits */
+	if (bmp2_len > 32)
+		return H323_ERROR_RANGE;
+	/* The bitmap starts at the current bit offset, not on a byte
+	   boundary */
+	CHECK_BOUND(bs, (bs->bit + bmp2_len + 7) >> 3);
+	bmp2 = get_bitmap(bs, bmp2_len);
+	bmp |= bmp2 >> f->sz;
+	if (base)
+		*(unsigned int *)base = bmp;
+	BYTE_ALIGN(bs);
+
+	/* Decode the extension components */
+	for (opt = 0; opt < bmp2_len; opt++, i++, son++) {
+		/* Check Range */
+		if (i >= f->ub) {	/* Newer Version? */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			bs->cur += len;
+			continue;
+		}
+
+		if (son->attr & STOP) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			return H323_ERROR_STOP;
+		}
+
+		if (!((0x80000000 >> opt) & bmp2))	/* Not present */
+			continue;
+
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		if (!base || !(son->attr & DECODE)) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			bs->cur += len;
+			continue;
+		}
+		beg = bs->cur;
+
+		if ((err = (Decoders[son->type]) (bs, son, base,
+						  level + 1)) <
+		    H323_ERROR_NONE)
+			return err;
+
+		bs->cur = beg + len;
+		bs->bit = 0;
+	}
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * SEQUENCE OF / SET OF.
+ *
+ * Reads the item count (encoding selected by f->sz, offset by f->lb) and
+ * decodes that many items described by f->fields.  When base is set and
+ * the field is marked DECODE, the count, capped at f->ub, is stored as an
+ * unsigned int at the start of the output area and only that many items
+ * are stored; the remaining items are still parsed but not stored.
+ *
+ * Returns H323_ERROR_NONE, H323_ERROR_BOUND when the data runs past the
+ * end of the stream, or the error of the item decoder.
+ */
+static int decode_seqof(bitstr_t *bs, const struct field_t *f,
+                        char *base, int level)
+{
+	unsigned int count, effective_count = 0, i, len = 0;
+	int err;
+	const struct field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Decode item count */
+	switch (f->sz) {
+	case BYTE:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 1);
+		count = *bs->cur++;
+		break;
+	case WORD:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		count = *bs->cur++;
+		count <<= 8;
+		count += *bs->cur++;
+		break;
+	case SEMI:
+		BYTE_ALIGN(bs);
+		CHECK_BOUND(bs, 2);
+		count = get_len(bs);
+		break;
+	default:
+		/* get_bits() reads the current byte, and the next one
+		   too when the count bits cross a byte boundary */
+		CHECK_BOUND(bs, bs->bit + f->sz > 8 ? 2 : 1);
+		count = get_bits(bs, f->sz);
+		break;
+	}
+	count += f->lb;
+
+	/* Write Count */
+	if (base) {
+		effective_count = count > f->ub ? f->ub : count;
+		*(unsigned int *)base = effective_count;
+		base += sizeof(unsigned int);
+	}
+
+	/* Decode nested field */
+	son = f->fields;
+	/* The item decoder adds son->offset to the base it is given, so
+	   bias base by -son->offset here and step it by son->offset after
+	   each item */
+	if (base)
+		base -= son->offset;
+	for (i = 0; i < count; i++) {
+		if (son->attr & OPEN) {
+			BYTE_ALIGN(bs);
+			/* get_len() may read two bytes */
+			CHECK_BOUND(bs, 2);
+			len = get_len(bs);
+			CHECK_BOUND(bs, len);
+			if (!base || !(son->attr & DECODE)) {
+				PRINT("%*.s%s\n", (level + 1) * TAB_SIZE,
+				      " ", son->name);
+				bs->cur += len;
+				continue;
+			}
+			beg = bs->cur;
+
+			if ((err = (Decoders[son->type]) (bs, son,
+							  i <
+							  effective_count ?
+							  base : NULL,
+							  level + 1)) <
+			    H323_ERROR_NONE)
+				return err;
+
+			/* Resume right behind the open item, whatever
+			   the decoder consumed */
+			bs->cur = beg + len;
+			bs->bit = 0;
+		} else
+			if ((err = (Decoders[son->type]) (bs, son,
+							  i <
+							  effective_count ?
+							  base : NULL,
+							  level + 1)) <
+			    H323_ERROR_NONE)
+				return err;
+
+		if (base)
+			base += son->offset;
+	}
+
+	return H323_ERROR_NONE;
+}
+
+
+/****************************************************************************/
+/*
+ * Decode a CHOICE: an alternative index followed by the selected
+ * alternative, taken from f->fields[].
+ *
+ * If the type is extensible (EXT) and the extension bit is set, the index
+ * is read as 7 bits and biased by f->lb; otherwise it is read as f->sz bits
+ * and must be below f->lb, else H323_ERROR_RANGE is returned.  An index at
+ * or above f->ub has no entry in f->fields[] and is skipped using its
+ * length prefix.
+ *
+ * When decoding into @base, the index is stored as an unsigned int at
+ * base + f->offset and the same pointer is handed to the nested decoder.
+ *
+ * Returns H323_ERROR_NONE, H323_ERROR_RANGE, H323_ERROR_STOP (alternative
+ * flagged STOP), or a nested decoder's result when it is below
+ * H323_ERROR_NONE.  CHECK_BOUND() is defined outside this view; it
+ * presumably returns H323_ERROR_BOUND from this function on overrun.
+ */
+static int decode_choice(bitstr_t *bs, const struct field_t *f,
+			 char *base, int level)
+{
+	unsigned int type, ext, len = 0;
+	int err;
+	const struct field_t *son;
+	unsigned char *beg = NULL;
+
+	PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name);
+
+	/* Decode? */
+	base = (base && (f->attr & DECODE)) ? base + f->offset : NULL;
+
+	/* Decode the choice index number */
+	if ((f->attr & EXT) && get_bit(bs)) {
+		ext = 1;
+		type = get_bits(bs, 7) + f->lb;
+	} else {
+		ext = 0;
+		type = get_bits(bs, f->sz);
+		if (type >= f->lb)
+			return H323_ERROR_RANGE;
+	}
+
+	/* Write Type */
+	if (base)
+		*(unsigned int *)base = type;
+
+	/* Check Range */
+	if (type >= f->ub) {	/* Newer version? */
+		BYTE_ALIGN(bs);
+		/*
+		 * The length field must be inside the buffer before
+		 * get_len() reads it, as decode_seq() already ensures.
+		 */
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		bs->cur += len;
+		return H323_ERROR_NONE;
+	}
+
+	/* Transfer to son level */
+	son = &f->fields[type];
+	if (son->attr & STOP) {
+		PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name);
+		return H323_ERROR_STOP;
+	}
+
+	if (ext || (son->attr & OPEN)) {
+		BYTE_ALIGN(bs);
+		/* Same guard as above: do not read the length out of bounds */
+		CHECK_BOUND(bs, 2);
+		len = get_len(bs);
+		CHECK_BOUND(bs, len);
+		if (!base || !(son->attr & DECODE)) {
+			PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ",
+			      son->name);
+			bs->cur += len;
+			return H323_ERROR_NONE;
+		}
+		beg = bs->cur;
+
+		if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) <
+		    H323_ERROR_NONE)
+			return err;
+
+		/* Resynchronise on the declared length of the alternative */
+		bs->cur = beg + len;
+		bs->bit = 0;
+	} else if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) <
+		   H323_ERROR_NONE)
+		return err;
+
+	return H323_ERROR_NONE;
+}
+
+/****************************************************************************/
+/*
+ * Decode a RAS message of @sz bytes starting at @buf into @ras.
+ * The top-level type is a CHOICE, so the result is whatever
+ * decode_choice() returns for the root descriptor below.
+ */
+int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras)
+{
+	static const struct field_t root = {
+		FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT,
+		0, _RasMessage
+	};
+	bitstr_t stream;
+
+	/* The bit stream covers exactly [buf, buf + sz) */
+	stream.buf = buf;
+	stream.beg = buf;
+	stream.cur = buf;
+	stream.end = buf + sz;
+	stream.bit = 0;
+
+	return decode_choice(&stream, &root, (char *) ras, 0);
+}
+
+/****************************************************************************/
+/*
+ * Decode an H323-UserInformation structure into @uuie.
+ *
+ * @buf is recorded as the stream's buffer base, while decoding starts at
+ * @beg and is limited to @sz bytes from there.  The top-level type is a
+ * SEQUENCE, so the result is whatever decode_seq() returns.
+ */
+static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg,
+				      size_t sz, H323_UserInformation *uuie)
+{
+	static const struct field_t root = {
+		FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT,
+		0, _H323_UserInformation
+	};
+	bitstr_t stream;
+
+	stream.buf = buf;
+	stream.beg = beg;
+	stream.cur = beg;
+	stream.end = beg + sz;
+	stream.bit = 0;
+
+	return decode_seq(&stream, &root, (char *) uuie, 0);
+}
+
+/****************************************************************************/
+/*
+ * Decode an H.245 MultimediaSystemControlMessage of @sz bytes starting at
+ * @buf into @mscm.  The top-level type is a CHOICE, so the result is
+ * whatever decode_choice() returns for the root descriptor below.
+ */
+int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t sz,
+					 MultimediaSystemControlMessage *
+					 mscm)
+{
+	static const struct field_t root = {
+		FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4,
+		DECODE | EXT, 0, _MultimediaSystemControlMessage
+	};
+	bitstr_t stream;
+
+	/* The bit stream covers exactly [buf, buf + sz) */
+	stream.buf = buf;
+	stream.beg = buf;
+	stream.cur = buf;
+	stream.end = buf + sz;
+	stream.bit = 0;
+
+	return decode_choice(&stream, &root, (char *) mscm, 0);
+}
+
+/****************************************************************************/
+/*
+ * Parse a Q.931 message and decode its User-User information element.
+ *
+ * Layout walked here: protocol discriminator (must be 0x08), call
+ * reference (one length byte plus that many bytes), message type (stored
+ * in q931->MessageType), then information elements.  One byte with the
+ * top bit set directly after the message type is skipped.  Every other
+ * element is treated as identifier + one length byte + contents, except
+ * identifier 0x7e (User-User), which carries a two-byte length and whose
+ * contents, minus their first byte, are handed to
+ * DecodeH323_UserInformation().
+ *
+ * @sz always holds the number of bytes still available at @p; every byte
+ * consumed is accounted for so no read goes past buf + sz.
+ *
+ * Returns H323_ERROR_RANGE for an unknown protocol discriminator,
+ * H323_ERROR_BOUND for a truncated message or when no User-User element
+ * is found, otherwise the result of DecodeH323_UserInformation().
+ */
+int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931)
+{
+	unsigned char *p = buf;
+	int len;
+
+	if (!p || sz < 1)
+		return H323_ERROR_BOUND;
+
+	/* Protocol Discriminator */
+	if (*p != 0x08) {
+		PRINT("Unknown Protocol Discriminator\n");
+		return H323_ERROR_RANGE;
+	}
+	p++;
+	sz--;
+
+	/* CallReferenceValue */
+	if (sz < 1)
+		return H323_ERROR_BOUND;
+	len = *p++;
+	sz--;
+	if (sz < len)
+		return H323_ERROR_BOUND;
+	p += len;
+	sz -= len;
+
+	/* Message Type: need this byte plus the one inspected right after */
+	if (sz < 2)
+		return H323_ERROR_BOUND;
+	q931->MessageType = *p++;
+	sz--;
+	PRINT("MessageType = %02X\n", q931->MessageType);
+	if (*p & 0x80) {
+		p++;
+		sz--;
+	}
+
+	/* Decode Information Elements */
+	while (sz > 0) {
+		if (*p == 0x7e) {	/* UserUserIE */
+			if (sz < 3)
+				break;
+			p++;
+			len = *p++ << 8;
+			len |= *p++;
+			sz -= 3;
+			/*
+			 * The first content byte is skipped below, so an
+			 * empty element would yield a length of -1.
+			 */
+			if (len < 1 || sz < len)
+				break;
+			p++;
+			len--;
+			return DecodeH323_UserInformation(buf, p, len,
+							  &q931->UUIE);
+		}
+		p++;
+		sz--;
+		if (sz < 1)
+			break;
+		len = *p++;
+		sz--;		/* account for the length byte itself */
+		if (sz < len)
+			break;
+		p += len;
+		sz -= len;
+	}
+
+	PRINT("Q.931 UUIE not found\n");
+
+	return H323_ERROR_BOUND;
+}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
new file mode 100644
index 00000000..f03c2d45
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -0,0 +1,1837 @@
+/*
+ * H.323 connection tracking helper
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ *
+ * Based on the 'brute force' H.323 connection tracking module by
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * For more information, please see http://nath323.sourceforge.net/
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/ip6_route.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_h323.h>
+
+/*
+ * Module parameters.  All three are registered with mode 0600 and carry
+ * the defaults given in their initialisers.
+ */
+static unsigned int default_rrq_ttl __read_mostly = 300;
+module_param(default_rrq_ttl, uint, 0600);
+MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ");
+
+static int gkrouted_only __read_mostly = 1;
+module_param(gkrouted_only, int, 0600);
+MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper");
+
+/*
+ * Consulted by expect_callforwarding() before callforward_do_filter().
+ * NOTE(review): the variable is an int but is registered with the "bool"
+ * parameter type - confirm this is accepted by the target kernel version.
+ */
+static int callforward_filter __read_mostly = 1;
+module_param(callforward_filter, bool, 0600);
+MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
+				     "if both endpoints are on different sides "
+				     "(determined by routing information)");
+
+/*
+ * Hooks for NAT.
+ *
+ * Function pointers that the helpers in this file fetch with
+ * rcu_dereference() and call, when non-NULL and the connection has
+ * IPS_NAT_MASK status bits set, instead of registering expectations
+ * themselves.  Nothing in this part of the file assigns them; presumably
+ * the H.323 NAT module installs them - verify against that module.
+ */
+int (*set_h245_addr_hook) (struct sk_buff *skb,
+			   unsigned char **data, int dataoff,
+			   H245_TransportAddress *taddr,
+			   union nf_inet_addr *addr, __be16 port)
+			   __read_mostly;
+int (*set_h225_addr_hook) (struct sk_buff *skb,
+			   unsigned char **data, int dataoff,
+			   TransportAddress *taddr,
+			   union nf_inet_addr *addr, __be16 port)
+			   __read_mostly;
+int (*set_sig_addr_hook) (struct sk_buff *skb,
+			  struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  unsigned char **data,
+			  TransportAddress *taddr, int count) __read_mostly;
+int (*set_ras_addr_hook) (struct sk_buff *skb,
+			  struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  unsigned char **data,
+			  TransportAddress *taddr, int count) __read_mostly;
+/* Called by expect_rtp_rtcp() with both prepared expectations */
+int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
+			  struct nf_conn *ct,
+			  enum ip_conntrack_info ctinfo,
+			  unsigned char **data, int dataoff,
+			  H245_TransportAddress *taddr,
+			  __be16 port, __be16 rtp_port,
+			  struct nf_conntrack_expect *rtp_exp,
+			  struct nf_conntrack_expect *rtcp_exp) __read_mostly;
+/* Called by expect_t120() */
+int (*nat_t120_hook) (struct sk_buff *skb,
+		      struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo,
+		      unsigned char **data, int dataoff,
+		      H245_TransportAddress *taddr, __be16 port,
+		      struct nf_conntrack_expect *exp) __read_mostly;
+/* Called by expect_h245() */
+int (*nat_h245_hook) (struct sk_buff *skb,
+		      struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo,
+		      unsigned char **data, int dataoff,
+		      TransportAddress *taddr, __be16 port,
+		      struct nf_conntrack_expect *exp) __read_mostly;
+/* Called by expect_callforwarding() */
+int (*nat_callforwarding_hook) (struct sk_buff *skb,
+				struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned char **data, int dataoff,
+				TransportAddress *taddr, __be16 port,
+				struct nf_conntrack_expect *exp) __read_mostly;
+int (*nat_q931_hook) (struct sk_buff *skb,
+		      struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo,
+		      unsigned char **data, TransportAddress *taddr, int idx,
+		      __be16 port, struct nf_conntrack_expect *exp)
+		      __read_mostly;
+
+/* Held (with bottom halves disabled) by h245_help() while it walks the
+ * TPKTs of a packet and decodes them into its static message buffer. */
+static DEFINE_SPINLOCK(nf_h323_lock);
+/* Scratch buffer handed to skb_header_pointer() in get_tpkt_data();
+ * it is allocated outside this part of the file. */
+static char *h323_buffer;
+
+/* Forward declarations: the helpers are referenced by the expectation
+ * code below before their definitions appear. */
+static struct nf_conntrack_helper nf_conntrack_helper_h245;
+static struct nf_conntrack_helper nf_conntrack_helper_q931[];
+static struct nf_conntrack_helper nf_conntrack_helper_ras[];
+
+/****************************************************************************/
+/*
+ * Locate the next TPKT in the TCP payload of @skb.
+ *
+ * Call with *data == NULL to fetch the first TPKT; call again with the
+ * values from the previous call to advance to the following one.
+ *
+ * On success returns 1 and sets *data to the TPKT payload, *datalen to its
+ * length and *dataoff to its offset within the TCP payload.  Returns 0
+ * when there is no (further) complete TPKT.
+ *
+ * info->tpkt_len[dir] remembers, per direction, the payload length that a
+ * header-only segment announced, so the payload arriving in the next
+ * segment can be accepted without its own TPKT header.  It is reset to 0
+ * on every exit path except the one that records such a header.
+ */
+static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
+			 struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			 unsigned char **data, int *datalen, int *dataoff)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	int tcpdatalen;
+	int tcpdataoff;
+	unsigned char *tpkt;
+	int tpktlen;
+	int tpktoff;
+
+	/* Get TCP header */
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+
+	/* Get TCP data offset */
+	tcpdataoff = protoff + th->doff * 4;
+
+	/* Get TCP data length */
+	tcpdatalen = skb->len - tcpdataoff;
+	if (tcpdatalen <= 0)	/* No TCP data */
+		goto clear_out;
+
+	if (*data == NULL) {	/* first TPKT */
+		/* Get first TPKT pointer; h323_buffer is the copy target
+		 * if the payload is not directly addressable */
+		tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
+					  h323_buffer);
+		BUG_ON(tpkt == NULL);
+
+		/* Validate TPKT identifier (bytes 0x03, 0x00) */
+		if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
+			/* Netmeeting sends TPKT header and data separately */
+			if (info->tpkt_len[dir] > 0) {
+				pr_debug("nf_ct_h323: previous packet "
+					 "indicated separate TPKT data of %hu "
+					 "bytes\n", info->tpkt_len[dir]);
+				if (info->tpkt_len[dir] <= tcpdatalen) {
+					/* Yes, there was a TPKT header
+					 * received */
+					*data = tpkt;
+					*datalen = info->tpkt_len[dir];
+					*dataoff = 0;
+					goto out;
+				}
+
+				/* Fragmented TPKT */
+				pr_debug("nf_ct_h323: fragmented TPKT\n");
+				goto clear_out;
+			}
+
+			/* It is not even a TPKT */
+			return 0;
+		}
+		tpktoff = 0;
+	} else {		/* Next TPKT */
+		/* The next header starts right after the previous payload */
+		tpktoff = *dataoff + *datalen;
+		tcpdatalen -= tpktoff;
+		if (tcpdatalen <= 4)	/* No more TPKT */
+			goto clear_out;
+		tpkt = *data + *datalen;
+
+		/* Validate TPKT identifier */
+		if (tpkt[0] != 0x03 || tpkt[1] != 0)
+			goto clear_out;
+	}
+
+	/* Validate TPKT length: big-endian 16 bits that include the
+	 * 4-byte header */
+	tpktlen = tpkt[2] * 256 + tpkt[3];
+	if (tpktlen < 4)
+		goto clear_out;
+	if (tpktlen > tcpdatalen) {
+		if (tcpdatalen == 4) {	/* Separate TPKT header */
+			/* Netmeeting sends TPKT header and data separately */
+			pr_debug("nf_ct_h323: separate TPKT header indicates "
+				 "there will be TPKT data of %hu bytes\n",
+				 tpktlen - 4);
+			/* Remember the announced payload length for the
+			 * next segment in this direction */
+			info->tpkt_len[dir] = tpktlen - 4;
+			return 0;
+		}
+
+		pr_debug("nf_ct_h323: incomplete TPKT (fragmented?)\n");
+		goto clear_out;
+	}
+
+	/* This is the encapsulated data */
+	*data = tpkt + 4;
+	*datalen = tpktlen - 4;
+	*dataoff = tpktoff + 4;
+
+      out:
+	/* Clear TPKT length */
+	info->tpkt_len[dir] = 0;
+	return 1;
+
+      clear_out:
+	/* Nothing usable: forget any pending header and report "no TPKT" */
+	info->tpkt_len[dir] = 0;
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Extract address and port from an H.245 unicast transport address.
+ *
+ * The decoded structure stores an offset into @data at which the raw
+ * address bytes sit, immediately followed by the 16-bit port.  Only the
+ * address family matching the connection (IPv4 or IPv6) is accepted.
+ * @addr is filled with the address and zero padded; @port receives the
+ * port bytes unchanged.
+ *
+ * Returns 1 when an address was extracted, 0 otherwise.
+ */
+static int get_h245_addr(struct nf_conn *ct, const unsigned char *data,
+			 H245_TransportAddress *taddr,
+			 union nf_inet_addr *addr, __be16 *port)
+{
+	const unsigned char *raw;
+	int alen;
+
+	if (taddr->choice != eH245_TransportAddress_unicastAddress)
+		return 0;
+
+	if (taddr->unicastAddress.choice == eUnicastAddress_iPAddress) {
+		if (nf_ct_l3num(ct) != AF_INET)
+			return 0;
+		raw = data + taddr->unicastAddress.iPAddress.network;
+		alen = 4;
+	} else if (taddr->unicastAddress.choice ==
+		   eUnicastAddress_iP6Address) {
+		if (nf_ct_l3num(ct) != AF_INET6)
+			return 0;
+		raw = data + taddr->unicastAddress.iP6Address.network;
+		alen = 16;
+	} else {
+		return 0;
+	}
+
+	/* Address first, zero the unused tail, then the port behind it */
+	memcpy(addr, raw, alen);
+	memset((void *)addr + alen, 0, sizeof(*addr) - alen);
+	memcpy(port, raw + alen, sizeof(__be16));
+
+	return 1;
+}
+
+/****************************************************************************/
+/*
+ * Set up expectations for the RTP/RTCP port pair named by an H.245
+ * transport address.
+ *
+ * Nothing is done (return 0) unless the address can be read, equals the
+ * source address of this direction's tuple, and has a non-zero port.
+ * Otherwise two UDP expectations are prepared: one for the even port and
+ * one for the port directly above it.  They are either handed to the NAT
+ * hook or registered here.
+ *
+ * Returns 0 on success or when nothing was done, -1 on allocation or
+ * registration failure, or the NAT hook's result.
+ */
+static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned char **data, int dataoff,
+			   H245_TransportAddress *taddr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	__be16 port;
+	__be16 rtp_port, rtcp_port;
+	union nf_inet_addr addr;
+	struct nf_conntrack_expect *rtp_exp;
+	struct nf_conntrack_expect *rtcp_exp;
+	typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp = NULL;
+
+	/* Read RTP or RTCP address */
+	if (!get_h245_addr(ct, *data, taddr, &addr, &port))
+		return 0;
+	if (memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)))
+		return 0;
+	if (port == 0)
+		return 0;
+
+	/* RTP uses the even port, RTCP the one right after it */
+	port &= htons(~1);
+	rtp_port = port;
+	rtcp_port = htons(ntohs(port) + 1);
+
+	/* Expectation for RTP */
+	rtp_exp = nf_ct_expect_alloc(ct);
+	if (rtp_exp == NULL)
+		return -1;
+	nf_ct_expect_init(rtp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_UDP, NULL, &rtp_port);
+
+	/* Expectation for RTCP */
+	rtcp_exp = nf_ct_expect_alloc(ct);
+	if (rtcp_exp == NULL) {
+		nf_ct_expect_put(rtp_exp);
+		return -1;
+	}
+	nf_ct_expect_init(rtcp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_UDP, NULL, &rtcp_port);
+
+	/* The hook is only looked up when the two directions disagree
+	 * about this side's address */
+	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+		   &ct->tuplehash[!dir].tuple.dst.u3,
+		   sizeof(ct->tuplehash[dir].tuple.src.u3)))
+		nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook);
+
+	if (nat_rtp_rtcp && (ct->status & IPS_NAT_MASK)) {
+		/* NAT needed */
+		ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+				   taddr, port, rtp_port, rtp_exp, rtcp_exp);
+	} else if (nf_ct_expect_related(rtp_exp) != 0) {
+		/* Conntrack only: RTP could not be registered */
+		ret = -1;
+	} else if (nf_ct_expect_related(rtcp_exp) != 0) {
+		/* RTCP failed: withdraw the RTP half of the pair again */
+		nf_ct_unexpect_related(rtp_exp);
+		ret = -1;
+	} else {
+		pr_debug("nf_ct_h323: expect RTP ");
+		nf_ct_dump_tuple(&rtp_exp->tuple);
+		pr_debug("nf_ct_h323: expect RTCP ");
+		nf_ct_dump_tuple(&rtcp_exp->tuple);
+	}
+
+	nf_ct_expect_put(rtp_exp);
+	nf_ct_expect_put(rtcp_exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+/*
+ * Set up an expectation for the T.120 connection named by an H.245
+ * transport address.
+ *
+ * Nothing is done (return 0) unless the address can be read, equals the
+ * source address of this direction's tuple, and has a non-zero port.
+ * The TCP expectation is flagged NF_CT_EXPECT_PERMANENT and is either
+ * handed to the NAT hook or registered here.
+ *
+ * Returns 0 on success or when nothing was done, -1 on allocation or
+ * registration failure, or the NAT hook's result.
+ */
+static int expect_t120(struct sk_buff *skb,
+		       struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, int dataoff,
+		       H245_TransportAddress *taddr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	__be16 port;
+	union nf_inet_addr addr;
+	struct nf_conntrack_expect *exp;
+	typeof(nat_t120_hook) nat_t120 = NULL;
+
+	/* Read T.120 address */
+	if (!get_h245_addr(ct, *data, taddr, &addr, &port))
+		return 0;
+	if (memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)))
+		return 0;
+	if (port == 0)
+		return 0;
+
+	/* Expectation for T.120 connections */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_TCP, NULL, &port);
+	exp->flags = NF_CT_EXPECT_PERMANENT;	/* Accept multiple channels */
+
+	/* The hook is only looked up when the two directions disagree
+	 * about this side's address */
+	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+		   &ct->tuplehash[!dir].tuple.dst.u3,
+		   sizeof(ct->tuplehash[dir].tuple.src.u3)))
+		nat_t120 = rcu_dereference(nat_t120_hook);
+
+	if (nat_t120 && (ct->status & IPS_NAT_MASK)) {
+		/* NAT needed */
+		ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr,
+			       port, exp);
+	} else if (nf_ct_expect_related(exp) != 0) {
+		/* Conntrack only: registration failed */
+		ret = -1;
+	} else {
+		pr_debug("nf_ct_h323: expect T.120 ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+/*
+ * Handle the media and media-control addresses of a set of H.225.0
+ * logical channel parameters.  Each address that is flagged present in
+ * channel->options is passed to expect_rtp_rtcp().
+ *
+ * Returns 0, or -1 as soon as expect_rtp_rtcp() reports a failure.
+ */
+static int process_h245_channel(struct sk_buff *skb,
+				struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned char **data, int dataoff,
+				H2250LogicalChannelParameters *channel)
+{
+	/* RTP */
+	if ((channel->options & eH2250LogicalChannelParameters_mediaChannel) &&
+	    expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+			    &channel->mediaChannel) < 0)
+		return -1;
+
+	/* RTCP */
+	if ((channel->options &
+	     eH2250LogicalChannelParameters_mediaControlChannel) &&
+	    expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+			    &channel->mediaControlChannel) < 0)
+		return -1;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle an OpenLogicalChannel message.
+ *
+ * Three parts are examined in order: the forward channel parameters, the
+ * optional reverse channel parameters (both passed to
+ * process_h245_channel() when they carry H.225.0 parameters), and the
+ * optional separate stack, which is passed to expect_t120() when the
+ * forward data type is a T.120 application on a separate LAN stack with a
+ * local-area network address.
+ *
+ * Returns 0, or -1 as soon as one of the handlers reports a failure.
+ */
+static int process_olc(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, int dataoff,
+		       OpenLogicalChannel *olc)
+{
+	pr_debug("nf_ct_h323: OpenLogicalChannel\n");
+
+	/* Forward direction */
+	if (olc->forwardLogicalChannelParameters.multiplexParameters.choice ==
+	    eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters
+	    && process_h245_channel(skb, ct, ctinfo, data, dataoff,
+				    &olc->forwardLogicalChannelParameters.
+				    multiplexParameters.
+				    h2250LogicalChannelParameters) < 0)
+		return -1;
+
+	/* Reverse direction, if present */
+	if ((olc->options &
+	     eOpenLogicalChannel_reverseLogicalChannelParameters) &&
+	    (olc->reverseLogicalChannelParameters.options &
+	     eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters)
+	    && (olc->reverseLogicalChannelParameters.multiplexParameters.
+		choice ==
+		eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
+	    && process_h245_channel(skb, ct, ctinfo, data, dataoff,
+				    &olc->reverseLogicalChannelParameters.
+				    multiplexParameters.
+				    h2250LogicalChannelParameters) < 0)
+		return -1;
+
+	/* T.120 on a separate LAN stack */
+	if ((olc->options & eOpenLogicalChannel_separateStack) &&
+	    olc->forwardLogicalChannelParameters.dataType.choice ==
+	    eDataType_data &&
+	    olc->forwardLogicalChannelParameters.dataType.data.application.
+	    choice == eDataApplicationCapability_application_t120 &&
+	    olc->forwardLogicalChannelParameters.dataType.data.application.
+	    t120.choice == eDataProtocolCapability_separateLANStack &&
+	    olc->separateStack.networkAddress.choice ==
+	    eNetworkAccessParameters_networkAddress_localAreaAddress &&
+	    expect_t120(skb, ct, ctinfo, data, dataoff,
+			&olc->separateStack.networkAddress.
+			localAreaAddress) < 0)
+		return -1;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Handle an OpenLogicalChannelAck message.
+ *
+ * Examined in order: the optional reverse channel parameters (passed to
+ * process_h245_channel() when they carry H.225.0 parameters), the optional
+ * forward multiplex ack parameters (media and media-control addresses go
+ * to expect_rtp_rtcp()), and the optional separate stack (passed to
+ * expect_t120() when it has a local-area network address).
+ *
+ * Returns 0, or -1 as soon as one of the handlers reports a failure.
+ */
+static int process_olca(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff,
+			OpenLogicalChannelAck *olca)
+{
+	H2250LogicalChannelAckParameters *ack;
+
+	pr_debug("nf_ct_h323: OpenLogicalChannelAck\n");
+
+	/* Reverse direction, if present */
+	if ((olca->options &
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters) &&
+	    (olca->reverseLogicalChannelParameters.options &
+	     eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters)
+	    && (olca->reverseLogicalChannelParameters.multiplexParameters.
+		choice ==
+		eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)
+	    && process_h245_channel(skb, ct, ctinfo, data, dataoff,
+				    &olca->reverseLogicalChannelParameters.
+				    multiplexParameters.
+				    h2250LogicalChannelParameters) < 0)
+		return -1;
+
+	/* Forward ack parameters, if present */
+	if ((olca->options &
+	     eOpenLogicalChannelAck_forwardMultiplexAckParameters) &&
+	    (olca->forwardMultiplexAckParameters.choice ==
+	     eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters))
+	{
+		ack = &olca->forwardMultiplexAckParameters.
+		    h2250LogicalChannelAckParameters;
+
+		/* RTP */
+		if ((ack->options &
+		     eH2250LogicalChannelAckParameters_mediaChannel) &&
+		    expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+				    &ack->mediaChannel) < 0)
+			return -1;
+
+		/* RTCP */
+		if ((ack->options &
+		     eH2250LogicalChannelAckParameters_mediaControlChannel) &&
+		    expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff,
+				    &ack->mediaControlChannel) < 0)
+			return -1;
+	}
+
+	/* Separate stack with a local-area address */
+	if ((olca->options & eOpenLogicalChannelAck_separateStack) &&
+	    olca->separateStack.networkAddress.choice ==
+	    eNetworkAccessParameters_networkAddress_localAreaAddress &&
+	    expect_t120(skb, ct, ctinfo, data, dataoff,
+			&olca->separateStack.networkAddress.
+			localAreaAddress) < 0)
+		return -1;
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Dispatch a decoded H.245 message.  Only two messages are acted upon:
+ * an OpenLogicalChannel request and an OpenLogicalChannelAck response.
+ * Everything else is merely logged.
+ *
+ * Returns the handler's result for those two messages, 0 otherwise.
+ */
+static int process_h245(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff,
+			MultimediaSystemControlMessage *mscm)
+{
+	if (mscm->choice == eMultimediaSystemControlMessage_request) {
+		if (mscm->request.choice ==
+		    eRequestMessage_openLogicalChannel)
+			return process_olc(skb, ct, ctinfo, data, dataoff,
+					   &mscm->request.openLogicalChannel);
+
+		pr_debug("nf_ct_h323: H.245 Request %d\n",
+			 mscm->request.choice);
+	} else if (mscm->choice == eMultimediaSystemControlMessage_response) {
+		if (mscm->response.choice ==
+		    eResponseMessage_openLogicalChannelAck)
+			return process_olca(skb, ct, ctinfo, data, dataoff,
+					    &mscm->response.
+					    openLogicalChannelAck);
+
+		pr_debug("nf_ct_h323: H.245 Response %d\n",
+			 mscm->response.choice);
+	} else {
+		pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice);
+	}
+
+	return 0;
+}
+
+/****************************************************************************/
+/*
+ * Conntrack helper callback for H.245 connections.
+ *
+ * Packets are only inspected once the connection is established in either
+ * direction.  Every TPKT in the packet is decoded and processed while
+ * nf_h323_lock is held; the decoded message lives in a static buffer.
+ * A decoding error stops the inspection but still accepts the packet; a
+ * processing error drops it.
+ *
+ * Returns NF_ACCEPT or NF_DROP.
+ */
+static int h245_help(struct sk_buff *skb, unsigned int protoff,
+		     struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	static MultimediaSystemControlMessage mscm;
+	unsigned char *data = NULL;
+	int datalen;
+	int dataoff;
+	int err;
+	int verdict = NF_ACCEPT;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (!(ctinfo == IP_CT_ESTABLISHED ||
+	      ctinfo == IP_CT_ESTABLISHED_REPLY))
+		return NF_ACCEPT;
+
+	pr_debug("nf_ct_h245: skblen = %u\n", skb->len);
+
+	spin_lock_bh(&nf_h323_lock);
+
+	/* Walk the TPKTs one by one */
+	for (;;) {
+		if (!get_tpkt_data(skb, protoff, ct, ctinfo,
+				   &data, &datalen, &dataoff))
+			break;
+
+		pr_debug("nf_ct_h245: TPKT len=%d ", datalen);
+		nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+		/* Decode H.245 signal */
+		err = DecodeMultimediaSystemControlMessage(data, datalen,
+							   &mscm);
+		if (err < 0) {
+			pr_debug("nf_ct_h245: decoding error: %s\n",
+				 err == H323_ERROR_BOUND ?
+				 "out of bound" : "out of range");
+			/* We don't drop when decoding error */
+			break;
+		}
+
+		/* Process H.245 signal */
+		if (process_h245(skb, ct, ctinfo, &data, dataoff, &mscm) < 0) {
+			verdict = NF_DROP;
+			break;
+		}
+	}
+
+	spin_unlock_bh(&nf_h323_lock);
+
+	if (verdict == NF_DROP && net_ratelimit())
+		pr_info("nf_ct_h245: packet dropped\n");
+
+	return verdict;
+}
+
+/****************************************************************************/
+/*
+ * Expectation limits for the H.245 helper: room for four expectations per
+ * RTP channel (up to H323_RTP_CHANNEL_MAX channels) plus two for T.120.
+ * The timeout is presumably in seconds - confirm against
+ * struct nf_conntrack_expect_policy.
+ */
+static const struct nf_conntrack_expect_policy h245_exp_policy = {
+	.max_expected	= H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */,
+	.timeout	= 240,
+};
+
+/*
+ * H.245 helper.  expect_h245() attaches it to the expectations it creates,
+ * so it runs on the connections those expectations match.
+ * NOTE(review): the tuple says IPPROTO_UDP while expect_h245() creates
+ * IPPROTO_TCP expectations - confirm whether this tuple is ever used for
+ * matching.
+ */
+static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = {
+	.name			= "H.245",
+	.me			= THIS_MODULE,
+	.tuple.src.l3num	= AF_UNSPEC,
+	.tuple.dst.protonum	= IPPROTO_UDP,
+	.help			= h245_help,
+	.expect_policy		= &h245_exp_policy,
+};
+
+/****************************************************************************/
+/*
+ * Extract address and port from an H.225 transport address.
+ *
+ * The decoded structure stores an offset into @data at which the raw
+ * address bytes sit, immediately followed by the 16-bit port.  Only the
+ * address family matching the connection (IPv4 or IPv6) is accepted.
+ * @addr is filled with the address and zero padded; @port receives the
+ * port bytes unchanged.
+ *
+ * Returns 1 when an address was extracted, 0 otherwise.
+ */
+int get_h225_addr(struct nf_conn *ct, unsigned char *data,
+		  TransportAddress *taddr,
+		  union nf_inet_addr *addr, __be16 *port)
+{
+	const unsigned char *raw;
+	int alen;
+
+	if (taddr->choice == eTransportAddress_ipAddress) {
+		if (nf_ct_l3num(ct) != AF_INET)
+			return 0;
+		raw = data + taddr->ipAddress.ip;
+		alen = 4;
+	} else if (taddr->choice == eTransportAddress_ip6Address) {
+		if (nf_ct_l3num(ct) != AF_INET6)
+			return 0;
+		raw = data + taddr->ip6Address.ip;
+		alen = 16;
+	} else {
+		return 0;
+	}
+
+	/* Address first, zero the unused tail, then the port behind it */
+	memcpy(addr, raw, alen);
+	memset((void *)addr + alen, 0, sizeof(*addr) - alen);
+	memcpy(port, raw + alen, sizeof(__be16));
+
+	return 1;
+}
+
+/****************************************************************************/
+/*
+ * Set up an expectation for the H.245 connection named by an H.225
+ * transport address.
+ *
+ * Nothing is done (return 0) unless the address can be read, equals the
+ * source address of this direction's tuple, and has a non-zero port.
+ * The TCP expectation gets the H.245 helper attached and is either handed
+ * to the NAT hook or registered here.
+ *
+ * Returns 0 on success or when nothing was done, -1 on allocation or
+ * registration failure, or the NAT hook's result.
+ */
+static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, int dataoff,
+		       TransportAddress *taddr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	__be16 port;
+	union nf_inet_addr addr;
+	struct nf_conntrack_expect *exp;
+	typeof(nat_h245_hook) nat_h245 = NULL;
+
+	/* Read h245Address */
+	if (!get_h225_addr(ct, *data, taddr, &addr, &port))
+		return 0;
+	if (memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)))
+		return 0;
+	if (port == 0)
+		return 0;
+
+	/* Expectation for the h245 connection */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_TCP, NULL, &port);
+	exp->helper = &nf_conntrack_helper_h245;
+
+	/* The hook is only looked up when the two directions disagree
+	 * about this side's address */
+	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+		   &ct->tuplehash[!dir].tuple.dst.u3,
+		   sizeof(ct->tuplehash[dir].tuple.src.u3)))
+		nat_h245 = rcu_dereference(nat_h245_hook);
+
+	if (nat_h245 && (ct->status & IPS_NAT_MASK)) {
+		/* NAT needed */
+		ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr,
+			       port, exp);
+	} else if (nf_ct_expect_related(exp) != 0) {
+		/* Conntrack only: registration failed */
+		ret = -1;
+	} else {
+		pr_debug("nf_ct_q931: expect H.245 ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/*
+ * If the calling party is on the same side as the forward-to party,
+ * we don't need to track the second call.
+ *
+ * Looks up a route to @src and a route to @dst for the given address
+ * @family and compares them.
+ *
+ * Returns 1 when both routes use the same gateway and the same output
+ * device, 0 otherwise (including when no afinfo is registered for the
+ * family, when either route lookup fails, or for an unhandled family).
+ *
+ * NOTE(review): both lookups are done in init_net rather than in the
+ * namespace of the connection - confirm this is intended.
+ */
+static int callforward_do_filter(const union nf_inet_addr *src,
+				 const union nf_inet_addr *dst,
+				 u_int8_t family)
+{
+	const struct nf_afinfo *afinfo;
+	int ret = 0;
+
+	/* rcu_read_lock()ed by nf_hook_slow() */
+	afinfo = nf_get_afinfo(family);
+	if (!afinfo)
+		return 0;
+
+	switch (family) {
+	case AF_INET: {
+		struct flowi4 fl1, fl2;
+		struct rtable *rt1, *rt2;
+
+		/* Only the destination address is set in each flow key */
+		memset(&fl1, 0, sizeof(fl1));
+		fl1.daddr = src->ip;
+
+		memset(&fl2, 0, sizeof(fl2));
+		fl2.daddr = dst->ip;
+		/* afinfo->route() returning zero is treated as success;
+		 * each successful lookup is paired with a dst_release() */
+		if (!afinfo->route(&init_net, (struct dst_entry **)&rt1,
+				   flowi4_to_flowi(&fl1), false)) {
+			if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
+					   flowi4_to_flowi(&fl2), false)) {
+				if (rt1->rt_gateway == rt2->rt_gateway &&
+				    rt1->dst.dev  == rt2->dst.dev)
+					ret = 1;
+				dst_release(&rt2->dst);
+			}
+			dst_release(&rt1->dst);
+		}
+		break;
+	}
+#if defined(CONFIG_NF_CONNTRACK_IPV6) || \
+    defined(CONFIG_NF_CONNTRACK_IPV6_MODULE)
+	case AF_INET6: {
+		struct flowi6 fl1, fl2;
+		struct rt6_info *rt1, *rt2;
+
+		/* Same procedure as for IPv4, with IPv6 flow keys */
+		memset(&fl1, 0, sizeof(fl1));
+		ipv6_addr_copy(&fl1.daddr, &src->in6);
+
+		memset(&fl2, 0, sizeof(fl2));
+		ipv6_addr_copy(&fl2.daddr, &dst->in6);
+		if (!afinfo->route(&init_net, (struct dst_entry **)&rt1,
+				   flowi6_to_flowi(&fl1), false)) {
+			if (!afinfo->route(&init_net, (struct dst_entry **)&rt2,
+					   flowi6_to_flowi(&fl2), false)) {
+				if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
+					    sizeof(rt1->rt6i_gateway)) &&
+				    rt1->dst.dev == rt2->dst.dev)
+					ret = 1;
+				dst_release(&rt2->dst);
+			}
+			dst_release(&rt1->dst);
+		}
+		break;
+	}
+#endif
+	}
+	return ret;
+
+}
+
+/****************************************************************************/
+/*
+ * Set up an expectation for the second leg of a forwarded call.
+ *
+ * Nothing is done (return 0) when the alternative address cannot be read
+ * or has port 0, or when the callforward_filter parameter is set and
+ * callforward_do_filter() reports that the forward-to address and the
+ * reply tuple's source address are on the same side.  Otherwise a TCP
+ * expectation towards the forward-to address is prepared with the Q.931
+ * helper attached, and either handed to the NAT hook or registered here.
+ *
+ * Returns 0 on success or when nothing was done, -1 on allocation or
+ * registration failure, or the NAT hook's result.
+ */
+static int expect_callforwarding(struct sk_buff *skb,
+				 struct nf_conn *ct,
+				 enum ip_conntrack_info ctinfo,
+				 unsigned char **data, int dataoff,
+				 TransportAddress *taddr)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	__be16 port;
+	union nf_inet_addr addr;
+	struct nf_conntrack_expect *exp;
+	typeof(nat_callforwarding_hook) nat_callforwarding = NULL;
+
+	/* Read alternativeAddress */
+	if (!get_h225_addr(ct, *data, taddr, &addr, &port))
+		return 0;
+	if (port == 0)
+		return 0;
+
+	/* If the calling party is on the same side of the forward-to party,
+	 * we don't need to track the second call */
+	if (callforward_filter) {
+		if (callforward_do_filter(&addr,
+					  &ct->tuplehash[!dir].tuple.src.u3,
+					  nf_ct_l3num(ct))) {
+			pr_debug("nf_ct_q931: Call Forwarding not tracked\n");
+			return 0;
+		}
+	}
+
+	/* Expectation for the second call leg */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3, &addr,
+			  IPPROTO_TCP, NULL, &port);
+	exp->helper = nf_conntrack_helper_q931;
+
+	/* The hook is only looked up when the two directions disagree
+	 * about this side's address */
+	if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
+		   &ct->tuplehash[!dir].tuple.dst.u3,
+		   sizeof(ct->tuplehash[dir].tuple.src.u3)))
+		nat_callforwarding = rcu_dereference(nat_callforwarding_hook);
+
+	if (nat_callforwarding && (ct->status & IPS_NAT_MASK)) {
+		/* Need NAT */
+		ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff,
+					 taddr, port, exp);
+	} else if (nf_ct_expect_related(exp) != 0) {
+		/* Conntrack only: registration failed */
+		ret = -1;
+	} else {
+		pr_debug("nf_ct_q931: expect Call Forwarding ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/****************************************************************************/
+/*
+ * Handle a Q.931 Setup message.
+ *
+ * In order: expect the H.245 connection if an h245Address is present;
+ * when the set_h225_addr hook is installed and the connection has NAT
+ * status bits set, rewrite destCallSignalAddress and
+ * sourceCallSignalAddress if they differ from the reply tuple's source and
+ * destination address respectively; finally run process_olc() on every
+ * fastStart item.
+ *
+ * Returns 0, or -1 as soon as one of these steps reports a failure.
+ */
+static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned char **data, int dataoff,
+			 Setup_UUIE *setup)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conntrack_tuple *other = &ct->tuplehash[!dir].tuple;
+	int i;
+	__be16 port;
+	union nf_inet_addr addr;
+	typeof(set_h225_addr_hook) set_h225_addr;
+
+	pr_debug("nf_ct_q931: Setup\n");
+
+	if ((setup->options & eSetup_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, data, dataoff,
+			&setup->h245Address) < 0)
+		return -1;
+
+	set_h225_addr = rcu_dereference(set_h225_addr_hook);
+
+	/* Destination address: align with the reply tuple's source */
+	if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
+	    (set_h225_addr) && ct->status & IPS_NAT_MASK &&
+	    get_h225_addr(ct, *data, &setup->destCallSignalAddress,
+			  &addr, &port) &&
+	    memcmp(&addr, &other->src.u3, sizeof(addr))) {
+		pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
+			 &addr, ntohs(port), &other->src.u3,
+			 ntohs(other->src.u.tcp.port));
+		if (set_h225_addr(skb, data, dataoff,
+				  &setup->destCallSignalAddress,
+				  &other->src.u3,
+				  other->src.u.tcp.port) < 0)
+			return -1;
+	}
+
+	/* Source address: align with the reply tuple's destination */
+	if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
+	    (set_h225_addr) && ct->status & IPS_NAT_MASK &&
+	    get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
+			  &addr, &port) &&
+	    memcmp(&addr, &other->dst.u3, sizeof(addr))) {
+		pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
+			 &addr, ntohs(port), &other->dst.u3,
+			 ntohs(other->dst.u.tcp.port));
+		if (set_h225_addr(skb, data, dataoff,
+				  &setup->sourceCallSignalAddress,
+				  &other->dst.u3,
+				  other->dst.u.tcp.port) < 0)
+			return -1;
+	}
+
+	/* Fast start: each item is an OpenLogicalChannel */
+	if (setup->options & eSetup_UUIE_fastStart) {
+		for (i = 0; i < setup->fastStart.count; i++) {
+			if (process_olc(skb, ct, ctinfo, data, dataoff,
+					&setup->fastStart.item[i]) < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/**** CallProceeding UUIE: optional H.245 expect, then fastStart; 0 or -1 ****/
+static int process_callproceeding(struct sk_buff *skb,
+				  struct nf_conn *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned char **data, int dataoff,
+				  CallProceeding_UUIE *callproc)
+{
+	int idx;
+
+	pr_debug("nf_ct_q931: CallProceeding\n");
+
+	/* Optional h245Address: a failing expect_h245() aborts the message */
+	if ((callproc->options & eCallProceeding_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, data, dataoff,
+			&callproc->h245Address) < 0)
+		return -1;
+
+	/* Without fastStart there is nothing left to inspect */
+	if (!(callproc->options & eCallProceeding_UUIE_fastStart))
+		return 0;
+
+	/* Hand every fastStart element to process_olc(); first error wins */
+	for (idx = 0; idx < callproc->fastStart.count; idx++) {
+		if (process_olc(skb, ct, ctinfo, data, dataoff,
+				&callproc->fastStart.item[idx]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**** Connect UUIE: optional H.245 expect, then fastStart; 0 or -1 ****/
+static int process_connect(struct sk_buff *skb, struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned char **data, int dataoff,
+			   Connect_UUIE *connect)
+{
+	int idx;
+
+	pr_debug("nf_ct_q931: Connect\n");
+
+	/* Optional h245Address: a failing expect_h245() aborts the message */
+	if ((connect->options & eConnect_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, data, dataoff,
+			&connect->h245Address) < 0)
+		return -1;
+
+	/* Without fastStart there is nothing left to inspect */
+	if (!(connect->options & eConnect_UUIE_fastStart))
+		return 0;
+
+	/* Hand every fastStart element to process_olc(); first error wins */
+	for (idx = 0; idx < connect->fastStart.count; idx++) {
+		if (process_olc(skb, ct, ctinfo, data, dataoff,
+				&connect->fastStart.item[idx]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**** Alerting UUIE: optional H.245 expect, then fastStart; 0 or -1 ****/
+static int process_alerting(struct sk_buff *skb, struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned char **data, int dataoff,
+			    Alerting_UUIE *alert)
+{
+	int idx;
+
+	pr_debug("nf_ct_q931: Alerting\n");
+
+	/* Optional h245Address: a failing expect_h245() aborts the message */
+	if ((alert->options & eAlerting_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, data, dataoff,
+			&alert->h245Address) < 0)
+		return -1;
+
+	/* Without fastStart there is nothing left to inspect */
+	if (!(alert->options & eAlerting_UUIE_fastStart))
+		return 0;
+
+	/* Hand every fastStart element to process_olc(); first error wins */
+	for (idx = 0; idx < alert->fastStart.count; idx++) {
+		if (process_olc(skb, ct, ctinfo, data, dataoff,
+				&alert->fastStart.item[idx]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**** Facility UUIE: call-forwarding expect, else H.245 + fastStart; 0 or -1 ****/
+static int process_facility(struct sk_buff *skb, struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned char **data, int dataoff,
+			    Facility_UUIE *facility)
+{
+	int idx;
+
+	pr_debug("nf_ct_q931: Facility\n");
+
+	/* A forwarded call is handled through its alternativeAddress only */
+	if (facility->reason.choice == eFacilityReason_callForwarded) {
+		if (!(facility->options & eFacility_UUIE_alternativeAddress))
+			return 0;
+		return expect_callforwarding(skb, ct, ctinfo, data,
+					     dataoff,
+					     &facility->alternativeAddress);
+	}
+
+	/* Optional h245Address: a failing expect_h245() aborts the message */
+	if ((facility->options & eFacility_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, data, dataoff,
+			&facility->h245Address) < 0)
+		return -1;
+
+	/* Without fastStart there is nothing left to inspect */
+	if (!(facility->options & eFacility_UUIE_fastStart))
+		return 0;
+
+	/* Hand every fastStart element to process_olc(); first error wins */
+	for (idx = 0; idx < facility->fastStart.count; idx++) {
+		if (process_olc(skb, ct, ctinfo, data, dataoff,
+				&facility->fastStart.item[idx]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**** Progress UUIE: optional H.245 expect, then fastStart; 0 or -1 ****/
+static int process_progress(struct sk_buff *skb, struct nf_conn *ct,
+			    enum ip_conntrack_info ctinfo,
+			    unsigned char **data, int dataoff,
+			    Progress_UUIE *progress)
+{
+	int idx;
+
+	pr_debug("nf_ct_q931: Progress\n");
+
+	/* Optional h245Address: a failing expect_h245() aborts the message */
+	if ((progress->options & eProgress_UUIE_h245Address) &&
+	    expect_h245(skb, ct, ctinfo, data, dataoff,
+			&progress->h245Address) < 0)
+		return -1;
+
+	/* Without fastStart there is nothing left to inspect */
+	if (!(progress->options & eProgress_UUIE_fastStart))
+		return 0;
+
+	/* Hand every fastStart element to process_olc(); first error wins */
+	for (idx = 0; idx < progress->fastStart.count; idx++) {
+		if (process_olc(skb, ct, ctinfo, data, dataoff,
+				&progress->fastStart.item[idx]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**** Dispatch a decoded Q.931 message body, then any h245Control items ****/
+static int process_q931(struct sk_buff *skb, struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned char **data, int dataoff, Q931 *q931)
+{
+	H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu;
+	typeof(pdu->h323_message_body) *body = &pdu->h323_message_body;
+	int rc = 0;
+	int n;
+
+	switch (body->choice) {
+	case eH323_UU_PDU_h323_message_body_setup:
+		rc = process_setup(skb, ct, ctinfo, data, dataoff,
+				   &body->setup);
+		break;
+	case eH323_UU_PDU_h323_message_body_callProceeding:
+		rc = process_callproceeding(skb, ct, ctinfo, data, dataoff,
+					    &body->callProceeding);
+		break;
+	case eH323_UU_PDU_h323_message_body_connect:
+		rc = process_connect(skb, ct, ctinfo, data, dataoff,
+				     &body->connect);
+		break;
+	case eH323_UU_PDU_h323_message_body_alerting:
+		rc = process_alerting(skb, ct, ctinfo, data, dataoff,
+				      &body->alerting);
+		break;
+	case eH323_UU_PDU_h323_message_body_facility:
+		rc = process_facility(skb, ct, ctinfo, data, dataoff,
+				      &body->facility);
+		break;
+	case eH323_UU_PDU_h323_message_body_progress:
+		rc = process_progress(skb, ct, ctinfo, data, dataoff,
+				      &body->progress);
+		break;
+	default:
+		pr_debug("nf_ct_q931: Q.931 signal %d\n", body->choice);
+		break;
+	}
+
+	if (rc < 0)
+		return -1;
+
+	/* No h245Control items carried in this PDU: done */
+	if (!(pdu->options & eH323_UU_PDU_h245Control))
+		return 0;
+
+	for (n = 0; n < pdu->h245Control.count; n++) {
+		if (process_h245(skb, ct, ctinfo, data, dataoff,
+				 &pdu->h245Control.item[n]) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**** Helper entry for Q.931 over TCP: decode every TPKT and process it ****/
+static int q931_help(struct sk_buff *skb, unsigned int protoff,
+		     struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	static Q931 q931;	/* one shared decode buffer, used under nf_h323_lock */
+	unsigned char *data = NULL;
+	int datalen;
+	int dataoff;
+	int ret;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	pr_debug("nf_ct_q931: skblen = %u\n", skb->len);
+
+	spin_lock_bh(&nf_h323_lock);
+
+	/* Process each TPKT */
+	while (get_tpkt_data(skb, protoff, ct, ctinfo,
+			     &data, &datalen, &dataoff)) {
+		pr_debug("nf_ct_q931: TPKT len=%d ", datalen);
+		nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+		/* Decode Q.931 signal */
+		ret = DecodeQ931(data, datalen, &q931);
+		if (ret < 0) {
+			pr_debug("nf_ct_q931: decoding error: %s\n",
+				 ret == H323_ERROR_BOUND ?
+				 "out of bound" : "out of range");
+			/* A decoding error is no reason to drop: stop parsing */
+			break;
+		}
+
+		/* Process Q.931 signal */
+		if (process_q931(skb, ct, ctinfo, &data, dataoff, &q931) < 0)
+			goto drop;
+	}
+	/* Accept unless a handler failed; both exits release nf_h323_lock */
+	spin_unlock_bh(&nf_h323_lock);
+	return NF_ACCEPT;
+
+      drop:
+	spin_unlock_bh(&nf_h323_lock);
+	if (net_ratelimit())
+		pr_info("nf_ct_q931: packet dropped\n");
+	return NF_DROP;
+}
+
+/**** Expectation policy referenced by both Q.931 helper entries below ****/
+static const struct nf_conntrack_expect_policy q931_exp_policy = {
+	/* T.120 and H.245 */
+	.max_expected		= H323_RTP_CHANNEL_MAX * 4 + 4,
+	.timeout		= 240,	/* presumably seconds - TODO confirm */
+};
+/* Q.931 helpers: one entry per address family, both on TCP port Q931_PORT */
+static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = {
+	{	/* [0]: AF_INET */
+		.name			= "Q.931",
+		.me			= THIS_MODULE,
+		.tuple.src.l3num	= AF_INET,
+		.tuple.src.u.tcp.port	= cpu_to_be16(Q931_PORT),
+		.tuple.dst.protonum	= IPPROTO_TCP,
+		.help			= q931_help,
+		.expect_policy		= &q931_exp_policy,
+	},
+	{	/* [1]: AF_INET6, otherwise identical to [0] */
+		.name			= "Q.931",
+		.me			= THIS_MODULE,
+		.tuple.src.l3num	= AF_INET6,
+		.tuple.src.u.tcp.port	= cpu_to_be16(Q931_PORT),
+		.tuple.dst.protonum	= IPPROTO_TCP,
+		.help			= q931_help,
+		.expect_policy		= &q931_exp_policy,
+	},
+};
+
+/**** UDP payload pointer (length in *datalen), or NULL if there is none ****/
+static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
+				   int *datalen)
+{
+	struct udphdr hdr_copy;
+	int payload_off = protoff + sizeof(hdr_copy);
+
+	/* The UDP header itself has to be readable */
+	if (!skb_header_pointer(skb, protoff, sizeof(hdr_copy), &hdr_copy))
+		return NULL;
+	/* Nothing behind the header: no message to decode */
+	if (payload_off >= skb->len)
+		return NULL;
+
+	*datalen = skb->len - payload_off;
+	return skb_header_pointer(skb, payload_off, *datalen, h323_buffer);
+}
+
+/**** Find the TCP expectation to addr:port owned by ct; NULL if none ****/
+static struct nf_conntrack_expect *find_expect(struct nf_conn *ct,
+					       union nf_inet_addr *addr,
+					       __be16 port)
+{
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple tuple;
+
+	memset(&tuple, 0, sizeof(tuple));	/* any source, no stack garbage */
+	tuple.src.l3num = nf_ct_l3num(ct);	/* was left uninitialised before */
+	memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3));
+	tuple.dst.u.tcp.port = port;
+	tuple.dst.protonum = IPPROTO_TCP;
+	/* process_rcf() calls this with nf_conntrack_lock held */
+	exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
+	if (exp && exp->master == ct)
+		return exp;
+	return NULL;
+}
+
+/**** Re-arm exp's timer `timeout` seconds from now; 1 if re-armed, else 0 ****/
+static int set_expect_timeout(struct nf_conntrack_expect *exp,
+			      unsigned timeout)
+{
+	/* Only restart a timer that del_timer() reports it actually removed */
+	if (exp && del_timer(&exp->timeout)) {
+		exp->timeout.expires = jiffies + timeout * HZ;
+		add_timer(&exp->timeout);
+		return 1;
+	}
+	return 0;
+}
+
+/**** Set up the Q.931 expectation for an RRQ's callSignalAddress list ****/
+static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data,
+		       TransportAddress *taddr, int count)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int ret = 0;
+	int i;
+	__be16 port;
+	union nf_inet_addr addr;
+	struct nf_conntrack_expect *exp;
+	typeof(nat_q931_hook) nat_q931;
+
+	/* Look for the first address equal to this packet's source, port != 0 */
+	for (i = 0; i < count; i++) {
+		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
+		    memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3,
+			   sizeof(addr)) == 0 && port != 0)
+			break;
+	}
+
+	if (i >= count)		/* Not found */
+		return 0;
+
+	/* Create expect for Q.931 */
+	if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  gkrouted_only ? /* only accept calls from GK? */
+				&ct->tuplehash[!dir].tuple.src.u3 : NULL,
+			  &ct->tuplehash[!dir].tuple.dst.u3,
+			  IPPROTO_TCP, NULL, &port);
+	exp->helper = nf_conntrack_helper_q931;
+	exp->flags = NF_CT_EXPECT_PERMANENT;	/* Accept multiple calls */
+
+	nat_q931 = rcu_dereference(nat_q931_hook);
+	if (nat_q931 && ct->status & IPS_NAT_MASK) {	/* Need NAT */
+		ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp);
+	} else {		/* Conntrack only */
+		if (nf_ct_expect_related(exp) == 0) {
+			pr_debug("nf_ct_ras: expect Q.931 ");
+			nf_ct_dump_tuple(&exp->tuple);
+
+			/* Save port for looking up expect in processing RCF */
+			info->sig_port[dir] = port;
+		} else
+			ret = -1;
+	}
+	/* Our reference to exp is released on both paths */
+	nf_ct_expect_put(exp);
+
+	return ret;
+}
+
+/**** GatekeeperRequest: under NAT pass rasAddress to the set_ras_addr hook ****/
+static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, GatekeeperRequest *grq)
+{
+	typeof(set_ras_addr_hook) rewrite_ras;
+
+	pr_debug("nf_ct_ras: GRQ\n");
+
+	rewrite_ras = rcu_dereference(set_ras_addr_hook);
+	if (!rewrite_ras || !(ct->status & IPS_NAT_MASK))
+		return 0;	/* no hook installed, or not NATed */
+
+	return rewrite_ras(skb, ct, ctinfo, data, &grq->rasAddress, 1);
+}
+
+/**** GatekeeperConfirm: expect RAS on rasAddress unless it is the sender ****/
+static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, GatekeeperConfirm *gcf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr gk_addr;
+	__be16 gk_port;
+	struct nf_conntrack_expect *exp;
+	int rc;
+
+	pr_debug("nf_ct_ras: GCF\n");
+
+	if (!get_h225_addr(ct, *data, &gcf->rasAddress, &gk_addr, &gk_port))
+		return 0;
+
+	/* Registration port is the same as discovery port */
+	if (gk_port == ct->tuplehash[dir].tuple.src.u.udp.port &&
+	    memcmp(&gk_addr, &ct->tuplehash[dir].tuple.src.u3,
+		   sizeof(gk_addr)) == 0)
+		return 0;
+
+	/* Avoid RAS expectation loops. A GCF is never expected. */
+	if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+		return 0;
+
+	/* Need new expect */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3, &gk_addr,
+			  IPPROTO_UDP, NULL, &gk_port);
+	exp->helper = nf_conntrack_helper_ras;
+
+	rc = nf_ct_expect_related(exp) ? -1 : 0;
+	if (rc == 0) {
+		pr_debug("nf_ct_ras: expect RAS ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+	nf_ct_expect_put(exp);
+	return rc;
+}
+
+/**** RegistrationRequest: expect Q.931, NAT the RAS addresses, store TTL ****/
+static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, RegistrationRequest *rrq)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	typeof(set_ras_addr_hook) rewrite_ras;
+
+	pr_debug("nf_ct_ras: RRQ\n");
+
+	/* Calls will arrive on one of the announced callSignalAddress items */
+	if (expect_q931(skb, ct, ctinfo, data,
+			rrq->callSignalAddress.item,
+			rrq->callSignalAddress.count) < 0)
+		return -1;
+
+	/* With the hook installed and a NATed conntrack, pass on rasAddress */
+	rewrite_ras = rcu_dereference(set_ras_addr_hook);
+	if (rewrite_ras && (ct->status & IPS_NAT_MASK) &&
+	    rewrite_ras(skb, ct, ctinfo, data, rrq->rasAddress.item,
+			rrq->rasAddress.count) < 0)
+		return -1;
+
+	/* Store the requested timeToLive, or the default when none is given */
+	if (!(rrq->options & eRegistrationRequest_timeToLive)) {
+		info->timeout = default_rrq_ttl;
+		return 0;
+	}
+
+	pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive);
+	info->timeout = rrq->timeToLive;
+
+	return 0;
+}
+
+/**** RegistrationConfirm: NAT fix-up, then apply the TTL to ct and its expect ****/
+static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, RegistrationConfirm *rcf)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	int ret;
+	struct nf_conntrack_expect *exp;
+	typeof(set_sig_addr_hook) set_sig_addr;
+
+	pr_debug("nf_ct_ras: RCF\n");
+
+	set_sig_addr = rcu_dereference(set_sig_addr_hook);
+	if (set_sig_addr && ct->status & IPS_NAT_MASK) {
+		ret = set_sig_addr(skb, ct, ctinfo, data,
+					rcf->callSignalAddress.item,
+					rcf->callSignalAddress.count);
+		if (ret < 0)
+			return -1;
+	}
+	/* A TTL carried in the RCF replaces the value already in info->timeout */
+	if (rcf->options & eRegistrationConfirm_timeToLive) {
+		pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive);
+		info->timeout = rcf->timeToLive;
+	}
+	/* A zero timeout leaves the conntrack and expectation timers untouched */
+	if (info->timeout > 0) {
+		pr_debug("nf_ct_ras: set RAS connection timeout to "
+			 "%u seconds\n", info->timeout);
+		nf_ct_refresh(ct, skb, info->timeout * HZ);
+
+		/* Set expect timeout; lookup and update run under nf_conntrack_lock */
+		spin_lock_bh(&nf_conntrack_lock);
+		exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3,
+				  info->sig_port[!dir]);
+		if (exp) {
+			pr_debug("nf_ct_ras: set Q.931 expect "
+				 "timeout to %u seconds for",
+				 info->timeout);
+			nf_ct_dump_tuple(&exp->tuple);
+			set_expect_timeout(exp, info->timeout);
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
+	}
+
+	return 0;
+}
+
+/**** UnregistrationRequest: NAT fix-up, drop expectations, shorten timeout ****/
+static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, UnregistrationRequest *urq)
+{
+	struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	typeof(set_sig_addr_hook) rewrite_sig;
+	int dir = CTINFO2DIR(ctinfo);
+
+	pr_debug("nf_ct_ras: URQ\n");
+
+	/* With the hook installed and a NATed conntrack, pass on the addresses */
+	rewrite_sig = rcu_dereference(set_sig_addr_hook);
+	if (rewrite_sig && (ct->status & IPS_NAT_MASK)) {
+		if (rewrite_sig(skb, ct, ctinfo, data,
+				urq->callSignalAddress.item,
+				urq->callSignalAddress.count) < 0)
+			return -1;
+	}
+
+	/* Clear old expect */
+	nf_ct_remove_expectations(ct);
+	/* ... and forget the signalling port saved for either direction */
+	info->sig_port[!dir] = 0;
+	info->sig_port[dir] = 0;
+
+	/* Give it 30 seconds for UCF or URJ */
+	nf_ct_refresh(ct, skb, 30 * HZ);
+
+	return 0;
+}
+
+/**** AdmissionRequest: under NAT rewrite the dest or src call-signal address ****/
+static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, AdmissionRequest *arq)
+{
+	const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
+	int dir = CTINFO2DIR(ctinfo);
+	__be16 port;
+	union nf_inet_addr addr;
+	typeof(set_h225_addr_hook) set_h225_addr;
+
+	pr_debug("nf_ct_ras: ARQ\n");
+	/* Returns the hook's result when a rewrite is attempted, otherwise 0 */
+	set_h225_addr = rcu_dereference(set_h225_addr_hook);
+	if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
+	    get_h225_addr(ct, *data, &arq->destCallSignalAddress,
+			  &addr, &port) &&
+	    !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
+	    port == info->sig_port[dir] &&
+	    set_h225_addr && ct->status & IPS_NAT_MASK) {
+		/* Answering ARQ: dest is the sender's own address and saved port */
+		return set_h225_addr(skb, data, 0,
+				     &arq->destCallSignalAddress,
+				     &ct->tuplehash[!dir].tuple.dst.u3,
+				     info->sig_port[!dir]);
+	}
+	/* Otherwise a srcCallSignalAddress equal to the sender's address is rewritten */
+	if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
+	    get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
+			  &addr, &port) &&
+	    !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
+	    set_h225_addr && ct->status & IPS_NAT_MASK) {
+		/* Calling ARQ */
+		return set_h225_addr(skb, data, 0,
+				     &arq->srcCallSignalAddress,
+				     &ct->tuplehash[!dir].tuple.dst.u3,
+				     port);
+	}
+
+	return 0;
+}
+
+/**** AdmissionConfirm: NAT an answering ACF, else expect Q.931 to the dest ****/
+static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, AdmissionConfirm *acf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr dst_addr;
+	__be16 dst_port;
+	struct nf_conntrack_expect *exp;
+	typeof(set_sig_addr_hook) rewrite_sig;
+	int rc;
+
+	pr_debug("nf_ct_ras: ACF\n");
+
+	if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress,
+			   &dst_addr, &dst_port))
+		return 0;
+
+	if (memcmp(&dst_addr, &ct->tuplehash[dir].tuple.dst.u3,
+		   sizeof(dst_addr)) == 0) {
+		/* Answering ACF */
+		rewrite_sig = rcu_dereference(set_sig_addr_hook);
+		if (!rewrite_sig || !(ct->status & IPS_NAT_MASK))
+			return 0;
+		return rewrite_sig(skb, ct, ctinfo, data,
+				   &acf->destCallSignalAddress, 1);
+	}
+
+	/* Need new expect */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3, &dst_addr,
+			  IPPROTO_TCP, NULL, &dst_port);
+	exp->helper = nf_conntrack_helper_q931;
+	exp->flags = NF_CT_EXPECT_PERMANENT;
+
+	rc = nf_ct_expect_related(exp) ? -1 : 0;
+	if (rc == 0) {
+		pr_debug("nf_ct_ras: expect Q.931 ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+	nf_ct_expect_put(exp);
+	return rc;
+}
+
+/**** LocationRequest: under NAT pass replyAddress to the set_ras_addr hook ****/
+static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, LocationRequest *lrq)
+{
+	typeof(set_ras_addr_hook) rewrite_ras;
+
+	pr_debug("nf_ct_ras: LRQ\n");
+
+	rewrite_ras = rcu_dereference(set_ras_addr_hook);
+	if (!rewrite_ras || !(ct->status & IPS_NAT_MASK))
+		return 0;	/* no hook installed, or not NATed */
+
+	return rewrite_ras(skb, ct, ctinfo, data, &lrq->replyAddress, 1);
+}
+
+/**** LocationConfirm: expect a Q.931 connection to callSignalAddress ****/
+static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, LocationConfirm *lcf)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr cs_addr;
+	__be16 cs_port;
+	struct nf_conntrack_expect *exp;
+	int rc;
+
+	pr_debug("nf_ct_ras: LCF\n");
+
+	if (!get_h225_addr(ct, *data, &lcf->callSignalAddress,
+			   &cs_addr, &cs_port))
+		return 0;
+
+	/* Need new expect for call signal */
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		return -1;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &ct->tuplehash[!dir].tuple.src.u3, &cs_addr,
+			  IPPROTO_TCP, NULL, &cs_port);
+	exp->helper = nf_conntrack_helper_q931;
+	exp->flags = NF_CT_EXPECT_PERMANENT;
+
+	rc = nf_ct_expect_related(exp) ? -1 : 0;
+	if (rc == 0) {
+		pr_debug("nf_ct_ras: expect Q.931 ");
+		nf_ct_dump_tuple(&exp->tuple);
+	}
+	nf_ct_expect_put(exp);
+
+	/* Ignore rasAddress */
+
+	return rc;
+}
+
+/**** InfoRequestResponse: under NAT pass RAS and call-signal addresses on ****/
+static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, InfoRequestResponse *irr)
+{
+	typeof(set_ras_addr_hook) rewrite_ras;
+	typeof(set_sig_addr_hook) rewrite_sig;
+	int rc = 0;
+
+	pr_debug("nf_ct_ras: IRR\n");
+
+	/* rasAddress goes to the set_ras_addr hook first */
+	rewrite_ras = rcu_dereference(set_ras_addr_hook);
+	if (rewrite_ras && (ct->status & IPS_NAT_MASK))
+		rc = rewrite_ras(skb, ct, ctinfo, data,
+				 &irr->rasAddress, 1);
+	if (rc < 0)
+		return -1;
+
+	/* then the callSignalAddress list goes to the set_sig_addr hook */
+	rewrite_sig = rcu_dereference(set_sig_addr_hook);
+	if (rewrite_sig && (ct->status & IPS_NAT_MASK))
+		rc = rewrite_sig(skb, ct, ctinfo, data,
+				 irr->callSignalAddress.item,
+				 irr->callSignalAddress.count);
+	if (rc < 0)
+		return -1;
+
+	return 0;
+}
+
+/**** Dispatch a decoded RAS message to its handler; returns the handler's result ****/
+static int process_ras(struct sk_buff *skb, struct nf_conn *ct,
+		       enum ip_conntrack_info ctinfo,
+		       unsigned char **data, RasMessage *ras)
+{
+	switch (ras->choice) {
+	case eRasMessage_gatekeeperRequest:
+		return process_grq(skb, ct, ctinfo, data,
+				   &ras->gatekeeperRequest);
+	case eRasMessage_gatekeeperConfirm:
+		return process_gcf(skb, ct, ctinfo, data,
+				   &ras->gatekeeperConfirm);
+	case eRasMessage_registrationRequest:
+		return process_rrq(skb, ct, ctinfo, data,
+				   &ras->registrationRequest);
+	case eRasMessage_registrationConfirm:
+		return process_rcf(skb, ct, ctinfo, data,
+				   &ras->registrationConfirm);
+	case eRasMessage_unregistrationRequest:
+		return process_urq(skb, ct, ctinfo, data,
+				   &ras->unregistrationRequest);
+	case eRasMessage_admissionRequest:
+		return process_arq(skb, ct, ctinfo, data,
+				   &ras->admissionRequest);
+	case eRasMessage_admissionConfirm:
+		return process_acf(skb, ct, ctinfo, data,
+				   &ras->admissionConfirm);
+	case eRasMessage_locationRequest:
+		return process_lrq(skb, ct, ctinfo, data,
+				   &ras->locationRequest);
+	case eRasMessage_locationConfirm:
+		return process_lcf(skb, ct, ctinfo, data,
+				   &ras->locationConfirm);
+	case eRasMessage_infoRequestResponse:
+		return process_irr(skb, ct, ctinfo, data,
+				   &ras->infoRequestResponse);
+	default:
+		pr_debug("nf_ct_ras: RAS message %d\n", ras->choice);
+		break;
+	}
+	/* Message types without a handler are only logged and yield 0 */
+	return 0;
+}
+
+/**** Helper entry for RAS over UDP: decode one message and process it ****/
+static int ras_help(struct sk_buff *skb, unsigned int protoff,
+		    struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	static RasMessage ras;	/* one shared decode buffer, used under nf_h323_lock */
+	unsigned char *data;
+	int datalen = 0;
+	int ret;
+
+	pr_debug("nf_ct_ras: skblen = %u\n", skb->len);
+
+	spin_lock_bh(&nf_h323_lock);
+
+	/* Get UDP data */
+	data = get_udp_data(skb, protoff, &datalen);
+	if (data == NULL)
+		goto accept;
+	pr_debug("nf_ct_ras: RAS message len=%d ", datalen);
+	nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple);
+
+	/* Decode RAS message */
+	ret = DecodeRasMessage(data, datalen, &ras);
+	if (ret < 0) {
+		pr_debug("nf_ct_ras: decoding error: %s\n",
+			 ret == H323_ERROR_BOUND ?
+			 "out of bound" : "out of range");
+		goto accept;
+	}
+	/* Only a failing handler drops; no payload or a decode error accepts */
+	/* Process RAS message */
+	if (process_ras(skb, ct, ctinfo, &data, &ras) < 0)
+		goto drop;
+
+      accept:
+	spin_unlock_bh(&nf_h323_lock);
+	return NF_ACCEPT;
+
+      drop:
+	spin_unlock_bh(&nf_h323_lock);
+	if (net_ratelimit())
+		pr_info("nf_ct_ras: packet dropped\n");
+	return NF_DROP;
+}
+
+/**** Expectation policy referenced by both RAS helper entries below ****/
+static const struct nf_conntrack_expect_policy ras_exp_policy = {
+	.max_expected		= 32,
+	.timeout		= 240,	/* presumably seconds - TODO confirm */
+};
+/* RAS helpers: one entry per address family, both on UDP port RAS_PORT */
+static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = {
+	{	/* [0]: AF_INET */
+		.name			= "RAS",
+		.me			= THIS_MODULE,
+		.tuple.src.l3num	= AF_INET,
+		.tuple.src.u.udp.port	= cpu_to_be16(RAS_PORT),
+		.tuple.dst.protonum	= IPPROTO_UDP,
+		.help			= ras_help,
+		.expect_policy		= &ras_exp_policy,
+	},
+	{	/* [1]: AF_INET6, otherwise identical to [0] */
+		.name			= "RAS",
+		.me			= THIS_MODULE,
+		.tuple.src.l3num	= AF_INET6,
+		.tuple.src.u.udp.port	= cpu_to_be16(RAS_PORT),
+		.tuple.dst.protonum	= IPPROTO_UDP,
+		.help			= ras_help,
+		.expect_policy		= &ras_exp_policy,
+	},
+};
+
+/**** Module exit: unregister all helpers (reverse of init), free h323_buffer ****/
+static void __exit nf_conntrack_h323_fini(void)
+{
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[1]);
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
+	kfree(h323_buffer);	/* only after no helper can be invoked any more */
+	pr_debug("nf_ct_h323: fini\n");
+}
+
+/**** Module init: allocate h323_buffer, register the five helpers, unwind on error ****/
+static int __init nf_conntrack_h323_init(void)
+{
+	int err;
+
+	h323_buffer = kmalloc(65536, GFP_KERNEL);
+	if (h323_buffer == NULL)
+		return -ENOMEM;
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_h245);
+	if (err < 0)
+		goto free_buffer;
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]);
+	if (err < 0)
+		goto unreg_h245;
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]);
+	if (err < 0)
+		goto unreg_q931_v4;
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]);
+	if (err < 0)
+		goto unreg_q931_v6;
+	err = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]);
+	if (err < 0)
+		goto unreg_ras_v4;
+	pr_debug("nf_ct_h323: init success\n");
+	return 0;
+
+unreg_ras_v4:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]);
+unreg_q931_v6:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]);
+unreg_q931_v4:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]);
+unreg_h245:
+	nf_conntrack_helper_unregister(&nf_conntrack_helper_h245);
+free_buffer:
+	kfree(h323_buffer);
+	return err;
+}
+
+/**** Module entry/exit points, exported symbols and module metadata ****/
+module_init(nf_conntrack_h323_init);
+module_exit(nf_conntrack_h323_fini);
+/* GPL-only exports: get_h225_addr() plus the hook pointers this file reads */
+EXPORT_SYMBOL_GPL(get_h225_addr);
+EXPORT_SYMBOL_GPL(set_h245_addr_hook);
+EXPORT_SYMBOL_GPL(set_h225_addr_hook);
+EXPORT_SYMBOL_GPL(set_sig_addr_hook);
+EXPORT_SYMBOL_GPL(set_ras_addr_hook);
+EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
+EXPORT_SYMBOL_GPL(nat_t120_hook);
+EXPORT_SYMBOL_GPL(nat_h245_hook);
+EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
+EXPORT_SYMBOL_GPL(nat_q931_hook);
+
+MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
+MODULE_DESCRIPTION("H.323 connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_h323");
+MODULE_ALIAS_NFCT_HELPER("h323");
diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c
new file mode 100644
index 00000000..d880f352
--- /dev/null
+++ b/net/netfilter/nf_conntrack_h323_types.c
@@ -0,0 +1,1922 @@
+/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007
+ *
+ * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ *
+ * This source code is licensed under General Public License version 2.
+ */
+
+static const struct field_t _TransportAddress_ipAddress[] = {	/* SEQUENCE: "ip" is DECODE at offsetof(TransportAddress_ipAddress, ip); "port" is SKIP */
+	{FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE,
+	 offsetof(TransportAddress_ipAddress, ip), NULL},
+	{FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ipSourceRoute_route[] = {	/* SEQUENCE OF: single "item" entry (OCTSTR, SKIP) */
+	{FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ipSourceRoute_routing[] = {	/* CHOICE: "strict" or "loose", both NUL and SKIP */
+	{FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ipSourceRoute[] = {	/* SEQUENCE: four SKIP entries; "route" and "routing" point at their own sub-tables */
+	{FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _TransportAddress_ipSourceRoute_route},
+	{FNAME("routing") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _TransportAddress_ipSourceRoute_routing},
+};
+
+static const struct field_t _TransportAddress_ipxAddress[] = {	/* SEQUENCE: three fixed-size OCTSTR entries, all SKIP */
+	{FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+	{FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress_ip6Address[] = {	/* SEQUENCE: "ip" is DECODE at offsetof(TransportAddress_ip6Address, ip); "port" is SKIP */
+	{FNAME("ip") OCTSTR, FIXD, 16, 0, DECODE,
+	 offsetof(TransportAddress_ip6Address, ip), NULL},
+	{FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H221NonStandard[] = {	/* SEQUENCE: three INT entries, all SKIP */
+	{FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _NonStandardIdentifier[] = {	/* CHOICE: "object" OID or "h221NonStandard" (sub-table _H221NonStandard); both SKIP */
+	{FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0,
+	 _H221NonStandard},
+};
+
+static const struct field_t _NonStandardParameter[] = {	/* SEQUENCE: identifier (sub-table _NonStandardIdentifier) plus "data"; both SKIP */
+	{FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _NonStandardIdentifier},
+	{FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TransportAddress[] = {	/* CHOICE: seven alternatives; ipAddress and ip6Address are DECODE (offsets into TransportAddress), the rest SKIP */
+	{FNAME("ipAddress") SEQ, 0, 2, 2, DECODE,
+	 offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress},
+	{FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0,
+	 _TransportAddress_ipSourceRoute},
+	{FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0,
+	 _TransportAddress_ipxAddress},
+	{FNAME("ip6Address") SEQ, 0, 2, 2, DECODE | EXT,
+	 offsetof(TransportAddress, ip6Address),
+	 _TransportAddress_ip6Address},
+	{FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0,
+	 _NonStandardParameter},
+};
+
+static const struct field_t _AliasAddress[] = {	/* CHOICE: seven alternatives, all SKIP, none with a sub-table (NULL) */
+	{FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("transportID") CHOICE, 3, 7, 7, SKIP | EXT, 0, NULL},
+	{FNAME("email-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("partyNumber") CHOICE, 3, 5, 5, SKIP | EXT, 0, NULL},
+	{FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_sourceAddress[] = {	/* SEQUENCE OF: single SKIP "item" using sub-table _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _VendorIdentifier[] = {	/* SEQUENCE: "vendor" (sub-table _H221NonStandard) plus OPT productId/versionId; all SKIP */
+	{FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard},
+	{FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperInfo[] = {	/* SEQUENCE: single SKIP | OPT entry using sub-table _NonStandardParameter */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+};
+
+static const struct field_t _H310Caps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H320Caps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H321Caps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H322Caps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H323Caps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H324Caps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _VoiceCaps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T120OnlyCaps[] = {	/* SEQUENCE: three SKIP entries; the first two are also OPT */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SupportedProtocols[] = {	/* CHOICE: eleven alternatives, all SKIP; the last two have no sub-table (NULL) */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0,
+	 _NonStandardParameter},
+	{FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps},
+	{FNAME("h320") SEQ, 1, 1, 3, SKIP | EXT, 0, _H320Caps},
+	{FNAME("h321") SEQ, 1, 1, 3, SKIP | EXT, 0, _H321Caps},
+	{FNAME("h322") SEQ, 1, 1, 3, SKIP | EXT, 0, _H322Caps},
+	{FNAME("h323") SEQ, 1, 1, 3, SKIP | EXT, 0, _H323Caps},
+	{FNAME("h324") SEQ, 1, 1, 3, SKIP | EXT, 0, _H324Caps},
+	{FNAME("voice") SEQ, 1, 1, 3, SKIP | EXT, 0, _VoiceCaps},
+	{FNAME("t120-only") SEQ, 1, 1, 3, SKIP | EXT, 0, _T120OnlyCaps},
+	{FNAME("nonStandardProtocol") SEQ, 2, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _GatewayInfo_protocol[] = {	/* SEQUENCE OF: single SKIP "item" using sub-table _SupportedProtocols */
+	{FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols},
+};
+
+static const struct field_t _GatewayInfo[] = {	/* SEQUENCE: "protocol" and "nonStandardData", both SKIP | OPT with sub-tables */
+	{FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _GatewayInfo_protocol},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+};
+
+static const struct field_t _McuInfo[] = {	/* SEQUENCE: two SKIP | OPT entries; "protocol" has no sub-table (NULL) */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _TerminalInfo[] = {	/* SEQUENCE: single SKIP | OPT entry using sub-table _NonStandardParameter */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+};
+
+static const struct field_t _EndpointType[] = {	/* SEQUENCE: ten SKIP entries; every one is OPT except "mc" and "undefinedNode" */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 _VendorIdentifier},
+	{FNAME("gatekeeper") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0,
+	 _GatekeeperInfo},
+	{FNAME("gateway") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, _GatewayInfo},
+	{FNAME("mcu") SEQ, 1, 1, 2, SKIP | EXT | OPT, 0, _McuInfo},
+	{FNAME("terminal") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, _TerminalInfo},
+	{FNAME("mc") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("undefinedNode") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("set") BITSTR, FIXD, 32, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedTunnelledProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT,
+	 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_destinationAddress[] = {	/* SEQUENCE OF: single SKIP "item" using sub-table _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _Setup_UUIE_destExtraCallInfo[] = {	/* SEQUENCE OF: single SKIP "item" using sub-table _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _Setup_UUIE_destExtraCRV[] = {	/* SEQUENCE OF: single "item" entry (INT, SKIP) */
+	{FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_conferenceGoal[] = {	/* CHOICE: five NUL alternatives, all SKIP */
+	{FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("capability-negotiation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callIndependentSupplementaryService") NUL, FIXD, 0, 0, SKIP,
+	 0, NULL},
+};
+
+static const struct field_t _Q954Details[] = {	/* SEQUENCE: two BOOL entries, both SKIP */
+	{FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _QseriesOptions[] = {	/* SEQUENCE: seven BOOL entries plus "q954Info" (sub-table _Q954Details); all SKIP */
+	{FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q953Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q955Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q956Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q957Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details},
+};
+
+static const struct field_t _CallType[] = {	/* CHOICE: four NUL alternatives, all SKIP */
+	{FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_NonStandardIdentifier_h221NonStandard[] = {	/* SEQUENCE: three INT entries, all SKIP */
+	{FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_NonStandardIdentifier[] = {	/* CHOICE: "object" OID or "h221NonStandard" (with sub-table); both SKIP, neither EXT */
+	{FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0,
+	 _H245_NonStandardIdentifier_h221NonStandard},
+};
+
+static const struct field_t _H245_NonStandardParameter[] = {	/* SEQUENCE: identifier (sub-table _H245_NonStandardIdentifier) plus "data"; both SKIP */
+	{FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0,
+	 _H245_NonStandardIdentifier},
+	{FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H261VideoCapability[] = {	/* SEQUENCE: six SKIP entries; qcifMPI and cifMPI are also OPT */
+	{FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+	 NULL},
+	{FNAME("maxBitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("stillImageTransmission") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H262VideoCapability[] = {	/* SEQUENCE: eighteen SKIP entries: eleven profileAndLevel BOOLs, six OPT INTs, then videoBadMBsCap */
+	{FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-MPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-SNRatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-SNRatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-SpatialatH-14") BOOL, FIXD, 0, 0, SKIP, 0,
+	 NULL},
+	{FNAME("profileAndLevel-HPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-HPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("profileAndLevel-HPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("framesPerSecond") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H263VideoCapability[] = {	/* SEQUENCE: twenty-one SKIP entries; the two trailing SEQ entries have no sub-table (NULL) */
+	{FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cif4MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cif16MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("maxBitRate") INT, CONS, 1, 0, SKIP, 0, NULL},
+	{FNAME("unrestrictedVector") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("arithmeticCoding") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("advancedPrediction") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("pbFrames") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0,
+	 NULL},
+	{FNAME("hrd-B") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("bppMaxKb") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowSqcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowQcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowCifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowCif4MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("slowCif16MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("errorCompensation") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("enhancementLayerInfo") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _IS11172VideoCapability[] = {	/* SEQUENCE: eight SKIP entries; the six INT entries are also OPT */
+	{FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("pictureRate") INT, 4, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _VideoCapability[] = {	/* CHOICE: six alternatives, all SKIP; genericVideoCapability has no sub-table (NULL) */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0,
+	 _H261VideoCapability},
+	{FNAME("h262VideoCapability") SEQ, 6, 17, 18, SKIP | EXT, 0,
+	 _H262VideoCapability},
+	{FNAME("h263VideoCapability") SEQ, 7, 13, 21, SKIP | EXT, 0,
+	 _H263VideoCapability},
+	{FNAME("is11172VideoCapability") SEQ, 6, 7, 8, SKIP | EXT, 0,
+	 _IS11172VideoCapability},
+	{FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _AudioCapability_g7231[] = {	/* SEQUENCE: one INT and one BOOL entry, both SKIP */
+	{FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _IS11172AudioCapability[] = {	/* SEQUENCE: eight BOOL entries followed by "bitRate" INT; all SKIP */
+	{FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _IS13818AudioCapability[] = {	/* SEQUENCE: twenty BOOL entries followed by "bitRate" INT; all SKIP */
+	{FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling16k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling22k05") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling24k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("threeChannels2-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("threeChannels3-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fourChannels2-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fourChannels2-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fourChannels3-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fiveChannels3-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fiveChannels3-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("lowFrequencyEnhancement") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("multilingual") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _AudioCapability[] = {	/* CHOICE: twenty-two alternatives, all SKIP; the last six SEQ entries have no sub-table (NULL) */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g711Alaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g711Ulaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g711Ulaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g722-64k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g722-56k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g722-48k") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g7231") SEQ, 0, 2, 2, SKIP, 0, _AudioCapability_g7231},
+	{FNAME("g728") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g729") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g729AnnexA") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("is11172AudioCapability") SEQ, 0, 9, 9, SKIP | EXT, 0,
+	 _IS11172AudioCapability},
+	{FNAME("is13818AudioCapability") SEQ, 0, 21, 21, SKIP | EXT, 0,
+	 _IS13818AudioCapability},
+	{FNAME("g729wAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g729AnnexAwAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL},
+	{FNAME("g7231AnnexCCapability") SEQ, 1, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("gsmFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("gsmHalfRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("gsmEnhancedFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("genericAudioCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+	{FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _DataProtocolCapability[] = {	/* CHOICE: fourteen alternatives, all SKIP; only "nonStandard" has a sub-table */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("v42lapm") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdlcFrameTunnelling") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h310SeparateVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h310SingleVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("transparent") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("segmentationAndReassembly") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdlcFrameTunnelingwSAR") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("v120") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("separateLANStack") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("v76wCompression") CHOICE, 2, 3, 3, SKIP | EXT, 0, NULL},
+	{FNAME("tcp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T84Profile_t84Restricted[] = {	/* SEQUENCE: nineteen BOOL entries, all SKIP */
+	{FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("ccir601Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdtvSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("hdtvProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g3FacsMH200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g3FacsMH200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g4FacsMMR200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("g4FacsMMR200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig200x200Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig200x200Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig300x300Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("jbig300x300Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoLow") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoMedSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoMedProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoHighSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _T84Profile[] = {	/* CHOICE: "t84Unrestricted" NUL or "t84Restricted" (sub-table _T84Profile_t84Restricted); both SKIP */
+	{FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0,
+	 _T84Profile_t84Restricted},
+};
+
+static const struct field_t _DataApplicationCapability_application_t84[] = {	/* SEQUENCE: protocol (_DataProtocolCapability) and profile (_T84Profile); both SKIP */
+	{FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile},
+};
+
+static const struct field_t _DataApplicationCapability_application_nlpid[] = {	/* SEQUENCE: protocol (_DataProtocolCapability) and "nlpidData" OCTSTR; both SKIP */
+	{FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _DataApplicationCapability_application[] = {	/* CHOICE: fourteen alternatives; only "t120" is DECODE (offset of its t120 member), the rest SKIP */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT,
+	 offsetof(DataApplicationCapability_application, t120),
+	 _DataProtocolCapability},
+	{FNAME("dsm-cc") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("userData") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("t84") SEQ, 0, 2, 2, SKIP, 0,
+	 _DataApplicationCapability_application_t84},
+	{FNAME("t434") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("h224") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("nlpid") SEQ, 0, 2, 2, SKIP, 0,
+	 _DataApplicationCapability_application_nlpid},
+	{FNAME("dsvdControl") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h222DataPartitioning") CHOICE, 3, 7, 14, SKIP | EXT, 0,
+	 _DataProtocolCapability},
+	{FNAME("t30fax") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+	{FNAME("t140") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL},
+	{FNAME("t38fax") SEQ, 0, 2, 2, SKIP, 0, NULL},
+	{FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _DataApplicationCapability[] = {	/* SEQUENCE: "application" is DECODE at offsetof(DataApplicationCapability, application); "maxBitRate" is SKIP */
+	{FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT,
+	 offsetof(DataApplicationCapability, application),
+	 _DataApplicationCapability_application},
+	{FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _EncryptionMode[] = {	/* CHOICE: "nonStandard" (sub-table _H245_NonStandardParameter) or "h233Encryption" NUL; both SKIP */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _DataType[] = {	/* CHOICE: nine alternatives; only "data" is DECODE at offsetof(DataType, data), the rest SKIP */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("videoData") CHOICE, 3, 5, 6, SKIP | EXT, 0, _VideoCapability},
+	{FNAME("audioData") CHOICE, 4, 14, 22, SKIP | EXT, 0,
+	 _AudioCapability},
+	{FNAME("data") SEQ, 0, 2, 2, DECODE | EXT, offsetof(DataType, data),
+	 _DataApplicationCapability},
+	{FNAME("encryptionData") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _EncryptionMode},
+	{FNAME("h235Control") SEQ, 0, 2, 2, SKIP, 0, NULL},
+	{FNAME("h235Media") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+	{FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _H222LogicalChannelParameters[] = {	/* SEQUENCE: five SKIP entries; the last three are also OPT */
+	{FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("programDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = {	/* SEQUENCE: two INT entries, both SKIP */
+	{FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL},
+	{FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters_adaptationLayerType[] = {	/* CHOICE: nine alternatives, all SKIP; al1M, al2M and al3M have no sub-table (NULL) */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0,
+	 _H245_NonStandardParameter},
+	{FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al1NotFramed") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al2WithoutSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al2WithSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("al3") SEQ, 0, 2, 2, SKIP, 0,
+	 _H223LogicalChannelParameters_adaptationLayerType_al3},
+	{FNAME("al1M") SEQ, 0, 7, 8, SKIP | EXT, 0, NULL},
+	{FNAME("al2M") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL},
+	{FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _H223LogicalChannelParameters[] = {	/* SEQUENCE: "adaptationLayerType" (with sub-table) and "segmentableFlag" BOOL; both SKIP */
+	{FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0,
+	 _H223LogicalChannelParameters_adaptationLayerType},
+	{FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CRCLength[] = {	/* CHOICE: three NUL alternatives, all SKIP */
+	{FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76HDLCParameters[] = {	/* SEQUENCE: "crcLength" (sub-table _CRCLength), "n401" INT and one BOOL; all SKIP */
+	{FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength},
+	{FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_suspendResume[] = {	/* CHOICE: three NUL alternatives, all SKIP */
+	{FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = {	/* CHOICE: three NUL alternatives, all SKIP */
+	{FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode_eRM[] = {	/* SEQUENCE: "windowSize" INT and "recovery" (with sub-table); both SKIP */
+	{FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_mode_eRM_recovery},
+};
+
+static const struct field_t _V76LogicalChannelParameters_mode[] = {	/* CHOICE: "eRM" (with sub-table) or "uNERM" NUL; both SKIP */
+	{FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_mode_eRM},
+	{FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V75Parameters[] = {	/* SEQUENCE: single BOOL entry, SKIP */
+	{FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _V76LogicalChannelParameters[] = {	/* SEQUENCE: five SKIP entries; every one except the "uIH" BOOL has a sub-table */
+	{FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0,
+	 _V76HDLCParameters},
+	{FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_suspendResume},
+	{FNAME("uIH") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("mode") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters_mode},
+	{FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters},
+};
+
+static const struct field_t _H2250LogicalChannelParameters_nonStandard[] = {	/* SEQUENCE OF: single SKIP "item" using sub-table _H245_NonStandardParameter */
+	{FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static const struct field_t _UnicastAddress_iPAddress[] = {	/* SEQUENCE: "network" is DECODE at offsetof(UnicastAddress_iPAddress, network); "tsapIdentifier" is SKIP */
+	{FNAME("network") OCTSTR, FIXD, 4, 0, DECODE,
+	 offsetof(UnicastAddress_iPAddress, network), NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPXAddress[] = {	/* SEQUENCE: three fixed-size OCTSTR entries, all SKIP */
+	{FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL},
+	{FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iP6Address[] = {	/* SEQUENCE: "network" is DECODE at offsetof(UnicastAddress_iP6Address, network); "tsapIdentifier" is SKIP */
+	{FNAME("network") OCTSTR, FIXD, 16, 0, DECODE,
+	 offsetof(UnicastAddress_iP6Address, network), NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress_routing[] = {	/* CHOICE: "strict" or "loose", both NUL and SKIP */
+	{FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress_route[] = {	/* SEQUENCE OF: single "item" entry (OCTSTR, SKIP) */
+	{FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _UnicastAddress_iPSourceRouteAddress[] = {	/* SEQUENCE: four SKIP entries; "routing" and "route" point at their own sub-tables */
+	{FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0,
+	 _UnicastAddress_iPSourceRouteAddress_routing},
+	{FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _UnicastAddress_iPSourceRouteAddress_route},
+};
+
+static const struct field_t _UnicastAddress[] = {	/* CHOICE: seven alternatives; iPAddress and iP6Address are DECODE (offsets into UnicastAddress), the rest SKIP */
+	{FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT,
+	 offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress},
+	{FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0,
+	 _UnicastAddress_iPXAddress},
+	{FNAME("iP6Address") SEQ, 0, 2, 2, DECODE | EXT,
+	 offsetof(UnicastAddress, iP6Address), _UnicastAddress_iP6Address},
+	{FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0,
+	 _UnicastAddress_iPSourceRouteAddress},
+	{FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress_iPAddress[] = {	/* SEQUENCE: "network" and "tsapIdentifier", both SKIP */
+	{FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress_iP6Address[] = {	/* SEQUENCE: "network" and "tsapIdentifier", both SKIP */
+	{FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _MulticastAddress[] = {	/* CHOICE: four alternatives, all SKIP (no DECODE entry in this table) */
+	{FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _MulticastAddress_iPAddress},
+	{FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _MulticastAddress_iP6Address},
+	{FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL},
+};
+
+static const struct field_t _H245_TransportAddress[] = {	/* CHOICE: "unicastAddress" is DECODE at offsetof(H245_TransportAddress, unicastAddress); "multicastAddress" is SKIP */
+	{FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT,
+	 offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress},
+	{FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0,
+	 _MulticastAddress},
+};
+
+static const struct field_t _H2250LogicalChannelParameters[] = {	/* SEQUENCE: fourteen entries; mediaChannel and mediaControlChannel are DECODE, mediaControlGuaranteedDelivery through redundancyEncoding are STOP, the rest SKIP */
+	{FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _H2250LogicalChannelParameters_nonStandard},
+	{FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("associatedSessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelParameters, mediaChannel),
+	 _H245_TransportAddress},
+	{FNAME("mediaGuaranteedDelivery") BOOL, FIXD, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelParameters, mediaControlChannel),
+	 _H245_TransportAddress},
+	{FNAME("mediaControlGuaranteedDelivery") BOOL, FIXD, 0, 0, STOP | OPT,
+	 0, NULL},
+	{FNAME("silenceSuppression") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destination") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, STOP | OPT, 0, NULL},
+	{FNAME("mediaPacketization") CHOICE, 0, 1, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("transportCapability") SEQ, 3, 3, 3, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("redundancyEncoding") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = {	/* CHOICE: only h2250LogicalChannelParameters is DECODE (sub-table _H2250LogicalChannelParameters); the other four alternatives are SKIP */
+	{FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+	 _H222LogicalChannelParameters},
+	{FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _H223LogicalChannelParameters},
+	{FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters},
+	{FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+	 offsetof
+	 (OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters,
+	  h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+	{FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = {	/* SEQUENCE: dataType and multiplexParameters are DECODE; portNumber, forwardLogicalChannelDependency and replacementFor are SKIP | OPT */
+	{FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT,
+	 offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+		  dataType), _DataType},
+	{FNAME("multiplexParameters") CHOICE, 2, 3, 5, DECODE | EXT,
+	 offsetof(OpenLogicalChannel_forwardLogicalChannelParameters,
+		  multiplexParameters),
+	 _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters},
+	{FNAME("forwardLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+	 0, NULL},
+	{FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = {	/* CHOICE: only h2250LogicalChannelParameters is DECODE; the h223 and v76 alternatives are SKIP */
+	{FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0,
+	 _H223LogicalChannelParameters},
+	{FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0,
+	 _V76LogicalChannelParameters},
+	{FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+	 offsetof
+	 (OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters,
+	  h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = {	/* SEQUENCE: only the optional multiplexParameters is DECODE; dataType (via _DataType) and the two trailing INT entries are SKIP */
+	{FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType},
+	{FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannel_reverseLogicalChannelParameters,
+		  multiplexParameters),
+	 _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters},
+	{FNAME("reverseLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT,
+	 0, NULL},
+	{FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _NetworkAccessParameters_distribution[] = {	/* CHOICE: unicast | multicast; both are NUL entries flagged SKIP */
+	{FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Q2931Address_address[] = {	/* CHOICE: internationalNumber (NUMSTR) | nsapAddress (OCTSTR); both flagged SKIP */
+	{FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL},
+	{FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Q2931Address[] = {	/* SEQUENCE: address (CHOICE via _Q2931Address_address) + optional subaddress; both flagged SKIP */
+	{FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0,
+	 _Q2931Address_address},
+	{FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _NetworkAccessParameters_networkAddress[] = {	/* CHOICE: only localAreaAddress is DECODE (via _H245_TransportAddress); q2931Address and e164Address are SKIP */
+	{FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address},
+	{FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT,
+	 offsetof(NetworkAccessParameters_networkAddress, localAreaAddress),
+	 _H245_TransportAddress},
+};
+
+static const struct field_t _NetworkAccessParameters[] = {	/* SEQUENCE: only networkAddress is DECODE (offsetof into NetworkAccessParameters); every other entry is SKIP */
+	{FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0,
+	 _NetworkAccessParameters_distribution},
+	{FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT,
+	 offsetof(NetworkAccessParameters, networkAddress),
+	 _NetworkAccessParameters_networkAddress},
+	{FNAME("associateConference") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("externalReference") OCTSTR, 8, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("t120SetupProcedure") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+};
+
+static const struct field_t _OpenLogicalChannel[] = {	/* SEQUENCE: forward/reverse LogicalChannelParameters and separateStack are DECODE; forwardLogicalChannelNumber is SKIP and encryptionSync is STOP */
+	{FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT,
+	 offsetof(OpenLogicalChannel, forwardLogicalChannelParameters),
+	 _OpenLogicalChannel_forwardLogicalChannelParameters},
+	{FNAME("reverseLogicalChannelParameters") SEQ, 1, 2, 4,
+	 DECODE | EXT | OPT, offsetof(OpenLogicalChannel,
+				      reverseLogicalChannelParameters),
+	 _OpenLogicalChannel_reverseLogicalChannelParameters},
+	{FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannel, separateStack),
+	 _NetworkAccessParameters},
+	{FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Setup_UUIE_fastStart[] = {	/* SEQUENCE OF: single item entry, DECODE | OPEN | EXT, sub-table _OpenLogicalChannel; passes sizeof(OpenLogicalChannel) where other tables pass an offsetof() */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static const struct field_t _Setup_UUIE[] = {	/* SEQUENCE: DECODE entries are h245Address, destCallSignalAddress, sourceCallSignalAddress (all via _TransportAddress) and fastStart; endpointIdentifier is STOP; every other entry is SKIP */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Setup_UUIE, h245Address), _TransportAddress},
+	{FNAME("sourceAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_sourceAddress},
+	{FNAME("sourceInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+	{FNAME("destinationAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_destinationAddress},
+	{FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Setup_UUIE, destCallSignalAddress), _TransportAddress},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_destExtraCallInfo},
+	{FNAME("destExtraCRV") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Setup_UUIE_destExtraCRV},
+	{FNAME("activeMC") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("conferenceGoal") CHOICE, 2, 3, 5, SKIP | EXT, 0,
+	 _Setup_UUIE_conferenceGoal},
+	{FNAME("callServices") SEQ, 0, 8, 8, SKIP | EXT | OPT, 0,
+	 _QseriesOptions},
+	{FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+	{FNAME("sourceCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Setup_UUIE, sourceCallSignalAddress), _TransportAddress},
+	{FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityCapability") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Setup_UUIE, fastStart), _Setup_UUIE_fastStart},
+	{FNAME("mediaWaitForConnect") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("canOverlapSend") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("connectionParameters") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("symmetricOperationRequired") NUL, FIXD, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("neededFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("desiredFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("supportedFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("parallelH245Control") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("additionalSourceAddresses") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 NULL},
+};
+
+static const struct field_t _CallProceeding_UUIE_fastStart[] = {	/* SEQUENCE OF: single item entry, DECODE | OPEN | EXT, sub-table _OpenLogicalChannel; passes sizeof(OpenLogicalChannel) where other tables pass an offsetof() */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static const struct field_t _CallProceeding_UUIE[] = {	/* SEQUENCE: only h245Address (via _TransportAddress) and fastStart are DECODE; every other entry is SKIP */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(CallProceeding_UUIE, h245Address), _TransportAddress},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(CallProceeding_UUIE, fastStart),
+	 _CallProceeding_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Connect_UUIE_fastStart[] = {	/* SEQUENCE OF: single item entry, DECODE | OPEN | EXT, sub-table _OpenLogicalChannel; passes sizeof(OpenLogicalChannel) where other tables pass an offsetof() */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static const struct field_t _Connect_UUIE[] = {	/* SEQUENCE: only h245Address (via _TransportAddress) and fastStart are DECODE; every other entry is SKIP */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Connect_UUIE, h245Address), _TransportAddress},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Connect_UUIE, fastStart), _Connect_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("connectedAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Alerting_UUIE_fastStart[] = {	/* SEQUENCE OF: single item entry, DECODE | OPEN | EXT, sub-table _OpenLogicalChannel; passes sizeof(OpenLogicalChannel) where other tables pass an offsetof() */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static const struct field_t _Alerting_UUIE[] = {	/* SEQUENCE: only h245Address (via _TransportAddress) and fastStart are DECODE; every other entry is SKIP */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Alerting_UUIE, h245Address), _TransportAddress},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Alerting_UUIE, fastStart), _Alerting_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("alertingAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Information_UUIE[] = {	/* SEQUENCE: every entry is SKIP with a NULL sub-table, fastStart included */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, SKIP | OPT, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _ReleaseCompleteReason[] = {	/* CHOICE: 22 alternatives, all SKIP; all are NUL except nonStandardReason (SEQ) and replaceWithConferenceInvite (OCTSTR) */
+	{FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationRejection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("invalidRevision") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("noPermission") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("unreachableGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("gatewayResources") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("badFormatAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("adaptiveBusy") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("inConf") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("facilityCallDeflection") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("securityDenied") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("calledPartyNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callerNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("newConnectionNeeded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardReason") SEQ, 0, 2, 2, SKIP, 0, NULL},
+	{FNAME("replaceWithConferenceInvite") OCTSTR, FIXD, 16, 0, SKIP, 0,
+	 NULL},
+	{FNAME("genericDataReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("neededFeatureNotSupported") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _ReleaseComplete_UUIE[] = {	/* SEQUENCE: every entry is SKIP; reason is the only one with a sub-table (_ReleaseCompleteReason) */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0,
+	 _ReleaseCompleteReason},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("busyAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _Facility_UUIE_alternativeAliasAddress[] = {	/* SEQUENCE OF: single item entry, a SKIP CHOICE described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _FacilityReason[] = {	/* CHOICE: 11 alternatives, every one a NUL entry flagged SKIP */
+	{FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("conferenceListChoice") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("startH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("noH245") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("newTokens") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("featureSetUpdate") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("forwardedElements") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _Facility_UUIE_fastStart[] = {	/* SEQUENCE OF: single item entry, DECODE | OPEN | EXT, sub-table _OpenLogicalChannel; passes sizeof(OpenLogicalChannel) where other tables pass an offsetof() */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static const struct field_t _Facility_UUIE[] = {	/* SEQUENCE: DECODE entries are alternativeAddress, reason, h245Address and fastStart; every other entry is SKIP */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Facility_UUIE, alternativeAddress), _TransportAddress},
+	{FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Facility_UUIE_alternativeAliasAddress},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+	{FNAME("reason") CHOICE, 2, 4, 11, DECODE | EXT,
+	 offsetof(Facility_UUIE, reason), _FacilityReason},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("conferences") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Facility_UUIE, h245Address), _TransportAddress},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Facility_UUIE, fastStart), _Facility_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 NULL},
+};
+
+static const struct field_t _CallIdentifier[] = {	/* SEQUENCE: single guid entry (OCTSTR, FIXD), flagged SKIP */
+	{FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SecurityServiceMode[] = {	/* CHOICE: nonStandard (SEQ via _NonStandardParameter) | none | default; all flagged SKIP */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+	{FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _SecurityCapabilities[] = {	/* SEQUENCE: all entries SKIP; the three CHOICE entries share _SecurityServiceMode. NOTE(review): the "authenticaton" spelling presumably mirrors the ASN.1 source - confirm before changing */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _SecurityServiceMode},
+	{FNAME("authenticaton") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _SecurityServiceMode},
+	{FNAME("integrity") CHOICE, 2, 3, 3, SKIP | EXT, 0,
+	 _SecurityServiceMode},
+};
+
+static const struct field_t _H245Security[] = {	/* CHOICE: nonStandard | noSecurity | tls | ipsec, all SKIP; tls and ipsec share the _SecurityCapabilities sub-table */
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter},
+	{FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+	{FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities},
+};
+
+static const struct field_t _DHset[] = {	/* SEQUENCE: halfkey, modSize, generator; three BITSTR entries, all flagged SKIP */
+	{FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+	{FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _TypedCertificate[] = {	/* SEQUENCE: type (OID) + certificate (OCTSTR); both flagged SKIP */
+	{FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _H235_NonStandardParameter[] = {	/* SEQUENCE: nonStandardIdentifier (OID) + data (OCTSTR); both flagged SKIP */
+	{FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _ClearToken[] = {	/* SEQUENCE: every entry is SKIP; sub-tables used are _DHset (dhkey), _TypedCertificate (certificate) and _H235_NonStandardParameter (nonStandard); eckasdhkey has a NULL sub-table */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("timeStamp") INT, CONS, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("password") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("dhkey") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, _DHset},
+	{FNAME("challenge") OCTSTR, 7, 8, 0, SKIP | OPT, 0, NULL},
+	{FNAME("random") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("certificate") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0,
+	 _TypedCertificate},
+	{FNAME("generalID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _H235_NonStandardParameter},
+	{FNAME("eckasdhkey") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, NULL},
+	{FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _Progress_UUIE_tokens[] = {	/* SEQUENCE OF: single item entry, a SKIP SEQ described by _ClearToken */
+	{FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+};
+
+static const struct field_t _Params[] = {	/* SEQUENCE: ranInt (INT), iv8 and iv16 (fixed OCTSTR); all three are SKIP | OPT */
+	{FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL},
+	{FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdHash_token[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), hash (BITSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdHash[] = {	/* SEQUENCE: alias (via _AliasAddress), timeStamp, token (via _CryptoH323Token_cryptoEPPwdHash_token); all flagged SKIP */
+	{FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+	{FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoEPPwdHash_token},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdHash_token[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), hash (BITSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdHash[] = {	/* SEQUENCE: gatekeeperId (BMPSTR), timeStamp, token (via _CryptoH323Token_cryptoGKPwdHash_token); all flagged SKIP */
+	{FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoGKPwdHash_token},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPPwdEncr[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), encryptedData (OCTSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKPwdEncr[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), encryptedData (OCTSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoEPCert[] = {	/* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, NULL sub-table), algorithmOID, paramS (via _Params), signature (BITSTR); all SKIP */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoGKCert[] = {	/* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, NULL sub-table), algorithmOID, paramS (via _Params), signature (BITSTR); all SKIP */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoH323Token_cryptoFastStart[] = {	/* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, NULL sub-table), algorithmOID, paramS (via _Params), signature (BITSTR); all SKIP */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoEncryptedToken_token[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), encryptedData (OCTSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoEncryptedToken[] = {	/* SEQUENCE: tokenOID + token (via _CryptoToken_cryptoEncryptedToken_token); both flagged SKIP */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoEncryptedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoSignedToken_token[] = {	/* SEQUENCE: toBeSigned (SKIP | OPEN | EXT, NULL sub-table), algorithmOID, paramS (via _Params), signature (BITSTR); all SKIP */
+	{FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoSignedToken[] = {	/* SEQUENCE: tokenOID + token (via _CryptoToken_cryptoSignedToken_token); both flagged SKIP */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("token") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoToken_cryptoSignedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoHashedToken_token[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), hash (BITSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken_cryptoHashedToken[] = {	/* SEQUENCE: tokenOID, hashedVals (via _ClearToken), token (via _CryptoToken_cryptoHashedToken_token); all flagged SKIP */
+	{FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken},
+	{FNAME("token") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoHashedToken_token},
+};
+
+static const struct field_t _CryptoToken_cryptoPwdEncr[] = {	/* SEQUENCE: algorithmOID, paramS (via _Params), encryptedData (OCTSTR); all flagged SKIP */
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params},
+	{FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _CryptoToken[] = {	/* CHOICE: four SEQ alternatives (encrypted, signed, hashed, pwdEncr), each SKIP with its own _CryptoToken_* sub-table */
+	{FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0,
+	 _CryptoToken_cryptoEncryptedToken},
+	{FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0,
+	 _CryptoToken_cryptoSignedToken},
+	{FNAME("cryptoHashedToken") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoHashedToken},
+	{FNAME("cryptoPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoToken_cryptoPwdEncr},
+};
+
+static const struct field_t _CryptoH323Token[] = {	/* CHOICE: eight alternatives, all SKIP; seven SEQ entries with _CryptoH323Token_* sub-tables plus nestedcryptoToken (CHOICE via _CryptoToken) */
+	{FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoEPPwdHash},
+	{FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoGKPwdHash},
+	{FNAME("cryptoEPPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoEPPwdEncr},
+	{FNAME("cryptoGKPwdEncr") SEQ, 0, 3, 3, SKIP, 0,
+	 _CryptoH323Token_cryptoGKPwdEncr},
+	{FNAME("cryptoEPCert") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoH323Token_cryptoEPCert},
+	{FNAME("cryptoGKCert") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoH323Token_cryptoGKCert},
+	{FNAME("cryptoFastStart") SEQ, 0, 4, 4, SKIP, 0,
+	 _CryptoH323Token_cryptoFastStart},
+	{FNAME("nestedcryptoToken") CHOICE, 2, 4, 4, SKIP | EXT, 0,
+	 _CryptoToken},
+};
+
+static const struct field_t _Progress_UUIE_cryptoTokens[] = {	/* SEQUENCE OF: single item entry, a SKIP CHOICE described by _CryptoH323Token */
+	{FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token},
+};
+
+static const struct field_t _Progress_UUIE_fastStart[] = {	/* SEQUENCE OF: single item entry, DECODE | OPEN | EXT, sub-table _OpenLogicalChannel; passes sizeof(OpenLogicalChannel) where other tables pass an offsetof() */
+	{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT,
+	 sizeof(OpenLogicalChannel), _OpenLogicalChannel}
+	,
+};
+
+static const struct field_t _Progress_UUIE[] = {	/* SEQUENCE: only h245Address and fastStart are DECODE; the rest are SKIP, though callIdentifier, h245SecurityMode, tokens and cryptoTokens still name sub-tables */
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0,
+	 _EndpointType},
+	{FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(Progress_UUIE, h245Address), _TransportAddress},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0,
+	 _CallIdentifier},
+	{FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0,
+	 _H245Security},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Progress_UUIE_tokens},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _Progress_UUIE_cryptoTokens},
+	{FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT,
+	 offsetof(Progress_UUIE, fastStart), _Progress_UUIE_fastStart},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H323_UU_PDU_h323_message_body[] = {	/* CHOICE: 13 alternatives; setup, callProceeding, connect, alerting, facility and progress are DECODE (offsetof into H323_UU_PDU_h323_message_body); the other seven are SKIP */
+	{FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE},
+	{FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, callProceeding),
+	 _CallProceeding_UUIE},
+	{FNAME("connect") SEQ, 1, 4, 19, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE},
+	{FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE},
+	{FNAME("information") SEQ, 0, 1, 7, SKIP | EXT, 0, _Information_UUIE},
+	{FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0,
+	 _ReleaseComplete_UUIE},
+	{FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, facility), _Facility_UUIE},
+	{FNAME("progress") SEQ, 5, 8, 11, DECODE | EXT,
+	 offsetof(H323_UU_PDU_h323_message_body, progress), _Progress_UUIE},
+	{FNAME("empty") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("status") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+	{FNAME("statusInquiry") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+	{FNAME("setupAcknowledge") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+	{FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL},
+};
+
+static const struct field_t _RequestMessage[] = {	/* CHOICE: only openLogicalChannel is DECODE (via _OpenLogicalChannel); every other alternative is STOP with a NULL sub-table */
+	{FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("openLogicalChannel") SEQ, 1, 3, 5, DECODE | EXT,
+	 offsetof(RequestMessage, openLogicalChannel), _OpenLogicalChannel},
+	{FNAME("closeLogicalChannel") SEQ, 0, 2, 3, STOP | EXT, 0, NULL},
+	{FNAME("requestChannelClose") SEQ, 0, 1, 3, STOP | EXT, 0, NULL},
+	{FNAME("multiplexEntrySend") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("requestMultiplexEntry") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestMode") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("roundTripDelayRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("maintenanceLoopRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("communicationModeRequest") SEQ, 0, 0, 0, STOP | EXT, 0, NULL},
+	{FNAME("conferenceRequest") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+	{FNAME("multilinkRequest") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("logicalChannelRateRequest") SEQ, 0, 3, 3, STOP | EXT, 0,
+	 NULL},
+};
+
+static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = {	/* CHOICE: h222LogicalChannelParameters is SKIP; h2250LogicalChannelParameters is DECODE (via _H2250LogicalChannelParameters) */
+	{FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0,
+	 _H222LogicalChannelParameters},
+	{FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT,
+	 offsetof
+	 (OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters,
+	  h2250LogicalChannelParameters), _H2250LogicalChannelParameters},
+};
+
+static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = {	/* SEQUENCE: only the optional multiplexParameters is DECODE; the three INT entries are SKIP */
+	{FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannelAck_reverseLogicalChannelParameters,
+		  multiplexParameters),
+	 _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters},
+	{FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _H2250LogicalChannelAckParameters_nonStandard[] = {	/* SEQUENCE OF; single SKIP item described by _H245_NonStandardParameter */
+	{FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter},
+};
+
+static const struct field_t _H2250LogicalChannelAckParameters[] = {	/* SEQUENCE; mediaChannel and mediaControlChannel are DECODE, the other members are SKIP */
+	{FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _H2250LogicalChannelAckParameters_nonStandard},
+	{FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelAckParameters, mediaChannel),
+	 _H245_TransportAddress},
+	{FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT,
+	 offsetof(H2250LogicalChannelAckParameters, mediaControlChannel),
+	 _H245_TransportAddress},
+	{FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, SKIP | OPT, 0, NULL},
+	{FNAME("flowControlToZero") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL},
+};
+
+static const struct field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = {	/* CHOICE; a single DECODE alternative */
+	{FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT,
+	 offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters,
+		  h2250LogicalChannelAckParameters),
+	 _H2250LogicalChannelAckParameters},
+};
+
+static const struct field_t _OpenLogicalChannelAck[] = {	/* SEQUENCE; first member is SKIP, the three after it are DECODE, encryptionSync is STOP */
+	{FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4,
+	 DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+				      reverseLogicalChannelParameters),
+	 _OpenLogicalChannelAck_reverseLogicalChannelParameters},
+	{FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT,
+	 offsetof(OpenLogicalChannelAck, separateStack),
+	 _NetworkAccessParameters},
+	{FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1,
+	 DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck,
+				      forwardMultiplexAckParameters),
+	 _OpenLogicalChannelAck_forwardMultiplexAckParameters},
+	{FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _ResponseMessage[] = {	/* CHOICE; only openLogicalChannelAck is DECODE, every other alternative is STOP */
+	{FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0,
+	 NULL},
+	{FNAME("masterSlaveDeterminationReject") SEQ, 0, 1, 1, STOP | EXT, 0,
+	 NULL},
+	{FNAME("terminalCapabilitySetAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("terminalCapabilitySetReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+	 NULL},
+	{FNAME("openLogicalChannelAck") SEQ, 1, 2, 5, DECODE | EXT,
+	 offsetof(ResponseMessage, openLogicalChannelAck),
+	 _OpenLogicalChannelAck},
+	{FNAME("openLogicalChannelReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("closeLogicalChannelAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestChannelCloseAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestChannelCloseReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+	 NULL},
+	{FNAME("multiplexEntrySendAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("multiplexEntrySendReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("requestMultiplexEntryAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("requestMultiplexEntryReject") SEQ, 0, 2, 2, STOP | EXT, 0,
+	 NULL},
+	{FNAME("requestModeAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("requestModeReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("roundTripDelayResponse") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("maintenanceLoopAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("maintenanceLoopReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL},
+	{FNAME("communicationModeResponse") CHOICE, 0, 1, 1, STOP | EXT, 0,
+	 NULL},
+	{FNAME("conferenceResponse") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL},
+	{FNAME("multilinkResponse") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("logicalChannelRateAcknowledge") SEQ, 0, 3, 3, STOP | EXT, 0,
+	 NULL},
+	{FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL},
+};
+
+static const struct field_t _MultimediaSystemControlMessage[] = {	/* CHOICE; request and response are DECODE, command and indication are STOP */
+	{FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT,
+	 offsetof(MultimediaSystemControlMessage, request), _RequestMessage},
+	{FNAME("response") CHOICE, 5, 19, 24, DECODE | EXT,
+	 offsetof(MultimediaSystemControlMessage, response),
+	 _ResponseMessage},
+	{FNAME("command") CHOICE, 3, 7, 12, STOP | EXT, 0, NULL},
+	{FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL},
+};
+
+static const struct field_t _H323_UU_PDU_h245Control[] = {	/* SEQUENCE OF; item is DECODE | OPEN and passes sizeof() where other entries pass offsetof() - presumably the element size, confirm against the decoder */
+	{FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT,
+	 sizeof(MultimediaSystemControlMessage),
+	 _MultimediaSystemControlMessage}
+	,
+};
+
+static const struct field_t _H323_UU_PDU[] = {	/* SEQUENCE; h323-message-body and h245Control are DECODE, every entry after h245Control is STOP */
+	{FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT,
+	 offsetof(H323_UU_PDU, h323_message_body),
+	 _H323_UU_PDU_h323_message_body},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("h4501SupplementaryService") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 NULL},
+	{FNAME("h245Tunneling") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("h245Control") SEQOF, SEMI, 0, 4, DECODE | OPT,
+	 offsetof(H323_UU_PDU, h245Control), _H323_UU_PDU_h245Control},
+	{FNAME("nonStandardControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("tunnelledSignallingMessage") SEQ, 2, 4, 4, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("provisionalRespToH245Tunneling") NUL, FIXD, 0, 0, STOP | OPT,
+	 0, NULL},
+	{FNAME("stimulusControl") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _H323_UserInformation[] = {	/* SEQUENCE; h323-uu-pdu is DECODE, user-data is STOP */
+	{FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT,
+	 offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU},
+	{FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperRequest[] = {	/* SEQUENCE; only rasAddress is DECODE, every entry after it is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(GatekeeperRequest, rasAddress), _TransportAddress},
+	{FNAME("endpointType") SEQ, 6, 8, 10, STOP | EXT, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+	{FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("authenticationCapability") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("algorithmOIDs") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _GatekeeperConfirm[] = {	/* SEQUENCE; only rasAddress is DECODE, every entry after it is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(GatekeeperConfirm, rasAddress), _TransportAddress},
+	{FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("authenticationMode") CHOICE, 3, 7, 8, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("algorithmOID") OID, BYTE, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RegistrationRequest_callSignalAddress[] = {	/* SEQUENCE OF; DECODE item that passes sizeof(TransportAddress) where other entries pass offsetof() */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static const struct field_t _RegistrationRequest_rasAddress[] = {	/* SEQUENCE OF; DECODE item that passes sizeof(TransportAddress) where other entries pass offsetof() */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static const struct field_t _RegistrationRequest_terminalAlias[] = {	/* SEQUENCE OF; single SKIP item described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _RegistrationRequest[] = {	/* SEQUENCE; callSignalAddress, rasAddress and timeToLive are DECODE, every entry after timeToLive is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("discoveryComplete") BOOL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(RegistrationRequest, callSignalAddress),
+	 _RegistrationRequest_callSignalAddress},
+	{FNAME("rasAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(RegistrationRequest, rasAddress),
+	 _RegistrationRequest_rasAddress},
+	{FNAME("terminalType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+	{FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _RegistrationRequest_terminalAlias},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("endpointVendor") SEQ, 2, 3, 3, SKIP | EXT, 0,
+	 _VendorIdentifier},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+	 offsetof(RegistrationRequest, timeToLive), NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("keepAlive") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("additiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("usageReportingCapability") SEQ, 3, 4, 4, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("supportedH248Packages") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("callCreditCapability") SEQ, 2, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("capacityReportingCapability") SEQ, 0, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RegistrationConfirm_callSignalAddress[] = {	/* SEQUENCE OF; DECODE item that passes sizeof(TransportAddress) where other entries pass offsetof() */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static const struct field_t _RegistrationConfirm_terminalAlias[] = {	/* SEQUENCE OF; single SKIP item described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _RegistrationConfirm[] = {	/* SEQUENCE; callSignalAddress and timeToLive are DECODE, every entry after timeToLive is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(RegistrationConfirm, callSignalAddress),
+	 _RegistrationConfirm_callSignalAddress},
+	{FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _RegistrationConfirm_terminalAlias},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL},
+	{FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT,
+	 offsetof(RegistrationConfirm, timeToLive), NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("preGrantedARQ") SEQ, 0, 4, 8, STOP | EXT | OPT, 0, NULL},
+	{FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("supportsAdditiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureServerAlias") CHOICE, 1, 2, 7, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("capacityReportingSpec") SEQ, 0, 1, 1, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _UnregistrationRequest_callSignalAddress[] = {	/* SEQUENCE OF; DECODE item that passes sizeof(TransportAddress) where other entries pass offsetof() */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static const struct field_t _UnregistrationRequest[] = {	/* SEQUENCE; only callSignalAddress is DECODE, every entry after it is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(UnregistrationRequest, callSignalAddress),
+	 _UnregistrationRequest_callSignalAddress},
+	{FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("reason") CHOICE, 2, 4, 5, STOP | EXT | OPT, 0, NULL},
+	{FNAME("endpointAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _CallModel[] = {	/* CHOICE; both alternatives are NUL entries flagged SKIP */
+	{FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+	{FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL},
+};
+
+static const struct field_t _AdmissionRequest_destinationInfo[] = {	/* SEQUENCE OF; single SKIP item described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest_destExtraCallInfo[] = {	/* SEQUENCE OF; single SKIP item described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest_srcInfo[] = {	/* SEQUENCE OF; single SKIP item described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _AdmissionRequest[] = {	/* SEQUENCE; destCallSignalAddress and srcCallSignalAddress are DECODE, every entry after srcCallSignalAddress is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType},
+	{FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _AdmissionRequest_destinationInfo},
+	{FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(AdmissionRequest, destCallSignalAddress),
+	 _TransportAddress},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0,
+	 _AdmissionRequest_destExtraCallInfo},
+	{FNAME("srcInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _AdmissionRequest_srcInfo},
+	{FNAME("srcCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT,
+	 offsetof(AdmissionRequest, srcCallSignalAddress), _TransportAddress},
+	{FNAME("bandWidth") INT, CONS, 0, 0, STOP, 0, NULL},
+	{FNAME("callReferenceValue") INT, WORD, 0, 0, STOP, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL},
+	{FNAME("conferenceID") OCTSTR, FIXD, 16, 0, STOP, 0, NULL},
+	{FNAME("activeMC") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("answerCall") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("callIdentifier") SEQ, 0, 1, 1, STOP | EXT, 0, NULL},
+	{FNAME("srcAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("gatewayDataRate") SEQ, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _AdmissionConfirm[] = {	/* SEQUENCE; only destCallSignalAddress is DECODE, every entry after it is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL},
+	{FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel},
+	{FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(AdmissionConfirm, destCallSignalAddress),
+	 _TransportAddress},
+	{FNAME("irrFrequency") INT, WORD, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+	{FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("uuiesRequested") SEQ, 0, 9, 13, STOP | EXT, 0, NULL},
+	{FNAME("language") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("useSpecifiedTransport") CHOICE, 1, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _LocationRequest_destinationInfo[] = {	/* SEQUENCE OF; single SKIP item described by _AliasAddress */
+	{FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress},
+};
+
+static const struct field_t _LocationRequest[] = {	/* SEQUENCE; only replyAddress is DECODE, every entry after it is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0,
+	 _LocationRequest_destinationInfo},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("replyAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(LocationRequest, replyAddress), _TransportAddress},
+	{FNAME("sourceInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0,
+	 NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("hopCount") INT, 8, 1, 0, STOP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+};
+
+static const struct field_t _LocationConfirm[] = {	/* SEQUENCE; callSignalAddress and rasAddress are DECODE, every entry after rasAddress is STOP */
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(LocationConfirm, callSignalAddress), _TransportAddress},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(LocationConfirm, rasAddress), _TransportAddress},
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL},
+	{FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0,
+	 NULL},
+	{FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT,
+	 0, NULL},
+	{FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL},
+	{FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _InfoRequestResponse_callSignalAddress[] = {	/* SEQUENCE OF; DECODE item that passes sizeof(TransportAddress) where other entries pass offsetof() */
+	{FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT,
+	 sizeof(TransportAddress), _TransportAddress}
+	,
+};
+
+static const struct field_t _InfoRequestResponse[] = {	/* SEQUENCE; rasAddress and callSignalAddress are DECODE, every entry after callSignalAddress is STOP */
+	{FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0,
+	 _NonStandardParameter},
+	{FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL},
+	{FNAME("endpointType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType},
+	{FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL},
+	{FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT,
+	 offsetof(InfoRequestResponse, rasAddress), _TransportAddress},
+	{FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE,
+	 offsetof(InfoRequestResponse, callSignalAddress),
+	 _InfoRequestResponse_callSignalAddress},
+	{FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("perCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+	{FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL},
+	{FNAME("needResponse") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL},
+	{FNAME("irrStatus") CHOICE, 2, 4, 4, STOP | EXT | OPT, 0, NULL},
+	{FNAME("unsolicited") BOOL, FIXD, 0, 0, STOP, 0, NULL},
+	{FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL},
+};
+
+static const struct field_t _RasMessage[] = {	/* CHOICE; alternatives that name a sub-table are DECODE, those with a NULL sub-table are STOP */
+	{FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT,
+	 offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest},
+	{FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT,
+	 offsetof(RasMessage, gatekeeperConfirm), _GatekeeperConfirm},
+	{FNAME("gatekeeperReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+	{FNAME("registrationRequest") SEQ, 3, 10, 31, DECODE | EXT,
+	 offsetof(RasMessage, registrationRequest), _RegistrationRequest},
+	{FNAME("registrationConfirm") SEQ, 3, 7, 24, DECODE | EXT,
+	 offsetof(RasMessage, registrationConfirm), _RegistrationConfirm},
+	{FNAME("registrationReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL},
+	{FNAME("unregistrationRequest") SEQ, 3, 5, 15, DECODE | EXT,
+	 offsetof(RasMessage, unregistrationRequest), _UnregistrationRequest},
+	{FNAME("unregistrationConfirm") SEQ, 1, 2, 6, STOP | EXT, 0, NULL},
+	{FNAME("unregistrationReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+	{FNAME("admissionRequest") SEQ, 7, 16, 34, DECODE | EXT,
+	 offsetof(RasMessage, admissionRequest), _AdmissionRequest},
+	{FNAME("admissionConfirm") SEQ, 2, 6, 27, DECODE | EXT,
+	 offsetof(RasMessage, admissionConfirm), _AdmissionConfirm},
+	{FNAME("admissionReject") SEQ, 1, 3, 11, STOP | EXT, 0, NULL},
+	{FNAME("bandwidthRequest") SEQ, 2, 7, 18, STOP | EXT, 0, NULL},
+	{FNAME("bandwidthConfirm") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+	{FNAME("bandwidthReject") SEQ, 1, 4, 9, STOP | EXT, 0, NULL},
+	{FNAME("disengageRequest") SEQ, 1, 6, 19, STOP | EXT, 0, NULL},
+	{FNAME("disengageConfirm") SEQ, 1, 2, 9, STOP | EXT, 0, NULL},
+	{FNAME("disengageReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL},
+	{FNAME("locationRequest") SEQ, 2, 5, 17, DECODE | EXT,
+	 offsetof(RasMessage, locationRequest), _LocationRequest},
+	{FNAME("locationConfirm") SEQ, 1, 4, 19, DECODE | EXT,
+	 offsetof(RasMessage, locationConfirm), _LocationConfirm},
+	{FNAME("locationReject") SEQ, 1, 3, 10, STOP | EXT, 0, NULL},
+	{FNAME("infoRequest") SEQ, 2, 4, 15, STOP | EXT, 0, NULL},
+	{FNAME("infoRequestResponse") SEQ, 3, 8, 16, DECODE | EXT,
+	 offsetof(RasMessage, infoRequestResponse), _InfoRequestResponse},
+	{FNAME("nonStandardMessage") SEQ, 0, 2, 7, STOP | EXT, 0, NULL},
+	{FNAME("unknownMessageResponse") SEQ, 0, 1, 5, STOP | EXT, 0, NULL},
+	{FNAME("requestInProgress") SEQ, 4, 6, 6, STOP | EXT, 0, NULL},
+	{FNAME("resourcesAvailableIndicate") SEQ, 4, 9, 11, STOP | EXT, 0,
+	 NULL},
+	{FNAME("resourcesAvailableConfirm") SEQ, 4, 6, 7, STOP | EXT, 0,
+	 NULL},
+	{FNAME("infoRequestAck") SEQ, 4, 5, 5, STOP | EXT, 0, NULL},
+	{FNAME("infoRequestNak") SEQ, 5, 7, 7, STOP | EXT, 0, NULL},
+	{FNAME("serviceControlIndication") SEQ, 8, 10, 10, STOP | EXT, 0,
+	 NULL},
+	{FNAME("serviceControlResponse") SEQ, 7, 8, 8, STOP | EXT, 0, NULL},
+};
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
new file mode 100644
index 00000000..1bdfea35
--- /dev/null
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -0,0 +1,288 @@
+/* Helper handling for netfilter. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/random.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static DEFINE_MUTEX(nf_ct_helper_mutex);
+static struct hlist_head *nf_ct_helper_hash __read_mostly;
+static unsigned int nf_ct_helper_hsize __read_mostly;
+static unsigned int nf_ct_helper_count __read_mostly;
+
+
+/* Simple hash; collision free for the in-kernel helpers' default tuples. */
+static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
+{
+	unsigned int key = (tuple->src.l3num << 8) | tuple->dst.protonum;
+
+	return (key ^ (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
+}
+
+/* Return the registered helper whose tuple matches @tuple (src.u.all is
+ * compared with every bit of the mask set), or NULL if there is none. */
+static struct nf_conntrack_helper *
+__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *pos;
+
+	if (nf_ct_helper_count == 0)
+		return NULL;	/* nothing registered, skip the hash walk */
+
+	hlist_for_each_entry_rcu(cur, pos,
+				 &nf_ct_helper_hash[helper_hash(tuple)], hnode)
+		if (nf_ct_tuple_src_mask_cmp(tuple, &cur->tuple, &mask))
+			return cur;
+	return NULL;
+}
+
+/* Scan every bucket for a helper matching @name, @l3num and @protonum. */
+struct nf_conntrack_helper *
+__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
+{
+	struct nf_conntrack_helper *cur;
+	struct hlist_node *pos;
+	unsigned int bucket;
+
+	for (bucket = 0; bucket < nf_ct_helper_hsize; bucket++)
+		hlist_for_each_entry_rcu(cur, pos,
+					 &nf_ct_helper_hash[bucket], hnode)
+			if (cur->tuple.src.l3num == l3num &&
+			    cur->tuple.dst.protonum == protonum &&
+			    strcmp(cur->name, name) == 0)
+				return cur;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find);
+
+/* Find a helper by name (requesting module "nfct-helper-<name>" on a miss
+ * when CONFIG_MODULES is set) and pin its module; NULL if either fails. */
+struct nf_conntrack_helper *
+nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
+{
+	struct nf_conntrack_helper *found;
+
+	found = __nf_conntrack_helper_find(name, l3num, protonum);
+#ifdef CONFIG_MODULES
+	if (!found && request_module("nfct-helper-%s", name) == 0)
+		found = __nf_conntrack_helper_find(name, l3num, protonum);
+#endif
+	if (!found)
+		return NULL;
+
+	return try_module_get(found->me) ? found : NULL;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
+
+/* Attach the helper extension to @ct; returns it, or NULL on failure. */
+struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
+{
+	struct nf_conn_help *help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp);
+
+	if (help)
+		INIT_HLIST_HEAD(&help->expectations);
+	else	/* message needs its own '\n' to end the log line */
+		pr_debug("failed to add helper extension area\n");
+	return help;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
+
+/* Pick a helper for @ct: @tmpl's if it carries one, else the one registered
+ * for @ct's reply tuple. With none found, an existing extension has its
+ * helper cleared. Returns 0, or -ENOMEM if the extension cannot be added. */
+int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
+			      gfp_t flags)
+{
+	struct nf_conntrack_helper *helper = NULL;
+	struct nf_conn_help *help;
+
+	if (tmpl != NULL) {
+		help = nfct_help(tmpl);
+		if (help != NULL)
+			helper = help->helper;
+	}
+
+	help = nfct_help(ct);
+	if (helper == NULL)
+		helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	if (helper == NULL) {
+		if (help)
+			rcu_assign_pointer(help->helper, NULL);
+		return 0;
+	}
+
+	if (help) {
+		/* existing extension: zero its help area before reuse */
+		memset(&help->help, 0, sizeof(help->help));
+	} else {
+		help = nf_ct_helper_ext_add(ct, flags);
+		if (help == NULL)
+			return -ENOMEM;
+	}
+
+	rcu_assign_pointer(help->helper, helper);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
+
+/* Clear @me from @i's conntrack, raising IPCT_HELPER first; returns 0. */
+static inline int unhelp(struct nf_conntrack_tuple_hash *i,
+			 const struct nf_conntrack_helper *me)
+{
+	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
+	struct nf_conn_help *help = nfct_help(ct);
+
+	if (!help || rcu_dereference_protected(help->helper,
+			lockdep_is_held(&nf_conntrack_lock)) != me)
+		return 0;
+
+	nf_conntrack_event(IPCT_HELPER, ct);
+	rcu_assign_pointer(help->helper, NULL);
+	return 0;
+}
+
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_helper *hlp;
+
+	if (!help)
+		return;		/* @ct carries no helper extension */
+	rcu_read_lock();
+	hlp = rcu_dereference(help->helper);
+	if (hlp && hlp->destroy)	/* the destroy hook is optional */
+		hlp->destroy(ct);
+	rcu_read_unlock();
+}
+
+/* Hash @me under nf_ct_helper_mutex after sanity BUG_ON()s; returns 0. */
+int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
+{
+	BUG_ON(me->expect_policy == NULL);
+	BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
+	BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
+
+	mutex_lock(&nf_ct_helper_mutex);
+	hlist_add_head_rcu(&me->hnode,
+			   &nf_ct_helper_hash[helper_hash(&me->tuple)]);
+	nf_ct_helper_count++;
+	mutex_unlock(&nf_ct_helper_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_register);
+
+/* Purge helper @me from @net: expectations tied to it are unlinked, then
+ * every unconfirmed or hashed conntrack using it gets its helper cleared. */
+static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
+					     struct net *net)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_expect *exp;
+	const struct hlist_node *n, *next;
+	const struct hlist_nulls_node *nn;
+	unsigned int i;
+
+	/* Expectations whose master uses @me, or that name @me themselves. */
+	for (i = 0; i < nf_ct_expect_hsize; i++)
+		hlist_for_each_entry_safe(exp, n, next,
+					  &net->ct.expect_hash[i], hnode) {
+			struct nf_conn_help *help = nfct_help(exp->master);
+			if ((rcu_dereference_protected(
+					help->helper,
+					lockdep_is_held(&nf_conntrack_lock)
+					) == me || exp->helper == me) &&
+			    del_timer(&exp->timeout)) {
+				nf_ct_unlink_expect(exp);
+				nf_ct_expect_put(exp);
+			}
+		}
+
+	/* Conntracks: the unconfirmed list first, then each hash bucket. */
+	hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
+		unhelp(h, me);
+	for (i = 0; i < net->ct.htable_size; i++)
+		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+			unhelp(h, me);
+}
+
+void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+{
+	struct net *net;
+
+	mutex_lock(&nf_ct_helper_mutex);
+	hlist_del_rcu(&me->hnode);	/* take @me out of the helper hash */
+	nf_ct_helper_count--;
+	mutex_unlock(&nf_ct_helper_mutex);
+
+	/* Make sure nothing is still using the helper unless it's a
+	 * connection in the hash.
+	 */
+	synchronize_rcu();
+
+	rtnl_lock();
+	spin_lock_bh(&nf_conntrack_lock);
+	for_each_net(net)		/* detach @me in every namespace */
+		__nf_conntrack_helper_unregister(me, net);
+	spin_unlock_bh(&nf_conntrack_lock);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
+
+static struct nf_ct_ext_type helper_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_help),
+	.align	= __alignof__(struct nf_conn_help),
+	.id	= NF_CT_EXT_HELPER,	/* id nf_ct_helper_ext_add() asks for */
+};
+
+/* Allocate the helper hash table and register the helper extension type.
+ * Returns 0, -ENOMEM, or the error from nf_ct_extend_register(). */
+int nf_conntrack_helper_init(void)
+{
+	int err;
+
+	nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
+	nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
+	if (nf_ct_helper_hash == NULL)
+		return -ENOMEM;
+
+	err = nf_ct_extend_register(&helper_extend);
+	if (err >= 0)
+		return 0;
+
+	/* registration failed: release the table allocated above */
+	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+	return err;
+}
+
+void nf_conntrack_helper_fini(void)
+{
+	nf_ct_extend_unregister(&helper_extend);	/* reverse of init */
+	nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
new file mode 100644
index 00000000..4f9390b9
--- /dev/null
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -0,0 +1,291 @@
+/* IRC extension for IP connection tracking, Version 1.21
+ * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
+ * based on RR's ip_conntrack_ftp.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_irc.h>
+
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;		/* number of entries used in ports[] */
+static unsigned int max_dcc_channels = 8;
+static unsigned int dcc_timeout __read_mostly = 300;
+/* This is slow, but it's simple. --RR */
+static char *irc_buffer;		/* 64K, kmalloc'ed by module init */
+static DEFINE_SPINLOCK(irc_buffer_lock);	/* taken by help() */
+
+unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
+				enum ip_conntrack_info ctinfo,
+				unsigned int matchoff,
+				unsigned int matchlen,
+				struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_irc");
+MODULE_ALIAS_NFCT_HELPER("irc");
+
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of IRC servers");
+module_param(max_dcc_channels, uint, 0400);
+MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per "
+				   "IRC session");
+module_param(dcc_timeout, uint, 0400);
+MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels");
+
+static const char *const dccprotos[] = {
+	"SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT "
+};
+
+#define MINMATCHLEN	5
+
+/* tries to get the ip_addr and port out of a dcc command
+ * return value: -1 on failure, 0 on success
+ *	data		pointer to first byte of DCC command data
+ *	data_end	pointer just past the last byte of dcc command data
+ *	ip		returns parsed ip of dcc command
+ *	port		returns parsed port of dcc command
+ *	ad_beg_p	returns pointer to first byte of addr data
+ *	ad_end_p	returns pointer just past the parsed addr/port text
+ */
+static int parse_dcc(char *data, const char *data_end, __be32 *ip,
+		     u_int16_t *port, char **ad_beg_p, char **ad_end_p)
+{
+	char *tmp;
+
+	/* at least 12: "AAAAAAAA P\1\n" */
+	while (*data++ != ' ')
+		if (data > data_end - 12)
+			return -1;
+
+	/* A newline must exist before data_end (never dereferenced itself):
+	 * simple_strtoul parses until the first invalid character. */
+	for (tmp = data; tmp < data_end; tmp++)
+		if (*tmp == '\n')
+			break;
+	if (tmp >= data_end)
+		return -1;
+
+	*ad_beg_p = data;
+	*ip = cpu_to_be32(simple_strtoul(data, &data, 10));
+
+	/* skip blanks between ip and port */
+	while (*data == ' ') {
+		if (data >= data_end)
+			return -1;
+		data++;
+	}
+
+	*port = simple_strtoul(data, &data, 10);
+	*ad_end_p = data;
+
+	return 0;
+}
+
+static int help(struct sk_buff *skb, unsigned int protoff,
+		struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff;
+	const struct iphdr *iph;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	const char *data_limit;
+	char *data, *ib_ptr;
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple *tuple;
+	__be32 dcc_ip;
+	u_int16_t dcc_port;
+	__be16 port;
+	int i, ret = NF_ACCEPT;
+	char *addr_beg_p, *addr_end_p;
+	typeof(nf_nat_irc_hook) nf_nat_irc;
+
+	/* Scan for DCC commands, but never in packets from the IRC server */
+	if (dir == IP_CT_DIR_REPLY)
+		return NF_ACCEPT;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	/* Not a full tcp header? */
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return NF_ACCEPT;
+
+	/* No data? */
+	dataoff = protoff + th->doff*4;
+	if (dataoff >= skb->len)
+		return NF_ACCEPT;
+
+	spin_lock_bh(&irc_buffer_lock);	/* held until the out: label */
+	ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+				    irc_buffer);
+	BUG_ON(ib_ptr == NULL);
+
+	data = ib_ptr;
+	data_limit = ib_ptr + skb->len - dataoff;	/* one past the end */
+
+	/* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
+	 * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
+	while (data < data_limit - (19 + MINMATCHLEN)) {
+		if (memcmp(data, "\1DCC ", 5)) {
+			data++;
+			continue;
+		}
+		data += 5;
+		/* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+
+		iph = ip_hdr(skb);
+		pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
+			 &iph->saddr, ntohs(th->source),
+			 &iph->daddr, ntohs(th->dest));
+
+		for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
+			if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
+				/* no match */
+				continue;
+			}
+			data += strlen(dccprotos[i]);
+			pr_debug("DCC %s detected\n", dccprotos[i]);
+
+			/* we have at least
+			 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+			 * data left (== 14/13 bytes) */
+			if (parse_dcc(data, data_limit, &dcc_ip,
+				       &dcc_port, &addr_beg_p, &addr_end_p)) {
+				pr_debug("unable to parse dcc command\n");
+				continue;
+			}
+
+			pr_debug("DCC bound ip/port: %pI4:%u\n",
+				 &dcc_ip, dcc_port);
+
+			/* dcc_ip can be the internal OR external (NAT'ed) IP */
+			tuple = &ct->tuplehash[dir].tuple;
+			if (tuple->src.u3.ip != dcc_ip &&
+			    tuple->dst.u3.ip != dcc_ip) {
+				if (net_ratelimit())
+					printk(KERN_WARNING
+						"Forged DCC command from %pI4: %pI4:%u\n",
+						&tuple->src.u3.ip,
+						&dcc_ip, dcc_port);
+				continue;
+			}
+
+			exp = nf_ct_expect_alloc(ct);
+			if (exp == NULL) {
+				ret = NF_DROP;
+				goto out;
+			}
+			tuple = &ct->tuplehash[!dir].tuple;
+			port = htons(dcc_port);
+			nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+					  tuple->src.l3num,
+					  NULL, &tuple->dst.u3,
+					  IPPROTO_TCP, NULL, &port);
+
+			nf_nat_irc = rcu_dereference(nf_nat_irc_hook);
+			if (nf_nat_irc && ct->status & IPS_NAT_MASK)
+				ret = nf_nat_irc(skb, ctinfo,
+						 addr_beg_p - ib_ptr,
+						 addr_end_p - addr_beg_p,
+						 exp);
+			else if (nf_ct_expect_related(exp) != 0)
+				ret = NF_DROP;
+			nf_ct_expect_put(exp);
+			goto out;	/* stop after the first accepted DCC */
+		}
+	}
+ out:
+	spin_unlock_bh(&irc_buffer_lock);
+	return ret;
+}
+
+static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
+static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly;
+static struct nf_conntrack_expect_policy irc_exp_policy;
+
+static void nf_conntrack_irc_fini(void);
+
+/* Module init: register one helper per configured port (IRC_PORT if none). */
+static int __init nf_conntrack_irc_init(void)
+{
+	int i, ret;
+
+	if (max_dcc_channels < 1) {
+		printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n");
+		return -EINVAL;
+	}
+
+	irc_exp_policy.max_expected = max_dcc_channels;
+	irc_exp_policy.timeout = dcc_timeout;
+
+	irc_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!irc_buffer)
+		return -ENOMEM;
+
+	/* If no port given, default to standard irc port */
+	if (ports_c == 0)
+		ports[ports_c++] = IRC_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		irc[i].tuple.src.l3num = AF_INET;
+		irc[i].tuple.src.u.tcp.port = htons(ports[i]);
+		irc[i].tuple.dst.protonum = IPPROTO_TCP;
+		irc[i].expect_policy = &irc_exp_policy;
+		irc[i].me = THIS_MODULE;
+		irc[i].help = help;
+
+		if (ports[i] == IRC_PORT)
+			sprintf(irc_names[i], "irc");
+		else
+			sprintf(irc_names[i], "irc-%u", i);
+		irc[i].name = irc_names[i];
+
+		ret = nf_conntrack_helper_register(&irc[i]);
+		if (ret) {
+			printk(KERN_ERR "nf_ct_irc: failed to register helper "
+			       "for pf: %u port: %u\n",
+			       irc[i].tuple.src.l3num, ports[i]);
+			ports_c = i;	/* fini: registered ones only */
+			nf_conntrack_irc_fini();
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/* Unregister the first ports_c helpers and free irc_buffer. Deliberately
+ * _NOT_ __exit: the init function calls it on its error path. */
+static void nf_conntrack_irc_fini(void)
+{
+	int idx = 0;
+
+	while (idx < ports_c)
+		nf_conntrack_helper_unregister(&irc[idx++]);
+	kfree(irc_buffer);
+}
+
+module_init(nf_conntrack_irc_init);
+module_exit(nf_conntrack_irc_fini);
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
new file mode 100644
index 00000000..e7eb807f
--- /dev/null
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -0,0 +1,74 @@
+/*
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * Based largely upon the original ip_conntrack code which
+ * had the following copyright information:
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ *	Yasuyuki Kozakai @USAGI	<yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+
+/* Zero @tuple's src and dst u3; @skb and @nhoff are unused. Always true. */
+static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+				 struct nf_conntrack_tuple *tuple)
+{
+	memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
+	memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
+	return true;
+}
+
+/* Zero @tuple's src and dst u3; @orig is not consulted. Always true. */
+static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+				 const struct nf_conntrack_tuple *orig)
+{
+	memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
+	memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
+	return true;
+}
+
+static int generic_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{
+	return 0;	/* nothing is written to @s */
+}
+
+static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			       unsigned int *dataoff, u_int8_t *protonum)
+{
+	/* Never track !!! (@dataoff and @protonum are left unset) */
+	return -NF_ACCEPT;
+}
+
+/* Generic L3 descriptor (PF_UNSPEC, "unknown") wiring up the stubs above. */
+struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
+	.l3proto	 = PF_UNSPEC,
+	.name		 = "unknown",
+	.pkt_to_tuple	 = generic_pkt_to_tuple,
+	.invert_tuple	 = generic_invert_tuple,
+	.print_tuple	 = generic_print_tuple,
+	.get_l4proto	 = generic_get_l4proto,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
new file mode 100644
index 00000000..4c8f30a3
--- /dev/null
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -0,0 +1,71 @@
+/*
+ *      NetBIOS name service broadcast connection tracking helper
+ *
+ *      (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+/*
+ *      This helper tracks locally originating NetBIOS name service
+ *      requests by issuing permanent expectations (valid until
+ *      timing out) matching all reply connections from the
+ *      destination network. The only NetBIOS specific thing is
+ *      actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define NMBD_PORT	137
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_netbios_ns");
+MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+
+static unsigned int timeout __read_mostly = 3;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+static struct nf_conntrack_expect_policy exp_policy = {
+	.max_expected	= 1,	/* .timeout is filled in by init */
+};
+
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+		   struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{	/* forwards to the broadcast helper with the module's timeout */
+	return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
+static struct nf_conntrack_helper helper __read_mostly = {
+	.name			= "netbios-ns",
+	.tuple.src.l3num	= NFPROTO_IPV4,
+	.tuple.src.u.udp.port	= cpu_to_be16(NMBD_PORT),	/* 137 */
+	.tuple.dst.protonum	= IPPROTO_UDP,
+	.me			= THIS_MODULE,
+	.help			= netbios_ns_help,
+	.expect_policy		= &exp_policy,	/* timeout set at init */
+};
+
+static int __init nf_conntrack_netbios_ns_init(void)
+{
+	exp_policy.timeout = timeout;	/* module parameter, in seconds */
+	return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_netbios_ns_fini(void)
+{
+	nf_conntrack_helper_unregister(&helper);	/* undo init */
+}
+
+module_init(nf_conntrack_netbios_ns_init);
+module_exit(nf_conntrack_netbios_ns_fini);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
new file mode 100644
index 00000000..482e90c6
--- /dev/null
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -0,0 +1,2226 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005-2008 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Initial connection tracking via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_protocol.h>
+#endif
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.93";
+
+/* Emit a nested CTA_TUPLE_PROTO attribute: protocol number plus whatever
+ * the L4 tracker's tuple_to_nlattr adds. Returns that hook's result (0 if
+ * it has none), or -1 if an attribute cannot be added. */
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+			    const struct nf_conntrack_tuple *tuple,
+			    struct nf_conntrack_l4proto *l4proto)
+{
+	struct nlattr *nest;
+	int ret = 0;
+
+	nest = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+	NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum);
+	if (likely(l4proto->tuple_to_nlattr))
+		ret = l4proto->tuple_to_nlattr(skb, tuple);
+	nla_nest_end(skb, nest);
+	return ret;
+
+nla_put_failure:
+	return -1;
+}
+
+/* Emit a nested CTA_TUPLE_IP attribute filled by the L3 tracker's
+ * tuple_to_nlattr hook. Returns the hook's result (0 without one), or -1
+ * if the nest cannot be started. */
+static inline int
+ctnetlink_dump_tuples_ip(struct sk_buff *skb,
+			 const struct nf_conntrack_tuple *tuple,
+			 struct nf_conntrack_l3proto *l3proto)
+{
+	struct nlattr *nest;
+	int ret = 0;
+
+	nest = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+
+	if (likely(l3proto->tuple_to_nlattr))
+		ret = l3proto->tuple_to_nlattr(skb, tuple);
+
+	nla_nest_end(skb, nest);
+
+	return ret;
+}
+
+/* Dump @tuple as its IP part followed by its protocol part; a negative
+ * result from the IP part is returned at once. */
+static int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	struct nf_conntrack_l3proto *l3proto;
+	struct nf_conntrack_l4proto *l4proto;
+	int err;
+
+	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+	err = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
+
+	if (unlikely(err < 0))
+		return err;
+
+	l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+	return ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
+}
+
+/* Put ct->status, via htonl(), as CTA_STATUS; returns 0 or -1. */
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status));
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
+/* Put the remaining timeout of @ct, in seconds, as CTA_TIMEOUT; 0 or -1. */
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	/* signed math: an expired timer must go negative, not wrap */
+	long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ;
+
+	if (timeout < 0)
+		timeout = 0;
+	NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout));
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
+/* Emit a nested CTA_PROTOINFO attribute via the L4 tracker's to_nlattr hook.
+ * Returns 0 if there is no hook, -1 if the nest cannot be started, else
+ * the hook's result. */
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+{
+	struct nf_conntrack_l4proto *l4proto;
+	struct nlattr *nest;
+	int ret;
+
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	if (l4proto->to_nlattr == NULL)
+		return 0;
+
+	nest = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+
+	ret = l4proto->to_nlattr(skb, nest, ct);
+
+	nla_nest_end(skb, nest);
+
+	return ret;
+}
+
+/* Emit a nested CTA_HELP attribute naming @ct's helper, plus whatever the
+ * helper's to_nlattr hook adds (its result is ignored). Returns 0, also
+ * when @ct has no helper, or -1 if an attribute cannot be added. */
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	const struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_helper *helper;
+	struct nlattr *nest;
+
+	if (help == NULL)
+		return 0;
+
+	helper = rcu_dereference(help->helper);
+	if (helper == NULL)
+		return 0;
+
+	nest = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+	NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name);
+	if (helper->to_nlattr)
+		helper->to_nlattr(skb, ct);
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/* Emit the packet and byte counters of direction @dir in a nested
+ * CTA_COUNTERS_ORIG/REPLY attribute. Returns 0 (also when @ct has no
+ * accounting data) or -1 if an attribute cannot be added. */
+static int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct,
+			enum ip_conntrack_dir dir)
+{
+	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+	const struct nf_conn_counter *acct = nf_conn_acct_find(ct);
+	struct nlattr *nest;
+
+	if (acct == NULL)
+		return 0;
+
+	nest = nla_nest_start(skb, type | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+
+	NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS,
+		     cpu_to_be64(acct[dir].packets));
+	NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES,
+		     cpu_to_be64(acct[dir].bytes));
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/* Emit a nested CTA_TIMESTAMP attribute with the start time and, once it
+ * is non-zero, the stop time. Returns 0 (also when @ct has no timestamp
+ * extension) or -1 if an attribute cannot be added. */
+static int
+ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	const struct nf_conn_tstamp *tstamp = nf_conn_tstamp_find(ct);
+	struct nlattr *nest;
+
+	if (tstamp == NULL)
+		return 0;
+
+	nest = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+
+	NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start));
+	if (tstamp->stop != 0)
+		NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP,
+			     cpu_to_be64(tstamp->stop));
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+/* Put ct->mark, via htonl(), as CTA_MARK; returns 0 or -1. */
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark));
+	return 0;
+nla_put_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+/* Emit the security context string of ct->secmark in a nested CTA_SECCTX
+ * attribute. Returns 0 (also when the secid cannot be translated) or -1 if
+ * an attribute cannot be added; a translated context is always released. */
+static inline int
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nlattr *nest;
+	char *secctx;
+	int len;
+	int ret = -1;
+
+	if (security_secid_to_secctx(ct->secmark, &secctx, &len))
+		return 0;
+
+	nest = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+	if (!nest)
+		goto nla_put_failure;
+	NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+	nla_nest_end(skb, nest);
+	ret = 0;
+nla_put_failure:
+	security_release_secctx(secctx, len);
+	return ret;
+}
+#else
+#define ctnetlink_dump_secctx(a, b) (0)
+#endif
+
+#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
+
+/* For a conntrack with IPS_EXPECTED set, emit its master's original-direction
+ * tuple in a nested CTA_TUPLE_MASTER attribute. Returns 0 (also when the
+ * flag is clear), or -1 if the nest or the tuple cannot be added. */
+static inline int
+ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nlattr *nest;
+
+	if ((ct->status & IPS_EXPECTED) == 0)
+		return 0;
+
+	nest = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+	if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0)
+		return -1;
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Emit @natseq (correction position, offset before, offset after) inside a
+ * nested attribute of type @type. Returns 0, or -1 if one cannot be added. */
+static int
+dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
+{
+	struct nlattr *nest = nla_nest_start(skb, type | NLA_F_NESTED);
+
+	if (nest == NULL)
+		return -1;
+
+	NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS,
+		     htonl(natseq->correction_pos));
+	NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE,
+		     htonl(natseq->offset_before));
+	NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER,
+		     htonl(natseq->offset_after));
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/* When IPS_SEQ_ADJUST is set and @ct has a NAT extension, dump the sequence
+ * adjustment of both directions, original first. Returns 0 or -1. */
+static inline int
+ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+
+	if (nat == NULL || (ct->status & IPS_SEQ_ADJUST) == 0)
+		return 0;
+
+	if (dump_nat_seq_adj(skb, &nat->seq[IP_CT_DIR_ORIGINAL],
+			     CTA_NAT_SEQ_ADJ_ORIG) == -1)
+		return -1;
+	if (dump_nat_seq_adj(skb, &nat->seq[IP_CT_DIR_REPLY],
+			     CTA_NAT_SEQ_ADJ_REPLY) == -1)
+		return -1;
+
+	return 0;
+}
+#else
+#define ctnetlink_dump_nat_seq_adj(a, b) (0)
+#endif
+
+/* Put the address of @ct, via htonl(), as CTA_ID; returns 0 or -1. */
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct));
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
+/* Put the current ct_general.use count as CTA_USE; returns 0 or -1. */
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)));
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
+/* Build one conntrack netlink message for @ct in @skb: header, both tuples,
+ * the zone if non-zero, then each ctnetlink_dump_*() attribute in turn.
+ * Returns skb->len, or -1 after cancelling the partially built message. */
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, struct nf_conn *ct)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nlattr *nest;
+	unsigned int flags = pid ? NLM_F_MULTI : 0;
+
+	event |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = nf_ct_l3num(ct);
+	nfmsg->version      = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	nest = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+	if (!nest ||
+	    ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+
+	nest = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+	if (!nest ||
+	    ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+
+	if (nf_ct_zone(ct))
+		NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct)));
+
+	if (ctnetlink_dump_status(skb, ct) < 0 ||
+	    ctnetlink_dump_timeout(skb, ct) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_timestamp(skb, ct) < 0 ||
+	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_mark(skb, ct) < 0 ||
+	    ctnetlink_dump_secctx(skb, ct) < 0 ||
+	    ctnetlink_dump_id(skb, ct) < 0 ||
+	    ctnetlink_dump_use(skb, ct) < 0 ||
+	    ctnetlink_dump_master(skb, ct) < 0 ||
+	    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+/*
+ * Sum of the nla_size values advertised by the L3 and L4 protocol
+ * trackers of @ct.  Both lookups are done under rcu_read_lock().
+ */
+static inline size_t
+ctnetlink_proto_size(const struct nf_conn *ct)
+{
+	size_t total;
+
+	rcu_read_lock();
+	total = __nf_ct_l3proto_find(nf_ct_l3num(ct))->nla_size;
+	total += __nf_ct_l4proto_find(nf_ct_l3num(ct),
+				      nf_ct_protonum(ct))->nla_size;
+	rcu_read_unlock();
+
+	return total;
+}
+
+/*
+ * Space needed for the per-direction counter attributes of @ct, or 0
+ * when the conntrack carries no NF_CT_EXT_ACCT extension.
+ */
+static inline size_t
+ctnetlink_counters_size(const struct nf_conn *ct)
+{
+	if (nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
+		/* two nests (ORIG|REPL), each with PACKETS and BYTES */
+		return 2 * nla_total_size(0)
+		       + 4 * nla_total_size(sizeof(uint64_t));
+
+	return 0;
+}
+
+/*
+ * Space needed for the CTA_SECCTX nest of @ct.  Returns 0 when secmark
+ * support is compiled out or the secid cannot be converted to a context.
+ */
+static inline int
+ctnetlink_secctx_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+	int len;
+
+	/* a NULL buffer is passed: only the length is collected here */
+	if (security_secid_to_secctx(ct->secmark, NULL, &len) != 0)
+		return 0;
+
+	/* CTA_SECCTX nest header plus the CTA_SECCTX_NAME string */
+	return nla_total_size(0) + nla_total_size(sizeof(char) * len);
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Space needed for the timestamp nest of @ct: one nest header plus two
+ * 64-bit values.  Returns 0 when timestamping is compiled out or the
+ * conntrack has no NF_CT_EXT_TSTAMP extension.
+ */
+static inline size_t
+ctnetlink_timestamp_size(const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+	if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+		return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
+#endif
+	return 0;
+}
+
+/*
+ * Upper bound on the payload size of an event message for @ct.  It is
+ * used to size the skb allocated in ctnetlink_conntrack_event(), so
+ * every attribute that function may emit has to be accounted for here.
+ *
+ * CTA_ZONE is included: the event path emits it for conntracks in a
+ * non-zero zone, and leaving it out could make the message overflow
+ * the allocated skb and the event be dropped.
+ */
+static inline size_t
+ctnetlink_nlmsg_size(const struct nf_conn *ct)
+{
+	return NLMSG_ALIGN(sizeof(struct nfgenmsg))
+	       + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */
+	       + 3 * nla_total_size(0) /* CTA_TUPLE_IP */
+	       + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */
+	       + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */
+	       + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
+	       + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
+	       + ctnetlink_counters_size(ct)
+	       + ctnetlink_timestamp_size(ct)
+	       + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
+	       + nla_total_size(0) /* CTA_PROTOINFO */
+	       + nla_total_size(0) /* CTA_HELP */
+	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+	       + ctnetlink_secctx_size(ct)
+#ifdef CONFIG_NF_NAT_NEEDED
+	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+	       + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	       + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */
+#endif
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	       + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */
+#endif
+	       + ctnetlink_proto_size(ct)
+	       ;
+}
+
+/*
+ * Conntrack event handler: translate the @events bitmask for item->ct
+ * into a netlink message and send it to the matching multicast group.
+ *
+ * Returns 0 when nothing had to be sent or the message was handed to
+ * nfnetlink_send() without an -ENOBUFS/-EAGAIN result; returns -ENOBUFS
+ * when sending reported one of those errors, or when building the
+ * message failed and nfnetlink_set_err() returned a positive value.
+ */
+static int
+ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+{
+	struct net *net;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nlattr *nest_parms;
+	struct nf_conn *ct = item->ct;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned int flags = 0, group;
+	int err;
+
+	/* ignore our fake conntrack entry */
+	if (nf_ct_is_untracked(ct))
+		return 0;
+
+	/*
+	 * Pick message type and multicast group.  DESTROY takes precedence
+	 * over NEW/RELATED; any other non-empty event set is reported as an
+	 * update, which reuses the NEW message type without CREATE|EXCL.
+	 */
+	if (events & (1 << IPCT_DESTROY)) {
+		type = IPCTNL_MSG_CT_DELETE;
+		group = NFNLGRP_CONNTRACK_DESTROY;
+	} else  if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
+		type = IPCTNL_MSG_CT_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+		group = NFNLGRP_CONNTRACK_NEW;
+	} else  if (events) {
+		type = IPCTNL_MSG_CT_NEW;
+		group = NFNLGRP_CONNTRACK_UPDATE;
+	} else
+		return 0;
+
+	/* nothing to do unless a report was requested or someone listens */
+	net = nf_ct_net(ct);
+	if (!item->report && !nfnetlink_has_listeners(net, group))
+		return 0;
+
+	skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	type |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = nf_ct_l3num(ct);
+	nfmsg->version	= NFNETLINK_V0;
+	nfmsg->res_id	= 0;
+
+	/* every exit below this point must go through rcu_read_unlock() */
+	rcu_read_lock();
+	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+	if (!nest_parms)
+		goto nla_put_failure;
+	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest_parms);
+
+	nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+	if (!nest_parms)
+		goto nla_put_failure;
+	if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest_parms);
+
+	/* the zone attribute is only emitted for a non-zero zone */
+	if (nf_ct_zone(ct))
+		NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct)));
+
+	if (ctnetlink_dump_id(skb, ct) < 0)
+		goto nla_put_failure;
+
+	if (ctnetlink_dump_status(skb, ct) < 0)
+		goto nla_put_failure;
+
+	if (events & (1 << IPCT_DESTROY)) {
+		/* destroy events carry the counters and timestamps */
+		if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+		    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+		    ctnetlink_dump_timestamp(skb, ct) < 0)
+			goto nla_put_failure;
+	} else {
+		/* new/update events carry the timeout plus whatever changed */
+		if (ctnetlink_dump_timeout(skb, ct) < 0)
+			goto nla_put_failure;
+
+		if (events & (1 << IPCT_PROTOINFO)
+		    && ctnetlink_dump_protoinfo(skb, ct) < 0)
+			goto nla_put_failure;
+
+		/* helper info is also sent whenever a helper extension exists */
+		if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
+		    && ctnetlink_dump_helpinfo(skb, ct) < 0)
+			goto nla_put_failure;
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+		if ((events & (1 << IPCT_SECMARK) || ct->secmark)
+		    && ctnetlink_dump_secctx(skb, ct) < 0)
+			goto nla_put_failure;
+#endif
+
+		if (events & (1 << IPCT_RELATED) &&
+		    ctnetlink_dump_master(skb, ct) < 0)
+			goto nla_put_failure;
+
+		if (events & (1 << IPCT_NATSEQADJ) &&
+		    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+			goto nla_put_failure;
+	}
+
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	/* the mark is sent for every event type when set or changed */
+	if ((events & (1 << IPCT_MARK) || ct->mark)
+	    && ctnetlink_dump_mark(skb, ct) < 0)
+		goto nla_put_failure;
+#endif
+	rcu_read_unlock();
+
+	nlmsg_end(skb, nlh);
+	err = nfnetlink_send(skb, net, item->pid, group, item->report,
+			     GFP_ATOMIC);
+	/* only these two send errors are reported back to the caller */
+	if (err == -ENOBUFS || err == -EAGAIN)
+		return -ENOBUFS;
+
+	return 0;
+
+nla_put_failure:
+	rcu_read_unlock();
+	nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+	kfree_skb(skb);
+errout:
+	/* flag the lost event on the group's listeners */
+	if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
+		return -ENOBUFS;
+
+	return 0;
+}
+#endif /* CONFIG_NF_CONNTRACK_EVENTS */
+
+/*
+ * Dump completion callback: drop the reference on the conntrack that
+ * ctnetlink_dump_table() left in cb->args[1], if there is one.
+ */
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+	struct nf_conn *pending = (struct nf_conn *)cb->args[1];
+
+	if (pending != NULL)
+		nf_ct_put(pending);
+
+	return 0;
+}
+
+/*
+ * Netlink dump callback: walk the conntrack hash table of the socket's
+ * network namespace and append one message per conntrack to @skb.
+ *
+ * Resume state lives in @cb:
+ *   cb->args[0]  index of the hash bucket currently being walked
+ *   cb->args[1]  conntrack that did not fit into the previous skb (a
+ *                reference is held on it), or 0
+ *
+ * Returns skb->len.
+ */
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nf_conn *ct, *last;
+	struct nf_conntrack_tuple_hash *h;
+	struct hlist_nulls_node *n;
+	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	u_int8_t l3proto = nfmsg->nfgen_family;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	last = (struct nf_conn *)cb->args[1];
+	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
+restart:
+		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
+					 hnnode) {
+			/* each conntrack is hashed twice; dump it only once */
+			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			/* Dump entries of a given L3 protocol number.
+			 * If it is not specified, ie. l3proto == 0,
+			 * then dump everything. */
+			if (l3proto && nf_ct_l3num(ct) != l3proto)
+				continue;
+			/* when resuming, skip entries until the saved one */
+			if (cb->args[1]) {
+				if (ct != last)
+					continue;
+				cb->args[1] = 0;
+			}
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+						cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW, ct) < 0) {
+				/* skb is full: remember this entry, holding
+				 * a reference until the next call or
+				 * ctnetlink_done() drops it */
+				nf_conntrack_get(&ct->ct_general);
+				cb->args[1] = (unsigned long)ct;
+				goto out;
+			}
+
+			/* GET_CTRZERO: reset the counters after dumping */
+			if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) ==
+						IPCTNL_MSG_CT_GET_CTRZERO) {
+				struct nf_conn_counter *acct;
+
+				acct = nf_conn_acct_find(ct);
+				if (acct)
+					memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]));
+			}
+		}
+		/* the saved entry was not found in this bucket: forget it
+		 * and walk the same bucket again from its start */
+		if (cb->args[1]) {
+			cb->args[1] = 0;
+			goto restart;
+		}
+	}
+out:
+	spin_unlock_bh(&nf_conntrack_lock);
+	/* drop the reference taken when the previous call stopped */
+	if (last)
+		nf_ct_put(last);
+
+	return skb->len;
+}
+
+/*
+ * Parse the nested CTA_TUPLE_IP attribute @attr into @tuple using the
+ * L3 protocol tracker selected by tuple->src.l3num.
+ *
+ * Returns 0 on success (also when the tracker has no nlattr_to_tuple
+ * hook) or a negative errno from parsing, policy validation or the
+ * tracker's hook.
+ */
+static inline int
+ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple)
+{
+	struct nlattr *tb[CTA_IP_MAX+1];
+	struct nf_conntrack_l3proto *l3proto;
+	int ret;
+
+	/* do not go on with a table the parser reported as bad */
+	ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL);
+	if (ret < 0)
+		return ret;
+
+	rcu_read_lock();
+	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+
+	if (likely(l3proto->nlattr_to_tuple)) {
+		/* validate against the tracker's own policy first */
+		ret = nla_validate_nested(attr, CTA_IP_MAX,
+					  l3proto->nla_policy);
+		if (ret == 0)
+			ret = l3proto->nlattr_to_tuple(tb, tuple);
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Policy used by ctnetlink_parse_tuple_proto() for the attributes nested
+ * in CTA_TUPLE_PROTO: only the protocol number is checked here.
+ */
+static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
+	[CTA_PROTO_NUM]	= { .type = NLA_U8 },
+};
+
+/*
+ * Parse the nested CTA_TUPLE_PROTO attribute @attr into @tuple.
+ *
+ * CTA_PROTO_NUM is mandatory and is stored in tuple->dst.protonum; the
+ * rest is left to the L4 tracker found for (l3num, protonum).
+ *
+ * Returns 0 on success, -EINVAL when the protocol number is missing, or
+ * a negative errno from parsing, validation or the tracker's hook.
+ */
+static inline int
+ctnetlink_parse_tuple_proto(struct nlattr *attr,
+			    struct nf_conntrack_tuple *tuple)
+{
+	struct nlattr *tb[CTA_PROTO_MAX+1];
+	struct nf_conntrack_l4proto *proto;
+	int err;
+
+	err = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[CTA_PROTO_NUM] == NULL)
+		return -EINVAL;
+
+	tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
+
+	rcu_read_lock();
+	proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+	if (likely(proto->nlattr_to_tuple)) {
+		/* re-check the nest against the tracker's own policy */
+		err = nla_validate_nested(attr, CTA_PROTO_MAX,
+					  proto->nla_policy);
+		if (!err)
+			err = proto->nlattr_to_tuple(tb, tuple);
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * Policy used by ctnetlink_parse_tuple() for the attributes nested in a
+ * tuple attribute: both the IP part and the protocol part are nests.
+ */
+static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
+	[CTA_TUPLE_IP]		= { .type = NLA_NESTED },
+	[CTA_TUPLE_PROTO]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Fill @tuple from the nested tuple attribute cda[@type].
+ *
+ * @tuple is zeroed first.  @l3num selects the L3 protocol used to parse
+ * the IP part.  Both CTA_TUPLE_IP and CTA_TUPLE_PROTO are mandatory.
+ * The direction is IP_CT_DIR_REPLY for CTA_TUPLE_REPLY and
+ * IP_CT_DIR_ORIGINAL for every other @type.
+ *
+ * Returns 0 on success, -EINVAL when a mandatory part is missing, or a
+ * negative errno from attribute parsing.
+ */
+static int
+ctnetlink_parse_tuple(const struct nlattr * const cda[],
+		      struct nf_conntrack_tuple *tuple,
+		      enum ctattr_type type, u_int8_t l3num)
+{
+	struct nlattr *tb[CTA_TUPLE_MAX+1];
+	int err;
+
+	memset(tuple, 0, sizeof(*tuple));
+
+	/* reject a nest that fails the policy instead of using it partly */
+	err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_TUPLE_IP])
+		return -EINVAL;
+
+	/* the IP parser picks its L3 tracker from tuple->src.l3num */
+	tuple->src.l3num = l3num;
+
+	err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_TUPLE_PROTO])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple);
+	if (err < 0)
+		return err;
+
+	/* orig and expect tuples get DIR_ORIGINAL */
+	if (type == CTA_TUPLE_REPLY)
+		tuple->dst.dir = IP_CT_DIR_REPLY;
+	else
+		tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+	return 0;
+}
+
+/*
+ * Read the conntrack zone from the optional attribute @attr.
+ *
+ * A missing attribute means zone 0.  When the attribute is present but
+ * zone support is not compiled in, -EOPNOTSUPP is returned and *zone is
+ * left untouched.
+ */
+static int
+ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone)
+{
+	if (!attr) {
+		*zone = 0;
+		return 0;
+	}
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	*zone = ntohs(nla_get_be16(attr));
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+/*
+ * Policy used by ctnetlink_parse_help() for the attributes nested in
+ * CTA_HELP: the helper name is a NUL-terminated string.
+ */
+static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
+	[CTA_HELP_NAME]		= { .type = NLA_NUL_STRING },
+};
+
+/*
+ * Extract the helper name from the nested CTA_HELP attribute @attr.
+ *
+ * On success *helper_name points into the attribute payload (no copy is
+ * made) and 0 is returned.  Returns -EINVAL when CTA_HELP_NAME is
+ * missing, or a negative errno when the nest fails the policy check.
+ */
+static inline int
+ctnetlink_parse_help(const struct nlattr *attr, char **helper_name)
+{
+	struct nlattr *tb[CTA_HELP_MAX+1];
+	int err;
+
+	err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_HELP_NAME])
+		return -EINVAL;
+
+	*helper_name = nla_data(tb[CTA_HELP_NAME]);
+
+	return 0;
+}
+
+/*
+ * Policy for the top-level CTA_* attributes of conntrack messages.
+ * NOTE(review): its user is not visible in this part of the file;
+ * presumably it is referenced from the nfnetlink callback table.
+ * Attribute types without an entry here get no type checking from
+ * this table.
+ */
+static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
+	[CTA_TUPLE_ORIG]	= { .type = NLA_NESTED },
+	[CTA_TUPLE_REPLY]	= { .type = NLA_NESTED },
+	[CTA_STATUS] 		= { .type = NLA_U32 },
+	[CTA_PROTOINFO]		= { .type = NLA_NESTED },
+	[CTA_HELP]		= { .type = NLA_NESTED },
+	[CTA_NAT_SRC]		= { .type = NLA_NESTED },
+	[CTA_TIMEOUT] 		= { .type = NLA_U32 },
+	[CTA_MARK]		= { .type = NLA_U32 },
+	[CTA_ID]		= { .type = NLA_U32 },
+	[CTA_NAT_DST]		= { .type = NLA_NESTED },
+	[CTA_TUPLE_MASTER]	= { .type = NLA_NESTED },
+	[CTA_ZONE]		= { .type = NLA_U16 },
+};
+
+/*
+ * Handler for conntrack delete requests.
+ *
+ * With a CTA_TUPLE_ORIG or CTA_TUPLE_REPLY attribute the matching
+ * conntrack is looked up and killed; without either the whole table of
+ * the namespace is flushed.  An optional CTA_ID must match the entry.
+ *
+ * Returns 0 on success, -ENOENT when no matching entry exists, or a
+ * negative errno from attribute parsing.
+ */
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			const struct nlmsghdr *nlh,
+			const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(ctnl);
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct;
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	u16 zone;
+	int err;
+
+	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+	if (err < 0)
+		return err;
+
+	/* the original tuple is preferred when both are supplied */
+	if (cda[CTA_TUPLE_ORIG])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+	else if (cda[CTA_TUPLE_REPLY])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+	else {
+		/* Flush the whole table */
+		nf_conntrack_flush_report(net,
+					 NETLINK_CB(skb).pid,
+					 nlmsg_report(nlh));
+		return 0;
+	}
+
+	if (err < 0)
+		return err;
+
+	/* takes a reference that every path below has to drop */
+	h = nf_conntrack_find_get(net, zone, &tuple);
+	if (!h)
+		return -ENOENT;
+
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	/* the id is compared against the low 32 bits of the ct address,
+	 * matching what ctnetlink_dump_id() emits */
+	if (cda[CTA_ID]) {
+		u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
+		if (id != (u32)(unsigned long)ct) {
+			nf_ct_put(ct);
+			return -ENOENT;
+		}
+	}
+
+	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+				      NETLINK_CB(skb).pid,
+				      nlmsg_report(nlh)) < 0) {
+		nf_ct_delete_from_lists(ct);
+		/* we failed to report the event, try later */
+		nf_ct_insert_dying_list(ct);
+		nf_ct_put(ct);
+		return 0;
+	}
+
+	/* death_by_timeout would report the event again */
+	set_bit(IPS_DYING_BIT, &ct->status);
+
+	nf_ct_kill(ct);
+	nf_ct_put(ct);
+
+	return 0;
+}
+
+/*
+ * Handler for conntrack get requests.
+ *
+ * With NLM_F_DUMP a table dump is started.  Otherwise the conntrack
+ * matching CTA_TUPLE_ORIG (preferred) or CTA_TUPLE_REPLY is looked up
+ * and a single message describing it is unicast back to the requester.
+ *
+ * Returns 0 on success or a negative value on failure; -EAGAIN is
+ * turned into -ENOBUFS before returning.
+ */
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			const struct nlmsghdr *nlh,
+			const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(ctnl);
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conn *ct;
+	struct sk_buff *skb2 = NULL;
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	u16 zone;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP)
+		return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table,
+					  ctnetlink_done);
+
+	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+	if (err < 0)
+		return err;
+
+	/* unlike delete, a get without any tuple is an error */
+	if (cda[CTA_TUPLE_ORIG])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3);
+	else if (cda[CTA_TUPLE_REPLY])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	/* takes a reference, dropped once the reply has been built */
+	h = nf_conntrack_find_get(net, zone, &tuple);
+	if (!h)
+		return -ENOENT;
+
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	err = -ENOMEM;
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL) {
+		nf_ct_put(ct);
+		return -ENOMEM;
+	}
+
+	rcu_read_lock();
+	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
+				  IPCTNL_MSG_CT_NEW, ct);
+	rcu_read_unlock();
+	nf_ct_put(ct);
+	/* ctnetlink_fill_info() returns skb->len on success, -1 on error */
+	if (err <= 0)
+		goto free;
+
+	/* skb2 is not freed here after netlink_unicast() has been called */
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		goto out;
+
+	return 0;
+
+free:
+	kfree_skb(skb2);
+out:
+	/* this avoids a loop in nfnetlink. */
+	return err == -EAGAIN ? -ENOBUFS : err;
+}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+/*
+ * Hand the NAT attribute @attr for manipulation type @manip to the
+ * parser registered in nfnetlink_parse_nat_setup_hook.
+ *
+ * When no hook is registered and modules are enabled, the RCU read
+ * section, nf_conntrack_lock and the nfnl mutex are released around a
+ * request_module("nf-nat-ipv4") call and re-acquired afterwards in the
+ * reverse order.  The caller therefore has to hold all three on entry.
+ *
+ * Returns the hook's result, -EAGAIN when the hook appeared after the
+ * module load (the request is meant to be retried), or -EOPNOTSUPP.
+ */
+static int
+ctnetlink_parse_nat_setup(struct nf_conn *ct,
+			  enum nf_nat_manip_type manip,
+			  const struct nlattr *attr)
+{
+	typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+
+	parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
+	if (!parse_nat_setup) {
+#ifdef CONFIG_MODULES
+		/* drop every lock before asking for the module */
+		rcu_read_unlock();
+		spin_unlock_bh(&nf_conntrack_lock);
+		nfnl_unlock();
+		if (request_module("nf-nat-ipv4") < 0) {
+			nfnl_lock();
+			spin_lock_bh(&nf_conntrack_lock);
+			rcu_read_lock();
+			return -EOPNOTSUPP;
+		}
+		nfnl_lock();
+		spin_lock_bh(&nf_conntrack_lock);
+		rcu_read_lock();
+		/* the locks were dropped, so the request is not continued
+		 * here even though the hook is now available */
+		if (nfnetlink_parse_nat_setup_hook)
+			return -EAGAIN;
+#endif
+		return -EOPNOTSUPP;
+	}
+
+	return parse_nat_setup(ct, manip, attr);
+}
+#endif
+
+/*
+ * Apply the CTA_STATUS attribute to @ct.
+ *
+ * EXPECTED, CONFIRMED and DYING may not differ from the current status;
+ * SEEN_REPLY and ASSURED may be set but not cleared.  Any violation
+ * yields -EBUSY.  Accepted bits are OR-ed into ct->status, except for
+ * the NAT related ones, which are never taken from the request.
+ */
+static int
+ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+	unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
+	unsigned long changed = ct->status ^ status;
+
+	/* unchangeable */
+	if (changed & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+		return -EBUSY;
+
+	/* SEEN_REPLY bit can only be set */
+	if ((changed & IPS_SEEN_REPLY) && !(status & IPS_SEEN_REPLY))
+		return -EBUSY;
+
+	/* ASSURED bit can only be set */
+	if ((changed & IPS_ASSURED) && !(status & IPS_ASSURED))
+		return -EBUSY;
+
+	/* Be careful here, modifying NAT bits can screw up things,
+	 * so don't let users modify them directly if they don't pass
+	 * nf_nat_range. */
+	status &= ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+	ct->status |= status;
+
+	return 0;
+}
+
+/*
+ * Set up NAT on @ct from CTA_NAT_DST and/or CTA_NAT_SRC, destination
+ * first.  Stops at the first failure and returns its error code.
+ * Returns -EOPNOTSUPP when NAT support is not compiled in.
+ */
+static int
+ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_NAT_NEEDED
+	int err = 0;
+
+	if (cda[CTA_NAT_DST])
+		err = ctnetlink_parse_nat_setup(ct, IP_NAT_MANIP_DST,
+						cda[CTA_NAT_DST]);
+	if (err < 0)
+		return err;
+
+	if (cda[CTA_NAT_SRC])
+		err = ctnetlink_parse_nat_setup(ct, IP_NAT_MANIP_SRC,
+						cda[CTA_NAT_SRC]);
+	if (err < 0)
+		return err;
+
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+/*
+ * Change the helper of an existing conntrack according to CTA_HELP.
+ *
+ * An empty helper name removes the current helper together with its
+ * expectations.  A non-empty name is looked up (loading the helper
+ * module when possible) and assigned only if the conntrack already has
+ * a helper extension with no helper set.
+ *
+ * Returns 0 on success, -EBUSY for conntracks that have a master or
+ * already use a different helper, -EAGAIN when the helper appeared only
+ * after a module load, -EOPNOTSUPP when the helper is unavailable or no
+ * helper extension exists, or a parse error.
+ */
+static inline int
+ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+	struct nf_conntrack_helper *helper;
+	struct nf_conn_help *help = nfct_help(ct);
+	char *helpname = NULL;
+	int err;
+
+	/* don't change helper of sibling connections */
+	if (ct->master)
+		return -EBUSY;
+
+	err = ctnetlink_parse_help(cda[CTA_HELP], &helpname);
+	if (err < 0)
+		return err;
+
+	/* empty name: detach the helper */
+	if (!strcmp(helpname, "")) {
+		if (help && help->helper) {
+			/* we had a helper before ... */
+			nf_ct_remove_expectations(ct);
+			rcu_assign_pointer(help->helper, NULL);
+		}
+
+		return 0;
+	}
+
+	helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+					    nf_ct_protonum(ct));
+	if (helper == NULL) {
+#ifdef CONFIG_MODULES
+		/* nf_conntrack_lock is dropped around the module request,
+		 * so the caller must hold it on entry */
+		spin_unlock_bh(&nf_conntrack_lock);
+
+		if (request_module("nfct-helper-%s", helpname) < 0) {
+			spin_lock_bh(&nf_conntrack_lock);
+			return -EOPNOTSUPP;
+		}
+
+		spin_lock_bh(&nf_conntrack_lock);
+		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+						    nf_ct_protonum(ct));
+		/* the lock was released in between: report -EAGAIN rather
+		 * than continuing with this request */
+		if (helper)
+			return -EAGAIN;
+#endif
+		return -EOPNOTSUPP;
+	}
+
+	if (help) {
+		if (help->helper == helper)
+			return 0;
+		if (help->helper)
+			return -EBUSY;
+		/* need to zero data of old helper */
+		memset(&help->help, 0, sizeof(help->help));
+	} else {
+		/* we cannot set a helper for an existing conntrack */
+		return -EOPNOTSUPP;
+	}
+
+	rcu_assign_pointer(help->helper, helper);
+
+	return 0;
+}
+
+/*
+ * Re-arm the timeout of @ct to CTA_TIMEOUT seconds from now.
+ *
+ * Returns -ETIME when the timer was no longer pending (del_timer()
+ * returned 0), otherwise 0.
+ */
+static inline int
+ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+	u_int32_t secs = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
+
+	if (del_timer(&ct->timeout)) {
+		ct->timeout.expires = jiffies + secs * HZ;
+		add_timer(&ct->timeout);
+		return 0;
+	}
+
+	return -ETIME;
+}
+
+/*
+ * Policy used by ctnetlink_change_protoinfo() for the attributes nested
+ * in CTA_PROTOINFO: each per-protocol block is itself a nest.
+ */
+static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
+	[CTA_PROTOINFO_TCP]	= { .type = NLA_NESTED },
+	[CTA_PROTOINFO_DCCP]	= { .type = NLA_NESTED },
+	[CTA_PROTOINFO_SCTP]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Pass the nested CTA_PROTOINFO attribute to the L4 tracker of @ct so
+ * it can update its private protocol state.
+ *
+ * Returns 0 when the tracker has no from_nlattr hook, the hook's result
+ * otherwise, or a negative errno when the nest fails the policy check.
+ */
+static inline int
+ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+	const struct nlattr *attr = cda[CTA_PROTOINFO];
+	struct nlattr *tb[CTA_PROTOINFO_MAX+1];
+	struct nf_conntrack_l4proto *l4proto;
+	int err;
+
+	/* do not hand a rejected attribute table to the tracker */
+	err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy);
+	if (err < 0)
+		return err;
+
+	rcu_read_lock();
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	if (l4proto->from_nlattr)
+		err = l4proto->from_nlattr(tb, ct);
+	rcu_read_unlock();
+
+	return err;
+}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+/*
+ * Policy used by change_nat_seq_adj() for the attributes nested in a
+ * CTA_NAT_SEQ_ADJ_* attribute: all three values are 32-bit.
+ */
+static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = {
+	[CTA_NAT_SEQ_CORRECTION_POS]	= { .type = NLA_U32 },
+	[CTA_NAT_SEQ_OFFSET_BEFORE]	= { .type = NLA_U32 },
+	[CTA_NAT_SEQ_OFFSET_AFTER]	= { .type = NLA_U32 },
+};
+
+/*
+ * Load one direction's sequence adjustment @natseq from the nested
+ * attribute @attr.
+ *
+ * All three values (correction position, offset before, offset after)
+ * are mandatory.  They are checked before anything is written, so
+ * @natseq is left untouched when the request is rejected.
+ *
+ * Returns 0 on success, -EINVAL when a value is missing, or a negative
+ * errno when the nest fails the policy check.
+ */
+static inline int
+change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr)
+{
+	struct nlattr *cda[CTA_NAT_SEQ_MAX+1];
+	int err;
+
+	err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
+	if (err < 0)
+		return err;
+
+	if (!cda[CTA_NAT_SEQ_CORRECTION_POS] ||
+	    !cda[CTA_NAT_SEQ_OFFSET_BEFORE] ||
+	    !cda[CTA_NAT_SEQ_OFFSET_AFTER])
+		return -EINVAL;
+
+	natseq->correction_pos =
+		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS]));
+	natseq->offset_before =
+		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE]));
+	natseq->offset_after =
+		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER]));
+
+	return 0;
+}
+
+/*
+ * Update the NAT sequence adjustment of @ct from CTA_NAT_SEQ_ADJ_ORIG
+ * and/or CTA_NAT_SEQ_ADJ_REPLY.  IPS_SEQ_ADJUST is set after each
+ * direction that was applied successfully.
+ *
+ * Returns 0 when @ct has no NAT extension or on success, otherwise the
+ * error from change_nat_seq_adj().
+ */
+static int
+ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
+			     const struct nlattr * const cda[])
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	int err;
+
+	if (nat == NULL)
+		return 0;
+
+	if (cda[CTA_NAT_SEQ_ADJ_ORIG]) {
+		err = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL],
+					 cda[CTA_NAT_SEQ_ADJ_ORIG]);
+		if (err < 0)
+			return err;
+		ct->status |= IPS_SEQ_ADJUST;
+	}
+
+	if (cda[CTA_NAT_SEQ_ADJ_REPLY]) {
+		err = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY],
+					 cda[CTA_NAT_SEQ_ADJ_REPLY]);
+		if (err < 0)
+			return err;
+		ct->status |= IPS_SEQ_ADJUST;
+	}
+
+	return 0;
+}
+#endif
+
+/*
+ * Apply the attributes of an update request to the existing conntrack
+ * @ct.  The order is helper, timeout, status, protoinfo, mark and NAT
+ * sequence adjustment; the first failing step aborts the update and
+ * earlier steps are not rolled back.
+ *
+ * NAT setup and master assignment are refused with -EOPNOTSUPP.
+ */
+static int
+ctnetlink_change_conntrack(struct nf_conn *ct,
+			   const struct nlattr * const cda[])
+{
+	int ret = 0;
+
+	/* only allow NAT changes and master assignation for new conntracks */
+	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER])
+		return -EOPNOTSUPP;
+
+	if (cda[CTA_HELP])
+		ret = ctnetlink_change_helper(ct, cda);
+	if (ret < 0)
+		return ret;
+
+	if (cda[CTA_TIMEOUT])
+		ret = ctnetlink_change_timeout(ct, cda);
+	if (ret < 0)
+		return ret;
+
+	if (cda[CTA_STATUS])
+		ret = ctnetlink_change_status(ct, cda);
+	if (ret < 0)
+		return ret;
+
+	if (cda[CTA_PROTOINFO])
+		ret = ctnetlink_change_protoinfo(ct, cda);
+	if (ret < 0)
+		return ret;
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	if (cda[CTA_MARK])
+		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+#endif
+
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY])
+		ret = ctnetlink_change_nat_seq_adj(ct, cda);
+	if (ret < 0)
+		return ret;
+#endif
+
+	return 0;
+}
+
+/*
+ * Allocate a conntrack for @otuple/@rtuple in @zone, configure it from
+ * the attributes in @cda, and insert it into the hash table.
+ *
+ * CTA_TIMEOUT is mandatory.  @u3 is the L3 protocol number used to parse
+ * an optional CTA_TUPLE_MASTER.
+ *
+ * Returns the new conntrack, or an ERR_PTR() value; on error the
+ * conntrack allocated here is freed again.
+ *
+ * NOTE(review): ctnetlink_new_conntrack() calls this with
+ * nf_conntrack_lock held, yet the helper path below calls
+ * request_module(); confirm whether that call may sleep in this context.
+ */
+static struct nf_conn *
+ctnetlink_create_conntrack(struct net *net, u16 zone,
+			   const struct nlattr * const cda[],
+			   struct nf_conntrack_tuple *otuple,
+			   struct nf_conntrack_tuple *rtuple,
+			   u8 u3)
+{
+	struct nf_conn *ct;
+	int err = -EINVAL;
+	struct nf_conntrack_helper *helper;
+	struct nf_conn_tstamp *tstamp;
+
+	ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
+	if (IS_ERR(ct))
+		return ERR_PTR(-ENOMEM);
+
+	/* err is still -EINVAL here: a timeout is required */
+	if (!cda[CTA_TIMEOUT])
+		goto err1;
+	ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT]));
+
+	/* the attribute is in seconds; convert to an absolute jiffies value */
+	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+
+	/* from here on: err2 leaves the RCU section, err1 does not */
+	rcu_read_lock();
+ 	if (cda[CTA_HELP]) {
+		char *helpname = NULL;
+ 
+ 		err = ctnetlink_parse_help(cda[CTA_HELP], &helpname);
+ 		if (err < 0)
+			goto err2;
+
+		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+						    nf_ct_protonum(ct));
+		if (helper == NULL) {
+			rcu_read_unlock();
+#ifdef CONFIG_MODULES
+			if (request_module("nfct-helper-%s", helpname) < 0) {
+				err = -EOPNOTSUPP;
+				goto err1;
+			}
+
+			rcu_read_lock();
+			helper = __nf_conntrack_helper_find(helpname,
+							    nf_ct_l3num(ct),
+							    nf_ct_protonum(ct));
+			/* the helper is there now: report -EAGAIN instead
+			 * of continuing with this request */
+			if (helper) {
+				err = -EAGAIN;
+				goto err2;
+			}
+			rcu_read_unlock();
+#endif
+			err = -EOPNOTSUPP;
+			goto err1;
+		} else {
+			struct nf_conn_help *help;
+
+			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+			if (help == NULL) {
+				err = -ENOMEM;
+				goto err2;
+			}
+
+			/* not in hash table yet so not strictly necessary */
+			rcu_assign_pointer(help->helper, helper);
+		}
+	} else {
+		/* try an implicit helper assignation */
+		err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
+		if (err < 0)
+			goto err2;
+	}
+
+	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
+		err = ctnetlink_change_nat(ct, cda);
+		if (err < 0)
+			goto err2;
+	}
+
+	/* the results of these extension allocations are not checked */
+	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
+	nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
+	/* we must add conntrack extensions before confirmation. */
+	ct->status |= IPS_CONFIRMED;
+
+	if (cda[CTA_STATUS]) {
+		err = ctnetlink_change_status(ct, cda);
+		if (err < 0)
+			goto err2;
+	}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
+		err = ctnetlink_change_nat_seq_adj(ct, cda);
+		if (err < 0)
+			goto err2;
+	}
+#endif
+
+	/* start from zeroed protocol state before applying CTA_PROTOINFO */
+	memset(&ct->proto, 0, sizeof(ct->proto));
+	if (cda[CTA_PROTOINFO]) {
+		err = ctnetlink_change_protoinfo(ct, cda);
+		if (err < 0)
+			goto err2;
+	}
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	if (cda[CTA_MARK])
+		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
+#endif
+
+	/* setup master conntrack: this is a confirmed expectation */
+	if (cda[CTA_TUPLE_MASTER]) {
+		struct nf_conntrack_tuple master;
+		struct nf_conntrack_tuple_hash *master_h;
+		struct nf_conn *master_ct;
+
+		err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3);
+		if (err < 0)
+			goto err2;
+
+		/* the reference taken here is kept in ct->master */
+		master_h = nf_conntrack_find_get(net, zone, &master);
+		if (master_h == NULL) {
+			err = -ENOENT;
+			goto err2;
+		}
+		master_ct = nf_ct_tuplehash_to_ctrack(master_h);
+		__set_bit(IPS_EXPECTED_BIT, &ct->status);
+		ct->master = master_ct;
+	}
+	tstamp = nf_conn_tstamp_find(ct);
+	if (tstamp)
+		tstamp->start = ktime_to_ns(ktime_get_real());
+
+	/* no failure paths beyond this point: arm the timer and publish */
+	add_timer(&ct->timeout);
+	nf_conntrack_hash_insert(ct);
+	rcu_read_unlock();
+
+	return ct;
+
+err2:
+	rcu_read_unlock();
+err1:
+	nf_conntrack_free(ct);
+	return ERR_PTR(err);
+}
+
+/*
+ * Handler for conntrack new/update requests.
+ *
+ * The conntrack is looked up by CTA_TUPLE_ORIG (preferred) or
+ * CTA_TUPLE_REPLY under nf_conntrack_lock:
+ *  - not found, NLM_F_CREATE set: a new conntrack is created.  Both
+ *    tuples are needed to build it, so -EINVAL is returned unless both
+ *    CTA_TUPLE_ORIG and CTA_TUPLE_REPLY are present;
+ *  - not found, NLM_F_CREATE clear: -ENOENT;
+ *  - found, NLM_F_EXCL clear: the entry is updated in place;
+ *  - found, NLM_F_EXCL set: -EEXIST.
+ *
+ * After a successful create or update an event is reported, with the
+ * lock released and a reference held on the conntrack.
+ */
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+			const struct nlmsghdr *nlh,
+			const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(ctnl);
+	struct nf_conntrack_tuple otuple, rtuple;
+	struct nf_conntrack_tuple_hash *h = NULL;
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	u16 zone;
+	int err;
+
+	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
+	if (err < 0)
+		return err;
+
+	if (cda[CTA_TUPLE_ORIG]) {
+		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TUPLE_REPLY]) {
+		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3);
+		if (err < 0)
+			return err;
+	}
+
+	spin_lock_bh(&nf_conntrack_lock);
+	if (cda[CTA_TUPLE_ORIG])
+		h = __nf_conntrack_find(net, zone, &otuple);
+	else if (cda[CTA_TUPLE_REPLY])
+		h = __nf_conntrack_find(net, zone, &rtuple);
+
+	if (h == NULL) {
+		err = -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_CREATE) {
+			struct nf_conn *ct;
+			enum ip_conntrack_events events;
+
+			/* otuple/rtuple are only initialised when their
+			 * attribute was supplied; never create a conntrack
+			 * from an unparsed tuple */
+			if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) {
+				err = -EINVAL;
+				goto out_unlock;
+			}
+
+			ct = ctnetlink_create_conntrack(net, zone, cda, &otuple,
+							&rtuple, u3);
+			if (IS_ERR(ct)) {
+				err = PTR_ERR(ct);
+				goto out_unlock;
+			}
+			err = 0;
+			/* keep the conntrack alive across the unlocked
+			 * event report */
+			nf_conntrack_get(&ct->ct_general);
+			spin_unlock_bh(&nf_conntrack_lock);
+			if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+				events = IPCT_RELATED;
+			else
+				events = IPCT_NEW;
+
+			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
+						      (1 << IPCT_ASSURED) |
+						      (1 << IPCT_HELPER) |
+						      (1 << IPCT_PROTOINFO) |
+						      (1 << IPCT_NATSEQADJ) |
+						      (1 << IPCT_MARK) | events,
+						      ct, NETLINK_CB(skb).pid,
+						      nlmsg_report(nlh));
+			nf_ct_put(ct);
+		} else
+			spin_unlock_bh(&nf_conntrack_lock);
+
+		return err;
+	}
+	/* implicit 'else' */
+
+	/* We manipulate the conntrack inside the global conntrack table lock,
+	 * so there's no need to increase the refcount */
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+		err = ctnetlink_change_conntrack(ct, cda);
+		if (err == 0) {
+			/* a reference is needed once the lock is dropped */
+			nf_conntrack_get(&ct->ct_general);
+			spin_unlock_bh(&nf_conntrack_lock);
+			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
+						      (1 << IPCT_ASSURED) |
+						      (1 << IPCT_HELPER) |
+						      (1 << IPCT_PROTOINFO) |
+						      (1 << IPCT_NATSEQADJ) |
+						      (1 << IPCT_MARK),
+						      ct, NETLINK_CB(skb).pid,
+						      nlmsg_report(nlh));
+			nf_ct_put(ct);
+		} else
+			spin_unlock_bh(&nf_conntrack_lock);
+
+		return err;
+	}
+
+out_unlock:
+	spin_unlock_bh(&nf_conntrack_lock);
+	return err;
+}
+
+/***********************************************************************
+ * EXPECT
+ ***********************************************************************/
+
+/*
+ * Emit @tuple wrapped in a nested attribute of expectation type @type.
+ * Returns 0 on success, -1 when @skb has no room left.
+ */
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+			 const struct nf_conntrack_tuple *tuple,
+			 enum ctattr_expect type)
+{
+	struct nlattr *nest = nla_nest_start(skb, type | NLA_F_NESTED);
+
+	if (nest == NULL)
+		return -1;
+
+	if (ctnetlink_dump_tuples(skb, tuple) < 0)
+		return -1;
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/*
+ * Emit the CTA_EXPECT_MASK nest for an expectation.
+ *
+ * A temporary tuple is built with every byte set to 0xFF, then the
+ * source address and source port/id are taken from @mask and the
+ * protocol number from @tuple.  It is dumped with the L3/L4 trackers
+ * that belong to @tuple.
+ *
+ * Returns 0 on success, -1 when @skb has no room left.
+ */
+static inline int
+ctnetlink_exp_dump_mask(struct sk_buff *skb,
+			const struct nf_conntrack_tuple *tuple,
+			const struct nf_conntrack_tuple_mask *mask)
+{
+	struct nf_conntrack_l3proto *l3;
+	struct nf_conntrack_l4proto *l4;
+	struct nf_conntrack_tuple full;
+	struct nlattr *nest;
+
+	memset(&full, 0xFF, sizeof(full));
+	memcpy(&full.src.u3, &mask->src.u3, sizeof(full.src.u3));
+	full.src.u.all = mask->src.u.all;
+	full.dst.protonum = tuple->dst.protonum;
+
+	nest = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED);
+	if (nest == NULL)
+		return -1;
+
+	l3 = __nf_ct_l3proto_find(tuple->src.l3num);
+	if (unlikely(ctnetlink_dump_tuples_ip(skb, &full, l3) < 0))
+		return -1;
+
+	l4 = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+	if (unlikely(ctnetlink_dump_tuples_proto(skb, &full, l4) < 0))
+		return -1;
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/*
+ * Emit all attributes describing the expectation @exp: its tuple, mask,
+ * the master's original tuple, remaining timeout in seconds, id, flags
+ * and, when the master has a helper assigned, the helper's name.
+ *
+ * Returns 0 on success, -1 when @skb has no room left.
+ */
+static int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+			  const struct nf_conntrack_expect *exp)
+{
+	struct nf_conn *master = exp->master;
+	/* Subtract as signed values: expires and jiffies are unsigned, so
+	 * an unsigned difference would wrap to a huge number for a timer
+	 * that already expired and the clamp below would never trigger. */
+	long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
+	struct nf_conn_help *help;
+
+	if (timeout < 0)
+		timeout = 0;
+
+	if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+		goto nla_put_failure;
+	if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0)
+		goto nla_put_failure;
+	if (ctnetlink_exp_dump_tuple(skb,
+				 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				 CTA_EXPECT_MASTER) < 0)
+		goto nla_put_failure;
+
+	/* the NLA_PUT_* macros jump to nla_put_failure on error */
+	NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
+	NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
+	NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+	help = nfct_help(master);
+	if (help) {
+		struct nf_conntrack_helper *helper;
+
+		helper = rcu_dereference(help->helper);
+		if (helper)
+			NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/*
+ * Append one complete expectation message describing @exp to @skb.
+ *
+ * @pid and @seq go into the netlink header; a non-zero @pid also sets
+ * NLM_F_MULTI.  @event is the message type within the CTNETLINK_EXP
+ * subsystem.
+ *
+ * Returns skb->len on success.  On failure the partial message is
+ * cancelled and -1 is returned.
+ */
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+			int event, const struct nf_conntrack_expect *exp)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *hdr;
+	unsigned int flags = 0;
+
+	if (pid)
+		flags = NLM_F_MULTI;
+
+	event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
+	if (!nlh)
+		goto nlmsg_failure;
+
+	hdr = nlmsg_data(nlh);
+	hdr->nfgen_family = exp->tuple.src.l3num;
+	hdr->version = NFNETLINK_V0;
+	hdr->res_id = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) >= 0) {
+		nlmsg_end(skb, nlh);
+		return skb->len;
+	}
+
+nlmsg_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+/*
+ * Expectation event callback: build a netlink message describing
+ * @item->exp and pass it to nfnetlink_send() for the matching group.
+ *
+ * Only IPEXP_DESTROY and IPEXP_NEW are handled (DESTROY is checked first);
+ * any other event mask is ignored.  Always returns 0; when the message
+ * cannot be allocated or filled, -ENOBUFS is signalled through
+ * nfnetlink_set_err() instead.
+ */
+static int
+ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
+{
+	struct nf_conntrack_expect *exp = item->exp;
+	struct net *net = nf_ct_exp_net(exp);
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct sk_buff *skb;
+	unsigned int type, group;
+	int flags = 0;
+
+	/* map the event bit to a message type and multicast group */
+	if (events & (1 << IPEXP_DESTROY)) {
+		type = IPCTNL_MSG_EXP_DELETE;
+		group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+	} else if (events & (1 << IPEXP_NEW)) {
+		type = IPCTNL_MSG_EXP_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+		group = NFNLGRP_CONNTRACK_EXP_NEW;
+	} else
+		return 0;
+
+	/* nothing to do if no report was requested and nobody listens */
+	if (!item->report && !nfnetlink_has_listeners(net, group))
+		return 0;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	type |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = exp->tuple.src.l3num;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	/* ctnetlink_exp_dump_expect() uses rcu_dereference() on the helper */
+	rcu_read_lock();
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nla_put_failure;
+	rcu_read_unlock();
+
+	nlmsg_end(skb, nlh);
+	nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
+	return 0;
+
+nla_put_failure:
+	/* the failing path jumped here with the RCU read lock still held */
+	rcu_read_unlock();
+	nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+	kfree_skb(skb);
+errout:
+	nfnetlink_set_err(net, 0, 0, -ENOBUFS);
+	return 0;
+}
+#endif
+/*
+ * Dump completion callback: drop the reference that
+ * ctnetlink_exp_dump_table() may have left parked in cb->args[1].
+ */
+static int ctnetlink_exp_done(struct netlink_callback *cb)
+{
+	struct nf_conntrack_expect *pending;
+
+	pending = (struct nf_conntrack_expect *)cb->args[1];
+	if (pending)
+		nf_ct_expect_put(pending);
+
+	return 0;
+}
+
+/*
+ * Netlink dump callback: walk the per-netns expectation hash and append
+ * one IPCTNL_MSG_EXP_NEW message per expectation to @skb.
+ *
+ * Resume state lives in @cb: args[0] is the current hash bucket, args[1]
+ * is the expectation that did not fit into the previous skb (stored with
+ * a reference held), or 0.  If nfgen_family in the request is non-zero,
+ * only expectations of that l3 protocol are dumped.
+ *
+ * Returns skb->len.
+ */
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nf_conntrack_expect *exp, *last;
+	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	struct hlist_node *n;
+	u_int8_t l3proto = nfmsg->nfgen_family;
+
+	rcu_read_lock();
+	last = (struct nf_conntrack_expect *)cb->args[1];
+	for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
+restart:
+		hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]],
+				     hnode) {
+			if (l3proto && exp->tuple.src.l3num != l3proto)
+				continue;
+			/* resuming: skip entries until we reach 'last' */
+			if (cb->args[1]) {
+				if (exp != last)
+					continue;
+				cb->args[1] = 0;
+			}
+			if (ctnetlink_exp_fill_info(skb,
+						    NETLINK_CB(cb->skb).pid,
+						    cb->nlh->nlmsg_seq,
+						    IPCTNL_MSG_EXP_NEW,
+						    exp) < 0) {
+				/* could not add it: remember it for the next
+				 * call, unless its refcount already hit zero */
+				if (!atomic_inc_not_zero(&exp->use))
+					continue;
+				cb->args[1] = (unsigned long)exp;
+				goto out;
+			}
+		}
+		/* 'last' was not found in this bucket any more: rescan the
+		 * bucket from its head without a resume point */
+		if (cb->args[1]) {
+			cb->args[1] = 0;
+			goto restart;
+		}
+	}
+out:
+	rcu_read_unlock();
+	/* drop the reference taken when 'last' was stored by an earlier call */
+	if (last)
+		nf_ct_expect_put(last);
+
+	return skb->len;
+}
+
+/*
+ * Attribute validation policy for the expectation messages; referenced by
+ * every entry of ctnl_exp_cb[].
+ */
+static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
+	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
+	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
+	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING },
+	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
+	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
+};
+
+/*
+ * IPCTNL_MSG_EXP_GET handler.
+ *
+ * With NLM_F_DUMP the whole expectation table is dumped through
+ * ctnetlink_exp_dump_table().  Otherwise a single expectation is looked
+ * up (optionally cross-checked against CTA_EXPECT_ID) and unicast back
+ * to the requester.
+ *
+ * Returns the result of netlink_dump_start()/netlink_unicast() on the
+ * success paths, -EINVAL if CTA_EXPECT_MASTER is missing, -ENOENT if no
+ * matching expectation exists, -ENOMEM if the reply skb cannot be
+ * allocated, or the error from zone/tuple parsing or message filling.
+ */
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+		     const struct nlmsghdr *nlh,
+		     const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(ctnl);
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_expect *exp;
+	struct sk_buff *skb2;
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	u16 zone;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		return netlink_dump_start(ctnl, skb, nlh,
+					  ctnetlink_exp_dump_table,
+					  ctnetlink_exp_done);
+	}
+
+	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+	if (err < 0)
+		return err;
+
+	/* NOTE(review): the lookup tuple is parsed from CTA_EXPECT_MASTER,
+	 * not CTA_EXPECT_TUPLE -- confirm this is the intended key. */
+	if (cda[CTA_EXPECT_MASTER])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	/* takes a reference that every path below must drop */
+	exp = nf_ct_expect_find_get(net, zone, &tuple);
+	if (!exp)
+		return -ENOENT;
+
+	/* the id is the expectation's address truncated to 32 bits */
+	if (cda[CTA_EXPECT_ID]) {
+		__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+		if (ntohl(id) != (u32)(unsigned long)exp) {
+			nf_ct_expect_put(exp);
+			return -ENOENT;
+		}
+	}
+
+	err = -ENOMEM;
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		goto out;
+
+	rcu_read_lock();
+	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
+				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+	rcu_read_unlock();
+	if (err <= 0)
+		goto free;
+
+	nf_ct_expect_put(exp);
+
+	return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+
+free:
+	kfree_skb(skb2);
+out:
+	nf_ct_expect_put(exp);
+	return err;
+}
+
+/*
+ * IPCTNL_MSG_EXP_DELETE handler.
+ *
+ * Three modes, selected by the attributes present:
+ *  - CTA_EXPECT_TUPLE: delete the single expectation matching the tuple
+ *    (and CTA_EXPECT_ID, if given);
+ *  - CTA_EXPECT_HELP_NAME: delete every expectation whose master conntrack
+ *    currently uses the named helper;
+ *  - neither: flush all expectations of this netns.
+ *
+ * Returns 0 on success, -ENOENT if the single expectation was not found,
+ * or the error from zone/tuple parsing.
+ */
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+		     const struct nlmsghdr *nlh,
+		     const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(ctnl);
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple tuple;
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct hlist_node *n, *next;
+	u_int8_t u3 = nfmsg->nfgen_family;
+	unsigned int i;
+	u16 zone;
+	int err;
+
+	if (cda[CTA_EXPECT_TUPLE]) {
+		/* delete a single expect by tuple */
+		err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+		if (err < 0)
+			return err;
+
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+		if (err < 0)
+			return err;
+
+		/* bump usage count to 2 */
+		exp = nf_ct_expect_find_get(net, zone, &tuple);
+		if (!exp)
+			return -ENOENT;
+
+		if (cda[CTA_EXPECT_ID]) {
+			__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
+			if (ntohl(id) != (u32)(unsigned long)exp) {
+				nf_ct_expect_put(exp);
+				return -ENOENT;
+			}
+		}
+
+		/* after list removal, usage count == 1 */
+		spin_lock_bh(&nf_conntrack_lock);
+		if (del_timer(&exp->timeout)) {
+			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+						   nlmsg_report(nlh));
+			nf_ct_expect_put(exp);
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
+		/* have to put what we 'get' above.
+		 * after this line usage count == 0 */
+		nf_ct_expect_put(exp);
+	} else if (cda[CTA_EXPECT_HELP_NAME]) {
+		char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+		struct nf_conn_help *m_help;
+
+		/* delete all expectations for this helper */
+		spin_lock_bh(&nf_conntrack_lock);
+		for (i = 0; i < nf_ct_expect_hsize; i++) {
+			hlist_for_each_entry_safe(exp, n, next,
+						  &net->ct.expect_hash[i],
+						  hnode) {
+				m_help = nfct_help(exp->master);
+				/* A master may lack the helper extension or
+				 * an assigned helper (ctnetlink can create
+				 * expectations on such masters); those can
+				 * never match by name, so skip them instead
+				 * of dereferencing a NULL pointer. */
+				if (!m_help || !m_help->helper)
+					continue;
+				if (!strcmp(m_help->helper->name, name) &&
+				    del_timer(&exp->timeout)) {
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
+					nf_ct_expect_put(exp);
+				}
+			}
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
+	} else {
+		/* This basically means we have to flush everything*/
+		spin_lock_bh(&nf_conntrack_lock);
+		for (i = 0; i < nf_ct_expect_hsize; i++) {
+			hlist_for_each_entry_safe(exp, n, next,
+						  &net->ct.expect_hash[i],
+						  hnode) {
+				if (del_timer(&exp->timeout)) {
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
+					nf_ct_expect_put(exp);
+				}
+			}
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
+	}
+
+	return 0;
+}
+/*
+ * Modify an existing expectation.  Not implemented: always returns
+ * -EOPNOTSUPP and ignores both arguments.
+ */
+static int
+ctnetlink_change_expect(struct nf_conntrack_expect *x,
+			const struct nlattr * const cda[])
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Create a new expectation from the netlink attributes in @cda and
+ * register it via nf_ct_expect_related_report().
+ *
+ * The master conntrack is looked up from CTA_EXPECT_MASTER.  If it has no
+ * helper extension the expectation is flagged NF_CT_EXPECT_USERSPACE and
+ * CTA_EXPECT_TIMEOUT (seconds) is mandatory; otherwise the
+ * NF_CT_EXPECT_USERSPACE bit is cleared from any flags supplied.
+ *
+ * Returns 0 or the result of nf_ct_expect_related_report() on the normal
+ * path, a tuple-parsing error, -ENOENT if the master does not exist,
+ * -ENOMEM if the expectation cannot be allocated, or -EINVAL if the
+ * timeout attribute is required but missing.
+ */
+static int
+ctnetlink_create_expect(struct net *net, u16 zone,
+			const struct nlattr * const cda[],
+			u_int8_t u3,
+			u32 pid, int report)
+{
+	struct nf_conntrack_tuple tuple, mask, master_tuple;
+	struct nf_conntrack_tuple_hash *h = NULL;
+	struct nf_conntrack_expect *exp;
+	struct nf_conn *ct;
+	struct nf_conn_help *help;
+	int err = 0;
+
+	/* caller guarantees that those three CTA_EXPECT_* exist */
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3);
+	if (err < 0)
+		return err;
+
+	/* Look for master conntrack of this expectation */
+	h = nf_conntrack_find_get(net, zone, &master_tuple);
+	if (!h)
+		return -ENOENT;
+	ct = nf_ct_tuplehash_to_ctrack(h);
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp) {
+		err = -ENOMEM;
+		goto out;
+	}
+	help = nfct_help(ct);
+	if (!help) {
+		if (!cda[CTA_EXPECT_TIMEOUT]) {
+			err = -EINVAL;
+			/* exp is already allocated: release it as well */
+			goto err_out;
+		}
+		exp->timeout.expires =
+		  jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+		exp->flags = NF_CT_EXPECT_USERSPACE;
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags |=
+				ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+		}
+	} else {
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+			exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+		} else
+			exp->flags = 0;
+	}
+
+	exp->class = 0;
+	exp->expectfn = NULL;
+	exp->master = ct;
+	exp->helper = NULL;
+	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
+	memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3));
+	exp->mask.src.u.all = mask.src.u.all;
+
+	err = nf_ct_expect_related_report(exp, pid, report);
+err_out:
+	/* drop the allocation reference on success and failure alike */
+	nf_ct_expect_put(exp);
+out:
+	nf_ct_put(nf_ct_tuplehash_to_ctrack(h));
+	return err;
+}
+
+/*
+ * IPCTNL_MSG_EXP_NEW handler.
+ *
+ * Requires CTA_EXPECT_TUPLE, CTA_EXPECT_MASK and CTA_EXPECT_MASTER.  If no
+ * expectation matches the tuple, one is created when NLM_F_CREATE is set
+ * (else -ENOENT).  If one exists, -EEXIST is returned when NLM_F_EXCL is
+ * set, otherwise the result of ctnetlink_change_expect().
+ */
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+		     const struct nlmsghdr *nlh,
+		     const struct nlattr * const cda[])
+{
+	struct net *net = sock_net(ctnl);
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_expect *exp;
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	u16 zone;
+	int err;
+
+	if (!cda[CTA_EXPECT_TUPLE]
+	    || !cda[CTA_EXPECT_MASK]
+	    || !cda[CTA_EXPECT_MASTER])
+		return -EINVAL;
+
+	err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+	if (err < 0)
+		return err;
+
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3);
+	if (err < 0)
+		return err;
+
+	/* the lookup below takes no reference, so hold the lock while
+	 * 'exp' is in use */
+	spin_lock_bh(&nf_conntrack_lock);
+	exp = __nf_ct_expect_find(net, zone, &tuple);
+
+	if (!exp) {
+		/* creation runs with the lock released */
+		spin_unlock_bh(&nf_conntrack_lock);
+		err = -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_CREATE) {
+			err = ctnetlink_create_expect(net, zone, cda,
+						      u3,
+						      NETLINK_CB(skb).pid,
+						      nlmsg_report(nlh));
+		}
+		return err;
+	}
+
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_expect(exp, cda);
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	return err;
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+/* conntrack event notifier, (un)registered in ctnetlink_init()/_exit() */
+static struct nf_ct_event_notifier ctnl_notifier = {
+	.fcn = ctnetlink_conntrack_event,
+};
+
+/* expectation event notifier, (un)registered alongside ctnl_notifier */
+static struct nf_exp_event_notifier ctnl_notifier_exp = {
+	.fcn = ctnetlink_expect_event,
+};
+#endif
+
+/*
+ * Message handlers of the "conntrack" nfnetlink subsystem, indexed by
+ * message type.  IPCTNL_MSG_CT_GET and IPCTNL_MSG_CT_GET_CTRZERO share
+ * the same handler.
+ */
+static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
+					    .attr_count = CTA_MAX,
+					    .policy = ct_nla_policy },
+	[IPCTNL_MSG_CT_GET] 		= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX,
+					    .policy = ct_nla_policy },
+	[IPCTNL_MSG_CT_DELETE]  	= { .call = ctnetlink_del_conntrack,
+					    .attr_count = CTA_MAX,
+					    .policy = ct_nla_policy },
+	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX,
+					    .policy = ct_nla_policy },
+};
+
+/*
+ * Message handlers of the "conntrack_expect" nfnetlink subsystem, indexed
+ * by message type; all of them validate attributes with exp_nla_policy.
+ */
+static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .policy = exp_nla_policy },
+	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .policy = exp_nla_policy },
+	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .policy = exp_nla_policy },
+};
+
+/* nfnetlink subsystem descriptor for conntrack messages (ctnl_cb) */
+static const struct nfnetlink_subsystem ctnl_subsys = {
+	.name				= "conntrack",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
+	.cb_count			= IPCTNL_MSG_MAX,
+	.cb				= ctnl_cb,
+};
+
+/* nfnetlink subsystem descriptor for expectation messages (ctnl_exp_cb) */
+static const struct nfnetlink_subsystem ctnl_exp_subsys = {
+	.name				= "conntrack_expect",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
+	.cb_count			= IPCTNL_MSG_EXP_MAX,
+	.cb				= ctnl_exp_cb,
+};
+
+MODULE_ALIAS("ip_conntrack_netlink");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
+
+/*
+ * Module init: register both nfnetlink subsystems and, when
+ * CONFIG_NF_CONNTRACK_EVENTS is enabled, the conntrack and expectation
+ * event notifiers.  On failure everything registered so far is undone in
+ * reverse order and the error code is returned.
+ */
+static int __init ctnetlink_init(void)
+{
+	int ret;
+
+	pr_info("ctnetlink v%s: registering with nfnetlink.\n", version);
+	ret = nfnetlink_subsys_register(&ctnl_subsys);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
+		goto err_out;
+	}
+
+	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n");
+		goto err_unreg_subsys;
+	}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	ret = nf_conntrack_register_notifier(&ctnl_notifier);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot register notifier.\n");
+		goto err_unreg_exp_subsys;
+	}
+
+	ret = nf_ct_expect_register_notifier(&ctnl_notifier_exp);
+	if (ret < 0) {
+		pr_err("ctnetlink_init: cannot expect register notifier.\n");
+		goto err_unreg_notifier;
+	}
+#endif
+
+	return 0;
+
+	/* unwind labels; the first two are only reachable with events on */
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+	nf_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+	return ret;
+}
+
+/*
+ * Module exit: remove the expectations created from userspace, then
+ * unregister the event notifiers (if built in) and both nfnetlink
+ * subsystems, in the reverse order of ctnetlink_init().
+ */
+static void __exit ctnetlink_exit(void)
+{
+	pr_info("ctnetlink: unregistering from nfnetlink.\n");
+
+	nf_ct_remove_userspace_expectations();
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
+	nf_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
new file mode 100644
index 00000000..2fd45651
--- /dev/null
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -0,0 +1,631 @@
+/*
+ * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
+ * PPTP is a protocol for creating virtual private networks.
+ * It is a specification defined by Microsoft and some vendors
+ * working with Microsoft.  PPTP is built on top of a modified
+ * version of the Internet Generic Routing Encapsulation Protocol.
+ * GRE is defined in RFC 1701 and RFC 1702.  Documentation of
+ * PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ * Limitations:
+ * 	 - We blindly assume that control connections are always
+ * 	   established in PNS->PAC direction.  This is a violation
+ * 	   of RFC 2637
+ * 	 - We can only support one single call within each session
+ * TODO:
+ *	 - testing of incoming PPTP calls
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define NF_CT_PPTP_VERSION "3.1"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP");
+MODULE_ALIAS("ip_conntrack_pptp");
+MODULE_ALIAS_NFCT_HELPER("pptp");
+
+static DEFINE_SPINLOCK(nf_pptp_lock);
+
+/*
+ * Exported hook pointers.  This file only reads them (through
+ * rcu_dereference()) and calls them when non-NULL; nothing here assigns
+ * them -- presumably the PPTP NAT module does, TODO confirm.
+ */
+
+/* called from pptp_outbound_pkt() for NATed control connections */
+int
+(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
+			     struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			     struct PptpControlHeader *ctlh,
+			     union pptp_ctrl_union *pptpReq) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
+
+/* called from pptp_inbound_pkt() for NATed control connections */
+int
+(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
+			    struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			    struct PptpControlHeader *ctlh,
+			    union pptp_ctrl_union *pptpReq) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
+
+/* called from exp_gre() before the two GRE expectations are registered */
+void
+(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
+			    struct nf_conntrack_expect *expect_reply)
+			    __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
+
+/* called from pptp_expectfn() when the master connection is NATed */
+void
+(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
+			     struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+
+#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+/*
+ * PptpControlMessageType names, indexed by message type and used only in
+ * debug output.  Entry 0 doubles as the name for unknown types; callers
+ * must range-check the index (msg <= PPTP_MSG_MAX) before using it.
+ */
+const char *const pptp_msg_name[] = {
+	"UNKNOWN_MESSAGE",
+	"START_SESSION_REQUEST",
+	"START_SESSION_REPLY",
+	"STOP_SESSION_REQUEST",
+	"STOP_SESSION_REPLY",
+	"ECHO_REQUEST",
+	"ECHO_REPLY",
+	"OUT_CALL_REQUEST",
+	"OUT_CALL_REPLY",
+	"IN_CALL_REQUEST",
+	"IN_CALL_REPLY",
+	"IN_CALL_CONNECT",
+	"CALL_CLEAR_REQUEST",
+	"CALL_DISCONNECT_NOTIFY",
+	"WAN_ERROR_NOTIFY",
+	"SET_LINK_INFO"
+};
+EXPORT_SYMBOL(pptp_msg_name);
+#endif
+
+/*
+ * Postfix unit macros: "10 MINS" expands to "10 * 60 *HZ", i.e. a duration
+ * in jiffies.  They are deliberately unparenthesized, so only use them
+ * directly after a literal and wrap the result in parentheses.
+ */
+#define SECS *HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+
+/* GRE timeouts installed by pptp_expectfn() on the data connection */
+#define PPTP_GRE_TIMEOUT 		(10 MINS)
+#define PPTP_GRE_STREAM_TIMEOUT 	(5 HOURS)
+
+/* Look up and remove the expectation for the opposite direction of @exp. */
+static void pptp_unexpect_other_dir(struct net *net, struct nf_conn *ct,
+				    struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_expect *other;
+	struct nf_conntrack_tuple inverse;
+
+	/* obviously this tuple inversion only works until you do NAT */
+	nf_ct_invert_tuplepr(&inverse, &exp->tuple);
+	pr_debug("trying to unexpect other dir: ");
+	nf_ct_dump_tuple(&inverse);
+
+	other = nf_ct_expect_find_get(net, nf_ct_zone(ct), &inverse);
+	if (!other) {
+		pr_debug("not found\n");
+		return;
+	}
+
+	/* delete other expectation.  */
+	pr_debug("found\n");
+	nf_ct_unexpect_related(other);
+	nf_ct_expect_put(other);
+}
+
+/*
+ * Expectation callback for the GRE data channel: raise the GRE timeouts
+ * of the new conntrack, then either hand over to the NAT hook (when one
+ * is registered and the master connection is NATed) or drop the
+ * expectation for the opposite direction.
+ */
+static void pptp_expectfn(struct nf_conn *ct,
+			 struct nf_conntrack_expect *exp)
+{
+	struct net *net = nf_ct_net(ct);
+	typeof(nf_nat_pptp_hook_expectfn) nat_expectfn;
+
+	pr_debug("increasing timeouts\n");
+
+	/* increase timeout of GRE data channel conntrack entry */
+	ct->proto.gre.timeout	     = PPTP_GRE_TIMEOUT;
+	ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT;
+
+	/* Can you see how rusty this code is, compared with the pre-2.6.11
+	 * one? That's what happened to my shiny newnat of 2002 ;( -HW */
+
+	rcu_read_lock();
+	nat_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
+	if (nat_expectfn && (ct->master->status & IPS_NAT_MASK))
+		nat_expectfn(ct, exp);
+	else
+		pptp_unexpect_other_dir(net, ct, exp);
+	rcu_read_unlock();
+}
+
+/*
+ * Tear down whatever tracks tuple @t in the zone of @ct: an existing
+ * conntrack is timed out immediately, otherwise a pending expectation is
+ * removed.
+ *
+ * Returns 1 if a conntrack or expectation was found, 0 if neither exists.
+ */
+static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct,
+				  const struct nf_conntrack_tuple *t)
+{
+	const struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_expect *pending;
+	struct nf_conn *sibling;
+	u16 zone = nf_ct_zone(ct);
+
+	pr_debug("trying to timeout ct or exp for tuple ");
+	nf_ct_dump_tuple(t);
+
+	hash = nf_conntrack_find_get(net, zone, t);
+	if (hash) {
+		sibling = nf_ct_tuplehash_to_ctrack(hash);
+		pr_debug("setting timeout of conntrack %p to 0\n", sibling);
+		sibling->proto.gre.timeout	  = 0;
+		sibling->proto.gre.stream_timeout = 0;
+		/* only run the expiry handler if we stopped a pending timer */
+		if (del_timer(&sibling->timeout))
+			sibling->timeout.function((unsigned long)sibling);
+		nf_ct_put(sibling);
+		return 1;
+	}
+
+	pending = nf_ct_expect_find_get(net, zone, t);
+	if (!pending)
+		return 0;
+
+	pr_debug("unexpect_related of expect %p\n", pending);
+	nf_ct_unexpect_related(pending);
+	nf_ct_expect_put(pending);
+	return 1;
+}
+
+/*
+ * Time out the GRE data connections (or pending expectations) that belong
+ * to control connection @ct, in both directions, after dropping its GRE
+ * keymap entries.
+ */
+static void pptp_destroy_siblings(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	const struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_tuple gre_tuple;
+	int found;
+
+	nf_ct_gre_keymap_destroy(ct);
+
+	/* try original (pns->pac) tuple */
+	memcpy(&gre_tuple, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+	       sizeof(gre_tuple));
+	gre_tuple.dst.protonum = IPPROTO_GRE;
+	gre_tuple.src.u.gre.key = help->help.ct_pptp_info.pns_call_id;
+	gre_tuple.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id;
+	found = destroy_sibling_or_exp(net, ct, &gre_tuple);
+	if (!found)
+		pr_debug("failed to timeout original pns->pac ct/exp\n");
+
+	/* try reply (pac->pns) tuple */
+	memcpy(&gre_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+	       sizeof(gre_tuple));
+	gre_tuple.dst.protonum = IPPROTO_GRE;
+	gre_tuple.src.u.gre.key = help->help.ct_pptp_info.pac_call_id;
+	gre_tuple.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id;
+	found = destroy_sibling_or_exp(net, ct, &gre_tuple);
+	if (!found)
+		pr_debug("failed to timeout reply pac->pns ct/exp\n");
+}
+
+/*
+ * expect GRE connections (PNS->PAC and PAC->PNS direction)
+ *
+ * Allocates and registers one expectation per direction for the GRE data
+ * channel identified by @callid/@peer_callid, then adds the matching GRE
+ * keymap entries.  Returns 0 on success and 1 on any failure; on failure
+ * every expectation registered so far is removed again.
+ */
+static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
+{
+	struct nf_conntrack_expect *exp_orig, *exp_reply;
+	enum ip_conntrack_dir dir;
+	int ret = 1;
+	typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
+
+	exp_orig = nf_ct_expect_alloc(ct);
+	if (exp_orig == NULL)
+		goto out;
+
+	exp_reply = nf_ct_expect_alloc(ct);
+	if (exp_reply == NULL)
+		goto out_put_orig;
+
+	/* original direction, PNS->PAC */
+	dir = IP_CT_DIR_ORIGINAL;
+	nf_ct_expect_init(exp_orig, NF_CT_EXPECT_CLASS_DEFAULT,
+			  nf_ct_l3num(ct),
+			  &ct->tuplehash[dir].tuple.src.u3,
+			  &ct->tuplehash[dir].tuple.dst.u3,
+			  IPPROTO_GRE, &peer_callid, &callid);
+	exp_orig->expectfn = pptp_expectfn;
+
+	/* reply direction, PAC->PNS */
+	dir = IP_CT_DIR_REPLY;
+	nf_ct_expect_init(exp_reply, NF_CT_EXPECT_CLASS_DEFAULT,
+			  nf_ct_l3num(ct),
+			  &ct->tuplehash[dir].tuple.src.u3,
+			  &ct->tuplehash[dir].tuple.dst.u3,
+			  IPPROTO_GRE, &callid, &peer_callid);
+	exp_reply->expectfn = pptp_expectfn;
+
+	/* give the NAT hook a chance to touch both expectations before
+	 * they are registered */
+	nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
+	if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
+		nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+	if (nf_ct_expect_related(exp_orig) != 0)
+		goto out_put_both;
+	if (nf_ct_expect_related(exp_reply) != 0)
+		goto out_unexpect_orig;
+
+	/* Add GRE keymap entries */
+	if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0)
+		goto out_unexpect_both;
+	if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) {
+		nf_ct_gre_keymap_destroy(ct);
+		goto out_unexpect_both;
+	}
+	ret = 0;
+
+	/* common exit: the local references are dropped on success as well */
+out_put_both:
+	nf_ct_expect_put(exp_reply);
+out_put_orig:
+	nf_ct_expect_put(exp_orig);
+out:
+	return ret;
+
+	/* failure after registration: unregister, then rejoin the put path */
+out_unexpect_both:
+	nf_ct_unexpect_related(exp_reply);
+out_unexpect_orig:
+	nf_ct_unexpect_related(exp_orig);
+	goto out_put_both;
+}
+
+/*
+ * Process one PPTP control message seen in the reply direction
+ * (PAC -> PNS) and update the session/call state stored in the helper
+ * area of @ct.  Messages that do not fit the current state, or whose type
+ * is not handled, leave the state untouched.
+ *
+ * Returns NF_ACCEPT, or the verdict of the NAT inbound hook when one is
+ * registered and the conntrack is NATed.  Invalid messages are only
+ * logged and still accepted.
+ */
+static inline int
+pptp_inbound_pkt(struct sk_buff *skb,
+		 struct PptpControlHeader *ctlh,
+		 union pptp_ctrl_union *pptpReq,
+		 unsigned int reqlen,
+		 struct nf_conn *ct,
+		 enum ip_conntrack_info ctinfo)
+{
+	struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+	u_int16_t msg;
+	__be16 cid = 0, pcid = 0;
+	typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
+
+	msg = ntohs(ctlh->messageType);
+	/* msg comes straight from the wire: range-check it before using it
+	 * as an index into pptp_msg_name[] */
+	pr_debug("inbound control message %s\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0]);
+
+	switch (msg) {
+	case PPTP_START_SESSION_REPLY:
+		/* server confirms new control session */
+		if (info->sstate < PPTP_SESSION_REQUESTED)
+			goto invalid;
+		if (pptpReq->srep.resultCode == PPTP_START_OK)
+			info->sstate = PPTP_SESSION_CONFIRMED;
+		else
+			info->sstate = PPTP_SESSION_ERROR;
+		break;
+
+	case PPTP_STOP_SESSION_REPLY:
+		/* server confirms end of control session */
+		if (info->sstate > PPTP_SESSION_STOPREQ)
+			goto invalid;
+		if (pptpReq->strep.resultCode == PPTP_STOP_OK)
+			info->sstate = PPTP_SESSION_NONE;
+		else
+			info->sstate = PPTP_SESSION_ERROR;
+		break;
+
+	case PPTP_OUT_CALL_REPLY:
+		/* server accepted call, we now expect GRE frames */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		if (info->cstate != PPTP_CALL_OUT_REQ &&
+		    info->cstate != PPTP_CALL_OUT_CONF)
+			goto invalid;
+
+		cid = pptpReq->ocack.callID;
+		pcid = pptpReq->ocack.peersCallID;
+		if (info->pns_call_id != pcid)
+			goto invalid;
+		pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+			 ntohs(cid), ntohs(pcid));
+
+		if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
+			info->cstate = PPTP_CALL_OUT_CONF;
+			info->pac_call_id = cid;
+			exp_gre(ct, cid, pcid);
+		} else
+			info->cstate = PPTP_CALL_NONE;
+		break;
+
+	case PPTP_IN_CALL_REQUEST:
+		/* server tells us about incoming call request */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+
+		cid = pptpReq->icreq.callID;
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		info->cstate = PPTP_CALL_IN_REQ;
+		info->pac_call_id = cid;
+		break;
+
+	case PPTP_IN_CALL_CONNECT:
+		/* server tells us about incoming call established */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		if (info->cstate != PPTP_CALL_IN_REP &&
+		    info->cstate != PPTP_CALL_IN_CONF)
+			goto invalid;
+
+		pcid = pptpReq->iccon.peersCallID;
+		cid = info->pac_call_id;
+
+		if (info->pns_call_id != pcid)
+			goto invalid;
+
+		pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
+		info->cstate = PPTP_CALL_IN_CONF;
+
+		/* we expect a GRE connection from PAC to PNS */
+		exp_gre(ct, cid, pcid);
+		break;
+
+	case PPTP_CALL_DISCONNECT_NOTIFY:
+		/* server confirms disconnect */
+		cid = pptpReq->disc.callID;
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		info->cstate = PPTP_CALL_NONE;
+
+		/* untrack this call id, unexpect GRE packets */
+		pptp_destroy_siblings(ct);
+		break;
+
+	case PPTP_WAN_ERROR_NOTIFY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* I don't have to explain these ;) */
+		break;
+
+	default:
+		goto invalid;
+	}
+
+	nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
+	if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
+		return nf_nat_pptp_inbound(skb, ct, ctinfo, ctlh, pptpReq);
+	return NF_ACCEPT;
+
+invalid:
+	pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+		 msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,
+		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+	return NF_ACCEPT;
+}
+
+/*
+ * Process one PPTP control message seen in the original direction
+ * (PNS -> PAC) and update the session/call state stored in the helper
+ * area of @ct.  Messages that do not fit the current state, or whose type
+ * is not handled, leave the state untouched.
+ *
+ * Returns NF_ACCEPT, or the verdict of the NAT outbound hook when one is
+ * registered and the conntrack is NATed.  Invalid messages are only
+ * logged and still accepted.
+ */
+static inline int
+pptp_outbound_pkt(struct sk_buff *skb,
+		  struct PptpControlHeader *ctlh,
+		  union pptp_ctrl_union *pptpReq,
+		  unsigned int reqlen,
+		  struct nf_conn *ct,
+		  enum ip_conntrack_info ctinfo)
+{
+	struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+	u_int16_t msg;
+	__be16 cid = 0, pcid = 0;
+	typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
+
+	msg = ntohs(ctlh->messageType);
+	/* msg comes straight from the wire: range-check it before using it
+	 * as an index into pptp_msg_name[] */
+	pr_debug("outbound control message %s\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0]);
+
+	switch (msg) {
+	case PPTP_START_SESSION_REQUEST:
+		/* client requests for new control session */
+		if (info->sstate != PPTP_SESSION_NONE)
+			goto invalid;
+		info->sstate = PPTP_SESSION_REQUESTED;
+		break;
+
+	case PPTP_STOP_SESSION_REQUEST:
+		/* client requests end of control session */
+		info->sstate = PPTP_SESSION_STOPREQ;
+		break;
+
+	case PPTP_OUT_CALL_REQUEST:
+		/* client initiating connection to server */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		info->cstate = PPTP_CALL_OUT_REQ;
+		/* track PNS call id */
+		cid = pptpReq->ocreq.callID;
+		pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+		info->pns_call_id = cid;
+		break;
+
+	case PPTP_IN_CALL_REPLY:
+		/* client answers incoming call */
+		if (info->cstate != PPTP_CALL_IN_REQ &&
+		    info->cstate != PPTP_CALL_IN_REP)
+			goto invalid;
+
+		cid = pptpReq->icack.callID;
+		pcid = pptpReq->icack.peersCallID;
+		if (info->pac_call_id != pcid)
+			goto invalid;
+		pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
+			 ntohs(cid), ntohs(pcid));
+
+		if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
+			/* part two of the three-way handshake */
+			info->cstate = PPTP_CALL_IN_REP;
+			info->pns_call_id = cid;
+		} else
+			info->cstate = PPTP_CALL_NONE;
+		break;
+
+	case PPTP_CALL_CLEAR_REQUEST:
+		/* client requests hangup of call */
+		if (info->sstate != PPTP_SESSION_CONFIRMED)
+			goto invalid;
+		/* FUTURE: iterate over all calls and check if
+		 * call ID is valid.  We don't do this without newnat,
+		 * because we only know about last call */
+		info->cstate = PPTP_CALL_CLEAR_REQ;
+		break;
+
+	case PPTP_SET_LINK_INFO:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* I don't have to explain these ;) */
+		break;
+
+	default:
+		goto invalid;
+	}
+
+	nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
+	if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
+		return nf_nat_pptp_outbound(skb, ct, ctinfo, ctlh, pptpReq);
+	return NF_ACCEPT;
+
+invalid:
+	pr_debug("invalid %s: type=%d cid=%u pcid=%u "
+		 "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
+		 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+		 msg, ntohs(cid), ntohs(pcid),  info->cstate, info->sstate,
+		 ntohs(info->pns_call_id), ntohs(info->pac_call_id));
+	return NF_ACCEPT;
+}
+
+/*
+ * Minimum payload size per control message type, used by
+ * conntrack_pptp_help() to reject truncated messages.  Types without an
+ * initializer (the echo messages) default to 0, i.e. no minimum.
+ */
+static const unsigned int pptp_msg_size[] = {
+	[PPTP_START_SESSION_REQUEST]  = sizeof(struct PptpStartSessionRequest),
+	[PPTP_START_SESSION_REPLY]    = sizeof(struct PptpStartSessionReply),
+	[PPTP_STOP_SESSION_REQUEST]   = sizeof(struct PptpStopSessionRequest),
+	[PPTP_STOP_SESSION_REPLY]     = sizeof(struct PptpStopSessionReply),
+	[PPTP_OUT_CALL_REQUEST]       = sizeof(struct PptpOutCallRequest),
+	[PPTP_OUT_CALL_REPLY]	      = sizeof(struct PptpOutCallReply),
+	[PPTP_IN_CALL_REQUEST]	      = sizeof(struct PptpInCallRequest),
+	[PPTP_IN_CALL_REPLY]	      = sizeof(struct PptpInCallReply),
+	[PPTP_IN_CALL_CONNECT]	      = sizeof(struct PptpInCallConnected),
+	[PPTP_CALL_CLEAR_REQUEST]     = sizeof(struct PptpClearCallRequest),
+	[PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify),
+	[PPTP_WAN_ERROR_NOTIFY]	      = sizeof(struct PptpWanErrorNotify),
+	[PPTP_SET_LINK_INFO]	      = sizeof(struct PptpSetLinkInfo),
+};
+
+/*
+ * track caller id inside control connection, call expect_related
+ *
+ * Helper entry point for the PPTP control connection.  Locates the PPTP
+ * packet header, control header and message body behind the TCP header
+ * and dispatches to pptp_outbound_pkt()/pptp_inbound_pkt() depending on
+ * the packet direction.  Anything that is not a complete PPTP control
+ * message is accepted untouched.
+ *
+ * Returns NF_ACCEPT or the verdict of the per-direction handler.
+ */
+static int
+conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
+		    struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+
+{
+	int dir = CTINFO2DIR(ctinfo);
+	const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info;
+	const struct tcphdr *tcph;
+	struct tcphdr _tcph;
+	const struct pptp_pkt_hdr *pptph;
+	struct pptp_pkt_hdr _pptph;
+	struct PptpControlHeader _ctlh, *ctlh;
+	union pptp_ctrl_union _pptpReq, *pptpReq;
+	unsigned int tcplen = skb->len - protoff;
+	unsigned int datalen, reqlen, nexthdr_off;
+	int oldsstate, oldcstate;
+	int ret;
+	u_int16_t msg;
+
+	/* don't do any tracking before tcp handshake complete */
+	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	/* step over the TCP header, including its options */
+	nexthdr_off = protoff;
+	tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
+	BUG_ON(!tcph);
+	nexthdr_off += tcph->doff * 4;
+	datalen = tcplen - tcph->doff * 4;
+
+	pptph = skb_header_pointer(skb, nexthdr_off, sizeof(_pptph), &_pptph);
+	if (!pptph) {
+		pr_debug("no full PPTP header, can't track\n");
+		return NF_ACCEPT;
+	}
+	nexthdr_off += sizeof(_pptph);
+	datalen -= sizeof(_pptph);
+
+	/* if it's not a control message we can't do anything with it */
+	if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL ||
+	    ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) {
+		pr_debug("not a control packet\n");
+		return NF_ACCEPT;
+	}
+
+	ctlh = skb_header_pointer(skb, nexthdr_off, sizeof(_ctlh), &_ctlh);
+	if (!ctlh)
+		return NF_ACCEPT;
+	nexthdr_off += sizeof(_ctlh);
+	datalen -= sizeof(_ctlh);
+
+	/* ignore known message types that are shorter than their minimum
+	 * size; never copy more than the request union can hold */
+	reqlen = datalen;
+	msg = ntohs(ctlh->messageType);
+	if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg])
+		return NF_ACCEPT;
+	if (reqlen > sizeof(*pptpReq))
+		reqlen = sizeof(*pptpReq);
+
+	pptpReq = skb_header_pointer(skb, nexthdr_off, reqlen, &_pptpReq);
+	if (!pptpReq)
+		return NF_ACCEPT;
+
+	/* sampled before taking the lock; only used in the debug line below */
+	oldsstate = info->sstate;
+	oldcstate = info->cstate;
+
+	spin_lock_bh(&nf_pptp_lock);
+
+	/* FIXME: We just blindly assume that the control connection is always
+	 * established from PNS->PAC.  However, RFC makes no guarantee */
+	if (dir == IP_CT_DIR_ORIGINAL)
+		/* client -> server (PNS -> PAC) */
+		ret = pptp_outbound_pkt(skb, ctlh, pptpReq, reqlen, ct,
+					ctinfo);
+	else
+		/* server -> client (PAC -> PNS) */
+		ret = pptp_inbound_pkt(skb, ctlh, pptpReq, reqlen, ct,
+				       ctinfo);
+	pr_debug("sstate: %d->%d, cstate: %d->%d\n",
+		 oldsstate, info->sstate, oldcstate, info->cstate);
+	spin_unlock_bh(&nf_pptp_lock);
+
+	return ret;
+}
+
+/*
+ * Expectation policy of the PPTP helper: at most two expectations per
+ * control connection (exp_gre() registers one per direction).
+ * NOTE(review): .timeout is presumably in seconds -- confirm against the
+ * expectation core.
+ */
+static const struct nf_conntrack_expect_policy pptp_exp_policy = {
+	.max_expected	= 2,
+	.timeout	= 5 * 60,
+};
+
+/*
+ * control protocol helper
+ *
+ * Attached to IPv4 TCP connections on PPTP_CONTROL_PORT; .destroy tears
+ * down the related GRE connections and expectations.
+ */
+static struct nf_conntrack_helper pptp __read_mostly = {
+	.name			= "pptp",
+	.me			= THIS_MODULE,
+	.tuple.src.l3num	= AF_INET,
+	.tuple.src.u.tcp.port	= cpu_to_be16(PPTP_CONTROL_PORT),
+	.tuple.dst.protonum	= IPPROTO_TCP,
+	.help			= conntrack_pptp_help,
+	.destroy		= pptp_destroy_siblings,
+	.expect_policy		= &pptp_exp_policy,
+};
+
+/* per-netns exit hook: flush the GRE keymap entries of @net */
+static void nf_conntrack_pptp_net_exit(struct net *net)
+{
+	nf_ct_gre_keymap_flush(net);
+}
+
+/* only an exit hook is provided; no per-netns setup is done here */
+static struct pernet_operations nf_conntrack_pptp_net_ops = {
+	.exit = nf_conntrack_pptp_net_exit,
+};
+
+/*
+ * Module init: register the PPTP helper, then the per-netns operations.
+ * If the latter fails the helper is unregistered again.  Returns 0 on
+ * success or the negative error of the step that failed.
+ */
+static int __init nf_conntrack_pptp_init(void)
+{
+	int err = nf_conntrack_helper_register(&pptp);
+
+	if (err < 0)
+		return err;
+
+	err = register_pernet_subsys(&nf_conntrack_pptp_net_ops);
+	if (err >= 0)
+		return err;
+
+	/* roll back the helper registration */
+	nf_conntrack_helper_unregister(&pptp);
+	return err;
+}
+
+/* Module exit: unregister the helper, then the per-netns operations. */
+static void __exit nf_conntrack_pptp_fini(void)
+{
+	nf_conntrack_helper_unregister(&pptp);
+	unregister_pernet_subsys(&nf_conntrack_pptp_net_ops);
+}
+
+module_init(nf_conntrack_pptp_init);
+module_exit(nf_conntrack_pptp_fini);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
new file mode 100644
index 00000000..5701c8dd
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -0,0 +1,384 @@
+/* L3/L4 protocol support for nf_conntrack. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* L4 protocol trackers, indexed [l3proto][l4proto].  A family's row stays
+ * NULL until nf_conntrack_l4proto_register() allocates it. */
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+/* L3 protocol tracker installed for each address family. */
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_l3protos);
+
+/* Held by the register/unregister functions in this file while they modify
+ * the two tables. */
+static DEFINE_MUTEX(nf_ct_proto_mutex);
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register @table under @path unless *@header is already set, then count
+ * one more user when @users is supplied.
+ *
+ * Returns 0 on success, -ENOMEM when register_sysctl_paths() yields NULL.
+ */
+static int
+nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path,
+		      struct ctl_table *table, unsigned int *users)
+{
+	if (!*header) {
+		struct ctl_table_header *hdr;
+
+		hdr = register_sysctl_paths(path, table);
+		if (!hdr)
+			return -ENOMEM;
+		*header = hdr;
+	}
+
+	if (users)
+		++*users;
+
+	return 0;
+}
+
+/*
+ * Drop one user of the sysctl table behind *@header.  With a user counter,
+ * the table is only unregistered once the counter reaches zero; without one
+ * (@users == NULL) it is unregistered right away.  *@header is reset to NULL
+ * whenever the table is unregistered.
+ */
+static void
+nf_ct_unregister_sysctl(struct ctl_table_header **header,
+			struct ctl_table *table, unsigned int *users)
+{
+	bool still_used = users != NULL && --*users > 0;
+
+	if (!still_used) {
+		unregister_sysctl_table(*header);
+		*header = NULL;
+	}
+}
+#endif
+
+/*
+ * Return the L4 tracker stored for (@l3proto, @l4proto).  The generic L4
+ * tracker is returned when @l3proto is out of range or no table has been
+ * allocated for that family.  Uses rcu_dereference() for the table lookup.
+ */
+struct nf_conntrack_l4proto *
+__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
+{
+	if (likely(l3proto < AF_MAX && nf_ct_protos[l3proto] != NULL))
+		return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
+
+	return &nf_conntrack_l4proto_generic;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
+
+/*
+ * Look up the L3 tracker for @l3proto and take a reference on its module.
+ * If the module reference cannot be obtained, the generic L3 tracker is
+ * handed back instead, so callers always receive a usable tracker.
+ */
+struct nf_conntrack_l3proto *
+nf_ct_l3proto_find_get(u_int16_t l3proto)
+{
+	struct nf_conntrack_l3proto *found;
+
+	rcu_read_lock();
+	found = __nf_ct_l3proto_find(l3proto);
+	found = try_module_get(found->me) ? found
+					  : &nf_conntrack_l3proto_generic;
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
+
+/* Release the module reference held on the L3 tracker @p. */
+void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
+{
+	module_put(p->me);
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_put);
+
+/*
+ * Pin the module of the L3 tracker for @l3proto, loading it on demand.
+ *
+ * As long as only the generic tracker is found, request_module() is asked
+ * for "nf_conntrack-<l3proto>" and the lookup is repeated after each
+ * successful request.  Returns 0 once a non-generic tracker was found,
+ * -EPROTOTYPE when request_module() reports a failure.
+ */
+int
+nf_ct_l3proto_try_module_get(unsigned short l3proto)
+{
+	struct nf_conntrack_l3proto *p;
+
+	for (;;) {
+		p = nf_ct_l3proto_find_get(l3proto);
+		if (p != &nf_conntrack_l3proto_generic)
+			return 0;
+
+		/* only the generic tracker is there: try to load the real one */
+		if (request_module("nf_conntrack-%d", l3proto) != 0)
+			return -EPROTOTYPE;
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
+
+/* Drop a module reference on the L3 tracker currently installed for
+ * @l3proto. */
+void nf_ct_l3proto_module_put(unsigned short l3proto)
+{
+	struct nf_conntrack_l3proto *p;
+
+	/* rcu_read_lock not necessary since the caller holds a reference, but
+	 * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
+	 */
+	rcu_read_lock();
+	p = __nf_ct_l3proto_find(l3proto);
+	module_put(p->me);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
+
+/* Iterator callback: non-zero when conntrack @i belongs to the L3 protocol
+ * of the tracker passed in @data. */
+static int kill_l3proto(struct nf_conn *i, void *data)
+{
+	const struct nf_conntrack_l3proto *l3proto = data;
+
+	return nf_ct_l3num(i) == l3proto->l3proto;
+}
+
+/* Iterator callback: non-zero when conntrack @i matches both the L4 and the
+ * L3 protocol number of the tracker passed in @data. */
+static int kill_l4proto(struct nf_conn *i, void *data)
+{
+	const struct nf_conntrack_l4proto *l4proto = data;
+
+	if (nf_ct_protonum(i) != l4proto->l4proto)
+		return 0;
+
+	return nf_ct_l3num(i) == l4proto->l3proto;
+}
+
+/* Register the sysctl table of @l3proto, if it has one, under its own
+ * ctl_table_path.  Returns 0 (also when CONFIG_SYSCTL is off or there is no
+ * table) or the error from nf_ct_register_sysctl(). */
+static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto)
+{
+	int err = 0;
+
+#ifdef CONFIG_SYSCTL
+	if (l3proto->ctl_table != NULL) {
+		/* no user counter: the header belongs to this tracker only */
+		err = nf_ct_register_sysctl(&l3proto->ctl_table_header,
+					    l3proto->ctl_table_path,
+					    l3proto->ctl_table, NULL);
+	}
+#endif
+	return err;
+}
+
+/* Unregister the sysctl table of @l3proto if a header is currently set. */
+static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto)
+{
+#ifdef CONFIG_SYSCTL
+	if (l3proto->ctl_table_header != NULL)
+		nf_ct_unregister_sysctl(&l3proto->ctl_table_header,
+					l3proto->ctl_table, NULL);
+#endif
+}
+
+/*
+ * Install @proto as the L3 tracker for its address family.
+ *
+ * Returns 0 on success, -EBUSY if the family number is out of range or a
+ * non-generic tracker already occupies the slot, -EINVAL if tuple_to_nlattr
+ * is set without nlattr_tuple_size, or the error from sysctl registration.
+ */
+int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+{
+	int ret = 0;
+	struct nf_conntrack_l3proto *old;
+
+	if (proto->l3proto >= AF_MAX)
+		return -EBUSY;
+
+	if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size)
+		return -EINVAL;
+
+	mutex_lock(&nf_ct_proto_mutex);
+	old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+					lockdep_is_held(&nf_ct_proto_mutex));
+	/* the slot must still hold the generic placeholder */
+	if (old != &nf_conntrack_l3proto_generic) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	ret = nf_ct_l3proto_register_sysctl(proto);
+	if (ret < 0)
+		goto out_unlock;
+
+	/* cache the netlink attribute size: three times the tuple size */
+	if (proto->nlattr_tuple_size)
+		proto->nla_size = 3 * proto->nlattr_tuple_size();
+
+	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
+
+out_unlock:
+	mutex_unlock(&nf_ct_proto_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register);
+
+/*
+ * Remove @proto from the L3 tracker table: the slot is pointed back at the
+ * generic tracker, the sysctl table is unregistered, and after an RCU grace
+ * period all conntrack entries of this family are cleaned up in every
+ * network namespace.  BUGs if @proto is not the tracker currently installed
+ * for its family.
+ */
+void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+{
+	struct net *net;
+
+	BUG_ON(proto->l3proto >= AF_MAX);
+
+	mutex_lock(&nf_ct_proto_mutex);
+	BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+					 lockdep_is_held(&nf_ct_proto_mutex)
+					 ) != proto);
+	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
+			   &nf_conntrack_l3proto_generic);
+	nf_ct_l3proto_unregister_sysctl(proto);
+	mutex_unlock(&nf_ct_proto_mutex);
+
+	synchronize_rcu();
+
+	/* Remove all conntrack entries for this protocol */
+	rtnl_lock();
+	for_each_net(net)
+		nf_ct_iterate_cleanup(net, kill_l3proto, proto);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister);
+
+/*
+ * Register the sysctl tables of @l4proto: the regular table (if any) and,
+ * with CONFIG_NF_CONNTRACK_PROC_COMPAT, the compat table (if any).
+ *
+ * Returns 0 on success or the error from nf_ct_register_sysctl().  When the
+ * compat table fails to register, the regular table registered just before
+ * is rolled back.
+ */
+static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto)
+{
+	int err = 0;
+
+#ifdef CONFIG_SYSCTL
+	if (l4proto->ctl_table != NULL) {
+		err = nf_ct_register_sysctl(l4proto->ctl_table_header,
+					    nf_net_netfilter_sysctl_path,
+					    l4proto->ctl_table,
+					    l4proto->ctl_table_users);
+		if (err < 0)
+			goto out;
+	}
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	if (l4proto->ctl_compat_table != NULL) {
+		err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header,
+					    nf_net_ipv4_netfilter_sysctl_path,
+					    l4proto->ctl_compat_table, NULL);
+		if (err == 0)
+			goto out;
+		/* Undo the regular registration only if one was made above;
+		 * without a ctl_table nothing was registered and the header
+		 * and user-count pointers must not be touched. */
+		if (l4proto->ctl_table != NULL)
+			nf_ct_unregister_sysctl(l4proto->ctl_table_header,
+						l4proto->ctl_table,
+						l4proto->ctl_table_users);
+	}
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+out:
+#endif /* CONFIG_SYSCTL */
+	return err;
+}
+
+/* Unregister whichever sysctl tables of @l4proto are currently registered:
+ * the regular one (reached through the ctl_table_header pointer, with its
+ * user counter) and, if configured, the compat one. */
+static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto)
+{
+#ifdef CONFIG_SYSCTL
+	/* ctl_table_header is a pointer to the header pointer: check both */
+	if (l4proto->ctl_table_header != NULL &&
+	    *l4proto->ctl_table_header != NULL)
+		nf_ct_unregister_sysctl(l4proto->ctl_table_header,
+					l4proto->ctl_table,
+					l4proto->ctl_table_users);
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	if (l4proto->ctl_compat_table_header != NULL)
+		nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header,
+					l4proto->ctl_compat_table, NULL);
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+   them. --RR */
+/*
+ * Install @l4proto in the tracker table of its L3 family, allocating that
+ * family's table on first use.
+ *
+ * Returns 0 on success, -EBUSY if the L3 family is out of range or the slot
+ * already holds a non-generic tracker, -EINVAL if a netlink dump callback is
+ * set without its matching size callback, -ENOMEM if the table allocation
+ * fails, or the error from sysctl registration.
+ */
+int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
+{
+	int ret = 0;
+
+	if (l4proto->l3proto >= PF_MAX)
+		return -EBUSY;
+
+	if ((l4proto->to_nlattr && !l4proto->nlattr_size)
+		|| (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
+		return -EINVAL;
+
+	mutex_lock(&nf_ct_proto_mutex);
+	if (!nf_ct_protos[l4proto->l3proto]) {
+		/* l3proto may be loaded later. */
+		struct nf_conntrack_l4proto __rcu **proto_array;
+		int i;
+
+		proto_array = kmalloc(MAX_NF_CT_PROTO *
+				      sizeof(struct nf_conntrack_l4proto *),
+				      GFP_KERNEL);
+		if (proto_array == NULL) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+
+		/* every slot starts out pointing at the generic tracker */
+		for (i = 0; i < MAX_NF_CT_PROTO; i++)
+			RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
+
+		/* Before making proto_array visible to lockless readers,
+		 * we must make sure its content is committed to memory.
+		 */
+		smp_wmb();
+
+		nf_ct_protos[l4proto->l3proto] = proto_array;
+	} else if (rcu_dereference_protected(
+			nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			lockdep_is_held(&nf_ct_proto_mutex)
+			) != &nf_conntrack_l4proto_generic) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	ret = nf_ct_l4proto_register_sysctl(l4proto);
+	if (ret < 0)
+		goto out_unlock;
+
+	/* cache the netlink attribute size: protoinfo plus three tuples */
+	l4proto->nla_size = 0;
+	if (l4proto->nlattr_size)
+		l4proto->nla_size += l4proto->nlattr_size();
+	if (l4proto->nlattr_tuple_size)
+		l4proto->nla_size += 3 * l4proto->nlattr_tuple_size();
+
+	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			   l4proto);
+
+out_unlock:
+	mutex_unlock(&nf_ct_proto_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
+
+/*
+ * Remove @l4proto from the tracker table: the slot is pointed back at the
+ * generic L4 tracker, the sysctl tables are unregistered, and after an RCU
+ * grace period all conntrack entries of this protocol are cleaned up in
+ * every network namespace.  BUGs if @l4proto is not the tracker currently
+ * installed in its slot.
+ */
+void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
+{
+	struct net *net;
+
+	BUG_ON(l4proto->l3proto >= PF_MAX);
+
+	mutex_lock(&nf_ct_proto_mutex);
+	BUG_ON(rcu_dereference_protected(
+			nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			lockdep_is_held(&nf_ct_proto_mutex)
+			) != l4proto);
+	rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+			   &nf_conntrack_l4proto_generic);
+	nf_ct_l4proto_unregister_sysctl(l4proto);
+	mutex_unlock(&nf_ct_proto_mutex);
+
+	synchronize_rcu();
+
+	/* Remove all conntrack entries for this protocol */
+	rtnl_lock();
+	for_each_net(net)
+		nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister);
+
+/* Set up protocol handling: register the sysctl tables of the generic L4
+ * tracker, then point every L3 slot at the generic L3 tracker.  Returns 0
+ * or the sysctl registration error. */
+int nf_conntrack_proto_init(void)
+{
+	unsigned int i;
+	int err;
+
+	err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i < AF_MAX; i++)
+		rcu_assign_pointer(nf_ct_l3protos[i],
+				   &nf_conntrack_l3proto_generic);
+	return 0;
+}
+
+/* Tear down protocol handling: unregister the generic L4 tracker's sysctl
+ * tables and free every per-family L4 tracker table. */
+void nf_conntrack_proto_fini(void)
+{
+	unsigned int i;
+
+	nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic);
+
+	/* free l3proto protocol tables */
+	for (i = 0; i < PF_MAX; i++)
+		kfree(nf_ct_protos[i]);
+}
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
new file mode 100644
index 00000000..2e664a69
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -0,0 +1,899 @@
+/*
+ * DCCP connection tracking protocol helper
+ *
+ * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/dccp.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+
+/* Timeouts are based on values from RFC4340:
+ *
+ * - REQUEST:
+ *
+ *   8.1.2. Client Request
+ *
+ *   A client MAY give up on its DCCP-Requests after some time
+ *   (3 minutes, for example).
+ *
+ * - RESPOND:
+ *
+ *   8.1.3. Server Response
+ *
+ *   It MAY also leave the RESPOND state for CLOSED after a timeout of
+ *   not less than 4MSL (8 minutes);
+ *
+ * - PARTOPEN:
+ *
+ *   8.1.5. Handshake Completion
+ *
+ *   If the client remains in PARTOPEN for more than 4MSL (8 minutes),
+ *   it SHOULD reset the connection with Reset Code 2, "Aborted".
+ *
+ * - OPEN:
+ *
+ *   The DCCP timestamp overflows after 11.9 hours. If the connection
+ *   stays idle this long the sequence number won't be recognized
+ *   as valid anymore.
+ *
+ * - CLOSEREQ/CLOSING:
+ *
+ *   8.3. Termination
+ *
+ *   The retransmission timer should initially be set to go off in two
+ *   round-trip times and should back off to not less than once every
+ *   64 seconds ...
+ *
+ * - TIMEWAIT:
+ *
+ *   4.3. States
+ *
+ *   A server or client socket remains in this state for 2MSL (4 minutes)
+ *   after the connection has been torn down, ...
+ */
+
+#define DCCP_MSL (2 * 60 * HZ)
+
+/* Printable name of each conntrack DCCP state, indexed by the CT_DCCP_*
+ * state value; used by dccp_print_conntrack(). */
+static const char * const dccp_state_names[] = {
+	[CT_DCCP_NONE]		= "NONE",
+	[CT_DCCP_REQUEST]	= "REQUEST",
+	[CT_DCCP_RESPOND]	= "RESPOND",
+	[CT_DCCP_PARTOPEN]	= "PARTOPEN",
+	[CT_DCCP_OPEN]		= "OPEN",
+	[CT_DCCP_CLOSEREQ]	= "CLOSEREQ",
+	[CT_DCCP_CLOSING]	= "CLOSING",
+	[CT_DCCP_TIMEWAIT]	= "TIMEWAIT",
+	[CT_DCCP_IGNORE]	= "IGNORE",
+	[CT_DCCP_INVALID]	= "INVALID",
+};
+
+#define sNO	CT_DCCP_NONE
+#define sRQ	CT_DCCP_REQUEST
+#define sRS	CT_DCCP_RESPOND
+#define sPO	CT_DCCP_PARTOPEN
+#define sOP	CT_DCCP_OPEN
+#define sCR	CT_DCCP_CLOSEREQ
+#define sCG	CT_DCCP_CLOSING
+#define sTW	CT_DCCP_TIMEWAIT
+#define sIG	CT_DCCP_IGNORE
+#define sIV	CT_DCCP_INVALID
+
+/*
+ * DCCP state transition table
+ *
+ * The assumption is the same as for TCP tracking:
+ *
+ * We are the man in the middle. All the packets go through us but might
+ * get lost in transit to the destination. It is assumed that the destination
+ * can't receive segments we haven't seen.
+ *
+ * The following states exist:
+ *
+ * NONE:	Initial state, expecting Request
+ * REQUEST:	Request seen, waiting for Response from server
+ * RESPOND:	Response from server seen, waiting for Ack from client
+ * PARTOPEN:	Ack after Response seen, waiting for packet other than Response,
+ * 		Reset or Sync from server
+ * OPEN:	Packet other than Response, Reset or Sync seen
+ * CLOSEREQ:	CloseReq from server seen, expecting Close from client
+ * CLOSING:	Close seen, expecting Reset
+ * TIMEWAIT:	Reset seen
+ * IGNORE:	Not determinable whether packet is valid
+ *
+ * Some states exist only on one side of the connection: REQUEST, RESPOND,
+ * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
+ * the one it was in before.
+ *
+ * Packets are marked as ignored (sIG) if we don't know if they're valid
+ * (for example a reincarnation of a connection we didn't notice is dead
+ * already) and the server may send back a connection closing Reset or a
+ * Response. They're also used for Sync/SyncAck packets, which we don't
+ * care about.
+ */
+static const u_int8_t
+dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
+	[CT_DCCP_ROLE_CLIENT] = {
+		[DCCP_PKT_REQUEST] = {
+		/*
+		 * sNO -> sRQ		Regular Request
+		 * sRQ -> sRQ		Retransmitted Request or reincarnation
+		 * sRS -> sRS		Retransmitted Request (apparently Response
+		 * 			got lost after we saw it) or reincarnation
+		 * sPO -> sIG		Ignore, conntrack might be out of sync
+		 * sOP -> sIG		Ignore, conntrack might be out of sync
+		 * sCR -> sIG		Ignore, conntrack might be out of sync
+		 * sCG -> sIG		Ignore, conntrack might be out of sync
+		 * sTW -> sRQ		Reincarnation
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, */
+			sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
+		},
+		[DCCP_PKT_RESPONSE] = {
+		/*
+		 * sNO -> sIV		Invalid
+		 * sRQ -> sIG		Ignore, might be response to ignored Request
+		 * sRS -> sIG		Ignore, might be response to ignored Request
+		 * sPO -> sIG		Ignore, might be response to ignored Request
+		 * sOP -> sIG		Ignore, might be response to ignored Request
+		 * sCR -> sIG		Ignore, might be response to ignored Request
+		 * sCG -> sIG		Ignore, might be response to ignored Request
+		 * sTW -> sIV		Invalid, reincarnation in reverse direction
+		 *			goes through sRQ
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
+		},
+		[DCCP_PKT_ACK] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sPO		Ack for Response, move to PARTOPEN (8.1.5.)
+		 * sPO -> sPO		Retransmitted Ack for Response, remain in PARTOPEN
+		 * sOP -> sOP		Regular ACK, remain in OPEN
+		 * sCR -> sCR		Ack in CLOSEREQ MAY be processed (8.3.)
+		 * sCG -> sCG		Ack in CLOSING MAY be processed (8.3.)
+		 * sTW -> sIV
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
+		},
+		[DCCP_PKT_DATA] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sIV		MUST use DataAck in PARTOPEN state (8.1.5.)
+		 * sOP -> sOP		Regular Data packet
+		 * sCR -> sCR		Data in CLOSEREQ MAY be processed (8.3.)
+		 * sCG -> sCG		Data in CLOSING MAY be processed (8.3.)
+		 * sTW -> sIV
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
+		},
+		[DCCP_PKT_DATAACK] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sPO		Ack for Response, move to PARTOPEN (8.1.5.)
+		 * sPO -> sPO		Remain in PARTOPEN state
+		 * sOP -> sOP		Regular DataAck packet in OPEN state
+		 * sCR -> sCR		DataAck in CLOSEREQ MAY be processed (8.3.)
+		 * sCG -> sCG		DataAck in CLOSING MAY be processed (8.3.)
+		 * sTW -> sIV
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
+		},
+		[DCCP_PKT_CLOSEREQ] = {
+		/*
+		 * CLOSEREQ may only be sent by the server.
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
+		},
+		[DCCP_PKT_CLOSE] = {
+		/*
+		 * Close sent by the client role; the next state is looked up
+		 * by the current conntrack state.
+		 *
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sCG		Client-initiated close
+		 * sOP -> sCG		Client-initiated close
+		 * sCR -> sCG		Close in response to CloseReq (8.3.)
+		 * sCG -> sCG		Retransmit
+		 * sTW -> sIV		Late retransmit, already in TIME_WAIT
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sCG, sCG, sCG, sCG, sIV
+		},
+		[DCCP_PKT_RESET] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sTW		Sync received or timeout, SHOULD send Reset (8.1.1.)
+		 * sRS -> sTW		Response received without Request
+		 * sPO -> sTW		Timeout, SHOULD send Reset (8.1.5.)
+		 * sOP -> sTW		Connection reset
+		 * sCR -> sTW		Connection reset
+		 * sCG -> sTW		Connection reset
+		 * sTW -> sIG		Ignore (don't refresh timer)
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
+		},
+		[DCCP_PKT_SYNC] = {
+		/*
+		 * We currently ignore Sync packets
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+		},
+		[DCCP_PKT_SYNCACK] = {
+		/*
+		 * We currently ignore SyncAck packets
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+		},
+	},
+	[CT_DCCP_ROLE_SERVER] = {
+		[DCCP_PKT_REQUEST] = {
+		/*
+		 * sNO -> sIV		Invalid
+		 * sRQ -> sIG		Ignore, conntrack might be out of sync
+		 * sRS -> sIG		Ignore, conntrack might be out of sync
+		 * sPO -> sIG		Ignore, conntrack might be out of sync
+		 * sOP -> sIG		Ignore, conntrack might be out of sync
+		 * sCR -> sIG		Ignore, conntrack might be out of sync
+		 * sCG -> sIG		Ignore, conntrack might be out of sync
+		 * sTW -> sRQ		Reincarnation, must reverse roles
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
+		},
+		[DCCP_PKT_RESPONSE] = {
+		/*
+		 * sNO -> sIV		Response without Request
+		 * sRQ -> sRS		Response to clients Request
+		 * sRS -> sRS		Retransmitted Response (8.1.3. SHOULD NOT)
+		 * sPO -> sIG		Response to an ignored Request or late retransmit
+		 * sOP -> sIG		Ignore, might be response to ignored Request
+		 * sCR -> sIG		Ignore, might be response to ignored Request
+		 * sCG -> sIG		Ignore, might be response to ignored Request
+		 * sTW -> sIV		Invalid, Request from client in sTW moves to sRQ
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
+		},
+		[DCCP_PKT_ACK] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sOP		Enter OPEN state (8.1.5.)
+		 * sOP -> sOP		Regular Ack in OPEN state
+		 * sCR -> sIV		Waiting for Close from client
+		 * sCG -> sCG		Ack in CLOSING MAY be processed (8.3.)
+		 * sTW -> sIV
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+		},
+		[DCCP_PKT_DATA] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sOP		Enter OPEN state (8.1.5.)
+		 * sOP -> sOP		Regular Data packet in OPEN state
+		 * sCR -> sIV		Waiting for Close from client
+		 * sCG -> sCG		Data in CLOSING MAY be processed (8.3.)
+		 * sTW -> sIV
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+		},
+		[DCCP_PKT_DATAACK] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sOP		Enter OPEN state (8.1.5.)
+		 * sOP -> sOP		Regular DataAck in OPEN state
+		 * sCR -> sIV		Waiting for Close from client
+		 * sCG -> sCG		Data in CLOSING MAY be processed (8.3.)
+		 * sTW -> sIV
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
+		},
+		[DCCP_PKT_CLOSEREQ] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sOP -> sCR	Move directly to CLOSEREQ (8.1.5.)
+		 * sOP -> sCR		CloseReq in OPEN state
+		 * sCR -> sCR		Retransmit
+		 * sCG -> sCR		Simultaneous close, client sends another Close
+		 * sTW -> sIV		Already closed
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
+		},
+		[DCCP_PKT_CLOSE] = {
+		/*
+		 * sNO -> sIV		No connection
+		 * sRQ -> sIV		No connection
+		 * sRS -> sIV		No connection
+		 * sPO -> sOP -> sCG	Move directly to CLOSING
+		 * sOP -> sCG		Move to CLOSING
+		 * sCR -> sIV		Close after CloseReq is invalid
+		 * sCG -> sCG		Retransmit
+		 * sTW -> sIV		Already closed
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
+		},
+		[DCCP_PKT_RESET] = {
+		/*
+		 * Reset sent by the server role; the next state is looked up
+		 * by the current conntrack state.
+		 *
+		 * sNO -> sIV		No connection
+		 * sRQ -> sTW		Reset in response to Request
+		 * sRS -> sTW		Timeout, SHOULD send Reset (8.1.3.)
+		 * sPO -> sTW		Timeout, SHOULD send Reset (8.1.3.)
+		 * sOP -> sTW
+		 * sCR -> sTW
+		 * sCG -> sTW
+		 * sTW -> sIG		Ignore (don't refresh timer)
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
+		},
+		[DCCP_PKT_SYNC] = {
+		/*
+		 * We currently ignore Sync packets
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+		},
+		[DCCP_PKT_SYNCACK] = {
+		/*
+		 * We currently ignore SyncAck packets
+		 *
+		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
+			sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+		},
+	},
+};
+
+/* this module per-net specifics */
+/* Id passed to net_generic() to find this module's struct dccp_net. */
+static int dccp_net_id __read_mostly;
+struct dccp_net {
+	/* when 0, dccp_new() refuses packets that do not start a connection */
+	int dccp_loose;
+	/* timeout per conntrack state, passed to nf_ct_refresh_acct() */
+	unsigned int dccp_timeout[CT_DCCP_MAX + 1];
+#ifdef CONFIG_SYSCTL
+	struct ctl_table_header *sysctl_header;
+	struct ctl_table *sysctl_table;
+#endif
+};
+
+/* Return this module's per-namespace data for @net. */
+static inline struct dccp_net *dccp_pernet(struct net *net)
+{
+	return net_generic(net, dccp_net_id);
+}
+
+/*
+ * Fill the DCCP port fields of @tuple from the DCCP header found at offset
+ * @dataoff in @skb.  Returns false when the header cannot be read, true
+ * otherwise.
+ */
+static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			      struct nf_conntrack_tuple *tuple)
+{
+	struct dccp_hdr hdr_buf;
+	const struct dccp_hdr *hdr;
+
+	hdr = skb_header_pointer(skb, dataoff, sizeof(hdr_buf), &hdr_buf);
+	if (!hdr)
+		return false;
+
+	tuple->dst.u.dccp.port = hdr->dccph_dport;
+	tuple->src.u.dccp.port = hdr->dccph_sport;
+
+	return true;
+}
+
+/* Build the reply-direction ports in @inv by swapping the source and
+ * destination DCCP ports of @tuple.  Always returns true. */
+static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	inv->src.u.dccp.port = tuple->dst.u.dccp.port;
+	inv->dst.u.dccp.port = tuple->src.u.dccp.port;
+	return true;
+}
+
+/*
+ * Decide whether the first packet seen for @ct may create a DCCP conntrack
+ * and, if so, initialise the protocol state.
+ *
+ * The packet is looked up in the client row of the state table starting from
+ * CT_DCCP_NONE.  A Request is accepted; an invalid transition is rejected;
+ * anything else is only accepted when the per-net dccp_loose setting is
+ * non-zero.  Rejections are logged when LOG_INVALID() is enabled.
+ *
+ * Returns true if the conntrack entry should be created, false otherwise.
+ */
+static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
+		     unsigned int dataoff)
+{
+	struct net *net = nf_ct_net(ct);
+	struct dccp_net *dn;
+	struct dccp_hdr _dh, *dh;
+	const char *msg;
+	u_int8_t state;
+
+	/* The copy buffer must be the header struct _dh, not the pointer
+	 * variable dh: copying sizeof(_dh) bytes into &dh overruns the
+	 * pointer on the stack whenever the header is not linear. */
+	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+	BUG_ON(dh == NULL);
+
+	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
+	switch (state) {
+	default:
+		dn = dccp_pernet(net);
+		if (dn->dccp_loose == 0) {
+			msg = "nf_ct_dccp: not picking up existing connection ";
+			goto out_invalid;
+		}
+		/* fall through: loose mode picks up the connection */
+	case CT_DCCP_REQUEST:
+		break;
+	case CT_DCCP_INVALID:
+		msg = "nf_ct_dccp: invalid state transition ";
+		goto out_invalid;
+	}
+
+	ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
+	ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
+	ct->proto.dccp.state = CT_DCCP_NONE;
+	ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
+	ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
+	ct->proto.dccp.handshake_seq = 0;
+	return true;
+
+out_invalid:
+	if (LOG_INVALID(net, IPPROTO_DCCP))
+		nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
+	return false;
+}
+
+/*
+ * Return the acknowledgement number of the packet: the ack sub-header is
+ * read __dccp_basic_hdr_len(dh) bytes past @dh and its 16-bit high part and
+ * 32-bit low part are combined into one value.
+ *
+ * NOTE(review): this reads beyond struct dccp_hdr, so it assumes the memory
+ * following @dh holds the rest of the packet header -- confirm this holds
+ * when @dh points at a caller's local copy of only struct dccp_hdr.
+ */
+static u64 dccp_ack_seq(const struct dccp_hdr *dh)
+{
+	const struct dccp_hdr_ack_bits *dhack;
+
+	dhack = (void *)dh + __dccp_basic_hdr_len(dh);
+	return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
+		     ntohl(dhack->dccph_ack_nr_low);
+}
+
+/*
+ * Track one DCCP packet belonging to the existing conntrack @ct.
+ *
+ * The next state is looked up in dccp_state_table by the sender's role, the
+ * packet type and the current state.  Some transitions carry side effects:
+ * roles are swapped on a reverse-direction reincarnation, the handshake
+ * sequence number is recorded on the Response, and the connection is marked
+ * assured when the Ack matching that sequence number arrives.
+ *
+ * Returns NF_ACCEPT for tracked or ignored packets and -NF_ACCEPT for an
+ * invalid state transition.  A Reset seen before any reply kills the
+ * conntrack immediately.
+ */
+static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
+		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
+		       u_int8_t pf, unsigned int hooknum)
+{
+	struct net *net = nf_ct_net(ct);
+	struct dccp_net *dn;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct dccp_hdr _dh, *dh;
+	u_int8_t type, old_state, new_state;
+	enum ct_dccp_roles role;
+
+	/* The copy buffer must be the header struct _dh, not the pointer
+	 * variable dh: copying sizeof(_dh) bytes into &dh overruns the
+	 * pointer on the stack whenever the header is not linear. */
+	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+	BUG_ON(dh == NULL);
+	type = dh->dccph_type;
+
+	if (type == DCCP_PKT_RESET &&
+	    !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		/* Tear down connection immediately if only reply is a RESET */
+		nf_ct_kill_acct(ct, ctinfo, skb);
+		return NF_ACCEPT;
+	}
+
+	spin_lock_bh(&ct->lock);
+
+	role = ct->proto.dccp.role[dir];
+	old_state = ct->proto.dccp.state;
+	new_state = dccp_state_table[role][type][old_state];
+
+	switch (new_state) {
+	case CT_DCCP_REQUEST:
+		if (old_state == CT_DCCP_TIMEWAIT &&
+		    role == CT_DCCP_ROLE_SERVER) {
+			/* Reincarnation in the reverse direction: reopen and
+			 * reverse client/server roles. */
+			ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
+			ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
+		}
+		break;
+	case CT_DCCP_RESPOND:
+		/* remember the Response's sequence number for the Ack check */
+		if (old_state == CT_DCCP_REQUEST)
+			ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+		break;
+	case CT_DCCP_PARTOPEN:
+		/* the Ack acknowledging the recorded Response completes the
+		 * handshake: mark the connection assured */
+		if (old_state == CT_DCCP_RESPOND &&
+		    type == DCCP_PKT_ACK &&
+		    dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
+			set_bit(IPS_ASSURED_BIT, &ct->status);
+		break;
+	case CT_DCCP_IGNORE:
+		/*
+		 * Connection tracking might be out of sync, so we ignore
+		 * packets that might establish a new connection and resync
+		 * if the server responds with a valid Response.
+		 */
+		if (ct->proto.dccp.last_dir == !dir &&
+		    ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
+		    type == DCCP_PKT_RESPONSE) {
+			ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
+			ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
+			ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
+			new_state = CT_DCCP_RESPOND;
+			break;
+		}
+		/* record the ignored packet; state and timer stay untouched */
+		ct->proto.dccp.last_dir = dir;
+		ct->proto.dccp.last_pkt = type;
+
+		spin_unlock_bh(&ct->lock);
+		if (LOG_INVALID(net, IPPROTO_DCCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_dccp: invalid packet ignored ");
+		return NF_ACCEPT;
+	case CT_DCCP_INVALID:
+		spin_unlock_bh(&ct->lock);
+		if (LOG_INVALID(net, IPPROTO_DCCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_dccp: invalid state transition ");
+		return -NF_ACCEPT;
+	}
+
+	ct->proto.dccp.last_dir = dir;
+	ct->proto.dccp.last_pkt = type;
+	ct->proto.dccp.state = new_state;
+	spin_unlock_bh(&ct->lock);
+
+	if (new_state != old_state)
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+	/* refresh the conntrack timer with the timeout of the new state */
+	dn = dccp_pernet(net);
+	nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]);
+
+	return NF_ACCEPT;
+}
+
+/*
+ * Sanity-check the DCCP header of @skb before the packet is tracked.
+ *
+ * Rejects packets whose header cannot be read, whose data offset is smaller
+ * than the basic header or larger than the packet, whose checksum coverage
+ * exceeds the packet, whose checksum is wrong (checked only in PRE_ROUTING
+ * and only when checksum verification is enabled for @net), or whose packet
+ * type is reserved.
+ *
+ * Returns NF_ACCEPT for a well-formed packet, -NF_ACCEPT otherwise; the
+ * reason is logged when LOG_INVALID() is enabled.
+ */
+static int dccp_error(struct net *net, struct nf_conn *tmpl,
+		      struct sk_buff *skb, unsigned int dataoff,
+		      enum ip_conntrack_info *ctinfo,
+		      u_int8_t pf, unsigned int hooknum)
+{
+	struct dccp_hdr _dh, *dh;
+	unsigned int dccp_len = skb->len - dataoff;
+	unsigned int cscov;
+	const char *msg;
+
+	/* The copy buffer must be the header struct _dh, not the pointer
+	 * variable dh: copying sizeof(_dh) bytes into &dh overruns the
+	 * pointer on the stack whenever the header is not linear. */
+	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
+	if (dh == NULL) {
+		msg = "nf_ct_dccp: short packet ";
+		goto out_invalid;
+	}
+
+	/* dccph_doff counts 32-bit words */
+	if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
+	    dh->dccph_doff * 4 > dccp_len) {
+		msg = "nf_ct_dccp: truncated/malformed packet ";
+		goto out_invalid;
+	}
+
+	/* a zero coverage field means the checksum covers the whole packet */
+	cscov = dccp_len;
+	if (dh->dccph_cscov) {
+		cscov = (dh->dccph_cscov - 1) * 4;
+		if (cscov > dccp_len) {
+			msg = "nf_ct_dccp: bad checksum coverage ";
+			goto out_invalid;
+		}
+	}
+
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
+				pf)) {
+		msg = "nf_ct_dccp: bad checksum ";
+		goto out_invalid;
+	}
+
+	if (dh->dccph_type >= DCCP_PKT_INVALID) {
+		msg = "nf_ct_dccp: reserved packet type ";
+		goto out_invalid;
+	}
+
+	return NF_ACCEPT;
+
+out_invalid:
+	if (LOG_INVALID(net, IPPROTO_DCCP))
+		nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
+	return -NF_ACCEPT;
+}
+
+/* Print the source and destination DCCP ports of @tuple, in host byte
+ * order, to @s.  Returns the result of seq_printf(). */
+static int dccp_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "sport=%hu dport=%hu ",
+			  ntohs(tuple->src.u.dccp.port),
+			  ntohs(tuple->dst.u.dccp.port));
+}
+
+/* Print the name of @ct's current DCCP state to @s.  Returns the result of
+ * seq_printf(). */
+static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
+}
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+/*
+ * Dump the DCCP protocol state of @ct into @skb as a nested
+ * CTA_PROTOINFO_DCCP attribute holding the state, the role of the original
+ * direction and the handshake sequence number.  The state is read under
+ * ct->lock.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+			  struct nf_conn *ct)
+{
+	struct nlattr *nest_parms;
+
+	spin_lock_bh(&ct->lock);
+	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);
+	if (!nest_parms)
+		goto nla_put_failure;
+	/* NOTE(review): the NLA_PUT_* macros presumably jump to
+	 * nla_put_failure when the attribute does not fit -- confirm against
+	 * their definition. */
+	NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state);
+	NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE,
+		   ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]);
+	NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
+		     cpu_to_be64(ct->proto.dccp.handshake_seq));
+	nla_nest_end(skb, nest_parms);
+	spin_unlock_bh(&ct->lock);
+	return 0;
+
+nla_put_failure:
+	spin_unlock_bh(&ct->lock);
+	return -1;
+}
+
+/* Netlink attribute types accepted inside CTA_PROTOINFO_DCCP; used by
+ * nlattr_to_dccp() for parsing and dccp_nlattr_size() for sizing. */
+static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
+	[CTA_PROTOINFO_DCCP_STATE]	= { .type = NLA_U8 },
+	[CTA_PROTOINFO_DCCP_ROLE]	= { .type = NLA_U8 },
+	[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
+};
+
+/*
+ * Load the DCCP protocol state of @ct from the CTA_PROTOINFO_DCCP attribute
+ * in @cda.
+ *
+ * Returns 0 when the attribute is absent or was applied, the parse error
+ * from nla_parse_nested(), or -EINVAL when the state or role attribute is
+ * missing, the role exceeds CT_DCCP_ROLE_MAX, or the state is
+ * CT_DCCP_IGNORE or above.
+ */
+static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
+{
+	struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
+	struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
+	int err;
+
+	if (!attr)
+		return 0;
+
+	err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr,
+			       dccp_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
+	    !tb[CTA_PROTOINFO_DCCP_ROLE] ||
+	    nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
+	    nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&ct->lock);
+	ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
+	/* the attribute carries the original direction's role; the reply
+	 * direction gets the opposite one */
+	if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
+		ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
+		ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
+	} else {
+		ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
+		ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
+	}
+	/* the handshake sequence number is optional */
+	if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) {
+		ct->proto.dccp.handshake_seq =
+		be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
+	}
+	spin_unlock_bh(&ct->lock);
+	return 0;
+}
+
+static int dccp_nlattr_size(void)
+{
+	return nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1)
+		+ nla_total_size(0);	/* the empty CTA_PROTOINFO_DCCP nest */
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+/* template: dccp_net_init() kmemdup()s it per netns and fills in .data by index */
+static struct ctl_table dccp_sysctl_table[] = {
+	{	/* [0]: .data -> dccp_timeout[CT_DCCP_REQUEST] */
+		.procname	= "nf_conntrack_dccp_timeout_request",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [1]: .data -> dccp_timeout[CT_DCCP_RESPOND] */
+		.procname	= "nf_conntrack_dccp_timeout_respond",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [2]: .data -> dccp_timeout[CT_DCCP_PARTOPEN] */
+		.procname	= "nf_conntrack_dccp_timeout_partopen",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [3]: .data -> dccp_timeout[CT_DCCP_OPEN] */
+		.procname	= "nf_conntrack_dccp_timeout_open",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [4]: .data -> dccp_timeout[CT_DCCP_CLOSEREQ] */
+		.procname	= "nf_conntrack_dccp_timeout_closereq",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [5]: .data -> dccp_timeout[CT_DCCP_CLOSING] */
+		.procname	= "nf_conntrack_dccp_timeout_closing",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [6]: .data -> dccp_timeout[CT_DCCP_TIMEWAIT] */
+		.procname	= "nf_conntrack_dccp_timeout_timewait",
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{	/* [7]: .data -> dccp_loose */
+		.procname	= "nf_conntrack_dccp_loose",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
+	.name			= "dccp",
+	.l3proto		= AF_INET,
+	.l4proto		= IPPROTO_DCCP,
+	.new			= dccp_new,
+	.packet			= dccp_packet,
+	.error			= dccp_error,
+	.pkt_to_tuple		= dccp_pkt_to_tuple,
+	.invert_tuple		= dccp_invert_tuple,
+	.print_tuple		= dccp_print_tuple,
+	.print_conntrack	= dccp_print_conntrack,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.nla_policy		= nf_ct_port_nla_policy,
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.to_nlattr		= dccp_to_nlattr,
+	.from_nlattr		= nlattr_to_dccp,
+	.nlattr_size		= dccp_nlattr_size,
+#endif
+};
+
+static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
+	.name			= "dccp",
+	.l3proto		= AF_INET6,
+	.l4proto		= IPPROTO_DCCP,
+	.new			= dccp_new,
+	.packet			= dccp_packet,
+	.error			= dccp_error,
+	.pkt_to_tuple		= dccp_pkt_to_tuple,
+	.invert_tuple		= dccp_invert_tuple,
+	.print_tuple		= dccp_print_tuple,
+	.print_conntrack	= dccp_print_conntrack,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.nla_policy		= nf_ct_port_nla_policy,
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.to_nlattr		= dccp_to_nlattr,
+	.from_nlattr		= nlattr_to_dccp,
+	.nlattr_size		= dccp_nlattr_size,
+#endif
+};
+
+static __net_init int dccp_net_init(struct net *net)
+{
+	struct dccp_net *pernet = dccp_pernet(net);
+
+	/* built-in defaults; the sysctl entries below point at these fields */
+	pernet->dccp_timeout[CT_DCCP_REQUEST]	= 2 * DCCP_MSL;
+	pernet->dccp_timeout[CT_DCCP_RESPOND]	= 4 * DCCP_MSL;
+	pernet->dccp_timeout[CT_DCCP_PARTOPEN]	= 4 * DCCP_MSL;
+	pernet->dccp_timeout[CT_DCCP_OPEN]	= 12 * 3600 * HZ;
+	pernet->dccp_timeout[CT_DCCP_CLOSEREQ]	= 64 * HZ;
+	pernet->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ;
+	pernet->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL;
+	pernet->dccp_loose = 1;
+
+#ifdef CONFIG_SYSCTL
+	pernet->sysctl_table = kmemdup(dccp_sysctl_table,
+				       sizeof(dccp_sysctl_table), GFP_KERNEL);
+	if (pernet->sysctl_table == NULL)
+		return -ENOMEM;
+
+	pernet->sysctl_table[0].data = &pernet->dccp_timeout[CT_DCCP_REQUEST];
+	pernet->sysctl_table[1].data = &pernet->dccp_timeout[CT_DCCP_RESPOND];
+	pernet->sysctl_table[2].data = &pernet->dccp_timeout[CT_DCCP_PARTOPEN];
+	pernet->sysctl_table[3].data = &pernet->dccp_timeout[CT_DCCP_OPEN];
+	pernet->sysctl_table[4].data = &pernet->dccp_timeout[CT_DCCP_CLOSEREQ];
+	pernet->sysctl_table[5].data = &pernet->dccp_timeout[CT_DCCP_CLOSING];
+	pernet->sysctl_table[6].data = &pernet->dccp_timeout[CT_DCCP_TIMEWAIT];
+	pernet->sysctl_table[7].data = &pernet->dccp_loose;
+
+	pernet->sysctl_header = register_net_sysctl_table(net,
+			nf_net_netfilter_sysctl_path, pernet->sysctl_table);
+	if (pernet->sysctl_header == NULL) {
+		kfree(pernet->sysctl_table);
+		return -ENOMEM;
+	}
+#endif
+
+	return 0;
+}
+
+static __net_exit void dccp_net_exit(struct net *net)
+{
+	struct dccp_net *pernet = dccp_pernet(net);
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(pernet->sysctl_header);
+	kfree(pernet->sysctl_table);	/* the copy made in dccp_net_init() */
+#endif
+}
+
+static struct pernet_operations dccp_net_ops = {
+	.id   = &dccp_net_id,
+	.size = sizeof(struct dccp_net),
+	.init = dccp_net_init,
+	.exit = dccp_net_exit,
+};
+
+static int __init nf_conntrack_proto_dccp_init(void)
+{
+	int ret;
+
+	/* Pernet ops go first; the labels below unwind in reverse order. */
+	ret = register_pernet_subsys(&dccp_net_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_conntrack_l4proto_register(&dccp_proto4);
+	if (ret < 0)
+		goto out_pernet;
+
+	ret = nf_conntrack_l4proto_register(&dccp_proto6);
+	if (ret < 0)
+		goto out_proto4;
+	return 0;
+
+out_proto4:
+	nf_conntrack_l4proto_unregister(&dccp_proto4);
+out_pernet:
+	unregister_pernet_subsys(&dccp_net_ops);
+	return ret;
+}
+
+static void __exit nf_conntrack_proto_dccp_fini(void)
+{	/* reverse of init: drop both trackers before the per-netns state */
+	nf_conntrack_l4proto_unregister(&dccp_proto6);
+	nf_conntrack_l4proto_unregister(&dccp_proto4);
+	unregister_pernet_subsys(&dccp_net_ops);
+}
+
+module_init(nf_conntrack_proto_dccp_init);
+module_exit(nf_conntrack_proto_dccp_fini);
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
new file mode 100644
index 00000000..e2091d0c
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -0,0 +1,105 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+
+static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; /* jiffies */
+
+static bool generic_pkt_to_tuple(const struct sk_buff *skb,
+				 unsigned int dataoff,
+				 struct nf_conntrack_tuple *tuple)
+{
+	/* The packet is not inspected: both protocol parts are zeroed. */
+	tuple->dst.u.all = 0;
+	tuple->src.u.all = 0;
+	return true;
+}
+
+static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+				 const struct nf_conntrack_tuple *orig)
+{
+	/* @orig is not consulted: the inverse is all-zero as well. */
+	tuple->dst.u.all = 0;
+	tuple->src.u.all = 0;
+	return true;
+}
+
+/* Print out the per-protocol part of the tuple: nothing here, returns 0. */
+static int generic_print_tuple(struct seq_file *s,
+			       const struct nf_conntrack_tuple *tuple)
+{
+	return 0;
+}
+
+/* Refreshes the conntrack with the generic timeout; always NF_ACCEPT. */
+static int packet(struct nf_conn *ct,
+		  const struct sk_buff *skb,
+		  unsigned int dataoff,
+		  enum ip_conntrack_info ctinfo,
+		  u_int8_t pf,
+		  unsigned int hooknum)
+{
+	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout);
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found; always returns true. */
+static bool new(struct nf_conn *ct, const struct sk_buff *skb,
+		unsigned int dataoff)
+{
+	return true;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *generic_sysctl_header;
+static struct ctl_table generic_sysctl_table[] = {
+	{	/* read/written through the jiffies-converting handler */
+		.procname	= "nf_conntrack_generic_timeout",
+		.data		= &nf_ct_generic_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table generic_compat_sysctl_table[] = {
+	{	/* same variable as above, under the ip_conntrack_* name */
+		.procname	= "ip_conntrack_generic_timeout",
+		.data		= &nf_ct_generic_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
+{
+	.name			= "unknown",
+	.l3proto		= PF_UNSPEC,
+	.l4proto		= 255,
+	.new			= new,
+	.packet			= packet,
+	.pkt_to_tuple		= generic_pkt_to_tuple,
+	.invert_tuple		= generic_invert_tuple,
+	.print_tuple		= generic_print_tuple,
+#ifdef CONFIG_SYSCTL
+	.ctl_table		= generic_sysctl_table,
+	.ctl_table_header	= &generic_sysctl_header,
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	.ctl_compat_table	= generic_compat_sysctl_table,
+#endif
+#endif
+};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
new file mode 100644
index 00000000..cf616e55
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -0,0 +1,346 @@
+/*
+ * ip_conntrack_proto_gre.c - Version 3.0
+ *
+ * Connection tracking protocol helper module for GRE.
+ *
+ * GRE is a generic encapsulation protocol that is generally not well
+ * suited to NAT, as it has no protocol-specific part such as port numbers.
+ *
+ * It has an optional key field, which may help us distinguish two
+ * connections between the same two hosts.
+ *
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
+ *
+ * PPTP is built on top of a modified version of GRE, and has a mandatory
+ * field called "CallID", which serves us for the same purpose as the key
+ * field in plain GRE.
+ *
+ * Documentation about PPTP can be found in RFC 2637
+ *
+ * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
+ *
+ * Development of this code funded by Astaro AG (http://www.astaro.com/)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/dst.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_proto_gre.h>
+#include <linux/netfilter/nf_conntrack_pptp.h>
+
+#define GRE_TIMEOUT		(30 * HZ)	/* gre_new() default: .timeout */
+#define GRE_STREAM_TIMEOUT	(180 * HZ)	/* gre_new() default: .stream_timeout */
+
+static int proto_gre_net_id __read_mostly;
+struct netns_proto_gre {
+	rwlock_t		keymap_lock;	/* taken around keymap_list accesses */
+	struct list_head	keymap_list;	/* of struct nf_ct_gre_keymap */
+};
+
+void nf_ct_gre_keymap_flush(struct net *net)
+{	/* Unlinks and frees every keymap entry of @net under the write lock. */
+	struct netns_proto_gre *gre_ns = net_generic(net, proto_gre_net_id);
+	struct list_head *head = &gre_ns->keymap_list, *pos, *next;
+
+	write_lock_bh(&gre_ns->keymap_lock);
+	list_for_each_safe(pos, next, head) {
+		list_del(pos);
+		kfree(list_entry(pos, struct nf_ct_gre_keymap, list));
+	}
+	write_unlock_bh(&gre_ns->keymap_lock);
+}
+EXPORT_SYMBOL(nf_ct_gre_keymap_flush);
+
+static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
+				const struct nf_conntrack_tuple *t)
+{	/* non-zero iff all fields below match; src.u is not compared */
+	return km->tuple.dst.protonum == t->dst.protonum &&
+	       km->tuple.src.l3num == t->src.l3num &&
+	       km->tuple.dst.u.all == t->dst.u.all &&
+	       memcmp(&km->tuple.src.u3, &t->src.u3, sizeof(t->src.u3)) == 0 &&
+	       memcmp(&km->tuple.dst.u3, &t->dst.u3, sizeof(t->dst.u3)) == 0;
+}
+
+/* return the source key of the first keymap entry matching @t, else 0 */
+static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
+{
+	struct netns_proto_gre *gre_ns = net_generic(net, proto_gre_net_id);
+	struct nf_ct_gre_keymap *entry;
+	__be16 srckey = 0;
+
+	read_lock_bh(&gre_ns->keymap_lock);
+	list_for_each_entry(entry, &gre_ns->keymap_list, list) {
+		if (!gre_key_cmpfn(entry, t))
+			continue;
+		srckey = entry->tuple.src.u.gre.key;
+		break;
+	}
+	read_unlock_bh(&gre_ns->keymap_lock);
+
+	pr_debug("lookup src key 0x%x for ", srckey);
+	nf_ct_dump_tuple(t);
+
+	return srckey;
+}
+
+/* add a single keymap entry for @dir, associate with specified master ct */
+int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
+			 struct nf_conntrack_tuple *t)
+{
+	struct net *net = nf_ct_net(ct);
+	struct netns_proto_gre *gre_ns = net_generic(net, proto_gre_net_id);
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_gre_keymap **slot, *km;
+	bool retrans = false;	/* slot already holds this very tuple */
+
+	slot = &help->help.ct_pptp_info.keymap[dir];
+	if (*slot) {
+		read_lock_bh(&gre_ns->keymap_lock);
+		list_for_each_entry(km, &gre_ns->keymap_list, list) {
+			retrans = km == *slot && gre_key_cmpfn(km, t);
+			if (retrans)
+				break;
+		}
+		read_unlock_bh(&gre_ns->keymap_lock);
+		if (retrans)
+			return 0;
+		pr_debug("trying to override keymap_%s for ct %p\n",
+			 dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
+		return -EEXIST;
+	}
+
+	km = kmalloc(sizeof(*km), GFP_ATOMIC);
+	if (!km)
+		return -ENOMEM;
+	memcpy(&km->tuple, t, sizeof(*t));
+	*slot = km;
+
+	pr_debug("adding new entry %p: ", km);
+	nf_ct_dump_tuple(&km->tuple);
+
+	write_lock_bh(&gre_ns->keymap_lock);
+	list_add_tail(&km->list, &gre_ns->keymap_list);
+	write_unlock_bh(&gre_ns->keymap_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
+
+/* unlink and free the keymap entry of every direction of master @ct */
+void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
+{
+	struct net *net = nf_ct_net(ct);
+	struct netns_proto_gre *gre_ns = net_generic(net, proto_gre_net_id);
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_ct_gre_keymap **slot;
+	enum ip_conntrack_dir dir;
+
+	pr_debug("entering for ct %p\n", ct);
+	write_lock_bh(&gre_ns->keymap_lock);
+	for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
+		slot = &help->help.ct_pptp_info.keymap[dir];
+		if (*slot == NULL)
+			continue;
+		pr_debug("removing %p from list\n", *slot);
+		list_del(&(*slot)->list);
+		kfree(*slot);
+		*slot = NULL;
+	}
+	write_unlock_bh(&gre_ns->keymap_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
+
+/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
+
+/* invert gre part of tuple: orig's source key becomes dst and vice versa */
+static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
+			     const struct nf_conntrack_tuple *orig)
+{
+	tuple->dst.u.gre.key = orig->src.u.gre.key;
+	tuple->src.u.gre.key = orig->dst.u.gre.key;
+	return true;
+}
+
+/* fill the gre part of the tuple from the packet's gre header */
+static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			     struct nf_conntrack_tuple *tuple)
+{
+	struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev);
+	struct gre_hdr_pptp pptp_buf;
+	struct gre_hdr gre_buf;
+	const struct gre_hdr_pptp *pptp;
+	const struct gre_hdr *gre;
+
+	/* Start with the plain GRE header to learn the version. */
+	gre = skb_header_pointer(skb, dataoff, sizeof(gre_buf), &gre_buf);
+	if (gre == NULL || gre->version != GRE_VERSION_PPTP) {
+		/* same zeroed tuple as "nf_conntrack_proto_generic" */
+		tuple->src.u.all = 0;
+		tuple->dst.u.all = 0;
+		return true;
+	}
+
+	/* The PPTP header is variable length; only the first 8 bytes,
+	 * up to the call_id field, are needed. */
+	pptp = skb_header_pointer(skb, dataoff, 8, &pptp_buf);
+	if (pptp == NULL)
+		return true;
+
+	if (ntohs(gre->protocol) != GRE_PROTOCOL_PPTP) {
+		pr_debug("GRE_VERSION_PPTP but unknown proto\n");
+		return false;
+	}
+
+	/* dst key comes from the packet, src key from the keymap */
+	tuple->dst.u.gre.key = pptp->call_id;
+	tuple->src.u.gre.key = gre_keymap_lookup(net, tuple);
+
+	return true;
+}
+
+/* print gre part of tuple */
+static int gre_print_tuple(struct seq_file *s,
+			   const struct nf_conntrack_tuple *tuple)
+{
+	u16 sk = ntohs(tuple->src.u.gre.key), dk = ntohs(tuple->dst.u.gre.key);
+
+	return seq_printf(s, "srckey=0x%x dstkey=0x%x ", sk, dk);
+}
+
+/* print private data for conntrack: both timeouts, divided by HZ */
+static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	return seq_printf(s, "timeout=%u, stream_timeout=%u ",
+			  (ct->proto.gre.timeout / HZ),
+			  (ct->proto.gre.stream_timeout / HZ));
+}
+
+/* Refreshes the conntrack timeout, may set ASSURED; always NF_ACCEPT */
+static int gre_packet(struct nf_conn *ct,
+		      const struct sk_buff *skb,
+		      unsigned int dataoff,
+		      enum ip_conntrack_info ctinfo,
+		      u_int8_t pf,
+		      unsigned int hooknum)
+{
+	if (!(ct->status & IPS_SEEN_REPLY)) {
+		/* No reply seen yet: keep using the plain timeout. */
+		nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout);
+		return NF_ACCEPT;
+	}
+
+	/* Traffic seen both ways: this is a GRE connection, extend timeout. */
+	nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout);
+	/* Also, more likely to be important, and not a probe. */
+	set_bit(IPS_ASSURED_BIT, &ct->status);
+	nf_conntrack_event_cache(IPCT_ASSURED, ct);
+
+	return NF_ACCEPT;
+}
+
+/* Called for a new GRE connection: dumps its tuple, sets both timeouts. */
+static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
+		    unsigned int dataoff)
+{
+	pr_debug(": ");
+	nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+
+	/* Start from the defaults.  Ideally a conntrack helper
+	 * (e.g. in case of pptp) raises them later. */
+	ct->proto.gre.timeout = GRE_TIMEOUT;
+	ct->proto.gre.stream_timeout = GRE_STREAM_TIMEOUT;
+
+	return true;
+}
+
+/* Called when a conntrack entry has already been removed from the hashes
+ * and is about to be freed: drops the keymap entries of its master. */
+static void gre_destroy(struct nf_conn *ct)
+{
+	struct nf_conn *master = ct->master;
+	pr_debug(" entering\n");
+
+	if (master)
+		nf_ct_gre_keymap_destroy(master);
+	else
+		pr_debug("no master !?!\n");
+}
+
+/* GRE tracker registered for AF_INET */
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
+	.name		 = "gre",
+	.me		 = THIS_MODULE,
+	.l3proto	 = AF_INET,
+	.l4proto	 = IPPROTO_GRE,
+	.new		 = gre_new,
+	.packet		 = gre_packet,
+	.destroy	 = gre_destroy,
+	.pkt_to_tuple	 = gre_pkt_to_tuple,
+	.invert_tuple	 = gre_invert_tuple,
+	.print_tuple	 = gre_print_tuple,
+	.print_conntrack = gre_print_conntrack,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.nla_policy	 = nf_ct_port_nla_policy,
+	.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+#endif
+};
+
+static int proto_gre_net_init(struct net *net)
+{
+	struct netns_proto_gre *gre_ns = net_generic(net, proto_gre_net_id);
+
+	/* Each namespace starts with an empty keymap list and its own lock. */
+	INIT_LIST_HEAD(&gre_ns->keymap_list);
+	rwlock_init(&gre_ns->keymap_lock);
+	return 0;
+}
+
+static void proto_gre_net_exit(struct net *net)
+{	/* frees whatever keymap entries are still listed for @net */
+	nf_ct_gre_keymap_flush(net);
+}
+
+static struct pernet_operations proto_gre_net_ops = {
+	.id   = &proto_gre_net_id,
+	.size = sizeof(struct netns_proto_gre),
+	.init = proto_gre_net_init,
+	.exit = proto_gre_net_exit,
+};
+
+static int __init nf_ct_proto_gre_init(void)
+{	/* per-netns keymap first: gre_pkt_to_tuple() looks it up per packet */
+	int rv;
+
+	rv = register_pernet_subsys(&proto_gre_net_ops);
+	if (rv < 0)
+		return rv;
+	rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4);
+	if (rv < 0)
+		unregister_pernet_subsys(&proto_gre_net_ops);
+	return rv;
+}
+
+static void __exit nf_ct_proto_gre_fini(void)
+{	/* the tracker goes first, then the per-netns keymap state */
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4);
+	unregister_pernet_subsys(&proto_gre_net_ops);
+}
+
+module_init(nf_ct_proto_gre_init);
+module_exit(nf_ct_proto_gre_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
new file mode 100644
index 00000000..6772b115
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -0,0 +1,750 @@
+/*
+ * Connection tracking protocol helper module for SCTP.
+ *
+ * SCTP is defined in RFC 2960. References to various sections in this code
+ * are to this RFC.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+   closely.  They're more complex. --RR
+
+   And so for me for SCTP :D -Kiran */
+
+static const char *const sctp_conntrack_names[] = {	/* indexed by state */
+	"NONE",
+	"CLOSED",
+	"COOKIE_WAIT",
+	"COOKIE_ECHOED",
+	"ESTABLISHED",
+	"SHUTDOWN_SENT",
+	"SHUTDOWN_RECD",
+	"SHUTDOWN_ACK_SENT",
+};
+
+#define SECS  * HZ	/* postfix units: "3 SECS" expands to "3 * HZ" */
+#define MINS  * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS  * 24 HOURS
+
+static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = {
+	[SCTP_CONNTRACK_CLOSED]			= 10 SECS,
+	[SCTP_CONNTRACK_COOKIE_WAIT]		= 3 SECS,
+	[SCTP_CONNTRACK_COOKIE_ECHOED]		= 3 SECS,
+	[SCTP_CONNTRACK_ESTABLISHED]		= 5 DAYS,
+	[SCTP_CONNTRACK_SHUTDOWN_SENT]		= 300 SECS / 1000,	/* 300 ms */
+	[SCTP_CONNTRACK_SHUTDOWN_RECD]		= 300 SECS / 1000,	/* 300 ms */
+	[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]	= 3 SECS,
+};
+
+#define sNO SCTP_CONNTRACK_NONE	/* short names for the transition table */
+#define	sCL SCTP_CONNTRACK_CLOSED
+#define	sCW SCTP_CONNTRACK_COOKIE_WAIT
+#define	sCE SCTP_CONNTRACK_COOKIE_ECHOED
+#define	sES SCTP_CONNTRACK_ESTABLISHED
+#define	sSS SCTP_CONNTRACK_SHUTDOWN_SENT
+#define	sSR SCTP_CONNTRACK_SHUTDOWN_RECD
+#define	sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define	sIV SCTP_CONNTRACK_MAX	/* marks a transition treated as invalid */
+
+/*
+	These are the descriptions of the states:
+
+NOTE: These state names are tantalizingly similar to the states of an
+SCTP endpoint. But the interpretation of the states is a little different,
+considering that these are the states of the connection and not of an end
+point. Please note the subtleties. -Kiran
+
+NONE              - Nothing so far.
+COOKIE WAIT       - We have seen an INIT chunk in the original direction, or also
+		    an INIT_ACK chunk in the reply direction.
+COOKIE ECHOED     - We have seen a COOKIE_ECHO chunk in the original direction.
+ESTABLISHED       - We have seen a COOKIE_ACK in the reply direction.
+SHUTDOWN_SENT     - We have seen a SHUTDOWN chunk in the original direction.
+SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply direction.
+SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
+		    to that of the SHUTDOWN chunk.
+CLOSED            - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
+		    the SHUTDOWN chunk. Connection is closed.
+*/
+
+/* TODO
+ - I have assumed that the first INIT is in the original direction.
+ This messes things when an INIT comes in the reply direction in CLOSED
+ state.
+ - Check the error type in the reply dir before transitioning from
+cookie echoed to closed.
+ - Sec 5.2.4 of RFC 2960
+ - Multi Homing support.
+*/
+
+/* SCTP conntrack state transitions: [direction][chunk row][cur state] */
+static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+	{
+/*	ORIGINAL	*/
+/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init         */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
+/* init_ack     */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/
+/* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
+/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+	},
+	{
+/*	REPLY	*/
+/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init         */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
+/* init_ack     */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort        */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
+/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */
+/* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+	}
+};
+
+static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			      struct nf_conntrack_tuple *tuple)
+{
+	struct sctphdr hdr_buf;
+	const struct sctphdr *sctph;
+
+	/* Only the first 8 bytes of the header are actually needed. */
+	sctph = skb_header_pointer(skb, dataoff, 8, &hdr_buf);
+	if (!sctph)
+		return false;
+
+	tuple->dst.u.sctp.port = sctph->dest;
+	tuple->src.u.sctp.port = sctph->source;
+	return true;
+}
+
+static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
+			      const struct nf_conntrack_tuple *orig)
+{	/* copies orig's ports into tuple with source and destination swapped */
+	tuple->src.u.sctp.port = orig->dst.u.sctp.port;
+	tuple->dst.u.sctp.port = orig->src.u.sctp.port;
+	return true;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int sctp_print_tuple(struct seq_file *s,
+			    const struct nf_conntrack_tuple *tuple)
+{
+	u16 sp = ntohs(tuple->src.u.sctp.port), dp = ntohs(tuple->dst.u.sctp.port);
+
+	return seq_printf(s, "sport=%hu dport=%hu ", sp, dp);
+}
+
+/* Print out the private part of the conntrack: the name of its state. */
+static int sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	enum sctp_conntrack cur;
+
+	/* Sample the state under ct->lock, print after dropping it. */
+	spin_lock_bh(&ct->lock);
+	cur = ct->proto.sctp.state;
+	spin_unlock_bh(&ct->lock);
+	return seq_printf(s, "%s ", sctp_conntrack_names[cur]);
+}
+
+#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)	\
+for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0;	\
+	(offset) < (skb)->len &&					\
+	((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch)));	\
+	(offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* step: length rounded up to 4 */
+
+/* Chunk layout sanity checks: returns 0 when fine, non-zero otherwise */
+static int do_basic_checks(struct nf_conn *ct,
+			   const struct sk_buff *skb,
+			   unsigned int dataoff,
+			   unsigned long *map)
+{
+	sctp_chunkhdr_t _sch, *sch;
+	u_int32_t offset, count;
+	bool lone_seen = false, first_only;
+
+	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+		pr_debug("Chunk Num: %d  Type: %d\n", count, sch->type);
+
+		switch (sch->type) {
+		case SCTP_CID_INIT:
+		case SCTP_CID_INIT_ACK:
+		case SCTP_CID_SHUTDOWN_COMPLETE:
+			lone_seen = true;
+			break;
+		}
+
+		/*
+		 * Fail on a zero-length chunk, on a Cookie Ack/Echo chunk
+		 * that is not the first, and when an Init / Init Ack /
+		 * Shutdown compl chunk is not the only chunk.
+		 */
+		first_only = sch->type == SCTP_CID_COOKIE_ACK ||
+			     sch->type == SCTP_CID_COOKIE_ECHO || lone_seen;
+		if ((first_only && count != 0) || !sch->length) {
+			pr_debug("Basic checks failed\n");
+			return 1;
+		}
+
+		if (map)
+			set_bit(sch->type, map);
+	}
+
+	pr_debug("Basic checks passed\n");
+	return count == 0;
+}
+
+static int sctp_new_state(enum ip_conntrack_dir dir,
+			  enum sctp_conntrack cur_state,
+			  int chunk_type)
+{
+	int i;	/* chunk-type row of sctp_conntracks[dir][] */
+
+	pr_debug("Chunk type: %d\n", chunk_type);
+
+	switch (chunk_type) {
+	case SCTP_CID_INIT:
+		pr_debug("SCTP_CID_INIT\n");
+		i = 0;
+		break;
+	case SCTP_CID_INIT_ACK:
+		pr_debug("SCTP_CID_INIT_ACK\n");
+		i = 1;
+		break;
+	case SCTP_CID_ABORT:
+		pr_debug("SCTP_CID_ABORT\n");
+		i = 2;
+		break;
+	case SCTP_CID_SHUTDOWN:
+		pr_debug("SCTP_CID_SHUTDOWN\n");
+		i = 3;
+		break;
+	case SCTP_CID_SHUTDOWN_ACK:
+		pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
+		i = 4;
+		break;
+	case SCTP_CID_ERROR:
+		pr_debug("SCTP_CID_ERROR\n");
+		i = 5;
+		break;
+	case SCTP_CID_COOKIE_ECHO:
+		pr_debug("SCTP_CID_COOKIE_ECHO\n");
+		i = 6;
+		break;
+	case SCTP_CID_COOKIE_ACK:
+		pr_debug("SCTP_CID_COOKIE_ACK\n");
+		i = 7;
+		break;
+	case SCTP_CID_SHUTDOWN_COMPLETE:
+		pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
+		i = 8;
+		break;
+	default:
+		/* Other chunks like DATA, SACK, HEARTBEAT and
+		its ACK do not cause a change in state */
+		pr_debug("Unknown chunk type, Will stay in %s\n",
+			 sctp_conntrack_names[cur_state]);
+		return cur_state;
+	}
+
+	pr_debug("dir: %d   cur_state: %s  chunk_type: %d  new_state: %s\n",
+		 dir, sctp_conntrack_names[cur_state], chunk_type,
+		 sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+
+	return sctp_conntracks[dir][i][cur_state];	/* may be sIV */
+}
+
+/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
+static int sctp_packet(struct nf_conn *ct,
+		       const struct sk_buff *skb,
+		       unsigned int dataoff,
+		       enum ip_conntrack_info ctinfo,
+		       u_int8_t pf,
+		       unsigned int hooknum)
+{
+	enum sctp_conntrack new_state, old_state;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	const struct sctphdr *sh;
+	struct sctphdr _sctph;
+	const struct sctp_chunkhdr *sch;
+	struct sctp_chunkhdr _sch;
+	u_int32_t offset, count;
+	unsigned long map[256 / sizeof(unsigned long)] = { 0 }; /* chunk types seen */
+
+	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+	if (sh == NULL)
+		goto out;
+
+	if (do_basic_checks(ct, skb, dataoff, map) != 0)
+		goto out;
+
+	/* Check the verification tag (Sec 8.5) */
+	if (!test_bit(SCTP_CID_INIT, map) &&
+	    !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
+	    !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
+	    !test_bit(SCTP_CID_ABORT, map) &&
+	    !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
+	    sh->vtag != ct->proto.sctp.vtag[dir]) {
+		pr_debug("Verification tag check failed\n");
+		goto out;
+	}
+
+	old_state = new_state = SCTP_CONNTRACK_NONE;
+	spin_lock_bh(&ct->lock);	/* held across the whole chunk walk */
+	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+		/* Special cases of Verification tag check (Sec 8.5.1) */
+		if (sch->type == SCTP_CID_INIT) {
+			/* Sec 8.5.1 (A) */
+			if (sh->vtag != 0)
+				goto out_unlock;
+		} else if (sch->type == SCTP_CID_ABORT) {
+			/* Sec 8.5.1 (B) */
+			if (sh->vtag != ct->proto.sctp.vtag[dir] &&
+			    sh->vtag != ct->proto.sctp.vtag[!dir])
+				goto out_unlock;
+		} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+			/* Sec 8.5.1 (C) */
+			if (sh->vtag != ct->proto.sctp.vtag[dir] &&
+			    sh->vtag != ct->proto.sctp.vtag[!dir] &&
+			    sch->flags & SCTP_CHUNK_FLAG_T)
+				goto out_unlock;
+		} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
+			/* Sec 8.5.1 (D) */
+			if (sh->vtag != ct->proto.sctp.vtag[dir])
+				goto out_unlock;
+		}
+
+		old_state = ct->proto.sctp.state;
+		new_state = sctp_new_state(dir, old_state, sch->type);
+
+		/* Invalid */
+		if (new_state == SCTP_CONNTRACK_MAX) {
+			pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
+				 "conntrack=%u\n",
+				 dir, sch->type, old_state);
+			goto out_unlock;
+		}
+
+		/* If it is an INIT or an INIT ACK note down the vtag */
+		if (sch->type == SCTP_CID_INIT ||
+		    sch->type == SCTP_CID_INIT_ACK) {
+			sctp_inithdr_t _inithdr, *ih;
+
+			ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+						sizeof(_inithdr), &_inithdr);
+			if (ih == NULL)
+				goto out_unlock;
+			pr_debug("Setting vtag %x for dir %d\n",
+				 ih->init_tag, !dir);
+			ct->proto.sctp.vtag[!dir] = ih->init_tag;
+		}
+
+		ct->proto.sctp.state = new_state;
+		if (old_state != new_state)
+			nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+	}
+	spin_unlock_bh(&ct->lock);
+
+	/* the timeout follows the state left by the last chunk */
+	nf_ct_refresh_acct(ct, ctinfo, skb, sctp_timeouts[new_state]);
+	if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
+	    dir == IP_CT_DIR_REPLY &&
+	    new_state == SCTP_CONNTRACK_ESTABLISHED) {
+		pr_debug("Setting assured bit\n");
+		set_bit(IPS_ASSURED_BIT, &ct->status);
+		nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	}
+
+	return NF_ACCEPT;
+
+out_unlock:
+	spin_unlock_bh(&ct->lock);
+out:
+	return -NF_ACCEPT;
+}
+
+/* Called for the first packet of a flow that has no conntrack entry yet. */
+static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
+		     unsigned int dataoff)
+{
+	unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+	struct sctp_chunkhdr _sch;
+	const struct sctp_chunkhdr *sch;
+	struct sctphdr _sctph;
+	const struct sctphdr *sh;
+	enum sctp_conntrack new_state = SCTP_CONNTRACK_MAX;
+	u_int32_t offset, count;
+
+	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+	if (!sh)
+		return false;
+
+	if (do_basic_checks(ct, skb, dataoff, map) != 0)
+		return false;
+
+	/* Sec 8.4: an OOTB packet carrying any of these chunks is discarded */
+	if (test_bit(SCTP_CID_ABORT, map))
+		return false;
+	if (test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map))
+		return false;
+	if (test_bit(SCTP_CID_COOKIE_ACK, map))
+		return false;
+
+	/* No lock needed below: this conntrack is not in circulation yet */
+	memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
+	for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+		sctp_inithdr_t _inithdr, *ih;
+
+		new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
+					   SCTP_CONNTRACK_NONE, sch->type);
+
+		/* Invalid: delete conntrack */
+		switch (new_state) {
+		case SCTP_CONNTRACK_NONE:
+		case SCTP_CONNTRACK_MAX:
+			pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
+			return false;
+		default:
+			break;
+		}
+
+		if (sch->type == SCTP_CID_INIT) {
+			/* Sec 8.5.1 (A): INIT with a non-zero vtag is refused */
+			if (sh->vtag != 0)
+				return false;
+
+			/* Copy the initiate tag into the state info */
+			ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+						sizeof(_inithdr), &_inithdr);
+			if (!ih)
+				return false;
+
+			pr_debug("Setting vtag %x for new conn\n",
+				 ih->init_tag);
+			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
+		} else {
+			/* If it is a shutdown ack OOTB packet, we expect a return
+			   shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+			pr_debug("Setting vtag %x for new conn OOTB\n",
+				 sh->vtag);
+			ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+		}
+
+		ct->proto.sctp.state = new_state;
+	}
+
+	return true;
+}
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+/* Dump SCTP state and both vtags as a nested CTA_PROTOINFO_SCTP attribute. */
+static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+			  struct nf_conn *ct)
+{
+	struct nlattr *nest;
+
+	/* hold ct->lock while state and vtags are copied out */
+	spin_lock_bh(&ct->lock);
+	nest = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	/* the NLA_PUT_* macros leave through nla_put_failure on error */
+	NLA_PUT_U8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state);
+
+	NLA_PUT_BE32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+		     ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]);
+	NLA_PUT_BE32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
+		     ct->proto.sctp.vtag[IP_CT_DIR_REPLY]);
+
+	spin_unlock_bh(&ct->lock);
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+nla_put_failure:
+	spin_unlock_bh(&ct->lock);
+	return -1;
+}
+
+static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = {
+	[CTA_PROTOINFO_SCTP_STATE]	    = { .type = NLA_U8 },	/* read with nla_get_u8() */
+	[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]  = { .type = NLA_U32 },	/* read with nla_get_be32() */
+	[CTA_PROTOINFO_SCTP_VTAG_REPLY]     = { .type = NLA_U32 },	/* read with nla_get_be32() */
+};
+
+/* Restore SCTP state and vtags from a CTA_PROTOINFO_SCTP nest; -EINVAL on bad input. */
+static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
+{
+	struct nlattr *attr = cda[CTA_PROTOINFO_SCTP];
+	struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1];
+	int err;
+
+	/* updates may not contain the internal protocol info, skip parsing */
+	if (!attr)
+		return 0;
+
+	err = nla_parse_nested(tb, CTA_PROTOINFO_SCTP_MAX, attr,
+			       sctp_nla_policy);
+	if (err < 0)
+		return err;
+
+	/* The state indexes per-state tables (e.g. sctp_timeouts[]): bound it. */
+	if (!tb[CTA_PROTOINFO_SCTP_STATE] ||
+	    !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] ||
+	    !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY] ||
+	    nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]) >= SCTP_CONNTRACK_MAX)
+		return -EINVAL;
+
+	spin_lock_bh(&ct->lock);
+	ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]);
+	ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] =
+		nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]);
+	ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+		nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]);
+	spin_unlock_bh(&ct->lock);
+	return 0;
+}
+
+static int sctp_nlattr_size(void)
+{
+	return nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1)
+		+ nla_total_size(0);	/* the CTA_PROTOINFO_SCTP nest itself */
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+static unsigned int sctp_sysctl_table_users;	/* shared by the sctp4 and sctp6 l4proto structs */
+static struct ctl_table_header *sctp_sysctl_header;
+static struct ctl_table sctp_sysctl_table[] = {	/* each entry exposes one sctp_timeouts[] slot */
+	{
+		.procname	= "nf_conntrack_sctp_timeout_closed",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_CLOSED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_sctp_timeout_cookie_wait",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_sctp_timeout_cookie_echoed",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_sctp_timeout_established",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_sctp_timeout_shutdown_sent",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_sctp_timeout_shutdown_recd",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_sctp_timeout_shutdown_ack_sent",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table sctp_compat_sysctl_table[] = {	/* same sctp_timeouts[] slots, ip_conntrack_* names */
+	{
+		.procname	= "ip_conntrack_sctp_timeout_closed",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_CLOSED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_sctp_timeout_cookie_wait",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_sctp_timeout_cookie_echoed",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_sctp_timeout_established",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_sctp_timeout_shutdown_sent",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_sctp_timeout_shutdown_recd",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_sctp_timeout_shutdown_ack_sent",
+		.data		= &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
+	.l3proto		= PF_INET,	/* IPv4 instance of the SCTP tracker */
+	.l4proto 		= IPPROTO_SCTP,
+	.name 			= "sctp",
+	.pkt_to_tuple 		= sctp_pkt_to_tuple,
+	.invert_tuple 		= sctp_invert_tuple,
+	.print_tuple 		= sctp_print_tuple,
+	.print_conntrack	= sctp_print_conntrack,
+	.packet 		= sctp_packet,
+	.new 			= sctp_new,
+	.me 			= THIS_MODULE,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.to_nlattr		= sctp_to_nlattr,
+	.nlattr_size		= sctp_nlattr_size,
+	.from_nlattr		= nlattr_to_sctp,
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &sctp_sysctl_table_users,	/* same counter as the sctp6 instance */
+	.ctl_table_header	= &sctp_sysctl_header,
+	.ctl_table		= sctp_sysctl_table,
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	.ctl_compat_table	= sctp_compat_sysctl_table,	/* only set on this IPv4 instance */
+#endif
+#endif
+};
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
+	.l3proto		= PF_INET6,	/* IPv6 instance; same callbacks as sctp4 */
+	.l4proto 		= IPPROTO_SCTP,
+	.name 			= "sctp",
+	.pkt_to_tuple 		= sctp_pkt_to_tuple,
+	.invert_tuple 		= sctp_invert_tuple,
+	.print_tuple 		= sctp_print_tuple,
+	.print_conntrack	= sctp_print_conntrack,
+	.packet 		= sctp_packet,
+	.new 			= sctp_new,
+	.me 			= THIS_MODULE,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.to_nlattr		= sctp_to_nlattr,
+	.nlattr_size		= sctp_nlattr_size,
+	.from_nlattr		= nlattr_to_sctp,
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &sctp_sysctl_table_users,	/* same counter as the sctp4 instance */
+	.ctl_table_header	= &sctp_sysctl_header,
+	.ctl_table		= sctp_sysctl_table,	/* no ctl_compat_table here, unlike sctp4 */
+#endif
+};
+
+/* Module init: register the SCTP tracker for IPv4, then for IPv6.  Returns 0
+ * or the error code of the registration that failed. */
+static int __init nf_conntrack_proto_sctp_init(void)
+{
+	int err;
+
+	/* nothing to roll back if the first registration fails */
+	err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4);
+	if (err) {
+		pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n");
+		return err;
+	}
+
+	err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6);
+	if (err) {
+		pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n");
+		/* IPv6 failed: undo the IPv4 registration made above */
+		nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+	}
+
+	return err;
+}
+
+static void __exit nf_conntrack_proto_sctp_fini(void)
+{
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6);	/* reverse of the init order */
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4);
+}
+
+module_init(nf_conntrack_proto_sctp_init);
+module_exit(nf_conntrack_proto_sctp_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
+MODULE_ALIAS("ip_conntrack_proto_sctp");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
new file mode 100644
index 00000000..37bf9439
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -0,0 +1,1497 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <asm/unaligned.h>
+
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+
+/* "Be conservative in what you do,
+    be liberal in what you accept from others."
+    If it's non-zero, we mark only out of window RST segments as INVALID. */
+static int nf_ct_tcp_be_liberal __read_mostly = 0;	/* checked in tcp_in_window() */
+
+/* If it is set to zero, we disable picking up already established
+   connections. */
+static int nf_ct_tcp_loose __read_mostly = 1;
+
+/* Maximum number of retransmitted packets seen without an (acceptable)
+   ACK from the destination. Once this number is reached, a shorter
+   timer is started. */
+static int nf_ct_tcp_max_retrans __read_mostly = 3;
+
+  /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+     closely.  They're more complex. --RR */
+
+static const char *const tcp_conntrack_names[] = {	/* indexed by state, see tcp_print_conntrack() */
+	"NONE",
+	"SYN_SENT",
+	"SYN_RECV",
+	"ESTABLISHED",
+	"FIN_WAIT",
+	"CLOSE_WAIT",
+	"LAST_ACK",
+	"TIME_WAIT",
+	"CLOSE",
+	"SYN_SENT2",
+};
+
+#define SECS * HZ	/* postfix unit macros: "2 MINS" expands to "2 * 60 * HZ" */
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+/* RFC1122 says the R2 limit should be at least 100 seconds.
+   Linux uses 15 packets as limit, which corresponds
+   to ~13-30min depending on RTO. */
+static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly    =   5 MINS;
+static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly =   5 MINS;
+
+static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = {	/* per state, in multiples of HZ */
+	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
+	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
+	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,
+	[TCP_CONNTRACK_FIN_WAIT]	= 2 MINS,
+	[TCP_CONNTRACK_CLOSE_WAIT]	= 60 SECS,
+	[TCP_CONNTRACK_LAST_ACK]	= 30 SECS,
+	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,
+	[TCP_CONNTRACK_CLOSE]		= 10 SECS,
+	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS,
+};
+
+#define sNO TCP_CONNTRACK_NONE	/* short state names used by the tcp_conntracks table */
+#define sSS TCP_CONNTRACK_SYN_SENT
+#define sSR TCP_CONNTRACK_SYN_RECV
+#define sES TCP_CONNTRACK_ESTABLISHED
+#define sFW TCP_CONNTRACK_FIN_WAIT
+#define sCW TCP_CONNTRACK_CLOSE_WAIT
+#define sLA TCP_CONNTRACK_LAST_ACK
+#define sTW TCP_CONNTRACK_TIME_WAIT
+#define sCL TCP_CONNTRACK_CLOSE
+#define sS2 TCP_CONNTRACK_SYN_SENT2
+#define sIV TCP_CONNTRACK_MAX	/* marks a packet as INVALID */
+#define sIG TCP_CONNTRACK_IGNORE	/* marks a packet as IGNORED */
+
+/* What TCP flags are set from RST/SYN/FIN/ACK; see get_conntrack_index(). */
+enum tcp_bit_set {
+	TCP_SYN_SET,
+	TCP_SYNACK_SET,
+	TCP_FIN_SET,
+	TCP_ACK_SET,
+	TCP_RST_SET,
+	TCP_NONE_SET,
+};
+
+/*
+ * The TCP state transition table needs a few words...
+ *
+ * We are the man in the middle. All the packets go through us
+ * but might get lost in transit to the destination.
+ * It is assumed that the destinations can't receive segments
+ * we haven't seen.
+ *
+ * The checked segment is in window, but our windows are *not*
+ * equivalent with the ones of the sender/receiver. We always
+ * try to guess the state of the current sender.
+ *
+ * The meaning of the states are:
+ *
+ * NONE:	initial state
+ * SYN_SENT:	SYN-only packet seen
+ * SYN_SENT2:	SYN-only packet seen from reply dir, simultaneous open
+ * SYN_RECV:	SYN-ACK packet seen
+ * ESTABLISHED:	ACK packet seen
+ * FIN_WAIT:	FIN packet seen
+ * CLOSE_WAIT:	ACK seen (after FIN)
+ * LAST_ACK:	FIN seen (after FIN)
+ * TIME_WAIT:	last ACK seen
+ * CLOSE:	closed connection (RST)
+ *
+ * Packets marked as IGNORED (sIG):
+ *	if they may be either invalid or valid
+ *	and the receiver may send back a connection
+ *	closing RST or a SYN/ACK.
+ *
+ * Packets marked as INVALID (sIV):
+ *	if we regard them as truly invalid packets
+ */
+static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {	/* [direction][tcp_bit_set row][current state] */
+	{
+/* ORIGINAL */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*syn*/	   { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
+/*
+ *	sNO -> sSS	Initialize a new connection
+ *	sSS -> sSS	Retransmitted SYN
+ *	sS2 -> sS2	Late retransmitted SYN
+ *	sSR -> sIG
+ *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
+ *			are errors. Receiver will reply with RST
+ *			and close the connection.
+ *			Or we are not in sync and hold a dead connection.
+ *	sFW -> sIG
+ *	sCW -> sIG
+ *	sLA -> sIG
+ *	sTW -> sSS	Reopened connection (RFC 1122).
+ *	sCL -> sSS
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*
+ *	sNO -> sIV	Too late and no reason to do anything
+ *	sSS -> sIV	Client can't send SYN and then SYN/ACK
+ *	sS2 -> sSR	SYN/ACK sent to SYN2 in simultaneous open
+ *	sSR -> sIG
+ *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
+ *			are errors. Receiver will reply with RST
+ *			and close the connection.
+ *			Or we are not in sync and hold a dead connection.
+ *	sFW -> sIG
+ *	sCW -> sIG
+ *	sLA -> sIG
+ *	sTW -> sIG
+ *	sCL -> sIG
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ *	sNO -> sIV	Too late and no reason to do anything...
+ *	sSS -> sIV	Client might not send FIN in this state:
+ *			we enforce waiting for a SYN/ACK reply first.
+ *	sS2 -> sIV
+ *	sSR -> sFW	Close started.
+ *	sES -> sFW
+ *	sFW -> sLA	FIN seen in both directions, waiting for
+ *			the last ACK.
+ *			Might be a retransmitted FIN as well...
+ *	sCW -> sLA
+ *	sLA -> sLA	Retransmitted FIN. Remain in the same state.
+ *	sTW -> sTW
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*ack*/	   { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ *	sNO -> sES	Assumed.
+ *	sSS -> sIV	ACK is invalid: we haven't seen a SYN/ACK yet.
+ *	sS2 -> sIV
+ *	sSR -> sES	Established state is reached.
+ *	sES -> sES	:-)
+ *	sFW -> sCW	Normal close request answered by ACK.
+ *	sCW -> sCW
+ *	sLA -> sTW	Last ACK detected.
+ *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
+/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+	},
+	{
+/* REPLY */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*syn*/	   { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 },
+/*
+ *	sNO -> sIV	Never reached.
+ *	sSS -> sS2	Simultaneous open
+ *	sS2 -> sS2	Retransmitted simultaneous SYN
+ *	sSR -> sIV	Invalid SYN packets sent by the server
+ *	sES -> sIV
+ *	sFW -> sIV
+ *	sCW -> sIV
+ *	sLA -> sIV
+ *	sTW -> sIV	Reopened connection, but server may not do it.
+ *	sCL -> sIV
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
+/*
+ *	sSS -> sSR	Standard open.
+ *	sS2 -> sSR	Simultaneous open
+ *	sSR -> sIG	Retransmitted SYN/ACK, ignore it.
+ *	sES -> sIG	Late retransmitted SYN/ACK?
+ *	sFW -> sIG	Might be SYN/ACK answering ignored SYN
+ *	sCW -> sIG
+ *	sLA -> sIG
+ *	sTW -> sIG
+ *	sCL -> sIG
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ *	sSS -> sIV	Server might not send FIN in this state.
+ *	sS2 -> sIV
+ *	sSR -> sFW	Close started.
+ *	sES -> sFW
+ *	sFW -> sLA	FIN seen in both directions.
+ *	sCW -> sLA
+ *	sLA -> sLA	Retransmitted FIN.
+ *	sTW -> sTW
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*ack*/	   { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
+/*
+ *	sSS -> sIG	Might be a half-open connection.
+ *	sS2 -> sIG
+ *	sSR -> sSR	Might answer late resent SYN.
+ *	sES -> sES	:-)
+ *	sFW -> sCW	Normal close request answered by ACK.
+ *	sCW -> sCW
+ *	sLA -> sTW	Last ACK detected.
+ *	sTW -> sTW	Retransmitted last ACK.
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2	*/
+/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
+/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+	}
+};
+
+static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+			     struct nf_conntrack_tuple *tuple)
+{
+	struct tcphdr hdr_buf;
+	const struct tcphdr *th;
+
+	/* Both ports sit in the first 8 bytes, so only those are fetched. */
+	th = skb_header_pointer(skb, dataoff, 8, &hdr_buf);
+	if (!th)
+		return false;
+
+	tuple->src.u.tcp.port = th->source;
+	tuple->dst.u.tcp.port = th->dest;
+
+	return true;
+}
+
+static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
+			     const struct nf_conntrack_tuple *orig)
+{
+	tuple->src.u.tcp.port = orig->dst.u.tcp.port;	/* inverted source is the original destination */
+	tuple->dst.u.tcp.port = orig->src.u.tcp.port;	/* ...and the other way round */
+	return true;	/* this inversion cannot fail */
+}
+
+/* Print the per-protocol part of the tuple: both ports, converted with ntohs(). */
+static int tcp_print_tuple(struct seq_file *s,
+			   const struct nf_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "sport=%hu dport=%hu ",
+			  ntohs(tuple->src.u.tcp.port),
+			  ntohs(tuple->dst.u.tcp.port));
+}
+
+/* Print the name of the tracked TCP state; the state is sampled under ct->lock. */
+static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
+{
+	enum tcp_conntrack snapshot;
+
+	spin_lock_bh(&ct->lock);
+	snapshot = ct->proto.tcp.state;
+	spin_unlock_bh(&ct->lock);
+
+	return seq_printf(s, "%s ", tcp_conntrack_names[snapshot]);
+}
+
+static unsigned int get_conntrack_index(const struct tcphdr *tcph)
+{
+	/* Priority: RST, then SYN (SYN+ACK is its own class), FIN, plain ACK. */
+	return tcph->rst ? TCP_RST_SET :
+	       tcph->syn ? (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET) :
+	       tcph->fin ? TCP_FIN_SET :
+	       tcph->ack ? TCP_ACK_SET : TCP_NONE_SET;
+}
+
+/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
+   in IP Filter' by Guido van Rooij.
+
+   http://www.sane.nl/events/sane2000/papers.html
+   http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
+
+   The boundaries and the conditions are changed according to RFC793:
+   the packet must intersect the window (i.e. segments may be
+   after the right or before the left edge) and thus receivers may ACK
+   segments after the right edge of the window.
+
+	td_maxend = max(sack + max(win,1)) seen in reply packets
+	td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
+	td_maxwin += seq + len - sender.td_maxend
+			if seq + len > sender.td_maxend
+	td_end    = max(seq + len) seen in sent packets
+
+   I.   Upper bound for valid data:	seq <= sender.td_maxend
+   II.  Lower bound for valid data:	seq + len >= sender.td_end - receiver.td_maxwin
+   III.	Upper bound for valid (s)ack:   sack <= receiver.td_end
+   IV.	Lower bound for valid (s)ack:	sack >= receiver.td_end - MAXACKWINDOW
+
+   where sack is the highest right edge of sack block found in the packet
+   or ack in the case of packet without SACK option.
+
+   The upper bound limit for a valid (s)ack is not ignored -
+   we don't have to deal with fragments.
+*/
+
+static inline __u32 segment_seq_plus_len(__u32 seq, size_t len,
+					 unsigned int dataoff,
+					 const struct tcphdr *tcph)
+{
+	/* XXX (YK): should the IP/IPv6 payload length field be used instead?
+	 * SYN and FIN each add one to the computed end. */
+	size_t payload = len - dataoff - tcph->doff*4;
+
+	return seq + payload + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0);
+}
+
+/* Floor for the ACK window used by rule IV. Fixme: what about big packets? */
+#define MAXACKWINCONST			66000
+#define MAXACKWINDOW(sender)						\
+	((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin	\
+					      : MAXACKWINCONST)
+
+/* Simplified tcp_parse_options routine from tcp_input.c: records the window
+ * scale and SACK-permitted options of a segment in @state.  Parsing stops at
+ * the first truncated option so nothing is read past the option area. */
+static void tcp_options(const struct sk_buff *skb,
+			unsigned int dataoff,
+			const struct tcphdr *tcph,
+			struct ip_ct_tcp_state *state)
+{
+	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+	const unsigned char *ptr;
+	int length = (tcph->doff*4) - sizeof(struct tcphdr);
+
+	if (!length)
+		return;
+
+	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
+				 length, buff);
+	BUG_ON(ptr == NULL);
+
+	state->td_scale = state->flags = 0;
+
+	while (length > 0) {
+		int opcode=*ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			if (length < 2)	/* no room left for a length byte */
+				return;
+			opsize=*ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				return;	/* don't parse partial options */
+
+			if (opcode == TCPOPT_SACK_PERM
+			    && opsize == TCPOLEN_SACK_PERM)
+				state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
+			else if (opcode == TCPOPT_WINDOW
+				 && opsize == TCPOLEN_WINDOW) {
+				state->td_scale = *(u_int8_t *)ptr;
+
+				if (state->td_scale > 14) {
+					/* See RFC1323 */
+					state->td_scale = 14;
+				}
+				state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE;
+			}
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+/* Raise *sack to the highest right edge found in the segment's SACK blocks. */
+static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
+                     const struct tcphdr *tcph, __u32 *sack)
+{
+	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+	const unsigned char *ptr;
+	int length = (tcph->doff*4) - sizeof(struct tcphdr);
+	__u32 tmp;
+
+	if (!length)
+		return;
+
+	ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
+				 length, buff);
+	BUG_ON(ptr == NULL);
+
+	/* Fast path for timestamp-only option */
+	if (length == TCPOLEN_TSTAMP_ALIGNED*4
+	    && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				       | (TCPOPT_TIMESTAMP << 8)
+				       | TCPOLEN_TIMESTAMP))
+		return;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize, i;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			if (length < 2)	/* no room left for a length byte */
+				return;
+			opsize = *ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				return;	/* don't parse partial options */
+
+			if (opcode == TCPOPT_SACK
+			    && opsize >= (TCPOLEN_SACK_BASE
+					  + TCPOLEN_SACK_PERBLOCK)
+			    && !((opsize - TCPOLEN_SACK_BASE)
+				 % TCPOLEN_SACK_PERBLOCK)) {
+				for (i = 0; i < (opsize - TCPOLEN_SACK_BASE);
+				     i += TCPOLEN_SACK_PERBLOCK) {
+					tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
+					if (after(tmp, *sack))
+						*sack = tmp;
+				}
+				return;
+			}
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Forward to the nf_ct_nat_offset hook when one is set; otherwise return 0. */
+static inline s16 nat_offset(const struct nf_conn *ct,
+			     enum ip_conntrack_dir dir, u32 seq)
+{
+	typeof(nf_ct_nat_offset) hook = rcu_dereference(nf_ct_nat_offset);
+
+	return hook ? hook(ct, dir, seq) : 0;
+}
+#define NAT_OFFSET(pf, ct, dir, seq) \
+	(pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0)
+#else
+#define NAT_OFFSET(pf, ct, dir, seq)	0
+#endif
+
+static bool tcp_in_window(const struct nf_conn *ct,
+			  struct ip_ct_tcp *state,
+			  enum ip_conntrack_dir dir,
+			  unsigned int index,
+			  const struct sk_buff *skb,
+			  unsigned int dataoff,
+			  const struct tcphdr *tcph,
+			  u_int8_t pf)
+{
+	struct net *net = nf_ct_net(ct);
+	struct ip_ct_tcp_state *sender = &state->seen[dir];
+	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
+	__u32 seq, ack, sack, end, win, swin;
+	s16 receiver_offset;
+	bool res;
+
+	/* Window check for one segment: returns true when it satisfies
+	 * rules I-IV (or liberal mode accepts it anyway).  Start by
+	 * getting the required data from the packet. */
+	seq = ntohl(tcph->seq);
+	ack = sack = ntohl(tcph->ack_seq);
+	win = ntohs(tcph->window);
+	end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
+
+	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
+		tcp_sack(skb, dataoff, tcph, &sack);
+
+	/* Take into account NAT sequence number mangling */
+	receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1);
+	ack -= receiver_offset;
+	sack -= receiver_offset;
+
+	pr_debug("tcp_in_window: START\n");
+	pr_debug("tcp_in_window: ");
+	nf_ct_dump_tuple(tuple);
+	pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+		 seq, ack, receiver_offset, sack, receiver_offset, win, end);
+	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
+
+	if (sender->td_maxwin == 0) {
+		/*
+		 * Initialize sender data.
+		 */
+		if (tcph->syn) {
+			/*
+			 * SYN-ACK in reply to a SYN
+			 * or SYN from reply direction in simultaneous open.
+			 */
+			sender->td_end =
+			sender->td_maxend = end;
+			sender->td_maxwin = (win == 0 ? 1 : win);
+
+			tcp_options(skb, dataoff, tcph, sender);
+			/*
+			 * RFC 1323:
+			 * Both sides must send the Window Scale option
+			 * to enable window scaling in either direction.
+			 */
+			if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
+			      && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
+				sender->td_scale =
+				receiver->td_scale = 0;
+			if (!tcph->ack)
+				/* Simultaneous open */
+				return true;
+		} else {
+			/*
+			 * We are in the middle of a connection,
+			 * its history is lost for us.
+			 * Let's try to use the data from the packet.
+			 */
+			sender->td_end = end;
+			win <<= sender->td_scale;
+			sender->td_maxwin = (win == 0 ? 1 : win);
+			sender->td_maxend = end + sender->td_maxwin;
+			/*
+			 * We haven't seen traffic in the other direction yet
+			 * but we have to tweak window tracking to pass III
+			 * and IV until that happens.
+			 */
+			if (receiver->td_maxwin == 0)
+				receiver->td_end = receiver->td_maxend = sack;
+		}
+	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
+		     && dir == IP_CT_DIR_ORIGINAL)
+		   || (state->state == TCP_CONNTRACK_SYN_RECV
+		     && dir == IP_CT_DIR_REPLY))
+		   && after(end, sender->td_end)) {
+		/*
+		 * RFC 793: "if a TCP is reinitialized ... then it need
+		 * not wait at all; it must only be sure to use sequence
+		 * numbers larger than those recently used."
+		 */
+		sender->td_end =
+		sender->td_maxend = end;
+		sender->td_maxwin = (win == 0 ? 1 : win);
+
+		tcp_options(skb, dataoff, tcph, sender);
+	}
+
+	if (!(tcph->ack)) {
+		/*
+		 * If there is no ACK, just pretend it was set and OK.
+		 */
+		ack = sack = receiver->td_end;
+	} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
+		    (TCP_FLAG_ACK|TCP_FLAG_RST))
+		   && (ack == 0)) {
+		/*
+		 * Broken TCP stacks, that set ACK in RST packets as well
+		 * with zero ack value.
+		 */
+		ack = sack = receiver->td_end;
+	}
+
+	if (seq == end
+	    && (!tcph->rst
+		|| (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+		/*
+		 * Packet contains no data: we assume it is valid
+		 * and check the ack value only.
+		 * However RST segments are always validated by their
+		 * SEQ number, except when seq == 0 (reset sent answering
+		 * SYN).
+		 */
+		seq = end = sender->td_end;
+
+	pr_debug("tcp_in_window: ");
+	nf_ct_dump_tuple(tuple);
+	pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+		 seq, ack, receiver_offset, sack, receiver_offset, win, end);
+	pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
+
+	pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+		 before(seq, sender->td_maxend + 1),
+		 after(end, sender->td_end - receiver->td_maxwin - 1),
+		 before(sack, receiver->td_end + 1),
+		 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+
+	if (before(seq, sender->td_maxend + 1) &&
+	    after(end, sender->td_end - receiver->td_maxwin - 1) &&
+	    before(sack, receiver->td_end + 1) &&
+	    after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
+		/*
+		 * Take into account window scaling (RFC 1323).
+		 */
+		if (!tcph->syn)
+			win <<= sender->td_scale;
+
+		/*
+		 * Update sender data.
+		 */
+		swin = win + (sack - ack);
+		if (sender->td_maxwin < swin)
+			sender->td_maxwin = swin;
+		if (after(end, sender->td_end)) {
+			sender->td_end = end;
+			sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+		}
+		if (tcph->ack) {
+			if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+				sender->td_maxack = ack;
+				sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+			} else if (after(ack, sender->td_maxack))
+				sender->td_maxack = ack;
+		}
+
+		/*
+		 * Update receiver data.
+		 */
+		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
+			receiver->td_maxwin += end - sender->td_maxend;
+		if (after(sack + win, receiver->td_maxend - 1)) {
+			receiver->td_maxend = sack + win;
+			if (win == 0)
+				receiver->td_maxend++;
+		}
+		if (ack == receiver->td_end)
+			receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+		/*
+		 * Check retransmissions.
+		 */
+		if (index == TCP_ACK_SET) {
+			if (state->last_dir == dir
+			    && state->last_seq == seq
+			    && state->last_ack == ack
+			    && state->last_end == end
+			    && state->last_win == win)
+				state->retrans++;
+			else {
+				state->last_dir = dir;
+				state->last_seq = seq;
+				state->last_ack = ack;
+				state->last_end = end;
+				state->last_win = win;
+				state->retrans = 0;
+			}
+		}
+		res = true;
+	} else {
+		res = false;
+		if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
+		    nf_ct_tcp_be_liberal)
+			res = true;
+		if (!res && LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+			"nf_ct_tcp: %s ",
+			before(seq, sender->td_maxend + 1) ?
+			after(end, sender->td_end - receiver->td_maxwin - 1) ?
+			before(sack, receiver->td_end + 1) ?
+			after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
+			: "ACK is under the lower bound (possible overly delayed ACK)"
+			: "ACK is over the upper bound (ACKed data not seen yet)"
+			: "SEQ is under the lower bound (already ACKed data retransmitted)"
+			: "SEQ is over the upper bound (over the window of the receiver)");
+	}
+
+	pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
+		 "receiver end=%u maxend=%u maxwin=%u\n",
+		 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+
+	return res;
+}
+
+/* Valid flag combinations (1 = valid), indexed by the flag byte without PSH, ECE and CWR, which are always valid */
+static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
+				 TCPHDR_URG) + 1] =
+{
+	[TCPHDR_SYN]				= 1,
+	[TCPHDR_SYN|TCPHDR_URG]			= 1,
+	[TCPHDR_SYN|TCPHDR_ACK]			= 1,
+	[TCPHDR_RST]				= 1,
+	[TCPHDR_RST|TCPHDR_ACK]			= 1,
+	[TCPHDR_FIN|TCPHDR_ACK]			= 1,
+	[TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]	= 1,
+	[TCPHDR_ACK]				= 1,
+	[TCPHDR_ACK|TCPHDR_URG]			= 1,
+};
+
+/* Protect conntrack against broken packets (code taken from ipt_unclean.c): NF_ACCEPT if sane, else -NF_ACCEPT. */
+static int tcp_error(struct net *net, struct nf_conn *tmpl,
+		     struct sk_buff *skb,
+		     unsigned int dataoff,
+		     enum ip_conntrack_info *ctinfo,
+		     u_int8_t pf,
+		     unsigned int hooknum)
+{
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	unsigned int tcplen = skb->len - dataoff;
+	u_int8_t tcpflags;
+
+	/* Smaller than the minimal TCP header? */
+	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		if (LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				"nf_ct_tcp: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	/* Data offset below the minimal header, or header longer than the packet */
+	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
+		if (LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				"nf_ct_tcp: truncated/malformed packet ");
+		return -NF_ACCEPT;
+	}
+
+	/* Checksum invalid? Ignore.
+	 * Only verified at PRE_ROUTING and when sysctl_checksum is set;
+	 * on the outgoing path the checksum is assumed to be correct.
+	 */
+	/* FIXME: Source route IP option packets --RR */
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
+		if (LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				  "nf_ct_tcp: bad TCP checksum ");
+		return -NF_ACCEPT;
+	}
+
+	/* Check TCP flags; PSH, ECE and CWR are masked off before the lookup. */
+	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
+	if (!tcp_valid_flags[tcpflags]) {
+		if (LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				  "nf_ct_tcp: invalid TCP flag combination ");
+		return -NF_ACCEPT;
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Returns verdict for packet (NF_ACCEPT/NF_DROP), -NF_ACCEPT for invalid, or -NF_REPEAT to redo the lookup. */
+static int tcp_packet(struct nf_conn *ct,
+		      const struct sk_buff *skb,
+		      unsigned int dataoff,
+		      enum ip_conntrack_info ctinfo,
+		      u_int8_t pf,
+		      unsigned int hooknum)
+{
+	struct net *net = nf_ct_net(ct);
+	struct nf_conntrack_tuple *tuple;
+	enum tcp_conntrack new_state, old_state;
+	enum ip_conntrack_dir dir;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	unsigned long timeout;
+	unsigned int index;
+
+	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+	BUG_ON(th == NULL);
+
+	spin_lock_bh(&ct->lock);
+	old_state = ct->proto.tcp.state;
+	dir = CTINFO2DIR(ctinfo);
+	index = get_conntrack_index(th);
+	new_state = tcp_conntracks[dir][index][old_state];
+	tuple = &ct->tuplehash[dir].tuple;
+
+	switch (new_state) {
+	case TCP_CONNTRACK_SYN_SENT:
+		if (old_state < TCP_CONNTRACK_TIME_WAIT)
+			break;
+		/* RFC 1122: "When a connection is closed actively,
+		 * it MUST linger in TIME-WAIT state for a time 2xMSL
+		 * (Maximum Segment Lifetime). However, it MAY accept
+		 * a new SYN from the remote TCP to reopen the connection
+		 * directly from TIME-WAIT state, if..."
+		 * We ignore the conditions because we are in the
+		 * TIME-WAIT state anyway.
+		 *
+		 * Handle aborted connections: we and the server
+		 * think there is an existing connection but the client
+		 * aborts it and starts a new one.
+		 */
+		if (((ct->proto.tcp.seen[dir].flags
+		      | ct->proto.tcp.seen[!dir].flags)
+		     & IP_CT_TCP_FLAG_CLOSE_INIT)
+		    || (ct->proto.tcp.last_dir == dir
+		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
+			/* Attempt to reopen a closed/aborted connection.
+			 * Delete this connection and look up again. */
+			spin_unlock_bh(&ct->lock);
+
+			/* Only repeat if we can actually remove the timer.
+			 * Destruction may already be in progress in process
+			 * context and we must give it a chance to terminate.
+			 */
+			if (nf_ct_kill(ct))
+				return -NF_REPEAT;
+			return NF_DROP;
+		}
+		/* Fall through */
+	case TCP_CONNTRACK_IGNORE:
+		/* Ignored packets:
+		 *
+		 * Our connection entry may be out of sync, so ignore
+		 * packets which may signal the real connection between
+		 * the client and the server.
+		 *
+		 * a) SYN in ORIGINAL
+		 * b) SYN/ACK in REPLY
+		 * c) ACK in reply direction after initial SYN in original.
+		 *
+		 * If the ignored packet is invalid, the receiver will send
+		 * a RST we'll catch below.
+		 */
+		if (index == TCP_SYNACK_SET
+		    && ct->proto.tcp.last_index == TCP_SYN_SET
+		    && ct->proto.tcp.last_dir != dir
+		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
+			/* b) This SYN/ACK acknowledges a SYN that we earlier
+			 * ignored as invalid. This means that the client and
+			 * the server are both in sync, while the firewall is
+			 * not. We get in sync from the previously annotated
+			 * values.
+			 */
+			old_state = TCP_CONNTRACK_SYN_SENT;
+			new_state = TCP_CONNTRACK_SYN_RECV;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+				ct->proto.tcp.last_end;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+				ct->proto.tcp.last_end;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+				ct->proto.tcp.last_win == 0 ?
+					1 : ct->proto.tcp.last_win;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+				ct->proto.tcp.last_wscale;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+				ct->proto.tcp.last_flags;
+			memset(&ct->proto.tcp.seen[dir], 0,
+			       sizeof(struct ip_ct_tcp_state));
+			break;
+		}
+		ct->proto.tcp.last_index = index;
+		ct->proto.tcp.last_dir = dir;
+		ct->proto.tcp.last_seq = ntohl(th->seq);
+		ct->proto.tcp.last_end =
+		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
+		ct->proto.tcp.last_win = ntohs(th->window);
+
+		/* a) This is a SYN in ORIGINAL. The client and the server
+		 * may be in sync but we are not. In that case, we annotate
+		 * the TCP options and let the packet go through. If it is a
+		 * valid SYN packet, the server will reply with a SYN/ACK, and
+		 * then we'll get in sync. Otherwise, the server ignores it. */
+		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+			struct ip_ct_tcp_state seen = {};
+
+			ct->proto.tcp.last_flags =
+			ct->proto.tcp.last_wscale = 0;
+			tcp_options(skb, dataoff, th, &seen);
+			if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+				ct->proto.tcp.last_flags |=
+					IP_CT_TCP_FLAG_WINDOW_SCALE;
+				ct->proto.tcp.last_wscale = seen.td_scale;
+			}
+			if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+				ct->proto.tcp.last_flags |=
+					IP_CT_TCP_FLAG_SACK_PERM;
+			}
+		}
+		spin_unlock_bh(&ct->lock);
+		if (LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				  "nf_ct_tcp: invalid packet ignored ");
+		return NF_ACCEPT;
+	case TCP_CONNTRACK_MAX:
+		/* Invalid packet: the state table has no transition for it */
+		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+			 dir, get_conntrack_index(th), old_state);
+		spin_unlock_bh(&ct->lock);
+		if (LOG_INVALID(net, IPPROTO_TCP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				  "nf_ct_tcp: invalid state ");
+		return -NF_ACCEPT;
+	case TCP_CONNTRACK_CLOSE:
+		if (index == TCP_RST_SET
+		    && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+		    && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+			/* Invalid RST: SEQ is before the other side's td_maxack */
+			spin_unlock_bh(&ct->lock);
+			if (LOG_INVALID(net, IPPROTO_TCP))
+				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+					  "nf_ct_tcp: invalid RST ");
+			return -NF_ACCEPT;
+		}
+		if (index == TCP_RST_SET
+		    && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
+			 && ct->proto.tcp.last_index == TCP_SYN_SET)
+			|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
+			    && ct->proto.tcp.last_index == TCP_ACK_SET))
+		    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
+			/* RST sent to invalid SYN or ACK we had let through
+			 * at a) and c) above:
+			 *
+			 * a) SYN was in window then
+			 * c) we hold a half-open connection.
+			 *
+			 * Delete our connection entry.
+			 * We skip window checking, because packet might ACK
+			 * segments we ignored. */
+			goto in_window;
+		}
+		/* Just fall through */
+	default:
+		/* Keep compilers happy. */
+		break;
+	}
+
+	if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
+			   skb, dataoff, th, pf)) {
+		spin_unlock_bh(&ct->lock);
+		return -NF_ACCEPT;
+	}
+     in_window:
+	/* From now on we have got in-window packets */
+	ct->proto.tcp.last_index = index;
+	ct->proto.tcp.last_dir = dir;
+
+	pr_debug("tcp_conntracks: ");
+	nf_ct_dump_tuple(tuple);
+	pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+		 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
+		 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
+		 old_state, new_state);
+
+	ct->proto.tcp.state = new_state;
+	if (old_state != new_state
+	    && new_state == TCP_CONNTRACK_FIN_WAIT)
+		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+	if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans &&
+	    tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans)
+		timeout = nf_ct_tcp_timeout_max_retrans;
+	else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
+		 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
+		 tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged)
+		timeout = nf_ct_tcp_timeout_unacknowledged;
+	else
+		timeout = tcp_timeouts[new_state];
+	spin_unlock_bh(&ct->lock);
+
+	if (new_state != old_state)
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		/* If only reply is a RST, we can consider ourselves not to
+		   have an established connection: this is a fairly common
+		   problem case, so we can delete the conntrack
+		   immediately.  --RR */
+		if (th->rst) {
+			nf_ct_kill_acct(ct, ctinfo, skb);
+			return NF_ACCEPT;
+		}
+	} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
+		   && (old_state == TCP_CONNTRACK_SYN_RECV
+		       || old_state == TCP_CONNTRACK_ESTABLISHED)
+		   && new_state == TCP_CONNTRACK_ESTABLISHED) {
+		/* Set ASSURED if we see a valid ack in ESTABLISHED
+		   after SYN_RECV or a valid answer for a picked up
+		   connection. */
+		set_bit(IPS_ASSURED_BIT, &ct->status);
+		nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	}
+	nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found; returns false to reject it. */
+static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
+		    unsigned int dataoff)
+{
+	enum tcp_conntrack new_state;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
+	const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
+
+	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+	BUG_ON(th == NULL);
+
+	/* Don't need lock here: this conntrack is not in circulation yet */
+	new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
+
+	/* Invalid: delete conntrack */
+	if (new_state >= TCP_CONNTRACK_MAX) {
+		pr_debug("nf_ct_tcp: invalid new deleting.\n");
+		return false;
+	}
+
+	if (new_state == TCP_CONNTRACK_SYN_SENT) {
+		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+		/* SYN packet: seed the originator's end/maxwin/maxend from it */
+		ct->proto.tcp.seen[0].td_end =
+			segment_seq_plus_len(ntohl(th->seq), skb->len,
+					     dataoff, th);
+		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+		if (ct->proto.tcp.seen[0].td_maxwin == 0)
+			ct->proto.tcp.seen[0].td_maxwin = 1;
+		ct->proto.tcp.seen[0].td_maxend =
+			ct->proto.tcp.seen[0].td_end;
+
+		tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
+	} else if (nf_ct_tcp_loose == 0) {
+		/* nf_ct_tcp_loose is off: don't try to pick up connections. */
+		return false;
+	} else {
+		memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+		/*
+		 * We are in the middle of a connection,
+		 * its history is lost for us.
+		 * Let's try to use the data from the packet.
+		 */
+		ct->proto.tcp.seen[0].td_end =
+			segment_seq_plus_len(ntohl(th->seq), skb->len,
+					     dataoff, th);
+		ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+		if (ct->proto.tcp.seen[0].td_maxwin == 0)
+			ct->proto.tcp.seen[0].td_maxwin = 1;
+		ct->proto.tcp.seen[0].td_maxend =
+			ct->proto.tcp.seen[0].td_end +
+			ct->proto.tcp.seen[0].td_maxwin;
+
+		/* We assume SACK and liberal window checking in both
+		 * directions to handle window scaling */
+		ct->proto.tcp.seen[0].flags =
+		ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
+					      IP_CT_TCP_FLAG_BE_LIBERAL;
+	}
+
+	/* tcp_packet will set them */
+	ct->proto.tcp.last_index = TCP_NONE_SET;
+
+	pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		 sender->td_end, sender->td_maxend, sender->td_maxwin,
+		 sender->td_scale,
+		 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		 receiver->td_scale);
+	return true;
+}
+
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
+			 struct nf_conn *ct)
+{
+	struct nlattr *nest_parms;
+	struct nf_ct_tcp_flags tmp = {};
+
+	spin_lock_bh(&ct->lock);	/* ct->proto.tcp is read under the lock */
+	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
+	if (!nest_parms)
+		goto nla_put_failure;
+
+	NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state);
+
+	NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+		   ct->proto.tcp.seen[0].td_scale);
+
+	NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
+		   ct->proto.tcp.seen[1].td_scale);
+
+	tmp.flags = ct->proto.tcp.seen[0].flags;	/* tmp.mask stays zero */
+	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
+		sizeof(struct nf_ct_tcp_flags), &tmp);
+
+	tmp.flags = ct->proto.tcp.seen[1].flags;
+	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
+		sizeof(struct nf_ct_tcp_flags), &tmp);
+	spin_unlock_bh(&ct->lock);
+
+	nla_nest_end(skb, nest_parms);
+
+	return 0;
+
+nla_put_failure:	/* also the failure target of the NLA_PUT*() macros */
+	spin_unlock_bh(&ct->lock);
+	return -1;
+}
+
+static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {	/* nested CTA_PROTOINFO_TCP attributes: checked in nlattr_to_tcp(), sized in tcp_nlattr_size() */
+	[CTA_PROTOINFO_TCP_STATE]	    = { .type = NLA_U8 },
+	[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
+	[CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
+	[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
+	[CTA_PROTOINFO_TCP_FLAGS_REPLY]	    = { .len =  sizeof(struct nf_ct_tcp_flags) },
+};
+
+static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
+{
+	struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
+	struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
+	int err;
+
+	/* updates may carry nothing about the private protocol
+	 * info; in that case skip the parsing and report success */
+	if (!pattr)
+		return 0;
+
+	err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[CTA_PROTOINFO_TCP_STATE] &&
+	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
+		return -EINVAL;	/* rejected before ct is touched */
+
+	spin_lock_bh(&ct->lock);
+	if (tb[CTA_PROTOINFO_TCP_STATE])
+		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
+
+	if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
+		struct nf_ct_tcp_flags *attr =
+			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
+		ct->proto.tcp.seen[0].flags &= ~attr->mask;	/* only bits selected by mask change */
+		ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
+	}
+
+	if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
+		struct nf_ct_tcp_flags *attr =
+			nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
+		ct->proto.tcp.seen[1].flags &= ~attr->mask;
+		ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
+	}
+
+	if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
+	    tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
+	    ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+	    ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+		ct->proto.tcp.seen[0].td_scale =
+			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
+		ct->proto.tcp.seen[1].td_scale =
+			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
+	}
+	spin_unlock_bh(&ct->lock);
+
+	return 0;
+}
+
+static int tcp_nlattr_size(void)
+{
+	return nla_total_size(0)	   /* empty CTA_PROTOINFO_TCP nest attribute */
+		+ nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
+}
+
+static int tcp_nlattr_tuple_size(void)
+{
+	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);	/* same policy as the l4proto .nla_policy */
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+static unsigned int tcp_sysctl_table_users;
+static struct ctl_table_header *tcp_sysctl_header;
+static struct ctl_table tcp_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_tcp_timeout_syn_sent",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_syn_recv",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_established",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_fin_wait",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_close_wait",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_last_ack",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_time_wait",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_close",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_max_retrans",
+		.data		= &nf_ct_tcp_timeout_max_retrans,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_timeout_unacknowledged",
+		.data		= &nf_ct_tcp_timeout_unacknowledged,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_loose",
+		.data		= &nf_ct_tcp_loose,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname       = "nf_conntrack_tcp_be_liberal",
+		.data           = &nf_ct_tcp_be_liberal,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "nf_conntrack_tcp_max_retrans",
+		.data		= &nf_ct_tcp_max_retrans,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table tcp_compat_sysctl_table[] = {
+	{
+		.procname	= "ip_conntrack_tcp_timeout_syn_sent",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_syn_sent2",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_syn_recv",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_established",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_fin_wait",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_close_wait",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_last_ack",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_time_wait",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_close",
+		.data		= &tcp_timeouts[TCP_CONNTRACK_CLOSE],
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_timeout_max_retrans",
+		.data		= &nf_ct_tcp_timeout_max_retrans,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_loose",
+		.data		= &nf_ct_tcp_loose,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_be_liberal",
+		.data		= &nf_ct_tcp_be_liberal,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "ip_conntrack_tcp_max_retrans",
+		.data		= &nf_ct_tcp_max_retrans,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
+{
+	.l3proto		= PF_INET,
+	.l4proto 		= IPPROTO_TCP,
+	.name 			= "tcp",
+	.pkt_to_tuple 		= tcp_pkt_to_tuple,
+	.invert_tuple 		= tcp_invert_tuple,
+	.print_tuple 		= tcp_print_tuple,
+	.print_conntrack 	= tcp_print_conntrack,
+	.packet 		= tcp_packet,
+	.new 			= tcp_new,
+	.error			= tcp_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.to_nlattr		= tcp_to_nlattr,
+	.nlattr_size		= tcp_nlattr_size,
+	.from_nlattr		= nlattr_to_tcp,
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &tcp_sysctl_table_users,
+	.ctl_table_header	= &tcp_sysctl_header,
+	.ctl_table		= tcp_sysctl_table,
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	.ctl_compat_table	= tcp_compat_sysctl_table,
+#endif
+#endif
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
+{
+	.l3proto		= PF_INET6,
+	.l4proto 		= IPPROTO_TCP,
+	.name 			= "tcp",
+	.pkt_to_tuple 		= tcp_pkt_to_tuple,
+	.invert_tuple 		= tcp_invert_tuple,
+	.print_tuple 		= tcp_print_tuple,
+	.print_conntrack 	= tcp_print_conntrack,
+	.packet 		= tcp_packet,
+	.new 			= tcp_new,
+	.error			= tcp_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.to_nlattr		= tcp_to_nlattr,
+	.nlattr_size		= tcp_nlattr_size,
+	.from_nlattr		= nlattr_to_tcp,
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= tcp_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &tcp_sysctl_table_users,
+	.ctl_table_header	= &tcp_sysctl_header,
+	.ctl_table		= tcp_sysctl_table,
+#endif
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
new file mode 100644
index 00000000..8289088b
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -0,0 +1,231 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+
+static unsigned int nf_ct_udp_timeout __read_mostly = 30*HZ;
+static unsigned int nf_ct_udp_timeout_stream __read_mostly = 180*HZ;
+
+static bool udp_pkt_to_tuple(const struct sk_buff *skb,
+			     unsigned int dataoff,
+			     struct nf_conntrack_tuple *tuple)
+{
+	const struct udphdr *hp;
+	struct udphdr _hdr;
+
+	/* Fetch the UDP header; only the source/dest ports are used. */
+	hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL)
+		return false;
+
+	tuple->src.u.udp.port = hp->source;
+	tuple->dst.u.udp.port = hp->dest;
+
+	return true;
+}
+
+static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
+			     const struct nf_conntrack_tuple *orig)
+{
+	tuple->src.u.udp.port = orig->dst.u.udp.port;	/* swap source and destination ports */
+	tuple->dst.u.udp.port = orig->src.u.udp.port;
+	return true;
+}
+
+/* Print the per-protocol part of the tuple: source and destination port in host byte order. */
+static int udp_print_tuple(struct seq_file *s,
+			   const struct nf_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "sport=%hu dport=%hu ",
+			  ntohs(tuple->src.u.udp.port),
+			  ntohs(tuple->dst.u.udp.port));
+}
+
+/* Always returns NF_ACCEPT; refreshes the conntrack timeout and may set IPS_ASSURED. */
+static int udp_packet(struct nf_conn *ct,
+		      const struct sk_buff *skb,
+		      unsigned int dataoff,
+		      enum ip_conntrack_info ctinfo,
+		      u_int8_t pf,
+		      unsigned int hooknum)
+{
+	/* If we've seen traffic both ways, this is some kind of UDP
+	   stream.  Extend timeout. */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream);
+		/* Also, more likely to be important, and not a probe */
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+			nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	} else
+		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout);
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found; UDP accepts it unconditionally. */
+static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
+		    unsigned int dataoff)
+{
+	return true;
+}
+
+static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
+		     unsigned int dataoff, enum ip_conntrack_info *ctinfo,
+		     u_int8_t pf,
+		     unsigned int hooknum)
+{
+	unsigned int udplen = skb->len - dataoff;
+	const struct udphdr *hdr;
+	struct udphdr _hdr;
+
+	/* Packet too short to hold a UDP header? */
+	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (hdr == NULL) {
+		if (LOG_INVALID(net, IPPROTO_UDP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udp: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	/* Length field larger than the packet or smaller than the header */
+	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+		if (LOG_INVALID(net, IPPROTO_UDP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				"nf_ct_udp: truncated/malformed packet ");
+		return -NF_ACCEPT;
+	}
+
+	/* Zero checksum field: packet carries no checksum, accept as is */
+	if (!hdr->check)
+		return NF_ACCEPT;
+
+	/* Checksum invalid? Ignore.
+	 * Only verified at PRE_ROUTING and when sysctl_checksum is set;
+	 * on the outgoing path the checksum is assumed to be correct.
+	 * FIXME: Source route IP option packets --RR */
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
+		if (LOG_INVALID(net, IPPROTO_UDP))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				"nf_ct_udp: bad UDP checksum ");
+		return -NF_ACCEPT;
+	}
+
+	return NF_ACCEPT;
+}
+
+#ifdef CONFIG_SYSCTL
+static unsigned int udp_sysctl_table_users;
+static struct ctl_table_header *udp_sysctl_header;
+static struct ctl_table udp_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_udp_timeout",
+		.data		= &nf_ct_udp_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_udp_timeout_stream",
+		.data		= &nf_ct_udp_timeout_stream,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+static struct ctl_table udp_compat_sysctl_table[] = {
+	{
+		.procname	= "ip_conntrack_udp_timeout",
+		.data		= &nf_ct_udp_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "ip_conntrack_udp_timeout_stream",
+		.data		= &nf_ct_udp_timeout_stream,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }
+};
+#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
+#endif /* CONFIG_SYSCTL */
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
+{
+	.l3proto		= PF_INET,
+	.l4proto		= IPPROTO_UDP,
+	.name			= "udp",
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.packet			= udp_packet,
+	.new			= udp_new,
+	.error			= udp_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &udp_sysctl_table_users,
+	.ctl_table_header	= &udp_sysctl_header,
+	.ctl_table		= udp_sysctl_table,
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	.ctl_compat_table	= udp_compat_sysctl_table,
+#endif
+#endif
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
+
+struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
+{
+	.l3proto		= PF_INET6,
+	.l4proto		= IPPROTO_UDP,
+	.name			= "udp",
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.packet			= udp_packet,
+	.new			= udp_new,
+	.error			= udp_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &udp_sysctl_table_users,
+	.ctl_table_header	= &udp_sysctl_header,
+	.ctl_table		= udp_sysctl_table,
+#endif
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
new file mode 100644
index 00000000..263b5a72
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -0,0 +1,240 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_log.h>
+
+static unsigned int nf_ct_udplite_timeout __read_mostly = 30*HZ;
+static unsigned int nf_ct_udplite_timeout_stream __read_mostly = 180*HZ;
+
+static bool udplite_pkt_to_tuple(const struct sk_buff *skb,
+				 unsigned int dataoff,
+				 struct nf_conntrack_tuple *tuple)
+{
+	struct udphdr hdr_buf;
+	const struct udphdr *uh;
+
+	/* Fill in both ports, or fail if the header cannot be read. */
+	uh = skb_header_pointer(skb, dataoff, sizeof(hdr_buf), &hdr_buf);
+	if (!uh)
+		return false;
+	tuple->dst.u.udp.port = uh->dest;
+	tuple->src.u.udp.port = uh->source;
+	return true;
+}
+
+static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple,
+				 const struct nf_conntrack_tuple *orig)
+{
+	tuple->src.u.udp.port = orig->dst.u.udp.port;	/* new src = old dst */
+	tuple->dst.u.udp.port = orig->src.u.udp.port;	/* new dst = old src */
+	return true;
+}
+
+/* Print the per-protocol part of the tuple: the two ports via ntohs(). */
+static int udplite_print_tuple(struct seq_file *s,
+			       const struct nf_conntrack_tuple *tuple)
+{
+	const u16 sport = ntohs(tuple->src.u.udp.port);
+	const u16 dport = ntohs(tuple->dst.u.udp.port);
+	return seq_printf(s, "sport=%hu dport=%hu ", sport, dport);
+}
+
+/* Refresh the conntrack timeout for this packet; the verdict is NF_ACCEPT. */
+static int udplite_packet(struct nf_conn *ct,
+			  const struct sk_buff *skb,
+			  unsigned int dataoff,
+			  enum ip_conntrack_info ctinfo,
+			  u_int8_t pf,
+			  unsigned int hooknum)
+{
+	/* No reply seen yet: keep using the short timeout. */
+	if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+		nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout);
+		return NF_ACCEPT;
+	}
+
+	/* Traffic both ways is some kind of UDP stream: extend the timeout,
+	 * and cache IPCT_ASSURED the first time the assured bit gets set. */
+	nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout_stream);
+	if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+		nf_conntrack_event_cache(IPCT_ASSURED, ct);
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol is found; accepts all. */
+static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb,
+			unsigned int dataoff)
+{
+	return true;
+}
+
+static int udplite_error(struct net *net, struct nf_conn *tmpl,
+			 struct sk_buff *skb,
+			 unsigned int dataoff,
+			 enum ip_conntrack_info *ctinfo,
+			 u_int8_t pf,
+			 unsigned int hooknum)
+{
+	unsigned int udplen = skb->len - dataoff;
+	const struct udphdr *hdr;
+	struct udphdr _hdr;
+	unsigned int cscov;
+
+	/* Header is too small? */
+	hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (hdr == NULL) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: short packet ");
+		return -NF_ACCEPT;
+	}
+	/* hdr->len is used as checksum coverage; 0 selects all of udplen. */
+	cscov = ntohs(hdr->len);
+	if (cscov == 0)
+		cscov = udplen;
+	else if (cscov < sizeof(*hdr) || cscov > udplen) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				"nf_ct_udplite: invalid checksum coverage ");
+		return -NF_ACCEPT;
+	}
+
+	/* UDPLITE mandates checksums */
+	if (!hdr->check) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: checksum missing ");
+		return -NF_ACCEPT;
+	}
+	/* Checksum invalid? Ignore.  NOTE(review): IPPROTO_UDP is passed here
+	 * although the packet is UDP-Lite -- confirm that this is intended. */
+	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
+	    nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
+	    			pf)) {
+		if (LOG_INVALID(net, IPPROTO_UDPLITE))
+			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_udplite: bad UDPLite checksum ");
+		return -NF_ACCEPT;
+	}
+	/* Every check passed (failed checks returned -NF_ACCEPT above). */
+	return NF_ACCEPT;
+}
+
+#ifdef CONFIG_SYSCTL
+static unsigned int udplite_sysctl_table_users;
+static struct ctl_table_header *udplite_sysctl_header;
+static struct ctl_table udplite_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_udplite_timeout",
+		.data		= &nf_ct_udplite_timeout,	/* used before a reply is seen */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{
+		.procname	= "nf_conntrack_udplite_timeout_stream",
+		.data		= &nf_ct_udplite_timeout_stream,	/* used once a reply was seen */
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ }	/* empty last entry */
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
+{	/* UDP-Lite (l4proto) over IPv4 (l3proto) */
+	.l3proto		= PF_INET,
+	.l4proto		= IPPROTO_UDPLITE,
+	.name			= "udplite",
+	.pkt_to_tuple		= udplite_pkt_to_tuple,
+	.invert_tuple		= udplite_invert_tuple,
+	.print_tuple		= udplite_print_tuple,
+	.packet			= udplite_packet,
+	.new			= udplite_new,
+	.error			= udplite_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif /* CONFIG_NF_CT_NETLINK || CONFIG_NF_CT_NETLINK_MODULE */
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &udplite_sysctl_table_users,
+	.ctl_table_header	= &udplite_sysctl_header,
+	.ctl_table		= udplite_sysctl_table,
+#endif /* CONFIG_SYSCTL */
+};
+
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
+{	/* UDP-Lite (l4proto) over IPv6 (l3proto); shares the sysctl table */
+	.l3proto		= PF_INET6,
+	.l4proto		= IPPROTO_UDPLITE,
+	.name			= "udplite",
+	.pkt_to_tuple		= udplite_pkt_to_tuple,
+	.invert_tuple		= udplite_invert_tuple,
+	.print_tuple		= udplite_print_tuple,
+	.packet			= udplite_packet,
+	.new			= udplite_new,
+	.error			= udplite_error,
+#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
+	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
+	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
+	.nla_policy		= nf_ct_port_nla_policy,
+#endif /* CONFIG_NF_CT_NETLINK || CONFIG_NF_CT_NETLINK_MODULE */
+#ifdef CONFIG_SYSCTL
+	.ctl_table_users	= &udplite_sysctl_table_users,
+	.ctl_table_header	= &udplite_sysctl_header,
+	.ctl_table		= udplite_sysctl_table,
+#endif /* CONFIG_SYSCTL */
+};
+
+/* Register the IPv4 and the IPv6 tracker; undo IPv4 if IPv6 fails. */
+static int __init nf_conntrack_proto_udplite_init(void)
+{
+	int ret;
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6);
+	if (ret >= 0)
+		return 0;
+	/* Second registration failed: roll back the first one. */
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+	return ret;
+}
+
+static void __exit nf_conntrack_proto_udplite_exit(void)
+{	/* unregister in the reverse order of registration */
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6);
+	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4);
+}
+
+module_init(nf_conntrack_proto_udplite_init);
+module_exit(nf_conntrack_proto_udplite_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
new file mode 100644
index 00000000..8501823b
--- /dev/null
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -0,0 +1,238 @@
+/* SANE connection tracking helper
+ * (SANE = Scanner Access Now Easy)
+ * For documentation about the SANE network protocol see
+ * http://www.sane-project.org/html/doc015.html
+ */
+
+/* Copyright (C) 2007 Red Hat, Inc.
+ * Author: Michal Schmidt <mschmidt@redhat.com>
+ * Based on the FTP conntrack helper (net/netfilter/nf_conntrack_ftp.c):
+ *  (C) 1999-2001 Paul `Rusty' Russell
+ *  (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *  (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *  (C) 2003 Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_sane.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
+MODULE_DESCRIPTION("SANE connection tracking helper");
+MODULE_ALIAS_NFCT_HELPER("sane");
+
+static char *sane_buffer;
+
+static DEFINE_SPINLOCK(nf_sane_lock);
+
+#define MAX_PORTS 8
+static u_int16_t ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+
+struct sane_request {	/* what help() expects in the original direction */
+	__be32 RPC_code;
+#define SANE_NET_START      7   /* RPC code */
+	/* help() compares RPC_code against htonl(SANE_NET_START) */
+	__be32 handle;
+};
+
+struct sane_reply_net_start {	/* start of the reply to SANE_NET_START */
+	__be32 status;
+#define SANE_STATUS_SUCCESS 0
+	/* help() only acts on a reply whose status is SANE_STATUS_SUCCESS */
+	__be16 zero;	/* help() ignores the reply unless this is 0 */
+	__be16 port;	/* port help() sets up an expectation for */
+	/* other fields aren't interesting for conntrack */
+};
+
+static int help(struct sk_buff *skb,
+		unsigned int protoff,
+		struct nf_conn *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff, datalen;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	void *sb_ptr;
+	int ret = NF_ACCEPT;
+	int dir = CTINFO2DIR(ctinfo);
+	struct nf_ct_sane_master *ct_sane_info;
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple *tuple;
+	struct sane_request *req;
+	struct sane_reply_net_start *reply;
+
+	ct_sane_info = &nfct_help(ct)->help.ct_sane_info;
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED &&
+	    ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	/* Not a full tcp header? */
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return NF_ACCEPT;
+
+	/* No data? */
+	dataoff = protoff + th->doff * 4;
+	if (dataoff >= skb->len)
+		return NF_ACCEPT;
+
+	datalen = skb->len - dataoff;
+	/* The lock is held while sane_buffer and ct_sane_info are in use. */
+	spin_lock_bh(&nf_sane_lock);
+	sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
+	BUG_ON(sb_ptr == NULL);
+	/* Original direction: only a payload of exactly one request counts. */
+	if (dir == IP_CT_DIR_ORIGINAL) {
+		if (datalen != sizeof(struct sane_request))
+			goto out;
+
+		req = sb_ptr;
+		if (req->RPC_code != htonl(SANE_NET_START)) {
+			/* Not an interesting command */
+			ct_sane_info->state = SANE_STATE_NORMAL;
+			goto out;
+		}
+
+		/* We're interested in the next reply */
+		ct_sane_info->state = SANE_STATE_START_REQUESTED;
+		goto out;
+	}
+
+	/* Is it a reply to an uninteresting command? */
+	if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
+		goto out;
+
+	/* It's a reply to SANE_NET_START. */
+	ct_sane_info->state = SANE_STATE_NORMAL;
+
+	if (datalen < sizeof(struct sane_reply_net_start)) {
+		pr_debug("nf_ct_sane: NET_START reply too short\n");
+		goto out;
+	}
+
+	reply = sb_ptr;
+	if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
+		/* saned refused the command */
+		pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n",
+			 ntohl(reply->status));
+		goto out;
+	}
+
+	/* Invalid saned reply? Ignore it. */
+	if (reply->zero != 0)
+		goto out;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL) {
+		ret = NF_DROP;
+		goto out;
+	}
+	/* Addresses of the original tuple, TCP, and the port from the reply. */
+	tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			  &tuple->src.u3, &tuple->dst.u3,
+			  IPPROTO_TCP, NULL, &reply->port);
+
+	pr_debug("nf_ct_sane: expect: ");
+	nf_ct_dump_tuple(&exp->tuple);
+
+	/* Can't expect this?  Best to drop packet now. */
+	if (nf_ct_expect_related(exp) != 0)
+		ret = NF_DROP;
+
+	nf_ct_expect_put(exp);
+
+out:
+	spin_unlock_bh(&nf_sane_lock);
+	return ret;
+}
+
+static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly;
+static char sane_names[MAX_PORTS][2][sizeof("sane-65535")] __read_mostly;
+
+static const struct nf_conntrack_expect_policy sane_exp_policy = {
+	.max_expected	= 1,
+	.timeout	= 5 * 60,	/* NOTE(review): presumably seconds -- confirm */
+};
+
+/* Don't make this __exit, since it's called from __init!  Helpers that were
+ * never registered still have .help == NULL and are skipped. */
+static void nf_conntrack_sane_fini(void)
+{
+	int i, j;
+
+	for (i = 0; i < ports_c; i++) {
+		for (j = 0; j < 2; j++) {
+			if (sane[i][j].help == NULL)
+				continue;
+			pr_debug("nf_ct_sane: unregistering helper for pf: %d "
+				 "port: %d\n",
+				 sane[i][j].tuple.src.l3num, ports[i]);
+			nf_conntrack_helper_unregister(&sane[i][j]);
+		}
+	}
+
+	kfree(sane_buffer);
+}
+
+static int __init nf_conntrack_sane_init(void)
+{
+	int i, j, ret;
+
+	sane_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!sane_buffer)
+		return -ENOMEM;
+
+	if (ports_c == 0)
+		ports[ports_c++] = SANE_PORT;
+
+	/* FIXME: should be configurable whether IPv4/IPv6 are tracked - YK */
+	for (i = 0; i < ports_c; i++) {
+		sane[i][0].tuple.src.l3num = PF_INET;
+		sane[i][1].tuple.src.l3num = PF_INET6;
+		for (j = 0; j < 2; j++) {
+			sane[i][j].tuple.src.u.tcp.port = htons(ports[i]);
+			sane[i][j].tuple.dst.protonum = IPPROTO_TCP;
+			sane[i][j].expect_policy = &sane_exp_policy;
+			sane[i][j].me = THIS_MODULE;
+			sane[i][j].help = help;
+			sane[i][j].name = sane_names[i][j];
+			if (ports[i] == SANE_PORT)
+				sprintf(sane_names[i][j], "sane");
+			else
+				sprintf(sane_names[i][j], "sane-%d", ports[i]);
+			pr_debug("nf_ct_sane: registering helper for pf: %d "
+				 "port: %d\n",
+				 sane[i][j].tuple.src.l3num, ports[i]);
+			ret = nf_conntrack_helper_register(&sane[i][j]);
+			if (ret) {
+				printk(KERN_ERR "nf_ct_sane: failed to "
+				       "register helper for pf: %d port: %d\n",
+					sane[i][j].tuple.src.l3num, ports[i]);
+				sane[i][j].help = NULL;	/* not registered */
+				nf_conntrack_sane_fini();
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+module_init(nf_conntrack_sane_init);
+module_exit(nf_conntrack_sane_fini);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
new file mode 100644
index 00000000..93faf6a3
--- /dev/null
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -0,0 +1,1610 @@
+/* SIP extension for IP connection tracking.
+ *
+ * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
+ * based on RR's ip_conntrack_ftp.c and other modules.
+ * (C) 2007 United Security Providers
+ * (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
+MODULE_DESCRIPTION("SIP connection tracking helper");
+MODULE_ALIAS("ip_conntrack_sip");
+MODULE_ALIAS_NFCT_HELPER("sip");
+
+#define MAX_PORTS	8
+static unsigned short ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of SIP servers");
+
+static unsigned int sip_timeout __read_mostly = SIP_TIMEOUT;
+module_param(sip_timeout, uint, 0600);
+MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session");
+
+static int sip_direct_signalling __read_mostly = 1;
+module_param(sip_direct_signalling, int, 0600);
+MODULE_PARM_DESC(sip_direct_signalling, "expect incoming calls from registrar "
+					"only (default 1)");
+
+static int sip_direct_media __read_mostly = 1;
+module_param(sip_direct_media, int, 0600);
+MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling "
+				   "endpoints only (default 1)");
+
+unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int dataoff,
+				const char **dptr,
+				unsigned int *datalen) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sip_hook);
+
+void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, s16 off) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook);
+
+unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb,
+				       unsigned int dataoff,
+				       const char **dptr,
+				       unsigned int *datalen,
+				       struct nf_conntrack_expect *exp,
+				       unsigned int matchoff,
+				       unsigned int matchlen) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook);
+
+unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int dataoff,
+				     const char **dptr,
+				     unsigned int *datalen,
+				     unsigned int sdpoff,
+				     enum sdp_header_types type,
+				     enum sdp_header_types term,
+				     const union nf_inet_addr *addr)
+				     __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook);
+
+unsigned int (*nf_nat_sdp_port_hook)(struct sk_buff *skb, unsigned int dataoff,
+				     const char **dptr,
+				     unsigned int *datalen,
+				     unsigned int matchoff,
+				     unsigned int matchlen,
+				     u_int16_t port) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook);
+
+unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb,
+					unsigned int dataoff,
+					const char **dptr,
+					unsigned int *datalen,
+					unsigned int sdpoff,
+					const union nf_inet_addr *addr)
+					__read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook);
+
+unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int dataoff,
+				      const char **dptr,
+				      unsigned int *datalen,
+				      struct nf_conntrack_expect *rtp_exp,
+				      struct nf_conntrack_expect *rtcp_exp,
+				      unsigned int mediaoff,
+				      unsigned int medialen,
+				      union nf_inet_addr *rtp_addr)
+				      __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook);
+
+/* Number of leading alphabetic characters in [dptr, limit). */
+static int string_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	const char *p;
+
+	for (p = dptr; p < limit && isalpha(*p); p++)
+		;
+
+	return p - dptr;
+}
+
+/* Number of leading decimal digits in [dptr, limit). */
+static int digits_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	const char *p = dptr;
+
+	while (p < limit && isdigit(*p))
+		p++;
+	return p - dptr;
+}
+
+static int iswordc(const char c)	/* 1 if word_len() may count c, else 0 */
+{
+	const int punct = c == '!' || c == '"' || c == '%' || c == ':' ||
+			  c == '<' || c == '>' || c == '?' || c == '_' ||
+			  c == '`' || c == '{' || c == '}' || c == '~';
+	const int range = (c >= '(' && c <= '/') || (c >= '[' && c <= ']');
+
+	return isalnum(c) || punct || range;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+	const char *p;
+
+	/* Count the leading characters accepted by iswordc(). */
+	for (p = dptr; p < limit && iswordc(*p); p++)
+		;
+	return p - dptr;
+}
+
+/* Length of a Call-ID "word[@word]"; 0 if nothing follows the '@'. */
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	const char *at;
+	int local, host;
+
+	local = word_len(dptr, limit);
+	at = dptr + local;
+	/* Empty, or not followed by '@': the first word is all there is. */
+	if (local == 0 || at == limit || *at != '@')
+		return local;
+	host = word_len(at + 1, limit);
+	if (host == 0)
+		return 0;
+	return local + 1 + host;
+}
+
+/* Length of "<media type> <port>"; 0 if no space follows the media type. */
+static int media_len(const struct nf_conn *ct, const char *dptr,
+		     const char *limit, int *shift)
+{
+	const char *sep = dptr + string_len(ct, dptr, limit, shift);
+	const char *port;
+
+	if (sep >= limit || *sep != ' ')
+		return 0;
+
+	port = sep + 1;
+	/* media type + separator, then the digits of the port */
+	return (port - dptr) + digits_len(ct, port, limit, shift);
+}
+
+/* Parse the textual address at cp using ct's L3 family; 1 on success. */
+static int parse_addr(const struct nf_conn *ct, const char *cp,
+                      const char **endp, union nf_inet_addr *addr,
+                      const char *limit)
+{
+	const int avail = limit - cp;
+	const char *stop;
+	int parsed = 0;
+
+	if (ct == NULL)
+		return 0;
+	memset(addr, 0, sizeof(*addr));
+	switch (nf_ct_l3num(ct)) {
+	case AF_INET6:
+		parsed = in6_pton(cp, avail, (u8 *)&addr->ip6, -1, &stop);
+		break;
+	case AF_INET:
+		parsed = in4_pton(cp, avail, (u8 *)&addr->ip, -1, &stop);
+		break;
+	default:
+		BUG();
+	}
+	if (parsed == 0 || stop == cp)
+		return 0;
+	if (endp != NULL)
+		*endp = stop;
+	return 1;
+}
+
+/* Skip an IP address and optional ":port"; returns the length, 0 if none. */
+static int epaddr_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	union nf_inet_addr addr;
+	const char *aux = dptr;
+
+	if (!parse_addr(ct, dptr, &dptr, &addr, limit)) {
+		pr_debug("ip: %s parse failed.!\n", dptr);
+		return 0;
+	}
+
+	/* Port number; the address may end at limit, so check the bound. */
+	if (dptr < limit && *dptr == ':') {
+		dptr++;
+		dptr += digits_len(ct, dptr, limit, shift);
+	}
+	return dptr - aux;
+}
+
+/* Get the length of the address in a sip: URI, skipping any user info.
+ * *shift is increased by the number of bytes skipped ("user@"). */
+static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr,
+			  const char *limit, int *shift)
+{
+	const char *at = dptr;
+	int skipped;
+
+	/* Search for @, but stop at the end of the line.
+	 * We are inside a sip: URI, so we don't need to worry about
+	 * continuation lines. */
+	while (at < limit &&
+	       *at != '@' && *at != '\r' && *at != '\n')
+		at++;
+
+	/* No user part: the address starts at dptr and *shift stays as is. */
+	if (at >= limit || *at != '@')
+		return epaddr_len(ct, dptr, limit, shift);
+
+	/* Step over the '@' and account for everything skipped so far. */
+	at++;
+	skipped = at - dptr;
+	*shift += skipped;
+
+	return epaddr_len(ct, at, limit, shift);
+}
+
+/* Parse a SIP request line of the form:
+ *
+ * Request-Line = Method SP Request-URI SP SIP-Version CRLF
+ *
+ * and return the offset and length of the address contained in the Request-URI.
+ * Returns 1 on success, 0 if nothing usable is found, -1 on a parse error. */
+int ct_sip_parse_request(const struct nf_conn *ct,
+			 const char *dptr, unsigned int datalen,
+			 unsigned int *matchoff, unsigned int *matchlen,
+			 union nf_inet_addr *addr, __be16 *port)
+{
+	const char *start = dptr, *limit = dptr + datalen, *end;
+	unsigned int mlen;
+	unsigned int p;
+	int shift = 0;
+
+	/* Skip method and following whitespace */
+	mlen = string_len(ct, dptr, limit, NULL);
+	if (!mlen)
+		return 0;
+	dptr += mlen;
+	if (++dptr >= limit)
+		return 0;
+
+	/* Find SIP URI */
+	for (; dptr < limit - strlen("sip:"); dptr++) {
+		if (*dptr == '\r' || *dptr == '\n')
+			return -1;
+		if (strnicmp(dptr, "sip:", strlen("sip:")) == 0) {
+			dptr += strlen("sip:");
+			break;
+		}
+	}
+	if (!skp_epaddr_len(ct, dptr, limit, &shift))
+		return 0;
+	dptr += shift;	/* shift = bytes skp_epaddr_len() skipped ("user@") */
+
+	if (!parse_addr(ct, dptr, &end, addr, limit))
+		return -1;
+	if (end < limit && *end == ':') {
+		end++;
+		p = simple_strtoul(end, (char **)&end, 10);
+		if (p < 1024 || p > 65535)
+			return -1;
+		*port = htons(p);
+	} else
+		*port = htons(SIP_PORT);	/* no explicit port given */
+
+	if (end == dptr)
+		return 0;
+	*matchoff = dptr - start;
+	*matchlen = end - dptr;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_request);
+
+/* SIP header parsing: SIP headers are located at the beginning of a line, but
+ * may span several lines, in which case the continuation lines begin with a
+ * whitespace character. RFC 2543 allows lines to be terminated with CR, LF or
+ * CRLF, RFC 3261 allows only CRLF, we support both.
+ *
+ * Headers are followed by (optionally) whitespace, a colon, again (optionally)
+ * whitespace and the values. Whitespace in this context means any amount of
+ * tabs, spaces and continuation lines, which are treated as a single whitespace
+ * character.
+ *
+ * Some headers may appear multiple times. A comma separated list of values is
+ * equivalent to multiple headers.
+ */
+static const struct sip_header ct_sip_hdrs[] = {
+	[SIP_HDR_CSEQ]			= SIP_HDR("CSeq", NULL, NULL, digits_len),
+	[SIP_HDR_FROM]			= SIP_HDR("From", "f", "sip:", skp_epaddr_len),
+	[SIP_HDR_TO]			= SIP_HDR("To", "t", "sip:", skp_epaddr_len),
+	[SIP_HDR_CONTACT]		= SIP_HDR("Contact", "m", "sip:", skp_epaddr_len),
+	[SIP_HDR_VIA_UDP]		= SIP_HDR("Via", "v", "UDP ", epaddr_len),
+	[SIP_HDR_VIA_TCP]		= SIP_HDR("Via", "v", "TCP ", epaddr_len),
+	[SIP_HDR_EXPIRES]		= SIP_HDR("Expires", NULL, NULL, digits_len),
+	[SIP_HDR_CONTENT_LENGTH]	= SIP_HDR("Content-Length", "l", NULL, digits_len),
+	[SIP_HDR_CALL_ID]		= SIP_HDR("Call-Id", "i", NULL, callid_len),
+};
+
+/* Called with dptr at a line break.  Returns a pointer past the leading
+ * whitespace of a continuation line, or NULL if the header ends here. */
+static const char *sip_follow_continuation(const char *dptr, const char *limit)
+{
+	const char *next = dptr + 1;
+
+	/* Nothing after the line break. */
+	if (next >= limit)
+		return NULL;
+
+	/* CR LF: step over the LF as well. */
+	if (*dptr == '\r' && *next == '\n' && ++next >= limit)
+		return NULL;
+
+	/* Only SP or HT at the start of a line marks a continuation. */
+	if (*next != ' ' && *next != '\t')
+		return NULL;
+
+	/* Swallow the leading whitespace. */
+	while (next < limit && (*next == ' ' || *next == '\t'))
+		next++;
+	return next;
+}
+
+/* Skip SP/HT and continuation lines; NULL if the header line ends first. */
+static const char *sip_skip_whitespace(const char *dptr, const char *limit)
+{
+	while (dptr < limit && (*dptr == ' ' || *dptr == '\t' ||
+				*dptr == '\r' || *dptr == '\n')) {
+		/* A continuation resumes AT the returned byte, not after it. */
+		dptr = (*dptr == ' ' || *dptr == '\t') ? dptr + 1 :
+			sip_follow_continuation(dptr, limit);
+		if (dptr == NULL)
+			return NULL;
+	}
+	return dptr;
+}
+
+/* Search within a SIP header value, dealing with continuation lines */
+static const char *ct_sip_header_search(const char *dptr, const char *limit,
+					const char *needle, unsigned int len)
+{
+	for (limit -= len; dptr < limit; dptr++) {
+		if (*dptr == '\r' || *dptr == '\n') {
+			dptr = sip_follow_continuation(dptr, limit);
+			if (dptr == NULL)
+				break;
+			continue;	/* NOTE(review): loop's dptr++ then skips a byte */
+		}
+		/* limit was pulled back by len above, so len bytes are there. */
+		if (strnicmp(dptr, needle, len) == 0)
+			return dptr;
+	}
+	return NULL;
+}
+
+int ct_sip_get_header(const struct nf_conn *ct, const char *dptr,
+		      unsigned int dataoff, unsigned int datalen,
+		      enum sip_header_types type,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	const struct sip_header *hdr = &ct_sip_hdrs[type];
+	const char *start = dptr, *limit = dptr + datalen;
+	int shift = 0;
+	/* Returns 1 on a match, 0 if no such header, -1 if it is unparsable. */
+	for (dptr += dataoff; dptr < limit; dptr++) {
+		/* Find beginning of line */
+		if (*dptr != '\r' && *dptr != '\n')
+			continue;
+		if (++dptr >= limit)
+			break;
+		if (*(dptr - 1) == '\r' && *dptr == '\n') {
+			if (++dptr >= limit)
+				break;
+		}
+
+		/* Skip continuation lines */
+		if (*dptr == ' ' || *dptr == '\t')
+			continue;
+
+		/* Find header. Compact headers must be followed by a
+		 * non-alphabetic character to avoid mismatches. */
+		if (limit - dptr >= hdr->len &&
+		    strnicmp(dptr, hdr->name, hdr->len) == 0)
+			dptr += hdr->len;
+		else if (hdr->cname && limit - dptr >= hdr->clen + 1 &&
+			 strnicmp(dptr, hdr->cname, hdr->clen) == 0 &&
+			 !isalpha(*(dptr + hdr->clen)))
+			dptr += hdr->clen;
+		else
+			continue;
+
+		/* Find and skip colon */
+		dptr = sip_skip_whitespace(dptr, limit);
+		if (dptr == NULL)
+			break;
+		if (*dptr != ':' || ++dptr >= limit)
+			break;
+
+		/* Skip whitespace after colon */
+		dptr = sip_skip_whitespace(dptr, limit);
+		if (dptr == NULL)
+			break;
+
+		*matchoff = dptr - start;	/* also what a -1 return leaves behind */
+		if (hdr->search) {
+			dptr = ct_sip_header_search(dptr, limit, hdr->search,
+						    hdr->slen);
+			if (!dptr)
+				return -1;
+			dptr += hdr->slen;
+		}
+
+		*matchlen = hdr->match_len(ct, dptr, limit, &shift);
+		if (!*matchlen)
+			return -1;
+		*matchoff = dptr - start + shift;	/* shift is set by match_len() */
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ct_sip_get_header);
+
+/* Get next header field in a list of comma separated values */
+static int ct_sip_next_header(const struct nf_conn *ct, const char *dptr,
+			      unsigned int dataoff, unsigned int datalen,
+			      enum sip_header_types type,
+			      unsigned int *matchoff, unsigned int *matchlen)
+{
+	const struct sip_header *hdr = &ct_sip_hdrs[type];
+	const char *start = dptr, *limit = dptr + datalen;
+	int shift = 0;
+
+	dptr += dataoff;
+	/* The next value of the list follows a ','; 0 if there is none. */
+	dptr = ct_sip_header_search(dptr, limit, ",", strlen(","));
+	if (!dptr)
+		return 0;
+	/* Then look for this header type's search string, as for the first. */
+	dptr = ct_sip_header_search(dptr, limit, hdr->search, hdr->slen);
+	if (!dptr)
+		return 0;
+	dptr += hdr->slen;
+
+	*matchoff = dptr - start;
+	*matchlen = hdr->match_len(ct, dptr, limit, &shift);
+	if (!*matchlen)
+		return -1;	/* found, but the value could not be measured */
+	*matchoff += shift;
+	return 1;
+}
+
+/* Walk through headers until a parsable one is found or no header of the
+ * given type is left. */
+static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
+			       unsigned int dataoff, unsigned int datalen,
+			       enum sip_header_types type, int *in_header,
+			       unsigned int *matchoff, unsigned int *matchlen)
+{
+	int ret;
+	/* First continue inside the comma separated list we stopped in. */
+	if (in_header && *in_header) {
+		while (1) {
+			ret = ct_sip_next_header(ct, dptr, dataoff, datalen,
+						 type, matchoff, matchlen);
+			if (ret > 0)
+				return ret;
+			if (ret == 0)
+				break;
+			dataoff += *matchoff;
+		}
+		*in_header = 0;
+	}
+	/* NOTE(review): *matchoff looks dptr-relative; "+=" may overshoot. */
+	while (1) {
+		ret = ct_sip_get_header(ct, dptr, dataoff, datalen,
+					type, matchoff, matchlen);
+		if (ret > 0)
+			break;
+		if (ret == 0)
+			return ret;
+		dataoff += *matchoff;
+	}
+	/* Remember that the next call should look for more list values. */
+	if (in_header)
+		*in_header = 1;
+	return 1;
+}
+
+/* Locate a SIP header, parse the URI and return the offset and length of
+ * the address as well as the address and port themselves. A stream of
+ * headers can be parsed by handing in a non-NULL dataoff and in_header
+ * pointer. Returns 1 on success, 0 if no header is left, -1 on a bad URI.
+ */
+int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
+			    unsigned int *dataoff, unsigned int datalen,
+			    enum sip_header_types type, int *in_header,
+			    unsigned int *matchoff, unsigned int *matchlen,
+			    union nf_inet_addr *addr, __be16 *port)
+{
+	const char *c, *limit = dptr + datalen;
+	unsigned int p;
+	int ret;
+
+	ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen,
+				  type, in_header, matchoff, matchlen);
+	WARN_ON(ret < 0);
+	if (ret == 0)
+		return ret;
+
+	if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit))
+		return -1;
+	/* The address may end exactly at limit: check before reading *c. */
+	if (c < limit && *c == ':') {
+		c++;
+		p = simple_strtoul(c, (char **)&c, 10);
+		if (p < 1024 || p > 65535)
+			return -1;
+		*port = htons(p);
+	} else
+		*port = htons(SIP_PORT);
+	if (dataoff)
+		*dataoff = c - dptr;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_header_uri);
+
+/* Find "name" inside the current header value (up to the next ',') and
+ * report the text after it, up to the next ';', as offset and length. */
+static int ct_sip_parse_param(const struct nf_conn *ct, const char *dptr,
+			      unsigned int dataoff, unsigned int datalen,
+			      const char *name,
+			      unsigned int *matchoff, unsigned int *matchlen)
+{
+	const char *from = dptr + dataoff, *bound = dptr + datalen;
+	const char *val, *stop;
+	const unsigned int nlen = strlen(name);
+
+	/* A ',' starts the next list element: do not search beyond it. */
+	stop = ct_sip_header_search(from, bound, ",", strlen(","));
+	if (stop != NULL)
+		bound = stop;
+
+	val = ct_sip_header_search(from, bound, name, nlen);
+	if (val == NULL)
+		return 0;
+	val += nlen;
+
+	stop = ct_sip_header_search(val, bound, ";", strlen(";"));
+	*matchoff = val - dptr;
+	*matchlen = (stop != NULL ? stop : bound) - val;
+	return 1;
+}
+
+/* Parse the address after parameter "name"; 1 if found and valid, else 0. */
+int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
+			       unsigned int dataoff, unsigned int datalen,
+			       const char *name,
+			       unsigned int *matchoff, unsigned int *matchlen,
+			       union nf_inet_addr *addr)
+{
+	const char *from = dptr + dataoff, *bound = dptr + datalen;
+	const char *val, *stop;
+	const unsigned int nlen = strlen(name);
+
+	/* Stay within the current list element: stop at the next ','. */
+	stop = ct_sip_header_search(from, bound, ",", strlen(","));
+	if (stop != NULL)
+		bound = stop;
+	val = ct_sip_header_search(from, bound, name, nlen);
+	if (val == NULL)
+		return 0;
+	val += nlen;
+	if (!parse_addr(ct, val, &stop, addr, bound))
+		return 0;
+	*matchoff = val - dptr;
+	*matchlen = stop - val;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_address_param);
+
+/* Parse numerical header parameter and return value, offset and length */
+int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr,
+				 unsigned int dataoff, unsigned int datalen,
+				 const char *name,
+				 unsigned int *matchoff, unsigned int *matchlen,
+				 unsigned int *val)
+{
+	const char *limit = dptr + datalen;
+	const char *start;
+	char *end;
+	/* Restrict the search to the text before the next ',', if any. */
+	limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(","));
+	if (!limit)
+		limit = dptr + datalen;
+
+	start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name));
+	if (!start)
+		return 0;
+
+	start += strlen(name);
+	*val = simple_strtoul(start, &end, 0);
+	if (start == end)	/* no digits were consumed */
+		return 0;
+	if (matchoff && matchlen) {	/* optional outputs, only as a pair */
+		*matchoff = start - dptr;
+		*matchlen = end - start;
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ct_sip_parse_numerical_param);
+
+/* Determine the transport protocol named by a "transport=" parameter.
+ *
+ * Without such a parameter *proto is set to the protocol of the connection
+ * and 1 is returned. With it, 1 is returned only if the value starts with
+ * "TCP" or "UDP" (case-insensitive) and matches the connection's protocol;
+ * otherwise 0 is returned.
+ */
+static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
+				  unsigned int dataoff, unsigned int datalen,
+				  u8 *proto)
+{
+	unsigned int off, len;
+	const char *value;
+
+	if (!ct_sip_parse_param(ct, dptr, dataoff, datalen, "transport=",
+				&off, &len)) {
+		/* Not specified: use the protocol of this connection */
+		*proto = nf_ct_protonum(ct);
+		return 1;
+	}
+
+	value = dptr + off;
+	if (strnicmp(value, "TCP", strlen("TCP")) == 0)
+		*proto = IPPROTO_TCP;
+	else if (strnicmp(value, "UDP", strlen("UDP")) == 0)
+		*proto = IPPROTO_UDP;
+	else
+		return 0;
+
+	/* A transport differing from the connection's is not accepted */
+	return *proto == nf_ct_protonum(ct);
+}
+
+/* SDP header parsing: an SDP session description contains an ordered set of
+ * headers, starting with a section containing general session parameters,
+ * optionally followed by multiple media descriptions.
+ *
+ * SDP headers always start at the beginning of a line. According to RFC 2327:
+ * "The sequence CRLF (0x0d0a) is used to end a record, although parsers should
+ * be tolerant and also accept records terminated with a single newline
+ * character". We handle both cases.
+ *
+ * The table below is indexed by enum sdp_header_types. ct_sip_get_sdp_header()
+ * reads each entry's name (line prefix), optional search string and
+ * match_len callback; the SDP_HDR() arguments presumably map to those fields
+ * in that order (macro defined outside this excerpt - confirm).
+ */
+static const struct sip_header ct_sdp_hdrs[] = {
+	[SDP_HDR_VERSION]		= SDP_HDR("v=", NULL, digits_len),
+	[SDP_HDR_OWNER_IP4]		= SDP_HDR("o=", "IN IP4 ", epaddr_len),
+	[SDP_HDR_CONNECTION_IP4]	= SDP_HDR("c=", "IN IP4 ", epaddr_len),
+	[SDP_HDR_OWNER_IP6]		= SDP_HDR("o=", "IN IP6 ", epaddr_len),
+	[SDP_HDR_CONNECTION_IP6]	= SDP_HDR("c=", "IN IP6 ", epaddr_len),
+	[SDP_HDR_MEDIA]			= SDP_HDR("m=", NULL, media_len),
+};
+
+/* Linear string search within an SDP header value.
+ *
+ * Walks forward from dptr looking for "needle" (len bytes) and gives up at
+ * the first CR or LF. Positions at or beyond limit - len are not examined.
+ * Returns a pointer to the match, or NULL if there is none.
+ */
+static const char *ct_sdp_header_search(const char *dptr, const char *limit,
+					const char *needle, unsigned int len)
+{
+	const char *last = limit - len;
+	const char *p = dptr;
+
+	while (p < last) {
+		/* The value ends with the line */
+		if (*p == '\r' || *p == '\n')
+			return NULL;
+		if (!strncmp(p, needle, len))
+			return p;
+		p++;
+	}
+	return NULL;
+}
+
+/* Locate an SDP header (optionally a substring within the header value),
+ * optionally stopping at the first occurrence of the term header, parse
+ * it and return the offset and length of the data we're interested in.
+ *
+ * Only text following a line break at or after dataoff is examined, so a
+ * header that starts exactly at dataoff is not matched.
+ *
+ * Returns 1 on a match, with *matchoff (relative to dptr) and *matchlen
+ * describing the value; 0 if the header was not found or the term header
+ * came first; -1 if the header was found but its search string is missing
+ * or its value has zero length (the output parameters may already have been
+ * written in that case).
+ */
+int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr,
+			  unsigned int dataoff, unsigned int datalen,
+			  enum sdp_header_types type,
+			  enum sdp_header_types term,
+			  unsigned int *matchoff, unsigned int *matchlen)
+{
+	const struct sip_header *hdr = &ct_sdp_hdrs[type];
+	const struct sip_header *thdr = &ct_sdp_hdrs[term];
+	const char *start = dptr, *limit = dptr + datalen;
+	int shift = 0;
+
+	for (dptr += dataoff; dptr < limit; dptr++) {
+		/* Find beginning of line */
+		if (*dptr != '\r' && *dptr != '\n')
+			continue;
+		if (++dptr >= limit)
+			break;
+		/* Treat CRLF as a single line terminator */
+		if (*(dptr - 1) == '\r' && *dptr == '\n') {
+			if (++dptr >= limit)
+				break;
+		}
+
+		/* dptr now points at the first character after a line break:
+		 * stop on the terminating header, consume the wanted header
+		 * name, or keep scanning.
+		 */
+		if (term != SDP_HDR_UNSPEC &&
+		    limit - dptr >= thdr->len &&
+		    strnicmp(dptr, thdr->name, thdr->len) == 0)
+			break;
+		else if (limit - dptr >= hdr->len &&
+			 strnicmp(dptr, hdr->name, hdr->len) == 0)
+			dptr += hdr->len;
+		else
+			continue;
+
+		/* Provisional offset: start of the header value */
+		*matchoff = dptr - start;
+		if (hdr->search) {
+			dptr = ct_sdp_header_search(dptr, limit, hdr->search,
+						    hdr->slen);
+			if (!dptr)
+				return -1;
+			dptr += hdr->slen;
+		}
+
+		*matchlen = hdr->match_len(ct, dptr, limit, &shift);
+		if (!*matchlen)
+			return -1;
+		/* Final offset, adjusted by the shift match_len() reported */
+		*matchoff = dptr - start + shift;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ct_sip_get_sdp_header);
+
+/* Locate an SDP header with ct_sip_get_sdp_header() and parse the address it
+ * contains into addr. Passes through the lookup result when it is <= 0,
+ * returns -1 if the located text is not a valid address and 1 on success.
+ */
+static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr,
+				 unsigned int dataoff, unsigned int datalen,
+				 enum sdp_header_types type,
+				 enum sdp_header_types term,
+				 unsigned int *matchoff, unsigned int *matchlen,
+				 union nf_inet_addr *addr)
+{
+	const char *addr_start;
+	int found;
+
+	found = ct_sip_get_sdp_header(ct, dptr, dataoff, datalen, type, term,
+				      matchoff, matchlen);
+	if (found <= 0)
+		return found;
+
+	addr_start = dptr + *matchoff;
+	return parse_addr(ct, addr_start, NULL, addr,
+			  addr_start + *matchlen) ? 1 : -1;
+}
+
+/* Re-arm the signalling expectation of ct that matches addr/proto/port so it
+ * expires "expires" seconds from now, clearing its inactive flag.
+ * Expectations whose timer could not be deleted are skipped.
+ * Returns 1 if an expectation was refreshed, 0 otherwise.
+ */
+static int refresh_signalling_expectation(struct nf_conn *ct,
+					  union nf_inet_addr *addr,
+					  u8 proto, __be16 port,
+					  unsigned int expires)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *pos, *tmp;
+	int refreshed = 0;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_for_each_entry_safe(exp, pos, tmp, &help->expectations, lnode) {
+		bool wanted = exp->class == SIP_EXPECT_SIGNALLING &&
+			      nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) &&
+			      exp->tuple.dst.protonum == proto &&
+			      exp->tuple.dst.u.udp.port == port;
+
+		if (!wanted || !del_timer(&exp->timeout))
+			continue;
+
+		exp->flags &= ~NF_CT_EXPECT_INACTIVE;
+		exp->timeout.expires = jiffies + expires * HZ;
+		add_timer(&exp->timeout);
+		refreshed = 1;
+		break;
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+
+	return refreshed;
+}
+
+/* Remove expectations attached to ct. With media == true every expectation
+ * whose class is not SIP_EXPECT_SIGNALLING is removed; with media == false
+ * only the first removable signalling expectation is. Expectations whose
+ * timer could not be deleted are left alone.
+ */
+static void flush_expectations(struct nf_conn *ct, bool media)
+{
+	struct nf_conn_help *help = nfct_help(ct);
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *pos, *tmp;
+
+	spin_lock_bh(&nf_conntrack_lock);
+	hlist_for_each_entry_safe(exp, pos, tmp, &help->expectations, lnode) {
+		bool is_media = exp->class != SIP_EXPECT_SIGNALLING;
+
+		/* Not the kind of expectation we were asked to flush */
+		if (is_media != media)
+			continue;
+		if (!del_timer(&exp->timeout))
+			continue;
+
+		nf_ct_unlink_expect(exp);
+		nf_ct_expect_put(exp);
+		if (!media)
+			break;
+	}
+	spin_unlock_bh(&nf_conntrack_lock);
+}
+
+/* Set up a pair of UDP expectations (named rtp_exp and rtcp_exp below) for a
+ * media stream announced in SDP.
+ *
+ * daddr and port are the address and port taken from the SDP, class is the
+ * expectation class of the media type, and mediaoff/medialen describe the
+ * media text that is handed on to the NAT hooks.
+ *
+ * Returns NF_ACCEPT when the expectations were registered or none were
+ * needed; otherwise NF_DROP, or the verdict of the NAT media hook when that
+ * hook is used.
+ */
+static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff,
+				 const char **dptr, unsigned int *datalen,
+				 union nf_inet_addr *daddr, __be16 port,
+				 enum sip_expectation_classes class,
+				 unsigned int mediaoff, unsigned int medialen)
+{
+	struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct net *net = nf_ct_net(ct);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr *saddr;
+	struct nf_conntrack_tuple tuple;
+	int direct_rtp = 0, skip_expect = 0, ret = NF_DROP;
+	u_int16_t base_port;
+	__be16 rtp_port, rtcp_port;
+	typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port;
+	typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media;
+
+	/* With sip_direct_media set, media is only tracked when it is
+	 * addressed to the sender of this message, and the expectation is
+	 * restricted to the other endpoint's address as source.
+	 */
+	saddr = NULL;
+	if (sip_direct_media) {
+		if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3))
+			return NF_ACCEPT;
+		saddr = &ct->tuplehash[!dir].tuple.src.u3;
+	}
+
+	/* We need to check whether the registration exists before attempting
+	 * to register it since we can see the same media description multiple
+	 * times on different connections in case multiple endpoints receive
+	 * the same call.
+	 *
+	 * RTP optimization: if we find a matching media channel expectation
+	 * and both the expectation and this connection are SNATed, we assume
+	 * both sides can reach each other directly and use the final
+	 * destination address from the expectation. We still need to keep
+	 * the NATed expectations for media that might arrive from the
+	 * outside, and additionally need to expect the direct RTP stream
+	 * in case it passes through us even without NAT.
+	 */
+	memset(&tuple, 0, sizeof(tuple));
+	if (saddr)
+		tuple.src.u3 = *saddr;
+	tuple.src.l3num		= nf_ct_l3num(ct);
+	tuple.dst.protonum	= IPPROTO_UDP;
+	tuple.dst.u3		= *daddr;
+	tuple.dst.u.udp.port	= port;
+
+	/* The loop runs at most twice: once for the address from the SDP and,
+	 * if that is switched to the saved (pre-NAT) address, once more for
+	 * the rewritten tuple. It ends when no comparable expectation from
+	 * another connection exists (break) or when one exists and nothing is
+	 * rewritten (skip_expect).
+	 */
+	rcu_read_lock();
+	do {
+		exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
+
+		if (!exp || exp->master == ct ||
+		    nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
+		    exp->class != class)
+			break;
+#ifdef CONFIG_NF_NAT_NEEDED
+		if (exp->tuple.src.l3num == AF_INET && !direct_rtp &&
+		    (exp->saved_ip != exp->tuple.dst.u3.ip ||
+		     exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&
+		    ct->status & IPS_NAT_MASK) {
+			daddr->ip		= exp->saved_ip;
+			tuple.dst.u3.ip		= exp->saved_ip;
+			tuple.dst.u.udp.port	= exp->saved_proto.udp.port;
+			direct_rtp = 1;
+		} else
+#endif
+			skip_expect = 1;
+	} while (!skip_expect);
+	rcu_read_unlock();
+
+	/* The even port is used for rtp_exp, the following odd one for
+	 * rtcp_exp.
+	 */
+	base_port = ntohs(tuple.dst.u.udp.port) & ~1;
+	rtp_port = htons(base_port);
+	rtcp_port = htons(base_port + 1);
+
+	/* The tuple was switched to the saved address/port: let the NAT hook
+	 * update the port in the message.
+	 */
+	if (direct_rtp) {
+		nf_nat_sdp_port = rcu_dereference(nf_nat_sdp_port_hook);
+		if (nf_nat_sdp_port &&
+		    !nf_nat_sdp_port(skb, dataoff, dptr, datalen,
+				     mediaoff, medialen, ntohs(rtp_port)))
+			goto err1;
+	}
+
+	/* A matching expectation already exists on another connection */
+	if (skip_expect)
+		return NF_ACCEPT;
+
+	rtp_exp = nf_ct_expect_alloc(ct);
+	if (rtp_exp == NULL)
+		goto err1;
+	nf_ct_expect_init(rtp_exp, class, nf_ct_l3num(ct), saddr, daddr,
+			  IPPROTO_UDP, NULL, &rtp_port);
+
+	rtcp_exp = nf_ct_expect_alloc(ct);
+	if (rtcp_exp == NULL)
+		goto err2;
+	nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr,
+			  IPPROTO_UDP, NULL, &rtcp_port);
+
+	/* With NAT (and no direct RTP) the NAT media hook is handed both
+	 * expectations and supplies the verdict; otherwise register both
+	 * here, undoing the first if the second fails.
+	 */
+	nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook);
+	if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp)
+		ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen,
+				       rtp_exp, rtcp_exp,
+				       mediaoff, medialen, daddr);
+	else {
+		if (nf_ct_expect_related(rtp_exp) == 0) {
+			if (nf_ct_expect_related(rtcp_exp) != 0)
+				nf_ct_unexpect_related(rtp_exp);
+			else
+				ret = NF_ACCEPT;
+		}
+	}
+	/* Drop the references obtained from nf_ct_expect_alloc() */
+	nf_ct_expect_put(rtcp_exp);
+err2:
+	nf_ct_expect_put(rtp_exp);
+err1:
+	return ret;
+}
+
+/* Media types recognised in SDP "m=" lines, each paired with the expectation
+ * class used for its streams. The names include the trailing space that
+ * separates the type from what follows it. process_sdp() also uses the size
+ * of this table as the upper bound on media descriptions it fully handles.
+ */
+static const struct sdp_media_type sdp_media_types[] = {
+	SDP_MEDIA_TYPE("audio ", SIP_EXPECT_AUDIO),
+	SDP_MEDIA_TYPE("video ", SIP_EXPECT_VIDEO),
+	SDP_MEDIA_TYPE("image ", SIP_EXPECT_IMAGE),
+};
+
+/* Return the sdp_media_types[] entry whose name is a prefix of the
+ * matchlen bytes at dptr + matchoff (case-sensitive), or NULL if none is.
+ */
+static const struct sdp_media_type *sdp_media_type(const char *dptr,
+						   unsigned int matchoff,
+						   unsigned int matchlen)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(sdp_media_types); idx++) {
+		const struct sdp_media_type *cand = &sdp_media_types[idx];
+
+		if (matchlen >= cand->len &&
+		    strncmp(dptr + matchoff, cand->name, cand->len) == 0)
+			return cand;
+	}
+	return NULL;
+}
+
+/* Parse the SDP body of a SIP message and set up expectations for the media
+ * streams it describes.
+ *
+ * Returns NF_ACCEPT if no session description is found or everything was
+ * handled, NF_DROP for a media port outside 1024-65535 or a media description
+ * without a usable connection address, or the verdict of a failing
+ * set_expected_rtp_rtcp()/NAT hook call. cseq is unused here; the parameter
+ * lets this function serve directly as a request handler in sip_handlers[].
+ */
+static int process_sdp(struct sk_buff *skb, unsigned int dataoff,
+		       const char **dptr, unsigned int *datalen,
+		       unsigned int cseq)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int matchoff, matchlen;
+	unsigned int mediaoff, medialen;
+	unsigned int sdpoff;
+	unsigned int caddr_len, maddr_len;
+	unsigned int i;
+	union nf_inet_addr caddr, maddr, rtp_addr;
+	unsigned int port;
+	enum sdp_header_types c_hdr;
+	const struct sdp_media_type *t;
+	int ret = NF_ACCEPT;
+	typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr;
+	typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session;
+
+	nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook);
+	/* Pick the "c=" header variant matching the connection's family */
+	c_hdr = nf_ct_l3num(ct) == AF_INET ? SDP_HDR_CONNECTION_IP4 :
+					     SDP_HDR_CONNECTION_IP6;
+
+	/* Find beginning of session description */
+	if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen,
+				  SDP_HDR_VERSION, SDP_HDR_UNSPEC,
+				  &matchoff, &matchlen) <= 0)
+		return NF_ACCEPT;
+	sdpoff = matchoff;
+
+	/* The connection information is contained in the session description
+	 * and/or once per media description. The first media description marks
+	 * the end of the session description. */
+	caddr_len = 0;
+	if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen,
+				  c_hdr, SDP_HDR_MEDIA,
+				  &matchoff, &matchlen, &caddr) > 0)
+		caddr_len = matchlen;
+
+	/* i counts only media descriptions that were fully processed; the
+	 * "continue" paths below advance mediaoff without incrementing it.
+	 */
+	mediaoff = sdpoff;
+	for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) {
+		if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen,
+					  SDP_HDR_MEDIA, SDP_HDR_UNSPEC,
+					  &mediaoff, &medialen) <= 0)
+			break;
+
+		/* Get media type and port number. A media port value of zero
+		 * indicates an inactive stream. */
+		t = sdp_media_type(*dptr, mediaoff, medialen);
+		if (!t) {
+			/* Unknown media type: skip this description */
+			mediaoff += medialen;
+			continue;
+		}
+		/* Step over the type name so mediaoff points at the port */
+		mediaoff += t->len;
+		medialen -= t->len;
+
+		port = simple_strtoul(*dptr + mediaoff, NULL, 10);
+		if (port == 0)
+			continue;
+		if (port < 1024 || port > 65535)
+			return NF_DROP;
+
+		/* The media description overrides the session description. */
+		maddr_len = 0;
+		if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen,
+					  c_hdr, SDP_HDR_MEDIA,
+					  &matchoff, &matchlen, &maddr) > 0) {
+			maddr_len = matchlen;
+			memcpy(&rtp_addr, &maddr, sizeof(rtp_addr));
+		} else if (caddr_len)
+			memcpy(&rtp_addr, &caddr, sizeof(rtp_addr));
+		else
+			return NF_DROP;
+
+		ret = set_expected_rtp_rtcp(skb, dataoff, dptr, datalen,
+					    &rtp_addr, htons(port), t->class,
+					    mediaoff, medialen);
+		if (ret != NF_ACCEPT)
+			return ret;
+
+		/* Update media connection address if present */
+		if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) {
+			ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen,
+					      mediaoff, c_hdr, SDP_HDR_MEDIA,
+					      &rtp_addr);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+		i++;
+	}
+
+	/* Update session connection and owner addresses */
+	/* NOTE(review): rtp_addr is only assigned inside the loop above. If
+	 * the loop ends before reaching that assignment (no "m=" line, only
+	 * unknown media types, or only port 0), the hook below is passed an
+	 * unassigned rtp_addr - confirm whether the hook tolerates this.
+	 */
+	nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook);
+	if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK)
+		ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff,
+					 &rtp_addr);
+
+	return ret;
+}
+/* Handle a response to INVITE: responses with a code of 100-299 are scanned
+ * for SDP; any other response whose CSeq equals the recorded INVITE CSeq
+ * causes the media expectations to be flushed.
+ */
+static int process_invite_response(struct sk_buff *skb, unsigned int dataoff,
+				   const char **dptr, unsigned int *datalen,
+				   unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_help *help = nfct_help(ct);
+
+	/* 1xx and 2xx */
+	if (code >= 100 && code <= 299)
+		return process_sdp(skb, dataoff, dptr, datalen, cseq);
+
+	if (help->help.ct_sip_info.invite_cseq == cseq)
+		flush_expectations(ct, true);
+
+	return NF_ACCEPT;
+}
+
+/* Handle a response to UPDATE: responses with a code of 100-299 are scanned
+ * for SDP; any other response whose CSeq equals the recorded INVITE CSeq
+ * causes the media expectations to be flushed.
+ */
+static int process_update_response(struct sk_buff *skb, unsigned int dataoff,
+				   const char **dptr, unsigned int *datalen,
+				   unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_help *help = nfct_help(ct);
+
+	/* 1xx and 2xx */
+	if (code >= 100 && code <= 299)
+		return process_sdp(skb, dataoff, dptr, datalen, cseq);
+
+	if (help->help.ct_sip_info.invite_cseq == cseq)
+		flush_expectations(ct, true);
+
+	return NF_ACCEPT;
+}
+
+/* Handle a response to PRACK: responses with a code of 100-299 are scanned
+ * for SDP; any other response whose CSeq equals the recorded INVITE CSeq
+ * causes the media expectations to be flushed.
+ */
+static int process_prack_response(struct sk_buff *skb, unsigned int dataoff,
+				  const char **dptr, unsigned int *datalen,
+				  unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_help *help = nfct_help(ct);
+
+	/* 1xx and 2xx */
+	if (code >= 100 && code <= 299)
+		return process_sdp(skb, dataoff, dptr, datalen, cseq);
+
+	if (help->help.ct_sip_info.invite_cseq == cseq)
+		flush_expectations(ct, true);
+
+	return NF_ACCEPT;
+}
+
+/* Handle an INVITE request: drop the existing media expectations, process the
+ * SDP and, if that was accepted, remember the request's CSeq so responses can
+ * be matched against it.
+ */
+static int process_invite_request(struct sk_buff *skb, unsigned int dataoff,
+				  const char **dptr, unsigned int *datalen,
+				  unsigned int cseq)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_help *help = nfct_help(ct);
+	unsigned int verdict;
+
+	flush_expectations(ct, true);
+
+	verdict = process_sdp(skb, dataoff, dptr, datalen, cseq);
+	if (verdict != NF_ACCEPT)
+		return verdict;
+
+	help->help.ct_sip_info.invite_cseq = cseq;
+	return NF_ACCEPT;
+}
+
+/* Handle a BYE request: flush the connection's media expectations and accept
+ * the packet. Only skb is used; the other parameters exist to match the
+ * request handler signature.
+ */
+static int process_bye_request(struct sk_buff *skb, unsigned int dataoff,
+			       const char **dptr, unsigned int *datalen,
+			       unsigned int cseq)
+{
+	enum ip_conntrack_info ctinfo;
+
+	flush_expectations(nf_ct_get(skb, &ctinfo), true);
+	return NF_ACCEPT;
+}
+
+/* Parse a REGISTER request and create a permanent expectation for incoming
+ * signalling connections. The expectation is marked inactive and is activated
+ * when receiving a response indicating success from the registrar.
+ *
+ * Returns NF_ACCEPT when the request is ignored or handled, NF_DROP when the
+ * Contact header or expires parameter reports a parse error or the
+ * expectation cannot be allocated or registered; with NAT active the verdict
+ * of the NAT expect hook is returned. Whenever the store_cseq label is
+ * reached with NF_ACCEPT, the request's CSeq is recorded for
+ * process_register_response().
+ */
+static int process_register_request(struct sk_buff *skb, unsigned int dataoff,
+				    const char **dptr, unsigned int *datalen,
+				    unsigned int cseq)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_help *help = nfct_help(ct);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned int matchoff, matchlen;
+	struct nf_conntrack_expect *exp;
+	union nf_inet_addr *saddr, daddr;
+	__be16 port;
+	u8 proto;
+	unsigned int expires = 0;
+	int ret;
+	typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect;
+
+	/* Expected connections can not register again. */
+	if (ct->status & IPS_EXPECTED)
+		return NF_ACCEPT;
+
+	/* We must check the expiration time: a value of zero signals the
+	 * registrar to release the binding. We'll remove our expectation
+	 * when receiving the new bindings in the response, but we don't
+	 * want to create new ones.
+	 *
+	 * The expiration time may be contained in Expires: header, the
+	 * Contact: header parameters or the URI parameters.
+	 */
+	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES,
+			      &matchoff, &matchlen) > 0)
+		expires = simple_strtoul(*dptr + matchoff, NULL, 10);
+
+	/* Extract address and port from the Contact header's URI; after this
+	 * call matchoff/matchlen describe the matched part of that header.
+	 */
+	ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+				      SIP_HDR_CONTACT, NULL,
+				      &matchoff, &matchlen, &daddr, &port);
+	if (ret < 0)
+		return NF_DROP;
+	else if (ret == 0)
+		return NF_ACCEPT;
+
+	/* We don't support third-party registrations */
+	if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &daddr))
+		return NF_ACCEPT;
+
+	/* Parameters are looked up in the text following the matched URI */
+	if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, *datalen,
+				   &proto) == 0)
+		return NF_ACCEPT;
+
+	/* An "expires=" parameter overrides the Expires: header value */
+	if (ct_sip_parse_numerical_param(ct, *dptr,
+					 matchoff + matchlen, *datalen,
+					 "expires=", NULL, NULL, &expires) < 0)
+		return NF_DROP;
+
+	/* Binding is being released: no new expectation, just note the CSeq */
+	if (expires == 0) {
+		ret = NF_ACCEPT;
+		goto store_cseq;
+	}
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return NF_DROP;
+
+	/* With sip_direct_signalling set, only the other endpoint of this
+	 * connection is accepted as source; otherwise any source matches.
+	 */
+	saddr = NULL;
+	if (sip_direct_signalling)
+		saddr = &ct->tuplehash[!dir].tuple.src.u3;
+
+	nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
+			  saddr, &daddr, proto, NULL, &port);
+	exp->timeout.expires = sip_timeout * HZ;
+	exp->helper = nfct_help(ct)->helper;
+	exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
+
+	/* With NAT active the NAT hook is handed the expectation and supplies
+	 * the verdict; otherwise it is registered here.
+	 */
+	nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook);
+	if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK)
+		ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp,
+					matchoff, matchlen);
+	else {
+		if (nf_ct_expect_related(exp) != 0)
+			ret = NF_DROP;
+		else
+			ret = NF_ACCEPT;
+	}
+	/* Drop the reference obtained from nf_ct_expect_alloc() */
+	nf_ct_expect_put(exp);
+
+store_cseq:
+	if (ret == NF_ACCEPT)
+		help->help.ct_sip_info.register_cseq = cseq;
+	return ret;
+}
+
+/* Handle the response to a REGISTER request.
+ *
+ * Responses whose CSeq differs from the last accepted REGISTER, and responses
+ * with a code of 100-199, are accepted without further action. For a code of
+ * 200-299 the Contact URIs are walked and the first matching signalling
+ * expectation is refreshed with the binding's expiry time. If nothing is
+ * refreshed, or the code lies outside 100-299, the signalling expectation is
+ * flushed.
+ *
+ * Returns NF_DROP if parsing a Contact URI or its expires parameter reports
+ * an error, NF_ACCEPT otherwise.
+ */
+static int process_register_response(struct sk_buff *skb, unsigned int dataoff,
+				     const char **dptr, unsigned int *datalen,
+				     unsigned int cseq, unsigned int code)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_help *help = nfct_help(ct);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	union nf_inet_addr addr;
+	__be16 port;
+	u8 proto;
+	unsigned int matchoff, matchlen, coff = 0;
+	unsigned int expires = 0;
+	int in_contact = 0, ret;
+
+	/* According to RFC 3261, "UAs MUST NOT send a new registration until
+	 * they have received a final response from the registrar for the
+	 * previous one or the previous REGISTER request has timed out".
+	 *
+	 * However, some servers fail to detect retransmissions and send late
+	 * responses, so we store the sequence number of the last valid
+	 * request and compare it here.
+	 */
+	if (help->help.ct_sip_info.register_cseq != cseq)
+		return NF_ACCEPT;
+
+	/* 1xx: nothing to do yet; anything other than 2xx: flush */
+	if (code >= 100 && code <= 199)
+		return NF_ACCEPT;
+	if (code < 200 || code > 299)
+		goto flush;
+
+	/* Default expiry from the Expires: header, if there is one */
+	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES,
+			      &matchoff, &matchlen) > 0)
+		expires = simple_strtoul(*dptr + matchoff, NULL, 10);
+
+	/* Walk the Contact URIs; coff and in_contact carry the parser's
+	 * position from one iteration to the next.
+	 */
+	while (1) {
+		unsigned int c_expires = expires;
+
+		ret = ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
+					      SIP_HDR_CONTACT, &in_contact,
+					      &matchoff, &matchlen,
+					      &addr, &port);
+		if (ret < 0)
+			return NF_DROP;
+		else if (ret == 0)
+			break;
+
+		/* We don't support third-party registrations */
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &addr))
+			continue;
+
+		if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen,
+					   *datalen, &proto) == 0)
+			continue;
+
+		/* A per-contact "expires=" overrides the header default */
+		ret = ct_sip_parse_numerical_param(ct, *dptr,
+						   matchoff + matchlen,
+						   *datalen, "expires=",
+						   NULL, NULL, &c_expires);
+		if (ret < 0)
+			return NF_DROP;
+		/* Expiry of zero: stop and fall through to the flush */
+		if (c_expires == 0)
+			break;
+		if (refresh_signalling_expectation(ct, &addr, proto, port,
+						   c_expires))
+			return NF_ACCEPT;
+	}
+
+flush:
+	flush_expectations(ct, false);
+	return NF_ACCEPT;
+}
+
+/* Dispatch table mapping SIP method names to their request and response
+ * handlers. A NULL handler means messages of that kind are not processed for
+ * the method. process_sip_request() and process_sip_response() return the
+ * verdict of the first entry that matches.
+ */
+static const struct sip_handler sip_handlers[] = {
+	SIP_HANDLER("INVITE", process_invite_request, process_invite_response),
+	SIP_HANDLER("UPDATE", process_sdp, process_update_response),
+	SIP_HANDLER("ACK", process_sdp, NULL),
+	SIP_HANDLER("PRACK", process_sdp, process_prack_response),
+	SIP_HANDLER("BYE", process_bye_request, NULL),
+	SIP_HANDLER("REGISTER", process_register_request, process_register_response),
+};
+
+/* Dispatch a SIP response to the response handler of the method named in its
+ * CSeq header.
+ *
+ * Returns NF_ACCEPT for messages that are too short or whose method has no
+ * response handler, NF_DROP if the status code or CSeq number is missing or
+ * zero, and the handler's verdict otherwise.
+ */
+static int process_sip_response(struct sk_buff *skb, unsigned int dataoff,
+				const char **dptr, unsigned int *datalen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	const struct sip_handler *h;
+	unsigned int hdroff, hdrlen, methoff;
+	unsigned int status, seq, n;
+
+	if (*datalen < strlen("SIP/2.0 200"))
+		return NF_ACCEPT;
+
+	status = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10);
+	if (status == 0)
+		return NF_DROP;
+
+	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
+			      &hdroff, &hdrlen) <= 0)
+		return NF_DROP;
+
+	seq = simple_strtoul(*dptr + hdroff, NULL, 10);
+	if (seq == 0)
+		return NF_DROP;
+
+	/* The method name is expected one byte past the matched CSeq value */
+	methoff = hdroff + hdrlen + 1;
+
+	for (n = 0; n < ARRAY_SIZE(sip_handlers); n++) {
+		h = &sip_handlers[n];
+		if (h->response != NULL &&
+		    *datalen >= methoff + h->len &&
+		    strnicmp(*dptr + methoff, h->method, h->len) == 0)
+			return h->response(skb, dataoff, dptr, datalen,
+					   seq, status);
+	}
+	return NF_ACCEPT;
+}
+
+/* Dispatch a SIP request to the request handler of the method it starts with.
+ *
+ * Returns NF_ACCEPT if no handler matches, NF_DROP if a handler matches but
+ * the CSeq header is missing or its number is zero, and the handler's verdict
+ * otherwise.
+ */
+static int process_sip_request(struct sk_buff *skb, unsigned int dataoff,
+			       const char **dptr, unsigned int *datalen)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	const struct sip_handler *h = NULL;
+	unsigned int hdroff, hdrlen;
+	unsigned int seq, n;
+
+	/* First table entry with a request handler and a matching method */
+	for (n = 0; n < ARRAY_SIZE(sip_handlers); n++) {
+		const struct sip_handler *cand = &sip_handlers[n];
+
+		if (cand->request != NULL &&
+		    *datalen >= cand->len &&
+		    strnicmp(*dptr, cand->method, cand->len) == 0) {
+			h = cand;
+			break;
+		}
+	}
+	if (h == NULL)
+		return NF_ACCEPT;
+
+	if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ,
+			      &hdroff, &hdrlen) <= 0)
+		return NF_DROP;
+
+	seq = simple_strtoul(*dptr + hdroff, NULL, 10);
+	if (seq == 0)
+		return NF_DROP;
+
+	return h->request(skb, dataoff, dptr, datalen, seq);
+}
+
+/* Process one SIP message: data starting with "SIP/2.0 " is handled as a
+ * response, anything else as a request. If the message was accepted and the
+ * connection is NATed, the NAT hook (when registered) is run as well; the
+ * packet is dropped if that hook returns zero.
+ */
+static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct,
+			   unsigned int dataoff, const char **dptr,
+			   unsigned int *datalen)
+{
+	typeof(nf_nat_sip_hook) nf_nat_sip;
+	int ret;
+
+	if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) == 0)
+		ret = process_sip_response(skb, dataoff, dptr, datalen);
+	else
+		ret = process_sip_request(skb, dataoff, dptr, datalen);
+
+	if (ret != NF_ACCEPT || !(ct->status & IPS_NAT_MASK))
+		return ret;
+
+	nf_nat_sip = rcu_dereference(nf_nat_sip_hook);
+	if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen))
+		return NF_DROP;
+
+	return ret;
+}
+
+/* Conntrack helper entry point for SIP over TCP.
+ *
+ * The TCP payload may hold several SIP messages. Each one is delimited using
+ * its Content-Length header and the blank line ending the headers, and is
+ * passed to process_sip_msg() in turn. Processing stops at the first payload
+ * remainder that has no parsable Content-Length or no header terminator.
+ *
+ * Returns NF_ACCEPT for packets that are not examined or were processed
+ * successfully, NF_DROP if linearizing fails or a message claims to extend
+ * beyond the available data, or the verdict of process_sip_msg().
+ */
+static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
+			struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	struct tcphdr *th, _tcph;
+	unsigned int dataoff, datalen;
+	unsigned int matchoff, matchlen, clen;
+	unsigned int msglen, origlen;
+	const char *dptr, *end;
+	s16 diff, tdiff = 0;
+	int ret = NF_ACCEPT;
+	bool term;
+	typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
+
+	/* Only look at packets of established connections */
+	if (ctinfo != IP_CT_ESTABLISHED &&
+	    ctinfo != IP_CT_ESTABLISHED_REPLY)
+		return NF_ACCEPT;
+
+	/* No Data ? */
+	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return NF_ACCEPT;
+	dataoff = protoff + th->doff * 4;
+	if (dataoff >= skb->len)
+		return NF_ACCEPT;
+
+	nf_ct_refresh(ct, skb, sip_timeout * HZ);
+
+	if (unlikely(skb_linearize(skb)))
+		return NF_DROP;
+
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+	if (datalen < strlen("SIP/2.0 200"))
+		return NF_ACCEPT;
+
+	while (1) {
+		if (ct_sip_get_header(ct, dptr, 0, datalen,
+				      SIP_HDR_CONTENT_LENGTH,
+				      &matchoff, &matchlen) <= 0)
+			break;
+
+		/* Body length; no digits consumed means nothing to parse */
+		clen = simple_strtoul(dptr + matchoff, (char **)&end, 10);
+		if (dptr + matchoff == end)
+			break;
+
+		/* Find the blank line (CRLF CRLF) that ends the headers */
+		term = false;
+		for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) {
+			if (end[0] == '\r' && end[1] == '\n' &&
+			    end[2] == '\r' && end[3] == '\n') {
+				term = true;
+				break;
+			}
+		}
+		if (!term)
+			break;
+		/* end now points just past the body of this message */
+		end += strlen("\r\n\r\n") + clen;
+
+		msglen = origlen = end - dptr;
+		if (msglen > datalen)
+			return NF_DROP;
+
+		/* process_sip_msg() receives msglen by pointer and may change
+		 * it (presumably when NAT rewrites the message); diff is the
+		 * resulting size change and tdiff the running total.
+		 */
+		ret = process_sip_msg(skb, ct, dataoff, &dptr, &msglen);
+		if (ret != NF_ACCEPT)
+			break;
+		diff     = msglen - origlen;
+		tdiff   += diff;
+
+		/* Advance to the next message in the payload */
+		dataoff += msglen;
+		dptr    += msglen;
+		datalen  = datalen + diff - msglen;
+	}
+
+	/* Hand the accumulated size change to the NAT sequence adjust hook */
+	if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) {
+		nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook);
+		if (nf_nat_sip_seq_adjust)
+			nf_nat_sip_seq_adjust(skb, tdiff);
+	}
+
+	return ret;
+}
+
+/* Conntrack helper entry point for SIP over UDP: the datagram payload is
+ * treated as a single SIP message.
+ *
+ * Returns NF_ACCEPT for empty or too-short payloads, NF_DROP if the skb
+ * cannot be linearized, and the verdict of process_sip_msg() otherwise.
+ */
+static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
+			struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff = protoff + sizeof(struct udphdr);
+	unsigned int datalen;
+	const char *dptr;
+
+	/* Nothing beyond the UDP header */
+	if (skb->len <= dataoff)
+		return NF_ACCEPT;
+
+	nf_ct_refresh(ct, skb, sip_timeout * HZ);
+
+	if (unlikely(skb_linearize(skb)))
+		return NF_DROP;
+
+	datalen = skb->len - dataoff;
+	if (datalen < strlen("SIP/2.0 200"))
+		return NF_ACCEPT;
+
+	dptr = skb->data + dataoff;
+	return process_sip_msg(skb, ct, dataoff, &dptr, &datalen);
+}
+
+/* Four helpers per configured port - IPv4/UDP, IPv4/TCP, IPv6/UDP and
+ * IPv6/TCP, filled in by nf_conntrack_sip_init() - and the storage for their
+ * names ("sip" or "sip-<n>").
+ */
+static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
+static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly;
+
+/* Per-class expectation policy shared by all SIP helpers: the maximum number
+ * of expectations of each class and their timeout (presumably in seconds -
+ * confirm against struct nf_conntrack_expect_policy).
+ */
+static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
+	[SIP_EXPECT_SIGNALLING] = {
+		.name		= "signalling",
+		.max_expected	= 1,
+		.timeout	= 3 * 60,
+	},
+	[SIP_EXPECT_AUDIO] = {
+		.name		= "audio",
+		.max_expected	= 2 * IP_CT_DIR_MAX,
+		.timeout	= 3 * 60,
+	},
+	[SIP_EXPECT_VIDEO] = {
+		.name		= "video",
+		.max_expected	= 2 * IP_CT_DIR_MAX,
+		.timeout	= 3 * 60,
+	},
+	[SIP_EXPECT_IMAGE] = {
+		.name		= "image",
+		.max_expected	= IP_CT_DIR_MAX,
+		.timeout	= 3 * 60,
+	},
+};
+
+/* Unregister every SIP helper that has been set up, i.e. whose .me field is
+ * non-NULL. Used as the module exit function and by nf_conntrack_sip_init()
+ * when a registration fails.
+ */
+static void nf_conntrack_sip_fini(void)
+{
+	int i, j;
+
+	for (i = 0; i < ports_c; i++) {
+		for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+			struct nf_conntrack_helper *h = &sip[i][j];
+
+			if (h->me != NULL)
+				nf_conntrack_helper_unregister(h);
+		}
+	}
+}
+
+/* Module init: register four SIP helpers (IPv4/UDP, IPv4/TCP, IPv6/UDP,
+ * IPv6/TCP) for every configured port, defaulting to SIP_PORT when no port
+ * was given.
+ *
+ * Returns 0 on success. If a registration fails, nf_conntrack_sip_fini() is
+ * called and the error is returned.
+ */
+static int __init nf_conntrack_sip_init(void)
+{
+	int i, j, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = SIP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		/* Start with all four slots cleared so that .me == NULL marks
+		 * a helper that has not been set up yet.
+		 */
+		memset(&sip[i], 0, sizeof(sip[i]));
+
+		sip[i][0].tuple.src.l3num = AF_INET;
+		sip[i][0].tuple.dst.protonum = IPPROTO_UDP;
+		sip[i][0].help = sip_help_udp;
+		sip[i][1].tuple.src.l3num = AF_INET;
+		sip[i][1].tuple.dst.protonum = IPPROTO_TCP;
+		sip[i][1].help = sip_help_tcp;
+
+		sip[i][2].tuple.src.l3num = AF_INET6;
+		sip[i][2].tuple.dst.protonum = IPPROTO_UDP;
+		sip[i][2].help = sip_help_udp;
+		sip[i][3].tuple.src.l3num = AF_INET6;
+		sip[i][3].tuple.dst.protonum = IPPROTO_TCP;
+		sip[i][3].help = sip_help_tcp;
+
+		for (j = 0; j < ARRAY_SIZE(sip[i]); j++) {
+			sip[i][j].tuple.src.u.udp.port = htons(ports[i]);
+			sip[i][j].expect_policy = sip_exp_policy;
+			sip[i][j].expect_class_max = SIP_EXPECT_MAX;
+			sip[i][j].me = THIS_MODULE;
+
+			/* Helpers on the default port are named "sip"; others
+			 * get "sip-<n>" where n is the index into ports[], not
+			 * the port number.
+			 */
+			tmpname = &sip_names[i][j][0];
+			if (ports[i] == SIP_PORT)
+				sprintf(tmpname, "sip");
+			else
+				sprintf(tmpname, "sip-%u", i);
+			sip[i][j].name = tmpname;
+
+			pr_debug("port #%u: %u\n", i, ports[i]);
+
+			ret = nf_conntrack_helper_register(&sip[i][j]);
+			if (ret) {
+				printk(KERN_ERR "nf_ct_sip: failed to register"
+				       " helper for pf: %u port: %u\n",
+				       sip[i][j].tuple.src.l3num, ports[i]);
+				/* NOTE(review): the helper that just failed
+				 * still has .me set, so the call below will
+				 * try to unregister it as well - confirm that
+				 * this is safe.
+				 */
+				nf_conntrack_sip_fini();
+				return ret;
+			}
+		}
+	}
+	return 0;
+}
+
+module_init(nf_conntrack_sip_init);
+module_exit(nf_conntrack_sip_fini);
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 00000000..6e545e26
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,77 @@
+/*
+ *      SNMP service broadcast connection tracking helper
+ *
+ *      (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define SNMP_PORT	161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+/* Module parameter "timeout" (default 30), readable by the owner only; see
+ * the parameter description below for its meaning.
+ */
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+/* Optional hook called by snmp_conntrack_help() for connections with NAT
+ * set up; NULL while nothing has assigned it. It is exported, presumably so
+ * that a separate NAT module can assign it - confirm.
+ */
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+			unsigned int protoff,
+			struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+
+/* Conntrack helper callback for SNMP: run the generic broadcast helper with
+ * the configured timeout, then return the NAT hook's verdict if a hook is
+ * registered and the connection is NATed, or NF_ACCEPT otherwise.
+ */
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+		struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	typeof(nf_nat_snmp_hook) nat_hook;
+
+	nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+	nat_hook = rcu_dereference(nf_nat_snmp_hook);
+	if (nat_hook == NULL || !(ct->status & IPS_NAT_MASK))
+		return NF_ACCEPT;
+
+	return nat_hook(skb, protoff, ct, ctinfo);
+}
+
+/* Expectation policy of the SNMP helper: at most one expectation. The
+ * timeout field is filled in from the module parameter at init time.
+ */
+static struct nf_conntrack_expect_policy exp_policy = {
+	.max_expected	= 1,
+};
+
+/* The SNMP helper itself: IPv4, UDP, port SNMP_PORT (161). */
+static struct nf_conntrack_helper helper __read_mostly = {
+	.name			= "snmp",
+	.tuple.src.l3num	= NFPROTO_IPV4,
+	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_PORT),
+	.tuple.dst.protonum	= IPPROTO_UDP,
+	.me			= THIS_MODULE,
+	.help			= snmp_conntrack_help,
+	.expect_policy		= &exp_policy,
+};
+
+/* Module init: copy the "timeout" module parameter into the expectation
+ * policy and register the helper. Returns the registration result.
+ */
+static int __init nf_conntrack_snmp_init(void)
+{
+	exp_policy.timeout = timeout;
+	return nf_conntrack_helper_register(&helper);
+}
+
+/* Module exit: unregister the SNMP helper. */
+static void __exit nf_conntrack_snmp_fini(void)
+{
+	nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
new file mode 100644
index 00000000..05e9feb1
--- /dev/null
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -0,0 +1,590 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <linux/rculist_nulls.h>
+
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Print one tuple: first the layer-3 part, then the layer-4 part.
+ *
+ * Returns 0 if both printers returned 0, 1 otherwise.  The layer-4 printer
+ * is not called when the layer-3 printer already reported a problem.
+ */
+int
+print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
+            const struct nf_conntrack_l3proto *l3proto,
+            const struct nf_conntrack_l4proto *l4proto)
+{
+	if (l3proto->print_tuple(s, tuple))
+		return 1;
+
+	return l4proto->print_tuple(s, tuple) != 0;
+}
+EXPORT_SYMBOL_GPL(print_tuple);
+
+/* Per-open iteration state for the "nf_conntrack" proc listing. */
+struct ct_iter_state {
+	struct seq_net_private p;
+	unsigned int bucket;	/* hash bucket currently being walked */
+	u_int64_t time_now;	/* wall-clock ns sampled in ct_seq_start() */
+};
+
+/*
+ * Find the first entry of the conntrack hash table.
+ *
+ * Scans buckets from 0 upwards and leaves the index of the bucket that
+ * produced the result in the iterator state.  Returns NULL when every
+ * bucket is empty.
+ */
+static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+{
+	struct ct_iter_state *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct hlist_nulls_node *node;
+
+	iter->bucket = 0;
+	while (iter->bucket < net->ct.htable_size) {
+		node = rcu_dereference(
+			hlist_nulls_first_rcu(&net->ct.hash[iter->bucket]));
+		if (!is_a_nulls(node))
+			return node;
+		iter->bucket++;
+	}
+	return NULL;
+}
+
+/*
+ * Return the node following @head, moving on to later buckets as needed.
+ *
+ * A "nulls" end-of-chain marker carries a value.  If that value equals the
+ * bucket being walked, the walk continues with the next bucket (NULL once
+ * the table is exhausted).  If it differs, the bucket index is left alone
+ * and the walk restarts from the first node of the current bucket --
+ * presumably because the entry was moved to another chain while it was
+ * being read.
+ */
+static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
+				      struct hlist_nulls_node *head)
+{
+	struct net *net = seq_file_net(seq);
+	struct ct_iter_state *st = seq->private;
+
+	head = rcu_dereference(hlist_nulls_next_rcu(head));
+	while (is_a_nulls(head)) {
+		if (likely(get_nulls_value(head) == st->bucket)) {
+			if (++st->bucket >= net->ct.htable_size)
+				return NULL;
+		}
+		head = rcu_dereference(
+				hlist_nulls_first_rcu(
+					&net->ct.hash[st->bucket]));
+	}
+	return head;
+}
+
+/*
+ * Return the entry at position @pos (0-based) of the listing, or NULL if
+ * the table holds fewer than @pos + 1 entries.
+ */
+static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_nulls_node *node = ct_get_first(seq);
+
+	/* Step forward once per remaining position; stop early at the end. */
+	while (node && pos) {
+		node = ct_get_next(seq, node);
+		if (node)
+			pos--;
+	}
+
+	return pos ? NULL : node;
+}
+
+/*
+ * seq_file ->start: sample the current wall-clock time (consumed by
+ * ct_show_delta_time()), enter an RCU read-side section and position the
+ * walk at entry *pos.  Returns NULL if there is no such entry.  The RCU
+ * section is left again in ct_seq_stop().
+ */
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct ct_iter_state *st = seq->private;
+
+	st->time_now = ktime_to_ns(ktime_get_real());
+	rcu_read_lock();
+	return ct_get_idx(seq, *pos);
+}
+
+/* seq_file ->next: bump the position and return the entry after @v. */
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return ct_get_next(s, v);
+}
+
+/* seq_file ->stop: leave the RCU read-side section entered in ct_seq_start(). */
+static void ct_seq_stop(struct seq_file *s, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+/*
+ * Print "secctx=<context> " for the conntrack's security mark.
+ *
+ * Returns 0 without printing anything when the secid cannot be translated
+ * to a context; otherwise returns the result of seq_printf().
+ */
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	char *ctx;
+	u32 ctx_len;
+	int err;
+
+	if (security_secid_to_secctx(ct->secmark, &ctx, &ctx_len))
+		return 0;
+
+	err = seq_printf(s, "secctx=%s ", ctx);
+	security_release_secctx(ctx, ctx_len);
+
+	return err;
+}
+#else
+/* Security marks are compiled out: nothing to print. */
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+/*
+ * Print "delta-time=<seconds> ": the time between the conntrack's recorded
+ * start and the moment the listing was started (time_now).  A non-positive
+ * difference is printed as 0.
+ *
+ * Returns 0 without printing when the conntrack has no timestamp
+ * extension; otherwise returns the result of seq_printf().
+ */
+static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+	struct ct_iter_state *iter = s->private;
+	struct nf_conn_tstamp *ts = nf_conn_tstamp_find(ct);
+	s64 delta;
+
+	if (!ts)
+		return 0;
+
+	delta = iter->time_now - ts->start;
+	delta = (delta > 0) ? div_s64(delta, NSEC_PER_SEC) : 0;
+
+	return seq_printf(s, "delta-time=%llu ", (unsigned long long)delta);
+}
+#else
+/* Timestamping is compiled out: nothing to print. */
+static inline int
+ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
+/*
+ * seq_file ->show: print one conntrack entry.
+ *
+ * Returns 0 on success, and also when the entry is skipped (its use count
+ * already dropped to zero, or @v is the reply-direction hash of a
+ * conntrack).  Returns -ENOSPC as soon as any of the print calls reports
+ * a non-zero result.
+ */
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+	struct nf_conntrack_tuple_hash *hash = v;
+	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
+	const struct nf_conntrack_l3proto *l3proto;
+	const struct nf_conntrack_l4proto *l4proto;
+	int ret = 0;
+
+	NF_CT_ASSERT(ct);
+	/* Hold a reference while printing; skip entries already at zero. */
+	if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+		return 0;
+
+	/* we only want to print DIR_ORIGINAL */
+	if (NF_CT_DIRECTION(hash))
+		goto release;
+
+	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
+	NF_CT_ASSERT(l3proto);
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+	NF_CT_ASSERT(l4proto);
+
+	/* From here on every early exit reports -ENOSPC. */
+	ret = -ENOSPC;
+	/* Protocol names/numbers and remaining timeout in seconds (0 if no timer). */
+	if (seq_printf(s, "%-8s %u %-8s %u %ld ",
+		       l3proto->name, nf_ct_l3num(ct),
+		       l4proto->name, nf_ct_protonum(ct),
+		       timer_pending(&ct->timeout)
+		       ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
+		goto release;
+
+	/* Optional protocol-specific state. */
+	if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
+		goto release;
+
+	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+			l3proto, l4proto))
+		goto release;
+
+	if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
+		goto release;
+
+	if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
+		if (seq_printf(s, "[UNREPLIED] "))
+			goto release;
+
+	if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+			l3proto, l4proto))
+		goto release;
+
+	if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
+		goto release;
+
+	if (test_bit(IPS_ASSURED_BIT, &ct->status))
+		if (seq_printf(s, "[ASSURED] "))
+			goto release;
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	if (seq_printf(s, "mark=%u ", ct->mark))
+		goto release;
+#endif
+
+	if (ct_show_secctx(s, ct))
+		goto release;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
+		goto release;
+#endif
+
+	if (ct_show_delta_time(s, ct))
+		goto release;
+
+	/* The printed use count includes the reference taken above. */
+	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
+		goto release;
+
+	ret = 0;
+release:
+	nf_ct_put(ct);
+	return ret;
+}
+
+/* Iterator callbacks for the conntrack entry listing. */
+static const struct seq_operations ct_seq_ops = {
+	.start = ct_seq_start,
+	.next  = ct_seq_next,
+	.stop  = ct_seq_stop,
+	.show  = ct_seq_show
+};
+
+/* ->open: netns-aware seq_file with a struct ct_iter_state as private data. */
+static int ct_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ct_seq_ops,
+			sizeof(struct ct_iter_state));
+}
+
+/* File operations of the "nf_conntrack" proc entry (entry listing). */
+static const struct file_operations ct_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = ct_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * seq_file ->start for the per-CPU statistics file.
+ *
+ * Position 0 yields SEQ_START_TOKEN (the header line).  Position N > 0
+ * maps to CPU N - 1 or, if that CPU is not possible, the next possible
+ * one; *pos is then set to that CPU's number plus one.  Returns NULL when
+ * no possible CPU is left.
+ */
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	cpu = *pos-1;
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu + 1;
+			return per_cpu_ptr(net->ct.stat, cpu);
+		}
+		++cpu;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file ->next for the per-CPU statistics file: return the statistics
+ * of the first possible CPU numbered *pos or higher and set *pos to that
+ * CPU's number plus one.  Returns NULL when no possible CPU is left.
+ */
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	int cpu = *pos;
+
+	while (cpu < nr_cpu_ids) {
+		if (cpu_possible(cpu)) {
+			*pos = cpu + 1;
+			return per_cpu_ptr(net->ct.stat, cpu);
+		}
+		++cpu;
+	}
+
+	return NULL;
+}
+
+/* seq_file ->stop: nothing to release, ->start takes no lock. */
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * seq_file ->show for the per-CPU statistics file.
+ *
+ * Prints the column header for SEQ_START_TOKEN, otherwise one line of
+ * 8-digit hexadecimal counters for the CPU whose statistics @v points to.
+ * The first column ("entries") is the namespace-wide conntrack count and is
+ * therefore the same value on every line.  Always returns 0.
+ */
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq_file_net(seq);
+	unsigned int nr_conntracks = atomic_read(&net->ct.count);
+	const struct ip_conntrack_stat *st = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete search_restart\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
+			"%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
+		   nr_conntracks,
+		   st->searched,
+		   st->found,
+		   st->new,
+		   st->invalid,
+		   st->ignore,
+		   st->delete,
+		   st->delete_list,
+		   st->insert,
+		   st->insert_failed,
+		   st->drop,
+		   st->early_drop,
+		   st->error,		/* shown under the "icmp_error" heading */
+
+		   st->expect_new,
+		   st->expect_create,
+		   st->expect_delete,
+		   st->search_restart
+		);
+	return 0;
+}
+
+/* Iterator callbacks for the per-CPU statistics listing. */
+static const struct seq_operations ct_cpu_seq_ops = {
+	.start	= ct_cpu_seq_start,
+	.next	= ct_cpu_seq_next,
+	.stop	= ct_cpu_seq_stop,
+	.show	= ct_cpu_seq_show,
+};
+
+/* ->open: netns-aware seq_file; no iterator state beyond seq_net_private. */
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ct_cpu_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+/* File operations of the "nf_conntrack" entry in the proc net stat directory. */
+static const struct file_operations ct_cpu_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = ct_cpu_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * Create the two proc entries of a network namespace: the conntrack
+ * listing ("nf_conntrack", mode 0440) and the per-CPU statistics file of
+ * the same name in the namespace's stat directory.
+ *
+ * Returns 0 on success, -ENOMEM if either entry cannot be created.  When
+ * the second creation fails the first entry is removed again.
+ */
+static int nf_conntrack_standalone_init_proc(struct net *net)
+{
+	if (!proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops))
+		return -ENOMEM;
+
+	if (!proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
+			 &ct_cpu_seq_fops)) {
+		proc_net_remove(net, "nf_conntrack");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove both proc entries created by nf_conntrack_standalone_init_proc(),
+ * in the reverse order of their creation.
+ */
+static void nf_conntrack_standalone_fini_proc(struct net *net)
+{
+	remove_proc_entry("nf_conntrack", net->proc_net_stat);
+	proc_net_remove(net, "nf_conntrack");
+}
+#else
+/* Without CONFIG_PROC_FS there is nothing to create; always succeeds. */
+static int nf_conntrack_standalone_init_proc(struct net *net)
+{
+	return 0;
+}
+
+/* Without CONFIG_PROC_FS there is nothing to remove. */
+static void nf_conntrack_standalone_fini_proc(struct net *net)
+{
+}
+#endif /* CONFIG_PROC_FS */
+
+/* Sysctl support */
+
+#ifdef CONFIG_SYSCTL
+/* Log invalid packets of a given protocol */
+/* Bounds enforced on nf_conntrack_log_invalid by proc_dointvec_minmax. */
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+/* Handle of nf_ct_netfilter_table; only registered for init_net. */
+static struct ctl_table_header *nf_ct_netfilter_header;
+
+/*
+ * Template for the per-namespace conntrack sysctls.
+ *
+ * nf_conntrack_standalone_init_sysctl() duplicates this table for every
+ * namespace and re-points the .data of entries 1 to 4 at that namespace's
+ * fields BY INDEX, so the order of the first five entries must not change
+ * without updating that function.  Entries 0 and 5 refer to global
+ * variables and are shared by all namespaces.
+ */
+static ctl_table nf_ct_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_max",
+		.data		= &nf_conntrack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		/* [1] read-only */
+		.procname	= "nf_conntrack_count",
+		.data		= &init_net.ct.count,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		/* [2] read-only */
+		.procname       = "nf_conntrack_buckets",
+		.data           = &init_net.ct.htable_size,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0444,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		/* [3] */
+		.procname	= "nf_conntrack_checksum",
+		.data		= &init_net.ct.sysctl_checksum,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		/* [4] restricted to log_invalid_proto_min..max */
+		.procname	= "nf_conntrack_log_invalid",
+		.data		= &init_net.ct.sysctl_log_invalid,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &log_invalid_proto_min,
+		.extra2		= &log_invalid_proto_max,
+	},
+	{
+		.procname	= "nf_conntrack_expect_max",
+		.data		= &nf_ct_expect_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+/* NOTE(review): not referenced anywhere in this file -- confirm it is still needed. */
+#define NET_NF_CONNTRACK_MAX 2089
+
+/*
+ * Second "nf_conntrack_max" entry, registered under nf_ct_path ("net")
+ * for init_net only.  It controls the same variable as entry 0 of
+ * nf_ct_sysctl_table.
+ */
+static ctl_table nf_ct_netfilter_table[] = {
+	{
+		.procname	= "nf_conntrack_max",
+		.data		= &nf_conntrack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+/* sysctl directory path for nf_ct_netfilter_table. */
+static struct ctl_path nf_ct_path[] = {
+	{ .procname = "net", },
+	{ }
+};
+
+/*
+ * Register the conntrack sysctls of a network namespace.
+ *
+ * For init_net the nf_ct_netfilter_table is registered first.  Every
+ * namespace then gets its own copy of nf_ct_sysctl_table whose entries 1-4
+ * point at that namespace's fields.
+ *
+ * Returns 0 on success.  On any failure everything set up so far is undone,
+ * an error is logged and -ENOMEM is returned.
+ */
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	if (net_eq(net, &init_net)) {
+		nf_ct_netfilter_header =
+		       register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table);
+		if (!nf_ct_netfilter_header)
+			goto err_report;
+	}
+
+	table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto err_unreg_netfilter_table;
+
+	/* Indices follow the entry order of nf_ct_sysctl_table. */
+	table[1].data = &net->ct.count;
+	table[2].data = &net->ct.htable_size;
+	table[3].data = &net->ct.sysctl_checksum;
+	table[4].data = &net->ct.sysctl_log_invalid;
+
+	net->ct.sysctl_header = register_net_sysctl_table(net,
+					nf_net_netfilter_sysctl_path, table);
+	if (net->ct.sysctl_header)
+		return 0;
+
+	kfree(table);
+err_unreg_netfilter_table:
+	if (net_eq(net, &init_net))
+		unregister_sysctl_table(nf_ct_netfilter_header);
+err_report:
+	printk(KERN_ERR "nf_conntrack: can't register to sysctl.\n");
+	return -ENOMEM;
+}
+
+/*
+ * Undo nf_conntrack_standalone_init_sysctl(): unregister the init_net-only
+ * table (for init_net), then the namespace's own table, and free the
+ * duplicated table.
+ */
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	if (net_eq(net, &init_net))
+		unregister_sysctl_table(nf_ct_netfilter_header);
+	/* Fetch the table pointer from the header before unregistering it. */
+	table = net->ct.sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.sysctl_header);
+	kfree(table);
+}
+#else
+/* Without CONFIG_SYSCTL there is nothing to register; always succeeds. */
+static int nf_conntrack_standalone_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+/* Without CONFIG_SYSCTL there is nothing to unregister. */
+static void nf_conntrack_standalone_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Per-namespace setup: conntrack core, proc entries, default values for
+ * the checksum (1) and log_invalid (0) settings, then the sysctls.
+ *
+ * Returns 0 on success or the negative error of the step that failed; the
+ * steps completed before the failure are undone in reverse order.
+ */
+static int nf_conntrack_net_init(struct net *net)
+{
+	int err;
+
+	err = nf_conntrack_init(net);
+	if (err < 0)
+		return err;
+
+	err = nf_conntrack_standalone_init_proc(net);
+	if (err < 0)
+		goto undo_core;
+
+	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_log_invalid = 0;
+	err = nf_conntrack_standalone_init_sysctl(net);
+	if (err < 0)
+		goto undo_proc;
+
+	return 0;
+
+undo_proc:
+	nf_conntrack_standalone_fini_proc(net);
+undo_core:
+	nf_conntrack_cleanup(net);
+	return err;
+}
+
+/*
+ * Per-namespace teardown: the exact reverse of nf_conntrack_net_init()
+ * (sysctls, proc entries, conntrack core).
+ */
+static void nf_conntrack_net_exit(struct net *net)
+{
+	nf_conntrack_standalone_fini_sysctl(net);
+	nf_conntrack_standalone_fini_proc(net);
+	nf_conntrack_cleanup(net);
+}
+
+/* Per-network-namespace init/exit callbacks. */
+static struct pernet_operations nf_conntrack_net_ops = {
+	.init = nf_conntrack_net_init,
+	.exit = nf_conntrack_net_exit,
+};
+
+/* Module init: register the per-namespace callbacks. */
+static int __init nf_conntrack_standalone_init(void)
+{
+	return register_pernet_subsys(&nf_conntrack_net_ops);
+}
+
+/* Module exit: unregister the per-namespace callbacks. */
+static void __exit nf_conntrack_standalone_fini(void)
+{
+	unregister_pernet_subsys(&nf_conntrack_net_ops);
+}
+
+module_init(nf_conntrack_standalone_init);
+module_exit(nf_conntrack_standalone_fini);
+
+/*
+ * Some modules need this module to be loaded but do not reference any of
+ * its symbols directly.  They should call this (deliberately empty)
+ * function so that a dependency on the exported symbol is created.
+ */
+void need_conntrack(void)
+{
+}
+EXPORT_SYMBOL_GPL(need_conntrack);
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
new file mode 100644
index 00000000..75466fd7
--- /dev/null
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -0,0 +1,153 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_tftp.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("TFTP connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_tftp");
+MODULE_ALIAS_NFCT_HELPER("tftp");
+
+/* Upper bound on the number of TFTP server ports given via "ports". */
+#define MAX_PORTS 8
+static unsigned short ports[MAX_PORTS];
+/* Number of valid entries in ports[]. */
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "Port numbers of TFTP servers");
+
+/*
+ * Optional NAT callback.  tftp_help() calls it, instead of registering the
+ * expectation itself, when it is non-NULL and the connection has a NAT
+ * status bit set.
+ */
+unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb,
+				 enum ip_conntrack_info ctinfo,
+				 struct nf_conntrack_expect *exp) __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_tftp_hook);
+
+/*
+ * Conntrack helper callback for TFTP.
+ *
+ * Reads the TFTP header that follows the UDP header.  For read and write
+ * requests an expectation is set up from the reply-direction tuple of the
+ * connection, and either passed to the NAT hook or registered directly.
+ * All other opcodes are only logged at debug level.
+ *
+ * Returns NF_ACCEPT, the NAT hook's verdict, or NF_DROP when the
+ * expectation cannot be allocated or registered.  Packets too short to
+ * contain a TFTP header are accepted untouched.
+ */
+static int tftp_help(struct sk_buff *skb,
+		     unsigned int protoff,
+		     struct nf_conn *ct,
+		     enum ip_conntrack_info ctinfo)
+{
+	const struct tftphdr *tfh;
+	struct tftphdr _tftph;
+	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_tuple *tuple;
+	unsigned int ret = NF_ACCEPT;
+	typeof(nf_nat_tftp_hook) nf_nat_tftp;
+
+	/* _tftph is the bounce buffer skb_header_pointer() may copy into. */
+	tfh = skb_header_pointer(skb, protoff + sizeof(struct udphdr),
+				 sizeof(_tftph), &_tftph);
+	if (tfh == NULL)
+		return NF_ACCEPT;
+
+	switch (ntohs(tfh->opcode)) {
+	case TFTP_OPCODE_READ:
+	case TFTP_OPCODE_WRITE:
+		/* RRQ and WRQ works the same way */
+		nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+		exp = nf_ct_expect_alloc(ct);
+		if (exp == NULL)
+			return NF_DROP;
+		/*
+		 * Addresses and destination port come from the reply tuple;
+		 * NULL is passed for the source port (presumably "any port"
+		 * -- see nf_ct_expect_init()).
+		 */
+		tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
+				  nf_ct_l3num(ct),
+				  &tuple->src.u3, &tuple->dst.u3,
+				  IPPROTO_UDP, NULL, &tuple->dst.u.udp.port);
+
+		pr_debug("expect: ");
+		nf_ct_dump_tuple(&exp->tuple);
+
+		nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
+		if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
+			ret = nf_nat_tftp(skb, ctinfo, exp);
+		else if (nf_ct_expect_related(exp) != 0)
+			ret = NF_DROP;
+		/* Drop the local reference on both paths. */
+		nf_ct_expect_put(exp);
+		break;
+	case TFTP_OPCODE_DATA:
+	case TFTP_OPCODE_ACK:
+		pr_debug("Data/ACK opcode\n");
+		break;
+	case TFTP_OPCODE_ERROR:
+		pr_debug("Error opcode\n");
+		break;
+	default:
+		pr_debug("Unknown opcode\n");
+	}
+	return ret;
+}
+
+/* One helper per configured port and address family ([0] IPv4, [1] IPv6). */
+static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly;
+/* Backing storage for the helpers' names; sized for "tftp-65535". */
+static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly;
+
+/* Shared expectation policy: one expectation, timeout value 5 * 60. */
+static const struct nf_conntrack_expect_policy tftp_exp_policy = {
+	.max_expected	= 1,
+	.timeout	= 5 * 60,
+};
+
+/*
+ * Unregister both helpers (IPv4 and IPv6) of every one of the ports_c
+ * configured ports.  Used as the module exit handler.
+ */
+static void nf_conntrack_tftp_fini(void)
+{
+	int i, j;
+
+	for (i = 0; i < ports_c; i++) {
+		for (j = 0; j < 2; j++)
+			nf_conntrack_helper_unregister(&tftp[i][j]);
+	}
+}
+
+/*
+ * Module init: register an IPv4 and an IPv6 helper for every configured
+ * TFTP server port (TFTP_PORT alone when the "ports" parameter was not
+ * given).  A helper on TFTP_PORT is named "tftp", any other one
+ * "tftp-<index into ports[]>".
+ *
+ * Returns 0 on success or the error from nf_conntrack_helper_register().
+ * On failure only the helpers that were successfully registered before the
+ * failing one are unregistered again; the failing helper and the slots not
+ * yet reached are never handed to nf_conntrack_helper_unregister().
+ */
+static int __init nf_conntrack_tftp_init(void)
+{
+	int i, j, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = TFTP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		memset(&tftp[i], 0, sizeof(tftp[i]));
+
+		tftp[i][0].tuple.src.l3num = AF_INET;
+		tftp[i][1].tuple.src.l3num = AF_INET6;
+		for (j = 0; j < 2; j++) {
+			tftp[i][j].tuple.dst.protonum = IPPROTO_UDP;
+			tftp[i][j].tuple.src.u.udp.port = htons(ports[i]);
+			tftp[i][j].expect_policy = &tftp_exp_policy;
+			tftp[i][j].me = THIS_MODULE;
+			tftp[i][j].help = tftp_help;
+
+			tmpname = &tftp_names[i][j][0];
+			if (ports[i] == TFTP_PORT)
+				sprintf(tmpname, "tftp");
+			else
+				sprintf(tmpname, "tftp-%u", i);
+			tftp[i][j].name = tmpname;
+
+			ret = nf_conntrack_helper_register(&tftp[i][j]);
+			if (ret) {
+				printk(KERN_ERR "nf_ct_tftp: failed to register"
+				       " helper for pf: %u port: %u\n",
+					tftp[i][j].tuple.src.l3num, ports[i]);
+				goto err_unwind;
+			}
+		}
+	}
+	return 0;
+
+err_unwind:
+	/*
+	 * tftp[i][j] is the helper that failed to register.  Walk backwards
+	 * over the ones registered before it: first the earlier family of
+	 * the current port, then both families of every earlier port.
+	 */
+	while (j-- > 0)
+		nf_conntrack_helper_unregister(&tftp[i][j]);
+	while (i-- > 0) {
+		for (j = 0; j < 2; j++)
+			nf_conntrack_helper_unregister(&tftp[i][j]);
+	}
+	return ret;
+}
+
+module_init(nf_conntrack_tftp_init);
+module_exit(nf_conntrack_tftp_fini);
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 00000000..af7dd31a
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,120 @@
+/*
+ * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+
+/*
+ * Value of the "tstamp" module parameter; copied into every namespace's
+ * sysctl_tstamp by nf_conntrack_tstamp_init().
+ */
+static int nf_ct_tstamp __read_mostly;
+
+module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
+MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Template for the per-namespace "nf_conntrack_timestamp" sysctl; entry 0
+ * is re-pointed at the namespace's own field after duplication.
+ */
+static struct ctl_table tstamp_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_timestamp",
+		.data		= &init_net.ct.sysctl_tstamp,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+#endif /* CONFIG_SYSCTL */
+
+/* Conntrack extension descriptor for struct nf_conn_tstamp. */
+static struct nf_ct_ext_type tstamp_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_tstamp),
+	.align	= __alignof__(struct nf_conn_tstamp),
+	.id	= NF_CT_EXT_TSTAMP,
+};
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the namespace's "nf_conntrack_timestamp" sysctl using a private
+ * copy of tstamp_sysctl_table.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated or the
+ * registration fails (the latter is also logged and the copy freed).
+ */
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	table[0].data = &net->ct.sysctl_tstamp;
+
+	net->ct.tstamp_sysctl_header = register_net_sysctl_table(net,
+			nf_net_netfilter_sysctl_path, table);
+	if (net->ct.tstamp_sysctl_header)
+		return 0;
+
+	printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+	kfree(table);
+	return -ENOMEM;
+}
+
+/* Unregister the namespace's timestamp sysctl and free its table copy. */
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	/* Fetch the table pointer from the header before unregistering it. */
+	table = net->ct.tstamp_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
+	kfree(table);
+}
+#else
+/* Without CONFIG_SYSCTL there is nothing to register; always succeeds. */
+static int nf_conntrack_tstamp_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+/* Without CONFIG_SYSCTL there is nothing to unregister. */
+static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+/*
+ * Per-namespace setup of flow timestamping.
+ *
+ * Seeds the namespace's sysctl value from the module parameter, registers
+ * the conntrack extension (init_net only) and the sysctl.
+ *
+ * Returns 0 on success or a negative error.  If the sysctl step fails for
+ * init_net, the extension is unregistered again.
+ */
+int nf_conntrack_tstamp_init(struct net *net)
+{
+	int ret;
+
+	net->ct.sysctl_tstamp = nf_ct_tstamp;
+
+	if (net_eq(net, &init_net)) {
+		ret = nf_ct_extend_register(&tstamp_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_ct_tstamp: Unable to register "
+					"extension\n");
+			return ret;
+		}
+	}
+
+	ret = nf_conntrack_tstamp_init_sysctl(net);
+	if (ret >= 0)
+		return 0;
+
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&tstamp_extend);
+	return ret;
+}
+
+/*
+ * Per-namespace teardown: remove the sysctl and, for init_net, unregister
+ * the conntrack extension.
+ */
+void nf_conntrack_tstamp_fini(struct net *net)
+{
+	nf_conntrack_tstamp_fini_sysctl(net);
+	if (net_eq(net, &init_net))
+		nf_ct_extend_unregister(&tstamp_extend);
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
new file mode 100644
index 00000000..770f7643
--- /dev/null
+++ b/net/netfilter/nf_internals.h
@@ -0,0 +1,38 @@
+#ifndef _NF_INTERNALS_H
+#define _NF_INTERNALS_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+/*
+ * NFDEBUG(): printk at KERN_DEBUG level when CONFIG_NETFILTER_DEBUG is set,
+ * otherwise expands to nothing (the arguments are then not evaluated).
+ */
+#ifdef CONFIG_NETFILTER_DEBUG
+#define NFDEBUG(format, args...)  printk(KERN_DEBUG format , ## args)
+#else
+#define NFDEBUG(format, args...)
+#endif
+
+
+/* core.c */
+extern unsigned int nf_iterate(struct list_head *head,
+				struct sk_buff *skb,
+				unsigned int hook,
+				const struct net_device *indev,
+				const struct net_device *outdev,
+				struct list_head **i,
+				int (*okfn)(struct sk_buff *),
+				int hook_thresh);
+
+/* nf_queue.c */
+extern int nf_queue(struct sk_buff *skb,
+		    struct list_head *elem,
+		    u_int8_t pf, unsigned int hook,
+		    struct net_device *indev,
+		    struct net_device *outdev,
+		    int (*okfn)(struct sk_buff *),
+		    unsigned int queuenum);
+extern int __init netfilter_queue_init(void);
+
+/* nf_log.c */
+extern int __init netfilter_log_init(void);
+
+#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
new file mode 100644
index 00000000..20714edf
--- /dev/null
+++ b/net/netfilter/nf_log.c
@@ -0,0 +1,318 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_log.h>
+
+#include "nf_internals.h"
+
+/* Internal logging interface, which relies on the real
+   LOG target modules */
+
+/* Size of the on-stack prefix buffer in nf_log_packet(). */
+#define NF_LOG_PREFIXLEN		128
+/* Size of the name buffer (and sysctl maxlen) for logger names. */
+#define NFLOGGER_NAME_LEN		64
+
+/* Logger currently bound to each protocol family (NULL if none). */
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+/* Per-family lists of all registered loggers. */
+static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
+/* Serialises updates of the two arrays above. */
+static DEFINE_MUTEX(nf_log_mutex);
+
+/*
+ * Look up a logger registered for family @pf by name.
+ *
+ * The comparison is case-insensitive and only covers strlen(t->name)
+ * characters, so @str_logger matches as soon as it STARTS with a logger's
+ * name; trailing characters are ignored.  Returns the first match in list
+ * order or NULL.  Callers in this file hold nf_log_mutex.
+ */
+static struct nf_logger *__find_logger(int pf, const char *str_logger)
+{
+	struct nf_logger *t;
+
+	list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) {
+		if (!strnicmp(str_logger, t->name, strlen(t->name)))
+			return t;
+	}
+
+	return NULL;
+}
+
+/*
+ * Register @logger for protocol family @pf.
+ *
+ * With NFPROTO_UNSPEC the logger is appended to the list of every family
+ * but not bound to any of them.  Otherwise it is appended to the list of
+ * @pf and additionally becomes the bound logger of that family if none is
+ * bound yet.
+ *
+ * Returns -EINVAL if @pf is out of range, 0 otherwise.  No check for an
+ * already registered logger is made.
+ */
+int nf_log_register(u_int8_t pf, struct nf_logger *logger)
+{
+	const struct nf_logger *llog;
+	int i;
+
+	if (pf >= ARRAY_SIZE(nf_loggers))
+		return -EINVAL;
+
+	/* Initialise every node so nf_log_unregister() can list_del() them all. */
+	for (i = 0; i < ARRAY_SIZE(logger->list); i++)
+		INIT_LIST_HEAD(&logger->list[i]);
+
+	mutex_lock(&nf_log_mutex);
+
+	if (pf == NFPROTO_UNSPEC) {
+		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+			list_add_tail(&(logger->list[i]), &(nf_loggers_l[i]));
+	} else {
+		/* register at end of list to honor first register win */
+		list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
+		llog = rcu_dereference_protected(nf_loggers[pf],
+						 lockdep_is_held(&nf_log_mutex));
+		if (llog == NULL)
+			rcu_assign_pointer(nf_loggers[pf], logger);
+	}
+
+	mutex_unlock(&nf_log_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(nf_log_register);
+
+/*
+ * Unregister @logger from all protocol families: unbind it wherever it is
+ * the bound logger, remove it from every per-family list, then wait for an
+ * RCU grace period before returning.
+ */
+void nf_log_unregister(struct nf_logger *logger)
+{
+	const struct nf_logger *c_logger;
+	int i;
+
+	mutex_lock(&nf_log_mutex);
+	for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
+		c_logger = rcu_dereference_protected(nf_loggers[i],
+						     lockdep_is_held(&nf_log_mutex));
+		if (c_logger == logger)
+			rcu_assign_pointer(nf_loggers[i], NULL);
+		/* Done for every family, whether or not the node was listed. */
+		list_del(&logger->list[i]);
+	}
+	mutex_unlock(&nf_log_mutex);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unregister);
+
+/*
+ * Make @logger the bound logger of protocol family @pf.
+ *
+ * Returns -EINVAL if @pf is out of range, -ENOENT if no logger matching
+ * @logger's name is registered for @pf, 0 on success.
+ */
+int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
+{
+	int ret = 0;
+
+	if (pf >= ARRAY_SIZE(nf_loggers))
+		return -EINVAL;
+
+	mutex_lock(&nf_log_mutex);
+	if (__find_logger(pf, logger->name))
+		rcu_assign_pointer(nf_loggers[pf], logger);
+	else
+		ret = -ENOENT;
+	mutex_unlock(&nf_log_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL(nf_log_bind_pf);
+
+/*
+ * Remove the bound logger of protocol family @pf.  The logger stays
+ * registered.  An out-of-range @pf is silently ignored.
+ */
+void nf_log_unbind_pf(u_int8_t pf)
+{
+	if (pf >= ARRAY_SIZE(nf_loggers))
+		return;
+	mutex_lock(&nf_log_mutex);
+	rcu_assign_pointer(nf_loggers[pf], NULL);
+	mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_unbind_pf);
+
+/*
+ * Log a packet through the logger bound to protocol family @pf.
+ *
+ * @fmt and the variable arguments are formatted into a prefix of at most
+ * NF_LOG_PREFIXLEN bytes (longer output is truncated by vsnprintf) which
+ * is handed to the logger together with the remaining arguments.  Nothing
+ * happens when no logger is bound.  @pf is used as an array index without
+ * a range check.
+ */
+void nf_log_packet(u_int8_t pf,
+		   unsigned int hooknum,
+		   const struct sk_buff *skb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   const struct nf_loginfo *loginfo,
+		   const char *fmt, ...)
+{
+	const struct nf_logger *logger;
+	char prefix[NF_LOG_PREFIXLEN];
+	va_list ap;
+
+	rcu_read_lock();
+	logger = rcu_dereference(nf_loggers[pf]);
+	if (!logger)
+		goto unlock;
+
+	va_start(ap, fmt);
+	vsnprintf(prefix, sizeof(prefix), fmt, ap);
+	va_end(ap);
+	logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
+unlock:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_packet);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file ->start: take nf_log_mutex (released in seq_stop(), also when
+ * NULL is returned here) and return @pos itself as the iterator; *pos is
+ * the protocol family index.  Returns NULL once *pos is past the last
+ * family.
+ */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	mutex_lock(&nf_log_mutex);
+
+	if (*pos >= ARRAY_SIZE(nf_loggers))
+		return NULL;
+
+	return pos;
+}
+
+/* seq_file ->next: advance to the next family index, NULL after the last. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	if (*pos >= ARRAY_SIZE(nf_loggers))
+		return NULL;
+
+	return pos;
+}
+
+/* seq_file ->stop: release the mutex taken in seq_start(). */
+static void seq_stop(struct seq_file *s, void *v)
+{
+	mutex_unlock(&nf_log_mutex);
+}
+
+/*
+ * seq_file ->show: print one line for the family index *v in the form
+ *
+ *   <index> <bound logger name or NONE> (<registered names, comma separated>)
+ *
+ * Runs with nf_log_mutex held (taken in seq_start()).  Returns a negative
+ * seq_printf() result as soon as one occurs, otherwise the result of the
+ * final seq_printf().
+ */
+static int seq_show(struct seq_file *s, void *v)
+{
+	loff_t *pos = v;
+	const struct nf_logger *logger;
+	struct nf_logger *t;
+	int ret;
+
+	logger = rcu_dereference_protected(nf_loggers[*pos],
+					   lockdep_is_held(&nf_log_mutex));
+
+	if (!logger)
+		ret = seq_printf(s, "%2lld NONE (", *pos);
+	else
+		ret = seq_printf(s, "%2lld %s (", *pos, logger->name);
+
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) {
+		ret = seq_printf(s, "%s", t->name);
+		if (ret < 0)
+			return ret;
+		/* Separator after every entry except the last one in the list. */
+		if (&t->list[*pos] != nf_loggers_l[*pos].prev) {
+			ret = seq_printf(s, ",");
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return seq_printf(s, ")\n");
+}
+
+/* Iterator callbacks for the "nf_log" proc file. */
+static const struct seq_operations nflog_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+/* ->open: plain seq_file, no private state. */
+static int nflog_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nflog_seq_ops);
+}
+
+/* File operations of the "nf_log" proc file. */
+static const struct file_operations nflog_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nflog_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+
+#endif /* PROC_FS */
+
+#ifdef CONFIG_SYSCTL
+/* sysctl directory: net/netfilter/nf_log */
+static struct ctl_path nf_log_sysctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "netfilter", },
+	{ .procname = "nf_log", },
+	{ }
+};
+
+/* Entry names: the family number in decimal, at most two digits plus NUL. */
+static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
+/* One entry per family; the extra slot stays zeroed as the terminator. */
+static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table_header *nf_log_dir_header;
+
+/*
+ * sysctl handler for the per-family entries below net/netfilter/nf_log.
+ *
+ * table->extra1 carries the protocol family index.
+ *
+ * Write: the string "NONE" unbinds the family's logger.  Any other string
+ *        is looked up with __find_logger() and, if found, bound.  Input is
+ *        truncated to NFLOGGER_NAME_LEN - 1 bytes and cut at the first
+ *        newline.  Returns 0 on success, -EFAULT if the user buffer cannot
+ *        be read, -ENOENT if no registered logger matches.
+ * Read:  reports the bound logger's name, or "NONE", through
+ *        proc_dostring() and returns its result.
+ */
+static int nf_log_proc_dostring(ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	const struct nf_logger *logger;
+	char buf[NFLOGGER_NAME_LEN];
+	size_t size = *lenp;
+	int r = 0;
+	int tindex = (unsigned long)table->extra1;
+
+	if (write) {
+		char *nl;
+
+		/* Leave room for the terminator added below. */
+		if (size > sizeof(buf) - 1)
+			size = sizeof(buf) - 1;
+		if (copy_from_user(buf, buffer, size))
+			return -EFAULT;
+
+		/*
+		 * The copied data is not NUL-terminated.  Terminate it so
+		 * that the string comparisons below cannot run into
+		 * uninitialised stack bytes.
+		 */
+		buf[size] = '\0';
+		/* "echo NONE > ..." appends a newline; compare without it. */
+		nl = strchr(buf, '\n');
+		if (nl)
+			*nl = '\0';
+
+		if (!strcmp(buf, "NONE")) {
+			nf_log_unbind_pf(tindex);
+			return 0;
+		}
+		mutex_lock(&nf_log_mutex);
+		logger = __find_logger(tindex, buf);
+		if (logger == NULL) {
+			mutex_unlock(&nf_log_mutex);
+			return -ENOENT;
+		}
+		rcu_assign_pointer(nf_loggers[tindex], logger);
+		mutex_unlock(&nf_log_mutex);
+	} else {
+		mutex_lock(&nf_log_mutex);
+		logger = rcu_dereference_protected(nf_loggers[tindex],
+						   lockdep_is_held(&nf_log_mutex));
+		/* table->data is only valid while the mutex is held. */
+		if (!logger)
+			table->data = "NONE";
+		else
+			table->data = logger->name;
+		r = proc_dostring(table, write, buffer, lenp, ppos);
+		mutex_unlock(&nf_log_mutex);
+	}
+
+	return r;
+}
+
+/*
+ * Build one sysctl entry per protocol family (named after the family
+ * number, mode 0644, handled by nf_log_proc_dostring() with the family
+ * index stored in extra1) and register the table below
+ * net/netfilter/nf_log.
+ *
+ * Returns 0 on success, -ENOMEM if the registration fails.
+ */
+static __init int netfilter_log_sysctl_init(void)
+{
+	int i;
+
+	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+		snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i);
+		nf_log_sysctl_table[i].procname	=
+			nf_log_sysctl_fnames[i-NFPROTO_UNSPEC];
+		/* .data is filled in by the handler on every read. */
+		nf_log_sysctl_table[i].data = NULL;
+		nf_log_sysctl_table[i].maxlen =
+			NFLOGGER_NAME_LEN * sizeof(char);
+		nf_log_sysctl_table[i].mode = 0644;
+		nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring;
+		nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i;
+	}
+
+	nf_log_dir_header = register_sysctl_paths(nf_log_sysctl_path,
+				       nf_log_sysctl_table);
+	if (!nf_log_dir_header)
+		return -ENOMEM;
+
+	return 0;
+}
+#else
+/* Without CONFIG_SYSCTL there is nothing to register; always succeeds. */
+static __init int netfilter_log_sysctl_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Boot-time initialisation of the logging core: per-family logger lists,
+ * the "nf_log" proc file and the nf_log sysctl directory.
+ *
+ * Returns 0 on success or a negative errno (-ENOMEM when the proc entry
+ * cannot be created, otherwise the sysctl setup error).
+ */
+int __init netfilter_log_init(void)
+{
+	int i, r;
+
+	/*
+	 * Initialise the list heads first: the proc and sysctl handlers
+	 * registered below walk them.
+	 */
+	for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
+		INIT_LIST_HEAD(&(nf_loggers_l[i]));
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("nf_log", S_IRUGO,
+			 proc_net_netfilter, &nflog_file_ops))
+		return -ENOMEM;
+#endif
+
+	/* Errors will trigger panic, unroll on error is unnecessary. */
+	r = netfilter_log_sysctl_init();
+	if (r < 0)
+		return r;
+
+	return 0;
+}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
new file mode 100644
index 00000000..5b466cd1
--- /dev/null
+++ b/net/netfilter/nf_queue.c
@@ -0,0 +1,395 @@
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <net/protocol.h>
+#include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
+
+#include "nf_internals.h"
+
+/*
+ * A queue handler may be registered for each protocol.  Each is protected by
+ * a long-term mutex.  The handler must provide an outfn() to accept packets
+ * for queueing and must reinject all packets it receives, no matter what.
+ */
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;	/* one handler slot per protocol family, read under RCU */
+
+static DEFINE_MUTEX(queue_handler_mutex);	/* serialises updates of queue_handler[] */
+
+/* Install @qh as the queue handler of family @pf.  Returns 0 on success,
+ * -EINVAL for an out-of-range @pf, -EEXIST if @qh is already the handler
+ * and -EBUSY if a different handler holds the slot. */
+int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
+{
+	const struct nf_queue_handler *cur;
+	int err = 0;
+
+	if (pf >= ARRAY_SIZE(queue_handler))
+		return -EINVAL;
+
+	/* the mutex serialises updaters; the slot itself is RCU-published */
+	mutex_lock(&queue_handler_mutex);
+	cur = rcu_dereference_protected(queue_handler[pf],
+					lockdep_is_held(&queue_handler_mutex));
+	if (cur == qh)
+		err = -EEXIST;
+	else if (cur != NULL)
+		err = -EBUSY;
+	else
+		rcu_assign_pointer(queue_handler[pf], qh);
+	mutex_unlock(&queue_handler_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL(nf_register_queue_handler);
+
+/* The caller must flush their queue before this */
+int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
+{
+	const struct nf_queue_handler *cur;
+	int foreign;
+
+	if (pf >= ARRAY_SIZE(queue_handler))
+		return -EINVAL;
+
+	mutex_lock(&queue_handler_mutex);
+	cur = rcu_dereference_protected(queue_handler[pf],
+					lockdep_is_held(&queue_handler_mutex));
+	/* only an empty slot or the caller's own handler may be cleared */
+	foreign = cur != NULL && cur != qh;
+	if (!foreign)
+		rcu_assign_pointer(queue_handler[pf], NULL);
+	mutex_unlock(&queue_handler_mutex);
+	if (foreign)
+		return -EINVAL;
+	/* wait until no RCU reader can still be using the old handler */
+	synchronize_rcu();
+	return 0;
+}
+EXPORT_SYMBOL(nf_unregister_queue_handler);
+
+/* Drop @qh from every protocol family it is currently registered for. */
+void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
+{
+	const struct nf_queue_handler *cur;
+	u_int8_t pf;
+
+	mutex_lock(&queue_handler_mutex);
+	for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++)  {
+		cur = rcu_dereference_protected(queue_handler[pf],
+					lockdep_is_held(&queue_handler_mutex));
+		if (cur == qh)
+			rcu_assign_pointer(queue_handler[pf], NULL);
+	}
+	mutex_unlock(&queue_handler_mutex);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
+
+static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+{
+#ifdef CONFIG_BRIDGE_NETFILTER
+	struct nf_bridge_info *br = entry->skb->nf_bridge;
+#endif
+	/* Release those devices we held, or Alexey will kill me. */
+	if (entry->indev)
+		dev_put(entry->indev);
+	if (entry->outdev)
+		dev_put(entry->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+	/* bridged packets also pin the physical in/out devices */
+	if (br && br->physindev)
+		dev_put(br->physindev);
+	if (br && br->physoutdev)
+		dev_put(br->physoutdev);
+#endif
+	/* Drop reference to owner of hook which queued us. */
+	module_put(entry->elem->owner);
+}
+
+/*
+ * Hand @skb to the queue handler registered for @pf.  Any packet that leaves
+ * via this function must come back through nf_reinject().  Returns 0 or -errno.
+ */
+static int __nf_queue(struct sk_buff *skb,
+		      struct list_head *elem,
+		      u_int8_t pf, unsigned int hook,
+		      struct net_device *indev,
+		      struct net_device *outdev,
+		      int (*okfn)(struct sk_buff *),
+		      unsigned int queuenum)
+{
+	int status = -ENOENT;
+	struct nf_queue_entry *entry = NULL;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	struct net_device *physindev;
+	struct net_device *physoutdev;
+#endif
+	const struct nf_afinfo *afinfo;
+	const struct nf_queue_handler *qh;
+
+	/* QUEUE == DROP if no one is waiting, to be safe. */
+	rcu_read_lock();
+
+	qh = rcu_dereference(queue_handler[pf]);
+	if (!qh) {
+		status = -ESRCH;
+		goto err_unlock;
+	}
+
+	afinfo = nf_get_afinfo(pf);
+	if (!afinfo)
+		goto err_unlock;	/* status is still -ENOENT */
+
+	entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
+	if (!entry) {
+		status = -ENOMEM;
+		goto err_unlock;
+	}
+
+	*entry = (struct nf_queue_entry) {
+		.skb	= skb,
+		.elem	= list_entry(elem, struct nf_hook_ops, list),
+		.pf	= pf,
+		.hook	= hook,
+		.indev	= indev,
+		.outdev	= outdev,
+		.okfn	= okfn,
+	};
+
+	/* If it's going away, ignore hook. */
+	if (!try_module_get(entry->elem->owner)) {
+		status = -ECANCELED;
+		goto err_unlock;
+	}
+	/* Bump dev refs so they don't vanish while packet is out */
+	if (indev)
+		dev_hold(indev);
+	if (outdev)
+		dev_hold(outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge) {
+		physindev = skb->nf_bridge->physindev;
+		if (physindev)
+			dev_hold(physindev);
+		physoutdev = skb->nf_bridge->physoutdev;
+		if (physoutdev)
+			dev_hold(physoutdev);
+	}
+#endif
+	skb_dst_force(skb);
+	afinfo->saveroute(skb, entry);
+	status = qh->outfn(entry, queuenum);
+
+	rcu_read_unlock();
+
+	if (status < 0) {
+		nf_queue_entry_release_refs(entry);	/* undo the module and device holds taken above */
+		goto err;
+	}
+
+	return 0;
+
+err_unlock:
+	rcu_read_unlock();
+err:
+	kfree(entry);	/* NULL when we bailed out before the allocation */
+	return status;
+}
+
+int nf_queue(struct sk_buff *skb,
+	     struct list_head *elem,
+	     u_int8_t pf, unsigned int hook,
+	     struct net_device *indev,
+	     struct net_device *outdev,
+	     int (*okfn)(struct sk_buff *),
+	     unsigned int queuenum)
+{
+	struct sk_buff *segs;
+	int err;
+	unsigned int queued;
+
+	if (!skb_is_gso(skb))	/* not GSO: queue the skb as it is */
+		return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
+				  queuenum);
+
+	switch (pf) {
+	case NFPROTO_IPV4:
+		skb->protocol = htons(ETH_P_IP);
+		break;
+	case NFPROTO_IPV6:
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	}
+
+	segs = skb_gso_segment(skb, 0);	/* GSO: split up, then queue each segment on its own */
+	/* Does not use PTR_ERR to limit the number of error codes that can be
+	 * returned by nf_queue.  For instance, callers rely on -ECANCELED to mean
+	 * 'ignore this hook'.
+	 */
+	if (IS_ERR(segs))
+		return -EINVAL;
+
+	queued = 0;
+	err = 0;
+	do {
+		struct sk_buff *nskb = segs->next;
+
+		segs->next = NULL;
+		if (err == 0)	/* after the first failure the remaining segments are only freed */
+			err = __nf_queue(segs, elem, pf, hook, indev,
+					   outdev, okfn, queuenum);
+		if (err == 0)
+			queued++;
+		else
+			kfree_skb(segs);
+		segs = nskb;
+	} while (segs);
+
+	/* also free orig skb if only some segments were queued */
+	if (unlikely(err && queued))
+		err = 0;	/* partial success is reported as success */
+	if (err == 0)
+		kfree_skb(skb);
+	return err;
+}
+
+/* Resume hook traversal for a queued packet with @verdict, then free @entry. */
+void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+	struct sk_buff *skb = entry->skb;
+	struct list_head *elem = &entry->elem->list;
+	const struct nf_afinfo *afinfo;
+	int err;
+
+	rcu_read_lock();
+	nf_queue_entry_release_refs(entry);
+	if ((verdict & NF_VERDICT_MASK) == NF_STOLEN)
+		verdict = NF_DROP;	/* only a hook can have consumed the skb; ours is freed below */
+
+	/* Continue traversal iff userspace said ok... */
+	if (verdict == NF_REPEAT) {
+		elem = elem->prev;
+		verdict = NF_ACCEPT;
+	}
+
+	if (verdict == NF_ACCEPT) {
+		afinfo = nf_get_afinfo(entry->pf);
+		if (!afinfo || afinfo->reroute(skb, entry) < 0)
+			verdict = NF_DROP;
+	}
+
+	if (verdict == NF_ACCEPT) {
+	next_hook:
+		verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook], skb,
+				     entry->hook, entry->indev, entry->outdev,
+				     &elem, entry->okfn, INT_MIN);
+	}
+
+	switch (verdict & NF_VERDICT_MASK) {
+	case NF_ACCEPT:
+	case NF_STOP:
+		local_bh_disable();
+		entry->okfn(skb);
+		local_bh_enable();
+		break;
+	case NF_QUEUE:
+		err = __nf_queue(skb, elem, entry->pf, entry->hook,
+				 entry->indev, entry->outdev, entry->okfn,
+				 verdict >> NF_VERDICT_QBITS);
+		if (err < 0) {
+			if (err == -ECANCELED || (err == -ESRCH &&
+			    (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)))
+				goto next_hook;
+			kfree_skb(skb);
+		}
+		break;
+	case NF_STOLEN:	/* returned by a hook that took the skb: freeing it again would be a double free */
+		break;
+	default:
+		kfree_skb(skb);
+	}
+	rcu_read_unlock();
+	kfree(entry);
+}
+EXPORT_SYMBOL(nf_reinject);
+
+#ifdef CONFIG_PROC_FS
+/* The iterator cursor is @pos itself: one record per queue_handler[] slot. */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos < ARRAY_SIZE(queue_handler))
+		return pos;
+	return NULL;
+}
+
+/* Step to the next queue_handler[] slot; NULL ends the walk. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	++*pos;
+
+	if (*pos < ARRAY_SIZE(queue_handler))
+		return pos;
+	return NULL;
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+	/* nothing to undo: seq_start() takes no lock and allocates nothing */
+}
+
+/* Print one line per family: its number and the handler name, or NONE. */
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nf_queue_handler *handler;
+	loff_t *slot = v;
+	int rc;
+
+	rcu_read_lock();
+	handler = rcu_dereference(queue_handler[*slot]);
+	if (handler)
+		rc = seq_printf(s, "%2lld %s\n", *slot, handler->name);
+	else
+		rc = seq_printf(s, "%2lld NONE\n", *slot);
+	rcu_read_unlock();
+	return rc;
+}
+
+static const struct seq_operations nfqueue_seq_ops = {	/* iterator handed to seq_open() by nfqueue_open() */
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+static int nfqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nfqueue_seq_ops);	/* bind the queue_handler[] iterator to this open file */
+}
+
+static const struct file_operations nfqueue_file_ops = {	/* registered for the "nf_queue" proc entry */
+	.owner	 = THIS_MODULE,
+	.open	 = nfqueue_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+#endif /* PROC_FS */
+
+
+int __init netfilter_queue_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("nf_queue", S_IRUGO,
+			 proc_net_netfilter, &nfqueue_file_ops))
+		return -1;	/* the read-only "nf_queue" proc entry could not be created */
+#endif
+	return 0;
+}
+
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
new file mode 100644
index 00000000..f042ae52
--- /dev/null
+++ b/net/netfilter/nf_sockopt.c
@@ -0,0 +1,169 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* Sockopts only registered and called from user context, so
+   net locking would be overkill.  Also, [gs]etsockopt calls may
+   sleep. */
+static DEFINE_MUTEX(nf_sockopt_mutex);
+static LIST_HEAD(nf_sockopts);
+
+/* Nonzero iff the half-open ranges [min1, max1) and [min2, max2) intersect. */
+static inline int overlap(int min1, int max1, int min2, int max2)
+{
+	return !(max1 <= min2 || min1 >= max2);
+}
+
+/* Functions to register sockopt ranges (exclusive). */
+int nf_register_sockopt(struct nf_sockopt_ops *reg)
+{
+	struct nf_sockopt_ops *cur;
+
+	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
+		return -EINTR;
+
+	list_for_each_entry(cur, &nf_sockopts, list) {
+		/* ranges of different protocol families never clash */
+		if (cur->pf != reg->pf)
+			continue;
+		if (!overlap(cur->set_optmin, cur->set_optmax,
+			     reg->set_optmin, reg->set_optmax) &&
+		    !overlap(cur->get_optmin, cur->get_optmax,
+			     reg->get_optmin, reg->get_optmax))
+			continue;
+		NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
+			cur->set_optmin, cur->set_optmax,
+			cur->get_optmin, cur->get_optmax,
+			reg->set_optmin, reg->set_optmax,
+			reg->get_optmin, reg->get_optmax);
+		mutex_unlock(&nf_sockopt_mutex);
+		return -EBUSY;
+	}
+
+	list_add(&reg->list, &nf_sockopts);
+	mutex_unlock(&nf_sockopt_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(nf_register_sockopt);
+
+void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
+{
+	mutex_lock(&nf_sockopt_mutex);
+	list_del(&reg->list);	/* @reg was linked into nf_sockopts by nf_register_sockopt() */
+	mutex_unlock(&nf_sockopt_mutex);
+}
+EXPORT_SYMBOL(nf_unregister_sockopt);
+
+/* Find the ops of @pf whose get (@get != 0) or set range contains @val.
+ * A match is returned with a reference on its owning module held; failures
+ * come back as ERR_PTR(-EINTR) or ERR_PTR(-ENOPROTOOPT). */
+static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
+		int val, int get)
+{
+	struct nf_sockopt_ops *ops;
+
+	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(ops, &nf_sockopts, list) {
+		if (ops->pf != pf)
+			continue;
+
+		/* an owner that cannot be pinned ends the search */
+		if (!try_module_get(ops->owner))
+			break;
+
+		if (get ? (val >= ops->get_optmin && val < ops->get_optmax)
+			: (val >= ops->set_optmin && val < ops->set_optmax))
+			goto out;
+		module_put(ops->owner);
+	}
+
+	/* reached after a failed pin or when no range matched */
+	ops = ERR_PTR(-ENOPROTOOPT);
+out:
+	mutex_unlock(&nf_sockopt_mutex);
+	return ops;
+}
+
+/* Dispatch a get (@get != 0) or set request to the ops registered for @val. */
+static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
+		      char __user *opt, int *len, int get)
+{
+	struct nf_sockopt_ops *handler = nf_sockopt_find(sk, pf, val, get);
+	int rc;
+
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
+
+	if (!get)
+		rc = handler->set(sk, val, opt, *len);
+	else
+		rc = handler->get(sk, val, opt, len);
+
+	/* release the module reference taken by nf_sockopt_find() */
+	module_put(handler->owner);
+	return rc;
+}
+
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+		  unsigned int len)
+{
+	return nf_sockopt(sk, pf, val, opt, &len, 0);	/* get == 0 selects the set callback */
+}
+EXPORT_SYMBOL(nf_setsockopt);
+
+int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
+		  int *len)
+{
+	return nf_sockopt(sk, pf, val, opt, len, 1);	/* get == 1 selects the get callback */
+}
+EXPORT_SYMBOL(nf_getsockopt);
+
+#ifdef CONFIG_COMPAT
+/* Like nf_sockopt(), but prefer the compat_get/compat_set callbacks
+ * whenever the matching ops provide them. */
+static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
+			     char __user *opt, int *len, int get)
+{
+	struct nf_sockopt_ops *handler = nf_sockopt_find(sk, pf, val, get);
+	int rc;
+
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
+
+	/* fall back to the native callback when no compat one is set */
+	if (get && handler->compat_get)
+		rc = handler->compat_get(sk, val, opt, len);
+	else if (get)
+		rc = handler->get(sk, val, opt, len);
+	else if (handler->compat_set)
+		rc = handler->compat_set(sk, val, opt, *len);
+	else
+		rc = handler->set(sk, val, opt, *len);
+
+	/* release the module reference taken by nf_sockopt_find() */
+	module_put(handler->owner);
+	return rc;
+}
+
+int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
+		int val, char __user *opt, unsigned int len)
+{
+	return compat_nf_sockopt(sk, pf, val, opt, &len, 0);	/* get == 0 selects the set path */
+}
+EXPORT_SYMBOL(compat_nf_setsockopt);
+
+int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
+		int val, char __user *opt, int *len)
+{
+	return compat_nf_sockopt(sk, pf, val, opt, len, 1);	/* get == 1 selects the get path */
+}
+EXPORT_SYMBOL(compat_nf_getsockopt);
+#endif
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
new file mode 100644
index 00000000..474d621c
--- /dev/null
+++ b/net/netfilter/nf_tproxy_core.c
@@ -0,0 +1,62 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Author: Balazs Scheidler, Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+
+#include <linux/net.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <net/udp.h>
+#include <net/netfilter/nf_tproxy_core.h>
+
+
+/* skb destructor: detach the socket from @skb and drop the reference on it. */
+static void
+nf_tproxy_destructor(struct sk_buff *skb)
+{
+	struct sock *owner = skb->sk;
+
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	if (owner != NULL)
+		sock_put(owner);
+}
+
+/* consumes sk: it is either attached to @skb or, for time-wait, released */
+void
+nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
+{
+	if (sk->sk_state != TCP_TIME_WAIT) {
+		/* orphan the skb first, then make @sk its owner */
+		skb_orphan(skb);
+		skb->sk = sk;
+		skb->destructor = nf_tproxy_destructor;
+		return;
+	}
+	/* assigning tw sockets complicates things; most
+	 * skb->sk->X checks would have to test sk->sk_state first */
+	inet_twsk_put(inet_twsk(sk));
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock);
+
+static int __init nf_tproxy_init(void)
+{
+	pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n");
+	pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n");
+	return 0;	/* only prints the banner; nothing is registered here */
+}
+
+module_init(nf_tproxy_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_DESCRIPTION("Transparent proxy support core routines");
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
new file mode 100644
index 00000000..b4a45328
--- /dev/null
+++ b/net/netfilter/nfnetlink.c
@@ -0,0 +1,225 @@
+/* Netfilter messages via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>,
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Initial netfilter messages via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <linux/init.h>
+
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+
+static char __initdata nfversion[] = "0.30";
+
+static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];	/* indexed by subsys_id, NULL when unclaimed */
+static DEFINE_MUTEX(nfnl_mutex);	/* taken through nfnl_lock()/nfnl_unlock() */
+
+void nfnl_lock(void)
+{
+	mutex_lock(&nfnl_mutex);	/* held around subsys_table[] updates and while messages are processed */
+}
+EXPORT_SYMBOL_GPL(nfnl_lock);
+
+void nfnl_unlock(void)
+{
+	mutex_unlock(&nfnl_mutex);	/* counterpart of nfnl_lock() */
+}
+EXPORT_SYMBOL_GPL(nfnl_unlock);
+
+/* Claim the subsys_table[] slot of @n; -EBUSY if the slot is already taken. */
+int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
+{
+	int taken;
+
+	nfnl_lock();
+	taken = subsys_table[n->subsys_id] != NULL;
+	if (!taken)
+		subsys_table[n->subsys_id] = n;
+	nfnl_unlock();
+	return taken ? -EBUSY : 0;
+}
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
+
+int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
+{
+	nfnl_lock();
+	subsys_table[n->subsys_id] = NULL;	/* cleared without checking that @n is the current owner */
+	nfnl_unlock();
+
+	return 0;	/* always succeeds */
+}
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
+
+/* Map a netlink message type to its registered subsystem, or NULL. */
+static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+{
+	u_int8_t id = NFNL_SUBSYS_ID(type);
+
+	if (id < NFNL_SUBSYS_COUNT)
+		return subsys_table[id];
+	return NULL;
+}
+
+/* Pick the callback of @ss addressed by @type, or NULL if out of range. */
+static inline const struct nfnl_callback *
+nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss)
+{
+	u_int8_t idx = NFNL_MSG_TYPE(type);
+
+	if (idx < ss->cb_count)
+		return ss->cb + idx;
+	return NULL;
+}
+
+int nfnetlink_has_listeners(struct net *net, unsigned int group)
+{
+	return netlink_has_listeners(net->nfnl, group);	/* asks the namespace's nfnetlink socket */
+}
+EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
+
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
+		   unsigned group, int echo, gfp_t flags)
+{
+	return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags);	/* sent through the namespace's nfnetlink socket */
+}
+EXPORT_SYMBOL_GPL(nfnetlink_send);
+
+int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
+{
+	return netlink_set_err(net->nfnl, pid, group, error);	/* forwarded for the namespace's nfnetlink socket */
+}
+EXPORT_SYMBOL_GPL(nfnetlink_set_err);
+
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags)
+{
+	return netlink_unicast(net->nfnl, skb, pid, flags);	/* sent through the namespace's nfnetlink socket */
+}
+EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+
+/* Process one complete nfnetlink message; nfnetlink_rcv() calls this with nfnl_mutex held. */
+static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(skb->sk);
+	const struct nfnl_callback *nc;
+	const struct nfnetlink_subsystem *ss;
+	int type, err;
+
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* All the messages must at least contain nfgenmsg */
+	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg)))
+		return 0;	/* runt messages are skipped without an error */
+
+	type = nlh->nlmsg_type;
+replay:
+	ss = nfnetlink_get_subsys(type);
+	if (!ss) {
+#ifdef CONFIG_MODULES
+		nfnl_unlock();	/* the mutex is dropped while the module is requested */
+		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
+		nfnl_lock();
+		ss = nfnetlink_get_subsys(type);	/* look again now that the module may have registered */
+		if (!ss)
+#endif
+			return -EINVAL;
+	}
+
+	nc = nfnetlink_find_client(type, ss);
+	if (!nc)
+		return -EINVAL;
+
+	{
+		int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+		u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];	/* sized per callback */
+		struct nlattr *attr = (void *)nlh + min_len;	/* attributes start after the nfgenmsg header */
+		int attrlen = nlh->nlmsg_len - min_len;
+
+		err = nla_parse(cda, ss->cb[cb_id].attr_count,
+				attr, attrlen, ss->cb[cb_id].policy);
+		if (err < 0)
+			return err;
+
+		err = nc->call(net->nfnl, skb, nlh, (const struct nlattr **)cda);
+		if (err == -EAGAIN)
+			goto replay;	/* the callback asked for the message to be processed again */
+		return err;
+	}
+}
+
+static void nfnetlink_rcv(struct sk_buff *skb)
+{
+	nfnl_lock();	/* every message in @skb is handled under nfnl_mutex */
+	netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
+	nfnl_unlock();
+}
+
+/* Create the NETLINK_NETFILTER kernel socket of a new network namespace. */
+static int __net_init nfnetlink_net_init(struct net *net)
+{
+	struct sock *sk = netlink_kernel_create(net, NETLINK_NETFILTER,
+						NFNLGRP_MAX, nfnetlink_rcv,
+						NULL, THIS_MODULE);
+	if (sk == NULL)
+		return -ENOMEM;
+	net->nfnl_stash = sk;	/* plain copy, released in nfnetlink_net_exit_batch() */
+	rcu_assign_pointer(net->nfnl, sk);
+	return 0;
+}
+
+static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	list_for_each_entry(net, net_exit_list, exit_list)
+		rcu_assign_pointer(net->nfnl, NULL);	/* first unpublish every socket */
+	synchronize_net();	/* a single wait covers the whole batch */
+	list_for_each_entry(net, net_exit_list, exit_list)
+		netlink_kernel_release(net->nfnl_stash);	/* the stash still holds the socket pointer */
+}
+
+static struct pernet_operations nfnetlink_net_ops = {	/* per-namespace setup and batched teardown */
+	.init		= nfnetlink_net_init,
+	.exit_batch	= nfnetlink_net_exit_batch,
+};
+
+static int __init nfnetlink_init(void)
+{
+	pr_info("Netfilter messages via NETLINK v%s.\n", nfversion);
+	return register_pernet_subsys(&nfnetlink_net_ops);	/* the sockets themselves are created per namespace */
+}
+
+static void __exit nfnetlink_exit(void)
+{
+	pr_info("Removing netfilter NETLINK layer.\n");
+	unregister_pernet_subsys(&nfnetlink_net_ops);	/* undoes nfnetlink_init() */
+}
+module_init(nfnetlink_init);
+module_exit(nfnetlink_exit);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 00000000..2e7ccbb4
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1015 @@
+/*
+ * This is a module which is used for logging packets to userspace via
+ * nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ipt_ULOG.c:
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/nfnetlink_log.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFULNL_NLBUFSIZ_DEFAULT	NLMSG_GOODSIZE
+#define NFULNL_TIMEOUT_DEFAULT 	100	/* every second */
+#define NFULNL_QTHRESH_DEFAULT 	100	/* 100 packets */
+#define NFULNL_COPY_RANGE_MAX	0xFFFF	/* max packet size is limited by 16-bit struct nfattr nfa_len field */
+
+#define PRINTR(x, args...)	do { if (net_ratelimit()) \
+				     printk(x, ## args); } while (0)	/* rate-limited printk(); the caller supplies the ';' */
+
+struct nfulnl_instance {
+	struct hlist_node hlist;	/* global list of instances */
+	spinlock_t lock;		/* taken around skb/qlen use and by the nfulnl_set_*() setters */
+	atomic_t use;			/* use count */
+
+	unsigned int qlen;		/* number of nlmsgs in skb */
+	struct sk_buff *skb;		/* pre-allocated skb */
+	struct timer_list timer;	/* runs nfulnl_timer() */
+	int peer_pid;			/* PID of the peer process */
+
+	/* configurable parameters */
+	unsigned int flushtimeout;	/* timeout until queue flush */
+	unsigned int nlbufsiz;		/* netlink buffer allocation size */
+	unsigned int qthreshold;	/* threshold of the queue */
+	u_int32_t copy_range;		/* capped at NFULNL_COPY_RANGE_MAX by nfulnl_set_mode() */
+	u_int32_t seq;			/* instance-local sequential counter */
+	u_int16_t group_num;		/* number of this queue */
+	u_int16_t flags;
+	u_int8_t copy_mode;		/* one of the NFULNL_COPY_* modes */
+	struct rcu_head rcu;		/* for the deferred free in instance_put() */
+};
+
+static DEFINE_SPINLOCK(instances_lock);	/* held while instance_table[] is modified */
+static atomic_t global_seq;
+
+#define INSTANCE_BUCKETS	16
+static struct hlist_head instance_table[INSTANCE_BUCKETS];	/* buckets chosen by instance_hashfn() */
+static unsigned int hash_init;
+
+static inline u_int8_t instance_hashfn(u_int16_t group_num)
+{
+	return (u_int8_t)group_num % INSTANCE_BUCKETS;	/* low byte of the group picks the bucket */
+}
+
+/* Walk the hash bucket of @group_num; no reference is taken on the result. */
+static struct nfulnl_instance *
+__instance_lookup(u_int16_t group_num)
+{
+	struct hlist_head *bucket = &instance_table[instance_hashfn(group_num)];
+	struct nfulnl_instance *inst;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(inst, node, bucket, hlist)
+		if (inst->group_num == group_num)
+			return inst;
+
+	return NULL;
+}
+
+static inline void
+instance_get(struct nfulnl_instance *inst)
+{
+	atomic_inc(&inst->use);	/* unconditional: the caller must already hold a reference */
+}
+
+/* Look up @group_num and take a reference; NULL if absent or already dying. */
+static struct nfulnl_instance *
+instance_lookup_get(u_int16_t group_num)
+{
+	struct nfulnl_instance *found;
+
+	rcu_read_lock_bh();
+	found = __instance_lookup(group_num);
+	if (found != NULL && atomic_inc_not_zero(&found->use) == 0)
+		found = NULL;
+	rcu_read_unlock_bh();
+	return found;
+}
+
+static void nfulnl_instance_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct nfulnl_instance, rcu));
+	module_put(THIS_MODULE);	/* pairs with try_module_get() in instance_create() */
+}
+
+static void
+instance_put(struct nfulnl_instance *inst)
+{
+	if (inst && atomic_dec_and_test(&inst->use))	/* NULL is tolerated */
+		call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);	/* last reference: freed from the RCU callback */
+}
+
+static void nfulnl_timer(unsigned long data);
+
+static struct nfulnl_instance *
+instance_create(u_int16_t group_num, int pid)
+{
+	struct nfulnl_instance *inst;
+	int err;
+
+	spin_lock_bh(&instances_lock);
+	if (__instance_lookup(group_num)) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);	/* atomic: instances_lock is held */
+	if (!inst) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	if (!try_module_get(THIS_MODULE)) {	/* dropped again in nfulnl_instance_free_rcu() */
+		kfree(inst);
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	INIT_HLIST_NODE(&inst->hlist);
+	spin_lock_init(&inst->lock);
+	/* needs to be two, since we _put() after creation */
+	atomic_set(&inst->use, 2);
+
+	setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+
+	inst->peer_pid = pid;
+	inst->group_num = group_num;
+
+	inst->qthreshold 	= NFULNL_QTHRESH_DEFAULT;
+	inst->flushtimeout 	= NFULNL_TIMEOUT_DEFAULT;
+	inst->nlbufsiz 		= NFULNL_NLBUFSIZ_DEFAULT;
+	inst->copy_mode 	= NFULNL_COPY_PACKET;
+	inst->copy_range 	= NFULNL_COPY_RANGE_MAX;
+
+	hlist_add_head_rcu(&inst->hlist,
+		       &instance_table[instance_hashfn(group_num)]);	/* from here on lookups can find it */
+
+	spin_unlock_bh(&instances_lock);
+
+	return inst;
+
+out_unlock:
+	spin_unlock_bh(&instances_lock);
+	return ERR_PTR(err);	/* -EEXIST, -ENOMEM or -EAGAIN */
+}
+
+static void __nfulnl_flush(struct nfulnl_instance *inst);
+
+/* called with BH disabled; instance_destroy() also holds instances_lock */
+static void
+__instance_destroy(struct nfulnl_instance *inst)
+{
+	/* first pull it out of the global list */
+	hlist_del_rcu(&inst->hlist);
+
+	/* then flush all pending packets from skb */
+
+	spin_lock(&inst->lock);
+
+	/* lockless readers wont be able to use us */
+	inst->copy_mode = NFULNL_COPY_DISABLED;
+
+	if (inst->skb)
+		__nfulnl_flush(inst);
+	spin_unlock(&inst->lock);
+
+	/* and finally put the refcount */
+	instance_put(inst);
+}
+
+static inline void
+instance_destroy(struct nfulnl_instance *inst)
+{
+	spin_lock_bh(&instances_lock);	/* same lock instance_create() takes to insert */
+	__instance_destroy(inst);
+	spin_unlock_bh(&instances_lock);
+}
+
+/* Choose what is copied to userspace.  NONE and META force the range to 0,
+ * PACKET caps it at NFULNL_COPY_RANGE_MAX; any other @mode is -EINVAL and
+ * leaves the instance unchanged. */
+static int
+nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
+		  unsigned int range)
+{
+	unsigned int new_range;
+
+	switch (mode) {
+	case NFULNL_COPY_NONE:
+	case NFULNL_COPY_META:
+		new_range = 0;
+		break;
+
+	case NFULNL_COPY_PACKET:
+		new_range = min_t(unsigned int, range, NFULNL_COPY_RANGE_MAX);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&inst->lock);
+	inst->copy_mode = mode;
+	inst->copy_range = new_range;
+	spin_unlock_bh(&inst->lock);
+
+	return 0;
+}
+
+/* Set the netlink buffer size of @inst.  Only sizes from
+ * NFULNL_NLBUFSIZ_DEFAULT up to 131072 (both inclusive) are accepted;
+ * anything else returns -ERANGE and changes nothing. */
+static int
+nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
+{
+	int status = -ERANGE;
+
+	spin_lock_bh(&inst->lock);
+	if (nlbufsiz >= NFULNL_NLBUFSIZ_DEFAULT &&
+	    nlbufsiz <= 131072) {
+		inst->nlbufsiz = nlbufsiz;
+		status = 0;
+	}
+	spin_unlock_bh(&inst->lock);
+
+	return status;
+}
+
+static int
+nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
+{
+	spin_lock_bh(&inst->lock);
+	inst->flushtimeout = timeout;	/* stored as given, no range check */
+	spin_unlock_bh(&inst->lock);
+
+	return 0;	/* always succeeds */
+}
+
+static int
+nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
+{
+	spin_lock_bh(&inst->lock);
+	inst->qthreshold = qthresh;	/* stored as given, no range check */
+	spin_unlock_bh(&inst->lock);
+
+	return 0;	/* always succeeds */
+}
+
+static int
+nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
+{
+	spin_lock_bh(&inst->lock);
+	inst->flags = flags;	/* replaces the whole flag word */
+	spin_unlock_bh(&inst->lock);
+
+	return 0;	/* always succeeds */
+}
+
+/* Allocate the skb for a batch: try max(@inst_size, @pkt_size) first and
+ * fall back to just @pkt_size.  Returns NULL when no attempt succeeds. */
+static struct sk_buff *
+nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
+{
+	/* alloc skb which should be big enough for a whole multipart
+	 * message.  WARNING: has to be <= 128k due to slab restrictions */
+	unsigned int want = max(inst_size, pkt_size);
+	struct sk_buff *skb = alloc_skb(want, GFP_ATOMIC);
+
+	if (skb)
+		return skb;
+
+	pr_notice("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
+		inst_size);
+	/* the first attempt already was the packet-sized one */
+	if (want <= pkt_size)
+		return NULL;
+
+	/* try to allocate only as much as we need for current
+	 * packet */
+	skb = alloc_skb(pkt_size, GFP_ATOMIC);
+	if (!skb)
+		pr_err("nfnetlink_log: can't even alloc %u "
+		       "bytes\n", pkt_size);
+
+	return skb;
+}
+
+/* Send the batched skb to the peer.  The instance never keeps the skb. */
+static int __nfulnl_send(struct nfulnl_instance *inst)
+{
+	int status = -1;
+
+	if (inst->qlen > 1)	/* multipart batch: terminate it with NLMSG_DONE */
+		NLMSG_PUT(inst->skb, 0, 0, NLMSG_DONE, sizeof(struct nfgenmsg));
+
+	status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid,
+				   MSG_DONTWAIT);
+	goto out;
+
+nlmsg_failure:	/* NLMSG_PUT() found no room: drop the batch so the instance is not stuck on it */
+	kfree_skb(inst->skb);
+out:
+	inst->qlen = 0;
+	inst->skb = NULL;
+	return status;
+}
+
+static void
+__nfulnl_flush(struct nfulnl_instance *inst)
+{
+	/* a pending timer holds a reference; drop it if the timer was cancelled */
+	if (del_timer(&inst->timer))
+		instance_put(inst);
+	if (inst->skb)	/* send whatever has been batched so far */
+		__nfulnl_send(inst);
+}
+
+static void
+nfulnl_timer(unsigned long data)
+{
+	struct nfulnl_instance *inst = (struct nfulnl_instance *)data;	/* cookie passed to setup_timer() in instance_create() */
+
+	spin_lock_bh(&inst->lock);
+	if (inst->skb)
+		__nfulnl_send(inst);
+	spin_unlock_bh(&inst->lock);
+	instance_put(inst);	/* drop the reference held on behalf of the timer */
+}
+
+/*
+ * Append one NFULNL_MSG_PACKET message describing @skb to inst->skb.
+ *
+ * @inst:	log instance; the message is written into inst->skb
+ * @skb:	packet being logged
+ * @data_len:	payload bytes to copy into NFULA_PAYLOAD (0 = no payload)
+ * @pf:		protocol family, stored in the nfgenmsg header
+ * @hooknum:	netfilter hook number, stored in NFULA_PACKET_HDR
+ * @indev:	input device, may be NULL
+ * @outdev:	output device, may be NULL
+ * @prefix:	log prefix, may be NULL
+ * @plen:	number of bytes of @prefix to emit
+ *
+ * Returns 0 on success and -1 when the message could not be built (the
+ * failure labels at the bottom are the targets of the NLMSG_PUT/NLA_PUT
+ * macros and of the explicit tailroom check).
+ *
+ * This is an inline function, we don't really care about a long
+ * list of arguments.
+ */
+static inline int
+__build_packet_message(struct nfulnl_instance *inst,
+			const struct sk_buff *skb,
+			unsigned int data_len,
+			u_int8_t pf,
+			unsigned int hooknum,
+			const struct net_device *indev,
+			const struct net_device *outdev,
+			const char *prefix, unsigned int plen)
+{
+	struct nfulnl_msg_packet_hdr pmsg;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	sk_buff_data_t old_tail = inst->skb->tail;
+
+	nlh = NLMSG_PUT(inst->skb, 0, 0,
+			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
+			sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+	nfmsg->nfgen_family = pf;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(inst->group_num);
+
+	/* The whole struct is copied to userspace below; clear it first so
+	 * that any member not assigned here never carries stack contents. */
+	memset(&pmsg, 0, sizeof(pmsg));
+	pmsg.hw_protocol	= skb->protocol;
+	pmsg.hook		= hooknum;
+
+	NLA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+	if (prefix)
+		NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix);
+
+	if (indev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
+			     htonl(indev->ifindex));
+#else
+		if (pf == PF_BRIDGE) {
+			/* Case 1: indev is physical input device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+				     htonl(indev->ifindex));
+			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
+				     htonl(br_port_get_rcu(indev)->br->dev->ifindex));
+		} else {
+			/* Case 2: indev is bridge group, we need to look for
+			 * physical device (when called from ipv4) */
+			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
+				     htonl(indev->ifindex));
+			if (skb->nf_bridge && skb->nf_bridge->physindev)
+				NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+					     htonl(skb->nf_bridge->physindev->ifindex));
+		}
+#endif
+	}
+
+	if (outdev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
+			     htonl(outdev->ifindex));
+#else
+		if (pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical output device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+				     htonl(outdev->ifindex));
+			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
+			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
+				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
+		} else {
+			/* Case 2: outdev is a bridge group, we need to look
+			 * for physical device (when called from ipv4) */
+			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
+				     htonl(outdev->ifindex));
+			if (skb->nf_bridge && skb->nf_bridge->physoutdev)
+				NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+					     htonl(skb->nf_bridge->physoutdev->ifindex));
+		}
+#endif
+	}
+
+	if (skb->mark)
+		NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark));
+
+	if (indev && skb->dev &&
+	    skb->mac_header != skb->network_header) {
+		struct nfulnl_msg_packet_hw phw;
+		int len;
+
+		/* dev_parse_header() is only asked to fill hw_addr; zero the
+		 * struct so the remaining bytes sent to userspace are not
+		 * leftover stack data. */
+		memset(&phw, 0, sizeof(phw));
+		len = dev_parse_header(skb, phw.hw_addr);
+		if (len > 0) {
+			phw.hw_addrlen = htons(len);
+			NLA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
+		}
+	}
+
+	if (indev && skb_mac_header_was_set(skb)) {
+		NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type));
+		NLA_PUT_BE16(inst->skb, NFULA_HWLEN,
+			     htons(skb->dev->hard_header_len));
+		NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len,
+			skb_mac_header(skb));
+	}
+
+	if (skb->tstamp.tv64) {
+		struct nfulnl_msg_packet_timestamp ts;
+		struct timeval tv = ktime_to_timeval(skb->tstamp);
+		ts.sec = cpu_to_be64(tv.tv_sec);
+		ts.usec = cpu_to_be64(tv.tv_usec);
+
+		NLA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
+	}
+
+	/* UID */
+	if (skb->sk) {
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
+			struct file *file = skb->sk->sk_socket->file;
+			__be32 uid = htonl(file->f_cred->fsuid);
+			__be32 gid = htonl(file->f_cred->fsgid);
+			/* need to unlock here since NLA_PUT may goto */
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			NLA_PUT_BE32(inst->skb, NFULA_UID, uid);
+			NLA_PUT_BE32(inst->skb, NFULA_GID, gid);
+		} else
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+	}
+
+	/* local sequence number */
+	if (inst->flags & NFULNL_CFG_F_SEQ)
+		NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++));
+
+	/* global sequence number */
+	if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
+		NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL,
+			     htonl(atomic_inc_return(&global_seq)));
+
+	if (data_len) {
+		struct nlattr *nla;
+		int size = nla_attr_size(data_len);
+
+		if (skb_tailroom(inst->skb) < nla_total_size(data_len)) {
+			printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
+			goto nlmsg_failure;
+		}
+
+		/* The payload attribute is built by hand so the packet data
+		 * can be copied straight into the message buffer. */
+		nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len));
+		nla->nla_type = NFULA_PAYLOAD;
+		nla->nla_len = size;
+
+		if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
+			BUG();
+	}
+
+	/* Patch the real message length now that all attributes are in. */
+	nlh->nlmsg_len = inst->skb->tail - old_tail;
+	return 0;
+
+nlmsg_failure:
+nla_put_failure:
+	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
+	return -1;
+}
+
+/* Send a netlink ACK carrying @err and return from the calling (void)
+ * function.  Expects variables named "skb" and "nlh" in the caller's scope. */
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/* Fallback log parameters used by nfulnl_log_packet() when the caller
+ * supplies no nf_loginfo or one whose type is not NF_LOG_TYPE_ULOG. */
+static struct nf_loginfo default_loginfo = {
+	.type =		NF_LOG_TYPE_ULOG,
+	.u = {
+		.ulog = {
+			.copy_len	= 0xffff,
+			.group		= 0,
+			.qthreshold	= 1,
+		},
+	},
+};
+
+/*
+ * Log handler for the internal netfilter logging API.
+ *
+ * Looks up the log instance of the ULOG group selected by @li_user (or by
+ * default_loginfo when @li_user is NULL or not of type NF_LOG_TYPE_ULOG),
+ * computes a worst-case message size, makes sure the instance buffer has
+ * room (flushing or allocating as needed), appends the packet message and
+ * finally either flushes to userspace or arms the flush timer.
+ *
+ * @pf:		protocol family of the packet
+ * @hooknum:	netfilter hook the packet was seen on
+ * @skb:	packet to log
+ * @in:		input device, may be NULL
+ * @out:	output device, may be NULL
+ * @li_user:	per-rule log parameters, may be NULL
+ * @prefix:	log prefix string, may be NULL
+ *
+ * Returns nothing; a packet is silently not logged when no instance is
+ * bound to the group, copying is disabled or the buffer allocation fails.
+ */
+void
+nfulnl_log_packet(u_int8_t pf,
+		  unsigned int hooknum,
+		  const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct nf_loginfo *li_user,
+		  const char *prefix)
+{
+	unsigned int size, data_len;
+	struct nfulnl_instance *inst;
+	const struct nf_loginfo *li;
+	unsigned int qthreshold;
+	unsigned int plen;
+
+	if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
+		li = li_user;
+	else
+		li = &default_loginfo;
+
+	inst = instance_lookup_get(li->u.ulog.group);
+	if (!inst)
+		return;
+
+	/* the prefix is sent including its terminating NUL */
+	plen = 0;
+	if (prefix)
+		plen = strlen(prefix) + 1;
+
+	/* FIXME: do we want to make the size calculation conditional based on
+	 * what is actually present?  way more branches and checks, but more
+	 * memory efficient... */
+	size =    NLMSG_SPACE(sizeof(struct nfgenmsg))
+		+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+#endif
+		+ nla_total_size(sizeof(u_int32_t))	/* mark */
+		+ nla_total_size(sizeof(u_int32_t))	/* uid */
+		+ nla_total_size(sizeof(u_int32_t))	/* gid */
+		+ nla_total_size(plen)			/* prefix */
+		+ nla_total_size(sizeof(struct nfulnl_msg_packet_hw))
+		+ nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp));
+
+	if (in && skb_mac_header_was_set(skb)) {
+		size +=   nla_total_size(skb->dev->hard_header_len)
+			+ nla_total_size(sizeof(u_int16_t))	/* hwtype */
+			+ nla_total_size(sizeof(u_int16_t));	/* hwlen */
+	}
+
+	spin_lock_bh(&inst->lock);
+
+	if (inst->flags & NFULNL_CFG_F_SEQ)
+		size += nla_total_size(sizeof(u_int32_t));
+	if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
+		size += nla_total_size(sizeof(u_int32_t));
+
+	qthreshold = inst->qthreshold;
+	/* per-rule qthreshold overrides per-instance */
+	if (li->u.ulog.qthreshold)
+		if (qthreshold > li->u.ulog.qthreshold)
+			qthreshold = li->u.ulog.qthreshold;
+
+
+	switch (inst->copy_mode) {
+	case NFULNL_COPY_META:
+	case NFULNL_COPY_NONE:
+		data_len = 0;
+		break;
+
+	case NFULNL_COPY_PACKET:
+		if (inst->copy_range == 0
+		    || inst->copy_range > skb->len)
+			data_len = skb->len;
+		else
+			data_len = inst->copy_range;
+
+		size += nla_total_size(data_len);
+		break;
+
+	case NFULNL_COPY_DISABLED:
+	default:
+		goto unlock_and_release;
+	}
+
+	/* Keep the sizeof(struct nfgenmsg) of slack the check always had, but
+	 * add it to the needed size instead of subtracting it from the
+	 * tailroom: with less tailroom than that left, the subtraction wrapped
+	 * around in unsigned arithmetic and the flush was skipped exactly
+	 * when the buffer was full. */
+	if (inst->skb &&
+	    size + sizeof(struct nfgenmsg) > skb_tailroom(inst->skb)) {
+		/* either the queue len is too high or we don't have
+		 * enough room in the skb left. flush to userspace. */
+		__nfulnl_flush(inst);
+	}
+
+	if (!inst->skb) {
+		inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size);
+		if (!inst->skb)
+			goto alloc_failure;
+	}
+
+	inst->qlen++;
+
+	/* NOTE(review): the return value is not checked, so a message that
+	 * failed to build is still counted in qlen. */
+	__build_packet_message(inst, skb, data_len, pf,
+				hooknum, in, out, prefix, plen);
+
+	if (inst->qlen >= qthreshold)
+		__nfulnl_flush(inst);
+	/* timer_pending always called within inst->lock, so there
+	 * is no chance of a race here */
+	else if (!timer_pending(&inst->timer)) {
+		/* extra reference taken on behalf of the pending timer */
+		instance_get(inst);
+		inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
+		add_timer(&inst->timer);
+	}
+
+unlock_and_release:
+	spin_unlock_bh(&inst->lock);
+	instance_put(inst);
+	return;
+
+alloc_failure:
+	/* FIXME: statistics */
+	goto unlock_and_release;
+}
+EXPORT_SYMBOL_GPL(nfulnl_log_packet);
+
+/* Netlink notifier callback: when a NETLINK_NETFILTER socket is released,
+ * destroy every log instance in init_net whose peer_pid is that socket's
+ * pid.  Always returns NOTIFY_DONE. */
+static int
+nfulnl_rcv_nl_event(struct notifier_block *this,
+		   unsigned long event, void *ptr)
+{
+	struct netlink_notify *notify = ptr;
+	int bucket;
+
+	if (event != NETLINK_URELEASE || notify->protocol != NETLINK_NETFILTER)
+		return NOTIFY_DONE;
+
+	/* destroy all instances for this pid */
+	spin_lock_bh(&instances_lock);
+	for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
+		struct hlist_node *pos, *next;
+		struct nfulnl_instance *inst;
+
+		/* _safe variant: entries are unlinked while walking */
+		hlist_for_each_entry_safe(inst, pos, next,
+					  &instance_table[bucket], hlist) {
+			if (!net_eq(notify->net, &init_net))
+				continue;
+			if (notify->pid == inst->peer_pid)
+				__instance_destroy(inst);
+		}
+	}
+	spin_unlock_bh(&instances_lock);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block handed to netlink_register_notifier() in
+ * nfnetlink_log_init(); its callback is nfulnl_rcv_nl_event(). */
+static struct notifier_block nfulnl_rtnl_notifier = {
+	.notifier_call	= nfulnl_rcv_nl_event,
+};
+
+/* Callback installed for NFULNL_MSG_PACKET in nfulnl_cb: such messages are
+ * not accepted from userspace, so every request fails with -ENOTSUPP. */
+static int
+nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+		   const struct nlmsghdr *nlh,
+		   const struct nlattr * const nfqa[])
+{
+	return -ENOTSUPP;
+}
+
+/* Logger descriptor for this module: registered with nf_log_register() at
+ * init time and bound to a protocol family by NFULNL_CFG_CMD_PF_BIND. */
+static struct nf_logger nfulnl_logger __read_mostly = {
+	.name	= "nfnetlink_log",
+	.logfn	= &nfulnl_log_packet,
+	.me	= THIS_MODULE,
+};
+
+/* Attribute policy for NFULNL_MSG_CONFIG messages (referenced from
+ * nfulnl_cb): struct-valued attributes are described by length, scalar
+ * ones by netlink type. */
+static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
+	[NFULA_CFG_CMD]		= { .len = sizeof(struct nfulnl_msg_config_cmd) },
+	[NFULA_CFG_MODE]	= { .len = sizeof(struct nfulnl_msg_config_mode) },
+	[NFULA_CFG_TIMEOUT]	= { .type = NLA_U32 },
+	[NFULA_CFG_QTHRESH]	= { .type = NLA_U32 },
+	[NFULA_CFG_NLBUFSIZ]	= { .type = NLA_U32 },
+	[NFULA_CFG_FLAGS]	= { .type = NLA_U16 },
+};
+
+/*
+ * Handle an NFULNL_MSG_CONFIG message for the group in nfmsg->res_id.
+ *
+ * PF_BIND/PF_UNBIND commands are served first and need no instance.  All
+ * other requests act on the instance of the group: BIND creates it, UNBIND
+ * destroys it, and the MODE/TIMEOUT/NLBUFSIZ/QTHRESH/FLAGS attributes update
+ * an existing one.  Only the netlink pid stored in inst->peer_pid may
+ * operate on an existing instance.
+ *
+ * Returns 0 on success or a negative errno: -EPERM (sender is not the
+ * owner), -EBUSY (BIND on an existing group), -ENODEV (no instance),
+ * -ENOTSUPP (unknown command), or the error from instance_create().
+ */
+static int
+nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		   const struct nlmsghdr *nlh,
+		   const struct nlattr * const nfula[])
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t group_num = ntohs(nfmsg->res_id);
+	struct nfulnl_instance *inst;
+	struct nfulnl_msg_config_cmd *cmd = NULL;
+	int ret = 0;
+
+	if (nfula[NFULA_CFG_CMD]) {
+		u_int8_t pf = nfmsg->nfgen_family;
+		cmd = nla_data(nfula[NFULA_CFG_CMD]);
+
+		/* Commands without queue context */
+		switch (cmd->command) {
+		case NFULNL_CFG_CMD_PF_BIND:
+			return nf_log_bind_pf(pf, &nfulnl_logger);
+		case NFULNL_CFG_CMD_PF_UNBIND:
+			nf_log_unbind_pf(pf);
+			return 0;
+		}
+	}
+
+	/* a successful lookup holds a reference, dropped at out_put */
+	inst = instance_lookup_get(group_num);
+	if (inst && inst->peer_pid != NETLINK_CB(skb).pid) {
+		ret = -EPERM;
+		goto out_put;
+	}
+
+	if (cmd != NULL) {
+		switch (cmd->command) {
+		case NFULNL_CFG_CMD_BIND:
+			if (inst) {
+				ret = -EBUSY;
+				goto out_put;
+			}
+
+			inst = instance_create(group_num,
+					       NETLINK_CB(skb).pid);
+			if (IS_ERR(inst)) {
+				ret = PTR_ERR(inst);
+				goto out;
+			}
+			break;
+		case NFULNL_CFG_CMD_UNBIND:
+			if (!inst) {
+				ret = -ENODEV;
+				goto out;
+			}
+
+			instance_destroy(inst);
+			goto out_put;
+		default:
+			/* attributes below are still processed after this */
+			ret = -ENOTSUPP;
+			break;
+		}
+	}
+
+	if (nfula[NFULA_CFG_MODE]) {
+		struct nfulnl_msg_config_mode *params;
+		params = nla_data(nfula[NFULA_CFG_MODE]);
+
+		if (!inst) {
+			ret = -ENODEV;
+			goto out;
+		}
+		nfulnl_set_mode(inst, params->copy_mode,
+				ntohl(params->copy_range));
+	}
+
+	if (nfula[NFULA_CFG_TIMEOUT]) {
+		__be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]);
+
+		if (!inst) {
+			ret = -ENODEV;
+			goto out;
+		}
+		nfulnl_set_timeout(inst, ntohl(timeout));
+	}
+
+	if (nfula[NFULA_CFG_NLBUFSIZ]) {
+		__be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]);
+
+		if (!inst) {
+			ret = -ENODEV;
+			goto out;
+		}
+		nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
+	}
+
+	if (nfula[NFULA_CFG_QTHRESH]) {
+		__be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]);
+
+		if (!inst) {
+			ret = -ENODEV;
+			goto out;
+		}
+		nfulnl_set_qthresh(inst, ntohl(qthresh));
+	}
+
+	if (nfula[NFULA_CFG_FLAGS]) {
+		__be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]);
+
+		if (!inst) {
+			ret = -ENODEV;
+			goto out;
+		}
+		nfulnl_set_flags(inst, ntohs(flags));
+	}
+
+out_put:
+	/* NOTE(review): inst can still be NULL here (e.g. a message with no
+	 * command and no attribute for an unbound group); this presumes
+	 * instance_put() tolerates NULL - confirm against its definition. */
+	instance_put(inst);
+out:
+	return ret;
+}
+
+/* Per-message-type callbacks of the log subsystem: packet messages are
+ * rejected, config messages are parsed against nfula_cfg_policy. */
+static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
+	[NFULNL_MSG_PACKET]	= { .call = nfulnl_recv_unsupp,
+				    .attr_count = NFULA_MAX, },
+	[NFULNL_MSG_CONFIG]	= { .call = nfulnl_recv_config,
+				    .attr_count = NFULA_CFG_MAX,
+				    .policy = nfula_cfg_policy },
+};
+
+/* nfnetlink subsystem descriptor, registered in nfnetlink_log_init() and
+ * unregistered in nfnetlink_log_fini(). */
+static const struct nfnetlink_subsystem nfulnl_subsys = {
+	.name		= "log",
+	.subsys_id	= NFNL_SUBSYS_ULOG,
+	.cb_count	= NFULNL_MSG_MAX,
+	.cb		= nfulnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+/* Private seq_file cursor: index of the instance_table bucket the
+ * iteration is currently in. */
+struct iter_state {
+	unsigned int bucket;
+};
+
+/* Return the first node of the first non-empty bucket, leaving st->bucket
+ * at that bucket; NULL when @st is NULL or every bucket is empty. */
+static struct hlist_node *get_first(struct iter_state *st)
+{
+	if (!st)
+		return NULL;
+
+	st->bucket = 0;
+	while (st->bucket < INSTANCE_BUCKETS) {
+		struct hlist_head *head = &instance_table[st->bucket];
+
+		if (!hlist_empty(head))
+			return rcu_dereference_bh(hlist_first_rcu(head));
+		st->bucket++;
+	}
+	return NULL;
+}
+
+/* Return the node following @h, moving on to later buckets when the
+ * current chain ends; NULL once the last bucket has been exhausted. */
+static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
+{
+	struct hlist_node *next = rcu_dereference_bh(hlist_next_rcu(h));
+
+	for (;;) {
+		if (next)
+			return next;
+
+		if (++st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+
+		next = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+	}
+}
+
+/* Return the node at position @pos (0-based) of the whole table, or NULL
+ * when the table holds fewer than @pos + 1 nodes. */
+static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
+{
+	struct hlist_node *node = get_first(st);
+
+	while (node && pos) {
+		node = get_next(st, node);
+		/* only count a step that actually reached a node */
+		if (node)
+			pos--;
+	}
+
+	if (pos)
+		return NULL;
+	return node;
+}
+
+/* seq_file ->start: enter the RCU-bh read side (left again in seq_stop())
+ * and position the cursor at entry *pos. */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu_bh)
+{
+	rcu_read_lock_bh();
+	return get_idx(seq->private, *pos);
+}
+
+/* seq_file ->next: bump the position and return the node after @v. */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return get_next(s->private, v);
+}
+
+/* seq_file ->stop: leave the RCU-bh read side entered in seq_start(). */
+static void seq_stop(struct seq_file *s, void *v)
+	__releases(rcu_bh)
+{
+	rcu_read_unlock_bh();
+}
+
+/* seq_file ->show: print one line per instance with group number, peer
+ * pid, queue length, copy mode, copy range, flush timeout and use count. */
+static int seq_show(struct seq_file *s, void *v)
+{
+	/* NOTE(review): v is the hlist_node handed out by get_first()/
+	 * get_next(); using it as the instance presumably relies on hlist
+	 * being the first member of struct nfulnl_instance - confirm. */
+	const struct nfulnl_instance *inst = v;
+
+	return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
+			  inst->group_num,
+			  inst->peer_pid, inst->qlen,
+			  inst->copy_mode, inst->copy_range,
+			  inst->flushtimeout, atomic_read(&inst->use));
+}
+
+/* seq_file iterator used by nful_open() for the "nfnetlink_log" proc file. */
+static const struct seq_operations nful_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+/* ->open of the proc file: set up a seq_file with a private iter_state
+ * (released again through seq_release_private in nful_file_ops). */
+static int nful_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &nful_seq_ops,
+			sizeof(struct iter_state));
+}
+
+/* File operations of the "nfnetlink_log" proc entry created in
+ * nfnetlink_log_init(). */
+static const struct file_operations nful_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nful_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+/*
+ * Module init: initialise the instance hash table and its seed, then
+ * register the netlink notifier, the nfnetlink subsystem, the logger and
+ * (with CONFIG_PROC_FS) the "nfnetlink_log" proc entry.
+ *
+ * Returns 0 on success or a negative errno; on failure everything that was
+ * registered so far is unregistered again in reverse order.
+ */
+static int __init nfnetlink_log_init(void)
+{
+	int i, status = -ENOMEM;
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		INIT_HLIST_HEAD(&instance_table[i]);
+
+	/* it's not really all that important to have a random value, so
+	 * we can do this from the init function, even if there hasn't
+	 * been that much entropy yet */
+	get_random_bytes(&hash_init, sizeof(hash_init));
+
+	netlink_register_notifier(&nfulnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfulnl_subsys);
+	if (status < 0) {
+		printk(KERN_ERR "log: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+	status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
+	if (status < 0) {
+		printk(KERN_ERR "log: failed to register logger\n");
+		goto cleanup_subsys;
+	}
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("nfnetlink_log", 0440,
+			 proc_net_netfilter, &nful_file_ops)) {
+		/* status still holds the success value of nf_log_register();
+		 * without an explicit error the module would report a
+		 * successful load after undoing all of its registrations. */
+		status = -ENOMEM;
+		goto cleanup_logger;
+	}
+#endif
+	return status;
+
+#ifdef CONFIG_PROC_FS
+cleanup_logger:
+	nf_log_unregister(&nfulnl_logger);
+#endif
+cleanup_subsys:
+	nfnetlink_subsys_unregister(&nfulnl_subsys);
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+	return status;
+}
+
+/* Module exit: unregister the logger, remove the proc entry (if built),
+ * then unregister the nfnetlink subsystem and the netlink notifier. */
+static void __exit nfnetlink_log_fini(void)
+{
+	nf_log_unregister(&nfulnl_logger);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("nfnetlink_log", proc_net_netfilter);
+#endif
+	nfnetlink_subsys_unregister(&nfulnl_subsys);
+	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+}
+
+MODULE_DESCRIPTION("netfilter userspace logging");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
+
+module_init(nfnetlink_log_init);
+module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
new file mode 100644
index 00000000..fdd2fafe
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue.c
@@ -0,0 +1,950 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2007 by Patrick McHardy <kaber@trash.net>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_queue.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFQNL_QMAX_DEFAULT 1024
+
+/* One userspace packet queue, hashed into instance_table by queue_num. */
+struct nfqnl_instance {
+	struct hlist_node hlist;		/* global list of queues */
+	struct rcu_head rcu;			/* deferred free, see instance_destroy_rcu() */
+
+	int peer_pid;				/* netlink pid of the owning socket */
+	unsigned int queue_maxlen;		/* queue_total limit before packets are dropped */
+	unsigned int copy_range;		/* max payload bytes copied per packet */
+	unsigned int queue_dropped;		/* packets refused because the queue was full */
+	unsigned int queue_user_dropped;	/* packets whose netlink unicast failed */
+
+
+	u_int16_t queue_num;			/* number of this queue */
+	u_int8_t copy_mode;			/* NFQNL_COPY_* */
+/*
+ * Following fields are dirtied for each queued packet,
+ * keep them in same cache line if possible.
+ */
+	spinlock_t	lock;			/* taken around queue_list/queue_total updates */
+	unsigned int	queue_total;		/* entries currently on queue_list */
+	atomic_t	id_sequence;		/* 'sequence' of pkt ids */
+	struct list_head queue_list;		/* packets in queue */
+};
+
+/* Predicate passed to nfqnl_flush(): non-zero selects the entry. */
+typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
+
+/* Held while instances are added to or removed from instance_table. */
+static DEFINE_SPINLOCK(instances_lock);
+
+/* Hash table of all queue instances, indexed by instance_hashfn(). */
+#define INSTANCE_BUCKETS	16
+static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly;
+
+/* Map a queue number to its instance_table bucket: OR the high byte into
+ * the low byte and reduce modulo the bucket count. */
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+	u_int16_t folded = (queue_num >> 8) | queue_num;
+
+	return folded % INSTANCE_BUCKETS;
+}
+
+/* Find the instance for @queue_num in its hash bucket, or NULL.  The walk
+ * uses the RCU list iterator. */
+static struct nfqnl_instance *
+instance_lookup(u_int16_t queue_num)
+{
+	struct hlist_head *bucket = &instance_table[instance_hashfn(queue_num)];
+	struct nfqnl_instance *found = NULL;
+	struct nfqnl_instance *cur;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(cur, node, bucket, hlist) {
+		if (cur->queue_num != queue_num)
+			continue;
+		found = cur;
+		break;
+	}
+	return found;
+}
+
+/*
+ * Create the queue instance @queue_num owned by netlink pid @pid and add
+ * it to instance_table.
+ *
+ * Returns the new instance, or ERR_PTR(-EEXIST) if the queue number is
+ * already in use, ERR_PTR(-ENOMEM) on allocation failure, ERR_PTR(-EAGAIN)
+ * if no module reference could be taken.
+ */
+static struct nfqnl_instance *
+instance_create(u_int16_t queue_num, int pid)
+{
+	struct nfqnl_instance *inst;
+	unsigned int h;
+	int err;
+
+	spin_lock(&instances_lock);
+	if (instance_lookup(queue_num)) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	/* allocated with instances_lock held, hence GFP_ATOMIC */
+	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
+	if (!inst) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	inst->queue_num = queue_num;
+	inst->peer_pid = pid;
+	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+	inst->copy_range = 0xfffff;
+	inst->copy_mode = NFQNL_COPY_NONE;
+	spin_lock_init(&inst->lock);
+	INIT_LIST_HEAD(&inst->queue_list);
+
+	/* module reference per instance; dropped in instance_destroy_rcu() */
+	if (!try_module_get(THIS_MODULE)) {
+		err = -EAGAIN;
+		goto out_free;
+	}
+
+	h = instance_hashfn(queue_num);
+	hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
+
+	spin_unlock(&instances_lock);
+
+	return inst;
+
+out_free:
+	kfree(inst);
+out_unlock:
+	spin_unlock(&instances_lock);
+	return ERR_PTR(err);
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
+			unsigned long data);
+
+/* RCU callback scheduled by __instance_destroy(): drop every packet still
+ * queued, free the instance and release the module reference taken in
+ * instance_create(). */
+static void
+instance_destroy_rcu(struct rcu_head *head)
+{
+	struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
+						   rcu);
+
+	nfqnl_flush(inst, NULL, 0);
+	kfree(inst);
+	module_put(THIS_MODULE);
+}
+
+/* Unlink @inst from the hash table and defer its teardown to
+ * instance_destroy_rcu().  The callers in this file hold instances_lock. */
+static void
+__instance_destroy(struct nfqnl_instance *inst)
+{
+	hlist_del_rcu(&inst->hlist);
+	call_rcu(&inst->rcu, instance_destroy_rcu);
+}
+
+/* Locked wrapper: destroy @inst while holding instances_lock. */
+static void
+instance_destroy(struct nfqnl_instance *inst)
+{
+	spin_lock(&instances_lock);
+	__instance_destroy(inst);
+	spin_unlock(&instances_lock);
+}
+
+/* Append @entry to the tail of the queue and account for it.  The caller
+ * in this file (nfqnl_enqueue_packet) holds queue->lock. */
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
+{
+	queue->queue_total += 1;
+	list_add_tail(&entry->list, &queue->queue_list);
+}
+
+/* Remove and return the queued entry whose packet id is @id, or NULL when
+ * no such entry is queued.  Runs under queue->lock. */
+static struct nf_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+{
+	struct nf_queue_entry *found = NULL;
+	struct nf_queue_entry *cur;
+
+	spin_lock_bh(&queue->lock);
+
+	list_for_each_entry(cur, &queue->queue_list, list) {
+		if (cur->id != id)
+			continue;
+
+		/* unlink right away; the walk stops here, so removing the
+		 * current element is safe */
+		found = cur;
+		list_del(&found->list);
+		queue->queue_total--;
+		break;
+	}
+
+	spin_unlock_bh(&queue->lock);
+
+	return found;
+}
+
+/* Drop queued packets: every entry for which @cmpfn(entry, @data) returns
+ * non-zero - or every entry when @cmpfn is NULL - is unlinked and
+ * reinjected with an NF_DROP verdict.  Runs under queue->lock. */
+static void
+nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
+{
+	struct nf_queue_entry *entry, *next;
+
+	spin_lock_bh(&queue->lock);
+	/* _safe variant: entries are removed while walking the list */
+	list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
+		if (!cmpfn || cmpfn(entry, data)) {
+			list_del(&entry->list);
+			queue->queue_total--;
+			nf_reinject(entry, NF_DROP);
+		}
+	}
+	spin_unlock_bh(&queue->lock);
+}
+
+/*
+ * Build the NFQNL_MSG_PACKET netlink message announcing @entry to the
+ * userspace owner of @queue.
+ *
+ * A new skb sized for the worst case is allocated, a fresh packet id is
+ * assigned to entry->id, and header, device indexes, mark, hardware
+ * address, timestamp and (in NFQNL_COPY_PACKET mode) up to copy_range
+ * bytes of payload are added as attributes.
+ *
+ * Returns the message skb, or NULL on checksum, allocation or attribute
+ * failure (the partially built skb is freed).
+ */
+static struct sk_buff *
+nfqnl_build_packet_message(struct nfqnl_instance *queue,
+			   struct nf_queue_entry *entry)
+{
+	sk_buff_data_t old_tail;
+	size_t size;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct nfqnl_msg_packet_hdr pmsg;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct sk_buff *entskb = entry->skb;
+	struct net_device *indev;
+	struct net_device *outdev;
+
+	size =    NLMSG_SPACE(sizeof(struct nfgenmsg))
+		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
+#endif
+		+ nla_total_size(sizeof(u_int32_t))	/* mark */
+		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+		+ nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
+
+	outdev = entry->outdev;
+
+	/* copy_mode/copy_range are read once, without taking queue->lock */
+	switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
+	case NFQNL_COPY_META:
+	case NFQNL_COPY_NONE:
+		break;
+
+	case NFQNL_COPY_PACKET:
+		/* resolve a pending partial checksum before the payload is
+		 * copied out */
+		if (entskb->ip_summed == CHECKSUM_PARTIAL &&
+		    skb_checksum_help(entskb))
+			return NULL;
+
+		data_len = ACCESS_ONCE(queue->copy_range);
+		if (data_len == 0 || data_len > entskb->len)
+			data_len = entskb->len;
+
+		size += nla_total_size(data_len);
+		break;
+	}
+
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+
+	old_tail = skb->tail;
+	nlh = NLMSG_PUT(skb, 0, 0,
+			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+			sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+	nfmsg->nfgen_family = entry->pf;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(queue->queue_num);
+
+	entry->id = atomic_inc_return(&queue->id_sequence);
+	pmsg.packet_id 		= htonl(entry->id);
+	pmsg.hw_protocol	= entskb->protocol;
+	pmsg.hook		= entry->hook;
+
+	NLA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+	indev = entry->indev;
+	if (indev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex));
+#else
+		if (entry->pf == PF_BRIDGE) {
+			/* Case 1: indev is physical input device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
+				     htonl(indev->ifindex));
+			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by __nf_queue */
+			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
+				     htonl(br_port_get_rcu(indev)->br->dev->ifindex));
+		} else {
+			/* Case 2: indev is bridge group, we need to look for
+			 * physical device (when called from ipv4) */
+			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
+				     htonl(indev->ifindex));
+			if (entskb->nf_bridge && entskb->nf_bridge->physindev)
+				NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
+					     htonl(entskb->nf_bridge->physindev->ifindex));
+		}
+#endif
+	}
+
+	if (outdev) {
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex));
+#else
+		if (entry->pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical output device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+				     htonl(outdev->ifindex));
+			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by __nf_queue */
+			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
+				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
+		} else {
+			/* Case 2: outdev is bridge group, we need to look for
+			 * physical output device (when called from ipv4) */
+			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
+				     htonl(outdev->ifindex));
+			if (entskb->nf_bridge && entskb->nf_bridge->physoutdev)
+				NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
+					     htonl(entskb->nf_bridge->physoutdev->ifindex));
+		}
+#endif
+	}
+
+	if (entskb->mark)
+		NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark));
+
+	if (indev && entskb->dev &&
+	    entskb->mac_header != entskb->network_header) {
+		struct nfqnl_msg_packet_hw phw;
+		int len;
+
+		/* The whole struct goes to userspace but dev_parse_header()
+		 * is only asked to fill hw_addr; clear it so no stale stack
+		 * bytes are sent along. */
+		memset(&phw, 0, sizeof(phw));
+		len = dev_parse_header(entskb, phw.hw_addr);
+		/* only a positive length describes a parsed address (same
+		 * test as the nfnetlink_log counterpart) */
+		if (len > 0) {
+			phw.hw_addrlen = htons(len);
+			NLA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
+		}
+	}
+
+	if (entskb->tstamp.tv64) {
+		struct nfqnl_msg_packet_timestamp ts;
+		struct timeval tv = ktime_to_timeval(entskb->tstamp);
+		ts.sec = cpu_to_be64(tv.tv_sec);
+		ts.usec = cpu_to_be64(tv.tv_usec);
+
+		NLA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
+	}
+
+	if (data_len) {
+		struct nlattr *nla;
+		int sz = nla_attr_size(data_len);
+
+		if (skb_tailroom(skb) < nla_total_size(data_len)) {
+			printk(KERN_WARNING "nf_queue: no tailroom!\n");
+			goto nlmsg_failure;
+		}
+
+		/* The payload attribute is built by hand so the packet data
+		 * can be copied straight into the message. */
+		nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len));
+		nla->nla_type = NFQA_PAYLOAD;
+		nla->nla_len = sz;
+
+		if (skb_copy_bits(entskb, 0, nla_data(nla), data_len))
+			BUG();
+	}
+
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+nla_put_failure:
+	/* skb is NULL when the allocation itself failed; kfree_skb()
+	 * accepts that */
+	kfree_skb(skb);
+	if (net_ratelimit())
+		printk(KERN_ERR "nf_queue: error creating packet message\n");
+	return NULL;
+}
+
+/*
+ * nf_queue handler (nfqh.outfn): send @entry to the userspace owner of
+ * queue @queuenum and park it on the queue until a verdict arrives.
+ *
+ * Returns 0 on success or a negative errno: -ESRCH (no such queue),
+ * -EINVAL (copy mode NFQNL_COPY_NONE or no peer), -ENOMEM (message could
+ * not be built), -ENOBUFS (queue full), or the error returned by
+ * nfnetlink_unicast().
+ */
+static int
+nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+	struct sk_buff *nskb;
+	struct nfqnl_instance *queue;
+	int err = -ENOBUFS;
+
+	/* rcu_read_lock()ed by nf_hook_slow() */
+	queue = instance_lookup(queuenum);
+	if (!queue) {
+		err = -ESRCH;
+		goto err_out;
+	}
+
+	if (queue->copy_mode == NFQNL_COPY_NONE) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	/* the message is built before queue->lock is taken */
+	nskb = nfqnl_build_packet_message(queue, entry);
+	if (nskb == NULL) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	spin_lock_bh(&queue->lock);
+
+	if (!queue->peer_pid) {
+		err = -EINVAL;
+		goto err_out_free_nskb;
+	}
+	if (queue->queue_total >= queue->queue_maxlen) {
+		queue->queue_dropped++;
+		if (net_ratelimit())
+			  printk(KERN_WARNING "nf_queue: full at %d entries, "
+				 "dropping packets(s).\n",
+				 queue->queue_total);
+		/* err keeps its initial -ENOBUFS */
+		goto err_out_free_nskb;
+	}
+
+	/* nfnetlink_unicast will either free the nskb or add it to a socket */
+	err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT);
+	if (err < 0) {
+		queue->queue_user_dropped++;
+		goto err_out_unlock;
+	}
+
+	__enqueue_entry(queue, entry);
+
+	spin_unlock_bh(&queue->lock);
+	return 0;
+
+err_out_free_nskb:
+	kfree_skb(nskb);
+err_out_unlock:
+	spin_unlock_bh(&queue->lock);
+err_out:
+	return err;
+}
+
+/*
+ * Replace the contents of the queued packet e->skb with the @data_len
+ * bytes at @data (payload supplied with a verdict).
+ *
+ * A shorter replacement trims the skb; a longer one grows it, copying the
+ * packet into an expanded skb when the tailroom is insufficient, and is
+ * rejected when @data_len exceeds 0xFFFF.  The checksum state is reset to
+ * CHECKSUM_NONE afterwards.
+ *
+ * Returns 0 on success, -ENOMEM or -EINVAL on failure.
+ */
+static int
+nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e)
+{
+	struct sk_buff *nskb;
+	int diff;
+
+	diff = data_len - e->skb->len;
+	if (diff < 0) {
+		if (pskb_trim(e->skb, data_len))
+			return -ENOMEM;
+	} else if (diff > 0) {
+		if (data_len > 0xFFFF)
+			return -EINVAL;
+		if (diff > skb_tailroom(e->skb)) {
+			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
+					       diff, GFP_ATOMIC);
+			if (!nskb) {
+				printk(KERN_WARNING "nf_queue: OOM "
+				      "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			/* the entry now owns the expanded copy */
+			kfree_skb(e->skb);
+			e->skb = nskb;
+		}
+		skb_put(e->skb, diff);
+	}
+	if (!skb_make_writable(e->skb, data_len))
+		return -ENOMEM;
+	skb_copy_to_linear_data(e->skb, data, data_len);
+	e->skb->ip_summed = CHECKSUM_NONE;
+	return 0;
+}
+
+/*
+ * Set the copy mode of @queue under queue->lock.
+ *
+ * NFQNL_COPY_NONE and NFQNL_COPY_META reset copy_range to 0;
+ * NFQNL_COPY_PACKET stores @range capped at 0xffff.  Any other mode leaves
+ * the queue untouched and yields -EINVAL; otherwise 0 is returned.
+ */
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+	       unsigned char mode, unsigned int range)
+{
+	int rc = 0;
+
+	spin_lock_bh(&queue->lock);
+	if (mode == NFQNL_COPY_NONE || mode == NFQNL_COPY_META) {
+		queue->copy_mode = mode;
+		queue->copy_range = 0;
+	} else if (mode == NFQNL_COPY_PACKET) {
+		queue->copy_mode = mode;
+		/* we're using struct nlattr which has 16bit nla_len */
+		queue->copy_range = (range > 0xffff) ? 0xffff : range;
+	} else {
+		rc = -EINVAL;
+	}
+	spin_unlock_bh(&queue->lock);
+
+	return rc;
+}
+
+/* nfqnl_cmpfn predicate: returns 1 when @entry refers to the device with
+ * index @ifindex as input or output device (or, with bridge netfilter, as
+ * physical input/output device), 0 otherwise. */
+static int
+dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->indev && entry->indev->ifindex == ifindex)
+		return 1;
+	if (entry->outdev && entry->outdev->ifindex == ifindex)
+		return 1;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (!entry->skb->nf_bridge)
+		return 0;
+	if (entry->skb->nf_bridge->physindev &&
+	    entry->skb->nf_bridge->physindev->ifindex == ifindex)
+		return 1;
+	if (entry->skb->nf_bridge->physoutdev &&
+	    entry->skb->nf_bridge->physoutdev->ifindex == ifindex)
+		return 1;
+#endif
+	return 0;
+}
+
+/* drop all packets with either indev or outdev == ifindex from all queue
+ * instances (dev_cmp also matches the bridge physical devices).  The
+ * instance table is walked under rcu_read_lock(). */
+static void
+nfqnl_dev_drop(int ifindex)
+{
+	int i;
+
+	rcu_read_lock();
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++) {
+		struct hlist_node *tmp;
+		struct nfqnl_instance *inst;
+		struct hlist_head *head = &instance_table[i];
+
+		hlist_for_each_entry_rcu(inst, tmp, head, hlist)
+			nfqnl_flush(inst, dev_cmp, ifindex);
+	}
+
+	rcu_read_unlock();
+}
+
+/* Send a netlink ACK carrying @err and return from the calling (void)
+ * function.  Expects variables named "skb" and "nlh" in the caller's scope. */
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+/* Netdevice notifier callback: when a device in init_net goes down, drop
+ * every queued packet that references it.  Always returns NOTIFY_DONE. */
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+		    unsigned long event, void *ptr)
+{
+	struct net_device *netdev = ptr;
+
+	/* Drop any packets associated with the downed device */
+	if (net_eq(dev_net(netdev), &init_net) && event == NETDEV_DOWN)
+		nfqnl_dev_drop(netdev->ifindex);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block whose callback is nfqnl_rcv_dev_event(). */
+static struct notifier_block nfqnl_dev_notifier = {
+	.notifier_call	= nfqnl_rcv_dev_event,
+};
+
+/* Netlink notifier callback: when a NETLINK_NETFILTER socket is released,
+ * destroy every queue instance in init_net whose peer_pid is that socket's
+ * pid.  Always returns NOTIFY_DONE. */
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+		   unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
+		int i;
+
+		/* destroy all instances for this pid */
+		spin_lock(&instances_lock);
+		for (i = 0; i < INSTANCE_BUCKETS; i++) {
+			struct hlist_node *tmp, *t2;
+			struct nfqnl_instance *inst;
+			struct hlist_head *head = &instance_table[i];
+
+			/* _safe variant: entries are unlinked while walking */
+			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
+				/* namespaces are compared with net_eq(), as
+				 * in nfqnl_rcv_dev_event() */
+				if (net_eq(n->net, &init_net) &&
+				    (n->pid == inst->peer_pid))
+					__instance_destroy(inst);
+			}
+		}
+		spin_unlock(&instances_lock);
+	}
+	return NOTIFY_DONE;
+}
+
+/* Notifier block whose callback is nfqnl_rcv_nl_event(). */
+static struct notifier_block nfqnl_rtnl_notifier = {
+	.notifier_call	= nfqnl_rcv_nl_event,
+};
+
+/* Attribute policy table indexed by NFQA_* type, covering the attributes
+ * read by nfqnl_recv_verdict().  Where the table is attached is outside
+ * this view. */
+static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
+	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
+	[NFQA_MARK]		= { .type = NLA_U32 },
+	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC },
+};
+
+/*
+ * Handle a verdict message from userspace for the queue in nfmsg->res_id.
+ *
+ * The queued entry named by the verdict header's id is dequeued, optionally
+ * gets a replacement payload (NFQA_PAYLOAD) and mark (NFQA_MARK), and is
+ * reinjected with the given verdict.  A failed payload replacement turns
+ * the verdict into NF_DROP.
+ *
+ * Returns 0 on success or a negative errno: -ENODEV (no such queue),
+ * -EPERM (sender does not own the queue), -EINVAL (missing header or
+ * verdict out of range), -ENOENT (no queued packet with that id).
+ */
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+		   const struct nlmsghdr *nlh,
+		   const struct nlattr * const nfqa[])
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	struct nfqnl_instance *queue;
+	unsigned int verdict;
+	struct nf_queue_entry *entry;
+	int err;
+
+	/* the instance is only used inside this RCU read-side section */
+	rcu_read_lock();
+	queue = instance_lookup(queue_num);
+	if (!queue) {
+		err = -ENODEV;
+		goto err_out_unlock;
+	}
+
+	if (queue->peer_pid != NETLINK_CB(skb).pid) {
+		err = -EPERM;
+		goto err_out_unlock;
+	}
+
+	if (!nfqa[NFQA_VERDICT_HDR]) {
+		err = -EINVAL;
+		goto err_out_unlock;
+	}
+
+	vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
+	verdict = ntohl(vhdr->verdict);
+
+	if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
+		err = -EINVAL;
+		goto err_out_unlock;
+	}
+
+	entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+	if (entry == NULL) {
+		err = -ENOENT;
+		goto err_out_unlock;
+	}
+	/* entry is off the queue now; the instance is not touched below */
+	rcu_read_unlock();
+
+	if (nfqa[NFQA_PAYLOAD]) {
+		if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
+				 nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0)
+			verdict = NF_DROP;
+	}
+
+	if (nfqa[NFQA_MARK])
+		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+
+	nf_reinject(entry, verdict);
+	return 0;
+
+err_out_unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * Callback for message types that are not accepted from userspace (it is
+ * installed for NFQNL_MSG_PACKET in nfqnl_cb[]).  Always returns
+ * -ENOTSUPP; none of the arguments are examined.
+ */
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh,
+		  const struct nlattr * const nfqa[])
+{
+	return -ENOTSUPP;
+}
+
+/*
+ * Attribute policy attached to NFQNL_MSG_CONFIG in nfqnl_cb[].
+ *
+ * Every attribute that nfqnl_recv_config() dereferences via nla_data()
+ * needs an entry here so that a too-short attribute is rejected while the
+ * message is parsed.  NFQA_CFG_QUEUE_MAXLEN is read as a __be32, hence
+ * NLA_U32.
+ */
+static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
+	[NFQA_CFG_CMD]		= { .len = sizeof(struct nfqnl_msg_config_cmd) },
+	[NFQA_CFG_PARAMS]	= { .len = sizeof(struct nfqnl_msg_config_params) },
+	[NFQA_CFG_QUEUE_MAXLEN]	= { .type = NLA_U32 },
+};
+
+/*
+ * Queue handler descriptor: packets are handed to nfqnl_enqueue_packet().
+ * It is bound to / unbound from a protocol family by the PF_BIND and
+ * PF_UNBIND config commands and dropped from all families in
+ * nfnetlink_queue_fini().
+ */
+static const struct nf_queue_handler nfqh = {
+	.name 	= "nf_queue",
+	.outfn	= &nfqnl_enqueue_packet,
+};
+
+/*
+ * Process a configuration message for the queue named by res_id.
+ *
+ * Up to three attributes are acted on, in this order:
+ *   NFQA_CFG_CMD          - PF_BIND/PF_UNBIND are handled first and return
+ *                           immediately; BIND creates the queue instance,
+ *                           UNBIND destroys it.
+ *   NFQA_CFG_PARAMS       - sets copy mode and copy range.
+ *   NFQA_CFG_QUEUE_MAXLEN - sets the queue length limit.
+ *
+ * Returns 0 on success or a negative errno: -EPERM if an existing queue
+ * belongs to another netlink pid, -EBUSY when binding an existing queue,
+ * -ENODEV when the queue is required but missing, -ENOTSUPP for an
+ * unknown command, or whatever the register/create helpers return.
+ */
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		  const struct nlmsghdr *nlh,
+		  const struct nlattr * const nfqa[])
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+	struct nfqnl_instance *queue;
+	struct nfqnl_msg_config_cmd *cmd = NULL;
+	int ret = 0;
+
+	if (nfqa[NFQA_CFG_CMD]) {
+		cmd = nla_data(nfqa[NFQA_CFG_CMD]);
+
+		/* Commands without queue context - might sleep */
+		switch (cmd->command) {
+		case NFQNL_CFG_CMD_PF_BIND:
+			return nf_register_queue_handler(ntohs(cmd->pf),
+							 &nfqh);
+		case NFQNL_CFG_CMD_PF_UNBIND:
+			return nf_unregister_queue_handler(ntohs(cmd->pf),
+							   &nfqh);
+		}
+	}
+
+	/* Everything below runs inside one RCU read-side section. */
+	rcu_read_lock();
+	queue = instance_lookup(queue_num);
+	/* Only the pid recorded in the queue may reconfigure it. */
+	if (queue && queue->peer_pid != NETLINK_CB(skb).pid) {
+		ret = -EPERM;
+		goto err_out_unlock;
+	}
+
+	if (cmd != NULL) {
+		switch (cmd->command) {
+		case NFQNL_CFG_CMD_BIND:
+			if (queue) {
+				ret = -EBUSY;
+				goto err_out_unlock;
+			}
+			queue = instance_create(queue_num, NETLINK_CB(skb).pid);
+			if (IS_ERR(queue)) {
+				ret = PTR_ERR(queue);
+				goto err_out_unlock;
+			}
+			break;
+		case NFQNL_CFG_CMD_UNBIND:
+			if (!queue) {
+				ret = -ENODEV;
+				goto err_out_unlock;
+			}
+			instance_destroy(queue);
+			break;
+		case NFQNL_CFG_CMD_PF_BIND:
+		case NFQNL_CFG_CMD_PF_UNBIND:
+			/* Already returned from the first switch above. */
+			break;
+		default:
+			/*
+			 * Unknown command: record the error, but the remaining
+			 * attributes below are still processed.
+			 */
+			ret = -ENOTSUPP;
+			break;
+		}
+	}
+
+	if (nfqa[NFQA_CFG_PARAMS]) {
+		struct nfqnl_msg_config_params *params;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+		params = nla_data(nfqa[NFQA_CFG_PARAMS]);
+		nfqnl_set_mode(queue, params->copy_mode,
+				ntohl(params->copy_range));
+	}
+
+	if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
+		__be32 *queue_maxlen;
+
+		if (!queue) {
+			ret = -ENODEV;
+			goto err_out_unlock;
+		}
+		queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
+		/* queue_maxlen is updated under the per-queue lock. */
+		spin_lock_bh(&queue->lock);
+		queue->queue_maxlen = ntohl(*queue_maxlen);
+		spin_unlock_bh(&queue->lock);
+	}
+
+err_out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+/*
+ * Per-message-type dispatch table: the handler, the highest attribute
+ * number to parse and (where one exists) the attribute policy.
+ */
+static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+	[NFQNL_MSG_PACKET]	= { .call = nfqnl_recv_unsupp,
+				    .attr_count = NFQA_MAX, },
+	[NFQNL_MSG_VERDICT]	= { .call = nfqnl_recv_verdict,
+				    .attr_count = NFQA_MAX,
+				    .policy = nfqa_verdict_policy },
+	[NFQNL_MSG_CONFIG]	= { .call = nfqnl_recv_config,
+				    .attr_count = NFQA_CFG_MAX,
+				    .policy = nfqa_cfg_policy },
+};
+
+/*
+ * nfnetlink subsystem descriptor for the queue subsystem; registered in
+ * nfnetlink_queue_init() with nfqnl_cb[] as its callback table.
+ */
+static const struct nfnetlink_subsystem nfqnl_subsys = {
+	.name		= "nf_queue",
+	.subsys_id	= NFNL_SUBSYS_QUEUE,
+	.cb_count	= NFQNL_MSG_MAX,
+	.cb		= nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+/* Private seq_file iterator state: the instance_table bucket being walked. */
+struct iter_state {
+	unsigned int bucket;
+};
+
+/*
+ * Position the iterator on the first non-empty bucket of instance_table
+ * and return that bucket's first node, or NULL if there is no iterator
+ * state or every bucket is empty.
+ */
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+	struct iter_state *st = seq->private;
+
+	if (!st)
+		return NULL;
+
+	st->bucket = 0;
+	while (st->bucket < INSTANCE_BUCKETS) {
+		struct hlist_head *head = &instance_table[st->bucket];
+
+		if (!hlist_empty(head))
+			return head->first;
+		st->bucket++;
+	}
+	return NULL;
+}
+
+/*
+ * Return the node following @h, moving on to later buckets when the
+ * current chain is exhausted.  Returns NULL after the last bucket.
+ */
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+	struct iter_state *st = seq->private;
+	struct hlist_node *node = h->next;
+
+	for (; node == NULL; node = instance_table[st->bucket].first) {
+		if (++st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+	}
+	return node;
+}
+
+/*
+ * Return the node at zero-based position @pos in iteration order, or
+ * NULL if fewer than @pos + 1 nodes exist.
+ */
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *node = get_first(seq);
+
+	while (node && pos) {
+		node = get_next(seq, node);
+		if (node)
+			pos--;
+	}
+
+	if (pos)
+		return NULL;
+	return node;
+}
+
+/*
+ * seq_file ->start: take instances_lock (released in seq_stop()) and
+ * return the node at position *pos, or NULL if there is none.
+ */
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(instances_lock)
+{
+	spin_lock(&instances_lock);
+	return get_idx(seq, *pos);
+}
+
+/* seq_file ->next: bump *pos and return the node after @v (NULL at the end). */
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return get_next(s, v);
+}
+
+/* seq_file ->stop: drop the instances_lock taken in seq_start(). */
+static void seq_stop(struct seq_file *s, void *v)
+	__releases(instances_lock)
+{
+	spin_unlock(&instances_lock);
+}
+
+/*
+ * seq_file ->show: print one line per queue instance: queue number, peer
+ * pid, queue total, copy mode, copy range, the two drop counters, the
+ * current id sequence and a constant 1.
+ *
+ * NOTE(review): @v is the hlist_node pointer produced by get_first() /
+ * get_next() and is used directly as the instance pointer; this relies on
+ * the hlist member sitting at offset 0 of struct nfqnl_instance - confirm
+ * against the struct definition.
+ */
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nfqnl_instance *inst = v;
+
+	return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
+			  inst->queue_num,
+			  inst->peer_pid, inst->queue_total,
+			  inst->copy_mode, inst->copy_range,
+			  inst->queue_dropped, inst->queue_user_dropped,
+			  atomic_read(&inst->id_sequence), 1);
+}
+
+/* seq_file iterator callbacks for the nfnetlink_queue proc entry. */
+static const struct seq_operations nfqnl_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+/*
+ * ->open for the proc entry: set up a seq_file using nfqnl_seq_ops with a
+ * private area sized for struct iter_state.  Returns the result of
+ * seq_open_private().
+ */
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+	return seq_open_private(file, &nfqnl_seq_ops,
+			sizeof(struct iter_state));
+}
+
+/*
+ * File operations of the "nfnetlink_queue" proc entry; release uses
+ * seq_release_private to pair with seq_open_private() in nfqnl_open().
+ */
+static const struct file_operations nfqnl_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nfqnl_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+/*
+ * Module init: initialise the instance hash table, register the netlink
+ * notifier, the nfnetlink subsystem, the proc entry (when CONFIG_PROC_FS
+ * is set) and finally the netdevice notifier.
+ *
+ * Returns 0 on success or a negative errno after undoing the steps that
+ * had already succeeded.
+ */
+static int __init nfnetlink_queue_init(void)
+{
+	int i, status = -ENOMEM;
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		INIT_HLIST_HEAD(&instance_table[i]);
+
+	netlink_register_notifier(&nfqnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfqnl_subsys);
+	if (status < 0) {
+		printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_create("nfnetlink_queue", 0440,
+			 proc_net_netfilter, &nfqnl_file_ops)) {
+		/*
+		 * status still holds 0 from the successful subsystem
+		 * registration; without this the module would report
+		 * success after tearing everything down again.
+		 */
+		status = -ENOMEM;
+		goto cleanup_subsys;
+	}
+#endif
+
+	register_netdevice_notifier(&nfqnl_dev_notifier);
+	return status;
+
+#ifdef CONFIG_PROC_FS
+cleanup_subsys:
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
+#endif
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+	return status;
+}
+
+/*
+ * Module exit: unregister the queue handler from all families, then undo
+ * the registrations made in nfnetlink_queue_init() in reverse order and
+ * wait for outstanding RCU callbacks before returning.
+ */
+static void __exit nfnetlink_queue_fini(void)
+{
+	nf_unregister_queue_handlers(&nfqh);
+	unregister_netdevice_notifier(&nfqnl_dev_notifier);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
+#endif
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
+	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+/* Module metadata and entry points. */
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(nfnetlink_queue_init);
+module_exit(nfnetlink_queue_fini);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
new file mode 100644
index 00000000..b0869fe3
--- /dev/null
+++ b/net/netfilter/x_tables.c
@@ -0,0 +1,1395 @@
+/*
+ * x_tables core - Backend for {ip,ip6,arp}_tables
+ *
+ * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org>
+ *
+ * Based on existing ip_tables code which is
+ *   Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ *   Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/audit.h>
+#include <net/net_namespace.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_arp/arp_tables.h>
+
+/* Module metadata. */
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
+
+/* Round x up to the next multiple of SMP_CACHE_BYTES. */
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+/*
+ * One entry of the compat offset table.  xt_compat_add_offset() stores the
+ * running sum of deltas here, so delta is cumulative up to and including
+ * this entry.
+ */
+struct compat_delta {
+	unsigned int offset; /* offset in kernel */
+	int delta; /* delta in 32bit user land */
+};
+
+/*
+ * Per-family state; the global xt pointer is indexed by family number.
+ * mutex is taken around every access to the match and target lists in
+ * this file and around the per-net table list of the family.
+ */
+struct xt_af {
+	struct mutex mutex;
+	struct list_head match;
+	struct list_head target;
+#ifdef CONFIG_COMPAT
+	struct mutex compat_mutex;
+	struct compat_delta *compat_tab;
+	unsigned int number; /* number of slots in compat_tab[] */
+	unsigned int cur; /* number of used slots in compat_tab[] */
+#endif
+};
+
+/* Array of per-family state, indexed by family (u_int8_t af) throughout. */
+static struct xt_af *xt;
+
+/*
+ * Per-family name prefix.  Used to build module names ("%st_%s") for
+ * request_module() and the "%s_tables" tag in error messages.
+ */
+static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
+	[NFPROTO_UNSPEC] = "x",
+	[NFPROTO_IPV4]   = "ip",
+	[NFPROTO_ARP]    = "arp",
+	[NFPROTO_BRIDGE] = "eb",
+	[NFPROTO_IPV6]   = "ip6",
+};
+
+/*
+ * Allow this many total (re)entries: xt_jumpstack_alloc() multiplies the
+ * table's stacksize by this factor before sizing the per-CPU jump stacks.
+ */
+static const unsigned int xt_jumpstack_multiplier = 2;
+
+/*
+ * Registration hooks for targets.
+ *
+ * Add @target to the target list of its family.  Returns 0 on success or
+ * the error from mutex_lock_interruptible() if the wait was interrupted.
+ */
+int
+xt_register_target(struct xt_target *target)
+{
+	u_int8_t af = target->family;
+	struct xt_af *afinfo = &xt[af];
+	int ret = mutex_lock_interruptible(&afinfo->mutex);
+
+	if (ret == 0) {
+		list_add(&target->list, &afinfo->target);
+		mutex_unlock(&afinfo->mutex);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(xt_register_target);
+
+/* Remove @target from the target list of its family, under that family's mutex. */
+void
+xt_unregister_target(struct xt_target *target)
+{
+	u_int8_t af = target->family;
+
+	mutex_lock(&xt[af].mutex);
+	list_del(&target->list);
+	mutex_unlock(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_target);
+
+/*
+ * Register the @n targets in array @target.  If one registration fails,
+ * the ones already registered are unregistered again and the error is
+ * returned; otherwise returns 0.
+ */
+int
+xt_register_targets(struct xt_target *target, unsigned int n)
+{
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < n; i++) {
+		err = xt_register_target(&target[i]);
+		if (err == 0)
+			continue;
+
+		/* Roll back the i entries registered so far. */
+		if (i > 0)
+			xt_unregister_targets(target, i);
+		return err;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(xt_register_targets);
+
+/* Unregister the first @n targets of array @target, last one first. */
+void
+xt_unregister_targets(struct xt_target *target, unsigned int n)
+{
+	for (; n > 0; n--)
+		xt_unregister_target(&target[n - 1]);
+}
+EXPORT_SYMBOL(xt_unregister_targets);
+
+/*
+ * Add @match to the match list of its family.  Returns 0 on success or
+ * the error from mutex_lock_interruptible() if the wait was interrupted.
+ */
+int
+xt_register_match(struct xt_match *match)
+{
+	u_int8_t af = match->family;
+	struct xt_af *afinfo = &xt[af];
+	int ret = mutex_lock_interruptible(&afinfo->mutex);
+
+	if (ret == 0) {
+		list_add(&match->list, &afinfo->match);
+		mutex_unlock(&afinfo->mutex);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(xt_register_match);
+
+/* Remove @match from the match list of its family, under that family's mutex. */
+void
+xt_unregister_match(struct xt_match *match)
+{
+	u_int8_t af = match->family;
+
+	mutex_lock(&xt[af].mutex);
+	list_del(&match->list);
+	mutex_unlock(&xt[af].mutex);
+}
+EXPORT_SYMBOL(xt_unregister_match);
+
+/*
+ * Register the @n matches in array @match.  If one registration fails,
+ * the ones already registered are unregistered again and the error is
+ * returned; otherwise returns 0.
+ */
+int
+xt_register_matches(struct xt_match *match, unsigned int n)
+{
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < n; i++) {
+		err = xt_register_match(&match[i]);
+		if (err == 0)
+			continue;
+
+		/* Roll back the i entries registered so far. */
+		if (i > 0)
+			xt_unregister_matches(match, i);
+		return err;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(xt_register_matches);
+
+/* Unregister the first @n matches of array @match, last one first. */
+void
+xt_unregister_matches(struct xt_match *match, unsigned int n)
+{
+	for (; n > 0; n--)
+		xt_unregister_match(&match[n - 1]);
+}
+EXPORT_SYMBOL(xt_unregister_matches);
+
+
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use.
+ */
+
+/*
+ * Find match, grabs ref.  Returns ERR_PTR() on error.
+ *
+ * The list of family @af is searched first; if nothing usable is found
+ * there and @af is not NFPROTO_UNSPEC, the family-independent list is
+ * searched and its result returned.  Errors: -EINTR if the mutex wait was
+ * interrupted, -EPROTOTYPE if the name exists only with other revisions,
+ * -ENOENT otherwise.
+ */
+struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
+{
+	struct xt_match *m;
+	int err = -ENOENT;
+
+	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(m, &xt[af].match, list) {
+		if (strcmp(m->name, name) != 0)
+			continue;
+
+		if (m->revision != revision) {
+			err = -EPROTOTYPE; /* Found something. */
+			continue;
+		}
+
+		if (try_module_get(m->me)) {
+			mutex_unlock(&xt[af].mutex);
+			return m;
+		}
+	}
+	mutex_unlock(&xt[af].mutex);
+
+	/* Try searching again in the family-independent list */
+	if (af != NFPROTO_UNSPEC)
+		return xt_find_match(NFPROTO_UNSPEC, name, revision);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xt_find_match);
+
+/*
+ * Like xt_find_match(), but if the first lookup fails, ask for the module
+ * "<prefix>t_<name>" to be loaded and look once more.  Returns the result
+ * of the last lookup (a match with a reference held, or an ERR_PTR()).
+ */
+struct xt_match *
+xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision)
+{
+	struct xt_match *match = xt_find_match(nfproto, name, revision);
+
+	if (!IS_ERR(match))
+		return match;
+
+	request_module("%st_%s", xt_prefix[nfproto], name);
+	return xt_find_match(nfproto, name, revision);
+}
+EXPORT_SYMBOL_GPL(xt_request_find_match);
+
+/*
+ * Find target, grabs ref.  Returns ERR_PTR() on error.
+ *
+ * The list of family @af is searched first; if nothing usable is found
+ * there and @af is not NFPROTO_UNSPEC, the family-independent list is
+ * searched and its result returned.  Errors: -EINTR if the mutex wait was
+ * interrupted, -EPROTOTYPE if the name exists only with other revisions,
+ * -ENOENT otherwise.
+ */
+struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
+{
+	struct xt_target *t;
+	int err = -ENOENT;
+
+	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &xt[af].target, list) {
+		if (strcmp(t->name, name) != 0)
+			continue;
+
+		if (t->revision != revision) {
+			err = -EPROTOTYPE; /* Found something. */
+			continue;
+		}
+
+		if (try_module_get(t->me)) {
+			mutex_unlock(&xt[af].mutex);
+			return t;
+		}
+	}
+	mutex_unlock(&xt[af].mutex);
+
+	/* Try searching again in the family-independent list */
+	if (af != NFPROTO_UNSPEC)
+		return xt_find_target(NFPROTO_UNSPEC, name, revision);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xt_find_target);
+
+/*
+ * Like xt_find_target(), but if the first lookup fails, ask for the
+ * module "<prefix>t_<name>" to be loaded and look once more.  Returns the
+ * result of the last lookup (a target with a reference held, or an
+ * ERR_PTR()).
+ */
+struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
+{
+	struct xt_target *target = xt_find_target(af, name, revision);
+
+	if (!IS_ERR(target))
+		return target;
+
+	request_module("%st_%s", xt_prefix[af], name);
+	return xt_find_target(af, name, revision);
+}
+EXPORT_SYMBOL_GPL(xt_request_find_target);
+
+/*
+ * Scan the match list of family @af for entries called @name; if @af is
+ * not NFPROTO_UNSPEC and @revision was not found there, continue with the
+ * NFPROTO_UNSPEC list.
+ *
+ * Each list is walked under the mutex of the family it belongs to.  The
+ * caller must not hold either mutex.
+ *
+ * *@bestp is raised to the highest revision seen.  Returns 1 if the exact
+ * revision exists, 0 if not, and -EINTR if waiting for a mutex was
+ * interrupted.
+ */
+static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
+{
+	const struct xt_match *m;
+	int have_rev = 0;
+
+	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
+		return -EINTR;
+	list_for_each_entry(m, &xt[af].match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision > *bestp)
+				*bestp = m->revision;
+			if (m->revision == revision)
+				have_rev = 1;
+		}
+	}
+	mutex_unlock(&xt[af].mutex);
+
+	if (af != NFPROTO_UNSPEC && !have_rev)
+		return match_revfn(NFPROTO_UNSPEC, name, revision, bestp);
+
+	return have_rev;
+}
+
+/*
+ * Target counterpart of match_revfn(): same locking rules, same return
+ * values, operating on the target lists.
+ */
+static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
+{
+	const struct xt_target *t;
+	int have_rev = 0;
+
+	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
+		return -EINTR;
+	list_for_each_entry(t, &xt[af].target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision > *bestp)
+				*bestp = t->revision;
+			if (t->revision == revision)
+				have_rev = 1;
+		}
+	}
+	mutex_unlock(&xt[af].mutex);
+
+	if (af != NFPROTO_UNSPEC && !have_rev)
+		return target_revfn(NFPROTO_UNSPEC, name, revision, bestp);
+
+	return have_rev;
+}
+
+/*
+ * Returns true or false (if no such extension at all).
+ *
+ * @target selects the target lists (1) or the match lists (anything
+ * else).  On return *@err holds:
+ *   -EINTR            the mutex wait was interrupted (returns 1),
+ *   -ENOENT           no extension of that name exists (returns 0),
+ *   -EPROTONOSUPPORT  the name exists but not in @revision (returns 1),
+ *   best revision     the requested revision exists (returns 1).
+ *
+ * Locking is done by the helpers so that the NFPROTO_UNSPEC list is
+ * protected by its own mutex while it is searched.
+ */
+int xt_find_revision(u8 af, const char *name, u8 revision, int target,
+		     int *err)
+{
+	int have_rev, best = -1;
+
+	if (target == 1)
+		have_rev = target_revfn(af, name, revision, &best);
+	else
+		have_rev = match_revfn(af, name, revision, &best);
+
+	if (have_rev < 0) {
+		*err = have_rev;
+		return 1;
+	}
+
+	/* Nothing at all?  Return 0 to try loading module. */
+	if (best == -1) {
+		*err = -ENOENT;
+		return 0;
+	}
+
+	*err = best;
+	if (!have_rev)
+		*err = -EPROTONOSUPPORT;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(xt_find_revision);
+
+/*
+ * Write the names of the hooks whose bits are set in @mask into @buf,
+ * separated by '/', and return @buf.  The result is always
+ * NUL-terminated when @size is non-zero; if @buf is too small the list is
+ * cut short rather than written past the end.
+ */
+static char *textify_hooks(char *buf, size_t size, unsigned int mask)
+{
+	static const char *const names[] = {
+		"PREROUTING", "INPUT", "FORWARD",
+		"OUTPUT", "POSTROUTING", "BROUTING",
+	};
+	unsigned int i;
+	char *p = buf;
+	bool np = false;
+	int res;
+
+	if (size == 0)
+		return buf;
+
+	*p = '\0';
+	for (i = 0; i < ARRAY_SIZE(names); ++i) {
+		if (!(mask & (1 << i)))
+			continue;
+		res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]);
+		/*
+		 * snprintf() reports the length it wanted to write.  Once
+		 * that reaches size the output was truncated and the buffer
+		 * is full; subtracting it would wrap size around.
+		 */
+		if (res > 0 && (size_t)res >= size)
+			break;
+		if (res > 0) {
+			size -= res;
+			p += res;
+		}
+		np = true;
+	}
+
+	return buf;
+}
+
+/*
+ * Validate a match against the rule it is used in.
+ *
+ * Checks, in order: the user-supplied @size against the match's aligned
+ * matchsize, the table name, the hook mask, the protocol (@proto must
+ * equal the match's protocol and @inv_proto must be false when the match
+ * names one), and finally the match's own checkentry() hook.
+ *
+ * Returns 0 if everything passes, -EINVAL for a failed generic check, the
+ * negative value returned by checkentry(), or -EIO if checkentry()
+ * returned a positive value.
+ */
+int xt_check_match(struct xt_mtchk_param *par,
+		   unsigned int size, u_int8_t proto, bool inv_proto)
+{
+	int ret;
+
+	if (XT_ALIGN(par->match->matchsize) != size &&
+	    par->match->matchsize != -1) {
+		/*
+		 * ebt_among is exempt from centralized matchsize checking
+		 * because it uses a dynamic-size data set.
+		 */
+		pr_err("%s_tables: %s.%u match: invalid size "
+		       "%u (kernel) != (user) %u\n",
+		       xt_prefix[par->family], par->match->name,
+		       par->match->revision,
+		       XT_ALIGN(par->match->matchsize), size);
+		return -EINVAL;
+	}
+	if (par->match->table != NULL &&
+	    strcmp(par->match->table, par->table) != 0) {
+		pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
+		       xt_prefix[par->family], par->match->name,
+		       par->match->table, par->table);
+		return -EINVAL;
+	}
+	/* Reject use from any hook the match does not list. */
+	if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
+		char used[64], allow[64];
+
+		pr_err("%s_tables: %s match: used from hooks %s, but only "
+		       "valid from %s\n",
+		       xt_prefix[par->family], par->match->name,
+		       textify_hooks(used, sizeof(used), par->hook_mask),
+		       textify_hooks(allow, sizeof(allow), par->match->hooks));
+		return -EINVAL;
+	}
+	if (par->match->proto && (par->match->proto != proto || inv_proto)) {
+		pr_err("%s_tables: %s match: only valid for protocol %u\n",
+		       xt_prefix[par->family], par->match->name,
+		       par->match->proto);
+		return -EINVAL;
+	}
+	if (par->match->checkentry != NULL) {
+		ret = par->match->checkentry(par);
+		if (ret < 0)
+			return ret;
+		else if (ret > 0)
+			/* Flag up potential errors. */
+			return -EIO;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_match);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Append an (offset, delta) pair to the compat table of family @af.
+ *
+ * The table is allocated on first use with the slot count previously set
+ * by xt_compat_init_offsets().  The stored delta is cumulative: it is
+ * @delta plus the delta of the previous entry.
+ *
+ * Returns 0 on success, -EINVAL if no slot count was set or the table is
+ * full, -ENOMEM if the allocation fails.
+ *
+ * NOTE(review): no locking is done here; presumably the caller holds the
+ * family's compat_mutex (see xt_compat_lock()) - confirm against callers.
+ */
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
+{
+	struct xt_af *xp = &xt[af];
+
+	if (!xp->compat_tab) {
+		if (!xp->number)
+			return -EINVAL;
+		xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
+		if (!xp->compat_tab)
+			return -ENOMEM;
+		xp->cur = 0;
+	}
+
+	if (xp->cur >= xp->number)
+		return -EINVAL;
+
+	/* Keep a running total so lookups need only one table entry. */
+	if (xp->cur)
+		delta += xp->compat_tab[xp->cur - 1].delta;
+	xp->compat_tab[xp->cur].offset = offset;
+	xp->compat_tab[xp->cur].delta = delta;
+	xp->cur++;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_add_offset);
+
+/*
+ * Free the compat offset table of family @af and reset its slot
+ * counters.  Does nothing (the counters included) if no table is
+ * allocated.
+ */
+void xt_compat_flush_offsets(u_int8_t af)
+{
+	struct xt_af *xp = &xt[af];
+
+	if (!xp->compat_tab)
+		return;
+
+	vfree(xp->compat_tab);
+	xp->compat_tab = NULL;
+	xp->number = 0;
+	xp->cur = 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
+
+/*
+ * Return the cumulative delta of all compat table entries of family @af
+ * whose offset is strictly below @offset, or 0 if there are none.
+ *
+ * This is a binary search over the used slots, so it relies on the
+ * entries having been added in ascending offset order.
+ */
+int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
+{
+	struct compat_delta *tmp = xt[af].compat_tab;
+	int mid, left = 0, right = xt[af].cur - 1;
+
+	while (left <= right) {
+		mid = (left + right) >> 1;
+		if (offset > tmp[mid].offset)
+			left = mid + 1;
+		else if (offset < tmp[mid].offset)
+			right = mid - 1;
+		else
+			/* Exact hit: the entry itself is not counted. */
+			return mid ? tmp[mid - 1].delta : 0;
+	}
+	/* No exact hit: left is the number of entries below @offset. */
+	return left ? tmp[left - 1].delta : 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
+
+/*
+ * Record how many slots the compat table of family @af will need and
+ * reset the used-slot counter.  The table itself is allocated lazily by
+ * xt_compat_add_offset().
+ */
+void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+{
+	xt[af].number = number;
+	xt[af].cur = 0;
+}
+EXPORT_SYMBOL(xt_compat_init_offsets);
+
+/*
+ * Size difference between the native and the compat layout of @match's
+ * data: aligned matchsize minus compat-aligned compatsize (matchsize is
+ * used when compatsize is 0).
+ */
+int xt_compat_match_offset(const struct xt_match *match)
+{
+	u_int16_t csize = match->compatsize ? : match->matchsize;
+	return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize);
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_offset);
+
+/*
+ * Convert one match entry from compat layout to native layout.
+ *
+ * @m points at the compat entry (its u.kernel.match already identifies
+ * the match), *@dstptr at the destination.  The header is copied, the
+ * data converted by the match's compat_from_user() hook or copied
+ * verbatim, the alignment padding zeroed and match_size enlarged by the
+ * layout difference.  *@size grows by that difference and *@dstptr is
+ * advanced past the converted entry.  Always returns 0.
+ */
+int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+			      unsigned int *size)
+{
+	const struct xt_match *match = m->u.kernel.match;
+	struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
+	int pad, off = xt_compat_match_offset(match);
+	u_int16_t msize = cm->u.user.match_size;
+
+	/* From here on m is the destination, cm the source. */
+	m = *dstptr;
+	memcpy(m, cm, sizeof(*cm));
+	if (match->compat_from_user)
+		match->compat_from_user(m->data, cm->data);
+	else
+		memcpy(m->data, cm->data, msize - sizeof(*cm));
+	pad = XT_ALIGN(match->matchsize) - match->matchsize;
+	if (pad > 0)
+		memset(m->data + match->matchsize, 0, pad);
+
+	msize += off;
+	m->u.user.match_size = msize;
+
+	*size += off;
+	*dstptr += msize;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
+
+/*
+ * Copy one native match entry to userspace in compat layout.
+ *
+ * The header is copied, then match_size (reduced by the layout
+ * difference) and the match name are written over it; the data is
+ * converted by the match's compat_to_user() hook or copied verbatim.
+ * *@size shrinks by the layout difference and *@dstptr is advanced past
+ * the entry.
+ *
+ * Returns 0 on success, -EFAULT if any user copy fails.
+ */
+int xt_compat_match_to_user(const struct xt_entry_match *m,
+			    void __user **dstptr, unsigned int *size)
+{
+	const struct xt_match *match = m->u.kernel.match;
+	struct compat_xt_entry_match __user *cm = *dstptr;
+	int off = xt_compat_match_offset(match);
+	u_int16_t msize = m->u.user.match_size - off;
+
+	if (copy_to_user(cm, m, sizeof(*cm)) ||
+	    put_user(msize, &cm->u.user.match_size) ||
+	    copy_to_user(cm->u.user.name, m->u.kernel.match->name,
+			 strlen(m->u.kernel.match->name) + 1))
+		return -EFAULT;
+
+	if (match->compat_to_user) {
+		if (match->compat_to_user((void __user *)cm->data, m->data))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(cm->data, m->data, msize - sizeof(*cm)))
+			return -EFAULT;
+	}
+
+	*size -= off;
+	*dstptr += msize;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+#endif /* CONFIG_COMPAT */
+
+/*
+ * Validate a target against the rule it is used in.
+ *
+ * Checks, in order: the user-supplied @size against the target's aligned
+ * targetsize, the table name, the hook mask, the protocol (@proto must
+ * equal the target's protocol and @inv_proto must be false when the
+ * target names one), and finally the target's own checkentry() hook.
+ *
+ * Returns 0 if everything passes, -EINVAL for a failed generic check, the
+ * negative value returned by checkentry(), or -EIO if checkentry()
+ * returned a positive value.
+ */
+int xt_check_target(struct xt_tgchk_param *par,
+		    unsigned int size, u_int8_t proto, bool inv_proto)
+{
+	int ret;
+
+	if (XT_ALIGN(par->target->targetsize) != size) {
+		pr_err("%s_tables: %s.%u target: invalid size "
+		       "%u (kernel) != (user) %u\n",
+		       xt_prefix[par->family], par->target->name,
+		       par->target->revision,
+		       XT_ALIGN(par->target->targetsize), size);
+		return -EINVAL;
+	}
+	if (par->target->table != NULL &&
+	    strcmp(par->target->table, par->table) != 0) {
+		pr_err("%s_tables: %s target: only valid in %s table, not %s\n",
+		       xt_prefix[par->family], par->target->name,
+		       par->target->table, par->table);
+		return -EINVAL;
+	}
+	/* Reject use from any hook the target does not list. */
+	if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
+		char used[64], allow[64];
+
+		pr_err("%s_tables: %s target: used from hooks %s, but only "
+		       "usable from %s\n",
+		       xt_prefix[par->family], par->target->name,
+		       textify_hooks(used, sizeof(used), par->hook_mask),
+		       textify_hooks(allow, sizeof(allow), par->target->hooks));
+		return -EINVAL;
+	}
+	if (par->target->proto && (par->target->proto != proto || inv_proto)) {
+		pr_err("%s_tables: %s target: only valid for protocol %u\n",
+		       xt_prefix[par->family], par->target->name,
+		       par->target->proto);
+		return -EINVAL;
+	}
+	if (par->target->checkentry != NULL) {
+		ret = par->target->checkentry(par);
+		if (ret < 0)
+			return ret;
+		else if (ret > 0)
+			/* Flag up potential errors. */
+			return -EIO;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_check_target);
+
+#ifdef CONFIG_COMPAT
+/*
+ * Size difference between the native and the compat layout of @target's
+ * data: aligned targetsize minus compat-aligned compatsize (targetsize is
+ * used when compatsize is 0).
+ */
+int xt_compat_target_offset(const struct xt_target *target)
+{
+	u_int16_t csize = target->compatsize ? : target->targetsize;
+	return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize);
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_offset);
+
+/*
+ * Convert one target entry from compat layout to native layout.
+ *
+ * @t points at the compat entry (its u.kernel.target already identifies
+ * the target), *@dstptr at the destination.  The header is copied, the
+ * data converted by the target's compat_from_user() hook or copied
+ * verbatim, the alignment padding zeroed and target_size enlarged by the
+ * layout difference.  *@size grows by that difference and *@dstptr is
+ * advanced past the converted entry.
+ */
+void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
+				unsigned int *size)
+{
+	const struct xt_target *target = t->u.kernel.target;
+	struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
+	int pad, off = xt_compat_target_offset(target);
+	u_int16_t tsize = ct->u.user.target_size;
+
+	/* From here on t is the destination, ct the source. */
+	t = *dstptr;
+	memcpy(t, ct, sizeof(*ct));
+	if (target->compat_from_user)
+		target->compat_from_user(t->data, ct->data);
+	else
+		memcpy(t->data, ct->data, tsize - sizeof(*ct));
+	pad = XT_ALIGN(target->targetsize) - target->targetsize;
+	if (pad > 0)
+		memset(t->data + target->targetsize, 0, pad);
+
+	tsize += off;
+	t->u.user.target_size = tsize;
+
+	*size += off;
+	*dstptr += tsize;
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_from_user);
+
+/*
+ * Copy one native target entry to userspace in compat layout.
+ *
+ * The header is copied, then target_size (reduced by the layout
+ * difference) and the target name are written over it; the data is
+ * converted by the target's compat_to_user() hook or copied verbatim.
+ * *@size shrinks by the layout difference and *@dstptr is advanced past
+ * the entry.
+ *
+ * Returns 0 on success, -EFAULT if any user copy fails.
+ */
+int xt_compat_target_to_user(const struct xt_entry_target *t,
+			     void __user **dstptr, unsigned int *size)
+{
+	const struct xt_target *target = t->u.kernel.target;
+	struct compat_xt_entry_target __user *ct = *dstptr;
+	int off = xt_compat_target_offset(target);
+	u_int16_t tsize = t->u.user.target_size - off;
+
+	if (copy_to_user(ct, t, sizeof(*ct)) ||
+	    put_user(tsize, &ct->u.user.target_size) ||
+	    copy_to_user(ct->u.user.name, t->u.kernel.target->name,
+			 strlen(t->u.kernel.target->name) + 1))
+		return -EFAULT;
+
+	if (target->compat_to_user) {
+		if (target->compat_to_user((void __user *)ct->data, t->data))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct)))
+			return -EFAULT;
+	}
+
+	*size -= off;
+	*dstptr += tsize;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
+#endif
+
+/*
+ * Allocate a table info structure with one @size-byte entries area per
+ * possible CPU, each on that CPU's memory node.  Areas up to PAGE_SIZE
+ * come from kmalloc, larger ones from vmalloc; xt_free_table_info() uses
+ * the same threshold to pick the matching free routine.
+ *
+ * Returns the new structure, or NULL if @size is implausibly large or an
+ * allocation fails (anything already allocated is released).
+ */
+struct xt_table_info *xt_alloc_table_info(unsigned int size)
+{
+	struct xt_table_info *newinfo;
+	int cpu;
+
+	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
+		return NULL;
+
+	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+
+	newinfo->size = size;
+
+	for_each_possible_cpu(cpu) {
+		if (size <= PAGE_SIZE)
+			newinfo->entries[cpu] = kmalloc_node(size,
+							GFP_KERNEL,
+							cpu_to_node(cpu));
+		else
+			newinfo->entries[cpu] = vmalloc_node(size,
+							cpu_to_node(cpu));
+
+		if (newinfo->entries[cpu] == NULL) {
+			/* newinfo was zeroed, so untouched slots are NULL. */
+			xt_free_table_info(newinfo);
+			return NULL;
+		}
+	}
+
+	return newinfo;
+}
+EXPORT_SYMBOL(xt_alloc_table_info);
+
+/*
+ * Release everything hanging off @info: the per-CPU entries areas, the
+ * per-CPU jump stacks and their pointer array, the per-CPU stack pointer
+ * and the structure itself.  Each kfree/vfree choice repeats the size
+ * test used when the object was allocated.
+ */
+void xt_free_table_info(struct xt_table_info *info)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+
+	/* jumpstack is only set once xt_jumpstack_alloc() has run. */
+	if (info->jumpstack != NULL) {
+		if (sizeof(void *) * info->stacksize > PAGE_SIZE) {
+			for_each_possible_cpu(cpu)
+				vfree(info->jumpstack[cpu]);
+		} else {
+			for_each_possible_cpu(cpu)
+				kfree(info->jumpstack[cpu]);
+		}
+	}
+
+	if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE)
+		vfree(info->jumpstack);
+	else
+		kfree(info->jumpstack);
+
+	free_percpu(info->stackptr);
+
+	kfree(info);
+}
+EXPORT_SYMBOL(xt_free_table_info);
+
+/*
+ * Find table by name, grabs mutex & ref.
+ *
+ * On success the table is returned with xt[af].mutex still held and a
+ * reference on its owning module; release the mutex with
+ * xt_table_unlock().  Returns NULL (mutex released) if no such table
+ * exists or its module reference could not be taken, and
+ * ERR_PTR(-EINTR) if waiting for the mutex was interrupted.
+ */
+struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
+				    const char *name)
+{
+	struct xt_table *t;
+
+	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &net->xt.tables[af], list)
+		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+			return t;
+	mutex_unlock(&xt[af].mutex);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(xt_find_table_lock);
+
+/* Release the family mutex that xt_find_table_lock() left held for @table. */
+void xt_table_unlock(struct xt_table *table)
+{
+	mutex_unlock(&xt[table->af].mutex);
+}
+EXPORT_SYMBOL_GPL(xt_table_unlock);
+
+#ifdef CONFIG_COMPAT
+/* Take the compat mutex of family @af. */
+void xt_compat_lock(u_int8_t af)
+{
+	mutex_lock(&xt[af].compat_mutex);
+}
+EXPORT_SYMBOL_GPL(xt_compat_lock);
+
+/* Release the compat mutex of family @af. */
+void xt_compat_unlock(u_int8_t af)
+{
+	mutex_unlock(&xt[af].compat_mutex);
+}
+EXPORT_SYMBOL_GPL(xt_compat_unlock);
+#endif
+
+/*
+ * Per-CPU sequence counter, exported for use by other modules.
+ * NOTE(review): presumably the counter the table traversal code bumps so
+ * that counter readers can detect concurrent updates - confirm in
+ * x_tables.h.
+ */
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+
+/*
+ * Allocate the jump stack machinery of table info @i: the per-CPU stack
+ * pointer, the array of per-CPU stack pointers (zeroed) and one stack of
+ * i->stacksize * xt_jumpstack_multiplier pointers per possible CPU.
+ * i->stacksize is updated to the multiplied value.
+ *
+ * Returns 0 on success or -ENOMEM.  Nothing is freed here on failure;
+ * that is left to xt_free_table_info().
+ */
+static int xt_jumpstack_alloc(struct xt_table_info *i)
+{
+	unsigned int size;
+	int cpu;
+
+	i->stackptr = alloc_percpu(unsigned int);
+	if (i->stackptr == NULL)
+		return -ENOMEM;
+
+	size = sizeof(void **) * nr_cpu_ids;
+	if (size > PAGE_SIZE)
+		i->jumpstack = vmalloc(size);
+	else
+		i->jumpstack = kmalloc(size, GFP_KERNEL);
+	if (i->jumpstack == NULL)
+		return -ENOMEM;
+	/* Zero the array so a partial failure leaves NULL slots to free. */
+	memset(i->jumpstack, 0, size);
+
+	i->stacksize *= xt_jumpstack_multiplier;
+	size = sizeof(void *) * i->stacksize;
+	for_each_possible_cpu(cpu) {
+		if (size > PAGE_SIZE)
+			i->jumpstack[cpu] = vmalloc_node(size,
+				cpu_to_node(cpu));
+		else
+			i->jumpstack[cpu] = kmalloc_node(size,
+				GFP_KERNEL, cpu_to_node(cpu));
+		if (i->jumpstack[cpu] == NULL)
+			/*
+			 * Freeing will be done later on by the callers. The
+			 * chain is: xt_replace_table -> __do_replace ->
+			 * do_replace -> xt_free_table_info.
+			 */
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Install @newinfo as the private data of @table.
+ *
+ * The jump stacks of @newinfo are allocated first.  With bottom halves
+ * disabled, @num_counters is compared with the entry count of the current
+ * private data; on a match the pointer is swapped and initial_entries is
+ * carried over.  When auditing is enabled the change is logged.
+ *
+ * Returns the previous private data on success.  On failure returns NULL
+ * with *@error set to -ENOMEM (jump stack allocation) or -EAGAIN (counter
+ * mismatch); @newinfo is not freed here in either case.
+ */
+struct xt_table_info *
+xt_replace_table(struct xt_table *table,
+	      unsigned int num_counters,
+	      struct xt_table_info *newinfo,
+	      int *error)
+{
+	struct xt_table_info *private;
+	int ret;
+
+	ret = xt_jumpstack_alloc(newinfo);
+	if (ret < 0) {
+		*error = ret;
+		return NULL;
+	}
+
+	/* Do the substitution. */
+	local_bh_disable();
+	private = table->private;
+
+	/* Check inside lock: is the old number correct? */
+	if (num_counters != private->number) {
+		pr_debug("num_counters != table->private->number (%u/%u)\n",
+			 num_counters, private->number);
+		local_bh_enable();
+		*error = -EAGAIN;
+		return NULL;
+	}
+
+	table->private = newinfo;
+	newinfo->initial_entries = private->initial_entries;
+
+	/*
+	 * Even though table entries have now been swapped, other CPU's
+	 * may still be using the old entries. This is okay, because
+	 * resynchronization happens because of the locking done
+	 * during the get_counters() routine.
+	 */
+	local_bh_enable();
+
+#ifdef CONFIG_AUDIT
+	if (audit_enabled) {
+		struct audit_buffer *ab;
+
+		ab = audit_log_start(current->audit_context, GFP_KERNEL,
+				     AUDIT_NETFILTER_CFG);
+		if (ab) {
+			/* entries= reports the count of the replaced table. */
+			audit_log_format(ab, "table=%s family=%u entries=%u",
+					 table->name, table->af,
+					 private->number);
+			audit_log_end(ab);
+		}
+	}
+#endif
+
+	return private;
+}
+EXPORT_SYMBOL_GPL(xt_replace_table);
+
+/*
+ * Register a copy of @input_table in namespace @net.
+ *
+ * @bootstrap is installed as temporary private data so that
+ * xt_replace_table() can be used to switch to @newinfo.  On success the
+ * new table is linked into the per-net list of its family and returned.
+ *
+ * Returns ERR_PTR(): -ENOMEM if the copy cannot be allocated, the lock
+ * error if the mutex wait was interrupted, -EEXIST if a table of that
+ * name is already registered, or the error set by xt_replace_table().
+ * Neither @bootstrap nor @newinfo is freed here.
+ */
+struct xt_table *xt_register_table(struct net *net,
+				   const struct xt_table *input_table,
+				   struct xt_table_info *bootstrap,
+				   struct xt_table_info *newinfo)
+{
+	int ret;
+	struct xt_table_info *private;
+	struct xt_table *t, *table;
+
+	/* Don't add one object to multiple lists. */
+	table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
+	if (!table) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = mutex_lock_interruptible(&xt[table->af].mutex);
+	if (ret != 0)
+		goto out_free;
+
+	/* Don't autoload: we'd eat our tail... */
+	list_for_each_entry(t, &net->xt.tables[table->af], list) {
+		if (strcmp(t->name, table->name) == 0) {
+			ret = -EEXIST;
+			goto unlock;
+		}
+	}
+
+	/* Simplifies replace_table code. */
+	table->private = bootstrap;
+
+	/* 0 counters: must equal bootstrap->number for the swap to happen. */
+	if (!xt_replace_table(table, 0, newinfo, &ret))
+		goto unlock;
+
+	private = table->private;
+	pr_debug("table->private->number = %u\n", private->number);
+
+	/* save number of initial entries */
+	private->initial_entries = private->number;
+
+	list_add(&table->list, &net->xt.tables[table->af]);
+	mutex_unlock(&xt[table->af].mutex);
+	return table;
+
+ unlock:
+	mutex_unlock(&xt[table->af].mutex);
+out_free:
+	kfree(table);
+out:
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(xt_register_table);
+
+/*
+ * Unlink @table from its list and free the table object.  Returns the
+ * table's private data, which is not freed here and so becomes the
+ * caller's to release.
+ */
+void *xt_unregister_table(struct xt_table *table)
+{
+	struct xt_table_info *private;
+
+	mutex_lock(&xt[table->af].mutex);
+	private = table->private;
+	list_del(&table->list);
+	mutex_unlock(&xt[table->af].mutex);
+	kfree(table);
+
+	return private;
+}
+EXPORT_SYMBOL_GPL(xt_unregister_table);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Private seq_file data of the table-name proc file: the generic per-net
+ * part followed by the family the file belongs to (filled in by
+ * xt_table_open()).
+ */
+struct xt_names_priv {
+	struct seq_net_private p;
+	u_int8_t af;
+};
+/*
+ * seq_file ->start: take the family mutex (released in
+ * xt_table_seq_stop()) and return the list position for *pos in the
+ * per-net table list.
+ */
+static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct xt_names_priv *priv = seq->private;
+	struct net *net = seq_file_net(seq);
+	u_int8_t af = priv->af;
+
+	mutex_lock(&xt[af].mutex);
+	return seq_list_start(&net->xt.tables[af], *pos);
+}
+
+/* seq_file ->next: advance to the following entry of the per-net table list. */
+static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct xt_names_priv *priv = seq->private;
+	struct net *net = seq_file_net(seq);
+	u_int8_t af = priv->af;
+
+	return seq_list_next(v, &net->xt.tables[af], pos);
+}
+
+/* seq_file ->stop: release the family mutex taken in xt_table_seq_start(). */
+static void xt_table_seq_stop(struct seq_file *seq, void *v)
+{
+	struct xt_names_priv *priv = seq->private;
+	u_int8_t af = priv->af;
+
+	mutex_unlock(&xt[af].mutex);
+}
+
+/*
+ * seq_file ->show: print the table's name on its own line.  Tables with
+ * an empty name produce no output.
+ */
+static int xt_table_seq_show(struct seq_file *seq, void *v)
+{
+	struct xt_table *table = list_entry(v, struct xt_table, list);
+
+	if (table->name[0] == '\0')
+		return 0;
+
+	return seq_printf(seq, "%s\n", table->name);
+}
+
+/* seq_file iterator callbacks for the table-name proc file. */
+static const struct seq_operations xt_table_seq_ops = {
+	.start	= xt_table_seq_start,
+	.next	= xt_table_seq_next,
+	.stop	= xt_table_seq_stop,
+	.show	= xt_table_seq_show,
+};
+
+/*
+ * ->open for the table-name proc file: set up a per-net seq_file and
+ * store the family, taken from the proc entry's data pointer, in its
+ * private area.  Returns 0 or the error from seq_open_net().
+ */
+static int xt_table_open(struct inode *inode, struct file *file)
+{
+	struct xt_names_priv *priv;
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open_net(inode, file, &xt_table_seq_ops,
+			   sizeof(struct xt_names_priv));
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	priv = seq->private;
+	priv->af = (unsigned long)PDE(inode)->data;
+	return 0;
+}
+
+/* File operations of the "<prefix>_tables_names" proc entry. */
+static const struct file_operations xt_table_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = xt_table_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_net,
+};
+
+/*
+ * Traversal state for the ip{,6}_tables_{matches,targets} proc files; it
+ * helps crossing the multi-AF mutexes.
+ *
+ * @head:    list head of the extension list currently being walked
+ * @curr:    current position in that list (equal to @head right after a
+ *           list has been entered, before the first real element)
+ * @class:   one of the MTTG_TRAV_* states; tells which mutex is held
+ * @nfproto: protocol family of the proc entry, set at open time
+ */
+struct nf_mttg_trav {
+	struct list_head *head, *curr;
+	uint8_t class, nfproto;
+};
+
+/*
+ * States of the match/target traversal (nf_mttg_trav.class):
+ *   INIT       - nothing locked yet, set by xt_mttg_seq_start()
+ *   NFP_UNSPEC - walking the NFPROTO_UNSPEC list, its mutex is held
+ *   NFP_SPEC   - walking the list of trav->nfproto, its mutex is held
+ *   DONE       - end marker used by the next_class[] transition table
+ */
+enum {
+	MTTG_TRAV_INIT,
+	MTTG_TRAV_NFP_UNSPEC,
+	MTTG_TRAV_NFP_SPEC,
+	MTTG_TRAV_DONE,
+};
+
+/*
+ * Advance the match/target traversal by one step.
+ *
+ * @seq:       seq_file whose ->private is the struct nf_mttg_trav
+ * @v:         unused
+ * @ppos:      position counter to increment on success, may be NULL
+ * @is_target: walk the target lists if true, the match lists otherwise
+ *
+ * The walk covers the NFPROTO_UNSPEC list first and then the list of
+ * trav->nfproto, holding only the mutex of the list currently walked.
+ * Entering a list leaves trav->curr on the list head itself; the ->show
+ * callbacks print nothing for that position.
+ *
+ * Returns the traversal state, or NULL once the family-specific list is
+ * exhausted (or the state is not one handled here).
+ */
+static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
+    bool is_target)
+{
+	static const uint8_t next_class[] = {
+		[MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
+		[MTTG_TRAV_NFP_SPEC]   = MTTG_TRAV_DONE,
+	};
+	struct nf_mttg_trav *trav = seq->private;
+
+	switch (trav->class) {
+	case MTTG_TRAV_INIT:
+		/* First step: lock and enter the protocol-independent list. */
+		trav->class = MTTG_TRAV_NFP_UNSPEC;
+		mutex_lock(&xt[NFPROTO_UNSPEC].mutex);
+		trav->head = trav->curr = is_target ?
+			&xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match;
+ 		break;
+	case MTTG_TRAV_NFP_UNSPEC:
+		trav->curr = trav->curr->next;
+		if (trav->curr != trav->head)
+			break;
+		/* Wrapped around: hand over from the UNSPEC mutex to the
+		 * mutex of the family-specific list and enter that list. */
+		mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+		mutex_lock(&xt[trav->nfproto].mutex);
+		trav->head = trav->curr = is_target ?
+			&xt[trav->nfproto].target : &xt[trav->nfproto].match;
+		trav->class = next_class[trav->class];
+		break;
+	case MTTG_TRAV_NFP_SPEC:
+		trav->curr = trav->curr->next;
+		if (trav->curr != trav->head)
+			break;
+		/* fallthru, _stop will unlock */
+	default:
+		return NULL;
+	}
+
+	if (ppos != NULL)
+		++*ppos;
+	return trav;
+}
+
+/*
+ * Common ->start for the match and target listings: restart the traversal
+ * from scratch and replay *pos steps to get back to the requested position.
+ * Returns NULL if the lists end before that position is reached.
+ */
+static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
+    bool is_target)
+{
+	struct nf_mttg_trav *state = seq->private;
+	unsigned int step = 0;
+
+	state->class = MTTG_TRAV_INIT;
+	while (step < *pos) {
+		if (!xt_mttg_seq_next(seq, NULL, NULL, is_target))
+			return NULL;
+		++step;
+	}
+	return state;
+}
+
+/*
+ * Common ->stop for the match and target listings: release whichever
+ * mutex the traversal state says is currently held, if any.
+ */
+static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
+{
+	const struct nf_mttg_trav *state = seq->private;
+
+	if (state->class == MTTG_TRAV_NFP_UNSPEC)
+		mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
+	else if (state->class == MTTG_TRAV_NFP_SPEC)
+		mutex_unlock(&xt[state->nfproto].mutex);
+}
+
+/* seq_file ->start for the match listing (is_target = false). */
+static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return xt_mttg_seq_start(seq, pos, false);
+}
+
+/* seq_file ->next for the match listing (is_target = false). */
+static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+	return xt_mttg_seq_next(seq, v, ppos, false);
+}
+
+/*
+ * seq_file ->show for the match listing: print the name of the match the
+ * traversal currently points at.  Nothing is printed while no list is being
+ * walked, while positioned on a list head, or for a match without a name.
+ */
+static int xt_match_seq_show(struct seq_file *seq, void *v)
+{
+	const struct nf_mttg_trav *state = seq->private;
+	const struct xt_match *m;
+
+	if (state->class != MTTG_TRAV_NFP_UNSPEC &&
+	    state->class != MTTG_TRAV_NFP_SPEC)
+		return 0;
+	if (state->curr == state->head)
+		return 0;
+
+	m = list_entry(state->curr, struct xt_match, list);
+	if (*m->name == '\0')
+		return 0;
+	return seq_printf(seq, "%s\n", m->name);
+}
+
+/* Iterator callbacks for the match listing opened by xt_match_open(). */
+static const struct seq_operations xt_match_seq_ops = {
+	.start	= xt_match_seq_start,
+	.next	= xt_match_seq_next,
+	.stop	= xt_mttg_seq_stop,
+	.show	= xt_match_seq_show,
+};
+
+/*
+ * ->open for the match listing: allocate the traversal state, attach it to
+ * a fresh seq_file and record the protocol family of the proc entry.
+ * The state is freed again by seq_release_private().
+ */
+static int xt_match_open(struct inode *inode, struct file *file)
+{
+	struct nf_mttg_trav *state;
+	int err;
+
+	state = kmalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	err = seq_open(file, &xt_match_seq_ops);
+	if (err < 0)
+		goto free_state;
+
+	((struct seq_file *)file->private_data)->private = state;
+	state->nfproto = (unsigned long)PDE(inode)->data;
+	return 0;
+
+free_state:
+	kfree(state);
+	return err;
+}
+
+/* File operations of the "<prefix>_tables_matches" proc entry. */
+static const struct file_operations xt_match_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = xt_match_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+/* seq_file ->start for the target listing (is_target = true). */
+static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return xt_mttg_seq_start(seq, pos, true);
+}
+
+/* seq_file ->next for the target listing (is_target = true). */
+static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+	return xt_mttg_seq_next(seq, v, ppos, true);
+}
+
+/*
+ * seq_file ->show for the target listing: print the name of the target the
+ * traversal currently points at.  Nothing is printed while no list is being
+ * walked, while positioned on a list head, or for a target without a name.
+ */
+static int xt_target_seq_show(struct seq_file *seq, void *v)
+{
+	const struct nf_mttg_trav *state = seq->private;
+	const struct xt_target *t;
+
+	if (state->class != MTTG_TRAV_NFP_UNSPEC &&
+	    state->class != MTTG_TRAV_NFP_SPEC)
+		return 0;
+	if (state->curr == state->head)
+		return 0;
+
+	t = list_entry(state->curr, struct xt_target, list);
+	if (*t->name == '\0')
+		return 0;
+	return seq_printf(seq, "%s\n", t->name);
+}
+
+/* Iterator callbacks for the target listing opened by xt_target_open(). */
+static const struct seq_operations xt_target_seq_ops = {
+	.start	= xt_target_seq_start,
+	.next	= xt_target_seq_next,
+	.stop	= xt_mttg_seq_stop,
+	.show	= xt_target_seq_show,
+};
+
+/*
+ * ->open for the target listing: allocate the traversal state, attach it to
+ * a fresh seq_file and record the protocol family of the proc entry.
+ * The state is freed again by seq_release_private().
+ */
+static int xt_target_open(struct inode *inode, struct file *file)
+{
+	struct nf_mttg_trav *state;
+	int err;
+
+	state = kmalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	err = seq_open(file, &xt_target_seq_ops);
+	if (err < 0)
+		goto free_state;
+
+	((struct seq_file *)file->private_data)->private = state;
+	state->nfproto = (unsigned long)PDE(inode)->data;
+	return 0;
+
+free_state:
+	kfree(state);
+	return err;
+}
+
+/* File operations of the "<prefix>_tables_targets" proc entry. */
+static const struct file_operations xt_target_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = xt_target_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+/* Suffixes appended to xt_prefix[af] to build the proc entry names. */
+#define FORMAT_TABLES	"_tables_names"
+#define	FORMAT_MATCHES	"_tables_matches"
+#define FORMAT_TARGETS 	"_tables_targets"
+
+#endif /* CONFIG_PROC_FS */
+
+/**
+ * xt_hook_link - set up hooks for a new table
+ * @table:	table with metadata needed to set up hooks
+ * @fn:		Hook function
+ *
+ * This function will take care of creating and registering the necessary
+ * Netfilter hooks for XT tables: one nf_hook_ops entry is built for every
+ * bit set in @table->valid_hooks, all of them using @fn together with the
+ * owner, family and priority of @table.
+ *
+ * Returns the registered ops array, which must later be handed to
+ * xt_hook_unlink(), or an ERR_PTR() if allocation or registration failed.
+ */
+struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
+{
+	unsigned int hook_mask = table->valid_hooks;
+	uint8_t i, num_hooks = hweight32(hook_mask);
+	uint8_t hooknum;
+	struct nf_hook_ops *ops;
+	int ret;
+
+	/*
+	 * kcalloc() rather than an open-coded multiply: the array comes back
+	 * zeroed, so members that are not assigned below never carry heap
+	 * garbage into the hook core.
+	 */
+	ops = kcalloc(num_hooks, sizeof(*ops), GFP_KERNEL);
+	if (ops == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Walk the mask bit by bit; hooknum is the index of the bit tested. */
+	for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0;
+	     hook_mask >>= 1, ++hooknum) {
+		if (!(hook_mask & 1))
+			continue;
+		ops[i].hook     = fn;
+		ops[i].owner    = table->me;
+		ops[i].pf       = table->af;
+		ops[i].hooknum  = hooknum;
+		ops[i].priority = table->priority;
+		++i;
+	}
+
+	ret = nf_register_hooks(ops, num_hooks);
+	if (ret < 0) {
+		kfree(ops);
+		return ERR_PTR(ret);
+	}
+
+	return ops;
+}
+EXPORT_SYMBOL_GPL(xt_hook_link);
+
+/**
+ * xt_hook_unlink - remove hooks for a table
+ * @table:	the very same table that was passed to xt_hook_link
+ * @ops:	nf_hook_ops array as returned by xt_hook_link
+ *
+ * Unregisters one hook per bit set in @table->valid_hooks and frees @ops.
+ */
+void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops)
+{
+	nf_unregister_hooks(ops, hweight32(table->valid_hooks));
+	kfree(ops);
+}
+EXPORT_SYMBOL_GPL(xt_hook_unlink);
+
+/*
+ * xt_proto_init - create the proc entries of one protocol family
+ * @net: network namespace whose proc_net directory receives the entries
+ * @af:  protocol family, used as index into xt_prefix[]
+ *
+ * With CONFIG_PROC_FS this creates "<prefix>_tables_names",
+ * "<prefix>_tables_matches" and "<prefix>_tables_targets"; entries already
+ * created are removed again if a later one fails.
+ *
+ * Returns 0 on success, -EINVAL for an unknown family and -ENOMEM if a
+ * proc entry could not be created.
+ */
+int xt_proto_init(struct net *net, u_int8_t af)
+{
+#ifdef CONFIG_PROC_FS
+	char buf[XT_FUNCTION_MAXNAMELEN];
+	struct proc_dir_entry *proc;
+#endif
+
+	if (af >= ARRAY_SIZE(xt_prefix))
+		return -EINVAL;
+
+
+#ifdef CONFIG_PROC_FS
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TABLES, sizeof(buf));
+	proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops,
+				(void *)(unsigned long)af);
+	if (!proc)
+		goto out;
+
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+	proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops,
+				(void *)(unsigned long)af);
+	if (!proc)
+		goto out_remove_tables;
+
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
+	proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops,
+				(void *)(unsigned long)af);
+	if (!proc)
+		goto out_remove_matches;
+#endif
+
+	return 0;
+
+#ifdef CONFIG_PROC_FS
+out_remove_matches:
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+	proc_net_remove(net, buf);
+
+out_remove_tables:
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TABLES, sizeof(buf));
+	proc_net_remove(net, buf);
+out:
+	/* A bare -1 would be read as -EPERM by callers that propagate it. */
+	return -ENOMEM;
+#endif
+}
+EXPORT_SYMBOL_GPL(xt_proto_init);
+
+/*
+ * xt_proto_fini - remove the proc entries created by xt_proto_init()
+ * @net: network namespace the entries were created in
+ * @af:  protocol family, used as index into xt_prefix[]
+ *
+ * Does nothing without CONFIG_PROC_FS or for a family that xt_proto_init()
+ * would have rejected.
+ */
+void xt_proto_fini(struct net *net, u_int8_t af)
+{
+#ifdef CONFIG_PROC_FS
+	char buf[XT_FUNCTION_MAXNAMELEN];
+
+	/* Same bound as in xt_proto_init(): never index past xt_prefix[]. */
+	if (af >= ARRAY_SIZE(xt_prefix))
+		return;
+
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TABLES, sizeof(buf));
+	proc_net_remove(net, buf);
+
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
+	proc_net_remove(net, buf);
+
+	strlcpy(buf, xt_prefix[af], sizeof(buf));
+	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
+	proc_net_remove(net, buf);
+#endif /*CONFIG_PROC_FS*/
+}
+EXPORT_SYMBOL_GPL(xt_proto_fini);
+
+/* Per-namespace init: start every protocol family with an empty table list. */
+static int __net_init xt_net_init(struct net *net)
+{
+	int family = NFPROTO_NUMPROTO;
+
+	while (family-- > 0)
+		INIT_LIST_HEAD(&net->xt.tables[family]);
+
+	return 0;
+}
+
+/* Per-namespace hooks; only an init callback is provided. */
+static struct pernet_operations xt_net_ops = {
+	.init = xt_net_init,
+};
+
+/*
+ * Module init: set up the per-CPU xt_recseq counters, allocate and
+ * initialise the per-family state array and register the per-namespace
+ * operations.  The array is freed again if that registration fails.
+ */
+static int __init xt_init(void)
+{
+	unsigned int i;
+	int rv;
+
+	for_each_possible_cpu(i)
+		seqcount_init(&per_cpu(xt_recseq, i));
+
+	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
+	if (xt == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+		struct xt_af *family = &xt[i];
+
+		mutex_init(&family->mutex);
+#ifdef CONFIG_COMPAT
+		mutex_init(&family->compat_mutex);
+		family->compat_tab = NULL;
+#endif
+		INIT_LIST_HEAD(&family->target);
+		INIT_LIST_HEAD(&family->match);
+	}
+
+	rv = register_pernet_subsys(&xt_net_ops);
+	if (rv >= 0)
+		return rv;
+
+	kfree(xt);
+	return rv;
+}
+
+/* Module exit: undo xt_init() in reverse order. */
+static void __exit xt_fini(void)
+{
+	unregister_pernet_subsys(&xt_net_ops);
+	kfree(xt);
+}
+
+module_init(xt_init);
+module_exit(xt_fini);
+
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
new file mode 100644
index 00000000..363a99ec
--- /dev/null
+++ b/net/netfilter/xt_AUDIT.c
@@ -0,0 +1,222 @@
+/*
+ * Creates audit record for dropped/accepted packets
+ *
+ * (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
+ * (C) 2010-2011 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/audit.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_AUDIT.h>
+#include <linux/netfilter_bridge/ebtables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>");
+MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets");
+MODULE_ALIAS("ipt_AUDIT");
+MODULE_ALIAS("ip6t_AUDIT");
+MODULE_ALIAS("ebt_AUDIT");
+MODULE_ALIAS("arpt_AUDIT");
+
+/*
+ * Append transport-layer details to the audit record: source/destination
+ * port for TCP, UDP and UDP-Lite, type/code for ICMP and ICMPv6.  If the
+ * bytes at @offset cannot be read, " truncated=1" is logged instead.
+ * Other protocols add nothing.
+ */
+static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb,
+			unsigned int proto, unsigned int offset)
+{
+	__be16 port_buf[2];
+	const __be16 *ports;
+	u8 icmp_buf[2];
+	const u8 *icmp;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		ports = skb_header_pointer(skb, offset, sizeof(port_buf),
+					   port_buf);
+		if (ports != NULL)
+			audit_log_format(ab, " sport=%hu dport=%hu",
+					 ntohs(ports[0]), ntohs(ports[1]));
+		else
+			audit_log_format(ab, " truncated=1");
+		break;
+
+	case IPPROTO_ICMP:
+	case IPPROTO_ICMPV6:
+		icmp = skb_header_pointer(skb, offset, sizeof(icmp_buf),
+					  icmp_buf);
+		if (icmp != NULL)
+			audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu",
+					 icmp[0], icmp[1]);
+		else
+			audit_log_format(ab, " truncated=1");
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Append IPv4 header details (addresses, id, protocol) to the audit record
+ * and, for packets that are not a non-first fragment, the transport-layer
+ * details as well.  Logs " truncated=1" if the header cannot be read.
+ *
+ * Offsets are taken relative to the network header, as audit_ip6() does,
+ * rather than assuming that skb->data already points at the IP header.
+ */
+static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct iphdr _iph;
+	const struct iphdr *ih;
+	int nhoff = skb_network_offset(skb);
+
+	ih = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+	if (!ih) {
+		audit_log_format(ab, " truncated=1");
+		return;
+	}
+
+	audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu",
+		&ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol);
+
+	/* Only the first fragment carries the transport header. */
+	if (ntohs(ih->frag_off) & IP_OFFSET) {
+		audit_log_format(ab, " frag=1");
+		return;
+	}
+
+	audit_proto(ab, skb, ih->protocol, nhoff + ih->ihl * 4);
+}
+
+/*
+ * Append IPv6 header details (addresses, upper-layer protocol) to the audit
+ * record and, when the extension header chain could be walked, the
+ * transport-layer details as well.  Logs " truncated=1" if the fixed header
+ * cannot be read.
+ */
+static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct ipv6hdr _ip6h;
+	const struct ipv6hdr *ih;
+	u8 nexthdr;
+	int offset;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+	if (!ih) {
+		audit_log_format(ab, " truncated=1");
+		return;
+	}
+
+	nexthdr = ih->nexthdr;
+	offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
+				  &nexthdr);
+
+	audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+			 &ih->saddr, &ih->daddr, nexthdr);
+
+	/*
+	 * ipv6_skip_exthdr() reports failure with a negative value; testing
+	 * for non-zero would pass that on as a (huge, once converted to
+	 * unsigned) offset to audit_proto().
+	 */
+	if (offset > 0)
+		audit_proto(ab, skb, nexthdr, offset);
+}
+
+/*
+ * AUDIT target: emit one AUDIT_NETFILTER_PKT record describing the packet
+ * (action type, hook, length, interfaces, optional mark, link-layer and
+ * network-layer details).  Always returns XT_CONTINUE; if no audit buffer
+ * can be obtained the packet is simply not logged.
+ */
+static unsigned int
+audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_audit_info *info = par->targinfo;
+	const char *in_name = par->in ? par->in->name : "?";
+	const char *out_name = par->out ? par->out->name : "?";
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+	if (ab == NULL)
+		return XT_CONTINUE;
+
+	audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
+			 info->type, par->hooknum, skb->len,
+			 in_name, out_name);
+
+	if (skb->mark)
+		audit_log_format(ab, " mark=%#x", skb->mark);
+
+	if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+		audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x",
+				 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+				 ntohs(eth_hdr(skb)->h_proto));
+
+		/* Bridged frames: pick the L3 decoder from the ethertype. */
+		if (par->family == NFPROTO_BRIDGE) {
+			if (eth_hdr(skb)->h_proto == __constant_htons(ETH_P_IP))
+				audit_ip4(ab, skb);
+			else if (eth_hdr(skb)->h_proto ==
+				 __constant_htons(ETH_P_IPV6))
+				audit_ip6(ab, skb);
+		}
+	}
+
+	if (par->family == NFPROTO_IPV4)
+		audit_ip4(ab, skb);
+	else if (par->family == NFPROTO_IPV6)
+		audit_ip6(ab, skb);
+
+	audit_log_end(ab);
+	return XT_CONTINUE;
+}
+
+/* Bridge flavour of the target: same logging, but returns EBT_CONTINUE. */
+static unsigned int
+audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	audit_tg(skb, par);
+	return EBT_CONTINUE;
+}
+
+/*
+ * checkentry: accept the rule only if the requested audit type does not
+ * exceed XT_AUDIT_TYPE_MAX; otherwise log the valid range and fail with
+ * -ERANGE.
+ */
+static int audit_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_audit_info *cfg = par->targinfo;
+
+	if (cfg->type <= XT_AUDIT_TYPE_MAX)
+		return 0;
+
+	pr_info("Audit type out of range (valid range: 0..%hhu)\n",
+		XT_AUDIT_TYPE_MAX);
+	return -ERANGE;
+}
+
+/*
+ * Two registrations of the AUDIT target: a protocol-independent one and a
+ * bridge one that differs only in the target function it calls.
+ */
+static struct xt_target audit_tg_reg[] __read_mostly = {
+	{
+		.name		= "AUDIT",
+		.family		= NFPROTO_UNSPEC,
+		.target		= audit_tg,
+		.targetsize	= sizeof(struct xt_audit_info),
+		.checkentry	= audit_tg_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "AUDIT",
+		.family		= NFPROTO_BRIDGE,
+		.target		= audit_tg_ebt,
+		.targetsize	= sizeof(struct xt_audit_info),
+		.checkentry	= audit_tg_check,
+		.me		= THIS_MODULE,
+	},
+};
+
+/* Module init: register both AUDIT target variants. */
+static int __init audit_tg_init(void)
+{
+	return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg));
+}
+
+/* Module exit: unregister both AUDIT target variants. */
+static void __exit audit_tg_exit(void)
+{
+	xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg));
+}
+
+module_init(audit_tg_init);
+module_exit(audit_tg_exit);
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 00000000..0f642ef8
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,70 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+/*
+ * CHECKSUM target: if the checksum of the packet is still pending
+ * (CHECKSUM_PARTIAL), have skb_checksum_help() fill it in now.  The return
+ * value of that helper is not examined.  Always returns XT_CONTINUE.
+ */
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb_checksum_help(skb);
+
+	return XT_CONTINUE;
+}
+
+/*
+ * checkentry: the rule must request at least one operation and nothing
+ * beyond XT_CHECKSUM_OP_FILL; anything else is rejected with -EINVAL.
+ */
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_CHECKSUM_info *cfg = par->targinfo;
+
+	if (!cfg->operation) {
+		pr_info("no CHECKSUM operation enabled\n");
+		return -EINVAL;
+	}
+
+	if (cfg->operation & ~XT_CHECKSUM_OP_FILL) {
+		pr_info("unsupported CHECKSUM operation %x\n", cfg->operation);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Registration of the CHECKSUM target; restricted to the "mangle" table. */
+static struct xt_target checksum_tg_reg __read_mostly = {
+	.name		= "CHECKSUM",
+	.family		= NFPROTO_UNSPEC,
+	.target		= checksum_tg,
+	.targetsize	= sizeof(struct xt_CHECKSUM_info),
+	.table		= "mangle",
+	.checkentry	= checksum_tg_check,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the CHECKSUM target. */
+static int __init checksum_tg_init(void)
+{
+	return xt_register_target(&checksum_tg_reg);
+}
+
+/* Module exit: unregister the CHECKSUM target. */
+static void __exit checksum_tg_exit(void)
+{
+	xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
new file mode 100644
index 00000000..af9c4dad
--- /dev/null
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -0,0 +1,73 @@
+/*
+ * This is a module which is used for setting the skb->priority field
+ * of an skb for qdisc classification.
+ */
+
+/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CLASSIFY.h>
+#include <linux/netfilter_arp.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Qdisc classification");
+MODULE_ALIAS("ipt_CLASSIFY");
+MODULE_ALIAS("ip6t_CLASSIFY");
+MODULE_ALIAS("arpt_CLASSIFY");
+
+/*
+ * CLASSIFY target: copy the priority configured in the rule into
+ * skb->priority.  Always returns XT_CONTINUE.
+ */
+static unsigned int
+classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_classify_target_info *clinfo = par->targinfo;
+
+	skb->priority = clinfo->priority;
+	return XT_CONTINUE;
+}
+
+/*
+ * Two registrations of the CLASSIFY target sharing one target function:
+ * a protocol-independent one limited to the LOCAL_OUT, FORWARD and
+ * POST_ROUTING hooks, and an ARP one limited to the ARP OUT and FORWARD
+ * hooks.
+ */
+static struct xt_target classify_tg_reg[] __read_mostly = {
+	{
+		.name       = "CLASSIFY",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+		              (1 << NF_INET_POST_ROUTING),
+		.target     = classify_tg,
+		.targetsize = sizeof(struct xt_classify_target_info),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "CLASSIFY",
+		.revision   = 0,
+		.family     = NFPROTO_ARP,
+		.hooks      = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
+		.target     = classify_tg,
+		.targetsize = sizeof(struct xt_classify_target_info),
+		.me         = THIS_MODULE,
+	},
+};
+
+/* Module init: register both CLASSIFY target variants. */
+static int __init classify_tg_init(void)
+{
+	return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
+}
+
+/* Module exit: unregister both CLASSIFY target variants. */
+static void __exit classify_tg_exit(void)
+{
+	xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
+}
+
+module_init(classify_tg_init);
+module_exit(classify_tg_exit);
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
new file mode 100644
index 00000000..e04dc282
--- /dev/null
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -0,0 +1,143 @@
+/*
+ * This module is used to copy security markings from packets
+ * to connections, and restore security markings from connections
+ * back to packets.  This would normally be performed in conjunction
+ * with the SECMARK target and state match.
+ *
+ * Based somewhat on CONNMARK:
+ *   Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ *    by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CONNSECMARK.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark");
+MODULE_ALIAS("ipt_CONNSECMARK");
+MODULE_ALIAS("ip6t_CONNSECMARK");
+
+/*
+ * Packet -> connection: when the packet carries a security mark and its
+ * connection has none yet, copy the mark over and queue an IPCT_SECMARK
+ * conntrack event.
+ */
+static void secmark_save(const struct sk_buff *skb)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *conn;
+
+	if (!skb->secmark)
+		return;
+
+	conn = nf_ct_get(skb, &ctinfo);
+	if (!conn || conn->secmark)
+		return;
+
+	conn->secmark = skb->secmark;
+	nf_conntrack_event_cache(IPCT_SECMARK, conn);
+}
+
+/*
+ * Connection -> packet: when the packet has no security mark but its
+ * connection does, copy the connection's mark onto the packet.
+ */
+static void secmark_restore(struct sk_buff *skb)
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *conn;
+
+	if (skb->secmark)
+		return;
+
+	conn = nf_ct_get(skb, &ctinfo);
+	if (!conn || !conn->secmark)
+		return;
+
+	skb->secmark = conn->secmark;
+}
+
+/*
+ * CONNSECMARK target: save or restore the security mark according to the
+ * mode configured in the rule.  Always returns XT_CONTINUE.
+ */
+static unsigned int
+connsecmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_connsecmark_target_info *info = par->targinfo;
+
+	switch (info->mode) {
+	case CONNSECMARK_SAVE:
+		secmark_save(skb);
+		break;
+
+	case CONNSECMARK_RESTORE:
+		secmark_restore(skb);
+		break;
+
+	default:
+		/* connsecmark_tg_check() rejects every other mode. */
+		BUG();
+	}
+
+	return XT_CONTINUE;
+}
+
+/*
+ * checkentry: the target may only be used in the "mangle" or "security"
+ * table and only with the SAVE or RESTORE mode (-EINVAL otherwise).  On
+ * success a reference on the conntrack l3proto module for the rule's
+ * family is held; its error code is returned if that fails.
+ */
+static int connsecmark_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_connsecmark_target_info *cfg = par->targinfo;
+	bool table_ok;
+	int err;
+
+	table_ok = strcmp(par->table, "mangle") == 0 ||
+		   strcmp(par->table, "security") == 0;
+	if (!table_ok) {
+		pr_info("target only valid in the \'mangle\' "
+			"or \'security\' tables, not \'%s\'.\n", par->table);
+		return -EINVAL;
+	}
+
+	if (cfg->mode != CONNSECMARK_SAVE &&
+	    cfg->mode != CONNSECMARK_RESTORE) {
+		pr_info("invalid mode: %hu\n", cfg->mode);
+		return -EINVAL;
+	}
+
+	err = nf_ct_l3proto_try_module_get(par->family);
+	if (err < 0)
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return err;
+}
+
+/* destroy: drop the l3proto module reference taken in connsecmark_tg_check(). */
+static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+/* Registration of the protocol-independent CONNSECMARK target. */
+static struct xt_target connsecmark_tg_reg __read_mostly = {
+	.name       = "CONNSECMARK",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = connsecmark_tg_check,
+	.destroy    = connsecmark_tg_destroy,
+	.target     = connsecmark_tg,
+	.targetsize = sizeof(struct xt_connsecmark_target_info),
+	.me         = THIS_MODULE,
+};
+
+/* Module init: register the CONNSECMARK target. */
+static int __init connsecmark_tg_init(void)
+{
+	return xt_register_target(&connsecmark_tg_reg);
+}
+
+/* Module exit: unregister the CONNSECMARK target. */
+static void __exit connsecmark_tg_exit(void)
+{
+	xt_unregister_target(&connsecmark_tg_reg);
+}
+
+module_init(connsecmark_tg_init);
+module_exit(connsecmark_tg_exit);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
new file mode 100644
index 00000000..782e5198
--- /dev/null
+++ b/net/netfilter/xt_CT.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2010 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CT.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/*
+ * CT target: attach the conntrack entry prepared at rule-check time to a
+ * packet that has none yet, taking a reference on it and marking the packet
+ * as IP_CT_NEW.  Packets that already carry a conntrack are left alone.
+ * Always returns XT_CONTINUE.
+ */
+static unsigned int xt_ct_target(struct sk_buff *skb,
+				 const struct xt_action_param *par)
+{
+	const struct xt_ct_target_info *info = par->targinfo;
+	struct nf_conn *tmpl;
+
+	/* Previously seen (loopback)? Ignore. */
+	if (skb->nfct == NULL) {
+		tmpl = info->ct;
+		atomic_inc(&tmpl->ct_general.use);
+		skb->nfct = &tmpl->ct_general;
+		skb->nfctinfo = IP_CT_NEW;
+	}
+
+	return XT_CONTINUE;
+}
+
+/*
+ * Return the layer-4 protocol the rule matches on, taken from the IPv4 or
+ * IPv6 rule entry.  Returns 0 when the protocol match is inverted or the
+ * rule belongs to any other family.
+ */
+static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)
+{
+	switch (par->family) {
+	case NFPROTO_IPV4: {
+		const struct ipt_entry *rule = par->entryinfo;
+
+		return (rule->ip.invflags & IPT_INV_PROTO) ? 0 : rule->ip.proto;
+	}
+	case NFPROTO_IPV6: {
+		const struct ip6t_entry *rule = par->entryinfo;
+
+		return (rule->ipv6.invflags & IP6T_INV_PROTO) ?
+		       0 : rule->ipv6.proto;
+	}
+	default:
+		return 0;
+	}
+}
+
+/*
+ * checkentry for the CT target: prepare the conntrack entry that
+ * xt_ct_target() will attach to matching packets and store it in info->ct.
+ *
+ * With XT_CT_NOTRACK the shared untracked conntrack is used.  Otherwise a
+ * template conntrack is allocated in the requested zone, optionally given
+ * an event-cache extension and a helper, and marked as a confirmed
+ * template.
+ *
+ * Returns 0 on success with info->ct set, or a negative errno:
+ *   -EINVAL      unknown flag bits, or the event cache extension could not
+ *                be added
+ *   -EOPNOTSUPP  a zone was requested but zones are not compiled in
+ *   -ENOENT      helper requested but no protocol in the rule / no such helper
+ *   -ENOMEM      helper extension allocation failed
+ *   or the error from the l3proto module get / conntrack allocation.
+ */
+static int xt_ct_tg_check(const struct xt_tgchk_param *par)
+{
+	struct xt_ct_target_info *info = par->targinfo;
+	struct nf_conntrack_tuple t;
+	struct nf_conn_help *help;
+	struct nf_conn *ct;
+	int ret;
+	u8 proto;
+
+	if (info->flags & ~XT_CT_NOTRACK)
+		return -EINVAL;
+
+	if (info->flags & XT_CT_NOTRACK) {
+		ct = nf_ct_untracked_get();
+		atomic_inc(&ct->ct_general.use);
+		goto out;
+	}
+
+#ifndef CONFIG_NF_CONNTRACK_ZONES
+	if (info->zone) {
+		/* Must fail: returning 0 here would leave info->ct unset. */
+		ret = -EOPNOTSUPP;
+		goto err1;
+	}
+#endif
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)
+		goto err1;
+
+	memset(&t, 0, sizeof(t));
+	ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
+	ret = PTR_ERR(ct);
+	if (IS_ERR(ct))
+		goto err2;
+
+	if ((info->ct_events || info->exp_events) &&
+	    !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
+				  GFP_KERNEL)) {
+		/* The template is freed below, so this must not report 0. */
+		ret = -EINVAL;
+		goto err3;
+	}
+
+	if (info->helper[0]) {
+		ret = -ENOENT;
+		proto = xt_ct_find_proto(par);
+		if (!proto)
+			goto err3;
+
+		ret = -ENOMEM;
+		help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+		if (help == NULL)
+			goto err3;
+
+		ret = -ENOENT;
+		help->helper = nf_conntrack_helper_try_module_get(info->helper,
+								  par->family,
+								  proto);
+		if (help->helper == NULL)
+			goto err3;
+	}
+
+	__set_bit(IPS_TEMPLATE_BIT, &ct->status);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+out:
+	info->ct = ct;
+	return 0;
+
+err3:
+	nf_conntrack_free(ct);
+err2:
+	nf_ct_l3proto_module_put(par->family);
+err1:
+	return ret;
+}
+
+/*
+ * destroy for the CT target: release what xt_ct_tg_check() acquired.
+ * For a template conntrack this drops the helper module reference (if a
+ * helper extension is present) and the l3proto module reference; in every
+ * case the reference on the conntrack itself is dropped.
+ */
+static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	struct xt_ct_target_info *info = par->targinfo;
+	struct nf_conn *ct = info->ct;
+	struct nf_conn_help *help;
+
+	if (!nf_ct_is_untracked(ct)) {
+		help = nfct_help(ct);
+		if (help)
+			module_put(help->helper->me);
+
+		nf_ct_l3proto_module_put(par->family);
+	}
+	nf_ct_put(info->ct);
+}
+
+/* Registration of the CT target; restricted to the "raw" table. */
+static struct xt_target xt_ct_tg __read_mostly = {
+	.name		= "CT",
+	.family		= NFPROTO_UNSPEC,
+	.targetsize	= sizeof(struct xt_ct_target_info),
+	.checkentry	= xt_ct_tg_check,
+	.destroy	= xt_ct_tg_destroy,
+	.target		= xt_ct_target,
+	.table		= "raw",
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the CT target. */
+static int __init xt_ct_tg_init(void)
+{
+	return xt_register_target(&xt_ct_tg);
+}
+
+/* Module exit: unregister the CT target. */
+static void __exit xt_ct_tg_exit(void)
+{
+	xt_unregister_target(&xt_ct_tg);
+}
+
+module_init(xt_ct_tg_init);
+module_exit(xt_ct_tg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: connection tracking target");
+MODULE_ALIAS("ipt_CT");
+MODULE_ALIAS("ip6t_CT");
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
new file mode 100644
index 00000000..a192f55e
--- /dev/null
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -0,0 +1,408 @@
+/*
+ * linux/net/netfilter/xt_IDLETIMER.c
+ *
+ * Netfilter module to trigger a timer when packet matches.
+ * After timer expires a kevent will be sent.
+ *
+ * Copyright (C) 2004, 2010 Nokia Corporation
+ *
+ * Written by Timo Teras <ext-timo.teras@nokia.com>
+ *
+ * Converted to x_tables and reworked for upstream inclusion
+ * by Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * Contact: Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_IDLETIMER.h>
+#include <linux/netlink.h>
+#include <linux/kdev_t.h>
+#include <linux/kobject.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+#include <net/net_namespace.h>
+
+static struct sock *nl_sk;
+
+/*
+ * A sysfs attribute together with its read callback.
+ *
+ * @attr: the attribute itself (name and mode are filled in when a timer
+ *        is created)
+ * @show: callback that formats the attribute value into @buf; set to
+ *        idletimer_tg_show() for every timer created in this file
+ */
+struct idletimer_tg_attr {
+	struct attribute attr;
+	ssize_t	(*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+};
+
+/*
+ * State of one named idle timer; shared by all rules that use the same label.
+ *
+ * @entry:       link in idletimer_tg_list (manipulated under list_mutex)
+ * @timer:       kernel timer that fires idletimer_tg_expired()
+ * @work:        deferred work (idletimer_tg_work()) that sends notifications
+ * @kobj:        not referenced by the code visible in this file
+ * @attr:        sysfs attribute; attr.attr.name doubles as the timer label
+ * @refcnt:      number of rules referencing this timer (changed under
+ *               list_mutex in checkentry/destroy)
+ * @send_nl_msg: when true, the work handler also broadcasts a netlink message
+ * @active:      set to true whenever the timer is (re)armed, false on expiry
+ */
+struct idletimer_tg {
+	struct list_head entry;
+	struct timer_list timer;
+	struct work_struct work;
+
+	struct kobject *kobj;
+	struct idletimer_tg_attr attr;
+
+	unsigned int refcnt;
+	bool send_nl_msg;
+	bool active;
+};
+
+static LIST_HEAD(idletimer_tg_list);
+static DEFINE_MUTEX(list_mutex);
+
+static struct kobject *idletimer_tg_kobj;
+
+/*
+ * Broadcast the state of @timer as a text message ("<iface> ACTIVE\n" or
+ * "<iface> INACTIVE\n") to netlink multicast group 1 on nl_sk.
+ *
+ * @iface: label placed at the start of the message
+ * @timer: timer whose ->active flag selects the message text and the
+ *         netlink message type (NL_EVENT_TYPE_ACTIVE / _INACTIVE)
+ *
+ * The netlink payload always has NLMSG_MAX_SIZE bytes.  Nothing is sent if
+ * the skb cannot be allocated or the text does not fit into the payload.
+ */
+static void notify_netlink(const char *iface, struct idletimer_tg *timer)
+{
+	struct sk_buff *log_skb;
+	size_t size;
+	struct nlmsghdr *nlh;
+	char str[NLMSG_MAX_SIZE];
+	int event_type, res;
+
+	size = NLMSG_SPACE(NLMSG_MAX_SIZE);
+	size = max(size, (size_t)NLMSG_GOODSIZE);
+	log_skb = alloc_skb(size, GFP_ATOMIC);
+	if (!log_skb) {
+		pr_err("xt_cannot alloc skb for logging\n");
+		return;
+	}
+
+	event_type = timer->active ? NL_EVENT_TYPE_ACTIVE
+				: NL_EVENT_TYPE_INACTIVE;
+	res = snprintf(str, NLMSG_MAX_SIZE, "%s %s\n", iface,
+		timer->active ? "ACTIVE" : "INACTIVE");
+	if (NLMSG_MAX_SIZE <= res)
+		goto nlmsg_failure;
+
+	/* NLMSG_PUT() uses "goto nlmsg_failure" */
+	nlh = NLMSG_PUT(log_skb, /*pid*/0, /*seq*/0, event_type,
+			/* Size of message */NLMSG_MAX_SIZE);
+
+	/*
+	 * str is NUL-terminated and shorter than NLMSG_MAX_SIZE (checked
+	 * above), so bounding the copy by the payload size copies the whole
+	 * text and zero-fills the remainder of the payload.  Bounding it by
+	 * the label size instead could cut the text short without a
+	 * terminator and leave the rest of the payload uninitialised.
+	 */
+	strncpy(NLMSG_DATA(nlh), str, NLMSG_MAX_SIZE);
+
+	NETLINK_CB(log_skb).dst_group = 1;
+	netlink_broadcast(nl_sk, log_skb, 0, 1, GFP_ATOMIC);
+
+	pr_debug("putting nlmsg: %s", str);
+	return;
+
+nlmsg_failure:  /* Used within NLMSG_PUT() */
+	/* the skb is being dropped because of an error, not consumed */
+	kfree_skb(log_skb);
+	pr_debug("Failed nlmsg_put\n");
+}
+
+/*
+ * Walk idletimer_tg_list and return the timer whose attribute name equals
+ * @label, or NULL when there is none.  @label must not be NULL.
+ */
+static
+struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
+{
+	struct idletimer_tg *cur;
+	struct idletimer_tg *found = NULL;
+
+	BUG_ON(!label);
+
+	list_for_each_entry(cur, &idletimer_tg_list, entry) {
+		if (strcmp(label, cur->attr.attr.name) == 0) {
+			found = cur;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * sysfs read callback for a timer attribute.
+ *
+ * Looks the timer up by @attr->name and writes into @buf:
+ *   "<seconds until expiry>\n"        while the timer is still pending,
+ *   "0 <seconds since expiry>\n"      once expired, for netlink-enabled timers,
+ *   "0\n"                             otherwise (also when no timer is found).
+ *
+ * Returns the number of bytes written.
+ */
+static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
+				 char *buf)
+{
+	struct idletimer_tg *timer;
+	unsigned long expires = 0;
+	unsigned long now = jiffies;
+	bool send_nl_msg = false;
+
+	mutex_lock(&list_mutex);
+
+	/*
+	 * Snapshot everything needed while the list is locked: the lookup
+	 * may return NULL, and the timer may be freed by the destroy hook
+	 * as soon as the mutex is dropped.
+	 */
+	timer =	__idletimer_tg_find_by_label(attr->name);
+	if (timer) {
+		expires = timer->timer.expires;
+		send_nl_msg = timer->send_nl_msg;
+	}
+
+	mutex_unlock(&list_mutex);
+
+	if (time_after(expires, now))
+		return sprintf(buf, "%u\n",
+			       jiffies_to_msecs(expires - now) / 1000);
+
+	if (send_nl_msg)
+		return sprintf(buf, "0 %d\n",
+			jiffies_to_msecs(now - expires) / 1000);
+
+	return sprintf(buf, "0\n");
+}
+
+/*
+ * Work handler: notify sysfs pollers of the timer's attribute and, for
+ * timers with send_nl_msg set, broadcast the state over netlink as well.
+ */
+static void idletimer_tg_work(struct work_struct *work)
+{
+	struct idletimer_tg *tg = container_of(work, struct idletimer_tg,
+					       work);
+	const char *label = tg->attr.attr.name;
+
+	sysfs_notify(idletimer_tg_kobj, NULL, label);
+
+	if (!tg->send_nl_msg)
+		return;
+
+	notify_netlink(label, tg);
+}
+
+/*
+ * Timer callback; @data is the struct idletimer_tg the timer belongs to.
+ * Marks the timer inactive and defers the notifications to the work handler.
+ */
+static void idletimer_tg_expired(unsigned long data)
+{
+	struct idletimer_tg *tg = (struct idletimer_tg *)data;
+
+	pr_debug("timer %s expired\n", tg->attr.attr.name);
+
+	tg->active = false;
+	schedule_work(&tg->work);
+}
+
+/*
+ * Allocate and start a new timer for @info->label.
+ *
+ * Creates the sysfs attribute named after the label, links the timer into
+ * idletimer_tg_list with a reference count of 1 and arms it for
+ * @info->timeout seconds.  The only caller in this file,
+ * idletimer_tg_checkentry(), holds list_mutex around the call.
+ *
+ * Returns 0 on success or a negative errno; on failure nothing stays
+ * allocated and @info->timer is reset to NULL.
+ */
+static int idletimer_tg_create(struct idletimer_tg_info *info)
+{
+	int ret;
+
+	info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+	if (!info->timer) {
+		pr_debug("couldn't alloc timer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* kmalloc() memory is not zeroed: give the attribute a lockdep key */
+	sysfs_attr_init(&info->timer->attr.attr);
+	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+	if (!info->timer->attr.attr.name) {
+		pr_debug("couldn't alloc attribute name\n");
+		ret = -ENOMEM;
+		goto out_free_timer;
+	}
+	info->timer->attr.attr.mode = S_IRUGO;
+	info->timer->attr.show = idletimer_tg_show;
+
+	ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+	if (ret < 0) {
+		pr_debug("couldn't add file to sysfs");
+		goto out_free_attr;
+	}
+
+	list_add(&info->timer->entry, &idletimer_tg_list);
+
+	setup_timer(&info->timer->timer, idletimer_tg_expired,
+		    (unsigned long) info->timer);
+	info->timer->refcnt = 1;
+	info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
+	info->timer->active = true;
+
+	/*
+	 * The expiry callback schedules this work item, so it has to be
+	 * initialised before the timer is armed.
+	 */
+	INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+	mod_timer(&info->timer->timer,
+		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+	return 0;
+
+out_free_attr:
+	kfree(info->timer->attr.attr.name);
+out_free_timer:
+	kfree(info->timer);
+	/* do not leave a pointer to freed memory in the rule data */
+	info->timer = NULL;
+out:
+	return ret;
+}
+
+/*
+ * The actual xt_tables plugin.
+ */
+/*
+ * Packet hook: every matching packet marks the rule's timer active and
+ * re-arms it for info->timeout seconds from now.  If the previous deadline
+ * had already passed, the notification work is scheduled so listeners see
+ * the state change.  Always returns XT_CONTINUE.
+ */
+static unsigned int idletimer_tg_target(struct sk_buff *skb,
+					 const struct xt_action_param *par)
+{
+	const struct idletimer_tg_info *info = par->targinfo;
+	struct idletimer_tg *tg;
+	unsigned long now = jiffies;
+
+	pr_debug("resetting timer %s, timeout period %u\n",
+		 info->label, info->timeout);
+
+	BUG_ON(!info->timer);
+	tg = info->timer;
+
+	tg->active = true;
+
+	if (time_before(tg->timer.expires, now)) {
+		schedule_work(&tg->work);
+		pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
+			 info->label, tg->timer.expires, now);
+	}
+
+	/* TODO: Avoid modifying timers on each packet */
+	mod_timer(&tg->timer, msecs_to_jiffies(info->timeout * 1000) + now);
+
+	return XT_CONTINUE;
+}
+
+/*
+ * checkentry hook: validate the rule parameters and attach the rule to a
+ * timer.
+ *
+ * The timeout must be non-zero and small enough that converting it to
+ * milliseconds cannot overflow; the label must be a non-empty,
+ * NUL-terminated string.  If a timer with this label already exists it is
+ * shared (reference taken, re-armed); otherwise a new one is created.
+ *
+ * Returns 0 on success, -EINVAL for bad parameters, or the error from
+ * idletimer_tg_create().
+ */
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+	struct idletimer_tg_info *info = par->targinfo;
+	int ret;
+	unsigned long now = jiffies;
+
+	if (info->timeout == 0) {
+		pr_debug("timeout value is zero\n");
+		return -EINVAL;
+	}
+
+	/* timeout * 1000 is computed below; reject values that would wrap */
+	if (info->timeout >= INT_MAX / 1000) {
+		pr_debug("timeout value is too big\n");
+		return -EINVAL;
+	}
+
+	if (info->label[0] == '\0' ||
+	    strnlen(info->label,
+		    MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) {
+		pr_debug("label is empty or not nul-terminated\n");
+		return -EINVAL;
+	}
+
+	/* only print the label once it is known to be a valid string */
+	pr_debug("checkentry targinfo %s\n", info->label);
+
+	mutex_lock(&list_mutex);
+
+	info->timer = __idletimer_tg_find_by_label(info->label);
+	if (info->timer) {
+		info->timer->refcnt++;
+		info->timer->active = true;
+
+		if (time_before(info->timer->timer.expires, now)) {
+			schedule_work(&info->timer->work);
+			pr_debug("Starting Checkentry timer"
+				"(Expired, Jiffies): %lu, %lu\n",
+				info->timer->timer.expires, now);
+		}
+
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + now);
+
+		pr_debug("increased refcnt of timer %s to %u\n",
+			 info->label, info->timer->refcnt);
+	} else {
+		ret = idletimer_tg_create(info);
+		if (ret < 0) {
+			pr_debug("failed to create timer\n");
+			mutex_unlock(&list_mutex);
+			return ret;
+		}
+	}
+
+	mutex_unlock(&list_mutex);
+
+	return 0;
+}
+
+/*
+ * destroy hook: drop this rule's reference on its timer.  When the last
+ * reference goes away the timer is unlinked, stopped, its sysfs file is
+ * removed and its memory is freed.
+ */
+static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	const struct idletimer_tg_info *info = par->targinfo;
+
+	pr_debug("destroy targinfo %s\n", info->label);
+
+	mutex_lock(&list_mutex);
+
+	if (--info->timer->refcnt == 0) {
+		pr_debug("deleting timer %s\n", info->label);
+
+		list_del(&info->timer->entry);
+		del_timer_sync(&info->timer->timer);
+		/*
+		 * The expiry callback and the packet path may have queued
+		 * the notification work; it must have finished before the
+		 * structure embedding it is freed.  The work handler does
+		 * not take list_mutex, so waiting here cannot deadlock.
+		 */
+		cancel_work_sync(&info->timer->work);
+		sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+		kfree(info->timer->attr.attr.name);
+		kfree(info->timer);
+	} else {
+		pr_debug("decreased refcnt of timer %s to %u\n",
+		info->label, info->timer->refcnt);
+	}
+
+	mutex_unlock(&list_mutex);
+}
+
+/* x_tables registration record for revision 1 of the "IDLETIMER" target. */
+static struct xt_target idletimer_tg __read_mostly = {
+	.name		= "IDLETIMER",
+	.revision	= 1,
+	.family		= NFPROTO_UNSPEC,
+	.targetsize	= sizeof(struct idletimer_tg_info),
+	.checkentry	= idletimer_tg_checkentry,
+	.target		= idletimer_tg_target,
+	.destroy	= idletimer_tg_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct class *idletimer_tg_class;
+
+static struct device *idletimer_tg_device;
+
+/*
+ * Module init: create the "xt_idletimer" class and its "timers" device
+ * (whose kobject hosts the per-timer sysfs files), open the kernel netlink
+ * socket used for notifications and register the xt target.
+ *
+ * Returns 0 on success or a negative errno; on failure every step already
+ * completed is undone.
+ */
+static int __init idletimer_tg_init(void)
+{
+	int err;
+
+	idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+	err = PTR_ERR(idletimer_tg_class);
+	if (IS_ERR(idletimer_tg_class)) {
+		pr_debug("couldn't register device class\n");
+		goto out;
+	}
+
+	idletimer_tg_device = device_create(idletimer_tg_class, NULL,
+					    MKDEV(0, 0), NULL, "timers");
+	err = PTR_ERR(idletimer_tg_device);
+	if (IS_ERR(idletimer_tg_device)) {
+		pr_debug("couldn't register system device\n");
+		goto out_class;
+	}
+
+	idletimer_tg_kobj = &idletimer_tg_device->kobj;
+
+	/*
+	 * Open the socket before the target becomes visible, so a rule can
+	 * never run while nl_sk is still NULL.
+	 */
+	nl_sk = netlink_kernel_create(&init_net,
+				      NETLINK_IDLETIMER, 1, NULL,
+				      NULL, THIS_MODULE);
+	if (!nl_sk) {
+		pr_err("Failed to create netlink socket\n");
+		err = -ENOMEM;
+		goto out_dev;
+	}
+
+	err =  xt_register_target(&idletimer_tg);
+	if (err < 0) {
+		pr_debug("couldn't register xt target\n");
+		goto out_netlink;
+	}
+
+	return 0;
+out_netlink:
+	netlink_kernel_release(nl_sk);
+	nl_sk = NULL;
+out_dev:
+	device_destroy(idletimer_tg_class, MKDEV(0, 0));
+out_class:
+	class_destroy(idletimer_tg_class);
+out:
+	return err;
+}
+
+/*
+ * Module exit: unregister the target first so no rule can use it any more,
+ * then release the netlink socket and remove the device and class.
+ */
+static void __exit idletimer_tg_exit(void)
+{
+	xt_unregister_target(&idletimer_tg);
+
+	/* init only succeeds with a valid socket; release it on unload */
+	netlink_kernel_release(nl_sk);
+	nl_sk = NULL;
+
+	device_destroy(idletimer_tg_class, MKDEV(0, 0));
+	class_destroy(idletimer_tg_class);
+}
+
+module_init(idletimer_tg_init);
+module_exit(idletimer_tg_exit);
+
+MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
+MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
+MODULE_DESCRIPTION("Xtables: idle time monitor");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ipt_IDLETIMER");
+MODULE_ALIAS("ip6t_IDLETIMER");
+MODULE_ALIAS("arpt_IDLETIMER");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
new file mode 100644
index 00000000..993de2ba
--- /dev/null
+++ b/net/netfilter/xt_LED.c
@@ -0,0 +1,215 @@
+/*
+ * xt_LED.c - netfilter target to make LEDs blink upon packet matches
+ *
+ * Copyright (C) 2008 Adam Nielsen <a.nielsen@shikadi.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/slab.h>
+#include <linux/leds.h>
+#include <linux/mutex.h>
+
+#include <linux/netfilter/xt_LED.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");
+MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match");
+MODULE_ALIAS("ipt_LED");
+MODULE_ALIAS("ip6t_LED");
+
+static LIST_HEAD(xt_led_triggers);
+static DEFINE_MUTEX(xt_led_mutex);
+
+/*
+ * This is declared in here (the kernel module) only, to avoid having these
+ * dependencies in userspace code.  This is what xt_led_info.internal_data
+ * points to.
+ */
+/*
+ * @list:                  link in xt_led_triggers
+ * @refcnt:                number of rules sharing this trigger (changed
+ *                         under xt_led_mutex)
+ * @trigger_id:            kstrdup()'ed copy of the rule's id; also used as
+ *                         the trigger name
+ * @netfilter_led_trigger: the LED trigger registered for this id
+ * @timer:                 switches the LED off again via
+ *                         led_timeout_callback()
+ */
+struct xt_led_info_internal {
+	struct list_head list;
+	int refcnt;
+	char *trigger_id;
+	struct led_trigger netfilter_led_trigger;
+	struct timer_list timer;
+};
+
+/*
+ * Packet hook of the LED target: light the LED for the matching packet.
+ *
+ *  - delay > 0:  the LED is switched on and the timer (re)armed to switch
+ *                it off after "delay" milliseconds; with always_blink set
+ *                and the timer still pending, the LED is briefly switched
+ *                off first so the new packet is visible.
+ *  - delay == 0: the LED is switched on and immediately off again.
+ *  - otherwise:  the LED is switched on and left on.
+ *
+ * Always returns XT_CONTINUE.
+ */
+static unsigned int
+led_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_led_info *ledinfo = par->targinfo;
+	struct xt_led_info_internal *priv = ledinfo->internal_data;
+	struct led_trigger *trig = &priv->netfilter_led_trigger;
+	const bool timed = ledinfo->delay > 0;
+
+	if (timed && ledinfo->always_blink && timer_pending(&priv->timer))
+		led_trigger_event(trig, LED_OFF);
+
+	led_trigger_event(trig, LED_FULL);
+
+	if (timed)
+		mod_timer(&priv->timer,
+			  jiffies + msecs_to_jiffies(ledinfo->delay));
+	else if (ledinfo->delay == 0)
+		led_trigger_event(trig, LED_OFF);
+
+	return XT_CONTINUE;
+}
+
+/* Timer callback: @data is the xt_led_info_internal whose LED goes off. */
+static void led_timeout_callback(unsigned long data)
+{
+	struct xt_led_info_internal *priv;
+
+	priv = (struct xt_led_info_internal *)data;
+	led_trigger_event(&priv->netfilter_led_trigger, LED_OFF);
+}
+
+/*
+ * Search xt_led_triggers for an entry whose trigger name equals @name.
+ * Returns the entry, or NULL if no such trigger has been registered.
+ */
+static struct xt_led_info_internal *led_trigger_lookup(const char *name)
+{
+	struct xt_led_info_internal *cur;
+	struct xt_led_info_internal *match = NULL;
+
+	list_for_each_entry(cur, &xt_led_triggers, list) {
+		if (strcmp(name, cur->netfilter_led_trigger.name) == 0) {
+			match = cur;
+			break;
+		}
+	}
+	return match;
+}
+
+/*
+ * checkentry hook: attach the rule to the LED trigger named by its id.
+ *
+ * An existing trigger with the same id is shared (reference count bumped);
+ * otherwise a new trigger is allocated, registered and added to
+ * xt_led_triggers.  The resulting state is stored in
+ * ledinfo->internal_data.
+ *
+ * Returns 0 on success, -EINVAL for an empty or unterminated id, -ENOMEM on
+ * allocation failure, or the error from led_trigger_register().
+ */
+static int led_tg_check(const struct xt_tgchk_param *par)
+{
+	struct xt_led_info *ledinfo = par->targinfo;
+	struct xt_led_info_internal *ledinternal;
+	int err;
+
+	if (ledinfo->id[0] == '\0') {
+		pr_info("No 'id' parameter given.\n");
+		return -EINVAL;
+	}
+
+	/* the id is used as a C string below, so it must be terminated */
+	if (strnlen(ledinfo->id, sizeof(ledinfo->id)) == sizeof(ledinfo->id)) {
+		pr_info("'id' parameter is not nul-terminated.\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&xt_led_mutex);
+
+	ledinternal = led_trigger_lookup(ledinfo->id);
+	if (ledinternal) {
+		ledinternal->refcnt++;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL);
+	if (!ledinternal)
+		goto exit_mutex_only;
+
+	ledinternal->trigger_id = kstrdup(ledinfo->id, GFP_KERNEL);
+	if (!ledinternal->trigger_id)
+		goto exit_internal_alloc;
+
+	ledinternal->refcnt = 1;
+	ledinternal->netfilter_led_trigger.name = ledinternal->trigger_id;
+
+	err = led_trigger_register(&ledinternal->netfilter_led_trigger);
+	if (err) {
+		pr_warning("led_trigger_register() failed\n");
+		if (err == -EEXIST)
+			pr_warning("Trigger name is already in use.\n");
+		goto exit_alloc;
+	}
+
+	/*
+	 * Always set the timer up.  This state is shared by every rule
+	 * using the same id, and a later rule may use delay > 0 (and so
+	 * call mod_timer() in led_tg()) even if this first rule does not.
+	 */
+	setup_timer(&ledinternal->timer, led_timeout_callback,
+		    (unsigned long)ledinternal);
+
+	list_add_tail(&ledinternal->list, &xt_led_triggers);
+
+out:
+	mutex_unlock(&xt_led_mutex);
+
+	ledinfo->internal_data = ledinternal;
+
+	return 0;
+
+exit_alloc:
+	kfree(ledinternal->trigger_id);
+
+exit_internal_alloc:
+	kfree(ledinternal);
+
+exit_mutex_only:
+	mutex_unlock(&xt_led_mutex);
+
+	return err;
+}
+
+/*
+ * destroy hook: drop this rule's reference on its trigger; the last
+ * reference unlinks the trigger, stops its timer, unregisters it and frees
+ * the memory.
+ */
+static void led_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	const struct xt_led_info *ledinfo = par->targinfo;
+	struct xt_led_info_internal *ledinternal = ledinfo->internal_data;
+
+	mutex_lock(&xt_led_mutex);
+
+	if (--ledinternal->refcnt) {
+		mutex_unlock(&xt_led_mutex);
+		return;
+	}
+
+	list_del(&ledinternal->list);
+
+	/*
+	 * The timer is set up unconditionally by led_tg_check() and may have
+	 * been armed by any rule that shared this trigger, so stop it no
+	 * matter what delay this last rule uses.
+	 */
+	del_timer_sync(&ledinternal->timer);
+
+	led_trigger_unregister(&ledinternal->netfilter_led_trigger);
+
+	mutex_unlock(&xt_led_mutex);
+
+	kfree(ledinternal->trigger_id);
+	kfree(ledinternal);
+}
+
+/* x_tables registration record for the "LED" target. */
+static struct xt_target led_tg_reg __read_mostly = {
+	.name		= "LED",
+	.revision	= 0,
+	.family		= NFPROTO_UNSPEC,
+	.targetsize	= sizeof(struct xt_led_info),
+	.checkentry	= led_tg_check,
+	.target		= led_tg,
+	.destroy	= led_tg_destroy,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: register the LED target; returns xt_register_target()'s result. */
+static int __init led_tg_init(void)
+{
+	return xt_register_target(&led_tg_reg);
+}
+
+/* Module exit: unregister the LED target again. */
+static void __exit led_tg_exit(void)
+{
+	xt_unregister_target(&led_tg_reg);
+}
+
+module_init(led_tg_init);
+module_exit(led_tg_exit);
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
new file mode 100644
index 00000000..a17dd0f5
--- /dev/null
+++ b/net/netfilter/xt_NFLOG.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_NFLOG.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/nfnetlink_log.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NFLOG");
+MODULE_ALIAS("ip6t_NFLOG");
+
+/*
+ * Packet hook of the NFLOG target: hand the packet to nfnetlink_log with
+ * the copy length, group and queue threshold configured in the rule, then
+ * let rule traversal continue.
+ */
+static unsigned int
+nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_nflog_info *info = par->targinfo;
+	struct nf_loginfo li = {
+		.type = NF_LOG_TYPE_ULOG,
+		.u = {
+			.ulog = {
+				.copy_len   = info->len,
+				.group      = info->group,
+				.qthreshold = info->threshold,
+			},
+		},
+	};
+
+	nfulnl_log_packet(par->family, par->hooknum, skb, par->in,
+			  par->out, &li, info->prefix);
+
+	return XT_CONTINUE;
+}
+
+/*
+ * checkentry hook: reject rules that set flag bits outside XT_NFLOG_MASK or
+ * whose prefix buffer does not end in a NUL byte.
+ */
+static int nflog_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_nflog_info *info = par->targinfo;
+
+	if ((info->flags & ~XT_NFLOG_MASK) ||
+	    info->prefix[sizeof(info->prefix) - 1] != '\0')
+		return -EINVAL;
+
+	return 0;
+}
+
+/* x_tables registration record for the "NFLOG" target. */
+static struct xt_target nflog_tg_reg __read_mostly = {
+	.name       = "NFLOG",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.targetsize = sizeof(struct xt_nflog_info),
+	.target     = nflog_tg,
+	.checkentry = nflog_tg_check,
+	.me         = THIS_MODULE,
+};
+
+/* Module init: register the NFLOG target; returns xt_register_target()'s result. */
+static int __init nflog_tg_init(void)
+{
+	return xt_register_target(&nflog_tg_reg);
+}
+
+/* Module exit: unregister the NFLOG target again. */
+static void __exit nflog_tg_exit(void)
+{
+	xt_unregister_target(&nflog_tg_reg);
+}
+
+module_init(nflog_tg_init);
+module_exit(nflog_tg_exit);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
new file mode 100644
index 00000000..d4f4b5d6
--- /dev/null
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -0,0 +1,160 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: packet forwarding to netlink");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NFQUEUE");
+MODULE_ALIAS("ip6t_NFQUEUE");
+MODULE_ALIAS("arpt_NFQUEUE");
+
+static u32 jhash_initval __read_mostly;
+static bool rnd_inited __read_mostly;
+
+/* Revision 0 packet hook: queue the packet to the rule's fixed queue number. */
+static unsigned int
+nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_NFQ_info *tinfo = par->targinfo;
+
+	return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+/*
+ * Hash an IPv4 packet on (saddr XOR daddr, protocol) seeded with
+ * jhash_initval.  XOR-ing the addresses makes the hash symmetric, so
+ * packets in either direction go into the same queue.
+ */
+static u32 hash_v4(const struct sk_buff *skb)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	__be32 folded = iph->saddr ^ iph->daddr;
+
+	return jhash_2words((__force u32)folded, iph->protocol, jhash_initval);
+}
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+/*
+ * Hash an IPv6 packet on the word-wise XOR of source and destination
+ * address, seeded with jhash_initval (symmetric in both directions).
+ */
+static u32 hash_v6(const struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__be32 addr[4];
+	unsigned int w;
+
+	for (w = 0; w < ARRAY_SIZE(addr); w++)
+		addr[w] = ip6h->saddr.s6_addr32[w] ^ ip6h->daddr.s6_addr32[w];
+
+	return jhash2((__force u32 *)addr, ARRAY_SIZE(addr), jhash_initval);
+}
+#endif
+
+/*
+ * Revision 1 packet hook: with more than one queue configured, spread IPv4
+ * (and, when built in, IPv6) packets over the range
+ * [queuenum, queuenum + queues_total) by scaling the address hash; all
+ * other packets go to queuenum.
+ */
+static unsigned int
+nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_NFQ_info_v1 *info = par->targinfo;
+	u32 queue = info->queuenum;
+	u32 hash;
+
+	if (info->queues_total <= 1)
+		return NF_QUEUE_NR(queue);
+
+	if (par->family == NFPROTO_IPV4)
+		hash = hash_v4(skb);
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	else if (par->family == NFPROTO_IPV6)
+		hash = hash_v6(skb);
+#endif
+	else
+		return NF_QUEUE_NR(queue);
+
+	/* map the 32-bit hash onto 0..queues_total-1 without a division */
+	queue = (((u64) hash * info->queues_total) >> 32) + queue;
+
+	return NF_QUEUE_NR(queue);
+}
+
+/*
+ * Revision 2 packet hook: same queue selection as revision 1, with the
+ * queue-bypass verdict flag added when the rule asks for it.
+ */
+static unsigned int
+nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_NFQ_info_v2 *info = par->targinfo;
+	unsigned int verdict = nfqueue_tg_v1(skb, par);
+
+	return info->bypass ? (verdict | NF_VERDICT_FLAG_QUEUE_BYPASS)
+			    : verdict;
+}
+
+/*
+ * checkentry hook shared by revisions 1 and 2.
+ *
+ * Seeds the hash on first use, then validates the queue range: at least one
+ * queue, and the highest queue id must fit into 16 bits.  The bypass field
+ * is only examined for revision 2.
+ *
+ * Returns 0, -EINVAL or -ERANGE.
+ */
+static int nfqueue_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_NFQ_info_v2 *info = par->targinfo;
+	u32 maxid;
+
+	if (unlikely(!rnd_inited)) {
+		get_random_bytes(&jhash_initval, sizeof(jhash_initval));
+		rnd_inited = true;
+	}
+
+	if (!info->queues_total) {
+		pr_err("NFQUEUE: number of total queues is 0\n");
+		return -EINVAL;
+	}
+
+	maxid = info->queuenum + info->queues_total - 1;
+	if (maxid > 0xffff) {
+		pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n",
+		       info->queues_total, maxid);
+		return -ERANGE;
+	}
+
+	if (par->target->revision != 2)
+		return 0;
+
+	return info->bypass > 1 ? -EINVAL : 0;
+}
+
+/* Registration records for revisions 0, 1 and 2 of the "NFQUEUE" target. */
+static struct xt_target nfqueue_tg_reg[] __read_mostly = {
+	{
+		/* revision 0: fixed queue number, no checkentry */
+		.name		= "NFQUEUE",
+		.family		= NFPROTO_UNSPEC,
+		.targetsize	= sizeof(struct xt_NFQ_info),
+		.target		= nfqueue_tg,
+		.me		= THIS_MODULE,
+	},
+	{
+		/* revision 1: queue range with hash-based balancing */
+		.name		= "NFQUEUE",
+		.revision	= 1,
+		.family		= NFPROTO_UNSPEC,
+		.targetsize	= sizeof(struct xt_NFQ_info_v1),
+		.target		= nfqueue_tg_v1,
+		.checkentry	= nfqueue_tg_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		/* revision 2: revision 1 plus the bypass flag */
+		.name		= "NFQUEUE",
+		.revision	= 2,
+		.family		= NFPROTO_UNSPEC,
+		.targetsize	= sizeof(struct xt_NFQ_info_v2),
+		.target		= nfqueue_tg_v2,
+		.checkentry	= nfqueue_tg_check,
+		.me		= THIS_MODULE,
+	},
+};
+
+/* Module init: register all NFQUEUE target revisions. */
+static int __init nfqueue_tg_init(void)
+{
+	return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg));
+}
+
+/* Module exit: unregister all NFQUEUE target revisions. */
+static void __exit nfqueue_tg_exit(void)
+{
+	xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg));
+}
+
+module_init(nfqueue_tg_init);
+module_exit(nfqueue_tg_exit);
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
new file mode 100644
index 00000000..9d782181
--- /dev/null
+++ b/net/netfilter/xt_NOTRACK.c
@@ -0,0 +1,53 @@
+/* This is a module which is used for setting up fake conntracks
+ * on packets so that they are not seen by the conntrack/NAT code.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+
+MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_NOTRACK");
+MODULE_ALIAS("ip6t_NOTRACK");
+
+/*
+ * Packet hook of the NOTRACK target.
+ *
+ * A packet that already carries a conntrack reference (previously seen,
+ * e.g. on loopback) is left alone.  Otherwise the fake "untracked"
+ * conntrack entry is attached and a reference taken on it.  If there is a
+ * real conntrack entry corresponding to this packet, it will hang around
+ * until it times out; we don't deal with it for performance reasons. JK
+ */
+static unsigned int
+notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->nfct == NULL) {
+		skb->nfct = &nf_ct_untracked_get()->ct_general;
+		skb->nfctinfo = IP_CT_NEW;
+		nf_conntrack_get(skb->nfct);
+	}
+
+	return XT_CONTINUE;
+}
+
+/* x_tables registration record for the "NOTRACK" target ("raw" table only). */
+static struct xt_target notrack_tg_reg __read_mostly = {
+	.name     = "NOTRACK",
+	.table    = "raw",
+	.revision = 0,
+	.family   = NFPROTO_UNSPEC,
+	.target   = notrack_tg,
+	.me       = THIS_MODULE,
+};
+
+/* Module init: register the NOTRACK target. */
+static int __init notrack_tg_init(void)
+{
+	return xt_register_target(&notrack_tg_reg);
+}
+
+/* Module exit: unregister the NOTRACK target again. */
+static void __exit notrack_tg_exit(void)
+{
+	xt_unregister_target(&notrack_tg_reg);
+}
+
+module_init(notrack_tg_init);
+module_exit(notrack_tg_exit);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
new file mode 100644
index 00000000..9faf5e05
--- /dev/null
+++ b/net/netfilter/xt_SECMARK.c
@@ -0,0 +1,147 @@
+/*
+ * Module for modifying the secmark field of the skb, for use by
+ * security subsystems.
+ *
+ * Based on the nfmark match by:
+ * (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SECMARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@redhat.com>");
+MODULE_DESCRIPTION("Xtables: packet security mark modification");
+MODULE_ALIAS("ipt_SECMARK");
+MODULE_ALIAS("ip6t_SECMARK");
+
+#define PFX "SECMARK: "
+
+static u8 mode;
+
+/*
+ * Packet hook of the SECMARK target: store the rule's security id in
+ * skb->secmark.  The rule's mode must equal the module-wide mode, and
+ * SECMARK_MODE_SEL is the only mode handled; anything else is a BUG.
+ */
+static unsigned int
+secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_secmark_target_info *info = par->targinfo;
+	u32 secmark = 0;
+
+	BUG_ON(info->mode != mode);
+
+	if (mode == SECMARK_MODE_SEL)
+		secmark = info->secid;
+	else
+		BUG();
+
+	skb->secmark = secmark;
+	return XT_CONTINUE;
+}
+
+/*
+ * Resolve the rule's security context string into a security id.
+ *
+ * The context is force-terminated, translated via the LSM, and the caller's
+ * permission to relabel packets with the resulting id is checked.  On
+ * success the secmark reference count is incremented and info->secid holds
+ * the id.
+ *
+ * Returns 0, -ENOENT when the context maps to id 0, or the LSM's error.
+ */
+static int checkentry_lsm(struct xt_secmark_target_info *info)
+{
+	int rc;
+
+	info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+	info->secid = 0;
+
+	rc = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+				      &info->secid);
+	if (rc == -EINVAL)
+		pr_info("invalid security context \'%s\'\n", info->secctx);
+	if (rc)
+		return rc;
+
+	if (info->secid == 0) {
+		pr_info("unable to map security context \'%s\'\n", info->secctx);
+		return -ENOENT;
+	}
+
+	rc = security_secmark_relabel_packet(info->secid);
+	if (rc) {
+		pr_info("unable to obtain relabeling permission\n");
+		return rc;
+	}
+
+	security_secmark_refcount_inc();
+	return 0;
+}
+
+/*
+ * checkentry hook: the target is only accepted in the "mangle" and
+ * "security" tables, the rule's mode must be SECMARK_MODE_SEL and must not
+ * differ from a mode recorded by an earlier rule.  The security context is
+ * then resolved by checkentry_lsm(); on success the module-wide mode is
+ * recorded if it was not set yet.
+ */
+static int secmark_tg_check(const struct xt_tgchk_param *par)
+{
+	struct xt_secmark_target_info *info = par->targinfo;
+	const bool table_ok = strcmp(par->table, "mangle") == 0 ||
+			      strcmp(par->table, "security") == 0;
+	int err;
+
+	if (!table_ok) {
+		pr_info("target only valid in the \'mangle\' "
+			"or \'security\' tables, not \'%s\'.\n", par->table);
+		return -EINVAL;
+	}
+
+	if (mode && mode != info->mode) {
+		pr_info("mode already set to %hu cannot mix with "
+			"rules for mode %hu\n", mode, info->mode);
+		return -EINVAL;
+	}
+
+	if (info->mode != SECMARK_MODE_SEL) {
+		pr_info("invalid mode: %hu\n", info->mode);
+		return -EINVAL;
+	}
+
+	err = checkentry_lsm(info);
+	if (err)
+		return err;
+
+	if (!mode)
+		mode = info->mode;
+	return 0;
+}
+
+/*
+ * destroy hook: give back the secmark reference taken in checkentry_lsm()
+ * when the module-wide mode is SECMARK_MODE_SEL.
+ */
+static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	if (mode == SECMARK_MODE_SEL)
+		security_secmark_refcount_dec();
+}
+
+/* x_tables registration record for the "SECMARK" target. */
+static struct xt_target secmark_tg_reg __read_mostly = {
+	.name       = "SECMARK",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.targetsize = sizeof(struct xt_secmark_target_info),
+	.checkentry = secmark_tg_check,
+	.target     = secmark_tg,
+	.destroy    = secmark_tg_destroy,
+	.me         = THIS_MODULE,
+};
+
+/* Module init: register the SECMARK target. */
+static int __init secmark_tg_init(void)
+{
+	return xt_register_target(&secmark_tg_reg);
+}
+
+/* Module exit: unregister the SECMARK target again. */
+static void __exit secmark_tg_exit(void)
+{
+	xt_unregister_target(&secmark_tg_reg);
+}
+
+module_init(secmark_tg_init);
+module_exit(secmark_tg_exit);
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
new file mode 100644
index 00000000..9dc9ecfd
--- /dev/null
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -0,0 +1,142 @@
+/*
+ * A module for stripping a specific TCP option from TCP packets.
+ *
+ * Copyright (C) 2007 Sven Schnelle <svens@bitebene.org>
+ * Copyright © CC Computer Consultants GmbH, 2007
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_TCPOPTSTRIP.h>
+
+/*
+ * Length in bytes of the TCP option starting at opt[offset].
+ *
+ * Option kinds up to TCPOPT_NOP occupy a single byte.  Any other option
+ * carries its length in the following byte; a length of zero is treated as
+ * one so that the caller always makes progress.
+ */
+static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
+{
+	const u_int8_t kind = opt[offset];
+
+	if (kind <= TCPOPT_NOP)
+		return 1;
+
+	return opt[offset+1] ? opt[offset+1] : 1;
+}
+
+/*
+ * Overwrite every TCP option selected in @info->strip_bmap with NOPs and
+ * patch the TCP checksum accordingly.
+ *
+ * @skb:     packet; made writable over its whole length
+ * @info:    rule data holding the bitmap of option kinds to strip
+ * @tcphoff: offset of the TCP header from the network header
+ * @minlen:  unused by this function
+ *
+ * Returns XT_CONTINUE, or NF_DROP when the packet cannot be made writable
+ * or its TCP header is truncated or has an invalid data offset.
+ */
+static unsigned int
+tcpoptstrip_mangle_packet(struct sk_buff *skb,
+			  const struct xt_tcpoptstrip_target_info *info,
+			  unsigned int tcphoff, unsigned int minlen)
+{
+	unsigned int optl, i, j;
+	unsigned int hdrlen;
+	struct tcphdr *tcph;
+	u_int16_t n, o;
+	u_int8_t *opt;
+
+	if (!skb_make_writable(skb, skb->len))
+		return NF_DROP;
+
+	/* the fixed part of the TCP header must be inside the packet */
+	if (skb->len < tcphoff ||
+	    skb->len - tcphoff < sizeof(struct tcphdr))
+		return NF_DROP;
+
+	tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+
+	/*
+	 * Take the header length from the header found at tcphoff rather
+	 * than from the skb's transport header pointer, and make sure it is
+	 * sane and does not reach beyond the packet.
+	 */
+	hdrlen = tcph->doff * 4;
+	if (hdrlen < sizeof(struct tcphdr) || skb->len - tcphoff < hdrlen)
+		return NF_DROP;
+
+	opt  = (u_int8_t *)tcph;
+
+	/*
+	 * Walk through all TCP options - if we find some option to remove,
+	 * set all octets to %TCPOPT_NOP and adjust checksum.
+	 *
+	 * optlen() may read the byte after opt[i], so stop one byte before
+	 * the end of the header; a lone trailing byte cannot be a
+	 * multi-byte option anyway.
+	 */
+	for (i = sizeof(struct tcphdr); i + 1 < hdrlen; i += optl) {
+		optl = optlen(opt, i);
+
+		if (i + optl > hdrlen)
+			break;
+
+		if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i]))
+			continue;
+
+		for (j = 0; j < optl; ++j) {
+			o = opt[i+j];
+			n = TCPOPT_NOP;
+			/* even offsets are the high byte of a 16-bit word */
+			if ((i + j) % 2 == 0) {
+				o <<= 8;
+				n <<= 8;
+			}
+			inet_proto_csum_replace2(&tcph->check, skb, htons(o),
+						 htons(n), 0);
+		}
+		memset(opt + i, TCPOPT_NOP, optl);
+	}
+
+	return XT_CONTINUE;
+}
+
+/*
+ * IPv4 packet hook: strip the configured options from the TCP header that
+ * follows the IP header.  Non-first fragments carry no TCP header and are
+ * passed through untouched.
+ */
+static unsigned int
+tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	/* a non-zero fragment offset means there is no TCP header here */
+	if (par->fragoff != 0)
+		return XT_CONTINUE;
+
+	return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb),
+	       sizeof(struct iphdr) + sizeof(struct tcphdr));
+}
+
+#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE)
+/*
+ * IPv6 packet hook: skip the extension headers to find the TCP header, then
+ * strip the configured options.  The packet is dropped when the extension
+ * header chain cannot be walked.
+ */
+static unsigned int
+tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	struct ipv6hdr *ip6 = ipv6_hdr(skb);
+	u_int8_t proto = ip6->nexthdr;
+	int off;
+
+	off = ipv6_skip_exthdr(skb, sizeof(*ip6), &proto);
+	if (off < 0)
+		return NF_DROP;
+
+	return tcpoptstrip_mangle_packet(skb, par->targinfo, off,
+	       sizeof(*ip6) + sizeof(struct tcphdr));
+}
+#endif
+
+/*
+ * Registration records for the "TCPOPTSTRIP" target: TCP only, "mangle"
+ * table only; the IPv6 variant is built when ip6tables mangle is enabled.
+ */
+static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
+	{
+		.name       = "TCPOPTSTRIP",
+		.table      = "mangle",
+		.family     = NFPROTO_IPV4,
+		.proto      = IPPROTO_TCP,
+		.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
+		.target     = tcpoptstrip_tg4,
+		.me         = THIS_MODULE,
+	},
+#if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE)
+	{
+		.name       = "TCPOPTSTRIP",
+		.table      = "mangle",
+		.family     = NFPROTO_IPV6,
+		.proto      = IPPROTO_TCP,
+		.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
+		.target     = tcpoptstrip_tg6,
+		.me         = THIS_MODULE,
+	},
+#endif
+};
+
+/* Module init: register every TCPOPTSTRIP target variant. */
+static int __init tcpoptstrip_tg_init(void)
+{
+	return xt_register_targets(tcpoptstrip_tg_reg,
+				   ARRAY_SIZE(tcpoptstrip_tg_reg));
+}
+
+/* Module exit: unregister every TCPOPTSTRIP target variant. */
+static void __exit tcpoptstrip_tg_exit(void)
+{
+	xt_unregister_targets(tcpoptstrip_tg_reg,
+			      ARRAY_SIZE(tcpoptstrip_tg_reg));
+}
+
+module_init(tcpoptstrip_tg_init);
+module_exit(tcpoptstrip_tg_exit);
+MODULE_AUTHOR("Sven Schnelle <svens@bitebene.org>, Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: TCP option stripping");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TCPOPTSTRIP");
+MODULE_ALIAS("ip6t_TCPOPTSTRIP");
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
new file mode 100644
index 00000000..5f054a0d
--- /dev/null
+++ b/net/netfilter/xt_TEE.c
@@ -0,0 +1,310 @@
+/*
+ *	"TEE" target extension for Xtables
+ *	Copyright © Sebastian Claßen, 2007
+ *	Jan Engelhardt, 2007-2010
+ *
+ *	based on ipt_ROUTE.c from Cédric de Launois
+ *	<delaunois@info.ucl.be>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 or later, as published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <linux/notifier.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/route.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_TEE.h>
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#	define WITH_CONNTRACK 1
+#	include <net/netfilter/nf_conntrack.h>
+#endif
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#	define WITH_IPV6 1
+#endif
+
+/*
+ * Per-rule private state. tee_tg_check() allocates one of these only for
+ * rules that name an output interface; tee_tg_destroy() frees it.
+ */
+struct xt_tee_priv {
+	/* netdevice notifier; its callback is tee_netdev_event() */
+	struct notifier_block	notifier;
+	/* back-pointer to the target info that owns this structure */
+	struct xt_tee_tginfo	*tginfo;
+	/* ifindex of the named interface, or -1 while no such device exists */
+	int			oif;
+};
+
+static const union nf_inet_addr tee_zero_address;
+static DEFINE_PER_CPU(bool, tee_active);
+
+/*
+ * Choose the network namespace to route in: that of the skb's device if it
+ * has one, else that of the device behind its attached route, else (or when
+ * namespaces are not configured) the initial namespace.
+ */
+static struct net *pick_net(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_NS
+	if (skb->dev == NULL) {
+		const struct dst_entry *route = skb_dst(skb);
+
+		if (route != NULL && route->dev != NULL)
+			return dev_net(route->dev);
+	} else {
+		return dev_net(skb->dev);
+	}
+#endif
+	return &init_net;
+}
+
+/*
+ * Look up an IPv4 output route towards the configured gateway and attach it
+ * to @skb.
+ *
+ * If the rule carries private state (an output interface was named), the
+ * lookup is bound to that interface; while the interface index is -1 no
+ * lookup is attempted at all.
+ *
+ * Returns true when @skb now carries the new route, false otherwise; on
+ * failure @skb is left unmodified.
+ */
+static bool
+tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+	const struct xt_tee_priv *priv = info->priv;
+	struct net *net = pick_net(skb);
+	struct flowi4 fl4;
+	struct rtable *rt;
+
+	if (priv != NULL && priv->oif == -1)
+		return false;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr        = info->gw.ip;
+	fl4.flowi4_tos   = RT_TOS(ip_hdr(skb)->tos);
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	if (priv != NULL)
+		fl4.flowi4_oif = priv->oif;
+
+	rt = ip_route_output_key(net, &fl4);
+	if (IS_ERR(rt))
+		return false;
+
+	/* Replace whatever route the skb had with the one to the gateway. */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	skb->protocol = htons(ETH_P_IP);
+	skb->dev      = rt->dst.dev;
+	return true;
+}
+
+/*
+ * IPv4 TEE target: send a copy of the packet to the configured gateway.
+ *
+ * The original skb is never modified and always continues through the
+ * ruleset (%XT_CONTINUE), whether or not the copy could be made or sent.
+ */
+static unsigned int
+tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+	struct sk_buff *copy;
+	struct iphdr *hdr;
+
+	/* This CPU is already emitting a copy; do not duplicate it again. */
+	if (percpu_read(tee_active))
+		return XT_CONTINUE;
+
+	/* Work on a private copy so the original stays as it was. */
+	copy = pskb_copy(skb, GFP_ATOMIC);
+	if (copy == NULL)
+		return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+	/* Keep the copy from being accounted to the original connection. */
+	nf_conntrack_put(copy->nfct);
+	copy->nfct     = &nf_ct_untracked_get()->ct_general;
+	copy->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(copy->nfct);
+#endif
+	/*
+	 * Header fix-ups on the copy:
+	 *  - %IP_DF is set so the original sender learns about a smaller
+	 *    MTU on the clone route (IPv6 behaves this way anyway);
+	 *  - in PREROUTING/INPUT the TTL is decremented to dampen TEE loops
+	 *    between two hosts;
+	 *  - the checksum is recomputed, which is also needed because
+	 *    defragmentation may have changed the length.
+	 */
+	hdr = ip_hdr(copy);
+	hdr->frag_off |= htons(IP_DF);
+	switch (par->hooknum) {
+	case NF_INET_PRE_ROUTING:
+	case NF_INET_LOCAL_IN:
+		--hdr->ttl;
+		break;
+	default:
+		break;
+	}
+	ip_send_check(hdr);
+
+	if (!tee_tg_route4(copy, info)) {
+		kfree_skb(copy);
+		return XT_CONTINUE;
+	}
+
+	percpu_write(tee_active, true);
+	ip_local_out(copy);
+	percpu_write(tee_active, false);
+	return XT_CONTINUE;
+}
+
+#ifdef WITH_IPV6
+/*
+ * Look up an IPv6 output route towards the configured gateway and attach it
+ * to @skb.
+ *
+ * If the rule carries private state (an output interface was named), the
+ * lookup is bound to that interface; while the interface index is -1 no
+ * lookup is attempted at all.
+ *
+ * Returns true when @skb now carries the new route, false otherwise; on
+ * failure @skb is left unmodified and no route reference is kept.
+ */
+static bool
+tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct net *net = pick_net(skb);
+	struct dst_entry *dst;
+	struct flowi6 fl6;
+
+	memset(&fl6, 0, sizeof(fl6));
+	if (info->priv) {
+		if (info->priv->oif == -1)
+			return false;
+		fl6.flowi6_oif = info->priv->oif;
+	}
+	fl6.daddr = info->gw.in6;
+	fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+			   (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
+	dst = ip6_route_output(net, NULL, &fl6);
+	if (dst == NULL)
+		return false;
+	/*
+	 * A failed IPv6 lookup is reported through dst->error on a held
+	 * entry rather than through a NULL return, so test for it and give
+	 * the reference back instead of sending the copy down an error route.
+	 */
+	if (dst->error) {
+		dst_release(dst);
+		return false;
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+	skb->dev      = dst->dev;
+	skb->protocol = htons(ETH_P_IPV6);
+	return true;
+}
+
+/*
+ * IPv6 TEE target: send a copy of the packet to the configured gateway.
+ *
+ * The original skb is never modified and always continues through the
+ * ruleset (%XT_CONTINUE), whether or not the copy could be made or sent.
+ */
+static unsigned int
+tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+	struct sk_buff *copy;
+
+	/* This CPU is already emitting a copy; do not duplicate it again. */
+	if (percpu_read(tee_active))
+		return XT_CONTINUE;
+
+	copy = pskb_copy(skb, GFP_ATOMIC);
+	if (copy == NULL)
+		return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+	/* Detach the copy from the original's conntrack entry. */
+	nf_conntrack_put(copy->nfct);
+	copy->nfct     = &nf_ct_untracked_get()->ct_general;
+	copy->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(copy->nfct);
+#endif
+	/* In PREROUTING/INPUT the hop limit of the copy is decremented. */
+	switch (par->hooknum) {
+	case NF_INET_PRE_ROUTING:
+	case NF_INET_LOCAL_IN:
+		--ipv6_hdr(copy)->hop_limit;
+		break;
+	default:
+		break;
+	}
+
+	if (!tee_tg_route6(copy, info)) {
+		kfree_skb(copy);
+		return XT_CONTINUE;
+	}
+
+	percpu_write(tee_active, true);
+	ip6_local_out(copy);
+	percpu_write(tee_active, false);
+	return XT_CONTINUE;
+}
+#endif /* WITH_IPV6 */
+
+/*
+ * Netdevice notifier callback: keep priv->oif in step with the device whose
+ * name matches the rule's configured output interface.
+ *
+ *  - REGISTER:   a device with the configured name appeared, record it.
+ *  - UNREGISTER: the recorded device went away, forget it (-1).
+ *  - CHANGENAME: record the device if it now has the configured name,
+ *                forget it if it was the recorded one and no longer matches.
+ *
+ * Always returns %NOTIFY_DONE.
+ */
+static int tee_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct xt_tee_priv *priv =
+		container_of(this, struct xt_tee_priv, notifier);
+	struct net_device *dev = ptr;
+
+	if (event == NETDEV_REGISTER) {
+		if (strcmp(dev->name, priv->tginfo->oif) == 0)
+			priv->oif = dev->ifindex;
+	} else if (event == NETDEV_UNREGISTER) {
+		if (priv->oif == dev->ifindex)
+			priv->oif = -1;
+	} else if (event == NETDEV_CHANGENAME) {
+		if (strcmp(dev->name, priv->tginfo->oif) == 0)
+			priv->oif = dev->ifindex;
+		else if (priv->oif == dev->ifindex)
+			priv->oif = -1;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Validate a TEE rule and set up its private state.
+ *
+ * The gateway must not be the all-zero address. If an output interface name
+ * is given it must be NUL-terminated within the buffer; a private structure
+ * is then allocated and a netdevice notifier registered so that the
+ * interface index can be tracked by tee_netdev_event().
+ *
+ * Returns 0 on success, -EINVAL for a bad gateway or interface name,
+ * -ENOMEM when the private state cannot be allocated, or the error returned
+ * by register_netdevice_notifier().
+ */
+static int tee_tg_check(const struct xt_tgchk_param *par)
+{
+	struct xt_tee_tginfo *info = par->targinfo;
+	struct xt_tee_priv *priv;
+	int err;
+
+	/* 0.0.0.0 and :: not allowed */
+	if (memcmp(&info->gw, &tee_zero_address,
+		   sizeof(tee_zero_address)) == 0)
+		return -EINVAL;
+
+	if (info->oif[0]) {
+		if (info->oif[sizeof(info->oif)-1] != '\0')
+			return -EINVAL;
+
+		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+		if (priv == NULL)
+			return -ENOMEM;
+
+		priv->tginfo  = info;
+		priv->oif     = -1;
+		priv->notifier.notifier_call = tee_netdev_event;
+		info->priv    = priv;
+
+		/*
+		 * Do not leave a private structure installed whose notifier
+		 * was never registered: undo the setup and report the error.
+		 */
+		err = register_netdevice_notifier(&priv->notifier);
+		if (err) {
+			info->priv = NULL;
+			kfree(priv);
+			return err;
+		}
+	} else
+		info->priv = NULL;
+
+	return 0;
+}
+
+/*
+ * Tear down a TEE rule: if private state was set up for it, unregister the
+ * netdevice notifier and free the structure.
+ */
+static void tee_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	struct xt_tee_tginfo *info = par->targinfo;
+	struct xt_tee_priv *priv = info->priv;
+
+	if (priv == NULL)
+		return;
+
+	unregister_netdevice_notifier(&priv->notifier);
+	kfree(priv);
+}
+
+/*
+ * TEE target registrations (revision 1), one per address family. Both share
+ * the same check/destroy callbacks; the IPv6 entry is only built when IPv6
+ * support is configured.
+ */
+static struct xt_target tee_tg_reg[] __read_mostly = {
+	{
+		.family     = NFPROTO_IPV4,
+		.name       = "TEE",
+		.revision   = 1,
+		.targetsize = sizeof(struct xt_tee_tginfo),
+		.checkentry = tee_tg_check,
+		.target     = tee_tg4,
+		.destroy    = tee_tg_destroy,
+		.me         = THIS_MODULE,
+	},
+#ifdef WITH_IPV6
+	{
+		.family     = NFPROTO_IPV6,
+		.name       = "TEE",
+		.revision   = 1,
+		.targetsize = sizeof(struct xt_tee_tginfo),
+		.checkentry = tee_tg_check,
+		.target     = tee_tg6,
+		.destroy    = tee_tg_destroy,
+		.me         = THIS_MODULE,
+	},
+#endif
+};
+
+/* Module load: register the TEE target entries with Xtables. */
+static int __init tee_tg_init(void)
+{
+	const unsigned int count = ARRAY_SIZE(tee_tg_reg);
+
+	return xt_register_targets(tee_tg_reg, count);
+}
+
+/* Module unload: remove the TEE target entries again. */
+static void __exit tee_tg_exit(void)
+{
+	const unsigned int count = ARRAY_SIZE(tee_tg_reg);
+
+	xt_unregister_targets(tee_tg_reg, count);
+}
+
+module_init(tee_tg_init);
+module_exit(tee_tg_exit);
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: Reroute packet copy");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TEE");
+MODULE_ALIAS("ip6t_TEE");
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
new file mode 100644
index 00000000..dcfd57eb
--- /dev/null
+++ b/net/netfilter/xt_TPROXY.c
@@ -0,0 +1,432 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
+ * Author: Balazs Scheidler, Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/inet_sock.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#define XT_TPROXY_HAVE_IPV6 1
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+#include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+/*
+ * Report whether @sk has its transparent flag set.
+ *
+ * When it does, true is returned and the caller keeps its socket reference.
+ * When it does not, the reference is dropped here (with the put routine
+ * matching the socket's state) and false is returned.
+ */
+static bool tproxy_sk_is_transparent(struct sock *sk)
+{
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		struct inet_timewait_sock *tw = inet_twsk(sk);
+
+		if (tw->tw_transparent)
+			return true;
+		inet_twsk_put(tw);
+		return false;
+	}
+
+	if (inet_sk(sk)->transparent)
+		return true;
+	sock_put(sk);
+	return false;
+}
+
+/*
+ * Pick the IPv4 address to redirect to.
+ *
+ * @skb:	packet being processed; its receiving device is consulted
+ * @user_laddr:	address configured in the rule, or zero if none
+ * @daddr:	the packet's original destination address
+ *
+ * Returns @user_laddr when set, otherwise the first primary address of the
+ * receiving device, otherwise @daddr.
+ */
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr;
+
+	if (user_laddr)
+		return user_laddr;
+
+	laddr = 0;
+	rcu_read_lock();
+	indev = __in_dev_get_rcu(skb->dev);
+	/*
+	 * The device may have no IPv4 configuration attached, in which case
+	 * there is no address list to walk and we fall back to @daddr.
+	 */
+	if (indev != NULL) {
+		for_primary_ifa(indev) {
+			laddr = ifa->ifa_local;
+			break;
+		} endfor_ifa(indev);
+	}
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait4() - redirect IPv4 SYNs that hit a TIME_WAIT socket
+ * @skb:	The skb being processed.
+ * @laddr:	IPv4 address to redirect to or zero.
+ * @lport:	TCP port to redirect to or zero.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * A plain SYN arriving for a TIME_WAIT socket should not reopen the old
+ * connection; if a listener exists on the redirect address/port the new
+ * connection is handed to it instead and the TIME_WAIT socket is
+ * descheduled.
+ *
+ * The socket reference passed in is consumed.
+ *
+ * Returns the listener socket if one was found, the TIME_WAIT socket if
+ * not, or NULL if the TCP header could not be read.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+			struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	struct sock *listener;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	/* Only a pure SYN is a candidate for redirection. */
+	if (!hp->syn || hp->rst || hp->ack || hp->fin)
+		return sk;
+
+	listener = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					 iph->saddr,
+					 laddr ? laddr : iph->daddr,
+					 hp->source,
+					 lport ? lport : hp->dest,
+					 skb->dev, NFT_LOOKUP_LISTENER);
+	if (listener == NULL)
+		return sk;
+
+	/* Retire the TIME_WAIT socket in favour of the listener. */
+	inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+	inet_twsk_put(inet_twsk(sk));
+	return listener;
+}
+
+/*
+ * Common IPv4 TPROXY logic shared by both target revisions.
+ *
+ * @laddr/@lport are the redirect address and port from the rule (zero means
+ * "not set"); @mark_mask/@mark_value describe the mark update applied to
+ * accepted packets.
+ *
+ * Returns %NF_ACCEPT with a socket assigned to the skb when a transparent
+ * socket was found, %NF_DROP otherwise.
+ */
+static unsigned int
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+	   u_int32_t mark_mask, u_int32_t mark_value)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+
+	/* Only the port fields are read, so a UDP-sized header suffices. */
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL)
+		return NF_DROP;
+
+	/*
+	 * First look for a connection matching the packet's own addresses:
+	 * that is the case once the redirect has already happened and this
+	 * packet belongs to the established flow.
+	 */
+	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+	laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+	if (!lport)
+		lport = hp->dest;
+
+	if (sk == NULL) {
+		/* Nothing established: is there a listener on the
+		 * redirect address/port? */
+		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					   iph->saddr, laddr,
+					   hp->source, lport,
+					   skb->dev, NFT_LOOKUP_LISTENER);
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		/* UDP has no TIME_WAIT state, so this is TCP only:
+		 * reopening a TIME_WAIT connection needs special care. */
+		sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+	}
+
+	if (sk == NULL || !tproxy_sk_is_transparent(sk)) {
+		pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+			 iph->protocol, &iph->saddr, ntohs(hp->source),
+			 &iph->daddr, ntohs(hp->dest), skb->mark);
+		return NF_DROP;
+	}
+
+	/* Marking would belong in a separate target, but a rule can only
+	 * carry one target, so it is done here. */
+	skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+	pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+		 iph->protocol, &iph->daddr, ntohs(hp->dest),
+		 &laddr, ntohs(lport), skb->mark);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	nf_tproxy_assign_sock(skb, sk);
+	return NF_ACCEPT;
+}
+
+/* Revision 0 IPv4 target: unpack the v0 target info and run the common code. */
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info *cfg = par->targinfo;
+
+	return tproxy_tg4(skb, cfg->laddr, cfg->lport,
+			  cfg->mark_mask, cfg->mark_value);
+}
+
+/*
+ * Revision 1 IPv4 target: same as revision 0, except that the redirect
+ * address is taken from the IPv4 member of the v1 address union.
+ */
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v1 *cfg = par->targinfo;
+
+	return tproxy_tg4(skb, cfg->laddr.ip, cfg->lport,
+			  cfg->mark_mask, cfg->mark_value);
+}
+
+#ifdef XT_TPROXY_HAVE_IPV6
+
+/*
+ * Pick the IPv6 address to redirect to.
+ *
+ * Returns @user_laddr when it is not the unspecified address, otherwise the
+ * first address on the receiving device that is neither tentative nor
+ * deprecated, otherwise @daddr.
+ *
+ * NOTE(review): the returned pointer may refer to an interface address
+ * entry and is used after rcu_read_unlock() - confirm its lifetime.
+ */
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+	      const struct in6_addr *daddr)
+{
+	const struct in6_addr *chosen = daddr;
+	struct inet6_ifaddr *ifa;
+	struct inet6_dev *indev;
+
+	if (!ipv6_addr_any(user_laddr))
+		return user_laddr;
+
+	rcu_read_lock();
+	indev = __in6_dev_get(skb->dev);
+	if (indev != NULL) {
+		list_for_each_entry(ifa, &indev->addr_list, if_list) {
+			if (!(ifa->flags &
+			      (IFA_F_TENTATIVE | IFA_F_DEPRECATED))) {
+				chosen = &ifa->addr;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return chosen;
+}
+
+/**
+ * tproxy_handle_time_wait6() - redirect IPv6 SYNs that hit a TIME_WAIT socket
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * A plain SYN arriving for a TIME_WAIT socket should not reopen the old
+ * connection; if a listener exists on the redirect address/port the new
+ * connection is handed to it instead and the TIME_WAIT socket is
+ * descheduled.
+ *
+ * The socket reference passed in is consumed.
+ *
+ * Returns the listener socket if one was found, the TIME_WAIT socket if
+ * not, or NULL if the TCP header could not be read.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	struct sock *listener;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	/* Only a pure SYN is a candidate for redirection. */
+	if (!hp->syn || hp->rst || hp->ack || hp->fin)
+		return sk;
+
+	listener = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					 &iph->saddr,
+					 tproxy_laddr6(skb, &tgi->laddr.in6,
+						       &iph->daddr),
+					 hp->source,
+					 tgi->lport ? tgi->lport : hp->dest,
+					 skb->dev, NFT_LOOKUP_LISTENER);
+	if (listener == NULL)
+		return sk;
+
+	/* Retire the TIME_WAIT socket in favour of the listener. */
+	inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+	inet_twsk_put(inet_twsk(sk));
+	return listener;
+}
+
+/*
+ * IPv6 TPROXY target (revision 1).
+ *
+ * Returns %NF_ACCEPT with a socket assigned to the skb when a transparent
+ * socket was found, %NF_DROP otherwise (including when the transport header
+ * cannot be located or read).
+ */
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct in6_addr *laddr;
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+	__be16 lport;
+	int tproto;
+	int thoff;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	/* Only the port fields are read, so a UDP-sized header suffices. */
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	/*
+	 * First look for a connection matching the packet's own addresses:
+	 * that is the case once the redirect has already happened and this
+	 * packet belongs to the established flow.
+	 */
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+
+	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+	lport = tgi->lport;
+	if (!lport)
+		lport = hp->dest;
+
+	if (sk == NULL) {
+		/* Nothing established: is there a listener on the
+		 * redirect address/port? */
+		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					   &iph->saddr, laddr,
+					   hp->source, lport,
+					   par->in, NFT_LOOKUP_LISTENER);
+	} else if (sk->sk_state == TCP_TIME_WAIT) {
+		/* UDP has no TIME_WAIT state, so this is TCP only:
+		 * reopening a TIME_WAIT connection needs special care. */
+		sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+	}
+
+	if (sk == NULL || !tproxy_sk_is_transparent(sk)) {
+		pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+			 tproto, &iph->saddr, ntohs(hp->source),
+			 &iph->daddr, ntohs(hp->dest), skb->mark);
+
+		return NF_DROP;
+	}
+
+	/* Marking would belong in a separate target, but a rule can only
+	 * carry one target, so it is done here. */
+	skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
+
+	pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+		 tproto, &iph->saddr, ntohs(hp->source),
+		 laddr, ntohs(lport), skb->mark);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	nf_tproxy_assign_sock(skb, sk);
+	return NF_ACCEPT;
+}
+
+/*
+ * Rule validation for the IPv6 target: the rule must select TCP or UDP and
+ * the protocol match must not be inverted.
+ *
+ * Returns 0 when the rule is acceptable, -EINVAL otherwise.
+ */
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_ip6 *i = par->entryinfo;
+
+	/*
+	 * IP6T_INV_PROTO is an inversion bit, so it has to be tested against
+	 * invflags (as the IPv4 check does), not against the flags field.
+	 */
+	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+	    && !(i->invflags & IP6T_INV_PROTO))
+		return 0;
+
+	pr_info("Can be used only in combination with "
+		"either -p tcp or -p udp\n");
+	return -EINVAL;
+}
+#endif
+
+/*
+ * Rule validation for the IPv4 targets: the rule must select TCP or UDP and
+ * the protocol match must not be inverted.
+ *
+ * Returns 0 when the rule is acceptable, -EINVAL otherwise.
+ */
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_ip *ip = par->entryinfo;
+	bool tcp_or_udp = ip->proto == IPPROTO_TCP ||
+			  ip->proto == IPPROTO_UDP;
+
+	if (!tcp_or_udp || (ip->invflags & IPT_INV_PROTO)) {
+		pr_info("Can be used only in combination with "
+			"either -p tcp or -p udp\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * TPROXY target registrations. All entries live in the "mangle" table and
+ * are limited to the PREROUTING hook. IPv4 has revisions 0 and 1; IPv6 has
+ * revision 1 only and is built when ip6tables is configured.
+ */
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+	{
+		.family		= NFPROTO_IPV4,
+		.name		= "TPROXY",
+		.revision	= 0,
+		.table		= "mangle",
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.targetsize	= sizeof(struct xt_tproxy_target_info),
+		.checkentry	= tproxy_tg4_check,
+		.target		= tproxy_tg4_v0,
+		.me		= THIS_MODULE,
+	},
+	{
+		.family		= NFPROTO_IPV4,
+		.name		= "TPROXY",
+		.revision	= 1,
+		.table		= "mangle",
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg4_check,
+		.target		= tproxy_tg4_v1,
+		.me		= THIS_MODULE,
+	},
+#ifdef XT_TPROXY_HAVE_IPV6
+	{
+		.family		= NFPROTO_IPV6,
+		.name		= "TPROXY",
+		.revision	= 1,
+		.table		= "mangle",
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg6_check,
+		.target		= tproxy_tg6_v1,
+		.me		= THIS_MODULE,
+	},
+#endif
+
+};
+
+/*
+ * Module load: enable defragmentation for the supported address families,
+ * then register the TPROXY target entries.
+ */
+static int __init tproxy_tg_init(void)
+{
+	const unsigned int count = ARRAY_SIZE(tproxy_tg_reg);
+
+	nf_defrag_ipv4_enable();
+#ifdef XT_TPROXY_HAVE_IPV6
+	nf_defrag_ipv6_enable();
+#endif
+
+	return xt_register_targets(tproxy_tg_reg, count);
+}
+
+/* Module unload: remove the TPROXY target entries again. */
+static void __exit tproxy_tg_exit(void)
+{
+	const unsigned int count = ARRAY_SIZE(tproxy_tg_reg);
+
+	xt_unregister_targets(tproxy_tg_reg, count);
+}
+
+module_init(tproxy_tg_init);
+module_exit(tproxy_tg_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
+MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
new file mode 100644
index 00000000..df48967a
--- /dev/null
+++ b/net/netfilter/xt_TRACE.c
@@ -0,0 +1,40 @@
+/* This is a module which is used to mark packets for tracing.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+
+MODULE_DESCRIPTION("Xtables: packet flow tracing");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TRACE");
+MODULE_ALIAS("ip6t_TRACE");
+
+/*
+ * TRACE target: set the skb's nf_trace flag and let rule traversal
+ * continue (%XT_CONTINUE). @par is not used.
+ */
+static unsigned int
+trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	skb->nf_trace = 1;
+
+	return XT_CONTINUE;
+}
+
+/* TRACE target registration: any address family, "raw" table only. */
+static struct xt_target trace_tg_reg __read_mostly = {
+	.family     = NFPROTO_UNSPEC,
+	.name       = "TRACE",
+	.revision   = 0,
+	.table      = "raw",
+	.target     = trace_tg,
+	.me         = THIS_MODULE,
+};
+
+/* Module load: register the TRACE target with Xtables. */
+static int __init trace_tg_init(void)
+{
+	struct xt_target *target = &trace_tg_reg;
+
+	return xt_register_target(target);
+}
+
+/* Module unload: remove the TRACE target again. */
+static void __exit trace_tg_exit(void)
+{
+	struct xt_target *target = &trace_tg_reg;
+
+	xt_unregister_target(target);
+}
+
+module_init(trace_tg_init);
+module_exit(trace_tg_exit);
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
new file mode 100644
index 00000000..b77d383c
--- /dev/null
+++ b/net/netfilter/xt_addrtype.c
@@ -0,0 +1,243 @@
+/*
+ *  iptables module to match inet_addr_type() of an ip.
+ *
+ *  Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ *  (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip6_fib.h>
+#endif
+
+#include <linux/netfilter/xt_addrtype.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: address type match");
+MODULE_ALIAS("ipt_addrtype");
+MODULE_ALIAS("ip6t_addrtype");
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+/*
+ * Classify an IPv6 address by routing to it.
+ *
+ * @net:	namespace to perform the lookup in
+ * @dev:	optional device to bind the lookup to (may be NULL)
+ * @addr:	address to classify
+ *
+ * Returns a mask of XT_ADDRTYPE_UNREACHABLE / _LOCAL / _ANYCAST bits derived
+ * from the flags of the route found. XT_ADDRTYPE_UNREACHABLE alone is
+ * returned when IPv6 netfilter support is not available or the lookup fails.
+ */
+static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
+			    const struct in6_addr *addr)
+{
+	const struct nf_afinfo *afinfo;
+	struct rt6_info *rt;
+	struct flowi6 flow;
+	int route_err = 1;
+	u32 ret = 0;
+
+	memset(&flow, 0, sizeof(flow));
+	ipv6_addr_copy(&flow.daddr, addr);
+	if (dev)
+		flow.flowi6_oif = dev->ifindex;
+
+	/* The afinfo pointer is only used inside the RCU read section. */
+	rcu_read_lock();
+	afinfo = nf_get_afinfo(NFPROTO_IPV6);
+	if (afinfo != NULL)
+		route_err = afinfo->route(net, (struct dst_entry **)&rt,
+					flowi6_to_flowi(&flow), !!dev);
+	rcu_read_unlock();
+
+	if (route_err)
+		return XT_ADDRTYPE_UNREACHABLE;
+
+	if (rt->rt6i_flags & RTF_REJECT)
+		ret |= XT_ADDRTYPE_UNREACHABLE;
+	if (rt->rt6i_flags & RTF_LOCAL)
+		ret |= XT_ADDRTYPE_LOCAL;
+	if (rt->rt6i_flags & RTF_ANYCAST)
+		ret |= XT_ADDRTYPE_ANYCAST;
+
+	/* Done with the route: drop the reference the lookup gave us. */
+	dst_release(&rt->dst);
+	return ret;
+}
+
+/*
+ * Test an IPv6 address against a mask of XT_ADDRTYPE_* bits.
+ *
+ * MULTICAST, UNICAST and UNSPEC are decided from the address itself and
+ * each acts as a requirement: if requested and not met, the result is
+ * false. If any of LOCAL, ANYCAST or UNREACHABLE is requested, a route
+ * lookup decides the final result; otherwise the address matches.
+ */
+static bool match_type6(struct net *net, const struct net_device *dev,
+				const struct in6_addr *addr, u16 mask)
+{
+	const int kind = ipv6_addr_type(addr);
+
+	if ((mask & XT_ADDRTYPE_MULTICAST) && !(kind & IPV6_ADDR_MULTICAST))
+		return false;
+	if ((mask & XT_ADDRTYPE_UNICAST) && !(kind & IPV6_ADDR_UNICAST))
+		return false;
+	if ((mask & XT_ADDRTYPE_UNSPEC) && kind != IPV6_ADDR_ANY)
+		return false;
+
+	/* No routing-derived type requested: nothing more to check. */
+	if (!((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST |
+	       XT_ADDRTYPE_UNREACHABLE) & mask))
+		return true;
+
+	return (mask & match_lookup_rt6(net, dev, addr)) != 0;
+}
+
+/*
+ * IPv6 part of the revision 1 match: test the source and/or destination
+ * address of the packet against the requested type masks, honouring the
+ * per-direction invert flags. The destination is only examined when the
+ * source test (if any) succeeded.
+ */
+static bool
+addrtype_mt6(struct net *net, const struct net_device *dev,
+	const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info)
+{
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	bool ret = true;
+
+	if (info->source) {
+		bool hit = match_type6(net, dev, &hdr->saddr, info->source);
+
+		ret &= hit ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE);
+	}
+	if (ret && info->dest) {
+		bool hit = match_type6(net, dev, &hdr->daddr, info->dest);
+
+		ret &= hit ^ !!(info->flags & XT_ADDRTYPE_INVERT_DEST);
+	}
+	return ret;
+}
+#endif
+
+/*
+ * Test an IPv4 address against a type mask: the address type reported by
+ * inet_dev_addr_type() is used as a bit index into @mask.
+ */
+static inline bool match_type(struct net *net, const struct net_device *dev,
+			      __be32 addr, u_int16_t mask)
+{
+	const unsigned int type = inet_dev_addr_type(net, dev, addr);
+
+	return (mask & (1 << type)) != 0;
+}
+
+/*
+ * Revision 0 match (IPv4 only): test source and/or destination address
+ * types, each with its own invert switch. No device restriction is applied,
+ * and both tests are always evaluated when requested.
+ */
+static bool
+addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct net_device *any_dev = par->in ? par->in : par->out;
+	struct net *net = dev_net(any_dev);
+	const struct xt_addrtype_info *info = par->matchinfo;
+	const struct iphdr *hdr = ip_hdr(skb);
+	bool ret = true;
+
+	if (info->source) {
+		bool hit = match_type(net, NULL, hdr->saddr, info->source);
+
+		ret &= hit ^ info->invert_source;
+	}
+	if (info->dest) {
+		bool hit = match_type(net, NULL, hdr->daddr, info->dest);
+
+		ret &= hit ^ info->invert_dest;
+	}
+
+	return ret;
+}
+
+/*
+ * Revision 1 match: like revision 0, but the lookup can be limited to the
+ * input or output interface and IPv6 packets are dispatched to
+ * addrtype_mt6() when ip6tables support is built. For IPv4 the destination
+ * is only examined when the source test (if any) succeeded.
+ */
+static bool
+addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct net_device *any_dev = par->in ? par->in : par->out;
+	struct net *net = dev_net(any_dev);
+	const struct xt_addrtype_info_v1 *info = par->matchinfo;
+	const struct net_device *dev = NULL;
+	const struct iphdr *hdr;
+	bool ret = true;
+
+	/* Optional restriction to one interface; "in" takes precedence. */
+	if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
+		dev = par->in;
+	else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
+		dev = par->out;
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	if (par->family == NFPROTO_IPV6)
+		return addrtype_mt6(net, dev, skb, info);
+#endif
+	hdr = ip_hdr(skb);
+	if (info->source) {
+		bool hit = match_type(net, dev, hdr->saddr, info->source);
+
+		ret &= hit ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE);
+	}
+	if (ret && info->dest) {
+		bool hit = match_type(net, dev, hdr->daddr, info->dest);
+
+		ret &= hit ^ !!(info->flags & XT_ADDRTYPE_INVERT_DEST);
+	}
+	return ret;
+}
+
+/*
+ * Rule validation for the revision 1 match.
+ *
+ * Rejects (with -EINVAL and a log message):
+ *  - limiting to both the input and the output interface at once;
+ *  - an output interface limit in hooks PREROUTING/INPUT;
+ *  - an input interface limit in hooks POSTROUTING/OUTPUT;
+ *  - for IPv6 rules, the BLACKHOLE and BROADCAST types and every type
+ *    from PROHIBIT upwards.
+ *
+ * Returns 0 when the rule is acceptable.
+ */
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+{
+	struct xt_addrtype_info_v1 *info = par->matchinfo;
+
+	if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN &&
+	    info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) {
+		pr_info("both incoming and outgoing "
+			"interface limitation cannot be selected\n");
+		return -EINVAL;
+	}
+
+	if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
+	    (1 << NF_INET_LOCAL_IN)) &&
+	    info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) {
+		pr_info("output interface limitation "
+			"not valid in PREROUTING and INPUT\n");
+		return -EINVAL;
+	}
+
+	if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+	    (1 << NF_INET_LOCAL_OUT)) &&
+	    info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) {
+		pr_info("input interface limitation "
+			"not valid in POSTROUTING and OUTPUT\n");
+		return -EINVAL;
+	}
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	if (par->family == NFPROTO_IPV6) {
+		if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) {
+			pr_err("ipv6 BLACKHOLE matching not supported\n");
+			return -EINVAL;
+		}
+		if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) {
+			pr_err("ipv6 PROHIBIT (THROW, NAT ..) matching not supported\n");
+			return -EINVAL;
+		}
+		if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) {
+			pr_err("ipv6 does not support BROADCAST matching\n");
+			return -EINVAL;
+		}
+	}
+#endif
+	return 0;
+}
+
+/*
+ * addrtype match registrations: the original IPv4-only revision 0 and the
+ * family-independent revision 1, which adds a rule check.
+ */
+static struct xt_match addrtype_mt_reg[] __read_mostly = {
+	{
+		.family		= NFPROTO_IPV4,
+		.name		= "addrtype",
+		.matchsize	= sizeof(struct xt_addrtype_info),
+		.match		= addrtype_mt_v0,
+		.me		= THIS_MODULE
+	},
+	{
+		.family		= NFPROTO_UNSPEC,
+		.name		= "addrtype",
+		.revision	= 1,
+		.matchsize	= sizeof(struct xt_addrtype_info_v1),
+		.checkentry	= addrtype_mt_checkentry_v1,
+		.match		= addrtype_mt_v1,
+		.me		= THIS_MODULE
+	}
+};
+
+/* Module load: register both addrtype match revisions with Xtables. */
+static int __init addrtype_mt_init(void)
+{
+	const unsigned int count = ARRAY_SIZE(addrtype_mt_reg);
+
+	return xt_register_matches(addrtype_mt_reg, count);
+}
+
+/* Module unload: remove the addrtype match revisions again. */
+static void __exit addrtype_mt_exit(void)
+{
+	const unsigned int count = ARRAY_SIZE(addrtype_mt_reg);
+
+	xt_unregister_matches(addrtype_mt_reg, count);
+}
+
+module_init(addrtype_mt_init);
+module_exit(addrtype_mt_exit);
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
new file mode 100644
index 00000000..f4af1bfa
--- /dev/null
+++ b/net/netfilter/xt_cluster.c
@@ -0,0 +1,178 @@
+/*
+ * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/xt_cluster.h>
+
+/* IPv4 source address of the connection's original-direction tuple. */
+static inline u32 nf_ct_orig_ipv4_src(const struct nf_conn *ct)
+{
+	const struct nf_conntrack_tuple *orig =
+		&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+	return (__force u32)orig->src.u3.ip;
+}
+
+/*
+ * IPv6 source address of the connection's original-direction tuple, as a
+ * pointer to its 32-bit words.
+ */
+static inline const u32 *nf_ct_orig_ipv6_src(const struct nf_conn *ct)
+{
+	const struct nf_conntrack_tuple *orig =
+		&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+	return (__force u32 *)orig->src.u3.ip6;
+}
+
+/* Hash a single IPv4 address word with the rule's configured seed. */
+static inline u_int32_t
+xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info)
+{
+	const u_int32_t seed = info->hash_seed;
+
+	return jhash_1word(ip, seed);
+}
+
+/*
+ * Hash an IPv6 address with the rule's configured seed. The number of
+ * 32-bit words hashed is NF_CT_TUPLE_L3SIZE / sizeof(__u32).
+ */
+static inline u_int32_t
+xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info)
+{
+	const u_int32_t seed = info->hash_seed;
+
+	return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), seed);
+}
+
+/*
+ * Map a connection to a node index in [0, total_nodes).
+ *
+ * The original-direction source address is hashed and the 32-bit result is
+ * scaled into the node range by multiplication and a right shift by 32. An
+ * unexpected layer 3 protocol triggers a warning and hashes as zero.
+ */
+static inline u_int32_t
+xt_cluster_hash(const struct nf_conn *ct,
+		const struct xt_cluster_match_info *info)
+{
+	const int l3num = nf_ct_l3num(ct);
+	u_int32_t hash = 0;
+
+	if (l3num == AF_INET)
+		hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info);
+	else if (l3num == AF_INET6)
+		hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info);
+	else
+		WARN_ON(1);
+
+	return (((u64)hash * info->total_nodes) >> 32);
+}
+
+/* True when the first byte of the IPv6 address is 0xFF. */
+static inline bool
+xt_cluster_ipv6_is_multicast(const struct in6_addr *addr)
+{
+	const __be32 top_byte = htonl(0xFF000000);
+
+	return (addr->s6_addr32[0] & top_byte) == top_byte;
+}
+
+/*
+ * Report whether the packet's destination address is a multicast address
+ * for the given netfilter family. An unexpected family triggers a warning
+ * and is reported as not multicast.
+ */
+static inline bool
+xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
+{
+	if (family == NFPROTO_IPV4)
+		return ipv4_is_multicast(ip_hdr(skb)->daddr);
+
+	if (family == NFPROTO_IPV6)
+		return xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr);
+
+	WARN_ON(1);
+	return false;
+}
+
+/*
+ * cluster match: decide whether this node is responsible for the packet's
+ * connection.
+ *
+ * The connection (or its master, for related connections) is hashed to a
+ * node index; the packet matches when that index is set in the rule's node
+ * mask, optionally inverted by XT_CLUSTER_F_INV. Packets without a
+ * conntrack entry, or with the untracked one, never match.
+ *
+ * Side effect: the skb's pkt_type may be rewritten, see below.
+ */
+static bool
+xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct sk_buff *pskb = (struct sk_buff *)skb;
+	const struct xt_cluster_match_info *info = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned long hash;
+
+	/* This match assumes that all nodes see the same packets. This can be
+	 * achieved if the switch that connects the cluster nodes support some
+	 * sort of 'port mirroring'. However, if your switch does not support
+	 * this, your cluster nodes can reply ARP request using a multicast MAC
+	 * address. Thus, your switch will flood the same packets to the
+	 * cluster nodes with the same multicast MAC address. Using a multicast
+	 * link address is a RFC 1812 (section 3.3.2) violation, but this works
+	 * fine in practice.
+	 *
+	 * Unfortunately, if you use the multicast MAC address, the link layer
+	 * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted
+	 * by TCP and others for packets coming to this node. For that reason,
+	 * this match mangles skbuff's pkt_type if it detects a packet
+	 * addressed to a unicast address but using PACKET_MULTICAST. Yes, I
+	 * know, matches should not alter packets, but we are doing this here
+	 * because we would need to add a PKTTYPE target for this sole purpose.
+	 */
+	if (!xt_cluster_is_multicast_addr(skb, par->family) &&
+	    skb->pkt_type == PACKET_MULTICAST) {
+		pskb->pkt_type = PACKET_HOST;
+	}
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return false;
+
+	if (nf_ct_is_untracked(ct))
+		return false;
+
+	if (ct->master)
+		hash = xt_cluster_hash(ct->master, info);
+	else
+		hash = xt_cluster_hash(ct, info);
+
+	/*
+	 * Shift an unsigned one: with a signed int, selecting bit 31 would
+	 * shift into the sign bit.
+	 */
+	return !!((1U << hash) & info->node_mask) ^
+	       !!(info->flags & XT_CLUSTER_F_INV);
+}
+
+/*
+ * Rule validation for the cluster match.
+ *
+ * Returns -EINVAL when more nodes are configured than XT_CLUSTER_NODES_MAX,
+ * -EDOM when the node mask has bits set at or above the total number of
+ * nodes, and 0 otherwise.
+ */
+static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
+{
+	struct xt_cluster_match_info *info = par->matchinfo;
+	u64 mask_limit;
+
+	if (info->total_nodes > XT_CLUSTER_NODES_MAX) {
+		pr_info("you have exceeded the maximum "
+			"number of cluster nodes (%u > %u)\n",
+			info->total_nodes, XT_CLUSTER_NODES_MAX);
+		return -EINVAL;
+	}
+
+	/* Computed only after the node count has been bounded above. */
+	mask_limit = 1ULL << info->total_nodes;
+	if (info->node_mask >= mask_limit) {
+		pr_info("this node mask cannot be "
+			"higher than the total number of nodes\n");
+		return -EDOM;
+	}
+
+	return 0;
+}
+
+/* cluster match registration: family-independent, with a rule check. */
+static struct xt_match xt_cluster_match __read_mostly = {
+	.family		= NFPROTO_UNSPEC,
+	.name		= "cluster",
+	.matchsize	= sizeof(struct xt_cluster_match_info),
+	.checkentry	= xt_cluster_mt_checkentry,
+	.match		= xt_cluster_mt,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the cluster match with Xtables. */
+static int __init xt_cluster_mt_init(void)
+{
+	struct xt_match *match = &xt_cluster_match;
+
+	return xt_register_match(match);
+}
+
+/* Module unload: remove the cluster match again. */
+static void __exit xt_cluster_mt_fini(void)
+{
+	struct xt_match *match = &xt_cluster_match;
+
+	xt_unregister_match(match);
+}
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: hash-based cluster match");
+MODULE_ALIAS("ipt_cluster");
+MODULE_ALIAS("ip6t_cluster");
+module_init(xt_cluster_mt_init);
+module_exit(xt_cluster_mt_fini);
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
new file mode 100644
index 00000000..5c861d2f
--- /dev/null
+++ b/net/netfilter/xt_comment.c
@@ -0,0 +1,45 @@
+/*
+ * Implements a dummy match to allow attaching comments to rules
+ *
+ * 2003-05-13 Brad Fisher (brad@info-link.net)
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_comment.h>
+
+MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
+MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a comment");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_comment");
+MODULE_ALIAS("ip6t_comment");
+
+static bool
+comment_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	/* Neither skb nor par is examined: every packet matches. */
+	return true;
+}
+
+static struct xt_match comment_mt_reg __read_mostly = {
+	.name      = "comment",
+	.revision  = 0,
+	.family    = NFPROTO_UNSPEC,
+	.match     = comment_mt,	/* always returns true */
+	.matchsize = sizeof(struct xt_comment_info),	/* no .checkentry/.destroy hooks are set */
+	.me        = THIS_MODULE,
+};
+
+static int __init comment_mt_init(void)
+{	/* Module init: register the "comment" match and return that result. */
+	return xt_register_match(&comment_mt_reg);
+}
+
+static void __exit comment_mt_exit(void)
+{	/* Module exit: unregister the "comment" match. */
+	xt_unregister_match(&comment_mt_reg);
+}
+
+module_init(comment_mt_init);
+module_exit(comment_mt_exit);
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
new file mode 100644
index 00000000..5b138506
--- /dev/null
+++ b/net/netfilter/xt_connbytes.c
@@ -0,0 +1,155 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/math64.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connbytes.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching");
+MODULE_ALIAS("ipt_connbytes");
+MODULE_ALIAS("ip6t_connbytes");
+
+static bool
+connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when the selected per-connection counter lies in the rule's range. */
+	const struct xt_connbytes_info *sinfo = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int64_t what = 0;	/* initialize to make gcc happy */
+	u_int64_t bytes = 0;
+	u_int64_t pkts = 0;
+	const struct nf_conn_counter *counters;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return false;	/* no conntrack entry for this skb */
+
+	counters = nf_conn_acct_find(ct);
+	if (!counters)
+		return false;	/* no counters found for this connection */
+
+	switch (sinfo->what) {
+	case XT_CONNBYTES_PKTS:	/* what = packet count of the chosen direction(s) */
+		switch (sinfo->direction) {
+		case XT_CONNBYTES_DIR_ORIGINAL:
+			what = counters[IP_CT_DIR_ORIGINAL].packets;
+			break;
+		case XT_CONNBYTES_DIR_REPLY:
+			what = counters[IP_CT_DIR_REPLY].packets;
+			break;
+		case XT_CONNBYTES_DIR_BOTH:
+			what = counters[IP_CT_DIR_ORIGINAL].packets;
+			what += counters[IP_CT_DIR_REPLY].packets;
+			break;
+		}
+		break;
+	case XT_CONNBYTES_BYTES:	/* what = byte count of the chosen direction(s) */
+		switch (sinfo->direction) {
+		case XT_CONNBYTES_DIR_ORIGINAL:
+			what = counters[IP_CT_DIR_ORIGINAL].bytes;
+			break;
+		case XT_CONNBYTES_DIR_REPLY:
+			what = counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		case XT_CONNBYTES_DIR_BOTH:
+			what = counters[IP_CT_DIR_ORIGINAL].bytes;
+			what += counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		}
+		break;
+	case XT_CONNBYTES_AVGPKT:	/* what = bytes / packets of the chosen direction(s) */
+		switch (sinfo->direction) {
+		case XT_CONNBYTES_DIR_ORIGINAL:
+			bytes = counters[IP_CT_DIR_ORIGINAL].bytes;
+			pkts  = counters[IP_CT_DIR_ORIGINAL].packets;
+			break;
+		case XT_CONNBYTES_DIR_REPLY:
+			bytes = counters[IP_CT_DIR_REPLY].bytes;
+			pkts  = counters[IP_CT_DIR_REPLY].packets;
+			break;
+		case XT_CONNBYTES_DIR_BOTH:
+			bytes = counters[IP_CT_DIR_ORIGINAL].bytes +
+				counters[IP_CT_DIR_REPLY].bytes;
+			pkts  = counters[IP_CT_DIR_ORIGINAL].packets +
+				counters[IP_CT_DIR_REPLY].packets;
+			break;
+		}
+		if (pkts != 0)	/* zero packets: skip the division, what stays 0 */
+			what = div64_u64(bytes, pkts);
+		break;
+	}
+
+	if (sinfo->count.to)	/* non-zero upper bound: inclusive range [from, to] */
+		return what <= sinfo->count.to && what >= sinfo->count.from;
+	else	/* upper bound of 0: only the lower bound is checked */
+		return what >= sinfo->count.from;
+}
+
+static int connbytes_mt_check(const struct xt_mtchk_param *par)
+{	/* checkentry hook: validate what/direction, take the conntrack module, force accounting on. */
+	const struct xt_connbytes_info *sinfo = par->matchinfo;
+	int ret;
+
+	if (sinfo->what != XT_CONNBYTES_PKTS &&
+	    sinfo->what != XT_CONNBYTES_BYTES &&
+	    sinfo->what != XT_CONNBYTES_AVGPKT)
+		return -EINVAL;	/* not one of the three selectors connbytes_mt() handles */
+
+	if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL &&
+	    sinfo->direction != XT_CONNBYTES_DIR_REPLY &&
+	    sinfo->direction != XT_CONNBYTES_DIR_BOTH)
+		return -EINVAL;	/* not one of the three directions connbytes_mt() handles */
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)	/* only logged here; the error is returned at the end */
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+
+	/*
+	 * This filter cannot function correctly unless connection tracking
+	 * accounting is enabled, so complain in the hope that someone notices.
+	 */
+	if (!nf_ct_acct_enabled(par->net)) {	/* runs even when ret < 0 above */
+		pr_warning("Forcing CT accounting to be enabled\n");
+		nf_ct_set_acct(par->net, true);
+	}
+
+	return ret;	/* result of nf_ct_l3proto_try_module_get() */
+}
+
+static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* destroy hook: pairs with nf_ct_l3proto_try_module_get() in connbytes_mt_check(). */
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match connbytes_mt_reg __read_mostly = {
+	.name       = "connbytes",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = connbytes_mt_check,	/* validates the rule, takes the conntrack module */
+	.match      = connbytes_mt,	/* per-packet counter comparison */
+	.destroy    = connbytes_mt_destroy,	/* drops the conntrack module */
+	.matchsize  = sizeof(struct xt_connbytes_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init connbytes_mt_init(void)
+{	/* Module init: register the "connbytes" match and return that result. */
+	return xt_register_match(&connbytes_mt_reg);
+}
+
+static void __exit connbytes_mt_exit(void)
+{	/* Module exit: unregister the "connbytes" match. */
+	xt_unregister_match(&connbytes_mt_reg);
+}
+
+module_init(connbytes_mt_init);
+module_exit(connbytes_mt_exit);
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
new file mode 100644
index 00000000..c6d5a834
--- /dev/null
+++ b/net/netfilter/xt_connlimit.c
@@ -0,0 +1,317 @@
+/*
+ * netfilter module to limit the number of parallel tcp
+ * connections per IP address.
+ *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
+ *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
+ *		only ignore TIME_WAIT or gone connections
+ *   (C) CC Computer Consultants GmbH, 2007
+ *
+ * based on ...
+ *
+ * Kernel module to match connection tracking information.
+ * GPL (C) 1999  Rusty Russell (rusty@rustcorp.com.au).
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connlimit.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+/* One saved connection: its conntrack tuple plus the address it was counted under. */
+struct xt_connlimit_conn {
+	struct hlist_node		node;	/* link in xt_connlimit_data.iphash[] */
+	struct nf_conntrack_tuple	tuple;	/* looked up again in count_them() */
+	union nf_inet_addr		addr;	/* address given to count_them() when saved */
+};
+
+struct xt_connlimit_data {	/* per-rule state, allocated in connlimit_mt_check() */
+	struct hlist_head	iphash[256];	/* buckets selected by connlimit_iphash*() (hash & 0xFF) */
+	spinlock_t		lock;	/* held by connlimit_mt() around count_them() */
+};
+
+static u_int32_t connlimit_rnd __read_mostly;	/* jhash seed; set once to a non-zero random value in connlimit_mt_check() */
+
+static inline unsigned int connlimit_iphash(__be32 addr)
+{	/* Bucket index 0..255: low byte of jhash over the address as given, seeded with connlimit_rnd. */
+	return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF;
+}
+
+static inline unsigned int
+connlimit_iphash6(const union nf_inet_addr *addr,
+                  const union nf_inet_addr *mask)
+{	/* Bucket index 0..255 for an IPv6 address; the mask is applied here before hashing. */
+	union nf_inet_addr res;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
+		res.ip6[i] = addr->ip6[i] & mask->ip6[i];	/* word-wise addr & mask */
+
+	return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF;
+}
+
+/* True for TCP conntracks in TIME_WAIT or CLOSE; false for everything else. */
+static inline bool already_closed(const struct nf_conn *conn)
+{
+	if (nf_ct_protonum(conn) != IPPROTO_TCP)
+		return false;
+	return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
+	       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
+}
+
+/*
+ * Compare addr and u3 after masking both with mask (one IPv4 word when family
+ * is NFPROTO_IPV4, otherwise every IPv6 word); 1 when they agree, else 0.
+ */
+static inline unsigned int
+same_source_net(const union nf_inet_addr *addr,
+		const union nf_inet_addr *mask,
+		const union nf_inet_addr *u3, u_int8_t family)
+{
+	unsigned int i;
+
+	if (family == NFPROTO_IPV4)
+		return ((addr->ip ^ u3->ip) & mask->ip) == 0;
+
+	for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i)
+		if ((addr->ip6[i] ^ u3->ip6[i]) & mask->ip6[i])
+			return 0;
+	return 1;
+}
+
+static int count_them(struct net *net,
+		      struct xt_connlimit_data *data,
+		      const struct nf_conntrack_tuple *tuple,
+		      const union nf_inet_addr *addr,
+		      const union nf_inet_addr *mask,
+		      u_int8_t family)
+{	/* Count saved, still-open connections in addr's masked network; -ENOMEM on allocation failure. */
+	const struct nf_conntrack_tuple_hash *found;
+	struct xt_connlimit_conn *conn;
+	struct hlist_node *pos, *n;
+	struct nf_conn *found_ct;
+	struct hlist_head *hash;
+	bool addit = true;	/* cleared when tuple is already saved and not closed */
+	int matches = 0;
+
+	if (family == NFPROTO_IPV6)
+		hash = &data->iphash[connlimit_iphash6(addr, mask)];
+	else
+		hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)];
+
+	rcu_read_lock();
+
+	/* walk this bucket; the _safe iterator allows deleting entries on the way */
+	hlist_for_each_entry_safe(conn, pos, n, hash, node) {
+		found    = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,
+						 &conn->tuple);
+		found_ct = NULL;
+
+		if (found != NULL)
+			found_ct = nf_ct_tuplehash_to_ctrack(found);
+
+		if (found_ct != NULL &&
+		    nf_ct_tuple_equal(&conn->tuple, tuple) &&
+		    !already_closed(found_ct))
+			/*
+			 * Just to be sure we have it only once in the list.
+			 * We should not see tuples twice unless someone hooks
+			 * this into a table without "-p tcp --syn".
+			 */
+			addit = false;
+
+		if (found == NULL) {
+			/* lookup failed: the connection is gone, forget it */
+			hlist_del(&conn->node);
+			kfree(conn);
+			continue;
+		}
+
+		if (already_closed(found_ct)) {
+			/*
+			 * we do not care about connections which are
+			 * closed already -> ditch it
+			 */
+			nf_ct_put(found_ct);	/* release the conntrack found above */
+			hlist_del(&conn->node);
+			kfree(conn);
+			continue;
+		}
+
+		if (same_source_net(addr, mask, &conn->addr, family))
+			/* saved address is in the same masked network -> be counted! */
+			++matches;
+		nf_ct_put(found_ct);	/* release the conntrack found above */
+	}
+
+	rcu_read_unlock();
+
+	if (addit) {
+		/* save the new connection in our list; it is counted as well */
+		conn = kmalloc(sizeof(*conn), GFP_ATOMIC);
+		if (conn == NULL)
+			return -ENOMEM;
+		conn->tuple = *tuple;
+		conn->addr = *addr;
+		hlist_add_head(&conn->node, hash);
+		++matches;
+	}
+
+	return matches;
+}
+
+static bool
+connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when the connection count for the packet's address exceeds info->limit (or the inverse). */
+	struct net *net = dev_net(par->in ? par->in : par->out);
+	const struct xt_connlimit_info *info = par->matchinfo;
+	union nf_inet_addr addr;
+	struct nf_conntrack_tuple tuple;
+	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	int connections;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct != NULL)	/* use the original-direction tuple of the attached conntrack */
+		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+				    par->family, &tuple))
+		goto hotdrop;	/* no conntrack and no tuple could be built from the packet */
+
+	if (par->family == NFPROTO_IPV6) {	/* pick daddr or saddr depending on XT_CONNLIMIT_DADDR */
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+		memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
+		       &iph->daddr : &iph->saddr, sizeof(addr.ip6));
+	} else {
+		const struct iphdr *iph = ip_hdr(skb);
+		addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
+			  iph->daddr : iph->saddr;
+	}
+
+	spin_lock_bh(&info->data->lock);	/* count_them() reads and edits the shared hash lists */
+	connections = count_them(net, info->data, tuple_ptr, &addr,
+	                         &info->mask, par->family);
+	spin_unlock_bh(&info->data->lock);
+
+	if (connections < 0)
+		/* kmalloc failed, drop it entirely */
+		goto hotdrop;
+
+	return (connections > info->limit) ^
+	       !!(info->flags & XT_CONNLIMIT_INVERT);
+
+ hotdrop:
+	par->hotdrop = true;
+	return false;
+}
+
+static int connlimit_mt_check(const struct xt_mtchk_param *par)
+{	/* checkentry hook: seed the hash once, take the conntrack module, allocate per-rule state. */
+	struct xt_connlimit_info *info = par->matchinfo;
+	unsigned int i;
+	int ret;
+
+	if (unlikely(!connlimit_rnd)) {	/* seed not chosen yet */
+		u_int32_t rand;
+
+		do {
+			get_random_bytes(&rand, sizeof(rand));
+		} while (!rand);	/* 0 is reserved to mean "unset" */
+		cmpxchg(&connlimit_rnd, 0, rand);	/* stored only if the seed is still 0 */
+	}
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0) {
+		pr_info("cannot load conntrack support for "
+			"address family %u\n", par->family);
+		return ret;
+	}
+
+	/* init private data */
+	info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL);
+	if (info->data == NULL) {
+		nf_ct_l3proto_module_put(par->family);	/* undo the module get above */
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&info->data->lock);
+	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i)
+		INIT_HLIST_HEAD(&info->data->iphash[i]);
+
+	return 0;
+}
+
+static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* destroy hook: drop the conntrack module and free everything connlimit_mt_check()/count_them() allocated. */
+	const struct xt_connlimit_info *info = par->matchinfo;
+	struct xt_connlimit_conn *conn;
+	struct hlist_node *pos, *n;
+	struct hlist_head *hash = info->data->iphash;
+	unsigned int i;
+
+	nf_ct_l3proto_module_put(par->family);
+
+	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {	/* free every saved connection in every bucket */
+		hlist_for_each_entry_safe(conn, pos, n, &hash[i], node) {
+			hlist_del(&conn->node);
+			kfree(conn);
+		}
+	}
+
+	kfree(info->data);
+}
+
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+	{	/* revision 0 */
+		.name       = "connlimit",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.checkentry = connlimit_mt_check,
+		.match      = connlimit_mt,
+		.matchsize  = sizeof(struct xt_connlimit_info),
+		.destroy    = connlimit_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+	{	/* revision 1: same hooks and matchsize as revision 0 */
+		.name       = "connlimit",
+		.revision   = 1,
+		.family     = NFPROTO_UNSPEC,
+		.checkentry = connlimit_mt_check,
+		.match      = connlimit_mt,
+		.matchsize  = sizeof(struct xt_connlimit_info),
+		.destroy    = connlimit_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init connlimit_mt_init(void)
+{	/* Module init: register both "connlimit" revisions and return that result. */
+	return xt_register_matches(connlimit_mt_reg,
+	       ARRAY_SIZE(connlimit_mt_reg));
+}
+
+static void __exit connlimit_mt_exit(void)
+{	/* Module exit: unregister both "connlimit" revisions. */
+	xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
+}
+
+module_init(connlimit_mt_init);
+module_exit(connlimit_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: Number of connections matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_connlimit");
+MODULE_ALIAS("ip6t_connlimit");
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
new file mode 100644
index 00000000..7278145e
--- /dev/null
+++ b/net/netfilter/xt_connmark.c
@@ -0,0 +1,167 @@
+/*
+ *	xt_connmark - Netfilter module to operate on connection marks
+ *
+ *	Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ *	by Henrik Nordstrom <hno@marasystems.com>
+ *	Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *	Jan Engelhardt <jengelh@medozas.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_connmark.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>");
+MODULE_DESCRIPTION("Xtables: connection mark operations");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_CONNMARK");
+MODULE_ALIAS("ip6t_CONNMARK");
+MODULE_ALIAS("ipt_connmark");
+MODULE_ALIAS("ip6t_connmark");
+
+static unsigned int
+connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{	/* CONNMARK target: set, save or restore marks per info->mode; always returns XT_CONTINUE. */
+	const struct xt_connmark_tginfo1 *info = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	u_int32_t newmark;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return XT_CONTINUE;	/* no conntrack entry: nothing to do */
+
+	switch (info->mode) {
+	case XT_CONNMARK_SET:	/* clear the ctmask bits of ct->mark, then XOR in ctmark */
+		newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
+		if (ct->mark != newmark) {
+			ct->mark = newmark;
+			nf_conntrack_event_cache(IPCT_MARK, ct);	/* only when the mark changed */
+		}
+		break;
+	case XT_CONNMARK_SAVE:	/* skb->mark (under nfmask) -> ct->mark (ctmask bits cleared first) */
+		newmark = (ct->mark & ~info->ctmask) ^
+		          (skb->mark & info->nfmask);
+		if (ct->mark != newmark) {
+			ct->mark = newmark;
+			nf_conntrack_event_cache(IPCT_MARK, ct);	/* only when the mark changed */
+		}
+		break;
+	case XT_CONNMARK_RESTORE:	/* ct->mark (under ctmask) -> skb->mark (nfmask bits cleared first) */
+		newmark = (skb->mark & ~info->nfmask) ^
+		          (ct->mark & info->ctmask);
+		skb->mark = newmark;
+		break;
+	}
+
+	return XT_CONTINUE;
+}
+
+static int connmark_tg_check(const struct xt_tgchk_param *par)
+{	/* checkentry hook: take the conntrack module for par->family; returns that result. */
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)	/* logged, then returned as the error */
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return ret;
+}
+
+static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
+{	/* destroy hook: pairs with nf_ct_l3proto_try_module_get() in connmark_tg_check(). */
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static bool
+connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when the masked connection mark equals info->mark, XORed with info->invert. */
+	const struct xt_connmark_mtinfo1 *info = par->matchinfo;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return false;	/* no conntrack entry: no match, whatever invert says */
+
+	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int connmark_mt_check(const struct xt_mtchk_param *par)
+{	/* checkentry hook: take the conntrack module for par->family; returns that result. */
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)	/* logged, then returned as the error */
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return ret;
+}
+
+static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* destroy hook: pairs with nf_ct_l3proto_try_module_get() in connmark_mt_check(). */
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target connmark_tg_reg __read_mostly = {
+	.name           = "CONNMARK",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.checkentry     = connmark_tg_check,	/* takes the conntrack module */
+	.target         = connmark_tg,	/* applies the set/save/restore operation */
+	.targetsize     = sizeof(struct xt_connmark_tginfo1),	/* type read via par->targinfo */
+	.destroy        = connmark_tg_destroy,	/* drops the conntrack module */
+	.me             = THIS_MODULE,
+};
+
+static struct xt_match connmark_mt_reg __read_mostly = {
+	.name           = "connmark",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.checkentry     = connmark_mt_check,	/* takes the conntrack module */
+	.match          = connmark_mt,	/* compares the masked connection mark */
+	.matchsize      = sizeof(struct xt_connmark_mtinfo1),	/* type read via par->matchinfo */
+	.destroy        = connmark_mt_destroy,	/* drops the conntrack module */
+	.me             = THIS_MODULE,
+};
+
+static int __init connmark_mt_init(void)
+{	/* Module init: register the CONNMARK target, then the connmark match. */
+	int ret;
+
+	ret = xt_register_target(&connmark_tg_reg);
+	if (ret < 0)
+		return ret;
+	ret = xt_register_match(&connmark_mt_reg);
+	if (ret < 0) {
+		xt_unregister_target(&connmark_tg_reg);	/* roll back the target registration */
+		return ret;
+	}
+	return 0;
+}
+
+static void __exit connmark_mt_exit(void)
+{	/* Module exit: unregister in the reverse order of connmark_mt_init(). */
+	xt_unregister_match(&connmark_mt_reg);
+	xt_unregister_target(&connmark_tg_reg);
+}
+
+module_init(connmark_mt_init);
+module_exit(connmark_mt_exit);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
new file mode 100644
index 00000000..61805d7b
--- /dev/null
+++ b/net/netfilter/xt_conntrack.c
@@ -0,0 +1,332 @@
+/*
+ *	xt_conntrack - Netfilter module to match connection tracking
+ *	information. (Superset of Rusty's minimalistic state match.)
+ *
+ *	(C) 2001  Marc Boucher (marc@mbsi.ca).
+ *	Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_conntrack.h>
+#include <net/netfilter/nf_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: connection tracking state match");
+MODULE_ALIAS("ipt_conntrack");
+MODULE_ALIAS("ip6t_conntrack");
+
+/* Masked address equality; families other than IPv4/IPv6 never match. */
+static bool conntrack_addrcmp(const union nf_inet_addr *kaddr,
+			      const union nf_inet_addr *uaddr,
+			      const union nf_inet_addr *umask, unsigned int l3proto)
+{
+	switch (l3proto) {
+	case NFPROTO_IPV4:
+		return !((kaddr->ip ^ uaddr->ip) & umask->ip);
+	case NFPROTO_IPV6:
+		return !ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6);
+	}
+	return false;
+}
+
+static inline bool
+conntrack_mt_origsrc(const struct nf_conn *ct,
+                     const struct xt_conntrack_mtinfo2 *info,
+		     u_int8_t family)
+{	/* Original-direction source address vs. origsrc_addr under origsrc_mask. */
+	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
+	       &info->origsrc_addr, &info->origsrc_mask, family);
+}
+
+static inline bool
+conntrack_mt_origdst(const struct nf_conn *ct,
+                     const struct xt_conntrack_mtinfo2 *info,
+		     u_int8_t family)
+{	/* Original-direction destination address vs. origdst_addr under origdst_mask. */
+	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3,
+	       &info->origdst_addr, &info->origdst_mask, family);
+}
+
+static inline bool
+conntrack_mt_replsrc(const struct nf_conn *ct,
+                     const struct xt_conntrack_mtinfo2 *info,
+		     u_int8_t family)
+{	/* Reply-direction source address vs. replsrc_addr under replsrc_mask. */
+	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3,
+	       &info->replsrc_addr, &info->replsrc_mask, family);
+}
+
+static inline bool
+conntrack_mt_repldst(const struct nf_conn *ct,
+                     const struct xt_conntrack_mtinfo2 *info,
+		     u_int8_t family)
+{	/* Reply-direction destination address vs. repldst_addr under repldst_mask. */
+	return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3,
+	       &info->repldst_addr, &info->repldst_mask, family);
+}
+
+static inline bool
+ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
+                    const struct nf_conn *ct)
+{	/* Protocol and single-port tests; each enabled test fails when its result equals its invert bit. */
+	const struct nf_conntrack_tuple *tuple;
+
+	tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	if ((info->match_flags & XT_CONNTRACK_PROTO) &&
+	    (nf_ct_protonum(ct) == info->l4proto) ^
+	    !(info->invert_flags & XT_CONNTRACK_PROTO))
+		return false;
+
+	/* Shortcut to match all recognized protocols by using ->src.all. */
+	if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
+	    (tuple->src.u.all == info->origsrc_port) ^	/* compared as stored, no byte swap here */
+	    !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+		return false;
+
+	if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
+	    (tuple->dst.u.all == info->origdst_port) ^
+	    !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+		return false;
+
+	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;	/* remaining tests use the reply tuple */
+
+	if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
+	    (tuple->src.u.all == info->replsrc_port) ^
+	    !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+		return false;
+
+	if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
+	    (tuple->dst.u.all == info->repldst_port) ^
+	    !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+		return false;
+
+	return true;
+}
+
+/* Inclusive [min, max] range test on port; the result is flipped when invert. */
+static inline bool port_match(u16 min, u16 max, u16 port, bool invert)
+{
+	return invert != (min <= port && port <= max);
+}
+
+static inline bool
+ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
+		       const struct nf_conn *ct)
+{	/* Revision-3 variant: protocol test plus inclusive port-range tests via port_match(). */
+	const struct nf_conntrack_tuple *tuple;
+
+	tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	if ((info->match_flags & XT_CONNTRACK_PROTO) &&
+	    (nf_ct_protonum(ct) == info->l4proto) ^
+	    !(info->invert_flags & XT_CONNTRACK_PROTO))
+		return false;
+
+	/* Shortcut to match all recognized protocols by using ->src.all. */
+	if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
+	    !port_match(info->origsrc_port, info->origsrc_port_high,
+			ntohs(tuple->src.u.all),	/* tuple value passed through ntohs() before the range test */
+			info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
+		return false;
+
+	if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
+	    !port_match(info->origdst_port, info->origdst_port_high,
+			ntohs(tuple->dst.u.all),
+			info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
+		return false;
+
+	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;	/* remaining tests use the reply tuple */
+
+	if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
+	    !port_match(info->replsrc_port, info->replsrc_port_high,
+			ntohs(tuple->src.u.all),
+			info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
+		return false;
+
+	if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
+	    !port_match(info->repldst_port, info->repldst_port_high,
+			ntohs(tuple->dst.u.all),
+			info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
+		return false;
+
+	return true;
+}
+
+static bool
+conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
+             u16 state_mask, u16 status_mask)
+{	/* Shared matcher for all revisions: every test enabled in match_flags must pass. */
+	const struct xt_conntrack_mtinfo2 *info = par->matchinfo;	/* NOTE(review): read as mtinfo2 for every revision; presumably the layouts share these leading fields - confirm in xt_conntrack.h */
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int statebit;
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+	if (ct) {
+		if (nf_ct_is_untracked(ct))
+			statebit = XT_CONNTRACK_STATE_UNTRACKED;
+		else
+			statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	} else	/* no conntrack entry */
+		statebit = XT_CONNTRACK_STATE_INVALID;
+
+	if (info->match_flags & XT_CONNTRACK_STATE) {
+		if (ct != NULL) {	/* add the NAT pseudo-states from ct->status */
+			if (test_bit(IPS_SRC_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_SNAT;
+			if (test_bit(IPS_DST_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_DNAT;
+		}
+		if (!!(state_mask & statebit) ^
+		    !(info->invert_flags & XT_CONNTRACK_STATE))
+			return false;
+	}
+
+	if (ct == NULL)	/* nothing else can be tested: match only if a state test was requested (it passed above) */
+		return info->match_flags & XT_CONNTRACK_STATE;
+	if ((info->match_flags & XT_CONNTRACK_DIRECTION) &&
+	    (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^
+	    !(info->invert_flags & XT_CONNTRACK_DIRECTION))
+		return false;
+
+	if (info->match_flags & XT_CONNTRACK_ORIGSRC)
+		if (conntrack_mt_origsrc(ct, info, par->family) ^
+		    !(info->invert_flags & XT_CONNTRACK_ORIGSRC))
+			return false;
+
+	if (info->match_flags & XT_CONNTRACK_ORIGDST)
+		if (conntrack_mt_origdst(ct, info, par->family) ^
+		    !(info->invert_flags & XT_CONNTRACK_ORIGDST))
+			return false;
+
+	if (info->match_flags & XT_CONNTRACK_REPLSRC)
+		if (conntrack_mt_replsrc(ct, info, par->family) ^
+		    !(info->invert_flags & XT_CONNTRACK_REPLSRC))
+			return false;
+
+	if (info->match_flags & XT_CONNTRACK_REPLDST)
+		if (conntrack_mt_repldst(ct, info, par->family) ^
+		    !(info->invert_flags & XT_CONNTRACK_REPLDST))
+			return false;
+
+	if (par->match->revision != 3) {	/* revisions other than 3: single-port comparison */
+		if (!ct_proto_port_check(info, ct))
+			return false;
+	} else {	/* revision 3: port ranges, matchinfo read as mtinfo3 */
+		if (!ct_proto_port_check_v3(par->matchinfo, ct))
+			return false;
+	}
+
+	if ((info->match_flags & XT_CONNTRACK_STATUS) &&
+	    (!!(status_mask & ct->status) ^
+	    !(info->invert_flags & XT_CONNTRACK_STATUS)))
+		return false;
+
+	if (info->match_flags & XT_CONNTRACK_EXPIRES) {
+		unsigned long expires = 0;	/* stays 0 when the timeout timer is not pending */
+
+		if (timer_pending(&ct->timeout))
+			expires = (ct->timeout.expires - jiffies) / HZ;	/* jiffies difference divided by HZ */
+		if ((expires >= info->expires_min &&
+		    expires <= info->expires_max) ^
+		    !(info->invert_flags & XT_CONNTRACK_EXPIRES))
+			return false;
+	}
+	return true;
+}
+
+static bool
+conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Revision 1 entry point: take the masks from the mtinfo1 layout. */
+	const struct xt_conntrack_mtinfo1 *info = par->matchinfo;
+
+	return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static bool
+conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Revision 2 entry point: take the masks from the mtinfo2 layout. */
+	const struct xt_conntrack_mtinfo2 *info = par->matchinfo;
+
+	return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static bool
+conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Revision 3 entry point: take the masks from the mtinfo3 layout. */
+	const struct xt_conntrack_mtinfo3 *info = par->matchinfo;
+
+	return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static int conntrack_mt_check(const struct xt_mtchk_param *par)
+{	/* checkentry hook: take the conntrack module for par->family; returns that result. */
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0)	/* logged, then returned as the error */
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return ret;
+}
+
+static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* destroy hook: pairs with nf_ct_l3proto_try_module_get() in conntrack_mt_check(). */
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match conntrack_mt_reg[] __read_mostly = {
+	{	/* revision 1: mtinfo1 layout */
+		.name       = "conntrack",
+		.revision   = 1,
+		.family     = NFPROTO_UNSPEC,
+		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
+		.match      = conntrack_mt_v1,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+	{	/* revision 2: mtinfo2 layout */
+		.name       = "conntrack",
+		.revision   = 2,
+		.family     = NFPROTO_UNSPEC,
+		.matchsize  = sizeof(struct xt_conntrack_mtinfo2),
+		.match      = conntrack_mt_v2,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+	{	/* revision 3: mtinfo3 layout, the one with port ranges */
+		.name       = "conntrack",
+		.revision   = 3,
+		.family     = NFPROTO_UNSPEC,
+		.matchsize  = sizeof(struct xt_conntrack_mtinfo3),
+		.match      = conntrack_mt_v3,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init conntrack_mt_init(void)
+{	/* Module init: register all three "conntrack" revisions and return that result. */
+	return xt_register_matches(conntrack_mt_reg,
+	       ARRAY_SIZE(conntrack_mt_reg));
+}
+
+static void __exit conntrack_mt_exit(void)
+{	/* Module exit: unregister all "conntrack" revisions. */
+	xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg));
+}
+
+module_init(conntrack_mt_init);
+module_exit(conntrack_mt_exit);
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
new file mode 100644
index 00000000..c7a2e546
--- /dev/null
+++ b/net/netfilter/xt_cpu.c
@@ -0,0 +1,65 @@
+/* Kernel module to match running CPU */
+
+/*
+ * Might be used to distribute connections on several daemons, if
+ * RPS (Receive Packet Steering) is enabled or NIC is multiqueue capable,
+ * each RX queue IRQ affined to one CPU (1:1 mapping)
+ *
+ */
+
+/* (C) 2010 Eric Dumazet
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_cpu.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
+MODULE_DESCRIPTION("Xtables: CPU match");
+MODULE_ALIAS("ipt_cpu");
+MODULE_ALIAS("ip6t_cpu");
+
+/* checkentry hook: invert may only be 0 or 1; any other bit set is rejected
+ * with -EINVAL. */
+static int cpu_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_cpu_info *cfg = par->matchinfo;
+
+	return (cfg->invert & ~1) ? -EINVAL : 0;
+}
+
+static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when the executing CPU equals info->cpu, XORed with info->invert (0 or 1 per cpu_mt_check()). */
+	const struct xt_cpu_info *info = par->matchinfo;
+
+	return (info->cpu == smp_processor_id()) ^ info->invert;
+}
+
+static struct xt_match cpu_mt_reg __read_mostly = {
+	.name       = "cpu",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = cpu_mt_check,	/* rejects invert values other than 0/1 */
+	.match      = cpu_mt,	/* compares against smp_processor_id() */
+	.matchsize  = sizeof(struct xt_cpu_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init cpu_mt_init(void)
+{	/* Module init: register the "cpu" match and return that result. */
+	return xt_register_match(&cpu_mt_reg);
+}
+
+static void __exit cpu_mt_exit(void)
+{	/* Module exit: unregister the "cpu" match. */
+	xt_unregister_match(&cpu_mt_reg);
+}
+
+module_init(cpu_mt_init);
+module_exit(cpu_mt_exit);
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
new file mode 100644
index 00000000..b63d2a3d
--- /dev/null
+++ b/net/netfilter/xt_dccp.c
@@ -0,0 +1,188 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: DCCP protocol packet match");
+MODULE_ALIAS("ipt_dccp");
+MODULE_ALIAS("ip6t_dccp");
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+				  || (!!((invflag) & (option)) ^ (cond)))	/* passes if option is unset in flag, else cond XOR the option's inversion bit */
+
+static unsigned char *dccp_optbuf;	/* shared scratch buffer for DCCP options, allocated in dccp_mt_init() */
+static DEFINE_SPINLOCK(dccp_buflock);	/* serialises users of dccp_optbuf */
+
+/* Look for @option among the DCCP options; true if present.  *hotdrop is
+ * set (and false returned) if the header length is inconsistent or the
+ * options cannot be read. */
+static inline bool
+dccp_find_option(u_int8_t option,
+		 const struct sk_buff *skb,
+		 unsigned int protoff,
+		 const struct dccp_hdr *dh,
+		 bool *hotdrop)
+{
+	/* dccph_doff is 8 bits, so at most 255 * 4 bytes: fits dccp_optbuf */
+	const unsigned char *op;
+	unsigned int optoff = __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+	unsigned int i;
+
+	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh))
+		goto invalid;
+	if (!optlen)
+		return false;
+
+	spin_lock_bh(&dccp_buflock);
+	op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf);
+	if (op == NULL)	/* If we don't have the whole header, drop packet. */
+		goto partial;
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option) {
+			spin_unlock_bh(&dccp_buflock);
+			return true;
+		}
+		/* only read a length byte that lies inside the option area */
+		if (op[i] < 2 || i + 1 >= optlen)
+			i++;
+		else
+			i += op[i+1]?:1;
+	}
+
+	spin_unlock_bh(&dccp_buflock);
+	return false;
+
+partial:
+	spin_unlock_bh(&dccp_buflock);
+invalid:
+	*hotdrop = true;
+	return false;
+}
+
+
+static inline bool
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+	return (typemask >> dh->dccph_type) & 1;	/* is the bit for this packet type set? */
+}
+
+static inline bool
+match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff,
+	     const struct dccp_hdr *dh, bool *hotdrop)	/* plain forwarder to dccp_find_option() */
+{
+	return dccp_find_option(option, skb, protoff, dh, hotdrop);
+}
+
+static bool
+dccp_mt(const struct sk_buff *skb, struct xt_action_param *par)	/* true when every test selected in info->flags passes */
+{
+	const struct xt_dccp_info *info = par->matchinfo;
+	const struct dccp_hdr *dh;
+	struct dccp_hdr _dh;
+
+	if (par->fragoff != 0)	/* only packets with fragment offset 0 are examined */
+		return false;
+
+	dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh);
+	if (dh == NULL) {	/* DCCP header not readable: ask for the packet to be dropped */
+		par->hotdrop = true;
+		return false;
+	}
+
+	return  DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0]
+			&& ntohs(dh->dccph_sport) <= info->spts[1],
+			XT_DCCP_SRC_PORTS, info->flags, info->invflags)
+		&& DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0]
+			&& ntohs(dh->dccph_dport) <= info->dpts[1],
+			XT_DCCP_DEST_PORTS, info->flags, info->invflags)
+		&& DCCHECK(match_types(dh, info->typemask),
+			   XT_DCCP_TYPE, info->flags, info->invflags)
+		&& DCCHECK(match_option(info->option, skb, par->thoff, dh,
+					&par->hotdrop),	/* may set par->hotdrop; only evaluated if the earlier tests passed */
+			   XT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+/* Reject rules with unknown flag bits or inversions of unselected tests. */
+static int dccp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_dccp_info *cfg = par->matchinfo;
+	const bool unknown_flags = cfg->flags & ~XT_DCCP_VALID_FLAGS;
+	const bool unknown_inv = cfg->invflags & ~XT_DCCP_VALID_FLAGS;
+	const bool stray_inv = cfg->invflags & ~cfg->flags;
+
+	if (unknown_flags || unknown_inv || stray_inv)
+		return -EINVAL;
+	return 0;
+}
+
+static struct xt_match dccp_mt_reg[] __read_mostly = {
+	{	/* IPv4 registration */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV4,
+		.proto		= IPPROTO_DCCP,
+		.name		= "dccp",
+		.matchsize	= sizeof(struct xt_dccp_info),
+		.checkentry	= dccp_mt_check,
+		.match		= dccp_mt,
+	},
+	{	/* IPv6 registration, same handlers */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV6,
+		.proto		= IPPROTO_DCCP,
+		.name		= "dccp",
+		.matchsize	= sizeof(struct xt_dccp_info),
+		.checkentry	= dccp_mt_check,
+		.match		= dccp_mt,
+	},
+};
+
+/*
+ * Module load: allocate the shared option buffer, then register both
+ * matches; the buffer is released again if registration fails.
+ */
+static int __init dccp_mt_init(void)
+{
+	int err;
+
+	/* doff is 8 bits, so the maximum option size is (4*256).  Don't put
+	 * this in BSS since DaveM is worried about locked TLB's for kernel
+	 * BSS. */
+	dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+	if (dccp_optbuf == NULL)
+		return -ENOMEM;
+	err = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg));
+	if (err)
+		kfree(dccp_optbuf);
+	return err;
+}
+
+static void __exit dccp_mt_exit(void)	/* module unload: undo dccp_mt_init() */
+{
+	xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg));
+	kfree(dccp_optbuf);	/* freed after the matches are unregistered */
+}
+
+module_init(dccp_mt_init);
+module_exit(dccp_mt_exit);
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
new file mode 100644
index 00000000..d9202cdd
--- /dev/null
+++ b/net/netfilter/xt_devgroup.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#include <linux/netfilter/xt_devgroup.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Device group match");
+MODULE_ALIAS("ipt_devgroup");
+MODULE_ALIAS("ip6t_devgroup");
+
+/* True unless a requested in/out device-group comparison fails. */
+static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_devgroup_info *cfg = par->matchinfo;
+
+	/* each test fails when "masked groups differ" disagrees with "inverted" */
+	if ((cfg->flags & XT_DEVGROUP_MATCH_SRC) &&
+	    !!((cfg->src_group ^ par->in->group) & cfg->src_mask) !=
+	    !!(cfg->flags & XT_DEVGROUP_INVERT_SRC))
+		return false;
+	if ((cfg->flags & XT_DEVGROUP_MATCH_DST) &&
+	    !!((cfg->dst_group ^ par->out->group) & cfg->dst_mask) !=
+	    !!(cfg->flags & XT_DEVGROUP_INVERT_DST))
+		return false;
+	return true;
+}
+
+/* Validate the flag bits and the hooks each device test is used from. */
+static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
+{
+	const struct xt_devgroup_info *cfg = par->matchinfo;
+	/* hooks from which a source-device test is accepted */
+	const unsigned int src_hooks = (1 << NF_INET_PRE_ROUTING) |
+				       (1 << NF_INET_LOCAL_IN) |
+				       (1 << NF_INET_FORWARD);
+	/* hooks from which a destination-device test is accepted */
+	const unsigned int dst_hooks = (1 << NF_INET_FORWARD) |
+				       (1 << NF_INET_LOCAL_OUT) |
+				       (1 << NF_INET_POST_ROUTING);
+
+	if (cfg->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
+			   XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
+		return -EINVAL;
+	if ((cfg->flags & XT_DEVGROUP_MATCH_SRC) && (par->hook_mask & ~src_hooks))
+		return -EINVAL;
+	if ((cfg->flags & XT_DEVGROUP_MATCH_DST) && (par->hook_mask & ~dst_hooks))
+		return -EINVAL;
+	return 0;
+}
+
+static struct xt_match devgroup_mt_reg __read_mostly = {
+	.me		= THIS_MODULE,
+	.family		= NFPROTO_UNSPEC,	/* not tied to one protocol family */
+	.name		= "devgroup",
+	.matchsize	= sizeof(struct xt_devgroup_info),
+	.checkentry	= devgroup_mt_checkentry,
+	.match		= devgroup_mt,
+};
+
+static int __init devgroup_mt_init(void)	/* module load: register the "devgroup" match */
+{
+	return xt_register_match(&devgroup_mt_reg);
+}
+
+static void __exit devgroup_mt_exit(void)	/* module unload: unregister it again */
+{
+	xt_unregister_match(&devgroup_mt_reg);
+}
+
+module_init(devgroup_mt_init);
+module_exit(devgroup_mt_exit);
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
new file mode 100644
index 00000000..64670fc5
--- /dev/null
+++ b/net/netfilter/xt_dscp.c
@@ -0,0 +1,115 @@
+/* IP tables module for matching the value of the IPv4/IPv6 DSCP field
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/dsfield.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_dscp.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: DSCP/TOS field match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_dscp");
+MODULE_ALIAS("ip6t_dscp");
+MODULE_ALIAS("ipt_tos");
+MODULE_ALIAS("ip6t_tos");
+
+/* IPv4: compare the shifted DS field with the rule's DSCP, honouring invert. */
+static bool dscp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_dscp_info *cfg = par->matchinfo;
+	const u_int8_t dsfield = ipv4_get_dsfield(ip_hdr(skb));
+	const bool equal = (dsfield >> XT_DSCP_SHIFT) == cfg->dscp;
+	return cfg->invert ? !equal : equal;
+}
+
+/* IPv6: compare the shifted DS field with the rule's DSCP, honouring invert. */
+static bool dscp_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_dscp_info *cfg = par->matchinfo;
+	const u_int8_t dsfield = ipv6_get_dsfield(ipv6_hdr(skb));
+	const bool equal = (dsfield >> XT_DSCP_SHIFT) == cfg->dscp;
+	return cfg->invert ? !equal : equal;
+}
+
+/* Accept only DSCP values up to XT_DSCP_MAX; anything larger is -EDOM. */
+static int dscp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_dscp_info *cfg = par->matchinfo;
+
+	if (cfg->dscp <= XT_DSCP_MAX)
+		return 0;
+
+	pr_info("dscp %x out of range\n", cfg->dscp);
+	return -EDOM;
+}
+
+/* Masked compare of the IPv4 tos byte / IPv6 DS field, optionally inverted. */
+static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_tos_match_info *cfg = par->matchinfo;
+	u_int8_t field;
+	if (par->family == NFPROTO_IPV4)
+		field = ip_hdr(skb)->tos;
+	else
+		field = ipv6_get_dsfield(ipv6_hdr(skb));
+	return ((field & cfg->tos_mask) == cfg->tos_value) ^ !!cfg->invert;
+}
+
+static struct xt_match dscp_mt_reg[] __read_mostly = {
+	{	/* "dscp" for IPv4 */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV4,
+		.name		= "dscp",
+		.matchsize	= sizeof(struct xt_dscp_info),
+		.checkentry	= dscp_mt_check,
+		.match		= dscp_mt,
+	},
+	{	/* "dscp" for IPv6 */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV6,
+		.name		= "dscp",
+		.matchsize	= sizeof(struct xt_dscp_info),
+		.checkentry	= dscp_mt_check,
+		.match		= dscp_mt6,
+	},
+	{	/* "tos" revision 1 for IPv4; no checkentry */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV4,
+		.name		= "tos",
+		.revision	= 1,
+		.matchsize	= sizeof(struct xt_tos_match_info),
+		.match		= tos_mt,
+	},
+	{	/* "tos" revision 1 for IPv6; no checkentry */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV6,
+		.name		= "tos",
+		.revision	= 1,
+		.matchsize	= sizeof(struct xt_tos_match_info),
+		.match		= tos_mt,
+	},
+};
+
+static int __init dscp_mt_init(void)	/* module load: register every entry of dscp_mt_reg */
+{
+	return xt_register_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg));
+}
+
+static void __exit dscp_mt_exit(void)	/* module unload: unregister them again */
+{
+	xt_unregister_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg));
+}
+
+module_init(dscp_mt_init);
+module_exit(dscp_mt_exit);
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
new file mode 100644
index 00000000..171ba82b
--- /dev/null
+++ b/net/netfilter/xt_esp.c
@@ -0,0 +1,107 @@
+/* Kernel module to match ESP parameters. */
+
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter/xt_esp.h>
+#include <linux/netfilter/x_tables.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match");
+MODULE_ALIAS("ipt_esp");
+MODULE_ALIAS("ip6t_esp");
+
+/* Returns true if spi lies in [min, max]; invert flips the result */
+static inline bool
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
+{
+	bool in_range = min <= spi && spi <= max;
+	bool verdict = invert ? !in_range : in_range;
+	pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+		 invert ? '!' : ' ', min, spi, max);
+	pr_debug(" result %s\n", verdict ? "PASS" : "FAILED");
+	return verdict;
+}
+
+/* Match the SPI of an ESP packet against the range stored in the rule. */
+static bool esp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_esp *cfg = par->matchinfo;
+	const struct ip_esp_hdr *esp;
+	struct ip_esp_hdr hdr_copy;
+	bool negate;
+
+	/* Only packets with fragment offset 0 are examined. */
+	if (par->fragoff)
+		return false;
+
+	esp = skb_header_pointer(skb, par->thoff, sizeof(hdr_copy), &hdr_copy);
+	if (!esp) {
+		/* Asked to examine this packet but cannot: drop it. */
+		pr_debug("Dropping evil ESP tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	negate = cfg->invflags & XT_ESP_INV_SPI;
+	return spi_match(cfg->spis[0], cfg->spis[1], ntohl(esp->spi), negate);
+}
+
+/* Accept only inversion flags covered by XT_ESP_INV_MASK. */
+static int esp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_esp *cfg = par->matchinfo;
+
+	if (!(cfg->invflags & ~XT_ESP_INV_MASK))
+		return 0;
+
+	pr_debug("unknown flags %X\n", cfg->invflags);
+	return -EINVAL;
+}
+
+static struct xt_match esp_mt_reg[] __read_mostly = {
+	{	/* IPv4 registration */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV4,
+		.proto		= IPPROTO_ESP,
+		.name		= "esp",
+		.matchsize	= sizeof(struct xt_esp),
+		.checkentry	= esp_mt_check,
+		.match		= esp_mt,
+	},
+	{	/* IPv6 registration, same handlers */
+		.me		= THIS_MODULE,
+		.family		= NFPROTO_IPV6,
+		.proto		= IPPROTO_ESP,
+		.name		= "esp",
+		.matchsize	= sizeof(struct xt_esp),
+		.checkentry	= esp_mt_check,
+		.match		= esp_mt,
+	},
+};
+
+static int __init esp_mt_init(void)	/* module load: register both "esp" matches */
+{
+	return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg));
+}
+
+static void __exit esp_mt_exit(void)	/* module unload: unregister them again */
+{
+	xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg));
+}
+
+module_init(esp_mt_init);
+module_exit(esp_mt_exit);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
new file mode 100644
index 00000000..9228ee0d
--- /dev/null
+++ b/net/netfilter/xt_hashlimit.c
@@ -0,0 +1,847 @@
+/*
+ *	xt_hashlimit - Netfilter module to limit the number of packets per time
+ *	separately for each hashbucket (sourceip/sourceport/dstip/dstport)
+ *
+ *	(C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ *	Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * Development of this code was funded by Astaro AG, http://www.astaro.com/
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#endif
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/xt_hashlimit.h>
+#include <linux/mutex.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match");
+MODULE_ALIAS("ipt_hashlimit");
+MODULE_ALIAS("ip6t_hashlimit");
+
+struct hashlimit_net {
+	struct hlist_head	htables;	/* tables linked in by htable_create() */
+	struct proc_dir_entry	*ipt_hashlimit;	/* proc parent used for NFPROTO_IPV4 tables */
+	struct proc_dir_entry	*ip6t_hashlimit;	/* proc parent used for all other (IPv6) tables */
+};
+
+static int hashlimit_net_id;	/* id handed to net_generic() below */
+static inline struct hashlimit_net *hashlimit_pernet(struct net *net)	/* this module's state for @net */
+{
+	return net_generic(net, hashlimit_net_id);
+}
+
+/* need to declare this at the top */
+static const struct file_operations dl_file_ops;
+
+/* hash table key: the (masked) addresses and ports that identify one entry */
+struct dsthash_dst {
+	union {
+		struct {
+			__be32 src;
+			__be32 dst;
+		} ip;	/* used for NFPROTO_IPV4 */
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+		struct {
+			__be32 src[4];
+			__be32 dst[4];
+		} ip6;	/* used for NFPROTO_IPV6 */
+#endif
+	};
+	__be16 src_port;	/* stays 0 unless XT_HASHLIMIT_HASH_SPT is set in cfg.mode */
+	__be16 dst_port;	/* stays 0 unless XT_HASHLIMIT_HASH_DPT is set in cfg.mode */
+};
+
+struct dsthash_ent {
+	/* static / read-only parts in the beginning */
+	struct hlist_node node;	/* link in one xt_hashlimit_htable.hash[] bucket */
+	struct dsthash_dst dst;	/* lookup key, compared by dst_cmp() */
+
+	/* modified structure members in the end */
+	spinlock_t lock;	/* held while expires/rateinfo are read or updated */
+	unsigned long expires;		/* precalculated expiry time */
+	struct {
+		unsigned long prev;	/* last modification */
+		u_int32_t credit;	/* credits currently available */
+		u_int32_t credit_cap, cost;	/* upper bound for credit; credits charged per passing packet */
+	} rateinfo;
+	struct rcu_head rcu;	/* for the deferred free queued by dsthash_free() */
+};
+
+struct xt_hashlimit_htable {
+	struct hlist_node node;		/* global list of all htables */
+	int use;	/* reference count; changed with hashlimit_mutex held */
+	u_int8_t family;	/* NFPROTO_IPV4 or NFPROTO_IPV6 */
+	bool rnd_initialized;	/* rnd is seeded on the first entry allocation */
+
+	struct hashlimit_cfg1 cfg;	/* config */
+
+	/* used internally */
+	spinlock_t lock;		/* lock for list_head */
+	u_int32_t rnd;			/* random seed for hash */
+	unsigned int count;		/* number entries in table */
+	struct timer_list timer;	/* timer for gc */
+
+	/* seq_file stuff */
+	struct proc_dir_entry *pde;	/* proc file; its name doubles as the table name */
+	struct net *net;	/* namespace the table was created in */
+
+	struct hlist_head hash[0];	/* hashtable itself */
+};
+
+static DEFINE_MUTEX(hashlimit_mutex);	/* protects htables list */
+static struct kmem_cache *hashlimit_cachep __read_mostly;	/* slab cache that struct dsthash_ent objects come from */
+
+static inline bool dst_cmp(const struct dsthash_ent *ent,
+			   const struct dsthash_dst *b)
+{
+	return memcmp(&ent->dst, b, sizeof(ent->dst)) == 0;	/* true when the keys are bytewise identical */
+}
+
+/* Map a flow key to a bucket index in [0, ht->cfg.size). */
+static u_int32_t
+hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst)
+{
+	const u32 *words = (const u32 *)dst;
+	u64 scaled = jhash2(words, sizeof(*dst) / sizeof(u32), ht->rnd);
+
+	/*
+	 * Take the upper 32 bits of hash * size rather than hash % size:
+	 * same spread over the buckets, but a multiply instead of a divide.
+	 */
+	scaled *= ht->cfg.size;
+	return scaled >> 32;
+}
+
+static struct dsthash_ent *
+dsthash_find(const struct xt_hashlimit_htable *ht,
+	     const struct dsthash_dst *dst)	/* returns the entry for @dst with its lock held, or NULL */
+{
+	struct dsthash_ent *ent;
+	struct hlist_node *pos;
+	u_int32_t hash = hash_dst(ht, dst);
+
+	if (!hlist_empty(&ht->hash[hash])) {
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);	/* the caller has to unlock */
+				return ent;
+			}
+	}
+	return NULL;
+}
+
+/* allocate dsthash_ent, initialize dst, put in htable and lock it; NULL if the table is full or allocation fails */
+static struct dsthash_ent *
+dsthash_alloc_init(struct xt_hashlimit_htable *ht,
+		   const struct dsthash_dst *dst)
+{
+	struct dsthash_ent *ent;
+
+	spin_lock(&ht->lock);
+	/* initialize hash with random val at the time we allocate
+	 * the first hashtable entry */
+	if (unlikely(!ht->rnd_initialized)) {
+		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
+		ht->rnd_initialized = true;
+	}
+
+	if (ht->cfg.max && ht->count >= ht->cfg.max) {
+		/* FIXME: do something. question is what.. */
+		if (net_ratelimit())
+			pr_err("max count of %u reached\n", ht->cfg.max);
+		ent = NULL;
+	} else
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+	if (!ent) {	/* also taken when the table is full (ent forced to NULL above) */
+		if (net_ratelimit())
+			pr_err("cannot allocate dsthash_ent\n");
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
+
+		spin_lock(&ent->lock);	/* returned locked; expires/rateinfo are filled in by the caller */
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->lock);
+	return ent;
+}
+
+static void dsthash_free_rcu(struct rcu_head *head)	/* RCU callback queued by dsthash_free() */
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+	kmem_cache_free(hashlimit_cachep, ent);
+}
+
+static inline void
+dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)	/* called by htable_selective_cleanup() with ht->lock held */
+{
+	hlist_del_rcu(&ent->node);	/* unlink the entry from its bucket */
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);	/* the memory is released later by the RCU callback */
+	ht->count--;
+}
+static void htable_gc(unsigned long htlong);
+
+/* Build, start and publish the table for @minfo (caller holds hashlimit_mutex). */
+static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
+			 u_int8_t family)
+{
+	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+	struct xt_hashlimit_htable *hinfo;
+	unsigned int size;
+	unsigned int i;
+
+	if (minfo->cfg.size) {
+		size = minfo->cfg.size;
+	} else {
+		size = (totalram_pages << PAGE_SHIFT) / 16384 /
+		       sizeof(struct list_head);
+		if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE)
+			size = 8192;
+		if (size < 16)
+			size = 16;
+	}
+	/* a user-supplied size must not wrap the allocation length below */
+	if (size > (ULONG_MAX - sizeof(*hinfo)) / sizeof(hinfo->hash[0]))
+		return -ENOMEM;
+	/* FIXME: don't use vmalloc() here or anywhere else -HW */
+	hinfo = vmalloc(sizeof(*hinfo) + sizeof(hinfo->hash[0]) * size);
+	if (hinfo == NULL)
+		return -ENOMEM;
+
+	/* copy match config into hashtable config */
+	memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+	hinfo->cfg.size = size;
+	if (hinfo->cfg.max == 0)
+		hinfo->cfg.max = 8 * hinfo->cfg.size;
+	else if (hinfo->cfg.max < hinfo->cfg.size)
+		hinfo->cfg.max = hinfo->cfg.size;
+
+	for (i = 0; i < hinfo->cfg.size; i++)
+		INIT_HLIST_HEAD(&hinfo->hash[i]);
+	hinfo->use = 1;
+	hinfo->count = 0;
+	hinfo->family = family;
+	hinfo->rnd_initialized = false;
+	spin_lock_init(&hinfo->lock);
+
+	hinfo->pde = proc_create_data(minfo->name, 0,
+		(family == NFPROTO_IPV4) ?
+		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
+		&dl_file_ops, hinfo);
+	if (hinfo->pde == NULL) {
+		vfree(hinfo);
+		return -ENOMEM;
+	}
+	hinfo->net = net;
+
+	setup_timer(&hinfo->timer, htable_gc, (unsigned long)hinfo);
+	hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval);
+	add_timer(&hinfo->timer);
+	hlist_add_head(&hinfo->node, &hashlimit_net->htables);
+	minfo->hinfo = hinfo;	/* published last, so a failure never leaves a stale pointer */
+	return 0;
+}
+
+static bool select_all(const struct xt_hashlimit_htable *ht,
+		       const struct dsthash_ent *he)
+{
+	return true;	/* cleanup predicate: every entry qualifies */
+}
+
+static bool select_gc(const struct xt_hashlimit_htable *ht,
+		      const struct dsthash_ent *he)
+{
+	return time_after_eq(jiffies, he->expires);	/* cleanup predicate: only entries whose expiry time has been reached */
+}
+
+static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
+			bool (*select)(const struct xt_hashlimit_htable *ht,
+				      const struct dsthash_ent *he))	/* frees every entry for which @select returns true */
+{
+	unsigned int i;
+
+	/* lock hash table and iterate over it */
+	spin_lock_bh(&ht->lock);
+	for (i = 0; i < ht->cfg.size; i++) {
+		struct dsthash_ent *dh;
+		struct hlist_node *pos, *n;
+		hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) {	/* _safe variant: dh may be unlinked inside the loop */
+			if ((*select)(ht, dh))
+				dsthash_free(ht, dh);
+		}
+	}
+	spin_unlock_bh(&ht->lock);
+}
+
+/* hash table garbage collector, run by timer */
+static void htable_gc(unsigned long htlong)	/* @htlong is the table pointer given to setup_timer() */
+{
+	struct xt_hashlimit_htable *ht = (struct xt_hashlimit_htable *)htlong;
+
+	htable_selective_cleanup(ht, select_gc);	/* drop the expired entries */
+
+	/* re-add the timer accordingly */
+	ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval);
+	add_timer(&ht->timer);
+}
+
+static void htable_destroy(struct xt_hashlimit_htable *hinfo)	/* tear down a table that htable_put() has already unlinked */
+{
+	struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net);
+	struct proc_dir_entry *parent;
+
+	del_timer_sync(&hinfo->timer);	/* stop the gc timer first */
+
+	if (hinfo->family == NFPROTO_IPV4)
+		parent = hashlimit_net->ipt_hashlimit;
+	else
+		parent = hashlimit_net->ip6t_hashlimit;
+	remove_proc_entry(hinfo->pde->name, parent);
+	htable_selective_cleanup(hinfo, select_all);	/* queue every remaining entry for freeing */
+	vfree(hinfo);
+}
+
+static struct xt_hashlimit_htable *htable_find_get(struct net *net,
+						   const char *name,
+						   u_int8_t family)	/* look a table up by name and family; takes a reference on a hit */
+{
+	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+	struct xt_hashlimit_htable *hinfo;
+	struct hlist_node *pos;
+
+	hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) {
+		if (!strcmp(name, hinfo->pde->name) &&	/* the proc entry name is the table name */
+		    hinfo->family == family) {
+			hinfo->use++;	/* hashlimit_mt_check() calls this with hashlimit_mutex held */
+			return hinfo;
+		}
+	}
+	return NULL;
+}
+
+static void htable_put(struct xt_hashlimit_htable *hinfo)	/* drop one reference; the last one destroys the table */
+{
+	mutex_lock(&hashlimit_mutex);
+	if (--hinfo->use == 0) {
+		hlist_del(&hinfo->node);	/* unlink from the htables list before tearing down */
+		htable_destroy(hinfo);
+	}
+	mutex_unlock(&hashlimit_mutex);
+}
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maximum range, we multiply by this factor (i.e. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32, i.e. CREDITS_PER_JIFFY <= MAX_CPJ.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))	/* largest per-jiffy factor for which one day of credit fits in 32 bits */
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)	/* MAX_CPJ rounded down to a power of two */
+
+/* Precision saver. */
+static inline u_int32_t
+user2credits(u_int32_t user)	/* scales @user by HZ * CREDITS_PER_JIFFY / XT_HASHLIMIT_SCALE */
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;	/* multiply first: keeps the low-order digits */
+}
+
+static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)	/* credit the jiffies elapsed since rateinfo.prev */
+{
+	dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY;
+	if (dh->rateinfo.credit > dh->rateinfo.credit_cap)	/* credit never exceeds the cap */
+		dh->rateinfo.credit = dh->rateinfo.credit_cap;
+	dh->rateinfo.prev = now;
+}
+
+static inline __be32 maskl(__be32 a, unsigned int l)	/* keep the top @l bits of @a (network byte order); l == 0 yields 0 */
+{
+	return l ? htonl(ntohl(a) & ~0U << (32 - l)) : 0;	/* unsigned mask: left-shifting the signed value ~0 (-1) is undefined in C */
+}
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)	/* clear everything after the first @p bits of the four-word address @i */
+{
+	switch (p) {
+	case 0 ... 31:	/* prefix ends inside word 0 */
+		i[0] = maskl(i[0], p);
+		i[1] = i[2] = i[3] = 0;
+		break;
+	case 32 ... 63:	/* word 0 kept, prefix ends inside word 1 */
+		i[1] = maskl(i[1], p - 32);
+		i[2] = i[3] = 0;
+		break;
+	case 64 ... 95:	/* words 0-1 kept, prefix ends inside word 2 */
+		i[2] = maskl(i[2], p - 64);
+		i[3] = 0;
+		break;
+	case 96 ... 127:	/* words 0-2 kept, prefix ends inside word 3 */
+		i[3] = maskl(i[3], p - 96);
+		break;
+	case 128:	/* whole address kept */
+		break;
+	}
+}
+#endif
+
+static int
+hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
+		   struct dsthash_dst *dst,
+		   const struct sk_buff *skb, unsigned int protoff)	/* fill key @dst from @skb per cfg.mode; 0 on success, -1 if headers cannot be read */
+{
+	__be16 _ports[2], *ports;
+	u8 nexthdr;
+	int poff;
+
+	memset(dst, 0, sizeof(*dst));	/* unselected fields stay 0; the key is hashed and memcmp()ed as a whole */
+
+	switch (hinfo->family) {
+	case NFPROTO_IPV4:
+		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP)
+			dst->ip.dst = maskl(ip_hdr(skb)->daddr,
+			              hinfo->cfg.dstmask);
+		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP)
+			dst->ip.src = maskl(ip_hdr(skb)->saddr,
+			              hinfo->cfg.srcmask);
+
+		if (!(hinfo->cfg.mode &
+		      (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
+			return 0;	/* ports are not part of the key */
+		nexthdr = ip_hdr(skb)->protocol;
+		break;
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	case NFPROTO_IPV6:
+		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) {
+			memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr,
+			       sizeof(dst->ip6.dst));
+			hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask);
+		}
+		if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) {
+			memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr,
+			       sizeof(dst->ip6.src));
+			hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask);
+		}
+
+		if (!(hinfo->cfg.mode &
+		      (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT)))
+			return 0;	/* ports are not part of the key */
+		nexthdr = ipv6_hdr(skb)->nexthdr;
+		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr);	/* replaces the caller-supplied offset */
+		if ((int)protoff < 0)	/* extension header walk failed */
+			return -1;
+		break;
+#endif
+	default:
+		BUG();
+		return 0;
+	}
+
+	poff = proto_ports_offset(nexthdr);
+	if (poff >= 0) {
+		ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
+					   &_ports);
+	} else {	/* negative poff: both ports are taken as 0 */
+		_ports[0] = _ports[1] = 0;
+		ports = _ports;
+	}
+	if (!ports)	/* port bytes not readable */
+		return -1;
+	if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT)
+		dst->src_port = ports[0];
+	if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DPT)
+		dst->dst_port = ports[1];
+	return 0;
+}
+
+static bool
+hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)	/* charge the packet to its hash entry and report under/over limit */
+{
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	struct xt_hashlimit_htable *hinfo = info->hinfo;
+	unsigned long now = jiffies;
+	struct dsthash_ent *dh;
+	struct dsthash_dst dst;
+
+	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
+		goto hotdrop;
+
+	rcu_read_lock_bh();
+	dh = dsthash_find(hinfo, &dst);	/* on a hit dh->lock is held */
+	if (dh == NULL) {
+		dh = dsthash_alloc_init(hinfo, &dst);	/* a new entry is returned locked as well */
+		if (dh == NULL) {
+			rcu_read_unlock_bh();
+			goto hotdrop;
+		}
+		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
+		dh->rateinfo.prev = jiffies;
+		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
+		                      hinfo->cfg.burst);	/* a new entry starts with credit equal to its cap */
+		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
+		                          hinfo->cfg.burst);
+		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+	} else {
+		/* update expiration timeout */
+		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+		rateinfo_recalc(dh, now);
+	}
+
+	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
+		/* below the limit */
+		dh->rateinfo.credit -= dh->rateinfo.cost;
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
+		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
+	}
+
+	spin_unlock(&dh->lock);	/* over the limit: no credit is deducted */
+	rcu_read_unlock_bh();
+	/* default match is underlimit - so over the limit, we need to invert */
+	return info->cfg.mode & XT_HASHLIMIT_INVERT;
+
+ hotdrop:
+	par->hotdrop = true;	/* key could not be built or no entry available: request a drop */
+	return false;
+}
+
+static int hashlimit_mt_check(const struct xt_mtchk_param *par)	/* validate the rule, then attach an existing or new table to it */
+{
+	struct net *net = par->net;
+	struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+	int ret;
+
+	/* Check for overflow. */
+	if (info->cfg.burst == 0 ||
+	    user2credits(info->cfg.avg * info->cfg.burst) <
+	    user2credits(info->cfg.avg)) {
+		pr_info("overflow, try lower: %u/%u\n",
+			info->cfg.avg, info->cfg.burst);
+		return -ERANGE;
+	}
+	if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
+		return -EINVAL;
+	if (info->name[sizeof(info->name)-1] != '\0')	/* the last byte of name must be NUL */
+		return -EINVAL;
+	if (par->family == NFPROTO_IPV4) {
+		if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32)
+			return -EINVAL;
+	} else {
+		if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128)
+			return -EINVAL;
+	}
+
+	mutex_lock(&hashlimit_mutex);
+	info->hinfo = htable_find_get(net, info->name, par->family);	/* reuse a table of the same name and family */
+	if (info->hinfo == NULL) {
+		ret = htable_create(net, info, par->family);	/* otherwise create one */
+		if (ret < 0) {
+			mutex_unlock(&hashlimit_mutex);
+			return ret;
+		}
+	}
+	mutex_unlock(&hashlimit_mutex);
+	return 0;
+}
+
+static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)	/* rule removal: release the rule's table reference */
+{
+	const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
+
+	htable_put(info->hinfo);
+}
+
+static struct xt_match hashlimit_mt_reg[] __read_mostly = {
+	{	/* IPv4 registration */
+		.me             = THIS_MODULE,
+		.family         = NFPROTO_IPV4,
+		.name           = "hashlimit",
+		.revision       = 1,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
+		.checkentry     = hashlimit_mt_check,
+		.match          = hashlimit_mt,
+		.destroy        = hashlimit_mt_destroy,
+	},
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	{	/* IPv6 registration, same handlers */
+		.me             = THIS_MODULE,
+		.family         = NFPROTO_IPV6,
+		.name           = "hashlimit",
+		.revision       = 1,
+		.matchsize      = sizeof(struct xt_hashlimit_mtinfo1),
+		.checkentry     = hashlimit_mt_check,
+		.match          = hashlimit_mt,
+		.destroy        = hashlimit_mt_destroy,
+	},
+#endif
+};
+
+/* PROC stuff */
+static void *dl_seq_start(struct seq_file *s, loff_t *pos)	/* takes htable->lock on every path; dl_seq_stop() releases it */
+	__acquires(htable->lock)
+{
+	struct xt_hashlimit_htable *htable = s->private;
+	unsigned int *bucket;
+
+	spin_lock_bh(&htable->lock);
+	if (*pos >= htable->cfg.size)	/* past the last bucket */
+		return NULL;
+
+	bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);	/* iterator state: index of the current bucket */
+	if (!bucket)
+		return ERR_PTR(-ENOMEM);
+
+	*bucket = *pos;
+	return bucket;
+}
+
+static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{	/* seq_file next: step the cursor to the next bucket, freeing it at the end. */
+	const struct xt_hashlimit_htable *table = s->private;
+	unsigned int *cursor = v;
+
+	*cursor += 1;
+	*pos = *cursor;
+	if (*pos < table->cfg.size)
+		return cursor;
+	kfree(cursor);	/* ran past the last bucket: the cursor is done */
+	return NULL;
+}
+
+static void dl_seq_stop(struct seq_file *s, void *v)
+	__releases(htable->lock)
+{	/* seq_file stop: free the bucket cursor (if any) and drop the table lock. */
+	struct xt_hashlimit_htable *htable = s->private;
+	unsigned int *bucket = (unsigned int *)v;
+
+	if (!IS_ERR(bucket))	/* dl_seq_start() can hand out ERR_PTR(-ENOMEM) */
+		kfree(bucket);
+	spin_unlock_bh(&htable->lock);	/* taken in dl_seq_start() */
+}
+
+static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
+				   struct seq_file *s)
+{	/* Print one entry: seconds to expiry, src:port->dst:port, rate state. */
+	int res;
+
+	spin_lock(&ent->lock);
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies);
+
+	switch (family) {
+	case NFPROTO_IPV4:
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+				 (long)(ent->expires - jiffies)/HZ,
+				 &ent->dst.ip.src,
+				 ntohs(ent->dst.src_port),
+				 &ent->dst.ip.dst,
+				 ntohs(ent->dst.dst_port),
+				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
+				 ent->rateinfo.cost);
+		break;
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	case NFPROTO_IPV6:
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+				 (long)(ent->expires - jiffies)/HZ,
+				 &ent->dst.ip6.src,
+				 ntohs(ent->dst.src_port),
+				 &ent->dst.ip6.dst,
+				 ntohs(ent->dst.dst_port),
+				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
+				 ent->rateinfo.cost);
+		break;
+#endif
+	default:
+		BUG();	/* only the two families above are handled here */
+		res = 0;
+	}
+	spin_unlock(&ent->lock);
+	return res;	/* whatever seq_printf() reported */
+}
+
+static int dl_seq_show(struct seq_file *s, void *v)
+{	/* seq_file show: print every entry chained in the bucket that v indexes. */
+	struct xt_hashlimit_htable *htable = s->private;
+	struct hlist_head *chain = &htable->hash[*(unsigned int *)v];
+	struct dsthash_ent *ent;
+	struct hlist_node *pos;
+
+	if (hlist_empty(chain))
+		return 0;
+	hlist_for_each_entry(ent, pos, chain, node)
+		if (dl_seq_real_show(ent, htable->family, s))
+			return -1;	/* the entry could not be printed */
+	return 0;
+}
+
+static const struct seq_operations dl_seq_ops = {	/* one step per hash bucket */
+	.start = dl_seq_start,
+	.next  = dl_seq_next,
+	.stop  = dl_seq_stop,
+	.show  = dl_seq_show
+};
+
+static int dl_proc_open(struct inode *inode, struct file *file)
+{	/* Open the proc file as a seq_file and hand it the entry's data pointer. */
+	struct seq_file *seq;
+	int err = seq_open(file, &dl_seq_ops);
+	if (err)
+		return err;
+	seq = file->private_data;
+	seq->private = PDE(inode)->data;
+	return 0;
+}
+
+static const struct file_operations dl_file_ops = {	/* proc file backed by seq_file */
+	.owner   = THIS_MODULE,
+	.open    = dl_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static int __net_init hashlimit_proc_net_init(struct net *net)
+{	/* Create this netns's ipt_hashlimit (and ip6t_hashlimit) proc directories. */
+	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+	hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);
+	if (!hashlimit_net->ipt_hashlimit)
+		return -ENOMEM;
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);
+	if (!hashlimit_net->ip6t_hashlimit) {
+		proc_net_remove(net, "ipt_hashlimit");	/* undo the IPv4 directory */
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+static void __net_exit hashlimit_proc_net_exit(struct net *net)
+{	/* Remove the proc directories made by hashlimit_proc_net_init(). */
+	proc_net_remove(net, "ipt_hashlimit");
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+	proc_net_remove(net, "ip6t_hashlimit");
+#endif
+}
+
+static int __net_init hashlimit_net_init(struct net *net)
+{	/* Per-netns setup: start with an empty table list, then add proc dirs. */
+	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+	INIT_HLIST_HEAD(&hashlimit_net->htables);
+	return hashlimit_proc_net_init(net);
+}
+
+static void __net_exit hashlimit_net_exit(struct net *net)
+{	/* Per-netns teardown: the table list must be empty, then drop proc dirs. */
+	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+
+	BUG_ON(!hlist_empty(&hashlimit_net->htables));	/* a leftover table is a bug */
+	hashlimit_proc_net_exit(net);
+}
+
+static struct pernet_operations hashlimit_net_ops = {	/* per-netns hooks and state size */
+	.init	= hashlimit_net_init,
+	.exit	= hashlimit_net_exit,
+	.id	= &hashlimit_net_id,
+	.size	= sizeof(struct hashlimit_net),
+};
+
+static int __init hashlimit_mt_init(void)
+{	/* Module init: per-netns state, then the matches, then the slab cache. */
+	int err;
+
+	err = register_pernet_subsys(&hashlimit_net_ops);
+	if (err < 0)
+		return err;
+	err = xt_register_matches(hashlimit_mt_reg,
+	      ARRAY_SIZE(hashlimit_mt_reg));
+	if (err < 0)
+		goto err1;
+
+	err = -ENOMEM;	/* returned if the cache below cannot be created */
+	hashlimit_cachep = kmem_cache_create("xt_hashlimit",
+					    sizeof(struct dsthash_ent), 0, 0,
+					    NULL);
+	if (!hashlimit_cachep) {
+		pr_warning("unable to create slab cache\n");
+		goto err2;
+	}
+	return 0;
+
+err2:	/* unwind in reverse order of setup */
+	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
+err1:
+	unregister_pernet_subsys(&hashlimit_net_ops);
+	return err;
+
+}
+
+static void __exit hashlimit_mt_exit(void)
+{	/* Module exit: unregister matches and pernet ops, then destroy the cache. */
+	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
+	unregister_pernet_subsys(&hashlimit_net_ops);
+
+	rcu_barrier_bh();	/* NOTE(review): presumably waits for pending RCU frees - confirm */
+	kmem_cache_destroy(hashlimit_cachep);
+}
+
+module_init(hashlimit_mt_init);
+module_exit(hashlimit_mt_exit);
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
new file mode 100644
index 00000000..9f4ab00c
--- /dev/null
+++ b/net/netfilter/xt_helper.c
@@ -0,0 +1,99 @@
+/* iptables module to match on related connections */
+/*
+ * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: Related connection matching");
+MODULE_ALIAS("ipt_helper");
+MODULE_ALIAS("ip6t_helper");
+
+
+static bool
+helper_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match packets of connections whose master uses the requested helper. */
+	const struct xt_helper_info *info = par->matchinfo;
+	const struct nf_conn *ct;
+	const struct nf_conn_help *master_help;
+	const struct nf_conntrack_helper *helper;
+	enum ip_conntrack_info ctinfo;
+	bool ret = info->invert;	/* answer whenever no helper can be found */
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || !ct->master)	/* no conntrack entry, or no master connection */
+		return ret;
+
+	master_help = nfct_help(ct->master);
+	if (!master_help)
+		return ret;
+
+	/* rcu_read_lock()ed by nf_hook_slow */
+	helper = rcu_dereference(master_help->helper);
+	if (!helper)
+		return ret;
+
+	if (info->name[0] == '\0')	/* empty name: any helper counts */
+		ret = !ret;
+	else	/* only strlen(helper->name) bytes are compared (prefix match) */
+		ret ^= !strncmp(helper->name, info->name,
+				strlen(helper->name));
+	return ret;
+}
+
+static int helper_mt_check(const struct xt_mtchk_param *par)
+{	/* Rule check: get l3 conntrack support, NUL-terminate name; 0 or -errno. */
+	struct xt_helper_info *info = par->matchinfo;
+	int ret;
+
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0) {
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+		return ret;
+	}
+	info->name[sizeof(info->name) - 1] = '\0';	/* force termination of the user-supplied name */
+	return 0;
+}
+
+static void helper_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* Rule teardown: undo the nf_ct_l3proto_try_module_get() of the check. */
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match helper_mt_reg __read_mostly = {	/* "helper" match, revision 0 */
+	.name       = "helper",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = helper_mt_check,
+	.match      = helper_mt,
+	.destroy    = helper_mt_destroy,
+	.matchsize  = sizeof(struct xt_helper_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init helper_mt_init(void)
+{	/* Module init: register the "helper" match with xtables. */
+	return xt_register_match(&helper_mt_reg);
+}
+
+static void __exit helper_mt_exit(void)
+{	/* Module exit: unregister the "helper" match. */
+	xt_unregister_match(&helper_mt_reg);
+}
+
+module_init(helper_mt_init);
+module_exit(helper_mt_exit);
diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c
new file mode 100644
index 00000000..7d12221e
--- /dev/null
+++ b/net/netfilter/xt_hl.c
@@ -0,0 +1,96 @@
+/*
+ * IP tables module for matching the value of the TTL
+ * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
+ *
+ * Hop Limit matching module
+ * (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ipt_ttl.h>
+#include <linux/netfilter_ipv6/ip6t_hl.h>
+
+MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
+MODULE_DESCRIPTION("Xtables: Hoplimit/TTL field match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ttl");
+MODULE_ALIAS("ip6t_hl");
+
+static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Compare the IPv4 TTL with the rule's value using the rule's mode. */
+	const struct ipt_ttl_info *rule = par->matchinfo;
+	const u8 pkt_ttl = ip_hdr(skb)->ttl;
+
+	switch (rule->mode) {
+	case IPT_TTL_EQ:
+		return pkt_ttl == rule->ttl;
+	case IPT_TTL_NE:
+		return pkt_ttl != rule->ttl;
+	case IPT_TTL_LT:
+		return pkt_ttl < rule->ttl;
+	case IPT_TTL_GT:
+		return pkt_ttl > rule->ttl;
+	}
+	/* Any other mode value never matches. */
+	return false;
+}
+
+static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Compare the IPv6 hop limit with the rule's value using the rule's mode. */
+	const struct ip6t_hl_info *rule = par->matchinfo;
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+
+	switch (rule->mode) {
+	case IP6T_HL_EQ:
+		return hdr->hop_limit == rule->hop_limit;
+	case IP6T_HL_NE:
+		return hdr->hop_limit != rule->hop_limit;
+	case IP6T_HL_LT:
+		return hdr->hop_limit < rule->hop_limit;
+	case IP6T_HL_GT:
+		return hdr->hop_limit > rule->hop_limit;
+	}
+	/* Any other mode value never matches. */
+	return false;
+}
+
+static struct xt_match hl_mt_reg[] __read_mostly = {
+	{	/* IPv4: "ttl" match on the TTL field */
+		.name       = "ttl",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.match      = ttl_mt,
+		.matchsize  = sizeof(struct ipt_ttl_info),
+		.me         = THIS_MODULE,
+	},
+	{	/* IPv6: "hl" match on the hop limit field */
+		.name       = "hl",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.match      = hl_mt6,
+		.matchsize  = sizeof(struct ip6t_hl_info),
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init hl_mt_init(void)
+{	/* Module init: register both the "ttl" and the "hl" match. */
+	return xt_register_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg));
+}
+
+static void __exit hl_mt_exit(void)
+{	/* Module exit: unregister both matches. */
+	xt_unregister_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg));
+}
+
+module_init(hl_mt_init);
+module_exit(hl_mt_exit);
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
new file mode 100644
index 00000000..b46626cd
--- /dev/null
+++ b/net/netfilter/xt_iprange.c
@@ -0,0 +1,140 @@
+/*
+ *	xt_iprange - Netfilter module to match IP address ranges
+ *
+ *	(C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *	(C) CC Computer Consultants GmbH, 2008
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_iprange.h>
+
+static bool
+iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* True unless an enabled src/dst IPv4 range test fails (after inversion). */
+	const struct xt_iprange_mtinfo *info = par->matchinfo;
+	const struct iphdr *iph = ip_hdr(skb);
+	bool m;	/* "mismatch": address outside [min, max], XOR the invert flag */
+
+	if (info->flags & IPRANGE_SRC) {
+		m  = ntohl(iph->saddr) < ntohl(info->src_min.ip);
+		m |= ntohl(iph->saddr) > ntohl(info->src_max.ip);
+		m ^= !!(info->flags & IPRANGE_SRC_INV);
+		if (m) {
+			pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
+			         &iph->saddr,
+			         (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+			         &info->src_min.ip,
+			         &info->src_max.ip);
+			return false;
+		}
+	}
+	if (info->flags & IPRANGE_DST) {	/* same test on the destination address */
+		m  = ntohl(iph->daddr) < ntohl(info->dst_min.ip);
+		m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip);
+		m ^= !!(info->flags & IPRANGE_DST_INV);
+		if (m) {
+			pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n",
+			         &iph->daddr,
+			         (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+			         &info->dst_min.ip,
+			         &info->dst_max.ip);
+			return false;
+		}
+	}
+	return true;
+}
+
+static inline int
+iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b)
+{	/* 1 if a sorts before b as a 128-bit big-endian number, else 0. */
+	unsigned int word = 0;
+
+	while (word < 4 && a->s6_addr32[word] == b->s6_addr32[word])
+		++word;
+	if (word < 4)	/* the first differing 32-bit word decides */
+		return ntohl(a->s6_addr32[word]) < ntohl(b->s6_addr32[word]);
+
+	return 0;	/* all four words are equal */
+}
+
+static bool
+iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* True unless an enabled src/dst IPv6 range test fails (after inversion). */
+	const struct xt_iprange_mtinfo *info = par->matchinfo;
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	bool m;	/* "mismatch": address outside [min, max], XOR the invert flag */
+
+	if (info->flags & IPRANGE_SRC) {
+		m  = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6);
+		m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr);
+		m ^= !!(info->flags & IPRANGE_SRC_INV);
+		if (m) {
+			pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n",
+				 &iph->saddr,
+				 (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "",
+				 &info->src_min.in6,
+				 &info->src_max.in6);
+			return false;
+		}
+	}
+	if (info->flags & IPRANGE_DST) {	/* same test on the destination address */
+		m  = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6);
+		m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr);
+		m ^= !!(info->flags & IPRANGE_DST_INV);
+		if (m) {
+			pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n",
+				 &iph->daddr,
+				 (info->flags & IPRANGE_DST_INV) ? "(INV) " : "",
+				 &info->dst_min.in6,
+				 &info->dst_max.in6);
+			return false;
+		}
+	}
+	return true;
+}
+
+static struct xt_match iprange_mt_reg[] __read_mostly = {
+	{	/* "iprange" revision 1, IPv4 */
+		.name      = "iprange",
+		.revision  = 1,
+		.family    = NFPROTO_IPV4,
+		.match     = iprange_mt4,
+		.matchsize = sizeof(struct xt_iprange_mtinfo),
+		.me        = THIS_MODULE,
+	},
+	{	/* "iprange" revision 1, IPv6 */
+		.name      = "iprange",
+		.revision  = 1,
+		.family    = NFPROTO_IPV6,
+		.match     = iprange_mt6,
+		.matchsize = sizeof(struct xt_iprange_mtinfo),
+		.me        = THIS_MODULE,
+	},
+};
+
+static int __init iprange_mt_init(void)
+{	/* Module init: register the IPv4 and IPv6 "iprange" matches. */
+	return xt_register_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg));
+}
+
+static void __exit iprange_mt_exit(void)
+{	/* Module exit: unregister both "iprange" matches. */
+	xt_unregister_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg));
+}
+
+module_init(iprange_mt_init);
+module_exit(iprange_mt_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
+MODULE_ALIAS("ipt_iprange");
+MODULE_ALIAS("ip6t_iprange");
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
new file mode 100644
index 00000000..bb10b071
--- /dev/null
+++ b/net/netfilter/xt_ipvs.c
@@ -0,0 +1,188 @@
+/*
+ *	xt_ipvs - kernel module to match IPVS connection properties
+ *
+ *	Author: Hannes Eder <heder@google.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#endif
+#include <linux/ip_vs.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ipvs.h>
+#include <net/netfilter/nf_conntrack.h>
+
+#include <net/ip_vs.h>
+
+MODULE_AUTHOR("Hannes Eder <heder@google.com>");
+MODULE_DESCRIPTION("Xtables: match IPVS connection properties");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ipvs");
+MODULE_ALIAS("ip6t_ipvs");
+
+/* borrowed from xt_conntrack */
+static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr,
+			    const union nf_inet_addr *uaddr,
+			    const union nf_inet_addr *umask,
+			    unsigned int l3proto)
+{	/* True when kaddr and uaddr agree under umask for the given family. */
+	switch (l3proto) {
+	case NFPROTO_IPV4:
+		return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+#ifdef CONFIG_IP_VS_IPV6
+	case NFPROTO_IPV6:
+		return !ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6);
+#endif
+	}
+	return false;	/* a family this build does not compare */
+}
+
+static bool
+ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match skb against the IPVS properties selected in data->bitmask. */
+	const struct xt_ipvs_mtinfo *data = par->matchinfo;
+	/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
+	const u_int8_t family = par->family;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	bool match = true;
+
+	if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {	/* only the skb flag is tested */
+		match = skb->ipvs_property ^
+			!!(data->invert & XT_IPVS_IPVS_PROPERTY);
+		goto out;
+	}
+
+	/* other flags than XT_IPVS_IPVS_PROPERTY are set */
+	if (!skb->ipvs_property) {
+		match = false;
+		goto out;
+	}
+
+	ip_vs_fill_iphdr(family, skb_network_header(skb), &iph);
+
+	if (data->bitmask & XT_IPVS_PROTO)
+		if ((iph.protocol == data->l4proto) ^
+		    !(data->invert & XT_IPVS_PROTO)) {	/* (equal) XOR (not inverted): test failed */
+			match = false;
+			goto out;
+		}
+
+	pp = ip_vs_proto_get(iph.protocol);
+	if (unlikely(!pp)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */);
+	if (unlikely(cp == NULL)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * We found a connection, i.e. cp != NULL, make sure to call
+	 * __ip_vs_conn_put before returning.  In our case jump to out_put_cp.
+	 */
+
+	if (data->bitmask & XT_IPVS_VPORT)
+		if ((cp->vport == data->vport) ^
+		    !(data->invert & XT_IPVS_VPORT)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VPORTCTL)
+		if ((cp->control != NULL &&
+		     cp->control->vport == data->vportctl) ^
+		    !(data->invert & XT_IPVS_VPORTCTL)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_DIR) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct == NULL || nf_ct_is_untracked(ct)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+		if ((ctinfo >= IP_CT_IS_REPLY) ^	/* NOTE(review): uses !!invert, unlike the other tests - confirm */
+		    !!(data->invert & XT_IPVS_DIR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+	if (data->bitmask & XT_IPVS_METHOD)
+		if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^
+		    !(data->invert & XT_IPVS_METHOD)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VADDR) {
+		if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr,
+				    &data->vmask, family) ^
+		    !(data->invert & XT_IPVS_VADDR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+out_put_cp:	/* reached with cp held, whether or not a test failed */
+	__ip_vs_conn_put(cp);
+out:
+	pr_debug("match=%d\n", match);
+	return match;
+}
+
+static int ipvs_mt_check(const struct xt_mtchk_param *par)
+{	/* Rule check: accept IPv4, plus IPv6 when CONFIG_IP_VS_IPV6; else -EINVAL. */
+	if (par->family != NFPROTO_IPV4
+#ifdef CONFIG_IP_VS_IPV6
+	    && par->family != NFPROTO_IPV6
+#endif
+		) {
+		pr_info("protocol family %u not supported\n", par->family);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match xt_ipvs_mt_reg __read_mostly = {	/* "ipvs" match, revision 0 */
+	.name       = "ipvs",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,	/* the family is narrowed in ipvs_mt_check() */
+	.match      = ipvs_mt,
+	.checkentry = ipvs_mt_check,
+	.matchsize  = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+	.me         = THIS_MODULE,
+};
+
+static int __init ipvs_mt_init(void)
+{	/* Module init: register the "ipvs" match. */
+	return xt_register_match(&xt_ipvs_mt_reg);
+}
+
+static void __exit ipvs_mt_exit(void)
+{	/* Module exit: unregister the "ipvs" match. */
+	xt_unregister_match(&xt_ipvs_mt_reg);
+}
+
+module_init(ipvs_mt_init);
+module_exit(ipvs_mt_exit);
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
new file mode 100644
index 00000000..176e5570
--- /dev/null
+++ b/net/netfilter/xt_length.c
@@ -0,0 +1,70 @@
+/* Kernel module to match packet length. */
+/* (C) 1999-2001 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+
+#include <linux/netfilter/xt_length.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Xtables: Packet length (Layer3,4,5) match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_length");
+MODULE_ALIAS("ip6t_length");
+
+static bool
+length_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when the IPv4 total length lies in [min, max], XOR invert. */
+	const struct xt_length_info *range = par->matchinfo;
+	const u_int16_t total = ntohs(ip_hdr(skb)->tot_len);
+
+	return (range->min <= total && total <= range->max) ^ range->invert;
+}
+
+static bool
+length_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when payload_len + fixed IPv6 header lies in [min, max], XOR invert. */
+	const struct xt_length_info *info = par->matchinfo;
+	const unsigned int pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
+				    sizeof(struct ipv6hdr);	/* wider than 16 bits: the sum cannot wrap */
+
+	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static struct xt_match length_mt_reg[] __read_mostly = {
+	{	/* "length" for IPv4 */
+		.name		= "length",
+		.family		= NFPROTO_IPV4,
+		.match		= length_mt,
+		.matchsize	= sizeof(struct xt_length_info),
+		.me		= THIS_MODULE,
+	},
+	{	/* "length" for IPv6 */
+		.name		= "length",
+		.family		= NFPROTO_IPV6,
+		.match		= length_mt6,
+		.matchsize	= sizeof(struct xt_length_info),
+		.me		= THIS_MODULE,
+	},
+};
+
+static int __init length_mt_init(void)
+{	/* Module init: register the IPv4 and IPv6 "length" matches. */
+	return xt_register_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg));
+}
+
+static void __exit length_mt_exit(void)
+{	/* Module exit: unregister both "length" matches. */
+	xt_unregister_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg));
+}
+
+module_init(length_mt_init);
+module_exit(length_mt_exit);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
new file mode 100644
index 00000000..32b7a579
--- /dev/null
+++ b/net/netfilter/xt_limit.c
@@ -0,0 +1,210 @@
+/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_limit.h>
+
+struct xt_limit_priv {	/* kernel-side bucket state, reached through xt_rateinfo.master */
+	unsigned long prev;	/* jiffies value stored by the last limit_mt() call */
+	uint32_t credit;	/* credit currently available */
+};
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
+MODULE_DESCRIPTION("Xtables: rate-limit match");
+MODULE_ALIAS("ipt_limit");
+MODULE_ALIAS("ip6t_limit");
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+static DEFINE_SPINLOCK(limit_lock);
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maximum range, we multiply by this factor (ie. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))	/* credits/jiffy bound so one day fits in 32 bits */
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)	/* credits earned per elapsed jiffy */
+
+static bool
+limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Token bucket: match (and pay `cost') while enough credit remains. */
+	const struct xt_rateinfo *r = par->matchinfo;
+	struct xt_limit_priv *priv = r->master;
+	unsigned long now = jiffies;
+	bool allowed;
+
+	spin_lock_bh(&limit_lock);
+	/* Earn credit for the jiffies elapsed since the previous call. */
+	priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
+	if (priv->credit > r->credit_cap)
+		priv->credit = r->credit_cap;
+
+	/* We're not limited as long as the bucket can pay for this packet. */
+	allowed = priv->credit >= r->cost;
+	if (allowed)
+		priv->credit -= r->cost;
+	spin_unlock_bh(&limit_lock);
+
+	return allowed;
+}
+
+/* Precision saver. */
+static u_int32_t
+user2credits(u_int32_t user)
+{	/* Scale a user-supplied rate value to credits, guarding against overflow. */
+	/* Small enough that multiplying first stays within 32 bits? */
+	if (user <= 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE;
+
+	/* Otherwise divide first, trading precision for range. */
+	return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+}
+
+static int limit_mt_check(const struct xt_mtchk_param *par)
+{	/* Validate avg/burst and allocate bucket state; 0, -ERANGE or -ENOMEM. */
+	struct xt_rateinfo *r = par->matchinfo;
+	struct xt_limit_priv *priv;
+
+	/* Check for overflow. */
+	if (r->burst == 0
+	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+		pr_info("Overflow, try lower: %u/%u\n",
+			r->avg, r->burst);
+		return -ERANGE;
+	}
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	/* For SMP, we only want to use one set of state. */
+	r->master = priv;
+	/* priv is fresh kmalloc() memory: seed it even if cost is already set. */
+	/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * 128. */
+	priv->prev = jiffies;
+	priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
+	if (r->cost == 0) {	/* cap/cost not filled in yet: derive them */
+		r->credit_cap = priv->credit; /* Credits full. */
+		r->cost = user2credits(r->avg);
+	}
+	return 0;
+}
+
+static void limit_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* Rule teardown: free the bucket state allocated in limit_mt_check(). */
+	const struct xt_rateinfo *info = par->matchinfo;
+
+	kfree(info->master);
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_rateinfo {	/* xt_rateinfo as exchanged with compat (32-bit) userspace */
+	u_int32_t avg;
+	u_int32_t burst;
+
+	compat_ulong_t prev;	/* low part of the native `prev' */
+	u_int32_t credit;
+	u_int32_t credit_cap, cost;
+
+	u_int32_t master;	/* carries the upper 32 bits of `prev', not a pointer */
+};
+
+/* To keep the full "prev" timestamp, the upper 32 bits are stored in the
+ * master pointer, which does not need to be preserved. */
+static void limit_mt_compat_from_user(void *dst, const void *src)
+{	/* Expand the compat rateinfo at src into the native layout at dst. */
+	const struct compat_xt_rateinfo *cm = src;
+	struct xt_rateinfo m = {
+		.avg		= cm->avg,
+		.burst		= cm->burst,
+		.prev		= cm->prev | (unsigned long)cm->master << 32,	/* rejoin both halves */
+		.credit		= cm->credit,
+		.credit_cap	= cm->credit_cap,
+		.cost		= cm->cost,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int limit_mt_compat_to_user(void __user *dst, const void *src)
+{	/* Copy the native rateinfo at src to userspace in compat layout; 0 or -EFAULT. */
+	const struct xt_rateinfo *m = src;
+	struct compat_xt_rateinfo cm = {
+		.avg		= m->avg,
+		.burst		= m->burst,
+		.prev		= m->prev,
+		.credit		= m->credit,
+		.credit_cap	= m->credit_cap,
+		.cost		= m->cost,
+		.master		= m->prev >> 32,	/* upper half of prev rides in `master' */
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match limit_mt_reg __read_mostly = {	/* "limit" match, revision 0 */
+	.name             = "limit",
+	.revision         = 0,
+	.family           = NFPROTO_UNSPEC,
+	.match            = limit_mt,
+	.checkentry       = limit_mt_check,
+	.destroy          = limit_mt_destroy,
+	.matchsize        = sizeof(struct xt_rateinfo),
+#ifdef CONFIG_COMPAT
+	.compatsize       = sizeof(struct compat_xt_rateinfo),	/* size of the compat layout */
+	.compat_from_user = limit_mt_compat_from_user,
+	.compat_to_user   = limit_mt_compat_to_user,
+#endif
+	.me               = THIS_MODULE,
+};
+
+static int __init limit_mt_init(void)
+{	/* Module init: register the "limit" match. */
+	return xt_register_match(&limit_mt_reg);
+}
+
+static void __exit limit_mt_exit(void)
+{	/* Module exit: unregister the "limit" match. */
+	xt_unregister_match(&limit_mt_reg);
+}
+
+module_init(limit_mt_init);
+module_exit(limit_mt_exit);
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
new file mode 100644
index 00000000..8160f6b1
--- /dev/null
+++ b/net/netfilter/xt_mac.c
@@ -0,0 +1,66 @@
+/* Kernel module to match MAC address parameters. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_mac.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: MAC address match");
+MODULE_ALIAS("ipt_mac");
+MODULE_ALIAS("ip6t_mac");
+
+static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match the Ethernet source address of skb against the rule's address. */
+	const struct xt_mac_info *info = par->matchinfo;
+	bool same_src;
+
+	if (skb->dev == NULL || skb->dev->type != ARPHRD_ETHER)
+		return false;
+	/* The Ethernet header must lie between skb->head and skb->data. */
+	if (skb_mac_header(skb) < skb->head ||
+	    skb_mac_header(skb) + ETH_HLEN > skb->data)
+		return false;
+
+	same_src = compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr) == 0;
+	return same_src ^ info->invert;
+}
+
+static struct xt_match mac_mt_reg __read_mostly = {	/* "mac" match, revision 0 */
+	.name      = "mac",
+	.revision  = 0,
+	.family    = NFPROTO_UNSPEC,
+	.match     = mac_mt,
+	.matchsize = sizeof(struct xt_mac_info),
+	.hooks     = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) |	/* usable only in these hooks */
+	             (1 << NF_INET_FORWARD),
+	.me        = THIS_MODULE,
+};
+
+static int __init mac_mt_init(void)
+{	/* Module init: register the "mac" match. */
+	return xt_register_match(&mac_mt_reg);
+}
+
+static void __exit mac_mt_exit(void)
+{	/* Module exit: unregister the "mac" match. */
+	xt_unregister_match(&mac_mt_reg);
+}
+
+module_init(mac_mt_init);
+module_exit(mac_mt_exit);
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
new file mode 100644
index 00000000..23345238
--- /dev/null
+++ b/net/netfilter/xt_mark.c
@@ -0,0 +1,84 @@
+/*
+ *	xt_mark - Netfilter module to match NFMARK value
+ *
+ *	(C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *	Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *	Jan Engelhardt <jengelh@medozas.de>
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/xt_mark.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("Xtables: packet mark operations");
+MODULE_ALIAS("ipt_mark");
+MODULE_ALIAS("ip6t_mark");
+MODULE_ALIAS("ipt_MARK");
+MODULE_ALIAS("ip6t_MARK");
+
+static unsigned int
+mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{	/* Target: clear the bits in info->mask, then XOR info->mark into skb->mark. */
+	const struct xt_mark_tginfo2 *info = par->targinfo;
+
+	skb->mark = (skb->mark & ~info->mask) ^ info->mark;
+	return XT_CONTINUE;
+}
+
+static bool
+mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Match when the masked skb->mark equals info->mark, XOR info->invert. */
+	const struct xt_mark_mtinfo1 *info = par->matchinfo;
+
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static struct xt_target mark_tg_reg __read_mostly = {	/* "MARK" target, revision 2 */
+	.name           = "MARK",
+	.revision       = 2,
+	.family         = NFPROTO_UNSPEC,
+	.target         = mark_tg,
+	.targetsize     = sizeof(struct xt_mark_tginfo2),
+	.me             = THIS_MODULE,
+};
+
+static struct xt_match mark_mt_reg __read_mostly = {	/* "mark" match, revision 1 */
+	.name           = "mark",
+	.revision       = 1,
+	.family         = NFPROTO_UNSPEC,
+	.match          = mark_mt,
+	.matchsize      = sizeof(struct xt_mark_mtinfo1),
+	.me             = THIS_MODULE,
+};
+
+static int __init mark_mt_init(void)
+{	/* Register the MARK target, then the mark match; roll back on failure. */
+	int err = xt_register_target(&mark_tg_reg);
+
+	if (err < 0)
+		return err;
+	err = xt_register_match(&mark_mt_reg);
+	if (err >= 0)
+		return 0;
+
+	/* The match could not be registered: take the target out again. */
+	xt_unregister_target(&mark_tg_reg);
+	return err;
+}
+
+static void __exit mark_mt_exit(void)
+{	/* Module exit: unregister in the reverse order of mark_mt_init(). */
+	xt_unregister_match(&mark_mt_reg);
+	xt_unregister_target(&mark_tg_reg);
+}
+
+module_init(mark_mt_init);
+module_exit(mark_mt_exit);
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
new file mode 100644
index 00000000..ac1d3c3d
--- /dev/null
+++ b/net/netfilter/xt_multiport.c
@@ -0,0 +1,165 @@
+/* Kernel module to match one of a list of TCP/UDP(-Lite)/SCTP/DCCP ports:
+   ports are in the same place so we can treat them as equal. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+
+#include <linux/netfilter/xt_multiport.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP");
+MODULE_ALIAS("ipt_multiport");
+MODULE_ALIAS("ip6t_multiport");
+
+/* True if src/dst hits a listed port or range, XORed with minfo->invert. */
+static inline bool
+ports_match_v1(const struct xt_multiport_v1 *minfo,
+	       u_int16_t src, u_int16_t dst)
+{
+	unsigned int i;
+	u_int16_t s, e;
+
+	for (i = 0; i < minfo->count; i++) {
+		s = minfo->ports[i];
+
+		if (minfo->pflags[i]) {
+			/* range s..ports[i+1]; NOTE(review): ++i is not re-checked against count */
+			e = minfo->ports[++i];
+			pr_debug("src or dst matches with %d-%d?\n", s, e);
+
+			if (minfo->flags == XT_MULTIPORT_SOURCE
+			    && src >= s && src <= e)
+				return true ^ minfo->invert;
+			if (minfo->flags == XT_MULTIPORT_DESTINATION
+			    && dst >= s && dst <= e)
+				return true ^ minfo->invert;
+			if (minfo->flags == XT_MULTIPORT_EITHER
+			    && ((dst >= s && dst <= e)
+				|| (src >= s && src <= e)))
+				return true ^ minfo->invert;
+		} else {
+			/* exact port matching */
+			pr_debug("src or dst matches with %d?\n", s);
+
+			if (minfo->flags == XT_MULTIPORT_SOURCE
+			    && src == s)
+				return true ^ minfo->invert;
+			if (minfo->flags == XT_MULTIPORT_DESTINATION
+			    && dst == s)
+				return true ^ minfo->invert;
+			if (minfo->flags == XT_MULTIPORT_EITHER
+			    && (src == s || dst == s))
+				return true ^ minfo->invert;
+		}
+	}
+
+	return minfo->invert;	/* no listed port or range was hit */
+}
+
+static bool	/* match hook shared by the IPv4 and IPv6 "multiport" entries */
+multiport_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const __be16 *pptr;
+	__be16 _ports[2];	/* [0] is passed on as source port, [1] as destination */
+	const struct xt_multiport_v1 *multiinfo = par->matchinfo;
+
+	if (par->fragoff != 0)	/* non-zero fragment offset: never matches */
+		return false;
+
+	pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		pr_debug("Dropping evil offset=0 tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+static inline bool
+check(u_int16_t proto, u_int8_t ip_invflags,
+      u_int8_t match_flags, u_int8_t count)
+{
+	/* Only the listed protocols, and the protocol test must not be negated. */
+	if ((proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
+	     proto != IPPROTO_UDPLITE && proto != IPPROTO_SCTP &&
+	     proto != IPPROTO_DCCP) || (ip_invflags & XT_INV_PROTO))
+		return false;
+	/* The direction selector must be one of the three known values. */
+	if (match_flags != XT_MULTIPORT_SOURCE &&
+	    match_flags != XT_MULTIPORT_DESTINATION &&
+	    match_flags != XT_MULTIPORT_EITHER)
+		return false;
+	return count <= XT_MULTI_PORTS;	/* count may not exceed XT_MULTI_PORTS */
+}
+
+static int multiport_mt_check(const struct xt_mtchk_param *par)
+{	/* IPv4 checkentry: 0 when check() accepts the rule, else -EINVAL */
+	const struct ipt_ip *ip = par->entryinfo;
+	const struct xt_multiport_v1 *multiinfo = par->matchinfo;
+
+	return check(ip->proto, ip->invflags, multiinfo->flags,
+		     multiinfo->count) ? 0 : -EINVAL;
+}
+
+static int multiport_mt6_check(const struct xt_mtchk_param *par)
+{	/* IPv6 checkentry: 0 when check() accepts the rule, else -EINVAL */
+	const struct ip6t_ip6 *ip = par->entryinfo;
+	const struct xt_multiport_v1 *multiinfo = par->matchinfo;
+
+	return check(ip->proto, ip->invflags, multiinfo->flags,
+		     multiinfo->count) ? 0 : -EINVAL;
+}
+
+static struct xt_match multiport_mt_reg[] __read_mostly = {
+	{	/* IPv4 entry */
+		.name		= "multiport",
+		.family		= NFPROTO_IPV4,
+		.revision	= 1,
+		.checkentry	= multiport_mt_check,
+		.match		= multiport_mt,
+		.matchsize	= sizeof(struct xt_multiport_v1),
+		.me		= THIS_MODULE,
+	},
+	{	/* IPv6 entry: same match hook, IPv6-specific checkentry */
+		.name		= "multiport",
+		.family		= NFPROTO_IPV6,
+		.revision	= 1,
+		.checkentry	= multiport_mt6_check,
+		.match		= multiport_mt,
+		.matchsize	= sizeof(struct xt_multiport_v1),
+		.me		= THIS_MODULE,
+	},
+};
+
+static int __init multiport_mt_init(void)
+{	/* Register every entry of multiport_mt_reg[]; returns the helper's result. */
+	return xt_register_matches(multiport_mt_reg,
+	       ARRAY_SIZE(multiport_mt_reg));
+}
+
+static void __exit multiport_mt_exit(void)
+{	/* Unregister every entry of multiport_mt_reg[]. */
+	xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg));
+}
+
+module_init(multiport_mt_init);
+module_exit(multiport_mt_exit);
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
new file mode 100644
index 00000000..846f895c
--- /dev/null
+++ b/net/netfilter/xt_osf.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2003+ Evgeniy Polyakov <zbr@ioremap.net>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/xt_osf.h>
+
+struct xt_osf_finger {	/* one stored fingerprint */
+	struct rcu_head			rcu_head;	/* handed to kfree_rcu() on removal */
+	struct list_head		finger_entry;	/* link in xt_osf_fingers[] */
+	struct xt_osf_user_finger	finger;		/* copy of the netlink-supplied data */
+};
+
+enum osf_fmatch_states {	/* per-fingerprint verdict in xt_osf_match_packet() */
+	/* Packet does not match the fingerprint */
+	FMATCH_WRONG = 0,
+	/* Packet matches the fingerprint */
+	FMATCH_OK,
+	/* Options do not match the fingerprint, but header does */
+	FMATCH_OPT_WRONG,
+};
+
+/*
+ * Fingerprint lists, indexed by the dont-fragment bit (0 or 1).
+ * It is the only constant value in the fingerprint.
+ */
+static struct list_head xt_osf_fingers[2];
+
+static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {	/* used by both callbacks */
+	[OSF_ATTR_FINGER]	= { .len = sizeof(struct xt_osf_user_finger) },
+};
+
+static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb,
+			       const struct nlmsghdr *nlh,
+			       const struct nlattr * const osf_attrs[])
+{	/* OSF_MSG_ADD: store a fingerprint; returns 0, -EINVAL, -ENOMEM or -EEXIST */
+	struct xt_osf_user_finger *f;
+	struct xt_osf_finger *kf = NULL, *sf;
+	int err = 0;
+
+	if (!osf_attrs[OSF_ATTR_FINGER] || !(nlh->nlmsg_flags & NLM_F_CREATE))
+		return -EINVAL;
+
+	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+	/* Userspace data: bound the opt[] walk and require NUL-terminated names. */
+	if (f->opt_num > ARRAY_SIZE(f->opt) ||
+	    !memchr(f->genre, 0, sizeof(f->genre)) ||
+	    !memchr(f->version, 0, sizeof(f->version)) ||
+	    !memchr(f->subtype, 0, sizeof(f->subtype)))
+		return -EINVAL;
+
+	kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL);
+	if (!kf)
+		return -ENOMEM;
+
+	memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger));
+	/* A byte-identical fingerprint already listed makes the copy redundant. */
+	list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
+		if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
+			continue;
+
+		kfree(kf);
+		kf = NULL;
+
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			err = -EEXIST;
+		break;
+	}
+	/* We are protected by nfnl mutex. */
+	if (kf)
+		list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]);
+
+	return err;
+}
+
+static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb,
+				  const struct nlmsghdr *nlh,
+				  const struct nlattr * const osf_attrs[])
+{	/* OSF_MSG_REMOVE: drop the first identical fingerprint; 0, -EINVAL or -ENOENT */
+	struct xt_osf_user_finger *f;
+	struct xt_osf_finger *sf;
+	int err = -ENOENT;
+
+	if (!osf_attrs[OSF_ATTR_FINGER])
+		return -EINVAL;
+
+	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+	list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
+		if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
+			continue;
+
+		/* We are protected by nfnl mutex.  The matcher walks this list
+		 * under rcu_read_lock(), hence list_del_rcu() + kfree_rcu().
+		 */
+		list_del_rcu(&sf->finger_entry);
+		kfree_rcu(sf, rcu_head);
+
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = {
+	[OSF_MSG_ADD]	= {	/* add a fingerprint */
+		.call		= xt_osf_add_callback,
+		.attr_count	= OSF_ATTR_MAX,
+		.policy		= xt_osf_policy,
+	},
+	[OSF_MSG_REMOVE]	= {	/* remove a fingerprint */
+		.call		= xt_osf_remove_callback,
+		.attr_count	= OSF_ATTR_MAX,
+		.policy		= xt_osf_policy,
+	},
+};
+
+static const struct nfnetlink_subsystem xt_osf_nfnetlink = {	/* registered in xt_osf_init() */
+	.name			= "osf",
+	.subsys_id		= NFNL_SUBSYS_OSF,
+	.cb_count		= OSF_MSG_MAX,
+	.cb			= xt_osf_nfnetlink_callbacks,
+};
+
+static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
+			    unsigned char f_ttl)
+{	/* Non-zero when the packet's TTL is acceptable for fingerprint TTL f_ttl. */
+	const struct iphdr *ip = ip_hdr(skb);
+
+	if (info->flags & XT_OSF_TTL) {
+		if (info->ttl == XT_OSF_TTL_TRUE)
+			return ip->ttl == f_ttl;
+		if (info->ttl == XT_OSF_TTL_NOCHECK)
+			return 1;
+		else if (ip->ttl <= f_ttl)
+			return 1;
+		else {
+			struct in_device *in_dev = __in_dev_get_rcu(skb->dev);	/* NOTE(review): not NULL-checked */
+			int ret = 0;
+			/* Look for an address of skb->dev that matches the source. */
+			for_ifa(in_dev) {
+				if (inet_ifa_match(ip->saddr, ifa)) {
+					/* NOTE(review): reached only with ttl > f_ttl, so this is never true */
+					ret = (ip->ttl == f_ttl);
+					break;
+				}
+			}
+			endfor_ifa(in_dev);
+
+			return ret;
+		}
+	}
+
+	return ip->ttl == f_ttl;	/* XT_OSF_TTL not set: exact comparison */
+}
+
+static bool	/* "osf" match hook: true when a TCP SYN fits a stored fingerprint */
+xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
+{
+	const struct xt_osf_info *info = p->matchinfo;
+	const struct iphdr *ip = ip_hdr(skb);
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+	int fmatch = FMATCH_WRONG, fcount = 0;
+	unsigned int optsize = 0, check_WSS = 0;
+	u16 window, totlen, mss = 0;
+	bool df;
+	const unsigned char *optp = NULL, *_optp = NULL;
+	unsigned char opts[MAX_IPOPTLEN];
+	const struct xt_osf_finger *kf;
+	const struct xt_osf_user_finger *f;
+
+	if (!info)
+		return false;
+
+	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+	if (!tcp)
+		return false;
+
+	if (!tcp->syn)	/* only SYN segments are examined */
+		return false;
+
+	totlen = ntohs(ip->tot_len);
+	df = ntohs(ip->frag_off) & IP_DF;	/* selects which fingerprint list to walk */
+	window = ntohs(tcp->window);
+
+	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+		_optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+				sizeof(struct tcphdr), optsize, opts);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
+		f = &kf->finger;
+
+		if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
+			continue;
+
+		optp = _optp;	/* rewind to the first option for every fingerprint */
+		fmatch = FMATCH_WRONG;
+
+		if (totlen == f->ss && xt_osf_ttl(skb, info, f->ttl)) {
+			int foptsize, optnum;
+
+			/* Should not happen if userspace parser was written correctly;
+			 * opt_num is bounded so the opt[] walks stay inside the array. */
+			if (f->wss.wc >= OSF_WSS_MAX ||
+			    f->opt_num > ARRAY_SIZE(f->opt))
+				continue;
+
+			/* Check options; optp is NULL when the packet has none or the
+			 * option bytes could not be read, and then no option can match. */
+			foptsize = 0;
+			for (optnum = 0; optnum < f->opt_num; ++optnum)
+				foptsize += f->opt[optnum].length;
+
+			if (foptsize > MAX_IPOPTLEN ||
+				optsize > MAX_IPOPTLEN ||
+				optsize != foptsize)
+				continue;
+
+			check_WSS = f->wss.wc;
+
+			for (optnum = 0; optnum < f->opt_num; ++optnum) {
+				if (optp && f->opt[optnum].kind == (*optp)) {
+					__u32 len = f->opt[optnum].length;
+					const __u8 *optend = optp + len;
+					int loop_cont = 0;
+
+					fmatch = FMATCH_OK;
+
+					switch (*optp) {
+					case OSFOPT_MSS:
+						mss = optp[3];
+						mss <<= 8;
+						mss |= optp[2];
+
+						mss = ntohs(mss);
+						break;
+					case OSFOPT_TS:
+						loop_cont = 1;
+						break;
+					}
+
+					optp = optend;
+				} else
+					fmatch = FMATCH_OPT_WRONG;
+
+				if (fmatch != FMATCH_OK)
+					break;
+			}
+
+			if (fmatch != FMATCH_OPT_WRONG) {
+				fmatch = FMATCH_WRONG;
+
+				switch (check_WSS) {
+				case OSF_WSS_PLAIN:
+					if (f->wss.val == 0 || window == f->wss.val)
+						fmatch = FMATCH_OK;
+					break;
+				case OSF_WSS_MSS:
+					/*
+					 * Some smart modems mangle (decrease) the MSS to
+					 * SMART_MSS_2, so we check standard, decreased
+					 * and the one provided in the fingerprint MSS
+					 * values.
+					 */
+#define SMART_MSS_1	1460
+#define SMART_MSS_2	1448
+					if (window == f->wss.val * mss ||
+					    window == f->wss.val * SMART_MSS_1 ||
+					    window == f->wss.val * SMART_MSS_2)
+						fmatch = FMATCH_OK;
+					break;
+				case OSF_WSS_MTU:
+					if (window == f->wss.val * (mss + 40) ||
+					    window == f->wss.val * (SMART_MSS_1 + 40) ||
+					    window == f->wss.val * (SMART_MSS_2 + 40))
+						fmatch = FMATCH_OK;
+					break;
+				case OSF_WSS_MODULO:	/* val == 0 would divide by zero */
+					if (f->wss.val && (window % f->wss.val) == 0)
+						fmatch = FMATCH_OK;
+					break;
+				}
+			}
+
+			if (fmatch != FMATCH_OK)
+				continue;
+
+			fcount++;
+
+			if (info->flags & XT_OSF_LOG)
+				nf_log_packet(p->family, p->hooknum, skb,
+					p->in, p->out, NULL,
+					"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+					f->genre, f->version, f->subtype,
+					&ip->saddr, ntohs(tcp->source),
+					&ip->daddr, ntohs(tcp->dest),
+					f->ttl - ip->ttl);
+
+			if ((info->flags & XT_OSF_LOG) &&
+			    info->loglevel == XT_OSF_LOGLEVEL_FIRST)
+				break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!fcount && (info->flags & XT_OSF_LOG))
+		nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL,
+			"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+				&ip->saddr, ntohs(tcp->source),
+				&ip->daddr, ntohs(tcp->dest));
+
+	if (fcount)	/* any earlier hit counts, whatever the last entry said */
+		fmatch = FMATCH_OK;
+
+	return fmatch == FMATCH_OK;
+}
+
+static struct xt_match xt_osf_match = {	/* the "osf" match: IPv4, TCP only */
+	.name 		= "osf",
+	.revision	= 0,
+	.family		= NFPROTO_IPV4,
+	.proto		= IPPROTO_TCP,
+	.hooks      	= (1 << NF_INET_LOCAL_IN) |
+				(1 << NF_INET_PRE_ROUTING) |
+				(1 << NF_INET_FORWARD),
+	.match 		= xt_osf_match_packet,
+	.matchsize	= sizeof(struct xt_osf_info),
+	.me		= THIS_MODULE,
+};
+
+static int __init xt_osf_init(void)
+{
+	int rc;
+	int idx;
+
+	/* Both fingerprint lists (one per DF bit value) start out empty. */
+	for (idx = 0; idx < ARRAY_SIZE(xt_osf_fingers); idx++)
+		INIT_LIST_HEAD(&xt_osf_fingers[idx]);
+
+	/* Step 1: the nfnetlink subsystem that adds/removes fingerprints. */
+	rc = nfnetlink_subsys_register(&xt_osf_nfnetlink);
+	if (rc < 0) {
+		pr_err("Failed to register OSF nsfnetlink helper (%d)\n", rc);
+		return rc;
+	}
+
+	/* Step 2: the xtables match itself. */
+	rc = xt_register_match(&xt_osf_match);
+	if (!rc)
+		return 0;
+
+	/* Step 2 failed: report it and undo step 1. */
+	pr_err("Failed to register OS fingerprint "
+	       "matching module (%d)\n", rc);
+	nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
+
+	return rc;
+}
+
+static void __exit xt_osf_fini(void)
+{	/* Unregister both interfaces, then release every stored fingerprint. */
+	struct xt_osf_finger *f;
+	int i;
+
+	nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
+	xt_unregister_match(&xt_osf_match);
+
+	rcu_read_lock();
+	for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) {
+		/* Entries are unlinked here; kfree_rcu() defers the actual free. */
+		list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
+			list_del_rcu(&f->finger_entry);
+			kfree_rcu(f, rcu_head);
+		}
+	}
+	rcu_read_unlock();
+
+	rcu_barrier();	/* presumably waits for the queued kfree_rcu() callbacks - confirm */
+}
+
+module_init(xt_osf_init);
+module_exit(xt_osf_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_DESCRIPTION("Passive OS fingerprint matching.");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
new file mode 100644
index 00000000..772d7389
--- /dev/null
+++ b/net/netfilter/xt_owner.c
@@ -0,0 +1,82 @@
+/*
+ * Kernel module to match various things tied to sockets associated with
+ * locally generated outgoing packets.
+ *
+ * (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_owner.h>
+
+static bool	/* "owner" match hook: socket existence and file-owner uid/gid ranges */
+owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_owner_match_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return (info->match ^ info->invert) == 0;	/* no socket: all requested tests must be inverted */
+	else if (info->match & info->invert & XT_OWNER_SOCKET)
+		/*
+		 * Socket exists but user wanted ! --socket-exists.
+		 * (Single ampersands intended.)
+		 */
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)	/* no file: any uid/gid test must be inverted to match */
+		return ((info->match ^ info->invert) &
+		       (XT_OWNER_UID | XT_OWNER_GID)) == 0;
+
+	if (info->match & XT_OWNER_UID)	/* fsuid within [uid_min, uid_max], honouring invert */
+		if ((filp->f_cred->fsuid >= info->uid_min &&
+		    filp->f_cred->fsuid <= info->uid_max) ^
+		    !(info->invert & XT_OWNER_UID))
+			return false;
+
+	if (info->match & XT_OWNER_GID)	/* fsgid within [gid_min, gid_max], honouring invert */
+		if ((filp->f_cred->fsgid >= info->gid_min &&
+		    filp->f_cred->fsgid <= info->gid_max) ^
+		    !(info->invert & XT_OWNER_GID))
+			return false;
+
+	return true;
+}
+
+static struct xt_match owner_mt_reg __read_mostly = {	/* the "owner" match */
+	.name       = "owner",
+	.revision   = 1,
+	.family     = NFPROTO_UNSPEC,
+	.match      = owner_mt,
+	.matchsize  = sizeof(struct xt_owner_match_info),
+	.hooks      = (1 << NF_INET_LOCAL_OUT) |	/* restricted to these two hooks */
+	              (1 << NF_INET_POST_ROUTING),
+	.me         = THIS_MODULE,
+};
+
+static int __init owner_mt_init(void)
+{	/* Module load: register the "owner" match. */
+	return xt_register_match(&owner_mt_reg);
+}
+
+static void __exit owner_mt_exit(void)
+{	/* Module unload: unregister the "owner" match. */
+	xt_unregister_match(&owner_mt_reg);
+}
+
+module_init(owner_mt_init);
+module_exit(owner_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: socket owner matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
new file mode 100644
index 00000000..d7ca16b8
--- /dev/null
+++ b/net/netfilter/xt_physdev.c
@@ -0,0 +1,128 @@
+/* Kernel module to match the bridge port in and
+ * out device for IP packets coming into contact with a bridge. */
+
+/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter/xt_physdev.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("Xtables: Bridge physical device match");
+MODULE_ALIAS("ipt_physdev");
+MODULE_ALIAS("ip6t_physdev");
+
+
+static bool	/* "physdev" match hook: tests skb->nf_bridge against the rule */
+physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	const struct xt_physdev_info *info = par->matchinfo;
+	unsigned long ret;
+	const char *indev, *outdev;
+	const struct nf_bridge_info *nf_bridge;
+
+	/* Not a bridged IP packet or no info available yet:
+	 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
+	 * the destination device will be a bridge. */
+	if (!(nf_bridge = skb->nf_bridge)) {
+		/* Return MATCH if the invert flags of the used options are on */
+		if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
+		    !(info->invert & XT_PHYSDEV_OP_BRIDGED))
+			return false;
+		if ((info->bitmask & XT_PHYSDEV_OP_ISIN) &&
+		    !(info->invert & XT_PHYSDEV_OP_ISIN))
+			return false;
+		if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) &&
+		    !(info->invert & XT_PHYSDEV_OP_ISOUT))
+			return false;
+		if ((info->bitmask & XT_PHYSDEV_OP_IN) &&
+		    !(info->invert & XT_PHYSDEV_OP_IN))
+			return false;
+		if ((info->bitmask & XT_PHYSDEV_OP_OUT) &&
+		    !(info->invert & XT_PHYSDEV_OP_OUT))
+			return false;
+		return true;
+	}
+
+	/* This only makes sense in the FORWARD and POSTROUTING chains */
+	if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) &&
+	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
+	    !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
+		return false;
+
+	if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&	/* ISIN/ISOUT: is the device pointer set? */
+	    (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) ||
+	    (info->bitmask & XT_PHYSDEV_OP_ISOUT &&
+	    (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT))))
+		return false;
+
+	if (!(info->bitmask & XT_PHYSDEV_OP_IN))
+		goto match_outdev;
+	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;	/* all-zero name if unset */
+	ret = ifname_compare_aligned(indev, info->physindev, info->in_mask);
+	/* ret == 0 is treated as "names equal under the mask". */
+	if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN))
+		return false;
+
+match_outdev:
+	if (!(info->bitmask & XT_PHYSDEV_OP_OUT))
+		return true;
+	outdev = nf_bridge->physoutdev ?
+		 nf_bridge->physoutdev->name : nulldevname;
+	ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask);
+	/* Same test as for the input device, written as the positive result. */
+	return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT));
+}
+
+static int physdev_mt_check(const struct xt_mtchk_param *par)
+{	/* checkentry: 0 if the rule is acceptable, -EINVAL otherwise */
+	const struct xt_physdev_info *info = par->matchinfo;
+	/* At least one known option bit, and no unknown ones. */
+	if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
+	    info->bitmask & ~XT_PHYSDEV_OP_MASK)
+		return -EINVAL;
+	if (info->bitmask & XT_PHYSDEV_OP_OUT &&	/* --physdev-out without a positive bridged test */
+	    (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
+	     info->invert & XT_PHYSDEV_OP_BRIDGED) &&
+	    par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
+	    (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
+		pr_info("using --physdev-out in the OUTPUT, FORWARD and "
+			"POSTROUTING chains for non-bridged traffic is not "
+			"supported anymore.\n");
+		if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))	/* only LOCAL_OUT is rejected; the rest just logs */
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_match physdev_mt_reg __read_mostly = {	/* the "physdev" match */
+	.name       = "physdev",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = physdev_mt_check,
+	.match      = physdev_mt,
+	.matchsize  = sizeof(struct xt_physdev_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init physdev_mt_init(void)
+{	/* Module load: register the "physdev" match. */
+	return xt_register_match(&physdev_mt_reg);
+}
+
+static void __exit physdev_mt_exit(void)
+{	/* Module unload: unregister the "physdev" match. */
+	xt_unregister_match(&physdev_mt_reg);
+}
+
+module_init(physdev_mt_init);
+module_exit(physdev_mt_exit);
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
new file mode 100644
index 00000000..5b645cb5
--- /dev/null
+++ b/net/netfilter/xt_pkttype.c
@@ -0,0 +1,65 @@
+/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/netfilter/xt_pkttype.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
+MODULE_DESCRIPTION("Xtables: link layer packet type match");
+MODULE_ALIAS("ipt_pkttype");
+MODULE_ALIAS("ip6t_pkttype");
+
+static bool
+pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_pkttype_info *cfg = par->matchinfo;
+	u_int8_t kind = skb->pkt_type;
+
+	/* Looped-back packets are classified by their destination address. */
+	if (kind == PACKET_LOOPBACK) {
+		bool mcast = false;
+
+		if (par->family == NFPROTO_IPV4)
+			mcast = ipv4_is_multicast(ip_hdr(skb)->daddr);
+		else if (par->family == NFPROTO_IPV6)
+			mcast = ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF;
+		kind = mcast ? PACKET_MULTICAST : PACKET_BROADCAST;
+	}
+
+	return (kind == cfg->pkttype) ^ cfg->invert;
+}
+
+static struct xt_match pkttype_mt_reg __read_mostly = {	/* the "pkttype" match */
+	.name      = "pkttype",
+	.revision  = 0,
+	.family    = NFPROTO_UNSPEC,	/* pkttype_mt() itself branches on par->family */
+	.match     = pkttype_mt,
+	.matchsize = sizeof(struct xt_pkttype_info),
+	.me        = THIS_MODULE,
+};
+
+static int __init pkttype_mt_init(void)
+{	/* Module load: register the "pkttype" match. */
+	return xt_register_match(&pkttype_mt_reg);
+}
+
+static void __exit pkttype_mt_exit(void)
+{	/* Module unload: unregister the "pkttype" match. */
+	xt_unregister_match(&pkttype_mt_reg);
+}
+
+module_init(pkttype_mt_init);
+module_exit(pkttype_mt_exit);
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
new file mode 100644
index 00000000..f23e97bb
--- /dev/null
+++ b/net/netfilter/xt_policy.c
@@ -0,0 +1,188 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_policy.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: IPsec policy match");
+MODULE_LICENSE("GPL");
+
+static inline bool
+xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m,
+	    const union nf_inet_addr *a2, unsigned short family)
+{
+	/* Addresses compare equal when they agree on every bit set in the mask. */
+	if (family == NFPROTO_IPV4)
+		return !((a1->ip ^ a2->ip) & m->ip);
+	if (family == NFPROTO_IPV6)
+		return !ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6);
+
+	return false;	/* any other family never matches */
+}
+
+static bool	/* true when xfrm state x satisfies every test selected in element e */
+match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e,
+		 unsigned short family)
+{	/* Each macro passes if the field is not selected in e->match, or compares equal XOR e->invert. */
+#define MATCH_ADDR(x,y,z)	(!e->match.x ||			       \
+				 (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \
+				  ^ e->invert.x))
+#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+
+	return MATCH_ADDR(saddr, smask, &x->props.saddr) &&
+	       MATCH_ADDR(daddr, dmask, &x->id.daddr) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int	/* -1: skb has no sec_path; otherwise 1 = match, 0 = no match */
+match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info,
+		unsigned short family)
+{
+	const struct xt_policy_elem *e;
+	const struct sec_path *sp = skb->sp;
+	int strict = info->flags & XT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)	/* strict needs one element per state */
+		return 0;
+	/* Walk the states from last to first. */
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;	/* NOTE(review): negative for i < sp->len - 1 - confirm intended index */
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->xvec[i], e, family)) {
+			if (!strict)	/* non-strict: the first hit is enough */
+				return 1;
+		} else if (strict)	/* strict: every state has to match */
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int	/* -1: dst carries no xfrm state; otherwise 1 = match, 0 = no match */
+match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info,
+		 unsigned short family)
+{
+	const struct xt_policy_elem *e;
+	const struct dst_entry *dst = skb_dst(skb);
+	int strict = info->flags & XT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)	/* NOTE(review): dst is dereferenced here, yet NULL-tested in the loop below */
+		return -1;
+	/* Follow dst->child while each entry still carries an xfrm state. */
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;	/* strict pairs state i with element i; otherwise element 0 only */
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e, family)) {
+			if (!strict)	/* non-strict: the first hit is enough */
+				return 1;
+		} else if (strict)	/* strict: every state has to match */
+			return 0;
+	}
+
+	return strict ? i == info->len : 0;	/* strict also needs every element consumed */
+}
+
+static bool
+policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_policy_info *pi = par->matchinfo;
+	const bool want_none = (pi->flags & XT_POLICY_MATCH_NONE) != 0;
+	int verdict;
+
+	verdict = (pi->flags & XT_POLICY_MATCH_IN) ?
+		  match_policy_in(skb, pi, par->family) :
+		  match_policy_out(skb, pi, par->family);
+
+	/* A negative helper result means no IPsec state was attached at all. */
+	if (verdict < 0)
+		return want_none;
+
+	/* State is present: "none" rules fail, others follow the helper. */
+	return !want_none && verdict != 0;
+}
+
+static int policy_mt_check(const struct xt_mtchk_param *par)
+{	/* checkentry: 0 if the rule is acceptable, -EINVAL (with a log line) otherwise */
+	const struct xt_policy_info *info = par->matchinfo;
+	/* A direction has to be chosen. */
+	if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) {
+		pr_info("neither incoming nor outgoing policy selected\n");
+		return -EINVAL;
+	}
+	if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |	/* direction must suit the hooks in use */
+	    (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) {
+		pr_info("output policy not valid in PREROUTING and INPUT\n");
+		return -EINVAL;
+	}
+	if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
+	    (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) {
+		pr_info("input policy not valid in POSTROUTING and OUTPUT\n");
+		return -EINVAL;
+	}
+	if (info->len > XT_POLICY_MAX_ELEM) {	/* info->len indexes info->pol[] in the matchers */
+		pr_info("too many policy elements\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_match policy_mt_reg[] __read_mostly = {
+	{	/* IPv4 entry */
+		.name		= "policy",
+		.family		= NFPROTO_IPV4,
+		.checkentry 	= policy_mt_check,
+		.match		= policy_mt,
+		.matchsize	= sizeof(struct xt_policy_info),
+		.me		= THIS_MODULE,
+	},
+	{	/* IPv6 entry: same hooks, only the family differs */
+		.name		= "policy",
+		.family		= NFPROTO_IPV6,
+		.checkentry	= policy_mt_check,
+		.match		= policy_mt,
+		.matchsize	= sizeof(struct xt_policy_info),
+		.me		= THIS_MODULE,
+	},
+};
+
+static int __init policy_mt_init(void)
+{	/* Module load: register every entry of policy_mt_reg[]. */
+	return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg));
+}
+
+static void __exit policy_mt_exit(void)
+{	/* Module unload: unregister every entry of policy_mt_reg[]. */
+	xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg));
+}
+
+module_init(policy_mt_init);
+module_exit(policy_mt_exit);
+MODULE_ALIAS("ipt_policy");
+MODULE_ALIAS("ip6t_policy");
diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
new file mode 100644
index 00000000..08086d68
--- /dev/null
+++ b/net/netfilter/xt_qtaguid.c
@@ -0,0 +1,2785 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
+
+#include <linux/file.h>
+#include <linux/inetdevice.h>
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_qtaguid.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#include <linux/netfilter/xt_socket.h>
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+/*
+ * We only use the xt_socket funcs within a similar context to avoid unexpected
+ * return values.
+ * The mask below has one bit per netfilter hook: PRE_ROUTING and LOCAL_IN.
+ */
+#define XT_SOCKET_SUPPORTED_HOOKS \
+	((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN))
+
+
+/* Name and handle of this module's procfs directory. */
+static const char *module_procdirname = "xt_qtaguid";
+static struct proc_dir_entry *xt_qtaguid_procdir;
+
+/*
+ * Mode bits given to the per-interface proc entries created in
+ * iface_create_proc_worker(); tunable through the "iface_perms" parameter.
+ */
+static unsigned int proc_iface_perms = S_IRUGO;
+module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
+
+/* Stats proc file handle and its mode ("stats_perms" parameter). */
+static struct proc_dir_entry *xt_qtaguid_stats_file;
+static unsigned int proc_stats_perms = S_IRUGO;
+module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
+
+/*
+ * Ctrl proc file handle and its mode ("ctrl_perms" parameter).  With
+ * CONFIG_ANDROID_PARANOID_NETWORK the default is writable by everyone,
+ * otherwise only by the owner.
+ */
+static struct proc_dir_entry *xt_qtaguid_ctrl_file;
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO;
+#else
+static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUSR;
+#endif
+module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
+
+/*
+ * Group IDs consulted by can_read_other_uid_stats() and
+ * can_manipulate_uids().  A value of 0 disables the corresponding
+ * restriction.
+ */
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+static gid_t proc_stats_readall_gid = AID_NET_BW_STATS;
+static gid_t proc_ctrl_write_gid = AID_NET_BW_ACCT;
+#else
+/* 0 means, don't limit anybody */
+static gid_t proc_stats_readall_gid;
+static gid_t proc_ctrl_write_gid;
+#endif
+module_param_named(stats_readall_gid, proc_stats_readall_gid, uint,
+		   S_IRUGO | S_IWUSR);
+module_param_named(ctrl_write_gid, proc_ctrl_write_gid, uint,
+		   S_IRUGO | S_IWUSR);
+
+/*
+ * Limit the number of active tags (via socket tags) for a given UID.
+ * Multiple processes could share the UID.
+ * new_tag_ref() refuses to create a tag ref once a UID would exceed this.
+ */
+static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
+module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
+
+/*
+ * After the kernel has initialized this module, it is still possible
+ * to make it passive.
+ * Setting passive to Y:
+ *  - the iface stats handling will not act on notifications.
+ *  - iptables matches will never match.
+ *  - ctrl commands silently succeed.
+ *  - stats are always empty.
+ * This is mostly useful when a bug is suspected.
+ */
+static bool module_passive;
+module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
+
+/*
+ * Control how qtaguid data is tracked per proc/uid.
+ * Setting tag_tracking_passive to Y:
+ *  - don't create proc specific structs to track tags
+ *  - don't check that active tag stats exceed some limits.
+ *  - don't clean up socket tags on process exits.
+ * This is mostly useful when a bug is suspected.
+ */
+static bool qtu_proc_handling_passive;
+module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
+		   S_IRUGO | S_IWUSR);
+
+/* NOTE(review): presumably the name of the qtaguid device node -- confirm. */
+#define QTU_DEV_NAME "xt_qtaguid"
+
+/* Run-time debug flag mask, settable via the "debug_mask" parameter. */
+uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK;
+module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR);
+
+/*---------------------------------------------------------------------------*/
+/*
+ * procfs names and handles for interface statistics.  iface_stat_procdir
+ * is the parent under which iface_create_proc_worker() creates one
+ * directory per interface.
+ */
+static const char *iface_stat_procdirname = "iface_stat";
+static struct proc_dir_entry *iface_stat_procdir;
+static const char *iface_stat_all_procfilename = "iface_stat_all";
+static struct proc_dir_entry *iface_stat_all_procfile;
+
+/*
+ * Ordering of locks:
+ *  outer locks:
+ *    iface_stat_list_lock
+ *    sock_tag_list_lock
+ *  inner locks:
+ *    uid_tag_data_tree_lock
+ *    tag_counter_set_list_lock
+ * Notice how sock_tag_list_lock is held sometimes when uid_tag_data_tree_lock
+ * is acquired.
+ *
+ * Call tree with all lock holders as of 2011-09-25:
+ *
+ * iface_stat_all_proc_read()
+ *   iface_stat_list_lock
+ *     (struct iface_stat)
+ *
+ * qtaguid_ctrl_proc_read()
+ *   sock_tag_list_lock
+ *     (sock_tag_tree)
+ *     (struct proc_qtu_data->sock_tag_list)
+ *   prdebug_full_state()
+ *     sock_tag_list_lock
+ *       (sock_tag_tree)
+ *     uid_tag_data_tree_lock
+ *       (uid_tag_data_tree)
+ *       (proc_qtu_data_tree)
+ *     iface_stat_list_lock
+ *
+ * qtaguid_stats_proc_read()
+ *   iface_stat_list_lock
+ *     struct iface_stat->tag_stat_list_lock
+ *
+ * qtudev_open()
+ *   uid_tag_data_tree_lock
+ *
+ * qtudev_release()
+ *   sock_tag_list_lock
+ *     uid_tag_data_tree_lock
+ *   prdebug_full_state()
+ *     sock_tag_list_lock
+ *     uid_tag_data_tree_lock
+ *     iface_stat_list_lock
+ *
+ * iface_netdev_event_handler()
+ *   iface_stat_create()
+ *     iface_stat_list_lock
+ *   iface_stat_update()
+ *     iface_stat_list_lock
+ *
+ * iface_inetaddr_event_handler()
+ *   iface_stat_create()
+ *     iface_stat_list_lock
+ *   iface_stat_update()
+ *     iface_stat_list_lock
+ *
+ * iface_inet6addr_event_handler()
+ *   iface_stat_create_ipv6()
+ *     iface_stat_list_lock
+ *   iface_stat_update()
+ *     iface_stat_list_lock
+ *
+ * qtaguid_mt()
+ *   account_for_uid()
+ *     if_tag_stat_update()
+ *       get_sock_stat()
+ *         sock_tag_list_lock
+ *       struct iface_stat->tag_stat_list_lock
+ *         tag_stat_update()
+ *           get_active_counter_set()
+ *             tag_counter_set_list_lock
+ *         tag_stat_update()
+ *           get_active_counter_set()
+ *             tag_counter_set_list_lock
+ *
+ *
+ * qtaguid_ctrl_parse()
+ *   ctrl_cmd_delete()
+ *     sock_tag_list_lock
+ *     tag_counter_set_list_lock
+ *     iface_stat_list_lock
+ *       struct iface_stat->tag_stat_list_lock
+ *     uid_tag_data_tree_lock
+ *   ctrl_cmd_counter_set()
+ *     tag_counter_set_list_lock
+ *   ctrl_cmd_tag()
+ *     sock_tag_list_lock
+ *       (sock_tag_tree)
+ *       get_tag_ref()
+ *         uid_tag_data_tree_lock
+ *           (uid_tag_data_tree)
+ *       uid_tag_data_tree_lock
+ *         (proc_qtu_data_tree)
+ *   ctrl_cmd_untag()
+ *     sock_tag_list_lock
+ *     uid_tag_data_tree_lock
+ *
+ */
+/* List of struct iface_stat, linked through their ->list member. */
+static LIST_HEAD(iface_stat_list);
+static DEFINE_SPINLOCK(iface_stat_list_lock);
+
+/* struct sock_tag entries, ordered by their struct sock pointer. */
+static struct rb_root sock_tag_tree = RB_ROOT;
+static DEFINE_SPINLOCK(sock_tag_list_lock);
+
+/* struct tag_counter_set entries, ordered by tag. */
+static struct rb_root tag_counter_set_tree = RB_ROOT;
+static DEFINE_SPINLOCK(tag_counter_set_list_lock);
+
+/* struct uid_tag_data entries, ordered by uid. */
+static struct rb_root uid_tag_data_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
+
+/* NOTE(review): presumably struct proc_qtu_data entries ordered by pid. */
+static struct rb_root proc_qtu_data_tree = RB_ROOT;
+/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
+
+/* NOTE(review): presumably module-wide event counters -- confirm usage. */
+static struct qtaguid_event_counts qtu_events;
+/*----------------------------------------------*/
+/*
+ * May the current task act on behalf of arbitrary UIDs?
+ * True when its fsuid is 0, when no control-write group is configured
+ * (proc_ctrl_write_gid == 0), or when the task belongs to that group.
+ */
+static bool can_manipulate_uids(void)
+{
+	/* root pwnd */
+	if (unlikely(!current_fsuid()))
+		return true;
+	if (unlikely(!proc_ctrl_write_gid))
+		return true;
+	return in_egroup_p(proc_ctrl_write_gid);
+}
+
+/*
+ * May the current task act as @uid?  Always true for its own fsuid,
+ * otherwise decided by can_manipulate_uids().
+ */
+static bool can_impersonate_uid(uid_t uid)
+{
+	if (uid == current_fsuid())
+		return true;
+	return can_manipulate_uids();
+}
+
+/*
+ * May the current task read the stats that belong to @uid?
+ * True for fsuid 0, for the task's own fsuid, when no read-all group is
+ * configured (proc_stats_readall_gid == 0), or for members of that group.
+ */
+static bool can_read_other_uid_stats(uid_t uid)
+{
+	/* root pwnd */
+	if (unlikely(!current_fsuid()))
+		return true;
+	if (uid == current_fsuid())
+		return true;
+	if (unlikely(!proc_stats_readall_gid))
+		return true;
+	return in_egroup_p(proc_stats_readall_gid);
+}
+
+/*
+ * Add @bytes and @packets to the counter cell selected by
+ * (@set, @direction, @ifs_proto) inside @counters.
+ */
+static inline void dc_add_byte_packets(struct data_counters *counters, int set,
+				  enum ifs_tx_rx direction,
+				  enum ifs_proto ifs_proto,
+				  int bytes,
+				  int packets)
+{
+	counters->bpc[set][direction][ifs_proto].bytes += bytes;
+	counters->bpc[set][direction][ifs_proto].packets += packets;
+}
+
+/*
+ * Total byte count for (@set, @direction): the sum of the TCP, UDP and
+ * "other protocol" cells of @counters.
+ */
+static inline uint64_t dc_sum_bytes(struct data_counters *counters,
+				    int set,
+				    enum ifs_tx_rx direction)
+{
+	return counters->bpc[set][direction][IFS_TCP].bytes
+		+ counters->bpc[set][direction][IFS_UDP].bytes
+		+ counters->bpc[set][direction][IFS_PROTO_OTHER].bytes;
+}
+
+/*
+ * Total packet count for (@set, @direction): the sum of the TCP, UDP and
+ * "other protocol" cells of @counters.
+ */
+static inline uint64_t dc_sum_packets(struct data_counters *counters,
+				      int set,
+				      enum ifs_tx_rx direction)
+{
+	return counters->bpc[set][direction][IFS_TCP].packets
+		+ counters->bpc[set][direction][IFS_UDP].packets
+		+ counters->bpc[set][direction][IFS_PROTO_OTHER].packets;
+}
+
+/*
+ * Look up @tag in an rb-tree of struct tag_node ordered by tag_compare().
+ * Returns the matching node, or NULL when the tree holds no such tag.
+ */
+static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
+{
+	struct rb_node *cur;
+
+	for (cur = root->rb_node; cur; ) {
+		struct tag_node *entry = rb_entry(cur, struct tag_node, node);
+		int cmp;
+
+		RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+			 " node=%p data=%p\n", tag, cur, entry);
+		cmp = tag_compare(tag, entry->tag);
+		RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+			 " data.tag=0x%llx (uid=%u) res=%d\n",
+			 tag, entry->tag, get_uid_from_tag(entry->tag), cmp);
+		if (cmp == 0)
+			return entry;
+		/* Smaller tags live in the left subtree, larger in the right. */
+		cur = (cmp < 0) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @data into the tag_node rb-tree @root, ordered by tag_compare().
+ * Inserting a tag that is already present is a BUG().
+ */
+static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct tag_node *cur = rb_entry(*link, struct tag_node, node);
+		int cmp = tag_compare(data->tag, cur->tag);
+
+		RB_DEBUG("qtaguid: %s(): tag=0x%llx"
+			 " (uid=%u)\n", __func__,
+			 cur->tag,
+			 get_uid_from_tag(cur->tag));
+		parent = *link;
+		if (cmp == 0)
+			BUG();
+		else if (cmp < 0)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* Hook the node in, then let the rb-tree rebalance itself. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/* Insert a tag_stat into @root, keyed by the tag in its embedded tag_node. */
+static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
+{
+	tag_node_tree_insert(&data->tn, root);
+}
+
+/* Find the tag_stat for @tag in @root; NULL when there is none. */
+static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
+{
+	struct tag_node *tn = tag_node_tree_search(root, tag);
+
+	return tn ? rb_entry(&tn->node, struct tag_stat, tn.node) : NULL;
+}
+
+/* Insert a tag_counter_set into @root, keyed by its embedded tag_node. */
+static void tag_counter_set_tree_insert(struct tag_counter_set *data,
+					struct rb_root *root)
+{
+	tag_node_tree_insert(&data->tn, root);
+}
+
+/* Find the tag_counter_set for @tag in @root; NULL when there is none. */
+static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
+							   tag_t tag)
+{
+	struct tag_node *tn = tag_node_tree_search(root, tag);
+
+	return tn ? rb_entry(&tn->node, struct tag_counter_set, tn.node)
+		  : NULL;
+}
+
+/* Insert a tag_ref into @root, keyed by the tag in its embedded tag_node. */
+static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
+{
+	tag_node_tree_insert(&data->tn, root);
+}
+
+/* Find the tag_ref for @tag in @root; NULL when there is none. */
+static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
+{
+	struct tag_node *tn = tag_node_tree_search(root, tag);
+
+	return tn ? rb_entry(&tn->node, struct tag_ref, tn.node) : NULL;
+}
+
+/*
+ * Find the sock_tag whose ->sk equals @sk in a tree ordered by the sock
+ * pointer value.  Returns NULL when the socket is not in the tree.
+ */
+static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
+					     const struct sock *sk)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct sock_tag *entry = rb_entry(cur, struct sock_tag,
+						  sock_node);
+
+		if (sk == entry->sk)
+			return entry;
+		cur = (sk < entry->sk) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @data into the sock_tag tree @root, ordered by the ->sk pointer.
+ * Inserting a socket that is already present is a BUG().
+ */
+static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct sock_tag *cur = rb_entry(*link, struct sock_tag,
+						sock_node);
+
+		parent = *link;
+		if (data->sk == cur->sk)
+			BUG();
+		else if (data->sk < cur->sk)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* Hook the node in, then let the rb-tree rebalance itself. */
+	rb_link_node(&data->sock_node, parent, link);
+	rb_insert_color(&data->sock_node, root);
+}
+
+/*
+ * Empty @st_to_free_tree: every sock_tag is unlinked from the tree, its
+ * socket reference is dropped with sockfd_put(), and the entry is freed.
+ */
+static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
+{
+	struct rb_node *cur, *next;
+
+	for (cur = rb_first(st_to_free_tree); cur; cur = next) {
+		struct sock_tag *victim = rb_entry(cur, struct sock_tag,
+						   sock_node);
+
+		/* Fetch the successor before the current node goes away. */
+		next = rb_next(cur);
+		CT_DEBUG("qtaguid: %s(): "
+			 "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
+			 victim->sk,
+			 victim->tag,
+			 get_uid_from_tag(victim->tag));
+		rb_erase(&victim->sock_node, st_to_free_tree);
+		sockfd_put(victim->socket);
+		kfree(victim);
+	}
+}
+
+/*
+ * Find the proc_qtu_data for @pid in a tree ordered by pid.
+ * Returns NULL when no entry exists for that pid.
+ */
+static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
+						       const pid_t pid)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct proc_qtu_data *entry = rb_entry(cur,
+						       struct proc_qtu_data,
+						       node);
+
+		if (pid == entry->pid)
+			return entry;
+		cur = (pid < entry->pid) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @data into the proc_qtu_data tree @root, ordered by pid.
+ * Inserting a pid that is already present is a BUG().
+ */
+static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
+				      struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct proc_qtu_data *cur = rb_entry(*link,
+						     struct proc_qtu_data,
+						     node);
+
+		parent = *link;
+		if (data->pid == cur->pid)
+			BUG();
+		else if (data->pid < cur->pid)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* Hook the node in, then let the rb-tree rebalance itself. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/*
+ * Insert @data into the uid_tag_data tree @root, ordered by uid.
+ * Inserting a uid that is already present is a BUG().
+ */
+static void uid_tag_data_tree_insert(struct uid_tag_data *data,
+				     struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* Walk down to the empty slot where the new node belongs. */
+	while (*link) {
+		struct uid_tag_data *cur = rb_entry(*link,
+						    struct uid_tag_data,
+						    node);
+
+		parent = *link;
+		if (data->uid == cur->uid)
+			BUG();
+		else if (data->uid < cur->uid)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* Hook the node in, then let the rb-tree rebalance itself. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+}
+
+/*
+ * Find the uid_tag_data for @uid in a tree ordered by uid.
+ * Returns NULL when no entry exists for that uid.
+ */
+static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
+						     uid_t uid)
+{
+	struct rb_node *cur = root->rb_node;
+
+	while (cur) {
+		struct uid_tag_data *entry = rb_entry(cur,
+						      struct uid_tag_data,
+						      node);
+
+		if (uid == entry->uid)
+			return entry;
+		cur = (uid < entry->uid) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Find the uid_tag_data for @uid in uid_tag_data_tree, allocating and
+ * inserting a fresh (zeroed) one when none exists yet.
+ * Returns the found or newly created entry, or ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ * If @found_res is not NULL:
+ *   *found_res is set to true when an existing entry was found,
+ *   *found_res is set to false when the entry had to be allocated
+ *   (or the allocation failed).
+ * This function takes no lock itself.
+ */
+struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
+{
+	struct uid_tag_data *utd;
+
+	/* Look for top level uid_tag_data for the UID */
+	utd = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
+	DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd);
+
+	if (found_res)
+		*found_res = (utd != NULL);
+
+	if (!utd) {
+		utd = kzalloc(sizeof(*utd), GFP_ATOMIC);
+		if (!utd) {
+			pr_err("qtaguid: get_uid_data(%u): "
+			       "tag data alloc failed\n", uid);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		utd->uid = uid;
+		utd->tag_ref_tree = RB_ROOT;
+		uid_tag_data_tree_insert(utd, &uid_tag_data_tree);
+		DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd);
+	}
+	return utd;
+}
+
+/*
+ * Allocate a tag_ref for @new_tag and insert it into @utd_entry's
+ * tag_ref_tree, bumping utd_entry->num_active_tags.
+ * Never returns NULL: either a valid pointer, ERR_PTR(-EMFILE) when the
+ * UID would exceed max_sock_tags, or ERR_PTR(-ENOMEM) on allocation
+ * failure.  num_sock_tags of the new entry is left for the caller.
+ */
+static struct tag_ref *new_tag_ref(tag_t new_tag,
+				   struct uid_tag_data *utd_entry)
+{
+	struct tag_ref *tr;
+
+	if (utd_entry->num_active_tags + 1 > max_sock_tags) {
+		pr_info("qtaguid: new_tag_ref(0x%llx): "
+			"tag ref alloc quota exceeded. max=%d\n",
+			new_tag, max_sock_tags);
+		return ERR_PTR(-EMFILE);
+	}
+
+	tr = kzalloc(sizeof(*tr), GFP_ATOMIC);
+	if (!tr) {
+		pr_err("qtaguid: new_tag_ref(0x%llx): "
+		       "tag ref alloc failed\n",
+		       new_tag);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	tr->tn.tag = new_tag;
+	/* tr->num_sock_tags  handled by caller */
+	utd_entry->num_active_tags++;
+	tag_ref_tree_insert(tr, &utd_entry->tag_ref_tree);
+	DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
+		 " inserted new tag ref %p\n",
+		 new_tag, tr);
+	return tr;
+}
+
+/*
+ * Look up the tag_ref for @full_tag under the uid_tag_data of the tag's
+ * UID (the uid_tag_data is created by get_uid_data() if missing).
+ * If @utd_res is not NULL it receives whatever get_uid_data() returned,
+ * which may be an error pointer.
+ * Returns the tag_ref, or NULL both when none exists and when
+ * get_uid_data() failed.
+ */
+static struct tag_ref *lookup_tag_ref(tag_t full_tag,
+				      struct uid_tag_data **utd_res)
+{
+	struct uid_tag_data *utd;
+	struct tag_ref *tr;
+	bool found_utd;
+	uid_t uid = get_uid_from_tag(full_tag);
+
+	DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
+		 full_tag, uid);
+
+	utd = get_uid_data(uid, &found_utd);
+	if (utd_res)
+		*utd_res = utd;
+	if (IS_ERR_OR_NULL(utd))
+		return NULL;
+
+	tr = tag_ref_tree_search(&utd->tag_ref_tree, full_tag);
+	DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
+		 full_tag, utd, tr);
+	return tr;
+}
+
+/*
+ * Find the tag_ref for @full_tag, creating it (and the per-UID
+ * uid_tag_data) when it does not exist yet.  Takes
+ * uid_tag_data_tree_lock for the duration of the lookup/creation.
+ *
+ * Never returns NULL. Either PTR_ERR or a valid ptr.
+ * If @utd_res is not NULL it receives the uid_tag_data pointer; when the
+ * uid_tag_data could not be obtained it holds an error pointer (or NULL)
+ * and must not be dereferenced.
+ */
+static struct tag_ref *get_tag_ref(tag_t full_tag,
+				   struct uid_tag_data **utd_res)
+{
+	struct uid_tag_data *utd_entry;
+	struct tag_ref *tr_entry;
+
+	DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
+		 full_tag);
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	tr_entry = lookup_tag_ref(full_tag, &utd_entry);
+	if (IS_ERR_OR_NULL(utd_entry)) {
+		/*
+		 * get_uid_data() can fail on a GFP_ATOMIC allocation; report
+		 * that to the caller instead of taking the kernel down with
+		 * the spinlock held.
+		 */
+		tr_entry = utd_entry ? ERR_CAST(utd_entry) : ERR_PTR(-ENOMEM);
+	} else if (!tr_entry) {
+		tr_entry = new_tag_ref(full_tag, utd_entry);
+	}
+
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	if (utd_res)
+		*utd_res = utd_entry;
+	DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
+		 full_tag, utd_entry, tr_entry);
+	return tr_entry;
+}
+
+/*
+ * Release @utd_entry if nothing uses it any more: when its tag_ref_tree is
+ * empty and num_pqd is zero, the entry is removed from uid_tag_data_tree
+ * and freed.  Otherwise it is left alone.
+ */
+static void put_utd_entry(struct uid_tag_data *utd_entry)
+{
+	/* Still referenced by tag refs or proc_qtu_data? Then keep it. */
+	if (!RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) || utd_entry->num_pqd) {
+		DR_DEBUG("qtaguid: %s(): "
+			 "utd_entry=%p still has %d tags %d proc_qtu_data\n",
+			 __func__, utd_entry, utd_entry->num_active_tags,
+			 utd_entry->num_pqd);
+		BUG_ON(!(utd_entry->num_active_tags ||
+			 utd_entry->num_pqd));
+		return;
+	}
+
+	DR_DEBUG("qtaguid: %s(): "
+		 "erase utd_entry=%p uid=%u "
+		 "by pid=%u tgid=%u uid=%u\n", __func__,
+		 utd_entry, utd_entry->uid,
+		 current->pid, current->tgid, current_fsuid());
+	BUG_ON(utd_entry->num_active_tags);
+	rb_erase(&utd_entry->node, &uid_tag_data_tree);
+	kfree(utd_entry);
+}
+
+/*
+ * Drop @tr_entry if no sock_tag uses it (num_sock_tags == 0):
+ * decrement utd_entry->num_active_tags, unlink the tag_ref from
+ * utd_entry->tag_ref_tree and free it.  A tag_ref that is still in use is
+ * left untouched.
+ */
+static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
+					struct uid_tag_data *utd_entry)
+{
+	DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
+		 tr_entry, tr_entry->tn.tag,
+		 get_uid_from_tag(tr_entry->tn.tag));
+	if (tr_entry->num_sock_tags)
+		return;
+
+	BUG_ON(!utd_entry->num_active_tags);
+	utd_entry->num_active_tags--;
+	rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
+	DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
+	kfree(tr_entry);
+}
+
+/*
+ * Walk @utd_entry's tag_ref_tree and try to free tag refs via
+ * free_tag_ref_from_utd_entry().  When @full_tag has no accounting-tag
+ * part every tag_ref is considered; otherwise only the one whose tag
+ * equals @full_tag.
+ */
+static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
+{
+	struct rb_node *cur, *next;
+	tag_t acct_tag;
+
+	DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
+		 full_tag, get_uid_from_tag(full_tag));
+	acct_tag = get_atag_from_tag(full_tag);
+	for (cur = rb_first(&utd_entry->tag_ref_tree); cur; cur = next) {
+		struct tag_ref *tr = rb_entry(cur, struct tag_ref, tn.node);
+
+		/* The entry may be freed below; fetch the successor first. */
+		next = rb_next(cur);
+		if (acct_tag && tr->tn.tag != full_tag)
+			continue;
+		free_tag_ref_from_utd_entry(tr, utd_entry);
+	}
+}
+
+/*
+ * Legacy procfs read handler that prints one 64-bit counter as "<value>\n".
+ * @page:  output buffer the text is formatted into
+ * @start: set to page + off, the first byte the reader has not consumed
+ * @off:   offset into the formatted text already consumed by the reader
+ * @count: number of bytes the reader asked for
+ * @eof:   set to 1 when everything that remains fits in @count, else 0
+ * @data:  pointer to the uint64_t to print; NULL yields an empty read
+ * Returns the number of bytes available at *start; never negative.
+ */
+static int read_proc_u64(char *page, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	int len;
+	uint64_t value;
+	char *p = page;
+	uint64_t *iface_entry = data;
+
+	if (!data)
+		return 0;
+
+	value = *iface_entry;
+	p += sprintf(p, "%llu\n", value);
+	len = (p - page) - off;
+	/*
+	 * A reader positioned past the end of the text must see an empty
+	 * read, not a negative length that could be mistaken for an error.
+	 */
+	if (len < 0)
+		len = 0;
+	*eof = (len <= count) ? 1 : 0;
+	*start = page + off;
+	return len;
+}
+
+/*
+ * Legacy procfs read handler that prints one bool as "0\n" or "1\n".
+ * @page:  output buffer the text is formatted into
+ * @start: set to page + off, the first byte the reader has not consumed
+ * @off:   offset into the formatted text already consumed by the reader
+ * @count: number of bytes the reader asked for
+ * @eof:   set to 1 when everything that remains fits in @count, else 0
+ * @data:  pointer to the bool to print; NULL yields an empty read
+ * Returns the number of bytes available at *start; never negative.
+ */
+static int read_proc_bool(char *page, char **start, off_t off,
+			int count, int *eof, void *data)
+{
+	int len;
+	bool value;
+	char *p = page;
+	bool *bool_entry = data;
+
+	if (!data)
+		return 0;
+
+	value = *bool_entry;
+	p += sprintf(p, "%u\n", value);
+	len = (p - page) - off;
+	/*
+	 * A reader positioned past the end of the text must see an empty
+	 * read, not a negative length that could be mistaken for an error.
+	 */
+	if (len < 0)
+		len = 0;
+	*eof = (len <= count) ? 1 : 0;
+	*start = page + off;
+	return len;
+}
+
+/*
+ * Return the active counter set recorded for the UID part of @tag, or 0
+ * when tag_counter_set_tree has no entry for it.  The lookup is done
+ * under tag_counter_set_list_lock.
+ */
+static int get_active_counter_set(tag_t tag)
+{
+	struct tag_counter_set *tcs;
+	int active_set;
+
+	MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
+		 " (uid=%u)\n",
+		 tag, get_uid_from_tag(tag));
+	/* For now we only handle UID tags for active sets */
+	tag = get_utag_from_tag(tag);
+
+	spin_lock_bh(&tag_counter_set_list_lock);
+	tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+	active_set = tcs ? tcs->active_set : 0;
+	spin_unlock_bh(&tag_counter_set_list_lock);
+
+	return active_set;
+}
+
+/*
+ * Find the iface_stat on iface_stat_list whose name equals @ifname.
+ * Returns NULL when @ifname is NULL or no such interface is tracked.
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *get_iface_entry(const char *ifname)
+{
+	struct iface_stat *iface_entry;
+
+	if (!ifname) {
+		pr_info("qtaguid: iface_stat: get() NULL device name\n");
+		return NULL;
+	}
+
+	/* Linear scan; the first entry with a matching name wins. */
+	list_for_each_entry(iface_entry, &iface_stat_list, list) {
+		if (strcmp(ifname, iface_entry->ifname) == 0)
+			return iface_entry;
+	}
+	return NULL;
+}
+
+/*
+ * procfs read handler that emits one text line per tracked interface:
+ *   ifname active rx_bytes rx_packets tx_bytes tx_packets
+ *   dev_rx_bytes dev_rx_packets dev_tx_bytes dev_tx_packets
+ * The first four counters come from iface_entry->totals[], the last four
+ * from dev_get_stats() (all zero for an inactive interface).
+ *
+ * @items_to_skip is the number of list entries to skip before printing,
+ * and *num_items_returned is incremented once per line written.
+ * NOTE(review): the parameter names suggest the legacy read_proc
+ * convention in which *start counts items rather than bytes -- confirm
+ * against the procfs core.
+ * Returns the number of bytes written to @page; *eof is set once the
+ * whole list was walked (or immediately when the module is passive).
+ */
+static int iface_stat_all_proc_read(char *page, char **num_items_returned,
+				    off_t items_to_skip, int char_count,
+				    int *eof, void *data)
+{
+	char *outp = page;
+	int item_index = 0;
+	int len;
+	struct iface_stat *iface_entry;
+	struct rtnl_link_stats64 dev_stats, *stats;
+	struct rtnl_link_stats64 no_dev_stats = {0};
+
+	if (unlikely(module_passive)) {
+		*eof = 1;
+		return 0;
+	}
+
+	CT_DEBUG("qtaguid:proc iface_stat_all "
+		 "page=%p *num_items_returned=%p off=%ld "
+		 "char_count=%d *eof=%d\n", page, *num_items_returned,
+		 items_to_skip, char_count, *eof);
+
+	if (*eof)
+		return 0;
+
+	/*
+	 * This lock will prevent iface_stat_update() from changing active,
+	 * and in turn prevent an interface from unregistering itself.
+	 */
+	spin_lock_bh(&iface_stat_list_lock);
+	list_for_each_entry(iface_entry, &iface_stat_list, list) {
+		/* Skip the entries that earlier calls already reported. */
+		if (item_index++ < items_to_skip)
+			continue;
+
+		if (iface_entry->active) {
+			stats = dev_get_stats(iface_entry->net_dev,
+					      &dev_stats);
+		} else {
+			stats = &no_dev_stats;
+		}
+		len = snprintf(outp, char_count,
+			       "%s %d "
+			       "%llu %llu %llu %llu "
+			       "%llu %llu %llu %llu\n",
+			       iface_entry->ifname,
+			       iface_entry->active,
+			       iface_entry->totals[IFS_RX].bytes,
+			       iface_entry->totals[IFS_RX].packets,
+			       iface_entry->totals[IFS_TX].bytes,
+			       iface_entry->totals[IFS_TX].packets,
+			       stats->rx_bytes, stats->rx_packets,
+			       stats->tx_bytes, stats->tx_packets);
+		/*
+		 * The line did not fit: drop the partial line and return
+		 * what was complete so far, without signalling EOF.
+		 */
+		if (len >= char_count) {
+			spin_unlock_bh(&iface_stat_list_lock);
+			*outp = '\0';
+			return outp - page;
+		}
+		outp += len;
+		char_count -= len;
+		(*num_items_returned)++;
+	}
+	spin_unlock_bh(&iface_stat_list_lock);
+
+	*eof = 1;
+	return outp - page;
+}
+
+/*
+ * Work item queued by iface_alloc(): creates the procfs directory for a
+ * new interface under iface_stat_procdir, plus the tx_bytes, rx_bytes,
+ * tx_packets, rx_packets and active entries inside it.  The
+ * iface_stat_work wrapper is freed on every path.
+ */
+static void iface_create_proc_worker(struct work_struct *work)
+{
+	struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
+						   iface_work);
+	struct iface_stat *new_iface = isw->iface_entry;
+	struct proc_dir_entry *dir;
+
+	/* iface_entries are not deleted, so safe to manipulate. */
+	dir = proc_mkdir(new_iface->ifname, iface_stat_procdir);
+	if (IS_ERR_OR_NULL(dir)) {
+		pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
+		goto out;
+	}
+
+	new_iface->proc_ptr = dir;
+
+	create_proc_read_entry("tx_bytes", proc_iface_perms, dir,
+			read_proc_u64, &new_iface->totals[IFS_TX].bytes);
+	create_proc_read_entry("rx_bytes", proc_iface_perms, dir,
+			read_proc_u64, &new_iface->totals[IFS_RX].bytes);
+	create_proc_read_entry("tx_packets", proc_iface_perms, dir,
+			read_proc_u64, &new_iface->totals[IFS_TX].packets);
+	create_proc_read_entry("rx_packets", proc_iface_perms, dir,
+			read_proc_u64, &new_iface->totals[IFS_RX].packets);
+	create_proc_read_entry("active", proc_iface_perms, dir,
+			read_proc_bool, &new_iface->active);
+
+	IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
+		 "entry=%p dev=%s\n", new_iface, new_iface->ifname);
+out:
+	kfree(isw);
+}
+
+/*
+ * Set @entry's active flag and keep entry->net_dev in step with it:
+ * activating records @net_dev, deactivating clears the pointer.
+ * @net_dev is dereferenced for the debug message in both cases.
+ */
+static void _iface_stat_set_active(struct iface_stat *entry,
+				   struct net_device *net_dev,
+				   bool activate)
+{
+	if (!activate) {
+		entry->active = false;
+		entry->net_dev = NULL;
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "disable tracking. rfcnt=%d\n", __func__,
+			 entry->ifname,
+			 percpu_read(*net_dev->pcpu_refcnt));
+		return;
+	}
+
+	entry->net_dev = net_dev;
+	entry->active = true;
+	IF_DEBUG("qtaguid: %s(%s): "
+		 "enable tracking. rfcnt=%d\n", __func__,
+		 entry->ifname,
+		 percpu_read(*net_dev->pcpu_refcnt));
+}
+
+/*
+ * Allocate an iface_stat for @net_dev, mark it active, queue the work
+ * item that creates its procfs entries and add it to iface_stat_list.
+ * All allocations are GFP_ATOMIC.  Returns the new entry, or NULL when
+ * any allocation fails (nothing is leaked in that case).
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *iface_alloc(struct net_device *net_dev)
+{
+	struct iface_stat *new_iface;
+	struct iface_stat_work *isw;
+
+	new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
+	if (!new_iface) {
+		pr_err("qtaguid: iface_stat: create(%s): "
+		       "iface_stat alloc failed\n", net_dev->name);
+		return NULL;
+	}
+
+	new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
+	if (!new_iface->ifname) {
+		pr_err("qtaguid: iface_stat: create(%s): "
+		       "ifname alloc failed\n", net_dev->name);
+		goto err_free_iface;
+	}
+
+	spin_lock_init(&new_iface->tag_stat_list_lock);
+	new_iface->tag_stat_tree = RB_ROOT;
+	_iface_stat_set_active(new_iface, net_dev, true);
+
+	/*
+	 * ipv6 notifier chains are atomic :(
+	 * No create_proc_read_entry() for you!
+	 */
+	isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw) {
+		pr_err("qtaguid: iface_stat: create(%s): "
+		       "work alloc failed\n", new_iface->ifname);
+		_iface_stat_set_active(new_iface, net_dev, false);
+		goto err_free_name;
+	}
+
+	/* The procfs entries are created later, from process context. */
+	isw->iface_entry = new_iface;
+	INIT_WORK(&isw->iface_work, iface_create_proc_worker);
+	schedule_work(&isw->iface_work);
+	list_add(&new_iface->list, &iface_stat_list);
+	return new_iface;
+
+err_free_name:
+	kfree(new_iface->ifname);
+err_free_iface:
+	kfree(new_iface);
+	return NULL;
+}
+
+/*
+ * Detect a device whose byte counters went backwards compared with the
+ * values stashed in iface->last_known[].  When that happens on an active
+ * interface with valid last_known data, the stashed values are folded
+ * into iface->totals[] and last_known_valid is cleared.
+ *
+ * NOTE(review): @net_dev is handed to dev_get_stats() and dereferenced in
+ * the warning before/independently of the "net_dev ? ... : "?"" guard in
+ * the debug print, so a NULL @net_dev is presumably never passed.
+ */
+static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
+					       struct iface_stat *iface)
+{
+	struct rtnl_link_stats64 dev_stats, *stats;
+	bool stats_rewound;
+
+	stats = dev_get_stats(net_dev, &dev_stats);
+	/* No empty packets */
+	stats_rewound =
+		(stats->rx_bytes < iface->last_known[IFS_RX].bytes)
+		|| (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
+
+	IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
+		 "bytes rx/tx=%llu/%llu "
+		 "active=%d last_known=%d "
+		 "stats_rewound=%d\n", __func__,
+		 net_dev ? net_dev->name : "?",
+		 iface, net_dev,
+		 stats->rx_bytes, stats->tx_bytes,
+		 iface->active, iface->last_known_valid, stats_rewound);
+
+	if (iface->active && iface->last_known_valid && stats_rewound) {
+		pr_warn_once("qtaguid: iface_stat: %s(%s): "
+			     "iface reset its stats unexpectedly\n", __func__,
+			     net_dev->name);
+
+		/* Preserve what was counted before the device reset. */
+		iface->totals[IFS_TX].bytes += iface->last_known[IFS_TX].bytes;
+		iface->totals[IFS_TX].packets +=
+			iface->last_known[IFS_TX].packets;
+		iface->totals[IFS_RX].bytes += iface->last_known[IFS_RX].bytes;
+		iface->totals[IFS_RX].packets +=
+			iface->last_known[IFS_RX].packets;
+		iface->last_known_valid = false;
+		IF_DEBUG("qtaguid: %s(%s): iface=%p "
+			 "used last known bytes rx/tx=%llu/%llu\n", __func__,
+			 iface->ifname, iface, iface->last_known[IFS_RX].bytes,
+			 iface->last_known[IFS_TX].bytes);
+	}
+}
+
+/*
+ * Create a new entry for tracking the specified interface.
+ * If the entry already exists it is not re-created; its stats are checked
+ * for a counter reset and its active state is updated instead.
+ * Called when an interface is configured with a valid IP address.
+ *
+ * @net_dev: device to track; NULL is rejected with an error message.
+ * @ifa:     IPv4 address that triggered the call, or NULL to search the
+ *           device's address list for the one labelled like the device.
+ * Loopback addresses never create an entry and deactivate an existing one.
+ */
+static void iface_stat_create(struct net_device *net_dev,
+			      struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = NULL;
+	const char *ifname;
+	struct iface_stat *entry;
+	__be32 ipaddr = 0;
+	struct iface_stat *new_iface;
+
+	IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
+		 net_dev ? net_dev->name : "?",
+		 ifa, net_dev);
+	if (!net_dev) {
+		pr_err("qtaguid: iface_stat: create(): no net dev\n");
+		return;
+	}
+
+	ifname = net_dev->name;
+	if (!ifa) {
+		/* Reference is dropped at done_put below. */
+		in_dev = in_dev_get(net_dev);
+		if (!in_dev) {
+			pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
+			       ifname);
+			return;
+		}
+		IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
+			 ifname, in_dev);
+		/* Pick the first address whose label equals the device name. */
+		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+			IF_DEBUG("qtaguid: iface_stat: create(%s): "
+				 "ifa=%p ifa_label=%s\n",
+				 ifname, ifa,
+				 ifa->ifa_label ? ifa->ifa_label : "(null)");
+			if (ifa->ifa_label && !strcmp(ifname, ifa->ifa_label))
+				break;
+		}
+	}
+
+	if (!ifa) {
+		IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
+			 ifname);
+		goto done_put;
+	}
+	ipaddr = ifa->ifa_local;
+
+	spin_lock_bh(&iface_stat_list_lock);
+	entry = get_iface_entry(ifname);
+	if (entry != NULL) {
+		/* Known interface: track it unless the address is loopback. */
+		bool activate = !ipv4_is_loopback(ipaddr);
+		IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
+			 ifname, entry);
+		iface_check_stats_reset_and_adjust(net_dev, entry);
+		_iface_stat_set_active(entry, net_dev, activate);
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "tracking now %d on ip=%pI4\n", __func__,
+			 entry->ifname, activate, &ipaddr);
+		goto done_unlock_put;
+	} else if (ipv4_is_loopback(ipaddr)) {
+		IF_DEBUG("qtaguid: iface_stat: create(%s): "
+			 "ignore loopback dev. ip=%pI4\n", ifname, &ipaddr);
+		goto done_unlock_put;
+	}
+
+	/* A NULL result (allocation failure) is only visible in the debug log. */
+	new_iface = iface_alloc(net_dev);
+	IF_DEBUG("qtaguid: iface_stat: create(%s): done "
+		 "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
+done_unlock_put:
+	spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+	if (in_dev)
+		in_dev_put(in_dev);
+}
+
+/*
+ * IPv6 counterpart of iface_stat_create(): start tracking @net_dev, or
+ * update the active state of an already tracked interface, based on the
+ * IPv6 address @ifa.  Loopback addresses never create an entry and
+ * deactivate an existing one.  A NULL @ifa results in no change.
+ *
+ * NOTE(review): the IPv4 in_device is fetched and required here although
+ * only the IPv6 address is examined; a device without one is rejected
+ * with an error message -- confirm this is intended.
+ */
+static void iface_stat_create_ipv6(struct net_device *net_dev,
+				   struct inet6_ifaddr *ifa)
+{
+	struct in_device *in_dev;
+	const char *ifname;
+	struct iface_stat *entry;
+	struct iface_stat *new_iface;
+	int addr_type;
+
+	IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
+		 ifa, net_dev, net_dev ? net_dev->name : "");
+	if (!net_dev) {
+		pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
+		return;
+	}
+	ifname = net_dev->name;
+
+	/* Reference is dropped at done_put below. */
+	in_dev = in_dev_get(net_dev);
+	if (!in_dev) {
+		pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
+		       ifname);
+		return;
+	}
+
+	IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
+		 ifname, in_dev);
+
+	if (!ifa) {
+		IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
+			 ifname);
+		goto done_put;
+	}
+	addr_type = ipv6_addr_type(&ifa->addr);
+
+	spin_lock_bh(&iface_stat_list_lock);
+	entry = get_iface_entry(ifname);
+	if (entry != NULL) {
+		/* Known interface: track it unless the address is loopback. */
+		bool activate = !(addr_type & IPV6_ADDR_LOOPBACK);
+		IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+			 ifname, entry);
+		iface_check_stats_reset_and_adjust(net_dev, entry);
+		_iface_stat_set_active(entry, net_dev, activate);
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "tracking now %d on ip=%pI6c\n", __func__,
+			 entry->ifname, activate, &ifa->addr);
+		goto done_unlock_put;
+	} else if (addr_type & IPV6_ADDR_LOOPBACK) {
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "ignore loopback dev. ip=%pI6c\n", __func__,
+			 ifname, &ifa->addr);
+		goto done_unlock_put;
+	}
+
+	/* A NULL result (allocation failure) is only visible in the debug log. */
+	new_iface = iface_alloc(net_dev);
+	IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
+		 "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
+
+done_unlock_put:
+	spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+	in_dev_put(in_dev);
+}
+
+/*
+ * Look up the sock_tag for @sk in sock_tag_tree without taking any lock.
+ * The callers visible in this file hold sock_tag_list_lock around it.
+ * Returns whatever sock_tag_tree_search() returns.
+ */
+static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
+{
+	struct sock_tag *found;
+
+	MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
+	found = sock_tag_tree_search(&sock_tag_tree, sk);
+	return found;
+}
+
+/*
+ * Locked variant of get_sock_stat_nl(): takes sock_tag_list_lock for the
+ * duration of the lookup. Returns NULL for a NULL @sk.
+ */
+static struct sock_tag *get_sock_stat(const struct sock *sk)
+{
+	struct sock_tag *found = NULL;
+
+	MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
+	if (sk != NULL) {
+		spin_lock_bh(&sock_tag_list_lock);
+		found = get_sock_stat_nl(sk);
+		spin_unlock_bh(&sock_tag_list_lock);
+	}
+	return found;
+}
+
+/*
+ * Account one packet of @bytes bytes in @dc for counter set @set and
+ * @direction. TCP and UDP get their own slots; every other protocol
+ * (including IPPROTO_IP) lands in IFS_PROTO_OTHER.
+ */
+static void
+data_counters_update(struct data_counters *dc, int set,
+		     enum ifs_tx_rx direction, int proto, int bytes)
+{
+	int slot;
+
+	if (proto == IPPROTO_TCP)
+		slot = IFS_TCP;
+	else if (proto == IPPROTO_UDP)
+		slot = IFS_UDP;
+	else
+		slot = IFS_PROTO_OTHER;
+
+	dc_add_byte_packets(dc, set, direction, slot, bytes, 1);
+}
+
+/*
+ * Fold the device's current counters into the tracked entry for @net_dev.
+ * Nothing happens if the interface has no entry (it was never configured
+ * with an IP address) or if the entry is already inactive.
+ *
+ * With @stash_only set, the counters are only remembered in last_known[].
+ * Otherwise they are added to totals[], last_known[] is invalidated and
+ * tracking is disabled for the entry.
+ *
+ * The notifier handlers in this file call it for NETDEV_DOWN (stash only)
+ * and NETDEV_UNREGISTER.
+ */
+static void iface_stat_update(struct net_device *net_dev, bool stash_only)
+{
+	struct rtnl_link_stats64 scratch;
+	struct rtnl_link_stats64 *cur;
+	struct iface_stat *ife;
+
+	cur = dev_get_stats(net_dev, &scratch);
+
+	spin_lock_bh(&iface_stat_list_lock);
+	ife = get_iface_entry(net_dev->name);
+	if (!ife) {
+		IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
+			 net_dev->name);
+		goto out_unlock;
+	}
+
+	IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+		 net_dev->name, ife);
+	if (!ife->active) {
+		IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
+			 net_dev->name);
+		goto out_unlock;
+	}
+
+	if (stash_only) {
+		ife->last_known[IFS_TX].bytes = cur->tx_bytes;
+		ife->last_known[IFS_TX].packets = cur->tx_packets;
+		ife->last_known[IFS_RX].bytes = cur->rx_bytes;
+		ife->last_known[IFS_RX].packets = cur->rx_packets;
+		ife->last_known_valid = true;
+		IF_DEBUG("qtaguid: %s(%s): "
+			 "dev stats stashed rx/tx=%llu/%llu\n", __func__,
+			 net_dev->name, cur->rx_bytes, cur->tx_bytes);
+		goto out_unlock;
+	}
+
+	ife->totals[IFS_TX].bytes += cur->tx_bytes;
+	ife->totals[IFS_TX].packets += cur->tx_packets;
+	ife->totals[IFS_RX].bytes += cur->rx_bytes;
+	ife->totals[IFS_RX].packets += cur->rx_packets;
+	/* The stashed snapshot is superseded by the totals. */
+	ife->last_known_valid = false;
+	_iface_stat_set_active(ife, net_dev, false);
+	IF_DEBUG("qtaguid: %s(%s): "
+		 "disable tracking. rx/tx=%llu/%llu\n", __func__,
+		 net_dev->name, cur->rx_bytes, cur->tx_bytes);
+
+out_unlock:
+	spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Account one packet against @tag_entry using the counter set currently
+ * active for its tag, and mirror the update into the parent counters
+ * when the entry has any.
+ */
+static void tag_stat_update(struct tag_stat *tag_entry,
+			enum ifs_tx_rx direction, int proto, int bytes)
+{
+	int cset = get_active_counter_set(tag_entry->tn.tag);
+
+	MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
+		 "dir=%d proto=%d bytes=%d)\n",
+		 tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
+		 cset, direction, proto, bytes);
+
+	data_counters_update(&tag_entry->counters, cset, direction,
+			     proto, bytes);
+	if (tag_entry->parent_counters != NULL)
+		data_counters_update(tag_entry->parent_counters, cset,
+				     direction, proto, bytes);
+}
+
+/*
+ * Allocate a zeroed tag_stat for @tag and insert it into the tag_stat_tree
+ * of @iface_entry, to track the given {acct_tag,uid_tag}.
+ * iface_entry->tag_stat_list_lock should be held.
+ * Returns the new entry, or NULL if the atomic allocation failed.
+ */
+static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
+					   tag_t tag)
+{
+	struct tag_stat *ts;
+
+	IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
+		 " (uid=%u)\n", __func__,
+		 iface_entry, tag, get_uid_from_tag(tag));
+
+	ts = kzalloc(sizeof(*ts), GFP_ATOMIC);
+	if (ts == NULL) {
+		pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
+		return NULL;
+	}
+
+	ts->tn.tag = tag;
+	tag_stat_tree_insert(ts, &iface_entry->tag_stat_tree);
+	return ts;
+}
+
+/*
+ * Account one packet of @bytes bytes on interface @ifname against the tag
+ * that applies to it.
+ *
+ * If @sk carries a socket tag, that tag is used; otherwise the tag is
+ * built from @uid with a zero accounting tag. The matching tag_stat entry
+ * is updated if it exists. If not, the {0, uid_tag} entry is created when
+ * missing, and for a non-zero accounting tag a child {acct_tag, uid_tag}
+ * entry is created with the uid entry's counters as its parent.
+ *
+ * Entries are allocated with GFP_ATOMIC and may fail; in that case the
+ * packet is simply not accounted.
+ */
+static void if_tag_stat_update(const char *ifname, uid_t uid,
+			       const struct sock *sk, enum ifs_tx_rx direction,
+			       int proto, int bytes)
+{
+	struct tag_stat *tag_stat_entry;
+	tag_t tag, acct_tag;
+	tag_t uid_tag;
+	struct data_counters *uid_tag_counters;
+	struct sock_tag *sock_tag_entry;
+	struct iface_stat *iface_entry;
+	struct tag_stat *new_tag_stat = NULL;
+	MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
+		"uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
+		 ifname, uid, sk, direction, proto, bytes);
+
+
+	iface_entry = get_iface_entry(ifname);
+	if (!iface_entry) {
+		pr_err("qtaguid: iface_stat: stat_update() %s not found\n",
+		       ifname);
+		return;
+	}
+	/* It is ok to process data when an iface_entry is inactive */
+
+	MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n",
+		 ifname, iface_entry);
+
+	/*
+	 * Look for a tagged sock.
+	 * It will have an acct_uid.
+	 */
+	sock_tag_entry = get_sock_stat(sk);
+	if (sock_tag_entry) {
+		tag = sock_tag_entry->tag;
+		acct_tag = get_atag_from_tag(tag);
+		uid_tag = get_utag_from_tag(tag);
+	} else {
+		acct_tag = make_atag_from_value(0);
+		tag = combine_atag_with_uid(acct_tag, uid);
+		uid_tag = make_tag_from_uid(uid);
+	}
+	MT_DEBUG("qtaguid: iface_stat: stat_update(): "
+		 " looking for tag=0x%llx (uid=%u) in ife=%p\n",
+		 tag, get_uid_from_tag(tag), iface_entry);
+	/* Loop over tag list under this interface for {acct_tag,uid_tag} */
+	spin_lock_bh(&iface_entry->tag_stat_list_lock);
+
+	tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+					      tag);
+	if (tag_stat_entry) {
+		/*
+		 * Updating the {acct_tag, uid_tag} entry handles both stats:
+		 * {0, uid_tag} will also get updated.
+		 */
+		tag_stat_update(tag_stat_entry, direction, proto, bytes);
+		goto unlock;
+	}
+
+	/* Loop over tag list under this interface for {0,uid_tag} */
+	tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+					      uid_tag);
+	if (!tag_stat_entry) {
+		/* Here: the base uid_tag did not exist */
+		/*
+		 * No parent counters. So
+		 *  - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats.
+		 */
+		new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
+		/* Atomic allocation failed: drop this sample. */
+		if (!new_tag_stat)
+			goto unlock;
+		uid_tag_counters = &new_tag_stat->counters;
+	} else {
+		uid_tag_counters = &tag_stat_entry->counters;
+	}
+
+	if (acct_tag) {
+		/* Create the child {acct_tag, uid_tag} and hook up parent. */
+		new_tag_stat = create_if_tag_stat(iface_entry, tag);
+		if (!new_tag_stat)
+			goto unlock;
+		new_tag_stat->parent_counters = uid_tag_counters;
+	}
+	/*
+	 * new_tag_stat can only still be NULL here if {0, uid_tag} exists,
+	 * {acct_tag, uid_tag} does not, and acct_tag == 0. With acct_tag == 0
+	 * both tags are the same, so the first search would have matched.
+	 * The check keeps a logic slip elsewhere from becoming a NULL
+	 * dereference.
+	 */
+	if (new_tag_stat)
+		tag_stat_update(new_tag_stat, direction, proto, bytes);
+unlock:
+	spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+}
+
+/*
+ * Netdevice notifier callback. NETDEV_UP starts tracking the device;
+ * NETDEV_DOWN stashes its stats and NETDEV_UNREGISTER folds them into the
+ * totals. Each handled event bumps qtu_events.iface_events. Does nothing
+ * while the module is passive. Always returns NOTIFY_DONE.
+ */
+static int iface_netdev_event_handler(struct notifier_block *nb,
+				      unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (unlikely(module_passive))
+		return NOTIFY_DONE;
+
+	IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
+		 "ev=0x%lx/%s netdev=%p->name=%s\n",
+		 event, netdev_evt_str(event), dev, dev ? dev->name : "");
+
+	if (event == NETDEV_UP)
+		iface_stat_create(dev, NULL);
+	else if (event == NETDEV_DOWN || event == NETDEV_UNREGISTER)
+		iface_stat_update(dev, event == NETDEV_DOWN);
+	else
+		return NOTIFY_DONE;
+
+	atomic64_inc(&qtu_events.iface_events);
+	return NOTIFY_DONE;
+}
+
+/*
+ * IPv6 address notifier callback. @ptr is the inet6_ifaddr the event is
+ * about. NETDEV_UP (re)starts tracking of the owning device; NETDEV_DOWN
+ * stashes its stats and NETDEV_UNREGISTER folds them into the totals.
+ * For those three events a NULL ifa or ifa->idev is a BUG. Other events
+ * and a passive module are ignored. Always returns NOTIFY_DONE.
+ */
+static int iface_inet6addr_event_handler(struct notifier_block *nb,
+					 unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = ptr;
+	struct net_device *dev;
+
+	if (unlikely(module_passive))
+		return NOTIFY_DONE;
+
+	IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
+		 "ev=0x%lx/%s ifa=%p\n",
+		 event, netdev_evt_str(event), ifa);
+
+	if (event != NETDEV_UP && event != NETDEV_DOWN &&
+	    event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	BUG_ON(!ifa || !ifa->idev);
+	dev = (struct net_device *)ifa->idev->dev;
+	if (event == NETDEV_UP)
+		iface_stat_create_ipv6(dev, ifa);
+	else
+		iface_stat_update(dev, event == NETDEV_DOWN);
+
+	atomic64_inc(&qtu_events.iface_events);
+	return NOTIFY_DONE;
+}
+
+/*
+ * IPv4 address notifier callback. @ptr is the in_ifaddr the event is
+ * about. NETDEV_UP (re)starts tracking of the owning device; NETDEV_DOWN
+ * stashes its stats and NETDEV_UNREGISTER folds them into the totals.
+ * For those three events a NULL ifa or ifa->ifa_dev is a BUG. Other
+ * events and a passive module are ignored. Always returns NOTIFY_DONE.
+ */
+static int iface_inetaddr_event_handler(struct notifier_block *nb,
+					unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *dev;
+
+	if (unlikely(module_passive))
+		return NOTIFY_DONE;
+
+	IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
+		 "ev=0x%lx/%s ifa=%p\n",
+		 event, netdev_evt_str(event), ifa);
+
+	if (event != NETDEV_UP && event != NETDEV_DOWN &&
+	    event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	BUG_ON(!ifa || !ifa->ifa_dev);
+	dev = ifa->ifa_dev->dev;
+	if (event == NETDEV_UP)
+		iface_stat_create(dev, ifa);
+	else
+		iface_stat_update(dev, event == NETDEV_DOWN);
+
+	atomic64_inc(&qtu_events.iface_events);
+	return NOTIFY_DONE;
+}
+
+/* Notifier for netdevice events; registered by iface_stat_init(). */
+static struct notifier_block iface_netdev_notifier_blk = {
+	.notifier_call = iface_netdev_event_handler,
+};
+
+/* Notifier for IPv4 address events; registered by iface_stat_init(). */
+static struct notifier_block iface_inetaddr_notifier_blk = {
+	.notifier_call = iface_inetaddr_event_handler,
+};
+
+/* Notifier for IPv6 address events; registered by iface_stat_init(). */
+static struct notifier_block iface_inet6addr_notifier_blk = {
+	.notifier_call = iface_inet6addr_event_handler,
+};
+
+/*
+ * Set up interface stat tracking: create the iface_stat proc directory
+ * and the stat_all proc file under @parent_procdir, then register the
+ * netdevice, IPv4 address and IPv6 address notifiers.
+ *
+ * Returns 0 on success. Returns -1 if a proc entry could not be created,
+ * or the error from the failing register_*_notifier() call. On failure
+ * everything set up so far is torn down in reverse order.
+ */
+static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
+{
+	int err;
+
+	iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
+	if (!iface_stat_procdir) {
+		pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
+		err = -1;
+		goto err;
+	}
+
+	iface_stat_all_procfile = create_proc_entry(iface_stat_all_procfilename,
+						    proc_iface_perms,
+						    parent_procdir);
+	if (!iface_stat_all_procfile) {
+		pr_err("qtaguid: iface_stat: init "
+		       " failed to create stat_all proc entry\n");
+		err = -1;
+		goto err_zap_entry;
+	}
+	iface_stat_all_procfile->read_proc = iface_stat_all_proc_read;
+
+
+	err = register_netdevice_notifier(&iface_netdev_notifier_blk);
+	if (err) {
+		pr_err("qtaguid: iface_stat: init "
+		       "failed to register dev event handler\n");
+		goto err_zap_all_stats_entry;
+	}
+	err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+	if (err) {
+		pr_err("qtaguid: iface_stat: init "
+		       "failed to register ipv4 dev event handler\n");
+		goto err_unreg_nd;
+	}
+
+	err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
+	if (err) {
+		pr_err("qtaguid: iface_stat: init "
+		       "failed to register ipv6 dev event handler\n");
+		goto err_unreg_ip4_addr;
+	}
+	return 0;
+
+	/* Unwind ladder: each label undoes one earlier step, then falls through. */
+err_unreg_ip4_addr:
+	unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+err_unreg_nd:
+	unregister_netdevice_notifier(&iface_netdev_notifier_blk);
+err_zap_all_stats_entry:
+	remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
+err_zap_entry:
+	remove_proc_entry(iface_stat_procdirname, parent_procdir);
+err:
+	return err;
+}
+
+/*
+ * Try to find the socket that @skb belongs to via the xt_socket helpers.
+ *
+ * Returns NULL when the hook is not one of XT_SOCKET_SUPPORTED_HOOKS, when
+ * the family is neither IPv4 nor IPv6, when no socket is found, or when
+ * the socket found is in TCP_TIME_WAIT (in which case it is put back
+ * here). A non-NULL result must be released by the caller with
+ * xt_socket_put_sk().
+ */
+static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
+				    struct xt_action_param *par)
+{
+	struct sock *sk;
+	unsigned int hook_mask = (1 << par->hooknum);
+
+	MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb,
+		 par->hooknum, par->family);
+
+	/*
+	 * Let's not abuse the xt_socket_get*_sk(), or else it will
+	 * return garbage SKs.
+	 */
+	if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS))
+		return NULL;
+
+	switch (par->family) {
+	case NFPROTO_IPV6:
+		sk = xt_socket_get6_sk(skb, par);
+		break;
+	case NFPROTO_IPV4:
+		sk = xt_socket_get4_sk(skb, par);
+		break;
+	default:
+		return NULL;
+	}
+
+	/*
+	 * Seems to be issues on the file ptr for TCP_TIME_WAIT SKs.
+	 * http://kerneltrap.org/mailarchive/linux-netdev/2010/10/21/6287959
+	 * Not fixed in 3.0-r3 :(
+	 */
+	if (sk) {
+		MT_DEBUG("qtaguid: %p->sk_proto=%u "
+			 "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state);
+		if (sk->sk_state  == TCP_TIME_WAIT) {
+			/* Drop the reference and report "no socket". */
+			xt_socket_put_sk(sk);
+			sk = NULL;
+		}
+	}
+	return sk;
+}
+
+/*
+ * Charge @skb to @uid on the interface the packet is travelling through.
+ *
+ * The device is skb->dev when set, otherwise par->in or, failing that,
+ * par->out. The socket used for the tag lookup is skb->sk when set,
+ * otherwise @alternate_sk. The direction is IFS_RX when par->in is set
+ * and IFS_TX otherwise. If no device can be determined, the packet is not
+ * accounted and a message is logged.
+ *
+ * NOTE(review): the protocol is always read with ip_hdr(), although
+ * qtaguid_find_sk() shows this match is also used with NFPROTO_IPV6 -
+ * confirm that IPv6 packets are classified as intended.
+ */
+static void account_for_uid(const struct sk_buff *skb,
+			    const struct sock *alternate_sk, uid_t uid,
+			    struct xt_action_param *par)
+{
+	const struct net_device *el_dev;
+
+	if (!skb->dev) {
+		MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
+		el_dev = par->in ? : par->out;
+	} else {
+		const struct net_device *other_dev;
+		el_dev = skb->dev;
+		other_dev = par->in ? : par->out;
+		/* Debug only: report when skb->dev disagrees with the hook's device. */
+		if (el_dev != other_dev) {
+			MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
+				"par->(in/out)=%p %s\n",
+				par->hooknum, el_dev, el_dev->name, other_dev,
+				other_dev->name);
+		}
+	}
+
+	if (unlikely(!el_dev)) {
+		pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
+	} else if (unlikely(!el_dev->name)) {
+		/*
+		 * NOTE(review): if net_device.name is an embedded array this
+		 * test can never be true - confirm against the struct
+		 * definition.
+		 */
+		pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum);
+	} else {
+		MT_DEBUG("qtaguid[%d]: dev name=%s type=%d\n",
+			 par->hooknum,
+			 el_dev->name,
+			 el_dev->type);
+
+		if_tag_stat_update(el_dev->name, uid,
+				skb->sk ? skb->sk : alternate_sk,
+				par->in ? IFS_RX : IFS_TX,
+				ip_hdr(skb)->protocol, skb->len);
+	}
+}
+
+/*
+ * xtables match function for qtaguid.
+ *
+ * Finds the socket owning @skb (skb->sk, or a lookup through
+ * qtaguid_find_sk()), accounts the packet against the owner's uid when the
+ * rule does not ask for uid matching, and then evaluates the rule's
+ * uid/gid range conditions against the credentials of the socket's file.
+ *
+ * When the module is passive, or there is no skb, no socket or no
+ * sk->sk_socket, the result is (info->match ^ info->invert) == 0.
+ * A socket obtained through qtaguid_find_sk() is released before
+ * returning.
+ */
+static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_qtaguid_match_info *info = par->matchinfo;
+	const struct file *filp;
+	bool got_sock = false;
+	struct sock *sk;
+	uid_t sock_uid;
+	bool res;
+
+	if (unlikely(module_passive))
+		return (info->match ^ info->invert) == 0;
+
+	MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
+		 par->hooknum, skb, par->in, par->out, par->family);
+
+	atomic64_inc(&qtu_events.match_calls);
+	if (skb == NULL) {
+		res = (info->match ^ info->invert) == 0;
+		goto ret_res;
+	}
+
+	sk = skb->sk;
+
+	if (sk == NULL) {
+		/*
+		 * A missing sk->sk_socket happens when packets are in-flight
+		 * and the matching socket is already closed and gone.
+		 */
+		sk = qtaguid_find_sk(skb, par);
+		/*
+		 * If we got the socket from the find_sk(), we will need to put
+		 * it back, as nf_tproxy_get_sock_v4() got it.
+		 */
+		got_sock = sk;
+		if (sk)
+			atomic64_inc(&qtu_events.match_found_sk_in_ct);
+		else
+			atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
+	} else {
+		atomic64_inc(&qtu_events.match_found_sk);
+	}
+	MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d proto=%d\n",
+		par->hooknum, sk, got_sock, ip_hdr(skb)->protocol);
+	/* This block only feeds the debug output; filp is re-read below. */
+	if (sk != NULL) {
+		MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
+			par->hooknum, sk, sk->sk_socket,
+			sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
+		filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+		MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
+			par->hooknum, filp ? filp->f_cred->fsuid : -1);
+	}
+
+	if (sk == NULL || sk->sk_socket == NULL) {
+		/*
+		 * Here, the qtaguid_find_sk() using connection tracking
+		 * couldn't find the owner, so for now we just count them
+		 * against the system.
+		 */
+		/*
+		 * TODO: unhack how to force just accounting.
+		 * For now we only do iface stats when the uid-owner is not
+		 * requested.
+		 */
+		if (!(info->match & XT_QTAGUID_UID))
+			account_for_uid(skb, sk, 0, par);
+		MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n",
+			par->hooknum,
+			sk ? sk->sk_socket : NULL);
+		res = (info->match ^ info->invert) == 0;
+		atomic64_inc(&qtu_events.match_no_sk);
+		goto put_sock_ret_res;
+	} else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
+		/* A socket exists but the rule asked for "no socket". */
+		res = false;
+		goto put_sock_ret_res;
+	}
+	filp = sk->sk_socket->file;
+	if (filp == NULL) {
+		/* No file, hence no credentials: account against uid 0. */
+		MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum);
+		account_for_uid(skb, sk, 0, par);
+		res = ((info->match ^ info->invert) &
+			(XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0;
+		atomic64_inc(&qtu_events.match_no_sk_file);
+		goto put_sock_ret_res;
+	}
+	sock_uid = filp->f_cred->fsuid;
+	/*
+	 * TODO: unhack how to force just accounting.
+	 * For now we only do iface stats when the uid-owner is not requested
+	 */
+	if (!(info->match & XT_QTAGUID_UID))
+		account_for_uid(skb, sk, sock_uid, par);
+
+	/*
+	 * The following two tests fail the match when:
+	 *    id not in range AND no inverted condition requested
+	 * or id     in range AND    inverted condition requested
+	 * Thus (!a && b) || (a && !b) == a ^ b
+	 */
+	if (info->match & XT_QTAGUID_UID)
+		if ((filp->f_cred->fsuid >= info->uid_min &&
+		     filp->f_cred->fsuid <= info->uid_max) ^
+		    !(info->invert & XT_QTAGUID_UID)) {
+			MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
+				 par->hooknum);
+			res = false;
+			goto put_sock_ret_res;
+		}
+	if (info->match & XT_QTAGUID_GID)
+		if ((filp->f_cred->fsgid >= info->gid_min &&
+				filp->f_cred->fsgid <= info->gid_max) ^
+			!(info->invert & XT_QTAGUID_GID)) {
+			MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
+				par->hooknum);
+			res = false;
+			goto put_sock_ret_res;
+		}
+
+	MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum);
+	res = true;
+
+put_sock_ret_res:
+	/* Only sockets obtained through qtaguid_find_sk() hold a reference. */
+	if (got_sock)
+		xt_socket_put_sk(sk);
+ret_res:
+	MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res);
+	return res;
+}
+
+#ifdef DDEBUG
+/* This function is not in xt_qtaguid_print.c because of locks visibility */
+/*
+ * Dump the module's full state with pr_debug(): a caller-formatted header
+ * line, then the sock_tag tree, the uid_tag_data and proc_qtu_data trees
+ * and the iface_stat list, each printed under its own lock. Does nothing
+ * unless DDEBUG_MASK is set in qtaguid_debug_mask.
+ *
+ * @fmt and the variadic arguments form the header; @fmt is spliced into a
+ * larger format string, so it is interpreted as a printf format.
+ * A failed allocation of either buffer is treated as a BUG.
+ */
+static void prdebug_full_state(int indent_level, const char *fmt, ...)
+{
+	va_list args;
+	char *fmt_buff;
+	char *buff;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	/* First pass builds the format string, second pass expands it. */
+	fmt_buff = kasprintf(GFP_ATOMIC,
+			     "qtaguid: %s(): %s {\n", __func__, fmt);
+	BUG_ON(!fmt_buff);
+	va_start(args, fmt);
+	buff = kvasprintf(GFP_ATOMIC,
+			  fmt_buff, args);
+	BUG_ON(!buff);
+	pr_debug("%s", buff);
+	kfree(fmt_buff);
+	kfree(buff);
+	va_end(args);
+
+	spin_lock_bh(&sock_tag_list_lock);
+	prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	/* sock_tag_list_lock is taken before uid_tag_data_tree_lock. */
+	spin_lock_bh(&sock_tag_list_lock);
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
+	prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	spin_lock_bh(&iface_stat_list_lock);
+	prdebug_iface_stat_list(indent_level, &iface_stat_list);
+	spin_unlock_bh(&iface_stat_list_lock);
+
+	pr_debug("qtaguid: %s(): }\n", __func__);
+}
+#else
+/* Without DDEBUG the state dump compiles to an empty function. */
+static void prdebug_full_state(int indent_level, const char *fmt, ...) {}
+#endif
+
+/*
+ * Procfs reader to get all active socket tags using style "1)" as described in
+ * fs/proc/generic.c
+ *
+ * Output is one line per entry of sock_tag_tree followed by one "events:"
+ * line with the qtu_events counters. Items are the unit of paging: the
+ * first @items_to_skip items are skipped and *@num_items_returned is
+ * advanced once per item written. If an item does not fit in the
+ * remaining @char_count bytes, the partial item is cut off and the bytes
+ * written so far are returned without setting *@eof. *@eof is set once
+ * everything has been emitted, or immediately when the module is passive.
+ *
+ * Returns the number of bytes written to @page.
+ */
+static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned,
+				  off_t items_to_skip, int char_count, int *eof,
+				  void *data)
+{
+	char *outp = page;
+	int len;
+	uid_t uid;
+	struct rb_node *node;
+	struct sock_tag *sock_tag_entry;
+	int item_index = 0;
+	int indent_level = 0;
+	long f_count;
+
+	if (unlikely(module_passive)) {
+		*eof = 1;
+		return 0;
+	}
+
+	if (*eof)
+		return 0;
+
+	CT_DEBUG("qtaguid: proc ctrl page=%p off=%ld char_count=%d *eof=%d\n",
+		page, items_to_skip, char_count, *eof);
+
+	spin_lock_bh(&sock_tag_list_lock);
+	for (node = rb_first(&sock_tag_tree);
+	     node;
+	     node = rb_next(node)) {
+		/* Skip the items already delivered by earlier reads. */
+		if (item_index++ < items_to_skip)
+			continue;
+		sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+		uid = get_uid_from_tag(sock_tag_entry->tag);
+		CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
+			 "pid=%u\n",
+			 sock_tag_entry->sk,
+			 sock_tag_entry->tag,
+			 uid,
+			 sock_tag_entry->pid
+			);
+		f_count = atomic_long_read(
+			&sock_tag_entry->socket->file->f_count);
+		len = snprintf(outp, char_count,
+			       "sock=%p tag=0x%llx (uid=%u) pid=%u "
+			       "f_count=%lu\n",
+			       sock_tag_entry->sk,
+			       sock_tag_entry->tag, uid,
+			       sock_tag_entry->pid, f_count);
+		/* Item did not fit: drop the partial text and stop here. */
+		if (len >= char_count) {
+			spin_unlock_bh(&sock_tag_list_lock);
+			*outp = '\0';
+			return outp - page;
+		}
+		outp += len;
+		char_count -= len;
+		(*num_items_returned)++;
+	}
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	/* The events summary is the item following the last socket tag. */
+	if (item_index++ >= items_to_skip) {
+		len = snprintf(outp, char_count,
+			       "events: sockets_tagged=%llu "
+			       "sockets_untagged=%llu "
+			       "counter_set_changes=%llu "
+			       "delete_cmds=%llu "
+			       "iface_events=%llu "
+			       "match_calls=%llu "
+			       "match_found_sk=%llu "
+			       "match_found_sk_in_ct=%llu "
+			       "match_found_no_sk_in_ct=%llu "
+			       "match_no_sk=%llu "
+			       "match_no_sk_file=%llu\n",
+			       atomic64_read(&qtu_events.sockets_tagged),
+			       atomic64_read(&qtu_events.sockets_untagged),
+			       atomic64_read(&qtu_events.counter_set_changes),
+			       atomic64_read(&qtu_events.delete_cmds),
+			       atomic64_read(&qtu_events.iface_events),
+			       atomic64_read(&qtu_events.match_calls),
+			       atomic64_read(&qtu_events.match_found_sk),
+			       atomic64_read(&qtu_events.match_found_sk_in_ct),
+			       atomic64_read(
+				       &qtu_events.match_found_no_sk_in_ct),
+			       atomic64_read(&qtu_events.match_no_sk),
+			       atomic64_read(&qtu_events.match_no_sk_file));
+		if (len >= char_count) {
+			*outp = '\0';
+			return outp - page;
+		}
+		outp += len;
+		char_count -= len;
+		(*num_items_returned)++;
+	}
+
+	/* Count the following as part of the last item_index */
+	if (item_index > items_to_skip) {
+		prdebug_full_state(indent_level, "proc ctrl");
+	}
+
+	*eof = 1;
+	return outp - page;
+}
+
+/*
+ * Delete socket tags, and stat tags associated with a given
+ * accounting tag and uid.
+ *
+ * @input is parsed as "<cmd> <acct_tag> [<uid>]". Without a uid the
+ * caller's fsuid is used; an explicit uid requires
+ * can_impersonate_uid(). An acct_tag of 0 deletes every entry belonging
+ * to the uid.
+ *
+ * Returns 0 on success, -EINVAL for a malformed command or invalid tag,
+ * -EPERM when the caller may not act on the requested uid.
+ */
+static int ctrl_cmd_delete(const char *input)
+{
+	char cmd;
+	/*
+	 * uid and acct_tag are printed by CT_DEBUG before we know how many
+	 * fields sscanf() converted, so give them defined values.
+	 */
+	uid_t uid = 0;
+	uid_t entry_uid;
+	tag_t acct_tag = make_atag_from_value(0);
+	tag_t tag;
+	int res, argc;
+	struct iface_stat *iface_entry;
+	struct rb_node *node;
+	struct sock_tag *st_entry;
+	struct rb_root st_to_free_tree = RB_ROOT;
+	struct tag_stat *ts_entry;
+	struct tag_counter_set *tcs_entry;
+	struct tag_ref *tr_entry;
+	struct uid_tag_data *utd_entry;
+
+	argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid);
+	CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
+		 "user_tag=0x%llx uid=%u\n", input, argc, cmd,
+		 acct_tag, uid);
+	if (argc < 2) {
+		res = -EINVAL;
+		goto err;
+	}
+	if (!valid_atag(acct_tag)) {
+		pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
+		res = -EINVAL;
+		goto err;
+	}
+	if (argc < 3) {
+		uid = current_fsuid();
+	} else if (!can_impersonate_uid(uid)) {
+		pr_info("qtaguid: ctrl_delete(%s): "
+			"insufficient priv from pid=%u tgid=%u uid=%u\n",
+			input, current->pid, current->tgid, current_fsuid());
+		res = -EPERM;
+		goto err;
+	}
+
+	tag = combine_atag_with_uid(acct_tag, uid);
+	CT_DEBUG("qtaguid: ctrl_delete(%s): "
+		 "looking for tag=0x%llx (uid=%u)\n",
+		 input, tag, uid);
+
+	/* Delete socket tags */
+	spin_lock_bh(&sock_tag_list_lock);
+	node = rb_first(&sock_tag_tree);
+	while (node) {
+		st_entry = rb_entry(node, struct sock_tag, sock_node);
+		entry_uid = get_uid_from_tag(st_entry->tag);
+		/* Advance first: st_entry may be erased from the tree below. */
+		node = rb_next(node);
+		if (entry_uid != uid)
+			continue;
+
+		CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
+			 input, st_entry->tag, entry_uid);
+
+		if (!acct_tag || st_entry->tag == tag) {
+			rb_erase(&st_entry->sock_node, &sock_tag_tree);
+			/* Can't sockfd_put() within spinlock, do it later. */
+			sock_tag_tree_insert(st_entry, &st_to_free_tree);
+			tr_entry = lookup_tag_ref(st_entry->tag, NULL);
+			/* Same sanity check as the retag path of ctrl_cmd_tag(). */
+			BUG_ON(IS_ERR_OR_NULL(tr_entry));
+			BUG_ON(tr_entry->num_sock_tags <= 0);
+			tr_entry->num_sock_tags--;
+			/*
+			 * TODO: remove if, and start failing.
+			 * This is a hack to work around the fact that in some
+			 * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
+			 * and are trying to work around apps
+			 * that didn't open the /dev/xt_qtaguid.
+			 */
+			if (st_entry->list.next && st_entry->list.prev)
+				list_del(&st_entry->list);
+		}
+	}
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	sock_tag_tree_erase(&st_to_free_tree);
+
+	/* Delete tag counter-sets */
+	spin_lock_bh(&tag_counter_set_list_lock);
+	/*
+	 * Counter sets are only on the uid tag, not full tag.
+	 * NOTE(review): the lookup uses the full tag, so it can presumably
+	 * only match when acct_tag is 0 - confirm this is intended.
+	 */
+	tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+	if (tcs_entry) {
+		CT_DEBUG("qtaguid: ctrl_delete(%s): "
+			 "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
+			 input,
+			 tcs_entry->tn.tag,
+			 get_uid_from_tag(tcs_entry->tn.tag),
+			 tcs_entry->active_set);
+		rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
+		kfree(tcs_entry);
+	}
+	spin_unlock_bh(&tag_counter_set_list_lock);
+
+	/*
+	 * If acct_tag is 0, then all entries belonging to uid are
+	 * erased.
+	 */
+	spin_lock_bh(&iface_stat_list_lock);
+	list_for_each_entry(iface_entry, &iface_stat_list, list) {
+		spin_lock_bh(&iface_entry->tag_stat_list_lock);
+		node = rb_first(&iface_entry->tag_stat_tree);
+		while (node) {
+			ts_entry = rb_entry(node, struct tag_stat, tn.node);
+			entry_uid = get_uid_from_tag(ts_entry->tn.tag);
+			/* Advance first: ts_entry may be freed below. */
+			node = rb_next(node);
+
+			CT_DEBUG("qtaguid: ctrl_delete(%s): "
+				 "ts tag=0x%llx (uid=%u)\n",
+				 input, ts_entry->tn.tag, entry_uid);
+
+			if (entry_uid != uid)
+				continue;
+			if (!acct_tag || ts_entry->tn.tag == tag) {
+				CT_DEBUG("qtaguid: ctrl_delete(%s): "
+					 "erase ts: %s 0x%llx %u\n",
+					 input, iface_entry->ifname,
+					 get_atag_from_tag(ts_entry->tn.tag),
+					 entry_uid);
+				rb_erase(&ts_entry->tn.node,
+					 &iface_entry->tag_stat_tree);
+				kfree(ts_entry);
+			}
+		}
+		spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+	}
+	spin_unlock_bh(&iface_stat_list_lock);
+
+	/* Cleanup the uid_tag_data */
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	node = rb_first(&uid_tag_data_tree);
+	while (node) {
+		utd_entry = rb_entry(node, struct uid_tag_data, node);
+		entry_uid = utd_entry->uid;
+		node = rb_next(node);
+
+		CT_DEBUG("qtaguid: ctrl_delete(%s): "
+			 "utd uid=%u\n",
+			 input, entry_uid);
+
+		if (entry_uid != uid)
+			continue;
+		/*
+		 * Go over the tag_refs, and those that don't have
+		 * sock_tags using them are freed.
+		 */
+		put_tag_ref_tree(tag, utd_entry);
+		put_utd_entry(utd_entry);
+	}
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+
+	atomic64_inc(&qtu_events.delete_cmds);
+	res = 0;
+
+err:
+	return res;
+}
+
+/*
+ * Select the active counter set for a uid.
+ *
+ * @input is parsed as "<cmd> <counter_set> <uid>"; all three fields are
+ * required. The counter set entry for the uid's tag is created on first
+ * use and its active_set is updated.
+ *
+ * Returns 0 on success, -EINVAL for a malformed command or a counter set
+ * outside [0, IFS_MAX_COUNTER_SETS), -EPERM when the caller fails
+ * can_manipulate_uids(), -ENOMEM when a new entry cannot be allocated.
+ */
+static int ctrl_cmd_counter_set(const char *input)
+{
+	char cmd;
+	uid_t uid = 0;
+	tag_t tag;
+	int res, argc;
+	struct tag_counter_set *tcs;
+	/*
+	 * Printed by CT_DEBUG before argc is checked, so it needs a defined
+	 * value even when sscanf() converts fewer than two fields.
+	 */
+	int counter_set = 0;
+
+	argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
+	CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
+		 "set=%d uid=%u\n", input, argc, cmd,
+		 counter_set, uid);
+	if (argc != 3) {
+		res = -EINVAL;
+		goto err;
+	}
+	if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
+		pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
+			input);
+		res = -EINVAL;
+		goto err;
+	}
+	if (!can_manipulate_uids()) {
+		pr_info("qtaguid: ctrl_counterset(%s): "
+			"insufficient priv from pid=%u tgid=%u uid=%u\n",
+			input, current->pid, current->tgid, current_fsuid());
+		res = -EPERM;
+		goto err;
+	}
+
+	/* Counter sets are keyed by the uid-only tag. */
+	tag = make_tag_from_uid(uid);
+	spin_lock_bh(&tag_counter_set_list_lock);
+	tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+	if (!tcs) {
+		/* Allocating under a spinlock, hence GFP_ATOMIC. */
+		tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
+		if (!tcs) {
+			spin_unlock_bh(&tag_counter_set_list_lock);
+			pr_err("qtaguid: ctrl_counterset(%s): "
+			       "failed to alloc counter set\n",
+			       input);
+			res = -ENOMEM;
+			goto err;
+		}
+		tcs->tn.tag = tag;
+		tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
+		CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
+			 "(uid=%u) set=%d\n",
+			 input, tag, get_uid_from_tag(tag), counter_set);
+	}
+	tcs->active_set = counter_set;
+	spin_unlock_bh(&tag_counter_set_list_lock);
+	atomic64_inc(&qtu_events.counter_set_changes);
+	res = 0;
+
+err:
+	return res;
+}
+
+/*
+ * Tag (or re-tag) a socket.
+ *
+ * @input is parsed as "<cmd> <sock_fd> [<acct_tag> [<uid>]]". A missing
+ * acct_tag defaults to 0 and a missing uid to the caller's fsuid; an
+ * explicit uid requires can_impersonate_uid().
+ *
+ * On success the file reference taken by sockfd_lookup() is kept until
+ * the socket is untagged. When an already tagged socket is re-tagged, the
+ * reference held since the earlier tagging is dropped instead.
+ *
+ * Returns 0 on success, -EINVAL for a malformed command or invalid tag,
+ * -EPERM when the caller may not act on the requested uid, -ENOMEM when
+ * the sock_tag cannot be allocated, or the error reported by
+ * sockfd_lookup() or get_tag_ref().
+ */
+static int ctrl_cmd_tag(const char *input)
+{
+	char cmd;
+	int sock_fd = 0;
+	uid_t uid = 0;
+	tag_t acct_tag = make_atag_from_value(0);
+	tag_t full_tag;
+	struct socket *el_socket;
+	int res, argc;
+	struct sock_tag *sock_tag_entry;
+	struct tag_ref *tag_ref_entry;
+	struct uid_tag_data *uid_tag_data_entry;
+	struct proc_qtu_data *pqd_entry;
+
+	/* Unassigned args will get defaulted later. */
+	argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid);
+	CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
+		 "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
+		 acct_tag, uid);
+	if (argc < 2) {
+		res = -EINVAL;
+		goto err;
+	}
+	el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
+	if (!el_socket) {
+		pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
+			" sock_fd=%d err=%d\n", input, sock_fd, res);
+		goto err;
+	}
+	CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n",
+		 input, atomic_long_read(&el_socket->file->f_count),
+		 el_socket->sk);
+	if (argc < 3) {
+		acct_tag = make_atag_from_value(0);
+	} else if (!valid_atag(acct_tag)) {
+		pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
+		res = -EINVAL;
+		goto err_put;
+	}
+	CT_DEBUG("qtaguid: ctrl_tag(%s): "
+		 "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
+		 "in_group=%d in_egroup=%d\n",
+		 input, current->pid, current->tgid, current_uid(),
+		 current_euid(), current_fsuid(),
+		 in_group_p(proc_ctrl_write_gid),
+		 in_egroup_p(proc_ctrl_write_gid));
+	if (argc < 4) {
+		uid = current_fsuid();
+	} else if (!can_impersonate_uid(uid)) {
+		pr_info("qtaguid: ctrl_tag(%s): "
+			"insufficient priv from pid=%u tgid=%u uid=%u\n",
+			input, current->pid, current->tgid, current_fsuid());
+		res = -EPERM;
+		goto err_put;
+	}
+	full_tag = combine_atag_with_uid(acct_tag, uid);
+
+	spin_lock_bh(&sock_tag_list_lock);
+	sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+	tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
+	if (IS_ERR(tag_ref_entry)) {
+		res = PTR_ERR(tag_ref_entry);
+		spin_unlock_bh(&sock_tag_list_lock);
+		goto err_put;
+	}
+	/* Count the new tag's user up front; undone at err_tag_unref_put. */
+	tag_ref_entry->num_sock_tags++;
+	if (sock_tag_entry) {
+		struct tag_ref *prev_tag_ref_entry;
+
+		CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
+			 "st@%p ...->f_count=%ld\n",
+			 input, el_socket->sk, sock_tag_entry,
+			 atomic_long_read(&el_socket->file->f_count));
+		/*
+		 * This is a re-tagging, so release the sock_fd that was
+		 * locked at the time of the 1st tagging.
+		 * There is still the ref from this call's sockfd_lookup() so
+		 * it can be done within the spinlock.
+		 */
+		sockfd_put(sock_tag_entry->socket);
+		/* The previous tag loses this socket as a user. */
+		prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
+						    &uid_tag_data_entry);
+		BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
+		BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
+		prev_tag_ref_entry->num_sock_tags--;
+		sock_tag_entry->tag = full_tag;
+	} else {
+		CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
+			 input, el_socket->sk);
+		sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
+					 GFP_ATOMIC);
+		if (!sock_tag_entry) {
+			pr_err("qtaguid: ctrl_tag(%s): "
+			       "socket tag alloc failed\n",
+			       input);
+			spin_unlock_bh(&sock_tag_list_lock);
+			res = -ENOMEM;
+			/*
+			 * NOTE(review): the tag_ref bookkeeping at
+			 * err_tag_unref_put runs after the lock was dropped
+			 * here - confirm this is safe.
+			 */
+			goto err_tag_unref_put;
+		}
+		sock_tag_entry->sk = el_socket->sk;
+		sock_tag_entry->socket = el_socket;
+		sock_tag_entry->pid = current->tgid;
+		sock_tag_entry->tag = combine_atag_with_uid(acct_tag,
+							    uid);
+		/* Nested inside sock_tag_list_lock. */
+		spin_lock_bh(&uid_tag_data_tree_lock);
+		pqd_entry = proc_qtu_data_tree_search(
+			&proc_qtu_data_tree, current->tgid);
+		/*
+		 * TODO: remove if, and start failing.
+		 * At first, we want to catch user-space code that is not
+		 * opening the /dev/xt_qtaguid.
+		 */
+		if (IS_ERR_OR_NULL(pqd_entry))
+			pr_warn_once(
+				"qtaguid: %s(): "
+				"User space forgot to open /dev/xt_qtaguid? "
+				"pid=%u tgid=%u uid=%u\n", __func__,
+				current->pid, current->tgid,
+				current_fsuid());
+		else
+			list_add(&sock_tag_entry->list,
+				 &pqd_entry->sock_tag_list);
+		spin_unlock_bh(&uid_tag_data_tree_lock);
+
+		sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
+		atomic64_inc(&qtu_events.sockets_tagged);
+	}
+	spin_unlock_bh(&sock_tag_list_lock);
+	/* We keep the ref to the socket (file) until it is untagged */
+	CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n",
+		 input, sock_tag_entry,
+		 atomic_long_read(&el_socket->file->f_count));
+	return 0;
+
+err_tag_unref_put:
+	/* Undo the num_sock_tags++ done right after get_tag_ref(). */
+	BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+	tag_ref_entry->num_sock_tags--;
+	free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry);
+err_put:
+	CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n",
+		 input, atomic_long_read(&el_socket->file->f_count) - 1);
+	/* Release the sock_fd that was grabbed by sockfd_lookup(). */
+	sockfd_put(el_socket);
+	return res;
+
+err:
+	CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
+	return res;
+}
+
+static int ctrl_cmd_untag(const char *input)
+{
+	char cmd;
+	int sock_fd = 0;
+	struct socket *el_socket;
+	int res, argc;
+	struct sock_tag *sock_tag_entry;
+	struct tag_ref *tag_ref_entry;
+	struct uid_tag_data *utd_entry;
+	struct proc_qtu_data *pqd_entry;
+	/* Untag command; input is parsed as "<cmd char> <socket fd>". */
+	argc = sscanf(input, "%c %d", &cmd, &sock_fd);
+	CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
+		 input, argc, cmd, sock_fd);
+	if (argc < 2) {
+		res = -EINVAL;
+		goto err;
+	}
+	el_socket = sockfd_lookup(sock_fd, &res);  /* Takes a ref on the file */
+	if (!el_socket) {
+		pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
+			" sock_fd=%d err=%d\n", input, sock_fd, res);
+		goto err;
+	}
+	CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
+		 input, atomic_long_read(&el_socket->file->f_count),
+		 el_socket->sk);
+	spin_lock_bh(&sock_tag_list_lock);
+	sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+	if (!sock_tag_entry) {
+		spin_unlock_bh(&sock_tag_list_lock);
+		res = -EINVAL;
+		goto err_put;
+	}
+	/*
+	 * The socket already belongs to the current process
+	 * so it can do whatever it wants to it.
+	 */
+	rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
+
+	tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
+	BUG_ON(!tag_ref_entry);
+	BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	pqd_entry = proc_qtu_data_tree_search(
+		&proc_qtu_data_tree, current->tgid);
+	/*
+	 * TODO: remove if, and start failing. For now only warn about
+	 * user space that did not open /dev/xt_qtaguid.
+	 * NOTE(review): list_del() assumes ctrl_cmd_tag() did the list_add().
+	 */
+	if (IS_ERR_OR_NULL(pqd_entry))
+		pr_warn_once("qtaguid: %s(): "
+			     "User space forgot to open /dev/xt_qtaguid? "
+			     "pid=%u tgid=%u uid=%u\n", __func__,
+			     current->pid, current->tgid, current_fsuid());
+	else
+		list_del(&sock_tag_entry->list);
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	/*
+	 * We don't free tag_ref from the utd_entry here,
+	 * only during a cmd_delete().
+	 */
+	tag_ref_entry->num_sock_tags--;
+	spin_unlock_bh(&sock_tag_list_lock);
+	/*
+	 * Release the sock_fd that was grabbed at tag time,
+	 * and once more for the sockfd_lookup() here.
+	 */
+	sockfd_put(sock_tag_entry->socket);
+	CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n",
+		 input, sock_tag_entry,
+		 atomic_long_read(&el_socket->file->f_count) - 1);
+	sockfd_put(el_socket);
+
+	kfree(sock_tag_entry);
+	atomic64_inc(&qtu_events.sockets_untagged);
+
+	return 0;
+
+err_put:
+	CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n",
+		 input, atomic_long_read(&el_socket->file->f_count) - 1);
+	/* Release the sock_fd that was grabbed by sockfd_lookup(). */
+	sockfd_put(el_socket);
+	return res;
+
+err:
+	CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input);
+	return res;
+}
+
+static int qtaguid_ctrl_parse(const char *input, int count)
+{
+	int status;
+
+	/*
+	 * The first character selects the handler; handlers re-parse input.
+	 */
+	switch (input[0]) {
+	case 'd':
+		status = ctrl_cmd_delete(input);
+		break;
+
+	case 's':
+		status = ctrl_cmd_counter_set(input);
+		break;
+
+	case 't':
+		status = ctrl_cmd_tag(input);
+		break;
+
+	case 'u':
+		status = ctrl_cmd_untag(input);
+		break;
+
+	default:
+		status = -EINVAL;
+		break;
+	}
+	/* A handler returning 0 is reported as "count bytes consumed". */
+	if (status == 0)
+		status = count;
+	CT_DEBUG("qtaguid: ctrl(%s): res=%d\n", input, status);
+	return status;
+}
+
+#define MAX_QTAGUID_CTRL_INPUT_LEN 255
+static int qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
+			unsigned long count, void *data)
+{
+	char cmd_line[MAX_QTAGUID_CTRL_INPUT_LEN];
+
+	/* Passive mode: report the write as consumed, change nothing. */
+	if (unlikely(module_passive))
+		return count;
+	/* Keep one byte free for the terminating NUL added below. */
+	if (count > sizeof(cmd_line) - 1)
+		return -EINVAL;
+	if (copy_from_user(cmd_line, buffer, count) != 0)
+		return -EFAULT;
+
+	cmd_line[count] = '\0';
+	return qtaguid_ctrl_parse(cmd_line, count);
+}
+
+struct proc_print_info {
+	char *outp;	/* where the next line is written in the page */
+	char **num_items_returned;	/* *ptr is bumped per line emitted */
+	struct iface_stat *iface_entry;	/* iface currently being printed */
+	struct tag_stat *ts_entry;	/* tag_stat currently being printed */
+	int item_index;	/* items seen so far; 0 means header not done */
+	int items_to_skip;	/* items below this index are not printed */
+	int char_count;	/* bytes still free at outp */
+};
+
+static int pp_stats_line(struct proc_print_info *ppi, int cnt_set)
+{
+	int len;	/* snprintf() result; 0 is returned when nothing is printed */
+	struct data_counters *cnts;
+	/* Item 0 is the column header; later items are one tag_stat row. */
+	if (!ppi->item_index) {
+		if (ppi->item_index++ < ppi->items_to_skip)
+			return 0;
+		len = snprintf(ppi->outp, ppi->char_count,
+			       "idx iface acct_tag_hex uid_tag_int cnt_set "
+			       "rx_bytes rx_packets "
+			       "tx_bytes tx_packets "
+			       "rx_tcp_bytes rx_tcp_packets "
+			       "rx_udp_bytes rx_udp_packets "
+			       "rx_other_bytes rx_other_packets "
+			       "tx_tcp_bytes tx_tcp_packets "
+			       "tx_udp_bytes tx_udp_packets "
+			       "tx_other_bytes tx_other_packets\n");
+	} else {
+		tag_t tag = ppi->ts_entry->tn.tag;
+		uid_t stat_uid = get_uid_from_tag(tag);
+		/* Rows the caller may not read are dropped without using an index. */
+		if (!can_read_other_uid_stats(stat_uid)) {
+			CT_DEBUG("qtaguid: stats line: "
+				 "%s 0x%llx %u: insufficient priv "
+				 "from pid=%u tgid=%u uid=%u\n",
+				 ppi->iface_entry->ifname,
+				 get_atag_from_tag(tag), stat_uid,
+				 current->pid, current->tgid, current_fsuid());
+			return 0;
+		}
+		if (ppi->item_index++ < ppi->items_to_skip)
+			return 0;
+		cnts = &ppi->ts_entry->counters;
+		len = snprintf(
+			ppi->outp, ppi->char_count,
+			"%d %s 0x%llx %u %u "
+			"%llu %llu "
+			"%llu %llu "
+			"%llu %llu "
+			"%llu %llu "
+			"%llu %llu "
+			"%llu %llu "
+			"%llu %llu "
+			"%llu %llu\n",
+			ppi->item_index,
+			ppi->iface_entry->ifname,
+			get_atag_from_tag(tag),
+			stat_uid,
+			cnt_set,
+			dc_sum_bytes(cnts, cnt_set, IFS_RX),
+			dc_sum_packets(cnts, cnt_set, IFS_RX),
+			dc_sum_bytes(cnts, cnt_set, IFS_TX),
+			dc_sum_packets(cnts, cnt_set, IFS_TX),
+			cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+			cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+			cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+			cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+			cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+			cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+			cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+			cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+			cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+			cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+			cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+			cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+	}
+	return len;
+}
+
+static bool pp_sets(struct proc_print_info *ppi)
+{
+	int set;
+	/* Print one stats line per counter set; false once the page is full. */
+	for (set = 0; set < IFS_MAX_COUNTER_SETS; set++) {
+		int written = pp_stats_line(ppi, set);
+
+		if (written >= ppi->char_count) {
+			*ppi->outp = '\0';
+			return false;	/* line did not fit in the page */
+		}
+		if (!written)
+			continue;	/* nothing was printed for this set */
+		ppi->outp += written;
+		ppi->char_count -= written;
+		(*ppi->num_items_returned)++;
+	}
+	return true;
+}
+
+/*
+ * Procfs reader to get all tag stats using style "1)" as described in
+ * fs/proc/generic.c. Groups all protocols tx/rx bytes.
+ * Returns bytes written to page; bumps *num_items_returned per line.
+ */
+static int qtaguid_stats_proc_read(char *page, char **num_items_returned,
+				off_t items_to_skip, int char_count, int *eof,
+				void *data)
+{
+	struct proc_print_info ppi;
+	int len;
+
+	ppi.outp = page;
+	ppi.item_index = 0;
+	ppi.char_count = char_count;
+	ppi.num_items_returned = num_items_returned;
+	ppi.items_to_skip = items_to_skip;
+	/* Passive mode: emit only the header line and report EOF. */
+	if (unlikely(module_passive)) {
+		len = pp_stats_line(&ppi, 0);
+		/* The header should always be shorter than the buffer. */
+		BUG_ON(len >= ppi.char_count);
+		(*num_items_returned)++;
+		*eof = 1;
+		return len;
+	}
+
+	CT_DEBUG("qtaguid:proc stats page=%p *num_items_returned=%p off=%ld "
+		"char_count=%d *eof=%d\n", page, *num_items_returned,
+		items_to_skip, char_count, *eof);
+
+	if (*eof)
+		return 0;
+
+	/* The idx is there to help debug when things go belly up. */
+	len = pp_stats_line(&ppi, 0);
+	/* Don't advance the outp unless the whole line was printed */
+	if (len >= ppi.char_count) {
+		*ppi.outp = '\0';
+		return ppi.outp - page;
+	}
+	if (len) {
+		ppi.outp += len;
+		ppi.char_count -= len;
+		(*num_items_returned)++;
+	}
+	/* Walk every iface, then every tag_stat in that iface's tree. */
+	spin_lock_bh(&iface_stat_list_lock);
+	list_for_each_entry(ppi.iface_entry, &iface_stat_list, list) {
+		struct rb_node *node;
+		spin_lock_bh(&ppi.iface_entry->tag_stat_list_lock);
+		for (node = rb_first(&ppi.iface_entry->tag_stat_tree);
+		     node;
+		     node = rb_next(node)) {
+			ppi.ts_entry = rb_entry(node, struct tag_stat, tn.node);
+			if (!pp_sets(&ppi)) {	/* page full: stop without eof */
+				spin_unlock_bh(
+					&ppi.iface_entry->tag_stat_list_lock);
+				spin_unlock_bh(&iface_stat_list_lock);
+				return ppi.outp - page;
+			}
+		}
+		spin_unlock_bh(&ppi.iface_entry->tag_stat_list_lock);
+	}
+	spin_unlock_bh(&iface_stat_list_lock);
+
+	*eof = 1;
+	return ppi.outp - page;
+}
+
+/*------------------------------------------*/
+static int qtudev_open(struct inode *inode, struct file *file)
+{
+	struct uid_tag_data *utd_entry;
+	struct proc_qtu_data  *pqd_entry;
+	struct proc_qtu_data  *new_pqd_entry;
+	int res;
+	bool utd_entry_found;
+	/* Register a proc_qtu_data for current->tgid; one open per process. */
+	if (unlikely(qtu_proc_handling_passive))
+		return 0;
+
+	DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
+		 current->pid, current->tgid, current_fsuid());
+
+	spin_lock_bh(&uid_tag_data_tree_lock);
+
+	/* Look for existing uid data, or alloc one (a NULL counts as -ENOMEM). */
+	utd_entry = get_uid_data(current_fsuid(), &utd_entry_found);
+	if (IS_ERR_OR_NULL(utd_entry)) {
+		res = utd_entry ? PTR_ERR(utd_entry) : -ENOMEM;
+		goto err_unlock;
+	}
+
+	/* Look for existing PID based proc_data */
+	pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
+					      current->tgid);
+	if (pqd_entry) {
+		pr_err("qtaguid: qtudev_open(): %u/%u %u "
+		       "%s already opened\n",
+		       current->pid, current->tgid, current_fsuid(),
+		       QTU_DEV_NAME);
+		res = -EBUSY;
+		goto err_unlock_free_utd;
+	}
+
+	new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
+	if (!new_pqd_entry) {
+		pr_err("qtaguid: qtudev_open(): %u/%u %u: "
+		       "proc data alloc failed\n",
+		       current->pid, current->tgid, current_fsuid());
+		res = -ENOMEM;
+		goto err_unlock_free_utd;
+	}
+	new_pqd_entry->pid = current->tgid;
+	INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
+	new_pqd_entry->parent_tag_data = utd_entry;
+	utd_entry->num_pqd++;
+
+	proc_qtu_data_tree_insert(new_pqd_entry,
+				  &proc_qtu_data_tree);
+
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
+		 current_fsuid(), new_pqd_entry);
+	file->private_data = new_pqd_entry;
+	return 0;
+	/* All error exits below drop uid_tag_data_tree_lock before returning. */
+err_unlock_free_utd:
+	if (!utd_entry_found) {
+		rb_erase(&utd_entry->node, &uid_tag_data_tree);
+		kfree(utd_entry);
+	}
+err_unlock:
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	return res;
+}
+
+static int qtudev_release(struct inode *inode, struct file *file)
+{
+	struct proc_qtu_data  *pqd_entry = file->private_data;
+	struct uid_tag_data  *utd_entry = pqd_entry->parent_tag_data;
+	struct sock_tag *st_entry;
+	struct rb_root st_to_free_tree = RB_ROOT;
+	struct list_head *entry, *next;
+	struct tag_ref *tr;
+	/* NOTE(review): pqd_entry is dereferenced above even when passive. */
+	if (unlikely(qtu_proc_handling_passive))
+		return 0;
+
+	/*
+	 * Do not trust the current->pid, it might just be a kworker cleaning
+	 * up after a dead proc.
+	 */
+	DR_DEBUG("qtaguid: qtudev_release(): "
+		 "pid=%u tgid=%u uid=%u "
+		 "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
+		 current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
+		 pqd_entry, pqd_entry->pid, utd_entry,
+		 utd_entry->num_active_tags);
+
+	spin_lock_bh(&sock_tag_list_lock);
+	spin_lock_bh(&uid_tag_data_tree_lock);
+	/* Drop every sock_tag still listed under this proc_qtu_data. */
+	list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
+		st_entry = list_entry(entry, struct sock_tag, list);
+		DR_DEBUG("qtaguid: %s(): "
+			 "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
+			 __func__,
+			 st_entry, st_entry->sk,
+			 current->pid, current->tgid,
+			 pqd_entry->parent_tag_data->uid);
+
+		utd_entry = uid_tag_data_tree_search(
+			&uid_tag_data_tree,
+			get_uid_from_tag(st_entry->tag));
+		BUG_ON(IS_ERR_OR_NULL(utd_entry));
+		DR_DEBUG("qtaguid: %s(): "
+			 "looking for tag=0x%llx in utd_entry=%p\n", __func__,
+			 st_entry->tag, utd_entry);
+		tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
+					 st_entry->tag);
+		BUG_ON(!tr);
+		BUG_ON(tr->num_sock_tags <= 0);
+		tr->num_sock_tags--;
+		free_tag_ref_from_utd_entry(tr, utd_entry);
+
+		rb_erase(&st_entry->sock_node, &sock_tag_tree);
+		list_del(&st_entry->list);
+		/* Can't sockfd_put() within spinlock, do it later. */
+		sock_tag_tree_insert(st_entry, &st_to_free_tree);
+
+		/*
+		 * Try to free the utd_entry if no other proc_qtu_data is
+		 * using it (num_pqd is 0) and it doesn't have active tags
+		 * (num_active_tags is 0).
+		 */
+		put_utd_entry(utd_entry);
+	}
+	/* Unlink and free the per-process entry itself. */
+	rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
+	BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
+	pqd_entry->parent_tag_data->num_pqd--;
+	put_utd_entry(pqd_entry->parent_tag_data);
+	kfree(pqd_entry);
+	file->private_data = NULL;
+
+	spin_unlock_bh(&uid_tag_data_tree_lock);
+	spin_unlock_bh(&sock_tag_list_lock);
+
+	/* Deferred per-entry cleanup, now that both spinlocks are dropped. */
+	sock_tag_tree_erase(&st_to_free_tree);
+
+	prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__,
+			   current->pid, current->tgid);
+	return 0;
+}
+
+/*------------------------------------------*/
+static const struct file_operations qtudev_fops = {
+	.owner = THIS_MODULE,
+	.open = qtudev_open,	/* creates the per-process proc_qtu_data */
+	.release = qtudev_release,	/* frees it and its remaining sock_tags */
+};
+
+static struct miscdevice qtu_device = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = QTU_DEV_NAME,	/* the device user space opens (see qtudev_open) */
+	.fops = &qtudev_fops,
+	/* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
+};
+
+/*------------------------------------------*/
+static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
+{
+	int ret;	/* 0, or -ENOMEM when a proc entry cannot be created */
+	*res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
+	if (!*res_procdir) {
+		pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
+		ret = -ENOMEM;
+		goto no_dir;
+	}
+	/* "ctrl": gets both a read and a write handler. */
+	xt_qtaguid_ctrl_file = create_proc_entry("ctrl", proc_ctrl_perms,
+						*res_procdir);
+	if (!xt_qtaguid_ctrl_file) {
+		pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
+			" file\n");
+		ret = -ENOMEM;
+		goto no_ctrl_entry;
+	}
+	xt_qtaguid_ctrl_file->read_proc = qtaguid_ctrl_proc_read;
+	xt_qtaguid_ctrl_file->write_proc = qtaguid_ctrl_proc_write;
+	/* "stats": read handler only for now (see the TODO below). */
+	xt_qtaguid_stats_file = create_proc_entry("stats", proc_stats_perms,
+						*res_procdir);
+	if (!xt_qtaguid_stats_file) {
+		pr_err("qtaguid: failed to create xt_qtaguid/stats "
+			"file\n");
+		ret = -ENOMEM;
+		goto no_stats_entry;
+	}
+	xt_qtaguid_stats_file->read_proc = qtaguid_stats_proc_read;
+	/*
+	 * TODO: add support counter hacking
+	 * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
+	 */
+	return 0;
+	/* Unwind in reverse order, with the same name/parent used to create. */
+no_stats_entry:
+	remove_proc_entry("ctrl", *res_procdir);
+no_ctrl_entry:
+	remove_proc_entry(module_procdirname, init_net.proc_net);
+no_dir:
+	return ret;
+}
+
+static struct xt_match qtaguid_mt_reg __read_mostly = {
+	/*
+	 * This module masquerades as the "owner" module so that iptables
+	 * tools can deal with it.
+	 */
+	.name       = "owner",
+	.revision   = 1,
+	.family     = NFPROTO_UNSPEC,
+	.match      = qtaguid_mt,	/* the match callback iptables invokes */
+	.matchsize  = sizeof(struct xt_qtaguid_match_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init qtaguid_mt_init(void)
+{
+	/* Return the first failure's own code; nothing is unwound (as before). */
+	int ret = qtaguid_proc_register(&xt_qtaguid_procdir);
+	ret = ret ? ret : iface_stat_init(xt_qtaguid_procdir);
+	ret = ret ? ret : xt_register_match(&qtaguid_mt_reg);
+	ret = ret ? ret : misc_register(&qtu_device);
+	return ret;
+}
+
+/*
+ * TODO: allow unloading of the module.
+ * For now stats are permanent.
+ * Kconfig forces 'y/n' and never an 'm'.
+ */
+
+module_init(qtaguid_mt_init);
+MODULE_AUTHOR("jpa <jpa@google.com>");
+MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
+MODULE_ALIAS("ipt_qtaguid");
+MODULE_ALIAS("ip6t_qtaguid");
diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
new file mode 100644
index 00000000..02479d6d
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_internal.h
@@ -0,0 +1,330 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_INTERNAL_H__
+#define __XT_QTAGUID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue.h>
+
+/* Iface handling */
+#define IDEBUG_MASK (1<<0)
+/* Iptable Matching. Per packet. */
+#define MDEBUG_MASK (1<<1)
+/* Red-black tree handling. Per packet. */
+#define RDEBUG_MASK (1<<2)
+/* procfs ctrl/stats handling */
+#define CDEBUG_MASK (1<<3)
+/* dev and resource tracking */
+#define DDEBUG_MASK (1<<4)
+
+/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
+#define DEFAULT_DEBUG_MASK 0
+
+/*
+ * (Un)Define these *DEBUG to compile out/in the pr_debug calls.
+ * All undef: text size ~ 0x3030; all def: ~ 0x4404.
+ */
+#define IDEBUG
+#define MDEBUG
+#define RDEBUG
+#define CDEBUG
+#define DDEBUG
+
+#define MSK_DEBUG(mask, ...) do {                           \
+		if (unlikely(qtaguid_debug_mask & (mask)))  \
+			pr_debug(__VA_ARGS__);              \
+	} while (0)
+#ifdef IDEBUG
+#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
+#else
+#define IF_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef MDEBUG
+#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
+#else
+#define MT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef RDEBUG
+#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
+#else
+#define RB_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef CDEBUG
+#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
+#else
+#define CT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef DDEBUG
+#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
+#else
+#define DR_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+
+extern uint qtaguid_debug_mask;
+
+/*---------------------------------------------------------------------------*/
+/*
+ * Tags:
+ *
+ * They represent what the data usage counters will be tracked against.
+ * By default a tag is just based on the UID.
+ * The UID is used as the base for policing, and can not be ignored.
+ * So a tag will always at least represent a UID (uid_tag).
+ *
+ * A tag can be augmented with an "accounting tag" which is associated
+ * with a UID.
+ * User space can set the acct_tag portion of the tag which is then used
+ * with sockets: all data belonging to that socket will be counted against the
+ * tag. The policing is then based on the tag's uid_tag portion,
+ * and stats are collected for the acct_tag portion separately.
+ *
+ * There could be
+ * a:  {acct_tag=1, uid_tag=10003}
+ * b:  {acct_tag=2, uid_tag=10003}
+ * c:  {acct_tag=3, uid_tag=10003}
+ * d:  {acct_tag=0, uid_tag=10003}
+ * a, b, and c represent tags associated with specific sockets.
+ * d is for the totals for that uid, including all untagged traffic.
+ * Typically d is used with policing/quota rules.
+ *
+ * We want tag_t big enough to distinguish uid_t and acct_tag.
+ * It might become a struct if needed.
+ * Nothing should be using it as an int.
+ */
+typedef uint64_t tag_t;  /* Only used via accessors */
+
+#define TAG_UID_MASK 0xFFFFFFFFULL
+#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
+
+static inline int tag_compare(tag_t t1, tag_t t2)
+{
+	return (t1 > t2) - (t1 < t2);	/* -1, 0 or 1 */
+}
+
+static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
+{
+	return (tag_t)uid | acct_tag;	/* uid bits below, acct_tag bits above */
+}
+static inline tag_t make_tag_from_uid(uid_t uid)
+{
+	return (tag_t)uid;	/* a tag with no acct_tag part */
+}
+static inline uid_t get_uid_from_tag(tag_t tag)
+{
+	return (uid_t)(tag & TAG_UID_MASK);
+}
+static inline tag_t get_utag_from_tag(tag_t tag)
+{
+	return TAG_UID_MASK & tag;	/* same bits as the uid, kept as tag_t */
+}
+static inline tag_t get_atag_from_tag(tag_t tag)
+{
+	return TAG_ACCT_MASK & tag;
+}
+
+static inline bool valid_atag(tag_t tag)
+{
+	return (tag & TAG_UID_MASK) == 0;	/* no bits in the uid part */
+}
+static inline tag_t make_atag_from_value(uint32_t value)
+{
+	return ((tag_t)value) << 32;
+}
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Maximum number of socket tags that a UID is allowed to have active.
+ * Multiple processes belonging to the same UID contribute towards this limit.
+ * Special UIDs that can impersonate a UID also contribute (e.g. download
+ * manager, ...)
+ */
+#define DEFAULT_MAX_SOCK_TAGS 1024
+
+/*
+ * For now we only track 2 sets of counters.
+ * The default set is 0.
+ * Userspace can activate another set for a given uid being tracked.
+ */
+#define IFS_MAX_COUNTER_SETS 2
+
+enum ifs_tx_rx {
+	IFS_TX,
+	IFS_RX,
+	IFS_MAX_DIRECTIONS
+};
+
+/* For now, TCP, UDP, the rest */
+enum ifs_proto {
+	IFS_TCP,
+	IFS_UDP,
+	IFS_PROTO_OTHER,
+	IFS_MAX_PROTOS
+};
+
+struct byte_packet_counters {
+	uint64_t bytes;
+	uint64_t packets;
+};
+
+struct data_counters {
+	struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
+};
+
+/* Generic X based nodes used as a base for rb_tree ops */
+struct tag_node {
+	struct rb_node node;
+	tag_t tag;
+};
+
+struct tag_stat {
+	struct tag_node tn;
+	struct data_counters counters;
+	/*
+	 * If this tag is acct_tag based, we need to count against the
+	 * matching parent uid_tag.
+	 */
+	struct data_counters *parent_counters;
+};
+
+struct iface_stat {
+	struct list_head list;  /* in iface_stat_list */
+	char *ifname;
+	bool active;
+	/* net_dev is only valid for active iface_stat */
+	struct net_device *net_dev;
+
+	struct byte_packet_counters totals[IFS_MAX_DIRECTIONS];
+	/*
+	 * We keep the last_known, because some devices reset their counters
+	 * just before NETDEV_UP, while some will reset just before
+	 * NETDEV_REGISTER (which is more normal).
+	 * So now, if the device didn't do a NETDEV_UNREGISTER and we see
+	 * its current dev stats smaller than what was previously known, we
+	 * assume an UNREGISTER and just use the last_known.
+	 */
+	struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS];
+	/* last_known is usable when last_known_valid is true */
+	bool last_known_valid;
+
+	struct proc_dir_entry *proc_ptr;
+
+	struct rb_root tag_stat_tree;	/* tag_stat nodes, linked via tn.node */
+	spinlock_t tag_stat_list_lock;	/* taken while walking tag_stat_tree */
+};
+
+/* This is needed to create proc_dir_entries from atomic context. */
+struct iface_stat_work {
+	struct work_struct iface_work;
+	struct iface_stat *iface_entry;
+};
+
+/*
+ * Track tag that this socket is transferring data for, and not necessarily
+ * the uid that owns the socket.
+ * This is the tag against which tag_stat.counters will be billed.
+ * These structs need to be looked up by sock and pid.
+ */
+struct sock_tag {
+	struct rb_node sock_node;
+	struct sock *sk;  /* Only used as a number, never dereferenced */
+	/* The socket is needed for sockfd_put() */
+	struct socket *socket;
+	/* Used to associate with a given pid */
+	struct list_head list;   /* in proc_qtu_data.sock_tag_list */
+	pid_t pid;
+
+	tag_t tag;
+};
+
+struct qtaguid_event_counts {
+	/* Various successful events */
+	atomic64_t sockets_tagged;
+	atomic64_t sockets_untagged;
+	atomic64_t counter_set_changes;
+	atomic64_t delete_cmds;
+	atomic64_t iface_events;  /* Number of NETDEV_* events handled */
+
+	atomic64_t match_calls;   /* Number of times iptables called mt */
+	/*
+	 * match_found_sk_*: numbers related to the netfilter matching
+	 * function finding a sock for the sk_buff.
+	 * Total skbs processed is sum(match_found*).
+	 */
+	atomic64_t match_found_sk;   /* An sk was already in the sk_buff. */
+	/* The connection tracker had or didn't have the sk. */
+	atomic64_t match_found_sk_in_ct;
+	atomic64_t match_found_no_sk_in_ct;
+	/*
+	 * No sk could be found. No apparent owner. Could happen with
+	 * unsolicited traffic.
+	 */
+	atomic64_t match_no_sk;
+	/*
+	 * The file ptr in the sk_socket wasn't there.
+	 * This might happen for traffic while the socket is being closed.
+	 */
+	atomic64_t match_no_sk_file;
+};
+
+/* Track the set active_set for the given tag. */
+struct tag_counter_set {
+	struct tag_node tn;
+	int active_set;
+};
+
+/*----------------------------------------------*/
+/*
+ * The qtu uid data is used to track resources that are created directly or
+ * indirectly by processes (uid tracked).
+ * It is shared by the processes with the same uid.
+ * Some of the resource will be counted to prevent further rogue allocations,
+ * some will need freeing once the owner process (uid) exits.
+ */
+struct uid_tag_data {
+	struct rb_node node;
+	uid_t uid;
+
+	/*
+	 * For the uid, how many accounting tags have been set.
+	 */
+	int num_active_tags;
+	/* Track the number of proc_qtu_data that reference it */
+	int num_pqd;
+	struct rb_root tag_ref_tree;
+	/* No tag_node_tree_lock; use uid_tag_data_tree_lock */
+};
+
+struct tag_ref {
+	struct tag_node tn;
+
+	/*
+	 * This tracks the number of active sockets that have a tag on them
+	 * which matches this tag_ref.tn.tag.
+	 * A tag ref can live on after the sockets are untagged.
+	 * A tag ref can only be removed during a tag delete command.
+	 */
+	int num_sock_tags;
+};
+
+struct proc_qtu_data {
+	struct rb_node node;
+	pid_t pid;
+
+	struct uid_tag_data *parent_tag_data;
+
+	/* Tracks the sock_tags that need freeing upon this proc's death */
+	struct list_head sock_tag_list;
+	/* No spinlock_t sock_tag_list_lock; use the global one. */
+};
+
+/*----------------------------------------------*/
+#endif  /* ifndef __XT_QTAGUID_INTERNAL_H__ */
diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
new file mode 100644
index 00000000..39176785
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.c
@@ -0,0 +1,556 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Most of the functions in this file just waste time if DEBUG is not defined.
+ * The matching xt_qtaguid_print.h will static inline empty funcs if the needed
+ * debug flags are not defined.
+ * Those funcs that fail to allocate memory will panic as there is no need to
+ * hobble along just pretending to do the requested work.
+ */
+
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/net.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+
+
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+#ifdef DDEBUG
+
+static void _bug_on_err_or_null(void *ptr)
+{
+	if (IS_ERR_OR_NULL(ptr)) {
+		pr_err("qtaguid: kmalloc failed\n");
+		BUG();
+	}
+}
+
+/* Render *tag as a freshly kasprintf()ed string for debug output.
+ * The caller owns (kfree()s) the result; a failed allocation hits BUG(). */
+char *pp_tag_t(tag_t *tag)
+{
+	char *text = tag
+		? kasprintf(GFP_ATOMIC,
+			    "tag_t@%p{tag=0x%llx, uid=%u}",
+			    tag, *tag, get_uid_from_tag(*tag))
+		: kasprintf(GFP_ATOMIC, "tag_t@null{}");
+
+	_bug_on_err_or_null(text);
+	return text;
+}
+
+char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+	char *res;
+
+	if (!dc)
+		res = kasprintf(GFP_ATOMIC, "data_counters@null{}");
+	else if (showValues)
+		res = kasprintf(
+			GFP_ATOMIC, "data_counters@%p{"
+			"set0{"
+			"rx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}, "
+			"tx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}}, "
+			"set1{"
+			"rx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}, "
+			"tx{"
+			"tcp{b=%llu, p=%llu}, "
+			"udp{b=%llu, p=%llu},"
+			"other{b=%llu, p=%llu}}}}",
+			dc,
+			dc->bpc[0][IFS_RX][IFS_TCP].bytes,
+			dc->bpc[0][IFS_RX][IFS_TCP].packets,
+			dc->bpc[0][IFS_RX][IFS_UDP].bytes,
+			dc->bpc[0][IFS_RX][IFS_UDP].packets,
+			dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes,
+			dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets,
+			dc->bpc[0][IFS_TX][IFS_TCP].bytes,
+			dc->bpc[0][IFS_TX][IFS_TCP].packets,
+			dc->bpc[0][IFS_TX][IFS_UDP].bytes,
+			dc->bpc[0][IFS_TX][IFS_UDP].packets,
+			dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes,
+			dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets,
+			dc->bpc[1][IFS_RX][IFS_TCP].bytes,
+			dc->bpc[1][IFS_RX][IFS_TCP].packets,
+			dc->bpc[1][IFS_RX][IFS_UDP].bytes,
+			dc->bpc[1][IFS_RX][IFS_UDP].packets,
+			dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes,
+			dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets,
+			dc->bpc[1][IFS_TX][IFS_TCP].bytes,
+			dc->bpc[1][IFS_TX][IFS_TCP].packets,
+			dc->bpc[1][IFS_TX][IFS_UDP].bytes,
+			dc->bpc[1][IFS_TX][IFS_UDP].packets,
+			dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes,
+			dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets);
+	else
+		res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
+	_bug_on_err_or_null(res);
+	return res;
+}
+
+/* Describe a tag_node as "tag_node@<ptr>{tag=<pp_tag_t() text>}".
+ * Returns a kasprintf()ed string; allocation failure hits BUG(). */
+char *pp_tag_node(struct tag_node *tn)
+{
+	char *inner = NULL;	/* nested tag text, only built when tn != NULL */
+	char *out;
+
+	if (tn) {
+		inner = pp_tag_t(&tn->tag);
+		out = kasprintf(GFP_ATOMIC, "tag_node@%p{tag=%s}",
+				tn, inner);
+	} else {
+		out = kasprintf(GFP_ATOMIC, "tag_node@null{}");
+	}
+	_bug_on_err_or_null(out);
+	kfree(inner);	/* kfree(NULL) is a no-op on the null path */
+	return out;
+}
+
+char *pp_tag_ref(struct tag_ref *tr)
+{
+	char *tn_str;
+	char *res;
+
+	if (!tr) {
+		res = kasprintf(GFP_ATOMIC, "tag_ref@null{}");
+		_bug_on_err_or_null(res);
+		return res;
+	}
+	tn_str = pp_tag_node(&tr->tn);
+	res = kasprintf(GFP_ATOMIC,
+			"tag_ref@%p{%s, num_sock_tags=%d}",
+			tr, tn_str, tr->num_sock_tags);
+	_bug_on_err_or_null(res);
+	kfree(tn_str);
+	return res;
+}
+
+char *pp_tag_stat(struct tag_stat *ts)
+{
+	char *tn_str;
+	char *counters_str;
+	char *parent_counters_str;
+	char *res;
+
+	if (!ts) {
+		res = kasprintf(GFP_ATOMIC, "tag_stat@null{}");
+		_bug_on_err_or_null(res);
+		return res;
+	}
+	tn_str = pp_tag_node(&ts->tn);
+	counters_str = pp_data_counters(&ts->counters, true);
+	parent_counters_str = pp_data_counters(ts->parent_counters, false);
+	res = kasprintf(GFP_ATOMIC,
+			"tag_stat@%p{%s, counters=%s, parent_counters=%s}",
+			ts, tn_str, counters_str, parent_counters_str);
+	_bug_on_err_or_null(res);
+	kfree(tn_str);
+	kfree(counters_str);
+	kfree(parent_counters_str);
+	return res;
+}
+
+char *pp_iface_stat(struct iface_stat *is)
+{
+	char *res;
+	if (!is)
+		res = kasprintf(GFP_ATOMIC, "iface_stat@null{}");
+	else
+		res = kasprintf(GFP_ATOMIC, "iface_stat@%p{"
+				"list=list_head{...}, "
+				"ifname=%s, "
+				"total={rx={bytes=%llu, "
+				"packets=%llu}, "
+				"tx={bytes=%llu, "
+				"packets=%llu}}, "
+				"last_known_valid=%d, "
+				"last_known={rx={bytes=%llu, "
+				"packets=%llu}, "
+				"tx={bytes=%llu, "
+				"packets=%llu}}, "
+				"active=%d, "
+				"net_dev=%p, "
+				"proc_ptr=%p, "
+				"tag_stat_tree=rb_root{...}}",
+				is,
+				is->ifname,
+				is->totals[IFS_RX].bytes,
+				is->totals[IFS_RX].packets,
+				is->totals[IFS_TX].bytes,
+				is->totals[IFS_TX].packets,
+				is->last_known_valid,
+				is->last_known[IFS_RX].bytes,
+				is->last_known[IFS_RX].packets,
+				is->last_known[IFS_TX].bytes,
+				is->last_known[IFS_TX].packets,
+				is->active,
+				is->net_dev,
+				is->proc_ptr);
+	_bug_on_err_or_null(res);
+	return res;
+}
+
+char *pp_sock_tag(struct sock_tag *st)
+{
+	char *tag_str;
+	char *res;
+
+	if (!st) {
+		res = kasprintf(GFP_ATOMIC, "sock_tag@null{}");
+		_bug_on_err_or_null(res);
+		return res;
+	}
+	tag_str = pp_tag_t(&st->tag);
+	res = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
+			"sock_node=rb_node{...}, "
+			"sk=%p socket=%p (f_count=%lu), list=list_head{...}, "
+			"pid=%u, tag=%s}",
+			st, st->sk, st->socket, atomic_long_read(
+				&st->socket->file->f_count),
+			st->pid, tag_str);
+	_bug_on_err_or_null(res);
+	kfree(tag_str);
+	return res;
+}
+
+char *pp_uid_tag_data(struct uid_tag_data *utd)
+{
+	char *res;
+
+	if (!utd)
+		res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
+	else
+		res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
+				"uid=%u, num_active_acct_tags=%d, "
+				"num_pqd=%d, "
+				"tag_node_tree=rb_root{...}, "
+				"proc_qtu_data_tree=rb_root{...}}",
+				utd, utd->uid,
+				utd->num_active_tags, utd->num_pqd);
+	_bug_on_err_or_null(res);
+	return res;
+}
+
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+	char *parent_tag_data_str;
+	char *res;
+
+	if (!pqd) {
+		res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
+		_bug_on_err_or_null(res);
+		return res;
+	}
+	parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data);
+	res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
+			"node=rb_node{...}, pid=%u, "
+			"parent_tag_data=%s, "
+			"sock_tag_list=list_head{...}}",
+			pqd, pqd->pid, parent_tag_data_str
+		);
+	_bug_on_err_or_null(res);
+	kfree(parent_tag_data_str);
+	return res;
+}
+
+/*------------------------------------------*/
+void prdebug_sock_tag_tree(int indent_level,
+			   struct rb_root *sock_tag_tree)
+{
+	struct rb_node *node;
+	struct sock_tag *sock_tag_entry;
+	char *str;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (RB_EMPTY_ROOT(sock_tag_tree)) {
+		str = "sock_tag_tree=rb_root{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "sock_tag_tree=rb_root{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	for (node = rb_first(sock_tag_tree);
+	     node;
+	     node = rb_next(node)) {
+		sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+		str = pp_sock_tag(sock_tag_entry);
+		pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+		kfree(str);
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_sock_tag_list(int indent_level,
+			   struct list_head *sock_tag_list)
+{
+	struct sock_tag *sock_tag_entry;
+	char *str;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (list_empty(sock_tag_list)) {
+		str = "sock_tag_list=list_head{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "sock_tag_list=list_head{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	list_for_each_entry(sock_tag_entry, sock_tag_list, list) {
+		str = pp_sock_tag(sock_tag_entry);
+		pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+		kfree(str);
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_proc_qtu_data_tree(int indent_level,
+				struct rb_root *proc_qtu_data_tree)
+{
+	char *str;
+	struct rb_node *node;
+	struct proc_qtu_data *proc_qtu_data_entry;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (RB_EMPTY_ROOT(proc_qtu_data_tree)) {
+		str = "proc_qtu_data_tree=rb_root{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "proc_qtu_data_tree=rb_root{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	for (node = rb_first(proc_qtu_data_tree);
+	     node;
+	     node = rb_next(node)) {
+		proc_qtu_data_entry = rb_entry(node,
+					       struct proc_qtu_data,
+					       node);
+		str = pp_proc_qtu_data(proc_qtu_data_entry);
+		pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+			 str);
+		kfree(str);
+		indent_level++;
+		prdebug_sock_tag_list(indent_level,
+				      &proc_qtu_data_entry->sock_tag_list);
+		indent_level--;
+
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+	char *str;
+	struct rb_node *node;
+	struct tag_ref *tag_ref_entry;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (RB_EMPTY_ROOT(tag_ref_tree)) {
+		str = "tag_ref_tree{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "tag_ref_tree{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	for (node = rb_first(tag_ref_tree);
+	     node;
+	     node = rb_next(node)) {
+		tag_ref_entry = rb_entry(node,
+					 struct tag_ref,
+					 tn.node);
+		str = pp_tag_ref(tag_ref_entry);
+		pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+			 str);
+		kfree(str);
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_uid_tag_data_tree(int indent_level,
+			       struct rb_root *uid_tag_data_tree)
+{
+	char *str;
+	struct rb_node *node;
+	struct uid_tag_data *uid_tag_data_entry;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (RB_EMPTY_ROOT(uid_tag_data_tree)) {
+		str = "uid_tag_data_tree=rb_root{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "uid_tag_data_tree=rb_root{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	for (node = rb_first(uid_tag_data_tree);
+	     node;
+	     node = rb_next(node)) {
+		uid_tag_data_entry = rb_entry(node, struct uid_tag_data,
+					      node);
+		str = pp_uid_tag_data(uid_tag_data_entry);
+		pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+		kfree(str);
+		if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) {
+			indent_level++;
+			prdebug_tag_ref_tree(indent_level,
+					     &uid_tag_data_entry->tag_ref_tree);
+			indent_level--;
+		}
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_stat_tree(int indent_level,
+				  struct rb_root *tag_stat_tree)
+{
+	char *str;
+	struct rb_node *node;
+	struct tag_stat *ts_entry;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (RB_EMPTY_ROOT(tag_stat_tree)) {
+		str = "tag_stat_tree{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "tag_stat_tree{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	for (node = rb_first(tag_stat_tree);
+	     node;
+	     node = rb_next(node)) {
+		ts_entry = rb_entry(node, struct tag_stat, tn.node);
+		str = pp_tag_stat(ts_entry);
+		pr_debug("%*d: %s\n", indent_level*2, indent_level,
+			 str);
+		kfree(str);
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_iface_stat_list(int indent_level,
+			     struct list_head *iface_stat_list)
+{
+	char *str;
+	struct iface_stat *iface_entry;
+
+	if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+		return;
+
+	if (list_empty(iface_stat_list)) {
+		str = "iface_stat_list=list_head{}";
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		return;
+	}
+
+	str = "iface_stat_list=list_head{";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+	indent_level++;
+	list_for_each_entry(iface_entry, iface_stat_list, list) {
+		str = pp_iface_stat(iface_entry);
+		pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+		kfree(str);
+
+		spin_lock_bh(&iface_entry->tag_stat_list_lock);
+		if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) {
+			indent_level++;
+			prdebug_tag_stat_tree(indent_level,
+					      &iface_entry->tag_stat_tree);
+			indent_level--;
+		}
+		spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+	}
+	indent_level--;
+	str = "}";
+	pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+#endif  /* ifdef DDEBUG */
+/*------------------------------------------*/
+static const char * const netdev_event_strings[] = {
+	"netdev_unknown",
+	"NETDEV_UP",
+	"NETDEV_DOWN",
+	"NETDEV_REBOOT",
+	"NETDEV_CHANGE",
+	"NETDEV_REGISTER",
+	"NETDEV_UNREGISTER",
+	"NETDEV_CHANGEMTU",
+	"NETDEV_CHANGEADDR",
+	"NETDEV_GOING_DOWN",
+	"NETDEV_CHANGENAME",
+	"NETDEV_FEAT_CHANGE",
+	"NETDEV_BONDING_FAILOVER",
+	"NETDEV_PRE_UP",
+	"NETDEV_PRE_TYPE_CHANGE",
+	"NETDEV_POST_TYPE_CHANGE",
+	"NETDEV_POST_INIT",
+	"NETDEV_UNREGISTER_BATCH",
+	"NETDEV_RELEASE",
+	"NETDEV_NOTIFY_PEERS",
+	"NETDEV_JOIN",
+};
+
+const char *netdev_evt_str(int netdev_event)
+{
+	/* The unsigned cast folds negative event numbers into the range check. */
+	if ((unsigned int)netdev_event < ARRAY_SIZE(netdev_event_strings))
+		return netdev_event_strings[netdev_event];
+	return "bad event num";
+}
diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
new file mode 100644
index 00000000..b63871a0
--- /dev/null
+++ b/net/netfilter/xt_qtaguid_print.h
@@ -0,0 +1,120 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_PRINT_H__
+#define __XT_QTAGUID_PRINT_H__
+
+#include "xt_qtaguid_internal.h"
+
+#ifdef DDEBUG
+
+char *pp_tag_t(tag_t *tag);
+char *pp_data_counters(struct data_counters *dc, bool showValues);
+char *pp_tag_node(struct tag_node *tn);
+char *pp_tag_ref(struct tag_ref *tr);
+char *pp_tag_stat(struct tag_stat *ts);
+char *pp_iface_stat(struct iface_stat *is);
+char *pp_sock_tag(struct sock_tag *st);
+char *pp_uid_tag_data(struct uid_tag_data *qtd);
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
+
+/*------------------------------------------*/
+void prdebug_sock_tag_list(int indent_level,
+			   struct list_head *sock_tag_list);
+void prdebug_sock_tag_tree(int indent_level,
+			   struct rb_root *sock_tag_tree);
+void prdebug_proc_qtu_data_tree(int indent_level,
+				struct rb_root *proc_qtu_data_tree);
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
+void prdebug_uid_tag_data_tree(int indent_level,
+			       struct rb_root *uid_tag_data_tree);
+void prdebug_tag_stat_tree(int indent_level,
+			   struct rb_root *tag_stat_tree);
+void prdebug_iface_stat_list(int indent_level,
+			     struct list_head *iface_stat_list);
+
+#else
+
+/*------------------------------------------*/
+static inline char *pp_tag_t(tag_t *tag)
+{
+	return NULL;
+}
+static inline char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+	return NULL;
+}
+static inline char *pp_tag_node(struct tag_node *tn)
+{
+	return NULL;
+}
+static inline char *pp_tag_ref(struct tag_ref *tr)
+{
+	return NULL;
+}
+static inline char *pp_tag_stat(struct tag_stat *ts)
+{
+	return NULL;
+}
+static inline char *pp_iface_stat(struct iface_stat *is)
+{
+	return NULL;
+}
+static inline char *pp_sock_tag(struct sock_tag *st)
+{
+	return NULL;
+}
+static inline char *pp_uid_tag_data(struct uid_tag_data *qtd)
+{
+	return NULL;
+}
+static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+	return NULL;
+}
+
+/*------------------------------------------*/
+static inline
+void prdebug_sock_tag_list(int indent_level,
+			   struct list_head *sock_tag_list)
+{
+}
+static inline
+void prdebug_sock_tag_tree(int indent_level,
+			   struct rb_root *sock_tag_tree)
+{
+}
+static inline
+void prdebug_proc_qtu_data_tree(int indent_level,
+				struct rb_root *proc_qtu_data_tree)
+{
+}
+static inline
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+}
+static inline
+void prdebug_uid_tag_data_tree(int indent_level,
+			       struct rb_root *uid_tag_data_tree)
+{
+}
+static inline
+void prdebug_tag_stat_tree(int indent_level,
+			   struct rb_root *tag_stat_tree)
+{
+}
+static inline
+void prdebug_iface_stat_list(int indent_level,
+			     struct list_head *iface_stat_list)
+{
+}
+#endif
+/*------------------------------------------*/
+const char *netdev_evt_str(int netdev_event);
+#endif  /* ifndef __XT_QTAGUID_PRINT_H__ */
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
new file mode 100644
index 00000000..70eb2b49
--- /dev/null
+++ b/net/netfilter/xt_quota.c
@@ -0,0 +1,89 @@
+/*
+ * netfilter module to enforce network quotas
+ *
+ * Sam Johnston <samj@samj.net>
+ */
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota.h>
+
+struct xt_quota_priv {
+	spinlock_t	lock;
+	uint64_t	quota;
+};
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_DESCRIPTION("Xtables: countdown quota match");
+MODULE_ALIAS("ipt_quota");
+MODULE_ALIAS("ip6t_quota");
+
+/*
+ * Match while the byte quota covers skb->len (XT_QUOTA_INVERT flips it).
+ */
+static bool
+quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct xt_quota_info *info = (void *)par->matchinfo;
+	struct xt_quota_priv *state = info->master;
+	const bool inverted = info->flags & XT_QUOTA_INVERT;
+	bool within;
+
+	spin_lock_bh(&state->lock);
+	within = state->quota >= skb->len;
+	/* An overshoot zeroes the quota so later small packets fail too. */
+	state->quota = within ? state->quota - skb->len : 0;
+	spin_unlock_bh(&state->lock);
+
+	return within ? !inverted : inverted;
+}
+
+static int quota_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_quota_info *q = par->matchinfo;
+
+	if (q->flags & ~XT_QUOTA_MASK)
+		return -EINVAL;
+
+	q->master = kmalloc(sizeof(*q->master), GFP_KERNEL);
+	if (q->master == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&q->master->lock);
+	q->master->quota = q->quota;
+	return 0;
+}
+
+static void quota_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_quota_info *q = par->matchinfo;
+
+	kfree(q->master);
+}
+
+static struct xt_match quota_mt_reg __read_mostly = {
+	.name       = "quota",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = quota_mt,
+	.checkentry = quota_mt_check,
+	.destroy    = quota_mt_destroy,
+	.matchsize  = sizeof(struct xt_quota_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init quota_mt_init(void)
+{
+	return xt_register_match(&quota_mt_reg);
+}
+
+static void __exit quota_mt_exit(void)
+{
+	xt_unregister_match(&quota_mt_reg);
+}
+
+module_init(quota_mt_init);
+module_exit(quota_mt_exit);
diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644
index 00000000..3c72bea2
--- /dev/null
+++ b/net/netfilter/xt_quota2.c
@@ -0,0 +1,381 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ * 	netfilter module to enforce network quotas
+ * 	Sam Johnston <samj@samj.net>
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License; either
+ *	version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#endif
+
+/* One quota value. @lock protects quota writers from each other.
+ * Anonymous counters are allocated only up to 'list' (see q2_new_counter()),
+ * so 'list' and every member after it exist for named counters only. */
+struct xt_quota_counter {
+	u_int64_t quota;		/* current counter value */
+	spinlock_t lock;
+	struct list_head list;		/* link in counter_list */
+	atomic_t ref;			/* rules sharing this named counter */
+	char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+	struct proc_dir_entry *procfs_entry;
+};
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C; unsigned to match "uint". */
+static unsigned int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+		 "Event number for NETLINK_NFLOG message. 0 disables log. "
+		 "111 is what ipt_ULOG uses.");
+static struct sock *nflognl;
+#endif
+
+static LIST_HEAD(counter_list);
+static DEFINE_SPINLOCK(counter_list_lock);
+
+static struct proc_dir_entry *proc_xt_quota;
+static unsigned int quota_list_perms = S_IRUGO | S_IWUSR;
+static unsigned int quota_list_uid   = 0;
+static unsigned int quota_list_gid   = 0;
+module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR);
+module_param_named(uid, quota_list_uid, uint, S_IRUGO | S_IWUSR);
+module_param_named(gid, quota_list_gid, uint, S_IRUGO | S_IWUSR);
+
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Broadcast a header-only ULOG-style message (data_len = 0) to NETLINK_NFLOG
+ * group 1 to report a quota running out; does nothing when event_num is 0. */
+static void quota2_log(unsigned int hooknum, const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out, const char *prefix)
+{
+	ulog_packet_msg_t *pm;
+	struct sk_buff *log_skb;
+	size_t size;
+	struct nlmsghdr *nlh;
+
+	if (!qlog_nl_event)
+		return;
+
+	size = max((size_t)NLMSG_SPACE(sizeof(*pm)), (size_t)NLMSG_GOODSIZE);
+	log_skb = alloc_skb(size, GFP_ATOMIC);
+	if (!log_skb) {
+		pr_err("xt_quota2: cannot alloc skb for logging\n");
+		return;
+	}
+
+	/* NLMSG_PUT() uses "goto nlmsg_failure" */
+	nlh = NLMSG_PUT(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event,
+			sizeof(*pm));
+	pm = NLMSG_DATA(nlh);
+	if (skb->tstamp.tv64 == 0)
+		__net_timestamp((struct sk_buff *)skb);
+	pm->data_len = 0;
+	pm->hook = hooknum;
+	if (prefix != NULL)
+		strlcpy(pm->prefix, prefix, sizeof(pm->prefix));
+	else
+		*(pm->prefix) = '\0';
+	if (in)
+		strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+	else
+		pm->indev_name[0] = '\0';
+	if (out)
+		strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+	else
+		pm->outdev_name[0] = '\0';
+
+	NETLINK_CB(log_skb).dst_group = 1;
+	pr_debug("throwing 1 packets to netlink group 1\n");
+	netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC);
+	return;		/* netlink_broadcast() consumed log_skb */
+
+nlmsg_failure:  /* Used within NLMSG_PUT() */
+	pr_debug("xt_quota2: error during NLMSG_PUT\n");
+	kfree_skb(log_skb);	/* nothing was queued, so drop our skb */
+}
+#else
+static void quota2_log(unsigned int hooknum,
+		       const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const char *prefix)
+{
+}
+#endif  /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */
+
+static int quota_proc_read(char *page, char **start, off_t offset,
+                           int count, int *eof, void *data)
+{
+	struct xt_quota_counter *e = data;
+	int ret;
+
+	spin_lock_bh(&e->lock);
+	ret = snprintf(page, PAGE_SIZE, "%llu\n", e->quota);
+	spin_unlock_bh(&e->lock);
+	return ret;
+}
+
+/* procfs write handler: parse the written text into e->quota (base auto). */
+static int quota_proc_write(struct file *file, const char __user *input,
+                            unsigned long size, void *data)
+{
+	struct xt_quota_counter *e = data;
+	char buf[sizeof("18446744073709551616")];
+
+	size = size > sizeof(buf) ? sizeof(buf) : size;
+	if (copy_from_user(buf, input, size) != 0)
+		return -EFAULT;
+	/* Terminate right after the copied bytes; the tail of buf is stale. */
+	buf[size < sizeof(buf) ? size : sizeof(buf) - 1] = '\0';
+	spin_lock_bh(&e->lock);
+	e->quota = simple_strtoull(buf, NULL, 0);
+	spin_unlock_bh(&e->lock);
+	return size;
+}
+
+static struct xt_quota_counter *
+q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon)
+{
+	struct xt_quota_counter *e;
+	unsigned int size;
+
+	/* Do not need all the procfs things for anonymous counters. */
+	size = anon ? offsetof(typeof(*e), list) : sizeof(*e);
+	e = kmalloc(size, GFP_KERNEL);
+	if (e == NULL)
+		return NULL;
+
+	e->quota = q->quota;
+	spin_lock_init(&e->lock);
+	if (!anon) {
+		INIT_LIST_HEAD(&e->list);
+		atomic_set(&e->ref, 1);
+		strlcpy(e->name, q->name, sizeof(e->name));
+	}
+	return e;
+}
+
+/**
+ * q2_get_counter - get ref to counter or create new
+ * @q:	match info; q->name selects the counter ("" means anonymous)
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+	struct proc_dir_entry *p;
+	struct xt_quota_counter *e = NULL;
+	struct xt_quota_counter *new_e;
+
+	if (*q->name == '\0')
+		return q2_new_counter(q, true);
+
+	/* No need to hold a lock while getting a new counter */
+	new_e = q2_new_counter(q, false);
+	if (new_e == NULL)
+		goto out;
+
+	spin_lock_bh(&counter_list_lock);
+	list_for_each_entry(e, &counter_list, list)
+		if (strcmp(e->name, q->name) == 0) {
+			atomic_inc(&e->ref);
+			spin_unlock_bh(&counter_list_lock);
+			kfree(new_e);
+			pr_debug("xt_quota2: old counter name=%s", e->name);
+			return e;
+		}
+	e = new_e;
+	pr_debug("xt_quota2: new_counter name=%s", e->name);
+	list_add_tail(&e->list, &counter_list);
+	/* The entry having a refcount of 1 is not directly destructible.
+	 * This func has not yet returned the new entry, thus iptables
+	 * has no references for destroying this entry.
+	 * For another rule to try to destroy it, this func would first need
+	 * to be re-invoked and acquire a new ref for the same named quota.
+	 * Nobody will access the e->procfs_entry either.
+	 * So release the lock. */
+	spin_unlock_bh(&counter_list_lock);
+
+	/* create_proc_entry() is not spin_lock happy */
+	p = e->procfs_entry = create_proc_entry(e->name, quota_list_perms,
+	                      proc_xt_quota);
+	/* NOTE(review): e is listed by now; a racing lookup may hold a ref. */
+	if (IS_ERR_OR_NULL(p)) {
+		spin_lock_bh(&counter_list_lock);
+		list_del(&e->list);
+		spin_unlock_bh(&counter_list_lock);
+		goto out;
+	}
+	p->data         = e;
+	p->read_proc    = quota_proc_read;
+	p->write_proc   = quota_proc_write;
+	p->uid          = quota_list_uid;
+	p->gid          = quota_list_gid;
+	return e;
+
+ out:
+	kfree(e);
+	return NULL;
+}
+
+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;
+
+	pr_debug("xt_quota2: check() flags=0x%04x", q->flags);
+
+	if (q->flags & ~XT_QUOTA_MASK)
+		return -EINVAL;
+
+	q->name[sizeof(q->name)-1] = '\0';
+	if (*q->name == '.' || strchr(q->name, '/') != NULL) {
+		printk(KERN_ERR "xt_quota.3: illegal name\n");
+		return -EINVAL;
+	}
+
+	q->master = q2_get_counter(q);
+	if (q->master == NULL) {
+		printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+
+	if (*q->name == '\0') {
+		kfree(e);	/* anonymous counter: never listed or shared */
+		return;
+	}
+	/* Named counter: drop our ref; only the last user tears it down. */
+	spin_lock_bh(&counter_list_lock);
+	if (!atomic_dec_and_test(&e->ref)) {
+		spin_unlock_bh(&counter_list_lock);
+		return;
+	}
+	/* NOTE(review): can remove_proc_entry() sleep under this lock? */
+	list_del(&e->list);
+	remove_proc_entry(e->name, proc_xt_quota);
+	spin_unlock_bh(&counter_list_lock);
+	kfree(e);
+}
+
+/*
+ * Match callback. Each packet has a charge: 1 in packet mode, else skb->len.
+ * Grow mode just accumulates charges; otherwise the packet matches while the
+ * counter covers its charge, and the first shortfall zeroes the counter.
+ */
+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+	u_int64_t charge = (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+	bool ret = q->flags & XT_QUOTA_INVERT;
+
+	spin_lock_bh(&e->lock);
+	if (q->flags & XT_QUOTA_GROW) {
+		/*
+		 * While no_change is pointless in "grow" mode, we will
+		 * implement it here simply to have a consistent behavior.
+		 */
+		if (!(q->flags & XT_QUOTA_NO_CHANGE))
+			e->quota += charge;
+		ret = true;
+	} else if (e->quota >= charge) {
+		/* Compare against the charge, not skb->len, for packet mode. */
+		if (!(q->flags & XT_QUOTA_NO_CHANGE))
+			e->quota -= charge;
+		ret = !ret;
+	} else {
+		/* Transition with a partial remainder left over: log it. */
+		if (e->quota)
+			quota2_log(par->hooknum, skb, par->in, par->out,
+				   q->name);
+		/* we do not allow even small packets from now on */
+		e->quota = 0;
+	}
+	spin_unlock_bh(&e->lock);
+	return ret;
+}
+
+static struct xt_match quota_mt2_reg[] __read_mostly = {
+	{
+		.name       = "quota2",
+		.revision   = 3,
+		.family     = NFPROTO_IPV4,
+		.checkentry = quota_mt2_check,
+		.match      = quota_mt2,
+		.destroy    = quota_mt2_destroy,
+		.matchsize  = sizeof(struct xt_quota_mtinfo2),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "quota2",
+		.revision   = 3,
+		.family     = NFPROTO_IPV6,
+		.checkentry = quota_mt2_check,
+		.match      = quota_mt2,
+		.destroy    = quota_mt2_destroy,
+		.matchsize  = sizeof(struct xt_quota_mtinfo2),
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init quota_mt2_init(void)
+{
+	int ret;
+	pr_debug("xt_quota2: init()");
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+	nflognl = netlink_kernel_create(&init_net,
+					NETLINK_NFLOG, 1, NULL,
+					NULL, THIS_MODULE);
+	if (!nflognl)
+		return -ENOMEM;
+#endif
+	/* NOTE(review): nflognl is never released (error paths, exit). */
+	proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net);
+	if (proc_xt_quota == NULL)
+		return -EACCES;
+
+	ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+	if (ret < 0)
+		remove_proc_entry("xt_quota", init_net.proc_net);
+	pr_debug("xt_quota2: init() %d", ret);
+	return ret;
+}
+
+static void __exit quota_mt2_exit(void)
+{
+	xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+	remove_proc_entry("xt_quota", init_net.proc_net);
+}
+
+module_init(quota_mt2_init);
+module_exit(quota_mt2_exit);
+MODULE_DESCRIPTION("Xtables: countdown quota match; up counter");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_quota2");
+MODULE_ALIAS("ip6t_quota2");
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
new file mode 100644
index 00000000..76a08318
--- /dev/null
+++ b/net/netfilter/xt_rateest.c
@@ -0,0 +1,158 @@
+/*
+ * (C) 2007 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/gen_stats.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_rateest.h>
+#include <net/netfilter/xt_rateest.h>
+
+
+/* Compare estimator 1 with estimator 2 or fixed values, as info->mode says. */
+static bool
+xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_rateest_match_info *info = par->matchinfo;
+	struct gnet_stats_rate_est *rs;
+	u_int32_t bps1, bps2, pps1, pps2;
+	bool matched = true;
+
+	spin_lock_bh(&info->est1->lock);
+	rs = &info->est1->rstats;
+	if (!(info->flags & XT_RATEEST_MATCH_DELTA)) {
+		bps1 = rs->bps;
+		pps1 = rs->pps;
+	} else {
+		bps1 = info->bps1 < rs->bps ? 0 : info->bps1 - rs->bps;
+		pps1 = info->pps1 < rs->pps ? 0 : info->pps1 - rs->pps;
+	}
+	spin_unlock_bh(&info->est1->lock);
+
+	if (!(info->flags & XT_RATEEST_MATCH_ABS)) {
+		spin_lock_bh(&info->est2->lock);
+		rs = &info->est2->rstats;
+		if (!(info->flags & XT_RATEEST_MATCH_DELTA)) {
+			bps2 = rs->bps;
+			pps2 = rs->pps;
+		} else {
+			bps2 = info->bps2 < rs->bps ? 0 : info->bps2 - rs->bps;
+			pps2 = info->pps2 < rs->pps ? 0 : info->pps2 - rs->pps;
+		}
+		spin_unlock_bh(&info->est2->lock);
+	} else {
+		bps2 = info->bps2;
+		pps2 = info->pps2;
+	}
+
+	switch (info->mode) {
+	case XT_RATEEST_MATCH_LT:
+		if ((info->flags & XT_RATEEST_MATCH_BPS) && bps1 >= bps2)
+			matched = false;
+		if ((info->flags & XT_RATEEST_MATCH_PPS) && pps1 >= pps2)
+			matched = false;
+		break;
+	case XT_RATEEST_MATCH_GT:
+		if ((info->flags & XT_RATEEST_MATCH_BPS) && bps1 <= bps2)
+			matched = false;
+		if ((info->flags & XT_RATEEST_MATCH_PPS) && pps1 <= pps2)
+			matched = false;
+		break;
+	case XT_RATEEST_MATCH_EQ:
+		if ((info->flags & XT_RATEEST_MATCH_BPS) && bps1 != bps2)
+			matched = false;
+		if ((info->flags & XT_RATEEST_MATCH_PPS) && pps1 != pps2)
+			matched = false;
+		break;
+	}
+
+	return matched ^ !!(info->flags & XT_RATEEST_MATCH_INVERT);
+}
+
+/* Validate a rateest rule and look up the estimator(s) it names. */
+static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
+{
+	struct xt_rateest_match_info *info = par->matchinfo;
+	struct xt_rateest *est1, *est2 = NULL;
+	int ret = -EINVAL;
+
+	if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS |
+				     XT_RATEEST_MATCH_REL)) != 1)
+		goto err1;
+
+	if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS)))
+		goto err1;
+
+	switch (info->mode) {
+	case XT_RATEEST_MATCH_EQ:
+	case XT_RATEEST_MATCH_LT:
+	case XT_RATEEST_MATCH_GT:
+		break;
+	default:
+		goto err1;
+	}
+
+	/* From here on the only possible failure is a missing estimator. */
+	ret  = -ENOENT;
+	est1 = xt_rateest_lookup(info->name1);
+	if (!est1)
+		goto err1;
+
+	if (info->flags & XT_RATEEST_MATCH_REL) {
+		est2 = xt_rateest_lookup(info->name2);
+		if (!est2)
+			goto err2;
+	}
+
+	info->est1 = est1;
+	info->est2 = est2;
+	return 0;
+
+err2:
+	xt_rateest_put(est1);
+err1:
+	return ret;
+}
+
+static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	struct xt_rateest_match_info *info = par->matchinfo;
+
+	xt_rateest_put(info->est1);	/* always set by checkentry */
+	if (info->est2)			/* NULL unless the rule is relative */
+		xt_rateest_put(info->est2);
+}
+
+static struct xt_match xt_rateest_mt_reg __read_mostly = {
+	.name       = "rateest",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,	/* single registration, no fixed family */
+	.match      = xt_rateest_mt,
+	.checkentry = xt_rateest_mt_checkentry,
+	.destroy    = xt_rateest_mt_destroy,
+	.matchsize  = sizeof(struct xt_rateest_match_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init xt_rateest_mt_init(void)
+{	/* module load: register the "rateest" match */
+	return xt_register_match(&xt_rateest_mt_reg);
+}
+
+static void __exit xt_rateest_mt_fini(void)
+{	/* module unload: undo xt_rateest_mt_init() */
+	xt_unregister_match(&xt_rateest_mt_reg);
+}
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("xtables rate estimator match");
+MODULE_ALIAS("ipt_rateest");
+MODULE_ALIAS("ip6t_rateest");
+module_init(xt_rateest_mt_init);
+module_exit(xt_rateest_mt_fini);
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
new file mode 100644
index 00000000..459a7b25
--- /dev/null
+++ b/net/netfilter/xt_realm.c
@@ -0,0 +1,54 @@
+/* IP tables module for matching the routing realm
+ *
+ * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/xt_realm.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Xtables: Routing realm match");
+MODULE_ALIAS("ipt_realm");
+
+/* True when the route's tclassid, masked, equals info->id; invert flips it. */
+static bool
+realm_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_realm_info *ri = par->matchinfo;
+
+	return (ri->id == (skb_dst(skb)->tclassid & ri->mask)) ^ ri->invert;
+}
+
+static struct xt_match realm_mt_reg __read_mostly = {
+	.name		= "realm",
+	.match		= realm_mt,	/* dereferences skb_dst() unchecked */
+	.matchsize	= sizeof(struct xt_realm_info),
+	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN),
+	.family		= NFPROTO_UNSPEC,
+	.me		= THIS_MODULE	/* NOTE(review): no checkentry/destroy */
+};
+
+static int __init realm_mt_init(void)
+{	/* module load: register the "realm" match */
+	return xt_register_match(&realm_mt_reg);
+}
+
+static void __exit realm_mt_exit(void)
+{	/* module unload: undo realm_mt_init() */
+	xt_unregister_match(&realm_mt_reg);
+}
+
+module_init(realm_mt_init);
+module_exit(realm_mt_exit);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
new file mode 100644
index 00000000..d2ff15a2
--- /dev/null
+++ b/net/netfilter/xt_recent.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ * Copyright © CC Computer Consultants GmbH, 2007 - 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This is a replacement of the old ipt_recent module, which carried the
+ * following copyright notice:
+ *
+ * Author: Stephen Frost <sfrost@snowman.net>
+ * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/bitops.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_recent.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_recent");
+MODULE_ALIAS("ip6t_recent");
+
+static unsigned int ip_list_tot = 100;
+static unsigned int ip_pkt_list_tot = 20;
+static unsigned int ip_list_hash_size = 0;	/* overwritten by recent_mt_init() */
+static unsigned int ip_list_perms = 0644;
+static unsigned int ip_list_uid = 0;
+static unsigned int ip_list_gid = 0;
+module_param(ip_list_tot, uint, 0400);
+module_param(ip_pkt_list_tot, uint, 0400);
+module_param(ip_list_hash_size, uint, 0400);
+module_param(ip_list_perms, uint, 0400);
+module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR);	/* only these two are */
+module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR);	/* registered writable */
+MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)");
+MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files");
+MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files");
+MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files");
+
+struct recent_entry {
+	struct list_head	list;		/* link in recent_table.iphash[] */
+	struct list_head	lru_list;	/* link in recent_table.lru_list */
+	union nf_inet_addr	addr;
+	u_int16_t		family;
+	u_int8_t		ttl;
+	u_int8_t		index;		/* one past the newest stamps[] slot */
+	u_int16_t		nstamps;	/* number of stamps[] slots in use */
+	unsigned long		stamps[0];	/* jiffies; ip_pkt_list_tot slots */
+};
+
+struct recent_table {
+	struct list_head	list;		/* link in recent_net.tables */
+	char			name[XT_RECENT_NAME_LEN];
+	unsigned int		refcnt;		/* rules sharing this table */
+	unsigned int		entries;	/* entries currently linked */
+	struct list_head	lru_list;	/* oldest entry at the head */
+	struct list_head	iphash[0];	/* ip_list_hash_size buckets */
+};
+
+struct recent_net {
+	struct list_head	tables;		/* recent_table list of this netns */
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry	*xt_recent;	/* the "xt_recent" proc directory */
+#endif
+};
+
+static int recent_net_id;	/* handed to the pernet core via recent_net_ops.id */
+static inline struct recent_net *recent_pernet(struct net *net)
+{
+	return net_generic(net, recent_net_id);	/* this module's per-netns data */
+}
+
+static DEFINE_SPINLOCK(recent_lock);	/* taken (BH-safe) around entry/table list use */
+static DEFINE_MUTEX(recent_mutex);	/* serialises recent_mt_check()/destroy() */
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations recent_old_fops, recent_mt_fops;
+#endif
+
+static u_int32_t hash_rnd __read_mostly;	/* jhash seed */
+static bool hash_rnd_inited __read_mostly;	/* set on first recent_mt_check() */
+
+static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr)
+{
+	return jhash_1word((__force u32)addr->ip, hash_rnd) &
+	       (ip_list_hash_size - 1);	/* size is a power of two, so this masks */
+}
+
+static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr)
+{
+	return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) &
+	       (ip_list_hash_size - 1);	/* size is a power of two, so this masks */
+}
+
+/* Look up addrp in table; a TTL of 0 on either side acts as a wildcard. */
+static struct recent_entry *
+recent_entry_lookup(const struct recent_table *table,
+		    const union nf_inet_addr *addrp, u_int16_t family,
+		    u_int8_t ttl)
+{
+	const unsigned int bucket = family == NFPROTO_IPV4 ?
+		recent_entry_hash4(addrp) : recent_entry_hash6(addrp);
+	struct recent_entry *pos;
+
+	list_for_each_entry(pos, &table->iphash[bucket], list) {
+		if (pos->family != family)
+			continue;
+		if (memcmp(&pos->addr, addrp, sizeof(pos->addr)) != 0)
+			continue;
+		if (ttl == 0 || pos->ttl == 0 || ttl == pos->ttl)
+			return pos;
+	}
+	return NULL;
+}
+
+static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
+{
+	list_del(&e->lru_list);
+	list_del(&e->list);
+	t->entries--;
+	kfree(e);	/* e is gone: callers must not touch it afterwards */
+}
+
+/*
+ * Drop entries with timestamps older than 'time'.
+ */
+static void recent_entry_reap(struct recent_table *t, unsigned long time)
+{
+	struct recent_entry *e;
+
+	/*
+	 * The head of the LRU list is the oldest entry (t is assumed non-empty).
+	 */
+	e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
+
+	/*
+	 * stamps[index - 1] is the slot written last, i.e. the newest stamp.
+	 */
+	if (time_after(time, e->stamps[e->index-1]))
+		recent_entry_remove(t, e);	/* frees e; at most one per call */
+}
+
+/* Create and link an entry for addr, evicting the LRU head of a full table. */
+static struct recent_entry *
+recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr,
+		  u_int16_t family, u_int8_t ttl)
+{
+	struct recent_entry *ent;
+	unsigned int bucket;
+
+	if (t->entries >= ip_list_tot)
+		recent_entry_remove(t, list_entry(t->lru_list.next,
+					struct recent_entry, lru_list));
+	ent = kmalloc(sizeof(*ent) + sizeof(ent->stamps[0]) * ip_pkt_list_tot,
+		      GFP_ATOMIC);
+	if (ent == NULL)
+		return NULL;
+	ent->family    = family;
+	ent->ttl       = ttl;
+	ent->index     = 1;
+	ent->nstamps   = 1;
+	ent->stamps[0] = jiffies;
+	memcpy(&ent->addr, addr, sizeof(ent->addr));
+	bucket = family == NFPROTO_IPV4 ? recent_entry_hash4(addr) :
+					  recent_entry_hash6(addr);
+	list_add_tail(&ent->list, &t->iphash[bucket]);
+	list_add_tail(&ent->lru_list, &t->lru_list);
+	t->entries++;
+	return ent;
+}
+
+static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
+{
+	e->index %= ip_pkt_list_tot;		/* wrap: stamps[] is used as a ring */
+	e->stamps[e->index++] = jiffies;
+	if (e->index > e->nstamps)		/* ring not yet full: one more slot */
+		e->nstamps = e->index;
+	list_move_tail(&e->lru_list, &t->lru_list);	/* now most recently used */
+}
+
+static struct recent_table *recent_table_lookup(struct recent_net *recent_net,
+						const char *name)
+{
+	struct recent_table *t;
+
+	list_for_each_entry(t, &recent_net->tables, list)
+		if (!strcmp(t->name, name))	/* exact, case-sensitive name */
+			return t;
+	return NULL;				/* no table of that name */
+}
+
+static void recent_table_flush(struct recent_table *t)
+{
+	struct recent_entry *e, *next;
+	unsigned int i;
+
+	for (i = 0; i < ip_list_hash_size; i++)	/* every bucket of the table */
+		list_for_each_entry_safe(e, next, &t->iphash[i], list)
+			recent_entry_remove(t, e);	/* also unlinks from LRU */
+}
+
+/* Match callback: look the packet's address up and act on info->check_set. */
+static bool
+recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct net *net = dev_net(par->in ? par->in : par->out);
+	struct recent_net *recent_net = recent_pernet(net);
+	const struct xt_recent_mtinfo *info = par->matchinfo;
+	struct recent_table *t;
+	struct recent_entry *e;
+	union nf_inet_addr addr = {};
+	u_int8_t ttl;
+	bool ret = info->invert;
+
+	if (par->family == NFPROTO_IPV4) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		if (info->side == XT_RECENT_DEST)
+			addr.ip = iph->daddr;
+		else
+			addr.ip = iph->saddr;
+		ttl = iph->ttl;
+	} else {
+		const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		if (info->side == XT_RECENT_DEST)
+			memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6));
+		else
+			memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6));
+		ttl = iph->hop_limit;
+	}
+
+	/* use TTL as seen before forwarding */
+	if (par->out != NULL && skb->sk == NULL)
+		ttl++;
+
+	spin_lock_bh(&recent_lock);
+	t = recent_table_lookup(recent_net, info->name);
+	e = recent_entry_lookup(t, &addr, par->family,
+				(info->check_set & XT_RECENT_TTL) ? ttl : 0);
+	if (e == NULL) {
+		if (!(info->check_set & XT_RECENT_SET))
+			goto out;
+		e = recent_entry_init(t, &addr, par->family, ttl);
+		if (e == NULL)
+			par->hotdrop = true;
+		ret = !ret;
+		goto out;
+	}
+
+	if (info->check_set & XT_RECENT_SET)
+		ret = !ret;
+	else if (info->check_set & XT_RECENT_REMOVE) {
+		recent_entry_remove(t, e);
+		ret = !ret;
+	} else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) {
+		unsigned long time = jiffies - info->seconds * HZ;
+		unsigned int i, hits = 0;
+
+		for (i = 0; i < e->nstamps; i++) {
+			if (info->seconds && time_after(time, e->stamps[i]))
+				continue;
+			if (!info->hit_count || ++hits >= info->hit_count) {
+				ret = !ret;
+				break;
+			}
+		}
+	}
+
+	if (info->check_set & XT_RECENT_SET ||
+	    (info->check_set & XT_RECENT_UPDATE && ret)) {
+		recent_entry_update(t, e);
+		e->ttl = ttl;
+	}
+
+	/* Reap last: the reaper may free e, so e must not be used after it. */
+	if ((info->check_set & XT_RECENT_REAP) &&
+	    (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)))
+		recent_entry_reap(t, jiffies - info->seconds * HZ);
+out:
+	spin_unlock_bh(&recent_lock);
+	return ret;
+}
+
+static int recent_mt_check(const struct xt_mtchk_param *par)
+{	/* validate the rule, then find or create the table it names */
+	struct recent_net *recent_net = recent_pernet(par->net);
+	const struct xt_recent_mtinfo *info = par->matchinfo;
+	struct recent_table *t;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;
+#endif
+	unsigned i;
+	int ret = -EINVAL;
+
+	if (unlikely(!hash_rnd_inited)) {	/* seed the hash on first use */
+		get_random_bytes(&hash_rnd, sizeof(hash_rnd));
+		hash_rnd_inited = true;
+	}
+	if (info->check_set & ~XT_RECENT_VALID_FLAGS) {
+		pr_info("Unsupported user space flags (%08x)\n",
+			info->check_set);
+		return -EINVAL;
+	}
+	if (hweight8(info->check_set &	/* exactly one action per rule */
+		     (XT_RECENT_SET | XT_RECENT_REMOVE |
+		      XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1)
+		return -EINVAL;
+	if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) &&
+	    (info->seconds || info->hit_count ||
+	    (info->check_set & XT_RECENT_MODIFIERS)))
+		return -EINVAL;	/* set/remove take no seconds/hitcount/modifiers */
+	if ((info->check_set & XT_RECENT_REAP) && !info->seconds)
+		return -EINVAL;	/* reaping needs a time window */
+	if (info->hit_count > ip_pkt_list_tot) {
+		pr_info("hitcount (%u) is larger than "
+			"packets to be remembered (%u)\n",
+			info->hit_count, ip_pkt_list_tot);
+		return -EINVAL;
+	}
+	if (info->name[0] == '\0' ||
+	    strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
+		return -EINVAL;	/* empty or unterminated table name */
+
+	mutex_lock(&recent_mutex);
+	t = recent_table_lookup(recent_net, info->name);
+	if (t != NULL) {	/* table exists already: share it */
+		t->refcnt++;
+		ret = 0;
+		goto out;
+	}
+
+	t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
+		    GFP_KERNEL);
+	if (t == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	t->refcnt = 1;
+	strcpy(t->name, info->name);	/* fits: length was checked above */
+	INIT_LIST_HEAD(&t->lru_list);
+	for (i = 0; i < ip_list_hash_size; i++)
+		INIT_LIST_HEAD(&t->iphash[i]);
+#ifdef CONFIG_PROC_FS
+	pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
+		  &recent_mt_fops, t);
+	if (pde == NULL) {
+		kfree(t);	/* not yet on any list, so a plain free is enough */
+		ret = -ENOMEM;
+		goto out;
+	}
+	pde->uid = ip_list_uid;
+	pde->gid = ip_list_gid;
+#endif
+	spin_lock_bh(&recent_lock);	/* publish the table to recent_mt() */
+	list_add_tail(&t->list, &recent_net->tables);
+	spin_unlock_bh(&recent_lock);
+	ret = 0;
+out:
+	mutex_unlock(&recent_mutex);
+	return ret;
+}
+
+static void recent_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* drop one rule's reference; the last one tears the table down */
+	struct recent_net *recent_net = recent_pernet(par->net);
+	const struct xt_recent_mtinfo *info = par->matchinfo;
+	struct recent_table *t;
+
+	mutex_lock(&recent_mutex);
+	t = recent_table_lookup(recent_net, info->name);	/* not NULL-checked */
+	if (--t->refcnt == 0) {
+		spin_lock_bh(&recent_lock);
+		list_del(&t->list);	/* recent_mt() can no longer find it */
+		spin_unlock_bh(&recent_lock);
+#ifdef CONFIG_PROC_FS
+		remove_proc_entry(t->name, recent_net->xt_recent);
+#endif
+		recent_table_flush(t);	/* free all entries, then the table */
+		kfree(t);
+	}
+	mutex_unlock(&recent_mutex);
+}
+
+#ifdef CONFIG_PROC_FS
+struct recent_iter_state {
+	const struct recent_table *table;	/* set in recent_seq_open() */
+	unsigned int		bucket;		/* iphash[] bucket being walked */
+};
+
+static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(recent_lock)
+{
+	struct recent_iter_state *st = seq->private;
+	const struct recent_table *t = st->table;
+	struct recent_entry *e;
+	loff_t p = *pos;
+
+	spin_lock_bh(&recent_lock);	/* released again in recent_seq_stop() */
+
+	for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++)
+		list_for_each_entry(e, &t->iphash[st->bucket], list)
+			if (p-- == 0)	/* skip *pos entries, return the next */
+				return e;
+	return NULL;			/* fewer than *pos + 1 entries */
+}
+
+static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct recent_iter_state *st = seq->private;
+	const struct recent_table *t = st->table;
+	const struct recent_entry *e = v;
+	const struct list_head *head = e->list.next;
+
+	while (head == &t->iphash[st->bucket]) {	/* end of this bucket */
+		if (++st->bucket >= ip_list_hash_size)
+			return NULL;			/* that was the last one */
+		head = t->iphash[st->bucket].next;
+	}
+	(*pos)++;	/* not advanced on the NULL return above */
+	return list_entry(head, struct recent_entry, list);
+}
+
+static void recent_seq_stop(struct seq_file *s, void *v)
+	__releases(recent_lock)
+{
+	spin_unlock_bh(&recent_lock);	/* pairs with recent_seq_start() */
+}
+
+static int recent_seq_show(struct seq_file *seq, void *v)
+{
+	const struct recent_entry *e = v;
+	unsigned int i;
+
+	i = (e->index - 1) % ip_pkt_list_tot;	/* slot of the newest stamp */
+	if (e->family == NFPROTO_IPV4)
+		seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u",
+			   &e->addr.ip, e->ttl, e->stamps[i], e->index);
+	else
+		seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u",
+			   &e->addr.in6, e->ttl, e->stamps[i], e->index);
+	for (i = 0; i < e->nstamps; i++)	/* then every stored stamp */
+		seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
+	seq_printf(seq, "\n");
+	return 0;
+}
+
+static const struct seq_operations recent_seq_ops = {
+	.start		= recent_seq_start,	/* takes recent_lock */
+	.next		= recent_seq_next,
+	.stop		= recent_seq_stop,	/* drops recent_lock */
+	.show		= recent_seq_show,
+};
+
+static int recent_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct recent_iter_state *st;
+
+	st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
+	if (st == NULL)
+		return -ENOMEM;
+
+	st->table    = pde->data;	/* table passed to proc_create_data() */
+	return 0;
+}
+
+static ssize_t
+recent_mt_proc_write(struct file *file, const char __user *input,
+		     size_t size, loff_t *loff)
+{	/* "+addr" adds or refreshes, "-addr" removes, "/" flushes the table */
+	const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct recent_table *t = pde->data;
+	struct recent_entry *e;
+	char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
+	const char *c = buf;
+	union nf_inet_addr addr = {};
+	u_int16_t family;
+	bool add, succ;
+
+	if (size == 0)
+		return 0;
+	if (size > sizeof(buf))
+		size = sizeof(buf);	/* longer input is silently truncated */
+	if (copy_from_user(buf, input, size) != 0)
+		return -EFAULT;
+
+	/* Strict protocol! */
+	if (*loff != 0)
+		return -ESPIPE;
+	switch (*c) {
+	case '/': /* flush table */
+		spin_lock_bh(&recent_lock);
+		recent_table_flush(t);
+		spin_unlock_bh(&recent_lock);
+		return size;
+	case '-': /* remove address */
+		add = false;
+		break;
+	case '+': /* add address */
+		add = true;
+		break;
+	default:
+		pr_info("Need \"+ip\", \"-ip\" or \"/\"\n");
+		return -EINVAL;
+	}
+
+	++c;		/* step over the command character; buf is not */
+	--size;		/* NUL-terminated, so only length-bounded calls follow */
+	if (strnchr(c, size, ':') != NULL) {
+		family = NFPROTO_IPV6;
+		succ   = in6_pton(c, size, (void *)&addr, '\n', NULL);
+	} else {
+		family = NFPROTO_IPV4;
+		succ   = in4_pton(c, size, (void *)&addr, '\n', NULL);
+	}
+
+	if (!succ) {
+		pr_info("illegal address written to procfs\n");
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&recent_lock);
+	e = recent_entry_lookup(t, &addr, family, 0);	/* ttl 0: any TTL */
+	if (e == NULL) {
+		if (add)	/* NOTE(review): allocation failure is ignored */
+			recent_entry_init(t, &addr, family, 0);
+	} else {
+		if (add)
+			recent_entry_update(t, e);
+		else
+			recent_entry_remove(t, e);
+	}
+	spin_unlock_bh(&recent_lock);
+	/* Note we removed one above */
+	*loff += size + 1;
+	return size + 1;
+}
+
+static const struct file_operations recent_mt_fops = {
+	.open    = recent_seq_open,
+	.read    = seq_read,
+	.write   = recent_mt_proc_write,	/* "+ip", "-ip" or "/" commands */
+	.release = seq_release_private,	/* frees the recent_iter_state */
+	.owner   = THIS_MODULE,
+	.llseek = seq_lseek,
+};
+
+static int __net_init recent_proc_net_init(struct net *net)
+{
+	struct recent_net *recent_net = recent_pernet(net);
+
+	recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net);
+	if (!recent_net->xt_recent)	/* table files are created below it */
+		return -ENOMEM;
+	return 0;
+}
+
+static void __net_exit recent_proc_net_exit(struct net *net)
+{
+	proc_net_remove(net, "xt_recent");	/* undo recent_proc_net_init() */
+}
+#else
+static inline int recent_proc_net_init(struct net *net)
+{
+	return 0;	/* !CONFIG_PROC_FS: nothing to create */
+}
+
+static inline void recent_proc_net_exit(struct net *net)
+{	/* !CONFIG_PROC_FS: nothing to remove */
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init recent_net_init(struct net *net)
+{
+	struct recent_net *recent_net = recent_pernet(net);
+
+	INIT_LIST_HEAD(&recent_net->tables);	/* no tables in a new netns */
+	return recent_proc_net_init(net);
+}
+
+static void __net_exit recent_net_exit(struct net *net)
+{
+	struct recent_net *recent_net = recent_pernet(net);
+
+	BUG_ON(!list_empty(&recent_net->tables));	/* all rules must be gone */
+	recent_proc_net_exit(net);
+}
+
+static struct pernet_operations recent_net_ops = {
+	.init	= recent_net_init,
+	.exit	= recent_net_exit,
+	.id	= &recent_net_id,	/* key used by recent_pernet() */
+	.size	= sizeof(struct recent_net),	/* per-netns state size */
+};
+
+static struct xt_match recent_mt_reg[] __read_mostly = {
+	{	/* IPv4 registration of the "recent" match */
+		.name       = "recent",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.match      = recent_mt,
+		.matchsize  = sizeof(struct xt_recent_mtinfo),
+		.checkentry = recent_mt_check,
+		.destroy    = recent_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+	{	/* IPv6 registration: identical callbacks, other family */
+		.name       = "recent",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.match      = recent_mt,
+		.matchsize  = sizeof(struct xt_recent_mtinfo),
+		.checkentry = recent_mt_check,
+		.destroy    = recent_mt_destroy,
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init recent_mt_init(void)
+{
+	int err;
+
+	if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
+		return -EINVAL;	/* recent_entry.index is only 8 bits wide */
+	ip_list_hash_size = 1 << fls(ip_list_tot);	/* any user value is lost */
+
+	err = register_pernet_subsys(&recent_net_ops);
+	if (err)
+		return err;
+	err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+	if (err)	/* roll the pernet registration back */
+		unregister_pernet_subsys(&recent_net_ops);
+	return err;
+}
+
+static void __exit recent_mt_exit(void)
+{	/* reverse order of recent_mt_init() */
+	xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+	unregister_pernet_subsys(&recent_net_ops);
+}
+
+module_init(recent_mt_init);
+module_exit(recent_mt_exit);
diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
new file mode 100644
index 00000000..6efe4e5a
--- /dev/null
+++ b/net/netfilter/xt_repldata.h
@@ -0,0 +1,35 @@
+/*
+ * Today's hack: quantum tunneling in structs. 'entries' and 'term' are only
+ * named here; elsewhere they are the data hanging off repl.data[]. Needs a
+ * local 'info'; yields one NF_ACCEPT rule per valid hook plus the error
+ * terminator. On kzalloc failure the CALLING function returns NULL.
+ */
+
+#define xt_alloc_initial_table(type, typ2) ({ \
+	unsigned int hook_mask = info->valid_hooks; \
+	unsigned int nhooks = hweight32(hook_mask); \
+	unsigned int bytes = 0, hooknum = 0, i = 0; \
+	struct { \
+		struct type##_replace repl; \
+		struct type##_standard entries[nhooks]; \
+		struct type##_error term; \
+	} *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \
+	if (tbl == NULL) \
+		return NULL; \
+	strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
+	tbl->term = (struct type##_error)typ2##_ERROR_INIT;  \
+	tbl->repl.valid_hooks = hook_mask; \
+	tbl->repl.num_entries = nhooks + 1; \
+	tbl->repl.size = nhooks * sizeof(struct type##_standard) + \
+	                 sizeof(struct type##_error); \
+	for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \
+		if (!(hook_mask & 1)) \
+			continue; \
+		tbl->repl.hook_entry[hooknum] = bytes; \
+		tbl->repl.underflow[hooknum]  = bytes; \
+		tbl->entries[i++] = (struct type##_standard) \
+			typ2##_STANDARD_INIT(NF_ACCEPT); \
+		bytes += sizeof(struct type##_standard); \
+	} \
+	tbl; \
+})
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
new file mode 100644
index 00000000..ef36a56a
--- /dev/null
+++ b/net/netfilter/xt_sctp.c
@@ -0,0 +1,198 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
+#include <linux/sctp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_sctp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Xtables: SCTP protocol packet match");
+MODULE_ALIAS("ipt_sctp");
+MODULE_ALIAS("ip6t_sctp");
+
+#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+					      || (!!((invflag) & (option)) ^ (cond))) /* option unset, or cond XOR inverted */
+
+static bool
+match_flags(const struct xt_sctp_flag_info *flag_info,
+	    const int flag_count,
+	    u_int8_t chunktype,
+	    u_int8_t chunkflags)
+{
+	int i;
+
+	for (i = 0; i < flag_count; i++)	/* first rule for this type wins */
+		if (flag_info[i].chunktype == chunktype)
+			return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
+
+	return true;	/* no flag rule for this chunk type: any flags match */
+}
+
+static inline bool
+match_packet(const struct sk_buff *skb,
+	     unsigned int offset,
+	     const struct xt_sctp_info *info,
+	     bool *hotdrop)
+{	/* walk the chunks starting at offset and apply chunk_match_type */
+	u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
+	const sctp_chunkhdr_t *sch;
+	sctp_chunkhdr_t _sch;
+	int chunk_match_type = info->chunk_match_type;
+	const struct xt_sctp_flag_info *flag_info = info->flag_info;
+	int flag_count = info->flag_count;
+
+#ifdef DEBUG
+	int i = 0;
+#endif
+
+	if (chunk_match_type == SCTP_CHUNK_MATCH_ALL)	/* copy only used by ALL */
+		SCTP_CHUNKMAP_COPY(chunkmapcopy, info->chunkmap);
+
+	do {
+		sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
+		if (sch == NULL || sch->length == 0) {	/* 0 would never advance */
+			pr_debug("Dropping invalid SCTP packet.\n");
+			*hotdrop = true;
+			return false;
+		}
+#ifdef DEBUG
+		pr_debug("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d"
+			 "\tflags: %x\n",
+			 ++i, offset, sch->type, htons(sch->length),
+			 sch->flags);
+#endif
+		offset += WORD_ROUND(ntohs(sch->length));	/* next chunk */
+
+		pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
+
+		if (SCTP_CHUNKMAP_IS_SET(info->chunkmap, sch->type)) {
+			switch (chunk_match_type) {
+			case SCTP_CHUNK_MATCH_ANY:	/* one listed chunk suffices */
+				if (match_flags(flag_info, flag_count,
+					sch->type, sch->flags)) {
+					return true;
+				}
+				break;
+
+			case SCTP_CHUNK_MATCH_ALL:	/* tick off each listed type */
+				if (match_flags(flag_info, flag_count,
+				    sch->type, sch->flags))
+					SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
+				break;
+
+			case SCTP_CHUNK_MATCH_ONLY:	/* listed, but flags differ */
+				if (!match_flags(flag_info, flag_count,
+				    sch->type, sch->flags))
+					return false;
+				break;
+			}
+		} else {
+			switch (chunk_match_type) {
+			case SCTP_CHUNK_MATCH_ONLY:	/* an unlisted chunk type */
+				return false;
+			}
+		}
+	} while (offset < skb->len);
+
+	switch (chunk_match_type) {
+	case SCTP_CHUNK_MATCH_ALL:	/* matched if every listed type was seen */
+		return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy);
+	case SCTP_CHUNK_MATCH_ANY:	/* no listed chunk was found */
+		return false;
+	case SCTP_CHUNK_MATCH_ONLY:	/* nothing unlisted was found */
+		return true;
+	}
+
+	/* This will never be reached, but required to stop compiler whine */
+	return false;
+}
+
+static bool
+sctp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_sctp_info *info = par->matchinfo;
+	const sctp_sctphdr_t *sh;
+	sctp_sctphdr_t _sh;
+
+	if (par->fragoff != 0) {
+		pr_debug("Dropping non-first fragment.. FIXME\n");
+		return false;	/* only a non-match: hotdrop is not set here */
+	}
+
+	sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh);
+	if (sh == NULL) {	/* NOTE(review): message below says TCP, not SCTP */
+		pr_debug("Dropping evil TCP offset=0 tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+	pr_debug("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
+
+	return  SCCHECK(ntohs(sh->source) >= info->spts[0]
+			&& ntohs(sh->source) <= info->spts[1],
+			XT_SCTP_SRC_PORTS, info->flags, info->invflags)
+		&& SCCHECK(ntohs(sh->dest) >= info->dpts[0]
+			&& ntohs(sh->dest) <= info->dpts[1],
+			XT_SCTP_DEST_PORTS, info->flags, info->invflags)
+		&& SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t),
+					info, &par->hotdrop),
+			   XT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
+}
+
+static int sctp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_sctp_info *info = par->matchinfo;
+
+	/* flag_count is user-supplied and bounds the walk over flag_info[] */
+	if (info->flag_count > ARRAY_SIZE(info->flag_info))
+		return -EINVAL;
+	if ((info->flags | info->invflags) & ~XT_SCTP_VALID_FLAGS ||
+	    info->invflags & ~info->flags)
+		return -EINVAL;
+	if (!(info->flags & XT_SCTP_CHUNK_TYPES))
+		return 0;
+	if (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL |
+	    SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY))
+		return 0;
+	return -EINVAL;
+}
+
+static struct xt_match sctp_mt_reg[] __read_mostly = {
+	{	/* IPv4 registration of the "sctp" match */
+		.name		= "sctp",
+		.family		= NFPROTO_IPV4,
+		.checkentry	= sctp_mt_check,
+		.match		= sctp_mt,
+		.matchsize	= sizeof(struct xt_sctp_info),
+		.proto		= IPPROTO_SCTP,
+		.me		= THIS_MODULE
+	},
+	{	/* IPv6 registration: identical callbacks, other family */
+		.name		= "sctp",
+		.family		= NFPROTO_IPV6,
+		.checkentry	= sctp_mt_check,
+		.match		= sctp_mt,
+		.matchsize	= sizeof(struct xt_sctp_info),
+		.proto		= IPPROTO_SCTP,
+		.me		= THIS_MODULE
+	},
+};
+
+static int __init sctp_mt_init(void)
+{	/* module load: register both "sctp" match entries */
+	return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg));
+}
+
+static void __exit sctp_mt_exit(void)
+{	/* module unload: undo sctp_mt_init() */
+	xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg));
+}
+
+module_init(sctp_mt_init);
+module_exit(sctp_mt_exit);
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
new file mode 100644
index 00000000..b3babaed
--- /dev/null
+++ b/net/netfilter/xt_set.c
@@ -0,0 +1,373 @@
+/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
+ *                         Patrick Schaaf <bof@bof.de>
+ *                         Martin Josefsson <gandalf@wlug.westbo.se>
+ * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Kernel module which implements the set match and SET target
+ * for netfilter/iptables. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_set.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("Xtables: IP set match and target module");
+MODULE_ALIAS("xt_SET");
+MODULE_ALIAS("ipt_set");
+MODULE_ALIAS("ip6t_set");
+MODULE_ALIAS("ipt_SET");
+MODULE_ALIAS("ip6t_SET");
+
+/* Test the packet against set @index.  A hit yields !inv; a miss yields
+ * inv exactly as it was passed in (it is not normalised to 0/1). */
+static inline int
+match_set(ip_set_id_t index, const struct sk_buff *skb,
+	  u8 pf, u8 dim, u8 flags, int inv)
+{
+	return ip_set_test(index, skb, pf, dim, flags) ? !inv : inv;
+}
+
+/* Revision 0 interface: backward compatible with netfilter/iptables */
+
+static bool
+set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	/* Revision 0 keeps its dimension and flags in the u.compat fields. */
+	const struct xt_set_info_match_v0 *info = par->matchinfo;
+	const struct xt_set_info_v0 *set = &info->match_set;
+
+	return match_set(set->index, skb, par->family, set->u.compat.dim,
+		set->u.compat.flags, set->u.compat.flags & IPSET_INV_MATCH);
+}
+
+static void
+compat_flags(struct xt_set_info_v0 *info)
+{	/* Derive u.compat.dim and u.compat.flags from the u.flags[] entries. */
+	u_int8_t i;
+
+	/* Fill out compatibility data according to enum ip_set_kopt */
+	info->u.compat.dim = IPSET_DIM_ZERO;
+	if (info->u.flags[0] & IPSET_MATCH_INV)
+		info->u.compat.flags |= IPSET_INV_MATCH;
+	for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+		info->u.compat.dim++;	/* one more non-zero entry seen */
+		if (info->u.flags[i] & IPSET_SRC)	/* bit number is i + 1 */
+			info->u.compat.flags |= (1<<info->u.compat.dim);
+	}
+}
+
+/* Revision 0 checkentry: acquire the set, validate the rule and build the
+ * compat data.  Returns 0, -ENOENT (no such set) or -ERANGE (bad dimension). */
+static int
+set_match_v0_checkentry(const struct xt_mtchk_param *par)
+{
+	struct xt_set_info_match_v0 *info = par->matchinfo;
+	struct xt_set_info_v0 *set = &info->match_set;
+
+	if (ip_set_nfnl_get_byindex(set->index) == IPSET_INVALID_ID) {
+		pr_warning("Cannot find set indentified by id %u to match\n",
+			   set->index);
+		return -ENOENT;
+	}
+	/* The last flag entry must be zero; otherwise give the set back. */
+	if (set->u.flags[IPSET_DIM_MAX-1] != 0) {
+		pr_warning("Protocol error: set match dimension "
+			   "is over the limit!\n");
+		ip_set_nfnl_put(set->index);
+		return -ERANGE;
+	}
+
+	/* Fill out compatibility data */
+	compat_flags(set);
+	return 0;
+}
+
+static void
+set_match_v0_destroy(const struct xt_mtdtor_param *par)
+{	/* Rule teardown: ip_set_nfnl_put() the set named by the rule. */
+	struct xt_set_info_match_v0 *info = par->matchinfo;
+
+	ip_set_nfnl_put(info->match_set.index);
+}
+
+/* Revision 0 SET target: apply the configured add and/or del operation. */
+static unsigned int
+set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_set_info_target_v0 *info = par->targinfo;
+	const struct xt_set_info_v0 *add = &info->add_set;
+	const struct xt_set_info_v0 *del = &info->del_set;
+
+	if (add->index != IPSET_INVALID_ID)
+		ip_set_add(add->index, skb, par->family,
+			   add->u.compat.dim, add->u.compat.flags);
+	if (del->index != IPSET_INVALID_ID)
+		ip_set_del(del->index, skb, par->family,
+			   del->u.compat.dim, del->u.compat.flags);
+	return XT_CONTINUE;
+}
+
+/* Revision 0 SET target checkentry: acquire the configured sets, validate
+ * them and build the compat data.  Returns 0, -ENOENT or -ERANGE; every
+ * error path gives back whatever was acquired before it. */
+static int
+set_target_v0_checkentry(const struct xt_tgchk_param *par)
+{
+	struct xt_set_info_target_v0 *info = par->targinfo;
+	struct xt_set_info_v0 *add = &info->add_set, *del = &info->del_set;
+
+	if (add->index != IPSET_INVALID_ID &&
+	    ip_set_nfnl_get_byindex(add->index) == IPSET_INVALID_ID) {
+		pr_warning("Cannot find add_set index %u as target\n",
+			   add->index);
+		return -ENOENT;
+	}
+
+	if (del->index != IPSET_INVALID_ID &&
+	    ip_set_nfnl_get_byindex(del->index) == IPSET_INVALID_ID) {
+		pr_warning("Cannot find del_set index %u as target\n",
+			   del->index);
+		if (add->index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(add->index);
+		return -ENOENT;
+	}
+	/* The last flag entry must be zero in both descriptors. */
+	if (add->u.flags[IPSET_DIM_MAX-1] != 0 ||
+	    del->u.flags[IPSET_DIM_MAX-1] != 0) {
+		pr_warning("Protocol error: SET target dimension "
+			   "is over the limit!\n");
+		if (add->index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(add->index);
+		if (del->index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(del->index);
+		return -ERANGE;
+	}
+
+	/* Fill out compatibility data */
+	compat_flags(add);
+	compat_flags(del);
+
+	return 0;
+}
+
+static void
+set_target_v0_destroy(const struct xt_tgdtor_param *par)
+{	/* Rule teardown: ip_set_nfnl_put() each set that is configured. */
+	const struct xt_set_info_target_v0 *info = par->targinfo;
+
+	if (info->add_set.index != IPSET_INVALID_ID)
+		ip_set_nfnl_put(info->add_set.index);
+	if (info->del_set.index != IPSET_INVALID_ID)
+		ip_set_nfnl_put(info->del_set.index);
+}
+
+/* Revision 1: current interface to netfilter/iptables */
+
+static bool
+set_match(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Revision 1 match hook: dim and flags are read straight from the rule. */
+	const struct xt_set_info_match *info = par->matchinfo;
+
+	return match_set(info->match_set.index, skb, par->family,
+			 info->match_set.dim,
+			 info->match_set.flags,
+			 info->match_set.flags & IPSET_INV_MATCH);
+}
+
+/* Revision 1 checkentry: acquire the set and bound-check the dimension.
+ * Returns 0, -ENOENT (no such set) or -ERANGE (dim above IPSET_DIM_MAX). */
+static int
+set_match_checkentry(const struct xt_mtchk_param *par)
+{
+	struct xt_set_info_match *info = par->matchinfo;
+
+	if (ip_set_nfnl_get_byindex(info->match_set.index) ==
+	    IPSET_INVALID_ID) {
+		pr_warning("Cannot find set indentified by id %u to match\n",
+			   info->match_set.index);
+		return -ENOENT;
+	}
+	if (info->match_set.dim <= IPSET_DIM_MAX)
+		return 0;
+
+	/* Invalid rule: give the set back before failing. */
+	pr_warning("Protocol error: set match dimension "
+		   "is over the limit!\n");
+	ip_set_nfnl_put(info->match_set.index);
+	return -ERANGE;
+}
+
+static void
+set_match_destroy(const struct xt_mtdtor_param *par)
+{	/* Rule teardown: ip_set_nfnl_put() the set named by the rule. */
+	struct xt_set_info_match *info = par->matchinfo;
+
+	ip_set_nfnl_put(info->match_set.index);
+}
+
+/* Revision 1 SET target: apply the configured add and/or del operation
+ * to the packet.  Always returns XT_CONTINUE. */
+static unsigned int
+set_target(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_set_info_target *info = par->targinfo;
+
+	/* An index of IPSET_INVALID_ID marks an operation that is not in use. */
+	if (info->add_set.index != IPSET_INVALID_ID)
+		ip_set_add(info->add_set.index, skb, par->family,
+			   info->add_set.dim, info->add_set.flags);
+	/* The result of the add is ignored; the del runs regardless. */
+	if (info->del_set.index != IPSET_INVALID_ID)
+		ip_set_del(info->del_set.index, skb, par->family,
+			   info->del_set.dim, info->del_set.flags);
+
+	return XT_CONTINUE;
+}
+
+/* Revision 1 SET target checkentry: acquire the configured sets and check
+ * their dimensions.  Returns 0, -ENOENT or -ERANGE; every error path
+ * gives back whatever was acquired before it. */
+static int
+set_target_checkentry(const struct xt_tgchk_param *par)
+{
+	const struct xt_set_info_target *info = par->targinfo;
+	const bool has_add = info->add_set.index != IPSET_INVALID_ID;
+	const bool has_del = info->del_set.index != IPSET_INVALID_ID;
+
+	if (has_add &&
+	    ip_set_nfnl_get_byindex(info->add_set.index) == IPSET_INVALID_ID) {
+		pr_warning("Cannot find add_set index %u as target\n",
+			   info->add_set.index);
+		return -ENOENT;
+	}
+
+	if (has_del &&
+	    ip_set_nfnl_get_byindex(info->del_set.index) == IPSET_INVALID_ID) {
+		pr_warning("Cannot find del_set index %u as target\n",
+			   info->del_set.index);
+		if (has_add)
+			ip_set_nfnl_put(info->add_set.index);
+		return -ENOENT;
+	}
+	if (info->add_set.dim > IPSET_DIM_MAX ||
+	    info->del_set.dim > IPSET_DIM_MAX) {
+		pr_warning("Protocol error: SET target dimension "
+			   "is over the limit!\n");
+		if (has_add)
+			ip_set_nfnl_put(info->add_set.index);
+		if (has_del)
+			ip_set_nfnl_put(info->del_set.index);
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+static void
+set_target_destroy(const struct xt_tgdtor_param *par)
+{	/* Rule teardown: ip_set_nfnl_put() each set that is configured. */
+	const struct xt_set_info_target *info = par->targinfo;
+
+	if (info->add_set.index != IPSET_INVALID_ID)
+		ip_set_nfnl_put(info->add_set.index);
+	if (info->del_set.index != IPSET_INVALID_ID)
+		ip_set_nfnl_put(info->del_set.index);
+}
+
+static struct xt_match set_matches[] __read_mostly = {
+	{	/* revision 0 interface, registered for IPv4 only */
+		.name		= "set",
+		.family		= NFPROTO_IPV4,
+		.revision	= 0,
+		.match		= set_match_v0,
+		.matchsize	= sizeof(struct xt_set_info_match_v0),
+		.checkentry	= set_match_v0_checkentry,
+		.destroy	= set_match_v0_destroy,
+		.me		= THIS_MODULE
+	},
+	{	/* revision 1 interface, IPv4 */
+		.name		= "set",
+		.family		= NFPROTO_IPV4,
+		.revision	= 1,
+		.match		= set_match,
+		.matchsize	= sizeof(struct xt_set_info_match),
+		.checkentry	= set_match_checkentry,
+		.destroy	= set_match_destroy,
+		.me		= THIS_MODULE
+	},
+	{	/* revision 1 interface, IPv6: same hooks as the IPv4 entry */
+		.name		= "set",
+		.family		= NFPROTO_IPV6,
+		.revision	= 1,
+		.match		= set_match,
+		.matchsize	= sizeof(struct xt_set_info_match),
+		.checkentry	= set_match_checkentry,
+		.destroy	= set_match_destroy,
+		.me		= THIS_MODULE
+	},
+};
+
+static struct xt_target set_targets[] __read_mostly = {
+	{	/* revision 0 interface, registered for IPv4 only */
+		.name		= "SET",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.target		= set_target_v0,
+		.targetsize	= sizeof(struct xt_set_info_target_v0),
+		.checkentry	= set_target_v0_checkentry,
+		.destroy	= set_target_v0_destroy,
+		.me		= THIS_MODULE
+	},
+	{	/* revision 1 interface, IPv4 */
+		.name		= "SET",
+		.revision	= 1,
+		.family		= NFPROTO_IPV4,
+		.target		= set_target,
+		.targetsize	= sizeof(struct xt_set_info_target),
+		.checkentry	= set_target_checkentry,
+		.destroy	= set_target_destroy,
+		.me		= THIS_MODULE
+	},
+	{	/* revision 1 interface, IPv6: same hooks as the IPv4 entry */
+		.name		= "SET",
+		.revision	= 1,
+		.family		= NFPROTO_IPV6,
+		.target		= set_target,
+		.targetsize	= sizeof(struct xt_set_info_target),
+		.checkentry	= set_target_checkentry,
+		.destroy	= set_target_destroy,
+		.me		= THIS_MODULE
+	},
+};
+
+static int __init xt_set_init(void)
+{
+	int ret;
+
+	/* Matches go first; they are rolled back if the targets fail. */
+	ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches));
+	if (ret)
+		return ret;
+	ret = xt_register_targets(set_targets, ARRAY_SIZE(set_targets));
+	if (ret)
+		xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+	return ret;
+}
+
+static void __exit xt_set_fini(void)
+{	/* Module unload: unregister the "set" matches and "SET" targets. */
+	xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches));
+	xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets));
+}
+
+module_init(xt_set_init);
+module_exit(xt_set_fini);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
new file mode 100644
index 00000000..ddf5e050
--- /dev/null
+++ b/net/netfilter/xt_socket.c
@@ -0,0 +1,404 @@
+/*
+ * Transparent proxy support for Linux/iptables
+ *
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netfilter/nf_tproxy_core.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#define XT_SOCKET_HAVE_IPV6 1
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+#include <linux/netfilter/xt_socket.h>
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#define XT_SOCKET_HAVE_CONNTRACK 1
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+void
+xt_socket_put_sk(struct sock *sk)
+{	/* Release sk; TIME_WAIT entries go through inet_twsk_put() instead. */
+	if (sk->sk_state == TCP_TIME_WAIT)
+		inet_twsk_put(inet_twsk(sk));
+	else
+		sock_put(sk);
+}
+EXPORT_SYMBOL(xt_socket_put_sk);
+
+/* For an ICMP error packet, pull the protocol, addresses and ports out of
+ * the IP packet quoted inside it.  @laddr/@lport receive the quoted source,
+ * @raddr/@rport the quoted destination.  Returns 0 on success and 1 when
+ * the ICMP type is not handled or the needed bytes cannot be read. */
+static int
+extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
+		     __be32 *raddr, __be32 *laddr,
+		     __be16 *rport, __be16 *lport)
+{
+	unsigned int off = ip_hdrlen(skb);
+	struct iphdr *inner, _inner;
+	struct icmphdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+
+	icmph = skb_header_pointer(skb, off, sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	/* Only these ICMP types are accepted; all others are rejected. */
+	switch (icmph->type) {
+	case ICMP_DEST_UNREACH:
+	case ICMP_SOURCE_QUENCH:
+	case ICMP_REDIRECT:
+	case ICMP_TIME_EXCEEDED:
+	case ICMP_PARAMETERPROB:
+		break;
+	default:
+		return 1;
+	}
+
+	/* The quoted IP header is read right after the ICMP header. */
+	off += sizeof(struct icmphdr);
+	inner = skb_header_pointer(skb, off, sizeof(_inner), &_inner);
+	if (inner == NULL)
+		return 1;
+
+	if (inner->protocol != IPPROTO_TCP &&
+	    inner->protocol != IPPROTO_UDP)
+		return 1;
+
+	/* ihl << 2 skips the quoted IP header to reach the port fields. */
+	off += inner->ihl << 2;
+	ports = skb_header_pointer(skb, off, sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inner->protocol;
+	*laddr = inner->saddr;
+	*lport = ports[0];
+	*raddr = inner->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+/* Find the socket an IPv4 TCP, UDP or ICMP packet maps to.  Returns what
+ * nf_tproxy_get_sock_v4() finds, or NULL if the packet cannot be parsed. */
+struct sock*
+xt_socket_get4_sk(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	__be32 daddr, saddr;
+	__be16 dport, sport;
+	u8 protocol;
+#ifdef XT_SOCKET_HAVE_CONNTRACK
+	struct nf_conn const *ct;
+	enum ip_conntrack_info ctinfo;
+#endif
+
+	switch (iph->protocol) {
+	case IPPROTO_UDP:
+	case IPPROTO_TCP:
+		hp = skb_header_pointer(skb, ip_hdrlen(skb),
+					sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+		protocol = iph->protocol;
+		saddr = iph->saddr;
+		sport = hp->source;
+		daddr = iph->daddr;
+		dport = hp->dest;
+		break;
+	case IPPROTO_ICMP:
+		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
+					&sport, &dport))
+			return NULL;
+		break;
+	default:
+		return NULL;
+	}
+
+#ifdef XT_SOCKET_HAVE_CONNTRACK
+	/* Do the lookup with the original socket address in case this is a
+	 * reply packet of an established SNAT-ted connection. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct && !nf_ct_is_untracked(ct) &&
+	    ((iph->protocol != IPPROTO_ICMP &&
+	      ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+	     (iph->protocol == IPPROTO_ICMP &&
+	      ctinfo == IP_CT_RELATED_REPLY)) &&
+	    (ct->status & IPS_SRC_NAT_DONE)) {
+		daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip;
+		dport = (iph->protocol == IPPROTO_TCP) ?
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+	}
+#endif
+
+	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+	pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+		 protocol, &saddr, ntohs(sport),
+		 &daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+	return sk;
+}
+EXPORT_SYMBOL(xt_socket_get4_sk);
+
+/* Match when the packet maps to a socket that is not bound to INADDR_ANY
+ * and, if @info requests XT_SOCKET_TRANSPARENT, is marked transparent.
+ * @info may be NULL (revision 0); no flag is checked in that case. */
+static bool
+socket_match(const struct sk_buff *skb, struct xt_action_param *par,
+	     const struct xt_socket_mtinfo1 *info)
+{
+	struct sock *sk = xt_socket_get4_sk(skb, par);
+	bool timewait;
+	bool wildcard;
+	bool transparent = true;
+
+	if (sk == NULL)
+		return false;
+
+	/* TIME_WAIT entries are read via inet_twsk() rather than inet_sk(). */
+	timewait = (sk->sk_state == TCP_TIME_WAIT);
+
+	/* Ignore sockets listening on INADDR_ANY */
+	wildcard = !timewait && inet_sk(sk)->inet_rcv_saddr == 0;
+
+	/* Ignore non-transparent sockets,
+	   if XT_SOCKET_TRANSPARENT is used */
+	if (info && info->flags & XT_SOCKET_TRANSPARENT)
+		transparent = timewait ? inet_twsk(sk)->tw_transparent :
+					 inet_sk(sk)->transparent;
+
+	xt_socket_put_sk(sk);
+
+	return !wildcard && transparent;
+}
+
+static bool
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Revision 0 passes no match info, so socket_match() checks no flags. */
+	return socket_match(skb, par, NULL);
+}
+
+static bool
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{	/* Revision 1 hands the rule's xt_socket_mtinfo1 to socket_match(). */
+	return socket_match(skb, par, par->matchinfo);
+}
+
+#ifdef XT_SOCKET_HAVE_IPV6
+
+/* For an ICMPv6 error packet, pull protocol, addresses and ports out of the
+ * IPv6 packet quoted inside it.  *raddr and *laddr point into the quoted
+ * header, which skb_header_pointer() may have copied into @ipv6_var, so
+ * @ipv6_var must outlive their use.  Returns 0 on success, 1 otherwise. */
+static int
+extract_icmp6_fields(const struct sk_buff *skb, unsigned int outside_hdrlen,
+		     int *protocol,
+		     struct in6_addr **raddr, struct in6_addr **laddr,
+		     __be16 *rport, __be16 *lport,
+		     struct ipv6hdr *ipv6_var)
+{
+	struct ipv6hdr *inside_iph;
+	struct icmp6hdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+	u8 inside_nexthdr;
+	int inside_hdrlen;
+
+	icmph = skb_header_pointer(skb, outside_hdrlen, sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+		return 1;
+
+	inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph),
+					sizeof(*ipv6_var), ipv6_var);
+	if (inside_iph == NULL)
+		return 1;
+	inside_nexthdr = inside_iph->nexthdr;
+
+	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) +
+					 sizeof(*ipv6_var), &inside_nexthdr);
+	if (inside_hdrlen < 0)
+		return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+	if (inside_nexthdr != IPPROTO_TCP && inside_nexthdr != IPPROTO_UDP)
+		return 1;
+
+	ports = skb_header_pointer(skb, inside_hdrlen, sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inside_nexthdr;
+	*laddr = &inside_iph->saddr;
+	*lport = ports[0];
+	*raddr = &inside_iph->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+/* Find the socket an IPv6 TCP, UDP or ICMPv6 packet maps to.  Returns what
+ * nf_tproxy_get_sock_v6() finds, or NULL if the packet cannot be parsed. */
+struct sock*
+xt_socket_get6_sk(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	struct in6_addr *daddr, *saddr;
+	__be16 dport, sport;
+	int thoff, tproto;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NULL;	/* pointer return: NULL, not a netfilter verdict */
+	}
+
+	if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+		hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return NULL;
+		saddr = &iph->saddr;
+		sport = hp->source;
+		daddr = &iph->daddr;
+		dport = hp->dest;
+	} else if (tproto == IPPROTO_ICMPV6) {
+		if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+					 &sport, &dport, &ipv6_var))
+			return NULL;
+	} else {
+		return NULL;
+	}
+
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+	pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu "
+		 "(orig %pI6:%hu) sock %p\n",
+		 tproto, saddr, ntohs(sport),
+		 daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+	return sk;
+}
+EXPORT_SYMBOL(xt_socket_get6_sk);
+
+/* IPv6 flavour of socket_match(): match when the packet maps to a socket
+ * whose receive address is not the unspecified one and which, if
+ * XT_SOCKET_TRANSPARENT is requested, is marked transparent. */
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_socket_mtinfo1 *info = par->matchinfo;
+	struct sock *sk = xt_socket_get6_sk(skb, par);
+	bool timewait;
+	bool wildcard;
+	bool transparent = true;
+
+	if (sk == NULL)
+		return false;
+
+	/* TIME_WAIT entries are read via inet_twsk() rather than inet_sk(). */
+	timewait = (sk->sk_state == TCP_TIME_WAIT);
+
+	/* Ignore sockets listening on INADDR_ANY */
+	wildcard = !timewait && ipv6_addr_any(&inet6_sk(sk)->rcv_saddr);
+
+	/* Ignore non-transparent sockets,
+	   if XT_SOCKET_TRANSPARENT is used */
+	if (info && info->flags & XT_SOCKET_TRANSPARENT)
+		transparent = timewait ? inet_twsk(sk)->tw_transparent :
+					 inet_sk(sk)->transparent;
+
+	/* Only the verdict is needed from here on, so release the socket. */
+	xt_socket_put_sk(sk);
+
+	return !wildcard && transparent;
+}
+#endif
+
+static struct xt_match socket_mt_reg[] __read_mostly = {
+	{	/* revision 0, IPv4: no .matchsize is set for this entry */
+		.name		= "socket",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.match		= socket_mt4_v0,
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+	{	/* revision 1, IPv4: carries struct xt_socket_mtinfo1 */
+		.name		= "socket",
+		.revision	= 1,
+		.family		= NFPROTO_IPV4,
+		.match		= socket_mt4_v1,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#ifdef XT_SOCKET_HAVE_IPV6
+	{	/* revision 1, IPv6: only built when ip6tables is configured */
+		.name		= "socket",
+		.revision	= 1,
+		.family		= NFPROTO_IPV6,
+		.match		= socket_mt6_v1,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#endif
+};
+
+static int __init socket_mt_init(void)
+{	/* Module load: call the defrag enable helpers, then register. */
+	nf_defrag_ipv4_enable();
+#ifdef XT_SOCKET_HAVE_IPV6
+	nf_defrag_ipv6_enable();
+#endif
+
+	return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
+}
+
+static void __exit socket_mt_exit(void)
+{	/* Module unload: remove the entries registered by socket_mt_init(). */
+	xt_unregister_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
+}
+
+module_init(socket_mt_init);
+module_exit(socket_mt_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
+MODULE_DESCRIPTION("x_tables socket match module");
+MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
new file mode 100644
index 00000000..a507922d
--- /dev/null
+++ b/net/netfilter/xt_state.c
@@ -0,0 +1,79 @@
+/* Kernel module to match connection tracking information. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_state.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module");
+MODULE_ALIAS("ipt_state");
+MODULE_ALIAS("ip6t_state");
+
+/* Match when the packet's conntrack state bit is set in the rule's mask.
+ * Packets without a conntrack entry are classed as XT_STATE_INVALID. */
+static bool
+state_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_state_info *sinfo = par->matchinfo;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	unsigned int statebit;
+
+	if (ct == NULL)
+		statebit = XT_STATE_INVALID;
+	else if (nf_ct_is_untracked(ct))
+		statebit = XT_STATE_UNTRACKED;
+	else
+		statebit = XT_STATE_BIT(ctinfo);
+	return (sinfo->statemask & statebit);
+}
+
+/* Request conntrack support for the rule's family; log when that fails. */
+static int state_mt_check(const struct xt_mtchk_param *par)
+{
+	const int ret = nf_ct_l3proto_try_module_get(par->family);
+
+	if (ret < 0)
+		pr_info("cannot load conntrack support for proto=%u\n",
+			par->family);
+	return ret;
+}
+
+static void state_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* Counterpart of the nf_ct_l3proto_try_module_get() in state_mt_check(). */
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_match state_mt_reg __read_mostly = {
+	.name       = "state",
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = state_mt_check,	/* calls nf_ct_l3proto_try_module_get() */
+	.match      = state_mt,
+	.destroy    = state_mt_destroy,	/* calls nf_ct_l3proto_module_put() */
+	.matchsize  = sizeof(struct xt_state_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init state_mt_init(void)
+{	/* Module load: register the single "state" match descriptor. */
+	return xt_register_match(&state_mt_reg);
+}
+
+static void __exit state_mt_exit(void)
+{	/* Module unload: remove the descriptor added by state_mt_init(). */
+	xt_unregister_match(&state_mt_reg);
+}
+
+module_init(state_mt_init);
+module_exit(state_mt_exit);
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
new file mode 100644
index 00000000..42ecb71d
--- /dev/null
+++ b/net/netfilter/xt_statistic.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
+ */
+
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter/xt_statistic.h>
+#include <linux/netfilter/x_tables.h>
+
+struct xt_statistic_priv {
+	atomic_t count;	/* nth-mode packet counter, stepped in statistic_mt() */
+} ____cacheline_aligned_in_smp;
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)");
+MODULE_ALIAS("ipt_statistic");
+MODULE_ALIAS("ip6t_statistic");
+
+/* Random or every-nth packet match; XT_STATISTIC_INVERT flips the verdict. */
+static bool
+statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_statistic_info *info = par->matchinfo;
+	const bool inverted = info->flags & XT_STATISTIC_INVERT;
+	bool hit = false;
+	int cur, next;
+
+	switch (info->mode) {
+	case XT_STATISTIC_MODE_RANDOM:
+		hit = (net_random() & 0x7FFFFFFF) < info->u.random.probability;
+		break;
+	case XT_STATISTIC_MODE_NTH:
+		/* Step the counter; retry if it changed underneath us. */
+		do {
+			cur = atomic_read(&info->master->count);
+			next = (cur == info->u.nth.every) ? 0 : cur + 1;
+		} while (atomic_cmpxchg(&info->master->count, cur, next) != cur);
+		hit = (next == 0);
+		break;
+	}
+	return hit != inverted;
+}
+
+/* Validate the rule and allocate its counter, seeded from u.nth.count. */
+static int statistic_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_statistic_info *info = par->matchinfo;
+
+	if (info->mode > XT_STATISTIC_MODE_MAX)
+		return -EINVAL;
+	if (info->flags & ~XT_STATISTIC_MASK)
+		return -EINVAL;
+	info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
+	if (!info->master)
+		return -ENOMEM;
+	atomic_set(&info->master->count, info->u.nth.count);
+	return 0;
+}
+
+static void statistic_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* Free the counter that statistic_mt_check() allocated. */
+	const struct xt_statistic_info *info = par->matchinfo;
+
+	kfree(info->master);
+}
+
+static struct xt_match xt_statistic_mt_reg __read_mostly = {
+	.name       = "statistic",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = statistic_mt,
+	.checkentry = statistic_mt_check,	/* allocates info->master */
+	.destroy    = statistic_mt_destroy,	/* frees info->master */
+	.matchsize  = sizeof(struct xt_statistic_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init statistic_mt_init(void)
+{	/* Module load: register the single "statistic" match descriptor. */
+	return xt_register_match(&xt_statistic_mt_reg);
+}
+
+static void __exit statistic_mt_exit(void)
+{	/* Module unload: remove the descriptor added by statistic_mt_init(). */
+	xt_unregister_match(&xt_statistic_mt_reg);
+}
+
+module_init(statistic_mt_init);
+module_exit(statistic_mt_exit);
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
new file mode 100644
index 00000000..d3c48b14
--- /dev/null
+++ b/net/netfilter/xt_string.c
@@ -0,0 +1,96 @@
+/* String matching match for iptables
+ *
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("Xtables: string-based matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_string");
+MODULE_ALIAS("ip6t_string");
+
+/* Match when skb_find_text() finds the pattern; the INVERT flag negates it. */
+static bool
+string_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_string_info *conf = par->matchinfo;
+	const bool invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT;
+	struct ts_state state;
+	unsigned int pos;
+
+	memset(&state, 0, sizeof(state));
+	pos = skb_find_text((struct sk_buff *)skb, conf->from_offset,
+			    conf->to_offset, conf->config, &state);
+	return (pos != UINT_MAX) ^ invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m))
+
+/* Validate a string rule and build its textsearch configuration.
+ * Returns 0, -EINVAL for bad parameters, or textsearch_prepare()'s error. */
+static int string_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_string_info *conf = par->matchinfo;
+	struct ts_config *ts_conf;
+	int flags = TS_AUTOLOAD;
+
+	/* Damn, can't handle this case properly with iptables... */
+	if (conf->from_offset > conf->to_offset)
+		return -EINVAL;
+	/* Algorithm name unterminated, pattern too long, or unknown flags. */
+	if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0' ||
+	    conf->patlen > XT_STRING_MAX_PATTERN_SIZE ||
+	    (conf->u.v1.flags &
+	     ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)))
+		return -EINVAL;
+	if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE)
+		flags |= TS_IGNORECASE;
+	ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+				     GFP_KERNEL, flags);
+	if (IS_ERR(ts_conf))
+		return PTR_ERR(ts_conf);
+	conf->config = ts_conf;
+	return 0;
+}
+
+static void string_mt_destroy(const struct xt_mtdtor_param *par)
+{	/* Destroy the textsearch config that string_mt_check() stored. */
+	textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config);
+}
+
+static struct xt_match xt_string_mt_reg __read_mostly = {
+	.name       = "string",
+	.revision   = 1,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = string_mt_check,	/* sets conf->config */
+	.match      = string_mt,
+	.destroy    = string_mt_destroy,	/* textsearch_destroy() on it */
+	.matchsize  = sizeof(struct xt_string_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init string_mt_init(void)
+{	/* Module load: register the single "string" match descriptor. */
+	return xt_register_match(&xt_string_mt_reg);
+}
+
+static void __exit string_mt_exit(void)
+{	/* Module unload: remove the descriptor added by string_mt_init(). */
+	xt_unregister_match(&xt_string_mt_reg);
+}
+
+module_init(string_mt_init);
+module_exit(string_mt_exit);
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
new file mode 100644
index 00000000..c53d4d18
--- /dev/null
+++ b/net/netfilter/xt_tcpmss.c
@@ -0,0 +1,110 @@
+/* Kernel module to match TCP MSS values. */
+
+/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ * Portions (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/xt_tcpmss.h>
+#include <linux/netfilter/x_tables.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("Xtables: TCP MSS match");
+MODULE_ALIAS("ipt_tcpmss");
+MODULE_ALIAS("ip6t_tcpmss");
+
+/* Match the TCP MSS option against [mss_min, mss_max] (XOR info->invert);
+ * no MSS option yields info->invert, a bad header hotdrops the packet. */
+static bool
+tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_tcpmss_match_info *info = par->matchinfo;
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	const u_int8_t *op;
+	u8 _opt[15 * 4 - sizeof(_tcph)];
+	unsigned int i, optlen;
+
+	/* If we don't have the whole header, drop packet. */
+	th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		goto dropit;
+
+	/* Malformed. */
+	if (th->doff*4 < sizeof(*th))
+		goto dropit;
+
+	optlen = th->doff*4 - sizeof(*th);
+	if (!optlen)
+		goto out;
+
+	/* Truncated options. */
+	op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt);
+	if (op == NULL)
+		goto dropit;
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == TCPOPT_MSS
+		    && (optlen - i) >= TCPOLEN_MSS
+		    && op[i+1] == TCPOLEN_MSS) {
+			const u_int16_t mssval = (op[i+2] << 8) | op[i+3];
+
+			return (mssval >= info->mss_min &&
+				mssval <= info->mss_max) ^ info->invert;
+		}
+		if (op[i] < 2 || i + 1 >= optlen)
+			i++;	/* no length byte, or none left to read */
+		else
+			i += op[i+1] ? : 1;
+	}
+out:
+	return info->invert;
+
+dropit:
+	par->hotdrop = true;
+	return false;
+}
+
+static struct xt_match tcpmss_mt_reg[] __read_mostly = {
+	{	/* IPv4 entry */
+		.name		= "tcpmss",
+		.family		= NFPROTO_IPV4,
+		.match		= tcpmss_mt,
+		.matchsize	= sizeof(struct xt_tcpmss_match_info),
+		.proto		= IPPROTO_TCP,
+		.me		= THIS_MODULE,
+	},
+	{	/* IPv6 entry: identical apart from .family */
+		.name		= "tcpmss",
+		.family		= NFPROTO_IPV6,
+		.match		= tcpmss_mt,
+		.matchsize	= sizeof(struct xt_tcpmss_match_info),
+		.proto		= IPPROTO_TCP,
+		.me		= THIS_MODULE,
+	},
+};
+
+static int __init tcpmss_mt_init(void)
+{	/* Module load: register both "tcpmss" match entries (IPv4 and IPv6). */
+	return xt_register_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg));
+}
+
+static void __exit tcpmss_mt_exit(void)
+{	/* Module unload: remove the entries registered by tcpmss_mt_init(). */
+	xt_unregister_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg));
+}
+
+module_init(tcpmss_mt_init);
+module_exit(tcpmss_mt_exit);
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
new file mode 100644
index 00000000..c14d4645
--- /dev/null
+++ b/net/netfilter/xt_tcpudp.c
@@ -0,0 +1,234 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/types.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
+MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xt_tcp");
+MODULE_ALIAS("xt_udp");
+MODULE_ALIAS("ipt_udp");
+MODULE_ALIAS("ipt_tcp");
+MODULE_ALIAS("ip6t_udp");
+MODULE_ALIAS("ip6t_tcp");
+
+/*
+ * Test @port against the inclusive range [@min, @max].
+ * The range result is flipped when @invert is set.
+ */
+static inline bool
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert)
+{
+	bool in_range = min <= port && port <= max;
+
+	return in_range != invert;
+}
+
+/*
+ * Scan the TCP option area of @skb for an option of kind @option.
+ *
+ * @protoff is the offset of the TCP header inside @skb and @optlen the number
+ * of option bytes following the fixed header.  Returns !@invert when the
+ * option is present and @invert when it is absent (including the case of no
+ * options at all).  If the option bytes cannot be fetched from the packet,
+ * *@hotdrop is set and false is returned.
+ */
+static bool
+tcp_find_option(u_int8_t option,
+		const struct sk_buff *skb,
+		unsigned int protoff,
+		unsigned int optlen,
+		bool invert,
+		bool *hotdrop)
+{
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	const u_int8_t *op;
+	u_int8_t _opt[60 - sizeof(struct tcphdr)];
+	unsigned int i;
+
+	pr_debug("finding option\n");
+
+	if (!optlen)
+		return invert;
+
+	/* If we don't have the whole header, drop packet. */
+	op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr),
+				optlen, _opt);
+	if (op == NULL) {
+		*hotdrop = true;
+		return false;
+	}
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option)
+			return !invert;
+		/*
+		 * Kinds below 2 occupy a single byte.  A longer kind sitting
+		 * in the very last byte has no length octet; step over it
+		 * instead of reading op[optlen], which is outside the area
+		 * that was fetched above.
+		 */
+		if (op[i] < 2 || i + 1 >= optlen)
+			i++;
+		else
+			i += op[i + 1] ? : 1; /* zero length must not stall */
+	}
+
+	return invert;
+}
+
+/*
+ * Match routine for the "tcp" match.
+ *
+ * Non-first fragments never match; a fragment at offset 1 is additionally
+ * hot-dropped.  For unfragmented packets the source port range, destination
+ * port range, header byte 13 under flg_mask/flg_cmp and, if requested, the
+ * presence of a TCP option are tested in that order, each honouring its
+ * inversion flag.  The packet is hot-dropped when the TCP header cannot be
+ * read or when doff is smaller than the fixed header.
+ */
+static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct tcphdr *th;
+	struct tcphdr _tcph;
+	const struct xt_tcp *tcpinfo = par->matchinfo;
+
+	if (par->fragoff != 0) {
+		/* To quote Alan:
+
+		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
+		   causes this. Its a cracker trying to break in by doing a
+		   flag overwrite to pass the direction checks.
+		*/
+		if (par->fragoff == 1) {
+			pr_debug("Dropping evil TCP offset=1 frag.\n");
+			par->hotdrop = true;
+		}
+		/* Must not be a fragment. */
+		return false;
+	}
+
+/* XOR a test result with the rule's inversion bit for that test. */
+#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg)))
+
+	th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		/* We've been asked to examine this packet, and we
+		   can't.  Hence, no choice but to drop. */
+		pr_debug("Dropping evil TCP offset=0 tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+			ntohs(th->source),
+			!!(tcpinfo->invflags & XT_TCP_INV_SRCPT)))
+		return false;
+	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+			ntohs(th->dest),
+			!!(tcpinfo->invflags & XT_TCP_INV_DSTPT)))
+		return false;
+	/* Byte 13 of the header is presumably the TCP flags octet. */
+	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
+		      == tcpinfo->flg_cmp,
+		      XT_TCP_INV_FLAGS))
+		return false;
+	if (tcpinfo->option) {
+		/* A data offset shorter than the fixed header is malformed. */
+		if (th->doff * 4 < sizeof(_tcph)) {
+			par->hotdrop = true;
+			return false;
+		}
+		if (!tcp_find_option(tcpinfo->option, skb, par->thoff,
+				     th->doff*4 - sizeof(_tcph),
+				     tcpinfo->invflags & XT_TCP_INV_OPTION,
+				     &par->hotdrop))
+			return false;
+	}
+	return true;
+}
+
+/* checkentry hook: reject rules whose invflags carry bits outside XT_TCP_INV_MASK. */
+static int tcp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_tcp *tcpinfo = par->matchinfo;
+
+	if (tcpinfo->invflags & ~XT_TCP_INV_MASK)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Match routine shared by the "udp" and "udplite" matches: compares the
+ * source and destination ports of an unfragmented packet against the
+ * configured ranges.  A packet whose UDP header cannot be read is
+ * hot-dropped.
+ */
+static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_udp *udpinfo = par->matchinfo;
+	struct udphdr _udph;
+	const struct udphdr *uh;
+
+	/* Only the first fragment carries the header we look at. */
+	if (par->fragoff != 0)
+		return false;
+
+	uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph);
+	if (!uh) {
+		/* Asked to inspect the packet but unable to: drop it. */
+		pr_debug("Dropping evil UDP tinygram.\n");
+		par->hotdrop = true;
+		return false;
+	}
+
+	if (!port_match(udpinfo->spts[0], udpinfo->spts[1], ntohs(uh->source),
+			!!(udpinfo->invflags & XT_UDP_INV_SRCPT)))
+		return false;
+
+	return port_match(udpinfo->dpts[0], udpinfo->dpts[1], ntohs(uh->dest),
+			  !!(udpinfo->invflags & XT_UDP_INV_DSTPT));
+}
+
+/* checkentry hook: reject rules whose invflags carry bits outside XT_UDP_INV_MASK. */
+static int udp_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_udp *udpinfo = par->matchinfo;
+
+	if (udpinfo->invflags & ~XT_UDP_INV_MASK)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Registration table: "tcp", "udp" and "udplite", each for IPv4 and IPv6.
+ * The udplite entries reuse the udp match and check routines.
+ */
+static struct xt_match tcpudp_mt_reg[] __read_mostly = {
+	{
+		.family		= NFPROTO_IPV4,
+		.proto		= IPPROTO_TCP,
+		.name		= "tcp",
+		.matchsize	= sizeof(struct xt_tcp),
+		.match		= tcp_mt,
+		.checkentry	= tcp_mt_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.family		= NFPROTO_IPV6,
+		.proto		= IPPROTO_TCP,
+		.name		= "tcp",
+		.matchsize	= sizeof(struct xt_tcp),
+		.match		= tcp_mt,
+		.checkentry	= tcp_mt_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.family		= NFPROTO_IPV4,
+		.proto		= IPPROTO_UDP,
+		.name		= "udp",
+		.matchsize	= sizeof(struct xt_udp),
+		.match		= udp_mt,
+		.checkentry	= udp_mt_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.family		= NFPROTO_IPV6,
+		.proto		= IPPROTO_UDP,
+		.name		= "udp",
+		.matchsize	= sizeof(struct xt_udp),
+		.match		= udp_mt,
+		.checkentry	= udp_mt_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.family		= NFPROTO_IPV4,
+		.proto		= IPPROTO_UDPLITE,
+		.name		= "udplite",
+		.matchsize	= sizeof(struct xt_udp),
+		.match		= udp_mt,
+		.checkentry	= udp_mt_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.family		= NFPROTO_IPV6,
+		.proto		= IPPROTO_UDPLITE,
+		.name		= "udplite",
+		.matchsize	= sizeof(struct xt_udp),
+		.match		= udp_mt,
+		.checkentry	= udp_mt_check,
+		.me		= THIS_MODULE,
+	},
+};
+
+/* Module entry point: register every element of tcpudp_mt_reg[]. */
+static int __init tcpudp_mt_init(void)
+{
+	return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg));
+}
+
+/* Module exit point: unregister every element of tcpudp_mt_reg[]. */
+static void __exit tcpudp_mt_exit(void)
+{
+	xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg));
+}
+
+module_init(tcpudp_mt_init);
+module_exit(tcpudp_mt_exit);
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
new file mode 100644
index 00000000..c48975ff
--- /dev/null
+++ b/net/netfilter/xt_time.c
@@ -0,0 +1,269 @@
+/*
+ *	xt_time
+ *	Copyright © CC Computer Consultants GmbH, 2007
+ *
+ *	based on ipt_time by Fabrice MARIE <fabrice@netfilter.org>
+ *	This is a module which is used for time matching
+ *	It is using some modified code from dietlibc (localtime() function)
+ *	that you can find at http://www.fefe.de/dietlibc/
+ *	This file is distributed under the terms of the GNU General Public
+ *	License (GPL). Copies of the GPL can be obtained from gnu.org/gpl.
+ */
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_time.h>
+
+/*
+ * Broken-down time as filled in step by step by localtime_1() (hour,
+ * minute, second), localtime_2() (dse, weekday) and localtime_3() (month,
+ * monthday).
+ */
+struct xtm {
+	u_int8_t month;    /* (1-12) */
+	u_int8_t monthday; /* (1-31) */
+	u_int8_t weekday;  /* (1-7) */
+	u_int8_t hour;     /* (0-23) */
+	u_int8_t minute;   /* (0-59) */
+	u_int8_t second;   /* (0-59) */
+	unsigned int dse;  /* whole days since the epoch (time / 86400) */
+};
+
+extern struct timezone sys_tz; /* ouch */
+
+/* Zero-based day of the year on which each month starts, non-leap year. */
+static const u_int16_t days_since_year[] = {
+	0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334,
+};
+
+/* Same table for a leap year (one extra day from March onwards). */
+static const u_int16_t days_since_leapyear[] = {
+	0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335,
+};
+
+/*
+ * Since time progresses forward, it is best to organize this array in reverse,
+ * to minimize lookup time.
+ *
+ * Entry [i] holds the number of days between 1970-01-01 and January 1st of
+ * year (DSE_FIRST - i); localtime_3() walks it from index 0 downwards in time.
+ */
+enum {
+	DSE_FIRST = 2039,	/* year described by days_since_epoch[0] */
+};
+static const u_int16_t days_since_epoch[] = {
+	/* 2039 - 2030 */
+	25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915,
+	/* 2029 - 2020 */
+	21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262,
+	/* 2019 - 2010 */
+	17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610,
+	/* 2009 - 2000 */
+	14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957,
+	/* 1999 - 1990 */
+	10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305,
+	/* 1989 - 1980 */
+	6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652,
+	/* 1979 - 1970 */
+	3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0,
+};
+
+/* Gregorian leap-year test for calendar year @y. */
+static inline bool is_leap(unsigned int y)
+{
+	if (y % 400 == 0)
+		return true;
+	if (y % 100 == 0)
+		return false;
+	return (y & 3) == 0;
+}
+
+/*
+ * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp.
+ * Since we match against days and daytime, the SSTE value needs to be
+ * computed back into human-readable dates.
+ *
+ * This is done in three separate functions so that the most expensive
+ * calculations are done last, in case a "simple match" can be found earlier.
+ */
+/*
+ * Stage one: fill in r->hour, r->minute and r->second from @time and
+ * return the number of seconds elapsed since midnight.
+ */
+static inline unsigned int localtime_1(struct xtm *r, time_t time)
+{
+	/* A day has 86400 seconds; the remainder is the time of day. */
+	unsigned int secs_of_day = time % 86400;
+	unsigned int mins_of_day = secs_of_day / 60;
+
+	r->hour   = mins_of_day / 60;
+	r->minute = mins_of_day % 60;
+	r->second = secs_of_day % 60;
+	return secs_of_day;
+}
+
+/*
+ * Stage two: store the number of whole days since the epoch in r->dse and
+ * derive r->weekday (1 = Monday ... 7 = Sunday) from it.
+ */
+static inline void localtime_2(struct xtm *r, time_t time)
+{
+	unsigned int days = time / 86400;
+
+	r->dse = days;
+
+	/*
+	 * Day 0 (1970-01-01) was a Thursday, i.e. weekday 4.  Working on a
+	 * zero-based value (4 - 1 = 3) and adding one afterwards makes
+	 * Sunday come out as 7 rather than 0.
+	 */
+	r->weekday = (days + 3) % 7 + 1;
+}
+
+/*
+ * Stage three: derive r->month and r->monthday from r->dse, which must have
+ * been filled in by localtime_2() beforehand.  @time itself is not used.
+ */
+static void localtime_3(struct xtm *r, time_t time)
+{
+	unsigned int year, i, w = r->dse;
+
+	/*
+	 * In each year, a certain number of days-since-the-epoch have passed.
+	 * Find the year that is closest to said days.
+	 *
+	 * Consider, for example, w=21612 (2029-03-04). Loop will abort on
+	 * dse[i] <= w, which happens when dse[i] == 21550. This implies
+	 * year == 2029. w will then be 62.
+	 */
+	for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w;
+	    ++i, --year)
+		/* just loop */;
+
+	w -= days_since_epoch[i];
+
+	/*
+	 * By now we have the current year, and the day of the year.
+	 * r->yearday = w;
+	 *
+	 * On to finding the month (like above). In each month, a certain
+	 * number of days-since-New Year have passed, and find the closest
+	 * one.
+	 *
+	 * Consider w=62 (in a non-leap year). Loop will abort on
+	 * dsy[i] <= w, which happens when dsy[i] == 31+28 (i == 2).
+	 * Concludes i == 2, i.e. 3rd month => March.
+	 *
+	 * (A different approach to use would be to subtract a monthlength
+	 * from w repeatedly while counting.)
+	 */
+	if (is_leap(year)) {
+		/* use days_since_leapyear[] in a leap year */
+		for (i = ARRAY_SIZE(days_since_leapyear) - 1;
+		    i > 0 && days_since_leapyear[i] > w; --i)
+			/* just loop */;
+		r->monthday = w - days_since_leapyear[i] + 1;
+	} else {
+		for (i = ARRAY_SIZE(days_since_year) - 1;
+		    i > 0 && days_since_year[i] > w; --i)
+			/* just loop */;
+		r->monthday = w - days_since_year[i] + 1;
+	}
+
+	r->month    = i + 1;
+}
+
+/*
+ * Match routine for the "time" match.
+ *
+ * Takes the packet timestamp (stamping the skb first if it carries none),
+ * optionally shifts it by the kernel timezone, and then checks it against
+ * the rule's date range, daytime range, weekday mask and monthday mask.
+ * The cheaper checks run first so that the calendar computation in
+ * localtime_3() is only reached when everything else already matched.
+ */
+static bool
+time_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_time_info *info = par->matchinfo;
+	unsigned int packet_time;
+	struct xtm current_time;
+	s64 stamp;
+
+	/*
+	 * We cannot use get_seconds() instead of __net_timestamp() here.
+	 * Suppose you have two rules:
+	 * 	1. match before 13:00
+	 * 	2. match after 13:00
+	 * If you match against processing time (get_seconds) it
+	 * may happen that the same packet matches both rules if
+	 * it arrived at the right moment before 13:00.
+	 */
+	if (skb->tstamp.tv64 == 0)
+		__net_timestamp((struct sk_buff *)skb);
+
+	stamp = ktime_to_ns(skb->tstamp);
+	stamp = div_s64(stamp, NSEC_PER_SEC);
+
+	if (info->flags & XT_TIME_LOCAL_TZ)
+		/* Adjust for local timezone */
+		stamp -= 60 * sys_tz.tz_minuteswest;
+
+	/*
+	 * xt_time will match when _all_ of the following hold:
+	 *   - 'now' is in the global time range date_start..date_end
+	 *   - 'now' is in the monthday mask
+	 *   - 'now' is in the weekday mask
+	 *   - 'now' is in the daytime range time_start..time_end
+	 * (and by default, libxt_time will set these so as to match)
+	 */
+
+	if (stamp < info->date_start || stamp > info->date_stop)
+		return false;
+
+	packet_time = localtime_1(&current_time, stamp);
+
+	if (info->daytime_start < info->daytime_stop) {
+		/* Ordinary window inside a single day. */
+		if (packet_time < info->daytime_start ||
+		    packet_time > info->daytime_stop)
+			return false;
+	} else {
+		/* Window wraps past midnight: only the gap is excluded. */
+		if (packet_time < info->daytime_start &&
+		    packet_time > info->daytime_stop)
+			return false;
+	}
+
+	localtime_2(&current_time, stamp);
+
+	if (!(info->weekdays_match & (1U << current_time.weekday)))
+		return false;
+
+	/* Do not spend time computing monthday if all days match anyway */
+	if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) {
+		localtime_3(&current_time, stamp);
+		/*
+		 * monthday can be 31; shift an unsigned one so that bit 31
+		 * is reached without overflowing a signed int.
+		 */
+		if (!(info->monthdays_match & (1U << current_time.monthday)))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * checkentry hook: both daytime bounds must not exceed XT_TIME_MAX_DAYTIME,
+ * otherwise the rule is refused with -EDOM.
+ */
+static int time_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_time_info *info = par->matchinfo;
+	bool start_ok = info->daytime_start <= XT_TIME_MAX_DAYTIME;
+	bool stop_ok = info->daytime_stop <= XT_TIME_MAX_DAYTIME;
+
+	if (start_ok && stop_ok)
+		return 0;
+
+	pr_info("invalid argument - start or "
+		"stop time greater than 23:59:59\n");
+	return -EDOM;
+}
+
+/* Single registration with NFPROTO_UNSPEC as the family. */
+static struct xt_match xt_time_mt_reg __read_mostly = {
+	.family     = NFPROTO_UNSPEC,
+	.name       = "time",
+	.matchsize  = sizeof(struct xt_time_info),
+	.checkentry = time_mt_check,
+	.match      = time_mt,
+	.me         = THIS_MODULE,
+};
+
+/*
+ * Module entry point: log the kernel timezone offset as +HHMM/-HHMM and
+ * register the match.
+ */
+static int __init time_mt_init(void)
+{
+	int minutes = sys_tz.tz_minuteswest;
+
+	if (minutes >= 0) /* on or west of Greenwich */
+		printk(KERN_INFO KBUILD_MODNAME
+		       ": kernel timezone is -%02d%02d\n",
+		       minutes / 60, minutes % 60);
+	else /* east of Greenwich */
+		printk(KERN_INFO KBUILD_MODNAME
+		       ": kernel timezone is +%02d%02d\n",
+		       -minutes / 60, -minutes % 60);
+
+	return xt_register_match(&xt_time_mt_reg);
+}
+
+/* Module exit point: unregister the "time" match. */
+static void __exit time_mt_exit(void)
+{
+	xt_unregister_match(&xt_time_mt_reg);
+}
+
+module_init(time_mt_init);
+module_exit(time_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: time-based matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_time");
+MODULE_ALIAS("ip6t_time");
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
new file mode 100644
index 00000000..a95b5034
--- /dev/null
+++ b/net/netfilter/xt_u32.c
@@ -0,0 +1,123 @@
+/*
+ *	xt_u32 - kernel module to match u32 packet content
+ *
+ *	Original author: Don Cohen <don@isis.cs3-inc.com>
+ *	(C) CC Computer Consultants GmbH, 2007
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_u32.h>
+
+/*
+ * Evaluate every test of @data against @skb.
+ *
+ * Each test reads a big-endian 32-bit word from the packet, transforms it
+ * through a chain of AND / shift / "@" (indirect re-read) operations and
+ * finally requires the result to fall in at least one of the test's
+ * [min, max] ranges.  Returns true only if all tests pass; any read that
+ * would fall outside skb->len makes the whole match fail.
+ *
+ * NOTE(review): ntests, nnums and nvalues are used as loop bounds without
+ * being range-checked in this function, and shift counts are taken from
+ * the rule as-is.
+ */
+static bool u32_match_it(const struct xt_u32 *data,
+			 const struct sk_buff *skb)
+{
+	const struct xt_u32_test *ct;
+	unsigned int testind;
+	unsigned int nnums;
+	unsigned int nvals;
+	unsigned int i;
+	__be32 n;
+	u_int32_t pos;
+	u_int32_t val;
+	u_int32_t at;
+
+	/*
+	 * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17"
+	 * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands.
+	 */
+	for (testind = 0; testind < data->ntests; ++testind) {
+		ct  = &data->tests[testind];
+		at  = 0;
+		pos = ct->location[0].number;
+
+		/* The initial 4-byte read must lie entirely inside the skb. */
+		if (skb->len < 4 || pos > skb->len - 4)
+			return false;
+
+		/* Bounds were checked above, so a failing copy is a bug. */
+		if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0)
+			BUG();
+		val   = ntohl(n);
+		nnums = ct->nnums;
+
+		/* Inner loop runs over "&", "<<", ">>" and "@" operands */
+		for (i = 1; i < nnums; ++i) {
+			u_int32_t number = ct->location[i].number;
+			switch (ct->location[i].nextop) {
+			case XT_U32_AND:
+				val &= number;
+				break;
+			case XT_U32_LEFTSH:
+				val <<= number;
+				break;
+			case XT_U32_RIGHTSH:
+				val >>= number;
+				break;
+			case XT_U32_AT:
+				/* Accumulate the base offset, refusing wraparound. */
+				if (at + val < at)
+					return false;
+				at += val;
+				pos = number;
+				/* Overflow-safe check that at + pos + 4 <= skb->len. */
+				if (at + 4 < at || skb->len < at + 4 ||
+				    pos > skb->len - at - 4)
+					return false;
+
+				if (skb_copy_bits(skb, at + pos, &n,
+						    sizeof(n)) < 0)
+					BUG();
+				val = ntohl(n);
+				break;
+			}
+		}
+
+		/* Run over the "," and ":" operands */
+		nvals = ct->nvalues;
+		for (i = 0; i < nvals; ++i)
+			if (ct->value[i].min <= val && val <= ct->value[i].max)
+				break;
+
+		/* No range accepted the value: this test, and the match, fail. */
+		if (i >= ct->nvalues)
+			return false;
+	}
+
+	return true;
+}
+
+/* Match routine: result of u32_match_it(), flipped when the rule is inverted. */
+static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_u32 *data = par->matchinfo;
+
+	return u32_match_it(data, skb) ^ data->invert;
+}
+
+/*
+ * checkentry hook: the element counts in the rule come from userspace and
+ * are later used by u32_match_it() as loop bounds over fixed-size arrays.
+ * Refuse any rule whose counts exceed the capacity of those arrays.
+ */
+static int u32_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_u32 *data = par->matchinfo;
+	const struct xt_u32_test *ct;
+	unsigned int i;
+
+	if (data->ntests > ARRAY_SIZE(data->tests))
+		return -EINVAL;
+
+	for (i = 0; i < data->ntests; ++i) {
+		ct = &data->tests[i];
+
+		if (ct->nnums > ARRAY_SIZE(ct->location) ||
+		    ct->nvalues > ARRAY_SIZE(ct->value))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Single registration with NFPROTO_UNSPEC as the family. */
+static struct xt_match xt_u32_mt_reg __read_mostly = {
+	.name       = "u32",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = u32_mt_check,
+	.match      = u32_mt,
+	.matchsize  = sizeof(struct xt_u32),
+	.me         = THIS_MODULE,
+};
+
+/* Module entry point: register the "u32" match. */
+static int __init u32_mt_init(void)
+{
+	return xt_register_match(&xt_u32_mt_reg);
+}
+
+/* Module exit point: unregister the "u32" match. */
+static void __exit u32_mt_exit(void)
+{
+	xt_unregister_match(&xt_u32_mt_reg);
+}
+
+module_init(u32_mt_init);
+module_exit(u32_mt_exit);
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: arbitrary byte matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_u32");
+MODULE_ALIAS("ip6t_u32");
diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig
new file mode 100644
index 00000000..56958c85
--- /dev/null
+++ b/net/netlabel/Kconfig
@@ -0,0 +1,17 @@
+#
+# NetLabel configuration
+#
+
+config NETLABEL
+	bool "NetLabel subsystem support"
+	depends on SECURITY
+	default n
+	---help---
+	  NetLabel provides support for explicit network packet labeling
+	  protocols such as CIPSO and RIPSO.  For more information see
+	  Documentation/netlabel as well as the NetLabel SourceForge project
+	  for configuration tools and additional documentation.
+
+	   * http://netlabel.sf.net
+
+	  If you are unsure, say N.
diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile
new file mode 100644
index 00000000..ea750e9d
--- /dev/null
+++ b/net/netlabel/Makefile
@@ -0,0 +1,17 @@
+#
+# Makefile for the NetLabel subsystem.
+#
+# Feb 9, 2006, Paul Moore <paul.moore@hp.com>
+#
+
+# base objects
+obj-y	:= netlabel_user.o netlabel_kapi.o
+obj-y	+= netlabel_domainhash.o netlabel_addrlist.o
+
+# management objects
+obj-y	+= netlabel_mgmt.o
+
+# protocol modules
+obj-y	+= netlabel_unlabeled.o
+obj-y	+= netlabel_cipso_v4.o
+
diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c
new file mode 100644
index 00000000..c0519139
--- /dev/null
+++ b/net/netlabel/netlabel_addrlist.c
@@ -0,0 +1,384 @@
+/*
+ * NetLabel Network Address Lists
+ *
+ * This file contains network address list functions used to manage ordered
+ * lists of network addresses for use by the NetLabel subsystem.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/audit.h>
+
+#include "netlabel_addrlist.h"
+
+/*
+ * Address List Functions
+ */
+
+/**
+ * netlbl_af4list_search - Search for a matching IPv4 address entry
+ * @addr: IPv4 address
+ * @head: the list head
+ *
+ * Description:
+ * Walks the IPv4 address list given by @head and returns the first valid
+ * entry whose network (entry address under entry mask) contains @addr, or
+ * NULL if there is none.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
+					     struct list_head *head)
+{
+	struct netlbl_af4list *iter;
+
+	list_for_each_entry_rcu(iter, head, list) {
+		if (!iter->valid)
+			continue;
+		if ((addr & iter->mask) == iter->addr)
+			return iter;
+	}
+
+	return NULL;
+}
+
+/**
+ * netlbl_af4list_search_exact - Search for an exact IPv4 address entry
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @head: the list head
+ *
+ * Description:
+ * Walks the IPv4 address list given by @head.  If a valid entry with exactly
+ * this @addr and @mask is found it is returned, otherwise NULL is returned.
+ * The caller is responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
+						   __be32 mask,
+						   struct list_head *head)
+{
+	struct netlbl_af4list *iter;
+
+	list_for_each_entry_rcu(iter, head, list) {
+		if (!iter->valid)
+			continue;
+		if (iter->addr == addr && iter->mask == mask)
+			return iter;
+	}
+
+	return NULL;
+}
+
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_search - Search for a matching IPv6 address entry
+ * @addr: IPv6 address
+ * @head: the list head
+ *
+ * Description:
+ * Walks the IPv6 address list given by @head and returns the first valid
+ * entry for which the masked comparison against @addr succeeds, or NULL if
+ * there is none.  The caller is responsible for calling the
+ * rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
+					     struct list_head *head)
+{
+	struct netlbl_af6list *iter;
+
+	list_for_each_entry_rcu(iter, head, list) {
+		if (!iter->valid)
+			continue;
+		if (!ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr))
+			return iter;
+	}
+
+	return NULL;
+}
+
+/**
+ * netlbl_af6list_search_exact - Search for an exact IPv6 address entry
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @head: the list head
+ *
+ * Description:
+ * Walks the IPv6 address list given by @head.  If a valid entry with exactly
+ * this @addr and @mask is found it is returned, otherwise NULL is returned.
+ * The caller is responsible for calling the rcu_read_[un]lock() functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
+						   const struct in6_addr *mask,
+						   struct list_head *head)
+{
+	struct netlbl_af6list *iter;
+
+	list_for_each_entry_rcu(iter, head, list) {
+		if (!iter->valid)
+			continue;
+		if (ipv6_addr_equal(&iter->addr, addr) &&
+		    ipv6_addr_equal(&iter->mask, mask))
+			return iter;
+	}
+
+	return NULL;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_af4list_add - Add a new IPv4 address entry to a list
+ * @entry: address entry
+ * @head: the list head
+ *
+ * Description:
+ * Add a new address entry to the list pointed to by @head.  On success zero is
+ * returned; -EEXIST is returned if a valid entry with the same address and
+ * mask is already on the list.  The caller is responsible for calling the
+ * necessary locking functions.
+ *
+ */
+int netlbl_af4list_add(struct netlbl_af4list *entry, struct list_head *head)
+{
+	struct netlbl_af4list *iter;
+
+	/* a first-match lookup stops at the most specific entry covering the
+	 * address, which can hide an identical addr/mask pair further down
+	 * the list, so look for the exact pair instead */
+	if (netlbl_af4list_search_exact(entry->addr, entry->mask, head) != NULL)
+		return -EEXIST;
+
+	/* in order to speed up address searches through the list (the common
+	 * case) we need to keep the list in order based on the size of the
+	 * address mask such that the entry with the numerically largest mask
+	 * appears first in the list: insert in front of the first valid
+	 * entry whose mask is smaller than ours */
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid &&
+		    ntohl(entry->mask) > ntohl(iter->mask)) {
+			__list_add_rcu(&entry->list,
+				       iter->list.prev,
+				       &iter->list);
+			return 0;
+		}
+	list_add_tail_rcu(&entry->list, head);
+	return 0;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_add - Add a new IPv6 address entry to a list
+ * @entry: address entry
+ * @head: the list head
+ *
+ * Description:
+ * Add a new address entry to the list pointed to by @head.  On success zero is
+ * returned; -EEXIST is returned if a valid entry with the same address and
+ * mask is already on the list.  The caller is responsible for calling the
+ * necessary locking functions.
+ *
+ */
+int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head)
+{
+	struct netlbl_af6list *iter;
+
+	/* a first-match lookup stops at the most specific entry covering the
+	 * address, which can hide an identical addr/mask pair further down
+	 * the list, so look for the exact pair instead */
+	if (netlbl_af6list_search_exact(&entry->addr, &entry->mask,
+					head) != NULL)
+		return -EEXIST;
+
+	/* in order to speed up address searches through the list (the common
+	 * case) we need to keep the list in order based on the size of the
+	 * address mask such that the entry with the numerically largest mask
+	 * appears first in the list: insert in front of the first valid
+	 * entry whose mask compares smaller than ours */
+	list_for_each_entry_rcu(iter, head, list)
+		if (iter->valid &&
+		    ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) {
+			__list_add_rcu(&entry->list,
+				       iter->list.prev,
+				       &iter->list);
+			return 0;
+		}
+	list_add_tail_rcu(&entry->list, head);
+	return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_af4list_remove_entry - Remove an IPv4 address entry
+ * @entry: address entry
+ *
+ * Description:
+ * Remove the specified IP address entry: the entry is marked invalid, so the
+ * search functions skip it, and is then unlinked from its list.  The caller
+ * is responsible for calling the necessary locking functions.
+ *
+ */
+void netlbl_af4list_remove_entry(struct netlbl_af4list *entry)
+{
+	entry->valid = 0;
+	list_del_rcu(&entry->list);
+}
+
+/**
+ * netlbl_af4list_remove - Remove an IPv4 address entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @head: the list head
+ *
+ * Description:
+ * Look up the entry with exactly this @addr and @mask on the list pointed to
+ * by @head and remove it.  Returns the removed entry on success, NULL if no
+ * such entry exists.  The caller is responsible for calling the necessary
+ * locking functions.
+ *
+ */
+struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
+					     struct list_head *head)
+{
+	struct netlbl_af4list *entry =
+		netlbl_af4list_search_exact(addr, mask, head);
+
+	if (entry != NULL)
+		netlbl_af4list_remove_entry(entry);
+	return entry;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_remove_entry - Remove an IPv6 address entry
+ * @entry: address entry
+ *
+ * Description:
+ * Remove the specified IP address entry: the entry is marked invalid, so the
+ * search functions skip it, and is then unlinked from its list.  The caller
+ * is responsible for calling the necessary locking functions.
+ *
+ */
+void netlbl_af6list_remove_entry(struct netlbl_af6list *entry)
+{
+	entry->valid = 0;
+	list_del_rcu(&entry->list);
+}
+
+/**
+ * netlbl_af6list_remove - Remove an IPv6 address entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @head: the list head
+ *
+ * Description:
+ * Look up the entry with exactly this @addr and @mask on the list pointed to
+ * by @head and remove it.  Returns the removed entry on success, NULL if no
+ * such entry exists.  The caller is responsible for calling the necessary
+ * locking functions.
+ *
+ */
+struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
+					     const struct in6_addr *mask,
+					     struct list_head *head)
+{
+	struct netlbl_af6list *entry =
+		netlbl_af6list_search_exact(addr, mask, head);
+
+	if (entry != NULL)
+		netlbl_af6list_remove_entry(entry);
+	return entry;
+}
+#endif /* IPv6 */
+
+/*
+ * Audit Helper Functions
+ */
+
+#ifdef CONFIG_AUDIT
+/**
+ * netlbl_af4list_audit_addr - Audit an IPv4 address
+ * @audit_buf: audit buffer
+ * @src: true if source address, false if destination
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the interface name (when @dev is not NULL) and the IPv4 address to
+ * @audit_buf.  A prefix length is appended unless @mask is all ones.
+ *
+ */
+void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+					int src, const char *dev,
+					__be32 addr, __be32 mask)
+{
+	const char *dir = src ? "src" : "dst";
+	u32 bits = ntohl(mask);
+	u32 prefix_len;
+
+	if (dev != NULL)
+		audit_log_format(audit_buf, " netif=%s", dev);
+	audit_log_format(audit_buf, " %s=%pI4", dir, &addr);
+
+	/* a full host mask needs no prefix length */
+	if (bits == 0xffffffff)
+		return;
+
+	/* shift the mask left until no set bit remains, counting the steps */
+	for (prefix_len = 0; bits != 0; bits <<= 1)
+		prefix_len++;
+	audit_log_format(audit_buf, " %s_prefixlen=%d", dir, prefix_len);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_af6list_audit_addr - Audit an IPv6 address
+ * @audit_buf: audit buffer
+ * @src: true if source address, false if destination
+ * @dev: network interface
+ * @addr: IP address
+ * @mask: IP address mask
+ *
+ * Description:
+ * Write the interface name (when @dev is not NULL) and the IPv6 address to
+ * @audit_buf.  A prefix length is appended unless the last 32-bit word of
+ * @mask is all ones.
+ *
+ */
+void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+				 int src,
+				 const char *dev,
+				 const struct in6_addr *addr,
+				 const struct in6_addr *mask)
+{
+	char *dir = (src ? "src" : "dst");
+
+	if (dev != NULL)
+		audit_log_format(audit_buf, " netif=%s", dev);
+	audit_log_format(audit_buf, " %s=%pI6", dir, addr);
+	if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
+		u32 mask_len = 0;
+		u32 mask_val;
+		int iter = -1;
+		/* count 32 bits per leading all-ones word; the guard above
+		 * ensures the scan stops at word 3 at the latest */
+		while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff)
+			mask_len += 32;
+		/* then add the bits of the first word that is not all ones */
+		mask_val = ntohl(mask->s6_addr32[iter]);
+		while (mask_val > 0) {
+			mask_val <<= 1;
+			mask_len++;
+		}
+		audit_log_format(audit_buf, " %s_prefixlen=%d", dir, mask_len);
+	}
+}
+#endif /* IPv6 */
+#endif /* CONFIG_AUDIT */
diff --git a/net/netlabel/netlabel_addrlist.h b/net/netlabel/netlabel_addrlist.h
new file mode 100644
index 00000000..2b9644e1
--- /dev/null
+++ b/net/netlabel/netlabel_addrlist.h
@@ -0,0 +1,209 @@
+/*
+ * NetLabel Network Address Lists
+ *
+ * This file contains network address list functions used to manage ordered
+ * lists of network addresses for use by the NetLabel subsystem.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_ADDRLIST_H
+#define _NETLABEL_ADDRLIST_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/in6.h>
+#include <linux/audit.h>
+
+/**
+ * struct netlbl_af4list - NetLabel IPv4 address list
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @valid: valid flag; the list iterators in this file skip entries with zero
+ * @list: list structure, used internally
+ */
+struct netlbl_af4list {
+	__be32 addr;
+	__be32 mask;
+
+	u32 valid;
+	struct list_head list;
+};
+
+/**
+ * struct netlbl_af6list - NetLabel IPv6 address list
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @valid: valid flag; the list iterators in this file skip entries with zero
+ * @list: list structure, used internally
+ */
+struct netlbl_af6list {
+	struct in6_addr addr;
+	struct in6_addr mask;
+
+	u32 valid;
+	struct list_head list;
+};
+
+#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list)
+
+/* From @s, return the first valid entry, or the container of @h if none. */
+static inline struct netlbl_af4list *__af4list_valid(struct list_head *s,
+						     struct list_head *h)
+{
+	struct list_head *pos;
+
+	for (pos = s; pos != h; pos = pos->next)
+		if (__af4list_entry(pos)->valid)
+			break;
+	return __af4list_entry(pos);
+}
+
+/* As __af4list_valid(), but follows ->next through rcu_dereference(). */
+static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s,
+							 struct list_head *h)
+{
+	struct list_head *pos;
+
+	for (pos = s; pos != h; pos = rcu_dereference(pos->next))
+		if (__af4list_entry(pos)->valid)
+			break;
+	return __af4list_entry(pos);
+}
+
+#define netlbl_af4list_foreach(iter, head)				\
+	for (iter = __af4list_valid((head)->next, head);		\
+	     &iter->list != (head);					\
+	     iter = __af4list_valid(iter->list.next, head))
+/* As netlbl_af4list_foreach(), but steps with __af4list_valid_rcu(). */
+#define netlbl_af4list_foreach_rcu(iter, head)				\
+	for (iter = __af4list_valid_rcu((head)->next, head);		\
+	     &iter->list != (head);					\
+	     iter = __af4list_valid_rcu(iter->list.next, head))
+/* @tmp holds the valid entry after @iter, looked up before the body runs. */
+#define netlbl_af4list_foreach_safe(iter, tmp, head)			\
+	for (iter = __af4list_valid((head)->next, head),		\
+		     tmp = __af4list_valid(iter->list.next, head);	\
+	     &iter->list != (head);					\
+	     iter = tmp, tmp = __af4list_valid(iter->list.next, head))
+
+int netlbl_af4list_add(struct netlbl_af4list *entry,
+		       struct list_head *head);
+struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
+					     struct list_head *head);
+void netlbl_af4list_remove_entry(struct netlbl_af4list *entry);
+struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
+					     struct list_head *head);
+struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr,
+						   __be32 mask,
+						   struct list_head *head);
+
+#ifdef CONFIG_AUDIT
+void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+			       int src, const char *dev,
+			       __be32 addr, __be32 mask);
+#else /* !CONFIG_AUDIT: empty stub so callers need no #ifdef */
+static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf,
+					     int src, const char *dev,
+					     __be32 addr, __be32 mask)
+{
+}
+#endif /* CONFIG_AUDIT */
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+#define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list)
+
+/* From @s, return the first valid entry, or the container of @h if none. */
+static inline struct netlbl_af6list *__af6list_valid(struct list_head *s,
+						     struct list_head *h)
+{
+	struct list_head *pos;
+
+	for (pos = s; pos != h; pos = pos->next)
+		if (__af6list_entry(pos)->valid)
+			break;
+	return __af6list_entry(pos);
+}
+
+/* As __af6list_valid(), but follows ->next through rcu_dereference(). */
+static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s,
+							 struct list_head *h)
+{
+	struct list_head *pos;
+
+	for (pos = s; pos != h; pos = rcu_dereference(pos->next))
+		if (__af6list_entry(pos)->valid)
+			break;
+	return __af6list_entry(pos);
+}
+
+#define netlbl_af6list_foreach(iter, head)				\
+	for (iter = __af6list_valid((head)->next, head);		\
+	     &iter->list != (head);					\
+	     iter = __af6list_valid(iter->list.next, head))
+/* As netlbl_af6list_foreach(), but steps with __af6list_valid_rcu(). */
+#define netlbl_af6list_foreach_rcu(iter, head)				\
+	for (iter = __af6list_valid_rcu((head)->next, head);		\
+	     &iter->list != (head);					\
+	     iter = __af6list_valid_rcu(iter->list.next, head))
+/* @tmp holds the valid entry after @iter, looked up before the body runs. */
+#define netlbl_af6list_foreach_safe(iter, tmp, head)			\
+	for (iter = __af6list_valid((head)->next, head),		\
+		     tmp = __af6list_valid(iter->list.next, head);	\
+	     &iter->list != (head);					\
+	     iter = tmp, tmp = __af6list_valid(iter->list.next, head))
+
+int netlbl_af6list_add(struct netlbl_af6list *entry,
+		       struct list_head *head);
+struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr,
+					     const struct in6_addr *mask,
+					     struct list_head *head);
+void netlbl_af6list_remove_entry(struct netlbl_af6list *entry);
+struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr,
+					     struct list_head *head);
+struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr,
+						   const struct in6_addr *mask,
+						   struct list_head *head);
+
+#ifdef CONFIG_AUDIT
+void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+			       int src,
+			       const char *dev,
+			       const struct in6_addr *addr,
+			       const struct in6_addr *mask);
+#else /* !CONFIG_AUDIT: empty stub so callers need no #ifdef */
+static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
+					     int src,
+					     const char *dev,
+					     const struct in6_addr *addr,
+					     const struct in6_addr *mask)
+{
+}
+#endif /* CONFIG_AUDIT */
+#endif /* IPV6 */
+
+#endif
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
new file mode 100644
index 00000000..bae5756b
--- /dev/null
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -0,0 +1,787 @@
+/*
+ * NetLabel CIPSO/IPv4 Support
+ *
+ * This file defines the CIPSO/IPv4 functions for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_cipso_v4.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_domainhash.h"
+
+/* Argument struct for cipso_v4_doi_walk(), see netlbl_cipsov4_listall_cb() */
+struct netlbl_cipsov4_doiwalk_arg {
+	struct netlink_callback *nl_cb;	/* its skb supplies the reply pid */
+	struct sk_buff *skb;		/* buffer the records are written to */
+	u32 seq;			/* sequence number for each record */
+};
+
+/* Argument struct for netlbl_domhsh_walk(), see netlbl_cipsov4_remove_cb() */
+struct netlbl_domhsh_walk_arg {
+	struct netlbl_audit *audit_info;	/* passed to the entry removal */
+	u32 doi;				/* DOI whose mappings are removed */
+};
+
+/* NetLabel Generic NETLINK CIPSOv4 family, see netlbl_cipsov4_genl_init() */
+static struct genl_family netlbl_cipsov4_gnl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_CIPSOV4_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_CIPSOV4_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy: expected payload type per attribute */
+static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
+	[NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MTYPE] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_TAG] = { .type = NLA_U8 },
+	[NLBL_CIPSOV4_A_TAGLST] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSLVLLOC] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSLVLREM] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSLVL] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSLVLLST] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSCATLOC] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSCATREM] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSCAT] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSCATLST] = { .type = NLA_NESTED },
+};
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * netlbl_cipsov4_add_common - Parse the common sections of an ADD message
+ * @info: the Generic NETLINK info block
+ * @doi_def: the CIPSO V4 DOI definition
+ *
+ * Description:
+ * Copy the DOI value and the tag list from an ADD message into @doi_def,
+ * padding unused tag slots with CIPSO_V4_TAG_INVALID.  Returns zero on
+ * success, -EINVAL if the tag list is malformed or holds too many tags.
+ */
+static int netlbl_cipsov4_add_common(struct genl_info *info,
+				     struct cipso_v4_doi *doi_def)
+{
+	struct nlattr *tag_list = info->attrs[NLBL_CIPSOV4_A_TAGLST];
+	struct nlattr *cur;
+	int remaining;
+	u32 count = 0;
+
+	doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+	if (nla_validate_nested(tag_list, NLBL_CIPSOV4_A_MAX,
+				netlbl_cipsov4_genl_policy) != 0)
+		return -EINVAL;
+
+	nla_for_each_nested(cur, tag_list, remaining) {
+		if (nla_type(cur) != NLBL_CIPSOV4_A_TAG)
+			continue;
+		if (count >= CIPSO_V4_TAG_MAXCNT)
+			return -EINVAL;
+		doi_def->tags[count++] = nla_get_u8(cur);
+	}
+	/* mark every unused slot so the end of the tag list can be found */
+	for (; count < CIPSO_V4_TAG_MAXCNT; count++)
+		doi_def->tags[count] = CIPSO_V4_TAG_INVALID;
+	return 0;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_cipsov4_add_std - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD
+ * message and add it to the CIPSO V4 engine.  Returns zero on success,
+ * -EINVAL for a malformed message, -ENOMEM on allocation failure, or the
+ * error from cipso_v4_doi_add(); the new definition is freed on any failure.
+ */
+static int netlbl_cipsov4_add_std(struct genl_info *info,
+				  struct netlbl_audit *audit_info)
+{
+	int ret_val = -EINVAL;
+	struct cipso_v4_doi *doi_def = NULL;
+	struct nlattr *nla_a;
+	struct nlattr *nla_b;
+	int nla_a_rem;
+	int nla_b_rem;
+	u32 iter;
+
+	if (!info->attrs[NLBL_CIPSOV4_A_TAGLST] ||
+	    !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST])
+		return -EINVAL;
+
+	if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+				NLBL_CIPSOV4_A_MAX,
+				netlbl_cipsov4_genl_policy) != 0)
+		return -EINVAL;
+
+	doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+	if (doi_def == NULL)
+		return -ENOMEM;
+	doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL);
+	if (doi_def->map.std == NULL) {
+		/* ->type is still unset: free directly, not via doi_free() */
+		kfree(doi_def);
+		return -ENOMEM;
+	}
+	doi_def->type = CIPSO_V4_MAP_TRANS;
+
+	ret_val = netlbl_cipsov4_add_common(info, doi_def);
+	if (ret_val != 0)
+		goto add_std_failure;
+	ret_val = -EINVAL;
+	/* pass 1: validate the level entries and size both level tables */
+	nla_for_each_nested(nla_a,
+			    info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+			    nla_a_rem)
+		if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) {
+			if (nla_validate_nested(nla_a,
+					    NLBL_CIPSOV4_A_MAX,
+					    netlbl_cipsov4_genl_policy) != 0)
+					goto add_std_failure;
+			nla_for_each_nested(nla_b, nla_a, nla_b_rem)
+				switch (nla_type(nla_b)) {
+				case NLBL_CIPSOV4_A_MLSLVLLOC:
+					if (nla_get_u32(nla_b) >
+					    CIPSO_V4_MAX_LOC_LVLS)
+						goto add_std_failure;
+					if (nla_get_u32(nla_b) >=
+					    doi_def->map.std->lvl.local_size)
+					     doi_def->map.std->lvl.local_size =
+						     nla_get_u32(nla_b) + 1;
+					break;
+				case NLBL_CIPSOV4_A_MLSLVLREM:
+					if (nla_get_u32(nla_b) >
+					    CIPSO_V4_MAX_REM_LVLS)
+						goto add_std_failure;
+					if (nla_get_u32(nla_b) >=
+					    doi_def->map.std->lvl.cipso_size)
+					     doi_def->map.std->lvl.cipso_size =
+						     nla_get_u32(nla_b) + 1;
+					break;
+				}
+		}
+	doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size,
+					      sizeof(u32),
+					      GFP_KERNEL);
+	if (doi_def->map.std->lvl.local == NULL) {
+		ret_val = -ENOMEM;
+		goto add_std_failure;
+	}
+	doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size,
+					      sizeof(u32),
+					      GFP_KERNEL);
+	if (doi_def->map.std->lvl.cipso == NULL) {
+		ret_val = -ENOMEM;
+		goto add_std_failure;
+	}
+	for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++)
+		doi_def->map.std->lvl.local[iter] = CIPSO_V4_INV_LVL;
+	for (iter = 0; iter < doi_def->map.std->lvl.cipso_size; iter++)
+		doi_def->map.std->lvl.cipso[iter] = CIPSO_V4_INV_LVL;
+	nla_for_each_nested(nla_a,
+			    info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+			    nla_a_rem)
+		if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) {
+			struct nlattr *lvl_loc;
+			struct nlattr *lvl_rem;
+
+			lvl_loc = nla_find_nested(nla_a,
+						  NLBL_CIPSOV4_A_MLSLVLLOC);
+			lvl_rem = nla_find_nested(nla_a,
+						  NLBL_CIPSOV4_A_MLSLVLREM);
+			if (lvl_loc == NULL || lvl_rem == NULL)
+				goto add_std_failure;
+			doi_def->map.std->lvl.local[nla_get_u32(lvl_loc)] =
+				nla_get_u32(lvl_rem);
+			doi_def->map.std->lvl.cipso[nla_get_u32(lvl_rem)] =
+				nla_get_u32(lvl_loc);
+		}
+	/* the category list is optional; if present it is handled likewise */
+	if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) {
+		if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+					NLBL_CIPSOV4_A_MAX,
+					netlbl_cipsov4_genl_policy) != 0)
+			goto add_std_failure;
+
+		nla_for_each_nested(nla_a,
+				    info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+				    nla_a_rem)
+			if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) {
+				if (nla_validate_nested(nla_a,
+					      NLBL_CIPSOV4_A_MAX,
+					      netlbl_cipsov4_genl_policy) != 0)
+					goto add_std_failure;
+				nla_for_each_nested(nla_b, nla_a, nla_b_rem)
+					switch (nla_type(nla_b)) {
+					case NLBL_CIPSOV4_A_MLSCATLOC:
+						if (nla_get_u32(nla_b) >
+						    CIPSO_V4_MAX_LOC_CATS)
+							goto add_std_failure;
+						if (nla_get_u32(nla_b) >=
+					      doi_def->map.std->cat.local_size)
+					     doi_def->map.std->cat.local_size =
+						     nla_get_u32(nla_b) + 1;
+						break;
+					case NLBL_CIPSOV4_A_MLSCATREM:
+						if (nla_get_u32(nla_b) >
+						    CIPSO_V4_MAX_REM_CATS)
+							goto add_std_failure;
+						if (nla_get_u32(nla_b) >=
+					      doi_def->map.std->cat.cipso_size)
+					     doi_def->map.std->cat.cipso_size =
+						     nla_get_u32(nla_b) + 1;
+						break;
+					}
+			}
+		doi_def->map.std->cat.local = kcalloc(
+					      doi_def->map.std->cat.local_size,
+					      sizeof(u32),
+					      GFP_KERNEL);
+		if (doi_def->map.std->cat.local == NULL) {
+			ret_val = -ENOMEM;
+			goto add_std_failure;
+		}
+		doi_def->map.std->cat.cipso = kcalloc(
+					      doi_def->map.std->cat.cipso_size,
+					      sizeof(u32),
+					      GFP_KERNEL);
+		if (doi_def->map.std->cat.cipso == NULL) {
+			ret_val = -ENOMEM;
+			goto add_std_failure;
+		}
+		for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++)
+			doi_def->map.std->cat.local[iter] = CIPSO_V4_INV_CAT;
+		for (iter = 0; iter < doi_def->map.std->cat.cipso_size; iter++)
+			doi_def->map.std->cat.cipso[iter] = CIPSO_V4_INV_CAT;
+		nla_for_each_nested(nla_a,
+				    info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+				    nla_a_rem)
+			if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) {
+				struct nlattr *cat_loc;
+				struct nlattr *cat_rem;
+
+				cat_loc = nla_find_nested(nla_a,
+						     NLBL_CIPSOV4_A_MLSCATLOC);
+				cat_rem = nla_find_nested(nla_a,
+						     NLBL_CIPSOV4_A_MLSCATREM);
+				if (cat_loc == NULL || cat_rem == NULL)
+					goto add_std_failure;
+				doi_def->map.std->cat.local[
+							nla_get_u32(cat_loc)] =
+					nla_get_u32(cat_rem);
+				doi_def->map.std->cat.cipso[
+							nla_get_u32(cat_rem)] =
+					nla_get_u32(cat_loc);
+			}
+	}
+
+	ret_val = cipso_v4_doi_add(doi_def, audit_info);
+	if (ret_val != 0)
+		goto add_std_failure;
+	return 0;
+
+add_std_failure:
+	cipso_v4_doi_free(doi_def);
+	return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_add_pass - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Build a CIPSO_V4_MAP_PASS DOI definition from the DOI and tag list of the
+ * given ADD message and register it with the CIPSO V4 engine.  The definition
+ * is freed again if either step fails.  Returns zero on success and non-zero
+ * on error.
+ */
+static int netlbl_cipsov4_add_pass(struct genl_info *info,
+				   struct netlbl_audit *audit_info)
+{
+	struct cipso_v4_doi *new_def;
+	int err;
+
+	if (info->attrs[NLBL_CIPSOV4_A_TAGLST] == NULL)
+		return -EINVAL;
+
+	new_def = kmalloc(sizeof(*new_def), GFP_KERNEL);
+	if (!new_def)
+		return -ENOMEM;
+	new_def->type = CIPSO_V4_MAP_PASS;
+
+	/* fill in the DOI and tags, then hand the definition to the engine */
+	err = netlbl_cipsov4_add_common(info, new_def);
+	if (err == 0)
+		err = cipso_v4_doi_add(new_def, audit_info);
+	if (err == 0)
+		return 0;
+
+	/*
+	 * One of the two steps failed, so release the definition here.
+	 */
+	cipso_v4_doi_free(new_def);
+	return err;
+}
+
+/**
+ * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Build a CIPSO_V4_MAP_LOCAL DOI definition from the DOI and tag list of the
+ * given ADD message and register it with the CIPSO V4 engine.  The definition
+ * is freed again if either step fails.  Returns zero on success and non-zero
+ * on error.
+ */
+static int netlbl_cipsov4_add_local(struct genl_info *info,
+				    struct netlbl_audit *audit_info)
+{
+	struct cipso_v4_doi *new_def;
+	int err;
+
+	if (info->attrs[NLBL_CIPSOV4_A_TAGLST] == NULL)
+		return -EINVAL;
+
+	new_def = kmalloc(sizeof(*new_def), GFP_KERNEL);
+	if (!new_def)
+		return -ENOMEM;
+	new_def->type = CIPSO_V4_MAP_LOCAL;
+
+	/* fill in the DOI and tags, then hand the definition to the engine */
+	err = netlbl_cipsov4_add_common(info, new_def);
+	if (err == 0)
+		err = cipso_v4_doi_add(new_def, audit_info);
+	if (err == 0)
+		return 0;
+
+	/*
+	 * One of the two steps failed, so release the definition here.
+	 */
+	cipso_v4_doi_free(new_def);
+	return err;
+}
+
+/**
+ * netlbl_cipsov4_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Dispatch an ADD message to the handler for its mapping type and, when the
+ * new DOI definition is accepted, bump netlabel_mgmt_protocount.  Returns
+ * zero on success, negative values on failure.
+ */
+static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlbl_audit audit_info;
+	int err;
+
+	if (info->attrs[NLBL_CIPSOV4_A_DOI] == NULL ||
+	    info->attrs[NLBL_CIPSOV4_A_MTYPE] == NULL)
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+	switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) {
+	case CIPSO_V4_MAP_TRANS:
+		err = netlbl_cipsov4_add_std(info, &audit_info);
+		break;
+	case CIPSO_V4_MAP_PASS:
+		err = netlbl_cipsov4_add_pass(info, &audit_info);
+		break;
+	case CIPSO_V4_MAP_LOCAL:
+		err = netlbl_cipsov4_add_local(info, &audit_info);
+		break;
+	default:	/* unrecognised mapping type */
+		return -EINVAL;
+	}
+	if (err == 0)
+		atomic_inc(&netlabel_mgmt_protocount);
+	return err;
+}
+
+/**
+ * netlbl_cipsov4_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond accordingly.  While the
+ * response message generated by the kernel is straightforward, determining
+ * beforehand the size of the buffer to allocate is not (we have to generate
+ * the message to know the size).  We therefore start with a buffer of
+ * NLMSG_DEFAULT_SIZE and, if the level/category maps do not fit, restart with
+ * a buffer twice as large; once the multiplier reaches four we give up and
+ * return the error.  The DOI reference taken by cipso_v4_doi_getdef() is
+ * dropped on every exit and before each retry.  Returns zero on success and
+ * negative values on error.
+ *
+ */
+static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info)
+{
+	int ret_val;
+	struct sk_buff *ans_skb = NULL;
+	u32 nlsze_mult = 1;
+	void *data;
+	u32 doi, iter;
+	struct nlattr *nla_a, *nla_b;
+	struct cipso_v4_doi *doi_def;
+
+	if (!info->attrs[NLBL_CIPSOV4_A_DOI]) {
+		ret_val = -EINVAL;
+		goto list_failure;
+	}
+
+list_start:
+	ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE * nlsze_mult, GFP_KERNEL);
+	if (ans_skb == NULL) {
+		ret_val = -ENOMEM;
+		goto list_failure;
+	}
+	data = genlmsg_put_reply(ans_skb, info, &netlbl_cipsov4_gnl_family,
+				 0, NLBL_CIPSOV4_C_LIST);
+	if (data == NULL) {
+		ret_val = -ENOMEM;
+		goto list_failure;
+	}
+
+	doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+	rcu_read_lock();
+	/* a successful lookup holds a reference; every exit must put it */
+	doi_def = cipso_v4_doi_getdef(doi);
+	if (doi_def == NULL) {
+		ret_val = -EINVAL;
+		goto list_failure_unlock;
+	}
+
+	ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type);
+	if (ret_val != 0)
+		goto list_failure_lock;
+
+	nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_TAGLST);
+	if (nla_a == NULL) {
+		ret_val = -ENOMEM;
+		goto list_failure_lock;
+	}
+	for (iter = 0;
+	     iter < CIPSO_V4_TAG_MAXCNT &&
+	       doi_def->tags[iter] != CIPSO_V4_TAG_INVALID;
+	     iter++) {
+		ret_val = nla_put_u8(ans_skb,
+				     NLBL_CIPSOV4_A_TAG,
+				     doi_def->tags[iter]);
+		if (ret_val != 0)
+			goto list_failure_lock;
+	}
+	nla_nest_end(ans_skb, nla_a);
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_TRANS:
+		nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST);
+		if (nla_a == NULL) {
+			ret_val = -ENOMEM;
+			goto list_failure_lock;
+		}
+		for (iter = 0;
+		     iter < doi_def->map.std->lvl.local_size;
+		     iter++) {
+			if (doi_def->map.std->lvl.local[iter] ==
+			    CIPSO_V4_INV_LVL)
+				continue;
+
+			nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVL);
+			if (nla_b == NULL) {
+				ret_val = -ENOMEM;
+				goto list_retry;
+			}
+			ret_val = nla_put_u32(ans_skb,
+					      NLBL_CIPSOV4_A_MLSLVLLOC,
+					      iter);
+			if (ret_val != 0)
+				goto list_retry;
+			ret_val = nla_put_u32(ans_skb,
+					    NLBL_CIPSOV4_A_MLSLVLREM,
+					    doi_def->map.std->lvl.local[iter]);
+			if (ret_val != 0)
+				goto list_retry;
+			nla_nest_end(ans_skb, nla_b);
+		}
+		nla_nest_end(ans_skb, nla_a);
+
+		nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCATLST);
+		if (nla_a == NULL) {
+			ret_val = -ENOMEM;
+			goto list_retry;
+		}
+		for (iter = 0;
+		     iter < doi_def->map.std->cat.local_size;
+		     iter++) {
+			if (doi_def->map.std->cat.local[iter] ==
+			    CIPSO_V4_INV_CAT)
+				continue;
+
+			nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCAT);
+			if (nla_b == NULL) {
+				ret_val = -ENOMEM;
+				goto list_retry;
+			}
+			ret_val = nla_put_u32(ans_skb,
+					      NLBL_CIPSOV4_A_MLSCATLOC,
+					      iter);
+			if (ret_val != 0)
+				goto list_retry;
+			ret_val = nla_put_u32(ans_skb,
+					    NLBL_CIPSOV4_A_MLSCATREM,
+					    doi_def->map.std->cat.local[iter]);
+			if (ret_val != 0)
+				goto list_retry;
+			nla_nest_end(ans_skb, nla_b);
+		}
+		nla_nest_end(ans_skb, nla_a);
+		break;
+	}
+	cipso_v4_doi_putdef(doi_def);
+	rcu_read_unlock();
+	genlmsg_end(ans_skb, data);
+	return genlmsg_reply(ans_skb, info);
+
+list_retry:
+	/* XXX - this limit is a guesstimate */
+	if (nlsze_mult < 4) {
+		cipso_v4_doi_putdef(doi_def);
+		rcu_read_unlock();
+		kfree_skb(ans_skb);
+		nlsze_mult *= 2;
+		goto list_start;
+	}
+list_failure_lock:
+	cipso_v4_doi_putdef(doi_def);
+list_failure_unlock:
+	rcu_read_unlock();
+list_failure:
+	kfree_skb(ans_skb);
+	return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_listall_cb - cipso_v4_doi_walk() callback for LISTALL
+ * @doi_def: the CIPSOv4 DOI definition
+ * @arg: the netlbl_cipsov4_doiwalk_arg structure
+ *
+ * Description:
+ * Callback for cipso_v4_doi_walk() which appends one LISTALL record, holding
+ * the DOI value and mapping type of @doi_def, to the buffer named in @arg.
+ * Returns the result of genlmsg_end() on success; on failure the partial
+ * record is cancelled and a negative value is returned.
+ *
+ */
+static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg)
+{
+	struct netlbl_cipsov4_doiwalk_arg *walk = arg;
+	struct sk_buff *out = walk->skb;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(out, NETLINK_CB(walk->nl_cb->skb).pid,
+			  walk->seq, &netlbl_cipsov4_gnl_family,
+			  NLM_F_MULTI, NLBL_CIPSOV4_C_LISTALL);
+	if (hdr == NULL) {
+		err = -ENOMEM;
+		goto cancel;
+	}
+
+	/* each record carries just the DOI value and its mapping type */
+	err = nla_put_u32(out, NLBL_CIPSOV4_A_DOI, doi_def->doi);
+	if (err == 0)
+		err = nla_put_u32(out, NLBL_CIPSOV4_A_MTYPE, doi_def->type);
+	if (err != 0)
+		goto cancel;
+	return genlmsg_end(out, hdr);
+
+cancel:
+	genlmsg_cancel(out, hdr);
+	return err;
+}
+
+/**
+ * netlbl_cipsov4_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Run cipso_v4_doi_walk() with netlbl_cipsov4_listall_cb(), passing it the
+ * skip count kept in @cb->args[0] and storing the updated count back there.
+ * Returns the length of @skb.
+ */
+static int netlbl_cipsov4_listall(struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	struct netlbl_cipsov4_doiwalk_arg walk = {
+		.nl_cb = cb,
+		.skb = skb,
+		.seq = cb->nlh->nlmsg_seq,
+	};
+	u32 resume_at = cb->args[0];
+
+	cipso_v4_doi_walk(&resume_at, netlbl_cipsov4_listall_cb, &walk);
+	/* store the updated skip count back where the next call reads it */
+	cb->args[0] = resume_at;
+	return skb->len;
+}
+
+/**
+ * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE
+ * @entry: LSM domain mapping entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * Callback handed to netlbl_domhsh_walk() by netlbl_cipsov4_remove().  An
+ * @entry of type NETLBL_NLTYPE_CIPSOV4 whose DOI equals the one in @arg is
+ * removed with netlbl_domhsh_remove_entry() and that call's result is
+ * returned; every other entry is left alone and zero is returned.
+ *
+ */
+static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg)
+{
+	struct netlbl_domhsh_walk_arg *walk = arg;
+
+	if (entry->type != NETLBL_NLTYPE_CIPSOV4)
+		return 0;
+	if (entry->type_def.cipsov4->doi != walk->doi)
+		return 0;
+	return netlbl_domhsh_remove_entry(entry, walk->audit_info);
+}
+
+/**
+ * netlbl_cipsov4_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Remove every LSM domain mapping that uses the DOI named in the message,
+ * then remove the DOI definition itself and, if that succeeds, decrement
+ * netlabel_mgmt_protocount.  Returns zero on success, negative on failure.
+ */
+static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlbl_domhsh_walk_arg walk;
+	struct netlbl_audit audit_info;
+	u32 bkt = 0;
+	u32 chain = 0;
+	int err;
+
+	if (info->attrs[NLBL_CIPSOV4_A_DOI] == NULL)
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+	walk.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+	walk.audit_info = &audit_info;
+	/* first drop the domain mappings that refer to this DOI ... */
+	err = netlbl_domhsh_walk(&bkt, &chain, netlbl_cipsov4_remove_cb, &walk);
+	if (err != 0 && err != -ENOENT)
+		return err;
+	/* ... then remove the DOI definition itself */
+	err = cipso_v4_doi_remove(walk.doi, &audit_info);
+	if (err == 0)
+		atomic_dec(&netlabel_mgmt_protocount);
+	return err;
+}
+
+/*
+ * NetLabel Generic NETLINK Command Definitions (LISTALL is the only dump op)
+ */
+
+static struct genl_ops netlbl_cipsov4_ops[] = {
+	{
+	.cmd = NLBL_CIPSOV4_C_ADD,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_cipsov4_genl_policy,
+	.doit = netlbl_cipsov4_add,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_CIPSOV4_C_REMOVE,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_cipsov4_genl_policy,
+	.doit = netlbl_cipsov4_remove,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_CIPSOV4_C_LIST,
+	.flags = 0,
+	.policy = netlbl_cipsov4_genl_policy,
+	.doit = netlbl_cipsov4_list,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_CIPSOV4_C_LISTALL,
+	.flags = 0,
+	.policy = netlbl_cipsov4_genl_policy,
+	.doit = NULL,	/* handled through .dumpit below */
+	.dumpit = netlbl_cipsov4_listall,
+	},
+};
+
+/*
+ * NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_cipsov4_genl_init - Register the CIPSOv4 NetLabel component
+ *
+ * Description:
+ * Register the CIPSOv4 Generic NETLINK family together with the commands in
+ * netlbl_cipsov4_ops.  Returns zero on success, negative values on failure.
+ */
+int __init netlbl_cipsov4_genl_init(void)
+{
+	return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
+					     netlbl_cipsov4_ops,
+					     ARRAY_SIZE(netlbl_cipsov4_ops));
+}
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
new file mode 100644
index 00000000..af7f3355
--- /dev/null
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -0,0 +1,170 @@
+/*
+ * NetLabel CIPSO/IPv4 Support
+ *
+ * This file defines the CIPSO/IPv4 functions for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_CIPSO_V4
+#define _NETLABEL_CIPSO_V4
+
+#include <net/netlabel.h>
+
+/*
+ * The following NetLabel payloads are supported by the CIPSO subsystem.
+ *
+ * o ADD:
+ *   Sent by an application to add a new DOI mapping table.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CIPSOV4_A_DOI
+ *     NLBL_CIPSOV4_A_MTYPE
+ *     NLBL_CIPSOV4_A_TAGLST
+ *
+ *   If using CIPSO_V4_MAP_TRANS the following attributes are required:
+ *
+ *     NLBL_CIPSOV4_A_MLSLVLLST
+ *     NLBL_CIPSOV4_A_MLSCATLST
+ *
+ *   If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes
+ *   are required.
+ *
+ * o REMOVE:
+ *   Sent by an application to remove a specific DOI mapping table from the
+ *   CIPSO V4 system.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CIPSOV4_A_DOI
+ *
+ * o LIST:
+ *   Sent by an application to list the details of a DOI definition.  On
+ *   success the kernel should send a response using the following format.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CIPSOV4_A_DOI
+ *
+ *   The valid response message format depends on the type of the DOI mapping,
+ *   the defined formats are shown below.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_CIPSOV4_A_MTYPE
+ *     NLBL_CIPSOV4_A_TAGLST
+ *
+ *   If using CIPSO_V4_MAP_TRANS the following attributes are required:
+ *
+ *     NLBL_CIPSOV4_A_MLSLVLLST
+ *     NLBL_CIPSOV4_A_MLSCATLST
+ *
+ *   If using CIPSO_V4_MAP_PASS or CIPSO_V4_MAP_LOCAL no additional attributes
+ *   are required.
+ *
+ * o LISTALL:
+ *   This message is sent by an application to list the valid DOIs on the
+ *   system.  When sent by an application there is no payload and the
+ *   NLM_F_DUMP flag should be set.  The kernel should respond with a series of
+ *   the following messages.
+ *
+ *   Required attributes:
+ *
+ *    NLBL_CIPSOV4_A_DOI
+ *    NLBL_CIPSOV4_A_MTYPE
+ *
+ */
+
+/* NetLabel CIPSOv4 commands */
+enum {
+	NLBL_CIPSOV4_C_UNSPEC,
+	NLBL_CIPSOV4_C_ADD,
+	NLBL_CIPSOV4_C_REMOVE,
+	NLBL_CIPSOV4_C_LIST,
+	NLBL_CIPSOV4_C_LISTALL,
+	__NLBL_CIPSOV4_C_MAX,
+};
+
+/* NetLabel CIPSOv4 attributes */
+enum {
+	NLBL_CIPSOV4_A_UNSPEC,
+	NLBL_CIPSOV4_A_DOI,
+	/* (NLA_U32)
+	 * the DOI value */
+	NLBL_CIPSOV4_A_MTYPE,
+	/* (NLA_U32)
+	 * the mapping table type (defined in the cipso_ipv4.h header as
+	 * CIPSO_V4_MAP_*) */
+	NLBL_CIPSOV4_A_TAG,
+	/* (NLA_U8)
+	 * a CIPSO tag type, meant to be used within a NLBL_CIPSOV4_A_TAGLST
+	 * attribute */
+	NLBL_CIPSOV4_A_TAGLST,
+	/* (NLA_NESTED)
+	 * the CIPSO tag list for the DOI, there must be at least one
+	 * NLBL_CIPSOV4_A_TAG attribute, tags listed first are given higher
+	 * priority when sending packets */
+	NLBL_CIPSOV4_A_MLSLVLLOC,
+	/* (NLA_U32)
+	 * the local MLS sensitivity level */
+	NLBL_CIPSOV4_A_MLSLVLREM,
+	/* (NLA_U32)
+	 * the remote MLS sensitivity level */
+	NLBL_CIPSOV4_A_MLSLVL,
+	/* (NLA_NESTED)
+	 * a MLS sensitivity level mapping, must contain only one attribute of
+	 * each of the following types: NLBL_CIPSOV4_A_MLSLVLLOC and
+	 * NLBL_CIPSOV4_A_MLSLVLREM */
+	NLBL_CIPSOV4_A_MLSLVLLST,
+	/* (NLA_NESTED)
+	 * the CIPSO level mappings, there must be at least one
+	 * NLBL_CIPSOV4_A_MLSLVL attribute */
+	NLBL_CIPSOV4_A_MLSCATLOC,
+	/* (NLA_U32)
+	 * the local MLS category */
+	NLBL_CIPSOV4_A_MLSCATREM,
+	/* (NLA_U32)
+	 * the remote MLS category */
+	NLBL_CIPSOV4_A_MLSCAT,
+	/* (NLA_NESTED)
+	 * a MLS category mapping, must contain only one attribute of each of
+	 * the following types: NLBL_CIPSOV4_A_MLSCATLOC and
+	 * NLBL_CIPSOV4_A_MLSCATREM */
+	NLBL_CIPSOV4_A_MLSCATLST,
+	/* (NLA_NESTED)
+	 * the CIPSO category mappings, there must be at least one
+	 * NLBL_CIPSOV4_A_MLSCAT attribute */
+	__NLBL_CIPSOV4_A_MAX,
+};
+#define NLBL_CIPSOV4_A_MAX (__NLBL_CIPSOV4_A_MAX - 1)
+
+/* NetLabel protocol functions */
+int netlbl_cipsov4_genl_init(void);
+
+/* Free the memory associated with a CIPSOv4 DOI definition */
+void netlbl_cipsov4_doi_free(struct rcu_head *entry);
+
+#endif
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
new file mode 100644
index 00000000..de0d8e4c
--- /dev/null
+++ b/net/netlabel/netlabel_domainhash.c
@@ -0,0 +1,729 @@
+/*
+ * NetLabel Domain Hash Table
+ *
+ * This file manages the domain hash table that NetLabel uses to determine
+ * which network labeling protocol to use for a given domain.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/bug.h>
+
+#include "netlabel_mgmt.h"
+#include "netlabel_addrlist.h"
+#include "netlabel_domainhash.h"
+#include "netlabel_user.h"
+
+struct netlbl_domhsh_tbl {
+	struct list_head *tbl;	/* array of @size hash bucket list heads */
+	u32 size;		/* number of buckets, always a power of two */
+};
+
+/* Domain hash table */
+/* updates should be so rare that having one spinlock for the entire hash table
+ * should be okay */
+static DEFINE_SPINLOCK(netlbl_domhsh_lock);
+#define netlbl_domhsh_rcu_deref(p) \
+	rcu_dereference_check(p, rcu_read_lock_held() || \
+				 lockdep_is_held(&netlbl_domhsh_lock))
+static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
+static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
+
+/*
+ * Domain Hash Table Helper Functions
+ */
+
+/**
+ * netlbl_domhsh_free_entry - Frees a domain hash table entry
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * call_rcu() callback which releases a hash table entry: its address
+ * selectors and their container, the domain string and the entry itself.
+ * NOTE: assumes ->type_def.addrsel was kmalloc'ed and is owned by the entry.
+ *
+ */
+static void netlbl_domhsh_free_entry(struct rcu_head *entry)
+{
+	struct netlbl_dom_map *ptr;
+	struct netlbl_af4list *iter4, *tmp4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6, *tmp6;
+#endif /* IPv6 */
+
+	ptr = container_of(entry, struct netlbl_dom_map, rcu);
+	if (ptr->type == NETLBL_NLTYPE_ADDRSELECT) {
+		netlbl_af4list_foreach_safe(iter4, tmp4,
+					    &ptr->type_def.addrsel->list4) {
+			netlbl_af4list_remove_entry(iter4);
+			kfree(netlbl_domhsh_addr4_entry(iter4));
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_safe(iter6, tmp6,
+					    &ptr->type_def.addrsel->list6) {
+			netlbl_af6list_remove_entry(iter6);
+			kfree(netlbl_domhsh_addr6_entry(iter6));
+		}
+#endif /* IPv6 */
+		/* the lists are empty now, release their container too */
+		kfree(ptr->type_def.addrsel);
+	}
+	kfree(ptr->domain);
+	kfree(ptr);
+}
+
+/**
+ * netlbl_domhsh_hash - Hashing function for the domain hash table
+ * @key: the domain name to hash
+ *
+ * Description:
+ * Folds @key into a 32-bit value and masks it with the table size minus one
+ * to produce a bucket index.  The caller is responsible for ensuring that
+ * the hash table is protected with either a RCU read lock or the hash table
+ * lock.
+ *
+ */
+static u32 netlbl_domhsh_hash(const char *key)
+{
+	const u32 key_len = strlen(key);
+	u32 hash = 0;
+	u32 pos;
+
+	/* Rotate left by four then XOR in the next character; taken (with
+	 * slight modification) from security/selinux/ss/symtab.c:symhash() */
+
+	for (pos = 0; pos < key_len; pos++)
+		hash = (hash << 4 | hash >> (8 * sizeof(u32) - 4)) ^ key[pos];
+	return hash & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
+}
+
+/**
+ * netlbl_domhsh_search - Search for a domain entry
+ * @domain: the domain
+ *
+ * Description:
+ * Walks the hash bucket selected by @domain and returns the first valid
+ * entry whose name matches exactly, or NULL if there is none or if @domain
+ * is NULL.  The caller is responsible for ensuring that the hash table is
+ * protected with either a RCU read lock or the hash table lock.
+ *
+ */
+static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
+{
+	struct netlbl_dom_map *cur;
+	struct list_head *chain;
+	u32 idx;
+
+	if (domain == NULL)
+		return NULL;
+
+	idx = netlbl_domhsh_hash(domain);
+	chain = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[idx];
+	list_for_each_entry_rcu(cur, chain, list)
+		if (cur->valid && strcmp(cur->domain, domain) == 0)
+			return cur;
+	return NULL;
+}
+
+/**
+ * netlbl_domhsh_search_def - Search for a domain entry
+ * @domain: the domain
+ *
+ * Description:
+ * Searches the domain hash table and returns a pointer to the hash table
+ * entry if an exact match is found.  If an exact match is not present in
+ * the hash table then the default entry is returned if it is set and valid,
+ * otherwise NULL is returned.  The caller is responsible for ensuring that
+ * the hash table is protected with either a RCU read lock or the hash table
+ * lock.
+ *
+ */
+static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
+{
+	struct netlbl_dom_map *match = netlbl_domhsh_search(domain);
+	struct netlbl_dom_map *fallback;
+
+	if (match != NULL)
+		return match;
+	/* no exact match, fall back to the default mapping if it is usable */
+	fallback = netlbl_domhsh_rcu_deref(netlbl_domhsh_def);
+	if (fallback == NULL || !fallback->valid)
+		return NULL;
+	return fallback;
+}
+
+/**
+ * netlbl_domhsh_audit_add - Generate an audit entry for an add event
+ * @entry: the entry being added
+ * @addr4: the IPv4 address selector being added, may be NULL
+ * @addr6: the IPv6 address selector being added, may be NULL
+ * @result: the result code, zero is reported as "res=1"
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Emits an AUDIT_MAC_MAP_ADD record naming the domain, the address selector
+ * (@addr4 takes precedence over @addr6) and the labeling protocol.  Caller
+ * is responsible for holding the necessary locks.
+ *
+ */
+static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
+				    struct netlbl_af4list *addr4,
+				    struct netlbl_af6list *addr6,
+				    int result,
+				    struct netlbl_audit *audit_info)
+{
+	struct audit_buffer *abuf;
+	struct cipso_v4_doi *doi_def = NULL;
+	u32 proto;
+
+	abuf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
+	if (abuf == NULL)
+		return;
+
+	audit_log_format(abuf, " nlbl_domain=%s",
+			 entry->domain ? entry->domain : "(default)");
+	if (addr4 != NULL) {
+		struct netlbl_domaddr4_map *map4;
+		map4 = netlbl_domhsh_addr4_entry(addr4);
+		proto = map4->type;
+		doi_def = map4->type_def.cipsov4;
+		netlbl_af4list_audit_addr(abuf, 0, NULL,
+					  addr4->addr, addr4->mask);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	} else if (addr6 != NULL) {
+		struct netlbl_domaddr6_map *map6;
+		map6 = netlbl_domhsh_addr6_entry(addr6);
+		proto = map6->type;
+		netlbl_af6list_audit_addr(abuf, 0, NULL,
+					  &addr6->addr, &addr6->mask);
+#endif /* IPv6 */
+	} else {
+		proto = entry->type;
+		doi_def = entry->type_def.cipsov4;
+	}
+	switch (proto) {
+	case NETLBL_NLTYPE_UNLABELED:
+		audit_log_format(abuf, " nlbl_protocol=unlbl");
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		BUG_ON(doi_def == NULL);
+		audit_log_format(abuf, " nlbl_protocol=cipsov4 cipso_doi=%u",
+				 doi_def->doi);
+		break;
+	}
+	audit_log_format(abuf, " res=%u", result == 0 ? 1 : 0);
+	audit_log_end(abuf);
+}
+
+/*
+ * Domain Hash Table Functions
+ */
+
+/**
+ * netlbl_domhsh_init - Init for the domain hash
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Initializes the domain hash table with 2^@size buckets, should be called
+ * only by netlbl_user_init() during initialization.  Returns zero on success,
+ * -EINVAL if @size is zero or too large to shift a u32 by, and -ENOMEM if
+ * memory cannot be allocated.
+ *
+ */
+int __init netlbl_domhsh_init(u32 size)
+{
+	u32 iter;
+	struct netlbl_domhsh_tbl *hsh_tbl;
+
+	if (size == 0 || size >= 8 * sizeof(hsh_tbl->size))
+		return -EINVAL;
+
+	hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
+	if (hsh_tbl == NULL)
+		return -ENOMEM;
+	hsh_tbl->size = 1U << size;
+	hsh_tbl->tbl = kcalloc(hsh_tbl->size, sizeof(*hsh_tbl->tbl),
+			       GFP_KERNEL);
+	if (hsh_tbl->tbl == NULL) {
+		kfree(hsh_tbl);
+		return -ENOMEM;
+	}
+	for (iter = 0; iter < hsh_tbl->size; iter++)
+		INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
+
+	spin_lock(&netlbl_domhsh_lock);
+	rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
+	spin_unlock(&netlbl_domhsh_lock);
+
+	return 0;
+}
+
+/**
+ * netlbl_domhsh_add - Adds an entry to the domain hash table
+ * @entry: the entry to add, a NULL ->domain means the default entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the domain hash table; address selectors are merged
+ * into an existing address selector entry (-EEXIST if one is already there,
+ * -EINVAL for any other clash).  Returns zero on success, negative on failure.
+ *
+ */
+int netlbl_domhsh_add(struct netlbl_dom_map *entry,
+		      struct netlbl_audit *audit_info)
+{
+	int ret_val = 0;
+	struct netlbl_dom_map *entry_old;
+	struct netlbl_af4list *iter4;
+	struct netlbl_af4list *tmp4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+	struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
+
+	/* XXX - we can remove this RCU read lock as the spinlock protects the
+	 *       entire function, but before we do we need to fixup the
+	 *       netlbl_af[4,6]list RCU functions to do "the right thing" with
+	 *       respect to rcu_dereference() when only a spinlock is held. */
+	rcu_read_lock();
+	spin_lock(&netlbl_domhsh_lock);
+	if (entry->domain != NULL)
+		entry_old = netlbl_domhsh_search(entry->domain);
+	else
+		entry_old = netlbl_domhsh_search_def(entry->domain);
+	if (entry_old == NULL) {
+		entry->valid = 1;
+
+		if (entry->domain != NULL) {
+			u32 bkt = netlbl_domhsh_hash(entry->domain);
+			list_add_tail_rcu(&entry->list,
+				    &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
+		} else {
+			INIT_LIST_HEAD(&entry->list);
+			rcu_assign_pointer(netlbl_domhsh_def, entry);
+		}
+
+		if (entry->type == NETLBL_NLTYPE_ADDRSELECT) {
+			netlbl_af4list_foreach_rcu(iter4,
+					       &entry->type_def.addrsel->list4)
+				netlbl_domhsh_audit_add(entry, iter4, NULL,
+							ret_val, audit_info);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			netlbl_af6list_foreach_rcu(iter6,
+					       &entry->type_def.addrsel->list6)
+				netlbl_domhsh_audit_add(entry, NULL, iter6,
+							ret_val, audit_info);
+#endif /* IPv6 */
+		} else
+			netlbl_domhsh_audit_add(entry, NULL, NULL,
+						ret_val, audit_info);
+	} else if (entry_old->type == NETLBL_NLTYPE_ADDRSELECT &&
+		   entry->type == NETLBL_NLTYPE_ADDRSELECT) {
+		struct list_head *old_list4;
+		struct list_head *old_list6;
+
+		old_list4 = &entry_old->type_def.addrsel->list4;
+		old_list6 = &entry_old->type_def.addrsel->list6;
+
+		/* we only allow the addition of address selectors if none of
+		 * the selectors already exist in the existing domain map */
+		netlbl_af4list_foreach_rcu(iter4,
+					   &entry->type_def.addrsel->list4)
+			if (netlbl_af4list_search_exact(iter4->addr,
+							iter4->mask,
+							old_list4)) {
+				ret_val = -EEXIST;
+				goto add_return;
+			}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_rcu(iter6,
+					   &entry->type_def.addrsel->list6)
+			if (netlbl_af6list_search_exact(&iter6->addr,
+							&iter6->mask,
+							old_list6)) {
+				ret_val = -EEXIST;
+				goto add_return;
+			}
+#endif /* IPv6 */
+
+		netlbl_af4list_foreach_safe(iter4, tmp4,
+					    &entry->type_def.addrsel->list4) {
+			netlbl_af4list_remove_entry(iter4);
+			iter4->valid = 1;
+			ret_val = netlbl_af4list_add(iter4, old_list4);
+			netlbl_domhsh_audit_add(entry_old, iter4, NULL,
+						ret_val, audit_info);
+			if (ret_val != 0)
+				goto add_return;
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_safe(iter6, tmp6,
+					    &entry->type_def.addrsel->list6) {
+			netlbl_af6list_remove_entry(iter6);
+			iter6->valid = 1;
+			ret_val = netlbl_af6list_add(iter6, old_list6);
+			netlbl_domhsh_audit_add(entry_old, NULL, iter6,
+						ret_val, audit_info);
+			if (ret_val != 0)
+				goto add_return;
+		}
+#endif /* IPv6 */
+	} else
+		ret_val = -EINVAL;
+
+add_return:
+	spin_unlock(&netlbl_domhsh_lock);
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/**
+ * netlbl_domhsh_add_default - Adds the default entry to the domain hash table
+ * @entry: the entry to add
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Passes @entry straight to netlbl_domhsh_add(), which installs it as the
+ * default mapping when @entry->domain is NULL; the domain is not checked
+ * here.  Returns zero on success, negative on failure.
+ *
+ */
+int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
+			      struct netlbl_audit *audit_info)
+{
+	return netlbl_domhsh_add(entry, audit_info);
+}
+
+/**
+ * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
+ * @entry: the entry to remove
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Unlinks @entry from the domain hash table, calls cipso_v4_doi_putdef() on
+ * any CIPSO DOI definitions it uses and frees it via call_rcu().  Caller is
+ * responsible for ensuring that the RCU read lock is held.  Returns zero on
+ * success, -ENOENT if @entry is NULL or no longer valid.
+ *
+ */
+int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
+			       struct netlbl_audit *audit_info)
+{
+	int ret_val = 0;
+	struct audit_buffer *audit_buf;
+	struct netlbl_af4list *iter4;
+	struct netlbl_domaddr4_map *map4;
+
+	if (entry == NULL)
+		return -ENOENT;
+
+	/* unlink the entry, either from its bucket or the default slot */
+	spin_lock(&netlbl_domhsh_lock);
+	if (!entry->valid) {
+		ret_val = -ENOENT;
+	} else {
+		entry->valid = 0;
+		if (entry == rcu_dereference(netlbl_domhsh_def))
+			rcu_assign_pointer(netlbl_domhsh_def, NULL);
+		else
+			list_del_rcu(&entry->list);
+	}
+	spin_unlock(&netlbl_domhsh_lock);
+
+	/* the attempt is audited whether or not it succeeded */
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
+	if (audit_buf != NULL) {
+		audit_log_format(audit_buf, " nlbl_domain=%s res=%u",
+				 entry->domain ? entry->domain : "(default)",
+				 ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+	if (ret_val != 0)
+		return ret_val;
+
+	switch (entry->type) {
+	case NETLBL_NLTYPE_ADDRSELECT:
+		netlbl_af4list_foreach_rcu(iter4,
+					   &entry->type_def.addrsel->list4) {
+			map4 = netlbl_domhsh_addr4_entry(iter4);
+			cipso_v4_doi_putdef(map4->type_def.cipsov4);
+		}
+		/* no need to check the IPv6 list since we currently
+		 * support only unlabeled protocols for IPv6 */
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		cipso_v4_doi_putdef(entry->type_def.cipsov4);
+		break;
+	}
+	call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
+	return 0;
+}
+
+/**
+ * netlbl_domhsh_remove_af4 - Removes an address selector entry
+ * @domain: the domain, NULL selects the default mapping
+ * @addr: IPv4 address
+ * @mask: IPv4 address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an individual address selector from a domain mapping and, if no
+ * selectors remain, the entire mapping.  Returns zero on success and -ENOENT
+ * if no address selector mapping or no matching selector is found.
+ *
+ */
+int netlbl_domhsh_remove_af4(const char *domain,
+			     const struct in_addr *addr,
+			     const struct in_addr *mask,
+			     struct netlbl_audit *audit_info)
+{
+	struct netlbl_dom_map *entry_map;
+	struct netlbl_af4list *entry_addr;
+	struct netlbl_af4list *iter4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+#endif /* IPv6 */
+	struct netlbl_domaddr4_map *entry;
+
+	rcu_read_lock();
+
+	if (domain)
+		entry_map = netlbl_domhsh_search(domain);
+	else
+		entry_map = netlbl_domhsh_search_def(domain);
+	if (entry_map == NULL || entry_map->type != NETLBL_NLTYPE_ADDRSELECT)
+		goto remove_af4_failure;
+
+	spin_lock(&netlbl_domhsh_lock);
+	entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
+					   &entry_map->type_def.addrsel->list4);
+	spin_unlock(&netlbl_domhsh_lock);
+
+	if (entry_addr == NULL)
+		goto remove_af4_failure;
+	netlbl_af4list_foreach_rcu(iter4, &entry_map->type_def.addrsel->list4)
+		goto remove_af4_single_addr;	/* IPv4 list not empty */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_rcu(iter6, &entry_map->type_def.addrsel->list6)
+		goto remove_af4_single_addr;	/* IPv6 list not empty */
+#endif /* IPv6 */
+	/* no selectors left, remove the mapping itself (result not checked) */
+	netlbl_domhsh_remove_entry(entry_map, audit_info);
+
+remove_af4_single_addr:
+	rcu_read_unlock();
+	/* yick, we can't use call_rcu here because we don't have a rcu head
+	 * pointer but hopefully this should be a rare case so the pause
+	 * shouldn't be a problem */
+	synchronize_rcu();
+	entry = netlbl_domhsh_addr4_entry(entry_addr);
+	cipso_v4_doi_putdef(entry->type_def.cipsov4);
+	kfree(entry);
+	return 0;
+
+remove_af4_failure:
+	rcu_read_unlock();
+	return -ENOENT;
+}
+
+/**
+ * netlbl_domhsh_remove - Removes an entry from the domain hash table
+ * @domain: the domain to remove, NULL selects the default entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Looks up @domain under the RCU read lock and hands the result to
+ * netlbl_domhsh_remove_entry().  Returns zero on success, negative on
+ * failure.
+ *
+ */
+int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
+{
+	struct netlbl_dom_map *victim;
+	int rc;
+
+	rcu_read_lock();
+	/* a named domain must match exactly; a NULL domain can only ever
+	 * resolve to the default mapping */
+	victim = (domain != NULL) ? netlbl_domhsh_search(domain) :
+				    netlbl_domhsh_search_def(domain);
+	rc = netlbl_domhsh_remove_entry(victim, audit_info);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+/**
+ * netlbl_domhsh_remove_default - Removes the default entry from the table
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes the default entry of the domain hash table by calling
+ * netlbl_domhsh_remove() with a NULL domain.  Returns zero on success,
+ * non-zero on failure.
+ *
+ */
+int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
+{
+	return netlbl_domhsh_remove(NULL, audit_info);
+}
+
+/**
+ * netlbl_domhsh_getentry - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ *
+ * Description:
+ * Searches the domain hash table for @domain, falling back to the default
+ * entry, and returns a pointer to the entry itself (not a copy) or NULL.  The
+ * caller is responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
+{
+	return netlbl_domhsh_search_def(domain);
+}
+
+/**
+ * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ * @addr: the IP address to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain
+ * and @addr, return a pointer to the matching address entry or NULL.  The
+ * caller is responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain,
+						       __be32 addr)
+{
+	struct netlbl_dom_map *map;
+	struct netlbl_af4list *sel;
+
+	/* only address selector mappings carry per-address entries */
+	map = netlbl_domhsh_search_def(domain);
+	if (map == NULL || map->type != NETLBL_NLTYPE_ADDRSELECT)
+		return NULL;
+
+	sel = netlbl_af4list_search(addr, &map->type_def.addrsel->list4);
+	if (sel == NULL)
+		return NULL;
+
+	/* convert the embedded list node back into its enclosing map entry,
+	 * which is why the NULL case has to be handled above */
+	return netlbl_domhsh_addr4_entry(sel);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
+ * @domain: the domain name to search for
+ * @addr: the IP address to search for
+ *
+ * Description:
+ * Look through the domain hash table searching for an entry to match @domain
+ * and @addr, return a pointer to the matching address entry or NULL.  The
+ * caller is responsible for ensuring that rcu_read_[un]lock() is called.
+ *
+ */
+struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
+						   const struct in6_addr *addr)
+{
+	struct netlbl_dom_map *map;
+	struct netlbl_af6list *sel;
+
+	/* only address selector mappings carry per-address entries */
+	map = netlbl_domhsh_search_def(domain);
+	if (map == NULL || map->type != NETLBL_NLTYPE_ADDRSELECT)
+		return NULL;
+
+	sel = netlbl_af6list_search(addr, &map->type_def.addrsel->list6);
+	if (sel == NULL)
+		return NULL;
+
+	/* convert the embedded list node back into its enclosing map entry,
+	 * which is why the NULL case has to be handled above */
+	return netlbl_domhsh_addr6_entry(sel);
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_domhsh_walk - Iterate through the domain mapping hash table
+ * @skip_bkt: the number of buckets to skip at the start
+ * @skip_chain: the number of entries to skip in the first iterated bucket
+ * @callback: callback for each entry
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the domain mapping hash table, skipping the first @skip_bkt
+ * buckets and, in the first bucket visited only, @skip_chain valid entries.
+ * @callback is called for each remaining valid entry; a negative return
+ * stops the walk with @skip_bkt/@skip_chain updated to point at that entry.
+ * Returns the last @callback result, or -ENOENT if it was never called.
+ *
+ */
+int netlbl_domhsh_walk(u32 *skip_bkt,
+		     u32 *skip_chain,
+		     int (*callback) (struct netlbl_dom_map *entry, void *arg),
+		     void *cb_arg)
+{
+	int ret_val = -ENOENT;
+	u32 iter_bkt;
+	struct list_head *iter_list;
+	struct netlbl_dom_map *iter_entry;
+	u32 chain_cnt = 0;
+	u32 chain_skip = *skip_chain;	/* first bucket only */
+
+	rcu_read_lock();
+	for (iter_bkt = *skip_bkt;
+	     iter_bkt < rcu_dereference(netlbl_domhsh)->size;
+	     iter_bkt++, chain_cnt = 0, chain_skip = 0) {
+		iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
+		list_for_each_entry_rcu(iter_entry, iter_list, list) {
+			if (!iter_entry->valid || chain_cnt++ < chain_skip)
+				continue;
+			ret_val = callback(iter_entry, cb_arg);
+			if (ret_val < 0) {
+				chain_cnt--;
+				goto walk_return;
+			}
+		}
+	}
+
+walk_return:
+	rcu_read_unlock();
+	*skip_bkt = iter_bkt;
+	*skip_chain = chain_cnt;
+	return ret_val;
+}
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
new file mode 100644
index 00000000..0261dda3
--- /dev/null
+++ b/net/netlabel/netlabel_domainhash.h
@@ -0,0 +1,112 @@
+/*
+ * NetLabel Domain Hash Table
+ *
+ * This file manages the domain hash table that NetLabel uses to determine
+ * which network labeling protocol to use for a given domain.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_DOMAINHASH_H
+#define _NETLABEL_DOMAINHASH_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+
+#include "netlabel_addrlist.h"
+
+/* Domain hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_DOMHSH_BITSIZE       7
+
+/* Domain mapping definition structures */
+#define netlbl_domhsh_addr4_entry(iter) \
+	container_of(iter, struct netlbl_domaddr4_map, list)
+struct netlbl_domaddr4_map {
+	u32 type;
+	union {
+		struct cipso_v4_doi *cipsov4;
+	} type_def;
+
+	struct netlbl_af4list list;
+};
+#define netlbl_domhsh_addr6_entry(iter) \
+	container_of(iter, struct netlbl_domaddr6_map, list)
+struct netlbl_domaddr6_map {
+	u32 type;
+
+	/* NOTE: no 'type_def' union needed at present since we don't currently
+	 *       support any IPv6 labeling protocols */
+
+	struct netlbl_af6list list;
+};
+struct netlbl_domaddr_map {
+	struct list_head list4;
+	struct list_head list6;
+};
+struct netlbl_dom_map {
+	char *domain;		/* domain name, NULL for the default entry */
+	u32 type;		/* NETLBL_NLTYPE_*, selects the type_def member */
+	union {
+		struct cipso_v4_doi *cipsov4;
+		struct netlbl_domaddr_map *addrsel;
+	} type_def;
+
+	u32 valid;		/* non-zero while the entry is in the table */
+	struct list_head list;	/* hash bucket linkage */
+	struct rcu_head rcu;	/* used to free the entry via call_rcu() */
+};
+
+/* init function */
+int netlbl_domhsh_init(u32 size);
+
+/* Manipulate the domain hash table */
+int netlbl_domhsh_add(struct netlbl_dom_map *entry,
+		      struct netlbl_audit *audit_info);
+int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
+			      struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
+			       struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_af4(const char *domain,
+			     const struct in_addr *addr,
+			     const struct in_addr *mask,
+			     struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
+struct netlbl_domaddr4_map *netlbl_domhsh_getentry_af4(const char *domain,
+						       __be32 addr);
+int netlbl_domhsh_walk(u32 *skip_bkt,
+		     u32 *skip_chain,
+		     int (*callback) (struct netlbl_dom_map *entry, void *arg),
+		     void *cb_arg);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+struct netlbl_domaddr6_map *netlbl_domhsh_getentry_af6(const char *domain,
+						  const struct in6_addr *addr);
+#endif /* IPv6 */
+
+#endif
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
new file mode 100644
index 00000000..1b83e000
--- /dev/null
+++ b/net/netlabel/netlabel_kapi.c
@@ -0,0 +1,1090 @@
+/*
+ * NetLabel Kernel API
+ *
+ * This file defines the kernel API for the NetLabel system.  The NetLabel
+ * system manages static and dynamic label mappings for network protocols such
+ * as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/audit.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/bug.h>
+#include <asm/atomic.h>
+
+#include "netlabel_domainhash.h"
+#include "netlabel_unlabeled.h"
+#include "netlabel_cipso_v4.h"
+#include "netlabel_user.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_addrlist.h"
+
+/*
+ * Configuration Functions
+ */
+
+/**
+ * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
+ * @domain: the domain mapping to remove, NULL for the default mapping
+ * @family: address family of @addr and @mask
+ * @addr: IP address selector, or NULL
+ * @mask: IP address mask, or NULL
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * With @addr and @mask both NULL the whole mapping for @domain is removed;
+ * with both set only the matching AF_INET address selector is removed.
+ * Returns zero on success, -EINVAL if only one of @addr/@mask is given,
+ * -EPFNOSUPPORT for any family but AF_INET, other negative values on failure.
+ */
+int netlbl_cfg_map_del(const char *domain,
+		       u16 family,
+		       const void *addr,
+		       const void *mask,
+		       struct netlbl_audit *audit_info)
+{
+	/* an address and a mask must be given together or not at all */
+	if ((addr == NULL) != (mask == NULL))
+		return -EINVAL;
+
+	/* no address selector: drop the whole domain mapping */
+	if (addr == NULL)
+		return netlbl_domhsh_remove(domain, audit_info);
+
+	/* only IPv4 address selectors can be removed */
+	if (family != AF_INET)
+		return -EPFNOSUPPORT;
+	return netlbl_domhsh_remove_af4(domain, addr, mask, audit_info);
+}
+
+/**
+ * netlbl_cfg_unlbl_map_add - Add a new unlabeled mapping
+ * @domain: the domain mapping to add
+ * @family: address family
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new unlabeled NetLabel/LSM domain mapping.  A @domain value of NULL
+ * causes a new default domain mapping to be added.  Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int netlbl_cfg_unlbl_map_add(const char *domain,
+			     u16 family,
+			     const void *addr,
+			     const void *mask,
+			     struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_dom_map *entry;
+	struct netlbl_domaddr_map *addrmap = NULL;
+	struct netlbl_domaddr4_map *map4 = NULL;
+	struct netlbl_domaddr6_map *map6 = NULL;
+	const struct in_addr *addr4, *mask4;
+	const struct in6_addr *addr6, *mask6;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+	if (domain != NULL) {
+		entry->domain = kstrdup(domain, GFP_ATOMIC);
+		if (entry->domain == NULL)
+			goto cfg_unlbl_map_add_failure;
+	}
+
+	if (addr == NULL && mask == NULL)
+		entry->type = NETLBL_NLTYPE_UNLABELED;
+	else if (addr != NULL && mask != NULL) {
+		addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+		if (addrmap == NULL)
+			goto cfg_unlbl_map_add_failure;
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		switch (family) {
+		case AF_INET:
+			addr4 = addr;
+			mask4 = mask;
+			map4 = kzalloc(sizeof(*map4), GFP_ATOMIC);
+			if (map4 == NULL)
+				goto cfg_unlbl_map_add_failure;
+			map4->type = NETLBL_NLTYPE_UNLABELED;
+			map4->list.addr = addr4->s_addr & mask4->s_addr;
+			map4->list.mask = mask4->s_addr;
+			map4->list.valid = 1;
+			ret_val = netlbl_af4list_add(&map4->list,
+						     &addrmap->list4);
+			if (ret_val != 0)
+				goto cfg_unlbl_map_add_failure;
+			break;
+		case AF_INET6:
+			addr6 = addr;
+			mask6 = mask;
+			map6 = kzalloc(sizeof(*map6), GFP_ATOMIC);
+			if (map6 == NULL)
+				goto cfg_unlbl_map_add_failure;
+			map6->type = NETLBL_NLTYPE_UNLABELED;
+			ipv6_addr_copy(&map6->list.addr, addr6);
+			map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0];
+			map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1];
+			map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2];
+			map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3];
+			ipv6_addr_copy(&map6->list.mask, mask6);
+			map6->list.valid = 1;
+			ret_val = netlbl_af4list_add(&map4->list,
+						     &addrmap->list4);
+			if (ret_val != 0)
+				goto cfg_unlbl_map_add_failure;
+			break;
+		default:
+			goto cfg_unlbl_map_add_failure;
+			break;
+		}
+
+		entry->type_def.addrsel = addrmap;
+		entry->type = NETLBL_NLTYPE_ADDRSELECT;
+	} else {
+		ret_val = -EINVAL;
+		goto cfg_unlbl_map_add_failure;
+	}
+
+	ret_val = netlbl_domhsh_add(entry, audit_info);
+	if (ret_val != 0)
+		goto cfg_unlbl_map_add_failure;
+
+	return 0;
+
+cfg_unlbl_map_add_failure:
+	kfree(entry->domain);
+	kfree(entry);
+	kfree(addrmap);
+	kfree(map4);
+	kfree(map6);
+	return ret_val;
+}
+
+
+/**
+ * netlbl_cfg_unlbl_static_add - Adds a new static label
+ * @net: network namespace
+ * @dev_name: interface name, NULL for the default interface
+ * @addr: IP address in network byte order (struct in[6]_addr)
+ * @mask: address mask in network byte order (struct in[6]_addr)
+ * @family: address family, AF_INET or AF_INET6
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Installs a static label that is applied to incoming traffic which carries
+ * no protocol provided label.  Returns zero on success, -EPFNOSUPPORT if
+ * @family is neither AF_INET nor AF_INET6, and other negative values on
+ * failure.
+ */
+int netlbl_cfg_unlbl_static_add(struct net *net,
+				const char *dev_name,
+				const void *addr,
+				const void *mask,
+				u16 family,
+				u32 secid,
+				struct netlbl_audit *audit_info)
+{
+	u32 len;
+
+	/* netlbl_unlhsh_add() takes an address length rather than an
+	 * address family, so translate @family into the size of the
+	 * matching address structure */
+	if (family == AF_INET)
+		len = sizeof(struct in_addr);
+	else if (family == AF_INET6)
+		len = sizeof(struct in6_addr);
+	else
+		return -EPFNOSUPPORT;
+
+	/* every other argument is passed through untouched */
+	return netlbl_unlhsh_add(net,
+				 dev_name, addr, mask, len,
+				 secid, audit_info);
+}
+
+/**
+ * netlbl_cfg_unlbl_static_del - Removes an existing static label
+ * @net: network namespace
+ * @dev_name: interface name, NULL for the default interface
+ * @addr: IP address in network byte order (struct in[6]_addr)
+ * @mask: address mask in network byte order (struct in[6]_addr)
+ * @family: address family, AF_INET or AF_INET6
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a static label previously installed for incoming traffic which
+ * carries no protocol provided label.  Returns zero on success,
+ * -EPFNOSUPPORT if @family is neither AF_INET nor AF_INET6, and other
+ * negative values on failure.
+ *
+ */
+int netlbl_cfg_unlbl_static_del(struct net *net,
+				const char *dev_name,
+				const void *addr,
+				const void *mask,
+				u16 family,
+				struct netlbl_audit *audit_info)
+{
+	u32 len;
+
+	/* netlbl_unlhsh_remove() takes an address length rather than an
+	 * address family, so translate @family into the size of the
+	 * matching address structure */
+	if (family == AF_INET)
+		len = sizeof(struct in_addr);
+	else if (family == AF_INET6)
+		len = sizeof(struct in6_addr);
+	else
+		return -EPFNOSUPPORT;
+
+	/* every other argument is passed through untouched */
+	return netlbl_unlhsh_remove(net,
+				    dev_name, addr, mask, len,
+				    audit_info);
+}
+
+/**
+ * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
+ * @doi_def: CIPSO DOI definition
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CIPSO DOI definition as defined by @doi_def.  Returns zero on
+ * success and negative values on failure.  This is a thin wrapper which hands
+ * both arguments to cipso_v4_doi_add() and returns its result unchanged.
+ */
+int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
+			   struct netlbl_audit *audit_info)
+{
+	return cipso_v4_doi_add(doi_def, audit_info);
+}
+
+/**
+ * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition
+ * @doi: CIPSO DOI
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an existing CIPSO DOI definition matching @doi.  This function
+ * returns nothing; whatever cipso_v4_doi_remove() reports is not passed on
+ * to the caller.
+ */
+void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
+{
+	cipso_v4_doi_remove(doi, audit_info);
+}
+
+/**
+ * netlbl_cfg_cipsov4_map_add - Add a new CIPSOv4 DOI mapping
+ * @doi: the CIPSO DOI
+ * @domain: the domain mapping to add, NULL for the default mapping
+ * @addr: IP address, or NULL
+ * @mask: IP address mask, or NULL
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new NetLabel/LSM domain mapping for the given CIPSO DOI to the NetLabel
+ * subsystem.  With @addr and @mask both set the mapping only covers that
+ * address selector.  Returns zero on success, -ENOENT if @doi is unknown,
+ * -EINVAL if only one of @addr/@mask is set, other negative values on failure.
+ */
+int netlbl_cfg_cipsov4_map_add(u32 doi,
+			       const char *domain,
+			       const struct in_addr *addr,
+			       const struct in_addr *mask,
+			       struct netlbl_audit *audit_info)
+{
+	int ret_val = -ENOMEM;
+	struct cipso_v4_doi *doi_def;
+	struct netlbl_dom_map *entry;
+	struct netlbl_domaddr_map *addrmap = NULL;
+	struct netlbl_domaddr4_map *addrinfo = NULL;
+
+	doi_def = cipso_v4_doi_getdef(doi);
+	if (doi_def == NULL)
+		return -ENOENT;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		goto cfg_cipsov4_map_add_putdef;
+	if (domain != NULL) {
+		entry->domain = kstrdup(domain, GFP_ATOMIC);
+		if (entry->domain == NULL)
+			goto cfg_cipsov4_map_add_failure;
+	}
+
+	if (addr == NULL && mask == NULL) {
+		entry->type_def.cipsov4 = doi_def;
+		entry->type = NETLBL_NLTYPE_CIPSOV4;
+	} else if (addr != NULL && mask != NULL) {
+		addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+		if (addrmap == NULL)
+			goto cfg_cipsov4_map_add_failure;
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
+		if (addrinfo == NULL)
+			goto cfg_cipsov4_map_add_failure;
+		addrinfo->type_def.cipsov4 = doi_def;
+		addrinfo->type = NETLBL_NLTYPE_CIPSOV4;
+		addrinfo->list.addr = addr->s_addr & mask->s_addr;
+		addrinfo->list.mask = mask->s_addr;
+		addrinfo->list.valid = 1;
+		ret_val = netlbl_af4list_add(&addrinfo->list, &addrmap->list4);
+		if (ret_val != 0)
+			goto cfg_cipsov4_map_add_failure;
+
+		entry->type_def.addrsel = addrmap;
+		entry->type = NETLBL_NLTYPE_ADDRSELECT;
+	} else {
+		ret_val = -EINVAL;
+		goto cfg_cipsov4_map_add_failure;
+	}
+
+	ret_val = netlbl_domhsh_add(entry, audit_info);
+	if (ret_val == 0)
+		return 0;
+
+cfg_cipsov4_map_add_failure:
+	kfree(entry->domain);
+	kfree(entry);
+	kfree(addrmap);
+	kfree(addrinfo);
+cfg_cipsov4_map_add_putdef:
+	/* every failure after cipso_v4_doi_getdef() is balanced by a put */
+	cipso_v4_doi_putdef(doi_def);
+	return ret_val;
+}
+
+/*
+ * Security Attribute Functions
+ */
+
+/**
+ * netlbl_secattr_catmap_walk - Walk a LSM secattr catmap looking for a bit
+ * @catmap: the category bitmap
+ * @offset: the offset to start searching at, in bits
+ *
+ * Description:
+ * This function walks a LSM secattr category bitmap starting at @offset and
+ * returns the spot of the first set bit or -ENOENT if no bits are set.
+ * NOTE(review): an @offset in a gap between two nodes wraps the index math.
+ */
+int netlbl_secattr_catmap_walk(struct netlbl_lsm_secattr_catmap *catmap,
+			       u32 offset)
+{
+	struct netlbl_lsm_secattr_catmap *iter = catmap;
+	u32 node_idx;	/* index of the current word in iter->bitmap[] */
+	u32 node_bit;	/* bit position inside that word */
+	NETLBL_CATMAP_MAPTYPE bitmap;
+
+	if (offset > iter->startbit) {
+		while (offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) {
+			iter = iter->next;
+			if (iter == NULL)
+				return -ENOENT;
+		}
+		node_idx = (offset - iter->startbit) / NETLBL_CATMAP_MAPSIZE;
+		node_bit = offset - iter->startbit -
+			   (NETLBL_CATMAP_MAPSIZE * node_idx);
+	} else {
+		node_idx = 0;
+		node_bit = 0;
+	}
+	bitmap = iter->bitmap[node_idx] >> node_bit;
+
+	for (;;) {
+		if (bitmap != 0) {
+			while ((bitmap & NETLBL_CATMAP_BIT) == 0) {
+				bitmap >>= 1;
+				node_bit++;
+			}
+			return iter->startbit +
+				(NETLBL_CATMAP_MAPSIZE * node_idx) + node_bit;
+		}
+		if (++node_idx >= NETLBL_CATMAP_MAPCNT) {
+			if (iter->next != NULL) {
+				iter = iter->next;
+				node_idx = 0;
+			} else
+				return -ENOENT;
+		}
+		bitmap = iter->bitmap[node_idx];
+		node_bit = 0;
+	}
+
+	return -ENOENT;	/* not reached, the loop above only exits by return */
+}
+
+/**
+ * netlbl_secattr_catmap_walk_rng - Find the end of a string of set bits
+ * @catmap: the category bitmap
+ * @offset: the offset to start searching at, in bits
+ *
+ * Description:
+ * This function walks a LSM secattr category bitmap starting at @offset and
+ * returns the spot of the last bit in the string of set bits found there,
+ * i.e. the spot just before the first cleared bit, or -ENOENT if the offset
+ * is past the end of the bitmap.
+ */
+int netlbl_secattr_catmap_walk_rng(struct netlbl_lsm_secattr_catmap *catmap,
+				   u32 offset)
+{
+	struct netlbl_lsm_secattr_catmap *iter = catmap;
+	u32 node_idx;	/* index of the current word in iter->bitmap[] */
+	u32 node_bit;	/* bit position inside that word */
+	NETLBL_CATMAP_MAPTYPE bitmask;
+	NETLBL_CATMAP_MAPTYPE bitmap;
+
+	if (offset > iter->startbit) {
+		while (offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) {
+			iter = iter->next;
+			if (iter == NULL)
+				return -ENOENT;
+		}
+		node_idx = (offset - iter->startbit) / NETLBL_CATMAP_MAPSIZE;
+		node_bit = offset - iter->startbit -
+			   (NETLBL_CATMAP_MAPSIZE * node_idx);
+	} else {
+		node_idx = 0;
+		node_bit = 0;
+	}
+	bitmask = NETLBL_CATMAP_BIT << node_bit;
+
+	for (;;) {
+		bitmap = iter->bitmap[node_idx];
+		while (bitmask != 0 && (bitmap & bitmask) != 0) {
+			bitmask <<= 1;
+			node_bit++;
+		}
+
+		if (bitmask != 0)
+			return iter->startbit +
+				(NETLBL_CATMAP_MAPSIZE * node_idx) +
+				node_bit - 1;
+		else if (++node_idx >= NETLBL_CATMAP_MAPCNT) {
+			if (iter->next == NULL)
+				return iter->startbit +	NETLBL_CATMAP_SIZE - 1;
+			iter = iter->next;
+			node_idx = 0;
+		}
+		bitmask = NETLBL_CATMAP_BIT;
+		node_bit = 0;
+	}
+
+	return -ENOENT;	/* not reached, the loop above only exits by return */
+}
+
+/**
+ * netlbl_secattr_catmap_setbit - Set a bit in a LSM secattr catmap
+ * @catmap: the category bitmap
+ * @bit: the bit to set
+ * @flags: memory allocation flags
+ *
+ * Description:
+ * Set the bit specified by @bit in @catmap, appending a new node when @bit
+ * lies past the last one.  Returns zero on success, -ENOMEM if that node
+ * cannot be allocated and -EINVAL if @bit lies below the node found for it.
+ */
+int netlbl_secattr_catmap_setbit(struct netlbl_lsm_secattr_catmap *catmap,
+				 u32 bit,
+				 gfp_t flags)
+{
+	struct netlbl_lsm_secattr_catmap *iter = catmap;
+	u32 node_bit, node_idx;
+
+	while (iter->next != NULL &&
+	       bit >= (iter->startbit + NETLBL_CATMAP_SIZE))
+		iter = iter->next;
+	if (bit >= (iter->startbit + NETLBL_CATMAP_SIZE)) {
+		iter->next = netlbl_secattr_catmap_alloc(flags);
+		if (iter->next == NULL)
+			return -ENOMEM;
+		iter = iter->next;
+		iter->startbit = bit & ~(NETLBL_CATMAP_SIZE - 1);
+	}
+
+	if (bit < iter->startbit)	/* the subtraction below would wrap */
+		return -EINVAL;
+	node_idx = (bit - iter->startbit) / NETLBL_CATMAP_MAPSIZE;
+	node_bit = bit - iter->startbit - (NETLBL_CATMAP_MAPSIZE * node_idx);
+	iter->bitmap[node_idx] |= NETLBL_CATMAP_BIT << node_bit;
+
+	return 0;
+}
+
+/**
+ * netlbl_secattr_catmap_setrng - Set a range of bits in a LSM secattr catmap
+ * @catmap: the category bitmap
+ * @start: the starting bit
+ * @end: the last bit in the string
+ * @flags: memory allocation flags
+ *
+ * Description:
+ * Set a range of bits, starting at @start and ending with @end, inclusive.
+ * New nodes are allocated with @flags as needed.  Returns zero on success,
+ * negative values on failure; bits set before a failure stay set.
+ */
+int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap,
+				 u32 start,
+				 u32 end,
+				 gfp_t flags)
+{
+	int ret_val = 0;
+	struct netlbl_lsm_secattr_catmap *iter = catmap;
+	u32 iter_max_spot;
+	u32 spot;
+
+	/* XXX - This could probably be made a bit faster by combining writes
+	 * to the catmap instead of setting a single bit each time, but for
+	 * right now skipping to the start of the range in the catmap should
+	 * be a nice improvement over calling the individual setbit function
+	 * repeatedly from a loop. */
+
+	while (iter->next != NULL &&
+	       start >= (iter->startbit + NETLBL_CATMAP_SIZE))
+		iter = iter->next;
+	iter_max_spot = iter->startbit + NETLBL_CATMAP_SIZE;
+
+	for (spot = start; spot <= end && ret_val == 0; spot++) {
+		if (spot >= iter_max_spot && iter->next != NULL) {
+			iter = iter->next;
+			iter_max_spot = iter->startbit + NETLBL_CATMAP_SIZE;
+		}
+		ret_val = netlbl_secattr_catmap_setbit(iter, spot, flags);
+	}
+
+	return ret_val;
+}
+
+/*
+ * LSM Functions
+ */
+
+/**
+ * netlbl_enabled - Determine if the NetLabel subsystem is enabled
+ *
+ * Description:
+ * The LSM can use this function to determine if it should use NetLabel
+ * security attributes in its enforcement mechanism.  Currently, NetLabel is
+ * considered to be enabled when its configuration contains a valid setup for
+ * at least one labeled protocol (i.e. NetLabel can understand incoming
+ * labeled packets of at least one type); otherwise NetLabel is considered to
+ * be disabled.  Returns one if NetLabel is enabled and zero if it is not.
+ *
+ */
+int netlbl_enabled(void)
+{
+	/* At some point we probably want to expose this mechanism to the user
+	 * as well so that admins can toggle NetLabel regardless of the
+	 * configuration */
+	return (atomic_read(&netlabel_mgmt_protocount) > 0);
+}
+
+/**
+ * netlbl_sock_setattr - Label a socket using the correct protocol
+ * @sk: the socket to label
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Looks up the domain mapping for @secattr and attaches the matching label
+ * to @sk.  This function requires exclusive access to @sk, which means it
+ * either needs to be in the process of being created or locked.  Returns
+ * zero on success, -EDESTADDRREQ if the domain is configured to use network
+ * address selectors (can't blindly label the socket), -ENOENT if there is no
+ * usable mapping, -EPROTONOSUPPORT for an unsupported @family, and negative
+ * values on all other failures.
+ */
+int netlbl_sock_setattr(struct sock *sk,
+			u16 family,
+			const struct netlbl_lsm_secattr *secattr)
+{
+	int rc = -ENOENT;
+	struct netlbl_dom_map *map;
+
+	/* the lookup and every use of its result stay inside one RCU
+	 * read-side critical section */
+	rcu_read_lock();
+	map = netlbl_domhsh_getentry(secattr->domain);
+	if (map == NULL)
+		goto out_unlock;
+
+	if (family == AF_INET) {
+		u32 type = map->type;
+
+		/* address selectors need a destination to pick a label */
+		if (type == NETLBL_NLTYPE_ADDRSELECT)
+			rc = -EDESTADDRREQ;
+		else if (type == NETLBL_NLTYPE_CIPSOV4)
+			rc = cipso_v4_sock_setattr(sk,
+						   map->type_def.cipsov4,
+						   secattr);
+		else if (type == NETLBL_NLTYPE_UNLABELED)
+			rc = 0;
+		/* any other mapping type keeps the initial -ENOENT */
+		goto out_unlock;
+	}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (family == AF_INET6) {
+		/* no IPv6 labeling protocols are supported yet, so there
+		 * is nothing to attach and nothing that can fail */
+		rc = 0;
+		goto out_unlock;
+	}
+#endif /* IPv6 */
+
+	/* every other family is rejected */
+	rc = -EPROTONOSUPPORT;
+
+out_unlock:
+	rcu_read_unlock();
+	return rc;
+}
+
+/**
+ * netlbl_sock_delattr - Delete all the NetLabel labels on a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Remove all the NetLabel labeling from @sk.  The caller is responsible for
+ * ensuring that @sk is locked.  The work is done by cipso_v4_sock_delattr(),
+ * the only removal routine called here.
+ */
+void netlbl_sock_delattr(struct sock *sk)
+{
+	cipso_v4_sock_delattr(sk);
+}
+
+/**
+ * netlbl_sock_getattr - Determine the security attributes of a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Checks @sk for NetLabel style labeling and, if some has been applied,
+ * parses the socket label into @secattr.  Returns zero on success, -ENOMSG
+ * for IPv6 sockets as they never carry a label, -EPROTONOSUPPORT for
+ * sockets of any other family except AF_INET, and negative values on all
+ * other failures.
+ */
+int netlbl_sock_getattr(struct sock *sk,
+			struct netlbl_lsm_secattr *secattr)
+{
+	const u16 family = sk->sk_family;
+
+	/* CIPSOv4 is the only protocol which is asked for a socket label,
+	 * and it is only asked for AF_INET sockets */
+	if (family == AF_INET)
+		return cipso_v4_sock_getattr(sk, secattr);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	/* IPv6 sockets are recognised, but with no IPv6 labeling protocol
+	 * there is never a label to report */
+	if (family == AF_INET6)
+		return -ENOMSG;
+#endif /* IPv6 */
+
+	/* neither IPv4 nor (configured) IPv6 */
+	return -EPROTONOSUPPORT;
+}
+
+/**
+ * netlbl_conn_setattr - Label a connected socket using the correct protocol
+ * @sk: the socket to label
+ * @addr: the destination address
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given connected socket using the security
+ * attributes specified in @secattr; for AF_INET the mapping is looked up with
+ * the address in @addr.  The caller is responsible for ensuring that @sk is
+ * locked.  Returns zero on success, negative values on failure.
+ */
+int netlbl_conn_setattr(struct sock *sk,
+			struct sockaddr *addr,
+			const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct sockaddr_in *addr4;
+	struct netlbl_domaddr4_map *af4_entry;
+
+	rcu_read_lock();
+	switch (addr->sa_family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)addr;
+		af4_entry = netlbl_domhsh_getentry_af4(secattr->domain,
+						       addr4->sin_addr.s_addr);
+		if (af4_entry == NULL) {
+			ret_val = -ENOENT;
+			goto conn_setattr_return;
+		}
+		switch (af4_entry->type) {
+		case NETLBL_NLTYPE_CIPSOV4:
+			ret_val = cipso_v4_sock_setattr(sk,
+						   af4_entry->type_def.cipsov4,
+						   secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			cipso_v4_sock_delattr(sk);
+			ret_val = 0;
+			break;
+		default:
+			ret_val = -ENOENT;	/* unhandled mapping type */
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		/* since we don't support any IPv6 labeling protocols right
+		 * now we can optimize everything away until we do */
+		ret_val = 0;
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = -EPROTONOSUPPORT;
+	}
+
+conn_setattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/**
+ * netlbl_req_setattr - Label a request socket using the correct protocol
+ * @req: the request socket to label
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given socket using the security attributes
+ * specified in @secattr, resolving address selector domains with the remote
+ * address of @req.  Returns zero on success, negative values on failure.
+ */
+int netlbl_req_setattr(struct request_sock *req,
+		       const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct netlbl_dom_map *dom_entry;
+	struct netlbl_domaddr4_map *af4_entry;
+	u32 proto_type;
+	struct cipso_v4_doi *proto_cv4;
+
+	rcu_read_lock();
+	dom_entry = netlbl_domhsh_getentry(secattr->domain);
+	if (dom_entry == NULL) {
+		ret_val = -ENOENT;
+		goto req_setattr_return;
+	}
+	switch (req->rsk_ops->family) {
+	case AF_INET:
+		if (dom_entry->type == NETLBL_NLTYPE_ADDRSELECT) {
+			struct inet_request_sock *req_inet = inet_rsk(req);
+			af4_entry = netlbl_domhsh_getentry_af4(secattr->domain,
+							    req_inet->rmt_addr);
+			if (af4_entry == NULL) {
+				ret_val = -ENOENT;
+				goto req_setattr_return;
+			}
+			proto_type = af4_entry->type;
+			proto_cv4 = af4_entry->type_def.cipsov4;
+		} else {
+			proto_type = dom_entry->type;
+			proto_cv4 = dom_entry->type_def.cipsov4;
+		}
+		switch (proto_type) {
+		case NETLBL_NLTYPE_CIPSOV4:
+			ret_val = cipso_v4_req_setattr(req, proto_cv4, secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			cipso_v4_req_delattr(req);
+			ret_val = 0;
+			break;
+		default:
+			ret_val = -ENOENT;	/* unhandled mapping type */
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		/* since we don't support any IPv6 labeling protocols right
+		 * now we can optimize everything away until we do */
+		ret_val = 0;
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = -EPROTONOSUPPORT;
+	}
+
+req_setattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/**
+ * netlbl_req_delattr - Delete all the NetLabel labels on a request socket
+ * @req: the request socket
+ *
+ * Description:
+ * Remove all the NetLabel labeling from @req.
+ *
+ */
+void netlbl_req_delattr(struct request_sock *req)
+{
+	cipso_v4_req_delattr(req);
+}
+
+/**
+ * netlbl_skbuff_setattr - Label a packet using the correct protocol
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Attach the correct label to the given packet using the security attributes
+ * specified in @secattr; the mapping is selected with the packet's IPv4
+ * destination address.  Returns zero on success, negative values on failure.
+ */
+int netlbl_skbuff_setattr(struct sk_buff *skb,
+			  u16 family,
+			  const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	struct iphdr *hdr4;
+	struct netlbl_domaddr4_map *af4_entry;
+
+	rcu_read_lock();
+	switch (family) {
+	case AF_INET:
+		hdr4 = ip_hdr(skb);
+		af4_entry = netlbl_domhsh_getentry_af4(secattr->domain,
+						       hdr4->daddr);
+		if (af4_entry == NULL) {
+			ret_val = -ENOENT;
+			goto skbuff_setattr_return;
+		}
+		switch (af4_entry->type) {
+		case NETLBL_NLTYPE_CIPSOV4:
+			ret_val = cipso_v4_skbuff_setattr(skb,
+						   af4_entry->type_def.cipsov4,
+						   secattr);
+			break;
+		case NETLBL_NLTYPE_UNLABELED:
+			/* just delete the protocols we support for right now
+			 * but we could remove other protocols if needed */
+			ret_val = cipso_v4_skbuff_delattr(skb);
+			break;
+		default:
+			ret_val = -ENOENT;	/* unhandled mapping type */
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		/* since we don't support any IPv6 labeling protocols right
+		 * now we can optimize everything away until we do */
+		ret_val = 0;
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = -EPROTONOSUPPORT;
+	}
+
+skbuff_setattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/**
+ * netlbl_skbuff_getattr - Determine the security attributes of a packet
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Looks for a recognized form of packet labeling in @skb and, if one is
+ * present and parses cleanly, returns its security attributes in @secattr.
+ * Packets without a usable label are handed to netlbl_unlabel_getattr()
+ * instead and its result is returned.  Returns zero on success, negative
+ * values on failure.
+ */
+int netlbl_skbuff_getattr(const struct sk_buff *skb,
+			  u16 family,
+			  struct netlbl_lsm_secattr *secattr)
+{
+	/* a CIPSOv4 option on an AF_INET packet is the only packet label
+	 * this function looks for */
+	if (family != AF_INET)
+		goto unlabeled;
+	if (!CIPSO_V4_OPTEXIST(skb))
+		goto unlabeled;
+	if (cipso_v4_skbuff_getattr(skb, secattr) == 0)
+		return 0;
+
+unlabeled:
+	/* no usable packet label was found, so let
+	 * netlbl_unlabel_getattr() supply the attributes */
+	return netlbl_unlabel_getattr(skb, family, secattr);
+}
+
+/**
+ * netlbl_skbuff_err - Handle a LSM error on a sk_buff
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: true if host is acting as a gateway, false otherwise
+ *
+ * Description:
+ * Deal with a LSM problem when handling the packet in @skb, typically this is
+ * a permission denied problem (-EACCES).  The correct action is determined
+ * according to the packet's labeling protocol; packets without a CIPSOv4
+ * option are left alone.
+ */
+void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
+{
+	if (CIPSO_V4_OPTEXIST(skb))
+		cipso_v4_error(skb, error, gateway);
+}
+
+/**
+ * netlbl_cache_invalidate - Invalidate all of the NetLabel protocol caches
+ *
+ * Description:
+ * For all of the NetLabel protocols that support some form of label mapping
+ * cache, invalidate the cache.  Only the CIPSOv4 cache is invalidated here.
+ * This function does not return a value.
+ *
+ */
+void netlbl_cache_invalidate(void)
+{
+	cipso_v4_cache_invalidate();
+}
+
+/**
+ * netlbl_cache_add - Add an entry to a NetLabel protocol cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Stores the LSM security attributes of the given packet in the label mapping
+ * cache of the protocol that labeled it.  Returns zero on success, -ENOMSG if
+ * @secattr is not flagged NETLBL_SECATTR_CACHE or @skb has no CIPSOv4 option,
+ * and negative values on other errors.
+ */
+int netlbl_cache_add(const struct sk_buff *skb,
+		     const struct netlbl_lsm_secattr *secattr)
+{
+	/* only attributes flagged as cacheable, on a packet carrying a
+	 * CIPSOv4 option, are handed to a protocol cache */
+	if (!(secattr->flags & NETLBL_SECATTR_CACHE))
+		return -ENOMSG;
+	if (!CIPSO_V4_OPTEXIST(skb))
+		return -ENOMSG;
+	return cipso_v4_cache_add(skb, secattr);
+}
+
+/*
+ * Protocol Engine Functions
+ */
+
+/**
+ * netlbl_audit_start - Start an audit message
+ * @type: audit message type
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Start an audit message using the type specified in @type and fill the audit
+ * message with some fields common to all NetLabel audit messages.  This
+ * function should only be used by protocol engines, not LSMs.  Returns a
+ * pointer to the audit buffer on success, NULL on failure.  The work itself
+ * is done by netlbl_audit_start_common().
+ */
+struct audit_buffer *netlbl_audit_start(int type,
+					struct netlbl_audit *audit_info)
+{
+	return netlbl_audit_start_common(type, audit_info);
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * netlbl_init - Initialize NetLabel
+ *
+ * Description:
+ * Perform the required NetLabel initialization before first use.  Returns
+ * zero on success; a failure of any step ends in panic(), not an error code.
+ */
+static int __init netlbl_init(void)
+{
+	int ret_val;
+
+	printk(KERN_INFO "NetLabel: Initializing\n");
+	printk(KERN_INFO "NetLabel:  domain hash size = %u\n",
+	       (1 << NETLBL_DOMHSH_BITSIZE));
+	printk(KERN_INFO "NetLabel:  protocols ="
+	       " UNLABELED"
+	       " CIPSOv4"
+	       "\n");
+
+	ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
+	if (ret_val != 0)
+		goto init_failure;
+
+	ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
+	if (ret_val != 0)
+		goto init_failure;
+
+	ret_val = netlbl_netlink_init();
+	if (ret_val != 0)
+		goto init_failure;
+
+	ret_val = netlbl_unlabel_defconf();
+	if (ret_val != 0)
+		goto init_failure;
+	printk(KERN_INFO "NetLabel:  unlabeled traffic allowed by default\n");
+
+	return 0;
+
+init_failure:
+	panic("NetLabel: failed to initialize properly (%d)\n", ret_val);
+}
+
+subsys_initcall(netlbl_init);
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
new file mode 100644
index 00000000..4f251b19
--- /dev/null
+++ b/net/netlabel/netlabel_mgmt.c
@@ -0,0 +1,785 @@
+/*
+ * NetLabel Management Support
+ *
+ * This file defines the management functions for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/atomic.h>
+
+#include "netlabel_domainhash.h"
+#include "netlabel_user.h"
+#include "netlabel_mgmt.h"
+
+/* NetLabel configured protocol counter (declared extern in netlabel_mgmt.h) */
+atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0);
+
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+	struct netlink_callback *nl_cb;	/* dump callback of the request */
+	struct sk_buff *skb;		/* buffer the replies are written to */
+	u32 seq;			/* sequence number of the next reply */
+};
+
+/* NetLabel Generic NETLINK management family */
+static struct genl_family netlbl_mgmt_gnl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_MGMT_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_MGMT_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy (address attributes have no entry) */
+static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
+	[NLBL_MGMT_A_DOMAIN] = { .type = NLA_NUL_STRING },
+	[NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 },
+	[NLBL_MGMT_A_VERSION] = { .type = NLA_U32 },
+	[NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 },
+};
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * netlbl_mgmt_add_common - Handle an ADD message
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Helper function for the ADD and ADDDEF messages to add the domain mappings
+ * from the message to the hash table.  See netlabel.h for a description of the
+ * message format.  Returns zero on success, negative values on failure.  All
+ * memory allocated here, including the address selector, is freed on failure.
+ */
+static int netlbl_mgmt_add_common(struct genl_info *info,
+				  struct netlbl_audit *audit_info)
+{
+	int ret_val = -EINVAL;
+	struct netlbl_dom_map *entry = NULL;
+	struct netlbl_domaddr_map *addrmap = NULL;
+	struct cipso_v4_doi *cipsov4 = NULL;
+	void *pmap = NULL;	/* address selector map, freed on failure */
+	u32 tmp_val;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL) {
+		ret_val = -ENOMEM;
+		goto add_failure;
+	}
+	entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]);
+	if (info->attrs[NLBL_MGMT_A_DOMAIN]) {
+		size_t tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]);
+		entry->domain = kmalloc(tmp_size, GFP_KERNEL);
+		if (entry->domain == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		nla_strlcpy(entry->domain,
+			    info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size);
+	}
+
+	/* NOTE: internally we allow/use an entry->type value of
+	 *       NETLBL_NLTYPE_ADDRSELECT but we don't currently allow users
+	 *       to pass that as a protocol value because we need to know the
+	 *       "real" protocol */
+
+	switch (entry->type) {
+	case NETLBL_NLTYPE_UNLABELED:
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		if (!info->attrs[NLBL_MGMT_A_CV4DOI])
+			goto add_failure;
+
+		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
+		cipsov4 = cipso_v4_doi_getdef(tmp_val);
+		if (cipsov4 == NULL)
+			goto add_failure;
+		entry->type_def.cipsov4 = cipsov4;
+		break;
+	default:
+		goto add_failure;
+	}
+
+	if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) {
+		struct in_addr *addr;
+		struct in_addr *mask;
+		struct netlbl_domaddr4_map *map;
+
+		addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL);
+		if (addrmap == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV4ADDR]) !=
+		    sizeof(struct in_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV4MASK]) !=
+		    sizeof(struct in_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		addr = nla_data(info->attrs[NLBL_MGMT_A_IPV4ADDR]);
+		mask = nla_data(info->attrs[NLBL_MGMT_A_IPV4MASK]);
+
+		map = kzalloc(sizeof(*map), GFP_KERNEL);
+		if (map == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		pmap = map;
+		map->list.addr = addr->s_addr & mask->s_addr;
+		map->list.mask = mask->s_addr;
+		map->list.valid = 1;
+		map->type = entry->type;
+		if (cipsov4)
+			map->type_def.cipsov4 = cipsov4;
+
+		ret_val = netlbl_af4list_add(&map->list, &addrmap->list4);
+		if (ret_val != 0)
+			goto add_failure;
+
+		entry->type = NETLBL_NLTYPE_ADDRSELECT;
+		entry->type_def.addrsel = addrmap;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	} else if (info->attrs[NLBL_MGMT_A_IPV6ADDR]) {
+		struct in6_addr *addr;
+		struct in6_addr *mask;
+		struct netlbl_domaddr6_map *map;
+
+		addrmap = kzalloc(sizeof(*addrmap), GFP_KERNEL);
+		if (addrmap == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		INIT_LIST_HEAD(&addrmap->list4);
+		INIT_LIST_HEAD(&addrmap->list6);
+
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV6ADDR]) !=
+		    sizeof(struct in6_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		if (nla_len(info->attrs[NLBL_MGMT_A_IPV6MASK]) !=
+		    sizeof(struct in6_addr)) {
+			ret_val = -EINVAL;
+			goto add_failure;
+		}
+		addr = nla_data(info->attrs[NLBL_MGMT_A_IPV6ADDR]);
+		mask = nla_data(info->attrs[NLBL_MGMT_A_IPV6MASK]);
+
+		map = kzalloc(sizeof(*map), GFP_KERNEL);
+		if (map == NULL) {
+			ret_val = -ENOMEM;
+			goto add_failure;
+		}
+		pmap = map;
+		ipv6_addr_copy(&map->list.addr, addr);
+		map->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+		map->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+		map->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+		map->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+		ipv6_addr_copy(&map->list.mask, mask);
+		map->list.valid = 1;
+		map->type = entry->type;
+
+		ret_val = netlbl_af6list_add(&map->list, &addrmap->list6);
+		if (ret_val != 0)
+			goto add_failure;
+
+		entry->type = NETLBL_NLTYPE_ADDRSELECT;
+		entry->type_def.addrsel = addrmap;
+#endif /* IPv6 */
+	}
+
+	ret_val = netlbl_domhsh_add(entry, audit_info);
+	if (ret_val != 0)
+		goto add_failure;
+
+	return 0;
+
+add_failure:
+	if (cipsov4)
+		cipso_v4_doi_putdef(cipsov4);
+	if (entry)
+		kfree(entry->domain);
+	kfree(pmap);	/* also covers a failed netlbl_domhsh_add() */
+	kfree(addrmap);
+	kfree(entry);
+	return ret_val;
+}
+
+/**
+ * netlbl_mgmt_listentry - List a NetLabel/LSM domain map entry
+ * @skb: the NETLINK buffer
+ * @entry: the map entry
+ *
+ * Description:
+ * This function is a helper function used by the LISTALL and LISTDEF command
+ * handlers.  The caller is responsible for ensuring that the RCU read lock
+ * is held.  Returns zero on success, negative values on failure.
+ * On failure @skb may hold a partial entry which the caller must discard.
+ */
+static int netlbl_mgmt_listentry(struct sk_buff *skb,
+				 struct netlbl_dom_map *entry)
+{
+	int ret_val = 0;
+	struct nlattr *nla_a;
+	struct nlattr *nla_b;
+	struct netlbl_af4list *iter4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+#endif
+
+	if (entry->domain != NULL) {
+		ret_val = nla_put_string(skb,
+					 NLBL_MGMT_A_DOMAIN, entry->domain);
+		if (ret_val != 0)
+			return ret_val;
+	}
+
+	switch (entry->type) {	/* any other type adds no attributes */
+	case NETLBL_NLTYPE_ADDRSELECT:
+		nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST);
+		if (nla_a == NULL)
+			return -ENOMEM;
+
+		netlbl_af4list_foreach_rcu(iter4,
+					   &entry->type_def.addrsel->list4) {
+			struct netlbl_domaddr4_map *map4;
+			struct in_addr addr_struct;
+
+			nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR);
+			if (nla_b == NULL)
+				return -ENOMEM;
+
+			addr_struct.s_addr = iter4->addr;
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV4ADDR,
+					  sizeof(struct in_addr),
+					  &addr_struct);
+			if (ret_val != 0)
+				return ret_val;
+			addr_struct.s_addr = iter4->mask;
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV4MASK,
+					  sizeof(struct in_addr),
+					  &addr_struct);
+			if (ret_val != 0)
+				return ret_val;
+			map4 = netlbl_domhsh_addr4_entry(iter4);
+			ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+					      map4->type);
+			if (ret_val != 0)
+				return ret_val;
+			switch (map4->type) {	/* protocol specific attributes */
+			case NETLBL_NLTYPE_CIPSOV4:
+				ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
+						  map4->type_def.cipsov4->doi);
+				if (ret_val != 0)
+					return ret_val;
+				break;
+			}
+
+			nla_nest_end(skb, nla_b);
+		}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		netlbl_af6list_foreach_rcu(iter6,
+					   &entry->type_def.addrsel->list6) {
+			struct netlbl_domaddr6_map *map6;
+
+			nla_b = nla_nest_start(skb, NLBL_MGMT_A_ADDRSELECTOR);
+			if (nla_b == NULL)
+				return -ENOMEM;
+
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV6ADDR,
+					  sizeof(struct in6_addr),
+					  &iter6->addr);
+			if (ret_val != 0)
+				return ret_val;
+			ret_val = nla_put(skb, NLBL_MGMT_A_IPV6MASK,
+					  sizeof(struct in6_addr),
+					  &iter6->mask);
+			if (ret_val != 0)
+				return ret_val;
+			map6 = netlbl_domhsh_addr6_entry(iter6);
+			ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+					      map6->type);
+			if (ret_val != 0)
+				return ret_val;
+
+			nla_nest_end(skb, nla_b);
+		}
+#endif /* IPv6 */
+
+		nla_nest_end(skb, nla_a);
+		break;
+	case NETLBL_NLTYPE_UNLABELED:
+		ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type);
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, entry->type);
+		if (ret_val != 0)
+			return ret_val;
+		ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
+				      entry->type_def.cipsov4->doi);
+		break;
+	}
+
+	return ret_val;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_mgmt_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Validate the attributes of a user generated ADD message and pass it to
+ * netlbl_mgmt_add_common() to add the domain mapping to the hash table.  See
+ * netlabel_mgmt.h for the message format.  Returns zero on success, negative
+ * values on failure.
+ */
+static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **attrs = info->attrs;
+	struct netlbl_audit audit_info;
+
+	/* a domain and a protocol are mandatory */
+	if (!attrs[NLBL_MGMT_A_DOMAIN] || !attrs[NLBL_MGMT_A_PROTOCOL])
+		return -EINVAL;
+	/* IPv4 and IPv6 selectors are mutually exclusive */
+	if ((attrs[NLBL_MGMT_A_IPV4ADDR] && attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+	    (attrs[NLBL_MGMT_A_IPV4MASK] && attrs[NLBL_MGMT_A_IPV6MASK]))
+		return -EINVAL;
+	/* an address and its mask must be supplied together */
+	if (!attrs[NLBL_MGMT_A_IPV4ADDR] != !attrs[NLBL_MGMT_A_IPV4MASK] ||
+	    !attrs[NLBL_MGMT_A_IPV6ADDR] != !attrs[NLBL_MGMT_A_IPV6MASK])
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+	return netlbl_mgmt_add_common(info, &audit_info);
+}
+
+/**
+ * netlbl_mgmt_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Remove the domain mapping named by the NLBL_MGMT_A_DOMAIN attribute of a
+ * user generated REMOVE message.  Returns zero on success, negative values
+ * on failure, -EINVAL if the attribute is missing.
+ */
+static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *dom_attr = info->attrs[NLBL_MGMT_A_DOMAIN];
+	struct netlbl_audit audit_info;
+
+	if (!dom_attr)
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	/* the attribute payload is the domain string itself */
+	return netlbl_domhsh_remove(nla_data(dom_attr), &audit_info);
+}
+
+/**
+ * netlbl_mgmt_listall_cb - netlbl_domhsh_walk() callback for LISTALL
+ * @entry: the domain mapping hash table entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * Callback handed to netlbl_domhsh_walk() when answering a LISTALL message.
+ * One NLM_F_MULTI message describing @entry is appended to the buffer in
+ * @arg and the sequence number in @arg is advanced.  Returns the size of the
+ * message on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg)
+{
+	struct netlbl_domhsh_walk_arg *walk = arg;
+	int err = -ENOMEM;
+	void *hdr;
+
+	hdr = genlmsg_put(walk->skb, NETLINK_CB(walk->nl_cb->skb).pid,
+			  walk->seq, &netlbl_mgmt_gnl_family,
+			  NLM_F_MULTI, NLBL_MGMT_C_LISTALL);
+	if (!hdr)
+		goto cancel_msg;
+
+	err = netlbl_mgmt_listentry(walk->skb, entry);
+	if (err)
+		goto cancel_msg;
+
+	walk->seq++;
+	return genlmsg_end(walk->skb, hdr);
+
+cancel_msg:
+	genlmsg_cancel(walk->skb, hdr);
+	return err;
+}
+
+/**
+ * netlbl_mgmt_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Dump handler for a user generated LISTALL message.  The domain hash table
+ * is walked with netlbl_mgmt_listall_cb() writing to @skb; the walk position
+ * is read from and stored back to @cb->args[0..1].  Returns the length of @skb.
+ *
+ */
+static int netlbl_mgmt_listall(struct sk_buff *skb,
+			       struct netlink_callback *cb)
+{
+	u32 bucket = cb->args[0];
+	u32 chain = cb->args[1];
+	struct netlbl_domhsh_walk_arg walk = {
+		.nl_cb = cb,
+		.skb = skb,
+		.seq = cb->nlh->nlmsg_seq,
+	};
+
+	/* bucket/chain are passed by reference; the result is not checked */
+	netlbl_domhsh_walk(&bucket, &chain, netlbl_mgmt_listall_cb, &walk);
+
+	/* store the position back for the next invocation of this dump */
+	cb->args[0] = bucket;
+	cb->args[1] = chain;
+
+	return skb->len;
+}
+
+/**
+ * netlbl_mgmt_adddef - Handle an ADDDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Validate the attributes of a user generated ADDDEF message and pass it to
+ * netlbl_mgmt_add_common().  Unlike ADD no domain attribute is required.
+ * Returns zero on success, negative values on failure.
+ */
+static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **attrs = info->attrs;
+	struct netlbl_audit audit_info;
+
+	if (!attrs[NLBL_MGMT_A_PROTOCOL])
+		return -EINVAL;
+	/* IPv4 and IPv6 selectors are mutually exclusive */
+	if ((attrs[NLBL_MGMT_A_IPV4ADDR] && attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+	    (attrs[NLBL_MGMT_A_IPV4MASK] && attrs[NLBL_MGMT_A_IPV6MASK]))
+		return -EINVAL;
+	/* an address and its mask must be supplied together */
+	if (!attrs[NLBL_MGMT_A_IPV4ADDR] != !attrs[NLBL_MGMT_A_IPV4MASK] ||
+	    !attrs[NLBL_MGMT_A_IPV6ADDR] != !attrs[NLBL_MGMT_A_IPV6MASK])
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+	return netlbl_mgmt_add_common(info, &audit_info);
+}
+
+/**
+ * netlbl_mgmt_removedef - Handle a REMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVEDEF message and remove the default domain
+ * mapping.  Returns zero on success, negative values on failure.
+ * No attribute of the request is examined; @info is not used.
+ */
+static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlbl_audit audit_info;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	return netlbl_domhsh_remove_default(&audit_info);
+}
+
+/**
+ * netlbl_mgmt_listdef - Handle a LISTDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Answer a user generated LISTDEF message with a single reply describing the
+ * default domain mapping.  Returns zero on success, negative values on
+ * failure, -ENOENT if there is no default mapping.
+ *
+ */
+static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlbl_dom_map *def_entry;
+	struct sk_buff *reply;
+	void *hdr;
+	int err = -ENOMEM;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+	hdr = genlmsg_put_reply(reply, info, &netlbl_mgmt_gnl_family,
+				0, NLBL_MGMT_C_LISTDEF);
+	if (!hdr)
+		goto drop_reply;
+
+	/* look up and serialize the entry under the RCU read lock, which
+	 * netlbl_mgmt_listentry() requires its caller to hold */
+	rcu_read_lock();
+	def_entry = netlbl_domhsh_getentry(NULL);
+	if (def_entry)
+		err = netlbl_mgmt_listentry(reply, def_entry);
+	else
+		err = -ENOENT;
+	rcu_read_unlock();
+	if (err)
+		goto drop_reply;
+
+	genlmsg_end(reply, hdr);
+	return genlmsg_reply(reply, info);
+
+drop_reply:
+	/* the reply was not sent, so it has to be freed here */
+	kfree_skb(reply);
+	return err;
+}
+
+/**
+ * netlbl_mgmt_protocols_cb - Write an individual PROTOCOL message response
+ * @skb: the skb to write to
+ * @cb: the NETLINK callback
+ * @protocol: the NetLabel protocol to use in the message
+ *
+ * Description:
+ * Helper for netlbl_mgmt_protocols() which appends one NLM_F_MULTI message
+ * carrying @protocol to @skb in answer to an application's PROTOCOLS
+ * message.  Returns the size of the message on success, negative values on
+ * failure.
+ */
+static int netlbl_mgmt_protocols_cb(struct sk_buff *skb,
+				    struct netlink_callback *cb,
+				    u32 protocol)
+{
+	void *hdr;
+	int err = -ENOMEM;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &netlbl_mgmt_gnl_family, NLM_F_MULTI,
+			  NLBL_MGMT_C_PROTOCOLS);
+	if (!hdr)
+		goto cancel_msg;
+
+	err = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, protocol);
+	if (err)
+		goto cancel_msg;
+
+	return genlmsg_end(skb, hdr);
+
+cancel_msg:
+	genlmsg_cancel(skb, hdr);
+	return err;
+}
+
+/**
+ * netlbl_mgmt_protocols - Handle a PROTOCOLS message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Dump handler for a user generated PROTOCOLS message.  One message is
+ * written to @skb for each supported protocol, NETLBL_NLTYPE_UNLABELED first
+ * and NETLBL_NLTYPE_CIPSOV4 second.  The number of protocols written so far
+ * is kept in @cb->args[0] so that a later call continues after the last
+ * protocol that was written successfully.  Returns the length of @skb.
+ *
+ */
+static int netlbl_mgmt_protocols(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	static const u32 protos[] = {
+		NETLBL_NLTYPE_UNLABELED,
+		NETLBL_NLTYPE_CIPSOV4,
+	};
+	u32 sent = cb->args[0];
+
+	/* stop at the first protocol that can not be written to @skb */
+	while (sent < ARRAY_SIZE(protos)) {
+		if (netlbl_mgmt_protocols_cb(skb, cb, protos[sent]) < 0)
+			break;
+		sent++;
+	}
+
+	cb->args[0] = sent;
+
+	return skb->len;
+}
+
+/**
+ * netlbl_mgmt_version - Handle a VERSION message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Answer a user generated VERSION message with a reply that carries
+ * NETLBL_PROTO_VERSION.  Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_mgmt_version(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	int err = -ENOMEM;
+	void *hdr;
+
+	if (!reply)
+		return -ENOMEM;
+
+	hdr = genlmsg_put_reply(reply, info, &netlbl_mgmt_gnl_family,
+				0, NLBL_MGMT_C_VERSION);
+	if (!hdr)
+		goto drop_reply;
+
+	/* the only payload is the protocol version number */
+	err = nla_put_u32(reply, NLBL_MGMT_A_VERSION, NETLBL_PROTO_VERSION);
+	if (err)
+		goto drop_reply;
+
+	genlmsg_end(reply, hdr);
+	return genlmsg_reply(reply, info);
+
+drop_reply:
+	/* the reply was not sent, so it has to be freed here */
+	kfree_skb(reply);
+	return err;
+}
+
+
+/*
+ * NetLabel Generic NETLINK Command Definitions
+ */
+
+static struct genl_ops netlbl_mgmt_genl_ops[] = {
+	{	/* add a domain mapping */
+	.cmd = NLBL_MGMT_C_ADD,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = netlbl_mgmt_add,
+	.dumpit = NULL,
+	},
+	{	/* remove a domain mapping */
+	.cmd = NLBL_MGMT_C_REMOVE,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = netlbl_mgmt_remove,
+	.dumpit = NULL,
+	},
+	{	/* dump all domain mappings */
+	.cmd = NLBL_MGMT_C_LISTALL,
+	.flags = 0,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_mgmt_listall,
+	},
+	{	/* set the default domain mapping */
+	.cmd = NLBL_MGMT_C_ADDDEF,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = netlbl_mgmt_adddef,
+	.dumpit = NULL,
+	},
+	{	/* remove the default domain mapping */
+	.cmd = NLBL_MGMT_C_REMOVEDEF,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = netlbl_mgmt_removedef,
+	.dumpit = NULL,
+	},
+	{	/* report the default domain mapping */
+	.cmd = NLBL_MGMT_C_LISTDEF,
+	.flags = 0,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = netlbl_mgmt_listdef,
+	.dumpit = NULL,
+	},
+	{	/* dump the supported protocols */
+	.cmd = NLBL_MGMT_C_PROTOCOLS,
+	.flags = 0,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_mgmt_protocols,
+	},
+	{	/* report the NetLabel version */
+	.cmd = NLBL_MGMT_C_VERSION,
+	.flags = 0,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = netlbl_mgmt_version,
+	.dumpit = NULL,
+	},
+};
+
+/*
+ * NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_mgmt_genl_init - Register the NetLabel management component
+ *
+ * Description:
+ * Register the NetLabel management component with the Generic NETLINK
+ * mechanism.  Returns zero on success, negative values on failure.
+ * The family is registered with every entry of netlbl_mgmt_genl_ops.
+ */
+int __init netlbl_mgmt_genl_init(void)
+{
+	return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
+		netlbl_mgmt_genl_ops, ARRAY_SIZE(netlbl_mgmt_genl_ops));
+}
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
new file mode 100644
index 00000000..0a25838b
--- /dev/null
+++ b/net/netlabel/netlabel_mgmt.h
@@ -0,0 +1,223 @@
+/*
+ * NetLabel Management Support
+ *
+ * This file defines the management functions for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_MGMT_H
+#define _NETLABEL_MGMT_H
+
+#include <net/netlabel.h>
+#include <asm/atomic.h>
+
+/*
+ * The following NetLabel payloads are supported by the management interface.
+ *
+ * o ADD:
+ *   Sent by an application to add a domain mapping to the NetLabel system.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_MGMT_A_DOMAIN
+ *     NLBL_MGMT_A_PROTOCOL
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_MGMT_A_IPV4ADDR
+ *     NLBL_MGMT_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_MGMT_A_IPV6ADDR
+ *     NLBL_MGMT_A_IPV6MASK
+ *
+ *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
+ *
+ *     NLBL_MGMT_A_CV4DOI
+ *
+ *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ *
+ * o REMOVE:
+ *   Sent by an application to remove a domain mapping from the NetLabel
+ *   system.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_MGMT_A_DOMAIN
+ *
+ * o LISTALL:
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated LISTALL message.  When sent by an
+ *   application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with a series of the following messages.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_MGMT_A_DOMAIN
+ *
+ *   If the IP address selectors are not used the following attribute is
+ *   required:
+ *
+ *     NLBL_MGMT_A_PROTOCOL
+ *
+ *   If the IP address selectors are used then the following attribute is
+ *   required:
+ *
+ *     NLBL_MGMT_A_SELECTORLIST
+ *
+ *   If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following
+ *   attributes are required:
+ *
+ *     NLBL_MGMT_A_CV4DOI
+ *
+ *   If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other
+ *   attributes are required.
+ *
+ * o ADDDEF:
+ *   Sent by an application to set the default domain mapping for the NetLabel
+ *   system.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_MGMT_A_PROTOCOL
+ *
+ *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
+ *
+ *     NLBL_MGMT_A_CV4DOI
+ *
+ *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ *
+ * o REMOVEDEF:
+ *   Sent by an application to remove the default domain mapping from the
+ *   NetLabel system, there is no payload.
+ *
+ * o LISTDEF:
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated LISTDEF message.  When sent by an
+ *   application there is no payload.  On success the kernel should send a
+ *   response using the following format.
+ *
+ *   If the IP address selectors are not used the following attribute is
+ *   required:
+ *
+ *     NLBL_MGMT_A_PROTOCOL
+ *
+ *   If the IP address selectors are used then the following attribute is
+ *   required:
+ *
+ *     NLBL_MGMT_A_SELECTORLIST
+ *
+ *   If the mapping is using the NETLBL_NLTYPE_CIPSOV4 type then the following
+ *   attributes are required:
+ *
+ *     NLBL_MGMT_A_CV4DOI
+ *
+ *   If the mapping is using the NETLBL_NLTYPE_UNLABELED type no other
+ *   attributes are required.
+ *
+ * o PROTOCOLS:
+ *   Sent by an application to request a list of configured NetLabel protocols
+ *   in the kernel.  When sent by an application there is no payload and the
+ *   NLM_F_DUMP flag should be set.  The kernel should respond with a series of
+ *   the following messages.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_MGMT_A_PROTOCOL
+ *
+ * o VERSION:
+ *   Sent by an application to request the NetLabel version.  When sent by an
+ *   application there is no payload.  This message type is also used by the
+ *   kernel to respond to a VERSION request.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_MGMT_A_VERSION
+ *
+ */
+
+/* NetLabel Management commands */
+enum {
+	NLBL_MGMT_C_UNSPEC,
+	NLBL_MGMT_C_ADD,	/* add a domain mapping */
+	NLBL_MGMT_C_REMOVE,	/* remove a domain mapping */
+	NLBL_MGMT_C_LISTALL,	/* list all domain mappings */
+	NLBL_MGMT_C_ADDDEF,	/* set the default domain mapping */
+	NLBL_MGMT_C_REMOVEDEF,	/* remove the default domain mapping */
+	NLBL_MGMT_C_LISTDEF,	/* list the default domain mapping */
+	NLBL_MGMT_C_PROTOCOLS,	/* list the configured protocols */
+	NLBL_MGMT_C_VERSION,	/* report the NetLabel version */
+	__NLBL_MGMT_C_MAX,	/* one past the last command */
+};
+
+/* NetLabel Management attributes */
+enum {
+	NLBL_MGMT_A_UNSPEC,
+	NLBL_MGMT_A_DOMAIN,
+	/* (NLA_NUL_STRING)
+	 * the NUL terminated LSM domain string */
+	NLBL_MGMT_A_PROTOCOL,
+	/* (NLA_U32)
+	 * the NetLabel protocol type (defined by NETLBL_NLTYPE_*) */
+	NLBL_MGMT_A_VERSION,
+	/* (NLA_U32)
+	 * the NetLabel protocol version number (defined by
+	 * NETLBL_PROTO_VERSION) */
+	NLBL_MGMT_A_CV4DOI,
+	/* (NLA_U32)
+	 * the CIPSOv4 DOI value */
+	NLBL_MGMT_A_IPV6ADDR,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address */
+	NLBL_MGMT_A_IPV6MASK,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address mask */
+	NLBL_MGMT_A_IPV4ADDR,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address */
+	NLBL_MGMT_A_IPV4MASK,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address mask */
+	NLBL_MGMT_A_ADDRSELECTOR,
+	/* (NLA_NESTED)
+	 * an IP address selector, must contain an address, mask, and protocol
+	 * attribute plus any protocol specific attributes */
+	NLBL_MGMT_A_SELECTORLIST,
+	/* (NLA_NESTED)
+	 * the selector list, there must be at least one
+	 * NLBL_MGMT_A_ADDRSELECTOR attribute */
+	__NLBL_MGMT_A_MAX,
+};
+#define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1)
+
+/* NetLabel protocol functions */
+int netlbl_mgmt_genl_init(void);
+
+/* NetLabel configured protocol reference counter */
+extern atomic_t netlabel_mgmt_protocount;
+
+#endif
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
new file mode 100644
index 00000000..9c38658f
--- /dev/null
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -0,0 +1,1563 @@
+/*
+ * NetLabel Unlabeled Support
+ *
+ * This file defines functions for dealing with unlabeled packets for the
+ * NetLabel system.  The NetLabel system manages static and dynamic label
+ * mappings for network protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/net_namespace.h>
+#include <net/netlabel.h>
+#include <asm/bug.h>
+#include <asm/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_addrlist.h"
+#include "netlabel_domainhash.h"
+#include "netlabel_unlabeled.h"
+#include "netlabel_mgmt.h"
+
+/* NOTE: at present we always use init's network namespace since we don't
+ *       presently support different namespaces even though the majority of
+ *       the functions in this file are "namespace safe" */
+
+/* The unlabeled connection hash table which we use to map network interfaces
+ * and addresses of unlabeled packets to a user specified secid value for the
+ * LSM.  The hash table is used to lookup the network interface entry
+ * (struct netlbl_unlhsh_iface) and then the interface entry is used to
+ * lookup an IP address match from an ordered list.  If a network interface
+ * match can not be found in the hash table then the default entry
+ * (netlbl_unlhsh_def) is used.  The IP address entry list
+ * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
+ * larger netmask come first.
+ */
+struct netlbl_unlhsh_tbl {
+	struct list_head *tbl;	/* array of bucket list heads, indexed by hash */
+	u32 size;		/* bucket count; the hash masks with size - 1 */
+};
+#define netlbl_unlhsh_addr4_entry(iter) \
+	container_of(iter, struct netlbl_unlhsh_addr4, list)
+struct netlbl_unlhsh_addr4 {
+	u32 secid;		/* user specified LSM secid for this address */
+
+	struct netlbl_af4list list;	/* node on the iface's addr4_list */
+	struct rcu_head rcu;		/* used by kfree_rcu() on removal */
+};
+#define netlbl_unlhsh_addr6_entry(iter) \
+	container_of(iter, struct netlbl_unlhsh_addr6, list)
+struct netlbl_unlhsh_addr6 {
+	u32 secid;		/* user specified LSM secid for this address */
+
+	struct netlbl_af6list list;	/* node on the iface's addr6_list */
+	struct rcu_head rcu;		/* used by kfree_rcu() on removal */
+};
+struct netlbl_unlhsh_iface {
+	int ifindex;			/* device ifindex, 0 for the default */
+	struct list_head addr4_list;	/* IPv4 address entries */
+	struct list_head addr6_list;	/* IPv6 address entries */
+
+	u32 valid;			/* cleared when the entry is unlinked */
+	struct list_head list;		/* linkage in a hash bucket */
+	struct rcu_head rcu;		/* used by call_rcu() to free the entry */
+};
+
+/* Argument struct for netlbl_unlhsh_walk() */
+struct netlbl_unlhsh_walk_arg {
+	struct netlink_callback *nl_cb;	/* its skb supplies the reply's pid */
+	struct sk_buff *skb;		/* buffer the list messages are put in */
+	u32 seq;			/* sequence number for genlmsg_put() */
+};
+
+/* Unlabeled connection hash table */
+/* updates should be so rare that having one spinlock for the entire
+ * hash table should be okay */
+static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
+#define netlbl_unlhsh_rcu_deref(p) \
+	rcu_dereference_check(p, rcu_read_lock_held() || \
+				 lockdep_is_held(&netlbl_unlhsh_lock))
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
+
+/* Accept unlabeled packets flag (only ever set to 0 or 1 by ACCEPT) */
+static u8 netlabel_unlabel_acceptflg = 0;
+
+/* NetLabel Generic NETLINK unlabeled family */
+static struct genl_family netlbl_unlabel_gnl_family = {
+	.id = GENL_ID_GENERATE,
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_UNLABELED_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_UNLABEL_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy (NLA_BINARY .len is only a maximum) */
+static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
+	[NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
+	[NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
+				      .len = sizeof(struct in6_addr) },
+	[NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
+				      .len = sizeof(struct in6_addr) },
+	[NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
+				      .len = sizeof(struct in_addr) },
+	[NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
+				      .len = sizeof(struct in_addr) },
+	[NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
+				   .len = IFNAMSIZ - 1 },
+	[NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
+};
+
+/*
+ * Unlabeled Connection Hash Table Functions
+ */
+
+/**
+ * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that memory allocated to a hash table interface entry can be
+ * released safely.  Any IPv4 and IPv6 address entries still linked on the
+ * interface entry's address lists are removed from those lists and freed
+ * with kfree() before the interface entry itself is freed.  The IPv6 list is
+ * only walked when IPv6 support is configured.
+ *
+ */
+static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
+{
+	struct netlbl_unlhsh_iface *iface;
+	struct netlbl_af4list *iter4;
+	struct netlbl_af4list *tmp4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+	struct netlbl_af6list *tmp6;
+#endif /* IPv6 */
+
+	iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);
+
+	/* no need for locks here since we are the only one with access to this
+	 * structure */
+
+	netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) {
+		netlbl_af4list_remove_entry(iter4);
+		kfree(netlbl_unlhsh_addr4_entry(iter4));
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) {
+		netlbl_af6list_remove_entry(iter6);
+		kfree(netlbl_unlhsh_addr6_entry(iter6));
+	}
+#endif /* IPv6 */
+	kfree(iface);
+}
+
+/**
+ * netlbl_unlhsh_hash - Hashing function for the hash table
+ * @ifindex: the network interface/device to hash
+ *
+ * Description:
+ * This is the hashing function for the unlabeled hash table, it returns the
+ * bucket number for the given device/interface, computed as @ifindex masked
+ * with (table size - 1); this presumably relies on the size being a power of
+ * two (TODO confirm at table init).  The caller is responsible for ensuring
+ * that the hash table is protected with a RCU read lock or the table lock.
+ */
+static u32 netlbl_unlhsh_hash(int ifindex)
+{
+	return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1);
+}
+
+/**
+ * netlbl_unlhsh_search_iface - Search for a matching interface entry
+ * @ifindex: the network interface
+ *
+ * Description:
+ * Searches the unlabeled connection hash table and returns a pointer to the
+ * interface entry which matches @ifindex, otherwise NULL is returned; entries
+ * whose valid flag has been cleared are never returned.  The caller is
+ * responsible for ensuring that the hash table is protected with either a RCU
+ * read lock or the hash table lock.
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
+{
+	u32 bkt;
+	struct list_head *bkt_list;
+	struct netlbl_unlhsh_iface *iter;
+
+	bkt = netlbl_unlhsh_hash(ifindex);
+	bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
+	list_for_each_entry_rcu(iter, bkt_list, list)
+		if (iter->valid && iter->ifindex == ifindex)
+			return iter;
+
+	return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv4 address in network byte order
+ * @mask: IPv4 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface; the stored address is @addr masked by
+ * @mask.  On success zero is returned; if the entry can not be allocated or
+ * inserted nothing is added and a negative value is returned.
+ */
+static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
+				   const struct in_addr *addr,
+				   const struct in_addr *mask,
+				   u32 secid)
+{
+	int ret_val;
+	struct netlbl_unlhsh_addr4 *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	entry->list.addr = addr->s_addr & mask->s_addr;
+	entry->list.mask = mask->s_addr;
+	entry->list.valid = 1;
+	entry->secid = secid;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	ret_val = netlbl_af4list_add(&entry->list, &iface->addr4_list);
+	spin_unlock(&netlbl_unlhsh_lock);
+
+	if (ret_val != 0)
+		kfree(entry);
+	return ret_val;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
+ * @iface: the associated interface entry
+ * @addr: IPv6 address in network byte order
+ * @mask: IPv6 address mask in network byte order
+ * @secid: LSM secid value for entry
+ *
+ * Description:
+ * Add a new address entry into the unlabeled connection hash table using the
+ * interface entry specified by @iface; the stored address is @addr masked by
+ * @mask.  On success zero is returned; if the entry can not be allocated or
+ * inserted nothing is added and a negative value is returned.
+ */
+static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
+				   const struct in6_addr *addr,
+				   const struct in6_addr *mask,
+				   u32 secid)
+{
+	int ret_val;
+	struct netlbl_unlhsh_addr6 *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	ipv6_addr_copy(&entry->list.addr, addr);
+	entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+	entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+	entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+	entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+	ipv6_addr_copy(&entry->list.mask, mask);
+	entry->list.valid = 1;
+	entry->secid = secid;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list);
+	spin_unlock(&netlbl_unlhsh_lock);
+	/* report insertion failures so the caller does not count this entry */
+	if (ret_val != 0)
+		kfree(entry);
+	return ret_val;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
+ * @ifindex: network interface
+ *
+ * Description:
+ * Add a new, empty, interface entry into the unlabeled connection hash table;
+ * an @ifindex that is not greater than zero installs it as the default entry.
+ * On success a pointer to the new interface entry is returned, on failure
+ * (allocation failure or an entry already present) NULL is returned.
+ */
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
+{
+	u32 bkt;
+	struct netlbl_unlhsh_iface *iface;
+
+	iface = kzalloc(sizeof(*iface), GFP_ATOMIC);
+	if (iface == NULL)
+		return NULL;
+
+	iface->ifindex = ifindex;
+	INIT_LIST_HEAD(&iface->addr4_list);
+	INIT_LIST_HEAD(&iface->addr6_list);
+	iface->valid = 1;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	if (ifindex > 0) {
+		bkt = netlbl_unlhsh_hash(ifindex);
+		if (netlbl_unlhsh_search_iface(ifindex) != NULL)
+			goto add_iface_failure;
+		list_add_tail_rcu(&iface->list,
+			     &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]);
+	} else {
+		INIT_LIST_HEAD(&iface->list);
+		if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL)
+			goto add_iface_failure;
+		rcu_assign_pointer(netlbl_unlhsh_def, iface);
+	}
+	spin_unlock(&netlbl_unlhsh_lock);
+
+	return iface;
+
+add_iface_failure:
+	spin_unlock(&netlbl_unlhsh_lock);
+	kfree(iface);
+	return NULL;
+}
+
+/**
+ * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @secid: LSM secid value for the entry
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Adds a new entry to the unlabeled connection hash table, creating the
+ * interface entry if needed; a NULL @dev_name selects the default interface
+ * entry.  Returns zero on success, negative values on failure.
+ */
+int netlbl_unlhsh_add(struct net *net,
+		      const char *dev_name,
+		      const void *addr,
+		      const void *mask,
+		      u32 addr_len,
+		      u32 secid,
+		      struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	int ifindex;
+	struct net_device *dev;
+	struct netlbl_unlhsh_iface *iface;
+	struct audit_buffer *audit_buf = NULL;
+	char *secctx = NULL;
+	u32 secctx_len;
+
+	if (addr_len != sizeof(struct in_addr) &&
+	    addr_len != sizeof(struct in6_addr))
+		return -EINVAL;
+
+	rcu_read_lock();
+	if (dev_name != NULL) {
+		dev = dev_get_by_name_rcu(net, dev_name);
+		if (dev == NULL) {
+			ret_val = -ENODEV;
+			goto unlhsh_add_return;
+		}
+		ifindex = dev->ifindex;
+		iface = netlbl_unlhsh_search_iface(ifindex);
+	} else {
+		ifindex = 0;
+		iface = rcu_dereference(netlbl_unlhsh_def);
+	}
+	if (iface == NULL) {
+		iface = netlbl_unlhsh_add_iface(ifindex);
+		if (iface == NULL) {
+			ret_val = -ENOMEM;
+			goto unlhsh_add_return;
+		}
+	}
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
+					      audit_info);
+	switch (addr_len) {
+	case sizeof(struct in_addr): {
+		struct in_addr *addr4, *mask4;
+
+		addr4 = (struct in_addr *)addr;
+		mask4 = (struct in_addr *)mask;
+		ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid);
+		if (audit_buf != NULL)
+			netlbl_af4list_audit_addr(audit_buf, 1,
+						  dev_name,
+						  addr4->s_addr,
+						  mask4->s_addr);
+		break;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case sizeof(struct in6_addr): {
+		struct in6_addr *addr6, *mask6;
+
+		addr6 = (struct in6_addr *)addr;
+		mask6 = (struct in6_addr *)mask;
+		ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid);
+		if (audit_buf != NULL)
+			netlbl_af6list_audit_addr(audit_buf, 1,
+						  dev_name,
+						  addr6, mask6);
+		break;
+	}
+#endif /* IPv6 */
+	default:
+		ret_val = -EINVAL;
+	}
+	if (ret_val == 0)
+		atomic_inc(&netlabel_mgmt_protocount);
+
+unlhsh_add_return:
+	rcu_read_unlock();
+	if (audit_buf != NULL) {
+		if (security_secid_to_secctx(secid,
+					     &secctx,
+					     &secctx_len) == 0) {
+			audit_log_format(audit_buf, " sec_obj=%s", secctx);
+			security_release_secctx(secctx, secctx_len);
+		}
+		audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+	return ret_val;
+}
+
+/**
+ * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table and
+ * release it with kfree_rcu().  Returns zero on success, -ENOENT if no
+ * matching entry was found.
+ */
+static int netlbl_unlhsh_remove_addr4(struct net *net,
+				      struct netlbl_unlhsh_iface *iface,
+				      const struct in_addr *addr,
+				      const struct in_addr *mask,
+				      struct netlbl_audit *audit_info)
+{
+	struct netlbl_af4list *list_entry;
+	struct netlbl_unlhsh_addr4 *entry;
+	struct audit_buffer *audit_buf;
+	struct net_device *dev;
+	char *secctx;
+	u32 secctx_len;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
+					   &iface->addr4_list);
+	spin_unlock(&netlbl_unlhsh_lock);
+	if (list_entry != NULL)
+		entry = netlbl_unlhsh_addr4_entry(list_entry);
+	else
+		entry = NULL;
+
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+					      audit_info);
+	if (audit_buf != NULL) {
+		dev = dev_get_by_index(net, iface->ifindex);
+		netlbl_af4list_audit_addr(audit_buf, 1,
+					  (dev != NULL ? dev->name : NULL),
+					  addr->s_addr, mask->s_addr);
+		if (dev != NULL)
+			dev_put(dev);
+		if (entry != NULL &&
+		    security_secid_to_secctx(entry->secid,
+					     &secctx, &secctx_len) == 0) {
+			audit_log_format(audit_buf, " sec_obj=%s", secctx);
+			security_release_secctx(secctx, secctx_len);
+		}
+		audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	if (entry == NULL)
+		return -ENOENT;
+
+	kfree_rcu(entry, rcu);
+	return 0;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
+ * @net: network namespace
+ * @iface: interface entry
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an IP address entry from the unlabeled connection hash table and
+ * release it with kfree_rcu().  Returns zero on success, -ENOENT if no
+ * matching entry was found.
+ */
+static int netlbl_unlhsh_remove_addr6(struct net *net,
+				      struct netlbl_unlhsh_iface *iface,
+				      const struct in6_addr *addr,
+				      const struct in6_addr *mask,
+				      struct netlbl_audit *audit_info)
+{
+	struct netlbl_af6list *list_entry;
+	struct netlbl_unlhsh_addr6 *entry;
+	struct audit_buffer *audit_buf;
+	struct net_device *dev;
+	char *secctx;
+	u32 secctx_len;
+
+	spin_lock(&netlbl_unlhsh_lock);
+	list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
+	spin_unlock(&netlbl_unlhsh_lock);
+	if (list_entry != NULL)
+		entry = netlbl_unlhsh_addr6_entry(list_entry);
+	else
+		entry = NULL;
+
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
+					      audit_info);
+	if (audit_buf != NULL) {
+		dev = dev_get_by_index(net, iface->ifindex);
+		netlbl_af6list_audit_addr(audit_buf, 1,
+					  (dev != NULL ? dev->name : NULL),
+					  addr, mask);
+		if (dev != NULL)
+			dev_put(dev);
+		if (entry != NULL &&
+		    security_secid_to_secctx(entry->secid,
+					     &secctx, &secctx_len) == 0) {
+			audit_log_format(audit_buf, " sec_obj=%s", secctx);
+			security_release_secctx(secctx, secctx_len);
+		}
+		audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	if (entry == NULL)
+		return -ENOENT;
+
+	kfree_rcu(entry, rcu);
+	return 0;
+}
+#endif /* IPv6 */
+
+/**
+ * netlbl_unlhsh_condremove_iface - Remove an interface entry
+ * @iface: the interface entry
+ *
+ * Description:
+ * Remove an interface entry from the unlabeled connection hash table if it is
+ * empty.  An interface entry is considered to be empty if there are no
+ * address entries assigned to it; the list walks below bail out on the first
+ * entry they find.  A removed entry is freed through call_rcu().
+ */
+static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
+{
+	struct netlbl_af4list *iter4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct netlbl_af6list *iter6;
+#endif /* IPv6 */
+
+	spin_lock(&netlbl_unlhsh_lock);
+	netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list)
+		goto unlhsh_condremove_failure;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list)
+		goto unlhsh_condremove_failure;
+#endif /* IPv6 */
+	iface->valid = 0;
+	if (iface->ifindex > 0)
+		list_del_rcu(&iface->list);
+	else
+		rcu_assign_pointer(netlbl_unlhsh_def, NULL);
+	spin_unlock(&netlbl_unlhsh_lock);
+
+	call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+	return;
+
+unlhsh_condremove_failure:
+	spin_unlock(&netlbl_unlhsh_lock);
+}
+
+/**
+ * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
+ * @net: network namespace
+ * @dev_name: interface name
+ * @addr: IP address in network byte order
+ * @mask: address mask in network byte order
+ * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an existing entry from the unlabeled connection hash table; a NULL
+ * @dev_name selects the default interface entry.  Returns zero on success,
+ * negative values on failure.
+ */
+int netlbl_unlhsh_remove(struct net *net,
+			 const char *dev_name,
+			 const void *addr,
+			 const void *mask,
+			 u32 addr_len,
+			 struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct net_device *dev;
+	struct netlbl_unlhsh_iface *iface;
+
+	if (addr_len != sizeof(struct in_addr) &&
+	    addr_len != sizeof(struct in6_addr))
+		return -EINVAL;
+
+	rcu_read_lock();
+	if (dev_name != NULL) {
+		dev = dev_get_by_name_rcu(net, dev_name);
+		if (dev == NULL) {
+			ret_val = -ENODEV;
+			goto unlhsh_remove_return;
+		}
+		iface = netlbl_unlhsh_search_iface(dev->ifindex);
+	} else
+		iface = rcu_dereference(netlbl_unlhsh_def);
+	if (iface == NULL) {
+		ret_val = -ENOENT;
+		goto unlhsh_remove_return;
+	}
+	switch (addr_len) {
+	case sizeof(struct in_addr):
+		ret_val = netlbl_unlhsh_remove_addr4(net,
+						     iface, addr, mask,
+						     audit_info);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case sizeof(struct in6_addr):
+		ret_val = netlbl_unlhsh_remove_addr6(net,
+						     iface, addr, mask,
+						     audit_info);
+		break;
+#endif /* IPv6 */
+	default:
+		ret_val = -EINVAL;
+	}
+	if (ret_val == 0) {
+		netlbl_unlhsh_condremove_iface(iface);
+		atomic_dec(&netlabel_mgmt_protocount);
+	}
+
+unlhsh_remove_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/*
+ * General Helper Functions
+ */
+
+/**
+ * netlbl_unlhsh_netdev_handler - Network device notification handler
+ * @this: notifier block
+ * @event: the event
+ * @ptr: the network device (cast to void)
+ *
+ * Description:
+ * Handle network device events; only NETDEV_DOWN in the init namespace is
+ * acted on, dropping the device's interface entry (and, via the RCU callback,
+ * its address entries) from the unlabeled connection hash table.
+ * NOTE(review): netlabel_mgmt_protocount is not adjusted here - confirm.
+ */
+static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
+					unsigned long event,
+					void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct netlbl_unlhsh_iface *iface = NULL;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	/* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
+	if (event == NETDEV_DOWN) {
+		spin_lock(&netlbl_unlhsh_lock);
+		iface = netlbl_unlhsh_search_iface(dev->ifindex);
+		if (iface != NULL && iface->valid) {
+			iface->valid = 0;
+			list_del_rcu(&iface->list);
+		} else
+			iface = NULL;
+		spin_unlock(&netlbl_unlhsh_lock);
+	}
+
+	if (iface != NULL)
+		call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
+
+	return NOTIFY_DONE;
+}
+
+/**
+ * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
+ * @value: desired value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Set the value of the unlabeled accept flag to @value and, when an audit
+ * buffer is available, log the new and previous values.
+ */
+static void netlbl_unlabel_acceptflg_set(u8 value,
+					 struct netlbl_audit *audit_info)
+{
+	struct audit_buffer *audit_buf;
+	u8 old_val;
+
+	old_val = netlabel_unlabel_acceptflg;
+	netlabel_unlabel_acceptflg = value;
+	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW,
+					      audit_info);
+	if (audit_buf != NULL) {
+		audit_log_format(audit_buf,
+				 " unlbl_accept=%u old=%u", value, old_val);
+		audit_log_end(audit_buf);
+	}
+}
+
+/**
+ * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
+ * @info: the Generic NETLINK info block
+ * @addr: the IP address
+ * @mask: the IP address mask
+ * @len: the address length
+ *
+ * Description:
+ * Examine the Generic NETLINK message and extract the IP address information
+ * from the first family (IPv4, then IPv6) which has both an address and a
+ * mask attribute.  Returns zero on success, negative values on failure.
+ */
+static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
+				       void **addr,
+				       void **mask,
+				       u32 *len)
+{
+	struct nlattr *addr_attr;
+	struct nlattr *mask_attr;
+	u32 addr_len;
+
+	if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
+	    info->attrs[NLBL_UNLABEL_A_IPV4MASK]) {
+		addr_attr = info->attrs[NLBL_UNLABEL_A_IPV4ADDR];
+		mask_attr = info->attrs[NLBL_UNLABEL_A_IPV4MASK];
+		addr_len = sizeof(struct in_addr);
+	} else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR] &&
+		   info->attrs[NLBL_UNLABEL_A_IPV6MASK]) {
+		addr_attr = info->attrs[NLBL_UNLABEL_A_IPV6ADDR];
+		mask_attr = info->attrs[NLBL_UNLABEL_A_IPV6MASK];
+		addr_len = sizeof(struct in6_addr);
+	} else
+		return -EINVAL;
+	/* the policy only caps NLA_BINARY lengths, so insist on exact sizes */
+	if (nla_len(addr_attr) != addr_len || nla_len(mask_attr) != addr_len)
+		return -EINVAL;
+	*len = addr_len;
+	*addr = nla_data(addr_attr);
+	*mask = nla_data(mask_attr);
+	return 0;
+}
+
+/*
+ * NetLabel Command Handlers
+ */
+
+/**
+ * netlbl_unlabel_accept - Handle an ACCEPT message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated ACCEPT message and set the accept flag accordingly.
+ * Only the values 0 and 1 are accepted.  Returns zero on success, negative
+ * values on failure.
+ */
+static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
+{
+	u8 value;
+	struct netlbl_audit audit_info;
+
+	if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) {
+		value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]);
+		if (value == 1 || value == 0) {
+			netlbl_netlink_auditinfo(skb, &audit_info);
+			netlbl_unlabel_acceptflg_set(value, &audit_info);
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * netlbl_unlabel_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond with the current status.
+ * Returns zero on success, negative values on failure (-ENOMEM if the reply
+ * buffer can not be allocated).
+ */
+static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
+{
+	int ret_val;
+	struct sk_buff *ans_skb;
+	void *data;
+
+	ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (ans_skb == NULL)
+		return -ENOMEM;
+	data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family,
+				 0, NLBL_UNLABEL_C_LIST);
+	if (data == NULL) {
+		ret_val = -ENOMEM;
+		goto list_failure;
+	}
+
+	ret_val = nla_put_u8(ans_skb,
+			     NLBL_UNLABEL_A_ACPTFLG,
+			     netlabel_unlabel_acceptflg);
+	if (ret_val != 0)
+		goto list_failure;
+
+	genlmsg_end(ans_skb, data);
+	return genlmsg_reply(ans_skb, info);
+
+list_failure:
+	kfree_skb(ans_skb);
+	return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticadd - Handle a STATICADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADD message and add a new unlabeled
+ * connection entry to the hash table.  The message must carry a security
+ * context, an interface name and exactly one complete address/mask pair.
+ * Returns zero on success, negative values on failure.
+ */
+static int netlbl_unlabel_staticadd(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	int ret_val;
+	char *dev_name;
+	void *addr;
+	void *mask;
+	u32 addr_len;
+	u32 secid;
+	struct netlbl_audit audit_info;
+
+	/* Don't allow users to add both IPv4 and IPv6 addresses for a
+	 * single entry.  However, allow users to create two entries, one each
+	 * for IPv4 and IPv6, with the same LSM security context which should
+	 * achieve the same result. */
+	if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+	    !info->attrs[NLBL_UNLABEL_A_IFACE] ||
+	    !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+	      (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+	if (ret_val != 0)
+		return ret_val;
+	dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+	ret_val = security_secctx_to_secid(
+		                  nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+				  nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+				  &secid);
+	if (ret_val != 0)
+		return ret_val;
+
+	return netlbl_unlhsh_add(&init_net,
+				 dev_name, addr, mask, addr_len, secid,
+				 &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICADDDEF message and add a new default
+ * unlabeled connection entry (netlbl_unlhsh_add() is called with a NULL
+ * interface name).  Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	int ret_val;
+	void *addr;
+	void *mask;
+	u32 addr_len;
+	u32 secid;
+	struct netlbl_audit audit_info;
+
+	/* Don't allow users to add both IPv4 and IPv6 addresses for a
+	 * single entry.  However, allow users to create two entries, one each
+	 * for IPv4 and IPv6, with the same LSM security context which should
+	 * achieve the same result. */
+	if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
+	    !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+	      (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+	if (ret_val != 0)
+		return ret_val;
+	ret_val = security_secctx_to_secid(
+		                  nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+				  nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
+				  &secid);
+	if (ret_val != 0)
+		return ret_val;
+
+	return netlbl_unlhsh_add(&init_net,
+				 NULL, addr, mask, addr_len, secid,
+				 &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVE message and remove the specified
+ * unlabeled connection entry.  The message must carry an interface name and
+ * exactly one complete address/mask pair (IPv4 or IPv6).  Returns zero on
+ * success, negative values on failure.
+ */
+static int netlbl_unlabel_staticremove(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	int ret_val;
+	char *dev_name;
+	void *addr;
+	void *mask;
+	u32 addr_len;
+	struct netlbl_audit audit_info;
+
+	/* See the note in netlbl_unlabel_staticadd() about not allowing both
+	 * IPv4 and IPv6 in the same entry. */
+	if (!info->attrs[NLBL_UNLABEL_A_IFACE] ||
+	    !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+	      (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+	if (ret_val != 0)
+		return ret_val;
+	dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);
+
+	return netlbl_unlhsh_remove(&init_net,
+				    dev_name, addr, mask, addr_len,
+				    &audit_info);
+}
+
+/**
+ * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated STATICREMOVEDEF message and remove the default
+ * unlabeled connection entry.  The message must carry exactly one complete
+ * address/mask pair (IPv4 or IPv6).  Returns zero on success, negative
+ * values on failure.
+ */
+static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	int ret_val;
+	void *addr;
+	void *mask;
+	u32 addr_len;
+	struct netlbl_audit audit_info;
+
+	/* See the note in netlbl_unlabel_staticadd() about not allowing both
+	 * IPv4 and IPv6 in the same entry. */
+	if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^
+	      (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
+	       !info->attrs[NLBL_UNLABEL_A_IPV6MASK])))
+		return -EINVAL;
+
+	netlbl_netlink_auditinfo(skb, &audit_info);
+
+	ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
+	if (ret_val != 0)
+		return ret_val;
+
+	return netlbl_unlhsh_remove(&init_net,
+				    NULL, addr, mask, addr_len,
+				    &audit_info);
+}
+
+
+/**
+ * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
+ * @cmd: command/message
+ * @iface: the interface entry
+ * @addr4: the IPv4 address entry
+ * @addr6: the IPv6 address entry
+ * @arg: the netlbl_unlhsh_walk_arg structure
+ *
+ * Description:
+ * Append one STATICLIST or STATICLISTDEF reply message to the skb held in
+ * @arg.  The caller supplies exactly one of @addr4 or @addr6 and sets the
+ * other to NULL.  The interface name attribute is only added when @iface has
+ * an ifindex greater than zero.  Returns the result of genlmsg_end() on
+ * success, negative values on failure (the partial message is cancelled).
+ *
+ */
+static int netlbl_unlabel_staticlist_gen(u32 cmd,
+				       const struct netlbl_unlhsh_iface *iface,
+				       const struct netlbl_unlhsh_addr4 *addr4,
+				       const struct netlbl_unlhsh_addr6 *addr6,
+				       void *arg)
+{
+	int ret_val = -ENOMEM;	/* result if genlmsg_put() fails */
+	struct netlbl_unlhsh_walk_arg *cb_arg = arg;
+	struct net_device *dev;
+	void *data;
+	u32 secid;
+	char *secctx;
+	u32 secctx_len;
+
+	data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid,
+			   cb_arg->seq, &netlbl_unlabel_gnl_family,
+			   NLM_F_MULTI, cmd);
+	if (data == NULL)
+		goto list_cb_failure;
+
+	if (iface->ifindex > 0) {
+		dev = dev_get_by_index(&init_net, iface->ifindex);
+		if (!dev) {
+			ret_val = -ENODEV;
+			goto list_cb_failure;
+		}
+		ret_val = nla_put_string(cb_arg->skb,
+					 NLBL_UNLABEL_A_IFACE, dev->name);
+		dev_put(dev);
+		if (ret_val != 0)
+			goto list_cb_failure;
+	}
+
+	if (addr4) {
+		struct in_addr addr_struct;
+
+		addr_struct.s_addr = addr4->list.addr;
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV4ADDR,
+				  sizeof(struct in_addr),
+				  &addr_struct);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		addr_struct.s_addr = addr4->list.mask;
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV4MASK,
+				  sizeof(struct in_addr),
+				  &addr_struct);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		secid = addr4->secid;
+	} else {	/* no @addr4: @addr6 is expected to be set */
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV6ADDR,
+				  sizeof(struct in6_addr),
+				  &addr6->list.addr);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		ret_val = nla_put(cb_arg->skb,
+				  NLBL_UNLABEL_A_IPV6MASK,
+				  sizeof(struct in6_addr),
+				  &addr6->list.mask);
+		if (ret_val != 0)
+			goto list_cb_failure;
+
+		secid = addr6->secid;
+	}
+
+	ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
+	if (ret_val != 0)
+		goto list_cb_failure;
+	ret_val = nla_put(cb_arg->skb,
+			  NLBL_UNLABEL_A_SECCTX,
+			  secctx_len,
+			  secctx);
+	security_release_secctx(secctx, secctx_len);
+	if (ret_val != 0)
+		goto list_cb_failure;
+
+	cb_arg->seq++;
+	return genlmsg_end(cb_arg->skb, data);
+
+list_cb_failure:
+	genlmsg_cancel(cb_arg->skb, data);
+	return ret_val;
+}
+
+/**
+ * netlbl_unlabel_staticlist - Handle a STATICLIST message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Dump the unlabeled connection hash table as STATICLIST messages.  If @skb
+ * fills up, the bucket, chain and per-interface address positions are kept
+ * in @cb->args[0..3] so the next call resumes there.  Returns @skb length.
+ *
+ */
+static int netlbl_unlabel_staticlist(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct netlbl_unlhsh_walk_arg cb_arg;
+	u32 skip_bkt = cb->args[0];
+	u32 skip_chain = cb->args[1];
+	u32 skip_addr4 = cb->args[2];
+	u32 iter_bkt;
+	u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
+	struct netlbl_unlhsh_iface *iface;
+	struct list_head *iter_list;
+	struct netlbl_af4list *addr4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u32 skip_addr6 = cb->args[3];
+	struct netlbl_af6list *addr6;
+#endif
+
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
+
+	rcu_read_lock();
+	for (iter_bkt = skip_bkt;
+	     iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
+	     iter_bkt++, iter_chain = 0, skip_chain = 0) {
+		iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
+		list_for_each_entry_rcu(iface, iter_list, list) {
+			if (!iface->valid || iter_chain++ < skip_chain)
+				continue;
+			netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
+				if (iter_addr4++ < skip_addr4)
+					continue;
+				if (netlbl_unlabel_staticlist_gen(
+					      NLBL_UNLABEL_C_STATICLIST, iface,
+					      netlbl_unlhsh_addr4_entry(addr4),
+					      NULL, &cb_arg) < 0) {
+					iter_addr4--;
+					iter_chain--;
+					goto unlabel_staticlist_return;
+				}
+			}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+			netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
+				if (iter_addr6++ < skip_addr6)
+					continue;
+				if (netlbl_unlabel_staticlist_gen(
+					      NLBL_UNLABEL_C_STATICLIST, iface,
+					      NULL,
+					      netlbl_unlhsh_addr6_entry(addr6),
+					      &cb_arg) < 0) {
+					iter_addr6--;
+					iter_chain--;
+					goto unlabel_staticlist_return;
+				}
+			}
+			iter_addr6 = 0;
+			skip_addr6 = 0;
+#endif /* IPv6 */
+			/* The address counters are per interface and the
+			 * skip offsets only apply to the resumed one. */
+			iter_addr4 = 0;
+			skip_addr4 = 0;
+		}
+	}
+
+unlabel_staticlist_return:
+	rcu_read_unlock();
+	cb->args[0] = iter_bkt;
+	cb->args[1] = iter_chain;
+	cb->args[2] = iter_addr4;
+	cb->args[3] = iter_addr6;
+	return skb->len;
+}
+
+/**
+ * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Dump the default unlabeled connection entry as STATICLISTDEF messages.  The
+ * number of IPv4/IPv6 addresses already sent is kept in @cb->args[0..1] so a
+ * later call continues where @skb filled up.  Returns the length of @skb.
+ *
+ */
+static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
+					struct netlink_callback *cb)
+{
+	struct netlbl_unlhsh_walk_arg cb_arg;
+	struct netlbl_unlhsh_iface *iface;
+	u32 skip_addr4 = cb->args[0];
+	u32 iter_addr4 = 0;
+	u32 iter_addr6 = 0;
+	struct netlbl_af4list *addr4;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u32 skip_addr6 = cb->args[1];
+	struct netlbl_af6list *addr6;
+#endif
+
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
+
+	rcu_read_lock();
+	iface = rcu_dereference(netlbl_unlhsh_def);
+	if (iface == NULL || !iface->valid)
+		goto unlabel_staticlistdef_return;
+
+	netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
+		if (iter_addr4++ < skip_addr4)
+			continue;
+		if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+					      iface,
+					      netlbl_unlhsh_addr4_entry(addr4),
+					      NULL, &cb_arg) < 0) {
+			iter_addr4--;
+			goto unlabel_staticlistdef_return;
+		}
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
+		if (iter_addr6++ < skip_addr6)
+			continue;
+		if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
+					      iface,
+					      NULL,
+					      netlbl_unlhsh_addr6_entry(addr6),
+					      &cb_arg) < 0) {
+			iter_addr6--;
+			goto unlabel_staticlistdef_return;
+		}
+	}
+#endif /* IPv6 */
+
+unlabel_staticlistdef_return:
+	rcu_read_unlock();
+	/* save our position so the next dump pass resumes, or ends, here */
+	cb->args[0] = iter_addr4;
+	cb->args[1] = iter_addr6;
+	return skb->len;
+}
+
+/*
+ * NetLabel Generic NETLINK Command Definitions
+ */
+
+static struct genl_ops netlbl_unlabel_genl_ops[] = {
+	{
+	.cmd = NLBL_UNLABEL_C_STATICADD,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticadd,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_STATICREMOVE,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticremove,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_STATICLIST,
+	.flags = 0,	/* no GENL_ADMIN_PERM required */
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_unlabel_staticlist,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_STATICADDDEF,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticadddef,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_staticremovedef,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_STATICLISTDEF,
+	.flags = 0,	/* no GENL_ADMIN_PERM required */
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_unlabel_staticlistdef,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_ACCEPT,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_accept,
+	.dumpit = NULL,
+	},
+	{
+	.cmd = NLBL_UNLABEL_C_LIST,
+	.flags = 0,	/* no GENL_ADMIN_PERM required */
+	.policy = netlbl_unlabel_genl_policy,
+	.doit = netlbl_unlabel_list,
+	.dumpit = NULL,
+	},
+};
+
+/*
+ * NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component
+ *
+ * Description:
+ * Register the family and all netlbl_unlabel_genl_ops commands with Generic
+ * NETLINK in one call.  Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_unlabel_genl_init(void)
+{
+	return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
+		netlbl_unlabel_genl_ops, ARRAY_SIZE(netlbl_unlabel_genl_ops));
+}
+
+/*
+ * NetLabel KAPI Hooks
+ */
+
+static struct notifier_block netlbl_unlhsh_netdev_notifier = {
+	.notifier_call = netlbl_unlhsh_netdev_handler,
+};
+
+/**
+ * netlbl_unlabel_init - Initialize the unlabeled connection hash table
+ * @size: the number of bits to use for the hash buckets
+ *
+ * Description:
+ * Allocate a table of 2^@size empty buckets, publish it as netlbl_unlhsh and
+ * register the network device notifier.  Only the NetLabel subsystem itself
+ * should call this, and only during initialization.  Returns zero on
+ * success, -EINVAL for a zero @size and -ENOMEM if an allocation fails.
+ *
+ */
+int __init netlbl_unlabel_init(u32 size)
+{
+	struct netlbl_unlhsh_tbl *hsh;
+	u32 bkt;
+
+	if (!size)
+		return -EINVAL;
+
+	hsh = kmalloc(sizeof(*hsh), GFP_KERNEL);
+	if (!hsh)
+		return -ENOMEM;
+	hsh->size = 1 << size;
+	hsh->tbl = kcalloc(hsh->size, sizeof(struct list_head), GFP_KERNEL);
+	if (!hsh->tbl) {
+		kfree(hsh);
+		return -ENOMEM;
+	}
+	/* every bucket starts out as an empty list */
+	for (bkt = 0; bkt < hsh->size; bkt++)
+		INIT_LIST_HEAD(&hsh->tbl[bkt]);
+
+	/* publish the finished table */
+	rcu_read_lock();
+	spin_lock(&netlbl_unlhsh_lock);
+	rcu_assign_pointer(netlbl_unlhsh, hsh);
+	spin_unlock(&netlbl_unlhsh_lock);
+	rcu_read_unlock();
+
+	register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);
+
+	return 0;
+}
+
+/**
+ * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
+ * @skb: the packet
+ * @family: protocol family
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Determine the security attributes, if any, for an unlabeled packet and put
+ * them in @secattr.  Returns zero on success and negative values on failure.
+ *
+ */
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+			   u16 family,
+			   struct netlbl_lsm_secattr *secattr)
+{
+	struct netlbl_unlhsh_iface *iface;
+
+	rcu_read_lock();
+	iface = netlbl_unlhsh_search_iface(skb->skb_iif);
+	if (iface == NULL)	/* no per-interface entry: use default */
+		iface = rcu_dereference(netlbl_unlhsh_def);
+	if (iface == NULL || !iface->valid)
+		goto unlabel_getattr_nolabel;
+	switch (family) {
+	case PF_INET: {
+		struct iphdr *hdr4;
+		struct netlbl_af4list *addr4;
+
+		hdr4 = ip_hdr(skb);
+		addr4 = netlbl_af4list_search(hdr4->saddr,
+					      &iface->addr4_list);
+		if (addr4 == NULL)
+			goto unlabel_getattr_nolabel;
+		secattr->attr.secid = netlbl_unlhsh_addr4_entry(addr4)->secid;
+		break;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case PF_INET6: {
+		struct ipv6hdr *hdr6;
+		struct netlbl_af6list *addr6;
+
+		hdr6 = ipv6_hdr(skb);
+		addr6 = netlbl_af6list_search(&hdr6->saddr,
+					      &iface->addr6_list);
+		if (addr6 == NULL)
+			goto unlabel_getattr_nolabel;
+		secattr->attr.secid = netlbl_unlhsh_addr6_entry(addr6)->secid;
+		break;
+	}
+#endif /* IPv6 */
+	default:
+		goto unlabel_getattr_nolabel;
+	}
+	rcu_read_unlock();
+
+	secattr->flags |= NETLBL_SECATTR_SECID;
+	secattr->type = NETLBL_NLTYPE_UNLABELED;
+	return 0;
+
+unlabel_getattr_nolabel:
+	rcu_read_unlock();
+	if (netlabel_unlabel_acceptflg == 0)	/* no match and flag clear */
+		return -ENOMSG;
+	secattr->type = NETLBL_NLTYPE_UNLABELED;
+	return 0;
+}
+
+/**
+ * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets
+ *
+ * Description:
+ * Set the default NetLabel configuration to allow incoming unlabeled packets
+ * and to send unlabeled network traffic by default.  Returns zero on success,
+ * -ENOMEM or the netlbl_domhsh_add_default() error code on failure.
+ */
+int __init netlbl_unlabel_defconf(void)
+{
+	int ret_val;
+	struct netlbl_dom_map *entry;
+	struct netlbl_audit audit_info;
+
+	/* Only the kernel is allowed to call this function and the only time
+	 * it is called is at bootup before the audit subsystem is reporting
+	 * messages so don't worry too much about these values. */
+	security_task_getsecid(current, &audit_info.secid);
+	audit_info.loginuid = 0;
+	audit_info.sessionid = 0;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+	entry->type = NETLBL_NLTYPE_UNLABELED;
+	ret_val = netlbl_domhsh_add_default(entry, &audit_info);
+	if (ret_val != 0) {
+		kfree(entry);	/* a failed add leaves the entry with us */
+		return ret_val;
+	}
+	netlbl_unlabel_acceptflg_set(1, &audit_info);
+	return 0;
+}
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
new file mode 100644
index 00000000..0bc8dc3f
--- /dev/null
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -0,0 +1,246 @@
+/*
+ * NetLabel Unlabeled Support
+ *
+ * This file defines functions for dealing with unlabeled packets for the
+ * NetLabel system.  The NetLabel system manages static and dynamic label
+ * mappings for network protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_UNLABELED_H
+#define _NETLABEL_UNLABELED_H
+
+#include <net/netlabel.h>
+
+/*
+ * The following NetLabel payloads are supported by the Unlabeled subsystem.
+ *
+ * o STATICADD
+ *   This message is sent from an application to add a new static label for
+ *   incoming unlabeled connections.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVE
+ *   This message is sent from an application to remove an existing static
+ *   label for incoming unlabeled connections.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLIST
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated STATICLIST message.  When sent by an
+ *   application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with a series of the following messages.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_IFACE
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICADDDEF
+ *   This message is sent from an application to set the default static
+ *   label for incoming unlabeled connections.
+ *
+ *   Required attribute:
+ *
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICREMOVEDEF
+ *   This message is sent from an application to remove the existing default
+ *   static label for incoming unlabeled connections.
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o STATICLISTDEF
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated STATICLISTDEF message.  When sent by
+ *   an application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with the following message.
+ *
+ *   Required attribute:
+ *
+ *     NLBL_UNLABEL_A_SECCTX
+ *
+ *   If IPv4 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV4ADDR
+ *     NLBL_UNLABEL_A_IPV4MASK
+ *
+ *   If IPv6 is specified the following attributes are required:
+ *
+ *     NLBL_UNLABEL_A_IPV6ADDR
+ *     NLBL_UNLABEL_A_IPV6MASK
+ *
+ * o ACCEPT
+ *   This message is sent from an application to specify if the kernel should
+ *   allow unlabeled packets to pass if they do not match any of the static
+ *   mappings defined in the unlabeled module.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_ACPTFLG
+ *
+ * o LIST
+ *   This message can be sent either from an application or by the kernel in
+ *   response to an application generated LIST message.  When sent by an
+ *   application there is no payload.  The kernel should respond to a LIST
+ *   message with a LIST message on success.
+ *
+ *   Required attributes:
+ *
+ *     NLBL_UNLABEL_A_ACPTFLG
+ *
+ */
+
+/* NetLabel Unlabeled commands, described in the payload notes above */
+enum {
+	NLBL_UNLABEL_C_UNSPEC,
+	NLBL_UNLABEL_C_ACCEPT,
+	NLBL_UNLABEL_C_LIST,
+	NLBL_UNLABEL_C_STATICADD,
+	NLBL_UNLABEL_C_STATICREMOVE,
+	NLBL_UNLABEL_C_STATICLIST,
+	NLBL_UNLABEL_C_STATICADDDEF,
+	NLBL_UNLABEL_C_STATICREMOVEDEF,
+	NLBL_UNLABEL_C_STATICLISTDEF,
+	__NLBL_UNLABEL_C_MAX,
+};
+
+/* NetLabel Unlabeled attributes */
+enum {
+	NLBL_UNLABEL_A_UNSPEC,
+	NLBL_UNLABEL_A_ACPTFLG,
+	/* (NLA_U8)
+	 * if true then unlabeled packets are allowed to pass, else unlabeled
+	 * packets are rejected */
+	NLBL_UNLABEL_A_IPV6ADDR,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address */
+	NLBL_UNLABEL_A_IPV6MASK,
+	/* (NLA_BINARY, struct in6_addr)
+	 * an IPv6 address mask */
+	NLBL_UNLABEL_A_IPV4ADDR,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address */
+	NLBL_UNLABEL_A_IPV4MASK,
+	/* (NLA_BINARY, struct in_addr)
+	 * an IPv4 address mask */
+	NLBL_UNLABEL_A_IFACE,
+	/* (NLA_NULL_STRING)
+	 * network interface */
+	NLBL_UNLABEL_A_SECCTX,
+	/* (NLA_BINARY)
+	 * a LSM specific security context */
+	__NLBL_UNLABEL_A_MAX,
+};
+#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
+
+/* NetLabel protocol functions */
+int netlbl_unlabel_genl_init(void);
+
+/* Unlabeled connection hash table size */
+/* XXX - currently this number is an uneducated guess */
+#define NETLBL_UNLHSH_BITSIZE       7
+
+/* General Unlabeled init function */
+int netlbl_unlabel_init(u32 size);
+
+/* Static/Fallback label management functions */
+int netlbl_unlhsh_add(struct net *net,
+		      const char *dev_name,
+		      const void *addr,
+		      const void *mask,
+		      u32 addr_len,
+		      u32 secid,
+		      struct netlbl_audit *audit_info);
+int netlbl_unlhsh_remove(struct net *net,
+			 const char *dev_name,
+			 const void *addr,
+			 const void *mask,
+			 u32 addr_len,
+			 struct netlbl_audit *audit_info);
+
+/* Process Unlabeled incoming network packets */
+int netlbl_unlabel_getattr(const struct sk_buff *skb,
+			   u16 family,
+			   struct netlbl_lsm_secattr *secattr);
+
+/* Set the default configuration to allow Unlabeled packets */
+int netlbl_unlabel_defconf(void);
+
+#endif
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
new file mode 100644
index 00000000..a3fd75ac
--- /dev/null
+++ b/net/netlabel/netlabel_user.c
@@ -0,0 +1,124 @@
+/*
+ * NetLabel NETLINK Interface
+ *
+ * This file defines the NETLINK interface for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/socket.h>
+#include <linux/audit.h>
+#include <linux/tty.h>
+#include <linux/security.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <asm/bug.h>
+
+#include "netlabel_mgmt.h"
+#include "netlabel_unlabeled.h"
+#include "netlabel_cipso_v4.h"
+#include "netlabel_user.h"
+
+/*
+ * NetLabel NETLINK Setup Functions
+ */
+
+/**
+ * netlbl_netlink_init - Initialize the NETLINK communication channel
+ *
+ * Description:
+ * Let the management, CIPSOv4 and unlabeled components register, in that
+ * order, with the Generic NETLINK mechanism, stopping at the first failure.
+ * Returns zero on success and non-zero on failure.
+ *
+ */
+int __init netlbl_netlink_init(void)
+{
+	int err;
+
+	err = netlbl_mgmt_genl_init();
+	if (err)
+		return err;
+
+	err = netlbl_cipsov4_genl_init();
+	if (err)
+		return err;
+
+	/*
+	 * Last component: its status, zero or an error, is also the result
+	 * of the whole initialization.
+	 */
+	return netlbl_unlabel_genl_init();
+}
+
+/*
+ * NetLabel Audit Functions
+ */
+
+/**
+ * netlbl_audit_start_common - Start an audit message
+ * @type: audit message type
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Open an audit record of @type and write the fields shared by all NetLabel
+ * audit messages: login uid, session id and, when available, the subject
+ * label.  Returns the audit buffer, or NULL if auditing is off or it failed.
+ *
+ */
+struct audit_buffer *netlbl_audit_start_common(int type,
+					       struct netlbl_audit *audit_info)
+{
+	struct audit_buffer *ab;
+	u32 ctx_len;
+	char *ctx;
+
+	if (!audit_enabled)
+		return NULL;
+
+	ab = audit_log_start(current->audit_context, GFP_ATOMIC, type);
+	if (!ab)
+		return NULL;
+
+	audit_log_format(ab, "netlabel: auid=%u ses=%u",
+			 audit_info->loginuid, audit_info->sessionid);
+
+	/* the subject label is optional: skip it if unset or untranslatable */
+	if (!audit_info->secid)
+		return ab;
+	if (security_secid_to_secctx(audit_info->secid, &ctx, &ctx_len))
+		return ab;
+
+	audit_log_format(ab, " subj=%s", ctx);
+	security_release_secctx(ctx, ctx_len);
+
+	return ab;
+}
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
new file mode 100644
index 00000000..f4fc4c9a
--- /dev/null
+++ b/net/netlabel/netlabel_user.h
@@ -0,0 +1,66 @@
+/*
+ * NetLabel NETLINK Interface
+ *
+ * This file defines the NETLINK interface for the NetLabel system.  The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_USER_H
+#define _NETLABEL_USER_H
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/capability.h>
+#include <linux/audit.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+
+/* NetLabel NETLINK helper functions */
+
+/**
+ * netlbl_netlink_auditinfo - Fetch the audit information from a NETLINK msg
+ * @skb: the packet (not referenced; the values are taken from current)
+ * @audit_info: NetLabel audit information
+ */
+static inline void netlbl_netlink_auditinfo(struct sk_buff *skb,
+					    struct netlbl_audit *audit_info)
+{
+	security_task_getsecid(current, &audit_info->secid);
+	audit_info->loginuid = audit_get_loginuid(current);
+	audit_info->sessionid = audit_get_sessionid(current);
+}
+
+/* NetLabel NETLINK I/O functions */
+
+int netlbl_netlink_init(void);
+
+/* NetLabel Audit Functions */
+
+struct audit_buffer *netlbl_audit_start_common(int type,
+					      struct netlbl_audit *audit_info);
+
+#endif
diff --git a/net/netlink/Makefile b/net/netlink/Makefile
new file mode 100644
index 00000000..bdd6ddf4
--- /dev/null
+++ b/net/netlink/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the netlink driver.
+#
+
+obj-y  				:= af_netlink.o genetlink.o
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
new file mode 100644
index 00000000..24bc620b
--- /dev/null
+++ b/net/netlink/af_netlink.c
@@ -0,0 +1,2165 @@
+/*
+ * NETLINK      Kernel-user communication protocol.
+ *
+ * 		Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
+ * 				Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
+ *                               added netlink_proto_exit
+ * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
+ * 				 use nlk_sk, as sk->protinfo is on a diet 8)
+ * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
+ * 				 - inc module use count of module that owns
+ * 				   the kernel socket in case userspace opens
+ * 				   socket of same protocol
+ * 				 - remove all module support, since netlink is
+ * 				   mandatory if CONFIG_NET=y these days
+ */
+
+#include <linux/module.h>
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/security.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/scm.h>
+#include <net/netlink.h>
+
+#define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)	/* bytes for x group bits, rounded up to whole longs */
+#define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))	/* the same size counted in unsigned longs */
+
+struct netlink_sock {
+	/* struct sock has to be the first member of netlink_sock */
+	struct sock		sk;
+	u32			pid;		/* bound address; 0 until netlink_insert() succeeds */
+	u32			dst_pid;	/* default destination pid, set by netlink_connect() */
+	u32			dst_group;	/* default destination group, 1-based; 0 means none */
+	u32			flags;		/* NETLINK_* flag bits defined below */
+	u32			subscriptions;	/* non-zero while linked on the protocol's mc_list */
+	u32			ngroups;	/* number of valid bits in groups */
+	unsigned long		*groups;	/* multicast membership bitmap; may be NULL */
+	unsigned long		state;		/* bit 0: congested, set by netlink_overrun() */
+	wait_queue_head_t	wait;		/* senders blocked in netlink_attachskb() sleep here */
+	struct netlink_callback	*cb;		/* pending callback; released in netlink_sock_destruct() */
+	struct mutex		*cb_mutex;	/* caller-supplied mutex or &cb_def_mutex */
+	struct mutex		cb_def_mutex;	/* fallback used when no mutex is supplied */
+	void			(*netlink_rcv)(struct sk_buff *skb);	/* input handler called by netlink_unicast_kernel() */
+	struct module		*module;	/* reference taken in netlink_create(), dropped in netlink_release() */
+};
+
+struct listeners {
+	struct rcu_head		rcu;
+	unsigned long		masks[0];	/* one bit per group; rebuilt by netlink_update_listeners() */
+};
+
+#define NETLINK_KERNEL_SOCKET	0x1	/* tested by netlink_is_kernel() */
+#define NETLINK_RECV_PKTINFO	0x2	/* toggled by the NETLINK_PKTINFO socket option */
+#define NETLINK_BROADCAST_SEND_ERROR	0x4	/* failed broadcast delivery to this socket is reported to the sender */
+#define NETLINK_RECV_NO_ENOBUFS	0x8	/* netlink_overrun() does not raise ENOBUFS on this socket */
+
+static inline struct netlink_sock *nlk_sk(struct sock *sk)	/* map a struct sock to its enclosing netlink_sock */
+{
+	return container_of(sk, struct netlink_sock, sk);
+}
+
+static inline int netlink_is_kernel(struct sock *sk)	/* non-zero if sk carries NETLINK_KERNEL_SOCKET */
+{
+	return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
+}
+
+struct nl_pid_hash {
+	struct hlist_head *table;	/* bucket array holding mask + 1 list heads */
+	unsigned long rehash_time;	/* jiffies after which a same-size rehash is allowed */
+
+	unsigned int mask;		/* ANDed with the jhash value to pick a bucket */
+	unsigned int shift;		/* incremented each time the table doubles */
+
+	unsigned int entries;		/* sockets currently in the table */
+	unsigned int max_shift;		/* growth stops once shift would exceed this */
+
+	u32 rnd;			/* jhash seed, re-randomised on every rehash */
+};
+
+struct netlink_table {
+	struct nl_pid_hash hash;		/* bound sockets of this protocol, keyed by pid */
+	struct hlist_head mc_list;		/* sockets with at least one group subscription */
+	struct listeners __rcu *listeners;	/* OR of the group bitmaps of all mc_list sockets */
+	unsigned int nl_nonroot;		/* NL_NONROOT_* bits consulted by netlink_capable() */
+	unsigned int groups;			/* number of multicast groups of this protocol */
+	struct mutex *cb_mutex;			/* handed to __netlink_create() for new sockets */
+	struct module *module;			/* owner pinned by netlink_create() */
+	int registered;				/* decremented when a kernel socket is released */
+};
+
+static struct netlink_table *nl_table;	/* indexed by netlink protocol number */
+
+static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);	/* netlink_table_grab() sleeps here */
+
+static int netlink_dump(struct sock *sk);
+static void netlink_destroy_callback(struct netlink_callback *cb);
+
+static DEFINE_RWLOCK(nl_table_lock);
+static atomic_t nl_table_users = ATOMIC_INIT(0);	/* active netlink_lock_table() holders */
+
+static ATOMIC_NOTIFIER_HEAD(netlink_chain);	/* called with NETLINK_URELEASE from netlink_release() */
+
+static u32 netlink_group_mask(u32 group)	/* 1-based group number -> single-bit mask; 0 -> no bits */
+{
+	return group ? 1U << (group - 1) : 0;	/* unsigned constant: group 32 must not shift into the sign bit */
+}
+
+static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)	/* bucket for pid under the current seed and mask */
+{
+	return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
+}
+
+static void netlink_sock_destruct(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_callback *cb = nlk->cb;
+
+	/* Finish and release a callback that is still attached. */
+	if (cb) {
+		if (cb->done)
+			cb->done(cb);
+		netlink_destroy_callback(cb);
+	}
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
+		return;
+	}
+	/* A dead socket must own no buffer memory and no group bitmap. */
+	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+	WARN_ON(nlk->groups);
+}
+
+/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
+ * SMP. Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines.
+ */
+
+void netlink_table_grab(void)
+	__acquires(nl_table_lock)
+{
+	might_sleep();
+
+	write_lock_irq(&nl_table_lock);
+
+	if (atomic_read(&nl_table_users)) {	/* netlink_lock_table() holders still active */
+		DECLARE_WAITQUEUE(wait, current);
+
+		add_wait_queue_exclusive(&nl_table_wait, &wait);
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&nl_table_users) == 0)
+				break;	/* leaves the loop with the write lock held */
+			write_unlock_irq(&nl_table_lock);	/* drop the lock before sleeping */
+			schedule();
+			write_lock_irq(&nl_table_lock);
+		}
+
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&nl_table_wait, &wait);
+	}
+}
+
+void netlink_table_ungrab(void)
+	__releases(nl_table_lock)
+{
+	write_unlock_irq(&nl_table_lock);
+	wake_up(&nl_table_wait);	/* let another waiter in netlink_table_grab() proceed */
+}
+
+static inline void
+netlink_lock_table(void)
+{
+	/* read_lock() synchronizes us to netlink_table_grab */
+
+	read_lock(&nl_table_lock);
+	atomic_inc(&nl_table_users);	/* netlink_table_grab() waits until this drops back to zero */
+	read_unlock(&nl_table_lock);
+}
+
+static inline void
+netlink_unlock_table(void)
+{
+	if (atomic_dec_and_test(&nl_table_users))	/* last holder gone */
+		wake_up(&nl_table_wait);	/* wake a writer sleeping in netlink_table_grab() */
+}
+
+static inline struct sock *netlink_lookup(struct net *net, int protocol,
+					  u32 pid)
+{
+	struct nl_pid_hash *hash = &nl_table[protocol].hash;
+	struct hlist_node *pos;
+	struct sock *cur, *match = NULL;
+
+	/* Walk the pid's bucket under the table read lock. */
+	read_lock(&nl_table_lock);
+	sk_for_each(cur, pos, nl_pid_hashfn(hash, pid)) {
+		if (nlk_sk(cur)->pid != pid || !net_eq(sock_net(cur), net))
+			continue;
+		/* Hand the caller its own reference. */
+		sock_hold(cur);
+		match = cur;
+		break;
+	}
+
+	read_unlock(&nl_table_lock);
+	return match;
+}
+
+static inline struct hlist_head *nl_pid_hash_zalloc(size_t size)
+{
+	/* Tables larger than one page come from the page allocator. */
+	if (size > PAGE_SIZE)
+		return (struct hlist_head *)
+			__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+					 get_order(size));
+	return kzalloc(size, GFP_ATOMIC);
+}
+
+static inline void nl_pid_hash_free(struct hlist_head *table, size_t size)
+{
+	if (size > PAGE_SIZE)	/* same size split as nl_pid_hash_zalloc() */
+		free_pages((unsigned long)table, get_order(size));
+	else
+		kfree(table);
+}
+
+static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow)	/* returns 1 if the table was rebuilt, 0 if not */
+{
+	unsigned int omask, mask, shift;
+	size_t osize, size;
+	struct hlist_head *otable, *table;
+	int i;
+
+	omask = mask = hash->mask;
+	osize = size = (mask + 1) * sizeof(*table);
+	shift = hash->shift;
+
+	if (grow) {
+		if (++shift > hash->max_shift)
+			return 0;	/* already at the maximum size */
+		mask = mask * 2 + 1;	/* double the number of buckets */
+		size *= 2;
+	}
+
+	table = nl_pid_hash_zalloc(size);
+	if (!table)
+		return 0;	/* keep using the old table */
+
+	otable = hash->table;
+	hash->table = table;
+	hash->mask = mask;
+	hash->shift = shift;
+	get_random_bytes(&hash->rnd, sizeof(hash->rnd));	/* fresh hash seed for the new table */
+
+	for (i = 0; i <= omask; i++) {	/* move every socket from the old buckets */
+		struct sock *sk;
+		struct hlist_node *node, *tmp;
+
+		sk_for_each_safe(sk, node, tmp, &otable[i])
+			__sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid));
+	}
+
+	nl_pid_hash_free(otable, osize);
+	hash->rehash_time = jiffies + 10 * 60 * HZ;	/* next same-size rehash no sooner than 10 minutes from now */
+	return 1;
+}
+
+static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)	/* returns 1 if the caller must recompute its bucket */
+{
+	int avg = hash->entries >> hash->shift;
+
+	if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1))	/* table is crowded: try to grow it */
+		return 1;
+
+	if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) {	/* this chain is longer than average: reseed */
+		nl_pid_hash_rehash(hash, 0);
+		return 1;	/* returned even if the rehash itself did nothing */
+	}
+
+	return 0;
+}
+
+static const struct proto_ops netlink_ops;
+
+static void
+netlink_update_listeners(struct sock *sk)
+{
+	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+	struct hlist_node *node;
+	struct sock *member;
+	unsigned int i;
+
+	for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
+		unsigned long mask = 0;
+		sk_for_each_bound(member, node, &tbl->mc_list) {
+			if (i < NLGRPLONGS(nlk_sk(member)->ngroups))
+				mask |= nlk_sk(member)->groups[i];
+		}
+		tbl->listeners->masks[i] = mask;
+	}
+	/* this function is only called with the netlink table "grabbed", which
+	 * makes sure updates are visible before bind or setsockopt return. */
+}
+
+static int netlink_insert(struct sock *sk, struct net *net, u32 pid)
+{
+	struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
+	struct hlist_head *head;
+	int err = -EADDRINUSE;	/* returned when pid is already bound in this net */
+	struct sock *osk;
+	struct hlist_node *node;
+	int len;
+
+	netlink_table_grab();
+	head = nl_pid_hashfn(hash, pid);
+	len = 0;	/* sockets walked in this bucket */
+	sk_for_each(osk, node, head) {
+		if (net_eq(sock_net(osk), net) && (nlk_sk(osk)->pid == pid))
+			break;
+		len++;
+	}
+	if (node)	/* loop was left via break: pid is taken */
+		goto err;
+
+	err = -EBUSY;
+	if (nlk_sk(sk)->pid)	/* this socket is already bound */
+		goto err;
+
+	err = -ENOMEM;
+	if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX))
+		goto err;
+
+	if (len && nl_pid_hash_dilute(hash, len))
+		head = nl_pid_hashfn(hash, pid);	/* table may have been rebuilt: look the bucket up again */
+	hash->entries++;
+	nlk_sk(sk)->pid = pid;
+	sk_add_node(sk, head);
+	err = 0;
+
+err:
+	netlink_table_ungrab();
+	return err;
+}
+
+static void netlink_remove(struct sock *sk)
+{
+	netlink_table_grab();
+	if (sk_del_node_init(sk))	/* was hashed: keep the entry count in step */
+		nl_table[sk->sk_protocol].hash.entries--;
+	if (nlk_sk(sk)->subscriptions)	/* non-zero means it is linked on mc_list */
+		__sk_del_bind_node(sk);
+	netlink_table_ungrab();
+}
+
+static struct proto netlink_proto = {	/* passed to sk_alloc() in __netlink_create() */
+	.name	  = "NETLINK",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct netlink_sock),
+};
+
+static int __netlink_create(struct net *net, struct socket *sock,
+			    struct mutex *cb_mutex, int protocol)
+{
+	struct netlink_sock *nlk;
+	struct sock *sk;
+
+	sock->ops = &netlink_ops;
+
+	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	nlk = nlk_sk(sk);
+	if (!cb_mutex) {
+		/* No shared mutex supplied: use the socket's private one. */
+		cb_mutex = &nlk->cb_def_mutex;
+		mutex_init(cb_mutex);
+	}
+	nlk->cb_mutex = cb_mutex;
+	init_waitqueue_head(&nlk->wait);
+
+	sk->sk_protocol = protocol;
+	sk->sk_destruct = netlink_sock_destruct;
+	return 0;
+}
+
+static int netlink_create(struct net *net, struct socket *sock, int protocol,
+			  int kern)
+{
+	struct module *module = NULL;
+	struct mutex *cb_mutex;
+	struct netlink_sock *nlk;
+	int err = 0;
+
+	sock->state = SS_UNCONNECTED;
+
+	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol < 0 || protocol >= MAX_LINKS)	/* protocol indexes nl_table below */
+		return -EPROTONOSUPPORT;
+
+	netlink_lock_table();
+#ifdef CONFIG_MODULES
+	if (!nl_table[protocol].registered) {
+		netlink_unlock_table();	/* not held across request_module() */
+		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
+		netlink_lock_table();
+	}
+#endif
+	if (nl_table[protocol].registered &&
+	    try_module_get(nl_table[protocol].module))
+		module = nl_table[protocol].module;	/* reference is dropped in netlink_release() */
+	else
+		err = -EPROTONOSUPPORT;
+	cb_mutex = nl_table[protocol].cb_mutex;
+	netlink_unlock_table();
+
+	if (err < 0)
+		goto out;
+
+	err = __netlink_create(net, sock, cb_mutex, protocol);
+	if (err < 0)
+		goto out_module;
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, &netlink_proto, 1);
+	local_bh_enable();
+
+	nlk = nlk_sk(sock->sk);
+	nlk->module = module;
+out:
+	return err;
+
+out_module:
+	module_put(module);	/* undo try_module_get() when socket creation failed */
+	goto out;
+}
+
+static int netlink_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk;
+
+	if (!sk)
+		return 0;
+
+	netlink_remove(sk);
+	sock_orphan(sk);
+	nlk = nlk_sk(sk);
+
+	/*
+	 * OK. Socket is unlinked, any packets that arrive now
+	 * will be purged.
+	 */
+
+	sock->sk = NULL;
+	wake_up_interruptible_all(&nlk->wait);	/* release senders sleeping in netlink_attachskb() */
+
+	skb_queue_purge(&sk->sk_write_queue);
+
+	if (nlk->pid) {	/* only bound sockets are announced */
+		struct netlink_notify n = {
+						.net = sock_net(sk),
+						.protocol = sk->sk_protocol,
+						.pid = nlk->pid,
+					  };
+		atomic_notifier_call_chain(&netlink_chain,
+				NETLINK_URELEASE, &n);
+	}
+
+	module_put(nlk->module);	/* pairs with try_module_get() in netlink_create() */
+
+	netlink_table_grab();
+	if (netlink_is_kernel(sk)) {
+		BUG_ON(nl_table[sk->sk_protocol].registered == 0);
+		if (--nl_table[sk->sk_protocol].registered == 0) {
+			kfree(nl_table[sk->sk_protocol].listeners);	/* NOTE(review): freed at once and the pointer is left dangling, yet netlink_has_listeners() reads it under RCU only - confirm lifetime */
+			nl_table[sk->sk_protocol].module = NULL;
+			nl_table[sk->sk_protocol].registered = 0;	/* already 0 after the decrement above */
+		}
+	} else if (nlk->subscriptions)
+		netlink_update_listeners(sk);	/* socket left mc_list in netlink_remove(): rebuild the masks */
+	netlink_table_ungrab();
+
+	kfree(nlk->groups);
+	nlk->groups = NULL;	/* netlink_sock_destruct() warns if this is still set */
+
+	local_bh_disable();
+	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
+	local_bh_enable();
+	sock_put(sk);
+	return 0;
+}
+
+static int netlink_autobind(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
+	struct hlist_head *head;
+	struct sock *osk;
+	struct hlist_node *node;
+	s32 pid = task_tgid_vnr(current);	/* first choice: the caller's thread-group id */
+	int err;
+	static s32 rover = -4097;	/* next negative candidate; static, so shared by all callers */
+
+retry:
+	cond_resched();
+	netlink_table_grab();
+	head = nl_pid_hashfn(hash, pid);
+	sk_for_each(osk, node, head) {
+		if (!net_eq(sock_net(osk), net))
+			continue;
+		if (nlk_sk(osk)->pid == pid) {
+			/* Bind collision, search negative pid values. */
+			pid = rover--;
+			if (rover > -4097)	/* rover only decreases, so this is true only after it wraps */
+				rover = -4097;
+			netlink_table_ungrab();
+			goto retry;
+		}
+	}
+	netlink_table_ungrab();
+
+	err = netlink_insert(sk, net, pid);	/* re-checks pid itself; the table was released above */
+	if (err == -EADDRINUSE)
+		goto retry;
+
+	/* If 2 threads race to autobind, that is fine.  */
+	if (err == -EBUSY)
+		err = 0;
+
+	return err;
+}
+
+static inline int netlink_capable(struct socket *sock, unsigned int flag)	/* flag: an NL_NONROOT_* bit */
+{
+	return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) ||	/* protocol allows it without privilege */
+	       capable(CAP_NET_ADMIN);
+}
+
+static void
+netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	if (!nlk->subscriptions && subscriptions)	/* list membership changes only on 0 <-> non-zero */
+		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+	else if (nlk->subscriptions && !subscriptions)
+		__sk_del_bind_node(sk);
+	nlk->subscriptions = subscriptions;
+}
+
+static int netlink_realloc_groups(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int groups;
+	unsigned long *new_groups;
+	int err = 0;
+
+	netlink_table_grab();
+
+	groups = nl_table[sk->sk_protocol].groups;
+	if (!nl_table[sk->sk_protocol].registered) {
+		err = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (nlk->ngroups >= groups)	/* bitmap is already large enough */
+		goto out_unlock;
+
+	new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
+	if (new_groups == NULL) {
+		err = -ENOMEM;	/* nlk->groups and nlk->ngroups are left untouched */
+		goto out_unlock;
+	}
+	memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,	/* zero only the newly added tail */
+	       NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));
+
+	nlk->groups = new_groups;
+	nlk->ngroups = groups;
+ out_unlock:
+	netlink_table_ungrab();
+	return err;
+}
+
+static int netlink_bind(struct socket *sock, struct sockaddr *addr,
+			int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
+	int err;
+
+	/* Reject short addresses before reading any sockaddr_nl field. */
+	if (addr_len < (int)sizeof(struct sockaddr_nl) ||
+	    nladdr->nl_family != AF_NETLINK)
+		return -EINVAL;
+
+	/* Only superuser is allowed to listen multicasts */
+	if (nladdr->nl_groups) {
+		if (!netlink_capable(sock, NL_NONROOT_RECV))
+			return -EPERM;
+		err = netlink_realloc_groups(sk);
+		if (err)
+			return err;
+	}
+
+	if (nlk->pid) {
+		if (nladdr->nl_pid != nlk->pid)	/* an existing binding cannot be changed */
+			return -EINVAL;
+	} else {
+		err = nladdr->nl_pid ?
+			netlink_insert(sk, net, nladdr->nl_pid) :
+			netlink_autobind(sock);
+		if (err)
+			return err;
+	}
+	if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
+		return 0;	/* nothing requested and nothing subscribed in the first 32 groups */
+
+	netlink_table_grab();
+	netlink_update_subscriptions(sk, nlk->subscriptions +
+					 hweight32(nladdr->nl_groups) -
+					 hweight32(nlk->groups[0]));
+	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
+	netlink_update_listeners(sk);
+	netlink_table_ungrab();
+	return 0;
+}
+
+static int netlink_connect(struct socket *sock, struct sockaddr *addr,
+			   int alen, int flags)
+{
+	int err = 0;
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
+
+	if (alen < sizeof(addr->sa_family))
+		return -EINVAL;
+
+	if (addr->sa_family == AF_UNSPEC) {	/* AF_UNSPEC disconnects the socket */
+		sk->sk_state	= NETLINK_UNCONNECTED;
+		nlk->dst_pid	= 0;
+		nlk->dst_group	= 0;
+		return 0;
+	}
+	/* nl_pid/nl_groups are read below: need a complete sockaddr_nl */
+	if (addr->sa_family != AF_NETLINK || alen < (int)sizeof(*nladdr))
+		return -EINVAL;
+
+	/* Only superuser is allowed to send multicasts */
+	if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND))
+		return -EPERM;
+
+	if (!nlk->pid)
+		err = netlink_autobind(sock);
+	if (err == 0) {
+		sk->sk_state	= NETLINK_CONNECTED;
+		nlk->dst_pid	= nladdr->nl_pid;
+		nlk->dst_group	= ffs(nladdr->nl_groups);	/* lowest set group, 1-based; 0 if none */
+	}
+
+	return err;
+}
+
+static int netlink_getname(struct socket *sock, struct sockaddr *addr,
+			   int *addr_len, int peer)
+{
+	struct netlink_sock *nlk = nlk_sk(sock->sk);
+	DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
+
+	/* peer != 0 reports the connected destination, else our own binding */
+	if (!peer) {
+		nladdr->nl_pid = nlk->pid;
+		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
+	} else {
+		nladdr->nl_pid = nlk->dst_pid;
+		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
+	}
+	nladdr->nl_family = AF_NETLINK;
+	nladdr->nl_pad = 0;
+
+	*addr_len = sizeof(*nladdr);
+	return 0;
+}
+
+static void netlink_overrun(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	/* Report ENOBUFS once (state bit 0 latches it) unless opted out. */
+	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS) &&
+	    !test_and_set_bit(0, &nlk->state)) {
+		sk->sk_err = ENOBUFS;
+		sk->sk_error_report(sk);
+	}
+	atomic_inc(&sk->sk_drops);
+}
+
+static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid)
+{
+	struct sock *sock;
+	struct netlink_sock *nlk;
+
+	sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, pid);	/* returns a held reference or NULL */
+	if (!sock)
+		return ERR_PTR(-ECONNREFUSED);
+
+	/* A connected socket only accepts messages from its connected peer */
+	nlk = nlk_sk(sock);
+	if (sock->sk_state == NETLINK_CONNECTED &&
+	    nlk->dst_pid != nlk_sk(ssk)->pid) {
+		sock_put(sock);
+		return ERR_PTR(-ECONNREFUSED);
+	}
+	return sock;
+}
+
+struct sock *netlink_getsockbyfilp(struct file *filp)	/* returns a held netlink sock or an ERR_PTR */
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct sock *sock;
+
+	if (!S_ISSOCK(inode->i_mode))
+		return ERR_PTR(-ENOTSOCK);
+
+	sock = SOCKET_I(inode)->sk;
+	if (sock->sk_family != AF_NETLINK)
+		return ERR_PTR(-EINVAL);
+
+	sock_hold(sock);	/* the caller owns this reference */
+	return sock;
+}
+
+/*
+ * Attach a skb to a netlink socket.
+ * The caller must hold a reference to the destination socket. On error, the
+ * reference is dropped. The skb is not sent to the destination; only the
+ * error checks are performed and memory in the queue is reserved.
+ * Return values:
+ * < 0: error. skb freed, reference to sock dropped.
+ * 0: continue
+ * 1: repeat lookup - reference dropped while waiting for socket memory.
+ */
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
+		      long *timeo, struct sock *ssk)
+{
+	struct netlink_sock *nlk;
+
+	nlk = nlk_sk(sk);
+
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	    test_bit(0, &nlk->state)) {	/* receive buffer over its limit, or congestion bit set */
+		DECLARE_WAITQUEUE(wait, current);
+		if (!*timeo) {	/* no time left to wait: fail now */
+			if (!ssk || netlink_is_kernel(ssk))
+				netlink_overrun(sk);
+			sock_put(sk);
+			kfree_skb(skb);
+			return -EAGAIN;
+		}
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&nlk->wait, &wait);
+
+		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+		     test_bit(0, &nlk->state)) &&
+		    !sock_flag(sk, SOCK_DEAD))	/* re-check after queueing ourselves on the wait queue */
+			*timeo = schedule_timeout(*timeo);
+
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&nlk->wait, &wait);
+		sock_put(sk);	/* reference is dropped on this path whatever the outcome */
+
+		if (signal_pending(current)) {
+			kfree_skb(skb);
+			return sock_intr_errno(*timeo);
+		}
+		return 1;
+	}
+	skb_set_owner_r(skb, sk);
+	return 0;
+}
+
+static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)	/* returns the length of the queued skb */
+{
+	int len = skb->len;	/* sampled before the skb is handed to the receive queue */
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_data_ready(sk, len);
+	return len;
+}
+
+int netlink_sendskb(struct sock *sk, struct sk_buff *skb)	/* queue skb on sk, then drop the caller's reference to sk */
+{
+	int len = __netlink_sendskb(sk, skb);
+
+	sock_put(sk);
+	return len;
+}
+
+void netlink_detachskb(struct sock *sk, struct sk_buff *skb)	/* free skb and drop the caller's reference to sk */
+{
+	kfree_skb(skb);
+	sock_put(sk);
+}
+
+static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
+					   gfp_t allocation)
+{
+	int delta;
+
+	skb_orphan(skb);
+
+	delta = skb->end - skb->tail;	/* unused space at the end of the buffer */
+	if (delta * 2 < skb->truesize)	/* slack is under half of truesize: leave the skb alone */
+		return skb;
+
+	if (skb_shared(skb)) {
+		struct sk_buff *nskb = skb_clone(skb, allocation);
+		if (!nskb)
+			return skb;	/* clone failed: return the skb untrimmed */
+		kfree_skb(skb);
+		skb = nskb;
+	}
+
+	if (!pskb_expand_head(skb, 0, -delta, allocation))	/* negative tail growth shrinks the buffer */
+		skb->truesize -= delta;
+
+	return skb;
+}
+
+static inline void netlink_rcv_wake(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	if (skb_queue_empty(&sk->sk_receive_queue))
+		clear_bit(0, &nlk->state);	/* queue drained: clear the congestion bit */
+	if (!test_bit(0, &nlk->state))
+		wake_up_interruptible(&nlk->wait);	/* senders sleeping in netlink_attachskb() */
+}
+
+static inline int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	ret = -ECONNREFUSED;	/* result when the socket has no input handler */
+	if (nlk->netlink_rcv != NULL) {
+		ret = skb->len;
+		skb_set_owner_r(skb, sk);
+		nlk->netlink_rcv(skb);	/* handler runs synchronously in the sender's context */
+	}
+	kfree_skb(skb);	/* skb and the sk reference are consumed on both paths */
+	sock_put(sk);
+	return ret;
+}
+
+int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
+		    u32 pid, int nonblock)
+{
+	struct sock *sk;
+	int err;
+	long timeo;
+
+	skb = netlink_trim(skb, gfp_any());
+
+	timeo = sock_sndtimeo(ssk, nonblock);
+retry:
+	sk = netlink_getsockbypid(ssk, pid);
+	if (IS_ERR(sk)) {
+		kfree_skb(skb);
+		return PTR_ERR(sk);
+	}
+	if (netlink_is_kernel(sk))
+		return netlink_unicast_kernel(sk, skb);
+
+	if (sk_filter(sk, skb)) {
+		err = skb->len;	/* a filtered message is dropped but reported as sent */
+		kfree_skb(skb);
+		sock_put(sk);
+		return err;
+	}
+
+	err = netlink_attachskb(sk, skb, &timeo, ssk);
+	if (err == 1)	/* reference was dropped while waiting: look the socket up again */
+		goto retry;
+	if (err)
+		return err;	/* netlink_attachskb() already freed skb and dropped sk */
+
+	return netlink_sendskb(sk, skb);
+}
+EXPORT_SYMBOL(netlink_unicast);
+
+int netlink_has_listeners(struct sock *sk, unsigned int group)
+{
+	int res = 0;
+	struct listeners *listeners;
+
+	BUG_ON(!netlink_is_kernel(sk));
+
+	rcu_read_lock();
+	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
+	/* group 0 wraps to UINT_MAX and fails the range test; a NULL table means no listeners */
+	if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
+		res = test_bit(group - 1, listeners->masks);
+
+	rcu_read_unlock();
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(netlink_has_listeners);
+
+static inline int netlink_broadcast_deliver(struct sock *sk,
+					    struct sk_buff *skb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	    test_bit(0, &nlk->state))
+		return -1;	/* receiver full or congested: skb not queued */
+	skb_set_owner_r(skb, sk);
+	__netlink_sendskb(sk, skb);
+	/* 1 when the receive buffer is now over its limit, else 0 */
+	return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf;
+}
+
+struct netlink_broadcast_data {
+	struct sock *exclude_sk;	/* the sending socket; never delivered to */
+	struct net *net;		/* only sockets in this namespace receive the message */
+	u32 pid;			/* sockets bound to this pid are skipped */
+	u32 group;			/* 1-based destination group */
+	int failure;			/* set once a clone failed; later sockets only get netlink_overrun() */
+	int delivery_failure;		/* a NETLINK_BROADCAST_SEND_ERROR socket missed the message */
+	int congested;			/* OR of netlink_broadcast_deliver() results */
+	int delivered;			/* at least one socket received the message */
+	gfp_t allocation;		/* allocation flags for skb_clone() */
+	struct sk_buff *skb, *skb2;	/* original message / copy prepared for the next receiver */
+	int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);	/* optional; non-zero return skips dsk */
+	void *tx_data;			/* opaque argument passed to tx_filter */
+};
+
+static inline int do_one_broadcast(struct sock *sk,
+				   struct netlink_broadcast_data *p)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int val;
+
+	if (p->exclude_sk == sk)
+		goto out;
+
+	if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+	    !test_bit(p->group - 1, nlk->groups))	/* excluded pid, or not subscribed to the group */
+		goto out;
+
+	if (!net_eq(sock_net(sk), p->net))
+		goto out;
+
+	if (p->failure) {	/* an earlier clone failed: only record the loss */
+		netlink_overrun(sk);
+		goto out;
+	}
+
+	sock_hold(sk);
+	if (p->skb2 == NULL) {	/* no copy left over from a previous socket */
+		if (skb_shared(p->skb)) {
+			p->skb2 = skb_clone(p->skb, p->allocation);
+		} else {
+			p->skb2 = skb_get(p->skb);
+			/*
+			 * skb ownership may have been set when
+			 * delivered to a previous socket.
+			 */
+			skb_orphan(p->skb2);
+		}
+	}
+	if (p->skb2 == NULL) {
+		netlink_overrun(sk);
+		/* Clone failed. Notify ALL listeners. */
+		p->failure = 1;
+		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+			p->delivery_failure = 1;
+	} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {	/* caller's filter rejected this socket */
+		kfree_skb(p->skb2);
+		p->skb2 = NULL;
+	} else if (sk_filter(sk, p->skb2)) {	/* socket filter dropped the message */
+		kfree_skb(p->skb2);
+		p->skb2 = NULL;
+	} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {	/* not queued: skb2 is kept for the next socket */
+		netlink_overrun(sk);
+		if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+			p->delivery_failure = 1;
+	} else {
+		p->congested |= val;
+		p->delivered = 1;
+		p->skb2 = NULL;	/* the copy now belongs to the receive queue */
+	}
+	sock_put(sk);
+
+out:
+	return 0;
+}
+
+int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid,
+	u32 group, gfp_t allocation,
+	int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data),
+	void *filter_data)
+{
+	struct net *net = sock_net(ssk);
+	struct netlink_broadcast_data info;
+	struct hlist_node *node;
+	struct sock *sk;
+
+	skb = netlink_trim(skb, allocation);
+
+	info.exclude_sk = ssk;
+	info.net = net;
+	info.pid = pid;
+	info.group = group;
+	info.failure = 0;
+	info.delivery_failure = 0;
+	info.congested = 0;
+	info.delivered = 0;
+	info.allocation = allocation;
+	info.skb = skb;
+	info.skb2 = NULL;
+	info.tx_filter = filter;
+	info.tx_data = filter_data;
+
+	/* While we sleep in clone, do not allow to change socket list */
+
+	netlink_lock_table();
+
+	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+		do_one_broadcast(sk, &info);
+
+	consume_skb(skb);	/* drop this function's reference to the original message */
+
+	netlink_unlock_table();
+
+	if (info.delivery_failure) {
+		kfree_skb(info.skb2);
+		return -ENOBUFS;	/* a NETLINK_BROADCAST_SEND_ERROR listener missed the message */
+	} else
+		consume_skb(info.skb2);	/* undelivered leftover copy, if any */
+
+	if (info.delivered) {
+		if (info.congested && (allocation & __GFP_WAIT))
+			yield();	/* a receiver is over its limit and the caller's allocation flags allow waiting */
+		return 0;
+	}
+	return -ESRCH;	/* no socket received the message */
+}
+EXPORT_SYMBOL(netlink_broadcast_filtered);
+
+int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
+		      u32 group, gfp_t allocation)
+{
+	return netlink_broadcast_filtered(ssk, skb, pid, group, allocation,
+		NULL, NULL);	/* no per-socket filter */
+}
+EXPORT_SYMBOL(netlink_broadcast);
+
+struct netlink_set_err_data {
+	struct sock *exclude_sk;	/* reporting socket; skipped, and its namespace limits the scope */
+	u32 pid;			/* sockets bound to this pid are skipped */
+	u32 group;			/* 1-based group whose subscribers get the error */
+	int code;			/* positive errno stored into sk_err */
+};
+
+static inline int do_one_set_err(struct sock *sk,
+				 struct netlink_set_err_data *p)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	/*
+	 * Returns 1 for a listener that suppresses ENOBUFS, 0 otherwise.
+	 */
+	/* Skip the reporting socket and sockets of another namespace. */
+	if (sk == p->exclude_sk ||
+	    !net_eq(sock_net(sk), sock_net(p->exclude_sk)))
+		return 0;
+
+	/* Skip the excluded pid and sockets not subscribed to the group. */
+	if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+	    !test_bit(p->group - 1, nlk->groups))
+		return 0;
+
+	/* Listeners that opted out of ENOBUFS are only counted. */
+	if (p->code == ENOBUFS && (nlk->flags & NETLINK_RECV_NO_ENOBUFS))
+		return 1;
+
+	sk->sk_err = p->code;
+	sk->sk_error_report(sk);
+	return 0;
+}
+
+/**
+ * netlink_set_err - report error to broadcast listeners
+ * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
+ * @pid: the PID of a process that we want to skip (if any)
+ * @group: the broadcast group that will notice the error
+ * @code: error code, must be negative (as usual in kernelspace)
+ *
+ * This function returns the number of broadcast listeners that have set the
+ * NETLINK_RECV_NO_ENOBUFS socket option.
+ */
+int netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
+{
+	struct netlink_set_err_data info;
+	struct hlist_node *node;
+	struct sock *sk;
+	int ret = 0;
+
+	info.exclude_sk = ssk;
+	info.pid = pid;
+	info.group = group;
+	/* sk->sk_err wants a positive error value */
+	info.code = -code;
+
+	read_lock(&nl_table_lock);
+
+	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
+		ret += do_one_set_err(sk, &info);	/* adds 1 only for a listener skipped because of NETLINK_RECV_NO_ENOBUFS */
+
+	read_unlock(&nl_table_lock);
+	return ret;
+}
+EXPORT_SYMBOL(netlink_set_err);
+
+/* must be called with netlink table grabbed */
+static void netlink_update_socket_mc(struct netlink_sock *nlk,
+				     unsigned int group,
+				     int is_new)
+{
+	/* normalise both to 0/1 so the count below moves by at most one */
+	int old = !!test_bit(group - 1, nlk->groups), new = !!is_new;
+
+	if (new)
+		__set_bit(group - 1, nlk->groups);
+	else
+		__clear_bit(group - 1, nlk->groups);
+	netlink_update_subscriptions(&nlk->sk,
+				     nlk->subscriptions - old + new);
+	netlink_update_listeners(&nlk->sk);
+}
+
+static int netlink_setsockopt(struct socket *sock, int level, int optname,
+			      char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	unsigned int val = 0;	/* stays 0 when optlen is shorter than an int */
+	int err;
+
+	if (level != SOL_NETLINK)
+		return -ENOPROTOOPT;
+
+	if (optlen >= sizeof(int) &&
+	    get_user(val, (unsigned int __user *)optval))
+		return -EFAULT;
+
+	switch (optname) {
+	case NETLINK_PKTINFO:
+		if (val)
+			nlk->flags |= NETLINK_RECV_PKTINFO;
+		else
+			nlk->flags &= ~NETLINK_RECV_PKTINFO;
+		err = 0;
+		break;
+	case NETLINK_ADD_MEMBERSHIP:
+	case NETLINK_DROP_MEMBERSHIP: {
+		if (!netlink_capable(sock, NL_NONROOT_RECV))
+			return -EPERM;
+		err = netlink_realloc_groups(sk);	/* sizes the bitmap to the protocol's group count */
+		if (err)
+			return err;
+		if (!val || val - 1 >= nlk->ngroups)	/* group numbers are 1-based */
+			return -EINVAL;
+		netlink_table_grab();
+		netlink_update_socket_mc(nlk, val,
+					 optname == NETLINK_ADD_MEMBERSHIP);
+		netlink_table_ungrab();
+		err = 0;
+		break;
+	}
+	case NETLINK_BROADCAST_ERROR:
+		if (val)
+			nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
+		else
+			nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
+		err = 0;
+		break;
+	case NETLINK_NO_ENOBUFS:
+		if (val) {
+			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
+			clear_bit(0, &nlk->state);	/* also clears the congestion bit */
+			wake_up_interruptible(&nlk->wait);
+		} else
+			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
+		err = 0;
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+	return err;
+}
+
+static int netlink_getsockopt(struct socket *sock, int level, int optname,
+			      char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int len, val, err;
+
+	if (level != SOL_NETLINK)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {	/* each option reports 1 or 0 as a full int */
+	case NETLINK_PKTINFO:
+		if (len < sizeof(int))
+			return -EINVAL;
+		len = sizeof(int);
+		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
+		if (put_user(len, optlen) ||
+		    put_user(val, (int __user *)optval))	/* put_user() sizes the store by the pointer type: a char pointer would write one byte */
+			return -EFAULT;
+		err = 0;
+		break;
+	case NETLINK_BROADCAST_ERROR:
+		if (len < sizeof(int))
+			return -EINVAL;
+		len = sizeof(int);
+		val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
+		if (put_user(len, optlen) ||
+		    put_user(val, (int __user *)optval))
+			return -EFAULT;
+		err = 0;
+		break;
+	case NETLINK_NO_ENOBUFS:
+		if (len < sizeof(int))
+			return -EINVAL;
+		len = sizeof(int);
+		val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
+		if (put_user(len, optlen) ||
+		    put_user(val, (int __user *)optval))
+			return -EFAULT;
+		err = 0;
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+	return err;
+}
+
+static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)	/* attach the skb's destination group as a NETLINK_PKTINFO cmsg */
+{
+	struct nl_pktinfo info;
+
+	info.group = NETLINK_CB(skb).dst_group;
+	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
+}
+
+static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
+			   struct msghdr *msg, size_t len)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct sockaddr_nl *addr = msg->msg_name;
+	u32 dst_pid;
+	u32 dst_group;
+	struct sk_buff *skb;
+	int err;
+	struct scm_cookie scm;
+
+	if (msg->msg_flags&MSG_OOB)
+		return -EOPNOTSUPP;
+
+	if (NULL == siocb->scm) {
+		siocb->scm = &scm;
+		memset(&scm, 0, sizeof(scm));
+	}
+	err = scm_send(sock, msg, siocb->scm);
+	if (err < 0)
+		return err;
+
+	if (msg->msg_namelen) {
+		/* a short msg_name cannot hold a sockaddr_nl: reject it */
+		err = -EINVAL;
+		if (msg->msg_namelen < (int)sizeof(*addr) ||
+		    addr->nl_family != AF_NETLINK)
+			goto out;
+		dst_pid = addr->nl_pid;
+		dst_group = ffs(addr->nl_groups);
+		err = -EPERM;
+		if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
+			goto out;
+	} else {
+		dst_pid = nlk->dst_pid;	/* fall back to the connect()ed destination */
+		dst_group = nlk->dst_group;
+	}
+	if (!nlk->pid) {
+		err = netlink_autobind(sock);
+		if (err)
+			goto out;
+	}
+	err = -EMSGSIZE;
+	if (len > sk->sk_sndbuf - 32)
+		goto out;
+	err = -ENOBUFS;
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (skb == NULL)
+		goto out;
+
+	NETLINK_CB(skb).pid	= nlk->pid;
+	NETLINK_CB(skb).dst_group = dst_group;
+	memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
+
+	err = -EFAULT;
+	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	err = security_netlink_send(sk, skb);
+	if (err) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	if (dst_group) {
+		atomic_inc(&skb->users);	/* extra reference: the unicast below still needs the skb */
+		netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
+	}
+	err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
+
+out:
+	scm_destroy(siocb->scm);
+	return err;
+}
+
+/*
+ * recvmsg() handler for netlink sockets.
+ *
+ * Dequeues one datagram, copies up to @len bytes of it into msg->msg_iov
+ * (setting MSG_TRUNC in msg->msg_flags when the datagram is longer), fills
+ * in the sender address when msg->msg_name is set, optionally attaches a
+ * NETLINK_PKTINFO control message and hands the sender's credentials to
+ * scm_recv().  If a dump is pending on the socket and the receive queue is
+ * at most half full, the next chunk of the dump is generated.
+ *
+ * Returns a non-zero error value if one was recorded, otherwise the number
+ * of bytes copied (or the full datagram length when MSG_TRUNC was passed
+ * in @flags).  MSG_OOB is rejected with -EOPNOTSUPP.
+ */
+static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
+			   struct msghdr *msg, size_t len,
+			   int flags)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
+	struct scm_cookie scm;
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int noblock = flags&MSG_DONTWAIT;
+	size_t copied;
+	struct sk_buff *skb, *data_skb;
+	int err, ret;
+
+	if (flags&MSG_OOB)
+		return -EOPNOTSUPP;
+
+	copied = 0;
+
+	/* On failure skb_recv_datagram() stores the reason in err. */
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (skb == NULL)
+		goto out;
+
+	data_skb = skb;
+
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+	if (unlikely(skb_shinfo(skb)->frag_list)) {
+		/*
+		 * If this skb has a frag_list, then here that means that we
+		 * will have to use the frag_list skb's data for compat tasks
+		 * and the regular skb's data for normal (non-compat) tasks.
+		 *
+		 * If we need to send the compat skb, assign it to the
+		 * 'data_skb' variable so that it will be used below for data
+		 * copying. We keep 'skb' for everything else, including
+		 * freeing both later.
+		 */
+		if (flags & MSG_CMSG_COMPAT)
+			data_skb = skb_shinfo(skb)->frag_list;
+	}
+#endif
+
+	msg->msg_namelen = 0;
+
+	/* Clamp the copy to the caller's buffer and flag the truncation. */
+	copied = data_skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	skb_reset_transport_header(data_skb);
+	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
+
+	/* Report the sender's pid and the group the message was sent to. */
+	if (msg->msg_name) {
+		struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
+		addr->nl_family = AF_NETLINK;
+		addr->nl_pad    = 0;
+		addr->nl_pid	= NETLINK_CB(skb).pid;
+		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
+		msg->msg_namelen = sizeof(*addr);
+	}
+
+	if (nlk->flags & NETLINK_RECV_PKTINFO)
+		netlink_cmsg_recv_pktinfo(msg, skb);
+
+	/* Use an on-stack cookie when the caller did not provide one. */
+	if (NULL == siocb->scm) {
+		memset(&scm, 0, sizeof(scm));
+		siocb->scm = &scm;
+	}
+	siocb->scm->creds = *NETLINK_CREDS(skb);
+	/* With MSG_TRUNC the caller wants the real datagram length. */
+	if (flags & MSG_TRUNC)
+		copied = data_skb->len;
+
+	skb_free_datagram(sk, skb);
+
+	/* Continue a pending dump once the receive queue has drained enough. */
+	if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
+		ret = netlink_dump(sk);
+		if (ret) {
+			sk->sk_err = ret;
+			sk->sk_error_report(sk);
+		}
+	}
+
+	scm_recv(sock, msg, siocb->scm, flags);
+out:
+	netlink_rcv_wake(sk);
+	return err ? : copied;
+}
+
+/*
+ * sk_data_ready handler installed on kernel-side sockets by
+ * netlink_kernel_create().  Reaching it triggers BUG().
+ */
+static void netlink_data_ready(struct sock *sk, int len)
+{
+	BUG();
+}
+
+/*
+ *	We export these functions to other modules. They provide a
+ *	complete set of kernel non-blocking support for message
+ *	queueing.
+ */
+
+/*
+ * netlink_kernel_create - create a kernel-side netlink socket
+ * @net: network namespace the socket is moved into
+ * @unit: netlink protocol number, must be in [0, MAX_LINKS)
+ * @groups: number of multicast groups (raised to 32 if smaller)
+ * @input: optional receive callback stored in the socket's netlink_rcv
+ * @cb_mutex: mutex passed to __netlink_create() and recorded in nl_table
+ * @module: owning module recorded in nl_table
+ *
+ * The first socket created for @unit registers the protocol in nl_table;
+ * later ones only bump the 'registered' counter and discard their freshly
+ * allocated listeners bitmap.
+ *
+ * Returns the new socket, or NULL on any failure.
+ */
+struct sock *
+netlink_kernel_create(struct net *net, int unit, unsigned int groups,
+		      void (*input)(struct sk_buff *skb),
+		      struct mutex *cb_mutex, struct module *module)
+{
+	struct socket *sock;
+	struct sock *sk;
+	struct netlink_sock *nlk;
+	struct listeners *listeners = NULL;
+
+	BUG_ON(!nl_table);
+
+	if (unit < 0 || unit >= MAX_LINKS)
+		return NULL;
+
+	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
+		return NULL;
+
+	/*
+	 * We have to just have a reference on the net from sk, but don't
+	 * get_net it. Besides, we cannot get and then put the net here.
+	 * So we create one inside init_net and then move it to net.
+	 */
+
+	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
+		goto out_sock_release_nosk;
+
+	sk = sock->sk;
+	sk_change_net(sk, net);
+
+	if (groups < 32)
+		groups = 32;
+
+	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
+	if (!listeners)
+		goto out_sock_release;
+
+	sk->sk_data_ready = netlink_data_ready;
+	if (input)
+		nlk_sk(sk)->netlink_rcv = input;
+
+	/* Kernel sockets are inserted with pid 0. */
+	if (netlink_insert(sk, net, 0))
+		goto out_sock_release;
+
+	nlk = nlk_sk(sk);
+	nlk->flags |= NETLINK_KERNEL_SOCKET;
+
+	netlink_table_grab();
+	if (!nl_table[unit].registered) {
+		nl_table[unit].groups = groups;
+		rcu_assign_pointer(nl_table[unit].listeners, listeners);
+		nl_table[unit].cb_mutex = cb_mutex;
+		nl_table[unit].module = module;
+		nl_table[unit].registered = 1;
+	} else {
+		/* Protocol already set up: keep its existing listeners. */
+		kfree(listeners);
+		nl_table[unit].registered++;
+	}
+	netlink_table_ungrab();
+	return sk;
+
+out_sock_release:
+	/* listeners is still NULL here if its allocation failed. */
+	kfree(listeners);
+	netlink_kernel_release(sk);
+	return NULL;
+
+out_sock_release_nosk:
+	sock_release(sock);
+	return NULL;
+}
+EXPORT_SYMBOL(netlink_kernel_create);
+
+
+/*
+ * netlink_kernel_release - release a kernel-side netlink socket
+ * @sk: socket to release; netlink_kernel_create() uses this on its own
+ *      error path
+ *
+ * Thin wrapper around sk_release_kernel().
+ */
+void
+netlink_kernel_release(struct sock *sk)
+{
+	sk_release_kernel(sk);
+}
+EXPORT_SYMBOL(netlink_kernel_release);
+
+/*
+ * Set the number of multicast groups for the protocol @sk belongs to.
+ * Values below 32 are raised to 32.  When the listeners bitmap is too
+ * small for the new count, a larger one is allocated with GFP_ATOMIC, the
+ * old masks are copied over, the new bitmap is published with
+ * rcu_assign_pointer() and the old one is freed via kfree_rcu().
+ *
+ * Locking is left to the caller; netlink_change_ngroups() is the variant
+ * that brackets the call with netlink_table_grab()/ungrab().
+ *
+ * Returns 0 on success or -ENOMEM if the larger bitmap cannot be allocated.
+ */
+int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
+{
+	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+	struct listeners *grown, *prev;
+
+	if (groups < 32)
+		groups = 32;
+
+	if (NLGRPSZ(groups) <= NLGRPSZ(tbl->groups)) {
+		/* The current bitmap is already large enough. */
+		tbl->groups = groups;
+		return 0;
+	}
+
+	grown = kzalloc(sizeof(*grown) + NLGRPSZ(groups), GFP_ATOMIC);
+	if (!grown)
+		return -ENOMEM;
+
+	prev = rcu_dereference_raw(tbl->listeners);
+	memcpy(grown->masks, prev->masks, NLGRPSZ(tbl->groups));
+	rcu_assign_pointer(tbl->listeners, grown);
+	kfree_rcu(prev, rcu);
+
+	tbl->groups = groups;
+	return 0;
+}
+
+/**
+ * netlink_change_ngroups - change number of multicast groups
+ *
+ * Sets how many multicast groups are available on a netlink family,
+ * holding the netlink table lock for the duration of the change.
+ * The count cannot be lowered below 32; smaller values are raised to 32.
+ * Reducing the count does not implicitly call
+ * netlink_clear_multicast_users().
+ *
+ * Returns 0 on success or a negative error code.
+ *
+ * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
+ * @groups: The new number of groups.
+ */
+int netlink_change_ngroups(struct sock *sk, unsigned int groups)
+{
+	int ret;
+
+	netlink_table_grab();
+	ret = __netlink_change_ngroups(sk, groups);
+	netlink_table_ungrab();
+
+	return ret;
+}
+
+/*
+ * Drop membership of @group for every socket on the multicast list of the
+ * protocol that @ksk belongs to.  Locking is left to the caller; see
+ * netlink_clear_multicast_users() for the variant that takes the table lock.
+ */
+void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+{
+	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
+	struct hlist_node *pos;
+	struct sock *listener;
+
+	sk_for_each_bound(listener, pos, &tbl->mc_list) {
+		netlink_update_socket_mc(nlk_sk(listener), group, 0);
+	}
+}
+
+/**
+ * netlink_clear_multicast_users - kick off multicast listeners
+ *
+ * This function removes all listeners from the given group.  It takes
+ * the netlink table lock around __netlink_clear_multicast_users().
+ * @ksk: The kernel netlink socket, as returned by
+ *	netlink_kernel_create().
+ * @group: The multicast group to clear.
+ */
+void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+{
+	netlink_table_grab();
+	__netlink_clear_multicast_users(ksk, group);
+	netlink_table_ungrab();
+}
+
+/*
+ * Store @flags as the nl_nonroot setting of @protocol.  Protocol numbers
+ * outside [0, MAX_LINKS) are silently ignored; the unsigned cast makes
+ * negative values fail the range check as well.
+ */
+void netlink_set_nonroot(int protocol, unsigned int flags)
+{
+	if ((unsigned int)protocol >= MAX_LINKS)
+		return;
+
+	nl_table[protocol].nl_nonroot = flags;
+}
+EXPORT_SYMBOL(netlink_set_nonroot);
+
+/*
+ * Free a dump callback: drop the skb reference it holds (taken in
+ * netlink_dump_start()) and release the callback structure itself.
+ */
+static void netlink_destroy_callback(struct netlink_callback *cb)
+{
+	kfree_skb(cb->skb);
+	kfree(cb);
+}
+
+/*
+ * It looks a bit ugly.
+ * It would be better to create kernel thread.
+ *
+ * Produce the next chunk of the dump pending on @sk.
+ *
+ * A buffer of NLMSG_GOODSIZE is allocated and handed to the callback's
+ * dump() under nlk->cb_mutex.  A positive return from dump() means more
+ * data follows: the buffer is queued to the socket and the callback stays
+ * installed.  Otherwise an NLMSG_DONE message carrying dump()'s return
+ * value is appended, the buffer is queued, the optional done() hook runs
+ * and the callback is removed and destroyed.  In both cases the buffer is
+ * dropped instead of queued when sk_filter() returns non-zero.
+ *
+ * Returns 0 on success, -ENOBUFS if the buffer cannot be allocated or the
+ * NLMSG_DONE header does not fit, -EINVAL if no dump is pending.
+ */
+
+static int netlink_dump(struct sock *sk)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct netlink_callback *cb;
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int len, err = -ENOBUFS;
+
+	skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	mutex_lock(nlk->cb_mutex);
+
+	cb = nlk->cb;
+	if (cb == NULL) {
+		err = -EINVAL;
+		goto errout_skb;
+	}
+
+	len = cb->dump(skb, cb);
+
+	if (len > 0) {
+		/* More to come: deliver this chunk and keep the callback. */
+		mutex_unlock(nlk->cb_mutex);
+
+		if (sk_filter(sk, skb))
+			kfree_skb(skb);
+		else
+			__netlink_sendskb(sk, skb);
+		return 0;
+	}
+
+	/* Dump finished (or failed): terminate with NLMSG_DONE + len. */
+	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
+	if (!nlh)
+		goto errout_skb;
+
+	memcpy(nlmsg_data(nlh), &len, sizeof(len));
+
+	if (sk_filter(sk, skb))
+		kfree_skb(skb);
+	else
+		__netlink_sendskb(sk, skb);
+
+	if (cb->done)
+		cb->done(cb);
+	nlk->cb = NULL;
+	mutex_unlock(nlk->cb_mutex);
+
+	netlink_destroy_callback(cb);
+	return 0;
+
+errout_skb:
+	mutex_unlock(nlk->cb_mutex);
+	kfree_skb(skb);
+errout:
+	return err;
+}
+
+/*
+ * netlink_dump_start - begin a dump towards the sender of @skb
+ * @ssk: kernel socket whose namespace and protocol identify the requester
+ * @skb: request message; an extra reference is held by the callback
+ * @nlh: header of the request, stored in the callback
+ * @dump: function producing dump data, called from netlink_dump()
+ * @done: optional hook called when the dump has completed
+ *
+ * Looks up the requesting socket by the pid in @skb's control block,
+ * installs the callback on it and generates the first chunk.
+ *
+ * Returns -EINTR when the dump was started successfully (so that the
+ * caller does not send an ACK), -ENOBUFS if the callback cannot be
+ * allocated, -ECONNREFUSED if the requesting socket is not found, -EBUSY
+ * if that socket already has a dump in progress, or the error returned by
+ * netlink_dump().
+ */
+int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+		       const struct nlmsghdr *nlh,
+		       int (*dump)(struct sk_buff *skb,
+				   struct netlink_callback *),
+		       int (*done)(struct netlink_callback *))
+{
+	struct netlink_callback *cb;
+	struct sock *sk;
+	struct netlink_sock *nlk;
+	int ret;
+
+	cb = kzalloc(sizeof(*cb), GFP_KERNEL);
+	if (cb == NULL)
+		return -ENOBUFS;
+
+	cb->dump = dump;
+	cb->done = done;
+	cb->nlh = nlh;
+	/* The callback keeps the request alive; dropped on destroy. */
+	atomic_inc(&skb->users);
+	cb->skb = skb;
+
+	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).pid);
+	if (sk == NULL) {
+		netlink_destroy_callback(cb);
+		return -ECONNREFUSED;
+	}
+	nlk = nlk_sk(sk);
+	/* A dump is in progress... */
+	mutex_lock(nlk->cb_mutex);
+	if (nlk->cb) {
+		mutex_unlock(nlk->cb_mutex);
+		netlink_destroy_callback(cb);
+		sock_put(sk);
+		return -EBUSY;
+	}
+	nlk->cb = cb;
+	mutex_unlock(nlk->cb_mutex);
+
+	ret = netlink_dump(sk);
+
+	sock_put(sk);
+
+	if (ret)
+		return ret;
+
+	/* We successfully started a dump, by returning -EINTR we
+	 * signal not to send ACK even if it was requested.
+	 */
+	return -EINTR;
+}
+EXPORT_SYMBOL(netlink_dump_start);
+
+/*
+ * netlink_ack - send an NLMSG_ERROR reply for a request
+ * @in_skb: skb the request arrived in; identifies the sender
+ * @nlh: header of the request being acknowledged
+ * @err: error code to report, 0 for a plain acknowledgement
+ *
+ * The reply always echoes the request header; when @err is non-zero the
+ * whole request message is appended.  If the reply cannot be allocated,
+ * ENOBUFS is reported on the sender's socket instead.
+ */
+void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *rep;
+	struct nlmsgerr *errmsg;
+	size_t payload = sizeof(*errmsg);
+
+	/* error messages get the original request appended */
+	if (err)
+		payload += nlmsg_len(nlh);
+
+	skb = nlmsg_new(payload, GFP_KERNEL);
+	if (!skb) {
+		struct sock *sk;
+
+		/* No memory for a reply: flag the error on the sender. */
+		sk = netlink_lookup(sock_net(in_skb->sk),
+				    in_skb->sk->sk_protocol,
+				    NETLINK_CB(in_skb).pid);
+		if (sk) {
+			sk->sk_err = ENOBUFS;
+			sk->sk_error_report(sk);
+			sock_put(sk);
+		}
+		return;
+	}
+
+	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+			  NLMSG_ERROR, payload, 0);
+	errmsg = nlmsg_data(rep);
+	errmsg->error = err;
+	/* Full request on error, header only for a plain ACK. */
+	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
+	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+}
+EXPORT_SYMBOL(netlink_ack);
+
+/*
+ * netlink_rcv_skb - process every netlink message contained in @skb
+ * @skb: buffer holding one or more netlink messages
+ * @cb: handler invoked for each request message
+ *
+ * Messages without NLM_F_REQUEST and control messages (type below
+ * NLMSG_MIN_TYPE) are not passed to @cb.  An ACK is sent when the message
+ * asks for one (NLM_F_ACK) or when @cb returned an error, except that a
+ * return of -EINTR from @cb suppresses the ACK.  Processing stops at the
+ * first message whose length field is inconsistent with the buffer.
+ *
+ * Always returns 0.
+ */
+int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+						     struct nlmsghdr *))
+{
+	struct nlmsghdr *nlh;
+	int err;
+
+	while (skb->len >= nlmsg_total_size(0)) {
+		int msglen;
+
+		nlh = nlmsg_hdr(skb);
+		err = 0;
+
+		/* Malformed length: give up on the rest of the buffer. */
+		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+			return 0;
+
+		/* Only requests are handled by the kernel */
+		if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
+			goto ack;
+
+		/* Skip control messages */
+		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
+			goto ack;
+
+		err = cb(skb, nlh);
+		if (err == -EINTR)
+			goto skip;
+
+ack:
+		if (nlh->nlmsg_flags & NLM_F_ACK || err)
+			netlink_ack(skb, nlh, err);
+
+skip:
+		/* Advance by the aligned length, clamped to what is left. */
+		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (msglen > skb->len)
+			msglen = skb->len;
+		skb_pull(skb, msglen);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(netlink_rcv_skb);
+
+/**
+ * nlmsg_notify - send a notification netlink message
+ * @sk: netlink socket to use
+ * @skb: notification message
+ * @pid: destination netlink pid for reports or 0
+ * @group: destination multicast group or 0
+ * @report: 1 to report back, 0 to disable
+ * @flags: allocation flags
+ *
+ * Multicasts @skb to @group when a group is given and, when @report is
+ * set, additionally unicasts it to @pid (which is then excluded from the
+ * multicast).  The multicast result is returned unless it is 0 or -ESRCH
+ * and a report was requested, in which case the unicast result is returned.
+ */
+int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
+		 unsigned int group, int report, gfp_t flags)
+{
+	int mc_err = 0;
+	int uc_err;
+
+	if (group) {
+		int skip_pid = 0;
+
+		if (report) {
+			/* keep a reference for the unicast that follows */
+			atomic_inc(&skb->users);
+			skip_pid = pid;
+		}
+
+		/* errors reported via destination sk->sk_err, but propagate
+		 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
+		mc_err = nlmsg_multicast(sk, skb, skip_pid, group, flags);
+	}
+
+	if (!report)
+		return mc_err;
+
+	uc_err = nlmsg_unicast(sk, skb, pid);
+	if (mc_err && mc_err != -ESRCH)
+		return mc_err;
+
+	return uc_err;
+}
+EXPORT_SYMBOL(nlmsg_notify);
+
+#ifdef CONFIG_PROC_FS
+/* Iterator state for the "netlink" proc file, kept in seq->private. */
+struct nl_seq_iter {
+	struct seq_net_private p;
+	int link;	/* index into nl_table of the current socket */
+	int hash_idx;	/* bucket of that table's pid hash holding it */
+};
+
+/*
+ * Return the @pos'th socket (counting from 0) that belongs to the seq
+ * file's network namespace, scanning every bucket of every protocol's pid
+ * hash in order.  The table index and bucket of the match are saved in the
+ * iterator.  Returns NULL when fewer than @pos + 1 sockets exist.
+ */
+static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
+{
+	struct nl_seq_iter *iter = seq->private;
+	struct hlist_node *node;
+	struct sock *s;
+	loff_t seen = 0;
+	int link, bucket;
+
+	for (link = 0; link < MAX_LINKS; link++) {
+		struct nl_pid_hash *hash = &nl_table[link].hash;
+
+		for (bucket = 0; bucket <= hash->mask; bucket++) {
+			sk_for_each(s, node, &hash->table[bucket]) {
+				if (sock_net(s) != seq_file_net(seq))
+					continue;
+				if (seen != pos) {
+					++seen;
+					continue;
+				}
+				iter->link = link;
+				iter->hash_idx = bucket;
+				return s;
+			}
+		}
+	}
+	return NULL;
+}
+
+/*
+ * seq_file ->start: take nl_table_lock for reading (released again in
+ * netlink_seq_stop()) and return the element for *pos.  Position 0 is the
+ * header token; position N > 0 is socket number N - 1.
+ */
+static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(nl_table_lock)
+{
+	read_lock(&nl_table_lock);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return netlink_seq_socket_idx(seq, *pos - 1);
+}
+
+/*
+ * seq_file ->next: advance *pos and return the socket following @v.
+ *
+ * After the header token the first socket is returned.  Otherwise the
+ * rest of the current hash chain is searched first, then the remaining
+ * buckets of the current table, then the following tables; sockets from
+ * other network namespaces are skipped throughout.  The iterator is
+ * updated whenever the result comes from a different bucket.  Returns
+ * NULL once all tables are exhausted.
+ */
+static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *s;
+	struct nl_seq_iter *iter;
+	int i, j;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return netlink_seq_socket_idx(seq, 0);
+
+	iter = seq->private;
+	s = v;
+	/* Rest of the chain the current socket sits on. */
+	do {
+		s = sk_next(s);
+	} while (s && sock_net(s) != seq_file_net(seq));
+	if (s)
+		return s;
+
+	/* Chain exhausted: resume at the next bucket of the same table. */
+	i = iter->link;
+	j = iter->hash_idx + 1;
+
+	do {
+		struct nl_pid_hash *hash = &nl_table[i].hash;
+
+		for (; j <= hash->mask; j++) {
+			s = sk_head(&hash->table[j]);
+			while (s && sock_net(s) != seq_file_net(seq))
+				s = sk_next(s);
+			if (s) {
+				iter->link = i;
+				iter->hash_idx = j;
+				return s;
+			}
+		}
+
+		/* Subsequent tables are scanned from their first bucket. */
+		j = 0;
+	} while (++i < MAX_LINKS);
+
+	return NULL;
+}
+
+/* seq_file ->stop: drop the read lock taken in netlink_seq_start(). */
+static void netlink_seq_stop(struct seq_file *seq, void *v)
+	__releases(nl_table_lock)
+{
+	read_unlock(&nl_table_lock);
+}
+
+
+/*
+ * seq_file ->show: print the column header for the start token, or one
+ * line describing socket @v: address, protocol, pid, first word of the
+ * group mask, receive/send memory in use, pending dump callback, reference
+ * count, drop count and inode number.  Always returns 0.
+ */
+static int netlink_seq_show(struct seq_file *seq, void *v)
+{
+	struct netlink_sock *nlk;
+	struct sock *s;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "sk       Eth Pid    Groups   "
+			 "Rmem     Wmem     Dump     Locks     Drops     Inode\n");
+		return 0;
+	}
+
+	s = v;
+	nlk = nlk_sk(s);
+
+	seq_printf(seq, "%pK %-3d %-6d %08x %-8d %-8d %pK %-8d %-8d %-8lu\n",
+		   s,
+		   s->sk_protocol,
+		   nlk->pid,
+		   nlk->groups ? (u32)nlk->groups[0] : 0,
+		   sk_rmem_alloc_get(s),
+		   sk_wmem_alloc_get(s),
+		   nlk->cb,
+		   atomic_read(&s->sk_refcnt),
+		   atomic_read(&s->sk_drops),
+		   sock_i_ino(s)
+		);
+
+	return 0;
+}
+
+/* seq_file iterator callbacks for the "netlink" proc file. */
+static const struct seq_operations netlink_seq_ops = {
+	.start  = netlink_seq_start,
+	.next   = netlink_seq_next,
+	.stop   = netlink_seq_stop,
+	.show   = netlink_seq_show,
+};
+
+
+/*
+ * open() handler for the "netlink" proc file: set up a per-namespace
+ * seq_file with a struct nl_seq_iter as private iterator state.
+ */
+static int netlink_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &netlink_seq_ops,
+				sizeof(struct nl_seq_iter));
+}
+
+/* File operations of the "netlink" proc file (see netlink_net_init()). */
+static const struct file_operations netlink_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= netlink_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+#endif
+
+/*
+ * Add @nb to the netlink_chain atomic notifier chain.  Returns the result
+ * of atomic_notifier_chain_register().
+ */
+int netlink_register_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&netlink_chain, nb);
+}
+EXPORT_SYMBOL(netlink_register_notifier);
+
+/*
+ * Remove @nb from the netlink_chain atomic notifier chain.  Returns the
+ * result of atomic_notifier_chain_unregister().
+ */
+int netlink_unregister_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&netlink_chain, nb);
+}
+EXPORT_SYMBOL(netlink_unregister_notifier);
+
+/*
+ * Socket operations for PF_NETLINK.  Operations that netlink does not
+ * implement (socketpair, accept, ioctl, listen, shutdown, mmap, sendpage)
+ * are wired to the generic sock_no_*() stubs.
+ */
+static const struct proto_ops netlink_ops = {
+	.family =	PF_NETLINK,
+	.owner =	THIS_MODULE,
+	.release =	netlink_release,
+	.bind =		netlink_bind,
+	.connect =	netlink_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	netlink_getname,
+	.poll =		datagram_poll,
+	.ioctl =	sock_no_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	netlink_setsockopt,
+	.getsockopt =	netlink_getsockopt,
+	.sendmsg =	netlink_sendmsg,
+	.recvmsg =	netlink_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/* Protocol family descriptor passed to sock_register() at init time. */
+static const struct net_proto_family netlink_family_ops = {
+	.family = PF_NETLINK,
+	.create = netlink_create,
+	.owner	= THIS_MODULE,	/* for consistency 8) */
+};
+
+/*
+ * Per-namespace init: with CONFIG_PROC_FS, create the namespace's
+ * "netlink" proc entry.  Returns 0, or -ENOMEM if the entry cannot be
+ * created.
+ */
+static int __net_init netlink_net_init(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	if (!proc_net_fops_create(net, "netlink", 0, &netlink_seq_fops))
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+/* Per-namespace exit: remove the "netlink" proc entry again. */
+static void __net_exit netlink_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	proc_net_remove(net, "netlink");
+#endif
+}
+
+/*
+ * Register NETLINK_USERSOCK in nl_table at boot with 32 multicast groups
+ * and a freshly allocated listeners bitmap.  Panics if the bitmap cannot
+ * be allocated.
+ */
+static void __init netlink_add_usersock_entry(void)
+{
+	struct netlink_table *tbl = &nl_table[NETLINK_USERSOCK];
+	struct listeners *listeners;
+	int groups = 32;
+
+	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
+	if (!listeners)
+		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
+
+	netlink_table_grab();
+
+	tbl->groups = groups;
+	rcu_assign_pointer(tbl->listeners, listeners);
+	tbl->module = THIS_MODULE;
+	tbl->registered = 1;
+
+	netlink_table_ungrab();
+}
+
+/* Per-namespace hooks registered from netlink_proto_init(). */
+static struct pernet_operations __net_initdata netlink_net_ops = {
+	.init = netlink_net_init,
+	.exit = netlink_net_exit,
+};
+
+/*
+ * Boot-time initialisation of the netlink protocol family.
+ *
+ * Registers netlink_proto, allocates nl_table with MAX_LINKS entries,
+ * gives every entry a single-bucket pid hash whose maximum shift is
+ * derived from the amount of RAM, sets up the NETLINK_USERSOCK entry,
+ * registers the socket family and the per-namespace operations and
+ * finally calls rtnetlink_init().
+ *
+ * Returns the result of proto_register(); panics if nl_table or one of
+ * the hash tables cannot be allocated.
+ */
+static int __init netlink_proto_init(void)
+{
+	struct sk_buff *dummy_skb;
+	int i;
+	unsigned long limit;
+	unsigned int order;
+	int err = proto_register(&netlink_proto, 0);
+
+	if (err != 0)
+		goto out;
+
+	/* The netlink control block must fit into skb->cb. */
+	BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb));
+
+	nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
+	if (!nl_table)
+		goto panic;
+
+	/* Scale the upper bound of the pid hash size with total RAM. */
+	if (totalram_pages >= (128 * 1024))
+		limit = totalram_pages >> (21 - PAGE_SHIFT);
+	else
+		limit = totalram_pages >> (23 - PAGE_SHIFT);
+
+	order = get_bitmask_order(limit) - 1 + PAGE_SHIFT;
+	limit = (1UL << order) / sizeof(struct hlist_head);
+	order = get_bitmask_order(min(limit, (unsigned long)UINT_MAX)) - 1;
+
+	for (i = 0; i < MAX_LINKS; i++) {
+		struct nl_pid_hash *hash = &nl_table[i].hash;
+
+		/* Every hash starts out with exactly one bucket. */
+		hash->table = nl_pid_hash_zalloc(1 * sizeof(*hash->table));
+		if (!hash->table) {
+			/* Undo the tables allocated so far before panicking. */
+			while (i-- > 0)
+				nl_pid_hash_free(nl_table[i].hash.table,
+						 1 * sizeof(*hash->table));
+			kfree(nl_table);
+			goto panic;
+		}
+		hash->max_shift = order;
+		hash->shift = 0;
+		hash->mask = 0;
+		hash->rehash_time = jiffies;
+	}
+
+	netlink_add_usersock_entry();
+
+	sock_register(&netlink_family_ops);
+	register_pernet_subsys(&netlink_net_ops);
+	/* The netlink device handler may be needed early. */
+	rtnetlink_init();
+out:
+	return err;
+panic:
+	panic("netlink_init: Cannot allocate nl_table\n");
+}
+
+core_initcall(netlink_proto_init);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
new file mode 100644
index 00000000..1781d991
--- /dev/null
+++ b/net/netlink/genetlink.c
@@ -0,0 +1,948 @@
+/*
+ * NETLINK      Generic Netlink Family
+ *
+ * 		Authors:	Jamal Hadi Salim
+ * 				Thomas Graf <tgraf@suug.ch>
+ *				Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
+
+/* Acquire genl_mutex, which serializes generic netlink message processing. */
+void genl_lock(void)
+{
+	mutex_lock(&genl_mutex);
+}
+EXPORT_SYMBOL(genl_lock);
+
+/* Release genl_mutex taken by genl_lock(). */
+void genl_unlock(void)
+{
+	mutex_unlock(&genl_mutex);
+}
+EXPORT_SYMBOL(genl_unlock);
+
+#define GENL_FAM_TAB_SIZE	16
+#define GENL_FAM_TAB_MASK	(GENL_FAM_TAB_SIZE - 1)
+
+static struct list_head family_ht[GENL_FAM_TAB_SIZE];
+/*
+ * Bitmap of multicast groups that are currently in use.
+ *
+ * To avoid an allocation at boot of just one unsigned long,
+ * declare it global instead.
+ * Bit 0 is marked as already used since group 0 is invalid.
+ */
+static unsigned long mc_group_start = 0x1;
+static unsigned long *mc_groups = &mc_group_start;
+static unsigned long mc_groups_longs = 1;
+
+static int genl_ctrl_event(int event, void *data);
+
+/* Map a family id to its family_ht bucket: the low bits of the id. */
+static inline unsigned int genl_family_hash(unsigned int id)
+{
+	return id & GENL_FAM_TAB_MASK;
+}
+
+/* Return the family_ht list head that families with this @id hash to. */
+static inline struct list_head *genl_family_chain(unsigned int id)
+{
+	return &family_ht[genl_family_hash(id)];
+}
+
+/*
+ * Look up a registered family by numeric id.  Only the hash chain the id
+ * maps to is searched.  Returns the family or NULL if none matches.
+ */
+static struct genl_family *genl_family_find_byid(unsigned int id)
+{
+	struct list_head *chain = genl_family_chain(id);
+	struct genl_family *fam;
+
+	list_for_each_entry(fam, chain, family_list) {
+		if (fam->id == id)
+			return fam;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up a registered family by name.  The table is hashed by id, so
+ * every chain has to be walked.  Returns the family or NULL if none
+ * matches.
+ */
+static struct genl_family *genl_family_find_byname(char *name)
+{
+	struct genl_family *fam;
+	int bucket;
+
+	for (bucket = 0; bucket < GENL_FAM_TAB_SIZE; bucket++) {
+		list_for_each_entry(fam, genl_family_chain(bucket), family_list) {
+			if (!strcmp(fam->name, name))
+				return fam;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the operation registered for command @cmd on @family.
+ * Returns the matching genl_ops or NULL if the command is not registered.
+ */
+static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
+{
+	struct genl_ops *op;
+
+	list_for_each_entry(op, &family->ops_list, ops_list) {
+		if (op->cmd == cmd)
+			return op;
+	}
+
+	return NULL;
+}
+
+/*
+ * Pick an unused family id in [GENL_MIN_ID, GENL_MAX_ID].
+ *
+ * The search starts at the candidate left over from the previous call and
+ * wraps around at GENL_MAX_ID, probing each id in the range at most once.
+ * Returns the free id, or 0 when every id in the range is taken.
+ */
+static inline u16 genl_generate_id(void)
+{
+	static u16 id_gen_idx = GENL_MIN_ID;
+	int remaining;
+
+	for (remaining = GENL_MAX_ID - GENL_MIN_ID + 1; remaining > 0;
+	     remaining--) {
+		if (!genl_family_find_byid(id_gen_idx))
+			return id_gen_idx;
+
+		id_gen_idx++;
+		if (id_gen_idx > GENL_MAX_ID)
+			id_gen_idx = GENL_MIN_ID;
+	}
+
+	return 0;
+}
+
+static struct genl_multicast_group notify_grp;
+
+/**
+ * genl_register_mc_group - register a multicast group
+ *
+ * Registers the specified multicast group and notifies userspace
+ * about the new group.
+ *
+ * Returns 0 on success or a negative error code.
+ *
+ * @family: The generic netlink family the group shall be registered for.
+ * @grp: The group to register, must have a name.
+ */
+int genl_register_mc_group(struct genl_family *family,
+			   struct genl_multicast_group *grp)
+{
+	int id;
+	unsigned long *new_groups;
+	int err = 0;
+
+	BUG_ON(grp->name[0] == '\0');
+
+	genl_lock();
+
+	/* special-case our own group */
+	if (grp == &notify_grp)
+		id = GENL_ID_CTRL;
+	else
+		id = find_first_zero_bit(mc_groups,
+					 mc_groups_longs * BITS_PER_LONG);
+
+
+	/* No free bit left: grow the bitmap by one unsigned long. */
+	if (id >= mc_groups_longs * BITS_PER_LONG) {
+		size_t nlen = (mc_groups_longs + 1) * sizeof(unsigned long);
+
+		if (mc_groups == &mc_group_start) {
+			/*
+			 * First growth: move off the static word to a heap
+			 * bitmap and carry the existing bits over.
+			 */
+			new_groups = kzalloc(nlen, GFP_KERNEL);
+			if (!new_groups) {
+				err = -ENOMEM;
+				goto out;
+			}
+			mc_groups = new_groups;
+			*mc_groups = mc_group_start;
+		} else {
+			new_groups = krealloc(mc_groups, nlen, GFP_KERNEL);
+			if (!new_groups) {
+				err = -ENOMEM;
+				goto out;
+			}
+			mc_groups = new_groups;
+			/* krealloc() does not clear the added word. */
+			mc_groups[mc_groups_longs] = 0;
+		}
+		mc_groups_longs++;
+	}
+
+	/*
+	 * Make the genl sockets accept the (possibly larger) group range:
+	 * every namespace's socket for netns-aware families, otherwise
+	 * only init_net's.
+	 */
+	if (family->netnsok) {
+		struct net *net;
+
+		netlink_table_grab();
+		rcu_read_lock();
+		for_each_net_rcu(net) {
+			err = __netlink_change_ngroups(net->genl_sock,
+					mc_groups_longs * BITS_PER_LONG);
+			if (err) {
+				/*
+				 * No need to roll back, can only fail if
+				 * memory allocation fails and then the
+				 * number of _possible_ groups has been
+				 * increased on some sockets which is ok.
+				 */
+				rcu_read_unlock();
+				netlink_table_ungrab();
+				goto out;
+			}
+		}
+		rcu_read_unlock();
+		netlink_table_ungrab();
+	} else {
+		err = netlink_change_ngroups(init_net.genl_sock,
+					     mc_groups_longs * BITS_PER_LONG);
+		if (err)
+			goto out;
+	}
+
+	/* Claim the id and link the group into its family. */
+	grp->id = id;
+	set_bit(id, mc_groups);
+	list_add_tail(&grp->list, &family->mcast_groups);
+	grp->family = family;
+
+	genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, grp);
+ out:
+	genl_unlock();
+	return err;
+}
+EXPORT_SYMBOL(genl_register_mc_group);
+
+/*
+ * Unregister @grp from @family: remove all listeners of the group in
+ * every network namespace, free its bit in mc_groups, unlink it from the
+ * family, notify userspace and reset the group's id and family pointer.
+ * It does not take genl_mutex itself; genl_unregister_mc_group() is the
+ * wrapper that does.  BUGs if @grp does not belong to @family.
+ */
+static void __genl_unregister_mc_group(struct genl_family *family,
+				       struct genl_multicast_group *grp)
+{
+	struct net *net;
+	BUG_ON(grp->family != family);
+
+	netlink_table_grab();
+	rcu_read_lock();
+	for_each_net_rcu(net)
+		__netlink_clear_multicast_users(net->genl_sock, grp->id);
+	rcu_read_unlock();
+	netlink_table_ungrab();
+
+	clear_bit(grp->id, mc_groups);
+	list_del(&grp->list);
+	/* Send the event while grp->id and grp->family are still set. */
+	genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, grp);
+	grp->id = 0;
+	grp->family = NULL;
+}
+
+/**
+ * genl_unregister_mc_group - unregister a multicast group
+ *
+ * Unregisters the specified multicast group and notifies userspace
+ * about it. All current listeners on the group are removed.
+ * The work is done by __genl_unregister_mc_group() under genl_mutex.
+ *
+ * Note: It is not necessary to unregister all multicast groups before
+ *       unregistering the family, unregistering the family will cause
+ *       all assigned multicast groups to be unregistered automatically.
+ *
+ * @family: Generic netlink family the group belongs to.
+ * @grp: The group to unregister, must have been registered successfully
+ *	 previously.
+ */
+void genl_unregister_mc_group(struct genl_family *family,
+			      struct genl_multicast_group *grp)
+{
+	genl_lock();
+	__genl_unregister_mc_group(family, grp);
+	genl_unlock();
+}
+EXPORT_SYMBOL(genl_unregister_mc_group);
+
+/*
+ * Unregister every multicast group still attached to @family.  The
+ * _safe iterator is required because each group is unlinked from the
+ * list while it is being walked.
+ */
+static void genl_unregister_mc_groups(struct genl_family *family)
+{
+	struct genl_multicast_group *grp, *next;
+
+	list_for_each_entry_safe(grp, next, &family->mcast_groups, list) {
+		__genl_unregister_mc_group(family, grp);
+	}
+}
+
+/**
+ * genl_register_ops - register generic netlink operations
+ * @family: generic netlink family
+ * @ops: operations to be registered
+ *
+ * Registers the specified operations and assigns them to the specified
+ * family. Either a doit or dumpit callback must be specified or the
+ * operation will fail. Only one operation structure per command
+ * identifier may be registered.
+ *
+ * The capability bits GENL_CMD_CAP_DUMP, GENL_CMD_CAP_DO and
+ * GENL_CMD_CAP_HASPOL are set in @ops->flags according to which of
+ * dumpit, doit and policy are provided.
+ *
+ * See include/net/genetlink.h for more documentation on the operations
+ * structure.
+ *
+ * Returns 0 on success, -EINVAL if neither callback is set, or -EEXIST
+ * if the command identifier is already registered for the family.
+ */
+int genl_register_ops(struct genl_family *family, struct genl_ops *ops)
+{
+	int err = -EINVAL;
+
+	if (ops->dumpit == NULL && ops->doit == NULL)
+		goto errout;
+
+	/*
+	 * The duplicate check and the insertion happen under a single hold
+	 * of genl_mutex: the ops list is never walked while another thread
+	 * modifies it, and two concurrent registrations of the same command
+	 * cannot both pass the check.
+	 */
+	genl_lock();
+
+	if (genl_get_cmd(ops->cmd, family)) {
+		err = -EEXIST;
+		goto errout_locked;
+	}
+
+	if (ops->dumpit)
+		ops->flags |= GENL_CMD_CAP_DUMP;
+	if (ops->doit)
+		ops->flags |= GENL_CMD_CAP_DO;
+	if (ops->policy)
+		ops->flags |= GENL_CMD_CAP_HASPOL;
+
+	list_add_tail(&ops->ops_list, &family->ops_list);
+	genl_unlock();
+
+	/* Notify userspace only after the lock has been dropped. */
+	genl_ctrl_event(CTRL_CMD_NEWOPS, ops);
+	return 0;
+
+errout_locked:
+	genl_unlock();
+errout:
+	return err;
+}
+EXPORT_SYMBOL(genl_register_ops);
+
+/**
+ * genl_unregister_ops - unregister generic netlink operations
+ * @family: generic netlink family
+ * @ops: operations to be unregistered
+ *
+ * Removes @ops from the family's operation list and notifies userspace.
+ * The removal is done under genl_mutex, so it waits for message
+ * processing that is currently in progress and message processing does
+ * not resume until the removal is complete.
+ *
+ * Note: It is not necessary to unregister all operations before
+ *       unregistering the family, unregistering the family will cause
+ *       all assigned operations to be unregistered automatically.
+ *
+ * Returns 0 on success or -ENOENT if @ops is not registered with @family.
+ */
+int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops)
+{
+	struct genl_ops *cur;
+
+	genl_lock();
+	list_for_each_entry(cur, &family->ops_list, ops_list) {
+		if (cur != ops)
+			continue;
+
+		list_del(&ops->ops_list);
+		genl_unlock();
+		genl_ctrl_event(CTRL_CMD_DELOPS, ops);
+		return 0;
+	}
+	genl_unlock();
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(genl_unregister_ops);
+
+/**
+ * genl_register_family - register a generic netlink family
+ * @family: generic netlink family
+ *
+ * Registers the specified family after validating it first. Only one
+ * family may be registered with the same family name or identifier.
+ * The family id may equal GENL_ID_GENERATE causing an unique id to
+ * be automatically generated and assigned.
+ *
+ * Return 0 on success or a negative error code: -EINVAL for an id
+ * outside the permitted range, -EEXIST if the name or id is already in
+ * use, -ENOMEM if no id could be generated or the attribute buffer
+ * could not be allocated.
+ */
+int genl_register_family(struct genl_family *family)
+{
+	int err = -EINVAL;
+
+	/* A non-zero id must lie within [GENL_MIN_ID, GENL_MAX_ID]. */
+	if (family->id && family->id < GENL_MIN_ID)
+		goto errout;
+
+	if (family->id > GENL_MAX_ID)
+		goto errout;
+
+	INIT_LIST_HEAD(&family->ops_list);
+	INIT_LIST_HEAD(&family->mcast_groups);
+
+	genl_lock();
+
+	if (genl_family_find_byname(family->name)) {
+		err = -EEXIST;
+		goto errout_locked;
+	}
+
+	if (family->id == GENL_ID_GENERATE) {
+		u16 newid = genl_generate_id();
+
+		/* genl_generate_id() returns 0 when no id is free. */
+		if (!newid) {
+			err = -ENOMEM;
+			goto errout_locked;
+		}
+
+		family->id = newid;
+	} else if (genl_family_find_byid(family->id)) {
+		err = -EEXIST;
+		goto errout_locked;
+	}
+
+	/* Scratch array for parsed attributes, indexed 0..maxattr. */
+	if (family->maxattr) {
+		family->attrbuf = kmalloc((family->maxattr+1) *
+					sizeof(struct nlattr *), GFP_KERNEL);
+		if (family->attrbuf == NULL) {
+			err = -ENOMEM;
+			goto errout_locked;
+		}
+	} else
+		family->attrbuf = NULL;
+
+	list_add_tail(&family->family_list, genl_family_chain(family->id));
+	genl_unlock();
+
+	genl_ctrl_event(CTRL_CMD_NEWFAMILY, family);
+
+	return 0;
+
+errout_locked:
+	genl_unlock();
+errout:
+	return err;
+}
+EXPORT_SYMBOL(genl_register_family);
+
+/**
+ * genl_register_family_with_ops - register a generic netlink family
+ * @family: generic netlink family
+ * @ops: operations to be registered
+ * @n_ops: number of elements to register
+ *
+ * Registers the specified family and operations from the specified table.
+ * Only one family may be registered with the same family name or identifier.
+ *
+ * The family id may equal GENL_ID_GENERATE causing an unique id to
+ * be automatically generated and assigned.
+ *
+ * Either a doit or dumpit callback must be specified for every registered
+ * operation or the function will fail. Only one operation structure per
+ * command identifier may be registered.
+ *
+ * See include/net/genetlink.h for more documentation on the operations
+ * structure.
+ *
+ * This is equivalent to calling genl_register_family() followed by
+ * genl_register_ops() for every operation entry in the table taking
+ * care to unregister the family on error path.
+ *
+ * Return 0 on success or a negative error code.
+ */
+int genl_register_family_with_ops(struct genl_family *family,
+	struct genl_ops *ops, size_t n_ops)
+{
+	size_t i;	/* same type as n_ops: no signed/unsigned comparison */
+	int err;
+
+	err = genl_register_family(family);
+	if (err)
+		return err;
+
+	for (i = 0; i < n_ops; ++i, ++ops) {
+		err = genl_register_ops(family, ops);
+		if (err)
+			goto err_out;
+	}
+	return 0;
+err_out:
+	/* Undo the family registration; err keeps the original failure. */
+	genl_unregister_family(family);
+	return err;
+}
+EXPORT_SYMBOL(genl_register_family_with_ops);
+
+/**
+ * genl_unregister_family - unregister generic netlink family
+ * @family: generic netlink family
+ *
+ * Unregisters the specified family.  Its multicast groups are
+ * unregistered first, then the family is unlinked from the family table,
+ * its operation list is reset, its attribute buffer is freed and
+ * userspace is notified.
+ *
+ * Returns 0 on success or -ENOENT if no registered family has the same
+ * id and name.
+ */
+int genl_unregister_family(struct genl_family *family)
+{
+	struct genl_family *rc;
+
+	genl_lock();
+
+	/*
+	 * NOTE(review): the groups are torn down before the lookup below,
+	 * so this also happens on the -ENOENT path - confirm callers only
+	 * pass families that were registered.
+	 */
+	genl_unregister_mc_groups(family);
+
+	list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
+		if (family->id != rc->id || strcmp(rc->name, family->name))
+			continue;
+
+		/*
+		 * NOTE(review): the match is by id and name, not by pointer;
+		 * the code below presumably relies on rc == family.
+		 */
+		list_del(&rc->family_list);
+		INIT_LIST_HEAD(&family->ops_list);
+		genl_unlock();
+
+		kfree(family->attrbuf);
+		genl_ctrl_event(CTRL_CMD_DELFAMILY, family);
+		return 0;
+	}
+
+	genl_unlock();
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(genl_unregister_family);
+
+/*
+ * Handle one generic netlink request; invoked through netlink_rcv_skb()
+ * from genl_rcv(), which holds genl_mutex around the call.
+ *
+ * The family is selected by nlh->nlmsg_type and the operation by the cmd
+ * field of the generic netlink header.  Dump requests are handed to
+ * netlink_dump_start(); everything else goes to the operation's doit()
+ * callback, bracketed by the family's optional pre_doit()/post_doit().
+ *
+ * Returns the callback's result or a negative errno: -ENOENT for an
+ * unknown family or one not available in this namespace, -EINVAL for a
+ * message shorter than the family's headers, -EOPNOTSUPP for an unknown
+ * command or a missing callback, -EPERM when GENL_ADMIN_PERM is required
+ * and security_netlink_recv() refuses the sender.
+ */
+static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct genl_ops *ops;
+	struct genl_family *family;
+	struct net *net = sock_net(skb->sk);
+	struct genl_info info;
+	struct genlmsghdr *hdr = nlmsg_data(nlh);
+	int hdrlen, err;
+
+	family = genl_family_find_byid(nlh->nlmsg_type);
+	if (family == NULL)
+		return -ENOENT;
+
+	/* this family doesn't exist in this netns */
+	if (!family->netnsok && !net_eq(net, &init_net))
+		return -ENOENT;
+
+	/* hdr is only read after the length has been validated. */
+	hdrlen = GENL_HDRLEN + family->hdrsize;
+	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+		return -EINVAL;
+
+	ops = genl_get_cmd(hdr->cmd, family);
+	if (ops == NULL)
+		return -EOPNOTSUPP;
+
+	if ((ops->flags & GENL_ADMIN_PERM) &&
+	    security_netlink_recv(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		if (ops->dumpit == NULL)
+			return -EOPNOTSUPP;
+
+		/* genl_mutex is dropped for the duration of the dump start. */
+		genl_unlock();
+		err = netlink_dump_start(net->genl_sock, skb, nlh,
+					 ops->dumpit, ops->done);
+		genl_lock();
+		return err;
+	}
+
+	if (ops->doit == NULL)
+		return -EOPNOTSUPP;
+
+	/* Parse attributes into the family's attrbuf, if it has one. */
+	if (family->attrbuf) {
+		err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr,
+				  ops->policy);
+		if (err < 0)
+			return err;
+	}
+
+	info.snd_seq = nlh->nlmsg_seq;
+	info.snd_pid = NETLINK_CB(skb).pid;
+	info.nlhdr = nlh;
+	info.genlhdr = nlmsg_data(nlh);
+	info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
+	info.attrs = family->attrbuf;
+	genl_info_net_set(&info, net);
+	memset(&info.user_ptr, 0, sizeof(info.user_ptr));
+
+	if (family->pre_doit) {
+		err = family->pre_doit(ops, skb, &info);
+		if (err)
+			return err;
+	}
+
+	err = ops->doit(skb, &info);
+
+	/* post_doit() runs regardless of doit()'s result. */
+	if (family->post_doit)
+		family->post_doit(ops, skb, &info);
+
+	return err;
+}
+
+static void genl_rcv(struct sk_buff *skb)
+{
+	genl_lock();
+	netlink_rcv_skb(skb, &genl_rcv_msg);
+	genl_unlock();
+}
+
+/**************************************************************************
+ * Controller
+ **************************************************************************/
+
+static struct genl_family genl_ctrl = {
+	.id = GENL_ID_CTRL,
+	.name = "nlctrl",
+	.version = 0x2,
+	.maxattr = CTRL_ATTR_MAX,
+	.netnsok = true,
+};
+
+static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
+			  u32 flags, struct sk_buff *skb, u8 cmd)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd);
+	if (hdr == NULL)
+		return -1;
+
+	NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name);
+	NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id);
+	NLA_PUT_U32(skb, CTRL_ATTR_VERSION, family->version);
+	NLA_PUT_U32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize);
+	NLA_PUT_U32(skb, CTRL_ATTR_MAXATTR, family->maxattr);
+
+	if (!list_empty(&family->ops_list)) {
+		struct nlattr *nla_ops;
+		struct genl_ops *ops;
+		int idx = 1;
+
+		nla_ops = nla_nest_start(skb, CTRL_ATTR_OPS);
+		if (nla_ops == NULL)
+			goto nla_put_failure;
+
+		list_for_each_entry(ops, &family->ops_list, ops_list) {
+			struct nlattr *nest;
+
+			nest = nla_nest_start(skb, idx++);
+			if (nest == NULL)
+				goto nla_put_failure;
+
+			NLA_PUT_U32(skb, CTRL_ATTR_OP_ID, ops->cmd);
+			NLA_PUT_U32(skb, CTRL_ATTR_OP_FLAGS, ops->flags);
+
+			nla_nest_end(skb, nest);
+		}
+
+		nla_nest_end(skb, nla_ops);
+	}
+
+	if (!list_empty(&family->mcast_groups)) {
+		struct genl_multicast_group *grp;
+		struct nlattr *nla_grps;
+		int idx = 1;
+
+		nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
+		if (nla_grps == NULL)
+			goto nla_put_failure;
+
+		list_for_each_entry(grp, &family->mcast_groups, list) {
+			struct nlattr *nest;
+
+			nest = nla_nest_start(skb, idx++);
+			if (nest == NULL)
+				goto nla_put_failure;
+
+			NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id);
+			NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME,
+				       grp->name);
+
+			nla_nest_end(skb, nest);
+		}
+		nla_nest_end(skb, nla_grps);
+	}
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid,
+				u32 seq, u32 flags, struct sk_buff *skb,
+				u8 cmd)
+{
+	void *hdr;
+	struct nlattr *nla_grps;
+	struct nlattr *nest;
+
+	hdr = genlmsg_put(skb, pid, seq, &genl_ctrl, flags, cmd);
+	if (hdr == NULL)
+		return -1;
+
+	NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, grp->family->name);
+	NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, grp->family->id);
+
+	nla_grps = nla_nest_start(skb, CTRL_ATTR_MCAST_GROUPS);
+	if (nla_grps == NULL)
+		goto nla_put_failure;
+
+	nest = nla_nest_start(skb, 1);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(skb, CTRL_ATTR_MCAST_GRP_ID, grp->id);
+	NLA_PUT_STRING(skb, CTRL_ATTR_MCAST_GRP_NAME,
+		       grp->name);
+
+	nla_nest_end(skb, nest);
+	nla_nest_end(skb, nla_grps);
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	/* cb->args[0]/[1] hold the resume point: chain index, family count */
+	int i, n = 0;
+	struct genl_family *rt;
+	struct net *net = sock_net(skb->sk);
+	int chains_to_skip = cb->args[0];
+	int fams_to_skip = cb->args[1];
+
+	for (i = chains_to_skip; i < GENL_FAM_TAB_SIZE; i++) {
+		n = 0;
+		list_for_each_entry(rt, genl_family_chain(i), family_list) {
+			if (!rt->netnsok && !net_eq(net, &init_net))
+				continue;
+			if (++n < fams_to_skip)
+				continue;
+			if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid,
+					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					   skb, CTRL_CMD_NEWFAMILY) < 0)
+				goto errout;
+		}
+		/* the family skip count only applies to the first chain walked */
+		fams_to_skip = 0;
+	}
+
+errout:
+	cb->args[0] = i;
+	cb->args[1] = n;
+
+	return skb->len;
+}
+
+/* Allocate an skb and fill it with one family's control message. */
+static struct sk_buff *ctrl_build_family_msg(struct genl_family *family,
+					     u32 pid, int seq, u8 cmd)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	int rc;
+
+	if (!msg)
+		return ERR_PTR(-ENOBUFS);
+
+	/* On a fill error the half-built message is freed, not returned. */
+	rc = ctrl_fill_info(family, pid, seq, 0, msg, cmd);
+	if (rc >= 0)
+		return msg;
+
+	nlmsg_free(msg);
+	return ERR_PTR(rc);
+}
+
+/* Allocate an skb and fill it with one multicast group's control message. */
+static struct sk_buff *ctrl_build_mcgrp_msg(struct genl_multicast_group *grp,
+					    u32 pid, int seq, u8 cmd)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	int rc;
+
+	if (!msg)
+		return ERR_PTR(-ENOBUFS);
+
+	/* On a fill error the half-built message is freed, not returned. */
+	rc = ctrl_fill_mcgrp_info(grp, pid, seq, 0, msg, cmd);
+	if (rc >= 0)
+		return msg;
+
+	nlmsg_free(msg);
+	return ERR_PTR(rc);
+}
+
+static const struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] = {
+	[CTRL_ATTR_FAMILY_ID]	= { .type = NLA_U16 },
+	[CTRL_ATTR_FAMILY_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = GENL_NAMSIZ - 1 },
+};
+
+static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	struct genl_family *res = NULL;
+	int err = -EINVAL;	/* returned when neither id nor name is given */
+
+	if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
+		u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]);
+		res = genl_family_find_byid(id);
+		err = -ENOENT;
+	}
+	/* a name attribute, when present, overrides the lookup by id above */
+	if (info->attrs[CTRL_ATTR_FAMILY_NAME]) {
+		char *name;
+
+		name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]);
+		res = genl_family_find_byname(name);
+		err = -ENOENT;
+	}
+
+	if (res == NULL)
+		return err;
+
+	if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) {
+		/* family doesn't exist here */
+		return -ENOENT;
+	}
+	/* answer the sender with a CTRL_CMD_NEWFAMILY message */
+	msg = ctrl_build_family_msg(res, info->snd_pid, info->snd_seq,
+				    CTRL_CMD_NEWFAMILY);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	return genlmsg_reply(msg, info);
+}
+
+static int genl_ctrl_event(int event, void *data)
+{
+	struct sk_buff *msg;
+	struct genl_family *family;
+	struct genl_multicast_group *grp;
+	/* Multicast a family/group change to group GENL_ID_CTRL. Nothing is
+	 * sent while genl is still initialising (no init_net socket yet). */
+	if (!init_net.genl_sock)
+		return 0;
+
+	switch (event) {
+	case CTRL_CMD_NEWFAMILY:
+	case CTRL_CMD_DELFAMILY:
+		family = data;	/* data is a struct genl_family */
+		msg = ctrl_build_family_msg(family, 0, 0, event);
+		break;
+	case CTRL_CMD_NEWMCAST_GRP:
+	case CTRL_CMD_DELMCAST_GRP:
+		grp = data;	/* data is a struct genl_multicast_group */
+		family = grp->family;
+		msg = ctrl_build_mcgrp_msg(data, 0, 0, event);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+	/* families without netnsok are announced in init_net only */
+	if (!family->netnsok) {
+		genlmsg_multicast_netns(&init_net, msg, 0,
+					GENL_ID_CTRL, GFP_KERNEL);
+	} else {
+		rcu_read_lock();
+		genlmsg_multicast_allns(msg, 0, GENL_ID_CTRL, GFP_ATOMIC);
+		rcu_read_unlock();
+	}
+
+	return 0;	/* multicast results are not propagated */
+}
+
+static struct genl_ops genl_ctrl_ops = {
+	.cmd		= CTRL_CMD_GETFAMILY,
+	.doit		= ctrl_getfamily,
+	.dumpit		= ctrl_dumpfamily,
+	.policy		= ctrl_policy,
+};
+
+static struct genl_multicast_group notify_grp = {
+	.name		= "notify",
+};
+
+static int __net_init genl_pernet_init(struct net *net)
+{
+	/* the group number gets bumped right after creation */
+	net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, 0,
+					       genl_rcv, &genl_mutex,
+					       THIS_MODULE);
+	if (net->genl_sock)
+		return 0;
+
+	/* creation failed: fatal for init_net, an error for other namespaces */
+	if (net_eq(net, &init_net))
+		panic("GENL: Cannot initialize generic netlink\n");
+
+	return -ENOMEM;
+}
+
+static void __net_exit genl_pernet_exit(struct net *net)
+{
+	netlink_kernel_release(net->genl_sock);
+	net->genl_sock = NULL;
+}
+
+static struct pernet_operations genl_pernet_ops = {
+	.init = genl_pernet_init,
+	.exit = genl_pernet_exit,
+};
+
+static int __init genl_init(void)
+{
+	int i, err;
+
+	for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
+		INIT_LIST_HEAD(&family_ht[i]);
+	/* register the "nlctrl" controller family and its single op */
+	err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1);
+	if (err < 0)
+		goto problem;
+
+	netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV);
+
+	err = register_pernet_subsys(&genl_pernet_ops);
+	if (err)
+		goto problem;
+	/* add the controller's "notify" multicast group */
+	err = genl_register_mc_group(&genl_ctrl, &notify_grp);
+	if (err < 0)
+		goto problem;
+
+	return 0;
+	/* every failure above ends in a panic */
+problem:
+	panic("GENL: Cannot register controller: %d\n", err);
+}
+
+subsys_initcall(genl_init);
+
+static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group,
+			 gfp_t flags)
+{
+	struct sk_buff *tmp;
+	struct net *net, *prev = NULL;
+	int err;
+
+	for_each_net_rcu(net) {
+		if (prev) {
+			tmp = skb_clone(skb, flags);
+			if (!tmp) {
+				err = -ENOMEM;
+				goto error;
+			}
+			err = nlmsg_multicast(prev->genl_sock, tmp,
+					      pid, group, flags);
+			if (err)
+				goto error;
+		}
+		/* delivery lags one netns behind so the last can take skb itself */
+		prev = net;
+	}
+	/* the last namespace receives the original skb rather than a clone */
+	return nlmsg_multicast(prev->genl_sock, skb, pid, group, flags);
+ error:
+	kfree_skb(skb);	/* the original was never handed over */
+	return err;
+}
+
+int genlmsg_multicast_allns(struct sk_buff *skb, u32 pid, unsigned int group,
+			    gfp_t flags)
+{
+	return genlmsg_mcast(skb, pid, group, flags);
+}
+EXPORT_SYMBOL(genlmsg_multicast_allns);
diff --git a/net/netrom/Makefile b/net/netrom/Makefile
new file mode 100644
index 00000000..2660f5a1
--- /dev/null
+++ b/net/netrom/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux NET/ROM layer.
+#
+
+obj-$(CONFIG_NETROM) += netrom.o
+
+netrom-y		:= af_netrom.o nr_dev.o nr_in.o nr_loopback.o \
+			   nr_out.o nr_route.o nr_subr.o nr_timer.o
+netrom-$(CONFIG_SYSCTL)	+= sysctl_net_netrom.o
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
new file mode 100644
index 00000000..732152f7
--- /dev/null
+++ b/net/netrom/af_netrom.c
@@ -0,0 +1,1509 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/stat.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <net/netrom.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
+#include <net/arp.h>
+#include <linux/init.h>
+
+static int nr_ndevs = 4;
+
+int sysctl_netrom_default_path_quality            = NR_DEFAULT_QUAL;
+int sysctl_netrom_obsolescence_count_initialiser  = NR_DEFAULT_OBS;
+int sysctl_netrom_network_ttl_initialiser         = NR_DEFAULT_TTL;
+int sysctl_netrom_transport_timeout               = NR_DEFAULT_T1;
+int sysctl_netrom_transport_maximum_tries         = NR_DEFAULT_N2;
+int sysctl_netrom_transport_acknowledge_delay     = NR_DEFAULT_T2;
+int sysctl_netrom_transport_busy_delay            = NR_DEFAULT_T4;
+int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW;
+int sysctl_netrom_transport_no_activity_timeout   = NR_DEFAULT_IDLE;
+int sysctl_netrom_routing_control                 = NR_DEFAULT_ROUTING;
+int sysctl_netrom_link_fails_count                = NR_DEFAULT_FAILS;
+int sysctl_netrom_reset_circuit                   = NR_DEFAULT_RESET;
+
+static unsigned short circuit = 0x101;
+
+static HLIST_HEAD(nr_list);
+static DEFINE_SPINLOCK(nr_list_lock);
+
+static const struct proto_ops nr_proto_ops;
+
+/*
+ * NETROM network devices are virtual network devices encapsulating NETROM
+ * frames into AX.25 which will be sent through an AX.25 device, so form a
+ * special "super class" of normal net devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key nr_netdev_xmit_lock_key;
+static struct lock_class_key nr_netdev_addr_lock_key;
+
+static void nr_set_lockdep_one(struct net_device *dev,
+			       struct netdev_queue *txq,
+			       void *_unused)
+{
+	lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key);
+}
+
+static void nr_set_lockdep_key(struct net_device *dev)
+{
+	lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key);
+	netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL);
+}
+
+/*
+ *	Socket removal during an interrupt is now safe.
+ */
+static void nr_remove_socket(struct sock *sk)
+{
+	spin_lock_bh(&nr_list_lock);
+	sk_del_node_init(sk);
+	spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ *	Disconnect (ENETUNREACH) every listed socket bound to a dropped device.
+ */
+static void nr_kill_by_device(struct net_device *dev)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	spin_lock_bh(&nr_list_lock);
+	sk_for_each(s, node, &nr_list)
+		if (nr_sk(s)->device == dev)
+			nr_disconnect(s, ENETUNREACH);
+	spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ *	Netdevice notifier: when a device in the initial namespace goes down,
+ *	disconnect the sockets bound to it and call nr_rt_device_down().
+ */
+static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *netdev = ptr;
+
+	/* Other namespaces and all other events are ignored. */
+	if (net_eq(dev_net(netdev), &init_net) && event == NETDEV_DOWN) {
+		nr_kill_by_device(netdev);
+		nr_rt_device_down(netdev);
+	}
+
+	/* The result is NOTIFY_DONE whether or not anything was done. */
+	return NOTIFY_DONE;
+}
+
+/*
+ *	Add a socket to the bound sockets list.
+ */
+static void nr_insert_socket(struct sock *sk)
+{
+	spin_lock_bh(&nr_list_lock);
+	sk_add_node(sk, &nr_list);
+	spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ *	Find a listening socket bound to addr for an incoming Connect Request.
+ *	A match is returned with bh_lock_sock() held; NULL if none is found.
+ */
+static struct sock *nr_find_listener(ax25_address *addr)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	spin_lock_bh(&nr_list_lock);
+	sk_for_each(s, node, &nr_list)
+		if (!ax25cmp(&nr_sk(s)->source_addr, addr) &&
+		    s->sk_state == TCP_LISTEN) {
+			bh_lock_sock(s);
+			goto found;
+		}
+	s = NULL;
+found:
+	spin_unlock_bh(&nr_list_lock);
+	return s;
+}
+
+/*
+ *	Find a socket by my circuit IDs; returned with bh_lock_sock() held.
+ */
+static struct sock *nr_find_socket(unsigned char index, unsigned char id)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	spin_lock_bh(&nr_list_lock);
+	sk_for_each(s, node, &nr_list) {
+		struct nr_sock *nr = nr_sk(s);
+
+		if (nr->my_index == index && nr->my_id == id) {
+			bh_lock_sock(s);	/* caller must bh_unlock_sock() */
+			goto found;
+		}
+	}
+	s = NULL;	/* no match */
+found:
+	spin_unlock_bh(&nr_list_lock);
+	return s;
+}
+
+/*
+ *	Find a socket by their circuit IDs and dest; returned bh_lock_sock()ed.
+ */
+static struct sock *nr_find_peer(unsigned char index, unsigned char id,
+	ax25_address *dest)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	spin_lock_bh(&nr_list_lock);
+	sk_for_each(s, node, &nr_list) {
+		struct nr_sock *nr = nr_sk(s);
+
+		if (nr->your_index == index && nr->your_id == id &&
+		    !ax25cmp(&nr->dest_addr, dest)) {
+			bh_lock_sock(s);	/* caller must bh_unlock_sock() */
+			goto found;
+		}
+	}
+	s = NULL;	/* no match */
+found:
+	spin_unlock_bh(&nr_list_lock);
+	return s;
+}
+
+/*
+ *	Find next free circuit ID, searching upwards from the global 'circuit'.
+ */
+static unsigned short nr_find_next_circuit(void)
+{
+	unsigned short id = circuit;
+	unsigned char i, j;
+	struct sock *sk;
+
+	for (;;) {	/* NOTE(review): never exits if every ID is in use */
+		i = id / 256;
+		j = id % 256;
+
+		if (i != 0 && j != 0) {	/* IDs with a zero byte are skipped */
+			if ((sk=nr_find_socket(i, j)) == NULL)
+				break;
+			bh_unlock_sock(sk);	/* nr_find_socket() locked it */
+		}
+
+		id++;
+	}
+
+	return id;
+}
+
+/*
+ *	Deferred destroy.
+ */
+void nr_destroy_socket(struct sock *);
+
+/*
+ *	Handler for deferred kills.
+ */
+static void nr_destroy_timer(unsigned long data)
+{
+	struct sock *sk=(struct sock *)data;
+	bh_lock_sock(sk);
+	sock_hold(sk);
+	nr_destroy_socket(sk);
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	This is called from user mode and the timers. Thus it protects itself
+ *	against interrupt users but doesn't worry about being called during
+ *	work. Once it is removed from the queue no interrupt or bottom half
+ *	will touch it and we are (fairly 8-) ) safe.
+ */
+void nr_destroy_socket(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	nr_remove_socket(sk);
+	/* stop all of the socket's protocol timers */
+	nr_stop_heartbeat(sk);
+	nr_stop_t1timer(sk);
+	nr_stop_t2timer(sk);
+	nr_stop_t4timer(sk);
+	nr_stop_idletimer(sk);
+
+	nr_clear_queues(sk);		/* Flush the queues */
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		if (skb->sk != sk) { /* A pending connection */
+			/* Queue the unaccepted socket for death */
+			sock_set_flag(skb->sk, SOCK_DEAD);
+			nr_start_heartbeat(skb->sk);
+			nr_sk(skb->sk)->state = NR_STATE_0;
+		}
+
+		kfree_skb(skb);
+	}
+
+	if (sk_has_allocations(sk)) {
+		/* Defer: outstanding buffers */
+		sk->sk_timer.function = nr_destroy_timer;
+		sk->sk_timer.expires  = jiffies + 2 * HZ;	/* retry in 2s */
+		add_timer(&sk->sk_timer);
+	} else
+		sock_put(sk);	/* nothing outstanding: drop the reference now */
+}
+
+/*
+ *	Handling for system calls applied via the various interfaces to a
+ *	NET/ROM socket object.
+ */
+
+/* Set a SOL_NETROM option from an int; timer values are range-checked. */
+static int nr_setsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, unsigned int optlen)
+{
+	struct nr_sock *nr = nr_sk(sock->sk);
+	int opt;	/* range-checked so opt * HZ fits in an int */
+
+	if (level != SOL_NETROM)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(opt, (int __user *)optval))
+		return -EFAULT;
+
+	switch (optname) {
+	case NETROM_T1:		/* seconds */
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		nr->t1 = opt * HZ;
+		return 0;
+
+	case NETROM_T2:		/* seconds */
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		nr->t2 = opt * HZ;
+		return 0;
+
+	case NETROM_N2:		/* retry count, stored unscaled */
+		if (opt < 1 || opt > 31)
+			return -EINVAL;
+		nr->n2 = opt;
+		return 0;
+
+	case NETROM_T4:		/* seconds */
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		nr->t4 = opt * HZ;
+		return 0;
+
+	case NETROM_IDLE:	/* minutes; 0 is accepted */
+		if (opt < 0 || opt > INT_MAX / (60 * HZ))
+			return -EINVAL;
+		nr->idle = opt * 60 * HZ;
+		return 0;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+static int nr_getsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct nr_sock *nr = nr_sk(sk);
+	int val = 0;
+	int len;	/* user buffer size in, bytes copied out */
+
+	if (level != SOL_NETROM)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < 0)
+		return -EINVAL;
+	/* values stored scaled by HZ are reported back unscaled */
+	switch (optname) {
+	case NETROM_T1:
+		val = nr->t1 / HZ;
+		break;
+
+	case NETROM_T2:
+		val = nr->t2 / HZ;
+		break;
+
+	case NETROM_N2:
+		val = nr->n2;
+		break;
+
+	case NETROM_T4:
+		val = nr->t4 / HZ;
+		break;
+
+	case NETROM_IDLE:
+		val = nr->idle / (60 * HZ);
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+	/* copy out at most sizeof(int) bytes and report the length used */
+	len = min_t(unsigned int, len, sizeof(int));
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	return copy_to_user(optval, &val, len) ? -EFAULT : 0;
+}
+
+/* Switch a socket to TCP_LISTEN; a socket already listening is refused. */
+static int nr_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int rc = -EOPNOTSUPP;
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_LISTEN) {
+		memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN);
+		sk->sk_max_ack_backlog = backlog;
+		sk->sk_state = TCP_LISTEN;
+		rc = 0;
+	}
+	release_sock(sk);
+	return rc;
+}
+
+static struct proto nr_proto = {
+	.name	  = "NETROM",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct nr_sock),
+};
+
+/* Create a NET/ROM socket (SOCK_SEQPACKET, protocol 0, init_net only) and
+ * seed its timers, retry limit and window from the netrom sysctls. */
+static int nr_create(struct net *net, struct socket *sock, int protocol,
+		     int kern)
+{
+	struct sock *sk;
+	struct nr_sock *nr;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	if (sock->type != SOCK_SEQPACKET || protocol != 0)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto);
+	if (sk  == NULL)
+		return -ENOMEM;
+
+	nr = nr_sk(sk);
+
+	sock_init_data(sock, sk);
+
+	sock->ops    = &nr_proto_ops;
+	sk->sk_protocol = protocol;
+
+	skb_queue_head_init(&nr->ack_queue);
+	skb_queue_head_init(&nr->reseq_queue);
+	skb_queue_head_init(&nr->frag_queue);
+
+	nr_init_timers(sk);
+
+	nr->t1     = msecs_to_jiffies(sysctl_netrom_transport_timeout);
+	nr->t2     =
+		msecs_to_jiffies(sysctl_netrom_transport_acknowledge_delay);
+	/* N2 is a retry count, not a duration: no jiffies conversion. */
+	nr->n2     = sysctl_netrom_transport_maximum_tries;
+	nr->t4     = msecs_to_jiffies(sysctl_netrom_transport_busy_delay);
+	nr->idle   =
+		msecs_to_jiffies(sysctl_netrom_transport_no_activity_timeout);
+	nr->window = sysctl_netrom_transport_requested_window_size;
+
+	nr->bpqext = 1;
+	nr->state  = NR_STATE_0;
+
+	return 0;
+}
+
+static struct sock *nr_make_new(struct sock *osk)
+{
+	struct sock *sk;
+	struct nr_sock *nr, *onr;
+
+	if (osk->sk_type != SOCK_SEQPACKET)
+		return NULL;
+
+	sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot);
+	if (sk == NULL)
+		return NULL;
+
+	nr = nr_sk(sk);
+
+	sock_init_data(NULL, sk);
+
+	sk->sk_type     = osk->sk_type;
+	sk->sk_priority = osk->sk_priority;
+	sk->sk_protocol = osk->sk_protocol;
+	sk->sk_rcvbuf   = osk->sk_rcvbuf;
+	sk->sk_sndbuf   = osk->sk_sndbuf;
+	sk->sk_state    = TCP_ESTABLISHED;
+	sock_copy_flags(sk, osk);
+
+	skb_queue_head_init(&nr->ack_queue);
+	skb_queue_head_init(&nr->reseq_queue);
+	skb_queue_head_init(&nr->frag_queue);
+
+	nr_init_timers(sk);
+
+	onr = nr_sk(osk);
+
+	nr->t1      = onr->t1;
+	nr->t2      = onr->t2;
+	nr->n2      = onr->n2;
+	nr->t4      = onr->t4;
+	nr->idle    = onr->idle;
+	nr->window  = onr->window;
+
+	nr->device  = onr->device;
+	nr->bpqext  = onr->bpqext;
+
+	return sk;
+}
+
+static int nr_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct nr_sock *nr;
+
+	if (sk == NULL) return 0;
+
+	sock_hold(sk);
+	sock_orphan(sk);
+	lock_sock(sk);
+	nr = nr_sk(sk);
+
+	switch (nr->state) {
+	case NR_STATE_0:
+	case NR_STATE_1:
+	case NR_STATE_2:
+		nr_disconnect(sk, 0);
+		nr_destroy_socket(sk);
+		break;
+
+	case NR_STATE_3:
+		nr_clear_queues(sk);
+		nr->n2count = 0;
+		nr_write_internal(sk, NR_DISCREQ);
+		nr_start_t1timer(sk);
+		nr_stop_t2timer(sk);
+		nr_stop_t4timer(sk);
+		nr_stop_idletimer(sk);
+		nr->state    = NR_STATE_2;
+		sk->sk_state    = TCP_CLOSE;
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DESTROY);
+		break;
+
+	default:
+		break;
+	}
+
+	sock->sk   = NULL;
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct nr_sock *nr = nr_sk(sk);
+	struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
+	struct net_device *dev;
+	ax25_uid_assoc *user;
+	ax25_address *source;
+
+	lock_sock(sk);
+	if (!sock_flag(sk, SOCK_ZAPPED)) {
+		release_sock(sk);
+		return -EINVAL;
+	}
+	if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) {
+		release_sock(sk);
+		return -EINVAL;
+	}
+	if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) {
+		release_sock(sk);
+		return -EINVAL;
+	}
+	if (addr->fsa_ax25.sax25_family != AF_NETROM) {
+		release_sock(sk);
+		return -EINVAL;
+	}
+	if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) {
+		release_sock(sk);
+		return -EADDRNOTAVAIL;
+	}
+
+	/*
+	 * Only the super user can set an arbitrary user callsign.
+	 */
+	if (addr->fsa_ax25.sax25_ndigis == 1) {
+		if (!capable(CAP_NET_BIND_SERVICE)) {
+			dev_put(dev);
+			release_sock(sk);
+			return -EACCES;
+		}
+		nr->user_addr   = addr->fsa_digipeater[0];
+		nr->source_addr = addr->fsa_ax25.sax25_call;
+	} else {
+		source = &addr->fsa_ax25.sax25_call;
+
+		user = ax25_findbyuid(current_euid());
+		if (user) {
+			nr->user_addr   = user->call;
+			ax25_uid_put(user);
+		} else {
+			if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
+				release_sock(sk);
+				dev_put(dev);
+				return -EPERM;
+			}
+			nr->user_addr   = *source;
+		}
+
+		nr->source_addr = *source;
+	}
+
+	nr->device = dev;
+	nr_insert_socket(sk);
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	dev_put(dev);
+	release_sock(sk);
+
+	return 0;
+}
+
+/* Connect to a NET/ROM peer, autobinding first if the socket is unbound. */
+static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
+	int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct nr_sock *nr = nr_sk(sk);
+	struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr;
+	ax25_address *source = NULL;
+	ax25_uid_assoc *user;
+	struct net_device *dev;
+	int err = 0;
+
+	lock_sock(sk);
+	if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+		sock->state = SS_CONNECTED;
+		goto out_release;	/* Connect completed during a ERESTARTSYS event */
+	}
+
+	if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+		sock->state = SS_UNCONNECTED;
+		err = -ECONNREFUSED;
+		goto out_release;
+	}
+
+	if (sk->sk_state == TCP_ESTABLISHED) {
+		err = -EISCONN;	/* No reconnect on a seqpacket socket */
+		goto out_release;
+	}
+
+	sk->sk_state   = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) {
+		err = -EINVAL;
+		goto out_release;
+	}
+	if (addr->sax25_family != AF_NETROM) {
+		err = -EINVAL;
+		goto out_release;
+	}
+	if (sock_flag(sk, SOCK_ZAPPED)) {	/* Must bind first - autobinding in this may or may not work */
+		if ((dev = nr_dev_first()) == NULL) {
+			err = -ENETUNREACH;
+			goto out_release;
+		}
+		source = (ax25_address *)dev->dev_addr;
+
+		user = ax25_findbyuid(current_euid());
+		if (user) {
+			nr->user_addr   = user->call;
+			ax25_uid_put(user);
+		} else {
+			if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
+				dev_put(dev);
+				err = -EPERM;
+				goto out_release;
+			}
+			nr->user_addr   = *source;
+		}
+
+		nr->source_addr = *source;
+		nr->device      = dev;
+
+		dev_put(dev);
+		nr_insert_socket(sk);		/* Finish the bind */
+		sock_reset_flag(sk, SOCK_ZAPPED);	/* only once really bound */
+	}
+
+	nr->dest_addr = addr->sax25_call;
+
+	release_sock(sk);
+	circuit = nr_find_next_circuit();
+	lock_sock(sk);
+
+	nr->my_index = circuit / 256;
+	nr->my_id    = circuit % 256;
+
+	circuit++;
+
+	/* Move to connecting socket, start sending Connect Requests */
+	sock->state  = SS_CONNECTING;
+	sk->sk_state = TCP_SYN_SENT;
+
+	nr_establish_data_link(sk);
+
+	nr->state = NR_STATE_1;
+
+	nr_start_heartbeat(sk);
+
+	/* Now the loop */
+	if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
+		err = -EINPROGRESS;
+		goto out_release;
+	}
+
+	/*
+	 * A Connect Ack with Choke or timeout or failed routing will go to
+	 * closed.
+	 */
+	if (sk->sk_state == TCP_SYN_SENT) {
+		DEFINE_WAIT(wait);
+
+		for (;;) {
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_INTERRUPTIBLE);
+			if (sk->sk_state != TCP_SYN_SENT)
+				break;
+			if (!signal_pending(current)) {
+				release_sock(sk);
+				schedule();
+				lock_sock(sk);
+				continue;
+			}
+			err = -ERESTARTSYS;
+			break;
+		}
+		finish_wait(sk_sleep(sk), &wait);
+		if (err)
+			goto out_release;
+	}
+
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		sock->state = SS_UNCONNECTED;
+		err = sock_error(sk);	/* Always set at this point */
+		goto out_release;
+	}
+
+	sock->state = SS_CONNECTED;
+
+out_release:
+	release_sock(sk);
+
+	return err;
+}
+
+static int nr_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sk_buff *skb;
+	struct sock *newsk;
+	DEFINE_WAIT(wait);
+	struct sock *sk;
+	int err = 0;
+
+	if ((sk = sock->sk) == NULL)
+		return -EINVAL;
+
+	lock_sock(sk);
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EOPNOTSUPP;
+		goto out_release;
+	}
+
+	if (sk->sk_state != TCP_LISTEN) {
+		err = -EINVAL;
+		goto out_release;
+	}
+
+	/*
+	 *	The receive queue holds the queued skbs; each one's skb->sk
+	 *	is a new socket ready to be grafted onto newsock
+	 */
+	for (;;) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (skb)
+			break;
+
+		if (flags & O_NONBLOCK) {
+			err = -EWOULDBLOCK;
+			break;
+		}
+		if (!signal_pending(current)) {
+			release_sock(sk);
+			schedule();
+			lock_sock(sk);
+			continue;
+		}
+		err = -ERESTARTSYS;
+		break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (err)
+		goto out_release;
+
+	newsk = skb->sk;
+	sock_graft(newsk, newsock);
+
+	/* Now attach up the new socket */
+	kfree_skb(skb);
+	sk_acceptq_removed(sk);
+
+out_release:
+	release_sock(sk);
+
+	return err;
+}
+
+static int nr_getname(struct socket *sock, struct sockaddr *uaddr,
+	int *uaddr_len, int peer)
+{
+	struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr;
+	struct sock *sk = sock->sk;
+	struct nr_sock *nr = nr_sk(sk);
+
+	/* Zero the header first so padding bytes never leak to user space. */
+	memset(&sax->fsa_ax25, 0, sizeof(sax->fsa_ax25));
+	sax->fsa_ax25.sax25_family = AF_NETROM;
+	lock_sock(sk);
+	if (peer != 0) {	/* peer name: user callsign plus destination */
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			release_sock(sk);
+			return -ENOTCONN;
+		}
+		sax->fsa_ax25.sax25_ndigis = 1;
+		sax->fsa_ax25.sax25_call   = nr->user_addr;
+		memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater));
+		sax->fsa_digipeater[0]     = nr->dest_addr;
+		*uaddr_len = sizeof(struct full_sockaddr_ax25);
+	} else {		/* own name; sax25_ndigis stays 0 from the memset */
+		sax->fsa_ax25.sax25_call   = nr->source_addr;
+		*uaddr_len = sizeof(struct sockaddr_ax25);
+	}
+	release_sock(sk);
+
+	return 0;
+}
+
+/*
+ * Handle one received NET/ROM frame.
+ *
+ * An IP-over-NET/ROM frame is handed to nr_rx_ip().  A frame that matches
+ * an existing socket is handed to nr_process_rx_frame().  An unmatched
+ * NR_CONNREQ is offered to a listening socket: a new socket is created,
+ * answered with NR_CONNACK and the skb is queued on the listener.
+ *
+ * Returns non-zero when the skb has been consumed and 0 when it has not.
+ *
+ * NOTE(review): skb->len is not checked here before the header bytes
+ * (up to skb->data[20], and skb->data + 21 for the user callsign) are
+ * read - confirm that callers guarantee a long enough frame.
+ */
+int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
+{
+	struct sock *sk;
+	struct sock *make;
+	struct nr_sock *nr_make;
+	ax25_address *src, *dest, *user;
+	unsigned short circuit_index, circuit_id;
+	unsigned short peer_circuit_index, peer_circuit_id;
+	unsigned short frametype, flags, window, timeout;
+	int ret;
+
+	skb->sk = NULL;		/* Initially we don't know who it's for */
+
+	/*
+	 *	skb->data points to the netrom frame start
+	 */
+
+	src  = (ax25_address *)(skb->data + 0);
+	dest = (ax25_address *)(skb->data + 7);
+
+	/* Byte 19 carries the frame type in its low nibble, flags in the high one */
+	circuit_index      = skb->data[15];
+	circuit_id         = skb->data[16];
+	peer_circuit_index = skb->data[17];
+	peer_circuit_id    = skb->data[18];
+	frametype          = skb->data[19] & 0x0F;
+	flags              = skb->data[19] & 0xF0;
+
+	/*
+	 * Check for an incoming IP over NET/ROM frame.
+	 */
+	if (frametype == NR_PROTOEXT &&
+	    circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
+		skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+		skb_reset_transport_header(skb);
+
+		return nr_rx_ip(skb, dev);
+	}
+
+	/*
+	 * Find an existing socket connection, based on circuit ID, if it's
+	 * a Connect Request base it on their circuit ID.
+	 *
+	 * Circuit ID 0/0 is not valid but it could still be a "reset" for a
+	 * circuit that no longer exists at the other end ...
+	 */
+
+	sk = NULL;
+
+	if (circuit_index == 0 && circuit_id == 0) {
+		if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG)
+			sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src);
+	} else {
+		if (frametype == NR_CONNREQ)
+			sk = nr_find_peer(circuit_index, circuit_id, src);
+		else
+			sk = nr_find_socket(circuit_index, circuit_id);
+	}
+
+	if (sk != NULL) {
+		skb_reset_transport_header(skb);
+
+		/* A 22 byte CONNACK marks the peer as using the BPQ extension */
+		if (frametype == NR_CONNACK && skb->len == 22)
+			nr_sk(sk)->bpqext = 1;
+		else
+			nr_sk(sk)->bpqext = 0;
+
+		ret = nr_process_rx_frame(sk, skb);
+		/* presumably the lookup helpers return sk bh-locked - verify */
+		bh_unlock_sock(sk);
+		return ret;
+	}
+
+	/*
+	 * Now it should be a CONNREQ.
+	 */
+	if (frametype != NR_CONNREQ) {
+		/*
+		 * Here it would be nice to be able to send a reset but
+		 * NET/ROM doesn't have one.  We've tried to extend the protocol
+		 * by sending NR_CONNACK | NR_CHOKE_FLAGS replies but that
+		 * apparently kills BPQ boxes... :-(
+		 * So now we try to follow the established behaviour of
+		 * G8PZT's Xrouter which is sending packets with command type 7
+		 * as an extension of the protocol.
+		 */
+		if (sysctl_netrom_reset_circuit &&
+		    (frametype != NR_RESET || flags != 0))
+			nr_transmit_reset(skb, 1);
+
+		return 0;
+	}
+
+	sk = nr_find_listener(dest);
+
+	user = (ax25_address *)(skb->data + 21);
+
+	/* Refuse when nobody listens, the backlog is full or allocation fails */
+	if (sk == NULL || sk_acceptq_is_full(sk) ||
+	    (make = nr_make_new(sk)) == NULL) {
+		nr_transmit_refusal(skb, 0);
+		if (sk)
+			bh_unlock_sock(sk);
+		return 0;
+	}
+
+	window = skb->data[20];
+
+	skb->sk             = make;
+	make->sk_state	    = TCP_ESTABLISHED;
+
+	/* Fill in his circuit details */
+	nr_make = nr_sk(make);
+	nr_make->source_addr = *dest;
+	nr_make->dest_addr   = *src;
+	nr_make->user_addr   = *user;
+
+	nr_make->your_index  = circuit_index;
+	nr_make->your_id     = circuit_id;
+
+	/*
+	 * The listener lock is dropped around nr_find_next_circuit().
+	 * 'circuit' is not declared in this function; presumably it is a
+	 * file-scope circuit counter - confirm.
+	 */
+	bh_unlock_sock(sk);
+	circuit = nr_find_next_circuit();
+	bh_lock_sock(sk);
+
+	nr_make->my_index    = circuit / 256;
+	nr_make->my_id       = circuit % 256;
+
+	circuit++;
+
+	/* Window negotiation */
+	if (window < nr_make->window)
+		nr_make->window = window;
+
+	/* L4 timeout negotiation */
+	if (skb->len == 37) {
+		/* the timeout is stored low byte first in bytes 35/36 */
+		timeout = skb->data[36] * 256 + skb->data[35];
+		if (timeout * HZ < nr_make->t1)
+			nr_make->t1 = timeout * HZ;
+		nr_make->bpqext = 1;
+	} else {
+		nr_make->bpqext = 0;
+	}
+
+	nr_write_internal(make, NR_CONNACK);
+
+	nr_make->condition = 0x00;
+	nr_make->vs        = 0;
+	nr_make->va        = 0;
+	nr_make->vr        = 0;
+	nr_make->vl        = 0;
+	nr_make->state     = NR_STATE_3;
+	/* Queue the request skb (skb->sk == make) on the listener */
+	sk_acceptq_added(sk);
+	skb_queue_head(&sk->sk_receive_queue, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb->len);
+
+	bh_unlock_sock(sk);
+
+	nr_insert_socket(make);
+
+	nr_start_heartbeat(make);
+	nr_start_idletimer(make);
+
+	return 1;
+}
+
+/*
+ * Queue one user message for transmission on a NET/ROM socket.
+ *
+ * Validates the message flags, the socket state and the optional
+ * destination address, allocates an skb with room for the network and
+ * transport headers, fills in the transport header, copies the user data
+ * and passes the skb to nr_output().
+ *
+ * Returns len on success or a negative errno.
+ */
+static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
+		      struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct nr_sock *nr = nr_sk(sk);
+	struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name;
+	int err;
+	struct sockaddr_ax25 sax;
+	struct sk_buff *skb;
+	unsigned char *asmptr;
+	int size;
+
+	/* Only these flags are accepted */
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	lock_sock(sk);
+	if (sock_flag(sk, SOCK_ZAPPED)) {
+		err = -EADDRNOTAVAIL;
+		goto out;
+	}
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		send_sig(SIGPIPE, current, 0);
+		err = -EPIPE;
+		goto out;
+	}
+
+	if (nr->device == NULL) {
+		err = -ENETUNREACH;
+		goto out;
+	}
+
+	if (usax) {
+		/* An explicit destination must match the connected peer */
+		if (msg->msg_namelen < sizeof(sax)) {
+			err = -EINVAL;
+			goto out;
+		}
+		sax = *usax;
+		if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) {
+			err = -EISCONN;
+			goto out;
+		}
+		if (sax.sax25_family != AF_NETROM) {
+			err = -EINVAL;
+			goto out;
+		}
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			err = -ENOTCONN;
+			goto out;
+		}
+		sax.sax25_family = AF_NETROM;
+		sax.sax25_call   = nr->dest_addr;
+	}
+
+	/* Build a packet - the conventional user limit is 236 bytes. We can
+	   do ludicrously large NetROM frames but must not overflow */
+	if (len > 65536) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN;
+
+	/* On failure sock_alloc_send_skb() has already stored the errno in err */
+	if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL)
+		goto out;
+
+	/* Leave headroom for both headers; the payload starts right after it */
+	skb_reserve(skb, size - len);
+	skb_reset_transport_header(skb);
+
+	/*
+	 *	Push down the NET/ROM header
+	 */
+
+	asmptr = skb_push(skb, NR_TRANSPORT_LEN);
+
+	/* Build a NET/ROM Transport header */
+
+	*asmptr++ = nr->your_index;
+	*asmptr++ = nr->your_id;
+	*asmptr++ = 0;		/* To be filled in later */
+	*asmptr++ = 0;		/*      Ditto            */
+	*asmptr++ = NR_INFO;
+
+	/*
+	 *	Put the data on the end
+	 */
+	skb_put(skb, len);
+
+	/* User data follows immediately after the NET/ROM transport header */
+	if (memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len)) {
+		kfree_skb(skb);
+		err = -EFAULT;
+		goto out;
+	}
+
+	/* Checked again after the allocation and the user copy */
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		kfree_skb(skb);
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	nr_output(sk, skb);	/* Shove it onto the queue */
+
+	err = len;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Receive one queued NET/ROM datagram.
+ *
+ * Copies at most size bytes of the next datagram into msg->msg_iov and
+ * sets MSG_TRUNC in msg->msg_flags when the datagram was longer.  When
+ * msg->msg_name is supplied it is filled in with an AF_NETROM address
+ * taken from offset 7 of the skb.
+ *
+ * Returns the number of bytes copied, -ENOTCONN when the socket is not
+ * established, or the error reported by skb_recv_datagram() or
+ * skb_copy_datagram_iovec().
+ */
+static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
+		      struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name;
+	size_t copied;
+	struct sk_buff *skb;
+	int er;
+
+	/*
+	 * This works for seqpacket too. The receiver has ordered the queue for
+	 * us! We do one quick check first though
+	 */
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		release_sock(sk);
+		return -ENOTCONN;
+	}
+
+	/* Now we can treat all alike */
+	if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL) {
+		release_sock(sk);
+		return er;
+	}
+
+	skb_reset_transport_header(skb);
+	copied     = skb->len;
+
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Do not report success when the copy to user space failed */
+	er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (er < 0) {
+		skb_free_datagram(sk, skb);
+		release_sock(sk);
+		return er;
+	}
+
+	if (sax != NULL) {
+		/*
+		 * Clear the whole structure first: only the family and the
+		 * callsign are written below, and the remaining bytes
+		 * (sax25_ndigis and any padding) must not expose stale
+		 * kernel memory to the caller.
+		 */
+		memset(sax, 0, sizeof(*sax));
+		sax->sax25_family = AF_NETROM;
+		skb_copy_from_linear_data_offset(skb, 7, sax->sax25_call.ax25_call,
+			      AX25_ADDR_LEN);
+	}
+
+	msg->msg_namelen = sizeof(*sax);
+
+	skb_free_datagram(sk, skb);
+
+	release_sock(sk);
+	return copied;
+}
+
+
+/*
+ * ioctl handler for NET/ROM sockets.
+ *
+ * TIOCOUTQ reports the remaining send buffer space, TIOCINQ the length of
+ * the first queued receive skb, SIOCGSTAMP/SIOCGSTAMPNS the socket
+ * timestamp.  The interface address ioctls are rejected with -EINVAL and
+ * the routing ioctls are passed to nr_rt_ioctl() for CAP_NET_ADMIN
+ * callers.  Anything else returns -ENOIOCTLCMD.
+ */
+static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	void __user *argp = (void __user *)arg;
+	int ret;
+
+	switch (cmd) {
+	case TIOCOUTQ: {
+		long amount;
+
+		lock_sock(sk);
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		/* never report negative space */
+		if (amount < 0)
+			amount = 0;
+		release_sock(sk);
+		return put_user(amount, (int __user *)argp);
+	}
+
+	case TIOCINQ: {
+		struct sk_buff *skb;
+		long amount = 0L;
+
+		lock_sock(sk);
+		/* These two are safe on a single CPU system as only user tasks fiddle here */
+		if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+			amount = skb->len;
+		release_sock(sk);
+		return put_user(amount, (int __user *)argp);
+	}
+
+	case SIOCGSTAMP:
+		lock_sock(sk);
+		ret = sock_get_timestamp(sk, argp);
+		release_sock(sk);
+		return ret;
+
+	case SIOCGSTAMPNS:
+		lock_sock(sk);
+		ret = sock_get_timestampns(sk, argp);
+		release_sock(sk);
+		return ret;
+
+	/* Interface address ioctls are not supported on this socket type */
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+		return -EINVAL;
+
+	/* Routing table changes need CAP_NET_ADMIN */
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCNRDECOBS:
+		if (!capable(CAP_NET_ADMIN)) return -EPERM;
+		return nr_rt_ioctl(cmd, argp);
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	/* not reached: every case above returns */
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * seq_file start hook: takes nr_list_lock (released in nr_info_stop())
+ * and positions the iterator on nr_list.
+ */
+static void *nr_info_start(struct seq_file *seq, loff_t *pos)
+{
+	spin_lock_bh(&nr_list_lock);
+	return seq_hlist_start_head(&nr_list, *pos);
+}
+
+/* seq_file next hook: advances to the following entry of nr_list. */
+static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &nr_list, pos);
+}
+
+/* seq_file stop hook: drops the nr_list_lock taken in nr_info_start(). */
+static void nr_info_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_bh(&nr_list_lock);
+}
+
+/*
+ * seq_file show hook: prints the column header for SEQ_START_TOKEN and
+ * one line of connection state for every other entry.  Always returns 0.
+ */
+static int nr_info_show(struct seq_file *seq, void *v)
+{
+	struct sock *s = sk_entry(v);
+	struct net_device *dev;
+	struct nr_sock *nr;
+	const char *devname;
+	char buf[11];
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+"user_addr dest_node src_node  dev    my  your  st  vs  vr  va    t1     t2     t4      idle   n2  wnd Snd-Q Rcv-Q inode\n");
+
+	else {
+
+		/* s is only used on this branch, where v is a real entry */
+		bh_lock_sock(s);
+		nr = nr_sk(s);
+
+		if ((dev = nr->device) == NULL)
+			devname = "???";
+		else
+			devname = dev->name;
+
+		/* buf is reused for each callsign, hence three separate calls */
+		seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr));
+		seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr));
+		seq_printf(seq,
+"%-9s %-3s  %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n",
+			ax2asc(buf, &nr->source_addr),
+			devname,
+			nr->my_index,
+			nr->my_id,
+			nr->your_index,
+			nr->your_id,
+			nr->state,
+			nr->vs,
+			nr->vr,
+			nr->va,
+			ax25_display_timer(&nr->t1timer) / HZ,
+			nr->t1 / HZ,
+			ax25_display_timer(&nr->t2timer) / HZ,
+			nr->t2 / HZ,
+			ax25_display_timer(&nr->t4timer) / HZ,
+			nr->t4 / HZ,
+			ax25_display_timer(&nr->idletimer) / (60 * HZ),
+			nr->idle / (60 * HZ),
+			nr->n2count,
+			nr->n2,
+			nr->window,
+			sk_wmem_alloc_get(s),
+			sk_rmem_alloc_get(s),
+			s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L);
+
+		bh_unlock_sock(s);
+	}
+	return 0;
+}
+
+/* Iterator callbacks used by nr_info_open() for the "nr" proc file. */
+static const struct seq_operations nr_info_seqops = {
+	.start = nr_info_start,
+	.next = nr_info_next,
+	.stop = nr_info_stop,
+	.show = nr_info_show,
+};
+
+/* open hook: attaches the nr_info_seqops iterator to the file. */
+static int nr_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nr_info_seqops);
+}
+
+/* File operations registered for the "nr" proc entry in nr_proto_init(). */
+static const struct file_operations nr_info_fops = {
+	.owner = THIS_MODULE,
+	.open = nr_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+#endif	/* CONFIG_PROC_FS */
+
+/* Socket family descriptor passed to sock_register() in nr_proto_init(). */
+static const struct net_proto_family nr_family_ops = {
+	.family		=	PF_NETROM,
+	.create		=	nr_create,
+	.owner		=	THIS_MODULE,
+};
+
+/*
+ * Socket operations for PF_NETROM sockets.  socketpair, shutdown, mmap and
+ * sendpage use the generic sock_no_*() stubs.
+ */
+static const struct proto_ops nr_proto_ops = {
+	.family		=	PF_NETROM,
+	.owner		=	THIS_MODULE,
+	.release	=	nr_release,
+	.bind		=	nr_bind,
+	.connect	=	nr_connect,
+	.socketpair	=	sock_no_socketpair,
+	.accept		=	nr_accept,
+	.getname	=	nr_getname,
+	.poll		=	datagram_poll,
+	.ioctl		=	nr_ioctl,
+	.listen		=	nr_listen,
+	.shutdown	=	sock_no_shutdown,
+	.setsockopt	=	nr_setsockopt,
+	.getsockopt	=	nr_getsockopt,
+	.sendmsg	=	nr_sendmsg,
+	.recvmsg	=	nr_recvmsg,
+	.mmap		=	sock_no_mmap,
+	.sendpage	=	sock_no_sendpage,
+};
+
+/* Netdevice event hook, registered in nr_proto_init(). */
+static struct notifier_block nr_dev_notifier = {
+	.notifier_call	=	nr_device_event,
+};
+
+/* Array of nr_ndevs NET/ROM devices, allocated in nr_proto_init(). */
+static struct net_device **dev_nr;
+
+/* AX.25 protocol handler: AX25_P_NETROM frames go to nr_route_frame(). */
+static struct ax25_protocol nr_pid = {
+	.pid	= AX25_P_NETROM,
+	.func	= nr_route_frame
+};
+
+/* AX.25 link failure callback, registered in nr_proto_init(). */
+static struct ax25_linkfail nr_linkfail_notifier = {
+	.func	= nr_link_failed,
+};
+
+/*
+ * Module initialisation.
+ *
+ * Registers the protocol, creates and registers nr_ndevs "nr%d" network
+ * devices, registers the socket family, the netdevice notifier, the AX.25
+ * protocol and link failure hooks, the sysctls and the proc files.
+ *
+ * Returns 0 on success or a negative errno.  Every failure path undoes the
+ * registrations made so far, including the initial proto_register().
+ */
+static int __init nr_proto_init(void)
+{
+	int i;
+	int rc = proto_register(&nr_proto, 0);
+
+	if (rc != 0)
+		goto out;
+
+	/* Reject values whose array size would overflow (or are negative) */
+	if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) {
+		printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n");
+		rc = -EINVAL;
+		goto fail_proto;
+	}
+
+	dev_nr = kzalloc(nr_ndevs * sizeof(struct net_device *), GFP_KERNEL);
+	if (dev_nr == NULL) {
+		printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n");
+		rc = -ENOMEM;
+		goto fail_proto;
+	}
+
+	for (i = 0; i < nr_ndevs; i++) {
+		char name[IFNAMSIZ];
+		struct net_device *dev;
+
+		sprintf(name, "nr%d", i);
+		dev = alloc_netdev(0, name, nr_setup);
+		if (!dev) {
+			printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n");
+			rc = -ENOMEM;
+			goto fail;
+		}
+
+		dev->base_addr = i;
+		rc = register_netdev(dev);
+		if (rc) {
+			printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n");
+			/* not yet stored in dev_nr[], so free it here */
+			free_netdev(dev);
+			goto fail;
+		}
+		nr_set_lockdep_key(dev);
+		dev_nr[i] = dev;
+	}
+
+	rc = sock_register(&nr_family_ops);
+	if (rc) {
+		printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n");
+		goto fail;
+	}
+
+	register_netdevice_notifier(&nr_dev_notifier);
+
+	ax25_register_pid(&nr_pid);
+	ax25_linkfail_register(&nr_linkfail_notifier);
+
+#ifdef CONFIG_SYSCTL
+	nr_register_sysctl();
+#endif
+
+	nr_loopback_init();
+
+	proc_net_fops_create(&init_net, "nr", S_IRUGO, &nr_info_fops);
+	proc_net_fops_create(&init_net, "nr_neigh", S_IRUGO, &nr_neigh_fops);
+	proc_net_fops_create(&init_net, "nr_nodes", S_IRUGO, &nr_nodes_fops);
+out:
+	return rc;
+fail:
+	/* Tear down devices 0..i-1, the ones that were fully registered */
+	while (--i >= 0) {
+		unregister_netdev(dev_nr[i]);
+		free_netdev(dev_nr[i]);
+	}
+	kfree(dev_nr);
+fail_proto:
+	proto_unregister(&nr_proto);
+	goto out;
+}
+
+/* Module entry point, the nr_ndevs load-time parameter and module metadata. */
+module_init(nr_proto_init);
+
+module_param(nr_ndevs, int, 0);
+MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices");
+
+MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_NETROM);
+
+/*
+ * Module exit: removes the proc files, clears the loopback queue and the
+ * routing tables, unregisters the sysctls, AX.25 hooks, netdevice notifier
+ * and socket family, then destroys the devices and unregisters the
+ * protocol.
+ */
+static void __exit nr_exit(void)
+{
+	int i;
+
+	proc_net_remove(&init_net, "nr");
+	proc_net_remove(&init_net, "nr_neigh");
+	proc_net_remove(&init_net, "nr_nodes");
+	nr_loopback_clear();
+
+	nr_rt_free();
+
+#ifdef CONFIG_SYSCTL
+	nr_unregister_sysctl();
+#endif
+
+	ax25_linkfail_release(&nr_linkfail_notifier);
+	ax25_protocol_release(AX25_P_NETROM);
+
+	unregister_netdevice_notifier(&nr_dev_notifier);
+
+	sock_unregister(PF_NETROM);
+
+	for (i = 0; i < nr_ndevs; i++) {
+		struct net_device *dev = dev_nr[i];
+		/* skip slots that hold no device */
+		if (dev) {
+			unregister_netdev(dev);
+			free_netdev(dev);
+		}
+	}
+
+	kfree(dev_nr);
+	proto_unregister(&nr_proto);
+}
+module_exit(nr_exit);
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
new file mode 100644
index 00000000..64e6dde9
--- /dev/null
+++ b/net/netrom/nr_dev.c
@@ -0,0 +1,213 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>	/* For the statistics structure. */
+#include <linux/slab.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/arp.h>
+
+#include <net/ax25.h>
+#include <net/netrom.h>
+
+/*
+ *	Hand an IP-over-NET/ROM frame to the network stack, but only while
+ *	the netrom device is running.  Returns 1 when the skb was passed to
+ *	netif_rx() and 0 when it was counted as dropped.
+ */
+
+int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *counters = &dev->stats;
+
+	if (netif_running(dev)) {
+		counters->rx_packets++;
+		counters->rx_bytes += skb->len;
+
+		skb->protocol = htons(ETH_P_IP);
+
+		/* Make the frame appear to have arrived on this device */
+		skb->dev      = dev;
+		skb->mac_header = skb->network_header;
+		skb_reset_network_header(skb);
+		skb->pkt_type = PACKET_HOST;
+
+		netif_rx(skb);
+
+		return 1;
+	}
+
+	counters->rx_dropped++;
+	return 0;
+}
+
+#ifdef CONFIG_INET
+
+/*
+ * Header rebuild hook.  Returns 1 when arp_find() reports a non-zero
+ * result for the destination callsign; otherwise normalises the flag bits
+ * of both callsigns in the header and returns 0.
+ */
+static int nr_rebuild_header(struct sk_buff *skb)
+{
+	unsigned char *src_call = skb->data;
+	unsigned char *dst_call;
+
+	if (arp_find(src_call + 7, skb))
+		return 1;
+
+	/* source callsign: command and end bits clear */
+	src_call[6] &= ~(AX25_CBIT | AX25_EBIT);
+	src_call[6] |= AX25_SSSID_SPARE;
+
+	/* destination callsign: command bit clear, end bit set */
+	dst_call = src_call + AX25_ADDR_LEN;
+	dst_call[6] &= ~AX25_CBIT;
+	dst_call[6] |= AX25_EBIT | AX25_SSSID_SPARE;
+
+	return 0;
+}
+
+#else
+
+/* Without CONFIG_INET the header is never rebuilt. */
+static int nr_rebuild_header(struct sk_buff *skb)
+{
+	return 1;
+}
+
+#endif
+
+/*
+ * Header create hook: pushes the network and transport headers of an
+ * IP-over-NET/ROM frame onto the skb.  The source callsign defaults to the
+ * device address when saddr is NULL.  Returns 37 when a destination
+ * address was supplied and -37 when it was not.
+ */
+static int nr_header(struct sk_buff *skb, struct net_device *dev,
+		     unsigned short type,
+		     const void *daddr, const void *saddr, unsigned len)
+{
+	unsigned char *src_call = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+	unsigned char *dst_call = src_call + AX25_ADDR_LEN;
+	unsigned char *rest     = dst_call + AX25_ADDR_LEN;
+	const void *from = saddr;
+
+	if (from == NULL)
+		from = dev->dev_addr;
+
+	/* source callsign: command and end bits clear */
+	memcpy(src_call, from, dev->addr_len);
+	src_call[6] &= ~(AX25_CBIT | AX25_EBIT);
+	src_call[6] |= AX25_SSSID_SPARE;
+
+	/* destination callsign: the flag bits are set even without daddr */
+	if (daddr != NULL)
+		memcpy(dst_call, daddr, dev->addr_len);
+	dst_call[6] &= ~AX25_CBIT;
+	dst_call[6] |= AX25_EBIT | AX25_SSSID_SPARE;
+
+	/* TTL followed by the five transport header bytes */
+	rest[0] = sysctl_netrom_network_ttl_initialiser;
+	rest[1] = NR_PROTO_IP;
+	rest[2] = NR_PROTO_IP;
+	rest[3] = 0;
+	rest[4] = 0;
+	rest[5] = NR_PROTOEXT;
+
+	return (daddr != NULL) ? 37 : -37;
+}
+
+/*
+ * Change the device callsign.  Nothing happens when the address is
+ * unchanged.  On a device that is up, the new callsign is registered as an
+ * AX.25 listener before the old one is released; a registration error is
+ * returned and leaves the old address in place.
+ */
+static int __must_check nr_set_mac_address(struct net_device *dev, void *addr)
+{
+	struct sockaddr *sa = addr;
+
+	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len) == 0)
+		return 0;
+
+	if (dev->flags & IFF_UP) {
+		int rc = ax25_listen_register((ax25_address *)sa->sa_data, NULL);
+
+		if (rc)
+			return rc;
+
+		ax25_listen_release((ax25_address *)dev->dev_addr, NULL);
+	}
+
+	memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+
+	return 0;
+}
+
+/*
+ * Device open hook: registers the device callsign as an AX.25 listener
+ * and, if that succeeds, starts the transmit queue.  Returns 0 or the
+ * registration error.
+ */
+static int nr_open(struct net_device *dev)
+{
+	int rc = ax25_listen_register((ax25_address *)dev->dev_addr, NULL);
+
+	if (rc == 0)
+		netif_start_queue(dev);
+
+	return rc;
+}
+
+/*
+ * Device stop hook: releases the AX.25 listener for the device callsign
+ * and stops the transmit queue.  Always returns 0.
+ */
+static int nr_close(struct net_device *dev)
+{
+	ax25_listen_release((ax25_address *)dev->dev_addr, NULL);
+	netif_stop_queue(dev);
+	return 0;
+}
+
+/*
+ * Transmit hook: passes the frame to nr_route_frame().  A frame that
+ * cannot be routed is freed and counted as a transmit error.  Always
+ * returns NETDEV_TX_OK.
+ */
+static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *counters = &dev->stats;
+	/* read the length first: the skb must not be touched once routed */
+	unsigned int nbytes = skb->len;
+
+	if (nr_route_frame(skb, NULL)) {
+		counters->tx_packets++;
+		counters->tx_bytes += nbytes;
+	} else {
+		kfree_skb(skb);
+		counters->tx_errors++;
+	}
+
+	return NETDEV_TX_OK;
+}
+
+/* Link-layer header hooks installed on the device by nr_setup(). */
+static const struct header_ops nr_header_ops = {
+	.create	= nr_header,
+	.rebuild= nr_rebuild_header,
+};
+
+/* Device operations installed on the device by nr_setup(). */
+static const struct net_device_ops nr_netdev_ops = {
+	.ndo_open		= nr_open,
+	.ndo_stop		= nr_close,
+	.ndo_start_xmit		= nr_xmit,
+	.ndo_set_mac_address    = nr_set_mac_address,
+};
+
+/*
+ * Initialise a freshly allocated NET/ROM net_device: MTU, operation
+ * tables, header and address lengths, ARP hardware type and flags.
+ */
+void nr_setup(struct net_device *dev)
+{
+	dev->mtu		= NR_MAX_PACKET_SIZE;
+	dev->netdev_ops		= &nr_netdev_ops;
+	dev->header_ops		= &nr_header_ops;
+	/* room for the headers pushed by nr_header() */
+	dev->hard_header_len	= NR_NETWORK_LEN + NR_TRANSPORT_LEN;
+	dev->addr_len		= AX25_ADDR_LEN;
+	dev->type		= ARPHRD_NETROM;
+
+	/* New-style flags. */
+	dev->flags		= IFF_NOARP;
+}
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
new file mode 100644
index 00000000..6d4ef6d6
--- /dev/null
+++ b/net/netrom/nr_in.c
@@ -0,0 +1,306 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+/*
+ * Deliver one in-sequence INFO frame to the socket.
+ *
+ * The network and transport headers are stripped.  While 'more' is set the
+ * frame is parked on nr->frag_queue.  The final fragment triggers
+ * reassembly of all parked fragments into a new skb, which is then queued
+ * on the socket; an unfragmented frame is queued directly.
+ *
+ * Returns 0 on success (or when the fragment was parked), 1 when the
+ * reassembly buffer could not be allocated, otherwise the result of
+ * sock_queue_rcv_skb().
+ *
+ * NOTE(review): on the alloc_skb() failure path skb has already been
+ * added to nr->frag_queue and its headers pulled, yet 1 is returned; the
+ * caller in nr_state3_machine() then re-queues the same skb.  Likewise,
+ * when sock_queue_rcv_skb() fails after reassembly the original fragments
+ * have already been freed.  Both paths look unsafe - confirm and fix.
+ */
+static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
+{
+	struct sk_buff *skbo, *skbn = skb;
+	struct nr_sock *nr = nr_sk(sk);
+
+	skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+
+	nr_start_idletimer(sk);
+
+	if (more) {
+		/* Not the last fragment: park it and account for its length */
+		nr->fraglen += skb->len;
+		skb_queue_tail(&nr->frag_queue, skb);
+		return 0;
+	}
+
+	if (!more && nr->fraglen > 0) {	/* End of fragment */
+		nr->fraglen += skb->len;
+		skb_queue_tail(&nr->frag_queue, skb);
+
+		if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL)
+			return 1;
+
+		skb_reset_transport_header(skbn);
+
+		/* Concatenate the fragments in queue order, freeing each one */
+		while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) {
+			skb_copy_from_linear_data(skbo,
+						  skb_put(skbn, skbo->len),
+						  skbo->len);
+			kfree_skb(skbo);
+		}
+
+		nr->fraglen = 0;
+	}
+
+	return sock_queue_rcv_skb(sk, skbn);
+}
+
+/*
+ * State machine for state 1, Awaiting Connection State.
+ * The handling of the timer(s) is in file nr_timer.c.
+ * Handling of state 0 and connection release is in netrom.c.
+ *
+ * frametype is the full byte 19 of the frame, flags included.
+ * Always returns 0: the skb is never kept here.
+ */
+static int nr_state1_machine(struct sock *sk, struct sk_buff *skb,
+	int frametype)
+{
+	switch (frametype) {
+	case NR_CONNACK: {
+		struct nr_sock *nr = nr_sk(sk);
+
+		/* Connection accepted: record the peer circuit and window */
+		nr_stop_t1timer(sk);
+		nr_start_idletimer(sk);
+		nr->your_index = skb->data[17];
+		nr->your_id    = skb->data[18];
+		nr->vs	       = 0;
+		nr->va	       = 0;
+		nr->vr	       = 0;
+		nr->vl	       = 0;
+		nr->state      = NR_STATE_3;
+		nr->n2count    = 0;
+		nr->window     = skb->data[20];
+		sk->sk_state   = TCP_ESTABLISHED;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+		break;
+	}
+
+	/* A CONNACK with the choke flag is a refusal */
+	case NR_CONNACK | NR_CHOKE_FLAG:
+		nr_disconnect(sk, ECONNREFUSED);
+		break;
+
+	case NR_RESET:
+		if (sysctl_netrom_reset_circuit)
+			nr_disconnect(sk, ECONNRESET);
+		break;
+
+	default:
+		break;
+	}
+	return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Release State.
+ * The handling of the timer(s) is in file nr_timer.c
+ * Handling of state 0 and connection release is in netrom.c.
+ *
+ * Always returns 0: the skb is never kept here.
+ */
+static int nr_state2_machine(struct sock *sk, struct sk_buff *skb,
+	int frametype)
+{
+	switch (frametype) {
+	case NR_CONNACK | NR_CHOKE_FLAG:
+		nr_disconnect(sk, ECONNRESET);
+		break;
+
+	case NR_DISCREQ:
+		nr_write_internal(sk, NR_DISCACK);
+		/* fall through - acknowledge, then disconnect as for DISCACK */
+
+	case NR_DISCACK:
+		nr_disconnect(sk, 0);
+		break;
+
+	case NR_RESET:
+		if (sysctl_netrom_reset_circuit)
+			nr_disconnect(sk, ECONNRESET);
+		break;
+
+	default:
+		break;
+	}
+	return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file nr_timer.c
+ * Handling of state 0 and connection release is in netrom.c.
+ *
+ * frametype is the full byte 19 of the frame, flags included.
+ * Returns 1 when the skb has been queued (INFO frames) and 0 otherwise.
+ */
+static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+	struct sk_buff_head temp_queue;
+	struct sk_buff *skbn;
+	unsigned short save_vr;
+	unsigned short nr, ns;
+	int queued = 0;
+
+	/* presumably the peer's receive and send sequence numbers - confirm */
+	nr = skb->data[18];
+	ns = skb->data[17];
+
+	switch (frametype) {
+	case NR_CONNREQ:
+		nr_write_internal(sk, NR_CONNACK);
+		break;
+
+	case NR_DISCREQ:
+		nr_write_internal(sk, NR_DISCACK);
+		nr_disconnect(sk, 0);
+		break;
+
+	case NR_CONNACK | NR_CHOKE_FLAG:
+	case NR_DISCACK:
+		nr_disconnect(sk, ECONNRESET);
+		break;
+
+	case NR_INFOACK:
+	case NR_INFOACK | NR_CHOKE_FLAG:
+	case NR_INFOACK | NR_NAK_FLAG:
+	case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG:
+		/* The choke flag tracks whether the peer can accept more data */
+		if (frametype & NR_CHOKE_FLAG) {
+			nrom->condition |= NR_COND_PEER_RX_BUSY;
+			nr_start_t4timer(sk);
+		} else {
+			nrom->condition &= ~NR_COND_PEER_RX_BUSY;
+			nr_stop_t4timer(sk);
+		}
+		if (!nr_validate_nr(sk, nr)) {
+			break;
+		}
+		if (frametype & NR_NAK_FLAG) {
+			nr_frames_acked(sk, nr);
+			nr_send_nak_frame(sk);
+		} else {
+			if (nrom->condition & NR_COND_PEER_RX_BUSY) {
+				nr_frames_acked(sk, nr);
+			} else {
+				nr_check_iframes_acked(sk, nr);
+			}
+		}
+		break;
+
+	case NR_INFO:
+	case NR_INFO | NR_NAK_FLAG:
+	case NR_INFO | NR_CHOKE_FLAG:
+	case NR_INFO | NR_MORE_FLAG:
+	case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG:
+	case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG:
+	case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG:
+	case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG:
+		if (frametype & NR_CHOKE_FLAG) {
+			nrom->condition |= NR_COND_PEER_RX_BUSY;
+			nr_start_t4timer(sk);
+		} else {
+			nrom->condition &= ~NR_COND_PEER_RX_BUSY;
+			nr_stop_t4timer(sk);
+		}
+		/* Unlike INFOACK, an invalid nr does not stop the data being queued */
+		if (nr_validate_nr(sk, nr)) {
+			if (frametype & NR_NAK_FLAG) {
+				nr_frames_acked(sk, nr);
+				nr_send_nak_frame(sk);
+			} else {
+				if (nrom->condition & NR_COND_PEER_RX_BUSY) {
+					nr_frames_acked(sk, nr);
+				} else {
+					nr_check_iframes_acked(sk, nr);
+				}
+			}
+		}
+		queued = 1;
+		skb_queue_head(&nrom->reseq_queue, skb);
+		if (nrom->condition & NR_COND_OWN_RX_BUSY)
+			break;
+		skb_queue_head_init(&temp_queue);
+		/*
+		 * Keep sweeping the resequencing queue until a pass delivers
+		 * nothing: frames matching vr are delivered, frames still
+		 * inside the receive window are kept, the rest are dropped.
+		 *
+		 * NOTE(review): the MORE flag passed to nr_queue_rx_frame()
+		 * comes from the frame that just arrived, not from the
+		 * dequeued skbn - confirm this is intended.
+		 */
+		do {
+			save_vr = nrom->vr;
+			while ((skbn = skb_dequeue(&nrom->reseq_queue)) != NULL) {
+				ns = skbn->data[17];
+				if (ns == nrom->vr) {
+					if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) {
+						nrom->vr = (nrom->vr + 1) % NR_MODULUS;
+					} else {
+						nrom->condition |= NR_COND_OWN_RX_BUSY;
+						skb_queue_tail(&temp_queue, skbn);
+					}
+				} else if (nr_in_rx_window(sk, ns)) {
+					skb_queue_tail(&temp_queue, skbn);
+				} else {
+					kfree_skb(skbn);
+				}
+			}
+			while ((skbn = skb_dequeue(&temp_queue)) != NULL) {
+				skb_queue_tail(&nrom->reseq_queue, skbn);
+			}
+		} while (save_vr != nrom->vr);
+		/*
+		 * Window is full, ack it immediately.
+		 */
+		if (((nrom->vl + nrom->window) % NR_MODULUS) == nrom->vr) {
+			nr_enquiry_response(sk);
+		} else {
+			/* otherwise schedule a delayed ack via the T2 timer */
+			if (!(nrom->condition & NR_COND_ACK_PENDING)) {
+				nrom->condition |= NR_COND_ACK_PENDING;
+				nr_start_t2timer(sk);
+			}
+		}
+		break;
+
+	case NR_RESET:
+		if (sysctl_netrom_reset_circuit)
+			nr_disconnect(sk, ECONNRESET);
+		break;
+
+	default:
+		break;
+	}
+	return queued;
+}
+
+/*
+ * Higher level upcall for a LAPB frame - called with sk locked.
+ *
+ * Dispatches the frame to the state machine matching the socket's current
+ * state and then calls nr_kick().  Returns the state machine's result, or
+ * 0 when the socket is in state 0 or in a state without a machine.
+ */
+int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb)
+{
+	struct nr_sock *nr = nr_sk(sk);
+	int frametype;
+	int kept;
+
+	if (nr->state == NR_STATE_0)
+		return 0;
+
+	frametype = skb->data[19];
+
+	switch (nr->state) {
+	case NR_STATE_1:
+		kept = nr_state1_machine(sk, skb, frametype);
+		break;
+	case NR_STATE_2:
+		kept = nr_state2_machine(sk, skb, frametype);
+		break;
+	case NR_STATE_3:
+		kept = nr_state3_machine(sk, skb, frametype);
+		break;
+	default:
+		kept = 0;
+		break;
+	}
+
+	nr_kick(sk);
+
+	return kept;
+}
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
new file mode 100644
index 00000000..94d4e922
--- /dev/null
+++ b/net/netrom/nr_loopback.c
@@ -0,0 +1,77 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <net/ax25.h>
+#include <linux/skbuff.h>
+#include <net/netrom.h>
+#include <linux/init.h>
+
+static void nr_loopback_timer(unsigned long);
+
+/* Frames waiting for loopback delivery, drained by nr_loopback_timer(). */
+static struct sk_buff_head loopback_queue;
+static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0);
+
+/* Initialise the loopback queue; called from module initialisation. */
+void __init nr_loopback_init(void)
+{
+	skb_queue_head_init(&loopback_queue);
+}
+
+/* Non-zero while the loopback timer is pending. */
+static inline int nr_loopback_running(void)
+{
+	return timer_pending(&loopback_timer);
+}
+
+/*
+ * Queue a copy of skb for loopback delivery and arm the loopback timer if
+ * it is not already pending.  The original skb is always freed and 1 is
+ * always returned, even when the copy could not be allocated.
+ */
+int nr_loopback_queue(struct sk_buff *skb)
+{
+	struct sk_buff *copy = alloc_skb(skb->len, GFP_ATOMIC);
+
+	if (copy != NULL) {
+		unsigned char *dst = skb_put(copy, skb->len);
+
+		skb_copy_from_linear_data(skb, dst, skb->len);
+		skb_reset_transport_header(copy);
+
+		skb_queue_tail(&loopback_queue, copy);
+
+		if (!nr_loopback_running())
+			mod_timer(&loopback_timer, jiffies + 10);
+	}
+
+	kfree_skb(skb);
+	return 1;
+}
+
+/*
+ * Loopback timer handler: delivers one queued frame to nr_rx_frame() on
+ * the device matching the destination callsign at offset 7, and re-arms
+ * the timer while frames remain queued.
+ */
+static void nr_loopback_timer(unsigned long param)
+{
+	struct sk_buff *skb = skb_dequeue(&loopback_queue);
+	struct net_device *dev;
+
+	if (skb == NULL)
+		return;
+
+	dev = nr_dev_get((ax25_address *)(skb->data + 7));
+
+	if (dev == NULL) {
+		kfree_skb(skb);
+	} else {
+		/* a zero return means the frame was not consumed */
+		if (nr_rx_frame(skb, dev) == 0)
+			kfree_skb(skb);
+		dev_put(dev);
+	}
+
+	if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running())
+		mod_timer(&loopback_timer, jiffies + 10);
+}
+
+/* Stop the loopback timer and free any frames still queued. */
+void __exit nr_loopback_clear(void)
+{
+	del_timer_sync(&loopback_timer);
+	skb_queue_purge(&loopback_queue);
+}
diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c
new file mode 100644
index 00000000..607fddb4
--- /dev/null
+++ b/net/netrom/nr_out.c
@@ -0,0 +1,274 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+/*
+ *	This is where all NET/ROM frames pass, except for IP-over-NET/ROM which
+ *	cannot be fragmented in this manner.
+ *
+ *	Takes ownership of skb.  A frame whose payload exceeds
+ *	NR_MAX_PACKET_SIZE is split into fragments that each carry a copy of
+ *	the transport header, with NR_MORE_FLAG set on all but the last; the
+ *	fragments (or the original skb) are appended to the socket write queue
+ *	and nr_kick() is called.
+ */
+void nr_output(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *skbn;
+	unsigned char transport[NR_TRANSPORT_LEN];
+	int err, frontlen, len;
+
+	if (skb->len - NR_TRANSPORT_LEN > NR_MAX_PACKET_SIZE) {
+		/* Save a copy of the Transport Header */
+		skb_copy_from_linear_data(skb, transport, NR_TRANSPORT_LEN);
+		skb_pull(skb, NR_TRANSPORT_LEN);
+
+		frontlen = skb_headroom(skb);
+
+		while (skb->len > 0) {
+			if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL) {
+				/*
+				 * The skb belongs to this function and is
+				 * not referenced by any queue, so it must be
+				 * freed here or it is leaked.
+				 */
+				kfree_skb(skb);
+				return;
+			}
+
+			skb_reserve(skbn, frontlen);
+
+			len = (NR_MAX_PACKET_SIZE > skb->len) ? skb->len : NR_MAX_PACKET_SIZE;
+
+			/* Copy the user data */
+			skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+			skb_pull(skb, len);
+
+			/* Duplicate the Transport Header */
+			skb_push(skbn, NR_TRANSPORT_LEN);
+			skb_copy_to_linear_data(skbn, transport,
+						NR_TRANSPORT_LEN);
+			/* More data follows unless this was the last fragment */
+			if (skb->len > 0)
+				skbn->data[4] |= NR_MORE_FLAG;
+
+			skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */
+		}
+
+		kfree_skb(skb);
+	} else {
+		skb_queue_tail(&sk->sk_write_queue, skb);		/* Throw it on the queue */
+	}
+
+	nr_kick(sk);
+}
+
+/*
+ *	Complete the control part of an information frame and transmit it.
+ *	The send and receive sequence numbers are written into the transport
+ *	header and the choke flag is set while our receiver is busy.
+ *	A NULL skb is ignored.
+ */
+static void nr_send_iframe(struct sock *sk, struct sk_buff *skb)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+	unsigned char *hdr;
+
+	if (skb == NULL)
+		return;
+
+	hdr = skb->data;
+	hdr[2] = nrom->vs;
+	hdr[3] = nrom->vr;
+
+	if (nrom->condition & NR_COND_OWN_RX_BUSY)
+		hdr[4] |= NR_CHOKE_FLAG;
+
+	nr_start_idletimer(sk);
+
+	nr_transmit_buffer(sk, skb);
+}
+
+/*
+ * Retransmit the oldest unacknowledged frame in response to a NAK.
+ * A clone of the head of the ack queue is sent with sequence number va;
+ * nothing happens when the queue is empty or the clone fails.
+ */
+void nr_send_nak_frame(struct sock *sk)
+{
+	struct sk_buff *skb, *skbn;
+	struct nr_sock *nr = nr_sk(sk);
+
+	if ((skb = skb_peek(&nr->ack_queue)) == NULL)
+		return;
+
+	if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL)
+		return;
+
+	skbn->data[2] = nr->va;
+	skbn->data[3] = nr->vr;
+
+	if (nr->condition & NR_COND_OWN_RX_BUSY)
+		skbn->data[4] |= NR_CHOKE_FLAG;
+
+	nr_transmit_buffer(sk, skbn);
+
+	/* the frame carried vr, so no separate ack is pending any more */
+	nr->condition &= ~NR_COND_ACK_PENDING;
+	nr->vl         = nr->vr;
+
+	nr_stop_t1timer(sk);
+}
+
+/*
+ * Transmit as many queued frames as the send window allows.
+ *
+ * Does nothing unless the socket is in state 3, the peer is not busy and
+ * the write queue is non-empty.  Each frame is cloned for transmission and
+ * the original is moved to the ack queue.
+ */
+void nr_kick(struct sock *sk)
+{
+	struct nr_sock *nr = nr_sk(sk);
+	struct sk_buff *skb, *skbn;
+	unsigned short start, end;
+
+	if (nr->state != NR_STATE_3)
+		return;
+
+	if (nr->condition & NR_COND_PEER_RX_BUSY)
+		return;
+
+	if (!skb_peek(&sk->sk_write_queue))
+		return;
+
+	/* With nothing awaiting an ack, sending restarts from va */
+	start = (skb_peek(&nr->ack_queue) == NULL) ? nr->va : nr->vs;
+	end   = (nr->va + nr->window) % NR_MODULUS;
+
+	/* window already full */
+	if (start == end)
+		return;
+
+	nr->vs = start;
+
+	/*
+	 * Transmit data until either we're out of data to send or
+	 * the window is full.
+	 */
+
+	/*
+	 * Dequeue the frame and copy it.
+	 */
+	skb = skb_dequeue(&sk->sk_write_queue);
+
+	do {
+		/* On clone failure put the frame back and stop for now */
+		if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+			skb_queue_head(&sk->sk_write_queue, skb);
+			break;
+		}
+
+		skb_set_owner_w(skbn, sk);
+
+		/*
+		 * Transmit the frame copy.
+		 */
+		nr_send_iframe(sk, skbn);
+
+		nr->vs = (nr->vs + 1) % NR_MODULUS;
+
+		/*
+		 * Requeue the original data frame.
+		 */
+		skb_queue_tail(&nr->ack_queue, skb);
+
+	} while (nr->vs != end &&
+		 (skb = skb_dequeue(&sk->sk_write_queue)) != NULL);
+
+	nr->vl         = nr->vr;
+	nr->condition &= ~NR_COND_ACK_PENDING;
+
+	if (!nr_t1timer_running(sk))
+		nr_start_t1timer(sk);
+}
+
+/*
+ * Prepend the NET/ROM network header (source callsign, destination
+ * callsign, TTL) to skb and pass it to nr_route_frame().  When the frame
+ * cannot be routed it is freed and the socket is disconnected with
+ * ENETUNREACH.
+ */
+void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+	unsigned char *src_call;
+	unsigned char *dst_call;
+
+	/*
+	 *	Add the protocol byte and network header.
+	 */
+	src_call = skb_push(skb, NR_NETWORK_LEN);
+	dst_call = src_call + AX25_ADDR_LEN;
+
+	/* source callsign: command and end bits clear */
+	memcpy(src_call, &nrom->source_addr, AX25_ADDR_LEN);
+	src_call[6] &= ~(AX25_CBIT | AX25_EBIT);
+	src_call[6] |= AX25_SSSID_SPARE;
+
+	/* destination callsign: command bit clear, end bit set */
+	memcpy(dst_call, &nrom->dest_addr, AX25_ADDR_LEN);
+	dst_call[6] &= ~AX25_CBIT;
+	dst_call[6] |= AX25_EBIT | AX25_SSSID_SPARE;
+
+	dst_call[AX25_ADDR_LEN] = sysctl_netrom_network_ttl_initialiser;
+
+	if (nr_route_frame(skb, NULL))
+		return;
+
+	kfree_skb(skb);
+	nr_disconnect(sk, ENETUNREACH);
+}
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+/*
+ * Start connection establishment: reset the condition flags and retry
+ * counter, send a connect request, stop the T2, T4 and idle timers and
+ * start T1.
+ */
+void nr_establish_data_link(struct sock *sk)
+{
+	struct nr_sock *nr = nr_sk(sk);
+
+	nr->condition = 0x00;
+	nr->n2count   = 0;
+
+	nr_write_internal(sk, NR_CONNREQ);
+
+	nr_stop_t2timer(sk);
+	nr_stop_t4timer(sk);
+	nr_stop_idletimer(sk);
+	nr_start_t1timer(sk);
+}
+
+/*
+ * Send an INFOACK.  The choke flag is set while our receiver is busy;
+ * otherwise the NAK flag is set when frames are waiting in the
+ * resequencing queue.  Never send a NAK when we are CHOKEd.
+ */
+void nr_enquiry_response(struct sock *sk)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+	int reply = NR_INFOACK;
+
+	if (nrom->condition & NR_COND_OWN_RX_BUSY)
+		reply |= NR_CHOKE_FLAG;
+	else if (skb_peek(&nrom->reseq_queue) != NULL)
+		reply |= NR_NAK_FLAG;
+
+	nr_write_internal(sk, reply);
+
+	/* everything up to vr has now been acknowledged */
+	nrom->vl         = nrom->vr;
+	nrom->condition &= ~NR_COND_ACK_PENDING;
+}
+
+/*
+ * Process an acknowledgement number.  When nr equals vs, the frames are
+ * acked, T1 is stopped and the retry counter is reset.  When nr differs
+ * from both vs and va, the frames are acked and T1 is restarted.  When nr
+ * equals va only, nothing changes.
+ */
+void nr_check_iframes_acked(struct sock *sk, unsigned short nr)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+
+	if (nrom->vs == nr) {
+		nr_frames_acked(sk, nr);
+		nr_stop_t1timer(sk);
+		nrom->n2count = 0;
+		return;
+	}
+
+	if (nrom->va == nr)
+		return;
+
+	nr_frames_acked(sk, nr);
+	nr_start_t1timer(sk);
+}
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
new file mode 100644
index 00000000..44059d0c
--- /dev/null
+++ b/net/netrom/nr_route.c
@@ -0,0 +1,1027 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/arp.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/netfilter.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <net/netrom.h>
+#include <linux/seq_file.h>
+
+/* Next value assigned to nr_neigh->number when a neighbour is created. */
+static unsigned int nr_neigh_no = 1;
+
+/* List of known nodes; walked and modified under nr_node_list_lock. */
+static HLIST_HEAD(nr_node_list);
+static DEFINE_SPINLOCK(nr_node_list_lock);
+/* List of known neighbours; walked and modified under nr_neigh_list_lock. */
+static HLIST_HEAD(nr_neigh_list);
+static DEFINE_SPINLOCK(nr_neigh_list_lock);
+
+/*
+ *	Look up the node with the given callsign. On a match a reference is
+ *	taken on the node before the list lock is dropped; the caller must
+ *	release it with nr_node_put(). Returns NULL when no node matches.
+ */
+static struct nr_node *nr_node_get(ax25_address *callsign)
+{
+	struct nr_node *cur;
+	struct nr_node *match = NULL;
+	struct hlist_node *pos;
+
+	spin_lock_bh(&nr_node_list_lock);
+	nr_node_for_each(cur, pos, &nr_node_list) {
+		if (ax25cmp(callsign, &cur->callsign) != 0)
+			continue;
+
+		nr_node_hold(cur);
+		match = cur;
+		break;
+	}
+	spin_unlock_bh(&nr_node_list_lock);
+
+	return match;
+}
+
+/*
+ *	Look up the neighbour with the given callsign on the given device.
+ *	On a match a reference is taken before the list lock is dropped; the
+ *	caller must release it with nr_neigh_put(). Returns NULL when no
+ *	neighbour matches.
+ */
+static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign,
+					 struct net_device *dev)
+{
+	struct nr_neigh *cur;
+	struct nr_neigh *match = NULL;
+	struct hlist_node *pos;
+
+	spin_lock_bh(&nr_neigh_list_lock);
+	nr_neigh_for_each(cur, pos, &nr_neigh_list) {
+		if (ax25cmp(callsign, &cur->callsign) != 0 || cur->dev != dev)
+			continue;
+
+		nr_neigh_hold(cur);
+		match = cur;
+		break;
+	}
+	spin_unlock_bh(&nr_neigh_list_lock);
+
+	return match;
+}
+
+static void nr_remove_neigh(struct nr_neigh *);
+
+/*
+ *	Add a new route to a node, and in the process add the node and the
+ *	neighbour if it is new.
+ *
+ *	nr:        callsign of the destination node
+ *	mnemonic:  NUL-terminated alias, copied into the node with strcpy()
+ *	ax25:      callsign of the neighbour the route goes through
+ *	ax25_digi: digipeater path to the neighbour, may be NULL
+ *	dev:       device the neighbour is reached through
+ *	quality:   route quality; nr_route_frame() passes 0 for routes
+ *		   learned from a received frame
+ *	obs_count: initial obsolescence count of the route
+ *
+ *	Returns 0 on success, -EINVAL if nr is the callsign of one of our
+ *	own NET/ROM devices, -ENOMEM if an allocation fails.
+ *
+ *	Reference counting: the lookups (or the creation of a new neighbour)
+ *	leave this function holding one local reference on the node and one
+ *	on the neighbour; both are dropped again on every return path. Each
+ *	route stored in a node holds its own reference on its neighbour.
+ *
+ *	This function is reached from nr_route_frame() for received frames,
+ *	so every allocation uses GFP_ATOMIC.
+ */
+static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic,
+	ax25_address *ax25, ax25_digi *ax25_digi, struct net_device *dev,
+	int quality, int obs_count)
+{
+	struct nr_node  *nr_node;
+	struct nr_neigh *nr_neigh;
+	struct nr_route nr_route;
+	int i, found;
+	struct net_device *odev;
+
+	if ((odev = nr_dev_get(nr)) != NULL) {	/* Can't add routes to ourself */
+		dev_put(odev);
+		return -EINVAL;
+	}
+
+	nr_node = nr_node_get(nr);
+
+	nr_neigh = nr_neigh_get_dev(ax25, dev);
+
+	/*
+	 * The L2 link to a neighbour has failed in the past
+	 * and now a frame comes from this neighbour. We assume
+	 * it was a temporary trouble with the link and reset the
+	 * routes now (and not wait for a node broadcast).
+	 */
+	if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) {
+		struct nr_node *nr_nodet;
+		struct hlist_node *node;
+
+		spin_lock_bh(&nr_node_list_lock);
+		nr_node_for_each(nr_nodet, node, &nr_node_list) {
+			nr_node_lock(nr_nodet);
+			for (i = 0; i < nr_nodet->count; i++)
+				if (nr_nodet->routes[i].neighbour == nr_neigh)
+					if (i < nr_nodet->which)
+						nr_nodet->which = i;
+			nr_node_unlock(nr_nodet);
+		}
+		spin_unlock_bh(&nr_node_list_lock);
+	}
+
+	if (nr_neigh != NULL)
+		nr_neigh->failed = 0;
+
+	/* A heard frame for an already known node/neighbour pair: nothing to add. */
+	if (quality == 0 && nr_neigh != NULL && nr_node != NULL) {
+		nr_neigh_put(nr_neigh);
+		nr_node_put(nr_node);
+		return 0;
+	}
+
+	if (nr_neigh == NULL) {
+		if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) {
+			if (nr_node)
+				nr_node_put(nr_node);
+			return -ENOMEM;
+		}
+
+		nr_neigh->callsign = *ax25;
+		nr_neigh->digipeat = NULL;
+		nr_neigh->ax25     = NULL;
+		nr_neigh->dev      = dev;
+		nr_neigh->quality  = sysctl_netrom_default_path_quality;
+		nr_neigh->locked   = 0;
+		nr_neigh->count    = 0;
+		nr_neigh->number   = nr_neigh_no++;
+		nr_neigh->failed   = 0;
+		atomic_set(&nr_neigh->refcount, 1);
+
+		if (ax25_digi != NULL && ax25_digi->ndigi > 0) {
+			/* Must not sleep here, same as the kmalloc() above. */
+			nr_neigh->digipeat = kmemdup(ax25_digi,
+						     sizeof(*ax25_digi),
+						     GFP_ATOMIC);
+			if (nr_neigh->digipeat == NULL) {
+				kfree(nr_neigh);
+				if (nr_node)
+					nr_node_put(nr_node);
+				return -ENOMEM;
+			}
+		}
+
+		/*
+		 * The initial reference belongs to the list; the extra hold
+		 * is our local reference, matching nr_neigh_get_dev().
+		 */
+		spin_lock_bh(&nr_neigh_list_lock);
+		hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list);
+		nr_neigh_hold(nr_neigh);
+		spin_unlock_bh(&nr_neigh_list_lock);
+	}
+
+	if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked)
+		nr_neigh->quality = quality;
+
+	if (nr_node == NULL) {
+		if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) {
+			if (nr_neigh)
+				nr_neigh_put(nr_neigh);
+			return -ENOMEM;
+		}
+
+		nr_node->callsign = *nr;
+		strcpy(nr_node->mnemonic, mnemonic);
+
+		nr_node->which = 0;
+		nr_node->count = 1;
+		atomic_set(&nr_node->refcount, 1);
+		spin_lock_init(&nr_node->node_lock);
+
+		nr_node->routes[0].quality   = quality;
+		nr_node->routes[0].obs_count = obs_count;
+		nr_node->routes[0].neighbour = nr_neigh;
+
+		/* Reference held by routes[0]. */
+		nr_neigh_hold(nr_neigh);
+		nr_neigh->count++;
+
+		spin_lock_bh(&nr_node_list_lock);
+		hlist_add_head(&nr_node->node_node, &nr_node_list);
+		/* refcount initialized at 1 */
+		spin_unlock_bh(&nr_node_list_lock);
+
+		/* Drop our local neighbour reference, as the path below does. */
+		nr_neigh_put(nr_neigh);
+		return 0;
+	}
+	nr_node_lock(nr_node);
+
+	if (quality != 0)
+		strcpy(nr_node->mnemonic, mnemonic);
+
+	for (found = 0, i = 0; i < nr_node->count; i++) {
+		if (nr_node->routes[i].neighbour == nr_neigh) {
+			nr_node->routes[i].quality   = quality;
+			nr_node->routes[i].obs_count = obs_count;
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		/* We have space at the bottom, slot it in */
+		if (nr_node->count < 3) {
+			nr_node->routes[2] = nr_node->routes[1];
+			nr_node->routes[1] = nr_node->routes[0];
+
+			nr_node->routes[0].quality   = quality;
+			nr_node->routes[0].obs_count = obs_count;
+			nr_node->routes[0].neighbour = nr_neigh;
+
+			/* Existing routes moved down one slot. */
+			nr_node->which++;
+			nr_node->count++;
+			nr_neigh_hold(nr_neigh);
+			nr_neigh->count++;
+		} else {
+			/* It must be better than the worst */
+			if (quality > nr_node->routes[2].quality) {
+				nr_node->routes[2].neighbour->count--;
+				nr_neigh_put(nr_node->routes[2].neighbour);
+
+				if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked)
+					nr_remove_neigh(nr_node->routes[2].neighbour);
+
+				nr_node->routes[2].quality   = quality;
+				nr_node->routes[2].obs_count = obs_count;
+				nr_node->routes[2].neighbour = nr_neigh;
+
+				nr_neigh_hold(nr_neigh);
+				nr_neigh->count++;
+			}
+		}
+	}
+
+	/*
+	 * Now re-sort the routes in quality order. "which" follows the
+	 * entry it pointed at whenever two slots are swapped.
+	 */
+	switch (nr_node->count) {
+	case 3:
+		if (nr_node->routes[1].quality > nr_node->routes[0].quality) {
+			switch (nr_node->which) {
+			case 0:
+				nr_node->which = 1;
+				break;
+			case 1:
+				nr_node->which = 0;
+				break;
+			default:
+				break;
+			}
+			nr_route           = nr_node->routes[0];
+			nr_node->routes[0] = nr_node->routes[1];
+			nr_node->routes[1] = nr_route;
+		}
+		if (nr_node->routes[2].quality > nr_node->routes[1].quality) {
+			switch (nr_node->which) {
+			case 1:
+				nr_node->which = 2;
+				break;
+			case 2:
+				nr_node->which = 1;
+				break;
+			default:
+				break;
+			}
+			nr_route           = nr_node->routes[1];
+			nr_node->routes[1] = nr_node->routes[2];
+			nr_node->routes[2] = nr_route;
+		}
+		/* fall through: slots 0 and 1 may need one more pass */
+	case 2:
+		if (nr_node->routes[1].quality > nr_node->routes[0].quality) {
+			switch (nr_node->which) {
+			case 0:
+				nr_node->which = 1;
+				break;
+			case 1:
+				nr_node->which = 0;
+				break;
+			default:
+				break;
+			}
+			nr_route           = nr_node->routes[0];
+			nr_node->routes[0] = nr_node->routes[1];
+			nr_node->routes[1] = nr_route;
+		}
+		/* fall through */
+	case 1:
+		break;
+	}
+
+	/* Prefer the route via this neighbour if it now ranks above "which". */
+	for (i = 0; i < nr_node->count; i++) {
+		if (nr_node->routes[i].neighbour == nr_neigh) {
+			if (i < nr_node->which)
+				nr_node->which = i;
+			break;
+		}
+	}
+
+	nr_neigh_put(nr_neigh);
+	nr_node_unlock(nr_node);
+	nr_node_put(nr_node);
+	return 0;
+}
+
+/*
+ *	Unlink a node from nr_node_list and drop one reference on it.
+ *	Takes no lock itself; both callers in this file hold
+ *	nr_node_list_lock around it.
+ */
+static inline void __nr_remove_node(struct nr_node *nr_node)
+{
+	hlist_del_init(&nr_node->node_node);
+	nr_node_put(nr_node);
+}
+
+/* Name used by callers that already hold nr_node_list_lock. */
+#define nr_remove_node_locked(__node) \
+	__nr_remove_node(__node)
+
+/*
+ *	Unlink a node from nr_node_list and drop one reference on it, taking
+ *	nr_node_list_lock around the removal.
+ */
+static void nr_remove_node(struct nr_node *nr_node)
+{
+	spin_lock_bh(&nr_node_list_lock);
+	__nr_remove_node(nr_node);
+	spin_unlock_bh(&nr_node_list_lock);
+}
+
+/*
+ *	Unlink a neighbour from nr_neigh_list and drop one reference on it.
+ *	Takes no lock itself; both callers in this file hold
+ *	nr_neigh_list_lock around it.
+ */
+static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh)
+{
+	hlist_del_init(&nr_neigh->neigh_node);
+	nr_neigh_put(nr_neigh);
+}
+
+/* Name used by callers that already hold nr_neigh_list_lock. */
+#define nr_remove_neigh_locked(__neigh) \
+	__nr_remove_neigh(__neigh)
+
+/*
+ *	Unlink a neighbour from nr_neigh_list and drop one reference on it,
+ *	taking nr_neigh_list_lock around the removal.
+ */
+static void nr_remove_neigh(struct nr_neigh *nr_neigh)
+{
+	spin_lock_bh(&nr_neigh_list_lock);
+	__nr_remove_neigh(nr_neigh);
+	spin_unlock_bh(&nr_neigh_list_lock);
+}
+
+/*
+ *	"Delete" a node. Strictly speaking remove a route to a node. The node
+ *	is only deleted if no routes are left to it.
+ *
+ *	callsign:  callsign of the node
+ *	neighbour: callsign of the neighbour whose route is to be removed
+ *	dev:       device that neighbour is reached through
+ *
+ *	Returns 0 if the route was found and removed, -EINVAL if the node,
+ *	the neighbour, or a route between them does not exist.
+ *
+ *	The references taken by the two lookups are released on every path,
+ *	after the node lock has been dropped.
+ */
+static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev)
+{
+	struct nr_node  *nr_node;
+	struct nr_neigh *nr_neigh;
+	int ret = -EINVAL;
+	int i;
+
+	nr_node = nr_node_get(callsign);
+
+	if (nr_node == NULL)
+		return -EINVAL;
+
+	nr_neigh = nr_neigh_get_dev(neighbour, dev);
+
+	if (nr_neigh == NULL) {
+		nr_node_put(nr_node);
+		return -EINVAL;
+	}
+
+	nr_node_lock(nr_node);
+	for (i = 0; i < nr_node->count; i++) {
+		if (nr_node->routes[i].neighbour != nr_neigh)
+			continue;
+
+		/* Drop the reference this route held on the neighbour. */
+		nr_neigh->count--;
+		nr_neigh_put(nr_neigh);
+
+		/* Our lookup reference keeps nr_neigh valid here. */
+		if (nr_neigh->count == 0 && !nr_neigh->locked)
+			nr_remove_neigh(nr_neigh);
+
+		nr_node->count--;
+
+		if (nr_node->count == 0) {
+			/* Last route gone: unlink the node, dropping the list's reference. */
+			nr_remove_node(nr_node);
+		} else {
+			/* Close the gap left by route i. */
+			switch (i) {
+			case 0:
+				nr_node->routes[0] = nr_node->routes[1];
+				/* fall through */
+			case 1:
+				nr_node->routes[1] = nr_node->routes[2];
+				/* fall through */
+			case 2:
+				break;
+			}
+		}
+
+		ret = 0;
+		break;
+	}
+	nr_node_unlock(nr_node);
+
+	/* Release the lookup references. */
+	nr_neigh_put(nr_neigh);
+	nr_node_put(nr_node);
+
+	return ret;
+}
+
+/*
+ *	Lock a neighbour with a quality. An existing neighbour is updated in
+ *	place; otherwise a new, locked neighbour is allocated and added to
+ *	nr_neigh_list. Returns 0 on success or -ENOMEM.
+ */
+static int __must_check nr_add_neigh(ax25_address *callsign,
+	ax25_digi *ax25_digi, struct net_device *dev, unsigned int quality)
+{
+	struct nr_neigh *neigh = nr_neigh_get_dev(callsign, dev);
+
+	if (neigh != NULL) {
+		neigh->quality = quality;
+		neigh->locked  = 1;
+		nr_neigh_put(neigh);
+		return 0;
+	}
+
+	neigh = kmalloc(sizeof(*neigh), GFP_ATOMIC);
+	if (neigh == NULL)
+		return -ENOMEM;
+
+	neigh->callsign = *callsign;
+	neigh->dev      = dev;
+	neigh->digipeat = NULL;
+	neigh->ax25     = NULL;
+	neigh->quality  = quality;
+	neigh->locked   = 1;
+	neigh->failed   = 0;
+	neigh->count    = 0;
+	neigh->number   = nr_neigh_no++;
+	atomic_set(&neigh->refcount, 1);
+
+	/* Keep a private copy of the digipeater path, if one was given. */
+	if (ax25_digi != NULL && ax25_digi->ndigi > 0) {
+		neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi),
+					  GFP_KERNEL);
+		if (neigh->digipeat == NULL) {
+			kfree(neigh);
+			return -ENOMEM;
+		}
+	}
+
+	/* The initial reference (refcount 1) is the one held by the list. */
+	spin_lock_bh(&nr_neigh_list_lock);
+	hlist_add_head(&neigh->neigh_node, &nr_neigh_list);
+	spin_unlock_bh(&nr_neigh_list_lock);
+
+	return 0;
+}
+
+/*
+ *	"Delete" a neighbour: unlock it and store the given quality. The
+ *	neighbour is only removed from the list if no node routes through
+ *	it. Returns 0, or -EINVAL if the neighbour is unknown.
+ */
+static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality)
+{
+	struct nr_neigh *neigh = nr_neigh_get_dev(callsign, dev);
+
+	if (neigh == NULL)
+		return -EINVAL;
+
+	neigh->locked  = 0;
+	neigh->quality = quality;
+
+	if (neigh->count == 0)
+		nr_remove_neigh(neigh);
+
+	/* Drop the lookup reference. */
+	nr_neigh_put(neigh);
+
+	return 0;
+}
+
+/*
+ *	Decrement the obsolescence count by one. If a route is reduced to a
+ *	count of zero, remove it. Also remove any unlocked neighbours with
+ *	zero nodes routing via it.
+ *
+ *	Walks every node under nr_node_list_lock, taking each node's lock in
+ *	turn. Always returns 0.
+ */
+static int nr_dec_obs(void)
+{
+	struct nr_neigh *nr_neigh;
+	struct nr_node  *s;
+	struct hlist_node *node, *nodet;
+	int i;
+
+	spin_lock_bh(&nr_node_list_lock);
+	nr_node_for_each_safe(s, node, nodet, &nr_node_list) {
+		nr_node_lock(s);
+		for (i = 0; i < s->count; i++) {
+			switch (s->routes[i].obs_count) {
+			case 0:		/* A locked entry */
+				break;
+
+			case 1:		/* From 1 -> 0 */
+				nr_neigh = s->routes[i].neighbour;
+
+				/* Drop the reference this route held on the neighbour. */
+				nr_neigh->count--;
+				nr_neigh_put(nr_neigh);
+
+				if (nr_neigh->count == 0 && !nr_neigh->locked)
+					nr_remove_neigh(nr_neigh);
+
+				s->count--;
+
+				/*
+				 * Close the gap left by route i.
+				 * NOTE(review): the entry that moves into slot i
+				 * is not examined in this pass because the loop
+				 * then advances i - confirm this is acceptable.
+				 */
+				switch (i) {
+					case 0:
+						s->routes[0] = s->routes[1];
+					case 1:
+						s->routes[1] = s->routes[2];
+					case 2:
+						break;
+				}
+				break;
+
+			default:
+				s->routes[i].obs_count--;
+				break;
+
+			}
+		}
+
+		/* No routes left: unlink the node (list lock is already held). */
+		if (s->count <= 0)
+			nr_remove_node_locked(s);
+		nr_node_unlock(s);
+	}
+	spin_unlock_bh(&nr_node_list_lock);
+
+	return 0;
+}
+
+/*
+ *	A device has been removed. Remove its routes and neighbours.
+ *
+ *	For every neighbour on dev, each node's route through that neighbour
+ *	is deleted (releasing the reference the route held), nodes left with
+ *	no routes are unlinked, and finally the neighbour itself is unlinked.
+ *
+ *	Lock order: nr_neigh_list_lock, then nr_node_list_lock, then the
+ *	individual node lock.
+ */
+void nr_rt_device_down(struct net_device *dev)
+{
+	struct nr_neigh *s;
+	struct hlist_node *node, *nodet, *node2, *node2t;
+	struct nr_node  *t;
+	int i;
+
+	spin_lock_bh(&nr_neigh_list_lock);
+	nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) {
+		if (s->dev == dev) {
+			spin_lock_bh(&nr_node_list_lock);
+			nr_node_for_each_safe(t, node2, node2t, &nr_node_list) {
+				nr_node_lock(t);
+				for (i = 0; i < t->count; i++) {
+					if (t->routes[i].neighbour == s) {
+						t->count--;
+
+						/* Close the gap left by route i. */
+						switch (i) {
+						case 0:
+							t->routes[0] = t->routes[1];
+							/* fall through */
+						case 1:
+							t->routes[1] = t->routes[2];
+							/* fall through */
+						case 2:
+							break;
+						}
+
+						/*
+						 * Release the reference the deleted
+						 * route held; the list's reference
+						 * keeps s alive until it is unlinked
+						 * below.
+						 */
+						s->count--;
+						nr_neigh_put(s);
+					}
+				}
+
+				if (t->count <= 0)
+					nr_remove_node_locked(t);
+				nr_node_unlock(t);
+			}
+			spin_unlock_bh(&nr_node_list_lock);
+
+			nr_remove_neigh_locked(s);
+		}
+	}
+	spin_unlock_bh(&nr_neigh_list_lock);
+}
+
+/*
+ *	Look up devname in init_net and return it, with the reference taken
+ *	by dev_get_by_name() still held, if it is an ARPHRD_AX25 interface
+ *	that is up. Returns NULL (holding no reference) otherwise.
+ */
+static struct net_device *nr_ax25_dev_get(char *devname)
+{
+	struct net_device *dev = dev_get_by_name(&init_net, devname);
+
+	if (dev == NULL)
+		return NULL;
+
+	if (!(dev->flags & IFF_UP) || dev->type != ARPHRD_AX25) {
+		dev_put(dev);
+		return NULL;
+	}
+
+	return dev;
+}
+
+/*
+ *	Find the first active NET/ROM device, usually "nr0": among the
+ *	ARPHRD_NETROM devices that are up, the one whose first three name
+ *	characters compare lowest. The returned device has been dev_hold()ed;
+ *	NULL is returned when there is none.
+ */
+struct net_device *nr_dev_first(void)
+{
+	struct net_device *cur, *best = NULL;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, cur) {
+		if (!(cur->flags & IFF_UP) || cur->type != ARPHRD_NETROM)
+			continue;
+
+		if (best == NULL || strncmp(cur->name, best->name, 3) < 0)
+			best = cur;
+	}
+	if (best != NULL)
+		dev_hold(best);
+	rcu_read_unlock();
+
+	return best;
+}
+
+/*
+ *	Find the NET/ROM device for the given callsign: an ARPHRD_NETROM
+ *	device that is up and whose dev_addr equals addr. The returned
+ *	device has been dev_hold()ed; NULL is returned when none matches.
+ */
+struct net_device *nr_dev_get(ax25_address *addr)
+{
+	struct net_device *cur, *match = NULL;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, cur) {
+		if (!(cur->flags & IFF_UP))
+			continue;
+		if (cur->type != ARPHRD_NETROM)
+			continue;
+		if (ax25cmp(addr, (ax25_address *)cur->dev_addr) != 0)
+			continue;
+
+		dev_hold(cur);
+		match = cur;
+		break;
+	}
+	rcu_read_unlock();
+
+	return match;
+}
+
+/*
+ *	Fill *digi from an array of ndigis digipeater callsigns, marking
+ *	none of them as repeated. Returns digi, or NULL (leaving *digi
+ *	untouched) when ndigis is 0. The caller must have bounded ndigis.
+ */
+static ax25_digi *nr_call_to_digi(ax25_digi *digi, int ndigis,
+	ax25_address *digipeaters)
+{
+	int n = 0;
+
+	if (ndigis == 0)
+		return NULL;
+
+	while (n < ndigis) {
+		digi->calls[n]    = digipeaters[n];
+		digi->repeated[n] = 0;
+		n++;
+	}
+
+	digi->ndigi      = ndigis;
+	digi->lastrepeat = -1;
+
+	return digi;
+}
+
+/*
+ *	Handle the ioctls that control the routing functions.
+ *
+ *	SIOCADDRT:    add a node route (NETROM_NODE) or lock a neighbour
+ *		      (NETROM_NEIGH) described by a struct nr_route_struct
+ *	SIOCDELRT:    delete a node route or unlock/delete a neighbour
+ *	SIOCNRDECOBS: age all routes by one obsolescence tick
+ *
+ *	Returns 0 or the result of the operation on success, -EFAULT if the
+ *	argument cannot be copied from user space, -EINVAL for an unknown
+ *	command or type, an unusable device, an out-of-range digipeater
+ *	count, or a node mnemonic that is not NUL-terminated.
+ */
+int nr_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct nr_route_struct nr_route;
+	struct net_device *dev;
+	ax25_digi digi;
+	int ret;
+
+	switch (cmd) {
+	case SIOCADDRT:
+		if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct)))
+			return -EFAULT;
+		if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL)
+			return -EINVAL;
+		if (nr_route.ndigis < 0 || nr_route.ndigis > AX25_MAX_DIGIS) {
+			dev_put(dev);
+			return -EINVAL;
+		}
+		switch (nr_route.type) {
+		case NETROM_NODE:
+			/*
+			 * nr_add_node() copies the mnemonic with strcpy(),
+			 * so it must be terminated inside the user-supplied
+			 * buffer.
+			 */
+			if (strnlen(nr_route.mnemonic,
+				    sizeof(nr_route.mnemonic)) ==
+			    sizeof(nr_route.mnemonic)) {
+				ret = -EINVAL;
+				break;
+			}
+			ret = nr_add_node(&nr_route.callsign,
+				nr_route.mnemonic,
+				&nr_route.neighbour,
+				nr_call_to_digi(&digi, nr_route.ndigis,
+						nr_route.digipeaters),
+				dev, nr_route.quality,
+				nr_route.obs_count);
+			break;
+		case NETROM_NEIGH:
+			ret = nr_add_neigh(&nr_route.callsign,
+				nr_call_to_digi(&digi, nr_route.ndigis,
+						nr_route.digipeaters),
+				dev, nr_route.quality);
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		/* Release the reference taken by nr_ax25_dev_get(). */
+		dev_put(dev);
+		return ret;
+
+	case SIOCDELRT:
+		if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct)))
+			return -EFAULT;
+		if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL)
+			return -EINVAL;
+		switch (nr_route.type) {
+		case NETROM_NODE:
+			ret = nr_del_node(&nr_route.callsign,
+				&nr_route.neighbour, dev);
+			break;
+		case NETROM_NEIGH:
+			ret = nr_del_neigh(&nr_route.callsign,
+				dev, nr_route.quality);
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		dev_put(dev);
+		return ret;
+
+	case SIOCNRDECOBS:
+		return nr_dec_obs();
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ *	A level 2 link has timed out, therefore it appears to be a poor link,
+ *	then don't use that neighbour until it is reset.
+ *
+ *	Finds the neighbour bound to ax25, detaches the connection from it
+ *	and bumps its failure counter. Once the counter reaches
+ *	sysctl_netrom_link_fails_count, every node currently routing via
+ *	this neighbour is switched to its next route. The reason argument is
+ *	not used.
+ */
+void nr_link_failed(ax25_cb *ax25, int reason)
+{
+	struct nr_neigh *s, *nr_neigh = NULL;
+	struct hlist_node *node;
+	struct nr_node  *nr_node = NULL;
+
+	/* Find the neighbour using this connection and take a reference. */
+	spin_lock_bh(&nr_neigh_list_lock);
+	nr_neigh_for_each(s, node, &nr_neigh_list) {
+		if (s->ax25 == ax25) {
+			nr_neigh_hold(s);
+			nr_neigh = s;
+			break;
+		}
+	}
+	spin_unlock_bh(&nr_neigh_list_lock);
+
+	if (nr_neigh == NULL)
+		return;
+
+	/* Detach the connection and drop the reference the neighbour held on it. */
+	nr_neigh->ax25 = NULL;
+	ax25_cb_put(ax25);
+
+	/* Below the failure threshold: leave the routes alone. */
+	if (++nr_neigh->failed < sysctl_netrom_link_fails_count) {
+		nr_neigh_put(nr_neigh);
+		return;
+	}
+	spin_lock_bh(&nr_node_list_lock);
+	nr_node_for_each(nr_node, node, &nr_node_list) {
+		nr_node_lock(nr_node);
+		/* Step past the failed neighbour where it is the route in use. */
+		if (nr_node->which < nr_node->count &&
+		    nr_node->routes[nr_node->which].neighbour == nr_neigh)
+			nr_node->which++;
+		nr_node_unlock(nr_node);
+	}
+	spin_unlock_bh(&nr_node_list_lock);
+	nr_neigh_put(nr_neigh);
+}
+
+/*
+ *	Route a frame to an appropriate AX.25 connection. A NULL ax25_cb
+ *	indicates an internally generated frame.
+ *
+ *	skb->data must start with the NET/ROM network header: source
+ *	callsign at offset 0, destination callsign at offset 7 and the TTL
+ *	at offset 14.
+ *
+ *	Ownership: a non-zero return means the frame has been taken over;
+ *	a zero return means the caller still owns skb and must free it (the
+ *	callers in this file do exactly that).
+ */
+int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25)
+{
+	ax25_address *nr_src, *nr_dest;
+	struct nr_neigh *nr_neigh;
+	struct nr_node  *nr_node;
+	struct net_device *dev;
+	unsigned char *dptr;
+	ax25_cb *ax25s;
+	int ret;
+	struct sk_buff *skbn;
+
+
+	nr_src  = (ax25_address *)(skb->data + 0);
+	nr_dest = (ax25_address *)(skb->data + 7);
+
+	/* Received frame: learn (or refresh) a route back to its source. */
+	if (ax25 != NULL) {
+		ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat,
+				  ax25->ax25_dev->dev, 0,
+				  sysctl_netrom_obsolescence_count_initialiser);
+		if (ret)
+			return ret;
+	}
+
+	if ((dev = nr_dev_get(nr_dest)) != NULL) {	/* Its for me */
+		if (ax25 == NULL)			/* Its from me */
+			ret = nr_loopback_queue(skb);
+		else
+			ret = nr_rx_frame(skb, dev);
+		dev_put(dev);
+		return ret;
+	}
+
+	/* Forwarding of received frames is disabled. */
+	if (!sysctl_netrom_routing_control && ax25 != NULL)
+		return 0;
+
+	/* Its Time-To-Live has expired */
+	if (skb->data[14] == 1) {
+		return 0;
+	}
+
+	nr_node = nr_node_get(nr_dest);
+	if (nr_node == NULL)
+		return 0;
+	nr_node_lock(nr_node);
+
+	/* Every route to this node has been stepped past: nothing usable. */
+	if (nr_node->which >= nr_node->count) {
+		nr_node_unlock(nr_node);
+		nr_node_put(nr_node);
+		return 0;
+	}
+
+	nr_neigh = nr_node->routes[nr_node->which].neighbour;
+
+	if ((dev = nr_dev_first()) == NULL) {
+		nr_node_unlock(nr_node);
+		nr_node_put(nr_node);
+		return 0;
+	}
+
+	/* We are going to change the netrom headers so we should get our
+	   own skb, we also did not know until now how much header space
+	   we had to reserve... - RXQ */
+	skbn = skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC);
+	if (skbn == NULL) {
+		nr_node_unlock(nr_node);
+		nr_node_put(nr_node);
+		dev_put(dev);
+		return 0;
+	}
+
+	/*
+	 * Work on the copy only. The original must stay intact until the
+	 * outcome is known, because a zero return makes the caller free it.
+	 */
+	skbn->data[14]--;
+
+	dptr  = skb_push(skbn, 1);
+	*dptr = AX25_P_NETROM;
+
+	/* Replace the neighbour's cached connection with the one just used. */
+	ax25s = nr_neigh->ax25;
+	nr_neigh->ax25 = ax25_send_frame(skbn, 256,
+					 (ax25_address *)dev->dev_addr,
+					 &nr_neigh->callsign,
+					 nr_neigh->digipeat, nr_neigh->dev);
+	if (ax25s)
+		ax25_cb_put(ax25s);
+
+	dev_put(dev);
+	ret = (nr_neigh->ax25 != NULL);
+	nr_node_unlock(nr_node);
+	nr_node_put(nr_node);
+
+	if (ret) {
+		/* The copy was handed to AX.25; the original is consumed. */
+		kfree_skb(skb);
+	} else {
+		/*
+		 * ax25_send_frame() is assumed not to consume the buffer
+		 * when it returns NULL (verify against net/ax25). Free the
+		 * copy and leave the original to the caller.
+		 */
+		kfree_skb(skbn);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * seq_file start callback for the node list: takes nr_node_list_lock, which
+ * nr_node_stop() releases, and positions the iterator at *pos.
+ */
+static void *nr_node_start(struct seq_file *seq, loff_t *pos)
+{
+	spin_lock_bh(&nr_node_list_lock);
+	return seq_hlist_start_head(&nr_node_list, *pos);
+}
+
+/* seq_file next callback: advance to the following entry of nr_node_list. */
+static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &nr_node_list, pos);
+}
+
+/* seq_file stop callback: release the lock taken in nr_node_start(). */
+static void nr_node_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_bh(&nr_node_list_lock);
+}
+
+/*
+ * seq_file show callback for the node list. Prints the column header for
+ * SEQ_START_TOKEN, otherwise one line for the node: callsign, mnemonic
+ * ("*" when empty), which + 1, route count, then quality, obs_count and
+ * neighbour number for each route. Always returns 0.
+ */
+static int nr_node_show(struct seq_file *seq, void *v)
+{
+	struct nr_node *nr_node;
+	const char *alias;
+	char buf[11];
+	int n;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "callsign  mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n");
+		return 0;
+	}
+
+	nr_node = hlist_entry(v, struct nr_node, node_node);
+
+	nr_node_lock(nr_node);
+	alias = (nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic;
+	seq_printf(seq, "%-9s %-7s  %d %d",
+		ax2asc(buf, &nr_node->callsign),
+		alias,
+		nr_node->which + 1,
+		nr_node->count);
+
+	for (n = 0; n < nr_node->count; n++) {
+		seq_printf(seq, "  %3d   %d %05d",
+			nr_node->routes[n].quality,
+			nr_node->routes[n].obs_count,
+			nr_node->routes[n].neighbour->number);
+	}
+	nr_node_unlock(nr_node);
+
+	seq_puts(seq, "\n");
+	return 0;
+}
+
+/* seq_file iterator over nr_node_list, used by nr_node_info_open(). */
+static const struct seq_operations nr_node_seqops = {
+	.start = nr_node_start,
+	.next = nr_node_next,
+	.stop = nr_node_stop,
+	.show = nr_node_show,
+};
+
+/* open() handler: attach the node-list seq_file iterator to the file. */
+static int nr_node_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nr_node_seqops);
+}
+
+/* File operations for the node listing; reads go through seq_file. */
+const struct file_operations nr_nodes_fops = {
+	.owner = THIS_MODULE,
+	.open = nr_node_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * seq_file start callback for the neighbour list: takes nr_neigh_list_lock,
+ * which nr_neigh_stop() releases, and positions the iterator at *pos.
+ */
+static void *nr_neigh_start(struct seq_file *seq, loff_t *pos)
+{
+	spin_lock_bh(&nr_neigh_list_lock);
+	return seq_hlist_start_head(&nr_neigh_list, *pos);
+}
+
+/* seq_file next callback: advance to the following entry of nr_neigh_list. */
+static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &nr_neigh_list, pos);
+}
+
+/* seq_file stop callback: release the lock taken in nr_neigh_start(). */
+static void nr_neigh_stop(struct seq_file *seq, void *v)
+{
+	spin_unlock_bh(&nr_neigh_list_lock);
+}
+
+/*
+ * seq_file show callback for the neighbour list. Prints the column header
+ * for SEQ_START_TOKEN, otherwise one line for the neighbour: number,
+ * callsign, device name ("???" when there is no device), quality, locked
+ * flag, route count, failure count and any digipeater callsigns. Always
+ * returns 0.
+ */
+static int nr_neigh_show(struct seq_file *seq, void *v)
+{
+	struct nr_neigh *neigh;
+	char buf[11];
+	int n;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "addr  callsign  dev  qual lock count failed digipeaters\n");
+		return 0;
+	}
+
+	neigh = hlist_entry(v, struct nr_neigh, neigh_node);
+	seq_printf(seq, "%05d %-9s %-4s  %3d    %d   %3d    %3d",
+		neigh->number,
+		ax2asc(buf, &neigh->callsign),
+		neigh->dev ? neigh->dev->name : "???",
+		neigh->quality,
+		neigh->locked,
+		neigh->count,
+		neigh->failed);
+
+	if (neigh->digipeat != NULL) {
+		for (n = 0; n < neigh->digipeat->ndigi; n++)
+			seq_printf(seq, " %s",
+				   ax2asc(buf, &neigh->digipeat->calls[n]));
+	}
+
+	seq_puts(seq, "\n");
+	return 0;
+}
+
+/* seq_file iterator over nr_neigh_list, used by nr_neigh_info_open(). */
+static const struct seq_operations nr_neigh_seqops = {
+	.start = nr_neigh_start,
+	.next = nr_neigh_next,
+	.stop = nr_neigh_stop,
+	.show = nr_neigh_show,
+};
+
+/* open() handler: attach the neighbour-list seq_file iterator to the file. */
+static int nr_neigh_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nr_neigh_seqops);
+}
+
+/* File operations for the neighbour listing; reads go through seq_file. */
+const struct file_operations nr_neigh_fops = {
+	.owner = THIS_MODULE,
+	.open = nr_neigh_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#endif
+
+/*
+ *	Free all memory associated with the nodes and routes lists.
+ *
+ *	With both list locks held, every node is unlinked first; then, for
+ *	each neighbour, one reference is dropped per route still counted
+ *	against it before the neighbour itself is unlinked.
+ */
+void __exit nr_rt_free(void)
+{
+	struct nr_neigh *s = NULL;
+	struct nr_node  *t = NULL;
+	struct hlist_node *node, *nodet;
+
+	spin_lock_bh(&nr_neigh_list_lock);
+	spin_lock_bh(&nr_node_list_lock);
+	nr_node_for_each_safe(t, node, nodet, &nr_node_list) {
+		nr_node_lock(t);
+		nr_remove_node_locked(t);
+		nr_node_unlock(t);
+	}
+	nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) {
+		/* The routes are gone with their nodes; release their references. */
+		while(s->count) {
+			s->count--;
+			nr_neigh_put(s);
+		}
+		nr_remove_neigh_locked(s);
+	}
+	spin_unlock_bh(&nr_node_list_lock);
+	spin_unlock_bh(&nr_neigh_list_lock);
+}
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c
new file mode 100644
index 00000000..6a947ae5
--- /dev/null
+++ b/net/netrom/nr_subr.c
@@ -0,0 +1,282 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+/*
+ *	Discard every frame queued on the socket: the write queue and the
+ *	NET/ROM ack, resequencing and fragment queues.
+ */
+void nr_clear_queues(struct sock *sk)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+
+	skb_queue_purge(&sk->sk_write_queue);
+
+	skb_queue_purge(&nrom->ack_queue);
+	skb_queue_purge(&nrom->reseq_queue);
+	skb_queue_purge(&nrom->frag_queue);
+}
+
+/*
+ * Release the frames on the ack queue that have been acknowledged,
+ * advancing va (modulo NR_MODULUS) once per frame until it equals nr or
+ * the queue runs dry. This replaces the boxes labelled "V(a) <- N(r)" on
+ * the SDL diagram.
+ */
+void nr_frames_acked(struct sock *sk, unsigned short nr)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+
+	while (nrom->va != nr && skb_peek(&nrom->ack_queue) != NULL) {
+		kfree_skb(skb_dequeue(&nrom->ack_queue));
+		nrom->va = (nrom->va + 1) % NR_MODULUS;
+	}
+}
+
+/*
+ * Move all the un-ack-ed frames from the ack queue to the front of the
+ * write queue, keeping their order, so that nr_kick called from the timer
+ * picks them up again. This arrangement handles the possibility of an
+ * empty output queue.
+ */
+void nr_requeue_frames(struct sock *sk)
+{
+	struct sk_buff_head *ackq = &nr_sk(sk)->ack_queue;
+	struct sk_buff *cur, *last = NULL;
+
+	for (cur = skb_dequeue(ackq); cur != NULL; cur = skb_dequeue(ackq)) {
+		if (last != NULL)
+			skb_append(last, cur, &sk->sk_write_queue);
+		else
+			skb_queue_head(&sk->sk_write_queue, cur);
+		last = cur;
+	}
+}
+
+/*
+ *	Validate that the value of nr lies between va and vs inclusive,
+ *	counting modulo NR_MODULUS. Returns non-zero if it does, 0 otherwise.
+ */
+int nr_validate_nr(struct sock *sk, unsigned short nr)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+	unsigned short seq;
+
+	for (seq = nrom->va; seq != nrom->vs; seq = (seq + 1) % NR_MODULUS) {
+		if (seq == nr)
+			return 1;
+	}
+
+	return nr == nrom->vs;
+}
+
+/*
+ *	Check that ns is within the receive window, i.e. one of the sequence
+ *	numbers from vr up to, but excluding, vl + window (modulo
+ *	NR_MODULUS). Returns 1 if it is, 0 otherwise.
+ */
+int nr_in_rx_window(struct sock *sk, unsigned short ns)
+{
+	struct nr_sock *nrom = nr_sk(sk);
+	unsigned short limit = (nrom->vl + nrom->window) % NR_MODULUS;
+	unsigned short seq;
+
+	for (seq = nrom->vr; seq != limit; seq = (seq + 1) % NR_MODULUS) {
+		if (seq == ns)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ *  This routine is called when the HDLC layer internally generates a
+ *  control frame.
+ *
+ *  frametype: NR_CONNREQ, NR_CONNACK, NR_DISCREQ, NR_DISCACK or NR_INFOACK
+ *             in the low four bits, optionally ORed with flag bits. Any
+ *             other type is logged and ignored.
+ *
+ *  Builds the transport header (plus the type-specific fields) in a new
+ *  skb and passes it to nr_transmit_buffer(), which adds the network
+ *  header. Nothing is sent if the skb cannot be allocated.
+ */
+void nr_write_internal(struct sock *sk, int frametype)
+{
+	struct nr_sock *nr = nr_sk(sk);
+	struct sk_buff *skb;
+	unsigned char  *dptr;
+	int len, timeout;
+
+	len = NR_NETWORK_LEN + NR_TRANSPORT_LEN;
+
+	/* Extra bytes following the transport header, per frame type. */
+	switch (frametype & 0x0F) {
+	case NR_CONNREQ:
+		len += 17;
+		break;
+	case NR_CONNACK:
+		len += (nr->bpqext) ? 2 : 1;
+		break;
+	case NR_DISCREQ:
+	case NR_DISCACK:
+	case NR_INFOACK:
+		break;
+	default:
+		printk(KERN_ERR "NET/ROM: nr_write_internal - invalid frame type %d\n", frametype);
+		return;
+	}
+
+	if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+		return;
+
+	/*
+	 *	Space for AX.25 and NET/ROM network header
+	 */
+	skb_reserve(skb, NR_NETWORK_LEN);
+
+	/*
+	 * Claim exactly the bytes written below. The tailroom can be larger
+	 * than requested, and claiming all of it would append uninitialised
+	 * memory to the frame.
+	 */
+	dptr = skb_put(skb, len - NR_NETWORK_LEN);
+
+	switch (frametype & 0x0F) {
+	case NR_CONNREQ:
+		timeout  = nr->t1 / HZ;
+		*dptr++  = nr->my_index;
+		*dptr++  = nr->my_id;
+		*dptr++  = 0;
+		*dptr++  = 0;
+		*dptr++  = frametype;
+		*dptr++  = nr->window;
+		memcpy(dptr, &nr->user_addr, AX25_ADDR_LEN);
+		dptr[6] &= ~AX25_CBIT;
+		dptr[6] &= ~AX25_EBIT;
+		dptr[6] |= AX25_SSSID_SPARE;
+		dptr    += AX25_ADDR_LEN;
+		memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN);
+		dptr[6] &= ~AX25_CBIT;
+		dptr[6] &= ~AX25_EBIT;
+		dptr[6] |= AX25_SSSID_SPARE;
+		dptr    += AX25_ADDR_LEN;
+		/* T1 in seconds, low byte first. */
+		*dptr++  = timeout % 256;
+		*dptr++  = timeout / 256;
+		break;
+
+	case NR_CONNACK:
+		*dptr++ = nr->your_index;
+		*dptr++ = nr->your_id;
+		*dptr++ = nr->my_index;
+		*dptr++ = nr->my_id;
+		*dptr++ = frametype;
+		*dptr++ = nr->window;
+		if (nr->bpqext) *dptr++ = sysctl_netrom_network_ttl_initialiser;
+		break;
+
+	case NR_DISCREQ:
+	case NR_DISCACK:
+		*dptr++ = nr->your_index;
+		*dptr++ = nr->your_id;
+		*dptr++ = 0;
+		*dptr++ = 0;
+		*dptr++ = frametype;
+		break;
+
+	case NR_INFOACK:
+		*dptr++ = nr->your_index;
+		*dptr++ = nr->your_id;
+		*dptr++ = 0;
+		*dptr++ = nr->vr;
+		*dptr++ = frametype;
+		break;
+	}
+
+	nr_transmit_buffer(sk, skb);
+}
+
+/*
+ * This routine is called to send an error reply.
+ *
+ * skb:      the frame being answered; its source and destination callsigns
+ *           are swapped in the reply, and bytes 15 and 16 of its data are
+ *           echoed back
+ * mine:     selects where the two echoed bytes go: after two zero bytes
+ *           when non-zero, before them when zero
+ * cmdflags: opcode/flags byte of the reply
+ *
+ * The reply is routed as an internally generated frame and freed here if
+ * nr_route_frame() does not take it.
+ */
+void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags)
+{
+	struct sk_buff *skbn;
+	unsigned char *dptr;
+	int len;
+
+	len = NR_NETWORK_LEN + NR_TRANSPORT_LEN + 1;
+
+	if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL)
+		return;
+
+	skb_reserve(skbn, 0);
+
+	dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
+
+	/* Reply source = destination callsign of the original (offset 7). */
+	skb_copy_from_linear_data_offset(skb, 7, dptr, AX25_ADDR_LEN);
+	dptr[6] &= ~AX25_CBIT;
+	dptr[6] &= ~AX25_EBIT;
+	dptr[6] |= AX25_SSSID_SPARE;
+	dptr += AX25_ADDR_LEN;
+
+	/* Reply destination = source callsign of the original (offset 0). */
+	skb_copy_from_linear_data(skb, dptr, AX25_ADDR_LEN);
+	dptr[6] &= ~AX25_CBIT;
+	dptr[6] |= AX25_EBIT;
+	dptr[6] |= AX25_SSSID_SPARE;
+	dptr += AX25_ADDR_LEN;
+
+	*dptr++ = sysctl_netrom_network_ttl_initialiser;
+
+	if (mine) {
+		*dptr++ = 0;
+		*dptr++ = 0;
+		*dptr++ = skb->data[15];
+		*dptr++ = skb->data[16];
+	} else {
+		*dptr++ = skb->data[15];
+		*dptr++ = skb->data[16];
+		*dptr++ = 0;
+		*dptr++ = 0;
+	}
+
+	*dptr++ = cmdflags;
+	/*
+	 * NOTE(review): len reserves one byte more than skb_put() claimed
+	 * above, so this last byte appears to land just past the frame's
+	 * recorded length - confirm whether it is meant to be transmitted.
+	 */
+	*dptr++ = 0;
+
+	if (!nr_route_frame(skbn, NULL))
+		kfree_skb(skbn);
+}
+
+/*
+ * Tear down the connection state of a socket: stop all protocol timers,
+ * purge the queues, move to NR_STATE_0 / TCP_CLOSE, record reason in sk_err
+ * and shut down the send side. Unless the socket is already marked
+ * SOCK_DEAD, its state-change callback is invoked and it is then marked
+ * dead.
+ */
+void nr_disconnect(struct sock *sk, int reason)
+{
+	nr_stop_t1timer(sk);
+	nr_stop_t2timer(sk);
+	nr_stop_t4timer(sk);
+	nr_stop_idletimer(sk);
+
+	nr_clear_queues(sk);
+
+	nr_sk(sk)->state = NR_STATE_0;
+
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = reason;
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+
+	/* Notify once, before the socket is flagged as dead. */
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+}
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
new file mode 100644
index 00000000..1cb98e88
--- /dev/null
+++ b/net/netrom/nr_timer.c
@@ -0,0 +1,249 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/netrom.h>
+
+static void nr_heartbeat_expiry(unsigned long);	/* handler for sk->sk_timer */
+static void nr_t1timer_expiry(unsigned long);	/* handler for nr->t1timer */
+static void nr_t2timer_expiry(unsigned long);	/* handler for nr->t2timer */
+static void nr_t4timer_expiry(unsigned long);	/* handler for nr->t4timer */
+static void nr_idletimer_expiry(unsigned long);	/* handler for nr->idletimer */
+
+void nr_init_timers(struct sock *sk)
+{	/* Bind every per-socket timer to its handler; nothing is armed here. */
+	struct nr_sock *nr = nr_sk(sk);
+
+	setup_timer(&nr->t1timer, nr_t1timer_expiry, (unsigned long)sk);	/* sk is the handler argument */
+	setup_timer(&nr->t2timer, nr_t2timer_expiry, (unsigned long)sk);
+	setup_timer(&nr->t4timer, nr_t4timer_expiry, (unsigned long)sk);
+	setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk);
+
+	/* sk_timer itself was initialized by sock_init_data; only hook it up */
+	sk->sk_timer.data     = (unsigned long)sk;
+	sk->sk_timer.function = &nr_heartbeat_expiry;
+}
+
+void nr_start_t1timer(struct sock *sk)
+{
+	struct nr_sock *nrs = nr_sk(sk);
+	unsigned long expires = jiffies + nrs->t1;	/* one T1 interval from now */
+	mod_timer(&nrs->t1timer, expires);
+}
+
+void nr_start_t2timer(struct sock *sk)
+{
+	struct nr_sock *nrs = nr_sk(sk);
+	unsigned long expires = jiffies + nrs->t2;	/* one T2 interval from now */
+	mod_timer(&nrs->t2timer, expires);
+}
+
+void nr_start_t4timer(struct sock *sk)
+{
+	struct nr_sock *nrs = nr_sk(sk);
+	unsigned long expires = jiffies + nrs->t4;	/* one T4 interval from now */
+	mod_timer(&nrs->t4timer, expires);
+}
+
+void nr_start_idletimer(struct sock *sk)
+{
+	struct nr_sock *nrs = nr_sk(sk);
+	if (!(nrs->idle > 0))	/* non-positive idle period: timer is not armed */
+		return;
+	mod_timer(&nrs->idletimer, jiffies + nrs->idle);
+}
+
+void nr_start_heartbeat(struct sock *sk)
+{	/* (Re)arm sk->sk_timer to fire 5 * HZ jiffies from now. */
+	mod_timer(&sk->sk_timer, jiffies + 5 * HZ);
+}
+
+void nr_stop_t1timer(struct sock *sk)
+{	/* Cancel the T1 timer via del_timer(). */
+	del_timer(&nr_sk(sk)->t1timer);
+}
+
+void nr_stop_t2timer(struct sock *sk)
+{	/* Cancel the T2 timer via del_timer(). */
+	del_timer(&nr_sk(sk)->t2timer);
+}
+
+void nr_stop_t4timer(struct sock *sk)
+{	/* Cancel the T4 timer via del_timer(). */
+	del_timer(&nr_sk(sk)->t4timer);
+}
+
+void nr_stop_idletimer(struct sock *sk)
+{	/* Cancel the idle timer via del_timer(). */
+	del_timer(&nr_sk(sk)->idletimer);
+}
+
+void nr_stop_heartbeat(struct sock *sk)
+{	/* Cancel the heartbeat (sk->sk_timer) via del_timer(). */
+	del_timer(&sk->sk_timer);
+}
+
+int nr_t1timer_running(struct sock *sk)
+{	/* Returns the timer_pending() result for the T1 timer. */
+	return timer_pending(&nr_sk(sk)->t1timer);
+}
+
+static void nr_heartbeat_expiry(unsigned long param)
+{	/* sk_timer handler: destroys flagged sockets in state 0, clears own-RX-busy in state 3. */
+	struct sock *sk = (struct sock *)param;
+	struct nr_sock *nr = nr_sk(sk);
+
+	bh_lock_sock(sk);
+	switch (nr->state) {
+	case NR_STATE_0:
+		/* Magic here: If we listen() and a new link dies before it
+		   is accepted() it isn't 'dead' so doesn't get removed. */
+		if (sock_flag(sk, SOCK_DESTROY) ||
+		    (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
+			sock_hold(sk);	/* extra reference held across nr_destroy_socket() */
+			bh_unlock_sock(sk);
+			nr_destroy_socket(sk);
+			sock_put(sk);
+			return;	/* the heartbeat is not re-armed on this path */
+		}
+		break;
+
+	case NR_STATE_3:
+		/*
+		 * Check for the state of the receive buffer.
+		 */
+		if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) &&
+		    (nr->condition & NR_COND_OWN_RX_BUSY)) {
+			nr->condition &= ~NR_COND_OWN_RX_BUSY;
+			nr->condition &= ~NR_COND_ACK_PENDING;
+			nr->vl         = nr->vr;
+			nr_write_internal(sk, NR_INFOACK);
+			break;	/* redundant: the break just below has the same effect */
+		}
+		break;
+	}
+
+	nr_start_heartbeat(sk);	/* re-arm for the next period */
+	bh_unlock_sock(sk);
+}
+
+static void nr_t2timer_expiry(unsigned long param)
+{
+	struct sock *sock = (struct sock *)param;
+	struct nr_sock *nrs = nr_sk(sock);
+	bh_lock_sock(sock);
+	/* T2 expired: if an acknowledgement is still owed, send it now. */
+	if ((nrs->condition & NR_COND_ACK_PENDING) != 0) {
+		nrs->condition &= ~NR_COND_ACK_PENDING;
+		nr_enquiry_response(sock);
+	}
+	bh_unlock_sock(sock);
+}
+
+static void nr_t4timer_expiry(unsigned long param)
+{
+	struct sock *sock = (struct sock *)param;
+	struct nr_sock *nrs = nr_sk(sock);
+	bh_lock_sock(sock);
+	nrs->condition &= ~NR_COND_PEER_RX_BUSY;	/* T4 expired: drop the peer-busy flag */
+	bh_unlock_sock(sock);
+}
+
+static void nr_idletimer_expiry(unsigned long param)
+{	/* Idle timer handler: sends NR_DISCREQ and moves the socket to NR_STATE_2. */
+	struct sock *sk = (struct sock *)param;
+	struct nr_sock *nr = nr_sk(sk);
+
+	bh_lock_sock(sk);
+
+	nr_clear_queues(sk);
+
+	nr->n2count = 0;	/* retry counter restarts for the disconnect request */
+	nr_write_internal(sk, NR_DISCREQ);
+	nr->state = NR_STATE_2;
+
+	nr_start_t1timer(sk);	/* T1 stays running; T2 and T4 are cancelled */
+	nr_stop_t2timer(sk);
+	nr_stop_t4timer(sk);
+
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = 0;
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);	/* callback runs once, before the socket is flagged dead */
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+	bh_unlock_sock(sk);
+}
+
+static void nr_t1timer_expiry(unsigned long param)
+{	/* T1 handler: retry per state until n2count reaches n2, then disconnect with ETIMEDOUT. */
+	struct sock *sk = (struct sock *)param;
+	struct nr_sock *nr = nr_sk(sk);
+
+	bh_lock_sock(sk);
+	switch (nr->state) {
+	case NR_STATE_1:
+		if (nr->n2count == nr->n2) {
+			nr_disconnect(sk, ETIMEDOUT);
+			bh_unlock_sock(sk);
+			return;	/* T1 is not re-armed after giving up */
+		} else {
+			nr->n2count++;
+			nr_write_internal(sk, NR_CONNREQ);	/* resend the connect request */
+		}
+		break;
+
+	case NR_STATE_2:
+		if (nr->n2count == nr->n2) {
+			nr_disconnect(sk, ETIMEDOUT);
+			bh_unlock_sock(sk);
+			return;
+		} else {
+			nr->n2count++;
+			nr_write_internal(sk, NR_DISCREQ);	/* resend the disconnect request */
+		}
+		break;
+
+	case NR_STATE_3:
+		if (nr->n2count == nr->n2) {
+			nr_disconnect(sk, ETIMEDOUT);
+			bh_unlock_sock(sk);
+			return;
+		} else {
+			nr->n2count++;
+			nr_requeue_frames(sk);
+		}
+		break;
+	}
+
+	nr_start_t1timer(sk);	/* also reached for states not listed above */
+	bh_unlock_sock(sk);
+}
diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c
new file mode 100644
index 00000000..1e0fa9e5
--- /dev/null
+++ b/net/netrom/sysctl_net_netrom.c
@@ -0,0 +1,163 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
+ */
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <net/ax25.h>
+#include <net/netrom.h>
+
+/*
+ *	Lower/upper bounds for the sysctls below; values taken from NET/ROM documentation.
+ */
+static int min_quality[] = {0}, max_quality[] = {255};
+static int min_obs[]     = {0}, max_obs[]     = {255};
+static int min_ttl[]     = {0}, max_ttl[]     = {255};
+static int min_t1[]      = {5 * HZ};	/* time limits are scaled by HZ */
+static int max_t1[]      = {600 * HZ};
+static int min_n2[]      = {2}, max_n2[]      = {127};
+static int min_t2[]      = {1 * HZ};
+static int max_t2[]      = {60 * HZ};
+static int min_t4[]      = {1 * HZ};
+static int max_t4[]      = {1000 * HZ};
+static int min_window[]  = {1}, max_window[]  = {127};
+static int min_idle[]    = {0 * HZ};
+static int max_idle[]    = {65535 * HZ};
+static int min_route[]   = {0}, max_route[]   = {1};	/* 0/1 switch */
+static int min_fails[]   = {1}, max_fails[]   = {10};
+static int min_reset[]   = {0}, max_reset[]   = {1};	/* 0/1 switch */
+
+static struct ctl_table_header *nr_table_header;	/* set by nr_register_sysctl(), used by nr_unregister_sysctl() */
+
+static ctl_table nr_table[] = {	/* every entry: one int, mode 0644, range-checked by proc_dointvec_minmax */
+	{
+		.procname	= "default_path_quality",
+		.data		= &sysctl_netrom_default_path_quality,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_quality,	/* lower bound */
+		.extra2		= &max_quality	/* upper bound */
+	},
+	{
+		.procname	= "obsolescence_count_initialiser",
+		.data		= &sysctl_netrom_obsolescence_count_initialiser,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_obs,
+		.extra2		= &max_obs
+	},
+	{
+		.procname	= "network_ttl_initialiser",
+		.data		= &sysctl_netrom_network_ttl_initialiser,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ttl,
+		.extra2		= &max_ttl
+	},
+	{
+		.procname	= "transport_timeout",
+		.data		= &sysctl_netrom_transport_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_t1,
+		.extra2		= &max_t1
+	},
+	{
+		.procname	= "transport_maximum_tries",
+		.data		= &sysctl_netrom_transport_maximum_tries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_n2,
+		.extra2		= &max_n2
+	},
+	{
+		.procname	= "transport_acknowledge_delay",
+		.data		= &sysctl_netrom_transport_acknowledge_delay,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_t2,
+		.extra2		= &max_t2
+	},
+	{
+		.procname	= "transport_busy_delay",
+		.data		= &sysctl_netrom_transport_busy_delay,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_t4,
+		.extra2		= &max_t4
+	},
+	{
+		.procname	= "transport_requested_window_size",
+		.data		= &sysctl_netrom_transport_requested_window_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_window,
+		.extra2		= &max_window
+	},
+	{
+		.procname	= "transport_no_activity_timeout",
+		.data		= &sysctl_netrom_transport_no_activity_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_idle,
+		.extra2		= &max_idle
+	},
+	{
+		.procname	= "routing_control",
+		.data		= &sysctl_netrom_routing_control,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_route,
+		.extra2		= &max_route
+	},
+	{
+		.procname	= "link_fails_count",
+		.data		= &sysctl_netrom_link_fails_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_fails,
+		.extra2		= &max_fails
+	},
+	{
+		.procname	= "reset",
+		.data		= &sysctl_netrom_reset_circuit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_reset,
+		.extra2		= &max_reset
+	},
+	{ }	/* empty entry terminates the table */
+};
+
+static struct ctl_path nr_path[] = {	/* path components "net" then "netrom" for nr_table */
+	{ .procname = "net", },
+	{ .procname = "netrom", },
+	{ }	/* empty entry terminates the path */
+};
+
+void __init nr_register_sysctl(void)
+{	/* Register nr_table under nr_path; the result is stored unchecked. */
+	nr_table_header = register_sysctl_paths(nr_path, nr_table);
+}
+
+void nr_unregister_sysctl(void)
+{	/* Undo nr_register_sysctl() using the saved table header. */
+	unregister_sysctl_table(nr_table_header);
+}
diff --git a/net/nonet.c b/net/nonet.c
new file mode 100644
index 00000000..b1a73fda
--- /dev/null
+++ b/net/nonet.c
@@ -0,0 +1,26 @@
+/*
+ * net/nonet.c
+ *
+ * Dummy functions to allow us to configure network support entirely
+ * out of the kernel.
+ *
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) Matthew Wilcox 2003
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
+{	/* Both arguments are ignored; every open attempt fails with -ENXIO. */
+	return -ENXIO;
+}
+
+const struct file_operations bad_sock_fops = {	/* dummy socket file operations for a kernel built without networking */
+	.owner = THIS_MODULE,
+	.open = sock_no_open,	/* always fails with -ENXIO */
+	.llseek = noop_llseek,
+};
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 00000000..0060e3b3
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,16 @@
+#
+# Packet configuration
+#
+
+config PACKET
+	tristate "Packet socket"
+	---help---
+	  The Packet protocol is used by applications which communicate
+	  directly with network devices without an intermediate network
+	  protocol implemented in the kernel, e.g. tcpdump.  If you want them
+	  to work, choose Y.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called af_packet.
+
+	  If unsure, say Y.
diff --git a/net/packet/Makefile b/net/packet/Makefile
new file mode 100644
index 00000000..81183eab
--- /dev/null
+++ b/net/packet/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the packet AF.
+#
+
+obj-$(CONFIG_PACKET) += af_packet.o
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
new file mode 100644
index 00000000..fafb9683
--- /dev/null
+++ b/net/packet/af_packet.c
@@ -0,0 +1,2808 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		PACKET - implements raw packet sockets.
+ *
+ * Authors:	Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() now used correctly
+ *		Alan Cox	:	new skbuff lists, look ma no backlogs!
+ *		Alan Cox	:	tidied skbuff lists.
+ *		Alan Cox	:	Now uses generic datagram routines I
+ *					added. Also fixed the peek/read crash
+ *					from all old Linux datagram code.
+ *		Alan Cox	:	Uses the improved datagram code.
+ *		Alan Cox	:	Added NULL's for socket options.
+ *		Alan Cox	:	Re-commented the code.
+ *		Alan Cox	:	Use new kernel side addressing
+ *		Rob Janssen	:	Correct MTU usage.
+ *		Dave Platt	:	Counter leaks caused by incorrect
+ *					interrupt locking and some slightly
+ *					dubious gcc output. Can you read
+ *					compiler: it said _VOLATILE_
+ *	Richard Kooijman	:	Timestamp fixes.
+ *		Alan Cox	:	New buffers. Use sk->mac.raw.
+ *		Alan Cox	:	sendmsg/recvmsg support.
+ *		Alan Cox	:	Protocol setting support
+ *	Alexey Kuznetsov	:	Untied from IPv4 stack.
+ *	Cyrus Durgin		:	Fixed kerneld for kmod.
+ *	Michal Ostrowski        :       Module initialization cleanup.
+ *         Ulises Alonso        :       Frame number limit removal and
+ *                                      packet_set_ring memory leak.
+ *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
+ *					The convention is that longer addresses
+ *					will simply extend the hardware address
+ *					byte arrays at the end of sockaddr_ll
+ *					and packet_mreq.
+ *		Johann Baudy	:	Added TX RING.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_packet.h>
+#include <linux/wireless.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <asm/page.h>
+#include <asm/cacheflush.h>
+#include <asm/io.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/poll.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/if_vlan.h>
+#include <linux/virtio_net.h>
+#include <linux/errqueue.h>
+#include <linux/net_tstamp.h>
+
+#ifdef CONFIG_INET
+#include <net/inet_common.h>
+#endif
+
+/*
+   Assumptions:
+   - if device has no dev->hard_header routine, it adds and removes ll header
+     inside itself. In this case ll header is invisible outside of device,
+     but higher levels still should reserve dev->hard_header_len.
+     Some devices are clever enough to reallocate the skb when the header
+     does not fit into the reserved space (tunnel); others are silly
+     (PPP).
+   - packet socket receives packets with pulled ll header,
+     so that SOCK_RAW should push it back.
+
+On receive:
+-----------
+
+Incoming, dev->hard_header!=NULL
+   mac_header -> ll header
+   data       -> data
+
+Outgoing, dev->hard_header!=NULL
+   mac_header -> ll header
+   data       -> ll header
+
+Incoming, dev->hard_header==NULL
+   mac_header -> UNKNOWN position. It is very likely that it points to the ll
+		 header.  PPP does this, which is wrong because it introduces
+		 asymmetry between the rx and tx paths.
+   data       -> data
+
+Outgoing, dev->hard_header==NULL
+   mac_header -> data. ll header is still not built!
+   data       -> data
+
+Resume
+  If dev->hard_header==NULL we are unlikely to restore sensible ll header.
+
+
+On transmit:
+------------
+
+dev->hard_header != NULL
+   mac_header -> ll header
+   data       -> ll header
+
+dev->hard_header == NULL (ll header is added by device, we cannot control it)
+   mac_header -> data
+   data       -> data
+
+   We should set nh.raw on output to the correct position;
+   the packet classifier depends on it.
+ */
+
+/* Private packet socket structures. */
+
+struct packet_mclist {	/* node of the singly linked list hanging off packet_sock.mclist */
+	struct packet_mclist	*next;
+	int			ifindex;
+	int			count;	/* presumably a use count for this entry - TODO confirm */
+	unsigned short		type;
+	unsigned short		alen;	/* presumably the number of valid bytes in addr[] - TODO confirm */
+	unsigned char		addr[MAX_ADDR_LEN];
+};
+/* Identical to struct packet_mreq except that it has
+ * a longer address field (MAX_ADDR_LEN bytes).
+ */
+struct packet_mreq_max {
+	int		mr_ifindex;
+	unsigned short	mr_type;
+	unsigned short	mr_alen;
+	unsigned char	mr_address[MAX_ADDR_LEN];
+};
+
+static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+		int closing, int tx_ring);	/* forward declaration; defined later in this file */
+
+struct pgv {	/* one block of a packet ring; see packet_ring_buffer.pg_vec */
+	char *buffer;	/* start of the block's frame storage */
+};
+
+struct packet_ring_buffer {	/* ring of fixed-size frames spread over an array of blocks */
+	struct pgv		*pg_vec;	/* block array, indexed by position / frames_per_block */
+	unsigned int		head;	/* index of the current frame */
+	unsigned int		frames_per_block;
+	unsigned int		frame_size;	/* byte stride between frames inside a block */
+	unsigned int		frame_max;	/* highest valid frame index; head wraps to 0 after it */
+
+	unsigned int		pg_vec_order;	/* presumably the page order of each block - TODO confirm */
+	unsigned int		pg_vec_pages;
+	unsigned int		pg_vec_len;	/* presumably the number of pg_vec entries - TODO confirm */
+
+	atomic_t		pending;
+};
+
+struct packet_sock;	/* defined just below; needed by the prototype that follows */
+static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
+
+static void packet_flush_mclist(struct sock *sk);
+
+struct packet_sock {	/* per-socket state of an AF_PACKET socket; pkt_sk() casts a struct sock * to this */
+	/* struct sock has to be the first member of packet_sock */
+	struct sock		sk;
+	struct tpacket_stats	stats;	/* tp_packets / tp_drops counters bumped on receive */
+	struct packet_ring_buffer	rx_ring;
+	struct packet_ring_buffer	tx_ring;
+	int			copy_thresh;
+	spinlock_t		bind_lock;
+	struct mutex		pg_vec_lock;
+	unsigned int		running:1,	/* prot_hook is attached*/
+				auxdata:1,
+				origdev:1,	/* report orig_dev's ifindex instead of dev's */
+				has_vnet_hdr:1;
+	int			ifindex;	/* bound device		*/
+	__be16			num;
+	struct packet_mclist	*mclist;
+	atomic_t		mapped;
+	enum tpacket_versions	tp_version;	/* selects the tpacket_hdr / tpacket2_hdr frame layout */
+	unsigned int		tp_hdrlen;
+	unsigned int		tp_reserve;
+	unsigned int		tp_loss:1;
+	unsigned int		tp_tstamp;	/* SOF_TIMESTAMPING_* bits tested when stamping ring frames */
+	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+};
+
+struct packet_skb_cb {	/* overlay for skb->cb; accessed through PACKET_SKB_CB() */
+	unsigned int origlen;	/* skb->len recorded before the skb is trimmed to the snap length */
+	union {
+		struct sockaddr_pkt pkt;	/* filled in by packet_rcv_spkt() */
+		struct sockaddr_ll ll;	/* filled in by packet_rcv() */
+	} sa;
+};
+
+#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))	/* view skb->cb as packet_skb_cb */
+
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+	/* vmalloc addresses need vmalloc_to_page(); the rest use virt_to_page(). */
+	return is_vmalloc_addr(addr) ? vmalloc_to_page(addr)
+				     : virt_to_page(addr);
+}
+
+static void __packet_set_status(struct packet_sock *po, void *frame, int status)
+{	/* Store 'status' in the frame header's tp_status; the layout depends on po->tp_version. */
+	union {
+		struct tpacket_hdr *h1;
+		struct tpacket2_hdr *h2;
+		void *raw;
+	} h;
+
+	h.raw = frame;
+	switch (po->tp_version) {
+	case TPACKET_V1:
+		h.h1->tp_status = status;
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));	/* flush the page holding the status word */
+		break;
+	case TPACKET_V2:
+		h.h2->tp_status = status;
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
+		break;
+	default:
+		pr_err("TPACKET version not supported\n");
+		BUG();
+	}
+
+	smp_wmb();	/* write barrier issued after the status store */
+}
+
+static int __packet_get_status(struct packet_sock *po, void *frame)
+{	/* Return the frame header's tp_status; the layout depends on po->tp_version. */
+	union {
+		struct tpacket_hdr *h1;
+		struct tpacket2_hdr *h2;
+		void *raw;
+	} h;
+
+	smp_rmb();	/* read barrier issued before the status load */
+
+	h.raw = frame;
+	switch (po->tp_version) {
+	case TPACKET_V1:
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));	/* flush the status page before reading it */
+		return h.h1->tp_status;
+	case TPACKET_V2:
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
+		return h.h2->tp_status;
+	default:
+		pr_err("TPACKET version not supported\n");
+		BUG();
+		return 0;
+	}
+}
+
+/*
+ * Return the ring frame at index 'position' if its status word equals
+ * 'status', NULL otherwise.  The index is split into a block number and
+ * a slot within that block.
+ */
+static void *packet_lookup_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		unsigned int position,
+		int status)
+{
+	unsigned int block = position / rb->frames_per_block;
+	unsigned int slot = position % rb->frames_per_block;
+	void *frame;
+
+	/* Each block holds frames_per_block frames of frame_size bytes. */
+	frame = rb->pg_vec[block].buffer + (slot * rb->frame_size);
+
+	/* Hand the frame out only if it currently carries 'status'. */
+	if (__packet_get_status(po, frame) != status)
+		return NULL;
+
+	return frame;
+}
+
+static inline void *packet_current_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{	/* Frame at rb->head if its status equals 'status', else NULL. */
+	return packet_lookup_frame(po, rb, rb->head, status);
+}
+
+static inline void *packet_previous_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{	/* Step back one slot from head; slot 0 wraps to frame_max. */
+	unsigned int idx = (rb->head == 0) ? rb->frame_max : rb->head - 1;
+	return packet_lookup_frame(po, rb, idx, status);
+}
+
+static inline void packet_increment_head(struct packet_ring_buffer *buff)
+{	/* Advance head by one, wrapping to 0 once it has reached frame_max. */
+	buff->head = (buff->head == buff->frame_max) ? 0 : buff->head + 1;
+}
+
+static inline struct packet_sock *pkt_sk(struct sock *sk)
+{	/* Plain cast: valid because struct sock is the first member of packet_sock. */
+	return (struct packet_sock *)sk;
+}
+
+static void packet_sock_destruct(struct sock *sk)
+{	/* Purge the error queue and sanity-check the socket's accounting. */
+	skb_queue_purge(&sk->sk_error_queue);
+
+	WARN_ON(atomic_read(&sk->sk_rmem_alloc));	/* warn if receive memory is still charged */
+	WARN_ON(atomic_read(&sk->sk_wmem_alloc));	/* warn if send memory is still charged */
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		pr_err("Attempt to release alive packet socket: %p\n", sk);
+		return;	/* the refcnt debug decrement below is skipped on this path */
+	}
+
+	sk_refcnt_debug_dec(sk);
+}
+
+
+static const struct proto_ops packet_ops;	/* tentative definition; no initializer at this point */
+
+static const struct proto_ops packet_ops_spkt;	/* tentative definition; no initializer at this point */
+
+static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
+			   struct packet_type *pt, struct net_device *orig_dev)
+{	/* Receive hook: tag the frame with a sockaddr_pkt and queue it on the socket. Always returns 0. */
+	struct sock *sk;
+	struct sockaddr_pkt *spkt;
+
+	/*
+	 *	When we registered the protocol we saved the socket in the data
+	 *	field for just this event.
+	 */
+
+	sk = pt->af_packet_priv;
+
+	/*
+	 *	Yank back the headers [hope the device set this
+	 *	right or kerboom...]
+	 *
+	 *	Incoming packets have ll header pulled,
+	 *	push it back.
+	 *
+	 *	For outgoing ones skb->data == skb_mac_header(skb)
+	 *	so that this procedure is noop.
+	 */
+
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		goto out;
+
+	if (!net_eq(dev_net(dev), sock_net(sk)))
+		goto out;	/* device and socket are in different network namespaces */
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		goto oom;	/* nothing left to free on this path */
+
+	/* drop any routing info */
+	skb_dst_drop(skb);
+
+	/* drop conntrack reference */
+	nf_reset(skb);
+
+	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
+
+	skb_push(skb, skb->data - skb_mac_header(skb));
+
+	/*
+	 *	The SOCK_PACKET socket receives _all_ frames.
+	 */
+
+	spkt->spkt_family = dev->type;
+	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
+	spkt->spkt_protocol = skb->protocol;
+
+	/*
+	 *	Charge the memory to the socket. This is done specifically
+	 *	to prevent sockets using all the memory up.
+	 */
+
+	if (sock_queue_rcv_skb(sk, skb) == 0)
+		return 0;	/* queued: the skb is not freed here */
+
+out:
+	kfree_skb(skb);
+oom:
+	return 0;
+}
+
+
+/*
+ *	Output a raw packet to a device layer. This bypasses all the other
+ *	protocol layers, so the caller must supply a complete frame. Returns len or a negative errno.
+ */
+
+static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
+	struct sk_buff *skb = NULL;
+	struct net_device *dev;
+	__be16 proto = 0;
+	int err;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (saddr) {
+		if (msg->msg_namelen < sizeof(struct sockaddr))
+			return -EINVAL;
+		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
+			proto = saddr->spkt_protocol;	/* otherwise proto stays 0 */
+	} else
+		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */
+
+	/*
+	 *	Find the device first to size check it
+	 */
+
+	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;	/* force NUL termination of the device name */
+retry:
+	rcu_read_lock();
+	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
+	err = -ENODEV;
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = -ENETDOWN;
+	if (!(dev->flags & IFF_UP))
+		goto out_unlock;
+
+	/*
+	 * You may not queue a frame bigger than the mtu. This is the lowest level
+	 * raw protocol and you must do your own fragmentation at this level.
+	 */
+
+	err = -EMSGSIZE;
+	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
+		goto out_unlock;
+
+	if (!skb) {	/* first pass only: build the skb, then look the device up again */
+		size_t reserved = LL_RESERVED_SPACE(dev);
+		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
+
+		rcu_read_unlock();	/* dropped before the GFP_KERNEL allocation below */
+		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
+		if (skb == NULL)
+			return -ENOBUFS;
+		/* FIXME: Save some space for broken drivers that write a hard
+		 * header at transmission time by themselves. PPP is the notable
+		 * one here. This should really be fixed at the driver level.
+		 */
+		skb_reserve(skb, reserved);
+		skb_reset_network_header(skb);
+
+		/* Try to align data part correctly */
+		if (hhlen) {
+			skb->data -= hhlen;
+			skb->tail -= hhlen;
+			if (len < hhlen)
+				skb_reset_network_header(skb);
+		}
+		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+		if (err)
+			goto out_free;	/* RCU read lock is not held here */
+		goto retry;	/* repeat the device checks now that the lock was dropped */
+	}
+
+	if (len > (dev->mtu + dev->hard_header_len)) {
+		/* Earlier code assumed this would be a VLAN pkt,
+		 * double-check this now that we have the actual
+		 * packet in hand.
+		 */
+		struct ethhdr *ehdr;
+		skb_reset_mac_header(skb);
+		ehdr = eth_hdr(skb);
+		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+			err = -EMSGSIZE;
+			goto out_unlock;
+		}
+	}
+
+	skb->protocol = proto;
+	skb->dev = dev;
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+	if (err < 0)
+		goto out_unlock;
+
+	dev_queue_xmit(skb);	/* its return value is ignored; len is reported regardless */
+	rcu_read_unlock();
+	return len;
+
+out_unlock:
+	rcu_read_unlock();
+out_free:
+	kfree_skb(skb);	/* skb is still NULL if the failure came before allocation */
+	return err;
+}
+
+static inline unsigned int run_filter(const struct sk_buff *skb,
+				      const struct sock *sk,
+				      unsigned int res)
+{
+	struct sk_filter *flt;
+
+	/* With no filter attached, the caller's 'res' is returned unchanged. */
+	rcu_read_lock();
+	flt = rcu_dereference(sk->sk_filter);
+	if (flt)
+		res = SK_RUN_FILTER(flt, skb);
+	rcu_read_unlock();
+	return res;
+}
+
+/*
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequentially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
+ */
+
+static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
+		      struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct sock *sk;
+	struct sockaddr_ll *sll;
+	struct packet_sock *po;
+	u8 *skb_head = skb->data;	/* saved so a shared skb can be restored on exit */
+	int skb_len = skb->len;
+	unsigned int snaplen, res;
+
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		goto drop;
+
+	sk = pt->af_packet_priv;
+	po = pkt_sk(sk);
+
+	if (!net_eq(dev_net(dev), sock_net(sk)))
+		goto drop;	/* device and socket are in different network namespaces */
+
+	skb->dev = dev;
+
+	if (dev->header_ops) {
+		/* The device has an explicit notion of ll header,
+		 * exported to higher levels.
+		 *
+		 * Otherwise, the device hides details of its frame
+		 * structure, so that corresponding packet head is
+		 * never delivered to user.
+		 */
+		if (sk->sk_type != SOCK_DGRAM)
+			skb_push(skb, skb->data - skb_mac_header(skb));
+		else if (skb->pkt_type == PACKET_OUTGOING) {
+			/* Special case: outgoing packets have ll header at head */
+			skb_pull(skb, skb_network_offset(skb));
+		}
+	}
+
+	snaplen = skb->len;
+
+	res = run_filter(skb, sk, snaplen);
+	if (!res)
+		goto drop_n_restore;	/* filter result 0: drop without counting it */
+	if (snaplen > res)
+		snaplen = res;	/* filter result caps the number of bytes kept */
+
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf)
+		goto drop_n_acct;	/* would reach or exceed the receive buffer limit */
+
+	if (skb_shared(skb)) {
+		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+		if (nskb == NULL)
+			goto drop_n_acct;
+
+		if (skb_head != skb->data) {
+			skb->data = skb_head;	/* undo the push/pull on the shared original */
+			skb->len = skb_len;
+		}
+		kfree_skb(skb);
+		skb = nskb;	/* continue with the private clone */
+	}
+
+	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
+		     sizeof(skb->cb));
+
+	sll = &PACKET_SKB_CB(skb)->sa.ll;
+	sll->sll_family = AF_PACKET;
+	sll->sll_hatype = dev->type;
+	sll->sll_protocol = skb->protocol;
+	sll->sll_pkttype = skb->pkt_type;
+	if (unlikely(po->origdev))
+		sll->sll_ifindex = orig_dev->ifindex;
+	else
+		sll->sll_ifindex = dev->ifindex;
+
+	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
+
+	PACKET_SKB_CB(skb)->origlen = skb->len;	/* length before trimming to snaplen */
+
+	if (pskb_trim(skb, snaplen))
+		goto drop_n_acct;
+
+	skb_set_owner_r(skb, sk);
+	skb->dev = NULL;
+	skb_dst_drop(skb);
+
+	/* drop conntrack reference */
+	nf_reset(skb);
+
+	spin_lock(&sk->sk_receive_queue.lock);
+	po->stats.tp_packets++;
+	skb->dropcount = atomic_read(&sk->sk_drops);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	spin_unlock(&sk->sk_receive_queue.lock);
+	sk->sk_data_ready(sk, skb->len);
+	return 0;
+
+drop_n_acct:
+	spin_lock(&sk->sk_receive_queue.lock);
+	po->stats.tp_drops++;
+	atomic_inc(&sk->sk_drops);
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+drop_n_restore:
+	if (skb_head != skb->data && skb_shared(skb)) {
+		skb->data = skb_head;	/* put the shared skb back as it was on entry */
+		skb->len = skb_len;
+	}
+drop:
+	consume_skb(skb);
+	return 0;
+}
+
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct sock *sk;
+	struct packet_sock *po;
+	struct sockaddr_ll *sll;
+	union {
+		struct tpacket_hdr *h1;
+		struct tpacket2_hdr *h2;
+		void *raw;
+	} h;
+	u8 *skb_head = skb->data;
+	int skb_len = skb->len;
+	unsigned int snaplen, res;
+	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+	unsigned short macoff, netoff, hdrlen;
+	struct sk_buff *copy_skb = NULL;
+	struct timeval tv;
+	struct timespec ts;
+	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		goto drop;
+
+	sk = pt->af_packet_priv;
+	po = pkt_sk(sk);
+
+	if (!net_eq(dev_net(dev), sock_net(sk)))
+		goto drop;
+
+	if (dev->header_ops) {
+		if (sk->sk_type != SOCK_DGRAM)
+			skb_push(skb, skb->data - skb_mac_header(skb));
+		else if (skb->pkt_type == PACKET_OUTGOING) {
+			/* Special case: outgoing packets have ll header at head */
+			skb_pull(skb, skb_network_offset(skb));
+		}
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		status |= TP_STATUS_CSUMNOTREADY;
+
+	snaplen = skb->len;
+
+	res = run_filter(skb, sk, snaplen);
+	if (!res)
+		goto drop_n_restore;
+	if (snaplen > res)
+		snaplen = res;
+
+	if (sk->sk_type == SOCK_DGRAM) {
+		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
+				  po->tp_reserve;
+	} else {
+		unsigned maclen = skb_network_offset(skb);
+		netoff = TPACKET_ALIGN(po->tp_hdrlen +
+				       (maclen < 16 ? 16 : maclen)) +
+			po->tp_reserve;
+		macoff = netoff - maclen;
+	}
+
+	if (macoff + snaplen > po->rx_ring.frame_size) {
+		if (po->copy_thresh &&
+		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
+		    (unsigned)sk->sk_rcvbuf) {
+			if (skb_shared(skb)) {
+				copy_skb = skb_clone(skb, GFP_ATOMIC);
+			} else {
+				copy_skb = skb_get(skb);
+				skb_head = skb->data;
+			}
+			if (copy_skb)
+				skb_set_owner_r(copy_skb, sk);
+		}
+		snaplen = po->rx_ring.frame_size - macoff;
+		if ((int)snaplen < 0)
+			snaplen = 0;
+	}
+
+	spin_lock(&sk->sk_receive_queue.lock);
+	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
+	if (!h.raw)
+		goto ring_is_full;
+	packet_increment_head(&po->rx_ring);
+	po->stats.tp_packets++;
+	if (copy_skb) {
+		status |= TP_STATUS_COPY;
+		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
+	}
+	if (!po->stats.tp_drops)
+		status &= ~TP_STATUS_LOSING;
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
+
+	switch (po->tp_version) {
+	case TPACKET_V1:
+		h.h1->tp_len = skb->len;
+		h.h1->tp_snaplen = snaplen;
+		h.h1->tp_mac = macoff;
+		h.h1->tp_net = netoff;
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			tv = ktime_to_timeval(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
+			tv = ktime_to_timeval(skb->tstamp);
+		else
+			do_gettimeofday(&tv);
+		h.h1->tp_sec = tv.tv_sec;
+		h.h1->tp_usec = tv.tv_usec;
+		hdrlen = sizeof(*h.h1);
+		break;
+	case TPACKET_V2:
+		h.h2->tp_len = skb->len;
+		h.h2->tp_snaplen = snaplen;
+		h.h2->tp_mac = macoff;
+		h.h2->tp_net = netoff;
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
+			ts = ktime_to_timespec(skb->tstamp);
+		else
+			getnstimeofday(&ts);
+		h.h2->tp_sec = ts.tv_sec;
+		h.h2->tp_nsec = ts.tv_nsec;
+		if (vlan_tx_tag_present(skb)) {
+			h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
+			status |= TP_STATUS_VLAN_VALID;
+		} else {
+			h.h2->tp_vlan_tci = 0;
+		}
+		h.h2->tp_padding = 0;
+		hdrlen = sizeof(*h.h2);
+		break;
+	default:
+		BUG();
+	}
+
+	sll = h.raw + TPACKET_ALIGN(hdrlen);
+	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
+	sll->sll_family = AF_PACKET;
+	sll->sll_hatype = dev->type;
+	sll->sll_protocol = skb->protocol;
+	sll->sll_pkttype = skb->pkt_type;
+	if (unlikely(po->origdev))
+		sll->sll_ifindex = orig_dev->ifindex;
+	else
+		sll->sll_ifindex = dev->ifindex;
+
+	__packet_set_status(po, h.raw, status);
+	smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	{
+		u8 *start, *end;
+
+		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+		for (start = h.raw; start < end; start += PAGE_SIZE)
+			flush_dcache_page(pgv_to_page(start));
+	}
+#endif
+
+	sk->sk_data_ready(sk, 0);
+
+drop_n_restore:
+	if (skb_head != skb->data && skb_shared(skb)) {
+		skb->data = skb_head;
+		skb->len = skb_len;
+	}
+drop:
+	kfree_skb(skb);
+	return 0;
+
+ring_is_full:
+	po->stats.tp_drops++;
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+	sk->sk_data_ready(sk, 0);
+	kfree_skb(copy_skb);
+	goto drop_n_restore;
+}
+
+/*
+ * skb destructor for frames transmitted from the TX ring.
+ *
+ * While the socket still has a TX ring, the ring frame recorded in
+ * skb_shinfo(skb)->destructor_arg is flipped from TP_STATUS_SENDING to
+ * TP_STATUS_AVAILABLE and the ring's pending counter is decremented.
+ * In all cases the skb is then passed on to sock_wfree().
+ */
+static void tpacket_destruct_skb(struct sk_buff *skb)
+{
+	struct packet_sock *po;
+	void *ph;
+
+	/* Check before the first dereference of skb, not after it. */
+	BUG_ON(skb == NULL);
+
+	po = pkt_sk(skb->sk);
+
+	if (likely(po->tx_ring.pg_vec)) {
+		ph = skb_shinfo(skb)->destructor_arg;
+		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
+		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
+		atomic_dec(&po->tx_ring.pending);
+		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
+	}
+
+	sock_wfree(skb);
+}
+
+/*
+ * Build an skb for one TX ring frame without copying the payload.
+ *
+ * @po:       packet socket owning the ring
+ * @skb:      freshly allocated skb to fill
+ * @frame:    start of the ring frame (tpacket header followed by data)
+ * @dev:      output device
+ * @size_max: largest tp_len accepted for this frame
+ * @proto:    protocol stored in skb->protocol (and used for the header
+ *            built on SOCK_DGRAM sockets)
+ * @addr:     destination link-layer address for SOCK_DGRAM, may be NULL
+ *
+ * The frame length is read from the tpacket header that matches
+ * po->tp_version.  For SOCK_DGRAM the link-layer header is built with
+ * dev_hard_header(); otherwise, if the device has a hard header, that many
+ * bytes are copied from the frame into the linear area.  The remaining
+ * payload is attached as page fragments referencing the ring pages.
+ *
+ * Returns tp_len on success.  Errors: -EMSGSIZE (length negative or above
+ * size_max), -EINVAL (header build failed or frame too short), the error
+ * from skb_store_bits(), or -EFAULT (too many fragments).
+ */
+static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
+		void *frame, struct net_device *dev, int size_max,
+		__be16 proto, unsigned char *addr)
+{
+	union {
+		struct tpacket_hdr *h1;
+		struct tpacket2_hdr *h2;
+		void *raw;
+	} ph;
+	int to_write, offset, len, tp_len, nr_frags, len_max;
+	struct socket *sock = po->sk.sk_socket;
+	struct page *page;
+	void *data;
+	int err;
+
+	ph.raw = frame;
+
+	skb->protocol = proto;
+	skb->dev = dev;
+	skb->priority = po->sk.sk_priority;
+	skb->mark = po->sk.sk_mark;
+	/* Remembered so the destructor can find the ring frame again. */
+	skb_shinfo(skb)->destructor_arg = ph.raw;
+
+	switch (po->tp_version) {
+	case TPACKET_V2:
+		tp_len = ph.h2->tp_len;
+		break;
+	default:
+		tp_len = ph.h1->tp_len;
+		break;
+	}
+	/*
+	 * tp_len is written by user space as an unsigned 32-bit value; once
+	 * stored in an int, anything >= 2^31 shows up negative and would
+	 * slip past the upper-bound test, so reject it explicitly.
+	 */
+	if (unlikely(tp_len < 0 || tp_len > size_max)) {
+		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
+		return -EMSGSIZE;
+	}
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb_reset_network_header(skb);
+
+	/* Payload starts after the tpacket header, minus the sockaddr_ll slot. */
+	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
+	to_write = tp_len;
+
+	if (sock->type == SOCK_DGRAM) {
+		err = dev_hard_header(skb, dev, ntohs(proto), addr,
+				NULL, tp_len);
+		if (unlikely(err < 0))
+			return -EINVAL;
+	} else if (dev->hard_header_len) {
+		/* net device doesn't like empty head */
+		if (unlikely(tp_len <= dev->hard_header_len)) {
+			pr_err("packet size is too short (%d < %d)\n",
+			       tp_len, dev->hard_header_len);
+			return -EINVAL;
+		}
+
+		/* Copy the caller-supplied link-layer header into the head. */
+		skb_push(skb, dev->hard_header_len);
+		err = skb_store_bits(skb, 0, data,
+				dev->hard_header_len);
+		if (unlikely(err))
+			return err;
+
+		data += dev->hard_header_len;
+		to_write -= dev->hard_header_len;
+	}
+
+	err = -EFAULT;
+	/* The first fragment may start in the middle of a page. */
+	offset = offset_in_page(data);
+	len_max = PAGE_SIZE - offset;
+	len = ((to_write > len_max) ? len_max : to_write);
+
+	/* Account the paged bytes on the skb and on the socket up front. */
+	skb->data_len = to_write;
+	skb->len += to_write;
+	skb->truesize += to_write;
+	atomic_add(to_write, &po->sk.sk_wmem_alloc);
+
+	while (likely(to_write)) {
+		nr_frags = skb_shinfo(skb)->nr_frags;
+
+		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
+			pr_err("Packet exceed the number of skb frags(%lu)\n",
+			       MAX_SKB_FRAGS);
+			return -EFAULT;
+		}
+
+		page = pgv_to_page(data);
+		data += len;
+		flush_dcache_page(page);
+		get_page(page);
+		skb_fill_page_desc(skb, nr_frags, page, offset, len);
+		to_write -= len;
+		/* Every fragment after the first starts on a page boundary. */
+		offset = 0;
+		len_max = PAGE_SIZE;
+		len = ((to_write > len_max) ? len_max : to_write);
+	}
+
+	return tp_len;
+}
+
+/*
+ * Transmit the frames in the TX ring that are marked
+ * TP_STATUS_SEND_REQUEST.
+ *
+ * The destination (ifindex, protocol, link-layer address) is taken from
+ * msg->msg_name when one is supplied, otherwise from the socket's
+ * po->ifindex and po->num.  The whole function runs with
+ * po->pg_vec_lock held.
+ *
+ * Returns the summed tp_len of the frames passed to dev_queue_xmit(), or
+ * a negative errno.
+ */
+static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
+{
+	struct sk_buff *skb;
+	struct net_device *dev;
+	__be16 proto;
+	int ifindex, err, reserve = 0;
+	void *ph;
+	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+	int tp_len, size_max;
+	unsigned char *addr;
+	int len_sum = 0;
+	int status = 0;
+
+	mutex_lock(&po->pg_vec_lock);
+
+	err = -EBUSY;
+	if (saddr == NULL) {
+		ifindex	= po->ifindex;
+		proto	= po->num;
+		addr	= NULL;
+	} else {
+		err = -EINVAL;
+		/* The name must hold a full sockaddr_ll and sll_halen address bytes. */
+		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+			goto out;
+		if (msg->msg_namelen < (saddr->sll_halen
+					+ offsetof(struct sockaddr_ll,
+						sll_addr)))
+			goto out;
+		ifindex	= saddr->sll_ifindex;
+		proto	= saddr->sll_protocol;
+		addr	= saddr->sll_addr;
+	}
+
+	/* Takes a device reference; dropped at out_put. */
+	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
+	err = -ENXIO;
+	if (unlikely(dev == NULL))
+		goto out;
+
+	reserve = dev->hard_header_len;
+
+	err = -ENETDOWN;
+	if (unlikely(!(dev->flags & IFF_UP)))
+		goto out_put;
+
+	/*
+	 * Largest payload a frame can carry: bounded by the frame size less
+	 * the tpacket header, and by the device MTU plus link-layer header.
+	 */
+	size_max = po->tx_ring.frame_size
+		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
+
+	if (size_max > dev->mtu + reserve)
+		size_max = dev->mtu + reserve;
+
+	do {
+		ph = packet_current_frame(po, &po->tx_ring,
+				TP_STATUS_SEND_REQUEST);
+
+		if (unlikely(ph == NULL)) {
+			/*
+			 * No frame is ready: yield, then let the loop
+			 * condition decide whether to keep waiting.
+			 */
+			schedule();
+			continue;
+		}
+
+		status = TP_STATUS_SEND_REQUEST;
+		skb = sock_alloc_send_skb(&po->sk,
+				LL_ALLOCATED_SPACE(dev)
+				+ sizeof(struct sockaddr_ll),
+				0, &err);
+
+		if (unlikely(skb == NULL))
+			goto out_status;
+
+		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
+				addr);
+
+		if (unlikely(tp_len < 0)) {
+			if (po->tp_loss) {
+				/* Loss mode: release the bad frame and move on. */
+				__packet_set_status(po, ph,
+						TP_STATUS_AVAILABLE);
+				packet_increment_head(&po->tx_ring);
+				kfree_skb(skb);
+				continue;
+			} else {
+				/* Otherwise flag the frame and stop with the error. */
+				status = TP_STATUS_WRONG_FORMAT;
+				err = tp_len;
+				goto out_status;
+			}
+		}
+
+		/* From here on the destructor hands the frame back to the ring. */
+		skb->destructor = tpacket_destruct_skb;
+		__packet_set_status(po, ph, TP_STATUS_SENDING);
+		atomic_inc(&po->tx_ring.pending);
+
+		status = TP_STATUS_SEND_REQUEST;
+		err = dev_queue_xmit(skb);
+		if (unlikely(err > 0)) {
+			err = net_xmit_errno(err);
+			if (err && __packet_get_status(po, ph) ==
+				   TP_STATUS_AVAILABLE) {
+				/* skb was destructed already */
+				skb = NULL;
+				goto out_status;
+			}
+			/*
+			 * skb was dropped but not destructed yet;
+			 * let's treat it like congestion or err < 0
+			 */
+			err = 0;
+		}
+		packet_increment_head(&po->tx_ring);
+		len_sum += tp_len;
+	/*
+	 * Keep going while a frame was found; once none is found, continue
+	 * only for blocking callers that still have frames pending.
+	 */
+	} while (likely((ph != NULL) ||
+			((!(msg->msg_flags & MSG_DONTWAIT)) &&
+			 (atomic_read(&po->tx_ring.pending))))
+		);
+
+	err = len_sum;
+	goto out_put;
+
+out_status:
+	/* Write the final status into the frame; skb may be NULL here. */
+	__packet_set_status(po, ph, status);
+	kfree_skb(skb);
+out_put:
+	dev_put(dev);
+out:
+	mutex_unlock(&po->pg_vec_lock);
+	return err;
+}
+
+/*
+ * Allocate a send skb of @len bytes, of which @linear bytes live in the
+ * linear area and the rest in page fragments.  @prepad bytes of headroom
+ * are allocated and @reserve of them are reserved.  If the whole packet
+ * (headroom included) fits in less than a page, or no linear size was
+ * requested, everything is made linear.
+ *
+ * Returns the skb, or NULL with *err set by sock_alloc_send_pskb().
+ */
+static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
+					       size_t reserve, size_t len,
+					       size_t linear, int noblock,
+					       int *err)
+{
+	struct sk_buff *skb;
+	size_t paged;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (!linear || prepad + len < PAGE_SIZE)
+		linear = len;
+	paged = len - linear;
+
+	skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock, err);
+	if (skb == NULL)
+		return NULL;
+
+	skb_reserve(skb, reserve);
+	skb_put(skb, linear);
+
+	/* Account for the part that will be filled into fragments. */
+	skb->data_len = paged;
+	skb->len += paged;
+
+	return skb;
+}
+
+/*
+ * Copy-based send path: build one skb from the caller's iovec and queue
+ * it on the device selected by msg->msg_name or, when no name is given,
+ * by the socket's po->ifindex.
+ *
+ * When po->has_vnet_hdr is set, the data starts with a struct
+ * virtio_net_hdr, which is consumed here and used to set up partial
+ * checksum and GSO metadata on the skb.
+ *
+ * Returns the number of bytes accepted (including the virtio header, if
+ * any) or a negative errno.
+ */
+static int packet_snd(struct socket *sock,
+			  struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
+	struct sk_buff *skb;
+	struct net_device *dev;
+	__be16 proto;
+	unsigned char *addr;
+	int ifindex, err, reserve = 0;
+	struct virtio_net_hdr vnet_hdr = { 0 };
+	int offset = 0;
+	int vnet_hdr_len;
+	struct packet_sock *po = pkt_sk(sk);
+	unsigned short gso_type = 0;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (saddr == NULL) {
+		ifindex	= po->ifindex;
+		proto	= po->num;
+		addr	= NULL;
+	} else {
+		err = -EINVAL;
+		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
+			goto out;
+		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
+			goto out;
+		ifindex	= saddr->sll_ifindex;
+		proto	= saddr->sll_protocol;
+		addr	= saddr->sll_addr;
+	}
+
+
+	/* Takes a device reference; every exit below drops it again. */
+	dev = dev_get_by_index(sock_net(sk), ifindex);
+	err = -ENXIO;
+	if (dev == NULL)
+		goto out_unlock;
+	/* For SOCK_RAW the link-layer header is part of the supplied data. */
+	if (sock->type == SOCK_RAW)
+		reserve = dev->hard_header_len;
+
+	err = -ENETDOWN;
+	if (!(dev->flags & IFF_UP))
+		goto out_unlock;
+
+	if (po->has_vnet_hdr) {
+		vnet_hdr_len = sizeof(vnet_hdr);
+
+		err = -EINVAL;
+		if (len < vnet_hdr_len)
+			goto out_unlock;
+
+		len -= vnet_hdr_len;
+
+		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
+				       vnet_hdr_len);
+		if (err < 0)
+			goto out_unlock;
+
+		/* Make hdr_len cover at least the checksum field itself. */
+		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
+		      vnet_hdr.hdr_len))
+			vnet_hdr.hdr_len = vnet_hdr.csum_start +
+						 vnet_hdr.csum_offset + 2;
+
+		err = -EINVAL;
+		if (vnet_hdr.hdr_len > len)
+			goto out_unlock;
+
+		/* Translate the virtio GSO type into the skb GSO type. */
+		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+			case VIRTIO_NET_HDR_GSO_TCPV4:
+				gso_type = SKB_GSO_TCPV4;
+				break;
+			case VIRTIO_NET_HDR_GSO_TCPV6:
+				gso_type = SKB_GSO_TCPV6;
+				break;
+			case VIRTIO_NET_HDR_GSO_UDP:
+				gso_type = SKB_GSO_UDP;
+				break;
+			default:
+				goto out_unlock;
+			}
+
+			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+				gso_type |= SKB_GSO_TCP_ECN;
+
+			if (vnet_hdr.gso_size == 0)
+				goto out_unlock;
+
+		}
+	}
+
+	/*
+	 * Non-GSO packets must fit the MTU; VLAN_HLEN extra bytes are
+	 * tolerated here and re-checked once the data has been copied.
+	 */
+	err = -EMSGSIZE;
+	if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
+		goto out_unlock;
+
+	err = -ENOBUFS;
+	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
+			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
+			       msg->msg_flags & MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto out_unlock;
+
+	skb_set_network_header(skb, reserve);
+
+	/* SOCK_DGRAM: build the link-layer header; offset is its return value. */
+	err = -EINVAL;
+	if (sock->type == SOCK_DGRAM &&
+	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
+		goto out_free;
+
+	/* Returns -EFAULT on error */
+	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
+	if (err)
+		goto out_free;
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+	if (err < 0)
+		goto out_free;
+
+	if (!gso_type && (len > dev->mtu + reserve)) {
+		/* Earlier code assumed this would be a VLAN pkt,
+		 * double-check this now that we have the actual
+		 * packet in hand.
+		 */
+		struct ethhdr *ehdr;
+		skb_reset_mac_header(skb);
+		ehdr = eth_hdr(skb);
+		if (ehdr->h_proto != htons(ETH_P_8021Q)) {
+			err = -EMSGSIZE;
+			goto out_free;
+		}
+	}
+
+	skb->protocol = proto;
+	skb->dev = dev;
+	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
+
+	if (po->has_vnet_hdr) {
+		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
+						  vnet_hdr.csum_offset)) {
+				err = -EINVAL;
+				goto out_free;
+			}
+		}
+
+		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
+		skb_shinfo(skb)->gso_type = gso_type;
+
+		/* Header must be checked, and gso_segs computed. */
+		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_segs = 0;
+
+		/* Report the virtio header as part of the bytes sent. */
+		len += vnet_hdr_len;
+	}
+
+	/*
+	 *	Now send it
+	 */
+
+	/*
+	 * On failure this jumps past out_free, so the skb is not freed
+	 * here -- presumably dev_queue_xmit() has already consumed it.
+	 */
+	err = dev_queue_xmit(skb);
+	if (err > 0 && (err = net_xmit_errno(err)) != 0)
+		goto out_unlock;
+
+	dev_put(dev);
+
+	return len;
+
+out_free:
+	kfree_skb(skb);
+out_unlock:
+	/* dev is NULL when the device lookup itself failed. */
+	if (dev)
+		dev_put(dev);
+out:
+	return err;
+}
+
+/*
+ * sendmsg entry point: sockets with a TX ring transmit from the ring,
+ * all others go through the copy-based packet_snd().
+ */
+static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *msg, size_t len)
+{
+	struct packet_sock *po = pkt_sk(sock->sk);
+
+	if (!po->tx_ring.pg_vec)
+		return packet_snd(sock, msg, len);
+
+	return tpacket_snd(po, msg);
+}
+
+/*
+ *	Close a PACKET socket. This is fairly simple. We immediately go
+ *	to 'closed' state and remove our protocol entry in the device list.
+ */
+
+/*
+ * Release a packet socket: unlink it from the per-namespace socket list,
+ * detach its protocol hook, drop its multicast memberships, tear down any
+ * RX/TX rings, orphan the sock and purge its receive queue.
+ *
+ * Always returns 0.
+ */
+static int packet_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct packet_sock *po;
+	struct net *net;
+	struct tpacket_req req;
+
+	if (!sk)
+		return 0;
+
+	net = sock_net(sk);
+	po = pkt_sk(sk);
+
+	/* Remove the socket from the namespace-wide list of packet sockets. */
+	spin_lock_bh(&net->packet.sklist_lock);
+	sk_del_node_init_rcu(sk);
+	sock_prot_inuse_add(net, sk->sk_prot, -1);
+	spin_unlock_bh(&net->packet.sklist_lock);
+
+	spin_lock(&po->bind_lock);
+	if (po->running) {
+		/*
+		 * Remove from protocol table
+		 */
+		po->running = 0;
+		po->num = 0;
+		__dev_remove_pack(&po->prot_hook);
+		/* Drop the reference that was taken when the hook was added. */
+		__sock_put(sk);
+	}
+	spin_unlock(&po->bind_lock);
+
+	packet_flush_mclist(sk);
+
+	/* An all-zero request is passed to packet_set_ring() for each ring. */
+	memset(&req, 0, sizeof(req));
+
+	if (po->rx_ring.pg_vec)
+		packet_set_ring(sk, &req, 1, 0);
+
+	if (po->tx_ring.pg_vec)
+		packet_set_ring(sk, &req, 1, 1);
+
+	synchronize_net();
+	/*
+	 *	Now the socket is dead. No more input will appear.
+	 */
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	/* Purge queues */
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	sk_refcnt_debug_release(sk);
+
+	sock_put(sk);
+	return 0;
+}
+
+/*
+ *	Attach a packet hook.
+ */
+
+/*
+ * (Re)bind the socket to @dev (NULL for no specific device) and @protocol.
+ *
+ * Any hook that is currently running is removed first.  The new hook is
+ * registered only when @protocol is non-zero and the device is either
+ * absent or up; if the device is down the socket gets ENETDOWN reported
+ * instead.  Runs under lock_sock() and po->bind_lock.
+ *
+ * Always returns 0.
+ */
+static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	/*
+	 *	Detach an existing hook if present.
+	 */
+
+	lock_sock(sk);
+
+	spin_lock(&po->bind_lock);
+	if (po->running) {
+		__sock_put(sk);
+		po->running = 0;
+		po->num = 0;
+		/* bind_lock is dropped around dev_remove_pack() and re-taken after. */
+		spin_unlock(&po->bind_lock);
+		dev_remove_pack(&po->prot_hook);
+		spin_lock(&po->bind_lock);
+	}
+
+	po->num = protocol;
+	po->prot_hook.type = protocol;
+	po->prot_hook.dev = dev;
+
+	po->ifindex = dev ? dev->ifindex : 0;
+
+	/* Protocol 0: record the binding but do not register a hook. */
+	if (protocol == 0)
+		goto out_unlock;
+
+	if (!dev || (dev->flags & IFF_UP)) {
+		dev_add_pack(&po->prot_hook);
+		/* Reference held for as long as the hook is registered. */
+		sock_hold(sk);
+		po->running = 1;
+	} else {
+		sk->sk_err = ENETDOWN;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_error_report(sk);
+	}
+
+out_unlock:
+	spin_unlock(&po->bind_lock);
+	release_sock(sk);
+	return 0;
+}
+
+/*
+ *	Bind a packet socket to a device
+ */
+
+/*
+ * bind() for SOCK_PACKET sockets: the device is named by the string in
+ * uaddr->sa_data, and the protocol already stored on the socket is kept.
+ *
+ * Returns -EINVAL for a wrong address length, -ENODEV when no device of
+ * that name exists, otherwise the result of packet_do_bind().
+ */
+static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
+			    int addr_len)
+{
+	struct sock *sk = sock->sk;
+	char name[sizeof(uaddr->sa_data) + 1];
+	struct net_device *dev;
+	int err = -ENODEV;
+
+	/*
+	 *	Check legality
+	 */
+
+	if (addr_len != sizeof(struct sockaddr))
+		return -EINVAL;
+	/*
+	 * sa_data comes from user space and need not be NUL-terminated.
+	 * strlcpy() would run strlen() on the source and read past the end
+	 * of the field, so copy a bounded amount and terminate explicitly.
+	 */
+	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
+	name[sizeof(uaddr->sa_data)] = 0;
+
+	dev = dev_get_by_name(sock_net(sk), name);
+	if (dev) {
+		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
+		dev_put(dev);
+	}
+	return err;
+}
+
+/*
+ * bind() for SOCK_RAW / SOCK_DGRAM packet sockets.  The sockaddr_ll may
+ * name an interface (sll_ifindex, 0 for none) and a protocol (0 keeps the
+ * protocol already stored on the socket).
+ *
+ * Returns -EINVAL for a short or non-AF_PACKET address, -ENODEV for an
+ * unknown ifindex, otherwise the result of packet_do_bind().
+ */
+static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
+	struct sock *sk = sock->sk;
+	struct net_device *dev = NULL;
+	__be16 proto;
+	int err;
+
+	/*
+	 *	Check legality
+	 */
+
+	if (addr_len < sizeof(struct sockaddr_ll) ||
+	    sll->sll_family != AF_PACKET)
+		return -EINVAL;
+
+	if (sll->sll_ifindex) {
+		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
+		if (!dev)
+			return -ENODEV;
+	}
+
+	proto = sll->sll_protocol;
+	if (!proto)
+		proto = pkt_sk(sk)->num;
+
+	err = packet_do_bind(sk, dev, proto);
+
+	/* Drop the lookup reference, if a device was looked up at all. */
+	if (dev)
+		dev_put(dev);
+
+	return err;
+}
+
+/* Protocol descriptor passed to sk_alloc() for every packet socket. */
+static struct proto packet_proto = {
+	.owner		= THIS_MODULE,
+	.name		= "PACKET",
+	.obj_size	= sizeof(struct packet_sock),
+};
+
+/*
+ *	Create a packet of type SOCK_PACKET.
+ */
+
+/*
+ * Create a new PF_PACKET socket of type SOCK_DGRAM, SOCK_RAW or
+ * SOCK_PACKET.
+ *
+ * Returns 0 on success, -EPERM without CAP_NET_RAW, -ESOCKTNOSUPPORT for
+ * any other socket type, or -ENOBUFS if the sock cannot be allocated.
+ */
+static int packet_create(struct net *net, struct socket *sock, int protocol,
+			 int kern)
+{
+	struct sock *sk;
+	struct packet_sock *po;
+	__be16 proto = (__force __be16)protocol; /* weird, but documented */
+	int err;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
+	    sock->type != SOCK_PACKET)
+		return -ESOCKTNOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	err = -ENOBUFS;
+	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+	if (sk == NULL)
+		goto out;
+
+	/* SOCK_PACKET sockets use their own ops table. */
+	sock->ops = &packet_ops;
+	if (sock->type == SOCK_PACKET)
+		sock->ops = &packet_ops_spkt;
+
+	sock_init_data(sock, sk);
+
+	po = pkt_sk(sk);
+	sk->sk_family = PF_PACKET;
+	po->num = proto;
+
+	sk->sk_destruct = packet_sock_destruct;
+	sk_refcnt_debug_inc(sk);
+
+	/*
+	 *	Attach a protocol block
+	 */
+
+	spin_lock_init(&po->bind_lock);
+	mutex_init(&po->pg_vec_lock);
+	/* SOCK_PACKET sockets also use a different receive handler. */
+	po->prot_hook.func = packet_rcv;
+
+	if (sock->type == SOCK_PACKET)
+		po->prot_hook.func = packet_rcv_spkt;
+
+	po->prot_hook.af_packet_priv = sk;
+
+	/* A non-zero protocol registers the receive hook right away. */
+	if (proto) {
+		po->prot_hook.type = proto;
+		dev_add_pack(&po->prot_hook);
+		/* Reference held for as long as the hook is registered. */
+		sock_hold(sk);
+		po->running = 1;
+	}
+
+	/* Add the socket to the namespace-wide list of packet sockets. */
+	spin_lock_bh(&net->packet.sklist_lock);
+	sk_add_node_rcu(sk, &net->packet.sklist);
+	sock_prot_inuse_add(net, &packet_proto, 1);
+	spin_unlock_bh(&net->packet.sklist_lock);
+
+	return 0;
+out:
+	return err;
+}
+
+/*
+ * Deliver one entry from the socket error queue (MSG_ERRQUEUE receive).
+ *
+ * Copies up to @len bytes of the queued skb to the caller, attaches the
+ * receive timestamp and the extended error as a PACKET_TX_TIMESTAMP
+ * control message, then recomputes sk->sk_err from the next queued entry.
+ *
+ * Returns the number of bytes copied, -EAGAIN if the queue is empty, or
+ * the error from skb_copy_datagram_iovec().
+ */
+static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb, *next;
+	int copied, err;
+
+	skb = skb_dequeue(&sk->sk_error_queue);
+	if (!skb)
+		return -EAGAIN;
+
+	copied = skb->len;
+	if (copied > len) {
+		copied = len;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto free_skb;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	serr = SKB_EXT_ERR(skb);
+	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
+		 sizeof(serr->ee), &serr->ee);
+
+	msg->msg_flags |= MSG_ERRQUEUE;
+	err = copied;
+
+	/* Reset and regenerate socket error */
+	spin_lock_bh(&sk->sk_error_queue.lock);
+	sk->sk_err = 0;
+	next = skb_peek(&sk->sk_error_queue);
+	if (next != NULL)
+		sk->sk_err = SKB_EXT_ERR(next)->ee.ee_errno;
+	spin_unlock_bh(&sk->sk_error_queue.lock);
+
+	/* Report outside the queue lock; next is only tested, not used. */
+	if (next != NULL)
+		sk->sk_error_report(sk);
+
+free_skb:
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ *	Pull a packet from our receive queue and hand it to the user.
+ *	If necessary we block.
+ */
+
+/*
+ * recvmsg for packet sockets.
+ *
+ * With MSG_ERRQUEUE the request is served from the error queue by
+ * packet_recv_error().  Otherwise one skb is taken from the receive
+ * queue; if po->has_vnet_hdr is set a struct virtio_net_hdr describing
+ * its GSO/checksum state is written first, then up to @len bytes of
+ * packet data are copied out.  The source address and, if enabled,
+ * PACKET_AUXDATA are filled in as well.
+ *
+ * Returns the virtio header length plus the bytes copied (or the full
+ * skb length when MSG_TRUNC was passed), or a negative errno.
+ */
+static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
+			  struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int copied, err;
+	struct sockaddr_ll *sll;
+	int vnet_hdr_len = 0;
+
+	/* Reject any flag outside the supported set. */
+	err = -EINVAL;
+	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
+		goto out;
+
+#if 0
+	/* What error should we return now? EUNATTACH? */
+	if (pkt_sk(sk)->ifindex < 0)
+		return -ENODEV;
+#endif
+
+	if (flags & MSG_ERRQUEUE) {
+		err = packet_recv_error(sk, msg, len);
+		goto out;
+	}
+
+	/*
+	 *	Call the generic datagram receiver. This handles all sorts
+	 *	of horrible races and re-entrancy so we can forget about it
+	 *	in the protocol layers.
+	 *
+	 *	Now it will return ENETDOWN, if device have just gone down,
+	 *	but then it will block.
+	 */
+
+	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+
+	/*
+	 *	An error occurred so return it. Because skb_recv_datagram()
+	 *	handles the blocking we don't see and worry about blocking
+	 *	retries.
+	 */
+
+	if (skb == NULL)
+		goto out;
+
+	if (pkt_sk(sk)->has_vnet_hdr) {
+		struct virtio_net_hdr vnet_hdr = { 0 };
+
+		/* The buffer must at least have room for the virtio header. */
+		err = -EINVAL;
+		vnet_hdr_len = sizeof(vnet_hdr);
+		if (len < vnet_hdr_len)
+			goto out_free;
+
+		len -= vnet_hdr_len;
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			vnet_hdr.hdr_len = skb_headlen(skb);
+			vnet_hdr.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			/* FCoE has no virtio GSO type here: fail with -EINVAL. */
+			else if (sinfo->gso_type & SKB_GSO_FCOE)
+				goto out_free;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			vnet_hdr.csum_start = skb_checksum_start_offset(skb);
+			vnet_hdr.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
+				     vnet_hdr_len);
+		if (err < 0)
+			goto out_free;
+	}
+
+	/*
+	 *	If the address length field is there to be filled in, we fill
+	 *	it in now.
+	 */
+
+	sll = &PACKET_SKB_CB(skb)->sa.ll;
+	if (sock->type == SOCK_PACKET)
+		msg->msg_namelen = sizeof(struct sockaddr_pkt);
+	else
+		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
+
+	/*
+	 *	You lose any data beyond the buffer you gave. If it worries a
+	 *	user program they can ask the device for its MTU anyway.
+	 */
+
+	copied = skb->len;
+	if (copied > len) {
+		copied = len;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto out_free;
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+
+	/* The source address was stashed in the skb control block. */
+	if (msg->msg_name)
+		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
+		       msg->msg_namelen);
+
+	if (pkt_sk(sk)->auxdata) {
+		struct tpacket_auxdata aux;
+
+		aux.tp_status = TP_STATUS_USER;
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
+		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
+		aux.tp_snaplen = skb->len;
+		aux.tp_mac = 0;
+		aux.tp_net = skb_network_offset(skb);
+		if (vlan_tx_tag_present(skb)) {
+			aux.tp_vlan_tci = vlan_tx_tag_get(skb);
+			aux.tp_status |= TP_STATUS_VLAN_VALID;
+		} else {
+			aux.tp_vlan_tci = 0;
+		}
+		aux.tp_padding = 0;
+		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
+	}
+
+	/*
+	 *	Free or return the buffer as appropriate. Again this
+	 *	hides all the races and re-entrancy issues from us.
+	 */
+	/* With MSG_TRUNC the full packet length is reported, not the copy size. */
+	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
+
+out_free:
+	skb_free_datagram(sk, skb);
+out:
+	return err;
+}
+
+/*
+ * getsockname() for SOCK_PACKET sockets: report the name of the bound
+ * device in uaddr->sa_data, or an all-zero name if the device cannot be
+ * found.  getpeername() is not supported (-EOPNOTSUPP).
+ */
+static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
+			       int *uaddr_len, int peer)
+{
+	struct net_device *dev;
+	struct sock *sk	= sock->sk;
+
+	if (peer)
+		return -EOPNOTSUPP;
+
+	uaddr->sa_family = AF_PACKET;
+	/*
+	 * Zero the whole field first so no stale bytes are handed back,
+	 * then copy with strlcpy() so the name is always NUL-terminated
+	 * even when the device name is as long as sa_data itself.
+	 */
+	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
+	if (dev)
+		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
+	rcu_read_unlock();
+	*uaddr_len = sizeof(*uaddr);
+
+	return 0;
+}
+
+/*
+ * getsockname() for SOCK_RAW / SOCK_DGRAM packet sockets: fill a
+ * sockaddr_ll with the bound ifindex and protocol plus the hardware type
+ * and address of the bound device (zeroed type and length if the device
+ * cannot be found).  getpeername() is not supported (-EOPNOTSUPP).
+ */
+static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
+			  int *uaddr_len, int peer)
+{
+	struct sock *sk = sock->sk;
+	struct packet_sock *po = pkt_sk(sk);
+	struct net_device *dev;
+	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
+
+	if (peer)
+		return -EOPNOTSUPP;
+
+	sll->sll_family = AF_PACKET;
+	sll->sll_ifindex = po->ifindex;
+	sll->sll_protocol = po->num;
+	sll->sll_pkttype = 0;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
+	if (!dev) {
+		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
+		sll->sll_halen = 0;
+	} else {
+		sll->sll_hatype = dev->type;
+		sll->sll_halen = dev->addr_len;
+		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
+	}
+	rcu_read_unlock();
+
+	/* Reported length covers the fixed part plus the address bytes. */
+	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
+
+	return 0;
+}
+
+/*
+ * Apply (@what > 0) or revert (otherwise) one membership entry on @dev.
+ *
+ * Multicast and unicast entries must carry an address of exactly
+ * dev->addr_len bytes, else -EINVAL.  Promiscuous and all-multicast
+ * entries pass @what straight through to the device.  Unknown entry
+ * types are ignored and yield 0.
+ */
+static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
+			 int what)
+{
+	switch (i->type) {
+	case PACKET_MR_MULTICAST:
+		if (i->alen != dev->addr_len)
+			return -EINVAL;
+		return (what > 0) ? dev_mc_add(dev, i->addr)
+				  : dev_mc_del(dev, i->addr);
+	case PACKET_MR_PROMISC:
+		return dev_set_promiscuity(dev, what);
+	case PACKET_MR_ALLMULTI:
+		return dev_set_allmulti(dev, what);
+	case PACKET_MR_UNICAST:
+		if (i->alen != dev->addr_len)
+			return -EINVAL;
+		return (what > 0) ? dev_uc_add(dev, i->addr)
+				  : dev_uc_del(dev, i->addr);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Walk the membership list starting at @i and apply @what to every entry
+ * that refers to @dev; results of the individual calls are ignored.
+ */
+static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
+{
+	struct packet_mclist *ml;
+
+	for (ml = i; ml != NULL; ml = ml->next) {
+		if (ml->ifindex != dev->ifindex)
+			continue;
+		packet_dev_mc(dev, ml, what);
+	}
+}
+
+/*
+ * Add a membership described by @mreq to the socket (runs under RTNL).
+ *
+ * If an identical entry (same ifindex, type, address length and address)
+ * already exists, only its use count is bumped.  Otherwise a new entry is
+ * linked at the head of po->mclist and applied to the device; if the
+ * device rejects it the entry is unlinked and freed again.
+ *
+ * Returns 0 on success, -ENODEV for an unknown ifindex, -EINVAL if the
+ * address is longer than the device's, -ENOBUFS on allocation failure, or
+ * the error from packet_dev_mc().
+ */
+static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_mclist *ml, *i;
+	struct net_device *dev;
+	int err;
+
+	rtnl_lock();
+
+	err = -ENODEV;
+	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
+	if (!dev)
+		goto done;
+
+	err = -EINVAL;
+	if (mreq->mr_alen > dev->addr_len)
+		goto done;
+
+	/* Allocated up front, before it is known whether it will be needed. */
+	err = -ENOBUFS;
+	i = kmalloc(sizeof(*i), GFP_KERNEL);
+	if (i == NULL)
+		goto done;
+
+	err = 0;
+	for (ml = po->mclist; ml; ml = ml->next) {
+		if (ml->ifindex == mreq->mr_ifindex &&
+		    ml->type == mreq->mr_type &&
+		    ml->alen == mreq->mr_alen &&
+		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
+			ml->count++;
+			/* Free the new element ... */
+			kfree(i);
+			goto done;
+		}
+	}
+
+	i->type = mreq->mr_type;
+	i->ifindex = mreq->mr_ifindex;
+	i->alen = mreq->mr_alen;
+	memcpy(i->addr, mreq->mr_address, i->alen);
+	i->count = 1;
+	/* Link at the head first so a failure can simply unlink it again. */
+	i->next = po->mclist;
+	po->mclist = i;
+	err = packet_dev_mc(dev, i, 1);
+	if (err) {
+		po->mclist = i->next;
+		kfree(i);
+	}
+
+done:
+	rtnl_unlock();
+	return err;
+}
+
+/*
+ * Drop one use of the membership described by @mreq (runs under RTNL).
+ *
+ * When the use count of the matching entry reaches zero it is unlinked,
+ * reverted on the device (if the device still exists) and freed.
+ *
+ * Returns 0 if a matching entry was found, -EADDRNOTAVAIL otherwise.
+ */
+static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
+{
+	struct packet_mclist *ml, **link;
+	int err = -EADDRNOTAVAIL;
+
+	rtnl_lock();
+
+	for (link = &pkt_sk(sk)->mclist; (ml = *link) != NULL; link = &ml->next) {
+		struct net_device *dev;
+
+		if (ml->ifindex != mreq->mr_ifindex ||
+		    ml->type != mreq->mr_type ||
+		    ml->alen != mreq->mr_alen ||
+		    memcmp(ml->addr, mreq->mr_address, ml->alen) != 0)
+			continue;
+
+		if (--ml->count == 0) {
+			*link = ml->next;
+			dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
+			if (dev)
+				packet_dev_mc(dev, ml, -1);
+			kfree(ml);
+		}
+		err = 0;
+		/* Leave before the loop step can touch the freed entry. */
+		break;
+	}
+
+	rtnl_unlock();
+	return err;
+}
+
+/*
+ * Remove every membership of the socket: each entry is unlinked, reverted
+ * on its device when that device still exists, and freed.  RTNL is taken
+ * only if the list is non-empty.
+ */
+static void packet_flush_mclist(struct sock *sk)
+{
+	struct packet_sock *po = pkt_sk(sk);
+	struct packet_mclist *ml;
+	struct net_device *dev;
+
+	if (po->mclist == NULL)
+		return;
+
+	rtnl_lock();
+	/* Always pop the current head; the step re-reads the new head. */
+	for (ml = po->mclist; ml != NULL; ml = po->mclist) {
+		po->mclist = ml->next;
+		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
+		if (dev)
+			packet_dev_mc(dev, ml, -1);
+		kfree(ml);
+	}
+	rtnl_unlock();
+}
+
+/*
+ * setsockopt() for SOL_PACKET options.
+ *
+ * Handles membership add/drop, RX/TX ring setup, copy threshold, ring
+ * header version, reserve, loss mode, auxdata, origdev, virtio-net header
+ * and timestamp selection.  Options that change the ring layout
+ * (version, reserve, loss, vnet header) are refused with -EBUSY while a
+ * ring exists.
+ *
+ * Returns 0 or the handler's result on success, -ENOPROTOOPT for another
+ * level or an unknown option, -EINVAL for a bad length or value, -EFAULT
+ * if the option value cannot be read from user space.
+ */
+static int
+packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct packet_sock *po = pkt_sk(sk);
+	int ret;
+
+	if (level != SOL_PACKET)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case PACKET_ADD_MEMBERSHIP:
+	case PACKET_DROP_MEMBERSHIP:
+	{
+		struct packet_mreq_max mreq;
+		int len = optlen;
+		memset(&mreq, 0, sizeof(mreq));
+		/* Accept anything from a plain packet_mreq up to packet_mreq_max. */
+		if (len < sizeof(struct packet_mreq))
+			return -EINVAL;
+		if (len > sizeof(mreq))
+			len = sizeof(mreq);
+		if (copy_from_user(&mreq, optval, len))
+			return -EFAULT;
+		/* The claimed address length must fit in what was copied. */
+		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
+			return -EINVAL;
+		if (optname == PACKET_ADD_MEMBERSHIP)
+			ret = packet_mc_add(sk, &mreq);
+		else
+			ret = packet_mc_drop(sk, &mreq);
+		return ret;
+	}
+
+	case PACKET_RX_RING:
+	case PACKET_TX_RING:
+	{
+		struct tpacket_req req;
+
+		if (optlen < sizeof(req))
+			return -EINVAL;
+		/* Rings and the virtio-net header are mutually exclusive. */
+		if (pkt_sk(sk)->has_vnet_hdr)
+			return -EINVAL;
+		if (copy_from_user(&req, optval, sizeof(req)))
+			return -EFAULT;
+		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
+	}
+	case PACKET_COPY_THRESH:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		pkt_sk(sk)->copy_thresh = val;
+		return 0;
+	}
+	case PACKET_VERSION:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+			return -EBUSY;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+		switch (val) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			po->tp_version = val;
+			return 0;
+		default:
+			return -EINVAL;
+		}
+	}
+	case PACKET_RESERVE:
+	{
+		unsigned int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+			return -EBUSY;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+		/*
+		 * tp_reserve is later added to header lengths when frame
+		 * offsets are computed; cap it so that sum cannot wrap.
+		 */
+		if (val > INT_MAX)
+			return -EINVAL;
+		po->tp_reserve = val;
+		return 0;
+	}
+	case PACKET_LOSS:
+	{
+		unsigned int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+			return -EBUSY;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+		po->tp_loss = !!val;
+		return 0;
+	}
+	case PACKET_AUXDATA:
+	{
+		int val;
+
+		if (optlen < sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->auxdata = !!val;
+		return 0;
+	}
+	case PACKET_ORIGDEV:
+	{
+		int val;
+
+		if (optlen < sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->origdev = !!val;
+		return 0;
+	}
+	case PACKET_VNET_HDR:
+	{
+		int val;
+
+		/* Only SOCK_RAW sockets may carry a virtio-net header. */
+		if (sock->type != SOCK_RAW)
+			return -EINVAL;
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
+			return -EBUSY;
+		if (optlen < sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->has_vnet_hdr = !!val;
+		return 0;
+	}
+	case PACKET_TIMESTAMP:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->tp_tstamp = val;
+		return 0;
+	}
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ * getsockopt() for SOL_PACKET options.
+ *
+ * The caller's length is clamped to the size of the option value, the
+ * clamped length is written back through @optlen and that many bytes of
+ * the value are copied to @optval.
+ *
+ * PACKET_STATISTICS returns and resets the counters (tp_packets is
+ * reported including drops).  PACKET_HDRLEN is an in/out option: the
+ * caller passes a TPACKET version and receives the matching header size.
+ *
+ * Returns 0 on success, -ENOPROTOOPT for another level or an unknown
+ * option, -EINVAL for a negative length, a short PACKET_HDRLEN buffer or
+ * an unknown version, -EFAULT on a user-copy failure.
+ */
+static int packet_getsockopt(struct socket *sock, int level, int optname,
+			     char __user *optval, int __user *optlen)
+{
+	int len;
+	int val;
+	struct sock *sk = sock->sk;
+	struct packet_sock *po = pkt_sk(sk);
+	void *data;
+	struct tpacket_stats st;
+
+	if (level != SOL_PACKET)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case PACKET_STATISTICS:
+		if (len > sizeof(struct tpacket_stats))
+			len = sizeof(struct tpacket_stats);
+		/* Snapshot and reset under the receive-queue lock. */
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		st = po->stats;
+		memset(&po->stats, 0, sizeof(st));
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		st.tp_packets += st.tp_drops;
+
+		data = &st;
+		break;
+	case PACKET_AUXDATA:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->auxdata;
+
+		data = &val;
+		break;
+	case PACKET_ORIGDEV:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->origdev;
+
+		data = &val;
+		break;
+	case PACKET_VNET_HDR:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->has_vnet_hdr;
+
+		data = &val;
+		break;
+	case PACKET_VERSION:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->tp_version;
+		data = &val;
+		break;
+	case PACKET_HDRLEN:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		/*
+		 * The version is read from user space into val.  A shorter
+		 * buffer would leave part of val uninitialised before it is
+		 * interpreted, so insist on a full int.
+		 */
+		if (len < sizeof(int))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, len))
+			return -EFAULT;
+		switch (val) {
+		case TPACKET_V1:
+			val = sizeof(struct tpacket_hdr);
+			break;
+		case TPACKET_V2:
+			val = sizeof(struct tpacket2_hdr);
+			break;
+		default:
+			return -EINVAL;
+		}
+		data = &val;
+		break;
+	case PACKET_RESERVE:
+		if (len > sizeof(unsigned int))
+			len = sizeof(unsigned int);
+		val = po->tp_reserve;
+		data = &val;
+		break;
+	case PACKET_LOSS:
+		if (len > sizeof(unsigned int))
+			len = sizeof(unsigned int);
+		val = po->tp_loss;
+		data = &val;
+		break;
+	case PACKET_TIMESTAMP:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->tp_tstamp;
+		data = &val;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, data, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Netdevice event handler: unhook sockets bound to a device that goes down or away, rehook them when it comes up. */
+static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
+{
+	struct sock *sk;
+	struct hlist_node *node;
+	struct net_device *dev = data;
+	struct net *net = dev_net(dev);
+	/* The socket list is walked under RCU; each socket's binding state is changed under po->bind_lock. */
+	rcu_read_lock();
+	sk_for_each_rcu(sk, node, &net->packet.sklist) {
+		struct packet_sock *po = pkt_sk(sk);
+		/* NETDEV_UNREGISTER runs its own step and then continues into the NETDEV_DOWN handling. */
+		switch (msg) {
+		case NETDEV_UNREGISTER:
+			if (po->mclist)
+				packet_dev_mclist(dev, po->mclist, -1);
+			/* fallthrough */
+
+		case NETDEV_DOWN:
+			if (dev->ifindex == po->ifindex) {
+				spin_lock(&po->bind_lock);
+				if (po->running) {
+					__dev_remove_pack(&po->prot_hook);
+					__sock_put(sk);	/* pairs with the sock_hold() done when the hook is added */
+					po->running = 0;
+					sk->sk_err = ENETDOWN;
+					if (!sock_flag(sk, SOCK_DEAD))
+						sk->sk_error_report(sk);
+				}
+				if (msg == NETDEV_UNREGISTER) {	/* device is going away: forget the binding */
+					po->ifindex = -1;
+					po->prot_hook.dev = NULL;
+				}
+				spin_unlock(&po->bind_lock);
+			}
+			break;
+		case NETDEV_UP:
+			if (dev->ifindex == po->ifindex) {
+				spin_lock(&po->bind_lock);
+				if (po->num && !po->running) {	/* only sockets with a protocol set are re-armed */
+					dev_add_pack(&po->prot_hook);
+					sock_hold(sk);
+					po->running = 1;
+				}
+				spin_unlock(&po->bind_lock);
+			}
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return NOTIFY_DONE;
+}
+
+/* ioctl handler used by both packet proto_ops tables; commands it does not know get -ENOIOCTLCMD. */
+static int packet_ioctl(struct socket *sock, unsigned int cmd,
+			unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+
+	switch (cmd) {
+	case SIOCOUTQ:
+	{
+		int amount = sk_wmem_alloc_get(sk);
+
+		return put_user(amount, (int __user *)arg);
+	}
+	case SIOCINQ:
+	{
+		struct sk_buff *skb;
+		int amount = 0;
+		/* Reports the length of the first queued skb only (0 when the queue is empty). */
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			amount = skb->len;
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+		return put_user(amount, (int __user *)arg);
+	}
+	case SIOCGSTAMP:
+		return sock_get_timestamp(sk, (struct timeval __user *)arg);
+	case SIOCGSTAMPNS:
+		return sock_get_timestampns(sk, (struct timespec __user *)arg);
+	/* With CONFIG_INET the route, ARP and interface-address commands below go to the inet datagram ioctl. */
+#ifdef CONFIG_INET
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCDARP:
+	case SIOCGARP:
+	case SIOCSARP:
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCSIFFLAGS:
+		return inet_dgram_ops.ioctl(sock, cmd, arg);
+#endif
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return 0;	/* not reached: every switch arm returns */
+}
+/* poll() handler: the datagram_poll() mask plus readiness bits derived from the RX and TX rings, if present. */
+static unsigned int packet_poll(struct file *file, struct socket *sock,
+				poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct packet_sock *po = pkt_sk(sk);
+	unsigned int mask = datagram_poll(file, sock, wait);
+	/* RX ring: readable when packet_previous_frame() finds no TP_STATUS_KERNEL frame (presumably: it holds data). */
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	if (po->rx_ring.pg_vec) {
+		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
+			mask |= POLLIN | POLLRDNORM;
+	}
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+	spin_lock_bh(&sk->sk_write_queue.lock);
+	if (po->tx_ring.pg_vec) {	/* TX ring: writable when the current frame is TP_STATUS_AVAILABLE */
+		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
+			mask |= POLLOUT | POLLWRNORM;
+	}
+	spin_unlock_bh(&sk->sk_write_queue.lock);
+	return mask;
+}
+
+
+/*
+ * Dirty? Well, I have not yet found a better way to account for user
+ * mmaps: the VMA hooks adjust po->mapped as VMAs are opened and closed.
+ */
+static void packet_mm_open(struct vm_area_struct *vma)
+{
+	struct socket *sock = vma->vm_file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (!sk)
+		return;
+	atomic_inc(&pkt_sk(sk)->mapped);
+}
+
+/* VMA close hook, counterpart of packet_mm_open(): drop one po->mapped count. */
+static void packet_mm_close(struct vm_area_struct *vma)
+{
+	struct socket *sock = vma->vm_file->private_data;
+	struct sock *sk = sock->sk;
+
+	if (sk != NULL)
+		atomic_dec(&pkt_sk(sk)->mapped);
+}
+
+static const struct vm_operations_struct packet_mmap_ops = {
+	.open	=	packet_mm_open,	/* increments po->mapped */
+	.close	=	packet_mm_close,	/* decrements po->mapped */
+};
+
+/*
+ * Release the first @len block buffers of @pg_vec and then the vector
+ * itself.  Buffers in vmalloc space go to vfree(), the others to
+ * free_pages() with @order; empty slots are skipped.
+ */
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (unlikely(!pg_vec[i].buffer))
+			continue;
+
+		if (is_vmalloc_addr(pg_vec[i].buffer))
+			vfree(pg_vec[i].buffer);
+		else
+			free_pages((unsigned long)pg_vec[i].buffer, order);
+		pg_vec[i].buffer = NULL;
+	}
+	kfree(pg_vec);
+}
+
+/*
+ * Allocate one zeroed ring block of 2^order pages.
+ *
+ * Three attempts are made in turn: a page allocation with __GFP_NORETRY,
+ * vzalloc(), and finally the same page allocation without __GFP_NORETRY.
+ * Returns the buffer, or NULL when all three fail.
+ */
+static inline char *alloc_one_pg_vec_page(unsigned long order)
+{
+	const gfp_t base = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer;
+
+	buffer = (char *)__get_free_pages(base | __GFP_NORETRY, order);
+	if (!buffer) {
+		/* __get_free_pages failed, fall back to vmalloc */
+		buffer = vzalloc((1 << order) * PAGE_SIZE);
+	}
+	if (!buffer) {
+		/* vmalloc failed too, lets dig into swap here */
+		buffer = (char *)__get_free_pages(base, order);
+	}
+
+	/* Still NULL at this point means complete and utter failure. */
+	return buffer;
+}
+
+/*
+ * Build the block vector for a ring: req->tp_block_nr buffers of 2^order
+ * pages each.  Returns the vector, or NULL with everything released when
+ * the vector or any block cannot be allocated.
+ */
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
+{
+	unsigned int nr_blocks = req->tp_block_nr;
+	struct pgv *vec;
+	int i;
+
+	vec = kcalloc(nr_blocks, sizeof(struct pgv), GFP_KERNEL);
+	if (unlikely(!vec))
+		return NULL;
+
+	for (i = 0; i < nr_blocks; i++) {
+		vec[i].buffer = alloc_one_pg_vec_page(order);
+		if (likely(vec[i].buffer))
+			continue;
+
+		/* free_pg_vec() skips the slots that are still NULL. */
+		free_pg_vec(vec, order, nr_blocks);
+		return NULL;
+	}
+
+	return vec;
+}
+
+/*
+ * Install, replace or tear down one ring buffer of a packet socket.
+ *
+ * @sk:      packet socket
+ * @req:     requested geometry; tp_block_nr == 0 asks for the ring to be
+ *           removed (tp_frame_nr must then be 0 as well).  Note that
+ *           req->tp_block_nr is swapped with the previous ring length.
+ * @closing: non-zero skips the "still mapped" and "frames pending" checks
+ * @tx_ring: non-zero selects the TX ring, zero the RX ring
+ *
+ * The protocol hook is removed while the ring is swapped and added back
+ * afterwards if the socket was running.  Returns 0 on success, -EBUSY if
+ * the ring is in use or already set up, -EINVAL for an inconsistent
+ * geometry, -ENOMEM if the blocks cannot be allocated.
+ */
+static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+		int closing, int tx_ring)
+{
+	struct pgv *pg_vec = NULL;
+	struct packet_sock *po = pkt_sk(sk);
+	int was_running, order = 0;
+	struct packet_ring_buffer *rb;
+	struct sk_buff_head *rb_queue;
+	__be16 num;
+	int err;
+
+	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
+	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+
+	err = -EBUSY;
+	if (!closing) {
+		if (atomic_read(&po->mapped))
+			goto out;
+		if (atomic_read(&rb->pending))
+			goto out;
+	}
+
+	if (req->tp_block_nr) {
+		/* Sanity tests and some calculations */
+		err = -EBUSY;
+		if (unlikely(rb->pg_vec))
+			goto out;
+
+		switch (po->tp_version) {
+		case TPACKET_V1:
+			po->tp_hdrlen = TPACKET_HDRLEN;
+			break;
+		case TPACKET_V2:
+			po->tp_hdrlen = TPACKET2_HDRLEN;
+			break;
+		}
+
+		err = -EINVAL;
+		if (unlikely((int)req->tp_block_size <= 0))
+			goto out;
+		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+			goto out;
+		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
+					po->tp_reserve))
+			goto out;
+		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
+			goto out;
+
+		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
+		if (unlikely(rb->frames_per_block <= 0))
+			goto out;
+		/*
+		 * The product checked next must not wrap: a wrapped value
+		 * could match tp_frame_nr for a geometry that does not fit
+		 * the allocated blocks.  tp_block_nr is non-zero here.
+		 */
+		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
+			goto out;
+		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
+					req->tp_frame_nr))
+			goto out;
+
+		err = -ENOMEM;
+		order = get_order(req->tp_block_size);
+		pg_vec = alloc_pg_vec(req, order);
+		if (unlikely(!pg_vec))
+			goto out;
+	}
+	/* Done */
+	else {
+		err = -EINVAL;
+		if (unlikely(req->tp_frame_nr))
+			goto out;
+	}
+
+	lock_sock(sk);
+
+	/* Detach socket from network */
+	spin_lock(&po->bind_lock);
+	was_running = po->running;
+	num = po->num;
+	if (was_running) {
+		__dev_remove_pack(&po->prot_hook);
+		po->num = 0;
+		po->running = 0;
+		__sock_put(sk);
+	}
+	spin_unlock(&po->bind_lock);
+
+	synchronize_net();
+
+	err = -EBUSY;
+	mutex_lock(&po->pg_vec_lock);
+	if (closing || atomic_read(&po->mapped) == 0) {
+		err = 0;
+		/*
+		 * Swap the new ring in.  After the three swap() calls,
+		 * pg_vec/order/req->tp_block_nr describe the previous ring,
+		 * which is what gets freed at the end.
+		 */
+		spin_lock_bh(&rb_queue->lock);
+		swap(rb->pg_vec, pg_vec);
+		rb->frame_max = (req->tp_frame_nr - 1);
+		rb->head = 0;
+		rb->frame_size = req->tp_frame_size;
+		spin_unlock_bh(&rb_queue->lock);
+
+		swap(rb->pg_vec_order, order);
+		swap(rb->pg_vec_len, req->tp_block_nr);
+
+		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
+		/* Receive through the ring only while an RX ring exists. */
+		po->prot_hook.func = (po->rx_ring.pg_vec) ?
+						tpacket_rcv : packet_rcv;
+		skb_queue_purge(rb_queue);
+		if (atomic_read(&po->mapped))
+			pr_err("packet_mmap: vma is busy: %d\n",
+			       atomic_read(&po->mapped));
+	}
+	mutex_unlock(&po->pg_vec_lock);
+
+	/* Re-attach the socket if it was running before the swap. */
+	spin_lock(&po->bind_lock);
+	if (was_running && !po->running) {
+		sock_hold(sk);
+		po->running = 1;
+		po->num = num;
+		dev_add_pack(&po->prot_hook);
+	}
+	spin_unlock(&po->bind_lock);
+
+	release_sock(sk);
+
+	/* Either the previous ring, or the unused new one on -EBUSY. */
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->tp_block_nr);
+out:
+	return err;
+}
+/* mmap() handler: maps the RX ring, then the TX ring, page by page into one VMA; returns 0 or a negative errno. */
+static int packet_mmap(struct file *file, struct socket *sock,
+		struct vm_area_struct *vma)
+{
+	struct sock *sk = sock->sk;
+	struct packet_sock *po = pkt_sk(sk);
+	unsigned long size, expected_size;
+	struct packet_ring_buffer *rb;
+	unsigned long start;
+	int err = -EINVAL;
+	int i;
+	/* Only a mapping at file offset 0 is accepted. */
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	mutex_lock(&po->pg_vec_lock);
+	/* The rb++ loops assume tx_ring directly follows rx_ring in struct packet_sock - TODO confirm. */
+	expected_size = 0;
+	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+		if (rb->pg_vec) {
+			expected_size += rb->pg_vec_len
+						* rb->pg_vec_pages
+						* PAGE_SIZE;
+		}
+	}
+
+	if (expected_size == 0)
+		goto out;
+	/* The VMA has to cover all configured rings exactly. */
+	size = vma->vm_end - vma->vm_start;
+	if (size != expected_size)
+		goto out;
+
+	start = vma->vm_start;
+	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
+		if (rb->pg_vec == NULL)
+			continue;
+
+		for (i = 0; i < rb->pg_vec_len; i++) {
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
+			int pg_num;
+
+			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+				page = pgv_to_page(kaddr);
+				err = vm_insert_page(vma, start, page);
+				if (unlikely(err))
+					goto out;
+				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
+			}
+		}
+	}
+
+	atomic_inc(&po->mapped);	/* first count; packet_mmap_ops open/close adjust it afterwards */
+	vma->vm_ops = &packet_mmap_ops;
+	err = 0;
+
+out:
+	mutex_unlock(&po->pg_vec_lock);
+	return err;
+}
+/* proto_ops using the *_spkt bind/getname/sendmsg variants; socket options and mmap are the sock_no_* stubs. */
+static const struct proto_ops packet_ops_spkt = {
+	.family =	PF_PACKET,
+	.owner =	THIS_MODULE,
+	.release =	packet_release,
+	.bind =		packet_bind_spkt,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	packet_getname_spkt,
+	.poll =		datagram_poll,
+	.ioctl =	packet_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	sock_no_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	packet_sendmsg_spkt,
+	.recvmsg =	packet_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+/* proto_ops wiring in packet_poll, packet_setsockopt/packet_getsockopt and packet_mmap (ring buffer support). */
+static const struct proto_ops packet_ops = {
+	.family =	PF_PACKET,
+	.owner =	THIS_MODULE,
+	.release =	packet_release,
+	.bind =		packet_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	packet_getname,
+	.poll =		packet_poll,
+	.ioctl =	packet_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	packet_setsockopt,
+	.getsockopt =	packet_getsockopt,
+	.sendmsg =	packet_sendmsg,
+	.recvmsg =	packet_recvmsg,
+	.mmap =		packet_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+static const struct net_proto_family packet_family_ops = {
+	.family =	PF_PACKET,
+	.create =	packet_create,	/* socket constructor for PF_PACKET */
+	.owner	=	THIS_MODULE,
+};
+
+static struct notifier_block packet_netdev_notifier = {
+	.notifier_call =	packet_notifier,	/* registered in packet_init() */
+};
+
+#ifdef CONFIG_PROC_FS
+/* seq_file start: take the RCU read lock and position at *pos in this namespace's packet socket list. */
+static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct net *net = seq_file_net(seq);
+
+	rcu_read_lock();
+	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
+}
+/* seq_file next: step from entry v to the following entry of the same socket list. */
+static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
+}
+/* seq_file stop: release the RCU read lock taken in packet_seq_start(). */
+static void packet_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file show: print the column header for the start token, otherwise
+ * one line describing the packet socket behind v.  Always returns 0.
+ */
+static int packet_seq_show(struct seq_file *seq, void *v)
+{
+	const struct packet_sock *po;
+	struct sock *s;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
+		return 0;
+	}
+
+	s = sk_entry(v);
+	po = pkt_sk(s);
+	seq_printf(seq,
+		   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
+		   s,
+		   atomic_read(&s->sk_refcnt),
+		   s->sk_type,
+		   ntohs(po->num),
+		   po->ifindex,
+		   po->running,
+		   atomic_read(&s->sk_rmem_alloc),
+		   sock_i_uid(s),
+		   sock_i_ino(s));
+
+	return 0;
+}
+
+static const struct seq_operations packet_seq_ops = {
+	.start	= packet_seq_start,	/* takes the RCU read lock */
+	.next	= packet_seq_next,
+	.stop	= packet_seq_stop,	/* drops the RCU read lock */
+	.show	= packet_seq_show,
+};
+/* open() of the proc file: set up a net-namespace aware seq_file iterating with packet_seq_ops. */
+static int packet_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &packet_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+static const struct file_operations packet_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= packet_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,	/* counterpart of seq_open_net() used in packet_seq_open() */
+};
+
+#endif
+
+static int __net_init packet_net_init(struct net *net)
+{
+	spin_lock_init(&net->packet.sklist_lock);
+	INIT_HLIST_HEAD(&net->packet.sklist);
+
+	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __net_exit packet_net_exit(struct net *net)
+{
+	proc_net_remove(net, "packet");
+}
+
+static struct pernet_operations packet_net_ops = {
+	.init = packet_net_init,
+	.exit = packet_net_exit,
+};
+
+
+static void __exit packet_exit(void)
+{
+	unregister_netdevice_notifier(&packet_netdev_notifier);
+	unregister_pernet_subsys(&packet_net_ops);
+	sock_unregister(PF_PACKET);
+	proto_unregister(&packet_proto);
+}
+
+static int __init packet_init(void)
+{
+	int rc = proto_register(&packet_proto, 0);
+
+	if (rc != 0)
+		goto out;
+
+	sock_register(&packet_family_ops);
+	register_pernet_subsys(&packet_net_ops);
+	register_netdevice_notifier(&packet_netdev_notifier);
+out:
+	return rc;
+}
+
+module_init(packet_init);
+module_exit(packet_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_PACKET);
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
new file mode 100644
index 00000000..6ec7d55b
--- /dev/null
+++ b/net/phonet/Kconfig
@@ -0,0 +1,16 @@
+#
+# Phonet protocol
+#
+
+config PHONET
+	tristate "Phonet protocols family"
+	help
+	  The Phone Network protocol (PhoNet) is a packet-oriented
+	  communication protocol developed by Nokia for use with its modems.
+
+	  This is required for Maemo to use cellular data connectivity (if
+	  supported). It can also be used to control Nokia phones
+	  from a Linux computer, although AT commands may be easier to use.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called phonet. If unsure, say N.
diff --git a/net/phonet/Makefile b/net/phonet/Makefile
new file mode 100644
index 00000000..e10b1b18
--- /dev/null
+++ b/net/phonet/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_PHONET) += phonet.o pn_pep.o
+
+phonet-y := \
+	pn_dev.o \
+	pn_netlink.o \
+	socket.o \
+	datagram.o \
+	sysctl.o \
+	af_phonet.o
+
+pn_pep-y := pep.o pep-gprs.o
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
new file mode 100644
index 00000000..c6fffd94
--- /dev/null
+++ b/net/phonet/af_phonet.c
@@ -0,0 +1,548 @@
+/*
+ * File: af_phonet.c
+ *
+ * Phonet protocols family
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include <net/sock.h>
+
+#include <linux/if_phonet.h>
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pn_dev.h>
+
+/* Transport protocol registration: indexed by protocol number, read under RCU, written under proto_tab_lock */
+static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+
+/*
+ * Look up a registered protocol and take a reference on the module that
+ * owns its struct proto.  Returns NULL for an out-of-range or unregistered
+ * protocol number, or when the module reference cannot be taken.
+ * Release with phonet_proto_put().
+ */
+static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
+{
+	struct phonet_protocol *found = NULL;
+
+	if (protocol < PHONET_NPROTO) {
+		rcu_read_lock();
+		found = rcu_dereference(proto_tab[protocol]);
+		if (found != NULL && !try_module_get(found->prot->owner))
+			found = NULL;
+		rcu_read_unlock();
+	}
+
+	return found;
+}
+/* Drop the module reference taken by phonet_proto_get(). */
+static inline void phonet_proto_put(struct phonet_protocol *pp)
+{
+	module_put(pp->prot->owner);
+}
+
+/* protocol family functions */
+/* Socket constructor for PF_PHONET: needs CAP_SYS_ADMIN, resolves the protocol, allocates and initializes the sock. */
+static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
+			    int kern)
+{
+	struct sock *sk;
+	struct pn_sock *pn;
+	struct phonet_protocol *pnp;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (protocol == 0) {
+		/* Default protocol selection */
+		switch (sock->type) {
+		case SOCK_DGRAM:
+			protocol = PN_PROTO_PHONET;
+			break;
+		case SOCK_SEQPACKET:
+			protocol = PN_PROTO_PIPE;
+			break;
+		default:
+			return -EPROTONOSUPPORT;
+		}
+	}
+	/* If the protocol is not registered, request a module by alias and look it up once more. */
+	pnp = phonet_proto_get(protocol);
+	if (pnp == NULL &&
+	    request_module("net-pf-%d-proto-%d", PF_PHONET, protocol) == 0)
+		pnp = phonet_proto_get(protocol);
+
+	if (pnp == NULL)
+		return -EPROTONOSUPPORT;
+	if (sock->type != pnp->sock_type) {
+		err = -EPROTONOSUPPORT;
+		goto out;
+	}
+
+	sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot);
+	if (sk == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	sock_init_data(sock, sk);
+	sock->state = SS_UNCONNECTED;
+	sock->ops = pnp->ops;
+	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+	sk->sk_protocol = protocol;
+	pn = pn_sk(sk);
+	pn->sobject = 0;
+	pn->dobject = 0;
+	pn->resource = 0;
+	sk->sk_prot->init(sk);	/* NOTE(review): the return value is not checked */
+	err = 0;
+	/* The reference from phonet_proto_get() is dropped on success as well as on failure. */
+out:
+	phonet_proto_put(pnp);
+	return err;
+}
+
+static const struct net_proto_family phonet_proto_family = {
+	.family = PF_PHONET,
+	.create = pn_socket_create,	/* registered with sock_register() in phonet_init() */
+	.owner = THIS_MODULE,
+};
+
+/* Phonet device header operations: create pushes a one-byte media header; returns 1, or -1 for a non-Phonet type */
+static int pn_header_create(struct sk_buff *skb, struct net_device *dev,
+				unsigned short type, const void *daddr,
+				const void *saddr, unsigned len)
+{
+	u8 *media = skb_push(skb, 1);
+	/* NOTE(review): the byte is pushed before the type check, so it stays pushed when -1 is returned */
+	if (type != ETH_P_PHONET)
+		return -1;
+	/* No source address supplied: use the first byte of the device address. */
+	if (!saddr)
+		saddr = dev->dev_addr;
+	*media = *(const u8 *)saddr;
+	return 1;
+}
+
+/*
+ * Header parse operation: copy the single media byte found at the MAC
+ * header into haddr.  Always returns 1.
+ */
+static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
+{
+	haddr[0] = skb_mac_header(skb)[0];
+	return 1;
+}
+
+struct header_ops phonet_header_ops = {
+	.create = pn_header_create,	/* prepends the one-byte media header */
+	.parse = pn_header_parse,	/* reads that byte back */
+};
+EXPORT_SYMBOL(phonet_header_ops);
+
+/*
+ * Prepends an ISI header and sends a datagram. On the error paths taken here the skb is freed; returns 0 or an errno.
+ */
+static int pn_send(struct sk_buff *skb, struct net_device *dev,
+			u16 dst, u16 src, u8 res, u8 irq)
+{
+	struct phonethdr *ph;
+	int err;
+	/* The "+ 2" mirrors the offset applied to pn_length below; the packet with header must also fit the MTU. */
+	if (skb->len + 2 > 0xffff /* Phonet length field limit */ ||
+	    skb->len + sizeof(struct phonethdr) > dev->mtu) {
+		err = -EMSGSIZE;
+		goto drop;
+	}
+
+	/* Broadcast sending is not implemented */
+	if (pn_addr(dst) == PNADDR_BROADCAST) {
+		err = -EOPNOTSUPP;
+		goto drop;
+	}
+
+	skb_reset_transport_header(skb);
+	WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */
+	skb_push(skb, sizeof(struct phonethdr));
+	skb_reset_network_header(skb);
+	ph = pn_hdr(skb);
+	ph->pn_rdev = pn_dev(dst);
+	ph->pn_sdev = pn_dev(src);
+	ph->pn_res = res;
+	ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph));
+	ph->pn_robj = pn_obj(dst);
+	ph->pn_sobj = pn_obj(src);
+
+	skb->protocol = htons(ETH_P_PHONET);
+	skb->priority = 0;
+	skb->dev = dev;
+	/* Loopback packets go back into the receive path; irq selects netif_rx() over netif_rx_ni(). */
+	if (skb->pkt_type == PACKET_LOOPBACK) {
+		skb_reset_mac_header(skb);
+		skb_orphan(skb);
+		err = (irq ? netif_rx(skb) : netif_rx_ni(skb)) ? -ENOBUFS : 0;
+	} else {
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+					NULL, NULL, skb->len);
+		if (err < 0) {
+			err = -EHOSTUNREACH;
+			goto drop;
+		}
+		err = dev_queue_xmit(skb);
+		if (unlikely(err > 0))	/* positive transmit status: convert with net_xmit_errno() */
+			err = net_xmit_errno(err);
+	}
+
+	return err;
+drop:
+	kfree_skb(skb);
+	return err;
+}
+/* Build an skb around a copy of data[0..len) and send it with pn_send(); returns -ENOMEM or pn_send()'s result. */
+static int pn_raw_send(const void *data, int len, struct net_device *dev,
+			u16 dst, u16 src, u8 res)
+{
+	struct sk_buff *skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	/* A destination that is one of our own addresses is looped back instead of transmitted. */
+	if (phonet_address_lookup(dev_net(dev), pn_addr(dst)) == 0)
+		skb->pkt_type = PACKET_LOOPBACK;
+	/* Leave MAX_PHONET_HEADER bytes of headroom for the headers pushed in pn_send(). */
+	skb_reserve(skb, MAX_PHONET_HEADER);
+	__skb_put(skb, len);
+	skb_copy_to_linear_data(skb, data, len);
+	return pn_send(skb, dev, dst, src, res, 1);	/* irq = 1: the loopback path uses netif_rx() */
+}
+
+/*
+ * Create a Phonet header for the skb and send it out. Returns
+ * non-zero error code if failed. The skb is freed then.
+ */
+int pn_skb_send(struct sock *sk, struct sk_buff *skb,
+		const struct sockaddr_pn *target)
+{
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	struct pn_sock *pn = pn_sk(sk);
+	int err;
+	u16 src, dst;
+	u8 daddr, saddr, res;
+	/* Without an explicit target, the destination object and resource stored in the socket are used. */
+	src = pn->sobject;
+	if (target != NULL) {
+		dst = pn_sockaddr_get_object(target);
+		res = pn_sockaddr_get_resource(target);
+	} else {
+		dst = pn->dobject;
+		res = pn->resource;
+	}
+	daddr = pn_addr(dst);
+	/* Output device: the bound device, loopback for a local address or local resource, else a route lookup. */
+	err = -EHOSTUNREACH;
+	if (sk->sk_bound_dev_if)
+		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+	else if (phonet_address_lookup(net, daddr) == 0) {
+		dev = phonet_device_get(net);
+		skb->pkt_type = PACKET_LOOPBACK;
+	} else if (dst == 0) {
+		/* Resource routing (small race until phonet_rcv()) */
+		struct sock *sk = pn_find_sock_by_res(net, res);	/* NOTE(review): shadows the sk parameter */
+		if (sk)	{
+			sock_put(sk);
+			dev = phonet_device_get(net);
+			skb->pkt_type = PACKET_LOOPBACK;
+		} else
+			dev = phonet_route_output(net, daddr);
+	} else
+		dev = phonet_route_output(net, daddr);
+
+	if (!dev || !(dev->flags & IFF_UP))
+		goto drop;
+
+	saddr = phonet_address_get(dev, daddr);
+	if (saddr == PN_NO_ADDR)
+		goto drop;
+	/* A source object without a device address gets the address selected for this device. */
+	if (!pn_addr(src))
+		src = pn_object(saddr, pn_obj(src));
+
+	err = pn_send(skb, dev, dst, src, res, 0);
+	dev_put(dev);
+	return err;
+
+drop:
+	kfree_skb(skb);
+	if (dev)
+		dev_put(dev);
+	return err;
+}
+EXPORT_SYMBOL(pn_skb_send);
+
+/*
+ * Decide whether an undeliverable message may be answered with an error.
+ *
+ * Do not send an error message in response to an error message: common
+ * messages whose sub-message ID is one of the two error responses are
+ * never answered, and neither are PN_COMMGR indications.  Also returns 0
+ * when the skb is too short for the fields that must be inspected.
+ *
+ * Returns 1 if a response may be sent, 0 otherwise.
+ */
+static inline int can_respond(struct sk_buff *skb)
+{
+	const struct phonethdr *ph;
+	const struct phonetmsg *pm;
+	u8 submsg_id;
+
+	if (!pskb_may_pull(skb, 3))
+		return 0;
+
+	ph = pn_hdr(skb);
+	if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5))
+		return 0;
+	if (ph->pn_res == PN_COMMGR) /* indications */
+		return 0;
+
+	ph = pn_hdr(skb); /* re-acquires the pointer */
+	pm = pn_msg(skb);
+	if (pm->pn_msg_id != PN_COMMON_MESSAGE)
+		return 1;
+	submsg_id = (ph->pn_res == PN_PREFIX)
+		? pm->pn_e_submsg_id : pm->pn_submsg_id;
+	/*
+	 * Compare the sub-message ID selected above against both error
+	 * codes.  Reading pn_e_submsg_id directly would look at the wrong
+	 * field for non-prefixed messages, for which only three bytes are
+	 * known to have been pulled.
+	 */
+	if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP &&
+		submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP)
+		return 1;
+	return 0;
+}
+/* Answer rskb's sender with a common message carrying PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP. */
+static int send_obj_unreachable(struct sk_buff *rskb)
+{
+	const struct phonethdr *oph = pn_hdr(rskb);
+	const struct phonetmsg *opm = pn_msg(rskb);
+	struct phonetmsg resp;
+	/* The reply echoes the transaction and original message IDs; PN_PREFIX messages use the pn_e_* fields. */
+	memset(&resp, 0, sizeof(resp));
+	resp.pn_trans_id = opm->pn_trans_id;
+	resp.pn_msg_id = PN_COMMON_MESSAGE;
+	if (oph->pn_res == PN_PREFIX) {
+		resp.pn_e_res_id = opm->pn_e_res_id;
+		resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
+		resp.pn_e_orig_msg_id = opm->pn_msg_id;
+		resp.pn_e_status = 0;
+	} else {
+		resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
+		resp.pn_orig_msg_id = opm->pn_msg_id;
+		resp.pn_status = 0;
+	}
+	return pn_raw_send(&resp, sizeof(resp), rskb->dev,
+				pn_object(oph->pn_sdev, oph->pn_sobj),	/* dst: the original source */
+				pn_object(oph->pn_rdev, oph->pn_robj),	/* src: the original destination */
+				oph->pn_res);
+}
+/* Send a fixed 4-byte PN_COMMGR message (subscribe, count 0) to object 0 of the device that sent rskb. */
+static int send_reset_indications(struct sk_buff *rskb)
+{
+	struct phonethdr *oph = pn_hdr(rskb);
+	static const u8 data[4] = {
+		0x00 /* trans ID */, 0x10 /* subscribe msg */,
+		0x00 /* subscription count */, 0x00 /* dummy */
+	};
+	/* The source of the message is the object the original packet was addressed to. */
+	return pn_raw_send(data, sizeof(data), rskb->dev,
+				pn_object(oph->pn_sdev, 0x00),
+				pn_object(oph->pn_rdev, oph->pn_robj),
+				PN_COMMGR);
+}
+
+
+/* packet type functions */
+
+/*
+ * Stuff received packets to associated sockets, or forward them to another device when not addressed to us.
+ * On error, returns non-zero and releases the skb. Forwarded packets return NET_RX_SUCCESS.
+ */
+static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
+			struct packet_type *pkttype,
+			struct net_device *orig_dev)
+{
+	struct net *net = dev_net(dev);
+	struct phonethdr *ph;
+	struct sockaddr_pn sa;
+	u16 len;
+
+	/* check we have at least a full Phonet header */
+	if (!pskb_pull(skb, sizeof(struct phonethdr)))
+		goto out;
+
+	/* check that the advertised length is correct */
+	ph = pn_hdr(skb);
+	len = get_unaligned_be16(&ph->pn_length);
+	if (len < 2)
+		goto out;
+	len -= 2;	/* pn_length is the payload length plus 2, see how pn_send() fills it in */
+	if ((len > skb->len) || pskb_trim(skb, len))
+		goto out;
+	skb_reset_transport_header(skb);
+
+	pn_skb_get_dst_sockaddr(skb, &sa);
+
+	/* check if this is broadcasted */
+	if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) {
+		pn_deliver_sock_broadcast(net, skb);
+		goto out;
+	}
+
+	/* resource routing */
+	if (pn_sockaddr_get_object(&sa) == 0) {
+		struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource);
+		if (sk)
+			return sk_receive_skb(sk, skb, 0);
+	}
+
+	/* check if we are the destination */
+	if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) {
+		/* Phonet packet input */
+		struct sock *sk = pn_find_sock_by_sa(net, &sa);
+
+		if (sk)
+			return sk_receive_skb(sk, skb, 0);
+
+		if (can_respond(skb)) {	/* no owning socket: report it, unless the packet is itself an error */
+			send_obj_unreachable(skb);
+			send_reset_indications(skb);
+		}
+	} else if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+		goto out; /* Race between address deletion and loopback */
+	else {
+		/* Phonet packet routing */
+		struct net_device *out_dev;
+
+		out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa));
+		if (!out_dev) {
+			LIMIT_NETDEBUG(KERN_WARNING"No Phonet route to %02X\n",
+					pn_sockaddr_get_addr(&sa));
+			goto out;
+		}
+		/* Put back the Phonet header that was pulled at the top before forwarding. */
+		__skb_push(skb, sizeof(struct phonethdr));
+		skb->dev = out_dev;
+		if (out_dev == dev) {
+			LIMIT_NETDEBUG(KERN_ERR"Phonet loop to %02X on %s\n",
+					pn_sockaddr_get_addr(&sa), dev->name);
+			goto out_dev;
+		}
+		/* Some drivers (e.g. TUN) do not allocate HW header space */
+		if (skb_cow_head(skb, out_dev->hard_header_len))
+			goto out_dev;
+
+		if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL,
+					skb->len) < 0)
+			goto out_dev;
+		dev_queue_xmit(skb);
+		dev_put(out_dev);
+		return NET_RX_SUCCESS;
+out_dev:
+		dev_put(out_dev);	/* then falls out of the else branch to "out", which frees the skb */
+	}
+
+out:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static struct packet_type phonet_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_PHONET),
+	.func = phonet_rcv,	/* added with dev_add_pack() in phonet_init() */
+};
+
+static DEFINE_MUTEX(proto_tab_lock);	/* serializes updates of proto_tab[] */
+
+/*
+ * Register a Phonet transport protocol under a protocol number.
+ *
+ * @protocol: slot in proto_tab[], must be below PHONET_NPROTO
+ * @pp:       protocol description; pp->prot is registered with
+ *            proto_register()
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range number, -EBUSY if the
+ * slot is already taken, or the error from proto_register().  On any
+ * failure nothing is left registered.
+ */
+int __init_or_module phonet_proto_register(unsigned int protocol,
+						struct phonet_protocol *pp)
+{
+	int err = 0;
+
+	if (protocol >= PHONET_NPROTO)
+		return -EINVAL;
+
+	err = proto_register(pp->prot, 1);
+	if (err)
+		return err;
+
+	mutex_lock(&proto_tab_lock);
+	if (proto_tab[protocol])
+		err = -EBUSY;
+	else
+		rcu_assign_pointer(proto_tab[protocol], pp);
+	mutex_unlock(&proto_tab_lock);
+
+	/* Slot already taken: undo proto_register() instead of leaking it. */
+	if (err)
+		proto_unregister(pp->prot);
+
+	return err;
+}
+EXPORT_SYMBOL(phonet_proto_register);
+/* Remove a protocol from proto_tab[]; pp must be the pointer currently stored in that slot (BUG otherwise). */
+void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp)
+{
+	mutex_lock(&proto_tab_lock);
+	BUG_ON(proto_tab[protocol] != pp);
+	rcu_assign_pointer(proto_tab[protocol], NULL);
+	mutex_unlock(&proto_tab_lock);
+	synchronize_rcu();	/* wait for RCU readers of the slot (phonet_proto_get()) before unregistering */
+	proto_unregister(pp->prot);
+}
+EXPORT_SYMBOL(phonet_proto_unregister);
+
+/* Module registration: device support, socket family, packet handler, sysctls, then isi_register() */
+static int __init phonet_init(void)
+{
+	int err;
+
+	err = phonet_device_init();
+	if (err)
+		return err;
+
+	pn_sock_init();
+	err = sock_register(&phonet_proto_family);
+	if (err) {
+		printk(KERN_ALERT
+			"phonet protocol family initialization failed\n");
+		goto err_sock;
+	}
+
+	dev_add_pack(&phonet_packet_type);
+	phonet_sysctl_init();	/* NOTE(review): the result is not checked */
+
+	err = isi_register();
+	if (err)
+		goto err;
+	return 0;
+	/* "err" undoes everything done after phonet_device_init(); "err_sock" undoes only the device setup. */
+err:
+	phonet_sysctl_exit();
+	sock_unregister(PF_PHONET);
+	dev_remove_pack(&phonet_packet_type);
+err_sock:
+	phonet_device_exit();
+	return err;
+}
+/* Module unload: undo the registrations made by phonet_init(). */
+static void __exit phonet_exit(void)
+{
+	isi_unregister();
+	phonet_sysctl_exit();
+	sock_unregister(PF_PHONET);
+	dev_remove_pack(&phonet_packet_type);
+	phonet_device_exit();
+}
+
+module_init(phonet_init);
+module_exit(phonet_exit);
+MODULE_DESCRIPTION("Phonet protocol stack for Linux");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_PHONET);
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
new file mode 100644
index 00000000..2f032381
--- /dev/null
+++ b/net/phonet/datagram.c
@@ -0,0 +1,214 @@
+/*
+ * File: datagram.c
+ *
+ * Datagram (ISI) Phonet sockets
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <asm/ioctls.h>
+#include <net/sock.h>
+
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+
+static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
+/*
+ * .close callback: the associated socket ceases to exist.
+ * Simply hands the sock to sk_common_release(); @timeout is not used.
+ */
+static void pn_sock_close(struct sock *sk, long timeout)
+{
+	sk_common_release(sk);
+}
+
+/*
+ * Datagram socket ioctl handler.
+ *
+ * SIOCINQ: stores the length of the first queued skb (0 if the receive
+ * queue is empty) through @arg.
+ * SIOCPNADDRESOURCE / SIOCPNDELRESOURCE: reads a u32 resource number from
+ * @arg (must be below 256) and binds/unbinds it on @sk.
+ * Any other command yields -ENOIOCTLCMD.
+ */
+static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	if (cmd == SIOCINQ) {
+		struct sk_buff *head;
+		int pending = 0;
+
+		lock_sock(sk);
+		head = skb_peek(&sk->sk_receive_queue);
+		if (head)
+			pending = head->len;
+		release_sock(sk);
+		return put_user(pending, (int __user *)arg);
+	}
+
+	if (cmd == SIOCPNADDRESOURCE || cmd == SIOCPNDELRESOURCE) {
+		u32 resource;
+
+		if (get_user(resource, (u32 __user *)arg))
+			return -EFAULT;
+		if (resource > 255)
+			return -EINVAL;
+		if (cmd == SIOCPNDELRESOURCE)
+			return pn_sock_unbind_res(sk, resource);
+		return pn_sock_bind_res(sk, resource);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/*
+ * Destroy socket. All references are gone.
+ * Installed as sk->sk_destruct by pn_init(); drops any skbs still sitting
+ * in the receive queue.
+ */
+static void pn_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
+/* .init callback: install pn_destruct() as the sock destructor. Returns 0. */
+static int pn_init(struct sock *sk)
+{
+	sk->sk_destruct = pn_destruct;
+	return 0;
+}
+
+/*
+ * Send one datagram to the Phonet address given in msg->msg_name.
+ *
+ * Returns @len on success, or a negative errno:
+ *   -EOPNOTSUPP    unsupported bits in msg->msg_flags
+ *   -EDESTADDRREQ  no destination address supplied
+ *   -EINVAL        address shorter than struct sockaddr_pn
+ *   -EAFNOSUPPORT  address family is not AF_PHONET
+ * or whatever the allocation, copy or pn_skb_send() step reports.
+ */
+static int pn_sendmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len)
+{
+	struct sockaddr_pn *dest = (struct sockaddr_pn *)msg->msg_name;
+	struct sk_buff *out;
+	int rc;
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|
+				MSG_CMSG_COMPAT))
+		return -EOPNOTSUPP;
+	if (dest == NULL)
+		return -EDESTADDRREQ;
+	if (msg->msg_namelen < sizeof(struct sockaddr_pn))
+		return -EINVAL;
+	if (dest->spn_family != AF_PHONET)
+		return -EAFNOSUPPORT;
+
+	/* Leave headroom for the Phonet header in front of the payload. */
+	out = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len,
+					msg->msg_flags & MSG_DONTWAIT, &rc);
+	if (out == NULL)
+		return rc;
+	skb_reserve(out, MAX_PHONET_HEADER);
+
+	rc = memcpy_fromiovec((void *)skb_put(out, len), msg->msg_iov, len);
+	if (rc < 0) {
+		kfree_skb(out);
+		return rc;
+	}
+
+	/* pn_skb_send() fills in the Phonet header and forwards the packet. */
+	rc = pn_skb_send(sk, out, dest);
+	if (rc < 0)
+		return rc;
+	return len;
+}
+
+/*
+ * Receive one datagram.
+ *
+ * Copies at most @len bytes into msg->msg_iov, setting MSG_TRUNC in
+ * msg->msg_flags when the datagram is longer than @len. If msg->msg_name is
+ * set, the sender's address is written there.
+ *
+ * Returns the number of bytes copied (or the full datagram length when the
+ * caller passed MSG_TRUNC in @flags), -EOPNOTSUPP for unsupported @flags,
+ * -EFAULT if the copy fails, or the error from skb_recv_datagram().
+ */
+static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len, int noblock,
+			int flags, int *addr_len)
+{
+	struct sk_buff *skb = NULL;
+	struct sockaddr_pn sa;
+	int rval = -EOPNOTSUPP;
+	int copylen;
+
+	if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL|
+			MSG_CMSG_COMPAT))
+		goto out_nofree;
+
+	if (addr_len)
+		*addr_len = sizeof(sa);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &rval);
+	if (skb == NULL)
+		goto out_nofree;
+
+	pn_skb_get_src_sockaddr(skb, &sa);
+
+	/* Clamp the copy to the caller's buffer and flag the truncation. */
+	copylen = skb->len;
+	if (len < copylen) {
+		msg->msg_flags |= MSG_TRUNC;
+		copylen = len;
+	}
+
+	rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen);
+	if (rval) {
+		rval = -EFAULT;
+		goto out;
+	}
+
+	/* With MSG_TRUNC the caller wants the real datagram length. */
+	rval = (flags & MSG_TRUNC) ? skb->len : copylen;
+
+	if (msg->msg_name != NULL)
+		memcpy(msg->msg_name, &sa, sizeof(struct sockaddr_pn));
+
+out:
+	skb_free_datagram(sk, skb);
+
+out_nofree:
+	return rval;
+}
+
+/*
+ * Queue an skb on a sock's receive queue.
+ * Returns NET_RX_SUCCESS when queued, NET_RX_DROP otherwise; the skb is
+ * freed here when sock_queue_rcv_skb() reports a negative error.
+ */
+static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int rc = sock_queue_rcv_skb(sk, skb);
+
+	if (rc == 0)
+		return NET_RX_SUCCESS;
+	if (rc < 0)
+		kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * Module registration.
+ * Protocol operations for Phonet datagram sockets: the pn_* handlers defined
+ * in this file plus the shared pn_sock_* hash/port helpers.
+ */
+static struct proto pn_proto = {
+	.close		= pn_sock_close,
+	.ioctl		= pn_ioctl,
+	.init		= pn_init,
+	.sendmsg	= pn_sendmsg,
+	.recvmsg	= pn_recvmsg,
+	.backlog_rcv	= pn_backlog_rcv,
+	.hash		= pn_sock_hash,
+	.unhash		= pn_sock_unhash,
+	.get_port	= pn_sock_get_port,
+	.obj_size	= sizeof(struct pn_sock),
+	.owner		= THIS_MODULE,
+	.name		= "PHONET",
+};
+
+/* Phonet protocol entry tying SOCK_DGRAM sockets to pn_proto. */
+static struct phonet_protocol pn_dgram_proto = {
+	.ops		= &phonet_dgram_ops,
+	.prot		= &pn_proto,
+	.sock_type	= SOCK_DGRAM,
+};
+
+/*
+ * Register the datagram protocol under PN_PROTO_PHONET.
+ * Returns the result of phonet_proto_register().
+ */
+int __init isi_register(void)
+{
+	return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto);
+}
+
+/* Unregister the datagram protocol from the PN_PROTO_PHONET slot. */
+void __exit isi_unregister(void)
+{
+	phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto);
+}
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
new file mode 100644
index 00000000..d0120896
--- /dev/null
+++ b/net/phonet/pep-gprs.c
@@ -0,0 +1,328 @@
+/*
+ * File: pep-gprs.c
+ *
+ * GPRS over Phonet pipe end point socket
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <net/sock.h>
+
+#include <linux/if_phonet.h>
+#include <net/tcp_states.h>
+#include <net/phonet/gprs.h>
+
+#define GPRS_DEFAULT_MTU 1400
+
+/* Private data of a GPRS network device (stored in netdev_priv()). */
+struct gprs_dev {
+	struct sock		*sk;	/* pipe socket the device is attached to */
+	/* Socket callbacks saved by gprs_attach(), restored by gprs_detach() */
+	void			(*old_state_change)(struct sock *);
+	void			(*old_data_ready)(struct sock *, int);
+	void			(*old_write_space)(struct sock *);
+
+	struct net_device	*dev;	/* back-pointer to the owning device */
+};
+
+/*
+ * Guess the protocol of a received packet from the IP version nibble in
+ * its first byte. Returns htons(ETH_P_IP), htons(ETH_P_IPV6), or htons(0)
+ * when the byte is unreadable or the version is neither 4 nor 6.
+ */
+static __be16 gprs_type_trans(struct sk_buff *skb)
+{
+	u8 scratch;
+	const u8 *first = skb_header_pointer(skb, 0, 1, &scratch);
+
+	if (first) {
+		u8 version = *first >> 4;
+
+		if (version == 4)
+			return htons(ETH_P_IP);
+		if (version == 6)
+			return htons(ETH_P_IPV6);
+	}
+	return htons(0);
+}
+
+/* Wake the device TX queue if the underlying pipe accepts more data. */
+static void gprs_writeable(struct gprs_dev *gp)
+{
+	if (!pep_writeable(gp->sk))
+		return;
+	netif_wake_queue(gp->dev);
+}
+
+/*
+ * Socket callbacks
+ */
+
+/*
+ * sk_state_change callback: once the socket reaches TCP_CLOSE_WAIT, stop
+ * the TX queue and drop the carrier of the attached device.
+ */
+static void gprs_state_change(struct sock *sk)
+{
+	struct gprs_dev *gp = sk->sk_user_data;
+	struct net_device *dev;
+
+	if (sk->sk_state != TCP_CLOSE_WAIT)
+		return;
+
+	dev = gp->dev;
+	netif_stop_queue(dev);
+	netif_carrier_off(dev);
+}
+
+/*
+ * Deliver one packet read from the pipe to the network stack.
+ *
+ * The skb is consumed on every path: handed to netif_rx() on success,
+ * freed (and counted in rx_dropped) otherwise.
+ * Returns 0 on success, -EINVAL if the payload is neither IPv4 nor IPv6,
+ * -ENOBUFS if the wrapper skb cannot be allocated, -ENODEV if the device
+ * is not up.
+ */
+static int gprs_recv(struct gprs_dev *gp, struct sk_buff *skb)
+{
+	struct net_device *dev = gp->dev;
+	int err = 0;
+	__be16 protocol = gprs_type_trans(skb);
+
+	if (!protocol) {
+		err = -EINVAL;
+		goto drop;
+	}
+
+	if (skb_headroom(skb) & 3) {
+		struct sk_buff *rskb, *fs;
+		int flen = 0;
+
+		/* Phonet Pipe data header may be misaligned (3 bytes),
+		 * so wrap the IP packet as a single fragment of an head-less
+		 * socket buffer. The network stack will pull what it needs,
+		 * but at least, the whole IP payload is not memcpy'd. */
+		rskb = netdev_alloc_skb(dev, 0);
+		if (!rskb) {
+			err = -ENOBUFS;
+			goto drop;
+		}
+		/* rskb starts empty, so its len/data_len become skb->len. */
+		skb_shinfo(rskb)->frag_list = skb;
+		rskb->len += skb->len;
+		rskb->data_len += rskb->len;
+		rskb->truesize += rskb->len;
+
+		/* Avoid nested fragments */
+		/* Move skb's own frag list up to be its siblings in rskb's list. */
+		skb_walk_frags(skb, fs)
+			flen += fs->len;
+		skb->next = skb_shinfo(skb)->frag_list;
+		skb_frag_list_init(skb);
+		skb->len -= flen;
+		skb->data_len -= flen;
+		skb->truesize -= flen;
+
+		skb = rskb;
+	}
+
+	skb->protocol = protocol;
+	skb_reset_mac_header(skb);
+	skb->dev = dev;
+
+	if (likely(dev->flags & IFF_UP)) {
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += skb->len;
+		netif_rx(skb);
+		skb = NULL;
+	} else
+		err = -ENODEV;
+
+drop:
+	/* skb is still non-NULL only when it was not handed to netif_rx(). */
+	if (skb) {
+		dev_kfree_skb(skb);
+		dev->stats.rx_dropped++;
+	}
+	return err;
+}
+
+/*
+ * sk_data_ready callback: drain every packet pep_read() returns and pass
+ * each one to gprs_recv(). @len is not used.
+ */
+static void gprs_data_ready(struct sock *sk, int len)
+{
+	struct gprs_dev *gp = sk->sk_user_data;
+
+	for (;;) {
+		struct sk_buff *pkt = pep_read(sk);
+
+		if (pkt == NULL)
+			break;
+		skb_orphan(pkt);
+		gprs_recv(gp, pkt);
+	}
+}
+
+/*
+ * sk_write_space callback: if the device is running, re-check whether its
+ * TX queue can be woken.
+ */
+static void gprs_write_space(struct sock *sk)
+{
+	struct gprs_dev *gp = sk->sk_user_data;
+
+	if (!netif_running(gp->dev))
+		return;
+	gprs_writeable(gp);
+}
+
+/*
+ * Network device callbacks
+ */
+
+/* ndo_open: wake the TX queue if the pipe is currently writeable. Returns 0. */
+static int gprs_open(struct net_device *dev)
+{
+	struct gprs_dev *gp = netdev_priv(dev);
+
+	gprs_writeable(gp);
+	return 0;
+}
+
+/* ndo_stop: stop the TX queue. Returns 0. */
+static int gprs_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	return 0;
+}
+
+/*
+ * ndo_start_xmit: push an IPv4/IPv6 packet into the pipe.
+ *
+ * Packets of any other protocol are silently freed. The result of
+ * pep_write() only affects the statistics; NETDEV_TX_OK is always returned.
+ * The queue is stopped after each packet and woken again right away if
+ * the pipe is still writeable.
+ */
+static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct gprs_dev *gp = netdev_priv(dev);
+	struct sock *pipe = gp->sk;
+	int nbytes, rc;
+
+	if (skb->protocol != htons(ETH_P_IP) &&
+	    skb->protocol != htons(ETH_P_IPV6)) {
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* Re-account the skb to the pipe socket before writing it. */
+	skb_orphan(skb);
+	skb_set_owner_w(skb, pipe);
+	/* Remember the length now: skb is handed to pep_write() next. */
+	nbytes = skb->len;
+	rc = pep_write(pipe, skb);
+	if (rc == 0) {
+		dev->stats.tx_packets++;
+		dev->stats.tx_bytes += nbytes;
+	} else {
+		LIMIT_NETDEBUG(KERN_WARNING"%s: TX error (%d)\n",
+				dev->name, rc);
+		dev->stats.tx_aborted_errors++;
+		dev->stats.tx_errors++;
+	}
+
+	netif_stop_queue(dev);
+	if (pep_writeable(pipe))
+		netif_wake_queue(dev);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * ndo_change_mtu: accept an MTU between 576 and PHONET_MAX_MTU - 11
+ * (inclusive). Returns 0 on success, -EINVAL when out of range.
+ */
+static int gprs_set_mtu(struct net_device *dev, int new_mtu)
+{
+	if ((new_mtu >= 576) && (new_mtu <= (PHONET_MAX_MTU - 11))) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Network device operations of a GPRS interface. */
+static const struct net_device_ops gprs_netdev_ops = {
+	.ndo_open	= gprs_open,
+	.ndo_stop	= gprs_close,
+	.ndo_start_xmit	= gprs_xmit,
+	.ndo_change_mtu	= gprs_set_mtu,
+};
+
+/*
+ * alloc_netdev() setup callback: configure a point-to-point, ARP-less
+ * device without link-layer header or address, freed via free_netdev().
+ */
+static void gprs_setup(struct net_device *dev)
+{
+	dev->features		= NETIF_F_FRAGLIST;
+	dev->type		= ARPHRD_PHONET_PIPE;
+	dev->flags		= IFF_POINTOPOINT | IFF_NOARP;
+	dev->mtu		= GPRS_DEFAULT_MTU;
+	dev->hard_header_len	= 0;
+	dev->addr_len		= 0;
+	dev->tx_queue_len	= 10;
+
+	dev->netdev_ops		= &gprs_netdev_ops;
+	dev->destructor		= free_netdev;
+}
+
+/*
+ * External interface
+ */
+
+/*
+ * Attach a GPRS interface to a datagram socket.
+ * Returns the interface index on success, negative error code on error:
+ * -EINVAL for a SOCK_STREAM socket or one that is closed, listening or dead,
+ * -ENOMEM if the device cannot be allocated, -EBUSY if sk_user_data is
+ * already in use, or the error from register_netdev().
+ * On success a reference on @sk is held until gprs_detach().
+ */
+int gprs_attach(struct sock *sk)
+{
+	static const char ifname[] = "gprs%d";
+	struct gprs_dev *gp;
+	struct net_device *dev;
+	int err;
+
+	if (unlikely(sk->sk_type == SOCK_STREAM))
+		return -EINVAL; /* need packet boundaries */
+
+	/* Create net device */
+	dev = alloc_netdev(sizeof(*gp), ifname, gprs_setup);
+	if (!dev)
+		return -ENOMEM;
+	gp = netdev_priv(dev);
+	gp->sk = sk;
+	gp->dev = dev;
+
+	/* Keep the TX queue stopped until the device is opened. */
+	netif_stop_queue(dev);
+	err = register_netdev(dev);
+	if (err) {
+		free_netdev(dev);
+		return err;
+	}
+
+	lock_sock(sk);
+	if (unlikely(sk->sk_user_data)) {
+		err = -EBUSY;
+		goto out_rel;
+	}
+	if (unlikely((1 << sk->sk_state & (TCPF_CLOSE|TCPF_LISTEN)) ||
+			sock_flag(sk, SOCK_DEAD))) {
+		err = -EINVAL;
+		goto out_rel;
+	}
+	/* Hook the socket: save its callbacks and install the GPRS ones. */
+	sk->sk_user_data	= gp;
+	gp->old_state_change	= sk->sk_state_change;
+	gp->old_data_ready	= sk->sk_data_ready;
+	gp->old_write_space	= sk->sk_write_space;
+	sk->sk_state_change	= gprs_state_change;
+	sk->sk_data_ready	= gprs_data_ready;
+	sk->sk_write_space	= gprs_write_space;
+	release_sock(sk);
+	sock_hold(sk);
+
+	printk(KERN_DEBUG"%s: attached\n", dev->name);
+	return dev->ifindex;
+
+out_rel:
+	release_sock(sk);
+	/* The device was registered; gprs_setup() set free_netdev as destructor. */
+	unregister_netdev(dev);
+	return err;
+}
+
+/*
+ * Detach the GPRS interface from @sk: restore the socket callbacks saved by
+ * gprs_attach(), unregister the device and drop the socket reference taken
+ * at attach time. Expects sk->sk_user_data to point to the gprs_dev.
+ */
+void gprs_detach(struct sock *sk)
+{
+	struct gprs_dev *gp = sk->sk_user_data;
+	struct net_device *dev = gp->dev;
+
+	lock_sock(sk);
+	sk->sk_user_data	= NULL;
+	sk->sk_state_change	= gp->old_state_change;
+	sk->sk_data_ready	= gp->old_data_ready;
+	sk->sk_write_space	= gp->old_write_space;
+	release_sock(sk);
+
+	printk(KERN_DEBUG"%s: detached\n", dev->name);
+	unregister_netdev(dev);
+	sock_put(sk);
+}
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
new file mode 100644
index 00000000..d29a7fb3
--- /dev/null
+++ b/net/phonet/pep.c
@@ -0,0 +1,1293 @@
+/*
+ * File: pep.c
+ *
+ * Phonet pipe protocol end point socket
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Author: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/ioctls.h>
+
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pep.h>
+#include <net/phonet/gprs.h>
+
+/* sk_state values:
+ * TCP_CLOSE		sock not in use yet
+ * TCP_CLOSE_WAIT	disconnected pipe
+ * TCP_LISTEN		listening pipe endpoint
+ * TCP_SYN_RECV		connected pipe in disabled state
+ * TCP_ESTABLISHED	connected pipe in enabled state
+ *
+ * pep_sock locking:
+ *  - sk_state, hlist: sock lock needed
+ *  - listener: read only
+ *  - pipe_handle: read only
+ */
+
+#define CREDITS_MAX	10
+#define CREDITS_THR	7
+
+#define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */
+
+/*
+ * Get the next TLV sub-block.
+ *
+ * @skb:   buffer positioned at the start of a sub-block (type, length, data)
+ * @ptype: receives the sub-block type
+ * @plen:  in: capacity of @buf; out: payload length (length field minus 2)
+ * @buf:   scratch space used if the payload is not linear in @skb
+ *
+ * On success the skb is pulled past the whole sub-block and a pointer to the
+ * payload is returned; at most min(capacity of @buf, payload length) bytes
+ * are guaranteed readable through it. Returns NULL if the sub-block header
+ * is missing, its length field is below 2, or the skb is too short.
+ */
+static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen,
+					void *buf)
+{
+	void *data;
+	struct {
+		u8 sb_type;
+		u8 sb_len;
+	} *ph, h;
+	int buflen = *plen;
+	u8 sb_type, sb_len;
+
+	ph = skb_header_pointer(skb, 0, 2, &h);
+	if (ph == NULL || ph->sb_len < 2)
+		return NULL;
+
+	/*
+	 * Copy the header out before pulling: ph may point into the skb head,
+	 * which pskb_may_pull() is allowed to reallocate. Working on copies
+	 * also avoids writing the adjusted length back into the packet.
+	 */
+	sb_type = ph->sb_type;
+	sb_len = ph->sb_len;
+	if (!pskb_may_pull(skb, sb_len))
+		return NULL;
+
+	sb_len -= 2;	/* payload length, without the 2-byte header */
+	*ptype = sb_type;
+	*plen = sb_len;
+
+	if (buflen > sb_len)
+		buflen = sb_len;
+	data = skb_header_pointer(skb, 2, buflen, buf);
+	__skb_pull(skb, 2 + sb_len);
+	return data;
+}
+
+/*
+ * Allocate an skb owned by @sk carrying a copy of @payload (@len bytes)
+ * preceded by room for a pipe header. On return skb->data and the transport
+ * header both point at the (still unfilled) struct pnpipehdr.
+ * Returns NULL if the allocation fails.
+ */
+static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload,
+					int len, gfp_t priority)
+{
+	struct sk_buff *skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+	if (!skb)
+		return NULL;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER);
+	__skb_put(skb, len);
+	skb_copy_to_linear_data(skb, payload, len);
+	/* Expose the pipe header in front of the payload. */
+	__skb_push(skb, sizeof(struct pnpipehdr));
+	skb_reset_transport_header(skb);
+	return skb;
+}
+
+/*
+ * Send a response to request @oskb, back to its source address.
+ * The reply echoes the request's utid and pipe handle, uses message id
+ * request + 1, carries @code as error code and @data (@len bytes) as payload.
+ * Returns -ENOMEM if no skb can be allocated, else the pn_skb_send() result.
+ */
+static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code,
+			const void *data, int len, gfp_t priority)
+{
+	const struct pnpipehdr *oph = pnp_hdr(oskb);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+	struct sockaddr_pn peer;
+
+	skb = pep_alloc_skb(sk, data, len, priority);
+	if (!skb)
+		return -ENOMEM;
+
+	ph = pnp_hdr(skb);
+	ph->utid = oph->utid;
+	ph->message_id = oph->message_id + 1; /* REQ -> RESP */
+	ph->pipe_handle = oph->pipe_handle;
+	ph->error_code = code;
+
+	pn_skb_get_src_sockaddr(oskb, &peer);
+	return pn_skb_send(sk, skb, &peer);
+}
+
+/*
+ * Send an indication with message id @id on this socket's pipe.
+ * @code goes into the first header data byte, @data (@len bytes) follows.
+ * No explicit destination is given to pn_skb_send().
+ * Returns -ENOMEM if no skb can be allocated, else the pn_skb_send() result.
+ */
+static int pep_indicate(struct sock *sk, u8 id, u8 code,
+			const void *data, int len, gfp_t priority)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	skb = pep_alloc_skb(sk, data, len, priority);
+	if (!skb)
+		return -ENOMEM;
+
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	ph->message_id = id;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->data[0] = code;
+	return pn_skb_send(sk, skb, NULL);
+}
+
+#define PAD 0x00
+
+/*
+ * Send a request with message id @id on this socket's pipe. Same layout as
+ * pep_indicate(), except that the utid is set to @id and the skb is always
+ * allocated with GFP_KERNEL.
+ * Returns -ENOMEM if no skb can be allocated, else the pn_skb_send() result.
+ */
+static int pipe_handler_request(struct sock *sk, u8 id, u8 code,
+				const void *data, int len)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	skb = pep_alloc_skb(sk, data, len, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	ph = pnp_hdr(skb);
+	ph->utid = id; /* whatever */
+	ph->message_id = id;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->data[0] = code;
+	return pn_skb_send(sk, skb, NULL);
+}
+
+/*
+ * Send PNS_PIPE_CREATED_IND carrying a single NEGOTIATED_FC sub-block with
+ * this socket's TX and RX flow control modes.
+ * Returns the pep_indicate() result.
+ */
+static int pipe_handler_send_created_ind(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	u8 sb[4];
+
+	sb[0] = PN_PIPE_SB_NEGOTIATED_FC;
+	sb[1] = pep_sb_size(2);
+	sb[2] = pn->tx_fc;
+	sb[3] = pn->rx_fc;
+
+	return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */,
+				sb, 4, GFP_ATOMIC);
+}
+
+/*
+ * Accept connection request @skb: reply with PN_PIPE_NO_ERROR and two
+ * sub-blocks (required TX and preferred RX flow control), each listing
+ * multi-credit, one-credit and legacy flow control, in that order.
+ * May sleep (GFP_KERNEL allocation). Returns the pep_reply() result.
+ */
+static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
+{
+	static const u8 data[20] = {
+		PAD, PAD, PAD, 2 /* sub-blocks */,
+		PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD,
+			PN_MULTI_CREDIT_FLOW_CONTROL,
+			PN_ONE_CREDIT_FLOW_CONTROL,
+			PN_LEGACY_FLOW_CONTROL,
+			PAD,
+		PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD,
+			PN_MULTI_CREDIT_FLOW_CONTROL,
+			PN_ONE_CREDIT_FLOW_CONTROL,
+			PN_LEGACY_FLOW_CONTROL,
+			PAD,
+	};
+
+	might_sleep();
+	return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data),
+				GFP_KERNEL);
+}
+
+/*
+ * Reject connection request @skb with error @code and no sub-blocks.
+ * Warns if @code is PN_PIPE_NO_ERROR. Returns the pep_reply() result.
+ */
+static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code,
+				gfp_t priority)
+{
+	static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ };
+	WARN_ON(code == PN_PIPE_NO_ERROR);
+	return pep_reply(sk, skb, code, data, sizeof(data), priority);
+}
+
+/*
+ * Control requests are not sent by the pipe service and have a specific
+ * message format.
+ *
+ * Answer control request @oskb with a PNS_PEP_CTRL_RESP carrying error
+ * @code, sent back to the request's source address.
+ * Returns -ENOMEM if no skb can be allocated, else the pn_skb_send() result.
+ */
+static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code,
+				gfp_t priority)
+{
+	const struct pnpipehdr *oph = pnp_hdr(oskb);
+	struct sk_buff *skb;
+	struct pnpipehdr *ph;
+	struct sockaddr_pn dst;
+	u8 data[4] = {
+		oph->data[0], /* PEP type */
+		code, /* error code, at an unusual offset */
+		PAD, PAD,
+	};
+
+	skb = pep_alloc_skb(sk, data, 4, priority);
+	if (!skb)
+		return -ENOMEM;
+
+	ph = pnp_hdr(skb);
+	ph->utid = oph->utid;
+	ph->message_id = PNS_PEP_CTRL_RESP;
+	ph->pipe_handle = oph->pipe_handle;
+	ph->data[0] = oph->data[1]; /* CTRL id */
+
+	pn_skb_get_src_sockaddr(oskb, &dst);
+	return pn_skb_send(sk, skb, &dst);
+}
+
+/*
+ * Send a PNS_PEP_STATUS_IND of PEP type "common" with indication @type and
+ * value @status. Returns the pep_indicate() result.
+ */
+static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
+{
+	u8 payload[4];
+
+	payload[0] = type;
+	payload[1] = PAD;
+	payload[2] = PAD;
+	payload[3] = status;
+
+	return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON,
+				payload, 4, priority);
+}
+
+/*
+ * Send our RX flow control information to the sender.
+ * Socket must be locked and in TCP_ESTABLISHED state (BUG otherwise).
+ *
+ * One-credit mode: signal "ready" and, if sent, record one credit.
+ * Multi-credit mode: once rx_credits + CREDITS_THR no longer exceeds
+ * CREDITS_MAX, grant the difference up to CREDITS_MAX and, if sent, record
+ * the full amount.
+ * Legacy mode: nothing is sent (TODO).
+ */
+static void pipe_grant_credits(struct sock *sk, gfp_t priority)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	BUG_ON(sk->sk_state != TCP_ESTABLISHED);
+
+	if (pn->rx_fc == PN_ONE_CREDIT_FLOW_CONTROL) {
+		if (pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL,
+					PEP_IND_READY, priority) == 0)
+			pn->rx_credits = 1;
+		return;
+	}
+
+	if (pn->rx_fc == PN_MULTI_CREDIT_FLOW_CONTROL) {
+		if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX)
+			return;
+		if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS,
+					CREDITS_MAX - pn->rx_credits,
+					priority) == 0)
+			pn->rx_credits = CREDITS_MAX;
+	}
+}
+
+/*
+ * Handle a received PNS_PEP_STATUS_IND: update tx_credits according to the
+ * negotiated TX flow control mode and call sk_write_space() when credits
+ * were gained.
+ * Returns 0, -EINVAL if the message is too short, or -EOPNOTSUPP for an
+ * unknown PEP type or indication.
+ */
+static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr;
+	int wake = 0;
+
+	if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+		return -EINVAL;
+
+	hdr = pnp_hdr(skb);
+	if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
+		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n",
+				(unsigned)hdr->data[0]);
+		return -EOPNOTSUPP;
+	}
+
+	/* data[1] is the indication id, data[4] its value. */
+	switch (hdr->data[1]) {
+	case PN_PEP_IND_FLOW_CONTROL:
+		switch (pn->tx_fc) {
+		case PN_LEGACY_FLOW_CONTROL:
+			switch (hdr->data[4]) {
+			case PEP_IND_BUSY:
+				atomic_set(&pn->tx_credits, 0);
+				break;
+			case PEP_IND_READY:
+				atomic_set(&pn->tx_credits, wake = 1);
+				break;
+			}
+			break;
+		case PN_ONE_CREDIT_FLOW_CONTROL:
+			if (hdr->data[4] == PEP_IND_READY)
+				atomic_set(&pn->tx_credits, wake = 1);
+			break;
+		}
+		break;
+
+	case PN_PEP_IND_ID_MCFC_GRANT_CREDITS:
+		if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL)
+			break;
+		/* A grant of zero credits leaves wake at 0. */
+		atomic_add(wake = hdr->data[4], &pn->tx_credits);
+		break;
+
+	default:
+		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP indication: %u\n",
+				(unsigned)hdr->data[1]);
+		return -EOPNOTSUPP;
+	}
+	if (wake)
+		sk->sk_write_space(sk);
+	return 0;
+}
+
+/*
+ * Parse the sub-blocks of a "pipe created/redirected" indication.
+ * Flow control is reset to legacy in both directions, then overridden by a
+ * NEGOTIATED_FC sub-block if one with valid values (both <= 3) is present.
+ * Returns 0, or -EINVAL if a sub-block cannot be read.
+ */
+static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+	u8 remaining = hdr->data[0];	/* number of sub-blocks */
+
+	pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+	__skb_pull(skb, sizeof(*hdr));
+	for (; remaining > 0; remaining--) {
+		u8 type, buf[2], len = sizeof(buf);
+		u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+		if (data == NULL)
+			return -EINVAL;
+		if (type != PN_PIPE_SB_NEGOTIATED_FC)
+			continue;
+		if (len < 2 || (data[0] | data[1]) > 3)
+			continue;
+		pn->tx_fc = data[0] & 3;
+		pn->rx_fc = data[1] & 3;
+	}
+	return 0;
+}
+
+/*
+ * Queue an skb to a connected sock.
+ * Socket lock must be held.
+ *
+ * Dispatches on the pipe message id. Control requests and data are queued
+ * (via the "queue" label, or sock_queue_rcv_skb() when RX flow control is
+ * not "safe"); every other message is handled in place and the skb freed.
+ * Returns NET_RX_DROP only when data had to be dropped for lack of buffer
+ * space or credits (-ENOBUFS), NET_RX_SUCCESS otherwise.
+ */
+static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+	struct sk_buff_head *queue;
+	int err = 0;
+
+	BUG_ON(sk->sk_state == TCP_CLOSE_WAIT);
+
+	switch (hdr->message_id) {
+	case PNS_PEP_CONNECT_REQ:
+		/* This pipe handle already belongs to a connected sock. */
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_DISCONNECT_REQ:
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		sk->sk_state = TCP_CLOSE_WAIT;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+		break;
+
+	case PNS_PEP_ENABLE_REQ:
+		/* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_RESET_REQ:
+		switch (hdr->state_after_reset) {
+		case PN_PIPE_DISABLE:
+			pn->init_enable = 0;
+			break;
+		case PN_PIPE_ENABLE:
+			pn->init_enable = 1;
+			break;
+		default: /* not allowed to send an error here!? */
+			err = -EINVAL;
+			goto out;
+		}
+		/* fall through */
+	case PNS_PEP_DISABLE_REQ:
+		atomic_set(&pn->tx_credits, 0);
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_CTRL_REQ:
+		/* Bound the control request queue; count overflow as a drop. */
+		if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
+			atomic_inc(&sk->sk_drops);
+			break;
+		}
+		__skb_pull(skb, 4);
+		queue = &pn->ctrlreq_queue;
+		goto queue;
+
+	case PNS_PIPE_ALIGNED_DATA:
+		__skb_pull(skb, 1);
+		/* fall through */
+	case PNS_PIPE_DATA:
+		__skb_pull(skb, 3); /* Pipe data header */
+		if (!pn_flow_safe(pn->rx_fc)) {
+			err = sock_queue_rcv_skb(sk, skb);
+			if (!err)
+				return NET_RX_SUCCESS;
+			err = -ENOBUFS;
+			break;
+		}
+
+		/* Credit-based flow control: data without credit is dropped. */
+		if (pn->rx_credits == 0) {
+			atomic_inc(&sk->sk_drops);
+			err = -ENOBUFS;
+			break;
+		}
+		pn->rx_credits--;
+		queue = &sk->sk_receive_queue;
+		goto queue;
+
+	case PNS_PEP_STATUS_IND:
+		pipe_rcv_status(sk, skb);
+		break;
+
+	case PNS_PIPE_REDIRECTED_IND:
+		err = pipe_rcv_created(sk, skb);
+		break;
+
+	case PNS_PIPE_CREATED_IND:
+		err = pipe_rcv_created(sk, skb);
+		if (err)
+			break;
+		/* fall through */
+	case PNS_PIPE_RESET_IND:
+		if (!pn->init_enable)
+			break;
+		/* fall through */
+	case PNS_PIPE_ENABLED_IND:
+		if (!pn_flow_safe(pn->tx_fc)) {
+			atomic_set(&pn->tx_credits, 1);
+			sk->sk_write_space(sk);
+		}
+		if (sk->sk_state == TCP_ESTABLISHED)
+			break; /* Nothing to do */
+		sk->sk_state = TCP_ESTABLISHED;
+		pipe_grant_credits(sk, GFP_ATOMIC);
+		break;
+
+	case PNS_PIPE_DISABLED_IND:
+		sk->sk_state = TCP_SYN_RECV;
+		pn->rx_credits = 0;
+		break;
+
+	default:
+		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP message: %u\n",
+				hdr->message_id);
+		err = -EINVAL;
+	}
+out:
+	kfree_skb(skb);
+	return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS;
+
+queue:
+	/* Charge the skb to the socket and append it to the chosen queue. */
+	skb->dev = NULL;
+	skb_set_owner_r(skb, sk);
+	/* err is reused to carry the length passed to sk_data_ready(). */
+	err = skb->len;
+	skb_queue_tail(queue, skb);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, err);
+	return NET_RX_SUCCESS;
+}
+
+/*
+ * Destroy connected sock: drop everything left in the receive queue and in
+ * the control request queue.
+ */
+static void pipe_destruct(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&pn->ctrlreq_queue);
+}
+
+/*
+ * Pick a flow control mode from the @n candidates in @fcs: the numerically
+ * highest value that is still below PN_MAX_FLOW_CONTROL. Returns
+ * PN_NO_FLOW_CONTROL if no candidate exceeds it.
+ */
+static u8 pipe_negotiate_fc(const u8 *fcs, unsigned n)
+{
+	const u8 *cur = fcs;
+	const u8 *end = fcs + n;
+	u8 best = PN_NO_FLOW_CONTROL;
+
+	while (cur != end) {
+		u8 candidate = *cur++;
+
+		if (candidate < PN_MAX_FLOW_CONTROL && candidate > best)
+			best = candidate;
+	}
+	return best;
+}
+
+/*
+ * Process a connect response: negotiate TX/RX flow control from its
+ * sub-blocks, then send PNS_PIPE_CREATED_IND.
+ * Returns -EINVAL for a truncated or malformed message, -ECONNREFUSED if
+ * the response carries an error code, otherwise the result of
+ * pipe_handler_send_created_ind().
+ */
+static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr;
+	u8 n_sb;
+
+	if (!pskb_pull(skb, sizeof(*hdr) + 4))
+		return -EINVAL;
+
+	hdr = pnp_hdr(skb);
+	if (hdr->error_code != PN_PIPE_NO_ERROR)
+		return -ECONNREFUSED;
+
+	/* Parse sub-blocks */
+	n_sb = hdr->data[4];
+	while (n_sb > 0) {
+		u8 type, buf[6], len = sizeof(buf);
+		const u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+		if (data == NULL)
+			return -EINVAL;
+
+		switch (type) {
+		case PN_PIPE_SB_REQUIRED_FC_TX:
+			/* data[0] is compared against len; candidates start at data[2] */
+			if (len < 2 || len < data[0])
+				break;
+			pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2);
+			break;
+
+		case PN_PIPE_SB_PREFERRED_FC_RX:
+			if (len < 2 || len < data[0])
+				break;
+			pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2);
+			break;
+
+		}
+		n_sb--;
+	}
+
+	return pipe_handler_send_created_ind(sk);
+}
+
+/*
+ * Queue an skb to an actively connected sock.
+ * Socket lock must be held.
+ *
+ * Data messages are queued on the receive queue; connect responses drive
+ * the TCP_SYN_SENT -> TCP_ESTABLISHED (or TCP_CLOSE_WAIT) transition; status
+ * indications update TX credits. Anything not queued is freed here.
+ * Returns NET_RX_DROP when data is dropped, NET_RX_SUCCESS otherwise.
+ */
+static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+	int err = NET_RX_SUCCESS;
+
+	switch (hdr->message_id) {
+	case PNS_PIPE_ALIGNED_DATA:
+		__skb_pull(skb, 1);
+		/* fall through */
+	case PNS_PIPE_DATA:
+		__skb_pull(skb, 3); /* Pipe data header */
+		if (!pn_flow_safe(pn->rx_fc)) {
+			err = sock_queue_rcv_skb(sk, skb);
+			if (!err)
+				return NET_RX_SUCCESS;
+			err = NET_RX_DROP;
+			break;
+		}
+
+		/* Credit-based flow control: data without credit is dropped. */
+		if (pn->rx_credits == 0) {
+			atomic_inc(&sk->sk_drops);
+			err = NET_RX_DROP;
+			break;
+		}
+		pn->rx_credits--;
+		skb->dev = NULL;
+		skb_set_owner_r(skb, sk);
+		/* err is reused to carry the length passed to sk_data_ready(). */
+		err = skb->len;
+		skb_queue_tail(&sk->sk_receive_queue, skb);
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_data_ready(sk, err);
+		return NET_RX_SUCCESS;
+
+	case PNS_PEP_CONNECT_RESP:
+		if (sk->sk_state != TCP_SYN_SENT)
+			break;
+		/* NOTE(review): sk_state_change() runs before sk_state is updated below. */
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+		if (pep_connresp_rcv(sk, skb)) {
+			sk->sk_state = TCP_CLOSE_WAIT;
+			break;
+		}
+
+		sk->sk_state = TCP_ESTABLISHED;
+		if (!pn_flow_safe(pn->tx_fc)) {
+			atomic_set(&pn->tx_credits, 1);
+			sk->sk_write_space(sk);
+		}
+		pipe_grant_credits(sk, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_DISCONNECT_RESP:
+		/* sock should already be dead, nothing to do */
+		break;
+
+	case PNS_PEP_STATUS_IND:
+		pipe_rcv_status(sk, skb);
+		break;
+	}
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Look up the connected sock on @hlist whose local object matches the
+ * object of @dst and whose pipe handle equals @pipe_handle, ignoring socks
+ * in TCP_CLOSE_WAIT state.
+ * Listening sock must be locked.
+ * Returns the sock with an extra reference held, or NULL if none matches.
+ */
+static struct sock *pep_find_pipe(const struct hlist_head *hlist,
+					const struct sockaddr_pn *dst,
+					u8 pipe_handle)
+{
+	struct hlist_node *node;
+	struct sock *sknode;
+	u16 dobj = pn_sockaddr_get_object(dst);
+
+	sk_for_each(sknode, node, hlist) {
+		struct pep_sock *pnnode = pep_sk(sknode);
+
+		/* Ports match, but addresses might not: */
+		if (pnnode->pn_sk.sobject == dobj &&
+		    pnnode->pipe_handle == pipe_handle &&
+		    sknode->sk_state != TCP_CLOSE_WAIT) {
+			sock_hold(sknode);
+			return sknode;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Deliver an skb to a listening sock.
+ * Socket lock must be held.
+ * We then queue the skb to the right connected sock (if any).
+ *
+ * Messages for a pipe handle without a connected sock are handled here:
+ * connect requests are queued for accept() or rejected, other requests are
+ * answered or dropped, and remaining messages go to pipe_handler_do_rcv()
+ * when this sock itself is actively connected.
+ */
+static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sock *sknode;
+	struct pnpipehdr *hdr;
+	struct sockaddr_pn dst;
+	u8 pipe_handle;
+
+	if (!pskb_may_pull(skb, sizeof(*hdr)))
+		goto drop;
+
+	hdr = pnp_hdr(skb);
+	pipe_handle = hdr->pipe_handle;
+	if (pipe_handle == PN_PIPE_INVALID_HANDLE)
+		goto drop;
+
+	pn_skb_get_dst_sockaddr(skb, &dst);
+
+	/* Look for an existing pipe handle */
+	sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
+	if (sknode)
+		return sk_receive_skb(sknode, skb, 1);
+
+	switch (hdr->message_id) {
+	case PNS_PEP_CONNECT_REQ:
+		if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) {
+			pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE,
+					GFP_ATOMIC);
+			break;
+		}
+		/* Keep the request for pep_sock_accept() to pick up. */
+		skb_queue_head(&sk->sk_receive_queue, skb);
+		sk_acceptq_added(sk);
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_data_ready(sk, 0);
+		return NET_RX_SUCCESS;
+
+	case PNS_PEP_DISCONNECT_REQ:
+		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_CTRL_REQ:
+		pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC);
+		break;
+
+	case PNS_PEP_RESET_REQ:
+	case PNS_PEP_ENABLE_REQ:
+	case PNS_PEP_DISABLE_REQ:
+		/* invalid handle is not even allowed here! */
+		break;
+
+	default:
+		if ((1 << sk->sk_state)
+				& ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT))
+			/* actively connected socket */
+			return pipe_handler_do_rcv(sk, skb);
+	}
+drop:
+	kfree_skb(skb);
+	return NET_RX_SUCCESS;
+}
+
+/*
+ * Send a PNS_PIPE_REMOVE_REQ for this socket's pipe handle.
+ * Allocates with GFP_KERNEL. Returns -ENOMEM if no skb can be allocated,
+ * else the pn_skb_send() result.
+ */
+static int pipe_do_remove(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	ph->message_id = PNS_PIPE_REMOVE_REQ;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->data[0] = PAD;
+	return pn_skb_send(sk, skb, NULL);
+}
+
+/*
+ * .close callback: the associated socket ceases to exist.
+ *
+ * Releases the sock, tells the peer about a still-connected pipe (remove
+ * request for accepted pipes, disconnect request otherwise), moves the sock
+ * to TCP_CLOSE and detaches any GPRS interface. @timeout is not used.
+ */
+static void pep_sock_close(struct sock *sk, long timeout)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int ifindex = 0;
+
+	sock_hold(sk); /* keep a reference after sk_common_release() */
+	sk_common_release(sk);
+
+	lock_sock(sk);
+	if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) {
+		/* sk_backlog_rcv is set to pipe_do_rcv by pep_sock_accept(). */
+		if (sk->sk_backlog_rcv == pipe_do_rcv)
+			/* Forcefully remove dangling Phonet pipe */
+			pipe_do_remove(sk);
+		else
+			pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD,
+						NULL, 0);
+	}
+	sk->sk_state = TCP_CLOSE;
+
+	ifindex = pn->ifindex;
+	pn->ifindex = 0;
+	release_sock(sk);
+
+	/* Detach outside the sock lock: gprs_detach() takes it itself. */
+	if (ifindex)
+		gprs_detach(sk);
+	sock_put(sk);
+}
+
+/*
+ * Accept one pending pipe connection request on listening sock @sk.
+ *
+ * Dequeues a queued PNS_PEP_CONNECT_REQ, validates it, creates a new sock in
+ * TCP_SYN_RECV state for the pipe, replies to the peer and links the new
+ * sock into the listener's hlist.
+ *
+ * Returns the new sock, or NULL on failure. *errp receives the status:
+ * -EINVAL if @sk is not listening, -EPROTO for a malformed, invalid or
+ * duplicate request, -ENOBUFS if no sock can be allocated, or the error
+ * from sending the reply. When no request can be dequeued, *errp is whatever
+ * skb_recv_datagram() stored.
+ */
+static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
+{
+	struct pep_sock *pn = pep_sk(sk), *newpn;
+	struct sock *newsk = NULL;
+	struct sk_buff *skb;
+	struct pnpipehdr *hdr;
+	struct sockaddr_pn dst, src;
+	int err;
+	u16 peer_type;
+	u8 pipe_handle, enabled, n_sb;
+	u8 aligned = 0;
+
+	skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, errp);
+	if (!skb)
+		return NULL;
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_LISTEN) {
+		err = -EINVAL;
+		goto drop;
+	}
+	sk_acceptq_removed(sk);
+
+	err = -EPROTO;
+	if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+		goto drop;
+
+	hdr = pnp_hdr(skb);
+	pipe_handle = hdr->pipe_handle;
+	switch (hdr->state_after_connect) {
+	case PN_PIPE_DISABLE:
+		enabled = 0;
+		break;
+	case PN_PIPE_ENABLE:
+		enabled = 1;
+		break;
+	default:
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM,
+				GFP_KERNEL);
+		goto drop;
+	}
+	peer_type = hdr->other_pep_type << 8;
+
+	/* Parse sub-blocks (options) */
+	n_sb = hdr->data[4];
+	while (n_sb > 0) {
+		u8 type, buf[1], len = sizeof(buf);
+		const u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+		if (data == NULL)
+			goto drop;
+		switch (type) {
+		case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE:
+			if (len < 1)
+				goto drop;
+			peer_type = (peer_type & 0xff00) | data[0];
+			break;
+		case PN_PIPE_SB_ALIGNED_DATA:
+			aligned = data[0] != 0;
+			break;
+		}
+		n_sb--;
+	}
+
+	/*
+	 * Resolve both addresses before the duplicate check: pep_find_pipe()
+	 * reads dst, which would otherwise still be uninitialized here.
+	 */
+	pn_skb_get_dst_sockaddr(skb, &dst);
+	pn_skb_get_src_sockaddr(skb, &src);
+
+	/* Check for duplicate pipe handle */
+	newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
+	if (unlikely(newsk)) {
+		/* Only existence matters: drop pep_find_pipe()'s reference. */
+		__sock_put(newsk);
+		newsk = NULL;
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL);
+		goto drop;
+	}
+
+	/* Create a new to-be-accepted sock */
+	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot);
+	if (!newsk) {
+		pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
+		err = -ENOBUFS;
+		goto drop;
+	}
+
+	sock_init_data(NULL, newsk);
+	newsk->sk_state = TCP_SYN_RECV;
+	newsk->sk_backlog_rcv = pipe_do_rcv;
+	newsk->sk_protocol = sk->sk_protocol;
+	newsk->sk_destruct = pipe_destruct;
+
+	newpn = pep_sk(newsk);
+	/* The request's destination is our local side, its source the peer. */
+	newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
+	newpn->pn_sk.dobject = pn_sockaddr_get_object(&src);
+	newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst);
+	sock_hold(sk);
+	newpn->listener = sk;
+	skb_queue_head_init(&newpn->ctrlreq_queue);
+	newpn->pipe_handle = pipe_handle;
+	atomic_set(&newpn->tx_credits, 0);
+	newpn->ifindex = 0;
+	newpn->peer_type = peer_type;
+	newpn->rx_credits = 0;
+	newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL;
+	newpn->init_enable = enabled;
+	newpn->aligned = aligned;
+
+	err = pep_accept_conn(newsk, skb);
+	if (err) {
+		sock_put(newsk);
+		newsk = NULL;
+		goto drop;
+	}
+	sk_add_node(newsk, &pn->hlist);
+	/* Common exit: reached on success as well, with err == 0. */
+drop:
+	release_sock(sk);
+	kfree_skb(skb);
+	*errp = err;
+	return newsk;
+}
+
+/*
+ * Start connecting a pipe: issue a PNS_PEP_CONNECT_REQ and, if that
+ * succeeds, move the socket to TCP_SYN_SENT. On failure the pipe handle
+ * is reset to PN_PIPE_INVALID_HANDLE and the error is returned.
+ * The addr and len arguments are not used.
+ */
+static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
+{
+	u8 payload[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
+	struct pep_sock *pipe = pep_sk(sk);
+	int rc;
+
+	/* Any value other than the invalid handle will do here */
+	pipe->pipe_handle = 1;
+
+	rc = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ, PN_PIPE_ENABLE,
+					payload, 4);
+	if (!rc) {
+		sk->sk_state = TCP_SYN_SENT;
+		return 0;
+	}
+
+	pipe->pipe_handle = PN_PIPE_INVALID_HANDLE;
+	return rc;
+}
+
+/*
+ * Length of the first buffer queued on q, or -1 when the queue is empty.
+ * The queue lock is held while peeking so that a concurrent dequeue
+ * cannot release the buffer while its length is being read.
+ */
+static int pep_peek_len(struct sk_buff_head *q)
+{
+	struct sk_buff *skb;
+	unsigned long flags;
+	int len = -1;
+
+	spin_lock_irqsave(&q->lock, flags);
+	skb = skb_peek(q);
+	if (skb)
+		len = skb->len;
+	spin_unlock_irqrestore(&q->lock, flags);
+	return len;
+}
+
+/*
+ * Socket ioctl handler. Only SIOCINQ is supported: it stores the length
+ * of the next readable message (0 if none) at the user address in arg.
+ * With SOCK_URGINLINE set, a queued control request takes precedence
+ * over the normal receive queue.
+ *
+ * Returns 0, -EINVAL on a listening socket, -EFAULT from put_user(), or
+ * -ENOIOCTLCMD for any other command.
+ */
+static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int answ = -1;
+
+	switch (cmd) {
+	case SIOCINQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		lock_sock(sk);
+		if (sock_flag(sk, SOCK_URGINLINE))
+			answ = pep_peek_len(&pn->ctrlreq_queue);
+		if (answ < 0)
+			answ = pep_peek_len(&sk->sk_receive_queue);
+		if (answ < 0)
+			answ = 0; /* both queues empty */
+		release_sock(sk);
+		return put_user(answ, (int __user *)arg);
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/*
+ * Protocol init hook: put a freshly created pipe socket into its default,
+ * unconnected state. Always returns 0.
+ */
+static int pep_init(struct sock *sk)
+{
+	struct pep_sock *pipe = pep_sk(sk);
+
+	sk->sk_destruct = pipe_destruct;
+
+	/* No pipe, no listener, no attached interface yet */
+	pipe->pipe_handle = PN_PIPE_INVALID_HANDLE;
+	pipe->listener = NULL;
+	pipe->peer_type = 0;
+	pipe->ifindex = 0;
+
+	/* Empty containers */
+	INIT_HLIST_HEAD(&pipe->hlist);
+	skb_queue_head_init(&pipe->ctrlreq_queue);
+
+	/* Flow control starts in legacy mode with no credits either way */
+	pipe->tx_fc = PN_LEGACY_FLOW_CONTROL;
+	pipe->rx_fc = PN_LEGACY_FLOW_CONTROL;
+	atomic_set(&pipe->tx_credits, 0);
+	pipe->rx_credits = 0;
+
+	pipe->init_enable = 1;
+	pipe->aligned = 0;
+	return 0;
+}
+
+/*
+ * Set a SOL_PNPIPE socket option.
+ *
+ * Only PNPIPE_ENCAP is handled: a non-zero value (which must be
+ * PNPIPE_ENCAP_IP) calls gprs_attach() and records the returned interface
+ * index, zero calls gprs_detach(). Changing the current setting requires
+ * CAP_NET_ADMIN. An option shorter than an int is treated as value 0.
+ *
+ * Returns 0 or a negative errno (-ENOPROTOOPT for other levels/options).
+ */
+static int pep_setsockopt(struct sock *sk, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int val = 0, err = 0;
+
+	if (level != SOL_PNPIPE)
+		return -ENOPROTOOPT;
+	if (optlen >= sizeof(int)) {
+		if (get_user(val, (int __user *) optval))
+			return -EFAULT;
+	}
+
+	lock_sock(sk);
+	switch (optname) {
+	case PNPIPE_ENCAP:
+		if (val && val != PNPIPE_ENCAP_IP) {
+			err = -EINVAL;
+			break;
+		}
+		if (!pn->ifindex == !val)
+			break; /* Nothing to do! */
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		/* Both branches drop the socket lock themselves, hence the
+		 * jump to out_norel instead of falling out of the switch. */
+		if (val) {
+			release_sock(sk);
+			err = gprs_attach(sk);
+			if (err > 0) {
+				/* positive return value is the ifindex */
+				pn->ifindex = err;
+				err = 0;
+			}
+		} else {
+			pn->ifindex = 0;
+			release_sock(sk);
+			gprs_detach(sk);
+			err = 0;
+		}
+		goto out_norel;
+
+	default:
+		err = -ENOPROTOOPT;
+	}
+	release_sock(sk);
+
+out_norel:
+	return err;
+}
+
+/*
+ * Read a SOL_PNPIPE socket option as an int:
+ *   PNPIPE_ENCAP   - PNPIPE_ENCAP_IP if an interface index is set,
+ *                    PNPIPE_ENCAP_NONE otherwise
+ *   PNPIPE_IFINDEX - the recorded interface index
+ *   PNPIPE_HANDLE  - the pipe handle (-EINVAL while it is invalid)
+ *
+ * The length stored back through optlen is capped at sizeof(int).
+ * Returns 0, -ENOPROTOOPT, -EINVAL or -EFAULT.
+ */
+static int pep_getsockopt(struct sock *sk, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	struct pep_sock *pipe = pep_sk(sk);
+	int len, val;
+
+	if (level != SOL_PNPIPE)
+		return -ENOPROTOOPT;
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (optname == PNPIPE_ENCAP) {
+		if (pipe->ifindex)
+			val = PNPIPE_ENCAP_IP;
+		else
+			val = PNPIPE_ENCAP_NONE;
+	} else if (optname == PNPIPE_IFINDEX) {
+		val = pipe->ifindex;
+	} else if (optname == PNPIPE_HANDLE) {
+		val = pipe->pipe_handle;
+		if (val == PN_PIPE_INVALID_HANDLE)
+			return -EINVAL;
+	} else {
+		return -ENOPROTOOPT;
+	}
+
+	len = min_t(unsigned int, sizeof(int), len);
+	if (put_user(len, optlen) ||
+	    put_user(val, (int __user *) optval))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Prepend the pipe data header to skb and hand it to pn_skb_send().
+ *
+ * When pn_flow_safe(pn->tx_fc) holds, one TX credit is consumed first;
+ * if none is left the skb is freed and -ENOBUFS is returned. The credit
+ * is given back when pn_skb_send() reports an error.
+ *
+ * Returns the pn_skb_send() result or -ENOBUFS.
+ */
+static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+	int err;
+
+	/* atomic_add_unless() refuses to take a credit when the count is 0 */
+	if (pn_flow_safe(pn->tx_fc) &&
+	    !atomic_add_unless(&pn->tx_credits, -1, 0)) {
+		kfree_skb(skb);
+		return -ENOBUFS;
+	}
+
+	/* 3 header bytes, plus one padding byte in aligned mode */
+	skb_push(skb, 3 + pn->aligned);
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	if (pn->aligned) {
+		ph->message_id = PNS_PIPE_ALIGNED_DATA;
+		ph->data[0] = 0; /* padding */
+	} else
+		ph->message_id = PNS_PIPE_DATA;
+	ph->pipe_handle = pn->pipe_handle;
+	err = pn_skb_send(sk, skb, NULL);
+
+	/* Nothing went out: return the credit taken above */
+	if (err && pn_flow_safe(pn->tx_fc))
+		atomic_inc(&pn->tx_credits);
+	return err;
+
+}
+
+/*
+ * Send one message over the pipe.
+ *
+ * MSG_EOR is mandatory; only MSG_DONTWAIT, MSG_EOR, MSG_NOSIGNAL and
+ * MSG_CMSG_COMPAT are accepted. Messages longer than USHRT_MAX are
+ * refused with -EMSGSIZE. The call waits, subject to the send timeout,
+ * until the pipe is established and a TX credit is available.
+ *
+ * Returns len on success or a negative errno.
+ */
+static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sk_buff *skb;
+	long timeo;
+	int flags = msg->msg_flags;
+	int err, done;
+
+	if (len > USHRT_MAX)
+		return -EMSGSIZE;
+
+	if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|
+				MSG_CMSG_COMPAT)) ||
+			!(msg->msg_flags & MSG_EOR))
+		return -EOPNOTSUPP;
+
+	skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
+					flags & MSG_DONTWAIT, &err);
+	if (!skb)
+		return err;
+
+	/* Leave headroom for the headers pushed later by pipe_skb_send() */
+	skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned);
+	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (err < 0)
+		goto outfree;
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) {
+		err = -ENOTCONN;
+		goto out;
+	}
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		/* Wait until the pipe gets to enabled state */
+disabled:
+		err = sk_stream_wait_connect(sk, &timeo);
+		if (err)
+			goto out;
+
+		if (sk->sk_state == TCP_CLOSE_WAIT) {
+			err = -ECONNRESET;
+			goto out;
+		}
+	}
+	BUG_ON(sk->sk_state != TCP_ESTABLISHED);
+
+	/* Wait until flow control allows TX */
+	done = atomic_read(&pn->tx_credits);
+	while (!done) {
+		DEFINE_WAIT(wait);
+
+		if (!timeo) {
+			err = -EAGAIN;
+			goto out;
+		}
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			goto out;
+		}
+
+		prepare_to_wait(sk_sleep(sk), &wait,
+				TASK_INTERRUPTIBLE);
+		done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits));
+		finish_wait(sk_sleep(sk), &wait);
+
+		/* The pipe state may have changed while we slept */
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto disabled;
+	}
+
+	err = pipe_skb_send(sk, skb);
+	if (err >= 0)
+		err = len; /* success! */
+	/* skb was handed to pipe_skb_send(); do not free it again below */
+	skb = NULL;
+out:
+	release_sock(sk);
+outfree:
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Report the current number of TX credits of the pipe; a non-zero
+ * result means a packet may be sent right now.
+ */
+int pep_writeable(struct sock *sk)
+{
+	return atomic_read(&pep_sk(sk)->tx_credits);
+}
+
+/*
+ * Transmit an existing skb through the pipe.
+ *
+ * In aligned mode the skb is passed straight to pipe_skb_send().
+ * Otherwise a new header skb is allocated and the payload skb is chained
+ * on its frag_list; any fragment list the payload carried itself is
+ * unhooked and linked after it so that fragment lists are not nested.
+ *
+ * Returns the pipe_skb_send() result, or -ENOMEM (skb freed) when the
+ * header skb cannot be allocated.
+ */
+int pep_write(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *rskb, *fs;
+	int flen = 0;
+
+	if (pep_sk(sk)->aligned)
+		return pipe_skb_send(sk, skb);
+
+	rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
+	if (!rskb) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+	/* Account the whole payload as non-linear data of the header skb */
+	skb_shinfo(rskb)->frag_list = skb;
+	rskb->len += skb->len;
+	rskb->data_len += rskb->len;
+	rskb->truesize += rskb->len;
+
+	/* Avoid nested fragments */
+	skb_walk_frags(skb, fs)
+		flen += fs->len;
+	skb->next = skb_shinfo(skb)->frag_list;
+	skb_frag_list_init(skb);
+	/* The moved fragments no longer count towards skb itself */
+	skb->len -= flen;
+	skb->data_len -= flen;
+	skb->truesize -= flen;
+
+	skb_reserve(rskb, MAX_PHONET_HEADER + 3);
+	return pipe_skb_send(sk, rskb);
+}
+
+/*
+ * Take the next buffer off the socket receive queue (NULL if empty).
+ * On an established pipe, pipe_grant_credits() is then called with
+ * GFP_ATOMIC.
+ */
+struct sk_buff *pep_read(struct sock *sk)
+{
+	struct sk_buff *pkt;
+
+	pkt = skb_dequeue(&sk->sk_receive_queue);
+	if (sk->sk_state != TCP_ESTABLISHED)
+		return pkt;
+
+	pipe_grant_credits(sk, GFP_ATOMIC);
+	return pkt;
+}
+
+/*
+ * Receive one message from the pipe.
+ *
+ * With MSG_OOB, or when SOCK_URGINLINE is set, a queued control request
+ * is dequeued first, acknowledged with PN_PIPE_NO_ERROR and returned with
+ * MSG_OOB set in msg_flags. Otherwise a datagram is taken from the normal
+ * receive queue and, on an established pipe, pipe_grant_credits() is
+ * called. Every message is flagged MSG_EOR; MSG_TRUNC is set when the
+ * user buffer is too small.
+ *
+ * Returns the number of bytes copied (or the full message length when
+ * MSG_TRUNC was requested), or a negative errno.
+ */
+static int pep_recvmsg(struct kiocb *iocb, struct sock *sk,
+			struct msghdr *msg, size_t len, int noblock,
+			int flags, int *addr_len)
+{
+	struct sk_buff *skb;
+	int err;
+
+	if (flags & ~(MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_WAITALL|
+			MSG_NOSIGNAL|MSG_CMSG_COMPAT))
+		return -EOPNOTSUPP;
+
+	if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE)))
+		return -ENOTCONN;
+
+	if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) {
+		/* Dequeue and acknowledge control request */
+		struct pep_sock *pn = pep_sk(sk);
+
+		/* Peeking is not supported on this path */
+		if (flags & MSG_PEEK)
+			return -EOPNOTSUPP;
+		skb = skb_dequeue(&pn->ctrlreq_queue);
+		if (skb) {
+			pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR,
+						GFP_KERNEL);
+			msg->msg_flags |= MSG_OOB;
+			goto copy;
+		}
+		/* Explicit MSG_OOB with nothing queued is an error */
+		if (flags & MSG_OOB)
+			return -EINVAL;
+	}
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	lock_sock(sk);
+	if (skb == NULL) {
+		if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT)
+			err = -ECONNRESET;
+		release_sock(sk);
+		return err;
+	}
+
+	if (sk->sk_state == TCP_ESTABLISHED)
+		pipe_grant_credits(sk, GFP_KERNEL);
+	release_sock(sk);
+copy:
+	msg->msg_flags |= MSG_EOR;
+	if (skb->len > len)
+		msg->msg_flags |= MSG_TRUNC;
+	else
+		len = skb->len;
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	if (!err)
+		err = (flags & MSG_TRUNC) ? skb->len : len;
+
+	skb_free_datagram(sk, skb);
+	return err;
+}
+
+/*
+ * Protocol unhash hook.
+ *
+ * A socket that has a listener is first removed from that listener's
+ * list, after which processing continues on the listener itself. The
+ * socket being looked at is unhashed from the Phonet socket table only
+ * when its own list of pipes is empty. The reference on the listener is
+ * dropped at the end.
+ */
+static void pep_sock_unhash(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sock *skparent = NULL;
+
+	lock_sock(sk);
+
+	if (pn->listener != NULL) {
+		skparent = pn->listener;
+		pn->listener = NULL;
+		release_sock(sk);
+
+		/* From here on, sk and pn refer to the listener */
+		pn = pep_sk(skparent);
+		lock_sock(skparent);
+		sk_del_node_init(sk);
+		sk = skparent;
+	}
+
+	/* Unhash a listening sock only when it is closed
+	 * and all of its active connected pipes are closed. */
+	if (hlist_empty(&pn->hlist))
+		pn_sock_unhash(&pn->pn_sk.sk);
+	release_sock(sk);
+
+	if (skparent)
+		sock_put(skparent);
+}
+
+/* Socket-level protocol operations for Phonet pipe sockets ("PNPIPE") */
+static struct proto pep_proto = {
+	.close		= pep_sock_close,
+	.accept		= pep_sock_accept,
+	.connect	= pep_sock_connect,
+	.ioctl		= pep_ioctl,
+	.init		= pep_init,
+	.setsockopt	= pep_setsockopt,
+	.getsockopt	= pep_getsockopt,
+	.sendmsg	= pep_sendmsg,
+	.recvmsg	= pep_recvmsg,
+	.backlog_rcv	= pep_do_rcv,
+	.hash		= pn_sock_hash,
+	.unhash		= pep_sock_unhash,
+	.get_port	= pn_sock_get_port,
+	.obj_size	= sizeof(struct pep_sock),
+	.owner		= THIS_MODULE,
+	.name		= "PNPIPE",
+};
+
+/* Phonet protocol descriptor tying pep_proto to SOCK_SEQPACKET sockets */
+static struct phonet_protocol pep_pn_proto = {
+	.ops		= &phonet_stream_ops,
+	.prot		= &pep_proto,
+	.sock_type	= SOCK_SEQPACKET,
+};
+
+/* Module init: register the pipe protocol under PN_PROTO_PIPE */
+static int __init pep_register(void)
+{
+	return phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto);
+}
+
+/* Module exit: undo the registration done by pep_register() */
+static void __exit pep_unregister(void)
+{
+	phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto);
+}
+
+/* Module entry points and metadata */
+module_init(pep_register);
+module_exit(pep_unregister);
+MODULE_AUTHOR("Remi Denis-Courmont, Nokia");
+MODULE_DESCRIPTION("Phonet pipe protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE);
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
new file mode 100644
index 00000000..d2df8f33
--- /dev/null
+++ b/net/phonet/pn_dev.c
@@ -0,0 +1,448 @@
+/*
+ * File: pn_dev.c
+ *
+ * Phonet network device
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/phonet.h>
+#include <linux/proc_fs.h>
+#include <linux/if_arp.h>
+#include <net/sock.h>
+#include <net/netns/generic.h>
+#include <net/phonet/pn_dev.h>
+
+/* Routing table: one output device slot per 6-bit Phonet address */
+struct phonet_routes {
+	struct mutex		lock;		/* taken by code that updates table[] */
+	struct net_device	*table[64];	/* indexed by (address >> 2) */
+};
+
+/* Phonet state kept for each network namespace */
+struct phonet_net {
+	struct phonet_device_list pndevs;	/* devices carrying Phonet addresses */
+	struct phonet_routes routes;		/* per-address output devices */
+};
+
+/* net_generic() identifier under which struct phonet_net is looked up */
+int phonet_net_id __read_mostly;
+
+/* Return the Phonet per-namespace data of net; net must not be NULL */
+static struct phonet_net *phonet_pernet(struct net *net)
+{
+	BUG_ON(!net);
+
+	return net_generic(net, phonet_net_id);
+}
+
+/* Return the list of Phonet devices belonging to namespace net */
+struct phonet_device_list *phonet_device_list(struct net *net)
+{
+	return &phonet_pernet(net)->pndevs;
+}
+
+/*
+ * Allocate a Phonet device entry for dev, with no addresses set, and
+ * link it on the namespace device list. The list mutex must be held by
+ * the caller. Returns NULL when the allocation fails.
+ */
+static struct phonet_device *__phonet_device_alloc(struct net_device *dev)
+{
+	struct phonet_device_list *devlist;
+	struct phonet_device *entry;
+
+	devlist = phonet_device_list(dev_net(dev));
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return NULL;
+
+	entry->netdev = dev;
+	bitmap_zero(entry->addrs, 64);
+
+	BUG_ON(!mutex_is_locked(&devlist->lock));
+	list_add_rcu(&entry->list, &devlist->list);
+	return entry;
+}
+
+/*
+ * Find the Phonet entry attached to dev, or NULL if there is none.
+ * The list mutex must be held by the caller.
+ */
+static struct phonet_device *__phonet_get(struct net_device *dev)
+{
+	struct phonet_device_list *devlist = phonet_device_list(dev_net(dev));
+	struct phonet_device *cur, *match = NULL;
+
+	BUG_ON(!mutex_is_locked(&devlist->lock));
+	list_for_each_entry(cur, &devlist->list, list) {
+		if (cur->netdev != dev)
+			continue;
+		match = cur;
+		break;
+	}
+	return match;
+}
+
+/*
+ * Same lookup as __phonet_get(), but walking the list with the RCU
+ * iterator instead of checking the list mutex.
+ */
+static struct phonet_device *__phonet_get_rcu(struct net_device *dev)
+{
+	struct phonet_device_list *devlist = phonet_device_list(dev_net(dev));
+	struct phonet_device *cur, *match = NULL;
+
+	list_for_each_entry_rcu(cur, &devlist->list, list) {
+		if (cur->netdev != dev)
+			continue;
+		match = cur;
+		break;
+	}
+	return match;
+}
+
+/*
+ * Drop all Phonet state attached to dev: unlink its entry from the
+ * device list and send an RTM_DELADDR notification for every address it
+ * still carried. Must be called with the RTNL held.
+ */
+static void phonet_device_destroy(struct net_device *dev)
+{
+	struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
+	struct phonet_device *pnd;
+
+	ASSERT_RTNL();
+
+	mutex_lock(&pndevs->lock);
+	pnd = __phonet_get(dev);
+	if (pnd)
+		list_del_rcu(&pnd->list);
+	mutex_unlock(&pndevs->lock);
+
+	if (pnd) {
+		u8 addr;
+
+		/*
+		 * The bitmap is indexed by (address >> 2); convert the bit
+		 * number back into an address for the notification.
+		 */
+		for_each_set_bit(addr, pnd->addrs, 64)
+			phonet_address_notify(RTM_DELADDR, dev, addr << 2);
+		/* RCU readers may still hold the entry: defer the free */
+		kfree_rcu(pnd, rcu);
+	}
+}
+
+/*
+ * Return the first listed Phonet device of net that is registered and
+ * up, with a reference taken on it, or NULL if there is none.
+ */
+struct net_device *phonet_device_get(struct net *net)
+{
+	struct phonet_device_list *devlist = phonet_device_list(net);
+	struct net_device *found = NULL;
+	struct phonet_device *pnd;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pnd, &devlist->list, list) {
+		struct net_device *cand = pnd->netdev;
+
+		BUG_ON(!cand);
+
+		if (cand->reg_state != NETREG_REGISTERED)
+			continue;
+		if ((cand->flags & IFF_UP) != IFF_UP)
+			continue;
+
+		dev_hold(cand);
+		found = cand;
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+/*
+ * Assign Phonet address addr to dev, creating the Phonet entry of the
+ * device on first use.
+ *
+ * Returns 0, -ENOMEM if the entry cannot be allocated, or -EEXIST if the
+ * address is already set on the device.
+ */
+int phonet_address_add(struct net_device *dev, u8 addr)
+{
+	struct phonet_device_list *devlist = phonet_device_list(dev_net(dev));
+	struct phonet_device *entry;
+	int rc = 0;
+
+	mutex_lock(&devlist->lock);
+
+	/* Look the device up, falling back to a fresh entry */
+	entry = __phonet_get(dev);
+	if (!entry)
+		entry = __phonet_device_alloc(dev);
+	if (unlikely(!entry)) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+
+	if (test_and_set_bit(addr >> 2, entry->addrs))
+		rc = -EEXIST;
+unlock:
+	mutex_unlock(&devlist->lock);
+	return rc;
+}
+
+/*
+ * Remove Phonet address addr from dev. When the last address goes away
+ * the Phonet entry is unlinked and freed after an RCU grace period.
+ *
+ * Returns 0, or -EADDRNOTAVAIL if the device has no such address.
+ */
+int phonet_address_del(struct net_device *dev, u8 addr)
+{
+	struct phonet_device_list *devlist = phonet_device_list(dev_net(dev));
+	struct phonet_device *entry, *victim = NULL;
+	int rc = 0;
+
+	mutex_lock(&devlist->lock);
+	entry = __phonet_get(dev);
+	if (!entry || !test_and_clear_bit(addr >> 2, entry->addrs)) {
+		rc = -EADDRNOTAVAIL;
+	} else if (bitmap_empty(entry->addrs, 64)) {
+		/* That was the last address: retire the whole entry */
+		list_del_rcu(&entry->list);
+		victim = entry;
+	}
+	mutex_unlock(&devlist->lock);
+
+	if (victim)
+		kfree_rcu(victim, rcu);
+
+	return rc;
+}
+
+/*
+ * Get a source address toward a destination, through an interface.
+ *
+ * If dev carries daddr itself, that address is used; otherwise the
+ * first address of dev. When dev has no Phonet entry, the lookup is
+ * retried on the device returned by phonet_device_get(), if that is a
+ * different device. Returns PN_NO_ADDR when nothing is found.
+ */
+u8 phonet_address_get(struct net_device *dev, u8 daddr)
+{
+	struct net_device *fallback;
+	struct phonet_device *pnd;
+	u8 saddr = PN_NO_ADDR;
+
+	rcu_read_lock();
+	pnd = __phonet_get_rcu(dev);
+	if (pnd) {
+		BUG_ON(bitmap_empty(pnd->addrs, 64));
+
+		/* Use same source address as destination, if possible */
+		saddr = test_bit(daddr >> 2, pnd->addrs) ?
+			daddr : find_first_bit(pnd->addrs, 64) << 2;
+	}
+	rcu_read_unlock();
+
+	if (saddr != PN_NO_ADDR)
+		return saddr;
+
+	/* Fallback to another device */
+	fallback = phonet_device_get(dev_net(dev));
+	if (!fallback)
+		return saddr;
+
+	if (fallback != dev)
+		saddr = phonet_address_get(fallback, daddr);
+	dev_put(fallback);
+	return saddr;
+}
+
+/*
+ * Check whether addr is assigned to some registered, running device of
+ * namespace net. Returns 0 if so, -EADDRNOTAVAIL otherwise.
+ */
+int phonet_address_lookup(struct net *net, u8 addr)
+{
+	struct phonet_device_list *devlist = phonet_device_list(net);
+	struct phonet_device *pnd;
+	int rc = -EADDRNOTAVAIL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pnd, &devlist->list, list) {
+		struct net_device *nd = pnd->netdev;
+
+		/* Don't allow unregistering devices! */
+		if (nd->reg_state != NETREG_REGISTERED ||
+		    (nd->flags & IFF_UP) != IFF_UP)
+			continue;
+
+		if (!test_bit(addr >> 2, pnd->addrs))
+			continue;
+
+		rc = 0;
+		break;
+	}
+	rcu_read_unlock();
+	return rc;
+}
+
+/*
+ * Automatically configure a Phonet device, if supported.
+ *
+ * The driver is asked for its address through the SIOCPNGAUTOCONF
+ * ioctl; the reported address is added to the device and announced with
+ * an RTM_NEWADDR notification.
+ *
+ * Returns 0, -EOPNOTSUPP when the driver has no ioctl hook, or the error
+ * from the ioctl or from phonet_address_add().
+ */
+static int phonet_device_autoconf(struct net_device *dev)
+{
+	struct if_phonet_req req;
+	int ret;
+
+	if (!dev->netdev_ops->ndo_do_ioctl)
+		return -EOPNOTSUPP;
+
+	/* req is handed to the driver through the generic ifreq pointer */
+	ret = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *)&req,
+						SIOCPNGAUTOCONF);
+	if (ret < 0)
+		return ret;
+
+	ASSERT_RTNL();
+	ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device);
+	if (ret)
+		return ret;
+	phonet_address_notify(RTM_NEWADDR, dev,
+				req.ifr_phonet_autoconf.device);
+	return 0;
+}
+
+/*
+ * Remove every route that still points at dev, then, after an RCU grace
+ * period, announce each removal with RTM_DELROUTE and drop the device
+ * reference that the route was holding.
+ */
+static void phonet_route_autodel(struct net_device *dev)
+{
+	struct phonet_net *pnn = phonet_pernet(dev_net(dev));
+	unsigned i;
+	DECLARE_BITMAP(deleted, 64);
+
+	/* Remove left-over Phonet routes */
+	bitmap_zero(deleted, 64);
+	mutex_lock(&pnn->routes.lock);
+	for (i = 0; i < 64; i++)
+		if (dev == pnn->routes.table[i]) {
+			rcu_assign_pointer(pnn->routes.table[i], NULL);
+			set_bit(i, deleted);
+		}
+	mutex_unlock(&pnn->routes.lock);
+
+	if (bitmap_empty(deleted, 64))
+		return; /* short-circuit RCU */
+	synchronize_rcu();
+	for_each_set_bit(i, deleted, 64) {
+		/* Table slots are (address >> 2); notify the real address */
+		rtm_phonet_notify(RTM_DELROUTE, dev, i << 2);
+		dev_put(dev);
+	}
+}
+
+/*
+ * Netdevice notifier callback: auto-configure Phonet devices when they
+ * register, and drop addresses and routes of any device that
+ * unregisters. Always returns 0.
+ */
+static int phonet_device_notify(struct notifier_block *me, unsigned long what,
+				void *arg)
+{
+	struct net_device *dev = arg;
+
+	if (what == NETDEV_REGISTER) {
+		if (dev->type == ARPHRD_PHONET)
+			phonet_device_autoconf(dev);
+	} else if (what == NETDEV_UNREGISTER) {
+		phonet_device_destroy(dev);
+		phonet_route_autodel(dev);
+	}
+	return 0;
+}
+
+/* Notifier block hooking phonet_device_notify() into netdevice events */
+static struct notifier_block phonet_device_notifier = {
+	.notifier_call = phonet_device_notify,
+	.priority = 0,
+};
+
+/*
+ * Per-namespace Phonet devices handling: namespace init hook.
+ * Creates the "phonet" proc entry and initializes the device list and
+ * the two mutexes. Returns 0, or -ENOMEM if the proc entry cannot be
+ * created.
+ */
+static int __net_init phonet_init_net(struct net *net)
+{
+	struct phonet_net *pnn = phonet_pernet(net);
+
+	if (!proc_net_fops_create(net, "phonet", 0, &pn_sock_seq_fops))
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&pnn->pndevs.list);
+	mutex_init(&pnn->pndevs.lock);
+	mutex_init(&pnn->routes.lock);
+	return 0;
+}
+
+/*
+ * Namespace exit hook: destroy the Phonet state of every device in the
+ * namespace, announce and release all remaining routes, and remove the
+ * "phonet" proc entry.
+ */
+static void __net_exit phonet_exit_net(struct net *net)
+{
+	struct phonet_net *pnn = phonet_pernet(net);
+	struct net_device *dev;
+	unsigned i;
+
+	rtnl_lock();
+	for_each_netdev(net, dev)
+		phonet_device_destroy(dev);
+
+	for (i = 0; i < 64; i++) {
+		dev = pnn->routes.table[i];
+		if (dev) {
+			/* Slot i holds address (i << 2) */
+			rtm_phonet_notify(RTM_DELROUTE, dev, i << 2);
+			dev_put(dev);
+		}
+	}
+	rtnl_unlock();
+
+	proc_net_remove(net, "phonet");
+}
+
+/* Per-namespace hooks; .id and .size describe the struct phonet_net slot */
+static struct pernet_operations phonet_net_ops = {
+	.init = phonet_init_net,
+	.exit = phonet_exit_net,
+	.id   = &phonet_net_id,
+	.size = sizeof(struct phonet_net),
+};
+
+/*
+ * Initialize Phonet devices list: register the per-namespace hooks, the
+ * "pnresource" proc entry, the netdevice notifier and the netlink
+ * handlers. If the netlink registration fails, everything is undone via
+ * phonet_device_exit(). Returns 0 or a negative errno.
+ *
+ * NOTE(review): the results of proc_net_fops_create() and
+ * register_netdevice_notifier() are not checked here.
+ */
+int __init phonet_device_init(void)
+{
+	int err = register_pernet_device(&phonet_net_ops);
+	if (err)
+		return err;
+
+	proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops);
+	register_netdevice_notifier(&phonet_device_notifier);
+	err = phonet_netlink_register();
+	if (err)
+		phonet_device_exit();
+	return err;
+}
+
+/*
+ * Tear down what phonet_device_init() set up: netlink handlers, the
+ * netdevice notifier, the per-namespace hooks and the "pnresource" proc
+ * entry.
+ */
+void phonet_device_exit(void)
+{
+	rtnl_unregister_all(PF_PHONET);
+	unregister_netdevice_notifier(&phonet_device_notifier);
+	unregister_pernet_device(&phonet_net_ops);
+	proc_net_remove(&init_net, "pnresource");
+}
+
+/*
+ * Install dev as the output device for destination daddr and take a
+ * reference on it. Returns 0, or -EEXIST if the slot is already in use.
+ */
+int phonet_route_add(struct net_device *dev, u8 daddr)
+{
+	struct phonet_routes *routes = &phonet_pernet(dev_net(dev))->routes;
+	unsigned slot = daddr >> 2;
+	int rc = 0;
+
+	mutex_lock(&routes->lock);
+	if (routes->table[slot] != NULL) {
+		rc = -EEXIST;
+	} else {
+		rcu_assign_pointer(routes->table[slot], dev);
+		dev_hold(dev);
+	}
+	mutex_unlock(&routes->lock);
+	return rc;
+}
+
+/*
+ * Remove the route for daddr if it currently points at dev. After an
+ * RCU grace period the reference held by the route is released.
+ * Returns 0, or -ENOENT if dev was not the route's device.
+ */
+int phonet_route_del(struct net_device *dev, u8 daddr)
+{
+	struct phonet_routes *routes = &phonet_pernet(dev_net(dev))->routes;
+	unsigned slot = daddr >> 2;
+	bool removed = false;
+
+	mutex_lock(&routes->lock);
+	if (routes->table[slot] == dev) {
+		rcu_assign_pointer(routes->table[slot], NULL);
+		removed = true;
+	}
+	mutex_unlock(&routes->lock);
+
+	if (!removed)
+		return -ENOENT;
+
+	synchronize_rcu();
+	dev_put(dev);
+	return 0;
+}
+
+/*
+ * Look up the output device for daddr with rcu_dereference(); no
+ * reference is taken on the result. Returns NULL when no route is set.
+ */
+struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr)
+{
+	struct phonet_routes *routes = &phonet_pernet(net)->routes;
+
+	return rcu_dereference(routes->table[daddr >> 2]);
+}
+
+/*
+ * Pick the output device for daddr and return it with a reference held.
+ * Without a specific route, the device from phonet_device_get() is used
+ * as default route (which may be NULL).
+ */
+struct net_device *phonet_route_output(struct net *net, u8 daddr)
+{
+	struct phonet_routes *routes = &phonet_pernet(net)->routes;
+	struct net_device *out;
+
+	rcu_read_lock();
+	out = rcu_dereference(routes->table[daddr >> 2]);
+	if (out)
+		dev_hold(out);
+	rcu_read_unlock();
+
+	if (out)
+		return out;
+	return phonet_device_get(net); /* Default route */
+}
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
new file mode 100644
index 00000000..438accb7
--- /dev/null
+++ b/net/phonet/pn_netlink.c
@@ -0,0 +1,303 @@
+/*
+ * File: pn_netlink.c
+ *
+ * Phonet netlink interface
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/netlink.h>
+#include <linux/phonet.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/phonet/pn_dev.h>
+
+/* Device address handling */
+
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
+		     u32 pid, u32 seq, int event);
+
+/*
+ * Send an address notification (event, e.g. RTM_NEWADDR or RTM_DELADDR)
+ * for addr on dev to the RTNLGRP_PHONET_IFADDR group. If the message
+ * cannot be built, the error is recorded with rtnl_set_sk_err() instead.
+ */
+void phonet_address_notify(int event, struct net_device *dev, u8 addr)
+{
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	/* Room for the ifaddrmsg header plus the one-byte IFA_LOCAL */
+	skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+			nla_total_size(1), GFP_KERNEL);
+	if (skb == NULL)
+		goto errout;
+	err = fill_addr(skb, dev, addr, 0, 0, event);
+	if (err < 0) {
+		/* The skb was sized for this message; overflow is a bug */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, dev_net(dev), 0,
+		    RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
+	return;
+errout:
+	rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
+}
+
+/* Attribute policy for address requests: IFA_LOCAL is a single byte */
+static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = {
+	[IFA_LOCAL] = { .type = NLA_U8 },
+};
+
+/*
+ * rtnetlink handler for RTM_NEWADDR and RTM_DELADDR.
+ *
+ * Requires CAP_SYS_ADMIN. The request must carry IFA_LOCAL with the two
+ * low-order bits clear and name an existing interface index. On success
+ * the change is announced with phonet_address_notify().
+ *
+ * Returns 0 or a negative errno (-EPERM, -EINVAL, -ENODEV, or the error
+ * from parsing or from the add/delete operation).
+ */
+static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct net_device *dev;
+	struct ifaddrmsg *ifm;
+	int err;
+	u8 pnaddr;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ASSERT_RTNL();
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	if (tb[IFA_LOCAL] == NULL)
+		return -EINVAL;
+	pnaddr = nla_get_u8(tb[IFA_LOCAL]);
+	if (pnaddr & 3)
+		/* Phonet addresses only have 6 high-order bits */
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Any message type other than RTM_NEWADDR is treated as a delete */
+	if (nlh->nlmsg_type == RTM_NEWADDR)
+		err = phonet_address_add(dev, pnaddr);
+	else
+		err = phonet_address_del(dev, pnaddr);
+	if (!err)
+		phonet_address_notify(nlh->nlmsg_type, dev, pnaddr);
+	return err;
+}
+
+/*
+ * Append one Phonet address message (ifaddrmsg header plus IFA_LOCAL)
+ * for addr on dev to skb.
+ *
+ * Returns the nlmsg_end() result on success, or -EMSGSIZE when the
+ * message does not fit (the partial message is cancelled).
+ */
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr,
+			u32 pid, u32 seq, int event)
+{
+	struct ifaddrmsg *ifm;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), 0);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ifm = nlmsg_data(nlh);
+	ifm->ifa_family = AF_PHONET;
+	ifm->ifa_prefixlen = 0;
+	ifm->ifa_flags = IFA_F_PERMANENT;
+	ifm->ifa_scope = RT_SCOPE_LINK;
+	ifm->ifa_index = dev->ifindex;
+	/* presumably jumps to nla_put_failure when out of room */
+	NLA_PUT_U8(skb, IFA_LOCAL, addr);
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump callback for RTM_GETADDR: emit one RTM_NEWADDR message per
+ * address of every Phonet device in the namespace.
+ *
+ * cb->args[0] and cb->args[1] hold the device and address index at which
+ * the next invocation resumes. Both cursors are advanced only after an
+ * entry has been written, so an entry that did not fit into skb is
+ * retried on the next pass instead of being skipped.
+ *
+ * Returns skb->len.
+ */
+static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct phonet_device_list *pndevs;
+	struct phonet_device *pnd;
+	int dev_idx = 0, dev_start_idx = cb->args[0];
+	int addr_idx = 0, addr_start_idx = cb->args[1];
+
+	pndevs = phonet_device_list(sock_net(skb->sk));
+	rcu_read_lock();
+	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
+		u8 addr;
+
+		/* Devices fully dumped by an earlier pass */
+		if (dev_idx < dev_start_idx) {
+			dev_idx++;
+			continue;
+		}
+		/* The saved address offset only applies to the first device */
+		if (dev_idx > dev_start_idx)
+			addr_start_idx = 0;
+
+		addr_idx = 0;
+		for_each_set_bit(addr, pnd->addrs, 64) {
+			if (addr_idx < addr_start_idx) {
+				addr_idx++;
+				continue;
+			}
+
+			if (fill_addr(skb, pnd->netdev, addr << 2,
+					 NETLINK_CB(cb->skb).pid,
+					cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0)
+				goto out;
+			addr_idx++;
+		}
+		dev_idx++;
+	}
+
+out:
+	rcu_read_unlock();
+	cb->args[0] = dev_idx;
+	cb->args[1] = addr_idx;
+
+	return skb->len;
+}
+
+/* Routes handling */
+
+/*
+ * Append one Phonet route message (rtmsg header, RTA_DST and RTA_OIF)
+ * describing destination dst via dev to skb.
+ *
+ * Returns the nlmsg_end() result on success, or -EMSGSIZE when the
+ * message does not fit (the partial message is cancelled).
+ */
+static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst,
+			u32 pid, u32 seq, int event)
+{
+	struct rtmsg *rtm;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), 0);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family = AF_PHONET;
+	rtm->rtm_dst_len = 6; /* destination length is reported as 6 bits */
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = 0;
+	rtm->rtm_table = RT_TABLE_MAIN;
+	rtm->rtm_protocol = RTPROT_STATIC;
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	rtm->rtm_type = RTN_UNICAST;
+	rtm->rtm_flags = 0;
+	/* presumably these jump to nla_put_failure when out of room */
+	NLA_PUT_U8(skb, RTA_DST, dst);
+	NLA_PUT_U32(skb, RTA_OIF, dev->ifindex);
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Send a route notification (event, e.g. RTM_NEWROUTE or RTM_DELROUTE)
+ * for destination dst via dev to the RTNLGRP_PHONET_ROUTE group. If the
+ * message cannot be built, the error is recorded with rtnl_set_sk_err().
+ */
+void rtm_phonet_notify(int event, struct net_device *dev, u8 dst)
+{
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	/* Size for what fill_route() writes: rtmsg + RTA_DST + RTA_OIF */
+	skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct rtmsg)) +
+			nla_total_size(1) + nla_total_size(4), GFP_KERNEL);
+	if (skb == NULL)
+		goto errout;
+	err = fill_route(skb, dev, dst, 0, 0, event);
+	if (err < 0) {
+		/* The skb was sized for this message; overflow is a bug */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+	rtnl_notify(skb, dev_net(dev), 0,
+			  RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL);
+	return;
+errout:
+	rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err);
+}
+
+/* Attribute policy for route requests: byte destination, u32 interface */
+static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = {
+	[RTA_DST] = { .type = NLA_U8 },
+	[RTA_OIF] = { .type = NLA_U32 },
+};
+
+/*
+ * rtnetlink handler for RTM_NEWROUTE and RTM_DELROUTE.
+ *
+ * Requires CAP_SYS_ADMIN. Only unicast routes in the main table are
+ * accepted; RTA_DST (two low-order bits clear) and RTA_OIF (an existing
+ * interface index) are mandatory. On success the change is announced
+ * with rtm_phonet_notify().
+ *
+ * Returns 0 or a negative errno (-EPERM, -EINVAL, -ENODEV, or the error
+ * from parsing or from the add/delete operation).
+ */
+static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[RTA_MAX+1];
+	struct net_device *dev;
+	struct rtmsg *rtm;
+	int err;
+	u8 dst;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ASSERT_RTNL();
+
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy);
+	if (err < 0)
+		return err;
+
+	rtm = nlmsg_data(nlh);
+	if (rtm->rtm_table != RT_TABLE_MAIN || rtm->rtm_type != RTN_UNICAST)
+		return -EINVAL;
+	if (tb[RTA_DST] == NULL || tb[RTA_OIF] == NULL)
+		return -EINVAL;
+	dst = nla_get_u8(tb[RTA_DST]);
+	if (dst & 3) /* Phonet addresses only have 6 high-order bits */
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF]));
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Any message type other than RTM_NEWROUTE is treated as a delete */
+	if (nlh->nlmsg_type == RTM_NEWROUTE)
+		err = phonet_route_add(dev, dst);
+	else
+		err = phonet_route_del(dev, dst);
+	if (!err)
+		rtm_phonet_notify(nlh->nlmsg_type, dev, dst);
+	return err;
+}
+
+/*
+ * Dump callback for RTM_GETROUTE: emit one RTM_NEWROUTE message per
+ * populated slot of the namespace route table.
+ *
+ * cb->args[0] counts the routes already dumped and is where the next
+ * invocation resumes. Only a negative fill_route() result stops the
+ * pass, and the counter is advanced only after a route has been written,
+ * so a route that did not fit is retried rather than skipped.
+ *
+ * Returns skb->len.
+ */
+static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	u8 addr, addr_idx = 0, addr_start_idx = cb->args[0];
+
+	rcu_read_lock();
+	for (addr = 0; addr < 64; addr++) {
+		struct net_device *dev;
+
+		dev = phonet_route_get_rcu(net, addr << 2);
+		if (!dev)
+			continue;
+
+		/* Routes handled by an earlier pass */
+		if (addr_idx < addr_start_idx) {
+			addr_idx++;
+			continue;
+		}
+		if (fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).pid,
+				cb->nlh->nlmsg_seq, RTM_NEWROUTE) < 0)
+			goto out;
+		addr_idx++;
+	}
+
+out:
+	rcu_read_unlock();
+	cb->args[0] = addr_idx;
+	cb->args[1] = 0;
+
+	return skb->len;
+}
+
+/*
+ * Register the PF_PHONET rtnetlink handlers for address and route
+ * add/delete/dump messages. Returns 0, or the error from the first
+ * registration.
+ */
+int __init phonet_netlink_register(void)
+{
+	int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, NULL);
+	if (err)
+		return err;
+
+	/* Further __rtnl_register() cannot fail */
+	__rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL);
+	__rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit);
+	__rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL);
+	__rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL);
+	__rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit);
+	return 0;
+}
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
new file mode 100644
index 00000000..ab07711c
--- /dev/null
+++ b/net/phonet/socket.c
@@ -0,0 +1,825 @@
+/*
+ * File: socket.c
+ *
+ * Phonet sockets
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ * Original author: Sakari Ailus <sakari.ailus@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <linux/phonet.h>
+#include <net/phonet/phonet.h>
+#include <net/phonet/pep.h>
+#include <net/phonet/pn_dev.h>
+
+/*
+ * proto_ops release: detach the struct sock from @sock and pass it to the
+ * protocol's close() handler.  Does nothing if no sock is attached.
+ * Always returns 0.
+ */
+static int pn_socket_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk == NULL)
+		return 0;
+
+	sock->sk = NULL;
+	sk->sk_prot->close(sk, 0);
+	return 0;
+}
+
+/* Number of hash buckets; PN_HASHMASK only works for a power of two */
+#define PN_HASHSIZE	16
+#define PN_HASHMASK	(PN_HASHSIZE-1)
+
+
+/*
+ * Hash table of Phonet sockets, indexed through pn_hash_list().
+ * Insertions and removals take @lock; lookups run under rcu_read_lock().
+ */
+static struct  {
+	struct hlist_head hlist[PN_HASHSIZE];
+	struct mutex lock;
+} pnsocks;
+
+/* Initialize the socket hash table: its mutex and every (empty) bucket. */
+void __init pn_sock_init(void)
+{
+	unsigned bucket;
+
+	mutex_init(&pnsocks.lock);
+	for (bucket = 0; bucket < PN_HASHSIZE; bucket++)
+		INIT_HLIST_HEAD(&pnsocks.hlist[bucket]);
+}
+
+/* Return the hash bucket for object @obj (selected by its low-order bits). */
+static struct hlist_head *pn_hash_list(u16 obj)
+{
+	return &pnsocks.hlist[obj & PN_HASHMASK];
+}
+
+/*
+ * Look up a socket in namespace @net from a Phonet socket address.
+ *
+ * A non-zero port in @spn is matched against the socket's port; with a zero
+ * port the resource is matched instead.  A socket bound to a non-zero device
+ * address only matches that same address.
+ *
+ * The returned socket (if any) has been grabbed with sock_hold(); the caller
+ * must sock_put() it.  Returns NULL when nothing matches.
+ */
+struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn)
+{
+	u16 obj = pn_sockaddr_get_object(spn);
+	u8 res = spn->spn_resource;
+	struct hlist_head *bucket = pn_hash_list(obj);
+	struct hlist_node *pos;
+	struct sock *cur;
+	struct sock *found = NULL;
+
+	rcu_read_lock();
+	sk_for_each_rcu(cur, pos, bucket) {
+		struct pn_sock *pn = pn_sk(cur);
+
+		BUG_ON(!pn->sobject); /* unbound socket */
+
+		if (!net_eq(sock_net(cur), net))
+			continue;
+
+		if (pn_port(obj)) {
+			/* Non-zero port: the ports must agree */
+			if (pn_port(obj) != pn_port(pn->sobject))
+				continue;
+		} else if (res != pn->resource) {
+			/* Zero port: the resources must agree */
+			continue;
+		}
+
+		if (pn_addr(pn->sobject) &&
+		    pn_addr(obj) != pn_addr(pn->sobject))
+			continue;
+
+		sock_hold(cur);
+		found = cur;
+		break;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Deliver a broadcast packet (only in bottom-half).
+ *
+ * Every socket in namespace @net that has SOCK_BROADCAST set is handed its
+ * own clone of @skb; a socket is skipped if the clone cannot be allocated.
+ * Only clones are passed on here, never @skb itself.
+ */
+void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb)
+{
+	struct hlist_head *hlist = pnsocks.hlist;
+	unsigned h;
+
+	rcu_read_lock();
+	for (h = 0; h < PN_HASHSIZE; h++) {
+		struct hlist_node *node;
+		struct sock *sknode;
+
+		/*
+		 * The chains are updated with the _rcu list primitives and
+		 * are read here without pnsocks.lock, so use the RCU
+		 * iterator like the other lock-free readers of pnsocks.
+		 */
+		sk_for_each_rcu(sknode, node, hlist) {
+			struct sk_buff *clone;
+
+			if (!net_eq(sock_net(sknode), net))
+				continue;
+			if (!sock_flag(sknode, SOCK_BROADCAST))
+				continue;
+
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (clone) {
+				/*
+				 * NOTE(review): presumably sk_receive_skb()
+				 * drops this reference -- confirm.
+				 */
+				sock_hold(sknode);
+				sk_receive_skb(sknode, clone, 0);
+			}
+		}
+		hlist++;
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Add @sk to the hash bucket selected by its local object.  The insertion is
+ * done under pnsocks.lock with the RCU-safe list primitive.
+ */
+void pn_sock_hash(struct sock *sk)
+{
+	struct hlist_head *bucket;
+
+	bucket = pn_hash_list(pn_sk(sk)->sobject);
+
+	mutex_lock(&pnsocks.lock);
+	sk_add_node_rcu(sk, bucket);
+	mutex_unlock(&pnsocks.lock);
+}
+EXPORT_SYMBOL(pn_sock_hash);
+
+/*
+ * Remove @sk from the socket hash table and drop all of its resource
+ * bindings, then wait for an RCU grace period before returning.
+ */
+void pn_sock_unhash(struct sock *sk)
+{
+	mutex_lock(&pnsocks.lock);
+	sk_del_node_init_rcu(sk);
+	mutex_unlock(&pnsocks.lock);
+	pn_sock_unbind_all_res(sk);
+	/* Let RCU readers that may still see @sk finish */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(pn_sock_unhash);
+
+/*
+ * Held by pn_socket_bind() around sk_prot->get_port() and hashing;
+ * pn_sock_get_port() warns if it is called without it.
+ */
+static DEFINE_MUTEX(port_mutex);
+
+/*
+ * proto_ops bind: bind a Phonet socket to a local object (address and port).
+ *
+ * If the protocol provides its own bind() it is used instead.  Otherwise the
+ * address length and family are validated, a non-zero device address must
+ * pass phonet_address_lookup(), and the port is claimed via
+ * sk_prot->get_port() under port_mutex before the socket is hashed.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL (address too short, or
+ * the socket is not in TCP_CLOSE or already has a port), -EAFNOSUPPORT,
+ * -EADDRNOTAVAIL, or whatever get_port() returned.
+ */
+static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
+{
+	struct sock *sk = sock->sk;
+	struct pn_sock *pn = pn_sk(sk);
+	struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+	int err;
+	u16 handle;
+	u8 saddr;
+
+	/* Protocols with their own bind() handle everything themselves */
+	if (sk->sk_prot->bind)
+		return sk->sk_prot->bind(sk, addr, len);
+
+	if (len < sizeof(struct sockaddr_pn))
+		return -EINVAL;
+	if (spn->spn_family != AF_PHONET)
+		return -EAFNOSUPPORT;
+
+	handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr);
+	saddr = pn_addr(handle);
+	/* A zero address skips the lookup */
+	if (saddr && phonet_address_lookup(sock_net(sk), saddr))
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+	if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) {
+		err = -EINVAL; /* attempt to rebind */
+		goto out;
+	}
+	WARN_ON(sk_hashed(sk));
+	mutex_lock(&port_mutex);
+	err = sk->sk_prot->get_port(sk, pn_port(handle));
+	if (err)
+		goto out_port;
+
+	/* get_port() sets the port, bind() sets the address if applicable */
+	pn->sobject = pn_object(saddr, pn_port(pn->sobject));
+	pn->resource = spn->spn_resource;
+
+	/* Enable RX on the socket */
+	sk->sk_prot->hash(sk);
+out_port:
+	mutex_unlock(&port_mutex);
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * Bind @sock to an all-zero AF_PHONET address unless it is already bound.
+ *
+ * -EINVAL from pn_socket_bind() is taken to mean "already bound" (the socket
+ * must then have a port, otherwise BUG) and is turned into success.
+ * Returns 0 on success or the error from pn_socket_bind().
+ */
+static int pn_socket_autobind(struct socket *sock)
+{
+	struct sockaddr_pn sa;
+	int err;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.spn_family = AF_PHONET;
+
+	err = pn_socket_bind(sock, (struct sockaddr *)&sa, sizeof(sa));
+	if (err == -EINVAL) {
+		/* socket was already bound */
+		BUG_ON(!pn_port(pn_sk(sock->sk)->sobject));
+		err = 0;
+	}
+	return err;
+}
+
+/*
+ * proto_ops connect for stream sockets.
+ *
+ * Auto-binds the socket, records the destination object and resource,
+ * starts the connection through sk_prot->connect() and then sleeps until the
+ * socket leaves TCP_SYN_SENT, the timeout from sock_rcvtimeo() runs out, or
+ * a signal is pending.
+ *
+ * Returns 0 once the socket is in TCP_SYN_RECV or TCP_ESTABLISHED, or a
+ * negative errno: -ENOBUFS (auto-bind failed), -EINVAL, -EAFNOSUPPORT,
+ * -EISCONN, -EALREADY, the error from sk_prot->connect(), -EINPROGRESS
+ * (no time left to wait), the value of sock_intr_errno() on a signal,
+ * -ECONNRESET (TCP_CLOSE_WAIT) or -ECONNREFUSED.
+ */
+static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
+		int len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct pn_sock *pn = pn_sk(sk);
+	struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+	struct task_struct *tsk = current;
+	long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	int err;
+
+	if (pn_socket_autobind(sock))
+		return -ENOBUFS;
+	if (len < sizeof(struct sockaddr_pn))
+		return -EINVAL;
+	if (spn->spn_family != AF_PHONET)
+		return -EAFNOSUPPORT;
+
+	lock_sock(sk);
+
+	/* Only an unconnected socket in TCP_CLOSE may start connecting */
+	switch (sock->state) {
+	case SS_UNCONNECTED:
+		if (sk->sk_state != TCP_CLOSE) {
+			err = -EISCONN;
+			goto out;
+		}
+		break;
+	case SS_CONNECTING:
+		err = -EALREADY;
+		goto out;
+	default:
+		err = -EISCONN;
+		goto out;
+	}
+
+	pn->dobject = pn_sockaddr_get_object(spn);
+	pn->resource = pn_sockaddr_get_resource(spn);
+	sock->state = SS_CONNECTING;
+
+	err = sk->sk_prot->connect(sk, addr, len);
+	if (err) {
+		/* Roll back the state set just above */
+		sock->state = SS_UNCONNECTED;
+		pn->dobject = 0;
+		goto out;
+	}
+
+	while (sk->sk_state == TCP_SYN_SENT) {
+		DEFINE_WAIT(wait);
+
+		if (!timeo) {
+			err = -EINPROGRESS;
+			goto out;
+		}
+		if (signal_pending(tsk)) {
+			err = sock_intr_errno(timeo);
+			goto out;
+		}
+
+		/* The socket lock is dropped while sleeping */
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+						TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		finish_wait(sk_sleep(sk), &wait);
+	}
+
+	/* Map the final socket state to the connect() result */
+	if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
+		err = 0;
+	else if (sk->sk_state == TCP_CLOSE_WAIT)
+		err = -ECONNRESET;
+	else
+		err = -ECONNREFUSED;
+	sock->state = err ? SS_UNCONNECTED : SS_CONNECTED;
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ * proto_ops accept for stream sockets.
+ *
+ * Requires a listening socket (-EINVAL otherwise).  The new struct sock comes
+ * from sk_prot->accept(); on failure the error it stored is returned.  On
+ * success the new sock is grafted onto @newsock, which is marked connected.
+ */
+static int pn_socket_accept(struct socket *sock, struct socket *newsock,
+				int flags)
+{
+	struct sock *listener = sock->sk;
+	struct sock *child;
+	int err;
+
+	if (unlikely(listener->sk_state != TCP_LISTEN))
+		return -EINVAL;
+
+	child = listener->sk_prot->accept(listener, flags, &err);
+	if (child == NULL)
+		return err;
+
+	lock_sock(child);
+	sock_graft(child, newsock);
+	newsock->state = SS_CONNECTED;
+	release_sock(child);
+
+	return 0;
+}
+
+/*
+ * proto_ops getname: fill @addr with a zeroed AF_PHONET address and report
+ * its length.  For the local name (@peer == 0) the socket's own object is
+ * stored in the address; for the peer name the address is left zeroed apart
+ * from the family.  Always returns 0.
+ */
+static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
+				int *sockaddr_len, int peer)
+{
+	struct pn_sock *pn = pn_sk(sock->sk);
+	struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+
+	memset(addr, 0, sizeof(*spn));
+	addr->sa_family = AF_PHONET;
+	*sockaddr_len = sizeof(*spn);
+
+	if (peer)
+		return 0;
+
+	/* Race with bind() here is userland's problem. */
+	pn_sockaddr_set_object(spn, pn->sobject);
+	return 0;
+}
+
+/*
+ * proto_ops poll for stream sockets (the sock is accessed as a pep_sock).
+ *
+ * Returns POLLERR alone for a socket in TCP_CLOSE.  Otherwise reports
+ * POLLIN | POLLRDNORM when the receive queue is non-empty and POLLPRI when
+ * the control request queue is non-empty; POLLHUP alone when neither is set
+ * and the socket is in TCP_CLOSE_WAIT; and the write bits when the socket is
+ * established, sk_wmem_alloc is below sk_sndbuf and tx_credits is non-zero.
+ */
+static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
+					poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct pep_sock *pn = pep_sk(sk);
+	unsigned int mask = 0;
+
+	poll_wait(file, sk_sleep(sk), wait);
+
+	if (sk->sk_state == TCP_CLOSE)
+		return POLLERR;
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+	if (!skb_queue_empty(&pn->ctrlreq_queue))
+		mask |= POLLPRI;
+	/* Hang-up is only reported once nothing is left to read */
+	if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
+		return POLLHUP;
+
+	if (sk->sk_state == TCP_ESTABLISHED &&
+		atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf &&
+		atomic_read(&pn->tx_credits))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+}
+
+/*
+ * proto_ops ioctl.
+ *
+ * SIOCPNGETOBJECT reads a 16-bit handle from user space, picks the device
+ * the socket is bound to (sk_bound_dev_if) or else the one returned by
+ * phonet_device_get(), and, if that device is up, asks phonet_address_get()
+ * for an address using the address part of the handle.  That address,
+ * combined with the socket's own port, is written back to user space.
+ * Every other command is forwarded to sk_prot->ioctl().
+ *
+ * Returns -EFAULT on a failed user access and -EHOSTUNREACH when no address
+ * was obtained.
+ */
+static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
+				unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct pn_sock *pn = pn_sk(sk);
+
+	if (cmd == SIOCPNGETOBJECT) {
+		struct net_device *dev;
+		u16 handle;
+		u8 saddr;
+
+		if (get_user(handle, (__u16 __user *)arg))
+			return -EFAULT;
+
+		lock_sock(sk);
+		if (sk->sk_bound_dev_if)
+			dev = dev_get_by_index(sock_net(sk),
+						sk->sk_bound_dev_if);
+		else
+			dev = phonet_device_get(sock_net(sk));
+		/* A missing or down device yields no address */
+		if (dev && (dev->flags & IFF_UP))
+			saddr = phonet_address_get(dev, pn_addr(handle));
+		else
+			saddr = PN_NO_ADDR;
+		release_sock(sk);
+
+		/* Drop the device reference taken by the lookup above */
+		if (dev)
+			dev_put(dev);
+		if (saddr == PN_NO_ADDR)
+			return -EHOSTUNREACH;
+
+		handle = pn_object(saddr, pn_port(pn->sobject));
+		return put_user(handle, (__u16 __user *)arg);
+	}
+
+	return sk->sk_prot->ioctl(sk, cmd, arg);
+}
+
+/*
+ * proto_ops listen for stream sockets.
+ *
+ * Auto-binds the socket (-ENOBUFS on failure).  An unconnected socket is
+ * switched to TCP_LISTEN with an empty accept backlog unless it is listening
+ * already, and its maximum backlog is set to @backlog.  Returns -EINVAL if
+ * the socket is not in the SS_UNCONNECTED state.
+ */
+static int pn_socket_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	if (pn_socket_autobind(sock))
+		return -ENOBUFS;
+
+	lock_sock(sk);
+	if (sock->state == SS_UNCONNECTED) {
+		if (sk->sk_state != TCP_LISTEN) {
+			sk->sk_state = TCP_LISTEN;
+			sk->sk_ack_backlog = 0;
+		}
+		sk->sk_max_ack_backlog = backlog;
+	} else {
+		err = -EINVAL;
+	}
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * proto_ops sendmsg: make sure the socket is bound, then hand the message to
+ * the protocol's sendmsg().  Returns -EAGAIN if auto-binding fails.
+ */
+static int pn_socket_sendmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	int bind_err = pn_socket_autobind(sock);
+
+	if (bind_err != 0)
+		return -EAGAIN;
+
+	return sk->sk_prot->sendmsg(iocb, sk, m, total_len);
+}
+
+/*
+ * Socket operations for Phonet datagram sockets.  connect, socketpair,
+ * accept, listen, shutdown, the socket options, mmap and sendpage are wired
+ * to the sock_no_*() stubs.
+ */
+const struct proto_ops phonet_dgram_ops = {
+	.family		= AF_PHONET,
+	.owner		= THIS_MODULE,
+	.release	= pn_socket_release,
+	.bind		= pn_socket_bind,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= pn_socket_getname,
+	.poll		= datagram_poll,
+	.ioctl		= pn_socket_ioctl,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_no_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = sock_no_setsockopt,
+	.compat_getsockopt = sock_no_getsockopt,
+#endif
+	.sendmsg	= pn_socket_sendmsg,
+	.recvmsg	= sock_common_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+/*
+ * Socket operations for Phonet stream sockets.  connect, accept, listen and
+ * poll use the pn_socket_*() handlers of this file, and socket options go
+ * through the sock_common_*() helpers.
+ */
+const struct proto_ops phonet_stream_ops = {
+	.family		= AF_PHONET,
+	.owner		= THIS_MODULE,
+	.release	= pn_socket_release,
+	.bind		= pn_socket_bind,
+	.connect	= pn_socket_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= pn_socket_accept,
+	.getname	= pn_socket_getname,
+	.poll		= pn_socket_poll,
+	.ioctl		= pn_socket_ioctl,
+	.listen		= pn_socket_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= sock_common_setsockopt,
+	.getsockopt	= sock_common_getsockopt,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+	.sendmsg	= pn_socket_sendmsg,
+	.recvmsg	= sock_common_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+EXPORT_SYMBOL(phonet_stream_ops);
+
+/*
+ * Allocate a port for a socket.
+ *
+ * With @sport == 0, walks the local port range looking for a port that
+ * pn_find_sock_by_sa() does not find in use, continuing after the port tried
+ * last (port_cur).  Otherwise only @sport itself is checked.  On success the
+ * port is stored in the socket's sobject, keeping its address part.
+ *
+ * The caller must hold port_mutex (checked with WARN_ON).
+ * Returns 0 on success or -EADDRINUSE when no suitable port is free.
+ */
+int pn_sock_get_port(struct sock *sk, unsigned short sport)
+{
+	/* Last port tried by the search; persists across calls */
+	static int port_cur;
+	struct net *net = sock_net(sk);
+	struct pn_sock *pn = pn_sk(sk);
+	struct sockaddr_pn try_sa;
+	struct sock *tmpsk;
+
+	memset(&try_sa, 0, sizeof(struct sockaddr_pn));
+	try_sa.spn_family = AF_PHONET;
+	WARN_ON(!mutex_is_locked(&port_mutex));
+	if (!sport) {
+		/* search free port */
+		int port, pmin, pmax;
+
+		phonet_get_local_port_range(&pmin, &pmax);
+		/* 'port' only counts attempts; the candidate is port_cur */
+		for (port = pmin; port <= pmax; port++) {
+			port_cur++;
+			if (port_cur < pmin || port_cur > pmax)
+				port_cur = pmin;
+
+			pn_sockaddr_set_port(&try_sa, port_cur);
+			tmpsk = pn_find_sock_by_sa(net, &try_sa);
+			if (tmpsk == NULL) {
+				sport = port_cur;
+				goto found;
+			} else
+				sock_put(tmpsk);
+		}
+	} else {
+		/* try to find specific port */
+		pn_sockaddr_set_port(&try_sa, sport);
+		tmpsk = pn_find_sock_by_sa(net, &try_sa);
+		if (tmpsk == NULL)
+			/* No sock there! We can use that port... */
+			goto found;
+		else
+			sock_put(tmpsk);
+	}
+	/* the port must be in use already */
+	return -EADDRINUSE;
+
+found:
+	pn->sobject = pn_object(pn_addr(pn->sobject), sport);
+	return 0;
+}
+EXPORT_SYMBOL(pn_sock_get_port);
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Return the @pos-th socket (counting from 0) of the seq_file's network
+ * namespace, scanning the hash buckets in order, or NULL if there are fewer.
+ */
+static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct hlist_node *node;
+	struct sock *sknode;
+	unsigned bucket;
+
+	for (bucket = 0; bucket < PN_HASHSIZE; bucket++) {
+		sk_for_each_rcu(sknode, node, &pnsocks.hlist[bucket]) {
+			if (!net_eq(net, sock_net(sknode)))
+				continue;
+			if (pos == 0)
+				return sknode;
+			--pos;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Advance from @sk with sk_next() to the next socket that belongs to the
+ * seq_file's network namespace.  Returns NULL when sk_next() runs out.
+ */
+static struct sock *pn_sock_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct net *net = seq_file_net(seq);
+
+	for (sk = sk_next(sk); sk != NULL; sk = sk_next(sk)) {
+		if (net_eq(net, sock_net(sk)))
+			break;
+	}
+
+	return sk;
+}
+
+/*
+ * seq_file start: enter the RCU read side and return the header token for
+ * position 0, otherwise the socket at index *pos - 1.
+ */
+static void *pn_sock_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rcu)
+{
+	rcu_read_lock();
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return pn_sock_get_idx(seq, *pos - 1);
+}
+
+/*
+ * seq_file next: after the header token return the first socket, otherwise
+ * the socket following @v; *pos is advanced in either case.
+ */
+static void *pn_sock_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk = (v == SEQ_START_TOKEN) ?
+			pn_sock_get_idx(seq, 0) : pn_sock_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+
+/* seq_file stop: leave the RCU read side entered by pn_sock_seq_start(). */
+static void pn_sock_seq_stop(struct seq_file *seq, void *v)
+	__releases(rcu)
+{
+	rcu_read_unlock();
+}
+
+/*
+ * seq_file show: print the column header for the start token, or one line
+ * describing socket @v.  Every line is padded with spaces to a fixed width of
+ * 127 characters before the newline.  Always returns 0.
+ */
+static int pn_sock_seq_show(struct seq_file *seq, void *v)
+{
+	/* Filled in through %n: number of characters printed so far */
+	int len;
+
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%s%n", "pt  loc  rem rs st tx_queue rx_queue "
+			"  uid inode ref pointer drops", &len);
+	else {
+		struct sock *sk = v;
+		struct pn_sock *pn = pn_sk(sk);
+
+		seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu "
+			"%d %pK %d%n",
+			sk->sk_protocol, pn->sobject, pn->dobject,
+			pn->resource, sk->sk_state,
+			sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
+			sock_i_uid(sk), sock_i_ino(sk),
+			atomic_read(&sk->sk_refcnt), sk,
+			atomic_read(&sk->sk_drops), &len);
+	}
+	/* Pad the record to its fixed width */
+	seq_printf(seq, "%*s\n", 127 - len, "");
+	return 0;
+}
+
+/* seq_file iterator over the Phonet socket hash table */
+static const struct seq_operations pn_sock_seq_ops = {
+	.start = pn_sock_seq_start,
+	.next = pn_sock_seq_next,
+	.stop = pn_sock_seq_stop,
+	.show = pn_sock_seq_show,
+};
+
+/* open(): set up a per-namespace seq_file that uses pn_sock_seq_ops. */
+static int pn_sock_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &pn_sock_seq_ops,
+				sizeof(struct seq_net_private));
+}
+
+/* File operations for the seq_file listing of Phonet sockets */
+const struct file_operations pn_sock_seq_fops = {
+	.owner = THIS_MODULE,
+	.open = pn_sock_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_net,
+};
+#endif
+
+/*
+ * Resource table: one socket pointer per 8-bit resource, NULL when the
+ * resource is unbound.  Updated under resource_mutex, read under RCU.
+ */
+static struct  {
+	struct sock *sk[256];
+} pnres;
+
+/*
+ * Find the socket bound to resource @res and take a reference on it.
+ * Resources only exist in the initial network namespace; any other @net
+ * yields NULL, as does an unbound resource.
+ */
+struct sock *pn_find_sock_by_res(struct net *net, u8 res)
+{
+	struct sock *sk = NULL;
+
+	if (net_eq(net, &init_net)) {
+		rcu_read_lock();
+		sk = rcu_dereference(pnres.sk[res]);
+		if (sk != NULL)
+			sock_hold(sk);
+		rcu_read_unlock();
+	}
+	return sk;
+}
+
+/* Serializes updates of pnres.sk[]; also held while the table is listed */
+static DEFINE_MUTEX(resource_mutex);
+
+/*
+ * Bind resource @res to socket @sk.
+ *
+ * Only permitted in the initial network namespace and with CAP_SYS_ADMIN.
+ * The socket is auto-bound first.  A successful binding holds a reference on
+ * @sk for as long as the table entry points to it.
+ *
+ * Returns 0, -ENOIOCTLCMD (other namespace), -EPERM, -EAGAIN (auto-bind
+ * failed) or -EADDRINUSE (resource already bound).
+ */
+int pn_sock_bind_res(struct sock *sk, u8 res)
+{
+	int ret = -EADDRINUSE;
+
+	if (!net_eq(sock_net(sk), &init_net))
+		return -ENOIOCTLCMD;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (pn_socket_autobind(sk->sk_socket))
+		return -EAGAIN;
+
+	mutex_lock(&resource_mutex);
+	if (pnres.sk[res] == NULL) {
+		/* Reference owned by the table entry */
+		sock_hold(sk);
+		rcu_assign_pointer(pnres.sk[res], sk);
+		ret = 0;
+	}
+	mutex_unlock(&resource_mutex);
+	return ret;
+}
+
+/*
+ * Unbind resource @res from socket @sk.
+ *
+ * Requires CAP_SYS_ADMIN.  If @res is currently bound to @sk the table entry
+ * is cleared and, after an RCU grace period, the table's reference on @sk is
+ * dropped.
+ *
+ * Returns 0, -EPERM, or -ENOENT if @res is not bound to @sk.
+ */
+int pn_sock_unbind_res(struct sock *sk, u8 res)
+{
+	int ret = -ENOENT;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&resource_mutex);
+	if (pnres.sk[res] == sk) {
+		rcu_assign_pointer(pnres.sk[res], NULL);
+		ret = 0;
+	}
+	mutex_unlock(&resource_mutex);
+
+	if (ret == 0) {
+		/* Wait for RCU readers before dropping the reference */
+		synchronize_rcu();
+		sock_put(sk);
+	}
+	return ret;
+}
+
+/*
+ * Clear every resource table entry that points to @sk and drop one
+ * reference (with __sock_put()) per entry cleared.
+ */
+void pn_sock_unbind_all_res(struct sock *sk)
+{
+	unsigned res, match = 0;
+
+	mutex_lock(&resource_mutex);
+	for (res = 0; res < 256; res++) {
+		if (pnres.sk[res] != sk)
+			continue;
+		rcu_assign_pointer(pnres.sk[res], NULL);
+		match++;
+	}
+	mutex_unlock(&resource_mutex);
+
+	for (; match > 0; match--)
+		__sock_put(sk);
+	/* Caller is responsible for RCU sync before final sock_put() */
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Return the table slot of the @pos-th bound resource (counting from 0), or
+ * NULL if there are fewer or the seq_file is not in the initial namespace.
+ */
+static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock **slot;
+
+	if (!net_eq(seq_file_net(seq), &init_net))
+		return NULL;
+
+	for (slot = pnres.sk; slot < pnres.sk + 256; slot++) {
+		if (*slot == NULL)
+			continue;
+		if (pos == 0)
+			return slot;
+		pos--;
+	}
+	return NULL;
+}
+
+/*
+ * Return the first bound slot after @sk in the resource table, or NULL if
+ * there is none.  Must only be reached for the initial namespace (BUG
+ * otherwise).
+ */
+static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
+{
+	struct sock **slot;
+
+	BUG_ON(!net_eq(seq_file_net(seq), &init_net));
+
+	for (slot = sk + 1; slot < pnres.sk + 256; slot++) {
+		if (*slot != NULL)
+			return slot;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file start: take resource_mutex and return the header token for
+ * position 0, otherwise the slot of the bound resource at index *pos - 1.
+ */
+static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(resource_mutex)
+{
+	mutex_lock(&resource_mutex);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return pn_res_get_idx(seq, *pos - 1);
+}
+
+/*
+ * seq_file next: after the header token return the first bound slot,
+ * otherwise the bound slot following @v; *pos is advanced in either case.
+ */
+static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock **slot = (v == SEQ_START_TOKEN) ?
+			pn_res_get_idx(seq, 0) : pn_res_get_next(seq, v);
+
+	++*pos;
+	return slot;
+}
+
+/* seq_file stop: release resource_mutex taken by pn_res_seq_start(). */
+static void pn_res_seq_stop(struct seq_file *seq, void *v)
+	__releases(resource_mutex)
+{
+	mutex_unlock(&resource_mutex);
+}
+
+/*
+ * seq_file show: print the column header for the start token, or the
+ * resource number, uid and inode for the table slot @v.  Every line is
+ * padded with spaces to 63 characters before the newline.  Always returns 0.
+ */
+static int pn_res_seq_show(struct seq_file *seq, void *v)
+{
+	/* Filled in through %n: number of characters printed so far */
+	int len;
+
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%s%n", "rs   uid inode", &len);
+	else {
+		struct sock **psk = v;
+		struct sock *sk = *psk;
+
+		/* The resource number is the slot's index in pnres.sk[] */
+		seq_printf(seq, "%02X %5d %lu%n",
+			   (int) (psk - pnres.sk), sock_i_uid(sk),
+			   sock_i_ino(sk), &len);
+	}
+	seq_printf(seq, "%*s\n", 63 - len, "");
+	return 0;
+}
+
+/* seq_file iterator over the bound entries of the resource table */
+static const struct seq_operations pn_res_seq_ops = {
+	.start = pn_res_seq_start,
+	.next = pn_res_seq_next,
+	.stop = pn_res_seq_stop,
+	.show = pn_res_seq_show,
+};
+
+/* open(): set up a per-namespace seq_file that uses pn_res_seq_ops. */
+static int pn_res_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &pn_res_seq_ops,
+				sizeof(struct seq_net_private));
+}
+
+/* File operations for the seq_file listing of bound Phonet resources */
+const struct file_operations pn_res_seq_fops = {
+	.owner = THIS_MODULE,
+	.open = pn_res_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_net,
+};
+#endif
diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c
new file mode 100644
index 00000000..cea1c7db
--- /dev/null
+++ b/net/phonet/sysctl.c
@@ -0,0 +1,111 @@
+/*
+ * File: sysctl.c
+ *
+ * Phonet /proc/sys/net/phonet interface implementation
+ *
+ * Copyright (C) 2008 Nokia Corporation.
+ *
+ * Contact: Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/seqlock.h>
+#include <linux/sysctl.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+/* Initial values of the local port range */
+#define DYNAMIC_PORT_MIN	0x40
+#define DYNAMIC_PORT_MAX	0x7f
+
+/* Protects local_port_range[] against concurrent updates */
+static DEFINE_SEQLOCK(local_port_range_lock);
+/* Limits passed to proc_dointvec_minmax() as extra1/extra2 */
+static int local_port_range_min[2] = {0, 0};
+static int local_port_range_max[2] = {1023, 1023};
+/* Current range: [0] is the minimum, [1] the maximum */
+static int local_port_range[2] = {DYNAMIC_PORT_MIN, DYNAMIC_PORT_MAX};
+/* Handle from register_sysctl_paths(), needed to unregister */
+static struct ctl_table_header *phonet_table_hrd;
+
+/* Publish a new {min, max} pair as one update under the seqlock. */
+static void set_local_port_range(int range[2])
+{
+	write_seqlock(&local_port_range_lock);
+	local_port_range[0] = range[0];
+	local_port_range[1] = range[1];
+	write_sequnlock(&local_port_range_lock);
+}
+
+/*
+ * Read the current local port range.  Either of @min and @max may be NULL if
+ * the caller does not need that value.  The read is repeated until it did
+ * not race with a writer, so both values belong to the same update.
+ */
+void phonet_get_local_port_range(int *min, int *max)
+{
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&local_port_range_lock);
+		if (min)
+			*min = local_port_range[0];
+		if (max)
+			*max = local_port_range[1];
+	} while (read_seqretry(&local_port_range_lock, seq));
+}
+
+/*
+ * sysctl handler for the "local_port_range" entry.
+ *
+ * proc_dointvec_minmax() works on a private copy of the range, bounded by
+ * local_port_range_min/max.  The copy is taken with
+ * phonet_get_local_port_range() so that it is a consistent {min, max} pair
+ * even while another writer is updating the range.  After a successful
+ * write the new pair is rejected with -EINVAL if max < min and is otherwise
+ * published with set_local_port_range().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int proc_local_port_range(ctl_table *table, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	int range[2];
+	ctl_table tmp = {
+		.data = &range,
+		.maxlen = sizeof(range),
+		.mode = table->mode,
+		.extra1 = &local_port_range_min,
+		.extra2 = &local_port_range_max,
+	};
+
+	/* Snapshot under the seqlock rather than reading the array unlocked */
+	phonet_get_local_port_range(&range[0], &range[1]);
+
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		if (range[1] < range[0])
+			ret = -EINVAL;
+		else
+			set_local_port_range(range);
+	}
+
+	return ret;
+}
+
+/* sysctl entries of the Phonet directory; the empty entry ends the table */
+static struct ctl_table phonet_table[] = {
+	{
+		.procname	= "local_port_range",
+		.data		= &local_port_range,
+		.maxlen		= sizeof(local_port_range),
+		.mode		= 0644,
+		.proc_handler	= proc_local_port_range,
+	},
+	{ }
+};
+
+/* Path components under which phonet_table is registered: net, phonet */
+static struct ctl_path phonet_ctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "phonet", },
+	{ },
+};
+
+/*
+ * Register the Phonet sysctl table.  Returns 0 on success or -ENOMEM if
+ * register_sysctl_paths() returns no header.
+ */
+int __init phonet_sysctl_init(void)
+{
+	phonet_table_hrd = register_sysctl_paths(phonet_ctl_path, phonet_table);
+	if (phonet_table_hrd == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Unregister the sysctl table registered by phonet_sysctl_init(). */
+void phonet_sysctl_exit(void)
+{
+	unregister_sysctl_table(phonet_table_hrd);
+}
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 00000000..ec753b3a
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,28 @@
+
+config RDS
+	tristate "The RDS Protocol (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	---help---
+	  The RDS (Reliable Datagram Sockets) protocol provides reliable,
+	  sequenced delivery of datagrams over Infiniband, iWARP,
+	  or TCP.
+
+config RDS_RDMA
+	tristate "RDS over Infiniband and iWARP"
+	depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+	---help---
+	  Allow RDS to use Infiniband and iWARP as a transport.
+	  This transport supports RDMA operations.
+
+config RDS_TCP
+	tristate "RDS over TCP"
+	depends on RDS
+	---help---
+	  Allow RDS to use TCP as a transport.
+	  This transport does not support RDMA operations.
+
+config RDS_DEBUG
+        bool "RDS debugging messages"
+	depends on RDS
+        default n
+
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 00000000..56d3f602
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_RDS) += rds.o
+rds-y :=	af_rds.o bind.o cong.o connection.o info.o message.o   \
+			recv.o send.o stats.o sysctl.o threads.o transport.o \
+			loop.o page.o rdma.o
+
+obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
+rds_rdma-y :=	rdma_transport.o \
+			ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
+			ib_sysctl.o ib_rdma.o \
+			iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
+			iw_sysctl.o iw_rdma.o
+
+
+obj-$(CONFIG_RDS_TCP) += rds_tcp.o
+rds_tcp-y :=		tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
+			tcp_send.o tcp_stats.o
+
+ccflags-$(CONFIG_RDS_DEBUG)	:=	-DDEBUG
+
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 00000000..424ff622
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+
+#include "rds.h"
+
+/*
+ * Look up entry @index in a table of @elements strings.  Returns "unknown"
+ * when @index is out of range or the entry is NULL.
+ */
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+	if (index >= elements || !array[index])
+		return "unknown";
+
+	return array[index];
+}
+EXPORT_SYMBOL(rds_str_array);
+
+/*
+ * This is just used for stats gathering :/
+ * rds_release() takes rds_sock_lock around the rds_sock_count update and the
+ * removal of the socket's rs_item from its list.
+ */
+static DEFINE_SPINLOCK(rds_sock_lock);
+static unsigned long rds_sock_count;
+static LIST_HEAD(rds_sock_list);
+/* Extra wait queue rds_poll() adds for sockets with rs_seen_congestion set */
+DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path.  sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ *
+ * Always returns 0; a socket without an attached sock is left untouched.
+ */
+static int rds_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs;
+
+	if (!sk)
+		goto out;
+
+	rs = rds_sk_to_rs(sk);
+
+	sock_orphan(sk);
+	/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
+	 * that ensures the recv path has completed messing
+	 * with the socket. */
+	rds_clear_recv_queue(rs);
+	rds_cong_remove_socket(rs);
+
+	/*
+	 * the binding lookup hash uses rcu, we need to
+	 * make sure we synchronize_rcu before we free our
+	 * entry
+	 */
+	rds_remove_bound(rs);
+	synchronize_rcu();
+
+	rds_send_drop_to(rs, NULL);
+	rds_rdma_drop_keys(rs);
+	rds_notify_queue_get(rs, NULL);
+
+	/* Take the socket off the stats list */
+	spin_lock_bh(&rds_sock_lock);
+	list_del_init(&rs->rs_item);
+	rds_sock_count--;
+	spin_unlock_bh(&rds_sock_lock);
+
+	rds_trans_put(rs->rs_transport);
+
+	sock->sk = NULL;
+	sock_put(sk);
+out:
+	return 0;
+}
+
+/*
+ * Wake up the tasks sleeping on the socket's wait queue.
+ *
+ * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
+ * NB - normally, one would use sk_callback_lock for this, but we can
+ * get here from interrupts, whereas the network code grabs sk_callback_lock
+ * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
+ */
+void rds_wake_sk_sleep(struct rds_sock *rs)
+{
+	unsigned long flags;
+
+	/* rs_recv_lock is taken for reading, with interrupts disabled */
+	read_lock_irqsave(&rs->rs_recv_lock, flags);
+	__rds_wake_sk_sleep(rds_rs_to_sk(rs));
+	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * Report the socket's bound address (@peer == 0) or connected address
+ * (@peer != 0) as an AF_INET sockaddr_in and store its size in @uaddr_len.
+ *
+ * Returns 0, or -ENOTCONN when the peer is requested and rs_conn_addr is 0.
+ */
+static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
+		       int *uaddr_len, int peer)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+
+	/* Clear the padding of the sockaddr_in before filling it in */
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+	/* racey, don't care */
+	if (peer) {
+		if (!rs->rs_conn_addr)
+			return -ENOTCONN;
+
+		sin->sin_port = rs->rs_conn_port;
+		sin->sin_addr.s_addr = rs->rs_conn_addr;
+	} else {
+		sin->sin_port = rs->rs_bound_port;
+		sin->sin_addr.s_addr = rs->rs_bound_addr;
+	}
+
+	sin->sin_family = AF_INET;
+
+	*uaddr_len = sizeof(*sin);
+	return 0;
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * POLLIN is asserted if
+ *  -	there is data on the receive queue.
+ *  -	to signal that a previously congested destination may have become
+ *	uncongested
+ *  -	A notification has been queued to the socket (this can be a congestion
+ *	update, or a RDMA completion).
+ *
+ * POLLOUT is asserted if there is room on the send queue. This does not mean
+ * however, that the next sendmsg() call will succeed. If the application tries
+ * to send to a congested destination, the system call may still fail (and
+ * return ENOBUFS).
+ *
+ * Returns the poll event mask.
+ */
+static unsigned int rds_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	unsigned int mask = 0;
+	unsigned long flags;
+
+	poll_wait(file, sk_sleep(sk), wait);
+
+	/* Sockets that have seen congestion also wait on the global queue */
+	if (rs->rs_seen_congestion)
+		poll_wait(file, &rds_poll_waitq, wait);
+
+	read_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!rs->rs_cong_monitor) {
+		/* When a congestion map was updated, we signal POLLIN for
+		 * "historical" reasons. Applications can also poll for
+		 * WRBAND instead. */
+		if (rds_cong_updated_since(&rs->rs_cong_track))
+			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+	} else {
+		/* Monitoring sockets report a pending congestion notification */
+		spin_lock(&rs->rs_lock);
+		if (rs->rs_cong_notify)
+			mask |= (POLLIN | POLLRDNORM);
+		spin_unlock(&rs->rs_lock);
+	}
+	if (!list_empty(&rs->rs_recv_queue) ||
+	    !list_empty(&rs->rs_notify_queue))
+		mask |= (POLLIN | POLLRDNORM);
+	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
+		mask |= (POLLOUT | POLLWRNORM);
+	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	/* clear state any time we wake a seen-congested socket */
+	if (mask)
+		rs->rs_seen_congestion = 0;
+
+	return mask;
+}
+
+/* No ioctl commands are handled here: always returns -ENOIOCTLCMD. */
+static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+
+/*
+ * RDS_CANCEL_SENT_TO: copy a sockaddr_in from user space and pass it to
+ * rds_send_drop_to().
+ *
+ * Returns 0, -ENOTCONN if the socket has no bound address, -EINVAL if @len
+ * is smaller than a sockaddr_in, or -EFAULT if the copy fails.
+ */
+static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
+			      int len)
+{
+	struct sockaddr_in sin;
+
+	/* racing with another thread binding seems ok here */
+	if (rs->rs_bound_addr == 0)
+		return -ENOTCONN; /* XXX not a great errno */
+
+	if (len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (copy_from_user(&sin, optval, sizeof(sin)))
+		return -EFAULT;
+
+	rds_send_drop_to(rs, &sin);
+	return 0;
+}
+
+/*
+ * Read an int option value from user space and store it in *@optvar as 0 or
+ * 1.  Returns 0, -EINVAL if @optlen is smaller than an int, or -EFAULT if
+ * the value cannot be read.
+ */
+static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+			       int optlen)
+{
+	int value;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(value, (int __user *) optval))
+		return -EFAULT;
+
+	*optvar = (value != 0) ? 1 : 0;
+	return 0;
+}
+
+/*
+ * RDS_CONG_MONITOR: set rs_cong_monitor from the user-supplied boolean.
+ * When it ends up set the socket is added with rds_cong_add_socket();
+ * otherwise it is removed again and its congestion mask and notify state
+ * are cleared.  Returns 0 or the error from rds_set_bool_option().
+ */
+static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
+			    int optlen)
+{
+	int ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+
+	if (ret != 0)
+		return ret;
+
+	if (!rs->rs_cong_monitor) {
+		rds_cong_remove_socket(rs);
+		rs->rs_cong_mask = 0;
+		rs->rs_cong_notify = 0;
+	} else {
+		rds_cong_add_socket(rs);
+	}
+	return 0;
+}
+
+/*
+ * setsockopt() for RDS sockets: dispatch SOL_RDS options to their handlers.
+ * Returns the handler's result, or -ENOPROTOOPT for any other level or an
+ * unknown option.
+ */
+static int rds_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+
+	if (level != SOL_RDS)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case RDS_CANCEL_SENT_TO:
+		return rds_cancel_sent_to(rs, optval, optlen);
+	case RDS_GET_MR:
+		return rds_get_mr(rs, optval, optlen);
+	case RDS_GET_MR_FOR_DEST:
+		return rds_get_mr_for_dest(rs, optval, optlen);
+	case RDS_FREE_MR:
+		return rds_free_mr(rs, optval, optlen);
+	case RDS_RECVERR:
+		return rds_set_bool_option(&rs->rs_recverr, optval, optlen);
+	case RDS_CONG_MONITOR:
+		return rds_cong_monitor(rs, optval, optlen);
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+static int rds_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+	int ret = -ENOPROTOOPT, len;
+
+	/* anything that is not SOL_RDS is refused with -ENOPROTOOPT */
+	if (level != SOL_RDS)
+		goto out;
+
+	if (get_user(len, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	switch (optname) {
+	case RDS_INFO_FIRST ... RDS_INFO_LAST:
+		ret = rds_info_getsockopt(sock, optname, optval,
+					  optlen);
+		break;
+
+	case RDS_RECVERR:
+		/* compare as int: against a bare size_t a negative len
+		 * would be converted to a huge value and pass the check */
+		if (len < (int)sizeof(int))
+			ret = -EINVAL;
+		else if (put_user(rs->rs_recverr, (int __user *) optval) ||
+			 put_user(sizeof(int), optlen))
+			ret = -EFAULT;
+		else
+			ret = 0;
+		break;
+	default:
+		break;
+	}
+out:
+	return ret;
+}
+
+static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+		       int addr_len, int flags)
+{
+	struct sockaddr_in *dst = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	int err;
+
+	/*
+	 * Validate the destination and, if it is acceptable, record it as
+	 * the socket's connected address and port.  Nothing else happens
+	 * here; the whole sequence runs with the socket locked.
+	 */
+	lock_sock(sk);
+
+	if (addr_len != sizeof(struct sockaddr_in)) {
+		err = -EINVAL;
+	} else if (dst->sin_family != AF_INET) {
+		err = -EAFNOSUPPORT;
+	} else if (dst->sin_addr.s_addr == htonl(INADDR_ANY)) {
+		/* a wildcard destination is not usable */
+		err = -EDESTADDRREQ;
+	} else {
+		rs->rs_conn_addr = dst->sin_addr.s_addr;
+		rs->rs_conn_port = dst->sin_port;
+		err = 0;
+	}
+
+	release_sock(sk);
+
+	return err;
+}
+
+static struct proto rds_proto = {
+	.name	  = "RDS",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct rds_sock),	/* socks of this proto are rds_socks */
+};
+
+static const struct proto_ops rds_proto_ops = {
+	.family =	AF_RDS,
+	.owner =	THIS_MODULE,
+	.release =	rds_release,
+	.bind =		rds_bind,
+	.connect =	rds_connect,
+	.socketpair =	sock_no_socketpair,	/* generic "not supported" stub */
+	.accept =	sock_no_accept,		/* generic "not supported" stub */
+	.getname =	rds_getname,
+	.poll =		rds_poll,
+	.ioctl =	rds_ioctl,
+	.listen =	sock_no_listen,		/* generic "not supported" stub */
+	.shutdown =	sock_no_shutdown,	/* generic "not supported" stub */
+	.setsockopt =	rds_setsockopt,
+	.getsockopt =	rds_getsockopt,
+	.sendmsg =	rds_sendmsg,
+	.recvmsg =	rds_recvmsg,
+	.mmap =		sock_no_mmap,		/* generic "not supported" stub */
+	.sendpage =	sock_no_sendpage,
+};
+
+static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
+{
+	struct rds_sock *rs;
+
+	sock_init_data(sock, sk);
+	sock->ops		= &rds_proto_ops;
+	sk->sk_protocol		= protocol;
+	/* set up the RDS-private locks, queues and MR key tree */
+	rs = rds_sk_to_rs(sk);
+	spin_lock_init(&rs->rs_lock);
+	rwlock_init(&rs->rs_recv_lock);
+	INIT_LIST_HEAD(&rs->rs_send_queue);
+	INIT_LIST_HEAD(&rs->rs_recv_queue);
+	INIT_LIST_HEAD(&rs->rs_notify_queue);
+	INIT_LIST_HEAD(&rs->rs_cong_list);
+	spin_lock_init(&rs->rs_rdma_lock);
+	rs->rs_rdma_keys = RB_ROOT;
+	/* publish on the global list that the info handlers walk */
+	spin_lock_bh(&rds_sock_lock);
+	list_add_tail(&rs->rs_item, &rds_sock_list);
+	rds_sock_count++;
+	spin_unlock_bh(&rds_sock_lock);
+
+	return 0;
+}
+
+static int rds_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+
+	/* only SOCK_SEQPACKET with protocol 0 is offered */
+	if (sock->type != SOCK_SEQPACKET || protocol)
+		return -ESOCKTNOSUPPORT;
+	/* socket creation runs in process context, so it may sleep */
+	sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto);
+	if (!sk)
+		return -ENOMEM;
+	return __rds_create(sock, sk, protocol);
+}
+
+void rds_sock_addref(struct rds_sock *rs)
+{
+	sock_hold(rds_rs_to_sk(rs));	/* take a ref on the sock behind rs */
+}
+
+void rds_sock_put(struct rds_sock *rs)
+{
+	sock_put(rds_rs_to_sk(rs));	/* drop a ref on the sock behind rs */
+}
+
+static const struct net_proto_family rds_family_ops = {
+	.family =	AF_RDS,
+	.create =	rds_create,	/* constructor for new AF_RDS sockets */
+	.owner	=	THIS_MODULE,
+};
+
+static void rds_sock_inc_info(struct socket *sock, unsigned int len,
+			      struct rds_info_iterator *iter,
+			      struct rds_info_lengths *lens)
+{
+	struct rds_sock *rs;
+	struct rds_incoming *inc;
+	unsigned int total = 0;
+	/* turn the byte budget into a count of whole records */
+	len /= sizeof(struct rds_info_message);
+
+	spin_lock_bh(&rds_sock_lock);
+
+	list_for_each_entry(rs, &rds_sock_list, rs_item) {
+		read_lock(&rs->rs_recv_lock);
+
+		/* XXX too lazy to maintain counts.. */
+		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+			total++;
+			if (total <= len)
+				rds_inc_info_copy(inc, iter, inc->i_saddr,
+						  rs->rs_bound_addr, 1);
+		}
+
+		read_unlock(&rs->rs_recv_lock);
+	}
+
+	spin_unlock_bh(&rds_sock_lock);
+	/* nr counts every queued message seen, copied or not */
+	lens->nr = total;
+	lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_sock_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens)
+{
+	struct rds_info_socket sinfo;
+	struct rds_sock *rs;
+
+	len /= sizeof(struct rds_info_socket);
+
+	spin_lock_bh(&rds_sock_lock);
+	/* copy nothing unless there is room for one record per socket */
+	if (len < rds_sock_count)
+		goto out;
+
+	list_for_each_entry(rs, &rds_sock_list, rs_item) {
+		sinfo.sndbuf = rds_sk_sndbuf(rs);
+		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
+		sinfo.bound_addr = rs->rs_bound_addr;
+		sinfo.connected_addr = rs->rs_conn_addr;
+		sinfo.bound_port = rs->rs_bound_port;
+		sinfo.connected_port = rs->rs_conn_port;
+		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+		rds_info_copy(iter, &sinfo, sizeof(sinfo));
+	}
+	/* the record count and size are reported on both paths */
+out:
+	lens->nr = rds_sock_count;
+	lens->each = sizeof(struct rds_info_socket);
+
+	spin_unlock_bh(&rds_sock_lock);
+}
+
+static void rds_exit(void)
+{
+	sock_unregister(rds_family_ops.family);	/* undoes sock_register() */
+	proto_unregister(&rds_proto);		/* undoes proto_register() */
+	rds_conn_exit();
+	rds_cong_exit();
+	rds_sysctl_exit();
+	rds_threads_exit();
+	rds_stats_exit();
+	rds_page_exit();
+	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
+	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+}
+module_exit(rds_exit);
+
+static int rds_init(void)
+{
+	int ret;
+
+	ret = rds_conn_init();
+	if (ret)
+		goto out;
+	ret = rds_threads_init();
+	if (ret)
+		goto out_conn;
+	ret = rds_sysctl_init();
+	if (ret)
+		goto out_threads;
+	ret = rds_stats_init();
+	if (ret)
+		goto out_sysctl;
+	ret = proto_register(&rds_proto, 1);
+	if (ret)
+		goto out_stats;
+	ret = sock_register(&rds_family_ops);
+	if (ret)
+		goto out_proto;
+
+	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
+	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+	/* success: ret is 0 here, skip the unwind labels below */
+	goto out;
+	/* error unwind, most recent step first */
+out_proto:
+	proto_unregister(&rds_proto);
+out_stats:
+	rds_stats_exit();
+out_sysctl:
+	rds_sysctl_exit();
+out_threads:
+	rds_threads_exit();
+out_conn:
+	rds_conn_exit();
+	rds_cong_exit();
+	rds_page_exit();
+out:
+	return ret;
+}
+module_init(rds_init);
+
+#define DRV_VERSION     "4.0"
+#define DRV_RELDATE     "Feb 12, 2009"
+/* module metadata; the description string embeds both macros above */
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
+		   " v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NETPROTO(PF_RDS);
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 00000000..2f6b3fcc
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/jhash.h>
+#include "rds.h"
+
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+static DEFINE_SPINLOCK(rds_bind_lock);
+
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+	return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+				  (BIND_HASH_SIZE - 1)); /* size is 2^n */
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+					struct rds_sock *insert)
+{
+	struct rds_sock *rs;
+	struct hlist_node *node;
+	struct hlist_head *head = hash_to_bucket(addr, port);
+	u64 cmp;
+	u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
+
+	/* No RCU section and no reference here: the caller holds either
+	 * rds_bind_lock or rcu_read_lock() while it uses the result. */
+	hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
+		cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
+		      be16_to_cpu(rs->rs_bound_port);
+
+		if (cmp == needle)
+			return rs;
+	}
+
+	if (insert) {
+		/*
+		 * make sure our addr and port are set before
+		 * we are added to the list, other people
+		 * in rcu will find us as soon as the
+		 * hlist_add_head_rcu is done
+		 */
+		insert->rs_bound_addr = addr;
+		insert->rs_bound_port = port;
+		rds_sock_addref(insert);
+
+		hlist_add_head_rcu(&insert->rs_bound_node, head);
+	}
+	return NULL;
+}
+
+/*
+ * Return the rds_sock bound at the given local address.
+ *
+ * The rx path can race with rds_release.  We notice if rds_release() has
+ * marked this socket and don't return a rs ref to the rx path.  The lookup,
+ * the check and the addref all run inside one RCU read-side section.
+ */
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+{
+	struct rds_sock *rs;
+
+	rcu_read_lock();
+	rs = rds_bind_lookup(addr, port, NULL);
+	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
+		rds_sock_addref(rs);
+	else
+		rs = NULL;
+	rcu_read_unlock();
+
+	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
+		ntohs(port));
+	return rs;
+}
+
+/* returns 0 with *port set to the bound port, or -EADDRINUSE */
+static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+{
+	unsigned long flags;
+	int ret = -EADDRINUSE;
+	u16 rover, last;
+	/* a requested port is tried once; port 0 scans from a random start */
+	if (*port != 0) {
+		rover = be16_to_cpu(*port);
+		last = rover;
+	} else {
+		rover = max_t(u16, net_random(), 2);
+		last = rover - 1;
+	}
+
+	spin_lock_irqsave(&rds_bind_lock, flags);
+	/* port 0 is skipped whenever the rover wraps onto it */
+	do {
+		if (rover == 0)
+			rover++;
+		if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+			*port = rs->rs_bound_port;
+			ret = 0;
+			rdsdebug("rs %p binding to %pI4:%d\n",
+			  rs, &addr, (int)ntohs(*port));
+			break;
+		}
+	} while (rover++ != last);
+
+	spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+	return ret;
+}
+
+void rds_remove_bound(struct rds_sock *rs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_bind_lock, flags);
+	/* a zero rs_bound_addr means there is no binding to undo */
+	if (rs->rs_bound_addr) {
+		rdsdebug("rs %p unbinding from %pI4:%d\n",
+		  rs, &rs->rs_bound_addr,
+		  ntohs(rs->rs_bound_port));
+
+		hlist_del_init_rcu(&rs->rs_bound_node);
+		rds_sock_put(rs);	/* pairs with the addref at insert time */
+		rs->rs_bound_addr = 0;
+	}
+
+	spin_unlock_irqrestore(&rds_bind_lock, flags);
+}
+
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct rds_transport *trans;
+	int ret = 0;
+
+	lock_sock(sk);
+	/* AF_INET only, no wildcard address, and no re-binding */
+	if (addr_len != sizeof(struct sockaddr_in) ||
+	    sin->sin_family != AF_INET ||
+	    rs->rs_bound_addr ||
+	    sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+	if (ret)
+		goto out;
+	/* without a transport for the address the binding is undone */
+	trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+	if (!trans) {
+		ret = -EADDRNOTAVAIL;
+		rds_remove_bound(rs);
+		if (printk_ratelimit())
+			printk(KERN_INFO "RDS: rds_bind() could not find a transport, "
+				"load rds_tcp or rds_rdma?\n");
+		goto out;
+	}
+
+	rs->rs_transport = trans;
+	ret = 0;
+
+out:
+	release_sock(sk);
+
+	/* we might have called rds_remove_bound on error */
+	if (ret)
+		synchronize_rcu();
+	return ret;
+}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 00000000..6daaa49d
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2007 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/bitops.h>
+
+#include "rds.h"
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the socket's SO_RCVBUF option value.  Only the payload bytes in the
+ * message are accounted for.  If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested.  All sends attempted to this socket's
+ * address should block or return -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs.  An application encountering this "back-pressure" is
+ * considered a bug.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested.  As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up.  This avoids
+ * allocation in the interrupt handling path which queues messages on sockets.
+ * The dense bitmaps let transports send the entire bitmap on any bitmap change
+ * reasonably efficiently.  This is much easier to implement than some
+ * finer-grained communication of per-port congestion.  The sender does a very
+ * inexpensive bit test to test if the port it's about to send to is congested
+ * or not.
+ */
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rds_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t		rds_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ */
+static LIST_HEAD(rds_cong_monitor);
+static DEFINE_RWLOCK(rds_cong_monitor_lock);
+
+/*
+ * Yes, a global lock.  It's used so infrequently that it's worth keeping it
+ * global to simplify the locking.  It's only used in the following
+ * circumstances:
+ *
+ *  - on connection buildup to associate a conn with its maps
+ *  - on map changes to inform conns of a new map to send
+ *
+ *  It's sadly ordered under the socket callback lock and the connection lock.
+ *  Receive paths can mark ports congested from interrupt context so the
+ *  lock masks interrupts.
+ */
+static DEFINE_SPINLOCK(rds_cong_lock);
+static struct rb_root rds_cong_tree = RB_ROOT;
+
+static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+					       struct rds_cong_map *insert)
+{
+	struct rb_node **link = &rds_cong_tree.rb_node;
+	struct rb_node *above = NULL;
+
+	/* descend until the address matches or an empty link is reached */
+	while (*link) {
+		struct rds_cong_map *cur;
+
+		above = *link;
+		cur = rb_entry(above, struct rds_cong_map, m_rb_node);
+		if (addr == cur->m_addr)
+			return cur;
+		link = (addr < cur->m_addr) ? &above->rb_left
+					    : &above->rb_right;
+	}
+
+	/* not found: optionally hang the new map off the empty link */
+	if (insert) {
+		rb_link_node(&insert->m_rb_node, above, link);
+		rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
+	}
+	return NULL;
+}
+
+/*
+ * There is only ever one bitmap for any address.  Connections try and allocate
+ * these bitmaps in the process of getting pointers to them.  The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ */
+static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+{
+	struct rds_cong_map *map;
+	struct rds_cong_map *ret = NULL;
+	unsigned long zp;
+	unsigned long i;
+	unsigned long flags;
+
+	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
+	if (!map)
+		return NULL;
+
+	map->m_addr = addr;
+	init_waitqueue_head(&map->m_waitq);
+	INIT_LIST_HEAD(&map->m_conn_list);
+
+	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+		zp = get_zeroed_page(GFP_KERNEL);
+		if (zp == 0)
+			goto out;
+		map->m_page_addrs[i] = zp;
+	}
+	/* the walk returns a map already present for addr, else inserts ours */
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	ret = rds_cong_tree_walk(addr, map);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+	if (!ret) {
+		ret = map;
+		map = NULL;
+	}
+
+out:
+	if (map) {	/* ours did not go into the tree: release it */
+		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+			free_page(map->m_page_addrs[i]);
+		kfree(map);
+	}
+
+	rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+
+	return ret;
+}
+
+/*
+ * Put the conn on its local map's list.  This is called when the conn is
+ * really added to the hash.  It's nested under the rds_conn_lock, sadly.
+ */
+void rds_cong_add_conn(struct rds_connection *conn)
+{
+	unsigned long flags;
+
+	rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
+	spin_lock_irqsave(&rds_cong_lock, flags);	/* guards m_conn_list */
+	list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_remove_conn(struct rds_connection *conn)
+{
+	unsigned long flags;
+
+	rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
+	spin_lock_irqsave(&rds_cong_lock, flags);	/* guards m_conn_list */
+	list_del_init(&conn->c_map_item);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+int rds_cong_get_maps(struct rds_connection *conn)
+{
+	/* look up (or create) the map for each end of the connection */
+	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
+	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+
+	if (!conn->c_lcong || !conn->c_fcong)
+		return -ENOMEM;
+	return 0;
+}
+
+void rds_cong_queue_updates(struct rds_cong_map *map)
+{
+	struct rds_connection *conn;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	/* interrupts are off here, so hand the transmit to the send
+	 * worker instead of running rds_send_xmit() inline */
+	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
+		if (!test_and_set_bit(0, &conn->c_map_queued)) {
+			rds_stats_inc(s_cong_update_queued);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+		}
+	}
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
+{
+	rdsdebug("waking map %p for %pI4\n",
+	  map, &map->m_addr);
+	rds_stats_inc(s_cong_update_received);
+	atomic_inc(&rds_cong_generation);	/* read by rds_cong_updated_since() */
+	if (waitqueue_active(&map->m_waitq))
+		wake_up(&map->m_waitq);
+	if (waitqueue_active(&rds_poll_waitq))
+		wake_up_all(&rds_poll_waitq);
+	/* hand matching port bits to every socket on the monitor list */
+	if (portmask && !list_empty(&rds_cong_monitor)) {
+		unsigned long flags;
+		struct rds_sock *rs;
+
+		read_lock_irqsave(&rds_cong_monitor_lock, flags);
+		list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
+			spin_lock(&rs->rs_lock);
+			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+			rs->rs_cong_mask &= ~portmask;	/* reported bits are cleared */
+			spin_unlock(&rs->rs_lock);
+			if (rs->rs_cong_notify)
+				rds_wake_sk_sleep(rs);
+		}
+		read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+	}
+}
+EXPORT_SYMBOL_GPL(rds_cong_map_updated);
+
+int rds_cong_updated_since(unsigned long *recent)
+{
+	unsigned long now = atomic_read(&rds_cong_generation);
+	int changed = (*recent != now);
+
+	if (unlikely(changed))
+		*recent = now;	/* remember the generation just reported */
+	return changed;
+}
+
+/*
+ * We're called under the locking that protects the socket's receive buffer
+ * consumption.  This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
+{
+	/* split the host-order port into a page index and a bit offset */
+	unsigned long page = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	unsigned long bit = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+	void *bits = (void *)map->m_page_addrs[page];
+
+	rdsdebug("setting congestion for %pI4:%u in map %p\n",
+	  &map->m_addr, ntohs(port), map);
+
+	/* raise the port's bit in the little-endian bitmap page */
+	__set_bit_le(bit, bits);
+}
+
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
+{
+	/* split the host-order port into a page index and a bit offset */
+	unsigned long page = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	unsigned long bit = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+	void *bits = (void *)map->m_page_addrs[page];
+
+	rdsdebug("clearing congestion for %pI4:%u in map %p\n",
+	  &map->m_addr, ntohs(port), map);
+
+	/* drop the port's bit in the little-endian bitmap page */
+	__clear_bit_le(bit, bits);
+}
+
+static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
+{
+	/* same page/bit split as rds_cong_set_bit() and rds_cong_clear_bit() */
+	unsigned long page = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	unsigned long bit = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+	void *bits = (void *)map->m_page_addrs[page];
+
+	/* non-zero when the port's bit is set in the map */
+	return test_bit_le(bit, bits);
+}
+
+void rds_cong_add_socket(struct rds_sock *rs)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&rds_cong_monitor_lock, flags);
+	if (list_empty(&rs->rs_cong_list))	/* not yet on the monitor list */
+		list_add(&rs->rs_cong_list, &rds_cong_monitor);
+	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+}
+
+void rds_cong_remove_socket(struct rds_sock *rs)
+{
+	unsigned long flags;
+	struct rds_cong_map *map;
+
+	write_lock_irqsave(&rds_cong_monitor_lock, flags);
+	list_del_init(&rs->rs_cong_list);
+	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+
+	/* update congestion map for now-closed port */
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+	/* only a bit that is still set needs clearing and a queued update */
+	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+		rds_cong_clear_bit(map, rs->rs_bound_port);
+		rds_cong_queue_updates(map);
+	}
+}
+
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
+		  struct rds_sock *rs)
+{
+	if (!rds_cong_test_bit(map, port))
+		return 0;	/* bit clear: nothing to wait for */
+	if (nonblock) {
+		if (rs && rs->rs_cong_monitor) {
+			unsigned long flags;
+
+			/* It would have been nice to have an atomic set_bit on
+			 * a uint64_t. */
+			spin_lock_irqsave(&rs->rs_lock, flags);
+			rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
+			spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+			/* Test again - a congestion update may have arrived in
+			 * the meantime. */
+			if (!rds_cong_test_bit(map, port))
+				return 0;
+		}
+		rds_stats_inc(s_cong_send_error);
+		return -ENOBUFS;
+	}
+	/* blocking caller: sleep until the bit clears or a signal arrives */
+	rds_stats_inc(s_cong_send_blocked);
+	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
+
+	return wait_event_interruptible(map->m_waitq,
+					!rds_cong_test_bit(map, port));
+}
+
+void rds_cong_exit(void)
+{
+	struct rb_node *node;
+	struct rds_cong_map *map;
+	unsigned long i;
+	/* empty the tree one map at a time: erase, free pages, free map */
+	while ((node = rb_first(&rds_cong_tree))) {
+		map = rb_entry(node, struct rds_cong_map, m_rb_node);
+		rdsdebug("freeing map %p\n", map);
+		rb_erase(&map->m_rb_node, &rds_cong_tree);
+		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+			free_page(map->m_page_addrs[i]);
+		kfree(map);
+	}
+}
+
+/*
+ * Wrap the local congestion bitmap's pages in a flagged RDS message.
+ */
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
+{
+	struct rds_message *update;
+
+	update = rds_message_map_pages(conn->c_lcong->m_page_addrs,
+				       RDS_CONG_MAP_BYTES);
+	if (IS_ERR(update))
+		return update;	/* hand the ERR_PTR straight back */
+	update->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
+	return update;
+}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 00000000..9334d892
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/inet_hashtables.h>
+
+#include "rds.h"
+#include "loop.h"
+
+#define RDS_CONNECTION_HASH_BITS 12
+#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
+#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
+
+/* converting this to RCU is a chore for another day.. */
+static DEFINE_SPINLOCK(rds_conn_lock);
+static unsigned long rds_conn_count;
+static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
+static struct kmem_cache *rds_conn_slab;
+
+static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+{
+	unsigned long hash;
+
+	/* Both ports are 0 and NULL is passed: no struct net needed to hash. */
+	hash = inet_ehashfn(NULL, be32_to_cpu(laddr), 0, be32_to_cpu(faddr), 0);
+	return rds_conn_hash + (hash & RDS_CONNECTION_HASH_MASK);
+}
+
+#define rds_conn_info_set(var, test, suffix) do {		\
+	if (test)						\
+		var |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
+} while (0)
+
+/* Caller must hold either rcu_read_lock() or the rds_conn_lock spinlock. */
+static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+					      __be32 laddr, __be32 faddr,
+					      struct rds_transport *trans)
+{
+	struct rds_connection *cur, *match = NULL;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(cur, node, head, c_hash_node) {
+		if (cur->c_trans != trans || cur->c_laddr != laddr ||
+		    cur->c_faddr != faddr)
+			continue;
+		match = cur;
+		break;
+	}
+	rdsdebug("returning conn %p for %pI4 -> %pI4\n", match,
+		 &laddr, &faddr);
+	return match;
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future.  It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+static void rds_conn_reset(struct rds_connection *conn)
+{
+	rdsdebug("connection %pI4 to %pI4 reset\n",
+	  &conn->c_laddr, &conn->c_faddr);
+
+	rds_stats_inc(s_conn_reset);
+	rds_send_reset(conn);
+	conn->c_flags = 0;	/* wipes every flag bit, e.g. RDS_IN_XMIT */
+
+	/* Do not clear next_rx_seq here, else we cannot distinguish
+	 * retransmitted packets from new packets, and will hand all
+	 * of them to the application. That is not consistent with the
+	 * reliability guarantees of RDS. */
+}
+
+/*
+ * There is only ever one 'conn' for a given pair of addresses in the
+ * system at a time.  They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created.  They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp,
+				       int is_outgoing)
+{
+	struct rds_connection *conn, *parent = NULL;
+	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+	struct rds_transport *loop_trans;
+	unsigned long flags;
+	int ret;
+
+	rcu_read_lock();
+	conn = rds_conn_lookup(head, laddr, faddr, trans);
+	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
+	    !is_outgoing) {
+		/* This is a looped back IB connection, and we're
+		 * called by the code handling the incoming connect.
+		 * We need a second connection object into which we
+		 * can stick the other QP. */
+		parent = conn;
+		conn = parent->c_passive;
+	}
+	rcu_read_unlock();
+	if (conn)
+		goto out;
+
+	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
+	if (!conn) {
+		conn = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	INIT_HLIST_NODE(&conn->c_hash_node);
+	conn->c_laddr = laddr;
+	conn->c_faddr = faddr;
+	spin_lock_init(&conn->c_lock);
+	conn->c_next_tx_seq = 1;
+
+	init_waitqueue_head(&conn->c_waitq);
+	INIT_LIST_HEAD(&conn->c_send_queue);
+	INIT_LIST_HEAD(&conn->c_retrans);
+
+	ret = rds_cong_get_maps(conn);
+	if (ret) {
+		kmem_cache_free(rds_conn_slab, conn);
+		conn = ERR_PTR(ret);
+		goto out;
+	}
+
+	/*
+	 * This is where a connection becomes loopback.  If *any* RDS sockets
+	 * can bind to the destination address then we'd rather the messages
+	 * flow through loopback rather than either transport.
+	 */
+	loop_trans = rds_trans_get_preferred(faddr);
+	if (loop_trans) {
+		rds_trans_put(loop_trans);
+		conn->c_loopback = 1;
+		if (is_outgoing && trans->t_prefer_loopback) {
+			/* "outgoing" connection - and the transport
+			 * says it wants the connection handled by the
+			 * loopback transport. This is what TCP does.
+			 */
+			trans = &rds_loop_transport;
+		}
+	}
+
+	conn->c_trans = trans;
+
+	ret = trans->conn_alloc(conn, gfp);
+	if (ret) {
+		kmem_cache_free(rds_conn_slab, conn);
+		conn = ERR_PTR(ret);
+		goto out;
+	}
+
+	atomic_set(&conn->c_state, RDS_CONN_DOWN);
+	conn->c_reconnect_jiffies = 0;
+	INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
+	INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
+	INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
+	INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
+	mutex_init(&conn->c_cm_lock);
+	conn->c_flags = 0;
+
+	rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
+	  conn, &laddr, &faddr,
+	  trans->t_name ? trans->t_name : "[unknown]",
+	  is_outgoing ? "(outgoing)" : "");
+
+	/*
+	 * Since we ran without holding the conn lock, someone could
+	 * have created the same conn (either normal or passive) in the
+	 * interim. We check while holding the lock. If we won, we complete
+	 * init and return our conn. If we lost, we rollback and return the
+	 * other one.
+	 */
+	spin_lock_irqsave(&rds_conn_lock, flags);
+	if (parent) {
+		/* Creating passive conn */
+		if (parent->c_passive) {
+			trans->conn_free(conn->c_transport_data);
+			kmem_cache_free(rds_conn_slab, conn);
+			conn = parent->c_passive;
+		} else {
+			parent->c_passive = conn;
+			rds_cong_add_conn(conn);
+			rds_conn_count++;
+		}
+	} else {
+		/* Creating normal conn */
+		struct rds_connection *found;
+
+		found = rds_conn_lookup(head, laddr, faddr, trans);
+		if (found) {
+			trans->conn_free(conn->c_transport_data);
+			kmem_cache_free(rds_conn_slab, conn);
+			conn = found;
+		} else {
+			hlist_add_head_rcu(&conn->c_hash_node, head);
+			rds_cong_add_conn(conn);
+			rds_conn_count++;
+		}
+	}
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+out:
+	return conn;
+}
+
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp)
+{
+	return __rds_conn_create(laddr, faddr, trans, gfp, 0); /* not outgoing */
+}
+EXPORT_SYMBOL_GPL(rds_conn_create);
+
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp)
+{
+	return __rds_conn_create(laddr, faddr, trans, gfp, 1); /* is_outgoing */
+}
+EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
+
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+	/* shut it down unless it's down already */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+		/*
+		 * Quiesce the connection mgmt handlers before we start tearing
+		 * things down. We don't hold the mutex for the entire
+		 * duration of the shutdown operation, else we may be
+		 * deadlocking with the CM handler. Instead, the CM event
+		 * handler is supposed to check for state DISCONNECTING
+		 */
+		mutex_lock(&conn->c_cm_lock);
+		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+		 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+			rds_conn_error(conn, "shutdown called in state %d\n",
+					atomic_read(&conn->c_state));
+			mutex_unlock(&conn->c_cm_lock);
+			return;
+		}
+		mutex_unlock(&conn->c_cm_lock);
+
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+		conn->c_trans->conn_shutdown(conn);
+		rds_conn_reset(conn);
+
+		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+			/* This can happen - eg when we're in the middle of tearing
+			 * down the connection, and someone unloads the rds module.
+			 * Quite reproducible with loopback connections.
+			 * Mostly harmless.
+			 */
+			rds_conn_error(conn,
+				"%s: failed to transition to state DOWN, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+			return;
+		}
+	}
+
+	/* Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer. */
+	cancel_delayed_work_sync(&conn->c_conn_w);
+	rcu_read_lock();
+	if (!hlist_unhashed(&conn->c_hash_node)) {
+		rcu_read_unlock();
+		rds_queue_reconnect(conn);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances.  It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ */
+void rds_conn_destroy(struct rds_connection *conn)
+{
+	struct rds_message *rm, *rtmp;
+	unsigned long flags;
+
+	rdsdebug("freeing conn %p for %pI4 -> "
+		 "%pI4\n", conn, &conn->c_laddr,
+		 &conn->c_faddr);
+
+	/* Ensure conn will not be scheduled for reconnect */
+	spin_lock_irq(&rds_conn_lock);
+	hlist_del_init_rcu(&conn->c_hash_node);
+	spin_unlock_irq(&rds_conn_lock);
+	synchronize_rcu();
+
+	/* shut the connection down */
+	rds_conn_drop(conn);
+	flush_work(&conn->c_down_w);
+
+	/* make sure lingering queued work won't try to ref the conn */
+	cancel_delayed_work_sync(&conn->c_send_w);
+	cancel_delayed_work_sync(&conn->c_recv_w);
+
+	/* tear down queued messages */
+	list_for_each_entry_safe(rm, rtmp,
+				 &conn->c_send_queue,
+				 m_conn_item) {
+		list_del_init(&rm->m_conn_item);
+		BUG_ON(!list_empty(&rm->m_sock_item));
+		rds_message_put(rm);
+	}
+	if (conn->c_xmit_rm)
+		rds_message_put(conn->c_xmit_rm);
+
+	conn->c_trans->conn_free(conn->c_transport_data);
+
+	/*
+	 * The congestion maps aren't freed up here.  They're
+	 * freed by rds_cong_exit() after all the connections
+	 * have been freed.
+	 */
+	rds_cong_remove_conn(conn);
+
+	BUG_ON(!list_empty(&conn->c_retrans));
+	kmem_cache_free(rds_conn_slab, conn);
+
+	spin_lock_irqsave(&rds_conn_lock, flags);
+	rds_conn_count--;
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
+}
+EXPORT_SYMBOL_GPL(rds_conn_destroy);
+
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+				  struct rds_info_iterator *iter,
+				  struct rds_info_lengths *lens,
+				  int want_send)
+{
+	unsigned int max_items = len / sizeof(struct rds_info_message);
+	unsigned int seen = 0;
+	struct rds_connection *conn;
+	struct rds_message *rm;
+	struct hlist_node *pos;
+	struct list_head *queue;
+	unsigned long flags;
+	size_t bucket;
+
+	rcu_read_lock();
+
+	/* Walk every hash bucket, and every connection chained off it. */
+	for (bucket = 0; bucket < ARRAY_SIZE(rds_conn_hash); bucket++) {
+		hlist_for_each_entry_rcu(conn, pos, &rds_conn_hash[bucket],
+					 c_hash_node) {
+			/* want_send picks the send queue, else c_retrans. */
+			queue = want_send ? &conn->c_send_queue :
+					    &conn->c_retrans;
+
+			spin_lock_irqsave(&conn->c_lock, flags);
+
+			/* Count every message, but only copy out as many
+			 * as the caller's buffer has room for. */
+			list_for_each_entry(rm, queue, m_conn_item) {
+				if (++seen > max_items)
+					continue;
+				rds_inc_info_copy(&rm->m_inc, iter,
+						  conn->c_laddr,
+						  conn->c_faddr, 0);
+			}
+
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+		}
+	}
+	rcu_read_unlock();
+
+	/* nr is the full count seen, not just the number copied. */
+	lens->nr = seen;
+	lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
+				       struct rds_info_iterator *iter,
+				       struct rds_info_lengths *lens)
+{
+	rds_conn_message_info(sock, len, iter, lens, 1);	/* want_send */
+}
+
+static void rds_conn_message_info_retrans(struct socket *sock,
+					  unsigned int len,
+					  struct rds_info_iterator *iter,
+					  struct rds_info_lengths *lens)
+{
+	rds_conn_message_info(sock, len, iter, lens, 0);	/* c_retrans */
+}
+
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens,
+			  int (*visitor)(struct rds_connection *, void *),
+			  size_t item_len)
+{
+	uint64_t buffer[(item_len + 7) / 8];
+	struct hlist_head *head;
+	struct hlist_node *pos;
+	struct rds_connection *conn;
+	size_t i;
+
+	rcu_read_lock();
+
+	lens->nr = 0;
+	lens->each = item_len;
+
+	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+	     i++, head++) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
+			/* Pre-zero: unset fields must not leak stack bytes. */
+			memset(buffer, 0, sizeof(buffer));
+			/* XXX no c_lock usage.. */
+			if (!visitor(conn, buffer))
+				continue;
+
+			/* Copy only while an item still fits, but count them
+			 * all so that the caller can resize its buffer. */
+			if (len >= item_len) {
+				rds_info_copy(iter, buffer, item_len);
+				len -= item_len;
+			}
+			lens->nr++;
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
+
+static int rds_conn_info_visitor(struct rds_connection *conn,
+				  void *buffer)
+{
+	struct rds_info_connection *cinfo = buffer;
+
+	cinfo->next_tx_seq = conn->c_next_tx_seq;
+	cinfo->next_rx_seq = conn->c_next_rx_seq;
+	cinfo->laddr = conn->c_laddr;
+	cinfo->faddr = conn->c_faddr;
+	strncpy(cinfo->transport, conn->c_trans->t_name,
+		sizeof(cinfo->transport));	/* zero-pads; no NUL if full */
+	cinfo->flags = 0;
+
+	rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+			  SENDING);
+	/* XXX Future: return the state rather than these funky bits */
+	rds_conn_info_set(cinfo->flags,
+			  atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+			  CONNECTING);
+	rds_conn_info_set(cinfo->flags,
+			  atomic_read(&conn->c_state) == RDS_CONN_UP,
+			  CONNECTED);
+	return 1;	/* nonzero: the item is counted and, if it fits, copied */
+}
+
+static void rds_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens)
+{
+	rds_for_each_conn_info(sock, len, iter, lens,
+				rds_conn_info_visitor,
+				sizeof(struct rds_info_connection));
+}
+
+int rds_conn_init(void)
+{
+	rds_conn_slab = kmem_cache_create("rds_connection",
+					  sizeof(struct rds_connection),
+					  0, 0, NULL);
+	if (!rds_conn_slab)
+		return -ENOMEM;
+
+	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+	rds_info_register_func(RDS_INFO_SEND_MESSAGES,
+			       rds_conn_message_info_send);
+	rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
+			       rds_conn_message_info_retrans);
+
+	return 0;
+}
+
+void rds_conn_exit(void)
+{
+	rds_loop_exit();
+
+	WARN_ON(!hlist_empty(rds_conn_hash));	/* NOTE: checks bucket 0 only */
+
+	kmem_cache_destroy(rds_conn_slab);
+
+	rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+	rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
+				 rds_conn_message_info_send);
+	rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
+				 rds_conn_message_info_retrans);
+}
+
+/*
+ * Force a disconnect: mark the conn RDS_CONN_ERROR and queue c_down_w.
+ */
+void rds_conn_drop(struct rds_connection *conn)
+{
+	atomic_set(&conn->c_state, RDS_CONN_ERROR);
+	queue_work(rds_wq, &conn->c_down_w);
+}
+EXPORT_SYMBOL_GPL(rds_conn_drop);
+
+/*
+ * Connect if down, but leave an already scheduled reconnect undisturbed.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+	if (rds_conn_state(conn) != RDS_CONN_DOWN)
+		return;
+	if (!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+
+	rds_conn_drop(conn);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 00000000..3b83086b
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
+
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
+module_param(fmr_message_size, int, 0444);
+MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_retry_count, int, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
+
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
+struct list_head rds_ib_devices;
+
+/* NOTE: if also grabbing ibdev lock, grab this first */
+DEFINE_SPINLOCK(ib_nodev_conns_lock);
+LIST_HEAD(ib_nodev_conns);
+
+static void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+		rds_conn_connect_if_down(ic->conn);
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+		rds_conn_drop(ic->conn);
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freeing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_ipaddr *i_ipaddr, *i_next;
+	struct rds_ib_device *rds_ibdev = container_of(work,
+					struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+		list_del(&i_ipaddr->list);
+		kfree(i_ipaddr);
+	}
+
+	kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+	if (atomic_dec_and_test(&rds_ibdev->refcount))
+		queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
+static void rds_ib_add_one(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct ib_device_attr *dev_attr;
+
+	/* Only handle IB (no iWARP) devices */
+	if (device->node_type != RDMA_NODE_IB_CA)
+		return;
+
+	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+	if (!dev_attr)
+		return;
+
+	if (ib_query_device(device, dev_attr)) {
+		rdsdebug("Query device failed for %s\n", device->name);
+		goto free_attr;
+	}
+
+	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+				 ibdev_to_node(device));
+	if (!rds_ibdev)
+		goto free_attr;
+
+	spin_lock_init(&rds_ibdev->spinlock);
+	atomic_set(&rds_ibdev->refcount, 1);
+	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+	/* Lists must be valid before any put_dev: dev_free walks ipaddr_list */
+	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+	INIT_LIST_HEAD(&rds_ibdev->conn_list);
+
+	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
+	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+
+	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
+	rds_ibdev->max_fmrs = dev_attr->max_fmr ?
+			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
+			fmr_pool_size;
+
+	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
+	rds_ibdev->dev = device;
+	rds_ibdev->pd = ib_alloc_pd(device);
+	if (IS_ERR(rds_ibdev->pd)) {
+		rds_ibdev->pd = NULL;
+		goto put_dev;
+	}
+
+	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(rds_ibdev->mr)) {
+		rds_ibdev->mr = NULL;
+		goto put_dev;
+	}
+
+	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
+	if (IS_ERR(rds_ibdev->mr_pool)) {
+		rds_ibdev->mr_pool = NULL;
+		goto put_dev;
+	}
+
+	down_write(&rds_ib_devices_lock);
+	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+	up_write(&rds_ib_devices_lock);
+	atomic_inc(&rds_ibdev->refcount);	/* ref for the devices list */
+
+	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+	atomic_inc(&rds_ibdev->refcount);	/* ref for the client data */
+
+	rds_ib_nodev_connect();
+
+put_dev:
+	rds_ib_dev_put(rds_ibdev);	/* drop the ref taken at allocation */
+free_attr:
+	kfree(dev_attr);
+}
+
+/*
+ * New connections use this to find the device to associate with the
+ * connection.  It's not in the fast path so we're not concerned about the
+ * performance of the IB call.  (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period.  The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if it's arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rcu_read_lock();
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (rds_ibdev)
+		atomic_inc(&rds_ibdev->refcount);
+	rcu_read_unlock();
+	return rds_ibdev;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away.  This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
+static void rds_ib_remove_one(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (!rds_ibdev)
+		return;
+
+	rds_ib_dev_shutdown(rds_ibdev);
+
+	/* stop connection attempts from getting a reference to this device. */
+	ib_set_client_data(device, &rds_ib_client, NULL);
+
+	down_write(&rds_ib_devices_lock);
+	list_del_rcu(&rds_ibdev->list);
+	up_write(&rds_ib_devices_lock);
+
+	/*
+	 * This synchronize rcu is waiting for readers of both the ib
+	 * client data and the devices list to finish before we drop
+	 * both of those references.
+	 */
+	synchronize_rcu();
+	rds_ib_dev_put(rds_ibdev);
+	rds_ib_dev_put(rds_ibdev);
+}
+
+struct ib_client rds_ib_client = {
+	.name   = "rds_ib",
+	.add    = rds_ib_add_one,
+	.remove = rds_ib_remove_one
+};
+
+static int rds_ib_conn_info_visitor(struct rds_connection *conn,
+				    void *buffer)
+{
+	struct rds_info_rdma_connection *iinfo = buffer;
+	struct rds_ib_connection *ic;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_ib_transport)
+		return 0;
+
+	/* Zero it all: past the addresses, fields are only set when UP. */
+	memset(iinfo, 0, sizeof(*iinfo));
+	iinfo->src_addr = conn->c_laddr;
+	iinfo->dst_addr = conn->c_faddr;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_ib_device *rds_ibdev;
+		struct rdma_dev_addr *dev_addr;
+
+		ic = conn->c_transport_data;
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+		rds_ibdev = ic->rds_ibdev;
+		iinfo->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo->max_send_sge = rds_ibdev->max_sge;
+		rds_ib_get_mr_info(rds_ibdev, iinfo);
+	}
+	return 1;
+}
+
+static void rds_ib_ic_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	rds_for_each_conn_info(sock, len, iter, lens,
+				rds_ib_conn_info_visitor,
+				sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible.  Sending and
+ * receiving should be device-agnostic.  Transports would try and maintain
+ * connections between peers who have messages queued.  Userspace would be
+ * allowed to influence which paths have priority.  We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_ib_laddr_check(__be32 addr)
+{
+	int ret;
+	struct rdma_cm_id *cm_id;
+	struct sockaddr_in sin;
+
+	/* Create a CMA ID and try to bind it. This catches both
+	 * IB and iWARP capable NICs. */
+	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = addr;
+
+	/* rdma_bind_addr will only succeed for IB & iWARP devices */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	/* cm_id->device may still be NULL after a successful bind, so test
+	 * it; the node_type test keeps us from claiming iWARP devices. */
+	if (ret || !cm_id->device ||
+	    cm_id->device->node_type != RDMA_NODE_IB_CA)
+		ret = -EADDRNOTAVAIL;
+
+	rdsdebug("addr %pI4 ret %d node type %d\n",
+		&addr, ret,
+		cm_id->device ? cm_id->device->node_type : -1);
+
+	rdma_destroy_id(cm_id);
+
+	return ret;
+}
+
+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
+void rds_ib_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_ib_unregister_client();
+	rds_ib_destroy_nodev_conns();
+	rds_ib_sysctl_exit();
+	rds_ib_recv_exit();
+	rds_trans_unregister(&rds_ib_transport);
+}
+
+struct rds_transport rds_ib_transport = {
+	.laddr_check		= rds_ib_laddr_check,
+	.xmit_complete		= rds_ib_xmit_complete,
+	.xmit			= rds_ib_xmit,
+	.xmit_rdma		= rds_ib_xmit_rdma,
+	.xmit_atomic		= rds_ib_xmit_atomic,
+	.recv			= rds_ib_recv,
+	.conn_alloc		= rds_ib_conn_alloc,
+	.conn_free		= rds_ib_conn_free,
+	.conn_connect		= rds_ib_conn_connect,
+	.conn_shutdown		= rds_ib_conn_shutdown,
+	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
+	.inc_free		= rds_ib_inc_free,
+	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
+	.cm_handle_connect	= rds_ib_cm_handle_connect,
+	.cm_connect_complete	= rds_ib_cm_connect_complete,
+	.stats_info_copy	= rds_ib_stats_info_copy,
+	.exit			= rds_ib_exit,
+	.get_mr			= rds_ib_get_mr,
+	.sync_mr		= rds_ib_sync_mr,
+	.free_mr		= rds_ib_free_mr,
+	.flush_mrs		= rds_ib_flush_mrs,
+	.t_owner		= THIS_MODULE,
+	.t_name			= "infiniband",
+	.t_type			= RDS_TRANS_IB
+};
+
+int rds_ib_init(void)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&rds_ib_devices);
+
+	ret = ib_register_client(&rds_ib_client);
+	if (ret)
+		return ret;
+
+	ret = rds_ib_sysctl_init();
+	if (ret)
+		goto out_ibreg;
+
+	ret = rds_ib_recv_init();
+	if (ret)
+		goto out_sysctl;
+
+	ret = rds_trans_register(&rds_ib_transport);
+	if (ret)
+		goto out_recv;
+
+	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+
+	return 0;
+
+	/* Error unwind: undo the completed steps in reverse order. */
+out_recv:
+	rds_ib_recv_exit();
+out_sysctl:
+	rds_ib_sysctl_exit();
+out_ibreg:
+	rds_ib_unregister_client();
+	return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 00000000..4297d927
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,373 @@
+#ifndef _RDS_IB_H
+#define _RDS_IB_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+/*
+ * FMR sizing constants.
+ * NOTE(review): not referenced in this header; presumably consumed by the
+ * MR pool code in ib_rdma.c - confirm.
+ */
+#define RDS_FMR_SIZE			256
+#define RDS_FMR_POOL_SIZE		8192
+
+/*
+ * Scatter/gather limits: RDS_IB_MAX_SGE sizes rds_ib_send_work.s_sge[],
+ * RDS_IB_RECV_SGE is what rds_ib_setup_qp() requests as max_recv_sge.
+ */
+#define RDS_IB_MAX_SGE			8
+#define RDS_IB_RECV_SGE 		2
+
+/*
+ * NOTE(review): these look like the defaults behind the
+ * rds_ib_sysctl_max_{recv,send}_wr tunables - confirm in ib_sysctl.c.
+ */
+#define RDS_IB_DEFAULT_RECV_WR		1024
+#define RDS_IB_DEFAULT_SEND_WR		256
+
+/* NOTE(review): presumably the initial value of rds_ib_retry_count - confirm. */
+#define RDS_IB_DEFAULT_RETRY_COUNT	2
+
+/*
+ * Sent to the peer in dp_protocol_minor_mask and ANDed with the peer's
+ * mask when an incoming connection is checked for compatibility.
+ */
+#define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
+
+#define RDS_IB_RECYCLE_BATCH_COUNT	32
+
+extern struct rw_semaphore rds_ib_devices_lock;
+extern struct list_head rds_ib_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ *
+ * struct rds_ib_recv_work carries a pointer to one of these in r_frag.
+ */
+struct rds_page_frag {
+	struct list_head	f_item;
+	struct list_head	f_cache_entry;
+	struct scatterlist	f_sg;
+};
+
+/*
+ * IB wrapper around the generic struct rds_incoming, which is embedded as
+ * ii_inc.  Pointed to by rds_ib_connection.i_ibinc and by
+ * rds_ib_recv_work.r_ibinc.
+ */
+struct rds_ib_incoming {
+	struct list_head	ii_frags;
+	struct list_head	ii_cache_entry;
+	struct rds_incoming	ii_inc;
+};
+
+/*
+ * A list pointer paired with an element count.
+ * struct rds_ib_refill_cache refers to these through its percpu member.
+ */
+struct rds_ib_cache_head {
+	struct list_head *first;
+	unsigned long count;
+};
+
+/*
+ * Recycling cache; struct rds_ib_connection embeds two of these
+ * (i_cache_incs and i_cache_frags).
+ * NOTE(review): presumably set up and released by
+ * rds_ib_recv_alloc_caches()/rds_ib_recv_free_caches() - confirm in
+ * ib_recv.c.
+ */
+struct rds_ib_refill_cache {
+	struct rds_ib_cache_head *percpu;
+	struct list_head	 *xfer;
+	struct list_head	 *ready;
+};
+
+/*
+ * Private data carried in the RDMA CM connect request and reply.
+ *
+ * Filled in by rds_ib_cm_fill_conn_param() and read back by
+ * rds_ib_protocol_compatible(), rds_ib_cm_handle_connect() and
+ * rds_ib_cm_connect_complete().  Multi-byte fields are big-endian; the
+ * layout is shared with the peer, hence the rule below.
+ */
+struct rds_ib_connect_private {
+	/* Add new fields at the end, and don't permute existing fields. */
+	__be32			dp_saddr;
+	__be32			dp_daddr;
+	u8			dp_protocol_major;
+	u8			dp_protocol_minor;
+	__be16			dp_protocol_minor_mask; /* bitmask */
+	__be32			dp_reserved1;
+	__be64			dp_ack_seq;
+	__be32			dp_credit;		/* non-zero enables flow ctl */
+};
+
+/*
+ * One slot of the send ring.  rds_ib_connection.i_sends is an array of
+ * i_send_ring.w_nr of these, allocated in rds_ib_setup_qp().
+ */
+struct rds_ib_send_work {
+	void			*s_op;
+	struct ib_send_wr	s_wr;
+	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
+	unsigned long		s_queued;
+};
+
+/*
+ * One slot of the receive ring.  rds_ib_connection.i_recvs is an array of
+ * i_recv_ring.w_nr of these, allocated in rds_ib_setup_qp().
+ *
+ * r_sge[] is sized with RDS_IB_RECV_SGE, the same constant the QP is
+ * created with as max_recv_sge, so the two cannot drift apart.
+ */
+struct rds_ib_recv_work {
+	struct rds_ib_incoming 	*r_ibinc;
+	struct rds_page_frag	*r_frag;
+	struct ib_recv_wr	r_wr;
+	struct ib_sge		r_sge[RDS_IB_RECV_SGE];
+};
+
+/*
+ * Bookkeeping for a ring of work requests, manipulated through the
+ * rds_ib_ring_*() helpers declared further down (ib_ring.c).
+ *
+ * w_nr is the number of slots: rds_ib_setup_qp() uses it to size the CQs,
+ * the QP, the header DMA areas and the i_sends/i_recvs arrays.
+ */
+struct rds_ib_work_ring {
+	u32		w_nr;
+	u32		w_alloc_ptr;
+	u32		w_alloc_ctr;
+	u32		w_free_ptr;
+	atomic_t	w_free_ctr;
+};
+
+struct rds_ib_device;
+
+/*
+ * Per-connection state of the IB transport.  The generic rds_connection
+ * points at it through c_transport_data.
+ */
+struct rds_ib_connection {
+
+	struct list_head	ib_node;
+	struct rds_ib_device	*rds_ibdev;
+	struct rds_connection	*conn;
+
+	/* alphabet soup, IBTA style */
+	struct rdma_cm_id	*i_cm_id;
+	struct ib_pd		*i_pd;
+	struct ib_mr		*i_mr;
+	struct ib_cq		*i_send_cq;
+	struct ib_cq		*i_recv_cq;
+
+	/* tx */
+	struct rds_ib_work_ring	i_send_ring;
+	struct rm_data_op	*i_data_op;
+	struct rds_header	*i_send_hdrs;
+	u64			i_send_hdrs_dma;
+	struct rds_ib_send_work *i_sends;
+	atomic_t		i_signaled_sends;
+
+	/* rx */
+	struct tasklet_struct	i_recv_tasklet;
+	struct mutex		i_recv_mutex;
+	struct rds_ib_work_ring	i_recv_ring;
+	struct rds_ib_incoming	*i_ibinc;
+	u32			i_recv_data_rem;
+	struct rds_header	*i_recv_hdrs;
+	u64			i_recv_hdrs_dma;
+	struct rds_ib_recv_work *i_recvs;
+	u64			i_ack_recv;	/* last ACK received */
+	struct rds_ib_refill_cache i_cache_incs;
+	struct rds_ib_refill_cache i_cache_frags;
+
+	/* sending acks */
+	unsigned long		i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		i_ack_next;	/* next ACK to send */
+#else
+	spinlock_t		i_ack_lock;	/* protect i_ack_next */
+	u64			i_ack_next;	/* next ACK to send */
+#endif
+	struct rds_header	*i_ack;
+	struct ib_send_wr	i_ack_wr;
+	struct ib_sge		i_ack_sge;
+	u64			i_ack_dma;
+	unsigned long		i_ack_queued;
+
+	/* Flow control related information
+	 *
+	 * Our algorithm uses a pair of variables that we need to access
+	 * atomically - one for the send credits, and one posted
+	 * recv credits we need to transfer to remote.
+	 * Rather than protect them using a slow spinlock, we put both into
+	 * a single atomic_t and update it using cmpxchg
+	 */
+	atomic_t		i_credits;
+
+	/* Protocol version specific information */
+	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */
+
+	/* Batched completions */
+	unsigned int		i_unsignaled_wrs;
+};
+
+/*
+ * Accessors for the two counters packed into rds_ib_connection.i_credits:
+ * send credits in the low 16 bits, posted receive credits above them.
+ * The GET forms unpack a value read from i_credits; the SET forms move a
+ * count into position so it can be added to or subtracted from i_credits.
+ *
+ * This assumes that atomic_t is at least 32 bits
+ */
+#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v)	((v) >> 16)
+#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v)	((v) << 16)
+
+/*
+ * A big-endian IPv4 address with a list link.
+ * NOTE(review): presumably chained on rds_ib_device.ipaddr_list by
+ * rds_ib_update_ipaddr() - confirm in ib_rdma.c.
+ */
+struct rds_ib_ipaddr {
+	struct list_head	list;
+	__be32			ipaddr;
+};
+
+/*
+ * RDS state kept for one struct ib_device.
+ *
+ * Obtained with rds_ib_get_client_data(); rds_ib_setup_qp() pairs that
+ * call with rds_ib_dev_put().  The pd and mr here are the ones every
+ * connection on the device copies into i_pd/i_mr, and max_wrs/max_sge
+ * bound the ring sizes and send SGEs chosen at QP setup.
+ */
+struct rds_ib_device {
+	struct list_head	list;
+	struct list_head	ipaddr_list;
+	struct list_head	conn_list;
+	struct ib_device	*dev;
+	struct ib_pd		*pd;
+	struct ib_mr		*mr;
+	struct rds_ib_mr_pool	*mr_pool;
+	unsigned int		fmr_max_remaps;
+	unsigned int		max_fmrs;
+	int			max_sge;
+	unsigned int		max_wrs;
+	unsigned int		max_initiator_depth;
+	unsigned int		max_responder_resources;
+	spinlock_t		spinlock;	/* protect the above */
+	atomic_t		refcount;
+	struct work_struct	free_work;
+};
+
+/*
+ * Map a PCI device, an ib_device or an rds_ib_device to the node reported
+ * by pcibus_to_node() for the underlying PCI bus.  rds_ib_setup_qp()
+ * passes the result to the *_node() vmalloc variants.
+ *
+ * The macro parameters are parenthesized so that arguments which are not
+ * plain identifiers (e.g. "&foo" or "ic->rds_ibdev") expand correctly.
+ */
+#define pcidev_to_node(pcidev) pcibus_to_node((pcidev)->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev((ibdev)->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node((rdsibdev)->dev)
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT	0
+#define IB_ACK_REQUESTED	1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IB_ACK_WR_ID	(~(u64) 0)
+
+/*
+ * Event counters for the IB transport.  An instance exists per CPU
+ * (see DECLARE_PER_CPU(..., rds_ib_stats) below) and individual members
+ * are bumped with rds_ib_stats_inc(member).
+ */
+struct rds_ib_statistics {
+	uint64_t	s_ib_connect_raced;
+	uint64_t	s_ib_listen_closed_stale;
+	uint64_t	s_ib_tx_cq_call;
+	uint64_t	s_ib_tx_cq_event;
+	uint64_t	s_ib_tx_ring_full;
+	uint64_t	s_ib_tx_throttle;
+	uint64_t	s_ib_tx_sg_mapping_failure;
+	uint64_t	s_ib_tx_stalled;
+	uint64_t	s_ib_tx_credit_updates;
+	uint64_t	s_ib_rx_cq_call;
+	uint64_t	s_ib_rx_cq_event;
+	uint64_t	s_ib_rx_ring_empty;
+	uint64_t	s_ib_rx_refill_from_cq;
+	uint64_t	s_ib_rx_refill_from_thread;
+	uint64_t	s_ib_rx_alloc_limit;
+	uint64_t	s_ib_rx_credit_updates;
+	uint64_t	s_ib_ack_sent;
+	uint64_t	s_ib_ack_send_failure;
+	uint64_t	s_ib_ack_send_delayed;
+	uint64_t	s_ib_ack_send_piggybacked;
+	uint64_t	s_ib_ack_received;
+	uint64_t	s_ib_rdma_mr_alloc;
+	uint64_t	s_ib_rdma_mr_free;
+	uint64_t	s_ib_rdma_mr_used;
+	uint64_t	s_ib_rdma_mr_pool_flush;
+	uint64_t	s_ib_rdma_mr_pool_wait;
+	uint64_t	s_ib_rdma_mr_pool_depleted;
+	uint64_t	s_ib_atomic_cswp;
+	uint64_t	s_ib_atomic_fadd;
+};
+
+extern struct workqueue_struct *rds_ib_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	struct scatterlist *cur;
+	unsigned int left;
+
+	/*
+	 * Sync each of the sg_dma_len mapped entries for CPU access,
+	 * one single-buffer sync per entry.
+	 */
+	for (cur = sg, left = sg_dma_len; left > 0; cur++, left--)
+		ib_dma_sync_single_for_cpu(dev,
+					   ib_sg_dma_address(dev, cur),
+					   ib_sg_dma_len(dev, cur),
+					   direction);
+}
+#define ib_dma_sync_sg_for_cpu	rds_ib_dma_sync_sg_for_cpu
+
+static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	struct scatterlist *cur;
+	unsigned int left;
+
+	/*
+	 * Sync each of the sg_dma_len mapped entries for device access,
+	 * one single-buffer sync per entry.
+	 */
+	for (cur = sg, left = sg_dma_len; left > 0; cur++, left--)
+		ib_dma_sync_single_for_device(dev,
+					      ib_sg_dma_address(dev, cur),
+					      ib_sg_dma_len(dev, cur),
+					      direction);
+}
+#define ib_dma_sync_sg_for_device	rds_ib_dma_sync_sg_for_device
+
+
+/* ib.c */
+extern struct rds_transport rds_ib_transport;
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern struct ib_client rds_ib_client;
+
+extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_retry_count;
+
+extern spinlock_t ib_nodev_conns_lock;
+extern struct list_head ib_nodev_conns;
+
+/* ib_cm.c */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_ib_conn_free(void *arg);
+int rds_ib_conn_connect(struct rds_connection *conn);
+void rds_ib_conn_shutdown(struct rds_connection *conn);
+void rds_ib_state_change(struct sock *sk);
+int rds_ib_listen_init(void);
+void rds_ib_listen_stop(void);
+void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+			     struct rdma_cm_event *event);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_ib_cm_connect_complete(struct rds_connection *conn,
+				struct rdma_cm_event *event);
+
+
+#define rds_ib_conn_error(conn, fmt...) \
+	__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
+
+/* ib_rdma.c */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_destroy_nodev_conns(void);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+
+/* ib_recv.c */
+int rds_ib_recv_init(void);
+void rds_ib_recv_exit(void);
+int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_inc_free(struct rds_incoming *inc);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+			     size_t size);
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_recv_tasklet_fn(unsigned long data);
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
+void rds_ib_attempt_ack(struct rds_ib_connection *ic);
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+
+/* ib_ring.c */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
+int rds_ib_ring_low(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_ib_ring_empty_wait;
+
+/* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
+void rds_ib_xmit_complete(struct rds_connection *conn);
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_send_init_ring(struct rds_ib_connection *ic);
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
+			     u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
+#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_ib_sysctl_init(void);
+void rds_ib_sysctl_exit(void);
+extern unsigned long rds_ib_sysctl_max_send_wr;
+extern unsigned long rds_ib_sysctl_max_recv_wr;
+extern unsigned long rds_ib_sysctl_max_unsig_wrs;
+extern unsigned long rds_ib_sysctl_max_unsig_bytes;
+extern unsigned long rds_ib_sysctl_max_recv_allocation;
+extern unsigned int rds_ib_sysctl_flow_control;
+
+#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 00000000..fd453dd5
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Printable names for IB asynchronous events, indexed by the
+ * enum ib_event_type value.  Designated initializers are used, so any
+ * index without an entry below holds NULL.  Looked up through
+ * rds_ib_event_str().
+ */
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+		[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+	RDS_IB_EVENT_STRING(CQ_ERR),
+	RDS_IB_EVENT_STRING(QP_FATAL),
+	RDS_IB_EVENT_STRING(QP_REQ_ERR),
+	RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+	RDS_IB_EVENT_STRING(COMM_EST),
+	RDS_IB_EVENT_STRING(SQ_DRAINED),
+	RDS_IB_EVENT_STRING(PATH_MIG),
+	RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+	RDS_IB_EVENT_STRING(DEVICE_FATAL),
+	RDS_IB_EVENT_STRING(PORT_ACTIVE),
+	RDS_IB_EVENT_STRING(PORT_ERR),
+	RDS_IB_EVENT_STRING(LID_CHANGE),
+	RDS_IB_EVENT_STRING(PKEY_CHANGE),
+	RDS_IB_EVENT_STRING(SM_CHANGE),
+	RDS_IB_EVENT_STRING(SRQ_ERR),
+	RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+	RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+	RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+/*
+ * Translate an IB async event type into a printable name by looking it up
+ * in rds_ib_event_type_strings[] via rds_str_array(), which is given the
+ * table size alongside the index.
+ */
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+	return rds_str_array(rds_ib_event_type_strings,
+			     ARRAY_SIZE(rds_ib_event_type_strings), type);
+}
+
+/*
+ * Set the selected protocol version
+ *
+ * Stores @version in conn->c_version.  Callers in this file pass values
+ * built with RDS_PROTOCOL(major, minor) or the RDS_PROTOCOL_3_0 constant.
+ */
+static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+	conn->c_version = version;
+}
+
+/*
+ * Decide whether this connection runs with credit-based flow control.
+ *
+ * Flow control is switched on only when the rds_ib_sysctl_flow_control
+ * tunable is set and the peer advertised a non-zero number of credits;
+ * in that case the advertised credits are added to our send credits.
+ * Otherwise flow control is switched off for the connection.
+ */
+static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (!rds_ib_sysctl_flow_control || credits == 0) {
+		ic->i_flowctl = 0;
+		return;
+	}
+
+	/* We're doing flow control */
+	ic->i_flowctl = 1;
+	rds_ib_send_add_credits(conn, credits);
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack space
+ * by allocating this twice.
+ *
+ * Only attr->min_rnr_timer is written here, and only the
+ * IB_QP_MIN_RNR_TIMER attribute is applied to the QP.  A failure is
+ * logged (as a positive error number) but not reported to the caller.
+ */
+static void
+rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
+{
+	int ret;
+
+	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
+	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
+	if (ret)
+		printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
+}
+
+/*
+ * Called once the RDMA CM reports the connection as established, for
+ * both the active and the passive side.
+ *
+ * If the peer sent usable private data, the protocol version and flow
+ * control setting are taken from it.  Connections that end up below
+ * protocol 3.1 are destroyed.  Otherwise the rings are initialised, the
+ * receive ring is filled, the QP is tuned and moved to RTS, the local
+ * address is recorded against the device, any ACK piggybacked in the
+ * private data is processed, and the RDS core is told the connection is
+ * up.
+ */
+void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	const struct rds_ib_connect_private *dp = NULL;
+	struct ib_qp_attr qp_attr;
+	int rc;
+
+	if (event->param.conn.private_data_len >= sizeof(*dp)) {
+		dp = event->param.conn.private_data;
+
+		/* a zero major version means the private data is empty */
+		if (dp->dp_protocol_major) {
+			unsigned int peer_version;
+
+			peer_version = RDS_PROTOCOL(dp->dp_protocol_major,
+						    dp->dp_protocol_minor);
+			rds_ib_set_protocol(conn, peer_version);
+			rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+		}
+	}
+
+	if (conn->c_version < RDS_PROTOCOL(3,1)) {
+		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+		       " no longer supported\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version));
+		rds_conn_destroy(conn);
+		return;
+	}
+
+	printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+	       &conn->c_faddr,
+	       RDS_PROTOCOL_MAJOR(conn->c_version),
+	       RDS_PROTOCOL_MINOR(conn->c_version),
+	       ic->i_flowctl ? ", flow control" : "");
+
+	/*
+	 * The rings can only be set up now that protocol negotiation is
+	 * over, because the ring layout differs between 3.0 and 3.1.
+	 */
+	rds_ib_send_init_ring(ic);
+	rds_ib_recv_init_ring(ic);
+
+	/* Posting receive buffers also updates the posted credit count. */
+	rds_ib_recv_refill(conn, 1);
+
+	/* Tune RNR behavior */
+	rds_ib_tune_rnr(ic, &qp_attr);
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	rc = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+	if (rc)
+		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", rc);
+
+	/* record this local ipaddr against the ib_device */
+	rc = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+	if (rc)
+		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+			rc);
+
+	/*
+	 * A non-zero dp_ack_seq is the last packet the peer saw; handle it
+	 * exactly like a regular ACK.
+	 */
+	if (dp && dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	rds_connect_complete(conn);
+}
+
+/*
+ * Prepare the rdma_conn_param used for rdma_connect()/rdma_accept().
+ *
+ * @conn_param is cleared and given responder/initiator limits clamped to
+ * what the device supports, a retry count capped at 7 and an RNR retry
+ * count of 7.  When @dp is non-NULL it is filled in as the RDS private
+ * data (addresses, protocol version and mask, piggybacked ACK and, with
+ * flow control enabled, the posted credits, which are then removed from
+ * i_credits) and attached to @conn_param.
+ */
+static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
+			struct rdma_conn_param *conn_param,
+			struct rds_ib_connect_private *dp,
+			u32 protocol_version,
+			u32 max_responder_resources,
+			u32 max_initiator_depth)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *ibdev = ic->rds_ibdev;
+
+	memset(conn_param, 0, sizeof(*conn_param));
+
+	conn_param->responder_resources = min_t(u32,
+						ibdev->max_responder_resources,
+						max_responder_resources);
+	conn_param->initiator_depth = min_t(u32,
+					    ibdev->max_initiator_depth,
+					    max_initiator_depth);
+	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+	conn_param->rnr_retry_count = 7;
+
+	/* Without a private data buffer there is nothing more to fill in */
+	if (!dp)
+		return;
+
+	memset(dp, 0, sizeof(*dp));
+	dp->dp_saddr = conn->c_laddr;
+	dp->dp_daddr = conn->c_faddr;
+	dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+	dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+	dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+	dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+
+	/* Advertise flow control */
+	if (ic->i_flowctl) {
+		unsigned int posted;
+
+		posted = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+		dp->dp_credit = cpu_to_be32(posted);
+		atomic_sub(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+	}
+
+	conn_param->private_data = dp;
+	conn_param->private_data_len = sizeof(*dp);
+}
+
+/*
+ * Asynchronous event callback passed to ib_create_cq() for both the send
+ * and the receive CQ.  It only emits a debug message; @data is the
+ * context given at CQ creation (the rds_connection).
+ */
+static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
+{
+	rdsdebug("event %u (%s) data %p\n",
+		 event->event, rds_ib_event_str(event->event), data);
+}
+
+/*
+ * Asynchronous event callback installed on the QP by rds_ib_setup_qp();
+ * @data is the rds_connection.
+ *
+ * IB_EVENT_COMM_EST is forwarded to the RDMA CM with rdma_notify().
+ * Every other event is treated as fatal for the connection, which is
+ * dropped.
+ */
+static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+	struct rds_connection *conn = data;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 rds_ib_event_str(event->event));
+
+	if (event->event == IB_EVENT_COMM_EST) {
+		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+		return;
+	}
+
+	rdsdebug("Fatal QP Event %u (%s) "
+		"- connection %pI4->%pI4, reconnecting\n",
+		event->event, rds_ib_event_str(event->event),
+		&conn->c_laddr, &conn->c_faddr);
+	rds_conn_drop(conn);
+}
+
+/*
+ * Create the verbs resources for a connection whose cm_id is already
+ * stored in ic->i_cm_id: send and receive CQs, the RC queue pair, the
+ * DMA-coherent header and ACK buffers, and the zeroed send/recv work
+ * arrays.
+ *
+ * Returns 0 on success and a non-zero error code otherwise
+ * (-EOPNOTSUPP when the device has no RDS client data, -ENOMEM for
+ * allocation failures, or whatever the failing verbs call returned).
+ * Nothing is unwound here on failure; rds_ib_conn_shutdown() releases
+ * whichever resources are non-NULL.
+ *
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_ib_setup_qp(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct ib_qp_init_attr attr;
+	struct rds_ib_device *rds_ibdev;
+	int ret;
+
+	/*
+	 * It's normal to see a null device if an incoming connection races
+	 * with device removal, so we don't print a warning.
+	 */
+	rds_ibdev = rds_ib_get_client_data(dev);
+	if (!rds_ibdev)
+		return -EOPNOTSUPP;
+
+	/* add the conn now so that connection establishment has the dev */
+	rds_ib_add_conn(rds_ibdev, conn);
+
+	/* shrink the rings so that w_nr + 1 work requests fit the device */
+	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
+		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
+	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
+		rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+
+	/* Protection domain and memory range */
+	ic->i_pd = rds_ibdev->pd;
+	ic->i_mr = rds_ibdev->mr;
+
+	ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
+				     rds_ib_cq_event_handler, conn,
+				     ic->i_send_ring.w_nr + 1, 0);
+	if (IS_ERR(ic->i_send_cq)) {
+		ret = PTR_ERR(ic->i_send_cq);
+		ic->i_send_cq = NULL;
+		rdsdebug("ib_create_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
+				     rds_ib_cq_event_handler, conn,
+				     ic->i_recv_ring.w_nr, 0);
+	if (IS_ERR(ic->i_recv_cq)) {
+		ret = PTR_ERR(ic->i_recv_cq);
+		ic->i_recv_cq = NULL;
+		rdsdebug("ib_create_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+	/* XXX negotiate max send/recv with remote? */
+	memset(&attr, 0, sizeof(attr));
+	attr.event_handler = rds_ib_qp_event_handler;
+	attr.qp_context = conn;
+	/* + 1 to allow for the single ack message */
+	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+	attr.cap.max_send_sge = rds_ibdev->max_sge;
+	attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	attr.qp_type = IB_QPT_RC;
+	attr.send_cq = ic->i_send_cq;
+	attr.recv_cq = ic->i_recv_cq;
+
+	/*
+	 * XXX this can fail if max_*_wr is too large?  Are we supposed
+	 * to back off until we get a value that the hardware can support?
+	 */
+	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+	if (ret) {
+		rdsdebug("rdma_create_qp failed: %d\n", ret);
+		goto out;
+	}
+
+	/* one rds_header per ring slot, in DMA-coherent memory */
+	ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_send_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_send_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent send failed\n");
+		goto out;
+	}
+
+	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_recv_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent recv failed\n");
+		goto out;
+	}
+
+	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+				       &ic->i_ack_dma, GFP_KERNEL);
+	if (!ic->i_ack) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent ack failed\n");
+		goto out;
+	}
+
+	/* work arrays are allocated zeroed, on the device's node */
+	ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_sends) {
+		ret = -ENOMEM;
+		rdsdebug("send allocation failed\n");
+		goto out;
+	}
+
+	ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_recvs) {
+		ret = -ENOMEM;
+		rdsdebug("recv allocation failed\n");
+		goto out;
+	}
+
+	rds_ib_recv_init_ack(ic);
+
+	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+		 ic->i_send_cq, ic->i_recv_cq);
+
+out:
+	/* pairs with rds_ib_get_client_data() above */
+	rds_ib_dev_put(rds_ibdev);
+	return ret;
+}
+
+/*
+ * Work out which protocol version to speak with the peer that sent the
+ * connect request in @event.
+ *
+ * Returns 0 if the request must be rejected (no private data, or no
+ * protocol version in common), RDS_PROTOCOL_3_0 for peers that do not
+ * fill in the version fields, and otherwise RDS_PROTOCOL_3_0 plus the
+ * position of the highest minor-version bit both sides support.
+ */
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+{
+	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+	u32 negotiated;
+	u16 shared;
+
+	/*
+	 * rdma_cm private data is odd - when there is any private data in the
+	 * request, we will be given a pretty large buffer without telling us the
+	 * original size. The only way to tell the difference is by looking at
+	 * the contents, which are initialized to zero.
+	 * If the protocol version fields aren't set, this is a connection attempt
+	 * from an older version. This could be 3.0 or 2.0 - we can't tell.
+	 * We really should have changed this for OFED 1.3 :-(
+	 */
+
+	/* Be paranoid. RDS always has privdata */
+	if (!event->param.conn.private_data_len) {
+		printk(KERN_NOTICE "RDS incoming connection has no private data, "
+			"rejecting\n");
+		return 0;
+	}
+
+	/* Even if len is crap *now* I still want to check it. -ASG */
+	if (event->param.conn.private_data_len < sizeof (*dp))
+		return RDS_PROTOCOL_3_0;
+	if (dp->dp_protocol_major == 0)
+		return RDS_PROTOCOL_3_0;
+
+	shared = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+	if (dp->dp_protocol_major != 3 || !shared) {
+		if (printk_ratelimit()) {
+			printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+				"incompatible protocol version %u.%u\n",
+				&dp->dp_saddr,
+				dp->dp_protocol_major,
+				dp->dp_protocol_minor);
+		}
+		return 0;
+	}
+
+	/* bump the minor version once per bit below the highest shared one */
+	negotiated = RDS_PROTOCOL_3_0;
+	for (shared >>= 1; shared != 0; shared >>= 1)
+		negotiated++;
+
+	return negotiated;
+}
+
+/*
+ * Handle an incoming connect request (installed as
+ * rds_ib_transport.cm_handle_connect).
+ *
+ * Checks the peer's protocol version, obtains the rds_connection for the
+ * address pair in the private data via rds_conn_create(), moves it from
+ * DOWN to CONNECTING under c_cm_lock, attaches @cm_id to it, sets up the
+ * QP and accepts the request.
+ *
+ * Returns the local 'destroy' flag: 1 as long as @cm_id has not been
+ * attached to the connection, 0 once it has.  Whenever 'err' is still
+ * non-zero on exit the request is also rejected with rdma_reject().
+ * NOTE(review): per the comment in rds_ib_cm_initiate_connect(), a
+ * non-zero return tells the rdma_cm to destroy the cm_id - presumably the
+ * same convention applies here; confirm against the caller.
+ */
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+				    struct rdma_cm_event *event)
+{
+	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
+	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
+	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+	struct rds_ib_connect_private dp_rep;
+	struct rds_connection *conn = NULL;
+	struct rds_ib_connection *ic = NULL;
+	struct rdma_conn_param conn_param;
+	u32 version;
+	/* pessimistic defaults: reject, and let the caller drop the cm_id */
+	int err = 1, destroy = 1;
+
+	/* Check whether the remote protocol version matches ours. */
+	version = rds_ib_protocol_compatible(event);
+	if (!version)
+		goto out;
+
+	/*
+	 * NOTE(review): rds_ib_protocol_compatible() also returns a version
+	 * when private_data_len is smaller than sizeof(*dp); the dp fields
+	 * read below then rely on the rdma_cm handing us a larger zeroed
+	 * buffer, as described in that function's comment.
+	 */
+	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
+		 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+		 (unsigned long long)be64_to_cpu(lguid),
+		 (unsigned long long)be64_to_cpu(fguid));
+
+	/* the peer's destination is our local address, and vice versa */
+	conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
+			       GFP_KERNEL);
+	if (IS_ERR(conn)) {
+		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+		/* keep the out: path from unlocking an error pointer */
+		conn = NULL;
+		goto out;
+	}
+
+	/*
+	 * The connection request may occur while the
+	 * previous connection exist, e.g. in case of failover.
+	 * But as connections may be initiated simultaneously
+	 * by both hosts, we have a random backoff mechanism -
+	 * see the comment above rds_queue_reconnect()
+	 */
+	mutex_lock(&conn->c_cm_lock);
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		if (rds_conn_state(conn) == RDS_CONN_UP) {
+			/*
+			 * NOTE(review): this branch handles a connection that
+			 * is already UP, although the message says
+			 * "while connecting".
+			 */
+			rdsdebug("incoming connect while connecting\n");
+			rds_conn_drop(conn);
+			rds_ib_stats_inc(s_ib_listen_closed_stale);
+		} else
+		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+			/* Wait and see - our connect may still be succeeding */
+			rds_ib_stats_inc(s_ib_connect_raced);
+		}
+		/* err and destroy are still 1: reject and give up the cm_id */
+		goto out;
+	}
+
+	ic = conn->c_transport_data;
+
+	rds_ib_set_protocol(conn, version);
+	rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	BUG_ON(cm_id->context);
+	BUG_ON(ic->i_cm_id);
+
+	ic->i_cm_id = cm_id;
+	cm_id->context = conn;
+
+	/* We got halfway through setting up the ib_connection, if we
+	 * fail now, we have to take the long route out of this mess. */
+	destroy = 0;
+
+	err = rds_ib_setup_qp(conn);
+	if (err) {
+		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
+		goto out;
+	}
+
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+		event->param.conn.responder_resources,
+		event->param.conn.initiator_depth);
+
+	/* rdma_accept() calls rdma_reject() internally if it fails */
+	err = rdma_accept(cm_id, &conn_param);
+	if (err)
+		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+
+out:
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+	/*
+	 * NOTE(review): this also runs after a failed rdma_accept(), which
+	 * according to the comment above has already rejected internally.
+	 */
+	if (err)
+		rdma_reject(cm_id, NULL, 0);
+	return destroy;
+}
+
+
+/*
+ * Active side of connection setup (installed as
+ * rds_ib_transport.cm_initiate_connect): build the QP and send the
+ * connect request carrying our private data.
+ *
+ * Returns 0 on success.  A failure is reported through
+ * rds_ib_conn_error(); the error code is only passed back to the caller
+ * when the connection no longer refers to @cm_id, since a non-zero
+ * return makes the rdma_cm destroy the cm_id.
+ */
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+	struct rds_connection *conn = cm_id->context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_connect_private dp;
+	struct rdma_conn_param conn_param;
+	int ret;
+
+	/* Peers that don't negotiate the protocol get RDSv3.0 */
+	rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+	ic->i_flowctl = rds_ib_sysctl_flow_control;	/* advertise flow control */
+
+	ret = rds_ib_setup_qp(conn);
+	if (ret) {
+		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
+	} else {
+		rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
+					  RDS_PROTOCOL_VERSION,
+					  UINT_MAX, UINT_MAX);
+		ret = rdma_connect(cm_id, &conn_param);
+		if (ret)
+			rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+	}
+
+	/*
+	 * Beware - a non-zero return makes the rdma_cm destroy the cm_id,
+	 * which must not happen while we still "own" it.
+	 */
+	if (ret && ic->i_cm_id == cm_id)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * Start an outgoing connection (installed as
+ * rds_ib_transport.conn_connect): create a cm_id for @conn and kick off
+ * address resolution from c_laddr to c_faddr:RDS_PORT.  CM events are
+ * delivered to rds_rdma_cm_event_handler().
+ *
+ * Returns 0 if resolution was started.  On failure the error code is
+ * returned and ic->i_cm_id is left NULL.
+ */
+int rds_ib_conn_connect(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct sockaddr_in src, dest;
+	int ret;
+
+	/* XXX I wonder what affect the port space has */
+	/* delegate cm event handler to rdma_transport */
+	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+				     RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ic->i_cm_id)) {
+		ret = PTR_ERR(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		rdsdebug("rdma_create_id() failed: %d\n", ret);
+		return ret;
+	}
+
+	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+	/* local end: our address, any port */
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+	src.sin_port = (__force u16)htons(0);
+
+	/* remote end: peer address, well-known RDS port */
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+	dest.sin_port = (__force u16)htons(RDS_PORT);
+
+	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+				(struct sockaddr *)&dest,
+				RDS_RDMA_RESOLVE_TIMEOUT_MS);
+	if (ret == 0)
+		return 0;
+
+	rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+		 ret);
+	rdma_destroy_id(ic->i_cm_id);
+	ic->i_cm_id = NULL;
+	return ret;
+}
+
+/*
+ * Tear down the IB resources of a connection.
+ *
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup.  In fact it
+ * can be called multiple times for a given connection.
+ *
+ * Everything hanging off the cm_id is released only when ic->i_cm_id is
+ * set; the software state further down is reset unconditionally.
+ */
+void rds_ib_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	int err = 0;
+
+	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+	if (ic->i_cm_id) {
+		/* Needed for the coherent DMA frees below. */
+		struct ib_device *dev = ic->i_cm_id->device;
+
+		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+		err = rdma_disconnect(ic->i_cm_id);
+		if (err) {
+			/* Actually this may happen quite frequently, when
+			 * an outgoing connect raced with an incoming connect.
+			 */
+			rdsdebug("failed to disconnect, cm: %p err %d\n",
+				ic->i_cm_id, err);
+		}
+
+		/*
+		 * We want to wait for tx and rx completion to finish
+		 * before we tear down the connection, but we have to be
+		 * careful not to get stuck waiting on a send ring that
+		 * only has unsignaled sends in it.  We've shutdown new
+		 * sends before getting here so by waiting for signaled
+		 * sends to complete we're ensured that there will be no
+		 * more tx processing.
+		 */
+		wait_event(rds_ib_ring_empty_wait,
+			   rds_ib_ring_empty(&ic->i_recv_ring) &&
+			   (atomic_read(&ic->i_signaled_sends) == 0));
+		tasklet_kill(&ic->i_recv_tasklet);
+
+		/* Release the header and ACK buffers, if they were allocated. */
+		if (ic->i_send_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_send_hdrs,
+					   ic->i_send_hdrs_dma);
+
+		if (ic->i_recv_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_recv_hdrs,
+					   ic->i_recv_hdrs_dma);
+
+		if (ic->i_ack)
+			ib_dma_free_coherent(dev, sizeof(struct rds_header),
+					     ic->i_ack, ic->i_ack_dma);
+
+		if (ic->i_sends)
+			rds_ib_send_clear_ring(ic);
+		if (ic->i_recvs)
+			rds_ib_recv_clear_ring(ic);
+
+		/* QP first, then the CQs, and finally the cm_id itself. */
+		if (ic->i_cm_id->qp)
+			rdma_destroy_qp(ic->i_cm_id);
+		if (ic->i_send_cq)
+			ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			ib_destroy_cq(ic->i_recv_cq);
+		rdma_destroy_id(ic->i_cm_id);
+
+		/*
+		 * Move connection back to the nodev list.
+		 */
+		if (ic->rds_ibdev)
+			rds_ib_remove_conn(ic->rds_ibdev, conn);
+
+		/* Forget every pointer released (or no longer owned) above. */
+		ic->i_cm_id = NULL;
+		ic->i_pd = NULL;
+		ic->i_mr = NULL;
+		ic->i_send_cq = NULL;
+		ic->i_recv_cq = NULL;
+		ic->i_send_hdrs = NULL;
+		ic->i_recv_hdrs = NULL;
+		ic->i_ack = NULL;
+	}
+	/* A connection without a cm_id must not be attached to a device. */
+	BUG_ON(ic->rds_ibdev);
+
+	/* Clear pending transmit */
+	if (ic->i_data_op) {
+		struct rds_message *rm;
+
+		rm = container_of(ic->i_data_op, struct rds_message, data);
+		rds_message_put(rm);
+		ic->i_data_op = NULL;
+	}
+
+	/* Clear the ACK state */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_set(&ic->i_ack_next, 0);
+#else
+	ic->i_ack_next = 0;
+#endif
+	ic->i_ack_recv = 0;
+
+	/* Clear flow control state */
+	ic->i_flowctl = 0;
+	atomic_set(&ic->i_credits, 0);
+
+	/* Re-initialise both work rings with the current sysctl sizes. */
+	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+	/* Drop a partially received incoming message, if there is one. */
+	if (ic->i_ibinc) {
+		rds_inc_put(&ic->i_ibinc->ii_inc);
+		ic->i_ibinc = NULL;
+	}
+
+	/* Free the work request arrays and clear the pointers. */
+	vfree(ic->i_sends);
+	ic->i_sends = NULL;
+	vfree(ic->i_recvs);
+	ic->i_recvs = NULL;
+}
+
+/*
+ * Allocate and initialise the IB transport state for @conn.
+ *
+ * @conn: connection that will own the new rds_ib_connection; on success
+ *        conn->c_transport_data points at it.
+ * @gfp:  allocation flags supplied by the caller; used for the
+ *        rds_ib_connection itself.
+ *
+ * The new connection is queued on the global ib_nodev_conns list.
+ *
+ * Returns 0 on success, -ENOMEM if the connection cannot be allocated, or
+ * the error returned by rds_ib_recv_alloc_caches().
+ */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+	int ret;
+
+	/* Honour the caller's allocation context rather than GFP_KERNEL. */
+	ic = kzalloc(sizeof(*ic), gfp);
+	if (!ic)
+		return -ENOMEM;
+
+	/*
+	 * NOTE(review): rds_ib_recv_alloc_caches() takes no gfp argument
+	 * here, so its allocation context cannot follow @gfp - confirm
+	 * that it is safe for every caller.
+	 */
+	ret = rds_ib_recv_alloc_caches(ic);
+	if (ret) {
+		kfree(ic);
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&ic->ib_node);
+	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
+		     (unsigned long) ic);
+	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
+	atomic_set(&ic->i_signaled_sends, 0);
+
+	/*
+	 * rds_ib_conn_shutdown() waits for these to be emptied so they
+	 * must be initialized before it can be called.
+	 */
+	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	/* Not associated with a device yet: park it on the nodev list. */
+	spin_lock_irqsave(&ib_nodev_conns_lock, flags);
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+	spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
+
+
+	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Release a connection's IB transport state.  The connection must already
+ * be shut down and must not be set for reconnect.
+ */
+void rds_ib_conn_free(void *arg)
+{
+	struct rds_ib_connection *ic = arg;
+	spinlock_t *list_lock = &ib_nodev_conns_lock;
+
+	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * The conn sits either on its device's list or on the nodev list,
+	 * and each list has its own lock.  A race with shutdown() or
+	 * connect() would cause problems (since rds_ibdev would change)
+	 * but that should never happen.
+	 */
+	if (ic->rds_ibdev)
+		list_lock = &ic->rds_ibdev->spinlock;
+
+	spin_lock_irq(list_lock);
+	list_del(&ic->ib_node);
+	spin_unlock_irq(list_lock);
+
+	rds_ib_recv_free_caches(ic);
+
+	kfree(ic);
+}
+
+
+/*
+ * An error occurred on the connection.
+ *
+ * Drops @conn via rds_conn_drop() and then prints the printf-style
+ * message described by @fmt and the following arguments with vprintk().
+ */
+void
+__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	/* The connection is dropped before the message is logged. */
+	rds_conn_drop(conn);
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 00000000..819c35a0
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+
+#include "rds.h"
+#include "ib.h"
+#include "xlist.h"
+
+/*
+ * Per-CPU flag word.  rds_ib_reuse_fmr() sets CLEAN_LIST_BUSY_BIT in its
+ * CPU's word while it pops from a pool's clean_list, and
+ * wait_clean_list_grace() spins until the bit is clear on every online CPU.
+ */
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_ib_mr {
+	/* device the MR was obtained for; set in rds_ib_get_mr() */
+	struct rds_ib_device	*device;
+	struct rds_ib_mr_pool	*pool;
+	/* FMR handle returned by ib_alloc_fmr() */
+	struct ib_fmr		*fmr;
+
+	/* linkage on one of the pool's drop/free/clean xlists */
+	struct xlist_head	xlist;
+
+	/* unmap_list is for freeing */
+	struct list_head	unmap_list;
+	/* bumped on every successful map; compared with fmr_attr.max_maps */
+	unsigned int		remap_count;
+
+	/* scatterlist currently mapped through this MR and its entry count */
+	struct scatterlist	*sg;
+	unsigned int		sg_len;
+	u64			*dma;
+	/* value returned by ib_dma_map_sg() for sg; 0 when not DMA-mapped */
+	int			sg_dma_len;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rds_ib_mr_pool {
+	struct mutex		flush_lock;		/* serialize fmr invalidate */
+	struct delayed_work	flush_worker;		/* flush worker */
+
+	atomic_t		item_count;		/* total # of MRs */
+	atomic_t		dirty_count;		/* # of dirty MRs */
+
+	struct xlist_head	drop_list;		/* MRs that have reached their max_maps limit */
+	struct xlist_head	free_list;		/* unused MRs */
+	struct xlist_head	clean_list;		/* global unused & unmapped MRs */
+	wait_queue_head_t	flush_wait;		/* allocators waiting for a flush to finish */
+
+	atomic_t		free_pinned;		/* memory pinned by free MRs */
+	unsigned long		max_items;		/* hard limit on item_count */
+	unsigned long		max_items_soft;
+	unsigned long		max_free_pinned;	/* free_pinned level that triggers a flush */
+	struct ib_fmr_attr	fmr_attr;		/* attributes passed to ib_alloc_fmr() */
+};
+
+/* Forward declarations of helpers that are defined further down. */
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
+
+/*
+ * Find the device that has @ipaddr on its ipaddr_list.
+ *
+ * Returns the device with its refcount incremented, or NULL when no device
+ * lists the address.
+ */
+static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
+{
+	struct rds_ib_device *candidate;
+	struct rds_ib_device *match = NULL;
+	struct rds_ib_ipaddr *entry;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(candidate, &rds_ib_devices, list) {
+		list_for_each_entry_rcu(entry, &candidate->ipaddr_list, list) {
+			if (entry->ipaddr != ipaddr)
+				continue;
+
+			/* Take the reference before leaving the RCU section. */
+			atomic_inc(&candidate->refcount);
+			match = candidate;
+			goto unlock;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	return match;
+}
+
+/*
+ * Append @ipaddr to @rds_ibdev's address list.
+ * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
+ */
+static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_ipaddr *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry)
+		return -ENOMEM;
+
+	entry->ipaddr = ipaddr;
+
+	/* Writers are serialised by the device spinlock. */
+	spin_lock_irq(&rds_ibdev->spinlock);
+	list_add_tail_rcu(&entry->list, &rds_ibdev->ipaddr_list);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	return 0;
+}
+
+/*
+ * Unlink the first entry matching @ipaddr from @rds_ibdev's address list
+ * and free it once an RCU grace period has elapsed.
+ */
+static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_ipaddr *entry;
+	struct rds_ib_ipaddr *victim = NULL;
+
+	spin_lock_irq(&rds_ibdev->spinlock);
+	list_for_each_entry_rcu(entry, &rds_ibdev->ipaddr_list, list) {
+		if (entry->ipaddr != ipaddr)
+			continue;
+
+		list_del_rcu(&entry->list);
+		victim = entry;
+		break;
+	}
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	if (!victim)
+		return;
+
+	/* Let RCU readers that may still see the entry finish first. */
+	synchronize_rcu();
+	kfree(victim);
+}
+
+/*
+ * Make @rds_ibdev list @ipaddr.  If a device already lists the address,
+ * the address is first removed from that device.
+ * Returns the result of rds_ib_add_ipaddr().
+ */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_device *prev_owner = rds_ib_get_device(ipaddr);
+
+	if (prev_owner) {
+		rds_ib_remove_ipaddr(prev_owner, ipaddr);
+		/* Drop the reference rds_ib_get_device() took for us. */
+		rds_ib_dev_put(prev_owner);
+	}
+
+	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+}
+
+/*
+ * Move @conn from the global ib_nodev_conns list onto @rds_ibdev's
+ * conn_list and take a reference on the device.
+ */
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* conn was previously on the nodev_conns_list */
+	spin_lock_irq(&ib_nodev_conns_lock);
+	BUG_ON(list_empty(&ib_nodev_conns));
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+
+	/* Nested inside ib_nodev_conns_lock; interrupts are already off. */
+	spin_lock(&rds_ibdev->spinlock);
+	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
+	spin_unlock(&rds_ibdev->spinlock);
+	spin_unlock_irq(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = rds_ibdev;
+	/* This reference is dropped again in rds_ib_remove_conn(). */
+	atomic_inc(&rds_ibdev->refcount);
+}
+
+/*
+ * Move @conn from @rds_ibdev's conn_list back onto the global
+ * ib_nodev_conns list, clear ic->rds_ibdev and drop the device reference.
+ */
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock(&ib_nodev_conns_lock);
+
+	/*
+	 * NOTE(review): spin_unlock_irq() below re-enables interrupts while
+	 * ib_nodev_conns_lock is still held, whereas rds_ib_add_conn() takes
+	 * ib_nodev_conns_lock with interrupts disabled - confirm that this
+	 * asymmetry is intended.
+	 */
+	spin_lock_irq(&rds_ibdev->spinlock);
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+
+	spin_unlock(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = NULL;
+	rds_ib_dev_put(rds_ibdev);
+}
+
+/*
+ * Destroy every connection that is currently on the ib_nodev_conns list.
+ */
+void rds_ib_destroy_nodev_conns(void)
+{
+	struct rds_ib_connection *cur, *upcoming;
+	LIST_HEAD(doomed);
+
+	/*
+	 * Grab the entries onto a private list under the lock so that
+	 * conn_destroy is not called with irqs off.
+	 */
+	spin_lock_irq(&ib_nodev_conns_lock);
+	list_splice(&ib_nodev_conns, &doomed);
+	spin_unlock_irq(&ib_nodev_conns_lock);
+
+	list_for_each_entry_safe(cur, upcoming, &doomed, ib_node)
+		rds_conn_destroy(cur->conn);
+}
+
+/*
+ * Allocate an empty MR pool sized from @rds_ibdev's FMR limits.
+ * Returns the new pool, or ERR_PTR(-ENOMEM) if it cannot be allocated.
+ */
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_mr_pool *new_pool;
+
+	new_pool = kzalloc(sizeof(*new_pool), GFP_KERNEL);
+	if (!new_pool)
+		return ERR_PTR(-ENOMEM);
+
+	/* Flush machinery: lock, wait queue and deferred worker. */
+	mutex_init(&new_pool->flush_lock);
+	init_waitqueue_head(&new_pool->flush_wait);
+	INIT_DELAYED_WORK(&new_pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+	/* All three MR lists start out empty. */
+	INIT_XLIST_HEAD(&new_pool->clean_list);
+	INIT_XLIST_HEAD(&new_pool->drop_list);
+	INIT_XLIST_HEAD(&new_pool->free_list);
+
+	/* We never allow more than max_items MRs to be allocated.
+	 * When we exceed more than max_items_soft, we start freeing
+	 * items more aggressively.
+	 * Make sure that max_items > max_items_soft > max_items / 2
+	 */
+	new_pool->max_items = rds_ibdev->max_fmrs;
+	new_pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
+	new_pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
+
+	/* Attributes handed to ib_alloc_fmr() for every MR of this pool. */
+	new_pool->fmr_attr.page_shift = PAGE_SHIFT;
+	new_pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+	new_pool->fmr_attr.max_pages = fmr_message_size;
+
+	return new_pool;
+}
+
+/*
+ * Report the MR pool limits of @rds_ibdev in @iinfo: the maximum number of
+ * MRs (max_items) and the maximum number of pages per MR.
+ */
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
+{
+	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+	iinfo->rdma_mr_max = pool->max_items;
+	iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+}
+
+/*
+ * Destroy @pool: stop the flush worker, flush with free_all set, and free
+ * the pool.  Warns if MRs or pinned pages are still accounted afterwards.
+ */
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+	/* Make sure the worker is neither pending nor running any more. */
+	cancel_delayed_work_sync(&pool->flush_worker);
+	rds_ib_flush_mr_pool(pool, 1, NULL);
+	WARN_ON(atomic_read(&pool->item_count));
+	WARN_ON(atomic_read(&pool->free_pinned));
+	kfree(pool);
+}
+
+/*
+ * Detach the head of @xl with xlist_del_head_fast() and hand the MR that
+ * contains it back through @ibmr_ret.  @pool is not used.
+ */
+static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
+			 struct rds_ib_mr **ibmr_ret)
+{
+	struct xlist_head *head_node = xlist_del_head_fast(xl);
+
+	*ibmr_ret = list_entry(head_node, struct rds_ib_mr, xlist);
+}
+
+/*
+ * Try to take one MR off @pool's clean_list.
+ * Returns the MR, or NULL if xlist_del_head() returned nothing.
+ */
+static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+{
+	struct rds_ib_mr *ibmr = NULL;
+	struct xlist_head *ret;
+	unsigned long *flag;
+
+	/*
+	 * Stay on this CPU and mark it busy for the duration of the pop, so
+	 * that wait_clean_list_grace() can wait for us to finish.
+	 */
+	preempt_disable();
+	flag = &__get_cpu_var(clean_list_grace);
+	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	ret = xlist_del_head(&pool->clean_list);
+	if (ret)
+		ibmr = list_entry(ret, struct rds_ib_mr, xlist);
+
+	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+	preempt_enable();
+	return ibmr;
+}
+
+/*
+ * Busy-wait until no online CPU has CLEAN_LIST_BUSY_BIT set in its
+ * clean_list_grace word.  CPUs are checked one after the other.
+ */
+static inline void wait_clean_list_grace(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		unsigned long *busy = &per_cpu(clean_list_grace, cpu);
+
+		while (test_bit(CLEAN_LIST_BUSY_BIT, busy))
+			cpu_relax();
+	}
+}
+
+/*
+ * Get an MR for @rds_ibdev: reuse one from the pool's clean list if
+ * possible, otherwise allocate a fresh FMR as long as the pool's max_items
+ * limit allows it.
+ *
+ * Returns the MR on success, or an ERR_PTR():
+ *   -EAGAIN  the pool is at its limit and flushing produced no MR,
+ *   -ENOMEM  the rds_ib_mr could not be allocated,
+ *   or the error reported by ib_alloc_fmr().
+ */
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+	struct rds_ib_mr *ibmr = NULL;
+	int err = 0, iter = 0;
+
+	/* Kick the background flush once a tenth of the pool is dirty. */
+	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		schedule_delayed_work(&pool->flush_worker, 10);
+
+	while (1) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr)
+			return ibmr;
+
+		/* No clean MRs - now we have the choice of either
+		 * allocating a fresh MR up to the limit imposed by the
+		 * driver, or flush any dirty unused MRs.
+		 * We try to avoid stalling in the send path if possible,
+		 * so we allocate as long as we're allowed to.
+		 *
+		 * We're fussy with enforcing the FMR limit, though. If the driver
+		 * tells us we can't use more than N fmrs, we shouldn't start
+		 * arguing with it */
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;
+
+		/* Over the limit: undo our reservation. */
+		atomic_dec(&pool->item_count);
+
+		if (++iter > 2) {
+			rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
+	}
+
+	/* kzalloc_node() hands back zeroed memory; no extra memset needed. */
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+			(IB_ACCESS_LOCAL_WRITE |
+			 IB_ACCESS_REMOTE_READ |
+			 IB_ACCESS_REMOTE_WRITE|
+			 IB_ACCESS_REMOTE_ATOMIC),
+			&pool->fmr_attr);
+	if (IS_ERR(ibmr->fmr)) {
+		err = PTR_ERR(ibmr->fmr);
+		ibmr->fmr = NULL;
+		printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
+		goto out_no_cigar;
+	}
+
+	rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+	return ibmr;
+
+out_no_cigar:
+	if (ibmr) {
+		if (ibmr->fmr)
+			ib_dealloc_fmr(ibmr->fmr);
+		kfree(ibmr);
+	}
+	/* Give back the item_count slot reserved in the loop above. */
+	atomic_dec(&pool->item_count);
+	return ERR_PTR(err);
+}
+
+/*
+ * DMA-map the scatterlist @sg (@nents entries) and program it into
+ * @ibmr's FMR.
+ *
+ * On success the MR's previous mapping is torn down and @ibmr records the
+ * new scatterlist.  On failure the DMA mapping created here is released
+ * again and @ibmr is left as it was.
+ *
+ * Returns 0 on success, or:
+ *   -EBUSY   ib_dma_map_sg() mapped nothing,
+ *   -EINVAL  an interior entry does not start or end on a page boundary,
+ *            or more than fmr_message_size pages are needed,
+ *   -ENOMEM  the page address array could not be allocated,
+ *   or the error returned by ib_map_phys_fmr().
+ */
+static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+	       struct scatterlist *sg, unsigned int nents)
+{
+	struct ib_device *dev = rds_ibdev->dev;
+	struct scatterlist *scat = sg;
+	u64 io_addr = 0;
+	u64 *dma_pages;
+	u32 len;
+	int page_cnt, sg_dma_len;
+	int i, j;
+	int ret;
+
+	sg_dma_len = ib_dma_map_sg(dev, sg, nents,
+				 DMA_BIDIRECTIONAL);
+	if (unlikely(!sg_dma_len)) {
+		printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
+		return -EBUSY;
+	}
+
+	len = 0;
+	page_cnt = 0;
+
+	/*
+	 * First pass: add up the length and check alignment.  Only the first
+	 * entry may start, and only the last entry may end, inside a page.
+	 */
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+		if (dma_addr & ~PAGE_MASK) {
+			if (i > 0) {
+				ret = -EINVAL;
+				goto out_unmap;
+			}
+			++page_cnt;
+		}
+		if ((dma_addr + dma_len) & ~PAGE_MASK) {
+			if (i < sg_dma_len - 1) {
+				ret = -EINVAL;
+				goto out_unmap;
+			}
+			++page_cnt;
+		}
+
+		len += dma_len;
+	}
+
+	page_cnt += len >> PAGE_SHIFT;
+	if (page_cnt > fmr_message_size) {
+		ret = -EINVAL;
+		goto out_unmap;
+	}
+
+	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+				 rdsibdev_to_node(rds_ibdev));
+	if (!dma_pages) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	/* Second pass: record the address of every page that is covered. */
+	page_cnt = 0;
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+		for (j = 0; j < dma_len; j += PAGE_SIZE)
+			dma_pages[page_cnt++] =
+				(dma_addr & PAGE_MASK) + j;
+	}
+
+	ret = ib_map_phys_fmr(ibmr->fmr,
+				   dma_pages, page_cnt, io_addr);
+	if (ret)
+		goto out_free;
+
+	/* Success - we successfully remapped the MR, so we can
+	 * safely tear down the old mapping. */
+	rds_ib_teardown_mr(ibmr);
+
+	ibmr->sg = scat;
+	ibmr->sg_len = nents;
+	ibmr->sg_dma_len = sg_dma_len;
+	ibmr->remap_count++;
+
+	rds_ib_stats_inc(s_ib_rdma_mr_used);
+
+	kfree(dma_pages);
+	return 0;
+
+out_free:
+	kfree(dma_pages);
+out_unmap:
+	/*
+	 * The new mapping was never recorded in ibmr, so nothing else would
+	 * release it: undo ib_dma_map_sg() here.
+	 */
+	ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
+	return ret;
+}
+
+/*
+ * Synchronise the scatterlist mapped by an MR.
+ *
+ * @trans_private: the struct rds_ib_mr.
+ * @direction:     DMA_FROM_DEVICE syncs for the CPU, DMA_TO_DEVICE syncs
+ *                 for the device; any other value does nothing.
+ */
+void rds_ib_sync_mr(void *trans_private, int direction)
+{
+	struct rds_ib_mr *mr = trans_private;
+	struct rds_ib_device *owner = mr->device;
+
+	if (direction == DMA_FROM_DEVICE)
+		ib_dma_sync_sg_for_cpu(owner->dev, mr->sg,
+				       mr->sg_dma_len, DMA_BIDIRECTIONAL);
+	else if (direction == DMA_TO_DEVICE)
+		ib_dma_sync_sg_for_device(owner->dev, mr->sg,
+					  mr->sg_dma_len, DMA_BIDIRECTIONAL);
+}
+
+/*
+ * Undo the DMA mapping of @ibmr's scatterlist, release the pages it
+ * references and free the scatterlist itself.  The pool's free_pinned
+ * counter is not touched here.
+ */
+static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_device *rds_ibdev = ibmr->device;
+
+	if (ibmr->sg_dma_len) {
+		ib_dma_unmap_sg(rds_ibdev->dev,
+				ibmr->sg, ibmr->sg_len,
+				DMA_BIDIRECTIONAL);
+		ibmr->sg_dma_len = 0;
+	}
+
+	/* Release the s/g list */
+	if (ibmr->sg_len) {
+		unsigned int i;
+
+		for (i = 0; i < ibmr->sg_len; ++i) {
+			struct page *page = sg_page(&ibmr->sg[i]);
+
+			/* FIXME we need a way to tell a r/w MR
+			 * from a r/o MR */
+			BUG_ON(irqs_disabled());
+			/* Every page is marked dirty before its reference is dropped. */
+			set_page_dirty(page);
+			put_page(page);
+		}
+		kfree(ibmr->sg);
+
+		ibmr->sg = NULL;
+		ibmr->sg_len = 0;
+	}
+}
+
+/*
+ * Tear down @ibmr's mapping and subtract the scatterlist entries it held
+ * from the pool's free_pinned counter.
+ */
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+	/* Sample the count first: the teardown resets sg_len to 0. */
+	unsigned int nr_pinned = ibmr->sg_len;
+
+	__rds_ib_teardown_mr(ibmr);
+	if (!nr_pinned)
+		return;
+
+	atomic_sub(nr_pinned, &ibmr->device->mr_pool->free_pinned);
+}
+
+/*
+ * Number of MRs a flush should free outright: every MR in the pool when
+ * @free_all is set, none otherwise.
+ */
+static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
+{
+	unsigned int goal = 0;
+
+	if (free_all)
+		goal = atomic_read(&pool->item_count);
+
+	return goal;
+}
+
+/*
+ * Empty @xlist and append each MR found on it to @list (linked through
+ * the MR's unmap_list member) for further processing.
+ */
+static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+{
+	struct xlist_head detached;
+	struct xlist_head *node;
+	struct xlist_head *following;
+
+	/* Move the whole chain onto a private head first. */
+	detached.next = NULL;
+	xlist_splice(xlist, &detached);
+
+	for (node = detached.next; node; node = following) {
+		struct rds_ib_mr *mr;
+
+		following = node->next;
+		mr = list_entry(node, struct rds_ib_mr, xlist);
+		list_add_tail(&mr->unmap_list, list);
+	}
+}
+
+/*
+ * Chain every MR on @list (linked through unmap_list) into a singly
+ * linked xlist hanging off @xlist, in list order.  The last MR's
+ * xlist node is returned through @tail_ret, or NULL if @list is empty.
+ * @pool is not used.
+ */
+static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
+				struct list_head *list, struct xlist_head *xlist,
+				struct xlist_head **tail_ret)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head *cur_mr = xlist;
+	struct xlist_head *tail_mr = NULL;
+
+	list_for_each_entry(ibmr, list, unmap_list) {
+		tail_mr = &ibmr->xlist;
+		/* Terminate the chain here, then hook it behind the previous node. */
+		tail_mr->next = NULL;
+		cur_mr->next = tail_mr;
+		cur_mr = tail_mr;
+	}
+	*tail_ret = tail_mr;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ *
+ * @pool:     pool to flush.
+ * @free_all: when set, MRs on the clean list are flushed too and every MR
+ *            that is processed gets freed.
+ * @ibmr_ret: if not NULL, the caller wants an MR: one is handed back
+ *            through it as soon as one is available, possibly without
+ *            flushing at all.
+ *
+ * Returns 0, or the error reported by ib_unmap_fmr().
+ */
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+			        int free_all, struct rds_ib_mr **ibmr_ret)
+{
+	struct rds_ib_mr *ibmr, *next;
+	struct xlist_head clean_xlist;
+	struct xlist_head *clean_tail;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+	unsigned long unpinned = 0;
+	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	int ret = 0;
+
+	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+
+	if (ibmr_ret) {
+		DEFINE_WAIT(wait);
+		/*
+		 * Somebody else is flushing.  Rather than queueing up on the
+		 * mutex, keep trying to grab a clean MR and sleep on
+		 * flush_wait in between.
+		 */
+		while(!mutex_trylock(&pool->flush_lock)) {
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+
+			prepare_to_wait(&pool->flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (xlist_empty(&pool->clean_list))
+				schedule();
+
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+		}
+		finish_wait(&pool->flush_wait, &wait);
+	} else
+		mutex_lock(&pool->flush_lock);
+
+	/* We hold flush_lock now; check the clean list once more. */
+	if (ibmr_ret) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr) {
+			*ibmr_ret = ibmr;
+			goto out;
+		}
+	}
+
+	/* Get the list of all MRs to be dropped. Ordering matters -
+	 * we want to put drop_list ahead of free_list.
+	 */
+	xlist_append_to_list(&pool->drop_list, &unmap_list);
+	xlist_append_to_list(&pool->free_list, &unmap_list);
+	if (free_all)
+		xlist_append_to_list(&pool->clean_list, &unmap_list);
+
+	free_goal = rds_ib_flush_goal(pool, free_all);
+
+	if (list_empty(&unmap_list))
+		goto out;
+
+	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+	list_for_each_entry(ibmr, &unmap_list, unmap_list)
+		list_add(&ibmr->fmr->list, &fmr_list);
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
+
+	/* Now we can destroy the DMA mapping and unpin any pages */
+	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+		unpinned += ibmr->sg_len;
+		__rds_ib_teardown_mr(ibmr);
+		/*
+		 * Free the MR outright while we are below the free goal or
+		 * when it has used up its remaps; otherwise it stays on
+		 * unmap_list and goes back to the clean list below.
+		 */
+		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
+			rds_ib_stats_inc(s_ib_rdma_mr_free);
+			list_del(&ibmr->unmap_list);
+			ib_dealloc_fmr(ibmr->fmr);
+			kfree(ibmr);
+			nfreed++;
+		}
+		ncleaned++;
+	}
+
+	if (!list_empty(&unmap_list)) {
+		/* we have to make sure that none of the things we're about
+		 * to put on the clean list would race with other cpus trying
+		 * to pull items off.  The xlist would explode if we managed to
+		 * remove something from the clean list and then add it back again
+		 * while another CPU was spinning on that same item in xlist_del_head.
+		 *
+		 * This is pretty unlikely, but just in case  wait for an xlist grace period
+		 * here before adding anything back into the clean list.
+		 */
+		wait_clean_list_grace();
+
+		/* unmap_list is not empty, so this sets clean_xlist.next. */
+		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+		if (ibmr_ret)
+			refill_local(pool, &clean_xlist, ibmr_ret);
+
+		/* refill_local may have emptied our list */
+		if (!xlist_empty(&clean_xlist))
+			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+
+	}
+
+	/* Update the pool accounting for everything processed above. */
+	atomic_sub(unpinned, &pool->free_pinned);
+	atomic_sub(ncleaned, &pool->dirty_count);
+	atomic_sub(nfreed, &pool->item_count);
+
+out:
+	mutex_unlock(&pool->flush_lock);
+	/* Wake allocators sleeping in the trylock loop above. */
+	if (waitqueue_active(&pool->flush_wait))
+		wake_up(&pool->flush_wait);
+out_nolock:
+	return ret;
+}
+
+/*
+ * Delayed-work handler for pool->flush_worker: runs a regular flush
+ * (free_all not set, no MR requested) on the owning pool.
+ */
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
+{
+	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+
+	rds_ib_flush_mr_pool(pool, 0, NULL);
+}
+
+/*
+ * Return an MR to its device's pool.
+ *
+ * @trans_private: the struct rds_ib_mr being released.
+ * @invalidate:    if non-zero, flush the pool right away, or schedule the
+ *                 flush worker when called in interrupt context.
+ *
+ * The MR is only queued here; it is unmapped by a later flush.  The
+ * device reference held through ibmr->device is dropped.
+ */
+void rds_ib_free_mr(void *trans_private, int invalidate)
+{
+	struct rds_ib_mr *ibmr = trans_private;
+	struct rds_ib_device *rds_ibdev = ibmr->device;
+	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+
+	/* Return it to the pool's free list */
+	/* MRs that have used up their remaps go onto drop_list instead. */
+	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
+	else
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
+
+	atomic_add(ibmr->sg_len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	/* If we've pinned too many pages, request a flush */
+	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		schedule_delayed_work(&pool->flush_worker, 10);
+
+	if (invalidate) {
+		if (likely(!in_interrupt())) {
+			rds_ib_flush_mr_pool(pool, 0, NULL);
+		} else {
+			/* We get here if the user created a MR marked
+			 * as use_once and invalidate at the same time. */
+			schedule_delayed_work(&pool->flush_worker, 10);
+		}
+	}
+
+	rds_ib_dev_put(rds_ibdev);
+}
+
+/*
+ * Run a regular flush on the MR pool of every device on rds_ib_devices.
+ * Devices without a pool are skipped.
+ */
+void rds_ib_flush_mrs(void)
+{
+	struct rds_ib_device *dev;
+
+	down_read(&rds_ib_devices_lock);
+	list_for_each_entry(dev, &rds_ib_devices, list) {
+		struct rds_ib_mr_pool *mr_pool = dev->mr_pool;
+
+		if (!mr_pool)
+			continue;
+
+		rds_ib_flush_mr_pool(mr_pool, 0, NULL);
+	}
+	up_read(&rds_ib_devices_lock);
+}
+
+/*
+ * Obtain an MR for the scatterlist @sg (@nents entries) on the device
+ * that owns @rs's bound address.
+ *
+ * On success the MR is returned, its rkey is stored in *@key_ret, and the
+ * MR keeps the device reference until rds_ib_free_mr() drops it.
+ *
+ * On failure an ERR_PTR() is returned and no device reference is kept:
+ *   -ENODEV  no device owns the bound address, or it has no MR pool,
+ *   or the error from rds_ib_alloc_fmr() / rds_ib_map_fmr().
+ */
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_mr *ibmr = NULL;
+	int ret;
+
+	/* Takes a reference on the device when one is found. */
+	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+	if (!rds_ibdev) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!rds_ibdev->mr_pool) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ibmr = rds_ib_alloc_fmr(rds_ibdev);
+	if (IS_ERR(ibmr)) {
+		/* No MR took over our device reference, so drop it here. */
+		rds_ib_dev_put(rds_ibdev);
+		return ibmr;
+	}
+
+	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+	if (ret == 0)
+		*key_ret = ibmr->fmr->rkey;
+	else
+		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
+
+	/*
+	 * From here on the MR owns the device reference; rds_ib_free_mr()
+	 * releases it, including on the error path below.
+	 */
+	ibmr->device = rds_ibdev;
+	rds_ibdev = NULL;
+
+ out:
+	if (ret) {
+		if (ibmr)
+			rds_ib_free_mr(ibmr, 0);
+		ibmr = ERR_PTR(ret);
+	}
+	if (rds_ibdev)
+		rds_ib_dev_put(rds_ibdev);
+	return ibmr;
+}
+
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 00000000..e29e0ca3
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,1076 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static struct kmem_cache *rds_ib_incoming_slab;
+static struct kmem_cache *rds_ib_frag_slab;
+static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);
+
+/*
+ * Put every receive work request of the ring into its initial state: no
+ * inc or frag attached, wr_id equal to the ring index, and a two-entry
+ * sge list (sge 0 for the header, sge 1 for the fragment).
+ */
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_recv_ring.w_nr; i++) {
+		struct rds_ib_recv_work *recv = &ic->i_recvs[i];
+		struct ib_sge *hdr_sge = &recv->r_sge[0];
+		struct ib_sge *frag_sge = &recv->r_sge[1];
+
+		recv->r_ibinc = NULL;
+		recv->r_frag = NULL;
+
+		recv->r_wr.next = NULL;
+		recv->r_wr.wr_id = i;
+		recv->r_wr.sg_list = recv->r_sge;
+		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+
+		/* sge 0: the i-th rds_header slot of the header DMA area */
+		hdr_sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		hdr_sge->length = sizeof(struct rds_header);
+		hdr_sge->lkey = ic->i_mr->lkey;
+
+		/* sge 1: address is filled in when a frag is attached */
+		frag_sge->addr = 0;
+		frag_sge->length = RDS_FRAG_SIZE;
+		frag_sge->lkey = ic->i_mr->lkey;
+	}
+}
+
+/*
+ * Append a ring of list_heads that has no separate anchor node to 'to'.
+ *
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.  The elements keep their order, starting
+ * with 'from' and ending with the element that preceded it in the ring.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	/* Use the ring's last element as a stand-in list head. */
+	struct list_head *from_last = from->prev;
+
+	/* Moves every element except from_last itself. */
+	list_splice_tail(from_last, to);
+	/* Append the stand-in so that it ends up last again. */
+	list_add_tail(from_last, to);
+}
+
+/*
+ * Atomically take whatever is parked on cache->xfer and make it available
+ * on cache->ready: either appended to the existing ready list or, if
+ * there is none, installed as the ready list.
+ */
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *batch = xchg(&cache->xfer, NULL);
+
+	if (!batch)
+		return;
+
+	if (!cache->ready) {
+		cache->ready = batch;
+		return;
+	}
+
+	list_splice_entire_tail(batch, cache->ready);
+}
+
+/*
+ * Set up one refill cache: allocate its per-cpu heads and start with all
+ * lists (per-cpu, xfer, ready) empty.
+ *
+ * Returns 0 on success or -ENOMEM if the per-cpu allocation fails.
+ */
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct rds_ib_cache_head *chp = per_cpu_ptr(cache->percpu, cpu);
+
+		chp->first = NULL;
+		chp->count = 0;
+	}
+
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+/*
+ * Allocate the inc and frag refill caches of a connection.  If the frag
+ * cache cannot be set up, the per-cpu area of the inc cache is released
+ * again.  Returns 0 or the error from rds_ib_recv_alloc_cache().
+ */
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (ret)
+		return ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+	if (ret)
+		free_percpu(ic->i_cache_incs.percpu);
+
+	return ret;
+}
+
+/*
+ * Move everything held on the per-cpu lists and on the ready list of
+ * 'cache' to the tail of 'caller_list', leaving those lists empty.
+ * The xfer list is not touched here.
+ */
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct rds_ib_cache_head *chp = per_cpu_ptr(cache->percpu, cpu);
+
+		if (!chp->first)
+			continue;
+
+		list_splice_entire_tail(chp->first, caller_list);
+		chp->first = NULL;
+	}
+
+	if (!cache->ready)
+		return;
+
+	list_splice_entire_tail(cache->ready, caller_list);
+	cache->ready = NULL;
+}
+
+/*
+ * Tear down both recycle caches of a connection.
+ *
+ * For each cache the xfer list is pulled onto the ready list, all per-cpu
+ * lists and the ready list are collected on a local list, the per-cpu
+ * area is released and every collected object is returned to its slab.
+ */
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+		/*
+		 * Each inc was charged to rds_ib_allocation when it was
+		 * taken from the slab in rds_ib_refill_one_inc().  Give the
+		 * charge back, otherwise the counter only ever grows and
+		 * eventually pins at rds_ib_sysctl_max_recv_allocation.
+		 */
+		atomic_dec(&rds_ib_allocation);
+	}
+
+	/* 'list' is empty again here and is reused for the frags. */
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/*
+ * Recycle a frag, together with the receive buffer described by its f_sg,
+ * by putting it on the connection's frag cache.
+ */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	struct rds_ib_refill_cache *frag_cache = &ic->i_cache_frags;
+
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, frag_cache);
+}
+
+/*
+ * Recycle an inc: every frag still hanging off it is detached and put on
+ * the frag cache first, then the inc itself goes onto the inc cache.
+ */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+	struct rds_ib_incoming *ibinc =
+		container_of(inc, struct rds_ib_incoming, ii_inc);
+	struct rds_page_frag *frag;
+	struct rds_page_frag *next;
+
+	list_for_each_entry_safe(frag, next, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
+}
+
+/*
+ * Release whatever is still attached to one receive work request: drop
+ * the reference on its inc and unmap and recycle its frag.
+ */
+static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
+				  struct rds_ib_recv_work *recv)
+{
+	struct rds_page_frag *frag;
+
+	if (recv->r_ibinc) {
+		rds_inc_put(&recv->r_ibinc->ii_inc);
+		recv->r_ibinc = NULL;
+	}
+
+	frag = recv->r_frag;
+	if (!frag)
+		return;
+
+	ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE);
+	rds_ib_frag_free(ic, frag);
+	recv->r_frag = NULL;
+}
+
+/* Release the inc and frag of every entry in the receive ring. */
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
+{
+	u32 slot = 0;
+
+	while (slot < ic->i_recv_ring.w_nr) {
+		rds_ib_recv_clear_one(ic, &ic->i_recvs[slot]);
+		slot++;
+	}
+}
+
+/*
+ * Get an initialised inc for a receive work request.
+ *
+ * A recycled inc from the connection's cache is preferred.  Otherwise a
+ * new one is taken from the slab, provided the global rds_ib_allocation
+ * counter is still below rds_ib_sysctl_max_recv_allocation.
+ *
+ * Returns NULL if the limit is reached or the slab allocation fails.
+ */
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
+{
+	struct rds_ib_incoming *ibinc;
+	struct list_head *recycled;
+
+	recycled = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (recycled) {
+		ibinc = container_of(recycled, struct rds_ib_incoming, ii_cache_entry);
+		goto init;
+	}
+
+	/* Charge the allocation up front; refuse if that hits the limit. */
+	if (!atomic_add_unless(&rds_ib_allocation, 1,
+			       rds_ib_sysctl_max_recv_allocation)) {
+		rds_ib_stats_inc(s_ib_rx_alloc_limit);
+		return NULL;
+	}
+
+	ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+	if (!ibinc) {
+		atomic_dec(&rds_ib_allocation);
+		return NULL;
+	}
+
+init:
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+
+	return ibinc;
+}
+
+/*
+ * Get a frag for a receive work request.
+ *
+ * A recycled frag from the connection's cache is used as is.  Otherwise a
+ * frag is taken from the slab (slab_mask) and given RDS_FRAG_SIZE bytes of
+ * page memory through rds_page_remainder_alloc() (page_mask).
+ *
+ * Returns NULL if either allocation fails.
+ */
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *recycled = rds_ib_recv_cache_get(&ic->i_cache_frags);
+
+	if (recycled) {
+		frag = container_of(recycled, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		if (rds_page_remainder_alloc(&frag->f_sg, RDS_FRAG_SIZE,
+					     page_mask)) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
+	}
+
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+/*
+ * Give one receive work request everything it needs before it is posted:
+ * an inc (unless it still has one), a frag, a DMA mapping for the frag and
+ * matching sge entries.
+ *
+ * With 'prefill' set the allocations use GFP_KERNEL / GFP_HIGHUSER,
+ * otherwise GFP_NOWAIT.
+ *
+ * Returns 0 on success or -ENOMEM if the inc or the frag could not be
+ * obtained.  A failed DMA mapping only triggers a WARN_ON.
+ */
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
+	}
+
+	/* Top up the ready lists from the xfer lists if they ran dry. */
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+
+	/*
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
+	 */
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
+
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
+
+	/* sge 0: this entry's slot in the header DMA area */
+	sge = &recv->r_sge[0];
+	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+	sge->length = sizeof(struct rds_header);
+
+	/* sge 1: the freshly mapped frag */
+	sge = &recv->r_sge[1];
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.
+ *
+ * Nothing is returned.  If filling or posting an entry fails, refilling stops
+ * and one ring entry is handed back with rds_ib_ring_unalloc().
+ */
+/*
+ * Claim free ring entries one at a time, fill each with
+ * rds_ib_recv_refill_one() and post it to the QP.  The loop runs while
+ * entries can be allocated and either 'prefill' is set or the connection
+ * is up.
+ */
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_recv_work *recv;
+	struct ib_recv_wr *failed_wr;
+	unsigned int posted = 0;
+	int ret = 0;
+	u32 pos;
+
+	while ((prefill || rds_conn_up(conn)) &&
+	       rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+		/* Sanity check on the position handed out by the ring. */
+		if (pos >= ic->i_recv_ring.w_nr) {
+			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+					pos);
+			break;
+		}
+
+		recv = &ic->i_recvs[pos];
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
+		if (ret) {
+			break;
+		}
+
+		/* XXX when can this fail? */
+		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
+		if (ret) {
+			rds_ib_conn_error(conn, "recv post on "
+			       "%pI4 returned %d, disconnecting and "
+			       "reconnecting\n", &conn->c_faddr,
+			       ret);
+			break;
+		}
+
+		posted++;
+	}
+
+	/* We're doing flow control - update the window. */
+	if (ic->i_flowctl && posted)
+		rds_ib_advertise_credits(conn, posted);
+
+	/* The entry claimed last was never posted; hand it back. */
+	if (ret)
+		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+}
+
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * we move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/cmpxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true when exactly one element is present.
+ */
+/*
+ * Put 'new_item' on the current cpu's list of 'cache'.  Once that list
+ * holds RDS_IB_RECYCLE_BATCH_COUNT items, the whole list is handed over to
+ * cache->xfer.  Runs with local interrupts disabled.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				 struct rds_ib_refill_cache *cache)
+{
+	unsigned long flags;
+	struct rds_ib_cache_head *chp;
+	struct list_head *old;
+
+	local_irq_save(flags);
+
+	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+	if (!chp->first)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chp->first);
+	/* new_item sits just before the old first element; make it first. */
+	chp->first = new_item;
+	chp->count++;
+
+	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chp->first);
+		/* A non-NULL result means xfer was refilled meanwhile; retry. */
+		old = cmpxchg(&cache->xfer, NULL, chp->first);
+	} while (old);
+
+	chp->first = NULL;
+	chp->count = 0;
+end:
+	local_irq_restore(flags);
+}
+
+/*
+ * Take the first element off cache->ready and return it, or return NULL
+ * if the ready list is empty.  Because the list has no anchor node, a
+ * list_empty() element is the last one left on the list.
+ */
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *item = cache->ready;
+
+	if (!item)
+		return NULL;
+
+	if (list_empty(item)) {
+		/* That was the only element left. */
+		cache->ready = NULL;
+		return item;
+	}
+
+	cache->ready = item->next;
+	list_del_init(item);
+
+	return item;
+}
+
+/*
+ * Copy the payload of 'inc' from its frags into the iovec array starting
+ * at 'first_iov'.  At most 'size' bytes and at most the message length
+ * from the header (h_len) are copied.
+ *
+ * Returns the number of bytes copied, or the non-zero value returned by
+ * rds_page_copy_to_user() if a copy fails.
+ */
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+			    size_t size)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct iovec *iov = first_iov;
+	unsigned long to_copy;
+	unsigned long frag_off = 0;
+	unsigned long iov_off = 0;
+	int copied = 0;
+	int ret;
+	u32 len;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+	len = be32_to_cpu(inc->i_hdr.h_len);
+
+	while (copied < size && copied < len) {
+		/* Current frag is used up: move on to the next one. */
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+		/* Skip iovecs that are full (or have zero length). */
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		/* Bounded by iovec space, frag data, 'size' and 'len'. */
+		to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+		to_copy = min_t(size_t, to_copy, size - copied);
+		to_copy = min_t(unsigned long, to_copy, len - copied);
+
+		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+			 "[%p, %u] + %lu\n",
+			 to_copy, iov->iov_base, iov->iov_len, iov_off,
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
+
+		/* XXX needs + offset for multiple recvs per page */
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
+					    iov->iov_base + iov_off,
+					    to_copy);
+		if (ret) {
+			/* The error replaces the byte count as return value. */
+			copied = ret;
+			break;
+		}
+
+		iov_off += to_copy;
+		frag_off += to_copy;
+		copied += to_copy;
+	}
+
+	return copied;
+}
+
+/*
+ * Prepare the connection's long-lived ACK send work request: a single
+ * sge covering one rds_header at i_ack_dma, sent as a signaled, solicited
+ * IB_WR_SEND tagged with RDS_IB_ACK_WR_ID.
+ *
+ * ic starts out kzalloc()ed, so fields not set here are zero.
+ */
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
+{
+	ic->i_ack_sge.addr = ic->i_ack_dma;
+	ic->i_ack_sge.length = sizeof(struct rds_header);
+	ic->i_ack_sge.lkey = ic->i_mr->lkey;
+
+	ic->i_ack_wr.sg_list = &ic->i_ack_sge;
+	ic->i_ack_wr.num_sge = 1;
+	ic->i_ack_wr.opcode = IB_WR_SEND;
+	ic->i_ack_wr.wr_id = RDS_IB_ACK_WR_ID;
+	ic->i_ack_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received.  The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory.  This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed.  This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue.  To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time.  This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight.  This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame.  This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do.  The QP attribute specifically makes
+ * room for it beyond the ring size.  Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+/*
+ * rds_ib_set_ack() records the sequence number to acknowledge next and,
+ * if ack_required is set, raises IB_ACK_REQUESTED.  rds_ib_get_ack()
+ * clears IB_ACK_REQUESTED and returns the recorded sequence number.
+ *
+ * Two implementations follow: one protects i_ack_next with i_ack_lock,
+ * the other keeps it in an atomic64.
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+				int ack_required)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+	unsigned long flags;
+	u64 seq;
+
+	/* The flag is cleared before the sequence number is read. */
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	seq = ic->i_ack_next;
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+	return seq;
+}
+#else
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+				int ack_required)
+{
+	atomic64_set(&ic->i_ack_next, seq);
+	if (ack_required) {
+		/*
+		 * NOTE(review): presumably meant to order the i_ack_next
+		 * store before the flag becomes visible - confirm.
+		 */
+		smp_mb__before_clear_bit();
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	}
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+	/* The flag is cleared before the sequence number is read. */
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	smp_mb__after_clear_bit();
+
+	return atomic64_read(&ic->i_ack_next);
+}
+#endif
+
+
+/*
+ * Build an ACK-only header in ic->i_ack carrying the current ack sequence
+ * and 'adv_credits', then post the connection's ACK work request.
+ *
+ * If posting fails, IB_ACK_IN_FLIGHT is released, IB_ACK_REQUESTED is set
+ * again and the connection is put into error handling.
+ */
+static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
+{
+	struct rds_header *hdr = ic->i_ack;
+	struct ib_send_wr *failed_wr;
+	u64 seq;
+	int ret;
+
+	seq = rds_ib_get_ack(ic);
+
+	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+	rds_message_populate_header(hdr, 0, 0, 0);
+	hdr->h_ack = cpu_to_be64(seq);
+	hdr->h_credit = adv_credits;
+	rds_message_make_checksum(hdr);
+	ic->i_ack_queued = jiffies;
+
+	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+	if (unlikely(ret)) {
+		/* Failed to send. Release the WR, and
+		 * force another ACK.
+		 */
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+		rds_ib_stats_inc(s_ib_ack_send_failure);
+
+		rds_ib_conn_error(ic->conn, "sending ack failed\n");
+	} else
+		rds_ib_stats_inc(s_ib_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ *  1.	We call rds_ib_attempt_ack from the recv completion handler
+ *	to send an ACK-only frame.
+ *	However, there can be only one such frame in the send queue
+ *	at any time, so we may have to postpone it.
+ *  2.	When another (data) packet is transmitted while there's
+ *	an ACK in the queue, we piggyback the ACK sequence number
+ *	on the data packet.
+ *  3.	If the ACK WR is done sending, we get called from the
+ *	send queue completion handler, and check whether there's
+ *	another ACK pending (postponed because the WR was on the
+ *	queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ *  -	i_ack_flags, which keeps track of whether the ACK WR
+ *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ *  -	i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+/*
+ * Send an ACK-only frame if one has been requested, the ACK work request
+ * is not already in flight and a send credit can be grabbed.  In every
+ * other case the function returns without sending.
+ */
+void rds_ib_attempt_ack(struct rds_ib_connection *ic)
+{
+	unsigned int adv_credits;
+
+	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		return;
+
+	/* Claim the single ACK WR; back off if it is already in use. */
+	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+		rds_ib_stats_inc(s_ib_ack_send_delayed);
+		return;
+	}
+
+	/* Can we get a send credit? */
+	if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+		rds_ib_stats_inc(s_ib_tx_throttle);
+		/* Release the WR again; IB_ACK_REQUESTED stays set. */
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		return;
+	}
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	rds_ib_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ *
+ * The ACK work request is marked free again and, in case another ACK was
+ * requested in the meantime, a new attempt to send one is made.
+ */
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
+{
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	rds_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ *
+ * Returns the current ack sequence number.  A pending ACK request is
+ * cleared, and counted as piggybacked if one was pending.
+ */
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
+{
+	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		rds_ib_stats_inc(s_ib_ack_send_piggybacked);
+	return rds_ib_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
+ * them.  But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient.  By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+/*
+ * Copy a received congestion bitmap from the frags of 'ibinc' into the
+ * pages of conn->c_fcong, 64 bits at a time, and report the bits that the
+ * update cleared via rds_cong_map_updated().
+ *
+ * Messages whose h_len is not RDS_CONG_MAP_BYTES are ignored.
+ */
+static void rds_ib_cong_recv(struct rds_connection *conn,
+			      struct rds_ib_incoming *ibinc)
+{
+	struct rds_cong_map *map;
+	unsigned int map_off;
+	unsigned int map_page;
+	struct rds_page_frag *frag;
+	unsigned long frag_off;
+	unsigned long to_copy;
+	unsigned long copied;
+	uint64_t uncongested = 0;
+	void *addr;
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map = conn->c_fcong;
+	map_page = 0;
+	map_off = 0;
+
+	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+	frag_off = 0;
+
+	copied = 0;
+
+	while (copied < RDS_CONG_MAP_BYTES) {
+		uint64_t *src, *dst;
+		unsigned int k;
+
+		/* Copy up to the end of the frag or map page, whichever is nearer. */
+		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
+
+		src = addr + frag_off;
+		dst = (void *)map->m_page_addrs[map_page] + map_off;
+		for (k = 0; k < to_copy; k += 8) {
+			/* Record ports that became uncongested, ie
+			 * bits that are set in the current map (*dst)
+			 * but clear in the incoming data (*src). */
+			uncongested |= ~(*src) & *dst;
+			*dst++ = *src++;
+		}
+		kunmap_atomic(addr, KM_SOFTIRQ0);
+
+		copied += to_copy;
+
+		map_off += to_copy;
+		if (map_off == PAGE_SIZE) {
+			map_off = 0;
+			map_page++;
+		}
+
+		frag_off += to_copy;
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+	}
+
+	/* the congestion map is in little endian order */
+	uncongested = le64_to_cpu(uncongested);
+
+	rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+/*
+ * ACK information gathered by rds_ib_process_recv() while the receive CQ
+ * is polled; rds_ib_recv_tasklet_fn() acts on it once polling is done.
+ */
+struct rds_ib_ack_state {
+	/* sequence of the last completed message; passed to rds_ib_set_ack() */
+	u64		ack_next;
+	/* h_ack of the last valid header; passed to rds_send_drop_acked() */
+	u64		ack_recv;
+	/* a completed message carried RDS_FLAG_ACK_REQUIRED */
+	unsigned int	ack_required:1;
+	/* ack_next has been set */
+	unsigned int	ack_next_valid:1;
+	/* ack_recv has been set */
+	unsigned int	ack_recv_valid:1;
+};
+
+/*
+ * Handle one successfully completed receive work request.
+ *
+ * The header in the entry's i_recv_hdrs slot is validated, its ACK and
+ * credit fields are processed, and the frag is either recycled (ACK-only
+ * packet) or appended to the inc being assembled on the connection.  When
+ * the last fragment of a message arrives, the message is handed to
+ * rds_ib_cong_recv() or rds_recv_incoming() and 'state' is updated.
+ *
+ * 'data_len' is the completion's byte length, header included.  Runt
+ * packets, bad checksums and mismatching fragment headers put the
+ * connection into error handling.
+ */
+static void rds_ib_process_recv(struct rds_connection *conn,
+				struct rds_ib_recv_work *recv, u32 data_len,
+				struct rds_ib_ack_state *state)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_incoming *ibinc = ic->i_ibinc;
+	struct rds_header *ihdr, *hdr;
+
+	/* XXX shut down the connection if port 0,0 are seen? */
+
+	rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
+		 data_len);
+
+	if (data_len < sizeof(struct rds_header)) {
+		rds_ib_conn_error(conn, "incoming message "
+		       "from %pI4 didn't include a "
+		       "header, disconnecting and "
+		       "reconnecting\n",
+		       &conn->c_faddr);
+		return;
+	}
+	data_len -= sizeof(struct rds_header);
+
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+	/* Validate the checksum. */
+	if (!rds_message_verify_checksum(ihdr)) {
+		rds_ib_conn_error(conn, "incoming message "
+		       "from %pI4 has corrupted header - "
+		       "forcing a reconnect\n",
+		       &conn->c_faddr);
+		rds_stats_inc(s_recv_drop_bad_checksum);
+		return;
+	}
+
+	/* Process the ACK sequence which comes with every packet */
+	state->ack_recv = be64_to_cpu(ihdr->h_ack);
+	state->ack_recv_valid = 1;
+
+	/* Process the credits update if there was one */
+	if (ihdr->h_credit)
+		rds_ib_send_add_credits(conn, ihdr->h_credit);
+
+	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+		/* This is an ACK-only packet. The fact that it gets
+		 * special treatment here is that historically, ACKs
+		 * were rather special beasts.
+		 */
+		rds_ib_stats_inc(s_ib_ack_received);
+
+		/*
+		 * Usually the frags make their way on to incs and are then freed as
+		 * the inc is freed.  We don't go that route, so we have to drop the
+		 * page ref ourselves.  We can't just leave the page on the recv
+		 * because that confuses the dma mapping of pages and each recv's use
+		 * of a partial page.
+		 *
+		 * FIXME: Fold this into the code path below.
+		 */
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
+		return;
+	}
+
+	/*
+	 * If we don't already have an inc on the connection then this
+	 * fragment has a header and starts a message.. copy its header
+	 * into the inc and save the inc so we can hang upcoming fragments
+	 * off its list.
+	 */
+	if (!ibinc) {
+		ibinc = recv->r_ibinc;
+		recv->r_ibinc = NULL;
+		ic->i_ibinc = ibinc;
+
+		hdr = &ibinc->ii_inc.i_hdr;
+		memcpy(hdr, ihdr, sizeof(*hdr));
+		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
+			 ic->i_recv_data_rem, hdr->h_flags);
+	} else {
+		hdr = &ibinc->ii_inc.i_hdr;
+		/* We can't just use memcmp here; fragments of a
+		 * single message may carry different ACKs */
+		if (hdr->h_sequence != ihdr->h_sequence ||
+		    hdr->h_len != ihdr->h_len ||
+		    hdr->h_sport != ihdr->h_sport ||
+		    hdr->h_dport != ihdr->h_dport) {
+			rds_ib_conn_error(conn,
+				"fragment header mismatch; forcing reconnect\n");
+			return;
+		}
+	}
+
+	/* The frag now belongs to the inc, no longer to the ring entry. */
+	list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
+	recv->r_frag = NULL;
+
+	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+	else {
+		/* That was the last fragment: the message is complete. */
+		ic->i_recv_data_rem = 0;
+		ic->i_ibinc = NULL;
+
+		if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+			rds_ib_cong_recv(conn, ibinc);
+		else {
+			rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+					  &ibinc->ii_inc, GFP_ATOMIC,
+					  KM_SOFTIRQ0);
+			state->ack_next = be64_to_cpu(hdr->h_sequence);
+			state->ack_next_valid = 1;
+		}
+
+		/* Evaluate the ACK_REQUIRED flag *after* we received
+		 * the complete frame, and after bumping the next_rx
+		 * sequence. */
+		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+			rds_stats_inc(s_recv_ack_required);
+			state->ack_required = 1;
+		}
+
+		/* Drop the reference held on behalf of the connection. */
+		rds_inc_put(&ibinc->ii_inc);
+	}
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring.  Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+/*
+ * Completion callback of the receive CQ.  'context' is the rds_connection.
+ * No completions are processed here; the work is deferred to the
+ * connection's receive tasklet.
+ */
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p cq %p\n", conn, cq);
+
+	rds_ib_stats_inc(s_ib_rx_cq_call);
+
+	tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+/*
+ * Drain the receive CQ one completion at a time.  Each completion is
+ * matched to the oldest entry of the receive ring, whose frag is unmapped
+ * before the completion is processed; afterwards the ring entry is freed.
+ * ACK information is accumulated in 'state'.
+ */
+static inline void rds_poll_cq(struct rds_ib_connection *ic,
+			       struct rds_ib_ack_state *state)
+{
+	struct rds_connection *conn = ic->conn;
+	struct ib_wc wc;
+	struct rds_ib_recv_work *recv;
+
+	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_ib_stats_inc(s_ib_rx_cq_event);
+
+		/* The entry is taken from the ring's oldest slot, not from wc.wr_id. */
+		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+
+		/*
+		 * Also process recvs in connecting state because it is possible
+		 * to get a recv completion _before_ the rdmacm ESTABLISHED
+		 * event is processed.
+		 */
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
+			/* We expect errors as the qp is drained during shutdown */
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
+		}
+
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry.  The refilling path can
+		 * leak if we don't.
+		 */
+		rds_ib_ring_free(&ic->i_recv_ring, 1);
+	}
+}
+
+/*
+ * Receive tasklet; 'data' is the rds_ib_connection.  Polls the receive
+ * CQ, re-arms it and polls once more, then acts on the gathered ACK state
+ * and refills the receive ring if it is running low.
+ */
+void rds_ib_recv_tasklet_fn(unsigned long data)
+{
+	struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
+	struct rds_connection *conn = ic->conn;
+	struct rds_ib_ack_state state = { 0, };
+
+	/* Poll again after re-arming to pick up late completions. */
+	rds_poll_cq(ic, &state);
+	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+	rds_poll_cq(ic, &state);
+
+	if (state.ack_next_valid)
+		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+	/* Only act on ACKs from the peer that move i_ack_recv forward. */
+	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+		rds_send_drop_acked(conn, state.ack_recv, NULL);
+		ic->i_ack_recv = state.ack_recv;
+	}
+	if (rds_conn_up(conn))
+		rds_ib_attempt_ack(ic);
+
+	/* If we ever end up with a really empty receive ring, we're
+	 * in deep trouble, as the sender will definitely see RNR
+	 * timeouts. */
+	if (rds_ib_ring_empty(&ic->i_recv_ring))
+		rds_ib_stats_inc(s_ib_rx_ring_empty);
+
+	if (rds_ib_ring_low(&ic->i_recv_ring))
+		rds_ib_recv_refill(conn, 0);
+}
+
+/*
+ * Try to send a pending ACK if the connection is up.  Always returns 0.
+ */
+int rds_ib_recv(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p\n", conn);
+	if (rds_conn_up(conn))
+		rds_ib_attempt_ack(ic);
+
+	return 0;
+}
+
+/*
+ * Set the default receive allocation limit and create the slab caches for
+ * incs and frags.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created; in
+ * the latter case no cache is left behind.
+ */
+int rds_ib_recv_init(void)
+{
+	struct sysinfo si;
+
+	/* Default to 30% of all available RAM for recv memory */
+	si_meminfo(&si);
+	rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
+					sizeof(struct rds_ib_incoming),
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_incoming_slab)
+		return -ENOMEM;
+
+	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
+					sizeof(struct rds_page_frag),
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_frag_slab) {
+		/* Undo the first cache so that nothing is left behind. */
+		kmem_cache_destroy(rds_ib_incoming_slab);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Destroy the inc and frag slab caches created by rds_ib_recv_init(). */
+void rds_ib_recv_exit(void)
+{
+	kmem_cache_destroy(rds_ib_incoming_slab);
+	kmem_cache_destroy(rds_ib_frag_slab);
+}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 00000000..ff97e8ed
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) in u32 arithmetic.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/* Woken when a ring drains completely or when the signaled-send count
+ * reaches zero.  Presumably only waited on during shutdown; the waiters
+ * are not in this file -- TODO confirm. */
+DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
+
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
+{
+	memset(ring, 0, sizeof(*ring));	/* all counters and pointers start at 0 */
+	ring->w_nr = nr;
+	rdsdebug("ring %p nr %u\n", ring, nr);
+}
+
+static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
+{
+	/* Both counters run freely; the u32 subtraction absorbs wrap-around.
+	 * This assumes that atomic_t has at least as many bits as u32 */
+	u32 freed = (u32) atomic_read(&ring->w_free_ctr);
+	u32 used = ring->w_alloc_ctr - freed;
+
+	BUG_ON(used > ring->w_nr);
+	return used;
+}
+
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
+{
+	/* Only the connection setup code calls this, prior to creating
+	 * the QP, so no ring entry may be outstanding at this point. */
+	BUG_ON(__rds_ib_ring_used(ring) != 0);
+	ring->w_nr = nr;
+}
+
+static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+	return !__rds_ib_ring_used(ring);	/* 1 when no entry is in use */
+}
+
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
+{
+	u32 avail = ring->w_nr - __rds_ib_ring_used(ring);
+	u32 granted;
+
+	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+		 ring->w_alloc_ptr, avail);
+
+	/* nothing requested or nothing left: *pos stays untouched */
+	if (!val || !avail)
+		return 0;
+
+	/* grant as much as fits, starting at the current alloc pointer */
+	granted = min(val, avail);
+	*pos = ring->w_alloc_ptr;
+	ring->w_alloc_ptr = (ring->w_alloc_ptr + granted) % ring->w_nr;
+	ring->w_alloc_ctr += granted;
+	return granted;
+}
+
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
+{
+	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+	atomic_add(val, &ring->w_free_ctr);
+
+	/* wake a sleeper only once the ring has drained completely */
+	if (__rds_ib_ring_empty(ring) && waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+}
+
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
+{
+	ring->w_alloc_ptr = (ring->w_alloc_ptr + ring->w_nr - val) % ring->w_nr; /* + w_nr: no u32 underflow */
+	ring->w_alloc_ctr -= val;	/* free-running counter, wrap is fine */
+}
+
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+	return __rds_ib_ring_used(ring) == 0;	/* same test as __rds_ib_ring_empty() */
+}
+
+int rds_ib_ring_low(struct rds_ib_work_ring *ring)
+{
+	return __rds_ib_ring_used(ring) <= ring->w_nr / 2;	/* at most half in use */
+}
+
+/*
+ * Returns the index of the oldest allocated ring entry, which is the next
+ * one to be freed.  This can't be called if there are none allocated.
+ */
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
+{
+	return ring->w_free_ptr;
+}
+
+/*
+ * Returns the number of completed work requests: every ring entry from
+ * @oldest up to and including @wr_id, allowing for the range to wrap
+ * around the end of the ring.
+ */
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
+{
+	u32 span = wr_id - oldest + 1;
+
+	/* wr_id sits before oldest: the range wrapped past the ring end */
+	if (wr_id < oldest)
+		span += ring->w_nr;
+
+	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, span,
+		 wr_id, oldest);
+	return span;
+}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 00000000..7c4dce8f
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,1021 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static char *rds_ib_wc_status_strings[] = {	/* indexed by the IB_WC_* status value */
+#define RDS_IB_WC_STATUS_STR(foo) \
+		[IB_WC_##foo] = __stringify(IB_WC_##foo)
+	RDS_IB_WC_STATUS_STR(SUCCESS),
+	RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+	RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+	RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+	RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+	RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+	RDS_IB_WC_STATUS_STR(FATAL_ERR),
+	RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+	RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)	/* status -> name via rds_str_array() */
+{
+	return rds_str_array(rds_ib_wc_status_strings,
+			     ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
+
+/*
+ * Convert the IB-specific completion status to the RDS notification
+ * status and call the core completion handler @complete with it.
+ * A flushed WR (IB_WC_WR_FLUSH_ERR) returns without calling @complete,
+ * so no notification is generated for it.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+				 int wc_status,
+				 void (*complete)(struct rds_message *rm, int status))
+{
+	int notify_status;
+
+	/* flushed: stay silent */
+	if (wc_status == IB_WC_WR_FLUSH_ERR)
+		return;
+
+	if (wc_status == IB_WC_SUCCESS) {
+		notify_status = RDS_RDMA_SUCCESS;
+	} else if (wc_status == IB_WC_REM_ACCESS_ERR) {
+		/* remote access errors get their own status */
+		notify_status = RDS_RDMA_REMOTE_ERROR;
+	} else {
+		/* every other failure is reported generically */
+		notify_status = RDS_RDMA_OTHER_ERROR;
+	}
+
+	complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	if (!op->op_nents)	/* no scatterlist entries, nothing to unmap */
+		return;
+	ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, op->op_nents,
+			DMA_TO_DEVICE);
+}
+
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+				   struct rm_rdma_op *op,
+				   int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, rdma);
+
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, op->op_nents,
+				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
+
+	/* If the user asked for a completion notification on this
+	 * message, there are three semantics we could implement:
+	 *  1.	Notify when we received the ACK on the RDS message
+	 *	that was queued with the RDMA. This provides reliable
+	 *	notification of RDMA status at the expense of a one-way
+	 *	packet delay.
+	 *  2.	Notify when the IB stack gives us the completion event for
+	 *	the RDMA operation.
+	 *  3.	Notify when the IB stack gives us the completion event for
+	 *	the accompanying RDS messages.
+	 * Here, we implement approach #3. To implement approach #2,
+	 * we would need to take an event for the rdma WR. To implement #1,
+	 * don't call rds_rdma_send_complete at all, and fall back to the
+	 * notify handling in the ACK processing code.
+	 *
+	 * Note: There's no need to explicitly sync any RDMA buffers using
+	 * ib_dma_sync_sg_for_cpu - unmapping the RDMA buffers on completion
+	 * of the RDMA operation already takes care of syncing.
+	 */
+	rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete);
+
+	if (op->op_write)
+		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+	else
+		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
+}
+
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+				     struct rm_atomic_op *op,
+				     int wc_status)
+{
+	struct rds_message *rm = container_of(op, struct rds_message, atomic);
+
+	/* release the single-entry mapping of the atomic recvbuf */
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+				DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
+	rds_ib_send_complete(rm, wc_status, rds_atomic_send_complete);
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+		rds_ib_stats_inc(s_ib_atomic_cswp);
+	else
+		rds_ib_stats_inc(s_ib_atomic_fadd);
+}
+
+/*
+ * Unmap the resources associated with a struct send_work, dispatching on
+ * the opcode we stored in the WR.  The atomic cases also list the MASKED
+ * opcodes, since those are what rds_ib_xmit_atomic() actually posts.
+ *
+ * Returns the owning rm for the event handler, or NULL if none was found.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+						struct rds_ib_send_work *send,
+						int wc_status)
+{
+	struct rds_message *rm = NULL;
+
+	/* In the error case, wc.opcode sometimes contains garbage */
+	switch (send->s_wr.opcode) {
+	case IB_WR_SEND:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, data);
+			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_READ:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, rdma);
+			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, atomic);
+			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+		}
+		break;
+	default:
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+			       __func__, send->s_wr.opcode);
+		break;
+	}
+
+	send->s_wr.opcode = 0xdead;	/* poison: rds_ib_send_clear_ring() skips these */
+	return rm;
+}
+
+void rds_ib_send_init_ring(struct rds_ib_connection *ic)
+{
+	u32 idx;
+
+	/* pre-build the parts of each send WR that can be set up front */
+	for (idx = 0; idx < ic->i_send_ring.w_nr; idx++) {
+		struct rds_ib_send_work *send = &ic->i_sends[idx];
+		struct ib_sge *hdr_sge = &send->s_sge[0];
+
+		send->s_op = NULL;
+		send->s_wr.wr_id = idx;
+		send->s_wr.sg_list = send->s_sge;
+		send->s_wr.ex.imm_data = 0;
+
+		/* sge 0 points at this slot's header in the DMA header array */
+		hdr_sge->addr = ic->i_send_hdrs_dma + (idx * sizeof(struct rds_header));
+		hdr_sge->length = sizeof(struct rds_header);
+		hdr_sge->lkey = ic->i_mr->lkey;
+		/* sge 1 is filled per send; only its lkey is fixed here */
+		send->s_sge[1].lkey = ic->i_mr->lkey;
+	}
+}
+
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
+{
+	struct rds_ib_send_work *send = ic->i_sends;
+	u32 left;
+
+	/* unmap every slot that still owns an op and is not poisoned (0xdead) */
+	for (left = ic->i_send_ring.w_nr; left; left--, send++)
+		if (send->s_op && send->s_wr.opcode != 0xdead)
+			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+}
+
+/*
+ * Drop @nr signaled sends; wake rds_ib_ring_empty_wait sleepers at zero.
+ * The only fast path caller always has a non-zero nr, so nr is not tested.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+	    waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);	/* more completions than posts */
+}
+
+/*
+ * Send CQ completion handler: re-arms the CQ, then reaps every completed
+ * send WR, releasing its op/rm and returning the slots to the send ring.
+ * The _oldest/_free ring operations used here race cleanly with the send
+ * path's alloc/unalloc, which never change which entry is freed next.
+ */
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_message *rm = NULL;
+	struct ib_wc wc;
+	struct rds_ib_send_work *send;
+	u32 completed;
+	u32 oldest;
+	u32 i = 0;
+	int ret;
+	int nr_sig = 0;
+
+	rdsdebug("cq %p conn %p\n", cq, conn);
+	rds_ib_stats_inc(s_ib_tx_cq_call);
+	ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (ret)
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_ib_stats_inc(s_ib_tx_cq_event);
+
+		if (wc.wr_id == RDS_IB_ACK_WR_ID) {
+			/* time_after() keeps the stall checks right across jiffies wrap */
+			if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+				rds_ib_stats_inc(s_ib_tx_stalled);
+			rds_ib_ack_send_complete(ic);
+			continue;
+		}
+
+		oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+
+		completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+		for (i = 0; i < completed; i++) {
+			send = &ic->i_sends[oldest];
+			if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+				nr_sig++;
+
+			rm = rds_ib_send_unmap_op(ic, send, wc.status);
+
+			if (time_after(jiffies, send->s_queued + HZ/2))
+				rds_ib_stats_inc(s_ib_tx_stalled);
+
+			/* rm is NULL when unmap_op did not recognise the opcode */
+			if (send->s_op && rm) {
+				/* wake anyone waiting for this message to get flushed out */
+				if (send->s_op == rm->m_final_op)
+					rds_message_unmapped(rm);
+				rds_message_put(rm);
+			}
+			send->s_op = NULL;
+
+			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+		}
+
+		rds_ib_ring_free(&ic->i_send_ring, completed);
+		rds_ib_sub_signaled(ic, nr_sig);
+		nr_sig = 0;
+
+		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+		    test_bit(0, &conn->c_map_queued))
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+		/* We expect errors as the qp is drained during shutdown */
+		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+			rds_ib_conn_error(conn, "send completion on %pI4 had status "
+					  "%u (%s), disconnecting and reconnecting\n",
+					  &conn->c_faddr, wc.status,
+					  rds_ib_wc_status_str(wc.status));
+		}
+	}
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ *  -	send credits: this tells us how many WRs we're allowed
+ *	to submit without overrunning the receiver's queue. For
+ *	each SEND WR we post, we decrement this by one.
+ *
+ *  -	posted credits: this tells us how many WRs we recently
+ *	posted to the receive queue. This value is transferred
+ *	to the peer as a "credit update" in a RDS header field.
+ *	Every time we transmit credits to the peer, we subtract
+ *	the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter.  Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
+			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+	unsigned int send_avail, post_avail, granted = 0, to_advertise;
+	long snapshot, updated;
+
+	*adv_credits = 0;
+	/* Without flow control every request is granted in full. */
+	if (!ic->i_flowctl)
+		return wanted;
+
+	/*
+	 * Lock-free update: recompute everything from a fresh snapshot of
+	 * i_credits until the cmpxchg below finds it unchanged.
+	 */
+	do {
+		to_advertise = 0;
+		snapshot = updated = atomic_read(&ic->i_credits);
+		post_avail = IB_GET_POST_CREDITS(snapshot);
+		send_avail = IB_GET_SEND_CREDITS(snapshot);
+
+		rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
+				wanted, send_avail, post_avail);
+
+		/* The last credit must be used to send a credit update, so
+		 * hold it back while there are no posted credits to send. */
+		if (send_avail && !post_avail)
+			send_avail--;
+
+		granted = wanted;
+		if (send_avail < wanted) {
+			struct rds_connection *conn = ic->i_cm_id->context;
+
+			/* Short on credits: mark the connection send-full
+			 * and hand out whatever is left. */
+			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+			granted = send_avail;
+		}
+		updated -= IB_SET_SEND_CREDITS(granted);
+
+		/* Posted credits are taken along with any granted send
+		 * credit, or unconditionally when need_posted is set. */
+		if (post_avail && (granted || need_posted)) {
+			to_advertise = min_t(unsigned int, post_avail, max_posted);
+			updated -= IB_SET_POST_CREDITS(to_advertise);
+		}
+	} while (atomic_cmpxchg(&ic->i_credits, snapshot, updated) != snapshot);
+
+	/* report how many posted credits were taken for advertising */
+	*adv_credits = to_advertise;
+	return granted;
+}
+
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (credits == 0)
+		return;
+
+	rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
+			credits,
+			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+	/* add the send credits first, then restart a sender stalled on SEND_FULL */
+	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	/* NOTE(review): this checks the amount just added, not the running total */
+	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+	rds_ib_stats_inc(s_ib_rx_credit_updates);
+}
+
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (posted == 0)
+		return;
+	/* add @posted to the posted-credits part of i_credits */
+	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+	/* Decide whether to send an update to the peer now.
+	 * If we sent a credit update for every single buffer we
+	 * post, we would end up with an ACK storm (ACK arrives,
+	 * consumes buffer, we refill the ring, send ACK to remote
+	 * advertising the newly posted buffer... ad infinitum).
+	 *
+	 * Performance pretty much depends on how often we send
+	 * credit updates - too frequent updates mean lots of ACKs;
+	 * too infrequent updates, and the peer runs out of credits
+	 * and has to throttle.  For the time being, requesting an ACK
+	 * once 16 posted credits have accumulated is the compromise.
+	 */
+	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+					     struct rds_ib_send_work *send,
+					     bool notify)
+{
+	/*
+	 * Delay signaling completions just enough to get the batching
+	 * benefits but not so much that we create dead time on the wire:
+	 * signal when the unsignaled budget is used up or @notify is set.
+	 */
+	if (ic->i_unsignaled_wrs-- == 0 || notify) {
+		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+		send->s_wr.send_flags |= IB_SEND_SIGNALED;
+		return 1;	/* this WR was marked signaled */
+	}
+	return 0;	/* send_flags left untouched */
+}
+
+/*
+ * This can be called multiple times for a given message.  The first time
+ * we see a message we map its scatterlist into the IB device so that the
+ * mapped addresses can go into the scatter gather entries of the IB work
+ * requests.  The scatterlist is translated into a series of work requests
+ * that fragment the message.  These complete in order, so ownership of the
+ * message passes to the completion handler once the final fragment is sent.
+ * Returns the number of bytes accounted as sent, or an error code.
+ *
+ * The RDS core only enters this function once per connection at a time
+ * (c_send_lock / RDS_IN_XMIT -- TODO confirm which), so the tx ring
+ * alloc/unalloc pairs don't get out of sync and confuse the ring.
+ */
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct rds_ib_send_work *send = NULL;
+	struct rds_ib_send_work *first;
+	struct rds_ib_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct scatterlist *scat;
+	u32 pos;
+	u32 i;
+	u32 work_alloc;
+	u32 credit_alloc = 0;
+	u32 posted;
+	u32 adv_credits = 0;
+	int send_flags = 0;
+	int bytes_sent = 0;
+	int ret;
+	int flow_controlled = 0;
+	int nr_sig = 0;
+
+	BUG_ON(off % RDS_FRAG_SIZE);
+	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+	/* Do not send cong updates to IB loopback */
+	if (conn->c_loopback
+	    && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		scat = &rm->data.op_sg[sg];
+		ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+		ret = min_t(int, ret, scat->length - conn->c_xmit_data_off);
+		return ret;
+	}
+
+	/* FIXME we may overallocate here */
+	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+		i = 1;
+	else
+		i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc == 0) {
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* with flow control, trim the WR allocation to the send credits we get */
+	if (ic->i_flowctl) {
+		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+		adv_credits += posted;
+		if (credit_alloc < work_alloc) {
+			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+			work_alloc = credit_alloc;
+			flow_controlled = 1;
+		}
+		if (work_alloc == 0) {
+			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+			rds_ib_stats_inc(s_ib_tx_throttle);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* map the message the first time we see it */
+	if (!ic->i_data_op) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
+				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+				ret = -ENOMEM; /* XXX ? NOTE(review): grabbed credits are not returned; adv_credits != 0 hits BUG_ON at out */
+				goto out;
+			}
+		} else {
+			rm->data.op_count = 0;
+		}
+
+		rds_message_addref(rm);
+		ic->i_data_op = &rm->data;
+
+		/* Finalize the header */
+		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+		/* If it has a RDMA op, tell the peer we did it. This is
+		 * used by the peer to release use-once RDMA MRs. */
+		if (rm->rdma.op_active) {
+			struct rds_ext_header_rdma ext_hdr;
+
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+			rds_message_add_extension(&rm->m_inc.i_hdr,
+					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+		}
+		if (rm->m_rdma_cookie) {
+			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+					rds_rdma_cookie_key(rm->m_rdma_cookie),
+					rds_rdma_cookie_offset(rm->m_rdma_cookie));
+		}
+
+		/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+		 * we should not do this unless we have a chance of at least
+		 * sticking the header into the send ring. Which is why we
+		 * should call rds_ib_ring_alloc first. */
+		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
+		rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+		/*
+		 * Update adv_credits since we reset the ACK_REQUIRED bit.
+		 */
+		if (ic->i_flowctl) {
+			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+			adv_credits += posted;
+			BUG_ON(adv_credits > 255);
+		}
+	}
+
+	/* Sometimes you want to put a fence between an RDMA
+	 * READ and the following SEND.
+	 * We could either do this all the time
+	 * or when requested by the user. Right now, we let
+	 * the application choose.
+	 */
+	if (rm->rdma.op_active && rm->rdma.op_fence)
+		send_flags = IB_SEND_FENCE;
+
+	/* Each frag gets a header. Msgs may be 0 bytes */
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &ic->i_data_op->op_sg[sg];
+	i = 0;
+	do {
+		unsigned int len = 0;
+
+		/* Set up the header */
+		send->s_wr.send_flags = send_flags;
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.num_sge = 1;
+		send->s_wr.next = NULL;
+		send->s_queued = jiffies;
+		send->s_op = NULL;
+
+		send->s_sge[0].addr = ic->i_send_hdrs_dma
+			+ (pos * sizeof(struct rds_header));
+		send->s_sge[0].length = sizeof(struct rds_header);
+
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+		/* Set up the data, if present */
+		if (i < work_alloc
+		    && scat != &rm->data.op_sg[rm->data.op_count]) {
+			len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+			send->s_wr.num_sge = 2;
+
+			send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+			send->s_sge[1].length = len;
+
+			bytes_sent += len;
+			off += len;
+			if (off == ib_sg_dma_len(dev, scat)) {
+				scat++;
+				off = 0;
+			}
+		}
+
+		rds_ib_set_wr_signal_state(ic, send, 0);
+
+		/*
+		 * Always signal the last one if we're stopping due to flow control.
+		 */
+		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+			nr_sig++;
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		if (ic->i_flowctl && adv_credits) {
+			struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+			/* add credit and redo the header checksum */
+			hdr->h_credit = adv_credits;
+			rds_message_make_checksum(hdr);
+			adv_credits = 0;
+			rds_ib_stats_inc(s_ib_tx_credit_updates);
+		}
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+		prev = send;
+
+		pos = (pos + 1) % ic->i_send_ring.w_nr;
+		send = &ic->i_sends[pos];
+		i++;
+
+	} while (i < work_alloc
+		 && scat != &rm->data.op_sg[rm->data.op_count]);
+
+	/* Account the RDS header in the number of bytes we sent, but just once.
+	 * The caller has no concept of fragmentation. */
+	if (hdr_off == 0)
+		bytes_sent += sizeof(struct rds_header);
+
+	/* if we finished the message then send completion owns it */
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
+		prev->s_op = ic->i_data_op;
+		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+		ic->i_data_op = NULL;
+	}
+
+	/* Put back wrs & credits we didn't use */
+	if (i < work_alloc) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+	if (ic->i_flowctl && i < credit_alloc)
+		rds_ib_send_add_credits(conn, credit_alloc - i);
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	/* XXX need to worry about failed_wr and partial sends. */
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		if (prev->s_op) {
+			ic->i_data_op = prev->s_op;
+			prev->s_op = NULL;
+		}
+
+		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+		goto out;
+	}
+
+	ret = bytes_sent;
+out:
+	BUG_ON(adv_credits);	/* grabbed posted credits must have gone into a header */
+	return ret;
+}
+
+/*
+ * Issue atomic operation: a simplified version of the rdma case that always
+ * maps 1 SG of 8 bytes for the value returned by the atomic operation.
+ * Returns 0 once the WR is posted, -ENOMEM or the ib_post_send() error if not.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct ib_send_wr *failed_wr;
+	u32 pos;
+	u32 work_alloc;
+	int ret;
+	int nr_sig = 0;
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* address of send request in ring */
+	send = &ic->i_sends[pos];
+	send->s_queued = jiffies;
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+		send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+		send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+		send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+	} else { /* FADD */
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+		send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+		send->s_wr.wr.atomic.swap = 0;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+		send->s_wr.wr.atomic.swap_mask = 0;
+	}
+	/* slots are recycled: a stale IB_SEND_SIGNALED would escape nr_sig */
+	send->s_wr.send_flags = 0;
+	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+	send->s_wr.num_sge = 1;
+	send->s_wr.next = NULL;
+	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+	send->s_wr.wr.atomic.rkey = op->op_rkey;
+	send->s_op = op;
+	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+	/* map 8 byte retval buffer to the device */
+	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+	if (ret != 1) {
+		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+		ret = -ENOMEM; /* XXX ? */
+		goto out_unwind;
+	}
+
+	/* Convert our struct scatterlist to struct ib_sge */
+	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].lkey = ic->i_mr->lkey;
+
+	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+		 send->s_sge[0].addr, send->s_sge[0].length);
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+		 send, &send->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_sub_signaled(ic, nr_sig);
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+		goto out_unwind;
+	}
+	return 0;
+
+out_unwind:
+	/* The WR was never posted, so no completion will clean up for us:
+	 * drop the op ownership and rm reference taken above, free the slot. */
+	send->s_op = NULL;
+	rds_message_put(container_of(op, struct rds_message, atomic));
+	rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+out:
+	return ret;
+}
+
+/*
+ * Post @op as a chain of RDMA work requests; returns 0 or a negative errno.
+ */
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct rds_ib_send_work *first;
+	struct rds_ib_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct scatterlist *scat;
+	unsigned long len;
+	u64 remote_addr = op->op_remote_addr;
+	u32 max_sge = ic->rds_ibdev->max_sge;
+	u32 pos;
+	u32 work_alloc;
+	u32 i;
+	u32 j;
+	int sent;
+	int ret;
+	int num_sge;
+	int nr_sig = 0;
+
+	/* map the op the first time we see it */
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
+			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+			ret = -ENOMEM; /* XXX ? */
+			goto out;
+		}
+		op->op_mapped = 1;
+	}
+
+	/*
+	 * Instead of knowing how to return a partial rdma read/write we insist that there
+	 * be enough work requests to send the entire message.
+	 */
+	i = ceil(op->op_count, max_sge);	/* work requests needed */
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc != i) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &op->op_sg[0];
+	sent = 0;
+	num_sge = op->op_count;
+
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+		send->s_wr.send_flags = 0;
+		send->s_queued = jiffies;
+		send->s_op = NULL;
+		nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+
+		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+		send->s_wr.wr.rdma.remote_addr = remote_addr;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
+
+		if (num_sge > max_sge) {
+			send->s_wr.num_sge = max_sge;
+			num_sge -= max_sge;
+		} else {
+			send->s_wr.num_sge = num_sge;
+		}
+
+		send->s_wr.next = NULL;
+		/* link this wr behind the previous one in the chain */
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+			send->s_sge[j].addr =
+				 ib_sg_dma_address(ic->i_cm_id->device, scat);
+			send->s_sge[j].length = len;
+			send->s_sge[j].lkey = ic->i_mr->lkey;
+
+			sent += len;
+			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+
+			remote_addr += len;
+			scat++;
+		}
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		prev = send;
+		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+			send = ic->i_sends;	/* wrap to the start of the ring */
+	}
+
+	/* give a reference to the last op */
+	if (scat == &op->op_sg[op->op_count]) {
+		prev->s_op = op;
+		rds_message_addref(container_of(op, struct rds_message, rdma));
+	}
+
+	if (i < work_alloc) {	/* hand back ring entries the loop did not use */
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		goto out;
+	}
+
+	if (unlikely(failed_wr != &first->s_wr)) {
+		printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+		BUG_ON(failed_wr != &first->s_wr);
+	}
+
+out:
+	return ret;
+}
+
+void rds_ib_xmit_complete(struct rds_connection *conn)
+{
+	/*
+	 * Flow control may have kept an earlier ACK or window update
+	 * from being sent; give it another try now.
+	 */
+	rds_ib_attempt_ack(conn->c_transport_data);
+}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 00000000..2d5965d6
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "ib.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+
+static const char *const rds_ib_stat_names[] = {	/* presumably in struct rds_ib_statistics field order -- confirm */
+	"ib_connect_raced",
+	"ib_listen_closed_stale",
+	"ib_tx_cq_call",
+	"ib_tx_cq_event",
+	"ib_tx_ring_full",
+	"ib_tx_throttle",
+	"ib_tx_sg_mapping_failure",
+	"ib_tx_stalled",
+	"ib_tx_credit_updates",
+	"ib_rx_cq_call",
+	"ib_rx_cq_event",
+	"ib_rx_ring_empty",
+	"ib_rx_refill_from_cq",
+	"ib_rx_refill_from_thread",
+	"ib_rx_alloc_limit",
+	"ib_rx_credit_updates",
+	"ib_ack_sent",
+	"ib_ack_send_failure",
+	"ib_ack_send_delayed",
+	"ib_ack_send_piggybacked",
+	"ib_ack_received",
+	"ib_rdma_mr_alloc",
+	"ib_rdma_mr_free",
+	"ib_rdma_mr_used",
+	"ib_rdma_mr_pool_flush",
+	"ib_rdma_mr_pool_wait",
+	"ib_rdma_mr_pool_depleted",
+	"ib_atomic_cswp",
+	"ib_atomic_fadd",
+};
+
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail)
+{
+	const size_t nr_names = ARRAY_SIZE(rds_ib_stat_names);
+	struct rds_ib_statistics stats = {0, };
+	uint64_t *total = (uint64_t *)&stats;
+	size_t w;
+	int cpu;
+
+	/* not enough room: copy nothing, just return the entry count */
+	if (avail < nr_names)
+		return nr_names;
+
+	/* treat each CPU's counters as a flat u64 array and add them up */
+	for_each_online_cpu(cpu) {
+		uint64_t *percpu = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
+
+		for (w = 0; w < sizeof(stats) / sizeof(uint64_t); w++)
+			total[w] += percpu[w];
+	}
+
+	rds_stats_info_copy(iter, total, rds_ib_stat_names, nr_names);
+	return nr_names;
+}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 00000000..1253b006
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "ib.h"
+
+static struct ctl_table_header *rds_ib_sysctl_hdr;	/* set by rds_ib_sysctl_init() */
+
+unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
+unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
+unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;	/* 128 MiB in RDS_FRAG_SIZE units */
+static unsigned long rds_ib_sysctl_max_wr_min = 1;	/* lower bound for max_send_wr/max_recv_wr */
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;	/* upper bound for max_send_wr/max_recv_wr */
+
+unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;	/* bounds for max_unsignaled_wr */
+static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
+
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
+
+static ctl_table rds_ib_sysctl_table[] = {	/* registered under rds_ib_sysctl_path */
+	{
+		.procname       = "max_send_wr",
+		.data		= &rds_ib_sysctl_max_send_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_ib_sysctl_max_wr_min,	/* minimum */
+		.extra2		= &rds_ib_sysctl_max_wr_max,	/* maximum */
+	},
+	{
+		.procname       = "max_recv_wr",
+		.data		= &rds_ib_sysctl_max_recv_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_ib_sysctl_max_wr_min,
+		.extra2		= &rds_ib_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_unsignaled_wr",
+		.data		= &rds_ib_sysctl_max_unsig_wrs,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_ib_sysctl_max_unsig_wr_min,
+		.extra2		= &rds_ib_sysctl_max_unsig_wr_max,
+	},
+	{
+		.procname       = "max_recv_allocation",
+		.data		= &rds_ib_sysctl_max_recv_allocation,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,	/* no extra1/extra2 bounds supplied */
+	},
+	{
+		.procname	= "flow_control",
+		.data		= &rds_ib_sysctl_flow_control,
+		.maxlen		= sizeof(rds_ib_sysctl_flow_control),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }	/* terminating empty entry */
+};
+
+static struct ctl_path rds_ib_sysctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "rds", },
+	{ .procname = "ib", },
+	{ }
+};
+
+void rds_ib_sysctl_exit(void)
+{
+	if (rds_ib_sysctl_hdr)	/* only non-NULL once rds_ib_sysctl_init() registered the table */
+		unregister_sysctl_table(rds_ib_sysctl_hdr);
+}
+
+int rds_ib_sysctl_init(void)
+{
+	/* keep the header so rds_ib_sysctl_exit() can unregister the table */
+	rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path,
+						  rds_ib_sysctl_table);
+	return rds_ib_sysctl_hdr ? 0 : -ENOMEM;
+}
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 00000000..4fdf1b6e
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time.  The structs are only copied if the user-specified
+ * buffer is big enough.  The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
+
+struct rds_info_iterator {
+	struct page **pages;	/* current destination page; advanced when a page fills */
+	void *addr;		/* kmap_atomic() mapping of *pages, or NULL if unmapped */
+	unsigned long offset;	/* byte offset of the next write within the page */
+};
+
+static DEFINE_SPINLOCK(rds_info_lock);
+static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
+
+void rds_info_register_func(int optname, rds_info_func func)
+{
+	rds_info_func *slot;
+
+	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+	slot = &rds_info_funcs[optname - RDS_INFO_FIRST];
+	spin_lock(&rds_info_lock);
+	BUG_ON(*slot);		/* each option takes exactly one handler */
+	*slot = func;
+	spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_register_func);
+
+void rds_info_deregister_func(int optname, rds_info_func func)
+{
+	rds_info_func *slot;
+
+	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+	slot = &rds_info_funcs[optname - RDS_INFO_FIRST];
+	spin_lock(&rds_info_lock);
+	BUG_ON(*slot != func);	/* only the registered handler may be removed */
+	*slot = NULL;
+	spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_deregister_func);
+
+/*
+ * The atomic kmap is expensive, so it is normally kept across several
+ * rds_info_copy() calls.  Drop it here before doing anything that may
+ * block, and when the iterator is torn down.  No-op if nothing is mapped.
+ */
+void rds_info_iter_unmap(struct rds_info_iterator *iter)
+{
+	if (!iter->addr)
+		return;
+	kunmap_atomic(iter->addr, KM_USER0);
+	iter->addr = NULL;
+}
+
+/*
+ * Copy @bytes from @data into the pinned user pages behind @iter, mapping
+ * each page on demand.  get_user_pages() already did flush_dcache_page().
+ */
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+		   unsigned long bytes)
+{
+	unsigned long chunk;
+
+	for (; bytes; bytes -= chunk, data += chunk) {
+		if (iter->addr == NULL)
+			iter->addr = kmap_atomic(*iter->pages, KM_USER0);
+
+		/* never write past the end of the current page */
+		chunk = min(bytes, PAGE_SIZE - iter->offset);
+
+		rdsdebug("page %p addr %p offset %lu this %lu data %p "
+			  "bytes %lu\n", *iter->pages, iter->addr,
+			  iter->offset, chunk, data, bytes);
+
+		memcpy(iter->addr + iter->offset, data, chunk);
+		iter->offset += chunk;
+		if (iter->offset != PAGE_SIZE)
+			continue;
+
+		/* page is full: release the mapping and step to the next page */
+		kunmap_atomic(iter->addr, KM_USER0);
+		iter->addr = NULL;
+		iter->offset = 0;
+		iter->pages++;
+	}
+}
+EXPORT_SYMBOL_GPL(rds_info_copy);
+
+/*
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace.  @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure, particularly -ENOSPC
+ * if the given userspace buffer was not large enough to fit the snapshot.
+ * On success it returns the positive number of bytes of each array element
+ * in the snapshot.
+ */
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+			int __user *optlen)
+{
+	struct rds_info_iterator iter;
+	struct rds_info_lengths lens;
+	unsigned long nr_pages = 0;
+	unsigned long start;
+	unsigned long i;
+	rds_info_func func;
+	struct page **pages = NULL;
+	int ret;
+	int len;
+	int total;
+
+	if (get_user(len, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* reject negative or wrapping lengths; keep len + PAGE_SIZE - 1 within int */
+	start = (unsigned long)optval;
+	if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* a 0 len call is just trying to probe its length */
+	if (len == 0)
+		goto call_func;
+
+	nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
+			>> PAGE_SHIFT;
+
+	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = get_user_pages_fast(start, nr_pages, 1, pages);	/* 1: pages will be written */
+	if (ret != nr_pages) {
+		if (ret > 0)
+			nr_pages = ret;	/* release only what was pinned */
+		else
+			nr_pages = 0;
+		ret = -EAGAIN; /* XXX ? */
+		goto out;
+	}
+
+	rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
+
+call_func:
+	func = rds_info_funcs[optname - RDS_INFO_FIRST];
+	if (!func) {
+		ret = -ENOPROTOOPT;
+		goto out;
+	}
+
+	iter.pages = pages;
+	iter.addr = NULL;
+	iter.offset = start & (PAGE_SIZE - 1);
+
+	func(sock, len, &iter, &lens);
+	BUG_ON(lens.each == 0);
+
+	total = lens.nr * lens.each;
+
+	rds_info_iter_unmap(&iter);
+
+	if (total > len) {
+		len = total;
+		ret = -ENOSPC;
+	} else {
+		len = total;
+		ret = lens.each;
+	}
+
+	if (put_user(len, optlen))
+		ret = -EFAULT;
+
+out:
+	for (i = 0; pages && i < nr_pages; i++)	/* nr_pages is 0 on the probe path */
+		put_page(pages[i]);
+	kfree(pages);
+
+	return ret;
+}
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 00000000..b6c052ca
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_INFO_H
+#define _RDS_INFO_H
+
+struct rds_info_lengths {
+	unsigned int	nr;	/* number of elements in the info source */
+	unsigned int	each;	/* size of one element, in bytes */
+};
+
+struct rds_info_iterator;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source.  If the snapshot fits in @len then it
+ * should be copied using @iter.  The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
+			      struct rds_info_iterator *iter,
+			      struct rds_info_lengths *lens);
+
+void rds_info_register_func(int optname, rds_info_func func);
+void rds_info_deregister_func(int optname, rds_info_func func);
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+			int __user *optlen);
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+		   unsigned long bytes);
+void rds_info_iter_unmap(struct rds_info_iterator *iter);
+
+
+#endif
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 00000000..f7474844
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "rds.h"
+#include "iw.h"
+
+unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
+unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fastreg_pool_size, int, 0444);
+MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
+module_param(fastreg_message_size, int, 0444);
+MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
+
+struct list_head rds_iw_devices;
+
+/* NOTE: if also grabbing iwdev lock, grab this first */
+DEFINE_SPINLOCK(iw_nodev_conns_lock);
+LIST_HEAD(iw_nodev_conns);
+
+/* ib_client "add" hook: build the per-device RDS state for an iWARP RNIC. */
+static void rds_iw_add_one(struct ib_device *device)
+{
+	struct rds_iw_device *rds_iwdev;
+	struct ib_device_attr *dev_attr;
+
+	/* Only handle iwarp devices */
+	if (device->node_type != RDMA_NODE_RNIC)
+		return;
+
+	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+	if (!dev_attr)
+		return;
+
+	if (ib_query_device(device, dev_attr)) {
+		rdsdebug("Query device failed for %s\n", device->name);
+		goto free_attr;
+	}
+
+	rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
+	if (!rds_iwdev)
+		goto free_attr;
+
+	spin_lock_init(&rds_iwdev->spinlock);
+	rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+	rds_iwdev->max_wrs = dev_attr->max_qp_wr;
+	rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+
+	rds_iwdev->dev = device;
+	rds_iwdev->pd = ib_alloc_pd(device);
+	if (IS_ERR(rds_iwdev->pd))
+		goto free_dev;
+	/* a DMA MR is only set up when the device lacks a local DMA lkey */
+	if (!rds_iwdev->dma_local_lkey) {
+		rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+					IB_ACCESS_REMOTE_READ |
+					IB_ACCESS_REMOTE_WRITE |
+					IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(rds_iwdev->mr))
+			goto err_pd;
+	} else
+		rds_iwdev->mr = NULL;
+
+	rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
+	if (IS_ERR(rds_iwdev->mr_pool)) {
+		rds_iwdev->mr_pool = NULL;
+		goto err_mr;
+	}
+
+	INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
+	INIT_LIST_HEAD(&rds_iwdev->conn_list);
+	list_add_tail(&rds_iwdev->list, &rds_iw_devices);
+
+	ib_set_client_data(device, &rds_iw_client, rds_iwdev);
+	/* success: skip the unwind labels, only dev_attr is freed */
+	goto free_attr;
+
+err_mr:
+	if (rds_iwdev->mr)
+		ib_dereg_mr(rds_iwdev->mr);
+err_pd:
+	ib_dealloc_pd(rds_iwdev->pd);
+free_dev:
+	kfree(rds_iwdev);
+free_attr:
+	kfree(dev_attr);
+}
+
+static void rds_iw_remove_one(struct ib_device *device)
+{
+	struct rds_iw_device *iwdev;
+	struct rds_iw_cm_id *entry, *tmp;
+
+	iwdev = ib_get_client_data(device, &rds_iw_client);
+	if (iwdev == NULL)
+		return;
+
+	spin_lock_irq(&iwdev->spinlock);
+	list_for_each_entry_safe(entry, tmp, &iwdev->cm_id_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	spin_unlock_irq(&iwdev->spinlock);
+
+	rds_iw_destroy_conns(iwdev);
+
+	/* release in reverse order of rds_iw_add_one(): pool, DMA MR, PD */
+	if (iwdev->mr_pool != NULL)
+		rds_iw_destroy_mr_pool(iwdev->mr_pool);
+	if (iwdev->mr != NULL)
+		ib_dereg_mr(iwdev->mr);
+
+	while (ib_dealloc_pd(iwdev->pd)) {
+		rdsdebug("Failed to dealloc pd %p\n", iwdev->pd);
+		msleep(1);
+	}
+
+	list_del(&iwdev->list);
+	kfree(iwdev);
+}
+
+struct ib_client rds_iw_client = {
+	.name   = "rds_iw",
+	.add    = rds_iw_add_one,	/* sets up per-device state for RNICs */
+	.remove = rds_iw_remove_one	/* tears that state down again */
+};
+
+static int rds_iw_conn_info_visitor(struct rds_connection *conn,
+				    void *buffer)
+{
+	struct rds_info_rdma_connection *iinfo = buffer;
+	struct rds_iw_connection *ic;
+
+	/* Only iWARP connections are reported; return 0 for other transports */
+	if (conn->c_trans != &rds_iw_transport)
+		return 0;
+
+	iinfo->src_addr = conn->c_laddr;
+	iinfo->dst_addr = conn->c_faddr;
+	/* GIDs stay zeroed unless the connection is up */
+	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_iw_device *rds_iwdev;
+		struct rdma_dev_addr *dev_addr;
+
+		ic = conn->c_transport_data;
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+		rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+		iinfo->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo->max_send_sge = rds_iwdev->max_sge;	/* NOTE(review): rds_iwdev is not NULL-checked */
+		rds_iw_get_mr_info(rds_iwdev, iinfo);
+	}
+	return 1;	/* record filled in; presumably tells the caller to copy it -- confirm */
+}
+
+static void rds_iw_ic_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	rds_for_each_conn_info(sock, len, iter, lens,	/* handler for RDS_INFO_IWARP_CONNECTIONS */
+				rds_iw_conn_info_visitor,	/* fills one record per iWARP connection */
+				sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Returns 0 if @addr belongs to a local iWARP (RNIC) device, -EADDRNOTAVAIL
+ * if it does not, or the error from rdma_create_id().
+ *
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.  If it were me, I'd advocate for something
+ * more flexible.  Sending and receiving should be device-agnostic.
+ * Transports would try and maintain connections between peers who have
+ * messages queued.  Userspace would be allowed to influence which paths have
+ * priority.  We could call userspace asserting this policy "routing".
+ */
+static int rds_iw_laddr_check(__be32 addr)
+{
+	int ret;
+	struct rdma_cm_id *cm_id;
+	struct sockaddr_in sin;
+
+	/* Create a CMA ID and try to bind it. This catches both
+	 * IB and iWARP capable NICs. */
+	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = addr;
+
+	/* rdma_bind_addr will only succeed for IB & iWARP devices */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	/* bind can succeed with cm_id->device still NULL, and IB binds too */
+	if (ret || !cm_id->device ||
+	    cm_id->device->node_type != RDMA_NODE_RNIC)
+		ret = -EADDRNOTAVAIL;
+
+	rdsdebug("addr %pI4 ret %d node type %d\n",
+		&addr, ret,
+		cm_id->device ? cm_id->device->node_type : -1);
+
+	rdma_destroy_id(cm_id);
+
+	return ret;
+}
+
+void rds_iw_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);	/* registered last in rds_iw_init() */
+	rds_iw_destroy_nodev_conns();
+	ib_unregister_client(&rds_iw_client);
+	rds_iw_sysctl_exit();
+	rds_iw_recv_exit();
+	rds_trans_unregister(&rds_iw_transport);	/* this function is the transport's .exit hook */
+}
+
+struct rds_transport rds_iw_transport = {
+	.laddr_check		= rds_iw_laddr_check,
+	.xmit_complete		= rds_iw_xmit_complete,
+	.xmit			= rds_iw_xmit,
+	.xmit_rdma		= rds_iw_xmit_rdma,
+	.recv			= rds_iw_recv,
+	.conn_alloc		= rds_iw_conn_alloc,
+	.conn_free		= rds_iw_conn_free,
+	.conn_connect		= rds_iw_conn_connect,
+	.conn_shutdown		= rds_iw_conn_shutdown,
+	.inc_copy_to_user	= rds_iw_inc_copy_to_user,
+	.inc_free		= rds_iw_inc_free,
+	.cm_initiate_connect	= rds_iw_cm_initiate_connect,
+	.cm_handle_connect	= rds_iw_cm_handle_connect,
+	.cm_connect_complete	= rds_iw_cm_connect_complete,
+	.stats_info_copy	= rds_iw_stats_info_copy,
+	.exit			= rds_iw_exit,
+	.get_mr			= rds_iw_get_mr,
+	.sync_mr		= rds_iw_sync_mr,
+	.free_mr		= rds_iw_free_mr,
+	.flush_mrs		= rds_iw_flush_mrs,
+	.t_owner		= THIS_MODULE,
+	.t_name			= "iwarp",
+	.t_type			= RDS_TRANS_IWARP,
+	.t_prefer_loopback	= 1,
+};
+
+int rds_iw_init(void)
+{
+	int err;
+
+	INIT_LIST_HEAD(&rds_iw_devices);
+
+	err = ib_register_client(&rds_iw_client);
+	if (err != 0)
+		return err;
+
+	err = rds_iw_sysctl_init();
+	if (err != 0)
+		goto undo_client;
+
+	err = rds_iw_recv_init();
+	if (err != 0)
+		goto undo_sysctl;
+
+	err = rds_trans_register(&rds_iw_transport);
+	if (err != 0)
+		goto undo_recv;
+
+	rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+
+	return 0;
+
+	/* failure: unwind whatever was set up, newest first */
+undo_recv:
+	rds_iw_recv_exit();
+undo_sysctl:
+	rds_iw_sysctl_exit();
+undo_client:
+	ib_unregister_client(&rds_iw_client);
+	return err;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 00000000..90151922
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
+#ifndef _RDS_IW_H
+#define _RDS_IW_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FASTREG_SIZE		20
+#define RDS_FASTREG_POOL_SIZE		2048
+
+#define RDS_IW_MAX_SGE			8
+#define RDS_IW_RECV_SGE 		2
+
+#define RDS_IW_DEFAULT_RECV_WR		1024
+#define RDS_IW_DEFAULT_SEND_WR		256
+
+#define RDS_IW_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
+
+extern struct list_head rds_iw_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up in both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
+struct rds_page_frag {
+	struct list_head	f_item;
+	struct page		*f_page;	/* page the fragment lives in */
+	unsigned long		f_offset;	/* presumably offset in f_page */
+	dma_addr_t 		f_mapped;	/* presumably its DMA address - confirm */
+};
+
+struct rds_iw_incoming {
+	struct list_head	ii_frags;	/* presumably rds_page_frag.f_item list */
+	struct rds_incoming	ii_inc;		/* embedded generic RDS incoming */
+};
+
+struct rds_iw_connect_private {
+	/* rdma_cm private data. Add new fields at the end, and don't permute. */
+	__be32			dp_saddr;		/* sender's c_laddr */
+	__be32			dp_daddr;		/* sender's c_faddr */
+	u8			dp_protocol_major;
+	u8			dp_protocol_minor;
+	__be16			dp_protocol_minor_mask; /* bitmask of supported minors */
+	__be32			dp_reserved1;
+	__be64			dp_ack_seq;		/* last packet the sender saw */
+	__be32			dp_credit;		/* non-zero enables flow ctl */
+};
+
+struct rds_iw_scatterlist {	/* embedded in rds_iw_mapping as m_sg */
+	struct scatterlist	*list;
+	unsigned int		len;		/* presumably entries in list - confirm */
+	int			dma_len;
+	unsigned int		dma_npages;
+	unsigned int		bytes;
+};
+
+struct rds_iw_mapping {
+	spinlock_t		m_lock;	/* protect the mapping struct */
+	struct list_head	m_list;
+	struct rds_iw_mr	*m_mr;	/* presumably the MR embedding this mapping */
+	uint32_t		m_rkey;
+	struct rds_iw_scatterlist m_sg;
+};
+
+struct rds_iw_send_work {
+	struct rds_message	*s_rm;
+
+	/* We should really put these into a union: */
+	struct rm_rdma_op	*s_op;
+	struct rds_iw_mapping	*s_mapping;
+	struct ib_mr		*s_mr;
+	struct ib_fast_reg_page_list *s_page_list;
+	unsigned char		s_remap_count;
+
+	struct ib_send_wr	s_wr;
+	struct ib_sge		s_sge[RDS_IW_MAX_SGE];	/* RDS_IW_MAX_SGE is 8 */
+	unsigned long		s_queued;	/* NOTE(review): units not visible here */
+};
+
+struct rds_iw_recv_work {
+	struct rds_iw_incoming 	*r_iwinc;
+	struct rds_page_frag	*r_frag;
+	struct ib_recv_wr	r_wr;
+	struct ib_sge		r_sge[RDS_IW_RECV_SGE];	/* [0] header, [1] data */
+};
+
+struct rds_iw_work_ring {
+	u32		w_nr;		/* entry count; sizes hdr/work arrays */
+	u32		w_alloc_ptr;
+	u32		w_alloc_ctr;
+	u32		w_free_ptr;
+	atomic_t	w_free_ctr;
+};
+
+struct rds_iw_device;
+
+struct rds_iw_connection {
+
+	struct list_head	iw_node;	/* on a device's conn_list or iw_nodev_conns */
+	struct rds_iw_device 	*rds_iwdev;
+	struct rds_connection	*conn;
+
+	/* alphabet soup, IBTA style */
+	struct rdma_cm_id	*i_cm_id;
+	struct ib_pd		*i_pd;
+	struct ib_mr		*i_mr;
+	struct ib_cq		*i_send_cq;
+	struct ib_cq		*i_recv_cq;
+
+	/* tx */
+	struct rds_iw_work_ring	i_send_ring;
+	struct rds_message	*i_rm;
+	struct rds_header	*i_send_hdrs;
+	u64			i_send_hdrs_dma;
+	struct rds_iw_send_work *i_sends;
+
+	/* rx */
+	struct tasklet_struct	i_recv_tasklet;
+	struct mutex		i_recv_mutex;
+	struct rds_iw_work_ring	i_recv_ring;
+	struct rds_iw_incoming	*i_iwinc;
+	u32			i_recv_data_rem;
+	struct rds_header	*i_recv_hdrs;
+	u64			i_recv_hdrs_dma;
+	struct rds_iw_recv_work *i_recvs;
+	struct rds_page_frag	i_frag;
+	u64			i_ack_recv;	/* last ACK received */
+
+	/* sending acks */
+	unsigned long		i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		i_ack_next;	/* next ACK to send */
+#else
+	spinlock_t		i_ack_lock;	/* protect i_ack_next */
+	u64			i_ack_next;	/* next ACK to send */
+#endif
+	struct rds_header	*i_ack;
+	struct ib_send_wr	i_ack_wr;
+	struct ib_sge		i_ack_sge;
+	u64			i_ack_dma;
+	unsigned long		i_ack_queued;
+
+	/* Flow control related information
+	 *
+	 * Our algorithm uses a pair of variables that we need to access
+	 * atomically - one for the send credits, and one for the posted
+	 * recv credits we need to transfer to the remote.
+	 * Rather than protect them using a slow spinlock, we put both into
+	 * a single atomic_t and update it using cmpxchg.
+	 */
+	atomic_t		i_credits;
+
+	/* Protocol version specific information */
+	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */
+	unsigned int		i_dma_local_lkey:1;
+	unsigned int		i_fastreg_posted:1; /* fastreg posted on this connection */
+	/* Batched completions */
+	unsigned int		i_unsignaled_wrs;
+	long			i_unsignaled_bytes;
+};
+
+/* i_credits packing; this assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)	/* send credits: low 16 bits */
+#define IB_GET_POST_CREDITS(v)	((v) >> 16)	/* posted credits: upper bits */
+#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v)	((v) << 16)
+
+struct rds_iw_cm_id {
+	struct list_head	list;	/* presumably on rds_iw_device.cm_id_list */
+	struct rdma_cm_id	*cm_id;
+};
+
+struct rds_iw_device {
+	struct list_head	list;
+	struct list_head	cm_id_list;
+	struct list_head	conn_list;	/* walked under ->spinlock */
+	struct ib_device	*dev;
+	struct ib_pd		*pd;		/* becomes each conn's i_pd */
+	struct ib_mr		*mr;		/* becomes each conn's i_mr */
+	struct rds_iw_mr_pool	*mr_pool;
+	int			max_sge;	/* used as QP max_send_sge */
+	unsigned int		max_wrs;	/* caps QP send/recv sizes */
+	unsigned int		dma_local_lkey:1; /* copied to i_dma_local_lkey */
+	spinlock_t		spinlock;	/* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT	0
+#define IB_ACK_REQUESTED	1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IW_ACK_WR_ID	((u64)0xffffffffffffffffULL)
+#define RDS_IW_FAST_REG_WR_ID	((u64)0xefefefefefefefefULL)
+#define RDS_IW_LOCAL_INV_WR_ID	((u64)0xdfdfdfdfdfdfdfdfULL)
+
+struct rds_iw_statistics {	/* per-CPU (rds_iw_stats); see rds_iw_stats_inc() */
+	uint64_t	s_iw_connect_raced;
+	uint64_t	s_iw_listen_closed_stale;
+	uint64_t	s_iw_tx_cq_call;
+	uint64_t	s_iw_tx_cq_event;
+	uint64_t	s_iw_tx_ring_full;
+	uint64_t	s_iw_tx_throttle;
+	uint64_t	s_iw_tx_sg_mapping_failure;
+	uint64_t	s_iw_tx_stalled;
+	uint64_t	s_iw_tx_credit_updates;
+	uint64_t	s_iw_rx_cq_call;
+	uint64_t	s_iw_rx_cq_event;
+	uint64_t	s_iw_rx_ring_empty;
+	uint64_t	s_iw_rx_refill_from_cq;
+	uint64_t	s_iw_rx_refill_from_thread;
+	uint64_t	s_iw_rx_alloc_limit;
+	uint64_t	s_iw_rx_credit_updates;
+	uint64_t	s_iw_ack_sent;
+	uint64_t	s_iw_ack_send_failure;
+	uint64_t	s_iw_ack_send_delayed;
+	uint64_t	s_iw_ack_send_piggybacked;
+	uint64_t	s_iw_ack_received;
+	uint64_t	s_iw_rdma_mr_alloc;
+	uint64_t	s_iw_rdma_mr_free;
+	uint64_t	s_iw_rdma_mr_used;
+	uint64_t	s_iw_rdma_mr_pool_flush;
+	uint64_t	s_iw_rdma_mr_pool_wait;
+	uint64_t	s_iw_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_iw_wq;
+
+/*
+ * Stand-ins for ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define them: each scatterlist entry is synced individually.
+ */
+static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	unsigned int remaining;
+
+	for (remaining = sg_dma_len; remaining; --remaining, ++sg) {
+		ib_dma_sync_single_for_cpu(dev,
+				ib_sg_dma_address(dev, sg),
+				ib_sg_dma_len(dev, sg),
+				direction);
+	}
+}
+#define ib_dma_sync_sg_for_cpu	rds_iw_dma_sync_sg_for_cpu
+
+static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	unsigned int remaining;	/* entries of sg[] still to be synced */
+
+	for (remaining = sg_dma_len; remaining; --remaining, ++sg) {
+		ib_dma_sync_single_for_device(dev,
+				ib_sg_dma_address(dev, sg),
+				ib_sg_dma_len(dev, sg),
+				direction);
+	}
+}
+#define ib_dma_sync_sg_for_device	rds_iw_dma_sync_sg_for_device
+/* lkey for local DMA: the device's local_dma_lkey if flagged, else i_mr's. */
+static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
+{
+	return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
+}
+
+/* ib.c */
+extern struct rds_transport rds_iw_transport;
+extern struct ib_client rds_iw_client;
+
+extern unsigned int fastreg_pool_size;
+extern unsigned int fastreg_message_size;
+
+extern spinlock_t iw_nodev_conns_lock;
+extern struct list_head iw_nodev_conns;
+
+/* ib_cm.c */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_iw_conn_free(void *arg);
+int rds_iw_conn_connect(struct rds_connection *conn);
+void rds_iw_conn_shutdown(struct rds_connection *conn);
+void rds_iw_state_change(struct sock *sk);
+int rds_iw_listen_init(void);
+void rds_iw_listen_stop(void);
+void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+			     struct rdma_cm_event *event);
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_iw_cm_connect_complete(struct rds_connection *conn,
+				struct rdma_cm_event *event);
+
+
+#define rds_iw_conn_error(conn, fmt...) \
+	__rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
+
+/* ib_rdma.c */
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_iw_destroy_nodev_conns(void)
+{	/* the global no-device list and its lock */
+	__rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
+}
+static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
+{	/* this device's conn_list and the device spinlock */
+	__rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
+}
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret);
+void rds_iw_sync_mr(void *trans_private, int dir);
+void rds_iw_free_mr(void *trans_private, int invalidate);
+void rds_iw_flush_mrs(void);
+
+/* ib_recv.c */
+int rds_iw_recv_init(void);
+void rds_iw_recv_exit(void);
+int rds_iw_recv(struct rds_connection *conn);
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+		       gfp_t page_gfp, int prefill);
+void rds_iw_inc_free(struct rds_incoming *inc);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+			     size_t size);
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_recv_tasklet_fn(unsigned long data);
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
+void rds_iw_attempt_ack(struct rds_iw_connection *ic);
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
+
+/* ib_ring.c */
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
+int rds_iw_ring_low(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_iw_ring_empty_wait;
+
+/* ib_send.c */
+void rds_iw_xmit_complete(struct rds_connection *conn);
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_send_init_ring(struct rds_iw_connection *ic);
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
+			     u32 *adv_credits, int need_posted, int max_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
+#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_iw_sysctl_init(void);
+void rds_iw_sysctl_exit(void);
+extern unsigned long rds_iw_sysctl_max_send_wr;
+extern unsigned long rds_iw_sysctl_max_recv_wr;
+extern unsigned long rds_iw_sysctl_max_unsig_wrs;
+extern unsigned long rds_iw_sysctl_max_unsig_bytes;
+extern unsigned long rds_iw_sysctl_max_recv_allocation;
+extern unsigned int rds_iw_sysctl_flow_control;
+
+/*
+ * Accessors for the SGE array of a plain (non-RDMA) RDS packet:
+ * the header uses slot 0 and the data uses slot 1.
+ */
+static inline struct ib_sge *
+rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+	return sge;
+}
+
+static inline struct ib_sge *
+rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+	return sge + 1;	/* second entry of the array */
+}
+
+#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 00000000..c12db66f
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,766 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Record the selected RDS protocol version in conn->c_version.
+ */
+static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+	conn->c_version = version;
+}
+
+/*
+ * Use flow control iff the sysctl allows it and the peer sent credits.
+ */
+static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+	struct rds_iw_connection *iwc = conn->c_transport_data;
+
+	if (!rds_iw_sysctl_flow_control || !credits) {
+		iwc->i_flowctl = 0;
+		return;
+	}
+
+	iwc->i_flowctl = 1;
+	rds_iw_send_add_credits(conn, credits);
+}
+
+/*
+ * Connection established (reached for both outgoing and incoming
+ * connections): apply what the peer sent and mark the conn connected.
+ */
+void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+	const struct rds_iw_connect_private *dp = NULL;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_device *rds_iwdev;
+	int err;
+
+	if (event->param.conn.private_data_len) {
+		dp = event->param.conn.private_data;
+
+		rds_iw_set_protocol(conn,
+				RDS_PROTOCOL(dp->dp_protocol_major,
+					dp->dp_protocol_minor));
+		rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+	}
+
+	/* update ib_device with this local ipaddr & conn */
+	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+	err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
+	if (err)
+		printk(KERN_ERR "rds_iw_update_cm_id failed (%d)\n", err);
+	rds_iw_add_conn(rds_iwdev, conn);
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp && dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
+			&conn->c_laddr, &conn->c_faddr,
+			RDS_PROTOCOL_MAJOR(conn->c_version),
+			RDS_PROTOCOL_MINOR(conn->c_version),
+			ic->i_flowctl ? ", flow control" : "");
+
+	rds_connect_complete(conn);
+}
+
+/* Fill conn_param and, when dp is non-NULL, the private data sent with it. */
+static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
+			struct rdma_conn_param *conn_param,
+			struct rds_iw_connect_private *dp, u32 protocol_version)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	unsigned int credits;
+
+	memset(conn_param, 0, sizeof(struct rdma_conn_param));
+	/* XXX tune these? */
+	conn_param->responder_resources = 1;
+	conn_param->initiator_depth = 1;
+
+	if (dp) {
+		memset(dp, 0, sizeof(*dp));
+		dp->dp_saddr = conn->c_laddr;
+		dp->dp_daddr = conn->c_faddr;
+		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
+		/* dp_ack_seq is __be64; the receivers apply be64_to_cpu() */
+		dp->dp_ack_seq = cpu_to_be64(rds_iw_piggyb_ack(ic));
+
+		/* Advertise flow control */
+		if (ic->i_flowctl) {
+			credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+			dp->dp_credit = cpu_to_be32(credits);
+			atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+		}
+
+		conn_param->private_data = dp;
+		conn_param->private_data_len = sizeof(*dp);
+	}
+}
+
+static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
+{
+	rdsdebug("event %u data %p\n", event->event, data);	/* log only */
+}
+
+static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
+{
+	struct rds_connection *conn = data;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+	/* Communication established: just pass the news on to the CM. */
+	if (event->event == IB_EVENT_COMM_EST) {
+		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+		return;
+	}
+
+	/* Anything else (IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_FATAL, ...) is
+	 * treated as fatal for the QP: drop the connection.
+	 */
+	rdsdebug("Fatal QP Event %u "
+		"- connection %pI4->%pI4, reconnecting\n",
+		event->event, &conn->c_laddr,
+		&conn->c_faddr);
+	rds_conn_drop(conn);
+}
+
+/*
+ * Size both work rings, fill in the QP init attributes and create the CQs.
+ */
+static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
+		struct rds_iw_device *rds_iwdev,
+		struct rds_iw_work_ring *send_ring,
+		void (*send_cq_handler)(struct ib_cq *, void *),
+		struct rds_iw_work_ring *recv_ring,
+		void (*recv_cq_handler)(struct ib_cq *, void *),
+		void *context)
+{
+	struct ib_device *dev = rds_iwdev->dev;
+	unsigned int send_size, recv_size;
+	int ret;
+
+	/* The offset of 1 is to accommodate the additional ACK WR. */
+	send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
+	recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
+	rds_iw_ring_resize(send_ring, send_size - 1);
+	rds_iw_ring_resize(recv_ring, recv_size - 1);
+
+	memset(attr, 0, sizeof(*attr));
+	attr->event_handler = rds_iw_qp_event_handler;
+	attr->qp_context = context;
+	attr->cap.max_send_wr = send_size;
+	attr->cap.max_recv_wr = recv_size;
+	attr->cap.max_send_sge = rds_iwdev->max_sge;
+	attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
+	attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	attr->qp_type = IB_QPT_RC;
+
+	attr->send_cq = ib_create_cq(dev, send_cq_handler,
+				     rds_iw_cq_event_handler,
+				     context, send_size, 0);
+	if (IS_ERR(attr->send_cq)) {
+		ret = PTR_ERR(attr->send_cq);
+		attr->send_cq = NULL;
+		rdsdebug("ib_create_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
+				     rds_iw_cq_event_handler,
+				     context, recv_size, 0);
+	if (IS_ERR(attr->recv_cq)) {
+		ret = PTR_ERR(attr->recv_cq);
+		attr->recv_cq = NULL;
+		rdsdebug("ib_create_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (attr->send_cq)
+			ib_destroy_cq(attr->send_cq);
+		if (attr->recv_cq)
+			ib_destroy_cq(attr->recv_cq);
+	}
+	return ret;
+}
+
+/*
+ * Create the CQs, QP, header DMA buffers and work arrays for a connection.
+ * Nothing is freed here on failure, so never leave IS_ERR pointers in ic.
+ */
+static int rds_iw_setup_qp(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct ib_qp_init_attr attr;
+	struct rds_iw_device *rds_iwdev;
+	int ret;
+
+	/* rds_iw_add_one creates a rds_iw_device object per IB device,
+	 * and allocates a protection domain, memory range and MR pool
+	 * for each.  If that fails for any reason, it will not register
+	 * the rds_iwdev at all.
+	 */
+	rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
+	if (!rds_iwdev) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+					dev->name);
+		return -EOPNOTSUPP;
+	}
+
+	/* Protection domain and memory range */
+	ic->i_pd = rds_iwdev->pd;
+	ic->i_mr = rds_iwdev->mr;
+
+	ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
+			&ic->i_send_ring, rds_iw_send_cq_comp_handler,
+			&ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
+			conn);
+	if (ret)	/* the callee destroyed its CQs on any non-zero ret */
+		goto out;
+
+	ic->i_send_cq = attr.send_cq;
+	ic->i_recv_cq = attr.recv_cq;
+
+	/*
+	 * XXX this can fail if max_*_wr is too large?  Are we supposed
+	 * to back off until we get a value that the hardware can support?
+	 */
+	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+	if (ret) {
+		rdsdebug("rdma_create_qp failed: %d\n", ret);
+		goto out;
+	}
+
+	ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_send_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_send_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent send failed\n");
+		goto out;
+	}
+
+	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_recv_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent recv failed\n");
+		goto out;
+	}
+
+	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+				       &ic->i_ack_dma, GFP_KERNEL);
+	if (!ic->i_ack) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent ack failed\n");
+		goto out;
+	}
+
+	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
+	if (!ic->i_sends) {
+		ret = -ENOMEM;
+		rdsdebug("send allocation failed\n");
+		goto out;
+	}
+	rds_iw_send_init_ring(ic);
+
+	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
+	if (!ic->i_recvs) {
+		ret = -ENOMEM;
+		rdsdebug("recv allocation failed\n");
+		goto out;
+	}
+
+	rds_iw_recv_init_ring(ic);
+	rds_iw_recv_init_ack(ic);
+
+	/* Post receive buffers - as a side effect, this will update
+	 * the posted credit count. */
+	rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+		 ic->i_send_cq, ic->i_recv_cq);
+
+out:
+	return ret;
+}
+
+static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
+{
+	u16 minors;
+	u32 version;
+
+	/* rdma_cm private data is odd - when there is any private data in the
+	 * request, we will be given a pretty large buffer without telling us the
+	 * original size. The only way to tell the difference is by looking at
+	 * the contents, which are initialized to zero.
+	 * If the protocol version fields aren't set, this is a connection attempt
+	 * from an older version. This could be 3.0 or 2.0 - we can't tell.
+	 * We really should have changed this for OFED 1.3 :-( */
+	if (!dp->dp_protocol_major)
+		return RDS_PROTOCOL_3_0;
+
+	minors = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
+	if (dp->dp_protocol_major != 3 || !minors) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+				"incompatible protocol version %u.%u\n",
+				&dp->dp_saddr, dp->dp_protocol_major,
+				dp->dp_protocol_minor);
+		return 0;
+	}
+	/* 3.0 plus the bit index of the highest minor both sides support. */
+	for (version = RDS_PROTOCOL_3_0; (minors >>= 1) != 0; version++)
+		;
+	return version;
+}
+
+/* Incoming connect request; returns non-zero while cm_id is not yet ours. */
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event)
+{
+	const struct rds_iw_connect_private *dp = event->param.conn.private_data;
+	struct rds_iw_connect_private dp_rep;
+	struct rds_connection *conn = NULL;
+	struct rds_iw_connection *ic = NULL;
+	struct rdma_conn_param conn_param;
+	struct rds_iw_device *rds_iwdev;
+	u32 version;
+	int err, destroy = 1;
+
+	/* Check whether the remote protocol version matches ours. */
+	version = rds_iw_protocol_compatible(dp);
+	if (!version)
+		goto out;
+
+	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
+		 &dp->dp_saddr, &dp->dp_daddr,
+		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
+
+	conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
+			       GFP_KERNEL);
+	if (IS_ERR(conn)) {
+		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+		conn = NULL;
+		goto out;
+	}
+
+	/*
+	 * The connection request may occur while the
+	 * previous connection exist, e.g. in case of failover.
+	 * But as connections may be initiated simultaneously
+	 * by both hosts, we have a random backoff mechanism -
+	 * see the comment above rds_queue_reconnect()
+	 */
+	mutex_lock(&conn->c_cm_lock);
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		if (rds_conn_state(conn) == RDS_CONN_UP) {
+			rdsdebug("incoming connect while connecting\n");
+			rds_conn_drop(conn);
+			rds_iw_stats_inc(s_iw_listen_closed_stale);
+		} else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+			/* Wait and see - our connect may still be succeeding */
+			rds_iw_stats_inc(s_iw_connect_raced);
+		}
+		mutex_unlock(&conn->c_cm_lock);
+		goto out;
+	}
+
+	ic = conn->c_transport_data;
+
+	rds_iw_set_protocol(conn, version);
+	rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	BUG_ON(cm_id->context);
+	BUG_ON(ic->i_cm_id);
+
+	ic->i_cm_id = cm_id;
+	cm_id->context = conn;
+
+	/* client data may be missing; rds_iw_setup_qp() then refuses the conn */
+	rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
+	ic->i_dma_local_lkey = rds_iwdev ? rds_iwdev->dma_local_lkey : 0;
+
+	/* We got halfway through setting up the ib_connection, if we
+	 * fail now, we have to take the long route out of this mess. */
+	destroy = 0;
+
+	err = rds_iw_setup_qp(conn);
+	if (err) {
+		rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
+		mutex_unlock(&conn->c_cm_lock);
+		goto out;
+	}
+
+	rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+	/* rdma_accept() calls rdma_reject() internally if it fails */
+	err = rdma_accept(cm_id, &conn_param);
+	mutex_unlock(&conn->c_cm_lock);
+	if (err) {
+		rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	rdma_reject(cm_id, NULL, 0);
+	return destroy;
+}
+
+
+/*
+ * Active side of connection setup: build the QP and issue rdma_connect()
+ * with our private data.  Failures go through rds_iw_conn_error().
+ */
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+	struct rds_connection *conn = cm_id->context;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_connect_private dp;
+	struct rdma_conn_param conn_param;
+	int ret;
+
+	/* If the peer doesn't do protocol negotiation, we must
+	 * default to RDSv3.0 */
+	rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
+	ic->i_flowctl = rds_iw_sysctl_flow_control;	/* advertise flow control */
+
+	ret = rds_iw_setup_qp(conn);
+	if (ret) {
+		rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
+		goto out;
+	}
+
+	rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+	ret = rdma_connect(cm_id, &conn_param);
+	if (ret)
+		rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+	/* Beware - returning non-zero tells the rdma_cm to destroy
+	 * the cm_id. We should certainly not do it as long as we still
+	 * "own" the cm_id. */
+	if (ret && ic->i_cm_id == cm_id)
+		ret = 0;
+	return ret;
+}
+
+int rds_iw_conn_connect(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_device *rds_iwdev;
+	struct sockaddr_in src, dest;
+	int ret;
+
+	/* XXX I wonder what effect the port space has */
+	/* delegate cm event handler to rdma_transport */
+	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+				     RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ic->i_cm_id)) {
+		ret = PTR_ERR(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		rdsdebug("rdma_create_id() failed: %d\n", ret);
+		goto out;
+	}
+
+	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+	src.sin_port = (__force u16)htons(0);
+
+	/* First, bind to the local address and device. */
+	ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
+	if (ret) {
+		rdsdebug("rdma_bind_addr(%pI4) failed: %d\n", &conn->c_laddr, ret);
+		rdma_destroy_id(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		goto out;
+	}
+
+	/* client data may be missing; rds_iw_setup_qp() then refuses the conn */
+	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+	ic->i_dma_local_lkey = rds_iwdev ? rds_iwdev->dma_local_lkey : 0;
+
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+	dest.sin_port = (__force u16)htons(RDS_PORT);
+
+	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+				(struct sockaddr *)&dest,
+				RDS_RDMA_RESOLVE_TIMEOUT_MS);
+	if (ret) {
+		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+			 ret);
+		rdma_destroy_id(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Tear down whatever connection setup managed to build.  Each resource
+ * is checked before it is released, so this can be called at any point
+ * during startup and multiple times for a given connection.
+ */
+void rds_iw_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	int err = 0;
+	struct ib_qp_attr qp_attr;
+
+	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+	if (ic->i_cm_id) {
+		struct ib_device *dev = ic->i_cm_id->device;
+
+		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+		err = rdma_disconnect(ic->i_cm_id);
+		if (err) {
+			/* Actually this may happen quite frequently, when
+			 * an outgoing connect raced with an incoming connect.
+			 */
+			rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+				   " cm: %p err %d\n", ic->i_cm_id, err);
+		}
+
+		if (ic->i_cm_id->qp) {
+			qp_attr.qp_state = IB_QPS_ERR;	/* only field used: IB_QP_STATE */
+			ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+		}
+
+		wait_event(rds_iw_ring_empty_wait,
+			rds_iw_ring_empty(&ic->i_send_ring) &&
+			rds_iw_ring_empty(&ic->i_recv_ring));
+
+		if (ic->i_send_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_send_hdrs,
+					   ic->i_send_hdrs_dma);
+
+		if (ic->i_recv_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_recv_hdrs,
+					   ic->i_recv_hdrs_dma);
+
+		if (ic->i_ack)
+			ib_dma_free_coherent(dev, sizeof(struct rds_header),
+					     ic->i_ack, ic->i_ack_dma);
+
+		if (ic->i_sends)
+			rds_iw_send_clear_ring(ic);
+		if (ic->i_recvs)
+			rds_iw_recv_clear_ring(ic);
+
+		if (ic->i_cm_id->qp)
+			rdma_destroy_qp(ic->i_cm_id);
+		if (ic->i_send_cq)
+			ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			ib_destroy_cq(ic->i_recv_cq);
+
+		/*
+		 * If associated with an rds_iw_device:
+		 * 	Move connection back to the nodev list.
+		 * 	Remove cm_id from the device cm_id list.
+		 */
+		if (ic->rds_iwdev)
+			rds_iw_remove_conn(ic->rds_iwdev, conn);
+
+		rdma_destroy_id(ic->i_cm_id);
+
+		ic->i_cm_id = NULL;
+		ic->i_pd = NULL;
+		ic->i_mr = NULL;
+		ic->i_send_cq = NULL;
+		ic->i_recv_cq = NULL;
+		ic->i_send_hdrs = NULL;
+		ic->i_recv_hdrs = NULL;
+		ic->i_ack = NULL;
+	}
+	BUG_ON(ic->rds_iwdev);
+
+	/* Clear pending transmit */
+	if (ic->i_rm) {
+		rds_message_put(ic->i_rm);
+		ic->i_rm = NULL;
+	}
+
+	/* Clear the ACK state */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_set(&ic->i_ack_next, 0);
+#else
+	ic->i_ack_next = 0;
+#endif
+	ic->i_ack_recv = 0;
+
+	/* Clear flow control state */
+	ic->i_flowctl = 0;
+	atomic_set(&ic->i_credits, 0);
+
+	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+	if (ic->i_iwinc) {
+		rds_inc_put(&ic->i_iwinc->ii_inc);
+		ic->i_iwinc = NULL;
+	}
+
+	vfree(ic->i_sends);
+	ic->i_sends = NULL;
+	vfree(ic->i_recvs);
+	ic->i_recvs = NULL;
+	rdsdebug("shutdown complete\n");
+}
+
+/* Allocate conn's iWARP transport data and park it on the nodev list. */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_iw_connection *ic;
+	unsigned long flags;
+
+	/* Allocate in the caller's context rather than assuming GFP_KERNEL. */
+	ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
+	if (!ic)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&ic->iw_node);
+	tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
+		     (unsigned long) ic);
+	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
+
+	/*
+	 * rds_iw_conn_shutdown() waits for these to be emptied so they
+	 * must be initialized before it can be called.
+	 */
+	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	spin_lock_irqsave(&iw_nodev_conns_lock, flags);
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+	spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
+
+	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Free a connection that has been shut down and is not set for reconnect.
+ */
+void rds_iw_conn_free(void *arg)
+{
+	struct rds_iw_connection *iwc = arg;
+	spinlock_t *list_lock = &iw_nodev_conns_lock;
+
+	rdsdebug("ic %p\n", iwc);
+
+	/*
+	 * Conn is on its device's list or on the nodev list; lock that one.
+	 * rds_iwdev must not change here (no racing shutdown()/connect()).
+	 */
+	if (iwc->rds_iwdev)
+		list_lock = &iwc->rds_iwdev->spinlock;
+
+	spin_lock_irq(list_lock);
+	list_del(&iwc->iw_node);
+	spin_unlock_irq(list_lock);
+
+	kfree(iwc);
+}
+
+/*
+ * An error occurred on the connection: drop it, then printk fmt/args.
+ */
+void
+__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	rds_conn_drop(conn);
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 00000000..6deaa774
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,878 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "rds.h"
+#include "iw.h"
+
+
+/*
+ * Per-MR transport state.  This is stored as mr->r_trans_private.
+ */
+struct rds_iw_mr {
+	struct rds_iw_device	*device;	/* set by rds_iw_get_mr() */
+	struct rds_iw_mr_pool	*pool;		/* NOTE(review): never assigned in this file */
+	struct rdma_cm_id	*cm_id;		/* its ->qp is where our WRs are posted */
+
+	struct ib_mr	*mr;			/* from ib_alloc_fast_reg_mr() */
+	struct ib_fast_reg_page_list *page_list;	/* DMA page addresses handed to the WR */
+
+	struct rds_iw_mapping	mapping;	/* m_list links us into the pool's lists */
+	unsigned char		remap_count;	/* rolling value fed to ib_update_fast_reg_key() */
+};
+
+/*
+ * Our own little MR pool; one is created per rds_iw_device.
+ */
+struct rds_iw_mr_pool {
+	struct rds_iw_device	*device;		/* back ptr to the device that owns us */
+
+	struct mutex		flush_lock;		/* serialize fmr invalidate */
+	struct work_struct	flush_worker;		/* flush worker */
+
+	spinlock_t		list_lock;		/* protect variables below */
+	atomic_t		item_count;		/* total # of MRs */
+	atomic_t		dirty_count;		/* # of dirty MRs */
+	struct list_head	dirty_list;		/* dirty mappings */
+	struct list_head	clean_list;		/* unused & unmapped MRs */
+	atomic_t		free_pinned;		/* memory pinned by free MRs */
+	unsigned long		max_message_size;	/* in pages */
+	unsigned long		max_items;		/* hard cap on item_count */
+	unsigned long		max_items_soft;		/* set to 3/4 of max_items */
+	unsigned long		max_free_pinned;	/* free_pinned level that requests a flush */
+	int			max_pages;		/* reported as rdma_mr_size */
+};
+
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+			  struct rds_iw_mr *ibmr,
+			  struct scatterlist *sg, unsigned int nents);
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+			struct list_head *unmap_list,
+			struct list_head *kill_list);
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
+{
+	struct rds_iw_device *iwdev;
+	struct rds_iw_cm_id *i_cm_id;
+
+	*rds_iwdev = NULL;
+	*cm_id = NULL;
+	/* Find the device/cm_id registered for @rs: 0 on a match, 1 if none. */
+	list_for_each_entry(iwdev, &rds_iw_devices, list) {
+		spin_lock_irq(&iwdev->spinlock);
+		list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
+			struct sockaddr_in *src_addr, *dst_addr;
+
+			src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
+			dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
+
+			rdsdebug("local ipaddr = %x port %d, "
+				 "remote ipaddr = %x port %d"
+				 "..looking for %x port %d, "
+				 "remote ipaddr = %x port %d\n",
+				src_addr->sin_addr.s_addr,
+				src_addr->sin_port,
+				dst_addr->sin_addr.s_addr,
+				dst_addr->sin_port,
+				rs->rs_bound_addr,
+				rs->rs_bound_port,
+				rs->rs_conn_addr,
+				rs->rs_conn_port);
+#ifdef WORKING_TUPLE_DETECTION
+			if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+			    src_addr->sin_port == rs->rs_bound_port &&
+			    dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+			    dst_addr->sin_port == rs->rs_conn_port) {
+#else
+			/* FIXME - needs to compare the local and remote
+			 * ipaddr/port tuple, but the ipaddr is the only
+			 * available information in the rds_sock (as the rest are
+			 * zero'ed.  It doesn't appear to be properly populated
+			 * during connection setup...
+			 */
+			if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
+#endif
+				spin_unlock_irq(&iwdev->spinlock);	/* drop the lock before returning */
+				*rds_iwdev = iwdev;
+				*cm_id = i_cm_id->cm_id;
+				return 0;
+			}
+		}
+		spin_unlock_irq(&iwdev->spinlock);
+	}
+	/* nothing matched; both outputs are still NULL */
+	return 1;
+}
+
+static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct rds_iw_cm_id *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	/* Wrap @cm_id in a list node and append it to the device's cm_id_list. */
+	if (entry == NULL)
+		return -ENOMEM;
+
+	entry->cm_id = cm_id;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_add_tail(&entry->list, &rds_iwdev->cm_id_list);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	return 0;
+}
+
+static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
+				struct rdma_cm_id *cm_id)
+{
+	struct rds_iw_cm_id *entry;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_for_each_entry(entry, &rds_iwdev->cm_id_list, list) {
+		if (entry->cm_id != cm_id)
+			continue;
+		list_del(&entry->list);
+		kfree(entry);
+		break;	/* only the first node wrapping @cm_id is unlinked and freed */
+	}
+	spin_unlock_irq(&rds_iwdev->spinlock);
+}
+
+
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct sockaddr_in *src_addr, *dst_addr;
+	struct rds_iw_device *rds_iwdev_old;
+	struct rds_sock rs;	/* on-stack lookup key; only the four fields below are set */
+	struct rdma_cm_id *pcm_id;
+	int rc;
+
+	src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
+	dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
+	/* Copy the cm_id's address/port tuple into the lookup key. */
+	rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+	rs.rs_bound_port = src_addr->sin_port;
+	rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+	rs.rs_conn_port = dst_addr->sin_port;
+	/* rds_iw_get_device() returns 0 when a registered cm_id matches, else 1. */
+	rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
+	if (rc)	/* NOTE(review): removal runs only on a lookup miss - looks inverted, confirm */
+		rds_iw_remove_cm_id(rds_iwdev, cm_id);
+	/* Register @cm_id on @rds_iwdev in every case; 0 or -ENOMEM. */
+	return rds_iw_add_cm_id(rds_iwdev, cm_id);
+}
+
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* conn was previously on the nodev_conns_list */
+	spin_lock_irq(&iw_nodev_conns_lock);
+	BUG_ON(list_empty(&iw_nodev_conns));
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+	/* device lock nests inside the nodev lock; irqs are already off */
+	spin_lock(&rds_iwdev->spinlock);
+	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
+	spin_unlock(&rds_iwdev->spinlock);
+	spin_unlock_irq(&iw_nodev_conns_lock);
+	/* remember the owning device (written after both locks are dropped) */
+	ic->rds_iwdev = rds_iwdev;
+}
+
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock_irq(&iw_nodev_conns_lock);
+	/* irqs stay off while either lock is held, as in rds_iw_add_conn() */
+	spin_lock(&rds_iwdev->spinlock);
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+	spin_unlock(&rds_iwdev->spinlock);
+
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+
+	spin_unlock_irq(&iw_nodev_conns_lock);
+	/* forget the cm_id registered on the device this conn belonged to */
+	rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+	ic->rds_iwdev = NULL;
+}
+
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+{
+	struct rds_iw_connection *cur, *tmp;
+	LIST_HEAD(doomed);
+
+	/* Detach every entry while holding the lock so that
+	 * rds_conn_destroy() below is not called with irqs off. */
+	spin_lock_irq(list_lock);
+	list_splice_init(list, &doomed);
+	spin_unlock_irq(list_lock);
+
+	list_for_each_entry_safe(cur, tmp, &doomed, iw_node)
+		rds_conn_destroy(cur->conn);
+}
+
+static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
+		struct scatterlist *list, unsigned int sg_len)
+{
+	sg->bytes = 0;		/* nothing is DMA-mapped yet, so all */
+	sg->dma_npages = 0;	/* mapping-derived counters start at */
+	sg->dma_len = 0;	/* zero                              */
+	sg->len = sg_len;
+	sg->list = list;
+}
+
+static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+			struct rds_iw_scatterlist *sg)
+{
+	struct ib_device *dev = rds_iwdev->dev;
+	u64 *dma_pages = NULL;
+	int i, j, ret;
+
+	WARN_ON(sg->dma_len);
+	/* DMA-map @sg, then return a kmalloc'ed array of the pages it covers. */
+	sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+	if (unlikely(!sg->dma_len)) {
+		printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	sg->bytes = 0;
+	sg->dma_npages = 0;
+	/* Pass 1: size the list; only entry 0 may start, and only the last end, mid-page. */
+	ret = -EINVAL;
+	for (i = 0; i < sg->dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+		u64 end_addr;
+
+		sg->bytes += dma_len;
+		/* PAGE_MASK keeps the page bits, so offsets are masked with PAGE_SIZE - 1. */
+		end_addr = dma_addr + dma_len;
+		if (dma_addr & (PAGE_SIZE - 1)) {
+			if (i > 0)
+				goto out_unmap;
+			dma_addr &= ~((u64)PAGE_SIZE - 1);	/* round down, keep all 64 bits */
+		}
+		if (end_addr & (PAGE_SIZE - 1)) {
+			if (i < sg->dma_len - 1)
+				goto out_unmap;
+			end_addr = (end_addr + PAGE_SIZE - 1) & ~((u64)PAGE_SIZE - 1);
+		}
+
+		sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
+	}
+
+	/* Now gather the dma addrs into one list */
+	if (sg->dma_npages > fastreg_message_size)
+		goto out_unmap;
+
+	dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
+	if (!dma_pages) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+	/* Pass 2: record the page-aligned address of every page of each entry. */
+	for (i = j = 0; i < sg->dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+		u64 end_addr;
+
+		end_addr = dma_addr + dma_len;
+		dma_addr &= ~((u64)PAGE_SIZE - 1);
+		for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
+			dma_pages[j++] = dma_addr;
+		BUG_ON(j > sg->dma_npages);
+	}
+
+	return dma_pages;	/* the caller kfree()s the array */
+
+out_unmap:
+	ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+	sg->dma_len = 0;	/* mark the scatterlist as unmapped again */
+	kfree(dma_pages);
+	return ERR_PTR(ret);	/* -EINVAL or -ENOMEM */
+}
+
+
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_mr_pool *mrp = kzalloc(sizeof(*mrp), GFP_KERNEL);
+
+	if (mrp == NULL) {
+		printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Locks, lists and the deferred-flush work item. */
+	spin_lock_init(&mrp->list_lock);
+	mutex_init(&mrp->flush_lock);
+	INIT_LIST_HEAD(&mrp->clean_list);
+	INIT_LIST_HEAD(&mrp->dirty_list);
+	INIT_WORK(&mrp->flush_worker, rds_iw_mr_pool_flush_worker);
+	mrp->device = rds_iwdev;
+
+	/*
+	 * Sizing comes from the fastreg_* settings.  max_items is a hard
+	 * cap on allocated MRs; max_items_soft is the level past which
+	 * items should be freed more aggressively.  Keep
+	 * max_items > max_items_soft > max_items / 2.
+	 */
+	mrp->max_message_size = fastreg_message_size;
+	mrp->max_pages = fastreg_message_size;
+	mrp->max_items = fastreg_pool_size;
+	mrp->max_items_soft = mrp->max_items * 3 / 4;
+	mrp->max_free_pinned = mrp->max_items * mrp->max_message_size / 4;
+
+	return mrp;
+}
+
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
+{
+	const struct rds_iw_mr_pool *mrp = rds_iwdev->mr_pool;
+
+	iinfo->rdma_mr_size = mrp->max_pages;	/* the pool's max_pages setting */
+	iinfo->rdma_mr_max = mrp->max_items;	/* cap on MRs in the pool */
+}
+
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
+{
+	flush_workqueue(rds_wq);	/* wait out flush work queued on rds_wq */
+	rds_iw_flush_mr_pool(pool, 1);	/* free_all: destroy clean and dirty MRs */
+	BUG_ON(atomic_read(&pool->item_count));	/* every MR must be gone by now */
+	BUG_ON(atomic_read(&pool->free_pinned));	/* pinned accounting must be back at 0 */
+	kfree(pool);
+}
+
+static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
+{
+	struct rds_iw_mr *recycled = NULL;
+	unsigned long irqflags;
+
+	/* Pop the head of the clean list, or return NULL when it is empty. */
+	spin_lock_irqsave(&pool->list_lock, irqflags);
+	if (!list_empty(&pool->clean_list)) {
+		recycled = list_first_entry(&pool->clean_list, struct rds_iw_mr, mapping.m_list);
+		list_del_init(&recycled->mapping.m_list);
+	}
+	spin_unlock_irqrestore(&pool->list_lock, irqflags);
+	return recycled;
+}
+
+static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+	struct rds_iw_mr *ibmr = NULL;
+	int err = 0, iter = 0;
+	/* Get an MR: reuse a clean one, else create one while under max_items. */
+	while (1) {
+		ibmr = rds_iw_reuse_fmr(pool);
+		if (ibmr)
+			return ibmr;
+
+		/* No clean MRs - now we have the choice of either
+		 * allocating a fresh MR up to the limit imposed by the
+		 * driver, or flush any dirty unused MRs.
+		 * We try to avoid stalling in the send path if possible,
+		 * so we allocate as long as we're allowed to.
+		 *
+		 * We're fussy with enforcing the FMR limit, though. If the driver
+		 * tells us we can't use more than N fmrs, we shouldn't start
+		 * arguing with it */
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;
+		/* over the limit: take back the slot we just claimed */
+		atomic_dec(&pool->item_count);
+		/* give up with -EAGAIN once two flushes have not helped */
+		if (++iter > 2) {
+			rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
+		rds_iw_flush_mr_pool(pool, 0);
+	}
+	/* item_count already includes the MR we are about to create */
+	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	spin_lock_init(&ibmr->mapping.m_lock);
+	INIT_LIST_HEAD(&ibmr->mapping.m_list);
+	ibmr->mapping.m_mr = ibmr;	/* back pointer from the mapping to its MR */
+
+	err = rds_iw_init_fastreg(pool, ibmr);
+	if (err)
+		goto out_no_cigar;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_alloc);
+	return ibmr;
+
+out_no_cigar:
+	if (ibmr) {
+		rds_iw_destroy_fastreg(pool, ibmr);	/* skips members that are still NULL */
+		kfree(ibmr);
+	}
+	atomic_dec(&pool->item_count);	/* release the slot claimed in the loop */
+	return ERR_PTR(err);
+}
+
+void rds_iw_sync_mr(void *trans_private, int direction)
+{
+	struct rds_iw_mr *ibmr = trans_private;
+	struct rds_iw_device *rds_iwdev = ibmr->device;
+	/* Sync the MR's mapped scatterlist for the CPU or the device, per @direction. */
+	switch (direction) {
+	case DMA_FROM_DEVICE:
+		ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+		break;
+	case DMA_TO_DEVICE:
+		ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+		break;
+	}	/* any other @direction value is a no-op */
+}
+
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+	/*
+	 * With @free_all the goal is every MR the pool currently counts;
+	 * an ordinary flush has no target and yields 0.
+	 */
+	if (!free_all)
+		return 0;
+	return atomic_read(&pool->item_count);
+}
+
+/*
+ * Flush our pool of MRs.
+ * Every MR on the dirty list is laundered and put back on the clean
+ * list; with @free_all set, all clean and laundered MRs are destroyed
+ * instead.  Takes pool->flush_lock (a mutex).  Always returns 0.
+ */
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+{
+	struct rds_iw_mr *ibmr, *next;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(kill_list);
+	unsigned long flags;
+	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
+
+	mutex_lock(&pool->flush_lock);
+
+	spin_lock_irqsave(&pool->list_lock, flags);
+	/* Get the list of all mappings to be destroyed */
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	if (free_all)
+		list_splice_init(&pool->clean_list, &kill_list);
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+
+	/*
+	 * rds_iw_free_fastreg() charged m_sg.len to free_pinned for each
+	 * MR it queued as dirty.  Total that charge for the MRs we are
+	 * about to launder so that it can be given back further down.
+	 */
+	list_for_each_entry(ibmr, &unmap_list, mapping.m_list)
+		unpinned += ibmr->mapping.m_sg.len;
+
+	/* Batched invalidate of dirty MRs: the unmap helper hands every
+	 * entry back on unmap_list and reports how many it cleaned. */
+	if (!list_empty(&unmap_list)) {
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+		/* If we've been asked to destroy all MRs, move those
+		 * that were simply cleaned to the kill list */
+		if (free_all)
+			list_splice_init(&unmap_list, &kill_list);
+	}
+
+	/* Destroy any MRs that are past their best before date */
+	list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
+		rds_iw_stats_inc(s_iw_rdma_mr_free);
+		list_del(&ibmr->mapping.m_list);
+		rds_iw_destroy_fastreg(pool, ibmr);
+		kfree(ibmr);
+		nfreed++;
+	}
+
+	/* Anything that remains are laundered ibmrs, which we can add
+	 * back to the clean list. */
+	if (!list_empty(&unmap_list)) {
+		spin_lock_irqsave(&pool->list_lock, flags);
+		list_splice(&unmap_list, &pool->clean_list);
+		spin_unlock_irqrestore(&pool->list_lock, flags);
+	}
+
+	atomic_sub(unpinned, &pool->free_pinned);
+	atomic_sub(ncleaned, &pool->dirty_count);
+	atomic_sub(nfreed, &pool->item_count);
+
+	mutex_unlock(&pool->flush_lock);
+	return 0;
+}
+
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
+{
+	struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
+
+	rds_iw_flush_mr_pool(pool, 0);	/* work handler: flush without free_all */
+}
+
+void rds_iw_free_mr(void *trans_private, int invalidate)
+{
+	struct rds_iw_mr *ibmr = trans_private;
+	struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
+
+	rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
+	if (!pool)	/* the device has no MR pool to hand the MR back to */
+		return;
+
+	/* Return it to the pool's free list */
+	rds_iw_free_fastreg(pool, ibmr);
+
+	/* If we've pinned too many pages, request a flush */
+	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_work(rds_wq, &pool->flush_worker);
+	/* @invalidate: flush right away when possible, else defer to the worker. */
+	if (invalidate) {
+		if (likely(!in_interrupt())) {
+			rds_iw_flush_mr_pool(pool, 0);
+		} else {
+			/* We get here if the user created a MR marked
+			 * as use_once and invalidate at the same time. */
+			queue_work(rds_wq, &pool->flush_worker);
+		}
+	}
+}
+
+void rds_iw_flush_mrs(void)
+{
+	struct rds_iw_device *dev;
+	struct rds_iw_mr_pool *mrp;
+
+	list_for_each_entry(dev, &rds_iw_devices, list) {
+		mrp = dev->mr_pool;
+		if (mrp)	/* devices without a pool have nothing to flush */
+			rds_iw_flush_mr_pool(mrp, 0);
+	}
+}
+
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret)
+{
+	struct rds_iw_device *rds_iwdev;
+	struct rds_iw_mr *ibmr = NULL;
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);	/* nonzero: no match for @rs */
+	if (ret || !cm_id) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!rds_iwdev->mr_pool) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ibmr = rds_iw_alloc_mr(rds_iwdev);
+	if (IS_ERR(ibmr))
+		return ibmr;	/* pass the ERR_PTR straight through */
+
+	ibmr->cm_id = cm_id;
+	ibmr->device = rds_iwdev;
+	/* Map @sg and post the fast-register WR; on success report the rkey. */
+	ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+	if (ret == 0)
+		*key_ret = ibmr->mr->rkey;
+	else
+		printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
+
+out:
+	if (ret) {
+		if (ibmr)	/* still NULL on the early -ENODEV paths */
+			rds_iw_free_mr(ibmr, 0);
+		ibmr = ERR_PTR(ret);
+	}
+	return ibmr;	/* the MR on success, ERR_PTR(-errno) otherwise */
+}
+
+/*
+ * iWARP fastreg handling
+ *
+ * The life cycle of a fastreg registration is a bit different from
+ * FMRs.
+ * The idea behind fastreg is to have one MR, to which we bind different
+ * mappings over time. To avoid stalling on the expensive map and invalidate
+ * operations, these operations are pipelined on the same send queue on
+ * which we want to send the message containing the r_key.
+ *
+ * This creates a bit of a problem for us, as we do not have the destination
+ * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
+ * RDMA to be correctly setup.  If a fastreg request is present, rds_iw_xmit
+ * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * before queuing the SEND. When completions for these arrive, they are
+ * dispatched to the MR, which has a bit set showing that RDMA can be performed.
+ *
+ * There is another interesting aspect that's related to invalidation.
+ * The application can request that a mapping is invalidated in FREE_MR.
+ * The expectation there is that this invalidation step includes ALL
+ * PREVIOUSLY FREED MRs.
+ */
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
+				struct rds_iw_mr *ibmr)
+{
+	struct rds_iw_device *rds_iwdev = pool->device;
+	struct ib_fast_reg_page_list *page_list = NULL;
+	struct ib_mr *mr;
+	int err;
+	/* Allocate the fast-reg MR and page list for @ibmr; 0 or a negative errno. */
+	mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+
+		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+		return err;
+	}
+
+	/* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
+	 * is not filled in.
+	 */
+	page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
+	if (IS_ERR(page_list)) {
+		err = PTR_ERR(page_list);
+
+		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
+		ib_dereg_mr(mr);	/* do not leak the MR allocated above */
+		return err;
+	}
+	/* Both allocations succeeded: attach them to the MR. */
+	ibmr->page_list = page_list;
+	ibmr->mr = mr;
+	return 0;
+}
+
+static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+{
+	struct rds_iw_mr *ibmr = mapping->m_mr;
+	struct ib_send_wr f_wr, *failed_wr;
+	int ret;
+
+	/*
+	 * Perform a WR for the fast_reg_mr. Each individual page
+	 * in the sg list is added to the fast reg page list and placed
+	 * inside the fast_reg_mr WR.  The key used is a rolling 8bit
+	 * counter, which should guarantee uniqueness.
+	 */
+	ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
+	mapping->m_rkey = ibmr->mr->rkey;	/* remember the key this mapping was bound with */
+
+	memset(&f_wr, 0, sizeof(f_wr));
+	f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
+	f_wr.opcode = IB_WR_FAST_REG_MR;
+	f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+	f_wr.wr.fast_reg.rkey = mapping->m_rkey;
+	f_wr.wr.fast_reg.page_list = ibmr->page_list;
+	f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;	/* NOTE(review): the list holds dma_npages entries - confirm */
+	f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_READ |
+				IB_ACCESS_REMOTE_WRITE;
+	f_wr.wr.fast_reg.iova_start = 0;
+	f_wr.send_flags = IB_SEND_SIGNALED;
+	/* Post on the cm_id's QP; returns whatever ib_post_send() returned. */
+	failed_wr = &f_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
+	BUG_ON(failed_wr != &f_wr);	/* only one WR was posted, so no other can be reported */
+	if (ret && printk_ratelimit())
+		printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+			__func__, __LINE__, ret);
+	return ret;
+}
+
+static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
+{
+	struct ib_send_wr s_wr, *failed_wr;
+	int ret = 0;
+
+	if (!ibmr->cm_id->qp || !ibmr->mr)	/* nothing to invalidate: returns 0 */
+		goto out;
+	/* Post a signaled LOCAL_INV work request for the MR's current rkey. */
+	memset(&s_wr, 0, sizeof(s_wr));
+	s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
+	s_wr.opcode = IB_WR_LOCAL_INV;
+	s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+	s_wr.send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = &s_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+	if (ret && printk_ratelimit()) {	/* only the log is rate limited; ret is always returned */
+		printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+			__func__, __LINE__, ret);
+		goto out;
+	}
+out:
+	return ret;
+}
+
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+			struct rds_iw_mr *ibmr,
+			struct scatterlist *sg,
+			unsigned int sg_len)
+{
+	struct rds_iw_device *rds_iwdev = pool->device;
+	struct rds_iw_mapping *mapping = &ibmr->mapping;
+	u64 *dma_pages;
+	int i, ret = 0;
+
+	rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
+	/* Map @sg for DMA and get the list of page addresses it covers. */
+	dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+	if (IS_ERR(dma_pages)) {
+		ret = PTR_ERR(dma_pages);
+		dma_pages = NULL;	/* so the kfree() at out: is harmless */
+		goto out;
+	}
+
+	if (mapping->m_sg.dma_len > pool->max_message_size) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+	/* Load the page addresses into the MR's fast-reg page list. */
+	for (i = 0; i < mapping->m_sg.dma_npages; ++i)
+		ibmr->page_list->page_list[i] = dma_pages[i];
+	/* Post the FAST_REG_MR work request that binds them to the MR. */
+	ret = rds_iw_rdma_build_fastreg(mapping);
+	if (ret)
+		goto out;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_used);
+
+out:
+	kfree(dma_pages);	/* temporary array only; freed on every path */
+
+	return ret;	/* 0 or a negative errno */
+}
+
+/* "Free" a fastreg MR: post a LOCAL_INV for it and queue it on the pool's
+ * dirty list.  MRs with no DMA mapping, or whose post fails, are not queued.
+ */
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
+		struct rds_iw_mr *ibmr)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!ibmr->mapping.m_sg.dma_len)
+		return;
+	/* Try to post the LOCAL_INV WR to the queue. */
+	ret = rds_iw_rdma_fastreg_inv(ibmr);
+	if (ret)
+		return;
+
+	/* Posted: park the MR on the dirty list and account for it. */
+	spin_lock_irqsave(&pool->list_lock, flags);
+
+	list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
+	atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+}
+
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+				struct list_head *unmap_list,
+				struct list_head *kill_list)
+{
+	struct rds_iw_mapping *mapping, *next;
+	unsigned int ncleaned = 0;
+	LIST_HEAD(laundered);
+
+	/* Batched invalidation of fastreg MRs.
+	 * Why do we do it this way, even though we could pipeline unmap
+	 * and remap? The reason is the application semantics - when the
+	 * application requests an invalidation of MRs, it expects all
+	 * previously released R_Keys to become invalid.
+	 *
+	 * If we implement MR reuse naively, we risk memory corruption
+	 * (this has actually been observed). So the default behavior
+	 * requires that a MR goes through an explicit unmap operation before
+	 * we can reuse it again.
+	 *
+	 * We could probably improve on this a little, by allowing immediate
+	 * reuse of a MR on the same socket (eg you could add small
+	 * cache of unused MRs to struct rds_socket - GET_MR could grab one
+	 * of these without requiring an explicit invalidate).
+	 */
+	while (!list_empty(unmap_list)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->list_lock, flags);
+		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+			list_move(&mapping->m_list, &laundered);
+			ncleaned++;
+		}
+		spin_unlock_irqrestore(&pool->list_lock, flags);
+	}
+
+	/* Move all laundered mappings back to the unmap list.
+	 * We do not kill any WRs right now (@kill_list is left untouched) -
+	 * it doesn't seem the fastreg API has a max_remap limit. */
+	list_splice_init(&laundered, unmap_list);
+
+	return ncleaned;	/* number of mappings laundered */
+}
+
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
+		struct rds_iw_mr *ibmr)
+{
+	if (ibmr->page_list)	/* either member may be NULL if setup failed */
+		ib_free_fast_reg_page_list(ibmr->page_list);
+	if (ibmr->mr)	/* @pool is unused; @ibmr itself is freed by the caller */
+		ib_dereg_mr(ibmr->mr);
+}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 00000000..5e57347f
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static struct kmem_cache *rds_iw_incoming_slab;	/* cache of struct rds_iw_incoming, created in rds_iw_recv_init() */
+static struct kmem_cache *rds_iw_frag_slab;	/* cache of struct rds_page_frag, created in rds_iw_recv_init() */
+static atomic_t	rds_iw_allocation = ATOMIC_INIT(0);	/* incomings currently allocated; capped by rds_iw_sysctl_max_recv_allocation */
+
+/* Free the page held by @frag and leave the frag without a page. */
+static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
+{
+	struct page *page = frag->f_page;
+
+	rdsdebug("frag %p page %p\n", frag, page);
+
+	frag->f_page = NULL;
+	__free_page(page);
+}
+
+/*
+ * Return @frag to the fragment slab.  The caller must already have
+ * dropped the frag's page; a frag that still has one is a bug.
+ */
+static void rds_iw_frag_free(struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, frag->f_page);
+
+	BUG_ON(frag->f_page != NULL);
+
+	kmem_cache_free(rds_iw_frag_slab, frag);
+}
+
+/*
+ * Undo the DMA mapping of the fragment attached to @recv, if it has one,
+ * and mark the fragment as unmapped.
+ *
+ * Pages are mapped a page at a time and their fragments are posted in
+ * order, so this is called in fragment order as completion events arrive.
+ *
+ * Ring cleanup may call this in whatever order it likes: no DMA is in
+ * flight at that point, so unmapping is fine even while other ring entries
+ * still hold page references in their frags.
+ */
+static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
+				   struct rds_iw_recv_work *recv)
+{
+	struct rds_page_frag *frag;
+
+	frag = recv->r_frag;
+	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+
+	if (frag->f_mapped != 0)
+		ib_dma_unmap_page(ic->i_cm_id->device, frag->f_mapped,
+				  RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+
+	/* A zero f_mapped is what tells the check above "not mapped". */
+	frag->f_mapped = 0;
+}
+
+/*
+ * Put every receive work request of the ring into its initial state: no
+ * incoming and no fragment attached, wr_id equal to the slot index, and a
+ * scatter list with a data entry (address filled in at refill time) and a
+ * header entry pointing at this slot's header in i_recv_hdrs_dma.
+ */
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
+{
+	u32 slot;
+
+	for (slot = 0; slot < ic->i_recv_ring.w_nr; slot++) {
+		struct rds_iw_recv_work *recv = &ic->i_recvs[slot];
+		struct ib_sge *entry;
+
+		recv->r_iwinc = NULL;
+		recv->r_frag = NULL;
+
+		recv->r_wr.next = NULL;
+		recv->r_wr.wr_id = slot;
+		recv->r_wr.sg_list = recv->r_sge;
+		recv->r_wr.num_sge = RDS_IW_RECV_SGE;
+
+		/* Data entry: no page is attached yet. */
+		entry = rds_iw_data_sge(ic, recv->r_sge);
+		entry->addr = 0;
+		entry->length = RDS_FRAG_SIZE;
+		entry->lkey = 0;
+
+		/* Header entry: one struct rds_header per ring slot. */
+		entry = rds_iw_header_sge(ic, recv->r_sge);
+		entry->addr = ic->i_recv_hdrs_dma +
+			      (slot * sizeof(struct rds_header));
+		entry->length = sizeof(struct rds_header);
+		entry->lkey = 0;
+	}
+}
+
+/*
+ * Strip one ring entry of everything it holds: the incoming (released
+ * with rds_inc_put()) and the fragment (unmapped, its page dropped if it
+ * has one, then freed).
+ */
+static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
+				  struct rds_iw_recv_work *recv)
+{
+	struct rds_page_frag *frag;
+
+	if (recv->r_iwinc) {
+		rds_inc_put(&recv->r_iwinc->ii_inc);
+		recv->r_iwinc = NULL;
+	}
+
+	frag = recv->r_frag;
+	if (!frag)
+		return;
+
+	rds_iw_recv_unmap_page(ic, recv);
+	if (frag->f_page)
+		rds_iw_frag_drop_page(frag);
+	rds_iw_frag_free(frag);
+	recv->r_frag = NULL;
+}
+
+/*
+ * Clear every entry of the receive ring, then drop the partially used
+ * page the connection keeps for carving out fragments, if there is one.
+ */
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
+{
+	u32 slot;
+
+	for (slot = 0; slot < ic->i_recv_ring.w_nr; slot++) {
+		struct rds_iw_recv_work *recv = ic->i_recvs + slot;
+
+		rds_iw_recv_clear_one(ic, recv);
+	}
+
+	if (ic->i_frag.f_page != NULL)
+		rds_iw_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_iw_recv_refill_one(struct rds_connection *conn,
+				  struct rds_iw_recv_work *recv,
+				  gfp_t kptr_gfp, gfp_t page_gfp)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	dma_addr_t dma_addr;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;	/* returned by every failure path below */
+
+	if (!recv->r_iwinc) {	/* an inc left over from an earlier attempt is reused */
+		if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
+			rds_iw_stats_inc(s_iw_rx_alloc_limit);
+			goto out;
+		}
+		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
+						 kptr_gfp);
+		if (!recv->r_iwinc) {
+			atomic_dec(&rds_iw_allocation);	/* undo the count taken just above */
+			goto out;
+		}
+		INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
+		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
+	}
+
+	if (!recv->r_frag) {	/* likewise, an existing frag descriptor is reused */
+		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
+		if (!recv->r_frag)
+			goto out;
+		INIT_LIST_HEAD(&recv->r_frag->f_item);
+		recv->r_frag->f_page = NULL;
+	}
+
+	if (!ic->i_frag.f_page) {	/* ic->i_frag tracks the page currently being carved into frags */
+		ic->i_frag.f_page = alloc_page(page_gfp);
+		if (!ic->i_frag.f_page)
+			goto out;
+		ic->i_frag.f_offset = 0;
+	}
+
+	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+				  ic->i_frag.f_page,
+				  ic->i_frag.f_offset,
+				  RDS_FRAG_SIZE,
+				  DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+		goto out;
+
+	/*
+	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
+	 * must be called on this recv.  This happens as completions hit
+	 * in order or on connection shutdown.
+	 *
+	 * NOTE(review): no rds_iw_frag_unmap() is visible in this file; the
+	 * unmapping helper defined here is rds_iw_recv_unmap_page().
+	 */
+	recv->r_frag->f_page = ic->i_frag.f_page;
+	recv->r_frag->f_offset = ic->i_frag.f_offset;
+	recv->r_frag->f_mapped = dma_addr;
+
+	sge = rds_iw_data_sge(ic, recv->r_sge);
+	sge->addr = dma_addr;
+	sge->length = RDS_FRAG_SIZE;
+
+	sge = rds_iw_header_sge(ic, recv->r_sge);
+	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+	sge->length = sizeof(struct rds_header);
+
+	get_page(recv->r_frag->f_page);	/* the recv's frag takes its own page reference */
+
+	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+		ic->i_frag.f_offset += RDS_FRAG_SIZE;	/* next refill uses the next fragment of this page */
+	} else {
+		put_page(ic->i_frag.f_page);	/* page fully handed out: drop the connection's reference */
+		ic->i_frag.f_page = NULL;
+		ic->i_frag.f_offset = 0;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ *
+ * Return values as implemented: 0 when the loop ends without an error,
+ * -EINVAL when the ring hands out a position beyond w_nr, and -1 when
+ * either rds_iw_recv_refill_one() or ib_post_recv() fails.  On any
+ * non-zero return the one ring entry allocated for the failed iteration
+ * is handed back with rds_iw_ring_unalloc().
+ *
+ * With @prefill set the loop runs regardless of rds_conn_up(); otherwise
+ * it stops as soon as the connection is no longer up.
+ */
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+		       gfp_t page_gfp, int prefill)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_recv_work *recv;
+	struct ib_recv_wr *failed_wr;
+	unsigned int posted = 0;	/* work requests successfully posted in this call */
+	int ret = 0;
+	u32 pos;
+
+	while ((prefill || rds_conn_up(conn)) &&
+	       rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+		if (pos >= ic->i_recv_ring.w_nr) {
+			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+					pos);
+			ret = -EINVAL;
+			break;
+		}
+
+		recv = &ic->i_recvs[pos];
+		ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		if (ret) {
+			ret = -1;	/* the specific error from refill_one is not passed on */
+			break;
+		}
+
+		/* XXX when can this fail? */
+		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+		rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
+			 recv->r_iwinc, recv->r_frag->f_page,
+			 (long) recv->r_frag->f_mapped, ret);
+		if (ret) {
+			rds_iw_conn_error(conn, "recv post on "
+			       "%pI4 returned %d, disconnecting and "
+			       "reconnecting\n", &conn->c_faddr,
+			       ret);
+			ret = -1;
+			break;
+		}
+
+		posted++;
+	}
+
+	/* We're doing flow control - update the window. */
+	if (ic->i_flowctl && posted)
+		rds_iw_advertise_credits(conn, posted);
+
+	if (ret)
+		rds_iw_ring_unalloc(&ic->i_recv_ring, 1);	/* give back the entry of the failed iteration */
+	return ret;
+}
+
+/*
+ * Detach every fragment queued on the incoming that embeds @inc, dropping
+ * each fragment's page and freeing the fragment itself.
+ */
+static void rds_iw_inc_purge(struct rds_incoming *inc)
+{
+	struct rds_iw_incoming *iwinc =
+		container_of(inc, struct rds_iw_incoming, ii_inc);
+	struct rds_page_frag *cur, *tmp;
+
+	rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
+
+	list_for_each_entry_safe(cur, tmp, &iwinc->ii_frags, f_item) {
+		list_del_init(&cur->f_item);
+		rds_iw_frag_drop_page(cur);
+		rds_iw_frag_free(cur);
+	}
+}
+
+/*
+ * Free the incoming that embeds @inc: purge its fragments, return it to
+ * the slab and give back its slot in the global allocation count.
+ */
+void rds_iw_inc_free(struct rds_incoming *inc)
+{
+	struct rds_iw_incoming *iwinc =
+		container_of(inc, struct rds_iw_incoming, ii_inc);
+
+	rds_iw_inc_purge(inc);
+
+	rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
+	/* The purge above must have emptied the fragment list. */
+	BUG_ON(!list_empty(&iwinc->ii_frags));
+	kmem_cache_free(rds_iw_incoming_slab, iwinc);
+
+	atomic_dec(&rds_iw_allocation);
+	BUG_ON(atomic_read(&rds_iw_allocation) < 0);
+}
+
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+			    size_t size)
+{
+	struct rds_iw_incoming *iwinc;
+	struct rds_page_frag *frag;
+	struct iovec *iov = first_iov;
+	unsigned long to_copy;
+	unsigned long frag_off = 0;	/* bytes already consumed from the current frag */
+	unsigned long iov_off = 0;	/* bytes already written into the current iovec */
+	int copied = 0;
+	int ret;
+	u32 len;
+
+	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+	frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+	len = be32_to_cpu(inc->i_hdr.h_len);	/* message length taken from the header */
+
+	while (copied < size && copied < len) {	/* stop at the caller's limit or the message length */
+		if (frag_off == RDS_FRAG_SIZE) {	/* current frag exhausted: step to the next one */
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+		while (iov_off == iov->iov_len) {	/* skip iovecs that are full (or empty) */
+			iov_off = 0;
+			iov++;
+		}
+
+		to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+		to_copy = min_t(size_t, to_copy, size - copied);
+		to_copy = min_t(unsigned long, to_copy, len - copied);
+
+		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+			 "[%p, %lu] + %lu\n",
+			 to_copy, iov->iov_base, iov->iov_len, iov_off,
+			 frag->f_page, frag->f_offset, frag_off);
+
+		/* XXX needs + offset for multiple recvs per page
+		 * NOTE(review): the call below already adds frag->f_offset;
+		 * confirm whether this XXX is stale. */
+		ret = rds_page_copy_to_user(frag->f_page,
+					    frag->f_offset + frag_off,
+					    iov->iov_base + iov_off,
+					    to_copy);
+		if (ret) {
+			copied = ret;	/* a non-zero ret replaces the byte count as the return value */
+			break;
+		}
+
+		iov_off += to_copy;
+		frag_off += to_copy;
+		copied += to_copy;
+	}
+
+	return copied;
+}
+
+/*
+ * Set up the connection's single ACK work request: one scatter entry
+ * covering the ACK header buffer, sent signaled and solicited under the
+ * reserved RDS_IW_ACK_WR_ID.
+ *
+ * ic starts out kzalloc()ed, so fields not assigned here are zero.
+ */
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
+{
+	struct ib_sge *ack_sge = &ic->i_ack_sge;
+	struct ib_send_wr *ack_wr = &ic->i_ack_wr;
+
+	/* The scatter entry describes exactly one RDS header. */
+	ack_sge->addr = ic->i_ack_dma;
+	ack_sge->length = sizeof(struct rds_header);
+	ack_sge->lkey = rds_iw_local_dma_lkey(ic);
+
+	ack_wr->wr_id = RDS_IW_ACK_WR_ID;
+	ack_wr->opcode = IB_WR_SEND;
+	ack_wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+	ack_wr->sg_list = ack_sge;
+	ack_wr->num_sge = 1;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received.  The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory.  This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed.  This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue.  To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time.  This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight.  This might not be good enough.
+ *
+ * This is implemented by having a long-lived send_wr and sge which point to a
+ * statically allocated ack frame.  This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do.  The QP attribute specifically makes
+ * room for it beyond the ring size.  Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+#ifndef KERNEL_HAS_ATOMIC64	/* no atomic64: i_ack_next is read and written under i_ack_lock */
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+				int ack_required)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);	/* flag is only ever set here, never cleared */
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+	unsigned long flags;
+	u64 seq;
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);	/* request is cleared before the sequence is read */
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	seq = ic->i_ack_next;
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+	return seq;
+}
+#else	/* KERNEL_HAS_ATOMIC64: i_ack_next is an atomic64, no lock taken */
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+				int ack_required)
+{
+	atomic64_set(&ic->i_ack_next, seq);
+	if (ack_required) {
+		smp_mb__before_clear_bit();	/* NOTE(review): a clear_bit barrier placed before set_bit; presumably orders the sequence store ahead of the flag - confirm */
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	}
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	smp_mb__after_clear_bit();	/* barrier between clearing the flag and reading the sequence */
+
+	return atomic64_read(&ic->i_ack_next);
+}
+#endif	/* KERNEL_HAS_ATOMIC64 */
+
+
+/*
+ * Fill in the connection's ACK header with the current ACK sequence and
+ * @adv_credits, then post the long-lived ACK work request.
+ *
+ * If the post fails, the in-flight marker is released and another ACK is
+ * requested before the connection is torn down through
+ * rds_iw_conn_error().
+ */
+static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
+{
+	struct rds_header *hdr = ic->i_ack;
+	struct ib_send_wr *failed_wr;
+	u64 seq = rds_iw_get_ack(ic);
+	int ret;
+
+	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+
+	rds_message_populate_header(hdr, 0, 0, 0);
+	hdr->h_ack = cpu_to_be64(seq);
+	hdr->h_credit = adv_credits;
+	rds_message_make_checksum(hdr);
+
+	ic->i_ack_queued = jiffies;
+
+	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+	if (likely(!ret)) {
+		rds_iw_stats_inc(s_iw_ack_sent);
+		return;
+	}
+
+	/* Failed to send. Release the WR, and force another ACK. */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	rds_iw_stats_inc(s_iw_ack_send_failure);
+
+	rds_iw_conn_error(ic->conn, "sending ack failed\n");
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ *  1.	We call rds_iw_attempt_ack from the recv completion handler
+ *	to send an ACK-only frame.
+ *	However, there can be only one such frame in the send queue
+ *	at any time, so we may have to postpone it.
+ *  2.	When another (data) packet is transmitted while there's
+ *	an ACK in the queue, we piggyback the ACK sequence number
+ *	on the data packet.
+ *  3.	If the ACK WR is done sending, we get called from the
+ *	send queue completion handler, and check whether there's
+ *	another ACK pending (postponed because the WR was on the
+ *	queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ *  -	i_ack_flags, which keeps track of whether the ACK WR
+ *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ *  -	i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ *
+ * Within this file it is also called from rds_iw_recv() and from
+ * rds_iw_ack_send_complete().
+ *
+ * Nothing is sent unless IB_ACK_REQUESTED is set.  IB_ACK_IN_FLIGHT is
+ * then claimed with test_and_set_bit(); if it was already set the ACK is
+ * postponed.  If no send credit can be grabbed the claim is released
+ * again and IB_ACK_REQUESTED stays set for a later attempt.
+ */
+void rds_iw_attempt_ack(struct rds_iw_connection *ic)
+{
+	unsigned int adv_credits;
+
+	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		return;
+
+	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+		rds_iw_stats_inc(s_iw_ack_send_delayed);	/* the ACK WR is still on the send queue */
+		return;
+	}
+
+	/* Can we get a send credit? */
+	if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+		rds_iw_stats_inc(s_iw_tx_throttle);
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);	/* give up the claim taken above */
+		return;
+	}
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	rds_iw_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ *
+ * The ACK work request is marked free again (IB_ACK_IN_FLIGHT cleared)
+ * and rds_iw_attempt_ack() is given the chance to send an ACK that was
+ * requested in the meantime.
+ */
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
+{
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	rds_iw_attempt_ack(ic);
+}
+
+/*
+ * Called by the regular xmit code when it wants to carry an ACK on an
+ * outgoing frame.  A pending ACK request is consumed (and counted as
+ * piggybacked); the current ACK sequence number is returned either way.
+ */
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
+{
+	int was_requested;
+
+	was_requested = test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	if (was_requested)
+		rds_iw_stats_inc(s_iw_ack_send_piggybacked);
+
+	return rds_iw_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
+ * them.  But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient.  By copying we can share a simpler core with TCP which has to
+ * copy.
+ *
+ * The received map is copied 64 bits at a time from the fragments queued
+ * on @iwinc into the pages of conn->c_fcong.  A message whose header
+ * length is not exactly RDS_CONG_MAP_BYTES is ignored.  When the copy is
+ * done, rds_cong_map_updated() is called with the accumulated mask of
+ * bits that were set in the old map and are clear in the new one.
+ */
+static void rds_iw_cong_recv(struct rds_connection *conn,
+			      struct rds_iw_incoming *iwinc)
+{
+	struct rds_cong_map *map;
+	unsigned int map_off;	/* byte offset inside the current map page */
+	unsigned int map_page;	/* index into map->m_page_addrs[] */
+	struct rds_page_frag *frag;
+	unsigned long frag_off;	/* byte offset inside the current fragment */
+	unsigned long to_copy;
+	unsigned long copied;
+	uint64_t uncongested = 0;
+	void *addr;
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map = conn->c_fcong;
+	map_page = 0;
+	map_off = 0;
+
+	frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+	frag_off = 0;
+
+	copied = 0;
+
+	while (copied < RDS_CONG_MAP_BYTES) {
+		uint64_t *src, *dst;
+		unsigned int k;
+
+		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);	/* stay inside both the frag and the map page */
+		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+
+		src = addr + frag_off;	/* NOTE(review): frag->f_offset is not added here - confirm this is intended */
+		dst = (void *)map->m_page_addrs[map_page] + map_off;
+		for (k = 0; k < to_copy; k += 8) {
+			/* Record ports that became uncongested, ie
+			 * bits that are set in the old map (*dst) but
+			 * clear in the received one (*src). */
+			uncongested |= ~(*src) & *dst;
+			*dst++ = *src++;
+		}
+		kunmap_atomic(addr, KM_SOFTIRQ0);
+
+		copied += to_copy;
+
+		map_off += to_copy;
+		if (map_off == PAGE_SIZE) {
+			map_off = 0;
+			map_page++;
+		}
+
+		frag_off += to_copy;
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+	}
+
+	/* the congestion map is in little endian order */
+	uncongested = le64_to_cpu(uncongested);
+
+	rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_iw_ack_state {	/* ACK information gathered while one tasklet run polls the receive CQ */
+	u64		ack_next;	/* h_sequence of the last message passed to rds_recv_incoming() */
+	u64		ack_recv;	/* h_ack taken from the last header that passed the checksum */
+	unsigned int	ack_required:1;	/* a completed message had RDS_FLAG_ACK_REQUIRED set */
+	unsigned int	ack_next_valid:1;	/* ack_next has been written */
+	unsigned int	ack_recv_valid:1;	/* ack_recv has been written */
+};
+
+static void rds_iw_process_recv(struct rds_connection *conn,
+				struct rds_iw_recv_work *recv, u32 byte_len,
+				struct rds_iw_ack_state *state)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_incoming *iwinc = ic->i_iwinc;	/* message being reassembled, or NULL */
+	struct rds_header *ihdr, *hdr;	/* ihdr: header just received; hdr: header stored in the inc */
+
+	/* XXX shut down the connection if port 0,0 are seen? */
+
+	rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
+		 byte_len);
+
+	if (byte_len < sizeof(struct rds_header)) {
+		rds_iw_conn_error(conn, "incoming message "
+		       "from %pI4 didn't inclue a "
+		       "header, disconnecting and "
+		       "reconnecting\n",
+		       &conn->c_faddr);
+		return;
+	}
+	byte_len -= sizeof(struct rds_header);	/* from here on byte_len counts payload only */
+
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];	/* header buffer with the same index as this recv */
+
+	/* Validate the checksum. */
+	if (!rds_message_verify_checksum(ihdr)) {
+		rds_iw_conn_error(conn, "incoming message "
+		       "from %pI4 has corrupted header - "
+		       "forcing a reconnect\n",
+		       &conn->c_faddr);
+		rds_stats_inc(s_recv_drop_bad_checksum);
+		return;
+	}
+
+	/* Process the ACK sequence which comes with every packet.
+	 * It is only recorded in @state here; the caller acts on it. */
+	state->ack_recv = be64_to_cpu(ihdr->h_ack);
+	state->ack_recv_valid = 1;
+
+	/* Process the credits update if there was one */
+	if (ihdr->h_credit)
+		rds_iw_send_add_credits(conn, ihdr->h_credit);
+
+	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+		/* This is an ACK-only packet. The fact that it gets
+		 * special treatment here is that historically, ACKs
+		 * were rather special beasts.
+		 */
+		rds_iw_stats_inc(s_iw_ack_received);
+
+		/*
+		 * Usually the frags make their way on to incs and are then freed as
+		 * the inc is freed.  We don't go that route, so we have to drop the
+		 * page ref ourselves.  We can't just leave the page on the recv
+		 * because that confuses the dma mapping of pages and each recv's use
+		 * of a partial page.  We can leave the frag, though, it will be
+		 * reused.
+		 *
+		 * FIXME: Fold this into the code path below.
+		 */
+		rds_iw_frag_drop_page(recv->r_frag);
+		return;
+	}
+
+	/*
+	 * If we don't already have an inc on the connection then this
+	 * fragment has a header and starts a message.. copy its header
+	 * into the inc and save the inc so we can hang upcoming fragments
+	 * off its list.
+	 */
+	if (!iwinc) {
+		iwinc = recv->r_iwinc;
+		recv->r_iwinc = NULL;	/* the recv no longer holds the inc; it hangs off the connection now */
+		ic->i_iwinc = iwinc;
+
+		hdr = &iwinc->ii_inc.i_hdr;
+		memcpy(hdr, ihdr, sizeof(*hdr));
+		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);	/* payload bytes still expected for this message */
+
+		rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
+			 ic->i_recv_data_rem, hdr->h_flags);
+	} else {
+		hdr = &iwinc->ii_inc.i_hdr;
+		/* We can't just use memcmp here; fragments of a
+		 * single message may carry different ACKs */
+		if (hdr->h_sequence != ihdr->h_sequence ||
+		    hdr->h_len != ihdr->h_len ||
+		    hdr->h_sport != ihdr->h_sport ||
+		    hdr->h_dport != ihdr->h_dport) {
+			rds_iw_conn_error(conn,
+				"fragment header mismatch; forcing reconnect\n");
+			return;
+		}
+	}
+
+	list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);	/* the frag moves from the recv to the inc */
+	recv->r_frag = NULL;
+
+	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+		ic->i_recv_data_rem -= RDS_FRAG_SIZE;	/* more fragments of this message are still to come */
+	else {
+		ic->i_recv_data_rem = 0;
+		ic->i_iwinc = NULL;	/* message complete: the next fragment starts a new one */
+
+		if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+			rds_iw_cong_recv(conn, iwinc);
+		else {
+			rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+					  &iwinc->ii_inc, GFP_ATOMIC,
+					  KM_SOFTIRQ0);
+			state->ack_next = be64_to_cpu(hdr->h_sequence);
+			state->ack_next_valid = 1;
+		}
+
+		/* Evaluate the ACK_REQUIRED flag *after* we received
+		 * the complete frame, and after bumping the next_rx
+		 * sequence. */
+		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+			rds_stats_inc(s_recv_ack_required);
+			state->ack_required = 1;
+		}
+
+		rds_inc_put(&iwinc->ii_inc);
+	}
+}
+
+/*
+ * Completion callback of the receive CQ.  @context is the rds_connection;
+ * all real work is deferred to the connection's receive tasklet.
+ *
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring.  Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_iw_connection *ic;
+
+	ic = conn->c_transport_data;
+
+	rdsdebug("conn %p cq %p\n", conn, cq);
+	rds_iw_stats_inc(s_iw_rx_cq_call);
+
+	tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+/*
+ * Drain the receive CQ one completion at a time.  Each completion is
+ * matched with the oldest ring entry, which is unmapped, processed (only
+ * while the connection is up or connecting) and then released back to
+ * the ring.  ACK information is accumulated in @state.
+ */
+static inline void rds_poll_cq(struct rds_iw_connection *ic,
+			       struct rds_iw_ack_state *state)
+{
+	struct rds_connection *conn = ic->conn;
+	struct rds_iw_recv_work *recv;
+	struct ib_wc wc;
+
+	for (;;) {
+		int active;
+
+		if (ib_poll_cq(ic->i_recv_cq, 1, &wc) <= 0)
+			break;
+
+		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_iw_stats_inc(s_iw_rx_cq_event);
+
+		recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
+
+		rds_iw_recv_unmap_page(ic, recv);
+
+		/*
+		 * Also process recvs in connecting state because it is possible
+		 * to get a recv completion _before_ the rdmacm ESTABLISHED
+		 * event is processed.
+		 */
+		active = rds_conn_up(conn) || rds_conn_connecting(conn);
+
+		/* We expect errors as the qp is drained during shutdown */
+		if (active && wc.status == IB_WC_SUCCESS)
+			rds_iw_process_recv(conn, recv, wc.byte_len, state);
+		else if (active)
+			rds_iw_conn_error(conn, "recv completion on "
+			       "%pI4 had status %u, disconnecting and "
+			       "reconnecting\n", &conn->c_faddr,
+			       wc.status);
+
+		rds_iw_ring_free(&ic->i_recv_ring, 1);
+	}
+}
+
+void rds_iw_recv_tasklet_fn(unsigned long data)
+{
+	struct rds_iw_connection *ic = (struct rds_iw_connection *) data;	/* data carries the rds_iw_connection pointer */
+	struct rds_connection *conn = ic->conn;
+	struct rds_iw_ack_state state = { 0, };	/* filled in by rds_iw_process_recv() via rds_poll_cq() */
+
+	rds_poll_cq(ic, &state);
+	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+	rds_poll_cq(ic, &state);	/* second pass after re-arming; presumably picks up completions that arrived in between */
+
+	if (state.ack_next_valid)
+		rds_iw_set_ack(ic, state.ack_next, state.ack_required);
+	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {	/* only act when the peer's ACK moved forward */
+		rds_send_drop_acked(conn, state.ack_recv, NULL);
+		ic->i_ack_recv = state.ack_recv;
+	}
+	if (rds_conn_up(conn))
+		rds_iw_attempt_ack(ic);
+
+	/* If we ever end up with a really empty receive ring, we're
+	 * in deep trouble, as the sender will definitely see RNR
+	 * timeouts. */
+	if (rds_iw_ring_empty(&ic->i_recv_ring))
+		rds_iw_stats_inc(s_iw_rx_ring_empty);
+
+	/*
+	 * If the ring is running low, then schedule the thread to refill.
+	 */
+	if (rds_iw_ring_low(&ic->i_recv_ring))
+		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+/*
+ * Refill the receive ring from process context and, if the connection is
+ * up, try to send a pending ACK.
+ *
+ * Returns 0 on success or -ENOMEM when the refill reported a failure; a
+ * temporary posting failure in this context means we're really low and
+ * the caller should back off for a bit.
+ */
+int rds_iw_recv(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	int refill_failed;
+
+	rdsdebug("conn %p\n", conn);
+
+	mutex_lock(&ic->i_recv_mutex);
+	refill_failed = rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0);
+	if (!refill_failed)
+		rds_iw_stats_inc(s_iw_rx_refill_from_thread);
+	mutex_unlock(&ic->i_recv_mutex);
+
+	if (rds_conn_up(conn))
+		rds_iw_attempt_ack(ic);
+
+	return refill_failed ? -ENOMEM : 0;
+}
+
+/*
+ * Module-level setup of the receive path: compute the default cap on
+ * receive allocations and create the two slab caches.
+ *
+ * Returns 0 on success, -ENOMEM if either cache cannot be created (the
+ * first cache is destroyed again if the second one fails).
+ */
+int rds_iw_recv_init(void)
+{
+	struct sysinfo si;
+
+	/*
+	 * Default cap: the number of RDS fragments that fit into one third
+	 * of the total RAM reported by si_meminfo().
+	 */
+	si_meminfo(&si);
+	rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+	rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
+					sizeof(struct rds_iw_incoming),
+					0, 0, NULL);
+	if (!rds_iw_incoming_slab)
+		return -ENOMEM;
+
+	rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
+					sizeof(struct rds_page_frag),
+					0, 0, NULL);
+	if (!rds_iw_frag_slab) {
+		kmem_cache_destroy(rds_iw_incoming_slab);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void rds_iw_recv_exit(void)	/* tears down what rds_iw_recv_init() created */
+{
+	kmem_cache_destroy(rds_iw_incoming_slab);	/* cache of struct rds_iw_incoming */
+	kmem_cache_destroy(rds_iw_frag_slab);	/* cache of struct rds_page_frag */
+}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 00000000..da8e3b63
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ *
+ * Wait queue woken by rds_iw_ring_free() when a ring has become empty
+ * and someone is waiting on it.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
+
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
+{
+	memset(ring, 0, sizeof(*ring));	/* every field, pointers and counters alike, starts at zero */
+	ring->w_nr = nr;	/* number of entries in the ring */
+	rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+/*
+ * Number of ring entries currently allocated: the difference between the
+ * free-running allocation and free counters.  More than w_nr entries in
+ * use means the accounting is corrupt.
+ */
+static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
+{
+	/* This assumes that atomic_t has at least as many bits as u32 */
+	u32 used = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+
+	BUG_ON(used > ring->w_nr);
+	return used;
+}
+
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
+{
+	/* We only ever get called from the connection setup code,
+	 * prior to creating the QP.
+	 * Resizing a ring that still has entries in use is a bug; only
+	 * w_nr changes, the pointers and counters are left as they are. */
+	BUG_ON(__rds_iw_ring_used(ring));
+	ring->w_nr = nr;
+}
+
+/* Returns 1 when no ring entry is in use, 0 otherwise. */
+static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+	return !__rds_iw_ring_used(ring);
+}
+
+/*
+ * Allocate up to @val consecutive ring entries.
+ *
+ * Returns the number of entries actually granted (0 if @val is 0 or the
+ * ring is full).  When entries are granted, *@pos is set to the index of
+ * the first one; otherwise *@pos is left untouched.
+ */
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
+{
+	u32 avail = ring->w_nr - __rds_iw_ring_used(ring);
+	u32 granted;
+
+	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+		 ring->w_alloc_ptr, avail);
+
+	if (!val || !avail)
+		return 0;
+
+	granted = min(val, avail);
+	*pos = ring->w_alloc_ptr;
+
+	ring->w_alloc_ptr = (ring->w_alloc_ptr + granted) % ring->w_nr;
+	ring->w_alloc_ctr += granted;
+
+	return granted;
+}
+
+/*
+ * Release the @val oldest ring entries and, if that leaves the ring
+ * empty, wake anyone waiting on rds_iw_ring_empty_wait.
+ */
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
+{
+	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+	atomic_add(val, &ring->w_free_ctr);
+
+	if (!__rds_iw_ring_empty(ring))
+		return;
+
+	if (waitqueue_active(&rds_iw_ring_empty_wait))
+		wake_up(&rds_iw_ring_empty_wait);
+}
+
+/*
+ * Hand back the @val most recently allocated ring entries: the allocation
+ * pointer steps back by @val (modulo the ring size) and the allocation
+ * counter is reduced by the same amount.
+ */
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
+{
+	/*
+	 * Add w_nr before subtracting.  A plain (w_alloc_ptr - val) is u32
+	 * arithmetic and wraps modulo 2^32 whenever val > w_alloc_ptr;
+	 * reducing that modulo w_nr only lands on the right slot when w_nr
+	 * is a power of two.  For power-of-two rings the result is the
+	 * same as before.
+	 */
+	ring->w_alloc_ptr = (ring->w_alloc_ptr + ring->w_nr -
+			     (val % ring->w_nr)) % ring->w_nr;
+	ring->w_alloc_ctr -= val;
+}
+
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring)	/* exported wrapper around __rds_iw_ring_empty() */
+{
+	return __rds_iw_ring_empty(ring);	/* non-zero when no entry is in use */
+}
+
+/* Returns non-zero when at most half of the ring's entries are in use. */
+int rds_iw_ring_low(struct rds_iw_work_ring *ring)
+{
+	u32 half = ring->w_nr >> 1;
+
+	return __rds_iw_ring_used(ring) <= half;
+}
+
+
+/*
+ * returns the oldest alloced ring entry.  This will be the next one
+ * freed.  This can't be called if there are none allocated.
+ *
+ * The value is simply the ring's free pointer; nothing here checks that
+ * an entry is actually allocated.
+ */
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
+{
+	return ring->w_free_ptr;
+}
+
+/*
+ * Returns the number of completed work requests: the count of ring
+ * entries from @oldest up to and including @wr_id, taking one wrap-around
+ * of the ring into account when @wr_id lies before @oldest.
+ */
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
+{
+	unsigned long long last = wr_id;
+	u32 ret;
+
+	if (last >= oldest)
+		ret = last - oldest + 1;
+	else
+		ret = ring->w_nr - oldest + last + 1;
+
+	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+		 wr_id, oldest);
+	return ret;
+}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 00000000..545d8ee3
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Map an IB work completion status onto an RDS RDMA notification status
+ * and hand it to rds_rdma_send_complete().  IB_WC_WR_FLUSH_ERR produces
+ * no notification at all.
+ */
+static void rds_iw_send_rdma_complete(struct rds_message *rm,
+				      int wc_status)
+{
+	int notify_status;
+
+	/* flushed work requests are dropped silently */
+	if (wc_status == IB_WC_WR_FLUSH_ERR)
+		return;
+
+	if (wc_status == IB_WC_SUCCESS)
+		notify_status = RDS_RDMA_SUCCESS;
+	else if (wc_status == IB_WC_REM_ACCESS_ERR)
+		notify_status = RDS_RDMA_REMOTE_ERROR;
+	else
+		notify_status = RDS_RDMA_OTHER_ERROR;
+
+	rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
+				   struct rm_rdma_op *op)
+{
+	/* op_mapped makes a second call for the same op a no-op */
+	if (!op->op_mapped)
+		return;
+	ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, op->op_nents,
+			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	op->op_mapped = 0;
+}
+
+static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
+			  struct rds_iw_send_work *send,
+			  int wc_status)
+{
+	struct rds_message *msg = send->s_rm;
+	struct rm_rdma_op *rdma = &msg->rdma;
+
+	rdsdebug("ic %p send %p rm %p\n", ic, send, msg);
+
+	ib_dma_unmap_sg(ic->i_cm_id->device, msg->data.op_sg,
+			msg->data.op_nents, DMA_TO_DEVICE);
+
+	if (rdma->op_active) {
+		rds_iw_send_unmap_rdma(ic, rdma);
+
+		/* If the user asked to be notified when this message
+		 * completes, the notification could be raised at three
+		 * different points:
+		 *  1.	When the ACK for the RDS message queued with the
+		 *	RDMA arrives.  This reports RDMA status reliably,
+		 *	at the expense of a one-way packet delay.
+		 *  2.	When the IB stack delivers the completion event
+		 *	for the RDMA operation.
+		 *  3.	When the IB stack delivers the completion event
+		 *	for the RDS message accompanying the RDMA.
+		 * Approach #3 is the one implemented here.  For #2,
+		 * rds_rdma_send_complete would have to be called from the
+		 * cq_handler.  For #1 it would not be called at all, and
+		 * notification would be left to the ACK processing code.
+		 *
+		 * Note: the RDMA buffers need no explicit
+		 * ib_dma_sync_sg_for_cpu call.  The RDMA operation has
+		 * been unmapped by the time we get here, and unmapping
+		 * takes care of the syncing.
+		 */
+		rds_iw_send_rdma_complete(msg, wc_status);
+
+		if (rdma->op_write)
+			rds_stats_add(s_send_rdma_bytes, rdma->op_bytes);
+		else
+			rds_stats_add(s_recv_rdma_bytes, rdma->op_bytes);
+	}
+
+	/* Wake up anyone who was waiting for this message to be
+	 * flushed out */
+	rds_message_unmapped(msg);
+
+	rds_message_put(msg);
+	send->s_rm = NULL;
+}
+
+void rds_iw_send_init_ring(struct rds_iw_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_send_ring.w_nr; i++) {
+		struct rds_iw_send_work *send = &ic->i_sends[i];
+		struct ib_sge *sge;
+
+		send->s_mapping = NULL;
+		send->s_op = NULL;
+		send->s_rm = NULL;
+
+		send->s_wr.wr_id = i;		/* slot index doubles as WR id */
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.send_flags = 0;
+		send->s_wr.ex.imm_data = 0;
+		send->s_wr.sg_list = send->s_sge;
+		send->s_wr.num_sge = 1;
+		send->s_wr.next = NULL;
+
+		sge = rds_iw_data_sge(ic, send->s_sge);
+		sge->lkey = 0;
+
+		sge = rds_iw_header_sge(ic, send->s_sge);	/* slot i's header buffer */
+		sge->length = sizeof(struct rds_header);
+		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->lkey = 0;
+
+		send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+		if (IS_ERR(send->s_mr)) {
+			printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+			break;	/* NOTE(review): ERR_PTR stays in s_mr, later slots untouched */
+		}
+
+		send->s_page_list = ib_alloc_fast_reg_page_list(
+			ic->i_cm_id->device, fastreg_message_size);
+		if (IS_ERR(send->s_page_list)) {
+			printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
+			break;	/* NOTE(review): ERR_PTR stays in s_page_list as well */
+		}
+	}
+}
+
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
+{
+	struct rds_iw_send_work *send;
+	u32 i;
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		if (!WARN_ON_ONCE(IS_ERR_OR_NULL(send->s_mr)))	/* init_ring may have failed here */
+			ib_dereg_mr(send->s_mr);
+		if (!WARN_ON_ONCE(IS_ERR_OR_NULL(send->s_page_list)))	/* warn and skip, never free an ERR_PTR */
+			ib_free_fast_reg_page_list(send->s_page_list);
+		if (send->s_wr.opcode == 0xdead)	/* slot already completed */
+			continue;
+		if (send->s_rm)
+			rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+		if (send->s_op)
+			rds_iw_send_unmap_rdma(ic, send->s_op);
+	}
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path.  As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct ib_wc wc;
+	struct rds_iw_send_work *send;
+	u32 completed;
+	u32 oldest;
+	u32 i;
+	int ret;
+
+	rdsdebug("cq %p conn %p\n", cq, conn);
+	rds_iw_stats_inc(s_iw_tx_cq_call);
+	ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (ret)
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_iw_stats_inc(s_iw_tx_cq_event);
+
+		if (wc.status != IB_WC_SUCCESS) {
+			printk(KERN_ERR "WC Error:  status = %d opcode = %d\n", wc.status, wc.opcode);
+			break;	/* NOTE(review): makes the error-status checks further down unreachable */
+		}
+
+		if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
+			ic->i_fastreg_posted = 0;
+			continue;
+		}
+
+		if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
+			ic->i_fastreg_posted = 1;
+			continue;
+		}
+
+		if (wc.wr_id == RDS_IW_ACK_WR_ID) {
+			if (time_after(jiffies, ic->i_ack_queued + HZ/2))	/* wrap-safe jiffies compare */
+				rds_iw_stats_inc(s_iw_tx_stalled);
+			rds_iw_ack_send_complete(ic);
+			continue;
+		}
+
+		oldest = rds_iw_ring_oldest(&ic->i_send_ring);
+
+		completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+		for (i = 0; i < completed; i++) {
+			send = &ic->i_sends[oldest];
+
+			/* In the error case, wc.opcode sometimes contains garbage */
+			switch (send->s_wr.opcode) {
+			case IB_WR_SEND:
+				if (send->s_rm)
+					rds_iw_send_unmap_rm(ic, send, wc.status);
+				break;
+			case IB_WR_FAST_REG_MR:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_READ_WITH_INV:
+				/* Nothing to be done - the SG list will be unmapped
+				 * when the SEND completes. */
+				break;
+			default:
+				if (printk_ratelimit())
+					printk(KERN_NOTICE
+						"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
+						__func__, send->s_wr.opcode);
+				break;
+			}
+
+			send->s_wr.opcode = 0xdead;	/* marks the slot as completed */
+			send->s_wr.num_sge = 1;
+			if (time_after(jiffies, send->s_queued + HZ/2))	/* wrap-safe jiffies compare */
+				rds_iw_stats_inc(s_iw_tx_stalled);
+
+			/* If an RDMA operation produced an error, signal this right
+			 * away. If we don't, the subsequent SEND that goes with this
+			 * RDMA will be canceled with ERR_WFLUSH, and the application
+			 * will never learn that the RDMA failed. */
+			if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+				struct rds_message *rm;
+
+				rm = rds_send_get_message(conn, send->s_op);
+				if (rm)
+					rds_iw_send_rdma_complete(rm, wc.status);
+			}
+
+			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+		}
+
+		rds_iw_ring_free(&ic->i_send_ring, completed);
+
+		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+		    test_bit(0, &conn->c_map_queued))
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+		/* We expect errors as the qp is drained during shutdown */
+		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+			rds_iw_conn_error(conn,
+				"send completion on %pI4 "
+				"had status %u, disconnecting and reconnecting\n",
+				&conn->c_faddr, wc.status);
+		}
+	}
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ *  -	send credits: this tells us how many WRs we're allowed
+ *	to submit without overrunning the receiver's queue. For
+ *	each SEND WR we post, we decrement this by one.
+ *
+ *  -	posted credits: this tells us how many WRs we recently
+ *	posted to the receive queue. This value is transferred
+ *	to the peer as a "credit update" in a RDS header field.
+ *	Every time we transmit credits to the peer, we subtract
+ *	the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_iw_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter.  Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
+			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+	unsigned int avail, posted, got = 0, advertise;
+	long oldval, newval;
+
+	*adv_credits = 0;
+	if (!ic->i_flowctl)
+		return wanted;	/* flow control is off: grant the full request */
+
+try_again:
+	advertise = 0;
+	oldval = newval = atomic_read(&ic->i_credits);	/* one snapshot holds both counters */
+	posted = IB_GET_POST_CREDITS(oldval);
+	avail = IB_GET_SEND_CREDITS(oldval);
+
+	rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
+			wanted, avail, posted);
+
+	/* The last credit must be used to send a credit update. */
+	if (avail && !posted)
+		avail--;
+
+	if (avail < wanted) {
+		struct rds_connection *conn = ic->i_cm_id->context;
+
+		/* Not enough credits: flag send-full and take what is left. */
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		got = avail;
+	} else {
+		/* Enough credits are available for the whole request. */
+		got = wanted;
+	}
+	newval -= IB_SET_SEND_CREDITS(got);
+
+	/*
+	 * If need_posted is non-zero, then the caller wants
+	 * the posted credits regardless of whether any send
+	 * credits are available.  At most max_posted are taken.
+	 */
+	if (posted && (got || need_posted)) {
+		advertise = min_t(unsigned int, posted, max_posted);
+		newval -= IB_SET_POST_CREDITS(advertise);
+	}
+
+	/* Commit both updates at once; retry if i_credits changed meanwhile. */
+	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+		goto try_again;
+
+	*adv_credits = advertise;
+	return got;
+}
+
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	if (credits == 0)
+		return;	/* nothing to add */
+
+	rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
+			credits,
+			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);	/* credit the send counter */
+	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);	/* send-full was set: requeue the send work */
+
+	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);	/* sanity check on the size of one update */
+
+	rds_iw_stats_inc(s_iw_rx_credit_updates);
+}
+
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	if (posted == 0)
+		return;	/* nothing new to advertise */
+
+	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);	/* add to the posted counter */
+
+	/* Decide whether to send an update to the peer now.
+	 * If we sent a credit update for every single buffer we
+	 * post, we would end up with an ACK storm (ACK arrives,
+	 * consumes buffer, we refill the ring, send ACK to remote
+	 * advertising the newly posted buffer... ad infinitum).
+	 *
+	 * Performance pretty much depends on how often we send
+	 * credit updates - too frequent updates mean lots of ACKs.
+	 * With too infrequent updates, the peer will run out of
+	 * credits and have to throttle.
+	 * For the time being, 16 seems to be a good compromise.
+	 */
+	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
+		struct rds_iw_send_work *send, unsigned int pos,
+		unsigned long buffer, unsigned int length,
+		int send_flags)
+{
+	struct ib_sge *hdr_sge;
+
+	WARN_ON(pos != send - ic->i_sends);
+
+	send->s_op = NULL;
+	send->s_queued = jiffies;
+	send->s_wr.next = NULL;
+	send->s_wr.opcode = IB_WR_SEND;
+	send->s_wr.send_flags = send_flags;
+
+	if (length == 0) {
+		/* No payload: the header is the only SGE and uses slot 0 */
+		send->s_wr.num_sge = 1;
+		hdr_sge = &send->s_sge[0];
+	} else {
+		struct ib_sge *data_sge = rds_iw_data_sge(ic, send->s_sge);
+
+		data_sge->addr = buffer;
+		data_sge->length = length;
+		data_sge->lkey = rds_iw_local_dma_lkey(ic);
+
+		send->s_wr.num_sge = 2;
+		hdr_sge = rds_iw_header_sge(ic, send->s_sge);
+	}
+
+	hdr_sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+	hdr_sge->length = sizeof(struct rds_header);
+	hdr_sge->lkey = rds_iw_local_dma_lkey(ic);
+}
+
+/*
+ * This can be called multiple times for a given message.  The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests.  We translate the scatterlist into a series
+ * of work requests that fragment the message.  These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection.  This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct rds_iw_send_work *send = NULL;
+	struct rds_iw_send_work *first;
+	struct rds_iw_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct scatterlist *scat;
+	u32 pos;
+	u32 i;
+	u32 work_alloc;
+	u32 credit_alloc;
+	u32 posted;
+	u32 adv_credits = 0;
+	int send_flags = 0;
+	int sent;
+	int ret;
+	int flow_controlled = 0;
+
+	BUG_ON(off % RDS_FRAG_SIZE);	/* off must sit on a fragment boundary */
+	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+	/* Fastreg support: a message with an RDMA cookie key waits for i_fastreg_posted */
+	if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/* FIXME we may overallocate here */
+	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+		i = 1;
+	else
+		i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);	/* may grant fewer than i */
+	if (work_alloc == 0) {
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		rds_iw_stats_inc(s_iw_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	credit_alloc = work_alloc;
+	if (ic->i_flowctl) {
+		credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+		adv_credits += posted;
+		if (credit_alloc < work_alloc) {
+			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+			work_alloc = credit_alloc;
+			flow_controlled++;
+		}
+		if (work_alloc == 0) {
+			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+			rds_iw_stats_inc(s_iw_tx_throttle);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* map the message the first time we see it */
+	if (!ic->i_rm) {
+		/*
+		printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
+				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+				rm->m_inc.i_hdr.h_flags,
+				be32_to_cpu(rm->m_inc.i_hdr.h_len));
+		   */
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
+				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+				ret = -ENOMEM; /* XXX ? NOTE(review): credits grabbed above are not returned here */
+				goto out;
+			}
+		} else {
+			rm->data.op_count = 0;
+		}
+
+		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+		ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+		rds_message_addref(rm);
+		ic->i_rm = rm;
+
+		/* Finalize the header */
+		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+		/* If it has a RDMA op, tell the peer we did it. This is
+		 * used by the peer to release use-once RDMA MRs. */
+		if (rm->rdma.op_active) {
+			struct rds_ext_header_rdma ext_hdr;
+
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+			rds_message_add_extension(&rm->m_inc.i_hdr,
+					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+		}
+		if (rm->m_rdma_cookie) {
+			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+					rds_rdma_cookie_key(rm->m_rdma_cookie),
+					rds_rdma_cookie_offset(rm->m_rdma_cookie));
+		}
+
+		/* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
+		 * we should not do this unless we have a chance of at least
+		 * sticking the header into the send ring. Which is why we
+		 * should call rds_iw_ring_alloc first. */
+		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
+		rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+		/*
+		 * Update adv_credits since we reset the ACK_REQUIRED bit.
+		 */
+		rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+		adv_credits += posted;
+		BUG_ON(adv_credits > 255);	/* presumably h_credit is a single byte - TODO confirm */
+	}
+
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &rm->data.op_sg[sg];
+	sent = 0;
+	i = 0;
+
+	/* Sometimes you want to put a fence between an RDMA
+	 * READ and the following SEND.
+	 * We could either do this all the time
+	 * or when requested by the user. Right now, we let
+	 * the application choose.
+	 */
+	if (rm->rdma.op_active && rm->rdma.op_fence)
+		send_flags = IB_SEND_FENCE;
+
+	/*
+	 * We could be copying the header into the unused tail of the page.
+	 * That would need to be changed in the future when those pages might
+	 * be mapped userspace pages or page cache pages.  So instead we always
+	 * use a second sge and our long-lived ring of mapped headers.  We send
+	 * the header after the data so that the data payload can be aligned on
+	 * the receiver.
+	 */
+
+	/* handle a 0-len message */
+	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+		rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+		goto add_header;	/* jumps into the body of the loop below */
+	}
+
+	/* if there's data reference it with a chain of work reqs */
+	for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
+		unsigned int len;
+
+		send = &ic->i_sends[pos];
+
+		len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+		rds_iw_xmit_populate_wr(ic, send, pos,
+				ib_sg_dma_address(dev, scat) + off, len,
+				send_flags);
+
+		/*
+		 * We want to delay signaling completions just enough to get
+		 * the batching benefits but not so much that we create dead time
+		 * on the wire.
+		 */
+		if (ic->i_unsignaled_wrs-- == 0) {
+			ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		}
+
+		ic->i_unsignaled_bytes -= len;
+		if (ic->i_unsignaled_bytes <= 0) {
+			ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		}
+
+		/*
+		 * Always signal the last one if we're stopping due to flow control.
+		 */
+		if (flow_controlled && i == (work_alloc-1))
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		sent += len;
+		off += len;
+		if (off == ib_sg_dma_len(dev, scat)) {	/* this sg entry is used up */
+			scat++;
+			off = 0;
+		}
+
+add_header:
+		/* Tack on the header after the data. The header SGE should already
+		 * have been set up to point to the right header buffer. */
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+		if (0) {
+			struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+			printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+				be16_to_cpu(hdr->h_dport),
+				hdr->h_flags,
+				be32_to_cpu(hdr->h_len));
+		}
+		if (adv_credits) {
+			struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+			/* add credit and redo the header checksum */
+			hdr->h_credit = adv_credits;
+			rds_message_make_checksum(hdr);
+			adv_credits = 0;	/* advertised once, in the first header written */
+			rds_iw_stats_inc(s_iw_tx_credit_updates);
+		}
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;	/* chain this WR behind the previous one */
+		prev = send;
+
+		pos = (pos + 1) % ic->i_send_ring.w_nr;
+	}
+
+	/* Account the RDS header in the number of bytes we sent, but just once.
+	 * The caller has no concept of fragmentation. */
+	if (hdr_off == 0)
+		sent += sizeof(struct rds_header);
+
+	/* if we finished the message then send completion owns it */
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
+		prev->s_rm = ic->i_rm;
+		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		ic->i_rm = NULL;
+	}
+
+	if (i < work_alloc) {	/* hand back ring entries that were not needed */
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+	if (ic->i_flowctl && i < credit_alloc)
+		rds_iw_send_add_credits(conn, credit_alloc - i);	/* return unused send credits */
+
+	/* XXX need to worry about failed_wr and partial sends. */
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+		if (prev->s_rm) {	/* take the message back from the completion path */
+			ic->i_rm = prev->s_rm;
+			prev->s_rm = NULL;
+		}
+		goto out;
+	}
+
+	ret = sent;
+out:
+	BUG_ON(adv_credits);	/* grabbed advertisements must have gone into a header */
+	return ret;
+}
+
+static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
+{
+	struct ib_send_wr *wr = &send->s_wr;
+
+	BUG_ON(nent > send->s_page_list->max_page_list_len);
+
+	/* Turn this slot into an IB_WR_FAST_REG_MR work request that hands
+	 * the first nent entries of s_page_list to the slot's fast_reg_mr. */
+	wr->opcode = IB_WR_FAST_REG_MR;
+	wr->wr.fast_reg.iova_start = sg_addr;
+	wr->wr.fast_reg.length = len;
+	wr->wr.fast_reg.page_shift = PAGE_SHIFT;
+	wr->wr.fast_reg.page_list = send->s_page_list;
+	wr->wr.fast_reg.page_list_len = nent;
+	wr->wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
+	wr->wr.fast_reg.rkey = send->s_mr->rkey;
+
+	ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
+}
+
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_send_work *send = NULL;
+	struct rds_iw_send_work *first;
+	struct rds_iw_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct rds_iw_device *rds_iwdev;
+	struct scatterlist *scat;
+	unsigned long len;
+	u64 remote_addr = op->op_remote_addr;
+	u32 pos, fr_pos;
+	u32 work_alloc;
+	u32 i;
+	u32 j;
+	int sent;
+	int ret;
+	int num_sge;
+
+	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+
+	/* map the message the first time we see it */
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
+			rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+			ret = -ENOMEM; /* XXX ? */
+			goto out;
+		}
+
+		op->op_mapped = 1;
+	}
+
+	if (!op->op_write) {
+		/* Alloc space on the send queue for the fastreg */
+		work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
+		if (work_alloc != 1) {
+			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+			rds_iw_stats_inc(s_iw_tx_ring_full);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/*
+	 * We cannot return a partial rdma read/write, so insist on enough work
+	 * requests for the whole message; on failure also give back the fastreg slot.
+	 */
+	i = ceil(op->op_count, rds_iwdev->max_sge);
+
+	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc != i) {
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc + (op->op_write ? 0 : 1));
+		rds_iw_stats_inc(s_iw_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	send = &ic->i_sends[pos];
+	if (!op->op_write) {
+		first = prev = &ic->i_sends[fr_pos];	/* the fastreg WR heads the chain */
+	} else {
+		first = send;
+		prev = NULL;
+	}
+	scat = &op->op_sg[0];
+	sent = 0;
+	num_sge = op->op_count;
+
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+		send->s_wr.send_flags = 0;
+		send->s_queued = jiffies;
+
+		/*
+		 * We want to delay signaling completions just enough to get
+		 * the batching benefits but not so much that we create dead time on the wire.
+		 */
+		if (ic->i_unsignaled_wrs-- == 0) {
+			ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+			send->s_wr.send_flags = IB_SEND_SIGNALED;
+		}
+
+		/* To avoid the need to have the plumbing to invalidate the fastreg_mr used
+		 * for local access after RDS is finished with it, using
+		 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
+		 */
+		if (op->op_write)
+			send->s_wr.opcode = IB_WR_RDMA_WRITE;
+		else
+			send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+
+		send->s_wr.wr.rdma.remote_addr = remote_addr;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
+		send->s_op = op;
+
+		if (num_sge > rds_iwdev->max_sge) {
+			send->s_wr.num_sge = rds_iwdev->max_sge;
+			num_sge -= rds_iwdev->max_sge;
+		} else
+			send->s_wr.num_sge = num_sge;
+
+		send->s_wr.next = NULL;
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+
+			if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)	/* NOTE(review): fills this slot's list, not fr_pos's - confirm */
+				send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
+			else {
+				send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
+				send->s_sge[j].length = len;
+				send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
+			}
+
+			sent += len;
+			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+			remote_addr += len;
+
+			scat++;
+		}
+
+		if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
+			send->s_wr.num_sge = 1;
+			send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
+			send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
+			send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
+		}
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		prev = send;
+		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+			send = ic->i_sends;	/* wrap to the start of the ring */
+	}
+
+	/* if we finished the message then send completion owns it */
+	if (scat == &op->op_sg[op->op_count])
+		first->s_wr.send_flags = IB_SEND_SIGNALED;
+
+	if (i < work_alloc) {
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+
+	/* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
+	 * recommended.  Putting the lkey on the wire is a security hole, as it can
+	 * allow for memory access to all of memory on the remote system.  Some
+	 * adapters do not allow using the lkey for this at all.  To bypass this use a
+	 * fastreg_mr (or possibly a dma_mr)
+	 */
+	if (!op->op_write) {
+		rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
+			op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+		work_alloc++;	/* count the fastreg slot in case the post fails */
+	}
+
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+void rds_iw_xmit_complete(struct rds_connection *conn)
+{
+	/* An ACK or window update may still be pending because flow
+	 * control kept us from sending it earlier.  Give it another
+	 * try now.
+	 */
+	rds_iw_attempt_ack(conn->c_transport_data);
+}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 00000000..5fe67f6a
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "iw.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
+/* names[i] labels the i-th uint64_t of rds_iw_stats - TODO confirm field order */
+static const char *const rds_iw_stat_names[] = {
+	"iw_connect_raced",
+	"iw_listen_closed_stale",
+	"iw_tx_cq_call",
+	"iw_tx_cq_event",
+	"iw_tx_ring_full",
+	"iw_tx_throttle",
+	"iw_tx_sg_mapping_failure",
+	"iw_tx_stalled",
+	"iw_tx_credit_updates",
+	"iw_rx_cq_call",
+	"iw_rx_cq_event",
+	"iw_rx_ring_empty",
+	"iw_rx_refill_from_cq",
+	"iw_rx_refill_from_thread",
+	"iw_rx_alloc_limit",
+	"iw_rx_credit_updates",
+	"iw_ack_sent",
+	"iw_ack_send_failure",
+	"iw_ack_send_delayed",
+	"iw_ack_send_piggybacked",
+	"iw_ack_received",
+	"iw_rdma_mr_alloc",
+	"iw_rdma_mr_free",
+	"iw_rdma_mr_used",
+	"iw_rdma_mr_pool_flush",
+	"iw_rdma_mr_pool_wait",
+	"iw_rdma_mr_pool_depleted",
+};
+
+/*
+ * Sum the per-cpu counters of every online cpu and pass the totals, with
+ * their names, to rds_stats_info_copy().  The copy is skipped when @avail
+ * is below the number of counter names.
+ *
+ * Returns the number of counter names in either case.
+ */
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail)
+{
+	struct rds_iw_statistics stats = {0, };
+	uint64_t *total = (uint64_t *)&stats;
+	uint64_t *percpu;
+	size_t idx;
+	int cpu;
+
+	if (avail >= ARRAY_SIZE(rds_iw_stat_names)) {
+		for_each_online_cpu(cpu) {
+			percpu = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
+			/* the stats struct is walked as a flat array of counters */
+			for (idx = 0; idx < sizeof(stats) / sizeof(uint64_t); idx++)
+				total[idx] += percpu[idx];
+		}
+
+		rds_stats_info_copy(iter, total, rds_iw_stat_names,
+				    ARRAY_SIZE(rds_iw_stat_names));
+	}
+
+	return ARRAY_SIZE(rds_iw_stat_names);
+}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 00000000..e2e47176
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "iw.h"
+
+static struct ctl_table_header *rds_iw_sysctl_hdr;
+/* Tunables exported by rds_iw_sysctl_table; the _min/_max ones are its extra1/extra2. */
+unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
+unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
+unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_iw_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_iw_sysctl_max_unsig_wrs = 16;	/* "max_unsignaled_wr" */
+static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);	/* "max_unsignaled_bytes" */
+static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_iw_sysctl_flow_control = 1;	/* "flow_control" */
+
+static ctl_table rds_iw_sysctl_table[] = {	/* registered under rds_iw_sysctl_path */
+	{
+		.procname       = "max_send_wr",
+		.data		= &rds_iw_sysctl_max_send_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_wr_min,
+		.extra2		= &rds_iw_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_recv_wr",
+		.data		= &rds_iw_sysctl_max_recv_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_wr_min,
+		.extra2		= &rds_iw_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_unsignaled_wr",
+		.data		= &rds_iw_sysctl_max_unsig_wrs,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_unsig_wr_min,
+		.extra2		= &rds_iw_sysctl_max_unsig_wr_max,
+	},
+	{
+		.procname       = "max_unsignaled_bytes",
+		.data		= &rds_iw_sysctl_max_unsig_bytes,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_unsig_bytes_min,
+		.extra2		= &rds_iw_sysctl_max_unsig_bytes_max,
+	},
+	{
+		.procname       = "max_recv_allocation",
+		.data		= &rds_iw_sysctl_max_recv_allocation,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,	/* no extra1/extra2 bounds supplied */
+	},
+	{
+		.procname	= "flow_control",
+		.data		= &rds_iw_sysctl_flow_control,
+		.maxlen		= sizeof(rds_iw_sysctl_flow_control),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }	/* empty entry ends the table */
+};
+
+static struct ctl_path rds_iw_sysctl_path[] = {	/* components: net, rds, iw */
+	{ .procname = "net", },
+	{ .procname = "rds", },
+	{ .procname = "iw", },
+	{ }	/* empty entry ends the path */
+};
+
+/* Remove the sysctl table, provided rds_iw_sysctl_init() registered one. */
+void rds_iw_sysctl_exit(void)
+{
+	if (!rds_iw_sysctl_hdr)
+		return;
+
+	unregister_sysctl_table(rds_iw_sysctl_hdr);
+}
+
+/*
+ * Register rds_iw_sysctl_table under rds_iw_sysctl_path and remember the
+ * returned header for rds_iw_sysctl_exit().
+ *
+ * Returns 0 on success or -ENOMEM when registration yields no header.
+ */
+int rds_iw_sysctl_init(void)
+{
+	rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path,
+						  rds_iw_sysctl_table);
+
+	return rds_iw_sysctl_hdr ? 0 : -ENOMEM;
+}
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 00000000..bca6761a
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static DEFINE_SPINLOCK(loop_conns_lock);	/* taken around every loop_conns update */
+static LIST_HEAD(loop_conns);			/* rds_loop_connection entries, via loop_node */
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport.  At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Usually a message transits both the sender and receiver's conns as it
+ * flows to the receiver.  In the loopback case, though, the receive path
+ * is handed the sending conn so the sense of the addresses is reversed.
+ *
+ * Returns the header size plus the payload length taken from the message
+ * header.  For congestion bitmap messages nothing is delivered and that
+ * value is capped at sgp->length - conn->c_xmit_data_off.
+ */
+static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
+			 unsigned int hdr_off, unsigned int sg,
+			 unsigned int off)
+{
+	struct rds_incoming *inc = &rm->m_inc;
+	struct scatterlist *sgp = &rm->data.op_sg[sg];
+	int ret = sizeof(struct rds_header) + be32_to_cpu(inc->i_hdr.h_len);
+
+	/* Do not send cong updates to loopback */
+	if (inc->i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		return min_t(int, ret, sgp->length - conn->c_xmit_data_off);
+	}
+
+	/* anything else has to be handed over in one piece */
+	BUG_ON(hdr_off || sg || off);
+
+	rds_inc_init(inc, conn, conn->c_laddr);
+
+	/* For the embedded inc. Matching put is in loop_inc_free() */
+	rds_message_addref(rm);
+
+	rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, inc,
+			  GFP_KERNEL, KM_USER0);
+	rds_send_drop_acked(conn, be64_to_cpu(inc->i_hdr.h_sequence), NULL);
+	rds_inc_put(inc);
+
+	return ret;
+}
+
+/*
+ * See rds_loop_xmit().  The inc is embedded in its rds_message, so freeing
+ * the inc means dropping the message reference that was taken there to
+ * keep the rm alive for at least as long as the inc.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
+{
+	rds_message_put(container_of(inc, struct rds_message, m_inc));
+}
+
+/* Always returns 0: we need to at least give the thread something to succeed. */
+static int rds_loop_recv(struct rds_connection *conn)
+{
+	return 0;
+}
+
+struct rds_loop_connection {
+	struct list_head loop_node;	/* links this entry on loop_conns */
+	struct rds_connection *conn;	/* the conn whose c_transport_data points here */
+};
+
+/*
+ * Even the loopback transport needs to keep track of its connections,
+ * so it can call rds_conn_destroy() on them on exit. N.B. there are
+ * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
+ * multiple loopback conns allocated, although rather useless.
+ *
+ * The per-connection state is allocated with the caller's @gfp mask, so a
+ * caller that must not sleep is honoured.  It is stored in
+ * conn->c_transport_data and linked on loop_conns.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_loop_connection *lc;
+	unsigned long flags;
+
+	lc = kzalloc(sizeof(*lc), gfp);
+	if (!lc)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&lc->loop_node);
+	lc->conn = conn;
+	conn->c_transport_data = lc;
+
+	spin_lock_irqsave(&loop_conns_lock, flags);
+	list_add_tail(&lc->loop_node, &loop_conns);
+	spin_unlock_irqrestore(&loop_conns_lock, flags);
+
+	return 0;
+}
+
+/* Take the loopback state passed as @arg off loop_conns and free it. */
+static void rds_loop_conn_free(void *arg)
+{
+	struct rds_loop_connection *loop_conn = arg;
+	unsigned long irq_flags;
+
+	rdsdebug("lc %p\n", loop_conn);
+
+	spin_lock_irqsave(&loop_conns_lock, irq_flags);
+	list_del(&loop_conn->loop_node);
+	spin_unlock_irqrestore(&loop_conns_lock, irq_flags);
+
+	kfree(loop_conn);
+}
+
+static int rds_loop_conn_connect(struct rds_connection *conn)
+{
+	rds_connect_complete(conn);	/* the only step: report the connect as complete */
+	return 0;
+}
+
+static void rds_loop_conn_shutdown(struct rds_connection *conn)
+{ /* empty: this hook performs no work */
+}
+
+/*
+ * Destroy every connection still on loop_conns.  The entries are moved to
+ * a private list under the lock first, to avoid calling conn_destroy with
+ * irqs off.
+ */
+void rds_loop_exit(void)
+{
+	struct rds_loop_connection *cur, *next;
+	LIST_HEAD(doomed);
+
+	spin_lock_irq(&loop_conns_lock);
+	list_splice_init(&loop_conns, &doomed);
+	spin_unlock_irq(&loop_conns_lock);
+
+	list_for_each_entry_safe(cur, next, &doomed, loop_node) {
+		WARN_ON(cur->conn->c_passive);
+		rds_conn_destroy(cur->conn);
+	}
+}
+
+/*
+ * Has .xmit but none of the other .xmit_* hooks.  NOTE(review): this used to
+ * say loop bypasses rds_send_xmit() and rds_recv_incoming(); rds_loop_xmit()
+ * does call the latter, so treat that as stale.  .listen_stop and .laddr_check
+ * are missing because transport.c doesn't iterate over rds_loop_transport.
+ */
+struct rds_transport rds_loop_transport = {
+	.xmit			= rds_loop_xmit,
+	.recv			= rds_loop_recv,
+	.conn_alloc		= rds_loop_conn_alloc,
+	.conn_free		= rds_loop_conn_free,
+	.conn_connect		= rds_loop_conn_connect,
+	.conn_shutdown		= rds_loop_conn_shutdown,
+	.inc_copy_to_user	= rds_message_inc_copy_to_user,
+	.inc_free		= rds_loop_inc_free,
+	.t_name			= "loopback",
+};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 00000000..f32b0939
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
+#ifndef _RDS_LOOP_H
+#define _RDS_LOOP_H
+
+/* loop.c: ops table of the loopback transport */
+extern struct rds_transport rds_loop_transport;
+/* loop.c: destroys the connections still tracked by the loopback transport */
+void rds_loop_exit(void);
+
+#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 00000000..1fd3d290
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "rds.h"
+
+static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {	/* payload bytes, by type */
+[RDS_EXTHDR_NONE]	= 0,
+[RDS_EXTHDR_VERSION]	= sizeof(struct rds_ext_header_version),
+[RDS_EXTHDR_RDMA]	= sizeof(struct rds_ext_header_rdma),
+[RDS_EXTHDR_RDMA_DEST]	= sizeof(struct rds_ext_header_rdma_dest),
+};
+
+
+void rds_message_addref(struct rds_message *rm)
+{
+	rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+	atomic_inc(&rm->m_refcount);	/* dropped again by rds_message_put() */
+}
+EXPORT_SYMBOL_GPL(rds_message_addref);
+
+/*
+ * Free the data pages of @rm and release its rdma and atomic op state.
+ * Messages flagged RDS_MSG_PAGEVEC are left completely untouched.
+ *
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void rds_message_purge(struct rds_message *rm)
+{
+	struct scatterlist *sg;
+	unsigned long idx;
+
+	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
+		return;
+
+	for (idx = 0; idx < rm->data.op_nents; idx++) {
+		sg = &rm->data.op_sg[idx];
+		rdsdebug("putting data page %p\n", (void *)sg_page(sg));
+		/* XXX will have to put_page for page refs */
+		__free_page(sg_page(sg));
+	}
+	rm->data.op_nents = 0;
+
+	/* rdma op: tear down the op itself, then drop its MR reference */
+	if (rm->rdma.op_active)
+		rds_rdma_free_op(&rm->rdma);
+	if (rm->rdma.op_rdma_mr)
+		rds_mr_put(rm->rdma.op_rdma_mr);
+
+	/* same two steps for the atomic op */
+	if (rm->atomic.op_active)
+		rds_atomic_free_op(&rm->atomic);
+	if (rm->atomic.op_rdma_mr)
+		rds_mr_put(rm->atomic.op_rdma_mr);
+}
+
+/*
+ * Drop one reference on @rm.  When the last reference goes, the message
+ * must already be off its socket and connection lists; it is then purged
+ * and freed.
+ */
+void rds_message_put(struct rds_message *rm)
+{
+	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+
+	/* the count is already zero: complain loudly, then carry on as before */
+	if (atomic_read(&rm->m_refcount) == 0) {
+		printk(KERN_CRIT "danger refcount zero on %p\n", rm);
+		WARN_ON(1);
+	}
+
+	if (!atomic_dec_and_test(&rm->m_refcount))
+		return;
+
+	BUG_ON(!list_empty(&rm->m_sock_item));
+	BUG_ON(!list_empty(&rm->m_conn_item));
+	rds_message_purge(rm);
+
+	kfree(rm);
+}
+EXPORT_SYMBOL_GPL(rds_message_put);
+
+/*
+ * Fill in the basic fields of @hdr: the two ports as given, @seq converted
+ * to big-endian, flags cleared and the extension area marked as holding no
+ * extension.  No other header field is written.
+ */
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+				 __be16 dport, u64 seq)
+{
+	hdr->h_sport = sport;
+	hdr->h_dport = dport;
+	hdr->h_sequence = cpu_to_be64(seq);
+
+	hdr->h_flags = 0;
+	hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+}
+EXPORT_SYMBOL_GPL(rds_message_populate_header);
+
+/*
+ * Store a single extension header of @type, with @len bytes of payload
+ * from @data, at the start of the extension area of @hdr.
+ *
+ * Returns 1 when the extension was written and 0 when it was refused: an
+ * extension is already present, @type is out of range, @len is not the
+ * size expected for @type, or it would not fit.
+ */
+int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
+			      const void *data, unsigned int len)
+{
+	unsigned int ext_len = sizeof(u8) + len;
+	unsigned char *ext = hdr->h_exthdr;
+
+	/* For now, refuse to add more than one extension header */
+	if (ext[0] != RDS_EXTHDR_NONE)
+		return 0;
+
+	/* the payload length is dictated by the extension type */
+	if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+		return 0;
+
+	/* type byte, payload and the trailing NONE marker must all fit */
+	if (ext_len >= RDS_HEADER_EXT_SPACE)
+		return 0;
+
+	ext[0] = type;
+	memcpy(&ext[1], data, len);
+	ext[ext_len] = RDS_EXTHDR_NONE;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(rds_message_add_extension);
+
+/*
+ * If a message has extension headers, retrieve them here.
+ * Call like this:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ *	buflen = sizeof(buffer);
+ *	type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
+ *	if (type == RDS_EXTHDR_NONE)
+ *		break;
+ *	...
+ * }
+ *
+ * On a hit, @pos is advanced past the extension, @buflen is lowered to the
+ * payload size if it was larger, and that many bytes are copied to @buf.
+ * Otherwise @pos is set to RDS_HEADER_EXT_SPACE, @buflen to 0 and
+ * RDS_EXTHDR_NONE is returned.
+ */
+int rds_message_next_extension(struct rds_header *hdr,
+		unsigned int *pos, void *buf, unsigned int *buflen)
+{
+	const u8 *ext = hdr->h_exthdr;
+	unsigned int cursor = *pos;
+	unsigned int type, payload;
+
+	if (cursor >= RDS_HEADER_EXT_SPACE)
+		goto none;
+
+	/* Get the extension type and length. For now, the
+	 * length is implied by the extension type. */
+	type = ext[cursor];
+	cursor++;
+	if (type == RDS_EXTHDR_NONE || type >= __RDS_EXTHDR_MAX)
+		goto none;
+
+	payload = rds_exthdr_size[type];
+	if (cursor + payload > RDS_HEADER_EXT_SPACE)
+		goto none;
+
+	*pos = cursor + payload;
+	*buflen = min(*buflen, payload);
+	memcpy(buf, ext + cursor, *buflen);
+	return type;
+
+none:
+	*pos = RDS_HEADER_EXT_SPACE;
+	*buflen = 0;
+	return RDS_EXTHDR_NONE;
+}
+
+/*
+ * Add an RDS_EXTHDR_RDMA_DEST extension carrying @r_key and @offset, both
+ * converted to big-endian.  Returns what rds_message_add_extension()
+ * returns: 1 if the extension was added, 0 if it was refused.
+ */
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
+{
+	struct rds_ext_header_rdma_dest dest;
+
+	dest.h_rdma_rkey = cpu_to_be32(r_key);
+	dest.h_rdma_offset = cpu_to_be32(offset);
+
+	return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST,
+					 &dest, sizeof(dest));
+}
+EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
+
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ *
+ * @extra_len: bytes to reserve behind the message for scatterlist entries
+ * @gfp: allocation flags handed to kzalloc()
+ *
+ * Returns the zeroed message holding a single reference, or NULL when the
+ * allocation fails or @extra_len cannot be added to the message size
+ * without exceeding KMALLOC_MAX_SIZE.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+{
+	struct rds_message *rm;
+
+	/*
+	 * Refuse sizes whose sum could wrap around or that no kmalloc can
+	 * satisfy, instead of handing a bogus length to kzalloc().
+	 */
+	if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
+		return NULL;
+
+	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+	if (!rm)
+		goto out;
+
+	/* the sg pool behind the struct starts out completely unused */
+	rm->m_used_sgs = 0;
+	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
+	atomic_set(&rm->m_refcount, 1);
+	INIT_LIST_HEAD(&rm->m_sock_item);
+	INIT_LIST_HEAD(&rm->m_conn_item);
+	spin_lock_init(&rm->m_rs_lock);
+	init_waitqueue_head(&rm->m_flush_wait);
+
+out:
+	return rm;
+}
+
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool, which starts
+ * right behind the rds_message itself.
+ *
+ * Returns the first of @nents freshly initialised entries, or NULL (after
+ * a warning) when the pool does not have that many entries left.
+ * NOTE(review): @nents == 0 only warns and still reaches sg_init_table();
+ * confirm that is safe.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+	struct scatterlist *pool = (struct scatterlist *) &rm[1];
+	struct scatterlist *first;
+
+	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+	WARN_ON(!nents);
+
+	if (rm->m_used_sgs + nents > rm->m_total_sgs)
+		return NULL;
+
+	first = pool + rm->m_used_sgs;
+	sg_init_table(first, nents);
+	rm->m_used_sgs += nents;
+
+	return first;
+}
+
+/*
+ * Build a message whose data sg entries point at the pages found at the
+ * addresses in @page_addrs, one entry of PAGE_SIZE bytes per page, enough
+ * to cover @total_len.  The message is flagged RDS_MSG_PAGEVEC, which makes
+ * rds_message_purge() leave those pages alone.
+ *
+ * Returns the message or ERR_PTR(-ENOMEM).
+ */
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+	int num_sgs = ceil(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);
+	struct rds_message *rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+	unsigned int idx;
+
+	if (!rm)
+		return ERR_PTR(-ENOMEM);
+
+	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+	rm->data.op_nents = num_sgs;
+	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+	if (!rm->data.op_sg) {
+		rds_message_put(rm);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (idx = 0; idx < rm->data.op_nents; ++idx)
+		sg_set_page(&rm->data.op_sg[idx], virt_to_page(page_addrs[idx]),
+			    PAGE_SIZE, 0);
+
+	return rm;
+}
+
+/*
+ * Record @total_len in the message header, then copy that many bytes from
+ * the user iovec array starting at @first_iov into the data sg entries of
+ * @rm.  Pages are allocated on demand for sg entries that have none yet.
+ *
+ * Returns 0 on success or the error from the page allocation or the copy.
+ */
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
+					       size_t total_len)
+{
+	struct scatterlist *sg = rm->data.op_sg;
+	struct iovec *iov = first_iov;
+	unsigned long iov_off = 0;
+	unsigned long sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
+	unsigned long chunk;
+	int ret;
+
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+
+	/*
+	 * now allocate and copy in the data payload.
+	 */
+	while (total_len) {
+		if (!sg_page(sg)) {
+			ret = rds_page_remainder_alloc(sg, total_len,
+						       GFP_HIGHUSER);
+			if (ret)
+				return ret;
+			rm->data.op_nents++;
+			sg_off = 0;
+		}
+
+		/* step over iovecs that are used up (or empty) */
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		/* bounded by the iovec, the sg entry and what is left overall */
+		chunk = min(iov->iov_len - iov_off, sg->length - sg_off);
+		chunk = min_t(size_t, chunk, total_len);
+
+		rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+			 "sg [%p, %u, %u] + %lu\n",
+			 chunk, iov->iov_base, iov->iov_len, iov_off,
+			 (void *)sg_page(sg), sg->offset, sg->length, sg_off);
+
+		ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+					      iov->iov_base + iov_off,
+					      chunk);
+		if (ret)
+			return ret;
+
+		iov_off += chunk;
+		total_len -= chunk;
+		sg_off += chunk;
+
+		if (sg_off == sg->length)
+			sg++;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the payload of the message that embeds @inc to the user iovec array
+ * starting at @first_iov.  At most @size bytes are copied, and never more
+ * than the length recorded in the message header.
+ *
+ * Returns the number of bytes copied, or the error from the first page
+ * copy that fails.
+ */
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+				 struct iovec *first_iov, size_t size)
+{
+	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+	struct scatterlist *sg = rm->data.op_sg;
+	struct iovec *iov = first_iov;
+	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+	unsigned long iov_off = 0;
+	unsigned long vec_off = 0;
+	unsigned long chunk;
+	int copied = 0;
+	int ret;
+
+	while (copied < size && copied < len) {
+		/* step over iovecs that are used up (or empty) */
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		/* bounded by the iovec, the sg entry, @size and the payload */
+		chunk = min(iov->iov_len - iov_off, sg->length - vec_off);
+		chunk = min_t(size_t, chunk, size - copied);
+		chunk = min_t(unsigned long, chunk, len - copied);
+
+		rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
+			 "sg [%p, %u, %u] + %lu\n",
+			 chunk, iov->iov_base, iov->iov_len, iov_off,
+			 sg_page(sg), sg->offset, sg->length, vec_off);
+
+		ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+					    iov->iov_base + iov_off,
+					    chunk);
+		if (ret)
+			return ret;
+
+		iov_off += chunk;
+		vec_off += chunk;
+		copied += chunk;
+
+		if (vec_off == sg->length) {
+			vec_off = 0;
+			sg++;
+		}
+	}
+
+	return copied;
+}
+
+/*
+ * Wait for the transport to clear RDS_MSG_MAPPED (matters most for RDMA ops).
+ * NOTE(review): interruptible and unchecked - a signal returns early; confirm.
+ */
+void rds_message_wait(struct rds_message *rm)
+{
+	wait_event_interruptible(rm->m_flush_wait,
+			!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
+}
+
+void rds_message_unmapped(struct rds_message *rm)
+{
+	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);	/* the bit rds_message_wait() waits on */
+	wake_up_interruptible(&rm->m_flush_wait);	/* let any such waiter re-check it */
+}
+EXPORT_SYMBOL_GPL(rds_message_unmapped);
+
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 00000000..d8acdebe
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+
+#include "rds.h"
+
+struct rds_page_remainder {
+	struct page	*r_page;	/* cached, partly handed-out page, or NULL */
+	unsigned long	r_offset;	/* offset of the first unused byte in r_page */
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
+				     rds_page_remainders);
+
+/*
+ * Copy @bytes between @page (starting at @offset) and the user buffer @ptr.
+ * The direction is towards user space when @to_user is non-zero, otherwise
+ * from it.  The page is kmapped for the duration of the copy.
+ *
+ * returns 0 on success or -EFAULT when not everything could be copied.
+ *
+ * We don't have to worry about flush_dcache_page() as this only works
+ * with private pages.  If, say, we were to do directed receive to pinned
+ * user pages we'd have to worry more about cache coherence.  (Though
+ * the flush_dcache_page() in get_user_pages() would probably be enough).
+ */
+int rds_page_copy_user(struct page *page, unsigned long offset,
+		       void __user *ptr, unsigned long bytes,
+		       int to_user)
+{
+	unsigned long uncopied;
+	void *kaddr = kmap(page);
+
+	if (to_user) {
+		rds_stats_add(s_copy_to_user, bytes);
+		uncopied = copy_to_user(ptr, kaddr + offset, bytes);
+	} else {
+		rds_stats_add(s_copy_from_user, bytes);
+		uncopied = copy_from_user(kaddr + offset, ptr, bytes);
+	}
+	kunmap(page);
+
+	if (uncopied)
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rds_page_copy_user);
+
+/*
+ * Message allocation uses this to build up regions of a message.
+ *
+ * @scat - filled in with the page, offset and length on success
+ * @bytes - the number of bytes needed.
+ * @gfp - the waiting behaviour of the allocation
+ *
+ * @gfp is always ored with __GFP_HIGHMEM.  Callers must be prepared to
+ * kmap the pages, etc.  Returns 0 on success or -ENOMEM.
+ *
+ * If @bytes is at least a full page then this just returns a page from
+ * alloc_page(); @scat then covers exactly PAGE_SIZE bytes.
+ *
+ * If @bytes is a partial page then this stores the unused region of the
+ * page in a per-cpu structure.  Future partial-page allocations may be
+ * satisfied from that cached region.  It works because the transmit path
+ * passes read-only page regions down to devices.  They hold a page
+ * reference until they are done with the region.
+ */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+			     gfp_t gfp)
+{
+	struct rds_page_remainder *rem;
+	unsigned long flags;
+	struct page *page;
+	int ret;
+
+	gfp |= __GFP_HIGHMEM;
+
+	/* a full page or more: skip the per-cpu cache and allocate directly */
+	if (bytes >= PAGE_SIZE) {
+		page = alloc_page(gfp);
+		if (!page) {
+			ret = -ENOMEM;
+		} else {
+			sg_set_page(scat, page, PAGE_SIZE, 0);
+			ret = 0;
+		}
+		goto out;
+	}
+
+	rem = &per_cpu(rds_page_remainders, get_cpu());
+	local_irq_save(flags);
+
+	while (1) {
+		/* avoid a tiny region getting stuck by tossing it */
+		if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
+			rds_stats_inc(s_page_remainder_miss);
+			__free_page(rem->r_page);
+			rem->r_page = NULL;
+		}
+
+		/* hand out a fragment from the cached page */
+		if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
+			sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
+			get_page(sg_page(scat));	/* the sg entry gets a reference of its own */
+
+			if (rem->r_offset != 0)
+				rds_stats_inc(s_page_remainder_hit);
+
+			rem->r_offset += bytes;
+			if (rem->r_offset == PAGE_SIZE) {
+				__free_page(rem->r_page);
+				rem->r_page = NULL;
+			}
+			ret = 0;
+			break;
+		}
+
+		/* nothing cached: allocate after restoring irqs and calling put_cpu() */
+		local_irq_restore(flags);
+		put_cpu();
+
+		page = alloc_page(gfp);
+
+		rem = &per_cpu(rds_page_remainders, get_cpu());
+		local_irq_save(flags);
+
+		if (!page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/* did someone race to fill the remainder before us? */
+		if (rem->r_page) {
+			__free_page(page);
+			continue;
+		}
+
+		/* otherwise install our page and loop around to alloc */
+		rem->r_page = page;
+		rem->r_offset = 0;
+	}
+
+	local_irq_restore(flags);
+	put_cpu();
+out:
+	rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
+		 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
+		 ret ? 0 : scat->length);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
+
+/*
+ * CPU notifier callback: on CPU_DEAD, free the page cached in that cpu's
+ * remainder (if any) and clear the pointer.  Every other action is
+ * ignored.  Always returns 0.
+ */
+static int rds_page_remainder_cpu_notify(struct notifier_block *self,
+					 unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	struct rds_page_remainder *rem = &per_cpu(rds_page_remainders, cpu);
+
+	rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
+
+	if (action == CPU_DEAD) {
+		if (rem->r_page)
+			__free_page(rem->r_page);
+		rem->r_page = NULL;
+	}
+
+	return 0;
+}
+
+static struct notifier_block rds_page_remainder_nb = {
+	.notifier_call = rds_page_remainder_cpu_notify,	/* NOTE(review): no registration visible in page.c */
+};
+
+/*
+ * Release the cached remainder page of every possible cpu by invoking the
+ * notifier callback directly with CPU_DEAD for each of them.
+ */
+void rds_page_exit(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
+					      (unsigned long)CPU_DEAD,
+					      (void *)(long)cpu);
+	}
+}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 00000000..4e37c1cb
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,858 @@
+/*
+ * Copyright (c) 2007 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
+
+#include "rds.h"
+
+/*
+ * XXX
+ *  - build with sparse
+ *  - should we limit the size of a mr region?  let transport return failure?
+ *  - should we detect duplicate keys on a socket?  hmm.
+ *  - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * Count the pages touched by the user buffer @vec: the difference between
+ * the page index just past its last byte (rounded up) and the page index
+ * of its first byte.
+ *
+ * Returns 0 if the vec is invalid, i.e. when addr + bytes does not end up
+ * above addr (zero length or address wrap) or when bytes does not fit in
+ * an unsigned int.  The latter limit comes from the byte count being
+ * stored in the 'length' member of 'struct scatterlist'.
+ */
+static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
+{
+	u64 start = vec->addr;
+	u64 end = start + vec->bytes;
+
+	if (end <= start)
+		return 0;
+	if (vec->bytes > (u64)UINT_MAX)
+		return 0;
+
+	return ((end + PAGE_SIZE - 1) >> PAGE_SHIFT) - (start >> PAGE_SHIFT);
+}
+
+/*
+ * Search the rbtree rooted at @root for the MR whose r_key equals @key.
+ *
+ * Returns the matching MR if there is one.  Otherwise returns NULL; in
+ * that case a non-NULL @insert is linked into the tree at the position
+ * where the search ended and its reference count is incremented.
+ */
+static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
+				       struct rds_mr *insert)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct rds_mr *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct rds_mr, r_rb_node);
+
+		if (key == cur->r_key)
+			return cur;
+
+		if (key < cur->r_key)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	if (!insert)
+		return NULL;
+
+	rb_link_node(&insert->r_rb_node, parent, link);
+	rb_insert_color(&insert->r_rb_node, root);
+	atomic_inc(&insert->r_refcount);
+	return NULL;
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ *
+ * Only the first caller does any work: the RDS_MR_DEAD bit is tested and
+ * set atomically and later callers return immediately.  The MR is removed
+ * from the socket's key tree (if still linked) and its transport private
+ * data is detached under rs_rdma_lock; the transport's free_mr() is then
+ * called with the lock released.
+ */
+static void rds_destroy_mr(struct rds_mr *mr)
+{
+	struct rds_sock *sock = mr->r_sock;
+	unsigned long irqflags;
+	void *priv;
+
+	rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
+			mr->r_key, atomic_read(&mr->r_refcount));
+
+	if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
+		return;
+
+	spin_lock_irqsave(&sock->rs_rdma_lock, irqflags);
+	if (!RB_EMPTY_NODE(&mr->r_rb_node))
+		rb_erase(&mr->r_rb_node, &sock->rs_rdma_keys);
+	priv = mr->r_trans_private;
+	mr->r_trans_private = NULL;
+	spin_unlock_irqrestore(&sock->rs_rdma_lock, irqflags);
+
+	if (!priv)
+		return;
+
+	mr->r_trans->free_mr(priv, mr->r_invalidate);
+}
+
+/*
+ * Tear down @mr via rds_destroy_mr() and free its memory.  The name
+ * suggests this runs when the last reference is dropped -- the caller is
+ * not visible here, confirm against rds_mr_put().
+ */
+void __rds_put_mr_final(struct rds_mr *mr)
+{
+	rds_destroy_mr(mr);
+	kfree(mr);
+}
+
+/*
+ * Release every MR still present in the key tree of @rs, then call the
+ * transport's flush_mrs() if the socket has a transport that provides it.
+ *
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ */
+void rds_rdma_drop_keys(struct rds_sock *rs)
+{
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	for (;;) {
+		struct rds_mr *mr;
+
+		node = rb_first(&rs->rs_rdma_keys);
+		if (!node)
+			break;
+
+		mr = container_of(node, struct rds_mr, r_rb_node);
+		if (mr->r_trans == rs->rs_transport)
+			mr->r_invalidate = 0;
+
+		/* unlink while holding the lock ... */
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+		/* ... and destroy with it released: rds_destroy_mr() takes it */
+		rds_destroy_mr(mr);
+		rds_mr_put(mr);
+
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (rs->rs_transport && rs->rs_transport->flush_mrs)
+		rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ *
+ * Pins @nr_pages pages starting at @user_addr into @pages, for writing if
+ * @write is set.  A partial pin is undone and reported as -EFAULT, so the
+ * result is either a negative error or the full page count.
+ */
+static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+			struct page **pages, int write)
+{
+	int pinned = get_user_pages_fast(user_addr, nr_pages, write, pages);
+
+	/* error from get_user_pages_fast(), or everything was pinned */
+	if (pinned < 0 || pinned >= nr_pages)
+		return pinned;
+
+	/* short pin: drop what we got */
+	while (pinned--)
+		put_page(pages[pinned]);
+
+	return -EFAULT;
+}
+
+/*
+ * Register the user buffer described by @args->vec as a memory region (MR)
+ * on socket @rs.
+ *
+ * The buffer's pages are pinned, placed in a scatterlist and handed to the
+ * transport's get_mr().  The resulting <R_Key, offset> cookie is stored in
+ * *@cookie_ret (if non-NULL) and copied to user space at
+ * @args->cookie_addr (if non-zero).  The MR is then inserted into the
+ * socket's key tree.  If @mr_ret is non-NULL it receives the MR together
+ * with an extra reference.
+ *
+ * Returns 0 on success or a negative errno: -ENOTCONN if the socket is not
+ * bound or has no transport, -EOPNOTSUPP if the transport has no get_mr,
+ * -EINVAL for an invalid vec, -ENOMEM on allocation failure, -EFAULT if
+ * the cookie cannot be written to user space, or the error reported by
+ * page pinning or by the transport.
+ */
+static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
+				u64 *cookie_ret, struct rds_mr **mr_ret)
+{
+	struct rds_mr *mr = NULL, *found;
+	unsigned int nr_pages;
+	struct page **pages = NULL;
+	struct scatterlist *sg;
+	void *trans_private;
+	unsigned long flags;
+	rds_rdma_cookie_t cookie;
+	unsigned int nents;
+	long i;
+	int ret;
+
+	/* rs_transport is dereferenced below, so it must be set as well */
+	if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	if (!rs->rs_transport->get_mr) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	nr_pages = rds_pages_in_vec(&args->vec);
+	if (nr_pages == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
+		args->vec.addr, args->vec.bytes, nr_pages);
+
+	/* XXX clamp nr_pages to limit the size of this alloc? */
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
+	if (!mr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* this initial reference is ours and is dropped at 'out' */
+	atomic_set(&mr->r_refcount, 1);
+	RB_CLEAR_NODE(&mr->r_rb_node);
+	mr->r_trans = rs->rs_transport;
+	mr->r_sock = rs;
+
+	if (args->flags & RDS_RDMA_USE_ONCE)
+		mr->r_use_once = 1;
+	if (args->flags & RDS_RDMA_INVALIDATE)
+		mr->r_invalidate = 1;
+	if (args->flags & RDS_RDMA_READWRITE)
+		mr->r_write = 1;
+
+	/*
+	 * Pin the pages that make up the user buffer and transfer the page
+	 * pointers to the mr's sg array.  rds_pin_pages() either pins the
+	 * whole region or fails.
+	 *
+	 * For now we have no flag that tells us whether the mapping is
+	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
+	 * the zero page.
+	 */
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+	if (ret < 0)
+		goto out;
+
+	nents = ret;
+	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+	if (!sg) {
+		/* nothing owns the pinned pages yet, so release them here */
+		for (i = 0 ; i < nents; i++)
+			put_page(pages[i]);
+		ret = -ENOMEM;
+		goto out;
+	}
+	WARN_ON(!nents);
+	sg_init_table(sg, nents);
+
+	/* Stick all pages into the scatterlist */
+	for (i = 0 ; i < nents; i++)
+		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+	rdsdebug("RDS: trans_private nents is %u\n", nents);
+
+	/* Obtain a transport specific MR. If this succeeds, the
+	 * s/g list is now owned by the MR.
+	 * Note that dma_map() implies that pending writes are
+	 * flushed to RAM, so no dma_sync is needed here. */
+	trans_private = rs->rs_transport->get_mr(sg, nents, rs,
+						 &mr->r_key);
+
+	if (IS_ERR(trans_private)) {
+		for (i = 0 ; i < nents; i++)
+			put_page(sg_page(&sg[i]));
+		kfree(sg);
+		ret = PTR_ERR(trans_private);
+		goto out;
+	}
+
+	mr->r_trans_private = trans_private;
+
+	rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
+	       mr->r_key, (void *)(unsigned long) args->cookie_addr);
+
+	/* The user may pass us an unaligned address, but we can only
+	 * map page aligned regions. So we keep the offset, and build
+	 * a 64bit cookie containing <R_Key, offset> and pass that
+	 * around. */
+	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+	if (cookie_ret)
+		*cookie_ret = cookie;
+
+	if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Inserting the new MR into the rbtree bumps its
+	 * reference count. */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	BUG_ON(found && found != mr);
+
+	rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
+	if (mr_ret) {
+		atomic_inc(&mr->r_refcount);
+		*mr_ret = mr;
+	}
+
+	ret = 0;
+out:
+	kfree(pages);
+	if (mr)
+		rds_mr_put(mr);
+	return ret;
+}
+
+/*
+ * Copy a struct rds_get_mr_args from @optval and map the region it
+ * describes.  Returns -EINVAL if @optlen is not exactly the size of that
+ * structure, -EFAULT if it cannot be copied in, otherwise the result of
+ * __rds_rdma_map().
+ */
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_get_mr_args args;
+	struct rds_get_mr_args __user *uargs;
+
+	if (optlen != sizeof(args))
+		return -EINVAL;
+
+	uargs = (struct rds_get_mr_args __user *)optval;
+	if (copy_from_user(&args, uargs, sizeof(args)))
+		return -EFAULT;
+
+	return __rds_rdma_map(rs, &args, NULL, NULL);
+}
+
+/*
+ * Copy a struct rds_get_mr_for_dest_args from @optval and map the region
+ * it describes.  Only the vec, cookie_addr and flags members are used;
+ * they are forwarded to __rds_rdma_map() in a struct rds_get_mr_args.
+ * Returns -EINVAL on a size mismatch, -EFAULT if the copy fails, otherwise
+ * the result of __rds_rdma_map().
+ */
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_get_mr_for_dest_args dest_args;
+	struct rds_get_mr_for_dest_args __user *uargs;
+	struct rds_get_mr_args map_args;
+
+	if (optlen != sizeof(dest_args))
+		return -EINVAL;
+
+	uargs = (struct rds_get_mr_for_dest_args __user *)optval;
+	if (copy_from_user(&dest_args, uargs, sizeof(dest_args)))
+		return -EFAULT;
+
+	/*
+	 * Initially, just behave like get_mr().
+	 * TODO: Implement get_mr as wrapper around this
+	 *	 and deprecate it.
+	 */
+	map_args.flags = dest_args.flags;
+	map_args.cookie_addr = dest_args.cookie_addr;
+	map_args.vec = dest_args.vec;
+
+	return __rds_rdma_map(rs, &map_args, NULL, NULL);
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ *
+ * @optval holds a struct rds_free_mr_args.  A zero cookie asks the
+ * transport to flush its MRs instead.  Returns 0 on success, -EINVAL for a
+ * bad @optlen, an unknown key or a missing flush_mrs, and -EFAULT if the
+ * arguments cannot be copied in.
+ */
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_free_mr_args args;
+	struct rds_mr *mr;
+	unsigned long flags;
+
+	if (optlen != sizeof(args))
+		return -EINVAL;
+
+	if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
+			   sizeof(args)))
+		return -EFAULT;
+
+	/* Special case - a null cookie means flush all unused MRs */
+	if (args.cookie == 0) {
+		if (rs->rs_transport && rs->rs_transport->flush_mrs) {
+			rs->rs_transport->flush_mrs();
+			return 0;
+		}
+		return -EINVAL;
+	}
+
+	/* Look up the MR given its R_key and remove it from the rbtree
+	 * so nobody else finds it.
+	 * This should also prevent races with rds_rdma_unuse.
+	 */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
+	if (!mr) {
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return -EINVAL;
+	}
+
+	rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+	RB_CLEAR_NODE(&mr->r_rb_node);
+	if (args.flags & RDS_RDMA_INVALIDATE)
+		mr->r_invalidate = 1;
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	/*
+	 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
+	 * we return.  If we let rds_mr_put() do it it might not happen until
+	 * someone else drops their ref.
+	 */
+	rds_destroy_mr(mr);
+	rds_mr_put(mr);
+	return 0;
+}
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics
+ *
+ * Looks up the MR for @r_key on @rs.  If the MR is use-once, or @force is
+ * set, it is removed from the key tree and destroyed; otherwise it stays
+ * registered.  In both cases the transport's sync_mr() is called if it
+ * exists.  An unknown key is logged and otherwise ignored.
+ */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
+{
+	struct rds_mr *mr;
+	unsigned long flags;
+	int zot_me = 0;
+
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	/* Take our own reference while still locked so the MR cannot go
+	 * away between the unlock and the sync_mr() call below. */
+	atomic_inc(&mr->r_refcount);
+
+	/* If it is going to be freed, remove it from the tree now so
+	 * that no other thread can find it. */
+	if (mr->r_use_once || force) {
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		zot_me = 1;
+	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	/* May have to issue a dma_sync on this memory region.
+	 * Note we could avoid this if the operation was a RDMA READ,
+	 * but at this point we can't tell. */
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me) {
+		rds_destroy_mr(mr);
+		/* drop the reference the key tree was holding */
+		rds_mr_put(mr);
+	}
+
+	/* drop the reference taken above */
+	rds_mr_put(mr);
+}
+
+/*
+ * Release the resources of RDMA op @ro: drop the page reference behind
+ * each of its op_nents scatterlist entries, free the notifier and mark the
+ * op inactive.
+ */
+void rds_rdma_free_op(struct rm_rdma_op *ro)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < ro->op_nents; idx++) {
+		struct page *pg = sg_page(&ro->op_sg[idx]);
+
+		/* A non-write op (RDMA_READ) copies from remote to local
+		 * memory, so the page was possibly modified: mark it
+		 * dirty before letting go of it. */
+		if (!ro->op_write) {
+			BUG_ON(irqs_disabled());
+			set_page_dirty(pg);
+		}
+		put_page(pg);
+	}
+
+	kfree(ro->op_notifier);
+	ro->op_active = 0;
+	ro->op_notifier = NULL;
+}
+
+/*
+ * Release the resources of atomic op @ao: drop the reference on the single
+ * page behind op_sg, free the notifier and mark the op inactive.
+ */
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *pg;
+
+	pg = sg_page(ao->op_sg);
+
+	/* The page was possibly modified, so it is always marked dirty
+	 * before the reference is dropped. */
+	set_page_dirty(pg);
+	put_page(pg);
+
+	kfree(ao->op_notifier);
+	ao->op_active = 0;
+	ao->op_notifier = NULL;
+}
+
+
+/*
+ * Count the number of pages needed to describe an incoming iovec array.
+ *
+ * Returns the total page count over the @nr_iovecs entries of @iov, or
+ * -EINVAL if any entry is invalid or the total does not fit in an int.
+ */
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	int total = 0;
+	unsigned int idx;
+
+	for (idx = 0; idx < nr_iovecs; idx++) {
+		unsigned int count = rds_pages_in_vec(&iov[idx]);
+
+		if (!count)
+			return -EINVAL;
+
+		/*
+		 * One entry contributes at most (UINT_MAX>>PAGE_SHIFT)+1
+		 * pages, so the sum turns negative before it can wrap all
+		 * the way around.
+		 */
+		total += count;
+		if (total < 0)
+			return -EINVAL;
+	}
+
+	return total;
+}
+
+/*
+ * Compute the number of bytes of scatterlist storage needed for the local
+ * iovecs of the RDMA request @args.
+ *
+ * The iovecs are read from user space one at a time.  Returns the size in
+ * bytes, or a negative errno: -EINVAL if there are no local iovecs, if an
+ * iovec is invalid or if the result does not fit in an int, -EMSGSIZE if
+ * there are more than UIO_MAXIOV iovecs (the limit rds_cmsg_rdma_args()
+ * enforces), and -EFAULT if an iovec cannot be copied in.
+ */
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+	struct rds_iovec vec;
+	struct rds_iovec __user *local_vec;
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	/* nr_local comes straight from user space: bound it before looping */
+	if (args->nr_local == 0)
+		return -EINVAL;
+	if (args->nr_local > UIO_MAXIOV)
+		return -EMSGSIZE;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	/* the byte count below is returned as an int; don't let it wrap */
+	if (tot_pages > INT_MAX / sizeof(struct scatterlist))
+		return -EINVAL;
+
+	return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ *
+ * @cmsg carries a struct rds_rdma_args.  The local iovecs it points to are
+ * copied in, their pages are pinned and one scatterlist entry per page is
+ * filled in rm->rdma.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL (short cmsg, op
+ * already active, no or invalid iovecs, total length exceeding an
+ * unsigned int or the remote vec), -ENOTCONN (socket not bound),
+ * -EMSGSIZE (more than UIO_MAXIOV iovecs), -ENOMEM, -EFAULT, or the error
+ * from page pinning.  On every error taken via 'out' the op is released
+ * with rds_rdma_free_op().
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	struct rds_rdma_args *args;
+	struct rm_rdma_op *op = &rm->rdma;
+	int nr_pages;
+	unsigned int nr_bytes;
+	struct page **pages = NULL;
+	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+	int iov_size;
+	unsigned int i, j;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	if (rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	/* an op without local iovecs has no pages to describe */
+	if (args->nr_local == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (args->nr_local > UIO_MAXIOV) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
+	/* Check whether to allocate the iovec area */
+	iov_size = args->nr_local * sizeof(struct rds_iovec);
+	if (args->nr_local > UIO_FASTIOV) {
+		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+		if (!iovs) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
+	if (nr_pages < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* scratch array, reused for each iovec in the loop below */
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
+	WARN_ON(!nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+	if (!op->op_sg) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (op->op_notify || op->op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	/* The cookie contains the R_Key of the remote memory region, and
+	 * optionally an offset into it. This is how we implement RDMA into
+	 * unaligned memory.
+	 * When setting up the RDMA, we need to add that offset to the
+	 * destination address (which is really an offset into the MR)
+	 * FIXME: We may want to move this into ib_rdma.c
+	 */
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+
+	nr_bytes = 0;
+
+	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
+	       (unsigned long long)args->nr_local,
+	       (unsigned long long)args->remote_vec.addr,
+	       op->op_rkey);
+
+	for (i = 0; i < args->nr_local; i++) {
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
+
+		/*
+		 * nr_bytes is an unsigned int: refuse a total that would
+		 * wrap it and slip past the remote-size check below.  Done
+		 * before pinning so this path holds no extra page refs.
+		 */
+		if (iov->bytes > UINT_MAX - nr_bytes) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
+
+		/* If it's a WRITE operation, we want to pin the pages for reading.
+		 * If it's a READ operation, we need to pin the pages for writing.
+		 */
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+		if (ret < 0)
+			goto out;
+
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			 nr_bytes, nr, iov->bytes, iov->addr);
+
+		nr_bytes += iov->bytes;
+
+		/* one sg entry per pinned page; iov is consumed as we go */
+		for (j = 0; j < nr; j++) {
+			unsigned int offset = iov->addr & ~PAGE_MASK;
+			struct scatterlist *sg;
+
+			sg = &op->op_sg[op->op_nents + j];
+			sg_set_page(sg, pages[j],
+					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+					offset);
+
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+			       sg->offset, sg->length, iov->addr, iov->bytes);
+
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
+		}
+
+		op->op_nents += nr;
+	}
+
+	if (nr_bytes > args->remote_vec.bytes) {
+		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
+				nr_bytes,
+				(unsigned int) args->remote_vec.bytes);
+		ret = -EINVAL;
+		goto out;
+	}
+	op->op_bytes = nr_bytes;
+
+out:
+	if (iovs != iovstack)
+		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
+	kfree(pages);
+	if (ret)
+		rds_rdma_free_op(op);
+	else
+		rds_stats_inc(s_send_rdma);
+
+	return ret;
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ *
+ * @cmsg carries an rds_rdma_cookie_t naming a previously mapped MR.  The
+ * cookie is stored in rm->m_rdma_cookie, the MR is looked up by its R_Key
+ * and, if found, a reference to it is stored in rm->rdma.op_rdma_mr.
+ *
+ * Returns 0 on success, -EINVAL if the cmsg is too short, the message
+ * already has a cookie, or the R_Key is unknown.
+ */
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	unsigned long flags;
+	struct rds_mr *mr;
+	u32 r_key;
+	int err = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
+	    rm->m_rdma_cookie != 0)
+		return -EINVAL;
+
+	memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
+
+	/* We are reusing a previously mapped MR here. Most likely, the
+	 * application has written to the buffer, so we need to explicitly
+	 * flush those writes to RAM. Otherwise the HCA may not see them
+	 * when doing a DMA from that buffer.
+	 */
+	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
+
+	/* take the reference under the lock so the MR cannot vanish */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+	if (!mr)
+		err = -EINVAL;	/* invalid r_key */
+	else
+		atomic_inc(&mr->r_refcount);
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (mr) {
+		/* sync_mr is an optional transport hook, as in rds_rdma_unuse() */
+		if (mr->r_trans->sync_mr)
+			mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+		rm->rdma.op_rdma_mr = mr;
+	}
+	return err;
+}
+
+/*
+ * The application passes us an address range it wants to enable RDMA
+ * to/from. We map the area, and save the <R_Key,offset> pair
+ * in rm->m_rdma_cookie. This causes it to be sent along to the peer
+ * in an extension header.
+ *
+ * Returns -EINVAL if the cmsg is too short for a struct rds_get_mr_args
+ * or the message already carries a cookie; otherwise the result of
+ * __rds_rdma_map().
+ */
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)))
+		return -EINVAL;
+
+	if (rm->m_rdma_cookie != 0)
+		return -EINVAL;
+
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ *
+ * @cmsg carries a struct rds_atomic_args; cmsg_type selects fetch-and-add
+ * or compare-and-swap, masked or not.  The 8-byte local target is pinned
+ * and described by a single scatterlist entry in rm->atomic.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL if the cmsg is too
+ * short or an atomic op is already active, -ENOMEM on allocation failure,
+ * -EFAULT if local_addr is not 8-byte aligned, or the error from pinning
+ * the page.  On failure the op is left inactive so that later cleanup of
+ * the message does not touch the half-initialized state.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	 || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
+	}
+
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	rm->atomic.op_active = 1;
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+	if (!rm->atomic.op_sg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	return ret;
+err:
+	if (page)
+		put_page(page);
+	/*
+	 * Leave the op inactive and without a stale notifier pointer:
+	 * rds_atomic_free_op() dereferences the op_sg page and frees
+	 * op_notifier, neither of which is valid after a failure here.
+	 */
+	rm->atomic.op_active = 0;
+	kfree(rm->atomic.op_notifier);
+	rm->atomic.op_notifier = NULL;
+
+	return ret;
+}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 00000000..f8760e1b
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2009 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <rdma/rdma_cm.h>
+
+#include "rdma_transport.h"
+
+static struct rdma_cm_id *rds_rdma_listen_id;
+
+/*
+ * Printable names of RDMA CM events, indexed by the RDMA_CM_EVENT_* value.
+ * Each entry is the stringified enumerator name; indices that are not
+ * listed here are left NULL by the designated initializers.
+ */
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+		[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+	RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+	RDS_CM_EVENT_STRING(ADDR_ERROR),
+	RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+	RDS_CM_EVENT_STRING(ROUTE_ERROR),
+	RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+	RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+	RDS_CM_EVENT_STRING(CONNECT_ERROR),
+	RDS_CM_EVENT_STRING(UNREACHABLE),
+	RDS_CM_EVENT_STRING(REJECTED),
+	RDS_CM_EVENT_STRING(ESTABLISHED),
+	RDS_CM_EVENT_STRING(DISCONNECTED),
+	RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+	RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+	RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+	RDS_CM_EVENT_STRING(ADDR_CHANGE),
+	RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+/*
+ * Return the printable name for RDMA CM event @type by looking it up in
+ * rds_cm_event_strings through rds_str_array().
+ */
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+	return rds_str_array(rds_cm_event_strings,
+			     ARRAY_SIZE(rds_cm_event_strings), type);
+}
+
+/*
+ * RDMA CM event callback shared by the IB and iWARP transports.
+ *
+ * The transport is chosen from the node type of @cm_id's device.  When
+ * @cm_id has a connection attached, its c_cm_lock is held while the event
+ * is processed, and events arriving while the connection is disconnecting
+ * are ignored (connect requests are answered with a return value of 1).
+ *
+ * Returns the result of the transport callback or of
+ * rdma_resolve_route() for the events that invoke one, otherwise 0.
+ * NOTE(review): the comment in the body implies a non-zero return makes
+ * the caller destroy cm_id -- confirm against the RDMA CM documentation.
+ */
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	/* this can be null in the listening path */
+	struct rds_connection *conn = cm_id->context;
+	struct rds_transport *trans;
+	int ret = 0;
+
+	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+		 event->event, rds_cm_event_str(event->event));
+
+	if (cm_id->device->node_type == RDMA_NODE_RNIC)
+		trans = &rds_iw_transport;
+	else
+		trans = &rds_ib_transport;
+
+	/* Prevent shutdown from tearing down the connection
+	 * while we're executing. */
+	if (conn) {
+		mutex_lock(&conn->c_cm_lock);
+
+		/* If the connection is being shut down, bail out
+		 * right away. We return 0 so cm_id doesn't get
+		 * destroyed prematurely */
+		if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
+			/* Reject incoming connections while we're tearing
+			 * down an existing one. */
+			if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+				ret = 1;
+			goto out;
+		}
+	}
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		ret = trans->cm_handle_connect(cm_id, event);
+		break;
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		/* XXX do we need to clean up if this fails? */
+		ret = rdma_resolve_route(cm_id,
+					 RDS_RDMA_RESOLVE_TIMEOUT_MS);
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		/* XXX worry about racing with listen acceptance */
+		ret = trans->cm_initiate_connect(cm_id);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		/* nothing to complete without a connection */
+		if (conn)
+			trans->cm_connect_complete(conn, event);
+		break;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		if (conn)
+			rds_conn_drop(conn);
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		/* conn can be NULL here too; nothing to drop in that case */
+		if (!conn)
+			break;
+		rdsdebug("DISCONNECT event - dropping connection "
+			"%pI4->%pI4\n", &conn->c_laddr,
+			 &conn->c_faddr);
+		rds_conn_drop(conn);
+		break;
+
+	default:
+		/* things like device disconnect? */
+		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+		       event->event, rds_cm_event_str(event->event));
+		break;
+	}
+
+out:
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+
+	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+		 rds_cm_event_str(event->event), ret);
+
+	return ret;
+}
+
+/*
+ * Create the RDMA CM listener: allocate a cm_id, bind it to INADDR_ANY on
+ * RDS_PORT and start listening with a backlog of 128.  On success the id
+ * is stored in rds_rdma_listen_id; on failure it is destroyed again.
+ *
+ * Returns 0 on success or the error reported by the failing RDMA CM call.
+ */
+static int rds_rdma_listen_init(void)
+{
+	/* the initializer zero-fills every member not named here */
+	struct sockaddr_in sin = {
+		.sin_family = AF_INET,
+	};
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
+			       IB_QPT_RC);
+	if (IS_ERR(cm_id)) {
+		ret = PTR_ERR(cm_id);
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_create_id() returned %d\n", ret);
+		return ret;
+	}
+
+	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+	sin.sin_port = (__force u16)htons(RDS_PORT);
+
+	/*
+	 * XXX I bet this binds the cm_id to a device.  If we want to support
+	 * fail-over we'll have to take this into consideration.
+	 */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	if (ret) {
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_bind_addr() returned %d\n", ret);
+		goto out;
+	}
+
+	ret = rdma_listen(cm_id, 128);
+	if (ret) {
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_listen() returned %d\n", ret);
+		goto out;
+	}
+
+	rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
+
+	/* ownership moves to the global; keep 'out' from destroying it */
+	rds_rdma_listen_id = cm_id;
+	cm_id = NULL;
+out:
+	if (cm_id)
+		rdma_destroy_id(cm_id);
+	return ret;
+}
+
+/*
+ * Destroy the listening cm_id, if there is one, and forget it.  Calling
+ * this with no listener set is a no-op.
+ */
+static void rds_rdma_listen_stop(void)
+{
+	if (!rds_rdma_listen_id)
+		return;
+
+	rdsdebug("cm %p\n", rds_rdma_listen_id);
+	rdma_destroy_id(rds_rdma_listen_id);
+	rds_rdma_listen_id = NULL;
+}
+
+/*
+ * Bring up the listener, then the iWARP transport, then the IB transport.
+ * If a step fails, the steps already completed are undone in reverse
+ * order and that step's error is returned.  Returns 0 on success.
+ */
+static int rds_rdma_init(void)
+{
+	int ret;
+
+	ret = rds_rdma_listen_init();
+	if (ret)
+		return ret;
+
+	ret = rds_iw_init();
+	if (ret) {
+		rds_rdma_listen_stop();
+		return ret;
+	}
+
+	ret = rds_ib_init();
+	if (ret) {
+		rds_iw_exit();
+		rds_rdma_listen_stop();
+		return ret;
+	}
+
+	return ret;
+}
+module_init(rds_rdma_init);
+
+/*
+ * Module exit: stop the listener, then shut down the IB and iWARP
+ * transports.
+ */
+static void rds_rdma_exit(void)
+{
+	/* stop listening first to ensure no new connections are attempted */
+	rds_rdma_listen_stop();
+	rds_ib_exit();
+	rds_iw_exit();
+}
+module_exit(rds_rdma_exit);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 00000000..faba4e38
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,24 @@
+#ifndef _RDMA_TRANSPORT_H
+#define _RDMA_TRANSPORT_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+
+/* Timeout, in milliseconds per its name, used for RDMA CM resolution. */
+#define RDS_RDMA_RESOLVE_TIMEOUT_MS     5000
+
+int rds_rdma_conn_connect(struct rds_connection *conn);
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event);
+
+/* from ib.c */
+extern struct rds_transport rds_ib_transport;
+int rds_ib_init(void);
+void rds_ib_exit(void);
+
+/* from iw.c */
+extern struct rds_transport rds_iw_transport;
+int rds_iw_init(void);
+void rds_iw_exit(void);
+
+#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 00000000..da8adac2
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,812 @@
+#ifndef _RDS_RDS_H
+#define _RDS_RDS_H
+
+#include <net/sock.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mutex.h>
+#include <linux/rds.h>
+
+#include "info.h"
+
+/*
+ * RDS Network protocol version
+ */
+#define RDS_PROTOCOL_3_0	0x0300
+#define RDS_PROTOCOL_3_1	0x0301
+#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
+#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | (min))	/* pack major/minor into one value */
+
+/*
+ * XXX randomly chosen, but at least seems to be unused:
+ * #               18464-18768 Unassigned
+ * We should do better.  We want a reserved port to discourage unpriv'ed
+ * userspace from listening.
+ */
+#define RDS_PORT	18634
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+#ifdef DEBUG
+#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
+#else
+/* no-op, but still format-checked; pr_debug() causes unused variable warnings */
+static inline void __attribute__ ((format (printf, 1, 2)))
+rdsdebug(const char *fmt, ...)
+{
+}
+#endif
+
+/* Round-up division done in unsigned long. XXX: linux/kernel.h has DIV_ROUND_UP() */
+#define ceil(x, y) \
+	({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
+
+#define RDS_FRAG_SHIFT	12
+#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
+
+#define RDS_CONG_MAP_BYTES	(65536 / 8)
+#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
+#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
+
+struct rds_cong_map {
+	struct rb_node		m_rb_node;
+	__be32			m_addr;
+	wait_queue_head_t	m_waitq;
+	struct list_head	m_conn_list;
+	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+	RDS_CONN_DOWN = 0,
+	RDS_CONN_CONNECTING,
+	RDS_CONN_DISCONNECTING,
+	RDS_CONN_UP,
+	RDS_CONN_ERROR,
+};
+
+/* Bits for c_flags */
+#define RDS_LL_SEND_FULL	0
+#define RDS_RECONNECT_PENDING	1
+#define RDS_IN_XMIT		2
+
+struct rds_connection {
+	struct hlist_node	c_hash_node;
+	__be32			c_laddr;
+	__be32			c_faddr;
+	unsigned int		c_loopback:1;
+	struct rds_connection	*c_passive;
+
+	struct rds_cong_map	*c_lcong;
+	struct rds_cong_map	*c_fcong;
+
+	struct rds_message	*c_xmit_rm;
+	unsigned long		c_xmit_sg;
+	unsigned int		c_xmit_hdr_off;
+	unsigned int		c_xmit_data_off;
+	unsigned int		c_xmit_atomic_sent;
+	unsigned int		c_xmit_rdma_sent;
+	unsigned int		c_xmit_data_sent;
+
+	spinlock_t		c_lock;		/* protect msg queues */
+	u64			c_next_tx_seq;
+	struct list_head	c_send_queue;
+	struct list_head	c_retrans;
+
+	u64			c_next_rx_seq;
+
+	struct rds_transport	*c_trans;
+	void			*c_transport_data;
+
+	atomic_t		c_state;	/* RDS_CONN_* value, read via rds_conn_state() */
+	unsigned long		c_flags;	/* bit numbers: RDS_LL_SEND_FULL, RDS_RECONNECT_PENDING, RDS_IN_XMIT */
+	unsigned long		c_reconnect_jiffies;
+	struct delayed_work	c_send_w;
+	struct delayed_work	c_recv_w;
+	struct delayed_work	c_conn_w;
+	struct work_struct	c_down_w;
+	struct mutex		c_cm_lock;	/* protect conn state & cm */
+	wait_queue_head_t	c_waitq;
+
+	struct list_head	c_map_item;
+	unsigned long		c_map_queued;
+
+	unsigned int		c_unacked_packets;
+	unsigned int		c_unacked_bytes;
+
+	/* Protocol version */
+	unsigned int		c_version;
+};
+
+#define RDS_FLAG_CONG_BITMAP	0x01
+#define RDS_FLAG_ACK_REQUIRED	0x02
+#define RDS_FLAG_RETRANSMITTED	0x04
+#define RDS_MAX_ADV_CREDIT	255
+
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDS_HEADER_EXT_SPACE	16
+
+struct rds_header {
+	__be64	h_sequence;
+	__be64	h_ack;
+	__be32	h_len;
+	__be16	h_sport;
+	__be16	h_dport;
+	u8	h_flags;
+	u8	h_credit;
+	u8	h_padding[4];
+	__sum16	h_csum;
+
+	u8	h_exthdr[RDS_HEADER_EXT_SPACE];
+};
+
+/*
+ * Reserved - indicates end of extensions
+ */
+#define RDS_EXTHDR_NONE		0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ * NB: This is no longer true for IB, where we do a version
+ * negotiation during the connection setup phase (protocol
+ * version information is included in the RDMA CM private data).
+ */
+#define RDS_EXTHDR_VERSION	1
+struct rds_ext_header_version {
+	__be32			h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDS_EXTHDR_RDMA		2
+struct rds_ext_header_rdma {
+	__be32			h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDS_EXTHDR_RDMA_DEST	3
+struct rds_ext_header_rdma_dest {
+	__be32			h_rdma_rkey;
+	__be32			h_rdma_offset;
+};
+
+#define __RDS_EXTHDR_MAX	16 /* for now */
+
+struct rds_incoming {
+	atomic_t		i_refcount;
+	struct list_head	i_item;
+	struct rds_connection	*i_conn;
+	struct rds_header	i_hdr;
+	unsigned long		i_rx_jiffies;
+	__be32			i_saddr;
+
+	rds_rdma_cookie_t	i_rdma_cookie;
+};
+
+struct rds_mr {
+	struct rb_node		r_rb_node;
+	atomic_t		r_refcount;
+	u32			r_key;
+
+	/* A copy of the creation flags */
+	unsigned int		r_use_once:1;
+	unsigned int		r_invalidate:1;
+	unsigned int		r_write:1;
+
+	/* This is for RDS_MR_DEAD.
+	 * It would be nice & consistent to make this part of the above
+	 * bit field here, but we need to use test_and_set_bit.
+	 */
+	unsigned long		r_state;
+	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
+	struct rds_transport	*r_trans;
+	void			*r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD		0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+	return ((u64) offset << 32) | r_key;	/* offset: bits 63..32, r_key: bits 31..0 */
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+	return (u32) cookie;	/* truncation keeps the low 32 bits */
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+	return (u32) (cookie >> 32);	/* the part stored above bit 31 */
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP		0
+#define RDS_ATOMIC_TYPE_FADD		1
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path.  It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue.  m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting.  As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly.  That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate.  They can use this in a callback
+ * that they pass to rds_send_drop_acked() to see if each message has been
+ * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDS_MSG_ON_SOCK		1
+#define RDS_MSG_ON_CONN		2
+#define RDS_MSG_HAS_ACK_SEQ	3
+#define RDS_MSG_ACK_REQUIRED	4
+#define RDS_MSG_RETRANSMITTED	5
+#define RDS_MSG_MAPPED		6
+#define RDS_MSG_PAGEVEC		7
+
+struct rds_message {
+	atomic_t		m_refcount;
+	struct list_head	m_sock_item;
+	struct list_head	m_conn_item;
+	struct rds_incoming	m_inc;
+	u64			m_ack_seq;
+	__be32			m_daddr;
+	unsigned long		m_flags;
+
+	/* Never access m_rs without holding m_rs_lock.
+	 * Lock nesting is
+	 *  rm->m_rs_lock
+	 *   -> rs->rs_lock
+	 */
+	spinlock_t		m_rs_lock;
+	wait_queue_head_t	m_flush_wait;
+
+	struct rds_sock		*m_rs;
+
+	/* cookie to send to remote, in rds header */
+	rds_rdma_cookie_t	m_rdma_cookie;
+
+	unsigned int		m_used_sgs;
+	unsigned int		m_total_sgs;
+
+	void			*m_final_op;
+
+	struct {
+		struct rm_atomic_op {
+			int			op_type;
+			union {
+				struct {
+					uint64_t	compare;
+					uint64_t	swap;
+					uint64_t	compare_mask;
+					uint64_t	swap_mask;
+				} op_m_cswp;
+				struct {
+					uint64_t	add;
+					uint64_t	nocarry_mask;
+				} op_m_fadd;
+			};
+
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} atomic;
+		struct rm_rdma_op {
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_write:1;
+			unsigned int		op_fence:1;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			unsigned int		op_bytes;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} rdma;
+		struct rm_data_op {
+			unsigned int		op_active:1;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+		} data;
+	};
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ */
+struct rds_notifier {
+	struct list_head	n_list;
+	uint64_t		n_user_token;
+	int			n_status;
+};
+
+/**
+ * struct rds_transport -  transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
+ *        part of a message.  The caller serializes on the send_sem so this
+ *        doesn't need to be reentrant for a given conn.  The header must be
+ *        sent before the data payload.  .xmit must be prepared to send a
+ *        message with no data payload.  .xmit should return the number of
+ *        bytes that were sent down the connection, including header bytes.
+ *        Returning 0 tells the caller that it doesn't need to perform any
+ *        additional work now.  This is usually the case when the transport has
+ *        filled the sending queue for its connection and will handle
+ *        triggering the rds thread to continue the send when space becomes
+ *        available.  Returning -EAGAIN tells the caller to retry the send
+ *        immediately.  Returning -ENOMEM tells the caller to retry the send at
+ *        some point in the future.
+ *
+ * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
+ *                 it returns the connection can not call rds_recv_incoming().
+ *                 This will only be called once after conn_connect returns
+ *                 non-zero success.  The caller serializes this with
+ *                 the send and connecting paths (xmit_* and conn_*).  The
+ *                 transport is responsible for other serialization, including
+ *                 rds_recv_incoming().  This is called in process context but
+ *                 should try hard not to block.
+ */
+
+#define RDS_TRANS_IB	0
+#define RDS_TRANS_IWARP	1
+#define RDS_TRANS_TCP	2
+#define RDS_TRANS_COUNT	3
+
+struct rds_transport {
+	char			t_name[TRANSNAMSIZ];
+	struct list_head	t_item;
+	struct module		*t_owner;
+	unsigned int		t_prefer_loopback:1;
+	unsigned int		t_type;
+
+	int (*laddr_check)(__be32 addr);
+	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
+	void (*conn_free)(void *data);
+	int (*conn_connect)(struct rds_connection *conn);
+	void (*conn_shutdown)(struct rds_connection *conn);
+	void (*xmit_prepare)(struct rds_connection *conn);
+	void (*xmit_complete)(struct rds_connection *conn);
+	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
+		    unsigned int hdr_off, unsigned int sg, unsigned int off);
+	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+	int (*recv)(struct rds_connection *conn);
+	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+				size_t size);
+	void (*inc_free)(struct rds_incoming *inc);
+
+	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+				 struct rdma_cm_event *event);
+	int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+	void (*cm_connect_complete)(struct rds_connection *conn,
+				    struct rdma_cm_event *event);
+
+	unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
+					unsigned int avail);
+	void (*exit)(void);
+	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
+			struct rds_sock *rs, u32 *key_ret);
+	void (*sync_mr)(void *trans_private, int direction);
+	void (*free_mr)(void *trans_private, int invalidate);
+	void (*flush_mrs)(void);
+};
+
+struct rds_sock {
+	struct sock		rs_sk;
+
+	u64			rs_user_addr;
+	u64			rs_user_bytes;
+
+	/*
+	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
+	 * support.
+	 */
+	struct hlist_node	rs_bound_node;
+	__be32			rs_bound_addr;
+	__be32			rs_conn_addr;
+	__be16			rs_bound_port;
+	__be16			rs_conn_port;
+	struct rds_transport    *rs_transport;
+
+	/*
+	 * rds_sendmsg caches the conn it used the last time around.
+	 * This helps avoid costly lookups.
+	 */
+	struct rds_connection	*rs_conn;
+
+	/* flag indicating we were congested or not */
+	int			rs_congested;
+	/* seen congestion (ENOBUFS) when sending? */
+	int			rs_seen_congestion;
+
+	/* rs_lock protects all these adjacent members before the newline */
+	spinlock_t		rs_lock;
+	struct list_head	rs_send_queue;
+	u32			rs_snd_bytes;
+	int			rs_rcv_bytes;
+	struct list_head	rs_notify_queue;	/* currently used for failed RDMAs */
+
+	/* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+	 * to decide whether the application should be woken up.
+	 * If not set, we use rs_cong_track to find out whether a cong map
+	 * update arrived.
+	 */
+	uint64_t		rs_cong_mask;
+	uint64_t		rs_cong_notify;
+	struct list_head	rs_cong_list;
+	unsigned long		rs_cong_track;
+
+	/*
+	 * rs_recv_lock protects the receive queue, and is
+	 * used to serialize with rds_release.
+	 */
+	rwlock_t		rs_recv_lock;
+	struct list_head	rs_recv_queue;
+
+	/* just for stats reporting */
+	struct list_head	rs_item;
+
+	/* these have their own lock */
+	spinlock_t		rs_rdma_lock;
+	struct rb_root		rs_rdma_keys;
+
+	/* Socket options - in case there will be more */
+	unsigned char		rs_recverr,
+				rs_cong_monitor;
+};
+
+static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
+{
+	return container_of(sk, struct rds_sock, rs_sk);	/* sk must be an rds_sock's rs_sk */
+}
+static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
+{
+	return &rs->rs_sk;	/* the struct sock embedded in rs */
+}
+
+/*
+ * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
+ * to account for overhead.  We don't account for overhead, we just apply
+ * the number of payload bytes to the specified value.
+ */
+static inline int rds_sk_sndbuf(struct rds_sock *rs)
+{
+	return rs->rs_sk.sk_sndbuf / 2;	/* undo the stack's doubling */
+}
+static inline int rds_sk_rcvbuf(struct rds_sock *rs)
+{
+	return rs->rs_sk.sk_rcvbuf / 2;	/* undo the stack's doubling */
+}
+
+struct rds_statistics {
+	uint64_t	s_conn_reset;
+	uint64_t	s_recv_drop_bad_checksum;
+	uint64_t	s_recv_drop_old_seq;
+	uint64_t	s_recv_drop_no_sock;
+	uint64_t	s_recv_drop_dead_sock;
+	uint64_t	s_recv_deliver_raced;
+	uint64_t	s_recv_delivered;
+	uint64_t	s_recv_queued;
+	uint64_t	s_recv_immediate_retry;
+	uint64_t	s_recv_delayed_retry;
+	uint64_t	s_recv_ack_required;
+	uint64_t	s_recv_rdma_bytes;
+	uint64_t	s_recv_ping;
+	uint64_t	s_send_queue_empty;
+	uint64_t	s_send_queue_full;
+	uint64_t	s_send_lock_contention;
+	uint64_t	s_send_lock_queue_raced;
+	uint64_t	s_send_immediate_retry;
+	uint64_t	s_send_delayed_retry;
+	uint64_t	s_send_drop_acked;
+	uint64_t	s_send_ack_required;
+	uint64_t	s_send_queued;
+	uint64_t	s_send_rdma;
+	uint64_t	s_send_rdma_bytes;
+	uint64_t	s_send_pong;
+	uint64_t	s_page_remainder_hit;
+	uint64_t	s_page_remainder_miss;
+	uint64_t	s_copy_to_user;
+	uint64_t	s_copy_from_user;
+	uint64_t	s_cong_update_queued;
+	uint64_t	s_cong_update_received;
+	uint64_t	s_cong_send_error;
+	uint64_t	s_cong_send_blocked;
+};
+
+/* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
+void rds_sock_addref(struct rds_sock *rs);
+void rds_sock_put(struct rds_sock *rs);
+void rds_wake_sk_sleep(struct rds_sock *rs);
+static inline void __rds_wake_sk_sleep(struct sock *sk)
+{
+	wait_queue_head_t *wq = sk_sleep(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD) && wq)	/* live socket with a wait queue */
+		wake_up(wq);
+}
+extern wait_queue_head_t rds_poll_waitq;
+
+
+/* bind.c */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+void rds_remove_bound(struct rds_sock *rs);
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+
+/* cong.c */
+int rds_cong_get_maps(struct rds_connection *conn);
+void rds_cong_add_conn(struct rds_connection *conn);
+void rds_cong_remove_conn(struct rds_connection *conn);
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
+void rds_cong_queue_updates(struct rds_cong_map *map);
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
+int rds_cong_updated_since(unsigned long *recent);
+void rds_cong_add_socket(struct rds_sock *);
+void rds_cong_remove_socket(struct rds_sock *);
+void rds_cong_exit(void);
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
+
+/* conn.c */
+int rds_conn_init(void);
+void rds_conn_exit(void);
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp);
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+			       struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
+void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens,
+			  int (*visitor)(struct rds_connection *, void *),
+			  size_t item_len);
+void __rds_conn_error(struct rds_connection *conn, const char *, ...)
+				__attribute__ ((format (printf, 2, 3)));
+#define rds_conn_error(conn, fmt...) \
+	__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+
+/* Atomically switch c_state from @old to @new; nonzero if the swap happened. */
+static inline int rds_conn_transition(struct rds_connection *conn, int old, int new)
+{
+	return atomic_cmpxchg(&conn->c_state, old, new) == old;
+}
+
+/* Read the current connection state (one of the RDS_CONN_* values). */
+static inline int rds_conn_state(struct rds_connection *conn)
+{
+	return atomic_read(&conn->c_state);
+}
+
+/* True when the connection state is currently RDS_CONN_UP. */
+static inline int rds_conn_up(struct rds_connection *conn)
+{
+	return rds_conn_state(conn) == RDS_CONN_UP;
+}
+
+/* True when the connection state is currently RDS_CONN_CONNECTING. */
+static inline int rds_conn_connecting(struct rds_connection *conn)
+{
+	return rds_conn_state(conn) == RDS_CONN_CONNECTING;
+}
+
+/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
+					       size_t total_len);
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+				 __be16 dport, u64 seq);
+int rds_message_add_extension(struct rds_header *hdr,
+			      unsigned int type, const void *data, unsigned int len);
+int rds_message_next_extension(struct rds_header *hdr,
+			       unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+				 struct iovec *first_iov, size_t size);
+void rds_message_inc_free(struct rds_incoming *inc);
+void rds_message_addref(struct rds_message *rm);
+void rds_message_put(struct rds_message *rm);
+void rds_message_wait(struct rds_message *rm);
+void rds_message_unmapped(struct rds_message *rm);
+
+static inline void rds_message_make_checksum(struct rds_header *hdr)
+{
+	hdr->h_csum = 0;	/* the sum is taken with the checksum field zeroed */
+	hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(struct rds_header) / 4);
+}
+
+static inline int rds_message_verify_checksum(const struct rds_header *hdr)
+{
+	return hdr->h_csum == 0 || !ip_fast_csum((void *) hdr, sizeof(*hdr) / 4);
+}
+
+
+/* page.c */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+			     gfp_t gfp);
+int rds_page_copy_user(struct page *page, unsigned long offset,
+		       void __user *ptr, unsigned long bytes,
+		       int to_user);
+#define rds_page_copy_to_user(page, offset, ptr, bytes) \
+	rds_page_copy_user(page, offset, ptr, bytes, 1)
+#define rds_page_copy_from_user(page, offset, ptr, bytes) \
+	rds_page_copy_user(page, offset, ptr, bytes, 0)
+void rds_page_exit(void);
+
+/* recv.c */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+		  __be32 saddr);
+void rds_inc_put(struct rds_incoming *inc);
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+		       struct rds_incoming *inc, gfp_t gfp, enum km_type km);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int msg_flags);
+void rds_clear_recv_queue(struct rds_sock *rs);
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
+void rds_inc_info_copy(struct rds_incoming *inc,
+		       struct rds_info_iterator *iter,
+		       __be32 saddr, __be32 daddr, int flip);
+
+/* send.c */
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t payload_len);
+void rds_send_reset(struct rds_connection *conn);
+int rds_send_xmit(struct rds_connection *conn);
+struct sockaddr_in;
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+			 is_acked_func is_acked);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
+struct rds_message *rds_send_get_message(struct rds_connection *,
+					 struct rm_rdma_op *);
+
+/* rdma.c */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+	if (atomic_dec_and_test(&mr->r_refcount))	/* r_refcount just reached zero */
+		__rds_put_mr_final(mr);
+}
+
+/* stats.c */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+#define rds_stats_inc_which(which, member) do {		\
+	per_cpu(which, get_cpu()).member++;		\
+	put_cpu();					\
+} while (0)
+#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
+#define rds_stats_add_which(which, member, count) do {		\
+	per_cpu(which, get_cpu()).member += count;	\
+	put_cpu();					\
+} while (0)
+#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
+int rds_stats_init(void);
+void rds_stats_exit(void);
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+			 uint64_t *values, const char *const *names,
+			 size_t nr);
+
+/* sysctl.c */
+int rds_sysctl_init(void);
+void rds_sysctl_exit(void);
+extern unsigned long rds_sysctl_sndbuf_min;
+extern unsigned long rds_sysctl_sndbuf_default;
+extern unsigned long rds_sysctl_sndbuf_max;
+extern unsigned long rds_sysctl_reconnect_min_jiffies;
+extern unsigned long rds_sysctl_reconnect_max_jiffies;
+extern unsigned int  rds_sysctl_max_unacked_packets;
+extern unsigned int  rds_sysctl_max_unacked_bytes;
+extern unsigned int  rds_sysctl_ping_enable;
+extern unsigned long rds_sysctl_trace_flags;
+extern unsigned int  rds_sysctl_trace_level;
+
+/* threads.c */
+int rds_threads_init(void);
+void rds_threads_exit(void);
+extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
+void rds_connect_worker(struct work_struct *);
+void rds_shutdown_worker(struct work_struct *);
+void rds_send_worker(struct work_struct *);
+void rds_recv_worker(struct work_struct *);
+void rds_connect_complete(struct rds_connection *conn);
+
+/* transport.c */
+int rds_trans_register(struct rds_transport *trans);
+void rds_trans_unregister(struct rds_transport *trans);
+struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+				       unsigned int avail);
+int rds_trans_init(void);
+void rds_trans_exit(void);
+
+#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 00000000..596689e5
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/in.h>
+
+#include "rds.h"
+
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+		  __be32 saddr)
+{
+	inc->i_conn = conn;			/* owning connection */
+	inc->i_saddr = saddr;			/* source address */
+	inc->i_rdma_cookie = 0;			/* set later from an RDMA_DEST exthdr */
+	INIT_LIST_HEAD(&inc->i_item);		/* linked on no queue yet */
+	atomic_set(&inc->i_refcount, 1);	/* starts with a single reference */
+}
+EXPORT_SYMBOL_GPL(rds_inc_init);
+
+static void rds_inc_addref(struct rds_incoming *inc)
+{
+	rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+	atomic_inc(&inc->i_refcount);	/* one more ref; rds_inc_put() drops it */
+}
+
+void rds_inc_put(struct rds_incoming *inc)
+{
+	rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+	if (!atomic_dec_and_test(&inc->i_refcount))
+		return;
+	/* Last ref dropped: inc must be unlinked before the transport frees it */
+	BUG_ON(!list_empty(&inc->i_item));
+	inc->i_conn->c_trans->inc_free(inc);
+}
+EXPORT_SYMBOL_GPL(rds_inc_put);
+
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+				  struct rds_cong_map *map,
+				  int delta, __be16 port)
+{
+	int now_congested;
+
+	if (delta == 0)
+		return;
+
+	rs->rs_rcv_bytes += delta;	/* callers pass a negative delta on dequeue */
+	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+	  "now_cong %d delta %d\n",
+	  rs, &rs->rs_bound_addr,
+	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+	  rds_sk_rcvbuf(rs), now_congested, delta);
+
+	/* not congested -> congested: set the port's bit and queue an update */
+	if (!rs->rs_congested && now_congested) {
+		rs->rs_congested = 1;
+		rds_cong_set_bit(map, port);
+		rds_cong_queue_updates(map);
+	}
+	/* congested -> not congested */
+	/* Require usage to drop below half the receive buffer before reporting
+	   uncongested, to prevent bouncing cong/uncong state too often */
+	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
+		rs->rs_congested = 0;
+		rds_cong_clear_bit(map, port);
+		rds_cong_queue_updates(map);
+	}
+
+	/* do nothing if no change in cong state; note that sk is never used */
+}
+
+/*
+ * Walk every extension header in inc's RDS header and act on the RDMA ones.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+	struct rds_header *hdr = &inc->i_hdr;
+	unsigned int pos = 0, type, len;
+	union {
+		struct rds_ext_header_version version;
+		struct rds_ext_header_rdma rdma;
+		struct rds_ext_header_rdma_dest rdma_dest;
+	} buffer;	/* scratch copy of one extension header */
+
+	while (1) {
+		len = sizeof(buffer);	/* reset every pass: passed by pointer */
+		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+		if (type == RDS_EXTHDR_NONE)
+			break;
+		/* Act on the two RDMA types; any other type is skipped */
+		switch (type) {
+		case RDS_EXTHDR_RDMA:
+			rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
+			break;
+
+		case RDS_EXTHDR_RDMA_DEST:
+			/* We ignore the size for now. We could stash it
+			 * somewhere and use it for error checking. */
+			inc->i_rdma_cookie = rds_rdma_make_cookie(
+					be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
+					be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
+
+			break;
+		}
+	}
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time.  This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival.  It does mean
+ * that small messages will wait behind large ones.  Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The addresses are passed in rather than read from the conn so that
+ * loopback, which has one conn for both directions, can say which role each
+ * plays.  Only daddr is consulted below; saddr, gfp and km are unused.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+		       struct rds_incoming *inc, gfp_t gfp, enum km_type km)
+{
+	struct rds_sock *rs = NULL;
+	struct sock *sk;
+	unsigned long flags;
+
+	inc->i_conn = conn;
+	inc->i_rx_jiffies = jiffies;
+
+	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+		 "flags 0x%x rx_jiffies %lu\n", conn,
+		 (unsigned long long)conn->c_next_rx_seq,
+		 inc,
+		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+		 be32_to_cpu(inc->i_hdr.h_len),
+		 be16_to_cpu(inc->i_hdr.h_sport),
+		 be16_to_cpu(inc->i_hdr.h_dport),
+		 inc->i_hdr.h_flags,
+		 inc->i_rx_jiffies);
+
+	/*
+	 * Sequence numbers should only increase.  Messages get their
+	 * sequence number as they're queued in a sending conn.  They
+	 * can be dropped, though, if the sending socket is closed before
+	 * they hit the wire.  So sequence numbers can skip forward
+	 * under normal operation.  They can also drop back in the conn
+	 * failover case as previously sent messages are resent down the
+	 * new instance of a conn.  We drop those, otherwise we have
+	 * to assume that the next valid seq does not come after a
+	 * hole in the fragment stream.
+	 *
+	 * The headers don't give us a way to realize if fragments of
+	 * a message have been dropped.  We assume that frags that arrive
+	 * to a flow are part of the current message on the flow that is
+	 * being reassembled.  This means that senders can't drop messages
+	 * from the sending conn until all their frags are sent.
+	 *
+	 * XXX we could spend more on the wire to get more robust failure
+	 * detection, arguably worth it to avoid data corruption.
+	 */
+	if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
+	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+		rds_stats_inc(s_recv_drop_old_seq);
+		goto out;
+	}
+	conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+
+	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+		rds_stats_inc(s_recv_ping);	/* dport 0 is a ping: answer, don't queue */
+		rds_send_pong(conn, inc->i_hdr.h_sport);
+		goto out;
+	}
+
+	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+	if (!rs) {
+		rds_stats_inc(s_recv_drop_no_sock);
+		goto out;
+	}
+
+	/* Process extension headers */
+	rds_recv_incoming_exthdrs(inc, rs);
+
+	/* We can be racing with rds_release() which marks the socket dead. */
+	sk = rds_rs_to_sk(rs);
+
+	/* serialize with rds_release -> sock_orphan */
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+		rds_stats_inc(s_recv_queued);
+		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+				      be32_to_cpu(inc->i_hdr.h_len),
+				      inc->i_hdr.h_dport);
+		rds_inc_addref(inc);	/* reference held by the receive queue */
+		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+		__rds_wake_sk_sleep(sk);
+	} else {
+		rds_stats_inc(s_recv_drop_dead_sock);
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+out:
+	if (rs)
+		rds_sock_put(rs);	/* presumably pairs with a ref from rds_find_bound() */
+}
+EXPORT_SYMBOL_GPL(rds_recv_incoming);
+
+/*
+ * Be very careful here: this is used as the wait_event_*() condition, so it
+ * may run many times.  Once *inc is set it takes no further reference.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+	unsigned long flags;
+
+	if (!*inc) {
+		read_lock_irqsave(&rs->rs_recv_lock, flags);
+		if (!list_empty(&rs->rs_recv_queue)) {
+			*inc = list_entry(rs->rs_recv_queue.next,
+					  struct rds_incoming,
+					  i_item);
+			rds_inc_addref(*inc);	/* caller's ref; inc stays queued */
+		}
+		read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+	}
+
+	return *inc != NULL;
+}
+
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+			    int drop)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	int ret = 0;	/* becomes 1 if inc is still linked on a queue */
+	unsigned long flags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!list_empty(&inc->i_item)) {
+		ret = 1;
+		if (drop) {	/* caller wants it dequeued, not just checked */
+			/* XXX make sure this i_conn is reliable */
+			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+					      -be32_to_cpu(inc->i_hdr.h_len),
+					      inc->i_hdr.h_dport);
+			list_del_init(&inc->i_item);
+			rds_inc_put(inc);	/* drop the queue's reference */
+		}
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+	return ret;
+}
+
+/*
+ * Pull notifications off rs->rs_notify_queue into msghdr as cmsgs; returns 0
+ * or put_cmsg()'s error.  If msghdr is NULL, we will just purge the queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+	struct rds_notifier *notifier;
+	struct rds_rdma_notify cmsg;
+	unsigned int count = 0, max_messages = ~0U;
+	unsigned long flags;
+	LIST_HEAD(copy);
+	int err = 0;
+
+	memset(&cmsg, 0, sizeof(cmsg));	/* zero holes too; "= { 0 }" need not */
+	/* put_cmsg copies to user space and thus may sleep. We can't do this
+	 * with rs_lock held, so first grab as many notifications as we can stuff
+	 * in the user provided cmsg buffer. We don't try to copy more, to avoid
+	 * losing notifications - except when the buffer is so small that it wouldn't
+	 * even hold a single notification. Then we give the caller as much of this
+	 * single msg as we can squeeze in, and set MSG_CTRUNC.
+	 */
+	if (msghdr) {
+		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+		if (!max_messages)
+			max_messages = 1;
+	}
+
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+		notifier = list_entry(rs->rs_notify_queue.next,
+				struct rds_notifier, n_list);
+		list_move(&notifier->n_list, &copy);
+		count++;
+	}
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	if (!count)
+		return 0;
+
+	while (!list_empty(&copy)) {
+		notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+		if (msghdr) {
+			cmsg.user_token = notifier->n_user_token;
+			cmsg.status = notifier->n_status;
+
+			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+				       sizeof(cmsg), &cmsg);
+			if (err)
+				break;
+		}
+
+		list_del_init(&notifier->n_list);
+		kfree(notifier);	/* delivered (or purged): notifier is done */
+	}
+
+	/* If we bailed out because of an error in put_cmsg,
+	 * we may be left with one or more notifications that we
+	 * didn't process. Return them to the head of the list. */
+	if (!list_empty(&copy)) {
+		spin_lock_irqsave(&rs->rs_lock, flags);
+		list_splice(&copy, &rs->rs_notify_queue);
+		spin_unlock_irqrestore(&rs->rs_lock, flags);
+	}
+
+	return err;
+}
+
+/*
+ * Hand the pending rs_cong_notify bits to the caller as a CONG_UPDATE cmsg
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+	uint64_t notify = rs->rs_cong_notify;	/* snapshot taken without rs_lock */
+	unsigned long flags;
+	int err;
+
+	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+			sizeof(notify), &notify);
+	if (err)
+		return err;
+
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	rs->rs_cong_notify &= ~notify;	/* clear only the bits just reported */
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Attach the control messages that belong to inc; 0 or put_cmsg()'s error.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+	int err;
+
+	/* The RDMA destination cookie is the only cmsg produced here. */
+	if (!inc->i_rdma_cookie)
+		return 0;
+
+	err = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+		       sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+
+	return err;
+}
+
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int msg_flags)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	long timeo;
+	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+	struct sockaddr_in *sin;
+	struct rds_incoming *inc = NULL;	/* holds our own ref once set */
+
+	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+	msg->msg_namelen = 0;	/* no address stored yet: never report a stale length */
+	if (msg_flags & MSG_OOB)
+		goto out;
+
+	while (1) {
+		/* If there are pending notifications, do those - and nothing else */
+		if (!list_empty(&rs->rs_notify_queue)) {
+			ret = rds_notify_queue_get(rs, msg);
+			break;
+		}
+
+		if (rs->rs_cong_notify) {
+			ret = rds_notify_cong(rs, msg);
+			break;
+		}
+
+		if (!rds_next_incoming(rs, &inc)) {
+			if (nonblock) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+					(!list_empty(&rs->rs_notify_queue) ||
+					 rs->rs_cong_notify ||
+					 rds_next_incoming(rs, &inc)), timeo);
+			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+				 timeo);
+			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+				continue;
+
+			ret = timeo;	/* negative: interrupted; zero: timed out */
+			if (ret == 0)
+				ret = -ETIMEDOUT;
+			break;
+		}
+
+		rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+			 &inc->i_conn->c_faddr,
+			 ntohs(inc->i_hdr.h_sport));
+		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+							     size);
+		if (ret < 0)
+			break;
+
+		/*
+		 * if the message we just copied isn't at the head of the
+		 * recv queue then someone else raced us to return it, try
+		 * to get the next message.
+		 */
+		if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+			rds_inc_put(inc);
+			inc = NULL;
+			rds_stats_inc(s_recv_deliver_raced);
+			continue;
+		}
+
+		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+			if (msg_flags & MSG_TRUNC)
+				ret = be32_to_cpu(inc->i_hdr.h_len);
+			msg->msg_flags |= MSG_TRUNC;
+		}
+
+		if (rds_cmsg_recv(inc, msg)) {
+			ret = -EFAULT;
+			break;	/* not "goto out": our ref on inc must still be put */
+		}
+
+		rds_stats_inc(s_recv_delivered);
+		sin = (struct sockaddr_in *)msg->msg_name;
+		if (sin) {
+			sin->sin_family = AF_INET;
+			sin->sin_port = inc->i_hdr.h_sport;
+			sin->sin_addr.s_addr = inc->i_saddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+			msg->msg_namelen = sizeof(*sin);
+		}
+		break;
+	}
+
+	if (inc)
+		rds_inc_put(inc);
+
+out:
+	return ret;
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg.  The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	struct rds_incoming *inc, *tmp;
+	unsigned long flags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
+		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+				      -be32_to_cpu(inc->i_hdr.h_len),
+				      inc->i_hdr.h_dport);
+		list_del_init(&inc->i_item);	/* rds_inc_put() BUGs if still linked */
+		rds_inc_put(inc);	/* drop the queue's reference */
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * Fill an rds_info_message for inc and copy it to iter; flip swaps the roles.
+ * inc->i_saddr isn't used here: it is only set in the receive path.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+		       struct rds_info_iterator *iter,
+		       __be32 saddr, __be32 daddr, int flip)
+{
+	struct rds_info_message minfo;
+
+	memset(&minfo, 0, sizeof(minfo));	/* every byte is copied out below */
+	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+	if (flip) {
+		minfo.laddr = daddr;
+		minfo.faddr = saddr;
+		minfo.lport = inc->i_hdr.h_dport;
+		minfo.fport = inc->i_hdr.h_sport;
+	} else {
+		minfo.laddr = saddr;
+		minfo.faddr = daddr;
+		minfo.lport = inc->i_hdr.h_sport;
+		minfo.fport = inc->i_hdr.h_dport;
+	}
+
+	rds_info_copy(iter, &minfo, sizeof(minfo));
+}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 00000000..c803341f
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1133 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+
+#include "rds.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the soft lockup
+ * watchdog will kick our shin.  Also, it seems fairer to not let one busy
+ * connection stall all the others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ * NOTE(review): rds_send_xmit() below never reads send_batch_count.
+ */
+static int send_batch_count = 64;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+
+static void rds_send_remove_from_sock(struct list_head *messages, int status);
+
+/*
+ * Reset the send state.  Callers must ensure that this doesn't race with
+ * rds_send_xmit().
+ */
+void rds_send_reset(struct rds_connection *conn)
+{
+	struct rds_message *rm, *tmp;
+	unsigned long flags;
+
+	if (conn->c_xmit_rm) {
+		rm = conn->c_xmit_rm;
+		conn->c_xmit_rm = NULL;
+		/* Tell the user the RDMA op is no longer mapped by the
+		 * transport. This isn't entirely true (it's flushed out
+		 * independently) but as the connection is down, there's
+		 * no ongoing RDMA to/from that memory */
+		rds_message_unmapped(rm);
+		rds_message_put(rm);	/* drop the ref c_xmit_rm was holding */
+	}
+
+	conn->c_xmit_sg = 0;
+	conn->c_xmit_hdr_off = 0;
+	conn->c_xmit_data_off = 0;
+	conn->c_xmit_atomic_sent = 0;
+	conn->c_xmit_rdma_sent = 0;
+	conn->c_xmit_data_sent = 0;
+
+	conn->c_map_queued = 0;
+
+	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+	/* Mark messages as retransmissions, and move them to the send q */
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+		set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+	}
+	list_splice_init(&conn->c_retrans, &conn->c_send_queue);	/* to the front */
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+}
+
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+	return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;	/* 1 iff we set it */
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+	clear_bit(RDS_IN_XMIT, &conn->c_flags);
+	smp_mb__after_clear_bit();	/* clear is ordered before the waiter check */
+	/*
+	 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+	 * hot path and finding waiters is very rare.  We don't want to walk
+	 * the system-wide hashed waitqueue buckets in the fast path only to
+	 * almost never find waiters.
+	 */
+	if (waitqueue_active(&conn->c_waitq))
+		wake_up_all(&conn->c_waitq);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ *   Pro:
+ *      - tx queueing is a simple fifo list
+ *      - reassembly is optional and easily done by transports per conn
+ *      - no per flow rx lookup at all, straight to the socket
+ *      - less per-frag memory and wire overhead
+ *   Con:
+ *      - queued acks can be delayed behind large messages
+ *   Depends:
+ *      - small message latency is higher behind queued large messages
+ *      - large message latency isn't starved by intervening small sends
+ */
+int rds_send_xmit(struct rds_connection *conn)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	unsigned int tmp;
+	struct scatterlist *sg;
+	int ret = 0;
+	LIST_HEAD(to_be_dropped);
+
+restart:
+
+	/*
+	 * sendmsg calls here after having queued its message on the send
+	 * queue.  We only have one task feeding the connection at a time.  If
+	 * another thread is already feeding the queue then we back off.  This
+	 * avoids blocking the caller and trading per-connection data between
+	 * caches per message.
+	 */
+	if (!acquire_in_xmit(conn)) {
+		rds_stats_inc(s_send_lock_contention);
+		ret = -ENOMEM;	/* reported on contention; nothing is allocated here */
+		goto out;
+	}
+
+	/*
+	 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+	 * we do the opposite to avoid races.
+	 */
+	if (!rds_conn_up(conn)) {
+		release_in_xmit(conn);
+		ret = 0;
+		goto out;
+	}
+
+	if (conn->c_trans->xmit_prepare)
+		conn->c_trans->xmit_prepare(conn);
+
+	/*
+	 * spin trying to push headers and data down the connection until
+	 * the connection doesn't make forward progress.
+	 */
+	while (1) {
+
+		rm = conn->c_xmit_rm;
+
+		/*
+		 * If between sending messages, we can send a pending congestion
+		 * map update.
+		 */
+		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+			rm = rds_cong_update_alloc(conn);
+			if (IS_ERR(rm)) {
+				ret = PTR_ERR(rm);
+				break;
+			}
+			rm->data.op_active = 1;
+
+			conn->c_xmit_rm = rm;
+		}
+
+		/*
+		 * If not already working on one, grab the next message.
+		 *
+		 * c_xmit_rm holds a ref while we're sending this message down
+		 * the connection.  We can use this ref while holding the
+		 * send_sem.. rds_send_reset() is serialized with it.
+		 */
+		if (!rm) {
+			unsigned int len;
+
+			spin_lock_irqsave(&conn->c_lock, flags);
+
+			if (!list_empty(&conn->c_send_queue)) {
+				rm = list_entry(conn->c_send_queue.next,
+						struct rds_message,
+						m_conn_item);
+				rds_message_addref(rm);
+
+				/*
+				 * Move the message from the send queue to the retransmit
+				 * list right away.
+				 */
+				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+			}
+
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+
+			if (!rm)
+				break;
+
+			/* Unfortunately, the way Infiniband deals with
+			 * RDMA to a bad MR key is by moving the entire
+			 * queue pair to error state. We could possibly
+			 * recover from that, but right now we drop the
+			 * connection.
+			 * Therefore, we never retransmit messages with RDMA ops.
+			 */
+			if (rm->rdma.op_active &&
+			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+				spin_lock_irqsave(&conn->c_lock, flags);
+				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+					list_move(&rm->m_conn_item, &to_be_dropped);
+				spin_unlock_irqrestore(&conn->c_lock, flags);
+				continue;
+			}
+
+			/* Require an ACK every once in a while */
+			len = ntohl(rm->m_inc.i_hdr.h_len);
+			if (conn->c_unacked_packets == 0 ||
+			    conn->c_unacked_bytes < len) {
+				__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+				conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+				conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+				rds_stats_inc(s_send_ack_required);
+			} else {
+				conn->c_unacked_bytes -= len;
+				conn->c_unacked_packets--;
+			}
+
+			conn->c_xmit_rm = rm;
+		}
+
+		/* The transport either sends the whole rdma or none of it */
+		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			rm->m_final_op = &rm->rdma;
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+			if (ret)
+				break;
+			conn->c_xmit_rdma_sent = 1;
+
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			rm->m_final_op = &rm->atomic;
+			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+			if (ret)
+				break;
+			conn->c_xmit_atomic_sent = 1;
+
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		/*
+		 * A number of cases require an RDS header to be sent
+		 * even if there is no data.
+		 * We permit 0-byte sends; rds-ping depends on this.
+		 * However, if there are exclusively attached silent ops,
+		 * we skip the hdr/data send, to enable silent operation.
+		 */
+		if (rm->data.op_nents == 0) {
+			int ops_present;
+			int all_ops_are_silent = 1;
+
+			ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+			if (rm->atomic.op_active && !rm->atomic.op_silent)
+				all_ops_are_silent = 0;
+			if (rm->rdma.op_active && !rm->rdma.op_silent)
+				all_ops_are_silent = 0;
+
+			if (ops_present && all_ops_are_silent
+			    && !rm->m_rdma_cookie)
+				rm->data.op_active = 0;
+		}
+
+		if (rm->data.op_active && !conn->c_xmit_data_sent) {
+			rm->m_final_op = &rm->data;
+			ret = conn->c_trans->xmit(conn, rm,
+						  conn->c_xmit_hdr_off,
+						  conn->c_xmit_sg,
+						  conn->c_xmit_data_off);
+			if (ret <= 0)
+				break;
+
+			if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+				tmp = min_t(int, ret,
+					    sizeof(struct rds_header) -
+					    conn->c_xmit_hdr_off);
+				conn->c_xmit_hdr_off += tmp;
+				ret -= tmp;
+			}
+
+			sg = &rm->data.op_sg[conn->c_xmit_sg];
+			while (ret) {
+				tmp = min_t(int, ret, sg->length -
+						      conn->c_xmit_data_off);
+				conn->c_xmit_data_off += tmp;
+				ret -= tmp;
+				if (conn->c_xmit_data_off == sg->length) {
+					conn->c_xmit_data_off = 0;
+					sg++;
+					conn->c_xmit_sg++;
+					BUG_ON(ret != 0 &&
+					       conn->c_xmit_sg == rm->data.op_nents);
+				}
+			}
+
+			if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+			    (conn->c_xmit_sg == rm->data.op_nents))
+				conn->c_xmit_data_sent = 1;
+		}
+
+		/*
+		 * A rm will only take multiple times through this loop
+		 * if there is a data op. Thus, if the data is sent (or there was
+		 * none), then we're done with the rm.
+		 */
+		if (!rm->data.op_active || conn->c_xmit_data_sent) {
+			conn->c_xmit_rm = NULL;
+			conn->c_xmit_sg = 0;
+			conn->c_xmit_hdr_off = 0;
+			conn->c_xmit_data_off = 0;
+			conn->c_xmit_rdma_sent = 0;
+			conn->c_xmit_atomic_sent = 0;
+			conn->c_xmit_data_sent = 0;
+
+			rds_message_put(rm);
+		}
+	}
+
+	if (conn->c_trans->xmit_complete)
+		conn->c_trans->xmit_complete(conn);
+
+	release_in_xmit(conn);
+
+	/* Nuke any messages we decided not to retransmit. */
+	if (!list_empty(&to_be_dropped)) {
+		/* irqs on here, so we can put(), unlike above */
+		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+			rds_message_put(rm);
+		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+	}
+
+	/*
+	 * Other senders can queue a message after we last test the send queue
+	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
+	 * not try and send their newly queued message.  We need to check the
+	 * send queue after having cleared RDS_IN_XMIT so that their message
+	 * doesn't get stuck on the send queue.
+	 *
+	 * If the transport cannot continue (i.e ret != 0), then it must
+	 * call us when more room is available, such as from the tx
+	 * completion handler.
+	 */
+	if (ret == 0) {
+		smp_mb();
+		if (!list_empty(&conn->c_send_queue)) {
+			rds_stats_inc(s_send_lock_queue_raced);
+			goto restart;
+		}
+	}
+out:
+	return ret;
+}
+
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	assert_spin_locked(&rs->rs_lock);	/* caller must hold rs_lock */
+
+	BUG_ON(rs->rs_snd_bytes < len);	/* the subtraction must not underflow */
+	rs->rs_snd_bytes -= len;
+
+	if (rs->rs_snd_bytes == 0)
+		rds_stats_inc(s_send_queue_empty);
+}
+
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+				    is_acked_func is_acked)
+{
+	if (!is_acked)	/* no callback supplied: compare sequence numbers */
+		return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+	return is_acked(rm, ack);
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+	struct rds_sock *rs = NULL;
+	struct rm_rdma_op *ro;
+	struct rds_notifier *notifier;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+	ro = &rm->rdma;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+	    ro->op_active && ro->op_notify && ro->op_notifier) {
+		notifier = ro->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));	/* released after the wakeup below */
+
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ro->op_notifier = NULL;	/* the notifier now sits on rs_notify_queue */
+	}
+
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+
+/*
+ * Same as rds_rdma_send_complete(), except it looks at the atomic op
+ */
+void rds_atomic_send_complete(struct rds_message *rm, int status)
+{
+	struct rds_sock *rs = NULL;
+	struct rm_atomic_op *ao;
+	struct rds_notifier *notifier;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+	ao = &rm->atomic;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+	    && ao->op_active && ao->op_notify && ao->op_notifier) {
+		notifier = ao->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));	/* released after the wakeup below */
+
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ao->op_notifier = NULL;	/* the notifier now sits on rs_notify_queue */
+	}
+
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the rdma and atomic notifiers.
+ */
+static inline void
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+	struct rm_rdma_op *ro;
+	struct rm_atomic_op *ao;
+
+	ro = &rm->rdma;
+	if (ro->op_active && ro->op_notify && ro->op_notifier) {
+		ro->op_notifier->n_status = status;
+		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+		ro->op_notifier = NULL;	/* handed over to rs_notify_queue */
+	}
+
+	ao = &rm->atomic;
+	if (ao->op_active && ao->op_notify && ao->op_notifier) {
+		ao->op_notifier->n_status = status;
+		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+		ao->op_notifier = NULL;	/* handed over to rs_notify_queue */
+	}
+
+	/* No need to wake the app - caller does this */
+}
+
+/*
+ * Return, with an extra reference, the message on conn's retransmit or send
+ * list whose rdma op is op, or NULL.  Called from the IB send completion on
+ * an RDMA op that failed with remote access error, so speed is no issue.
+ */
+struct rds_message *rds_send_get_message(struct rds_connection *conn,
+					 struct rm_rdma_op *op)
+{
+	struct rds_message *rm, *tmp, *found = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		if (&rm->rdma == op) {
+			atomic_inc(&rm->m_refcount);	/* ref handed to the caller */
+			found = rm;
+			goto out;
+		}
+	}
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+		if (&rm->rdma == op) {
+			atomic_inc(&rm->m_refcount);	/* ref handed to the caller */
+			found = rm;
+			break;
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(rds_send_get_message);
+
+/*
+ * This removes messages from the socket's list if they're on it.  The list
+ * argument must be private to the caller, we must be able to modify it
+ * without locks.  The messages must have a reference held for their
+ * position on the list.  This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of if it found
+ * the messages on the socket list or not.
+ *
+ * @messages: caller-private list of messages linked through m_conn_item
+ * @status:   completion status stored in a queued RDMA notifier that does
+ *            not already carry a non-zero status
+ */
+static void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+	unsigned long flags;
+	/* socket of the most recently handled message; held via sock_hold() */
+	struct rds_sock *rs = NULL;
+	struct rds_message *rm;
+
+	while (!list_empty(messages)) {
+		int was_on_sock = 0;
+
+		rm = list_entry(messages->next, struct rds_message,
+				m_conn_item);
+		list_del_init(&rm->m_conn_item);
+
+		/*
+		 * If we see this flag cleared then we're *sure* that someone
+		 * else beat us to removing it from the sock.  If we race
+		 * with their flag update we'll get the lock and then really
+		 * see that the flag has been cleared.
+		 *
+		 * The message spinlock makes sure nobody clears rm->m_rs
+		 * while we're messing with it. It does not prevent the
+		 * message from being removed from the socket, though.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
+		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+			goto unlock_and_drop;
+
+		/*
+		 * Consecutive messages for the same socket share one
+		 * hold/wake/put cycle; only switch when the socket changes.
+		 */
+		if (rs != rm->m_rs) {
+			if (rs) {
+				rds_wake_sk_sleep(rs);
+				sock_put(rds_rs_to_sk(rs));
+			}
+			rs = rm->m_rs;
+			sock_hold(rds_rs_to_sk(rs));
+		}
+		spin_lock(&rs->rs_lock);
+
+		/* re-check under rs_lock: a racing remover may have won */
+		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+			struct rm_rdma_op *ro = &rm->rdma;
+			struct rds_notifier *notifier;
+
+			list_del_init(&rm->m_sock_item);
+			rds_send_sndbuf_remove(rs, rm);
+
+			/*
+			 * Queue the RDMA notifier if one was requested, or if
+			 * only error reports were requested and status is
+			 * non-zero.
+			 */
+			if (ro->op_active && ro->op_notifier &&
+			       (ro->op_notify || (ro->op_recverr && status))) {
+				notifier = ro->op_notifier;
+				list_add_tail(&notifier->n_list,
+						&rs->rs_notify_queue);
+				/* keep a status that was already recorded */
+				if (!notifier->n_status)
+					notifier->n_status = status;
+				rm->rdma.op_notifier = NULL;
+			}
+			was_on_sock = 1;
+			rm->m_rs = NULL;
+		}
+		spin_unlock(&rs->rs_lock);
+
+unlock_and_drop:
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+		/* reference held for the position on 'messages' */
+		rds_message_put(rm);
+		/* reference held for the position on the socket's list */
+		if (was_on_sock)
+			rds_message_put(rm);
+	}
+
+	/* wake and release the last socket we were holding, if any */
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number.  Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction.  Maybe it should bail if it sees SOCK_DEAD.
+ *
+ * @conn:     connection whose retransmit queue is trimmed
+ * @ack:      sequence number reported by the transport
+ * @is_acked: transport callback handed through to rds_send_is_acked()
+ */
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+			 is_acked_func is_acked)
+{
+	struct rds_message *rm, *tmp;
+	unsigned long flags;
+	/* private list: acked messages are collected here under c_lock */
+	LIST_HEAD(list);
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+
+	/*
+	 * Stop at the first message that is not acked; only the leading
+	 * run of acked messages is taken off the retransmit queue.
+	 */
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		if (!rds_send_is_acked(rm, ack, is_acked))
+			break;
+
+		list_move(&rm->m_conn_item, &list);
+		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+	}
+
+	/* order flag updates with spin locks */
+	if (!list_empty(&list))
+		smp_mb__after_clear_bit();
+
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	/* now remove the messages from the sock list as needed */
+	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
+}
+EXPORT_SYMBOL_GPL(rds_send_drop_acked);
+
+/*
+ * Drop messages queued on socket @rs.  With @dest set, only messages whose
+ * destination address and port match @dest are dropped; with @dest NULL
+ * every message on the socket's send queue is dropped.
+ *
+ * Each dropped message is taken off the socket, then off its connection
+ * (if it is still there), has any pending notifiers completed with
+ * RDS_RDMA_CANCELED, and is finally waited on and released.
+ */
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+{
+	struct rds_message *rm, *tmp;
+	struct rds_connection *conn;
+	unsigned long flags;
+	/* private list of the messages being dropped, linked by m_sock_item */
+	LIST_HEAD(list);
+
+	/* get all the messages we're dropping under the rs lock */
+	spin_lock_irqsave(&rs->rs_lock, flags);
+
+	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
+		if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
+			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
+			continue;
+
+		list_move(&rm->m_sock_item, &list);
+		rds_send_sndbuf_remove(rs, rm);
+		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+	}
+
+	/* order flag updates with the rs lock */
+	smp_mb__after_clear_bit();
+
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	if (list_empty(&list))
+		return;
+
+	/* Remove the messages from the conn */
+	list_for_each_entry(rm, &list, m_sock_item) {
+
+		conn = rm->m_inc.i_conn;
+
+		spin_lock_irqsave(&conn->c_lock, flags);
+		/*
+		 * Maybe someone else beat us to removing rm from the conn.
+		 * If we race with their flag update we'll get the lock and
+		 * then really see that the flag has been cleared.
+		 */
+		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+			continue;
+		}
+		list_del_init(&rm->m_conn_item);
+		spin_unlock_irqrestore(&conn->c_lock, flags);
+
+		/*
+		 * Couldn't grab m_rs_lock in top loop (lock ordering),
+		 * but we can now.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+		/* notifiers are queued on rs under rs_lock, status CANCELED */
+		spin_lock(&rs->rs_lock);
+		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		spin_unlock(&rs->rs_lock);
+
+		rm->m_rs = NULL;
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+		/* presumably the reference held for the conn list - confirm */
+		rds_message_put(rm);
+	}
+
+	/* let waiters on the socket see the queued notifiers / freed space */
+	rds_wake_sk_sleep(rs);
+
+	while (!list_empty(&list)) {
+		rm = list_entry(list.next, struct rds_message, m_sock_item);
+		list_del_init(&rm->m_sock_item);
+
+		/* wait on the message before dropping the last reference here */
+		rds_message_wait(rm);
+		rds_message_put(rm);
+	}
+}
+
+/*
+ * we only want this to fire once so we use the callers 'queued'.  It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDS_CANCEL_SENT_TO.
+ *
+ * Try to queue @rm on both the socket @rs and the connection @conn.
+ *
+ * @sport, @dport: ports written into the message header
+ * @queued:        in/out flag; once set, further calls do nothing
+ *
+ * Returns the value of *queued: non-zero once the message has been queued,
+ * zero if the socket's send buffer had no room.
+ */
+static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+			     struct rds_message *rm, __be16 sport,
+			     __be16 dport, int *queued)
+{
+	unsigned long flags;
+	u32 len;
+
+	if (*queued)
+		goto out;
+
+	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	/* this is the only place which holds both the socket's rs_lock
+	 * and the connection's c_lock */
+	spin_lock_irqsave(&rs->rs_lock, flags);
+
+	/*
+	 * If there is a little space in sndbuf, we don't queue anything,
+	 * and userspace gets -EAGAIN. But poll() indicates there's send
+	 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
+	 * freed up by incoming acks. So we check the *old* value of
+	 * rs_snd_bytes here to allow the last msg to exceed the buffer,
+	 * and poll() now knows no more data can be sent.
+	 */
+	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
+		rs->rs_snd_bytes += len;
+
+		/* let recv side know we are close to send space exhaustion.
+		 * This is probably not the optimal way to do it, as this
+		 * means we set the flag on *all* messages as soon as our
+		 * throughput hits a certain threshold.
+		 */
+		if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
+			__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+		/* socket side: link, flag, and take a reference for the list */
+		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
+		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+		rds_message_addref(rm);
+		rm->m_rs = rs;
+
+		/* The code ordering is a little weird, but we're
+		   trying to minimize the time we hold c_lock */
+		rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
+		rm->m_inc.i_conn = conn;
+		/* second reference, for the connection's send queue */
+		rds_message_addref(rm);
+
+		/* the sequence number is assigned under c_lock */
+		spin_lock(&conn->c_lock);
+		rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
+		list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+		set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+		spin_unlock(&conn->c_lock);
+
+		rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
+			 rm, len, rs, rs->rs_snd_bytes,
+			 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
+
+		*queued = 1;
+	}
+
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+out:
+	return *queued;
+}
+
+/*
+ * An rds_message and everything that hangs off it is allocated in one go,
+ * so the extra space has to be worked out up front.  Walk the SOL_RDS
+ * control messages and the payload length and return the number of extra
+ * bytes required, or a negative errno for a malformed or unsupported
+ * control message.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+	struct cmsghdr *cmsg;
+	int total = 0;
+	int groups = 0;
+	int extra;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		/* control messages for other levels are not ours to size */
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_DEST:
+		case RDS_CMSG_RDMA_MAP:
+			/* these are valid but do not add any size */
+			groups |= 2;
+			break;
+
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			groups |= 1;
+			total += sizeof(struct scatterlist);
+			break;
+
+		case RDS_CMSG_RDMA_ARGS:
+			groups |= 1;
+			extra = rds_rdma_extra_size(CMSG_DATA(cmsg));
+			if (extra < 0)
+				return extra;
+			total += extra;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+	if (groups == 3)
+		return -EINVAL;
+
+	/* one scatterlist entry per page of payload */
+	return total + ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+}
+
+/*
+ * Hand every SOL_RDS control message in @msg to its handler.  Control
+ * messages for other levels are skipped.  *allocated_mr is set to 1 when
+ * an RDMA_MAP control message was processed successfully.
+ *
+ * Returns 0 when all control messages were handled, -EINVAL for a
+ * malformed or unknown control message, or the first non-zero value
+ * returned by a handler.
+ */
+static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
+			 struct msghdr *msg, int *allocated_mr)
+{
+	struct cmsghdr *cmsg;
+	int err;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			err = rds_cmsg_rdma_args(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+			err = rds_cmsg_rdma_dest(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_MAP:
+			err = rds_cmsg_rdma_map(rs, rm, cmsg);
+			if (err == 0)
+				*allocated_mr = 1;
+			break;
+
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			err = rds_cmsg_atomic(rs, rm, cmsg);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+		/* first failing handler ends the walk */
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * sendmsg() entry point for RDS sockets.
+ *
+ * Builds an rds_message from the user's payload and control messages,
+ * queues it on both the socket and the connection to the destination, and
+ * kicks the transmit path.
+ *
+ * @iocb:        not referenced by this function
+ * @sock:        the sending socket
+ * @msg:         user message: optional destination in msg_name, flags,
+ *               iovec and control messages
+ * @payload_len: number of payload bytes to send
+ *
+ * Returns @payload_len once the message has been queued, or a negative
+ * errno.  On every failure path the message reference is dropped and an MR
+ * allocated on the fly by an RDMA_MAP control message is released again.
+ */
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t payload_len)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+	__be32 daddr;
+	__be16 dport;
+	struct rds_message *rm = NULL;
+	struct rds_connection *conn;
+	int ret = 0;
+	int queued = 0, allocated_mr = 0;
+	int nonblock = msg->msg_flags & MSG_DONTWAIT;
+	long timeo = sock_sndtimeo(sk, nonblock);
+
+	/* Mirror Linux UDP mirror of BSD error message compatibility */
+	/* XXX: Perhaps MSG_MORE someday */
+	/* only MSG_DONTWAIT and MSG_CMSG_COMPAT are accepted */
+	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* explicit destination in msg_name, else the connect()ed one */
+	if (msg->msg_namelen) {
+		/* XXX fail non-unicast destination IPs? */
+		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+			ret = -EINVAL;
+			goto out;
+		}
+		daddr = usin->sin_addr.s_addr;
+		dport = usin->sin_port;
+	} else {
+		/* We only care about consistency with ->connect() */
+		lock_sock(sk);
+		daddr = rs->rs_conn_addr;
+		dport = rs->rs_conn_port;
+		release_sock(sk);
+	}
+
+	/* racing with another thread binding seems ok here */
+	if (daddr == 0 || rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	/* size of rm including all sgs */
+	ret = rds_rm_size(msg, payload_len);
+	if (ret < 0)
+		goto out;
+
+	rm = rds_message_alloc(ret, GFP_KERNEL);
+	if (!rm) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Attach data to the rm */
+	if (payload_len) {
+		/* one scatterlist entry per page of payload */
+		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		if (!rm->data.op_sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+		if (ret)
+			goto out;
+	}
+	/* the data op is marked active even for a zero-length payload */
+	rm->data.op_active = 1;
+
+	rm->m_daddr = daddr;
+
+	/* rds_conn_create has a spinlock that runs with IRQ off.
+	 * Caching the conn in the socket helps a lot. */
+	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+		conn = rs->rs_conn;
+	else {
+		conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+					rs->rs_transport,
+					sock->sk->sk_allocation);
+		if (IS_ERR(conn)) {
+			ret = PTR_ERR(conn);
+			goto out;
+		}
+		rs->rs_conn = conn;
+	}
+
+	/* Parse any control messages the user may have included. */
+	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+	if (ret)
+		goto out;
+
+	/* refuse RDMA ops on a transport without an xmit_rdma method */
+	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+			       &rm->rdma, conn->c_trans->xmit_rdma);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* likewise for atomic ops and xmit_atomic */
+	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+			       &rm->atomic, conn->c_trans->xmit_atomic);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rds_conn_connect_if_down(conn);
+
+	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+	if (ret) {
+		rs->rs_seen_congestion = 1;
+		goto out;
+	}
+
+	/*
+	 * rds_send_queue_rm() returns 0 while the send buffer is full.
+	 * Either fail straight away (oversized message, non-blocking
+	 * socket) or sleep until it succeeds, the timeout runs out or the
+	 * wait is interrupted.
+	 */
+	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+				  dport, &queued)) {
+		rds_stats_inc(s_send_queue_full);
+		/* XXX make sure this is reasonable */
+		if (payload_len > rds_sk_sndbuf(rs)) {
+			ret = -EMSGSIZE;
+			goto out;
+		}
+		if (nonblock) {
+			ret = -EAGAIN;
+			goto out;
+		}
+
+		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+					rds_send_queue_rm(rs, conn, rm,
+							  rs->rs_bound_port,
+							  dport,
+							  &queued),
+					timeo);
+		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+		/* time left (or no timeout at all): re-check via the loop */
+		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+			continue;
+
+		/* timeo <= 0: zero means timed out, negative is the error */
+		ret = timeo;
+		if (ret == 0)
+			ret = -ETIMEDOUT;
+		goto out;
+	}
+
+	/*
+	 * By now we've committed to the send.  We reuse rds_send_worker()
+	 * to retry sends in the rds thread if the transport asks us to.
+	 */
+	rds_stats_inc(s_send_queued);
+
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_xmit(conn);
+
+	/* drop this function's reference; the queues hold their own */
+	rds_message_put(rm);
+	return payload_len;
+
+out:
+	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+	 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+	 * or in any other way, we need to destroy the MR again */
+	if (allocated_mr)
+		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+	if (rm)
+		rds_message_put(rm);
+	return ret;
+}
+
+/*
+ * Reply to a ping packet.
+ *
+ * Queues an empty message (no payload, source port 0) for @dport on
+ * @conn's send queue and kicks the transmit path.
+ *
+ * Returns 0 on success, -ENOMEM if no message could be allocated, or the
+ * error from the non-blocking congestion check.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	int ret = 0;
+
+	/* no extra space needed; GFP_ATOMIC, so the allocation won't sleep */
+	rm = rds_message_alloc(0, GFP_ATOMIC);
+	if (!rm) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rm->m_daddr = conn->c_faddr;
+	rm->data.op_active = 1;
+
+	rds_conn_connect_if_down(conn);
+
+	/* nonblock = 1 and no socket: fail instead of waiting */
+	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+	if (ret)
+		goto out;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+	/* reference for the position on the connection's send queue */
+	rds_message_addref(rm);
+	rm->m_inc.i_conn = conn;
+
+	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+				    conn->c_next_tx_seq);
+	conn->c_next_tx_seq++;
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	rds_stats_inc(s_send_queued);
+	rds_stats_inc(s_send_pong);
+
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_xmit(conn);
+
+	/* drop the allocation reference; the queue keeps its own */
+	rds_message_put(rm);
+	return 0;
+
+out:
+	if (rm)
+		rds_message_put(rm);
+	return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 00000000..10c759cc
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+/* Per-cpu RDS counters; rds_stats_info() sums them over the online cpus. */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
+
+/*
+ * Names reported for the counters in struct rds_statistics.
+ *
+ * rds_stats_info() treats struct rds_statistics as a flat array of
+ * uint64_t and labels element i with rds_stat_names[i], so the order here
+ * has to follow the order of the struct's fields.
+ *
+ * The vi command below looks like the substitution that turns the struct's
+ * field declarations into this list:
+ */
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+static const char *const rds_stat_names[] = {
+	"conn_reset",
+	"recv_drop_bad_checksum",
+	"recv_drop_old_seq",
+	"recv_drop_no_sock",
+	"recv_drop_dead_sock",
+	"recv_deliver_raced",
+	"recv_delivered",
+	"recv_queued",
+	"recv_immediate_retry",
+	"recv_delayed_retry",
+	"recv_ack_required",
+	"recv_rdma_bytes",
+	"recv_ping",
+	"send_queue_empty",
+	"send_queue_full",
+	"send_lock_contention",
+	"send_lock_queue_raced",
+	"send_immediate_retry",
+	"send_delayed_retry",
+	"send_drop_acked",
+	"send_ack_required",
+	"send_queued",
+	"send_rdma",
+	"send_rdma_bytes",
+	"send_pong",
+	"page_remainder_hit",
+	"page_remainder_miss",
+	"copy_to_user",
+	"copy_from_user",
+	"cong_update_queued",
+	"cong_update_received",
+	"cong_send_error",
+	"cong_send_blocked",
+};
+
+/*
+ * Emit @nr counters through @iter, one struct rds_info_counter per entry.
+ *
+ * @iter:   destination iterator handed to rds_info_copy()
+ * @values: counter values, @nr entries
+ * @names:  NUL-terminated counter names, @nr entries; each must be shorter
+ *          than the name field of struct rds_info_counter (enforced with
+ *          BUG_ON)
+ * @nr:     number of counters to emit
+ */
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+			 uint64_t *values, const char *const *names, size_t nr)
+{
+	struct rds_info_counter ctr;
+	size_t i;
+
+	for (i = 0; i < nr; i++) {
+		BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
+		/*
+		 * Copy across the full width of ctr.name.  The BUG_ON above
+		 * guarantees the name is shorter than the field, so strncpy()
+		 * NUL-pads every remaining byte.  Copying only size - 1 bytes
+		 * left the last byte of the on-stack buffer uninitialised,
+		 * which both dropped the terminator for a maximum-length name
+		 * and copied a byte of stack out through rds_info_copy().
+		 */
+		strncpy(ctr.name, names[i], sizeof(ctr.name));
+		ctr.value = values[i];
+
+		rds_info_copy(iter, &ctr, sizeof(ctr));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_stats_info_copy);
+
+/*
+ * RDS_INFO_COUNTERS handler: reports the global counters, summed over all
+ * online cpus, followed by whatever the transports report.  Counter names
+ * are copied out alongside the values so the reading tool needs no
+ * built-in knowledge of which stats exist; they are implementation
+ * dependent and may change over time.
+ *
+ * This is the only function in the chain that deals with the byte-granular
+ * length supplied by userspace.  It converts @len into a number of counter
+ * entries, which is the unit every function below it works in.  If the
+ * buffer cannot hold all the global counters nothing is copied, but
+ * @lens still reports the total so the caller can size a retry.
+ */
+static void rds_stats_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	const size_t nr_names = ARRAY_SIZE(rds_stat_names);
+	struct rds_statistics totals = {0, };
+	uint64_t *sum = (uint64_t *)&totals;
+	unsigned int avail = len / sizeof(struct rds_info_counter);
+	size_t i;
+	int cpu;
+
+	if (avail >= nr_names) {
+		/* fold every online cpu's counters into the totals */
+		for_each_online_cpu(cpu) {
+			uint64_t *src = (uint64_t *)&(per_cpu(rds_stats, cpu));
+
+			for (i = 0; i < sizeof(totals) / sizeof(uint64_t); i++)
+				sum[i] += src[i];
+		}
+
+		rds_stats_info_copy(iter, sum, rds_stat_names, nr_names);
+		avail -= nr_names;
+	} else {
+		/* too small for the globals: transports get no room either */
+		avail = 0;
+	}
+
+	lens->each = sizeof(struct rds_info_counter);
+	lens->nr = rds_trans_stats_info_copy(iter, avail) + nr_names;
+}
+
+/* Unhook rds_stats_info() from the RDS_INFO_COUNTERS info request. */
+void rds_stats_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
+}
+
+/*
+ * Register rds_stats_info() as the handler for RDS_INFO_COUNTERS.
+ * Always returns 0.
+ */
+int rds_stats_init(void)
+{
+	rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
+	return 0;
+}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 00000000..25ad0c77
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+/* Handle from register_sysctl_paths(); NULL until rds_sysctl_init() runs. */
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+/*
+ * Outer bounds wired into the reconnect delay entries below (extra1 of the
+ * min entry, extra2 of the max entry).  rds_sysctl_init() overwrites the
+ * minimum with msecs_to_jiffies(1).
+ */
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+/*
+ * Current reconnect delay limits, in jiffies.  Each one also serves as a
+ * bound for the other's sysctl entry.
+ */
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+/* Exposed as max_unacked_packets / max_unacked_bytes (default 16 MiB). */
+unsigned int  rds_sysctl_max_unacked_packets = 8;
+unsigned int  rds_sysctl_max_unacked_bytes = (16 << 20);
+
+/* Exposed as ping_enable; on by default. */
+unsigned int rds_sysctl_ping_enable = 1;
+
+/*
+ * Entries published under the path given by rds_sysctl_path.
+ *
+ * The two reconnect delays are read and written in milliseconds and stored
+ * in jiffies; each is bounded by the other so min can never exceed max.
+ *
+ * .maxlen must match the size of the variable behind .data.  The unacked
+ * limits are unsigned int, so they are sized from the variables
+ * themselves: declaring them as sizeof(unsigned long) makes proc_dointvec
+ * treat each one as a two-element int vector on 64-bit and read or write
+ * the neighbouring variable.
+ */
+static ctl_table rds_sysctl_rds_table[] = {
+	{
+		.procname       = "reconnect_min_delay_ms",
+		.data		= &rds_sysctl_reconnect_min_jiffies,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &rds_sysctl_reconnect_min,
+		.extra2		= &rds_sysctl_reconnect_max_jiffies,
+	},
+	{
+		.procname       = "reconnect_max_delay_ms",
+		.data		= &rds_sysctl_reconnect_max_jiffies,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &rds_sysctl_reconnect_min_jiffies,
+		.extra2		= &rds_sysctl_reconnect_max,
+	},
+	{
+		.procname	= "max_unacked_packets",
+		.data		= &rds_sysctl_max_unacked_packets,
+		.maxlen         = sizeof(rds_sysctl_max_unacked_packets),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "max_unacked_bytes",
+		.data		= &rds_sysctl_max_unacked_bytes,
+		.maxlen         = sizeof(rds_sysctl_max_unacked_bytes),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "ping_enable",
+		.data		= &rds_sysctl_ping_enable,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }
+};
+
+/* Path components ("net", then "rds") the table is registered under. */
+static struct ctl_path rds_sysctl_path[] = {
+	{ .procname = "net", },
+	{ .procname = "rds", },
+	{ }
+};
+
+
+/* Unregister the RDS sysctl table, if it was ever registered. */
+void rds_sysctl_exit(void)
+{
+	if (rds_sysctl_reg_table)
+		unregister_sysctl_table(rds_sysctl_reg_table);
+}
+
+/*
+ * Set the minimum reconnect delay to one millisecond's worth of jiffies,
+ * use it as the initial value of the tunable, and register the RDS sysctl
+ * table.
+ *
+ * Returns 0 on success, -ENOMEM if the table could not be registered.
+ */
+int rds_sysctl_init(void)
+{
+	unsigned long floor = msecs_to_jiffies(1);
+
+	rds_sysctl_reconnect_min = floor;
+	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+	rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path,
+						     rds_sysctl_rds_table);
+
+	return rds_sysctl_reg_table ? 0 : -ENOMEM;
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
new file mode 100644
index 00000000..8e0a3200
--- /dev/null
+++ b/net/rds/tcp.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * only for info exporting: connections that currently have a socket with
+ * RDS callbacks installed, linked through t_list_item.  The count and the
+ * list are both protected by rds_tcp_tc_list_lock.
+ */
+static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
+static LIST_HEAD(rds_tcp_tc_list);
+static unsigned int rds_tcp_tc_count;
+
+/*
+ * Track rds_tcp_connection structs so they can be cleaned up.  Entries are
+ * linked through t_tcp_node from allocation until they are freed.
+ */
+static DEFINE_SPINLOCK(rds_tcp_conn_lock);
+static LIST_HEAD(rds_tcp_conn_list);
+
+/* Slab cache all rds_tcp_connection structs come from. */
+static struct kmem_cache *rds_tcp_conn_slab;
+
+/* Send and receive buffer size applied by rds_tcp_tune(). */
+#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+
+/*
+ * Set TCP_NODELAY on @sock through the socket's own setsockopt op.
+ *
+ * doing it this way avoids calling tcp_sk()
+ *
+ * The option value lives in kernel memory, so the address limit is
+ * switched to KERNEL_DS around the call and restored afterwards.  The
+ * return value of setsockopt is not checked.
+ */
+void rds_tcp_nonagle(struct socket *sock)
+{
+	mm_segment_t oldfs = get_fs();
+	int val = 1;
+
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
+			      sizeof(val));
+	set_fs(oldfs);
+}
+
+/*
+ * Apply the RDS defaults to a TCP socket: TCP_NODELAY, plus send and
+ * receive buffers of RDS_TCP_DEFAULT_BUFSIZE.
+ */
+void rds_tcp_tune(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	rds_tcp_nonagle(sock);
+
+	/*
+	 * We're trying to saturate gigabit with the default,
+	 * see svc_sock_setbufsize().
+	 */
+	lock_sock(sk);
+	sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
+	sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
+	/* mark both sizes as explicitly set (the *_LOCK userlock bits) */
+	sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
+	release_sock(sk);
+}
+
+/* Return snd_nxt of the TCP socket behind @tc; t_sock must be set. */
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
+{
+	return tcp_sk(tc->t_sock->sk)->snd_nxt;
+}
+
+/* Return snd_una of the TCP socket behind @tc; t_sock must be set. */
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
+{
+	return tcp_sk(tc->t_sock->sk)->snd_una;
+}
+
+/*
+ * Undo rds_tcp_set_callbacks(): take @tc off the info list, clear its
+ * socket pointer and put the callbacks saved in @tc back on @sock.
+ */
+void rds_tcp_restore_callbacks(struct socket *sock,
+			       struct rds_tcp_connection *tc)
+{
+	rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
+	write_lock_bh(&sock->sk->sk_callback_lock);
+
+	/* done under the callback_lock to serialize with write_space */
+	spin_lock(&rds_tcp_tc_list_lock);
+	list_del_init(&tc->t_list_item);
+	rds_tcp_tc_count--;
+	spin_unlock(&rds_tcp_tc_list_lock);
+
+	tc->t_sock = NULL;
+
+	sock->sk->sk_write_space = tc->t_orig_write_space;
+	sock->sk->sk_data_ready = tc->t_orig_data_ready;
+	sock->sk->sk_state_change = tc->t_orig_state_change;
+	/* the socket no longer points back at an RDS connection */
+	sock->sk->sk_user_data = NULL;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/*
+ * This is the only path that sets tc->t_sock.  Send and receive trust that
+ * it is set.  The RDS_CONN_CONNECTED bit protects those paths from being
+ * called while it isn't set.
+ *
+ * Binds @sock to @conn: the connection's transport data is put on the info
+ * list, the socket's current callbacks are saved in it, and the RDS TCP
+ * callbacks are installed with sk_user_data pointing at @conn.
+ */
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+
+	rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
+	write_lock_bh(&sock->sk->sk_callback_lock);
+
+	/* done under the callback_lock to serialize with write_space */
+	spin_lock(&rds_tcp_tc_list_lock);
+	list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
+	rds_tcp_tc_count++;
+	spin_unlock(&rds_tcp_tc_list_lock);
+
+	/* accepted sockets need our listen data ready undone */
+	/*
+	 * NOTE(review): this relies on sk_user_data holding the original
+	 * data_ready callback for such sockets - confirm in tcp_listen.c.
+	 */
+	if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
+		sock->sk->sk_data_ready = sock->sk->sk_user_data;
+
+	tc->t_sock = sock;
+	tc->conn = conn;
+	/* remember the callbacks so rds_tcp_restore_callbacks can undo this */
+	tc->t_orig_data_ready = sock->sk->sk_data_ready;
+	tc->t_orig_write_space = sock->sk->sk_write_space;
+	tc->t_orig_state_change = sock->sk->sk_state_change;
+
+	sock->sk->sk_user_data = conn;
+	sock->sk->sk_data_ready = rds_tcp_data_ready;
+	sock->sk->sk_write_space = rds_tcp_write_space;
+	sock->sk->sk_state_change = rds_tcp_state_change;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/*
+ * RDS_INFO_TCP_SOCKETS handler: emit one struct rds_info_tcp_socket for
+ * every connection on rds_tcp_tc_list.
+ *
+ * @sock: the socket the info request arrived on; it is not a TCP socket
+ *        of any RDS connection and is not used for the report
+ * @len:  size of the caller's buffer in bytes
+ * @iter: destination iterator handed to rds_info_copy()
+ * @lens: filled with the number of entries and the size of each
+ *
+ * Nothing is copied when the buffer cannot hold every entry, but @lens is
+ * always filled in so the caller can size a retry.
+ */
+static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens)
+{
+	struct rds_info_tcp_socket tsinfo;
+	struct rds_tcp_connection *tc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+	if (len / sizeof(tsinfo) < rds_tcp_tc_count)
+		goto out;
+
+	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+		struct socket *tsock = tc->t_sock;
+
+		/*
+		 * Report the endpoints of the connection's own TCP socket.
+		 * Asking the requesting socket for its name, as this code
+		 * used to, yields the same wrong answer for every entry.
+		 *
+		 * rds_tcp_set_callbacks() links tc into the list before it
+		 * stores t_sock, so an entry can briefly have no socket;
+		 * report zeroed endpoints for it.
+		 */
+		if (tsock) {
+			struct inet_sock *inet = inet_sk(tsock->sk);
+
+			tsinfo.local_addr = inet->inet_saddr;
+			tsinfo.local_port = inet->inet_sport;
+			tsinfo.peer_addr = inet->inet_daddr;
+			tsinfo.peer_port = inet->inet_dport;
+		} else {
+			tsinfo.local_addr = 0;
+			tsinfo.local_port = 0;
+			tsinfo.peer_addr = 0;
+			tsinfo.peer_port = 0;
+		}
+
+		tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
+		tsinfo.data_rem = tc->t_tinc_data_rem;
+		tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
+		tsinfo.last_expected_una = tc->t_last_expected_una;
+		tsinfo.last_seen_una = tc->t_last_seen_una;
+
+		rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
+	}
+
+out:
+	lens->nr = rds_tcp_tc_count;
+	lens->each = sizeof(tsinfo);
+
+	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
+/*
+ * Transport laddr_check hook: 0 when @addr is a local address in the
+ * initial network namespace, -EADDRNOTAVAIL otherwise.
+ */
+static int rds_tcp_laddr_check(__be32 addr)
+{
+	return inet_addr_type(&init_net, addr) == RTN_LOCAL ?
+		0 : -EADDRNOTAVAIL;
+}
+
+/*
+ * Transport conn_alloc hook: allocate the TCP-specific state for @conn,
+ * attach it as the connection's transport data and add it to the list of
+ * all TCP connections.
+ *
+ * @conn: connection being set up
+ * @gfp:  allocation flags for the slab allocation
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_tcp_connection *tc;
+
+	tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+	if (!tc)
+		return -ENOMEM;
+
+	/*
+	 * Set the back pointer before tc becomes reachable through
+	 * rds_tcp_conn_list.  The slab memory is not zeroed, and
+	 * rds_tcp_destroy_conns() dereferences tc->conn for every entry on
+	 * the list, including connections that never got as far as
+	 * rds_tcp_set_callbacks().
+	 */
+	tc->conn = conn;
+	tc->t_sock = NULL;
+	tc->t_tinc = NULL;
+	/* a fresh connection expects a full header before any data */
+	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+	tc->t_tinc_data_rem = 0;
+
+	conn->c_transport_data = tc;
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+	spin_unlock_irq(&rds_tcp_conn_lock);
+
+	rdsdebug("alloced tc %p\n", conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Transport conn_free hook: @arg is the rds_tcp_connection set up by
+ * rds_tcp_conn_alloc().  It is unlinked from the list of all TCP
+ * connections and returned to the slab cache.
+ */
+static void rds_tcp_conn_free(void *arg)
+{
+	struct rds_tcp_connection *tc = arg;
+	unsigned long flags;
+	rdsdebug("freeing tc %p\n", tc);
+
+	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
+	list_del(&tc->t_tcp_node);
+	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
+
+	kmem_cache_free(rds_tcp_conn_slab, tc);
+}
+
+/*
+ * Destroy every RDS connection that has TCP transport state.  The global
+ * list is emptied onto a private list under the lock, and the connections
+ * (and their passive partners, if any) are destroyed afterwards.
+ */
+static void rds_tcp_destroy_conns(void)
+{
+	struct rds_tcp_connection *cur, *next;
+	LIST_HEAD(doomed);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_splice_init(&rds_tcp_conn_list, &doomed);
+	spin_unlock_irq(&rds_tcp_conn_lock);
+
+	list_for_each_entry_safe(cur, next, &doomed, t_tcp_node) {
+		struct rds_connection *passive = cur->conn->c_passive;
+
+		if (passive)
+			rds_conn_destroy(passive);
+		rds_conn_destroy(cur->conn);
+	}
+}
+
+/*
+ * Module exit, also installed as the transport's .exit hook.  Tears down
+ * in order: the info handler, the listener, all connections, the transport
+ * registration, the receive side and finally the connection slab.
+ */
+static void rds_tcp_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+	rds_tcp_listen_stop();
+	rds_tcp_destroy_conns();
+	rds_trans_unregister(&rds_tcp_transport);
+	rds_tcp_recv_exit();
+	/* every tc was freed by rds_tcp_destroy_conns() before this point */
+	kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
+/*
+ * Method table through which the RDS core drives the TCP transport.  No
+ * xmit_rdma or xmit_atomic method is provided, so rds_sendmsg() rejects
+ * RDMA and atomic operations on TCP connections with -EOPNOTSUPP.
+ */
+struct rds_transport rds_tcp_transport = {
+	.laddr_check		= rds_tcp_laddr_check,
+	.xmit_prepare		= rds_tcp_xmit_prepare,
+	.xmit_complete		= rds_tcp_xmit_complete,
+	.xmit			= rds_tcp_xmit,
+	.recv			= rds_tcp_recv,
+	.conn_alloc		= rds_tcp_conn_alloc,
+	.conn_free		= rds_tcp_conn_free,
+	.conn_connect		= rds_tcp_conn_connect,
+	.conn_shutdown		= rds_tcp_conn_shutdown,
+	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
+	.inc_free		= rds_tcp_inc_free,
+	.stats_info_copy	= rds_tcp_stats_info_copy,
+	.exit			= rds_tcp_exit,
+	.t_owner		= THIS_MODULE,
+	.t_name			= "tcp",
+	.t_type			= RDS_TRANS_TCP,
+	.t_prefer_loopback	= 1,
+};
+
+/*
+ * Module init: create the connection slab, bring up the receive side,
+ * register the transport with the RDS core, start the listener and
+ * finally expose the TCP socket info handler.  A failure at any step
+ * unwinds the steps already completed, in reverse order.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int rds_tcp_init(void)
+{
+	int err;
+
+	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
+					      sizeof(struct rds_tcp_connection),
+					      0, 0, NULL);
+	if (!rds_tcp_conn_slab)
+		return -ENOMEM;
+
+	err = rds_tcp_recv_init();
+	if (err)
+		goto destroy_slab;
+
+	err = rds_trans_register(&rds_tcp_transport);
+	if (err)
+		goto exit_recv;
+
+	err = rds_tcp_listen_init();
+	if (err)
+		goto unregister_trans;
+
+	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+
+	return 0;
+
+unregister_trans:
+	rds_trans_unregister(&rds_tcp_transport);
+exit_recv:
+	rds_tcp_recv_exit();
+destroy_slab:
+	kmem_cache_destroy(rds_tcp_conn_slab);
+	return err;
+}
+module_init(rds_tcp_init);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: TCP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
new file mode 100644
index 00000000..9cf2927d
--- /dev/null
+++ b/net/rds/tcp.h
@@ -0,0 +1,88 @@
+#ifndef _RDS_TCP_H
+#define _RDS_TCP_H
+
+#define RDS_TCP_PORT	16385
+
+/*
+ * One incoming RDS message as received over TCP.  The generic rds_incoming is
+ * embedded so that container_of() can recover this structure from it.
+ */
+struct rds_tcp_incoming {
+	struct rds_incoming	ti_inc;		/* generic incoming message */
+	struct sk_buff_head	ti_skb_list;	/* skb clones holding the payload */
+};
+
+/*
+ * Per-connection state of the TCP transport; reached from the generic
+ * connection through conn->c_transport_data.
+ */
+struct rds_tcp_connection {
+
+	struct list_head	t_tcp_node;	/* NOTE(review): presumably links all tcp conns; users are in tcp.c */
+	struct rds_connection   *conn;		/* owning generic connection */
+	struct socket		*t_sock;	/* TCP socket carrying this connection */
+	/* the socket's callbacks as they were before ours were installed */
+	void			*t_orig_write_space;
+	void			*t_orig_data_ready;
+	void			*t_orig_state_change;
+
+	/* receive reassembly state: message in progress and bytes still missing */
+	struct rds_tcp_incoming	*t_tinc;
+	size_t			t_tinc_hdr_rem;
+	size_t			t_tinc_data_rem;
+
+	/* XXX error report? */
+	struct work_struct	t_conn_w;
+	struct work_struct	t_send_w;
+	struct work_struct	t_down_w;
+	struct work_struct	t_recv_w;
+
+	/* for info exporting only */
+	struct list_head	t_list_item;
+	u32			t_last_sent_nxt;	/* snd_nxt when the last message header was queued */
+	u32			t_last_expected_una;	/* m_ack_seq + 1 of the last message queued */
+	u32			t_last_seen_una;	/* snd_una sampled in the last write_space call */
+};
+
+/*
+ * Per-cpu counters of the TCP transport.  rds_tcp_stats_info_copy() sums this
+ * structure as a flat array of uint64_t, so it must contain nothing but
+ * uint64_t members.
+ */
+struct rds_tcp_statistics {
+	uint64_t	s_tcp_data_ready_calls;
+	uint64_t	s_tcp_write_space_calls;
+	uint64_t	s_tcp_sndbuf_full;
+	uint64_t	s_tcp_connect_raced;
+	uint64_t	s_tcp_listen_closed_stale;
+};
+
+/* tcp.c */
+void rds_tcp_tune(struct socket *sock);
+void rds_tcp_nonagle(struct socket *sock);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_restore_callbacks(struct socket *sock,
+			       struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
+u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
+extern struct rds_transport rds_tcp_transport;
+
+/* tcp_connect.c */
+int rds_tcp_conn_connect(struct rds_connection *conn);
+void rds_tcp_conn_shutdown(struct rds_connection *conn);
+void rds_tcp_state_change(struct sock *sk);
+
+/* tcp_listen.c */
+int rds_tcp_listen_init(void);
+void rds_tcp_listen_stop(void);
+void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
+
+/* tcp_recv.c */
+int rds_tcp_recv_init(void);
+void rds_tcp_recv_exit(void);
+void rds_tcp_data_ready(struct sock *sk, int bytes);
+int rds_tcp_recv(struct rds_connection *conn);
+void rds_tcp_inc_free(struct rds_incoming *inc);
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+			     size_t size);
+
+/* tcp_send.c */
+void rds_tcp_xmit_prepare(struct rds_connection *conn);
+void rds_tcp_xmit_complete(struct rds_connection *conn);
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+	         unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_tcp_write_space(struct sock *sk);
+
+/* tcp_stats.c */
+DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
+#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+				     unsigned int avail);
+
+#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
new file mode 100644
index 00000000..af95c8e0
--- /dev/null
+++ b/net/rds/tcp_connect.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * sk_state_change callback installed on RDS TCP sockets.
+ *
+ * Under sk_callback_lock the owning connection is looked up through
+ * sk_user_data and told about the states it cares about; afterwards the
+ * callback that was in place before ours is invoked.  If sk_user_data has
+ * already been cleared, whatever sk->sk_state_change currently points at is
+ * called instead.
+ */
+void rds_tcp_state_change(struct sock *sk)
+{
+	void (*orig_state_change)(struct sock *sk);
+	struct rds_connection *conn;
+	struct rds_tcp_connection *tc;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	conn = sk->sk_user_data;
+	if (conn) {
+		tc = conn->c_transport_data;
+		orig_state_change = tc->t_orig_state_change;
+
+		rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
+
+		switch (sk->sk_state) {
+		case TCP_ESTABLISHED:
+			rds_connect_complete(conn);
+			break;
+		case TCP_CLOSE:
+			rds_conn_drop(conn);
+			break;
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			/* connecting sockets are ignored while they make progress */
+		default:
+			break;
+		}
+	} else {
+		orig_state_change = sk->sk_state_change;
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+	orig_state_change(sk);
+}
+
+/*
+ * Start an outgoing TCP connection for @conn.
+ *
+ * A new socket is bound to conn->c_laddr with port 0 and a non-blocking
+ * connect to conn->c_faddr, port RDS_TCP_PORT, is started.
+ *
+ * Returns 0 if connect() succeeded or is still in progress (-EINPROGRESS),
+ * otherwise the error from sock_create(), bind() or connect().
+ */
+int rds_tcp_conn_connect(struct rds_connection *conn)
+{
+	struct socket *sock = NULL;
+	struct sockaddr_in src, dest;
+	int ret;
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0)
+		goto out;
+
+	rds_tcp_tune(sock);
+
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+	src.sin_port = (__force u16)htons(0);
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+	if (ret) {
+		rdsdebug("bind failed with %d at address %pI4\n",
+			 ret, &conn->c_laddr);
+		goto out;
+	}
+
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+	dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+	/*
+	 * once we call connect() we can start getting callbacks and they
+	 * own the socket
+	 */
+	rds_tcp_set_callbacks(sock, conn);
+	ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
+				 O_NONBLOCK);
+	/*
+	 * Clearing the local pointer keeps the cleanup below from releasing
+	 * the socket, even when connect() returned an error.
+	 */
+	sock = NULL;
+
+	rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+	if (ret == -EINPROGRESS)
+		ret = 0;
+
+out:
+	/* only reached with a socket if sock_create() worked but bind() failed */
+	if (sock)
+		sock_release(sock);
+	return ret;
+}
+
+/*
+ * Before killing the tcp socket this needs to serialize with callbacks.  The
+ * caller has already grabbed the sending sem so we're serialized with other
+ * senders.
+ *
+ * TCP calls the callbacks with the sock lock so we hold it while we reset the
+ * callbacks to those set by TCP.  Our callbacks won't execute again once we
+ * hold the sock lock.
+ *
+ * After the socket is gone, any partially received message is dropped and
+ * the receive state is reset so the next data is parsed as a fresh header.
+ */
+void rds_tcp_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *sock = tc->t_sock;
+
+	rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
+
+	if (sock) {
+		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
+		lock_sock(sock->sk);
+		rds_tcp_restore_callbacks(sock, tc); /* tc->t_sock = NULL */
+
+		release_sock(sock->sk);
+		sock_release(sock);
+	}
+
+	/* drop our reference on a message that was only partly received */
+	if (tc->t_tinc) {
+		rds_inc_put(&tc->t_tinc->ti_inc);
+		tc->t_tinc = NULL;
+	}
+	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+	tc->t_tinc_data_rem = 0;
+}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
new file mode 100644
index 00000000..8b5cc4aa
--- /dev/null
+++ b/net/rds/tcp_listen.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * cheesy, but simple..
+ *
+ * One listening socket and one work item serve all incoming connections:
+ * rds_tcp_listen_data_ready() queues rds_tcp_listen_work on rds_wq and the
+ * worker accepts from rds_tcp_listen_sock.
+ */
+static void rds_tcp_accept_worker(struct work_struct *work);
+static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
+static struct socket *rds_tcp_listen_sock;
+
+/*
+ * Accept a single pending connection from the listening socket @sock and
+ * attach it to the matching RDS connection.
+ *
+ * Returns 0 when a connection was accepted (including the case where it lost
+ * a race and was dropped again), or a negative error from socket creation,
+ * accept() or rds_conn_create().  The worker keeps calling this until it
+ * returns non-zero.
+ */
+static int rds_tcp_accept_one(struct socket *sock)
+{
+	struct socket *new_sock = NULL;
+	struct rds_connection *conn;
+	int ret;
+	struct inet_sock *inet;
+
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	/* sock_create_lite() leaves these unset; copy them from the listener */
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	rds_tcp_tune(new_sock);
+
+	inet = inet_sk(new_sock->sk);
+
+	rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
+		 &inet->inet_saddr, ntohs(inet->inet_sport),
+		 &inet->inet_daddr, ntohs(inet->inet_dport));
+
+	conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+			       &rds_tcp_transport, GFP_KERNEL);
+	if (IS_ERR(conn)) {
+		ret = PTR_ERR(conn);
+		goto out;
+	}
+
+	/*
+	 * see the comment above rds_queue_delayed_reconnect()
+	 *
+	 * If the connection is not DOWN it is either already up or being
+	 * connected from our side; count which, drop it, and release the
+	 * newly accepted socket below.
+	 */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		if (rds_conn_state(conn) == RDS_CONN_UP)
+			rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+		else
+			rds_tcp_stats_inc(s_tcp_connect_raced);
+		rds_conn_drop(conn);
+		ret = 0;
+		goto out;
+	}
+
+	rds_tcp_set_callbacks(new_sock, conn);
+	rds_connect_complete(conn);
+	/* the socket was handed to the connection; do not release it below */
+	new_sock = NULL;
+	ret = 0;
+
+out:
+	if (new_sock)
+		sock_release(new_sock);
+	return ret;
+}
+
+/*
+ * Work function behind rds_tcp_listen_work: accept connections from the
+ * listening socket until rds_tcp_accept_one() reports a non-zero result,
+ * offering to reschedule between accepts.
+ */
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+	for (;;) {
+		if (rds_tcp_accept_one(rds_tcp_listen_sock) != 0)
+			break;
+		cond_resched();
+	}
+}
+
+/*
+ * sk_data_ready callback of the listening socket.
+ *
+ * For the listening socket sk_user_data holds the data_ready callback that
+ * was installed before ours (see rds_tcp_listen_init()).  Accept work is
+ * queued when the socket is in TCP_LISTEN, then that earlier callback is
+ * invoked.
+ */
+void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+
+	rdsdebug("listen data ready sk %p\n", sk);
+
+	read_lock_bh(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (!ready) { /* check for teardown race */
+		/* rds_tcp_listen_stop() already put the original callback back */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	/*
+	 * ->sk_data_ready is also called for a newly established child socket
+	 * before it has been accepted and the accepter has set up their
+	 * data_ready.. we only want to queue listen work for our listening
+	 * socket
+	 */
+	if (sk->sk_state == TCP_LISTEN)
+		queue_work(rds_wq, &rds_tcp_listen_work);
+
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+	ready(sk, bytes);
+}
+
+/*
+ * Create the listening socket for incoming RDS/TCP connections.
+ *
+ * The socket gets rds_tcp_listen_data_ready() as its data_ready callback
+ * (the previous callback is parked in sk_user_data), is bound to INADDR_ANY
+ * on RDS_TCP_PORT and put into listen state with a backlog of 64.
+ *
+ * Returns 0 on success and publishes the socket in rds_tcp_listen_sock;
+ * otherwise returns the error from sock_create(), bind() or listen() and
+ * releases the socket.
+ */
+int rds_tcp_listen_init(void)
+{
+	struct sockaddr_in sin;
+	struct socket *sock = NULL;
+	int ret;
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0)
+		goto out;
+
+	sock->sk->sk_reuse = 1;
+	rds_tcp_nonagle(sock);
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	/* sin_family takes an address family constant, as in rds_tcp_conn_connect() */
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+	sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (ret < 0)
+		goto out;
+
+	ret = sock->ops->listen(sock, 64);
+	if (ret < 0)
+		goto out;
+
+	rds_tcp_listen_sock = sock;
+	/* ownership moved to rds_tcp_listen_sock; skip the release below */
+	sock = NULL;
+out:
+	if (sock)
+		sock_release(sock);
+	return ret;
+}
+
+/*
+ * Tear down the listening socket set up by rds_tcp_listen_init().  Does
+ * nothing if there is no listening socket.
+ *
+ * The original data_ready callback is restored, rds_wq is flushed so that no
+ * accept work is still running, and the socket is released.
+ */
+void rds_tcp_listen_stop(void)
+{
+	struct socket *sock = rds_tcp_listen_sock;
+	struct sock *sk;
+
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+
+	/* serialize with and prevent further callbacks */
+	lock_sock(sk);
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		/* sk_user_data holds the callback saved by rds_tcp_listen_init() */
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	release_sock(sk);
+
+	/* wait for accepts to stop and close the socket */
+	flush_workqueue(rds_wq);
+	sock_release(sock);
+	rds_tcp_listen_sock = NULL;
+}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 00000000..78205e25
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static struct kmem_cache *rds_tcp_incoming_slab;
+
+/* Free every skb queued on the TCP incoming message that embeds @inc. */
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
+{
+	struct rds_tcp_incoming *tinc =
+		container_of(inc, struct rds_tcp_incoming, ti_inc);
+
+	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
+	skb_queue_purge(&tinc->ti_skb_list);
+}
+
+/*
+ * Transport ->inc_free hook: purge the queued skbs and hand the TCP incoming
+ * message that embeds @inc back to its slab cache.
+ */
+void rds_tcp_inc_free(struct rds_incoming *inc)
+{
+	struct rds_tcp_incoming *tinc =
+		container_of(inc, struct rds_tcp_incoming, ti_inc);
+
+	rds_tcp_inc_purge(inc);
+	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
+	kmem_cache_free(rds_tcp_incoming_slab, tinc);
+}
+
+/*
+ * this is pretty lame, but, whatever.
+ *
+ * Copy up to @size bytes of the message payload, skb by skb, into the iovec
+ * array starting at @first_iov.  The caller's iovecs are not modified; a
+ * local copy of the current one is consumed instead.
+ *
+ * Returns the number of bytes copied, which is less than @size if the
+ * message is shorter, or -EFAULT if a copy fails (bytes copied before the
+ * failure are not reported).
+ */
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+			     size_t size)
+{
+	struct rds_tcp_incoming *tinc;
+	struct iovec *iov, tmp;
+	struct sk_buff *skb;
+	unsigned long to_copy, skb_off;
+	int ret = 0;
+
+	if (size == 0)
+		goto out;
+
+	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+	iov = first_iov;
+	tmp = *iov;
+
+	skb_queue_walk(&tinc->ti_skb_list, skb) {
+		skb_off = 0;
+		while (skb_off < skb->len) {
+			/*
+			 * advance to the next iovec with room left
+			 * NOTE(review): nothing here bounds iov; this relies
+			 * on the caller's iovecs covering @size bytes
+			 */
+			while (tmp.iov_len == 0) {
+				iov++;
+				tmp = *iov;
+			}
+
+			/* limited by the iovec, the request and this skb */
+			to_copy = min(tmp.iov_len, size);
+			to_copy = min(to_copy, skb->len - skb_off);
+
+			rdsdebug("ret %d size %zu skb %p skb_off %lu "
+				 "skblen %d iov_base %p iov_len %zu cpy %lu\n",
+				 ret, size, skb, skb_off, skb->len,
+				 tmp.iov_base, tmp.iov_len, to_copy);
+
+			/* modifies tmp as it copies */
+			if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
+						    to_copy)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			rds_stats_add(s_copy_to_user, to_copy);
+			size -= to_copy;
+			ret += to_copy;
+			skb_off += to_copy;
+			if (size == 0)
+				goto out;
+		}
+	}
+out:
+	return ret;
+}
+
+/*
+ * We have a series of skbs that have fragmented pieces of the congestion
+ * bitmap.  They must add up to the exact size of the congestion bitmap.  We
+ * use the skb helpers to copy those into the pages that make up the in-memory
+ * congestion bitmap for the remote address of this connection.  We then tell
+ * the congestion core that the bitmap has been changed so that it can wake up
+ * sleepers.
+ *
+ * This is racing with sending paths which are using test_bit to see if the
+ * bitmap indicates that their recipient is congested.
+ *
+ * Messages whose header length is not RDS_CONG_MAP_BYTES are silently
+ * ignored.
+ */
+
+static void rds_tcp_cong_recv(struct rds_connection *conn,
+			      struct rds_tcp_incoming *tinc)
+{
+	struct sk_buff *skb;
+	unsigned int to_copy, skb_off;
+	unsigned int map_off;
+	unsigned int map_page;
+	struct rds_cong_map *map;
+	int ret;
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map_page = 0;
+	map_off = 0;
+	map = conn->c_fcong;
+
+	skb_queue_walk(&tinc->ti_skb_list, skb) {
+		skb_off = 0;
+		while (skb_off < skb->len) {
+			/* never copy across a map page boundary in one go */
+			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
+					skb->len - skb_off);
+
+			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
+
+			/* only returns 0 or -error */
+			ret = skb_copy_bits(skb, skb_off,
+				(void *)map->m_page_addrs[map_page] + map_off,
+				to_copy);
+			BUG_ON(ret != 0);
+
+			skb_off += to_copy;
+			map_off += to_copy;
+			if (map_off == PAGE_SIZE) {
+				map_off = 0;
+				map_page++;
+			}
+		}
+	}
+
+	/* an all-ones mask is passed since the whole map was rewritten */
+	rds_cong_map_updated(map, ~(u64) 0);
+}
+
+/*
+ * Context passed to rds_tcp_data_recv() through read_descriptor_t's
+ * arg.data pointer.
+ */
+struct rds_tcp_desc_arg {
+	struct rds_connection *conn;	/* connection the socket belongs to */
+	gfp_t gfp;			/* allocation flags for this calling context */
+	enum km_type km;		/* passed on to rds_recv_incoming() */
+};
+
+/*
+ * tcp_read_sock() actor: consume @len bytes of @skb starting at @offset and
+ * feed them into the connection's message reassembly state.
+ *
+ * Header bytes are copied into the current incoming message; payload bytes
+ * are kept as clones of @skb, pulled and trimmed to the range that belongs
+ * to the message.  When header and payload are complete the message is
+ * handed to the congestion map code or to rds_recv_incoming() and the state
+ * is reset for the next header.
+ *
+ * Returns the number of bytes consumed.  If an allocation, pull or trim
+ * fails, desc->error is set to -ENOMEM and only the bytes consumed so far are
+ * reported; the reassembly state is left so that processing can resume.
+ */
+static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
+			     unsigned int offset, size_t len)
+{
+	struct rds_tcp_desc_arg *arg = desc->arg.data;
+	struct rds_connection *conn = arg->conn;
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct rds_tcp_incoming *tinc = tc->t_tinc;
+	struct sk_buff *clone;
+	size_t left = len, to_copy;
+
+	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
+		 len);
+
+	/*
+	 * tcp_read_sock() interprets partial progress as an indication to stop
+	 * processing.
+	 */
+	while (left) {
+		if (!tinc) {
+			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
+					        arg->gfp);
+			if (!tinc) {
+				desc->error = -ENOMEM;
+				goto out;
+			}
+			tc->t_tinc = tinc;
+			rdsdebug("alloced tinc %p\n", tinc);
+			rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
+			/*
+			 * XXX * we might be able to use the __ variants when
+			 * we've already serialized at a higher level.
+			 */
+			skb_queue_head_init(&tinc->ti_skb_list);
+		}
+
+		if (left && tc->t_tinc_hdr_rem) {
+			to_copy = min(tc->t_tinc_hdr_rem, left);
+			rdsdebug("copying %zu header from skb %p\n", to_copy,
+				 skb);
+			/* append to the part of the header received so far */
+			skb_copy_bits(skb, offset,
+				      (char *)&tinc->ti_inc.i_hdr +
+						sizeof(struct rds_header) -
+						tc->t_tinc_hdr_rem,
+				      to_copy);
+			tc->t_tinc_hdr_rem -= to_copy;
+			left -= to_copy;
+			offset += to_copy;
+
+			if (tc->t_tinc_hdr_rem == 0) {
+				/* could be 0 for a 0 len message */
+				tc->t_tinc_data_rem =
+					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+			}
+		}
+
+		if (left && tc->t_tinc_data_rem) {
+			clone = skb_clone(skb, arg->gfp);
+			if (!clone) {
+				desc->error = -ENOMEM;
+				goto out;
+			}
+
+			to_copy = min(tc->t_tinc_data_rem, left);
+			/*
+			 * pskb_pull() returns NULL and pskb_trim() returns
+			 * non-zero on failure.  A clone that could not be
+			 * cut down to [offset, offset + to_copy) must not be
+			 * queued, so free it and report the failure the same
+			 * way as a failed clone.
+			 */
+			if (!pskb_pull(clone, offset) ||
+			    pskb_trim(clone, to_copy)) {
+				kfree_skb(clone);
+				desc->error = -ENOMEM;
+				goto out;
+			}
+			skb_queue_tail(&tinc->ti_skb_list, clone);
+
+			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
+				 "clone %p data %p len %d\n",
+				 skb, skb->data, skb->len, offset, to_copy,
+				 clone, clone->data, clone->len);
+
+			tc->t_tinc_data_rem -= to_copy;
+			left -= to_copy;
+			offset += to_copy;
+		}
+
+		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
+			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+				rds_tcp_cong_recv(conn, tinc);
+			else
+				rds_recv_incoming(conn, conn->c_faddr,
+						  conn->c_laddr, &tinc->ti_inc,
+						  arg->gfp, arg->km);
+
+			/* message complete: expect a new header next */
+			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+			tc->t_tinc_data_rem = 0;
+			tc->t_tinc = NULL;
+			rds_inc_put(&tinc->ti_inc);
+			tinc = NULL;
+		}
+	}
+out:
+	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
+		 len, left, skb->len,
+		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
+	return len - left;
+}
+
+/*
+ * Run tcp_read_sock() on the connection's socket with rds_tcp_data_recv() as
+ * the actor and return the error the actor left in the descriptor (0 if
+ * none).  The caller has to hold the sock lock.
+ */
+static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp,
+			     enum km_type km)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *sock = tc->t_sock;
+	struct rds_tcp_desc_arg actor_arg = {
+		.conn	= conn,
+		.gfp	= gfp,
+		.km	= km,
+	};
+	read_descriptor_t rd_desc;
+
+	/* the actor finds its context through the descriptor */
+	rd_desc.arg.data = &actor_arg;
+	rd_desc.error = 0;
+	rd_desc.count = 1; /* give more than one skb per call */
+
+	tcp_read_sock(sock->sk, &rd_desc, rds_tcp_data_recv);
+	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
+		 rd_desc.error);
+
+	return rd_desc.error;
+}
+
+/*
+ * Transport ->recv hook.  The sock lock is taken around the read so that
+ * this rds_tcp_recv->tcp_read_sock path is serialized against data_ready.
+ *
+ * Allocation failure here is bad news.. the error is simply returned so the
+ * caller can wait a while and retry once the VM has had a chance to free
+ * something up.
+ */
+int rds_tcp_recv(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *sock = tc->t_sock;
+	struct sock *sk;
+	int err;
+
+	rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
+
+	sk = sock->sk;
+	lock_sock(sk);
+	err = rds_tcp_read_sock(conn, GFP_KERNEL, KM_USER0);
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ * sk_data_ready callback installed on RDS TCP sockets.
+ *
+ * Reads whatever is queued on the socket right away using atomic
+ * allocations.  If that fails with -ENOMEM the connection's receive work is
+ * queued on rds_wq.  The data_ready callback that was in place before ours
+ * is always called at the end.
+ */
+void rds_tcp_data_ready(struct sock *sk, int bytes)
+{
+	void (*ready)(struct sock *sk, int bytes);
+	struct rds_connection *conn;
+	struct rds_tcp_connection *tc;
+
+	rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
+
+	read_lock_bh(&sk->sk_callback_lock);
+	conn = sk->sk_user_data;
+	if (!conn) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	tc = conn->c_transport_data;
+	ready = tc->t_orig_data_ready;
+	rds_tcp_stats_inc(s_tcp_data_ready_calls);
+
+	/* on allocation failure, hand the read over to the receive work */
+	if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
+		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+	ready(sk, bytes);
+}
+
+/*
+ * Create the slab cache for struct rds_tcp_incoming.  Returns 0 on success
+ * or -ENOMEM if the cache could not be created.
+ */
+int rds_tcp_recv_init(void)
+{
+	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
+					sizeof(struct rds_tcp_incoming),
+					0, 0, NULL);
+
+	return rds_tcp_incoming_slab ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by rds_tcp_recv_init(). */
+void rds_tcp_recv_exit(void)
+{
+	kmem_cache_destroy(rds_tcp_incoming_slab);
+}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
new file mode 100644
index 00000000..1b4fd68f
--- /dev/null
+++ b/net/rds/tcp_send.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * Set the TCP_CORK option on @sock to @val.  The address limit is switched
+ * to KERNEL_DS around the call because setsockopt() is handed a pointer to a
+ * kernel variable where it takes a __user pointer.  The result of
+ * setsockopt() is ignored.
+ */
+static void rds_tcp_cork(struct socket *sock, int val)
+{
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
+			      sizeof(val));
+	set_fs(oldfs);
+}
+
+/* Transport ->xmit_prepare hook: cork the connection's TCP socket. */
+void rds_tcp_xmit_prepare(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tcp_conn;
+
+	tcp_conn = conn->c_transport_data;
+	rds_tcp_cork(tcp_conn->t_sock, 1);
+}
+
+/* Transport ->xmit_complete hook: uncork the connection's TCP socket. */
+void rds_tcp_xmit_complete(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tcp_conn;
+
+	tcp_conn = conn->c_transport_data;
+	rds_tcp_cork(tcp_conn->t_sock, 0);
+}
+
+/*
+ * Send @len bytes at @data on @sock without blocking and without raising
+ * SIGPIPE; returns whatever kernel_sendmsg() returns.
+ *
+ * the core send_sem serializes this with other xmit and shutdown
+ */
+static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+{
+	struct msghdr msg = {
+		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
+	};
+	struct kvec vec = {
+		.iov_base = data,
+		.iov_len = len,
+	};
+
+	return kernel_sendmsg(sock, &msg, &vec, 1, len);
+}
+
+/*
+ * Transport ->xmit hook: push as much of message @rm into the TCP socket as
+ * it will take without blocking.
+ *
+ * @hdr_off: bytes of the RDS header that have already been sent
+ * @sg:      index of the first scatterlist entry still to be sent
+ * @off:     bytes of that entry that have already been sent
+ *
+ * Returns the number of bytes queued by this call.  If nothing was queued it
+ * returns 0 when the socket was full (-EAGAIN), otherwise the result of the
+ * last send call; in that second case the connection has also been dropped.
+ *
+ * the core send_sem serializes this with other xmit and shutdown
+ */
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+	         unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	int done = 0;
+	int ret = 0;
+
+	if (hdr_off == 0) {
+		/*
+		 * m_ack_seq is set to the sequence number of the last byte of
+		 * header and data.  see rds_tcp_is_acked().
+		 */
+		tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
+		rm->m_ack_seq = tc->t_last_sent_nxt +
+				sizeof(struct rds_header) +
+				be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
+		/* barrier between the m_ack_seq store and the flag bit */
+		smp_mb__before_clear_bit();
+		set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
+		tc->t_last_expected_una = rm->m_ack_seq + 1;
+
+		rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
+			 rm, rds_tcp_snd_nxt(tc),
+			 (unsigned long long)rm->m_ack_seq);
+	}
+
+	if (hdr_off < sizeof(struct rds_header)) {
+		/* see rds_tcp_write_space() */
+		set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
+
+		ret = rds_tcp_sendmsg(tc->t_sock,
+				      (void *)&rm->m_inc.i_hdr + hdr_off,
+				      sizeof(rm->m_inc.i_hdr) - hdr_off);
+		if (ret < 0)
+			goto out;
+		done += ret;
+		/* header only partly sent: stop before touching the payload */
+		if (hdr_off + done != sizeof(struct rds_header))
+			goto out;
+	}
+
+	while (sg < rm->data.op_nents) {
+		ret = tc->t_sock->ops->sendpage(tc->t_sock,
+						sg_page(&rm->data.op_sg[sg]),
+						rm->data.op_sg[sg].offset + off,
+						rm->data.op_sg[sg].length - off,
+						MSG_DONTWAIT|MSG_NOSIGNAL);
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
+			 ret);
+		if (ret <= 0)
+			break;
+
+		off += ret;
+		done += ret;
+		/* entry finished: continue at the start of the next one */
+		if (off == rm->data.op_sg[sg].length) {
+			off = 0;
+			sg++;
+		}
+	}
+
+out:
+	if (ret <= 0) {
+		/* write_space will hit after EAGAIN, all else fatal */
+		if (ret == -EAGAIN) {
+			rds_tcp_stats_inc(s_tcp_sndbuf_full);
+			ret = 0;
+		} else {
+			printk(KERN_WARNING "RDS/tcp: send to %pI4 "
+			       "returned %d, disconnecting and reconnecting\n",
+			       &conn->c_faddr, ret);
+			rds_conn_drop(conn);
+		}
+	}
+	/* report progress if there was any, otherwise the (adjusted) result */
+	if (done == 0)
+		done = ret;
+	return done;
+}
+
+/*
+ * rm->m_ack_seq holds the tcp sequence number of the last byte of the
+ * message, header included.  The whole message has therefore been received
+ * once that number lies "before" the next unacked byte of the TCP sequence
+ * space.  The comparison is done on the low 32 bits with a signed
+ * difference so that it stays correct across sequence number wrap.
+ *
+ * Returns non-zero if @rm is acked by @ack, 0 if it is not or if it has no
+ * ack sequence assigned yet.
+ */
+static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
+{
+	u32 msg_last_seq;
+	u32 next_unacked;
+
+	if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
+		return 0;
+
+	msg_last_seq = (u32)rm->m_ack_seq;
+	next_unacked = (u32)ack;
+
+	return (__s32)(msg_last_seq - next_unacked) < 0;
+}
+
+/*
+ * sk_write_space callback installed on RDS TCP sockets.
+ *
+ * Samples snd_una, lets the core drop the messages it acknowledges, and
+ * queues the connection's send work once at most half of the send buffer is
+ * in use.  The write_space callback that was in place before ours is then
+ * called and SOCK_NOSPACE is set again.
+ */
+void rds_tcp_write_space(struct sock *sk)
+{
+	void (*write_space)(struct sock *sk);
+	struct rds_connection *conn;
+	struct rds_tcp_connection *tc;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	conn = sk->sk_user_data;
+	if (!conn) {
+		/* no connection attached any more: just chain to the socket's callback */
+		write_space = sk->sk_write_space;
+		goto out;
+	}
+
+	tc = conn->c_transport_data;
+	rdsdebug("write_space for tc %p\n", tc);
+	write_space = tc->t_orig_write_space;
+	rds_tcp_stats_inc(s_tcp_write_space_calls);
+
+	rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
+	tc->t_last_seen_una = rds_tcp_snd_una(tc);
+	rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
+
+	/* restart sending once twice the queued bytes fit into sk_sndbuf */
+        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+
+	/*
+	 * write_space is only called when data leaves tcp's send queue if
+	 * SOCK_NOSPACE is set.  We set SOCK_NOSPACE every time we put
+	 * data in tcp's send queue because we use write_space to parse the
+	 * sequence numbers and notice that rds messages have been fully
+	 * received.
+	 *
+	 * tcp's write_space clears SOCK_NOSPACE if the send queue has more
+	 * than a certain amount of space. So we need to set it again *after*
+	 * we call tcp's write_space or else we might only get called on the
+	 * first of a series of incoming tcp acks.
+	 */
+	write_space(sk);
+
+	if (sk->sk_socket)
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
new file mode 100644
index 00000000..d5898d03
--- /dev/null
+++ b/net/rds/tcp_stats.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* Per-cpu counters, bumped through rds_tcp_stats_inc(). */
+DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
+	____cacheline_aligned;
+
+/*
+ * Names reported for the counters.  rds_tcp_stats_info_copy() passes this
+ * array together with the summed struct rds_tcp_statistics viewed as a flat
+ * uint64_t array, so the entries have to stay in the same order as the
+ * members of that structure.
+ *
+ * Both the strings and the pointer array itself are const.
+ */
+static const char * const rds_tcp_stat_names[] = {
+	"tcp_data_ready_calls",
+	"tcp_write_space_calls",
+	"tcp_sndbuf_full",
+	"tcp_connect_raced",
+	"tcp_listen_closed_stale",
+};
+
+/*
+ * Transport ->stats_info_copy hook.
+ *
+ * If @avail leaves room for all TCP counters, the per-cpu counters of every
+ * online cpu are summed and copied out through @iter together with their
+ * names.  The number of TCP counters is returned whether or not anything
+ * was copied.
+ */
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+				     unsigned int avail)
+{
+	struct rds_tcp_statistics stats = {0, };
+	uint64_t *totals = (uint64_t *)&stats;
+	int cpu;
+
+	if (avail >= ARRAY_SIZE(rds_tcp_stat_names)) {
+		for_each_online_cpu(cpu) {
+			uint64_t *percpu_vals;
+			size_t idx;
+
+			/* treat the counter struct as a flat array of u64s */
+			percpu_vals = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
+			for (idx = 0; idx < sizeof(stats) / sizeof(uint64_t); idx++)
+				totals[idx] += percpu_vals[idx];
+		}
+
+		rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
+				    ARRAY_SIZE(rds_tcp_stat_names));
+	}
+
+	return ARRAY_SIZE(rds_tcp_stat_names);
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 00000000..0fd90f8c
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+
+#include "rds.h"
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!) so we have
+ * a thread around to block allocating if the receive fast path sees an
+ * allocation failure.
+ */
+
+/* Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ *  ANY		  -> ERROR
+ *  UP		  -> DISCONNECTING
+ *  ERROR	  -> DISCONNECTING
+ *  DISCONNECTING -> DOWN
+ *  DOWN	  -> CONNECTING
+ *  CONNECTING	  -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ *  -	Inside the shutdown worker; synchronizes with xmit path
+ *	through RDS_IN_XMIT, and with connection management callbacks
+ *	via c_cm_lock.
+ *
+ *	For receive callbacks, we rely on the underlying transport
+ *	(TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct workqueue_struct *rds_wq;
+EXPORT_SYMBOL_GPL(rds_wq);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+	if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+		printk(KERN_WARNING "%s: Cannot transition to state UP, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+		atomic_set(&conn->c_state, RDS_CONN_ERROR);
+		queue_work(rds_wq, &conn->c_down_w);
+		return;
+	}
+
+	rdsdebug("conn %p for %pI4 to %pI4 complete\n",
+	  conn, &conn->c_laddr, &conn->c_faddr);
+
+	conn->c_reconnect_jiffies = 0;
+	set_bit(0, &conn->c_map_queued);
+	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_connect_complete);
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again.  Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects.  This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+void rds_queue_reconnect(struct rds_connection *conn)
+{
+	unsigned long rand;
+
+	rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
+	  conn, &conn->c_laddr, &conn->c_faddr,
+	  conn->c_reconnect_jiffies);
+
+	set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+	if (conn->c_reconnect_jiffies == 0) {
+		conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+		return;
+	}
+
+	get_random_bytes(&rand, sizeof(rand));
+	rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+		 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+		 conn, &conn->c_laddr, &conn->c_faddr);
+	queue_delayed_work(rds_wq, &conn->c_conn_w,
+			   rand % conn->c_reconnect_jiffies);
+
+	conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+					rds_sysctl_reconnect_max_jiffies);
+}
+
+void rds_connect_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+	int ret;
+
+	clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+	if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		ret = conn->c_trans->conn_connect(conn);
+		rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
+			conn, &conn->c_laddr, &conn->c_faddr, ret);
+
+		if (ret) {
+			if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
+				rds_queue_reconnect(conn);
+			else
+				rds_conn_error(conn, "RDS: connect failed\n");
+		}
+	}
+}
+
+void rds_send_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+	int ret;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		ret = rds_send_xmit(conn);
+		rdsdebug("conn %p ret %d\n", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rds_stats_inc(s_send_immediate_retry);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+			break;
+		case -ENOMEM:
+			rds_stats_inc(s_send_delayed_retry);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+		default:
+			break;
+		}
+	}
+}
+
+void rds_recv_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+	int ret;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		ret = conn->c_trans->recv(conn);
+		rdsdebug("conn %p ret %d\n", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rds_stats_inc(s_recv_immediate_retry);
+			queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+			break;
+		case -ENOMEM:
+			rds_stats_inc(s_recv_delayed_retry);
+			queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+		default:
+			break;
+		}
+	}
+}
+
+void rds_shutdown_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+	rds_conn_shutdown(conn);
+}
+
+void rds_threads_exit(void)
+{
+	destroy_workqueue(rds_wq);
+}
+
+int rds_threads_init(void)
+{
+	rds_wq = create_singlethread_workqueue("krdsd");
+	if (!rds_wq)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 00000000..7f2ac4fe
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static struct rds_transport *transports[RDS_TRANS_COUNT];
+static DECLARE_RWSEM(rds_trans_sem);
+
+int rds_trans_register(struct rds_transport *trans)
+{
+	BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
+
+	down_write(&rds_trans_sem);
+
+	if (transports[trans->t_type])
+		printk(KERN_ERR "RDS Transport type %d already registered\n",
+			trans->t_type);
+	else {
+		transports[trans->t_type] = trans;
+		printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+	}
+
+	up_write(&rds_trans_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rds_trans_register);
+
+void rds_trans_unregister(struct rds_transport *trans)
+{
+	down_write(&rds_trans_sem);
+
+	transports[trans->t_type] = NULL;
+	printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
+
+	up_write(&rds_trans_sem);
+}
+EXPORT_SYMBOL_GPL(rds_trans_unregister);
+
+void rds_trans_put(struct rds_transport *trans)
+{
+	if (trans && trans->t_owner)
+		module_put(trans->t_owner);
+}
+
+struct rds_transport *rds_trans_get_preferred(__be32 addr)
+{
+	struct rds_transport *ret = NULL;
+	struct rds_transport *trans;
+	unsigned int i;
+
+	if (IN_LOOPBACK(ntohl(addr)))
+		return &rds_loop_transport;
+
+	down_read(&rds_trans_sem);
+	for (i = 0; i < RDS_TRANS_COUNT; i++) {
+		trans = transports[i];
+
+		if (trans && (trans->laddr_check(addr) == 0) &&
+		    (!trans->t_owner || try_module_get(trans->t_owner))) {
+			ret = trans;
+			break;
+		}
+	}
+	up_read(&rds_trans_sem);
+
+	return ret;
+}
+
+/*
+ * This returns the number of stats entries in the snapshot and only
+ * copies them using the iter if there is enough space for them.  The
+ * caller passes in the global stats so that we can size and copy while
+ * holding the lock.
+ */
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+				       unsigned int avail)
+
+{
+	struct rds_transport *trans;
+	unsigned int total = 0;
+	unsigned int part;
+	int i;
+
+	rds_info_iter_unmap(iter);
+	down_read(&rds_trans_sem);
+
+	for (i = 0; i < RDS_TRANS_COUNT; i++)
+	{
+		trans = transports[i];
+		if (!trans || !trans->stats_info_copy)
+			continue;
+
+		part = trans->stats_info_copy(iter, avail);
+		avail -= min(avail, part);
+		total += part;
+	}
+
+	up_read(&rds_trans_sem);
+
+	return total;
+}
+
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644
index 00000000..e6b5190d
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,80 @@
+#ifndef _LINUX_XLIST_H
+#define _LINUX_XLIST_H
+
+#include <linux/stddef.h>
+#include <linux/poison.h>
+#include <linux/prefetch.h>
+#include <asm/system.h>
+
+struct xlist_head {
+	struct xlist_head *next;
+};
+
+static inline void INIT_XLIST_HEAD(struct xlist_head *list)
+{
+	list->next = NULL;
+}
+
+static inline int xlist_empty(struct xlist_head *head)
+{
+	return head->next == NULL;
+}
+
+static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
+			     struct xlist_head *head)
+{
+	struct xlist_head *cur;
+	struct xlist_head *check;
+
+	while (1) {
+		cur = head->next;
+		tail->next = cur;
+		check = cmpxchg(&head->next, cur, new);
+		if (check == cur)
+			break;
+	}
+}
+
+static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
+{
+	struct xlist_head *cur;
+	struct xlist_head *check;
+	struct xlist_head *next;
+
+	while (1) {
+		cur = head->next;
+		if (!cur)
+			goto out;
+
+		next = cur->next;
+		check = cmpxchg(&head->next, cur, next);
+		if (check == cur)
+			goto out;
+	}
+out:
+	return cur;
+}
+
+static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
+{
+	struct xlist_head *cur;
+
+	cur = head->next;
+	if (!cur)
+		return NULL;
+
+	head->next = cur->next;
+	return cur;
+}
+
+static inline void xlist_splice(struct xlist_head *list,
+				struct xlist_head *head)
+{
+	struct xlist_head *cur;
+
+	WARN_ON(head->next);
+	cur = xchg(&list->next, NULL);
+	head->next = cur;
+}
+
+#endif
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
new file mode 100644
index 00000000..8e12c8a2
--- /dev/null
+++ b/net/rfkill/Kconfig
@@ -0,0 +1,49 @@
+#
+# RF switch subsystem configuration
+#
+menuconfig RFKILL
+	tristate "RF switch subsystem support"
+	help
+	  Say Y here if you want to have control over RF switches
+	  found on many WiFi and Bluetooth cards.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called rfkill.
+
+config RFKILL_PM
+	bool "Power off on suspend"
+	depends on RFKILL && PM
+	default y
+
+# LED trigger support
+config RFKILL_LEDS
+	bool
+	depends on RFKILL
+	depends on LEDS_TRIGGERS = y || RFKILL = LEDS_TRIGGERS
+	default y
+
+config RFKILL_INPUT
+	bool "RF switch input support" if EXPERT
+	depends on RFKILL
+	depends on INPUT = y || RFKILL = INPUT
+	default y if !EXPERT
+
+config RFKILL_REGULATOR
+	tristate "Generic rfkill regulator driver"
+	depends on RFKILL || !RFKILL
+	depends on REGULATOR
+	help
+          This option enables controlling radio transmitters connected to
+          a voltage regulator using the regulator framework.
+
+          To compile this driver as a module, choose M here: the module will
+          be called rfkill-regulator.
+
+config RFKILL_GPIO
+	tristate "GPIO RFKILL driver"
+	depends on RFKILL && GPIOLIB && HAVE_CLK
+	default n
+	help
+	  If you say yes here you get support of a generic gpio RFKILL
+	  driver. The platform should fill in the appropriate fields in the
+	  rfkill_gpio_platform_data structure and pass that to the driver.
diff --git a/net/rfkill/Makefile b/net/rfkill/Makefile
new file mode 100644
index 00000000..31176878
--- /dev/null
+++ b/net/rfkill/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the RF switch subsystem.
+#
+
+rfkill-y			+= core.o
+rfkill-$(CONFIG_RFKILL_INPUT)	+= input.o
+obj-$(CONFIG_RFKILL)		+= rfkill.o
+obj-$(CONFIG_RFKILL_REGULATOR)	+= rfkill-regulator.o
+obj-$(CONFIG_RFKILL_GPIO)	+= rfkill-gpio.o
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
new file mode 100644
index 00000000..df2dae6b
--- /dev/null
+++ b/net/rfkill/core.c
@@ -0,0 +1,1283 @@
+/*
+ * Copyright (C) 2006 - 2007 Ivo van Doorn
+ * Copyright (C) 2007 Dmitry Torokhov
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/workqueue.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rfkill.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/miscdevice.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include "rfkill.h"
+
+#define POLL_INTERVAL		(5 * HZ)
+
+#define RFKILL_BLOCK_HW		BIT(0)
+#define RFKILL_BLOCK_SW		BIT(1)
+#define RFKILL_BLOCK_SW_PREV	BIT(2)
+#define RFKILL_BLOCK_ANY	(RFKILL_BLOCK_HW |\
+				 RFKILL_BLOCK_SW |\
+				 RFKILL_BLOCK_SW_PREV)
+#define RFKILL_BLOCK_SW_SETCALL	BIT(31)
+
+struct rfkill {
+	spinlock_t		lock;
+
+	const char		*name;
+	enum rfkill_type	type;
+
+	unsigned long		state;
+
+	u32			idx;
+
+	bool			registered;
+	bool			persistent;
+
+	const struct rfkill_ops	*ops;
+	void			*data;
+
+#ifdef CONFIG_RFKILL_LEDS
+	struct led_trigger	led_trigger;
+	const char		*ledtrigname;
+#endif
+
+	struct device		dev;
+	struct list_head	node;
+
+	struct delayed_work	poll_work;
+	struct work_struct	uevent_work;
+	struct work_struct	sync_work;
+};
+#define to_rfkill(d)	container_of(d, struct rfkill, dev)
+
+struct rfkill_int_event {
+	struct list_head	list;
+	struct rfkill_event	ev;
+};
+
+struct rfkill_data {
+	struct list_head	list;
+	struct list_head	events;
+	struct mutex		mtx;
+	wait_queue_head_t	read_wait;
+	bool			input_handler;
+};
+
+
+MODULE_AUTHOR("Ivo van Doorn <IvDoorn@gmail.com>");
+MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
+MODULE_DESCRIPTION("RF switch support");
+MODULE_LICENSE("GPL");
+
+
+/*
+ * The locking here should be made much smarter, we currently have
+ * a bit of a stupid situation because drivers might want to register
+ * the rfkill struct under their own lock, and take this lock during
+ * rfkill method calls -- which will cause an AB-BA deadlock situation.
+ *
+ * To fix that, we need to rework this code here to be mostly lock-free
+ * and only use the mutex for list manipulations, not to protect the
+ * various other global variables. Then we can avoid holding the mutex
+ * around driver operations, and all is happy.
+ */
+static LIST_HEAD(rfkill_list);	/* list of registered rf switches */
+static DEFINE_MUTEX(rfkill_global_mutex);
+static LIST_HEAD(rfkill_fds);	/* list of open fds of /dev/rfkill */
+
+static unsigned int rfkill_default_state = 1;
+module_param_named(default_state, rfkill_default_state, uint, 0444);
+MODULE_PARM_DESC(default_state,
+		 "Default initial state for all radio types, 0 = radio off");
+
+static struct {
+	bool cur, sav;
+} rfkill_global_states[NUM_RFKILL_TYPES];
+
+static bool rfkill_epo_lock_active;
+
+
+#ifdef CONFIG_RFKILL_LEDS
+static void rfkill_led_trigger_event(struct rfkill *rfkill)
+{
+	struct led_trigger *trigger;
+
+	if (!rfkill->registered)
+		return;
+
+	trigger = &rfkill->led_trigger;
+
+	if (rfkill->state & RFKILL_BLOCK_ANY)
+		led_trigger_event(trigger, LED_OFF);
+	else
+		led_trigger_event(trigger, LED_FULL);
+}
+
+static void rfkill_led_trigger_activate(struct led_classdev *led)
+{
+	struct rfkill *rfkill;
+
+	rfkill = container_of(led->trigger, struct rfkill, led_trigger);
+
+	rfkill_led_trigger_event(rfkill);
+}
+
+static int rfkill_led_trigger_register(struct rfkill *rfkill)
+{
+	rfkill->led_trigger.name = rfkill->ledtrigname
+					? : dev_name(&rfkill->dev);
+	rfkill->led_trigger.activate = rfkill_led_trigger_activate;
+	return led_trigger_register(&rfkill->led_trigger);
+}
+
+static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
+{
+	led_trigger_unregister(&rfkill->led_trigger);
+}
+#else
+static void rfkill_led_trigger_event(struct rfkill *rfkill)
+{
+}
+
+static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
+{
+	return 0;
+}
+
+static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
+{
+}
+#endif /* CONFIG_RFKILL_LEDS */
+
+static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
+			      enum rfkill_operation op)
+{
+	unsigned long flags;
+
+	ev->idx = rfkill->idx;
+	ev->type = rfkill->type;
+	ev->op = op;
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	ev->hard = !!(rfkill->state & RFKILL_BLOCK_HW);
+	ev->soft = !!(rfkill->state & (RFKILL_BLOCK_SW |
+					RFKILL_BLOCK_SW_PREV));
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+}
+
+static void rfkill_send_events(struct rfkill *rfkill, enum rfkill_operation op)
+{
+	struct rfkill_data *data;
+	struct rfkill_int_event *ev;
+
+	list_for_each_entry(data, &rfkill_fds, list) {
+		ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+		if (!ev)
+			continue;
+		rfkill_fill_event(&ev->ev, rfkill, op);
+		mutex_lock(&data->mtx);
+		list_add_tail(&ev->list, &data->events);
+		mutex_unlock(&data->mtx);
+		wake_up_interruptible(&data->read_wait);
+	}
+}
+
+static void rfkill_event(struct rfkill *rfkill)
+{
+	if (!rfkill->registered)
+		return;
+
+	kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE);
+
+	/* also send event to /dev/rfkill */
+	rfkill_send_events(rfkill, RFKILL_OP_CHANGE);
+}
+
+static bool __rfkill_set_hw_state(struct rfkill *rfkill,
+				  bool blocked, bool *change)
+{
+	unsigned long flags;
+	bool prev, any;
+
+	BUG_ON(!rfkill);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	prev = !!(rfkill->state & RFKILL_BLOCK_HW);
+	if (blocked)
+		rfkill->state |= RFKILL_BLOCK_HW;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_HW;
+	*change = prev != blocked;
+	any = rfkill->state & RFKILL_BLOCK_ANY;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	rfkill_led_trigger_event(rfkill);
+
+	return any;
+}
+
+/**
+ * rfkill_set_block - wrapper for set_block method
+ *
+ * @rfkill: the rfkill struct to use
+ * @blocked: the new software state
+ *
+ * Calls the set_block method (when applicable) and handles notifications
+ * etc. as well.
+ */
+static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
+{
+	unsigned long flags;
+	int err;
+
+	if (unlikely(rfkill->dev.power.power_state.event & PM_EVENT_SLEEP))
+		return;
+
+	/*
+	 * Some platforms (...!) generate input events which affect the
+	 * _hard_ kill state -- whenever something tries to change the
+	 * current software state query the hardware state too.
+	 */
+	if (rfkill->ops->query)
+		rfkill->ops->query(rfkill, rfkill->data);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	if (rfkill->state & RFKILL_BLOCK_SW)
+		rfkill->state |= RFKILL_BLOCK_SW_PREV;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
+
+	if (blocked)
+		rfkill->state |= RFKILL_BLOCK_SW;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_SW;
+
+	rfkill->state |= RFKILL_BLOCK_SW_SETCALL;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	err = rfkill->ops->set_block(rfkill->data, blocked);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	if (err) {
+		/*
+		 * Failed -- reset status to _prev, this may be different
+		 * from what we set _PREV to earlier in this function
+		 * if rfkill_set_sw_state was invoked.
+		 */
+		if (rfkill->state & RFKILL_BLOCK_SW_PREV)
+			rfkill->state |= RFKILL_BLOCK_SW;
+		else
+			rfkill->state &= ~RFKILL_BLOCK_SW;
+	}
+	rfkill->state &= ~RFKILL_BLOCK_SW_SETCALL;
+	rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	rfkill_led_trigger_event(rfkill);
+	rfkill_event(rfkill);
+}
+
+#ifdef CONFIG_RFKILL_INPUT
+static atomic_t rfkill_input_disabled = ATOMIC_INIT(0);
+
+/**
+ * __rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @blocked: the new software state (true to block)
+ *
+ * This function sets the state of all switches of given type,
+ * unless a specific switch is claimed by userspace (in which case,
+ * that switch is left alone) or suspended.
+ *
+ * Caller must have acquired rfkill_global_mutex.
+ */
+static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
+{
+	struct rfkill *rfkill;
+
+	rfkill_global_states[type].cur = blocked;
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (rfkill->type != type)
+			continue;
+
+		rfkill_set_block(rfkill, blocked);
+	}
+}
+
+/**
+ * rfkill_switch_all - Toggle state of all switches of given type
+ * @type: type of interfaces to be affected
+ * @blocked: the new software state (true to block)
+ *
+ * Acquires rfkill_global_mutex and calls __rfkill_switch_all(@type, @blocked).
+ * Please refer to __rfkill_switch_all() for details.
+ *
+ * Does nothing if the EPO lock is active.
+ */
+void rfkill_switch_all(enum rfkill_type type, bool blocked)
+{
+	if (atomic_read(&rfkill_input_disabled))
+		return;
+
+	mutex_lock(&rfkill_global_mutex);
+
+	if (!rfkill_epo_lock_active)
+		__rfkill_switch_all(type, blocked);
+
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_epo - emergency power off all transmitters
+ *
+ * This kicks all non-suspended rfkill devices to RFKILL_STATE_SOFT_BLOCKED,
+ * ignoring everything in its path but rfkill_global_mutex and rfkill->lock.
+ *
+ * The global state before the EPO is saved and can be restored later
+ * using rfkill_restore_states().
+ */
+void rfkill_epo(void)
+{
+	struct rfkill *rfkill;
+	int i;
+
+	if (atomic_read(&rfkill_input_disabled))
+		return;
+
+	mutex_lock(&rfkill_global_mutex);
+
+	rfkill_epo_lock_active = true;
+	list_for_each_entry(rfkill, &rfkill_list, node)
+		rfkill_set_block(rfkill, true);
+
+	for (i = 0; i < NUM_RFKILL_TYPES; i++) {
+		rfkill_global_states[i].sav = rfkill_global_states[i].cur;
+		rfkill_global_states[i].cur = true;
+	}
+
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_restore_states - restore global states
+ *
+ * Restore (and sync switches to) the global state from the
+ * states saved in rfkill_global_states[].sav.  This can undo the
+ * effects of a call to rfkill_epo().
+ */
+void rfkill_restore_states(void)
+{
+	int i;
+
+	if (atomic_read(&rfkill_input_disabled))
+		return;
+
+	mutex_lock(&rfkill_global_mutex);
+
+	rfkill_epo_lock_active = false;
+	for (i = 0; i < NUM_RFKILL_TYPES; i++)
+		__rfkill_switch_all(i, rfkill_global_states[i].sav);
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_remove_epo_lock - unlock state changes
+ *
+ * Used by rfkill-input to manually unlock state changes, when
+ * the EPO switch is deactivated.
+ */
+void rfkill_remove_epo_lock(void)
+{
+	if (atomic_read(&rfkill_input_disabled))
+		return;
+
+	mutex_lock(&rfkill_global_mutex);
+	rfkill_epo_lock_active = false;
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+/**
+ * rfkill_is_epo_lock_active - returns true if EPO is active
+ *
+ * Returns 0 (false) if there is NOT an active EPO condition,
+ * and 1 (true) if there is an active EPO condition, which
+ * locks all radios in one of the BLOCKED states.
+ *
+ * Can be called in atomic context.
+ */
+bool rfkill_is_epo_lock_active(void)
+{
+	return rfkill_epo_lock_active;
+}
+
+/**
+ * rfkill_get_global_sw_state - returns global state for a type
+ * @type: the type to get the global state of
+ *
+ * Returns the current global state for a given wireless
+ * device type.
+ */
+bool rfkill_get_global_sw_state(const enum rfkill_type type)
+{
+	return rfkill_global_states[type].cur;
+}
+#endif
+
+
+bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
+{
+	bool ret, change;
+
+	ret = __rfkill_set_hw_state(rfkill, blocked, &change);
+
+	if (!rfkill->registered)
+		return ret;
+
+	if (change)
+		schedule_work(&rfkill->uevent_work);
+
+	return ret;
+}
+EXPORT_SYMBOL(rfkill_set_hw_state);
+
+static void __rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
+{
+	u32 bit = RFKILL_BLOCK_SW;
+
+	/* if in a ops->set_block right now, use other bit */
+	if (rfkill->state & RFKILL_BLOCK_SW_SETCALL)
+		bit = RFKILL_BLOCK_SW_PREV;
+
+	if (blocked)
+		rfkill->state |= bit;
+	else
+		rfkill->state &= ~bit;
+}
+
+bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
+{
+	unsigned long flags;
+	bool prev, hwblock;
+
+	BUG_ON(!rfkill);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	prev = !!(rfkill->state & RFKILL_BLOCK_SW);
+	__rfkill_set_sw_state(rfkill, blocked);
+	hwblock = !!(rfkill->state & RFKILL_BLOCK_HW);
+	blocked = blocked || hwblock;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	if (!rfkill->registered)
+		return blocked;
+
+	if (prev != blocked && !hwblock)
+		schedule_work(&rfkill->uevent_work);
+
+	rfkill_led_trigger_event(rfkill);
+
+	return blocked;
+}
+EXPORT_SYMBOL(rfkill_set_sw_state);
+
+void rfkill_init_sw_state(struct rfkill *rfkill, bool blocked)
+{
+	unsigned long flags;
+
+	BUG_ON(!rfkill);
+	BUG_ON(rfkill->registered);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	__rfkill_set_sw_state(rfkill, blocked);
+	rfkill->persistent = true;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+}
+EXPORT_SYMBOL(rfkill_init_sw_state);
+
+void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
+{
+	unsigned long flags;
+	bool swprev, hwprev;
+
+	BUG_ON(!rfkill);
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+
+	/*
+	 * No need to care about prev/setblock ... this is for uevent only
+	 * and that will get triggered by rfkill_set_block anyway.
+	 */
+	swprev = !!(rfkill->state & RFKILL_BLOCK_SW);
+	hwprev = !!(rfkill->state & RFKILL_BLOCK_HW);
+	__rfkill_set_sw_state(rfkill, sw);
+	if (hw)
+		rfkill->state |= RFKILL_BLOCK_HW;
+	else
+		rfkill->state &= ~RFKILL_BLOCK_HW;
+
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	if (!rfkill->registered) {
+		rfkill->persistent = true;
+	} else {
+		if (swprev != sw || hwprev != hw)
+			schedule_work(&rfkill->uevent_work);
+
+		rfkill_led_trigger_event(rfkill);
+	}
+}
+EXPORT_SYMBOL(rfkill_set_states);
+
+static ssize_t rfkill_name_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%s\n", rfkill->name);
+}
+
+static const char *rfkill_get_type_str(enum rfkill_type type)
+{
+	BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_FM + 1);
+
+	switch (type) {
+	case RFKILL_TYPE_WLAN:
+		return "wlan";
+	case RFKILL_TYPE_BLUETOOTH:
+		return "bluetooth";
+	case RFKILL_TYPE_UWB:
+		return "ultrawideband";
+	case RFKILL_TYPE_WIMAX:
+		return "wimax";
+	case RFKILL_TYPE_WWAN:
+		return "wwan";
+	case RFKILL_TYPE_GPS:
+		return "gps";
+	case RFKILL_TYPE_FM:
+		return "fm";
+	default:
+		BUG();
+	}
+}
+
+static ssize_t rfkill_type_show(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type));
+}
+
+static ssize_t rfkill_idx_show(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%u\n", rfkill->idx);	/* idx is u32 */
+}
+
+static ssize_t rfkill_persistent_show(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%d\n", rfkill->persistent);
+}
+
+static ssize_t rfkill_hard_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_HW) ? 1 : 0 );
+}
+
+static ssize_t rfkill_soft_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%d\n", (rfkill->state & RFKILL_BLOCK_SW) ? 1 : 0 );
+}
+
+static ssize_t rfkill_soft_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+	unsigned long state;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 0, &state);
+	if (err)
+		return err;
+
+	if (state > 1 )
+		return -EINVAL;
+
+	mutex_lock(&rfkill_global_mutex);
+	rfkill_set_block(rfkill, state);
+	mutex_unlock(&rfkill_global_mutex);
+
+	return err ?: count;
+}
+
+static u8 user_state_from_blocked(unsigned long state)
+{
+	if (state & RFKILL_BLOCK_HW)
+		return RFKILL_USER_STATE_HARD_BLOCKED;
+	if (state & RFKILL_BLOCK_SW)
+		return RFKILL_USER_STATE_SOFT_BLOCKED;
+
+	return RFKILL_USER_STATE_UNBLOCKED;
+}
+
+static ssize_t rfkill_state_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	return sprintf(buf, "%d\n", user_state_from_blocked(rfkill->state));
+}
+
+static ssize_t rfkill_state_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+	unsigned long state;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	err = strict_strtoul(buf, 0, &state);
+	if (err)
+		return err;
+
+	if (state != RFKILL_USER_STATE_SOFT_BLOCKED &&
+	    state != RFKILL_USER_STATE_UNBLOCKED)
+		return -EINVAL;
+
+	mutex_lock(&rfkill_global_mutex);
+	rfkill_set_block(rfkill, state == RFKILL_USER_STATE_SOFT_BLOCKED);
+	mutex_unlock(&rfkill_global_mutex);
+
+	return err ?: count;
+}
+
+static ssize_t rfkill_claim_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	return sprintf(buf, "%d\n", 0);
+}
+
+static ssize_t rfkill_claim_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	return -EOPNOTSUPP;
+}
+
+static struct device_attribute rfkill_dev_attrs[] = {
+	__ATTR(name, S_IRUGO, rfkill_name_show, NULL),
+	__ATTR(type, S_IRUGO, rfkill_type_show, NULL),
+	__ATTR(index, S_IRUGO, rfkill_idx_show, NULL),
+	__ATTR(persistent, S_IRUGO, rfkill_persistent_show, NULL),
+	__ATTR(state, S_IRUGO|S_IWUSR, rfkill_state_show, rfkill_state_store),
+	__ATTR(claim, S_IRUGO|S_IWUSR, rfkill_claim_show, rfkill_claim_store),
+	__ATTR(soft, S_IRUGO|S_IWUSR, rfkill_soft_show, rfkill_soft_store),
+	__ATTR(hard, S_IRUGO, rfkill_hard_show, NULL),
+	__ATTR_NULL
+};
+
+static void rfkill_release(struct device *dev)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	kfree(rfkill);
+}
+
+static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+	unsigned long flags;
+	u32 state;
+	int error;
+
+	error = add_uevent_var(env, "RFKILL_NAME=%s", rfkill->name);
+	if (error)
+		return error;
+	error = add_uevent_var(env, "RFKILL_TYPE=%s",
+			       rfkill_get_type_str(rfkill->type));
+	if (error)
+		return error;
+	spin_lock_irqsave(&rfkill->lock, flags);
+	state = rfkill->state;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+	error = add_uevent_var(env, "RFKILL_STATE=%d",
+			       user_state_from_blocked(state));
+	return error;
+}
+
+void rfkill_pause_polling(struct rfkill *rfkill)
+{
+	BUG_ON(!rfkill);
+
+	if (!rfkill->ops->poll)
+		return;
+
+	cancel_delayed_work_sync(&rfkill->poll_work);
+}
+EXPORT_SYMBOL(rfkill_pause_polling);
+
+#ifdef CONFIG_RFKILL_PM
+void rfkill_resume_polling(struct rfkill *rfkill)
+{
+	BUG_ON(!rfkill);
+
+	if (!rfkill->ops->poll)
+		return;
+
+	schedule_work(&rfkill->poll_work.work);
+}
+EXPORT_SYMBOL(rfkill_resume_polling);
+
+static int rfkill_suspend(struct device *dev, pm_message_t state)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+
+	rfkill_pause_polling(rfkill);
+
+	return 0;
+}
+
+static int rfkill_resume(struct device *dev)
+{
+	struct rfkill *rfkill = to_rfkill(dev);
+	bool cur;
+
+	if (!rfkill->persistent) {
+		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
+		rfkill_set_block(rfkill, cur);
+	}
+
+	rfkill_resume_polling(rfkill);
+
+	return 0;
+}
+#endif
+
+static struct class rfkill_class = {
+	.name		= "rfkill",
+	.dev_release	= rfkill_release,
+	.dev_attrs	= rfkill_dev_attrs,
+	.dev_uevent	= rfkill_dev_uevent,
+#ifdef CONFIG_RFKILL_PM
+	.suspend	= rfkill_suspend,
+	.resume		= rfkill_resume,
+#endif
+};
+
+bool rfkill_blocked(struct rfkill *rfkill)
+{
+	unsigned long flags;
+	u32 state;
+
+	spin_lock_irqsave(&rfkill->lock, flags);
+	state = rfkill->state;
+	spin_unlock_irqrestore(&rfkill->lock, flags);
+
+	return !!(state & RFKILL_BLOCK_ANY);
+}
+EXPORT_SYMBOL(rfkill_blocked);
+
+
+struct rfkill * __must_check rfkill_alloc(const char *name,
+					  struct device *parent,
+					  const enum rfkill_type type,
+					  const struct rfkill_ops *ops,
+					  void *ops_data)
+{
+	struct rfkill *rfkill;
+	struct device *dev;
+
+	if (WARN_ON(!ops))
+		return NULL;
+
+	if (WARN_ON(!ops->set_block))
+		return NULL;
+
+	if (WARN_ON(!name))
+		return NULL;
+
+	if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES))
+		return NULL;
+
+	rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL);
+	if (!rfkill)
+		return NULL;
+
+	spin_lock_init(&rfkill->lock);
+	INIT_LIST_HEAD(&rfkill->node);
+	rfkill->type = type;
+	rfkill->name = name;
+	rfkill->ops = ops;
+	rfkill->data = ops_data;
+
+	dev = &rfkill->dev;
+	dev->class = &rfkill_class;
+	dev->parent = parent;
+	device_initialize(dev);
+
+	return rfkill;
+}
+EXPORT_SYMBOL(rfkill_alloc);
+
+static void rfkill_poll(struct work_struct *work)
+{
+	struct rfkill *rfkill;
+
+	rfkill = container_of(work, struct rfkill, poll_work.work);
+
+	/*
+	 * Poll hardware state -- driver will use one of the
+	 * rfkill_set{,_hw,_sw}_state functions and use its
+	 * return value to update the current status.
+	 */
+	rfkill->ops->poll(rfkill, rfkill->data);
+
+	schedule_delayed_work(&rfkill->poll_work,
+		round_jiffies_relative(POLL_INTERVAL));
+}
+
+static void rfkill_uevent_work(struct work_struct *work)
+{
+	struct rfkill *rfkill;
+
+	rfkill = container_of(work, struct rfkill, uevent_work);
+
+	mutex_lock(&rfkill_global_mutex);
+	rfkill_event(rfkill);
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+static void rfkill_sync_work(struct work_struct *work)
+{
+	struct rfkill *rfkill;
+	bool cur;
+
+	rfkill = container_of(work, struct rfkill, sync_work);
+
+	mutex_lock(&rfkill_global_mutex);
+	cur = rfkill_global_states[rfkill->type].cur;
+	rfkill_set_block(rfkill, cur);
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+int __must_check rfkill_register(struct rfkill *rfkill)
+{
+	static unsigned long rfkill_no;
+	struct device *dev = &rfkill->dev;
+	int error;
+
+	BUG_ON(!rfkill);
+
+	mutex_lock(&rfkill_global_mutex);
+
+	if (rfkill->registered) {
+		error = -EALREADY;
+		goto unlock;
+	}
+
+	rfkill->idx = rfkill_no;
+	dev_set_name(dev, "rfkill%lu", rfkill_no);
+	rfkill_no++;
+
+	list_add_tail(&rfkill->node, &rfkill_list);
+
+	error = device_add(dev);
+	if (error)
+		goto remove;
+
+	error = rfkill_led_trigger_register(rfkill);
+	if (error)
+		goto devdel;
+
+	rfkill->registered = true;
+
+	INIT_DELAYED_WORK(&rfkill->poll_work, rfkill_poll);
+	INIT_WORK(&rfkill->uevent_work, rfkill_uevent_work);
+	INIT_WORK(&rfkill->sync_work, rfkill_sync_work);
+
+	if (rfkill->ops->poll)
+		schedule_delayed_work(&rfkill->poll_work,
+			round_jiffies_relative(POLL_INTERVAL));
+
+	if (!rfkill->persistent || rfkill_epo_lock_active) {
+		schedule_work(&rfkill->sync_work);
+	} else {
+#ifdef CONFIG_RFKILL_INPUT
+		bool soft_blocked = !!(rfkill->state & RFKILL_BLOCK_SW);
+
+		if (!atomic_read(&rfkill_input_disabled))
+			__rfkill_switch_all(rfkill->type, soft_blocked);
+#endif
+	}
+
+	rfkill_send_events(rfkill, RFKILL_OP_ADD);
+
+	mutex_unlock(&rfkill_global_mutex);
+	return 0;
+
+ devdel:
+	device_del(&rfkill->dev);
+ remove:
+	list_del_init(&rfkill->node);
+ unlock:
+	mutex_unlock(&rfkill_global_mutex);
+	return error;
+}
+EXPORT_SYMBOL(rfkill_register);
+
+void rfkill_unregister(struct rfkill *rfkill)
+{
+	BUG_ON(!rfkill);
+
+	if (rfkill->ops->poll)
+		cancel_delayed_work_sync(&rfkill->poll_work);
+
+	cancel_work_sync(&rfkill->uevent_work);
+	cancel_work_sync(&rfkill->sync_work);
+
+	rfkill->registered = false;
+
+	device_del(&rfkill->dev);
+
+	mutex_lock(&rfkill_global_mutex);
+	rfkill_send_events(rfkill, RFKILL_OP_DEL);
+	list_del_init(&rfkill->node);
+	mutex_unlock(&rfkill_global_mutex);
+
+	rfkill_led_trigger_unregister(rfkill);
+}
+EXPORT_SYMBOL(rfkill_unregister);
+
+void rfkill_destroy(struct rfkill *rfkill)
+{
+	if (rfkill)
+		put_device(&rfkill->dev);
+}
+EXPORT_SYMBOL(rfkill_destroy);
+
+static int rfkill_fop_open(struct inode *inode, struct file *file)
+{
+	struct rfkill_data *data;
+	struct rfkill *rfkill;
+	struct rfkill_int_event *ev, *tmp;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&data->events);
+	mutex_init(&data->mtx);
+	init_waitqueue_head(&data->read_wait);
+
+	mutex_lock(&rfkill_global_mutex);
+	mutex_lock(&data->mtx);
+	/*
+	 * start getting events from elsewhere but hold mtx to get
+	 * startup events added first
+	 */
+
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+		if (!ev)
+			goto free;
+		rfkill_fill_event(&ev->ev, rfkill, RFKILL_OP_ADD);
+		list_add_tail(&ev->list, &data->events);
+	}
+	list_add(&data->list, &rfkill_fds);
+	mutex_unlock(&data->mtx);
+	mutex_unlock(&rfkill_global_mutex);
+
+	file->private_data = data;
+
+	return nonseekable_open(inode, file);
+
+ free:
+	mutex_unlock(&data->mtx);
+	mutex_unlock(&rfkill_global_mutex);
+	mutex_destroy(&data->mtx);
+	list_for_each_entry_safe(ev, tmp, &data->events, list)
+		kfree(ev);
+	kfree(data);
+	return -ENOMEM;
+}
+
+static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait)
+{
+	struct rfkill_data *data = file->private_data;
+	unsigned int res = POLLOUT | POLLWRNORM; /* write() never waits */
+
+	poll_wait(file, &data->read_wait, wait);
+
+	mutex_lock(&data->mtx);
+	if (!list_empty(&data->events))
+		res |= POLLIN | POLLRDNORM;	/* readable as well, not instead */
+	mutex_unlock(&data->mtx);
+
+	return res;
+}
+
+static bool rfkill_readable(struct rfkill_data *data)
+{
+	bool r;
+
+	mutex_lock(&data->mtx);
+	r = !list_empty(&data->events);
+	mutex_unlock(&data->mtx);
+
+	return r;
+}
+
+static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
+			       size_t count, loff_t *pos)
+{
+	struct rfkill_data *data = file->private_data;
+	struct rfkill_int_event *ev;
+	unsigned long sz;
+	int ret;
+
+	mutex_lock(&data->mtx);
+
+	while (list_empty(&data->events)) {
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			goto out;
+		}
+		mutex_unlock(&data->mtx);
+		ret = wait_event_interruptible(data->read_wait,
+					       rfkill_readable(data));
+		mutex_lock(&data->mtx);
+
+		if (ret)
+			goto out;
+	}
+
+	ev = list_first_entry(&data->events, struct rfkill_int_event,
+				list);
+
+	sz = min_t(unsigned long, sizeof(ev->ev), count);
+	ret = sz;
+	if (copy_to_user(buf, &ev->ev, sz))
+		ret = -EFAULT;
+
+	list_del(&ev->list);
+	kfree(ev);
+ out:
+	mutex_unlock(&data->mtx);
+	return ret;
+}
+
+static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *pos)
+{
+	struct rfkill *rfkill;
+	struct rfkill_event ev;
+
+	/* we don't need the 'hard' variable but accept it */
+	if (count < RFKILL_EVENT_SIZE_V1 - 1)
+		return -EINVAL;
+
+	/*
+	 * Copy as much data as we can accept into our 'ev' buffer,
+	 * but tell userspace how much we've copied so it can determine
+	 * our API version even in a write() call, if it cares.
+	 */
+	count = min(count, sizeof(ev));
+	if (copy_from_user(&ev, buf, count))
+		return -EFAULT;
+
+	if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL)
+		return -EINVAL;
+
+	if (ev.type >= NUM_RFKILL_TYPES)
+		return -EINVAL;
+
+	mutex_lock(&rfkill_global_mutex);
+
+	if (ev.op == RFKILL_OP_CHANGE_ALL) {
+		if (ev.type == RFKILL_TYPE_ALL) {
+			enum rfkill_type i;
+			for (i = 0; i < NUM_RFKILL_TYPES; i++)
+				rfkill_global_states[i].cur = ev.soft;
+		} else {
+			rfkill_global_states[ev.type].cur = ev.soft;
+		}
+	}
+
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+		if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL)
+			continue;
+
+		if (rfkill->type != ev.type && ev.type != RFKILL_TYPE_ALL)
+			continue;
+
+		rfkill_set_block(rfkill, ev.soft);
+	}
+	mutex_unlock(&rfkill_global_mutex);
+
+	return count;
+}
+
+static int rfkill_fop_release(struct inode *inode, struct file *file)
+{
+	struct rfkill_data *data = file->private_data;
+	struct rfkill_int_event *ev, *tmp;
+
+	mutex_lock(&rfkill_global_mutex);
+	list_del(&data->list);
+	mutex_unlock(&rfkill_global_mutex);
+
+	mutex_destroy(&data->mtx);
+	list_for_each_entry_safe(ev, tmp, &data->events, list)
+		kfree(ev);
+
+#ifdef CONFIG_RFKILL_INPUT
+	if (data->input_handler)
+		if (atomic_dec_return(&rfkill_input_disabled) == 0)
+			printk(KERN_DEBUG "rfkill: input handler enabled\n");
+#endif
+
+	kfree(data);
+
+	return 0;
+}
+
+#ifdef CONFIG_RFKILL_INPUT
+static long rfkill_fop_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct rfkill_data *data = file->private_data;
+
+	if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC)
+		return -ENOSYS;
+
+	if (_IOC_NR(cmd) != RFKILL_IOC_NOINPUT)
+		return -ENOSYS;
+
+	mutex_lock(&data->mtx);
+
+	if (!data->input_handler) {
+		if (atomic_inc_return(&rfkill_input_disabled) == 1)
+			printk(KERN_DEBUG "rfkill: input handler disabled\n");
+		data->input_handler = true;
+	}
+
+	mutex_unlock(&data->mtx);
+
+	return 0;
+}
+#endif
+
+static const struct file_operations rfkill_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rfkill_fop_open,
+	.read		= rfkill_fop_read,
+	.write		= rfkill_fop_write,
+	.poll		= rfkill_fop_poll,
+	.release	= rfkill_fop_release,
+#ifdef CONFIG_RFKILL_INPUT
+	.unlocked_ioctl	= rfkill_fop_ioctl,
+	.compat_ioctl	= rfkill_fop_ioctl,
+#endif
+	.llseek		= no_llseek,
+};
+
+static struct miscdevice rfkill_miscdev = {
+	.name	= "rfkill",
+	.fops	= &rfkill_fops,
+	.minor	= MISC_DYNAMIC_MINOR,
+};
+
+static int __init rfkill_init(void)
+{
+	int error;
+	int i;
+
+	for (i = 0; i < NUM_RFKILL_TYPES; i++)
+		rfkill_global_states[i].cur = !rfkill_default_state;
+
+	error = class_register(&rfkill_class);
+	if (error)
+		goto out;
+
+	error = misc_register(&rfkill_miscdev);
+	if (error) {
+		class_unregister(&rfkill_class);
+		goto out;
+	}
+
+#ifdef CONFIG_RFKILL_INPUT
+	error = rfkill_handler_init();
+	if (error) {
+		misc_deregister(&rfkill_miscdev);
+		class_unregister(&rfkill_class);
+		goto out;
+	}
+#endif
+
+ out:
+	return error;
+}
+subsys_initcall(rfkill_init);
+
+static void __exit rfkill_exit(void)
+{
+#ifdef CONFIG_RFKILL_INPUT
+	rfkill_handler_exit();
+#endif
+	misc_deregister(&rfkill_miscdev);
+	class_unregister(&rfkill_class);
+}
+module_exit(rfkill_exit);
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
new file mode 100644
index 00000000..1bca6d49
--- /dev/null
+++ b/net/rfkill/input.c
@@ -0,0 +1,350 @@
+/*
+ * Input layer to RF Kill interface connector
+ *
+ * Copyright (c) 2007 Dmitry Torokhov
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * If you ever run into a situation in which you have a SW_ type rfkill
+ * input device, then you can revive code that was removed in the patch
+ * "rfkill-input: remove unused code".
+ */
+
+#include <linux/input.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/init.h>
+#include <linux/rfkill.h>
+#include <linux/sched.h>
+
+#include "rfkill.h"
+
+enum rfkill_input_master_mode {
+	RFKILL_INPUT_MASTER_UNLOCK = 0,
+	RFKILL_INPUT_MASTER_RESTORE = 1,
+	RFKILL_INPUT_MASTER_UNBLOCKALL = 2,
+	NUM_RFKILL_INPUT_MASTER_MODES
+};
+
+/* Delay (in ms) between consecutive switch ops */
+#define RFKILL_OPS_DELAY 200
+
+static enum rfkill_input_master_mode rfkill_master_switch_mode =
+					RFKILL_INPUT_MASTER_UNBLOCKALL;
+module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0);
+MODULE_PARM_DESC(master_switch_mode,
+	"SW_RFKILL_ALL ON should: 0=do nothing (only unlock); 1=restore; 2=unblock all");
+
+static spinlock_t rfkill_op_lock;
+static bool rfkill_op_pending;
+static unsigned long rfkill_sw_pending[BITS_TO_LONGS(NUM_RFKILL_TYPES)];
+static unsigned long rfkill_sw_state[BITS_TO_LONGS(NUM_RFKILL_TYPES)];
+
+enum rfkill_sched_op {
+	RFKILL_GLOBAL_OP_EPO = 0,
+	RFKILL_GLOBAL_OP_RESTORE,
+	RFKILL_GLOBAL_OP_UNLOCK,
+	RFKILL_GLOBAL_OP_UNBLOCK,
+};
+
+static enum rfkill_sched_op rfkill_master_switch_op;
+static enum rfkill_sched_op rfkill_op;
+
+static void __rfkill_handle_global_op(enum rfkill_sched_op op)
+{
+	unsigned int i;
+
+	switch (op) {
+	case RFKILL_GLOBAL_OP_EPO:
+		rfkill_epo();
+		break;
+	case RFKILL_GLOBAL_OP_RESTORE:
+		rfkill_restore_states();
+		break;
+	case RFKILL_GLOBAL_OP_UNLOCK:
+		rfkill_remove_epo_lock();
+		break;
+	case RFKILL_GLOBAL_OP_UNBLOCK:
+		rfkill_remove_epo_lock();
+		for (i = 0; i < NUM_RFKILL_TYPES; i++)
+			rfkill_switch_all(i, false);
+		break;
+	default:
+		/* memory corruption or bug, fail safely */
+		rfkill_epo();
+		WARN(1, "Unknown requested operation %d! "
+			"rfkill Emergency Power Off activated\n",
+			op);
+	}
+}
+
+static void __rfkill_handle_normal_op(const enum rfkill_type type,
+				      const bool complement)
+{
+	bool blocked;
+
+	blocked = rfkill_get_global_sw_state(type);
+	if (complement)
+		blocked = !blocked;
+
+	rfkill_switch_all(type, blocked);
+}
+
+static void rfkill_op_handler(struct work_struct *work)
+{
+	unsigned int i;
+	bool c;
+
+	spin_lock_irq(&rfkill_op_lock);
+	do {
+		if (rfkill_op_pending) {
+			enum rfkill_sched_op op = rfkill_op;
+			rfkill_op_pending = false;
+			memset(rfkill_sw_pending, 0,
+				sizeof(rfkill_sw_pending));
+			spin_unlock_irq(&rfkill_op_lock);
+
+			__rfkill_handle_global_op(op);
+
+			spin_lock_irq(&rfkill_op_lock);
+
+			/*
+			 * handle global ops first -- during unlocked period
+			 * we might have gotten a new global op.
+			 */
+			if (rfkill_op_pending)
+				continue;
+		}
+
+		if (rfkill_is_epo_lock_active())
+			continue;
+
+		for (i = 0; i < NUM_RFKILL_TYPES; i++) {
+			if (__test_and_clear_bit(i, rfkill_sw_pending)) {
+				c = __test_and_clear_bit(i, rfkill_sw_state);
+				spin_unlock_irq(&rfkill_op_lock);
+
+				__rfkill_handle_normal_op(i, c);
+
+				spin_lock_irq(&rfkill_op_lock);
+			}
+		}
+	} while (rfkill_op_pending);
+	spin_unlock_irq(&rfkill_op_lock);
+}
+
+static DECLARE_DELAYED_WORK(rfkill_op_work, rfkill_op_handler);
+static unsigned long rfkill_last_scheduled;
+
+static unsigned long rfkill_ratelimit(const unsigned long last)
+{
+	const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY);
+	return time_after(jiffies, last + delay) ? 0 : delay;
+}
+
+static void rfkill_schedule_ratelimited(void)
+{
+	if (delayed_work_pending(&rfkill_op_work))
+		return;
+	schedule_delayed_work(&rfkill_op_work,
+			      rfkill_ratelimit(rfkill_last_scheduled));
+	rfkill_last_scheduled = jiffies;
+}
+
+static void rfkill_schedule_global_op(enum rfkill_sched_op op)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rfkill_op_lock, flags);
+	rfkill_op = op;
+	rfkill_op_pending = true;
+	if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) {
+		/* bypass the limiter for EPO */
+		cancel_delayed_work(&rfkill_op_work);
+		schedule_delayed_work(&rfkill_op_work, 0);
+		rfkill_last_scheduled = jiffies;
+	} else
+		rfkill_schedule_ratelimited();
+	spin_unlock_irqrestore(&rfkill_op_lock, flags);
+}
+
+static void rfkill_schedule_toggle(enum rfkill_type type)
+{
+	unsigned long flags;
+
+	if (rfkill_is_epo_lock_active())
+		return;
+
+	spin_lock_irqsave(&rfkill_op_lock, flags);
+	if (!rfkill_op_pending) {
+		__set_bit(type, rfkill_sw_pending);
+		__change_bit(type, rfkill_sw_state);
+		rfkill_schedule_ratelimited();
+	}
+	spin_unlock_irqrestore(&rfkill_op_lock, flags);
+}
+
+static void rfkill_schedule_evsw_rfkillall(int state)
+{
+	if (state)
+		rfkill_schedule_global_op(rfkill_master_switch_op);
+	else
+		rfkill_schedule_global_op(RFKILL_GLOBAL_OP_EPO);
+}
+
+static void rfkill_event(struct input_handle *handle, unsigned int type,
+			unsigned int code, int data)
+{
+	if (type == EV_KEY && data == 1) {
+		switch (code) {
+		case KEY_WLAN:
+			rfkill_schedule_toggle(RFKILL_TYPE_WLAN);
+			break;
+		case KEY_BLUETOOTH:
+			rfkill_schedule_toggle(RFKILL_TYPE_BLUETOOTH);
+			break;
+		case KEY_UWB:
+			rfkill_schedule_toggle(RFKILL_TYPE_UWB);
+			break;
+		case KEY_WIMAX:
+			rfkill_schedule_toggle(RFKILL_TYPE_WIMAX);
+			break;
+		case KEY_RFKILL:
+			rfkill_schedule_toggle(RFKILL_TYPE_ALL);
+			break;
+		}
+	} else if (type == EV_SW && code == SW_RFKILL_ALL)
+		rfkill_schedule_evsw_rfkillall(data);
+}
+
+static int rfkill_connect(struct input_handler *handler, struct input_dev *dev,
+			  const struct input_device_id *id)
+{
+	struct input_handle *handle;
+	int error;
+
+	handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	handle->dev = dev;
+	handle->handler = handler;
+	handle->name = "rfkill";
+
+	/* causes rfkill_start() to be called */
+	error = input_register_handle(handle);
+	if (error)
+		goto err_free_handle;
+
+	error = input_open_device(handle);
+	if (error)
+		goto err_unregister_handle;
+
+	return 0;
+
+ err_unregister_handle:
+	input_unregister_handle(handle);
+ err_free_handle:
+	kfree(handle);
+	return error;
+}
+
+static void rfkill_start(struct input_handle *handle)
+{
+	/*
+	 * Take event_lock to guard against configuration changes, we
+	 * should be able to deal with concurrency with rfkill_event()
+	 * just fine (which event_lock will also avoid).
+	 */
+	spin_lock_irq(&handle->dev->event_lock);
+
+	if (test_bit(EV_SW, handle->dev->evbit) &&
+	    test_bit(SW_RFKILL_ALL, handle->dev->swbit))
+		rfkill_schedule_evsw_rfkillall(test_bit(SW_RFKILL_ALL,
+							handle->dev->sw));
+
+	spin_unlock_irq(&handle->dev->event_lock);
+}
+
+static void rfkill_disconnect(struct input_handle *handle)
+{
+	input_close_device(handle);
+	input_unregister_handle(handle);
+	kfree(handle);
+}
+
+static const struct input_device_id rfkill_ids[] = {
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_WLAN)] = BIT_MASK(KEY_WLAN) },
+	},
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_BLUETOOTH)] = BIT_MASK(KEY_BLUETOOTH) },
+	},
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_UWB)] = BIT_MASK(KEY_UWB) },
+	},
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_WIMAX)] = BIT_MASK(KEY_WIMAX) },
+	},
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.evbit = { BIT_MASK(EV_KEY) },
+		.keybit = { [BIT_WORD(KEY_RFKILL)] = BIT_MASK(KEY_RFKILL) },
+	},
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_SWBIT,
+		.evbit = { BIT(EV_SW) },
+		.swbit = { [BIT_WORD(SW_RFKILL_ALL)] = BIT_MASK(SW_RFKILL_ALL) },
+	},
+	{ }
+};
+
+static struct input_handler rfkill_handler = {
+	.name =	"rfkill",
+	.event = rfkill_event,
+	.connect = rfkill_connect,
+	.start = rfkill_start,
+	.disconnect = rfkill_disconnect,
+	.id_table = rfkill_ids,
+};
+
+int __init rfkill_handler_init(void)
+{
+	switch (rfkill_master_switch_mode) {
+	case RFKILL_INPUT_MASTER_UNBLOCKALL:
+		rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNBLOCK;
+		break;
+	case RFKILL_INPUT_MASTER_RESTORE:
+		rfkill_master_switch_op = RFKILL_GLOBAL_OP_RESTORE;
+		break;
+	case RFKILL_INPUT_MASTER_UNLOCK:
+		rfkill_master_switch_op = RFKILL_GLOBAL_OP_UNLOCK;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	spin_lock_init(&rfkill_op_lock);
+
+	/* Avoid delay at first schedule */
+	rfkill_last_scheduled =
+			jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1;
+	return input_register_handler(&rfkill_handler);
+}
+
+void __exit rfkill_handler_exit(void)
+{
+	input_unregister_handler(&rfkill_handler);
+	cancel_delayed_work_sync(&rfkill_op_work);
+}
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
new file mode 100644
index 00000000..256c5ddd
--- /dev/null
+++ b/net/rfkill/rfkill-gpio.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2011, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rfkill.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/slab.h>
+
+#include <linux/rfkill-gpio.h>
+
+enum rfkill_gpio_clk_state {
+	UNSPECIFIED = 0,
+	PWR_ENABLED,
+	PWR_DISABLED
+};
+
+#define PWR_CLK_SET(_RF, _EN) \
+	((_RF)->pwr_clk_enabled = (!(_EN) ? PWR_ENABLED : PWR_DISABLED))
+#define PWR_CLK_ENABLED(_RF) ((_RF)->pwr_clk_enabled == PWR_ENABLED)
+#define PWR_CLK_DISABLED(_RF) ((_RF)->pwr_clk_enabled != PWR_ENABLED)
+
+struct rfkill_gpio_data {
+	struct rfkill_gpio_platform_data	*pdata;
+	struct rfkill				*rfkill_dev;
+	char					*reset_name;
+	char					*shutdown_name;
+	enum rfkill_gpio_clk_state		pwr_clk_enabled;
+	struct clk				*pwr_clk;
+};
+
+static int rfkill_gpio_set_power(void *data, bool blocked)
+{
+	struct rfkill_gpio_data *rfkill = data;
+
+	if (blocked) {	/* block: GPIOs driven to 0 first, then clock off */
+		if (gpio_is_valid(rfkill->pdata->shutdown_gpio))
+			gpio_direction_output(rfkill->pdata->shutdown_gpio, 0);
+		if (gpio_is_valid(rfkill->pdata->reset_gpio))
+			gpio_direction_output(rfkill->pdata->reset_gpio, 0);
+		if (rfkill->pwr_clk && PWR_CLK_ENABLED(rfkill))
+			clk_disable(rfkill->pwr_clk);
+	} else {	/* unblock: clock on first, then GPIOs driven to 1 */
+		if (rfkill->pwr_clk && PWR_CLK_DISABLED(rfkill))
+			clk_enable(rfkill->pwr_clk);
+		if (gpio_is_valid(rfkill->pdata->reset_gpio))
+			gpio_direction_output(rfkill->pdata->reset_gpio, 1);
+		if (gpio_is_valid(rfkill->pdata->shutdown_gpio))
+			gpio_direction_output(rfkill->pdata->shutdown_gpio, 1);
+	}
+
+	if (rfkill->pwr_clk)
+		PWR_CLK_SET(rfkill, blocked);	/* blocked => PWR_DISABLED */
+
+	return 0;
+}
+
+static const struct rfkill_ops rfkill_gpio_ops = {
+	.set_block = rfkill_gpio_set_power,
+};
+
+/* Probe: claim the GPIOs and optional clock named in the platform data and
+ * register one rfkill switch.  Returns 0 or a negative errno. */
+static int rfkill_gpio_probe(struct platform_device *pdev)
+{
+	struct rfkill_gpio_data *rfkill;
+	struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data;
+	int ret = 0;
+
+	if (!pdata) {
+		pr_warn("%s: No platform data specified\n", __func__);
+		return -EINVAL;
+	}
+
+	/* make sure at least one of the GPIOs is defined and that
+	 * a name is specified for this instance */
+	if (!pdata->name || (!gpio_is_valid(pdata->reset_gpio) &&
+		!gpio_is_valid(pdata->shutdown_gpio))) {
+		pr_warn("%s: invalid platform data\n", __func__);
+		return -EINVAL;
+	}
+
+	rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL);
+	if (!rfkill)
+		return -ENOMEM;
+
+	rfkill->pdata = pdata;
+
+	rfkill->reset_name = kasprintf(GFP_KERNEL, "%s_reset", pdata->name);
+	if (!rfkill->reset_name) {
+		ret = -ENOMEM;
+		goto fail_alloc;
+	}
+
+	rfkill->shutdown_name = kasprintf(GFP_KERNEL, "%s_shutdown",
+					  pdata->name);
+	if (!rfkill->shutdown_name) {
+		ret = -ENOMEM;
+		goto fail_reset_name;
+	}
+
+	if (pdata->power_clk_name) {
+		rfkill->pwr_clk = clk_get(&pdev->dev, pdata->power_clk_name);
+		if (IS_ERR(rfkill->pwr_clk)) {
+			pr_warn("%s: can't find pwr_clk.\n", __func__);
+			ret = PTR_ERR(rfkill->pwr_clk);
+			goto fail_shutdown_name;
+		}
+	}
+
+	if (gpio_is_valid(pdata->reset_gpio)) {
+		ret = gpio_request(pdata->reset_gpio, rfkill->reset_name);
+		if (ret) {
+			pr_warn("%s: failed to get reset gpio.\n", __func__);
+			goto fail_clock;
+		}
+	}
+
+	if (gpio_is_valid(pdata->shutdown_gpio)) {
+		ret = gpio_request(pdata->shutdown_gpio, rfkill->shutdown_name);
+		if (ret) {
+			pr_warn("%s: failed to get shutdown gpio.\n", __func__);
+			goto fail_reset;
+		}
+	}
+
+	rfkill->rfkill_dev = rfkill_alloc(pdata->name, &pdev->dev, pdata->type,
+				&rfkill_gpio_ops, rfkill);
+	if (!rfkill->rfkill_dev) {
+		ret = -ENOMEM;
+		goto fail_shutdown;
+	}
+
+	ret = rfkill_register(rfkill->rfkill_dev);
+	if (ret < 0)
+		goto fail_rfkill;
+
+	platform_set_drvdata(pdev, rfkill);
+	dev_info(&pdev->dev, "%s device registered.\n", pdata->name);
+
+	return 0;
+
+fail_rfkill:
+	rfkill_destroy(rfkill->rfkill_dev);
+fail_shutdown:
+	if (gpio_is_valid(pdata->shutdown_gpio))
+		gpio_free(pdata->shutdown_gpio);
+fail_reset:
+	if (gpio_is_valid(pdata->reset_gpio))
+		gpio_free(pdata->reset_gpio);
+fail_clock:
+	if (rfkill->pwr_clk)
+		clk_put(rfkill->pwr_clk);
+fail_shutdown_name:
+	kfree(rfkill->shutdown_name);
+fail_reset_name:
+	kfree(rfkill->reset_name);
+fail_alloc:
+	kfree(rfkill);
+
+	return ret;
+}
+
+static int rfkill_gpio_remove(struct platform_device *pdev)
+{
+	struct rfkill_gpio_data *rfkill = platform_get_drvdata(pdev);
+
+	rfkill_unregister(rfkill->rfkill_dev);
+	rfkill_destroy(rfkill->rfkill_dev);
+	if (gpio_is_valid(rfkill->pdata->shutdown_gpio))
+		gpio_free(rfkill->pdata->shutdown_gpio);
+	if (gpio_is_valid(rfkill->pdata->reset_gpio))
+		gpio_free(rfkill->pdata->reset_gpio);
+	if (rfkill->pwr_clk && PWR_CLK_ENABLED(rfkill))
+		clk_disable(rfkill->pwr_clk);
+	if (rfkill->pwr_clk)
+		clk_put(rfkill->pwr_clk);
+	kfree(rfkill->shutdown_name);
+	kfree(rfkill->reset_name);
+	kfree(rfkill);
+
+	return 0;
+}
+
+static struct platform_driver rfkill_gpio_driver = {
+	.probe = rfkill_gpio_probe,
+	.remove = __devexit_p(rfkill_gpio_remove),
+	.driver = {
+		   .name = "rfkill_gpio",
+		   .owner = THIS_MODULE,
+	},
+};
+
+static int __init rfkill_gpio_init(void)
+{
+	return platform_driver_register(&rfkill_gpio_driver);
+}
+
+static void __exit rfkill_gpio_exit(void)
+{
+	platform_driver_unregister(&rfkill_gpio_driver);
+}
+
+module_init(rfkill_gpio_init);
+module_exit(rfkill_gpio_exit);
+
+MODULE_DESCRIPTION("gpio rfkill");
+MODULE_AUTHOR("NVIDIA");
+MODULE_LICENSE("GPL");
diff --git a/net/rfkill/rfkill-regulator.c b/net/rfkill/rfkill-regulator.c
new file mode 100644
index 00000000..18dc512a
--- /dev/null
+++ b/net/rfkill/rfkill-regulator.c
@@ -0,0 +1,164 @@
+/*
+ * rfkill-regulator.c - Regulator consumer driver for rfkill
+ *
+ * Copyright (C) 2009  Guiming Zhuo <gmzhuo@gmail.com>
+ * Copyright (C) 2011  Antonio Ospite <ospite@studenti.unina.it>
+ *
+ * Implementation inspired by leds-regulator driver.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/rfkill.h>
+#include <linux/rfkill-regulator.h>
+
+/*
+ * Per-device state; allocated in probe, stored as platform driver data and
+ * handed to the rfkill core as the ops data pointer.
+ */
+struct rfkill_regulator_data {
+	/* rfkill device returned by rfkill_alloc() in probe */
+	struct rfkill *rf_kill;
+	/* set when this driver considers the regulator enabled */
+	bool reg_enabled;
+
+	/* supply obtained with regulator_get_exclusive(dev, "vrfkill") */
+	struct regulator *vcc;
+};
+
+/*
+ * rfkill set_block callback: switch the supply regulator to match the
+ * requested block state.
+ *
+ * @data:    the struct rfkill_regulator_data passed to rfkill_alloc()
+ * @blocked: true to block (regulator off), false to unblock (regulator on)
+ *
+ * reg_enabled mirrors our own enable/disable calls so that they stay
+ * balanced; it is only updated when the regulator call succeeded, so a
+ * failed transition can be retried later.
+ *
+ * Returns 0 on success (including when nothing had to change) or the
+ * negative error code from regulator_enable() / regulator_disable().
+ */
+static int rfkill_regulator_set_block(void *data, bool blocked)
+{
+	struct rfkill_regulator_data *rfkill_data = data;
+	int ret = 0;
+
+	pr_debug("%s: blocked: %d\n", __func__, blocked);
+
+	if (blocked) {
+		if (rfkill_data->reg_enabled) {
+			ret = regulator_disable(rfkill_data->vcc);
+			if (!ret)
+				rfkill_data->reg_enabled = false;
+		}
+	} else {
+		if (!rfkill_data->reg_enabled) {
+			ret = regulator_enable(rfkill_data->vcc);
+			if (!ret)
+				rfkill_data->reg_enabled = true;
+		}
+	}
+
+	pr_debug("%s: regulator_is_enabled after set_block: %d\n", __func__,
+		regulator_is_enabled(rfkill_data->vcc));
+
+	return ret;
+}
+
+/*
+ * Operations handed to rfkill_alloc().  Only set_block is provided.  The
+ * table is private to this file and never modified, hence static const.
+ */
+static const struct rfkill_ops rfkill_regulator_ops = {
+	.set_block = rfkill_regulator_set_block,
+};
+
+/*
+ * Bind to an "rfkill-regulator" platform device.
+ *
+ * Requires platform data with a non-NULL name and a non-zero type.  Takes
+ * exclusive ownership of the "vrfkill" supply, allocates the driver state
+ * and an rfkill device, records whether the regulator is already on, and
+ * registers the rfkill device.
+ *
+ * Returns 0 on success.  On failure returns -ENODEV (no platform data),
+ * -EINVAL (bad name/type), -ENOMEM, or the error from
+ * regulator_get_exclusive() / rfkill_register(); everything acquired so
+ * far is released in reverse order.
+ */
+static int __devinit rfkill_regulator_probe(struct platform_device *pdev)
+{
+	struct rfkill_regulator_platform_data *pdata = pdev->dev.platform_data;
+	struct rfkill_regulator_data *rfkill_data;
+	struct regulator *vcc;
+	struct rfkill *rf_kill;
+	int ret = 0;
+
+	if (pdata == NULL) {
+		dev_err(&pdev->dev, "no platform data\n");
+		return -ENODEV;
+	}
+
+	if (pdata->name == NULL || pdata->type == 0) {
+		dev_err(&pdev->dev, "invalid name or type in platform data\n");
+		return -EINVAL;
+	}
+
+	vcc = regulator_get_exclusive(&pdev->dev, "vrfkill");
+	if (IS_ERR(vcc)) {
+		dev_err(&pdev->dev, "Cannot get vcc for %s\n", pdata->name);
+		ret = PTR_ERR(vcc);
+		goto out;
+	}
+
+	rfkill_data = kzalloc(sizeof(*rfkill_data), GFP_KERNEL);
+	if (rfkill_data == NULL) {
+		ret = -ENOMEM;
+		goto err_data_alloc;
+	}
+
+	rf_kill = rfkill_alloc(pdata->name, &pdev->dev,
+				pdata->type,
+				&rfkill_regulator_ops, rfkill_data);
+	if (rf_kill == NULL) {
+		dev_err(&pdev->dev, "Cannot alloc rfkill device\n");
+		ret = -ENOMEM;
+		goto err_rfkill_alloc;
+	}
+
+	/*
+	 * regulator_is_enabled() may return a negative errno; only a
+	 * positive value means "enabled".  Treating an error as enabled
+	 * would later lead to a regulator_disable() without a matching
+	 * enable.
+	 */
+	if (regulator_is_enabled(vcc) > 0) {
+		dev_dbg(&pdev->dev, "Regulator already enabled\n");
+		rfkill_data->reg_enabled = true;
+	}
+	rfkill_data->vcc = vcc;
+	rfkill_data->rf_kill = rf_kill;
+
+	ret = rfkill_register(rf_kill);
+	if (ret) {
+		dev_err(&pdev->dev, "Cannot register rfkill device\n");
+		goto err_rfkill_register;
+	}
+
+	platform_set_drvdata(pdev, rfkill_data);
+	dev_info(&pdev->dev, "%s initialized\n", pdata->name);
+
+	return 0;
+
+err_rfkill_register:
+	rfkill_destroy(rf_kill);
+err_rfkill_alloc:
+	kfree(rfkill_data);
+err_data_alloc:
+	regulator_put(vcc);
+out:
+	return ret;
+}
+
+/*
+ * Unbind from the platform device: unregister and destroy the rfkill
+ * device, give the regulator back and free the driver state.  Always
+ * returns 0.
+ */
+static int __devexit rfkill_regulator_remove(struct platform_device *pdev)
+{
+	struct rfkill_regulator_data *rfkill_data = platform_get_drvdata(pdev);
+
+	rfkill_unregister(rfkill_data->rf_kill);
+	rfkill_destroy(rfkill_data->rf_kill);
+
+	regulator_put(rfkill_data->vcc);
+	kfree(rfkill_data);
+
+	return 0;
+}
+
+/*
+ * Platform driver description.  The name "rfkill-regulator" is also what the
+ * MODULE_ALIAS("platform:rfkill-regulator") at the end of the file refers to.
+ */
+static struct platform_driver rfkill_regulator_driver = {
+	.probe = rfkill_regulator_probe,
+	.remove = __devexit_p(rfkill_regulator_remove),
+	.driver = {
+		.name = "rfkill-regulator",
+		.owner = THIS_MODULE,
+	},
+};
+
+/* Module entry point: register the platform driver and return its result. */
+static int __init rfkill_regulator_init(void)
+{
+	return platform_driver_register(&rfkill_regulator_driver);
+}
+module_init(rfkill_regulator_init);
+
+/* Module exit point: unregister the platform driver registered at init. */
+static void __exit rfkill_regulator_exit(void)
+{
+	platform_driver_unregister(&rfkill_regulator_driver);
+}
+module_exit(rfkill_regulator_exit);
+
+MODULE_AUTHOR("Guiming Zhuo <gmzhuo@gmail.com>");
+MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
+MODULE_DESCRIPTION("Regulator consumer driver for rfkill");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:rfkill-regulator");
diff --git a/net/rfkill/rfkill.h b/net/rfkill/rfkill.h
new file mode 100644
index 00000000..d1117cb6
--- /dev/null
+++ b/net/rfkill/rfkill.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) 2007 Ivo van Doorn
+ * Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+/*
+ * Internal prototypes shared inside net/rfkill.  The include guard is
+ * spelled __RFKILL_INPUT_H although this file is rfkill.h.
+ */
+#ifndef __RFKILL_INPUT_H
+#define __RFKILL_INPUT_H
+
+/*
+ * core code
+ *
+ * NOTE(review): only the declarations are visible from this header; check
+ * the definitions for locking and calling-context requirements.
+ */
+void rfkill_switch_all(const enum rfkill_type type, bool blocked);
+void rfkill_epo(void);
+void rfkill_restore_states(void);
+void rfkill_remove_epo_lock(void);
+bool rfkill_is_epo_lock_active(void);
+bool rfkill_get_global_sw_state(const enum rfkill_type type);
+
+/* input handler: init returns an int status, exit returns nothing */
+int rfkill_handler_init(void);
+void rfkill_handler_exit(void);
+
+#endif /* __RFKILL_INPUT_H */
diff --git a/net/rose/Makefile b/net/rose/Makefile
new file mode 100644
index 00000000..fa248116
--- /dev/null
+++ b/net/rose/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the Linux Rose (X.25 PLP) layer.
+#
+
+# rose.o is built (or not) according to CONFIG_ROSE.
+obj-$(CONFIG_ROSE) += rose.o
+
+# Objects that always make up rose.o.
+rose-y	  := af_rose.o rose_dev.o rose_in.o rose_link.o rose_loopback.o \
+	     rose_out.o rose_route.o rose_subr.o rose_timer.o
+# The sysctl interface is only linked in when CONFIG_SYSCTL is enabled.
+rose-$(CONFIG_SYSCTL) += sysctl_net_rose.o
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
new file mode 100644
index 00000000..f9ea925a
--- /dev/null
+++ b/net/rose/af_rose.c
@@ -0,0 +1,1643 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
+ * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net)
+ * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi)
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/stat.h>
+#include <net/net_namespace.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <net/rose.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/tcp_states.h>
+#include <net/ip.h>
+#include <net/arp.h>
+
+/* NOTE(review): used outside this view; presumably the ROSE device count. */
+static int rose_ndevs = 10;
+
+/*
+ * Tunables, initialised from the ROSE_DEFAULT_* constants.  rose_create()
+ * passes the call/reset/clear request, hold-back and no-activity values
+ * through msecs_to_jiffies(), so those five are kept in milliseconds.
+ */
+int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0;
+int sysctl_rose_call_request_timeout    = ROSE_DEFAULT_T1;
+int sysctl_rose_reset_request_timeout   = ROSE_DEFAULT_T2;
+int sysctl_rose_clear_request_timeout   = ROSE_DEFAULT_T3;
+int sysctl_rose_no_activity_timeout     = ROSE_DEFAULT_IDLE;
+int sysctl_rose_ack_hold_back_timeout   = ROSE_DEFAULT_HB;
+int sysctl_rose_routing_control         = ROSE_DEFAULT_ROUTING;
+int sysctl_rose_link_fail_timeout       = ROSE_DEFAULT_FAIL_TIMEOUT;
+int sysctl_rose_maximum_vcs             = ROSE_DEFAULT_MAXVC;
+int sysctl_rose_window_size             = ROSE_DEFAULT_WINDOW_SIZE;
+
+/* List of ROSE sockets; every walk or update below takes rose_list_lock. */
+static HLIST_HEAD(rose_list);
+static DEFINE_SPINLOCK(rose_list_lock);
+
+/* Forward declaration; assigned to sock->ops in rose_create(). */
+static const struct proto_ops rose_proto_ops;
+
+ax25_address rose_callsign;
+
+/*
+ * ROSE network devices are virtual network devices encapsulating ROSE
+ * frames into AX.25 which will be sent through an AX.25 device, so form a
+ * special "super class" of normal net devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key rose_netdev_xmit_lock_key;
+static struct lock_class_key rose_netdev_addr_lock_key;
+
+/*
+ * Per-TX-queue callback for netdev_for_each_tx_queue(): put the queue's
+ * _xmit_lock into the ROSE-specific lockdep class.  @dev and @_unused are
+ * not used.
+ */
+static void rose_set_lockdep_one(struct net_device *dev,
+				 struct netdev_queue *txq,
+				 void *_unused)
+{
+	lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key);
+}
+
+/*
+ * Assign the ROSE lockdep classes to @dev: one for its addr_list_lock and,
+ * via rose_set_lockdep_one(), one for the _xmit_lock of every TX queue.
+ */
+static void rose_set_lockdep_key(struct net_device *dev)
+{
+	lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key);
+	netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL);
+}
+
+/*
+ *	Render a ROSE address as text in @buf and return @buf.
+ *
+ *	An address whose five bytes are all zero is written as "*"; any other
+ *	address is written as ten upper-case hex digits.  @buf must have room
+ *	for the digits plus the terminating NUL.
+ */
+char *rose2asc(char *buf, const rose_address *addr)
+{
+	int i;
+	int all_zero = 1;
+
+	for (i = 0; i < 5; i++) {
+		if (addr->rose_addr[i] != 0x00) {
+			all_zero = 0;
+			break;
+		}
+	}
+
+	if (all_zero) {
+		strcpy(buf, "*");
+		return buf;
+	}
+
+	sprintf(buf, "%02X%02X%02X%02X%02X",
+		addr->rose_addr[0] & 0xFF,
+		addr->rose_addr[1] & 0xFF,
+		addr->rose_addr[2] & 0xFF,
+		addr->rose_addr[3] & 0xFF,
+		addr->rose_addr[4] & 0xFF);
+
+	return buf;
+}
+
+/*
+ *	Byte-wise comparison of two ROSE addresses.
+ *	Returns 0 when all five bytes match and 1 otherwise.
+ */
+int rosecmp(rose_address *addr1, rose_address *addr2)
+{
+	int idx = 0;
+
+	while (idx < 5 && addr1->rose_addr[idx] == addr2->rose_addr[idx])
+		idx++;
+
+	return (idx < 5) ? 1 : 0;
+}
+
+/*
+ *	Compare the first @mask digits (4-bit nibbles, high nibble first) of
+ *	two ROSE addresses.  Returns 0 when they all match, 1 on the first
+ *	difference or when @mask is larger than the ten digits an address
+ *	holds.
+ */
+int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask)
+{
+	unsigned int digit;
+
+	if (mask > 10)
+		return 1;
+
+	for (digit = 0; digit < mask; digit++) {
+		unsigned int byte = digit / 2;
+		/* even digits live in the high nibble, odd ones in the low */
+		int nibble = (digit % 2) ? 0x0F : 0xF0;
+
+		if ((addr1->rose_addr[byte] & nibble) !=
+		    (addr2->rose_addr[byte] & nibble))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ *	Socket removal during an interrupt is now safe.
+ *
+ *	Unlinks @sk from rose_list while holding rose_list_lock with bottom
+ *	halves disabled.
+ */
+static void rose_remove_socket(struct sock *sk)
+{
+	spin_lock_bh(&rose_list_lock);
+	sk_del_node_init(sk);
+	spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ *	A link layer connection to @neigh has broken: disconnect every socket
+ *	on rose_list that uses that neighbour (error ENETUNREACH, cause
+ *	ROSE_OUT_OF_ORDER), drop the neighbour's use count once per socket
+ *	and detach the socket from the neighbour.
+ */
+void rose_kill_by_neigh(struct rose_neigh *neigh)
+{
+	struct hlist_node *node;
+	struct sock *s;
+
+	spin_lock_bh(&rose_list_lock);
+
+	sk_for_each(s, node, &rose_list) {
+		struct rose_sock *rose = rose_sk(s);
+
+		if (rose->neighbour != neigh)
+			continue;
+
+		rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
+		rose->neighbour->use--;
+		rose->neighbour = NULL;
+	}
+
+	spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ *	Kill all bound sockets on a dropped device.
+ *
+ *	Walks rose_list under rose_list_lock and, for every socket bound to
+ *	@dev, disconnects it with ENETUNREACH and clears its device pointer.
+ *	A socket can be bound to a device without having a neighbour:
+ *	rose_bind() sets ->device but never ->neighbour.  The neighbour use
+ *	count is therefore only dropped when a neighbour is attached.
+ */
+static void rose_kill_by_device(struct net_device *dev)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	spin_lock_bh(&rose_list_lock);
+	sk_for_each(s, node, &rose_list) {
+		struct rose_sock *rose = rose_sk(s);
+
+		if (rose->device == dev) {
+			rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
+			if (rose->neighbour)
+				rose->neighbour->use--;
+			rose->device = NULL;
+		}
+	}
+	spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ *	Netdevice notifier callback.
+ *
+ *	Only NETDEV_DOWN events for devices in init_net are acted upon.  A
+ *	ROSE device going down kills the sockets bound to it; an AX.25 device
+ *	going down is passed to the link and routing code.  Always returns
+ *	NOTIFY_DONE.
+ */
+static int rose_device_event(struct notifier_block *this, unsigned long event,
+	void *ptr)
+{
+	struct net_device *dev = (struct net_device *)ptr;
+
+	if (!net_eq(dev_net(dev), &init_net) || event != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	if (dev->type == ARPHRD_ROSE) {
+		rose_kill_by_device(dev);
+	} else if (dev->type == ARPHRD_AX25) {
+		rose_link_device_down(dev);
+		rose_rt_device_down(dev);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ *	Add a socket to the bound sockets list.
+ *
+ *	The insertion into rose_list is done under rose_list_lock with bottom
+ *	halves disabled.
+ */
+static void rose_insert_socket(struct sock *sk)
+{
+
+	spin_lock_bh(&rose_list_lock);
+	sk_add_node(sk, &rose_list);
+	spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ *	Look up the listening socket that should take an incoming Call
+ *	Request for @addr / @call.
+ *
+ *	Pass 1 wants a listener whose ROSE address and callsign both match
+ *	and which has no source digipeaters.  Pass 2, tried only when pass 1
+ *	finds nothing, accepts a listener on the same ROSE address whose
+ *	callsign is the null AX.25 address.  Returns the socket or NULL; the
+ *	list lock is released before returning.
+ */
+static struct sock *rose_find_listener(rose_address *addr, ax25_address *call)
+{
+	struct sock *s;
+	struct sock *listener = NULL;
+	struct hlist_node *node;
+
+	spin_lock_bh(&rose_list_lock);
+
+	sk_for_each(s, node, &rose_list) {
+		struct rose_sock *rose = rose_sk(s);
+
+		if (rosecmp(&rose->source_addr, addr))
+			continue;
+		if (ax25cmp(&rose->source_call, call))
+			continue;
+		if (rose->source_ndigis || s->sk_state != TCP_LISTEN)
+			continue;
+
+		listener = s;
+		break;
+	}
+
+	if (listener == NULL) {
+		sk_for_each(s, node, &rose_list) {
+			struct rose_sock *rose = rose_sk(s);
+
+			if (rosecmp(&rose->source_addr, addr))
+				continue;
+			if (ax25cmp(&rose->source_call, &null_ax25_address))
+				continue;
+			if (s->sk_state != TCP_LISTEN)
+				continue;
+
+			listener = s;
+			break;
+		}
+	}
+
+	spin_unlock_bh(&rose_list_lock);
+
+	return listener;
+}
+
+/*
+ *	Return the first socket on rose_list whose LCI is @lci and whose
+ *	neighbour is @neigh, or NULL when there is none.  The list lock is
+ *	released before returning.
+ */
+struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh)
+{
+	struct sock *s;
+	struct sock *match = NULL;
+	struct hlist_node *node;
+
+	spin_lock_bh(&rose_list_lock);
+
+	sk_for_each(s, node, &rose_list) {
+		struct rose_sock *rose = rose_sk(s);
+
+		if (rose->lci != lci || rose->neighbour != neigh)
+			continue;
+
+		match = s;
+		break;
+	}
+
+	spin_unlock_bh(&rose_list_lock);
+
+	return match;
+}
+
+/*
+ *	True when neither rose_find_socket() nor rose_route_free_lci()
+ *	returns an entry for @lci on @neigh.
+ */
+static int rose_lci_unused(int lci, struct rose_neigh *neigh)
+{
+	if (rose_find_socket(lci, neigh) != NULL)
+		return 0;
+
+	return rose_route_free_lci(lci, neigh) == NULL;
+}
+
+/*
+ *	Pick an LCI that is not yet in use on @neigh.
+ *
+ *	With dce_mode set the search runs upwards from 1 to
+ *	sysctl_rose_maximum_vcs, otherwise downwards from
+ *	sysctl_rose_maximum_vcs to 1.  Returns the LCI found, or 0 when every
+ *	candidate is taken.
+ */
+unsigned int rose_new_lci(struct rose_neigh *neigh)
+{
+	int lci;
+
+	if (neigh->dce_mode) {
+		lci = 1;
+		while (lci <= sysctl_rose_maximum_vcs) {
+			if (rose_lci_unused(lci, neigh))
+				return lci;
+			lci++;
+		}
+		return 0;
+	}
+
+	lci = sysctl_rose_maximum_vcs;
+	while (lci > 0) {
+		if (rose_lci_unused(lci, neigh))
+			return lci;
+		lci--;
+	}
+
+	return 0;
+}
+
+/*
+ *	Deferred destroy.
+ *
+ *	Forward declaration: rose_destroy_timer() below needs it before the
+ *	definition appears.
+ */
+void rose_destroy_socket(struct sock *);
+
+/*
+ *	Handler for deferred kills.
+ *
+ *	Timer callback armed by rose_destroy_socket(); @data is the struct
+ *	sock pointer cast to unsigned long, and the destroy is attempted
+ *	again.
+ */
+static void rose_destroy_timer(unsigned long data)
+{
+	rose_destroy_socket((struct sock *)data);
+}
+
+/*
+ *	This is called from user mode and the timers. Thus it protects itself
+ *	against interrupt users but doesn't worry about being called during
+ *	work.  Once it is removed from the queue no interrupt or bottom half
+ *	will touch it and we are (fairly 8-) ) safe.
+ *
+ *	The socket is unlinked from rose_list, its heartbeat, idle timer and
+ *	timer are stopped and its queues are flushed.  If buffers are still
+ *	outstanding the destroy is retried from sk_timer ten seconds later;
+ *	otherwise the socket reference is dropped with sock_put().
+ */
+void rose_destroy_socket(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	rose_remove_socket(sk);
+	rose_stop_heartbeat(sk);
+	rose_stop_idletimer(sk);
+	rose_stop_timer(sk);
+
+	rose_clear_queues(sk);		/* Flush the queues */
+
+	/* Drain the receive queue, including connections never accepted. */
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		if (skb->sk != sk) {	/* A pending connection */
+			/* Queue the unaccepted socket for death */
+			sock_set_flag(skb->sk, SOCK_DEAD);
+			rose_start_heartbeat(skb->sk);
+			rose_sk(skb->sk)->state = ROSE_STATE_0;
+		}
+
+		kfree_skb(skb);
+	}
+
+	if (sk_has_allocations(sk)) {
+		/* Defer: outstanding buffers */
+		setup_timer(&sk->sk_timer, rose_destroy_timer,
+				(unsigned long)sk);
+		sk->sk_timer.expires  = jiffies + 10 * HZ;
+		add_timer(&sk->sk_timer);
+	} else
+		sock_put(sk);
+}
+
+/*
+ *	Handling for system calls applied via the various interfaces to a
+ *	ROSE socket object.
+ */
+
+/*
+ *	Set a SOL_ROSE socket option.
+ *
+ *	The value is read as one int from @optval.  ROSE_T1, ROSE_T2, ROSE_T3
+ *	and ROSE_HOLDBACK are given in seconds and ROSE_IDLE in minutes; all
+ *	five are stored in jiffies.  ROSE_DEFER and ROSE_QBITINCL are stored
+ *	as 0/1 flags.
+ *
+ *	The conversion to jiffies is an int multiplication, so values whose
+ *	product would not fit in an int are rejected instead of being
+ *	allowed to overflow.
+ *
+ *	Returns 0 on success, -ENOPROTOOPT for a foreign level or unknown
+ *	option, -EINVAL for a short @optlen or an out-of-range value, and
+ *	-EFAULT if @optval cannot be read.
+ */
+static int rose_setsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	int opt;
+
+	if (level != SOL_ROSE)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(opt, (int __user *)optval))
+		return -EFAULT;
+
+	switch (optname) {
+	case ROSE_DEFER:
+		rose->defer = opt ? 1 : 0;
+		return 0;
+
+	case ROSE_T1:
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		rose->t1 = opt * HZ;
+		return 0;
+
+	case ROSE_T2:
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		rose->t2 = opt * HZ;
+		return 0;
+
+	case ROSE_T3:
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		rose->t3 = opt * HZ;
+		return 0;
+
+	case ROSE_HOLDBACK:
+		if (opt < 1 || opt > INT_MAX / HZ)
+			return -EINVAL;
+		rose->hb = opt * HZ;
+		return 0;
+
+	case ROSE_IDLE:
+		/* 0 is accepted here, unlike the timers above */
+		if (opt < 0 || opt > INT_MAX / (60 * HZ))
+			return -EINVAL;
+		rose->idle = opt * 60 * HZ;
+		return 0;
+
+	case ROSE_QBITINCL:
+		rose->qbitincl = opt ? 1 : 0;
+		return 0;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+}
+
+/*
+ *	Read back a SOL_ROSE socket option.
+ *
+ *	Timer values are converted from jiffies to the units used by
+ *	rose_setsockopt() (seconds, or minutes for ROSE_IDLE).  At most
+ *	sizeof(int) bytes are copied to @optval and the number of bytes
+ *	copied is written back through @optlen.
+ *
+ *	Returns 0 on success, -ENOPROTOOPT for a foreign level or unknown
+ *	option, -EINVAL for a negative length and -EFAULT on a failed user
+ *	access.
+ */
+static int rose_getsockopt(struct socket *sock, int level, int optname,
+	char __user *optval, int __user *optlen)
+{
+	struct rose_sock *rose = rose_sk(sock->sk);
+	int len;
+	int val = 0;
+
+	if (level != SOL_ROSE)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case ROSE_DEFER:
+		val = rose->defer;
+		break;
+	case ROSE_T1:
+		val = rose->t1 / HZ;
+		break;
+	case ROSE_T2:
+		val = rose->t2 / HZ;
+		break;
+	case ROSE_T3:
+		val = rose->t3 / HZ;
+		break;
+	case ROSE_HOLDBACK:
+		val = rose->hb / HZ;
+		break;
+	case ROSE_IDLE:
+		val = rose->idle / (60 * HZ);
+		break;
+	case ROSE_QBITINCL:
+		val = rose->qbitincl;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	/* never hand out more than the int we actually have */
+	len = min_t(unsigned int, len, sizeof(int));
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *	Put the socket into the listening state.
+ *
+ *	All destination information (address, callsign, digipeaters) is
+ *	cleared and the accept backlog limit is set to @backlog.  A socket
+ *	that is already listening gets -EOPNOTSUPP; otherwise 0 is returned.
+ */
+static int rose_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -EOPNOTSUPP;
+
+	rose = rose_sk(sk);
+
+	rose->dest_ndigis = 0;
+	memset(&rose->dest_addr, 0, ROSE_ADDR_LEN);
+	memset(&rose->dest_call, 0, AX25_ADDR_LEN);
+	memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS);
+
+	sk->sk_max_ack_backlog = backlog;
+	sk->sk_state           = TCP_LISTEN;
+
+	return 0;
+}
+
+/*
+ * Protocol descriptor passed to sk_alloc() by rose_create() and
+ * rose_make_new(); obj_size makes each allocation a struct rose_sock.
+ */
+static struct proto rose_proto = {
+	.name	  = "ROSE",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct rose_sock),
+};
+
+/*
+ *	Create a new ROSE socket for @sock.
+ *
+ *	Only init_net, SOCK_SEQPACKET and protocol 0 are supported.  The new
+ *	socket starts in ROSE_STATE_0 with its timers taken from the current
+ *	sysctl values.
+ *
+ *	Returns 0 on success, -EAFNOSUPPORT for another namespace,
+ *	-ESOCKTNOSUPPORT for another type/protocol and -ENOMEM when the
+ *	socket cannot be allocated.
+ */
+static int rose_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	struct rose_sock *rose;
+	struct sock *sk;
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	if (sock->type != SOCK_SEQPACKET || protocol != 0)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	rose = rose_sk(sk);
+
+	sock_init_data(sock, sk);
+
+	skb_queue_head_init(&rose->ack_queue);
+#ifdef M_BIT
+	skb_queue_head_init(&rose->frag_queue);
+	rose->fraglen    = 0;
+#endif
+
+	sock->ops       = &rose_proto_ops;
+	sk->sk_protocol = protocol;
+
+	init_timer(&rose->timer);
+	init_timer(&rose->idletimer);
+
+	/* the sysctl values go through msecs_to_jiffies() */
+	rose->t1   = msecs_to_jiffies(sysctl_rose_call_request_timeout);
+	rose->t2   = msecs_to_jiffies(sysctl_rose_reset_request_timeout);
+	rose->t3   = msecs_to_jiffies(sysctl_rose_clear_request_timeout);
+	rose->hb   = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout);
+	rose->idle = msecs_to_jiffies(sysctl_rose_no_activity_timeout);
+
+	rose->state = ROSE_STATE_0;
+
+	return 0;
+}
+
+/*
+ *	Allocate a socket for an incoming connection, modelled on the
+ *	listening socket @osk.
+ *
+ *	Generic socket settings (type, priority, protocol, buffer sizes,
+ *	flags) and the ROSE timers, defer/qbitincl options and device are
+ *	copied from @osk; the new socket is marked TCP_ESTABLISHED.  Returns
+ *	the new socket, or NULL if @osk is not SOCK_SEQPACKET or allocation
+ *	fails.
+ */
+static struct sock *rose_make_new(struct sock *osk)
+{
+	struct rose_sock *rose, *orose;
+	struct sock *sk;
+
+	if (osk->sk_type != SOCK_SEQPACKET)
+		return NULL;
+
+	sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto);
+	if (!sk)
+		return NULL;
+
+	rose = rose_sk(sk);
+
+	sock_init_data(NULL, sk);
+
+	skb_queue_head_init(&rose->ack_queue);
+#ifdef M_BIT
+	skb_queue_head_init(&rose->frag_queue);
+	rose->fraglen  = 0;
+#endif
+
+	/* generic socket state inherited from the listener */
+	sk->sk_type     = osk->sk_type;
+	sk->sk_priority = osk->sk_priority;
+	sk->sk_protocol = osk->sk_protocol;
+	sk->sk_rcvbuf   = osk->sk_rcvbuf;
+	sk->sk_sndbuf   = osk->sk_sndbuf;
+	sk->sk_state    = TCP_ESTABLISHED;
+	sock_copy_flags(sk, osk);
+
+	init_timer(&rose->timer);
+	init_timer(&rose->idletimer);
+
+	/* ROSE specific state inherited from the listener */
+	orose = rose_sk(osk);
+
+	rose->t1       = orose->t1;
+	rose->t2       = orose->t2;
+	rose->t3       = orose->t3;
+	rose->hb       = orose->hb;
+	rose->idle     = orose->idle;
+	rose->defer    = orose->defer;
+	rose->device   = orose->device;
+	rose->qbitincl = orose->qbitincl;
+
+	return sk;
+}
+
+/*
+ *	Release (close) a ROSE socket.
+ *
+ *	What happens depends on the ROSE state.  In states 0 and 2 the socket
+ *	is disconnected and destroyed straight away.  In states 1, 3, 4 and 5
+ *	a Clear Request is sent, the T3 timer is started and the socket is
+ *	left in state 2, flagged SOCK_DEAD and SOCK_DESTROY.  Always returns
+ *	0.
+ */
+static int rose_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose;
+
+	if (sk == NULL) return 0;
+
+	/* Extra reference for the duration of this call; dropped at the end. */
+	sock_hold(sk);
+	sock_orphan(sk);
+	lock_sock(sk);
+	rose = rose_sk(sk);
+
+	switch (rose->state) {
+	case ROSE_STATE_0:
+		/*
+		 * NOTE(review): the socket lock is dropped around
+		 * rose_disconnect() and retaken before the destroy; the
+		 * reason is not visible from this function.
+		 */
+		release_sock(sk);
+		rose_disconnect(sk, 0, -1, -1);
+		lock_sock(sk);
+		rose_destroy_socket(sk);
+		break;
+
+	case ROSE_STATE_2:
+		/* Same as state 0, but first give up our use of the neighbour. */
+		rose->neighbour->use--;
+		release_sock(sk);
+		rose_disconnect(sk, 0, -1, -1);
+		lock_sock(sk);
+		rose_destroy_socket(sk);
+		break;
+
+	case ROSE_STATE_1:
+	case ROSE_STATE_3:
+	case ROSE_STATE_4:
+	case ROSE_STATE_5:
+		/* Send a Clear Request and move to state 2 with T3 running. */
+		rose_clear_queues(sk);
+		rose_stop_idletimer(sk);
+		rose_write_internal(sk, ROSE_CLEAR_REQUEST);
+		rose_start_t3timer(sk);
+		rose->state  = ROSE_STATE_2;
+		sk->sk_state    = TCP_CLOSE;
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+		sock_set_flag(sk, SOCK_DESTROY);
+		break;
+
+	default:
+		break;
+	}
+
+	sock->sk = NULL;
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ *	Bind a ROSE socket to a local ROSE address.
+ *
+ *	@uaddr is a struct sockaddr_rose (at most one digipeater, in
+ *	srose_digi) or a struct full_sockaddr_rose (up to ROSE_MAX_DIGIS
+ *	digipeaters in srose_digis).  The address must belong to a ROSE
+ *	device.  The source callsign is the one associated with the caller's
+ *	euid if there is such an association; otherwise the callsign from
+ *	@uaddr is used, which needs CAP_NET_BIND_SERVICE when ax25_uid_policy
+ *	is set.
+ *
+ *	Returns 0 on success, -EINVAL for an already bound socket or a
+ *	malformed address, -EADDRNOTAVAIL when no device has the address and
+ *	-EACCES when the callsign may not be used.
+ */
+static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
+	struct net_device *dev;
+	ax25_address *source;
+	ax25_uid_assoc *user;
+	int n;
+
+	if (!sock_flag(sk, SOCK_ZAPPED))
+		return -EINVAL;
+
+	if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose))
+		return -EINVAL;
+
+	if (addr->srose_family != AF_ROSE)
+		return -EINVAL;
+
+	if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
+		return -EINVAL;
+
+	if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
+		return -EINVAL;
+
+	if ((dev = rose_dev_get(&addr->srose_addr)) == NULL)
+		return -EADDRNOTAVAIL;
+
+	source = &addr->srose_call;
+
+	user = ax25_findbyuid(current_euid());
+	if (user) {
+		rose->source_call = user->call;
+		ax25_uid_put(user);
+	} else {
+		if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
+			/*
+			 * rose_dev_get() hands the device back with a
+			 * reference held (dev_hold() in rose_route.c).
+			 * Nothing keeps the device on this path, so drop
+			 * the reference instead of leaking it.
+			 */
+			dev_put(dev);
+			return -EACCES;
+		}
+		rose->source_call   = *source;
+	}
+
+	rose->source_addr   = addr->srose_addr;
+	rose->device        = dev;
+	rose->source_ndigis = addr->srose_ndigis;
+
+	if (addr_len == sizeof(struct full_sockaddr_rose)) {
+		struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr;
+		for (n = 0 ; n < addr->srose_ndigis ; n++)
+			rose->source_digis[n] = full_addr->srose_digis[n];
+	} else {
+		if (rose->source_ndigis == 1) {
+			rose->source_digis[0] = addr->srose_digi;
+		}
+	}
+
+	rose_insert_socket(sk);
+
+	sock_reset_flag(sk, SOCK_ZAPPED);
+
+	return 0;
+}
+
+/*
+ *	Connect a ROSE socket to the address in @uaddr.
+ *
+ *	@uaddr is a struct sockaddr_rose or struct full_sockaddr_rose, checked
+ *	the same way as in rose_bind().  A neighbour and a free LCI are looked
+ *	up, an unbound socket is auto-bound to the first ROSE device, a Call
+ *	Request is sent and, unless O_NONBLOCK is set in @flags, the caller
+ *	sleeps until the socket leaves TCP_SYN_SENT or a signal arrives.
+ *
+ *	Returns 0 when connected, otherwise a negative errno: -EINVAL,
+ *	-ECONNREFUSED, -EISCONN, -ENETUNREACH, -EINPROGRESS, -ERESTARTSYS or
+ *	the pending socket error.
+ */
+static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
+	unsigned char cause, diagnostic;
+	struct net_device *dev;
+	ax25_uid_assoc *user;
+	int n, err = 0;
+
+	/* Address validation; done before the socket lock is taken. */
+	if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose))
+		return -EINVAL;
+
+	if (addr->srose_family != AF_ROSE)
+		return -EINVAL;
+
+	if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
+		return -EINVAL;
+
+	if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
+		return -EINVAL;
+
+	/* Source + Destination digis should not exceed ROSE_MAX_DIGIS */
+	if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+		/* Connect completed during a ERESTARTSYS event */
+		sock->state = SS_CONNECTED;
+		goto out_release;
+	}
+
+	if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+		/* An earlier connect attempt ended with the socket closed. */
+		sock->state = SS_UNCONNECTED;
+		err = -ECONNREFUSED;
+		goto out_release;
+	}
+
+	if (sk->sk_state == TCP_ESTABLISHED) {
+		/* No reconnect on a seqpacket socket */
+		err = -EISCONN;
+		goto out_release;
+	}
+
+	sk->sk_state   = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	/* cause/diagnostic are filled in by the lookup but not used here. */
+	rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause,
+					 &diagnostic, 0);
+	if (!rose->neighbour) {
+		err = -ENETUNREACH;
+		goto out_release;
+	}
+
+	rose->lci = rose_new_lci(rose->neighbour);
+	if (!rose->lci) {
+		err = -ENETUNREACH;
+		goto out_release;
+	}
+
+	if (sock_flag(sk, SOCK_ZAPPED)) {	/* Must bind first - autobinding in this may or may not work */
+		/*
+		 * NOTE(review): SOCK_ZAPPED is cleared before the two checks
+		 * below, so a failure here leaves the socket un-zapped
+		 * although it was never inserted into rose_list.
+		 */
+		sock_reset_flag(sk, SOCK_ZAPPED);
+
+		if ((dev = rose_dev_first()) == NULL) {
+			err = -ENETUNREACH;
+			goto out_release;
+		}
+
+		user = ax25_findbyuid(current_euid());
+		if (!user) {
+			err = -EINVAL;
+			goto out_release;
+		}
+
+		memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
+		rose->source_call = user->call;
+		rose->device      = dev;
+		ax25_uid_put(user);
+
+		rose_insert_socket(sk);		/* Finish the bind */
+	}
+	rose->dest_addr   = addr->srose_addr;
+	rose->dest_call   = addr->srose_call;
+	/* Low 16 bits of the rose_sock pointer plus the LCI. */
+	rose->rand        = ((long)rose & 0xFFFF) + rose->lci;
+	rose->dest_ndigis = addr->srose_ndigis;
+
+	if (addr_len == sizeof(struct full_sockaddr_rose)) {
+		struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr;
+		for (n = 0 ; n < addr->srose_ndigis ; n++)
+			rose->dest_digis[n] = full_addr->srose_digis[n];
+	} else {
+		if (rose->dest_ndigis == 1) {
+			rose->dest_digis[0] = addr->srose_digi;
+		}
+	}
+
+	/* Move to connecting socket, start sending Connect Requests */
+	sock->state   = SS_CONNECTING;
+	sk->sk_state     = TCP_SYN_SENT;
+
+	rose->state = ROSE_STATE_1;
+
+	/* The socket now counts as a user of the neighbour. */
+	rose->neighbour->use++;
+
+	rose_write_internal(sk, ROSE_CALL_REQUEST);
+	rose_start_heartbeat(sk);
+	rose_start_t1timer(sk);
+
+	/* Now the loop */
+	if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
+		err = -EINPROGRESS;
+		goto out_release;
+	}
+
+	/*
+	 * A Connect Ack with Choke or timeout or failed routing will go to
+	 * closed.
+	 */
+	if (sk->sk_state == TCP_SYN_SENT) {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * Sleep, with the socket lock released, until the state
+		 * changes or a signal is pending.
+		 */
+		for (;;) {
+			prepare_to_wait(sk_sleep(sk), &wait,
+					TASK_INTERRUPTIBLE);
+			if (sk->sk_state != TCP_SYN_SENT)
+				break;
+			if (!signal_pending(current)) {
+				release_sock(sk);
+				schedule();
+				lock_sock(sk);
+				continue;
+			}
+			err = -ERESTARTSYS;
+			break;
+		}
+		finish_wait(sk_sleep(sk), &wait);
+
+		if (err)
+			goto out_release;
+	}
+
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		sock->state = SS_UNCONNECTED;
+		err = sock_error(sk);	/* Always set at this point */
+		goto out_release;
+	}
+
+	sock->state = SS_CONNECTED;
+
+out_release:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Accept a pending connection on a listening ROSE socket.
+ *
+ *	Takes the next skb from the listener's receive queue, grafts the
+ *	socket carried in skb->sk onto @newsock and frees the skb.  With an
+ *	empty queue the caller sleeps unless O_NONBLOCK is set in @flags.
+ *
+ *	Returns 0 on success, -EINVAL (no socket, or not listening),
+ *	-EOPNOTSUPP (not SOCK_SEQPACKET), -EWOULDBLOCK or -ERESTARTSYS.
+ */
+static int rose_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sk_buff *skb;
+	struct sock *newsk;
+	DEFINE_WAIT(wait);
+	struct sock *sk;
+	int err = 0;
+
+	if ((sk = sock->sk) == NULL)
+		return -EINVAL;
+
+	lock_sock(sk);
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EOPNOTSUPP;
+		goto out_release;
+	}
+
+	if (sk->sk_state != TCP_LISTEN) {
+		err = -EINVAL;
+		goto out_release;
+	}
+
+	/*
+	 *	The receive queue of a listener holds the skbs queued by
+	 *	rose_rx_call_request(); each one carries a ready to use
+	 *	socket in skb->sk.
+	 */
+	for (;;) {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (skb)
+			break;
+
+		if (flags & O_NONBLOCK) {
+			err = -EWOULDBLOCK;
+			break;
+		}
+		if (!signal_pending(current)) {
+			/* Sleep with the socket lock released. */
+			release_sock(sk);
+			schedule();
+			lock_sock(sk);
+			continue;
+		}
+		err = -ERESTARTSYS;
+		break;
+	}
+	finish_wait(sk_sleep(sk), &wait);
+	if (err)
+		goto out_release;
+
+	newsk = skb->sk;
+	sock_graft(newsk, newsock);
+
+	/* Now attach up the new socket */
+	skb->sk = NULL;
+	kfree_skb(skb);
+	sk->sk_ack_backlog--;
+
+out_release:
+	release_sock(sk);
+
+	return err;
+}
+
+/*
+ *	Report the local (@peer == 0) or remote (@peer != 0) address of a
+ *	ROSE socket as a zero-padded struct full_sockaddr_rose.
+ *
+ *	Returns 0 and sets *@uaddr_len, or -ENOTCONN when the peer address is
+ *	requested on a socket that is not TCP_ESTABLISHED.
+ */
+static int rose_getname(struct socket *sock, struct sockaddr *uaddr,
+	int *uaddr_len, int peer)
+{
+	struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr;
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	int n;
+
+	memset(srose, 0, sizeof(*srose));
+
+	if (peer != 0 && sk->sk_state != TCP_ESTABLISHED)
+		return -ENOTCONN;
+
+	srose->srose_family = AF_ROSE;
+
+	if (peer == 0) {
+		srose->srose_addr   = rose->source_addr;
+		srose->srose_call   = rose->source_call;
+		srose->srose_ndigis = rose->source_ndigis;
+		for (n = 0; n < rose->source_ndigis; n++)
+			srose->srose_digis[n] = rose->source_digis[n];
+	} else {
+		srose->srose_addr   = rose->dest_addr;
+		srose->srose_call   = rose->dest_call;
+		srose->srose_ndigis = rose->dest_ndigis;
+		for (n = 0; n < rose->dest_ndigis; n++)
+			srose->srose_digis[n] = rose->dest_digis[n];
+	}
+
+	*uaddr_len = sizeof(struct full_sockaddr_rose);
+
+	return 0;
+}
+
+/*
+ *	Handle an incoming Call Request frame.
+ *
+ *	The facilities field is parsed, a matching listener is looked up and
+ *	a new socket is created from it.  The new socket is attached to @skb,
+ *	which is queued on the listener's receive queue for rose_accept().
+ *
+ *	Returns 1 when the call was taken and @skb queued.  Returns 0 after
+ *	sending a Clear Request when the facilities are invalid, no listener
+ *	exists, the listener's accept queue is full or no socket could be
+ *	made.
+ */
+int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci)
+{
+	struct sock *sk;
+	struct sock *make;
+	struct rose_sock *make_rose;
+	struct rose_facilities_struct facilities;
+	int n;
+
+	skb->sk = NULL;		/* Initially we don't know who it's for */
+
+	/*
+	 *	skb->data points to the rose frame start
+	 */
+	memset(&facilities, 0x00, sizeof(struct rose_facilities_struct));
+
+	if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF,
+				   skb->len - ROSE_CALL_REQ_FACILITIES_OFF,
+				   &facilities)) {
+		rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76);
+		return 0;
+	}
+
+	sk = rose_find_listener(&facilities.source_addr, &facilities.source_call);
+
+	/*
+	 * We can't accept the Call Request.
+	 */
+	if (sk == NULL || sk_acceptq_is_full(sk) ||
+	    (make = rose_make_new(sk)) == NULL) {
+		rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120);
+		return 0;
+	}
+
+	skb->sk     = make;
+	make->sk_state = TCP_ESTABLISHED;
+	make_rose = rose_sk(make);
+
+	/*
+	 * Copy the addressing from the parsed facilities.
+	 * NOTE(review): the digipeater loops trust the *_ndigis counts;
+	 * presumably rose_parse_facilities() bounds them - confirm.
+	 */
+	make_rose->lci           = lci;
+	make_rose->dest_addr     = facilities.dest_addr;
+	make_rose->dest_call     = facilities.dest_call;
+	make_rose->dest_ndigis   = facilities.dest_ndigis;
+	for (n = 0 ; n < facilities.dest_ndigis ; n++)
+		make_rose->dest_digis[n] = facilities.dest_digis[n];
+	make_rose->source_addr   = facilities.source_addr;
+	make_rose->source_call   = facilities.source_call;
+	make_rose->source_ndigis = facilities.source_ndigis;
+	for (n = 0 ; n < facilities.source_ndigis ; n++)
+		make_rose->source_digis[n]= facilities.source_digis[n];
+	make_rose->neighbour     = neigh;
+	make_rose->device        = dev;
+	make_rose->facilities    = facilities;
+
+	make_rose->neighbour->use++;
+
+	/*
+	 * With the listener's defer option set the Call Accepted is not
+	 * sent here and the new socket is left in state 5.
+	 */
+	if (rose_sk(sk)->defer) {
+		make_rose->state = ROSE_STATE_5;
+	} else {
+		rose_write_internal(make, ROSE_CALL_ACCEPTED);
+		make_rose->state = ROSE_STATE_3;
+		rose_start_idletimer(make);
+	}
+
+	make_rose->condition = 0x00;
+	make_rose->vs        = 0;
+	make_rose->va        = 0;
+	make_rose->vr        = 0;
+	make_rose->vl        = 0;
+	sk->sk_ack_backlog++;
+
+	rose_insert_socket(make);
+
+	/* Hand the new socket to rose_accept() via the listener's queue. */
+	skb_queue_head(&sk->sk_receive_queue, skb);
+
+	rose_start_heartbeat(make);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb->len);
+
+	return 1;
+}
+
+/*
+ * Queue one message for transmission on a ROSE socket.
+ *
+ * msg->msg_name is optional; when supplied it must describe exactly the
+ * peer (address, callsign and digipeaters) stored in the socket, otherwise
+ * -EISCONN is returned.  Without an address the socket has to be in
+ * TCP_ESTABLISHED state.  At most 65535 bytes of payload are accepted.
+ *
+ * When the Q-bit-include option is set, the first payload byte is consumed
+ * as the Q bit and is not sent as data.
+ *
+ * Returns len on success or a negative errno.
+ */
+static int rose_sendmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	struct sockaddr_rose *usrose = (struct sockaddr_rose *)msg->msg_name;
+	int err;
+	struct full_sockaddr_rose srose;
+	struct sk_buff *skb;
+	unsigned char *asmptr;
+	int n, size, qbit = 0;
+
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	if (sock_flag(sk, SOCK_ZAPPED))
+		return -EADDRNOTAVAIL;
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		send_sig(SIGPIPE, current, 0);
+		return -EPIPE;
+	}
+
+	if (rose->neighbour == NULL || rose->device == NULL)
+		return -ENETUNREACH;
+
+	if (usrose != NULL) {
+		if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose))
+			return -EINVAL;
+		/* Zero first so a short sockaddr_rose leaves no stale digis */
+		memset(&srose, 0, sizeof(struct full_sockaddr_rose));
+		memcpy(&srose, usrose, msg->msg_namelen);
+		if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 ||
+		    ax25cmp(&rose->dest_call, &srose.srose_call) != 0)
+			return -EISCONN;
+		if (srose.srose_ndigis != rose->dest_ndigis)
+			return -EISCONN;
+		/* The digipeater counts are equal from here on */
+		for (n = 0 ; n < srose.srose_ndigis ; n++)
+			if (ax25cmp(&rose->dest_digis[n],
+				    &srose.srose_digis[n]))
+				return -EISCONN;
+		if (srose.srose_family != AF_ROSE)
+			return -EINVAL;
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -ENOTCONN;
+
+		srose.srose_family = AF_ROSE;
+		srose.srose_addr   = rose->dest_addr;
+		srose.srose_call   = rose->dest_call;
+		srose.srose_ndigis = rose->dest_ndigis;
+		for (n = 0 ; n < rose->dest_ndigis ; n++)
+			srose.srose_digis[n] = rose->dest_digis[n];
+	}
+
+	/* Build a packet */
+	/* Sanity check the packet size */
+	if (len > 65535)
+		return -EMSGSIZE;
+
+	size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN;
+
+	if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL)
+		return err;
+
+	skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN);
+
+	/*
+	 *	Put the data on the end
+	 */
+
+	skb_reset_transport_header(skb);
+	skb_put(skb, len);
+
+	err = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	if (err) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	/*
+	 *	If the Q BIT Include socket option is in force, the first
+	 *	byte of the user data is the logical value of the Q Bit.
+	 *	An empty message carries no such byte, so there is nothing
+	 *	to read or strip and the Q bit stays clear.
+	 */
+	if (rose->qbitincl && len > 0) {
+		qbit = skb->data[0];
+		skb_pull(skb, 1);
+	}
+
+	/*
+	 *	Push down the ROSE header
+	 */
+	asmptr = skb_push(skb, ROSE_MIN_LEN);
+
+	/* Build a ROSE Network header */
+	asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI;
+	asmptr[1] = (rose->lci >> 0) & 0xFF;
+	asmptr[2] = ROSE_DATA;
+
+	if (qbit)
+		asmptr[0] |= ROSE_Q_BIT;
+
+	/* Re-check the state now that allocation and copy-in are done */
+	if (sk->sk_state != TCP_ESTABLISHED) {
+		kfree_skb(skb);
+		return -ENOTCONN;
+	}
+
+#ifdef M_BIT
+#define ROSE_PACLEN (256-ROSE_MIN_LEN)
+	if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) {
+		unsigned char header[ROSE_MIN_LEN];
+		struct sk_buff *skbn;
+		int frontlen;
+		int lg;
+
+		/* Save a copy of the Header */
+		skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN);
+		skb_pull(skb, ROSE_MIN_LEN);
+
+		frontlen = skb_headroom(skb);
+
+		while (skb->len > 0) {
+			if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) {
+				kfree_skb(skb);
+				return err;
+			}
+
+			skbn->sk   = sk;
+			skbn->free = 1;
+			skbn->arp  = 1;
+
+			skb_reserve(skbn, frontlen);
+
+			lg = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN;
+
+			/* Copy the user data */
+			skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg);
+			skb_pull(skb, lg);
+
+			/* Duplicate the Header */
+			skb_push(skbn, ROSE_MIN_LEN);
+			skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN);
+
+			if (skb->len > 0)
+				skbn->data[2] |= M_BIT;
+
+			skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */
+		}
+
+		skb->free = 1;
+		kfree_skb(skb);
+	} else {
+		skb_queue_tail(&sk->sk_write_queue, skb);		/* Throw it on the queue */
+	}
+#else
+	skb_queue_tail(&sk->sk_write_queue, skb);	/* Shove it onto the queue */
+#endif
+
+	rose_kick(sk);
+
+	return len;
+}
+
+
+/*
+ * Receive one queued ROSE data frame.
+ *
+ * The ROSE header is stripped; when the Q-bit-include option is set a
+ * single byte holding the Q bit (0 or 1) is put in front of the payload.
+ * If the caller's buffer is shorter than the frame, the data is truncated
+ * and MSG_TRUNC is set in msg->msg_flags.
+ *
+ * When msg->msg_name is supplied the peer address is written to it and
+ * msg->msg_namelen is set to the size of the structure actually filled in
+ * (full_sockaddr_rose if the caller left room for it, else sockaddr_rose).
+ *
+ * Returns the number of bytes copied or a negative errno.
+ */
+static int rose_recvmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	struct sockaddr_rose *srose = (struct sockaddr_rose *)msg->msg_name;
+	size_t copied;
+	unsigned char *asmptr;
+	struct sk_buff *skb;
+	int n, er, qbit;
+
+	/*
+	 * This works for seqpacket too. The receiver has ordered the queue for
+	 * us! We do one quick check first though
+	 */
+	if (sk->sk_state != TCP_ESTABLISHED)
+		return -ENOTCONN;
+
+	/* Now we can treat all alike */
+	if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL)
+		return er;
+
+	qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT;
+
+	skb_pull(skb, ROSE_MIN_LEN);
+
+	if (rose->qbitincl) {
+		asmptr  = skb_push(skb, 1);
+		*asmptr = qbit;
+	}
+
+	skb_reset_transport_header(skb);
+	copied     = skb->len;
+
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Do not report success if the payload never reached the caller */
+	er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (er) {
+		skb_free_datagram(sk, skb);
+		return er;
+	}
+
+	if (srose != NULL) {
+		/*
+		 * Zero the whole structure that is reported back before
+		 * filling it in, so that padding and unused digipeater
+		 * slots never carry stale kernel memory to user space.
+		 */
+		if (msg->msg_namelen >= sizeof(struct full_sockaddr_rose)) {
+			struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)msg->msg_name;
+
+			memset(full_srose, 0, sizeof(struct full_sockaddr_rose));
+			full_srose->srose_family = AF_ROSE;
+			full_srose->srose_addr   = rose->dest_addr;
+			full_srose->srose_call   = rose->dest_call;
+			full_srose->srose_ndigis = rose->dest_ndigis;
+			for (n = 0 ; n < rose->dest_ndigis ; n++)
+				full_srose->srose_digis[n] = rose->dest_digis[n];
+			msg->msg_namelen = sizeof(struct full_sockaddr_rose);
+		} else {
+			memset(srose, 0, sizeof(struct sockaddr_rose));
+			srose->srose_family = AF_ROSE;
+			srose->srose_addr   = rose->dest_addr;
+			srose->srose_call   = rose->dest_call;
+			srose->srose_ndigis = rose->dest_ndigis;
+			/* The short form has room for one digipeater only */
+			if (rose->dest_ndigis >= 1) {
+				srose->srose_ndigis = 1;
+				srose->srose_digi = rose->dest_digis[0];
+			}
+			msg->msg_namelen = sizeof(struct sockaddr_rose);
+		}
+	}
+
+	skb_free_datagram(sk, skb);
+
+	return copied;
+}
+
+
+/*
+ * ioctl handler for ROSE sockets.
+ *
+ * Covers queue size queries, receive timestamps, ROSE routing requests,
+ * the cause/diagnostic pair, the global layer 2 callsign and acceptance of
+ * a deferred incoming call.  Interface address ioctls are rejected with
+ * -EINVAL; anything not listed returns -ENOIOCTLCMD.
+ */
+static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct rose_sock *rose = rose_sk(sk);
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case TIOCOUTQ: {
+		long amount;
+
+		/* Send buffer space not yet consumed, clamped at zero */
+		amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+		if (amount < 0)
+			amount = 0;
+		return put_user(amount, (unsigned int __user *) argp);
+	}
+
+	case TIOCINQ: {
+		struct sk_buff *skb;
+		long amount = 0L;
+		/* These two are safe on a single CPU system as only user tasks fiddle here */
+		if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+			amount = skb->len;
+		return put_user(amount, (unsigned int __user *) argp);
+	}
+
+	case SIOCGSTAMP:
+		return sock_get_timestamp(sk, (struct timeval __user *) argp);
+
+	case SIOCGSTAMPNS:
+		return sock_get_timestampns(sk, (struct timespec __user *) argp);
+
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+		return -EINVAL;
+
+	case SIOCADDRT:
+	case SIOCDELRT:
+	case SIOCRSCLRRT:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		return rose_rt_ioctl(cmd, argp);
+
+	case SIOCRSGCAUSE: {
+		struct rose_cause_struct rose_cause;
+		rose_cause.cause      = rose->cause;
+		rose_cause.diagnostic = rose->diagnostic;
+		return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0;
+	}
+
+	case SIOCRSSCAUSE: {
+		struct rose_cause_struct rose_cause;
+		if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct)))
+			return -EFAULT;
+		rose->cause      = rose_cause.cause;
+		rose->diagnostic = rose_cause.diagnostic;
+		return 0;
+	}
+
+	case SIOCRSSL2CALL: {
+		ax25_address new_callsign;
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		/*
+		 * Fetch the new callsign before touching any state, so a
+		 * faulting user pointer leaves both the current callsign
+		 * and its AX.25 listener exactly as they were.
+		 */
+		if (copy_from_user(&new_callsign, argp, sizeof(ax25_address)))
+			return -EFAULT;
+		if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
+			ax25_listen_release(&rose_callsign, NULL);
+		rose_callsign = new_callsign;
+		if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
+			return ax25_listen_register(&rose_callsign, NULL);
+
+		return 0;
+	}
+
+	case SIOCRSGL2CALL:
+		return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? -EFAULT : 0;
+
+	case SIOCRSACCEPT:
+		/* Only meaningful while a deferred call awaits acceptance */
+		if (rose->state == ROSE_STATE_5) {
+			rose_write_internal(sk, ROSE_CALL_ACCEPTED);
+			rose_start_idletimer(sk);
+			rose->condition = 0x00;
+			rose->vs        = 0;
+			rose->va        = 0;
+			rose->vr        = 0;
+			rose->vl        = 0;
+			rose->state     = ROSE_STATE_3;
+		}
+		return 0;
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file start callback for the ROSE socket listing: takes
+ * rose_list_lock with bottom halves disabled and hands back whatever
+ * seq_hlist_start_head() yields for position *pos in rose_list.
+ * The lock is dropped in rose_info_stop().
+ */
+static void *rose_info_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rose_list_lock)
+{
+	spin_lock_bh(&rose_list_lock);
+	return seq_hlist_start_head(&rose_list, *pos);
+}
+
+/* seq_file next callback: advances from entry v through rose_list. */
+static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &rose_list, pos);
+}
+
+/* seq_file stop callback: releases the lock taken in rose_info_start(). */
+static void rose_info_stop(struct seq_file *seq, void *v)
+	__releases(rose_list_lock)
+{
+	spin_unlock_bh(&rose_list_lock);
+}
+
+/*
+ * seq_file show callback: prints the column header for the start token,
+ * otherwise one line describing the ROSE socket behind v.  Always
+ * returns 0.
+ */
+static int rose_info_show(struct seq_file *seq, void *v)
+{
+	char callbuf[11], addrbuf[11];
+	struct sock *s;
+	struct rose_sock *rose;
+	const struct net_device *dev;
+	const char *devname, *callsign;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "dest_addr  dest_call src_addr   src_call  dev   lci neigh st vs vr va   t  t1  t2  t3  hb    idle Snd-Q Rcv-Q inode\n");
+		return 0;
+	}
+
+	s    = sk_entry(v);
+	rose = rose_sk(s);
+	dev  = rose->device;
+
+	/* A socket without a device gets a placeholder name */
+	devname = dev ? dev->name : "???";
+
+	seq_printf(seq, "%-10s %-9s ",
+		   rose2asc(addrbuf, &rose->dest_addr),
+		   ax2asc(callbuf, &rose->dest_call));
+
+	/* callbuf is free again: the destination call was already printed */
+	if (ax25cmp(&rose->source_call, &null_ax25_address) != 0)
+		callsign = ax2asc(callbuf, &rose->source_call);
+	else
+		callsign = "??????-?";
+
+	seq_printf(seq,
+		   "%-10s %-9s %-5s %3.3X %05d  %d  %d  %d  %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n",
+		   rose2asc(addrbuf, &rose->source_addr),
+		   callsign,
+		   devname,
+		   rose->lci & 0x0FFF,
+		   (rose->neighbour) ? rose->neighbour->number : 0,
+		   rose->state,
+		   rose->vs,
+		   rose->vr,
+		   rose->va,
+		   ax25_display_timer(&rose->timer) / HZ,
+		   rose->t1 / HZ,
+		   rose->t2 / HZ,
+		   rose->t3 / HZ,
+		   rose->hb / HZ,
+		   ax25_display_timer(&rose->idletimer) / (60 * HZ),
+		   rose->idle / (60 * HZ),
+		   sk_wmem_alloc_get(s),
+		   sk_rmem_alloc_get(s),
+		   s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L);
+
+	return 0;
+}
+
+/* Iterator callbacks used by rose_info_open() for the socket listing. */
+static const struct seq_operations rose_info_seqops = {
+	.start = rose_info_start,
+	.next = rose_info_next,
+	.stop = rose_info_stop,
+	.show = rose_info_show,
+};
+
+/* Open handler: attaches the rose_info_seqops iterator to the file. */
+static int rose_info_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rose_info_seqops);
+}
+
+/* File operations registered for the "rose" proc entry in rose_proto_init(). */
+static const struct file_operations rose_info_fops = {
+	.owner = THIS_MODULE,
+	.open = rose_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+#endif	/* CONFIG_PROC_FS */
+
+/* Socket family descriptor passed to sock_register() in rose_proto_init(). */
+static const struct net_proto_family rose_family_ops = {
+	.family		=	PF_ROSE,
+	.create		=	rose_create,
+	.owner		=	THIS_MODULE,
+};
+
+/*
+ * Socket operations for PF_ROSE sockets.  socketpair, shutdown, mmap and
+ * sendpage are wired to the generic sock_no_*() handlers; poll uses the
+ * generic datagram_poll().
+ */
+static const struct proto_ops rose_proto_ops = {
+	.family		=	PF_ROSE,
+	.owner		=	THIS_MODULE,
+	.release	=	rose_release,
+	.bind		=	rose_bind,
+	.connect	=	rose_connect,
+	.socketpair	=	sock_no_socketpair,
+	.accept		=	rose_accept,
+	.getname	=	rose_getname,
+	.poll		=	datagram_poll,
+	.ioctl		=	rose_ioctl,
+	.listen		=	rose_listen,
+	.shutdown	=	sock_no_shutdown,
+	.setsockopt	=	rose_setsockopt,
+	.getsockopt	=	rose_getsockopt,
+	.sendmsg	=	rose_sendmsg,
+	.recvmsg	=	rose_recvmsg,
+	.mmap		=	sock_no_mmap,
+	.sendpage	=	sock_no_sendpage,
+};
+
+/* Netdevice event hook, registered in rose_proto_init(). */
+static struct notifier_block rose_dev_notifier = {
+	.notifier_call	=	rose_device_event,
+};
+
+/* Array of rose_ndevs device pointers, allocated in rose_proto_init(). */
+static struct net_device **dev_rose;
+
+/* AX.25 protocol ID handler: AX25_P_ROSE frames go to rose_route_frame(). */
+static struct ax25_protocol rose_pid = {
+	.pid	= AX25_P_ROSE,
+	.func	= rose_route_frame
+};
+
+/* AX.25 link failure hook, registered in rose_proto_init(). */
+static struct ax25_linkfail rose_linkfail_notifier = {
+	.func	= rose_link_failed
+};
+
+/*
+ * Module initialisation.
+ *
+ * Registers the ROSE protocol, allocates and registers rose_ndevs network
+ * devices named "rose0", "rose1", ..., then hooks up the socket family,
+ * the netdevice notifier, the AX.25 protocol and link failure callbacks,
+ * sysctls, loopback support and the /proc/net entries.
+ *
+ * Returns 0 on success or a negative errno; on failure everything that
+ * was set up before the failing step is undone.
+ */
+static int __init rose_proto_init(void)
+{
+	int i;
+	int rc;
+
+	/* Reject counts whose pointer array size would not fit in an int */
+	if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) {
+		printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = proto_register(&rose_proto, 0);
+	if (rc != 0)
+		goto out;
+
+	rose_callsign = null_ax25_address;
+
+	dev_rose = kzalloc(rose_ndevs * sizeof(struct net_device *), GFP_KERNEL);
+	if (dev_rose == NULL) {
+		printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n");
+		rc = -ENOMEM;
+		goto out_proto_unregister;
+	}
+
+	for (i = 0; i < rose_ndevs; i++) {
+		struct net_device *dev;
+		char name[IFNAMSIZ];
+
+		sprintf(name, "rose%d", i);
+		dev = alloc_netdev(0, name, rose_setup);
+		if (!dev) {
+			printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n");
+			rc = -ENOMEM;
+			goto fail;
+		}
+		rc = register_netdev(dev);
+		if (rc) {
+			printk(KERN_ERR "ROSE: netdevice registration failed\n");
+			/* Never stored in dev_rose[], so free it here */
+			free_netdev(dev);
+			goto fail;
+		}
+		rose_set_lockdep_key(dev);
+		dev_rose[i] = dev;
+	}
+
+	sock_register(&rose_family_ops);
+	register_netdevice_notifier(&rose_dev_notifier);
+
+	ax25_register_pid(&rose_pid);
+	ax25_linkfail_register(&rose_linkfail_notifier);
+
+#ifdef CONFIG_SYSCTL
+	rose_register_sysctl();
+#endif
+	rose_loopback_init();
+
+	rose_add_loopback_neigh();
+
+	proc_net_fops_create(&init_net, "rose", S_IRUGO, &rose_info_fops);
+	proc_net_fops_create(&init_net, "rose_neigh", S_IRUGO, &rose_neigh_fops);
+	proc_net_fops_create(&init_net, "rose_nodes", S_IRUGO, &rose_nodes_fops);
+	proc_net_fops_create(&init_net, "rose_routes", S_IRUGO, &rose_routes_fops);
+out:
+	return rc;
+fail:
+	/* Tear down devices 0 .. i-1, the ones that were fully registered */
+	while (--i >= 0) {
+		unregister_netdev(dev_rose[i]);
+		free_netdev(dev_rose[i]);
+	}
+	kfree(dev_rose);
+out_proto_unregister:
+	proto_unregister(&rose_proto);
+	goto out;
+}
+module_init(rose_proto_init);
+
+/* rose_ndevs: how many roseN devices rose_proto_init() creates. */
+module_param(rose_ndevs, int, 0);
+MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices");
+
+MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_ROSE);
+
+/*
+ * Module unload: removes the /proc/net entries, clears loopback and
+ * routing state, detaches from AX.25, drops the layer 2 listener if a
+ * callsign is set, unregisters sysctls, the netdevice notifier and the
+ * socket family, then destroys every device in dev_rose[] and finally
+ * unregisters the protocol.
+ */
+static void __exit rose_exit(void)
+{
+	int idx;
+
+	proc_net_remove(&init_net, "rose");
+	proc_net_remove(&init_net, "rose_neigh");
+	proc_net_remove(&init_net, "rose_nodes");
+	proc_net_remove(&init_net, "rose_routes");
+	rose_loopback_clear();
+
+	rose_rt_free();
+
+	ax25_protocol_release(AX25_P_ROSE);
+	ax25_linkfail_release(&rose_linkfail_notifier);
+
+	/* A listener only exists while a non-null callsign is configured */
+	if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
+		ax25_listen_release(&rose_callsign, NULL);
+
+#ifdef CONFIG_SYSCTL
+	rose_unregister_sysctl();
+#endif
+	unregister_netdevice_notifier(&rose_dev_notifier);
+
+	sock_unregister(PF_ROSE);
+
+	for (idx = 0; idx < rose_ndevs; idx++) {
+		struct net_device *dev = dev_rose[idx];
+
+		if (dev == NULL)
+			continue;
+
+		unregister_netdev(dev);
+		free_netdev(dev);
+	}
+
+	kfree(dev_rose);
+	proto_unregister(&rose_proto);
+}
+
+module_exit(rose_exit);
diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c
new file mode 100644
index 00000000..2679507a
--- /dev/null
+++ b/net/rose/rose_dev.c
@@ -0,0 +1,172 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/slab.h>
+
+#include <asm/system.h>
+#include <asm/io.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/arp.h>
+
+#include <net/ax25.h>
+#include <net/rose.h>
+
+/*
+ * header_ops create callback: prepends ROSE_MIN_LEN + 2 bytes to the skb
+ * and fills in the five header bytes below.  Returns 37 when a
+ * destination address was supplied and -37 otherwise.
+ */
+static int rose_header(struct sk_buff *skb, struct net_device *dev,
+		       unsigned short type,
+		       const void *daddr, const void *saddr, unsigned len)
+{
+	unsigned char *hdr = skb_push(skb, ROSE_MIN_LEN + 2);
+
+	hdr[0] = ROSE_GFI | ROSE_Q_BIT;
+	hdr[1] = 0x00;
+	hdr[2] = ROSE_DATA;
+	hdr[3] = 0x7F;
+	hdr[4] = AX25_P_IP;
+
+	return (daddr != NULL) ? 37 : -37;
+}
+
+/*
+ * header_ops rebuild callback.  With CONFIG_INET it asks arp_find() about
+ * the address at offset 7 of the frame; if that returns non-zero nothing
+ * more is done here.  Otherwise a clone of the frame is handed to
+ * rose_route_frame() and the device transmit counters are updated.
+ * Always returns 1.
+ */
+static int rose_rebuild_header(struct sk_buff *skb)
+{
+#ifdef CONFIG_INET
+	/* Taken up front: skb is released before the counters are touched */
+	struct net_device_stats *counters = &skb->dev->stats;
+	struct sk_buff *copy;
+	unsigned int nbytes;
+
+	if (arp_find((unsigned char *)skb->data + 7, skb))
+		return 1;
+
+	copy = skb_clone(skb, GFP_ATOMIC);
+	if (copy == NULL) {
+		kfree_skb(skb);
+		return 1;
+	}
+
+	/* Keep the clone charged to the sending socket, if there is one */
+	if (skb->sk != NULL)
+		skb_set_owner_w(copy, skb->sk);
+
+	kfree_skb(skb);
+
+	nbytes = copy->len;
+
+	if (rose_route_frame(copy, NULL)) {
+		counters->tx_packets++;
+		counters->tx_bytes += nbytes;
+	} else {
+		kfree_skb(copy);
+		counters->tx_errors++;
+	}
+#endif
+	return 1;
+}
+
+/*
+ * ndo_set_mac_address: installs a new ROSE address on the device.
+ * If the interface is up, the new address is added as a loopback node
+ * first (its error, if any, is returned) and only then is the old node
+ * removed.  Setting the address already in use is a no-op.
+ */
+static int rose_set_mac_address(struct net_device *dev, void *addr)
+{
+	struct sockaddr *new_addr = addr;
+
+	if (memcmp(dev->dev_addr, new_addr->sa_data, dev->addr_len) == 0)
+		return 0;
+
+	if (dev->flags & IFF_UP) {
+		int err = rose_add_loopback_node((rose_address *)new_addr->sa_data);
+
+		if (err)
+			return err;
+
+		rose_del_loopback_node((rose_address *)dev->dev_addr);
+	}
+
+	memcpy(dev->dev_addr, new_addr->sa_data, dev->addr_len);
+
+	return 0;
+}
+
+/*
+ * ndo_open: registers the device address as a loopback node and, if that
+ * worked, starts the transmit queue.  Returns 0 or the error from
+ * rose_add_loopback_node().
+ */
+static int rose_open(struct net_device *dev)
+{
+	int err = rose_add_loopback_node((rose_address *)dev->dev_addr);
+
+	if (err == 0)
+		netif_start_queue(dev);
+
+	return err;
+}
+
+/*
+ * ndo_stop: stops the transmit queue and removes the loopback node that
+ * rose_open() added for the device address.  Always returns 0.
+ */
+static int rose_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	rose_del_loopback_node((rose_address *)dev->dev_addr);
+	return 0;
+}
+
+/*
+ * ndo_start_xmit: on a running interface every frame that gets here is
+ * dropped and counted as a transmit error.  On an interface that is not
+ * running, an error is logged and NETDEV_TX_BUSY is returned with the
+ * skb untouched.
+ */
+static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	if (netif_running(dev)) {
+		dev_kfree_skb(skb);
+		dev->stats.tx_errors++;
+		return NETDEV_TX_OK;
+	}
+
+	printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n");
+	return NETDEV_TX_BUSY;
+}
+
+/* Link-layer header callbacks installed on every ROSE device. */
+static const struct header_ops rose_header_ops = {
+	.create	= rose_header,
+	.rebuild= rose_rebuild_header,
+};
+
+/* Netdevice callbacks installed on every ROSE device by rose_setup(). */
+static const struct net_device_ops rose_netdev_ops = {
+	.ndo_open		= rose_open,
+	.ndo_stop		= rose_close,
+	.ndo_start_xmit		= rose_xmit,
+	.ndo_set_mac_address    = rose_set_mac_address,
+};
+
+/*
+ * Initialise a freshly allocated ROSE net_device: MTU, callback tables,
+ * header and address lengths, ARP hardware type and interface flags.
+ */
+void rose_setup(struct net_device *dev)
+{
+	dev->mtu		= ROSE_MAX_PACKET_SIZE - 2;
+	dev->netdev_ops		= &rose_netdev_ops;
+
+	dev->header_ops		= &rose_header_ops;
+	/* Same headroom that the ROSE send paths reserve in front of data */
+	dev->hard_header_len	= AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN;
+	dev->addr_len		= ROSE_ADDR_LEN;
+	dev->type		= ARPHRD_ROSE;
+
+	/* New-style flags. */
+	dev->flags		= IFF_NOARP;
+}
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
new file mode 100644
index 00000000..7f7fcb46
--- /dev/null
+++ b/net/rose/rose_in.c
@@ -0,0 +1,295 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ *
+ * Most of this code is based on the SDL diagrams published in the 7th ARRL
+ * Computer Networking Conference papers. The diagrams have mistakes in them,
+ * but are mostly correct. Before you modify the code could you read the SDL
+ * diagrams as the code is not obvious and probably very easy to break.
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+/*
+ * State machine for state 1, Awaiting Call Accepted State.
+ * The handling of the timer(s) is in file rose_timer.c.
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	if (frametype == ROSE_CALL_ACCEPTED) {
+		/* Peer accepted the call: enter the connected state */
+		rose_stop_timer(sk);
+		rose_start_idletimer(sk);
+		rose->condition = 0x00;
+		rose->vs        = 0;
+		rose->va        = 0;
+		rose->vr        = 0;
+		rose->vl        = 0;
+		rose->state     = ROSE_STATE_3;
+		sk->sk_state	= TCP_ESTABLISHED;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_state_change(sk);
+	} else if (frametype == ROSE_CLEAR_REQUEST) {
+		/* Call refused: confirm, then pass on cause and diagnostic */
+		rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+		rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
+		rose->neighbour->use--;
+	}
+
+	/* Every other frame type is ignored; nothing is ever queued here */
+	return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Clear Confirmation State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	if (frametype == ROSE_CLEAR_REQUEST) {
+		/* Both ends are clearing: confirm and record the peer's reason */
+		rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+		rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+		rose->neighbour->use--;
+	} else if (frametype == ROSE_CLEAR_CONFIRMATION) {
+		/* Our clear request was confirmed; no cause or diagnostic */
+		rose_disconnect(sk, 0, -1, -1);
+		rose->neighbour->use--;
+	}
+
+	/* Every other frame type is ignored; nothing is ever queued here */
+	return 0;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+/*
+ * Send a Reset Request, zero the sequence state and move the socket to
+ * state 4 with the T2 timer running and the idle timer stopped.
+ */
+static void rose_state3_start_reset(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	rose_write_internal(sk, ROSE_RESET_REQUEST);
+	rose->condition = 0x00;
+	rose->vs        = 0;
+	rose->vr        = 0;
+	rose->va        = 0;
+	rose->vl        = 0;
+	rose->state     = ROSE_STATE_4;
+	rose_start_t2timer(sk);
+	rose_stop_idletimer(sk);
+}
+
+/* Returns 1 when skb was queued on the socket, 0 otherwise. */
+static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
+{
+	struct rose_sock *rose = rose_sk(sk);
+	int queued = 0;
+
+	switch (frametype) {
+	case ROSE_RESET_REQUEST:
+		/* Peer resets the circuit: confirm, restart numbering, resend */
+		rose_stop_timer(sk);
+		rose_start_idletimer(sk);
+		rose_write_internal(sk, ROSE_RESET_CONFIRMATION);
+		rose->condition = 0x00;
+		rose->vs        = 0;
+		rose->vr        = 0;
+		rose->va        = 0;
+		rose->vl        = 0;
+		rose_requeue_frames(sk);
+		break;
+
+	case ROSE_CLEAR_REQUEST:
+		rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+		rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+		rose->neighbour->use--;
+		break;
+
+	case ROSE_RR:
+	case ROSE_RNR:
+		if (!rose_validate_nr(sk, nr)) {
+			rose_state3_start_reset(sk);
+			break;
+		}
+		rose_frames_acked(sk, nr);
+		if (frametype == ROSE_RNR)
+			rose->condition |= ROSE_COND_PEER_RX_BUSY;
+		else
+			rose->condition &= ~ROSE_COND_PEER_RX_BUSY;
+		break;
+
+	case ROSE_DATA:	/* XXX */
+		rose->condition &= ~ROSE_COND_PEER_RX_BUSY;
+		if (!rose_validate_nr(sk, nr)) {
+			rose_state3_start_reset(sk);
+			break;
+		}
+		rose_frames_acked(sk, nr);
+		if (ns == rose->vr) {
+			rose_start_idletimer(sk);
+			if (sock_queue_rcv_skb(sk, skb) != 0) {
+				/* Should never happen ! */
+				rose_state3_start_reset(sk);
+				break;
+			}
+			rose->vr = (rose->vr + 1) % ROSE_MODULUS;
+			queued = 1;
+			/* Flag ourselves busy once over half the buffer is used */
+			if (atomic_read(&sk->sk_rmem_alloc) >
+			    (sk->sk_rcvbuf >> 1))
+				rose->condition |= ROSE_COND_OWN_RX_BUSY;
+		}
+		/*
+		 * If the window is full, ack the frame, else start the
+		 * acknowledge hold back timer.
+		 */
+		if (((rose->vl + sysctl_rose_window_size) % ROSE_MODULUS) == rose->vr) {
+			rose->condition &= ~ROSE_COND_ACK_PENDING;
+			rose_stop_timer(sk);
+			rose_enquiry_response(sk);
+		} else {
+			rose->condition |= ROSE_COND_ACK_PENDING;
+			rose_start_hbtimer(sk);
+		}
+		break;
+
+	default:
+		printk(KERN_WARNING "ROSE: unknown %02X in state 3\n", frametype);
+		break;
+	}
+
+	return queued;
+}
+
+/*
+ * State machine for state 4, Awaiting Reset Confirmation State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	/* A colliding Reset Request is confirmed, then handled like a confirmation */
+	if (frametype == ROSE_RESET_REQUEST)
+		rose_write_internal(sk, ROSE_RESET_CONFIRMATION);
+
+	if (frametype == ROSE_RESET_REQUEST ||
+	    frametype == ROSE_RESET_CONFIRMATION) {
+		/* Reset complete: back to the connected state, resend data */
+		rose_stop_timer(sk);
+		rose_start_idletimer(sk);
+		rose->condition = 0x00;
+		rose->va        = 0;
+		rose->vr        = 0;
+		rose->vs        = 0;
+		rose->vl        = 0;
+		rose->state     = ROSE_STATE_3;
+		rose_requeue_frames(sk);
+	} else if (frametype == ROSE_CLEAR_REQUEST) {
+		rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+		rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+		rose->neighbour->use--;
+	}
+
+	/* Every other frame type is ignored; nothing is ever queued here */
+	return 0;
+}
+
+/*
+ * State machine for state 5, Awaiting Call Acceptance State.
+ * The handling of the timer(s) is in file rose_timer.c
+ * Handling of state 0 and connection release is in af_rose.c.
+ */
+static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	/* Only a Clear Request is acted on while acceptance is pending */
+	if (frametype != ROSE_CLEAR_REQUEST)
+		return 0;
+
+	rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION);
+	rose_disconnect(sk, 0, skb->data[3], skb->data[4]);
+	rose_sk(sk)->neighbour->use--;
+
+	return 0;
+}
+
+/* Higher level upcall for a LAPB frame */
+int rose_process_rx_frame(struct sock *sk, struct sk_buff *skb)
+{
+	struct rose_sock *rose = rose_sk(sk);
+	int frametype, ns, nr, q, d, m;
+	int taken = 0;
+
+	/* State 0 sockets take no frames here */
+	if (rose->state == ROSE_STATE_0)
+		return 0;
+
+	frametype = rose_decode(skb, &ns, &nr, &q, &d, &m);
+
+	/* Dispatch to the handler for the current state */
+	switch (rose->state) {
+	case ROSE_STATE_1:
+		taken = rose_state1_machine(sk, skb, frametype);
+		break;
+	case ROSE_STATE_2:
+		taken = rose_state2_machine(sk, skb, frametype);
+		break;
+	case ROSE_STATE_3:
+		taken = rose_state3_machine(sk, skb, frametype, ns, nr, q, d, m);
+		break;
+	case ROSE_STATE_4:
+		taken = rose_state4_machine(sk, skb, frametype);
+		break;
+	case ROSE_STATE_5:
+		taken = rose_state5_machine(sk, skb, frametype);
+		break;
+	}
+
+	rose_kick(sk);
+
+	/* Non-zero means a state machine queued the skb on the socket */
+	return taken;
+}
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
new file mode 100644
index 00000000..fa5f5641
--- /dev/null
+++ b/net/rose/rose_link.c
@@ -0,0 +1,299 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/netfilter.h>
+#include <net/rose.h>
+
+static void rose_ftimer_expiry(unsigned long);
+static void rose_t0timer_expiry(unsigned long);
+
+static void rose_transmit_restart_confirmation(struct rose_neigh *neigh);
+static void rose_transmit_restart_request(struct rose_neigh *neigh);
+
+/*
+ * (Re)arm the neighbour's ftimer to expire after
+ * sysctl_rose_link_fail_timeout milliseconds.
+ */
+void rose_start_ftimer(struct rose_neigh *neigh)
+{
+	struct timer_list *timer = &neigh->ftimer;
+
+	del_timer(timer);
+
+	timer->function = &rose_ftimer_expiry;
+	timer->data     = (unsigned long)neigh;
+	timer->expires  = jiffies +
+			  msecs_to_jiffies(sysctl_rose_link_fail_timeout);
+
+	add_timer(timer);
+}
+
+/*
+ * (Re)arm the neighbour's t0timer to expire after
+ * sysctl_rose_restart_request_timeout milliseconds.
+ */
+static void rose_start_t0timer(struct rose_neigh *neigh)
+{
+	struct timer_list *timer = &neigh->t0timer;
+
+	del_timer(timer);
+
+	timer->function = &rose_t0timer_expiry;
+	timer->data     = (unsigned long)neigh;
+	timer->expires  = jiffies +
+			  msecs_to_jiffies(sysctl_rose_restart_request_timeout);
+
+	add_timer(timer);
+}
+
+/* Cancel the neighbour's ftimer if it is pending. */
+void rose_stop_ftimer(struct rose_neigh *neigh)
+{
+	del_timer(&neigh->ftimer);
+}
+
+/* Cancel the neighbour's t0timer if it is pending. */
+void rose_stop_t0timer(struct rose_neigh *neigh)
+{
+	del_timer(&neigh->t0timer);
+}
+
+/* Non-zero while the neighbour's ftimer is pending. */
+int rose_ftimer_running(struct rose_neigh *neigh)
+{
+	return timer_pending(&neigh->ftimer);
+}
+
+/* Non-zero while the neighbour's t0timer is pending. */
+static int rose_t0timer_running(struct rose_neigh *neigh)
+{
+	return timer_pending(&neigh->t0timer);
+}
+
+/*
+ * ftimer handler.  Intentionally empty: expiry itself does nothing, the
+ * timer is only observed through rose_ftimer_running().
+ */
+static void rose_ftimer_expiry(unsigned long param)
+{
+}
+
+/*
+ * t0timer handler: sends another Restart Request to the neighbour, clears
+ * dce_mode and re-arms the timer, so the request is repeated until
+ * something stops the timer.  param is the struct rose_neigh pointer
+ * stored by rose_start_t0timer().
+ */
+static void rose_t0timer_expiry(unsigned long param)
+{
+	struct rose_neigh *neigh = (struct rose_neigh *)param;
+
+	rose_transmit_restart_request(neigh);
+
+	neigh->dce_mode = 0;
+
+	rose_start_t0timer(neigh);
+}
+
+/*
+ *	Interface to ax25_send_frame. Changes my level 2 callsign depending
+ *	on whether we have a global ROSE callsign or use the default port
+ *	callsign.
+ *
+ *	Returns non-zero when ax25_send_frame() returned a control block,
+ *	zero otherwise.  On zero the skb still belongs to the caller, which
+ *	is expected to free it.
+ */
+static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh)
+{
+	ax25_address *rose_call;
+	ax25_cb *ax25s;
+
+	/*
+	 * A neighbour without a device has no port callsign and nothing to
+	 * transmit on; report failure instead of dereferencing NULL.
+	 */
+	if (neigh->dev == NULL)
+		return 0;
+
+	if (ax25cmp(&rose_callsign, &null_ax25_address) == 0)
+		rose_call = (ax25_address *)neigh->dev->dev_addr;
+	else
+		rose_call = &rose_callsign;
+
+	/* Swap in the new control block, then drop the reference to the old one */
+	ax25s = neigh->ax25;
+	neigh->ax25 = ax25_send_frame(skb, 260, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev);
+	if (ax25s)
+		ax25_cb_put(ax25s);
+
+	return neigh->ax25 != NULL;
+}
+
+/*
+ *	Interface to ax25_find_cb. Looks up the AX.25 control block for the
+ *	link to this neighbour, using the global ROSE callsign when one is
+ *	set and the port's own callsign otherwise. Returns non-zero when a
+ *	control block was found.
+ */
+static int rose_link_up(struct rose_neigh *neigh)
+{
+	ax25_address *rose_call = &rose_callsign;
+	ax25_cb *previous = neigh->ax25;
+
+	if (ax25cmp(&rose_callsign, &null_ax25_address) == 0)
+		rose_call = (ax25_address *)neigh->dev->dev_addr;
+
+	neigh->ax25 = ax25_find_cb(rose_call, &neigh->callsign, neigh->digipeat, neigh->dev);
+
+	/* Release the control block that was cached before the lookup */
+	if (previous)
+		ax25_cb_put(previous);
+
+	return neigh->ax25 != NULL;
+}
+
+/*
+ *	This handles all restart and diagnostic frames.
+ */
+void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigned short frametype)
+{
+	struct sk_buff *pending;
+
+	switch (frametype) {
+	case ROSE_RESTART_REQUEST:
+	case ROSE_RESTART_CONFIRMATION:
+		/* Either frame means the link-level restart is complete */
+		rose_stop_t0timer(neigh);
+		neigh->restarted = 1;
+		if (frametype == ROSE_RESTART_REQUEST) {
+			neigh->dce_mode = (skb->data[3] == ROSE_DTE_ORIGINATED);
+			rose_transmit_restart_confirmation(neigh);
+		}
+		break;
+
+	case ROSE_DIAGNOSTIC:
+		printk(KERN_WARNING "ROSE: received diagnostic #%d - %02X %02X %02X\n", skb->data[3], skb->data[4], skb->data[5], skb->data[6]);
+		break;
+
+	default:
+		printk(KERN_WARNING "ROSE: received unknown %02X with LCI 000\n", frametype);
+		break;
+	}
+
+	if (!neigh->restarted)
+		return;
+
+	/* Link is usable: flush everything held back for this neighbour */
+	while ((pending = skb_dequeue(&neigh->queue)) != NULL) {
+		if (!rose_send_frame(pending, neigh))
+			kfree_skb(pending);
+	}
+}
+
+/*
+ *	This routine is called when a Restart Request is needed
+ */
+static void rose_transmit_restart_request(struct rose_neigh *neigh)
+{
+	const int headroom = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN;
+	const int frame_len = ROSE_MIN_LEN + 3;
+	struct sk_buff *skb;
+	unsigned char *frame;
+
+	/* Silently give up when no memory is available */
+	skb = alloc_skb(headroom + frame_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, headroom);
+
+	frame = skb_put(skb, frame_len);
+
+	frame[0] = AX25_P_ROSE;
+	frame[1] = ROSE_GFI;
+	frame[2] = 0x00;
+	frame[3] = ROSE_RESTART_REQUEST;
+	frame[4] = ROSE_DTE_ORIGINATED;
+	frame[5] = 0;
+
+	if (!rose_send_frame(skb, neigh))
+		kfree_skb(skb);
+}
+
+/*
+ * This routine is called when a Restart Confirmation is needed
+ */
+static void rose_transmit_restart_confirmation(struct rose_neigh *neigh)
+{
+	const int headroom = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN;
+	const int frame_len = ROSE_MIN_LEN + 1;
+	struct sk_buff *skb;
+	unsigned char *frame;
+
+	/* Silently give up when no memory is available */
+	skb = alloc_skb(headroom + frame_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, headroom);
+
+	frame = skb_put(skb, frame_len);
+
+	frame[0] = AX25_P_ROSE;
+	frame[1] = ROSE_GFI;
+	frame[2] = 0x00;
+	frame[3] = ROSE_RESTART_CONFIRMATION;
+
+	if (!rose_send_frame(skb, neigh))
+		kfree_skb(skb);
+}
+
+/*
+ * This routine is called when a Clear Request is needed outside of the context
+ * of a connected socket.
+ */
+void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, unsigned char cause, unsigned char diagnostic)
+{
+	const int headroom = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN;
+	const int frame_len = ROSE_MIN_LEN + 3;
+	struct sk_buff *skb;
+	unsigned char *frame;
+
+	/* Silently give up when no memory is available */
+	skb = alloc_skb(headroom + frame_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, headroom);
+
+	frame = skb_put(skb, frame_len);
+
+	/* The 12-bit LCI is split across the GFI byte and the next one */
+	frame[0] = AX25_P_ROSE;
+	frame[1] = ((lci >> 8) & 0x0F) | ROSE_GFI;
+	frame[2] = ((lci >> 0) & 0xFF);
+	frame[3] = ROSE_CLEAR_REQUEST;
+	frame[4] = cause;
+	frame[5] = diagnostic;
+
+	if (!rose_send_frame(skb, neigh))
+		kfree_skb(skb);
+}
+
+/*
+ * Send a ROSE frame to a neighbour.  Loopback neighbours get the frame
+ * through the loopback queue.  For real links the AX25_P_ROSE protocol
+ * byte is prepended; the frame is sent at once if the link has been
+ * restarted, otherwise it is held on the neighbour queue and a Restart
+ * Request is issued unless one is already outstanding.
+ */
+void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh)
+{
+	unsigned char *pid;
+
+#if 0
+	if (call_fw_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) {
+		kfree_skb(skb);
+		return;
+	}
+#endif
+
+	if (neigh->loopback) {
+		rose_loopback_queue(skb, neigh);
+		return;
+	}
+
+	/* No AX.25 connection found: the restart handshake must be redone */
+	if (!rose_link_up(neigh))
+		neigh->restarted = 0;
+
+	pid  = skb_push(skb, 1);
+	*pid = AX25_P_ROSE;
+
+	if (neigh->restarted) {
+		if (!rose_send_frame(skb, neigh))
+			kfree_skb(skb);
+		return;
+	}
+
+	skb_queue_tail(&neigh->queue, skb);
+
+	/* A running t0timer means a Restart Request is already in flight */
+	if (rose_t0timer_running(neigh))
+		return;
+
+	rose_transmit_restart_request(neigh);
+	neigh->dce_mode = 0;
+	rose_start_t0timer(neigh);
+}
diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c
new file mode 100644
index 00000000..34445620
--- /dev/null
+++ b/net/rose/rose_loopback.c
@@ -0,0 +1,124 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/timer.h>
+#include <net/ax25.h>
+#include <linux/skbuff.h>
+#include <net/rose.h>
+#include <linux/init.h>
+
+static struct sk_buff_head loopback_queue;
+static struct timer_list loopback_timer;
+
+static void rose_set_loopback_timer(void);
+
+void rose_loopback_init(void)
+{
+	/* Prepare the timer and the (initially empty) loopback frame queue. */
+	init_timer(&loopback_timer);
+	skb_queue_head_init(&loopback_queue);
+}
+
+static int rose_loopback_running(void)
+{
+	return timer_pending(&loopback_timer);	/* is loopback_timer pending? */
+}
+
+int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh)
+{
+	/* Queue a clone of skb for the loopback timer; skb itself is always freed. */
+	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);
+
+	kfree_skb(skb);
+
+	/* Clone failed: the frame is dropped, the return value is still 1. */
+	if (copy == NULL)
+		return 1;
+
+	skb_queue_tail(&loopback_queue, copy);
+	if (!rose_loopback_running())
+		rose_set_loopback_timer();
+
+	return 1;
+}
+
+static void rose_loopback_timer(unsigned long);
+
+static void rose_set_loopback_timer(void)
+{
+	/* (Re)arm the loopback timer to expire 10 jiffies from now. */
+	del_timer(&loopback_timer);
+
+	loopback_timer.expires  = jiffies + 10;
+	loopback_timer.function = &rose_loopback_timer;
+	loopback_timer.data     = 0;
+	add_timer(&loopback_timer);
+}
+
+static void rose_loopback_timer(unsigned long param)
+{
+	/* Timer handler: drain loopback_queue and deliver each frame locally. */
+	struct sk_buff *skb;
+	struct net_device *dev;
+	rose_address *dest;
+	struct sock *sk;
+	unsigned short frametype;
+	unsigned int lci_i, lci_o;
+
+	while ((skb = skb_dequeue(&loopback_queue)) != NULL) {
+		if (skb->len < ROSE_MIN_LEN) {
+			kfree_skb(skb);
+			continue;
+		}
+		lci_i     = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
+		frametype = skb->data[2];
+		if (frametype == ROSE_CALL_REQUEST &&
+		    (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF ||
+		     skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] !=
+		     ROSE_CALL_REQ_ADDR_LEN_VAL)) {
+			kfree_skb(skb);
+			continue;
+		}
+		dest      = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF);
+		lci_o     = ROSE_DEFAULT_MAXVC + 1 - lci_i;
+
+		skb_reset_transport_header(skb);
+		sk = rose_find_socket(lci_o, rose_loopback_neigh);
+		if (sk) {
+			if (rose_process_rx_frame(sk, skb) == 0)
+				kfree_skb(skb);
+			continue;
+		}
+
+		/* No socket: only a Call Request for a local ROSE device is kept. */
+		dev = (frametype == ROSE_CALL_REQUEST) ? rose_dev_get(dest) : NULL;
+		if (dev == NULL) {
+			kfree_skb(skb);
+			continue;
+		}
+		/* NOTE(review): on success the rose_dev_get() reference is kept. */
+		if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) {
+			dev_put(dev);	/* rejected: drop the rose_dev_get() reference */
+			kfree_skb(skb);
+		}
+	}
+}
+
+void __exit rose_loopback_clear(void)
+{
+	struct sk_buff *skb;
+
+	/* Exit path: also wait for a handler still running on another CPU. */
+	del_timer_sync(&loopback_timer);
+	while ((skb = skb_dequeue(&loopback_queue)) != NULL) {
+		skb->sk = NULL;	/* clear the socket pointer before freeing */
+		kfree_skb(skb);
+	}
+}
diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c
new file mode 100644
index 00000000..4ebf33af
--- /dev/null
+++ b/net/rose/rose_out.c
@@ -0,0 +1,126 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+/*
+ *	Finish an I-frame by merging the receive (vr) and send (vs) sequence
+ *	numbers into its third header byte, then pass it to the link layer.
+ */
+static void rose_send_iframe(struct sock *sk, struct sk_buff *skb)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	if (skb == NULL)
+		return;
+
+	/* vr is masked into bits 7-5 and vs into bits 3-1 of the byte. */
+	skb->data[2] |= ((rose->vr << 5) & 0xE0) | ((rose->vs << 1) & 0x0E);
+
+	rose_start_idletimer(sk);
+
+	rose_transmit_link(skb, rose->neighbour);
+}
+
+void rose_kick(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+	struct sk_buff *skb, *skbn;
+	unsigned short start, end;
+	/* Send frames from sk_write_queue while the send window allows it. */
+	if (rose->state != ROSE_STATE_3)
+		return;
+
+	if (rose->condition & ROSE_COND_PEER_RX_BUSY)
+		return;
+
+	if (!skb_peek(&sk->sk_write_queue))
+		return;
+
+	start = (skb_peek(&rose->ack_queue) == NULL) ? rose->va : rose->vs;
+	end   = (rose->va + sysctl_rose_window_size) % ROSE_MODULUS;
+	/* end is the first sequence number past the window that starts at va. */
+	if (start == end)
+		return;
+
+	rose->vs = start;
+
+	/*
+	 * Transmit data until either we're out of data to send or
+	 * the window is full.
+	 */
+
+	skb  = skb_dequeue(&sk->sk_write_queue);
+
+	do {
+		if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+			skb_queue_head(&sk->sk_write_queue, skb);	/* put it back, retry later */
+			break;
+		}
+
+		skb_set_owner_w(skbn, sk);
+
+		/*
+		 * Transmit the frame copy.
+		 */
+		rose_send_iframe(sk, skbn);
+
+		rose->vs = (rose->vs + 1) % ROSE_MODULUS;
+
+		/*
+		 * Move the original data frame onto the ack queue.
+		 */
+		skb_queue_tail(&rose->ack_queue, skb);
+
+	} while (rose->vs != end &&
+		 (skb = skb_dequeue(&sk->sk_write_queue)) != NULL);
+	/* Record vr in vl and clear the pending-ack condition. */
+	rose->vl         = rose->vr;
+	rose->condition &= ~ROSE_COND_ACK_PENDING;
+
+	rose_stop_timer(sk);
+}
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void rose_enquiry_response(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+	int busy = rose->condition & ROSE_COND_OWN_RX_BUSY;
+
+	/* Answer with RNR while our own receiver is busy, otherwise with RR. */
+	rose_write_internal(sk, busy ? ROSE_RNR : ROSE_RR);
+
+	/* Record vr in vl and clear the pending-ack condition. */
+	rose->vl         = rose->vr;
+	rose->condition &= ~ROSE_COND_ACK_PENDING;
+
+	rose_stop_timer(sk);
+}
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
new file mode 100644
index 00000000..479cae57
--- /dev/null
+++ b/net/rose/rose_route.c
@@ -0,0 +1,1371 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/arp.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/netfilter.h>
+#include <linux/init.h>
+#include <net/rose.h>
+#include <linux/seq_file.h>
+
+static unsigned int rose_neigh_no = 1;
+
+static struct rose_node  *rose_node_list;
+static DEFINE_SPINLOCK(rose_node_list_lock);
+static struct rose_neigh *rose_neigh_list;
+static DEFINE_SPINLOCK(rose_neigh_list_lock);
+static struct rose_route *rose_route_list;
+static DEFINE_SPINLOCK(rose_route_list_lock);
+
+struct rose_neigh *rose_loopback_neigh;
+
+/*
+ *	Add a new route to a node, and in the process add the node and the
+ *	neighbour if it is new. Returns 0, -EINVAL for a loopback node, or
+ *	-ENOMEM. Takes rose_node_list_lock, then rose_neigh_list_lock.
+ */
+static int __must_check rose_add_node(struct rose_route_struct *rose_route,
+	struct net_device *dev)
+{
+	struct rose_node  *rose_node, *rose_tmpn, *rose_tmpp;
+	struct rose_neigh *rose_neigh;
+	int i, res = 0;
+
+	spin_lock_bh(&rose_node_list_lock);
+	spin_lock_bh(&rose_neigh_list_lock);
+	/* Look for an existing node with the same mask and masked address. */
+	rose_node = rose_node_list;
+	while (rose_node != NULL) {
+		if ((rose_node->mask == rose_route->mask) &&
+		    (rosecmpm(&rose_route->address, &rose_node->address,
+			      rose_route->mask) == 0))
+			break;
+		rose_node = rose_node->next;
+	}
+
+	if (rose_node != NULL && rose_node->loopback) {
+		res = -EINVAL;	/* routes cannot be added to a loopback node */
+		goto out;
+	}
+	/* Look for an existing neighbour with this callsign on this device. */
+	rose_neigh = rose_neigh_list;
+	while (rose_neigh != NULL) {
+		if (ax25cmp(&rose_route->neighbour,
+			    &rose_neigh->callsign) == 0 &&
+		    rose_neigh->dev == dev)
+			break;
+		rose_neigh = rose_neigh->next;
+	}
+
+	if (rose_neigh == NULL) {
+		rose_neigh = kmalloc(sizeof(*rose_neigh), GFP_ATOMIC);
+		if (rose_neigh == NULL) {
+			res = -ENOMEM;
+			goto out;
+		}
+
+		rose_neigh->callsign  = rose_route->neighbour;
+		rose_neigh->digipeat  = NULL;
+		rose_neigh->ax25      = NULL;
+		rose_neigh->dev       = dev;
+		rose_neigh->count     = 0;
+		rose_neigh->use       = 0;
+		rose_neigh->dce_mode  = 0;
+		rose_neigh->loopback  = 0;
+		rose_neigh->number    = rose_neigh_no++;
+		rose_neigh->restarted = 0;
+
+		skb_queue_head_init(&rose_neigh->queue);
+
+		init_timer(&rose_neigh->ftimer);
+		init_timer(&rose_neigh->t0timer);
+		/* Copy the digipeater path, if the request carries one. */
+		if (rose_route->ndigis != 0) {
+			rose_neigh->digipeat =
+				kmalloc(sizeof(ax25_digi), GFP_ATOMIC);
+			if (rose_neigh->digipeat == NULL) {
+				kfree(rose_neigh);	/* not yet on the list */
+				res = -ENOMEM;
+				goto out;
+			}
+
+			rose_neigh->digipeat->ndigi      = rose_route->ndigis;
+			rose_neigh->digipeat->lastrepeat = -1;
+
+			for (i = 0; i < rose_route->ndigis; i++) {
+				rose_neigh->digipeat->calls[i]    =
+					rose_route->digipeaters[i];
+				rose_neigh->digipeat->repeated[i] = 0;
+			}
+		}
+
+		rose_neigh->next = rose_neigh_list;
+		rose_neigh_list  = rose_neigh;
+	}
+
+	/*
+	 * This is a new node to be inserted into the list. Find where it needs
+	 * to be inserted into the list, and insert it. We want to be sure
+	 * to order the list in descending order of mask size to ensure that
+	 * later when we are searching this list the first match will be the
+	 * best match.
+	 */
+	if (rose_node == NULL) {
+		rose_tmpn = rose_node_list;
+		rose_tmpp = NULL;
+
+		while (rose_tmpn != NULL) {
+			if (rose_tmpn->mask > rose_route->mask) {
+				rose_tmpp = rose_tmpn;
+				rose_tmpn = rose_tmpn->next;
+			} else {
+				break;
+			}
+		}
+
+		/* create new node */
+		rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC);
+		if (rose_node == NULL) {
+			res = -ENOMEM;	/* a neighbour created above stays listed */
+			goto out;
+		}
+
+		rose_node->address      = rose_route->address;
+		rose_node->mask         = rose_route->mask;
+		rose_node->count        = 1;
+		rose_node->loopback     = 0;
+		rose_node->neighbour[0] = rose_neigh;
+
+		if (rose_tmpn == NULL) {
+			if (rose_tmpp == NULL) {	/* Empty list */
+				rose_node_list  = rose_node;
+				rose_node->next = NULL;
+			} else {			/* Append after the last node */
+				rose_tmpp->next = rose_node;
+				rose_node->next = NULL;
+			}
+		} else {
+			if (rose_tmpp == NULL) {	/* 1st node */
+				rose_node->next = rose_node_list;
+				rose_node_list  = rose_node;
+			} else {			/* Between rose_tmpp and rose_tmpn */
+				rose_tmpp->next = rose_node;
+				rose_node->next = rose_tmpn;
+			}
+		}
+		rose_neigh->count++;
+
+		goto out;
+	}
+
+	/* Existing node: add the neighbour if one of its 3 slots is free */
+	if (rose_node->count < 3) {
+		rose_node->neighbour[rose_node->count] = rose_neigh;
+		rose_node->count++;
+		rose_neigh->count++;
+	}
+	/* NOTE(review): a full node is left unchanged and 0 is still returned. */
+out:
+	spin_unlock_bh(&rose_neigh_list_lock);
+	spin_unlock_bh(&rose_node_list_lock);
+
+	return res;
+}
+
+/*
+ * Unlink rose_node from rose_node_list and free it. A node that is not on
+ * the list is left untouched.
+ * Caller is holding rose_node_list_lock.
+ */
+static void rose_remove_node(struct rose_node *rose_node)
+{
+	struct rose_node **link = &rose_node_list;
+
+	/*
+	 * Walk the 'next' links themselves (starting with the list head) so
+	 * that removing the first node needs no special case.
+	 */
+	while (*link != NULL) {
+		if (*link == rose_node) {
+			*link = rose_node->next;
+			kfree(rose_node);
+			return;
+		}
+
+		link = &(*link)->next;
+	}
+}
+
+/*
+ * Stop the neighbour's timers, drop its queued frames, then unlink it from
+ * rose_neigh_list and free it.
+ * Caller is holding rose_neigh_list_lock.
+ */
+static void rose_remove_neigh(struct rose_neigh *rose_neigh)
+{
+	struct rose_neigh **link = &rose_neigh_list;
+
+	rose_stop_ftimer(rose_neigh);
+	rose_stop_t0timer(rose_neigh);
+
+	skb_queue_purge(&rose_neigh->queue);
+
+	/*
+	 * Locate the link (the list head or some ->next field) that points
+	 * at this neighbour; *link ends up NULL if it is not on the list.
+	 */
+	while (*link != NULL && *link != rose_neigh)
+		link = &(*link)->next;
+
+	/* Not on the list: nothing is unlinked or freed. */
+	if (*link == NULL)
+		return;
+
+	*link = rose_neigh->next;
+
+	/* Release the AX.25 control block reference, if one is attached. */
+	if (rose_neigh->ax25)
+		ax25_cb_put(rose_neigh->ax25);
+
+	kfree(rose_neigh->digipeat);
+	kfree(rose_neigh);
+}
+
+/*
+ * Drop the route's use counts on its neighbours, then unlink it from
+ * rose_route_list and free it.
+ * Caller is holding rose_route_list_lock.
+ */
+static void rose_remove_route(struct rose_route *rose_route)
+{
+	struct rose_route **link = &rose_route_list;
+
+	if (rose_route->neigh1 != NULL)
+		rose_route->neigh1->use--;
+
+	if (rose_route->neigh2 != NULL)
+		rose_route->neigh2->use--;
+
+	/*
+	 * Walk the 'next' links themselves (starting with the list head) so
+	 * that removing the first route needs no special case.
+	 */
+	while (*link != NULL) {
+		if (*link == rose_route) {
+			*link = rose_route->next;
+			kfree(rose_route);
+			return;
+		}
+
+		link = &(*link)->next;
+	}
+}
+
+/*
+ *	"Delete" a node. Strictly speaking remove a route to a node. The node
+ *	is only deleted if no routes are left to it. Returns 0 or -EINVAL.
+ */
+static int rose_del_node(struct rose_route_struct *rose_route,
+	struct net_device *dev)
+{
+	struct rose_node  *rose_node;
+	struct rose_neigh *rose_neigh;
+	int i, err = 0;
+
+	spin_lock_bh(&rose_node_list_lock);
+	spin_lock_bh(&rose_neigh_list_lock);
+	/* Find the node with the same mask and masked address. */
+	rose_node = rose_node_list;
+	while (rose_node != NULL) {
+		if ((rose_node->mask == rose_route->mask) &&
+		    (rosecmpm(&rose_route->address, &rose_node->address,
+			      rose_route->mask) == 0))
+			break;
+		rose_node = rose_node->next;
+	}
+
+	if (rose_node == NULL || rose_node->loopback) {
+		err = -EINVAL;	/* unknown node, or a loopback node */
+		goto out;
+	}
+	/* Find the neighbour by callsign and device. */
+	rose_neigh = rose_neigh_list;
+	while (rose_neigh != NULL) {
+		if (ax25cmp(&rose_route->neighbour,
+			    &rose_neigh->callsign) == 0 &&
+		    rose_neigh->dev == dev)
+			break;
+		rose_neigh = rose_neigh->next;
+	}
+
+	if (rose_neigh == NULL) {
+		err = -EINVAL;
+		goto out;
+	}
+	/* Remove the first slot of the node that holds this neighbour. */
+	for (i = 0; i < rose_node->count; i++) {
+		if (rose_node->neighbour[i] == rose_neigh) {
+			rose_neigh->count--;
+
+			if (rose_neigh->count == 0 && rose_neigh->use == 0)
+				rose_remove_neigh(rose_neigh);
+
+			rose_node->count--;
+
+			if (rose_node->count == 0) {
+				rose_remove_node(rose_node);
+			} else {
+				switch (i) {	/* close the gap left by slot i */
+				case 0:
+					rose_node->neighbour[0] =
+						rose_node->neighbour[1];	/* falls through */
+				case 1:
+					rose_node->neighbour[1] =
+						rose_node->neighbour[2];	/* falls through */
+				case 2:
+					break;
+				}
+			}
+			goto out;
+		}
+	}
+	err = -EINVAL;	/* the node does not list this neighbour */
+
+out:
+	spin_unlock_bh(&rose_neigh_list_lock);
+	spin_unlock_bh(&rose_node_list_lock);
+
+	return err;
+}
+
+/*
+ *	Create the loopback neighbour and link it into rose_neigh_list.
+ */
+void rose_add_loopback_neigh(void)
+{
+	struct rose_neigh *sn = kmalloc(sizeof(*sn), GFP_KERNEL);
+
+	rose_loopback_neigh = sn;
+	if (sn == NULL)
+		return;
+
+	sn->callsign  = null_ax25_address;
+	sn->digipeat  = NULL;
+	sn->ax25      = NULL;
+	sn->dev       = NULL;
+	sn->count     = 0;
+	sn->use       = 0;
+	sn->dce_mode  = 1;
+	sn->loopback  = 1;
+	sn->number    = rose_neigh_no++;
+	sn->restarted = 1;
+
+	skb_queue_head_init(&sn->queue);
+
+	init_timer(&sn->ftimer);
+	init_timer(&sn->t0timer);
+
+	/* Insert at the head of the neighbour list. */
+	spin_lock_bh(&rose_neigh_list_lock);
+	sn->next        = rose_neigh_list;
+	rose_neigh_list = sn;
+	spin_unlock_bh(&rose_neigh_list_lock);
+}
+
+/*
+ *	Add a loopback node for 'address' unless one is already present.
+ *	Returns 0 on success (also when the node already exists) or
+ *	-ENOMEM if the node cannot be allocated.
+ */
+int rose_add_loopback_node(rose_address *address)
+{
+	struct rose_node *node;
+	int err = 0;
+
+	spin_lock_bh(&rose_node_list_lock);
+
+	/* An existing mask-10 loopback entry for this address: nothing to do. */
+	for (node = rose_node_list; node != NULL; node = node->next) {
+		if (node->mask == 10 &&
+		    rosecmpm(address, &node->address, 10) == 0 &&
+		    node->loopback)
+			goto out;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_ATOMIC);
+	if (node == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	node->address      = *address;
+	node->mask         = 10;
+	node->count        = 1;
+	node->loopback     = 1;
+	node->neighbour[0] = rose_loopback_neigh;
+
+	/* Insert at the head of list. Address is always mask=10 */
+	node->next     = rose_node_list;
+	rose_node_list = node;
+
+	/* One more node now refers to the loopback neighbour. */
+	rose_loopback_neigh->count++;
+
+out:
+	spin_unlock_bh(&rose_node_list_lock);
+
+	return err;
+}
+
+/*
+ *	Delete the loopback node for 'address', if there is one, and drop
+ *	the loopback neighbour's node count accordingly.
+ */
+void rose_del_loopback_node(rose_address *address)
+{
+	struct rose_node *node;
+
+	spin_lock_bh(&rose_node_list_lock);
+
+	/* Only a mask-10 loopback entry with a matching address qualifies. */
+	for (node = rose_node_list; node != NULL; node = node->next) {
+		if (node->mask == 10 &&
+		    rosecmpm(address, &node->address, 10) == 0 &&
+		    node->loopback)
+			break;
+	}
+
+	if (node == NULL)
+		goto out;
+
+	rose_remove_node(node);
+
+	rose_loopback_neigh->count--;
+
+out:
+	spin_unlock_bh(&rose_node_list_lock);
+}
+
+/*
+ *	A device has been removed. Remove its routes and neighbours.
+ */
+void rose_rt_device_down(struct net_device *dev)
+{
+	struct rose_neigh *s, *rose_neigh;
+	struct rose_node  *t, *rose_node;
+	int i;
+
+	spin_lock_bh(&rose_node_list_lock);
+	spin_lock_bh(&rose_neigh_list_lock);
+	rose_neigh = rose_neigh_list;
+	while (rose_neigh != NULL) {
+		s          = rose_neigh;	/* s may be freed below, so step first */
+		rose_neigh = rose_neigh->next;
+
+		if (s->dev != dev)
+			continue;
+
+		rose_node = rose_node_list;
+
+		while (rose_node != NULL) {
+			t         = rose_node;	/* t may be freed below, so step first */
+			rose_node = rose_node->next;
+			/* Scan downwards: deleting slot i only moves the slots above it. */
+			for (i = t->count - 1; i >= 0; i--) {
+				if (t->neighbour[i] != s)
+					continue;
+
+				t->count--;
+
+				switch (i) {	/* close the gap left by slot i */
+				case 0:
+					t->neighbour[0] = t->neighbour[1];	/* falls through */
+				case 1:
+					t->neighbour[1] = t->neighbour[2];	/* falls through */
+				case 2:
+					break;
+				}
+			}
+
+			if (t->count <= 0)
+				rose_remove_node(t);
+		}
+
+		rose_remove_neigh(s);
+	}
+	spin_unlock_bh(&rose_neigh_list_lock);
+	spin_unlock_bh(&rose_node_list_lock);
+}
+
+#if 0 /* Currently unused */
+/*
+ *	A device has been removed. Remove its links.
+ */
+void rose_route_device_down(struct net_device *dev)
+{
+	struct rose_route *s, *rose_route;
+
+	spin_lock_bh(&rose_route_list_lock);
+	rose_route = rose_route_list;
+	while (rose_route != NULL) {
+		s          = rose_route;	/* s may be freed below, so step first */
+		rose_route = rose_route->next;
+		/* NOTE(review): neigh1/neigh2 are dereferenced without a NULL test. */
+		if (s->neigh1->dev == dev || s->neigh2->dev == dev)
+			rose_remove_route(s);
+	}
+	spin_unlock_bh(&rose_route_list_lock);
+}
+#endif
+
+/*
+ *	Clear all nodes and neighbours out, except for neighbours with
+ *	active connections going through them (use != 0).
+ *	The loopback neighbour and loopback nodes are never cleared.
+ *	Always returns 0.
+ */
+static int rose_clear_routes(void)
+{
+	struct rose_neigh *neigh, *next_neigh;
+	struct rose_node  *node, *next_node;
+
+	spin_lock_bh(&rose_node_list_lock);
+	spin_lock_bh(&rose_neigh_list_lock);
+
+	/* Fetch ->next first: rose_remove_node() frees the current entry. */
+	for (node = rose_node_list; node != NULL; node = next_node) {
+		next_node = node->next;
+
+		if (!node->loopback)
+			rose_remove_node(node);
+	}
+
+	/* Same precaution here: rose_remove_neigh() frees the neighbour. */
+	for (neigh = rose_neigh_list; neigh != NULL; neigh = next_neigh) {
+		next_neigh = neigh->next;
+
+		if (neigh->use != 0 || neigh->loopback)
+			continue;
+
+		neigh->count = 0;
+		rose_remove_neigh(neigh);
+	}
+
+	spin_unlock_bh(&rose_neigh_list_lock);
+	spin_unlock_bh(&rose_node_list_lock);
+
+	return 0;
+}
+
+/*
+ *	Look up 'devname' and return it only if it is an AX.25 interface
+ * 	that is "up"; otherwise return NULL. Called with RTNL.
+ */
+static struct net_device *rose_ax25_dev_find(char *devname)
+{
+	struct net_device *dev = __dev_get_by_name(&init_net, devname);
+
+	if (dev == NULL)
+		return NULL;
+
+	if (dev->type != ARPHRD_AX25 || !(dev->flags & IFF_UP))
+		return NULL;
+
+	return dev;
+}
+
+/*
+ *	Find the first active ROSE device, usually "rose0".
+ */
+struct net_device *rose_dev_first(void)
+{
+	struct net_device *dev, *first = NULL;
+	/* NOTE(review): strncmp() below compares only 3 characters of the names. */
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE)
+			if (first == NULL || strncmp(dev->name, first->name, 3) < 0)
+				first = dev;
+	}
+	rcu_read_unlock();
+	/* NOTE(review): returned without dev_hold(), unlike rose_dev_get(). */
+	return first;
+}
+
+/*
+ *	Find the "up" ROSE device with address 'addr' and hold a reference.
+ */
+struct net_device *rose_dev_get(rose_address *addr)
+{
+	struct net_device *dev, *found = NULL;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE &&
+		    rosecmp(addr, (rose_address *)dev->dev_addr) == 0) {
+			dev_hold(dev);	/* taken before leaving the RCU section */
+			found = dev;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+static int rose_dev_exists(rose_address *addr)
+{
+	struct net_device *dev;
+	int found = 0;	/* becomes 1 if an "up" ROSE device has this address */
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE &&
+		    rosecmp(addr, (rose_address *)dev->dev_addr) == 0) {
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+
+struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neigh)
+{
+	struct rose_route *r = rose_route_list;
+
+	/* Return the first route using 'lci' at 'neigh' on either of its ends. */
+	while (r != NULL &&
+	       !((r->neigh1 == neigh && r->lci1 == lci) ||
+		 (r->neigh2 == neigh && r->lci2 == lci)))
+		r = r->next;
+	return r;
+}
+
+/*
+ *	Find a neighbour for a ROSE address; NULL sets *cause and *diagnostic.
+ */
+struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
+	unsigned char *diagnostic, int route_frame)
+{
+	struct rose_neigh *res = NULL;
+	struct rose_node *node;
+	int failed = 0;
+	int i;
+
+	if (!route_frame) spin_lock_bh(&rose_node_list_lock);	/* else no lock is taken here */
+	for (node = rose_node_list; node != NULL; node = node->next) {	/* pass 1: restarted neighbours */
+		if (rosecmpm(addr, &node->address, node->mask) == 0) {
+			for (i = 0; i < node->count; i++) {
+				if (node->neighbour[i]->restarted) {
+					res = node->neighbour[i];
+					goto out;
+				}
+			}
+		}
+	}
+	if (!route_frame) { /* connect request */
+		for (node = rose_node_list; node != NULL; node = node->next) {	/* pass 2: ftimer not running */
+			if (rosecmpm(addr, &node->address, node->mask) == 0) {
+				for (i = 0; i < node->count; i++) {
+					if (!rose_ftimer_running(node->neighbour[i])) {
+						res = node->neighbour[i];
+						failed = 0;
+						goto out;
+					}
+					failed = 1;	/* a matching neighbour exists but its ftimer runs */
+				}
+			}
+		}
+	}
+	/* No neighbour was chosen: tell the caller why. */
+	if (failed) {
+		*cause      = ROSE_OUT_OF_ORDER;
+		*diagnostic = 0;
+	} else {
+		*cause      = ROSE_NOT_OBTAINABLE;
+		*diagnostic = 0;
+	}
+
+out:
+	if (!route_frame) spin_unlock_bh(&rose_node_list_lock);
+	return res;
+}
+
+/*
+ *	Handle the ioctls that control the routing functions:
+ *	  SIOCADDRT   - add a route taken from a user rose_route_struct
+ *	  SIOCDELRT   - delete the route described by such a struct
+ *	  SIOCRSCLRRT - clear the routing tables
+ *	Anything else is rejected with -EINVAL.
+ */
+int rose_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct rose_route_struct rose_route;
+	struct net_device *dev;
+
+	if (cmd == SIOCRSCLRRT)
+		return rose_clear_routes();
+
+	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+		return -EINVAL;
+
+	/* Both remaining commands take a rose_route_struct naming an AX.25 device. */
+	if (copy_from_user(&rose_route, arg, sizeof(rose_route)))
+		return -EFAULT;
+
+	dev = rose_ax25_dev_find(rose_route.device);
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (cmd == SIOCDELRT)
+		return rose_del_node(&rose_route, dev);
+
+	/*
+	 * SIOCADDRT: check the request before rose_add_node() is called.
+	 */
+	if (rose_dev_exists(&rose_route.address)) /* Can't add routes to ourself */
+		return -EINVAL;
+	if (rose_route.mask > 10) /* Mask can't be more than 10 digits */
+		return -EINVAL;
+	if (rose_route.ndigis > AX25_MAX_DIGIS)
+		return -EINVAL;
+
+	return rose_add_node(&rose_route, dev);
+}
+
+static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh)
+{
+	struct rose_route *rose_route, *s;
+	/* Mark the neighbour not restarted: stop T0, start the ftimer. */
+	rose_neigh->restarted = 0;
+
+	rose_stop_t0timer(rose_neigh);
+	rose_start_ftimer(rose_neigh);
+
+	skb_queue_purge(&rose_neigh->queue);
+
+	spin_lock_bh(&rose_route_list_lock);
+
+	rose_route = rose_route_list;
+
+	while (rose_route != NULL) {
+		if ((rose_route->neigh1 == rose_neigh && rose_route->neigh2 == rose_neigh) ||
+		    (rose_route->neigh1 == rose_neigh && rose_route->neigh2 == NULL)       ||
+		    (rose_route->neigh2 == rose_neigh && rose_route->neigh1 == NULL)) {
+			s = rose_route->next;	/* saved: rose_remove_route() frees rose_route */
+			rose_remove_route(rose_route);
+			rose_route = s;
+			continue;
+		}
+		/* Only one end uses this neighbour: detach it, clear the other end. */
+		if (rose_route->neigh1 == rose_neigh) {
+			rose_route->neigh1->use--;
+			rose_route->neigh1 = NULL;
+			rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0);
+		}
+
+		if (rose_route->neigh2 == rose_neigh) {
+			rose_route->neigh2->use--;
+			rose_route->neigh2 = NULL;
+			rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0);
+		}
+
+		rose_route = rose_route->next;
+	}
+	spin_unlock_bh(&rose_route_list_lock);
+}
+
+/*
+ * 	A level 2 link has timed out, so it appears to be a poor link: stop
+ *	using that neighbour until it is reset. Blow away all through
+ *	routes and connections using this route.
+ */
+void rose_link_failed(ax25_cb *ax25, int reason)
+{
+	struct rose_neigh *neigh;
+
+	spin_lock_bh(&rose_neigh_list_lock);
+
+	/* Find the neighbour that is attached to this AX.25 control block. */
+	for (neigh = rose_neigh_list; neigh != NULL; neigh = neigh->next)
+		if (neigh->ax25 == ax25)
+			break;
+
+	if (neigh != NULL) {
+		/* Detach from the link and give up our reference to it. */
+		neigh->ax25 = NULL;
+		ax25_cb_put(ax25);
+
+		rose_del_route_by_neigh(neigh);
+		rose_kill_by_neigh(neigh);
+	}
+	spin_unlock_bh(&rose_neigh_list_lock);
+}
+
+/*
+ * 	A device has been "downed": for every neighbour on that device, blow
+ *	away all through routes and connections that use it.
+ */
+void rose_link_device_down(struct net_device *dev)
+{
+	struct rose_neigh *neigh = rose_neigh_list;
+
+	while (neigh != NULL) {
+		if (neigh->dev == dev) {
+			rose_del_route_by_neigh(neigh);
+			rose_kill_by_neigh(neigh);
+		}
+		neigh = neigh->next;
+	}
+}
+
+/*
+ *	Route a frame to an appropriate AX.25 connection.
+ */
+int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
+{
+	struct rose_neigh *rose_neigh, *new_neigh;
+	struct rose_route *rose_route;
+	struct rose_facilities_struct facilities;
+	rose_address *src_addr, *dest_addr;
+	struct sock *sk;
+	unsigned short frametype;
+	unsigned int lci, new_lci;
+	unsigned char cause, diagnostic;
+	struct net_device *dev;
+	int res = 0;	/* return value; set to 1 where skb is passed to rose_transmit_link() */
+	char buf[11];
+
+#if 0
+	if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT)
+		return res;
+#endif
+
+	if (skb->len < ROSE_MIN_LEN)
+		return res;
+	frametype = skb->data[2];
+	lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
+	if (frametype == ROSE_CALL_REQUEST &&
+	    (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF ||
+	     skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] !=
+	     ROSE_CALL_REQ_ADDR_LEN_VAL))
+		return res;
+	src_addr  = (rose_address *)(skb->data + ROSE_CALL_REQ_SRC_ADDR_OFF);
+	dest_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF);
+
+	spin_lock_bh(&rose_neigh_list_lock);
+	spin_lock_bh(&rose_route_list_lock);
+	/* NOTE(review): ax25 is dereferenced below without a NULL check. */
+	rose_neigh = rose_neigh_list;
+	while (rose_neigh != NULL) {
+		if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 &&
+		    ax25->ax25_dev->dev == rose_neigh->dev)
+			break;
+		rose_neigh = rose_neigh->next;
+	}
+
+	if (rose_neigh == NULL) {
+		printk("rose_route : unknown neighbour or device %s\n",
+		       ax2asc(buf, &ax25->dest_addr));
+		goto out;
+	}
+
+	/*
+	 *	Obviously the link is working, halt the ftimer.
+	 */
+	rose_stop_ftimer(rose_neigh);
+
+	/*
+	 *	LCI of zero is always for us, and it's always a restart
+	 * 	frame.
+	 */
+	if (lci == 0) {
+		rose_link_rx_restart(skb, rose_neigh, frametype);
+		goto out;
+	}
+
+	/*
+	 *	Find an existing socket.
+	 */
+	if ((sk = rose_find_socket(lci, rose_neigh)) != NULL) {
+		if (frametype == ROSE_CALL_REQUEST) {
+			struct rose_sock *rose = rose_sk(sk);
+
+			/* Remove an existing unused socket */
+			rose_clear_queues(sk);
+			rose->cause	 = ROSE_NETWORK_CONGESTION;
+			rose->diagnostic = 0;
+			rose->neighbour->use--;
+			rose->neighbour	 = NULL;
+			rose->lci	 = 0;
+			rose->state	 = ROSE_STATE_0;
+			sk->sk_state	 = TCP_CLOSE;
+			sk->sk_err	 = 0;
+			sk->sk_shutdown	 |= SEND_SHUTDOWN;
+			if (!sock_flag(sk, SOCK_DEAD)) {
+				sk->sk_state_change(sk);
+				sock_set_flag(sk, SOCK_DEAD);
+			}
+		}
+		else {
+			skb_reset_transport_header(skb);
+			res = rose_process_rx_frame(sk, skb);
+			goto out;
+		}
+	}
+
+	/*
+	 *	Is it a Call Request and is it for us ?
+	 */
+	if (frametype == ROSE_CALL_REQUEST)
+		if ((dev = rose_dev_get(dest_addr)) != NULL) {
+			res = rose_rx_call_request(skb, dev, rose_neigh, lci);
+			dev_put(dev);	/* drop the reference taken by rose_dev_get() */
+			goto out;
+		}
+
+	if (!sysctl_rose_routing_control) {
+		rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 0);
+		goto out;
+	}
+
+	/*
+	 *	Route it to the next in line if we have an entry for it.
+	 */
+	rose_route = rose_route_list;
+	while (rose_route != NULL) {
+		if (rose_route->lci1 == lci &&
+		    rose_route->neigh1 == rose_neigh) {
+			if (frametype == ROSE_CALL_REQUEST) {
+				/* F6FBB - Remove an existing unused route */
+				rose_remove_route(rose_route);
+				break;
+			} else if (rose_route->neigh2 != NULL) {
+				skb->data[0] &= 0xF0;	/* rewrite the LCI for the far side */
+				skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F;
+				skb->data[1]  = (rose_route->lci2 >> 0) & 0xFF;
+				rose_transmit_link(skb, rose_route->neigh2);
+				if (frametype == ROSE_CLEAR_CONFIRMATION)
+					rose_remove_route(rose_route);
+				res = 1;
+				goto out;
+			} else {
+				if (frametype == ROSE_CLEAR_CONFIRMATION)
+					rose_remove_route(rose_route);
+				goto out;
+			}
+		}
+		if (rose_route->lci2 == lci &&
+		    rose_route->neigh2 == rose_neigh) {
+			if (frametype == ROSE_CALL_REQUEST) {
+				/* F6FBB - Remove an existing unused route */
+				rose_remove_route(rose_route);
+				break;
+			} else if (rose_route->neigh1 != NULL) {
+				skb->data[0] &= 0xF0;	/* rewrite the LCI for the far side */
+				skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F;
+				skb->data[1]  = (rose_route->lci1 >> 0) & 0xFF;
+				rose_transmit_link(skb, rose_route->neigh1);
+				if (frametype == ROSE_CLEAR_CONFIRMATION)
+					rose_remove_route(rose_route);
+				res = 1;
+				goto out;
+			} else {
+				if (frametype == ROSE_CLEAR_CONFIRMATION)
+					rose_remove_route(rose_route);
+				goto out;
+			}
+		}
+		rose_route = rose_route->next;
+	}
+
+	/*
+	 *	We know that:
+	 *	1. The frame isn't for us,
+	 *	2. It isn't "owned" by any existing route.
+	 */
+	if (frametype != ROSE_CALL_REQUEST) {	/* XXX */
+		res = 0;
+		goto out;
+	}
+
+	memset(&facilities, 0x00, sizeof(struct rose_facilities_struct));
+
+	if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF,
+				   skb->len - ROSE_CALL_REQ_FACILITIES_OFF,
+				   &facilities)) {
+		rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76);
+		goto out;
+	}
+
+	/*
+	 *	Check for routing loops.
+	 */
+	rose_route = rose_route_list;
+	while (rose_route != NULL) {
+		if (rose_route->rand == facilities.rand &&
+		    rosecmp(src_addr, &rose_route->src_addr) == 0 &&
+		    ax25cmp(&facilities.dest_call, &rose_route->src_call) == 0 &&
+		    ax25cmp(&facilities.source_call, &rose_route->dest_call) == 0) {
+			rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 120);
+			goto out;
+		}
+		rose_route = rose_route->next;
+	}
+	/* Pick the onward neighbour and an LCI on it, then record the route. */
+	if ((new_neigh = rose_get_neigh(dest_addr, &cause, &diagnostic, 1)) == NULL) {
+		rose_transmit_clear_request(rose_neigh, lci, cause, diagnostic);
+		goto out;
+	}
+
+	if ((new_lci = rose_new_lci(new_neigh)) == 0) {
+		rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71);
+		goto out;
+	}
+
+	if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) {
+		rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120);
+		goto out;
+	}
+
+	rose_route->lci1      = lci;
+	rose_route->src_addr  = *src_addr;
+	rose_route->dest_addr = *dest_addr;
+	rose_route->src_call  = facilities.dest_call;
+	rose_route->dest_call = facilities.source_call;
+	rose_route->rand      = facilities.rand;
+	rose_route->neigh1    = rose_neigh;
+	rose_route->lci2      = new_lci;
+	rose_route->neigh2    = new_neigh;
+
+	rose_route->neigh1->use++;
+	rose_route->neigh2->use++;
+
+	rose_route->next = rose_route_list;
+	rose_route_list  = rose_route;
+	/* Forward the Call Request with the LCI of the new leg. */
+	skb->data[0] &= 0xF0;
+	skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F;
+	skb->data[1]  = (rose_route->lci2 >> 0) & 0xFF;
+
+	rose_transmit_link(skb, rose_route->neigh2);
+	res = 1;
+
+out:
+	spin_unlock_bh(&rose_route_list_lock);
+	spin_unlock_bh(&rose_neigh_list_lock);
+
+	return res;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static void *rose_node_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rose_node_list_lock)
+{
+	struct rose_node *node;
+	int idx = 1;		/* position 0 is the header, nodes start at 1 */
+
+	spin_lock_bh(&rose_node_list_lock);	/* dropped in rose_node_stop() */
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	for (node = rose_node_list; node != NULL && idx < *pos; ++idx)
+		node = node->next;
+
+	return (idx != *pos) ? NULL : node;
+}
+
+static void *rose_node_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;			/* advance the seq_file position */
+	if (v != SEQ_START_TOKEN)	/* already on a node: step to its successor */
+		return ((struct rose_node *)v)->next;
+	return rose_node_list;		/* header was just shown: first node */
+}
+
+static void rose_node_stop(struct seq_file *seq, void *v)
+	__releases(rose_node_list_lock)
+{
+	spin_unlock_bh(&rose_node_list_lock);	/* pairs with spin_lock_bh() in rose_node_start() */
+}
+
+static int rose_node_show(struct seq_file *seq, void *v)
+{
+	char rsbuf[11];		/* scratch buffer handed to rose2asc() */
+	int i;
+
+	if (v == SEQ_START_TOKEN)	/* first record: emit the column header */
+		seq_puts(seq, "address    mask n neigh neigh neigh\n");
+	else {
+		const struct rose_node *rose_node = v;
+		/* Disabled special case for loopback nodes: if (rose_node->loopback) {
+			seq_printf(seq, "%-10s %04d 1 loopback\n",
+				   rose2asc(rsbuf, &rose_node->address),
+				   rose_node->mask);
+		} else { */
+			seq_printf(seq, "%-10s %04d %d",
+				   rose2asc(rsbuf, &rose_node->address),
+				   rose_node->mask,
+				   rose_node->count);
+			/* then one " %05d" column per neighbour of this node */
+			for (i = 0; i < rose_node->count; i++)
+				seq_printf(seq, " %05d",
+					rose_node->neighbour[i]->number);
+
+			seq_puts(seq, "\n");
+		/* } */
+	}
+	return 0;
+}
+
+static const struct seq_operations rose_node_seqops = {
+	.start = rose_node_start,	/* takes rose_node_list_lock */
+	.next = rose_node_next,
+	.stop = rose_node_stop,		/* releases rose_node_list_lock */
+	.show = rose_node_show,
+};
+
+static int rose_nodes_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rose_node_seqops);	/* attach the node iterator to this file */
+}
+
+const struct file_operations rose_nodes_fops = {
+	.owner = THIS_MODULE,
+	.open = rose_nodes_open,	/* opens a seq_file over rose_node_seqops */
+	.read = seq_read,		/* the rest are the generic seq_file helpers */
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static void *rose_neigh_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rose_neigh_list_lock)
+{
+	struct rose_neigh *neigh;
+	int idx = 1;		/* position 0 is the header, neighbours start at 1 */
+
+	spin_lock_bh(&rose_neigh_list_lock);	/* dropped in rose_neigh_stop() */
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	for (neigh = rose_neigh_list; neigh != NULL && idx < *pos; ++idx)
+		neigh = neigh->next;
+
+	return (idx != *pos) ? NULL : neigh;
+}
+
+static void *rose_neigh_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;			/* advance the seq_file position */
+	if (v != SEQ_START_TOKEN)	/* already on a neighbour: step to its successor */
+		return ((struct rose_neigh *)v)->next;
+	return rose_neigh_list;		/* header was just shown: first neighbour */
+}
+
+static void rose_neigh_stop(struct seq_file *seq, void *v)
+	__releases(rose_neigh_list_lock)
+{
+	spin_unlock_bh(&rose_neigh_list_lock);	/* pairs with spin_lock_bh() in rose_neigh_start() */
+}
+
+static int rose_neigh_show(struct seq_file *seq, void *v)
+{
+	char buf[11];		/* scratch buffer handed to ax2asc() */
+	int i;
+
+	if (v == SEQ_START_TOKEN)	/* first record: emit the column header */
+		seq_puts(seq,
+			 "addr  callsign  dev  count use mode restart  t0  tf digipeaters\n");
+	else {
+		struct rose_neigh *rose_neigh = v;
+
+		/* Fixed columns; a loopback neighbour prints "RSLOOP-0" as its callsign. */
+		seq_printf(seq, "%05d %-9s %-4s   %3d %3d  %3s     %3s %3lu %3lu",
+			   rose_neigh->number,
+			   (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign),
+			   rose_neigh->dev ? rose_neigh->dev->name : "???",
+			   rose_neigh->count,
+			   rose_neigh->use,
+			   (rose_neigh->dce_mode) ? "DCE" : "DTE",
+			   (rose_neigh->restarted) ? "yes" : "no",
+			   ax25_display_timer(&rose_neigh->t0timer) / HZ,	/* "t0" column, divided by HZ */
+			   ax25_display_timer(&rose_neigh->ftimer)  / HZ);	/* "tf" column, divided by HZ */
+
+		if (rose_neigh->digipeat != NULL) {	/* optional trailing digipeater callsigns */
+			for (i = 0; i < rose_neigh->digipeat->ndigi; i++)
+				seq_printf(seq, " %s", ax2asc(buf, &rose_neigh->digipeat->calls[i]));
+		}
+
+		seq_puts(seq, "\n");
+	}
+	return 0;
+}
+
+
+static const struct seq_operations rose_neigh_seqops = {
+	.start = rose_neigh_start,	/* takes rose_neigh_list_lock */
+	.next = rose_neigh_next,
+	.stop = rose_neigh_stop,	/* releases rose_neigh_list_lock */
+	.show = rose_neigh_show,
+};
+
+static int rose_neigh_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rose_neigh_seqops);	/* attach the neighbour iterator to this file */
+}
+
+const struct file_operations rose_neigh_fops = {
+	.owner = THIS_MODULE,
+	.open = rose_neigh_open,	/* opens a seq_file over rose_neigh_seqops */
+	.read = seq_read,		/* the rest are the generic seq_file helpers */
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+
+static void *rose_route_start(struct seq_file *seq, loff_t *pos)
+	__acquires(rose_route_list_lock)
+{
+	struct rose_route *route;
+	int idx = 1;		/* position 0 is the header, routes start at 1 */
+
+	spin_lock_bh(&rose_route_list_lock);	/* dropped in rose_route_stop() */
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	for (route = rose_route_list; route != NULL && idx < *pos; ++idx)
+		route = route->next;
+
+	return (idx != *pos) ? NULL : route;
+}
+
+static void *rose_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;			/* advance the seq_file position */
+	if (v != SEQ_START_TOKEN)	/* already on a route: step to its successor */
+		return ((struct rose_route *)v)->next;
+	return rose_route_list;		/* header was just shown: first route */
+}
+
+static void rose_route_stop(struct seq_file *seq, void *v)
+	__releases(rose_route_list_lock)
+{
+	spin_unlock_bh(&rose_route_list_lock);	/* pairs with spin_lock_bh() in rose_route_start() */
+}
+
+static int rose_route_show(struct seq_file *seq, void *v)
+{
+	struct rose_route *route = v;
+	char call[11], addr[11];
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq,
+			 "lci  address     callsign   neigh  <-> lci  address     callsign   neigh\n");
+		return 0;
+	}
+
+	/* First leg of the circuit; placeholders when neigh1 is unset. */
+	if (route->neigh1 == NULL)
+		seq_puts(seq,
+			 "000  *           *          00000      ");
+	else
+		seq_printf(seq,
+			   "%3.3X  %-10s  %-9s  %05d      ",
+			   route->lci1,
+			   rose2asc(addr, &route->src_addr),
+			   ax2asc(call, &route->src_call),
+			   route->neigh1->number);
+
+	/* Second leg, which also terminates the line. */
+	if (route->neigh2 == NULL)
+		seq_puts(seq,
+			 "000  *           *          00000\n");
+	else
+		seq_printf(seq, "%3.3X  %-10s  %-9s  %05d\n",
+			   route->lci2,
+			   rose2asc(addr, &route->dest_addr),
+			   ax2asc(call, &route->dest_call),
+			   route->neigh2->number);
+	return 0;
+}
+
+static const struct seq_operations rose_route_seqops = {
+	.start = rose_route_start,	/* takes rose_route_list_lock */
+	.next = rose_route_next,
+	.stop = rose_route_stop,	/* releases rose_route_list_lock */
+	.show = rose_route_show,
+};
+
+static int rose_route_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rose_route_seqops);	/* attach the route iterator to this file */
+}
+
+const struct file_operations rose_routes_fops = {
+	.owner = THIS_MODULE,
+	.open = rose_route_open,	/* opens a seq_file over rose_route_seqops */
+	.read = seq_read,		/* the rest are the generic seq_file helpers */
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ *	Release all ROSE routing structures: neighbours first, then nodes, then routes.
+ */
+void __exit rose_rt_free(void)
+{
+	struct rose_neigh *s, *rose_neigh = rose_neigh_list;
+	struct rose_node  *t, *rose_node  = rose_node_list;
+	struct rose_route *u, *rose_route = rose_route_list;
+
+	while (rose_neigh != NULL) {
+		s          = rose_neigh;
+		rose_neigh = rose_neigh->next;	/* fetch the successor before s is removed */
+
+		rose_remove_neigh(s);
+	}
+
+	while (rose_node != NULL) {
+		t         = rose_node;
+		rose_node = rose_node->next;	/* fetch the successor before t is removed */
+
+		rose_remove_node(t);
+	}
+
+	while (rose_route != NULL) {
+		u          = rose_route;
+		rose_route = rose_route->next;	/* fetch the successor before u is removed */
+
+		rose_remove_route(u);	/* NOTE(review): neighbours are already gone here; confirm this does not touch u->neigh1/neigh2 */
+	}
+}
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c
new file mode 100644
index 00000000..f6c71caa
--- /dev/null
+++ b/net/rose/rose_subr.c
@@ -0,0 +1,557 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose);
+
+/*
+ *	This routine purges both frame queues of a socket: sk_write_queue and the ROSE ack_queue.
+ */
+void rose_clear_queues(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_write_queue);
+	skb_queue_purge(&rose_sk(sk)->ack_queue);
+}
+
+/*
+ * Drop frames from the ack queue that the peer has acknowledged, advancing
+ * V(a) towards N(r) one frame at a time. This implements the boxes
+ * labelled "V(a) <- N(r)" on the SDL diagram.
+ */
+void rose_frames_acked(struct sock *sk, unsigned short nr)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	/*
+	 * Stop as soon as V(a) catches up with nr or the ack queue runs
+	 * dry, whichever happens first.
+	 */
+	for (;;) {
+		if (rose->va == nr || skb_peek(&rose->ack_queue) == NULL)
+			break;
+
+		kfree_skb(skb_dequeue(&rose->ack_queue));
+		rose->va = (rose->va + 1) % ROSE_MODULUS;
+	}
+}
+
+void rose_requeue_frames(struct sock *sk)
+{
+	struct sk_buff *skb, *skb_prev = NULL;	/* skb_prev: frame most recently moved */
+
+	/*
+	 * Move every un-ack-ed frame from the ack queue to the front of the
+	 * write queue, keeping their order, to be picked up by rose_kick.
+	 * This arrangement handles the possibility of an empty output queue.
+	 */
+	while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) {
+		if (skb_prev == NULL)
+			skb_queue_head(&sk->sk_write_queue, skb);	/* first frame becomes the new head */
+		else
+			skb_append(skb_prev, skb, &sk->sk_write_queue);	/* later frames follow the previous one */
+		skb_prev = skb;
+	}
+}
+
+/*
+ *	Check that nr lies in the window from va to vs (inclusive) of the
+ *	modulo-ROSE_MODULUS sequence space. Returns 1 if so, 0 otherwise.
+ */
+int rose_validate_nr(struct sock *sk, unsigned short nr)
+{
+	struct rose_sock *rose = rose_sk(sk);
+	unsigned short seq;
+
+	/* Walk va, va + 1, ... up to but excluding vs, looking for nr. */
+	for (seq = rose->va; seq != rose->vs; seq = (seq + 1) % ROSE_MODULUS)
+		if (seq == nr)
+			return 1;
+
+	return nr == rose->vs;
+}
+
+/*
+ *  Build a control frame of the given type for this socket's logical
+ *  channel and pass it to rose_transmit_link() for rose->neighbour.
+ *  Allocation failure and unknown frame types are dropped silently/logged.
+ */
+void rose_write_internal(struct sock *sk, int frametype)
+{
+	struct rose_sock *rose = rose_sk(sk);
+	struct sk_buff *skb;
+	unsigned char  *dptr;
+	unsigned char  lci1, lci2;
+	unsigned char buffer[256];	/* facilities scratch: one length octet plus at most 255 bytes */
+	int len, faclen = 0;
+
+	len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1;
+
+	switch (frametype) {
+	case ROSE_CALL_REQUEST:
+		len   += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN;
+		faclen = rose_create_facilities(buffer, rose);
+		len   += faclen;
+		break;
+	case ROSE_CALL_ACCEPTED:
+	case ROSE_CLEAR_REQUEST:
+	case ROSE_RESET_REQUEST:
+		len   += 2;
+		break;
+	}
+
+	if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+		return;
+
+	/*
+	 *	Space for AX.25 header and PID.
+	 */
+	skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1);
+	/* Claim exactly the bytes written below; the tailroom may include allocator padding. */
+	dptr = skb_put(skb, len - (AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1));
+
+	lci1 = (rose->lci >> 8) & 0x0F;
+	lci2 = (rose->lci >> 0) & 0xFF;
+
+	switch (frametype) {
+	case ROSE_CALL_REQUEST:
+		*dptr++ = ROSE_GFI | lci1;
+		*dptr++ = lci2;
+		*dptr++ = frametype;
+		*dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL;
+		memcpy(dptr, &rose->dest_addr,  ROSE_ADDR_LEN);
+		dptr   += ROSE_ADDR_LEN;
+		memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN);
+		dptr   += ROSE_ADDR_LEN;
+		memcpy(dptr, buffer, faclen);
+		dptr   += faclen;
+		break;
+
+	case ROSE_CALL_ACCEPTED:
+		*dptr++ = ROSE_GFI | lci1;
+		*dptr++ = lci2;
+		*dptr++ = frametype;
+		*dptr++ = 0x00;		/* Address length */
+		*dptr++ = 0;		/* Facilities length */
+		break;
+
+	case ROSE_CLEAR_REQUEST:
+		*dptr++ = ROSE_GFI | lci1;
+		*dptr++ = lci2;
+		*dptr++ = frametype;
+		*dptr++ = rose->cause;
+		*dptr++ = rose->diagnostic;
+		break;
+
+	case ROSE_RESET_REQUEST:
+		*dptr++ = ROSE_GFI | lci1;
+		*dptr++ = lci2;
+		*dptr++ = frametype;
+		*dptr++ = ROSE_DTE_ORIGINATED;
+		*dptr++ = 0;
+		break;
+
+	case ROSE_RR:
+	case ROSE_RNR:
+		*dptr++ = ROSE_GFI | lci1;
+		*dptr++ = lci2;
+		*dptr   = frametype;
+		*dptr++ |= (rose->vr << 5) & 0xE0;	/* V(r) goes in the top three bits */
+		break;
+
+	case ROSE_CLEAR_CONFIRMATION:
+	case ROSE_RESET_CONFIRMATION:
+		*dptr++ = ROSE_GFI | lci1;
+		*dptr++ = lci2;
+		*dptr++  = frametype;
+		break;
+
+	default:
+		printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype);
+		kfree_skb(skb);
+		return;
+	}
+
+	rose_transmit_link(skb, rose->neighbour);
+}
+
+int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m)
+{
+	const unsigned char *hdr = skb->data;
+	const unsigned char type = hdr[2];	/* packet type octet */
+	const unsigned char supervisory = type & 0x1F;
+
+	/* Every out-parameter is cleared; only the relevant ones are set below. */
+	*ns = 0;
+	*nr = 0;
+	*q = 0;
+	*d = 0;
+	*m = 0;
+
+	/* Call setup, clearing and reset packets match on the whole octet. */
+	if (type == ROSE_CALL_REQUEST || type == ROSE_CALL_ACCEPTED ||
+	    type == ROSE_CLEAR_REQUEST || type == ROSE_CLEAR_CONFIRMATION ||
+	    type == ROSE_RESET_REQUEST || type == ROSE_RESET_CONFIRMATION)
+		return type;
+
+	/* RR and RNR carry N(r) in the top three bits. */
+	if (supervisory == ROSE_RR || supervisory == ROSE_RNR) {
+		*nr = (type >> 5) & 0x07;
+		return supervisory;
+	}
+
+	/* Anything else must be a data packet, selected by the low bit. */
+	if ((type & 0x01) != ROSE_DATA)
+		return ROSE_ILLEGAL;
+
+	*q  = (hdr[0] & ROSE_Q_BIT) == ROSE_Q_BIT;
+	*d  = (hdr[0] & ROSE_D_BIT) == ROSE_D_BIT;
+	*m  = (type & ROSE_M_BIT) == ROSE_M_BIT;
+	*nr = (type >> 5) & 0x07;
+	*ns = (type >> 1) & 0x07;
+
+	return ROSE_DATA;
+}
+
+static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len)
+{
+	unsigned char *pt;
+	unsigned char l, lg, n = 0;	/* n: bytes consumed so far, the return value */
+	int fac_national_digis_received = 0;
+	/* Walk the national facilities in p[0..len); returns bytes consumed, or -1 if malformed. */
+	do {
+		switch (*p & 0xC0) {	/* top two bits of the code select the facility size */
+		case 0x00:		/* code + 1 parameter byte */
+			if (len < 2)
+				return -1;
+			p   += 2;
+			n   += 2;
+			len -= 2;
+			break;
+
+		case 0x40:		/* code + 2 parameter bytes */
+			if (len < 3)
+				return -1;
+			if (*p == FAC_NATIONAL_RAND)
+				facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF);
+			p   += 3;
+			n   += 3;
+			len -= 3;
+			break;
+
+		case 0x80:		/* code + 3 parameter bytes */
+			if (len < 4)
+				return -1;
+			p   += 4;
+			n   += 4;
+			len -= 4;
+			break;
+
+		case 0xC0:		/* code + length byte + that many parameter bytes */
+			if (len < 2)
+				return -1;
+			l = p[1];
+			if (len < 2 + l)
+				return -1;
+			if (*p == FAC_NATIONAL_DEST_DIGI) {
+				if (!fac_national_digis_received) {
+					if (l < AX25_ADDR_LEN)
+						return -1;
+					memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN);
+					facilities->source_ndigis = 1;
+				}
+			}
+			else if (*p == FAC_NATIONAL_SRC_DIGI) {
+				if (!fac_national_digis_received) {
+					if (l < AX25_ADDR_LEN)
+						return -1;
+					memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN);
+					facilities->dest_ndigis = 1;
+				}
+			}
+			else if (*p == FAC_NATIONAL_FAIL_CALL) {
+				if (l < AX25_ADDR_LEN)
+					return -1;
+				memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN);
+			}
+			else if (*p == FAC_NATIONAL_FAIL_ADD) {
+				if (l < 1 + ROSE_ADDR_LEN)
+					return -1;
+				memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN);
+			}
+			else if (*p == FAC_NATIONAL_DIGIS) {
+				if (l % AX25_ADDR_LEN)
+					return -1;
+				fac_national_digis_received = 1;	/* supersedes the single-digi facilities */
+				facilities->source_ndigis = 0;
+				facilities->dest_ndigis   = 0;
+				for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) {
+					if (pt[6] & AX25_HBIT) {	/* H bit set: stored as a dest digi */
+						if (facilities->dest_ndigis >= ROSE_MAX_DIGIS)
+							return -1;
+						memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN);
+					} else {
+						if (facilities->source_ndigis >= ROSE_MAX_DIGIS)
+							return -1;
+						memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN);
+					}
+				}
+			}
+			p   += l + 2;
+			n   += l + 2;
+			len -= l + 2;
+			break;
+		}
+	} while (len > 0 && *p != 0x00);	/* check len first so *p is never read past the field */
+
+	return n;
+}
+
+static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len)
+{
+	unsigned char l, n = 0;	/* n: bytes consumed so far, the return value */
+	char callsign[11];	/* up to 10 characters (l <= 20) plus the terminator */
+	/* Walk the CCITT facilities in p[0..len); returns bytes consumed, or -1 if malformed. */
+	do {
+		switch (*p & 0xC0) {	/* top two bits of the code select the facility size */
+		case 0x00:		/* code + 1 parameter byte */
+			if (len < 2)
+				return -1;
+			p   += 2;
+			n   += 2;
+			len -= 2;
+			break;
+
+		case 0x40:		/* code + 2 parameter bytes */
+			if (len < 3)
+				return -1;
+			p   += 3;
+			n   += 3;
+			len -= 3;
+			break;
+
+		case 0x80:		/* code + 3 parameter bytes */
+			if (len < 4)
+				return -1;
+			p   += 4;
+			n   += 4;
+			len -= 4;
+			break;
+
+		case 0xC0:		/* code + length byte + that many parameter bytes */
+			if (len < 2)
+				return -1;
+			l = p[1];
+
+			/* Prevent overflows of callsign[] and reads past the facilities field */
+			if (l < 10 || l > 20 || len < 2 + l)
+				return -1;
+
+			if (*p == FAC_CCITT_DEST_NSAP) {
+				memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN);
+				memcpy(callsign, p + 12,   l - 10);
+				callsign[l - 10] = '\0';
+				asc2ax(&facilities->source_call, callsign);
+			}
+			if (*p == FAC_CCITT_SRC_NSAP) {
+				memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN);
+				memcpy(callsign, p + 12, l - 10);
+				callsign[l - 10] = '\0';
+				asc2ax(&facilities->dest_call, callsign);
+			}
+			p   += l + 2;
+			n   += l + 2;
+			len -= l + 2;
+			break;
+		}
+	} while (len > 0 && *p != 0x00);	/* check len first so *p is never read past the field */
+
+	return n;
+}
+
+int rose_parse_facilities(unsigned char *p, unsigned packet_len,
+	struct rose_facilities_struct *facilities)
+{
+	int facilities_len, len;
+
+	facilities_len = *p++;	/* one length octet, followed by that many facility bytes */
+	/* packet_len counts the length octet too, so facilities_len == packet_len is already too long. */
+	if (facilities_len == 0 || (unsigned)facilities_len >= packet_len)
+		return 0;
+
+	while (facilities_len >= 3 && *p == 0x00) {	/* each family starts with a 0x00 marker */
+		facilities_len--;
+		p++;
+
+		switch (*p) {
+		case FAC_NATIONAL:		/* National */
+			len = rose_parse_national(p + 1, facilities, facilities_len - 1);
+			break;
+
+		case FAC_CCITT:		/* CCITT */
+			len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1);
+			break;
+
+		default:
+			printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p);
+			len = 1;
+			break;
+		}
+
+		if (len < 0)	/* the family parser rejected the data */
+			return 0;
+		if (WARN_ON(len >= facilities_len))
+			return 0;
+		facilities_len -= len + 1;	/* + 1 for the family octet itself */
+		p += len + 1;
+	}
+
+	return facilities_len == 0;	/* 1 only if every declared byte was consumed */
+}
+
+static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
+{
+	unsigned char *p = buffer + 1;	/* buffer[0] is filled in last with the length */
+	char *callsign;
+	char buf[11];			/* scratch buffer handed to ax2asc() */
+	int len, nb;
+	/* Encode the socket's facilities into buffer; returns bytes written, buffer[0] included. */
+	/* National Facilities */
+	if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) {
+		*p++ = 0x00;
+		*p++ = FAC_NATIONAL;
+
+		if (rose->rand != 0) {
+			*p++ = FAC_NATIONAL_RAND;
+			*p++ = (rose->rand >> 8) & 0xFF;
+			*p++ = (rose->rand >> 0) & 0xFF;
+		}
+
+		/* Sent before older facilities */
+		if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) {
+			int maxdigi = 0;	/* digis emitted so far, counted across both loops */
+			*p++ = FAC_NATIONAL_DIGIS;
+			*p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis);	/* NOTE(review): counts all digis even if the loops below break early - confirm */
+			for (nb = 0 ; nb < rose->source_ndigis ; nb++) {
+				if (++maxdigi >= ROSE_MAX_DIGIS)
+					break;
+				memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN);
+				p[6] |= AX25_HBIT;	/* source digis are sent with the H bit set */
+				p += AX25_ADDR_LEN;
+			}
+			for (nb = 0 ; nb < rose->dest_ndigis ; nb++) {
+				if (++maxdigi >= ROSE_MAX_DIGIS)
+					break;
+				memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN);
+				p[6] &= ~AX25_HBIT;	/* dest digis are sent with the H bit clear */
+				p += AX25_ADDR_LEN;
+			}
+		}
+
+		/* For compatibility */
+		if (rose->source_ndigis > 0) {
+			*p++ = FAC_NATIONAL_SRC_DIGI;
+			*p++ = AX25_ADDR_LEN;
+			memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN);
+			p   += AX25_ADDR_LEN;
+		}
+
+		/* For compatibility */
+		if (rose->dest_ndigis > 0) {
+			*p++ = FAC_NATIONAL_DEST_DIGI;
+			*p++ = AX25_ADDR_LEN;
+			memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN);
+			p   += AX25_ADDR_LEN;
+		}
+	}
+
+	*p++ = 0x00;
+	*p++ = FAC_CCITT;
+
+	*p++ = FAC_CCITT_DEST_NSAP;	/* destination NSAP: ROSE address followed by the callsign text */
+
+	callsign = ax2asc(buf, &rose->dest_call);
+
+	*p++ = strlen(callsign) + 10;	/* facility length: 10 fixed bytes plus the callsign */
+	*p++ = (strlen(callsign) + 9) * 2;		/* ??? */
+
+	*p++ = 0x47; *p++ = 0x00; *p++ = 0x11;
+	*p++ = ROSE_ADDR_LEN * 2;
+	memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN);
+	p   += ROSE_ADDR_LEN;
+
+	memcpy(p, callsign, strlen(callsign));	/* copied without the terminating NUL */
+	p   += strlen(callsign);
+
+	*p++ = FAC_CCITT_SRC_NSAP;	/* source NSAP, same layout as the destination one */
+
+	callsign = ax2asc(buf, &rose->source_call);
+
+	*p++ = strlen(callsign) + 10;
+	*p++ = (strlen(callsign) + 9) * 2;		/* ??? */
+
+	*p++ = 0x47; *p++ = 0x00; *p++ = 0x11;
+	*p++ = ROSE_ADDR_LEN * 2;
+	memcpy(p, &rose->source_addr, ROSE_ADDR_LEN);
+	p   += ROSE_ADDR_LEN;
+
+	memcpy(p, callsign, strlen(callsign));
+	p   += strlen(callsign);
+
+	len       = p - buffer;
+	buffer[0] = len - 1;	/* length octet does not count itself */
+
+	return len;
+}
+
+void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	rose_stop_timer(sk);
+	rose_stop_idletimer(sk);
+
+	rose_clear_queues(sk);
+
+	rose->lci   = 0;
+	rose->state = ROSE_STATE_0;
+
+	if (cause != -1)	/* -1 leaves the stored cause untouched */
+		rose->cause = cause;
+
+	if (diagnostic != -1)	/* -1 leaves the stored diagnostic untouched */
+		rose->diagnostic = diagnostic;
+
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = reason;	/* reported to the socket owner as the error */
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+
+	if (!sock_flag(sk, SOCK_DEAD)) {	/* notify once, then mark the socket dead */
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+}
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
new file mode 100644
index 00000000..b6c8f38c
--- /dev/null
+++ b/net/rose/rose_timer.c
@@ -0,0 +1,217 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
+ * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org)
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <net/ax25.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/system.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <net/rose.h>
+
+static void rose_heartbeat_expiry(unsigned long);
+static void rose_timer_expiry(unsigned long);
+static void rose_idletimer_expiry(unsigned long);
+
+void rose_start_heartbeat(struct sock *sk)
+{
+	del_timer(&sk->sk_timer);	/* cancel any pending heartbeat before re-arming */
+
+	sk->sk_timer.data     = (unsigned long)sk;	/* handed back to the expiry handler */
+	sk->sk_timer.function = &rose_heartbeat_expiry;
+	sk->sk_timer.expires  = jiffies + 5 * HZ;	/* fires 5 * HZ jiffies from now */
+
+	add_timer(&sk->sk_timer);
+}
+
+void rose_start_t1timer(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	del_timer(&rose->timer);	/* rose->timer is shared by the t1/t2/t3/hb starters */
+
+	rose->timer.data     = (unsigned long)sk;	/* handed back to rose_timer_expiry() */
+	rose->timer.function = &rose_timer_expiry;
+	rose->timer.expires  = jiffies + rose->t1;	/* t1 is added to jiffies as-is */
+
+	add_timer(&rose->timer);
+}
+
+void rose_start_t2timer(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	del_timer(&rose->timer);	/* rose->timer is shared by the t1/t2/t3/hb starters */
+
+	rose->timer.data     = (unsigned long)sk;	/* handed back to rose_timer_expiry() */
+	rose->timer.function = &rose_timer_expiry;
+	rose->timer.expires  = jiffies + rose->t2;	/* t2 is added to jiffies as-is */
+
+	add_timer(&rose->timer);
+}
+
+void rose_start_t3timer(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	del_timer(&rose->timer);	/* rose->timer is shared by the t1/t2/t3/hb starters */
+
+	rose->timer.data     = (unsigned long)sk;	/* handed back to rose_timer_expiry() */
+	rose->timer.function = &rose_timer_expiry;
+	rose->timer.expires  = jiffies + rose->t3;	/* t3 is added to jiffies as-is */
+
+	add_timer(&rose->timer);
+}
+
+void rose_start_hbtimer(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	del_timer(&rose->timer);	/* rose->timer is shared by the t1/t2/t3/hb starters */
+
+	rose->timer.data     = (unsigned long)sk;	/* handed back to rose_timer_expiry() */
+	rose->timer.function = &rose_timer_expiry;
+	rose->timer.expires  = jiffies + rose->hb;	/* hb is added to jiffies as-is */
+
+	add_timer(&rose->timer);
+}
+
+void rose_start_idletimer(struct sock *sk)
+{
+	struct rose_sock *rose = rose_sk(sk);
+
+	del_timer(&rose->idletimer);	/* always cancelled, even when not re-armed below */
+
+	if (rose->idle > 0) {		/* an idle value of 0 leaves the idle timer off */
+		rose->idletimer.data     = (unsigned long)sk;
+		rose->idletimer.function = &rose_idletimer_expiry;
+		rose->idletimer.expires  = jiffies + rose->idle;
+
+		add_timer(&rose->idletimer);
+	}
+}
+
+void rose_stop_heartbeat(struct sock *sk)
+{
+	del_timer(&sk->sk_timer);	/* the timer armed by rose_start_heartbeat() */
+}
+
+void rose_stop_timer(struct sock *sk)
+{
+	del_timer(&rose_sk(sk)->timer);	/* the timer shared by the t1/t2/t3/hb starters */
+}
+
+void rose_stop_idletimer(struct sock *sk)
+{
+	del_timer(&rose_sk(sk)->idletimer);	/* the timer armed by rose_start_idletimer() */
+}
+
+static void rose_heartbeat_expiry(unsigned long param)
+{
+	struct sock *sk = (struct sock *)param;	/* set as timer data in rose_start_heartbeat() */
+	struct rose_sock *rose = rose_sk(sk);
+
+	bh_lock_sock(sk);
+	switch (rose->state) {
+	case ROSE_STATE_0:
+		/* Magic here: If we listen() and a new link dies before it
+		   is accepted() it isn't 'dead' so doesn't get removed. */
+		if (sock_flag(sk, SOCK_DESTROY) ||
+		    (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) {
+			bh_unlock_sock(sk);	/* unlock before destroying; heartbeat is not re-armed */
+			rose_destroy_socket(sk);
+			return;
+		}
+		break;
+
+	case ROSE_STATE_3:
+		/*
+		 * Check for the state of the receive buffer.
+		 */
+		if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) &&
+		    (rose->condition & ROSE_COND_OWN_RX_BUSY)) {
+			rose->condition &= ~ROSE_COND_OWN_RX_BUSY;	/* below half of sk_rcvbuf: busy condition ends */
+			rose->condition &= ~ROSE_COND_ACK_PENDING;
+			rose->vl         = rose->vr;
+			rose_write_internal(sk, ROSE_RR);
+			rose_stop_timer(sk);	/* HB */
+			break;
+		}
+		break;
+	}
+
+	rose_start_heartbeat(sk);	/* re-arm for the next heartbeat period */
+	bh_unlock_sock(sk);
+}
+
+static void rose_timer_expiry(unsigned long param)
+{
+	struct sock *sk = (struct sock *)param;	/* set as timer data by the rose_start_*timer() helpers */
+	struct rose_sock *rose = rose_sk(sk);
+
+	bh_lock_sock(sk);
+	switch (rose->state) {	/* the state tells which of the shared timers expired */
+	case ROSE_STATE_1:	/* T1 */
+	case ROSE_STATE_4:	/* T2 */
+		rose_write_internal(sk, ROSE_CLEAR_REQUEST);
+		rose->state = ROSE_STATE_2;
+		rose_start_t3timer(sk);	/* T3 then bounds the wait in state 2 */
+		break;
+
+	case ROSE_STATE_2:	/* T3 */
+		rose->neighbour->use--;
+		rose_disconnect(sk, ETIMEDOUT, -1, -1);	/* -1, -1: keep stored cause and diagnostic */
+		break;
+
+	case ROSE_STATE_3:	/* HB */
+		if (rose->condition & ROSE_COND_ACK_PENDING) {
+			rose->condition &= ~ROSE_COND_ACK_PENDING;
+			rose_enquiry_response(sk);
+		}
+		break;
+	}
+	bh_unlock_sock(sk);
+}
+
+static void rose_idletimer_expiry(unsigned long param)
+{
+	struct sock *sk = (struct sock *)param;	/* set as timer data in rose_start_idletimer() */
+
+	bh_lock_sock(sk);
+	rose_clear_queues(sk);
+
+	rose_write_internal(sk, ROSE_CLEAR_REQUEST);	/* idle for too long: clear the call */
+	rose_sk(sk)->state = ROSE_STATE_2;
+
+	rose_start_t3timer(sk);
+
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = 0;	/* an idle timeout is not reported as an error */
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+
+	if (!sock_flag(sk, SOCK_DEAD)) {	/* notify once, then mark the socket dead */
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+	bh_unlock_sock(sk);
+}
diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c
new file mode 100644
index 00000000..df6d9dac
--- /dev/null
+++ b/net/rose/sysctl_net_rose.c
@@ -0,0 +1,135 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
+ */
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+#include <net/ax25.h>
+#include <net/rose.h>
+
+static int min_timer[]  = {1 * HZ};	/* shared bounds for the request and hold-back timeouts */
+static int max_timer[]  = {300 * HZ};
+static int min_idle[]   = {0 * HZ};	/* bounds for no_activity_timeout */
+static int max_idle[]   = {65535 * HZ};
+static int min_route[1],       max_route[] = {1};	/* routing_control: 0..1 (min_route is zero-initialised) */
+static int min_ftimer[] = {60 * HZ};	/* bounds for link_fail_timeout */
+static int max_ftimer[] = {600 * HZ};
+static int min_maxvcs[] = {1}, max_maxvcs[] = {254};	/* bounds for maximum_virtual_circuits */
+static int min_window[] = {1}, max_window[] = {7};	/* bounds for window_size */
+
+static struct ctl_table_header *rose_table_header;	/* set on register, used on unregister */
+
+static ctl_table rose_table[] = {	/* one int entry per tunable, each limited by extra1/extra2 */
+	{
+		.procname	= "restart_request_timeout",
+		.data		= &sysctl_rose_restart_request_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_timer,	/* lower bound */
+		.extra2		= &max_timer	/* upper bound */
+	},
+	{
+		.procname	= "call_request_timeout",
+		.data		= &sysctl_rose_call_request_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_timer,
+		.extra2		= &max_timer
+	},
+	{
+		.procname	= "reset_request_timeout",
+		.data		= &sysctl_rose_reset_request_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_timer,
+		.extra2		= &max_timer
+	},
+	{
+		.procname	= "clear_request_timeout",
+		.data		= &sysctl_rose_clear_request_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_timer,
+		.extra2		= &max_timer
+	},
+	{
+		.procname	= "no_activity_timeout",
+		.data		= &sysctl_rose_no_activity_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_idle,
+		.extra2		= &max_idle
+	},
+	{
+		.procname	= "acknowledge_hold_back_timeout",
+		.data		= &sysctl_rose_ack_hold_back_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_timer,
+		.extra2		= &max_timer
+	},
+	{
+		.procname	= "routing_control",
+		.data		= &sysctl_rose_routing_control,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_route,
+		.extra2		= &max_route
+	},
+	{
+		.procname	= "link_fail_timeout",
+		.data		= &sysctl_rose_link_fail_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ftimer,
+		.extra2		= &max_ftimer
+	},
+	{
+		.procname	= "maximum_virtual_circuits",
+		.data		= &sysctl_rose_maximum_vcs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_maxvcs,
+		.extra2		= &max_maxvcs
+	},
+	{
+		.procname	= "window_size",
+		.data		= &sysctl_rose_window_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_window,
+		.extra2		= &max_window
+	},
+	{ }	/* empty entry terminates the table */
+};
+
+static struct ctl_path rose_path[] = {	/* path components "net" then "rose" for the table */
+	{ .procname = "net", },
+	{ .procname = "rose", },
+	{ }	/* empty entry terminates the path */
+};
+
+void __init rose_register_sysctl(void)
+{
+	rose_table_header = register_sysctl_paths(rose_path, rose_table);	/* result is kept for unregistering, not checked */
+}
+
+void rose_unregister_sysctl(void)
+{
+	unregister_sysctl_table(rose_table_header);	/* header saved by rose_register_sysctl() */
+}
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
new file mode 100644
index 00000000..0d3103c4
--- /dev/null
+++ b/net/rxrpc/Kconfig
@@ -0,0 +1,44 @@
+#
+# RxRPC session sockets
+#
+
+config AF_RXRPC
+	tristate "RxRPC session sockets"
+	depends on INET && EXPERIMENTAL
+	select CRYPTO
+	select KEYS
+	help
+	  Say Y or M here to include support for RxRPC session sockets (just
+	  the transport part, not the presentation part: (un)marshalling is
+	  left to the application).
+
+	  These are used for AFS kernel filesystem and userspace utilities.
+
+	  This module at the moment only supports client operations and is
+	  currently incomplete.
+
+	  See Documentation/networking/rxrpc.txt.
+
+
+config AF_RXRPC_DEBUG
+	bool "RxRPC dynamic debugging"
+	depends on AF_RXRPC
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See Documentation/networking/rxrpc.txt.
+
+
+config RXKAD
+	tristate "RxRPC Kerberos security"
+	depends on AF_RXRPC
+	select CRYPTO
+	select CRYPTO_MANAGER
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_PCBC
+	select CRYPTO_FCRYPT
+	help
+	  Provide Kerberos 4 and AFS kaserver security handling for AF_RXRPC
+	  through the use of the key retention service.
+
+	  See Documentation/networking/rxrpc.txt.
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
new file mode 100644
index 00000000..d1c3429b
--- /dev/null
+++ b/net/rxrpc/Makefile
@@ -0,0 +1,29 @@
+#
+# Makefile for Linux kernel RxRPC
+#
+
+af-rxrpc-y := \
+	af_rxrpc.o \
+	ar-accept.o \
+	ar-ack.o \
+	ar-call.o \
+	ar-connection.o \
+	ar-connevent.o \
+	ar-error.o \
+	ar-input.o \
+	ar-key.o \
+	ar-local.o \
+	ar-output.o \
+	ar-peer.o \
+	ar-recvmsg.o \
+	ar-security.o \
+	ar-skbuff.o \
+	ar-transport.o
+
+ifeq ($(CONFIG_PROC_FS),y)
+af-rxrpc-y += ar-proc.o
+endif
+
+obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o
+
+obj-$(CONFIG_RXKAD) += rxkad.o
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
new file mode 100644
index 00000000..74c064c0
--- /dev/null
+++ b/net/rxrpc/af_rxrpc.c
@@ -0,0 +1,889 @@
+/* AF_RXRPC implementation
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/key-type.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+MODULE_DESCRIPTION("RxRPC network protocol");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_RXRPC);
+
+unsigned rxrpc_debug; // = RXRPC_DEBUG_KPROTO;
+module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "RxRPC debugging mask");
+
+static int sysctl_rxrpc_max_qlen __read_mostly = 10;
+
+static struct proto rxrpc_proto;
+static const struct proto_ops rxrpc_rpc_ops;
+
+/* local epoch for detecting local-end reset */
+__be32 rxrpc_epoch;
+
+/* current debugging ID */
+atomic_t rxrpc_debug_id;
+
+/* count of skbs currently in use */
+atomic_t rxrpc_n_skbs;
+
+struct workqueue_struct *rxrpc_workqueue;
+
+static void rxrpc_sock_destructor(struct sock *);
+
+/*
+ * see if an RxRPC socket is currently writable
+ */
+static inline int rxrpc_writable(struct sock *sk)
+{
+	return atomic_read(&sk->sk_wmem_alloc) < (size_t) sk->sk_sndbuf;
+}
+
+/*
+ * write-space callback: wake sleepers and signal POLL_OUT once writable again
+ */
+static void rxrpc_write_space(struct sock *sk)
+{
+	_enter("%p", sk);
+	rcu_read_lock();	/* sk->sk_wq is fetched with rcu_dereference() */
+	if (rxrpc_writable(sk)) {
+		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible(&wq->wait);
+		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * validate an RxRPC address passed in by bind(), connect() or sendmsg()
+ */
+static int rxrpc_validate_address(struct rxrpc_sock *rx,
+				  struct sockaddr_rxrpc *srx,
+				  int len)
+{
+	if (len < sizeof(struct sockaddr_rxrpc))
+		return -EINVAL;
+
+	if (srx->srx_family != AF_RXRPC)
+		return -EAFNOSUPPORT;
+
+	if (srx->transport_type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	len -= offsetof(struct sockaddr_rxrpc, transport); /* room for transport */
+	if (srx->transport_len < sizeof(sa_family_t) ||
+	    srx->transport_len > len)
+		return -EINVAL;
+
+	if (srx->transport.family != rx->proto) /* proto set in rxrpc_create() */
+		return -EAFNOSUPPORT;
+
+	switch (srx->transport.family) {
+	case AF_INET:
+		_debug("INET: %x @ %pI4",
+		       ntohs(srx->transport.sin.sin_port),
+		       &srx->transport.sin.sin_addr);
+		if (srx->transport_len > 8) /* presumably family + port + addr */
+			memset((void *)&srx->transport + 8, 0,
+			       srx->transport_len - 8); /* zeroes caller's buffer */
+		break;
+
+	case AF_INET6:
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	return 0;
+}
+
+/*
+ * bind a local address to an RxRPC socket
+ */
+static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
+{
+	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr;
+	struct sock *sk = sock->sk;
+	struct rxrpc_local *local;
+	struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
+	__be16 service_id;
+	int ret;
+
+	_enter("%p,%p,%d", rx, saddr, len);
+
+	ret = rxrpc_validate_address(rx, srx, len);
+	if (ret < 0)
+		goto error;
+
+	lock_sock(&rx->sk);
+
+	if (rx->sk.sk_state != RXRPC_UNCONNECTED) {
+		ret = -EINVAL;
+		goto error_unlock;
+	}
+
+	memcpy(&rx->srx, srx, sizeof(rx->srx));
+
+	/* find a local transport endpoint if we don't have one already */
+	local = rxrpc_lookup_local(&rx->srx);
+	if (IS_ERR(local)) {
+		ret = PTR_ERR(local);
+		goto error_unlock;
+	}
+
+	rx->local = local;
+	if (srx->srx_service) {
+		service_id = htons(srx->srx_service);
+		write_lock_bh(&local->services_lock);
+		list_for_each_entry(prx, &local->services, listen_link) {
+			if (prx->service_id == service_id)
+				goto service_in_use;
+		}
+
+		rx->service_id = service_id;
+		list_add_tail(&rx->listen_link, &local->services);
+		write_unlock_bh(&local->services_lock);
+
+		rx->sk.sk_state = RXRPC_SERVER_BOUND;
+	} else {
+		rx->sk.sk_state = RXRPC_CLIENT_BOUND;
+	}
+
+	release_sock(&rx->sk);
+	_leave(" = 0");
+	return 0;
+
+service_in_use:
+	ret = -EADDRINUSE;
+	write_unlock_bh(&local->services_lock);
+error_unlock:
+	release_sock(&rx->sk);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * set the number of pending calls permitted on a listening socket
+ */
+static int rxrpc_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct rxrpc_sock *rx = rxrpc_sk(sk);
+	int ret;
+
+	_enter("%p,%d", rx, backlog);
+
+	lock_sock(&rx->sk);
+
+	switch (rx->sk.sk_state) {
+	case RXRPC_UNCONNECTED:
+		ret = -EADDRNOTAVAIL;
+		break;
+	case RXRPC_CLIENT_BOUND:
+	case RXRPC_CLIENT_CONNECTED:
+	default:
+		ret = -EBUSY;
+		break;
+	case RXRPC_SERVER_BOUND:
+		ASSERT(rx->local != NULL);
+		sk->sk_max_ack_backlog = backlog;
+		rx->sk.sk_state = RXRPC_SERVER_LISTENING;
+		ret = 0;
+		break;
+	}
+
+	release_sock(&rx->sk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * find a transport by address
+ */
+static struct rxrpc_transport *rxrpc_name_to_transport(struct socket *sock,
+						       struct sockaddr *addr,
+						       int addr_len, int flags,
+						       gfp_t gfp)
+{
+	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
+	struct rxrpc_transport *trans;
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	struct rxrpc_peer *peer;
+
+	_enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
+
+	ASSERT(rx->local != NULL);
+	ASSERT(rx->sk.sk_state > RXRPC_UNCONNECTED);
+
+	if (rx->srx.transport_type != srx->transport_type)
+		return ERR_PTR(-ESOCKTNOSUPPORT);
+	if (rx->srx.transport.family != srx->transport.family)
+		return ERR_PTR(-EAFNOSUPPORT);
+
+	/* find a remote transport endpoint from the local one */
+	peer = rxrpc_get_peer(srx, gfp);
+	if (IS_ERR(peer))
+		return ERR_CAST(peer);
+
+	/* find a transport */
+	trans = rxrpc_get_transport(rx->local, peer, gfp);
+	rxrpc_put_peer(peer);
+	_leave(" = %p", trans);
+	return trans;
+}
+
+/**
+ * rxrpc_kernel_begin_call - Allow a kernel service to begin a call
+ * @sock: The socket on which to make the call
+ * @srx: The address of the peer to contact (defaults to socket setting)
+ * @key: The security context to use (defaults to socket setting)
+ * @user_call_ID: The ID to use
+ *
+ * Allow a kernel service to begin a call on the nominated socket.  This just
+ * sets up all the internal tracking structures and allocates connection and
+ * call IDs as appropriate.  The call to be used is returned.
+ *
+ * The default socket destination address and security may be overridden by
+ * supplying @srx and @key.
+ */
+struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+					   struct sockaddr_rxrpc *srx,
+					   struct key *key,
+					   unsigned long user_call_ID,
+					   gfp_t gfp)
+{
+	struct rxrpc_conn_bundle *bundle;
+	struct rxrpc_transport *trans;
+	struct rxrpc_call *call;
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	__be16 service_id;
+
+	_enter(",,%x,%lx", key_serial(key), user_call_ID);
+
+	lock_sock(&rx->sk);
+
+	if (srx) {
+		trans = rxrpc_name_to_transport(sock, (struct sockaddr *) srx,
+						sizeof(*srx), 0, gfp);
+		if (IS_ERR(trans)) {
+			call = ERR_CAST(trans);
+			trans = NULL;
+			goto out_notrans;
+		}
+	} else {
+		trans = rx->trans;
+		if (!trans) {
+			call = ERR_PTR(-ENOTCONN);
+			goto out_notrans;
+		}
+		atomic_inc(&trans->usage);
+	}
+
+	service_id = rx->service_id;
+	if (srx)
+		service_id = htons(srx->srx_service);
+
+	if (!key)
+		key = rx->key;
+	if (key && !key->payload.data)
+		key = NULL; /* a no-security key */
+
+	bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp);
+	if (IS_ERR(bundle)) {
+		call = ERR_CAST(bundle);
+		goto out;
+	}
+
+	call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID, true,
+				     gfp);
+	rxrpc_put_bundle(trans, bundle);
+out:
+	rxrpc_put_transport(trans);
+out_notrans:
+	release_sock(&rx->sk);
+	_leave(" = %p", call);
+	return call;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_begin_call);
+
+/**
+ * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using
+ * @call: The call to end
+ *
+ * Allow a kernel service to end a call it was using.  The call must be
+ * complete before this is called (the call should be aborted if necessary).
+ */
+void rxrpc_kernel_end_call(struct rxrpc_call *call)
+{
+	_enter("%d{%d}", call->debug_id, atomic_read(&call->usage));
+	rxrpc_remove_user_ID(call->socket, call);
+	rxrpc_put_call(call);
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_end_call);
+
+/**
+ * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages
+ * @sock: The socket to intercept received messages on
+ * @interceptor: The function to pass the messages to
+ *
+ * Allow a kernel service to intercept messages heading for the Rx queue on an
+ * RxRPC socket.  They get passed to the specified function instead.
+ * @interceptor should free the socket buffers it is given.  @interceptor is
+ * called with the socket receive queue spinlock held and softirqs disabled -
+ * this ensures that the messages will be delivered in the right order.
+ */
+void rxrpc_kernel_intercept_rx_messages(struct socket *sock,
+					rxrpc_interceptor_t interceptor)
+{
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+
+	_enter("");
+	rx->interceptor = interceptor;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages);
+
+/*
+ * connect an RxRPC socket
+ * - this just targets it at a specific destination; no actual connection
+ *   negotiation takes place
+ */
+static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
+			 int addr_len, int flags)
+{
+	struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) addr;
+	struct sock *sk = sock->sk;
+	struct rxrpc_transport *trans;
+	struct rxrpc_local *local;
+	struct rxrpc_sock *rx = rxrpc_sk(sk);
+	int ret;
+
+	_enter("%p,%p,%d,%d", rx, addr, addr_len, flags);
+
+	ret = rxrpc_validate_address(rx, srx, addr_len);
+	if (ret < 0) {
+		_leave(" = %d [bad addr]", ret);
+		return ret;
+	}
+
+	lock_sock(&rx->sk);
+
+	switch (rx->sk.sk_state) {
+	case RXRPC_UNCONNECTED:
+		/* find a local transport endpoint if we don't have one already */
+		ASSERTCMP(rx->local, ==, NULL);
+		rx->srx.srx_family = AF_RXRPC;
+		rx->srx.srx_service = 0;
+		rx->srx.transport_type = srx->transport_type;
+		rx->srx.transport_len = sizeof(sa_family_t); /* family only */
+		rx->srx.transport.family = srx->transport.family;
+		local = rxrpc_lookup_local(&rx->srx);
+		if (IS_ERR(local)) {
+			release_sock(&rx->sk);
+			return PTR_ERR(local);
+		}
+		rx->local = local;
+		rx->sk.sk_state = RXRPC_CLIENT_BOUND; /* fall through */
+	case RXRPC_CLIENT_BOUND:
+		break; /* bound (possibly just now): go and pick a transport */
+	case RXRPC_CLIENT_CONNECTED:
+		release_sock(&rx->sk);
+		return -EISCONN;
+	default:
+		release_sock(&rx->sk);
+		return -EBUSY; /* server sockets can't connect as well */
+	}
+
+	trans = rxrpc_name_to_transport(sock, addr, addr_len, flags,
+					GFP_KERNEL);
+	if (IS_ERR(trans)) {
+		release_sock(&rx->sk);
+		_leave(" = %ld", PTR_ERR(trans));
+		return PTR_ERR(trans);
+	}
+
+	rx->trans = trans; /* put again in rxrpc_release_sock() */
+	rx->service_id = htons(srx->srx_service);
+	rx->sk.sk_state = RXRPC_CLIENT_CONNECTED;
+
+	release_sock(&rx->sk);
+	return 0;
+}
+
+/*
+ * send a message through an RxRPC socket
+ * - in a client this does a number of things:
+ *   - finds/sets up a connection for the security specified (if any)
+ *   - initiates a call (ID in control data)
+ *   - ends the request phase of a call (if MSG_MORE is not set)
+ *   - sends a call data packet
+ *   - may send an abort (abort code in control data)
+ */
+static int rxrpc_sendmsg(struct kiocb *iocb, struct socket *sock,
+			 struct msghdr *m, size_t len)
+{
+	struct rxrpc_transport *trans;
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	int ret;
+
+	_enter(",{%d},,%zu", rx->sk.sk_state, len);
+
+	if (m->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	if (m->msg_name) {
+		ret = rxrpc_validate_address(rx, m->msg_name, m->msg_namelen);
+		if (ret < 0) {
+			_leave(" = %d [bad addr]", ret);
+			return ret;
+		}
+	}
+
+	trans = NULL;
+	lock_sock(&rx->sk);
+
+	if (m->msg_name) {
+		ret = -EISCONN; /* NOTE(review): dead store, always reassigned */
+		trans = rxrpc_name_to_transport(sock, m->msg_name,
+						m->msg_namelen, 0, GFP_KERNEL);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
+		}
+	} else {
+		trans = rx->trans; /* may be NULL if the socket isn't connected */
+		if (trans)
+			atomic_inc(&trans->usage);
+	}
+
+	switch (rx->sk.sk_state) {
+	case RXRPC_SERVER_LISTENING:
+		if (!m->msg_name) {
+			ret = rxrpc_server_sendmsg(iocb, rx, m, len);
+			break;
+		} /* fall through: with an address this takes the client path */
+	case RXRPC_SERVER_BOUND:
+	case RXRPC_CLIENT_BOUND:
+		if (!m->msg_name) {
+			ret = -ENOTCONN;
+			break;
+		} /* fall through */
+	case RXRPC_CLIENT_CONNECTED:
+		ret = rxrpc_client_sendmsg(iocb, rx, trans, m, len);
+		break;
+	default:
+		ret = -ENOTCONN;
+		break;
+	}
+
+out:
+	release_sock(&rx->sk);
+	if (trans) /* drop the ref taken above */
+		rxrpc_put_transport(trans);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * set RxRPC socket options; every option requires an unconnected socket
+ * - returns 0 on success, -EOPNOTSUPP for an unsupported level or option
+ */
+static int rxrpc_setsockopt(struct socket *sock, int level, int optname,
+			    char __user *optval, unsigned int optlen)
+{
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	unsigned min_sec_level;
+	int ret;
+
+	_enter(",%d,%d,,%d", level, optname, optlen);
+	lock_sock(&rx->sk);
+	ret = -EOPNOTSUPP;
+
+	if (level == SOL_RXRPC) {
+		switch (optname) {
+		case RXRPC_EXCLUSIVE_CONNECTION: /* a flag: takes no data */
+			ret = -EINVAL;
+			if (optlen != 0)
+				goto error;
+			ret = -EISCONN;
+			if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+				goto error;
+			set_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags);
+			goto success;
+
+		case RXRPC_SECURITY_KEY: /* may only be set once */
+			ret = -EINVAL;
+			if (rx->key)
+				goto error;
+			ret = -EISCONN;
+			if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+				goto error;
+			ret = rxrpc_request_key(rx, optval, optlen);
+			goto error; /* common exit: ret is the helper's result */
+
+		case RXRPC_SECURITY_KEYRING: /* refused once rx->key is set */
+			ret = -EINVAL;
+			if (rx->key)
+				goto error;
+			ret = -EISCONN;
+			if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+				goto error;
+			ret = rxrpc_server_keyring(rx, optval, optlen);
+			goto error; /* common exit: ret is the helper's result */
+
+		case RXRPC_MIN_SECURITY_LEVEL: /* data is one unsigned int */
+			ret = -EINVAL;
+			if (optlen != sizeof(unsigned))
+				goto error;
+			ret = -EISCONN;
+			if (rx->sk.sk_state != RXRPC_UNCONNECTED)
+				goto error;
+			ret = get_user(min_sec_level,
+				       (unsigned __user *) optval);
+			if (ret < 0)
+				goto error;
+			ret = -EINVAL;
+			if (min_sec_level > RXRPC_SECURITY_MAX)
+				goto error;
+			rx->min_sec_level = min_sec_level;
+			goto success;
+
+		default:
+			break;
+		}
+	}
+	goto error; /* unsupported level or option: ret is still -EOPNOTSUPP */
+success:
+	ret = 0;
+error:
+	release_sock(&rx->sk);
+	return ret;
+}
+
+/*
+ * permit an RxRPC socket to be polled
+ */
+static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
+			       poll_table *wait)
+{
+	unsigned int mask;
+	struct sock *sk = sock->sk;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	/* the socket is readable if there are any messages waiting on the Rx
+	 * queue */
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	/* the socket is writable if there is space to add new data to the
+	 * socket; there is no guarantee that any particular call in progress
+	 * on the socket may have space in the Tx ACK window */
+	if (rxrpc_writable(sk))
+		mask |= POLLOUT | POLLWRNORM;
+
+	return mask;
+}
+
+/*
+ * create an RxRPC socket
+ */
+static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
+{
+	struct rxrpc_sock *rx;
+	struct sock *sk;
+
+	_enter("%p,%d", sock, protocol);
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	/* we support transport protocol UDP only */
+	if (protocol != PF_INET)
+		return -EPROTONOSUPPORT;
+
+	if (sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	sock->ops = &rxrpc_rpc_ops;
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+	sk->sk_state		= RXRPC_UNCONNECTED;
+	sk->sk_write_space	= rxrpc_write_space;
+	sk->sk_max_ack_backlog	= sysctl_rxrpc_max_qlen;
+	sk->sk_destruct		= rxrpc_sock_destructor;
+
+	rx = rxrpc_sk(sk);
+	rx->proto = protocol;
+	rx->calls = RB_ROOT;
+
+	INIT_LIST_HEAD(&rx->listen_link);
+	INIT_LIST_HEAD(&rx->secureq);
+	INIT_LIST_HEAD(&rx->acceptq);
+	rwlock_init(&rx->call_lock);
+	memset(&rx->srx, 0, sizeof(rx->srx));
+
+	_leave(" = 0 [%p]", rx);
+	return 0;
+}
+
+/*
+ * RxRPC socket destructor
+ */
+static void rxrpc_sock_destructor(struct sock *sk)
+{
+	_enter("%p", sk);
+
+	rxrpc_purge_queue(&sk->sk_receive_queue);
+
+	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+	WARN_ON(!sk_unhashed(sk));
+	WARN_ON(sk->sk_socket);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		printk("Attempt to release alive rxrpc socket: %p\n", sk);
+		return;
+	}
+}
+
+/*
+ * release an RxRPC socket
+ */
+static int rxrpc_release_sock(struct sock *sk)
+{
+	struct rxrpc_sock *rx = rxrpc_sk(sk);
+
+	_enter("%p{%d,%d}", sk, sk->sk_state, atomic_read(&sk->sk_refcnt));
+
+	/* declare the socket closed for business */
+	sock_orphan(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	spin_lock_bh(&sk->sk_receive_queue.lock);
+	sk->sk_state = RXRPC_CLOSE;
+	spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+	ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1);
+
+	if (!list_empty(&rx->listen_link)) {
+		write_lock_bh(&rx->local->services_lock);
+		list_del(&rx->listen_link);
+		write_unlock_bh(&rx->local->services_lock);
+	}
+
+	/* try to flush out this socket */
+	rxrpc_release_calls_on_socket(rx);
+	flush_workqueue(rxrpc_workqueue);
+	rxrpc_purge_queue(&sk->sk_receive_queue);
+
+	if (rx->conn) {
+		rxrpc_put_connection(rx->conn);
+		rx->conn = NULL;
+	}
+
+	if (rx->bundle) {
+		rxrpc_put_bundle(rx->trans, rx->bundle);
+		rx->bundle = NULL;
+	}
+	if (rx->trans) {
+		rxrpc_put_transport(rx->trans);
+		rx->trans = NULL;
+	}
+	if (rx->local) {
+		rxrpc_put_local(rx->local);
+		rx->local = NULL;
+	}
+
+	key_put(rx->key);
+	rx->key = NULL;
+	key_put(rx->securities);
+	rx->securities = NULL;
+	sock_put(sk);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * release an RxRPC BSD socket on close() or equivalent
+ */
+static int rxrpc_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	_enter("%p{%p}", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	sock->sk = NULL;
+
+	return rxrpc_release_sock(sk);
+}
+
+/*
+ * RxRPC socket operations; the family must match rxrpc_family_ops (PF_RXRPC)
+ */
+static const struct proto_ops rxrpc_rpc_ops = {
+	.family		= PF_RXRPC,
+	.owner		= THIS_MODULE,
+	.release	= rxrpc_release,
+	.bind		= rxrpc_bind,
+	.connect	= rxrpc_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= sock_no_getname,
+	.poll		= rxrpc_poll,
+	.ioctl		= sock_no_ioctl,
+	.listen		= rxrpc_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= rxrpc_setsockopt,
+	.getsockopt	= sock_no_getsockopt,
+	.sendmsg	= rxrpc_sendmsg,
+	.recvmsg	= rxrpc_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+static struct proto rxrpc_proto = {
+	.name		= "RXRPC",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct rxrpc_sock),
+	.max_header	= sizeof(struct rxrpc_header),
+};
+
+static const struct net_proto_family rxrpc_family_ops = {
+	.family	= PF_RXRPC,
+	.create = rxrpc_create,
+	.owner	= THIS_MODULE,
+};
+
+/*
+ * initialise and register the RxRPC protocol
+ */
+static int __init af_rxrpc_init(void)
+{
+	struct sk_buff *dummy_skb;
+	int ret = -1;
+
+	BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof(dummy_skb->cb));
+
+	rxrpc_epoch = htonl(get_seconds());
+
+	ret = -ENOMEM;
+	rxrpc_call_jar = kmem_cache_create(
+		"rxrpc_call_jar", sizeof(struct rxrpc_call), 0,
+		SLAB_HWCACHE_ALIGN, NULL);
+	if (!rxrpc_call_jar) {
+		printk(KERN_NOTICE "RxRPC: Failed to allocate call jar\n");
+		goto error_call_jar;
+	}
+
+	rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1);
+	if (!rxrpc_workqueue) {
+		printk(KERN_NOTICE "RxRPC: Failed to allocate work queue\n");
+		goto error_work_queue;
+	}
+
+	ret = proto_register(&rxrpc_proto, 1);
+	if (ret < 0) {
+		printk(KERN_CRIT "RxRPC: Cannot register protocol\n");
+		goto error_proto;
+	}
+
+	ret = sock_register(&rxrpc_family_ops);
+	if (ret < 0) {
+		printk(KERN_CRIT "RxRPC: Cannot register socket family\n");
+		goto error_sock;
+	}
+
+	ret = register_key_type(&key_type_rxrpc);
+	if (ret < 0) {
+		printk(KERN_CRIT "RxRPC: Cannot register client key type\n");
+		goto error_key_type;
+	}
+
+	ret = register_key_type(&key_type_rxrpc_s);
+	if (ret < 0) {
+		printk(KERN_CRIT "RxRPC: Cannot register server key type\n");
+		goto error_key_type_s;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_net_fops_create(&init_net, "rxrpc_calls", 0, &rxrpc_call_seq_fops);
+	proc_net_fops_create(&init_net, "rxrpc_conns", 0, &rxrpc_connection_seq_fops);
+#endif
+	return 0;
+
+error_key_type_s:
+	unregister_key_type(&key_type_rxrpc);
+error_key_type:
+	sock_unregister(PF_RXRPC);
+error_sock:
+	proto_unregister(&rxrpc_proto);
+error_proto:
+	destroy_workqueue(rxrpc_workqueue);
+error_work_queue:
+	kmem_cache_destroy(rxrpc_call_jar);
+error_call_jar:
+	return ret;
+}
+
+/*
+ * unregister the RxRPC protocol
+ */
+static void __exit af_rxrpc_exit(void)
+{
+	_enter("");
+	unregister_key_type(&key_type_rxrpc_s);
+	unregister_key_type(&key_type_rxrpc);
+	sock_unregister(PF_RXRPC);
+	proto_unregister(&rxrpc_proto);
+	rxrpc_destroy_all_calls();
+	rxrpc_destroy_all_connections();
+	rxrpc_destroy_all_transports();
+	rxrpc_destroy_all_peers();
+	rxrpc_destroy_all_locals();
+
+	ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0);
+
+	_debug("flush scheduled work");
+	flush_workqueue(rxrpc_workqueue);
+	proc_net_remove(&init_net, "rxrpc_conns");
+	proc_net_remove(&init_net, "rxrpc_calls");
+	destroy_workqueue(rxrpc_workqueue);
+	kmem_cache_destroy(rxrpc_call_jar);
+	_leave("");
+}
+
+module_init(af_rxrpc_init);
+module_exit(af_rxrpc_exit);
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
new file mode 100644
index 00000000..6d79310f
--- /dev/null
+++ b/net/rxrpc/ar-accept.c
@@ -0,0 +1,510 @@
+/* incoming call handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * transmit a BUSY packet to the peer, reusing (and overwriting) the Rx header
+ */
+static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
+		      struct rxrpc_header *hdr)
+{
+	struct msghdr msg;
+	struct kvec iov[1];
+	size_t len;
+	int ret;
+
+	_enter("%d,,", local->debug_id);
+
+	msg.msg_name	= &srx->transport.sin; /* reply to the packet's source */
+	msg.msg_namelen	= sizeof(srx->transport.sin);
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= 0;
+
+	hdr->seq	= 0;
+	hdr->type	= RXRPC_PACKET_TYPE_BUSY;
+	hdr->flags	= 0;
+	hdr->userStatus	= 0;
+	hdr->_rsvd	= 0;
+
+	iov[0].iov_base	= hdr; /* the packet is just the header, no payload */
+	iov[0].iov_len	= sizeof(*hdr);
+
+	len = iov[0].iov_len;
+
+	hdr->serial = htonl(1);
+	_proto("Tx BUSY %%%u", ntohl(hdr->serial));
+
+	ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
+	if (ret < 0) {
+		_leave(" = -EAGAIN [sendmsg failed: %d]", ret);
+		return -EAGAIN; /* the underlying error is only logged */
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * accept an incoming call that needs peer, transport and/or connection setting
+ * up (returns 0 or a negative error; on error the skb is left to the caller)
+ */
+static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
+				      struct rxrpc_sock *rx,
+				      struct sk_buff *skb,
+				      struct sockaddr_rxrpc *srx)
+{
+	struct rxrpc_connection *conn;
+	struct rxrpc_transport *trans;
+	struct rxrpc_skb_priv *sp, *nsp;
+	struct rxrpc_peer *peer;
+	struct rxrpc_call *call;
+	struct sk_buff *notification;
+	int ret;
+
+	_enter("");
+
+	sp = rxrpc_skb(skb);
+
+	/* get a notification message to send to the server app */
+	notification = alloc_skb(0, GFP_NOFS);
+	if (!notification) {
+		_debug("no memory");
+		ret = -ENOMEM;
+		goto error_nofree;
+	}
+	rxrpc_new_skb(notification);
+	notification->mark = RXRPC_SKB_MARK_NEW_CALL;
+
+	peer = rxrpc_get_peer(srx, GFP_NOIO);
+	if (IS_ERR(peer)) {
+		_debug("no peer");
+		ret = -EBUSY;
+		goto error;
+	}
+
+	trans = rxrpc_get_transport(local, peer, GFP_NOIO);
+	rxrpc_put_peer(peer);
+	if (IS_ERR(trans)) {
+		_debug("no trans");
+		ret = -EBUSY;
+		goto error;
+	}
+
+	conn = rxrpc_incoming_connection(trans, &sp->hdr, GFP_NOIO);
+	rxrpc_put_transport(trans);
+	if (IS_ERR(conn)) {
+		_debug("no conn");
+		ret = PTR_ERR(conn);
+		goto error;
+	}
+
+	call = rxrpc_incoming_call(rx, conn, &sp->hdr, GFP_NOIO);
+	rxrpc_put_connection(conn);
+	if (IS_ERR(call)) {
+		_debug("no call");
+		ret = PTR_ERR(call);
+		goto error;
+	}
+
+	/* attach the call to the socket */
+	read_lock_bh(&local->services_lock);
+	if (rx->sk.sk_state == RXRPC_CLOSE)
+		goto invalid_service;
+
+	write_lock(&rx->call_lock);
+	if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) {
+		rxrpc_get_call(call);
+
+		spin_lock(&call->conn->state_lock);
+		if (sp->hdr.securityIndex > 0 &&
+		    call->conn->state == RXRPC_CONN_SERVER_UNSECURED) {
+			_debug("await conn sec");
+			list_add_tail(&call->accept_link, &rx->secureq);
+			call->conn->state = RXRPC_CONN_SERVER_CHALLENGING;
+			atomic_inc(&call->conn->usage);
+			set_bit(RXRPC_CONN_CHALLENGE, &call->conn->events);
+			rxrpc_queue_conn(call->conn);
+		} else {
+			_debug("conn ready");
+			call->state = RXRPC_CALL_SERVER_ACCEPTING;
+			list_add_tail(&call->accept_link, &rx->acceptq);
+			rxrpc_get_call(call);
+			nsp = rxrpc_skb(notification);
+			nsp->call = call;
+
+			ASSERTCMP(atomic_read(&call->usage), >=, 3);
+
+			_debug("notify");
+			spin_lock(&call->lock);
+			ret = rxrpc_queue_rcv_skb(call, notification, true,
+						  false);
+			spin_unlock(&call->lock);
+			notification = NULL; /* now owned by the Rx queue */
+			BUG_ON(ret < 0);
+		}
+		spin_unlock(&call->conn->state_lock);
+
+		_debug("queued");
+	}
+	write_unlock(&rx->call_lock);
+
+	_debug("process");
+	rxrpc_fast_process_packet(call, skb);
+
+	_debug("done");
+	read_unlock_bh(&local->services_lock);
+	rxrpc_free_skb(notification);
+	rxrpc_put_call(call);
+	_leave(" = 0");
+	return 0;
+
+invalid_service:
+	_debug("invalid");
+	read_unlock_bh(&local->services_lock);
+
+	read_lock_bh(&call->state_lock);
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && /* flag, not event */
+	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) {
+		rxrpc_get_call(call);
+		rxrpc_queue_call(call);
+	}
+	read_unlock_bh(&call->state_lock);
+	rxrpc_put_call(call);
+	ret = -ECONNREFUSED;
+error:
+	rxrpc_free_skb(notification);
+error_nofree:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * accept incoming calls that need peer, transport and/or connection setting up
+ * - the packets we get are all incoming client DATA packets that have seq == 1
+ */
+void rxrpc_accept_incoming_calls(struct work_struct *work)
+{
+	struct rxrpc_local *local =
+		container_of(work, struct rxrpc_local, acceptor);
+	struct rxrpc_skb_priv *sp;
+	struct sockaddr_rxrpc srx;
+	struct rxrpc_sock *rx;
+	struct sk_buff *skb;
+	__be16 service_id;
+	int ret;
+
+	_enter("%d", local->debug_id);
+
+	read_lock_bh(&rxrpc_local_lock);
+	if (atomic_read(&local->usage) > 0)
+		rxrpc_get_local(local);
+	else
+		local = NULL;
+	read_unlock_bh(&rxrpc_local_lock);
+	if (!local) {
+		_leave(" [local dead]");
+		return;
+	}
+
+process_next_packet:
+	skb = skb_dequeue(&local->accept_queue);
+	if (!skb) {
+		rxrpc_put_local(local); /* queue drained: drop our ref */
+		_leave("\n");
+		return;
+	}
+
+	_net("incoming call skb %p", skb);
+	sp = rxrpc_skb(skb);
+
+	/* determine the remote address */
+	memset(&srx, 0, sizeof(srx));
+	srx.srx_family = AF_RXRPC;
+	srx.transport.family = local->srx.transport.family;
+	srx.transport_type = local->srx.transport_type;
+	switch (srx.transport.family) {
+	case AF_INET:
+		srx.transport_len = sizeof(struct sockaddr_in);
+		srx.transport.sin.sin_port = udp_hdr(skb)->source;
+		srx.transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+		break;
+	default:
+		goto busy;
+	}
+
+	/* get the socket providing the service */
+	service_id = sp->hdr.serviceId;
+	read_lock_bh(&local->services_lock);
+	list_for_each_entry(rx, &local->services, listen_link) {
+		if (rx->service_id == service_id &&
+		    rx->sk.sk_state != RXRPC_CLOSE)
+			goto found_service;
+	}
+	read_unlock_bh(&local->services_lock);
+	goto invalid_service;
+
+found_service:
+	_debug("found service %hd", ntohs(rx->service_id));
+	if (sk_acceptq_is_full(&rx->sk))
+		goto backlog_full;
+	sk_acceptq_added(&rx->sk);
+	sock_hold(&rx->sk); /* pin the socket once the list lock is dropped */
+	read_unlock_bh(&local->services_lock);
+
+	ret = rxrpc_accept_incoming_call(local, rx, skb, &srx);
+	if (ret < 0)
+		sk_acceptq_removed(&rx->sk);
+	sock_put(&rx->sk);
+	switch (ret) {
+	case -ECONNRESET: /* old calls are ignored */
+	case -ECONNABORTED: /* aborted calls are reaborted or ignored */
+	case 0:
+		goto process_next_packet;
+	case -ECONNREFUSED:
+		goto invalid_service;
+	case -ENOMEM: /* allocation failure is transient: answer BUSY, don't BUG */
+	case -EBUSY:
+		goto busy;
+	case -EKEYREJECTED:
+		goto security_mismatch;
+	default:
+		BUG();
+	}
+
+backlog_full:
+	read_unlock_bh(&local->services_lock);
+busy:
+	rxrpc_busy(local, &srx, &sp->hdr);
+	rxrpc_free_skb(skb);
+	goto process_next_packet;
+
+invalid_service:
+	skb->priority = RX_INVALID_OPERATION;
+	rxrpc_reject_packet(local, skb);
+	goto process_next_packet;
+
+	/* can't change connection security type mid-flow */
+security_mismatch:
+	skb->priority = RX_PROTOCOL_ERROR;
+	rxrpc_reject_packet(local, skb);
+	goto process_next_packet;
+}
+
+/*
+ * handle acceptance of a call by userspace
+ * - assign the user call ID to the call at the front of the queue
+ */
+struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
+				     unsigned long user_call_ID)
+{
+	struct rxrpc_call *call;
+	struct rb_node *parent, **pp;
+	int ret;
+
+	_enter(",%lx", user_call_ID);
+
+	ASSERT(!irqs_disabled());
+
+	write_lock(&rx->call_lock);
+
+	ret = -ENODATA;
+	if (list_empty(&rx->acceptq))
+		goto out;
+
+	/* check the user ID isn't already in use */
+	ret = -EBADSLT;
+	pp = &rx->calls.rb_node;
+	parent = NULL;
+	while (*pp) {
+		parent = *pp;
+		call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+		if (user_call_ID < call->user_call_ID)
+			pp = &(*pp)->rb_left;
+		else if (user_call_ID > call->user_call_ID)
+			pp = &(*pp)->rb_right;
+		else
+			goto out;
+	}
+
+	/* dequeue the first call and check it's still valid */
+	call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+	list_del_init(&call->accept_link);
+	sk_acceptq_removed(&rx->sk);
+
+	write_lock_bh(&call->state_lock);
+	switch (call->state) {
+	case RXRPC_CALL_SERVER_ACCEPTING:
+		call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+		break;
+	case RXRPC_CALL_REMOTELY_ABORTED:
+	case RXRPC_CALL_LOCALLY_ABORTED:
+		ret = -ECONNABORTED;
+		goto out_release;
+	case RXRPC_CALL_NETWORK_ERROR:
+		ret = call->conn->error;
+		goto out_release;
+	case RXRPC_CALL_DEAD:
+		ret = -ETIME;
+		goto out_discard;
+	default:
+		BUG();
+	}
+
+	/* formalise the acceptance */
+	call->user_call_ID = user_call_ID;
+	rb_link_node(&call->sock_node, parent, pp);
+	rb_insert_color(&call->sock_node, &rx->calls);
+	if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
+		BUG();
+	if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events))
+		BUG();
+	rxrpc_queue_call(call);
+
+	rxrpc_get_call(call);
+	write_unlock_bh(&call->state_lock);
+	write_unlock(&rx->call_lock);
+	_leave(" = %p{%d}", call, call->debug_id);
+	return call;
+
+	/* if the call is already dying or dead, then we leave the socket's ref
+	 * on it to be released by rxrpc_dead_call_expired() as induced by
+	 * rxrpc_release_call() */
+out_release:
+	_debug("release %p", call);
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+		rxrpc_queue_call(call);
+out_discard:
+	write_unlock_bh(&call->state_lock);
+	_debug("discard %p", call);
+out:
+	write_unlock(&rx->call_lock);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * handle rejection of a call by userspace
+ * - reject the call at the front of the queue; -ENODATA if none is waiting
+ */
+int rxrpc_reject_call(struct rxrpc_sock *rx)
+{
+	struct rxrpc_call *call;
+	int ret;
+
+	_enter("");
+
+	ASSERT(!irqs_disabled());
+
+	write_lock(&rx->call_lock);
+
+	ret = -ENODATA;
+	if (list_empty(&rx->acceptq))
+		goto out;
+
+	/* dequeue the first call and check it's still valid */
+	call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link);
+	list_del_init(&call->accept_link);
+	sk_acceptq_removed(&rx->sk);
+
+	write_lock_bh(&call->state_lock);
+	switch (call->state) {
+	case RXRPC_CALL_SERVER_ACCEPTING:
+		call->state = RXRPC_CALL_SERVER_BUSY;
+		/* only queue the call if we were the ones to raise the event */
+		if (!test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events))
+			rxrpc_queue_call(call);
+		ret = 0;
+		goto out_release;
+	case RXRPC_CALL_REMOTELY_ABORTED:
+	case RXRPC_CALL_LOCALLY_ABORTED:
+		ret = -ECONNABORTED;
+		goto out_release;
+	case RXRPC_CALL_NETWORK_ERROR:
+		ret = call->conn->error;
+		goto out_release;
+	case RXRPC_CALL_DEAD:
+		ret = -ETIME;
+		goto out_discard;
+	default:
+		BUG();
+	}
+	/* if the call is already dying or dead, then we leave the socket's ref
+	 * on it to be released by rxrpc_dead_call_expired() as induced by
+	 * rxrpc_release_call() */
+out_release:
+	_debug("release %p", call);
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+		rxrpc_queue_call(call);
+out_discard:
+	write_unlock_bh(&call->state_lock);
+	_debug("discard %p", call);
+out:
+	write_unlock(&rx->call_lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/**
+ * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call
+ * @sock: The socket on which the impending call is waiting
+ * @user_call_ID: The tag to attach to the call
+ *
+ * Allow a kernel service to accept an incoming call, assuming the incoming
+ * call is still valid.
+ */
+struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
+					    unsigned long user_call_ID)
+{
+	struct rxrpc_call *call;
+
+	_enter(",%lx", user_call_ID);
+	call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID);
+	_leave(" = %p", call);
+	return call;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_accept_call);
+
+/**
+ * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call
+ * @sock: The socket on which the impending call is waiting
+ *
+ * Allow a kernel service to reject an incoming call with a BUSY message,
+ * assuming the incoming call is still valid.
+ */
+int rxrpc_kernel_reject_call(struct socket *sock)
+{
+	int ret;
+
+	_enter("");
+	ret = rxrpc_reject_call(rxrpc_sk(sock->sk));
+	_leave(" = %d", ret);
+	return ret;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_reject_call);
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
new file mode 100644
index 00000000..f99cfce7
--- /dev/null
+++ b/net/rxrpc/ar-ack.c
@@ -0,0 +1,1307 @@
+/* Management of Tx window, Tx resend, ACKs and out-of-sequence reception
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/circ_buf.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/udp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Deferral, in jiffies, applied to a REQUESTED ACK before it is sent (see
+ * __rxrpc_propose_ACK()); when zero, such ACKs are queued for transmission
+ * straight away instead of being put on the ACK timer.
+ */
+static unsigned rxrpc_ack_defer = 1;
+
+/*
+ * Three-letter names for the ACK reason codes, indexed by reason value and
+ * used in the tracing output of this file.
+ * NOTE(review): the array itself does no bounds checking; an index derived
+ * from a received packet has to be validated by whoever uses it.
+ */
+static const char *const rxrpc_acks[] = {
+	"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL",
+	"-?-"
+};
+
+/*
+ * Relative priority of each ACK reason.  __rxrpc_propose_ACK() lets a
+ * proposal replace the pending reason only if its priority is higher;
+ * reasons without an entry here get priority 0 and are rejected by the
+ * assertion in that function.
+ */
+static const s8 rxrpc_ack_priority[] = {
+	[0]				= 0,
+	[RXRPC_ACK_DELAY]		= 1,
+	[RXRPC_ACK_REQUESTED]		= 2,
+	[RXRPC_ACK_IDLE]		= 3,
+	[RXRPC_ACK_PING_RESPONSE]	= 4,
+	[RXRPC_ACK_DUPLICATE]		= 5,
+	[RXRPC_ACK_OUT_OF_SEQUENCE]	= 6,
+	[RXRPC_ACK_EXCEEDS_WINDOW]	= 7,
+	[RXRPC_ACK_NOSPACE]		= 8,
+};
+
+/*
+ * propose an ACK be sent
+ *
+ * @call: the call on which the ACK would be sent
+ * @ack_reason: RXRPC_ACK_* reason; must have a non-zero entry in
+ *	rxrpc_ack_priority[] (asserted below)
+ * @serial: serial number to record for the ACK, in network byte order
+ * @immediate: ask for the ACK to go out without waiting for the ACK timer
+ *	(not honoured in every case - see the switch below)
+ *
+ * The proposal is weighed against the reason already pending on the call
+ * (call->ackr_reason) and is dropped, merged or recorded accordingly; then
+ * either the ACK timer is armed or the call is queued with RXRPC_CALL_ACK
+ * set.  rxrpc_propose_ACK() calls this with call->lock held.
+ */
+void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+			 __be32 serial, bool immediate)
+{
+	unsigned long expiry;
+	s8 prior = rxrpc_ack_priority[ack_reason];
+
+	ASSERTCMP(prior, >, 0);
+
+	_enter("{%d},%s,%%%x,%u",
+	       call->debug_id, rxrpc_acks[ack_reason], ntohl(serial),
+	       immediate);
+
+	/* a lower-priority proposal never displaces the pending reason, but
+	 * an immediate one still flushes whatever is pending */
+	if (prior < rxrpc_ack_priority[call->ackr_reason]) {
+		if (immediate)
+			goto cancel_timer;
+		return;
+	}
+
+	/* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial
+	 * numbers (the reasons whose priority is 4 or lower) */
+	if (prior == rxrpc_ack_priority[call->ackr_reason]) {
+		if (prior <= 4)
+			call->ackr_serial = serial;
+		if (immediate)
+			goto cancel_timer;
+		return;
+	}
+
+	/* strictly higher priority than what was pending: take over */
+	call->ackr_reason = ack_reason;
+	call->ackr_serial = serial;
+
+	switch (ack_reason) {
+	case RXRPC_ACK_DELAY:
+		_debug("run delay timer");
+		call->ack_timer.expires = jiffies + rxrpc_ack_timeout * HZ;
+		add_timer(&call->ack_timer);
+		return;
+
+	case RXRPC_ACK_IDLE:
+		if (!immediate) {
+			_debug("run defer timer");
+			expiry = 1;
+			goto run_timer;
+		}
+		goto cancel_timer;
+
+	case RXRPC_ACK_REQUESTED:
+		if (!rxrpc_ack_defer)
+			goto cancel_timer;
+		if (!immediate || serial == cpu_to_be32(1)) {
+			_debug("run defer timer");
+			expiry = rxrpc_ack_defer;
+			goto run_timer;
+		}
+		/* immediate request for a serial other than 1: fall through */
+
+	default:
+		_debug("immediate ACK");
+		goto cancel_timer;
+	}
+
+run_timer:
+	/* expiry is relative up to here; only (re)arm the timer if none is
+	 * pending or the pending one would fire later than the new expiry */
+	expiry += jiffies;
+	if (!timer_pending(&call->ack_timer) ||
+	    time_after(call->ack_timer.expires, expiry))
+		mod_timer(&call->ack_timer, expiry);
+	return;
+
+cancel_timer:
+	/* stop the timer and hand the ACK to the call processor now, as long
+	 * as the call state is no later than RXRPC_CALL_COMPLETE */
+	_debug("cancel timer %%%u", ntohl(serial));
+	try_to_del_timer_sync(&call->ack_timer);
+	read_lock_bh(&call->state_lock);
+	if (call->state <= RXRPC_CALL_COMPLETE &&
+	    !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+		rxrpc_queue_call(call);
+	read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * Locked front end to __rxrpc_propose_ACK().
+ *
+ * The proposal is only forwarded when its priority is strictly higher than
+ * that of the ACK reason currently recorded on the call; that comparison is
+ * made before call->lock is taken, and anything else is silently dropped.
+ */
+void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
+		       __be32 serial, bool immediate)
+{
+	s8 proposed = rxrpc_ack_priority[ack_reason];
+	s8 pending = rxrpc_ack_priority[call->ackr_reason];
+
+	if (proposed <= pending)
+		return;
+
+	spin_lock_bh(&call->lock);
+	__rxrpc_propose_ACK(call, ack_reason, serial, immediate);
+	spin_unlock_bh(&call->lock);
+}
+
+/*
+ * Apply the outcome of a Tx window scan to the call.
+ *
+ * @resend is a two-bit mask: bit 0 asks for the RXRPC_CALL_RESEND event to be
+ * raised, bit 1 says @resend_at is valid and the resend timer should be set
+ * to it.  Both requests are ignored once the call state has reached
+ * RXRPC_CALL_COMPLETE, and whenever the timer is not wanted it is deleted and
+ * its event/flag bits are cleared.  Everything is done under a read lock on
+ * call->state_lock.
+ */
+static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
+			     unsigned long resend_at)
+{
+	bool raise_event, arm_timer;
+
+	read_lock_bh(&call->state_lock);
+
+	if (call->state >= RXRPC_CALL_COMPLETE) {
+		raise_event = false;
+		arm_timer = false;
+	} else {
+		raise_event = resend & 1;
+		arm_timer = resend & 2;
+	}
+
+	if (raise_event) {
+		_debug("SET RESEND");
+		set_bit(RXRPC_CALL_RESEND, &call->events);
+	}
+
+	if (!arm_timer) {
+		_debug("KILL RESEND TIMER");
+		del_timer_sync(&call->resend_timer);
+		clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+		clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+	} else {
+		_debug("MODIFY RESEND TIMER");
+		set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+		mod_timer(&call->resend_timer, resend_at);
+	}
+
+	read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * resend packets
+ *
+ * Walks the Tx window from acks_tail to acks_head.  Every packet flagged
+ * need_resend is retransmitted under a fresh serial number and given a new
+ * resend time; the walk also works out whether any packet is already due
+ * again (bit 0 of 'resend') and the earliest future resend time (bit 1 of
+ * 'resend' plus 'resend_at'), which are handed to rxrpc_set_resend().
+ */
+static void rxrpc_resend(struct rxrpc_call *call)
+{
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_header *hdr;
+	struct sk_buff *txb;
+	unsigned long *p_txb, resend_at;
+	int loop, stop;
+	u8 resend;
+
+	_enter("{%d,%d,%d,%d},",
+	       call->acks_hard, call->acks_unacked,
+	       atomic_read(&call->sequence),
+	       CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
+
+	stop = 0;
+	resend = 0;
+	resend_at = 0;
+
+	/* NOTE(review): 'stop' is initialised to 0 and the only other
+	 * assignment (on send failure below) also stores 0, so the "|| stop"
+	 * term never affects the loop - confirm what was intended */
+	for (loop = call->acks_tail;
+	     loop != call->acks_head || stop;
+	     loop = (loop + 1) &  (call->acks_winsz - 1)
+	     ) {
+		p_txb = call->acks_window + loop;
+		smp_read_barrier_depends();
+		/* bit 0 of a window slot is set by rxrpc_process_soft_ACKs()
+		 * when the packet has been soft-ACK'd; skip those */
+		if (*p_txb & 1)
+			continue;
+
+		txb = (struct sk_buff *) *p_txb;
+		sp = rxrpc_skb(txb);
+
+		if (sp->need_resend) {
+			sp->need_resend = 0;
+
+			/* each Tx packet has a new serial number */
+			sp->hdr.serial =
+				htonl(atomic_inc_return(&call->conn->serial));
+
+			/* patch the serial into the wire header at the front
+			 * of the buffer too */
+			hdr = (struct rxrpc_header *) txb->head;
+			hdr->serial = sp->hdr.serial;
+
+			_proto("Tx DATA %%%u { #%d }",
+			       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+			if (rxrpc_send_packet(call->conn->trans, txb) < 0) {
+				/* transmission failed: retry in 3 jiffies */
+				stop = 0;
+				sp->resend_at = jiffies + 3;
+			} else {
+				sp->resend_at =
+					jiffies + rxrpc_resend_timeout * HZ;
+			}
+		}
+
+		if (time_after_eq(jiffies + 1, sp->resend_at)) {
+			/* due now or within the next jiffy */
+			sp->need_resend = 1;
+			resend |= 1;
+		} else if (resend & 2) {
+			/* keep the earliest resend time seen so far */
+			if (time_before(sp->resend_at, resend_at))
+				resend_at = sp->resend_at;
+		} else {
+			/* first future resend time seen */
+			resend_at = sp->resend_at;
+			resend |= 2;
+		}
+	}
+
+	rxrpc_set_resend(call, resend, resend_at);
+	_leave("");
+}
+
+/*
+ * handle resend timer expiry
+ *
+ * Scans the Tx window from acks_unacked to acks_head, flags every packet
+ * whose resend time has arrived as need_resend, and passes the result to
+ * rxrpc_set_resend() using the same 'resend' bit encoding as rxrpc_resend()
+ * (bit 0: something is due now; bit 1: 'resend_at' holds the earliest future
+ * resend time).  Nothing is transmitted here.  Does nothing once the call
+ * state has reached RXRPC_CALL_COMPLETE.
+ */
+static void rxrpc_resend_timer(struct rxrpc_call *call)
+{
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *txb;
+	unsigned long *p_txb, resend_at;
+	int loop;
+	u8 resend;
+
+	_enter("%d,%d,%d",
+	       call->acks_tail, call->acks_unacked, call->acks_head);
+
+	if (call->state >= RXRPC_CALL_COMPLETE)
+		return;
+
+	resend = 0;
+	resend_at = 0;
+
+	for (loop = call->acks_unacked;
+	     loop != call->acks_head;
+	     loop = (loop + 1) &  (call->acks_winsz - 1)
+	     ) {
+		p_txb = call->acks_window + loop;
+		smp_read_barrier_depends();
+		/* mask off the soft-ACK marker in bit 0 to get the skb */
+		txb = (struct sk_buff *) (*p_txb & ~1);
+		sp = rxrpc_skb(txb);
+
+		/* slots from acks_unacked onwards must not carry the marker */
+		ASSERT(!(*p_txb & 1));
+
+		if (sp->need_resend) {
+			/* already flagged for resend; leave it alone */
+			;
+		} else if (time_after_eq(jiffies + 1, sp->resend_at)) {
+			sp->need_resend = 1;
+			resend |= 1;
+		} else if (resend & 2) {
+			if (time_before(sp->resend_at, resend_at))
+				resend_at = sp->resend_at;
+		} else {
+			resend_at = sp->resend_at;
+			resend |= 2;
+		}
+	}
+
+	rxrpc_set_resend(call, resend, resend_at);
+	_leave("");
+}
+
+/*
+ * process soft ACKs of our transmitted packets
+ * - these indicate packets the peer has or has not received, but hasn't yet
+ *   given to the consumer, and so can still be discarded and re-requested
+ *
+ * @call: the call whose Tx window is being updated
+ * @ack: the ACK header; only ack->nAcks is used here
+ * @skb: the ACK packet, with the ack->nAcks soft-ACK bytes at offset 0
+ *
+ * The first loop applies the explicit ACK/NACK bytes to the window slots
+ * starting at acks_tail; the second treats everything from there up to
+ * acks_head as not acknowledged.  The outcome is given to rxrpc_set_resend().
+ *
+ * Returns 0 on success or -EPROTO if the soft-ACK bytes cannot be copied out
+ * of the packet or contain an unknown type.
+ */
+static int rxrpc_process_soft_ACKs(struct rxrpc_call *call,
+				   struct rxrpc_ackpacket *ack,
+				   struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *txb;
+	unsigned long *p_txb, resend_at;
+	int loop;
+	u8 sacks[RXRPC_MAXACKS], resend;
+
+	_enter("{%d,%d},{%d},",
+	       call->acks_hard,
+	       CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz),
+	       ack->nAcks);
+
+	if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0)
+		goto protocol_error;
+
+	resend = 0;
+	resend_at = 0;
+	for (loop = 0; loop < ack->nAcks; loop++) {
+		p_txb = call->acks_window;
+		p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1);
+		smp_read_barrier_depends();
+		/* bit 0 of the slot is the soft-ACK marker, the rest is the
+		 * skb pointer */
+		txb = (struct sk_buff *) (*p_txb & ~1);
+		sp = rxrpc_skb(txb);
+
+		switch (sacks[loop]) {
+		case RXRPC_ACK_TYPE_ACK:
+			sp->need_resend = 0;
+			*p_txb |= 1;
+			break;
+		case RXRPC_ACK_TYPE_NACK:
+			sp->need_resend = 1;
+			*p_txb &= ~1;
+			resend = 1;
+			break;
+		default:
+			_debug("Unsupported ACK type %d", sacks[loop]);
+			goto protocol_error;
+		}
+	}
+
+	/* publish the first slot not covered by this ACK's soft-ACK list */
+	smp_mb();
+	call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1);
+
+	/* anything not explicitly ACK'd is implicitly NACK'd, but may just not
+	 * have been received or processed yet by the far end */
+	for (loop = call->acks_unacked;
+	     loop != call->acks_head;
+	     loop = (loop + 1) &  (call->acks_winsz - 1)
+	     ) {
+		p_txb = call->acks_window + loop;
+		smp_read_barrier_depends();
+		txb = (struct sk_buff *) (*p_txb & ~1);
+		sp = rxrpc_skb(txb);
+
+		if (*p_txb & 1) {
+			/* packet must have been discarded */
+			sp->need_resend = 1;
+			*p_txb &= ~1;
+			resend |= 1;
+		} else if (sp->need_resend) {
+			/* already flagged for resend */
+			;
+		} else if (time_after_eq(jiffies + 1, sp->resend_at)) {
+			/* resend time has arrived (or will within a jiffy) */
+			sp->need_resend = 1;
+			resend |= 1;
+		} else if (resend & 2) {
+			/* keep the earliest future resend time */
+			if (time_before(sp->resend_at, resend_at))
+				resend_at = sp->resend_at;
+		} else {
+			resend_at = sp->resend_at;
+			resend |= 2;
+		}
+	}
+
+	rxrpc_set_resend(call, resend, resend_at);
+	_leave(" = 0");
+	return 0;
+
+protocol_error:
+	_leave(" = -EPROTO");
+	return -EPROTO;
+}
+
+/*
+ * discard hard-ACK'd packets from the Tx window
+ *
+ * @call: the call whose window is rotated
+ * @hard: the value call->acks_hard is to be advanced to
+ *
+ * Frees one packet from the tail of the window for every step needed to
+ * bring call->acks_hard up to @hard, dragging acks_unacked along when it
+ * pointed at a freed slot, then wakes anyone sleeping on call->tx_waitq.
+ * The advance is asserted not to exceed the number of packets in the window.
+ */
+static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
+{
+	unsigned long _skb;
+	int tail = call->acks_tail, old_tail;
+	int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
+
+	_enter("{%u,%u},%u", call->acks_hard, win, hard);
+
+	ASSERTCMP(hard - call->acks_hard, <=, win);
+
+	while (call->acks_hard < hard) {
+		smp_read_barrier_depends();
+		/* strip the soft-ACK marker (bit 0) to recover the skb */
+		_skb = call->acks_window[tail] & ~1;
+		rxrpc_free_skb((struct sk_buff *) _skb);
+		old_tail = tail;
+		tail = (tail + 1) & (call->acks_winsz - 1);
+		call->acks_tail = tail;
+		if (call->acks_unacked == old_tail)
+			call->acks_unacked = tail;
+		call->acks_hard++;
+	}
+
+	wake_up(&call->tx_waitq);
+}
+
+/*
+ * Throw away the whole Tx window after a failure by rotating it all the way
+ * up to the call's current transmit sequence number.
+ */
+static void rxrpc_clear_tx_window(struct rxrpc_call *call)
+{
+	u32 top = atomic_read(&call->sequence);
+
+	rxrpc_rotate_tx_window(call, top);
+}
+
+/*
+ * drain the out of sequence received packet queue into the packet Rx queue
+ *
+ * Handles at most one packet per invocation, under call->lock: the packet at
+ * the head of call->rx_oos_queue is passed to rxrpc_queue_rcv_skb() if its
+ * sequence number matches call->rx_first_oos, otherwise it is put back and
+ * rx_first_oos is corrected to its sequence number.  After a successful
+ * hand-over, rx_data_post is advanced and rx_first_oos is reloaded from the
+ * new head of the queue (0 if the queue is now empty).
+ *
+ * Returns 0, or -ECONNRESET if the call has RXRPC_CALL_RELEASED set.
+ */
+static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
+{
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *skb;
+	bool terminal;
+	int ret;
+
+	_enter("{%d,%d}", call->rx_data_post, call->rx_first_oos);
+
+	spin_lock_bh(&call->lock);
+
+	ret = -ECONNRESET;
+	if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
+		goto socket_unavailable;
+
+	skb = skb_dequeue(&call->rx_oos_queue);
+	if (skb) {
+		sp = rxrpc_skb(skb);
+
+		_debug("drain OOS packet %d [%d]",
+		       ntohl(sp->hdr.seq), call->rx_first_oos);
+
+		if (ntohl(sp->hdr.seq) != call->rx_first_oos) {
+			/* not the packet we were told to expect: put it back
+			 * and resynchronise rx_first_oos with the queue */
+			skb_queue_head(&call->rx_oos_queue, skb);
+			call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq);
+			_debug("requeue %p {%u}", skb, call->rx_first_oos);
+		} else {
+			skb->mark = RXRPC_SKB_MARK_DATA;
+			/* a last packet that was not client-initiated is
+			 * passed on as the terminal message */
+			terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) &&
+				!(sp->hdr.flags & RXRPC_CLIENT_INITIATED));
+			ret = rxrpc_queue_rcv_skb(call, skb, true, terminal);
+			BUG_ON(ret < 0);
+			_debug("drain #%u", call->rx_data_post);
+			call->rx_data_post++;
+
+			/* find out what the next packet is */
+			skb = skb_peek(&call->rx_oos_queue);
+			if (skb)
+				call->rx_first_oos =
+					ntohl(rxrpc_skb(skb)->hdr.seq);
+			else
+				call->rx_first_oos = 0;
+			_debug("peek %p {%u}", skb, call->rx_first_oos);
+		}
+	}
+
+	ret = 0;
+socket_unavailable:
+	spin_unlock_bh(&call->lock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * insert an out of sequence packet into the buffer
+ *
+ * The skb is bound to the call (a call reference is taken and
+ * rxrpc_packet_destructor installed as its destructor) and then placed in
+ * call->rx_oos_queue in ascending sequence-number order under call->lock.
+ * If the packet at the front of that queue is now the one the call is
+ * waiting to post (rx_data_post) and the call state is below
+ * RXRPC_CALL_COMPLETE, the RXRPC_CALL_DRAIN_RX_OOS event is raised; the
+ * call is not queued for processing here.
+ */
+static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
+				    struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp, *psp;
+	struct sk_buff *p;
+	u32 seq;
+
+	sp = rxrpc_skb(skb);
+	seq = ntohl(sp->hdr.seq);
+	_enter(",,{%u}", seq);
+
+	skb->destructor = rxrpc_packet_destructor;
+	ASSERTCMP(sp->call, ==, NULL);
+	sp->call = call;
+	rxrpc_get_call(call);
+
+	/* insert into the buffer in sequence order */
+	spin_lock_bh(&call->lock);
+
+	skb_queue_walk(&call->rx_oos_queue, p) {
+		psp = rxrpc_skb(p);
+		if (ntohl(psp->hdr.seq) > seq) {
+			_debug("insert oos #%u before #%u",
+			       seq, ntohl(psp->hdr.seq));
+			skb_insert(p, skb, &call->rx_oos_queue);
+			goto inserted;
+		}
+	}
+
+	/* no queued packet has a higher sequence number */
+	_debug("append oos #%u", seq);
+	skb_queue_tail(&call->rx_oos_queue, skb);
+inserted:
+
+	/* we might now have a new front to the queue */
+	if (call->rx_first_oos == 0 || seq < call->rx_first_oos)
+		call->rx_first_oos = seq;
+
+	read_lock(&call->state_lock);
+	if (call->state < RXRPC_CALL_COMPLETE &&
+	    call->rx_data_post == call->rx_first_oos) {
+		_debug("drain rx oos now");
+		set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+	}
+	read_unlock(&call->state_lock);
+
+	spin_unlock_bh(&call->lock);
+	_leave(" [stored #%u]", call->rx_first_oos);
+}
+
+/*
+ * clear the Tx window on final ACK reception
+ *
+ * Detaches the window array from the call (call->acks_window becomes NULL),
+ * frees every packet still held between acks_tail and acks_head, advancing
+ * acks_tail as it goes, and finally frees the array itself.  The caller in
+ * this file only invokes it when call->acks_window is non-NULL.
+ */
+static void rxrpc_zap_tx_window(struct rxrpc_call *call)
+{
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *skb;
+	unsigned long _skb, *acks_window;
+	u8 winsz = call->acks_winsz;
+	int tail;
+
+	acks_window = call->acks_window;
+	call->acks_window = NULL;
+
+	while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) {
+		tail = call->acks_tail;
+		smp_read_barrier_depends();
+		/* strip the soft-ACK marker (bit 0) to recover the skb */
+		_skb = acks_window[tail] & ~1;
+		/* the slot is read before the tail is moved past it */
+		smp_mb();
+		call->acks_tail = (call->acks_tail + 1) & (winsz - 1);
+
+		skb = (struct sk_buff *) _skb;
+		sp = rxrpc_skb(skb);
+		_debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+		rxrpc_free_skb(skb);
+	}
+
+	kfree(acks_window);
+}
+
+/*
+ * Read the optional ackinfo trailer of an ACK packet and use it to shrink
+ * the peer's MTU.
+ *
+ * The trailer is looked for @nAcks + 3 bytes into @skb, i.e. after the
+ * soft-ACK bytes and three more bytes; if it cannot be copied out the packet
+ * is taken to carry none.  The smaller of the advertised rxMTU and maxMTU
+ * replaces peer->maxdata (and peer->mtu is recomputed from it) only when it
+ * is lower than the current value; @latest is used for tracing only.
+ */
+static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb,
+				  unsigned latest, int nAcks)
+{
+	struct rxrpc_ackinfo info;
+	struct rxrpc_peer *peer = call->conn->trans->peer;
+	unsigned mtu;
+
+	if (skb_copy_bits(skb, nAcks + 3, &info, sizeof(info)) < 0) {
+		_leave(" [no ackinfo]");
+		return;
+	}
+
+	_proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }",
+	       latest,
+	       ntohl(info.rxMTU), ntohl(info.maxMTU),
+	       ntohl(info.rwind), ntohl(info.jumbo_max));
+
+	mtu = min(ntohl(info.rxMTU), ntohl(info.maxMTU));
+
+	/* the peer's data size limit is only ever lowered here */
+	if (mtu >= peer->maxdata)
+		return;
+
+	spin_lock_bh(&peer->lock);
+	peer->maxdata = mtu;
+	peer->mtu = mtu + peer->hdrsize;
+	spin_unlock_bh(&peer->lock);
+	_net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata);
+}
+
+/*
+ * process packets in the reception queue
+ *
+ * Dequeues skbs from call->rx_queue until it is empty.  DATA packets are
+ * verified and parked on the out-of-sequence queue; ACK packets are parsed
+ * and used to rotate the Tx window and to process soft ACKs; an ACKALL, or
+ * an ACK covering everything transmitted, moves the call on to its next
+ * state and discards what is left of the Tx window.
+ *
+ * @_abort_code is passed through to rxrpc_verify_packet().
+ *
+ * Returns -EAGAIN once the queue has been emptied, or -EPROTO as soon as a
+ * packet fails verification or is inconsistent with the call (the offending
+ * skb is freed and the rest of the queue is left in place).
+ */
+static int rxrpc_process_rx_queue(struct rxrpc_call *call,
+				  u32 *_abort_code)
+{
+	struct rxrpc_ackpacket ack;
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *skb;
+	bool post_ACK;
+	int latest;
+	u32 hard, tx;
+
+	_enter("");
+
+process_further:
+	skb = skb_dequeue(&call->rx_queue);
+	if (!skb)
+		return -EAGAIN;
+
+	_net("deferred skb %p", skb);
+
+	sp = rxrpc_skb(skb);
+
+	_debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state);
+
+	post_ACK = false;
+
+	switch (sp->hdr.type) {
+		/* data packets that wind up here have been received out of
+		 * order, need security processing or are jumbo packets */
+	case RXRPC_PACKET_TYPE_DATA:
+		_proto("OOSQ DATA %%%u { #%u }",
+		       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+
+		/* secured packets must be verified and possibly decrypted */
+		if (rxrpc_verify_packet(call, skb, _abort_code) < 0)
+			goto protocol_error;
+
+		rxrpc_insert_oos_packet(call, skb);
+		goto process_further;
+
+		/* partial ACK to process */
+	case RXRPC_PACKET_TYPE_ACK:
+		if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) {
+			_debug("extraction failure");
+			goto protocol_error;
+		}
+		/* step over the ACK header so that the soft-ACK bytes sit at
+		 * offset 0 for the helpers below */
+		if (!skb_pull(skb, sizeof(ack)))
+			BUG();
+
+		latest = ntohl(sp->hdr.serial);
+		hard = ntohl(ack.firstPacket);
+		tx = atomic_read(&call->sequence);
+
+		/* ack.reason comes straight off the wire, so clamp it to the
+		 * final "-?-" entry rather than read beyond the name table */
+		_proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+		       latest,
+		       ntohs(ack.maxSkew),
+		       hard,
+		       ntohl(ack.previousPacket),
+		       ntohl(ack.serial),
+		       rxrpc_acks[min_t(size_t, ack.reason,
+					 ARRAY_SIZE(rxrpc_acks) - 1)],
+		       ack.nAcks);
+
+		rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks);
+
+		if (ack.reason == RXRPC_ACK_PING) {
+			_proto("Rx ACK %%%u PING Request", latest);
+			rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE,
+					  sp->hdr.serial, true);
+		}
+
+		/* discard any out-of-order or duplicate ACKs */
+		if (latest - call->acks_latest <= 0) {
+			_debug("discard ACK %d <= %d",
+			       latest, call->acks_latest);
+			goto discard;
+		}
+		call->acks_latest = latest;
+
+		/* ACK contents only matter while we have data in flight */
+		if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+		    call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY &&
+		    call->state != RXRPC_CALL_SERVER_SEND_REPLY &&
+		    call->state != RXRPC_CALL_SERVER_AWAIT_ACK)
+			goto discard;
+
+		_debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state);
+
+		if (hard > 0) {
+			if (hard - 1 > tx) {
+				_debug("hard-ACK'd packet %d not transmitted"
+				       " (%d top)",
+				       hard - 1, tx);
+				goto protocol_error;
+			}
+
+			/* everything transmitted has been hard-ACK'd and we
+			 * have nothing further to send in this phase */
+			if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
+			     call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
+			    hard > tx)
+				goto all_acked;
+
+			smp_rmb();
+			rxrpc_rotate_tx_window(call, hard - 1);
+		}
+
+		if (ack.nAcks > 0) {
+			if (hard - 1 + ack.nAcks > tx) {
+				_debug("soft-ACK'd packet %d+%d not"
+				       " transmitted (%d top)",
+				       hard - 1, ack.nAcks, tx);
+				goto protocol_error;
+			}
+
+			if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0)
+				goto protocol_error;
+		}
+		goto discard;
+
+		/* complete ACK to process */
+	case RXRPC_PACKET_TYPE_ACKALL:
+		goto all_acked;
+
+		/* abort and busy are handled elsewhere */
+	case RXRPC_PACKET_TYPE_BUSY:
+	case RXRPC_PACKET_TYPE_ABORT:
+		BUG();
+
+		/* connection level events - also handled elsewhere */
+	case RXRPC_PACKET_TYPE_CHALLENGE:
+	case RXRPC_PACKET_TYPE_RESPONSE:
+	case RXRPC_PACKET_TYPE_DEBUG:
+		BUG();
+	}
+
+	/* if we've had a hard ACK that covers all the packets we've sent, then
+	 * that ends that phase of the operation */
+all_acked:
+	write_lock_bh(&call->state_lock);
+	_debug("ack all %d", call->state);
+
+	switch (call->state) {
+	case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+		call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+		break;
+	case RXRPC_CALL_SERVER_AWAIT_ACK:
+		_debug("srv complete");
+		call->state = RXRPC_CALL_COMPLETE;
+		post_ACK = true;
+		break;
+	case RXRPC_CALL_CLIENT_SEND_REQUEST:
+	case RXRPC_CALL_SERVER_RECV_REQUEST:
+		goto protocol_error_unlock; /* can't occur yet */
+	default:
+		write_unlock_bh(&call->state_lock);
+		goto discard; /* assume packet left over from earlier phase */
+	}
+
+	write_unlock_bh(&call->state_lock);
+
+	/* if all the packets we sent are hard-ACK'd, then we can discard
+	 * whatever we've got left */
+	_debug("clear Tx %d",
+	       CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz));
+
+	del_timer_sync(&call->resend_timer);
+	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+	clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+
+	if (call->acks_window)
+		rxrpc_zap_tx_window(call);
+
+	if (post_ACK) {
+		/* post the final ACK message for userspace to pick up; the
+		 * skb is handed over rather than freed, with a call ref */
+		_debug("post ACK");
+		skb->mark = RXRPC_SKB_MARK_FINAL_ACK;
+		sp->call = call;
+		rxrpc_get_call(call);
+		spin_lock_bh(&call->lock);
+		if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0)
+			BUG();
+		spin_unlock_bh(&call->lock);
+		goto process_further;
+	}
+
+discard:
+	rxrpc_free_skb(skb);
+	goto process_further;
+
+protocol_error_unlock:
+	write_unlock_bh(&call->state_lock);
+protocol_error:
+	rxrpc_free_skb(skb);
+	_leave(" = -EPROTO");
+	return -EPROTO;
+}
+
+/*
+ * Queue a data-less notification skb on the call for recvmsg() to collect.
+ *
+ * @mark is stored in skb->mark and @error in the skb's rxrpc private data.
+ * When @fatal is set the call's resend and ACK timers are stopped first and
+ * the message is queued as a terminal one.  Nothing is posted (and 0 is
+ * still returned) if the call has no user ID yet - unless @mark is
+ * RXRPC_SKB_MARK_NEW_CALL - or if RXRPC_CALL_TERMINAL_MSG is already set.
+ *
+ * Returns 0, or -ENOMEM if the skb could not be allocated.
+ */
+static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error,
+			      bool fatal)
+{
+	struct rxrpc_skb_priv *priv;
+	struct sk_buff *note;
+	int ret;
+
+	_enter("{%d,%lx},%u,%u,%d",
+	       call->debug_id, call->flags, mark, error, fatal);
+
+	/* remove timers and things for fatal messages */
+	if (fatal) {
+		del_timer_sync(&call->resend_timer);
+		del_timer_sync(&call->ack_timer);
+		clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+	}
+
+	/* only a new-call notification may be posted before a user ID has
+	 * been attached to the call */
+	if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags) &&
+	    mark != RXRPC_SKB_MARK_NEW_CALL) {
+		_leave("[no userid]");
+		return 0;
+	}
+
+	if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags))
+		return 0;
+
+	note = alloc_skb(0, GFP_NOFS);
+	if (!note)
+		return -ENOMEM;
+
+	rxrpc_new_skb(note);
+	note->mark = mark;
+
+	/* the queued skb holds its own reference on the call */
+	priv = rxrpc_skb(note);
+	memset(priv, 0, sizeof(*priv));
+	priv->error = error;
+	priv->call = call;
+	rxrpc_get_call(call);
+
+	spin_lock_bh(&call->lock);
+	ret = rxrpc_queue_rcv_skb(call, note, true, fatal);
+	spin_unlock_bh(&call->lock);
+	BUG_ON(ret < 0);
+
+	return 0;
+}
+
+/*
+ * handle background processing of incoming call packets and ACK / abort
+ * generation
+ *
+ * This is the handler for the call's work item (call->processor).  Each pass
+ * examines the bits set in call->events in the fixed order coded below,
+ * sends at most one ACK, BUSY or ABORT packet, and requeues the call if any
+ * event or received packet is still outstanding.  RXRPC_CALL_PROC_BUSY in
+ * call->flags is used to keep two passes from running concurrently for the
+ * same call.
+ */
+void rxrpc_process_call(struct work_struct *work)
+{
+	struct rxrpc_call *call =
+		container_of(work, struct rxrpc_call, processor);
+	struct rxrpc_ackpacket ack;
+	struct rxrpc_ackinfo ackinfo;
+	struct rxrpc_header hdr;
+	struct msghdr msg;
+	struct kvec iov[5];
+	unsigned long bits;
+	__be32 data, pad;
+	size_t len;
+	int genbit, loop, nbit, ioc, ret, mtu;
+	u32 abort_code = RX_PROTOCOL_ERROR;
+	u8 *acks = NULL;
+
+	//printk("\n--------------------\n");
+	_enter("{%d,%s,%lx} [%lu]",
+	       call->debug_id, rxrpc_call_states[call->state], call->events,
+	       (jiffies - call->creation_jif) / (HZ / 10));
+
+	if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) {
+		_debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX");
+		return;
+	}
+
+	/* there's a good chance we're going to have to send a message, so set
+	 * one up in advance */
+	msg.msg_name	= &call->conn->trans->peer->srx.transport.sin;
+	msg.msg_namelen	= sizeof(call->conn->trans->peer->srx.transport.sin);
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= 0;
+
+	hdr.epoch	= call->conn->epoch;
+	hdr.cid		= call->cid;
+	hdr.callNumber	= call->call_id;
+	hdr.seq		= 0;
+	hdr.type	= RXRPC_PACKET_TYPE_ACK;
+	hdr.flags	= call->conn->out_clientflag;
+	hdr.userStatus	= 0;
+	hdr.securityIndex = call->conn->security_ix;
+	hdr._rsvd	= 0;
+	hdr.serviceId	= call->conn->service_id;
+
+	/* iov[0] always carries the header; the remaining slots are filled in
+	 * by whichever packet type gets built, and send_message_2 works out
+	 * how many are in use from their lengths */
+	memset(iov, 0, sizeof(iov));
+	iov[0].iov_base	= &hdr;
+	iov[0].iov_len	= sizeof(hdr);
+
+	/* deal with events of a final nature */
+	if (test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+		rxrpc_release_call(call);
+		clear_bit(RXRPC_CALL_RELEASE, &call->events);
+	}
+
+	/* a network error supersedes any pending abort or busy rejection */
+	if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) {
+		int error;
+
+		clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+		clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
+		clear_bit(RXRPC_CALL_ABORT, &call->events);
+
+		error = call->conn->trans->peer->net_error;
+		_debug("post net error %d", error);
+
+		/* the event bit is left set on failure so that the post is
+		 * retried on a later pass */
+		if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR,
+				       error, true) < 0)
+			goto no_mem;
+		clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+		goto kill_ACKs;
+	}
+
+	if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) {
+		ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
+
+		clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
+		clear_bit(RXRPC_CALL_ABORT, &call->events);
+
+		_debug("post conn abort");
+
+		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+				       call->conn->error, true) < 0)
+			goto no_mem;
+		clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+		goto kill_ACKs;
+	}
+
+	/* a BUSY packet consists of the header alone */
+	if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) {
+		hdr.type = RXRPC_PACKET_TYPE_BUSY;
+		genbit = RXRPC_CALL_REJECT_BUSY;
+		goto send_message;
+	}
+
+	/* an ABORT packet carries the abort code after the header */
+	if (test_bit(RXRPC_CALL_ABORT, &call->events)) {
+		ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
+
+		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+				       ECONNABORTED, true) < 0)
+			goto no_mem;
+		hdr.type = RXRPC_PACKET_TYPE_ABORT;
+		data = htonl(call->abort_code);
+		iov[1].iov_base = &data;
+		iov[1].iov_len = sizeof(data);
+		genbit = RXRPC_CALL_ABORT;
+		goto send_message;
+	}
+
+	/* the final ACK is an IDLE ACK with no soft-ACK bytes: header, ACK
+	 * body, three bytes of padding and the ackinfo trailer */
+	if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) {
+		genbit = RXRPC_CALL_ACK_FINAL;
+
+		ack.bufferSpace	= htons(8);
+		ack.maxSkew	= 0;
+		ack.serial	= 0;
+		ack.reason	= RXRPC_ACK_IDLE;
+		ack.nAcks	= 0;
+		call->ackr_reason = 0;
+
+		spin_lock_bh(&call->lock);
+		ack.serial = call->ackr_serial;
+		ack.previousPacket = call->ackr_prev_seq;
+		ack.firstPacket = htonl(call->rx_data_eaten + 1);
+		spin_unlock_bh(&call->lock);
+
+		pad = 0;
+
+		iov[1].iov_base = &ack;
+		iov[1].iov_len	= sizeof(ack);
+		iov[2].iov_base = &pad;
+		iov[2].iov_len	= 3;
+		iov[3].iov_base = &ackinfo;
+		iov[3].iov_len	= sizeof(ackinfo);
+		goto send_ACK;
+	}
+
+	/* the peer sent BUSY or ABORT: drop the Tx window and tell the app */
+	if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) |
+			    (1 << RXRPC_CALL_RCVD_ABORT))
+	    ) {
+		u32 mark;
+
+		if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events))
+			mark = RXRPC_SKB_MARK_REMOTE_ABORT;
+		else
+			mark = RXRPC_SKB_MARK_BUSY;
+
+		_debug("post abort/busy");
+		rxrpc_clear_tx_window(call);
+		if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
+			goto no_mem;
+
+		clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
+		clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+		goto kill_ACKs;
+	}
+
+	if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) {
+		_debug("do implicit ackall");
+		rxrpc_clear_tx_window(call);
+	}
+
+	/* lifetime expiry: abort the call locally (the ABORT event set here
+	 * gets the packet sent on a later pass) and report ETIME to the app */
+	if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) {
+		write_lock_bh(&call->state_lock);
+		if (call->state <= RXRPC_CALL_COMPLETE) {
+			call->state = RXRPC_CALL_LOCALLY_ABORTED;
+			call->abort_code = RX_CALL_TIMEOUT;
+			set_bit(RXRPC_CALL_ABORT, &call->events);
+		}
+		write_unlock_bh(&call->state_lock);
+
+		_debug("post timeout");
+		if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
+				       ETIME, true) < 0)
+			goto no_mem;
+
+		clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+		goto kill_ACKs;
+	}
+
+	/* deal with assorted inbound messages */
+	if (!skb_queue_empty(&call->rx_queue)) {
+		switch (rxrpc_process_rx_queue(call, &abort_code)) {
+		case 0:
+		case -EAGAIN:
+			break;
+		case -ENOMEM:
+			goto no_mem;
+		case -EKEYEXPIRED:
+		case -EKEYREJECTED:
+		case -EPROTO:
+			rxrpc_abort_call(call, abort_code);
+			goto kill_ACKs;
+		}
+	}
+
+	/* handle resending */
+	if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+		rxrpc_resend_timer(call);
+	if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events))
+		rxrpc_resend(call);
+
+	/* consider sending an ordinary ACK */
+	if (test_bit(RXRPC_CALL_ACK, &call->events)) {
+		_debug("send ACK: window: %d - %d { %lx }",
+		       call->rx_data_eaten, call->ackr_win_top,
+		       call->ackr_window[0]);
+
+		if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
+		    call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
+			/* ACK by sending reply DATA packet in this state */
+			clear_bit(RXRPC_CALL_ACK, &call->events);
+			goto maybe_reschedule;
+		}
+
+		genbit = RXRPC_CALL_ACK;
+
+		/* one soft-ACK byte per packet between the last one consumed
+		 * and the top of the receive window; freed at 'error:' */
+		acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
+			       GFP_NOFS);
+		if (!acks)
+			goto no_mem;
+
+		//hdr.flags	= RXRPC_SLOW_START_OK;
+		ack.bufferSpace	= htons(8);
+		ack.maxSkew	= 0;
+		ack.serial	= 0;
+		ack.reason	= 0;
+
+		spin_lock_bh(&call->lock);
+		ack.reason = call->ackr_reason;
+		ack.serial = call->ackr_serial;
+		ack.previousPacket = call->ackr_prev_seq;
+		ack.firstPacket = htonl(call->rx_data_eaten + 1);
+
+		/* turn the receive window bitmap into soft-ACK bytes; nAcks
+		 * ends up one past the highest bit found set */
+		ack.nAcks = 0;
+		for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
+			nbit = loop * BITS_PER_LONG;
+			for (bits = call->ackr_window[loop]; bits; bits >>= 1
+			     ) {
+				_debug("- l=%d n=%d b=%lx", loop, nbit, bits);
+				if (bits & 1) {
+					acks[nbit] = RXRPC_ACK_TYPE_ACK;
+					ack.nAcks = nbit + 1;
+				}
+				nbit++;
+			}
+		}
+		call->ackr_reason = 0;
+		spin_unlock_bh(&call->lock);
+
+		pad = 0;
+
+		iov[1].iov_base = &ack;
+		iov[1].iov_len	= sizeof(ack);
+		iov[2].iov_base = acks;
+		iov[2].iov_len	= ack.nAcks;
+		iov[3].iov_base = &pad;
+		iov[3].iov_len	= 3;
+		iov[4].iov_base = &ackinfo;
+		iov[4].iov_len	= sizeof(ackinfo);
+
+		/* a reason matching none of the cases (e.g. 0) sends nothing
+		 * and carries on with the checks below */
+		switch (ack.reason) {
+		case RXRPC_ACK_REQUESTED:
+		case RXRPC_ACK_DUPLICATE:
+		case RXRPC_ACK_OUT_OF_SEQUENCE:
+		case RXRPC_ACK_EXCEEDS_WINDOW:
+		case RXRPC_ACK_NOSPACE:
+		case RXRPC_ACK_PING:
+		case RXRPC_ACK_PING_RESPONSE:
+			goto send_ACK_with_skew;
+		case RXRPC_ACK_DELAY:
+		case RXRPC_ACK_IDLE:
+			goto send_ACK;
+		}
+	}
+
+	/* handle completion of security negotiations on an incoming
+	 * connection */
+	if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) {
+		_debug("secured");
+		spin_lock_bh(&call->lock);
+
+		if (call->state == RXRPC_CALL_SERVER_SECURING) {
+			_debug("securing");
+			write_lock(&call->conn->lock);
+			if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+			    !test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+				_debug("not released");
+				call->state = RXRPC_CALL_SERVER_ACCEPTING;
+				list_move_tail(&call->accept_link,
+					       &call->socket->acceptq);
+			}
+			write_unlock(&call->conn->lock);
+			read_lock(&call->state_lock);
+			if (call->state < RXRPC_CALL_COMPLETE)
+				set_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+			read_unlock(&call->state_lock);
+		}
+
+		spin_unlock_bh(&call->lock);
+		if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events))
+			goto maybe_reschedule;
+	}
+
+	/* post a notification of an acceptable connection to the app */
+	if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) {
+		_debug("post accept");
+		if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
+				       0, false) < 0)
+			goto no_mem;
+		clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+		goto maybe_reschedule;
+	}
+
+	/* handle incoming call acceptance */
+	if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) {
+		_debug("accepted");
+		ASSERTCMP(call->rx_data_post, ==, 0);
+		call->rx_data_post = 1;
+		read_lock_bh(&call->state_lock);
+		if (call->state < RXRPC_CALL_COMPLETE)
+			set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+		read_unlock_bh(&call->state_lock);
+	}
+
+	/* drain the out of sequence received packet queue into the packet Rx
+	 * queue */
+	if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) {
+		while (call->rx_data_post == call->rx_first_oos)
+			if (rxrpc_drain_rx_oos_queue(call) < 0)
+				break;
+		goto maybe_reschedule;
+	}
+
+	/* other events may have been raised since we started checking */
+	goto maybe_reschedule;
+
+send_ACK_with_skew:
+	ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) -
+			    ntohl(ack.serial));
+send_ACK:
+	/* fill in the ackinfo trailer that every ACK built above carries */
+	mtu = call->conn->trans->peer->if_mtu;
+	mtu -= call->conn->trans->peer->hdrsize;
+	ackinfo.maxMTU	= htonl(mtu);
+	ackinfo.rwind	= htonl(32);
+
+	/* permit the peer to send us jumbo packets if it wants to */
+	ackinfo.rxMTU	= htonl(5692);
+	ackinfo.jumbo_max = htonl(4);
+
+	hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+	_proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
+	       ntohl(hdr.serial),
+	       ntohs(ack.maxSkew),
+	       ntohl(ack.firstPacket),
+	       ntohl(ack.previousPacket),
+	       ntohl(ack.serial),
+	       rxrpc_acks[ack.reason],
+	       ack.nAcks);
+
+	del_timer_sync(&call->ack_timer);
+	if (ack.nAcks > 0)
+		set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags);
+	goto send_message_2;
+
+send_message:
+	_debug("send message");
+
+	hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+	_proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial));
+send_message_2:
+
+	/* the highest-numbered non-empty iov slot decides how many slots are
+	 * sent; len is the total of their lengths */
+	len = iov[0].iov_len;
+	ioc = 1;
+	if (iov[4].iov_len) {
+		ioc = 5;
+		len += iov[4].iov_len;
+		len += iov[3].iov_len;
+		len += iov[2].iov_len;
+		len += iov[1].iov_len;
+	} else if (iov[3].iov_len) {
+		ioc = 4;
+		len += iov[3].iov_len;
+		len += iov[2].iov_len;
+		len += iov[1].iov_len;
+	} else if (iov[2].iov_len) {
+		ioc = 3;
+		len += iov[2].iov_len;
+		len += iov[1].iov_len;
+	} else if (iov[1].iov_len) {
+		ioc = 2;
+		len += iov[1].iov_len;
+	}
+
+	ret = kernel_sendmsg(call->conn->trans->local->socket,
+			     &msg, iov, ioc, len);
+	if (ret < 0) {
+		/* the generating event bit is still set, so requeueing the
+		 * call retries the transmission */
+		_debug("sendmsg failed: %d", ret);
+		read_lock_bh(&call->state_lock);
+		if (call->state < RXRPC_CALL_DEAD)
+			rxrpc_queue_call(call);
+		read_unlock_bh(&call->state_lock);
+		goto error;
+	}
+
+	/* the packet went out: retire the event that generated it */
+	switch (genbit) {
+	case RXRPC_CALL_ABORT:
+		clear_bit(genbit, &call->events);
+		clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+		goto kill_ACKs;
+
+	case RXRPC_CALL_ACK_FINAL:
+		/* the ACK_FINAL event bit itself is cleared at kill_ACKs */
+		write_lock_bh(&call->state_lock);
+		if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
+			call->state = RXRPC_CALL_COMPLETE;
+		write_unlock_bh(&call->state_lock);
+		goto kill_ACKs;
+
+	default:
+		clear_bit(genbit, &call->events);
+		switch (call->state) {
+		case RXRPC_CALL_CLIENT_AWAIT_REPLY:
+		case RXRPC_CALL_CLIENT_RECV_REPLY:
+		case RXRPC_CALL_SERVER_RECV_REQUEST:
+		case RXRPC_CALL_SERVER_ACK_REQUEST:
+			_debug("start ACK timer");
+			rxrpc_propose_ACK(call, RXRPC_ACK_DELAY,
+					  call->ackr_serial, false);
+			/* fall through */
+		default:
+			break;
+		}
+		goto maybe_reschedule;
+	}
+
+kill_ACKs:
+	/* no more ACKs are wanted: stop the timer, drop the call reference
+	 * that accompanies a pending ACK_FINAL event and forget any ordinary
+	 * ACK */
+	del_timer_sync(&call->ack_timer);
+	if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events))
+		rxrpc_put_call(call);
+	clear_bit(RXRPC_CALL_ACK, &call->events);
+
+maybe_reschedule:
+	if (call->events || !skb_queue_empty(&call->rx_queue)) {
+		read_lock_bh(&call->state_lock);
+		if (call->state < RXRPC_CALL_DEAD)
+			rxrpc_queue_call(call);
+		read_unlock_bh(&call->state_lock);
+	}
+
+	/* don't leave aborted connections on the accept queue */
+	if (call->state >= RXRPC_CALL_COMPLETE &&
+	    !list_empty(&call->accept_link)) {
+		_debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
+		       call, call->events, call->flags,
+		       ntohl(call->conn->cid));
+
+		read_lock_bh(&call->state_lock);
+		if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+		    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+			rxrpc_queue_call(call);
+		read_unlock_bh(&call->state_lock);
+	}
+
+error:
+	clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags);
+	kfree(acks);
+
+	/* because we don't want two CPUs both processing the work item for one
+	 * call at the same time, we use a flag to note when it's busy; however
+	 * this means there's a race between clearing the flag and setting the
+	 * work pending bit and the work item being processed again */
+	if (call->events && !work_pending(&call->processor)) {
+		_debug("jumpstart %x", ntohl(call->conn->cid));
+		rxrpc_queue_call(call);
+	}
+
+	_leave("");
+	return;
+
+no_mem:
+	/* the event being handled is still set, so the reschedule check will
+	 * requeue the call to try again */
+	_debug("out of memory");
+	goto maybe_reschedule;
+}
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
new file mode 100644
index 00000000..bf656c23
--- /dev/null
+++ b/net/rxrpc/ar-call.c
@@ -0,0 +1,822 @@
+/* RxRPC individual remote procedure call handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/circ_buf.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * Printable names for the call states, indexed by RXRPC_CALL_* state value.
+ * Every label is exactly eight characters long (space padded where needed),
+ * presumably so that columns line up wherever the names are printed.
+ */
+const char *const rxrpc_call_states[] = {
+	[RXRPC_CALL_CLIENT_SEND_REQUEST]	= "ClSndReq",
+	[RXRPC_CALL_CLIENT_AWAIT_REPLY]		= "ClAwtRpl",
+	[RXRPC_CALL_CLIENT_RECV_REPLY]		= "ClRcvRpl",
+	[RXRPC_CALL_CLIENT_FINAL_ACK]		= "ClFnlACK",
+	[RXRPC_CALL_SERVER_SECURING]		= "SvSecure",
+	[RXRPC_CALL_SERVER_ACCEPTING]		= "SvAccept",
+	[RXRPC_CALL_SERVER_RECV_REQUEST]	= "SvRcvReq",
+	[RXRPC_CALL_SERVER_ACK_REQUEST]		= "SvAckReq",
+	[RXRPC_CALL_SERVER_SEND_REPLY]		= "SvSndRpl",
+	[RXRPC_CALL_SERVER_AWAIT_ACK]		= "SvAwtACK",
+	[RXRPC_CALL_COMPLETE]			= "Complete",
+	[RXRPC_CALL_SERVER_BUSY]		= "SvBusy  ",
+	[RXRPC_CALL_REMOTELY_ABORTED]		= "RmtAbort",
+	[RXRPC_CALL_LOCALLY_ABORTED]		= "LocAbort",
+	[RXRPC_CALL_NETWORK_ERROR]		= "NetError",
+	[RXRPC_CALL_DEAD]			= "Dead    ",
+};
+
+/* slab cache from which call records are allocated and to which they are
+ * returned */
+struct kmem_cache *rxrpc_call_jar;
+/* list of all call records (linked through call->link); additions and
+ * removals are done under rxrpc_call_lock */
+LIST_HEAD(rxrpc_calls);
+DEFINE_RWLOCK(rxrpc_call_lock);
+/* both limits are in seconds: they are multiplied by HZ where the timers are
+ * armed */
+static unsigned rxrpc_call_max_lifetime = 60;
+static unsigned rxrpc_dead_call_timeout = 2;
+
+/* work item and timer handlers, defined further down this file */
+static void rxrpc_destroy_call(struct work_struct *work);
+static void rxrpc_call_life_expired(unsigned long _call);
+static void rxrpc_dead_call_expired(unsigned long _call);
+static void rxrpc_ack_time_expired(unsigned long _call);
+static void rxrpc_resend_time_expired(unsigned long _call);
+
+/*
+ * allocate a new call
+ * - returns a zeroed call record with one reference held, its timers, work
+ *   items, queues and locks initialised, and a 16-slot ACK window attached
+ * - the call starts in state RXRPC_CALL_CLIENT_SEND_REQUEST; callers that want
+ *   a different initial state overwrite it
+ * - returns NULL if either the record or the ACK window cannot be allocated
+ */
+static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
+{
+	struct rxrpc_call *call;
+
+	call = kmem_cache_zalloc(rxrpc_call_jar, gfp);
+	if (!call)
+		return NULL;
+
+	/* the window array itself is not zeroed (kmalloc, not kzalloc) */
+	call->acks_winsz = 16;
+	call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long),
+				    gfp);
+	if (!call->acks_window) {
+		kmem_cache_free(rxrpc_call_jar, call);
+		return NULL;
+	}
+
+	/* the timers are only prepared here; none of them is armed yet */
+	setup_timer(&call->lifetimer, &rxrpc_call_life_expired,
+		    (unsigned long) call);
+	setup_timer(&call->deadspan, &rxrpc_dead_call_expired,
+		    (unsigned long) call);
+	setup_timer(&call->ack_timer, &rxrpc_ack_time_expired,
+		    (unsigned long) call);
+	setup_timer(&call->resend_timer, &rxrpc_resend_time_expired,
+		    (unsigned long) call);
+	INIT_WORK(&call->destroyer, &rxrpc_destroy_call);
+	INIT_WORK(&call->processor, &rxrpc_process_call);
+	INIT_LIST_HEAD(&call->accept_link);
+	skb_queue_head_init(&call->rx_queue);
+	skb_queue_head_init(&call->rx_oos_queue);
+	init_waitqueue_head(&call->tx_waitq);
+	spin_lock_init(&call->lock);
+	rwlock_init(&call->state_lock);
+	atomic_set(&call->usage, 1);
+	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
+	call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
+
+	/* poison the socket tree node until the call is actually inserted */
+	memset(&call->sock_node, 0xed, sizeof(call->sock_node));
+
+	call->rx_data_expect = 1;
+	call->rx_data_eaten = 0;
+	call->rx_first_oos = 0;
+	call->ackr_win_top = call->rx_data_eaten + 1 + RXRPC_MAXACKS;
+	call->creation_jif = jiffies;
+	return call;
+}
+
+/*
+ * allocate a new client call and attempt to get a connection slot for it
+ * - takes a reference on the socket on behalf of the call
+ * - on success the call is attached to a connection, is on the peer's error
+ *   target list and has its lifetime timer running
+ * - returns ERR_PTR(-ENOMEM) if the call cannot be allocated, or the error
+ *   from rxrpc_connect_call() if no connection slot could be obtained
+ */
+static struct rxrpc_call *rxrpc_alloc_client_call(
+	struct rxrpc_sock *rx,
+	struct rxrpc_transport *trans,
+	struct rxrpc_conn_bundle *bundle,
+	gfp_t gfp)
+{
+	struct rxrpc_call *call;
+	int ret;
+
+	_enter("");
+
+	ASSERT(rx != NULL);
+	ASSERT(trans != NULL);
+	ASSERT(bundle != NULL);
+
+	call = rxrpc_alloc_call(gfp);
+	if (!call)
+		return ERR_PTR(-ENOMEM);
+
+	sock_hold(&rx->sk);
+	call->socket = rx;
+	call->rx_data_post = 1;
+
+	ret = rxrpc_connect_call(rx, trans, bundle, call, gfp);
+	if (ret < 0) {
+		/* the call was never attached to anything, so undo what was
+		 * set up above: drop the socket ref taken on its behalf and
+		 * free the ACK window that rxrpc_alloc_call() attached, as
+		 * freeing the record alone would leak both */
+		sock_put(&rx->sk);
+		kfree(call->acks_window);
+		kmem_cache_free(rxrpc_call_jar, call);
+		_leave(" = %d", ret);
+		return ERR_PTR(ret);
+	}
+
+	/* make the call a target for errors reported against the peer */
+	spin_lock(&call->conn->trans->peer->lock);
+	list_add(&call->error_link, &call->conn->trans->peer->error_targets);
+	spin_unlock(&call->conn->trans->peer->lock);
+
+	/* rxrpc_call_max_lifetime is in seconds */
+	call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ;
+	add_timer(&call->lifetimer);
+
+	_leave(" = %p", call);
+	return call;
+}
+
+/*
+ * set up a call for the given data
+ * - called in process context with IRQs enabled
+ * - looks the user call ID up in the socket's call tree; if it is absent and
+ *   both create and trans are set, a new client call is allocated, connected
+ *   and inserted
+ * - the returned call has had a reference taken for the caller
+ * - returns ERR_PTR(-EBADSLT) if the call is absent and may not be created,
+ *   or the error from rxrpc_alloc_client_call()
+ */
+struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *rx,
+					 struct rxrpc_transport *trans,
+					 struct rxrpc_conn_bundle *bundle,
+					 unsigned long user_call_ID,
+					 int create,
+					 gfp_t gfp)
+{
+	struct rxrpc_call *call, *candidate;
+	struct rb_node *p, *parent, **pp;
+
+	_enter("%p,%d,%d,%lx,%d",
+	       rx, trans ? trans->debug_id : -1, bundle ? bundle->debug_id : -1,
+	       user_call_ID, create);
+
+	/* search the extant calls first for one that matches the specified
+	 * user ID */
+	read_lock(&rx->call_lock);
+
+	p = rx->calls.rb_node;
+	while (p) {
+		call = rb_entry(p, struct rxrpc_call, sock_node);
+
+		if (user_call_ID < call->user_call_ID)
+			p = p->rb_left;
+		else if (user_call_ID > call->user_call_ID)
+			p = p->rb_right;
+		else
+			goto found_extant_call;
+	}
+
+	read_unlock(&rx->call_lock);
+
+	if (!create || !trans)
+		return ERR_PTR(-EBADSLT);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_client_call(rx, trans, bundle, gfp);
+	if (IS_ERR(candidate)) {
+		_leave(" = %ld", PTR_ERR(candidate));
+		return candidate;
+	}
+
+	/* the candidate is not yet visible to anyone else, so the non-atomic
+	 * bit operation is sufficient */
+	candidate->user_call_ID = user_call_ID;
+	__set_bit(RXRPC_CALL_HAS_USERID, &candidate->flags);
+
+	/* the lock was dropped for the allocation, so someone else may have
+	 * inserted the same ID in the meantime; search again under the write
+	 * lock, this time tracking the insertion point */
+	write_lock(&rx->call_lock);
+
+	pp = &rx->calls.rb_node;
+	parent = NULL;
+	while (*pp) {
+		parent = *pp;
+		call = rb_entry(parent, struct rxrpc_call, sock_node);
+
+		if (user_call_ID < call->user_call_ID)
+			pp = &(*pp)->rb_left;
+		else if (user_call_ID > call->user_call_ID)
+			pp = &(*pp)->rb_right;
+		else
+			goto found_extant_second;
+	}
+
+	/* second search also failed; add the new call */
+	call = candidate;
+	candidate = NULL;
+	/* extra ref for the caller; the initial ref stays with the socket */
+	rxrpc_get_call(call);
+
+	rb_link_node(&call->sock_node, parent, pp);
+	rb_insert_color(&call->sock_node, &rx->calls);
+	write_unlock(&rx->call_lock);
+
+	write_lock_bh(&rxrpc_call_lock);
+	list_add_tail(&call->link, &rxrpc_calls);
+	write_unlock_bh(&rxrpc_call_lock);
+
+	_net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+	_leave(" = %p [new]", call);
+	return call;
+
+	/* we found the call in the list immediately */
+found_extant_call:
+	rxrpc_get_call(call);
+	read_unlock(&rx->call_lock);
+	_leave(" = %p [extant %d]", call, atomic_read(&call->usage));
+	return call;
+
+	/* we found the call on the second time through the list */
+found_extant_second:
+	rxrpc_get_call(call);
+	write_unlock(&rx->call_lock);
+	/* NOTE(review): the candidate is already connected and has its
+	 * lifetime timer running, and its state is not RXRPC_CALL_DEAD, yet
+	 * __rxrpc_put_call() asserts that state when the last ref goes -
+	 * confirm this path cannot trip that assertion */
+	rxrpc_put_call(candidate);
+	_leave(" = %p [second %d]", call, atomic_read(&call->usage));
+	return call;
+}
+
+/*
+ * set up an incoming call
+ * - called in process context with IRQs enabled
+ * - a candidate call is allocated up front and is either installed on the
+ *   connection's channel and call tree or discarded again
+ * - returns the new call, or an extant call (with a ref taken) if the packet
+ *   duplicates one already bound to the channel
+ * - returns ERR_PTR(-EBUSY) if allocation fails or the channel is still
+ *   occupied by an incomplete call, ERR_PTR(-ECONNABORTED) if the matching
+ *   call has been aborted and ERR_PTR(-ECONNRESET) if the call number has
+ *   been seen before on this connection
+ * - whenever the candidate is discarded its ACK window must be freed along
+ *   with the record, since rxrpc_alloc_call() allocates the two separately
+ */
+struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
+				       struct rxrpc_connection *conn,
+				       struct rxrpc_header *hdr,
+				       gfp_t gfp)
+{
+	struct rxrpc_call *call, *candidate;
+	struct rb_node **p, *parent;
+	__be32 call_id;
+
+	_enter(",%d,,%x", conn->debug_id, gfp);
+
+	ASSERT(rx != NULL);
+
+	candidate = rxrpc_alloc_call(gfp);
+	if (!candidate)
+		return ERR_PTR(-EBUSY);
+
+	candidate->socket = rx;
+	candidate->conn = conn;
+	candidate->cid = hdr->cid;
+	candidate->call_id = hdr->callNumber;
+	candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK;
+	candidate->rx_data_post = 0;
+	/* a secured connection must finish securing before accepting */
+	candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
+	if (conn->security_ix > 0)
+		candidate->state = RXRPC_CALL_SERVER_SECURING;
+
+	write_lock_bh(&conn->lock);
+
+	/* set the channel for this call */
+	call = conn->channels[candidate->channel];
+	_debug("channel[%u] is %p", candidate->channel, call);
+	if (call && call->call_id == hdr->callNumber) {
+		/* already set; must've been a duplicate packet */
+		_debug("extant call [%d]", call->state);
+		ASSERTCMP(call->conn, ==, conn);
+
+		read_lock(&call->state_lock);
+		switch (call->state) {
+		case RXRPC_CALL_LOCALLY_ABORTED:
+			if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+				rxrpc_queue_call(call);
+			/* fall through */
+		case RXRPC_CALL_REMOTELY_ABORTED:
+			read_unlock(&call->state_lock);
+			goto aborted_call;
+		default:
+			rxrpc_get_call(call);
+			read_unlock(&call->state_lock);
+			goto extant_call;
+		}
+	}
+
+	if (call) {
+		/* it seems the channel is still in use from the previous call
+		 * - ditch the old binding if its call is now complete */
+		_debug("CALL: %u { %s }",
+		       call->debug_id, rxrpc_call_states[call->state]);
+
+		if (call->state >= RXRPC_CALL_COMPLETE) {
+			conn->channels[call->channel] = NULL;
+		} else {
+			write_unlock_bh(&conn->lock);
+			kfree(candidate->acks_window);
+			kmem_cache_free(rxrpc_call_jar, candidate);
+			_leave(" = -EBUSY");
+			return ERR_PTR(-EBUSY);
+		}
+	}
+
+	/* check the call number isn't duplicate */
+	_debug("check dup");
+	call_id = hdr->callNumber;
+	p = &conn->calls.rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		call = rb_entry(parent, struct rxrpc_call, conn_node);
+
+		if (call_id < call->call_id)
+			p = &(*p)->rb_left;
+		else if (call_id > call->call_id)
+			p = &(*p)->rb_right;
+		else
+			goto old_call;
+	}
+
+	/* make the call available */
+	_debug("new call");
+	call = candidate;
+	candidate = NULL;
+	rb_link_node(&call->conn_node, parent, p);
+	rb_insert_color(&call->conn_node, &conn->calls);
+	conn->channels[call->channel] = call;
+	/* the call now pins both its socket and its connection */
+	sock_hold(&rx->sk);
+	atomic_inc(&conn->usage);
+	write_unlock_bh(&conn->lock);
+
+	spin_lock(&conn->trans->peer->lock);
+	list_add(&call->error_link, &conn->trans->peer->error_targets);
+	spin_unlock(&conn->trans->peer->lock);
+
+	write_lock_bh(&rxrpc_call_lock);
+	list_add_tail(&call->link, &rxrpc_calls);
+	write_unlock_bh(&rxrpc_call_lock);
+
+	_net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
+
+	call->lifetimer.expires = jiffies + rxrpc_call_max_lifetime * HZ;
+	add_timer(&call->lifetimer);
+	_leave(" = %p {%d} [new]", call, call->debug_id);
+	return call;
+
+extant_call:
+	write_unlock_bh(&conn->lock);
+	kfree(candidate->acks_window);
+	kmem_cache_free(rxrpc_call_jar, candidate);
+	_leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1);
+	return call;
+
+aborted_call:
+	write_unlock_bh(&conn->lock);
+	kfree(candidate->acks_window);
+	kmem_cache_free(rxrpc_call_jar, candidate);
+	_leave(" = -ECONNABORTED");
+	return ERR_PTR(-ECONNABORTED);
+
+old_call:
+	write_unlock_bh(&conn->lock);
+	kfree(candidate->acks_window);
+	kmem_cache_free(rxrpc_call_jar, candidate);
+	_leave(" = -ECONNRESET [old]");
+	return ERR_PTR(-ECONNRESET);
+}
+
+/*
+ * look up a server call on a socket by its user call ID
+ * - called in process context with IRQs enabled
+ * - returns the call with a reference taken for the caller, or NULL if the
+ *   socket has no call with that ID
+ */
+struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *rx,
+					  unsigned long user_call_ID)
+{
+	struct rxrpc_call *call = NULL;
+	struct rb_node *node;
+
+	_enter("%p,%lx", rx, user_call_ID);
+
+	read_lock(&rx->call_lock);
+
+	/* walk down the socket's call tree, which is keyed on user call ID */
+	for (node = rx->calls.rb_node; node; ) {
+		struct rxrpc_call *this =
+			rb_entry(node, struct rxrpc_call, sock_node);
+
+		if (user_call_ID == this->user_call_ID) {
+			call = this;
+			break;
+		}
+
+		node = (user_call_ID < this->user_call_ID) ?
+			node->rb_left : node->rb_right;
+	}
+
+	if (!call) {
+		read_unlock(&rx->call_lock);
+		_leave(" = NULL");
+		return NULL;
+	}
+
+	/* take the caller's ref before letting go of the tree lock */
+	rxrpc_get_call(call);
+	read_unlock(&rx->call_lock);
+	_leave(" = %p [%d]", call, atomic_read(&call->usage));
+	return call;
+}
+
+/*
+ * detach a call from a socket and set up for release
+ * - marks the call released (it is a BUG to release a call twice)
+ * - unlinks it from the socket's accept queue or user-ID tree
+ * - frees its channel on the connection and, for client connections in a
+ *   bundle, updates the bundle's availability lists
+ * - aborts the call if it is still in progress
+ * - discards anything left on the Rx queues, stops the resend, ACK and
+ *   lifetime timers and starts the dead-call timer
+ */
+void rxrpc_release_call(struct rxrpc_call *call)
+{
+	struct rxrpc_connection *conn = call->conn;
+	struct rxrpc_sock *rx = call->socket;
+
+	_enter("{%d,%d,%d,%d}",
+	       call->debug_id, atomic_read(&call->usage),
+	       atomic_read(&call->ackr_not_idle),
+	       call->rx_first_oos);
+
+	spin_lock_bh(&call->lock);
+	if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags))
+		BUG();
+	spin_unlock_bh(&call->lock);
+
+	/* dissociate from the socket
+	 * - the socket's ref on the call is passed to the death timer
+	 */
+	_debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
+
+	write_lock_bh(&rx->call_lock);
+	if (!list_empty(&call->accept_link)) {
+		/* still awaiting acceptance, so it never got a user ID */
+		_debug("unlinking once-pending call %p { e=%lx f=%lx }",
+		       call, call->events, call->flags);
+		ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+		list_del_init(&call->accept_link);
+		sk_acceptq_removed(&rx->sk);
+	} else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+		rb_erase(&call->sock_node, &rx->calls);
+		/* poison the node now that it is out of the tree */
+		memset(&call->sock_node, 0xdd, sizeof(call->sock_node));
+		clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+	}
+	write_unlock_bh(&rx->call_lock);
+
+	/* free up the channel for reuse */
+	spin_lock(&conn->trans->client_lock);
+	write_lock_bh(&conn->lock);
+	write_lock(&call->state_lock);
+
+	if (conn->channels[call->channel] == call)
+		conn->channels[call->channel] = NULL;
+
+	if (conn->out_clientflag && conn->bundle) {
+		conn->avail_calls++;
+		switch (conn->avail_calls) {
+		case 1:
+			/* first free channel: the connection becomes
+			 * available again */
+			list_move_tail(&conn->bundle_link,
+				       &conn->bundle->avail_conns);
+			/* fall through */
+		case 2 ... RXRPC_MAXCALLS - 1:
+			ASSERT(conn->channels[0] == NULL ||
+			       conn->channels[1] == NULL ||
+			       conn->channels[2] == NULL ||
+			       conn->channels[3] == NULL);
+			break;
+		case RXRPC_MAXCALLS:
+			/* every channel is free: the connection is unused */
+			list_move_tail(&conn->bundle_link,
+				       &conn->bundle->unused_conns);
+			ASSERT(conn->channels[0] == NULL &&
+			       conn->channels[1] == NULL &&
+			       conn->channels[2] == NULL &&
+			       conn->channels[3] == NULL);
+			break;
+		default:
+			printk(KERN_ERR "RxRPC: conn->avail_calls=%d\n",
+			       conn->avail_calls);
+			BUG();
+		}
+	}
+
+	/* the client lock is dropped before the two locks taken after it */
+	spin_unlock(&conn->trans->client_lock);
+
+	/* abort the call if it has not completed and is not merely waiting
+	 * to send its final ACK */
+	if (call->state < RXRPC_CALL_COMPLETE &&
+	    call->state != RXRPC_CALL_CLIENT_FINAL_ACK) {
+		_debug("+++ ABORTING STATE %d +++\n", call->state);
+		call->state = RXRPC_CALL_LOCALLY_ABORTED;
+		call->abort_code = RX_CALL_DEAD;
+		set_bit(RXRPC_CALL_ABORT, &call->events);
+		rxrpc_queue_call(call);
+	}
+	write_unlock(&call->state_lock);
+	write_unlock_bh(&conn->lock);
+
+	/* clean up the Rx queue */
+	if (!skb_queue_empty(&call->rx_queue) ||
+	    !skb_queue_empty(&call->rx_oos_queue)) {
+		struct rxrpc_skb_priv *sp;
+		struct sk_buff *skb;
+
+		_debug("purge Rx queues");
+
+		spin_lock_bh(&call->lock);
+		while ((skb = skb_dequeue(&call->rx_queue)) ||
+		       (skb = skb_dequeue(&call->rx_oos_queue))) {
+			sp = rxrpc_skb(skb);
+			if (sp->call) {
+				/* drop the ref the packet held on the call */
+				ASSERTCMP(sp->call, ==, call);
+				rxrpc_put_call(call);
+				sp->call = NULL;
+			}
+			skb->destructor = NULL;
+			/* the call lock is released around freeing each
+			 * packet and retaken for the next dequeue */
+			spin_unlock_bh(&call->lock);
+
+			_debug("- zap %s %%%u #%u",
+			       rxrpc_pkts[sp->hdr.type],
+			       ntohl(sp->hdr.serial),
+			       ntohl(sp->hdr.seq));
+			rxrpc_free_skb(skb);
+			spin_lock_bh(&call->lock);
+		}
+		spin_unlock_bh(&call->lock);
+
+		ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE);
+	}
+
+	del_timer_sync(&call->resend_timer);
+	del_timer_sync(&call->ack_timer);
+	del_timer_sync(&call->lifetimer);
+	/* rxrpc_dead_call_timeout is in seconds */
+	call->deadspan.expires = jiffies + rxrpc_dead_call_timeout * HZ;
+	add_timer(&call->deadspan);
+
+	_leave("");
+}
+
+/*
+ * handle a dead call being ready for reaping
+ * - handler for the call's deadspan timer; _call is the call pointer that
+ *   was cast to unsigned long when the timer was set up
+ * - marks the call dead and drops one reference on it
+ */
+static void rxrpc_dead_call_expired(unsigned long _call)
+{
+	struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+	_enter("{%d}", call->debug_id);
+
+	write_lock_bh(&call->state_lock);
+	call->state = RXRPC_CALL_DEAD;
+	write_unlock_bh(&call->state_lock);
+	/* the state must already be DEAD here: __rxrpc_put_call() asserts it
+	 * if this turns out to be the last reference */
+	rxrpc_put_call(call);
+}
+
+/*
+ * flag a call for release, aborting it first if it has not yet completed
+ * - called with softirqs disabled
+ * - calls that are already dead are left untouched
+ * - the call is queued for processing only if this invocation was the one to
+ *   raise the abort or release event
+ */
+static void rxrpc_mark_call_released(struct rxrpc_call *call)
+{
+	bool need_queue = false;
+
+	write_lock(&call->state_lock);
+
+	if (call->state >= RXRPC_CALL_DEAD)
+		goto out;
+
+	if (call->state < RXRPC_CALL_COMPLETE) {
+		_debug("abort call %p", call);
+		call->state = RXRPC_CALL_LOCALLY_ABORTED;
+		call->abort_code = RX_CALL_DEAD;
+		need_queue = !test_and_set_bit(RXRPC_CALL_ABORT,
+					       &call->events);
+	}
+
+	if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+		need_queue = true;
+
+	if (need_queue)
+		rxrpc_queue_call(call);
+
+out:
+	write_unlock(&call->state_lock);
+}
+
+/*
+ * release all the calls associated with a socket
+ * - every call in the socket's user-ID tree and on its secureq and acceptq
+ *   lists is marked for release (and aborted if still in progress) by
+ *   rxrpc_mark_call_released()
+ * - nothing is unlinked here; the lists and the tree are only read
+ */
+void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
+{
+	struct rxrpc_call *call;
+	struct rb_node *p;
+
+	_enter("%p", rx);
+
+	/* taking the lock with _bh also provides the softirq-disabled context
+	 * that rxrpc_mark_call_released() requires */
+	read_lock_bh(&rx->call_lock);
+
+	/* mark all the calls as no longer wanting incoming packets */
+	for (p = rb_first(&rx->calls); p; p = rb_next(p)) {
+		call = rb_entry(p, struct rxrpc_call, sock_node);
+		rxrpc_mark_call_released(call);
+	}
+
+	/* kill the not-yet-accepted incoming calls */
+	list_for_each_entry(call, &rx->secureq, accept_link) {
+		rxrpc_mark_call_released(call);
+	}
+
+	list_for_each_entry(call, &rx->acceptq, accept_link) {
+		rxrpc_mark_call_released(call);
+	}
+
+	read_unlock_bh(&rx->call_lock);
+	_leave("");
+}
+
+/*
+ * drop a reference on a call
+ * - when the last reference goes, the call must already be in the DEAD state
+ *   and its destroyer work item is queued to tear it down
+ */
+void __rxrpc_put_call(struct rxrpc_call *call)
+{
+	ASSERT(call != NULL);
+
+	_enter("%p{u=%d}", call, atomic_read(&call->usage));
+
+	ASSERTCMP(atomic_read(&call->usage), >, 0);
+
+	/* nothing more to do unless that was the final reference */
+	if (!atomic_dec_and_test(&call->usage))
+		goto out;
+
+	_debug("call %d dead", call->debug_id);
+	ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+	rxrpc_queue_work(&call->destroyer);
+
+out:
+	_leave("");
+}
+
+/*
+ * clean up a call
+ * - stops all four call timers, detaches the call from its connection and
+ *   peer, frees whatever is still held in the Tx window, the pending Tx
+ *   packet and the Rx queue, drops the socket reference and finally frees the
+ *   call record
+ * - if the call's processor work item is still pending, destruction is
+ *   deferred by requeueing the destroyer work item
+ */
+static void rxrpc_cleanup_call(struct rxrpc_call *call)
+{
+	_net("DESTROY CALL %d", call->debug_id);
+
+	ASSERT(call->socket);
+
+	/* poison the socket tree node */
+	memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
+
+	del_timer_sync(&call->lifetimer);
+	del_timer_sync(&call->deadspan);
+	del_timer_sync(&call->ack_timer);
+	del_timer_sync(&call->resend_timer);
+
+	ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
+	ASSERTCMP(call->events, ==, 0);
+	if (work_pending(&call->processor)) {
+		/* try again later rather than freeing the call from under the
+		 * pending work item */
+		_debug("defer destroy");
+		rxrpc_queue_work(&call->destroyer);
+		return;
+	}
+
+	if (call->conn) {
+		spin_lock(&call->conn->trans->peer->lock);
+		list_del(&call->error_link);
+		spin_unlock(&call->conn->trans->peer->lock);
+
+		write_lock_bh(&call->conn->lock);
+		rb_erase(&call->conn_node, &call->conn->calls);
+		write_unlock_bh(&call->conn->lock);
+		rxrpc_put_connection(call->conn);
+	}
+
+	if (call->acks_window) {
+		_debug("kill Tx window %d",
+		       CIRC_CNT(call->acks_head, call->acks_tail,
+				call->acks_winsz));
+		smp_mb();
+		/* free every packet still sitting in the circular window */
+		while (CIRC_CNT(call->acks_head, call->acks_tail,
+				call->acks_winsz) > 0) {
+			struct rxrpc_skb_priv *sp;
+			unsigned long _skb;
+
+			/* each slot holds an skb pointer; the bottom bit is
+			 * masked off before use, presumably because it serves
+			 * as a flag elsewhere - confirm against the ACK code */
+			_skb = call->acks_window[call->acks_tail] & ~1;
+			sp = rxrpc_skb((struct sk_buff *) _skb);
+			_debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+			rxrpc_free_skb((struct sk_buff *) _skb);
+			/* the masking relies on acks_winsz being a power of
+			 * two */
+			call->acks_tail =
+				(call->acks_tail + 1) & (call->acks_winsz - 1);
+		}
+
+		kfree(call->acks_window);
+	}
+
+	rxrpc_free_skb(call->tx_pending);
+
+	rxrpc_purge_queue(&call->rx_queue);
+	ASSERT(skb_queue_empty(&call->rx_oos_queue));
+	sock_put(&call->socket->sk);
+	kmem_cache_free(rxrpc_call_jar, call);
+}
+
+/*
+ * destroy a call
+ * - work item handler for call->destroyer, queued by __rxrpc_put_call() when
+ *   the last reference is dropped (and requeued by rxrpc_cleanup_call() if
+ *   destruction has to be deferred)
+ * - removes the call from the global call list and then cleans it up
+ */
+static void rxrpc_destroy_call(struct work_struct *work)
+{
+	struct rxrpc_call *call =
+		container_of(work, struct rxrpc_call, destroyer);
+
+	_enter("%p{%d,%d,%p}",
+	       call, atomic_read(&call->usage), call->channel, call->conn);
+
+	ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+
+	/* list_del_init() leaves the link self-pointing, so running this
+	 * again after a deferred cleanup is harmless */
+	write_lock_bh(&rxrpc_call_lock);
+	list_del_init(&call->link);
+	write_unlock_bh(&rxrpc_call_lock);
+
+	rxrpc_cleanup_call(call);
+	_leave("");
+}
+
+/*
+ * preemptively destroy all the call records from a transport endpoint rather
+ * than waiting for them to time out
+ * - module exit path: walks the global call list, unlinking each call and
+ *   complaining about any that still appear to be in use
+ * - the list lock is dropped and retaken around cond_resched() after every
+ *   call so that the loop can yield
+ */
+void __exit rxrpc_destroy_all_calls(void)
+{
+	struct rxrpc_call *call;
+
+	_enter("");
+	write_lock_bh(&rxrpc_call_lock);
+
+	while (!list_empty(&rxrpc_calls)) {
+		call = list_entry(rxrpc_calls.next, struct rxrpc_call, link);
+		_debug("Zapping call %p", call);
+
+		list_del_init(&call->link);
+
+		switch (atomic_read(&call->usage)) {
+		case 0:
+			ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD);
+			break;
+		case 1:
+			/* if the dead-call timer was still pending, run its
+			 * handler by hand to expire the call right now */
+			if (del_timer_sync(&call->deadspan) != 0 &&
+			    call->state != RXRPC_CALL_DEAD)
+				rxrpc_dead_call_expired((unsigned long) call);
+			if (call->state != RXRPC_CALL_DEAD)
+				break;
+			/* NOTE(review): a call that IS dead at this point
+			 * falls through into the "still in use" report below,
+			 * while one that is not dead skips it - confirm that
+			 * this is the intended sense of the test */
+		default:
+			printk(KERN_ERR "RXRPC:"
+			       " Call %p still in use (%d,%d,%s,%lx,%lx)!\n",
+			       call, atomic_read(&call->usage),
+			       atomic_read(&call->ackr_not_idle),
+			       rxrpc_call_states[call->state],
+			       call->flags, call->events);
+			if (!skb_queue_empty(&call->rx_queue))
+				printk(KERN_ERR"RXRPC: Rx queue occupied\n");
+			if (!skb_queue_empty(&call->rx_oos_queue))
+				printk(KERN_ERR"RXRPC: OOS queue occupied\n");
+			break;
+		}
+
+		write_unlock_bh(&rxrpc_call_lock);
+		cond_resched();
+		write_lock_bh(&rxrpc_call_lock);
+	}
+
+	write_unlock_bh(&rxrpc_call_lock);
+	_leave("");
+}
+
+/*
+ * handle call lifetime being exceeded
+ * - handler for the call's lifetimer; _call is the call pointer that was cast
+ *   to unsigned long when the timer was set up
+ * - raises the LIFE_TIMER event and queues the call for processing unless the
+ *   call has already completed
+ */
+static void rxrpc_call_life_expired(unsigned long _call)
+{
+	struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+	/* unlocked early-out; the state is tested again under the lock */
+	if (call->state >= RXRPC_CALL_COMPLETE)
+		return;
+
+	_enter("{%d}", call->debug_id);
+	read_lock_bh(&call->state_lock);
+	if (call->state < RXRPC_CALL_COMPLETE) {
+		set_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+		rxrpc_queue_call(call);
+	}
+	read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * handle resend timer expiry
+ * - may not take call->state_lock as this can deadlock against del_timer_sync()
+ * - handler for the call's resend_timer; _call is the call pointer that was
+ *   cast to unsigned long when the timer was set up
+ * - does nothing once the call has completed; otherwise notes that the timer
+ *   is no longer running and raises the RESEND_TIMER event
+ */
+static void rxrpc_resend_time_expired(unsigned long _call)
+{
+	struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+	_enter("{%d}", call->debug_id);
+
+	/* the state is read without the lock, for the reason given above */
+	if (call->state >= RXRPC_CALL_COMPLETE)
+		return;
+
+	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+	/* queue the call only if the event was not already pending */
+	if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+		rxrpc_queue_call(call);
+}
+
+/*
+ * handle ACK timer expiry
+ * - handler for the call's ack_timer; _call is the call pointer that was cast
+ *   to unsigned long when the timer was set up
+ * - raises the ACK event and queues the call for processing unless the call
+ *   has completed or the event is already pending
+ */
+static void rxrpc_ack_time_expired(unsigned long _call)
+{
+	struct rxrpc_call *call = (struct rxrpc_call *) _call;
+
+	_enter("{%d}", call->debug_id);
+
+	/* unlocked early-out; the state is tested again under the lock */
+	if (call->state >= RXRPC_CALL_COMPLETE)
+		return;
+
+	read_lock_bh(&call->state_lock);
+	if (call->state < RXRPC_CALL_COMPLETE &&
+	    !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+		rxrpc_queue_call(call);
+	read_unlock_bh(&call->state_lock);
+}
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
new file mode 100644
index 00000000..4106ca95
--- /dev/null
+++ b/net/rxrpc/ar-connection.c
@@ -0,0 +1,922 @@
+/* RxRPC virtual connection handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/* handler for the delayed reap work item declared below */
+static void rxrpc_connection_reaper(struct work_struct *work);
+
+/* list of all connection records (linked through conn->link); additions are
+ * made under rxrpc_connection_lock */
+LIST_HEAD(rxrpc_connections);
+DEFINE_RWLOCK(rxrpc_connection_lock);
+/* presumably in seconds (i.e. ten minutes) - confirm at the point of use */
+static unsigned long rxrpc_connection_timeout = 10 * 60;
+static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper);
+
+/*
+ * allocate and initialise an empty client connection bundle
+ * - the bundle is returned zeroed apart from its three connection lists, its
+ *   channel wait queue and a usage count of one
+ * - returns NULL if the allocation fails
+ */
+static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
+{
+	struct rxrpc_conn_bundle *bundle;
+
+	_enter("");
+
+	bundle = kzalloc(sizeof(*bundle), gfp);
+	if (!bundle)
+		goto out;
+
+	atomic_set(&bundle->usage, 1);
+	init_waitqueue_head(&bundle->chanwait);
+	INIT_LIST_HEAD(&bundle->unused_conns);
+	INIT_LIST_HEAD(&bundle->avail_conns);
+	INIT_LIST_HEAD(&bundle->busy_conns);
+
+out:
+	_leave(" = %p", bundle);
+	return bundle;
+}
+
+/*
+ * compare bundle parameters with what we're looking for
+ * - return -ve, 0 or +ve
+ * - bundles are ordered by service ID first and then by key address
+ * - the key addresses are compared rather than subtracted: squeezing an
+ *   unsigned long difference into the int return value can yield zero for two
+ *   different keys on 64-bit, or the wrong sign, either of which would
+ *   corrupt lookups in the bundle tree
+ */
+static inline
+int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
+		     struct key *key, __be16 service_id)
+{
+	unsigned long bundle_key = (unsigned long) bundle->key;
+	unsigned long wanted_key = (unsigned long) key;
+
+	if (bundle->service_id != service_id)
+		return (bundle->service_id < service_id) ? -1 : 1;
+
+	if (bundle_key != wanted_key)
+		return (bundle_key < wanted_key) ? -1 : 1;
+
+	return 0;
+}
+
+/*
+ * get bundle of client connections that a client socket can make use of
+ * - if the socket already has a bundle cached for this transport, that
+ *   bundle is returned directly
+ * - otherwise the transport's bundle tree is searched for one matching the
+ *   key and service ID, and a new bundle is created and inserted if none
+ *   exists
+ * - the returned bundle carries a reference for the caller; a second
+ *   reference is taken if the bundle gets cached on a connected client socket
+ * - returns ERR_PTR(-ENOMEM) if a new bundle cannot be allocated
+ */
+struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
+					   struct rxrpc_transport *trans,
+					   struct key *key,
+					   __be16 service_id,
+					   gfp_t gfp)
+{
+	struct rxrpc_conn_bundle *bundle, *candidate;
+	struct rb_node *p, *parent, **pp;
+	int cmp;
+
+	_enter("%p{%x},%x,%hx,",
+	       rx, key_serial(key), trans->debug_id, ntohs(service_id));
+
+	if (rx->trans == trans && rx->bundle) {
+		atomic_inc(&rx->bundle->usage);
+		return rx->bundle;
+	}
+
+	/* search the extant bundles first for one that matches the specified
+	 * user ID */
+	spin_lock(&trans->client_lock);
+
+	p = trans->bundles.rb_node;
+	while (p) {
+		bundle = rb_entry(p, struct rxrpc_conn_bundle, node);
+
+		cmp = rxrpc_cmp_bundle(bundle, key, service_id);
+		if (cmp < 0)
+			p = p->rb_left;
+		else if (cmp > 0)
+			p = p->rb_right;
+		else
+			goto found_extant_bundle;
+	}
+
+	spin_unlock(&trans->client_lock);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_bundle(gfp);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* the candidate holds its own reference on the key from here on */
+	candidate->key = key_get(key);
+	candidate->service_id = service_id;
+
+	/* the lock was dropped for the allocation, so search again, this time
+	 * tracking the insertion point */
+	spin_lock(&trans->client_lock);
+
+	pp = &trans->bundles.rb_node;
+	parent = NULL;
+	while (*pp) {
+		parent = *pp;
+		bundle = rb_entry(parent, struct rxrpc_conn_bundle, node);
+
+		cmp = rxrpc_cmp_bundle(bundle, key, service_id);
+		if (cmp < 0)
+			pp = &(*pp)->rb_left;
+		else if (cmp > 0)
+			pp = &(*pp)->rb_right;
+		else
+			goto found_extant_second;
+	}
+
+	/* second search also failed; add the new bundle */
+	bundle = candidate;
+	candidate = NULL;
+
+	rb_link_node(&bundle->node, parent, pp);
+	rb_insert_color(&bundle->node, &trans->bundles);
+	spin_unlock(&trans->client_lock);
+	_net("BUNDLE new on trans %d", trans->debug_id);
+	if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+		atomic_inc(&bundle->usage);
+		rx->bundle = bundle;
+	}
+	_leave(" = %p [new]", bundle);
+	return bundle;
+
+	/* we found the bundle in the list immediately */
+found_extant_bundle:
+	atomic_inc(&bundle->usage);
+	spin_unlock(&trans->client_lock);
+	_net("BUNDLE old on trans %d", trans->debug_id);
+	if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+		atomic_inc(&bundle->usage);
+		rx->bundle = bundle;
+	}
+	_leave(" = %p [extant %d]", bundle, atomic_read(&bundle->usage));
+	return bundle;
+
+	/* we found the bundle on the second time through the list */
+found_extant_second:
+	atomic_inc(&bundle->usage);
+	spin_unlock(&trans->client_lock);
+	/* the unused candidate still owns the key ref taken above; drop it
+	 * before freeing the candidate or the key is leaked */
+	key_put(candidate->key);
+	kfree(candidate);
+	_net("BUNDLE old2 on trans %d", trans->debug_id);
+	if (!rx->bundle && rx->sk.sk_state == RXRPC_CLIENT_CONNECTED) {
+		atomic_inc(&bundle->usage);
+		rx->bundle = bundle;
+	}
+	_leave(" = %p [second %d]", bundle, atomic_read(&bundle->usage));
+	return bundle;
+}
+
+/*
+ * drop a reference on a bundle
+ * - when the last reference goes, the bundle is removed from the transport's
+ *   bundle tree, its key is released and the bundle is freed
+ * - the bundle must have no connections left on any of its lists by then
+ */
+void rxrpc_put_bundle(struct rxrpc_transport *trans,
+		      struct rxrpc_conn_bundle *bundle)
+{
+	_enter("%p,%p{%d}",trans, bundle, atomic_read(&bundle->usage));
+
+	/* the client lock is only taken if the count actually reaches zero */
+	if (!atomic_dec_and_lock(&bundle->usage, &trans->client_lock))
+		goto out;
+
+	_debug("Destroy bundle");
+	rb_erase(&bundle->node, &trans->bundles);
+	spin_unlock(&trans->client_lock);
+
+	ASSERT(list_empty(&bundle->unused_conns));
+	ASSERT(list_empty(&bundle->avail_conns));
+	ASSERT(list_empty(&bundle->busy_conns));
+	ASSERTCMP(bundle->num_conns, ==, 0);
+
+	key_put(bundle->key);
+	kfree(bundle);
+
+out:
+	_leave("");
+}
+
+/*
+ * allocate and initialise a blank connection record
+ * - the connection is returned zeroed apart from its work item, list head,
+ *   call tree, Rx queue, locks, debug ID and default sizes; it carries one
+ *   reference and has all RXRPC_MAXCALLS channels marked available
+ * - returns NULL if the allocation fails
+ */
+static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
+{
+	struct rxrpc_connection *conn;
+
+	_enter("");
+
+	conn = kzalloc(sizeof(*conn), gfp);
+	if (!conn) {
+		_leave(" = %p{%d}", conn, 0);
+		return NULL;
+	}
+
+	INIT_WORK(&conn->processor, &rxrpc_process_connection);
+	INIT_LIST_HEAD(&conn->bundle_link);
+	conn->calls = RB_ROOT;
+	skb_queue_head_init(&conn->rx_queue);
+	rwlock_init(&conn->lock);
+	spin_lock_init(&conn->state_lock);
+	atomic_set(&conn->usage, 1);
+	conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
+	conn->avail_calls = RXRPC_MAXCALLS;
+	conn->size_align = 4;
+	conn->header_size = sizeof(struct rxrpc_header);
+
+	_leave(" = %p{%d}", conn, conn->debug_id);
+	return conn;
+}
+
+/*
+ * assign a connection ID to a connection and add it to the transport's
+ * connection lookup tree
+ * - called with transport client lock held
+ * - the transport's client connection tree is ordered by epoch and then by
+ *   real connection ID
+ * - IDs are handed out in steps of RXRPC_CID_INC from a per-transport
+ *   counter; if the proposed ID is already present in the tree, later IDs are
+ *   tried until an unused one is found
+ */
+static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
+{
+	struct rxrpc_connection *xconn;
+	struct rb_node *parent, **p;
+	__be32 epoch;
+	u32 real_conn_id;
+
+	_enter("");
+
+	epoch = conn->epoch;
+
+	write_lock_bh(&conn->trans->conn_lock);
+
+	/* advance the counter; a result below RXRPC_CID_INC means it wrapped
+	 * (or was never set), so restart at the first valid ID */
+	conn->trans->conn_idcounter += RXRPC_CID_INC;
+	if (conn->trans->conn_idcounter < RXRPC_CID_INC)
+		conn->trans->conn_idcounter = RXRPC_CID_INC;
+	real_conn_id = conn->trans->conn_idcounter;
+
+attempt_insertion:
+	parent = NULL;
+	p = &conn->trans->client_conns.rb_node;
+
+	/* the epoch is compared in its stored (__be32) form, which is enough
+	 * to give the tree a consistent ordering */
+	while (*p) {
+		parent = *p;
+		xconn = rb_entry(parent, struct rxrpc_connection, node);
+
+		if (epoch < xconn->epoch)
+			p = &(*p)->rb_left;
+		else if (epoch > xconn->epoch)
+			p = &(*p)->rb_right;
+		else if (real_conn_id < xconn->real_conn_id)
+			p = &(*p)->rb_left;
+		else if (real_conn_id > xconn->real_conn_id)
+			p = &(*p)->rb_right;
+		else
+			goto id_exists;
+	}
+
+	/* we've found a suitable hole - arrange for this connection to occupy
+	 * it */
+	rb_link_node(&conn->node, parent, p);
+	rb_insert_color(&conn->node, &conn->trans->client_conns);
+
+	/* the ID is kept both in host order and, as cid, in network order */
+	conn->real_conn_id = real_conn_id;
+	conn->cid = htonl(real_conn_id);
+	write_unlock_bh(&conn->trans->conn_lock);
+	_leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid));
+	return;
+
+	/* we found a connection with the proposed ID - walk the tree from that
+	 * point looking for the next unused ID */
+id_exists:
+	for (;;) {
+		real_conn_id += RXRPC_CID_INC;
+		if (real_conn_id < RXRPC_CID_INC) {
+			/* the ID wrapped: restart from the first valid ID and
+			 * reset the transport's counter to match */
+			real_conn_id = RXRPC_CID_INC;
+			conn->trans->conn_idcounter = real_conn_id;
+			goto attempt_insertion;
+		}
+
+		/* running off the end of the tree means the ID is free */
+		parent = rb_next(parent);
+		if (!parent)
+			goto attempt_insertion;
+
+		/* a gap before the next node also means the ID is free;
+		 * otherwise keep stepping */
+		xconn = rb_entry(parent, struct rxrpc_connection, node);
+		if (epoch < xconn->epoch ||
+		    real_conn_id < xconn->real_conn_id)
+			goto attempt_insertion;
+	}
+}
+
+/*
+ * insert a call into its connection's tree of calls, keyed on call ID
+ * - it is a BUG for the connection to already hold a call with the same ID
+ */
+static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
+				      struct rxrpc_call *call)
+{
+	struct rb_node *parent = NULL, **link;
+	__be32 call_id;
+
+	write_lock_bh(&conn->lock);
+
+	call_id = call->call_id;
+
+	/* descend to the leaf position at which the new call belongs */
+	for (link = &conn->calls.rb_node; *link; ) {
+		struct rxrpc_call *xcall;
+
+		parent = *link;
+		xcall = rb_entry(parent, struct rxrpc_call, conn_node);
+
+		if (call_id == xcall->call_id)
+			BUG();
+
+		link = (call_id < xcall->call_id) ?
+			&parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&call->conn_node, parent, link);
+	rb_insert_color(&call->conn_node, &conn->calls);
+
+	write_unlock_bh(&conn->lock);
+}
+
+/*
+ * connect a call on an exclusive connection
+ * - the socket's private connection is used, being created, secured,
+ *   assigned a connection ID and cached on the socket first if it does not
+ *   exist yet
+ * - the call is bound to the first free channel on that connection and is
+ *   given the next call number
+ * - returns 0 on success, -ENOMEM if a connection cannot be allocated, the
+ *   error from rxrpc_init_client_conn_security() if securing it fails, or
+ *   -ENOSR if every channel is in use
+ */
+static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
+				   struct rxrpc_transport *trans,
+				   __be16 service_id,
+				   struct rxrpc_call *call,
+				   gfp_t gfp)
+{
+	struct rxrpc_connection *conn;
+	int chan, ret;
+
+	_enter("");
+
+	conn = rx->conn;
+	if (!conn) {
+		/* not yet present - create a candidate for a new connection
+		 * and then redo the check */
+		conn = rxrpc_alloc_connection(gfp);
+		if (!conn) {
+			_leave(" = -ENOMEM");
+			return -ENOMEM;
+		}
+
+		conn->trans = trans;
+		conn->bundle = NULL;
+		conn->service_id = service_id;
+		conn->epoch = rxrpc_epoch;
+		conn->in_clientflag = 0;
+		conn->out_clientflag = RXRPC_CLIENT_INITIATED;
+		conn->cid = 0;
+		conn->state = RXRPC_CONN_CLIENT;
+		conn->avail_calls = RXRPC_MAXCALLS - 1;
+		conn->security_level = rx->min_sec_level;
+		conn->key = key_get(rx->key);
+
+		ret = rxrpc_init_client_conn_security(conn);
+		if (ret < 0) {
+			key_put(conn->key);
+			kfree(conn);
+			_leave(" = %d [key]", ret);
+			return ret;
+		}
+
+		write_lock_bh(&rxrpc_connection_lock);
+		list_add_tail(&conn->link, &rxrpc_connections);
+		write_unlock_bh(&rxrpc_connection_lock);
+
+		spin_lock(&trans->client_lock);
+		/* the connection pins its transport */
+		atomic_inc(&trans->usage);
+
+		_net("CONNECT EXCL new %d on TRANS %d",
+		     conn->debug_id, conn->trans->debug_id);
+
+		rxrpc_assign_connection_id(conn);
+		rx->conn = conn;
+	} else {
+		/* both exit paths below release the client lock and the
+		 * channel table must not be scanned without it, so take it
+		 * here too when reusing the socket's existing connection */
+		spin_lock(&trans->client_lock);
+	}
+
+	/* we've got a connection with a free channel and we can now attach the
+	 * call to it
+	 * - we're holding the transport's client lock
+	 * - we're holding a reference on the connection
+	 */
+	for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
+		if (!conn->channels[chan])
+			goto found_channel;
+	goto no_free_channels;
+
+found_channel:
+	/* the call takes its own reference on the connection */
+	atomic_inc(&conn->usage);
+	conn->channels[chan] = call;
+	call->conn = conn;
+	call->channel = chan;
+	call->cid = conn->cid | htonl(chan);
+	call->call_id = htonl(++conn->call_counter);
+
+	_net("CONNECT client on conn %d chan %d as call %x",
+	     conn->debug_id, chan, ntohl(call->call_id));
+
+	spin_unlock(&trans->client_lock);
+
+	rxrpc_add_call_ID_to_conn(conn, call);
+	_leave(" = 0");
+	return 0;
+
+no_free_channels:
+	spin_unlock(&trans->client_lock);
+	_leave(" = -ENOSR");
+	return -ENOSR;
+}
+
+/*
+ * find a connection for a call
+ * - called in process context with IRQs enabled
+ * - sockets flagged RXRPC_SOCK_EXCLUSIVE_CONN are handed straight to
+ *   rxrpc_connect_exclusive()
+ * - otherwise a connection is taken from the bundle's avail_conns list, then
+ *   from its unused_conns list; failing that a new connection is created
+ *   unless the bundle already has 20, in which case we wait on
+ *   bundle->chanwait (or return -EAGAIN if gfp lacks __GFP_WAIT)
+ * - returns 0 on success, -ERESTARTSYS if a signal is pending, -ENOMEM if a
+ *   connection could not be allocated, or the error returned by
+ *   rxrpc_init_client_conn_security()
+ */
+int rxrpc_connect_call(struct rxrpc_sock *rx,
+		       struct rxrpc_transport *trans,
+		       struct rxrpc_conn_bundle *bundle,
+		       struct rxrpc_call *call,
+		       gfp_t gfp)
+{
+	struct rxrpc_connection *conn, *candidate;
+	int chan, ret;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("%p,%lx,", rx, call->user_call_ID);
+
+	if (test_bit(RXRPC_SOCK_EXCLUSIVE_CONN, &rx->flags))
+		return rxrpc_connect_exclusive(rx, trans, bundle->service_id,
+					       call, gfp);
+
+	spin_lock(&trans->client_lock);
+	for (;;) {
+		/* see if the bundle has a call slot available */
+		if (!list_empty(&bundle->avail_conns)) {
+			_debug("avail");
+			conn = list_entry(bundle->avail_conns.next,
+					  struct rxrpc_connection,
+					  bundle_link);
+			/* discard aborted connections from the bundle and
+			 * look again */
+			if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+				list_del_init(&conn->bundle_link);
+				bundle->num_conns--;
+				continue;
+			}
+			/* claim one call slot; a connection with none left
+			 * moves to the busy list */
+			if (--conn->avail_calls == 0)
+				list_move(&conn->bundle_link,
+					  &bundle->busy_conns);
+			ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
+			ASSERT(conn->channels[0] == NULL ||
+			       conn->channels[1] == NULL ||
+			       conn->channels[2] == NULL ||
+			       conn->channels[3] == NULL);
+			atomic_inc(&conn->usage);
+			break;
+		}
+
+		if (!list_empty(&bundle->unused_conns)) {
+			_debug("unused");
+			conn = list_entry(bundle->unused_conns.next,
+					  struct rxrpc_connection,
+					  bundle_link);
+			if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+				list_del_init(&conn->bundle_link);
+				bundle->num_conns--;
+				continue;
+			}
+			/* first call on an idle connection: one slot is taken
+			 * and the connection moves to the available list */
+			ASSERTCMP(conn->avail_calls, ==, RXRPC_MAXCALLS);
+			conn->avail_calls = RXRPC_MAXCALLS - 1;
+			ASSERT(conn->channels[0] == NULL &&
+			       conn->channels[1] == NULL &&
+			       conn->channels[2] == NULL &&
+			       conn->channels[3] == NULL);
+			atomic_inc(&conn->usage);
+			list_move(&conn->bundle_link, &bundle->avail_conns);
+			break;
+		}
+
+		/* need to allocate a new connection */
+		_debug("get new conn [%d]", bundle->num_conns);
+
+		/* the client lock is not held from here until it is retaken
+		 * below, either to retry or to publish the new connection */
+		spin_unlock(&trans->client_lock);
+
+		if (signal_pending(current))
+			goto interrupted;
+
+		/* the bundle is capped at 20 connections */
+		if (bundle->num_conns >= 20) {
+			_debug("too many conns");
+
+			if (!(gfp & __GFP_WAIT)) {
+				_leave(" = -EAGAIN");
+				return -EAGAIN;
+			}
+
+			/* sleep until the bundle drops below the cap or a
+			 * connection with a free slot shows up */
+			add_wait_queue(&bundle->chanwait, &myself);
+			for (;;) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (bundle->num_conns < 20 ||
+				    !list_empty(&bundle->unused_conns) ||
+				    !list_empty(&bundle->avail_conns))
+					break;
+				if (signal_pending(current))
+					goto interrupted_dequeue;
+				schedule();
+			}
+			remove_wait_queue(&bundle->chanwait, &myself);
+			__set_current_state(TASK_RUNNING);
+			spin_lock(&trans->client_lock);
+			continue;
+		}
+
+		/* not yet present - create a candidate for a new connection and then
+		 * redo the check */
+		candidate = rxrpc_alloc_connection(gfp);
+		if (!candidate) {
+			_leave(" = -ENOMEM");
+			return -ENOMEM;
+		}
+
+		candidate->trans = trans;
+		candidate->bundle = bundle;
+		candidate->service_id = bundle->service_id;
+		candidate->epoch = rxrpc_epoch;
+		candidate->in_clientflag = 0;
+		candidate->out_clientflag = RXRPC_CLIENT_INITIATED;
+		candidate->cid = 0;
+		candidate->state = RXRPC_CONN_CLIENT;
+		candidate->avail_calls = RXRPC_MAXCALLS;
+		candidate->security_level = rx->min_sec_level;
+		candidate->key = key_get(bundle->key);
+
+		ret = rxrpc_init_client_conn_security(candidate);
+		if (ret < 0) {
+			/* the candidate was never published, so it is freed
+			 * directly once the key ref is dropped */
+			key_put(candidate->key);
+			kfree(candidate);
+			_leave(" = %d [key]", ret);
+			return ret;
+		}
+
+		write_lock_bh(&rxrpc_connection_lock);
+		list_add_tail(&candidate->link, &rxrpc_connections);
+		write_unlock_bh(&rxrpc_connection_lock);
+
+		spin_lock(&trans->client_lock);
+
+		/* the new connection pins both the bundle and the transport */
+		list_add(&candidate->bundle_link, &bundle->unused_conns);
+		bundle->num_conns++;
+		atomic_inc(&bundle->usage);
+		atomic_inc(&trans->usage);
+
+		_net("CONNECT new %d on TRANS %d",
+		     candidate->debug_id, candidate->trans->debug_id);
+
+		rxrpc_assign_connection_id(candidate);
+		if (candidate->security)
+			candidate->security->prime_packet_security(candidate);
+
+		/* leave the candidate lurking in zombie mode attached to the
+		 * bundle until we're ready for it - the next pass round the
+		 * loop looks at unused_conns with the client lock still held */
+		rxrpc_put_connection(candidate);
+		candidate = NULL;
+	}
+
+	/* we've got a connection with a free channel and we can now attach the
+	 * call to it
+	 * - we're holding the transport's client lock
+	 * - we're holding a reference on the connection
+	 * - we're holding a reference on the bundle
+	 */
+	for (chan = 0; chan < RXRPC_MAXCALLS; chan++)
+		if (!conn->channels[chan])
+			goto found_channel;
+	/* no free channel despite a slot having been claimed above */
+	ASSERT(conn->channels[0] == NULL ||
+	       conn->channels[1] == NULL ||
+	       conn->channels[2] == NULL ||
+	       conn->channels[3] == NULL);
+	BUG();
+
+found_channel:
+	conn->channels[chan] = call;
+	call->conn = conn;
+	call->channel = chan;
+	/* the channel number is OR'd into the connection's CID */
+	call->cid = conn->cid | htonl(chan);
+	call->call_id = htonl(++conn->call_counter);
+
+	_net("CONNECT client on conn %d chan %d as call %x",
+	     conn->debug_id, chan, ntohl(call->call_id));
+
+	ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
+	spin_unlock(&trans->client_lock);
+
+	rxrpc_add_call_ID_to_conn(conn, call);
+
+	_leave(" = 0");
+	return 0;
+
+	/* both labels are reached with the client lock already dropped */
+interrupted_dequeue:
+	remove_wait_queue(&bundle->chanwait, &myself);
+	__set_current_state(TASK_RUNNING);
+interrupted:
+	_leave(" = -ERESTARTSYS");
+	return -ERESTARTSYS;
+}
+
+/*
+ * get a record of an incoming connection
+ * - looks the connection up in the transport's server_conns tree, keyed on
+ *   (epoch, connection ID) taken from the packet header
+ * - if it is absent, a candidate record is allocated with the lock dropped and
+ *   the tree is searched again under the write lock before insertion
+ * - returns the connection with its usage count raised (or as returned by
+ *   rxrpc_alloc_connection() for a new record), ERR_PTR(-ENOMEM) on
+ *   allocation failure, or ERR_PTR(-EKEYREJECTED) if the packet's security
+ *   index differs from that of an existing connection
+ */
+struct rxrpc_connection *
+rxrpc_incoming_connection(struct rxrpc_transport *trans,
+			  struct rxrpc_header *hdr,
+			  gfp_t gfp)
+{
+	struct rxrpc_connection *conn, *candidate = NULL;
+	struct rb_node *p, **pp;
+	const char *new = "old";	/* only used in the _net() trace below */
+	__be32 epoch;
+	u32 conn_id;
+
+	_enter("");
+
+	ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED);
+
+	epoch = hdr->epoch;
+	conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+
+	/* search the connection list first */
+	read_lock_bh(&trans->conn_lock);
+
+	p = trans->server_conns.rb_node;
+	while (p) {
+		conn = rb_entry(p, struct rxrpc_connection, node);
+
+		_debug("maybe %x", conn->real_conn_id);
+
+		if (epoch < conn->epoch)
+			p = p->rb_left;
+		else if (epoch > conn->epoch)
+			p = p->rb_right;
+		else if (conn_id < conn->real_conn_id)
+			p = p->rb_left;
+		else if (conn_id > conn->real_conn_id)
+			p = p->rb_right;
+		else
+			goto found_extant_connection;
+	}
+	read_unlock_bh(&trans->conn_lock);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_connection(gfp);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	candidate->trans = trans;
+	candidate->epoch = hdr->epoch;
+	candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK);
+	candidate->service_id = hdr->serviceId;
+	candidate->security_ix = hdr->securityIndex;
+	candidate->in_clientflag = RXRPC_CLIENT_INITIATED;
+	candidate->out_clientflag = 0;
+	candidate->real_conn_id = conn_id;
+	candidate->state = RXRPC_CONN_SERVER;
+	/* a non-zero service ID starts the connection off as unsecured */
+	if (candidate->service_id)
+		candidate->state = RXRPC_CONN_SERVER_UNSECURED;
+
+	write_lock_bh(&trans->conn_lock);
+
+	/* repeat the search under the write lock, this time tracking the link
+	 * to insert at */
+	pp = &trans->server_conns.rb_node;
+	p = NULL;
+	while (*pp) {
+		p = *pp;
+		conn = rb_entry(p, struct rxrpc_connection, node);
+
+		if (epoch < conn->epoch)
+			pp = &(*pp)->rb_left;
+		else if (epoch > conn->epoch)
+			pp = &(*pp)->rb_right;
+		else if (conn_id < conn->real_conn_id)
+			pp = &(*pp)->rb_left;
+		else if (conn_id > conn->real_conn_id)
+			pp = &(*pp)->rb_right;
+		else
+			goto found_extant_second;
+	}
+
+	/* we can now add the new candidate to the list */
+	conn = candidate;
+	candidate = NULL;
+	rb_link_node(&conn->node, p, pp);
+	rb_insert_color(&conn->node, &trans->server_conns);
+	/* the new connection pins the transport */
+	atomic_inc(&conn->trans->usage);
+
+	write_unlock_bh(&trans->conn_lock);
+
+	write_lock_bh(&rxrpc_connection_lock);
+	list_add_tail(&conn->link, &rxrpc_connections);
+	write_unlock_bh(&rxrpc_connection_lock);
+
+	new = "new";
+
+success:
+	_net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id);
+
+	_leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
+	return conn;
+
+	/* we found the connection in the list immediately */
+found_extant_connection:
+	if (hdr->securityIndex != conn->security_ix) {
+		read_unlock_bh(&trans->conn_lock);
+		goto security_mismatch;
+	}
+	atomic_inc(&conn->usage);
+	read_unlock_bh(&trans->conn_lock);
+	goto success;
+
+	/* we found the connection on the second time through the list */
+found_extant_second:
+	if (hdr->securityIndex != conn->security_ix) {
+		write_unlock_bh(&trans->conn_lock);
+		goto security_mismatch;
+	}
+	atomic_inc(&conn->usage);
+	write_unlock_bh(&trans->conn_lock);
+	/* the candidate was never inserted, so it is simply freed */
+	kfree(candidate);
+	goto success;
+
+security_mismatch:
+	/* candidate is NULL if we came here from the first search */
+	kfree(candidate);
+	_leave(" = -EKEYREJECTED");
+	return ERR_PTR(-EKEYREJECTED);
+}
+
+/*
+ * Look up the connection that an incoming packet belongs to.
+ * - packets flagged RXRPC_CLIENT_INITIATED are looked up in the transport's
+ *   server_conns tree, all others in its client_conns tree
+ * - the trees are keyed on (epoch, connection ID)
+ * - returns the connection with its usage count incremented, or NULL if there
+ *   is no match
+ */
+struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
+					       struct rxrpc_header *hdr)
+{
+	struct rxrpc_connection *match;
+	struct rb_node *cursor;
+	__be32 epoch;
+	u32 conn_id;
+
+	_enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags);
+
+	read_lock_bh(&trans->conn_lock);
+
+	conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+	epoch = hdr->epoch;
+
+	cursor = (hdr->flags & RXRPC_CLIENT_INITIATED) ?
+		trans->server_conns.rb_node : trans->client_conns.rb_node;
+
+	while (cursor) {
+		match = rb_entry(cursor, struct rxrpc_connection, node);
+
+		_debug("maybe %x", match->real_conn_id);
+
+		/* the epoch is the major key ... */
+		if (epoch < match->epoch) {
+			cursor = cursor->rb_left;
+			continue;
+		}
+		if (epoch > match->epoch) {
+			cursor = cursor->rb_right;
+			continue;
+		}
+
+		/* ... and the connection ID the minor key */
+		if (conn_id < match->real_conn_id) {
+			cursor = cursor->rb_left;
+			continue;
+		}
+		if (conn_id > match->real_conn_id) {
+			cursor = cursor->rb_right;
+			continue;
+		}
+
+		atomic_inc(&match->usage);
+		read_unlock_bh(&trans->conn_lock);
+		_leave(" = %p", match);
+		return match;
+	}
+
+	read_unlock_bh(&trans->conn_lock);
+	_leave(" = NULL");
+	return NULL;
+}
+
+/*
+ * Drop a reference on a virtual connection.
+ * - the time of the put is recorded in conn->put_time before the count is
+ *   decremented
+ * - when the count reaches zero, the connection reaper work item is queued
+ *   with no delay
+ */
+void rxrpc_put_connection(struct rxrpc_connection *conn)
+{
+	_enter("%p{u=%d,d=%d}",
+	       conn, atomic_read(&conn->usage), conn->debug_id);
+
+	ASSERTCMP(atomic_read(&conn->usage), >, 0);
+
+	conn->put_time = get_seconds();
+
+	/* nothing more to do while other references remain */
+	if (!atomic_dec_and_test(&conn->usage)) {
+		_leave("");
+		return;
+	}
+
+	_debug("zombie");
+	rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+
+	_leave("");
+}
+
+/*
+ * destroy a virtual connection
+ * - the usage count must already be zero and the call tree empty
+ * - drops the connection's references on its bundle (if it has one) and its
+ *   transport, purges any queued packets, clears the security state and
+ *   frees the record
+ */
+static void rxrpc_destroy_connection(struct rxrpc_connection *conn)
+{
+	_enter("%p{%d}", conn, atomic_read(&conn->usage));
+
+	ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+
+	_net("DESTROY CONN %d", conn->debug_id);
+
+	/* connections made without a bundle have conn->bundle == NULL */
+	if (conn->bundle)
+		rxrpc_put_bundle(conn->trans, conn->bundle);
+
+	ASSERT(RB_EMPTY_ROOT(&conn->calls));
+	rxrpc_purge_queue(&conn->rx_queue);
+
+	rxrpc_clear_conn_security(conn);
+	/* conn->trans is not touched after this put */
+	rxrpc_put_transport(conn->trans);
+	kfree(conn);
+	_leave("");
+}
+
+/*
+ * reap dead connections
+ * - walks the global connection list looking for connections with a zero
+ *   usage count
+ * - those whose put_time + rxrpc_connection_timeout has passed are unlinked
+ *   from their transport tree and bundle and destroyed; for the rest, the
+ *   reaper is rescheduled for the earliest outstanding expiry
+ */
+static void rxrpc_connection_reaper(struct work_struct *work)
+{
+	struct rxrpc_connection *conn, *_p;
+	unsigned long now, earliest, reap_time;
+
+	/* connections to be destroyed once all the locks are dropped */
+	LIST_HEAD(graveyard);
+
+	_enter("");
+
+	now = get_seconds();
+	earliest = ULONG_MAX;	/* sentinel: nothing pending */
+
+	write_lock_bh(&rxrpc_connection_lock);
+	list_for_each_entry_safe(conn, _p, &rxrpc_connections, link) {
+		_debug("reap CONN %d { u=%d,t=%ld }",
+		       conn->debug_id, atomic_read(&conn->usage),
+		       (long) now - (long) conn->put_time);
+
+		/* quick check without the transport's locks */
+		if (likely(atomic_read(&conn->usage) > 0))
+			continue;
+
+		spin_lock(&conn->trans->client_lock);
+		write_lock(&conn->trans->conn_lock);
+		reap_time = conn->put_time + rxrpc_connection_timeout;
+
+		/* re-check the usage count now that the transport's locks are
+		 * held */
+		if (atomic_read(&conn->usage) > 0) {
+			;
+		} else if (reap_time <= now) {
+			list_move_tail(&conn->link, &graveyard);
+			/* out_clientflag is only set on client connections, so
+			 * it selects which tree the node lives in */
+			if (conn->out_clientflag)
+				rb_erase(&conn->node,
+					 &conn->trans->client_conns);
+			else
+				rb_erase(&conn->node,
+					 &conn->trans->server_conns);
+			if (conn->bundle) {
+				list_del_init(&conn->bundle_link);
+				conn->bundle->num_conns--;
+			}
+
+		} else if (reap_time < earliest) {
+			/* not yet due - remember the soonest expiry */
+			earliest = reap_time;
+		}
+
+		write_unlock(&conn->trans->conn_lock);
+		spin_unlock(&conn->trans->client_lock);
+	}
+	write_unlock_bh(&rxrpc_connection_lock);
+
+	if (earliest != ULONG_MAX) {
+		_debug("reschedule reaper %ld", (long) earliest - now);
+		ASSERTCMP(earliest, >, now);
+		/* the times above are in seconds; the delay is in jiffies */
+		rxrpc_queue_delayed_work(&rxrpc_connection_reap,
+					 (earliest - now) * HZ);
+	}
+
+	/* then destroy all those pulled out */
+	while (!list_empty(&graveyard)) {
+		conn = list_entry(graveyard.next, struct rxrpc_connection,
+				  link);
+		list_del_init(&conn->link);
+
+		ASSERTCMP(atomic_read(&conn->usage), ==, 0);
+		rxrpc_destroy_connection(conn);
+	}
+
+	_leave("");
+}
+
+/*
+ * preemptively destroy all the connection records rather than waiting for them
+ * to time out
+ * - zeroing rxrpc_connection_timeout makes every unreferenced connection due
+ *   for reaping as of its put_time; the reaper is then requeued to run with
+ *   no delay
+ */
+void __exit rxrpc_destroy_all_connections(void)
+{
+	_enter("");
+
+	rxrpc_connection_timeout = 0;
+	/* drop any pending delayed run before queueing an immediate one */
+	cancel_delayed_work(&rxrpc_connection_reap);
+	rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
+
+	_leave("");
+}
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
new file mode 100644
index 00000000..e7ed43a5
--- /dev/null
+++ b/net/rxrpc/ar-connevent.c
@@ -0,0 +1,405 @@
+/* connection-level event handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * Propagate a connection-level abort to the calls on a connection.
+ * - every call in the connection's call tree whose state is no later than
+ *   RXRPC_CALL_COMPLETE is switched to the given state, given the abort code
+ *   and queued for processing
+ * - the event raised on the call depends on whether the abort was local
+ *   (RXRPC_CALL_CONN_ABORT) or not (RXRPC_CALL_RCVD_ABORT)
+ */
+static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
+			      u32 abort_code)
+{
+	struct rxrpc_call *victim;
+	struct rb_node *node;
+
+	_enter("{%d},%x", conn->debug_id, abort_code);
+
+	read_lock_bh(&conn->lock);
+
+	node = rb_first(&conn->calls);
+	while (node) {
+		victim = rb_entry(node, struct rxrpc_call, conn_node);
+
+		write_lock(&victim->state_lock);
+		if (victim->state <= RXRPC_CALL_COMPLETE) {
+			victim->state = state;
+			victim->abort_code = abort_code;
+			set_bit(state == RXRPC_CALL_LOCALLY_ABORTED ?
+				RXRPC_CALL_CONN_ABORT : RXRPC_CALL_RCVD_ABORT,
+				&victim->events);
+			rxrpc_queue_call(victim);
+		}
+		write_unlock(&victim->state_lock);
+
+		node = rb_next(node);
+	}
+
+	read_unlock_bh(&conn->lock);
+	_leave("");
+}
+
+/*
+ * Abort a connection locally and tell the peer about it.
+ * - does nothing and returns 0 if the connection is already in an aborted
+ *   state
+ * - otherwise marks the connection locally aborted, aborts its calls and
+ *   transmits an ABORT packet carrying the abort code
+ * - returns 0 on success or -EAGAIN if the packet could not be sent
+ */
+static int rxrpc_abort_connection(struct rxrpc_connection *conn,
+				  u32 error, u32 abort_code)
+{
+	struct rxrpc_header hdr;
+	struct msghdr msg;
+	struct kvec iov[2];
+	__be32 wire_code;
+	size_t total;
+	int ret;
+
+	_enter("%d,,%u,%u", conn->debug_id, error, abort_code);
+
+	/* generate a connection-level abort */
+	spin_lock_bh(&conn->state_lock);
+	if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+		spin_unlock_bh(&conn->state_lock);
+		_leave(" = 0 [already dead]");
+		return 0;
+	}
+	conn->state = RXRPC_CONN_LOCALLY_ABORTED;
+	conn->error = error;
+	spin_unlock_bh(&conn->state_lock);
+
+	rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code);
+
+	/* address the packet to the connection's peer */
+	msg.msg_name	= &conn->trans->peer->srx.transport.sin;
+	msg.msg_namelen	= sizeof(conn->trans->peer->srx.transport.sin);
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= 0;
+
+	/* connection-level packet: no call number or sequence number */
+	hdr.epoch	= conn->epoch;
+	hdr.cid		= conn->cid;
+	hdr.callNumber	= 0;
+	hdr.seq		= 0;
+	hdr.type	= RXRPC_PACKET_TYPE_ABORT;
+	hdr.flags	= conn->out_clientflag;
+	hdr.userStatus	= 0;
+	hdr.securityIndex = conn->security_ix;
+	hdr._rsvd	= 0;
+	hdr.serviceId	= conn->service_id;
+
+	/* the payload is just the abort code in network byte order */
+	wire_code = htonl(abort_code);
+
+	iov[0].iov_base	= &hdr;
+	iov[0].iov_len	= sizeof(hdr);
+	iov[1].iov_base	= &wire_code;
+	iov[1].iov_len	= sizeof(wire_code);
+
+	total = iov[0].iov_len + iov[1].iov_len;
+
+	hdr.serial = htonl(atomic_inc_return(&conn->serial));
+	_proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code);
+
+	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, total);
+	if (ret < 0) {
+		_debug("sendmsg failed: %d", ret);
+		return -EAGAIN;
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Note on a call that its channel has now been secured.
+ * - must be called with softirqs disabled
+ * - a NULL call (an empty channel slot) is ignored
+ * - a call that has not yet reached RXRPC_CALL_COMPLETE gets the
+ *   RXRPC_CALL_SECURED event and is queued, unless that event was already set
+ */
+static void rxrpc_call_is_secure(struct rxrpc_call *call)
+{
+	_enter("%p", call);
+
+	if (!call)
+		return;
+
+	read_lock(&call->state_lock);
+	if (call->state < RXRPC_CALL_COMPLETE) {
+		if (!test_and_set_bit(RXRPC_CALL_SECURED, &call->events))
+			rxrpc_queue_call(call);
+	}
+	read_unlock(&call->state_lock);
+}
+
+/*
+ * Process one connection-level packet.
+ * - ABORT: marks the connection remotely aborted, aborts its calls and
+ *   returns -ECONNABORTED
+ * - CHALLENGE: handed to the security module's respond_to_challenge()
+ * - RESPONSE: verified by the security module, after which connection
+ *   security is initialised and, if the connection was awaiting the response,
+ *   each call on its channels is told it is now secure
+ * - returns 0 or a negative error; -EPROTO for packets that cannot be handled
+ *   and -ECONNABORTED if the connection is already aborted
+ */
+static int rxrpc_process_event(struct rxrpc_connection *conn,
+			       struct sk_buff *skb,
+			       u32 *_abort_code)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	__be32 wire_code;
+	u32 serial;
+	int chan, ret;
+
+	if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
+		kleave(" = -ECONNABORTED [%u]", conn->state);
+		return -ECONNABORTED;
+	}
+
+	serial = ntohl(sp->hdr.serial);
+
+	_enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial);
+
+	switch (sp->hdr.type) {
+	case RXRPC_PACKET_TYPE_ABORT:
+		/* the abort code is the first word of the payload */
+		if (skb_copy_bits(skb, 0, &wire_code, sizeof(wire_code)) < 0)
+			return -EPROTO;
+		_proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(wire_code));
+
+		conn->state = RXRPC_CONN_REMOTELY_ABORTED;
+		rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
+				  ntohl(wire_code));
+		return -ECONNABORTED;
+
+	case RXRPC_PACKET_TYPE_CHALLENGE:
+		if (!conn->security)
+			return -EPROTO;
+		return conn->security->respond_to_challenge(conn, skb,
+							    _abort_code);
+
+	case RXRPC_PACKET_TYPE_RESPONSE:
+		if (!conn->security)
+			return -EPROTO;
+
+		ret = conn->security->verify_response(conn, skb, _abort_code);
+		if (ret < 0)
+			return ret;
+
+		ret = conn->security->init_connection_security(conn);
+		if (ret < 0)
+			return ret;
+
+		conn->security->prime_packet_security(conn);
+
+		read_lock_bh(&conn->lock);
+		spin_lock(&conn->state_lock);
+
+		if (conn->state == RXRPC_CONN_SERVER_CHALLENGING) {
+			conn->state = RXRPC_CONN_SERVER;
+			/* empty channel slots are passed as NULL and ignored
+			 * by the callee */
+			chan = 0;
+			while (chan < RXRPC_MAXCALLS) {
+				rxrpc_call_is_secure(conn->channels[chan]);
+				chan++;
+			}
+		}
+
+		spin_unlock(&conn->state_lock);
+		read_unlock_bh(&conn->lock);
+		return 0;
+
+	default:
+		_leave(" = -EPROTO [%u]", sp->hdr.type);
+		return -EPROTO;
+	}
+}
+
+/*
+ * Set up security on a connection and send a challenge to the peer.
+ * - server-side security is initialised first if the connection has no key
+ * - any failure aborts the connection: RX_CALL_DEAD if the security set-up
+ *   returned -ENOENT or the challenge could not be issued, RXKADNOAUTH for
+ *   any other set-up error
+ */
+static void rxrpc_secure_connection(struct rxrpc_connection *conn)
+{
+	u32 abort_code;
+	int ret;
+
+	_enter("{%d}", conn->debug_id);
+
+	ASSERT(conn->security_ix != 0);
+
+	if (!conn->key) {
+		_debug("set up security");
+		ret = rxrpc_init_server_conn_security(conn);
+		if (ret == -ENOENT) {
+			abort_code = RX_CALL_DEAD;
+			goto abort;
+		}
+		if (ret != 0) {
+			abort_code = RXKADNOAUTH;
+			goto abort;
+		}
+	}
+
+	ASSERT(conn->security != NULL);
+
+	ret = conn->security->issue_challenge(conn);
+	if (ret < 0) {
+		/* whatever the cause, report it as out of memory */
+		abort_code = RX_CALL_DEAD;
+		ret = -ENOMEM;
+		goto abort;
+	}
+
+	_leave("");
+	return;
+
+abort:
+	_debug("abort %d, %d", ret, abort_code);
+	rxrpc_abort_connection(conn, -ret, abort_code);
+	_leave(" [aborted]");
+}
+
+/*
+ * connection-level event processor
+ * - work item run for a connection: handles a pending challenge event and
+ *   then drains the connection's queue of connection-level packets
+ * - a packet that yields -EAGAIN, or whose resulting abort cannot be sent, is
+ *   put back at the head of the queue and processing stops
+ */
+void rxrpc_process_connection(struct work_struct *work)
+{
+	struct rxrpc_connection *conn =
+		container_of(work, struct rxrpc_connection, processor);
+	struct sk_buff *skb;
+	u32 abort_code = RX_PROTOCOL_ERROR;
+	int ret;
+
+	_enter("{%d}", conn->debug_id);
+
+	/* hold our own ref on the connection whilst we work on it; dropped at
+	 * the out label */
+	atomic_inc(&conn->usage);
+
+	if (test_and_clear_bit(RXRPC_CONN_CHALLENGE, &conn->events)) {
+		rxrpc_secure_connection(conn);
+		/* NOTE(review): presumably pairs with a ref taken by whoever
+		 * raised the challenge event - confirm against the setter */
+		rxrpc_put_connection(conn);
+	}
+
+	/* go through the conn-level event packets, releasing the ref on this
+	 * connection that each one has when we've finished with it */
+	while ((skb = skb_dequeue(&conn->rx_queue))) {
+		ret = rxrpc_process_event(conn, skb, &abort_code);
+		switch (ret) {
+		case -EPROTO:
+		case -EKEYEXPIRED:
+		case -EKEYREJECTED:
+			goto protocol_error;
+		case -EAGAIN:
+			goto requeue_and_leave;
+		case -ECONNABORTED:
+		default:
+			/* packet consumed: drop its ref and free it */
+			rxrpc_put_connection(conn);
+			rxrpc_free_skb(skb);
+			break;
+		}
+	}
+
+out:
+	rxrpc_put_connection(conn);
+	_leave("");
+	return;
+
+requeue_and_leave:
+	/* the packet keeps its ref on the connection whilst it is queued */
+	skb_queue_head(&conn->rx_queue, skb);
+	goto out;
+
+protocol_error:
+	/* abort_code may have been updated by rxrpc_process_event() */
+	if (rxrpc_abort_connection(conn, -ret, abort_code) < 0)
+		goto requeue_and_leave;
+	rxrpc_put_connection(conn);
+	rxrpc_free_skb(skb);
+	_leave(" [EPROTO]");
+	goto out;
+}
+
+/*
+ * put a packet up for transport-level abort
+ * - takes a ref on the local endpoint for the queued packet;
+ *   rxrpc_reject_packets() drops one ref per packet it processes
+ * - the packet is appended to the endpoint's reject queue and the rejecter
+ *   work item is queued
+ */
+void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
+{
+	CHECK_SLAB_OKAY(&local->usage);
+
+	/* the endpoint's usage count must not already have hit zero */
+	if (!atomic_inc_not_zero(&local->usage)) {
+		printk("resurrected on reject\n");
+		BUG();
+	}
+
+	skb_queue_tail(&local->reject_queue, skb);
+	rxrpc_queue_work(&local->rejecter);
+}
+
+/*
+ * Work item: send an ABORT in reply to each packet on a local endpoint's
+ * reject queue.
+ * - the reply goes back to the UDP source of the rejected packet and carries
+ *   skb->priority as the abort code
+ * - only AF_INET endpoints generate replies; packets on other endpoints are
+ *   just discarded
+ * - one endpoint ref is dropped for every packet processed
+ */
+void rxrpc_reject_packets(struct work_struct *work)
+{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in sin;
+	} dest;
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_header reply;
+	struct rxrpc_local *local;
+	struct sk_buff *skb;
+	struct msghdr msg;
+	struct kvec iov[2];
+	size_t total;
+	__be32 code;
+
+	local = container_of(work, struct rxrpc_local, rejecter);
+	rxrpc_get_local(local);
+
+	_enter("%d", local->debug_id);
+
+	/* destination address template; port and address are filled in per
+	 * packet */
+	memset(&dest, 0, sizeof(dest));
+	dest.sa.sa_family = local->srx.transport.family;
+
+	/* reply header template: everything zero bar the packet type */
+	memset(&reply, 0, sizeof(reply));
+	reply.type = RXRPC_PACKET_TYPE_ABORT;
+
+	iov[0].iov_base = &reply;
+	iov[0].iov_len = sizeof(reply);
+	iov[1].iov_base = &code;
+	iov[1].iov_len = sizeof(code);
+	total = sizeof(reply) + sizeof(code);
+
+	msg.msg_name = &dest;
+	msg.msg_namelen = (dest.sa.sa_family == AF_INET) ? sizeof(dest.sin) : 0;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+
+	while ((skb = skb_dequeue(&local->reject_queue))) {
+		sp = rxrpc_skb(skb);
+
+		if (dest.sa.sa_family == AF_INET) {
+			dest.sin.sin_port = udp_hdr(skb)->source;
+			dest.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
+			code = htonl(skb->priority);
+
+			reply.epoch = sp->hdr.epoch;
+			reply.cid = sp->hdr.cid;
+			reply.callNumber = sp->hdr.callNumber;
+			reply.serviceId = sp->hdr.serviceId;
+			/* keep only the client-initiated flag, inverted */
+			reply.flags = (sp->hdr.flags ^ RXRPC_CLIENT_INITIATED) &
+				RXRPC_CLIENT_INITIATED;
+
+			kernel_sendmsg(local->socket, &msg, iov, 2, total);
+		}
+
+		rxrpc_free_skb(skb);
+		rxrpc_put_local(local);
+	}
+
+	rxrpc_put_local(local);
+	_leave("");
+}
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
new file mode 100644
index 00000000..5d6b572a
--- /dev/null
+++ b/net/rxrpc/ar-error.c
@@ -0,0 +1,251 @@
+/* Error message handling (ICMP)
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include "ar-internal.h"
+
+/*
+ * handle an error received on the local endpoint
+ * - takes one skb off the socket's error queue and finds the peer and
+ *   transport matching the address the error was reported against; the skb is
+ *   discarded if either lookup fails
+ * - for ICMP "fragmentation needed" errors the peer's MTU is wound down,
+ *   using an estimate derived from the interface MTU if the error carried no
+ *   MTU of its own
+ * - the skb is then queued on the transport for rxrpc_UDP_error_handler(),
+ *   which inherits the transport ref obtained here
+ * - finally the socket's pending error is regenerated from whatever is still
+ *   on its error queue
+ */
+void rxrpc_UDP_error_report(struct sock *sk)
+{
+	struct sock_exterr_skb *serr;
+	struct rxrpc_transport *trans;
+	struct rxrpc_local *local = sk->sk_user_data;
+	struct rxrpc_peer *peer;
+	struct sk_buff *skb;
+	__be32 addr;
+	__be16 port;
+
+	_enter("%p{%d}", sk, local->debug_id);
+
+	skb = skb_dequeue(&sk->sk_error_queue);
+	if (!skb) {
+		_leave("UDP socket errqueue empty");
+		return;
+	}
+
+	rxrpc_new_skb(skb);
+
+	serr = SKB_EXT_ERR(skb);
+	addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset);
+	port = serr->port;
+
+	_net("Rx UDP Error from %pI4:%hu", &addr, ntohs(port));
+	_debug("Msg l:%d d:%d", skb->len, skb->data_len);
+
+	peer = rxrpc_find_peer(local, addr, port);
+	if (IS_ERR(peer)) {
+		rxrpc_free_skb(skb);
+		_leave(" [no peer]");
+		return;
+	}
+
+	trans = rxrpc_find_transport(local, peer);
+	if (!trans) {
+		rxrpc_put_peer(peer);
+		rxrpc_free_skb(skb);
+		_leave(" [no trans]");
+		return;
+	}
+
+	if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP &&
+	    serr->ee.ee_type == ICMP_DEST_UNREACH &&
+	    serr->ee.ee_code == ICMP_FRAG_NEEDED
+	    ) {
+		u32 mtu = serr->ee.ee_info;
+
+		_net("Rx Received ICMP Fragmentation Needed (%d)", mtu);
+
+		/* wind down the local interface MTU */
+		if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) {
+			peer->if_mtu = mtu;
+			_net("I/F MTU %u", mtu);
+		}
+
+		/* ip_rt_frag_needed() may have eaten the info */
+		if (mtu == 0)
+			mtu = ntohs(icmp_hdr(skb)->un.frag.mtu);
+
+		if (mtu == 0) {
+			/* they didn't give us a size, so estimate one from the
+			 * interface MTU recorded for the peer - starting from
+			 * zero would wrap the unsigned subtraction below and
+			 * leave peer->mtu untouched */
+			mtu = peer->if_mtu;
+			if (mtu > 1500) {
+				mtu >>= 1;
+				if (mtu < 1500)
+					mtu = 1500;
+			} else {
+				/* guard the subtraction against wrapping */
+				mtu = (mtu > 100) ? mtu - 100 : 0;
+				if (mtu < peer->hdrsize)
+					mtu = peer->hdrsize + 4;
+			}
+		}
+
+		/* the network MTU is only ever lowered here */
+		if (mtu < peer->mtu) {
+			spin_lock_bh(&peer->lock);
+			peer->mtu = mtu;
+			peer->maxdata = peer->mtu - peer->hdrsize;
+			spin_unlock_bh(&peer->lock);
+			_net("Net MTU %u (maxdata %u)",
+			     peer->mtu, peer->maxdata);
+		}
+	}
+
+	rxrpc_put_peer(peer);
+
+	/* pass the transport ref to error_handler to release */
+	skb_queue_tail(&trans->error_queue, skb);
+	rxrpc_queue_work(&trans->error_handler);
+
+	/* reset and regenerate socket error */
+	spin_lock_bh(&sk->sk_error_queue.lock);
+	sk->sk_err = 0;
+	skb = skb_peek(&sk->sk_error_queue);
+	if (skb) {
+		sk->sk_err = SKB_EXT_ERR(skb)->ee.ee_errno;
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+		/* the report hook is called with the queue lock dropped */
+		sk->sk_error_report(sk);
+	} else {
+		spin_unlock_bh(&sk->sk_error_queue.lock);
+	}
+
+	_leave("");
+}
+
+/*
+ * Work item: process one queued UDP error report for a transport.
+ * - a single skb is taken off trans->error_queue per run; the work item is
+ *   requeued if more remain
+ * - ICMP destination-unreachable codes are mapped onto errno values; a
+ *   resulting non-zero error is recorded on the peer and pushed to every call
+ *   on the peer's error_targets list
+ * - one transport reference is dropped for the skb processed
+ */
+void rxrpc_UDP_error_handler(struct work_struct *work)
+{
+	struct rxrpc_transport *trans =
+		container_of(work, struct rxrpc_transport, error_handler);
+	struct sock_extended_err *ee;
+	struct rxrpc_call *target, *next;
+	struct sk_buff *skb;
+	int err;
+
+	_enter("");
+
+	skb = skb_dequeue(&trans->error_queue);
+	if (!skb)
+		return;
+
+	ee = &SKB_EXT_ERR(skb)->ee;
+
+	_net("Rx Error o=%d t=%d c=%d e=%d",
+	     ee->ee_origin, ee->ee_type, ee->ee_code, ee->ee_errno);
+
+	/* default to the reported errno unless refined below */
+	err = ee->ee_errno;
+
+	if (ee->ee_origin == SO_EE_ORIGIN_ICMP) {
+		if (ee->ee_type == ICMP_DEST_UNREACH) {
+			switch (ee->ee_code) {
+			case ICMP_NET_UNREACH:
+				_net("Rx Received ICMP Network Unreachable");
+				err = ENETUNREACH;
+				break;
+			case ICMP_HOST_UNREACH:
+				_net("Rx Received ICMP Host Unreachable");
+				err = EHOSTUNREACH;
+				break;
+			case ICMP_PORT_UNREACH:
+				_net("Rx Received ICMP Port Unreachable");
+				err = ECONNREFUSED;
+				break;
+			case ICMP_FRAG_NEEDED:
+				_net("Rx Received ICMP Fragmentation Needed (%d)",
+				     ee->ee_info);
+				err = 0; /* dealt with elsewhere */
+				break;
+			case ICMP_NET_UNKNOWN:
+				_net("Rx Received ICMP Unknown Network");
+				err = ENETUNREACH;
+				break;
+			case ICMP_HOST_UNKNOWN:
+				_net("Rx Received ICMP Unknown Host");
+				err = EHOSTUNREACH;
+				break;
+			default:
+				_net("Rx Received ICMP DestUnreach code=%u",
+				     ee->ee_code);
+				break;
+			}
+		} else if (ee->ee_type == ICMP_TIME_EXCEEDED) {
+			_net("Rx Received ICMP TTL Exceeded");
+		} else {
+			_proto("Rx Received ICMP error { type=%u code=%u }",
+			       ee->ee_type, ee->ee_code);
+		}
+	} else if (ee->ee_origin == SO_EE_ORIGIN_LOCAL) {
+		_proto("Rx Received local error { error=%d }",
+		       ee->ee_errno);
+	} else {
+		/* SO_EE_ORIGIN_NONE, SO_EE_ORIGIN_ICMP6 and anything else */
+		_proto("Rx Received error report { orig=%u }",
+		       ee->ee_origin);
+	}
+
+	/* an unrecoverable error terminates all the affected calls */
+	if (err) {
+		_debug("ISSUE ERROR %d", err);
+
+		spin_lock_bh(&trans->peer->lock);
+		trans->peer->net_error = err;
+
+		list_for_each_entry_safe(target, next,
+					 &trans->peer->error_targets,
+					 error_link) {
+			write_lock(&target->state_lock);
+			if (target->state != RXRPC_CALL_COMPLETE &&
+			    target->state < RXRPC_CALL_NETWORK_ERROR) {
+				target->state = RXRPC_CALL_NETWORK_ERROR;
+				set_bit(RXRPC_CALL_RCVD_ERROR, &target->events);
+				rxrpc_queue_call(target);
+			}
+			write_unlock(&target->state_lock);
+			/* the call comes off the list whether or not its state
+			 * was changed */
+			list_del_init(&target->error_link);
+		}
+
+		spin_unlock_bh(&trans->peer->lock);
+	}
+
+	/* more reports waiting - go round again */
+	if (!skb_queue_empty(&trans->error_queue))
+		rxrpc_queue_work(&trans->error_handler);
+
+	rxrpc_free_skb(skb);
+	rxrpc_put_transport(trans);
+	_leave("");
+}
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
new file mode 100644
index 00000000..1a2b0633
--- /dev/null
+++ b/net/rxrpc/ar-input.c
@@ -0,0 +1,804 @@
+/* RxRPC packet reception
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/errqueue.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/net_namespace.h>
+#include "ar-internal.h"
+
+unsigned long rxrpc_ack_timeout = 1; /* NOTE(review): units not visible here */
+/* printable packet type names, indexed by sp->hdr.type in trace messages */
+const char *rxrpc_pkts[] = {
+	"?00", /* type 0 is rejected as a bad packet type by rxrpc_data_ready() */
+	"DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
+	"?09", "?10", "?11", "?12", "?13", "?14", "?15"
+};
+
+/*
+ * queue a packet for recvmsg to pass to userspace
+ * - the caller must hold a lock on call->lock
+ * - must not be called with interrupts disabled (sk_filter() disables BH's)
+ * - eats the packet whether successful or not
+ * - there must be just one reference to the packet, which the caller passes to
+ *   this function; returns 0 or the negative error from sk_filter()
+ */
+int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb,
+			bool force, bool terminal)
+{
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_sock *rx = call->socket;
+	struct sock *sk;
+	int skb_len, ret;
+
+	_enter(",,%d,%d", force, terminal);
+
+	ASSERT(!irqs_disabled());
+
+	sp = rxrpc_skb(skb);
+	ASSERTCMP(sp->call, ==, call);
+
+	/* if we've already posted the terminal message for a call, then we
+	 * don't post any more */
+	if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) {
+		_debug("already terminated");
+		ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE);
+		skb->destructor = NULL;
+		sp->call = NULL;
+		rxrpc_put_call(call); /* the packet no longer pins the call */
+		rxrpc_free_skb(skb);
+		return 0;
+	}
+
+	sk = &rx->sk;
+
+	if (!force) { /* forced packets skip the socket filter */
+		/* cast skb->rcvbuf to unsigned...  It's pointless, but
+		 * reduces number of warnings when compiling with -W
+		 * --ANK */
+//		ret = -ENOBUFS;
+//		if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+//		    (unsigned) sk->sk_rcvbuf)
+//			goto out;
+
+		ret = sk_filter(sk, skb);
+		if (ret < 0)
+			goto out;
+	}
+
+	spin_lock_bh(&sk->sk_receive_queue.lock); /* recheck state under lock */
+	if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) &&
+	    !test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+	    call->socket->sk.sk_state != RXRPC_CLOSE) {
+		skb->destructor = rxrpc_packet_destructor;
+		skb->dev = NULL;
+		skb->sk = sk;
+		atomic_add(skb->truesize, &sk->sk_rmem_alloc); /* charge socket */
+
+		if (terminal) {
+			_debug("<<<< TERMINAL MESSAGE >>>>");
+			set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags);
+		}
+
+		/* allow interception by a kernel service */
+		if (rx->interceptor) {
+			rx->interceptor(sk, call->user_call_ID, skb);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
+		} else {
+
+			/* Cache the SKB length before we tack it onto the
+			 * receive queue.  Once it is added it no longer
+			 * belongs to us and may be freed by other threads of
+			 * control pulling packets from the queue */
+			skb_len = skb->len;
+
+			_net("post skb %p", skb);
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
+
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_data_ready(sk, skb_len);
+		}
+		skb = NULL; /* handed over: must not be freed below */
+	} else { /* terminal msg posted, call released or socket closing: drop */
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+	ret = 0;
+
+out:
+	/* release the socket buffer */
+	if (skb) {
+		skb->destructor = NULL;
+		sp->call = NULL;
+		rxrpc_put_call(call);
+		rxrpc_free_skb(skb);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * process a DATA packet on the fast path, posting it to the appropriate queue
+ * - returns 0 if the packet was posted, deferred or discarded (skb consumed)
+ * - returns -EBADMSG if data follows the last packet (skb left to the caller)
+ * - hdr.serial and hdr.flags are cached on entry because the skb may be freed
+ *   by rxrpc_queue_rcv_skb() or by a queue reader as soon as it is handed over
+ */
+static int rxrpc_fast_process_data(struct rxrpc_call *call,
+				   struct sk_buff *skb, u32 seq)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	__be32 serial = sp->hdr.serial;
+	u8 flags = sp->hdr.flags;
+	bool terminal;
+	int ret, ackbit, ack;
+
+	_enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq);
+
+	ASSERTCMP(sp->call, ==, NULL);
+
+	spin_lock(&call->lock);
+
+	if (call->state > RXRPC_CALL_COMPLETE)
+		goto discard;
+
+	ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post);
+	ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv);
+	ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten);
+
+	if (seq < call->rx_data_post) {
+		_debug("dup #%u [-%u]", seq, call->rx_data_post);
+		ack = RXRPC_ACK_DUPLICATE;
+		goto discard_and_ack;
+	}
+
+	/* we may already have the packet in the out of sequence queue */
+	ackbit = seq - (call->rx_data_eaten + 1);
+	ASSERTCMP(ackbit, >=, 0);
+	if (__test_and_set_bit(ackbit, call->ackr_window)) {
+		_debug("dup oos #%u [%u,%u]",
+		       seq, call->rx_data_eaten, call->rx_data_post);
+		ack = RXRPC_ACK_DUPLICATE;
+		goto discard_and_ack;
+	}
+
+	if (seq >= call->ackr_win_top) {
+		_debug("exceed #%u [%u]", seq, call->ackr_win_top);
+		__clear_bit(ackbit, call->ackr_window);
+		ack = RXRPC_ACK_EXCEEDS_WINDOW;
+		goto discard_and_ack;
+	}
+
+	if (seq == call->rx_data_expect) {
+		clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags);
+		call->rx_data_expect++;
+	} else if (seq > call->rx_data_expect) {
+		_debug("oos #%u [%u]", seq, call->rx_data_expect);
+		call->rx_data_expect = seq + 1;
+		if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) {
+			ack = RXRPC_ACK_OUT_OF_SEQUENCE;
+			goto enqueue_and_ack;
+		}
+		goto enqueue_packet;
+	}
+
+	if (seq != call->rx_data_post) {
+		_debug("ahead #%u [%u]", seq, call->rx_data_post);
+		goto enqueue_packet;
+	}
+
+	if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags))
+		goto protocol_error;
+
+	/* packets needing security processing go down the slow path */
+	if (call->conn->security)
+		goto enqueue_packet;
+
+	sp->call = call;
+	rxrpc_get_call(call);
+	terminal = ((flags & RXRPC_LAST_PACKET) &&
+		    !(flags & RXRPC_CLIENT_INITIATED));
+	ret = rxrpc_queue_rcv_skb(call, skb, false, terminal);
+	skb = NULL; /* consumed by rxrpc_queue_rcv_skb() even on failure */
+	if (ret < 0) {
+		if (ret == -ENOMEM || ret == -ENOBUFS) {
+			__clear_bit(ackbit, call->ackr_window);
+			ack = RXRPC_ACK_NOSPACE;
+			goto discard_and_ack;
+		}
+		goto out; /* NOTE(review): the caller BUG()s on this */
+	}
+
+	_debug("post #%u", seq);
+	ASSERTCMP(call->rx_data_post, ==, seq);
+	call->rx_data_post++;
+
+	if (flags & RXRPC_LAST_PACKET)
+		set_bit(RXRPC_CALL_RCVD_LAST, &call->flags);
+
+	/* caught up with the first out of sequence packet: drain that queue */
+	if (call->rx_data_post == call->rx_first_oos) {
+		_debug("drain rx oos now");
+		read_lock(&call->state_lock);
+		if (call->state < RXRPC_CALL_COMPLETE &&
+		    !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events))
+			rxrpc_queue_call(call);
+		read_unlock(&call->state_lock);
+	}
+
+	spin_unlock(&call->lock);
+	atomic_inc(&call->ackr_not_idle);
+	rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false);
+	_leave(" = 0 [posted]");
+	return 0;
+
+protocol_error:
+	ret = -EBADMSG;
+out:
+	spin_unlock(&call->lock);
+	_leave(" = %d", ret);
+	return ret;
+
+discard_and_ack:
+	_debug("discard and ACK packet %p", skb);
+	__rxrpc_propose_ACK(call, ack, serial, true);
+discard:
+	spin_unlock(&call->lock);
+	rxrpc_free_skb(skb); /* skb is NULL if it was already consumed */
+	_leave(" = 0 [discarded]");
+	return 0;
+
+enqueue_and_ack:
+	__rxrpc_propose_ACK(call, ack, serial, true);
+enqueue_packet:
+	_net("defer skb %p", skb);
+	spin_unlock(&call->lock);
+	skb_queue_tail(&call->rx_queue, skb);
+	atomic_inc(&call->ackr_not_idle);
+	read_lock(&call->state_lock);
+	if (call->state < RXRPC_CALL_DEAD)
+		rxrpc_queue_call(call);
+	read_unlock(&call->state_lock);
+	_leave(" = 0 [queued]");
+	return 0;
+}
+
+/*
+ * treat the first reply packet received by a client call as an implicit ACKALL
+ * of its request: move the call on to receiving the reply and stop the resend
+ * machinery (does nothing unless the call is still awaiting the reply)
+ */
+static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
+{
+	write_lock_bh(&call->state_lock);
+	if (call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY) {
+		/* not (or no longer) awaiting the reply: nothing to do */
+		write_unlock_bh(&call->state_lock);
+		return;
+	}
+
+	call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+	call->acks_latest = serial;
+
+	_debug("implicit ACKALL %%%u", call->acks_latest);
+	set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events);
+	write_unlock_bh(&call->state_lock);
+
+	/* leave the resend state alone if the timer could not be stopped */
+	if (try_to_del_timer_sync(&call->resend_timer) < 0)
+		return;
+
+	clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+	clear_bit(RXRPC_CALL_RESEND, &call->events);
+	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+}
+
+/*
+ * dispatch an incoming packet to the nominated call according to its type
+ * - must get rid of the sk_buff, either by freeing it or by queuing it
+ */
+void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	__be32 _abort_code;
+	u32 serial, hi_serial, seq, abort_code;
+
+	_enter("%p,%p", call, skb);
+
+	ASSERT(!irqs_disabled());
+
+#if 0 // INJECT RX ERROR
+	if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
+		static int skip = 0;
+		if (++skip == 3) {
+			printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n");
+			skip = 0;
+			goto free_packet;
+		}
+	}
+#endif
+
+	/* track the latest serial number on this connection for ACK packet
+	 * information */
+	serial = ntohl(sp->hdr.serial);
+	hi_serial = atomic_read(&call->conn->hi_serial);
+	while (serial > hi_serial) /* retry until hi_serial is at least serial */
+		hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
+					   serial);
+
+	/* request ACK generation for any ACK or DATA packet that requests
+	 * it */
+	if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
+		_proto("ACK Requested on %%%u", serial);
+		rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial,
+				  !(sp->hdr.flags & RXRPC_MORE_PACKETS));
+	}
+
+	switch (sp->hdr.type) {
+	case RXRPC_PACKET_TYPE_ABORT:
+		_debug("abort");
+
+		if (skb_copy_bits(skb, 0, &_abort_code,
+				  sizeof(_abort_code)) < 0)
+			goto protocol_error;
+
+		abort_code = ntohl(_abort_code);
+		_proto("Rx ABORT %%%u { %x }", serial, abort_code);
+
+		write_lock_bh(&call->state_lock);
+		if (call->state < RXRPC_CALL_COMPLETE) {
+			call->state = RXRPC_CALL_REMOTELY_ABORTED;
+			call->abort_code = abort_code;
+			set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+			rxrpc_queue_call(call);
+		}
+		goto free_packet_unlock; /* drops state_lock, then frees skb */
+
+	case RXRPC_PACKET_TYPE_BUSY:
+		_proto("Rx BUSY %%%u", serial);
+		/* NOTE(review): set when we are the client - is this inverted? */
+		if (call->conn->out_clientflag)
+			goto protocol_error;
+
+		write_lock_bh(&call->state_lock);
+		switch (call->state) {
+		case RXRPC_CALL_CLIENT_SEND_REQUEST:
+			call->state = RXRPC_CALL_SERVER_BUSY;
+			set_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
+			rxrpc_queue_call(call); /* fall through */
+		case RXRPC_CALL_SERVER_BUSY:
+			goto free_packet_unlock;
+		default:
+			goto protocol_error_locked;
+		}
+
+	default:
+		_proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial);
+		goto protocol_error;
+
+	case RXRPC_PACKET_TYPE_DATA:
+		seq = ntohl(sp->hdr.seq);
+
+		_proto("Rx DATA %%%u { #%u }", serial, seq);
+
+		if (seq == 0)
+			goto protocol_error;
+
+		call->ackr_prev_seq = sp->hdr.seq; /* kept in network order */
+
+		/* received data implicitly ACKs all of the request packets we
+		 * sent when we're acting as a client */
+		if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
+			rxrpc_assume_implicit_ackall(call, serial);
+
+		switch (rxrpc_fast_process_data(call, skb, seq)) {
+		case 0:
+			skb = NULL;
+			goto done;
+
+		default:
+			BUG(); /* NOTE(review): reached if sk_filter() fails */
+
+			/* data packet received beyond the last packet */
+		case -EBADMSG:
+			goto protocol_error;
+		}
+
+	case RXRPC_PACKET_TYPE_ACKALL:
+	case RXRPC_PACKET_TYPE_ACK:
+		/* ACK processing is done in process context */
+		read_lock_bh(&call->state_lock);
+		if (call->state < RXRPC_CALL_DEAD) {
+			skb_queue_tail(&call->rx_queue, skb);
+			rxrpc_queue_call(call);
+			skb = NULL;
+		}
+		read_unlock_bh(&call->state_lock);
+		goto free_packet; /* frees nothing if the skb was queued */
+	}
+
+protocol_error:
+	_debug("protocol error");
+	write_lock_bh(&call->state_lock);
+protocol_error_locked:
+	if (call->state <= RXRPC_CALL_COMPLETE) {
+		call->state = RXRPC_CALL_LOCALLY_ABORTED;
+		call->abort_code = RX_PROTOCOL_ERROR;
+		set_bit(RXRPC_CALL_ABORT, &call->events);
+		rxrpc_queue_call(call);
+	}
+free_packet_unlock:
+	write_unlock_bh(&call->state_lock);
+free_packet:
+	rxrpc_free_skb(skb);
+done:
+	_leave("");
+}
+
+/*
+ * split a jumbo DATA packet into subpackets and process each one in order
+ */
+static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
+				       struct sk_buff *jumbo)
+{
+	struct rxrpc_jumbo_header jhdr;
+	struct rxrpc_skb_priv *sp;
+	struct sk_buff *part;
+
+	_enter(",{%u,%u}", jumbo->data_len, jumbo->len);
+
+	sp = rxrpc_skb(jumbo);
+
+	do {
+		sp->hdr.flags &= ~RXRPC_JUMBO_PACKET;
+
+		/* make a clone to represent the first subpacket in what's left
+		 * of the jumbo packet */
+		part = skb_clone(jumbo, GFP_ATOMIC);
+		if (!part) {
+			/* simply ditch the tail in the event of ENOMEM */
+			pskb_trim(jumbo, RXRPC_JUMBO_DATALEN);
+			break;
+		}
+		rxrpc_new_skb(part);
+
+		pskb_trim(part, RXRPC_JUMBO_DATALEN); /* keep one subpacket */
+
+		if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN))
+			goto protocol_error;
+
+		if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0)
+			goto protocol_error;
+		if (!pskb_pull(jumbo, sizeof(jhdr)))
+			BUG();
+		/* make the jumbo's header describe the next subpacket */
+		sp->hdr.seq	= htonl(ntohl(sp->hdr.seq) + 1);
+		sp->hdr.serial	= htonl(ntohl(sp->hdr.serial) + 1);
+		sp->hdr.flags	= jhdr.flags;
+		sp->hdr._rsvd	= jhdr._rsvd;
+
+		_proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1);
+
+		rxrpc_fast_process_packet(call, part); /* disposes of part */
+		part = NULL;
+
+	} while (sp->hdr.flags & RXRPC_JUMBO_PACKET);
+
+	rxrpc_fast_process_packet(call, jumbo); /* what is left is the last one */
+	_leave("");
+	return;
+
+protocol_error:
+	_debug("protocol error");
+	rxrpc_free_skb(part);
+	rxrpc_free_skb(jumbo);
+	write_lock_bh(&call->state_lock);
+	if (call->state <= RXRPC_CALL_COMPLETE) {
+		call->state = RXRPC_CALL_LOCALLY_ABORTED;
+		call->abort_code = RX_PROTOCOL_ERROR;
+		set_bit(RXRPC_CALL_ABORT, &call->events);
+		rxrpc_queue_call(call);
+	}
+	write_unlock_bh(&call->state_lock);
+	_leave("");
+}
+
+/*
+ * post an incoming packet to the appropriate call/socket to deal with
+ * - must get rid of the sk_buff, either by freeing it or by queuing it
+ */
+static void rxrpc_post_packet_to_call(struct rxrpc_connection *conn,
+				      struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_call *call;
+	struct rb_node *p;
+	__be32 call_id;
+
+	_enter("%p,%p", conn, skb);
+
+	read_lock_bh(&conn->lock);
+
+	sp = rxrpc_skb(skb);
+
+	/* look at extant calls by channel number first */
+	call = conn->channels[ntohl(sp->hdr.cid) & RXRPC_CHANNELMASK];
+	if (!call || call->call_id != sp->hdr.callNumber)
+		goto call_not_extant;
+
+	_debug("extant call [%d]", call->state);
+	ASSERTCMP(call->conn, ==, conn);
+
+	read_lock(&call->state_lock);
+	switch (call->state) {
+	case RXRPC_CALL_LOCALLY_ABORTED:
+		if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+			rxrpc_queue_call(call); /* fall through */
+	case RXRPC_CALL_REMOTELY_ABORTED:
+	case RXRPC_CALL_NETWORK_ERROR:
+	case RXRPC_CALL_DEAD:
+		goto free_unlock;
+	default:
+		break;
+	}
+
+	read_unlock(&call->state_lock);
+	rxrpc_get_call(call); /* pin the call before dropping conn->lock */
+	read_unlock_bh(&conn->lock);
+
+	if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+	    sp->hdr.flags & RXRPC_JUMBO_PACKET)
+		rxrpc_process_jumbo_packet(call, skb);
+	else
+		rxrpc_fast_process_packet(call, skb);
+
+	rxrpc_put_call(call);
+	goto done;
+
+call_not_extant:
+	/* search the completed calls in case what we're dealing with is
+	 * there */
+	_debug("call not extant");
+
+	call_id = sp->hdr.callNumber;
+	p = conn->calls.rb_node;
+	while (p) {
+		call = rb_entry(p, struct rxrpc_call, conn_node);
+
+		if (call_id < call->call_id)
+			p = p->rb_left;
+		else if (call_id > call->call_id)
+			p = p->rb_right;
+		else
+			goto found_completed_call;
+	}
+
+dead_call:
+	/* it's either a really old call that we no longer remember or it's a
+	 * new incoming call */
+	read_unlock_bh(&conn->lock);
+
+	if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
+	    sp->hdr.seq == cpu_to_be32(1)) {
+		_debug("incoming call");
+		skb_queue_tail(&conn->trans->local->accept_queue, skb);
+		rxrpc_queue_work(&conn->trans->local->acceptor);
+		goto done;
+	}
+
+	_debug("dead call");
+	skb->priority = RX_CALL_DEAD;
+	rxrpc_reject_packet(conn->trans->local, skb);
+	goto done;
+
+	/* resend last packet of a completed call
+	 * - client calls may have been aborted or ACK'd
+	 * - server calls may have been aborted
+	 */
+found_completed_call:
+	_debug("completed call");
+
+	if (atomic_read(&call->usage) == 0) /* no references left */
+		goto dead_call;
+
+	/* synchronise any state changes */
+	read_lock(&call->state_lock);
+	ASSERTIFCMP(call->state != RXRPC_CALL_CLIENT_FINAL_ACK,
+		    call->state, >=, RXRPC_CALL_COMPLETE);
+
+	if (call->state == RXRPC_CALL_LOCALLY_ABORTED ||
+	    call->state == RXRPC_CALL_REMOTELY_ABORTED ||
+	    call->state == RXRPC_CALL_DEAD) {
+		read_unlock(&call->state_lock);
+		goto dead_call;
+	}
+
+	if (call->conn->in_clientflag) {
+		read_unlock(&call->state_lock);
+		goto dead_call; /* complete server call */
+	}
+
+	_debug("final ack again");
+	rxrpc_get_call(call); /* NOTE(review): no matching put in this function */
+	set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+	rxrpc_queue_call(call);
+
+free_unlock:
+	read_unlock(&call->state_lock);
+	read_unlock_bh(&conn->lock);
+	rxrpc_free_skb(skb);
+done:
+	_leave("");
+}
+
+/*
+ * queue a connection-level packet on the connection for its processor to handle
+ * - this includes challenges, responses and some aborts
+ */
+static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn,
+				      struct sk_buff *skb)
+{
+	_enter("%p,%p", conn, skb);
+
+	atomic_inc(&conn->usage); /* NOTE(review): presumably put by the processor */
+	skb_queue_tail(&conn->rx_queue, skb);
+	rxrpc_queue_conn(conn);
+}
+
+/*
+ * receive one datagram from the local endpoint's socket and route it onwards
+ * - may be called in interrupt context
+ */
+void rxrpc_data_ready(struct sock *sk, int count)
+{
+	struct rxrpc_connection *conn;
+	struct rxrpc_transport *trans;
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_local *local;
+	struct rxrpc_peer *peer;
+	struct sk_buff *skb;
+	int ret;
+
+	_enter("%p, %d", sk, count);
+
+	ASSERT(!irqs_disabled());
+
+	read_lock_bh(&rxrpc_local_lock);
+	local = sk->sk_user_data;
+	if (local && atomic_read(&local->usage) > 0)
+		rxrpc_get_local(local); /* held until every exit path below */
+	else
+		local = NULL;
+	read_unlock_bh(&rxrpc_local_lock);
+	if (!local) {
+		_leave(" [local dead]");
+		return;
+	}
+
+	skb = skb_recv_datagram(sk, 0, 1, &ret); /* flags 0, non-blocking */
+	if (!skb) {
+		rxrpc_put_local(local);
+		if (ret == -EAGAIN)
+			return;
+		_debug("UDP socket error %d", ret);
+		return;
+	}
+
+	rxrpc_new_skb(skb);
+
+	_net("recv skb %p", skb);
+
+	/* we'll probably need to checksum it (didn't call sock_recvmsg) */
+	if (skb_checksum_complete(skb)) {
+		rxrpc_free_skb(skb);
+		rxrpc_put_local(local);
+		UDP_INC_STATS_BH(&init_net, UDP_MIB_INERRORS, 0);
+		_leave(" [CSUM failed]");
+		return;
+	}
+
+	UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0);
+
+	/* the socket buffer we have is owned by UDP, with UDP's data all over
+	 * it, but we really want our own */
+	skb_orphan(skb);
+	sp = rxrpc_skb(skb);
+	memset(sp, 0, sizeof(*sp));
+
+	_net("Rx UDP packet from %08x:%04hu",
+	     ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
+
+	/* dig out the RxRPC connection details */
+	if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr,
+			  sizeof(sp->hdr)) < 0)
+		goto bad_message;
+	if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr)))
+		BUG();
+
+	_net("Rx RxRPC %s ep=%x call=%x:%x",
+	     sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
+	     ntohl(sp->hdr.epoch),
+	     ntohl(sp->hdr.cid),
+	     ntohl(sp->hdr.callNumber));
+
+	if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
+		_proto("Rx Bad Packet Type %u", sp->hdr.type);
+		goto bad_message;
+	}
+
+	if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+	    (sp->hdr.callNumber == 0 || sp->hdr.seq == 0))
+		goto bad_message;
+
+	peer = rxrpc_find_peer(local, ip_hdr(skb)->saddr, udp_hdr(skb)->source);
+	if (IS_ERR(peer))
+		goto cant_route_call;
+
+	trans = rxrpc_find_transport(local, peer);
+	rxrpc_put_peer(peer);
+	if (!trans)
+		goto cant_route_call;
+
+	conn = rxrpc_find_connection(trans, &sp->hdr);
+	rxrpc_put_transport(trans);
+	if (!conn)
+		goto cant_route_call;
+
+	_debug("CONN %p {%d}", conn, conn->debug_id);
+
+	if (sp->hdr.callNumber == 0) /* no call number: connection-level */
+		rxrpc_post_packet_to_conn(conn, skb);
+	else
+		rxrpc_post_packet_to_call(conn, skb);
+	rxrpc_put_connection(conn);
+	rxrpc_put_local(local);
+	return;
+
+cant_route_call:
+	_debug("can't route call");
+	if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
+	    sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
+		if (sp->hdr.seq == cpu_to_be32(1)) {
+			_debug("first packet");
+			skb_queue_tail(&local->accept_queue, skb);
+			rxrpc_queue_work(&local->acceptor);
+			rxrpc_put_local(local);
+			_leave(" [incoming]");
+			return;
+		}
+		skb->priority = RX_INVALID_OPERATION;
+	} else {
+		skb->priority = RX_CALL_DEAD;
+	}
+
+	_debug("reject"); /* NOTE(review): priority seems to carry the abort code */
+	rxrpc_reject_packet(local, skb);
+	rxrpc_put_local(local);
+	_leave(" [no call]");
+	return;
+
+bad_message:
+	skb->priority = RX_PROTOCOL_ERROR;
+	rxrpc_reject_packet(local, skb);
+	rxrpc_put_local(local);
+	_leave(" [badmsg]");
+}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
new file mode 100644
index 00000000..8e22bd34
--- /dev/null
+++ b/net/rxrpc/ar-internal.h
@@ -0,0 +1,786 @@
+/* AF_RXRPC internal definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <rxrpc/packet.h>
+
+#if 0 /* disabled poison check; NOTE(review): shift count is bytes, not bits */
+#define CHECK_SLAB_OKAY(X)				     \
+	BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \
+	       (POISON_FREE << 8 | POISON_FREE))
+#else /* the version actually built expands to an empty statement */
+#define CHECK_SLAB_OKAY(X) do {} while(0)
+#endif
+
+#define FCRYPT_BSIZE 8 /* number of bytes in struct rxrpc_crypt */
+struct rxrpc_crypt {
+	union {
+		u8	x[FCRYPT_BSIZE];	/* viewed as bytes */
+		__be32	n[2];			/* viewed as two big-endian words */
+	};
+} __attribute__((aligned(8)));
+
+#define rxrpc_queue_work(WS)	queue_work(rxrpc_workqueue, (WS))
+#define rxrpc_queue_delayed_work(WS,D)	\
+	queue_delayed_work(rxrpc_workqueue, (WS), (D))
+/* schedule a call's or a connection's ->processor work item on rxrpc_workqueue */
+#define rxrpc_queue_call(CALL)	rxrpc_queue_work(&(CALL)->processor)
+#define rxrpc_queue_conn(CONN)	rxrpc_queue_work(&(CONN)->processor)
+
+/*
+ * sk_state for RxRPC sockets (stored in sk.sk_state of struct rxrpc_sock)
+ */
+enum {
+	RXRPC_UNCONNECTED = 0,
+	RXRPC_CLIENT_BOUND,		/* client local address bound */
+	RXRPC_CLIENT_CONNECTED,		/* client is connected */
+	RXRPC_SERVER_BOUND,		/* server local address bound */
+	RXRPC_SERVER_LISTENING,		/* server listening for connections */
+	RXRPC_CLOSE,			/* socket is being closed (Rx packets get dropped) */
+};
+
+/*
+ * RxRPC socket definition (rxrpc_sk() converts a struct sock pointer to this)
+ */
+struct rxrpc_sock {
+	/* WARNING: sk has to be the first member */
+	struct sock		sk;
+	rxrpc_interceptor_t	interceptor;	/* kernel service Rx interceptor function */
+	struct rxrpc_local	*local;		/* local endpoint */
+	struct rxrpc_transport	*trans;		/* transport handler */
+	struct rxrpc_conn_bundle *bundle;	/* virtual connection bundle */
+	struct rxrpc_connection	*conn;		/* exclusive virtual connection */
+	struct list_head	listen_link;	/* link in the local endpoint's listen list */
+	struct list_head	secureq;	/* calls awaiting connection security clearance */
+	struct list_head	acceptq;	/* calls awaiting acceptance */
+	struct key		*key;		/* security for this socket */
+	struct key		*securities;	/* list of server security descriptors */
+	struct rb_root		calls;		/* outstanding calls on this socket */
+	unsigned long		flags;		/* socket flags (RXRPC_SOCK_* below) */
+#define RXRPC_SOCK_EXCLUSIVE_CONN	1	/* exclusive connection for a client socket */
+	rwlock_t		call_lock;	/* lock for calls */
+	u32			min_sec_level;	/* minimum security level */
+#define RXRPC_SECURITY_MAX	RXRPC_SECURITY_ENCRYPT
+	struct sockaddr_rxrpc	srx;		/* local address */
+	sa_family_t		proto;		/* protocol created with */
+	__be16			service_id;	/* service ID of local/remote service */
+};
+
+#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
+
+/*
+ * RxRPC socket buffer private variables
+ * - max 48 bytes (struct sk_buff::cb)
+ */
+struct rxrpc_skb_priv {
+	struct rxrpc_call	*call;		/* call with which associated */
+	unsigned long		resend_at;	/* time in jiffies at which to resend */
+	union {
+		unsigned	offset;		/* offset into buffer of next read */
+		int		remain;		/* amount of space remaining for next write */
+		u32		error;		/* network error code */
+		bool		need_resend;	/* T if needs resending */
+	};
+
+	struct rxrpc_header	hdr;		/* RxRPC packet header from this packet */
+};
+/* access the private variables overlaid on an skb's control buffer (cb) */
+#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
+
+enum rxrpc_command { /* NOTE(review): presumably what a sendmsg() call requests */
+	RXRPC_CMD_SEND_DATA,		/* send data message */
+	RXRPC_CMD_SEND_ABORT,		/* request abort generation */
+	RXRPC_CMD_ACCEPT,		/* [server] accept incoming call */
+	RXRPC_CMD_REJECT_BUSY,		/* [server] reject a call as busy */
+};
+
+/*
+ * RxRPC security module interface (ops reached via rxrpc_connection::security)
+ */
+struct rxrpc_security {
+	struct module		*owner;		/* providing module */
+	struct list_head	link;		/* link in master list */
+	const char		*name;		/* name of this service */
+	u8			security_index;	/* security type provided */
+
+	/* initialise a connection's security */
+	int (*init_connection_security)(struct rxrpc_connection *);
+
+	/* prime a connection's packet security */
+	void (*prime_packet_security)(struct rxrpc_connection *);
+
+	/* impose security on a packet */
+	int (*secure_packet)(const struct rxrpc_call *,
+			     struct sk_buff *,
+			     size_t,
+			     void *);
+
+	/* verify the security on a received packet */
+	int (*verify_packet)(const struct rxrpc_call *, struct sk_buff *,
+			     u32 *);
+
+	/* issue a challenge */
+	int (*issue_challenge)(struct rxrpc_connection *);
+
+	/* respond to a challenge */
+	int (*respond_to_challenge)(struct rxrpc_connection *,
+				    struct sk_buff *,
+				    u32 *);
+
+	/* verify a response */
+	int (*verify_response)(struct rxrpc_connection *,
+			       struct sk_buff *,
+			       u32 *);
+
+	/* clear connection security */
+	void (*clear)(struct rxrpc_connection *);
+};
+
+/*
+ * RxRPC local transport endpoint definition
+ * - matched by local port, address and protocol type
+ */
+struct rxrpc_local {
+	struct socket		*socket;	/* my UDP socket */
+	struct work_struct	destroyer;	/* endpoint destroyer */
+	struct work_struct	acceptor;	/* incoming call processor */
+	struct work_struct	rejecter;	/* packet reject writer */
+	struct list_head	services;	/* services listening on this endpoint */
+	struct list_head	link;		/* link in endpoint list */
+	struct rw_semaphore	defrag_sem;	/* control re-enablement of IP DF bit */
+	struct sk_buff_head	accept_queue;	/* incoming calls awaiting acceptance */
+	struct sk_buff_head	reject_queue;	/* packets awaiting rejection */
+	spinlock_t		lock;		/* access lock */
+	rwlock_t		services_lock;	/* lock for services list */
+	atomic_t		usage;		/* reference count (0 = endpoint dead) */
+	int			debug_id;	/* debug ID for printks */
+	volatile char		error_rcvd;	/* T if received ICMP error outstanding */
+	struct sockaddr_rxrpc	srx;		/* local address */
+};
+
+/*
+ * RxRPC remote transport endpoint definition
+ * - matched by remote port, address and protocol type
+ * - holds the connection ID counter for connections between the two endpoints
+ */
+struct rxrpc_peer {
+	struct work_struct	destroyer;	/* peer destroyer */
+	struct list_head	link;		/* link in master peer list */
+	struct list_head	error_targets;	/* targets for net error distribution */
+	spinlock_t		lock;		/* access lock (net_error, error_targets) */
+	atomic_t		usage;		/* NOTE(review): presumably a ref count */
+	unsigned		if_mtu;		/* interface MTU for this peer */
+	unsigned		mtu;		/* network MTU for this peer */
+	unsigned		maxdata;	/* data size (MTU - hdrsize) */
+	unsigned short		hdrsize;	/* header size (IP + UDP + RxRPC) */
+	int			debug_id;	/* debug ID for printks */
+	int			net_error;	/* network error distributed */
+	struct sockaddr_rxrpc	srx;		/* remote address */
+
+	/* calculated RTT cache */
+#define RXRPC_RTT_CACHE_SIZE 32
+	suseconds_t		rtt;		/* current RTT estimate (in uS) */
+	unsigned		rtt_point;	/* next entry at which to insert */
+	unsigned		rtt_usage;	/* amount of cache actually used */
+	suseconds_t		rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */
+};
+
+/*
+ * RxRPC point-to-point transport / connection manager definition
+ * - handles a bundle of connections between two endpoints
+ * - matched by { local, peer }
+ */
+struct rxrpc_transport {
+	struct rxrpc_local	*local;		/* local transport endpoint */
+	struct rxrpc_peer	*peer;		/* remote transport endpoint */
+	struct work_struct	error_handler;	/* network error distributor */
+	struct rb_root		bundles;	/* client connection bundles on this transport */
+	struct rb_root		client_conns;	/* client connections on this transport */
+	struct rb_root		server_conns;	/* server connections on this transport */
+	struct list_head	link;		/* link in master session list */
+	struct sk_buff_head	error_queue;	/* error packets awaiting processing */
+	time_t			put_time;	/* time at which to reap */
+	spinlock_t		client_lock;	/* client connection allocation lock */
+	rwlock_t		conn_lock;	/* lock for active/dead connections */
+	atomic_t		usage;		/* NOTE(review): presumably a ref count */
+	int			debug_id;	/* debug ID for printks */
+	unsigned int		conn_idcounter;	/* connection ID counter (client) */
+};
+
+/*
+ * RxRPC client connection bundle
+ * - matched by { transport, service_id, key }
+ */
+struct rxrpc_conn_bundle {
+	struct rb_node		node;		/* node in transport's lookup tree */
+	struct list_head	unused_conns;	/* unused connections in this bundle */
+	struct list_head	avail_conns;	/* available connections in this bundle */
+	struct list_head	busy_conns;	/* busy connections in this bundle */
+	struct key		*key;		/* security for this bundle */
+	wait_queue_head_t	chanwait;	/* wait for channel to become available */
+	atomic_t		usage;		/* NOTE(review): presumably a ref count */
+	int			debug_id;	/* debug ID for printks */
+	unsigned short		num_conns;	/* number of connections in this bundle */
+	__be16			service_id;	/* service ID */
+	u8			security_ix;	/* security type */
+};
+
+/*
+ * RxRPC connection definition
+ * - matched by { transport, service_id, conn_id, direction, key }
+ * - each connection can only handle four simultaneous calls
+ */
+struct rxrpc_connection {
+	struct rxrpc_transport	*trans;		/* transport session */
+	struct rxrpc_conn_bundle *bundle;	/* connection bundle (client) */
+	struct work_struct	processor;	/* connection event processor */
+	struct rb_node		node;		/* node in transport's lookup tree */
+	struct list_head	link;		/* link in master connection list */
+	struct list_head	bundle_link;	/* link in bundle */
+	struct rb_root		calls;		/* calls on this connection */
+	struct sk_buff_head	rx_queue;	/* received conn-level packets */
+	struct rxrpc_call	*channels[RXRPC_MAXCALLS]; /* channels (active calls) */
+	struct rxrpc_security	*security;	/* applied security module */
+	struct key		*key;		/* security for this connection (client) */
+	struct key		*server_key;	/* security for this service */
+	struct crypto_blkcipher	*cipher;	/* encryption handle */
+	struct rxrpc_crypt	csum_iv;	/* packet checksum base */
+	unsigned long		events;		/* RXRPC_CONN_* event bits, below */
+#define RXRPC_CONN_CHALLENGE	0		/* send challenge packet */
+	time_t			put_time;	/* time at which to reap */
+	rwlock_t		lock;		/* access lock (held to look up calls) */
+	spinlock_t		state_lock;	/* state-change lock */
+	atomic_t		usage;		/* ref count (bumped per queued Rx packet) */
+	u32			real_conn_id;	/* connection ID (host-endian) */
+	enum {					/* current state of connection */
+		RXRPC_CONN_UNUSED,		/* - connection not yet attempted */
+		RXRPC_CONN_CLIENT,		/* - client connection */
+		RXRPC_CONN_SERVER_UNSECURED,	/* - server unsecured connection */
+		RXRPC_CONN_SERVER_CHALLENGING,	/* - server challenging for security */
+		RXRPC_CONN_SERVER,		/* - server secured connection */
+		RXRPC_CONN_REMOTELY_ABORTED,	/* - conn aborted by peer */
+		RXRPC_CONN_LOCALLY_ABORTED,	/* - conn aborted locally */
+		RXRPC_CONN_NETWORK_ERROR,	/* - conn terminated by network error */
+	} state;
+	int			error;		/* error code for local abort */
+	int			debug_id;	/* debug ID for printks */
+	unsigned		call_counter;	/* call ID counter */
+	atomic_t		serial;		/* packet serial number counter */
+	atomic_t		hi_serial;	/* highest serial number received */
+	u8			avail_calls;	/* number of calls available */
+	u8			size_align;	/* data size alignment (for security) */
+	u8			header_size;	/* rxrpc + security header size */
+	u8			security_size;	/* security header size */
+	u32			security_level;	/* security level negotiated */
+	u32			security_nonce;	/* response re-use preventer */
+
+	/* the following are all in net order */
+	__be32			epoch;		/* epoch of this connection */
+	__be32			cid;		/* connection ID */
+	__be16			service_id;	/* service ID */
+	u8			security_ix;	/* security type */
+	u8			in_clientflag;	/* RXRPC_CLIENT_INITIATED if we are server */
+	u8			out_clientflag;	/* RXRPC_CLIENT_INITIATED if we are client */
+};
+
+/*
+ * RxRPC call definition
+ * - matched by { connection, call_id }
+ */
+struct rxrpc_call {
+	struct rxrpc_connection	*conn;		/* connection carrying call */
+	struct rxrpc_sock	*socket;	/* socket responsible */
+	struct timer_list	lifetimer;	/* lifetime remaining on call */
+	struct timer_list	deadspan;	/* reap timer for re-ACK'ing, etc  */
+	struct timer_list	ack_timer;	/* ACK generation timer */
+	struct timer_list	resend_timer;	/* Tx resend timer */
+	struct work_struct	destroyer;	/* call destroyer */
+	struct work_struct	processor;	/* packet processor and ACK generator */
+	struct list_head	link;		/* link in master call list */
+	struct list_head	error_link;	/* link in error distribution list */
+	struct list_head	accept_link;	/* calls awaiting acceptance */
+	struct rb_node		sock_node;	/* node in socket call tree */
+	struct rb_node		conn_node;	/* node in connection call tree */
+	struct sk_buff_head	rx_queue;	/* received packets */
+	struct sk_buff_head	rx_oos_queue;	/* packets received out of sequence */
+	struct sk_buff		*tx_pending;	/* Tx socket buffer being filled */
+	wait_queue_head_t	tx_waitq;	/* wait for Tx window space to become available */
+	unsigned long		user_call_ID;	/* user-defined call ID */
+	unsigned long		creation_jif;	/* time of call creation */
+	/* call status flags; the values defined below are presumably bit
+	 * numbers within this word (as for ->events) - TODO confirm */
+	unsigned long		flags;
+#define RXRPC_CALL_RELEASED	0	/* call has been released - no more message to userspace */
+#define RXRPC_CALL_TERMINAL_MSG	1	/* call has given the socket its final message */
+#define RXRPC_CALL_RCVD_LAST	2	/* all packets received */
+#define RXRPC_CALL_RUN_RTIMER	3	/* Tx resend timer started */
+#define RXRPC_CALL_TX_SOFT_ACK	4	/* sent some soft ACKs */
+#define RXRPC_CALL_PROC_BUSY	5	/* the processor is busy */
+#define RXRPC_CALL_INIT_ACCEPT	6	/* acceptance was initiated */
+#define RXRPC_CALL_HAS_USERID	7	/* has a user ID attached */
+#define RXRPC_CALL_EXPECT_OOS	8	/* expect out of sequence packets */
+	/* pending call events; the values defined below are bit numbers
+	 * within this word (rxrpc_abort_call() raises RXRPC_CALL_ABORT with
+	 * set_bit()) */
+	unsigned long		events;
+#define RXRPC_CALL_RCVD_ACKALL	0	/* ACKALL or reply received */
+#define RXRPC_CALL_RCVD_BUSY	1	/* busy packet received */
+#define RXRPC_CALL_RCVD_ABORT	2	/* abort packet received */
+#define RXRPC_CALL_RCVD_ERROR	3	/* network error received */
+#define RXRPC_CALL_ACK_FINAL	4	/* need to generate final ACK (and release call) */
+#define RXRPC_CALL_ACK		5	/* need to generate ACK */
+#define RXRPC_CALL_REJECT_BUSY	6	/* need to generate busy message */
+#define RXRPC_CALL_ABORT	7	/* need to generate abort */
+#define RXRPC_CALL_CONN_ABORT	8	/* local connection abort generated */
+#define RXRPC_CALL_RESEND_TIMER	9	/* Tx resend timer expired */
+#define RXRPC_CALL_RESEND	10	/* Tx resend required */
+#define RXRPC_CALL_DRAIN_RX_OOS	11	/* drain the Rx out of sequence queue */
+#define RXRPC_CALL_LIFE_TIMER	12	/* call's lifetimer ran out */
+#define RXRPC_CALL_ACCEPTED	13	/* incoming call accepted by userspace app */
+#define RXRPC_CALL_SECURED	14	/* incoming call's connection is now secure */
+#define RXRPC_CALL_POST_ACCEPT	15	/* need to post an "accept?" message to the app */
+#define RXRPC_CALL_RELEASE	16	/* need to release the call's resources */
+
+	/* general call lock; what it protects is not visible from this
+	 * header - NOTE(review): document the covered fields */
+	spinlock_t		lock;
+	rwlock_t		state_lock;	/* lock for state transition */
+	/* usage count, bumped by rxrpc_get_call(), which BUG()s if the count
+	 * was zero beforehand */
+	atomic_t		usage;
+	atomic_t		sequence;	/* Tx data packet sequence counter */
+	u32			abort_code;	/* local/remote abort code */
+	enum {					/* current state of call */
+		RXRPC_CALL_CLIENT_SEND_REQUEST,	/* - client sending request phase */
+		RXRPC_CALL_CLIENT_AWAIT_REPLY,	/* - client awaiting reply */
+		RXRPC_CALL_CLIENT_RECV_REPLY,	/* - client receiving reply phase */
+		RXRPC_CALL_CLIENT_FINAL_ACK,	/* - client sending final ACK phase */
+		RXRPC_CALL_SERVER_SECURING,	/* - server securing request connection */
+		RXRPC_CALL_SERVER_ACCEPTING,	/* - server accepting request */
+		RXRPC_CALL_SERVER_RECV_REQUEST,	/* - server receiving request */
+		RXRPC_CALL_SERVER_ACK_REQUEST,	/* - server pending ACK of request */
+		RXRPC_CALL_SERVER_SEND_REPLY,	/* - server sending reply */
+		RXRPC_CALL_SERVER_AWAIT_ACK,	/* - server awaiting final ACK */
+		RXRPC_CALL_COMPLETE,		/* - call completed */
+		RXRPC_CALL_SERVER_BUSY,		/* - call rejected by busy server */
+		RXRPC_CALL_REMOTELY_ABORTED,	/* - call aborted by peer */
+		RXRPC_CALL_LOCALLY_ABORTED,	/* - call aborted locally on error or close */
+		RXRPC_CALL_NETWORK_ERROR,	/* - call terminated by network error */
+		RXRPC_CALL_DEAD,		/* - call is dead */
+	} state;
+	int			debug_id;	/* debug ID for printks */
+	u8			channel;	/* connection channel occupied by this call */
+
+	/* transmission-phase ACK management */
+	u8			acks_head;	/* offset into window of first entry */
+	u8			acks_tail;	/* offset into window of last entry */
+	u8			acks_winsz;	/* size of un-ACK'd window */
+	u8			acks_unacked;	/* lowest unacked packet in last ACK received */
+	int			acks_latest;	/* serial number of latest ACK received */
+	rxrpc_seq_t		acks_hard;	/* highest definitively ACK'd msg seq */
+	unsigned long		*acks_window;	/* sent packet window
+						 * - elements are pointers with LSB set if ACK'd
+						 */
+
+	/* receive-phase ACK management */
+	rxrpc_seq_t		rx_data_expect;	/* next data seq ID expected to be received */
+	rxrpc_seq_t		rx_data_post;	/* next data seq ID expected to be posted */
+	rxrpc_seq_t		rx_data_recv;	/* last data seq ID encountered by recvmsg */
+	rxrpc_seq_t		rx_data_eaten;	/* last data seq ID consumed by recvmsg */
+	rxrpc_seq_t		rx_first_oos;	/* first packet in rx_oos_queue (or 0) */
+	rxrpc_seq_t		ackr_win_top;	/* top of ACK window (rx_data_eaten is bottom) */
+	rxrpc_seq_net_t		ackr_prev_seq;	/* previous sequence number received */
+	u8			ackr_reason;	/* reason to ACK */
+	__be32			ackr_serial;	/* serial of packet being ACK'd */
+	atomic_t		ackr_not_idle;	/* number of packets in Rx queue */
+
+	/* received packet records, 1 bit per record; the array is sized one
+	 * long larger than RXRPC_MAXACKS bits strictly require */
+#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG)
+	unsigned long		ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1];
+
+	/* the following should all be in net order */
+	__be32			cid;		/* connection ID + channel index  */
+	__be32			call_id;	/* call ID on connection  */
+};
+
+/*
+ * Abort an RxRPC call from the local end.
+ *
+ * With the call's state lock held for writing, a call that has not yet
+ * reached RXRPC_CALL_COMPLETE has the abort code recorded, is switched to
+ * RXRPC_CALL_LOCALLY_ABORTED and has its RXRPC_CALL_ABORT event bit raised.
+ * A call already at or beyond RXRPC_CALL_COMPLETE is left untouched.
+ */
+static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
+{
+	write_lock_bh(&call->state_lock);
+
+	if (call->state >= RXRPC_CALL_COMPLETE)
+		goto unlock;
+
+	call->abort_code = abort_code;
+	call->state = RXRPC_CALL_LOCALLY_ABORTED;
+	set_bit(RXRPC_CALL_ABORT, &call->events);
+
+unlock:
+	write_unlock_bh(&call->state_lock);
+}
+
+/*
+ * af_rxrpc.c
+ */
+extern atomic_t rxrpc_n_skbs;
+extern __be32 rxrpc_epoch;
+extern atomic_t rxrpc_debug_id;
+extern struct workqueue_struct *rxrpc_workqueue;
+
+/*
+ * ar-accept.c
+ */
+extern void rxrpc_accept_incoming_calls(struct work_struct *);
+extern struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *,
+					    unsigned long);
+extern int rxrpc_reject_call(struct rxrpc_sock *);
+
+/*
+ * ar-ack.c
+ */
+extern void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+extern void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+extern void rxrpc_process_call(struct work_struct *);
+
+/*
+ * ar-call.c
+ */
+extern struct kmem_cache *rxrpc_call_jar;
+extern struct list_head rxrpc_calls;
+extern rwlock_t rxrpc_call_lock;
+
+extern struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
+						struct rxrpc_transport *,
+						struct rxrpc_conn_bundle *,
+						unsigned long, int, gfp_t);
+extern struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
+					      struct rxrpc_connection *,
+					      struct rxrpc_header *, gfp_t);
+extern struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *,
+						 unsigned long);
+extern void rxrpc_release_call(struct rxrpc_call *);
+extern void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
+extern void __rxrpc_put_call(struct rxrpc_call *);
+extern void __exit rxrpc_destroy_all_calls(void);
+
+/*
+ * ar-connection.c
+ */
+extern struct list_head rxrpc_connections;
+extern rwlock_t rxrpc_connection_lock;
+
+extern struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
+						  struct rxrpc_transport *,
+						  struct key *,
+						  __be16, gfp_t);
+extern void rxrpc_put_bundle(struct rxrpc_transport *,
+			     struct rxrpc_conn_bundle *);
+extern int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
+			      struct rxrpc_conn_bundle *, struct rxrpc_call *,
+			      gfp_t);
+extern void rxrpc_put_connection(struct rxrpc_connection *);
+extern void __exit rxrpc_destroy_all_connections(void);
+extern struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
+						      struct rxrpc_header *);
+extern struct rxrpc_connection *
+rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *,
+			  gfp_t);
+
+/*
+ * ar-connevent.c
+ */
+extern void rxrpc_process_connection(struct work_struct *);
+extern void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *);
+extern void rxrpc_reject_packets(struct work_struct *);
+
+/*
+ * ar-error.c
+ */
+extern void rxrpc_UDP_error_report(struct sock *);
+extern void rxrpc_UDP_error_handler(struct work_struct *);
+
+/*
+ * ar-input.c
+ */
+extern unsigned long rxrpc_ack_timeout;
+extern const char *rxrpc_pkts[];
+
+extern void rxrpc_data_ready(struct sock *, int);
+extern int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool,
+			       bool);
+extern void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *);
+
+/*
+ * ar-local.c
+ */
+extern rwlock_t rxrpc_local_lock;
+extern struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *);
+extern void rxrpc_put_local(struct rxrpc_local *);
+extern void __exit rxrpc_destroy_all_locals(void);
+
+/*
+ * ar-key.c
+ */
+extern struct key_type key_type_rxrpc;
+extern struct key_type key_type_rxrpc_s;
+
+extern int rxrpc_request_key(struct rxrpc_sock *, char __user *, int);
+extern int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int);
+extern int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *,
+				     time_t, u32);
+
+/*
+ * ar-output.c
+ */
+extern int rxrpc_resend_timeout;
+
+extern int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *);
+extern int rxrpc_client_sendmsg(struct kiocb *, struct rxrpc_sock *,
+				struct rxrpc_transport *, struct msghdr *,
+				size_t);
+extern int rxrpc_server_sendmsg(struct kiocb *, struct rxrpc_sock *,
+				struct msghdr *, size_t);
+
+/*
+ * ar-peer.c
+ */
+extern struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *, gfp_t);
+extern void rxrpc_put_peer(struct rxrpc_peer *);
+extern struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *,
+					  __be32, __be16);
+extern void __exit rxrpc_destroy_all_peers(void);
+
+/*
+ * ar-proc.c
+ */
+extern const char *const rxrpc_call_states[];
+extern const struct file_operations rxrpc_call_seq_fops;
+extern const struct file_operations rxrpc_connection_seq_fops;
+
+/*
+ * ar-recvmsg.c
+ */
+extern void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *);
+extern int rxrpc_recvmsg(struct kiocb *, struct socket *, struct msghdr *,
+			 size_t, int);
+
+/*
+ * ar-security.c
+ */
+extern int rxrpc_register_security(struct rxrpc_security *);
+extern void rxrpc_unregister_security(struct rxrpc_security *);
+extern int rxrpc_init_client_conn_security(struct rxrpc_connection *);
+extern int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+extern int rxrpc_secure_packet(const struct rxrpc_call *, struct sk_buff *,
+			       size_t, void *);
+extern int rxrpc_verify_packet(const struct rxrpc_call *, struct sk_buff *,
+			       u32 *);
+extern void rxrpc_clear_conn_security(struct rxrpc_connection *);
+
+/*
+ * ar-skbuff.c
+ */
+extern void rxrpc_packet_destructor(struct sk_buff *);
+
+/*
+ * ar-transport.c
+ */
+extern struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *,
+						   struct rxrpc_peer *,
+						   gfp_t);
+extern void rxrpc_put_transport(struct rxrpc_transport *);
+extern void __exit rxrpc_destroy_all_transports(void);
+extern struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *,
+						    struct rxrpc_peer *);
+
+/*
+ * debug tracing
+ */
+extern unsigned rxrpc_debug;
+
+/*
+ * dbgprintk() prints one line through printk(), prefixed with the current
+ * task's command name padded/truncated to six characters, and appends the
+ * newline itself, so FMT must not end in "\n".
+ */
+#define dbgprintk(FMT,...) \
+	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
+
+/*
+ * Unconditional trace helpers built on dbgprintk():
+ *  kenter - function entry, prints "==> func(args)"
+ *  kleave - function exit, prints "<== func()" followed by FMT
+ *  kdebug - general message, indented by four spaces
+ *  kproto - message prefixed with "### "
+ *  knet   - message prefixed with "@@@ "
+ */
+#define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
+#define kproto(FMT,...)	dbgprintk("### "FMT ,##__VA_ARGS__)
+#define knet(FMT,...)	dbgprintk("@@@ "FMT ,##__VA_ARGS__)
+
+
+/*
+ * The _enter/_leave/_debug/_proto/_net macros are the tracing entry points
+ * used by the code.  They come in three flavours:
+ *  - __KDEBUG defined: always print, via the k*() macros;
+ *  - CONFIG_AF_RXRPC_DEBUG: print only when the matching RXRPC_DEBUG_* bit
+ *    is set in rxrpc_debug;
+ *  - otherwise: expand to no_printk() with the same format and arguments.
+ */
+#if defined(__KDEBUG)
+#define _enter(FMT,...)	kenter(FMT,##__VA_ARGS__)
+#define _leave(FMT,...)	kleave(FMT,##__VA_ARGS__)
+#define _debug(FMT,...)	kdebug(FMT,##__VA_ARGS__)
+#define _proto(FMT,...)	kproto(FMT,##__VA_ARGS__)
+#define _net(FMT,...)	knet(FMT,##__VA_ARGS__)
+
+#elif defined(CONFIG_AF_RXRPC_DEBUG)
+/* bits in rxrpc_debug selecting which class of message is emitted */
+#define RXRPC_DEBUG_KENTER	0x01
+#define RXRPC_DEBUG_KLEAVE	0x02
+#define RXRPC_DEBUG_KDEBUG	0x04
+#define RXRPC_DEBUG_KPROTO	0x08
+#define RXRPC_DEBUG_KNET	0x10
+
+#define _enter(FMT,...)					\
+do {							\
+	if (unlikely(rxrpc_debug & RXRPC_DEBUG_KENTER))	\
+		kenter(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _leave(FMT,...)					\
+do {							\
+	if (unlikely(rxrpc_debug & RXRPC_DEBUG_KLEAVE))	\
+		kleave(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _debug(FMT,...)					\
+do {							\
+	if (unlikely(rxrpc_debug & RXRPC_DEBUG_KDEBUG))	\
+		kdebug(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _proto(FMT,...)					\
+do {							\
+	if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO))	\
+		kproto(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _net(FMT,...)					\
+do {							\
+	if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET))	\
+		knet(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#else
+/* tracing disabled: the format strings mirror those of the k*() macros */
+#define _enter(FMT,...)	no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	no_printk("    "FMT ,##__VA_ARGS__)
+#define _proto(FMT,...)	no_printk("### "FMT ,##__VA_ARGS__)
+#define _net(FMT,...)	no_printk("@@@ "FMT ,##__VA_ARGS__)
+#endif
+
+/*
+ * debug assertion checking
+ */
+/* assertions are currently always compiled in: the __KDEBUGALL test has been
+ * replaced by a constant 1 */
+#if 1 // defined(__KDEBUGALL)
+
+/* BUG() with a diagnostic if X is false */
+#define ASSERT(X)						\
+do {								\
+	if (unlikely(!(X))) {					\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "RxRPC: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+/*
+ * BUG() if "X OP Y" is false, first printing both operands in decimal and
+ * hex.  The operands are cast to unsigned long for printing only, and on
+ * failure X and Y are evaluated more than once, so they should be free of
+ * side effects.
+ */
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "RxRPC: Assertion failed\n");		\
+		printk(KERN_ERR "%lu " #OP " %lu is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while(0)
+
+/* as ASSERT(X), but only checked when condition C is true */
+#define ASSERTIF(C, X)						\
+do {								\
+	if (unlikely((C) && !(X))) {				\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "RxRPC: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+/* as ASSERTCMP(X, OP, Y), but only checked when condition C is true */
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "RxRPC: Assertion failed\n");		\
+		printk(KERN_ERR "%lu " #OP " %lu is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while(0)
+
+#else
+
+/* assertions compiled out: the arguments are not evaluated at all */
+#define ASSERT(X)				\
+do {						\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)			\
+do {						\
+} while(0)
+
+#define ASSERTIF(C, X)				\
+do {						\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)		\
+do {						\
+} while(0)
+
+#endif /* __KDEBUGALL */
+
+/*
+ * socket buffer accounting / leak finding
+ */
+/*
+ * Accounting hook for a newly obtained socket buffer.  The body is currently
+ * empty: the tracing and rxrpc_n_skbs counting below are commented out, so
+ * both arguments are unused.
+ */
+static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn)
+{
+	//_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
+	//atomic_inc(&rxrpc_n_skbs);
+}
+
+#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__)
+
+/*
+ * Accounting hook for a socket buffer that is going away.  The body is
+ * currently empty: the tracing and rxrpc_n_skbs counting below are commented
+ * out, so both arguments are unused and the skb is not freed here.
+ */
+static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn)
+{
+	//_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
+	//atomic_dec(&rxrpc_n_skbs);
+}
+
+#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__)
+
+/*
+ * Release a socket buffer, tolerating a NULL pointer.
+ * - @fn names the caller; it is only of use to the (currently disabled)
+ *   leak-tracing code
+ */
+static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn)
+{
+	if (!skb)
+		return;
+
+	CHECK_SLAB_OKAY(&skb->users);
+	/* skb accounting is disabled, so there is no counter to adjust here:
+	 *   _net("free skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs));
+	 *   atomic_dec(&rxrpc_n_skbs);
+	 */
+	kfree_skb(skb);
+}
+
+#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__)
+
+/*
+ * Empty a socket buffer queue, freeing every buffer that was on it.
+ */
+static inline void rxrpc_purge_queue(struct sk_buff_head *list)
+{
+	struct sk_buff *victim;
+
+	for (victim = skb_dequeue(list);
+	     victim != NULL;
+	     victim = skb_dequeue(list))
+		rxrpc_free_skb(victim);
+}
+
+/*
+ * Take another reference on a local endpoint.
+ * - logs "resurrected" (tagged with the caller name @f) if the usage count
+ *   was zero before the increment
+ */
+static inline void __rxrpc_get_local(struct rxrpc_local *local, const char *f)
+{
+	int count;
+
+	CHECK_SLAB_OKAY(&local->usage);
+
+	count = atomic_inc_return(&local->usage);
+	if (count == 1)
+		printk("resurrected (%s)\n", f);
+}
+
+#define rxrpc_get_local(LOCAL) __rxrpc_get_local((LOCAL), __func__)
+
+/*
+ * Take a reference on a call.  BUG()s if the usage count was zero before the
+ * increment.  CALL is evaluated more than once.
+ */
+#define rxrpc_get_call(CALL)				\
+do {							\
+	CHECK_SLAB_OKAY(&(CALL)->usage);		\
+	if (atomic_inc_return(&(CALL)->usage) == 1)	\
+		BUG();					\
+} while(0)
+
+/*
+ * Drop a reference on a call; a thin wrapper around __rxrpc_put_call().
+ */
+#define rxrpc_put_call(CALL)				\
+do {							\
+	__rxrpc_put_call(CALL);				\
+} while(0)
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
new file mode 100644
index 00000000..43ea7de2
--- /dev/null
+++ b/net/rxrpc/ar-key.c
@@ -0,0 +1,1220 @@
+/* RxRPC key management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * RxRPC keys should have a description describing their purpose:
+ *	"afs@CAMBRIDGE.REDHAT.COM"
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/key-type.h>
+#include <linux/crypto.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include <keys/user-type.h>
+#include "ar-internal.h"
+
+static int rxrpc_vet_description_s(const char *);
+static int rxrpc_instantiate(struct key *, const void *, size_t);
+static int rxrpc_instantiate_s(struct key *, const void *, size_t);
+static void rxrpc_destroy(struct key *);
+static void rxrpc_destroy_s(struct key *);
+static void rxrpc_describe(const struct key *, struct seq_file *);
+static long rxrpc_read(const struct key *, char __user *, size_t);
+
+/*
+ * rxrpc defined keys take an arbitrary string as the description and an
+ * arbitrary blob of data as the payload
+ * - descriptions are matched with the generic user_match()
+ * - this type provides a read op (rxrpc_read) and is exported for use by
+ *   other modules
+ */
+struct key_type key_type_rxrpc = {
+	.name		= "rxrpc",
+	.instantiate	= rxrpc_instantiate,
+	.match		= user_match,
+	.destroy	= rxrpc_destroy,
+	.describe	= rxrpc_describe,
+	.read		= rxrpc_read,
+};
+EXPORT_SYMBOL(key_type_rxrpc);
+
+/*
+ * rxrpc server defined keys take "<serviceId>:<securityIndex>" as the
+ * description and an 8-byte decryption key as the payload
+ * - the description format is enforced by rxrpc_vet_description_s()
+ * - no read op is provided for this type
+ */
+struct key_type key_type_rxrpc_s = {
+	.name		= "rxrpc_s",
+	.vet_description = rxrpc_vet_description_s,
+	.instantiate	= rxrpc_instantiate_s,
+	.match		= user_match,
+	.destroy	= rxrpc_destroy_s,
+	.describe	= rxrpc_describe,
+};
+
+/*
+ * Check that a server key description has the form
+ * "<serviceId>:<securityIndex>", both fields being decimal numbers:
+ * - the service ID must not exceed 65535
+ * - the security index must lie in the range 1-255 and end the string
+ * Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int rxrpc_vet_description_s(const char *desc)
+{
+	unsigned long service_id, sec_index;
+	char *end;
+
+	service_id = simple_strtoul(desc, &end, 10);
+	if (*end != ':')
+		return -EINVAL;
+	if (service_id > 65535)
+		return -EINVAL;
+
+	sec_index = simple_strtoul(end + 1, &end, 10);
+	if (*end != '\0')
+		return -EINVAL;
+	if (sec_index < 1 || sec_index > 255)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Parse an RxKAD token in XDR format and append it to the key's token list.
+ * - the caller guarantees we have at least 4 words
+ * - word layout: [0] vice ID, [1] kvno, [2-3] session key, [4] start,
+ *   [5] expiry, [6] primary flag, [7] ticket length, [8...] ticket
+ * - returns 0 on success, -EKEYREJECTED if the lengths are inconsistent,
+ *   -ENOMEM on allocation failure or the error from key_payload_reserve()
+ */
+static int rxrpc_instantiate_xdr_rxkad(struct key *key, const __be32 *xdr,
+				       unsigned toklen)
+{
+	struct rxrpc_key_token *token, **tail;
+	size_t kad_size;
+	u32 tktlen;
+	int ret;
+
+	_enter(",{%x,%x,%x,%x},%u",
+	       ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+	       toklen);
+
+	/* the eight fixed words must be followed by a non-empty ticket */
+	if (toklen <= 8 * 4)
+		return -EKEYREJECTED;
+	tktlen = ntohl(xdr[7]);
+	_debug("tktlen: %x", tktlen);
+	if (tktlen > AFSTOKEN_RK_TIX_MAX || 8 * 4 + tktlen != toklen)
+		return -EKEYREJECTED;
+
+	/* charge the key quota for both the token and the rxkad record */
+	kad_size = sizeof(*token->kad) + tktlen;
+	ret = key_payload_reserve(key,
+				  key->datalen + sizeof(*token) + kad_size);
+	if (ret < 0)
+		return ret;
+
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
+	if (!token)
+		return -ENOMEM;
+
+	token->kad = kzalloc(kad_size, GFP_KERNEL);
+	if (!token->kad) {
+		kfree(token);
+		return -ENOMEM;
+	}
+
+	token->security_index	= RXRPC_SECURITY_RXKAD;
+	token->kad->vice_id	= ntohl(xdr[0]);
+	token->kad->kvno	= ntohl(xdr[1]);
+	memcpy(&token->kad->session_key, &xdr[2], 8);
+	token->kad->start	= ntohl(xdr[4]);
+	token->kad->expiry	= ntohl(xdr[5]);
+	token->kad->primary_flag = ntohl(xdr[6]);
+	token->kad->ticket_len	= tktlen;
+	memcpy(&token->kad->ticket, &xdr[8], tktlen);
+
+	_debug("SCIX: %u", token->security_index);
+	_debug("TLEN: %u", token->kad->ticket_len);
+	_debug("EXPY: %x", token->kad->expiry);
+	_debug("KVNO: %u", token->kad->kvno);
+	_debug("PRIM: %u", token->kad->primary_flag);
+	_debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x",
+	       token->kad->session_key[0], token->kad->session_key[1],
+	       token->kad->session_key[2], token->kad->session_key[3],
+	       token->kad->session_key[4], token->kad->session_key[5],
+	       token->kad->session_key[6], token->kad->session_key[7]);
+	if (token->kad->ticket_len >= 8)
+		_debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x",
+		       token->kad->ticket[0], token->kad->ticket[1],
+		       token->kad->ticket[2], token->kad->ticket[3],
+		       token->kad->ticket[4], token->kad->ticket[5],
+		       token->kad->ticket[6], token->kad->ticket[7]);
+
+	/* count the number of tokens attached */
+	key->type_data.x[0]++;
+
+	/* walk to the end of the key's token list and hook the token on */
+	tail = (struct rxrpc_key_token **)&key->payload.data;
+	while (*tail)
+		tail = &(*tail)->next;
+	*tail = token;
+
+	/* the key expires no later than this token does */
+	if (token->kad->expiry < key->expiry)
+		key->expiry = token->kad->expiry;
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Free the strings hanging off a krb5 principal (each name part, the name
+ * part array and the realm).  The principal structure itself is not freed.
+ */
+static void rxrpc_free_krb5_principal(struct krb5_principal *princ)
+{
+	int ix = princ->n_name_parts;
+
+	if (princ->name_parts) {
+		while (--ix >= 0)
+			kfree(princ->name_parts[ix]);
+		kfree(princ->name_parts);
+	}
+	kfree(princ->realm);
+}
+
+/*
+ * Free the data buffer attached to a piece of krb5 tagged data.  The
+ * krb5_tagged_data structure itself is not freed.
+ */
+static void rxrpc_free_krb5_tagged(struct krb5_tagged_data *td)
+{
+	kfree(td->data);
+}
+
+/*
+ * Dispose of an RxK5 token record: both principals, the session key, the
+ * address and authdata arrays (plus each element's data), both tickets and
+ * finally the record itself.
+ */
+static void rxrpc_rxk5_free(struct rxk5_key *rxk5)
+{
+	int ix;
+
+	rxrpc_free_krb5_principal(&rxk5->client);
+	rxrpc_free_krb5_principal(&rxk5->server);
+	rxrpc_free_krb5_tagged(&rxk5->session);
+
+	if (rxk5->addresses) {
+		ix = rxk5->n_addresses;
+		while (--ix >= 0)
+			rxrpc_free_krb5_tagged(&rxk5->addresses[ix]);
+		kfree(rxk5->addresses);
+	}
+
+	if (rxk5->authdata) {
+		ix = rxk5->n_authdata;
+		while (--ix >= 0)
+			rxrpc_free_krb5_tagged(&rxk5->authdata[ix]);
+		kfree(rxk5->authdata);
+	}
+
+	kfree(rxk5->ticket);
+	kfree(rxk5->ticket2);
+	kfree(rxk5);
+}
+
+/*
+ * extract a krb5 principal
+ * - @princ: principal to fill in
+ * - @_xdr, @_toklen: cursor into the XDR data and the number of bytes left;
+ *   both are advanced past the principal on success and left untouched on
+ *   failure
+ * - returns 0 on success, -EINVAL on malformed data, -ENOMEM on allocation
+ *   failure
+ * - on failure, anything already allocated is left attached to @princ for
+ *   the caller to release (rxrpc_free_krb5_principal() copes with a partly
+ *   filled principal)
+ *
+ * XDR strings are padded to a multiple of four bytes, so it is the padded
+ * length that must fit in what remains of the token; checking only the
+ * unpadded length would let toklen wrap when the padding is subtracted.
+ */
+static int rxrpc_krb5_decode_principal(struct krb5_principal *princ,
+				       const __be32 **_xdr,
+				       unsigned *_toklen)
+{
+	const __be32 *xdr = *_xdr;
+	unsigned toklen = *_toklen, n_parts, loop, tmp, paddedlen;
+
+	/* there must be at least one name, and at least #names+1 length
+	 * words */
+	if (toklen <= 12)
+		return -EINVAL;
+
+	_enter(",{%x,%x,%x},%u",
+	       ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), toklen);
+
+	n_parts = ntohl(*xdr++);
+	toklen -= 4;
+	if (n_parts == 0 || n_parts > AFSTOKEN_K5_COMPONENTS_MAX)
+		return -EINVAL;
+	princ->n_name_parts = n_parts;
+
+	if (toklen <= (n_parts + 1) * 4)
+		return -EINVAL;
+
+	princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL);
+	if (!princ->name_parts)
+		return -ENOMEM;
+
+	for (loop = 0; loop < n_parts; loop++) {
+		if (toklen < 4)
+			return -EINVAL;
+		tmp = ntohl(*xdr++);
+		toklen -= 4;
+		if (tmp == 0 || tmp > AFSTOKEN_STRING_MAX)
+			return -EINVAL;
+		paddedlen = (tmp + 3) & ~3;
+		if (paddedlen > toklen)
+			return -EINVAL;
+		princ->name_parts[loop] = kmalloc(tmp + 1, GFP_KERNEL);
+		if (!princ->name_parts[loop])
+			return -ENOMEM;
+		memcpy(princ->name_parts[loop], xdr, tmp);
+		princ->name_parts[loop][tmp] = 0;
+		toklen -= paddedlen;
+		xdr += paddedlen >> 2;
+	}
+
+	if (toklen < 4)
+		return -EINVAL;
+	tmp = ntohl(*xdr++);
+	toklen -= 4;
+	if (tmp == 0 || tmp > AFSTOKEN_K5_REALM_MAX)
+		return -EINVAL;
+	paddedlen = (tmp + 3) & ~3;
+	if (paddedlen > toklen)
+		return -EINVAL;
+	princ->realm = kmalloc(tmp + 1, GFP_KERNEL);
+	if (!princ->realm)
+		return -ENOMEM;
+	memcpy(princ->realm, xdr, tmp);
+	princ->realm[tmp] = 0;
+	toklen -= paddedlen;
+	xdr += paddedlen >> 2;
+
+	_debug("%s/...@%s", princ->name_parts[0], princ->realm);
+
+	*_xdr = xdr;
+	*_toklen = toklen;
+	_leave(" = 0 [toklen=%u]", toklen);
+	return 0;
+}
+
+/*
+ * extract a piece of krb5 tagged data
+ * - @td: record to fill in (tag, length and a kmalloc'd copy of the data)
+ * - @max_data_size: largest data length that will be accepted
+ * - @_xdr, @_toklen: cursor into the XDR data and the number of bytes left;
+ *   both are advanced past the item on success and left untouched on failure
+ * - returns 0 on success, -EINVAL on malformed data, -ENOMEM on allocation
+ *   failure
+ *
+ * The data is padded to a multiple of four bytes in the XDR stream, and the
+ * padded length must fit within what remains of the token; otherwise the
+ * copy would read beyond the supplied buffer and toklen would wrap.
+ */
+static int rxrpc_krb5_decode_tagged_data(struct krb5_tagged_data *td,
+					 size_t max_data_size,
+					 const __be32 **_xdr,
+					 unsigned *_toklen)
+{
+	const __be32 *xdr = *_xdr;
+	unsigned toklen = *_toklen, len, paddedlen;
+
+	/* there must be at least one tag and one length word */
+	if (toklen <= 8)
+		return -EINVAL;
+
+	_enter(",%zu,{%x,%x},%u",
+	       max_data_size, ntohl(xdr[0]), ntohl(xdr[1]), toklen);
+
+	td->tag = ntohl(*xdr++);
+	len = ntohl(*xdr++);
+	toklen -= 8;
+	if (len > max_data_size)
+		return -EINVAL;
+	paddedlen = (len + 3) & ~3;
+	if (paddedlen > toklen)
+		return -EINVAL;
+	td->data_len = len;
+
+	if (len > 0) {
+		td->data = kmalloc(len, GFP_KERNEL);
+		if (!td->data)
+			return -ENOMEM;
+		memcpy(td->data, xdr, len);
+		toklen -= paddedlen;
+		xdr += paddedlen >> 2;
+	}
+
+	_debug("tag %x len %x", td->tag, td->data_len);
+
+	*_xdr = xdr;
+	*_toklen = toklen;
+	_leave(" = 0 [toklen=%u]", toklen);
+	return 0;
+}
+
+/*
+ * extract an array of tagged data
+ * - @_td: set to the newly allocated array if there is at least one element
+ * - @_n_elem: set to the element count read from the stream
+ * - @max_n_elem, @max_elem_size: limits on the count and on each element's
+ *   data length
+ * - @_xdr, @_toklen: cursor into the XDR data and the number of bytes left;
+ *   both are advanced past the array on success and left untouched on
+ *   failure
+ * - returns 0 on success, -EINVAL on malformed data, -ENOMEM on allocation
+ *   failure
+ * - if an element fails to decode, the array stays attached to *@_td (with
+ *   zeroed entries for the undecoded elements) for the caller to release
+ */
+static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td,
+					  u8 *_n_elem,
+					  u8 max_n_elem,
+					  size_t max_elem_size,
+					  const __be32 **_xdr,
+					  unsigned *_toklen)
+{
+	struct krb5_tagged_data *td;
+	const __be32 *xdr = *_xdr;
+	unsigned toklen = *_toklen, n_elem, loop;
+	int ret;
+
+	/* there must be at least one count */
+	if (toklen < 4)
+		return -EINVAL;
+
+	_enter(",,%u,%zu,{%x},%u",
+	       max_n_elem, max_elem_size, ntohl(xdr[0]), toklen);
+
+	n_elem = ntohl(*xdr++);
+	toklen -= 4;
+	/* n_elem is unsigned, so only the upper bound needs checking */
+	if (n_elem > max_n_elem)
+		return -EINVAL;
+	*_n_elem = n_elem;
+	if (n_elem > 0) {
+		if (toklen <= (n_elem + 1) * 4)
+			return -EINVAL;
+
+		_debug("n_elem %d", n_elem);
+
+		td = kcalloc(n_elem, sizeof(struct krb5_tagged_data),
+			     GFP_KERNEL);
+		if (!td)
+			return -ENOMEM;
+		*_td = td;
+
+		for (loop = 0; loop < n_elem; loop++) {
+			ret = rxrpc_krb5_decode_tagged_data(&td[loop],
+							    max_elem_size,
+							    &xdr, &toklen);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	*_xdr = xdr;
+	*_toklen = toklen;
+	_leave(" = 0 [toklen=%u]", toklen);
+	return 0;
+}
+
+/*
+ * extract a krb5 ticket
+ * - @_ticket: set to a kmalloc'd copy of the ticket if it is non-empty
+ * - @_tktlen: set to the ticket length read from the stream
+ * - @_xdr, @_toklen: cursor into the XDR data and the number of bytes left;
+ *   both are advanced past the ticket on success and left untouched on
+ *   failure
+ * - returns 0 on success, -EINVAL on malformed data, -ENOMEM on allocation
+ *   failure
+ *
+ * The ticket is padded to a multiple of four bytes in the XDR stream, and
+ * the padded length must fit within what remains of the token; otherwise the
+ * copy would read beyond the supplied buffer and toklen would wrap.
+ */
+static int rxrpc_krb5_decode_ticket(u8 **_ticket, u16 *_tktlen,
+				    const __be32 **_xdr, unsigned *_toklen)
+{
+	const __be32 *xdr = *_xdr;
+	unsigned toklen = *_toklen, len, paddedlen;
+
+	/* there must be at least one length word */
+	if (toklen <= 4)
+		return -EINVAL;
+
+	_enter(",{%x},%u", ntohl(xdr[0]), toklen);
+
+	len = ntohl(*xdr++);
+	toklen -= 4;
+	if (len > AFSTOKEN_K5_TIX_MAX)
+		return -EINVAL;
+	paddedlen = (len + 3) & ~3;
+	if (paddedlen > toklen)
+		return -EINVAL;
+	*_tktlen = len;
+
+	_debug("ticket len %u", len);
+
+	if (len > 0) {
+		*_ticket = kmalloc(len, GFP_KERNEL);
+		if (!*_ticket)
+			return -ENOMEM;
+		memcpy(*_ticket, xdr, len);
+		toklen -= paddedlen;
+		xdr += paddedlen >> 2;
+	}
+
+	*_xdr = xdr;
+	*_toklen = toklen;
+	_leave(" = 0 [toklen=%u]", toklen);
+	return 0;
+}
+
+/*
+ * parse an RxK5 type XDR format token and append it to the key's token list
+ * - the caller guarantees we have at least 4 words
+ * - the token is decoded in stream order: client principal, server
+ *   principal, session key, four 64-bit times, is_skey, flags, permitted
+ *   addresses, two tickets and the typed auth data; nothing may be left over
+ * - returns 0 on success, -EINVAL on malformed data, -ENOMEM on allocation
+ *   failure or the error from key_payload_reserve()
+ * - on failure, everything decoded so far is freed again
+ */
+static int rxrpc_instantiate_xdr_rxk5(struct key *key, const __be32 *xdr,
+				      unsigned toklen)
+{
+	struct rxrpc_key_token *token, **pptoken;
+	struct rxk5_key *rxk5;
+	const __be32 *end_xdr = xdr + (toklen >> 2);
+	int ret;
+
+	_enter(",{%x,%x,%x,%x},%u",
+	       ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+	       toklen);
+
+	/* reserve some payload space for this subkey - the length of the token
+	 * is a reasonable approximation */
+	ret = key_payload_reserve(key, key->datalen + toklen);
+	if (ret < 0)
+		return ret;
+
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
+	if (!token)
+		return -ENOMEM;
+
+	rxk5 = kzalloc(sizeof(*rxk5), GFP_KERNEL);
+	if (!rxk5) {
+		kfree(token);
+		return -ENOMEM;
+	}
+
+	token->security_index = RXRPC_SECURITY_RXK5;
+	token->k5 = rxk5;
+
+	/* extract the principals */
+	ret = rxrpc_krb5_decode_principal(&rxk5->client, &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+	ret = rxrpc_krb5_decode_principal(&rxk5->server, &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+
+	/* extract the session key and the encoding type (the tag field ->
+	 * ENCTYPE_xxx) */
+	ret = rxrpc_krb5_decode_tagged_data(&rxk5->session, AFSTOKEN_DATA_MAX,
+					    &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+
+	/* four 64-bit times followed by two 32-bit words */
+	if (toklen < 4 * 8 + 2 * 4)
+		goto inval;
+	rxk5->authtime	= be64_to_cpup((const __be64 *) xdr);
+	xdr += 2;
+	rxk5->starttime	= be64_to_cpup((const __be64 *) xdr);
+	xdr += 2;
+	rxk5->endtime	= be64_to_cpup((const __be64 *) xdr);
+	xdr += 2;
+	rxk5->renew_till = be64_to_cpup((const __be64 *) xdr);
+	xdr += 2;
+	rxk5->is_skey = ntohl(*xdr++);
+	rxk5->flags = ntohl(*xdr++);
+	toklen -= 4 * 8 + 2 * 4;
+
+	_debug("times: a=%llx s=%llx e=%llx rt=%llx",
+	       rxk5->authtime, rxk5->starttime, rxk5->endtime,
+	       rxk5->renew_till);
+	_debug("is_skey=%x flags=%x", rxk5->is_skey, rxk5->flags);
+
+	/* extract the permitted client addresses */
+	ret = rxrpc_krb5_decode_tagged_array(&rxk5->addresses,
+					     &rxk5->n_addresses,
+					     AFSTOKEN_K5_ADDRESSES_MAX,
+					     AFSTOKEN_DATA_MAX,
+					     &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+
+	ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+	/* extract the tickets */
+	ret = rxrpc_krb5_decode_ticket(&rxk5->ticket, &rxk5->ticket_len,
+				       &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+	ret = rxrpc_krb5_decode_ticket(&rxk5->ticket2, &rxk5->ticket2_len,
+				       &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+
+	ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+	/* extract the typed auth data */
+	ret = rxrpc_krb5_decode_tagged_array(&rxk5->authdata,
+					     &rxk5->n_authdata,
+					     AFSTOKEN_K5_AUTHDATA_MAX,
+					     AFSTOKEN_BDATALN_MAX,
+					     &xdr, &toklen);
+	if (ret < 0)
+		goto error;
+
+	ASSERTCMP((end_xdr - xdr) << 2, ==, toklen);
+
+	if (toklen != 0)
+		goto inval;
+
+	/* attach the payload to the key */
+	for (pptoken = (struct rxrpc_key_token **)&key->payload.data;
+	     *pptoken;
+	     pptoken = &(*pptoken)->next)
+		continue;
+	*pptoken = token;
+
+	/* This is an RxK5 token, so its lifetime is held in the rxk5 record;
+	 * only token->k5 was set up above and token->kad must not be
+	 * dereferenced here.
+	 * NOTE(review): endtime is taken to be in the same units (seconds) as
+	 * key->expiry - confirm against the token producer. */
+	if (rxk5->endtime < key->expiry)
+		key->expiry = rxk5->endtime;
+
+	_leave(" = 0");
+	return 0;
+
+inval:
+	ret = -EINVAL;
+error:
+	rxrpc_rxk5_free(rxk5);
+	kfree(token);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * attempt to parse the data as XDR: flags, cell name, token count, tokens
+ * - the caller guarantees we have more than 7 words; -EPROTO means "not XDR"
+ */
+static int rxrpc_instantiate_xdr(struct key *key, const void *data, size_t datalen)
+{
+	const __be32 *xdr = data, *token;
+	const char *cp;
+	unsigned len, tmp, loop, ntoken, toklen, sec_ix;
+	int ret;
+
+	_enter(",{%x,%x,%x,%x},%zu",
+	       ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]),
+	       datalen);
+
+	if (datalen > AFSTOKEN_LENGTH_MAX)
+		goto not_xdr;
+
+	/* XDR is an array of __be32's, so the length must be a multiple of 4 */
+	if (datalen & 3)
+		goto not_xdr;
+
+	/* the flags should be 0 (the setpag bit must be handled by
+	 * userspace) */
+	if (ntohl(*xdr++) != 0)
+		goto not_xdr;
+	datalen -= 4;
+
+	/* check the cell name: printable characters, then zero padding */
+	len = ntohl(*xdr++);
+	if (len < 1 || len > AFSTOKEN_CELL_MAX)
+		goto not_xdr;
+	datalen -= 4;
+	tmp = (len + 3) & ~3;	/* name length rounded up to a whole word */
+	if (tmp > datalen)
+		goto not_xdr;
+
+	cp = (const char *) xdr;
+	for (loop = 0; loop < len; loop++)
+		if (!isprint(cp[loop]))
+			goto not_xdr;
+	if (len < tmp)
+		for (; loop < tmp; loop++)
+			if (cp[loop])
+				goto not_xdr;
+	_debug("cellname: [%u/%u] '%*.*s'",
+	       len, tmp, len, len, (const char *) xdr);
+	datalen -= tmp;
+	xdr += tmp >> 2;
+
+	/* get the token count (at least a count word and one wrapper needed) */
+	if (datalen < 12)
+		goto not_xdr;
+	ntoken = ntohl(*xdr++);
+	datalen -= 4;
+	_debug("ntoken: %x", ntoken);
+	if (ntoken < 1 || ntoken > AFSTOKEN_MAX)
+		goto not_xdr;
+
+	/* check each token wrapper (a length word followed by the token) */
+	token = xdr;	/* remember where the first token starts */
+	loop = ntoken;
+	do {
+		if (datalen < 8)
+			goto not_xdr;
+		toklen = ntohl(*xdr++);
+		sec_ix = ntohl(*xdr);	/* not consumed: it is part of toklen */
+		datalen -= 4;
+		_debug("token: [%x/%zx] %x", toklen, datalen, sec_ix);
+		if (toklen < 20 || toklen > datalen)
+			goto not_xdr;
+		datalen -= (toklen + 3) & ~3;
+		xdr += (toklen + 3) >> 2;
+
+	} while (--loop > 0);
+
+	_debug("remainder: %zu", datalen);
+	if (datalen != 0)
+		goto not_xdr;
+
+	/* okay: we're going to assume it's valid XDR format
+	 * - we ignore the cellname, relying on the key to be correctly named
+	 */
+	do {
+		xdr = token;
+		toklen = ntohl(*xdr++);
+		token = xdr + ((toklen + 3) >> 2);	/* start of the next token */
+		sec_ix = ntohl(*xdr++);
+		toklen -= 4;	/* the security index has been consumed */
+
+		_debug("TOKEN type=%u [%p-%p]", sec_ix, xdr, token);
+
+		switch (sec_ix) {
+		case RXRPC_SECURITY_RXKAD:
+			ret = rxrpc_instantiate_xdr_rxkad(key, xdr, toklen);
+			if (ret != 0)
+				goto error;
+			break;
+
+		case RXRPC_SECURITY_RXK5:
+			ret = rxrpc_instantiate_xdr_rxk5(key, xdr, toklen);
+			if (ret != 0)
+				goto error;
+			break;
+
+		default:
+			ret = -EPROTONOSUPPORT;
+			goto error;
+		}
+
+	} while (--ntoken > 0);
+
+	_leave(" = 0");
+	return 0;
+
+not_xdr:
+	_leave(" = -EPROTO");
+	return -EPROTO;
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * instantiate an rxrpc defined key from an XDR or a version 1 payload
+ * version 1 data should be of the form:
+ *	OFFSET	LEN	CONTENT
+ *	0	4	key interface version number
+ *	4	2	security index (type)
+ *	6	2	ticket length
+ *	8	4	key expiry time (time_t)
+ *	12	4	kvno
+ *	16	8	session key
+ *	24	[len]	ticket
+ *
+ * if no data is provided, then a no-security key is made
+ */
+static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	const struct rxrpc_key_data_v1 *v1;
+	struct rxrpc_key_token *token, **pp;
+	size_t plen;
+	u32 kver;
+	int ret;
+
+	_enter("{%x},,%zu", key_serial(key), datalen);
+
+	/* handle a no-security key */
+	if (!data && datalen == 0)
+		return 0;
+
+	/* determine if the XDR payload format is being used */
+	if (datalen > 7 * 4) {
+		ret = rxrpc_instantiate_xdr(key, data, datalen);
+		if (ret != -EPROTO)	/* -EPROTO: not XDR, so try version 1 */
+			return ret;
+	}
+
+	/* get the key interface version number */
+	ret = -EINVAL;
+	if (datalen <= 4 || !data)
+		goto error;
+	memcpy(&kver, data, sizeof(kver));
+	data += sizeof(kver);
+	datalen -= sizeof(kver);
+
+	_debug("KEY I/F VERSION: %u", kver);
+
+	ret = -EKEYREJECTED;
+	if (kver != 1)
+		goto error;
+
+	/* deal with a version 1 key */
+	ret = -EINVAL;
+	if (datalen < sizeof(*v1))
+		goto error;
+
+	v1 = data;
+	if (datalen != sizeof(*v1) + v1->ticket_length)
+		goto error;
+
+	_debug("SCIX: %u", v1->security_index);
+	_debug("TLEN: %u", v1->ticket_length);
+	_debug("EXPY: %x", v1->expiry);
+	_debug("KVNO: %u", v1->kvno);
+	_debug("SKEY: %02x%02x%02x%02x%02x%02x%02x%02x",
+	       v1->session_key[0], v1->session_key[1],
+	       v1->session_key[2], v1->session_key[3],
+	       v1->session_key[4], v1->session_key[5],
+	       v1->session_key[6], v1->session_key[7]);
+	if (v1->ticket_length >= 8)
+		_debug("TCKT: %02x%02x%02x%02x%02x%02x%02x%02x",
+		       v1->ticket[0], v1->ticket[1],
+		       v1->ticket[2], v1->ticket[3],
+		       v1->ticket[4], v1->ticket[5],
+		       v1->ticket[6], v1->ticket[7]);
+
+	ret = -EPROTONOSUPPORT;
+	if (v1->security_index != RXRPC_SECURITY_RXKAD)
+		goto error;
+
+	plen = sizeof(*token->kad) + v1->ticket_length;
+	ret = key_payload_reserve(key, plen + sizeof(*token));
+	if (ret < 0)
+		goto error;
+
+	ret = -ENOMEM;
+	token = kzalloc(sizeof(*token), GFP_KERNEL);
+	if (!token)
+		goto error;
+	token->kad = kzalloc(plen, GFP_KERNEL);
+	if (!token->kad)
+		goto error_free;
+
+	token->security_index		= RXRPC_SECURITY_RXKAD;
+	token->kad->ticket_len		= v1->ticket_length;
+	token->kad->expiry		= v1->expiry;
+	token->kad->kvno		= v1->kvno;
+	memcpy(&token->kad->session_key, &v1->session_key, 8);
+	memcpy(&token->kad->ticket, v1->ticket, v1->ticket_length);
+
+	/* attach the data to the end of the key's token list */
+	key->type_data.x[0]++;
+
+	pp = (struct rxrpc_key_token **)&key->payload.data;
+	while (*pp)
+		pp = &(*pp)->next;
+	*pp = token;
+	if (token->kad->expiry < key->expiry)
+		key->expiry = token->kad->expiry;
+	token = NULL;	/* the key owns it now; the kfree() below sees NULL */
+	ret = 0;
+
+error_free:
+	kfree(token);
+error:
+	return ret;
+}
+
+/*
+ * instantiate a server secret key
+ * - data should point to the 8-byte secret key; fails if the cipher rejects it
+ */
+static int rxrpc_instantiate_s(struct key *key, const void *data,
+			       size_t datalen)
+{
+	struct crypto_blkcipher *ci;
+	int ret;
+
+	_enter("{%x},,%zu", key_serial(key), datalen);
+	if (datalen != 8)
+		return -EINVAL;
+
+	memcpy(&key->type_data, data, 8);
+	ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ci)) {
+		_leave(" = %ld", PTR_ERR(ci));
+		return PTR_ERR(ci);
+	}
+	/* the key data is user-supplied: fail cleanly rather than BUG() */
+	ret = crypto_blkcipher_setkey(ci, data, 8);
+	if (ret < 0)
+		crypto_free_blkcipher(ci);
+	else
+		key->payload.data = ci;
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * free every security token chained off a dead rxrpc key's payload
+ */
+static void rxrpc_destroy(struct key *key)
+{
+	struct rxrpc_key_token *tok;
+
+	for (;;) {
+		tok = key->payload.data;
+		if (!tok)
+			break;
+		/* unhook the token before releasing its type-specific data */
+		key->payload.data = tok->next;
+		if (tok->security_index == RXRPC_SECURITY_RXKAD) {
+			kfree(tok->kad);
+		} else if (tok->security_index == RXRPC_SECURITY_RXK5) {
+			if (tok->k5)
+				rxrpc_rxk5_free(tok->k5);
+		} else {
+			printk(KERN_ERR "Unknown token type %x on rxrpc key\n",
+			       tok->security_index);
+			BUG();
+		}
+		kfree(tok);
+	}
+}
+
+/* release the cipher attached to a dead rxrpc server key, if there is one */
+static void rxrpc_destroy_s(struct key *key)
+{
+	struct crypto_blkcipher *ci = key->payload.data;
+
+	if (!ci)
+		return;
+	crypto_free_blkcipher(ci);
+	key->payload.data = NULL;
+}
+
+/*
+ * describe the rxrpc key by writing its description string to the seq file
+ */
+static void rxrpc_describe(const struct key *key, struct seq_file *m)
+{
+	seq_puts(m, key->description);
+}
+
+/*
+ * request the rxrpc key named by a user-supplied string and set it on a socket
+ */
+int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
+{
+	struct key *key;
+	char *name;
+
+	_enter("");
+
+	if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+		return -EINVAL;
+
+	/* take a NUL-terminated kernel copy of the user's description */
+	name = kmalloc(optlen + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+	if (copy_from_user(name, optval, optlen) != 0) {
+		kfree(name);
+		return -EFAULT;
+	}
+	name[optlen] = 0;
+
+	/* the copy is only needed for the duration of the lookup */
+	key = request_key(&key_type_rxrpc, name, NULL);
+	kfree(name);
+	if (IS_ERR(key)) {
+		_leave(" = %ld", PTR_ERR(key));
+		return PTR_ERR(key);
+	}
+
+	rx->key = key;
+	_leave(" = 0 [key %x]", key->serial);
+	return 0;
+}
+
+/*
+ * request the keyring named by a user-supplied string for a server socket
+ */
+int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
+			 int optlen)
+{
+	struct key *keyring;
+	char *name;
+
+	_enter("");
+
+	if (optlen <= 0 || optlen > PAGE_SIZE - 1)
+		return -EINVAL;
+
+	/* take a NUL-terminated kernel copy of the user's description */
+	name = kmalloc(optlen + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+	if (copy_from_user(name, optval, optlen) != 0) {
+		kfree(name);
+		return -EFAULT;
+	}
+	name[optlen] = 0;
+
+	/* the copy is only needed for the duration of the lookup */
+	keyring = request_key(&key_type_keyring, name, NULL);
+	kfree(name);
+	if (IS_ERR(keyring)) {
+		_leave(" = %ld", PTR_ERR(keyring));
+		return PTR_ERR(keyring);
+	}
+
+	rx->securities = keyring;
+	_leave(" = 0 [key %x]", keyring->serial);
+	return 0;
+}
+
+/*
+ * generate a server data key: a ticketless v1 rxkad key set as conn->key
+ */
+int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
+			      const void *session_key,
+			      time_t expiry,
+			      u32 kvno)
+{
+	const struct cred *cred = current_cred();
+	struct key *key;
+	int ret;
+
+	struct {
+		u32 kver;
+		struct rxrpc_key_data_v1 v1;
+	} data;
+
+	_enter("");
+
+	key = key_alloc(&key_type_rxrpc, "x", 0, 0, cred, 0,
+			KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(key)) {
+		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
+		return -ENOMEM;
+	}
+
+	_debug("key %d", key_serial(key));
+
+	data.kver = 1;
+	data.v1.security_index = RXRPC_SECURITY_RXKAD;
+	data.v1.ticket_length = 0;
+	data.v1.expiry = expiry;
+	data.v1.kvno = 0;	/* NOTE(review): kvno argument is unused - confirm */
+
+	memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key));
+
+	ret = key_instantiate_and_link(key, &data, sizeof(data), NULL, NULL);
+	if (ret < 0)
+		goto error;
+
+	conn->key = key;
+	_leave(" = 0 [%d]", key_serial(key));
+	return 0;
+
+error:
+	key_revoke(key);
+	key_put(key);
+	_leave(" = -ENOMEM [ins %d]", ret);
+	return -ENOMEM;	/* reported whatever the actual error in ret was */
+}
+EXPORT_SYMBOL(rxrpc_get_server_data_key);
+
+/**
+ * rxrpc_get_null_key - Generate a null RxRPC key
+ * @keyname: The name to give the key.
+ *
+ * Allocate and instantiate a payload-less RxRPC key, which can be used to
+ * indicate that anonymous security is required for a particular domain.
+ */
+struct key *rxrpc_get_null_key(const char *keyname)
+{
+	struct key *nullkey;
+	int err;
+
+	nullkey = key_alloc(&key_type_rxrpc, keyname, 0, 0, current_cred(),
+			    KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(nullkey))
+		return nullkey;
+
+	/* no payload is supplied, so the key carries no security data */
+	err = key_instantiate_and_link(nullkey, NULL, 0, NULL, NULL);
+	if (err >= 0)
+		return nullkey;
+
+	/* instantiation failed: discard the half-made key */
+	key_revoke(nullkey);
+	key_put(nullkey);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(rxrpc_get_null_key);
+
+/*
+ * read the contents of an rxrpc key into a userspace buffer in XDR form
+ * - returns the size needed; nothing is written if the buffer is too small
+ */
+static long rxrpc_read(const struct key *key,
+		       char __user *buffer, size_t buflen)
+{
+	const struct rxrpc_key_token *token;
+	const struct krb5_principal *princ;
+	size_t size;
+	__be32 __user *xdr, *oldxdr;
+	u32 cnlen, toksize, ntoks, tok, zero;
+	u16 toksizes[AFSTOKEN_MAX];
+	int loop;
+
+	_enter("");
+
+	/* we don't know what form we should return non-AFS keys in */
+	if (memcmp(key->description, "afs@", 4) != 0)
+		return -EOPNOTSUPP;
+	cnlen = strlen(key->description + 4);
+
+#define RND(X) (((X) + 3) & ~3)
+
+	/* AFS keys are returned in XDR form, so work out the size of that */
+	size = 2 * 4;	/* flags, cellname len */
+	size += RND(cnlen);	/* cellname */
+	size += 1 * 4;	/* token count */
+
+	ntoks = 0;
+	for (token = key->payload.data; token; token = token->next) {
+		toksize = 4;	/* sec index */
+
+		switch (token->security_index) {
+		case RXRPC_SECURITY_RXKAD:
+			toksize += 8 * 4;	/* viceid, kvno, key*2, begin,
+						 * end, primary, tktlen */
+			toksize += RND(token->kad->ticket_len);
+			break;
+
+		case RXRPC_SECURITY_RXK5:
+			princ = &token->k5->client;
+			toksize += 4 + princ->n_name_parts * 4;
+			for (loop = 0; loop < princ->n_name_parts; loop++)
+				toksize += RND(strlen(princ->name_parts[loop]));
+			toksize += 4 + RND(strlen(princ->realm));
+
+			princ = &token->k5->server;
+			toksize += 4 + princ->n_name_parts * 4;
+			for (loop = 0; loop < princ->n_name_parts; loop++)
+				toksize += RND(strlen(princ->name_parts[loop]));
+			toksize += 4 + RND(strlen(princ->realm));
+
+			toksize += 8 + RND(token->k5->session.data_len);
+
+			toksize += 4 * 8 + 2 * 4;
+
+			toksize += 4 + token->k5->n_addresses * 8;
+			for (loop = 0; loop < token->k5->n_addresses; loop++)
+				toksize += RND(token->k5->addresses[loop].data_len);
+
+			toksize += 4 + RND(token->k5->ticket_len);
+			toksize += 4 + RND(token->k5->ticket2_len);
+
+			toksize += 4 + token->k5->n_authdata * 8;
+			for (loop = 0; loop < token->k5->n_authdata; loop++)
+				toksize += RND(token->k5->authdata[loop].data_len);
+			break;
+
+		default: /* we have a ticket we can't encode */
+			BUG();
+			continue;
+		}
+
+		_debug("token[%u]: toksize=%u", ntoks, toksize);
+		ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX);
+
+		toksizes[ntoks++] = toksize;
+		size += toksize + 4; /* each token has a length word */
+	}
+#undef RND
+
+	if (!buffer || buflen < size)
+		return size;	/* just report how much space is needed */
+
+	xdr = (__be32 __user *) buffer;
+	zero = 0;
+#define ENCODE(x)				\
+	do {					\
+		__be32 y = htonl(x);		\
+		if (put_user(y, xdr++) < 0)	\
+			goto fault;		\
+	} while(0)
+#define ENCODE_DATA(l, s)						\
+	do {								\
+		u32 _l = (l);						\
+		ENCODE(_l);						\
+		if (copy_to_user(xdr, (s), _l) != 0)			\
+			goto fault;					\
+		if (_l & 3 &&						\
+		    copy_to_user((u8 *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \
+			goto fault;					\
+		xdr += (_l + 3) >> 2;					\
+	} while(0)
+#define ENCODE64(x)					\
+	do {						\
+		__be64 y = cpu_to_be64(x);		\
+		if (copy_to_user(xdr, &y, 8) != 0)	\
+			goto fault;			\
+		xdr += 8 >> 2;				\
+	} while(0)
+#define ENCODE_STR(s)				\
+	do {					\
+		const char *_s = (s);		\
+		ENCODE_DATA(strlen(_s), _s);	\
+	} while(0)
+
+	ENCODE(0);					/* flags */
+	ENCODE_DATA(cnlen, key->description + 4);	/* cellname */
+	ENCODE(ntoks);
+
+	tok = 0;
+	for (token = key->payload.data; token; token = token->next) {
+		toksize = toksizes[tok++];
+		ENCODE(toksize);
+		oldxdr = xdr;	/* toksize is measured from the sec index */
+		ENCODE(token->security_index);
+
+		switch (token->security_index) {
+		case RXRPC_SECURITY_RXKAD:
+			ENCODE(token->kad->vice_id);
+			ENCODE(token->kad->kvno);
+			if (copy_to_user(xdr, token->kad->session_key, 8) != 0)
+				goto fault;
+			xdr += 8 >> 2;	/* raw key: sized above as 2 words, no len */
+			ENCODE(token->kad->start);
+			ENCODE(token->kad->expiry);
+			ENCODE(token->kad->primary_flag);
+			ENCODE_DATA(token->kad->ticket_len, token->kad->ticket);
+			break;
+
+		case RXRPC_SECURITY_RXK5:
+			princ = &token->k5->client;
+			ENCODE(princ->n_name_parts);
+			for (loop = 0; loop < princ->n_name_parts; loop++)
+				ENCODE_STR(princ->name_parts[loop]);
+			ENCODE_STR(princ->realm);
+
+			princ = &token->k5->server;
+			ENCODE(princ->n_name_parts);
+			for (loop = 0; loop < princ->n_name_parts; loop++)
+				ENCODE_STR(princ->name_parts[loop]);
+			ENCODE_STR(princ->realm);
+
+			ENCODE(token->k5->session.tag);
+			ENCODE_DATA(token->k5->session.data_len,
+				    token->k5->session.data);
+
+			ENCODE64(token->k5->authtime);
+			ENCODE64(token->k5->starttime);
+			ENCODE64(token->k5->endtime);
+			ENCODE64(token->k5->renew_till);
+			ENCODE(token->k5->is_skey);
+			ENCODE(token->k5->flags);
+
+			ENCODE(token->k5->n_addresses);
+			for (loop = 0; loop < token->k5->n_addresses; loop++) {
+				ENCODE(token->k5->addresses[loop].tag);
+				ENCODE_DATA(token->k5->addresses[loop].data_len,
+					    token->k5->addresses[loop].data);
+			}
+
+			ENCODE_DATA(token->k5->ticket_len, token->k5->ticket);
+			ENCODE_DATA(token->k5->ticket2_len, token->k5->ticket2);
+
+			ENCODE(token->k5->n_authdata);
+			for (loop = 0; loop < token->k5->n_authdata; loop++) {
+				ENCODE(token->k5->authdata[loop].tag);
+				ENCODE_DATA(token->k5->authdata[loop].data_len,
+					    token->k5->authdata[loop].data);
+			}
+			break;
+
+		default:
+			BUG();
+			break;
+		}
+
+		ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==,
+			  toksize);
+	}
+
+#undef ENCODE_STR
+#undef ENCODE_DATA
+#undef ENCODE64
+#undef ENCODE
+
+	ASSERTCMP(tok, ==, ntoks);
+	ASSERTCMP((char __user *) xdr - buffer, ==, size);
+	_leave(" = %zu", size);
+	return size;
+
+fault:
+	_leave(" = -EFAULT");
+	return -EFAULT;
+}
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
new file mode 100644
index 00000000..87f7135d
--- /dev/null
+++ b/net/rxrpc/ar-local.c
@@ -0,0 +1,310 @@
+/* AF_RXRPC local endpoint management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static LIST_HEAD(rxrpc_locals);		/* the local endpoints in existence */
+DEFINE_RWLOCK(rxrpc_local_lock);	/* guards rxrpc_locals */
+static DECLARE_RWSEM(rxrpc_local_sem);	/* serialises lookup/create/destroy */
+static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq);	/* woken when list empties */
+
+static void rxrpc_destroy_local(struct work_struct *work);
+
+/*
+ * allocate and initialise a local endpoint record (usage count starts at 1)
+ */
+static
+struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx)
+{
+	struct rxrpc_local *local = kzalloc(sizeof(*local), GFP_KERNEL);
+
+	if (!local)
+		goto out;
+
+	INIT_WORK(&local->destroyer, &rxrpc_destroy_local);
+	INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls);
+	INIT_WORK(&local->rejecter, &rxrpc_reject_packets);
+	INIT_LIST_HEAD(&local->services);
+	INIT_LIST_HEAD(&local->link);
+	init_rwsem(&local->defrag_sem);
+	skb_queue_head_init(&local->accept_queue);
+	skb_queue_head_init(&local->reject_queue);
+	spin_lock_init(&local->lock);
+	rwlock_init(&local->services_lock);
+	atomic_set(&local->usage, 1);
+	local->debug_id = atomic_inc_return(&rxrpc_debug_id);
+	memcpy(&local->srx, srx, sizeof(*srx));
+out:
+	_leave(" = %p", local);
+	return local;
+}
+
+/*
+ * create the local's UDP socket, configure it and add the local to the list
+ * - must be called with rxrpc_local_sem writelocked
+ */
+static int rxrpc_create_local(struct rxrpc_local *local)
+{
+	struct sock *sock;
+	int ret, opt;
+
+	_enter("%p{%d}", local, local->srx.transport_type);
+
+	/* create a socket to represent the local endpoint */
+	ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP,
+			       &local->socket);
+	if (ret < 0) {
+		_leave(" = %d [socket]", ret);
+		return ret;
+	}
+
+	/* if a local address was supplied then bind it */
+	if (local->srx.transport_len > sizeof(sa_family_t)) {
+		_debug("bind");
+		ret = kernel_bind(local->socket,
+				  (struct sockaddr *) &local->srx.transport,
+				  local->srx.transport_len);
+		if (ret < 0) {
+			_debug("bind failed");
+			goto error;
+		}
+	}
+
+	/* we want to receive ICMP errors */
+	opt = 1;
+	ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
+				(char *) &opt, sizeof(opt));
+	if (ret < 0) {
+		_debug("setsockopt failed");
+		goto error;
+	}
+
+	/* we want to set the don't fragment bit */
+	opt = IP_PMTUDISC_DO;
+	ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
+				(char *) &opt, sizeof(opt));
+	if (ret < 0) {
+		_debug("setsockopt failed");
+		goto error;
+	}
+
+	write_lock_bh(&rxrpc_local_lock);
+	list_add(&local->link, &rxrpc_locals);
+	write_unlock_bh(&rxrpc_local_lock);
+
+	/* set the socket up: point it back at the local and hook callbacks */
+	sock = local->socket->sk;
+	sock->sk_user_data	= local;
+	sock->sk_data_ready	= rxrpc_data_ready;
+	sock->sk_error_report	= rxrpc_UDP_error_report;
+	_leave(" = 0");
+	return 0;
+
+error:	/* the local was never listed, so only the socket needs undoing */
+	kernel_sock_shutdown(local->socket, SHUT_RDWR);
+	local->socket->sk->sk_user_data = NULL;
+	sock_release(local->socket);
+	local->socket = NULL;
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * get the local endpoint matching a UDP address, creating one if none exists
+ */
+struct rxrpc_local *rxrpc_lookup_local(struct sockaddr_rxrpc *srx)
+{
+	struct rxrpc_local *local;
+	int ret;
+
+	_enter("{%d,%u,%pI4+%hu}",
+	       srx->transport_type,
+	       srx->transport.family,
+	       &srx->transport.sin.sin_addr,
+	       ntohs(srx->transport.sin.sin_port));
+
+	down_write(&rxrpc_local_sem);
+
+	/* see if we have a suitable local endpoint already */
+	read_lock_bh(&rxrpc_local_lock);
+
+	list_for_each_entry(local, &rxrpc_locals, link) {
+		_debug("CMP {%d,%u,%pI4+%hu}",
+		       local->srx.transport_type,
+		       local->srx.transport.family,
+		       &local->srx.transport.sin.sin_addr,
+		       ntohs(local->srx.transport.sin.sin_port));
+
+		if (local->srx.transport_type != srx->transport_type ||
+		    local->srx.transport.family != srx->transport.family)
+			continue;
+
+		switch (srx->transport.family) {
+		case AF_INET:
+			if (local->srx.transport.sin.sin_port !=
+			    srx->transport.sin.sin_port)
+				continue;
+			if (memcmp(&local->srx.transport.sin.sin_addr,
+				   &srx->transport.sin.sin_addr,
+				   sizeof(struct in_addr)) != 0)
+				continue;
+			goto found_local;
+
+		default:
+			BUG();	/* only AF_INET transports are handled */
+		}
+	}
+
+	read_unlock_bh(&rxrpc_local_lock);
+
+	/* we didn't find one, so we need to create one */
+	local = rxrpc_alloc_local(srx);
+	if (!local) {
+		up_write(&rxrpc_local_sem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ret = rxrpc_create_local(local);
+	if (ret < 0) {
+		up_write(&rxrpc_local_sem);
+		kfree(local);
+		_leave(" = %d", ret);
+		return ERR_PTR(ret);
+	}
+
+	up_write(&rxrpc_local_sem);
+
+	_net("LOCAL new %d {%d,%u,%pI4+%hu}",
+	     local->debug_id,
+	     local->srx.transport_type,
+	     local->srx.transport.family,
+	     &local->srx.transport.sin.sin_addr,
+	     ntohs(local->srx.transport.sin.sin_port));
+
+	_leave(" = %p [new]", local);
+	return local;
+
+found_local:	/* reached with rxrpc_local_lock still read-locked */
+	rxrpc_get_local(local);
+	read_unlock_bh(&rxrpc_local_lock);
+	up_write(&rxrpc_local_sem);
+
+	_net("LOCAL old %d {%d,%u,%pI4+%hu}",
+	     local->debug_id,
+	     local->srx.transport_type,
+	     local->srx.transport.family,
+	     &local->srx.transport.sin.sin_addr,
+	     ntohs(local->srx.transport.sin.sin_port));
+
+	_leave(" = %p [reuse]", local);
+	return local;
+}
+
+/*
+ * drop a reference on a local endpoint, queueing its destroyer on the last
+ */
+void rxrpc_put_local(struct rxrpc_local *local)
+{
+	bool last;
+
+	_enter("%p{u=%d}", local, atomic_read(&local->usage));
+	ASSERTCMP(atomic_read(&local->usage), >, 0);
+	/* the lock makes the decrement and the queueing effectively atomic */
+	write_lock_bh(&rxrpc_local_lock);
+	last = atomic_dec_and_test(&local->usage);
+	if (unlikely(last)) {
+		_debug("destroy local");
+		rxrpc_queue_work(&local->destroyer);
+	}
+	write_unlock_bh(&rxrpc_local_lock);
+	_leave("");
+}
+
+/*
+ * destroy a local endpoint (work item); backs off if it has been reused
+ */
+static void rxrpc_destroy_local(struct work_struct *work)
+{
+	struct rxrpc_local *local =
+		container_of(work, struct rxrpc_local, destroyer);
+
+	_enter("%p{%d}", local, atomic_read(&local->usage));
+
+	down_write(&rxrpc_local_sem);
+
+	write_lock_bh(&rxrpc_local_lock);
+	if (atomic_read(&local->usage) > 0) {
+		write_unlock_bh(&rxrpc_local_lock);
+		up_write(&rxrpc_local_sem);	/* still write-held at this point */
+		_leave(" [resurrected]");
+		return;
+	}
+
+	list_del(&local->link);
+	local->socket->sk->sk_user_data = NULL;
+	write_unlock_bh(&rxrpc_local_lock);
+
+	downgrade_write(&rxrpc_local_sem);	/* held for reading from here */
+
+	ASSERT(list_empty(&local->services));
+	ASSERT(!work_pending(&local->acceptor));
+	ASSERT(!work_pending(&local->rejecter));
+
+	/* finish cleaning up the local descriptor */
+	rxrpc_purge_queue(&local->accept_queue);
+	rxrpc_purge_queue(&local->reject_queue);
+	kernel_sock_shutdown(local->socket, SHUT_RDWR);
+	sock_release(local->socket);
+
+	up_read(&rxrpc_local_sem);
+
+	_net("DESTROY LOCAL %d", local->debug_id);
+	kfree(local);
+
+	if (list_empty(&rxrpc_locals))	/* rxrpc_destroy_all_locals() waits */
+		wake_up_all(&rxrpc_local_wq);
+
+	_leave("");
+}
+
+/*
+ * wait on the exit path until every local endpoint has been destroyed and
+ * the list of locals is empty
+ */
+void __exit rxrpc_destroy_all_locals(void)
+{
+	DECLARE_WAITQUEUE(waiter, current);
+
+	_enter("");
+
+	if (list_empty(&rxrpc_locals))
+		goto out;
+
+	/* nothing to do but sleep until they have all gone away */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&rxrpc_local_wq, &waiter);
+	while (!list_empty(&rxrpc_locals)) {
+		schedule();
+		set_current_state(TASK_UNINTERRUPTIBLE);
+	}
+	remove_wait_queue(&rxrpc_local_wq, &waiter);
+	set_current_state(TASK_RUNNING);
+
+out:
+	_leave("");
+}
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
new file mode 100644
index 00000000..5f22e263
--- /dev/null
+++ b/net/rxrpc/ar-output.c
@@ -0,0 +1,738 @@
+/* RxRPC packet transmission
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/net.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/circ_buf.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+int rxrpc_resend_timeout = 4;
+
+static int rxrpc_send_data(struct kiocb *iocb,
+			   struct rxrpc_sock *rx,
+			   struct rxrpc_call *call,
+			   struct msghdr *msg, size_t len);
+
+/*
+ * parse the sendmsg() control buffer into a call ID, command and abort code
+ */
+static int rxrpc_sendmsg_cmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+			      unsigned long *user_call_ID,
+			      enum rxrpc_command *command,
+			      u32 *abort_code,
+			      bool server)
+{
+	struct cmsghdr *cmsg;
+	int len;
+
+	*command = RXRPC_CMD_SEND_DATA;	/* unless a command cmsg says otherwise */
+
+	if (msg->msg_controllen == 0)
+		return -EINVAL;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+		_debug("CMSG %d, %d, %d",
+		       cmsg->cmsg_level, cmsg->cmsg_type, len);
+
+		if (cmsg->cmsg_level != SOL_RXRPC)
+			continue;	/* not one of ours: skip it */
+
+		switch (cmsg->cmsg_type) {
+		case RXRPC_USER_CALL_ID:
+			if (msg->msg_flags & MSG_CMSG_COMPAT) {
+				if (len != sizeof(u32))	/* compat ID is 32-bit */
+					return -EINVAL;
+				*user_call_ID = *(u32 *) CMSG_DATA(cmsg);
+			} else {
+				if (len != sizeof(unsigned long))
+					return -EINVAL;
+				*user_call_ID = *(unsigned long *)
+					CMSG_DATA(cmsg);
+			}
+			_debug("User Call ID %lx", *user_call_ID);
+			break;
+
+		case RXRPC_ABORT:
+			if (*command != RXRPC_CMD_SEND_DATA)	/* one command only */
+				return -EINVAL;
+			*command = RXRPC_CMD_SEND_ABORT;
+			if (len != sizeof(*abort_code))
+				return -EINVAL;
+			*abort_code = *(unsigned int *) CMSG_DATA(cmsg);
+			_debug("Abort %x", *abort_code);
+			if (*abort_code == 0)
+				return -EINVAL;
+			break;
+
+		case RXRPC_ACCEPT:
+			if (*command != RXRPC_CMD_SEND_DATA)	/* one command only */
+				return -EINVAL;
+			*command = RXRPC_CMD_ACCEPT;
+			if (len != 0)
+				return -EINVAL;
+			if (!server)	/* accept is only valid on server sockets */
+				return -EISCONN;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * abort a call locally and queue it so that an ABORT goes to the peer
+ */
+static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
+{
+	write_lock_bh(&call->state_lock);
+	if (call->state > RXRPC_CALL_COMPLETE)
+		goto unlock;
+
+	call->state = RXRPC_CALL_LOCALLY_ABORTED;
+	call->abort_code = abort_code;
+	set_bit(RXRPC_CALL_ABORT, &call->events);
+	del_timer_sync(&call->resend_timer);
+	del_timer_sync(&call->ack_timer);
+	clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+	clear_bit(RXRPC_CALL_ACK, &call->events);
+	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+	rxrpc_queue_call(call);
+unlock:
+	write_unlock_bh(&call->state_lock);
+}
+
+/*
+ * send data on, or abort, a client call through an RxRPC socket
+ * - caller holds the socket locked
+ * - the socket may be either a client socket or a server socket
+ */
+int rxrpc_client_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx,
+			 struct rxrpc_transport *trans, struct msghdr *msg,
+			 size_t len)
+{
+	struct rxrpc_conn_bundle *bundle;
+	enum rxrpc_command cmd;
+	struct rxrpc_call *call;
+	unsigned long user_call_ID = 0;
+	struct key *key;
+	__be16 service_id;
+	u32 abort_code = 0;
+	int ret;
+
+	_enter("");
+
+	ASSERT(trans != NULL);
+
+	ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code,
+				 false);
+	if (ret < 0)
+		return ret;
+
+	bundle = NULL;
+	if (trans) {
+		service_id = rx->service_id;
+		if (msg->msg_name) {	/* an explicit address overrides it */
+			struct sockaddr_rxrpc *srx =
+				(struct sockaddr_rxrpc *) msg->msg_name;
+			service_id = htons(srx->srx_service);
+		}
+		key = rx->key;
+		if (key && !rx->key->payload.data)
+			key = NULL;	/* a key with no payload is not used */
+		bundle = rxrpc_get_bundle(rx, trans, key, service_id,
+					  GFP_KERNEL);
+		if (IS_ERR(bundle))
+			return PTR_ERR(bundle);
+	}
+
+	call = rxrpc_get_client_call(rx, trans, bundle, user_call_ID,
+				     abort_code == 0, GFP_KERNEL);
+	if (trans)
+		rxrpc_put_bundle(trans, bundle);
+	if (IS_ERR(call)) {
+		_leave(" = %ld", PTR_ERR(call));
+		return PTR_ERR(call);
+	}
+
+	_debug("CALL %d USR %lx ST %d on CONN %p",
+	       call->debug_id, call->user_call_ID, call->state, call->conn);
+
+	if (call->state >= RXRPC_CALL_COMPLETE) {
+		/* it's too late for this call */
+		ret = -ESHUTDOWN;
+	} else if (cmd == RXRPC_CMD_SEND_ABORT) {
+		rxrpc_send_abort(call, abort_code);	/* ret is still 0 here */
+	} else if (cmd != RXRPC_CMD_SEND_DATA) {
+		ret = -EINVAL;
+	} else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) {
+		/* request phase complete for this client call */
+		ret = -EPROTO;
+	} else {
+		ret = rxrpc_send_data(iocb, rx, call, msg, len);
+	}
+
+	rxrpc_put_call(call);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/**
+ * rxrpc_kernel_send_data - Allow a kernel service to send data on a call
+ * @call: The call to send data through
+ * @msg: The data to send
+ * @len: The amount of data to send
+ *
+ * Allow a kernel service to send data on a call.  The call must be in a state
+ * appropriate to sending data.  No control data should be supplied in @msg,
+ * nor should an address be supplied.  MSG_MORE should be flagged if there's
+ * more data to come, otherwise this data will end the transmission phase.
+ */
+int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg,
+			   size_t len)
+{
+	int ret;
+
+	_enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]);
+
+	ASSERTCMP(msg->msg_name, ==, NULL);
+	ASSERTCMP(msg->msg_control, ==, NULL);
+
+	lock_sock(&call->socket->sk);
+
+	_debug("CALL %d USR %lx ST %d on CONN %p",
+	       call->debug_id, call->user_call_ID, call->state, call->conn);
+
+	if (call->state >= RXRPC_CALL_COMPLETE) {
+		ret = -ESHUTDOWN; /* it's too late for this call */
+	} else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+		   call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+		   call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+		ret = -EPROTO; /* the call is not in one of the sending states */
+	} else {
+		mm_segment_t oldfs = get_fs();
+		set_fs(KERNEL_DS);	/* restored once the data has been sent */
+		ret = rxrpc_send_data(NULL, call->socket, call, msg, len);
+		set_fs(oldfs);
+	}
+
+	release_sock(&call->socket->sk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_send_data);
+
+/**
+ * rxrpc_kernel_abort_call - Allow a kernel service to abort a call
+ * @call: The call to be aborted
+ * @abort_code: The abort code to stick into the ABORT packet
+ *
+ * Abort a call on behalf of a kernel service, provided it has not completed.
+ */
+void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code)
+{
+	struct sock *sk = &call->socket->sk;
+
+	_enter("{%d},%d", call->debug_id, abort_code);
+
+	lock_sock(sk);
+	_debug("CALL %d USR %lx ST %d on CONN %p",
+	       call->debug_id, call->user_call_ID, call->state, call->conn);
+	if (call->state < RXRPC_CALL_COMPLETE)
+		rxrpc_send_abort(call, abort_code);
+	release_sock(sk);
+
+	_leave("");
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_abort_call);
+
+/*
+ * handle a sendmsg on a server socket: accept, send data on or abort a call
+ * - caller holds the socket locked
+ */
+int rxrpc_server_sendmsg(struct kiocb *iocb, struct rxrpc_sock *rx,
+			 struct msghdr *msg, size_t len)
+{
+	enum rxrpc_command cmd;
+	struct rxrpc_call *call;
+	unsigned long user_call_ID = 0;
+	u32 abort_code = 0;
+	int ret;
+
+	_enter("");
+
+	ret = rxrpc_sendmsg_cmsg(rx, msg, &user_call_ID, &cmd, &abort_code,
+				 true);
+	if (ret < 0)
+		return ret;
+
+	if (cmd == RXRPC_CMD_ACCEPT) {
+		call = rxrpc_accept_call(rx, user_call_ID);
+		if (IS_ERR(call))
+			return PTR_ERR(call);
+		rxrpc_put_call(call);
+		return 0;
+	}
+
+	call = rxrpc_find_server_call(rx, user_call_ID);
+	if (!call)
+		return -EBADSLT;
+	if (call->state >= RXRPC_CALL_COMPLETE) {
+		ret = -ESHUTDOWN;
+		goto out;
+	}
+
+	switch (cmd) {
+	case RXRPC_CMD_SEND_DATA:
+		if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST &&
+		    call->state != RXRPC_CALL_SERVER_ACK_REQUEST &&
+		    call->state != RXRPC_CALL_SERVER_SEND_REPLY) {
+			/* call isn't in a state in which data may be sent */
+			ret = -EPROTO;
+			break;
+		}
+
+		ret = rxrpc_send_data(iocb, rx, call, msg, len);
+		break;
+
+	case RXRPC_CMD_SEND_ABORT:
+		rxrpc_send_abort(call, abort_code); /* ret is left as set above */
+		break;
+	default:
+		BUG();
+	}
+
+	out:
+	rxrpc_put_call(call);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * send a packet through the transport endpoint, fragmenting only if need be
+ */
+int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
+{
+	struct kvec iov[1];
+	struct msghdr msg;
+	int ret, opt;
+
+	_enter(",{%d}", skb->len);
+
+	iov[0].iov_base = skb->head; /* send from the start of the buffer */
+	iov[0].iov_len = skb->len;
+
+	msg.msg_name = &trans->peer->srx.transport.sin;
+	msg.msg_namelen = sizeof(trans->peer->srx.transport.sin);
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+
+	/* send the packet with the don't fragment bit set if we currently
+	 * think it's small enough */
+	if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) {
+		down_read(&trans->local->defrag_sem);
+		/* send the packet by UDP
+		 * - returns -EMSGSIZE if UDP would have to fragment the packet
+		 *   to go out of the interface
+		 *   - in which case, we'll have processed the ICMP error
+		 *     message and update the peer record
+		 */
+		ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1,
+				     iov[0].iov_len);
+
+		up_read(&trans->local->defrag_sem);
+		if (ret == -EMSGSIZE)
+			goto send_fragmentable;
+
+		_leave(" = %d [%u]", ret, trans->peer->maxdata);
+		return ret;
+	}
+
+send_fragmentable:
+	/* attempt to send this message with fragmentation enabled */
+	_debug("send fragment");
+
+	down_write(&trans->local->defrag_sem); /* excludes the senders above */
+	opt = IP_PMTUDISC_DONT;
+	ret = kernel_setsockopt(trans->local->socket, SOL_IP, IP_MTU_DISCOVER,
+				(char *) &opt, sizeof(opt));
+	if (ret == 0) {
+		ret = kernel_sendmsg(trans->local->socket, &msg, iov, 1,
+				     iov[0].iov_len);
+
+		opt = IP_PMTUDISC_DO; /* turn path-MTU discovery back on */
+		kernel_setsockopt(trans->local->socket, SOL_IP,
+				  IP_MTU_DISCOVER, (char *) &opt, sizeof(opt));
+	}
+
+	up_write(&trans->local->defrag_sem);
+	_leave(" = %d [frag %u]", ret, trans->peer->maxdata);
+	return ret;
+}
+
+/*
+ * sleep until there's space in the transmit/ACK window or a signal arrives
+ * - caller holds the socket locked
+ */
+static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
+				    struct rxrpc_call *call,
+				    long *timeo)
+{
+	DECLARE_WAITQUEUE(myself, current);
+	int ret;
+
+	_enter(",{%d},%ld",
+	       CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz),
+	       *timeo);
+
+	add_wait_queue(&call->tx_waitq, &myself);
+
+	/* NOTE(review): the loop has no exit for *timeo reaching 0 */
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		ret = 0;
+		if (CIRC_SPACE(call->acks_head, call->acks_tail,
+			       call->acks_winsz) > 0)
+			break;
+		if (signal_pending(current)) {
+			ret = sock_intr_errno(*timeo);
+			break;
+		}
+		/* the socket lock is dropped for the duration of the sleep */
+		release_sock(&rx->sk);
+		*timeo = schedule_timeout(*timeo);
+		lock_sock(&rx->sk);
+	}
+
+	remove_wait_queue(&call->tx_waitq, &myself);
+	set_current_state(TASK_RUNNING);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/* try to get the call's resend processing to run right away, unless the
+ * resend timer can't be cancelled at the moment */
+static inline void rxrpc_instant_resend(struct rxrpc_call *call)
+{
+	read_lock_bh(&call->state_lock);
+	if (try_to_del_timer_sync(&call->resend_timer) < 0)
+		goto unlock;
+	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
+	if (call->state < RXRPC_CALL_COMPLETE &&
+	    !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+		rxrpc_queue_call(call);
+unlock:
+	read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * queue a packet for transmission, set the resend timer and attempt
+ * to send the packet immediately
+ */
+static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
+			       bool last)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	int ret;
+
+	_net("queue skb %p [%d]", skb, call->acks_head);
+
+	ASSERT(call->acks_window != NULL);
+	call->acks_window[call->acks_head] = (unsigned long) skb;
+	smp_wmb(); /* fill the slot before advancing the head past it */
+	call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1);
+
+	if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) {
+		_debug("________awaiting reply/ACK__________");
+		write_lock_bh(&call->state_lock);
+		switch (call->state) {
+		case RXRPC_CALL_CLIENT_SEND_REQUEST:
+			call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+			break;
+		case RXRPC_CALL_SERVER_ACK_REQUEST:
+			call->state = RXRPC_CALL_SERVER_SEND_REPLY;
+			if (!last)
+				break;
+		case RXRPC_CALL_SERVER_SEND_REPLY: /* also reached by fall-through */
+			call->state = RXRPC_CALL_SERVER_AWAIT_ACK;
+			break;
+		default:
+			break;
+		}
+		write_unlock_bh(&call->state_lock);
+	}
+
+	_proto("Tx DATA %%%u { #%u }",
+	       ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+
+	sp->need_resend = 0;
+	sp->resend_at = jiffies + rxrpc_resend_timeout * HZ;
+	if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) {
+		_debug("run timer");
+		call->resend_timer.expires = sp->resend_at;
+		add_timer(&call->resend_timer);
+	}
+
+	/* attempt to cancel the rx-ACK timer, deferring reply transmission if
+	 * we're ACK'ing the request phase of an incoming call */
+	ret = -EAGAIN; /* forces a resend if we can't transmit right now */
+	if (try_to_del_timer_sync(&call->ack_timer) >= 0) {
+		/* the packet may be freed by rxrpc_process_call() before this
+		 * returns */
+		ret = rxrpc_send_packet(call->conn->trans, skb);
+		_net("sent skb %p", skb);
+	} else {
+		_debug("failed to delete ACK timer");
+	}
+
+	if (ret < 0) {
+		_debug("need instant resend %d", ret);
+		sp->need_resend = 1;
+		rxrpc_instant_resend(call);
+	}
+
+	_leave("");
+}
+
+/*
+ * copy the caller's data into DATA packets and queue them for transmission
+ * - must be called in process context
+ * - caller holds the socket locked
+ */
+static int rxrpc_send_data(struct kiocb *iocb,
+			   struct rxrpc_sock *rx,
+			   struct rxrpc_call *call,
+			   struct msghdr *msg, size_t len)
+{
+	struct rxrpc_skb_priv *sp;
+	unsigned char __user *from;
+	struct sk_buff *skb;
+	struct iovec *iov;
+	struct sock *sk = &rx->sk;
+	long timeo;
+	bool more;
+	int ret, ioc, segment, copied;
+
+	_enter(",,,{%zu},%zu", msg->msg_iovlen, len);
+
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+	/* this should be in poll */
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		return -EPIPE;
+
+	iov = msg->msg_iov; /* NOTE(review): assumes msg_iovlen >= 1 */
+	ioc = msg->msg_iovlen - 1;
+	from = iov->iov_base;
+	segment = iov->iov_len;
+	iov++;
+	more = msg->msg_flags & MSG_MORE;
+
+	skb = call->tx_pending; /* resume a partly filled packet, if any */
+	call->tx_pending = NULL;
+
+	copied = 0;
+	do {
+		int copy;
+
+		if (segment > len)
+			segment = len;
+
+		_debug("SEGMENT %d @%p", segment, from);
+
+		if (!skb) {
+			size_t size, chunk, max, space;
+
+			_debug("alloc");
+
+			if (CIRC_SPACE(call->acks_head, call->acks_tail,
+				       call->acks_winsz) <= 0) {
+				ret = -EAGAIN;
+				if (msg->msg_flags & MSG_DONTWAIT)
+					goto maybe_error;
+				ret = rxrpc_wait_for_tx_window(rx, call,
+							       &timeo);
+				if (ret < 0)
+					goto maybe_error;
+			}
+
+			max = call->conn->trans->peer->maxdata;
+			max -= call->conn->security_size;
+			max &= ~(call->conn->size_align - 1UL);
+
+			chunk = max;
+			if (chunk > len && !more)
+				chunk = len;
+
+			space = chunk + call->conn->size_align;
+			space &= ~(call->conn->size_align - 1UL);
+
+			size = space + call->conn->header_size;
+
+			_debug("SIZE: %zu/%zu/%zu", chunk, space, size);
+
+			/* create a buffer that we can retain until it's ACK'd */
+			skb = sock_alloc_send_skb(
+				sk, size, msg->msg_flags & MSG_DONTWAIT, &ret);
+			if (!skb)
+				goto maybe_error;
+
+			rxrpc_new_skb(skb);
+
+			_debug("ALLOC SEND %p", skb);
+
+			ASSERTCMP(skb->mark, ==, 0);
+
+			_debug("HS: %u", call->conn->header_size);
+			skb_reserve(skb, call->conn->header_size);
+			skb->len += call->conn->header_size;
+
+			sp = rxrpc_skb(skb);
+			sp->remain = chunk;
+			if (sp->remain > skb_tailroom(skb))
+				sp->remain = skb_tailroom(skb);
+
+			_net("skb: hr %d, tr %d, hl %d, rm %d",
+			       skb_headroom(skb),
+			       skb_tailroom(skb),
+			       skb_headlen(skb),
+			       sp->remain);
+
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		}
+
+		_debug("append");
+		sp = rxrpc_skb(skb);
+
+		/* append next segment of data to the current buffer */
+		copy = skb_tailroom(skb);
+		ASSERTCMP(copy, >, 0);
+		if (copy > segment)
+			copy = segment;
+		if (copy > sp->remain)
+			copy = sp->remain;
+
+		_debug("add");
+		ret = skb_add_data(skb, from, copy);
+		_debug("added");
+		if (ret < 0)
+			goto efault;
+		sp->remain -= copy;
+		skb->mark += copy; /* running count of data bytes in this skb */
+		copied += copy;
+
+		len -= copy;
+		segment -= copy;
+		from += copy;
+		while (segment == 0 && ioc > 0) {
+			from = iov->iov_base;
+			segment = iov->iov_len;
+			iov++;
+			ioc--;
+		}
+		if (len == 0) {
+			segment = 0;
+			ioc = 0;
+		}
+
+		/* check for the far side aborting the call or a network error
+		 * occurring */
+		if (call->state > RXRPC_CALL_COMPLETE)
+			goto call_aborted;
+
+		/* add the packet to the send queue if it's now full */
+		if (sp->remain <= 0 || (segment == 0 && !more)) {
+			struct rxrpc_connection *conn = call->conn;
+			size_t pad;
+
+			/* pad out if we're using security */
+			if (conn->security) {
+				pad = conn->security_size + skb->mark;
+				pad = conn->size_align - pad;
+				pad &= conn->size_align - 1;
+				_debug("pad %zu", pad);
+				if (pad)
+					memset(skb_put(skb, pad), 0, pad);
+			}
+
+			sp->hdr.epoch = conn->epoch;
+			sp->hdr.cid = call->cid;
+			sp->hdr.callNumber = call->call_id;
+			sp->hdr.seq =
+				htonl(atomic_inc_return(&call->sequence));
+			sp->hdr.serial =
+				htonl(atomic_inc_return(&conn->serial));
+			sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
+			sp->hdr.userStatus = 0;
+			sp->hdr.securityIndex = conn->security_ix;
+			sp->hdr._rsvd = 0;
+			sp->hdr.serviceId = conn->service_id;
+
+			sp->hdr.flags = conn->out_clientflag;
+			if (len == 0 && !more)
+				sp->hdr.flags |= RXRPC_LAST_PACKET;
+			else if (CIRC_SPACE(call->acks_head, call->acks_tail,
+					    call->acks_winsz) > 1)
+				sp->hdr.flags |= RXRPC_MORE_PACKETS;
+
+			ret = rxrpc_secure_packet(
+				call, skb, skb->mark,
+				skb->head + sizeof(struct rxrpc_header));
+			if (ret < 0)
+				goto out;
+
+			memcpy(skb->head, &sp->hdr,
+			       sizeof(struct rxrpc_header));
+			rxrpc_queue_packet(call, skb, segment == 0 && !more);
+			skb = NULL;
+		}
+
+	} while (segment > 0);
+
+success:
+	ret = copied;
+out:
+	call->tx_pending = skb; /* keep any unfinished packet for next time */
+	_leave(" = %d", ret);
+	return ret;
+
+call_aborted:
+	rxrpc_free_skb(skb);
+	if (call->state == RXRPC_CALL_NETWORK_ERROR)
+		ret = call->conn->trans->peer->net_error;
+	else
+		ret = -ECONNABORTED;
+	_leave(" = %d", ret);
+	return ret;
+
+maybe_error:
+	if (copied) /* report a partial send as success */
+		goto success;
+	goto out;
+
+efault:
+	ret = -EFAULT;
+	goto out;
+}
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
new file mode 100644
index 00000000..2754f098
--- /dev/null
+++ b/net/rxrpc/ar-peer.c
@@ -0,0 +1,303 @@
+/* RxRPC remote transport endpoint management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include "ar-internal.h"
+
+static LIST_HEAD(rxrpc_peers);
+static DEFINE_RWLOCK(rxrpc_peer_lock);
+static DECLARE_WAIT_QUEUE_HEAD(rxrpc_peer_wq);
+
+static void rxrpc_destroy_peer(struct work_struct *work);
+
+/*
+ * work out the MTU of the route by which this peer is reached, leaving the
+ * default of 1500 in place if no route can be found
+ */
+static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
+{
+	__be32 daddr = peer->srx.transport.sin.sin_addr.s_addr;
+	struct flowi4 flow;
+	struct rtable *route;
+
+	peer->if_mtu = 1500;
+
+	route = ip_route_output_ports(&init_net, &flow, NULL, daddr, 0,
+				      htons(7000), htons(7001),
+				      IPPROTO_UDP, 0, 0);
+	if (!IS_ERR(route)) {
+		peer->if_mtu = dst_mtu(&route->dst);
+		dst_release(&route->dst);
+
+		_leave(" [if_mtu %u]", peer->if_mtu);
+		return;
+	}
+
+	_leave(" [route err %ld]", PTR_ERR(route));
+}
+
+/*
+ * allocate a peer record and fill it in from the given address; the record
+ * is returned with a usage count of 1
+ */
+static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx,
+					   gfp_t gfp)
+{
+	struct rxrpc_peer *peer;
+
+	_enter("");
+
+	peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
+	if (!peer)
+		goto out;
+
+	INIT_WORK(&peer->destroyer, &rxrpc_destroy_peer);
+	INIT_LIST_HEAD(&peer->link);
+	INIT_LIST_HEAD(&peer->error_targets);
+	spin_lock_init(&peer->lock);
+	atomic_set(&peer->usage, 1);
+	peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
+	memcpy(&peer->srx, srx, sizeof(*srx));
+
+	rxrpc_assess_MTU_size(peer);
+	peer->mtu = peer->if_mtu;
+
+	/* only UDP over IPv4 is understood; anything else is a bug */
+	if (srx->transport.family != AF_INET) {
+		BUG();
+	} else if (srx->transport_type != SOCK_DGRAM) {
+		peer->hdrsize = sizeof(struct iphdr);
+		BUG();
+	} else {
+		peer->hdrsize = sizeof(struct iphdr) + sizeof(struct udphdr);
+	}
+
+	/* payload room is whatever the MTU leaves after all the headers */
+	peer->hdrsize += sizeof(struct rxrpc_header);
+	peer->maxdata = peer->mtu - peer->hdrsize;
+
+out:
+	_leave(" = %p", peer);
+	return peer;
+}
+
+/*
+ * get a ref on the peer record for the given address, creating one if absent
+ */
+struct rxrpc_peer *rxrpc_get_peer(struct sockaddr_rxrpc *srx, gfp_t gfp)
+{
+	struct rxrpc_peer *peer, *candidate;
+	const char *new = "old"; /* trace tag: record reused or created */
+	int usage;
+
+	_enter("{%d,%d,%pI4+%hu}",
+	       srx->transport_type,
+	       srx->transport_len,
+	       &srx->transport.sin.sin_addr,
+	       ntohs(srx->transport.sin.sin_port));
+
+	/* search the peer list first */
+	read_lock_bh(&rxrpc_peer_lock);
+	list_for_each_entry(peer, &rxrpc_peers, link) {
+		_debug("check PEER %d { u=%d t=%d l=%d }",
+		       peer->debug_id,
+		       atomic_read(&peer->usage),
+		       peer->srx.transport_type,
+		       peer->srx.transport_len);
+
+		if (atomic_read(&peer->usage) > 0 &&
+		    peer->srx.transport_type == srx->transport_type &&
+		    peer->srx.transport_len == srx->transport_len &&
+		    memcmp(&peer->srx.transport,
+			   &srx->transport,
+			   srx->transport_len) == 0)
+			goto found_extant_peer;
+	}
+	read_unlock_bh(&rxrpc_peer_lock);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_peer(srx, gfp);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	write_lock_bh(&rxrpc_peer_lock);
+
+	list_for_each_entry(peer, &rxrpc_peers, link) {
+		if (atomic_read(&peer->usage) > 0 &&
+		    peer->srx.transport_type == srx->transport_type &&
+		    peer->srx.transport_len == srx->transport_len &&
+		    memcmp(&peer->srx.transport,
+			   &srx->transport,
+			   srx->transport_len) == 0)
+			goto found_extant_second;
+	}
+
+	/* we can now add the new candidate to the list */
+	peer = candidate;
+	candidate = NULL;
+	usage = atomic_read(&peer->usage); /* only used by the trace below */
+
+	list_add_tail(&peer->link, &rxrpc_peers);
+	write_unlock_bh(&rxrpc_peer_lock);
+	new = "new";
+
+success:
+	_net("PEER %s %d {%d,%u,%pI4+%hu}",
+	     new,
+	     peer->debug_id,
+	     peer->srx.transport_type,
+	     peer->srx.transport.family,
+	     &peer->srx.transport.sin.sin_addr,
+	     ntohs(peer->srx.transport.sin.sin_port));
+
+	_leave(" = %p {u=%d}", peer, usage);
+	return peer;
+
+	/* we found the peer in the list immediately */
+found_extant_peer:
+	usage = atomic_inc_return(&peer->usage);
+	read_unlock_bh(&rxrpc_peer_lock);
+	goto success;
+
+	/* we found the peer on the second time through the list */
+found_extant_second:
+	usage = atomic_inc_return(&peer->usage);
+	write_unlock_bh(&rxrpc_peer_lock);
+	kfree(candidate); /* someone else added the peer first */
+	goto success;
+}
+
+/*
+ * find the existing peer record for a packet's source address and port; no
+ * record is created here - ERR_PTR(-EBUSY) is returned if none is found
+ */
+struct rxrpc_peer *rxrpc_find_peer(struct rxrpc_local *local,
+				   __be32 addr, __be16 port)
+{
+	struct rxrpc_peer *peer;
+
+	_enter("");
+
+	/* search the peer list */
+	read_lock_bh(&rxrpc_peer_lock);
+
+	if (local->srx.transport.family == AF_INET &&
+	    local->srx.transport_type == SOCK_DGRAM) {
+		list_for_each_entry(peer, &rxrpc_peers, link) {
+			if (atomic_read(&peer->usage) > 0 &&
+			    peer->srx.transport_type == SOCK_DGRAM &&
+			    peer->srx.transport.family == AF_INET &&
+			    peer->srx.transport.sin.sin_port == port &&
+			    peer->srx.transport.sin.sin_addr.s_addr == addr)
+				goto found_UDP_peer;
+		}
+		goto new_UDP_peer;
+	}
+
+	read_unlock_bh(&rxrpc_peer_lock);
+	_leave(" = -EAFNOSUPPORT");
+	return ERR_PTR(-EAFNOSUPPORT);
+
+found_UDP_peer:
+	_net("Rx UDP DGRAM from peer %d", peer->debug_id);
+	atomic_inc(&peer->usage);
+	read_unlock_bh(&rxrpc_peer_lock);
+	_leave(" = %p", peer);
+	return peer;
+
+new_UDP_peer:
+	/* the iterator no longer points at a peer here, so don't print one */
+	_net("Rx UDP DGRAM from NEW peer");
+	read_unlock_bh(&rxrpc_peer_lock);
+	_leave(" = -EBUSY [new]");
+	return ERR_PTR(-EBUSY);
+}
+
+/*
+ * drop a reference on a peer record, handing it to the destroyer work item
+ * once the last reference has gone
+ */
+void rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+	_enter("%p{u=%d}", peer, atomic_read(&peer->usage));
+
+	ASSERTCMP(atomic_read(&peer->usage), >, 0);
+
+	if (unlikely(atomic_dec_and_test(&peer->usage))) {
+		rxrpc_queue_work(&peer->destroyer);
+		_leave("");
+	} else {
+		_leave(" [in use]");
+	}
+}
+
+/*
+ * work item: unlink and free a peer queued by rxrpc_put_peer()
+ */
+static void rxrpc_destroy_peer(struct work_struct *work)
+{
+	struct rxrpc_peer *peer =
+		container_of(work, struct rxrpc_peer, destroyer);
+
+	_enter("%p{%d}", peer, atomic_read(&peer->usage));
+
+	write_lock_bh(&rxrpc_peer_lock);
+	list_del(&peer->link);
+	write_unlock_bh(&rxrpc_peer_lock);
+
+	_net("DESTROY PEER %d", peer->debug_id);
+	kfree(peer);
+
+	if (list_empty(&rxrpc_peers)) /* rxrpc_destroy_all_peers() may wait */
+		wake_up_all(&rxrpc_peer_wq);
+	_leave("");
+}
+
+/*
+ * wait at module exit for all the peer records to be released and destroyed;
+ * rxrpc_destroy_peer() wakes us when the peer list becomes empty
+ */
+void __exit rxrpc_destroy_all_peers(void)
+{
+	DECLARE_WAITQUEUE(myself,current);
+
+	_enter("");
+
+	/* we simply have to wait for them to go away */
+	if (!list_empty(&rxrpc_peers)) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&rxrpc_peer_wq, &myself);
+
+		while (!list_empty(&rxrpc_peers)) {
+			schedule();
+			set_current_state(TASK_UNINTERRUPTIBLE); /* re-arm */
+		}
+
+		remove_wait_queue(&rxrpc_peer_wq, &myself);
+		set_current_state(TASK_RUNNING);
+	}
+
+	_leave("");
+}
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c
new file mode 100644
index 00000000..38047f71
--- /dev/null
+++ b/net/rxrpc/ar-proc.c
@@ -0,0 +1,192 @@
+/* /proc/net/ support for AF_RXRPC
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static const char *const rxrpc_conn_states[] = { /* by conn->state */
+	[RXRPC_CONN_UNUSED]		= "Unused  ",
+	[RXRPC_CONN_CLIENT]		= "Client  ",
+	[RXRPC_CONN_SERVER_UNSECURED]	= "SvUnsec ",
+	[RXRPC_CONN_SERVER_CHALLENGING]	= "SvChall ",
+	[RXRPC_CONN_SERVER]		= "SvSecure",
+	[RXRPC_CONN_REMOTELY_ABORTED]	= "RmtAbort",
+	[RXRPC_CONN_LOCALLY_ABORTED]	= "LocAbort",
+	[RXRPC_CONN_NETWORK_ERROR]	= "NetError",
+};
+
+/*
+ * generate a list of extant and dead calls in /proc/net/rxrpc_calls
+ */
+static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
+{
+	read_lock(&rxrpc_call_lock);	/* dropped in rxrpc_call_seq_stop() */
+	return seq_list_start_head(&rxrpc_calls, *_pos);
+}
+
+static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &rxrpc_calls, pos); /* next on rxrpc_calls */
+}
+
+static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock(&rxrpc_call_lock);	/* taken in rxrpc_call_seq_start() */
+}
+
+/*
+ * show the column headings (for the list head) or the details of one call
+ */
+static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
+{
+	struct rxrpc_transport *trans;
+	struct rxrpc_call *call;
+	char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+
+	if (v == &rxrpc_calls) {
+		seq_puts(seq,
+			 "Proto Local                  Remote                "
+			 " SvID ConnID   CallID   End Use State    Abort   "
+			 " UserID\n");
+		return 0;
+	}
+
+	call = list_entry(v, struct rxrpc_call, link);
+	trans = call->conn->trans;
+	/* bound the writes: the buffers only just fit a full "a.b.c.d:port" */
+	snprintf(lbuff, sizeof(lbuff), "%pI4:%u",
+		 &trans->local->srx.transport.sin.sin_addr,
+		 ntohs(trans->local->srx.transport.sin.sin_port));
+	snprintf(rbuff, sizeof(rbuff), "%pI4:%u",
+		 &trans->peer->srx.transport.sin.sin_addr,
+		 ntohs(trans->peer->srx.transport.sin.sin_port));
+	seq_printf(seq,
+		   "UDP   %-22.22s %-22.22s %4x %08x %08x %s %3u"
+		   " %-8.8s %08x %lx\n",
+		   lbuff,
+		   rbuff,
+		   ntohs(call->conn->service_id),
+		   ntohl(call->conn->cid),
+		   ntohl(call->call_id),
+		   call->conn->in_clientflag ? "Svc" : "Clt",
+		   atomic_read(&call->usage),
+		   rxrpc_call_states[call->state],
+		   call->abort_code,
+		   call->user_call_ID);
+	return 0;
+}
+
+static const struct seq_operations rxrpc_call_seq_ops = {
+	.start  = rxrpc_call_seq_start,	/* takes rxrpc_call_lock */
+	.next   = rxrpc_call_seq_next,
+	.stop   = rxrpc_call_seq_stop,	/* releases rxrpc_call_lock */
+	.show   = rxrpc_call_seq_show,
+};
+
+static int rxrpc_call_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rxrpc_call_seq_ops); /* bind the call iterator */
+}
+
+const struct file_operations rxrpc_call_seq_fops = { /* calls list file */
+	.owner		= THIS_MODULE,
+	.open		= rxrpc_call_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * generate a list of extant virtual connections in /proc/net/rxrpc_conns
+ */
+static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos)
+{
+	read_lock(&rxrpc_connection_lock); /* dropped in the ->stop op */
+	return seq_list_start_head(&rxrpc_connections, *_pos);
+}
+
+static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v,
+				       loff_t *pos)
+{
+	return seq_list_next(v, &rxrpc_connections, pos); /* next connection */
+}
+
+static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock(&rxrpc_connection_lock); /* taken in the ->start op */
+}
+
+/*
+ * show the column headings (for the list head) or one connection's details
+ */
+static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
+{
+	struct rxrpc_connection *conn;
+	struct rxrpc_transport *trans;
+	char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1];
+
+	if (v == &rxrpc_connections) {
+		seq_puts(seq,
+			 "Proto Local                  Remote                "
+			 " SvID ConnID   Calls    End Use State    Key     "
+			 " Serial   ISerial\n"
+			 );
+		return 0;
+	}
+
+	conn = list_entry(v, struct rxrpc_connection, link);
+	trans = conn->trans;
+	/* bound the writes: the buffers only just fit a full "a.b.c.d:port" */
+	snprintf(lbuff, sizeof(lbuff), "%pI4:%u",
+		 &trans->local->srx.transport.sin.sin_addr,
+		 ntohs(trans->local->srx.transport.sin.sin_port));
+	snprintf(rbuff, sizeof(rbuff), "%pI4:%u",
+		 &trans->peer->srx.transport.sin.sin_addr,
+		 ntohs(trans->peer->srx.transport.sin.sin_port));
+	seq_printf(seq,
+		   "UDP   %-22.22s %-22.22s %4x %08x %08x %s %3u"
+		   " %s %08x %08x %08x\n",
+		   lbuff,
+		   rbuff,
+		   ntohs(conn->service_id),
+		   ntohl(conn->cid),
+		   conn->call_counter,
+		   conn->in_clientflag ? "Svc" : "Clt",
+		   atomic_read(&conn->usage),
+		   rxrpc_conn_states[conn->state],
+		   key_serial(conn->key),
+		   atomic_read(&conn->serial),
+		   atomic_read(&conn->hi_serial));
+	return 0;
+}
+
+static const struct seq_operations rxrpc_connection_seq_ops = {
+	.start  = rxrpc_connection_seq_start,	/* takes rxrpc_connection_lock */
+	.next   = rxrpc_connection_seq_next,
+	.stop   = rxrpc_connection_seq_stop,	/* releases it again */
+	.show   = rxrpc_connection_seq_show,
+};
+
+
+static int rxrpc_connection_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rxrpc_connection_seq_ops); /* bind iterator */
+}
+
+const struct file_operations rxrpc_connection_seq_fops = { /* conns list */
+	.owner		= THIS_MODULE,
+	.open		= rxrpc_connection_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
new file mode 100644
index 00000000..0c65013e
--- /dev/null
+++ b/net/rxrpc/ar-recvmsg.c
@@ -0,0 +1,438 @@
+/* RxRPC recvmsg() implementation
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * remove a call's user ID from the socket tree to make the user ID available
+ * again and so that it won't be seen again in association with that call
+ */
+void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
+{
+	_debug("RELEASE CALL %d", call->debug_id);
+
+	if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
+		write_lock_bh(&rx->call_lock);
+		/* NOTE(review): presumably call->socket == rx - confirm */
+		rb_erase(&call->sock_node, &call->socket->calls);
+		clear_bit(RXRPC_CALL_HAS_USERID, &call->flags);
+		write_unlock_bh(&rx->call_lock);
+	}
+	/* queue the release event unless the call was already released */
+	read_lock_bh(&call->state_lock);
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+	    !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+		rxrpc_queue_call(call);
+	read_unlock_bh(&call->state_lock);
+}
+
+/*
+ * receive a message from an RxRPC socket
+ * - we need to be careful about two or more threads calling recvmsg
+ *   simultaneously
+ */
+int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
+		  struct msghdr *msg, size_t len, int flags)
+{
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_call *call = NULL, *continue_call = NULL;
+	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+	struct sk_buff *skb;
+	long timeo;
+	int copy, ret, ullen, offset, copied = 0;
+	u32 abort_code;
+
+	DEFINE_WAIT(wait);
+
+	_enter(",,,%zu,%d", len, flags);
+
+	if (flags & (MSG_OOB | MSG_TRUNC))
+		return -EOPNOTSUPP;
+
+	ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long);
+
+	timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);
+	msg->msg_flags |= MSG_MORE;
+
+	lock_sock(&rx->sk);
+
+	for (;;) {
+		/* return immediately if a client socket has no outstanding
+		 * calls */
+		if (RB_EMPTY_ROOT(&rx->calls)) {
+			if (copied)
+				goto out;
+			if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
+				release_sock(&rx->sk);
+				if (continue_call)
+					rxrpc_put_call(continue_call);
+				return -ENODATA;
+			}
+		}
+
+		/* get the next message on the Rx queue */
+		skb = skb_peek(&rx->sk.sk_receive_queue);
+		if (!skb) {
+			/* nothing remains on the queue */
+			if (copied &&
+			    (msg->msg_flags & MSG_PEEK || timeo == 0))
+				goto out;
+
+			/* wait for a message to turn up */
+			release_sock(&rx->sk);
+			prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
+						  TASK_INTERRUPTIBLE);
+			ret = sock_error(&rx->sk);
+			if (ret)
+				goto wait_error;
+
+			if (skb_queue_empty(&rx->sk.sk_receive_queue)) {
+				if (signal_pending(current))
+					goto wait_interrupted;
+				timeo = schedule_timeout(timeo);
+			}
+			finish_wait(sk_sleep(&rx->sk), &wait);
+			lock_sock(&rx->sk);
+			continue;
+		}
+
+	peek_next_packet:
+		sp = rxrpc_skb(skb);
+		call = sp->call;
+		ASSERT(call != NULL);
+
+		_debug("next pkt %s", rxrpc_pkts[sp->hdr.type]);
+
+		/* make sure we wait for the state to be updated in this call */
+		spin_lock_bh(&call->lock);
+		spin_unlock_bh(&call->lock);
+
+		if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+			_debug("packet from released call");
+			if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+				BUG();
+			rxrpc_free_skb(skb);
+			continue;
+		}
+
+		/* determine whether to continue last data receive */
+		if (continue_call) {
+			_debug("maybe cont");
+			if (call != continue_call ||
+			    skb->mark != RXRPC_SKB_MARK_DATA) {
+				release_sock(&rx->sk);
+				rxrpc_put_call(continue_call);
+				_leave(" = %d [noncont]", copied);
+				return copied;
+			}
+		}
+
+		rxrpc_get_call(call);
+
+		/* copy the peer address and timestamp */
+		if (!continue_call) {
+			if (msg->msg_name && msg->msg_namelen > 0)
+				memcpy(msg->msg_name,
+				       &call->conn->trans->peer->srx,
+				       sizeof(call->conn->trans->peer->srx));
+			sock_recv_ts_and_drops(msg, &rx->sk, skb);
+		}
+
+		/* receive the message */
+		if (skb->mark != RXRPC_SKB_MARK_DATA)
+			goto receive_non_data_message;
+
+		_debug("recvmsg DATA #%u { %d, %d }",
+		       ntohl(sp->hdr.seq), skb->len, sp->offset);
+
+		if (!continue_call) {
+			/* only set the control data once per recvmsg() */
+			ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+				       ullen, &call->user_call_ID);
+			if (ret < 0)
+				goto copy_error;
+			ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+		}
+
+		ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
+		ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
+		call->rx_data_recv = ntohl(sp->hdr.seq);
+
+		ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+
+		offset = sp->offset;
+		copy = skb->len - offset;
+		if (copy > len - copied)
+			copy = len - copied;
+
+		if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+			ret = skb_copy_datagram_iovec(skb, offset,
+						      msg->msg_iov, copy);
+		} else {
+			ret = skb_copy_and_csum_datagram_iovec(skb, offset,
+							       msg->msg_iov);
+			if (ret == -EINVAL)
+				goto csum_copy_error;
+		}
+
+		if (ret < 0)
+			goto copy_error;
+
+		/* handle piecemeal consumption of data packets */
+		_debug("copied %d+%d", copy, copied);
+
+		offset += copy;
+		copied += copy;
+
+		if (!(flags & MSG_PEEK))
+			sp->offset = offset;
+
+		if (sp->offset < skb->len) {
+			_debug("buffer full");
+			ASSERTCMP(copied, ==, len);
+			break;
+		}
+
+		/* we transferred the whole data packet */
+		if (sp->hdr.flags & RXRPC_LAST_PACKET) {
+			_debug("last");
+			if (call->conn->out_clientflag) {
+				 /* last byte of reply received */
+				ret = copied;
+				goto terminal_message;
+			}
+
+			/* last bit of request received */
+			if (!(flags & MSG_PEEK)) {
+				_debug("eat packet");
+				if (skb_dequeue(&rx->sk.sk_receive_queue) !=
+				    skb)
+					BUG();
+				rxrpc_free_skb(skb);
+			}
+			msg->msg_flags &= ~MSG_MORE;
+			break;
+		}
+
+		/* move on to the next data message */
+		_debug("next");
+		if (!continue_call)
+			continue_call = sp->call;
+		else
+			rxrpc_put_call(call);
+		call = NULL;
+
+		if (flags & MSG_PEEK) {
+			_debug("peek next");
+			skb = skb->next;
+			if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue)
+				break;
+			goto peek_next_packet;
+		}
+
+		_debug("eat packet");
+		if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+			BUG();
+		rxrpc_free_skb(skb);
+	}
+
+	/* end of non-terminal data packet reception for the moment */
+	_debug("end rcv data");
+out:
+	release_sock(&rx->sk);
+	if (call)
+		rxrpc_put_call(call);
+	if (continue_call)
+		rxrpc_put_call(continue_call);
+	_leave(" = %d [data]", copied);
+	return copied;
+
+	/* handle non-DATA messages such as aborts, incoming connections and
+	 * final ACKs */
+receive_non_data_message:
+	_debug("non-data");
+
+	if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) {
+		_debug("RECV NEW CALL");
+		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code);
+		if (ret < 0)
+			goto copy_error;
+		if (!(flags & MSG_PEEK)) {
+			if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+				BUG();
+			rxrpc_free_skb(skb);
+		}
+		goto out;
+	}
+
+	ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
+		       ullen, &call->user_call_ID);
+	if (ret < 0)
+		goto copy_error;
+	ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
+
+	switch (skb->mark) {
+	case RXRPC_SKB_MARK_DATA:
+		BUG();
+	case RXRPC_SKB_MARK_FINAL_ACK:
+		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code);
+		break;
+	case RXRPC_SKB_MARK_BUSY:
+		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code);
+		break;
+	case RXRPC_SKB_MARK_REMOTE_ABORT:
+		abort_code = call->abort_code;
+		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code);
+		break;
+	case RXRPC_SKB_MARK_NET_ERROR:
+		_debug("RECV NET ERROR %d", sp->error);
+		abort_code = sp->error;
+		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code);
+		break;
+	case RXRPC_SKB_MARK_LOCAL_ERROR:
+		_debug("RECV LOCAL ERROR %d", sp->error);
+		abort_code = sp->error;
+		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4,
+			       &abort_code);
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	if (ret < 0)
+		goto copy_error;
+
+terminal_message:
+	_debug("terminal");
+	msg->msg_flags &= ~MSG_MORE;
+	msg->msg_flags |= MSG_EOR;
+
+	if (!(flags & MSG_PEEK)) {
+		_net("free terminal skb %p", skb);
+		if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+			BUG();
+		rxrpc_free_skb(skb);
+		rxrpc_remove_user_ID(rx, call);
+	}
+
+	release_sock(&rx->sk);
+	rxrpc_put_call(call);
+	if (continue_call)
+		rxrpc_put_call(continue_call);
+	_leave(" = %d", ret);
+	return ret;
+
+copy_error:
+	_debug("copy error");
+	release_sock(&rx->sk);
+	rxrpc_put_call(call);
+	if (continue_call)
+		rxrpc_put_call(continue_call);
+	_leave(" = %d", ret);
+	return ret;
+
+csum_copy_error:
+	_debug("csum error");
+	release_sock(&rx->sk);
+	if (continue_call)
+		rxrpc_put_call(continue_call);
+	rxrpc_kill_skb(skb);
+	skb_kill_datagram(&rx->sk, skb, flags);
+	rxrpc_put_call(call);
+	return -EAGAIN;
+
+wait_interrupted:
+	ret = sock_intr_errno(timeo);
+wait_error:
+	finish_wait(sk_sleep(&rx->sk), &wait);
+	if (continue_call)
+		rxrpc_put_call(continue_call);
+	if (copied)
+		copied = ret;
+	_leave(" = %d [waitfail %d]", copied, ret);
+	return copied;
+
+}
+
+/**
+ * rxrpc_kernel_data_delivered - Note that a data message has been consumed
+ * @skb: Data message that has been delivered
+ *
+ * Advance the owning call's record of the last data packet received to this
+ * packet's sequence number and then free the socket buffer.  The packet must
+ * be the current or the next one in sequence and must not have been eaten.
+ */
+void rxrpc_kernel_data_delivered(struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *priv = rxrpc_skb(skb);
+	struct rxrpc_call *call = priv->call;
+	u32 seq = ntohl(priv->hdr.seq);
+
+	ASSERTCMP(seq, >=, call->rx_data_recv);
+	ASSERTCMP(seq, <=, call->rx_data_recv + 1);
+	call->rx_data_recv = seq;
+	ASSERTCMP(seq, >, call->rx_data_eaten);
+	rxrpc_free_skb(skb);
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_data_delivered);
+
+/**
+ * rxrpc_kernel_is_data_last - Test for the last-packet flag on a data message
+ * @skb: Data message to examine (asserted to be marked RXRPC_SKB_MARK_DATA)
+ *
+ * Returns true if the packet header carries the RXRPC_LAST_PACKET flag.
+ */
+bool rxrpc_kernel_is_data_last(struct sk_buff *skb)
+{
+	const struct rxrpc_skb_priv *priv;
+
+	ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA);
+	priv = rxrpc_skb(skb);
+	return (priv->hdr.flags & RXRPC_LAST_PACKET) != 0;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_is_data_last);
+
+/**
+ * rxrpc_kernel_get_abort_code - Fetch the abort code behind an abort message
+ * @skb: Abort message (asserted to be marked RXRPC_SKB_MARK_REMOTE_ABORT)
+ *
+ * Returns the abort code recorded on the call that the message belongs to.
+ */
+u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb)
+{
+	const struct rxrpc_call *call;
+
+	ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_REMOTE_ABORT);
+	call = rxrpc_skb(skb)->call;
+	return call->abort_code;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_get_abort_code);
+
+/**
+ * rxrpc_kernel_get_error_number - Get the error number from an error message
+ * @skb: Message indicating an error
+ *
+ * Return the error number recorded in the message's RxRPC private data.
+ */
+int rxrpc_kernel_get_error_number(struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+	return sp->error;
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_get_error_number);
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
new file mode 100644
index 00000000..49b3cc31
--- /dev/null
+++ b/net/rxrpc/ar-security.c
@@ -0,0 +1,264 @@
+/* RxRPC security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#include "ar-internal.h"
+
+static LIST_HEAD(rxrpc_security_methods);	/* registered security types */
+static DECLARE_RWSEM(rxrpc_security_sem);	/* guards the list above */
+
+/* pin the module that owns a security type; NULL if the ref can't be taken */
+static struct rxrpc_security *rxrpc_security_get(struct rxrpc_security *sec)
+{
+	if (!try_module_get(sec->owner))
+		return NULL;
+	return sec;
+}
+
+/*
+ * drop the module reference that rxrpc_security_get() took on a security type
+ */
+static void rxrpc_security_put(struct rxrpc_security *sec)
+{
+	module_put(sec->owner);
+}
+
+/*
+ * find the registered security type with the given index and pin its module
+ * - returns NULL if the index is unknown or the module cannot be pinned
+ */
+static struct rxrpc_security *rxrpc_security_lookup(u8 security_index)
+{
+	struct rxrpc_security *found = NULL, *p;
+
+	_enter("");
+
+	down_read(&rxrpc_security_sem);
+
+	list_for_each_entry(p, &rxrpc_security_methods, link) {
+		if (p->security_index != security_index)
+			continue;
+		if (unlikely(!rxrpc_security_get(p)))
+			break;
+		found = p;
+		break;
+	}
+
+	up_read(&rxrpc_security_sem);
+	_leave(" = %p [%s]", found, found ? found->name : "");
+	return found;
+}
+
+/**
+ * rxrpc_register_security - register an RxRPC security handler
+ * @sec: security handler to add
+ *
+ * Returns 0 on success or -EEXIST if the security index is already claimed.
+ */
+int rxrpc_register_security(struct rxrpc_security *sec)
+{
+	struct rxrpc_security *other;
+	int ret = -EEXIST;
+
+	_enter("");
+	down_write(&rxrpc_security_sem);
+
+	list_for_each_entry(other, &rxrpc_security_methods, link) {
+		if (other->security_index == sec->security_index)
+			goto unlock;
+	}
+
+	/* the index is unclaimed, so publish the new handler */
+	list_add(&sec->link, &rxrpc_security_methods);
+	ret = 0;
+
+	printk(KERN_NOTICE "RxRPC: Registered security type %d '%s'\n",
+	       sec->security_index, sec->name);
+
+unlock:
+	up_write(&rxrpc_security_sem);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(rxrpc_register_security);
+
+/**
+ * rxrpc_unregister_security - unregister an RxRPC security handler
+ * @sec: security handler to remove from the list of registered types
+ *
+ * Unlink the handler under the registration semaphore and log its removal.
+ */
+void rxrpc_unregister_security(struct rxrpc_security *sec)
+{
+
+	_enter("");
+	down_write(&rxrpc_security_sem);
+	list_del_init(&sec->link);
+	up_write(&rxrpc_security_sem);
+
+	printk(KERN_NOTICE "RxRPC: Unregistered security type %d '%s'\n",
+	       sec->security_index, sec->name);
+}
+
+EXPORT_SYMBOL_GPL(rxrpc_unregister_security);
+
+/*
+ * set up a client connection's security from its key (no key: unsecured, 0)
+ */
+int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
+{
+	struct rxrpc_key_token *token;
+	struct rxrpc_security *sec;
+	struct key *key = conn->key;
+	int ret;
+
+	_enter("{%d},{%x}", conn->debug_id, key_serial(key));
+
+	if (!key)
+		return 0;
+
+	ret = key_validate(key);
+	if (ret < 0)
+		return ret;
+
+	token = key->payload.data;
+	if (!token)
+		return -EKEYREJECTED;
+	sec = rxrpc_security_lookup(token->security_index);
+	if (!sec)
+		return -EKEYREJECTED;
+
+	conn->security = sec;
+	ret = sec->init_connection_security(conn);
+	if (ret >= 0) {
+		_leave(" = 0");
+		return 0;
+	}
+
+	/* initialisation failed: detach the handler and drop the module ref */
+	conn->security = NULL;
+	rxrpc_security_put(sec);
+	return ret;
+}
+
+/*
+ * initialise the security on a server connection (security type + server key)
+ */
+int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
+{
+	struct rxrpc_security *sec;
+	struct rxrpc_local *local = conn->trans->local;
+	struct rxrpc_sock *rx;
+	struct key *key;
+	key_ref_t kref;
+	char kdesc[5+1+3+1]; /* 5 digits, ':', 3 digits, NUL */
+
+	_enter("");
+
+	sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix);
+
+	sec = rxrpc_security_lookup(conn->security_ix);
+	if (!sec) {
+		_leave(" = -ENOKEY [lookup]");
+		return -ENOKEY;
+	}
+
+	/* find the service socket on this endpoint with a matching service ID */
+	read_lock_bh(&local->services_lock);
+	list_for_each_entry(rx, &local->services, listen_link) {
+		if (rx->service_id == conn->service_id)
+			goto found_service;
+	}
+
+	/* the service appears to have died; give back the module ref */
+	read_unlock_bh(&local->services_lock);
+	rxrpc_security_put(sec);
+	_leave(" = -ENOENT");
+	return -ENOENT;
+
+found_service:
+	if (!rx->securities) {
+		read_unlock_bh(&local->services_lock);
+		rxrpc_security_put(sec);
+		_leave(" = -ENOKEY");
+		return -ENOKEY;
+	}
+
+	/* look through the service's keyring for a key described by kdesc */
+	kref = keyring_search(make_key_ref(rx->securities, 1UL),
+			      &key_type_rxrpc_s, kdesc);
+	if (IS_ERR(kref)) {
+		read_unlock_bh(&local->services_lock);
+		rxrpc_security_put(sec);
+		_leave(" = %ld [search]", PTR_ERR(kref));
+		return PTR_ERR(kref);
+	}
+
+	key = key_ref_to_ptr(kref);
+	read_unlock_bh(&local->services_lock);
+
+	conn->server_key = key;
+	conn->security = sec;
+
+	_leave(" = 0");
+	return 0;
+}
+
+/* apply the connection's security, if it has any, to a packet that is about
+ * to be transmitted; unsecured connections succeed trivially */
+int rxrpc_secure_packet(const struct rxrpc_call *call,
+			struct sk_buff *skb,
+			size_t data_size,
+			void *sechdr)
+{
+	struct rxrpc_security *sec = call->conn->security;
+
+	if (!sec)
+		return 0;
+	return sec->secure_packet(call, skb, data_size, sechdr);
+}
+
+/* have the connection's security, if it has any, verify a packet; unsecured
+ * connections succeed trivially and leave *_abort_code untouched */
+int rxrpc_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb,
+			u32 *_abort_code)
+{
+	struct rxrpc_security *sec = call->conn->security;
+
+	if (!sec)
+		return 0;
+	return sec->verify_packet(call, skb, _abort_code);
+}
+
+/* clear connection security: let the security type (if one is attached) clean
+ * up and drop the module ref taken on it, then release the refs held on the
+ * connection's client and server keys */
+void rxrpc_clear_conn_security(struct rxrpc_connection *conn)
+{
+	_enter("{%d}", conn->debug_id);
+
+	if (conn->security) {
+		conn->security->clear(conn);
+		rxrpc_security_put(conn->security);
+		conn->security = NULL;
+	}
+
+	key_put(conn->key);
+	key_put(conn->server_key);
+}
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
new file mode 100644
index 00000000..de755e04
--- /dev/null
+++ b/net/rxrpc/ar-skbuff.c
@@ -0,0 +1,132 @@
+/* ar-skbuff.c: socket buffer destruction handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+/*
+ * set up for the ACK at the end of the receive phase when we discard the final
+ * receive phase data packet
+ * - called with softirqs disabled
+ */
+static void rxrpc_request_final_ACK(struct rxrpc_call *call)
+{
+	/* the call may be aborted before we have a chance to ACK it */
+	write_lock(&call->state_lock);
+
+	switch (call->state) {
+	case RXRPC_CALL_CLIENT_RECV_REPLY:
+		call->state = RXRPC_CALL_CLIENT_FINAL_ACK;
+		_debug("request final ACK");
+
+		/* get an extra ref on the call for the final-ACK generator to
+		 * release */
+		rxrpc_get_call(call);
+		set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+		if (try_to_del_timer_sync(&call->ack_timer) >= 0)
+			rxrpc_queue_call(call);
+		break;
+
+	case RXRPC_CALL_SERVER_RECV_REQUEST:
+		call->state = RXRPC_CALL_SERVER_ACK_REQUEST; /* fall through */
+	default:
+		break;
+	}
+
+	write_unlock(&call->state_lock);
+}
+
+/* drop the bottom ACK off of the call ACK window (shifting the bitmap down a
+ * bit), advance the window and record the packet as eaten; then request the
+ * final ACK for the last packet or propose an idle ACK */
+static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
+				struct rxrpc_skb_priv *sp)
+{
+	int loop;
+	u32 seq;
+
+	spin_lock_bh(&call->lock);
+
+	_debug("hard ACK #%u", ntohl(sp->hdr.seq));
+
+	for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
+		call->ackr_window[loop] >>= 1;
+		call->ackr_window[loop] |= /* carry in bit 0 of next word */
+			call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
+	}
+
+	seq = ntohl(sp->hdr.seq);
+	ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
+	call->rx_data_eaten = seq;
+
+	if (call->ackr_win_top < UINT_MAX)
+		call->ackr_win_top++;
+
+	ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
+		    call->rx_data_post, >=, call->rx_data_recv);
+	ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE,
+		    call->rx_data_recv, >=, call->rx_data_eaten);
+
+	if (sp->hdr.flags & RXRPC_LAST_PACKET) {
+		rxrpc_request_final_ACK(call);
+	} else if (atomic_dec_and_test(&call->ackr_not_idle) &&
+		   test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) {
+		_debug("send Rx idle ACK");
+		__rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial,
+				    true);
+	}
+
+	spin_unlock_bh(&call->lock);
+}
+
+/*
+ * destroy a packet that has an RxRPC control buffer
+ * - advance the hard-ACK state of the parent call (done here in case something
+ *   in the kernel bypasses recvmsg() and steals the packet directly off of the
+ *   socket receive queue)
+ */
+void rxrpc_packet_destructor(struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_call *call = sp->call;
+
+	_enter("%p{%p}", skb, call);
+
+	if (call) {
+		/* a DATA packet hard-ACKs itself before the call ref is put */
+		if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
+			rxrpc_hard_ACK_data(call, sp);
+		rxrpc_put_call(call);
+		sp->call = NULL;
+	}
+
+	if (skb->sk)
+		sock_rfree(skb);
+	_leave("");
+}
+
+/**
+ * rxrpc_kernel_free_skb - Free an RxRPC socket buffer
+ * @skb: The socket buffer to be freed
+ *
+ * Let RxRPC free its own socket buffer, permitting it to maintain debug
+ * accounting.  This is an exported wrapper around rxrpc_free_skb().
+ */
+void rxrpc_kernel_free_skb(struct sk_buff *skb)
+{
+	rxrpc_free_skb(skb);
+}
+
+EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
new file mode 100644
index 00000000..92df5669
--- /dev/null
+++ b/net/rxrpc/ar-transport.c
@@ -0,0 +1,279 @@
+/* RxRPC point-to-point transport session management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include "ar-internal.h"
+
+static void rxrpc_transport_reaper(struct work_struct *work);
+
+static LIST_HEAD(rxrpc_transports);		/* all transport records */
+static DEFINE_RWLOCK(rxrpc_transport_lock);	/* guards the list above */
+static unsigned long rxrpc_transport_timeout = 3600 * 24; /* in seconds */
+static DECLARE_DELAYED_WORK(rxrpc_transport_reap, rxrpc_transport_reaper);
+
+/*
+ * allocate and initialise a new transport session manager for a local
+ * endpoint/peer pair
+ * - returns NULL if the allocation fails
+ * - the new record carries a usage count of 1 and is not yet on any list
+ * - only UDP over IPv4 peers are supported; anything else is a BUG
+ */
+static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local,
+						     struct rxrpc_peer *peer,
+						     gfp_t gfp)
+{
+	struct rxrpc_transport *trans;
+
+	_enter("");
+
+	trans = kzalloc(sizeof(*trans), gfp);
+	if (!trans)
+		goto out;
+
+	trans->local = local;
+	trans->peer = peer;
+	trans->debug_id = atomic_inc_return(&rxrpc_debug_id);
+	atomic_set(&trans->usage, 1);
+	INIT_LIST_HEAD(&trans->link);
+
+	/* empty connection trees, error queue and locks */
+	trans->bundles = RB_ROOT;
+	trans->client_conns = RB_ROOT;
+	trans->server_conns = RB_ROOT;
+	skb_queue_head_init(&trans->error_queue);
+	spin_lock_init(&trans->client_lock);
+	rwlock_init(&trans->conn_lock);
+
+	/* select the error handler to match the transport protocol */
+	if (peer->srx.transport.family != AF_INET ||
+	    peer->srx.transport_type != SOCK_DGRAM)
+		BUG();
+	INIT_WORK(&trans->error_handler, rxrpc_UDP_error_handler);
+
+out:
+	_leave(" = %p", trans);
+	return trans;
+}
+
+/* obtain a transport session for the nominated endpoints, taking a ref on an
+ * existing record or else adding a new one to the list; returns the transport
+ * or ERR_PTR(-ENOMEM) if a new record could not be allocated */
+struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *local,
+					    struct rxrpc_peer *peer,
+					    gfp_t gfp)
+{
+	struct rxrpc_transport *trans, *candidate;
+	const char *new = "old";
+	int usage;
+
+	_enter("{%pI4+%hu},{%pI4+%hu},",
+	       &local->srx.transport.sin.sin_addr,
+	       ntohs(local->srx.transport.sin.sin_port),
+	       &peer->srx.transport.sin.sin_addr,
+	       ntohs(peer->srx.transport.sin.sin_port));
+
+	/* search the transport list first */
+	read_lock_bh(&rxrpc_transport_lock);
+	list_for_each_entry(trans, &rxrpc_transports, link) {
+		if (trans->local == local && trans->peer == peer)
+			goto found_extant_transport;
+	}
+	read_unlock_bh(&rxrpc_transport_lock);
+
+	/* not yet present - create a candidate for a new record and then
+	 * redo the search */
+	candidate = rxrpc_alloc_transport(local, peer, gfp);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	write_lock_bh(&rxrpc_transport_lock);
+
+	list_for_each_entry(trans, &rxrpc_transports, link) {
+		if (trans->local == local && trans->peer == peer)
+			goto found_extant_second;
+	}
+
+	/* we can now add the new candidate to the list; the record takes its
+	 * own refs on the local endpoint and the peer */
+	trans = candidate;
+	candidate = NULL;
+	usage = atomic_read(&trans->usage);
+
+	rxrpc_get_local(trans->local);
+	atomic_inc(&trans->peer->usage);
+	list_add_tail(&trans->link, &rxrpc_transports);
+	write_unlock_bh(&rxrpc_transport_lock);
+	new = "new";
+
+success:
+	_net("TRANSPORT %s %d local %d -> peer %d",
+	     new,
+	     trans->debug_id,
+	     trans->local->debug_id,
+	     trans->peer->debug_id);
+
+	_leave(" = %p {u=%d}", trans, usage);
+	return trans;
+
+	/* we found the transport in the list immediately */
+found_extant_transport:
+	usage = atomic_inc_return(&trans->usage);
+	read_unlock_bh(&rxrpc_transport_lock);
+	goto success;
+
+	/* we found the transport on the second time through the list, so the
+	 * unused candidate is discarded */
+found_extant_second:
+	usage = atomic_inc_return(&trans->usage);
+	write_unlock_bh(&rxrpc_transport_lock);
+	kfree(candidate);
+	goto success;
+}
+
+/*
+ * look for an existing transport connecting two endpoints
+ * - returns the transport with its usage count raised, or NULL if none exists
+ */
+struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *local,
+					     struct rxrpc_peer *peer)
+{
+	struct rxrpc_transport *found = NULL, *t;
+
+	_enter("{%pI4+%hu},{%pI4+%hu},",
+	       &local->srx.transport.sin.sin_addr,
+	       ntohs(local->srx.transport.sin.sin_port),
+	       &peer->srx.transport.sin.sin_addr,
+	       ntohs(peer->srx.transport.sin.sin_port));
+
+	read_lock_bh(&rxrpc_transport_lock);
+	list_for_each_entry(t, &rxrpc_transports, link) {
+		if (t->local != local || t->peer != peer)
+			continue;
+		atomic_inc(&t->usage);
+		found = t;
+		break;
+	}
+	read_unlock_bh(&rxrpc_transport_lock);
+
+	if (!found) {
+		_leave(" = NULL");
+		return NULL;
+	}
+	_leave(" = %p", found);
+	return found;
+}
+
+/* release a ref on a transport session; when the last ref goes, the record is
+ * left on the list as a zombie and the reaper is kicked to deal with it once
+ * it has been unused for rxrpc_transport_timeout seconds */
+void rxrpc_put_transport(struct rxrpc_transport *trans)
+{
+	_enter("%p{u=%d}", trans, atomic_read(&trans->usage));
+
+	ASSERTCMP(atomic_read(&trans->usage), >, 0);
+
+	trans->put_time = get_seconds();
+	if (unlikely(atomic_dec_and_test(&trans->usage))) {
+		_debug("zombie");
+		/* let the reaper determine the timeout to avoid a race with
+		 * overextending the timeout if the reaper is running at the
+		 * same time */
+		rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0);
+	}
+	_leave("");
+}
+
+/* clean up a transport session that the reaper has taken off the list: purge
+ * its error queue, drop its refs on the local endpoint and the peer and free
+ * the record */
+static void rxrpc_cleanup_transport(struct rxrpc_transport *trans)
+{
+	_net("DESTROY TRANS %d", trans->debug_id);
+
+	rxrpc_purge_queue(&trans->error_queue);
+
+	rxrpc_put_local(trans->local);
+	rxrpc_put_peer(trans->peer);
+	kfree(trans);
+}
+
+/* reap dead transports that have passed their expiry date: unused records
+ * older than rxrpc_transport_timeout are moved to a graveyard and destroyed;
+ * the work item is requeued for the earliest expiry amongst the remainder */
+static void rxrpc_transport_reaper(struct work_struct *work)
+{
+	struct rxrpc_transport *trans, *_p;
+	unsigned long now, earliest, reap_time;
+
+	LIST_HEAD(graveyard);
+
+	_enter("");
+
+	now = get_seconds();
+	earliest = ULONG_MAX; /* means "nothing left waiting to expire" */
+
+	/* extract all the transports that have been dead too long */
+	write_lock_bh(&rxrpc_transport_lock);
+	list_for_each_entry_safe(trans, _p, &rxrpc_transports, link) {
+		_debug("reap TRANS %d { u=%d t=%ld }",
+		       trans->debug_id, atomic_read(&trans->usage),
+		       (long) now - (long) trans->put_time);
+
+		if (likely(atomic_read(&trans->usage) > 0))
+			continue; /* still in use */
+
+		reap_time = trans->put_time + rxrpc_transport_timeout;
+		if (reap_time <= now)
+			list_move_tail(&trans->link, &graveyard);
+		else if (reap_time < earliest)
+			earliest = reap_time;
+	}
+	write_unlock_bh(&rxrpc_transport_lock);
+
+	if (earliest != ULONG_MAX) {
+		_debug("reschedule reaper %ld", (long) earliest - now);
+		ASSERTCMP(earliest, >, now);
+		rxrpc_queue_delayed_work(&rxrpc_transport_reap,
+					 (earliest - now) * HZ);
+	}
+
+	/* then destroy all those pulled out, now that the lock is dropped */
+	while (!list_empty(&graveyard)) {
+		trans = list_entry(graveyard.next, struct rxrpc_transport,
+				   link);
+		list_del_init(&trans->link);
+
+		ASSERTCMP(atomic_read(&trans->usage), ==, 0);
+		rxrpc_cleanup_transport(trans);
+	}
+
+	_leave("");
+}
+
+/*
+ * preemptively destroy all the transport session records rather than waiting
+ * for them to time out (zero the timeout and run the reaper immediately)
+ */
+void __exit rxrpc_destroy_all_transports(void)
+{
+	_enter("");
+
+	rxrpc_transport_timeout = 0;
+	cancel_delayed_work(&rxrpc_transport_reap);
+	rxrpc_queue_delayed_work(&rxrpc_transport_reap, 0);
+
+	_leave("");
+}
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
new file mode 100644
index 00000000..76351077
--- /dev/null
+++ b/net/rxrpc/rxkad.c
@@ -0,0 +1,1161 @@
+/* Kerberos-based RxRPC security
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/udp.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <keys/rxrpc-type.h>
+#define rxrpc_debug rxkad_debug
+#include "ar-internal.h"
+
+#define RXKAD_VERSION			2
+#define MAXKRB5TICKETLEN		1024
+#define RXKAD_TKT_TYPE_KERBEROS_V5	256
+#define ANAME_SZ			40	/* size of authentication name */
+#define INST_SZ				40	/* size of principal's instance */
+#define REALM_SZ			40	/* size of principal's auth domain */
+#define SNAME_SZ			40	/* size of service name */
+
+unsigned rxrpc_debug;
+module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "rxkad debugging mask");
+
+struct rxkad_level1_hdr {	/* security header for RXRPC_SECURITY_AUTH */
+	__be32	data_size;	/* true data size (excluding padding) */
+};
+
+struct rxkad_level2_hdr {	/* security header for RXRPC_SECURITY_ENCRYPT */
+	__be32	data_size;	/* true data size (excluding padding) */
+	__be32	checksum;	/* decrypted data checksum */
+};
+
+MODULE_DESCRIPTION("RxRPC network protocol type-2 security (Kerberos 4)");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+/*
+ * this holds a pinned cipher so that keventd doesn't get called by the cipher
+ * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
+ * packets
+ */
+static struct crypto_blkcipher *rxkad_ci;
+static DEFINE_MUTEX(rxkad_ci_mutex);
+
+/* initialise connection security: note the security type from the key, set up
+ * a pcbc(fcrypt) cipher keyed with the session key and size the per-packet
+ * security header for the connection's security level */
+static int rxkad_init_connection_security(struct rxrpc_connection *conn)
+{
+	struct crypto_blkcipher *ci;
+	struct rxrpc_key_token *token;
+	int ret;
+
+	_enter("{%d},{%x}", conn->debug_id, key_serial(conn->key));
+
+	token = conn->key->payload.data;
+	conn->security_ix = token->security_index;
+
+	ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ci)) {
+		_debug("no cipher");
+		ret = PTR_ERR(ci);
+		goto error;
+	}
+	if (crypto_blkcipher_setkey(ci, token->kad->session_key,
+				    sizeof(token->kad->session_key)) < 0)
+		BUG();
+
+	switch (conn->security_level) {
+	case RXRPC_SECURITY_PLAIN:
+		break;
+	case RXRPC_SECURITY_AUTH:
+		conn->size_align = 8;
+		conn->security_size = sizeof(struct rxkad_level1_hdr);
+		conn->header_size += sizeof(struct rxkad_level1_hdr);
+		break;
+	case RXRPC_SECURITY_ENCRYPT:
+		conn->size_align = 8;
+		conn->security_size = sizeof(struct rxkad_level2_hdr);
+		conn->header_size += sizeof(struct rxkad_level2_hdr);
+		break;
+	default:
+		crypto_free_blkcipher(ci); /* not yet owned by the conn */
+		ret = -EKEYREJECTED;
+		goto error;
+	}
+
+	conn->cipher = ci;
+	ret = 0;
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * prime the encryption state with the invariant parts of a connection's
+ * description (epoch, cid, security index) to derive conn->csum_iv
+ */
+static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
+{
+	struct rxrpc_key_token *token;
+	struct blkcipher_desc desc;
+	struct scatterlist sg[2];
+	struct rxrpc_crypt iv;
+	struct {
+		__be32 x[4];
+	} tmpbuf __attribute__((aligned(16))); /* must all be in same page */
+
+	_enter("");
+
+	if (!conn->key)
+		return;
+
+	token = conn->key->payload.data;
+	memcpy(&iv, token->kad->session_key, sizeof(iv));
+
+	desc.tfm = conn->cipher;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	tmpbuf.x[0] = conn->epoch;
+	tmpbuf.x[1] = conn->cid;
+	tmpbuf.x[2] = 0;
+	tmpbuf.x[3] = htonl(conn->security_ix);
+
+	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv));
+	ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]);
+
+	_leave("");
+}
+
+/* partially encrypt a packet (level 1 security): only the security header
+ * (data size plus a 16-bit check value) and the four bytes that follow it
+ * are encrypted, starting from a zero IV */
+static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
+				    struct sk_buff *skb,
+				    u32 data_size,
+				    void *sechdr)
+{
+	struct rxrpc_skb_priv *sp;
+	struct blkcipher_desc desc;
+	struct rxrpc_crypt iv;
+	struct scatterlist sg[2];
+	struct {
+		struct rxkad_level1_hdr hdr;
+		__be32	first;	/* first four bytes of data and padding */
+	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+	u16 check;
+
+	sp = rxrpc_skb(skb);
+
+	_enter("");
+
+	check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	data_size |= (u32) check << 16;
+
+	tmpbuf.hdr.data_size = htonl(data_size);
+	memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first));
+
+	/* start the encryption afresh */
+	memset(&iv, 0, sizeof(iv));
+	desc.tfm = call->conn->cipher;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	memcpy(sechdr, &tmpbuf, sizeof(tmpbuf));
+
+	_leave(" = 0");
+	return 0;
+}
+
+/* wholly encrypt a packet (level 2 security): the security header is built
+ * and encrypted into place, then the skb contents are encrypted in place over
+ * the data size rounded up to the connection's size alignment */
+static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
+					struct sk_buff *skb,
+					u32 data_size,
+					void *sechdr)
+{
+	const struct rxrpc_key_token *token;
+	struct rxkad_level2_hdr rxkhdr
+		__attribute__((aligned(8))); /* must be all on one page */
+	struct rxrpc_skb_priv *sp;
+	struct blkcipher_desc desc;
+	struct rxrpc_crypt iv;
+	struct scatterlist sg[16];
+	struct sk_buff *trailer;
+	unsigned len;
+	u16 check;
+	int nsg;
+
+	sp = rxrpc_skb(skb);
+
+	_enter("");
+
+	check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+
+	rxkhdr.data_size = htonl(data_size | (u32) check << 16);
+	rxkhdr.checksum = 0;
+
+	/* encrypt from the session key */
+	token = call->conn->key->payload.data;
+	memcpy(&iv, token->kad->session_key, sizeof(iv));
+	desc.tfm = call->conn->cipher;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
+	sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr));
+	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr));
+
+	/* we want to encrypt the skbuff in-place; sg[] only has 16 slots */
+	nsg = skb_cow_data(skb, 0, &trailer);
+	if (nsg < 0 || nsg > 16)
+		return -ENOMEM;
+
+	len = data_size + call->conn->size_align - 1;
+	len &= ~(call->conn->size_align - 1);
+
+	sg_init_table(sg, nsg);
+	skb_to_sgvec(skb, sg, 0, len);
+	crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/* checksum an RxRPC packet header: a non-zero 16-bit checksum computed from
+ * the call number, channel and sequence number is stored in the header, then
+ * the packet is secured according to the connection's security level */
+static int rxkad_secure_packet(const struct rxrpc_call *call,
+				struct sk_buff *skb,
+				size_t data_size,
+				void *sechdr)
+{
+	struct rxrpc_skb_priv *sp;
+	struct blkcipher_desc desc;
+	struct rxrpc_crypt iv;
+	struct scatterlist sg[2];
+	struct {
+		__be32 x[2];
+	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+	__be32 x;
+	u32 y;
+	int ret;
+
+	sp = rxrpc_skb(skb);
+
+	_enter("{%d{%x}},{#%u},%zu,",
+	       call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq),
+	       data_size);
+
+	if (!call->conn->cipher)
+		return 0;
+
+	ret = key_validate(call->conn->key);
+	if (ret < 0)
+		return ret;
+
+	/* continue encrypting from where we left off */
+	memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
+	desc.tfm = call->conn->cipher;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	/* calculate the security checksum */
+	x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
+	x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
+	tmpbuf.x[0] = sp->hdr.callNumber;
+	tmpbuf.x[1] = x;
+
+	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	y = ntohl(tmpbuf.x[1]);
+	y = (y >> 16) & 0xffff;
+	if (y == 0)
+		y = 1; /* zero checksums are not permitted */
+	sp->hdr.cksum = htons(y);
+
+	switch (call->conn->security_level) {
+	case RXRPC_SECURITY_PLAIN:
+		ret = 0;
+		break;
+	case RXRPC_SECURITY_AUTH:
+		ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr);
+		break;
+	case RXRPC_SECURITY_ENCRYPT:
+		ret = rxkad_secure_packet_encrypt(call, skb, data_size,
+						  sechdr);
+		break;
+	default:
+		ret = -EPERM;
+		break;
+	}
+
+	_leave(" = %d [set %hx]", ret, y);
+	return ret;
+}
+
+/* decrypt partial encryption on a packet (level 1 security): decrypt the
+ * first 8 bytes in place, strip the security header, validate its check value
+ * and trim the packet to the data size it declares */
+static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
+				    struct sk_buff *skb,
+				    u32 *_abort_code)
+{
+	struct rxkad_level1_hdr sechdr;
+	struct rxrpc_skb_priv *sp;
+	struct blkcipher_desc desc;
+	struct rxrpc_crypt iv;
+	struct scatterlist sg[16];
+	struct sk_buff *trailer;
+	u32 data_size, buf;
+	u16 check;
+	int nsg;
+
+	_enter("");
+
+	sp = rxrpc_skb(skb);
+
+	/* we want to decrypt the skbuff in-place; sg[] only has 16 slots */
+	nsg = skb_cow_data(skb, 0, &trailer);
+	if (nsg < 0 || nsg > 16)
+		goto nomem;
+
+	sg_init_table(sg, nsg);
+	skb_to_sgvec(skb, sg, 0, 8);
+
+	/* start the decryption afresh */
+	memset(&iv, 0, sizeof(iv));
+	desc.tfm = call->conn->cipher;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8);
+
+	/* remove the decrypted packet length */
+	if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
+		goto datalen_error;
+	if (!skb_pull(skb, sizeof(sechdr)))
+		BUG();
+
+	buf = ntohl(sechdr.data_size);
+	data_size = buf & 0xffff;
+
+	check = buf >> 16; /* upper half must match seq ^ callNumber */
+	check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check &= 0xffff;
+	if (check != 0) {
+		*_abort_code = RXKADSEALEDINCON;
+		goto protocol_error;
+	}
+
+	/* shorten the packet to remove the padding */
+	if (data_size > skb->len)
+		goto datalen_error;
+	else if (data_size < skb->len)
+		skb->len = data_size;
+
+	_leave(" = 0 [dlen=%x]", data_size);
+	return 0;
+
+datalen_error:
+	*_abort_code = RXKADDATALEN;
+protocol_error:
+	_leave(" = -EPROTO");
+	return -EPROTO;
+
+nomem:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * wholly decrypt a packet (level 2 security)
+ *
+ * The entire packet is decrypted in place, using the session key from the
+ * connection's key token as the IV.  The security header at the front is then
+ * pulled off; its low 16 bits give the real data size and its high 16 bits,
+ * XOR'd with (seq ^ callNumber) from the packet header, must come out as zero.
+ * The packet is finally trimmed to the recorded data size to discard padding.
+ *
+ * Returns 0 on success, -EPROTO with *_abort_code set if the packet is
+ * malformed, or -ENOMEM if the packet could not be made writable or the
+ * scatterlist could not be allocated.
+ */
+static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
+				       struct sk_buff *skb,
+				       u32 *_abort_code)
+{
+	const struct rxrpc_key_token *token;
+	struct rxkad_level2_hdr sechdr;
+	struct rxrpc_skb_priv *sp;
+	struct blkcipher_desc desc;
+	struct rxrpc_crypt iv;
+	struct scatterlist _sg[4], *sg;
+	struct sk_buff *trailer;
+	u32 data_size, buf;
+	u16 check;
+	int nsg;
+
+	_enter(",{%d}", skb->len);
+
+	sp = rxrpc_skb(skb);
+
+	/* a packet that cannot even hold the security header is bogus; bail
+	 * out before mapping and decrypting an empty or truncated buffer */
+	if (skb->len < sizeof(sechdr))
+		goto datalen_error;
+
+	/* we want to decrypt the skbuff in-place */
+	nsg = skb_cow_data(skb, 0, &trailer);
+	if (nsg < 0)
+		goto nomem;
+
+	/* use the on-stack scatterlist unless the packet needs more entries */
+	sg = _sg;
+	if (unlikely(nsg > 4)) {
+		sg = kmalloc(sizeof(*sg) * nsg, GFP_NOIO);
+		if (!sg)
+			goto nomem;
+	}
+
+	sg_init_table(sg, nsg);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	/* decrypt from the session key */
+	token = call->conn->key->payload.data;
+	memcpy(&iv, token->kad->session_key, sizeof(iv));
+	desc.tfm = call->conn->cipher;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len);
+	if (sg != _sg)
+		kfree(sg);
+
+	/* remove the decrypted packet length */
+	if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
+		goto datalen_error;
+	if (!skb_pull(skb, sizeof(sechdr)))
+		BUG();
+
+	/* low half is the data size, high half is the check value */
+	buf = ntohl(sechdr.data_size);
+	data_size = buf & 0xffff;
+
+	check = buf >> 16;
+	check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+	check &= 0xffff;
+	if (check != 0) {
+		*_abort_code = RXKADSEALEDINCON;
+		goto protocol_error;
+	}
+
+	/* shorten the packet to remove the padding */
+	if (data_size > skb->len)
+		goto datalen_error;
+	else if (data_size < skb->len)
+		skb->len = data_size;
+
+	_leave(" = 0 [dlen=%x]", data_size);
+	return 0;
+
+datalen_error:
+	*_abort_code = RXKADDATALEN;
+protocol_error:
+	_leave(" = -EPROTO");
+	return -EPROTO;
+
+nomem:
+	_leave(" = -ENOMEM");
+	return -ENOMEM;
+}
+
+/*
+ * verify the security on a received packet
+ *
+ * Recomputes the header checksum by encrypting { call ID, channel | seq } with
+ * the connection's checksum IV and compares it with the checksum carried in
+ * the packet header, then hands off to the verifier for the connection's
+ * security level.  Connections without a cipher are passed unchecked.
+ *
+ * Returns 0 on success or a negative error code; -EPROTO results set
+ * *_abort_code.
+ */
+static int rxkad_verify_packet(const struct rxrpc_call *call,
+			       struct sk_buff *skb,
+			       u32 *_abort_code)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct blkcipher_desc desc;
+	struct scatterlist sg[2];
+	struct rxrpc_crypt iv;
+	struct {
+		__be32 x[2];
+	} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
+	__be32 seqword;
+	__be16 expected;
+	u32 sum;
+	int ret;
+
+	_enter("{%d{%x}},{#%u}",
+	       call->debug_id, key_serial(call->conn->key),
+	       ntohl(sp->hdr.seq));
+
+	/* nothing to verify on an unsecured connection */
+	if (!call->conn->cipher)
+		return 0;
+
+	if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) {
+		*_abort_code = RXKADINCONSISTENCY;
+		_leave(" = -EPROTO [not rxkad]");
+		return -EPROTO;
+	}
+
+	/* build the block that the checksum is derived from */
+	seqword = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
+	seqword |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
+	tmpbuf.x[0] = call->call_id;
+	tmpbuf.x[1] = seqword;
+
+	/* continue encrypting from where we left off */
+	memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
+	desc.flags = 0;
+	desc.info = iv.x;
+	desc.tfm = call->conn->cipher;
+
+	sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
+	sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
+	crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+	/* the checksum is the top half of the second encrypted word */
+	sum = (ntohl(tmpbuf.x[1]) >> 16) & 0xffff;
+	if (sum == 0)
+		sum = 1; /* zero checksums are not permitted */
+
+	expected = htons(sum);
+	if (sp->hdr.cksum != expected) {
+		*_abort_code = RXKADSEALEDINCON;
+		_leave(" = -EPROTO [csum failed]");
+		return -EPROTO;
+	}
+
+	/* dispatch on the negotiated security level */
+	switch (call->conn->security_level) {
+	case RXRPC_SECURITY_ENCRYPT:
+		ret = rxkad_verify_packet_encrypt(call, skb, _abort_code);
+		break;
+	case RXRPC_SECURITY_AUTH:
+		ret = rxkad_verify_packet_auth(call, skb, _abort_code);
+		break;
+	case RXRPC_SECURITY_PLAIN:
+		ret = 0;
+		break;
+	default:
+		ret = -ENOANO;
+		break;
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * issue a challenge
+ *
+ * Generates a fresh random nonce for the connection and transmits a CHALLENGE
+ * packet carrying it to the peer.
+ *
+ * Returns 0 on success, the error from key_validate() if the connection's key
+ * is not usable, or -EAGAIN if the packet could not be sent.
+ */
+static int rxkad_issue_challenge(struct rxrpc_connection *conn)
+{
+	struct rxkad_challenge challenge;
+	struct rxrpc_header hdr;
+	struct msghdr msg;
+	struct kvec iov[2];
+	int ret;
+
+	_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+	ret = key_validate(conn->key);
+	if (ret < 0)
+		return ret;
+
+	/* the peer must echo this nonce back, incremented, in its response */
+	get_random_bytes(&conn->security_nonce, sizeof(conn->security_nonce));
+
+	/* challenge payload */
+	challenge.__padding	= 0;
+	challenge.min_level	= htonl(0);
+	challenge.nonce		= htonl(conn->security_nonce);
+	challenge.version	= htonl(2);
+
+	/* packet header */
+	hdr.epoch	= conn->epoch;
+	hdr.cid		= conn->cid;
+	hdr.callNumber	= 0;
+	hdr.seq		= 0;
+	hdr.type	= RXRPC_PACKET_TYPE_CHALLENGE;
+	hdr.flags	= conn->out_clientflag;
+	hdr.userStatus	= 0;
+	hdr.securityIndex = conn->security_ix;
+	hdr._rsvd	= 0;
+	hdr.serviceId	= conn->service_id;
+	hdr.serial	= htonl(atomic_inc_return(&conn->serial));
+
+	/* destination */
+	msg.msg_name	= &conn->trans->peer->srx.transport.sin;
+	msg.msg_namelen	= sizeof(conn->trans->peer->srx.transport.sin);
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= 0;
+
+	/* header followed by challenge */
+	iov[0].iov_base	= &hdr;
+	iov[0].iov_len	= sizeof(hdr);
+	iov[1].iov_base	= &challenge;
+	iov[1].iov_len	= sizeof(challenge);
+
+	_proto("Tx CHALLENGE %%%u", ntohl(hdr.serial));
+
+	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2,
+			     iov[0].iov_len + iov[1].iov_len);
+	if (ret < 0) {
+		_debug("sendmsg failed: %d", ret);
+		return -EAGAIN;
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * send a Kerberos security response
+ *
+ * Rewrites the supplied packet header in place as a RESPONSE header and
+ * transmits it followed by the response body and the ticket held in @s2.
+ *
+ * Returns 0 on success or -EAGAIN if the packet could not be sent.
+ */
+static int rxkad_send_response(struct rxrpc_connection *conn,
+			       struct rxrpc_header *hdr,
+			       struct rxkad_response *resp,
+			       const struct rxkad_key *s2)
+{
+	struct msghdr msg;
+	struct kvec iov[3];
+	size_t total;
+	int ret;
+
+	_enter("");
+
+	/* recycle the header; fields not set here keep their values */
+	hdr->epoch	= conn->epoch;
+	hdr->seq	= 0;
+	hdr->type	= RXRPC_PACKET_TYPE_RESPONSE;
+	hdr->flags	= conn->out_clientflag;
+	hdr->userStatus	= 0;
+	hdr->_rsvd	= 0;
+
+	/* destination */
+	msg.msg_name	= &conn->trans->peer->srx.transport.sin;
+	msg.msg_namelen	= sizeof(conn->trans->peer->srx.transport.sin);
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= 0;
+
+	/* header, then response, then ticket */
+	iov[0].iov_base	= hdr;
+	iov[0].iov_len	= sizeof(*hdr);
+	iov[1].iov_base	= resp;
+	iov[1].iov_len	= sizeof(*resp);
+	iov[2].iov_base	= (void *) s2->ticket;
+	iov[2].iov_len	= s2->ticket_len;
+
+	total = iov[0].iov_len;
+	total += iov[1].iov_len;
+	total += iov[2].iov_len;
+
+	hdr->serial = htonl(atomic_inc_return(&conn->serial));
+	_proto("Tx RESPONSE %%%u", ntohl(hdr->serial));
+
+	ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, total);
+	if (ret < 0) {
+		_debug("sendmsg failed: %d", ret);
+		return -EAGAIN;
+	}
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * calculate the response checksum
+ *
+ * Folds every byte of the response structure into a 32-bit multiplicative
+ * hash and stores the result, in network byte order, in the encrypted
+ * section's checksum field.  The current content of that field is part of the
+ * hashed data.
+ */
+static void rxkad_calc_response_checksum(struct rxkad_response *response)
+{
+	const u8 *cursor = (const u8 *) response;
+	const u8 *limit = cursor + sizeof(*response);
+	u32 csum = 1000003;
+
+	while (cursor < limit) {
+		csum *= 0x10204081;
+		csum += *cursor++;
+	}
+
+	response->encrypted.checksum = htonl(csum);
+}
+
+/*
+ * load a scatterlist with a potentially split-page buffer
+ *
+ * The buffer is described by sg[0] alone if it fits within one page;
+ * otherwise sg[0] covers the part up to the page boundary and sg[1] covers the
+ * remainder.  The last entry used is marked as the end of the list.
+ */
+static void rxkad_sg_set_buf2(struct scatterlist sg[2],
+			      void *buf, size_t buflen)
+{
+	struct scatterlist *last = &sg[0];
+
+	sg_init_table(sg, 2);
+	sg_set_buf(&sg[0], buf, buflen);
+
+	if (sg[0].offset + buflen > PAGE_SIZE) {
+		/* the buffer was split over two pages */
+		size_t head = PAGE_SIZE - sg[0].offset;
+
+		sg[0].length = head;
+		sg_set_buf(&sg[1], buf + head, buflen - head);
+		last = &sg[1];
+	}
+
+	sg_mark_end(last);
+
+	ASSERTCMP(sg[0].length + sg[1].length, ==, buflen);
+}
+
+/*
+ * encrypt the response packet
+ *
+ * Encrypts the 'encrypted' section of the response in place with the
+ * connection's cipher, using the session key from @s2 as the IV.
+ */
+static void rxkad_encrypt_response(struct rxrpc_connection *conn,
+				   struct rxkad_response *resp,
+				   const struct rxkad_key *s2)
+{
+	struct scatterlist sg[2];
+	struct rxrpc_crypt iv;
+	struct blkcipher_desc desc;
+
+	/* map the section to be encrypted */
+	rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
+
+	/* the session key doubles as the initial IV */
+	memcpy(&iv, s2->session_key, sizeof(iv));
+	desc.flags = 0;
+	desc.info = iv.x;
+	desc.tfm = conn->cipher;
+
+	crypto_blkcipher_encrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+}
+
+/*
+ * respond to a challenge packet
+ *
+ * Parses the challenge in @skb, checks its version and minimum security level
+ * against the connection, then builds, checksums, encrypts and transmits a
+ * response using the connection's key token.
+ *
+ * Returns the result of sending the response on success, the key_validate()
+ * error if the key is unusable, or -EPROTO if there is no key or the challenge
+ * is unacceptable.  *_abort_code is set on the key-validation and
+ * bad-challenge failures.
+ */
+static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
+				      struct sk_buff *skb,
+				      u32 *_abort_code)
+{
+	const struct rxrpc_key_token *token;
+	struct rxkad_challenge challenge;
+	struct rxkad_response resp
+		__attribute__((aligned(8))); /* must be aligned for crypto */
+	struct rxrpc_skb_priv *sp;
+	u32 version, nonce, min_level, abort_code;
+	int ret, chan;
+
+	_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
+
+	if (!conn->key) {
+		_leave(" = -EPROTO [no key]");
+		return -EPROTO;
+	}
+
+	ret = key_validate(conn->key);
+	if (ret < 0) {
+		*_abort_code = RXKADEXPIRED;
+		return ret;
+	}
+
+	/* pull the challenge out of the packet */
+	sp = rxrpc_skb(skb);
+	if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) {
+		abort_code = RXKADPACKETSHORT;
+		goto protocol_error;
+	}
+
+	version = ntohl(challenge.version);
+	nonce = ntohl(challenge.nonce);
+	min_level = ntohl(challenge.min_level);
+
+	_proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }",
+	       ntohl(sp->hdr.serial), version, nonce, min_level);
+
+	if (version != RXKAD_VERSION) {
+		abort_code = RXKADINCONSISTENCY;
+		goto protocol_error;
+	}
+
+	/* we can't offer less than the server insists on */
+	if (conn->security_level < min_level) {
+		abort_code = RXKADLEVELFAIL;
+		goto protocol_error;
+	}
+
+	token = conn->key->payload.data;
+
+	/* build the response packet */
+	memset(&resp, 0, sizeof(resp));
+
+	resp.version = RXKAD_VERSION;
+	resp.kvno = htonl(token->kad->kvno);
+	resp.ticket_len = htonl(token->kad->ticket_len);
+
+	resp.encrypted.epoch = conn->epoch;
+	resp.encrypted.cid = conn->cid;
+	resp.encrypted.securityIndex = htonl(conn->security_ix);
+	resp.encrypted.inc_nonce = htonl(nonce + 1);
+	resp.encrypted.level = htonl(conn->security_level);
+
+	/* record the call ID of each channel, or 0 where there is no call */
+	for (chan = 0; chan < 4; chan++)
+		resp.encrypted.call_id[chan] =
+			(conn->channels[chan] ?
+			 conn->channels[chan]->call_id : 0);
+
+	/* calculate the response checksum and then do the encryption */
+	rxkad_calc_response_checksum(&resp);
+	rxkad_encrypt_response(conn, &resp, token->kad);
+	return rxkad_send_response(conn, &sp->hdr, &resp, token->kad);
+
+protocol_error:
+	*_abort_code = abort_code;
+	_leave(" = -EPROTO [%d]", abort_code);
+	return -EPROTO;
+}
+
+/*
+ * decrypt the kerberos IV ticket in the response
+ *
+ * The ticket buffer is decrypted in place using the cipher held as the payload
+ * of the connection's server key, with the IV taken from that key's type_data.
+ * The plaintext is then parsed in order as: a flags byte, three NUL-terminated
+ * printable strings (name, instance, realm), a 4-byte IPv4 address, an 8-byte
+ * session key, a 1-byte lifetime, a 4-byte issue time and two more
+ * NUL-terminated printable strings (service name, service instance).
+ *
+ * The session key is copied to *_session_key.  *_expiry is zeroed on entry and
+ * set to issue time + lifetime once the ticket has passed the date checks.
+ *
+ * Returns 0 on success or a negative error code, in which case *_abort_code
+ * holds the abort code to be sent to the peer.
+ */
+static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
+				void *ticket, size_t ticket_len,
+				struct rxrpc_crypt *_session_key,
+				time_t *_expiry,
+				u32 *_abort_code)
+{
+	struct blkcipher_desc desc;
+	struct rxrpc_crypt iv, key;
+	struct scatterlist sg[1];
+	struct in_addr addr;
+	unsigned life;
+	time_t issue, now;
+	bool little_endian;
+	int ret;
+	u8 *p, *q, *name, *end;
+
+	_enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key));
+
+	*_expiry = 0;
+
+	/* map a key validation failure onto an rxkad abort code */
+	ret = key_validate(conn->server_key);
+	if (ret < 0) {
+		switch (ret) {
+		case -EKEYEXPIRED:
+			*_abort_code = RXKADEXPIRED;
+			goto error;
+		default:
+			*_abort_code = RXKADNOAUTH;
+			goto error;
+		}
+	}
+
+	ASSERT(conn->server_key->payload.data != NULL);
+	/* the ticket buffer is expected to be 8-byte aligned */
+	ASSERTCMP((unsigned long) ticket & 7UL, ==, 0);
+
+	memcpy(&iv, &conn->server_key->type_data, sizeof(iv));
+
+	/* the server key's payload is used directly as the cipher handle */
+	desc.tfm = conn->server_key->payload.data;
+	desc.info = iv.x;
+	desc.flags = 0;
+
+	/* decrypt the ticket in place */
+	sg_init_one(&sg[0], ticket, ticket_len);
+	crypto_blkcipher_decrypt_iv(&desc, sg, sg, ticket_len);
+
+	p = ticket;
+	end = p + ticket_len;
+
+	/* Z(size) extracts a NUL-terminated string starting at p: it jumps to
+	 * bad_ticket if no NUL is found before the end of the ticket, if the
+	 * string is longer than size or if it holds a non-printable character;
+	 * otherwise it advances p past the NUL and evaluates to the start of
+	 * the string */
+#define Z(size)						\
+	({						\
+		u8 *__str = p;				\
+		q = memchr(p, 0, end - p);		\
+		if (!q || q - p > (size))		\
+			goto bad_ticket;		\
+		for (; p < q; p++)			\
+			if (!isprint(*p))		\
+				goto bad_ticket;	\
+		p++;					\
+		__str;					\
+	})
+
+	/* extract the ticket flags; bit 0 selects the byte order of the issue
+	 * timestamp read further down */
+	_debug("KIV FLAGS: %x", *p);
+	little_endian = *p & 1;
+	p++;
+
+	/* extract the authentication name */
+	name = Z(ANAME_SZ);
+	_debug("KIV ANAME: %s", name);
+
+	/* extract the principal's instance */
+	name = Z(INST_SZ);
+	_debug("KIV INST : %s", name);
+
+	/* extract the principal's authentication domain */
+	name = Z(REALM_SZ);
+	_debug("KIV REALM: %s", name);
+
+	/* make sure the fixed-size fields that follow are all present before
+	 * copying them out */
+	if (end - p < 4 + 8 + 4 + 2)
+		goto bad_ticket;
+
+	/* get the IPv4 address of the entity that requested the ticket */
+	memcpy(&addr, p, sizeof(addr));
+	p += 4;
+	_debug("KIV ADDR : %pI4", &addr);
+
+	/* get the session key from the ticket */
+	memcpy(&key, p, sizeof(key));
+	p += 8;
+	_debug("KIV KEY  : %08x %08x", ntohl(key.n[0]), ntohl(key.n[1]));
+	memcpy(_session_key, &key, sizeof(key));
+
+	/* get the ticket's lifetime; the byte counts units of five minutes
+	 * and is converted to seconds here */
+	life = *p++ * 5 * 60;
+	_debug("KIV LIFE : %u", life);
+
+	/* get the issue time of the ticket */
+	if (little_endian) {
+		__le32 stamp;
+		memcpy(&stamp, p, 4);
+		issue = le32_to_cpu(stamp);
+	} else {
+		__be32 stamp;
+		memcpy(&stamp, p, 4);
+		issue = be32_to_cpu(stamp);
+	}
+	p += 4;
+	now = get_seconds();
+	_debug("KIV ISSUE: %lx [%lx]", issue, now);
+
+	/* check the ticket is in date: one issued in the future is rejected
+	 * and one whose lifetime has run out is reported as expired */
+	if (issue > now) {
+		*_abort_code = RXKADNOAUTH;
+		ret = -EKEYREJECTED;
+		goto error;
+	}
+
+	if (issue < now - life) {
+		*_abort_code = RXKADEXPIRED;
+		ret = -EKEYEXPIRED;
+		goto error;
+	}
+
+	*_expiry = issue + life;
+
+	/* get the service name */
+	name = Z(SNAME_SZ);
+	_debug("KIV SNAME: %s", name);
+
+	/* get the service instance name */
+	name = Z(INST_SZ);
+	_debug("KIV SINST: %s", name);
+
+	ret = 0;
+error:
+	_leave(" = %d", ret);
+	return ret;
+
+	/* reached from the Z() macro and the length check above */
+bad_ticket:
+	*_abort_code = RXKADBADTICKET;
+	ret = -EBADMSG;
+	goto error;
+}
+
+/*
+ * decrypt the response packet
+ *
+ * Decrypts the 'encrypted' section of the response in place using the shared
+ * rxkad_ci cipher, keyed with the supplied session key and using that same
+ * key as the IV.  rxkad_ci_mutex is held while the cipher is keyed and used.
+ */
+static void rxkad_decrypt_response(struct rxrpc_connection *conn,
+				   struct rxkad_response *resp,
+				   const struct rxrpc_crypt *session_key)
+{
+	struct scatterlist sg[2];
+	struct rxrpc_crypt iv;
+	struct blkcipher_desc desc;
+	int rc;
+
+	_enter(",,%08x%08x",
+	       ntohl(session_key->n[0]), ntohl(session_key->n[1]));
+
+	ASSERT(rxkad_ci != NULL);
+
+	/* none of this touches the shared cipher, so do it unlocked */
+	memcpy(&iv, session_key, sizeof(iv));
+	desc.flags = 0;
+	desc.info = iv.x;
+	desc.tfm = rxkad_ci;
+	rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
+
+	mutex_lock(&rxkad_ci_mutex);
+
+	rc = crypto_blkcipher_setkey(rxkad_ci, session_key->x,
+				     sizeof(*session_key));
+	if (rc < 0)
+		BUG();
+
+	crypto_blkcipher_decrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+
+	mutex_unlock(&rxkad_ci_mutex);
+
+	_leave("");
+}
+
+/*
+ * verify a response
+ *
+ * Pulls the response header off the packet, sanity checks it, then extracts
+ * and decrypts the ticket that follows.  The session key recovered from the
+ * ticket is used to decrypt the sealed part of the response, which is checked
+ * against the connection's parameters and the nonce issued in the challenge.
+ * On success the connection's security level is set from the response and a
+ * key holding the session data is attached by rxrpc_get_server_data_key().
+ *
+ * Returns 0 on success or a negative error code.  *_abort_code is set for
+ * protocol errors and ticket decryption failures.
+ */
+static int rxkad_verify_response(struct rxrpc_connection *conn,
+				 struct sk_buff *skb,
+				 u32 *_abort_code)
+{
+	struct rxkad_response response
+		__attribute__((aligned(8))); /* must be aligned for crypto */
+	struct rxrpc_skb_priv *sp;
+	struct rxrpc_crypt session_key;
+	time_t expiry;
+	void *ticket;
+	u32 abort_code, version, kvno, ticket_len, level;
+	__be32 csum;
+	int ret, chan;
+
+	_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
+
+	abort_code = RXKADPACKETSHORT;
+	if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0)
+		goto protocol_error;
+	if (!pskb_pull(skb, sizeof(response)))
+		BUG();
+
+	sp = rxrpc_skb(skb);
+	version = ntohl(response.version);
+	kvno = ntohl(response.kvno);
+	ticket_len = ntohl(response.ticket_len);
+	_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
+	       ntohl(sp->hdr.serial), version, kvno, ticket_len);
+
+	/* sanity check the unencrypted fields */
+	if (version != RXKAD_VERSION) {
+		abort_code = RXKADINCONSISTENCY;
+		goto protocol_error;
+	}
+
+	if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) {
+		abort_code = RXKADTICKETLEN;
+		goto protocol_error;
+	}
+
+	if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) {
+		abort_code = RXKADUNKNOWNKEY;
+		goto protocol_error;
+	}
+
+	/* extract the kerberos ticket and decrypt and decode it */
+	ticket = kmalloc(ticket_len, GFP_NOFS);
+	if (!ticket)
+		return -ENOMEM;
+
+	/* abort_code is still RXKADPACKETSHORT at this point */
+	if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0)
+		goto protocol_error_free;
+
+	ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key,
+				   &expiry, &abort_code);
+	if (ret < 0) {
+		*_abort_code = abort_code;
+		kfree(ticket);
+		return ret;
+	}
+
+	/* use the session key from inside the ticket to decrypt the
+	 * response */
+	rxkad_decrypt_response(conn, &response, &session_key);
+
+	/* the sealed fields must match what we know about the connection */
+	abort_code = RXKADSEALEDINCON;
+	if (response.encrypted.epoch != conn->epoch ||
+	    response.encrypted.cid != conn->cid ||
+	    ntohl(response.encrypted.securityIndex) != conn->security_ix)
+		goto protocol_error_free;
+
+	/* the checksum is computed with its own field zeroed */
+	csum = response.encrypted.checksum;
+	response.encrypted.checksum = 0;
+	rxkad_calc_response_checksum(&response);
+	if (response.encrypted.checksum != csum)
+		goto protocol_error_free;
+
+	for (chan = 0; chan < 4; chan++)
+		if (ntohl(response.encrypted.call_id[chan]) > INT_MAX)
+			goto protocol_error_free;
+
+	/* the peer must have returned our nonce plus one */
+	if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1)) {
+		abort_code = RXKADOUTOFSEQUENCE;
+		goto protocol_error_free;
+	}
+
+	level = ntohl(response.encrypted.level);
+	if (level > RXRPC_SECURITY_ENCRYPT) {
+		abort_code = RXKADLEVELFAIL;
+		goto protocol_error_free;
+	}
+	conn->security_level = level;
+
+	/* create a key to hold the security data and expiration time - after
+	 * this the connection security can be handled in exactly the same way
+	 * as for a client connection */
+	ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno);
+	kfree(ticket);
+	if (ret < 0)
+		return ret;
+
+	_leave(" = 0");
+	return 0;
+
+protocol_error_free:
+	kfree(ticket);
+protocol_error:
+	*_abort_code = abort_code;
+	_leave(" = -EPROTO [%d]", abort_code);
+	return -EPROTO;
+}
+
+/*
+ * clear the connection security
+ *
+ * Releases the connection's cipher, if it has one.
+ */
+static void rxkad_clear(struct rxrpc_connection *conn)
+{
+	_enter("");
+
+	if (!conn->cipher)
+		return;
+
+	crypto_free_blkcipher(conn->cipher);
+}
+
+/*
+ * RxRPC Kerberos-based security
+ *
+ * Operations table that binds the rxkad handlers defined in this file to the
+ * RXRPC_SECURITY_RXKAD security index.  It is registered with
+ * rxrpc_register_security() when the module loads and unregistered when the
+ * module is removed.
+ */
+static struct rxrpc_security rxkad = {
+	.owner				= THIS_MODULE,
+	.name				= "rxkad",
+	.security_index			= RXRPC_SECURITY_RXKAD,
+	.init_connection_security	= rxkad_init_connection_security,
+	.prime_packet_security		= rxkad_prime_packet_security,
+	.secure_packet			= rxkad_secure_packet,
+	.verify_packet			= rxkad_verify_packet,
+	.issue_challenge		= rxkad_issue_challenge,
+	.respond_to_challenge		= rxkad_respond_to_challenge,
+	.verify_response		= rxkad_verify_response,
+	.clear				= rxkad_clear,
+};
+
+/*
+ * module initialisation: pin the rxkad cipher and register the security ops
+ *
+ * Returns 0 on success, the error from crypto_alloc_blkcipher() if the cipher
+ * cannot be allocated, or the error from rxrpc_register_security().
+ */
+static __init int rxkad_init(void)
+{
+	int ret;
+
+	_enter("");
+
+	/* pin the cipher we need so that the crypto layer doesn't invoke
+	 * keventd to go get it */
+	rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(rxkad_ci))
+		return PTR_ERR(rxkad_ci);
+
+	/* rxkad_exit() is not run if initialisation fails, so release the
+	 * pinned cipher here rather than leaking it */
+	ret = rxrpc_register_security(&rxkad);
+	if (ret < 0)
+		crypto_free_blkcipher(rxkad_ci);
+
+	return ret;
+}
+
+module_init(rxkad_init);
+
+/*
+ * module removal: unregister the rxkad security ops, then release the cipher
+ * that rxkad_init() pinned
+ */
+static __exit void rxkad_exit(void)
+{
+	_enter("");
+
+	rxrpc_unregister_security(&rxkad);
+	crypto_free_blkcipher(rxkad_ci);
+}
+
+module_exit(rxkad_exit);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
new file mode 100644
index 00000000..2590e91b
--- /dev/null
+++ b/net/sched/Kconfig
@@ -0,0 +1,585 @@
+#
+# Traffic control configuration.
+# 
+
+menuconfig NET_SCHED
+	bool "QoS and/or fair queueing"
+	select NET_SCH_FIFO
+	---help---
+	  When the kernel has several packets to send out over a network
+	  device, it has to decide which ones to send first, which ones to
+	  delay, and which ones to drop. This is the job of the queueing
+	  disciplines, several different algorithms for how to do this
+	  "fairly" have been proposed.
+
+	  If you say N here, you will get the standard packet scheduler, which
+	  is a FIFO (first come, first served). If you say Y here, you will be
+	  able to choose from among several alternative algorithms which can
+	  then be attached to different network devices. This is useful for
+	  example if some of your network devices are real time devices that
+	  need a certain minimum data flow rate, or if you need to limit the
+	  maximum data flow rate for traffic which matches specified criteria.
+	  This code is considered to be experimental.
+
+	  To administer these schedulers, you'll need the user-level utilities
+	  from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
+	  That package also contains some documentation; for more, check out
+	  <http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.
+
+	  This Quality of Service (QoS) support will enable you to use
+	  Differentiated Services (diffserv) and Resource Reservation Protocol
+	  (RSVP) on your Linux router if you also say Y to the corresponding
+	  classifiers below.  Documentation and software is at
+	  <http://diffserv.sourceforge.net/>.
+
+	  If you say Y here and to "/proc file system" below, you will be able
+	  to read status information about packet schedulers from the file
+	  /proc/net/psched.
+
+	  The available schedulers are listed in the following questions; you
+	  can say Y to as many as you like. If unsure, say N now.
+
+if NET_SCHED
+
+comment "Queueing/Scheduling"
+
+config NET_SCH_CBQ
+	tristate "Class Based Queueing (CBQ)"
+	---help---
+	  Say Y here if you want to use the Class-Based Queueing (CBQ) packet
+	  scheduling algorithm. This algorithm classifies the waiting packets
+	  into a tree-like hierarchy of classes; the leaves of this tree are
+	  in turn scheduled by separate algorithms.
+
+	  See the top of <file:net/sched/sch_cbq.c> for more details.
+
+	  CBQ is a commonly used scheduler, so if you're unsure, you should
+	  say Y here. Then say Y to all the queueing algorithms below that you
+	  want to use as leaf disciplines.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_cbq.
+
+config NET_SCH_HTB
+	tristate "Hierarchical Token Bucket (HTB)"
+	---help---
+	  Say Y here if you want to use the Hierarchical Token Buckets (HTB)
+	  packet scheduling algorithm. See
+	  <http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
+	  in-depth articles.
+
+	  HTB is very similar to CBQ regarding its goals; however, it has
+	  different properties and a different algorithm.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_htb.
+
+config NET_SCH_HFSC
+	tristate "Hierarchical Fair Service Curve (HFSC)"
+	---help---
+	  Say Y here if you want to use the Hierarchical Fair Service Curve
+	  (HFSC) packet scheduling algorithm.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_hfsc.
+
+config NET_SCH_ATM
+	tristate "ATM Virtual Circuits (ATM)"
+	depends on ATM
+	---help---
+	  Say Y here if you want to use the ATM pseudo-scheduler.  This
+	  provides a framework for invoking classifiers, which in turn
+	  select classes of this queuing discipline.  Each class maps
+	  the flow(s) it is handling to a given virtual circuit.
+
+	  See the top of <file:net/sched/sch_atm.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_atm.
+
+config NET_SCH_PRIO
+	tristate "Multi Band Priority Queueing (PRIO)"
+	---help---
+	  Say Y here if you want to use an n-band priority queue packet
+	  scheduler.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_prio.
+
+config NET_SCH_MULTIQ
+	tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
+	---help---
+	  Say Y here if you want to use an n-band queue packet scheduler
+	  to support devices that have multiple hardware transmit queues.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_multiq.
+
+config NET_SCH_RED
+	tristate "Random Early Detection (RED)"
+	---help---
+	  Say Y here if you want to use the Random Early Detection (RED)
+	  packet scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_red.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_red.
+
+config NET_SCH_SFB
+	tristate "Stochastic Fair Blue (SFB)"
+	---help---
+	  Say Y here if you want to use the Stochastic Fair Blue (SFB)
+	  packet scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_sfb.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_sfb.
+
+config NET_SCH_SFQ
+	tristate "Stochastic Fairness Queueing (SFQ)"
+	---help---
+	  Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
+	  packet scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_sfq.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_sfq.
+
+config NET_SCH_TEQL
+	tristate "True Link Equalizer (TEQL)"
+	---help---
+	  Say Y here if you want to use the True Link Equalizer (TLE) packet
+	  scheduling algorithm. This queueing discipline allows the combination
+	  of several physical devices into one virtual device.
+
+	  See the top of <file:net/sched/sch_teql.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_teql.
+
+config NET_SCH_TBF
+	tristate "Token Bucket Filter (TBF)"
+	---help---
+	  Say Y here if you want to use the Token Bucket Filter (TBF) packet
+	  scheduling algorithm.
+
+	  See the top of <file:net/sched/sch_tbf.c> for more details.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_tbf.
+
+config NET_SCH_GRED
+	tristate "Generic Random Early Detection (GRED)"
+	---help---
+	  Say Y here if you want to use the Generic Random Early Detection
+	  (GRED) packet scheduling algorithm for some of your network devices
+	  (see the top of <file:net/sched/sch_red.c> for details and
+	  references about the algorithm).
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_gred.
+
+config NET_SCH_DSMARK
+	tristate "Differentiated Services marker (DSMARK)"
+	---help---
+	  Say Y if you want to schedule packets according to the
+	  Differentiated Services architecture proposed in RFC 2475.
+	  Technical information on this method, with pointers to associated
+	  RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_dsmark.
+
+config NET_SCH_NETEM
+	tristate "Network emulator (NETEM)"
+	---help---
+	  Say Y if you want to emulate network delay, loss, and packet
+	  re-ordering. This is often useful to simulate networks when
+	  testing applications or protocols.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_netem.
+
+	  If unsure, say N.
+
+config NET_SCH_DRR
+	tristate "Deficit Round Robin scheduler (DRR)"
+	help
+	  Say Y here if you want to use the Deficit Round Robin (DRR) packet
+	  scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_drr.
+
+	  If unsure, say N.
+
+config NET_SCH_MQPRIO
+	tristate "Multi-queue priority scheduler (MQPRIO)"
+	help
+	  Say Y here if you want to use the Multi-queue Priority scheduler.
+	  This scheduler allows QOS to be offloaded on NICs that have support
+	  for offloading QOS schedulers.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_mqprio.
+
+	  If unsure, say N.
+
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which tries to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
+config NET_SCH_QFQ
+	tristate "Quick Fair Queueing scheduler (QFQ)"
+	help
+	  Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_qfq.
+
+	  If unsure, say N.
+
+config NET_SCH_INGRESS
+	tristate "Ingress Qdisc"
+	depends on NET_CLS_ACT
+	---help---
+	  Say Y here if you want to use classifiers for incoming packets.
+	  If unsure, say Y.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_ingress.
+
+comment "Classification"
+
+config NET_CLS
+	boolean
+
+config NET_CLS_BASIC
+	tristate "Elementary classification (BASIC)"
+	select NET_CLS
+	---help---
+	  Say Y here if you want to be able to classify packets using
+	  only extended matches and actions.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_basic.
+
+config NET_CLS_TCINDEX
+	tristate "Traffic-Control Index (TCINDEX)"
+	select NET_CLS
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  traffic control indices. You will want this feature if you want
+	  to implement Differentiated Services together with DSMARK.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_tcindex.
+
+config NET_CLS_ROUTE4
+	tristate "Routing decision (ROUTE)"
+	depends on INET
+	select IP_ROUTE_CLASSID
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets
+	  according to the route table entry they matched.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_route.
+
+config NET_CLS_FW
+	tristate "Netfilter mark (FW)"
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets
+	  according to netfilter/firewall marks.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_fw.
+
+config NET_CLS_U32
+	tristate "Universal 32bit comparisons w/ hashing (U32)"
+	select NET_CLS
+	---help---
+	  Say Y here to be able to classify packets using a universal
+	  32bit pieces based comparison scheme.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_u32.
+
+config CLS_U32_PERF
+	bool "Performance counters support"
+	depends on NET_CLS_U32
+	---help---
+	  Say Y here to make u32 gather additional statistics useful for
+	  fine tuning u32 classifiers.
+
+config CLS_U32_MARK
+	bool "Netfilter marks support"
+	depends on NET_CLS_U32
+	---help---
+	  Say Y here to be able to use netfilter marks as u32 key.
+
+config NET_CLS_RSVP
+	tristate "IPv4 Resource Reservation Protocol (RSVP)"
+	select NET_CLS
+	---help---
+	  The Resource Reservation Protocol (RSVP) permits end systems to
+	  request a minimum and maximum data flow rate for a connection; this
+	  is important for real time data such as streaming sound or video.
+
+	  Say Y here if you want to be able to classify outgoing packets based
+	  on their RSVP requests.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_rsvp.
+
+config NET_CLS_RSVP6
+	tristate "IPv6 Resource Reservation Protocol (RSVP6)"
+	select NET_CLS
+	---help---
+	  The Resource Reservation Protocol (RSVP) permits end systems to
+	  request a minimum and maximum data flow rate for a connection; this
+	  is important for real time data such as streaming sound or video.
+
+	  Say Y here if you want to be able to classify outgoing packets based
+	  on their RSVP requests and you are using the IPv6 protocol.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_rsvp6.
+
+config NET_CLS_FLOW
+	tristate "Flow classifier"
+	select NET_CLS
+	---help---
+	  If you say Y here, you will be able to classify packets based on
+	  a configurable combination of packet keys. This is mostly useful
+	  in combination with SFQ.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_flow.
+
+config NET_CLS_CGROUP
+	tristate "Control Group Classifier"
+	select NET_CLS
+	depends on CGROUPS
+	---help---
+	  Say Y here if you want to classify packets based on the control
+	  cgroup of their process.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_cgroup.
+
+config NET_EMATCH
+	bool "Extended Matches"
+	select NET_CLS
+	---help---
+	  Say Y here if you want to use extended matches on top of classifiers
+	  and select the extended matches below.
+
+	  Extended matches are small classification helpers not worth writing
+	  a separate classifier for.
+
+	  A recent version of the iproute2 package is required to use
+	  extended matches.
+
+config NET_EMATCH_STACK
+	int "Stack size"
+	depends on NET_EMATCH
+	default "32"
+	---help---
+	  Size of the local stack variable used while evaluating the tree of
+	  ematches. Limits the depth of the tree, i.e. the number of
+	  encapsulated precedences. Every level requires 4 bytes of additional
+	  stack space.
+
+config NET_EMATCH_CMP
+	tristate "Simple packet data comparison"
+	depends on NET_EMATCH
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  simple packet data comparisons for 8, 16, and 32bit values.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_cmp.
+
+config NET_EMATCH_NBYTE
+	tristate "Multi byte comparison"
+	depends on NET_EMATCH
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  multiple byte comparisons mainly useful for IPv6 address comparisons.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_nbyte.
+
+config NET_EMATCH_U32
+	tristate "U32 key"
+	depends on NET_EMATCH
+	---help---
+	  Say Y here if you want to be able to classify packets using
+	  the famous u32 key in combination with logic relations.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_u32.
+
+config NET_EMATCH_META
+	tristate "Metadata"
+	depends on NET_EMATCH
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  metadata such as load average, netfilter attributes, socket
+	  attributes and routing decisions.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_meta.
+
+config NET_EMATCH_TEXT
+	tristate "Textsearch"
+	depends on NET_EMATCH
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	select TEXTSEARCH_BM
+	select TEXTSEARCH_FSM
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  textsearch comparisons.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_text.
+
+config NET_CLS_ACT
+	bool "Actions"
+	---help---
+	  Say Y here if you want to use traffic control actions. Actions
+	  get attached to classifiers and are invoked after a successful
+	  classification. They are used to overwrite the classification
+	  result, instantly drop or redirect packets, etc.
+
+	  A recent version of the iproute2 package is required to use
+	  actions.
+
+config NET_ACT_POLICE
+	tristate "Traffic Policing"
+        depends on NET_CLS_ACT 
+        ---help---
+	  Say Y here if you want to do traffic policing, i.e. strict
+	  bandwidth limiting. This action replaces the existing policing
+	  module.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_police.
+
+config NET_ACT_GACT
+        tristate "Generic actions"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to take generic actions such as dropping and
+	  accepting packets.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_gact.
+
+config GACT_PROB
+        bool "Probability support"
+        depends on NET_ACT_GACT
+        ---help---
+	  Say Y here to use the generic action randomly or deterministically.
+
+config NET_ACT_MIRRED
+        tristate "Redirecting and Mirroring"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow packets to be mirrored or redirected to
+	  other devices.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_mirred.
+
+config NET_ACT_IPT
+        tristate "IPtables targets"
+        depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+        ---help---
+	  Say Y here to be able to invoke iptables targets after successful
+	  classification.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ipt.
+
+config NET_ACT_NAT
+        tristate "Stateless NAT"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to do stateless NAT on IPv4 packets.  You should use
+	  netfilter for NAT unless you know what you are doing.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_nat.
+
+config NET_ACT_PEDIT
+        tristate "Packet Editing"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here if you want to mangle the content of packets.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_pedit.
+
+config NET_ACT_SIMP
+        tristate "Simple Example (Debug)"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to add a simple action for demonstration purposes.
+	  It is meant as an example and for debugging purposes. It will
+	  print a configured policy string followed by the packet count
+	  to the console for every packet that passes by.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_simple.
+
+config NET_ACT_SKBEDIT
+        tristate "SKB Editing"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to change skb priority or queue_mapping settings.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_skbedit.
+
+config NET_ACT_CSUM
+        tristate "Checksum Updating"
+        depends on NET_CLS_ACT && INET
+        ---help---
+	  Say Y here to update some common checksums after some direct
+	  packet alterations.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_csum.
+
+config NET_CLS_IND
+	bool "Incoming device classification"
+	depends on NET_CLS_U32 || NET_CLS_FW
+	---help---
+	  Say Y here to extend the u32 and fw classifier to support
+	  classification based on the incoming device. This option is
+	  likely to disappear in favour of the metadata ematch.
+
+endif # NET_SCHED
+
+config NET_SCH_FIFO
+	bool
diff --git a/net/sched/Makefile b/net/sched/Makefile
new file mode 100644
index 00000000..dc5889c0
--- /dev/null
+++ b/net/sched/Makefile
@@ -0,0 +1,54 @@
+#
+# Makefile for the Linux Traffic Control Unit.
+#
+
+obj-y	:= sch_generic.o sch_mq.o
+
+obj-$(CONFIG_NET_SCHED)		+= sch_api.o sch_blackhole.o
+obj-$(CONFIG_NET_CLS)		+= cls_api.o
+obj-$(CONFIG_NET_CLS_ACT)	+= act_api.o
+obj-$(CONFIG_NET_ACT_POLICE)	+= act_police.o
+obj-$(CONFIG_NET_ACT_GACT)	+= act_gact.o
+obj-$(CONFIG_NET_ACT_MIRRED)	+= act_mirred.o
+obj-$(CONFIG_NET_ACT_IPT)	+= act_ipt.o
+obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
+obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
+obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
+obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
+obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
+obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
+obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
+obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
+obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
+obj-$(CONFIG_NET_SCH_RED)	+= sch_red.o
+obj-$(CONFIG_NET_SCH_GRED)	+= sch_gred.o
+obj-$(CONFIG_NET_SCH_INGRESS)	+= sch_ingress.o 
+obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
+obj-$(CONFIG_NET_SCH_SFB)	+= sch_sfb.o
+obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
+obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
+obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
+obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
+obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
+obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
+obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
+obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
+obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
+
+obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
+obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
+obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
+obj-$(CONFIG_NET_CLS_RSVP)	+= cls_rsvp.o
+obj-$(CONFIG_NET_CLS_TCINDEX)	+= cls_tcindex.o
+obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
+obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
+obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
+obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o
+obj-$(CONFIG_NET_EMATCH)	+= ematch.o
+obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
+obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
+obj-$(CONFIG_NET_EMATCH_U32)	+= em_u32.o
+obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
new file mode 100644
index 00000000..a6060258
--- /dev/null
+++ b/net/sched/act_api.c
@@ -0,0 +1,1125 @@
+/*
+ * net/sched/act_api.c	Packet action API.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Author:	Jamal Hadi Salim
+ *
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/err.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/sch_generic.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+
+/*
+ * Remove action @p from the hash table described by @hinfo and free it.
+ *
+ * Only the unlink itself runs under the table's write lock; the bucket walk
+ * that locates @p does not.  Warns if @p is not present in its bucket.
+ */
+void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common **link;
+
+	link = &hinfo->htab[tcf_hash(p->tcfc_index, hinfo->hmask)];
+	while (*link) {
+		if (*link != p) {
+			link = &(*link)->tcfc_next;
+			continue;
+		}
+
+		write_lock_bh(hinfo->lock);
+		*link = p->tcfc_next;
+		write_unlock_bh(hinfo->lock);
+		gen_kill_estimator(&p->tcfc_bstats, &p->tcfc_rate_est);
+		/*
+		 * The gen_estimator est_timer() may still touch p->tcfc_lock
+		 * or bstats, so defer the free by one RCU grace period.
+		 */
+		kfree_rcu(p, tcfc_rcu);
+		return;
+	}
+	WARN_ON(1);
+}
+EXPORT_SYMBOL(tcf_hash_destroy);
+
+/*
+ * Drop one reference on @p (and one bind count when @bind is set).
+ *
+ * Destroys @p once neither counter is positive any more.  Returns 1 when the
+ * action was destroyed, 0 otherwise (including when @p is NULL).
+ */
+int tcf_hash_release(struct tcf_common *p, int bind,
+		     struct tcf_hashinfo *hinfo)
+{
+	if (!p)
+		return 0;
+
+	if (bind)
+		p->tcfc_bindcnt--;
+	p->tcfc_refcnt--;
+
+	/* Still bound or still referenced: keep it. */
+	if (p->tcfc_bindcnt > 0 || p->tcfc_refcnt > 0)
+		return 0;
+
+	tcf_hash_destroy(p, hinfo);
+	return 1;
+}
+EXPORT_SYMBOL(tcf_hash_release);
+
+/*
+ * Dump the actions stored in @hinfo into @skb, one nested attribute per
+ * action, skipping the first cb->args[0] entries (those already dumped by
+ * earlier calls).  @a is used as a scratch action: its priv and order fields
+ * are overwritten for every entry that is dumped.
+ *
+ * Stops early when an entry cannot be added to @skb or once
+ * TCA_ACT_MAX_PRIO entries have been written.  Returns the number of entries
+ * added by this call and advances cb->args[0] by the same amount.
+ */
+static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
+			   struct tc_action *a, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p;
+	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+	struct nlattr *nest;
+
+	read_lock_bh(hinfo->lock);
+
+	/* Number of entries to skip before dumping resumes. */
+	s_i = cb->args[0];
+
+	for (i = 0; i < (hinfo->hmask + 1); i++) {
+		p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
+
+		for (; p; p = p->tcfc_next) {
+			/* index counts every entry seen, across all buckets */
+			index++;
+			if (index < s_i)
+				continue;
+			a->priv = p;
+			a->order = n_i;
+
+			nest = nla_nest_start(skb, a->order);
+			if (nest == NULL)
+				goto nla_put_failure;
+			err = tcf_action_dump_1(skb, a, 0, 0);
+			if (err < 0) {
+				/* undo the partially written entry */
+				index--;
+				nlmsg_trim(skb, nest);
+				goto done;
+			}
+			nla_nest_end(skb, nest);
+			n_i++;
+			if (n_i >= TCA_ACT_MAX_PRIO)
+				goto done;
+		}
+	}
+done:
+	read_unlock_bh(hinfo->lock);
+	if (n_i)
+		cb->args[0] += n_i;
+	return n_i;
+
+nla_put_failure:
+	/* only reached with nest == NULL */
+	nla_nest_cancel(skb, nest);
+	goto done;
+}
+
+/*
+ * Release every action stored in @hinfo and describe the flush in @skb.
+ *
+ * Writes one nest (type a->order) holding the action kind and, in TCA_FCNT,
+ * the number of entries visited.  Returns that count, or -EINVAL when the
+ * attributes do not fit.
+ *
+ * NOTE(review): n_i counts every entry visited, whether or not
+ * tcf_hash_release() actually destroyed it, and the entries have already
+ * been released if the final NLA_PUT_U32 fails.
+ */
+static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
+			  struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p, *s_p;
+	struct nlattr *nest;
+	int i = 0, n_i = 0;
+
+	nest = nla_nest_start(skb, a->order);
+	if (nest == NULL)
+		goto nla_put_failure;
+	/* NLA_PUT_* presumably jump to nla_put_failure when skb is full */
+	NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind);
+	for (i = 0; i < (hinfo->hmask + 1); i++) {
+		p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
+
+		while (p != NULL) {
+			/* save the successor: p may be freed by the release */
+			s_p = p->tcfc_next;
+			if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
+				module_put(a->ops->owner);
+			n_i++;
+			p = s_p;
+		}
+	}
+	NLA_PUT_U32(skb, TCA_FCNT, n_i);
+	nla_nest_end(skb, nest);
+
+	return n_i;
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EINVAL;
+}
+
+/*
+ * Generic table walker for action modules: flushes the table for
+ * RTM_DELACTION, dumps it for RTM_GETACTION.  Any other @type triggers a
+ * warning and yields -EINVAL.
+ */
+int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
+		       int type, struct tc_action *a)
+{
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+
+	switch (type) {
+	case RTM_DELACTION:
+		return tcf_del_walker(skb, a, hinfo);
+	case RTM_GETACTION:
+		return tcf_dump_walker(skb, cb, a, hinfo);
+	default:
+		WARN(1, "tcf_generic_walker: unknown action %d\n", type);
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(tcf_generic_walker);
+
+/*
+ * Find the action with index @index in @hinfo.
+ *
+ * The bucket is searched under the table's read lock.  Returns the entry, or
+ * NULL when no entry carries that index.  No reference is taken.
+ */
+struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *entry;
+
+	read_lock_bh(hinfo->lock);
+	entry = hinfo->htab[tcf_hash(index, hinfo->hmask)];
+	while (entry && entry->tcfc_index != index)
+		entry = entry->tcfc_next;
+	read_unlock_bh(hinfo->lock);
+
+	return entry;
+}
+EXPORT_SYMBOL(tcf_hash_lookup);
+
+/*
+ * Pick the next unused, non-zero action index after *@idx_gen.
+ *
+ * The chosen value is stored back into *@idx_gen and returned.  Zero is
+ * skipped when the counter wraps.
+ */
+u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo)
+{
+	u32 candidate = *idx_gen;
+
+	for (;;) {
+		candidate++;
+		if (candidate == 0)
+			candidate = 1;
+		if (!tcf_hash_lookup(candidate, hinfo))
+			break;
+	}
+
+	*idx_gen = candidate;
+	return candidate;
+}
+EXPORT_SYMBOL(tcf_hash_new_index);
+
+/*
+ * Look up @index in the table of @a's action kind.
+ *
+ * On a hit the entry is stored in a->priv and 1 is returned; otherwise @a is
+ * left untouched and 0 is returned.
+ */
+int tcf_hash_search(struct tc_action *a, u32 index)
+{
+	struct tcf_common *found = tcf_hash_lookup(index, a->ops->hinfo);
+
+	if (!found)
+		return 0;
+
+	a->priv = found;
+	return 1;
+}
+EXPORT_SYMBOL(tcf_hash_search);
+
+/*
+ * If @index is non-zero and already present in @hinfo, take a reference on
+ * the existing entry (plus a bind count when @bind is set) and attach it to
+ * @a.  Returns the entry, or NULL when @index is zero or unknown.
+ */
+struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind,
+				  struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p;
+
+	if (!index)
+		return NULL;
+
+	p = tcf_hash_lookup(index, hinfo);
+	if (p == NULL)
+		return NULL;
+
+	if (bind)
+		p->tcfc_bindcnt++;
+	p->tcfc_refcnt++;
+	a->priv = p;
+	return p;
+}
+EXPORT_SYMBOL(tcf_hash_check);
+
+/*
+ * Allocate and initialise a new action entry of @size bytes.
+ *
+ * The entry starts with one reference (and one bind count when @bind is
+ * set).  A zero @index asks for a fresh index from tcf_hash_new_index().
+ * When @est is given a rate estimator is attached.  The entry is stored in
+ * a->priv but is NOT inserted into @hinfo here.
+ *
+ * Returns the entry, ERR_PTR(-ENOMEM) on allocation failure, or the
+ * ERR_PTR-encoded error from gen_new_estimator().
+ */
+struct tcf_common *tcf_hash_create(u32 index, struct nlattr *est,
+				   struct tc_action *a, int size, int bind,
+				   u32 *idx_gen, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p;
+	int err;
+
+	p = kzalloc(size, GFP_KERNEL);
+	if (unlikely(p == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	p->tcfc_refcnt = 1;
+	p->tcfc_bindcnt = bind ? 1 : 0;
+	spin_lock_init(&p->tcfc_lock);
+
+	if (index)
+		p->tcfc_index = index;
+	else
+		p->tcfc_index = tcf_hash_new_index(idx_gen, hinfo);
+
+	p->tcfc_tm.install = jiffies;
+	p->tcfc_tm.lastuse = jiffies;
+
+	if (est != NULL) {
+		err = gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
+					&p->tcfc_lock, est);
+		if (err) {
+			kfree(p);
+			return ERR_PTR(err);
+		}
+	}
+
+	a->priv = (void *) p;
+	return p;
+}
+EXPORT_SYMBOL(tcf_hash_create);
+
+/*
+ * Link @p at the head of its bucket in @hinfo, under the table write lock.
+ */
+void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common **bucket;
+
+	bucket = &hinfo->htab[tcf_hash(p->tcfc_index, hinfo->hmask)];
+
+	write_lock_bh(hinfo->lock);
+	p->tcfc_next = *bucket;
+	*bucket = p;
+	write_unlock_bh(hinfo->lock);
+}
+EXPORT_SYMBOL(tcf_hash_insert);
+
+/*
+ * Singly linked list (via ->next) of all registered action kinds, and the
+ * rwlock that guards it.
+ */
+static struct tc_action_ops *act_base = NULL;
+static DEFINE_RWLOCK(act_mod_lock);
+
+/*
+ * Append @act to the list of registered action kinds.
+ *
+ * Returns 0 on success, or -EEXIST when an already registered kind has the
+ * same type or the same kind string.
+ */
+int tcf_register_action(struct tc_action_ops *act)
+{
+	struct tc_action_ops **tail = &act_base;
+	int err = 0;
+
+	write_lock(&act_mod_lock);
+	while (*tail != NULL) {
+		struct tc_action_ops *cur = *tail;
+
+		if (act->type == cur->type ||
+		    strcmp(act->kind, cur->kind) == 0) {
+			err = -EEXIST;
+			goto unlock;
+		}
+		tail = &cur->next;
+	}
+	act->next = NULL;
+	*tail = act;
+unlock:
+	write_unlock(&act_mod_lock);
+	return err;
+}
+EXPORT_SYMBOL(tcf_register_action);
+
+/*
+ * Remove @act from the list of registered action kinds.
+ *
+ * Returns 0 on success, or -ENOENT when @act is not on the list.
+ */
+int tcf_unregister_action(struct tc_action_ops *act)
+{
+	struct tc_action_ops **link = &act_base;
+	int err = -ENOENT;
+
+	write_lock(&act_mod_lock);
+	while (*link != NULL && *link != act)
+		link = &(*link)->next;
+	if (*link != NULL) {
+		/* *link == act here: splice it out of the list */
+		*link = act->next;
+		act->next = NULL;
+		err = 0;
+	}
+	write_unlock(&act_mod_lock);
+	return err;
+}
+EXPORT_SYMBOL(tcf_unregister_action);
+
+/*
+ * Look up a registered action kind by its C-string name.
+ *
+ * On success a reference on the owning module has been taken and the caller
+ * must drop it with module_put().  Returns NULL when @kind is NULL, when no
+ * such kind is registered, or when the module reference cannot be taken.
+ */
+static struct tc_action_ops *tc_lookup_action_n(char *kind)
+{
+	struct tc_action_ops *ops;
+
+	if (!kind)
+		return NULL;
+
+	read_lock(&act_mod_lock);
+	for (ops = act_base; ops; ops = ops->next) {
+		if (strcmp(kind, ops->kind) != 0)
+			continue;
+		if (!try_module_get(ops->owner))
+			ops = NULL;
+		break;
+	}
+	read_unlock(&act_mod_lock);
+
+	return ops;
+}
+
+/*
+ * Look up a registered action kind by the string carried in netlink
+ * attribute @kind.
+ *
+ * On success a reference on the owning module has been taken and the caller
+ * must drop it with module_put().  Returns NULL when @kind is NULL, when no
+ * such kind is registered, or when the module reference cannot be taken.
+ */
+static struct tc_action_ops *tc_lookup_action(struct nlattr *kind)
+{
+	struct tc_action_ops *ops;
+
+	if (!kind)
+		return NULL;
+
+	read_lock(&act_mod_lock);
+	for (ops = act_base; ops; ops = ops->next) {
+		if (nla_strcmp(kind, ops->kind) != 0)
+			continue;
+		if (!try_module_get(ops->owner))
+			ops = NULL;
+		break;
+	}
+	read_unlock(&act_mod_lock);
+
+	return ops;
+}
+
+/*
+ * Compiled out: variant of the lookups above that matches on the numeric
+ * action type instead of the kind string.  Nothing in this file calls it.
+ */
+#if 0
+/* lookup by id */
+static struct tc_action_ops *tc_lookup_action_id(u32 type)
+{
+	struct tc_action_ops *a = NULL;
+
+	if (type) {
+		read_lock(&act_mod_lock);
+		for (a = act_base; a; a = a->next) {
+			if (a->type == type) {
+				if (!try_module_get(a->owner)) {
+					read_unlock(&act_mod_lock);
+					return NULL;
+				}
+				break;
+			}
+		}
+		read_unlock(&act_mod_lock);
+	}
+	return a;
+}
+#endif
+
+/*
+ * Run the chain of actions starting at @act on @skb.
+ *
+ * Actions are executed in list order for as long as they return
+ * TC_ACT_PIPE; the first other verdict ends the chain and is returned.
+ * TC_ACT_REPEAT re-runs the same action.  Entries without an ->act callback
+ * are skipped.  Returns -1 when no action produced a verdict (empty chain,
+ * or no entry had a callback), or the last TC_ACT_PIPE when the chain ran to
+ * its end.
+ */
+int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
+		    struct tcf_result *res)
+{
+	struct tc_action *a;
+	int ret = -1;
+
+	/*
+	 * Packet is flagged "do not classify": clear the flag and accept it
+	 * without running any action.
+	 */
+	if (skb->tc_verd & TC_NCLS) {
+		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
+		ret = TC_ACT_OK;
+		goto exec_done;
+	}
+	while ((a = act) != NULL) {
+repeat:
+		if (a->ops && a->ops->act) {
+			ret = a->ops->act(skb, a, res);
+			if (TC_MUNGED & skb->tc_verd) {
+				/* copied already, allow trampling */
+				skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
+				skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
+			}
+			/* note: nothing bounds the number of repeats */
+			if (ret == TC_ACT_REPEAT)
+				goto repeat;	/* we need a ttl - JHS */
+			if (ret != TC_ACT_PIPE)
+				goto exec_done;
+		}
+		act = a->next;
+	}
+exec_done:
+	return ret;
+}
+EXPORT_SYMBOL(tcf_action_exec);
+
+/*
+ * Tear down the action list starting at @act.
+ *
+ * Each entry's ->cleanup() is invoked with @bind; when it reports
+ * ACT_P_DELETED the module reference held for that action is dropped.  The
+ * tc_action wrapper itself is always freed.  An entry without ops or
+ * without a cleanup callback triggers a warning.
+ */
+void tcf_action_destroy(struct tc_action *act, int bind)
+{
+	while (act != NULL) {
+		struct tc_action *cur = act;
+
+		if (cur->ops && cur->ops->cleanup) {
+			if (cur->ops->cleanup(cur, bind) == ACT_P_DELETED)
+				module_put(cur->ops->owner);
+		} else {
+			/*FIXME: Remove later - catch insertion bugs*/
+			WARN(1, "tcf_action_destroy: BUG? destroying NULL ops\n");
+		}
+		/* advance only after cleanup has run, then free the wrapper */
+		act = cur->next;
+		kfree(cur);
+	}
+}
+
+/*
+ * Dump the kind-specific part of action @a through its ->dump() callback.
+ * Returns the callback's result, or -EINVAL when @a has no ops or no dump
+ * callback.
+ */
+int
+tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+	if (a->ops != NULL && a->ops->dump != NULL)
+		return a->ops->dump(skb, a, bind, ref);
+
+	return -EINVAL;
+}
+
+/*
+ * Dump one action into @skb: its kind string, its statistics and, nested
+ * under TCA_OPTIONS, whatever the action's own ->dump() emits.
+ *
+ * Returns the positive value from ->dump() on success.  Returns -EINVAL
+ * (leaving @skb untouched) when the action has no dump callback, and -1 with
+ * @skb trimmed back to its length on entry for any other failure, including
+ * ->dump() returning zero or a negative value.
+ */
+int
+tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+	int err = -EINVAL;
+	/* rollback point for the failure path */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+
+	if (a->ops == NULL || a->ops->dump == NULL)
+		return err;
+
+	/* NLA_PUT_STRING presumably jumps to nla_put_failure when skb is full */
+	NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind);
+	if (tcf_action_copy_stats(skb, a, 0))
+		goto nla_put_failure;
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	err = tcf_action_dump_old(skb, a, bind, ref);
+	if (err > 0) {
+		nla_nest_end(skb, nest);
+		return err;
+	}
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+EXPORT_SYMBOL(tcf_action_dump_1);
+
+/*
+ * Dump every action of the list starting at @act, each inside its own nest
+ * whose attribute type is the action's order.
+ *
+ * Returns 0 on success.  On failure the nest being written is cancelled and
+ * -EINVAL (no room for the nest) or the negative result of
+ * tcf_action_dump_1() is returned.
+ */
+int
+tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref)
+{
+	struct tc_action *cur, *next;
+	struct nlattr *nest;
+	int err;
+
+	for (cur = act; cur != NULL; cur = next) {
+		next = cur->next;
+
+		nest = nla_nest_start(skb, cur->order);
+		if (nest == NULL) {
+			err = -EINVAL;
+			goto cancel;
+		}
+
+		err = tcf_action_dump_1(skb, cur, bind, ref);
+		if (err < 0)
+			goto cancel;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+/*
+ * Create (or bind to) a single action from netlink attributes.
+ *
+ * When @name is NULL, @nla is a nested action description: its
+ * TCA_ACT_KIND names the action kind and its TCA_ACT_OPTIONS is handed to
+ * the kind's ->init().  When @name is given it names the kind directly and
+ * @nla itself is passed to ->init().
+ *
+ * If the kind is not registered and CONFIG_MODULES is set, the module
+ * "act_<kind>" is requested with the RTNL dropped; if that makes the kind
+ * available, -EAGAIN is returned so the caller replays the request.
+ *
+ * Returns the new tc_action wrapper or an ERR_PTR(): -EINVAL for a missing
+ * or over-long kind name, -ENOENT for an unknown kind, -ENOMEM, -EAGAIN, or
+ * the error from nla_parse_nested() / ->init().
+ */
+struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
+				    char *name, int ovr, int bind)
+{
+	struct tc_action *a;
+	struct tc_action_ops *a_o;
+	char act_name[IFNAMSIZ];
+	struct nlattr *tb[TCA_ACT_MAX + 1];
+	struct nlattr *kind;
+	int err;
+
+	if (name == NULL) {
+		err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL);
+		if (err < 0)
+			goto err_out;
+		err = -EINVAL;
+		kind = tb[TCA_ACT_KIND];
+		if (kind == NULL)
+			goto err_out;
+		/* reject names that would be truncated */
+		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
+			goto err_out;
+	} else {
+		err = -EINVAL;
+		if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
+			goto err_out;
+	}
+
+	/* on success this holds a reference on the action's module */
+	a_o = tc_lookup_action_n(act_name);
+	if (a_o == NULL) {
+#ifdef CONFIG_MODULES
+		rtnl_unlock();
+		request_module("act_%s", act_name);
+		rtnl_lock();
+
+		a_o = tc_lookup_action_n(act_name);
+
+		/* We dropped the RTNL semaphore in order to
+		 * perform the module load.  So, even if we
+		 * succeeded in loading the module we have to
+		 * tell the caller to replay the request.  We
+		 * indicate this using -EAGAIN.
+		 */
+		if (a_o != NULL) {
+			err = -EAGAIN;
+			goto err_mod;
+		}
+#endif
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (a == NULL)
+		goto err_mod;
+
+	/* backward compatibility for policer */
+	if (name == NULL)
+		err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind);
+	else
+		err = a_o->init(nla, est, a, ovr, bind);
+	if (err < 0)
+		goto err_free;
+
+	/* module count goes up only when brand new policy is created
+	 * if it exists and is only bound to in a_o->init() then
+	 * ACT_P_CREATED is not returned (a zero is).
+	 */
+	if (err != ACT_P_CREATED)
+		module_put(a_o->owner);
+	a->ops = a_o;
+
+	return a;
+
+err_free:
+	kfree(a);
+err_mod:
+	/* drop the reference taken by tc_lookup_action_n() */
+	module_put(a_o->owner);
+err_out:
+	return ERR_PTR(err);
+}
+
+/*
+ * Build a list of actions from the nested attribute @nla.
+ *
+ * Attributes 1..TCA_ACT_MAX_PRIO are processed in order until the first
+ * missing slot; each becomes one action whose order is its slot number.
+ *
+ * Returns the list head (NULL when slot 1 is absent), or an ERR_PTR() after
+ * destroying the actions created so far.
+ */
+struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est,
+				  char *name, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+	struct tc_action *head = NULL, *tail = NULL, *act;
+	int err;
+	int i;
+
+	err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+		act = tcf_action_init_1(tb[i], est, name, ovr, bind);
+		if (IS_ERR(act)) {
+			/* undo what was built, hand the error pointer back */
+			if (head != NULL)
+				tcf_action_destroy(head, bind);
+			return act;
+		}
+		act->order = i;
+
+		if (tail != NULL)
+			tail->next = act;
+		else
+			head = act;
+		tail = act;
+	}
+
+	return head;
+}
+
+/*
+ * Append the statistics of action @a to @skb.
+ *
+ * With @compat_mode set, statistics are only emitted (in the old
+ * TCA_STATS/TCA_XSTATS layout) for actions of type TCA_OLD_COMPAT; any other
+ * action is silently skipped.  Without it the TCA_ACT_STATS layout is used.
+ *
+ * Returns 0 on success or when skipped, -1 on any failure (including an
+ * action without private data).
+ */
+int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
+			  int compat_mode)
+{
+	struct tcf_act_hdr *h = a->priv;
+	struct gnet_dump d;
+	int err = 0;
+
+	if (h == NULL)
+		return -1;
+
+	/* compat_mode being true specifies a call that is supposed
+	 * to add additional backward compatibility statistic TLVs.
+	 */
+	if (!compat_mode)
+		err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
+					    &h->tcf_lock, &d);
+	else if (a->type == TCA_OLD_COMPAT)
+		err = gnet_stats_start_copy_compat(skb, 0,
+			TCA_STATS, TCA_XSTATS, &h->tcf_lock, &d);
+	else
+		return 0;
+
+	if (err < 0)
+		return -1;
+
+	if (a->ops != NULL && a->ops->get_stats != NULL &&
+	    a->ops->get_stats(skb, a) < 0)
+		return -1;
+
+	if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0)
+		return -1;
+	if (gnet_stats_copy_rate_est(&d, &h->tcf_bstats,
+				     &h->tcf_rate_est) < 0)
+		return -1;
+	if (gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0)
+		return -1;
+
+	if (gnet_stats_finish_copy(&d) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Build one netlink message of type @event in @skb describing the action
+ * list @a: a tcamsg header followed by a TCA_ACT_TAB nest with the dump of
+ * every action.
+ *
+ * Returns skb->len on success, or -1 with @skb trimmed back to its length
+ * on entry when the message does not fit or the dump fails.
+ */
+static int
+tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
+	     u16 flags, int event, int bind, int ref)
+{
+	struct tcamsg *t;
+	struct nlmsghdr *nlh;
+	/* start of this message; also the rollback point */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+
+	/* NLMSG_NEW presumably jumps to nlmsg_failure when skb is full */
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
+
+	t = NLMSG_DATA(nlh);
+	t->tca_family = AF_UNSPEC;
+	t->tca__pad1 = 0;
+	t->tca__pad2 = 0;
+
+	nest = nla_nest_start(skb, TCA_ACT_TAB);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (tcf_action_dump(skb, a, bind, ref) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	/* message length is only known once all attributes are in place */
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/*
+ * Answer a get request: dump the action list @a into a fresh skb and
+ * unicast it to @pid.
+ *
+ * Returns the result of rtnl_unicast(), -ENOBUFS when no skb could be
+ * allocated, or -EINVAL when the message could not be built.
+ */
+static int
+act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
+	       struct tc_action *a, int event)
+{
+	struct sk_buff *skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+
+	if (skb == NULL)
+		return -ENOBUFS;
+
+	if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) > 0)
+		return rtnl_unicast(skb, net, pid);
+
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+/*
+ * Resolve one action description (kind + index) to an existing action.
+ *
+ * Returns a freshly allocated tc_action wrapper pointing at the action, or
+ * an ERR_PTR(): the nla_parse_nested() error, -EINVAL for a missing or
+ * short TCA_ACT_INDEX, an unknown kind or a kind without ->lookup(),
+ * -ENOMEM, or -ENOENT when no action has that index.  The module reference
+ * taken for the kind lookup is dropped again before returning.
+ */
+static struct tc_action *
+tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
+{
+	struct nlattr *tb[TCA_ACT_MAX + 1];
+	struct tc_action *a;
+	int index;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	if (tb[TCA_ACT_INDEX] == NULL ||
+	    nla_len(tb[TCA_ACT_INDEX]) < sizeof(index))
+		return ERR_PTR(-EINVAL);
+	index = nla_get_u32(tb[TCA_ACT_INDEX]);
+
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (a == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	a->ops = tc_lookup_action(tb[TCA_ACT_KIND]);
+	if (a->ops == NULL) {
+		kfree(a);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (a->ops->lookup == NULL)
+		err = -EINVAL;
+	else if (a->ops->lookup(a, index) == 0)
+		err = -ENOENT;
+	else
+		err = 0;
+
+	/* the lookup reference is dropped on success and failure alike */
+	module_put(a->ops->owner);
+	if (err) {
+		kfree(a);
+		return ERR_PTR(err);
+	}
+	return a;
+}
+
+/*
+ * Free the tc_action wrappers of the list starting at @act.  Only the
+ * wrappers are freed; no ->cleanup() callback is invoked.
+ */
+static void cleanup_a(struct tc_action *act)
+{
+	while (act != NULL) {
+		struct tc_action *next = act->next;
+
+		kfree(act);
+		act = next;
+	}
+}
+
+/*
+ * Allocate a zeroed tc_action wrapper with order @i.  Returns NULL when the
+ * allocation fails.
+ */
+static struct tc_action *create_a(int i)
+{
+	struct tc_action *act = kzalloc(sizeof(*act), GFP_KERNEL);
+
+	if (act != NULL)
+		act->order = i;
+	else
+		pr_debug("create_a: failed to alloc!\n");
+
+	return act;
+}
+
+/*
+ * Flush all actions of one kind and notify RTNLGRP_TC listeners.
+ *
+ * @nla is the nested description of the first action in the request; its
+ * TCA_ACT_KIND selects the action kind whose ->walk() is run with
+ * RTM_DELACTION.  The resulting RTM_DELACTION message is sent with
+ * NLM_F_ROOT set.
+ *
+ * Returns 0 on success or when the walker reports that nothing was
+ * flushed (no notification is sent in that case), -ENOMEM on allocation
+ * failure, -EINVAL for an unknown kind or a message that does not fit, or
+ * the error from nla_parse_nested() / ->walk() / rtnetlink_send().
+ *
+ * tc_lookup_action() takes a reference on the action's module; every exit
+ * taken after a successful lookup drops it again.
+ */
+static int tca_action_flush(struct net *net, struct nlattr *nla,
+			    struct nlmsghdr *n, u32 pid)
+{
+	struct sk_buff *skb;
+	unsigned char *b;
+	struct nlmsghdr *nlh;
+	struct tcamsg *t;
+	/* passed to ->walk() uninitialised; tcf_generic_walker() in this
+	 * file does not hand it to the RTM_DELACTION walker
+	 */
+	struct netlink_callback dcb;
+	struct nlattr *nest;
+	struct nlattr *tb[TCA_ACT_MAX + 1];
+	struct nlattr *kind;
+	struct tc_action *a = create_a(0);
+	int err = -ENOMEM;
+
+	if (a == NULL) {
+		pr_debug("tca_action_flush: couldnt create tc_action\n");
+		return err;
+	}
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb) {
+		pr_debug("tca_action_flush: failed skb alloc\n");
+		kfree(a);
+		return err;
+	}
+
+	b = skb_tail_pointer(skb);
+
+	err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL);
+	if (err < 0)
+		goto err_out;
+
+	err = -EINVAL;
+	kind = tb[TCA_ACT_KIND];
+	a->ops = tc_lookup_action(kind);
+	if (a->ops == NULL)
+		goto err_out;
+
+	/* NLMSG_PUT presumably jumps to nlmsg_failure when skb is full */
+	nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
+	t = NLMSG_DATA(nlh);
+	t->tca_family = AF_UNSPEC;
+	t->tca__pad1 = 0;
+	t->tca__pad2 = 0;
+
+	nest = nla_nest_start(skb, TCA_ACT_TAB);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	err = a->ops->walk(skb, &dcb, RTM_DELACTION, a);
+	/*
+	 * Walker failure (< 0) or nothing flushed (0): in both cases the
+	 * module reference from tc_lookup_action() must be dropped before
+	 * bailing out.  The "nothing flushed" case used to skip that.
+	 */
+	if (err <= 0)
+		goto out_module_put;
+
+	nla_nest_end(skb, nest);
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	nlh->nlmsg_flags |= NLM_F_ROOT;
+	module_put(a->ops->owner);
+	kfree(a);
+	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+			     n->nlmsg_flags & NLM_F_ECHO);
+	if (err > 0)
+		return 0;
+
+	return err;
+
+nla_put_failure:
+nlmsg_failure:
+out_module_put:
+	module_put(a->ops->owner);
+err_out:
+	kfree_skb(skb);
+	kfree(a);
+	return err;
+}
+
+/*
+ * Handle RTM_GETACTION and RTM_DELACTION requests.
+ *
+ * A delete with NLM_F_ROOT set is a flush of the kind named by the first
+ * entry.  Otherwise every entry in @nla is resolved to an existing action
+ * and the resulting list is either reported back to the requester (get) or
+ * dumped into a notification and then destroyed (delete).
+ *
+ * Returns 0 or a negative errno.
+ */
+static int
+tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+	      u32 pid, int event)
+{
+	int i, ret;
+	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+	struct tc_action *head = NULL, *act, *act_prev = NULL;
+
+	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
+	if (ret < 0)
+		return ret;
+
+	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
+		if (tb[1] != NULL)
+			return tca_action_flush(net, tb[1], n, pid);
+		else
+			return -EINVAL;
+	}
+
+	/* build a list of wrappers, one per requested action */
+	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
+		act = tcf_action_get_1(tb[i], n, pid);
+		if (IS_ERR(act)) {
+			ret = PTR_ERR(act);
+			goto err;
+		}
+		act->order = i;
+
+		if (head == NULL)
+			head = act;
+		else
+			act_prev->next = act;
+		act_prev = act;
+	}
+
+	if (event == RTM_GETACTION)
+		/* falls through to err: below to free the wrappers */
+		ret = act_get_notify(net, pid, n, head, event);
+	else { /* delete */
+		struct sk_buff *skb;
+
+		skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+		if (!skb) {
+			ret = -ENOBUFS;
+			goto err;
+		}
+
+		/* dump the actions before they are destroyed */
+		if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event,
+				 0, 1) <= 0) {
+			kfree_skb(skb);
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/* now do the delete */
+		/* tcf_action_destroy() also frees the wrappers in the list */
+		tcf_action_destroy(head, 0);
+		ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+				     n->nlmsg_flags & NLM_F_ECHO);
+		if (ret > 0)
+			return 0;
+		return ret;
+	}
+err:
+	/* frees only the wrappers; the actions themselves stay in place */
+	cleanup_a(head);
+	return ret;
+}
+
+/*
+ * Build a netlink message of type @event describing the action list @a and
+ * send it to the RTNLGRP_TC group (echoing to @pid when NLM_F_ECHO is set
+ * in @flags).
+ *
+ * Returns 0 on success, -ENOBUFS when no skb could be allocated, -1 when
+ * the message could not be built, or the negative result of
+ * rtnetlink_send().
+ */
+static int tcf_add_notify(struct net *net, struct tc_action *a,
+			  u32 pid, u32 seq, int event, u16 flags)
+{
+	struct tcamsg *t;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	struct nlattr *nest;
+	unsigned char *b;
+	int err = 0;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	b = skb_tail_pointer(skb);
+
+	/* NLMSG_NEW presumably jumps to nlmsg_failure when skb is full */
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
+	t = NLMSG_DATA(nlh);
+	t->tca_family = AF_UNSPEC;
+	t->tca__pad1 = 0;
+	t->tca__pad2 = 0;
+
+	nest = nla_nest_start(skb, TCA_ACT_TAB);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (tcf_action_dump(skb, a, 0, 0) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	NETLINK_CB(skb).dst_group = RTNLGRP_TC;
+
+	err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO);
+	/* positive results are folded into plain success */
+	if (err > 0)
+		err = 0;
+	return err;
+
+nla_put_failure:
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+
+/*
+ * Handle RTM_NEWACTION: create the actions described by @nla and announce
+ * them.
+ *
+ * Returns 0 when @nla describes no action, the error from
+ * tcf_action_init(), or the result of tcf_add_notify().
+ */
+static int
+tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
+	       u32 pid, int ovr)
+{
+	struct tc_action *act;
+	u32 seq = n->nlmsg_seq;
+	int ret;
+
+	act = tcf_action_init(nla, NULL, NULL, ovr, 0);
+	if (act == NULL)
+		return 0;
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+
+	/* dump then free all the actions after update; inserted policy
+	 * stays intact
+	 */
+	ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
+	cleanup_a(act);
+
+	return ret;
+}
+
+/*
+ * rtnetlink handler for RTM_NEWACTION, RTM_DELACTION and RTM_GETACTION.
+ *
+ * Parses the request, insists on a TCA_ACT_TAB attribute and dispatches on
+ * the message type.  A new-action request that comes back with -EAGAIN is
+ * replayed until it yields another result.  Returns 0 or a negative errno.
+ */
+static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tca[TCA_ACT_MAX + 1];
+	u32 pid = skb ? NETLINK_CB(skb).pid : 0;
+	int ovr = 0;
+	int ret;
+
+	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL);
+	if (ret < 0)
+		return ret;
+
+	if (!tca[TCA_ACT_TAB]) {
+		pr_notice("tc_ctl_action: received NO action attribs\n");
+		return -EINVAL;
+	}
+
+	/* n->nlmsg_flags & NLM_F_CREATE */
+	switch (n->nlmsg_type) {
+	case RTM_NEWACTION:
+		/* we are going to assume all other flags
+		 * imply create only if it doesn't exist
+		 * Note that CREATE | EXCL implies that
+		 * but since we want avoid ambiguity (eg when flags
+		 * is zero) then just set this
+		 */
+		if (n->nlmsg_flags & NLM_F_REPLACE)
+			ovr = 1;
+		do {
+			ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid,
+					     ovr);
+		} while (ret == -EAGAIN);
+		break;
+	case RTM_DELACTION:
+	case RTM_GETACTION:
+		/* get and delete share one handler, keyed by the type */
+		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
+				    pid, n->nlmsg_type);
+		break;
+	default:
+		BUG();
+	}
+
+	return ret;
+}
+
+/*
+ * Extract the TCA_ACT_KIND attribute of the first action entry in a dump
+ * request.  Returns NULL when the message cannot be parsed or lacks the
+ * action table or its first entry; the result may also be NULL when the
+ * entry carries no kind.
+ */
+static struct nlattr *
+find_dump_kind(const struct nlmsghdr *n)
+{
+	struct nlattr *nla[TCAA_MAX + 1];
+	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+	struct nlattr *tb2[TCA_ACT_MAX + 1];
+	struct nlattr *tab;
+
+	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0)
+		return NULL;
+
+	tab = nla[TCA_ACT_TAB];
+	if (tab == NULL)
+		return NULL;
+
+	/* first level: the per-priority action entries */
+	if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tab),
+		      NLMSG_ALIGN(nla_len(tab)), NULL) < 0 ||
+	    tb[1] == NULL)
+		return NULL;
+
+	/* second level: the attributes of the first entry */
+	if (nla_parse(tb2, TCA_ACT_MAX, nla_data(tb[1]),
+		      nla_len(tb[1]), NULL) < 0)
+		return NULL;
+
+	return tb2[TCA_ACT_KIND];
+}
+
+/*
+ * rtnetlink dump callback for RTM_GETACTION: dump the actions of the kind
+ * named in the request into @skb via the kind's ->walk().
+ *
+ * Returns 0 when the request names no (known) kind, otherwise skb->len,
+ * both on success and after a failure that trimmed this message from @skb.
+ */
+static int
+tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh;
+	/* start of this message; also the rollback point */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+	struct tc_action_ops *a_o;
+	struct tc_action a;
+	int ret = 0;
+	/* this initial value is overwritten below before t is used */
+	struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
+	struct nlattr *kind = find_dump_kind(cb->nlh);
+
+	if (kind == NULL) {
+		pr_info("tc_dump_action: action bad kind\n");
+		return 0;
+	}
+
+	/* holds a module reference until one of the module_put() calls below */
+	a_o = tc_lookup_action(kind);
+	if (a_o == NULL)
+		return 0;
+
+	/* scratch action handed to the walker */
+	memset(&a, 0, sizeof(struct tc_action));
+	a.ops = a_o;
+
+	if (a_o->walk == NULL) {
+		WARN(1, "tc_dump_action: %s !capable of dumping table\n",
+		     a_o->kind);
+		goto nla_put_failure;
+	}
+
+	/* NLMSG_PUT presumably jumps to nlmsg_failure when skb is full */
+	nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*t));
+	t = NLMSG_DATA(nlh);
+	t->tca_family = AF_UNSPEC;
+	t->tca__pad1 = 0;
+	t->tca__pad2 = 0;
+
+	nest = nla_nest_start(skb, TCA_ACT_TAB);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
+	if (ret < 0)
+		goto nla_put_failure;
+
+	if (ret > 0) {
+		nla_nest_end(skb, nest);
+		ret = skb->len;
+	} else
+		/* nothing dumped: drop the empty table nest */
+		nla_nest_cancel(skb, nest);
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	if (NETLINK_CB(cb->skb).pid && ret)
+		nlh->nlmsg_flags |= NLM_F_MULTI;
+	module_put(a_o->owner);
+	return skb->len;
+
+nla_put_failure:
+nlmsg_failure:
+	module_put(a_o->owner);
+	nlmsg_trim(skb, b);
+	return skb->len;
+}
+
+/*
+ * Register the rtnetlink handlers for the three action message types.  All
+ * of them go through tc_ctl_action(); only RTM_GETACTION also gets a dump
+ * callback.
+ */
+static int __init tc_action_init(void)
+{
+	rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action);
+
+	return 0;
+}
+
+subsys_initcall(tc_action_init);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
new file mode 100644
index 00000000..6cdf9abe
--- /dev/null
+++ b/net/sched/act_csum.c
@@ -0,0 +1,594 @@
+/*
+ * Checksum updating actions
+ *
+ * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <linux/netlink.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/igmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+
+#include <net/act_api.h>
+
+#include <linux/tc_act/tc_csum.h>
+#include <net/tc_act/tc_csum.h>
+
+/* Hash table of csum action instances: CSUM_TAB_MASK + 1 buckets. */
+#define CSUM_TAB_MASK 15
+static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1];
+/* Index generator handed to tcf_hash_create() for new instances. */
+static u32 csum_idx_gen;
+/* Lock published through csum_hash_info.lock. */
+static DEFINE_RWLOCK(csum_lock);
+
+/* Table, mask and lock bundled for the generic tcf_hash_* helpers. */
+static struct tcf_hashinfo csum_hash_info = {
+	.htab	= tcf_csum_ht,
+	.hmask	= CSUM_TAB_MASK,
+	.lock	= &csum_lock,
+};
+
+/* Netlink policy: TCA_CSUM_PARMS is sized for a struct tc_csum. */
+static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
+	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
+};
+
+/*
+ * Create a csum action or update an existing one from netlink attributes.
+ *
+ * Returns ACT_P_CREATED when a new instance was created, 0 when an existing
+ * instance was updated, -EEXIST when the instance exists and @ovr is not
+ * set, and a negative errno for malformed attributes or creation failure.
+ */
+static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_CSUM_MAX + 1];
+	struct tcf_common *pc;
+	struct tc_csum *parm;
+	struct tcf_csum *p;
+	int created = 0;
+	int err;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_CSUM_PARMS])
+		return -EINVAL;
+	parm = nla_data(tb[TCA_CSUM_PARMS]);
+
+	pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info);
+	if (pc) {
+		/* already present: only touch it when override was asked */
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &csum_hash_info);
+			return -EEXIST;
+		}
+	} else {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
+				     &csum_idx_gen, &csum_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+		created = 1;
+	}
+	p = to_tcf_csum(pc);
+
+	/* publish the new parameters under the per-action lock */
+	spin_lock_bh(&p->tcf_lock);
+	p->tcf_action = parm->action;
+	p->update_flags = parm->update_flags;
+	spin_unlock_bh(&p->tcf_lock);
+
+	if (!created)
+		return 0;
+
+	/* a fresh instance becomes visible only once fully set up */
+	tcf_hash_insert(pc, &csum_hash_info);
+	return ACT_P_CREATED;
+}
+
+/* Drop one reference (and one binding when @bind is set) on the action. */
+static int tcf_csum_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_csum *csum = a->priv;
+	struct tcf_common *pc = &csum->common;
+
+	return tcf_hash_release(pc, bind, &csum_hash_info);
+}
+
+/**
+ * tcf_csum_skb_nextlayer - locate the header that follows the summed ones
+ * @skb: sk_buff to inspect
+ * @ihl: length of the headers already accounted for
+ * @ipl: complete packet length, counted from the network header
+ * @jhl: length of the header expected next
+ *
+ * Makes sure @ipl bytes can be pulled, that the packet is long enough to
+ * hold both headers, and that a cloned skb is writable over that range
+ * (expanding the head if necessary).
+ * Returns a pointer @ihl bytes past the network header, or NULL on failure.
+ */
+static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
+				    unsigned int ihl, unsigned int ipl,
+				    unsigned int jhl)
+{
+	int ntkoff = skb_network_offset(skb);
+	int hl = ihl + jhl;
+
+	if (!pskb_may_pull(skb, ipl + ntkoff))
+		return NULL;
+
+	if (ipl < hl)
+		return NULL;
+
+	/* a clone we cannot write to must get a private head first */
+	if (skb_cloned(skb) &&
+	    !skb_clone_writable(skb, hl + ntkoff) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		return NULL;
+
+	return (void *)(skb_network_header(skb) + ihl);
+}
+
+/*
+ * Recompute the ICMP checksum over everything after the IPv4 header.
+ * Returns 1 on success, 0 if the ICMP header is not available/writable.
+ */
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
+			      unsigned int ihl, unsigned int ipl)
+{
+	struct icmphdr *hdr = tcf_csum_skb_nextlayer(skb, ihl, ipl,
+						     sizeof(*hdr));
+
+	if (!hdr)
+		return 0;
+
+	/* the checksum field must read as zero while summing */
+	hdr->checksum = 0;
+	skb->csum = csum_partial(hdr, ipl - ihl, 0);
+	hdr->checksum = csum_fold(skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+	return 1;
+}
+
+/*
+ * Recompute the IGMP checksum over everything after the IPv4 header.
+ * Returns 1 on success, 0 if the IGMP header is not available/writable.
+ */
+static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
+			      unsigned int ihl, unsigned int ipl)
+{
+	struct igmphdr *hdr = tcf_csum_skb_nextlayer(skb, ihl, ipl,
+						     sizeof(*hdr));
+
+	if (!hdr)
+		return 0;
+
+	/* the checksum field must read as zero while summing */
+	hdr->csum = 0;
+	skb->csum = csum_partial(hdr, ipl - ihl, 0);
+	hdr->csum = csum_fold(skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+	return 1;
+}
+
+/*
+ * Recompute the ICMPv6 checksum (payload plus IPv6 pseudo-header).
+ * @skb:  packet being rewritten
+ * @ip6h: IPv6 header as seen by the caller; reloaded internally
+ * @ihl:  offset of the ICMPv6 header from the network header
+ * @ipl:  complete packet length counted from the network header
+ *
+ * Returns 1 on success, 0 if the ICMPv6 header is not available/writable.
+ */
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+			      unsigned int ihl, unsigned int ipl)
+{
+	struct icmp6hdr *icmp6h;
+
+	icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
+	if (icmp6h == NULL)
+		return 0;
+
+	/*
+	 * tcf_csum_skb_nextlayer() may pull or expand the skb head, which
+	 * can move the packet data; reload the IPv6 header instead of
+	 * trusting the pointer the caller took earlier.
+	 */
+	ip6h = ipv6_hdr(skb);
+
+	icmp6h->icmp6_cksum = 0;
+	skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
+	icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					      ipl - ihl, IPPROTO_ICMPV6,
+					      skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+/*
+ * Recompute the TCP checksum of an IPv4 packet.
+ * @skb: packet being rewritten
+ * @iph: IPv4 header as seen by the caller; reloaded internally
+ * @ihl: IPv4 header length in bytes
+ * @ipl: complete packet length counted from the network header
+ *
+ * Returns 1 on success, 0 if the TCP header is not available/writable.
+ */
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
+			     unsigned int ihl, unsigned int ipl)
+{
+	struct tcphdr *tcph;
+
+	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+	if (tcph == NULL)
+		return 0;
+
+	/*
+	 * tcf_csum_skb_nextlayer() may pull or expand the skb head, which
+	 * can move the packet data; reload the IPv4 header instead of
+	 * trusting the pointer the caller took earlier.
+	 */
+	iph = ip_hdr(skb);
+
+	tcph->check = 0;
+	skb->csum = csum_partial(tcph, ipl - ihl, 0);
+	tcph->check = tcp_v4_check(ipl - ihl,
+				   iph->saddr, iph->daddr, skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+/*
+ * Recompute the TCP checksum of an IPv6 packet.
+ * @skb:  packet being rewritten
+ * @ip6h: IPv6 header as seen by the caller; reloaded internally
+ * @ihl:  offset of the TCP header from the network header
+ * @ipl:  complete packet length counted from the network header
+ *
+ * Returns 1 on success, 0 if the TCP header is not available/writable.
+ */
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int ihl, unsigned int ipl)
+{
+	struct tcphdr *tcph;
+
+	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+	if (tcph == NULL)
+		return 0;
+
+	/*
+	 * tcf_csum_skb_nextlayer() may pull or expand the skb head, which
+	 * can move the packet data; reload the IPv6 header instead of
+	 * trusting the pointer the caller took earlier.
+	 */
+	ip6h = ipv6_hdr(skb);
+
+	tcph->check = 0;
+	skb->csum = csum_partial(tcph, ipl - ihl, 0);
+	tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+				      ipl - ihl, IPPROTO_TCP,
+				      skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+/*
+ * Recompute the UDP or UDP-Lite checksum of an IPv4 packet.
+ * @skb:     packet being rewritten
+ * @iph:     IPv4 header as seen by the caller; reloaded internally
+ * @ihl:     IPv4 header length in bytes
+ * @ipl:     complete packet length counted from the network header
+ * @udplite: non-zero to apply the UDP-Lite coverage rules
+ *
+ * Plain UDP datagrams whose checksum field is zero are left without a
+ * checksum.  Returns 0 only if the UDP header is not available/writable;
+ * packets with an inconsistent length field are ignored and 1 is returned.
+ */
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
+			     unsigned int ihl, unsigned int ipl, int udplite)
+{
+	struct udphdr *udph;
+	u16 ul;
+
+	/*
+	 * Support both UDP and UDPLITE checksum algorithms, Don't use
+	 * udph->len to get the real length without any protocol check,
+	 * UDPLITE uses udph->len for another thing,
+	 * Use iph->tot_len, or just ipl.
+	 */
+
+	udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+	if (udph == NULL)
+		return 0;
+
+	/*
+	 * tcf_csum_skb_nextlayer() may pull or expand the skb head, which
+	 * can move the packet data; reload the IPv4 header instead of
+	 * trusting the pointer the caller took earlier.
+	 */
+	iph = ip_hdr(skb);
+
+	ul = ntohs(udph->len);
+
+	if (udplite || udph->check) {
+
+		/*
+		 * NOTE(review): the field stays zeroed if the length checks
+		 * below send us to ignore_obscure_skb.
+		 */
+		udph->check = 0;
+
+		if (udplite) {
+			/* length field 0: sum the whole datagram */
+			if (ul == 0)
+				skb->csum = csum_partial(udph, ipl - ihl, 0);
+			else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+				skb->csum = csum_partial(udph, ul, 0);
+			else
+				goto ignore_obscure_skb;
+		} else {
+			if (ul != ipl - ihl)
+				goto ignore_obscure_skb;
+
+			skb->csum = csum_partial(udph, ul, 0);
+		}
+
+		udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+						ul, iph->protocol,
+						skb->csum);
+
+		/* a computed zero is sent as CSUM_MANGLED_0 instead */
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+	return 1;
+}
+
+/*
+ * Recompute the UDP or UDP-Lite checksum of an IPv6 packet.
+ * @skb:     packet being rewritten
+ * @ip6h:    IPv6 header as seen by the caller; reloaded internally
+ * @ihl:     offset of the UDP header from the network header
+ * @ipl:     complete packet length counted from the network header
+ * @udplite: non-zero to apply the UDP-Lite coverage rules
+ *
+ * Returns 0 only if the UDP header is not available/writable; packets with
+ * an inconsistent length field are ignored and 1 is returned.
+ */
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int ihl, unsigned int ipl, int udplite)
+{
+	struct udphdr *udph;
+	u16 ul;
+
+	/*
+	 * Support both UDP and UDPLITE checksum algorithms, Don't use
+	 * udph->len to get the real length without any protocol check,
+	 * UDPLITE uses udph->len for another thing,
+	 * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
+	 */
+
+	udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+	if (udph == NULL)
+		return 0;
+
+	/*
+	 * tcf_csum_skb_nextlayer() may pull or expand the skb head, which
+	 * can move the packet data; reload the IPv6 header instead of
+	 * trusting the pointer the caller took earlier.
+	 */
+	ip6h = ipv6_hdr(skb);
+
+	ul = ntohs(udph->len);
+
+	/*
+	 * NOTE(review): the field stays zeroed if the length checks below
+	 * send us to ignore_obscure_skb.
+	 */
+	udph->check = 0;
+
+	if (udplite) {
+		/* length field 0: sum the whole datagram */
+		if (ul == 0)
+			skb->csum = csum_partial(udph, ipl - ihl, 0);
+
+		else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+			skb->csum = csum_partial(udph, ul, 0);
+
+		else
+			goto ignore_obscure_skb;
+	} else {
+		if (ul != ipl - ihl)
+			goto ignore_obscure_skb;
+
+		skb->csum = csum_partial(udph, ul, 0);
+	}
+
+	udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
+				      udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
+				      skb->csum);
+
+	/* a computed zero is sent as CSUM_MANGLED_0 instead */
+	if (!udph->check)
+		udph->check = CSUM_MANGLED_0;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+	return 1;
+}
+
+/*
+ * Update the checksums of an IPv4 packet as selected by @update_flags.
+ *
+ * The transport checksum is recomputed for unfragmented (offset 0) ICMP,
+ * IGMP, TCP, UDP and UDP-Lite packets when the matching flag is set; the
+ * IPv4 header checksum is recomputed when TCA_CSUM_UPDATE_FLAG_IPV4HDR is
+ * set.  Returns 1 on success, 0 if the packet could not be pulled or made
+ * writable.
+ */
+static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
+{
+	struct iphdr *iph;
+	int ntkoff;
+
+	ntkoff = skb_network_offset(skb);
+
+	if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
+		goto fail;
+
+	iph = ip_hdr(skb);
+
+	/* non-first fragments carry no transport header: select no case */
+	switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+	case IPPROTO_ICMP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+			if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
+						ntohs(iph->tot_len)))
+				goto fail;
+		break;
+	case IPPROTO_IGMP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
+			if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
+						ntohs(iph->tot_len)))
+				goto fail;
+		break;
+	case IPPROTO_TCP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+			if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4,
+					       ntohs(iph->tot_len)))
+				goto fail;
+		break;
+	case IPPROTO_UDP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+					       ntohs(iph->tot_len), 0))
+				goto fail;
+		break;
+	case IPPROTO_UDPLITE:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+					       ntohs(iph->tot_len), 1))
+				goto fail;
+		break;
+	}
+
+	if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
+		if (skb_cloned(skb) &&
+		    !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
+		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			goto fail;
+
+		/*
+		 * Both the transport helpers above and pskb_expand_head()
+		 * may have reallocated the skb head, so the iph pointer
+		 * taken at the top can be stale: fetch the header again.
+		 */
+		ip_send_check(ip_hdr(skb));
+	}
+
+	return 1;
+
+fail:
+	return 0;
+}
+
+/*
+ * Scan a hop-by-hop options header for a jumbo payload option.
+ * @ip6xh: start of the hop-by-hop extension header
+ * @ixhl:  length of that header in bytes
+ * @pl:    payload length; overwritten with the jumbo length if one is found
+ *
+ * Returns 0 if a jumbo option has a wrong length or alignment, 1 otherwise
+ * (including when scanning stops early at an option that overruns the
+ * header).
+ */
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
+				 unsigned int ixhl, unsigned int *pl)
+{
+	int off, len, optlen;
+	unsigned char *xh = (void *)ip6xh;
+
+	/* options start right after the fixed part of the header */
+	off = sizeof(*ip6xh);
+	len = ixhl - off;
+
+	/* every option except PAD0 needs at least a type and a length byte */
+	while (len > 1) {
+		switch (xh[off]) {
+		case IPV6_TLV_PAD0:
+			/* single padding byte, no length field */
+			optlen = 1;
+			break;
+		case IPV6_TLV_JUMBO:
+			optlen = xh[off + 1] + 2;
+			if (optlen != 6 || len < 6 || (off & 3) != 2)
+				/* wrong jumbo option length/alignment */
+				return 0;
+			/* 32-bit jumbo payload length follows type and length */
+			*pl = ntohl(*(__be32 *)(xh + off + 2));
+			goto done;
+		default:
+			optlen = xh[off + 1] + 2;
+			if (optlen > len)
+				/* ignore obscure options */
+				goto done;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+
+done:
+	return 1;
+}
+
+/*
+ * Update the transport checksum of an IPv6 packet as selected by
+ * @update_flags.
+ *
+ * Walks routing, hop-by-hop and destination extension headers until a
+ * supported transport protocol is found.  Fragments and unknown next
+ * headers are left untouched.  Returns 1 on success or when the packet is
+ * ignored, 0 if the packet could not be pulled/made writable or carries a
+ * malformed jumbo option.
+ */
+static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
+{
+	struct ipv6hdr *ip6h;
+	struct ipv6_opt_hdr *ip6xh;
+	unsigned int hl, ixhl;
+	unsigned int pl;
+	int ntkoff;
+	u8 nexthdr;
+
+	ntkoff = skb_network_offset(skb);
+
+	hl = sizeof(*ip6h);
+
+	if (!pskb_may_pull(skb, hl + ntkoff))
+		goto fail;
+
+	ip6h = ipv6_hdr(skb);
+
+	pl = ntohs(ip6h->payload_len);
+	nexthdr = ip6h->nexthdr;
+
+	do {
+		switch (nexthdr) {
+		case NEXTHDR_FRAGMENT:
+			goto ignore_skb;
+		case NEXTHDR_ROUTING:
+		case NEXTHDR_HOP:
+		case NEXTHDR_DEST:
+			if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
+				goto fail;
+			ip6xh = (void *)(skb_network_header(skb) + hl);
+			ixhl = ipv6_optlen(ip6xh);
+			if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
+				goto fail;
+			/*
+			 * The pull above may have reallocated the skb head;
+			 * recompute the extension header pointer before it
+			 * is dereferenced again.
+			 */
+			ip6xh = (void *)(skb_network_header(skb) + hl);
+			if ((nexthdr == NEXTHDR_HOP) &&
+			    !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
+				goto fail;
+			nexthdr = ip6xh->nexthdr;
+			hl += ixhl;
+			break;
+		/*
+		 * The transport helpers get a freshly loaded IPv6 header:
+		 * the ip6h taken above may be stale after the pulls done
+		 * while walking the extension headers.
+		 */
+		case IPPROTO_ICMPV6:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+				if (!tcf_csum_ipv6_icmp(skb, ipv6_hdr(skb),
+							hl, pl + sizeof(*ip6h)))
+					goto fail;
+			goto done;
+		case IPPROTO_TCP:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+				if (!tcf_csum_ipv6_tcp(skb, ipv6_hdr(skb),
+						       hl, pl + sizeof(*ip6h)))
+					goto fail;
+			goto done;
+		case IPPROTO_UDP:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+				if (!tcf_csum_ipv6_udp(skb, ipv6_hdr(skb), hl,
+						       pl + sizeof(*ip6h), 0))
+					goto fail;
+			goto done;
+		case IPPROTO_UDPLITE:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+				if (!tcf_csum_ipv6_udp(skb, ipv6_hdr(skb), hl,
+						       pl + sizeof(*ip6h), 1))
+					goto fail;
+			goto done;
+		default:
+			goto ignore_skb;
+		}
+	} while (pskb_may_pull(skb, hl + 1 + ntkoff));
+
+done:
+ignore_skb:
+	return 1;
+
+fail:
+	return 0;
+}
+
+/*
+ * Packet-path entry point of the csum action: account the packet, then
+ * refresh the checksums of IPv4/IPv6 packets according to the configured
+ * flags.  Returns the configured action, or TC_ACT_SHOT (counting a drop)
+ * when that action is TC_ACT_SHOT or the checksum update fails.
+ */
+static int tcf_csum(struct sk_buff *skb,
+		    struct tc_action *a, struct tcf_result *res)
+{
+	struct tcf_csum *p = a->priv;
+	u32 update_flags;
+	int action;
+	int ok = 1;
+
+	/* snapshot the configuration and update statistics under the lock */
+	spin_lock(&p->tcf_lock);
+	p->tcf_tm.lastuse = jiffies;
+	bstats_update(&p->tcf_bstats, skb);
+	action = p->tcf_action;
+	update_flags = p->update_flags;
+	spin_unlock(&p->tcf_lock);
+
+	if (unlikely(action == TC_ACT_SHOT))
+		ok = 0;
+	else if (skb->protocol == cpu_to_be16(ETH_P_IP))
+		ok = tcf_csum_ipv4(skb, update_flags);
+	else if (skb->protocol == cpu_to_be16(ETH_P_IPV6))
+		ok = tcf_csum_ipv6(skb, update_flags);
+
+	if (ok)
+		return action;
+
+	spin_lock(&p->tcf_lock);
+	p->tcf_qstats.drops++;
+	spin_unlock(&p->tcf_lock);
+	return TC_ACT_SHOT;
+}
+
+/*
+ * Dump the csum action's parameters (TCA_CSUM_PARMS) and timestamps
+ * (TCA_CSUM_TM) into @skb.  @bind and @ref are subtracted from the reported
+ * bind and reference counts.  Returns skb->len on success, -1 after
+ * trimming the skb back when an attribute does not fit.
+ */
+static int tcf_csum_dump(struct sk_buff *skb,
+			 struct tc_action *a, int bind, int ref)
+{
+	/* remember the current tail so a failed dump can be undone */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_csum *p = a->priv;
+	struct tc_csum opt = {
+		.update_flags = p->update_flags,
+		.index   = p->tcf_index,
+		.action  = p->tcf_action,
+		.refcnt  = p->tcf_refcnt - ref,
+		.bindcnt = p->tcf_bindcnt - bind,
+	};
+	struct tcf_t t;
+
+	/* NOTE(review): NLA_PUT appears to jump to nla_put_failure on lack of room */
+	NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt);
+	/* install and lastuse are reported as ages relative to now */
+	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+	NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/*
+ * Operations table for the "csum" action kind; lookup and walk use the
+ * generic hash-table helpers.
+ */
+static struct tc_action_ops act_csum_ops = {
+	.kind		= "csum",
+	.hinfo		= &csum_hash_info,
+	.type		= TCA_ACT_CSUM,
+	.capab		= TCA_CAP_NONE,
+	.owner		= THIS_MODULE,
+	.act		= tcf_csum,
+	.dump		= tcf_csum_dump,
+	.cleanup	= tcf_csum_cleanup,
+	.lookup		= tcf_hash_search,
+	.init		= tcf_csum_init,
+	.walk		= tcf_generic_walker
+};
+
+MODULE_DESCRIPTION("Checksum updating actions");
+MODULE_LICENSE("GPL");
+
+/* Module entry point: register the csum action kind. */
+static int __init csum_init_module(void)
+{
+	int err = tcf_register_action(&act_csum_ops);
+
+	return err;
+}
+
+/* Module exit point: unregister the csum action kind. */
+static void __exit csum_cleanup_module(void)
+{
+	tcf_unregister_action(&act_csum_ops);
+}
+
+module_init(csum_init_module);
+module_exit(csum_cleanup_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
new file mode 100644
index 00000000..2b4ab4b0
--- /dev/null
+++ b/net/sched/act_gact.c
@@ -0,0 +1,221 @@
+/*
+ * net/sched/gact.c	Generic actions
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright 	Jamal Hadi Salim (2002-4)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_gact.h>
+#include <net/tc_act/tc_gact.h>
+
+/* Hash table of gact action instances: GACT_TAB_MASK + 1 buckets. */
+#define GACT_TAB_MASK	15
+static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1];
+/* Index generator handed to tcf_hash_create() for new instances. */
+static u32 gact_idx_gen;
+/* Lock published through gact_hash_info.lock. */
+static DEFINE_RWLOCK(gact_lock);
+
+/* Table, mask and lock bundled for the generic tcf_hash_* helpers. */
+static struct tcf_hashinfo gact_hash_info = {
+	.htab	=	tcf_gact_ht,
+	.hmask	=	GACT_TAB_MASK,
+	.lock	=	&gact_lock,
+};
+
+#ifdef CONFIG_GACT_PROB
+/*
+ * Random selection: return the alternate action when a random number is a
+ * multiple of tcfg_pval, the regular action otherwise (or when tcfg_pval
+ * is zero).
+ */
+static int gact_net_rand(struct tcf_gact *gact)
+{
+	if (gact->tcfg_pval && !(net_random() % gact->tcfg_pval))
+		return gact->tcfg_paction;
+
+	return gact->tcf_action;
+}
+
+/*
+ * Deterministic selection: return the alternate action whenever the packet
+ * counter is a multiple of tcfg_pval, the regular action otherwise (or when
+ * tcfg_pval is zero).
+ */
+static int gact_determ(struct tcf_gact *gact)
+{
+	if (gact->tcfg_pval &&
+	    !(gact->tcf_bstats.packets % gact->tcfg_pval))
+		return gact->tcfg_paction;
+
+	return gact->tcf_action;
+}
+
+/*
+ * Selector functions indexed by tcfg_ptype; entry 0 is NULL and the table
+ * has MAX_RAND entries, so only indices below MAX_RAND are valid.
+ */
+typedef int (*g_rand)(struct tcf_gact *gact);
+static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
+#endif /* CONFIG_GACT_PROB */
+
+/*
+ * Netlink policy: TCA_GACT_PARMS is sized for a struct tc_gact and
+ * TCA_GACT_PROB for a struct tc_gact_p.
+ */
+static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
+	[TCA_GACT_PARMS]	= { .len = sizeof(struct tc_gact) },
+	[TCA_GACT_PROB]		= { .len = sizeof(struct tc_gact_p) },
+};
+
+/*
+ * Create a gact action or update an existing one from netlink attributes.
+ *
+ * Returns ACT_P_CREATED when a new instance was created, 0 when an existing
+ * instance was updated, -EEXIST when the instance exists and @ovr is not
+ * set, -EOPNOTSUPP when probability parameters are supplied but
+ * CONFIG_GACT_PROB is off, and -EINVAL for malformed attributes, including
+ * a probability type outside the gact_rand[] table.
+ */
+static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_GACT_MAX + 1];
+	struct tc_gact *parm;
+	struct tcf_gact *gact;
+	struct tcf_common *pc;
+	int ret = 0;
+	int err;
+#ifdef CONFIG_GACT_PROB
+	struct tc_gact_p *p_parm = NULL;
+#endif
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_GACT_PARMS] == NULL)
+		return -EINVAL;
+	parm = nla_data(tb[TCA_GACT_PARMS]);
+
+#ifndef CONFIG_GACT_PROB
+	if (tb[TCA_GACT_PROB] != NULL)
+		return -EOPNOTSUPP;
+#else
+	if (tb[TCA_GACT_PROB] != NULL) {
+		p_parm = nla_data(tb[TCA_GACT_PROB]);
+		/*
+		 * ptype is later used by tcf_gact() as an index into
+		 * gact_rand[MAX_RAND]; reject out-of-range values before
+		 * anything is created or modified.
+		 */
+		if (p_parm->ptype >= MAX_RAND)
+			return -EINVAL;
+	}
+#endif
+
+	pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
+				     bind, &gact_idx_gen, &gact_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		/* already present: only touch it when override was asked */
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &gact_hash_info);
+			return -EEXIST;
+		}
+	}
+
+	gact = to_gact(pc);
+
+	/* publish the new parameters under the per-action lock */
+	spin_lock_bh(&gact->tcf_lock);
+	gact->tcf_action = parm->action;
+#ifdef CONFIG_GACT_PROB
+	if (p_parm != NULL) {
+		gact->tcfg_paction = p_parm->paction;
+		gact->tcfg_pval    = p_parm->pval;
+		gact->tcfg_ptype   = p_parm->ptype;
+	}
+#endif
+	spin_unlock_bh(&gact->tcf_lock);
+	/* a fresh instance becomes visible only once fully set up */
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &gact_hash_info);
+	return ret;
+}
+
+/*
+ * Drop one reference (and one binding when @bind is set) on the action;
+ * returns 0 when the action has no private data.
+ */
+static int tcf_gact_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_gact *gact = a->priv;
+
+	if (!gact)
+		return 0;
+
+	return tcf_hash_release(&gact->common, bind, &gact_hash_info);
+}
+
+/*
+ * Packet-path entry point of the gact action: choose the action to return
+ * (through the probability selector when one is configured) and update the
+ * byte, packet, drop and last-use statistics, all under the action lock.
+ */
+static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+{
+	struct tcf_gact *gact = a->priv;
+	int action = TC_ACT_SHOT;
+
+	spin_lock(&gact->tcf_lock);
+#ifdef CONFIG_GACT_PROB
+	/* tcfg_ptype indexes gact_rand[]; type 0 means no selector */
+	if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL)
+		action = gact_rand[gact->tcfg_ptype](gact);
+	else
+		action = gact->tcf_action;
+#else
+	action = gact->tcf_action;
+#endif
+	gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
+	gact->tcf_bstats.packets++;
+	if (action == TC_ACT_SHOT)
+		gact->tcf_qstats.drops++;
+	gact->tcf_tm.lastuse = jiffies;
+	spin_unlock(&gact->tcf_lock);
+
+	return action;
+}
+
+/*
+ * Dump the gact action's parameters (TCA_GACT_PARMS), its probability
+ * settings when a probability type is configured (TCA_GACT_PROB) and its
+ * timestamps (TCA_GACT_TM) into @skb.  @bind and @ref are subtracted from
+ * the reported counts.  Returns skb->len on success, -1 after trimming the
+ * skb back when an attribute does not fit.
+ */
+static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+	/* remember the current tail so a failed dump can be undone */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_gact *gact = a->priv;
+	struct tc_gact opt = {
+		.index   = gact->tcf_index,
+		.refcnt  = gact->tcf_refcnt - ref,
+		.bindcnt = gact->tcf_bindcnt - bind,
+		.action  = gact->tcf_action,
+	};
+	struct tcf_t t;
+
+	/* NOTE(review): NLA_PUT appears to jump to nla_put_failure on lack of room */
+	NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
+#ifdef CONFIG_GACT_PROB
+	if (gact->tcfg_ptype) {
+		struct tc_gact_p p_opt = {
+			.paction = gact->tcfg_paction,
+			.pval    = gact->tcfg_pval,
+			.ptype   = gact->tcfg_ptype,
+		};
+
+		NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
+	}
+#endif
+	/* install and lastuse are reported as ages relative to now */
+	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
+	NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/*
+ * Operations table for the "gact" action kind; lookup and walk use the
+ * generic hash-table helpers.
+ */
+static struct tc_action_ops act_gact_ops = {
+	.kind		=	"gact",
+	.hinfo		=	&gact_hash_info,
+	.type		=	TCA_ACT_GACT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_gact,
+	.dump		=	tcf_gact_dump,
+	.cleanup	=	tcf_gact_cleanup,
+	.lookup		=	tcf_hash_search,
+	.init		=	tcf_gact_init,
+	.walk		=	tcf_generic_walker
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Generic Classifier actions");
+MODULE_LICENSE("GPL");
+
+/*
+ * Module entry point: log whether probability support was compiled in and
+ * register the gact action kind.
+ */
+static int __init gact_init_module(void)
+{
+#ifdef CONFIG_GACT_PROB
+	pr_info("GACT probability on\n");
+#else
+	pr_info("GACT probability NOT on\n");
+#endif
+	return tcf_register_action(&act_gact_ops);
+}
+
+/* Module exit point: unregister the gact action kind. */
+static void __exit gact_cleanup_module(void)
+{
+	tcf_unregister_action(&act_gact_ops);
+}
+
+module_init(gact_init_module);
+module_exit(gact_cleanup_module);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
new file mode 100644
index 00000000..9fc211a1
--- /dev/null
+++ b/net/sched/act_ipt.c
@@ -0,0 +1,317 @@
+/*
+ * net/sched/ipt.c	iptables target interface
+ *
+ *TODO: Add other tables. For now we only support the ipv4 table targets
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Copyright:	Jamal Hadi Salim (2002-4)
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_ipt.h>
+#include <net/tc_act/tc_ipt.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+
+/* Hash table of ipt action instances: IPT_TAB_MASK + 1 buckets. */
+#define IPT_TAB_MASK     15
+static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1];
+/* Index generator handed to tcf_hash_create() for new instances. */
+static u32 ipt_idx_gen;
+/* Lock published through ipt_hash_info.lock. */
+static DEFINE_RWLOCK(ipt_lock);
+
+/* Table, mask and lock bundled for the generic tcf_hash_* helpers. */
+static struct tcf_hashinfo ipt_hash_info = {
+	.htab	=	tcf_ipt_ht,
+	.hmask	=	IPT_TAB_MASK,
+	.lock	=	&ipt_lock,
+};
+
+/*
+ * Resolve and validate the xtables target described by @t.
+ * @t:     target entry; on success t->u.kernel.target points at the target
+ * @table: table name passed to the target's check
+ * @hook:  hook mask passed to the target's check
+ *
+ * Returns 0 on success or a negative errno.  On a failed check the module
+ * reference of the target is dropped again.
+ */
+static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
+{
+	struct xt_tgchk_param par;
+	struct xt_target *target;
+	int ret = 0;
+
+	target = xt_request_find_target(AF_INET, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target))
+		return PTR_ERR(target);
+
+	t->u.kernel.target = target;
+	/*
+	 * Clear the whole parameter block first so that any member not set
+	 * explicitly below reaches the target's check as zero instead of
+	 * stack garbage.
+	 */
+	memset(&par, 0, sizeof(par));
+	par.table     = table;
+	par.entryinfo = NULL;
+	par.target    = target;
+	par.targinfo  = t->data;
+	par.hook_mask = hook;
+	par.family    = NFPROTO_IPV4;
+
+	ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
+	if (ret < 0) {
+		module_put(t->u.kernel.target->me);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Undo ipt_init_target(): run the target's destroy hook, if it has one, and
+ * drop the reference on the target's module.
+ */
+static void ipt_destroy_target(struct xt_entry_target *t)
+{
+	struct xt_target *target = t->u.kernel.target;
+	struct xt_tgdtor_param par = {
+		.target   = target,
+		.targinfo = t->data,
+	};
+
+	if (target->destroy)
+		target->destroy(&par);
+
+	module_put(target->me);
+}
+
+/*
+ * Drop one reference (and one binding when @bind is set) on @ipt and tear
+ * the action down once neither count is positive any more.
+ * Returns ACT_P_DELETED if the action was destroyed, 0 otherwise.
+ */
+static int tcf_ipt_release(struct tcf_ipt *ipt, int bind)
+{
+	if (!ipt)
+		return 0;
+
+	if (bind)
+		ipt->tcf_bindcnt--;
+	ipt->tcf_refcnt--;
+
+	/* still bound or referenced: keep the action */
+	if (ipt->tcf_bindcnt > 0 || ipt->tcf_refcnt > 0)
+		return 0;
+
+	ipt_destroy_target(ipt->tcfi_t);
+	kfree(ipt->tcfi_tname);
+	kfree(ipt->tcfi_t);
+	tcf_hash_destroy(&ipt->common, &ipt_hash_info);
+
+	return ACT_P_DELETED;
+}
+
+/*
+ * Netlink policy: table name is a string limited by IFNAMSIZ, hook and
+ * index are u32, and the target attribute is sized for a
+ * struct xt_entry_target.
+ */
+static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
+	[TCA_IPT_TABLE]	= { .type = NLA_STRING, .len = IFNAMSIZ },
+	[TCA_IPT_HOOK]	= { .type = NLA_U32 },
+	[TCA_IPT_INDEX]	= { .type = NLA_U32 },
+	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },
+};
+
+/*
+ * Create an ipt action or replace the target of an existing one.
+ *
+ * Returns ACT_P_CREATED when a new instance was created, 0 when an existing
+ * instance was updated, -EEXIST when the instance exists and @ovr is not
+ * set, and a negative errno for malformed attributes, allocation failure or
+ * a target that fails its check.
+ */
+static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
+			struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_IPT_MAX + 1];
+	struct tcf_ipt *ipt;
+	struct tcf_common *pc;
+	struct xt_entry_target *td, *t;
+	char *tname;
+	int ret = 0, err;
+	u32 hook = 0;
+	u32 index = 0;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_IPT_HOOK] == NULL)
+		return -EINVAL;
+	if (tb[TCA_IPT_TARG] == NULL)
+		return -EINVAL;
+
+	td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+	/*
+	 * target_size must cover at least the entry header (otherwise
+	 * "target_size - sizeof(*t)" in ipt_init_target() wraps around) and
+	 * must not claim more data than the attribute carries.
+	 */
+	if (td->u.target_size < sizeof(*td) ||
+	    nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
+		return -EINVAL;
+
+	if (tb[TCA_IPT_INDEX] != NULL)
+		index = nla_get_u32(tb[TCA_IPT_INDEX]);
+
+	pc = tcf_hash_check(index, a, bind, &ipt_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
+				     &ipt_idx_gen, &ipt_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		/* already present: only touch it when override was asked */
+		if (!ovr) {
+			tcf_ipt_release(to_ipt(pc), bind);
+			return -EEXIST;
+		}
+	}
+	ipt = to_ipt(pc);
+
+	hook = nla_get_u32(tb[TCA_IPT_HOOK]);
+
+	err = -ENOMEM;
+	tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
+	if (unlikely(!tname))
+		goto err1;
+	/* fall back to the "mangle" table when none (or too long a one) is given */
+	if (tb[TCA_IPT_TABLE] == NULL ||
+	    nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
+		strcpy(tname, "mangle");
+
+	/* private copy of the target entry; the attribute buffer is not ours */
+	t = kmemdup(td, td->u.target_size, GFP_KERNEL);
+	if (unlikely(!t))
+		goto err2;
+
+	err = ipt_init_target(t, tname, hook);
+	if (err < 0)
+		goto err3;
+
+	spin_lock_bh(&ipt->tcf_lock);
+	/* replacing: release the previous target and its buffers first */
+	if (ret != ACT_P_CREATED) {
+		ipt_destroy_target(ipt->tcfi_t);
+		kfree(ipt->tcfi_tname);
+		kfree(ipt->tcfi_t);
+	}
+	ipt->tcfi_tname = tname;
+	ipt->tcfi_t     = t;
+	ipt->tcfi_hook  = hook;
+	spin_unlock_bh(&ipt->tcf_lock);
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &ipt_hash_info);
+	return ret;
+
+err3:
+	kfree(t);
+err2:
+	kfree(tname);
+err1:
+	/*
+	 * Only a freshly created entry is ours to free: it has not been
+	 * inserted into the hash table yet.  An entry found by
+	 * tcf_hash_check() is still hashed and in use, so freeing it here
+	 * would leave a dangling pointer behind.
+	 * NOTE(review): if tcf_hash_create() attached a rate estimator for
+	 * @est it is not torn down here -- confirm against act_api.
+	 */
+	if (ret == ACT_P_CREATED)
+		kfree(pc);
+	return err;
+}
+
+/* Drop one reference (and one binding when @bind is set) on the action. */
+static int tcf_ipt_cleanup(struct tc_action *a, int bind)
+{
+	return tcf_ipt_release(a->priv, bind);
+}
+
+/*
+ * Packet-path entry point of the ipt action: run the configured xtables
+ * target on @skb and translate its verdict into a TC action code.
+ * Returns TC_ACT_UNSPEC if a cloned skb cannot be given a private head.
+ */
+static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
+		   struct tcf_result *res)
+{
+	int ret = 0, result = 0;
+	struct tcf_ipt *ipt = a->priv;
+	struct xt_action_param par;
+
+	/* get a private head before handing a cloned skb to the target */
+	if (skb_cloned(skb)) {
+		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			return TC_ACT_UNSPEC;
+	}
+
+	spin_lock(&ipt->tcf_lock);
+
+	ipt->tcf_tm.lastuse = jiffies;
+	bstats_update(&ipt->tcf_bstats, skb);
+
+	/* yes, we have to worry about both in and out dev
+	 * worry later - danger - this API seems to have changed
+	 * from earlier kernels
+	 */
+	/* NOTE(review): only the members set below are initialised in par */
+	par.in       = skb->dev;
+	par.out      = NULL;
+	par.hooknum  = ipt->tcfi_hook;
+	par.target   = ipt->tcfi_t->u.kernel.target;
+	par.targinfo = ipt->tcfi_t->data;
+	ret = par.target->target(skb, &par);
+
+	/* map the netfilter verdict onto a TC action */
+	switch (ret) {
+	case NF_ACCEPT:
+		result = TC_ACT_OK;
+		break;
+	case NF_DROP:
+		result = TC_ACT_SHOT;
+		ipt->tcf_qstats.drops++;
+		break;
+	case XT_CONTINUE:
+		result = TC_ACT_PIPE;
+		break;
+	default:
+		if (net_ratelimit())
+			pr_notice("tc filter: Bogus netfilter code"
+				  " %d assume ACCEPT\n", ret);
+		/* NOTE(review): uses TC_POLICE_OK where NF_ACCEPT above uses TC_ACT_OK */
+		result = TC_POLICE_OK;
+		break;
+	}
+	spin_unlock(&ipt->tcf_lock);
+	return result;
+
+}
+
+/*
+ * Dump the ipt action into @skb: target entry, index, hook, counters, table
+ * name and timestamps.  @bind and @ref are subtracted from the reported
+ * counts.  Returns skb->len on success, -1 after trimming the skb back on
+ * allocation failure or when an attribute does not fit.
+ */
+static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+	/* remember the current tail so a failed dump can be undone */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ipt *ipt = a->priv;
+	struct xt_entry_target *t;
+	struct tcf_t tm;
+	struct tc_cnt c;
+
+	/* for simple targets kernel size == user size
+	 * user name = target name
+	 * for foolproof you need to not assume this
+	 */
+
+	/* work on a copy so the name can be filled in without touching the live entry */
+	t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
+	if (unlikely(!t))
+		goto nla_put_failure;
+
+	c.bindcnt = ipt->tcf_bindcnt - bind;
+	c.refcnt = ipt->tcf_refcnt - ref;
+	strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
+
+	/* NOTE(review): the NLA_PUT* macros appear to jump to nla_put_failure on lack of room */
+	NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t);
+	NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index);
+	NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook);
+	NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
+	NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname);
+	/* install and lastuse are reported as ages relative to now */
+	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
+	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
+	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
+	NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
+	kfree(t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	/* t is NULL here when the kmemdup() above failed */
+	kfree(t);
+	return -1;
+}
+
+/*
+ * Operations table for the "ipt" action kind; lookup and walk use the
+ * generic hash-table helpers.
+ */
+static struct tc_action_ops act_ipt_ops = {
+	.kind		=	"ipt",
+	.hinfo		=	&ipt_hash_info,
+	.type		=	TCA_ACT_IPT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_ipt,
+	.dump		=	tcf_ipt_dump,
+	.cleanup	=	tcf_ipt_cleanup,
+	.lookup		=	tcf_hash_search,
+	.init		=	tcf_ipt_init,
+	.walk		=	tcf_generic_walker
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Iptables target actions");
+MODULE_LICENSE("GPL");
+
+/* Module entry point: register the ipt action kind. */
+static int __init ipt_init_module(void)
+{
+	int err = tcf_register_action(&act_ipt_ops);
+
+	return err;
+}
+
+/* Module exit point: unregister the ipt action kind. */
+static void __exit ipt_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ipt_ops);
+}
+
+module_init(ipt_init_module);
+module_exit(ipt_cleanup_module);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
new file mode 100644
index 00000000..961386e2
--- /dev/null
+++ b/net/sched/act_mirred.c
@@ -0,0 +1,300 @@
+/*
+ * net/sched/mirred.c	packet mirroring and redirect actions
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Jamal Hadi Salim (2002-4)
+ *
+ * TODO: Add ingress support (and socket redirect support)
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_mirred.h>
+
+#include <linux/if_arp.h>
+
+/* Hash table of mirred action instances: MIRRED_TAB_MASK + 1 buckets. */
+#define MIRRED_TAB_MASK     7
+static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1];
+/* Index generator handed to tcf_hash_create() for new instances. */
+static u32 mirred_idx_gen;
+/* Lock published through mirred_hash_info.lock. */
+static DEFINE_RWLOCK(mirred_lock);
+/* All mirred instances, walked by the netdevice notifier. */
+static LIST_HEAD(mirred_list);
+
+/* Table, mask and lock bundled for the generic tcf_hash_* helpers. */
+static struct tcf_hashinfo mirred_hash_info = {
+	.htab	=	tcf_mirred_ht,
+	.hmask	=	MIRRED_TAB_MASK,
+	.lock	=	&mirred_lock,
+};
+
+/*
+ * Drop one reference (and one binding when @bind is set) on @m.  Once the
+ * bind count is zero and the reference count is no longer positive, the
+ * action is unlinked, its device reference dropped and the entry destroyed.
+ * Returns 1 if the action was destroyed, 0 otherwise.
+ */
+static int tcf_mirred_release(struct tcf_mirred *m, int bind)
+{
+	if (!m)
+		return 0;
+
+	if (bind)
+		m->tcf_bindcnt--;
+	m->tcf_refcnt--;
+
+	/* still bound or referenced: keep the action */
+	if (m->tcf_bindcnt || m->tcf_refcnt > 0)
+		return 0;
+
+	list_del(&m->tcfm_list);
+	if (m->tcfm_dev)
+		dev_put(m->tcfm_dev);
+	tcf_hash_destroy(&m->common, &mirred_hash_info);
+
+	return 1;
+}
+
+/* Netlink policy: TCA_MIRRED_PARMS is sized for a struct tc_mirred. */
+static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
+	[TCA_MIRRED_PARMS]	= { .len = sizeof(struct tc_mirred) },
+};
+
+/*
+ * Create a mirred action or update an existing one from netlink attributes.
+ *
+ * Only the egress mirror and egress redirect actions are accepted.  A new
+ * instance needs a target device; an update may omit it and keep the
+ * current one.  Returns ACT_P_CREATED when a new instance was created, 0
+ * when an existing one was updated, -EEXIST when the instance exists and
+ * @ovr is not set, -ENODEV for an unknown ifindex and -EINVAL for malformed
+ * attributes.
+ */
+static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
+			   struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_MIRRED_MAX + 1];
+	struct tc_mirred *parm;
+	struct tcf_mirred *m;
+	struct tcf_common *pc;
+	struct net_device *dev;
+	int ret, ok_push = 0;
+
+	if (nla == NULL)
+		return -EINVAL;
+	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy);
+	if (ret < 0)
+		return ret;
+	if (tb[TCA_MIRRED_PARMS] == NULL)
+		return -EINVAL;
+	parm = nla_data(tb[TCA_MIRRED_PARMS]);
+	switch (parm->eaction) {
+	case TCA_EGRESS_MIRROR:
+	case TCA_EGRESS_REDIR:
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (parm->ifindex) {
+		dev = __dev_get_by_index(&init_net, parm->ifindex);
+		if (dev == NULL)
+			return -ENODEV;
+		/*
+		 * ok_push later decides whether tcf_mirred() pushes the
+		 * link-layer header back on; it is off for the tunnel-like
+		 * and headerless device types listed here.
+		 */
+		switch (dev->type) {
+		case ARPHRD_TUNNEL:
+		case ARPHRD_TUNNEL6:
+		case ARPHRD_SIT:
+		case ARPHRD_IPGRE:
+		case ARPHRD_VOID:
+		case ARPHRD_NONE:
+			ok_push = 0;
+			break;
+		default:
+			ok_push = 1;
+			break;
+		}
+	} else {
+		dev = NULL;
+	}
+
+	pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info);
+	if (!pc) {
+		/* a new instance cannot exist without a target device */
+		if (dev == NULL)
+			return -EINVAL;
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind,
+				     &mirred_idx_gen, &mirred_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		/* already present: only touch it when override was asked */
+		if (!ovr) {
+			tcf_mirred_release(to_mirred(pc), bind);
+			return -EEXIST;
+		}
+	}
+	m = to_mirred(pc);
+
+	spin_lock_bh(&m->tcf_lock);
+	m->tcf_action = parm->action;
+	m->tcfm_eaction = parm->eaction;
+	if (dev != NULL) {
+		m->tcfm_ifindex = parm->ifindex;
+		/*
+		 * Drop the reference on the device being replaced.  The
+		 * NETDEV_UNREGISTER notifier clears tcfm_dev (and drops the
+		 * reference itself) when the old device goes away, so the
+		 * pointer may legitimately be NULL here.
+		 */
+		if (ret != ACT_P_CREATED && m->tcfm_dev != NULL)
+			dev_put(m->tcfm_dev);
+		dev_hold(dev);
+		m->tcfm_dev = dev;
+		m->tcfm_ok_push = ok_push;
+	}
+	spin_unlock_bh(&m->tcf_lock);
+	/* a fresh instance becomes visible only once fully set up */
+	if (ret == ACT_P_CREATED) {
+		list_add(&m->tcfm_list, &mirred_list);
+		tcf_hash_insert(pc, &mirred_hash_info);
+	}
+
+	return ret;
+}
+
+/*
+ * Drop one reference (and one binding when @bind is set) on the action;
+ * returns 0 when the action has no private data.
+ */
+static int tcf_mirred_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_mirred *m = a->priv;
+
+	return m ? tcf_mirred_release(m, bind) : 0;
+}
+
+/*
+ * Packet-path entry point of the mirred action: clone @skb and transmit the
+ * clone on the configured device.
+ *
+ * Returns the configured action when the clone was queued, or TC_ACT_SHOT
+ * (counting an overlimit) when the device is gone or down or the clone
+ * could not be allocated.
+ */
+static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
+		      struct tcf_result *res)
+{
+	struct tcf_mirred *m = a->priv;
+	struct net_device *dev;
+	struct sk_buff *skb2;
+	u32 at;
+	int retval, err = 1;
+
+	spin_lock(&m->tcf_lock);
+	m->tcf_tm.lastuse = jiffies;
+	bstats_update(&m->tcf_bstats, skb);
+
+	/* tcfm_dev is cleared by the notifier once the device unregisters */
+	dev = m->tcfm_dev;
+	if (!dev) {
+		printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
+		goto out;
+	}
+
+	if (!(dev->flags & IFF_UP)) {
+		if (net_ratelimit())
+			pr_notice("tc mirred to Houston: device %s is down\n",
+				  dev->name);
+		goto out;
+	}
+
+	at = G_TC_AT(skb->tc_verd);
+	skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
+	if (skb2 == NULL)
+		goto out;
+
+	/* not seen at egress: put the link-layer header back if allowed */
+	if (!(at & AT_EGRESS)) {
+		if (m->tcfm_ok_push)
+			skb_push(skb2, skb2->dev->hard_header_len);
+	}
+
+	/* mirror is always swallowed */
+	if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
+		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
+
+	skb2->skb_iif = skb->dev->ifindex;
+	skb2->dev = dev;
+	dev_queue_xmit(skb2);
+	err = 0;
+
+out:
+	if (err) {
+		m->tcf_qstats.overlimits++;
+		/* should we be asking for packet to be dropped?
+		 * may make sense for redirect case only
+		 */
+		retval = TC_ACT_SHOT;
+	} else {
+		retval = m->tcf_action;
+	}
+	spin_unlock(&m->tcf_lock);
+
+	return retval;
+}
+
+/*
+ * Dump the mirred action's parameters (TCA_MIRRED_PARMS) and timestamps
+ * (TCA_MIRRED_TM) into @skb.  @bind and @ref are subtracted from the
+ * reported counts.  Returns skb->len on success, -1 after trimming the skb
+ * back when an attribute does not fit.
+ */
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{
+	/* remember the current tail so a failed dump can be undone */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_mirred *m = a->priv;
+	struct tc_mirred opt = {
+		.index   = m->tcf_index,
+		.action  = m->tcf_action,
+		.refcnt  = m->tcf_refcnt - ref,
+		.bindcnt = m->tcf_bindcnt - bind,
+		.eaction = m->tcfm_eaction,
+		.ifindex = m->tcfm_ifindex,
+	};
+	struct tcf_t t;
+
+	/* NOTE(review): NLA_PUT appears to jump to nla_put_failure on lack of room */
+	NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
+	/* install and lastuse are reported as ages relative to now */
+	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
+	NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/*
+ * Netdevice notifier: when a device unregisters, every mirred action that
+ * targets it drops its device reference and forgets the pointer.
+ * Always returns NOTIFY_DONE.
+ */
+static int mirred_device_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct tcf_mirred *m;
+
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+
+	list_for_each_entry(m, &mirred_list, tcfm_list) {
+		if (m->tcfm_dev != dev)
+			continue;
+
+		dev_put(dev);
+		m->tcfm_dev = NULL;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block registered in mirred_init_module(). */
+static struct notifier_block mirred_device_notifier = {
+	.notifier_call = mirred_device_event,
+};
+
+
+/*
+ * Operations table for the "mirred" action kind; lookup and walk use the
+ * generic hash-table helpers.
+ */
+static struct tc_action_ops act_mirred_ops = {
+	.kind		=	"mirred",
+	.hinfo		=	&mirred_hash_info,
+	.type		=	TCA_ACT_MIRRED,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_mirred,
+	.dump		=	tcf_mirred_dump,
+	.cleanup	=	tcf_mirred_cleanup,
+	.lookup		=	tcf_hash_search,
+	.init		=	tcf_mirred_init,
+	.walk		=	tcf_generic_walker
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002)");
+MODULE_DESCRIPTION("Device Mirror/redirect actions");
+MODULE_LICENSE("GPL");
+
+/*
+ * Module entry point: register the netdevice notifier, then the mirred
+ * action kind.  If registering the action fails the notifier is removed
+ * again, so a failed load leaves no notifier pointing into the module.
+ * Returns 0 on success or the error of the step that failed.
+ */
+static int __init mirred_init_module(void)
+{
+	int err = register_netdevice_notifier(&mirred_device_notifier);
+	if (err)
+		return err;
+
+	pr_info("Mirror/redirect action on\n");
+	err = tcf_register_action(&act_mirred_ops);
+	if (err)
+		unregister_netdevice_notifier(&mirred_device_notifier);
+
+	return err;
+}
+
+/* Module exit point: remove the netdevice notifier and the action kind. */
+static void __exit mirred_cleanup_module(void)
+{
+	unregister_netdevice_notifier(&mirred_device_notifier);
+	tcf_unregister_action(&act_mirred_ops);
+}
+
+module_init(mirred_init_module);
+module_exit(mirred_cleanup_module);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
new file mode 100644
index 00000000..762b0276
--- /dev/null
+++ b/net/sched/act_nat.c
@@ -0,0 +1,328 @@
+/*
+ * Stateless NAT actions
+ *
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tc_act/tc_nat.h>
+#include <net/act_api.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/tc_act/tc_nat.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+
+#define NAT_TAB_MASK	15	/* the table below has NAT_TAB_MASK + 1 buckets */
+static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1];
+static u32 nat_idx_gen;	/* handed to tcf_hash_create() in tcf_nat_init() */
+static DEFINE_RWLOCK(nat_lock);
+
+static struct tcf_hashinfo nat_hash_info = {	/* bundles table, mask and lock */
+	.htab	=	tcf_nat_ht,
+	.hmask	=	NAT_TAB_MASK,
+	.lock	=	&nat_lock,
+};
+
+static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {	/* used by tcf_nat_init() */
+	[TCA_NAT_PARMS]	= { .len = sizeof(struct tc_nat) },
+};
+
+/*
+ * .init hook for the "nat" action.
+ * Parses TCA_NAT_PARMS, then either creates a new action or, when @ovr is
+ * set, rewrites the parameters of the one found by tcf_hash_check().
+ * Returns ACT_P_CREATED for a new action, 0 after an update, or -errno.
+ */
+static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
+			struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *attrs[TCA_NAT_MAX + 1];
+	struct tcf_common *common;
+	struct tc_nat *cfg;
+	struct tcf_nat *nat;
+	int created = 0, rc;
+
+	if (!nla)
+		return -EINVAL;
+	rc = nla_parse_nested(attrs, TCA_NAT_MAX, nla, nat_policy);
+	if (rc < 0)
+		return rc;
+	if (!attrs[TCA_NAT_PARMS])
+		return -EINVAL;
+	cfg = nla_data(attrs[TCA_NAT_PARMS]);
+
+	common = tcf_hash_check(cfg->index, a, bind, &nat_hash_info);
+	if (common && !ovr) {
+		tcf_hash_release(common, bind, &nat_hash_info);
+		return -EEXIST;
+	}
+	if (!common) {
+		common = tcf_hash_create(cfg->index, est, a, sizeof(*nat), bind,
+					 &nat_idx_gen, &nat_hash_info);
+		if (IS_ERR(common))
+			return PTR_ERR(common);
+		created = ACT_P_CREATED;
+	}
+	nat = to_tcf_nat(common);
+
+	spin_lock_bh(&nat->tcf_lock);
+	nat->old_addr = cfg->old_addr;
+	nat->new_addr = cfg->new_addr;
+	nat->mask = cfg->mask;
+	nat->flags = cfg->flags;
+	nat->tcf_action = cfg->action;
+	spin_unlock_bh(&nat->tcf_lock);
+
+	if (created == ACT_P_CREATED)
+		tcf_hash_insert(common, &nat_hash_info);
+	return created;
+}
+
+/* .cleanup hook: pass the action's common part to tcf_hash_release(). */
+static int tcf_nat_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_nat *nat = a->priv;
+	return tcf_hash_release(&nat->common, bind, &nat_hash_info);
+}
+
+static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
+		   struct tcf_result *res)
+{	/* .act hook: rewrite matching IPv4 addresses and patch the checksums */
+	struct tcf_nat *p = a->priv;
+	struct iphdr *iph;
+	__be32 old_addr;
+	__be32 new_addr;
+	__be32 mask;
+	__be32 addr;
+	int egress;
+	int action;
+	int ihl;
+	int noff;
+
+	spin_lock(&p->tcf_lock);	/* snapshot the settings under the lock */
+
+	p->tcf_tm.lastuse = jiffies;
+	old_addr = p->old_addr;
+	new_addr = p->new_addr;
+	mask = p->mask;
+	egress = p->flags & TCA_NAT_FLAG_EGRESS;
+	action = p->tcf_action;
+
+	bstats_update(&p->tcf_bstats, skb);
+
+	spin_unlock(&p->tcf_lock);
+
+	if (unlikely(action == TC_ACT_SHOT))
+		goto drop;
+
+	noff = skb_network_offset(skb);
+	if (!pskb_may_pull(skb, sizeof(*iph) + noff))
+		goto drop;
+
+	iph = ip_hdr(skb);
+
+	if (egress)	/* egress rewrites the source, otherwise the destination */
+		addr = iph->saddr;
+	else
+		addr = iph->daddr;
+
+	if (!((old_addr ^ addr) & mask)) {	/* masked bits equal old_addr */
+		if (skb_cloned(skb) &&
+		    !skb_clone_writable(skb, sizeof(*iph) + noff) &&
+		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			goto drop;
+
+		new_addr &= mask;
+		new_addr |= addr & ~mask;	/* bits outside the mask come from the packet */
+
+		/* Rewrite IP header */
+		iph = ip_hdr(skb);	/* re-read: the head may have been reallocated */
+		if (egress)
+			iph->saddr = new_addr;
+		else
+			iph->daddr = new_addr;
+
+		csum_replace4(&iph->check, addr, new_addr);
+	} else if ((iph->frag_off & htons(IP_OFFSET)) ||
+		   iph->protocol != IPPROTO_ICMP) {
+		goto out;	/* no match: only ICMP at fragment offset 0 continues */
+	}
+
+	ihl = iph->ihl * 4;
+
+	/* It would be nice to share code with stateful NAT. */
+	switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {	/* non-zero offset selects 0 */
+	case IPPROTO_TCP:
+	{
+		struct tcphdr *tcph;
+
+		if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
+		    (skb_cloned(skb) &&
+		     !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
+		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+			goto drop;
+
+		tcph = (void *)(skb_network_header(skb) + ihl);
+		inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
+		break;
+	}
+	case IPPROTO_UDP:
+	{
+		struct udphdr *udph;
+
+		if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
+		    (skb_cloned(skb) &&
+		     !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
+		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+			goto drop;
+
+		udph = (void *)(skb_network_header(skb) + ihl);
+		if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {	/* zero checksum is left alone otherwise */
+			inet_proto_csum_replace4(&udph->check, skb, addr,
+						 new_addr, 1);
+			if (!udph->check)
+				udph->check = CSUM_MANGLED_0;
+		}
+		break;
+	}
+	case IPPROTO_ICMP:
+	{
+		struct icmphdr *icmph;
+
+		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
+			goto drop;
+
+		icmph = (void *)(skb_network_header(skb) + ihl);
+
+		if ((icmph->type != ICMP_DEST_UNREACH) &&
+		    (icmph->type != ICMP_TIME_EXCEEDED) &&
+		    (icmph->type != ICMP_PARAMETERPROB))
+			break;	/* only these three types get the embedded header edited */
+
+		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
+					noff))
+			goto drop;
+
+		icmph = (void *)(skb_network_header(skb) + ihl);
+		iph = (void *)(icmph + 1);	/* IP header embedded right after the ICMP header */
+		if (egress)	/* inner header: the opposite field from the outer one */
+			addr = iph->daddr;
+		else
+			addr = iph->saddr;
+
+		if ((old_addr ^ addr) & mask)
+			break;
+
+		if (skb_cloned(skb) &&
+		    !skb_clone_writable(skb, ihl + sizeof(*icmph) +
+					     sizeof(*iph) + noff) &&
+		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			goto drop;
+
+		icmph = (void *)(skb_network_header(skb) + ihl);
+		iph = (void *)(icmph + 1);
+
+		new_addr &= mask;
+		new_addr |= addr & ~mask;
+
+		/* XXX Fix up the inner checksums. */
+		if (egress)
+			iph->daddr = new_addr;
+		else
+			iph->saddr = new_addr;
+
+		inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
+					 0);
+		break;
+	}
+	default:
+		break;
+	}
+
+out:
+	return action;	/* the action value sampled under the lock above */
+
+drop:
+	spin_lock(&p->tcf_lock);	/* the drop counter is updated under the lock */
+	p->tcf_qstats.drops++;
+	spin_unlock(&p->tcf_lock);
+	return TC_ACT_SHOT;
+}
+
+/* .dump hook: emit TCA_NAT_PARMS and TCA_NAT_TM for this action.
+ * Returns skb->len, or -1 after trimming the skb back when a put fails. */
+static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
+			int bind, int ref)
+{
+	unsigned char *mark = skb_tail_pointer(skb);
+	struct tcf_nat *nat = a->priv;
+	struct tc_nat parms = {
+		.index    = nat->tcf_index,
+		.action   = nat->tcf_action,
+		.refcnt   = nat->tcf_refcnt - ref,
+		.bindcnt  = nat->tcf_bindcnt - bind,
+		.old_addr = nat->old_addr,
+		.new_addr = nat->new_addr,
+		.mask     = nat->mask,
+		.flags    = nat->flags,
+	};
+	struct tcf_t tm;
+
+	NLA_PUT(skb, TCA_NAT_PARMS, sizeof(parms), &parms);
+	tm.install = jiffies_to_clock_t(jiffies - nat->tcf_tm.install);
+	tm.lastuse = jiffies_to_clock_t(jiffies - nat->tcf_tm.lastuse);
+	tm.expires = jiffies_to_clock_t(nat->tcf_tm.expires);
+	NLA_PUT(skb, TCA_NAT_TM, sizeof(tm), &tm);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, mark);
+	return -1;
+}
+
+static struct tc_action_ops act_nat_ops = {	/* passed to tcf_register_action() */
+	.kind		=	"nat",
+	.hinfo		=	&nat_hash_info,
+	.type		=	TCA_ACT_NAT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_nat,	/* handles one skb */
+	.dump		=	tcf_nat_dump,
+	.cleanup	=	tcf_nat_cleanup,
+	.lookup		=	tcf_hash_search,	/* not defined in this file */
+	.init		=	tcf_nat_init,
+	.walk		=	tcf_generic_walker	/* not defined in this file */
+};
+
+MODULE_DESCRIPTION("Stateless NAT actions");
+MODULE_LICENSE("GPL");
+
+static int __init nat_init_module(void)
+{	/* module init: returns whatever tcf_register_action() returns */
+	return tcf_register_action(&act_nat_ops);
+}
+
+static void __exit nat_cleanup_module(void)
+{	/* module exit: undo the registration made at init */
+	tcf_unregister_action(&act_nat_ops);
+}
+
+module_init(nat_init_module);
+module_exit(nat_cleanup_module);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
new file mode 100644
index 00000000..7affe9a9
--- /dev/null
+++ b/net/sched/act_pedit.c
@@ -0,0 +1,262 @@
+/*
+ * net/sched/pedit.c	Generic packet editor
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Jamal Hadi Salim (2002-4)
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <linux/tc_act/tc_pedit.h>
+#include <net/tc_act/tc_pedit.h>
+
+#define PEDIT_TAB_MASK	15	/* the table below has PEDIT_TAB_MASK + 1 buckets */
+static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1];
+static u32 pedit_idx_gen;	/* handed to tcf_hash_create() in tcf_pedit_init() */
+static DEFINE_RWLOCK(pedit_lock);
+
+static struct tcf_hashinfo pedit_hash_info = {	/* bundles table, mask and lock */
+	.htab	=	tcf_pedit_ht,
+	.hmask	=	PEDIT_TAB_MASK,
+	.lock	=	&pedit_lock,
+};
+
+static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
+	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },	/* header only; tcf_pedit_init() checks room for the keys */
+};
+
+/* .init hook: create a pedit action or, with @ovr, replace the settings of an
+ * existing one.  Returns ACT_P_CREATED, 0 after an update, or -errno. */
+static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
+			  struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_PEDIT_MAX + 1];
+	struct tc_pedit *parm;
+	int ret = 0, err, ksize;
+	struct tcf_pedit *p;
+	struct tcf_common *pc;
+	struct tc_pedit_key *keys = NULL;
+
+	if (nla == NULL)
+		return -EINVAL;
+	err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy);
+	if (err < 0)
+		return err;
+	if (tb[TCA_PEDIT_PARMS] == NULL)
+		return -EINVAL;
+	parm = nla_data(tb[TCA_PEDIT_PARMS]);
+	ksize = parm->nkeys * sizeof(struct tc_pedit_key);
+	/* nkeys == 0 must fail for updates too: kmalloc(0) below would install
+	 * ZERO_SIZE_PTR as the key array and a later update would memcpy() to it. */
+	if (!parm->nkeys ||
+	    nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
+		return -EINVAL;
+
+	pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
+				     &pedit_idx_gen, &pedit_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+		p = to_pedit(pc);
+		keys = kmalloc(ksize, GFP_KERNEL);
+		if (keys == NULL) {
+			kfree(pc);
+			return -ENOMEM;
+		}
+		ret = ACT_P_CREATED;
+	} else {
+		p = to_pedit(pc);
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &pedit_hash_info);
+			return -EEXIST;
+		}
+		/* a new array is only needed when the key count changes */
+		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
+			keys = kmalloc(ksize, GFP_KERNEL);
+			if (keys == NULL)	/* NOTE(review): no tcf_hash_release() here, unlike !ovr - confirm */
+				return -ENOMEM;
+		}
+	}
+
+	spin_lock_bh(&p->tcf_lock);
+	p->tcfp_flags = parm->flags;
+	p->tcf_action = parm->action;
+	if (keys) {	/* swap in the freshly allocated array */
+		kfree(p->tcfp_keys);
+		p->tcfp_keys = keys;
+		p->tcfp_nkeys = parm->nkeys;
+	}
+	memcpy(p->tcfp_keys, parm->keys, ksize);
+	spin_unlock_bh(&p->tcf_lock);
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &pedit_hash_info);
+	return ret;
+}
+
+/* .cleanup hook: release the action and, if that reports non-zero, its keys. */
+static int tcf_pedit_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_pedit *pedit = a->priv;
+	struct tc_pedit_key *saved_keys;
+	if (!pedit)
+		return 0;
+	saved_keys = pedit->tcfp_keys;	/* fetched before the release call */
+	if (!tcf_hash_release(&pedit->common, bind, &pedit_hash_info))
+		return 0;
+	kfree(saved_keys);
+	return 1;
+}
+
+static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
+		     struct tcf_result *res)
+{	/* .act hook: apply every configured key (32-bit mask/val edit) to the skb */
+	struct tcf_pedit *p = a->priv;
+	int i, munged = 0;
+	unsigned int off;
+
+	if (skb_cloned(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		return p->tcf_action;	/* expand failed: packet is left unedited */
+
+	off = skb_network_offset(skb);	/* key offsets are added to this */
+
+	spin_lock(&p->tcf_lock);
+
+	p->tcf_tm.lastuse = jiffies;
+
+	if (p->tcfp_nkeys > 0) {
+		struct tc_pedit_key *tkey = p->tcfp_keys;
+
+		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
+			u32 *ptr, _data;
+			int offset = tkey->off;
+
+			if (tkey->offmask) {	/* extra offset derived from a packet byte */
+				char *d, _d;
+
+				d = skb_header_pointer(skb, off + tkey->at, 1,
+						       &_d);
+				if (!d)
+					goto bad;
+				offset += (*d & tkey->offmask) >> tkey->shift;
+			}
+
+			if (offset % 4) {
+				pr_info("tc filter pedit"
+					" offset must be on 32 bit boundaries\n");
+				goto bad;
+			}
+			if (offset > 0 && offset > skb->len) {
+				pr_info("tc filter pedit"
+					" offset %d can't exceed pkt length %d\n",
+				       offset, skb->len);
+				goto bad;
+			}
+
+			ptr = skb_header_pointer(skb, off + offset, 4, &_data);
+			if (!ptr)
+				goto bad;
+			/* just do it, baby */
+			*ptr = ((*ptr & tkey->mask) ^ tkey->val);
+			if (ptr == &_data)	/* word was copied out: write it back */
+				skb_store_bits(skb, off + offset, ptr, 4);
+			munged++;
+		}
+
+		if (munged)
+			skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
+		goto done;
+	} else
+		WARN(1, "pedit BUG: index %d\n", p->tcf_index);	/* action without keys */
+
+bad:
+	p->tcf_qstats.overlimits++;	/* failed edits are counted as overlimits */
+done:
+	bstats_update(&p->tcf_bstats, skb);
+	spin_unlock(&p->tcf_lock);
+	return p->tcf_action;
+}
+
+static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
+			  int bind, int ref)
+{	/* .dump hook: emit TCA_PEDIT_PARMS (header plus keys) and TCA_PEDIT_TM */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_pedit *p = a->priv;
+	struct tc_pedit *opt;
+	struct tcf_t t;
+	int s;
+
+	s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);	/* header + key array */
+
+	/* netlink spinlocks held above us - must use ATOMIC */
+	opt = kzalloc(s, GFP_ATOMIC);
+	if (unlikely(!opt))
+		return -ENOBUFS;
+
+	memcpy(opt->keys, p->tcfp_keys,
+	       p->tcfp_nkeys * sizeof(struct tc_pedit_key));
+	opt->index = p->tcf_index;
+	opt->nkeys = p->tcfp_nkeys;
+	opt->flags = p->tcfp_flags;
+	opt->action = p->tcf_action;
+	opt->refcnt = p->tcf_refcnt - ref;
+	opt->bindcnt = p->tcf_bindcnt - bind;
+
+	NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
+	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+	NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
+	kfree(opt);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* roll the skb back to where this dump started */
+	kfree(opt);
+	return -1;
+}
+
+static struct tc_action_ops act_pedit_ops = {	/* passed to tcf_register_action() */
+	.kind		=	"pedit",
+	.hinfo		=	&pedit_hash_info,
+	.type		=	TCA_ACT_PEDIT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_pedit,	/* handles one skb */
+	.dump		=	tcf_pedit_dump,
+	.cleanup	=	tcf_pedit_cleanup,
+	.lookup		=	tcf_hash_search,	/* not defined in this file */
+	.init		=	tcf_pedit_init,
+	.walk		=	tcf_generic_walker	/* not defined in this file */
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
+MODULE_DESCRIPTION("Generic Packet Editor actions");
+MODULE_LICENSE("GPL");
+
+static int __init pedit_init_module(void)
+{	/* module init: returns whatever tcf_register_action() returns */
+	return tcf_register_action(&act_pedit_ops);
+}
+
+static void __exit pedit_cleanup_module(void)
+{	/* module exit: undo the registration made at init */
+	tcf_unregister_action(&act_pedit_ops);
+}
+
+module_init(pedit_init_module);
+module_exit(pedit_cleanup_module);
+
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
new file mode 100644
index 00000000..b3b9b32f
--- /dev/null
+++ b/net/sched/act_police.c
@@ -0,0 +1,402 @@
+/*
+ * net/sched/police.c	Input police filter.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * 		J Hadi Salim (action changes)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+
+#define L2T(p, L)   qdisc_l2t((p)->tcfp_R_tab, L)	/* lookup in the rate table */
+#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L)	/* lookup in the peak-rate table */
+
+#define POL_TAB_MASK     15	/* the table below has POL_TAB_MASK + 1 buckets */
+static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
+static u32 police_idx_gen;	/* handed to tcf_hash_new_index() in tcf_act_police_locate() */
+static DEFINE_RWLOCK(police_lock);	/* taken around walks and changes of tcf_police_ht */
+
+static struct tcf_hashinfo police_hash_info = {
+	.htab	=	tcf_police_ht,
+	.hmask	=	POL_TAB_MASK,
+	.lock	=	&police_lock,
+};
+
+/* old policer structure from before tc actions; tcf_act_police_locate() accepts a TCA_POLICE_TBF of this size too */
+struct tc_police_compat {
+	u32			index;
+	int			action;
+	u32			limit;
+	u32			burst;
+	u32			mtu;
+	struct tc_ratespec	rate;
+	struct tc_ratespec	peakrate;
+};
+
+/* Each policer is serialized by its individual spinlock */
+
+static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
+			      int type, struct tc_action *a)
+{	/* .walk hook: dump policers into skb, resuming at the count kept in cb->args[0] */
+	struct tcf_common *p;
+	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
+	struct nlattr *nest;
+
+	read_lock_bh(&police_lock);
+
+	s_i = cb->args[0];	/* entries already dumped by earlier calls */
+
+	for (i = 0; i < (POL_TAB_MASK + 1); i++) {
+		p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)];
+
+		for (; p; p = p->tcfc_next) {
+			index++;
+			if (index < s_i)
+				continue;
+			a->priv = p;
+			a->order = index;
+			nest = nla_nest_start(skb, a->order);
+			if (nest == NULL)
+				goto nla_put_failure;
+			if (type == RTM_DELACTION)
+				err = tcf_action_dump_1(skb, a, 0, 1);
+			else
+				err = tcf_action_dump_1(skb, a, 0, 0);
+			if (err < 0) {	/* entry did not fit: cancel its nest and stop */
+				index--;
+				nla_nest_cancel(skb, nest);
+				goto done;
+			}
+			nla_nest_end(skb, nest);
+			n_i++;
+		}
+	}
+done:
+	read_unlock_bh(&police_lock);
+	if (n_i)
+		cb->args[0] += n_i;	/* remember progress for the next call */
+	return n_i;	/* number of entries added by this call */
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);	/* reached with nest == NULL */
+	goto done;
+}
+
+static void tcf_police_destroy(struct tcf_police *p)
+{	/* unlink @p from its hash bucket and release what it owns */
+	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
+	struct tcf_common **p1p;
+
+	for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) {	/* p1p tracks the link to patch */
+		if (*p1p == &p->common) {
+			write_lock_bh(&police_lock);	/* lock covers only the unlink itself */
+			*p1p = p->tcf_next;
+			write_unlock_bh(&police_lock);
+			gen_kill_estimator(&p->tcf_bstats,
+					   &p->tcf_rate_est);
+			if (p->tcfp_R_tab)
+				qdisc_put_rtab(p->tcfp_R_tab);
+			if (p->tcfp_P_tab)
+				qdisc_put_rtab(p->tcfp_P_tab);
+			/*
+			 * gen_estimator est_timer() might access p->tcf_lock
+			 * or bstats, wait a RCU grace period before freeing p
+			 */
+			kfree_rcu(p, tcf_rcu);
+			return;
+		}
+	}
+	WARN_ON(1);	/* @p was not found in its bucket */
+}
+
+static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {	/* TCA_POLICE_TBF is size-checked by hand in tcf_act_police_locate() */
+	[TCA_POLICE_RATE]	= { .len = TC_RTAB_SIZE },
+	[TCA_POLICE_PEAKRATE]	= { .len = TC_RTAB_SIZE },
+	[TCA_POLICE_AVRATE]	= { .type = NLA_U32 },
+	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
+};
+
+static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
+				 struct tc_action *a, int ovr, int bind)
+{	/* .init hook: find a policer by index (updating it if @ovr) or create one */
+	unsigned int h;
+	int ret = 0, err;
+	struct nlattr *tb[TCA_POLICE_MAX + 1];
+	struct tc_police *parm;
+	struct tcf_police *police;
+	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
+	int size;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_POLICE_TBF] == NULL)
+		return -EINVAL;
+	size = nla_len(tb[TCA_POLICE_TBF]);
+	if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))	/* new or old layout only */
+		return -EINVAL;
+	parm = nla_data(tb[TCA_POLICE_TBF]);
+
+	if (parm->index) {	/* non-zero index: try to reuse an existing policer */
+		struct tcf_common *pc;
+
+		pc = tcf_hash_lookup(parm->index, &police_hash_info);
+		if (pc != NULL) {
+			a->priv = pc;
+			police = to_police(pc);
+			if (bind) {
+				police->tcf_bindcnt += 1;
+				police->tcf_refcnt += 1;
+			}
+			if (ovr)
+				goto override;
+			return ret;	/* found and not overriding: ret is still 0 */
+		}
+	}
+
+	police = kzalloc(sizeof(*police), GFP_KERNEL);
+	if (police == NULL)
+		return -ENOMEM;
+	ret = ACT_P_CREATED;
+	police->tcf_refcnt = 1;
+	spin_lock_init(&police->tcf_lock);
+	if (bind)
+		police->tcf_bindcnt = 1;
+override:	/* from here on the new and the updated policer share one path */
+	if (parm->rate.rate) {
+		err = -ENOMEM;
+		R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
+		if (R_tab == NULL)
+			goto failure;
+
+		if (parm->peakrate.rate) {	/* a peak rate is only looked at when a rate is set */
+			P_tab = qdisc_get_rtab(&parm->peakrate,
+					       tb[TCA_POLICE_PEAKRATE]);
+			if (P_tab == NULL)
+				goto failure;
+		}
+	}
+
+	spin_lock_bh(&police->tcf_lock);
+	if (est) {
+		err = gen_replace_estimator(&police->tcf_bstats,
+					    &police->tcf_rate_est,
+					    &police->tcf_lock, est);
+		if (err)
+			goto failure_unlock;
+	} else if (tb[TCA_POLICE_AVRATE] &&
+		   (ret == ACT_P_CREATED ||
+		    !gen_estimator_active(&police->tcf_bstats,
+					  &police->tcf_rate_est))) {
+		err = -EINVAL;	/* AVRATE given but no estimator to measure against */
+		goto failure_unlock;
+	}
+
+	/* No failure allowed after this point */
+	if (R_tab != NULL) {
+		qdisc_put_rtab(police->tcfp_R_tab);
+		police->tcfp_R_tab = R_tab;
+	}
+	if (P_tab != NULL) {
+		qdisc_put_rtab(police->tcfp_P_tab);
+		police->tcfp_P_tab = P_tab;
+	}
+
+	if (tb[TCA_POLICE_RESULT])
+		police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+	police->tcfp_toks = police->tcfp_burst = parm->burst;	/* token count starts at burst */
+	police->tcfp_mtu = parm->mtu;
+	if (police->tcfp_mtu == 0) {	/* mtu 0: fall back to a default */
+		police->tcfp_mtu = ~0;
+		if (police->tcfp_R_tab)
+			police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log;
+	}
+	if (police->tcfp_P_tab)
+		police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);
+	police->tcf_action = parm->action;
+
+	if (tb[TCA_POLICE_AVRATE])
+		police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+
+	spin_unlock_bh(&police->tcf_lock);
+	if (ret != ACT_P_CREATED)
+		return ret;	/* update of an existing policer ends here */
+
+	police->tcfp_t_c = psched_get_time();
+	police->tcf_index = parm->index ? parm->index :
+		tcf_hash_new_index(&police_idx_gen, &police_hash_info);
+	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
+	write_lock_bh(&police_lock);
+	police->tcf_next = tcf_police_ht[h];	/* insert at the head of the bucket */
+	tcf_police_ht[h] = &police->common;
+	write_unlock_bh(&police_lock);
+
+	a->priv = police;
+	return ret;
+
+failure_unlock:
+	spin_unlock_bh(&police->tcf_lock);
+failure:
+	if (P_tab)
+		qdisc_put_rtab(P_tab);
+	if (R_tab)
+		qdisc_put_rtab(R_tab);
+	if (ret == ACT_P_CREATED)	/* only free a policer allocated by this call */
+		kfree(police);
+	return err;
+}
+
+/* .cleanup hook: drop one reference (and one binding when @bind is set);
+ * destroys the policer and returns 1 once nothing refers to it, else 0. */
+static int tcf_act_police_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_police *police = a->priv;
+
+	if (police == NULL)
+		return 0;
+	if (bind)
+		police->tcf_bindcnt--;
+	police->tcf_refcnt--;
+	if (police->tcf_refcnt > 0 || police->tcf_bindcnt)
+		return 0;
+
+	tcf_police_destroy(police);
+	return 1;
+}
+
+static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
+			  struct tcf_result *res)
+{	/* .act hook: returns tcfp_result for a conforming packet, tcf_action otherwise */
+	struct tcf_police *police = a->priv;
+	psched_time_t now;
+	long toks;
+	long ptoks = 0;
+
+	spin_lock(&police->tcf_lock);
+
+	bstats_update(&police->tcf_bstats, skb);
+
+	if (police->tcfp_ewma_rate &&
+	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {	/* estimated rate reached the configured limit */
+		police->tcf_qstats.overlimits++;
+		if (police->tcf_action == TC_ACT_SHOT)
+			police->tcf_qstats.drops++;
+		spin_unlock(&police->tcf_lock);
+		return police->tcf_action;
+	}
+
+	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {	/* longer packets go straight to overlimit */
+		if (police->tcfp_R_tab == NULL) {	/* no rate table: nothing to meter */
+			spin_unlock(&police->tcf_lock);
+			return police->tcfp_result;
+		}
+
+		now = psched_get_time();
+		toks = psched_tdiff_bounded(now, police->tcfp_t_c,
+					    police->tcfp_burst);
+		if (police->tcfp_P_tab) {
+			ptoks = toks + police->tcfp_ptoks;
+			if (ptoks > (long)L2T_P(police, police->tcfp_mtu))
+				ptoks = (long)L2T_P(police, police->tcfp_mtu);
+			ptoks -= L2T_P(police, qdisc_pkt_len(skb));
+		}
+		toks += police->tcfp_toks;
+		if (toks > (long)police->tcfp_burst)	/* cap at burst */
+			toks = police->tcfp_burst;
+		toks -= L2T(police, qdisc_pkt_len(skb));
+		if ((toks|ptoks) >= 0) {	/* neither value went negative */
+			police->tcfp_t_c = now;
+			police->tcfp_toks = toks;
+			police->tcfp_ptoks = ptoks;
+			spin_unlock(&police->tcf_lock);
+			return police->tcfp_result;
+		}
+	}
+
+	police->tcf_qstats.overlimits++;
+	if (police->tcf_action == TC_ACT_SHOT)
+		police->tcf_qstats.drops++;
+	spin_unlock(&police->tcf_lock);
+	return police->tcf_action;
+}
+
+static int
+tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+{	/* .dump hook: emit TCA_POLICE_TBF plus the optional RESULT and AVRATE */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_police *police = a->priv;
+	struct tc_police opt = {
+		.index = police->tcf_index,
+		.action = police->tcf_action,
+		.mtu = police->tcfp_mtu,
+		.burst = police->tcfp_burst,
+		.refcnt = police->tcf_refcnt - ref,
+		.bindcnt = police->tcf_bindcnt - bind,
+	};
+
+	if (police->tcfp_R_tab)
+		opt.rate = police->tcfp_R_tab->rate;
+	if (police->tcfp_P_tab)
+		opt.peakrate = police->tcfp_P_tab->rate;
+	NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
+	if (police->tcfp_result)	/* zero values are not emitted */
+		NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
+	if (police->tcfp_ewma_rate)
+		NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* roll the skb back to where this dump started */
+	return -1;
+}
+
+MODULE_AUTHOR("Alexey Kuznetsov");
+MODULE_DESCRIPTION("Policing actions");
+MODULE_LICENSE("GPL");
+
+static struct tc_action_ops act_police_ops = {	/* passed to tcf_register_action() */
+	.kind		=	"police",
+	.hinfo		=	&police_hash_info,
+	.type		=	TCA_ID_POLICE,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_act_police,	/* handles one skb */
+	.dump		=	tcf_act_police_dump,
+	.cleanup	=	tcf_act_police_cleanup,
+	.lookup		=	tcf_hash_search,	/* not defined in this file */
+	.init		=	tcf_act_police_locate,
+	.walk		=	tcf_act_police_walker	/* this file's own walker */
+};
+
+static int __init
+police_init_module(void)
+{	/* module init: returns whatever tcf_register_action() returns */
+	return tcf_register_action(&act_police_ops);
+}
+
+static void __exit
+police_cleanup_module(void)
+{	/* module exit: undo the registration made at init */
+	tcf_unregister_action(&act_police_ops);
+}
+
+module_init(police_init_module);
+module_exit(police_cleanup_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
new file mode 100644
index 00000000..a34a22de
--- /dev/null
+++ b/net/sched/act_simple.c
@@ -0,0 +1,218 @@
+/*
+ * net/sched/simp.c	Simple example of an action
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Jamal Hadi Salim (2005-8)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#define TCA_ACT_SIMP 22	/* used as .type in act_simp_ops */
+
+#include <linux/tc_act/tc_defact.h>
+#include <net/tc_act/tc_defact.h>
+
+#define SIMP_TAB_MASK     7	/* the table below has SIMP_TAB_MASK + 1 buckets */
+static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1];
+static u32 simp_idx_gen;	/* handed to tcf_hash_create() in tcf_simp_init() */
+static DEFINE_RWLOCK(simp_lock);
+
+static struct tcf_hashinfo simp_hash_info = {
+	.htab	=	tcf_simp_ht,
+	.hmask	=	SIMP_TAB_MASK,
+	.lock	=	&simp_lock,
+};
+
+#define SIMP_MAX_DATA	32	/* size of the tcfd_defdata buffer and its copy limit */
+/* .act hook: log the policy string and running packet count for each packet. */
+static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+{
+	struct tcf_defact *d = a->priv;
+
+	spin_lock(&d->tcf_lock);
+	d->tcf_tm.lastuse = jiffies;
+	bstats_update(&d->tcf_bstats, skb);
+
+	/* Prints "<policy string>_<packet count>", e.g. "hello_3" for the third
+	 * packet with string "hello".  The counter is unsigned, hence %u.
+	 */
+	pr_info("simple: %s_%u\n",
+	       (char *)d->tcfd_defdata, d->tcf_bstats.packets);
+	spin_unlock(&d->tcf_lock);
+	return d->tcf_action;
+}
+
+/* Drop one reference (plus one binding if @bind); frees the action and
+ * returns 1 when both counters have reached zero or below, otherwise 0. */
+static int tcf_simp_release(struct tcf_defact *d, int bind)
+{
+	if (!d)
+		return 0;
+	if (bind)
+		d->tcf_bindcnt--;
+	d->tcf_refcnt--;
+	if (d->tcf_bindcnt > 0 || d->tcf_refcnt > 0)
+		return 0;
+	kfree(d->tcfd_defdata);
+	tcf_hash_destroy(&d->common, &simp_hash_info);
+	return 1;
+}
+
+static int alloc_defdata(struct tcf_defact *d, char *defdata)
+{	/* allocate the policy buffer for @d and copy @defdata into it */
+	d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
+	if (unlikely(!d->tcfd_defdata))
+		return -ENOMEM;
+	strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);	/* strlcpy() measures @defdata: it must be NUL-terminated */
+	return 0;
+}
+
+static void reset_policy(struct tcf_defact *d, char *defdata,
+			 struct tc_defact *p)
+{	/* overwrite the action value and policy string of an existing action */
+	spin_lock_bh(&d->tcf_lock);
+	d->tcf_action = p->action;
+	memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);	/* wipe the old string before copying */
+	strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);	/* @defdata must be NUL-terminated */
+	spin_unlock_bh(&d->tcf_lock);
+}
+
+static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {	/* used by tcf_simp_init() */
+	[TCA_DEF_PARMS]	= { .len = sizeof(struct tc_defact) },
+	[TCA_DEF_DATA]	= { .type = NLA_STRING, .len = SIMP_MAX_DATA },	/* the policy string */
+};
+
+/* .init hook: create a "simple" action or, with @ovr, reset an existing one.
+ * Returns ACT_P_CREATED, 0 after an update, or a negative errno. */
+static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_DEF_MAX + 1];
+	struct tc_defact *parm;
+	struct tcf_defact *d;
+	struct tcf_common *pc;
+	char defdata[SIMP_MAX_DATA];
+	int ret = 0, err;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_DEF_PARMS] == NULL || tb[TCA_DEF_DATA] == NULL)
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_DEF_PARMS]);
+	/* NLA_STRING need not be NUL-terminated: take a bounded, terminated copy */
+	nla_strlcpy(defdata, tb[TCA_DEF_DATA], sizeof(defdata));
+
+	pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
+				     &simp_idx_gen, &simp_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+
+		d = to_defact(pc);
+		ret = alloc_defdata(d, defdata);
+		if (ret < 0) {
+			kfree(pc);
+			return ret;
+		}
+		d->tcf_action = parm->action;
+		ret = ACT_P_CREATED;
+	} else {
+		d = to_defact(pc);
+		if (!ovr) {
+			tcf_simp_release(d, bind);
+			return -EEXIST;
+		}
+		reset_policy(d, defdata, parm);
+	}
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &simp_hash_info);
+	return ret;
+}
+
+/* .cleanup hook: release the action stored in a->priv, if there is one.
+ * Returns tcf_simp_release()'s result, or 0 when a->priv is NULL. */
+static int tcf_simp_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_defact *act = a->priv;
+
+	return act ? tcf_simp_release(act, bind) : 0;
+}
+
+static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+			 int bind, int ref)
+{	/* .dump hook: emit TCA_DEF_PARMS, TCA_DEF_DATA and TCA_DEF_TM */
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_defact *d = a->priv;
+	struct tc_defact opt = {
+		.index   = d->tcf_index,
+		.refcnt  = d->tcf_refcnt - ref,
+		.bindcnt = d->tcf_bindcnt - bind,
+		.action  = d->tcf_action,
+	};
+	struct tcf_t t;
+
+	NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
+	NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);	/* the policy string */
+	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+	NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* roll the skb back to where this dump started */
+	return -1;
+}
+
+static struct tc_action_ops act_simp_ops = {	/* passed to tcf_register_action() */
+	.kind		=	"simple",
+	.hinfo		=	&simp_hash_info,
+	.type		=	TCA_ACT_SIMP,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_simp,	/* handles one skb */
+	.dump		=	tcf_simp_dump,
+	.cleanup	=	tcf_simp_cleanup,	/* NOTE(review): no .lookup here, unlike the other act_*_ops tables - confirm */
+	.init		=	tcf_simp_init,
+	.walk		=	tcf_generic_walker,	/* not defined in this file */
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim(2005)");
+MODULE_DESCRIPTION("Simple example action");
+MODULE_LICENSE("GPL");
+
+static int __init simp_init_module(void)
+{	/* module init: register the ops and log a line only on success */
+	int ret = tcf_register_action(&act_simp_ops);
+	if (!ret)
+		pr_info("Simple TC action Loaded\n");
+	return ret;
+}
+
+static void __exit simp_cleanup_module(void)
+{	/* module exit: undo the registration made at init */
+	tcf_unregister_action(&act_simp_ops);
+}
+
+module_init(simp_init_module);
+module_exit(simp_cleanup_module);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
new file mode 100644
index 00000000..5f6f0c7c
--- /dev/null
+++ b/net/sched/act_skbedit.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbedit.h>
+#include <net/tc_act/tc_skbedit.h>
+
+#define SKBEDIT_TAB_MASK     15
+static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1];
+static u32 skbedit_idx_gen;
+static DEFINE_RWLOCK(skbedit_lock);
+
+static struct tcf_hashinfo skbedit_hash_info = {
+	.htab	=	tcf_skbedit_ht,
+	.hmask	=	SKBEDIT_TAB_MASK,
+	.lock	=	&skbedit_lock,
+};
+
+static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_skbedit *d = a->priv;
+
+	spin_lock(&d->tcf_lock);
+	d->tcf_tm.lastuse = jiffies;
+	bstats_update(&d->tcf_bstats, skb);
+
+	if (d->flags & SKBEDIT_F_PRIORITY)
+		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
+	    skb->dev->real_num_tx_queues > d->queue_mapping)
+		skb_set_queue_mapping(skb, d->queue_mapping);
+	if (d->flags & SKBEDIT_F_MARK)
+		skb->mark = d->mark;
+
+	spin_unlock(&d->tcf_lock);
+	return d->tcf_action;
+}
+
+static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
+	[TCA_SKBEDIT_PARMS]		= { .len = sizeof(struct tc_skbedit) },
+	[TCA_SKBEDIT_PRIORITY]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_QUEUE_MAPPING]	= { .len = sizeof(u16) },
+	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
+};
+
+static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
+	struct tc_skbedit *parm;
+	struct tcf_skbedit *d;
+	struct tcf_common *pc;
+	u32 flags = 0, *priority = NULL, *mark = NULL;
+	u16 *queue_mapping = NULL;
+	int ret = 0, err;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_SKBEDIT_PARMS] == NULL)
+		return -EINVAL;
+
+	if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
+		flags |= SKBEDIT_F_PRIORITY;
+		priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
+	}
+
+	if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
+		flags |= SKBEDIT_F_QUEUE_MAPPING;
+		queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
+	}
+
+	if (tb[TCA_SKBEDIT_MARK] != NULL) {
+		flags |= SKBEDIT_F_MARK;
+		mark = nla_data(tb[TCA_SKBEDIT_MARK]);
+	}
+
+	if (!flags)
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
+
+	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
+				     &skbedit_idx_gen, &skbedit_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+
+		d = to_skbedit(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		d = to_skbedit(pc);
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &skbedit_hash_info);
+			return -EEXIST;
+		}
+	}
+
+	spin_lock_bh(&d->tcf_lock);
+
+	d->flags = flags;
+	if (flags & SKBEDIT_F_PRIORITY)
+		d->priority = *priority;
+	if (flags & SKBEDIT_F_QUEUE_MAPPING)
+		d->queue_mapping = *queue_mapping;
+	if (flags & SKBEDIT_F_MARK)
+		d->mark = *mark;
+
+	d->tcf_action = parm->action;
+
+	spin_unlock_bh(&d->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &skbedit_hash_info);
+	return ret;
+}
+
+static int tcf_skbedit_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_skbedit *skbedit = a->priv;
+
+	if (!skbedit)	/* no private data, so nothing to release */
+		return 0;
+	return tcf_hash_release(&skbedit->common, bind, &skbedit_hash_info);
+}
+
+static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
+			    int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_skbedit *d = a->priv;
+	struct tc_skbedit opt = {
+		.index   = d->tcf_index,
+		.refcnt  = d->tcf_refcnt - ref,
+		.bindcnt = d->tcf_bindcnt - bind,
+		.action  = d->tcf_action,
+	};
+	struct tcf_t t;
+
+	NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
+	if (d->flags & SKBEDIT_F_PRIORITY)
+		NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
+			&d->priority);
+	if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
+		NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
+			sizeof(d->queue_mapping), &d->queue_mapping);
+	if (d->flags & SKBEDIT_F_MARK)
+		NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
+			&d->mark);
+	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+	NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tc_action_ops act_skbedit_ops = {
+	.kind		=	"skbedit",
+	.hinfo		=	&skbedit_hash_info,
+	.type		=	TCA_ACT_SKBEDIT,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_skbedit,
+	.dump		=	tcf_skbedit_dump,
+	.cleanup	=	tcf_skbedit_cleanup,
+	.init		=	tcf_skbedit_init,
+	.walk		=	tcf_generic_walker,
+};
+
+MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
+MODULE_DESCRIPTION("SKB Editing");
+MODULE_LICENSE("GPL");
+
+static int __init skbedit_init_module(void)
+{
+	return tcf_register_action(&act_skbedit_ops);
+}
+
+static void __exit skbedit_cleanup_module(void)
+{
+	tcf_unregister_action(&act_skbedit_ops);
+}
+
+module_init(skbedit_init_module);
+module_exit(skbedit_cleanup_module);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
new file mode 100644
index 00000000..bb2c523f
--- /dev/null
+++ b/net/sched/cls_api.c
@@ -0,0 +1,621 @@
+/*
+ * net/sched/cls_api.c	Packet classifier API.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/netlink.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+/* The list of all installed classifier types */
+
+static struct tcf_proto_ops *tcf_proto_base __read_mostly;
+
+/* Protects list of registered TC modules. It is pure SMP lock. */
+static DEFINE_RWLOCK(cls_mod_lock);
+
+/* Find a classifier type by name.  On success a reference on the owning
+ * module is held (try_module_get()); NULL if @kind is NULL, no registered
+ * kind matches, or the module reference could not be taken.
+ */
+static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
+{
+	struct tcf_proto_ops *ops;
+
+	if (!kind)
+		return NULL;
+	read_lock(&cls_mod_lock);
+	for (ops = tcf_proto_base; ops; ops = ops->next)
+		if (nla_strcmp(kind, ops->kind) == 0)
+			break;
+	if (ops && !try_module_get(ops->owner))
+		ops = NULL;
+	read_unlock(&cls_mod_lock);
+	return ops;
+}
+
+/* Register(unregister) new classifier type; duplicate kinds get -EEXIST */
+
+int register_tcf_proto_ops(struct tcf_proto_ops *ops)
+{
+	struct tcf_proto_ops **link = &tcf_proto_base;
+	int rc = -EEXIST;
+
+	write_lock(&cls_mod_lock);
+	while (*link && strcmp(ops->kind, (*link)->kind) != 0)
+		link = &(*link)->next;
+	if (*link == NULL) {
+		/* Reached the tail without a name clash: append here. */
+		ops->next = NULL;
+		*link = ops;
+		rc = 0;
+	}
+	write_unlock(&cls_mod_lock);
+	return rc;
+}
+EXPORT_SYMBOL(register_tcf_proto_ops);
+
+/* Unlink @ops from the classifier type list; -ENOENT if it is not on it. */
+int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
+{
+	struct tcf_proto_ops **link = &tcf_proto_base;
+	int rc = -ENOENT;
+
+	write_lock(&cls_mod_lock);
+	while (*link && *link != ops)
+		link = &(*link)->next;
+
+	if (*link) {
+		/* Found: splice the entry out of the singly linked list. */
+		*link = ops->next;
+		rc = 0;
+	}
+	write_unlock(&cls_mod_lock);
+	return rc;
+}
+EXPORT_SYMBOL(unregister_tcf_proto_ops);
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+			  struct nlmsghdr *n, struct tcf_proto *tp,
+			  unsigned long fh, int event);
+
+
+/* Select a new prio value from the range managed by the kernel:
+ * one below the prio of @tp when a proto is given, otherwise
+ * TC_H_MAKE(0xC0000000U, 0U).
+ */
+static inline u32 tcf_auto_prio(struct tcf_proto *tp)
+{
+	if (!tp)
+		return TC_H_MAKE(0xC0000000U, 0U);
+
+	return tp->prio - 1;
+}
+
+/* Add/change/delete/get a filter node.  A pass that ends in -EAGAIN (e.g.
+ * after RTNL was dropped to load a module) is replayed from the top. */
+static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tca[TCA_MAX + 1];
+	spinlock_t *root_lock;
+	struct tcmsg *t;
+	u32 protocol;
+	u32 prio;
+	u32 nprio;
+	u32 parent;
+	struct net_device *dev;
+	struct Qdisc  *q;
+	struct tcf_proto **back, **chain;
+	struct tcf_proto *tp;
+	struct tcf_proto_ops *tp_ops;
+	const struct Qdisc_class_ops *cops;
+	unsigned long cl;
+	unsigned long fh;
+	int err;
+	int tp_created;
+
+replay:
+	tp_created = 0;	/* a replay must not inherit this from the last pass */
+	t = NLMSG_DATA(n);
+	protocol = TC_H_MIN(t->tcm_info);
+	prio = TC_H_MAJ(t->tcm_info);
+	nprio = prio;
+	parent = t->tcm_parent;
+	cl = 0;
+
+	if (prio == 0) {
+		/* If no priority is given, the user wants us to allocate one. */
+		if (n->nlmsg_type != RTM_NEWTFILTER ||
+		    !(n->nlmsg_flags & NLM_F_CREATE))
+			return -ENOENT;
+		prio = TC_H_MAKE(0x80000000U, 0U);
+	}
+
+	/* Find head of filter chain. */
+
+	/* Find link */
+	dev = __dev_get_by_index(net, t->tcm_ifindex);
+	if (dev == NULL)
+		return -ENODEV;
+
+	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	/* Find qdisc */
+	if (!parent) {
+		q = dev->qdisc;
+		parent = q->handle;
+	} else {
+		q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
+		if (q == NULL)
+			return -EINVAL;
+	}
+
+	/* Is it classful? */
+	cops = q->ops->cl_ops;
+	if (!cops)
+		return -EINVAL;
+
+	if (cops->tcf_chain == NULL)
+		return -EOPNOTSUPP;
+
+	/* Do we search for filter, attached to class? */
+	if (TC_H_MIN(parent)) {
+		cl = cops->get(q, parent);
+		if (cl == 0)
+			return -ENOENT;
+	}
+
+	/* And the last stroke */
+	chain = cops->tcf_chain(q, cl);
+	err = -EINVAL;
+	if (chain == NULL)
+		goto errout;
+
+	/* Check the chain for existence of proto-tcf with this priority */
+	for (back = chain; (tp = *back) != NULL; back = &tp->next) {
+		if (tp->prio >= prio) {
+			if (tp->prio == prio) {
+				if (!nprio ||
+				    (tp->protocol != protocol && protocol))
+					goto errout;
+			} else
+				tp = NULL;
+			break;
+		}
+	}
+
+	root_lock = qdisc_root_sleeping_lock(q);
+
+	if (tp == NULL) {
+		/* Proto-tcf does not exist, create new one */
+
+		if (tca[TCA_KIND] == NULL || !protocol)
+			goto errout;
+
+		err = -ENOENT;
+		if (n->nlmsg_type != RTM_NEWTFILTER ||
+		    !(n->nlmsg_flags & NLM_F_CREATE))
+			goto errout;
+
+		/* Create new proto tcf */
+
+		err = -ENOBUFS;
+		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+		if (tp == NULL)
+			goto errout;
+		err = -ENOENT;
+		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
+		if (tp_ops == NULL) {
+#ifdef CONFIG_MODULES
+			struct nlattr *kind = tca[TCA_KIND];
+			char name[IFNAMSIZ];
+
+			if (kind != NULL &&
+			    nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+				rtnl_unlock();
+				request_module("cls_%s", name);
+				rtnl_lock();
+				tp_ops = tcf_proto_lookup_ops(kind);
+				/* We dropped the RTNL semaphore in order to
+				 * perform the module load.  So, even if we
+				 * succeeded in loading the module we have to
+				 * replay the request.  We indicate this using
+				 * -EAGAIN.
+				 */
+				if (tp_ops != NULL) {
+					module_put(tp_ops->owner);
+					err = -EAGAIN;
+				}
+			}
+#endif
+			kfree(tp);
+			goto errout;
+		}
+		tp->ops = tp_ops;
+		tp->protocol = protocol;
+		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back));
+		tp->q = q;
+		tp->classify = tp_ops->classify;
+		tp->classid = parent;
+
+		err = tp_ops->init(tp);
+		if (err != 0) {
+			module_put(tp_ops->owner);
+			kfree(tp);
+			goto errout;
+		}
+
+		tp_created = 1;
+
+	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
+		goto errout;
+
+	fh = tp->ops->get(tp, t->tcm_handle);
+
+	if (fh == 0) {
+		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
+			spin_lock_bh(root_lock);
+			*back = tp->next;
+			spin_unlock_bh(root_lock);
+
+			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+			tcf_destroy(tp);
+			err = 0;
+			goto errout;
+		}
+
+		err = -ENOENT;
+		if (n->nlmsg_type != RTM_NEWTFILTER ||
+		    !(n->nlmsg_flags & NLM_F_CREATE))
+			goto errout;
+	} else {
+		switch (n->nlmsg_type) {
+		case RTM_NEWTFILTER:
+			err = -EEXIST;
+			if (n->nlmsg_flags & NLM_F_EXCL) {
+				if (tp_created)
+					tcf_destroy(tp);
+				goto errout;
+			}
+			break;
+		case RTM_DELTFILTER:
+			err = tp->ops->delete(tp, fh);
+			if (err == 0)
+				tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+			goto errout;
+		case RTM_GETTFILTER:
+			err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+			goto errout;
+		default:
+			err = -EINVAL;
+			goto errout;
+		}
+	}
+
+	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
+	if (err == 0) {
+		if (tp_created) {
+			spin_lock_bh(root_lock);
+			tp->next = *back;
+			*back = tp;
+			spin_unlock_bh(root_lock);
+		}
+		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+	} else {
+		if (tp_created)
+			tcf_destroy(tp);
+	}
+
+errout:
+	if (cl)
+		cops->put(q, cl);
+	if (err == -EAGAIN)
+		/* Replay the request. */
+		goto replay;
+	return err;
+}
+
+static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,
+			 unsigned long fh, u32 pid, u32 seq, u16 flags, int event)
+{
+	struct tcmsg *tcm;
+	struct nlmsghdr  *nlh;
+	unsigned char *b = skb_tail_pointer(skb);
+
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
+	tcm = NLMSG_DATA(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm__pad1 = 0;
+	tcm->tcm__pad2 = 0;
+	tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
+	tcm->tcm_parent = tp->classid;
+	tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
+	NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind);
+	tcm->tcm_handle = fh;
+	if (RTM_DELTFILTER != event) {
+		tcm->tcm_handle = 0;
+		if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
+			goto nla_put_failure;
+	}
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+			  struct nlmsghdr *n, struct tcf_proto *tp,
+			  unsigned long fh, int event)
+{
+	struct sk_buff *skb;
+	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+			      n->nlmsg_flags & NLM_F_ECHO);
+}
+
+struct tcf_dump_args {
+	struct tcf_walker w;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+};
+
+static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
+			 struct tcf_walker *arg)
+{
+	struct tcf_dump_args *a = (void *)arg;
+
+	return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
+			     a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
+}
+
+/* called with RTNL */
+static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int t;
+	int s_t;
+	struct net_device *dev;
+	struct Qdisc *q;
+	struct tcf_proto *tp, **chain;
+	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+	unsigned long cl = 0;
+	const struct Qdisc_class_ops *cops;
+	struct tcf_dump_args arg;
+
+	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+		return skb->len;
+	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return skb->len;
+
+	if (!tcm->tcm_parent)
+		q = dev->qdisc;
+	else
+		q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+	if (!q)
+		goto out;
+	cops = q->ops->cl_ops;
+	if (!cops)
+		goto errout;
+	if (cops->tcf_chain == NULL)
+		goto errout;
+	if (TC_H_MIN(tcm->tcm_parent)) {
+		cl = cops->get(q, tcm->tcm_parent);
+		if (cl == 0)
+			goto errout;
+	}
+	chain = cops->tcf_chain(q, cl);
+	if (chain == NULL)
+		goto errout;
+
+	s_t = cb->args[0];
+
+	for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+		if (t < s_t)
+			continue;
+		if (TC_H_MAJ(tcm->tcm_info) &&
+		    TC_H_MAJ(tcm->tcm_info) != tp->prio)
+			continue;
+		if (TC_H_MIN(tcm->tcm_info) &&
+		    TC_H_MIN(tcm->tcm_info) != tp->protocol)
+			continue;
+		if (t > s_t)
+			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+		if (cb->args[1] == 0) {
+			if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
+					  cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					  RTM_NEWTFILTER) <= 0)
+				break;
+
+			cb->args[1] = 1;
+		}
+		if (tp->ops->walk == NULL)
+			continue;
+		arg.w.fn = tcf_node_dump;
+		arg.skb = skb;
+		arg.cb = cb;
+		arg.w.stop = 0;
+		arg.w.skip = cb->args[1] - 1;
+		arg.w.count = 0;
+		tp->ops->walk(tp, &arg.w);
+		cb->args[1] = arg.w.count + 1;
+		if (arg.w.stop)
+			break;
+	}
+
+	cb->args[0] = t;
+
+errout:
+	if (cl)
+		cops->put(q, cl);
+out:
+	return skb->len;
+}
+
+void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (exts->action) {
+		tcf_action_destroy(exts->action, TCA_ACT_UNBIND);
+		exts->action = NULL;
+	}
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_destroy);
+
+int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb,
+		  struct nlattr *rate_tlv, struct tcf_exts *exts,
+		  const struct tcf_ext_map *map)
+{
+	memset(exts, 0, sizeof(*exts));
+
+#ifdef CONFIG_NET_CLS_ACT
+	{
+		struct tc_action *act;
+
+		if (map->police && tb[map->police]) {
+			act = tcf_action_init_1(tb[map->police], rate_tlv,
+						"police", TCA_ACT_NOREPLACE,
+						TCA_ACT_BIND);
+			if (IS_ERR(act))
+				return PTR_ERR(act);
+
+			act->type = TCA_OLD_COMPAT;
+			exts->action = act;
+		} else if (map->action && tb[map->action]) {
+			act = tcf_action_init(tb[map->action], rate_tlv, NULL,
+					      TCA_ACT_NOREPLACE, TCA_ACT_BIND);
+			if (IS_ERR(act))
+				return PTR_ERR(act);
+
+			exts->action = act;
+		}
+	}
+#else
+	if ((map->action && tb[map->action]) ||
+	    (map->police && tb[map->police]))
+		return -EOPNOTSUPP;
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL(tcf_exts_validate);
+
+void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
+		     struct tcf_exts *src)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (src->action) {
+		struct tc_action *act;
+		tcf_tree_lock(tp);
+		act = dst->action;
+		dst->action = src->action;
+		tcf_tree_unlock(tp);
+		if (act)
+			tcf_action_destroy(act, TCA_ACT_UNBIND);
+	}
+#endif
+}
+EXPORT_SYMBOL(tcf_exts_change);
+
+int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
+		  const struct tcf_ext_map *map)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (map->action && exts->action) {
+		/*
+		 * again for backward compatible mode - we want
+		 * to work with both old and new modes of entering
+		 * tc data even if iproute2  was newer - jhs
+		 */
+		struct nlattr *nest;
+
+		if (exts->action->type != TCA_OLD_COMPAT) {
+			nest = nla_nest_start(skb, map->action);
+			if (nest == NULL)
+				goto nla_put_failure;
+			if (tcf_action_dump(skb, exts->action, 0, 0) < 0)
+				goto nla_put_failure;
+			nla_nest_end(skb, nest);
+		} else if (map->police) {
+			nest = nla_nest_start(skb, map->police);
+			if (nest == NULL)
+				goto nla_put_failure;
+			if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0)
+				goto nla_put_failure;
+			nla_nest_end(skb, nest);
+		}
+	}
+#endif
+	return 0;
+nla_put_failure: __attribute__ ((unused))
+	return -1;
+}
+EXPORT_SYMBOL(tcf_exts_dump);
+
+
+int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
+			const struct tcf_ext_map *map)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (exts->action)
+		if (tcf_action_copy_stats(skb, exts->action, 1) < 0)
+			goto nla_put_failure;
+#endif
+	return 0;
+nla_put_failure: __attribute__ ((unused))
+	return -1;
+}
+EXPORT_SYMBOL(tcf_exts_dump_stats);
+
+static int __init tc_filter_init(void)
+{
+	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
+						 tc_dump_tfilter);
+
+	return 0;
+}
+
+subsys_initcall(tc_filter_init);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
new file mode 100644
index 00000000..8be8872d
--- /dev/null
+++ b/net/sched/cls_basic.c
@@ -0,0 +1,306 @@
+/*
+ * net/sched/cls_basic.c	Basic Packet Classifier.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+struct basic_head {
+	u32			hgenerator;
+	struct list_head	flist;
+};
+
+struct basic_filter {
+	u32			handle;
+	struct tcf_exts		exts;
+	struct tcf_ematch_tree	ematches;
+	struct tcf_result	res;
+	struct list_head	link;
+};
+
+static const struct tcf_ext_map basic_ext_map = {
+	.action = TCA_BASIC_ACT,
+	.police = TCA_BASIC_POLICE
+};
+
+static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			  struct tcf_result *res)
+{
+	int r;
+	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_filter *f;
+
+	list_for_each_entry(f, &head->flist, link) {
+		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+			continue;
+		*res = f->res;
+		r = tcf_exts_exec(skb, &f->exts, res);
+		if (r < 0)
+			continue;
+		return r;
+	}
+	return -1;
+}
+
+/* Look up the filter with @handle; returns it cast to unsigned long, or 0. */
+static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
+{
+	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_filter *f;
+
+	if (head == NULL)
+		return 0UL;
+
+	/* Stop at the first hit instead of always walking the whole list. */
+	list_for_each_entry(f, &head->flist, link)
+		if (f->handle == handle)
+			return (unsigned long) f;
+	return 0UL;
+}
+
+static void basic_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+/* ->init hook: allocate the zeroed per-proto head with an empty filter list. */
+static int basic_init(struct tcf_proto *tp)
+{
+	struct basic_head *root = kzalloc(sizeof(*root), GFP_KERNEL);
+
+	if (!root)
+		return -ENOBUFS;
+	INIT_LIST_HEAD(&root->flist);
+	tp->root = root;
+	return 0;
+}
+
+static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
+{
+	tcf_unbind_filter(tp, &f->res);
+	tcf_exts_destroy(tp, &f->exts);
+	tcf_em_tree_destroy(tp, &f->ematches);
+	kfree(f);
+}
+
+static void basic_destroy(struct tcf_proto *tp)
+{
+	struct basic_head *head = tp->root;
+	struct basic_filter *f, *n;
+
+	list_for_each_entry_safe(f, n, &head->flist, link) {
+		list_del(&f->link);
+		basic_delete_filter(tp, f);
+	}
+	kfree(head);
+}
+
+static int basic_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_filter *t, *f = (struct basic_filter *) arg;
+
+	list_for_each_entry(t, &head->flist, link)
+		if (t == f) {
+			tcf_tree_lock(tp);
+			list_del(&t->link);
+			tcf_tree_unlock(tp);
+			basic_delete_filter(tp, t);
+			return 0;
+		}
+
+	return -ENOENT;
+}
+
+static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
+	[TCA_BASIC_CLASSID]	= { .type = NLA_U32 },
+	[TCA_BASIC_EMATCHES]	= { .type = NLA_NESTED },
+};
+
+static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
+			   unsigned long base, struct nlattr **tb,
+			   struct nlattr *est)
+{
+	int err = -EINVAL;
+	struct tcf_exts e;
+	struct tcf_ematch_tree t;
+
+	err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
+	if (err < 0)
+		return err;
+
+	err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
+	if (err < 0)
+		goto errout;
+
+	if (tb[TCA_BASIC_CLASSID]) {
+		f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]);
+		tcf_bind_filter(tp, &f->res, base);
+	}
+
+	tcf_exts_change(tp, &f->exts, &e);
+	tcf_em_tree_change(tp, &f->ematches, &t);
+
+	return 0;
+errout:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+
+static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+			struct nlattr **tca, unsigned long *arg)
+{
+	int err;
+	struct basic_head *head = (struct basic_head *) tp->root;
+	struct nlattr *tb[TCA_BASIC_MAX + 1];
+	struct basic_filter *f = (struct basic_filter *) *arg;
+
+	if (tca[TCA_OPTIONS] == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
+			       basic_policy);
+	if (err < 0)
+		return err;
+
+	if (f != NULL) {
+		if (handle && f->handle != handle)
+			return -EINVAL;
+		return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+	}
+
+	err = -ENOBUFS;
+	f = kzalloc(sizeof(*f), GFP_KERNEL);
+	if (f == NULL)
+		goto errout;
+
+	err = -EINVAL;
+	if (handle)
+		f->handle = handle;
+	else {
+		unsigned int i = 0x80000000;
+		do {
+			if (++head->hgenerator == 0x7FFFFFFF)
+				head->hgenerator = 1;
+		} while (--i > 0 && basic_get(tp, head->hgenerator));
+
+		if (i <= 0) {
+			pr_err("Insufficient number of handles\n");
+			goto errout;
+		}
+
+		f->handle = head->hgenerator;
+	}
+
+	err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+	if (err < 0)
+		goto errout;
+
+	tcf_tree_lock(tp);
+	list_add(&f->link, &head->flist);
+	tcf_tree_unlock(tp);
+	*arg = (unsigned long) f;
+
+	return 0;
+errout:
+	if (*arg == 0UL && f)
+		kfree(f);
+
+	return err;
+}
+
+static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_filter *f;
+
+	list_for_each_entry(f, &head->flist, link) {
+		if (arg->count < arg->skip)
+			goto skip;
+
+		if (arg->fn(tp, (unsigned long) f, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+skip:
+		arg->count++;
+	}
+}
+
+static int basic_dump(struct tcf_proto *tp, unsigned long fh,
+		      struct sk_buff *skb, struct tcmsg *t)
+{
+	struct basic_filter *f = (struct basic_filter *) fh;
+	struct nlattr *nest;
+
+	if (f == NULL)
+		return skb->len;
+
+	t->tcm_handle = f->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (f->res.classid)
+		NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid);
+
+	if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
+	    tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts, &basic_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct tcf_proto_ops cls_basic_ops __read_mostly = {
+	.kind		=	"basic",
+	.classify	=	basic_classify,
+	.init		=	basic_init,
+	.destroy	=	basic_destroy,
+	.get		=	basic_get,
+	.put		=	basic_put,
+	.change		=	basic_change,
+	.delete		=	basic_delete,
+	.walk		=	basic_walk,
+	.dump		=	basic_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init init_basic(void)
+{
+	return register_tcf_proto_ops(&cls_basic_ops);
+}
+
+static void __exit exit_basic(void)
+{
+	unregister_tcf_proto_ops(&cls_basic_ops);
+}
+
+module_init(init_basic)
+module_exit(exit_basic)
+MODULE_LICENSE("GPL");
+
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
new file mode 100644
index 00000000..32a33519
--- /dev/null
+++ b/net/sched/cls_cgroup.c
@@ -0,0 +1,325 @@
+/*
+ * net/sched/cls_cgroup.c	Control Group Classifier
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/cls_cgroup.h>
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+					       struct cgroup *cgrp);
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
+
+/*
+ * The "net_cls" cgroup subsystem.
+ *
+ * subsys_id is only set statically when CONFIG_NET_CLS_CGROUP is defined;
+ * otherwise init_cgroup_cls() reads it back after cgroup_load_subsys().
+ */
+struct cgroup_subsys net_cls_subsys = {
+	.name		= "net_cls",
+	.create		= cgrp_create,
+	.destroy	= cgrp_destroy,
+	.populate	= cgrp_populate,
+#ifdef CONFIG_NET_CLS_CGROUP
+	.subsys_id	= net_cls_subsys_id,
+#endif
+	.module		= THIS_MODULE,
+};
+
+
+/* Return the cgroup_cls_state that embeds @cgrp's net_cls subsystem state. */
+static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
+			    struct cgroup_cls_state, css);
+}
+
+/* Return the cgroup_cls_state that embeds task @p's net_cls subsystem state. */
+static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, net_cls_subsys_id),
+			    struct cgroup_cls_state, css);
+}
+
+/*
+ * Allocate the net_cls state for a new cgroup.
+ *
+ * The state starts zeroed; a cgroup that has a parent begins with the
+ * parent's classid.  Returns the embedded css, or ERR_PTR(-ENOMEM) when
+ * the allocation fails.
+ */
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+						 struct cgroup *cgrp)
+{
+	struct cgroup_cls_state *state = kzalloc(sizeof(*state), GFP_KERNEL);
+
+	if (state == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (cgrp->parent != NULL)
+		state->classid = cgrp_cls_state(cgrp->parent)->classid;
+
+	return &state->css;
+}
+
+/* Free the net_cls state that cgrp_create() allocated for @cgrp. */
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	kfree(cgrp_cls_state(cgrp));
+}
+
+/* read_u64 handler of the "classid" control file: return @cgrp's classid. */
+static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
+{
+	return cgrp_cls_state(cgrp)->classid;
+}
+
+/*
+ * write_u64 handler of the "classid" control file.
+ *
+ * Only the low 32 bits of @value are stored; the upper bits are dropped
+ * by the cast.  Always returns 0.
+ */
+static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
+{
+	cgrp_cls_state(cgrp)->classid = (u32) value;
+	return 0;
+}
+
+/* Control files added to each cgroup by cgrp_populate(). */
+static struct cftype ss_files[] = {
+	{
+		.name = "classid",
+		.read_u64 = read_classid,
+		.write_u64 = write_classid,
+	},
+};
+
+/* Add the ss_files control files to @cgrp; returns cgroup_add_files()'s result. */
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
+}
+
+/* Per-tcf_proto state of the cgroup classifier, stored in tp->root. */
+struct cls_cgroup_head {
+	u32			handle;		/* handle given when the filter was created */
+	struct tcf_exts		exts;		/* extensions run on a match */
+	struct tcf_ematch_tree	ematches;	/* ematch tree a packet must satisfy */
+};
+
+/*
+ * Classify @skb by the net_cls classid of the sending task's cgroup.
+ *
+ * Returns -1 (no match) when no classid is available, when the classid is
+ * zero or when the ematch tree does not match; otherwise stores the
+ * classid in @res and returns the result of tcf_exts_exec().
+ */
+static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			       struct tcf_result *res)
+{
+	struct cls_cgroup_head *head = tp->root;
+	u32 classid;
+
+	rcu_read_lock();
+	classid = task_cls_state(current)->classid;
+	rcu_read_unlock();
+
+	/*
+	 * Due to the nature of the classifier it is required to ignore all
+	 * packets originating from softirq context as accessing `current'
+	 * would lead to false results.
+	 *
+	 * This test assumes that all callers of dev_queue_xmit() explicitly
+	 * disable bh. Knowing this, it is possible to detect softirq based
+	 * calls by looking at the number of nested bh disable calls because
+	 * softirqs always disable bh.
+	 *
+	 * NOTE(review): the check below uses in_serving_softirq() rather than
+	 * counting nested bh-disable calls; the paragraph above may describe
+	 * an earlier form of the test -- confirm.
+	 */
+	if (in_serving_softirq()) {
+		/* If there is an sk_classid we'll use that. */
+		if (!skb->sk)
+			return -1;
+		classid = skb->sk->sk_classid;
+	}
+
+	if (!classid)
+		return -1;
+
+	if (!tcf_em_tree_match(skb, &head->ematches, NULL))
+		return -1;
+
+	res->classid = classid;
+	res->class = 0;
+	return tcf_exts_exec(skb, &head->exts, res);
+}
+
+/* .get operation: always returns 0, i.e. no filter is ever looked up by handle. */
+static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
+{
+	return 0UL;
+}
+
+/* .put operation: nothing to release. */
+static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+/* .init operation: no setup here; tp->root is allocated by cls_cgroup_change(). */
+static int cls_cgroup_init(struct tcf_proto *tp)
+{
+	return 0;
+}
+
+/* Netlink attribute types carrying this classifier's action and police extensions. */
+static const struct tcf_ext_map cgroup_ext_map = {
+	.action = TCA_CGROUP_ACT,
+	.police = TCA_CGROUP_POLICE,
+};
+
+/* Validation policy for the attributes nested inside TCA_OPTIONS. */
+static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
+	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Create or update the single cgroup filter of @tp.
+ *
+ * On first use (tp->root == NULL) a non-zero @handle is required and a
+ * head is allocated and published under the tree lock.  Later calls must
+ * use the same handle.  The extensions and the ematch tree are validated
+ * into temporaries first and only then swapped into the head.
+ *
+ * Returns 0 on success, -EINVAL for missing options or a zero handle on
+ * creation, -ENOBUFS when the head cannot be allocated, -ENOENT for a
+ * handle mismatch, or the error of the failing parse/validate step.
+ */
+static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
+			     u32 handle, struct nlattr **tca,
+			     unsigned long *arg)
+{
+	struct nlattr *tb[TCA_CGROUP_MAX + 1];
+	struct cls_cgroup_head *head = tp->root;
+	struct tcf_ematch_tree t;
+	struct tcf_exts e;
+	int err;
+
+	if (!tca[TCA_OPTIONS])
+		return -EINVAL;
+
+	if (head == NULL) {
+		if (!handle)
+			return -EINVAL;
+
+		head = kzalloc(sizeof(*head), GFP_KERNEL);
+		if (head == NULL)
+			return -ENOBUFS;
+
+		head->handle = handle;
+
+		tcf_tree_lock(tp);
+		tp->root = head;
+		tcf_tree_unlock(tp);
+	}
+
+	if (handle != head->handle)
+		return -ENOENT;
+
+	err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
+			       cgroup_policy);
+	if (err < 0)
+		return err;
+
+	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map);
+	if (err < 0)
+		return err;
+
+	err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
+	if (err < 0) {
+		/*
+		 * The extensions validated above are still owned by the
+		 * temporary 'e'; release them or they are leaked.
+		 */
+		tcf_exts_destroy(tp, &e);
+		return err;
+	}
+
+	tcf_exts_change(tp, &head->exts, &e);
+	tcf_em_tree_change(tp, &head->ematches, &t);
+
+	return 0;
+}
+
+/*
+ * Tear down the classifier instance: release the extensions and the
+ * ematch tree held by the head, then the head itself.  Nothing is done
+ * when no head was ever allocated.
+ */
+static void cls_cgroup_destroy(struct tcf_proto *tp)
+{
+	struct cls_cgroup_head *head = tp->root;
+
+	if (head == NULL)
+		return;
+
+	tcf_exts_destroy(tp, &head->exts);
+	tcf_em_tree_destroy(tp, &head->ematches);
+	kfree(head);
+}
+
+/* .delete operation: not supported, always returns -EOPNOTSUPP. */
+static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Walk the classifier's single entry (the head in tp->root).
+ *
+ * The callback is skipped while arg->count is below arg->skip.  A
+ * negative callback result sets arg->stop and returns without counting
+ * the entry; otherwise arg->count is incremented.
+ */
+static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct cls_cgroup_head *head = tp->root;
+
+	if (arg->count >= arg->skip &&
+	    arg->fn(tp, (unsigned long) head, arg) < 0) {
+		arg->stop = 1;
+		return;
+	}
+
+	arg->count++;
+}
+
+/*
+ * Dump the cgroup filter into a netlink message.
+ *
+ * @fh is not used; the head is always taken from tp->root and is
+ * dereferenced unconditionally.  Extensions and the ematch tree go into
+ * a TCA_OPTIONS nest, extension statistics after it.
+ *
+ * Returns skb->len on success; on failure the message is trimmed back to
+ * its length at entry and -1 is returned.
+ */
+static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
+			   struct sk_buff *skb, struct tcmsg *t)
+{
+	struct cls_cgroup_head *head = tp->root;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
+	struct nlattr *nest;
+
+	t->tcm_handle = head->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 ||
+	    tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* Operations table registered for the classifier kind "cgroup". */
+static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
+	.kind		=	"cgroup",
+	.init		=	cls_cgroup_init,
+	.change		=	cls_cgroup_change,
+	.classify	=	cls_cgroup_classify,
+	.destroy	=	cls_cgroup_destroy,
+	.get		=	cls_cgroup_get,
+	.put		=	cls_cgroup_put,
+	.delete		=	cls_cgroup_delete,
+	.walk		=	cls_cgroup_walk,
+	.dump		=	cls_cgroup_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/*
+ * Module init: load the net_cls cgroup subsystem, publish its id (when
+ * the id is assigned dynamically) and register the "cgroup" classifier.
+ *
+ * If classifier registration fails, the published subsystem id is
+ * withdrawn again before the subsystem is unloaded, in the same way
+ * exit_cgroup_cls() does, so that net_cls_subsys_id never refers to an
+ * unloaded subsystem.
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+static int __init init_cgroup_cls(void)
+{
+	int ret;
+
+	ret = cgroup_load_subsys(&net_cls_subsys);
+	if (ret)
+		goto out;
+
+#ifndef CONFIG_NET_CLS_CGROUP
+	/* We can't use rcu_assign_pointer because this is an int. */
+	smp_wmb();
+	net_cls_subsys_id = net_cls_subsys.subsys_id;
+#endif
+
+	ret = register_tcf_proto_ops(&cls_cgroup_ops);
+	if (ret) {
+#ifndef CONFIG_NET_CLS_CGROUP
+		/* Undo the publication above before the id becomes stale. */
+		net_cls_subsys_id = -1;
+		synchronize_rcu();
+#endif
+		cgroup_unload_subsys(&net_cls_subsys);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Module exit: unregister the classifier, then (when the subsystem id is
+ * assigned dynamically) reset net_cls_subsys_id to -1 and wait for an RCU
+ * grace period before unloading the cgroup subsystem.
+ */
+static void __exit exit_cgroup_cls(void)
+{
+	unregister_tcf_proto_ops(&cls_cgroup_ops);
+
+#ifndef CONFIG_NET_CLS_CGROUP
+	net_cls_subsys_id = -1;
+	synchronize_rcu();
+#endif
+
+	cgroup_unload_subsys(&net_cls_subsys);
+}
+
+module_init(init_cgroup_cls);
+module_exit(exit_cgroup_cls);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
new file mode 100644
index 00000000..8ec01391
--- /dev/null
+++ b/net/sched/cls_flow.c
@@ -0,0 +1,739 @@
+/*
+ * net/sched/cls_flow.c		Generic flow classifier
+ *
+ * Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/pkt_cls.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/slab.h>
+
+#include <net/pkt_cls.h>
+#include <net/ip.h>
+#include <net/route.h>
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
+/* Per-tcf_proto state of the flow classifier, stored in tp->root. */
+struct flow_head {
+	struct list_head	filters;	/* list of struct flow_filter */
+};
+
+/* One flow filter; linked on flow_head.filters through 'list'. */
+struct flow_filter {
+	struct list_head	list;
+	struct tcf_exts		exts;		/* extensions run on a match */
+	struct tcf_ematch_tree	ematches;	/* ematch tree a packet must satisfy */
+	struct timer_list	perturb_timer;	/* runs flow_perturbation() */
+	u32			perturb_period;	/* in jiffies (TCA_FLOW_PERTURB * HZ); 0 = off */
+	u32			handle;
+
+	u32			nkeys;		/* number of bits set in keymask */
+	u32			keymask;	/* bitmask of selected FLOW_KEY_* values */
+	u32			mode;		/* FLOW_MODE_MAP or FLOW_MODE_HASH */
+	u32			mask;		/* map mode: ANDed with the key */
+	u32			xor;		/* map mode: XORed after masking */
+	u32			rshift;		/* map mode: right shift after xor */
+	u32			addend;		/* map mode: added after shifting */
+	u32			divisor;	/* if non-zero, classid is taken modulo this */
+	u32			baseclass;	/* class id the computed value is added to */
+	u32			hashrnd;	/* hash mode: seed passed to jhash2() */
+};
+
+/* Netlink attribute types carrying this classifier's action and police extensions. */
+static const struct tcf_ext_map flow_ext_map = {
+	.action	= TCA_FLOW_ACT,
+	.police	= TCA_FLOW_POLICE,
+};
+
+/*
+ * Reduce a pointer value to 32 bits: the low 32 bits, XORed with the
+ * upper 32 bits when unsigned long is wider than 32 bits.
+ */
+static inline u32 addr_fold(void *addr)
+{
+	unsigned long a = (unsigned long)addr;
+
+	return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
+}
+
+/*
+ * FLOW_KEY_SRC: the IPv4 source address, or the last 32 bits of the IPv6
+ * source address, in host byte order.  For other protocols, or when the
+ * header cannot be pulled, falls back to the folded skb->sk pointer.
+ */
+static u32 flow_get_src(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
+			return ntohl(ip_hdr(skb)->saddr);
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+			return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
+		break;
+	}
+
+	return addr_fold(skb->sk);
+}
+
+/*
+ * FLOW_KEY_DST: the IPv4 destination address, or the last 32 bits of the
+ * IPv6 destination address, in host byte order.  For other protocols, or
+ * when the header cannot be pulled, falls back to the folded dst entry
+ * pointer XORed with skb->protocol.
+ */
+static u32 flow_get_dst(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
+			return ntohl(ip_hdr(skb)->daddr);
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+			return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
+		break;
+	}
+
+	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
+}
+
+/*
+ * FLOW_KEY_PROTO: the IPv4 protocol field or the IPv6 nexthdr field.
+ * Returns 0 for other protocols or when the header cannot be pulled.
+ */
+static u32 flow_get_proto(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return pskb_network_may_pull(skb, sizeof(struct iphdr)) ?
+		       ip_hdr(skb)->protocol : 0;
+	case htons(ETH_P_IPV6):
+		return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ?
+		       ipv6_hdr(skb)->nexthdr : 0;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * FLOW_KEY_PROTO_SRC: the 16-bit value found at the transport protocol's
+ * port offset (proto_ports_offset()) behind the IPv4/IPv6 header, in host
+ * byte order.
+ *
+ * Falls back to the folded skb->sk pointer for other protocols, for IPv4
+ * fragments, when the protocol has no port offset, or when the needed
+ * bytes cannot be pulled.
+ */
+static u32 flow_get_proto_src(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP): {
+		struct iphdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ip_hdr(skb);
+		if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) {
+			/* header pointer is fetched again after the pull */
+			iph = ip_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
+						 poff));
+		}
+		break;
+	}
+	case htons(ETH_P_IPV6): {
+		struct ipv6hdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ipv6_hdr(skb);
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) {
+			/* header pointer is fetched again after the pull */
+			iph = ipv6_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
+						 poff));
+		}
+		break;
+	}
+	}
+
+	return addr_fold(skb->sk);
+}
+
+/*
+ * FLOW_KEY_PROTO_DST: the 16-bit value found two bytes past the transport
+ * protocol's port offset (proto_ports_offset()) behind the IPv4/IPv6
+ * header, in host byte order.
+ *
+ * Falls back to the folded dst entry pointer XORed with skb->protocol for
+ * other protocols, for IPv4 fragments, when the protocol has no port
+ * offset, or when the needed bytes cannot be pulled.
+ */
+static u32 flow_get_proto_dst(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP): {
+		struct iphdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ip_hdr(skb);
+		if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+			/* header pointer is fetched again after the pull */
+			iph = ip_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
+						 2 + poff));
+		}
+		break;
+	}
+	case htons(ETH_P_IPV6): {
+		struct ipv6hdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ipv6_hdr(skb);
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) {
+			/* header pointer is fetched again after the pull */
+			iph = ipv6_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
+						 poff + 2));
+		}
+		break;
+	}
+	}
+
+	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
+}
+
+/* FLOW_KEY_IIF: the skb's skb_iif field. */
+static u32 flow_get_iif(const struct sk_buff *skb)
+{
+	return skb->skb_iif;
+}
+
+/* FLOW_KEY_PRIORITY: the skb's priority field. */
+static u32 flow_get_priority(const struct sk_buff *skb)
+{
+	return skb->priority;
+}
+
+/* FLOW_KEY_MARK: the skb's mark field. */
+static u32 flow_get_mark(const struct sk_buff *skb)
+{
+	return skb->mark;
+}
+
+/*
+ * FLOW_KEY_NFCT: the folded skb->nfct pointer, or 0 when conntrack is
+ * not configured.
+ */
+static u32 flow_get_nfct(const struct sk_buff *skb)
+{
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	return addr_fold(skb->nfct);
+#else
+	return 0;
+#endif
+}
+
+/*
+ * CTTUPLE(skb, member): evaluate to 'member' of the conntrack tuple for
+ * the packet's direction.  The enclosing function must provide a
+ * 'fallback' label: the macro jumps there when the skb has no conntrack
+ * entry, and unconditionally when conntrack is not configured.
+ */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#define CTTUPLE(skb, member)						\
+({									\
+	enum ip_conntrack_info ctinfo;					\
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);			\
+	if (ct == NULL)							\
+		goto fallback;						\
+	ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member;			\
+})
+#else
+#define CTTUPLE(skb, member)						\
+({									\
+	goto fallback;							\
+	0;								\
+})
+#endif
+
+/*
+ * FLOW_KEY_NFCT_SRC: source address from the conntrack tuple (IPv4
+ * address, or last 32 bits of the IPv6 address).  Falls back to
+ * flow_get_src() for other protocols or when CTTUPLE() finds no entry.
+ */
+static u32 flow_get_nfct_src(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return ntohl(CTTUPLE(skb, src.u3.ip));
+	case htons(ETH_P_IPV6):
+		return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
+	}
+fallback:
+	return flow_get_src(skb);
+}
+
+/*
+ * FLOW_KEY_NFCT_DST: destination address from the conntrack tuple (IPv4
+ * address, or last 32 bits of the IPv6 address).  Falls back to
+ * flow_get_dst() for other protocols or when CTTUPLE() finds no entry.
+ */
+static u32 flow_get_nfct_dst(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return ntohl(CTTUPLE(skb, dst.u3.ip));
+	case htons(ETH_P_IPV6):
+		return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
+	}
+fallback:
+	return flow_get_dst(skb);
+}
+
+/*
+ * FLOW_KEY_NFCT_PROTO_SRC: src.u.all of the conntrack tuple in host byte
+ * order; falls back to flow_get_proto_src() when CTTUPLE() finds no entry.
+ */
+static u32 flow_get_nfct_proto_src(struct sk_buff *skb)
+{
+	return ntohs(CTTUPLE(skb, src.u.all));
+fallback:
+	return flow_get_proto_src(skb);
+}
+
+/*
+ * FLOW_KEY_NFCT_PROTO_DST: dst.u.all of the conntrack tuple in host byte
+ * order; falls back to flow_get_proto_dst() when CTTUPLE() finds no entry.
+ */
+static u32 flow_get_nfct_proto_dst(struct sk_buff *skb)
+{
+	return ntohs(CTTUPLE(skb, dst.u.all));
+fallback:
+	return flow_get_proto_dst(skb);
+}
+
+/*
+ * FLOW_KEY_RTCLASSID: tclassid of the skb's dst entry.  Returns 0 when
+ * CONFIG_IP_ROUTE_CLASSID is off or the skb has no dst entry.
+ */
+static u32 flow_get_rtclassid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (skb_dst(skb))
+		return skb_dst(skb)->tclassid;
+#endif
+	return 0;
+}
+
+/*
+ * FLOW_KEY_SKUID: fsuid from the credentials of the file behind the
+ * skb's socket; 0 when the skb has no socket, or the socket no file.
+ */
+static u32 flow_get_skuid(const struct sk_buff *skb)
+{
+	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
+		return skb->sk->sk_socket->file->f_cred->fsuid;
+	return 0;
+}
+
+/*
+ * FLOW_KEY_SKGID: fsgid from the credentials of the file behind the
+ * skb's socket; 0 when the skb has no socket, or the socket no file.
+ */
+static u32 flow_get_skgid(const struct sk_buff *skb)
+{
+	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
+		return skb->sk->sk_socket->file->f_cred->fsgid;
+	return 0;
+}
+
+/*
+ * FLOW_KEY_VLAN_TAG: the VLAN id bits (VLAN_VID_MASK) of the tag returned
+ * by vlan_get_tag(), or 0 when vlan_get_tag() reports an error.
+ */
+static u32 flow_get_vlan_tag(const struct sk_buff *skb)
+{
+	u16 uninitialized_var(tag);
+
+	if (vlan_get_tag(skb, &tag) < 0)
+		return 0;
+	return tag & VLAN_VID_MASK;
+}
+
+/* FLOW_KEY_RXHASH: the value returned by skb_get_rxhash(). */
+static u32 flow_get_rxhash(struct sk_buff *skb)
+{
+	return skb_get_rxhash(skb);
+}
+
+/*
+ * Extract the value of one flow key from @skb.
+ *
+ * @key is a FLOW_KEY_* value (a bit position of the filter's keymask).
+ * An unknown key triggers WARN_ON and yields 0.
+ */
+static u32 flow_key_get(struct sk_buff *skb, int key)
+{
+	switch (key) {
+	case FLOW_KEY_SRC:
+		return flow_get_src(skb);
+	case FLOW_KEY_DST:
+		return flow_get_dst(skb);
+	case FLOW_KEY_PROTO:
+		return flow_get_proto(skb);
+	case FLOW_KEY_PROTO_SRC:
+		return flow_get_proto_src(skb);
+	case FLOW_KEY_PROTO_DST:
+		return flow_get_proto_dst(skb);
+	case FLOW_KEY_IIF:
+		return flow_get_iif(skb);
+	case FLOW_KEY_PRIORITY:
+		return flow_get_priority(skb);
+	case FLOW_KEY_MARK:
+		return flow_get_mark(skb);
+	case FLOW_KEY_NFCT:
+		return flow_get_nfct(skb);
+	case FLOW_KEY_NFCT_SRC:
+		return flow_get_nfct_src(skb);
+	case FLOW_KEY_NFCT_DST:
+		return flow_get_nfct_dst(skb);
+	case FLOW_KEY_NFCT_PROTO_SRC:
+		return flow_get_nfct_proto_src(skb);
+	case FLOW_KEY_NFCT_PROTO_DST:
+		return flow_get_nfct_proto_dst(skb);
+	case FLOW_KEY_RTCLASSID:
+		return flow_get_rtclassid(skb);
+	case FLOW_KEY_SKUID:
+		return flow_get_skuid(skb);
+	case FLOW_KEY_SKGID:
+		return flow_get_skgid(skb);
+	case FLOW_KEY_VLAN_TAG:
+		return flow_get_vlan_tag(skb);
+	case FLOW_KEY_RXHASH:
+		return flow_get_rxhash(skb);
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+}
+
+/*
+ * Classify @skb against the filters of @tp, in list order.
+ *
+ * For the first filter whose ematch tree matches, the selected keys are
+ * extracted in ascending FLOW_KEY_* order.  In hash mode the class id is
+ * jhash2() over all keys; otherwise it is derived from the first key as
+ * ((key & mask) ^ xor) >> rshift, plus addend.  A non-zero divisor
+ * reduces the value modulo divisor, and the result is added to the minor
+ * part of baseclass.
+ *
+ * Returns the result of tcf_exts_exec() for the first filter where it is
+ * non-negative, or -1 when no filter applies.
+ */
+static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			 struct tcf_result *res)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f;
+	u32 keymask;
+	u32 classid;
+	unsigned int n, key;
+	int r;
+
+	list_for_each_entry(f, &head->filters, list) {
+		/*
+		 * Fixed-size buffer instead of a variable-length array on
+		 * the packet path: flow_change() rejects key masks with
+		 * bits above FLOW_KEY_MAX, so nkeys <= FLOW_KEY_MAX + 1.
+		 */
+		u32 keys[FLOW_KEY_MAX + 1];
+
+		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
+			continue;
+
+		keymask = f->keymask;
+
+		for (n = 0; n < f->nkeys; n++) {
+			key = ffs(keymask) - 1;
+			keymask &= ~(1U << key);
+			keys[n] = flow_key_get(skb, key);
+		}
+
+		if (f->mode == FLOW_MODE_HASH)
+			classid = jhash2(keys, f->nkeys, f->hashrnd);
+		else {
+			classid = keys[0];
+			classid = (classid & f->mask) ^ f->xor;
+			classid = (classid >> f->rshift) + f->addend;
+		}
+
+		if (f->divisor)
+			classid %= f->divisor;
+
+		res->class   = 0;
+		res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
+
+		r = tcf_exts_exec(skb, &f->exts, res);
+		if (r < 0)
+			continue;
+		return r;
+	}
+	return -1;
+}
+
+/*
+ * Timer callback of flow_filter.perturb_timer; @arg is the filter.
+ * Picks a new random hash seed and re-arms the timer while a
+ * perturbation period is configured.
+ */
+static void flow_perturbation(unsigned long arg)
+{
+	struct flow_filter *f = (struct flow_filter *)arg;
+
+	get_random_bytes(&f->hashrnd, 4);
+	if (f->perturb_period)
+		mod_timer(&f->perturb_timer, jiffies + f->perturb_period);
+}
+
+/* Validation policy for the attributes nested inside TCA_OPTIONS. */
+static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
+	[TCA_FLOW_KEYS]		= { .type = NLA_U32 },
+	[TCA_FLOW_MODE]		= { .type = NLA_U32 },
+	[TCA_FLOW_BASECLASS]	= { .type = NLA_U32 },
+	[TCA_FLOW_RSHIFT]	= { .type = NLA_U32 },
+	[TCA_FLOW_ADDEND]	= { .type = NLA_U32 },
+	[TCA_FLOW_MASK]		= { .type = NLA_U32 },
+	[TCA_FLOW_XOR]		= { .type = NLA_U32 },
+	[TCA_FLOW_DIVISOR]	= { .type = NLA_U32 },
+	[TCA_FLOW_ACT]		= { .type = NLA_NESTED },
+	[TCA_FLOW_POLICE]	= { .type = NLA_NESTED },
+	[TCA_FLOW_EMATCHES]	= { .type = NLA_NESTED },
+	[TCA_FLOW_PERTURB]	= { .type = NLA_U32 },
+};
+
+/*
+ * Create a new flow filter or update an existing one.
+ *
+ * *@arg is the existing filter (struct flow_filter * cast to unsigned
+ * long) or 0 to create one; on success it is set to the filter.
+ *
+ * All attributes are validated before anything is modified.  Creation
+ * requires a non-zero @handle and TCA_FLOW_KEYS.  More than one key and
+ * TCA_FLOW_PERTURB are only accepted in hash mode.  TCA_FLOW_RSHIFT must
+ * be below 32, because flow_classify() uses it as the shift count of a
+ * 32-bit value.
+ *
+ * Returns 0 on success, -EINVAL for invalid attributes, -EOPNOTSUPP for
+ * key bits above FLOW_KEY_MAX, -ENOBUFS when the filter cannot be
+ * allocated, or the error of a failing parse/validate step.
+ */
+static int flow_change(struct tcf_proto *tp, unsigned long base,
+		       u32 handle, struct nlattr **tca,
+		       unsigned long *arg)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_FLOW_MAX + 1];
+	struct tcf_exts e;
+	struct tcf_ematch_tree t;
+	unsigned int nkeys = 0;
+	unsigned int perturb_period = 0;
+	u32 baseclass = 0;
+	u32 keymask = 0;
+	u32 mode;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_FLOW_BASECLASS]) {
+		baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
+		if (TC_H_MIN(baseclass) == 0)
+			return -EINVAL;
+	}
+
+	if (tb[TCA_FLOW_KEYS]) {
+		keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
+
+		nkeys = hweight32(keymask);
+		if (nkeys == 0)
+			return -EINVAL;
+
+		if (fls(keymask) - 1 > FLOW_KEY_MAX)
+			return -EOPNOTSUPP;
+	}
+
+	/*
+	 * flow_classify() evaluates "classid >> rshift" on a u32; a shift
+	 * count of 32 or more is undefined behaviour, so refuse it here.
+	 */
+	if (tb[TCA_FLOW_RSHIFT] && nla_get_u32(tb[TCA_FLOW_RSHIFT]) > 31)
+		return -EINVAL;
+
+	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
+	if (err < 0)
+		return err;
+
+	err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
+	if (err < 0)
+		goto err1;
+
+	f = (struct flow_filter *)*arg;
+	if (f != NULL) {
+		/* Update: unspecified settings keep their current value. */
+		err = -EINVAL;
+		if (f->handle != handle && handle)
+			goto err2;
+
+		mode = f->mode;
+		if (tb[TCA_FLOW_MODE])
+			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+		if (mode != FLOW_MODE_HASH && nkeys > 1)
+			goto err2;
+
+		if (mode == FLOW_MODE_HASH)
+			perturb_period = f->perturb_period;
+		if (tb[TCA_FLOW_PERTURB]) {
+			if (mode != FLOW_MODE_HASH)
+				goto err2;
+			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
+		}
+	} else {
+		/* Create: a handle and a key set are mandatory. */
+		err = -EINVAL;
+		if (!handle)
+			goto err2;
+		if (!tb[TCA_FLOW_KEYS])
+			goto err2;
+
+		mode = FLOW_MODE_MAP;
+		if (tb[TCA_FLOW_MODE])
+			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
+		if (mode != FLOW_MODE_HASH && nkeys > 1)
+			goto err2;
+
+		if (tb[TCA_FLOW_PERTURB]) {
+			if (mode != FLOW_MODE_HASH)
+				goto err2;
+			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
+		}
+
+		/* Default the base class to <qdisc major>:1. */
+		if (TC_H_MAJ(baseclass) == 0)
+			baseclass = TC_H_MAKE(tp->q->handle, baseclass);
+		if (TC_H_MIN(baseclass) == 0)
+			baseclass = TC_H_MAKE(baseclass, 1);
+
+		err = -ENOBUFS;
+		f = kzalloc(sizeof(*f), GFP_KERNEL);
+		if (f == NULL)
+			goto err2;
+
+		f->handle = handle;
+		f->mask	  = ~0U;
+
+		get_random_bytes(&f->hashrnd, 4);
+		/* Initialise the timer first, then fill in its callback. */
+		init_timer_deferrable(&f->perturb_timer);
+		f->perturb_timer.function = flow_perturbation;
+		f->perturb_timer.data = (unsigned long)f;
+	}
+
+	/* Nothing can fail from here on: commit the validated state. */
+	tcf_exts_change(tp, &f->exts, &e);
+	tcf_em_tree_change(tp, &f->ematches, &t);
+
+	tcf_tree_lock(tp);
+
+	if (tb[TCA_FLOW_KEYS]) {
+		f->keymask = keymask;
+		f->nkeys   = nkeys;
+	}
+
+	f->mode = mode;
+
+	if (tb[TCA_FLOW_MASK])
+		f->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
+	if (tb[TCA_FLOW_XOR])
+		f->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
+	if (tb[TCA_FLOW_RSHIFT])
+		f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
+	if (tb[TCA_FLOW_ADDEND])
+		f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
+
+	if (tb[TCA_FLOW_DIVISOR])
+		f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
+	if (baseclass)
+		f->baseclass = baseclass;
+
+	f->perturb_period = perturb_period;
+	del_timer(&f->perturb_timer);
+	if (perturb_period)
+		mod_timer(&f->perturb_timer, jiffies + perturb_period);
+
+	if (*arg == 0)
+		list_add_tail(&f->list, &head->filters);
+
+	tcf_tree_unlock(tp);
+
+	*arg = (unsigned long)f;
+	return 0;
+
+err2:
+	tcf_em_tree_destroy(tp, &t);
+err1:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+
+/*
+ * Release one filter: stop its perturbation timer (del_timer_sync),
+ * destroy its extensions and ematch tree and free it.  The caller has
+ * already removed it from the filter list.
+ */
+static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f)
+{
+	del_timer_sync(&f->perturb_timer);
+	tcf_exts_destroy(tp, &f->exts);
+	tcf_em_tree_destroy(tp, &f->ematches);
+	kfree(f);
+}
+
+/*
+ * .delete operation: unlink the filter passed in @arg under the tree
+ * lock, then destroy it.  Always returns 0.
+ */
+static int flow_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct flow_filter *f = (struct flow_filter *)arg;
+
+	tcf_tree_lock(tp);
+	list_del(&f->list);
+	tcf_tree_unlock(tp);
+	flow_destroy_filter(tp, f);
+	return 0;
+}
+
+/*
+ * .init operation: allocate the head with an empty filter list and
+ * attach it to tp->root.  Returns 0, or -ENOBUFS if allocation fails.
+ */
+static int flow_init(struct tcf_proto *tp)
+{
+	struct flow_head *root = kzalloc(sizeof(*root), GFP_KERNEL);
+
+	if (!root)
+		return -ENOBUFS;
+
+	INIT_LIST_HEAD(&root->filters);
+	tp->root = root;
+
+	return 0;
+}
+
+/* .destroy operation: unlink and destroy every filter, then free the head. */
+static void flow_destroy(struct tcf_proto *tp)
+{
+	struct flow_head *head = tp->root;
+	struct flow_filter *f, *next;
+
+	/* _safe variant: the current entry is freed inside the loop */
+	list_for_each_entry_safe(f, next, &head->filters, list) {
+		list_del(&f->list);
+		flow_destroy_filter(tp, f);
+	}
+	kfree(head);
+}
+
+/*
+ * .get operation: look up a filter by handle.  Returns the first filter
+ * with a matching handle cast to unsigned long, or 0 when none matches.
+ */
+static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
+{
+	struct flow_head *root = tp->root;
+	struct flow_filter *cur;
+
+	list_for_each_entry(cur, &root->filters, list) {
+		if (cur->handle != handle)
+			continue;
+		return (unsigned long)cur;
+	}
+
+	return 0;
+}
+
+/* .put operation: nothing to release. */
+static void flow_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+/*
+ * Dump one flow filter into a netlink message.
+ *
+ * @fh is the filter cast to unsigned long; a zero handle dumps nothing.
+ * Keys and mode are always written.  Mask and xor are written together
+ * when either differs from its default, the remaining settings only when
+ * non-zero, and the perturbation period converted back to seconds.
+ *
+ * Returns skb->len on success and -1 on failure, after trimming the
+ * message back to the start of the TCA_OPTIONS nest.
+ */
+static int flow_dump(struct tcf_proto *tp, unsigned long fh,
+		     struct sk_buff *skb, struct tcmsg *t)
+{
+	struct flow_filter *f = (struct flow_filter *)fh;
+	struct nlattr *nest;
+
+	if (f == NULL)
+		return skb->len;
+
+	t->tcm_handle = f->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	/* NLA_PUT_U32 branches to nla_put_failure if the attribute does not fit */
+	NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask);
+	NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode);
+
+	if (f->mask != ~0 || f->xor != 0) {
+		NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask);
+		NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor);
+	}
+	if (f->rshift)
+		NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift);
+	if (f->addend)
+		NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend);
+
+	if (f->divisor)
+		NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor);
+	if (f->baseclass)
+		NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass);
+
+	if (f->perturb_period)
+		NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ);
+
+	if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
+		goto nla_put_failure;
+#ifdef CONFIG_NET_EMATCH
+	/* the ematch tree is only dumped when it contains matches */
+	if (f->ematches.hdr.nmatches &&
+	    tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
+		goto nla_put_failure;
+#endif
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, nest);
+	return -1;
+}
+
+/*
+ * Visit every filter attached to @tp on behalf of a tcf_walker.
+ *
+ * Each entry is counted in arg->count; the callback is only invoked once
+ * arg->skip entries have been passed over.  A negative callback result
+ * sets arg->stop and ends the walk without counting that entry.
+ */
+static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct flow_head *root = tp->root;
+	struct flow_filter *cur;
+
+	list_for_each_entry(cur, &root->filters, list) {
+		if (arg->count >= arg->skip &&
+		    arg->fn(tp, (unsigned long)cur, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+/* Operations table registered for the classifier kind "flow". */
+static struct tcf_proto_ops cls_flow_ops __read_mostly = {
+	.kind		= "flow",
+	.classify	= flow_classify,
+	.init		= flow_init,
+	.destroy	= flow_destroy,
+	.change		= flow_change,
+	.delete		= flow_delete,
+	.get		= flow_get,
+	.put		= flow_put,
+	.dump		= flow_dump,
+	.walk		= flow_walk,
+	.owner		= THIS_MODULE,
+};
+
+/* Module init: register cls_flow_ops; returns the registration result. */
+static int __init cls_flow_init(void)
+{
+	return register_tcf_proto_ops(&cls_flow_ops);
+}
+
+/* Module exit: unregister cls_flow_ops. */
+static void __exit cls_flow_exit(void)
+{
+	unregister_tcf_proto_ops(&cls_flow_ops);
+}
+
+module_init(cls_flow_init);
+module_exit(cls_flow_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("TC flow classifier");
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
new file mode 100644
index 00000000..26e7bc4f
--- /dev/null
+++ b/net/sched/cls_fw.c
@@ -0,0 +1,399 @@
+/*
+ * net/sched/cls_fw.c	Classifier mapping ipchains' fwmark to traffic class.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
+ * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
+ * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
+ *
+ * JHS: We should remove the CONFIG_NET_CLS_IND from here
+ * eventually when the meta match extension is made available
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+/* Number of hash buckets: as many filter pointers as fit in one page. */
+#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
+
+/* Per-tcf_proto state of the fw classifier, stored in tp->root. */
+struct fw_head {
+	struct fw_filter *ht[HTSIZE];	/* chains indexed by fw_hash(id) */
+	u32 mask;			/* ANDed with skb->mark before lookup */
+};
+
+/* One fw filter, chained in a bucket of fw_head.ht. */
+struct fw_filter {
+	struct fw_filter	*next;		/* next filter in the same bucket */
+	u32			id;		/* handle; compared with the masked mark */
+	struct tcf_result	res;		/* result copied out on a match */
+#ifdef CONFIG_NET_CLS_IND
+	char			indev[IFNAMSIZ];	/* checked with tcf_match_indev() */
+#endif /* CONFIG_NET_CLS_IND */
+	struct tcf_exts		exts;		/* extensions run on a match */
+};
+
+/* Netlink attribute types carrying this classifier's action and police extensions. */
+static const struct tcf_ext_map fw_ext_map = {
+	.action = TCA_FW_ACT,
+	.police = TCA_FW_POLICE
+};
+
+/*
+ * Map a 32-bit handle to a bucket index of fw_head.ht.
+ *
+ * HTSIZE is a compile-time constant, so only one branch applies.  For
+ * the listed sizes the handle is cut into pieces of log2(HTSIZE) bits
+ * that are XORed together; any other size uses handle & (HTSIZE - 1).
+ */
+static inline int fw_hash(u32 handle)
+{
+	if (HTSIZE == 4096)
+		return ((handle >> 24) & 0xFFF) ^
+		       ((handle >> 12) & 0xFFF) ^
+		       (handle & 0xFFF);
+	else if (HTSIZE == 2048)
+		return ((handle >> 22) & 0x7FF) ^
+		       ((handle >> 11) & 0x7FF) ^
+		       (handle & 0x7FF);
+	else if (HTSIZE == 1024)
+		return ((handle >> 20) & 0x3FF) ^
+		       ((handle >> 10) & 0x3FF) ^
+		       (handle & 0x3FF);
+	else if (HTSIZE == 512)
+		return (handle >> 27) ^
+		       ((handle >> 18) & 0x1FF) ^
+		       ((handle >> 9) & 0x1FF) ^
+		       (handle & 0x1FF);
+	else if (HTSIZE == 256) {
+		/* XOR of the four bytes of the handle */
+		u8 *t = (u8 *) &handle;
+		return t[0] ^ t[1] ^ t[2] ^ t[3];
+	} else
+		return handle & (HTSIZE - 1);
+}
+
+/*
+ * Classify @skb by its mark.
+ *
+ * With a head present, the mark is masked with head->mask and looked up
+ * in the hash table.  For each filter with that id, the result is copied
+ * to @res, the input device is checked (CONFIG_NET_CLS_IND) and the
+ * extensions are run; the first non-negative extension result is
+ * returned.
+ *
+ * Without a head ("old method") a non-zero mark is used directly as the
+ * class id, provided its major part is zero or equals the qdisc's.
+ *
+ * Returns -1 when nothing matches.
+ */
+static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			  struct tcf_result *res)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *f;
+	int r;
+	u32 id = skb->mark;
+
+	if (head != NULL) {
+		id &= head->mask;
+		for (f = head->ht[fw_hash(id)]; f; f = f->next) {
+			if (f->id == id) {
+				*res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+				if (!tcf_match_indev(skb, f->indev))
+					continue;
+#endif /* CONFIG_NET_CLS_IND */
+				r = tcf_exts_exec(skb, &f->exts, res);
+				if (r < 0)
+					continue;
+
+				return r;
+			}
+		}
+	} else {
+		/* old method */
+		if (id && (TC_H_MAJ(id) == 0 ||
+			   !(TC_H_MAJ(id ^ tp->q->handle)))) {
+			res->classid = id;
+			res->class = 0;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * .get operation: look up a filter by handle.  Returns the first filter
+ * in the handle's bucket whose id equals @handle, cast to unsigned long,
+ * or 0 when there is no head or no such filter.
+ */
+static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *cur;
+
+	if (head == NULL)
+		return 0;
+
+	cur = head->ht[fw_hash(handle)];
+	while (cur != NULL) {
+		if (cur->id == handle)
+			return (unsigned long)cur;
+		cur = cur->next;
+	}
+
+	return 0;
+}
+
+/* .put operation: nothing to release. */
+static void fw_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+/* .init operation: no setup here; the head is allocated by fw_change(). */
+static int fw_init(struct tcf_proto *tp)
+{
+	return 0;
+}
+
+/*
+ * Release one filter: unbind it from its class, destroy its extensions
+ * and free it.  The caller has already unlinked it from its bucket.
+ */
+static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
+{
+	tcf_unbind_filter(tp, &f->res);
+	tcf_exts_destroy(tp, &f->exts);
+	kfree(f);
+}
+
+/*
+ * .destroy operation: release every filter in every bucket, then the
+ * head.  Nothing is done when no head was ever allocated.
+ */
+static void fw_destroy(struct tcf_proto *tp)
+{
+	struct fw_head *head = tp->root;
+	struct fw_filter *f;
+	int h;
+
+	if (head == NULL)
+		return;
+
+	for (h = 0; h < HTSIZE; h++) {
+		/* pop filters off the front of the chain until it is empty */
+		while ((f = head->ht[h]) != NULL) {
+			head->ht[h] = f->next;
+			fw_delete_filter(tp, f);
+		}
+	}
+	kfree(head);
+}
+
+/*
+ * .delete operation: remove the filter passed in @arg.
+ *
+ * The filter is searched in the bucket of its id, unlinked under the
+ * tree lock and released.  Returns 0 on success, -EINVAL when there is
+ * no head, @arg is 0 or the filter is not found in its bucket.
+ */
+static int fw_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *f = (struct fw_filter *)arg;
+	struct fw_filter **fp;
+
+	if (head == NULL || f == NULL)
+		goto out;
+
+	/* fp walks the link pointers so the match can be unlinked in place */
+	for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
+		if (*fp == f) {
+			tcf_tree_lock(tp);
+			*fp = f->next;
+			tcf_tree_unlock(tp);
+			fw_delete_filter(tp, f);
+			return 0;
+		}
+	}
+out:
+	return -EINVAL;
+}
+
+/* Validation policy for the attributes nested inside TCA_OPTIONS. */
+static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
+	[TCA_FW_CLASSID]	= { .type = NLA_U32 },
+	[TCA_FW_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
+	[TCA_FW_MASK]		= { .type = NLA_U32 },
+};
+
+/*
+ * Apply the parsed attributes in @tb to filter @f.
+ *
+ * The extensions are validated into a temporary first.  The input device
+ * and the mask are checked next; a TCA_FW_MASK attribute must equal the
+ * head's mask, and without the attribute the head's mask must be
+ * 0xFFFFFFFF.  Only when every check has passed is the filter bound to
+ * its class and the extensions swapped in, so a failure never leaves a
+ * class binding behind on a filter the caller is about to free.
+ *
+ * Returns 0 on success, -EINVAL for a mask mismatch, or the error of
+ * tcf_exts_validate()/tcf_change_indev().
+ */
+static int
+fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
+	struct nlattr **tb, struct nlattr **tca, unsigned long base)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct tcf_exts e;
+	u32 mask;
+	int err;
+
+	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map);
+	if (err < 0)
+		return err;
+
+#ifdef CONFIG_NET_CLS_IND
+	if (tb[TCA_FW_INDEV]) {
+		err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]);
+		if (err < 0)
+			goto errout;
+	}
+#endif /* CONFIG_NET_CLS_IND */
+
+	/*
+	 * Must be set here, not earlier: a successful tcf_change_indev()
+	 * leaves err == 0, which would turn a mask mismatch into success.
+	 */
+	err = -EINVAL;
+	if (tb[TCA_FW_MASK]) {
+		mask = nla_get_u32(tb[TCA_FW_MASK]);
+		if (mask != head->mask)
+			goto errout;
+	} else if (head->mask != 0xFFFFFFFF)
+		goto errout;
+
+	/* All checks passed: commit the class binding and the extensions. */
+	if (tb[TCA_FW_CLASSID]) {
+		f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
+		tcf_bind_filter(tp, &f->res, base);
+	}
+
+	tcf_exts_change(tp, &f->exts, &e);
+
+	return 0;
+errout:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+
+/*
+ * Create a new fw filter or update an existing one.
+ *
+ * *@arg is the existing filter cast to unsigned long, or 0 to create
+ * one; after a successful creation it is set to the new filter.
+ *
+ * Without TCA_OPTIONS the call succeeds only for a zero @handle and
+ * changes nothing.  Creation requires a non-zero handle; the first
+ * creation also allocates the head, taking its mask from TCA_FW_MASK
+ * (default 0xFFFFFFFF).  The head stays attached to @tp even if the
+ * filter itself cannot be created afterwards.
+ *
+ * Returns 0 on success, -EINVAL for invalid handles, -ENOBUFS on
+ * allocation failure, or the error of parsing/fw_change_attrs().
+ */
+static int fw_change(struct tcf_proto *tp, unsigned long base,
+		     u32 handle,
+		     struct nlattr **tca,
+		     unsigned long *arg)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *f = (struct fw_filter *) *arg;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_FW_MAX + 1];
+	int err;
+
+	if (!opt)
+		return handle ? -EINVAL : 0;
+
+	err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy);
+	if (err < 0)
+		return err;
+
+	/* Update of an existing filter. */
+	if (f != NULL) {
+		if (f->id != handle && handle)
+			return -EINVAL;
+		return fw_change_attrs(tp, f, tb, tca, base);
+	}
+
+	if (!handle)
+		return -EINVAL;
+
+	if (head == NULL) {
+		u32 mask = 0xFFFFFFFF;
+		if (tb[TCA_FW_MASK])
+			mask = nla_get_u32(tb[TCA_FW_MASK]);
+
+		head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
+		if (head == NULL)
+			return -ENOBUFS;
+		head->mask = mask;
+
+		tcf_tree_lock(tp);
+		tp->root = head;
+		tcf_tree_unlock(tp);
+	}
+
+	f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+	if (f == NULL)
+		return -ENOBUFS;
+
+	f->id = handle;
+
+	err = fw_change_attrs(tp, f, tb, tca, base);
+	if (err < 0)
+		goto errout;
+
+	/* link the new filter at the front of its bucket */
+	f->next = head->ht[fw_hash(handle)];
+	tcf_tree_lock(tp);
+	head->ht[fw_hash(handle)] = f;
+	tcf_tree_unlock(tp);
+
+	*arg = (unsigned long)f;
+	return 0;
+
+errout:
+	kfree(f);
+	return err;
+}
+
+/*
+ * Visit every filter in every bucket on behalf of a tcf_walker.
+ *
+ * A missing head sets arg->stop; nothing is walked once arg->stop is
+ * set.  Each entry is counted in arg->count, and the callback is only
+ * invoked once arg->skip entries have been passed over.  A negative
+ * callback result sets arg->stop and ends the walk without counting
+ * that entry.
+ */
+static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *cur;
+	int bucket;
+
+	if (head == NULL) {
+		arg->stop = 1;
+		return;
+	}
+
+	if (arg->stop)
+		return;
+
+	for (bucket = 0; bucket < HTSIZE; bucket++) {
+		for (cur = head->ht[bucket]; cur; cur = cur->next) {
+			if (arg->count >= arg->skip &&
+			    arg->fn(tp, (unsigned long)cur, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/*
+ * Dump one fw filter into a netlink message.
+ *
+ * @fh is the filter cast to unsigned long; a zero handle dumps nothing.
+ * A filter without a class id and without extensions only reports its
+ * handle.  Otherwise the class id, input device and a non-default mask
+ * go into a TCA_OPTIONS nest with the extensions, and the extension
+ * statistics follow the nest.
+ *
+ * Returns skb->len on success; on failure the message is trimmed back to
+ * its length at entry and -1 is returned.
+ */
+static int fw_dump(struct tcf_proto *tp, unsigned long fh,
+		   struct sk_buff *skb, struct tcmsg *t)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *f = (struct fw_filter *)fh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
+	struct nlattr *nest;
+
+	if (f == NULL)
+		return skb->len;
+
+	t->tcm_handle = f->id;
+
+	if (!f->res.classid && !tcf_exts_is_available(&f->exts))
+		return skb->len;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	/* the NLA_PUT_* macros branch to nla_put_failure if the attribute does not fit */
+	if (f->res.classid)
+		NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid);
+#ifdef CONFIG_NET_CLS_IND
+	if (strlen(f->indev))
+		NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev);
+#endif /* CONFIG_NET_CLS_IND */
+	if (head->mask != 0xFFFFFFFF)
+		NLA_PUT_U32(skb, TCA_FW_MASK, head->mask);
+
+	if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* Operations table registered for the classifier kind "fw". */
+static struct tcf_proto_ops cls_fw_ops __read_mostly = {
+	.kind		=	"fw",
+	.classify	=	fw_classify,
+	.init		=	fw_init,
+	.destroy	=	fw_destroy,
+	.get		=	fw_get,
+	.put		=	fw_put,
+	.change		=	fw_change,
+	.delete		=	fw_delete,
+	.walk		=	fw_walk,
+	.dump		=	fw_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module init: register cls_fw_ops; returns the registration result. */
+static int __init init_fw(void)
+{
+	return register_tcf_proto_ops(&cls_fw_ops);
+}
+
+/* Module exit: unregister cls_fw_ops. */
+static void __exit exit_fw(void)
+{
+	unregister_tcf_proto_ops(&cls_fw_ops);
+}
+
+module_init(init_fw)
+module_exit(exit_fw)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
new file mode 100644
index 00000000..a9079053
--- /dev/null
+++ b/net/sched/cls_route.c
@@ -0,0 +1,627 @@
+/*
+ * net/sched/cls_route.c	ROUTE4 classifier.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+/*
+ * 1. For now we assume that route tags are < 256.
+ *    This allows direct table lookups instead of hash tables.
+ * 2. For now we assume that "from TAG" and "fromdev DEV" statements
+ *    are mutually exclusive.
+ * 3. "to TAG from ANY" has higher priority than "to ANY from XXX".
+ */
+
+struct route4_fastmap {	/* one slot of the lookup cache kept in route4_head */
+	struct route4_filter	*filter;	/* cached match, ROUTE4_FAILURE, or NULL when the slot is unused */
+	u32			id;	/* id half of the key the slot was filled for */
+	int			iif;	/* iif half of the key the slot was filled for */
+};
+
+struct route4_head {	/* per-classifier root, stored in tp->root */
+	struct route4_fastmap	fastmap[16];	/* slot chosen by route4_fastmap_hash() */
+	struct route4_bucket	*table[256 + 1];	/* slot chosen by to_hash(handle); NULL when empty */
+};
+
+struct route4_bucket {	/* second-level table; filters are filed by from_hash(handle >> 16) */
+	/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
+	struct route4_filter	*ht[16 + 16 + 1];
+};
+
+struct route4_filter {
+	struct route4_filter	*next;	/* next filter in the same bucket slot */
+	u32			id;	/* "to" tag, OR-ed with the "from" tag << 16 when one is given */
+	int			iif;	/* value compared against the route's rt_iif */
+
+	struct tcf_result	res;	/* result copied to the caller on a match */
+	struct tcf_exts		exts;	/* attached police/action extensions */
+	u32			handle;	/* filter handle; to_hash()/from_hash() derive both slots from it */
+	struct route4_bucket	*bkt;	/* bucket this filter is filed in */
+};
+
+#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))	/* fastmap marker: this key is known not to match */
+
+static const struct tcf_ext_map route_ext_map = {	/* attribute ids passed to the tcf_exts_*() helpers */
+	.police = TCA_ROUTE4_POLICE,
+	.action = TCA_ROUTE4_ACT
+};
+
+static inline int route4_fastmap_hash(u32 id, int iif)
+{
+	return id & 0xF;	/* cache slot = low 4 bits of id; iif does not take part */
+}
+
+static void
+route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
+{
+	spinlock_t *lock = qdisc_root_sleeping_lock(q);
+
+	spin_lock_bh(lock);	/* the whole cache is wiped; @id is not consulted */
+	memset(&head->fastmap[0], 0, sizeof(head->fastmap));
+	spin_unlock_bh(lock);
+}
+
+static void
+route4_set_fastmap(struct route4_head *head, u32 id, int iif,
+		   struct route4_filter *f)
+{
+	struct route4_fastmap *e = &head->fastmap[route4_fastmap_hash(id, iif)];
+
+	e->id = id;
+	e->iif = iif;
+	e->filter = f;
+}
+
+static inline int route4_hash_to(u32 id)
+{
+	return id & 0xFF;	/* first-level slot: low byte of the route tag */
+}
+
+static inline int route4_hash_from(u32 id)
+{
+	return (id >> 16) & 0xF;	/* slots 0..15: low 4 bits of the upper half of id */
+}
+
+static inline int route4_hash_iif(int iif)
+{
+	return 16 + ((iif >> 16) & 0xF);	/* slots 16..31; NOTE(review): from_hash() files IIF filters under 16 + (iif & 0xF) - confirm the two agree */
+}
+
+static inline int route4_hash_wild(void)
+{
+	return 32;	/* the single wildcard slot, after the 16 FROM and 16 IIF slots */
+}
+
+#define ROUTE4_APPLY_RESULT()	/* loop body: needs f, res, skb, head, id, iif, dont_cache in scope */	\
+{								\
+	*res = f->res;						\
+	if (tcf_exts_is_available(&f->exts)) {			\
+		int r = tcf_exts_exec(skb, &f->exts, res);	\
+		if (r < 0) {					\
+			dont_cache = 1;				\
+			continue;				\
+		}						\
+		return r;					\
+	} else if (!dont_cache)					\
+		route4_set_fastmap(head, id, iif, f);		\
+	return 0;						\
+}
+
+static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			   struct tcf_result *res)
+{	/* Classify skb by its route's tclassid/rt_iif: 0 or an exts verdict on a match, -1 otherwise. */
+	struct route4_head *head = (struct route4_head *)tp->root;
+	struct dst_entry *dst;
+	struct route4_bucket *b;
+	struct route4_filter *f;
+	u32 id, h;
+	int iif, dont_cache = 0;	/* dont_cache is set by ROUTE4_APPLY_RESULT() when tcf_exts_exec() returns < 0 */
+
+	dst = skb_dst(skb);
+	if (!dst)
+		goto failure;
+
+	id = dst->tclassid;
+	if (head == NULL)	/* tp->root has not been allocated yet */
+		goto old_method;
+
+	iif = ((struct rtable *)dst)->rt_iif;
+
+	h = route4_fastmap_hash(id, iif);	/* fast path: consult the lookup cache first */
+	if (id == head->fastmap[h].id &&
+	    iif == head->fastmap[h].iif &&
+	    (f = head->fastmap[h].filter) != NULL) {
+		if (f == ROUTE4_FAILURE)	/* cached negative result */
+			goto failure;
+
+		*res = f->res;
+		return 0;
+	}
+
+	h = route4_hash_to(id);
+
+restart:
+	b = head->table[h];
+	if (b) {
+		for (f = b->ht[route4_hash_from(id)]; f; f = f->next)	/* 1st: filters matching on id */
+			if (f->id == id)
+				ROUTE4_APPLY_RESULT();
+
+		for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next)	/* 2nd: filters matching on iif */
+			if (f->iif == iif)
+				ROUTE4_APPLY_RESULT();
+
+		for (f = b->ht[route4_hash_wild()]; f; f = f->next)	/* 3rd: wildcard filters */
+			ROUTE4_APPLY_RESULT();
+
+	}
+	if (h < 256) {	/* second pass: slot 256 with the low 16 bits of id cleared */
+		h = 256;
+		id &= ~0xFFFF;
+		goto restart;
+	}
+
+	if (!dont_cache)	/* remember the miss (keyed by the already masked id) */
+		route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
+failure:
+	return -1;
+
+old_method:
+	if (id && (TC_H_MAJ(id) == 0 ||	/* no head: the tag itself is used as classid when this test passes */
+		   !(TC_H_MAJ(id^tp->q->handle)))) {
+		res->classid = id;
+		res->class = 0;
+		return 0;
+	}
+	return -1;
+}
+
+static inline u32 to_hash(u32 id)
+{
+	/*
+	 * Low byte selects the slot; handles with bit 15 set are moved up
+	 * by 256.
+	 */
+	return (id & 0xFF) + ((id & 0x8000) ? 256 : 0);
+}
+
+static inline u32 from_hash(u32 id)
+{
+	const u32 key = id & 0xFFFF;
+
+	if (key == 0xFFFF)		/* wildcard source */
+		return 32;
+	if (key & 0x8000)		/* bit 15 set: slots 16..31 */
+		return 16 + (key & 0xF);
+	if (key > 255)			/* out of range: not a valid slot */
+		return 256;
+	return key & 0xF;		/* slots 0..15 */
+}
+
+static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
+{
+	struct route4_head *root = (struct route4_head *)tp->root;
+	struct route4_bucket *bucket;
+	struct route4_filter *cur;
+	unsigned int to_slot, from_slot;
+
+	/* Map a handle to its filter; 0 means "no such filter". */
+	if (root == NULL)
+		return 0;
+
+	/* Reject handles whose hashes fall outside either table. */
+	to_slot = to_hash(handle);
+	from_slot = from_hash(handle >> 16);
+	if (to_slot > 256 || from_slot > 32)
+		return 0;
+
+	bucket = root->table[to_slot];
+	if (bucket == NULL)
+		return 0;
+
+	for (cur = bucket->ht[from_slot]; cur != NULL; cur = cur->next)
+		if (cur->handle == handle)
+			return (unsigned long)cur;
+	return 0;
+}
+
+static void route4_put(struct tcf_proto *tp, unsigned long f)
+{	/* no-op: the body is empty */
+}
+
+static int route4_init(struct tcf_proto *tp)
+{
+	return 0;	/* nothing is allocated here; route4_change() creates tp->root on first use */
+}
+
+static void
+route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
+{
+	tcf_unbind_filter(tp, &f->res);	/* drop the binding recorded in f->res */
+	tcf_exts_destroy(tp, &f->exts);	/* drop the attached extensions */
+	kfree(f);	/* the callers in this file unlink f before calling */
+}
+
+static void route4_destroy(struct tcf_proto *tp)
+{
+	struct route4_head *root = tp->root;
+	int to_slot, from_slot;
+
+	/* Tear down the whole classifier: filters, buckets, then the head. */
+	if (!root)
+		return;
+
+	for (to_slot = 0; to_slot <= 256; to_slot++) {
+		struct route4_bucket *bucket = root->table[to_slot];
+
+		if (!bucket)
+			continue;
+		for (from_slot = 0; from_slot <= 32; from_slot++) {
+			struct route4_filter *victim;
+
+			while ((victim = bucket->ht[from_slot]) != NULL) {
+				bucket->ht[from_slot] = victim->next;
+				route4_delete_filter(tp, victim);
+			}
+		}
+		kfree(bucket);
+	}
+	kfree(root);
+}
+
+static int route4_delete(struct tcf_proto *tp, unsigned long arg)
+{	/* Unlink and free filter @arg; also frees its bucket once the bucket is empty. */
+	struct route4_head *head = (struct route4_head *)tp->root;
+	struct route4_filter **fp, *f = (struct route4_filter *)arg;
+	unsigned int h = 0;
+	struct route4_bucket *b;
+	int i;
+
+	if (!head || !f)
+		return -EINVAL;
+
+	h = f->handle;	/* saved now: f is freed before h is last used */
+	b = f->bkt;
+
+	for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
+		if (*fp == f) {
+			tcf_tree_lock(tp);
+			*fp = f->next;
+			tcf_tree_unlock(tp);
+
+			route4_reset_fastmap(tp->q, head, f->id);	/* the cache may still point at f */
+			route4_delete_filter(tp, f);
+
+			/* Strip tree */
+
+			for (i = 0; i <= 32; i++)
+				if (b->ht[i])	/* bucket still holds a filter: keep it */
+					return 0;
+
+			/* OK, session has no flows */
+			tcf_tree_lock(tp);
+			head->table[to_hash(h)] = NULL;
+			tcf_tree_unlock(tp);
+
+			kfree(b);
+			return 0;
+		}
+	}
+	return 0;	/* f was not found in its slot; still reported as success */
+}
+
+static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {	/* passed to nla_parse_nested() in route4_change() */
+	[TCA_ROUTE4_CLASSID]	= { .type = NLA_U32 },
+	[TCA_ROUTE4_TO]		= { .type = NLA_U32 },
+	[TCA_ROUTE4_FROM]	= { .type = NLA_U32 },
+	[TCA_ROUTE4_IIF]	= { .type = NLA_U32 },
+};
+
+static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
+	struct route4_filter *f, u32 handle, struct route4_head *head,
+	struct nlattr **tb, struct nlattr *est, int new)
+{	/* Validate tb[], derive the handle, find or create the bucket, then fill in f. */
+	int err;
+	u32 id = 0, to = 0, nhandle = 0x8000;	/* bit 15 stays set unless a TO tag is given */
+	struct route4_filter *fp;
+	unsigned int h1;
+	struct route4_bucket *b;
+	struct tcf_exts e;
+
+	err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (tb[TCA_ROUTE4_TO]) {
+		if (new && handle & 0x8000)
+			goto errout;
+		to = nla_get_u32(tb[TCA_ROUTE4_TO]);
+		if (to > 0xFF)
+			goto errout;
+		nhandle = to;
+	}
+
+	if (tb[TCA_ROUTE4_FROM]) {
+		if (tb[TCA_ROUTE4_IIF])	/* FROM and IIF cannot be combined */
+			goto errout;
+		id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
+		if (id > 0xFF)
+			goto errout;
+		nhandle |= id << 16;
+	} else if (tb[TCA_ROUTE4_IIF]) {
+		id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
+		if (id > 0x7FFF)
+			goto errout;
+		nhandle |= (id | 0x8000) << 16;	/* bit 31 marks an IIF handle */
+	} else
+		nhandle |= 0xFFFF << 16;	/* neither given: wildcard source */
+
+	if (handle && new) {
+		nhandle |= handle & 0x7F00;	/* bits 8..14 are taken from the requested handle */
+		if (nhandle != handle)	/* everything else must agree with the attributes */
+			goto errout;
+	}
+
+	h1 = to_hash(nhandle);
+	b = head->table[h1];
+	if (!b) {
+		err = -ENOBUFS;
+		b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
+		if (b == NULL)
+			goto errout;
+
+		tcf_tree_lock(tp);
+		head->table[h1] = b;
+		tcf_tree_unlock(tp);
+	} else {
+		unsigned int h2 = from_hash(nhandle >> 16);
+
+		err = -EEXIST;
+		for (fp = b->ht[h2]; fp; fp = fp->next)
+			if (fp->handle == f->handle)	/* compares f's current handle, not nhandle */
+				goto errout;
+	}
+
+	tcf_tree_lock(tp);
+	if (tb[TCA_ROUTE4_TO])
+		f->id = to;
+
+	if (tb[TCA_ROUTE4_FROM])
+		f->id = to | id<<16;
+	else if (tb[TCA_ROUTE4_IIF])
+		f->iif = id;
+
+	f->handle = nhandle;
+	f->bkt = b;
+	tcf_tree_unlock(tp);
+
+	if (tb[TCA_ROUTE4_CLASSID]) {
+		f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
+		tcf_bind_filter(tp, &f->res, base);
+	}
+
+	tcf_exts_change(tp, &f->exts, &e);	/* e is consumed here; it is only destroyed on the error path */
+
+	return 0;
+errout:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+
+static int route4_change(struct tcf_proto *tp, unsigned long base,
+		       u32 handle,
+		       struct nlattr **tca,
+		       unsigned long *arg)
+{	/* Create a new filter, or re-parameterise the one in *arg, and (re)link it into its bucket. */
+	struct route4_head *head = tp->root;
+	struct route4_filter *f, *f1, **fp;
+	struct route4_bucket *b;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_ROUTE4_MAX + 1];
+	unsigned int h, th;
+	u32 old_handle = 0;
+	int err;
+
+	if (opt == NULL)	/* no options: acceptable only when no handle was requested */
+		return handle ? -EINVAL : 0;
+
+	err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy);
+	if (err < 0)
+		return err;
+
+	f = (struct route4_filter *)*arg;
+	if (f) {	/* existing filter: update in place */
+		if (f->handle != handle && handle)
+			return -EINVAL;
+
+		if (f->bkt)
+			old_handle = f->handle;	/* remembered so the old link can be removed below */
+
+		err = route4_set_parms(tp, base, f, handle, head, tb,
+			tca[TCA_RATE], 0);
+		if (err < 0)
+			return err;
+
+		goto reinsert;
+	}
+
+	err = -ENOBUFS;
+	if (head == NULL) {	/* first filter: allocate the root lazily */
+		head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
+		if (head == NULL)
+			goto errout;
+
+		tcf_tree_lock(tp);
+		tp->root = head;
+		tcf_tree_unlock(tp);
+	}
+
+	f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
+	if (f == NULL)
+		goto errout;
+
+	err = route4_set_parms(tp, base, f, handle, head, tb,
+		tca[TCA_RATE], 1);
+	if (err < 0)
+		goto errout;
+
+reinsert:
+	h = from_hash(f->handle >> 16);
+	for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)	/* chains are kept sorted by ascending handle */
+		if (f->handle < f1->handle)
+			break;
+
+	f->next = f1;
+	tcf_tree_lock(tp);
+	*fp = f;
+
+	if (old_handle && f->handle != old_handle) {	/* handle changed: unlink f from its old chain */
+		th = to_hash(old_handle);
+		h = from_hash(old_handle >> 16);
+		b = head->table[th];
+		if (b) {
+			for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
+				if (*fp == f) {
+					*fp = f->next;
+					break;
+				}
+			}
+		}
+	}
+	tcf_tree_unlock(tp);
+
+	route4_reset_fastmap(tp->q, head, f->id);	/* cached lookups may no longer be valid */
+	*arg = (unsigned long)f;
+	return 0;
+
+errout:
+	kfree(f);	/* f is NULL or a filter allocated above that was never linked */
+	return err;
+}
+
+static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct route4_head *root = tp->root;
+	unsigned int to_slot, from_slot;
+
+	/* Visit every filter, honouring arg->skip. */
+	if (!root)
+		arg->stop = 1;
+	if (arg->stop)
+		return;
+
+	for (to_slot = 0; to_slot <= 256; to_slot++) {
+		struct route4_bucket *bucket = root->table[to_slot];
+
+		if (!bucket)
+			continue;
+
+		for (from_slot = 0; from_slot <= 32; from_slot++) {
+			struct route4_filter *cur = bucket->ht[from_slot];
+
+			for (; cur; cur = cur->next) {
+				/* Skipped entries still count. */
+				if (arg->count >= arg->skip &&
+				    arg->fn(tp, (unsigned long)cur, arg) < 0) {
+					/* Callback asked us to stop. */
+					arg->stop = 1;
+					return;
+				}
+				arg->count++;
+			}
+		}
+	}
+}
+
+static int route4_dump(struct tcf_proto *tp, unsigned long fh,
+		       struct sk_buff *skb, struct tcmsg *t)
+{	/* Emit filter @fh into skb; returns skb->len, or -1 after trimming skb back on failure. */
+	struct route4_filter *f = (struct route4_filter *)fh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point for nlmsg_trim() */
+	struct nlattr *nest;
+	u32 id;
+
+	if (f == NULL)
+		return skb->len;
+
+	t->tcm_handle = f->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (!(f->handle & 0x8000)) {	/* bit 15 clear: the filter has a TO tag */
+		id = f->id & 0xFF;
+		NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
+	}
+	if (f->handle & 0x80000000) {	/* bit 31 set: IIF filter, or wildcard when the upper half is 0xFFFF */
+		if ((f->handle >> 16) != 0xFFFF)
+			NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
+	} else {
+		id = f->id >> 16;
+		NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
+	}
+	if (f->res.classid)
+		NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid);
+
+	if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tcf_proto_ops cls_route4_ops __read_mostly = {	/* callback table registered by init_route4() */
+	.kind		=	"route",
+	.classify	=	route4_classify,
+	.init		=	route4_init,
+	.destroy	=	route4_destroy,
+	.get		=	route4_get,
+	.put		=	route4_put,
+	.change		=	route4_change,
+	.delete		=	route4_delete,
+	.walk		=	route4_walk,
+	.dump		=	route4_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init init_route4(void)
+{
+	return register_tcf_proto_ops(&cls_route4_ops);	/* registration status is returned unchanged */
+}
+
+static void __exit exit_route4(void)
+{
+	unregister_tcf_proto_ops(&cls_route4_ops);	/* undoes the registration made in init_route4() */
+}
+
+module_init(init_route4)	/* init_route4() is the module's init hook */
+module_exit(exit_route4)	/* exit_route4() is the module's exit hook */
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
new file mode 100644
index 00000000..cbb5e0d6
--- /dev/null
+++ b/net/sched/cls_rsvp.c
@@ -0,0 +1,28 @@
+/*
+ * net/sched/cls_rsvp.c	Special RSVP packet classifier for IPv4.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+#define RSVP_DST_LEN	1	/* address size in 32-bit words; any value but 4 selects the iphdr paths in cls_rsvp.h */
+#define RSVP_ID		"rsvp"	/* becomes .kind of the ops table in cls_rsvp.h */
+#define RSVP_OPS	cls_rsvp_ops	/* name given to the ops table in cls_rsvp.h */
+
+#include "cls_rsvp.h"
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
new file mode 100644
index 00000000..402c44b2
--- /dev/null
+++ b/net/sched/cls_rsvp.h
@@ -0,0 +1,671 @@
+/*
+ * net/sched/cls_rsvp.h	Template file for RSVPv[46] classifiers.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+/*
+   Compared to the general packet classification problem,
+   RSVP needs only several relatively simple rules:
+
+   * (dst, protocol) are always specified,
+     so that we are able to hash them.
+   * src may be exact, or may be wildcard, so that
+     we can keep a hash table plus one wildcard entry.
+   * source port (or flow label) is important only if src is given.
+
+   IMPLEMENTATION.
+
+   We use a two level hash table: The top level is keyed by
+   destination address and protocol ID, every bucket contains a list
+   of "rsvp sessions", identified by destination address, protocol and
+   DPI(="Destination Port ID"): triple (key, mask, offset).
+
+   Every bucket has a smaller hash table keyed by source address
+   (cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
+   Every bucket is again a list of "RSVP flows", selected by
+   source address and SPI(="Source Port ID" here rather than
+   "security parameter index"): triple (key, mask, offset).
+
+
+   NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
+   and all fragmented packets go to the best-effort traffic class.
+
+
+   NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
+   only one "Generalized Port Identifier". So that for classic
+   ah, esp (and udp,tcp) both *pi should coincide or one of them
+   should be wildcard.
+
+   At first sight, this redundancy is just a waste of CPU
+   resources. But DPI and SPI add the possibility to assign different
+   priorities to GPIs. Look also at note 4 about tunnels below.
+
+
+   NOTE 3. One complication is the case of tunneled packets.
+   We implement it as follows: if the first lookup
+   matches a special session with "tunnelhdr" value not zero,
+   flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
+   In this case, we pull tunnelhdr bytes and restart lookup
+   with tunnel ID added to the list of keys. Simple and stupid 8)8)
+   It's enough for PIMREG and IPIP.
+
+
+   NOTE 4. Two GPIs make it possible to parse even GRE packets.
+   F.e. DPI can select ETH_P_IP (and necessary flags to make
+   tunnelhdr correct) in GRE protocol field and SPI matches
+   GRE key. Is it not nice? 8)8)
+
+
+   Well, as result, despite its simplicity, we get a pretty
+   powerful classification engine.  */
+
+
+struct rsvp_head {	/* per-classifier root, allocated by rsvp_init() */
+	u32			tmap[256/32];	/* bitmap of tunnel ids marked by tunnel_bts() */
+	u32			hgenerator;	/* counter behind gen_handle(); stepped by 0x10000 */
+	u8			tgenerator;	/* tunnel id most recently tried by gen_tunnel() */
+	struct rsvp_session	*ht[256];	/* session chains, indexed by hash_dst() */
+};
+
+struct rsvp_session {	/* one session: (dst, protocol, dpi, tunnelid) */
+	struct rsvp_session	*next;	/* next session in the same rsvp_head.ht chain */
+	__be32			dst[RSVP_DST_LEN];
+	struct tc_rsvp_gpi 	dpi;	/* key/mask/offset tested against the transport header */
+	u8			protocol;
+	u8			tunnelid;
+	/* 16 (src,sport) hash slots, and one wildcard source slot */
+	struct rsvp_filter	*ht[16 + 1];
+};
+
+
+struct rsvp_filter {	/* one flow inside a session */
+	struct rsvp_filter	*next;	/* next flow in the same session slot */
+	__be32			src[RSVP_DST_LEN];
+	struct tc_rsvp_gpi	spi;	/* key/mask/offset tested against the transport header */
+	u8			tunnelhdr;	/* non-zero: tunnel filter, rsvp_classify() restarts past this many bytes */
+
+	struct tcf_result	res;	/* for tunnel filters res.classid carries the tunnel id */
+	struct tcf_exts		exts;
+
+	u32			handle;	/* low byte: session slot, next byte: flow slot */
+	struct rsvp_session	*sess;	/* owning session */
+};
+
+static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
+{
+	/* Fold the last address word to a byte, then mix in proto/tunnel. */
+	unsigned int word = (__force __u32)dst[RSVP_DST_LEN - 1];
+
+	word ^= word >> 16;
+	return (word ^ (word >> 8) ^ protocol ^ tunnelid) & 0xFF;
+}
+
+static inline unsigned int hash_src(__be32 *src)
+{
+	/* XOR-fold the last address word down to a 4-bit slot number. */
+	unsigned int word = (__force __u32)src[RSVP_DST_LEN - 1];
+
+	word ^= word >> 16;
+	word ^= word >> 8;
+	return (word ^ (word >> 4)) & 0xF;
+}
+
+static struct tcf_ext_map rsvp_ext_map = {	/* attribute ids passed to the tcf_exts_*() helpers */
+	.police = TCA_RSVP_POLICE,
+	.action = TCA_RSVP_ACT
+};
+
+#define RSVP_APPLY_RESULT()	/* loop body: needs skb, f, res in scope; falls through when r == 0 */	\
+{							\
+	int r = tcf_exts_exec(skb, &f->exts, res);	\
+	if (r < 0)					\
+		continue;				\
+	else if (r > 0)					\
+		return r;				\
+}
+
+static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			 struct tcf_result *res)
+{	/* Find the session and flow matching skb; 0 or an exts verdict on a match, -1 otherwise. */
+	struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
+	struct rsvp_session *s;
+	struct rsvp_filter *f;
+	unsigned int h1, h2;
+	__be32 *dst, *src;
+	u8 protocol;
+	u8 tunnelid = 0;	/* becomes non-zero once a tunnel filter has matched */
+	u8 *xprt;
+#if RSVP_DST_LEN == 4
+	struct ipv6hdr *nhptr;
+
+	if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+		return -1;
+	nhptr = ipv6_hdr(skb);
+#else
+	struct iphdr *nhptr;
+
+	if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+		return -1;
+	nhptr = ip_hdr(skb);
+#endif
+
+restart:
+
+#if RSVP_DST_LEN == 4
+	src = &nhptr->saddr.s6_addr32[0];
+	dst = &nhptr->daddr.s6_addr32[0];
+	protocol = nhptr->nexthdr;
+	xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
+#else
+	src = &nhptr->saddr;
+	dst = &nhptr->daddr;
+	protocol = nhptr->protocol;
+	xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
+	if (nhptr->frag_off & htons(IP_MF | IP_OFFSET))	/* fragments are never classified */
+		return -1;
+#endif
+
+	h1 = hash_dst(dst, protocol, tunnelid);
+	h2 = hash_src(src);
+
+	for (s = sht[h1]; s; s = s->next) {
+		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
+		    protocol == s->protocol &&
+		    !(s->dpi.mask &
+		      (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&	/* NOTE(review): no bounds check on the offset is visible here - confirm */
+#if RSVP_DST_LEN == 4
+		    dst[0] == s->dst[0] &&
+		    dst[1] == s->dst[1] &&
+		    dst[2] == s->dst[2] &&
+#endif
+		    tunnelid == s->tunnelid) {
+
+			for (f = s->ht[h2]; f; f = f->next) {
+				if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
+				    !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
+#if RSVP_DST_LEN == 4
+				    &&
+				    src[0] == f->src[0] &&
+				    src[1] == f->src[1] &&
+				    src[2] == f->src[2]
+#endif
+				    ) {
+					*res = f->res;
+					RSVP_APPLY_RESULT();
+
+matched:
+					if (f->tunnelhdr == 0)	/* ordinary flow: done */
+						return 0;
+
+					tunnelid = f->res.classid;	/* tunnel flow: classid is the tunnel id */
+					nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));	/* restart so that xprt lands tunnelhdr bytes further on */
+					goto restart;
+				}
+			}
+
+			/* And wildcard bucket... */
+			for (f = s->ht[16]; f; f = f->next) {
+				*res = f->res;
+				RSVP_APPLY_RESULT();
+				goto matched;
+			}
+			return -1;	/* only the first matching session is consulted */
+		}
+	}
+	return -1;
+}
+
+static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
+{
+	struct rsvp_head *root = (struct rsvp_head *)tp->root;
+	unsigned int sess_slot = handle & 0xFF;
+	unsigned int flow_slot = (handle >> 8) & 0xFF;
+	struct rsvp_session *sess;
+	struct rsvp_filter *flow;
+
+	/* Byte 0 of a handle picks the session chain, byte 1 the flow slot. */
+	if (flow_slot > 16)
+		return 0;
+
+	for (sess = root->ht[sess_slot]; sess != NULL; sess = sess->next)
+		for (flow = sess->ht[flow_slot]; flow != NULL; flow = flow->next)
+			if (flow->handle == handle)
+				return (unsigned long)flow;
+
+	return 0;
+}
+
+static void rsvp_put(struct tcf_proto *tp, unsigned long f)
+{	/* no-op: the body is empty */
+}
+
+static int rsvp_init(struct tcf_proto *tp)
+{
+	struct rsvp_head *root = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
+
+	/* tp->root is left untouched when the allocation fails. */
+	if (!root)
+		return -ENOBUFS;
+
+	tp->root = root;
+	return 0;
+}
+
+static void
+rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
+{
+	tcf_unbind_filter(tp, &f->res);	/* drop the binding recorded in f->res */
+	tcf_exts_destroy(tp, &f->exts);	/* drop the attached extensions */
+	kfree(f);	/* the callers in this file unlink f before calling */
+}
+
+static void rsvp_destroy(struct tcf_proto *tp)
+{
+	/* Detach the root from tp first, then free everything beneath it. */
+	struct rsvp_head *root = xchg(&tp->root, NULL);
+	int sess_slot, flow_slot;
+
+	if (!root)
+		return;
+
+	for (sess_slot = 0; sess_slot < 256; sess_slot++) {
+		struct rsvp_session *sess;
+
+		/* Unlink each session from its chain before emptying it. */
+		while ((sess = root->ht[sess_slot]) != NULL) {
+			root->ht[sess_slot] = sess->next;
+
+			for (flow_slot = 0; flow_slot <= 16; flow_slot++) {
+				struct rsvp_filter *flow;
+
+				while ((flow = sess->ht[flow_slot]) != NULL) {
+					sess->ht[flow_slot] = flow->next;
+					rsvp_delete_filter(tp, flow);
+				}
+			}
+			kfree(sess);
+		}
+	}
+	/* All sessions are gone; release the table itself. */
+	kfree(root);
+}
+
+static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
+{	/* Unlink and free filter @arg; also frees its session once the session has no filters left. */
+	struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg;
+	unsigned int h = f->handle;	/* saved now: f is freed before h is last used */
+	struct rsvp_session **sp;
+	struct rsvp_session *s = f->sess;
+	int i;
+
+	for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {
+		if (*fp == f) {
+			tcf_tree_lock(tp);
+			*fp = f->next;
+			tcf_tree_unlock(tp);
+			rsvp_delete_filter(tp, f);
+
+			/* Strip tree */
+
+			for (i = 0; i <= 16; i++)
+				if (s->ht[i])	/* session still holds a filter: keep it */
+					return 0;
+
+			/* OK, session has no flows */
+			for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];
+			     *sp; sp = &(*sp)->next) {
+				if (*sp == s) {
+					tcf_tree_lock(tp);
+					*sp = s->next;
+					tcf_tree_unlock(tp);
+
+					kfree(s);
+					return 0;
+				}
+			}
+
+			return 0;
+		}
+	}
+	return 0;	/* f was not found in its slot; still reported as success */
+}
+
+static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
+{
+	struct rsvp_head *root = tp->root;
+	u32 candidate;
+	int tries;
+
+	for (tries = 0xFFFF; tries > 0; tries--) {
+		root->hgenerator += 0x10000;
+		if (root->hgenerator == 0)	/* wrapped: skip 0 */
+			root->hgenerator = 0x10000;
+		candidate = root->hgenerator | salt;
+		if (!rsvp_get(tp, candidate))	/* not in use yet */
+			return candidate;
+	}
+	return 0;	/* every candidate tried was taken */
+}
+
+static int tunnel_bts(struct rsvp_head *data)
+{	/* Test-and-set the tmap bit of data->tgenerator: 1 if it was free, 0 if already taken. */
+	int n = data->tgenerator >> 5;	/* word index within tmap[] */
+	u32 b = 1U << (data->tgenerator & 0x1F);	/* unsigned: shifting a signed 1 by 31 is undefined */
+
+	if (data->tmap[n] & b)
+		return 0;
+	data->tmap[n] |= b;
+	return 1;
+}
+
+static void tunnel_recycle(struct rsvp_head *data)
+{
+	struct rsvp_session **sht = data->ht;
+	int h1, h2;
+
+	/*
+	 * Rebuild data->tmap from scratch so that only ids still owned by a
+	 * tunnel filter stay marked; tunnel_bts() sets bits in data->tmap.
+	 */
+	memset(data->tmap, 0, sizeof(data->tmap));
+	for (h1 = 0; h1 < 256; h1++) {
+		struct rsvp_session *s;
+		for (s = sht[h1]; s; s = s->next) {
+			for (h2 = 0; h2 <= 16; h2++) {
+				struct rsvp_filter *f;
+
+				for (f = s->ht[h2]; f; f = f->next) {
+					if (f->tunnelhdr == 0)
+						continue;
+					data->tgenerator = f->res.classid;
+					tunnel_bts(data);
+				}
+			}
+		}
+	}
+}
+
+static u32 gen_tunnel(struct rsvp_head *data)
+{	/* Return an unused tunnel id in 1..255, or 0 when none could be found. */
+	int i, k;
+
+	for (k = 0; k < 2; k++) {	/* second pass runs after tunnel_recycle() */
+		for (i = 255; i > 0; i--) {
+			if (++data->tgenerator == 0)	/* u8 wrapped: id 0 is skipped */
+				data->tgenerator = 1;
+			if (tunnel_bts(data))
+				return data->tgenerator;
+		}
+		tunnel_recycle(data);
+	}
+	return 0;
+}
+
+static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
+	[TCA_RSVP_CLASSID]	= { .type = NLA_U32 },
+	/* No NLA_BINARY here: for that type .len is only an upper bound, but */
+	/* rsvp_change() copies a full address, so .len must be a minimum.    */
+	[TCA_RSVP_DST]		= { .len = RSVP_DST_LEN * sizeof(u32) },
+	[TCA_RSVP_SRC]		= { .len = RSVP_DST_LEN * sizeof(u32) },
+	[TCA_RSVP_PINFO]	= { .len = sizeof(struct tc_rsvp_pinfo) },
+};
+
+static int rsvp_change(struct tcf_proto *tp, unsigned long base,
+		       u32 handle,
+		       struct nlattr **tca,
+		       unsigned long *arg)
+{	/* Create a filter, or update classid/extensions of the existing one in *arg. */
+	struct rsvp_head *data = tp->root;
+	struct rsvp_filter *f, **fp;
+	struct rsvp_session *s, **sp;
+	struct tc_rsvp_pinfo *pinfo = NULL;
+	struct nlattr *opt = tca[TCA_OPTIONS];	/* attribute arrays are indexed by type, no -1 offset */
+	struct nlattr *tb[TCA_RSVP_MAX + 1];
+	struct tcf_exts e;
+	unsigned int h1, h2;
+	__be32 *dst;
+	int err;
+
+	if (opt == NULL)
+		return handle ? -EINVAL : 0;
+
+	err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy);
+	if (err < 0)
+		return err;
+
+	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map);
+	if (err < 0)
+		return err;
+
+	f = (struct rsvp_filter *)*arg;
+	if (f) {
+		/* Node exists: adjust only classid */
+		err = -EINVAL;
+		if (f->handle != handle && handle)
+			goto errout2;
+		if (tb[TCA_RSVP_CLASSID]) {
+			f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
+			tcf_bind_filter(tp, &f->res, base);
+		}
+
+		tcf_exts_change(tp, &f->exts, &e);
+		return 0;
+	}
+
+	/* Now more serious part... */
+	err = -EINVAL;
+	if (handle)	/* handles of new filters are always generated here */
+		goto errout2;
+	if (tb[TCA_RSVP_DST] == NULL)
+		goto errout2;
+
+	err = -ENOBUFS;
+	f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
+	if (f == NULL)
+		goto errout2;
+
+	h2 = 16;	/* wildcard source slot unless a source address is given */
+	if (tb[TCA_RSVP_SRC]) {
+		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
+		h2 = hash_src(f->src);
+	}
+	if (tb[TCA_RSVP_PINFO]) {
+		pinfo = nla_data(tb[TCA_RSVP_PINFO]);
+		f->spi = pinfo->spi;
+		f->tunnelhdr = pinfo->tunnelhdr;
+	}
+	if (tb[TCA_RSVP_CLASSID])
+		f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
+
+	dst = nla_data(tb[TCA_RSVP_DST]);
+	h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
+
+	err = -ENOMEM;
+	if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
+		goto errout;
+
+	if (f->tunnelhdr) {	/* tunnel filter: classid doubles as the tunnel id (1..255) */
+		err = -EINVAL;
+		if (f->res.classid > 255)
+			goto errout;
+
+		err = -ENOMEM;
+		if (f->res.classid == 0 &&
+		    (f->res.classid = gen_tunnel(data)) == 0)
+			goto errout;
+	}
+
+	for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {
+		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
+		    pinfo && pinfo->protocol == s->protocol &&
+		    memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
+#if RSVP_DST_LEN == 4
+		    dst[0] == s->dst[0] &&
+		    dst[1] == s->dst[1] &&
+		    dst[2] == s->dst[2] &&
+#endif
+		    pinfo->tunnelid == s->tunnelid) {
+
+insert:
+			/* OK, we found appropriate session */
+
+			fp = &s->ht[h2];
+
+			f->sess = s;
+			if (f->tunnelhdr == 0)
+				tcf_bind_filter(tp, &f->res, base);
+
+			tcf_exts_change(tp, &f->exts, &e);
+
+			for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
+				if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)
+					break;
+			f->next = *fp;
+			wmb();	/* f->next is written before f is linked into the chain */
+			*fp = f;
+
+			*arg = (unsigned long)f;
+			return 0;
+		}
+	}
+
+	/* No session found. Create new one. */
+
+	err = -ENOBUFS;
+	s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
+	if (s == NULL)
+		goto errout;
+	memcpy(s->dst, dst, sizeof(s->dst));
+
+	if (pinfo) {
+		s->dpi = pinfo->dpi;
+		s->protocol = pinfo->protocol;
+		s->tunnelid = pinfo->tunnelid;
+	}
+	for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) {
+		if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask)
+			break;
+	}
+	s->next = *sp;
+	wmb();	/* s->next is written before s is linked into the chain */
+	*sp = s;
+
+	goto insert;
+
+errout:
+	kfree(f);
+errout2:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+
+static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct rsvp_head *root = tp->root;
+	unsigned int sess_slot, flow_slot;
+	struct rsvp_session *sess;
+	struct rsvp_filter *flow;
+
+	/* Visit every filter of every session, honouring arg->skip. */
+	if (arg->stop)
+		return;
+
+	for (sess_slot = 0; sess_slot < 256; sess_slot++) {
+		for (sess = root->ht[sess_slot]; sess; sess = sess->next) {
+			for (flow_slot = 0; flow_slot <= 16; flow_slot++) {
+				flow = sess->ht[flow_slot];
+				for (; flow; flow = flow->next) {
+					/* Skipped entries still count. */
+					if (arg->count >= arg->skip &&
+					    arg->fn(tp, (unsigned long)flow,
+						    arg) < 0) {
+						/* Callback asked to stop. */
+						arg->stop = 1;
+						return;
+					}
+					arg->count++;
+				}
+			}
+		}
+	}
+}
+
+static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
+		     struct sk_buff *skb, struct tcmsg *t)
+{	/* Emit filter @fh and its session parameters into skb; -1 after trimming skb back on failure. */
+	struct rsvp_filter *f = (struct rsvp_filter *)fh;
+	struct rsvp_session *s;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point for nlmsg_trim() */
+	struct nlattr *nest;
+	struct tc_rsvp_pinfo pinfo;
+
+	if (f == NULL)
+		return skb->len;
+	s = f->sess;
+
+	t->tcm_handle = f->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst);
+	pinfo.dpi = s->dpi;	/* pinfo is assembled from both the session and the filter */
+	pinfo.spi = f->spi;
+	pinfo.protocol = s->protocol;
+	pinfo.tunnelid = s->tunnelid;
+	pinfo.tunnelhdr = f->tunnelhdr;
+	pinfo.pad = 0;
+	NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
+	if (f->res.classid)
+		NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid);
+	if (((f->handle >> 8) & 0xFF) != 16)	/* flow slot 16 is the wildcard source: no SRC to report */
+		NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
+
+	if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0)
+		goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tcf_proto_ops RSVP_OPS = {	/* callback table; its name and .kind come from the including .c file */
+	.next		=	NULL,
+	.kind		=	RSVP_ID,
+	.classify	=	rsvp_classify,
+	.init		=	rsvp_init,
+	.destroy	=	rsvp_destroy,
+	.get		=	rsvp_get,
+	.put		=	rsvp_put,
+	.change		=	rsvp_change,
+	.delete		=	rsvp_delete,
+	.walk		=	rsvp_walk,
+	.dump		=	rsvp_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init init_rsvp(void)
+{
+	return register_tcf_proto_ops(&RSVP_OPS);	/* registration status is returned unchanged */
+}
+
+static void __exit exit_rsvp(void)
+{
+	unregister_tcf_proto_ops(&RSVP_OPS);	/* undoes the registration made in init_rsvp() */
+}
+
+module_init(init_rsvp)	/* init_rsvp() is the module's init hook */
+module_exit(exit_rsvp)	/* exit_rsvp() is the module's exit hook */
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
new file mode 100644
index 00000000..dd08aea2
--- /dev/null
+++ b/net/sched/cls_rsvp6.c
@@ -0,0 +1,28 @@
+/*
+ * net/sched/cls_rsvp6.c	Special RSVP packet classifier for IPv6.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/ipv6.h>
+#include <linux/skbuff.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <net/netlink.h>
+
+#define RSVP_DST_LEN	4	/* address size in 32-bit words; 4 selects the ipv6hdr paths in cls_rsvp.h */
+#define RSVP_ID		"rsvp6"	/* becomes .kind of the ops table in cls_rsvp.h */
+#define RSVP_OPS	cls_rsvp6_ops	/* name given to the ops table in cls_rsvp.h */
+
+#include "cls_rsvp.h"
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
new file mode 100644
index 00000000..36667fa6
--- /dev/null
+++ b/net/sched/cls_tcindex.c
@@ -0,0 +1,507 @@
+/*
+ * net/sched/cls_tcindex.c	Packet classifier for skb->tc_index
+ *
+ * Written 1998,1999 by Werner Almesberger, EPFL ICA
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+
+/*
+ * Passing parameters to the root seems to be done more awkwardly than really
+ * necessary. At least, u32 doesn't seem to use such dirty hacks. To be
+ * verified. FIXME.
+ */
+
+#define PERFECT_HASH_THRESHOLD	64	/* use perfect hash if not bigger */
+#define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */
+
+/* Per-classifier private state is kept in tp->root. */
+#define	PRIV(tp)	((struct tcindex_data *) (tp)->root)
+
+/* What a matching key yields: the class result plus the attached extensions. */
+struct tcindex_filter_result {
+	struct tcf_exts		exts;	/* run through tcf_exts_exec() on a match */
+	struct tcf_result	res;	/* copied to the caller's result on a match */
+};
+
+struct tcindex_filter {
+	u16 key;			/* exact key this entry matches */
+	struct tcindex_filter_result result;
+	struct tcindex_filter *next;	/* next entry in the same hash bucket */
+};
+
+/* Per-classifier state (see PRIV()); at most one of perfect/h is ever allocated. */
+struct tcindex_data {
+	struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
+	struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
+				      NULL if unused */
+	u16 mask;		/* AND key with mask */
+	int shift;		/* shift ANDed key to the right */
+	int hash;		/* hash table size; 0 if undefined */
+	int alloc_hash;		/* allocated size */
+	int fall_through;	/* 0: only classify if explicit match */
+};
+
+static const struct tcf_ext_map tcindex_ext_map = {
+	.action = TCA_TCINDEX_ACT,	/* attribute ids of the two extension kinds */
+	.police = TCA_TCINDEX_POLICE
+};
+
+static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
+{
+	/* In use when it carries predicative extensions or a class id. */
+	return tcf_exts_is_predicative(&r->exts) ? 1 : r->res.classid != 0;
+}
+/* Find the result slot for @key; NULL when no filter is installed for it. */
+static struct tcindex_filter_result *
+tcindex_lookup(struct tcindex_data *p, u16 key)
+{
+	struct tcindex_filter *node;
+
+	if (p->perfect && !tcindex_filter_is_set(p->perfect + key))
+		return NULL;
+	if (p->perfect)
+		return p->perfect + key;
+	if (!p->h)
+		return NULL;
+	node = p->h[key % p->hash];
+	while (node && node->key != key)
+		node = node->next;
+	return node ? &node->result : NULL;
+}
+
+/* Map skb->tc_index to a class: explicit filter first, else fall through. */
+static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			    struct tcf_result *res)
+{
+	struct tcindex_data *data = PRIV(tp);
+	int key = (skb->tc_index & data->mask) >> data->shift;
+	struct tcindex_filter_result *match;
+
+	pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
+		 skb, tp, res, data);
+
+	match = tcindex_lookup(data, key);
+	if (match) {
+		*res = match->res;
+		pr_debug("map 0x%x\n", res->classid);
+		return tcf_exts_exec(skb, &match->exts, res);
+	}
+	/* No filter for this key: optionally derive the classid from the key. */
+	if (!data->fall_through)
+		return -1;
+	res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
+	res->class = 0;
+	pr_debug("alg 0x%x\n", res->classid);
+	return 0;
+}
+
+/* Translate a filter handle into an opaque reference (0 when not found). */
+static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
+{
+	struct tcindex_data *data = PRIV(tp);
+	struct tcindex_filter_result *slot = NULL;
+
+	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
+	/* A perfect hash has only alloc_hash slots; skip handles beyond it. */
+	if (!data->perfect || handle < data->alloc_hash)
+		slot = tcindex_lookup(data, handle);
+	return slot && tcindex_filter_is_set(slot) ? (unsigned long) slot : 0UL;
+}
+
+/* .put callback: does nothing except emit a debug trace. */
+static void tcindex_put(struct tcf_proto *tp, unsigned long f)
+{
+	pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f);
+}
+
+/* Allocate the per-classifier state with its defaults and hook it to tp->root. */
+static int tcindex_init(struct tcf_proto *tp)
+{
+	struct tcindex_data *data;
+
+	pr_debug("tcindex_init(tp %p)\n", tp);
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data == NULL)
+		return -ENOMEM;
+
+	/* Everything else stays zeroed: no hash tables yet, shift 0. */
+	data->fall_through = 1;
+	data->hash = DEFAULT_HASH_SIZE;
+	data->mask = 0xffff;
+	tp->root = data;
+	return 0;
+}
+
+/* Remove one filter result; @lock says whether to take the tree lock to unlink. */
+static int
+__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
+{
+	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
+	struct tcindex_filter *f = NULL;
+
+	pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f);
+	if (p->perfect) {
+		if (!r->res.class)	/* slot not bound to a class: nothing to delete */
+			return -ENOENT;
+	} else {
+		int i;
+		struct tcindex_filter **walk = NULL;
+
+		for (i = 0; i < p->hash; i++)	/* find the link that points at r's entry */
+			for (walk = p->h+i; *walk; walk = &(*walk)->next)
+				if (&(*walk)->result == r)
+					goto found;
+		return -ENOENT;
+
+found:
+		f = *walk;
+		if (lock)
+			tcf_tree_lock(tp);
+		*walk = f->next;
+		if (lock)
+			tcf_tree_unlock(tp);
+	}
+	tcf_unbind_filter(tp, &r->res);
+	tcf_exts_destroy(tp, &r->exts);
+	kfree(f);	/* f is still NULL in the perfect-hash case */
+	return 0;
+}
+/* .delete callback: __tcindex_delete() with the tree lock taken for unlinking. */
+static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	return __tcindex_delete(tp, arg, 1);
+}
+
+static inline int valid_perfect_hash(struct tcindex_data *p)
+{
+	/* Perfect hashing needs a slot for every value up to (mask >> shift). */
+	return (p->mask >> p->shift) < p->hash;
+}
+
+static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
+	[TCA_TCINDEX_MASK]		= { .type = NLA_U16 },	/* only u16 one */
+	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },
+	[TCA_TCINDEX_FALL_THROUGH]	= { .type = NLA_U32 },
+	[TCA_TCINDEX_HASH]		= { .type = NLA_U32 },
+	[TCA_TCINDEX_SHIFT]		= { .type = NLA_U32 },
+};
+/* Check new parameters on a copy of *p, then commit them under the tree lock. */
+static int
+tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
+		  struct tcindex_data *p, struct tcindex_filter_result *r,
+		  struct nlattr **tb, struct nlattr *est)
+{
+	int err, balloc = 0;
+	struct tcindex_filter_result new_filter_result, *old_r = r;
+	struct tcindex_filter_result cr;
+	struct tcindex_data cp;
+	struct tcindex_filter *f = NULL; /* make gcc behave */
+	struct tcf_exts e;
+
+	err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map);
+	if (err < 0)
+		return err;
+
+	memcpy(&cp, p, sizeof(cp));
+	memset(&new_filter_result, 0, sizeof(new_filter_result));
+	if (old_r)
+		memcpy(&cr, r, sizeof(cr));
+	else
+		memset(&cr, 0, sizeof(cr));
+
+	if (tb[TCA_TCINDEX_HASH])
+		cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+	if (tb[TCA_TCINDEX_MASK])
+		cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+	err = -EINVAL;
+	if (tb[TCA_TCINDEX_SHIFT]) {
+		/* The key is 16 bits: reject useless or undefined shift counts. */
+		cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+		if (cp.shift < 0 || cp.shift > 16)
+			goto errout;
+	}
+
+	err = -EBUSY;
+	/* Hash already allocated, make sure that we still meet the
+	 * requirements for the allocated hash.
+	 */
+	if (cp.perfect) {
+		if (!valid_perfect_hash(&cp) ||
+		    cp.hash > cp.alloc_hash)
+			goto errout;
+	} else if (cp.h && cp.hash != cp.alloc_hash)
+		goto errout;
+
+	err = -EINVAL;
+	if (tb[TCA_TCINDEX_FALL_THROUGH])
+		cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
+
+	if (!cp.hash) {
+		/* Hash not specified, use perfect hash if the upper limit
+		 * of the hashing index is below the threshold.
+		 */
+		if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
+			cp.hash = (cp.mask >> cp.shift) + 1;
+		else
+			cp.hash = DEFAULT_HASH_SIZE;
+	}
+
+	if (!cp.perfect && !cp.h)
+		cp.alloc_hash = cp.hash;
+
+	/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
+	 * but then, we'd fail handles that may become valid after some future
+	 * mask change. While this is extremely unlikely to ever matter,
+	 * the check below is safer (and also more backwards-compatible).
+	 */
+	if (cp.perfect || valid_perfect_hash(&cp))
+		if (handle >= cp.alloc_hash)
+			goto errout;
+
+	err = -ENOMEM;
+	if (!cp.perfect && !cp.h) {
+		if (valid_perfect_hash(&cp)) {
+			cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);
+			if (!cp.perfect)
+				goto errout;
+			balloc = 1;
+		} else {
+			cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL);
+			if (!cp.h)
+				goto errout;
+			balloc = 2;
+		}
+	}
+
+	if (cp.perfect)
+		r = cp.perfect + handle;
+	else
+		r = tcindex_lookup(&cp, handle) ? : &new_filter_result;
+
+	if (r == &new_filter_result) {
+		f = kzalloc(sizeof(*f), GFP_KERNEL);
+		if (!f)
+			goto errout_alloc;
+	}
+
+	if (tb[TCA_TCINDEX_CLASSID]) {
+		cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
+		tcf_bind_filter(tp, &cr.res, base);
+	}
+
+	tcf_exts_change(tp, &cr.exts, &e);
+	/* Commit: the updates below are made under the tree lock. */
+	tcf_tree_lock(tp);
+	if (old_r && old_r != r)
+		memset(old_r, 0, sizeof(*old_r));
+
+	memcpy(p, &cp, sizeof(cp));
+	memcpy(r, &cr, sizeof(cr));
+
+	if (r == &new_filter_result) {
+		struct tcindex_filter **fp;
+
+		f->key = handle;
+		f->result = new_filter_result;
+		f->next = NULL;
+		for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next)
+			/* nothing */;
+		*fp = f;
+	}
+	tcf_tree_unlock(tp);
+	return 0;
+
+errout_alloc:
+	if (balloc == 1)
+		kfree(cp.perfect);
+	else if (balloc == 2)
+		kfree(cp.h);
+errout:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+/* .change callback: parse TCA_OPTIONS and hand it to tcindex_set_parms(). */
+static int
+tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+	       struct nlattr **tca, unsigned long *arg)
+{
+	struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
+	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct tcindex_data *p = PRIV(tp);
+	int err;
+
+	pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
+	    "p %p,r %p,*arg 0x%lx\n",
+	    tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);
+
+	/* Without an options attribute there is nothing to change. */
+	if (opt == NULL)
+		return 0;
+	err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy);
+	if (err >= 0)
+		err = tcindex_set_parms(tp, base, handle, p, r, tb,
+					tca[TCA_RATE]);
+	return err;
+}
+
+/* Visit every installed filter result, honouring walker->skip and ->stop. */
+static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
+{
+	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_filter *f, *next;
+	unsigned long cookie;
+	int i;
+
+	pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
+	/* Perfect hash: slots whose res.class is 0 are treated as empty. */
+	if (p->perfect) {
+		for (i = 0; i < p->hash; i++) {
+			if (!p->perfect[i].res.class)
+				continue;
+			cookie = (unsigned long) (p->perfect + i);
+			if (walker->count >= walker->skip &&
+			    walker->fn(tp, cookie, walker) < 0)
+				goto stop;
+			walker->count++;
+		}
+	}
+	if (!p->h)
+		return;
+	for (i = 0; i < p->hash; i++) {
+		/* fn may free the entry, so fetch its successor first. */
+		for (f = p->h[i]; f; f = next) {
+			next = f->next;
+			cookie = (unsigned long) &f->result;
+			if (walker->count >= walker->skip &&
+			    walker->fn(tp, cookie, walker) < 0)
+				goto stop;
+			walker->count++;
+		}
+	}
+	return;
+
+stop:
+	walker->stop = 1;
+}
+
+/* Walker callback for tcindex_destroy(): delete one element without locking. */
+static int tcindex_destroy_element(struct tcf_proto *tp,
+    unsigned long arg, struct tcf_walker *walker)
+{
+	return __tcindex_delete(tp, arg, 0);
+}
+
+/* Tear down the classifier: drop every filter, then free tables and state. */
+static void tcindex_destroy(struct tcf_proto *tp)
+{
+	struct tcindex_data *data = PRIV(tp);
+	struct tcf_walker walker;
+
+	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, data);
+	/* Reuse the walker to delete each element (no tree lock taken). */
+	walker.fn = &tcindex_destroy_element;
+	walker.count = walker.skip = 0;
+	tcindex_walk(tp, &walker);
+	kfree(data->h);
+	kfree(data->perfect);
+	tp->root = NULL;
+	kfree(data);
+}
+
+/* Dump classifier-wide parameters (fh == 0) or one filter (fh != 0) into @skb. */
+static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
+    struct sk_buff *skb, struct tcmsg *t)
+{
+	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct nlattr *nest;
+
+	pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
+		 tp, fh, skb, t, p, r, b);
+	pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (!fh) {
+		t->tcm_handle = ~0; /* whatever ... */
+		NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash);
+		NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask);
+		NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift);
+		NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through);
+		nla_nest_end(skb, nest);
+	} else {
+		if (p->perfect) {
+			t->tcm_handle = r-p->perfect;	/* handle == array index */
+		} else {
+			struct tcindex_filter *f;
+			int i;
+
+			t->tcm_handle = 0;	/* search the buckets for r's key */
+			for (i = 0; !t->tcm_handle && i < p->hash; i++) {
+				for (f = p->h[i]; !t->tcm_handle && f;
+				     f = f->next) {
+					if (&f->result == r)
+						t->tcm_handle = f->key;
+				}
+			}
+		}
+		pr_debug("handle = %d\n", t->tcm_handle);
+		if (r->res.class)
+			NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid);
+
+		if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0)
+			goto nla_put_failure;
+		nla_nest_end(skb, nest);
+
+		if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0)
+			goto nla_put_failure;
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* trim the message back to where we started */
+	return -1;
+}
+
+static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
+	.owner		=	THIS_MODULE,
+	.kind		=	"tcindex",
+	.init		=	tcindex_init,
+	.destroy	=	tcindex_destroy,
+	.classify	=	tcindex_classify,
+	.get		=	tcindex_get,
+	.put		=	tcindex_put,
+	.change		=	tcindex_change,
+	.delete		=	tcindex_delete,
+	.walk		=	tcindex_walk,
+	.dump		=	tcindex_dump,
+};
+/* Module entry point: register the "tcindex" classifier ops. */
+static int __init init_tcindex(void)
+{
+	return register_tcf_proto_ops(&cls_tcindex_ops);
+}
+/* Module exit point: unregister the "tcindex" classifier ops. */
+static void __exit exit_tcindex(void)
+{
+	unregister_tcf_proto_ops(&cls_tcindex_ops);
+}
+
+module_init(init_tcindex)
+module_exit(exit_tcindex)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
new file mode 100644
index 00000000..3b93fc0c
--- /dev/null
+++ b/net/sched/cls_u32.c
@@ -0,0 +1,817 @@
+/*
+ * net/sched/cls_u32.c	Ugly (or Universal) 32bit key Packet Classifier.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	The filters are packed to hash tables of key nodes
+ *	with a set of 32bit key/mask pairs at every node.
+ *	Nodes reference next level hash tables etc.
+ *
+ *	This scheme is the best universal classifier I managed to
+ *	invent; it is not super-fast, but it is not slow (provided you
+ *	program it correctly), and general enough.  And its relative
+ *	speed grows as the number of rules becomes larger.
+ *
+ *	It seems that it represents the best middle point between
+ *	speed and manageability both by human and by machine.
+ *
+ *	It is especially useful for link sharing combined with QoS;
+ *	pure RSVP doesn't need such a general approach and can use
+ *	much simpler (and faster) schemes, sort of cls_rsvp.c.
+ *
+ *	JHS: We should remove the CONFIG_NET_CLS_IND from here
+ *	eventually when the meta match extension is made available
+ *
+ *	nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+struct tc_u_knode {
+	struct tc_u_knode	*next;		/* next key node in the same bucket */
+	u32			handle;
+	struct tc_u_hnode	*ht_up;		/* table whose bucket holds this node */
+	struct tcf_exts		exts;
+#ifdef CONFIG_NET_CLS_IND
+	char                     indev[IFNAMSIZ];
+#endif
+	u8			fshift;		/* derived from sel.hmask in u32_change() */
+	struct tcf_result	res;
+	struct tc_u_hnode	*ht_down;	/* linked table to descend into, if any */
+#ifdef CONFIG_CLS_U32_PERF
+	struct tc_u32_pcnt	*pf;
+#endif
+#ifdef CONFIG_CLS_U32_MARK
+	struct tc_u32_mark	mark;
+#endif
+	struct tc_u32_sel	sel;		/* last: nkeys keys are stored behind it */
+};
+
+struct tc_u_hnode {
+	struct tc_u_hnode	*next;		/* next table on tp_c->hlist */
+	u32			handle;
+	u32			prio;		/* tp->prio of the creating classifier */
+	struct tc_u_common	*tp_c;
+	int			refcnt;
+	unsigned int		divisor;	/* highest valid bucket index */
+	struct tc_u_knode	*ht[1];		/* really divisor + 1 buckets */
+};
+
+struct tc_u_common {
+	struct tc_u_hnode	*hlist;		/* all hash tables sharing this state */
+	struct Qdisc		*q;		/* qdisc whose u32_node points here */
+	int			refcnt;		/* one per u32_init() using this state */
+	u32			hgenerator;	/* counter used by gen_new_htid() */
+};
+
+static const struct tcf_ext_map u32_ext_map = {
+	.police = TCA_U32_POLICE,	/* attribute ids of the two extension kinds */
+	.action = TCA_U32_ACT
+};
+
+static inline unsigned int u32_hash_fold(__be32 key,
+					 const struct tc_u32_sel *sel,
+					 u8 fshift)
+{
+	__be32 masked = key & sel->hmask;	/* still in network byte order */
+
+	return ntohl(masked) >> fshift;
+}
+/* Classify @skb by walking the key-node tree that starts at the root hash table. */
+static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
+{
+	struct {
+		struct tc_u_knode *knode;
+		unsigned int	  off;
+	} stack[TC_U32_MAXDEPTH];
+	/* stack[]: parents to resume from once a linked table is exhausted */
+	struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root;
+	unsigned int off = skb_network_offset(skb);
+	struct tc_u_knode *n;
+	int sdepth = 0;
+	int off2 = 0;
+	int sel = 0;
+#ifdef CONFIG_CLS_U32_PERF
+	int j;
+#endif
+	int i, r;
+
+next_ht:
+	n = ht->ht[sel];
+
+next_knode:
+	if (n) {
+		struct tc_u32_key *key = n->sel.keys;
+
+#ifdef CONFIG_CLS_U32_PERF
+		n->pf->rcnt += 1;
+		j = 0;
+#endif
+
+#ifdef CONFIG_CLS_U32_MARK
+		if ((skb->mark & n->mark.mask) != n->mark.val) {
+			n = n->next;
+			goto next_knode;
+		} else {
+			n->mark.success++;
+		}
+#endif
+
+		for (i = n->sel.nkeys; i > 0; i--, key++) {
+			int toff = off + key->off + (off2 & key->offmask);
+			__be32 *data, hdata;
+
+			if (skb_headroom(skb) + toff > INT_MAX)
+				goto out;
+
+			data = skb_header_pointer(skb, toff, 4, &hdata);
+			if (!data)
+				goto out;
+			if ((*data ^ key->val) & key->mask) {	/* key mismatch */
+				n = n->next;
+				goto next_knode;
+			}
+#ifdef CONFIG_CLS_U32_PERF
+			n->pf->kcnts[j] += 1;
+			j++;
+#endif
+		}
+		if (n->ht_down == NULL) {
+check_terminal:
+			if (n->sel.flags & TC_U32_TERMINAL) {
+
+				*res = n->res;
+#ifdef CONFIG_NET_CLS_IND
+				if (!tcf_match_indev(skb, n->indev)) {
+					n = n->next;
+					goto next_knode;
+				}
+#endif
+#ifdef CONFIG_CLS_U32_PERF
+				n->pf->rhit += 1;
+#endif
+				r = tcf_exts_exec(skb, &n->exts, res);
+				if (r < 0) {
+					n = n->next;
+					goto next_knode;
+				}
+
+				return r;
+			}
+			n = n->next;
+			goto next_knode;
+		}
+
+		/* PUSH */
+		if (sdepth >= TC_U32_MAXDEPTH)
+			goto deadloop;
+		stack[sdepth].knode = n;
+		stack[sdepth].off = off;
+		sdepth++;
+
+		ht = n->ht_down;
+		sel = 0;
+		if (ht->divisor) {	/* divisor doubles as the bucket mask below */
+			__be32 *data, hdata;
+
+			data = skb_header_pointer(skb, off + n->sel.hoff, 4,
+						  &hdata);
+			if (!data)
+				goto out;
+			sel = ht->divisor & u32_hash_fold(*data, &n->sel,
+							  n->fshift);
+		}
+		if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
+			goto next_ht;
+
+		if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
+			off2 = n->sel.off + 3;
+			if (n->sel.flags & TC_U32_VAROFFSET) {
+				__be16 *data, hdata;
+
+				data = skb_header_pointer(skb,
+							  off + n->sel.offoff,
+							  2, &hdata);
+				if (!data)
+					goto out;
+				off2 += ntohs(n->sel.offmask & *data) >>
+					n->sel.offshift;
+			}
+			off2 &= ~3;	/* together with the +3 above: round up to 4 */
+		}
+		if (n->sel.flags & TC_U32_EAT) {
+			off += off2;
+			off2 = 0;
+		}
+
+		if (off < skb->len)
+			goto next_ht;
+	}
+
+	/* POP */
+	if (sdepth--) {
+		n = stack[sdepth].knode;
+		ht = n->ht_up;
+		off = stack[sdepth].off;
+		goto check_terminal;
+	}
+out:
+	return -1;
+
+deadloop:
+	if (net_ratelimit())
+		pr_warning("cls_u32: dead loop\n");
+	return -1;
+}
+/* Find the hash table with exactly this handle on tp_c's list, or NULL. */
+static struct tc_u_hnode *
+u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
+{
+	struct tc_u_hnode *ht;
+
+	for (ht = tp_c->hlist; ht != NULL; ht = ht->next) {
+		if (ht->handle == handle)
+			return ht;
+	}
+	return NULL;
+}
+/* Find the key node with this exact handle in table @ht, or NULL. */
+static struct tc_u_knode *
+u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
+{
+	unsigned int bucket = TC_U32_HASH(handle);
+	struct tc_u_knode *node;
+
+	/* The handle encodes a bucket index; reject ones beyond the table. */
+	if (bucket > ht->divisor)
+		return NULL;
+
+	node = ht->ht[bucket];
+	while (node && node->handle != handle)
+		node = node->next;
+
+	return node;
+}
+
+/* Resolve a handle to a hash table (key part 0) or a key node; 0 if absent. */
+static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
+{
+	u32 htid = TC_U32_HTID(handle);
+	struct tc_u_hnode *ht = tp->root;
+
+	/* TC_U32_ROOT selects this classifier's own root table. */
+	if (htid != TC_U32_ROOT)
+		ht = u32_lookup_ht(tp->data, htid);
+
+	if (ht == NULL)
+		return 0;
+
+	/* A handle without a key part names the table itself. */
+	if (TC_U32_KEY(handle))
+		return (unsigned long)u32_lookup_key(ht, handle);
+
+	return (unsigned long)ht;
+}
+/* .put callback: empty, nothing is done for a reference from u32_get(). */
+static void u32_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+/* Return an unused table handle, (n | 0x800) << 20, or 0 if none was found. */
+static u32 gen_new_htid(struct tc_u_common *tp_c)
+{
+	int i = 0x800;	/* upper bound on the number of candidates tried */
+
+	do {
+		if (++tp_c->hgenerator == 0x7FF)
+			tp_c->hgenerator = 1;	/* wrap around, skipping 0 */
+	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
+
+	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
+}
+/* Create the root hash table and attach to (or create) the qdisc's shared state. */
+static int u32_init(struct tcf_proto *tp)
+{
+	struct tc_u_hnode *root_ht;
+	struct tc_u_common *tp_c;
+
+	tp_c = tp->q->u32_node;	/* NULL when this is the qdisc's first u32 instance */
+
+	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
+	if (root_ht == NULL)
+		return -ENOBUFS;
+
+	root_ht->divisor = 0;
+	root_ht->refcnt++;
+	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
+	root_ht->prio = tp->prio;
+
+	if (tp_c == NULL) {
+		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
+		if (tp_c == NULL) {
+			kfree(root_ht);
+			return -ENOBUFS;
+		}
+		tp_c->q = tp->q;
+		tp->q->u32_node = tp_c;
+	}
+
+	tp_c->refcnt++;
+	root_ht->next = tp_c->hlist;	/* push the root table onto the shared list */
+	tp_c->hlist = root_ht;
+	root_ht->tp_c = tp_c;
+
+	tp->root = root_ht;
+	tp->data = tp_c;
+	return 0;
+}
+/* Release everything a key node holds, then free the node itself. */
+static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
+{
+	tcf_unbind_filter(tp, &n->res);
+	tcf_exts_destroy(tp, &n->exts);
+	if (n->ht_down)
+		n->ht_down->refcnt--;	/* undo the refcnt++ done when linking */
+#ifdef CONFIG_CLS_U32_PERF
+	kfree(n->pf);
+#endif
+	kfree(n);
+	return 0;
+}
+/* Unlink @key from its parent table's bucket chain and destroy it. */
+static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
+{
+	struct tc_u_hnode *parent = key->ht_up;
+	struct tc_u_knode **link = NULL;
+
+	if (parent)
+		link = &parent->ht[TC_U32_HASH(key->handle)];
+	while (link && *link && *link != key)
+		link = &(*link)->next;
+	if (!link || !*link) {
+		WARN_ON(1);	/* key is not on its parent's chain */
+		return 0;
+	}
+
+	tcf_tree_lock(tp);
+	*link = key->next;
+	tcf_tree_unlock(tp);
+	u32_destroy_key(tp, key);
+	return 0;
+}
+/* Destroy every key node in every bucket of @ht, leaving the table empty. */
+static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+{
+	struct tc_u_knode *victim;
+	unsigned int bucket;
+
+	for (bucket = 0; bucket <= ht->divisor; bucket++) {
+		for (victim = ht->ht[bucket]; victim; victim = ht->ht[bucket]) {
+			/* Unlink from the bucket head before freeing. */
+			ht->ht[bucket] = victim->next;
+			u32_destroy_key(tp, victim);
+		}
+	}
+}
+/* Empty @ht, unlink it from the shared table list and free it. */
+static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+{
+	struct tc_u_common *tp_c = tp->data;
+	struct tc_u_hnode **link = &tp_c->hlist;
+
+	WARN_ON(ht->refcnt);	/* callers drop the last reference first */
+	u32_clear_hnode(tp, ht);
+
+	while (*link && *link != ht)
+		link = &(*link)->next;
+	if (*link == NULL) {
+		/* Not on the list: complain and leave @ht allocated. */
+		WARN_ON(1);
+		return -ENOENT;
+	}
+
+	*link = ht->next;
+	kfree(ht);
+	return 0;
+}
+/* Drop this instance's root table and, with the last user, all shared state. */
+static void u32_destroy(struct tcf_proto *tp)
+{
+	struct tc_u_common *tp_c = tp->data;
+	struct tc_u_hnode *root_ht = tp->root;
+
+	WARN_ON(root_ht == NULL);
+
+	if (root_ht && --root_ht->refcnt == 0)
+		u32_destroy_hnode(tp, root_ht);
+
+	if (--tp_c->refcnt == 0) {	/* last classifier using the shared state */
+		struct tc_u_hnode *ht;
+
+		tp->q->u32_node = NULL;
+
+		for (ht = tp_c->hlist; ht; ht = ht->next) {
+			ht->refcnt--;
+			u32_clear_hnode(tp, ht);
+		}
+
+		while ((ht = tp_c->hlist) != NULL) {
+			tp_c->hlist = ht->next;
+
+			WARN_ON(ht->refcnt != 0);
+
+			kfree(ht);
+		}
+
+		kfree(tp_c);
+	}
+
+	tp->data = NULL;
+}
+/* .delete callback: @arg is either a key node or a whole hash table. */
+static int u32_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
+
+	if (!ht)
+		return 0;
+
+	/* Both node types keep "handle" right after "next", so peek at it. */
+	if (TC_U32_KEY(ht->handle) != 0)
+		return u32_delete_key(tp, (struct tc_u_knode *)ht);
+
+	/* The root table cannot be deleted through this path. */
+	if (ht == tp->root)
+		return -EINVAL;
+
+	/* Busy if referenced beyond the reference taken at creation. */
+	if (ht->refcnt != 1)
+		return -EBUSY;
+	ht->refcnt = 0;
+	u32_destroy_hnode(tp, ht);
+	return 0;
+}
+/* New key handle for a bucket: highest node id in use (min 0x7FF) plus one. */
+static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
+{
+	struct tc_u_knode *n = ht->ht[TC_U32_HASH(handle)];
+	unsigned int top = 0x7FF;
+
+	for (; n; n = n->next)
+		if (TC_U32_NODE(n->handle) > top)
+			top = TC_U32_NODE(n->handle);
+
+	/* Clamp the generated node id at 0xFFF. */
+	return handle | (top + 1 > 0xFFF ? 0xFFF : top + 1);
+}
+
+static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
+	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
+	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
+	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
+	[TCA_U32_CLASSID]	= { .type = NLA_U32 },
+	[TCA_U32_DIVISOR]	= { .type = NLA_U32 },
+	[TCA_U32_HASH]		= { .type = NLA_U32 },
+	[TCA_U32_LINK]		= { .type = NLA_U32 },
+};
+/* Apply link, classid, indev and extension attributes from @tb to key node @n. */
+static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
+			 struct tc_u_hnode *ht,
+			 struct tc_u_knode *n, struct nlattr **tb,
+			 struct nlattr *est)
+{
+	int err;
+	struct tcf_exts e;
+
+	err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (tb[TCA_U32_LINK]) {
+		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);
+		struct tc_u_hnode *ht_down = NULL, *ht_old;
+
+		if (TC_U32_KEY(handle))	/* a link must name a table, not a key */
+			goto errout;
+
+		if (handle) {	/* handle 0 leaves ht_down NULL, i.e. unlinks */
+			ht_down = u32_lookup_ht(ht->tp_c, handle);
+
+			if (ht_down == NULL)
+				goto errout;
+			ht_down->refcnt++;
+		}
+
+		tcf_tree_lock(tp);
+		ht_old = n->ht_down;
+		n->ht_down = ht_down;
+		tcf_tree_unlock(tp);
+
+		if (ht_old)
+			ht_old->refcnt--;
+	}
+	if (tb[TCA_U32_CLASSID]) {
+		n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
+		tcf_bind_filter(tp, &n->res, base);
+	}
+
+#ifdef CONFIG_NET_CLS_IND
+	if (tb[TCA_U32_INDEV]) {
+		err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]);
+		if (err < 0)
+			goto errout;
+	}
+#endif
+	tcf_exts_change(tp, &n->exts, &e);
+
+	return 0;
+errout:
+	tcf_exts_destroy(tp, &e);
+	return err;
+}
+/* .change callback: update an existing key node, or create a table or key node. */
+static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+		      struct nlattr **tca, unsigned long *arg)
+{
+	struct tc_u_common *tp_c = tp->data;
+	struct tc_u_hnode *ht;
+	struct tc_u_knode *n;
+	struct tc_u32_sel *s;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_U32_MAX + 1];
+	size_t sel_size;
+	u32 htid;
+	int err;
+
+	if (opt == NULL)
+		return handle ? -EINVAL : 0;
+
+	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy);
+	if (err < 0)
+		return err;
+
+	n = (struct tc_u_knode *)*arg;
+	if (n) {
+		if (TC_U32_KEY(n->handle) == 0)
+			return -EINVAL;
+		return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]);
+	}
+	/* A divisor attribute asks for a new hash table instead of a key node. */
+	if (tb[TCA_U32_DIVISOR]) {
+		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
+
+		if (--divisor > 0x100)
+			return -EINVAL;
+		if (TC_U32_KEY(handle))
+			return -EINVAL;
+		if (handle == 0) {
+			handle = gen_new_htid(tp->data);
+			if (handle == 0)
+				return -ENOMEM;
+		}
+		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
+		if (ht == NULL)
+			return -ENOBUFS;
+		ht->tp_c = tp_c;
+		ht->refcnt = 1;
+		ht->divisor = divisor;
+		ht->handle = handle;
+		ht->prio = tp->prio;
+		ht->next = tp_c->hlist;
+		tp_c->hlist = ht;
+		*arg = (unsigned long)ht;
+		return 0;
+	}
+	/* Creating a key node: pick the table (and bucket) it will live in. */
+	if (tb[TCA_U32_HASH]) {
+		htid = nla_get_u32(tb[TCA_U32_HASH]);
+		if (TC_U32_HTID(htid) == TC_U32_ROOT) {
+			ht = tp->root;
+			htid = ht->handle;
+		} else {
+			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
+			if (ht == NULL)
+				return -EINVAL;
+		}
+	} else {
+		ht = tp->root;
+		htid = ht->handle;
+	}
+
+	if (ht->divisor < TC_U32_HASH(htid))
+		return -EINVAL;
+
+	if (handle) {
+		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
+			return -EINVAL;
+		handle = htid | TC_U32_NODE(handle);
+	} else
+		handle = gen_new_kid(ht, htid);
+
+	if (tb[TCA_U32_SEL] == NULL)
+		return -EINVAL;
+	/* nkeys comes from user space: all keys must lie inside the attribute. */
+	s = nla_data(tb[TCA_U32_SEL]);
+	sel_size = sizeof(*s) + s->nkeys * sizeof(struct tc_u32_key);
+	if (nla_len(tb[TCA_U32_SEL]) < sel_size)
+		return -EINVAL;
+
+	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
+	if (n == NULL)
+		return -ENOBUFS;
+
+#ifdef CONFIG_CLS_U32_PERF
+	n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL);
+	if (n->pf == NULL) {
+		kfree(n);
+		return -ENOBUFS;
+	}
+#endif
+
+	memcpy(&n->sel, s, sel_size);
+	n->ht_up = ht;
+	n->handle = handle;
+	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+
+#ifdef CONFIG_CLS_U32_MARK
+	if (tb[TCA_U32_MARK]) {
+		struct tc_u32_mark *mark;
+
+		mark = nla_data(tb[TCA_U32_MARK]);
+		memcpy(&n->mark, mark, sizeof(struct tc_u32_mark));
+		n->mark.success = 0;
+	}
+#endif
+
+	err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]);
+	if (err == 0) {
+		struct tc_u_knode **ins;
+		for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
+			if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
+				break;
+		n->next = *ins;
+		tcf_tree_lock(tp);
+		*ins = n;
+		tcf_tree_unlock(tp);
+		*arg = (unsigned long)n;
+		return 0;
+	}
+#ifdef CONFIG_CLS_U32_PERF
+	kfree(n->pf);
+#endif
+	kfree(n);
+	return err;
+}
+/* Visit each hash table of this priority and all of its key nodes. */
+static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct tc_u_common *tp_c = tp->data;
+	struct tc_u_hnode *ht;
+	struct tc_u_knode *n;
+	unsigned int h;
+
+	if (arg->stop)
+		return;
+
+	for (ht = tp_c->hlist; ht; ht = ht->next) {
+		/* The list is shared; only tables of this tp's prio are ours. */
+		if (ht->prio != tp->prio)
+			continue;
+
+		/* The table itself is reported first, then each of its keys. */
+		if (arg->count >= arg->skip &&
+		    arg->fn(tp, (unsigned long)ht, arg) < 0)
+			goto stopped;
+		arg->count++;
+
+		for (h = 0; h <= ht->divisor; h++) {
+			for (n = ht->ht[h]; n; n = n->next) {
+				if (arg->count >= arg->skip &&
+				    arg->fn(tp, (unsigned long)n, arg) < 0)
+					goto stopped;
+				arg->count++;
+			}
+		}
+	}
+	return;
+
+stopped:
+	arg->stop = 1;
+}
+/* Dump one hash table (handle without key part) or one key node into @skb. */
+static int u32_dump(struct tcf_proto *tp, unsigned long fh,
+		     struct sk_buff *skb, struct tcmsg *t)
+{
+	struct tc_u_knode *n = (struct tc_u_knode *)fh;
+	struct nlattr *nest;
+
+	if (n == NULL)
+		return skb->len;
+
+	t->tcm_handle = n->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (TC_U32_KEY(n->handle) == 0) {	/* fh is really a hash table */
+		struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
+		u32 divisor = ht->divisor + 1;	/* stored as count - 1 */
+
+		NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
+	} else {
+		NLA_PUT(skb, TCA_U32_SEL,
+			sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
+			&n->sel);
+		if (n->ht_up) {
+			u32 htid = n->handle & 0xFFFFF000;
+			NLA_PUT_U32(skb, TCA_U32_HASH, htid);
+		}
+		if (n->res.classid)
+			NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid);
+		if (n->ht_down)
+			NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle);
+
+#ifdef CONFIG_CLS_U32_MARK
+		if (n->mark.val || n->mark.mask)
+			NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark);
+#endif
+
+		if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0)
+			goto nla_put_failure;
+
+#ifdef CONFIG_NET_CLS_IND
+		if (strlen(n->indev))
+			NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
+#endif
+#ifdef CONFIG_CLS_U32_PERF
+		NLA_PUT(skb, TCA_U32_PCNT,
+		sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
+			n->pf);
+#endif
+	}
+
+	nla_nest_end(skb, nest);
+
+	if (TC_U32_KEY(n->handle))	/* extension stats exist only for key nodes */
+		if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0)
+			goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static struct tcf_proto_ops cls_u32_ops __read_mostly = {
+	.owner		=	THIS_MODULE,
+	.kind		=	"u32",
+	.init		=	u32_init,
+	.destroy	=	u32_destroy,
+	.classify	=	u32_classify,
+	.get		=	u32_get,
+	.put		=	u32_put,
+	.change		=	u32_change,
+	.delete		=	u32_delete,
+	.walk		=	u32_walk,
+	.dump		=	u32_dump,
+};
+/* Module entry point: log the compiled-in options and register the classifier. */
+static int __init init_u32(void)
+{
+	pr_info("u32 classifier\n");
+#ifdef CONFIG_CLS_U32_PERF
+	pr_info("    Performance counters on\n");
+#endif
+#ifdef CONFIG_NET_CLS_IND
+	pr_info("    input device check on\n");
+#endif
+#ifdef CONFIG_NET_CLS_ACT
+	pr_info("    Actions configured\n");
+#endif
+	return register_tcf_proto_ops(&cls_u32_ops);
+}
+/* Module exit point: unregister the "u32" classifier ops. */
+static void __exit exit_u32(void)
+{
+	unregister_tcf_proto_ops(&cls_u32_ops);
+}
+
+module_init(init_u32)
+module_exit(exit_u32)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
new file mode 100644
index 00000000..1c8360a2
--- /dev/null
+++ b/net/sched/em_cmp.c
@@ -0,0 +1,99 @@
+/*
+ * net/sched/em_cmp.c	Simple packet data comparison ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_cmp.h>
+#include <asm/unaligned.h>
+#include <net/pkt_cls.h>
+
+/* Returns non-zero when TCF_EM_CMP_TRANS is set in the match flags, i.e.
+ * the value read from the packet has to go through the byte-order
+ * conversion before it is compared. Hinted as the uncommon case. */
+static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
+{
+	const int trans_requested = cmp->flags & TCF_EM_CMP_TRANS;
+
+	return unlikely(trans_requested);
+}
+
+/*
+ * Match callback of the "cmp" ematch.
+ *
+ * Reads a 1, 2 or 4 byte value located cmp->off bytes past the base
+ * pointer of the configured layer, optionally converts and masks it, and
+ * compares the result against cmp->val with the configured operand.
+ *
+ * Returns non-zero on a match. Returns 0 when there is no match, when
+ * tcf_valid_offset() rejects the location, or when the alignment or the
+ * operand is not one of the known values.
+ */
+static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
+			struct tcf_pkt_info *info)
+{
+	struct tcf_em_cmp *cfg = (struct tcf_em_cmp *) em->data;
+	unsigned char *pos = tcf_get_base_ptr(skb, cfg->layer) + cfg->off;
+	u32 got = 0;
+
+	/* cfg->align doubles as the number of bytes that will be read */
+	if (!tcf_valid_offset(skb, pos, cfg->align))
+		return 0;
+
+	if (cfg->align == TCF_EM_ALIGN_U8) {
+		got = *pos;
+	} else if (cfg->align == TCF_EM_ALIGN_U16) {
+		got = get_unaligned_be16(pos);
+
+		if (cmp_needs_transformation(cfg))
+			got = be16_to_cpu(got);
+	} else if (cfg->align == TCF_EM_ALIGN_U32) {
+		/* Worth checking boundaries? The branching seems
+		 * to get worse. Visit again.
+		 */
+		got = get_unaligned_be32(pos);
+
+		if (cmp_needs_transformation(cfg))
+			got = be32_to_cpu(got);
+	} else {
+		return 0;
+	}
+
+	/* A zero mask means "no masking", not "mask everything out" */
+	if (cfg->mask)
+		got &= cfg->mask;
+
+	if (cfg->opnd == TCF_EM_OPND_EQ)
+		return got == cfg->val;
+	if (cfg->opnd == TCF_EM_OPND_LT)
+		return got < cfg->val;
+	if (cfg->opnd == TCF_EM_OPND_GT)
+		return got > cfg->val;
+
+	return 0;
+}
+
+/* Ematch operations for TCF_EM_CMP. Only a fixed .datalen is given (no
+ * .change/.destroy/.dump callbacks), so the configuration is a plain
+ * struct tcf_em_cmp. */
+static struct tcf_ematch_ops em_cmp_ops = {
+	.kind	  = TCF_EM_CMP,
+	.datalen  = sizeof(struct tcf_em_cmp),
+	.match	  = em_cmp_match,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_cmp_ops.link)
+};
+
+/* Module entry point: registers the cmp ematch; returns the result of
+ * tcf_em_register(). */
+static int __init init_em_cmp(void)
+{
+	return tcf_em_register(&em_cmp_ops);
+}
+
+/* Module exit point: unregisters the cmp ematch. */
+static void __exit exit_em_cmp(void)
+{
+	tcf_em_unregister(&em_cmp_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_cmp);
+module_exit(exit_em_cmp);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
new file mode 100644
index 00000000..49130e8a
--- /dev/null
+++ b/net/sched/em_meta.c
@@ -0,0 +1,879 @@
+/*
+ * net/sched/em_meta.c	Metadata ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ * 	The metadata ematch compares two meta objects where each object
+ * 	represents either a meta value stored in the kernel or a static
+ * 	value provided by userspace. The objects are not provided by
+ * 	userspace itself but rather a definition providing the information
+ * 	to build them. Every object is of a certain type which must be
+ * 	equal to the object it is being compared to.
+ *
+ * 	The definition of an object consists of the type (meta type), an
+ * 	identifier (meta id) and additional type specific information.
+ * 	The meta id is either TCF_META_ID_VALUE for values provided by
+ * 	userspace or an index to the meta operations table consisting of
+ * 	function pointers to type specific meta data collectors returning
+ * 	the value of the requested meta value.
+ *
+ * 	         lvalue                                   rvalue
+ * 	      +-----------+                           +-----------+
+ * 	      | type: INT |                           | type: INT |
+ * 	 def  | id: DEV   |                           | id: VALUE |
+ * 	      | data:     |                           | data: 3   |
+ * 	      +-----------+                           +-----------+
+ * 	            |                                       |
+ * 	            ---> meta_ops[INT][DEV](...)            |
+ *	                      |                             |
+ * 	            -----------                             |
+ * 	            V                                       V
+ * 	      +-----------+                           +-----------+
+ * 	      | type: INT |                           | type: INT |
+ * 	 obj  | id: DEV |                             | id: VALUE |
+ * 	      | data: 2   |<--data got filled out     | data: 3   |
+ * 	      +-----------+                           +-----------+
+ * 	            |                                         |
+ * 	            --------------> 2  equals 3 <--------------
+ *
+ * 	This is a simplified schema, the complexity varies depending
+ * 	on the meta type. Obviously, the length of the data must also
+ * 	be provided for non-numeric types.
+ *
+ * 	Additionally, type dependent modifiers such as shift operators
+ * 	or mask may be applied to extend the functionality. As of now,
+ * 	the variable length type supports shifting the byte string to
+ * 	the right, eating up any number of octets and thus supporting
+ * 	wildcard interface name comparisons such as "ppp%" matching
+ * 	ppp0..9.
+ *
+ * 	NOTE: Certain meta values depend on other subsystems and are
+ * 	      only available if that subsystem is enabled in the kernel.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/if_vlan.h>
+#include <linux/tc_ematch/tc_em_meta.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+
+/* Result of collecting one side of a comparison at match time. */
+struct meta_obj {
+	/* Integer value, or a pointer cast to unsigned long when the
+	 * object is of the variable-length type. */
+	unsigned long		value;
+	/* Length in bytes of the data behind value (variable-length type) */
+	unsigned int		len;
+};
+
+/* Definition of one side of a comparison as configured by userspace. */
+struct meta_value {
+	/* Header copied from the userspace request (kind, shift, op) */
+	struct tcf_meta_val	hdr;
+	/* Static value set by the type's change() operation: an integer,
+	 * or a kmemdup()ed buffer pointer for the variable-length type.
+	 * For collected integers it is also applied as a mask. */
+	unsigned long		val;
+	/* Length in bytes of the data stored in / pointed to by val */
+	unsigned int		len;
+};
+
+/* Private data of one meta ematch: the two operands that are compared. */
+struct meta_match {
+	struct meta_value	lvalue;
+	struct meta_value	rvalue;
+};
+
+/* Extract the meta id encoded in the kind field of the value header. */
+static inline int meta_id(struct meta_value *v)
+{
+	return TCF_META_ID(v->hdr.kind);
+}
+
+/* Extract the meta type encoded in the kind field of the value header. */
+static inline int meta_type(struct meta_value *v)
+{
+	return TCF_META_TYPE(v->hdr.kind);
+}
+
+/* Opens the definition of a collector function named meta_<FUNC>. A
+ * collector stores the collected value in *dst and signals failure by
+ * writing a negative value to *err. */
+#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
+	struct tcf_pkt_info *info, struct meta_value *v, \
+	struct meta_obj *dst, int *err)
+
+/**************************************************************************
+ * System status & misc
+ **************************************************************************/
+
+/* Fills the whole unsigned long value with random bytes. */
+META_COLLECTOR(int_random)
+{
+	get_random_bytes(&dst->value, sizeof(dst->value));
+}
+
+/* Convert a fixed-point load value (FSHIFT fractional bits) into an
+ * integer scaled by 100: the whole part times 100 plus the first two
+ * decimal digits. FIXED_1/200 is added beforehand for rounding. */
+static inline unsigned long fixed_loadavg(int load)
+{
+	int rounded = load + (FIXED_1/200);
+	int whole = rounded >> FSHIFT;
+	int hundredths = ((rounded & (FIXED_1-1)) * 100) >> FSHIFT;
+
+	return (whole * 100) + hundredths;
+}
+
+/* The three collectors below report avenrun[0], avenrun[1] and
+ * avenrun[2] respectively, each converted by fixed_loadavg(). */
+META_COLLECTOR(int_loadavg_0)
+{
+	dst->value = fixed_loadavg(avenrun[0]);
+}
+
+META_COLLECTOR(int_loadavg_1)
+{
+	dst->value = fixed_loadavg(avenrun[1]);
+}
+
+META_COLLECTOR(int_loadavg_2)
+{
+	dst->value = fixed_loadavg(avenrun[2]);
+}
+
+/**************************************************************************
+ * Device names & indices
+ **************************************************************************/
+
+/* Store the interface index of dev in dst->value.
+ * Returns 0 on success, -1 when dev is NULL. */
+static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
+{
+	if (likely(dev != NULL)) {
+		dst->value = dev->ifindex;
+		return 0;
+	}
+
+	return -1;
+}
+
+/* Point dst at the name of dev: dst->value receives the address of
+ * dev->name (not a copy) and dst->len its string length.
+ * Returns 0 on success, -1 when dev is NULL. */
+static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
+{
+	if (likely(dev != NULL)) {
+		dst->value = (unsigned long) dev->name;
+		dst->len = strlen(dev->name);
+		return 0;
+	}
+
+	return -1;
+}
+
+/* Interface index of skb->dev; *err is set to -1 when skb->dev is NULL. */
+META_COLLECTOR(int_dev)
+{
+	*err = int_dev(skb->dev, dst);
+}
+
+/* Interface name of skb->dev; *err is set to -1 when skb->dev is NULL. */
+META_COLLECTOR(var_dev)
+{
+	*err = var_dev(skb->dev, dst);
+}
+
+/**************************************************************************
+ * vlan tag
+ **************************************************************************/
+
+/* VLAN tag of the packet. The tag returned by vlan_tx_tag_get() is used
+ * when it is non-zero; otherwise __vlan_get_tag() is consulted and a
+ * non-zero return from it is reported as an error. */
+META_COLLECTOR(int_vlan_tag)
+{
+	unsigned short tag = vlan_tx_tag_get(skb);
+
+	if (!tag && __vlan_get_tag(skb, &tag)) {
+		*err = -1;
+		return;
+	}
+
+	dst->value = tag;
+}
+
+
+
+/**************************************************************************
+ * skb attributes
+ **************************************************************************/
+
+/* The collectors in this run copy a single sk_buff field (or the result
+ * of skb_get_rxhash()) into dst->value and never set *err. */
+META_COLLECTOR(int_priority)
+{
+	dst->value = skb->priority;
+}
+
+META_COLLECTOR(int_protocol)
+{
+	/* Let userspace take care of the byte ordering */
+	dst->value = skb->protocol;
+}
+
+META_COLLECTOR(int_pkttype)
+{
+	dst->value = skb->pkt_type;
+}
+
+META_COLLECTOR(int_pktlen)
+{
+	dst->value = skb->len;
+}
+
+META_COLLECTOR(int_datalen)
+{
+	dst->value = skb->data_len;
+}
+
+META_COLLECTOR(int_maclen)
+{
+	dst->value = skb->mac_len;
+}
+
+META_COLLECTOR(int_rxhash)
+{
+	dst->value = skb_get_rxhash(skb);
+}
+
+/**************************************************************************
+ * Netfilter
+ **************************************************************************/
+
+/* Reports skb->mark; registered in the table under the NFMARK id. */
+META_COLLECTOR(int_mark)
+{
+	dst->value = skb->mark;
+}
+
+/**************************************************************************
+ * Traffic Control
+ **************************************************************************/
+
+META_COLLECTOR(int_tcindex)
+{
+	dst->value = skb->tc_index;
+}
+
+/**************************************************************************
+ * Routing
+ **************************************************************************/
+
+/* Routing class id of the packet's dst entry. Fails with *err = -1 when
+ * the skb has no dst entry. Without CONFIG_IP_ROUTE_CLASSID the value is
+ * always 0 (the preprocessor selects the else-branch statement). */
+META_COLLECTOR(int_rtclassid)
+{
+	if (unlikely(skb_dst(skb) == NULL))
+		*err = -1;
+	else
+#ifdef CONFIG_IP_ROUTE_CLASSID
+		dst->value = skb_dst(skb)->tclassid;
+#else
+		dst->value = 0;
+#endif
+}
+
+/* rt_iif of the packet's routing table entry; fails with *err = -1 when
+ * skb_rtable() returns NULL. */
+META_COLLECTOR(int_rtiif)
+{
+	if (unlikely(skb_rtable(skb) == NULL)) {
+		*err = -1;
+		return;
+	}
+
+	dst->value = skb_rtable(skb)->rt_iif;
+}
+
+/**************************************************************************
+ * Socket Attributes
+ **************************************************************************/
+
+/* Guard for the socket collectors: when the skb has no socket attached,
+ * set *err = -1 and return from the enclosing collector. Relies on an
+ * `err` variable being in scope and on the caller returning void. */
+#define SKIP_NONLOCAL(skb)			\
+	if (unlikely(skb->sk == NULL)) {	\
+		*err = -1;			\
+		return;				\
+	}
+
+/* Each collector below first bails out via SKIP_NONLOCAL() when the skb
+ * has no socket, then copies one field of skb->sk into dst->value. */
+META_COLLECTOR(int_sk_family)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_family;
+}
+
+META_COLLECTOR(int_sk_state)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_state;
+}
+
+META_COLLECTOR(int_sk_reuse)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_reuse;
+}
+
+META_COLLECTOR(int_sk_bound_if)
+{
+	SKIP_NONLOCAL(skb);
+	/* No error if bound_dev_if is 0, legal userspace check */
+	dst->value = skb->sk->sk_bound_dev_if;
+}
+
+/* Name of the device the socket is bound to. An unbound socket
+ * (sk_bound_dev_if == 0) yields the literal "any"; otherwise the device
+ * is looked up by index under rcu_read_lock() and *err is set to -1 if
+ * the lookup returns NULL.
+ *
+ * NOTE(review): var_dev() stores the address of dev->name, which is still
+ * referenced after rcu_read_unlock() -- confirm the device cannot go
+ * away before the comparison runs. */
+META_COLLECTOR(var_sk_bound_if)
+{
+	SKIP_NONLOCAL(skb);
+
+	if (skb->sk->sk_bound_dev_if == 0) {
+		dst->value = (unsigned long) "any";
+		dst->len = 3;
+	} else {
+		struct net_device *dev;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(sock_net(skb->sk),
+					   skb->sk->sk_bound_dev_if);
+		*err = var_dev(dev, dst);
+		rcu_read_unlock();
+	}
+}
+
+/* Remaining integer socket collectors. All of them bail out via
+ * SKIP_NONLOCAL() when the skb has no socket and otherwise copy one
+ * socket attribute into dst->value. */
+META_COLLECTOR(int_sk_refcnt)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = atomic_read(&skb->sk->sk_refcnt);
+}
+
+META_COLLECTOR(int_sk_rcvbuf)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_rcvbuf;
+}
+
+META_COLLECTOR(int_sk_shutdown)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_shutdown;
+}
+
+META_COLLECTOR(int_sk_proto)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_protocol;
+}
+
+META_COLLECTOR(int_sk_type)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_type;
+}
+
+META_COLLECTOR(int_sk_rmem_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = sk_rmem_alloc_get(skb->sk);
+}
+
+META_COLLECTOR(int_sk_wmem_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = sk_wmem_alloc_get(skb->sk);
+}
+
+META_COLLECTOR(int_sk_omem_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = atomic_read(&skb->sk->sk_omem_alloc);
+}
+
+META_COLLECTOR(int_sk_rcv_qlen)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_receive_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_snd_qlen)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_write_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_wmem_queued)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_wmem_queued;
+}
+
+META_COLLECTOR(int_sk_fwd_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_forward_alloc;
+}
+
+META_COLLECTOR(int_sk_sndbuf)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_sndbuf;
+}
+
+META_COLLECTOR(int_sk_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = (__force int) skb->sk->sk_allocation;
+}
+
+META_COLLECTOR(int_sk_route_caps)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_route_caps;
+}
+
+META_COLLECTOR(int_sk_hash)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_hash;
+}
+
+/* Reported divided by HZ (presumably jiffies converted to seconds) */
+META_COLLECTOR(int_sk_lingertime)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_lingertime / HZ;
+}
+
+META_COLLECTOR(int_sk_err_qlen)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_error_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_ack_bl)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_max_ack_bl)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_max_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_prio)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_priority;
+}
+
+META_COLLECTOR(int_sk_rcvlowat)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_rcvlowat;
+}
+
+/* Reported divided by HZ (presumably jiffies converted to seconds) */
+META_COLLECTOR(int_sk_rcvtimeo)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_rcvtimeo / HZ;
+}
+
+/* Reported divided by HZ (presumably jiffies converted to seconds) */
+META_COLLECTOR(int_sk_sndtimeo)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_sndtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sendmsg_off)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_sndmsg_off;
+}
+
+META_COLLECTOR(int_sk_write_pend)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_write_pending;
+}
+
+/**************************************************************************
+ * Meta value collectors assignment table
+ **************************************************************************/
+
+/* One entry of the collector table. A NULL .get means the combination of
+ * meta type and meta id has no collector. */
+struct meta_ops {
+	void		(*get)(struct sk_buff *, struct tcf_pkt_info *,
+			       struct meta_value *, struct meta_obj *, int *);
+};
+
+/* META_ID(X) expands to the TCF_META_ID_X constant; META_FUNC(x) builds
+ * a table entry whose collector is meta_x. */
+#define META_ID(name) TCF_META_ID_##name
+#define META_FUNC(name) { .get = meta_##name }
+
+/* Meta value operations table listing all meta value collectors and
+ * assigns them to a type and meta id. */
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
+	[TCF_META_TYPE_VAR] = {
+		[META_ID(DEV)]			= META_FUNC(var_dev),
+		[META_ID(SK_BOUND_IF)] 		= META_FUNC(var_sk_bound_if),
+	},
+	[TCF_META_TYPE_INT] = {
+		[META_ID(RANDOM)]		= META_FUNC(int_random),
+		[META_ID(LOADAVG_0)]		= META_FUNC(int_loadavg_0),
+		[META_ID(LOADAVG_1)]		= META_FUNC(int_loadavg_1),
+		[META_ID(LOADAVG_2)]		= META_FUNC(int_loadavg_2),
+		[META_ID(DEV)]			= META_FUNC(int_dev),
+		[META_ID(PRIORITY)]		= META_FUNC(int_priority),
+		[META_ID(PROTOCOL)]		= META_FUNC(int_protocol),
+		[META_ID(PKTTYPE)]		= META_FUNC(int_pkttype),
+		[META_ID(PKTLEN)]		= META_FUNC(int_pktlen),
+		[META_ID(DATALEN)]		= META_FUNC(int_datalen),
+		[META_ID(MACLEN)]		= META_FUNC(int_maclen),
+		[META_ID(NFMARK)]		= META_FUNC(int_mark),
+		[META_ID(TCINDEX)]		= META_FUNC(int_tcindex),
+		[META_ID(RTCLASSID)]		= META_FUNC(int_rtclassid),
+		[META_ID(RTIIF)]		= META_FUNC(int_rtiif),
+		[META_ID(SK_FAMILY)]		= META_FUNC(int_sk_family),
+		[META_ID(SK_STATE)]		= META_FUNC(int_sk_state),
+		[META_ID(SK_REUSE)]		= META_FUNC(int_sk_reuse),
+		[META_ID(SK_BOUND_IF)]		= META_FUNC(int_sk_bound_if),
+		[META_ID(SK_REFCNT)]		= META_FUNC(int_sk_refcnt),
+		[META_ID(SK_RCVBUF)]		= META_FUNC(int_sk_rcvbuf),
+		[META_ID(SK_SNDBUF)]		= META_FUNC(int_sk_sndbuf),
+		[META_ID(SK_SHUTDOWN)]		= META_FUNC(int_sk_shutdown),
+		[META_ID(SK_PROTO)]		= META_FUNC(int_sk_proto),
+		[META_ID(SK_TYPE)]		= META_FUNC(int_sk_type),
+		[META_ID(SK_RMEM_ALLOC)]	= META_FUNC(int_sk_rmem_alloc),
+		[META_ID(SK_WMEM_ALLOC)]	= META_FUNC(int_sk_wmem_alloc),
+		[META_ID(SK_OMEM_ALLOC)]	= META_FUNC(int_sk_omem_alloc),
+		[META_ID(SK_WMEM_QUEUED)]	= META_FUNC(int_sk_wmem_queued),
+		[META_ID(SK_RCV_QLEN)]		= META_FUNC(int_sk_rcv_qlen),
+		[META_ID(SK_SND_QLEN)]		= META_FUNC(int_sk_snd_qlen),
+		[META_ID(SK_ERR_QLEN)]		= META_FUNC(int_sk_err_qlen),
+		[META_ID(SK_FORWARD_ALLOCS)]	= META_FUNC(int_sk_fwd_alloc),
+		[META_ID(SK_ALLOCS)]		= META_FUNC(int_sk_alloc),
+		[META_ID(SK_ROUTE_CAPS)]	= META_FUNC(int_sk_route_caps),
+		[META_ID(SK_HASH)]		= META_FUNC(int_sk_hash),
+		[META_ID(SK_LINGERTIME)]	= META_FUNC(int_sk_lingertime),
+		[META_ID(SK_ACK_BACKLOG)]	= META_FUNC(int_sk_ack_bl),
+		[META_ID(SK_MAX_ACK_BACKLOG)]	= META_FUNC(int_sk_max_ack_bl),
+		[META_ID(SK_PRIO)]		= META_FUNC(int_sk_prio),
+		[META_ID(SK_RCVLOWAT)]		= META_FUNC(int_sk_rcvlowat),
+		[META_ID(SK_RCVTIMEO)]		= META_FUNC(int_sk_rcvtimeo),
+		[META_ID(SK_SNDTIMEO)]		= META_FUNC(int_sk_sndtimeo),
+		[META_ID(SK_SENDMSG_OFF)]	= META_FUNC(int_sk_sendmsg_off),
+		[META_ID(SK_WRITE_PENDING)]	= META_FUNC(int_sk_write_pend),
+		[META_ID(VLAN_TAG)]		= META_FUNC(int_vlan_tag),
+		[META_ID(RXHASH)]		= META_FUNC(int_rxhash),
+	}
+};
+
+/* Look up the collector table entry for val's meta type and meta id.
+ * No range check is done here; em_meta_change() validates both indices
+ * before a value is stored. */
+static inline struct meta_ops *meta_ops(struct meta_value *val)
+{
+	return &__meta_ops[meta_type(val)][meta_id(val)];
+}
+
+/**************************************************************************
+ * Type specific operations for TCF_META_TYPE_VAR
+ **************************************************************************/
+
+/* Compare two variable-length objects. Objects of different length are
+ * ordered by the length difference alone; equal-length objects are
+ * ordered by memcmp() over their data. Returns 0 when equal. */
+static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
+{
+	int len_diff = a->len - b->len;
+
+	if (len_diff != 0)
+		return len_diff;
+
+	return memcmp((void *) a->value, (void *) b->value, a->len);
+}
+
+/* Store a private copy of the attribute payload in dst: dst->val holds
+ * the kmemdup()ed buffer and dst->len its size.
+ * Returns 0 on success or -ENOMEM when the copy cannot be allocated. */
+static int meta_var_change(struct meta_value *dst, struct nlattr *nla)
+{
+	int len = nla_len(nla);
+	void *copy = kmemdup(nla_data(nla), len, GFP_KERNEL);
+
+	dst->val = (unsigned long) copy;
+	if (copy == NULL)
+		return -ENOMEM;
+
+	dst->len = len;
+	return 0;
+}
+
+/* Release the buffer stored in v->val by meta_var_change(). */
+static void meta_var_destroy(struct meta_value *v)
+{
+	kfree((void *) v->val);
+}
+
+/* Apply the configured shift to a collected variable-length object by
+ * shortening its length by `shift` bytes. Nothing happens when the shift
+ * is zero or not smaller than the current length. */
+static void meta_var_apply_extras(struct meta_value *v,
+				  struct meta_obj *dst)
+{
+	int shift = v->hdr.shift;
+
+	if (!shift || shift >= dst->len)
+		return;
+
+	dst->len -= shift;
+}
+
+/* Emit the stored buffer as attribute `tlv`; nothing is emitted when the
+ * value is empty. Returns 0 on success and -1 on failure (NLA_PUT is
+ * expected to jump to nla_put_failure when the attribute does not fit). */
+static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
+{
+	if (v->val && v->len)
+		NLA_PUT(skb, tlv, v->len, (void *) v->val);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/**************************************************************************
+ * Type specific operations for TCF_META_TYPE_INT
+ **************************************************************************/
+
+/* Three-way comparison of two integer objects: returns 0 when equal,
+ * -1 when a is smaller and 1 when a is larger. */
+static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
+{
+	/* Let gcc optimize it, the hint is not really based on
+	 * some numbers but jump free code for mismatches seems
+	 * more logical. */
+	if (likely(a->value != b->value))
+		return a->value < b->value ? -1 : 1;
+
+	return 0;
+}
+
+/* Read an integer from the attribute payload into dst. A payload of at
+ * least sizeof(unsigned long) bytes is read as an unsigned long; a
+ * payload of exactly sizeof(u32) bytes is read as a u32.
+ * Returns 0 on success or -EINVAL for any other payload size. */
+static int meta_int_change(struct meta_value *dst, struct nlattr *nla)
+{
+	if (nla_len(nla) >= sizeof(unsigned long)) {
+		dst->val = *(unsigned long *) nla_data(nla);
+		dst->len = sizeof(unsigned long);
+		return 0;
+	}
+
+	if (nla_len(nla) != sizeof(u32))
+		return -EINVAL;
+
+	dst->val = nla_get_u32(nla);
+	dst->len = sizeof(u32);
+	return 0;
+}
+
+/* Post-process a collected integer: shift it right by the configured
+ * amount, then AND it with v->val. A zero shift or a zero mask leaves
+ * the value untouched for that step. */
+static void meta_int_apply_extras(struct meta_value *v,
+				  struct meta_obj *dst)
+{
+	const unsigned int shift = v->hdr.shift;
+	const unsigned long mask = v->val;
+
+	if (shift)
+		dst->value >>= shift;
+
+	if (mask)
+		dst->value &= mask;
+}
+
+/* Emit the stored integer as attribute `tlv`, using the same width it
+ * was configured with (unsigned long or u32); other lengths emit
+ * nothing. Returns 0 on success and -1 on failure (the NLA_PUT macros
+ * are expected to jump to nla_put_failure when the attribute does not
+ * fit). */
+static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
+{
+	if (v->len == sizeof(unsigned long))
+		NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
+	else if (v->len == sizeof(u32))
+		NLA_PUT_U32(skb, tlv, v->val);
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/**************************************************************************
+ * Type specific operations table
+ **************************************************************************/
+
+/* Per-type operations. destroy and apply_extras are checked for NULL
+ * before use; compare, change and dump are called unconditionally. */
+struct meta_type_ops {
+	/* release resources held by a configured value */
+	void	(*destroy)(struct meta_value *);
+	/* three-way comparison of two collected objects */
+	int	(*compare)(struct meta_obj *, struct meta_obj *);
+	/* load the static value from a netlink attribute */
+	int	(*change)(struct meta_value *, struct nlattr *);
+	/* apply shift/mask style modifiers to a collected object */
+	void	(*apply_extras)(struct meta_value *, struct meta_obj *);
+	/* write the static value into a netlink message */
+	int	(*dump)(struct sk_buff *, struct meta_value *, int);
+};
+
+/* Type operations indexed by meta type. The integer type owns no
+ * allocated data and therefore has no destroy callback. */
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
+	[TCF_META_TYPE_VAR] = {
+		.destroy = meta_var_destroy,
+		.compare = meta_var_compare,
+		.change = meta_var_change,
+		.apply_extras = meta_var_apply_extras,
+		.dump = meta_var_dump
+	},
+	[TCF_META_TYPE_INT] = {
+		.compare = meta_int_compare,
+		.change = meta_int_change,
+		.apply_extras = meta_int_apply_extras,
+		.dump = meta_int_dump
+	}
+};
+
+/* Look up the type operations for v's meta type (no range check here). */
+static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
+{
+	return &__meta_type_ops[meta_type(v)];
+}
+
+/**************************************************************************
+ * Core
+ **************************************************************************/
+
+/* Build the runtime object for one operand. A static value
+ * (TCF_META_ID_VALUE) is copied straight from the definition; anything
+ * else is obtained from the matching collector and then run through the
+ * type's apply_extras hook, if it has one.
+ * Returns 0 on success or the negative error set by the collector. */
+static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
+		    struct meta_value *v, struct meta_obj *dst)
+{
+	int err = 0;
+
+	if (meta_id(v) != TCF_META_ID_VALUE) {
+		struct meta_type_ops *tops;
+
+		meta_ops(v)->get(skb, info, v, dst, &err);
+		if (err < 0)
+			return err;
+
+		tops = meta_type_ops(v);
+		if (tops->apply_extras)
+			tops->apply_extras(v, dst);
+
+		return 0;
+	}
+
+	dst->value = v->val;
+	dst->len = v->len;
+	return 0;
+}
+
+/* Match callback of the meta ematch: collect both operands, compare
+ * them with the left operand's type comparison and interpret the result
+ * according to the operator stored in the left header.
+ * Returns non-zero on a match; 0 on mismatch, on a collection failure,
+ * or for an unknown operator. */
+static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct meta_match *meta = (struct meta_match *) m->data;
+	struct meta_obj l_value, r_value;
+	int order;
+
+	if (meta_get(skb, info, &meta->lvalue, &l_value) < 0)
+		return 0;
+	if (meta_get(skb, info, &meta->rvalue, &r_value) < 0)
+		return 0;
+
+	order = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
+
+	if (meta->lvalue.hdr.op == TCF_EM_OPND_EQ)
+		return !order;
+	if (meta->lvalue.hdr.op == TCF_EM_OPND_LT)
+		return order < 0;
+	if (meta->lvalue.hdr.op == TCF_EM_OPND_GT)
+		return order > 0;
+
+	return 0;
+}
+
+/* Free a meta match: run the type's destroy hook (if any) on both
+ * operands, then free the match itself. A NULL meta is accepted. */
+static void meta_delete(struct meta_match *meta)
+{
+	if (meta != NULL) {
+		struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+		void (*destroy)(struct meta_value *) = ops ? ops->destroy : NULL;
+
+		if (destroy) {
+			destroy(&meta->lvalue);
+			destroy(&meta->rvalue);
+		}
+	}
+
+	kfree(meta);
+}
+
+/* Load the optional static value of an operand from its attribute.
+ * A missing attribute is fine (returns 0); an empty one is rejected with
+ * -EINVAL; otherwise the result of the type's change() is returned. */
+static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
+{
+	if (nla == NULL)
+		return 0;
+
+	if (nla_len(nla) == 0)
+		return -EINVAL;
+
+	return meta_type_ops(dst)->change(dst, nla);
+}
+
+/* A value is supported when its meta id is 0 (the id meta_get() treats
+ * as a static userspace value) or when a collector is registered for its
+ * type/id combination. */
+static inline int meta_is_supported(struct meta_value *val)
+{
+	return !meta_id(val) || meta_ops(val)->get;
+}
+
+/* Attribute policy used by nla_parse() in em_meta_change(): the header
+ * attribute is tied to the size of struct tcf_meta_hdr. */
+static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
+	[TCA_EM_META_HDR]	= { .len = sizeof(struct tcf_meta_hdr) },
+};
+
+/*
+ * Configure a meta ematch from the netlink payload in data/len.
+ *
+ * Parses the attributes, validates that both operands have the same,
+ * known meta type and in-range meta ids, allocates the private
+ * struct meta_match and loads the optional static values. On success
+ * the match is stored in m->data / m->datalen.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EINVAL      missing header or inconsistent/out-of-range kinds
+ *   -ENOMEM      the match could not be allocated
+ *   -EOPNOTSUPP  no collector exists for one of the operands
+ *   otherwise    the error reported by nla_parse() or by the type's
+ *                change() operation
+ * On failure everything allocated here is released again.
+ */
+static int em_meta_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	int err;
+	struct nlattr *tb[TCA_EM_META_MAX + 1];
+	struct tcf_meta_hdr *hdr;
+	struct meta_match *meta = NULL;
+
+	err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;
+	if (tb[TCA_EM_META_HDR] == NULL)
+		goto errout;
+	hdr = nla_data(tb[TCA_EM_META_HDR]);
+
+	/* Both sides must share one valid type; the ids index __meta_ops */
+	if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
+	    TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
+	    TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
+	    TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
+		goto errout;
+
+	meta = kzalloc(sizeof(*meta), GFP_KERNEL);
+	if (meta == NULL) {
+		/* Report the allocation failure as such, not as -EINVAL */
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
+	memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
+
+	if (!meta_is_supported(&meta->lvalue) ||
+	    !meta_is_supported(&meta->rvalue)) {
+		err = -EOPNOTSUPP;
+		goto errout;
+	}
+
+	/* Propagate the real error (e.g. -ENOMEM from the var type) */
+	err = meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]);
+	if (err < 0)
+		goto errout;
+
+	err = meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]);
+	if (err < 0)
+		goto errout;
+
+	m->datalen = sizeof(*meta);
+	m->data = (unsigned long) meta;
+
+	err = 0;
+errout:
+	if (err && meta)
+		meta_delete(meta);
+	return err;
+}
+
+/* Destroy callback: releases the private struct meta_match (including
+ * the match structure itself) via meta_delete(). */
+static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	if (m)
+		meta_delete((struct meta_match *) m->data);
+}
+
+/* Dump callback: writes the header rebuilt from both operand headers,
+ * followed by the static values of the left and right operand using the
+ * left operand's type dump operation.
+ * Returns 0 on success and -1 when the message cannot take the data. */
+static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
+{
+	struct meta_match *meta = (struct meta_match *) em->data;
+	struct tcf_meta_hdr hdr;
+	struct meta_type_ops *ops;
+
+	/* Zero first so no uninitialised stack bytes end up in the message */
+	memset(&hdr, 0, sizeof(hdr));
+	memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
+	memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
+
+	NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr);
+
+	ops = meta_type_ops(&meta->lvalue);
+	if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
+	    ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/* Ematch operations for TCF_EM_META. The private data is built and
+ * released by the change/destroy callbacks. */
+static struct tcf_ematch_ops em_meta_ops = {
+	.kind	  = TCF_EM_META,
+	.change	  = em_meta_change,
+	.match	  = em_meta_match,
+	.destroy  = em_meta_destroy,
+	.dump	  = em_meta_dump,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_meta_ops.link)
+};
+
+/* Module entry point: registers the meta ematch; returns the result of
+ * tcf_em_register(). */
+static int __init init_em_meta(void)
+{
+	return tcf_em_register(&em_meta_ops);
+}
+
+/* Module exit point: unregisters the meta ematch. */
+static void __exit exit_em_meta(void)
+{
+	tcf_em_unregister(&em_meta_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_meta);
+module_exit(exit_em_meta);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
new file mode 100644
index 00000000..a3bed07a
--- /dev/null
+++ b/net/sched/em_nbyte.c
@@ -0,0 +1,80 @@
+/*
+ * net/sched/em_nbyte.c	N-Byte ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_nbyte.h>
+#include <net/pkt_cls.h>
+
+/* In-kernel view of an nbyte match: the userspace header immediately
+ * followed by hdr.len bytes of pattern. */
+struct nbyte_data {
+	struct tcf_em_nbyte	hdr;
+	char			pattern[0];
+};
+
+/* Configure an nbyte match: validate that the payload holds the header
+ * plus the announced pattern length, then keep a private copy of exactly
+ * that many bytes in em->data.
+ * Returns 0 on success, -EINVAL for a short payload, or -ENOBUFS when
+ * the copy cannot be allocated. */
+static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len,
+			   struct tcf_ematch *em)
+{
+	struct tcf_em_nbyte *hdr = data;
+	void *copy;
+
+	if (data_len < sizeof(*hdr))
+		return -EINVAL;
+	if (data_len < (sizeof(*hdr) + hdr->len))
+		return -EINVAL;
+
+	em->datalen = sizeof(*hdr) + hdr->len;
+	copy = kmemdup(data, em->datalen, GFP_KERNEL);
+	em->data = (unsigned long) copy;
+	if (copy == NULL)
+		return -ENOBUFS;
+
+	return 0;
+}
+
+/* Match callback of the nbyte ematch: compares hdr.len bytes located
+ * hdr.off bytes past the base pointer of the configured layer against
+ * the stored pattern.
+ * Returns non-zero when the bytes are equal, 0 otherwise or when
+ * tcf_valid_offset() rejects the location. */
+static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
+			  struct tcf_pkt_info *info)
+{
+	struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
+	unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
+
+	ptr += nbyte->hdr.off;
+
+	if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
+		return 0;
+
+	/* ptr already includes hdr.off; adding it a second time would
+	 * compare the wrong bytes and read outside the validated range. */
+	return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
+}
+
+/* Ematch operations for TCF_EM_NBYTE. No destroy or dump callback is
+ * provided. */
+static struct tcf_ematch_ops em_nbyte_ops = {
+	.kind	  = TCF_EM_NBYTE,
+	.change	  = em_nbyte_change,
+	.match	  = em_nbyte_match,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_nbyte_ops.link)
+};
+
+/* Module entry point: registers the nbyte ematch; returns the result of
+ * tcf_em_register(). */
+static int __init init_em_nbyte(void)
+{
+	return tcf_em_register(&em_nbyte_ops);
+}
+
+/* Module exit point: unregisters the nbyte ematch. */
+static void __exit exit_em_nbyte(void)
+{
+	tcf_em_unregister(&em_nbyte_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_nbyte);
+module_exit(exit_em_nbyte);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 00000000..15d353d2
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,158 @@
+/*
+ * net/sched/em_text.c	Textsearch ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <net/pkt_cls.h>
+
+/* Private data of a text ematch: the search window (a layer plus an
+ * offset for each end) and the prepared textsearch configuration. */
+struct text_match {
+	u16			from_offset;
+	u16			to_offset;
+	u8			from_layer;
+	u8			to_layer;
+	struct ts_config	*config;
+};
+
+/* Access the struct text_match stored in an ematch's data field. */
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+/* Match callback of the text ematch: translate both window ends into
+ * offsets relative to skb->data and search the pattern between them.
+ * Returns non-zero when skb_find_text() reports a hit (anything other
+ * than UINT_MAX). */
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	unsigned char *from_base = tcf_get_base_ptr(skb, tm->from_layer);
+	unsigned char *to_base = tcf_get_base_ptr(skb, tm->to_layer);
+	struct ts_state state;
+	int from = from_base - skb->data;
+	int to = to_base - skb->data;
+
+	from += tm->from_offset;
+	to += tm->to_offset;
+
+	return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
+}
+
+/*
+ * Configure a text ematch from the userspace struct tcf_em_text followed
+ * by conf->pattern_len bytes of pattern.
+ *
+ * Returns 0 on success, -EINVAL for a short payload or an inverted
+ * search window, the error from textsearch_prepare(), -EAGAIN after an
+ * algorithm had to be autoloaded, or -ENOBUFS when the private data
+ * cannot be allocated.
+ */
+static int em_text_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	struct text_match *tm;
+	struct tcf_em_text *conf = data;
+	struct ts_config *ts_conf;
+	int flags = 0;
+
+	if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+		return -EINVAL;
+
+	/* The window must not end before it starts */
+	if (conf->from_layer > conf->to_layer)
+		return -EINVAL;
+
+	if (conf->from_layer == conf->to_layer &&
+	    conf->from_offset > conf->to_offset)
+		return -EINVAL;
+
+retry:
+	/* First pass runs without TS_AUTOLOAD; the second pass (after
+	 * -ENOENT) runs with it and with the rtnl lock dropped. */
+	ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+				     conf->pattern_len, GFP_KERNEL, flags);
+
+	/* Re-take the rtnl lock released before the autoload retry */
+	if (flags & TS_AUTOLOAD)
+		rtnl_lock();
+
+	if (IS_ERR(ts_conf)) {
+		if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+			/* presumably the caller holds rtnl -- TODO confirm */
+			rtnl_unlock();
+			flags |= TS_AUTOLOAD;
+			goto retry;
+		} else
+			return PTR_ERR(ts_conf);
+	} else if (flags & TS_AUTOLOAD) {
+		/* The lock was dropped in between: discard the result and
+		 * return -EAGAIN, presumably so the request is replayed. */
+		textsearch_destroy(ts_conf);
+		return -EAGAIN;
+	}
+
+	tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+	if (tm == NULL) {
+		textsearch_destroy(ts_conf);
+		return -ENOBUFS;
+	}
+
+	tm->from_offset = conf->from_offset;
+	tm->to_offset   = conf->to_offset;
+	tm->from_layer  = conf->from_layer;
+	tm->to_layer    = conf->to_layer;
+	tm->config      = ts_conf;
+
+	m->datalen = sizeof(*tm);
+	m->data = (unsigned long) tm;
+
+	return 0;
+}
+
+/* Destroy callback: releases the textsearch configuration and the
+ * struct text_match allocated by em_text_change(). Because this ematch
+ * provides its own destroy callback it has to free its private data
+ * itself, as em_meta_destroy() does; otherwise the structure is leaked
+ * every time a text match is removed. */
+static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+
+	if (tm == NULL)
+		return;
+
+	if (tm->config)
+		textsearch_destroy(tm->config);
+
+	kfree(tm);
+}
+
+/* Dump callback: writes a struct tcf_em_text rebuilt from the private
+ * data, followed by the raw pattern bytes.
+ * Returns 0 on success and -1 when the message cannot take the data. */
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	struct tcf_em_text conf;
+
+	/* conf is copied to userspace as a whole. strncpy() below writes
+	 * only sizeof(conf.algo) - 1 bytes, so without clearing the
+	 * structure first the last algo byte would be uninitialised stack
+	 * data (and the name possibly unterminated). Zeroing also covers
+	 * the pad field. */
+	memset(&conf, 0, sizeof(conf));
+
+	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+	conf.from_offset = tm->from_offset;
+	conf.to_offset = tm->to_offset;
+	conf.from_layer = tm->from_layer;
+	conf.to_layer = tm->to_layer;
+	conf.pattern_len = textsearch_get_pattern_len(tm->config);
+
+	if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0)
+		goto nla_put_failure;
+	if (nla_append(skb, conf.pattern_len,
+		       textsearch_get_pattern(tm->config)) < 0)
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+/* Ematch operations for TCF_EM_TEXT. */
+static struct tcf_ematch_ops em_text_ops = {
+	.kind	  = TCF_EM_TEXT,
+	.change	  = em_text_change,
+	.match	  = em_text_match,
+	.destroy  = em_text_destroy,
+	.dump	  = em_text_dump,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_text_ops.link)
+};
+
+/* Module entry point: registers the text ematch; returns the result of
+ * tcf_em_register(). */
+static int __init init_em_text(void)
+{
+	return tcf_em_register(&em_text_ops);
+}
+
+/* Module exit point: unregisters the text ematch. */
+static void __exit exit_em_text(void)
+{
+	tcf_em_unregister(&em_text_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_text);
+module_exit(exit_em_text);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
new file mode 100644
index 00000000..797bdb88
--- /dev/null
+++ b/net/sched/em_u32.c
@@ -0,0 +1,64 @@
+/*
+ * net/sched/em_u32.c	U32 Ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Based on net/sched/cls_u32.c
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+/*
+ * Compare one 32-bit word of the packet against a masked value.
+ *
+ * The word is read at key->off from the network header, or from info->ptr
+ * when the classifier supplies one; with @info present the position is
+ * additionally advanced by (info->nexthdr & key->offmask).
+ * Returns 1 on a match, 0 on a mismatch or when the word would lie outside
+ * the range accepted by tcf_valid_offset().
+ */
+static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
+			struct tcf_pkt_info *info)
+{
+	const struct tc_u32_key *k = (struct tc_u32_key *) em->data;
+	const unsigned char *p;
+
+	if (info && info->ptr)
+		p = info->ptr;
+	else
+		p = skb_network_header(skb);
+
+	if (info)
+		p += (info->nexthdr & k->offmask);
+	p += k->off;
+
+	if (!tcf_valid_offset(skb, p, sizeof(u32)))
+		return 0;
+
+	if (((*(__be32 *) p) ^ k->val) & k->mask)
+		return 0;
+	return 1;
+}
+
+/* Callback table registered with the ematch core for kind TCF_EM_U32.
+ * No .change/.destroy/.dump callbacks are provided, so the ematch core's
+ * fallbacks handle the struct tc_u32_key payload announced via .datalen.
+ */
+static struct tcf_ematch_ops em_u32_ops = {
+	.kind	  = TCF_EM_U32,
+	.datalen  = sizeof(struct tc_u32_key),
+	.match	  = em_u32_match,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_u32_ops.link)
+};
+
+/* Module load: register the u32 ematch; returns tcf_em_register()'s result. */
+static int __init init_em_u32(void)
+{
+	return tcf_em_register(&em_u32_ops);
+}
+
+/* Module unload: remove the u32 ematch from the list of known ematches. */
+static void __exit exit_em_u32(void)
+{
+	tcf_em_unregister(&em_u32_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_u32);
+module_exit(exit_em_u32);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
new file mode 100644
index 00000000..88d93eb9
--- /dev/null
+++ b/net/sched/ematch.c
@@ -0,0 +1,543 @@
+/*
+ * net/sched/ematch.c		Extended Match API
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ * An extended match (ematch) is a small classification tool not worth
+ * writing a full classifier for. Ematches can be interconnected to form
+ * a logic expression and get attached to classifiers to extend their
+ * functionality.
+ *
+ * The userspace part transforms the logic expressions into an array
+ * consisting of multiple sequences of interconnected ematches separated
+ * by markers. Precedence is implemented by a special ematch kind
+ * referencing a sequence beyond the marker of the current sequence
+ * causing the current position in the sequence to be pushed onto a stack
+ * to allow the current position to be overwritten by the position referenced
+ * in the special ematch. Matching continues in the new sequence until a
+ * marker is reached causing the position to be restored from the stack.
+ *
+ * Example:
+ *          A AND (B1 OR B2) AND C AND D
+ *
+ *              ------->-PUSH-------
+ *    -->--    /         -->--      \   -->--
+ *   /     \  /         /     \      \ /     \
+ * +-------+-------+-------+-------+-------+--------+
+ * | A AND | B AND | C AND | D END | B1 OR | B2 END |
+ * +-------+-------+-------+-------+-------+--------+
+ *                    \                      /
+ *                     --------<-POP---------
+ *
+ * where B is a virtual ematch referencing to sequence starting with B1.
+ *
+ * ==========================================================================
+ *
+ * How to write an ematch in 60 seconds
+ * ------------------------------------
+ *
+ *   1) Provide a matcher function:
+ *      static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
+ *                          struct tcf_pkt_info *info)
+ *      {
+ *      	struct mydata *d = (struct mydata *) m->data;
+ *
+ *      	if (...matching goes here...)
+ *      		return 1;
+ *      	else
+ *      		return 0;
+ *      }
+ *
+ *   2) Fill out a struct tcf_ematch_ops:
+ *      static struct tcf_ematch_ops my_ops = {
+ *      	.kind = unique id,
+ *      	.datalen = sizeof(struct mydata),
+ *      	.match = my_match,
+ *      	.owner = THIS_MODULE,
+ *      };
+ *
+ *   3) Register/Unregister your ematch:
+ *      static int __init init_my_ematch(void)
+ *      {
+ *      	return tcf_em_register(&my_ops);
+ *      }
+ *
+ *      static void __exit exit_my_ematch(void)
+ *      {
+ *      	tcf_em_unregister(&my_ops);
+ *      }
+ *
+ *      module_init(init_my_ematch);
+ *      module_exit(exit_my_ematch);
+ *
+ *   4) By now you should have two more seconds left, barely enough to
+ *      open up a beer to watch the compilation going.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+/* List of all registered ematch kinds, linked through tcf_ematch_ops.link. */
+static LIST_HEAD(ematch_ops);
+/* Protects ematch_ops: readers look kinds up, writers (un)register them. */
+static DEFINE_RWLOCK(ematch_mod_lock);
+
+/*
+ * Find the registered ematch ops for @kind and take a reference on the
+ * owning module. Returns NULL if the kind is unknown or the module
+ * reference could not be obtained.
+ */
+static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
+{
+	struct tcf_ematch_ops *pos, *found = NULL;
+
+	read_lock(&ematch_mod_lock);
+	list_for_each_entry(pos, &ematch_ops, link) {
+		if (pos->kind != kind)
+			continue;
+		if (try_module_get(pos->owner))
+			found = pos;
+		break;
+	}
+	read_unlock(&ematch_mod_lock);
+
+	return found;
+}
+
+/**
+ * tcf_em_register - register an extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * Every ematch calls this to make itself known. @ops must carry a unique
+ * kind and a match() callback; the remaining callbacks are optional and
+ * the ematch core falls back to generic handling when they are absent.
+ *
+ * Returns 0 on success, -EINVAL if @ops has no match() callback and
+ * -EEXIST if an ematch of the same kind has already been registered.
+ */
+int tcf_em_register(struct tcf_ematch_ops *ops)
+{
+	struct tcf_ematch_ops *cur;
+	int ret = 0;
+
+	if (!ops->match)
+		return -EINVAL;
+
+	write_lock(&ematch_mod_lock);
+	list_for_each_entry(cur, &ematch_ops, link) {
+		if (cur->kind == ops->kind) {
+			ret = -EEXIST;
+			break;
+		}
+	}
+	if (!ret)
+		list_add_tail(&ops->link, &ematch_ops);
+	write_unlock(&ematch_mod_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(tcf_em_register);
+
+/**
+ * tcf_em_unregister - unregister an extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their disappearance,
+ * for example when the module gets unloaded. The @ops parameter must be
+ * the same as the one used for registration.
+ *
+ * The entry is unlinked unconditionally under the write lock; nothing is
+ * returned and no check is made that @ops was actually registered.
+ */
+void tcf_em_unregister(struct tcf_ematch_ops *ops)
+{
+	write_lock(&ematch_mod_lock);
+	list_del(&ops->link);
+	write_unlock(&ematch_mod_lock);
+}
+EXPORT_SYMBOL(tcf_em_unregister);
+
+/* Return a pointer to slot @index of @tree's match array; @index is not
+ * range-checked here.
+ */
+static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
+						  int index)
+{
+	return &tree->matches[index];
+}
+
+
+/*
+ * Validate a single ematch description and fill in @em.
+ *
+ * @tp: classifier handle, passed through to the ematch change() callback
+ * @tree_hdr: header of the tree being built, used to bound container refs
+ * @em: destination slot in the tree's match array
+ * @nla: attribute holding a struct tcf_ematch_hdr followed by the payload
+ * @idx: position of @em inside the tree
+ *
+ * Returns 0 on success or a negative error code. -EAGAIN means the ematch
+ * module was loaded after dropping the RTNL mutex and the caller has to
+ * replay the request. On failure with em->ops still set, the module
+ * reference is released later by tcf_em_tree_destroy().
+ */
+static int tcf_em_validate(struct tcf_proto *tp,
+			   struct tcf_ematch_tree_hdr *tree_hdr,
+			   struct tcf_ematch *em, struct nlattr *nla, int idx)
+{
+	int err = -EINVAL;
+	struct tcf_ematch_hdr *em_hdr = nla_data(nla);
+	int data_len = nla_len(nla) - sizeof(*em_hdr);
+	void *data = (void *) em_hdr + sizeof(*em_hdr);
+
+	if (!TCF_EM_REL_VALID(em_hdr->flags))
+		goto errout;
+
+	if (em_hdr->kind == TCF_EM_CONTAINER) {
+		/* Special ematch called "container", carries an index
+		 * referencing an external ematch sequence.
+		 */
+		u32 ref;
+
+		if (data_len < sizeof(ref))
+			goto errout;
+		ref = *(u32 *) data;
+
+		if (ref >= tree_hdr->nmatches)
+			goto errout;
+
+		/* We do not allow backward jumps to avoid loops and jumps
+		 * to our own position are of course illegal.
+		 */
+		if (ref <= idx)
+			goto errout;
+
+
+		em->data = ref;
+	} else {
+		/* Note: This lookup will increase the module refcnt
+		 * of the ematch module referenced. In case of a failure,
+		 * a destroy function is called by the underlying layer
+		 * which automatically releases the reference again, therefore
+		 * the module MUST not be given back under any circumstances
+		 * here. Be aware, the destroy function assumes that the
+		 * module is held if the ops field is non zero.
+		 */
+		em->ops = tcf_em_lookup(em_hdr->kind);
+
+		if (em->ops == NULL) {
+			err = -ENOENT;
+#ifdef CONFIG_MODULES
+			__rtnl_unlock();
+			request_module("ematch-kind-%u", em_hdr->kind);
+			rtnl_lock();
+			em->ops = tcf_em_lookup(em_hdr->kind);
+			if (em->ops) {
+				/* We dropped the RTNL mutex in order to
+				 * perform the module load. Tell the caller
+				 * to replay the request.
+				 *
+				 * The reference is given back right here, so
+				 * ops must be cleared as well: the destroy
+				 * path puts the module for every match whose
+				 * ops field is non zero and would otherwise
+				 * drop the reference a second time.
+				 */
+				module_put(em->ops->owner);
+				em->ops = NULL;
+				err = -EAGAIN;
+			}
+#endif
+			goto errout;
+		}
+
+		/* ematch module provides expected length of data, so we
+		 * can do a basic sanity check.
+		 */
+		if (em->ops->datalen && data_len < em->ops->datalen)
+			goto errout;
+
+		if (em->ops->change) {
+			err = em->ops->change(tp, data, data_len, em);
+			if (err < 0)
+				goto errout;
+		} else if (data_len > 0) {
+			/* ematch module doesn't provide an own change
+			 * procedure and expects us to allocate and copy
+			 * the ematch data.
+			 *
+			 * TCF_EM_SIMPLE may be specified stating that the
+			 * data only consists of a u32 integer and the module
+			 * does not expected a memory reference but rather
+			 * the value carried.
+			 */
+			if (em_hdr->flags & TCF_EM_SIMPLE) {
+				/* A module announcing a datalen expects
+				 * em->data to point at a buffer of that size.
+				 * Storing the user supplied integer there
+				 * would make it dereference an arbitrary
+				 * address, so refuse the combination.
+				 */
+				if (em->ops->datalen > 0)
+					goto errout;
+				if (data_len < sizeof(u32))
+					goto errout;
+				em->data = *(u32 *) data;
+			} else {
+				void *v = kmemdup(data, data_len, GFP_KERNEL);
+				if (v == NULL) {
+					err = -ENOBUFS;
+					goto errout;
+				}
+				em->data = (unsigned long) v;
+			}
+		}
+	}
+
+	em->matchid = em_hdr->matchid;
+	em->flags = em_hdr->flags;
+	em->datalen = data_len;
+
+	err = 0;
+errout:
+	return err;
+}
+
+/* Netlink policy for the top level of an ematch tree: a header with a
+ * length of sizeof(struct tcf_ematch_tree_hdr) and a nested match list.
+ */
+static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = {
+	[TCA_EMATCH_TREE_HDR]	= { .len = sizeof(struct tcf_ematch_tree_hdr) },
+	[TCA_EMATCH_TREE_LIST]	= { .type = NLA_NESTED },
+};
+
+/**
+ * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
+ *
+ * @tp: classifier kind handle
+ * @nla: ematch tree configuration TLV
+ * @tree: destination ematch tree variable to store the resulting
+ *        ematch tree.
+ *
+ * This function validates the given configuration TLV @nla and builds an
+ * ematch tree in @tree. The resulting tree must later be copied into
+ * the private classifier data using tcf_em_tree_change(). You MUST NOT
+ * provide the ematch tree variable of the private classifier data directly,
+ * the changes would not be locked properly.
+ *
+ * @tree is zeroed first; a NULL @nla yields an empty tree and success.
+ *
+ * Returns a negative error code if the configuration TLV contains errors,
+ * or -ENOBUFS if the match array cannot be allocated. If a failure occurs
+ * after the match array has been allocated, the partially built tree is
+ * destroyed again before returning.
+ */
+int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
+			 struct tcf_ematch_tree *tree)
+{
+	int idx, list_len, matches_len, err;
+	struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1];
+	struct nlattr *rt_match, *rt_hdr, *rt_list;
+	struct tcf_ematch_tree_hdr *tree_hdr;
+	struct tcf_ematch *em;
+
+	memset(tree, 0, sizeof(*tree));
+	if (!nla)
+		return 0;
+
+	err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;
+	rt_hdr = tb[TCA_EMATCH_TREE_HDR];
+	rt_list = tb[TCA_EMATCH_TREE_LIST];
+
+	if (rt_hdr == NULL || rt_list == NULL)
+		goto errout;
+
+	tree_hdr = nla_data(rt_hdr);
+	memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
+
+	rt_match = nla_data(rt_list);
+	list_len = nla_len(rt_list);
+	matches_len = tree_hdr->nmatches * sizeof(*em);
+
+	tree->matches = kzalloc(matches_len, GFP_KERNEL);
+	if (tree->matches == NULL) {
+		/* Out of memory is not a configuration error; report it
+		 * the same way tcf_em_validate() reports a failed copy.
+		 */
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	/* We do not use nla_parse_nested here because the maximum
+	 * number of attributes is unknown. This saves us the allocation
+	 * for a tb buffer which would serve no purpose at all.
+	 *
+	 * The array of rt attributes is parsed in the order as they are
+	 * provided, their type must be incremental from 1 to n. Even
+	 * if it does not serve any real purpose, a failure of sticking
+	 * to this policy will result in parsing failure.
+	 */
+	for (idx = 0; nla_ok(rt_match, list_len); idx++) {
+		err = -EINVAL;
+
+		if (rt_match->nla_type != (idx + 1))
+			goto errout_abort;
+
+		if (idx >= tree_hdr->nmatches)
+			goto errout_abort;
+
+		if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr))
+			goto errout_abort;
+
+		em = tcf_em_get_match(tree, idx);
+
+		err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
+		if (err < 0)
+			goto errout_abort;
+
+		rt_match = nla_next(rt_match, &list_len);
+	}
+
+	/* Check if the number of matches provided by userspace actually
+	 * complies with the array of matches. The number was used for
+	 * the validation of references and a mismatch could lead to
+	 * undefined references during the matching process.
+	 */
+	if (idx != tree_hdr->nmatches) {
+		err = -EINVAL;
+		goto errout_abort;
+	}
+
+	err = 0;
+errout:
+	return err;
+
+errout_abort:
+	tcf_em_tree_destroy(tp, tree);
+	return err;
+}
+EXPORT_SYMBOL(tcf_em_tree_validate);
+
+/**
+ * tcf_em_tree_destroy - destroy an ematch tree
+ *
+ * @tp: classifier kind handle
+ * @tree: ematch tree to be deleted
+ *
+ * Tears down a tree built by tcf_em_tree_validate()/tcf_em_tree_change().
+ * For every match that holds ops, the ematch's own destroy() callback is
+ * invoked if there is one; otherwise non-simple match data is kfree()d.
+ * The module reference of each such match is dropped afterwards. The
+ * caller must make sure the tree is no longer in use.
+ */
+void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree)
+{
+	int idx;
+
+	if (!tree->matches)
+		return;
+
+	for (idx = 0; idx < tree->hdr.nmatches; idx++) {
+		struct tcf_ematch *m = tcf_em_get_match(tree, idx);
+
+		/* Containers and unused slots carry no ops. */
+		if (m->ops == NULL)
+			continue;
+
+		if (m->ops->destroy != NULL)
+			m->ops->destroy(tp, m);
+		else if (!tcf_em_is_simple(m))
+			kfree((void *) m->data);
+
+		module_put(m->ops->owner);
+	}
+
+	kfree(tree->matches);
+	tree->matches = NULL;
+	tree->hdr.nmatches = 0;
+}
+EXPORT_SYMBOL(tcf_em_tree_destroy);
+
+/**
+ * tcf_em_tree_dump - dump ematch tree into a rtnl message
+ *
+ * @skb: skb holding the rtnl message
+ * @tree: ematch tree to be dumped
+ * @tlv: TLV type to be used to encapsulate the tree
+ *
+ * This function dumps a ematch tree into a rtnl message. It is valid to
+ * call this function while the ematch tree is in use.
+ *
+ * Each match is emitted as an attribute of type (index + 1) consisting of
+ * a struct tcf_ematch_hdr followed by the match payload; the attribute
+ * length is patched once the payload has been appended.
+ *
+ * Returns -1 if the skb tailroom is insufficient.
+ */
+int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
+{
+	int i;
+	u8 *tail;
+	struct nlattr *top_start;
+	struct nlattr *list_start;
+
+	top_start = nla_nest_start(skb, tlv);
+	if (top_start == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
+
+	list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
+	if (list_start == NULL)
+		goto nla_put_failure;
+
+	tail = skb_tail_pointer(skb);
+	for (i = 0; i < tree->hdr.nmatches; i++) {
+		struct nlattr *match_start = (struct nlattr *)tail;
+		struct tcf_ematch *em = tcf_em_get_match(tree, i);
+		struct tcf_ematch_hdr em_hdr = {
+			.kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
+			.matchid = em->matchid,
+			.flags = em->flags
+		};
+
+		NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr);
+
+		/* Every payload write may run out of tailroom. Failing the
+		 * dump is required here, otherwise a match without its
+		 * payload would be reported to userspace as complete.
+		 */
+		if (em->ops && em->ops->dump) {
+			if (em->ops->dump(skb, em) < 0)
+				goto nla_put_failure;
+		} else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
+			u32 u = em->data;
+
+			if (nla_put_nohdr(skb, sizeof(u), &u) < 0)
+				goto nla_put_failure;
+		} else if (em->datalen > 0) {
+			if (nla_put_nohdr(skb, em->datalen,
+					  (void *) em->data) < 0)
+				goto nla_put_failure;
+		}
+
+		tail = skb_tail_pointer(skb);
+		match_start->nla_len = tail - (u8 *)match_start;
+	}
+
+	nla_nest_end(skb, list_start);
+	nla_nest_end(skb, top_start);
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+EXPORT_SYMBOL(tcf_em_tree_dump);
+
+/* Run a single ematch and negate its result if the match is inverted. */
+static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
+			       struct tcf_pkt_info *info)
+{
+	int matched = em->ops->match(skb, em, info);
+
+	if (tcf_em_is_inverted(em))
+		return !matched;
+	return matched;
+}
+
+/* Do not use this function directly, use tcf_em_tree_match instead
+ *
+ * Evaluates the ematch tree against @skb. Container matches push the
+ * current index onto a local stack and continue at the referenced index;
+ * when a sequence ends the index is popped again. Returns the result of
+ * the last evaluated match (0 if none was evaluated), or -1 if containers
+ * nest deeper than CONFIG_NET_EMATCH_STACK.
+ */
+int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
+			struct tcf_pkt_info *info)
+{
+	int stackp = 0, match_idx = 0, res = 0;
+	struct tcf_ematch *cur_match;
+	int stack[CONFIG_NET_EMATCH_STACK];
+
+proceed:
+	while (match_idx < tree->hdr.nmatches) {
+		cur_match = tcf_em_get_match(tree, match_idx);
+
+		if (tcf_em_is_container(cur_match)) {
+			if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
+				goto stack_overflow;
+
+			/* Remember where to resume, then jump into the
+			 * sequence the container refers to.
+			 */
+			stack[stackp++] = match_idx;
+			match_idx = cur_match->data;
+			goto proceed;
+		}
+
+		res = tcf_em_match(skb, cur_match, info);
+
+		/* Stop this sequence as soon as its outcome is decided. */
+		if (tcf_em_early_end(cur_match, res))
+			break;
+
+		match_idx++;
+	}
+
+pop_stack:
+	if (stackp > 0) {
+		/* Return to the container that started this sequence; res
+		 * now stands for the container's own result.
+		 */
+		match_idx = stack[--stackp];
+		cur_match = tcf_em_get_match(tree, match_idx);
+
+		if (tcf_em_early_end(cur_match, res))
+			goto pop_stack;
+		else {
+			match_idx++;
+			goto proceed;
+		}
+	}
+
+	return res;
+
+stack_overflow:
+	if (net_ratelimit())
+		pr_warning("tc ematch: local stack overflow,"
+			   " increase NET_EMATCH_STACK\n");
+	return -1;
+}
+EXPORT_SYMBOL(__tcf_em_tree_match);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
new file mode 100644
index 00000000..6b862766
--- /dev/null
+++ b/net/sched/sch_api.c
@@ -0,0 +1,1805 @@
+/*
+ * net/sched/sch_api.c	Packet scheduler API.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/hrtimer.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+			struct nlmsghdr *n, u32 clid,
+			struct Qdisc *old, struct Qdisc *new);
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+			 struct nlmsghdr *n, struct Qdisc *q,
+			 unsigned long cl, int event);
+
+/*
+
+   Short review.
+   -------------
+
+   This file consists of two interrelated parts:
+
+   1. queueing disciplines manager frontend.
+   2. traffic classes manager frontend.
+
+   Generally, queueing discipline ("qdisc") is a black box,
+   which is able to enqueue packets and to dequeue them (when
+   device is ready to send something) in order and at times
+   determined by algorithm hidden in it.
+
+   qdisc's are divided to two categories:
+   - "queues", which have no internal structure visible from outside.
+   - "schedulers", which split all the packets to "traffic classes",
+     using "packet classifiers" (look at cls_api.c)
+
+   In turn, classes may have child qdiscs (as rule, queues)
+   attached to them etc. etc. etc.
+
+   The goal of the routines in this file is to translate
+   information supplied by user in the form of handles
+   to more intelligible for kernel form, to make some sanity
+   checks and part of work, which is common to all qdiscs
+   and to provide rtnetlink notifications.
+
+   All real intelligent work is done inside qdisc modules.
+
+
+
+   Every discipline has two major routines: enqueue and dequeue.
+
+   ---dequeue
+
+   dequeue usually returns a skb to send. It is allowed to return NULL,
+   but it does not mean that queue is empty, it just means that
+   discipline does not want to send anything this time.
+   Queue is really empty if q->q.qlen == 0.
+   For complicated disciplines with multiple queues q->q is not
+   real packet queue, but however q->q.qlen must be valid.
+
+   ---enqueue
+
+   enqueue returns 0, if packet was enqueued successfully.
+   If packet (this one or another one) was dropped, it returns
+   not zero error code.
+   NET_XMIT_DROP 	- this packet dropped
+     Expected action: do not backoff, but wait until queue will clear.
+   NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
+     Expected action: backoff or ignore
+   NET_XMIT_POLICED	- dropped by police.
+     Expected action: backoff or error to real-time apps.
+
+   Auxiliary routines:
+
+   ---peek
+
+   like dequeue but without removing a packet from the queue
+
+   ---reset
+
+   returns qdisc to initial state: purge all buffers, clear all
+   timers, counters (except for statistics) etc.
+
+   ---init
+
+   initializes newly created qdisc.
+
+   ---destroy
+
+   destroys resources allocated by init and during lifetime of qdisc.
+
+   ---change
+
+   changes qdisc parameters.
+ */
+
+/* Protects list of registered TC modules. It is pure SMP lock. */
+static DEFINE_RWLOCK(qdisc_mod_lock);
+
+
+/************************************************
+ *	Queueing disciplines manipulation.	*
+ ************************************************/
+
+
+/* The list of all installed queueing disciplines. */
+
+static struct Qdisc_ops *qdisc_base;
+
+/* Register/unregister queueing discipline */
+
+/*
+ * Add @qops to the end of the list of known queueing disciplines.
+ *
+ * Missing enqueue/dequeue/peek callbacks are filled in from noop_qdisc_ops,
+ * except that a discipline providing dequeue must also provide peek.
+ * Returns 0 on success, -EEXIST if a discipline with the same id is already
+ * registered, -EINVAL if the callback set is incomplete.
+ */
+int register_qdisc(struct Qdisc_ops *qops)
+{
+	struct Qdisc_ops *q, **qp;
+	int rc = -EEXIST;
+
+	write_lock(&qdisc_mod_lock);
+	/* Walk to the tail; qp ends up pointing at the terminating link. */
+	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+		if (!strcmp(qops->id, q->id))
+			goto out;
+
+	if (qops->enqueue == NULL)
+		qops->enqueue = noop_qdisc_ops.enqueue;
+	if (qops->peek == NULL) {
+		if (qops->dequeue == NULL)
+			qops->peek = noop_qdisc_ops.peek;
+		else
+			goto out_einval;
+	}
+	if (qops->dequeue == NULL)
+		qops->dequeue = noop_qdisc_ops.dequeue;
+
+	if (qops->cl_ops) {
+		const struct Qdisc_class_ops *cops = qops->cl_ops;
+
+		/* Classful disciplines must offer the basic class accessors. */
+		if (!(cops->get && cops->put && cops->walk && cops->leaf))
+			goto out_einval;
+
+		/* Filter chains require both bind and unbind. */
+		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
+			goto out_einval;
+	}
+
+	qops->next = NULL;
+	*qp = qops;
+	rc = 0;
+out:
+	write_unlock(&qdisc_mod_lock);
+	return rc;
+
+out_einval:
+	rc = -EINVAL;
+	goto out;
+}
+EXPORT_SYMBOL(register_qdisc);
+
+/*
+ * Remove @qops from the list of known queueing disciplines.
+ * Returns 0 on success or -ENOENT if @qops is not on the list.
+ */
+int unregister_qdisc(struct Qdisc_ops *qops)
+{
+	struct Qdisc_ops **link;
+	int ret = -ENOENT;
+
+	write_lock(&qdisc_mod_lock);
+	for (link = &qdisc_base; *link != NULL; link = &(*link)->next) {
+		if (*link != qops)
+			continue;
+
+		/* Unlink the entry by redirecting the pointer that led to it. */
+		*link = qops->next;
+		qops->next = NULL;
+		ret = 0;
+		break;
+	}
+	write_unlock(&qdisc_mod_lock);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_qdisc);
+
+/* We know the handle. Find the qdisc among all qdiscs attached to the
+ * device (root qdisc, all its children, children of children etc.)
+ *
+ * @root itself is considered unless it is flagged TCQ_F_BUILTIN; after that
+ * every qdisc linked on root->list is compared. Returns NULL if no qdisc
+ * carries @handle.
+ */
+
+static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
+{
+	struct Qdisc *q;
+
+	if (!(root->flags & TCQ_F_BUILTIN) &&
+	    root->handle == handle)
+		return root;
+
+	list_for_each_entry(q, &root->list, list) {
+		if (q->handle == handle)
+			return q;
+	}
+	return NULL;
+}
+
+/* Link @q onto the list headed by its device's root qdisc. Root and
+ * ingress qdiscs are not put on that list.
+ */
+static void qdisc_list_add(struct Qdisc *q)
+{
+	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
+		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
+}
+
+/* Counterpart of qdisc_list_add(): unlink @q unless it is a root or
+ * ingress qdisc, which are never on the list.
+ */
+void qdisc_list_del(struct Qdisc *q)
+{
+	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
+		list_del(&q->list);
+}
+EXPORT_SYMBOL(qdisc_list_del);
+
+/*
+ * Find the qdisc with @handle on @dev. The tree below dev->qdisc is
+ * searched first, then the sleeping qdisc of the ingress queue if the
+ * device has one. Returns NULL when nothing matches.
+ */
+struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
+{
+	struct Qdisc *q = qdisc_match_from_root(dev->qdisc, handle);
+
+	if (q == NULL && dev_ingress_queue(dev))
+		q = qdisc_match_from_root(
+			dev_ingress_queue(dev)->qdisc_sleeping,
+			handle);
+
+	return q;
+}
+
+/* Return the qdisc attached to class @classid of @p, or NULL if @p has no
+ * class operations or the class cannot be found.
+ */
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+{
+	unsigned long cl;
+	struct Qdisc *leaf;
+	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
+
+	if (cops == NULL)
+		return NULL;
+	cl = cops->get(p, classid);
+
+	if (cl == 0)
+		return NULL;
+	leaf = cops->leaf(p, cl);
+	/* Balance the get() above once the leaf has been read. */
+	cops->put(p, cl);
+	return leaf;
+}
+
+/* Find a queueing discipline by name.
+ *
+ * On success a reference on the owning module has been taken. Returns
+ * NULL if @kind is NULL, the name is unknown or the module reference
+ * could not be obtained.
+ */
+
+static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
+{
+	struct Qdisc_ops *ops;
+
+	if (kind == NULL)
+		return NULL;
+
+	read_lock(&qdisc_mod_lock);
+	for (ops = qdisc_base; ops != NULL; ops = ops->next) {
+		if (nla_strcmp(kind, ops->id) != 0)
+			continue;
+		if (!try_module_get(ops->owner))
+			ops = NULL;
+		break;
+	}
+	read_unlock(&qdisc_mod_lock);
+
+	return ops;
+}
+
+static struct qdisc_rate_table *qdisc_rtab_list;
+
+/*
+ * Get a reference counted rate table for rate spec @r and table data @tab.
+ *
+ * An existing table is shared only if both its rate spec and its table
+ * data are identical to the request; matching on the rate spec alone would
+ * hand out a table whose contents differ from what the caller supplied.
+ * The request is validated before the cache is consulted, so a missing or
+ * wrongly sized @tab never picks up an unrelated cached table.
+ *
+ * Returns the table, or NULL if the request is invalid or allocation fails.
+ */
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
+{
+	struct qdisc_rate_table *rtab;
+
+	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
+	    nla_len(tab) != TC_RTAB_SIZE)
+		return NULL;
+
+	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
+		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
+		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
+			rtab->refcnt++;
+			return rtab;
+		}
+	}
+
+	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
+	if (rtab) {
+		rtab->rate = *r;
+		rtab->refcnt = 1;
+		/* NOTE(review): 1024 is presumably TC_RTAB_SIZE, the length
+		 * @tab was checked against above - confirm.
+		 */
+		memcpy(rtab->data, nla_data(tab), 1024);
+		rtab->next = qdisc_rtab_list;
+		qdisc_rtab_list = rtab;
+	}
+	return rtab;
+}
+EXPORT_SYMBOL(qdisc_get_rtab);
+
+/*
+ * Drop one reference on @tab; when the count reaches zero the table is
+ * unlinked from qdisc_rtab_list and freed. A NULL @tab is ignored.
+ */
+void qdisc_put_rtab(struct qdisc_rate_table *tab)
+{
+	struct qdisc_rate_table **link;
+
+	if (tab == NULL)
+		return;
+	if (--tab->refcnt != 0)
+		return;
+
+	for (link = &qdisc_rtab_list; *link != NULL; link = &(*link)->next) {
+		if (*link != tab)
+			continue;
+
+		*link = tab->next;
+		kfree(tab);
+		return;
+	}
+}
+EXPORT_SYMBOL(qdisc_put_rtab);
+
+/* All size tables currently in use, shared between qdiscs by refcount. */
+static LIST_HEAD(qdisc_stab_list);
+/* Protects qdisc_stab_list and the refcnt of the tables on it. */
+static DEFINE_SPINLOCK(qdisc_stab_lock);
+
+/* Netlink policy for the attributes nested inside a size table request. */
+static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
+	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
+	[TCA_STAB_DATA] = { .type = NLA_BINARY },
+};
+
+/*
+ * Parse a size table request and return a matching, reference counted
+ * size table, sharing an existing identical one when possible.
+ *
+ * Returns the table or an ERR_PTR(): the nla_parse_nested() error, -EINVAL
+ * for a missing base spec, missing data or a table length that disagrees
+ * with tsize, -ENOMEM if allocation fails.
+ */
+static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
+{
+	struct nlattr *tb[TCA_STAB_MAX + 1];
+	struct qdisc_size_table *stab;
+	struct tc_sizespec *s;
+	unsigned int tsize = 0;
+	u16 *tab = NULL;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
+	if (err < 0)
+		return ERR_PTR(err);
+	if (!tb[TCA_STAB_BASE])
+		return ERR_PTR(-EINVAL);
+
+	s = nla_data(tb[TCA_STAB_BASE]);
+
+	if (s->tsize > 0) {
+		if (!tb[TCA_STAB_DATA])
+			return ERR_PTR(-EINVAL);
+		tab = nla_data(tb[TCA_STAB_DATA]);
+		/* Number of u16 entries actually supplied. */
+		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
+	}
+
+	/* The supplied data must have exactly the announced entry count. */
+	if (tsize != s->tsize || (!tab && tsize > 0))
+		return ERR_PTR(-EINVAL);
+
+	spin_lock(&qdisc_stab_lock);
+
+	/* Reuse a table whose spec and data are both identical. */
+	list_for_each_entry(stab, &qdisc_stab_list, list) {
+		if (memcmp(&stab->szopts, s, sizeof(*s)))
+			continue;
+		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
+			continue;
+		stab->refcnt++;
+		spin_unlock(&qdisc_stab_lock);
+		return stab;
+	}
+
+	/* The lock is dropped around the GFP_KERNEL allocation below. */
+	spin_unlock(&qdisc_stab_lock);
+
+	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	stab->refcnt = 1;
+	stab->szopts = *s;
+	if (tsize > 0)
+		memcpy(stab->data, tab, tsize * sizeof(u16));
+
+	spin_lock(&qdisc_stab_lock);
+	list_add_tail(&stab->list, &qdisc_stab_list);
+	spin_unlock(&qdisc_stab_lock);
+
+	return stab;
+}
+
+/* RCU callback: free the size table that embeds @head. */
+static void stab_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct qdisc_size_table, rcu));
+}
+
+/*
+ * Drop one reference on size table @tab. On the last reference the table
+ * is unlinked and its memory is released through call_rcu_bh() rather than
+ * freed immediately. A NULL @tab is ignored.
+ */
+void qdisc_put_stab(struct qdisc_size_table *tab)
+{
+	if (!tab)
+		return;
+
+	spin_lock(&qdisc_stab_lock);
+
+	if (--tab->refcnt == 0) {
+		list_del(&tab->list);
+		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
+	}
+
+	spin_unlock(&qdisc_stab_lock);
+}
+EXPORT_SYMBOL(qdisc_put_stab);
+
+/*
+ * Dump the size spec of @stab as a TCA_STAB nest into @skb. Only the base
+ * spec is emitted, not the table data.
+ * Returns skb->len on success, -1 if the message has no room left.
+ */
+static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_STAB);
+	if (nest == NULL)
+		goto nla_put_failure;
+	/* NLA_PUT jumps to nla_put_failure when the attribute does not fit. */
+	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
+	nla_nest_end(skb, nest);
+
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+/*
+ * Compute the length to account for @skb according to size table @stab
+ * and store it in qdisc_skb_cb(skb)->pkt_len. The stored value is never
+ * smaller than 1.
+ */
+void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
+{
+	int pkt_len, slot;
+
+	pkt_len = skb->len + stab->szopts.overhead;
+	/* Without a table only the fixed overhead is applied. */
+	if (unlikely(!stab->szopts.tsize))
+		goto out;
+
+	slot = pkt_len + stab->szopts.cell_align;
+	if (unlikely(slot < 0))
+		slot = 0;
+
+	slot >>= stab->szopts.cell_log;
+	if (likely(slot < stab->szopts.tsize))
+		pkt_len = stab->data[slot];
+	else
+		/* Beyond the table: scale the last entry by the number of
+		 * whole table lengths and add the entry for the remainder.
+		 */
+		pkt_len = stab->data[stab->szopts.tsize - 1] *
+				(slot / stab->szopts.tsize) +
+				stab->data[slot % stab->szopts.tsize];
+
+	pkt_len <<= stab->szopts.size_log;
+out:
+	if (unlikely(pkt_len < 1))
+		pkt_len = 1;
+	qdisc_skb_cb(skb)->pkt_len = pkt_len;
+}
+EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
+
+/*
+ * Warn once per qdisc that it appears to be non-work-conserving. The
+ * TCQ_F_WARN_NONWC flag records that the warning has been printed.
+ */
+void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
+{
+	if (qdisc->flags & TCQ_F_WARN_NONWC)
+		return;
+
+	pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+		txt, qdisc->ops->id, qdisc->handle >> 16);
+	qdisc->flags |= TCQ_F_WARN_NONWC;
+}
+EXPORT_SYMBOL(qdisc_warn_nonwc);
+
+/*
+ * hrtimer callback of a qdisc watchdog: mark the qdisc as unthrottled and
+ * schedule its root qdisc. The timer is not re-armed.
+ */
+static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
+{
+	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
+						 timer);
+
+	qdisc_unthrottled(wd->qdisc);
+	__netif_schedule(qdisc_root(wd->qdisc));
+
+	return HRTIMER_NORESTART;
+}
+
+/* Prepare watchdog @wd for @qdisc: an absolute CLOCK_MONOTONIC hrtimer
+ * that fires qdisc_watchdog(). The timer is not started here.
+ */
+void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+{
+	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	wd->timer.function = qdisc_watchdog;
+	wd->qdisc = qdisc;
+}
+EXPORT_SYMBOL(qdisc_watchdog_init);
+
+/*
+ * Mark the qdisc as throttled and arm its watchdog to fire at @expires,
+ * an absolute time given in psched ticks. Nothing happens if the sleeping
+ * root qdisc has been deactivated.
+ */
+void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
+{
+	ktime_t time;
+
+	if (test_bit(__QDISC_STATE_DEACTIVATED,
+		     &qdisc_root_sleeping(wd->qdisc)->state))
+		return;
+
+	qdisc_throttled(wd->qdisc);
+	/* Convert the tick count into an absolute ktime_t value. */
+	time = ktime_set(0, 0);
+	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
+	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
+}
+EXPORT_SYMBOL(qdisc_watchdog_schedule);
+
+/* Cancel a watchdog timer and mark its qdisc as unthrottled. */
+void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
+{
+	hrtimer_cancel(&wd->timer);
+	qdisc_unthrottled(wd->qdisc);
+}
+EXPORT_SYMBOL(qdisc_watchdog_cancel);
+
+/*
+ * Allocate and initialise an array of @n empty hash buckets. Arrays of up
+ * to PAGE_SIZE bytes come from kmalloc(), larger ones from the page
+ * allocator. Returns NULL on allocation failure.
+ */
+static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
+{
+	unsigned int size = n * sizeof(struct hlist_head), i;
+	struct hlist_head *h;
+
+	if (size <= PAGE_SIZE)
+		h = kmalloc(size, GFP_KERNEL);
+	else
+		h = (struct hlist_head *)
+			__get_free_pages(GFP_KERNEL, get_order(size));
+
+	if (h != NULL) {
+		for (i = 0; i < n; i++)
+			INIT_HLIST_HEAD(&h[i]);
+	}
+	return h;
+}
+
+/* Free a bucket array of @n entries; the size test mirrors the one in
+ * qdisc_class_hash_alloc() so the matching allocator is used.
+ */
+static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
+{
+	unsigned int size = n * sizeof(struct hlist_head);
+
+	if (size <= PAGE_SIZE)
+		kfree(h);
+	else
+		free_pages((unsigned long)h, get_order(size));
+}
+
+/*
+ * Double the bucket count of @clhash once its load factor exceeds 0.75
+ * and rehash all classes into the new array. If the new array cannot be
+ * allocated the hash is left untouched.
+ */
+void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
+{
+	struct Qdisc_class_common *cl;
+	struct hlist_node *n, *next;
+	struct hlist_head *nhash, *ohash;
+	unsigned int nsize, nmask, osize;
+	unsigned int i, h;
+
+	/* Rehash when load factor exceeds 0.75 */
+	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
+		return;
+	nsize = clhash->hashsize * 2;
+	nmask = nsize - 1;
+	/* Allocate before taking the tree lock. */
+	nhash = qdisc_class_hash_alloc(nsize);
+	if (nhash == NULL)
+		return;
+
+	ohash = clhash->hash;
+	osize = clhash->hashsize;
+
+	sch_tree_lock(sch);
+	for (i = 0; i < osize; i++) {
+		/* The _safe iterator is needed: each node is moved to the
+		 * new array while the old chain is being walked.
+		 */
+		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
+			h = qdisc_class_hash(cl->classid, nmask);
+			hlist_add_head(&cl->hnode, &nhash[h]);
+		}
+	}
+	clhash->hash     = nhash;
+	clhash->hashsize = nsize;
+	clhash->hashmask = nmask;
+	sch_tree_unlock(sch);
+
+	qdisc_class_hash_free(ohash, osize);
+}
+EXPORT_SYMBOL(qdisc_class_hash_grow);
+
+/*
+ * Set up an empty class hash with four buckets.
+ * Returns 0 on success, -ENOMEM if the bucket array cannot be allocated.
+ */
+int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
+{
+	unsigned int size = 4;
+
+	clhash->hash = qdisc_class_hash_alloc(size);
+	if (clhash->hash == NULL)
+		return -ENOMEM;
+	clhash->hashsize  = size;
+	clhash->hashmask  = size - 1;
+	clhash->hashelems = 0;
+	return 0;
+}
+EXPORT_SYMBOL(qdisc_class_hash_init);
+
+void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
+{
+	qdisc_class_hash_free(clhash->hash, clhash->hashsize);	/* bucket array only */
+}
+EXPORT_SYMBOL(qdisc_class_hash_destroy);
+
+void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
+			     struct Qdisc_class_common *cl)
+{
+	struct hlist_head *bucket;
+
+	INIT_HLIST_NODE(&cl->hnode);
+	bucket = &clhash->hash[qdisc_class_hash(cl->classid, clhash->hashmask)];
+	hlist_add_head(&cl->hnode, bucket);
+	clhash->hashelems++;	/* the count drives qdisc_class_hash_grow() */
+}
+EXPORT_SYMBOL(qdisc_class_hash_insert);
+
+void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
+			     struct Qdisc_class_common *cl)
+{
+	hlist_del(&cl->hnode);	/* unlink the class from its bucket */
+	clhash->hashelems--;	/* keep the element count in step */
+}
+EXPORT_SYMBOL(qdisc_class_hash_remove);
+
+/* Allocate a unique handle from the kernel-managed space; returns 0 if none. */
+static u32 qdisc_alloc_handle(struct net_device *dev)
+{
+	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
+	int tries;
+
+	for (tries = 0x10000; tries > 0; tries--) {
+		autohandle += TC_H_MAKE(0x10000U, 0);
+		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
+			autohandle = TC_H_MAKE(0x80000000U, 0);
+		if (!qdisc_lookup(dev, autohandle))
+			return autohandle;
+	}
+	return 0;
+}
+
+void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
+{
+	const struct Qdisc_class_ops *cops;
+	unsigned long cl;
+	u32 parentid;
+	/* Subtract @n from q.qlen of every ancestor of @sch, notifying classes. */
+	if (n == 0)
+		return;
+	while ((parentid = sch->parent)) {	/* stop at a qdisc with no parent */
+		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
+			return;
+
+		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
+		if (sch == NULL) {
+			WARN_ON(parentid != TC_H_ROOT);	/* only TC_H_ROOT may be unresolvable */
+			return;
+		}
+		cops = sch->ops->cl_ops;
+		if (cops->qlen_notify) {
+			cl = cops->get(sch, parentid);	/* class named by the child's parent id */
+			cops->qlen_notify(sch, cl);
+			cops->put(sch, cl);
+		}
+		sch->q.qlen -= n;	/* applied to the ancestor, not the starting qdisc */
+	}
+}
+EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
+
+static void notify_and_destroy(struct net *net, struct sk_buff *skb,
+			       struct nlmsghdr *n, u32 clid,
+			       struct Qdisc *old, struct Qdisc *new)
+{
+	if (!old && !new)
+		return;		/* nothing was replaced, nothing to report */
+	qdisc_notify(net, skb, n, clid, old, new);
+	if (old)
+		qdisc_destroy(old);
+}
+
+/* Graft qdisc "new" onto class "classid" of qdisc "parent", or onto
+ * device "dev" itself when "parent" is NULL.
+ *
+ * When appropriate, send a netlink notification using "skb" and "n".
+ *
+ * On success the old qdisc is destroyed.  Returns 0 or a negative
+ * errno (-ENOENT, -EOPNOTSUPP, or whatever the class graft op returns).
+ */
+
+static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
+		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
+		       struct Qdisc *new, struct Qdisc *old)
+{
+	struct Qdisc *q = old;
+	struct net *net = dev_net(dev);
+	int err = 0;
+
+	if (parent == NULL) {
+		unsigned int i, num_q, ingress;
+
+		ingress = 0;
+		num_q = dev->num_tx_queues;
+		if ((q && q->flags & TCQ_F_INGRESS) ||
+		    (new && new->flags & TCQ_F_INGRESS)) {
+			num_q = 1;	/* only the ingress queue is grafted */
+			ingress = 1;
+			if (!dev_ingress_queue(dev))
+				return -ENOENT;
+		}
+
+		if (dev->flags & IFF_UP)
+			dev_deactivate(dev);	/* re-activated at the end of this branch */
+
+		if (new && new->ops->attach) {
+			new->ops->attach(new);
+			num_q = 0;	/* the per-queue loop below is skipped */
+		}
+
+		for (i = 0; i < num_q; i++) {
+			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
+
+			if (!ingress)
+				dev_queue = netdev_get_tx_queue(dev, i);
+
+			old = dev_graft_qdisc(dev_queue, new);
+			if (new && i > 0)
+				atomic_inc(&new->refcnt);	/* one more ref per extra queue */
+
+			if (!ingress)
+				qdisc_destroy(old);
+		}
+
+		if (!ingress) {
+			notify_and_destroy(net, skb, n, classid,
+					   dev->qdisc, new);
+			if (new && !new->ops->attach)
+				atomic_inc(&new->refcnt);	/* taken before storing in dev->qdisc */
+			dev->qdisc = new ? : &noop_qdisc;
+		} else {
+			notify_and_destroy(net, skb, n, classid, old, new);
+		}
+
+		if (dev->flags & IFF_UP)
+			dev_activate(dev);
+	} else {
+		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+
+		err = -EOPNOTSUPP;	/* kept if parent has no class ops or no graft op */
+		if (cops && cops->graft) {
+			unsigned long cl = cops->get(parent, classid);
+			if (cl) {
+				err = cops->graft(parent, cl, new, &old);
+				cops->put(parent, cl);
+			} else
+				err = -ENOENT;
+		}
+		if (!err)
+			notify_and_destroy(net, skb, n, classid, old, new);
+	}
+	return err;
+}
+
+/* lockdep annotation is needed for ingress; egress gets it only for name */
+static struct lock_class_key qdisc_tx_lock;	/* set on non-ingress qdisc locks */
+static struct lock_class_key qdisc_rx_lock;	/* set on ingress qdisc locks */
+
+/*
+   Allocate and initialize a new qdisc of the kind named by tca[TCA_KIND].
+   Returns the qdisc, or NULL with the error stored in *errp (-EAGAIN
+   asks the caller to replay the request after a module load).
+ */
+
+static struct Qdisc *
+qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
+	     struct Qdisc *p, u32 parent, u32 handle,
+	     struct nlattr **tca, int *errp)
+{
+	int err;
+	struct nlattr *kind = tca[TCA_KIND];
+	struct Qdisc *sch;
+	struct Qdisc_ops *ops;
+	struct qdisc_size_table *stab;
+
+	ops = qdisc_lookup_ops(kind);
+#ifdef CONFIG_MODULES
+	if (ops == NULL && kind != NULL) {
+		char name[IFNAMSIZ];
+		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+			/* We dropped the RTNL semaphore in order to
+			 * perform the module load.  So, even if we
+			 * succeeded in loading the module we have to
+			 * tell the caller to replay the request.  We
+			 * indicate this using -EAGAIN.
+			 * We replay the request because the device may
+			 * go away in the mean time.
+			 */
+			rtnl_unlock();
+			request_module("sch_%s", name);
+			rtnl_lock();
+			ops = qdisc_lookup_ops(kind);
+			if (ops != NULL) {
+				/* We will try again qdisc_lookup_ops,
+				 * so don't keep a reference.
+				 */
+				module_put(ops->owner);
+				err = -EAGAIN;
+				goto err_out;
+			}
+		}
+	}
+#endif
+
+	err = -ENOENT;	/* unknown qdisc kind */
+	if (ops == NULL)
+		goto err_out;
+
+	sch = qdisc_alloc(dev_queue, ops);
+	if (IS_ERR(sch)) {
+		err = PTR_ERR(sch);
+		goto err_out2;
+	}
+
+	sch->parent = parent;
+
+	if (handle == TC_H_INGRESS) {
+		sch->flags |= TCQ_F_INGRESS;
+		handle = TC_H_MAKE(TC_H_INGRESS, 0);
+		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
+	} else {
+		if (handle == 0) {	/* caller asked for an automatic handle */
+			handle = qdisc_alloc_handle(dev);
+			err = -ENOMEM;
+			if (handle == 0)
+				goto err_out3;
+		}
+		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
+	}
+
+	sch->handle = handle;
+
+	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
+		if (tca[TCA_STAB]) {
+			stab = qdisc_get_stab(tca[TCA_STAB]);
+			if (IS_ERR(stab)) {
+				err = PTR_ERR(stab);
+				goto err_out4;
+			}
+			rcu_assign_pointer(sch->stab, stab);
+		}
+		if (tca[TCA_RATE]) {
+			spinlock_t *root_lock;
+
+			err = -EOPNOTSUPP;	/* no rate estimator on an MQROOT qdisc */
+			if (sch->flags & TCQ_F_MQROOT)
+				goto err_out4;
+
+			if ((sch->parent != TC_H_ROOT) &&
+			    !(sch->flags & TCQ_F_INGRESS) &&
+			    (!p || !(p->flags & TCQ_F_MQROOT)))
+				root_lock = qdisc_root_sleeping_lock(sch);
+			else
+				root_lock = qdisc_lock(sch);
+
+			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
+						root_lock, tca[TCA_RATE]);
+			if (err)
+				goto err_out4;
+		}
+
+		qdisc_list_add(sch);
+
+		return sch;
+	}
+err_out3:	/* also reached by fall-through when ops->init() fails */
+	dev_put(dev);
+	kfree((char *) sch - sch->padded);	/* presumably the start of the allocation */
+err_out2:
+	module_put(ops->owner);
+err_out:
+	*errp = err;
+	return NULL;
+
+err_out4:	/* init succeeded: drop the stab, run ->destroy(), then unwind */
+	/*
+	 * Any broken qdiscs that would require a ops->reset() here?
+	 * The qdisc was never in action so it shouldn't be necessary.
+	 */
+	qdisc_put_stab(rtnl_dereference(sch->stab));
+	if (ops->destroy)
+		ops->destroy(sch);
+	goto err_out3;
+}
+
+static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
+{
+	struct qdisc_size_table *ostab, *stab = NULL;
+	int err = 0;
+	/* Apply new options, size table and rate estimator to an existing qdisc. */
+	if (tca[TCA_OPTIONS]) {
+		if (sch->ops->change == NULL)
+			return -EINVAL;
+		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
+		if (err)
+			return err;
+	}
+
+	if (tca[TCA_STAB]) {
+		stab = qdisc_get_stab(tca[TCA_STAB]);
+		if (IS_ERR(stab))
+			return PTR_ERR(stab);
+	}
+
+	ostab = rtnl_dereference(sch->stab);
+	rcu_assign_pointer(sch->stab, stab);	/* NULL when no TCA_STAB was given */
+	qdisc_put_stab(ostab);
+
+	if (tca[TCA_RATE]) {
+		/* NB: ignores errors from replace_estimator
+		   because change can't be undone. */
+		if (sch->flags & TCQ_F_MQROOT)
+			goto out;
+		gen_replace_estimator(&sch->bstats, &sch->rate_est,
+					    qdisc_root_sleeping_lock(sch),
+					    tca[TCA_RATE]);
+	}
+out:
+	return 0;
+}
+
+struct check_loop_arg {
+	struct qdisc_walker	w;	/* first member: check_loop_fn() casts the walker back */
+	struct Qdisc		*p;	/* qdisc that must not turn up as a leaf */
+	int			depth;	/* nesting level of the current walk */
+};
+
+static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+
+static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
+{
+	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+	struct check_loop_arg walk;
+
+	if (!cops)		/* classless qdisc: nothing to walk */
+		return 0;
+	walk.p = p;
+	walk.depth = depth;
+	walk.w.fn = check_loop_fn;
+	walk.w.stop = walk.w.skip = walk.w.count = 0;
+	cops->walk(q, &walk.w);
+	return walk.w.stop ? -ELOOP : 0;
+}
+
+/* Walker callback: inspect the leaf qdisc of class @cl, then descend into it. */
+static int
+check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
+{
+	struct check_loop_arg *ctx = (struct check_loop_arg *)w;
+	struct Qdisc *child = q->ops->cl_ops->leaf(q, cl);
+
+	if (!child)
+		return 0;
+
+	/* Either the searched-for qdisc was found or the depth limit was hit. */
+	if (child == ctx->p || ctx->depth > 7)
+		return -ELOOP;
+	return check_loop(child, ctx->p, ctx->depth + 1);
+}
+
+/*
+ * Delete/get qdisc: handler for RTM_DELQDISC and RTM_GETQDISC requests.
+ */
+
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct nlattr *tca[TCA_MAX + 1];
+	struct net_device *dev;
+	u32 clid = tcm->tcm_parent;
+	struct Qdisc *q = NULL;
+	struct Qdisc *p = NULL;
+	int err;
+
+	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	if (clid) {	/* a parent was given: find the qdisc through it */
+		if (clid != TC_H_ROOT) {
+			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
+				p = qdisc_lookup(dev, TC_H_MAJ(clid));
+				if (!p)
+					return -ENOENT;
+				q = qdisc_leaf(p, clid);
+			} else if (dev_ingress_queue(dev)) {
+				q = dev_ingress_queue(dev)->qdisc_sleeping;
+			}
+		} else {
+			q = dev->qdisc;
+		}
+		if (!q)
+			return -ENOENT;
+
+		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
+			return -EINVAL;	/* handle given but it names another qdisc */
+	} else {	/* no parent: look the qdisc up by handle alone */
+		q = qdisc_lookup(dev, tcm->tcm_handle);
+		if (!q)
+			return -ENOENT;
+	}
+
+	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+		return -EINVAL;
+
+	if (n->nlmsg_type == RTM_DELQDISC) {
+		if (!clid)
+			return -EINVAL;	/* deletion requires a parent id */
+		if (q->handle == 0)
+			return -ENOENT;
+		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);	/* graft "nothing" in its place */
+		if (err != 0)
+			return err;
+	} else {
+		qdisc_notify(net, skb, n, clid, NULL, q);
+	}
+	return 0;
+}
+
+/*
+ * Create/change qdisc: handler for RTM_NEWQDISC requests.
+ */
+
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct tcmsg *tcm;
+	struct nlattr *tca[TCA_MAX + 1];
+	struct net_device *dev;
+	u32 clid;
+	struct Qdisc *q, *p;
+	int err;
+
+replay:	/* re-entered when qdisc_create() reports -EAGAIN */
+	/* Reinit, just in case something touches this. */
+	tcm = NLMSG_DATA(n);
+	clid = tcm->tcm_parent;
+	q = p = NULL;
+
+	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	if (clid) {
+		if (clid != TC_H_ROOT) {
+			if (clid != TC_H_INGRESS) {
+				p = qdisc_lookup(dev, TC_H_MAJ(clid));
+				if (!p)
+					return -ENOENT;
+				q = qdisc_leaf(p, clid);
+			} else if (dev_ingress_queue_create(dev)) {
+				q = dev_ingress_queue(dev)->qdisc_sleeping;
+			}
+		} else {
+			q = dev->qdisc;
+		}
+
+		/* It may be default qdisc, ignore it */
+		if (q && q->handle == 0)
+			q = NULL;
+
+		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
+			if (tcm->tcm_handle) {
+				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
+					return -EEXIST;
+				if (TC_H_MIN(tcm->tcm_handle))
+					return -EINVAL;	/* a qdisc handle has no minor part */
+				q = qdisc_lookup(dev, tcm->tcm_handle);
+				if (!q)
+					goto create_n_graft;
+				if (n->nlmsg_flags & NLM_F_EXCL)
+					return -EEXIST;
+				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+					return -EINVAL;
+				if (q == p ||
+				    (p && check_loop(q, p, 0)))
+					return -ELOOP;
+				atomic_inc(&q->refcnt);	/* existing qdisc is grafted a second time */
+				goto graft;
+			} else {
+				if (!q)
+					goto create_n_graft;
+
+				/* This magic test requires explanation.
+				 *
+				 *   We know, that some child q is already
+				 *   attached to this parent and have choice:
+				 *   either to change it or to create/graft new one.
+				 *
+				 *   1. We are allowed to create/graft only
+				 *   if CREATE and REPLACE flags are set.
+				 *
+				 *   2. If EXCL is set, requestor wanted to say,
+				 *   that qdisc tcm_handle is not expected
+				 *   to exist, so that we choose create/graft too.
+				 *
+				 *   3. The last case is when no flags are set.
+				 *   Alas, it is sort of hole in API, we
+				 *   cannot decide what to do unambiguously.
+				 *   For now we select create/graft, if
+				 *   user gave KIND, which does not match existing.
+				 */
+				if ((n->nlmsg_flags & NLM_F_CREATE) &&
+				    (n->nlmsg_flags & NLM_F_REPLACE) &&
+				    ((n->nlmsg_flags & NLM_F_EXCL) ||
+				     (tca[TCA_KIND] &&
+				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
+					goto create_n_graft;
+			}
+		}
+	} else {
+		if (!tcm->tcm_handle)
+			return -EINVAL;	/* neither parent nor handle given */
+		q = qdisc_lookup(dev, tcm->tcm_handle);
+	}
+
+	/* Change qdisc parameters */
+	if (q == NULL)
+		return -ENOENT;
+	if (n->nlmsg_flags & NLM_F_EXCL)
+		return -EEXIST;
+	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
+		return -EINVAL;
+	err = qdisc_change(q, tca);
+	if (err == 0)
+		qdisc_notify(net, skb, n, clid, NULL, q);
+	return err;
+
+create_n_graft:
+	if (!(n->nlmsg_flags & NLM_F_CREATE))
+		return -ENOENT;
+	if (clid == TC_H_INGRESS) {
+		if (dev_ingress_queue(dev))
+			q = qdisc_create(dev, dev_ingress_queue(dev), p,
+					 tcm->tcm_parent, tcm->tcm_parent,
+					 tca, &err);
+		else
+			err = -ENOENT;
+	} else {
+		struct netdev_queue *dev_queue;
+
+		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
+			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
+		else if (p)
+			dev_queue = p->dev_queue;
+		else
+			dev_queue = netdev_get_tx_queue(dev, 0);
+
+		q = qdisc_create(dev, dev_queue, p,
+				 tcm->tcm_parent, tcm->tcm_handle,
+				 tca, &err);
+	}
+	if (q == NULL) {
+		if (err == -EAGAIN)
+			goto replay;
+		return err;
+	}
+
+graft:
+	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
+	if (err) {
+		if (q)
+			qdisc_destroy(q);	/* give back the reference taken or created above */
+		return err;
+	}
+
+	return 0;
+}
+
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
+			 u32 pid, u32 seq, u16 flags, int event)
+{
+	struct tcmsg *tcm;
+	struct nlmsghdr  *nlh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct gnet_dump d;
+	struct qdisc_size_table *stab;
+	/* Append one netlink message describing @q to @skb; skb->len or -1. */
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
+	tcm = NLMSG_DATA(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm__pad1 = 0;
+	tcm->tcm__pad2 = 0;
+	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+	tcm->tcm_parent = clid;
+	tcm->tcm_handle = q->handle;
+	tcm->tcm_info = atomic_read(&q->refcnt);	/* refcount is reported in tcm_info */
+	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
+	if (q->ops->dump && q->ops->dump(q, skb) < 0)
+		goto nla_put_failure;
+	q->qstats.qlen = q->q.qlen;	/* refresh the stats copy of the queue length */
+
+	stab = rtnl_dereference(q->stab);
+	if (stab && qdisc_dump_stab(skb, stab) < 0)
+		goto nla_put_failure;
+
+	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+					 qdisc_root_sleeping_lock(q), &d) < 0)
+		goto nla_put_failure;
+
+	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+		goto nla_put_failure;
+
+	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
+	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
+		goto nla_put_failure;
+
+	if (gnet_stats_finish_copy(&d) < 0)
+		goto nla_put_failure;
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+nlmsg_failure:	/* no goto visible here; presumably the NLMSG_NEW() failure target */
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* drop the partially built message */
+	return -1;
+}
+
+static bool tc_qdisc_dump_ignore(struct Qdisc *q)
+{
+	return !!(q->flags & TCQ_F_BUILTIN);	/* true for built-in qdiscs */
+}
+
+static int qdisc_notify(struct net *net, struct sk_buff *oskb,
+			struct nlmsghdr *n, u32 clid,
+			struct Qdisc *old, struct Qdisc *new)
+{
+	struct sk_buff *skb;
+	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+	/* Send RTM_DELQDISC for @old and/or RTM_NEWQDISC for @new in one skb. */
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	if (old && !tc_qdisc_dump_ignore(old)) {
+		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
+				  0, RTM_DELQDISC) < 0)
+			goto err_out;
+	}
+	if (new && !tc_qdisc_dump_ignore(new)) {
+		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
+				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+			goto err_out;
+	}
+
+	if (skb->len)
+		return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
+				      n->nlmsg_flags & NLM_F_ECHO);
+	/* Nothing was filled in (both qdiscs ignored): falls into the error path. */
+err_out:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
+			      struct netlink_callback *cb,
+			      int *q_idx_p, int s_q_idx)
+{
+	int ret = 0, q_idx = *q_idx_p;
+	struct Qdisc *q;
+	/* Dump @root and the qdiscs on root->list, skipping indexes below s_q_idx. */
+	if (!root)
+		return 0;
+
+	q = root;
+	if (q_idx < s_q_idx) {
+		q_idx++;	/* already sent in an earlier pass */
+	} else {
+		if (!tc_qdisc_dump_ignore(q) &&
+		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+			goto done;
+		q_idx++;
+	}
+	list_for_each_entry(q, &root->list, list) {
+		if (q_idx < s_q_idx) {
+			q_idx++;
+			continue;
+		}
+		if (!tc_qdisc_dump_ignore(q) &&
+		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
+			goto done;
+		q_idx++;
+	}
+
+out:
+	*q_idx_p = q_idx;	/* report how far this pass got */
+	return ret;
+done:	/* a fill failed: return -1 with q_idx at the qdisc that did not fit */
+	ret = -1;
+	goto out;
+}
+
+static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	int idx, q_idx;
+	int s_idx, s_q_idx;
+	struct net_device *dev;
+	/* Dump callback: cb->args[0] is the device index, args[1] the qdisc index. */
+	s_idx = cb->args[0];
+	s_q_idx = q_idx = cb->args[1];
+
+	rcu_read_lock();
+	idx = 0;
+	for_each_netdev_rcu(net, dev) {
+		struct netdev_queue *dev_queue;
+
+		if (idx < s_idx)
+			goto cont;	/* device already handled in an earlier pass */
+		if (idx > s_idx)
+			s_q_idx = 0;	/* later device: start from its first qdisc */
+		q_idx = 0;
+
+		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
+			goto done;
+
+		dev_queue = dev_ingress_queue(dev);
+		if (dev_queue &&
+		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+				       &q_idx, s_q_idx) < 0)
+			goto done;
+
+cont:
+		idx++;
+	}
+
+done:
+	rcu_read_unlock();
+
+	cb->args[0] = idx;	/* resume position for the next call */
+	cb->args[1] = q_idx;
+
+	return skb->len;
+}
+
+
+
+/************************************************
+ *	Traffic classes manipulation.		*
+ ************************************************/
+
+
+
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct nlattr *tca[TCA_MAX + 1];
+	struct net_device *dev;
+	struct Qdisc *q = NULL;
+	const struct Qdisc_class_ops *cops;
+	unsigned long cl = 0;
+	unsigned long new_cl;
+	u32 pid = tcm->tcm_parent;	/* parent class id, not a process id */
+	u32 clid = tcm->tcm_handle;
+	u32 qid = TC_H_MAJ(clid);
+	int err;
+	/* Handler for RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS requests. */
+	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
+	if (err < 0)
+		return err;
+
+	/*
+	   parent == TC_H_UNSPEC - unspecified parent.
+	   parent == TC_H_ROOT   - class is root, which has no parent.
+	   parent == X:0	 - parent is root class.
+	   parent == X:Y	 - parent is a node in hierarchy.
+	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
+
+	   handle == 0:0	 - generate handle from kernel pool.
+	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
+	   handle == X:Y	 - clear.
+	   handle == X:0	 - root class.
+	 */
+
+	/* Step 1. Determine qdisc handle X:0 */
+
+	if (pid != TC_H_ROOT) {
+		u32 qid1 = TC_H_MAJ(pid);
+
+		if (qid && qid1) {
+			/* If both majors are known, they must be identical. */
+			if (qid != qid1)
+				return -EINVAL;
+		} else if (qid1) {
+			qid = qid1;
+		} else if (qid == 0)
+			qid = dev->qdisc->handle;	/* neither major given: use the root qdisc */
+
+		/* Now qid is genuine qdisc handle consistent
+		 * both with parent and child.
+		 *
+		 * TC_H_MAJ(pid) still may be unspecified, complete it now.
+		 */
+		if (pid)
+			pid = TC_H_MAKE(qid, pid);
+	} else {
+		if (qid == 0)
+			qid = dev->qdisc->handle;
+	}
+
+	/* OK. Locate qdisc */
+	q = qdisc_lookup(dev, qid);
+	if (!q)
+		return -ENOENT;
+
+	/* And check that it supports classes */
+	cops = q->ops->cl_ops;
+	if (cops == NULL)
+		return -EINVAL;
+
+	/* Now try to get class */
+	if (clid == 0) {
+		if (pid == TC_H_ROOT)
+			clid = qid;
+	} else
+		clid = TC_H_MAKE(qid, clid);
+
+	if (clid)
+		cl = cops->get(q, clid);
+
+	if (cl == 0) {	/* class not found: only a creating RTM_NEWTCLASS may go on */
+		err = -ENOENT;
+		if (n->nlmsg_type != RTM_NEWTCLASS ||
+		    !(n->nlmsg_flags & NLM_F_CREATE))
+			goto out;
+	} else {
+		switch (n->nlmsg_type) {
+		case RTM_NEWTCLASS:
+			err = -EEXIST;
+			if (n->nlmsg_flags & NLM_F_EXCL)
+				goto out;
+			break;	/* falls out to the ->change() call below */
+		case RTM_DELTCLASS:
+			err = -EOPNOTSUPP;
+			if (cops->delete)
+				err = cops->delete(q, cl);
+			if (err == 0)
+				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
+			goto out;
+		case RTM_GETTCLASS:
+			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
+			goto out;
+		default:
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	new_cl = cl;
+	err = -EOPNOTSUPP;	/* kept when the qdisc has no ->change() op */
+	if (cops->change)
+		err = cops->change(q, clid, pid, tca, &new_cl);
+	if (err == 0)
+		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
+
+out:
+	if (cl)
+		cops->put(q, cl);	/* balances the cops->get() above */
+
+	return err;
+}
+
+
+static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
+			  unsigned long cl,
+			  u32 pid, u32 seq, u16 flags, int event)
+{
+	struct tcmsg *tcm;
+	struct nlmsghdr  *nlh;
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct gnet_dump d;
+	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
+	/* Append one netlink message describing class @cl of @q; skb->len or -1. */
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
+	tcm = NLMSG_DATA(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm__pad1 = 0;
+	tcm->tcm__pad2 = 0;
+	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+	tcm->tcm_parent = q->handle;	/* tcm is also handed to cl_ops->dump() below */
+	tcm->tcm_handle = q->handle;
+	tcm->tcm_info = 0;
+	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
+	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
+		goto nla_put_failure;
+
+	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
+					 qdisc_root_sleeping_lock(q), &d) < 0)
+		goto nla_put_failure;
+
+	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
+		goto nla_put_failure;
+
+	if (gnet_stats_finish_copy(&d) < 0)
+		goto nla_put_failure;
+
+	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+	return skb->len;
+
+nlmsg_failure:	/* no goto visible here; presumably the NLMSG_NEW() failure target */
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* drop the partially built message */
+	return -1;
+}
+
+/* Build one class message for @event and send it to the RTNLGRP_TC group. */
+static int tclass_notify(struct net *net, struct sk_buff *oskb,
+			 struct nlmsghdr *n, struct Qdisc *q,
+			 unsigned long cl, int event)
+{
+	struct sk_buff *msg = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	u32 pid = 0;
+
+	if (!msg)
+		return -ENOBUFS;
+	if (oskb)
+		pid = NETLINK_CB(oskb).pid;
+	if (tc_fill_tclass(msg, q, cl, pid, n->nlmsg_seq, 0, event) >= 0)
+		return rtnetlink_send(msg, net, pid, RTNLGRP_TC,
+				      n->nlmsg_flags & NLM_F_ECHO);
+
+	kfree_skb(msg);		/* the message could not be built */
+	return -EINVAL;
+}
+
+struct qdisc_dump_args {
+	struct qdisc_walker	w;	/* first member: qdisc_class_dump() casts the walker back */
+	struct sk_buff		*skb;	/* message buffer being filled */
+	struct netlink_callback	*cb;	/* source of the requester pid and sequence number */
+};
+
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
+{
+	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;	/* walker is embedded first */
+
+	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
+			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
+}
+
+static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
+				struct tcmsg *tcm, struct netlink_callback *cb,
+				int *t_p, int s_t)
+{
+	struct qdisc_dump_args arg;
+	/* Dump the classes of one qdisc; *t_p counts qdiscs, s_t is the resume point. */
+	if (tc_qdisc_dump_ignore(q) ||
+	    *t_p < s_t || !q->ops->cl_ops ||
+	    (tcm->tcm_parent &&
+	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
+		(*t_p)++;	/* skipped qdiscs still advance the counter */
+		return 0;
+	}
+	if (*t_p > s_t)	/* past the resume point: clear the saved per-qdisc state */
+		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+	arg.w.fn = qdisc_class_dump;
+	arg.skb = skb;
+	arg.cb = cb;
+	arg.w.stop  = 0;
+	arg.w.skip = cb->args[1];	/* classes already sent for this qdisc */
+	arg.w.count = 0;
+	q->ops->cl_ops->walk(q, &arg.w);
+	cb->args[1] = arg.w.count;
+	if (arg.w.stop)
+		return -1;	/* *t_p is left unchanged so this qdisc is revisited */
+	(*t_p)++;
+	return 0;
+}
+
+/* Dump classes of @root, then of each qdisc on root->list; -1 if one stops early. */
+static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
+			       struct tcmsg *tcm, struct netlink_callback *cb,
+			       int *t_p, int s_t)
+{
+	struct Qdisc *child;
+
+	if (!root)
+		return 0;
+	/* The root qdisc itself is dumped first ... */
+	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
+		return -1;
+	/* ... followed by every qdisc linked on its list. */
+	list_for_each_entry(child, &root->list, list)
+		if (tc_dump_tclass_qdisc(child, skb, tcm, cb, t_p, s_t) < 0)
+			return -1;
+
+	return 0;
+}
+
+static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+	struct net *net = sock_net(skb->sk);
+	struct netdev_queue *dev_queue;
+	struct net_device *dev;
+	int t, s_t;
+	/* Dump callback for RTM_GETTCLASS: classes of all qdiscs on one device. */
+	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+		return 0;	/* request too short to carry a tcmsg */
+	dev = dev_get_by_index(net, tcm->tcm_ifindex);
+	if (!dev)
+		return 0;
+
+	s_t = cb->args[0];	/* qdisc index to resume from */
+	t = 0;
+
+	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
+		goto done;
+
+	dev_queue = dev_ingress_queue(dev);
+	if (dev_queue &&
+	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+				&t, s_t) < 0)
+		goto done;
+
+done:
+	cb->args[0] = t;
+
+	dev_put(dev);	/* balances dev_get_by_index() */
+	return skb->len;
+}
+
+/* Main classifier routine: scans the classifier chain attached to
+ * this qdisc, skips entries whose protocol does not match the skb
+ * (unless they are ETH_P_ALL); first result >= 0 wins, else -1.
+ */
+int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
+		       struct tcf_result *res)
+{
+	__be16 protocol = skb->protocol;
+	int err;
+
+	for (; tp; tp = tp->next) {
+		if (tp->protocol != protocol &&
+		    tp->protocol != htons(ETH_P_ALL))
+			continue;
+		err = tp->classify(skb, tp, res);
+
+		if (err >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
+				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);	/* reset the verdict counter */
+#endif
+			return err;
+		}
+	}
+	return -1;	/* no classifier on the chain returned a result */
+}
+EXPORT_SYMBOL(tc_classify_compat);
+
+int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
+		struct tcf_result *res)
+{
+	int err = 0;
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_proto *otp = tp;	/* head of the chain, for restarts */
+reclassify:
+#endif
+	/* Classify @skb; with CONFIG_NET_CLS_ACT, rerun on TC_ACT_RECLASSIFY. */
+	err = tc_classify_compat(skb, tp, res);
+#ifdef CONFIG_NET_CLS_ACT
+	if (err == TC_ACT_RECLASSIFY) {
+		u32 verd = G_TC_VERD(skb->tc_verd);
+		tp = otp;
+
+		if (verd++ >= MAX_REC_LOOP) {	/* too many restarts: give up on the packet */
+			if (net_ratelimit())
+				pr_notice("%s: packet reclassify loop"
+					  " rule prio %u protocol %02x\n",
+					  tp->q->ops->id,
+					  tp->prio & 0xffff,
+					  ntohs(tp->protocol));
+			return TC_ACT_SHOT;
+		}
+		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);	/* store the bumped counter */
+		goto reclassify;
+	}
+#endif
+	return err;
+}
+EXPORT_SYMBOL(tc_classify);
+
+void tcf_destroy(struct tcf_proto *tp)
+{
+	tp->ops->destroy(tp);		/* classifier-specific teardown first */
+	module_put(tp->ops->owner);	/* drop the reference on the owning module */
+	kfree(tp);			/* finally free the tcf_proto itself */
+}
+
+void tcf_destroy_chain(struct tcf_proto **fl)
+{
+	struct tcf_proto *head;
+
+	for (head = *fl; head; head = *fl) {
+		*fl = head->next;	/* unlink before the node is freed */
+		tcf_destroy(head);
+	}
+}
+EXPORT_SYMBOL(tcf_destroy_chain);
+
+#ifdef CONFIG_PROC_FS
+static int psched_show(struct seq_file *seq, void *v)
+{
+	struct timespec ts;
+	/* Emit four hex words; the last is NSEC_PER_SEC / hrtimer resolution in ns. */
+	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
+	seq_printf(seq, "%08x %08x %08x %08x\n",
+		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
+		   1000000,	/* third word is a fixed constant */
+		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
+
+	return 0;
+}
+
+static int psched_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psched_show, NULL);	/* single-record seq_file */
+}
+
+static const struct file_operations psched_fops = {
+	.owner = THIS_MODULE,
+	.open = psched_open,	/* binds psched_show() through single_open() */
+	.read  = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,	/* counterpart of single_open() */
+};
+
+/*
+ * Per-namespace init: create the "psched" entry served by psched_fops.
+ * Returns 0 on success, -ENOMEM if the entry cannot be created.
+ */
+static int __net_init psched_net_init(struct net *net)
+{
+	if (!proc_net_fops_create(net, "psched", 0, &psched_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{
+	proc_net_remove(net, "psched");	/* undo psched_net_init() */
+}
+#else
+static int __net_init psched_net_init(struct net *net)
+{
+	return 0;	/* !CONFIG_PROC_FS: nothing to create */
+}
+
+static void __net_exit psched_net_exit(struct net *net)
+{	/* !CONFIG_PROC_FS: nothing to remove */
+}
+#endif
+
+static struct pernet_operations psched_net_ops = {
+	.init = psched_net_init,	/* per-namespace setup */
+	.exit = psched_net_exit,	/* per-namespace teardown */
+};
+
+static int __init pktsched_init(void)
+{
+	int err;
+	/* Register per-netns ops, the built-in qdiscs and the rtnetlink handlers. */
+	err = register_pernet_subsys(&psched_net_ops);
+	if (err) {
+		pr_err("pktsched_init: "
+		       "cannot initialize per netns operations\n");
+		return err;
+	}
+
+	register_qdisc(&pfifo_qdisc_ops);	/* return values are not checked */
+	register_qdisc(&bfifo_qdisc_ops);
+	register_qdisc(&pfifo_head_drop_qdisc_ops);
+	register_qdisc(&mq_qdisc_ops);
+
+	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);	/* shares the GET handler */
+	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
+	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
+	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
+	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
+
+	return 0;
+}
+
+subsys_initcall(pktsched_init);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
new file mode 100644
index 00000000..3f08158b
--- /dev/null
+++ b/net/sched/sch_atm.c
@@ -0,0 +1,690 @@
+/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
+
+/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/atmdev.h>
+#include <linux/atmclip.h>
+#include <linux/rtnetlink.h>
+#include <linux/file.h>		/* for fput */
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+extern struct socket *sockfd_lookup(int fd, int *err);	/* @@@ fix this */
+
+/*
+ * The ATM queuing discipline provides a framework for invoking classifiers
+ * (aka "filters"), which in turn select classes of this queuing discipline.
+ * Each class maps the flow(s) it is handling to a given VC. Multiple classes
+ * may share the same VC.
+ *
+ * When creating a class, VCs are specified by passing the number of the open
+ * socket descriptor by which the calling process references the VC. The kernel
+ * keeps the VC open at least until all classes using it are removed.
+ *
+ * In this file, most functions are named atm_tc_* to avoid confusion with all
+ * the atm_* in net/atm. This naming convention differs from what's used in the
+ * rest of net/sched.
+ *
+ * Known bugs:
+ *  - sometimes messes up the IP stack
+ *  - any manipulations besides the few operations described in the README, are
+ *    untested and likely to crash the system
+ *  - should lock the flow while there is data in the queue (?)
+ */
+
+#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
+
+struct atm_flow_data {
+	struct Qdisc		*q;	/* FIFO, TBF, etc. */
+	struct tcf_proto	*filter_list;
+	struct atm_vcc		*vcc;	/* VCC; NULL if VCC is closed */
+	void			(*old_pop)(struct atm_vcc *vcc,
+					   struct sk_buff *skb); /* chaining */
+	struct atm_qdisc_data	*parent;	/* parent qdisc */
+	struct socket		*sock;		/* for closing */
+	u32			classid;	/* x:y type ID */
+	int			ref;		/* reference count */
+	struct gnet_stats_basic_packed	bstats;
+	struct gnet_stats_queue	qstats;
+	struct list_head	list;
+	struct atm_flow_data	*excess;	/* flow for excess traffic;
+						   NULL to set CLP instead */
+	int			hdr_len;
+	unsigned char		hdr[0];		/* header data; MUST BE LAST */
+};
+
+struct atm_qdisc_data {
+	struct atm_flow_data	link;		/* unclassified skbs go here */
+	struct list_head	flows;		/* NB: "link" is also on this
+						   list */
+	struct tasklet_struct	task;		/* dequeue tasklet */
+};
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+/* Find the flow with the given classid; NULL if none. Takes no reference. */
+static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
+{
+	struct atm_qdisc_data *qd = qdisc_priv(sch);
+	struct atm_flow_data *cur;
+
+	list_for_each_entry(cur, &qd->flows, list)
+		if (cur->classid == classid)
+			return cur;
+	return NULL;
+}
+
+static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
+			struct Qdisc *new, struct Qdisc **old)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+	pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
+		sch, p, flow, new, old);
+	if (list_empty(&flow->list))
+		return -EINVAL;
+	if (!new)
+		new = &noop_qdisc;
+	*old = flow->q;
+	flow->q = new;
+	if (*old)
+		qdisc_reset(*old);
+	return 0;
+}
+
+static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+	pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
+	return flow ? flow->q : NULL;
+}
+
+static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid)
+{
+	struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
+	struct atm_flow_data *flow;
+
+	pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
+	flow = lookup_flow(sch, classid);
+	if (flow)
+		flow->ref++;
+	pr_debug("atm_tc_get: flow %p\n", flow);
+	return (unsigned long)flow;
+}
+
+static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
+					unsigned long parent, u32 classid)
+{
+	return atm_tc_get(sch, classid);
+}
+
+/*
+ * atm_tc_put drops one reference on a flow and tears the flow down once the
+ * count reaches zero. All destructions, including explicitly requested ones
+ * (atm_tc_destroy, etc.), go through here; nothing still in use is dropped.
+ */
+static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+	pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+	if (--flow->ref)
+		return;	/* still referenced */
+	pr_debug("atm_tc_put: destroying\n");
+	list_del_init(&flow->list);
+	pr_debug("atm_tc_put: qdisc %p\n", flow->q);
+	qdisc_destroy(flow->q);
+	tcf_destroy_chain(&flow->filter_list);
+	if (flow->sock) {
+		pr_debug("atm_tc_put: f_count %ld\n",
+			file_count(flow->sock->file));
+		flow->vcc->pop = flow->old_pop;	/* undo sch_atm_pop chaining */
+		sockfd_put(flow->sock);
+	}
+	if (flow->excess)
+		atm_tc_put(sch, (unsigned long)flow->excess);	/* drop our ref */
+	if (flow != &p->link)
+		kfree(flow);	/* "link" is embedded in p, never kfree'd */
+	/*
+	 * If flow == &p->link, the qdisc no longer works at this point and
+	 * needs to be removed. (By the caller of atm_tc_put.)
+	 */
+}
+
+static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
+
+	pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
+	VCC2FLOW(vcc)->old_pop(vcc, skb);
+	tasklet_schedule(&p->task);
+}
+
+static const u8 llc_oui_ip[] = {
+	0xaa,			/* DSAP: non-ISO */
+	0xaa,			/* SSAP: non-ISO */
+	0x03,			/* Ctrl: Unnumbered Information Command PDU */
+	0x00,			/* OUI: EtherType */
+	0x00, 0x00,
+	0x08, 0x00
+};				/* Ethertype IP (0800) */
+
+static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
+	[TCA_ATM_FD]		= { .type = NLA_U32 },
+	[TCA_ATM_EXCESS]	= { .type = NLA_U32 },
+};
+
+/* Create a new class bound to the ATM socket whose fd is in TCA_ATM_FD. */
+static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
+			 struct nlattr **tca, unsigned long *arg)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
+	struct atm_flow_data *excess = NULL;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_ATM_MAX + 1];
+	struct socket *sock;
+	int fd, error, hdr_len;
+	void *hdr;
+
+	pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
+		"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
+	/*
+	 * The concept of parents doesn't apply for this qdisc.
+	 */
+	if (parent && parent != TC_H_ROOT && parent != sch->handle)
+		return -EINVAL;
+	/*
+	 * ATM classes cannot be changed. In order to change properties of the
+	 * ATM connection, that socket needs to be modified directly (via the
+	 * native ATM API). In order to send a flow to a different VC, the old
+	 * class needs to be removed and a new one added. (This may be changed
+	 * later.)
+	 */
+	if (flow)
+		return -EBUSY;
+	if (opt == NULL)
+		return -EINVAL;
+
+	error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy);
+	if (error < 0)
+		return error;
+
+	if (!tb[TCA_ATM_FD])
+		return -EINVAL;
+	fd = nla_get_u32(tb[TCA_ATM_FD]);
+	pr_debug("atm_tc_change: fd %d\n", fd);
+	if (tb[TCA_ATM_HDR]) {
+		hdr_len = nla_len(tb[TCA_ATM_HDR]);
+		hdr = nla_data(tb[TCA_ATM_HDR]);
+	} else {
+		hdr_len = RFC1483LLC_LEN;
+		hdr = NULL;	/* default LLC/SNAP for IP */
+	}
+	if (tb[TCA_ATM_EXCESS]) {
+		excess = (struct atm_flow_data *)
+			atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
+		if (!excess)
+			return -ENOENT;
+	}
+	pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
+		 opt->nla_type, nla_len(opt), hdr_len);
+	sock = sockfd_lookup(fd, &error);	/* f_count++ on success */
+	if (!sock)
+		goto err_put_excess;	/* don't leak the ref on excess */
+	pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
+	if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
+		error = -EPROTOTYPE;
+		goto err_out;
+	}
+	/* @@@ should check if the socket is really operational or we'll crash
+	   on vcc->send */
+	if (classid) {
+		if (TC_H_MAJ(classid ^ sch->handle)) {
+			pr_debug("atm_tc_change: classid mismatch\n");
+			error = -EINVAL;
+			goto err_out;
+		}
+	} else {
+		int i;
+		unsigned long cl;
+
+		for (i = 1; i < 0x8000; i++) {
+			classid = TC_H_MAKE(sch->handle, 0x8000 | i);
+			cl = atm_tc_get(sch, classid);
+			if (!cl)
+				break;
+			atm_tc_put(sch, cl);
+		}
+	}
+	pr_debug("atm_tc_change: new id %x\n", classid);
+	flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
+	pr_debug("atm_tc_change: flow %p\n", flow);
+	if (!flow) {
+		error = -ENOBUFS;
+		goto err_out;
+	}
+	flow->filter_list = NULL;
+	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+	if (!flow->q)
+		flow->q = &noop_qdisc;
+	pr_debug("atm_tc_change: qdisc %p\n", flow->q);
+	flow->sock = sock;
+	flow->vcc = ATM_SD(sock);	/* speedup */
+	flow->vcc->user_back = flow;
+	pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
+	flow->old_pop = flow->vcc->pop;
+	flow->parent = p;
+	flow->vcc->pop = sch_atm_pop;
+	flow->classid = classid;
+	flow->ref = 1;
+	flow->excess = excess;
+	list_add(&flow->list, &p->link.list);
+	flow->hdr_len = hdr_len;
+	if (hdr)
+		memcpy(flow->hdr, hdr, hdr_len);
+	else
+		memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
+	*arg = (unsigned long)flow;
+	return 0;
+err_out:
+	sockfd_put(sock);
+err_put_excess:
+	if (excess)
+		atm_tc_put(sch, (unsigned long)excess);
+	return error;
+}
+
+static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+	pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+	if (list_empty(&flow->list))
+		return -EINVAL;
+	if (flow->filter_list || flow == &p->link)
+		return -EBUSY;
+	/*
+	 * Reference count must be 2: one for "keepalive" (set at class
+	 * creation), and one for the reference held when calling delete.
+	 */
+	if (flow->ref < 2) {
+		pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
+		return -EINVAL;
+	}
+	if (flow->ref > 2)
+		return -EBUSY;	/* catch references via excess, etc. */
+	atm_tc_put(sch, arg);
+	return 0;
+}
+
+static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow;
+
+	pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+	if (walker->stop)
+		return;
+	list_for_each_entry(flow, &p->flows, list) {
+		if (walker->count >= walker->skip &&
+		    walker->fn(sch, (unsigned long)flow, walker) < 0) {
+			walker->stop = 1;
+			break;
+		}
+		walker->count++;
+	}
+}
+
+static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+
+	pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
+	return flow ? &flow->filter_list : &p->link.filter_list;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+/* Enqueue skb on the flow picked by skb->priority or by the filters ("link" if
+ * none). The priority lookup takes no class reference: nothing drops it. */
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = NULL;
+	struct tcf_result res;
+	int result = TC_POLICE_OK;	/* be nice to gcc */
+	int ret = NET_XMIT_POLICED;
+
+	pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+	if (TC_H_MAJ(skb->priority) != sch->handle ||
+	    !(flow = lookup_flow(sch, skb->priority))) {
+		list_for_each_entry(flow, &p->flows, list) {
+			if (flow->filter_list) {
+				result = tc_classify_compat(skb,
+							    flow->filter_list,
+							    &res);
+				if (result < 0)
+					continue;
+				flow = (struct atm_flow_data *)res.class;
+				if (!flow)
+					flow = lookup_flow(sch, res.classid);
+				goto done;
+			}
+		}
+		flow = NULL;
+done:
+		;
+	}
+	if (!flow) {
+		flow = &p->link;
+	} else {
+		if (flow->vcc)
+			ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
+		/*@@@ looks good ... but it's not supposed to work :-) */
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			kfree_skb(skb);
+			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			kfree_skb(skb);
+			goto drop;
+		case TC_POLICE_RECLASSIFY:
+			if (flow->excess)
+				flow = flow->excess;
+			else
+				ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
+			break;
+		}
+#endif
+	}
+
+	ret = qdisc_enqueue(skb, flow->q);
+	if (ret != NET_XMIT_SUCCESS) {
+drop: __maybe_unused
+		if (net_xmit_drop_count(ret)) {
+			sch->qstats.drops++;
+			if (flow)
+				flow->qstats.drops++;
+		}
+		return ret;
+	}
+	qdisc_bstats_update(sch, skb);
+	bstats_update(&flow->bstats, skb);
+	/*
+	 * Okay, this may seem weird. We pretend we've dropped the packet if
+	 * it goes via ATM. The reason for this is that the outer qdisc
+	 * expects to be able to q->dequeue the packet later on if we return
+	 * success at this place. Also, sch->q.qdisc needs to reflect whether
+	 * there is a packet eligible for dequeuing or not. Note that the
+	 * statistics of the outer qdisc are necessarily wrong because of all
+	 * this. There's currently no correct solution for this.
+	 */
+	if (flow == &p->link) {
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+	tasklet_schedule(&p->task);
+	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+/*
+ * Dequeue packets and send them over ATM. Note that we quite deliberately
+ * avoid checking net_device's flow control here, simply because sch_atm
+ * uses its own channels, which have nothing to do with any CLIP/LANE/or
+ * non-ATM interfaces.
+ */
+
+static void sch_atm_dequeue(unsigned long data)
+{
+	struct Qdisc *sch = (struct Qdisc *)data;
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow;
+	struct sk_buff *skb;
+
+	pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
+	list_for_each_entry(flow, &p->flows, list) {
+		if (flow == &p->link)
+			continue;
+		/*
+		 * If traffic is properly shaped, this won't generate nasty
+		 * little bursts. Otherwise, it may ... (but that's okay)
+		 */
+		while ((skb = flow->q->ops->peek(flow->q))) {
+			if (!atm_may_send(flow->vcc, skb->truesize))
+				break;
+
+			skb = qdisc_dequeue_peeked(flow->q);
+			if (unlikely(!skb))
+				break;
+
+			pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
+			/* remove any LL header somebody else has attached */
+			skb_pull(skb, skb_network_offset(skb));
+			if (skb_headroom(skb) < flow->hdr_len) {
+				struct sk_buff *new;
+
+				new = skb_realloc_headroom(skb, flow->hdr_len);
+				dev_kfree_skb(skb);
+				if (!new)
+					continue;
+				skb = new;
+			}
+			pr_debug("sch_atm_dequeue: ip %p, data %p\n",
+				 skb_network_header(skb), skb->data);
+			ATM_SKB(skb)->vcc = flow->vcc;
+			memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
+			       flow->hdr_len);
+			atomic_add(skb->truesize,
+				   &sk_atm(flow->vcc)->sk_wmem_alloc);
+			/* atm.atm_options are already set by atm_tc_enqueue */
+			flow->vcc->send(flow->vcc, skb);
+		}
+	}
+}
+
+static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
+	tasklet_schedule(&p->task);
+	skb = qdisc_dequeue_peeked(p->link.q);
+	if (skb)
+		sch->q.qlen--;
+	return skb;
+}
+
+static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+
+	pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
+
+	return p->link.q->ops->peek(p->link.q);
+}
+
+static unsigned int atm_tc_drop(struct Qdisc *sch)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow;
+	unsigned int len;
+
+	pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
+	list_for_each_entry(flow, &p->flows, list) {
+		if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
+			return len;
+	}
+	return 0;
+}
+
+static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+
+	pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+	INIT_LIST_HEAD(&p->flows);
+	INIT_LIST_HEAD(&p->link.list);
+	list_add(&p->link.list, &p->flows);
+	p->link.q = qdisc_create_dflt(sch->dev_queue,
+				      &pfifo_qdisc_ops, sch->handle);
+	if (!p->link.q)
+		p->link.q = &noop_qdisc;
+	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
+	p->link.filter_list = NULL;
+	p->link.vcc = NULL;
+	p->link.sock = NULL;
+	p->link.classid = sch->handle;
+	p->link.ref = 1;
+	tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
+	return 0;
+}
+
+static void atm_tc_reset(struct Qdisc *sch)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow;
+
+	pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
+	list_for_each_entry(flow, &p->flows, list)
+		qdisc_reset(flow->q);
+	sch->q.qlen = 0;
+}
+
+static void atm_tc_destroy(struct Qdisc *sch)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow, *tmp;
+
+	pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
+	list_for_each_entry(flow, &p->flows, list)
+		tcf_destroy_chain(&flow->filter_list);
+
+	list_for_each_entry_safe(flow, tmp, &p->flows, list) {
+		if (flow->ref > 1)
+			pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
+		atm_tc_put(sch, (unsigned long)flow);
+	}
+	tasklet_kill(&p->task);
+}
+
+/* Dump class cl: header, VC address and state (if any) and excess classid. */
+static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct atm_qdisc_data *p = qdisc_priv(sch);
+	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+	struct nlattr *nest;
+
+	pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
+		sch, p, flow, skb, tcm);
+	if (list_empty(&flow->list))
+		return -EINVAL;
+	tcm->tcm_handle = flow->classid;
+	tcm->tcm_info = flow->q->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr);
+	if (flow->vcc) {
+		struct sockaddr_atmpvc pvc;
+
+		/* pvc is copied out whole: clear it so no stack bytes leak */
+		memset(&pvc, 0, sizeof(pvc));
+		pvc.sap_family = AF_ATMPVC;
+		pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
+		pvc.sap_addr.vpi = flow->vcc->vpi;
+		pvc.sap_addr.vci = flow->vcc->vci;
+		NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc);
+		NLA_PUT_U32(skb, TCA_ATM_STATE, ATM_VF2VS(flow->vcc->flags));
+	}
+	/* TCA_ATM_EXCESS is the excess flow's classid, not ours; 0 if none */
+	NLA_PUT_U32(skb, TCA_ATM_EXCESS,
+		    flow->excess ? flow->excess->classid : 0);
+
+	nla_nest_end(skb, nest);
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+static int
+atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+			struct gnet_dump *d)
+{
+	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
+
+	flow->qstats.qlen = flow->q->q.qlen;
+
+	if (gnet_stats_copy_basic(d, &flow->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &flow->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static const struct Qdisc_class_ops atm_class_ops = {
+	.graft		= atm_tc_graft,
+	.leaf		= atm_tc_leaf,
+	.get		= atm_tc_get,
+	.put		= atm_tc_put,
+	.change		= atm_tc_change,
+	.delete		= atm_tc_delete,
+	.walk		= atm_tc_walk,
+	.tcf_chain	= atm_tc_find_tcf,
+	.bind_tcf	= atm_tc_bind_filter,
+	.unbind_tcf	= atm_tc_put,
+	.dump		= atm_tc_dump_class,
+	.dump_stats	= atm_tc_dump_class_stats,
+};
+
+static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
+	.cl_ops		= &atm_class_ops,
+	.id		= "atm",
+	.priv_size	= sizeof(struct atm_qdisc_data),
+	.enqueue	= atm_tc_enqueue,
+	.dequeue	= atm_tc_dequeue,
+	.peek		= atm_tc_peek,
+	.drop		= atm_tc_drop,
+	.init		= atm_tc_init,
+	.reset		= atm_tc_reset,
+	.destroy	= atm_tc_destroy,
+	.dump		= atm_tc_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init atm_init(void)
+{
+	return register_qdisc(&atm_qdisc_ops);
+}
+
+static void __exit atm_exit(void)
+{
+	unregister_qdisc(&atm_qdisc_ops);
+}
+
+module_init(atm_init)
+module_exit(atm_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 00000000..094a874b
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,53 @@
+/*
+ * net/sched/sch_blackhole.c	Black hole queue
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * Note: Quantum tunneling is not supported.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	qdisc_drop(skb, sch);
+	return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
+{
+	return NULL;
+}
+
+static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
+	.id		= "blackhole",
+	.priv_size	= 0,
+	.enqueue	= blackhole_enqueue,
+	.dequeue	= blackhole_dequeue,
+	.peek		= blackhole_dequeue,
+	.owner		= THIS_MODULE,
+};
+
+static int __init blackhole_module_init(void)
+{
+	return register_qdisc(&blackhole_qdisc_ops);
+}
+
+static void __exit blackhole_module_exit(void)
+{
+	unregister_qdisc(&blackhole_qdisc_ops);
+}
+
+module_init(blackhole_module_init)
+module_exit(blackhole_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 00000000..24d94c09
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,2075 @@
+/*
+ * net/sched/sch_cbq.c	Class-Based Queueing discipline.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+/*	Class-Based Queueing (CBQ) algorithm.
+	=======================================
+
+	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
+		 Management Models for Packet Networks",
+		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
+
+		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
+
+		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
+		 Parameters", 1996
+
+		 [4] Sally Floyd and Michael Speer, "Experimental Results
+		 for Class-Based Queueing", 1998, not published.
+
+	-----------------------------------------------------------------------
+
+	Algorithm skeleton was taken from NS simulator cbq.cc.
+	If someone wants to check this code against the LBL version,
+	he should take into account that ONLY the skeleton was borrowed,
+	the implementation is different. Particularly:
+
+	--- The WRR algorithm is different. Our version looks more
+	reasonable (I hope) and works when quanta are allowed to be
+	less than MTU, which is always the case when real time classes
+	have small rates. Note, that the statement of [3] is
+	incomplete, delay may actually be estimated even if class
+	per-round allotment is less than MTU. Namely, if per-round
+	allotment is W*r_i, and r_1+...+r_k = r < 1
+
+	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
+
+	In the worst case we have IntServ estimate with D = W*r+k*MTU
+	and C = MTU*r. The proof (if correct at all) is trivial.
+
+
+	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
+	interpret some places, which look like wrong translations
+	from NS. Anyone is advised to find these differences
+	and explain to me, why I am wrong 8).
+
+	--- Linux has no EOI event, so that we cannot estimate true class
+	idle time. Workaround is to consider the next dequeue event
+	as sign that previous packet is finished. This is wrong because of
+	internal device queueing, but on a permanently loaded link it is true.
+	Moreover, combined with clock integrator, this scheme looks
+	very close to an ideal solution.  */
+
+struct cbq_sched_data;
+
+
+struct cbq_class {
+	struct Qdisc_class_common common;
+	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
+
+/* Parameters */
+	unsigned char		priority;	/* class priority */
+	unsigned char		priority2;	/* priority to be used after overlimit */
+	unsigned char		ewma_log;	/* time constant for idle time calculation */
+	unsigned char		ovl_strategy;
+#ifdef CONFIG_NET_CLS_ACT
+	unsigned char		police;
+#endif
+
+	u32			defmap;
+
+	/* Link-sharing scheduler parameters */
+	long			maxidle;	/* Class parameters: see below. */
+	long			offtime;
+	long			minidle;
+	u32			avpkt;
+	struct qdisc_rate_table	*R_tab;
+
+	/* Overlimit strategy parameters */
+	void			(*overlimit)(struct cbq_class *cl);
+	psched_tdiff_t		penalty;
+
+	/* General scheduler (WRR) parameters */
+	long			allot;
+	long			quantum;	/* Allotment per WRR round */
+	long			weight;		/* Relative allotment: see below */
+
+	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
+	struct cbq_class	*split;		/* Ptr to split node */
+	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
+	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
+	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
+						   parent otherwise */
+	struct cbq_class	*sibling;	/* Sibling chain */
+	struct cbq_class	*children;	/* Pointer to children chain */
+
+	struct Qdisc		*q;		/* Elementary queueing discipline */
+
+
+/* Variables */
+	unsigned char		cpriority;	/* Effective priority */
+	unsigned char		delayed;
+	unsigned char		level;		/* level of the class in hierarchy:
+						   0 for leaf classes, and maximal
+						   level of children + 1 for nodes.
+						 */
+
+	psched_time_t		last;		/* Last end of service */
+	psched_time_t		undertime;
+	long			avgidle;
+	long			deficit;	/* Saved deficit for WRR */
+	psched_time_t		penalized;
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue qstats;
+	struct gnet_stats_rate_est rate_est;
+	struct tc_cbq_xstats	xstats;
+
+	struct tcf_proto	*filter_list;
+
+	int			refcnt;
+	int			filters;
+
+	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
+};
+
+struct cbq_sched_data {
+	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
+	int			nclasses[TC_CBQ_MAXPRIO + 1];
+	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
+
+	struct cbq_class	link;
+
+	unsigned int		activemask;
+	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
+								   with backlog */
+
+#ifdef CONFIG_NET_CLS_ACT
+	struct cbq_class	*rx_class;
+#endif
+	struct cbq_class	*tx_class;
+	struct cbq_class	*tx_borrowed;
+	int			tx_len;
+	psched_time_t		now;		/* Cached timestamp */
+	psched_time_t		now_rt;		/* Cached real time */
+	unsigned int		pmask;
+
+	struct hrtimer		delay_timer;
+	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
+						   started when CBQ has
+						   backlog, but cannot
+						   transmit just now */
+	psched_tdiff_t		wd_expires;
+	int			toplevel;
+	u32			hgenerator;
+};
+
+
+#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
+
+/* Map a classid to its cbq_class through the class hash; NULL if unknown. */
+static inline struct cbq_class *
+cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
+{
+	struct Qdisc_class_common *hit = qdisc_class_find(&q->clhash, classid);
+
+	if (!hit)
+		return NULL;
+	return container_of(hit, struct cbq_class, common);
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+
+/* Walk up the tree parents for a best-effort default class other than this. */
+static struct cbq_class *
+cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
+{
+	struct cbq_class *up, *dflt;
+
+	for (up = this->tparent; up != NULL; up = up->tparent) {
+		dflt = up->defaults[TC_PRIO_BESTEFFORT];
+		if (dflt && dflt != this)
+			return dflt;
+	}
+	return NULL;
+}
+
+#endif
+
+/* Classify packet. The procedure is pretty complicated, but it allows us to
+ * combine link sharing and priority scheduling transparently. Returns the
+ * class to enqueue on, or NULL (with *qerr set) when the caller should free it.
+ *
+ * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+ * so that it resolves to split nodes. Then packets are classified
+ * by logical priority, or a more specific classifier may be attached
+ * to the split node.
+ */
+
+static struct cbq_class *
+cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *head = &q->link;
+	struct cbq_class **defmap;
+	struct cbq_class *cl = NULL;
+	u32 prio = skb->priority;
+	struct tcf_result res;
+
+	/*
+	 *  Step 1. If skb->priority points to one of our classes, use it.
+	 */
+	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
+	    (cl = cbq_class_lookup(q, prio)) != NULL)
+		return cl;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	for (;;) {
+		int result = 0;
+		defmap = head->defaults;
+
+		/*
+		 * Step 2+n. Apply classifier.
+		 */
+		if (!head->filter_list ||
+		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
+			goto fallback;
+
+		cl = (void *)res.class;
+		if (!cl) {
+			if (TC_H_MAJ(res.classid))
+				cl = cbq_class_lookup(q, res.classid);
+			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
+				cl = defmap[TC_PRIO_BESTEFFORT];
+
+			if (cl == NULL || cl->level >= head->level)
+				goto fallback;
+		}
+
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:	/* QUEUED/STOLEN fall through to here */
+			return NULL;
+		case TC_ACT_RECLASSIFY:
+			return cbq_reclassify(skb, cl);
+		}
+#endif
+		if (cl->level == 0)
+			return cl;
+
+		/*
+		 * Step 3+n. If classifier selected a link sharing class,
+		 *	   apply agency specific classifier.
+		 *	   Repeat this procedure until we hit a leaf node.
+		 */
+		head = cl;
+	}
+
+fallback:
+	cl = head;
+
+	/*
+	 * Step 4. No success...
+	 */
+	if (TC_H_MAJ(prio) == 0 &&
+	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
+	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
+		return head;
+
+	return cl;
+}
+
+/*
+ * A packet has just been enqueued on an empty class: make the class the new
+ * tail of the circular active list of its priority band. The first class of
+ * a band points at itself and sets the band's bit in activemask.
+ */
+
+static inline void cbq_activate_class(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	int band = cl->cpriority;
+	struct cbq_class *old_tail = q->active[band];
+
+	q->active[band] = cl;
+	if (old_tail == NULL) {
+		/* band was idle: cl is the only member and points at itself */
+		cl->next_alive = cl;
+		q->activemask |= (1<<band);
+		return;
+	}
+	/* splice cl in right after the old tail */
+	cl->next_alive = old_tail->next_alive;
+	old_tail->next_alive = cl;
+}
+
+/*
+ * Unlink class from active chain.
+ * Note that this same procedure is done directly in cbq_dequeue*
+ * during round-robin procedure.
+ */
+
+static void cbq_deactivate_class(struct cbq_class *this)
+{
+	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+	int prio = this->cpriority;
+	struct cbq_class *cl;
+	struct cbq_class *cl_prev = q->active[prio];
+
+	do {
+		cl = cl_prev->next_alive;
+		if (cl == this) {
+			cl_prev->next_alive = cl->next_alive;
+			cl->next_alive = NULL;
+
+			if (cl == q->active[prio]) {
+				q->active[prio] = cl_prev;
+				if (cl == q->active[prio]) {
+					q->active[prio] = NULL;
+					q->activemask &= ~(1<<prio);
+					return;
+				}
+			}
+			return;
+		}
+	} while ((cl_prev = cl) != q->active[prio]);
+}
+
+static void
+cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+	int toplevel = q->toplevel;
+
+	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
+		psched_time_t now;
+		psched_tdiff_t incr;
+
+		now = psched_get_time();
+		incr = now - q->now_rt;
+		now = q->now + incr;
+
+		do {
+			if (cl->undertime < now) {
+				q->toplevel = cl->level;
+				return;
+			}
+		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
+	}
+}
+
+static int
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	int uninitialized_var(ret);
+	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
+
+#ifdef CONFIG_NET_CLS_ACT
+	q->rx_class = cl;	/* read back by cbq_reshape_fail() */
+#endif
+	if (cl == NULL) {	/* no class found: the packet is freed here */
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+#ifdef CONFIG_NET_CLS_ACT
+	cl->q->__parent = sch;	/* lets cbq_reshape_fail() find this qdisc */
+#endif
+	ret = qdisc_enqueue(skb, cl->q);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		cbq_mark_toplevel(q, cl);
+		if (!cl->next_alive)	/* not yet on an active list */
+			cbq_activate_class(cl);
+		return ret;
+	}
+
+	if (net_xmit_drop_count(ret)) {	/* account the drop on qdisc and class */
+		sch->qstats.drops++;
+		cbq_mark_toplevel(q, cl);
+		cl->qstats.drops++;
+	}
+	return ret;
+}
+
+/* Overlimit actions */
+
+/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
+
+static void cbq_ovl_classic(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	psched_tdiff_t delay = cl->undertime - q->now;
+
+	if (!cl->delayed) {
+		delay += cl->offtime;
+
+		/*
+		 * The class goes to sleep, so it will have no
+		 * chance to work off avgidle. Let's forgive it 8)
+		 *
+		 * BTW cbq-2.0 has a bug in this
+		 * place, apparently they forgot to shift it by cl->ewma_log.
+		 */
+		if (cl->avgidle < 0)
+			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+		if (cl->avgidle < cl->minidle)
+			cl->avgidle = cl->minidle;
+		if (delay <= 0)
+			delay = 1;
+		cl->undertime = q->now + delay;
+
+		cl->xstats.overactions++;
+		cl->delayed = 1;
+	}
+	if (q->wd_expires == 0 || q->wd_expires > delay)	/* keep the smallest delay */
+		q->wd_expires = delay;
+
+	/* Dirty work! We must schedule wakeups based on
+	 * the real available rate, rather than the leaf rate,
+	 * which may be tiny (even zero).
+	 */
+	if (q->toplevel == TC_CBQ_MAXLEVEL) {
+		struct cbq_class *b;
+		psched_tdiff_t base_delay = q->wd_expires;
+
+		for (b = cl->borrow; b; b = b->borrow) {	/* walk the borrow chain */
+			delay = b->undertime - q->now;
+			if (delay < base_delay) {
+				if (delay <= 0)
+					delay = 1;
+				base_delay = delay;
+			}
+		}
+
+		q->wd_expires = base_delay;
+	}
+}
+
+/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in the hierarchy when
+ * they go overlimit. NOTE(review): as written the walk below always ends
+ * with cl == NULL, so only the original class is ever penalized. */
+
+static void cbq_ovl_rclassic(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	struct cbq_class *this = cl;
+
+	do {
+		if (cl->level > q->toplevel) {	/* above the toplevel cutoff */
+			cl = NULL;
+			break;
+		}
+	} while ((cl = cl->borrow) != NULL);	/* exits only once cl is NULL */
+
+	if (cl == NULL)
+		cl = this;
+	cbq_ovl_classic(cl);
+}
+
+/* TC_CBQ_OVL_DELAY: delay the class until it becomes underlimit again */
+
+static void cbq_ovl_delay(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	psched_tdiff_t delay = cl->undertime - q->now;
+
+	if (test_bit(__QDISC_STATE_DEACTIVATED,
+		     &qdisc_root_sleeping(cl->qdisc)->state))
+		return;	/* root qdisc is deactivated: do nothing */
+
+	if (!cl->delayed) {
+		psched_time_t sched = q->now;
+		ktime_t expires;
+
+		delay += cl->offtime;
+		if (cl->avgidle < 0)
+			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+		if (cl->avgidle < cl->minidle)
+			cl->avgidle = cl->minidle;
+		cl->undertime = q->now + delay;
+
+		if (delay > 0) {
+			sched += delay + cl->penalty;
+			cl->penalized = sched;
+			cl->cpriority = TC_CBQ_MAXPRIO;	/* move to the penalty band */
+			q->pmask |= (1<<TC_CBQ_MAXPRIO);
+
+			expires = ktime_set(0, 0);
+			expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched));
+			if (hrtimer_try_to_cancel(&q->delay_timer) &&
+			    ktime_to_ns(ktime_sub(
+					hrtimer_get_expires(&q->delay_timer),
+					expires)) > 0)
+				hrtimer_set_expires(&q->delay_timer, expires);
+			hrtimer_restart(&q->delay_timer);
+			cl->delayed = 1;
+			cl->xstats.overactions++;
+			return;
+		}
+		delay = 1;
+	}
+	if (q->wd_expires == 0 || q->wd_expires > delay)	/* keep the smallest delay */
+		q->wd_expires = delay;
+}
+
+/* TC_CBQ_OVL_LOWPRIO: demote an overlimit class to its priority2 band */
+
+static void cbq_ovl_lowprio(struct cbq_class *cl)
+{
+	struct cbq_sched_data *sched = qdisc_priv(cl->qdisc);
+
+	/* Timestamp that cbq_undelay_prio() compares against later. */
+	cl->penalized = sched->now + cl->penalty;
+	if (cl->priority2 != cl->cpriority) {
+		cl->cpriority = cl->priority2;
+		sched->pmask |= (1<<cl->cpriority);
+		cl->xstats.overactions++;
+	}
+	cbq_ovl_classic(cl);
+}
+
+/* TC_CBQ_OVL_DROP: penalize class by dropping */
+
+static void cbq_ovl_drop(struct cbq_class *cl)
+{
+	/* ->drop() is optional; adjust qlen only when a packet really went. */
+	if (cl->q->ops->drop && cl->q->ops->drop(cl->q))
+		cl->qdisc->q.qlen--;
+	cl->xstats.overactions++;
+	cbq_ovl_classic(cl);
+}
+
+static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
+				       psched_time_t now)
+{
+	struct cbq_class *cl;
+	struct cbq_class *cl_prev = q->active[prio];
+	psched_time_t sched = now;
+
+	if (cl_prev == NULL)	/* nothing on this band's list */
+		return 0;
+
+	do {
+		cl = cl_prev->next_alive;
+		if (now - cl->penalized > 0) {
+			cl_prev->next_alive = cl->next_alive;	/* unlink from this band */
+			cl->next_alive = NULL;
+			cl->cpriority = cl->priority;	/* back to the configured band */
+			cl->delayed = 0;
+			cbq_activate_class(cl);
+
+			if (cl == q->active[prio]) {
+				q->active[prio] = cl_prev;
+				if (cl == q->active[prio]) {
+					q->active[prio] = NULL;	/* NOTE(review): activemask bit stays set - confirm */
+					return 0;
+				}
+			}
+
+			cl = cl_prev->next_alive;
+		} else if (sched - cl->penalized > 0)
+			sched = cl->penalized;
+	} while ((cl_prev = cl) != q->active[prio]);
+
+	return sched - now;
+}
+
+static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
+{
+	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
+						delay_timer);
+	struct Qdisc *sch = q->watchdog.qdisc;
+	psched_time_t now;
+	psched_tdiff_t delay = 0;
+	unsigned int pmask;
+
+	now = psched_get_time();
+
+	pmask = q->pmask;	/* work on a snapshot; q->pmask is rebuilt below */
+	q->pmask = 0;
+
+	while (pmask) {
+		int prio = ffz(~pmask);	/* index of the lowest set bit */
+		psched_tdiff_t tmp;
+
+		pmask &= ~(1<<prio);
+
+		tmp = cbq_undelay_prio(q, prio, now);
+		if (tmp > 0) {
+			q->pmask |= 1<<prio;	/* band still needs a later pass */
+			if (tmp < delay || delay == 0)
+				delay = tmp;	/* keep the smallest positive delay */
+		}
+	}
+
+	if (delay) {
+		ktime_t time;
+
+		time = ktime_set(0, 0);
+		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
+		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
+	}
+
+	qdisc_unthrottled(sch);
+	__netif_schedule(qdisc_root(sch));
+	return HRTIMER_NORESTART;	/* re-armed explicitly above when needed */
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
+{
+	struct Qdisc *sch = child->__parent;	/* set by cbq_enqueue() */
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl = q->rx_class;
+
+	q->rx_class = NULL;
+
+	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
+		int ret;
+
+		cbq_mark_toplevel(q, cl);
+
+		q->rx_class = cl;
+		cl->q->__parent = sch;
+
+		ret = qdisc_enqueue(skb, cl->q);	/* retry on the new class */
+		if (ret == NET_XMIT_SUCCESS) {
+			sch->q.qlen++;
+			if (!cl->next_alive)
+				cbq_activate_class(cl);
+			return 0;
+		}
+		if (net_xmit_drop_count(ret))
+			sch->qstats.drops++;
+		return 0;	/* 0 is returned whether or not the retry succeeded */
+	}
+
+	sch->qstats.drops++;	/* no class to fall back to */
+	return -1;
+}
+#endif
+
+/*
+ * This is a mission-critical procedure.
+ *
+ * We "regenerate" the toplevel cutoff if the transmitting class
+ * has backlog and it is not regulated. This is not part of the
+ * original CBQ description, but looks more reasonable.
+ * Probably, it is wrong. This question needs further investigation.
+ */
+
+static inline void
+cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
+		    struct cbq_class *borrowed)
+{
+	if (cl && q->toplevel >= borrowed->level) {
+		if (cl->q->q.qlen > 1) {
+			do {
+				if (borrowed->undertime == PSCHED_PASTPERFECT) {
+					q->toplevel = borrowed->level;
+					return;
+				}
+			} while ((borrowed = borrowed->borrow) != NULL);
+		}
+#if 0
+	/* It is not necessary now. Uncommenting it
+	   will save CPU cycles, but decrease fairness.
+	 */
+		q->toplevel = TC_CBQ_MAXLEVEL;
+#endif
+	}
+}
+
+static void
+cbq_update(struct cbq_sched_data *q)
+{
+	struct cbq_class *this = q->tx_class;
+	struct cbq_class *cl = this;
+	int len = q->tx_len;
+
+	q->tx_class = NULL;
+
+	for ( ; cl; cl = cl->share) {	/* the class and everything up its share chain */
+		long avgidle = cl->avgidle;
+		long idle;
+
+		cl->bstats.packets++;
+		cl->bstats.bytes += len;
+
+		/*
+		 * (now - last) is the total time between packet right edges.
+		 * (last_pktlen/rate) is the "virtual" busy time, so that
+		 *
+		 *	idle = (now - last) - last_pktlen/rate
+		 */
+
+		idle = q->now - cl->last;
+		if ((unsigned long)idle > 128*1024*1024) {
+			avgidle = cl->maxidle;
+		} else {
+			idle -= L2T(cl, len);
+
+			/* true_avgidle := (1-W)*true_avgidle + W*idle,
+			 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
+			 * cl->avgidle == true_avgidle/W,
+			 * hence:
+			 */
+			avgidle += idle - (avgidle>>cl->ewma_log);
+		}
+
+		if (avgidle <= 0) {
+			/* Overlimit or at-limit */
+
+			if (avgidle < cl->minidle)
+				avgidle = cl->minidle;
+
+			cl->avgidle = avgidle;
+
+			/* Calculate the expected time when this class
+			 * will be allowed to send.
+			 * It will occur when:
+			 * (1-W)*true_avgidle + W*delay = 0, i.e.
+			 * idle = (1/W - 1)*(-true_avgidle)
+			 * or
+			 * idle = (1 - W)*(-cl->avgidle);
+			 */
+			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
+
+			/*
+			 * That is not all.
+			 * To maintain the rate allocated to the class,
+			 * we add to undertime the virtual clock time
+			 * necessary to complete the transmitted packet.
+			 * (len/phys_bandwidth has already elapsed
+			 * by the time cbq_update runs)
+			 */
+
+			idle -= L2T(&q->link, len);
+			idle += L2T(cl, len);
+
+			cl->undertime = q->now + idle;
+		} else {
+			/* Underlimit */
+
+			cl->undertime = PSCHED_PASTPERFECT;
+			if (avgidle > cl->maxidle)
+				cl->avgidle = cl->maxidle;
+			else
+				cl->avgidle = avgidle;
+		}
+		cl->last = q->now;
+	}
+
+	cbq_update_toplevel(q, this, q->tx_borrowed);
+}
+
+static inline struct cbq_class *
+cbq_under_limit(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	struct cbq_class *this_cl = cl;
+
+	if (cl->tparent == NULL)	/* a class without a parent is never limited here */
+		return cl;
+
+	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
+		cl->delayed = 0;
+		return cl;
+	}
+
+	do {
+		/* This is a very suspicious place. Currently the overlimit
+		 * action is generated for unbounded classes
+		 * only if the link is completely congested.
+		 * Though this agrees with the ancestor-only paradigm,
+		 * it looks very stupid. In particular,
+		 * it means that this chunk of code will either
+		 * never be called or result in strong amplification
+		 * of burstiness. Dangerous and silly, but
+		 * no other solution exists.
+		 */
+		cl = cl->borrow;
+		if (!cl) {	/* borrow chain exhausted: run the overlimit action */
+			this_cl->qstats.overlimits++;
+			this_cl->overlimit(this_cl);
+			return NULL;
+		}
+		if (cl->level > q->toplevel)	/* borrowing above the cutoff is not allowed */
+			return NULL;
+	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
+
+	cl->delayed = 0;
+	return cl;	/* the ancestor that is being borrowed from */
+}
+
+static inline struct sk_buff *
+cbq_dequeue_prio(struct Qdisc *sch, int prio)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl_tail, *cl_prev, *cl;
+	struct sk_buff *skb;
+	int deficit;
+
+	cl_tail = cl_prev = q->active[prio];
+	cl = cl_prev->next_alive;
+
+	do {
+		deficit = 0;	/* set when some class had its deficit refilled */
+
+		/* Start round */
+		do {
+			struct cbq_class *borrow = cl;
+
+			if (cl->q->q.qlen &&
+			    (borrow = cbq_under_limit(cl)) == NULL)
+				goto skip_class;
+
+			if (cl->deficit <= 0) {
+				/* Class exhausted its allotment for
+				 * this round. Switch to the next one.
+				 */
+				deficit = 1;
+				cl->deficit += cl->quantum;
+				goto next_class;
+			}
+
+			skb = cl->q->dequeue(cl->q);
+
+			/* Class did not give us any skb :-(
+			 * It could occur even if cl->q->q.qlen != 0
+			 * e.g. if cl->q == "tbf"
+			 */
+			if (skb == NULL)
+				goto skip_class;
+
+			cl->deficit -= qdisc_pkt_len(skb);
+			q->tx_class = cl;	/* consumed by cbq_update() */
+			q->tx_borrowed = borrow;
+			if (borrow != cl) {
+#ifndef CBQ_XSTATS_BORROWS_BYTES
+				borrow->xstats.borrows++;
+				cl->xstats.borrows++;
+#else
+				borrow->xstats.borrows += qdisc_pkt_len(skb);
+				cl->xstats.borrows += qdisc_pkt_len(skb);
+#endif
+			}
+			q->tx_len = qdisc_pkt_len(skb);
+
+			if (cl->deficit <= 0) {
+				q->active[prio] = cl;
+				cl = cl->next_alive;
+				cl->deficit += cl->quantum;
+			}
+			return skb;
+
+skip_class:
+			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
+				/* Class is empty or penalized.
+				 * Unlink it from the active chain.
+				 */
+				cl_prev->next_alive = cl->next_alive;
+				cl->next_alive = NULL;
+
+				/* Did cl_tail point to it? */
+				if (cl == cl_tail) {
+					/* Repair it! */
+					cl_tail = cl_prev;
+
+					/* Was it the last class in this band? */
+					if (cl == cl_tail) {
+						/* Kill the band! */
+						q->active[prio] = NULL;
+						q->activemask &= ~(1<<prio);
+						if (cl->q->q.qlen)	/* re-add under its cpriority */
+							cbq_activate_class(cl);
+						return NULL;
+					}
+
+					q->active[prio] = cl_tail;
+				}
+				if (cl->q->q.qlen)	/* re-add under its cpriority */
+					cbq_activate_class(cl);
+
+				cl = cl_prev;
+			}
+
+next_class:
+			cl_prev = cl;
+			cl = cl->next_alive;
+		} while (cl_prev != cl_tail);
+	} while (deficit);
+
+	q->active[prio] = cl_prev;
+
+	return NULL;
+}
+
+static inline struct sk_buff *
+cbq_dequeue_1(struct Qdisc *sch)
+{
+	struct cbq_sched_data *sched = qdisc_priv(sch);
+	/* Snapshot of the busy bands; only bits 0..7 are scanned here. */
+	unsigned int pending = sched->activemask & 0xFF;
+
+	while (pending) {
+		int band = ffz(~pending);	/* lowest-numbered busy band first */
+		struct sk_buff *skb = cbq_dequeue_prio(sch, band);
+
+		if (skb)
+			return skb;
+		pending &= ~(1<<band);
+	}
+	return NULL;
+}
+
+static struct sk_buff *
+cbq_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	psched_time_t now;
+	psched_tdiff_t incr;
+
+	now = psched_get_time();
+	incr = now - q->now_rt;
+
+	if (q->tx_class) {
+		psched_tdiff_t incr2;
+		/* Time integrator. We calculate the EOS time
+		 * by adding the expected packet transmission time.
+		 * If real time is greater, we warp the artificial clock,
+		 * so that:
+		 *
+		 * cbq_time = max(real_time, work);
+		 */
+		incr2 = L2T(&q->link, q->tx_len);
+		q->now += incr2;
+		cbq_update(q);
+		if ((incr -= incr2) < 0)
+			incr = 0;
+	}
+	q->now += incr;
+	q->now_rt = now;
+
+	for (;;) {
+		q->wd_expires = 0;	/* overlimit actions may set it during the pass */
+
+		skb = cbq_dequeue_1(sch);
+		if (skb) {
+			qdisc_bstats_update(sch, skb);
+			sch->q.qlen--;
+			qdisc_unthrottled(sch);
+			return skb;
+		}
+
+		/* All the classes are overlimit.
+		 *
+		 * This is possible if:
+		 *
+		 * 1. Scheduler is empty.
+		 * 2. Toplevel cutoff inhibited borrowing.
+		 * 3. Root class is overlimit.
+		 *
+		 * Reset the 2nd and 3rd conditions and retry.
+		 *
+		 * Note that NS and cbq-2.0 are buggy: peeking
+		 * an arbitrary class is appropriate for ancestor-only
+		 * sharing, but not for the toplevel algorithm.
+		 *
+		 * Our version is better, but slower, because it requires
+		 * two passes, but it is unavoidable with top-level sharing.
+		 */
+
+		if (q->toplevel == TC_CBQ_MAXLEVEL &&
+		    q->link.undertime == PSCHED_PASTPERFECT)
+			break;
+
+		q->toplevel = TC_CBQ_MAXLEVEL;
+		q->link.undertime = PSCHED_PASTPERFECT;
+	}
+
+	/* No packets in the scheduler or nobody wants to give them to us :-(
+	 * Sigh... start the watchdog timer in the latter case.
+	 */
+
+	if (sch->q.qlen) {
+		sch->qstats.overlimits++;
+		if (q->wd_expires)
+			qdisc_watchdog_schedule(&q->watchdog,
+						now + q->wd_expires);
+	}
+	return NULL;
+}
+
+/* CBQ class maintenance routines */
+
+static void cbq_adjust_levels(struct cbq_class *this)
+{
+	if (this == NULL)
+		return;
+
+	do {
+		int level = 0;
+		struct cbq_class *cl;
+
+		cl = this->children;
+		if (cl) {
+			do {	/* children form a circular sibling list */
+				if (cl->level > level)
+					level = cl->level;
+			} while ((cl = cl->sibling) != this->children);
+		}
+		this->level = level + 1;	/* one above the highest child level */
+	} while ((this = this->tparent) != NULL);	/* repeat for every ancestor */
+}
+
+static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+{
+	struct cbq_class *cl;
+	struct hlist_node *n;
+	unsigned int h;
+
+	if (q->quanta[prio] == 0)	/* also avoids dividing by zero below */
+		return;
+
+	for (h = 0; h < q->clhash.hashsize; h++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+			/* BUG: this expression suffers from arithmetic
+			 * overflows! The range check below runs for every class.
+			 */
+			if (cl->priority == prio) {
+				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
+					q->quanta[prio];
+			}
+			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
+				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
+					   cl->common.classid, cl->quantum);
+				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+			}
+		}
+	}
+}
+
+static void cbq_sync_defmap(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	struct cbq_class *split = cl->split;
+	unsigned int h;
+	int i;
+
+	if (split == NULL)
+		return;
+
+	for (i = 0; i <= TC_PRIO_MAX; i++) {	/* drop slots cl no longer claims */
+		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
+			split->defaults[i] = NULL;
+	}
+
+	for (i = 0; i <= TC_PRIO_MAX; i++) {	/* refill the empty slots */
+		int level = split->level;
+
+		if (split->defaults[i])
+			continue;
+
+		for (h = 0; h < q->clhash.hashsize; h++) {
+			struct hlist_node *n;
+			struct cbq_class *c;
+
+			hlist_for_each_entry(c, n, &q->clhash.hash[h],
+					     common.hnode) {
+				if (c->split == split && c->level < level &&
+				    c->defmap & (1<<i)) {
+					split->defaults[i] = c;	/* lowest level seen so far */
+					level = c->level;
+				}
+			}
+		}
+	}
+}
+
+static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
+{
+	struct cbq_class *split = NULL;
+
+	if (splitid == 0) {	/* 0 means: keep the current split node */
+		split = cl->split;
+		if (!split)
+			return;
+		splitid = split->common.classid;
+	}
+
+	if (split == NULL || split->common.classid != splitid) {
+		for (split = cl->tparent; split; split = split->tparent)	/* search the ancestors */
+			if (split->common.classid == splitid)
+				break;
+	}
+
+	if (split == NULL)	/* no ancestor has that classid */
+		return;
+
+	if (cl->split != split) {
+		cl->defmap = 0;	/* release the slots held in the old split first */
+		cbq_sync_defmap(cl);
+		cl->split = split;
+		cl->defmap = def & mask;
+	} else
+		cl->defmap = (cl->defmap & ~mask) | (def & mask);	/* change only the masked bits */
+
+	cbq_sync_defmap(cl);
+}
+
+static void cbq_unlink_class(struct cbq_class *this)
+{
+	struct cbq_class *cl, **clp;
+	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+
+	qdisc_class_hash_remove(&q->clhash, &this->common);
+
+	if (this->tparent) {
+		clp = &this->sibling;
+		cl = *clp;
+		do {	/* walk the sibling ring to the link that points at this */
+			if (cl == this) {
+				*clp = cl->sibling;
+				break;
+			}
+			clp = &cl->sibling;
+		} while ((cl = *clp) != this->sibling);
+
+		if (this->tparent->children == this) {
+			this->tparent->children = this->sibling;
+			if (this->sibling == this)	/* it was the only child */
+				this->tparent->children = NULL;
+		}
+	} else {
+		WARN_ON(this->sibling != this);	/* a parentless class must be alone in its ring */
+	}
+}
+
+static void cbq_link_class(struct cbq_class *this)
+{
+	struct cbq_sched_data *sched = qdisc_priv(this->qdisc);
+	struct cbq_class *up = this->tparent;
+
+	this->sibling = this;		/* start as a one-element sibling ring */
+	qdisc_class_hash_insert(&sched->clhash, &this->common);
+
+	if (!up)
+		return;			/* no parent to hook into */
+
+	if (up->children) {		/* splice in after the first child */
+		this->sibling = up->children->sibling;
+		up->children->sibling = this;
+	} else {
+		up->children = this;
+	}
+}
+
+static unsigned int cbq_drop(struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl, *cl_head;
+	int prio;
+	unsigned int len;
+
+	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {	/* highest-numbered band first */
+		cl_head = q->active[prio];
+		if (!cl_head)
+			continue;
+
+		cl = cl_head;
+		do {
+			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
+				sch->q.qlen--;
+				if (!cl->q->q.qlen)	/* the drop emptied the class */
+					cbq_deactivate_class(cl);
+				return len;	/* the nonzero value ->drop() returned */
+			}
+		} while ((cl = cl->next_alive) != cl_head);
+	}
+	return 0;	/* nothing could be dropped */
+}
+
+static void
+cbq_reset(struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl;
+	struct hlist_node *n;
+	int prio;
+	unsigned int h;
+
+	q->activemask = 0;
+	q->pmask = 0;
+	q->tx_class = NULL;
+	q->tx_borrowed = NULL;
+	qdisc_watchdog_cancel(&q->watchdog);
+	hrtimer_cancel(&q->delay_timer);
+	q->toplevel = TC_CBQ_MAXLEVEL;
+	q->now = psched_get_time();
+	q->now_rt = q->now;
+
+	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)	/* empty every active list */
+		q->active[prio] = NULL;
+
+	for (h = 0; h < q->clhash.hashsize; h++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+			qdisc_reset(cl->q);
+
+			cl->next_alive = NULL;
+			cl->undertime = PSCHED_PASTPERFECT;
+			cl->avgidle = cl->maxidle;
+			cl->deficit = cl->quantum;
+			cl->cpriority = cl->priority;	/* undo any priority demotion */
+		}
+	}
+	sch->q.qlen = 0;
+}
+
+
+static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
+{
+	if (lss->change & TCF_CBQ_LSS_FLAGS) {	/* lss->change selects the fields to apply */
+		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
+		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
+	}
+	if (lss->change & TCF_CBQ_LSS_EWMA)
+		cl->ewma_log = lss->ewma_log;
+	if (lss->change & TCF_CBQ_LSS_AVPKT)
+		cl->avpkt = lss->avpkt;
+	if (lss->change & TCF_CBQ_LSS_MINIDLE)
+		cl->minidle = -(long)lss->minidle;	/* stored negated; cbq_dump_lss() negates it back */
+	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
+		cl->maxidle = lss->maxidle;
+		cl->avgidle = lss->maxidle;	/* avgidle restarts at the new maximum */
+	}
+	if (lss->change & TCF_CBQ_LSS_OFFTIME)
+		cl->offtime = lss->offtime;
+	return 0;
+}
+
+static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+	q->quanta[cl->priority] -= cl->weight;	/* take its weight out of the band */
+	q->nclasses[cl->priority]--;
+	cbq_normalize_quanta(q, cl->priority);
+}
+
+static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+	q->quanta[cl->priority] += cl->weight;	/* add its weight to the band */
+	q->nclasses[cl->priority]++;
+	cbq_normalize_quanta(q, cl->priority);
+}
+
+static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+
+	if (wrr->allot)		/* zero fields leave the current value alone */
+		cl->allot = wrr->allot;
+	if (wrr->weight)
+		cl->weight = wrr->weight;
+	if (wrr->priority) {
+		cl->priority = wrr->priority - 1;	/* supplied 1-based, stored 0-based */
+		cl->cpriority = cl->priority;
+		if (cl->priority >= cl->priority2)
+			cl->priority2 = TC_CBQ_MAXPRIO - 1;
+	}
+
+	cbq_addprio(q, cl);	/* account the class in its band and renormalize */
+	return 0;
+}
+
+static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
+{
+	switch (ovl->strategy) {	/* select the overlimit handler */
+	case TC_CBQ_OVL_CLASSIC:
+		cl->overlimit = cbq_ovl_classic;
+		break;
+	case TC_CBQ_OVL_DELAY:
+		cl->overlimit = cbq_ovl_delay;
+		break;
+	case TC_CBQ_OVL_LOWPRIO:
+		if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || ovl->priority2 - 1 <= cl->priority)
+			return -EINVAL;	/* need cl->priority < priority2 - 1 < MAXPRIO */
+		cl->priority2 = ovl->priority2 - 1;	/* supplied 1-based, stored 0-based */
+		cl->overlimit = cbq_ovl_lowprio;
+		break;
+	case TC_CBQ_OVL_DROP:
+		cl->overlimit = cbq_ovl_drop;
+		break;
+	case TC_CBQ_OVL_RCLASSIC:
+		cl->overlimit = cbq_ovl_rclassic;
+		break;
+	default:
+		return -EINVAL;	/* unknown strategy: class left untouched */
+	}
+	cl->ovl_strategy = ovl->strategy;	/* reported by cbq_dump_ovl(); keep in sync */
+	cl->penalty = ovl->penalty;
+	return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
+{
+	cl->police = p->police;
+
+	if (cl->q->handle) {
+		if (p->police == TC_POLICE_RECLASSIFY)
+			cl->q->reshape_fail = cbq_reshape_fail;	/* hook for reclassification */
+		else
+			cl->q->reshape_fail = NULL;	/* any other mode removes the hook */
+	}
+	return 0;
+}
+#endif
+
+static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
+{
+	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
+	return 0;	/* cbq_change_defmap() returns void, so no error to report */
+}
+
+static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {	/* passed to nla_parse_nested() */
+	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
+	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
+	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
+	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
+	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
+	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
+};
+
+static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CBQ_MAX + 1];
+	struct tc_ratespec *r;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)	/* both are mandatory */
+		return -EINVAL;
+
+	r = nla_data(tb[TCA_CBQ_RATE]);
+
+	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
+		return -EINVAL;
+
+	err = qdisc_class_hash_init(&q->clhash);
+	if (err < 0)
+		goto put_rtab;
+
+	q->link.refcnt = 1;
+	q->link.sibling = &q->link;
+	q->link.common.classid = sch->handle;
+	q->link.qdisc = sch;
+	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+				      sch->handle);
+	if (!q->link.q)
+		q->link.q = &noop_qdisc;	/* fall back if the default pfifo is unavailable */
+
+	q->link.priority = TC_CBQ_MAXPRIO - 1;
+	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
+	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
+	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
+	q->link.overlimit = cbq_ovl_classic;
+	q->link.allot = psched_mtu(qdisc_dev(sch));
+	q->link.quantum = q->link.allot;
+	q->link.weight = q->link.R_tab->rate.rate;
+
+	q->link.ewma_log = TC_CBQ_DEF_EWMA;
+	q->link.avpkt = q->link.allot/2;
+	q->link.minidle = -0x7FFFFFFF;
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	q->delay_timer.function = cbq_undelay;
+	q->toplevel = TC_CBQ_MAXLEVEL;
+	q->now = psched_get_time();
+	q->now_rt = q->now;
+
+	cbq_link_class(&q->link);	/* the built-in link class goes into the class hash */
+
+	if (tb[TCA_CBQ_LSSOPT])
+		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
+
+	cbq_addprio(q, &q->link);
+	return 0;
+
+put_rtab:
+	qdisc_put_rtab(q->link.R_tab);	/* undo qdisc_get_rtab() on failure */
+	return err;
+}
+
+static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
+{
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+
+	NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* discard whatever was partially written */
+	return -1;
+}
+
+static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
+{
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct tc_cbq_lssopt opt;
+
+	opt.flags = 0;
+	if (cl->borrow == NULL)		/* inverse of the mapping in cbq_set_lss() */
+		opt.flags |= TCF_CBQ_LSS_BOUNDED;
+	if (cl->share == NULL)
+		opt.flags |= TCF_CBQ_LSS_ISOLATED;
+	opt.ewma_log = cl->ewma_log;
+	opt.level = cl->level;
+	opt.avpkt = cl->avpkt;
+	opt.maxidle = cl->maxidle;
+	opt.minidle = (u32)(-cl->minidle);	/* stored negated by cbq_set_lss() */
+	opt.offtime = cl->offtime;
+	opt.change = ~0;
+	NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
+{
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct tc_cbq_wrropt opt;
+
+	memset(&opt, 0, sizeof(opt));	/* no stale stack bytes may reach userspace */
+	opt.allot = cl->allot;
+	opt.priority = cl->priority + 1;	/* stored 0-based, reported 1-based */
+	opt.cpriority = cl->cpriority + 1;
+	opt.weight = cl->weight;
+	NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);	/* discard whatever was partially written */
+	return -1;
+}
+
+static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
+{
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct tc_cbq_ovl opt;
+
+	opt.strategy = cl->ovl_strategy;
+	opt.priority2 = cl->priority2 + 1;	/* stored 0-based, reported 1-based */
+	opt.pad = 0;
+	opt.penalty = cl->penalty;
+	NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
+{
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct tc_cbq_fopt opt;
+
+	if (cl->split || cl->defmap) {	/* attribute is omitted when neither is set */
+		opt.split = cl->split ? cl->split->common.classid : 0;
+		opt.defmap = cl->defmap;
+		opt.defchange = ~0;
+		NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
+	}
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
+{
+	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
+	struct tc_cbq_police opt;
+
+	if (cl->police) {	/* attribute is omitted when police is 0 */
+		opt.police = cl->police;
+		opt.__res1 = 0;
+		opt.__res2 = 0;
+		NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
+	}
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+#endif
+
+static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
+{
+	if (cbq_dump_lss(skb, cl) < 0 ||	/* stops at the first helper that fails */
+	    cbq_dump_rate(skb, cl) < 0 ||
+	    cbq_dump_wrr(skb, cl) < 0 ||
+	    cbq_dump_ovl(skb, cl) < 0 ||
+#ifdef CONFIG_NET_CLS_ACT
+	    cbq_dump_police(skb, cl) < 0 ||
+#endif
+	    cbq_dump_fopt(skb, cl) < 0)
+		return -1;
+	return 0;
+}
+
+static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	if (cbq_dump_attr(skb, &q->link) < 0)	/* the qdisc dump describes the link class */
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static int
+cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+
+	q->link.xstats.avgidle = q->link.avgidle;	/* refresh the snapshot before copying */
+	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
+}
+
+static int
+cbq_dump_class(struct Qdisc *sch, unsigned long arg,
+	       struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct cbq_class *cl = (struct cbq_class *)arg;	/* arg is the class pointer from cbq_get() */
+	struct nlattr *nest;
+
+	if (cl->tparent)
+		tcm->tcm_parent = cl->tparent->common.classid;
+	else
+		tcm->tcm_parent = TC_H_ROOT;	/* a parentless class is reported under the root */
+	tcm->tcm_handle = cl->common.classid;
+	tcm->tcm_info = cl->q->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	if (cbq_dump_attr(skb, cl) < 0)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static int
+cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+	struct gnet_dump *d)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl = (struct cbq_class *)arg;
+
+	cl->qstats.qlen = cl->q->q.qlen;	/* refresh the snapshots before copying */
+	cl->xstats.avgidle = cl->avgidle;
+	cl->xstats.undertime = 0;	/* stays 0 when undertime is PSCHED_PASTPERFECT */
+
+	if (cl->undertime != PSCHED_PASTPERFECT)
+		cl->xstats.undertime = cl->undertime - q->now;	/* relative to q->now */
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
+}
+
+static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct cbq_class *cl = (struct cbq_class *)arg;
+
+	if (new == NULL) {	/* no replacement given: create a default pfifo */
+		new = qdisc_create_dflt(sch->dev_queue,
+					&pfifo_qdisc_ops, cl->common.classid);
+		if (new == NULL)
+			return -ENOBUFS;
+	} else {
+#ifdef CONFIG_NET_CLS_ACT
+		if (cl->police == TC_POLICE_RECLASSIFY)
+			new->reshape_fail = cbq_reshape_fail;
+#endif
+	}
+	sch_tree_lock(sch);
+	*old = cl->q;	/* the previous leaf qdisc is handed back to the caller */
+	cl->q = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct cbq_class *cl = (struct cbq_class *)arg;	/* arg is the class pointer from cbq_get() */
+
+	return cl->q;	/* the qdisc attached to this class */
+}
+
+static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+	struct cbq_class *cl = (struct cbq_class *)arg;
+
+	if (cl->q->q.qlen == 0)	/* the class's qdisc is now empty */
+		cbq_deactivate_class(cl);
+}
+
+static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
+{
+	struct cbq_class *found = cbq_class_lookup(qdisc_priv(sch), classid);
+
+	if (!found)
+		return 0;
+
+	/* The class pointer doubles as the handle; cbq_put() drops the ref. */
+	found->refcnt++;
+	return (unsigned long)found;
+}
+
+static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+
+	WARN_ON(cl->filters);
+
+	tcf_destroy_chain(&cl->filter_list);
+	qdisc_destroy(cl->q);
+	qdisc_put_rtab(cl->R_tab);
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	if (cl != &q->link)	/* q->link is embedded in the qdisc private data */
+		kfree(cl);
+}
+
+static void cbq_destroy(struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct hlist_node *n, *next;
+	struct cbq_class *cl;
+	unsigned int h;
+
+#ifdef CONFIG_NET_CLS_ACT
+	q->rx_class = NULL;
+#endif
+	/*
+	 * Filters must be destroyed first because we don't destroy the
+	 * classes from root to leaves, which means that filters can still
+	 * be bound to classes which have been destroyed already. --TGR '04
+	 */
+	for (h = 0; h < q->clhash.hashsize; h++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
+			tcf_destroy_chain(&cl->filter_list);
+	}
+	for (h = 0; h < q->clhash.hashsize; h++) {	/* second pass: the classes themselves */
+		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
+					  common.hnode)
+			cbq_destroy_class(sch, cl);
+	}
+	qdisc_class_hash_destroy(&q->clhash);
+}
+
+/*
+ * Class op: drop one reference on the class in @arg.  When the last
+ * reference goes away the class is destroyed; with CONFIG_NET_CLS_ACT
+ * the scheduler's rx_class pointer is cleared first (under the root
+ * lock) if it still refers to this class.
+ */
+static void cbq_put(struct Qdisc *sch, unsigned long arg)
+{
+	struct cbq_class *cl = (struct cbq_class *)arg;
+
+	if (--cl->refcnt != 0)
+		return;
+
+#ifdef CONFIG_NET_CLS_ACT
+	{
+		struct cbq_sched_data *priv = qdisc_priv(sch);
+		spinlock_t *lock = qdisc_root_sleeping_lock(sch);
+
+		spin_lock_bh(lock);
+		if (priv->rx_class == cl)
+			priv->rx_class = NULL;
+		spin_unlock_bh(lock);
+	}
+#endif
+
+	cbq_destroy_class(sch, cl);
+}
+
+/*
+ * Class op: modify an existing class or create a new one.
+ *
+ * @sch:      the CBQ qdisc
+ * @classid:  handle for a new class; 0 asks for one to be generated
+ * @parentid: handle of the parent class
+ * @tca:      top-level attributes; TCA_OPTIONS is mandatory, TCA_RATE
+ *            (rate estimator) is optional
+ * @arg:      in: existing class, or 0 to create one; out: on creation,
+ *            the new class
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
+		 unsigned long *arg)
+{
+	int err;
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl = (struct cbq_class *)*arg;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_CBQ_MAX + 1];
+	struct cbq_class *parent;
+	struct qdisc_rate_table *rtab = NULL;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
+	if (err < 0)
+		return err;
+
+	/* ---- Change path: the class already exists ---- */
+	if (cl) {
+		/* Check parent */
+		if (parentid) {
+			if (cl->tparent &&
+			    cl->tparent->common.classid != parentid)
+				return -EINVAL;
+			if (!cl->tparent && parentid != TC_H_ROOT)
+				return -EINVAL;
+		}
+
+		/* Acquire the new rate table before taking the tree lock */
+		if (tb[TCA_CBQ_RATE]) {
+			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
+					      tb[TCA_CBQ_RTAB]);
+			if (rtab == NULL)
+				return -EINVAL;
+		}
+
+		if (tca[TCA_RATE]) {
+			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+						    qdisc_root_sleeping_lock(sch),
+						    tca[TCA_RATE]);
+			if (err) {
+				/* drop the rate table obtained above */
+				if (rtab)
+					qdisc_put_rtab(rtab);
+				return err;
+			}
+		}
+
+		/* Change class parameters */
+		sch_tree_lock(sch);
+
+		/* Take the class off the active list while it is modified */
+		if (cl->next_alive != NULL)
+			cbq_deactivate_class(cl);
+
+		if (rtab) {
+			qdisc_put_rtab(cl->R_tab);
+			cl->R_tab = rtab;
+		}
+
+		if (tb[TCA_CBQ_LSSOPT])
+			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
+
+		if (tb[TCA_CBQ_WRROPT]) {
+			cbq_rmprio(q, cl);
+			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
+		}
+
+		if (tb[TCA_CBQ_OVL_STRATEGY])
+			cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
+
+#ifdef CONFIG_NET_CLS_ACT
+		if (tb[TCA_CBQ_POLICE])
+			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
+#endif
+
+		if (tb[TCA_CBQ_FOPT])
+			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
+
+		/* Re-activate only if the class has packets queued */
+		if (cl->q->q.qlen)
+			cbq_activate_class(cl);
+
+		sch_tree_unlock(sch);
+
+		return 0;
+	}
+
+	/* ---- Creation path ---- */
+	if (parentid == TC_H_ROOT)
+		return -EINVAL;
+
+	/* WRR, rate and LSS options are mandatory for a new class */
+	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
+	    tb[TCA_CBQ_LSSOPT] == NULL)
+		return -EINVAL;
+
+	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
+	if (rtab == NULL)
+		return -EINVAL;
+
+	if (classid) {
+		/* Caller-chosen id: must share our major and be unused */
+		err = -EINVAL;
+		if (TC_H_MAJ(classid ^ sch->handle) ||
+		    cbq_class_lookup(q, classid))
+			goto failure;
+	} else {
+		/*
+		 * Generate an id: 0x8000 | hgenerator, where hgenerator
+		 * cycles through 1..0x7fff until an unused id is found.
+		 */
+		int i;
+		classid = TC_H_MAKE(sch->handle, 0x8000);
+
+		for (i = 0; i < 0x8000; i++) {
+			if (++q->hgenerator >= 0x8000)
+				q->hgenerator = 1;
+			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
+				break;
+		}
+		err = -ENOSR;
+		if (i >= 0x8000)
+			goto failure;
+		classid = classid|q->hgenerator;
+	}
+
+	/* parentid == 0 attaches the class directly below the link class */
+	parent = &q->link;
+	if (parentid) {
+		parent = cbq_class_lookup(q, parentid);
+		err = -EINVAL;
+		if (parent == NULL)
+			goto failure;
+	}
+
+	err = -ENOBUFS;
+	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+	if (cl == NULL)
+		goto failure;
+
+	if (tca[TCA_RATE]) {
+		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+					qdisc_root_sleeping_lock(sch),
+					tca[TCA_RATE]);
+		if (err) {
+			kfree(cl);
+			goto failure;
+		}
+	}
+
+	/* The class now owns the rate table reference */
+	cl->R_tab = rtab;
+	rtab = NULL;
+	cl->refcnt = 1;
+	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+	/* Fall back to the noop qdisc if no pfifo could be created */
+	if (!cl->q)
+		cl->q = &noop_qdisc;
+	cl->common.classid = classid;
+	cl->tparent = parent;
+	cl->qdisc = sch;
+	cl->allot = parent->allot;
+	cl->quantum = cl->allot;
+	cl->weight = cl->R_tab->rate.rate;
+
+	sch_tree_lock(sch);
+	cbq_link_class(cl);
+	cl->borrow = cl->tparent;
+	if (cl->tparent != &q->link)
+		cl->share = cl->tparent;
+	cbq_adjust_levels(parent);
+	cl->minidle = -0x7FFFFFFF;
+	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
+	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
+	/* Values still zero after cbq_set_lss() are taken from the link class */
+	if (cl->ewma_log == 0)
+		cl->ewma_log = q->link.ewma_log;
+	if (cl->maxidle == 0)
+		cl->maxidle = q->link.maxidle;
+	if (cl->avpkt == 0)
+		cl->avpkt = q->link.avpkt;
+	cl->overlimit = cbq_ovl_classic;
+	if (tb[TCA_CBQ_OVL_STRATEGY])
+		cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
+#ifdef CONFIG_NET_CLS_ACT
+	if (tb[TCA_CBQ_POLICE])
+		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
+#endif
+	if (tb[TCA_CBQ_FOPT])
+		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
+	sch_tree_unlock(sch);
+
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	*arg = (unsigned long)cl;
+	return 0;
+
+failure:
+	/* Every jump here happens while rtab still holds the reference */
+	qdisc_put_rtab(rtab);
+	return err;
+}
+
+/*
+ * Class op: detach the class in @arg from the scheduler.
+ *
+ * Returns -EBUSY if filters are still bound to the class, if it has
+ * children, or if it is the link class; 0 otherwise.  The class is
+ * only unlinked here; it is not freed (see the comment at the end).
+ */
+static int cbq_delete(struct Qdisc *sch, unsigned long arg)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl = (struct cbq_class *)arg;
+	unsigned int qlen;
+
+	if (cl->filters || cl->children || cl == &q->link)
+		return -EBUSY;
+
+	sch_tree_lock(sch);
+
+	/* Sample the queue length before the reset so it can be reported */
+	qlen = cl->q->q.qlen;
+	qdisc_reset(cl->q);
+	qdisc_tree_decrease_qlen(cl->q, qlen);
+
+	if (cl->next_alive)
+		cbq_deactivate_class(cl);
+
+	/* Clear scheduler pointers that still refer to this class */
+	if (q->tx_borrowed == cl)
+		q->tx_borrowed = q->tx_class;
+	if (q->tx_class == cl) {
+		q->tx_class = NULL;
+		q->tx_borrowed = NULL;
+	}
+#ifdef CONFIG_NET_CLS_ACT
+	if (q->rx_class == cl)
+		q->rx_class = NULL;
+#endif
+
+	cbq_unlink_class(cl);
+	cbq_adjust_levels(cl->tparent);
+	cl->defmap = 0;
+	cbq_sync_defmap(cl);
+
+	cbq_rmprio(q, cl);
+	sch_tree_unlock(sch);
+
+	/* NOTE: the decrement is a required side effect of this BUG_ON() */
+	BUG_ON(--cl->refcnt == 0);
+	/*
+	 * This shouldn't happen: we "hold" one cops->get() when called
+	 * from tc_ctl_tclass; the destroy method is done from cops->put().
+	 */
+
+	return 0;
+}
+
+/*
+ * Class op: return the filter list of the class in @arg, or that of
+ * the link class when @arg is 0.
+ */
+static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	struct cbq_sched_data *priv = qdisc_priv(sch);
+	struct cbq_class *owner = (struct cbq_class *)arg;
+
+	return owner ? &owner->filter_list : &priv->link.filter_list;
+}
+
+/*
+ * Class op: bind a filter to the class identified by @classid.
+ *
+ * Returns the class (as unsigned long) with its filter count bumped,
+ * or 0 if the class does not exist or if @parent is set and its level
+ * is not above the level of the target class.
+ */
+static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
+				     u32 classid)
+{
+	struct cbq_class *from = (struct cbq_class *)parent;
+	struct cbq_class *target = cbq_class_lookup(qdisc_priv(sch), classid);
+
+	if (target == NULL)
+		return 0;
+	if (from != NULL && from->level <= target->level)
+		return 0;
+
+	target->filters++;
+	return (unsigned long)target;
+}
+
+/* Class op: undo cbq_bind_filter() by dropping the class's filter count. */
+static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+	((struct cbq_class *)arg)->filters--;
+}
+
+/*
+ * Class op: iterate over all classes in the class hash.
+ *
+ * The first arg->skip classes are only counted; for every later class
+ * arg->fn() is invoked.  A negative return from the callback sets
+ * arg->stop and ends the walk without counting that class.
+ */
+static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct cbq_sched_data *priv = qdisc_priv(sch);
+	struct cbq_class *klass;
+	struct hlist_node *pos;
+	unsigned int bucket;
+
+	if (arg->stop)
+		return;
+
+	for (bucket = 0; bucket < priv->clhash.hashsize; bucket++) {
+		hlist_for_each_entry(klass, pos, &priv->clhash.hash[bucket],
+				     common.hnode) {
+			if (arg->count >= arg->skip &&
+			    arg->fn(sch, (unsigned long)klass, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/* Class operations of the CBQ qdisc, referenced from cbq_qdisc_ops.cl_ops */
+static const struct Qdisc_class_ops cbq_class_ops = {
+	.graft		=	cbq_graft,
+	.leaf		=	cbq_leaf,
+	.qlen_notify	=	cbq_qlen_notify,
+	.get		=	cbq_get,
+	.put		=	cbq_put,
+	.change		=	cbq_change_class,
+	.delete		=	cbq_delete,
+	.walk		=	cbq_walk,
+	.tcf_chain	=	cbq_find_tcf,
+	.bind_tcf	=	cbq_bind_filter,
+	.unbind_tcf	=	cbq_unbind_filter,
+	.dump		=	cbq_dump_class,
+	.dump_stats	=	cbq_dump_class_stats,
+};
+
+/*
+ * Qdisc operations registered under the id "cbq".  No qdisc-level
+ * .change handler is provided (.change is NULL).
+ */
+static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&cbq_class_ops,
+	.id		=	"cbq",
+	.priv_size	=	sizeof(struct cbq_sched_data),
+	.enqueue	=	cbq_enqueue,
+	.dequeue	=	cbq_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	cbq_drop,
+	.init		=	cbq_init,
+	.reset		=	cbq_reset,
+	.destroy	=	cbq_destroy,
+	.change		=	NULL,
+	.dump		=	cbq_dump,
+	.dump_stats	=	cbq_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module load: register the "cbq" qdisc; returns register_qdisc()'s result */
+static int __init cbq_module_init(void)
+{
+	return register_qdisc(&cbq_qdisc_ops);
+}
+/* Module unload: unregister the "cbq" qdisc again */
+static void __exit cbq_module_exit(void)
+{
+	unregister_qdisc(&cbq_qdisc_ops);
+}
+module_init(cbq_module_init)
+module_exit(cbq_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
new file mode 100644
index 00000000..178ee831
--- /dev/null
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,687 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+/*
+   CHOKe stateless AQM for fair bandwidth allocation
+   =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+   needs to access packets in queue randomly. It has a minimal class
+   interface to allow overriding the builtin flow classifier with
+   filters.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE	(128*1024 - 1)
+
+/*
+ * Private data of a CHOKe qdisc.  Packets are kept in a ring of skb
+ * pointers (tab) indexed by head/tail; a NULL slot between head and
+ * tail is a "hole" left by a dropped packet.
+ */
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;		/* enqueue admits only while qlen < limit */
+	unsigned char	 flags;		/* TC_RED_* flags (ECN, HARDDROP) */
+
+	struct red_parms parms;		/* RED parameters and averaging state */
+
+/* Variables */
+	struct tcf_proto *filter_list;	/* optional external classifiers */
+	struct {
+		u32	prob_drop;	/* Early probability drops */
+		u32	prob_mark;	/* Early probability marks */
+		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
+		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
+		u32	pdrop;          /* Drops due to queue limits */
+		u32	other;          /* Drops due to drop() calls */
+		u32	matched;	/* Drops to flow match */
+	} stats;
+
+	unsigned int	 head;		/* slot read by dequeue */
+	unsigned int	 tail;		/* slot written by the next enqueue */
+
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;		/* the ring itself, tab_mask + 1 slots */
+};
+
+/* Scale a fresh random32() value into the range 0 .. N - 1 */
+static u32 random_N(unsigned int N)
+{
+	u32 rnd = random32();
+
+	return reciprocal_divide(rnd, N);
+}
+
+/* Ring distance from head to tail: queued packets plus any holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+	unsigned int span = q->tail - q->head;
+
+	return span & q->tab_mask;
+}
+
+/*
+ * Is ECN parameter configured?
+ * Returns the raw TC_RED_ECN bit of q->flags (non-zero when set), not 0/1.
+ */
+static int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/*
+ * Should packets over max just be dropped (versus marked)?
+ * Returns the raw TC_RED_HARDDROP bit of q->flags (non-zero when set).
+ */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+/*
+ * Advance head by at least one slot, then keep going while it points
+ * at a hole; stops early once head catches up with tail.
+ */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	for (;;) {
+		q->head = (q->head + 1) & q->tab_mask;
+		if (q->head == q->tail || q->tab[q->head] != NULL)
+			break;
+	}
+}
+
+/*
+ * Pull tail back by at least one slot, then keep going while it points
+ * at a hole so those slots get reused; stops once tail meets head.
+ */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	for (;;) {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		if (q->head == q->tail || q->tab[q->tail] != NULL)
+			break;
+	}
+}
+
+/*
+ * Drop packet from queue array by creating a "hole"
+ *
+ * The slot @idx is cleared first.  If @idx is the head slot the head
+ * is moved forward over holes; if @idx then equals the tail index the
+ * tail is moved backwards over holes.  Afterwards the backlog, the
+ * drop counter (via qdisc_drop()) and the queue lengths are updated.
+ */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = q->tab[idx];
+
+	q->tab[idx] = NULL;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_drop(skb, sch);
+	/* presumably propagates the length change to parent qdiscs - confirm */
+	qdisc_tree_decrease_qlen(sch, 1);
+	--sch->q.qlen;
+}
+
+/*
+ * Compare flow of two packets
+ *  Returns true only if source and destination address and port match.
+ *          false for special cases
+ *
+ * Only IPv4 and IPv6 packets can match; any other protocol yields
+ * false.  If proto_ports_offset() reports no port field for the
+ * transport protocol (negative offset), equal addresses and protocol
+ * are sufficient for a match.
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+			     struct sk_buff *skb2)
+{
+	int off1, off2, poff;
+	const u32 *ports1, *ports2;
+	u8 ip_proto;
+	__u32 hash1;
+
+	if (skb1->protocol != skb2->protocol)
+		return false;
+
+	/* Use hash value as quick check
+	 * Assumes that __skb_get_rxhash makes IP header and ports linear
+	 */
+	hash1 = skb_get_rxhash(skb1);
+	if (!hash1 || hash1 != skb_get_rxhash(skb2))
+		return false;
+
+	/* Probably match, but be sure to avoid hash collisions */
+	off1 = skb_network_offset(skb1);
+	off2 = skb_network_offset(skb2);
+
+	switch (skb1->protocol) {
+	case __constant_htons(ETH_P_IP): {
+		const struct iphdr *ip1, *ip2;
+
+		ip1 = (const struct iphdr *) (skb1->data + off1);
+		ip2 = (const struct iphdr *) (skb2->data + off2);
+
+		ip_proto = ip1->protocol;
+		if (ip_proto != ip2->protocol ||
+		    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
+			return false;
+
+		/* Fragments: compare as protocol 0 instead of the real one */
+		if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		/* ihl counts 32-bit words */
+		off1 += ip1->ihl * 4;
+		off2 += ip2->ihl * 4;
+		break;
+	}
+
+	case __constant_htons(ETH_P_IPV6): {
+		const struct ipv6hdr *ip1, *ip2;
+
+		ip1 = (const struct ipv6hdr *) (skb1->data + off1);
+		ip2 = (const struct ipv6hdr *) (skb2->data + off2);
+
+		ip_proto = ip1->nexthdr;
+		if (ip_proto != ip2->nexthdr ||
+		    ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
+		    ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
+			return false;
+		/* Skip the fixed 40-byte IPv6 header */
+		off1 += 40;
+		off2 += 40;
+		/*
+		 * This break is required: without it the case falls into
+		 * "default" and IPv6 flows can never match.
+		 */
+		break;
+	}
+
+	default: /* Maybe compare MAC header here? */
+		return false;
+	}
+
+	poff = proto_ports_offset(ip_proto);
+	if (poff < 0)
+		return true;
+
+	off1 += poff;
+	off2 += poff;
+
+	/* Source and destination port are compared as one 32-bit word */
+	ports1 = (__force u32 *)(skb1->data + off1);
+	ports2 = (__force u32 *)(skb2->data + off2);
+	return *ports1 == *ports2;
+}
+
+/* Per-packet state that choke_skb_cb() overlays on the qdisc cb data area */
+struct choke_skb_cb {
+	u16 classid;	/* classification result recorded by choke_classify() */
+};
+
+/*
+ * Return the CHOKe private area inside @skb's qdisc control block.
+ * qdisc_cb_private_validate() is given the size of the private struct;
+ * presumably it checks that the struct fits - confirm in sch_generic.h.
+ */
+static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
+	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/* Record @classid in the CHOKe private area of @skb */
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+	struct choke_skb_cb *cb = choke_skb_cb(skb);
+
+	cb->classid = classid;
+}
+
+/* Read back the class id stored by choke_set_classid() */
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+	const struct choke_skb_cb *cb = choke_skb_cb(skb);
+
+	return cb->classid;
+}
+
+/*
+ * Classify a packet with the attached TC filters.
+ *
+ * On a match the minor part of the resulting class id is stored in
+ * the skb and true is returned.  false is returned when no filter
+ * matched (tc_classify() < 0) or, with CONFIG_NET_CLS_ACT, when an
+ * action stole, queued or shot the packet; for stolen/queued packets
+ * *qerr is set to NET_XMIT_SUCCESS | __NET_XMIT_STOLEN.
+ */
+static bool choke_classify(struct sk_buff *skb,
+			   struct Qdisc *sch, int *qerr)
+
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+			/* fall through */
+		case TC_ACT_SHOT:
+			return false;
+		}
+#endif
+		choke_set_classid(skb, TC_H_MIN(res.classid));
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Pick a packet at random from the ring and report its slot in *pidx.
+ * HACK: earlier drops leave holes, so up to three random slots are
+ *   tried; if all of them are holes the head slot is used instead.
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+					 unsigned int *pidx)
+{
+	int attempt;
+
+	for (attempt = 0; attempt < 3; attempt++) {
+		struct sk_buff *candidate;
+
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		candidate = q->tab[*pidx];
+		if (candidate)
+			return candidate;
+	}
+
+	*pidx = q->head;
+	return q->tab[q->head];
+}
+
+/*
+ * Draw a random queued packet and check whether @nskb belongs to the
+ * same flow.  With filters attached the recorded class ids are
+ * compared, otherwise the packet headers.
+ * Returns true on a match; *pidx then holds the slot of the drawn
+ * packet.  An empty queue never matches.
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+			       struct sk_buff *nskb,
+			       unsigned int *pidx)
+{
+	struct sk_buff *picked;
+
+	if (q->head == q->tail)
+		return false;
+
+	picked = choke_peek_random(q, pidx);
+	if (!q->filter_list)
+		return choke_match_flow(picked, nskb);
+
+	return choke_get_classid(nskb) == choke_get_classid(picked);
+}
+
+/*
+ * Qdisc enqueue op.
+ *
+ * Returns NET_XMIT_SUCCESS if @skb was stored in the ring,
+ * NET_XMIT_DROP if the queue limit was hit, NET_XMIT_CN if the packet
+ * was dropped by the CHOKe match or by RED, or the code set up by
+ * choke_classify() if the filters rejected the packet.  On every
+ * non-success path the skb is freed here.
+ */
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (q->filter_list) {
+		/* If using external classifiers, get result and record it. */
+		if (!choke_classify(skb, sch, &ret))
+			goto other_drop;	/* Packet was eaten by filter */
+	}
+
+	/* Compute average queue usage (see RED) */
+	p->qavg = red_calc_qavg(p, sch->q.qlen);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		unsigned int idx;
+
+		/* Draw a packet at random from queue and compare flow */
+		if (choke_match_random(q, skb, &idx)) {
+			/* Same flow: drop the queued packet and the new one */
+			q->stats.matched++;
+			choke_drop_by_idx(sch, idx);
+			goto congestion_drop;
+		}
+
+		/* Queue is large, always mark/drop */
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			/* Drop unless ECN marking is allowed and succeeds */
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		} else if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (sch->q.qlen < q->limit) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+		++sch->q.qlen;
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		return NET_XMIT_SUCCESS;
+	}
+
+	/* Hard limit reached */
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+
+ other_drop:
+	/* Count a drop only if choke_classify() left the BYPASS flag set */
+	if (ret & __NET_XMIT_BYPASS)
+		sch->qstats.drops++;
+	kfree_skb(skb);
+	return ret;
+}
+
+/*
+ * Qdisc dequeue op: remove and return the packet in the head slot.
+ * Returns NULL when the ring is empty (head == tail); in that case a
+ * RED idle period is started unless one is already running.
+ */
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL;
+	/* Advance head past the slot just emptied and any holes behind it */
+	choke_zap_head_holes(q);
+	--sch->q.qlen;
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
+
+	return skb;
+}
+
+/*
+ * Qdisc drop op: delegate to qdisc_queue_drop() and return its result.
+ * A non-zero result is counted in stats.other; a zero result starts a
+ * RED idle period unless one is already running.
+ * NOTE(review): qdisc_queue_drop() works on sch->q while CHOKe keeps
+ * its packets in q->tab - confirm this can ever drop anything.
+ */
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int dropped = qdisc_queue_drop(sch);
+
+	if (dropped == 0) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return dropped;
+	}
+
+	q->stats.other++;
+	return dropped;
+}
+
+/*
+ * Qdisc reset op: discard every queued packet and restart RED state.
+ *
+ * Packets are stored in q->tab (see choke_enqueue()), so they have to
+ * be released here.  Restarting RED alone would leave the skbs in the
+ * ring with head != tail, so a later dequeue would still hand them out.
+ * Packets are freed with kfree_skb() and not counted as drops.
+ */
+static void choke_reset(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	/* head == tail also covers a ring that was never allocated */
+	while (q->head != q->tail) {
+		struct sk_buff *skb = q->tab[q->head];
+
+		q->tab[q->head] = NULL;
+		q->head = (q->head + 1) & q->tab_mask;
+		if (!skb)
+			continue;	/* hole left by an earlier drop */
+		sch->qstats.backlog -= qdisc_pkt_len(skb);
+		--sch->q.qlen;
+		kfree_skb(skb);
+	}
+	q->head = 0;
+	q->tail = 0;
+
+	red_restart(&q->parms);
+}
+
+/* Netlink policy used by choke_change() to parse the nested options */
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+
+/*
+ * Free a table allocated by choke_change(), which may come from either
+ * the slab allocator or vmalloc.  NULL is accepted and ignored.
+ */
+static void choke_free(void *addr)
+{
+	if (!addr)
+		return;
+
+	if (is_vmalloc_addr(addr)) {
+		vfree(addr);
+		return;
+	}
+	kfree(addr);
+}
+
+/*
+ * Qdisc change op (also used for init): apply a new configuration.
+ *
+ * @opt must carry both TCA_CHOKE_PARMS and TCA_CHOKE_STAB.  If the
+ * required ring size differs from the current one a new ring is
+ * allocated and the queued packets are moved over; packets that do
+ * not fit are dropped.
+ *
+ * Returns 0, -EINVAL for missing/invalid attributes or a limit above
+ * CHOKE_MAX_QUEUE, -ENOMEM if no ring could be allocated, or the
+ * error from nla_parse_nested().
+ */
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CHOKE_MAX + 1];
+	const struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CHOKE_PARMS] == NULL ||
+	    tb[TCA_CHOKE_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+	if (ctl->limit > CHOKE_MAX_QUEUE)
+		return -EINVAL;
+
+	/* Ring size is the next power of two above limit; mask = size - 1 */
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	if (mask != q->tab_mask) {
+		struct sk_buff **ntab;
+
+		/* Try the slab first, fall back to vmalloc */
+		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int oqlen = sch->q.qlen, tail = 0;
+
+			/* Compact packets into the new ring, skipping holes */
+			while (q->head != q->tail) {
+				struct sk_buff *skb = q->tab[q->head];
+
+				q->head = (q->head + 1) & q->tab_mask;
+				if (!skb)
+					continue;
+				if (tail < mask) {
+					ntab[tail++] = skb;
+					continue;
+				}
+				/* New ring is full: drop the remainder */
+				sch->qstats.backlog -= qdisc_pkt_len(skb);
+				--sch->q.qlen;
+				qdisc_drop(skb, sch);
+			}
+			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+			q->head = 0;
+			q->tail = tail;
+		}
+
+		q->tab_mask = mask;
+		q->tab = ntab;
+	} else
+		sch_tree_lock(sch);
+
+	/* Both branches above leave the tree lock held */
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_CHOKE_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	/* Free the replaced ring (NULL if the ring was not replaced) */
+	choke_free(old);
+	return 0;
+}
+
+/* Qdisc init op: initial configuration is handled entirely by choke_change() */
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+/*
+ * Qdisc dump op: emit the current configuration as a TCA_CHOKE_PARMS
+ * attribute nested in TCA_OPTIONS.  Returns the result of
+ * nla_nest_end() on success or -EMSGSIZE on failure.
+ */
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	/* Thresholds are reported shifted right by Wlog */
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	/* NLA_PUT presumably jumps to nla_put_failure on error - confirm */
+	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+/*
+ * Qdisc dump_stats op: export the CHOKe counters as tc_choke_xstats.
+ * "early" and "marked" each sum the probabilistic and forced counters.
+ */
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_choke_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.matched = q->stats.matched,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+/*
+ * Qdisc destroy op: release the filter chain and the packet ring.
+ * Only the ring array is freed here, not the skbs it may point to.
+ */
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	choke_free(q->tab);
+}
+
+/* Class op stub: CHOKe has no child qdiscs, so there is never a leaf */
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+/* Class op stub: no class is ever found, always returns 0 */
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+/* Class op stub: nothing to release (also installed as .unbind_tcf) */
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+/* Class op stub: filters are never bound to a class, always returns 0 */
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+				u32 classid)
+{
+	return 0;
+}
+
+/*
+ * Class op: the qdisc-wide filter list is returned for @cl == 0;
+ * any other class has no filter chain (NULL).
+ */
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct choke_sched_data *priv = qdisc_priv(sch);
+
+	return cl ? NULL : &priv->filter_list;
+}
+
+/* Class dump op: ORs TC_H_MIN(@cl) into tcm->tcm_handle; always returns 0 */
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+/*
+ * Class walk op: reports exactly one pseudo class (id 1) to arg->fn().
+ * A negative callback result sets arg->stop; otherwise the walker's
+ * count is incremented.  Nothing happens if arg->stop is already set.
+ */
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	if (arg->stop)
+		return;
+
+	if (arg->fn(sch, 1, arg) < 0) {
+		arg->stop = 1;
+		return;
+	}
+	arg->count++;
+}
+
+/*
+ * Minimal class interface; per the file header it exists so filters
+ * can override the builtin flow classifier.
+ * NOTE(review): this table is not referenced by choke_qdisc_ops
+ * (no .cl_ops entry there) - confirm whether that is intended.
+ */
+static const struct Qdisc_class_ops choke_class_ops = {
+	.leaf		=	choke_leaf,
+	.get		=	choke_get,
+	.put		=	choke_put,
+	.tcf_chain	=	choke_find_tcf,
+	.bind_tcf	=	choke_bind,
+	.unbind_tcf	=	choke_put,
+	.dump		=	choke_dump_class,
+	.walk		=	choke_walk,
+};
+
+/* Qdisc peek op: packet in the head slot, or NULL if the ring is empty */
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+	struct choke_sched_data *priv = qdisc_priv(sch);
+
+	if (priv->head == priv->tail)
+		return NULL;
+
+	return priv->tab[priv->head];
+}
+
+/* Qdisc operations registered under the id "choke" */
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	choke_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module load: register the "choke" qdisc; returns register_qdisc()'s result */
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+/* Module unload: unregister the "choke" qdisc again */
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
new file mode 100644
index 00000000..6b7fe4a8
--- /dev/null
+++ b/net/sched/sch_drr.c
@@ -0,0 +1,525 @@
+/*
+ * net/sched/sch_drr.c         Deficit Round Robin scheduler
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+/* One DRR class: a child qdisc plus its scheduling state and statistics */
+struct drr_class {
+	struct Qdisc_class_common	common;	/* class id and hash linkage */
+	unsigned int			refcnt;	/* get/put reference count */
+	unsigned int			filter_cnt;	/* filters bound to this class */
+
+	struct gnet_stats_basic_packed		bstats;
+	struct gnet_stats_queue		qstats;
+	struct gnet_stats_rate_est	rate_est;
+	struct list_head		alist;	/* link in drr_sched.active */
+	struct Qdisc			*qdisc;	/* attached child qdisc */
+
+	u32				quantum;	/* configured via TCA_DRR_QUANTUM */
+	u32				deficit;	/* reported as xstats.deficit */
+};
+
+/* Private data of a DRR qdisc */
+struct drr_sched {
+	struct list_head		active;	/* classes linked through alist */
+	struct tcf_proto		*filter_list;	/* classifiers used by drr_classify() */
+	struct Qdisc_class_hash		clhash;	/* all classes, keyed by class id */
+};
+
+/* Look up @classid in the class hash; returns NULL if it is not present */
+static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
+{
+	struct drr_sched *priv = qdisc_priv(sch);
+	struct Qdisc_class_common *hit;
+
+	hit = qdisc_class_find(&priv->clhash, classid);
+
+	return hit ? container_of(hit, struct drr_class, common) : NULL;
+}
+
+/*
+ * Reset the class's child qdisc and report the number of packets it
+ * held (sampled before the reset) via qdisc_tree_decrease_qlen().
+ */
+static void drr_purge_queue(struct drr_class *cl)
+{
+	struct Qdisc *child = cl->qdisc;
+	unsigned int queued = child->q.qlen;
+
+	qdisc_reset(child);
+	qdisc_tree_decrease_qlen(child, queued);
+}
+
+/* Netlink policy for the nested class options: quantum is a u32 */
+static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
+	[TCA_DRR_QUANTUM]	= { .type = NLA_U32 },
+};
+
+/*
+ * Class op: change the class passed in *@arg, or create a new class
+ * with id @classid when *@arg is 0.
+ *
+ * TCA_OPTIONS is mandatory.  A quantum of 0 is rejected; without
+ * TCA_DRR_QUANTUM a new class gets psched_mtu() of the device and an
+ * existing class keeps its quantum.  On creation *@arg is set to the
+ * new class.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+			    struct nlattr **tca, unsigned long *arg)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl = (struct drr_class *)*arg;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_DRR_MAX + 1];
+	u32 quantum;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_DRR_QUANTUM]) {
+		quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
+		if (quantum == 0)
+			return -EINVAL;
+	} else
+		quantum = psched_mtu(qdisc_dev(sch));
+
+	/* Change path: the class already exists */
+	if (cl != NULL) {
+		if (tca[TCA_RATE]) {
+			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+						    qdisc_root_sleeping_lock(sch),
+						    tca[TCA_RATE]);
+			if (err)
+				return err;
+		}
+
+		sch_tree_lock(sch);
+		/* Only an explicitly supplied quantum overrides the old one */
+		if (tb[TCA_DRR_QUANTUM])
+			cl->quantum = quantum;
+		sch_tree_unlock(sch);
+
+		return 0;
+	}
+
+	/* Creation path */
+	cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
+	if (cl == NULL)
+		return -ENOBUFS;
+
+	cl->refcnt	   = 1;
+	cl->common.classid = classid;
+	cl->quantum	   = quantum;
+	cl->qdisc	   = qdisc_create_dflt(sch->dev_queue,
+					       &pfifo_qdisc_ops, classid);
+	/* Fall back to the noop qdisc if no pfifo could be created */
+	if (cl->qdisc == NULL)
+		cl->qdisc = &noop_qdisc;
+
+	if (tca[TCA_RATE]) {
+		err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+					    qdisc_root_sleeping_lock(sch),
+					    tca[TCA_RATE]);
+		if (err) {
+			qdisc_destroy(cl->qdisc);
+			kfree(cl);
+			return err;
+		}
+	}
+
+	sch_tree_lock(sch);
+	qdisc_class_hash_insert(&q->clhash, &cl->common);
+	sch_tree_unlock(sch);
+
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	*arg = (unsigned long)cl;
+	return 0;
+}
+
+/* Free a class: kill its rate estimator, destroy its qdisc, free the memory */
+static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
+{
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	qdisc_destroy(cl->qdisc);
+	kfree(cl);
+}
+
+/*
+ * Class op: remove the class in @arg from the scheduler.
+ * Returns -EBUSY while filters are bound to the class, 0 otherwise.
+ * The class is purged and unhashed here but not freed (see the comment
+ * below).
+ */
+static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	if (cl->filter_cnt > 0)
+		return -EBUSY;
+
+	sch_tree_lock(sch);
+
+	drr_purge_queue(cl);
+	qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+	/* NOTE: the decrement is a required side effect of this BUG_ON() */
+	BUG_ON(--cl->refcnt == 0);
+	/*
+	 * This shouldn't happen: we "hold" one cops->get() when called
+	 * from tc_ctl_tclass; the destroy method is done from cops->put().
+	 */
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/*
+ * Class op: look up @classid and take a reference on the class.
+ * Returns the class as unsigned long, or 0 if it does not exist.
+ */
+static unsigned long drr_get_class(struct Qdisc *sch, u32 classid)
+{
+	struct drr_class *found = drr_find_class(sch, classid);
+
+	if (found == NULL)
+		return 0;
+
+	found->refcnt++;
+	return (unsigned long)found;
+}
+
+/* Class op: drop one reference; the last one destroys the class */
+static void drr_put_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct drr_class *klass = (struct drr_class *)arg;
+
+	if (--klass->refcnt != 0)
+		return;
+
+	drr_destroy_class(sch, klass);
+}
+
+/*
+ * Class op: the qdisc-wide filter list is returned for @cl == 0;
+ * individual classes have no filter chain (NULL).
+ */
+static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
+{
+	struct drr_sched *priv = qdisc_priv(sch);
+
+	return cl ? NULL : &priv->filter_list;
+}
+
+/*
+ * Class op: bind a filter to the class with id @classid.
+ * Returns the class (filter count bumped) or 0 if it does not exist.
+ */
+static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
+				  u32 classid)
+{
+	struct drr_class *target = drr_find_class(sch, classid);
+
+	if (target == NULL)
+		return 0;
+
+	target->filter_cnt++;
+	return (unsigned long)target;
+}
+
+/* Class op: undo drr_bind_tcf() by dropping the class's filter count */
+static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	((struct drr_class *)arg)->filter_cnt--;
+}
+
+/*
+ * Class op: replace the child qdisc of the class in @arg with @new.
+ * If @new is NULL a default pfifo is created, falling back to the
+ * noop qdisc.  The previous child is purged and handed back through
+ * *@old.  Always returns 0.
+ */
+static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
+			   struct Qdisc *new, struct Qdisc **old)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	if (new == NULL) {
+		new = qdisc_create_dflt(sch->dev_queue,
+					&pfifo_qdisc_ops, cl->common.classid);
+		if (new == NULL)
+			new = &noop_qdisc;
+	}
+
+	sch_tree_lock(sch);
+	/* Purge while cl->qdisc still points at the old child */
+	drr_purge_queue(cl);
+	*old = cl->qdisc;
+	cl->qdisc = new;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Class op: hand back the child qdisc of the class passed in @arg */
+static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return ((struct drr_class *)arg)->qdisc;
+}
+
+/*
+ * Class op: unlink the class in @arg from the list it is on via alist
+ * once its child qdisc holds no packets.
+ */
+static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
+{
+	struct drr_class *klass = (struct drr_class *)arg;
+
+	if (klass->qdisc->q.qlen != 0)
+		return;
+
+	list_del(&klass->alist);
+}
+
+/*
+ * Class dump op: fill @tcm (parent is always TC_H_ROOT) and emit the
+ * quantum nested in TCA_OPTIONS.  Returns the result of nla_nest_end()
+ * on success or -EMSGSIZE on failure.
+ */
+static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+	struct nlattr *nest;
+
+	tcm->tcm_parent	= TC_H_ROOT;
+	tcm->tcm_handle	= cl->common.classid;
+	tcm->tcm_info	= cl->qdisc->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	/* NLA_PUT_U32 presumably jumps to nla_put_failure on error - confirm */
+	NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum);
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Class dump_stats op: copy basic, rate-estimator and queue statistics
+ * followed by the DRR xstats.  Returns -1 if one of the first three
+ * copies fails, otherwise the result of gnet_stats_copy_app().
+ */
+static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+				struct gnet_dump *d)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+	struct tc_drr_stats xstats;
+
+	memset(&xstats, 0, sizeof(xstats));
+	/* Deficit is reported (and qstats.qlen refreshed) only if non-empty */
+	if (cl->qdisc->q.qlen) {
+		xstats.deficit = cl->deficit;
+		cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
+	}
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+/*
+ * Visit every class in the class hash.  The first arg->skip classes are only
+ * counted; for the rest arg->fn is invoked.  A negative return from the
+ * callback sets arg->stop and ends the walk immediately.
+ */
+static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct hlist_node *pos;
+	struct drr_class *cl;
+	unsigned int bucket;
+
+	if (arg->stop)
+		return;
+
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[bucket],
+				     common.hnode) {
+			if (arg->count >= arg->skip &&
+			    arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/*
+ * Pick the class @skb should be queued to.
+ *
+ * If the major part of skb->priority matches this qdisc's handle, the
+ * priority is tried as a class id first.  Otherwise (or if no such class
+ * exists) the attached filters are consulted.
+ *
+ * Returns the class, or NULL when the packet cannot be classified or a
+ * filter action consumed/dropped it; *qerr then holds the NET_XMIT code the
+ * caller should report.
+ */
+static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct tcf_result res;
+	int result;
+
+	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+		cl = drr_find_class(sch, skb->priority);
+		if (cl != NULL)
+			return cl;
+	}
+
+	/* Default verdict for every NULL return below unless overridden. */
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+			/* fall through - both cases end with no class */
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#endif
+		/* Prefer the class cached in the result, else look it up. */
+		cl = (struct drr_class *)res.class;
+		if (cl == NULL)
+			cl = drr_find_class(sch, res.classid);
+		return cl;
+	}
+	return NULL;
+}
+
+/*
+ * Classify @skb and queue it on the chosen class's leaf qdisc.
+ *
+ * Unclassifiable packets are freed here; the code set by drr_classify() is
+ * returned, and a drop is counted only when that code carries
+ * __NET_XMIT_BYPASS.  When the leaf refuses the packet, its error code is
+ * returned unchanged.
+ */
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	int err;
+
+	cl = drr_classify(skb, sch, &err);
+	if (cl == NULL) {
+		if (err & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return err;
+	}
+
+	err = qdisc_enqueue(skb, cl->qdisc);
+	if (unlikely(err != NET_XMIT_SUCCESS)) {
+		if (net_xmit_drop_count(err)) {
+			cl->qstats.drops++;
+			sch->qstats.drops++;
+		}
+		return err;
+	}
+
+	/*
+	 * A leaf length of exactly one after the enqueue is taken to mean the
+	 * class just became backlogged: put it on the active list with a
+	 * fresh deficit.
+	 */
+	if (cl->qdisc->q.qlen == 1) {
+		list_add_tail(&cl->alist, &q->active);
+		cl->deficit = cl->quantum;
+	}
+
+	bstats_update(&cl->bstats, skb);
+
+	sch->q.qlen++;
+	return err;
+}
+
+/*
+ * Deficit round robin dequeue.
+ *
+ * Looks at the class at the head of the active list.  If its head packet
+ * fits into the class deficit, the packet is dequeued and returned;
+ * otherwise the class gains one quantum of deficit and is moved to the tail
+ * of the active list, and the next class is tried.
+ *
+ * Returns NULL when no class is active or the head class has no packet to
+ * peek at.
+ */
+static struct sk_buff *drr_dequeue(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct sk_buff *skb;
+	unsigned int len;
+
+	if (list_empty(&q->active))
+		goto out;
+	while (1) {
+		cl = list_first_entry(&q->active, struct drr_class, alist);
+		skb = cl->qdisc->ops->peek(cl->qdisc);
+		if (skb == NULL)
+			goto out;
+
+		len = qdisc_pkt_len(skb);
+		if (len <= cl->deficit) {
+			cl->deficit -= len;
+			/* Take the packet that was just peeked at. */
+			skb = qdisc_dequeue_peeked(cl->qdisc);
+			if (cl->qdisc->q.qlen == 0)
+				list_del(&cl->alist);
+			qdisc_bstats_update(sch, skb);
+			sch->q.qlen--;
+			return skb;
+		}
+
+		/* Not enough deficit: top it up and rotate to the next class. */
+		cl->deficit += cl->quantum;
+		list_move_tail(&cl->alist, &q->active);
+	}
+out:
+	return NULL;
+}
+
+/*
+ * Ask the active classes, in list order, to drop one packet from their leaf
+ * qdisc.  Stops at the first leaf that reports a non-zero dropped length.
+ *
+ * Returns that length, or 0 when no leaf dropped anything.
+ */
+static unsigned int drr_drop(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	unsigned int len;
+
+	list_for_each_entry(cl, &q->active, alist) {
+		if (cl->qdisc->ops->drop) {
+			len = cl->qdisc->ops->drop(cl->qdisc);
+			if (len > 0) {
+				sch->q.qlen--;
+				/*
+				 * Unlinking inside the iteration is fine here
+				 * because the function returns right after.
+				 */
+				if (cl->qdisc->q.qlen == 0)
+					list_del(&cl->alist);
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * Set up the class hash and the (initially empty) active list.  Returns 0,
+ * or the negative error reported by qdisc_class_hash_init().
+ */
+static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	int ret = qdisc_class_hash_init(&q->clhash);
+
+	if (ret >= 0) {
+		INIT_LIST_HEAD(&q->active);
+		ret = 0;
+	}
+	return ret;
+}
+
+/*
+ * Reset every class: classes whose leaf still holds packets are unlinked
+ * from their alist, then the leaf qdisc is reset.  Finally the qdisc's own
+ * queue length is zeroed.
+ */
+static void drr_reset_qdisc(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct hlist_node *n;
+	unsigned int i;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+			if (cl->qdisc->q.qlen)
+				list_del(&cl->alist);
+			qdisc_reset(cl->qdisc);
+		}
+	}
+	sch->q.qlen = 0;
+}
+
+/*
+ * Tear the qdisc down: destroy the filter chain, then every class in the
+ * class hash, then the hash itself.
+ */
+static void drr_destroy_qdisc(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct hlist_node *n, *next;
+	unsigned int i;
+
+	tcf_destroy_chain(&q->filter_list);
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		/* _safe variant: drr_destroy_class() is called on each entry. */
+		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+					  common.hnode)
+			drr_destroy_class(sch, cl);
+	}
+	qdisc_class_hash_destroy(&q->clhash);
+}
+
+/* Class-level operations of the "drr" qdisc. */
+static const struct Qdisc_class_ops drr_class_ops = {
+	.change		= drr_change_class,
+	.delete		= drr_delete_class,
+	.get		= drr_get_class,
+	.put		= drr_put_class,
+	.tcf_chain	= drr_tcf_chain,
+	.bind_tcf	= drr_bind_tcf,
+	.unbind_tcf	= drr_unbind_tcf,
+	.graft		= drr_graft_class,
+	.leaf		= drr_class_leaf,
+	.qlen_notify	= drr_qlen_notify,
+	.dump		= drr_dump_class,
+	.dump_stats	= drr_dump_class_stats,
+	.walk		= drr_walk,
+};
+
+/* Qdisc operations registered under the id "drr". */
+static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
+	.cl_ops		= &drr_class_ops,
+	.id		= "drr",
+	.priv_size	= sizeof(struct drr_sched),
+	.enqueue	= drr_enqueue,
+	.dequeue	= drr_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.drop		= drr_drop,
+	.init		= drr_init_qdisc,
+	.reset		= drr_reset_qdisc,
+	.destroy	= drr_destroy_qdisc,
+	.owner		= THIS_MODULE,
+};
+
+/* Module entry point: register the "drr" qdisc operations. */
+static int __init drr_init(void)
+{
+	return register_qdisc(&drr_qdisc_ops);
+}
+
+/* Module exit point: unregister the "drr" qdisc operations. */
+static void __exit drr_exit(void)
+{
+	unregister_qdisc(&drr_qdisc_ops);
+}
+
+module_init(drr_init);
+module_exit(drr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
new file mode 100644
index 00000000..2c790204
--- /dev/null
+++ b/net/sched/sch_dsmark.c
@@ -0,0 +1,509 @@
+/* net/sched/sch_dsmark.c - Differentiated Services field marker */
+
+/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/bitops.h>
+#include <net/pkt_sched.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <asm/byteorder.h>
+
+/*
+ * classid	class		marking
+ * -------	-----		-------
+ *   n/a	  0		n/a
+ *   x:0	  1		use entry [0]
+ *   ...	 ...		...
+ *   x:y y>0	 y+1		use entry [y]
+ *   ...	 ...		...
+ * x:indices-1	indices		use entry [indices-1]
+ *   ...	 ...		...
+ *   x:y	 y+1		use entry [y & (indices-1)]
+ *   ...	 ...		...
+ * 0xffff	0x10000		use entry [indices-1]
+ */
+
+
+#define NO_DEFAULT_INDEX	(1 << 16)
+
+/* Private state of one dsmark qdisc instance. */
+struct dsmark_qdisc_data {
+	struct Qdisc		*q;		/* inner qdisc packets are queued on */
+	struct tcf_proto	*filter_list;	/* classifiers attached to this qdisc */
+	u8			*mask;	/* "owns" the array */
+	u8			*value;	/* points 'indices' bytes into the mask allocation */
+	u16			indices;	/* number of mask/value entries */
+	u32			default_index;	/* index range is 0...0xffff */
+	int			set_tc_index;	/* non-zero: derive tc_index from the DS field on enqueue */
+};
+
+/* Non-zero when @index lies in the range 1..p->indices (inclusive). */
+static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
+{
+	return (index > 0 && index <= p->indices);
+}
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+/*
+ * Replace the inner qdisc with @new and return the previous one via @old.
+ * A NULL @new is replaced by a default pfifo, or by noop_qdisc when that
+ * cannot be created.  Always returns 0.
+ */
+static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
+			struct Qdisc *new, struct Qdisc **old)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+	pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
+		sch, p, new, old);
+
+	if (new == NULL) {
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					sch->handle);
+		if (new == NULL)
+			new = &noop_qdisc;
+	}
+
+	/*
+	 * Swap under the tree lock, then account for the packets the old
+	 * inner qdisc still held and reset it.
+	 */
+	sch_tree_lock(sch);
+	*old = p->q;
+	p->q = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+/* Return the inner qdisc; it is the same for every class (@arg is unused). */
+static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct dsmark_qdisc_data *priv = qdisc_priv(sch);
+
+	return priv->q;
+}
+
+/*
+ * Map @classid to the internal class handle, which is the minor part of the
+ * classid plus one (see the classid/class table at the top of this file).
+ */
+static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
+{
+	pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
+		sch, qdisc_priv(sch), classid);
+
+	return TC_H_MIN(classid) + 1;
+}
+
+/* Binding a filter is just a class lookup; @parent is ignored. */
+static unsigned long dsmark_bind_filter(struct Qdisc *sch,
+					unsigned long parent, u32 classid)
+{
+	return dsmark_get(sch, classid);
+}
+
+/* Nothing to release: dsmark_get() only computes a number. */
+static void dsmark_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+/* Netlink attribute types accepted inside TCA_OPTIONS for dsmark. */
+static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
+	[TCA_DSMARK_INDICES]		= { .type = NLA_U16 },
+	[TCA_DSMARK_DEFAULT_INDEX]	= { .type = NLA_U16 },
+	[TCA_DSMARK_SET_TC_INDEX]	= { .type = NLA_FLAG },
+	[TCA_DSMARK_MASK]		= { .type = NLA_U8 },
+	[TCA_DSMARK_VALUE]		= { .type = NLA_U8 },
+};
+
+/*
+ * Update the mask and/or value of the class whose handle is *@arg from the
+ * nested TCA_OPTIONS attribute.
+ *
+ * Returns 0 on success, -ENOENT for an invalid class handle, -EINVAL when
+ * TCA_OPTIONS is missing, or the error from nla_parse_nested().
+ */
+static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
+			 struct nlattr **tca, unsigned long *arg)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_DSMARK_MAX + 1];
+	int err;
+
+	pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
+		"arg 0x%lx\n", sch, p, classid, parent, *arg);
+
+	if (!dsmark_valid_index(p, *arg))
+		return -ENOENT;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
+	if (err < 0)
+		return err;
+
+	/* Class handles are 1-based, the tables 0-based. */
+	if (tb[TCA_DSMARK_VALUE])
+		p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
+
+	if (tb[TCA_DSMARK_MASK])
+		p->mask[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_MASK]);
+
+	return 0;
+}
+
+/*
+ * "Delete" a class by restoring its table entry to mask 0xff / value 0.
+ * Returns 0, or -EINVAL when @arg is not a valid class handle.
+ */
+static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	unsigned long slot;
+
+	if (!dsmark_valid_index(p, arg))
+		return -EINVAL;
+
+	slot = arg - 1;
+	p->value[slot] = 0;
+	p->mask[slot] = 0xff;
+
+	return 0;
+}
+
+/*
+ * Walk the class table.  Entries still at mask 0xff / value 0 are not
+ * reported but are still counted; all others are passed to walker->fn once
+ * walker->skip entries have been counted.  A negative callback result sets
+ * walker->stop and ends the walk.
+ */
+static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	int i;
+
+	pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+
+	if (walker->stop)
+		return;
+
+	for (i = 0; i < p->indices; i++) {
+		if (p->mask[i] == 0xff && !p->value[i])
+			goto ignore;
+		if (walker->count >= walker->skip) {
+			/* Class handles are table index + 1. */
+			if (walker->fn(sch, i + 1, walker) < 0) {
+				walker->stop = 1;
+				break;
+			}
+		}
+ignore:
+		walker->count++;
+	}
+}
+
+/* Return the address of the qdisc-wide filter list; @cl is not used. */
+static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,
+						 unsigned long cl)
+{
+	struct dsmark_qdisc_data *priv = qdisc_priv(sch);
+
+	return &priv->filter_list;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+/*
+ * Choose skb->tc_index for @skb and queue it on the inner qdisc.
+ *
+ * 1. With set_tc_index, tc_index is first seeded from the packet's DS field
+ *    (ECN bits masked off) for IPv4/IPv6, or 0 for other protocols.
+ * 2. If the major part of skb->priority equals this qdisc's handle, the
+ *    minor part becomes tc_index; otherwise the filters decide, falling
+ *    back to default_index when one is configured.
+ *
+ * Returns NET_XMIT_SUCCESS, the inner qdisc's error code, or a
+ * NET_XMIT_SUCCESS | __NET_XMIT_* combination when the packet was consumed
+ * here.
+ */
+static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	int err;
+
+	pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+
+	if (p->set_tc_index) {
+		switch (skb->protocol) {
+		case htons(ETH_P_IP):
+			if (skb_cow_head(skb, sizeof(struct iphdr)))
+				goto drop;
+
+			skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
+				& ~INET_ECN_MASK;
+			break;
+
+		case htons(ETH_P_IPV6):
+			if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
+				goto drop;
+
+			skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
+				& ~INET_ECN_MASK;
+			break;
+		default:
+			skb->tc_index = 0;
+			break;
+		}
+	}
+
+	if (TC_H_MAJ(skb->priority) == sch->handle)
+		skb->tc_index = TC_H_MIN(skb->priority);
+	else {
+		struct tcf_result res;
+		int result = tc_classify(skb, p->filter_list, &res);
+
+		pr_debug("result %d class 0x%04x\n", result, res.classid);
+
+		switch (result) {
+#ifdef CONFIG_NET_CLS_ACT
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			kfree_skb(skb);
+			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+
+		case TC_ACT_SHOT:
+			goto drop;
+#endif
+		case TC_ACT_OK:
+			skb->tc_index = TC_H_MIN(res.classid);
+			break;
+
+		default:
+			/* No match: keep tc_index unless a default is set. */
+			if (p->default_index != NO_DEFAULT_INDEX)
+				skb->tc_index = p->default_index;
+			break;
+		}
+	}
+
+	err = qdisc_enqueue(skb, p->q);
+	if (err != NET_XMIT_SUCCESS) {
+		if (net_xmit_drop_count(err))
+			sch->qstats.drops++;
+		return err;
+	}
+
+	sch->q.qlen++;
+
+	return NET_XMIT_SUCCESS;
+
+drop:
+	kfree_skb(skb);
+	sch->qstats.drops++;
+	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+}
+
+/*
+ * Dequeue the next packet from the inner qdisc and rewrite its DS field.
+ *
+ * The table slot is skb->tc_index masked with (indices - 1).  For IPv4 and
+ * IPv6 packets the DS field is changed using mask[] and value[] of that
+ * slot.  Other protocols pass through untouched; a warning is printed only
+ * if the slot would actually have modified the field.
+ *
+ * Returns the packet, or NULL when the inner qdisc has nothing to send.
+ */
+static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	struct sk_buff *skb;
+	u32 index;
+
+	pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
+
+	/*
+	 * dsmark_peek() forwards to the inner qdisc's ->peek(), which may park
+	 * the peeked packet inside the inner qdisc.  Go through
+	 * qdisc_dequeue_peeked() so that such a packet is the one returned
+	 * here; calling ->dequeue() directly would skip it and reorder
+	 * traffic.
+	 */
+	skb = qdisc_dequeue_peeked(p->q);
+	if (skb == NULL)
+		return NULL;
+
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+
+	/* indices is a power of two, so this wraps tc_index into the table. */
+	index = skb->tc_index & (p->indices - 1);
+	pr_debug("index %d->%d\n", skb->tc_index, index);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
+				    p->value[index]);
+		break;
+	case htons(ETH_P_IPV6):
+		ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
+				    p->value[index]);
+		break;
+	default:
+		/*
+		 * Only complain if a change was actually attempted.
+		 * This way, we can send non-IP traffic through dsmark
+		 * and don't need yet another qdisc as a bypass.
+		 */
+		if (p->mask[index] != 0xff || p->value[index])
+			pr_warning("dsmark_dequeue: unsupported protocol %d\n",
+				   ntohs(skb->protocol));
+		break;
+	}
+
+	return skb;
+}
+
+/* Peek by forwarding to the inner qdisc's ->peek() operation. */
+static struct sk_buff *dsmark_peek(struct Qdisc *sch)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+	pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p);
+
+	return p->q->ops->peek(p->q);
+}
+
+/*
+ * Ask the inner qdisc to drop one packet.
+ *
+ * Returns the length reported by the inner qdisc's ->drop(), or 0 when the
+ * inner qdisc has no ->drop() operation or dropped nothing.
+ */
+static unsigned int dsmark_drop(struct Qdisc *sch)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	unsigned int len;
+
+	/* The message used to say "dsmark_reset", a copy-paste slip. */
+	pr_debug("dsmark_drop(sch %p,[qdisc %p])\n", sch, p);
+
+	if (p->q->ops->drop == NULL)
+		return 0;
+
+	len = p->q->ops->drop(p->q);
+	if (len)
+		sch->q.qlen--;
+
+	return len;
+}
+
+/*
+ * Initialise a dsmark qdisc from the nested TCA_OPTIONS attribute @opt.
+ *
+ * TCA_DSMARK_INDICES is mandatory and must have exactly one bit set (a
+ * power of two), because dsmark_dequeue() masks tc_index with
+ * (indices - 1).  TCA_DSMARK_DEFAULT_INDEX and TCA_DSMARK_SET_TC_INDEX are
+ * optional.
+ *
+ * Returns 0 on success, -EINVAL for missing or invalid options, -ENOMEM
+ * when the mask/value table cannot be allocated, or the error from
+ * nla_parse_nested().
+ */
+static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	struct nlattr *tb[TCA_DSMARK_MAX + 1];
+	int err = -EINVAL;
+	u32 default_index = NO_DEFAULT_INDEX;
+	u16 indices;
+	u8 *mask;
+
+	pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+
+	if (!opt)
+		goto errout;
+
+	err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
+	if (err < 0)
+		goto errout;
+
+	err = -EINVAL;
+	/* The attribute may be absent; nla_get_u16() must not see NULL. */
+	if (!tb[TCA_DSMARK_INDICES])
+		goto errout;
+	indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
+
+	if (hweight32(indices) != 1)
+		goto errout;
+
+	if (tb[TCA_DSMARK_DEFAULT_INDEX])
+		default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
+
+	/* One allocation holds both tables: mask[] first, value[] after it. */
+	mask = kmalloc(indices * 2, GFP_KERNEL);
+	if (mask == NULL) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	p->mask = mask;
+	memset(p->mask, 0xff, indices);
+
+	p->value = p->mask + indices;
+	memset(p->value, 0, indices);
+
+	p->indices = indices;
+	p->default_index = default_index;
+	p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
+
+	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
+	if (p->q == NULL)
+		p->q = &noop_qdisc;
+
+	pr_debug("dsmark_init: qdisc %p\n", p->q);
+
+	err = 0;
+errout:
+	return err;
+}
+
+/* Reset the inner qdisc and zero this qdisc's queue length. */
+static void dsmark_reset(struct Qdisc *sch)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+	qdisc_reset(p->q);
+	sch->q.qlen = 0;
+}
+
+/*
+ * Release everything the qdisc owns: the filter chain, the inner qdisc and
+ * the mask/value table (value[] lives in the mask allocation, so one
+ * kfree() covers both).
+ */
+static void dsmark_destroy(struct Qdisc *sch)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+
+	pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
+
+	tcf_destroy_chain(&p->filter_list);
+	qdisc_destroy(p->q);
+	kfree(p->mask);
+}
+
+/*
+ * Describe the class with handle @cl: fill the handle fields of @tcm and
+ * append a nested TCA_OPTIONS attribute with the class's mask and value.
+ *
+ * Returns the value of nla_nest_end() on success, -EINVAL for an invalid
+ * class handle, or -EMSGSIZE when the attributes could not be added.
+ */
+static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+
+	/* cl is unsigned long: print with %lu and close the parenthesis. */
+	pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %lu)\n", sch, p, cl);
+
+	if (!dsmark_valid_index(p, cl))
+		return -EINVAL;
+
+	/* Class handles are minor classid + 1, hence the "- 1" below. */
+	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
+	tcm->tcm_info = p->q->handle;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+	NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]);
+	NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]);
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump the qdisc configuration as a nested TCA_OPTIONS attribute: the
+ * number of indices, plus the default index and the set_tc_index flag when
+ * they are configured.
+ *
+ * Returns the value of nla_nest_end() on success, -EMSGSIZE on failure.
+ */
+static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+	NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
+
+	if (p->default_index != NO_DEFAULT_INDEX)
+		NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
+
+	if (p->set_tc_index)
+		NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+/* Class-level operations of the "dsmark" qdisc. */
+static const struct Qdisc_class_ops dsmark_class_ops = {
+	.graft		=	dsmark_graft,
+	.leaf		=	dsmark_leaf,
+	.get		=	dsmark_get,
+	.put		=	dsmark_put,
+	.change		=	dsmark_change,
+	.delete		=	dsmark_delete,
+	.walk		=	dsmark_walk,
+	.tcf_chain	=	dsmark_find_tcf,
+	.bind_tcf	=	dsmark_bind_filter,
+	.unbind_tcf	=	dsmark_put,
+	.dump		=	dsmark_dump_class,
+};
+
+/* Qdisc operations registered under the id "dsmark"; no ->change(). */
+static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&dsmark_class_ops,
+	.id		=	"dsmark",
+	.priv_size	=	sizeof(struct dsmark_qdisc_data),
+	.enqueue	=	dsmark_enqueue,
+	.dequeue	=	dsmark_dequeue,
+	.peek		=	dsmark_peek,
+	.drop		=	dsmark_drop,
+	.init		=	dsmark_init,
+	.reset		=	dsmark_reset,
+	.destroy	=	dsmark_destroy,
+	.change		=	NULL,
+	.dump		=	dsmark_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module entry point: register the "dsmark" qdisc operations. */
+static int __init dsmark_module_init(void)
+{
+	return register_qdisc(&dsmark_qdisc_ops);
+}
+
+/* Module exit point: unregister the "dsmark" qdisc operations. */
+static void __exit dsmark_module_exit(void)
+{
+	unregister_qdisc(&dsmark_qdisc_ops);
+}
+
+module_init(dsmark_module_init)
+module_exit(dsmark_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
new file mode 100644
index 00000000..66effe2d
--- /dev/null
+++ b/net/sched/sch_fifo.c
@@ -0,0 +1,179 @@
+/*
+ * net/sched/sch_fifo.c	The simplest FIFO queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+/* 1 band FIFO pseudo-"scheduler" */
+
+/*
+ * Byte-limited FIFO: accept @skb at the tail unless the backlog plus this
+ * packet's length would exceed sch->limit.
+ */
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	if (unlikely(sch->qstats.backlog + qdisc_pkt_len(skb) > sch->limit))
+		return qdisc_reshape_fail(skb, sch);
+
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+/*
+ * Packet-limited FIFO: accept @skb at the tail while fewer than sch->limit
+ * packets are queued.
+ */
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
+		return qdisc_reshape_fail(skb, sch);
+
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+/*
+ * Head-drop FIFO: when the queue is full, the packet at the head is dropped
+ * to make room and the new packet is still queued at the tail.  In that
+ * case NET_XMIT_CN is returned instead of the enqueue result.
+ */
+static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	if (likely(skb_queue_len(&sch->q) < sch->limit))
+		return qdisc_enqueue_tail(skb, sch);
+
+	/* queue full, remove one skb to fulfill the limit */
+	__qdisc_queue_drop_head(sch, &sch->q);
+	sch->qstats.drops++;
+	qdisc_enqueue_tail(skb, sch);
+
+	return NET_XMIT_CN;
+}
+
+/*
+ * Shared ->init()/->change() handler of the FIFO qdiscs.
+ *
+ * With options, the limit comes from struct tc_fifo_qopt.  Without, it
+ * defaults to the device's tx_queue_len (at least 1), multiplied by
+ * psched_mtu() of the device for bfifo.  TCQ_F_CAN_BYPASS is set only if
+ * the limit admits at least one packet (pfifo) or psched_mtu() bytes
+ * (bfifo).
+ *
+ * Returns 0, or -EINVAL when the option payload is too short.
+ */
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	const bool is_bfifo = (sch->ops == &bfifo_qdisc_ops);
+	bool bypass;
+
+	if (opt != NULL) {
+		struct tc_fifo_qopt *ctl = nla_data(opt);
+
+		if (nla_len(opt) < sizeof(*ctl))
+			return -EINVAL;
+
+		sch->limit = ctl->limit;
+	} else {
+		u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
+
+		if (is_bfifo)
+			limit *= psched_mtu(qdisc_dev(sch));
+
+		sch->limit = limit;
+	}
+
+	if (is_bfifo)
+		bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
+	else
+		bypass = sch->limit >= 1;
+
+	if (!bypass)
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+	else
+		sch->flags |= TCQ_F_CAN_BYPASS;
+
+	return 0;
+}
+
+/*
+ * Dump the current limit as a TCA_OPTIONS attribute holding a struct
+ * tc_fifo_qopt.  Returns skb->len on success, -1 on failure.
+ */
+static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct tc_fifo_qopt opt = { .limit = sch->limit };
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+/* "pfifo": FIFO whose limit is counted in packets (see pfifo_enqueue()). */
+struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
+	.id		=	"pfifo",
+	.priv_size	=	0,
+	.enqueue	=	pfifo_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.peek		=	qdisc_peek_head,
+	.drop		=	qdisc_queue_drop,
+	.init		=	fifo_init,
+	.reset		=	qdisc_reset_queue,
+	.change		=	fifo_init,
+	.dump		=	fifo_dump,
+	.owner		=	THIS_MODULE,
+};
+EXPORT_SYMBOL(pfifo_qdisc_ops);
+
+/* "bfifo": FIFO whose limit is counted in bytes (see bfifo_enqueue()). */
+struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
+	.id		=	"bfifo",
+	.priv_size	=	0,
+	.enqueue	=	bfifo_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.peek		=	qdisc_peek_head,
+	.drop		=	qdisc_queue_drop,
+	.init		=	fifo_init,
+	.reset		=	qdisc_reset_queue,
+	.change		=	fifo_init,
+	.dump		=	fifo_dump,
+	.owner		=	THIS_MODULE,
+};
+EXPORT_SYMBOL(bfifo_qdisc_ops);
+
+/*
+ * "pfifo_head_drop": packet-limited FIFO that drops from the head when full
+ * (see pfifo_tail_enqueue()).
+ */
+struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
+	.id		=	"pfifo_head_drop",
+	.priv_size	=	0,
+	.enqueue	=	pfifo_tail_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.peek		=	qdisc_peek_head,
+	.drop		=	qdisc_queue_drop_head,
+	.init		=	fifo_init,
+	.reset		=	qdisc_reset_queue,
+	.change		=	fifo_init,
+	.dump		=	fifo_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/*
+ * Pass size change message down to embedded FIFO.
+ *
+ * @q:     qdisc to reconfigure; left untouched unless its id looks like
+ *         "?fifo..."
+ * @limit: new limit, forwarded in a struct tc_fifo_qopt
+ *
+ * Returns 0 when @q is not a FIFO or has no ->change() operation, -ENOMEM
+ * when the temporary attribute cannot be allocated, otherwise whatever
+ * q->ops->change() returns.
+ */
+int fifo_set_limit(struct Qdisc *q, unsigned int limit)
+{
+	struct nlattr *nla;
+	int ret = -ENOMEM;
+
+	/* Hack to avoid sending change message to non-FIFO */
+	if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+		return 0;
+
+	/*
+	 * The name test only compares four characters, so it also accepts
+	 * ids that merely start with "?fifo".  Never call through a NULL
+	 * ->change() for such a qdisc.
+	 */
+	if (!q->ops->change)
+		return 0;
+
+	nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
+	if (nla) {
+		nla->nla_type = RTM_NEWQDISC;
+		nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
+		((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;
+
+		ret = q->ops->change(q, nla);
+		kfree(nla);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(fifo_set_limit);
+
+/*
+ * Create a default qdisc of type @ops below @sch (handle minor 1) and set
+ * its limit to @limit.
+ *
+ * Returns the new qdisc, or an ERR_PTR(): -ENOMEM when creation fails,
+ * otherwise the error from fifo_set_limit() (the qdisc is destroyed first).
+ */
+struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
+			       unsigned int limit)
+{
+	struct Qdisc *child;
+	int err;
+
+	child = qdisc_create_dflt(sch->dev_queue, ops,
+				  TC_H_MAKE(sch->handle, 1));
+	if (!child)
+		return ERR_PTR(-ENOMEM);
+
+	err = fifo_set_limit(child, limit);
+	if (err < 0) {
+		qdisc_destroy(child);
+		return ERR_PTR(err);
+	}
+
+	return child;
+}
+EXPORT_SYMBOL(fifo_create_dflt);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
new file mode 100644
index 00000000..b4c68090
--- /dev/null
+++ b/net/sched/sch_generic.c
@@ -0,0 +1,907 @@
+/*
+ * net/sched/sch_generic.c	Generic packet scheduler routines.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
+ *              - Ingress support
+ */
+
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+
+/* Main transmission queue. */
+
+/* Modifications to data participating in scheduling must be protected with
+ * qdisc_lock(qdisc) spinlock.
+ *
+ * The idea is the following:
+ * - enqueue, dequeue are serialized via qdisc root lock
+ * - ingress filtering is also serialized via qdisc root lock
+ * - updates to tree and tree walking are only done under the rtnl mutex.
+ */
+
+/*
+ * Park @skb in q->gso_skb so that dequeue_skb() hands it out again, count
+ * the requeue and reschedule the qdisc.  Always returns 0.
+ */
+static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+{
+	skb_dst_force(skb);
+	q->gso_skb = skb;
+	q->qstats.requeues++;
+	q->q.qlen++;	/* it's still part of the queue */
+	__netif_schedule(q);
+
+	return 0;
+}
+
+/*
+ * Get the next packet to transmit.  A packet parked in q->gso_skb takes
+ * precedence over q->dequeue(); it is only released once its tx queue is
+ * neither frozen nor stopped, otherwise NULL is returned.
+ */
+static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
+{
+	struct sk_buff *skb = q->gso_skb;
+
+	if (unlikely(skb)) {
+		struct net_device *dev = qdisc_dev(q);
+		struct netdev_queue *txq;
+
+		/* check the reason of requeuing without tx lock first */
+		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+		if (!netif_tx_queue_frozen_or_stopped(txq)) {
+			q->gso_skb = NULL;
+			q->q.qlen--;
+		} else
+			skb = NULL;
+	} else {
+		skb = q->dequeue(q);
+	}
+
+	return skb;
+}
+
+/*
+ * Handle a transmit attempt that failed on the driver's lock.
+ *
+ * If this CPU already owns the xmit lock the packet is dropped and the
+ * current queue length is returned; otherwise the collision is counted and
+ * the packet is requeued (dev_requeue_skb() returns 0).
+ */
+static inline int handle_dev_cpu_collision(struct sk_buff *skb,
+					   struct netdev_queue *dev_queue,
+					   struct Qdisc *q)
+{
+	int ret;
+
+	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
+		/*
+		 * Same CPU holding the lock. It may be a transient
+		 * configuration error, when hard_start_xmit() recurses. We
+		 * detect it by checking xmit owner and drop the packet when
+		 * deadloop is detected. Return OK to try the next skb.
+		 */
+		kfree_skb(skb);
+		if (net_ratelimit())
+			pr_warning("Dead loop on netdevice %s, fix it urgently!\n",
+				   dev_queue->dev->name);
+		ret = qdisc_qlen(q);
+	} else {
+		/*
+		 * Another cpu is holding lock, requeue & delay xmits for
+		 * some time.
+		 */
+		__this_cpu_inc(softnet_data.cpu_collision);
+		ret = dev_requeue_skb(skb, q);
+	}
+
+	return ret;
+}
+
+/*
+ * Transmit one skb, and handle the return status as required. Holding the
+ * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
+ * function.
+ *
+ * Returns to the caller:
+ *				0  - queue is empty or throttled.
+ *				>0 - queue is not empty.
+ */
+int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+		    struct net_device *dev, struct netdev_queue *txq,
+		    spinlock_t *root_lock)
+{
+	/* Stays NETDEV_TX_BUSY when the tx queue is frozen or stopped. */
+	int ret = NETDEV_TX_BUSY;
+
+	/* And release qdisc */
+	spin_unlock(root_lock);
+
+	/* The driver is called under the tx lock, with root_lock dropped. */
+	HARD_TX_LOCK(dev, txq, smp_processor_id());
+	if (!netif_tx_queue_frozen_or_stopped(txq))
+		ret = dev_hard_start_xmit(skb, dev, txq);
+
+	HARD_TX_UNLOCK(dev, txq);
+
+	/* Re-take root_lock before touching the qdisc again. */
+	spin_lock(root_lock);
+
+	if (dev_xmit_complete(ret)) {
+		/* Driver sent out skb successfully or skb was consumed */
+		ret = qdisc_qlen(q);
+	} else if (ret == NETDEV_TX_LOCKED) {
+		/* Driver try lock failed */
+		ret = handle_dev_cpu_collision(skb, txq, q);
+	} else {
+		/* Driver returned NETDEV_TX_BUSY - requeue skb */
+		if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
+			pr_warning("BUG %s code %d qlen %d\n",
+				   dev->name, ret, q->q.qlen);
+
+		ret = dev_requeue_skb(skb, q);
+	}
+
+	/* Report "nothing more to do" while the tx queue cannot accept packets. */
+	if (ret && netif_tx_queue_frozen_or_stopped(txq))
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * NOTE: Called under qdisc_lock(q) with locally disabled BH.
+ *
+ * __QDISC_STATE_RUNNING guarantees only one CPU can process
+ * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
+ * this queue.
+ *
+ *  netif_tx_lock serializes accesses to device driver.
+ *
+ *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
+ *  if one is grabbed, another must be free.
+ *
+ * Note, that this procedure can be called by a watchdog timer
+ *
+ * Returns to the caller:
+ *				0  - queue is empty or throttled.
+ *				>0 - queue is not empty.
+ *
+ */
+static inline int qdisc_restart(struct Qdisc *q)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+	spinlock_t *root_lock;
+	struct sk_buff *skb;
+
+	/* Dequeue packet */
+	skb = dequeue_skb(q);
+	if (unlikely(!skb))
+		return 0;
+	WARN_ON_ONCE(skb_dst_is_noref(skb));
+	root_lock = qdisc_lock(q);
+	dev = qdisc_dev(q);
+	/* The tx queue is chosen from the queue mapping stored in the skb. */
+	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+	return sch_direct_xmit(skb, q, dev, txq, root_lock);
+}
+
+/*
+ * Transmit packets from @q until qdisc_restart() reports nothing more to do,
+ * yielding (and rescheduling the qdisc) when the CPU is needed elsewhere or
+ * the jiffies counter has advanced since the loop started.  Ends the run
+ * with qdisc_run_end().
+ */
+void __qdisc_run(struct Qdisc *q)
+{
+	unsigned long start_time = jiffies;
+
+	while (qdisc_restart(q)) {
+		/*
+		 * Postpone processing if
+		 * 1. another process needs the CPU;
+		 * 2. we've been doing it for too long.
+		 */
+		if (need_resched() || jiffies != start_time) {
+			__netif_schedule(q);
+			break;
+		}
+	}
+
+	qdisc_run_end(q);
+}
+
+/*
+ * Return the most recent transmit timestamp of @dev, taking the latest of
+ * dev->trans_start and every tx queue's non-zero trans_start.  The result
+ * is also written back to dev->trans_start.
+ */
+unsigned long dev_trans_start(struct net_device *dev)
+{
+	unsigned long latest = dev->trans_start;
+	unsigned int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		unsigned long stamp = netdev_get_tx_queue(dev, i)->trans_start;
+
+		if (stamp && time_after(stamp, latest))
+			latest = stamp;
+	}
+
+	dev->trans_start = latest;
+	return latest;
+}
+EXPORT_SYMBOL(dev_trans_start);
+
+/*
+ * Transmit watchdog timer callback; @arg is the struct net_device.
+ *
+ * While the device is present, running and has carrier, looks for a stopped
+ * tx queue whose last transmit is older than dev->watchdog_timeo.  If one
+ * is found, warns once and calls the driver's ndo_tx_timeout().  The timer
+ * is then re-armed.
+ */
+static void dev_watchdog(unsigned long arg)
+{
+	struct net_device *dev = (struct net_device *)arg;
+
+	netif_tx_lock(dev);
+	if (!qdisc_tx_is_noop(dev)) {
+		if (netif_device_present(dev) &&
+		    netif_running(dev) &&
+		    netif_carrier_ok(dev)) {
+			int some_queue_timedout = 0;
+			unsigned int i;
+			unsigned long trans_start;
+
+			for (i = 0; i < dev->num_tx_queues; i++) {
+				struct netdev_queue *txq;
+
+				txq = netdev_get_tx_queue(dev, i);
+				/*
+				 * old device drivers set dev->trans_start
+				 */
+				trans_start = txq->trans_start ? : dev->trans_start;
+				if (netif_tx_queue_stopped(txq) &&
+				    time_after(jiffies, (trans_start +
+							 dev->watchdog_timeo))) {
+					some_queue_timedout = 1;
+					break;
+				}
+			}
+
+			/* After the break above, i is the timed-out queue. */
+			if (some_queue_timedout) {
+				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
+				       dev->name, netdev_drivername(dev), i);
+				dev->netdev_ops->ndo_tx_timeout(dev);
+			}
+			/* Take a device reference when mod_timer() returns 0. */
+			if (!mod_timer(&dev->watchdog_timer,
+				       round_jiffies(jiffies +
+						     dev->watchdog_timeo)))
+				dev_hold(dev);
+		}
+	}
+	netif_tx_unlock(dev);
+
+	/* Drop the reference that was held for this timer run. */
+	dev_put(dev);
+}
+
+/*
+ * Arm the transmit watchdog of @dev if its driver provides ndo_tx_timeout.
+ * A non-positive watchdog_timeo is replaced by 5 seconds.  A device
+ * reference is taken when mod_timer() returns 0.
+ */
+void __netdev_watchdog_up(struct net_device *dev)
+{
+	if (dev->netdev_ops->ndo_tx_timeout) {
+		if (dev->watchdog_timeo <= 0)
+			dev->watchdog_timeo = 5*HZ;
+		if (!mod_timer(&dev->watchdog_timer,
+			       round_jiffies(jiffies + dev->watchdog_timeo)))
+			dev_hold(dev);
+	}
+}
+
+/* File-local wrapper around __netdev_watchdog_up(). */
+static void dev_watchdog_up(struct net_device *dev)
+{
+	__netdev_watchdog_up(dev);
+}
+
+/*
+ * Stop the transmit watchdog of @dev.  If del_timer() reports that it
+ * removed the timer, the device reference is dropped with dev_put().
+ */
+static void dev_watchdog_down(struct net_device *dev)
+{
+	netif_tx_lock_bh(dev);
+	if (del_timer(&dev->watchdog_timer))
+		dev_put(dev);
+	netif_tx_unlock_bh(dev);
+}
+
+/**
+ *	netif_carrier_on - set carrier
+ *	@dev: network device
+ *
+ * Device has detected acquisition of carrier.
+ */
+void netif_carrier_on(struct net_device *dev)
+{
+	/* Nothing to do unless the NOCARRIER bit was actually set. */
+	if (!test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
+		return;
+
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		return;
+
+	linkwatch_fire_event(dev);
+	if (netif_running(dev))
+		__netdev_watchdog_up(dev);
+}
+EXPORT_SYMBOL(netif_carrier_on);
+
+/**
+ *	netif_carrier_off - clear carrier
+ *	@dev: network device
+ *
+ * Device has detected loss of carrier.
+ */
+void netif_carrier_off(struct net_device *dev)
+{
+	/* Nothing to do if the NOCARRIER bit was already set. */
+	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
+		return;
+
+	if (dev->reg_state == NETREG_UNINITIALIZED)
+		return;
+
+	linkwatch_fire_event(dev);
+}
+EXPORT_SYMBOL(netif_carrier_off);
+
+/**
+ * 	netif_notify_peers - notify network peers about existence of @dev
+ * 	@dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netif_notify_peers(struct net_device *dev)
+{
+	/* The notifier chain is invoked with the rtnl lock held. */
+	rtnl_lock();
+	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(netif_notify_peers);
+
+/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
+   under all circumstances. It is difficult to invent anything faster or
+   cheaper.
+ */
+
+/* Free every packet and report congestion (NET_XMIT_CN). */
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+{
+	kfree_skb(skb);
+	return NET_XMIT_CN;
+}
+
+/* There is never anything queued; also used as the ->peek() operation. */
+static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
+{
+	return NULL;
+}
+
+/* Operations of the "noop" qdisc: enqueue frees, dequeue/peek yield NULL. */
+struct Qdisc_ops noop_qdisc_ops __read_mostly = {
+	.id		=	"noop",
+	.priv_size	=	0,
+	.enqueue	=	noop_enqueue,
+	.dequeue	=	noop_dequeue,
+	.peek		=	noop_dequeue,
+	.owner		=	THIS_MODULE,
+};
+
+/* Static netdev_queue used as the dev_queue of noop_qdisc. */
+static struct netdev_queue noop_netdev_queue = {
+	.qdisc		=	&noop_qdisc,
+	.qdisc_sleeping	=	&noop_qdisc,
+};
+
+/*
+ * The built-in, statically allocated "noop" qdisc instance (TCQ_F_BUILTIN).
+ * Other qdiscs in this tree fall back to it when a default child cannot be
+ * created.
+ */
+struct Qdisc noop_qdisc = {
+	.enqueue	=	noop_enqueue,
+	.dequeue	=	noop_dequeue,
+	.flags		=	TCQ_F_BUILTIN,
+	.ops		=	&noop_qdisc_ops,
+	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
+	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
+	.dev_queue	=	&noop_netdev_queue,
+	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
+};
+EXPORT_SYMBOL(noop_qdisc);
+
+static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
+	.id		=	"noqueue",
+	.priv_size	=	0,	/* no private area behind struct Qdisc */
+	.enqueue	=	noop_enqueue,	/* same hooks as the "noop" ops; */
+	.dequeue	=	noop_dequeue,	/* only the id differs           */
+	.peek		=	noop_dequeue,
+	.owner		=	THIS_MODULE,
+};
+
+static struct Qdisc noqueue_qdisc;	/* tentative definition; the initialiser follows */
+static struct netdev_queue noqueue_netdev_queue = {
+	.qdisc		=	&noqueue_qdisc,
+	.qdisc_sleeping	=	&noqueue_qdisc,
+};
+
+static struct Qdisc noqueue_qdisc = {
+	.enqueue	=	NULL,	/* no enqueue hook, unlike noop_qdisc; presumably the xmit path tests for this - confirm */
+	.dequeue	=	noop_dequeue,
+	.flags		=	TCQ_F_BUILTIN,	/* qdisc_destroy() returns early for builtins */
+	.ops		=	&noqueue_qdisc_ops,
+	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
+	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
+	.dev_queue	=	&noqueue_netdev_queue,
+	.busylock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
+};
+
+
+static const u8 prio2band[TC_PRIO_MAX + 1] = {	/* indexed by skb->priority & TC_PRIO_MAX */
+	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+/* 3-band FIFO queue: old style, but should be a bit faster than
+   generic prio+fifo combination.
+ */
+
+#define PFIFO_FAST_BANDS 3
+
+/*
+ * Private data for a pfifo_fast scheduler containing:
+ * 	- queues for the three bands
+ * 	- bitmap indicating which of the bands contain skbs
+ */
+struct pfifo_fast_priv {
+	u32 bitmap;	/* bit N is set while band N has at least one skb queued */
+	struct sk_buff_head q[PFIFO_FAST_BANDS];	/* one FIFO per band */
+};
+
+/*
+ * Convert a bitmap to the first band number where an skb is queued, where:
+ * 	bitmap=0 means there are no skbs on any band.
+ * 	bitmap=1 means there is an skb on band 0.
+ *	bitmap=7 means there are skbs on all 3 bands, etc.
+ */
+static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};	/* index: 3-bit bitmap; value: lowest set bit, -1 if none */
+
+static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
+					     int band)
+{
+	return priv->q + band;	/* i.e. &priv->q[band]; band is not range-checked here */
+}
+
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	int band;
+
+	/* Tail-drop once the device's tx_queue_len has been reached */
+	if (skb_queue_len(&qdisc->q) >= qdisc_dev(qdisc)->tx_queue_len)
+		return qdisc_drop(skb, qdisc);
+
+	band = prio2band[skb->priority & TC_PRIO_MAX];
+	priv->bitmap |= (1 << band);	/* this band is now non-empty */
+	qdisc->q.qlen++;
+	return __qdisc_enqueue_tail(skb, qdisc, band2list(priv, band));
+}
+
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	int band = bitmap2band[priv->bitmap];
+	struct sk_buff_head *list;
+	struct sk_buff *skb;
+
+	if (unlikely(band < 0))		/* -1: every band is empty */
+		return NULL;
+
+	list = band2list(priv, band);
+	skb = __qdisc_dequeue_head(qdisc, list);
+	qdisc->q.qlen--;
+	/* Clear the band's bit once its last skb has been taken */
+	if (skb_queue_empty(list))
+		priv->bitmap &= ~(1 << band);
+	return skb;
+}
+
+static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
+{
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+	int band = bitmap2band[priv->bitmap];
+
+	/* bitmap2band[] yields -1 when no band holds an skb */
+	if (band < 0)
+		return NULL;
+
+	/* Report the head of the first non-empty band;
+	 * the skb stays on its list. */
+	return skb_peek(band2list(priv, band));
+}
+
+static void pfifo_fast_reset(struct Qdisc *qdisc)
+{
+	int prio;	/* used as a band index */
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+		__qdisc_reset_queue(qdisc, band2list(priv, prio));
+
+	priv->bitmap = 0;		/* no band holds an skb any more */
+	qdisc->qstats.backlog = 0;	/* byte and packet counters restart at zero */
+	qdisc->q.qlen = 0;
+}
+
+static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
+{
+	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
+
+	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);	/* report the fixed priority-to-band map */
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);	/* macro jumps to nla_put_failure on error */
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
+{
+	int prio;	/* used as a band index */
+	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+		skb_queue_head_init(band2list(priv, prio));
+
+	/* Can by-pass the queue discipline */
+	qdisc->flags |= TCQ_F_CAN_BYPASS;
+	return 0;	/* opt is not looked at; this init always succeeds */
+}
+
+struct Qdisc_ops pfifo_fast_ops __read_mostly = {
+	.id		=	"pfifo_fast",
+	.priv_size	=	sizeof(struct pfifo_fast_priv),	/* added to the allocation by qdisc_alloc() */
+	.enqueue	=	pfifo_fast_enqueue,
+	.dequeue	=	pfifo_fast_dequeue,
+	.peek		=	pfifo_fast_peek,
+	.init		=	pfifo_fast_init,
+	.reset		=	pfifo_fast_reset,
+	.dump		=	pfifo_fast_dump,
+	.owner		=	THIS_MODULE,
+};
+EXPORT_SYMBOL(pfifo_fast_ops);
+
+struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
+			  struct Qdisc_ops *ops)
+{
+	void *p;
+	struct Qdisc *sch;
+	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;	/* header plus private area */
+	int err = -ENOBUFS;	/* the only error returned: allocation failure */
+
+	p = kzalloc_node(size, GFP_KERNEL,
+			 netdev_queue_numa_node_read(dev_queue));
+
+	if (!p)
+		goto errout;
+	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+	/* memory came back unaligned: free it, over-allocate and align by hand */
+	if (sch != p) {
+		kfree(p);
+		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
+				 netdev_queue_numa_node_read(dev_queue));
+		if (!p)
+			goto errout;
+		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+		sch->padded = (char *) sch - (char *) p;	/* offset undone in qdisc_rcu_free() */
+	}
+	INIT_LIST_HEAD(&sch->list);
+	skb_queue_head_init(&sch->q);
+	spin_lock_init(&sch->busylock);
+	sch->ops = ops;
+	sch->enqueue = ops->enqueue;	/* hooks copied from ops into the qdisc itself */
+	sch->dequeue = ops->dequeue;
+	sch->dev_queue = dev_queue;
+	dev_hold(qdisc_dev(sch));	/* dropped again by dev_put() in qdisc_destroy() */
+	atomic_set(&sch->refcnt, 1);	/* the caller owns the initial reference */
+
+	return sch;
+errout:
+	return ERR_PTR(err);	/* failure is an ERR_PTR, never NULL */
+}
+
+struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
+				struct Qdisc_ops *ops, unsigned int parentid)
+{
+	struct Qdisc *sch = qdisc_alloc(dev_queue, ops);
+
+	/* Callers get NULL, not the ERR_PTR() that qdisc_alloc() returns */
+	if (IS_ERR(sch))
+		return NULL;
+	sch->parent = parentid;
+
+	/* ->init is optional; a non-zero result means setup failed */
+	if (ops->init && ops->init(sch, NULL) != 0) {
+		qdisc_destroy(sch);
+		return NULL;
+	}
+	return sch;
+}
+EXPORT_SYMBOL(qdisc_create_dflt);
+
+/* Under qdisc_lock(qdisc) and BH! */
+
+void qdisc_reset(struct Qdisc *qdisc)
+{
+	const struct Qdisc_ops *ops = qdisc->ops;
+
+	if (ops->reset)		/* the ->reset hook is optional */
+		ops->reset(qdisc);
+
+	if (qdisc->gso_skb) {	/* an skb is parked outside the regular queue */
+		kfree_skb(qdisc->gso_skb);
+		qdisc->gso_skb = NULL;
+		qdisc->q.qlen = 0;
+	}
+}
+EXPORT_SYMBOL(qdisc_reset);
+
+static void qdisc_rcu_free(struct rcu_head *head)
+{
+	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
+
+	kfree((char *) qdisc - qdisc->padded);	/* step back to the pointer qdisc_alloc() got from kzalloc_node() */
+}
+
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+	const struct Qdisc_ops  *ops = qdisc->ops;
+
+	if (qdisc->flags & TCQ_F_BUILTIN ||	/* static builtins are never torn down */
+	    !atomic_dec_and_test(&qdisc->refcnt))	/* other references remain */
+		return;
+
+#ifdef CONFIG_NET_SCHED
+	qdisc_list_del(qdisc);
+
+	qdisc_put_stab(rtnl_dereference(qdisc->stab));
+#endif
+	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+	if (ops->reset)		/* both hooks are optional */
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);
+
+	module_put(ops->owner);
+	dev_put(qdisc_dev(qdisc));	/* pairs with dev_hold() in qdisc_alloc() */
+
+	kfree_skb(qdisc->gso_skb);
+	/*
+	 * gen_estimator est_timer() might access qdisc->q.lock,
+	 * wait a RCU grace period before freeing qdisc.
+	 */
+	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
+}
+EXPORT_SYMBOL(qdisc_destroy);
+
+/* Attach toplevel qdisc to device queue. */
+struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
+			      struct Qdisc *qdisc)
+{
+	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
+	spinlock_t *root_lock;
+
+	root_lock = qdisc_lock(oqdisc);	/* NOTE(review): oqdisc is used here, before the NULL test below */
+	spin_lock_bh(root_lock);
+
+	/* Prune old scheduler, unless someone else still holds a reference */
+	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
+		qdisc_reset(oqdisc);
+
+	/* ... and graft new one; NULL means "attach noop_qdisc" */
+	if (qdisc == NULL)
+		qdisc = &noop_qdisc;
+	dev_queue->qdisc_sleeping = qdisc;
+	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);	/* active slot stays noop here */
+
+	spin_unlock_bh(root_lock);
+
+	return oqdisc;	/* the previous qdisc is handed back to the caller */
+}
+EXPORT_SYMBOL(dev_graft_qdisc);
+
+static void attach_one_default_qdisc(struct net_device *dev,
+				     struct netdev_queue *dev_queue,
+				     void *_unused)
+{
+	struct Qdisc *qdisc = &noqueue_qdisc;	/* default when tx_queue_len is 0 */
+
+	if (dev->tx_queue_len) {
+		qdisc = qdisc_create_dflt(dev_queue,
+					  &pfifo_fast_ops, TC_H_ROOT);
+		if (!qdisc) {
+			netdev_info(dev, "activation failed\n");
+			return;	/* qdisc_sleeping keeps its previous value */
+		}
+	}
+	dev_queue->qdisc_sleeping = qdisc;	/* only the sleeping slot is set here */
+}
+
+static void attach_default_qdiscs(struct net_device *dev)
+{
+	struct netdev_queue *txq;
+	struct Qdisc *qdisc;
+
+	txq = netdev_get_tx_queue(dev, 0);
+
+	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
+		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+		dev->qdisc = txq->qdisc_sleeping;	/* queue 0's qdisc doubles as the device root */
+		atomic_inc(&dev->qdisc->refcnt);	/* extra reference for the dev->qdisc pointer */
+	} else {
+		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
+		if (qdisc) {	/* on failure dev->qdisc is left unchanged */
+			qdisc->ops->attach(qdisc);
+			dev->qdisc = qdisc;
+		}
+	}
+}
+
+static void transition_one_qdisc(struct net_device *dev,
+				 struct netdev_queue *dev_queue,
+				 void *_need_watchdog)
+{
+	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
+	int *need_watchdog_p = _need_watchdog;	/* may be NULL: caller does not want the flag */
+
+	if (!(new_qdisc->flags & TCQ_F_BUILTIN))	/* state of builtin qdiscs is left alone */
+		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
+
+	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);	/* sleeping qdisc becomes the active one */
+	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
+		dev_queue->trans_start = 0;
+		*need_watchdog_p = 1;	/* flag is only ever set here, never cleared */
+	}
+}
+
+void dev_activate(struct net_device *dev)
+{
+	int need_watchdog;
+
+	/* No queueing discipline is attached to device;
+	   create default one i.e. pfifo_fast for devices,
+	   which need queueing and noqueue_qdisc for
+	   virtual interfaces
+	 */
+
+	if (dev->qdisc == &noop_qdisc)	/* still in the state set by dev_init_scheduler() */
+		attach_default_qdiscs(dev);
+
+	if (!netif_carrier_ok(dev))
+		/* Delay activation until next carrier-on event */
+		return;
+
+	need_watchdog = 0;
+	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
+	if (dev_ingress_queue(dev))	/* NULL flag pointer: ingress never requests the watchdog */
+		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
+
+	if (need_watchdog) {	/* at least one tx queue uses something other than noqueue_qdisc */
+		dev->trans_start = jiffies;
+		dev_watchdog_up(dev);
+	}
+}
+EXPORT_SYMBOL(dev_activate);
+
+static void dev_deactivate_queue(struct net_device *dev,
+				 struct netdev_queue *dev_queue,
+				 void *_qdisc_default)
+{
+	struct Qdisc *qdisc_default = _qdisc_default;	/* qdisc to install in the active slot */
+	struct Qdisc *qdisc;
+
+	qdisc = dev_queue->qdisc;
+	if (qdisc) {
+		spin_lock_bh(qdisc_lock(qdisc));
+
+		if (!(qdisc->flags & TCQ_F_BUILTIN))	/* state of builtin qdiscs is left alone */
+			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
+
+		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+		qdisc_reset(qdisc);	/* the old qdisc is reset, not destroyed; qdisc_sleeping is untouched */
+
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+}
+
+static bool some_qdisc_is_busy(struct net_device *dev)
+{
+	unsigned int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct netdev_queue *dev_queue;
+		spinlock_t *root_lock;
+		struct Qdisc *q;
+		int val;
+
+		dev_queue = netdev_get_tx_queue(dev, i);
+		q = dev_queue->qdisc_sleeping;	/* the sleeping qdisc is examined, not the active one */
+		root_lock = qdisc_lock(q);
+
+		spin_lock_bh(root_lock);
+
+		val = (qdisc_is_running(q) ||	/* busy: currently running or marked as scheduled */
+		       test_bit(__QDISC_STATE_SCHED, &q->state));
+
+		spin_unlock_bh(root_lock);
+
+		if (val)
+			return true;	/* one busy queue is enough */
+	}
+	return false;
+}
+
+/**
+ * 	dev_deactivate_many - deactivate transmissions on several devices
+ * 	@head: list of devices to deactivate
+ *
+ *	This function returns only when all outstanding transmissions
+ *	have completed, unless all devices are in dismantle phase.
+ */
+void dev_deactivate_many(struct list_head *head)
+{
+	struct net_device *dev;
+	bool sync_needed = false;
+
+	list_for_each_entry(dev, head, unreg_list) {	/* devices are linked through unreg_list */
+		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
+					 &noop_qdisc);
+		if (dev_ingress_queue(dev))
+			dev_deactivate_queue(dev, dev_ingress_queue(dev),
+					     &noop_qdisc);
+
+		dev_watchdog_down(dev);
+		sync_needed |= !dev->dismantle;	/* any device not being dismantled forces the sync */
+	}
+
+	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
+	 * This is avoided if all devices are in dismantle phase :
+	 * Caller will call synchronize_net() for us
+	 */
+	if (sync_needed)
+		synchronize_net();
+
+	/* Wait for outstanding qdisc_run calls. */
+	list_for_each_entry(dev, head, unreg_list)
+		while (some_qdisc_is_busy(dev))
+			yield();	/* poll, giving up the CPU between checks */
+}
+
+void dev_deactivate(struct net_device *dev)
+{
+	LIST_HEAD(single);	/* on-stack list holding just this device */
+
+	list_add(&dev->unreg_list, &single);
+	dev_deactivate_many(&single);
+	list_del(&single);	/* unlink the on-stack head before this frame goes away */
+}
+EXPORT_SYMBOL(dev_deactivate);
+
+static void dev_init_scheduler_queue(struct net_device *dev,
+				     struct netdev_queue *dev_queue,
+				     void *_qdisc)
+{
+	struct Qdisc *qdisc = _qdisc;
+
+	dev_queue->qdisc = qdisc;		/* active and sleeping slots start out identical; */
+	dev_queue->qdisc_sleeping = qdisc;	/* plain stores, no rcu_assign_pointer() here */
+}
+
+void dev_init_scheduler(struct net_device *dev)
+{
+	dev->qdisc = &noop_qdisc;	/* dev_activate() tests for exactly this value */
+	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+
+	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);	/* dev is the timer argument */
+}
+
+static void shutdown_scheduler_queue(struct net_device *dev,
+				     struct netdev_queue *dev_queue,
+				     void *_qdisc_default)
+{
+	struct Qdisc *fallback = _qdisc_default;
+	struct Qdisc *old = dev_queue->qdisc_sleeping;
+
+	if (!old)
+		return;
+	/* Repoint both slots at the fallback before dropping the old qdisc */
+	rcu_assign_pointer(dev_queue->qdisc, fallback);
+	dev_queue->qdisc_sleeping = fallback;
+	qdisc_destroy(old);
+}
+
+void dev_shutdown(struct net_device *dev)
+{
+	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
+	qdisc_destroy(dev->qdisc);	/* drops the reference held through dev->qdisc */
+	dev->qdisc = &noop_qdisc;	/* same state dev_init_scheduler() sets up */
+
+	WARN_ON(timer_pending(&dev->watchdog_timer));	/* the watchdog is expected to be stopped by now */
+}
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
new file mode 100644
index 00000000..e1afe0c2
--- /dev/null
+++ b/net/sched/sch_gred.c
@@ -0,0 +1,605 @@
+/*
+ * net/sched/sch_gred.c	Generic Random Early Detection queue.
+ *
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:    J Hadi Salim (hadi@cyberus.ca) 1998-2002
+ *
+ *             991129: -  Bug fix with grio mode
+ *		       - a better sing. AvgQ mode with Grio(WRED)
+ *		       - A finer grained VQ dequeue based on suggestion
+ *		         from Ren Liu
+ *		       - More error checks
+ *
+ *  For all the glorious comments look at include/net/red.h
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/red.h>
+
+#define GRED_DEF_PRIO (MAX_DPs / 2)
+#define GRED_VQ_MASK (MAX_DPs - 1)
+
+struct gred_sched_data;
+struct gred_sched;
+
+struct gred_sched_data {
+	u32		limit;		/* HARD maximal queue length, compared against backlog */
+	u32      	DP;		/* this VQ's DP number, i.e. its index in tab[] */
+	u32		bytesin;	/* bytes seen on virtualQ so far*/
+	u32		packetsin;	/* packets seen on virtualQ so far*/
+	u32		backlog;	/* bytes on the virtualQ */
+	u8		prio;		/* the prio of this vq */
+
+	struct red_parms parms;		/* RED parameters and state (qavg, qidlestart, ...) */
+	struct red_stats stats;		/* mark/drop counters */
+};
+
+enum {
+	GRED_WRED_MODE = 1,	/* bit numbers used on gred_sched.flags */
+	GRED_RIO_MODE,
+};
+
+struct gred_sched {
+	struct gred_sched_data *tab[MAX_DPs];	/* virtual queues; NULL where none is configured */
+	unsigned long	flags;		/* GRED_WRED_MODE / GRED_RIO_MODE bits */
+	u32		red_flags;	/* tested for TC_RED_ECN and TC_RED_HARDDROP */
+	u32 		DPs;		/* number of tab[] slots in use */
+	u32 		def;		/* slot used when a packet's own DP has no VQ */
+	struct red_parms wred_set;	/* qavg/qidlestart copied into a VQ in WRED mode */
+};
+
+static inline int gred_wred_mode(struct gred_sched *table)
+{
+	return test_bit(GRED_WRED_MODE, &table->flags);	/* non-zero while WRED mode is on */
+}
+
+static inline void gred_enable_wred_mode(struct gred_sched *table)
+{
+	__set_bit(GRED_WRED_MODE, &table->flags);	/* turn WRED mode on */
+}
+
+static inline void gred_disable_wred_mode(struct gred_sched *table)
+{
+	__clear_bit(GRED_WRED_MODE, &table->flags);	/* turn WRED mode off */
+}
+
+static inline int gred_rio_mode(struct gred_sched *table)
+{
+	return test_bit(GRED_RIO_MODE, &table->flags);	/* non-zero while RIO mode is on */
+}
+
+static inline void gred_enable_rio_mode(struct gred_sched *table)
+{
+	__set_bit(GRED_RIO_MODE, &table->flags);	/* turn RIO mode on */
+}
+
+static inline void gred_disable_rio_mode(struct gred_sched *table)
+{
+	__clear_bit(GRED_RIO_MODE, &table->flags);	/* turn RIO mode off */
+}
+
+static inline int gred_wred_mode_check(struct Qdisc *sch)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	int i;
+
+	/* Really ugly O(n^2) but shouldn't be necessary too frequent. */
+	for (i = 0; i < table->DPs; i++) {
+		struct gred_sched_data *q = table->tab[i];
+		int n;
+
+		if (q == NULL)	/* unconfigured slot */
+			continue;
+
+		for (n = 0; n < table->DPs; n++)
+			if (table->tab[n] && table->tab[n] != q &&
+			    table->tab[n]->prio == q->prio)
+				return 1;	/* two different VQs share a prio */
+	}
+
+	return 0;	/* every configured VQ has its own prio */
+}
+
+static inline unsigned int gred_backlog(struct gred_sched *table,
+					struct gred_sched_data *q,
+					struct Qdisc *sch)
+{
+	/*
+	 * WRED mode measures the whole qdisc, otherwise only this VQ.
+	 */
+	return gred_wred_mode(table) ? sch->qstats.backlog : q->backlog;
+}
+
+static inline u16 tc_index_to_dp(struct sk_buff *skb)
+{
+	return skb->tc_index & GRED_VQ_MASK;	/* DP number = tc_index masked with MAX_DPs - 1 */
+}
+
+static inline void gred_load_wred_set(struct gred_sched *table,
+				      struct gred_sched_data *q)
+{
+	q->parms.qavg = table->wred_set.qavg;	/* overwrite the VQ's state with the shared values */
+	q->parms.qidlestart = table->wred_set.qidlestart;
+}
+
+static inline void gred_store_wred_set(struct gred_sched *table,
+				       struct gred_sched_data *q)
+{
+	table->wred_set.qavg = q->parms.qavg;	/* NOTE(review): qidlestart is loaded by gred_load_wred_set() but not stored back here - confirm intended */
+}
+
+static inline int gred_use_ecn(struct gred_sched *t)
+{
+	return t->red_flags & TC_RED_ECN;	/* non-zero (the raw flag bit) when set, not normalised to 1 */
+}
+
+static inline int gred_use_harddrop(struct gred_sched *t)
+{
+	return t->red_flags & TC_RED_HARDDROP;	/* non-zero (the raw flag bit) when set, not normalised to 1 */
+}
+
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct gred_sched_data *q = NULL;
+	struct gred_sched *t = qdisc_priv(sch);
+	unsigned long qavg = 0;	/* extra average contributed by other VQs (RIO mode only) */
+	u16 dp = tc_index_to_dp(skb);
+
+	if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {	/* no VQ for this DP: use the default */
+		dp = t->def;
+
+		q = t->tab[dp];
+		if (!q) {
+			/* Pass through packets not assigned to a DP
+			 * if no default DP has been configured. This
+			 * allows for DP flows to be left untouched.
+			 */
+			if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
+				return qdisc_enqueue_tail(skb, sch);
+			else
+				goto drop;
+		}
+
+		/* fix tc_index? --could be controversial but needed for
+		   requeueing */
+		skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
+	}
+
+	/* sum up all the qaves of prios < ours to get the new qave */
+	if (!gred_wred_mode(t) && gred_rio_mode(t)) {
+		int i;
+
+		for (i = 0; i < t->DPs; i++) {
+			if (t->tab[i] && t->tab[i]->prio < q->prio &&
+			    !red_is_idling(&t->tab[i]->parms))
+				qavg += t->tab[i]->parms.qavg;
+		}
+
+	}
+
+	q->packetsin++;		/* counted before the drop decision, so dropped packets are included */
+	q->bytesin += qdisc_pkt_len(skb);
+
+	if (gred_wred_mode(t))	/* WRED: work on the shared average */
+		gred_load_wred_set(t, q);
+
+	q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
+
+	if (red_is_idling(&q->parms))
+		red_end_of_idle_period(&q->parms);
+
+	if (gred_wred_mode(t))	/* WRED: publish the updated average again */
+		gred_store_wred_set(t, q);
+
+	switch (red_action(&q->parms, q->parms.qavg + qavg)) {
+	case RED_DONT_MARK:
+		break;
+
+	case RED_PROB_MARK:
+		sch->qstats.overlimits++;
+		if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {	/* ECN off or marking failed: drop */
+			q->stats.prob_drop++;
+			goto congestion_drop;
+		}
+
+		q->stats.prob_mark++;
+		break;
+
+	case RED_HARD_MARK:
+		sch->qstats.overlimits++;
+		if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+		    !INET_ECN_set_ce(skb)) {
+			q->stats.forced_drop++;
+			goto congestion_drop;
+		}
+		q->stats.forced_mark++;
+		break;
+	}
+
+	if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {	/* still within this VQ's hard byte limit */
+		q->backlog += qdisc_pkt_len(skb);
+		return qdisc_enqueue_tail(skb, sch);
+	}
+
+	q->stats.pdrop++;	/* hard-limit drop; falls through to the plain drop */
+drop:
+	return qdisc_drop(skb, sch);
+
+congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;	/* RED-decided drops report NET_XMIT_CN instead of qdisc_drop()'s value */
+}
+
+static struct sk_buff *gred_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	struct gred_sched *t = qdisc_priv(sch);
+
+	skb = qdisc_dequeue_head(sch);	/* all VQs share the qdisc's single queue */
+
+	if (skb) {
+		struct gred_sched_data *q;
+		u16 dp = tc_index_to_dp(skb);
+
+		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {	/* VQ vanished since enqueue */
+			if (net_ratelimit())
+				pr_warning("GRED: Unable to relocate VQ 0x%x "
+					   "after dequeue, screwing up "
+					   "backlog.\n", tc_index_to_dp(skb));
+		} else {
+			q->backlog -= qdisc_pkt_len(skb);
+
+			if (!q->backlog && !gred_wred_mode(t))	/* VQ drained: per-VQ idle period starts */
+				red_start_of_idle_period(&q->parms);
+		}
+
+		return skb;
+	}
+
+	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))	/* queue empty: shared WRED state goes idle */
+		red_start_of_idle_period(&t->wred_set);
+
+	return NULL;
+}
+
+static unsigned int gred_drop(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	struct gred_sched *t = qdisc_priv(sch);
+
+	skb = qdisc_dequeue_tail(sch);	/* the victim is the most recently queued packet */
+	if (skb) {
+		unsigned int len = qdisc_pkt_len(skb);
+		struct gred_sched_data *q;
+		u16 dp = tc_index_to_dp(skb);
+
+		if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {	/* VQ vanished since enqueue */
+			if (net_ratelimit())
+				pr_warning("GRED: Unable to relocate VQ 0x%x "
+					   "while dropping, screwing up "
+					   "backlog.\n", tc_index_to_dp(skb));
+		} else {
+			q->backlog -= len;
+			q->stats.other++;	/* accounted under the "other" drop counter */
+
+			if (!q->backlog && !gred_wred_mode(t))
+				red_start_of_idle_period(&q->parms);
+		}
+
+		qdisc_drop(skb, sch);
+		return len;	/* length of the dropped packet */
+	}
+
+	if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
+		red_start_of_idle_period(&t->wred_set);
+
+	return 0;	/* nothing was queued */
+
+}
+
+static void gred_reset(struct Qdisc *sch)
+{
+	int i;
+	struct gred_sched *t = qdisc_priv(sch);
+
+	qdisc_reset_queue(sch);	/* purge the shared packet queue first */
+
+	for (i = 0; i < t->DPs; i++) {
+		struct gred_sched_data *q = t->tab[i];
+
+		if (!q)	/* unconfigured slot */
+			continue;
+
+		red_restart(&q->parms);
+		q->backlog = 0;	/* limit, prio and the counters are left as they are */
+	}
+}
+
+static inline void gred_destroy_vq(struct gred_sched_data *q)
+{
+	kfree(q);	/* a VQ is one kzalloc()ed object (see gred_change_vq) */
+}
+
+static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct tc_gred_sopt *sopt;
+	int i;
+
+	if (dps == NULL)
+		return -EINVAL;
+
+	sopt = nla_data(dps);	/* payload is read as struct tc_gred_sopt, so dps must be the TCA_GRED_DPS attribute */
+
+	if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
+		return -EINVAL;
+
+	sch_tree_lock(sch);
+	table->DPs = sopt->DPs;
+	table->def = sopt->def_DP;
+	table->red_flags = sopt->flags;
+
+	/*
+	 * Every entry point to GRED is synchronized with the above code
+	 * and the DP is checked against DPs, i.e. shadowed VQs can no
+	 * longer be found so we can unlock right here.
+	 */
+	sch_tree_unlock(sch);
+
+	if (sopt->grio) {	/* NOTE(review): the mode bits below change after the tree lock was dropped */
+		gred_enable_rio_mode(table);
+		gred_disable_wred_mode(table);
+		if (gred_wred_mode_check(sch))	/* duplicate prios in RIO mode switch WRED on */
+			gred_enable_wred_mode(table);
+	} else {
+		gred_disable_rio_mode(table);
+		gred_disable_wred_mode(table);
+	}
+
+	for (i = table->DPs; i < MAX_DPs; i++) {	/* slots at or beyond the new DPs count */
+		if (table->tab[i]) {
+			pr_warning("GRED: Warning: Destroying "
+				   "shadowed VQ 0x%x\n", i);
+			gred_destroy_vq(table->tab[i]);
+			table->tab[i] = NULL;
+		}
+	}
+
+	return 0;
+}
+
+static inline int gred_change_vq(struct Qdisc *sch, int dp,
+				 struct tc_gred_qopt *ctl, int prio, u8 *stab)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct gred_sched_data *q;
+
+	if (table->tab[dp] == NULL) {	/* first use of this DP: create a zeroed VQ */
+		table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC);	/* gred_change() calls this with the tree lock held */
+		if (table->tab[dp] == NULL)
+			return -ENOMEM;
+	}
+
+	q = table->tab[dp];
+	q->DP = dp;
+	q->prio = prio;
+	q->limit = ctl->limit;
+
+	if (q->backlog == 0)	/* VQ is empty (always true for a freshly created one) */
+		red_end_of_idle_period(&q->parms);
+
+	red_set_parms(&q->parms,
+		      ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
+		      ctl->Scell_log, stab);
+
+	return 0;	/* dp itself is not range-checked here; gred_change() does that */
+}
+
+static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
+	[TCA_GRED_PARMS]	= { .len = sizeof(struct tc_gred_qopt) },	/* read as struct tc_gred_qopt */
+	[TCA_GRED_STAB]		= { .len = 256 },	/* table handed to red_set_parms() */
+	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) },	/* read as struct tc_gred_sopt */
+};
+
+static int gred_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct tc_gred_qopt *ctl;
+	struct nlattr *tb[TCA_GRED_MAX + 1];
+	int err, prio = GRED_DEF_PRIO;	/* prio stays at the default outside RIO mode */
+	u8 *stab;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL)	/* table-wide change, no VQ given */
+		return gred_change_table_def(sch, tb[TCA_GRED_DPS]);	/* pass the DPS attribute, not the enclosing nest; NULL is rejected there */
+
+	if (tb[TCA_GRED_PARMS] == NULL ||	/* a VQ change needs both attributes */
+	    tb[TCA_GRED_STAB] == NULL)
+		return -EINVAL;
+
+	err = -EINVAL;
+	ctl = nla_data(tb[TCA_GRED_PARMS]);
+	stab = nla_data(tb[TCA_GRED_STAB]);
+
+	if (ctl->DP >= table->DPs)	/* VQ index must lie inside the configured table */
+		goto errout;
+
+	if (gred_rio_mode(table)) {
+		if (ctl->prio == 0) {	/* no prio supplied: inherit from the default VQ if it exists */
+			int def_prio = GRED_DEF_PRIO;
+
+			if (table->tab[table->def])
+				def_prio = table->tab[table->def]->prio;
+
+			printk(KERN_DEBUG "GRED: DP %u does not have a prio "
+			       "setting default to %d\n", ctl->DP, def_prio);
+
+			prio = def_prio;
+		} else
+			prio = ctl->prio;
+	}
+
+	sch_tree_lock(sch);
+
+	err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
+	if (err < 0)
+		goto errout_locked;
+
+	if (gred_rio_mode(table)) {	/* prios may now collide: re-evaluate WRED mode */
+		gred_disable_wred_mode(table);
+		if (gred_wred_mode_check(sch))
+			gred_enable_wred_mode(table);
+	}
+
+	err = 0;
+
+errout_locked:
+	sch_tree_unlock(sch);
+errout:
+	return err;
+}
+
+static int gred_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct nlattr *tb[TCA_GRED_MAX + 1];
+	int err;
+
+	if (opt == NULL)	/* options are mandatory at creation time */
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])	/* per-VQ settings are rejected here; only gred_change() takes them */
+		return -EINVAL;
+
+	return gred_change_table_def(sch, tb[TCA_GRED_DPS]);	/* a missing DPS attribute yields -EINVAL there */
+}
+
+static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct nlattr *parms, *opts = NULL;
+	int i;
+	struct tc_gred_sopt sopt = {
+		.DPs	= table->DPs,
+		.def_DP	= table->def,
+		.grio	= gred_rio_mode(table),
+		.flags	= table->red_flags,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+	NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);	/* macro jumps to nla_put_failure on error */
+	parms = nla_nest_start(skb, TCA_GRED_PARMS);
+	if (parms == NULL)
+		goto nla_put_failure;
+
+	for (i = 0; i < MAX_DPs; i++) {	/* one record per slot, configured or not */
+		struct gred_sched_data *q = table->tab[i];
+		struct tc_gred_qopt opt;
+
+		memset(&opt, 0, sizeof(opt));
+
+		if (!q) {
+			/* hack -- fix at some point with proper message
+			   This is how we indicate to tc that there is no VQ
+			   at this DP */
+
+			opt.DP = MAX_DPs + i;
+			goto append_opt;
+		}
+
+		opt.limit	= q->limit;
+		opt.DP		= q->DP;
+		opt.backlog	= q->backlog;
+		opt.prio	= q->prio;
+		opt.qth_min	= q->parms.qth_min >> q->parms.Wlog;	/* thresholds are reported shifted right by Wlog */
+		opt.qth_max	= q->parms.qth_max >> q->parms.Wlog;
+		opt.Wlog	= q->parms.Wlog;
+		opt.Plog	= q->parms.Plog;
+		opt.Scell_log	= q->parms.Scell_log;
+		opt.other	= q->stats.other;
+		opt.early	= q->stats.prob_drop;
+		opt.forced	= q->stats.forced_drop;
+		opt.pdrop	= q->stats.pdrop;
+		opt.packets	= q->packetsin;
+		opt.bytesin	= q->bytesin;
+
+		if (gred_wred_mode(table))	/* side effect: overwrites q->parms.qavg/qidlestart */
+			gred_load_wred_set(table, q);
+
+		opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
+
+append_opt:
+		if (nla_append(skb, sizeof(opt), &opt) < 0)	/* raw struct appended, no per-record attribute header */
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(skb, parms);
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);	/* opts may still be NULL if the first nest failed */
+	return -EMSGSIZE;
+}
+
+static void gred_destroy(struct Qdisc *sch)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	int i;
+
+	for (i = 0; i < table->DPs; i++) {	/* slots >= DPs were already emptied by gred_change_table_def() */
+		if (table->tab[i])
+			gred_destroy_vq(table->tab[i]);	/* tab[i] is not cleared afterwards */
+	}
+}
+
+static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
+	.id		=	"gred",
+	.priv_size	=	sizeof(struct gred_sched),	/* private area returned by qdisc_priv() */
+	.enqueue	=	gred_enqueue,
+	.dequeue	=	gred_dequeue,
+	.peek		=	qdisc_peek_head,	/* generic helper, no GRED-specific peek */
+	.drop		=	gred_drop,
+	.init		=	gred_init,
+	.reset		=	gred_reset,
+	.destroy	=	gred_destroy,
+	.change		=	gred_change,
+	.dump		=	gred_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init gred_module_init(void)
+{
+	return register_qdisc(&gred_qdisc_ops);	/* registration result is the module init result */
+}
+
+static void __exit gred_module_exit(void)
+{
+	unregister_qdisc(&gred_qdisc_ops);	/* return value, if any, is not checked */
+}
+
+module_init(gred_module_init)
+module_exit(gred_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
new file mode 100644
index 00000000..6488e642
--- /dev/null
+++ b/net/sched/sch_hfsc.c
@@ -0,0 +1,1744 @@
+/*
+ * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * 2003-10-17 - Ported from altq
+ */
+/*
+ * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation is hereby granted (including for commercial or
+ * for-profit use), provided that both the copyright notice and this
+ * permission notice appear in all copies of the software, derivative
+ * works, or modified versions, and any portions thereof.
+ *
+ * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF
+ * WHICH MAY HAVE SERIOUS CONSEQUENCES.  CARNEGIE MELLON PROVIDES THIS
+ * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * Carnegie Mellon encourages (but does not require) users of this
+ * software to return any improvements or extensions that they make,
+ * and to grant Carnegie Mellon the rights to redistribute these
+ * changes without encumbrance.
+ */
+/*
+ * H-FSC is described in Proceedings of SIGCOMM'97,
+ * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing,
+ * Real-Time and Priority Service"
+ * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng.
+ *
+ * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing.
+ * when a class has an upperlimit, the fit-time is computed from the
+ * upperlimit service curve.  the link-sharing scheduler does not schedule
+ * a class whose fit-time exceeds the current time.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/init.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <asm/div64.h>
+
+/*
+ * kernel internal service curve representation:
+ *   coordinates are given by 64 bit unsigned integers.
+ *   x-axis: unit is clock count.
+ *   y-axis: unit is byte.
+ *
+ *   The service curve parameters are converted to the internal
+ *   representation. The slope values are scaled to avoid overflow.
+ *   the inverse slope values as well as the y-projection of the 1st
+ *   segment are kept in order to avoid 64-bit divide operations
+ *   that are expensive on 32-bit architectures.
+ */
+
+struct internal_sc {
+	u64	sm1;	/* 1st segment slope: (bytes/psched_us) << SM_SHIFT */
+	u64	ism1;	/* 1st segment inverse slope: (psched_us/byte) << ISM_SHIFT */
+	u64	dx;	/* the x-projection of the 1st segment, in psched_us */
+	u64	dy;	/* the y-projection of the 1st segment, in bytes */
+	u64	sm2;	/* 2nd segment slope, scaled like sm1 */
+	u64	ism2;	/* 2nd segment inverse slope, scaled like ism1 */
+};
+
+/* runtime service curve: the internal_sc fields plus a starting point (x, y) */
+struct runtime_sc {
+	u64	x;	/* current starting position on x-axis (clock count) */
+	u64	y;	/* current starting position on y-axis (bytes) */
+	u64	sm1;	/* scaled slope of the 1st segment */
+	u64	ism1;	/* scaled inverse-slope of the 1st segment */
+	u64	dx;	/* the x-projection of the 1st segment */
+	u64	dy;	/* the y-projection of the 1st segment */
+	u64	sm2;	/* scaled slope of the 2nd segment */
+	u64	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+enum hfsc_class_flags {
+	HFSC_RSC = 0x1,	/* cl_rsc (real-time service curve) has been set */
+	HFSC_FSC = 0x2,	/* cl_fsc (fair service curve) has been set */
+	HFSC_USC = 0x4	/* cl_usc (upperlimit service curve) has been set */
+};
+
+struct hfsc_class {
+	struct Qdisc_class_common cl_common;	/* classid + class hash entry */
+	unsigned int	refcnt;		/* usage count */
+
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue qstats;
+	struct gnet_stats_rate_est rate_est;
+	unsigned int	level;		/* class level in hierarchy, 0 = leaf */
+	struct tcf_proto *filter_list;	/* filter list */
+	unsigned int	filter_cnt;	/* filters bound to this class */
+
+	struct hfsc_sched *sched;	/* scheduler data */
+	struct hfsc_class *cl_parent;	/* parent class */
+	struct list_head siblings;	/* sibling classes */
+	struct list_head children;	/* child classes */
+	struct Qdisc	*qdisc;		/* leaf qdisc */
+
+	struct rb_node el_node;		/* qdisc's eligible tree member */
+	struct rb_root vt_tree;		/* active children sorted by cl_vt */
+	struct rb_node vt_node;		/* parent's vt_tree member */
+	struct rb_root cf_tree;		/* active children sorted by cl_f */
+	struct rb_node cf_node;		/* parent's cf_tree member */
+	struct list_head dlist;		/* drop list member */
+
+	u64	cl_total;		/* total work in bytes */
+	u64	cl_cumul;		/* cumulative work in bytes done by
+					   real-time criteria */
+
+	u64	cl_d;			/* deadline */
+	u64	cl_e;			/* eligible time */
+	u64	cl_vt;			/* virtual time */
+	u64	cl_f;			/* time when this class will fit for
+					   link-sharing, max(myf, cfmin) */
+	u64	cl_myf;			/* my fit-time (calculated from this
+					   class's own upperlimit curve) */
+	u64	cl_myfadj;		/* my fit-time adjustment (to cancel
+					   history dependence) */
+	u64	cl_cfmin;		/* earliest children's fit-time (used
+					   with cl_myf to obtain cl_f) */
+	u64	cl_cvtmin;		/* minimal virtual time among the
+					   children fit for link-sharing
+					   (monotonic within a period) */
+	u64	cl_vtadj;		/* intra-period cumulative vt
+					   adjustment */
+	u64	cl_vtoff;		/* inter-period cumulative vt offset */
+	u64	cl_cvtmax;		/* max child's vt in the last period */
+	u64	cl_cvtoff;		/* cumulative cvtmax of all periods */
+	u64	cl_pcvtoff;		/* parent's cvtoff at initialization
+					   time */
+
+	struct internal_sc cl_rsc;	/* internal real-time service curve */
+	struct internal_sc cl_fsc;	/* internal fair service curve */
+	struct internal_sc cl_usc;	/* internal upperlimit service curve */
+	struct runtime_sc cl_deadline;	/* deadline curve */
+	struct runtime_sc cl_eligible;	/* eligible curve */
+	struct runtime_sc cl_virtual;	/* virtual curve */
+	struct runtime_sc cl_ulimit;	/* upperlimit curve */
+
+	unsigned long	cl_flags;	/* which curves are valid (HFSC_*) */
+	unsigned long	cl_vtperiod;	/* vt period sequence number */
+	unsigned long	cl_parentperiod;/* parent's vt period sequence number */
+	unsigned long	cl_nactive;	/* number of active children */
+};
+
+struct hfsc_sched {
+	u16	defcls;				/* minor id of the default class */
+	struct hfsc_class root;			/* root class (embedded here) */
+	struct Qdisc_class_hash clhash;		/* class hash, keyed by classid */
+	struct rb_root eligible;		/* eligible tree (el_node members) */
+	struct list_head droplist;		/* active leaf class list (for
+						   dropping) */
+	struct qdisc_watchdog watchdog;		/* watchdog timer */
+};
+
+#define	HT_INFINITY	0xffffffffffffffffULL	/* infinite time value */
+
+
+/*
+ * eligible tree holds backlogged classes being sorted by their eligible times.
+ * there is one eligible tree per hfsc instance.
+ */
+
+/* link cl into the scheduler's eligible tree, keyed by cl_e; ties go right */
+static void eltree_insert(struct hfsc_class *cl)
+{
+	struct rb_root *root = &cl->sched->eligible;
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *above = NULL;
+
+	while (*link != NULL) {
+		struct hfsc_class *other;
+
+		above = *link;
+		other = rb_entry(above, struct hfsc_class, el_node);
+		link = (cl->cl_e < other->cl_e) ? &above->rb_left
+						: &above->rb_right;
+	}
+	rb_link_node(&cl->el_node, above, link);
+	rb_insert_color(&cl->el_node, root);
+}
+
+/* unlink cl from the scheduler's eligible tree */
+static inline void eltree_remove(struct hfsc_class *cl)
+{
+	rb_erase(&cl->el_node, &cl->sched->eligible);
+}
+
+/* re-sort cl in the eligible tree: erase it, then insert it again by cl_e */
+static inline void eltree_update(struct hfsc_class *cl)
+{
+	rb_erase(&cl->el_node, &cl->sched->eligible);
+	eltree_insert(cl);
+}
+
+/* of the classes with cl_e <= cur_time, return the one with the smallest cl_d */
+static inline struct hfsc_class *
+eltree_get_mindl(struct hfsc_sched *q, u64 cur_time)
+{
+	struct rb_node *node = rb_first(&q->eligible);
+	struct hfsc_class *cand, *best = NULL;
+
+	for (; node != NULL; node = rb_next(node)) {
+		cand = rb_entry(node, struct hfsc_class, el_node);
+		if (cand->cl_e > cur_time)
+			break;	/* tree is sorted by cl_e; the rest is later */
+		if (best == NULL || cand->cl_d < best->cl_d)
+			best = cand;
+	}
+	return best;
+}
+
+/*
+ * return the first entry of the eligible tree, i.e. the class with the
+ * minimum eligible time, or NULL when the tree is empty
+ */
+static inline struct hfsc_class *
+eltree_get_minel(struct hfsc_sched *q)
+{
+	struct rb_node *first = rb_first(&q->eligible);
+
+	return first ? rb_entry(first, struct hfsc_class, el_node) : NULL;
+}
+
+/*
+ * vttree holds backlogged child classes sorted by their virtual time.
+ * each intermediate class has one vttree.  insert cl into its parent's
+ * vttree, keyed by cl_vt; a class with an equal key goes to the right.
+ */
+static void vttree_insert(struct hfsc_class *cl)
+{
+	struct rb_root *root = &cl->cl_parent->vt_tree;
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *above = NULL;
+
+	while (*link != NULL) {
+		struct hfsc_class *other;
+
+		above = *link;
+		other = rb_entry(above, struct hfsc_class, vt_node);
+		link = (cl->cl_vt < other->cl_vt) ? &above->rb_left
+						  : &above->rb_right;
+	}
+	rb_link_node(&cl->vt_node, above, link);
+	rb_insert_color(&cl->vt_node, root);
+}
+
+/* unlink cl from its parent's vt_tree */
+static inline void vttree_remove(struct hfsc_class *cl)
+{
+	rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+/* re-sort cl in its parent's vt_tree: erase it, then insert it again by cl_vt */
+static inline void vttree_update(struct hfsc_class *cl)
+{
+	rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
+	vttree_insert(cl);
+}
+
+static inline struct hfsc_class *
+vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
+{
+	struct rb_node *node = rb_first(&cl->vt_tree);
+	struct hfsc_class *child;
+
+	for (; node != NULL; node = rb_next(node)) {
+		child = rb_entry(node, struct hfsc_class, vt_node);
+		if (cur_time >= child->cl_f)
+			return child;	/* first child in vt order that fits */
+	}
+	return NULL;	/* no child has cl_f <= cur_time */
+}
+
+/*
+ * get the leaf class with the minimum vt in the hierarchy below cl
+ */
+static struct hfsc_class *
+vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
+{
+	struct hfsc_class *child;
+
+	/* if the starting class's cfmin is bigger than cur_time nothing to do */
+	if (cur_time < cl->cl_cfmin)
+		return NULL;
+
+	for (; cl->level > 0; cl = child) {
+		child = vttree_firstfit(cl, cur_time);
+		if (child == NULL)
+			return NULL;
+		/* raise the parent's cl_cvtmin to the chosen child's vt */
+		if (child->cl_vt > child->cl_parent->cl_cvtmin)
+			child->cl_parent->cl_cvtmin = child->cl_vt;
+	}
+	return cl;
+}
+
+/* link cl into its parent's cf_tree, keyed by cl_f; ties go to the right */
+static void cftree_insert(struct hfsc_class *cl)
+{
+	struct rb_root *root = &cl->cl_parent->cf_tree;
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *above = NULL;
+
+	while (*link != NULL) {
+		struct hfsc_class *other;
+
+		above = *link;
+		other = rb_entry(above, struct hfsc_class, cf_node);
+		link = (cl->cl_f < other->cl_f) ? &above->rb_left
+						: &above->rb_right;
+	}
+	rb_link_node(&cl->cf_node, above, link);
+	rb_insert_color(&cl->cf_node, root);
+}
+
+/* unlink cl from its parent's cf_tree */
+static inline void cftree_remove(struct hfsc_class *cl)
+{
+	rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+/* re-sort cl in its parent's cf_tree: erase it, then insert it again by cl_f */
+static inline void cftree_update(struct hfsc_class *cl)
+{
+	rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
+	cftree_insert(cl);
+}
+
+/*
+ * service curve support functions
+ *
+ *  external service curve parameters
+ *	m: bps
+ *	d: us
+ *  internal service curve parameters
+ *	sm: (bytes/psched_us) << SM_SHIFT
+ *	ism: (psched_us/byte) << ISM_SHIFT
+ *	dx: psched_us
+ *
+ * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us.
+ *
+ * sm and ism are scaled in order to keep effective digits.
+ * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
+ * digits in decimal using the following table.
+ *
+ *  bits/sec      100Kbps     1Mbps     10Mbps     100Mbps    1Gbps
+ *  ------------+-------------------------------------------------------
+ *  bytes/1.024us 12.8e-3    128e-3     1280e-3    12800e-3   128000e-3
+ *
+ *  1.024us/byte  78.125     7.8125     0.78125    0.078125   0.0078125
+ *
+ * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18.
+ */
+#define	SM_SHIFT	(30 - PSCHED_SHIFT)	/* scaling shift applied to sm */
+#define	ISM_SHIFT	(8 + PSCHED_SHIFT)	/* scaling shift applied to ism */
+
+#define	SM_MASK		((1ULL << SM_SHIFT) - 1)	/* low SM_SHIFT bits */
+#define	ISM_MASK	((1ULL << ISM_SHIFT) - 1)	/* low ISM_SHIFT bits */
+
+/*
+ * compute
+ *	y = x * sm >> SM_SHIFT
+ * handling the bits of x above and below SM_SHIFT separately so that
+ * the intermediate products are less likely to overflow
+ */
+static inline u64 seg_x2y(u64 x, u64 sm)
+{
+	u64 hi = x >> SM_SHIFT;
+	u64 lo = x & SM_MASK;
+
+	return hi * sm + ((lo * sm) >> SM_SHIFT);
+}
+
+/*
+ * x = y * ism >> ISM_SHIFT; y == 0 gives 0, infinite ism gives HT_INFINITY
+ */
+static inline u64 seg_y2x(u64 y, u64 ism)
+{
+	u64 hi, lo;
+
+	if (y == 0)
+		return 0;
+	if (ism == HT_INFINITY)
+		return HT_INFINITY;
+	hi = y >> ISM_SHIFT;
+	lo = y & ISM_MASK;
+	return hi * ism + ((lo * ism) >> ISM_SHIFT);
+}
+
+/*
+ * convert m (bps) into sm ((bytes/psched us) << SM_SHIFT), rounding up
+ */
+static u64 m2sm(u32 m)
+{
+	u64 scaled = (u64)m << SM_SHIFT;
+
+	scaled += PSCHED_TICKS_PER_SEC - 1;
+	do_div(scaled, PSCHED_TICKS_PER_SEC);
+	return scaled;
+}
+
+/*
+ * convert m (bps) into ism ((psched us/byte) << ISM_SHIFT), rounding up.
+ * m == 0 has no finite inverse slope and yields HT_INFINITY.
+ */
+static u64 m2ism(u32 m)
+{
+	u64 inv;
+
+	if (m == 0)
+		return HT_INFINITY;
+	inv = (u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT;
+	inv += m - 1;
+	do_div(inv, m);
+	return inv;
+}
+
+/*
+ * convert d (us) into dx (psched us), rounding up
+ */
+static u64 d2dx(u32 d)
+{
+	u64 ticks = (u64)d * PSCHED_TICKS_PER_SEC;
+
+	ticks += USEC_PER_SEC - 1;
+	do_div(ticks, USEC_PER_SEC);
+	return ticks;
+}
+
+/*
+ * convert sm (bytes/psched us) into m (bps); the result is cast to u32
+ */
+static u32 sm2m(u64 sm)
+{
+	u64 rate = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT;
+
+	return (u32)rate;
+}
+
+/*
+ * convert dx (psched us) into d (us); the result is cast to u32
+ */
+static u32 dx2d(u64 dx)
+{
+	u64 usecs = dx * USEC_PER_SEC;
+
+	do_div(usecs, PSCHED_TICKS_PER_SEC);
+	return (u32)usecs;
+}
+
+/* fill *isc with the internal (scaled) form of the service curve *sc */
+static void sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
+{
+	isc->dx = d2dx(sc->d);
+	isc->sm1 = m2sm(sc->m1);
+	isc->dy = seg_x2y(isc->dx, isc->sm1);
+	isc->ism1 = m2ism(sc->m1);
+	isc->sm2 = m2sm(sc->m2);
+	isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * set up rtsc as a copy of the internal service curve isc whose
+ * starting point is (x, y).
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+	rtsc->sm1 = isc->sm1;
+	rtsc->ism1 = isc->ism1;
+	rtsc->dx = isc->dx;
+	rtsc->dy = isc->dy;
+	rtsc->sm2 = isc->sm2;
+	rtsc->ism2 = isc->ism2;
+	rtsc->x = x;
+	rtsc->y = y;
+}
+
+/*
+ * calculate the x value at which the runtime service curve reaches the
+ * given y value.  a y below the curve's starting point maps to the
+ * starting x.
+ */
+static u64 rtsc_y2x(struct runtime_sc *rtsc, u64 y)
+{
+	u64 off;
+
+	/* y lies before the start of the curve */
+	if (y < rtsc->y)
+		return rtsc->x;
+
+	if (y > rtsc->y + rtsc->dy) {
+		/* x belongs to the 2nd segment */
+		off = seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+		return rtsc->x + rtsc->dx + off;
+	}
+
+	/* x belongs to the 1st segment; with dy == 0 use its end, x + dx */
+	if (rtsc->dy == 0)
+		return rtsc->x + rtsc->dx;
+	return rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+}
+
+/* calculate the y value of the runtime service curve at the given x */
+static u64 rtsc_x2y(struct runtime_sc *rtsc, u64 x)
+{
+	u64 dx;
+
+	if (x <= rtsc->x)
+		return rtsc->y;
+
+	dx = x - rtsc->x;
+	if (x <= rtsc->x + rtsc->dx)
+		/* y belongs to the 1st segment */
+		return rtsc->y + seg_x2y(dx, rtsc->sm1);
+
+	/* y belongs to the 2nd segment */
+	return rtsc->y + rtsc->dy + seg_x2y(dx - rtsc->dx, rtsc->sm2);
+}
+
+/*
+ * update rtsc in place to the minimum of the current runtime service curve
+ * and the service curve isc starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+	u64 y1, y2, dx, dy;
+	u32 dsm;	/* NOTE(review): 32-bit copy of sm1 - sm2, used as divisor below — confirm it cannot be 0 */
+
+	if (isc->sm1 <= isc->sm2) {
+		/* service curve is convex */
+		y1 = rtsc_x2y(rtsc, x);
+		if (y1 < y)
+			/* the current rtsc is smaller */
+			return;
+		rtsc->x = x;	/* only the starting point moves; segments are kept */
+		rtsc->y = y;
+		return;
+	}
+
+	/*
+	 * service curve is concave
+	 * compute the two y values of the current rtsc
+	 *	y1: at x
+	 *	y2: at (x + dx)
+	 */
+	y1 = rtsc_x2y(rtsc, x);
+	if (y1 <= y) {
+		/* rtsc is below isc, no change to rtsc */
+		return;
+	}
+
+	y2 = rtsc_x2y(rtsc, x + isc->dx);
+	if (y2 >= y + isc->dy) {
+		/* rtsc is above isc, replace rtsc by isc */
+		rtsc->x = x;
+		rtsc->y = y;
+		rtsc->dx = isc->dx;
+		rtsc->dy = isc->dy;
+		return;
+	}
+
+	/*
+	 * the two curves intersect
+	 * compute the offsets (dx, dy) using the reverse
+	 * function of seg_x2y()
+	 *	seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+	 */
+	dx = (y1 - y) << SM_SHIFT;
+	dsm = isc->sm1 - isc->sm2;	/* sm1 > sm2 on this (concave) path */
+	do_div(dx, dsm);
+	/*
+	 * check if (x, y1) belongs to the 1st segment of rtsc.
+	 * if so, add the offset.
+	 */
+	if (rtsc->x + rtsc->dx > x)
+		dx += rtsc->x + rtsc->dx - x;
+	dy = seg_x2y(dx, isc->sm1);
+
+	rtsc->x = x;
+	rtsc->y = y;
+	rtsc->dx = dx;
+	rtsc->dy = dy;
+}
+
+/* refresh the deadline/eligible curves, set cl_e and cl_d, queue cl by cl_e */
+static void init_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+	struct runtime_sc *dl = &cl->cl_deadline;
+	struct runtime_sc *el = &cl->cl_eligible;
+	struct internal_sc *rsc = &cl->cl_rsc;
+
+	/* update the deadline curve, restarting rsc at the current time */
+	rtsc_min(dl, rsc, psched_get_time(), cl->cl_cumul);
+
+	/*
+	 * update the eligible curve.
+	 * for concave, it is equal to the deadline curve.
+	 * for convex, it is a linear curve with slope m2.
+	 */
+	*el = *dl;
+	if (rsc->sm1 <= rsc->sm2)
+		el->dx = el->dy = 0;
+
+	/* compute e and d */
+	cl->cl_e = rtsc_y2x(el, cl->cl_cumul);
+	cl->cl_d = rtsc_y2x(dl, cl->cl_cumul + next_len);
+
+	eltree_insert(cl);
+}
+
+/* recompute cl_e and cl_d from the existing curves, then re-sort cl by cl_e */
+static void update_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+	eltree_remove(cl);
+	eltree_insert(cl);
+}
+
+/* recompute only the deadline cl_d, for next_len bytes beyond cl_cumul */
+static inline void update_d(struct hfsc_class *cl, unsigned int next_len)
+{
+	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+}
+
+/*
+ * set cl_cfmin to the cl_f of the first entry of cl's cf_tree, or to 0
+ * when the tree is empty
+ */
+static inline void update_cfmin(struct hfsc_class *cl)
+{
+	struct rb_node *first = rb_first(&cl->cf_tree);
+	u64 fmin = 0;
+
+	if (first != NULL)
+		fmin = rb_entry(first, struct hfsc_class, cf_node)->cl_f;
+	cl->cl_cfmin = fmin;
+}
+
+static void
+init_vf(struct hfsc_class *cl, unsigned int len)	/* len is not used in the body */
+{
+	struct hfsc_class *max_cl;
+	struct rb_node *n;
+	u64 vt, f, cur_time;
+	int go_active;
+
+	cur_time = 0;	/* 0 = clock not read yet; fetched on first use below */
+	go_active = 1;
+	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {	/* walk up, stopping before the root */
+		if (go_active && cl->cl_nactive++ == 0)	/* level was idle until now */
+			go_active = 1;
+		else
+			go_active = 0;
+
+		if (go_active) {
+			n = rb_last(&cl->cl_parent->vt_tree);
+			if (n != NULL) {
+				max_cl = rb_entry(n, struct hfsc_class, vt_node);
+				/*
+				 * set vt to the average of the min and max
+				 * classes.  if the parent's period didn't
+				 * change, don't decrease vt of the class.
+				 */
+				vt = max_cl->cl_vt;
+				if (cl->cl_parent->cl_cvtmin != 0)
+					vt = (cl->cl_parent->cl_cvtmin + vt)/2;
+
+				if (cl->cl_parent->cl_vtperiod !=
+				    cl->cl_parentperiod || vt > cl->cl_vt)
+					cl->cl_vt = vt;
+			} else {
+				/*
+				 * first child for a new parent backlog period.
+				 * add parent's cvtmax to cvtoff to make a new
+				 * vt (vtoff + vt) larger than the vt in the
+				 * last period for all children.
+				 */
+				vt = cl->cl_parent->cl_cvtmax;
+				cl->cl_parent->cl_cvtoff += vt;
+				cl->cl_parent->cl_cvtmax = 0;
+				cl->cl_parent->cl_cvtmin = 0;
+				cl->cl_vt = 0;
+			}
+
+			cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
+							cl->cl_pcvtoff;
+
+			/* update the virtual curve */
+			vt = cl->cl_vt + cl->cl_vtoff;
+			rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
+						      cl->cl_total);
+			if (cl->cl_virtual.x == vt) {
+				cl->cl_virtual.x -= cl->cl_vtoff;
+				cl->cl_vtoff = 0;
+			}
+			cl->cl_vtadj = 0;
+
+			cl->cl_vtperiod++;  /* increment vt period */
+			cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
+			if (cl->cl_parent->cl_nactive == 0)
+				cl->cl_parentperiod++;
+			cl->cl_f = 0;
+
+			vttree_insert(cl);
+			cftree_insert(cl);
+
+			if (cl->cl_flags & HFSC_USC) {
+				/* class has upper limit curve */
+				if (cur_time == 0)
+					cur_time = psched_get_time();
+
+				/* update the ulimit curve */
+				rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
+					 cl->cl_total);
+				/* compute myf */
+				cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
+						      cl->cl_total);
+				cl->cl_myfadj = 0;
+			}
+		}
+
+		f = max(cl->cl_myf, cl->cl_cfmin);	/* fit-time: max(myf, cfmin) */
+		if (f != cl->cl_f) {
+			cl->cl_f = f;
+			cftree_update(cl);
+		}
+		update_cfmin(cl->cl_parent);
+	}
+}
+
+static void
+update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)	/* cur_time: only read by the #if 0 code */
+{
+	u64 f; /* , myf_bound, delta; */
+	int go_passive = 0;
+
+	if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)	/* empty queue on a class with a fair curve */
+		go_passive = 1;
+
+	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {	/* walk up, stopping before the root */
+		cl->cl_total += len;
+
+		if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
+			continue;
+
+		if (go_passive && --cl->cl_nactive == 0)	/* last active child just left */
+			go_passive = 1;
+		else
+			go_passive = 0;
+
+		if (go_passive) {
+			/* no more active child, going passive */
+
+			/* update cvtmax of the parent class */
+			if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
+				cl->cl_parent->cl_cvtmax = cl->cl_vt;
+
+			/* remove this class from the vt tree */
+			vttree_remove(cl);
+
+			cftree_remove(cl);
+			update_cfmin(cl->cl_parent);
+
+			continue;
+		}
+
+		/*
+		 * update vt and f
+		 */
+		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+			    - cl->cl_vtoff + cl->cl_vtadj;
+
+		/*
+		 * if vt of the class is smaller than cvtmin,
+		 * the class was skipped in the past due to non-fit.
+		 * if so, we need to adjust vtadj.
+		 */
+		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+			cl->cl_vt = cl->cl_parent->cl_cvtmin;
+		}
+
+		/* update the vt tree */
+		vttree_update(cl);
+
+		if (cl->cl_flags & HFSC_USC) {
+			cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
+							      cl->cl_total);
+#if 0
+			/*
+			 * This code causes classes to stay way under their
+			 * limit when multiple classes are used at gigabit
+			 * speed. needs investigation. -kaber
+			 */
+			/*
+			 * if myf lags behind by more than one clock tick
+			 * from the current time, adjust myfadj to prevent
+			 * a rate-limited class from going greedy.
+			 * in a steady state under rate-limiting, myf
+			 * fluctuates within one clock tick.
+			 */
+			myf_bound = cur_time - PSCHED_JIFFIE2US(1);
+			if (cl->cl_myf < myf_bound) {
+				delta = cur_time - cl->cl_myf;
+				cl->cl_myfadj += delta;
+				cl->cl_myf += delta;
+			}
+#endif
+		}
+
+		f = max(cl->cl_myf, cl->cl_cfmin);	/* fit-time: max(myf, cfmin) */
+		if (f != cl->cl_f) {
+			cl->cl_f = f;
+			cftree_update(cl);
+			update_cfmin(cl->cl_parent);
+		}
+	}
+}
+
+static void set_active(struct hfsc_class *cl, unsigned int len)
+{
+	const unsigned long curves = cl->cl_flags;
+
+	if (curves & HFSC_RSC)
+		init_ed(cl, len);	/* real-time curve: eligible/deadline */
+	if (curves & HFSC_FSC)
+		init_vf(cl, len);	/* fair curve: virtual and fit times */
+	list_add_tail(&cl->dlist, &cl->sched->droplist);
+}
+
+/*
+ * take cl off the drop list and, if it has a real-time curve, off the
+ * eligible tree.  vttree is handled in update_vf(), so update_vf(cl, 0, 0)
+ * needs to be called explicitly to remove a class from vttree.
+ */
+static void set_passive(struct hfsc_class *cl)
+{
+	const int has_rsc = (cl->cl_flags & HFSC_RSC) != 0;
+
+	list_del(&cl->dlist);
+	if (has_rsc)
+		eltree_remove(cl);
+}
+
+/*
+ * return qdisc_pkt_len() of the skb that sch's ->peek() op returns.
+ * a NULL peek result is reported via qdisc_warn_nonwc() and gives 0.
+ */
+static unsigned int qdisc_peek_len(struct Qdisc *sch)
+{
+	struct sk_buff *head = sch->ops->peek(sch);
+
+	if (head != NULL)
+		return qdisc_pkt_len(head);
+
+	/* peek came back empty */
+	qdisc_warn_nonwc("qdisc_peek_len", sch);
+	return 0;
+}
+
+/* reset cl's leaf qdisc and pass its old qlen to qdisc_tree_decrease_qlen() */
+static void hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
+{
+	unsigned int purged = cl->qdisc->q.qlen;
+
+	qdisc_reset(cl->qdisc);
+	qdisc_tree_decrease_qlen(cl->qdisc, purged);
+}
+
+/* from cl up to the root: level = 1 + highest child level, 0 with no child */
+static void hfsc_adjust_levels(struct hfsc_class *cl)
+{
+	struct hfsc_class *child;
+	unsigned int depth;
+
+	do {
+		depth = 0;
+		list_for_each_entry(child, &cl->children, siblings)
+			if (depth <= child->level)
+				depth = child->level + 1;
+		cl->level = depth;
+		cl = cl->cl_parent;
+	} while (cl != NULL);
+}
+
+/* look classid up in the qdisc's class hash; NULL if there is no such class */
+static inline struct hfsc_class *
+hfsc_find_class(u32 classid, struct Qdisc *sch)
+{
+	struct hfsc_sched *priv = qdisc_priv(sch);
+	struct Qdisc_class_common *hit;
+
+	hit = qdisc_class_find(&priv->clhash, classid);
+	/* the hash holds the cl_common member embedded in each class */
+	return hit ? container_of(hit, struct hfsc_class, cl_common) : NULL;
+}
+
+/* install rsc as cl's real-time curve and restart deadline/eligible curves */
+static void
+hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc,
+		u64 cur_time)
+{
+	sc2isc(rsc, &cl->cl_rsc);
+	rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
+	cl->cl_eligible = cl->cl_deadline;
+	/* convex curve: the eligible curve drops the 1st segment */
+	if (cl->cl_rsc.sm2 >= cl->cl_rsc.sm1)
+		cl->cl_eligible.dx = cl->cl_eligible.dy = 0;
+	cl->cl_flags |= HFSC_RSC;
+}
+
+/* install fsc and restart the virtual curve at (cl_vt, cl_total) */
+static void hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc)
+{
+	cl->cl_flags |= HFSC_FSC;
+	sc2isc(fsc, &cl->cl_fsc);
+	rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+}
+
+/* install usc and restart the upperlimit curve at (cur_time, cl_total) */
+static void hfsc_change_usc(struct hfsc_class *cl,
+			    struct tc_service_curve *usc, u64 cur_time)
+{
+	cl->cl_flags |= HFSC_USC;
+	sc2isc(usc, &cl->cl_usc);
+	rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total);
+}
+
+static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = {
+	[TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) },
+	[TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) },
+	[TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) },
+};
+
+static int	/* 0 on success, negative errno on failure */
+hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+		  struct nlattr **tca, unsigned long *arg)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *cl = (struct hfsc_class *)*arg;	/* non-NULL: modify this class */
+	struct hfsc_class *parent = NULL;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct nlattr *tb[TCA_HFSC_MAX + 1];
+	struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL;
+	u64 cur_time;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_HFSC_RSC]) {
+		rsc = nla_data(tb[TCA_HFSC_RSC]);
+		if (rsc->m1 == 0 && rsc->m2 == 0)
+			rsc = NULL;	/* both slopes zero: treated as not given */
+	}
+
+	if (tb[TCA_HFSC_FSC]) {
+		fsc = nla_data(tb[TCA_HFSC_FSC]);
+		if (fsc->m1 == 0 && fsc->m2 == 0)
+			fsc = NULL;	/* both slopes zero: treated as not given */
+	}
+
+	if (tb[TCA_HFSC_USC]) {
+		usc = nla_data(tb[TCA_HFSC_USC]);
+		if (usc->m1 == 0 && usc->m2 == 0)
+			usc = NULL;	/* both slopes zero: treated as not given */
+	}
+
+	if (cl != NULL) {
+		if (parentid) {	/* a given parentid must match the current parent */
+			if (cl->cl_parent &&
+			    cl->cl_parent->cl_common.classid != parentid)
+				return -EINVAL;
+			if (cl->cl_parent == NULL && parentid != TC_H_ROOT)
+				return -EINVAL;
+		}
+		cur_time = psched_get_time();
+
+		if (tca[TCA_RATE]) {
+			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+					      qdisc_root_sleeping_lock(sch),
+					      tca[TCA_RATE]);
+			if (err)
+				return err;
+		}
+
+		sch_tree_lock(sch);
+		if (rsc != NULL)
+			hfsc_change_rsc(cl, rsc, cur_time);
+		if (fsc != NULL)
+			hfsc_change_fsc(cl, fsc);
+		if (usc != NULL)
+			hfsc_change_usc(cl, usc, cur_time);
+
+		if (cl->qdisc->q.qlen != 0) {	/* packets queued: refresh scheduling state */
+			if (cl->cl_flags & HFSC_RSC)
+				update_ed(cl, qdisc_peek_len(cl->qdisc));
+			if (cl->cl_flags & HFSC_FSC)
+				update_vf(cl, 0, cur_time);
+		}
+		sch_tree_unlock(sch);
+
+		return 0;
+	}
+
+	if (parentid == TC_H_ROOT)
+		return -EEXIST;
+
+	parent = &q->root;	/* used when parentid is 0 */
+	if (parentid) {
+		parent = hfsc_find_class(parentid, sch);
+		if (parent == NULL)
+			return -ENOENT;
+	}
+
+	if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
+		return -EINVAL;
+	if (hfsc_find_class(classid, sch))
+		return -EEXIST;
+
+	if (rsc == NULL && fsc == NULL)	/* a new class needs rsc or fsc */
+		return -EINVAL;
+
+	cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL);
+	if (cl == NULL)
+		return -ENOBUFS;
+
+	if (tca[TCA_RATE]) {
+		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+					qdisc_root_sleeping_lock(sch),
+					tca[TCA_RATE]);
+		if (err) {
+			kfree(cl);
+			return err;
+		}
+	}
+
+	if (rsc != NULL)
+		hfsc_change_rsc(cl, rsc, 0);
+	if (fsc != NULL)
+		hfsc_change_fsc(cl, fsc);
+	if (usc != NULL)
+		hfsc_change_usc(cl, usc, 0);
+
+	cl->cl_common.classid = classid;
+	cl->refcnt    = 1;
+	cl->sched     = q;
+	cl->cl_parent = parent;
+	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+				      &pfifo_qdisc_ops, classid);
+	if (cl->qdisc == NULL)
+		cl->qdisc = &noop_qdisc;	/* pfifo creation failed */
+	INIT_LIST_HEAD(&cl->children);
+	cl->vt_tree = RB_ROOT;
+	cl->cf_tree = RB_ROOT;
+
+	sch_tree_lock(sch);
+	qdisc_class_hash_insert(&q->clhash, &cl->cl_common);
+	list_add_tail(&cl->siblings, &parent->children);
+	if (parent->level == 0)	/* parent was at leaf level until now */
+		hfsc_purge_queue(sch, parent);
+	hfsc_adjust_levels(parent);
+	cl->cl_pcvtoff = parent->cl_cvtoff;
+	sch_tree_unlock(sch);
+
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	*arg = (unsigned long)cl;	/* hand the new class back to the caller */
+	return 0;
+}
+
+/* release cl's filters, qdisc and estimator; kfree() it unless it is the root */
+static void hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
+{
+	struct hfsc_sched *priv = qdisc_priv(sch);
+
+	tcf_destroy_chain(&cl->filter_list);
+	qdisc_destroy(cl->qdisc);
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	if (&priv->root != cl)
+		kfree(cl);
+}
+
+/*
+ * unlink a class; -EBUSY for the root, inner classes and bound filters
+ */
+static int hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+	struct hfsc_sched *priv = qdisc_priv(sch);
+
+	if (cl == &priv->root || cl->level > 0 || cl->filter_cnt > 0)
+		return -EBUSY;
+
+	sch_tree_lock(sch);
+	list_del(&cl->siblings);
+	hfsc_adjust_levels(cl->cl_parent);
+	hfsc_purge_queue(sch, cl);
+	qdisc_class_hash_remove(&priv->clhash, &cl->cl_common);
+
+	/*
+	 * reaching zero here shouldn't happen: we "hold" one cops->get() when
+	 * called from tc_ctl_tclass; destroy is done from cops->put().
+	 */
+	cl->refcnt--;
+	BUG_ON(cl->refcnt == 0);
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static struct hfsc_class *	/* leaf class for skb, or NULL (see *qerr) */
+hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *head, *cl;
+	struct tcf_result res;
+	struct tcf_proto *tcf;
+	int result;
+
+	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 &&
+	    (cl = hfsc_find_class(skb->priority, sch)) != NULL)
+		if (cl->level == 0)
+			return cl;	/* skb->priority names a leaf class directly */
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	head = &q->root;
+	tcf = q->root.filter_list;
+	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:	/* QUEUED/STOLEN fall through to here */
+			return NULL;
+		}
+#endif
+		cl = (struct hfsc_class *)res.class;
+		if (!cl) {
+			cl = hfsc_find_class(res.classid, sch);
+			if (!cl)
+				break; /* filter selected invalid classid */
+			if (cl->level >= head->level)
+				break; /* filter may only point downwards */
+		}
+
+		if (cl->level == 0)
+			return cl; /* hit leaf class */
+
+		/* apply inner filter chain */
+		tcf = cl->filter_list;
+		head = cl;
+	}
+
+	/* classification failed, try default class */
+	cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+	if (cl == NULL || cl->level > 0)	/* missing, or not a leaf */
+		return NULL;
+
+	return cl;
+}
+
+/* swap cl's leaf qdisc for new (default pfifo if NULL); old one goes to *old */
+static int
+hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		 struct Qdisc **old)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+	if (cl->level > 0)
+		return -EINVAL;
+	if (!new)
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					cl->cl_common.classid);
+	if (!new)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	hfsc_purge_queue(sch, cl);
+	*old = cl->qdisc;
+	cl->qdisc = new;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/*
+ * return the qdisc of class arg if its level is 0, otherwise NULL
+ */
+static struct Qdisc *
+hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+	return cl->level ? NULL : cl->qdisc;
+}
+
+/* if the class's qdisc is empty: update_vf(cl, 0, 0), then set_passive() */
+static void hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+	if (cl->qdisc->q.qlen != 0)
+		return;
+	update_vf(cl, 0, 0);
+	set_passive(cl);
+}
+
+/* look up classid and bump its refcnt; returns 0 if there is no such class */
+static unsigned long hfsc_get_class(struct Qdisc *sch, u32 classid)
+{
+	struct hfsc_class *found = hfsc_find_class(classid, sch);
+
+	if (found == NULL)
+		return 0;
+	found->refcnt++;
+	return (unsigned long)found;
+}
+
+/* Drop one reference on the class; destroy it when the count hits zero. */
+static void
+hfsc_put_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+
+	cl->refcnt--;
+	if (cl->refcnt == 0)
+		hfsc_destroy_class(sch, cl);
+}
+
+/*
+ * Bind a filter owned by class @parent (may be 0) to the class @classid.
+ * Returns the target class (cast to unsigned long) with its filter_cnt
+ * incremented, the NULL cast if @classid does not exist, or 0 when the
+ * target is not at a strictly lower level than @parent.
+ */
+static unsigned long
+hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+	struct hfsc_class *owner = (struct hfsc_class *)parent;
+	struct hfsc_class *target = hfsc_find_class(classid, sch);
+
+	if (target == NULL)
+		return (unsigned long)target;
+
+	/* refuse targets at the owner's level or above */
+	if (owner != NULL && owner->level <= target->level)
+		return 0;
+
+	target->filter_cnt++;
+	return (unsigned long)target;
+}
+
+/* Undo hfsc_bind_tcf(): one filter less is bound to the class. */
+static void
+hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	((struct hfsc_class *)arg)->filter_cnt--;
+}
+
+/*
+ * Return the head of the filter chain of the class @arg, or of the
+ * built-in root class when @arg is 0.
+ */
+static struct tcf_proto **
+hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *owner;
+
+	owner = arg ? (struct hfsc_class *)arg : &q->root;
+	return &owner->filter_list;
+}
+
+/*
+ * Emit one service curve as netlink attribute @attr.  The internal
+ * representation is converted with sm2m()/dx2d() into a
+ * struct tc_service_curve first.
+ * Returns skb->len on success, -1 when the attribute could not be added.
+ */
+static int
+hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
+{
+	struct tc_service_curve curve;
+
+	curve.m1 = sm2m(sc->sm1);
+	curve.d  = dx2d(sc->dx);
+	curve.m2 = sm2m(sc->sm2);
+
+	/* NLA_PUT bails out to the nla_put_failure label below on error */
+	NLA_PUT(skb, attr, sizeof(curve), &curve);
+	return skb->len;
+
+ nla_put_failure:
+	return -1;
+}
+
+/*
+ * Dump every service curve the class has (as flagged in cl_flags): RSC,
+ * FSC and USC, in that order.  Stops at the first failure.
+ * Returns skb->len on success, -1 on failure.
+ */
+static int
+hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
+{
+	if ((cl->cl_flags & HFSC_RSC) &&
+	    hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)
+		return -1;
+
+	if ((cl->cl_flags & HFSC_FSC) &&
+	    hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)
+		return -1;
+
+	if ((cl->cl_flags & HFSC_USC) &&
+	    hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)
+		return -1;
+
+	return skb->len;
+}
+
+/*
+ * Fill @tcm and a nested TCA_OPTIONS attribute describing one class.
+ * tcm_parent is the parent's classid (TC_H_ROOT when there is no parent);
+ * tcm_info carries the child qdisc handle for leaf classes only.
+ * Returns skb->len on success, -EMSGSIZE (after cancelling the nest) when
+ * the nest could not be started or a curve could not be dumped.
+ */
+static int
+hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb,
+		struct tcmsg *tcm)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+	struct nlattr *nest;
+
+	if (cl->cl_parent)
+		tcm->tcm_parent = cl->cl_parent->cl_common.classid;
+	else
+		tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle = cl->cl_common.classid;
+	if (cl->level == 0)
+		tcm->tcm_info = cl->qdisc->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL || hfsc_dump_curves(skb, cl) < 0) {
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return skb->len;
+}
+
+/*
+ * Copy the class statistics into the dump @d: basic, rate estimator and
+ * queue stats, followed by the HFSC specific tc_hfsc_stats block.
+ * qstats.qlen is refreshed from the child qdisc before copying.
+ * Returns -1 as soon as one of the generic copies fails, otherwise the
+ * result of gnet_stats_copy_app().
+ */
+static int
+hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+	struct gnet_dump *d)
+{
+	struct hfsc_class *cl = (struct hfsc_class *)arg;
+	struct tc_hfsc_stats xstats;
+
+	cl->qstats.qlen = cl->qdisc->q.qlen;
+
+	xstats.level   = cl->level;
+	xstats.period  = cl->cl_vtperiod;
+	xstats.work    = cl->cl_total;
+	xstats.rtwork  = cl->cl_cumul;
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0)
+		return -1;
+	if (gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0)
+		return -1;
+	if (gnet_stats_copy_queue(d, &cl->qstats) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+
+
+/*
+ * Call arg->fn for every class in the class hash, skipping the first
+ * arg->skip entries.  arg->count is advanced for each class passed over
+ * or successfully visited; a negative return from arg->fn sets arg->stop
+ * and ends the walk without counting that class.
+ */
+static void
+hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hlist_node *pos;
+	struct hfsc_class *cl;
+	unsigned int bucket;
+
+	if (arg->stop)
+		return;
+
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[bucket],
+				     cl_common.hnode) {
+			if (arg->count >= arg->skip &&
+			    arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/*
+ * Arm the qdisc watchdog for the earlier of: cl_e of the class returned
+ * by eltree_get_minel(), and the root's cl_cfmin.  A value of 0 means
+ * "not available" for either source; having neither triggers a WARN.
+ */
+static void
+hfsc_schedule_watchdog(struct Qdisc *sch)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *first = eltree_get_minel(q);
+	u64 wake = first ? first->cl_e : 0;
+
+	if (q->root.cl_cfmin != 0 &&
+	    (wake == 0 || wake > q->root.cl_cfmin))
+		wake = q->root.cl_cfmin;
+
+	WARN_ON(wake == 0);
+	qdisc_watchdog_schedule(&q->watchdog, wake);
+}
+
+/*
+ * Initialise a freshly allocated HFSC qdisc from the tc_hfsc_qopt in
+ * @opt: record the default class, set up the class hash, the eligible
+ * tree and the drop list, initialise the embedded root class (with a
+ * pfifo child, or noop_qdisc if that cannot be created), insert it into
+ * the hash and prepare the watchdog.
+ * Returns 0, -EINVAL for a missing/short option, or the error from
+ * qdisc_class_hash_init().
+ */
+static int
+hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *root = &q->root;
+	struct tc_hfsc_qopt *qopt;
+	int err;
+
+	if (!opt || nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+	qopt = nla_data(opt);
+	q->defcls = qopt->defcls;
+
+	err = qdisc_class_hash_init(&q->clhash);
+	if (err < 0)
+		return err;
+
+	q->eligible = RB_ROOT;
+	INIT_LIST_HEAD(&q->droplist);
+
+	/* the root class lives inside the qdisc private area */
+	root->cl_common.classid = sch->handle;
+	root->refcnt  = 1;
+	root->sched   = q;
+	root->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					sch->handle);
+	if (!root->qdisc)
+		root->qdisc = &noop_qdisc;
+	INIT_LIST_HEAD(&root->children);
+	root->vt_tree = RB_ROOT;
+	root->cf_tree = RB_ROOT;
+
+	qdisc_class_hash_insert(&q->clhash, &root->cl_common);
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	return 0;
+}
+
+/*
+ * Change qdisc-level options: only the default class can be updated.
+ * Returns 0, or -EINVAL when @opt is missing or too short.
+ */
+static int
+hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct tc_hfsc_qopt *qopt;
+
+	if (!opt || nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+
+	/* defcls is written under the tree lock */
+	sch_tree_lock(sch);
+	q->defcls = qopt->defcls;
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+/*
+ * Return a class to its initial scheduling state: zero every counter and
+ * time field, empty both per-class rb-trees, reset the child qdisc and
+ * re-initialise the runtime curve of each service curve the class has
+ * (as flagged in cl_flags).
+ */
+static void
+hfsc_reset_class(struct hfsc_class *cl)
+{
+	/* counters reported as xstats work / rtwork */
+	cl->cl_total        = 0;
+	cl->cl_cumul        = 0;
+
+	cl->cl_d            = 0;
+	cl->cl_e            = 0;
+
+	cl->cl_vt           = 0;
+	cl->cl_vtadj        = 0;
+	cl->cl_vtoff        = 0;
+	cl->cl_cvtmin       = 0;
+	cl->cl_cvtmax       = 0;
+	cl->cl_cvtoff       = 0;
+	cl->cl_pcvtoff      = 0;
+	cl->cl_vtperiod     = 0;
+	cl->cl_parentperiod = 0;
+
+	cl->cl_f            = 0;
+	cl->cl_myf          = 0;
+	cl->cl_myfadj       = 0;
+	cl->cl_cfmin        = 0;
+
+	cl->cl_nactive      = 0;
+
+	cl->vt_tree = RB_ROOT;
+	cl->cf_tree = RB_ROOT;
+	qdisc_reset(cl->qdisc);
+
+	if (cl->cl_flags & HFSC_RSC)
+		rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0);
+	if (cl->cl_flags & HFSC_FSC)
+		rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0);
+	if (cl->cl_flags & HFSC_USC)
+		rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0);
+}
+
+/*
+ * Reset the whole qdisc: reset every class in the hash, empty the
+ * eligible tree and the drop list, cancel the watchdog and zero the
+ * queue length.
+ */
+static void
+hfsc_reset_qdisc(struct Qdisc *sch)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hlist_node *pos;
+	struct hfsc_class *cl;
+	unsigned int bucket;
+
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++)
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[bucket],
+				     cl_common.hnode)
+			hfsc_reset_class(cl);
+
+	q->eligible = RB_ROOT;
+	INIT_LIST_HEAD(&q->droplist);
+	qdisc_watchdog_cancel(&q->watchdog);
+	sch->q.qlen = 0;
+}
+
+/*
+ * Tear the qdisc down in two passes over the class hash: first destroy
+ * the filter chain of every class, then destroy the classes themselves
+ * (using the _safe iterator for the second pass).  Finally release the
+ * hash and cancel the watchdog.
+ */
+static void
+hfsc_destroy_qdisc(struct Qdisc *sch)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hlist_node *pos, *tmp;
+	struct hfsc_class *cl;
+	unsigned int bucket;
+
+	/* pass 1: filter chains of all classes */
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++)
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[bucket],
+				     cl_common.hnode)
+			tcf_destroy_chain(&cl->filter_list);
+
+	/* pass 2: the classes */
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++)
+		hlist_for_each_entry_safe(cl, pos, tmp,
+					  &q->clhash.hash[bucket],
+					  cl_common.hnode)
+			hfsc_destroy_class(sch, cl);
+
+	qdisc_class_hash_destroy(&q->clhash);
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+/*
+ * Dump the qdisc options (the default class) as TCA_OPTIONS.
+ * Returns skb->len on success; on failure the skb is trimmed back to
+ * where it was on entry and -1 is returned.
+ */
+static int
+hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	unsigned char *mark = skb_tail_pointer(skb);
+	struct tc_hfsc_qopt qopt;
+
+	qopt.defcls = q->defcls;
+
+	/* NLA_PUT bails out to the nla_put_failure label below on error */
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
+	return skb->len;
+
+ nla_put_failure:
+	nlmsg_trim(skb, mark);
+	return -1;
+}
+
+/*
+ * Enqueue @skb on the leaf class chosen by hfsc_classify().
+ *
+ * Without a class the skb is freed and the classifier verdict returned
+ * (counted as a drop only when __NET_XMIT_BYPASS is set).  A failing
+ * child enqueue is passed straight back to the caller.  On success the
+ * class is activated when this is its first queued packet.
+ */
+static int
+hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct hfsc_class *cl;
+	int uninitialized_var(ret);
+
+	cl = hfsc_classify(skb, sch, &ret);
+	if (!cl) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+	ret = qdisc_enqueue(skb, cl->qdisc);
+	if (unlikely(ret != NET_XMIT_SUCCESS)) {
+		if (net_xmit_drop_count(ret)) {
+			cl->qstats.drops++;
+			sch->qstats.drops++;
+		}
+		return ret;
+	}
+
+	/* the queue went from empty to one packet */
+	if (cl->qdisc->q.qlen == 1)
+		set_active(cl, qdisc_pkt_len(skb));
+
+	bstats_update(&cl->bstats, skb);
+	sch->q.qlen++;
+
+	return NET_XMIT_SUCCESS;
+}
+
+/*
+ * Pick the next packet to transmit.
+ *
+ * A class from the eligible tree (real-time criterion, minimum deadline)
+ * is preferred; when there is none the link-sharing criterion (minimum
+ * vt in the hierarchy) is used.  If neither yields a class the qdisc
+ * counts an overlimit, arms the watchdog and returns NULL.
+ */
+static struct sk_buff *
+hfsc_dequeue(struct Qdisc *sch)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *cl;
+	struct sk_buff *skb;
+	unsigned int next_len;
+	u64 now;
+	int realtime;
+
+	if (sch->q.qlen == 0)
+		return NULL;
+
+	now = psched_get_time();
+
+	/* real-time criterion first: min deadline among eligible classes */
+	cl = eltree_get_mindl(q, now);
+	realtime = (cl != NULL);
+	if (!realtime) {
+		/* link-sharing criterion: min vt in the hierarchy */
+		cl = vttree_get_minvt(&q->root, now);
+		if (cl == NULL) {
+			sch->qstats.overlimits++;
+			hfsc_schedule_watchdog(sch);
+			return NULL;
+		}
+	}
+
+	skb = qdisc_dequeue_peeked(cl->qdisc);
+	if (skb == NULL) {
+		qdisc_warn_nonwc("HFSC", cl->qdisc);
+		return NULL;
+	}
+
+	update_vf(cl, qdisc_pkt_len(skb), now);
+	if (realtime)
+		cl->cl_cumul += qdisc_pkt_len(skb);
+
+	if (cl->qdisc->q.qlen == 0) {
+		/* the class becomes passive */
+		set_passive(cl);
+	} else if (cl->cl_flags & HFSC_RSC) {
+		/* update ed for the packet now at the head */
+		next_len = qdisc_peek_len(cl->qdisc);
+		if (realtime)
+			update_ed(cl, next_len);
+		else
+			update_d(cl, next_len);
+	}
+
+	qdisc_unthrottled(sch);
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+
+	return skb;
+}
+
+/*
+ * Drop one packet: walk the drop list and ask each class's child qdisc
+ * to drop until one of them does.  A class left empty becomes passive,
+ * otherwise it is moved to the tail of the drop list.
+ * Returns the value reported by the child's ->drop(), or 0 when nothing
+ * could be dropped.
+ */
+static unsigned int
+hfsc_drop(struct Qdisc *sch)
+{
+	struct hfsc_sched *q = qdisc_priv(sch);
+	struct hfsc_class *cl;
+	unsigned int len;
+
+	list_for_each_entry(cl, &q->droplist, dlist) {
+		if (cl->qdisc->ops->drop == NULL)
+			continue;
+
+		len = cl->qdisc->ops->drop(cl->qdisc);
+		if (len == 0)
+			continue;
+
+		if (cl->qdisc->q.qlen != 0) {
+			list_move_tail(&cl->dlist, &q->droplist);
+		} else {
+			update_vf(cl, 0, 0);
+			set_passive(cl);
+		}
+		cl->qstats.drops++;
+		sch->qstats.drops++;
+		sch->q.qlen--;
+		return len;
+	}
+
+	return 0;
+}
+
+static const struct Qdisc_class_ops hfsc_class_ops = {	/* per-class callbacks of the hfsc qdisc */
+	.change		= hfsc_change_class,
+	.delete		= hfsc_delete_class,
+	.graft		= hfsc_graft_class,	/* swap a leaf's child qdisc */
+	.leaf		= hfsc_class_leaf,	/* child qdisc of a leaf, else NULL */
+	.qlen_notify	= hfsc_qlen_notify,	/* passivate class once its queue is empty */
+	.get		= hfsc_get_class,	/* lookup + refcnt++ */
+	.put		= hfsc_put_class,	/* refcnt--, destroy at zero */
+	.bind_tcf	= hfsc_bind_tcf,	/* filter_cnt++ on the target class */
+	.unbind_tcf	= hfsc_unbind_tcf,	/* filter_cnt-- */
+	.tcf_chain	= hfsc_tcf_chain,	/* filter list of a class (root if none given) */
+	.dump		= hfsc_dump_class,
+	.dump_stats	= hfsc_dump_class_stats,
+	.walk		= hfsc_walk		/* iterate over the class hash */
+};
+
+static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {	/* qdisc-level callbacks, registered in hfsc_init() */
+	.id		= "hfsc",
+	.init		= hfsc_init_qdisc,
+	.change		= hfsc_change_qdisc,	/* only updates the default class */
+	.reset		= hfsc_reset_qdisc,
+	.destroy	= hfsc_destroy_qdisc,
+	.dump		= hfsc_dump_qdisc,
+	.enqueue	= hfsc_enqueue,
+	.dequeue	= hfsc_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.drop		= hfsc_drop,
+	.cl_ops		= &hfsc_class_ops,
+	.priv_size	= sizeof(struct hfsc_sched),	/* private area holds struct hfsc_sched */
+	.owner		= THIS_MODULE
+};
+
+/* Module entry point: register the "hfsc" qdisc operations. */
+static int __init hfsc_init(void)
+{
+	return register_qdisc(&hfsc_qdisc_ops);
+}
+
+/* Module exit point: unregister the "hfsc" qdisc operations. */
+static void __exit hfsc_cleanup(void)
+{
+	unregister_qdisc(&hfsc_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(hfsc_init);		/* registers hfsc_qdisc_ops */
+module_exit(hfsc_cleanup);	/* unregisters hfsc_qdisc_ops */
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
new file mode 100644
index 00000000..29b942ce
--- /dev/null
+++ b/net/sched/sch_htb.c
@@ -0,0 +1,1587 @@
+/*
+ * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Martin Devera, <devik@cdi.cz>
+ *
+ * Credits (in time order) for older HTB versions:
+ *              Stef Coene <stef.coene@docum.org>
+ *			HTB support at LARTC mailing list
+ *		Ondrej Kraus, <krauso@barr.cz>
+ *			found missing INIT_QDISC(htb)
+ *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
+ *			helped a lot to locate nasty class stall bug
+ *		Andi Kleen, Jamal Hadi, Bert Hubert
+ *			code review and helpful comments on shaping
+ *		Tomasz Wrona, <tw@eter.tym.pl>
+ *			created test case so that I was able to fix nasty bug
+ *		Wilfried Weissmann
+ *			spotted bug in dequeue code and helped with fix
+ *		Jiri Fojtasek
+ *			fixed requeue routine
+ *		and many others. thanks.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+/* HTB algorithm.
+    Author: devik@cdi.cz
+    ========================================================================
+    HTB is like TBF with multiple classes. It is also similar to CBQ because
+    it allows assigning a priority to each class in the hierarchy.
+    In fact it is another implementation of Floyd's formal sharing.
+
+    Levels:
+    Each class is assigned a level. A leaf ALWAYS has level 0 and root
+    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
+    one less than their parent.
+*/
+
+static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup (see htb_lowater/htb_hiwater) */
+#define HTB_VER 0x30011		/* major (upper 16 bits) must match the version number supplied by TC */
+
+#if HTB_VER >> 16 != TC_HTB_PROTOVER
+#error "Mismatched sch_htb.c and pkt_sch.h"
+#endif
+
+/* Module parameter and sysfs export */
+module_param    (htb_hysteresis, int, 0640);
+MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
+
+/* used internally to keep the status of a single class */
+enum htb_cmode {
+	HTB_CANT_SEND,		/* class can't send and can't borrow */
+	HTB_MAY_BORROW,		/* class can't send but may borrow */
+	HTB_CAN_SEND		/* class can send */
+};
+
+/* interior & leaf nodes; props specific to leaves are marked L: */
+struct htb_class {
+	struct Qdisc_class_common common;	/* classid + hash linkage (see htb_find) */
+	/* general class parameters */
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue qstats;
+	struct gnet_stats_rate_est rate_est;
+	struct tc_htb_xstats xstats;	/* our special stats (lends/borrows, see htb_charge_class) */
+	int refcnt;		/* usage count of this class */
+
+	/* topology */
+	int level;		/* our level (see above); 0 for leaves */
+	unsigned int children;
+	struct htb_class *parent;	/* parent class */
+
+	int prio;		/* these two are used only by leaves... */
+	int quantum;		/* but stored for parent-to-leaf return */
+
+	union {
+		struct htb_class_leaf {
+			struct Qdisc *q;	/* child qdisc packets are enqueued to */
+			int deficit[TC_HTB_MAXDEPTH];
+			struct list_head drop_list;	/* linkage in htb_sched.drops[prio] while active */
+		} leaf;
+		struct htb_class_inner {
+			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
+			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
+			/* When a class changes from state 1->2 and disconnects from
+			 * the parent's feed then we lose the ptr value and start from
+			 * the first child again. Here we store the classid of the
+			 * last valid ptr (used when ptr is NULL).
+			 */
+			u32 last_ptr_id[TC_HTB_NUMPRIO];
+		} inner;
+	} un;
+	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
+	struct rb_node pq_node;	/* node for event queue */
+	psched_time_t pq_key;	/* event time; key of pq_node in the wait queue */
+
+	int prio_activity;	/* for which prios are we active (bit mask) */
+	enum htb_cmode cmode;	/* current mode of the class */
+
+	/* class attached filters */
+	struct tcf_proto *filter_list;
+	int filter_cnt;
+
+	/* token bucket parameters */
+	struct qdisc_rate_table *rate;	/* rate table of the class itself */
+	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
+	long buffer, cbuffer;	/* upper bound for tokens / ctokens */
+	psched_tdiff_t mbuffer;	/* max wait time; also bounds how negative tokens may go */
+	long tokens, ctokens;	/* current number of tokens */
+	psched_time_t t_c;	/* checkpoint time */
+};
+
+struct htb_sched {	/* private data of one HTB qdisc instance (obtained via qdisc_priv) */
+	struct Qdisc_class_hash clhash;	/* all classes, keyed by classid */
+	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
+
+	/* self list - roots of self generating tree */
+	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+	int row_mask[TC_HTB_MAXDEPTH];	/* per level: bit set for each prio whose row is non-empty */
+	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+
+	/* self wait list - roots of wait PQs per row */
+	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
+
+	/* time of nearest event per level (row) */
+	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
+
+	int defcls;		/* class where unclassified flows go to */
+
+	/* filters for qdisc itself */
+	struct tcf_proto *filter_list;
+
+	int rate2quantum;	/* quant = rate / rate2quantum */
+	psched_time_t now;	/* cached dequeue time */
+	struct qdisc_watchdog watchdog;
+
+	/* non shaped skbs; let them go directly thru */
+	struct sk_buff_head direct_queue;
+	int direct_qlen;	/* max qlen of above */
+
+	long direct_pkts;	/* number of skbs put on direct_queue */
+
+#define HTB_WARN_TOOMANYEVENTS	0x1
+	unsigned int warned;	/* only one warning */
+	struct work_struct work;
+};
+
+/* look up a class by handle in the qdisc's class hash; NULL if not found */
+static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct Qdisc_class_common *found = qdisc_class_find(&q->clhash, handle);
+
+	return found ? container_of(found, struct htb_class, common) : NULL;
+}
+
+/**
+ * htb_classify - classify a packet into class
+ *
+ * It returns NULL if the packet should be dropped or HTB_DIRECT (-1) if the
+ * packet should be passed directly thru. In all other cases a leaf class is
+ * returned.
+ * We allow direct class selection by classid in skb->priority. Then we
+ * examine filters in the qdisc and in inner nodes (if a higher filter points
+ * to an inner node). If we end up with classid MAJOR:0 we enqueue the skb
+ * into the special internal fifo (direct). These packets then go directly
+ * thru. If we still have no valid leaf we try to use the MAJOR:default leaf.
+ * If that is still unsuccessful we finish and return the direct queue.
+ *
+ * *qerr is only written once the filter chain is consulted; the early
+ * returns taken on a skb->priority match leave it untouched.
+ */
+#define HTB_DIRECT ((struct htb_class *)-1L)
+
+static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct htb_class *cl;
+	struct tcf_result res;
+	struct tcf_proto *tcf;
+	int result;
+
+	/* allow selecting a class by setting skb->priority to a valid classid;
+	 * note that nfmark can be used too by attaching filter fw with no
+	 * rules in it
+	 */
+	if (skb->priority == sch->handle)
+		return HTB_DIRECT;	/* X:0 (direct flow) selected */
+	cl = htb_find(skb->priority, sch);
+	if (cl && cl->level == 0)
+		return cl;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	tcf = q->filter_list;
+	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;	/* fall through */
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#endif
+		cl = (void *)res.class;
+		if (!cl) {
+			if (res.classid == sch->handle)
+				return HTB_DIRECT;	/* X:0 (direct flow) */
+			cl = htb_find(res.classid, sch);
+			if (!cl)
+				break;	/* filter selected invalid classid */
+		}
+		if (!cl->level)
+			return cl;	/* we hit leaf; return it */
+
+		/* we have got inner class; apply inner filter chain */
+		tcf = cl->filter_list;
+	}
+	/* classification failed; try to use default class */
+	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
+	if (!cl || cl->level)
+		return HTB_DIRECT;	/* bad default .. this is safe bet */
+	return cl;
+}
+
+/**
+ * htb_add_to_id_tree - adds class to the round robin list
+ *
+ * Inserts cl->node[prio] into the rb-tree @root, which is kept sorted by
+ * classid (a class with an equal or smaller classid goes to the left).
+ * The caller must make sure the class is not already on such a tree for
+ * the given prio.
+ */
+static void htb_add_to_id_tree(struct rb_root *root,
+			       struct htb_class *cl, int prio)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	/* descend to the empty slot where the new node belongs */
+	while (*link) {
+		struct htb_class *other;
+
+		parent = *link;
+		other = rb_entry(parent, struct htb_class, node[prio]);
+		link = (cl->common.classid > other->common.classid) ?
+			&parent->rb_right : &parent->rb_left;
+	}
+
+	rb_link_node(&cl->node[prio], parent, link);
+	rb_insert_color(&cl->node[prio], root);
+}
+
+/**
+ * htb_add_to_wait_tree - adds class to the event queue with delay
+ *
+ * The class is added to the priority event queue of its level to indicate
+ * that it will change its mode at time cl->pq_key, which is set here to
+ * q->now + delay (bumped by one if that would equal q->now). The nearest
+ * event cache of the level is lowered when the new key is earlier.
+ * Make sure that the class is not already in the queue.
+ */
+static void htb_add_to_wait_tree(struct htb_sched *q,
+				 struct htb_class *cl, long delay)
+{
+	struct rb_root *pq = &q->wait_pq[cl->level];
+	struct rb_node **link = &pq->rb_node;
+	struct rb_node *parent = NULL;
+
+	cl->pq_key = q->now + delay;
+	if (cl->pq_key == q->now)
+		cl->pq_key++;
+
+	/* update the nearest event cache */
+	if (q->near_ev_cache[cl->level] > cl->pq_key)
+		q->near_ev_cache[cl->level] = cl->pq_key;
+
+	/* equal keys go to the right of existing entries */
+	while (*link) {
+		struct htb_class *other;
+
+		parent = *link;
+		other = rb_entry(parent, struct htb_class, pq_node);
+		link = (cl->pq_key >= other->pq_key) ?
+			&parent->rb_right : &parent->rb_left;
+	}
+
+	rb_link_node(&cl->pq_node, parent, link);
+	rb_insert_color(&cl->pq_node, pq);
+}
+
+/**
+ * htb_next_rb_node - finds next node in binary tree
+ *
+ * Replaces *n with its in-order successor as returned by rb_next().
+ * When we are past the last key *n becomes NULL.
+ * Average complexity is 2 steps per call.
+ */
+static inline void htb_next_rb_node(struct rb_node **n)
+{
+	struct rb_node *succ = rb_next(*n);
+
+	*n = succ;
+}
+
+/**
+ * htb_add_class_to_row - add class to its row
+ *
+ * The class is added to the row of its level at every priority whose bit
+ * is set in mask, and those bits are recorded in the level's row_mask.
+ * It does nothing if mask == 0.
+ */
+static inline void htb_add_class_to_row(struct htb_sched *q,
+					struct htb_class *cl, int mask)
+{
+	int prio;
+
+	q->row_mask[cl->level] |= mask;
+
+	/* peel the set bits off one at a time, lowest first */
+	for (; mask; mask &= ~(1 << prio)) {
+		prio = ffz(~mask);
+		htb_add_to_id_tree(&q->row[cl->level][prio], cl, prio);
+	}
+}
+
+/* Erase rb from root and mark it empty. A node that is already marked
+ * empty only triggers a warning: that would be a bug in this code, but
+ * it need not be fatal.
+ */
+static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
+{
+	if (RB_EMPTY_NODE(rb)) {
+		WARN_ON(1);
+		return;
+	}
+
+	rb_erase(rb, root);
+	RB_CLEAR_NODE(rb);
+}
+
+
+/**
+ * htb_remove_class_from_row - removes class from its row
+ *
+ * The class is removed from the row of its level at every priority whose
+ * bit is set in mask. Priorities whose row became empty are cleared from
+ * the level's row_mask. It does nothing if mask == 0.
+ */
+static inline void htb_remove_class_from_row(struct htb_sched *q,
+						 struct htb_class *cl, int mask)
+{
+	int emptied = 0;
+
+	while (mask) {
+		int prio = ffz(~mask);
+		struct rb_node *self = &cl->node[prio];
+		struct rb_root *row = &q->row[cl->level][prio];
+		struct rb_node **cursor = &q->ptr[cl->level][prio];
+
+		mask &= ~(1 << prio);
+
+		/* move the row pointer off the node that is going away */
+		if (*cursor == self)
+			htb_next_rb_node(cursor);
+
+		htb_safe_rb_erase(self, row);
+		if (!row->rb_node)
+			emptied |= 1 << prio;
+	}
+	q->row_mask[cl->level] &= ~emptied;
+}
+
+/**
+ * htb_activate_prios - creates active class's feed chain
+ *
+ * The class is connected to ancestors and/or appropriate rows
+ * for the priorities it is participating in. cl->cmode must be the new
+ * (activated) mode. It does nothing if cl->prio_activity == 0.
+ *
+ * While the current class is in HTB_MAY_BORROW mode it is linked into its
+ * parent's feed trees and the walk continues upwards with the priorities
+ * that were not yet active in the parent. The class at which the walk
+ * stops is added to its row if it is in HTB_CAN_SEND mode.
+ */
+static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
+{
+	struct htb_class *p = cl->parent;
+	long m, mask = cl->prio_activity;
+
+	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
+		m = mask;
+		while (m) {
+			int prio = ffz(~m);	/* lowest prio still set in m */
+			m &= ~(1 << prio);
+
+			if (p->un.inner.feed[prio].rb_node)
+				/* parent already has its feed in use so that
+				 * reset bit in mask as parent is already ok
+				 */
+				mask &= ~(1 << prio);
+
+			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
+		}
+		p->prio_activity |= mask;	/* prios that became active in the parent just now */
+		cl = p;
+		p = cl->parent;
+
+	}
+	if (cl->cmode == HTB_CAN_SEND && mask)
+		htb_add_class_to_row(q, cl, mask);
+}
+
+/**
+ * htb_deactivate_prios - remove class from feed chain
+ *
+ * cl->cmode must represent the old mode (before deactivation). It does
+ * nothing if cl->prio_activity == 0. The class is removed from all feed
+ * chains and rows.
+ *
+ * While the current class is in HTB_MAY_BORROW mode it is unlinked from
+ * its parent's feed trees; only the priorities whose feed tree became
+ * empty are propagated further up. The class at which the walk stops is
+ * removed from its row if it is in HTB_CAN_SEND mode.
+ */
+static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
+{
+	struct htb_class *p = cl->parent;
+	long m, mask = cl->prio_activity;
+
+	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
+		m = mask;
+		mask = 0;	/* rebuilt below from the feeds that end up empty */
+		while (m) {
+			int prio = ffz(~m);	/* lowest prio still set in m */
+			m &= ~(1 << prio);
+
+			if (p->un.inner.ptr[prio] == cl->node + prio) {
+				/* we are removing a child which is pointed to from
+				 * the parent feed - forget the pointer but remember
+				 * the classid
+				 */
+				p->un.inner.last_ptr_id[prio] = cl->common.classid;
+				p->un.inner.ptr[prio] = NULL;
+			}
+
+			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
+
+			if (!p->un.inner.feed[prio].rb_node)
+				mask |= 1 << prio;
+		}
+
+		p->prio_activity &= ~mask;
+		cl = p;
+		p = cl->parent;
+
+	}
+	if (cl->cmode == HTB_CAN_SEND && mask)
+		htb_remove_class_from_row(q, cl, mask);
+}
+
+/* Low threshold for ctokens used by htb_class_mode(): 0 unless hysteresis
+ * is enabled and the class is not currently in HTB_CANT_SEND, in which
+ * case it is -cl->cbuffer.
+ */
+static inline long htb_lowater(const struct htb_class *cl)
+{
+	if (!htb_hysteresis)
+		return 0;
+
+	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
+}
+/* High threshold for tokens used by htb_class_mode(): 0 unless hysteresis
+ * is enabled and the class is currently in HTB_CAN_SEND, in which case it
+ * is -cl->buffer.
+ */
+static inline long htb_hiwater(const struct htb_class *cl)
+{
+	if (!htb_hysteresis)
+		return 0;
+
+	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
+}
+
+
+/**
+ * htb_class_mode - computes and returns current class mode
+ *
+ * It computes cl's mode with *diff added to its token counters and
+ * returns it. If the mode is not HTB_CAN_SEND then *diff is overwritten
+ * with the negated token count that caused the verdict; callers pass
+ * that value on to htb_add_to_wait_tree() as the delay.
+ * Also it is worth noting that the class mode doesn't change simply
+ * at cl->{c,}tokens == 0 but there can rather be hysteresis of
+ * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
+ * mode transitions per time unit. The speed gain is about 1/6.
+ */
+static inline enum htb_cmode
+htb_class_mode(struct htb_class *cl, long *diff)
+{
+	long toks;
+
+	/* ceil bucket exhausted: neither sending nor borrowing is possible */
+	toks = cl->ctokens + *diff;
+	if (toks < htb_lowater(cl)) {
+		*diff = -toks;
+		return HTB_CANT_SEND;
+	}
+
+	toks = cl->tokens + *diff;
+	if (toks >= htb_hiwater(cl))
+		return HTB_CAN_SEND;
+
+	*diff = -toks;
+	return HTB_MAY_BORROW;
+}
+
+/**
+ * htb_change_class_mode - changes class's mode
+ *
+ * This should be the only way to change a class's mode under normal
+ * circumstances. The routine computes the new mode with htb_class_mode()
+ * (which may update *diff), and if it differs from the old one updates
+ * the feed lists linkage and stores the new mode. Putting the class on
+ * the wait event queue is left to the caller (see htb_add_to_wait_tree).
+ */
+static void
+htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
+{
+	enum htb_cmode new_mode = htb_class_mode(cl, diff);
+
+	if (new_mode == cl->cmode)
+		return;
+
+	/* not necessary: speed optimization for classes active on no prio */
+	if (!cl->prio_activity) {
+		cl->cmode = new_mode;
+		return;
+	}
+
+	if (cl->cmode != HTB_CANT_SEND)
+		htb_deactivate_prios(q, cl);
+	cl->cmode = new_mode;
+	if (new_mode != HTB_CANT_SEND)
+		htb_activate_prios(q, cl);
+}
+
+/**
+ * htb_activate - inserts leaf cl into appropriate active feeds
+ *
+ * The routine learns the (new) priority of the leaf and activates the
+ * feed chain for that prio. It can be called on an already active leaf
+ * safely. It also adds the leaf to the drop list of its prio.
+ */
+static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
+{
+	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
+
+	/* already active: nothing to do */
+	if (cl->prio_activity)
+		return;
+
+	cl->prio_activity = 1 << cl->prio;
+	htb_activate_prios(q, cl);
+	list_add_tail(&cl->un.leaf.drop_list, &q->drops[cl->prio]);
+}
+
+/**
+ * htb_deactivate - remove leaf cl from active feeds
+ *
+ * Make sure that the leaf is active. In other words it can't be called
+ * with a non-active leaf (that only triggers a warning here). It also
+ * removes the class from the drop list.
+ */
+static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
+{
+	WARN_ON(!cl->prio_activity);
+
+	/* unlink while prio_activity still holds the old mask */
+	htb_deactivate_prios(q, cl);
+	cl->prio_activity = 0;
+
+	list_del_init(&cl->un.leaf.drop_list);
+}
+
+/*
+ * Enqueue @skb according to the verdict of htb_classify():
+ *  - HTB_DIRECT: append to the internal direct queue, or drop the skb
+ *    with NET_XMIT_DROP once direct_qlen packets are queued;
+ *  - NULL (only handled with CONFIG_NET_CLS_ACT): free the skb and
+ *    return the classifier's code;
+ *  - a leaf: enqueue on its child qdisc and activate the leaf.
+ * Returns NET_XMIT_SUCCESS when the skb was queued.
+ */
+static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	int uninitialized_var(ret);
+	struct htb_sched *q = qdisc_priv(sch);
+	struct htb_class *cl = htb_classify(skb, sch, &ret);
+
+	if (cl == HTB_DIRECT) {
+		/* helper queue is full: drop */
+		if (q->direct_queue.qlen >= q->direct_qlen) {
+			kfree_skb(skb);
+			sch->qstats.drops++;
+			return NET_XMIT_DROP;
+		}
+		/* enqueue to helper queue */
+		__skb_queue_tail(&q->direct_queue, skb);
+		q->direct_pkts++;
+#ifdef CONFIG_NET_CLS_ACT
+	} else if (!cl) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+#endif
+	} else {
+		ret = qdisc_enqueue(skb, cl->un.leaf.q);
+		if (ret != NET_XMIT_SUCCESS) {
+			if (net_xmit_drop_count(ret)) {
+				sch->qstats.drops++;
+				cl->qstats.drops++;
+			}
+			return ret;
+		}
+		bstats_update(&cl->bstats, skb);
+		htb_activate(q, cl);
+	}
+
+	sch->q.qlen++;
+	return NET_XMIT_SUCCESS;
+}
+
+/*
+ * Account @bytes against the rate bucket of @cl: add @diff to the tokens,
+ * cap at cl->buffer, subtract the cost of the packet (qdisc_l2t on the
+ * rate table) and keep the result above -cl->mbuffer.
+ */
+static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
+{
+	long left = cl->tokens + diff;
+
+	if (left > cl->buffer)
+		left = cl->buffer;
+
+	left -= (long) qdisc_l2t(cl->rate, bytes);
+
+	if (left <= -cl->mbuffer)
+		left = 1 - cl->mbuffer;
+
+	cl->tokens = left;
+}
+
+/*
+ * Account @bytes against the ceil bucket of @cl: add @diff to the
+ * ctokens, cap at cl->cbuffer, subtract the cost of the packet
+ * (qdisc_l2t on the ceil table) and keep the result above -cl->mbuffer.
+ */
+static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
+{
+	long left = cl->ctokens + diff;
+
+	if (left > cl->cbuffer)
+		left = cl->cbuffer;
+
+	left -= (long) qdisc_l2t(cl->ceil, bytes);
+
+	if (left <= -cl->mbuffer)
+		left = 1 - cl->mbuffer;
+
+	cl->ctokens = left;
+}
+
+/**
+ * htb_charge_class - charges amount "bytes" to leaf and ancestors
+ *
+ * Routine assumes that a packet "bytes" long was dequeued from leaf cl
+ * borrowing from "level". It accounts bytes to the ceil leaky bucket for
+ * the leaf and all ancestors and to the rate bucket for ancestors at levels
+ * "level" and higher. It also handles a possible change of mode resulting
+ * from the update. Note that the mode can also increase here (MAY_BORROW to
+ * CAN_SEND) because we can use a more precise clock than the event queue
+ * here. In such a case we remove the class from the event queue first.
+ */
+static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
+			     int level, struct sk_buff *skb)
+{
+	int bytes = qdisc_pkt_len(skb);
+	enum htb_cmode old_mode;
+	long diff;
+
+	while (cl) {	/* walk from the leaf up to the root */
+		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		if (cl->level >= level) {
+			if (cl->level == level)
+				cl->xstats.lends++;
+			htb_accnt_tokens(cl, bytes, diff);
+		} else {
+			cl->xstats.borrows++;
+			cl->tokens += diff;	/* we moved t_c; update tokens */
+		}
+		htb_accnt_ctokens(cl, bytes, diff);
+		cl->t_c = q->now;
+
+		old_mode = cl->cmode;
+		diff = 0;	/* buckets are current as of q->now, so no extra time to add */
+		htb_change_class_mode(q, cl, &diff);
+		if (old_mode != cl->cmode) {
+			if (old_mode != HTB_CAN_SEND)
+				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+			if (cl->cmode != HTB_CAN_SEND)
+				htb_add_to_wait_tree(q, cl, diff);
+		}
+
+		/* update basic stats except for leaves which are already updated */
+		if (cl->level)
+			bstats_update(&cl->bstats, skb);
+
+		cl = cl->parent;
+	}
+}
+
+/**
+ * htb_do_events - make mode changes to classes at the level
+ *
+ * Scans the event queue for pending events and applies them. Returns the
+ * time of the next pending event (0 for no event in pq, q->now for too
+ * many events).
+ * Note: events with cl->pq_key <= q->now are applied.
+ */
+static psched_time_t htb_do_events(struct htb_sched *q, int level,
+				   unsigned long start)
+{
+	/* don't run for longer than 2 jiffies; 2 is used instead of
+	 * 1 to simplify things when jiffy is going to be incremented
+	 * too soon
+	 */
+	unsigned long stop_at = start + 2;
+	while (time_before(jiffies, stop_at)) {
+		struct htb_class *cl;
+		long diff;
+		struct rb_node *p = rb_first(&q->wait_pq[level]);	/* earliest pending event */
+
+		if (!p)
+			return 0;
+
+		cl = rb_entry(p, struct htb_class, pq_node);
+		if (cl->pq_key > q->now)
+			return cl->pq_key;
+
+		htb_safe_rb_erase(p, q->wait_pq + level);
+		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
+		htb_change_class_mode(q, cl, &diff);
+		if (cl->cmode != HTB_CAN_SEND)
+			htb_add_to_wait_tree(q, cl, diff);	/* re-queue for its next mode change */
+	}
+
+	/* too much load - let's continue after a break for scheduling */
+	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
+		pr_warning("htb: too many events!\n");
+		q->warned |= HTB_WARN_TOOMANYEVENTS;
+	}
+
+	return q->now;
+}
+
+/* Search the id-tree rooted at @n (classes linked through node[@prio]) for
+ * the class with the smallest classid that is >= @id. Returns that class'
+ * rb_node, or NULL if no such class exists.
+ */
+static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
+					      u32 id)
+{
+	struct rb_node *best = NULL;
+
+	for (;;) {
+		struct htb_class *cl;
+
+		if (!n)
+			return best;
+
+		cl = rb_entry(n, struct htb_class, node[prio]);
+		if (cl->common.classid == id)
+			return n;	/* exact match */
+
+		if (cl->common.classid < id) {
+			/* too small; anything suitable is to the right */
+			n = n->rb_right;
+			continue;
+		}
+
+		/* larger than @id: remember it, then look for a closer one */
+		best = n;
+		n = n->rb_left;
+	}
+}
+
+/**
+ * htb_lookup_leaf - returns next leaf class in DRR order
+ * @tree: feed tree to start from; must not be empty (BUG_ON otherwise)
+ * @prio: priority, selects node[prio] / feed[prio] inside each class
+ * @pptr: current position pointer belonging to @tree
+ * @pid: classid hint used to recover *@pptr after it was invalidated
+ *
+ * Find leaf where current feed pointers points to. Descends through inner
+ * classes using an explicit stack of at most TC_HTB_MAXDEPTH entries.
+ * Returns NULL (after WARN_ON) if a parent pointer is unexpectedly NULL or
+ * if no leaf is found within 65535 iterations.
+ */
+static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
+					 struct rb_node **pptr, u32 * pid)
+{
+	int i;
+	/* one stack frame per tree level being walked */
+	struct {
+		struct rb_node *root;
+		struct rb_node **pptr;
+		u32 *pid;
+	} stk[TC_HTB_MAXDEPTH], *sp = stk;
+
+	BUG_ON(!tree->rb_node);
+	sp->root = tree->rb_node;
+	sp->pptr = pptr;
+	sp->pid = pid;
+
+	/* iteration cap guards against looping forever */
+	for (i = 0; i < 65535; i++) {
+		if (!*sp->pptr && *sp->pid) {
+			/* ptr was invalidated but id is valid - try to recover
+			 * the original or next ptr
+			 */
+			*sp->pptr =
+			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
+		}
+		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
+				 * can become out of date quickly
+				 */
+		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
+			/* rewind to the leftmost node of this level's tree */
+			*sp->pptr = sp->root;
+			while ((*sp->pptr)->rb_left)
+				*sp->pptr = (*sp->pptr)->rb_left;
+			if (sp > stk) {
+				/* pop one level and advance the parent's pointer */
+				sp--;
+				if (!*sp->pptr) {
+					WARN_ON(1);
+					return NULL;
+				}
+				htb_next_rb_node(sp->pptr);
+			}
+		} else {
+			struct htb_class *cl;
+			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
+			/* level 0 is a leaf: done */
+			if (!cl->level)
+				return cl;
+			/* inner class: push its feed tree for @prio and descend */
+			(++sp)->root = cl->un.inner.feed[prio].rb_node;
+			sp->pptr = cl->un.inner.ptr + prio;
+			sp->pid = cl->un.inner.last_ptr_id + prio;
+		}
+	}
+	WARN_ON(1);
+	return NULL;
+}
+
+/* dequeues packet at given priority and level; call only if
+ * you are sure that there is active class at prio/level
+ *
+ * Returns the dequeued skb, or NULL if every leaf reachable from
+ * q->row[level] at @prio turned out to be empty or refused to hand out
+ * a packet. On success the leaf's DRR deficit is updated and the class
+ * chain is charged through htb_charge_class().
+ */
+static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
+					int level)
+{
+	struct sk_buff *skb = NULL;
+	struct htb_class *cl, *start;
+	/* look initial class up in the row */
+	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
+				     q->ptr[level] + prio,
+				     q->last_ptr_id[level] + prio);
+
+	/* loop until a packet is obtained or we are back at the first leaf */
+	do {
+next:
+		if (unlikely(!cl))
+			return NULL;
+
+		/* class can be empty - it is unlikely but can be true if leaf
+		 * qdisc drops packets in enqueue routine or if someone used
+		 * graft operation on the leaf since last dequeue;
+		 * simply deactivate and skip such class
+		 */
+		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
+			struct htb_class *next;
+			htb_deactivate(q, cl);
+
+			/* row/level might become empty */
+			if ((q->row_mask[level] & (1 << prio)) == 0)
+				return NULL;
+
+			next = htb_lookup_leaf(q->row[level] + prio,
+					       prio, q->ptr[level] + prio,
+					       q->last_ptr_id[level] + prio);
+
+			if (cl == start)	/* fix start if we just deleted it */
+				start = next;
+			cl = next;
+			/* re-check the new candidate without testing the
+			 * loop condition
+			 */
+			goto next;
+		}
+
+		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+		if (likely(skb != NULL))
+			break;
+
+		/* leaf has packets queued but returned none: warn and move
+		 * the round-robin pointer on to the next leaf
+		 */
+		qdisc_warn_nonwc("htb", cl->un.leaf.q);
+		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
+				  ptr[0]) + prio);
+		cl = htb_lookup_leaf(q->row[level] + prio, prio,
+				     q->ptr[level] + prio,
+				     q->last_ptr_id[level] + prio);
+
+	} while (cl != start);
+
+	if (likely(skb != NULL)) {
+		/* DRR: spend deficit; once it goes negative refill it by one
+		 * quantum and advance to the next class
+		 */
+		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
+		if (cl->un.leaf.deficit[level] < 0) {
+			cl->un.leaf.deficit[level] += cl->quantum;
+			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
+					  ptr[0]) + prio);
+		}
+		/* this used to be after charge_class but this constellation
+		 * gives us slightly better performance
+		 */
+		if (!cl->un.leaf.q->q.qlen)
+			htb_deactivate(q, cl);
+		htb_charge_class(q, cl, level, skb);
+	}
+	return skb;
+}
+
+/* Qdisc ->dequeue handler.
+ *
+ * Packets on q->direct_queue are served first. Otherwise each level is
+ * visited from 0 upwards: due wait-queue events are processed, then every
+ * priority set in q->row_mask[level] is tried via htb_dequeue_tree().
+ * If nothing can be sent the overlimit counter is bumped and either the
+ * watchdog is armed for the next event or q->work is scheduled.
+ * Returns the dequeued skb or NULL.
+ */
+static struct sk_buff *htb_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	struct htb_sched *q = qdisc_priv(sch);
+	int level;
+	psched_time_t next_event;
+	unsigned long start_at;
+
+	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
+	skb = __skb_dequeue(&q->direct_queue);
+	if (skb != NULL) {
+		/* common success exit, also jumped to from the level loop */
+ok:
+		qdisc_bstats_update(sch, skb);
+		qdisc_unthrottled(sch);
+		sch->q.qlen--;
+		return skb;
+	}
+
+	/* skb is NULL here; nothing queued at all */
+	if (!sch->q.qlen)
+		goto fin;
+	q->now = psched_get_time();
+	start_at = jiffies;
+
+	/* upper bound for the next wakeup: 5 seconds from now */
+	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+
+	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
+		/* common case optimization - skip event handler quickly */
+		int m;
+		psched_time_t event;
+
+		if (q->now >= q->near_ev_cache[level]) {
+			event = htb_do_events(q, level, start_at);
+			/* 0 means no pending event: re-check in one second */
+			if (!event)
+				event = q->now + PSCHED_TICKS_PER_SEC;
+			q->near_ev_cache[level] = event;
+		} else
+			event = q->near_ev_cache[level];
+
+		/* track the earliest event over all levels */
+		if (next_event > event)
+			next_event = event;
+
+		/* m holds the inverted row mask, so each zero bit is an
+		 * active priority; bits are set as priorities are tried
+		 */
+		m = ~q->row_mask[level];
+		while (m != (int)(-1)) {
+			int prio = ffz(m);
+
+			m |= 1 << prio;
+			skb = htb_dequeue_tree(q, prio, level);
+			if (likely(skb != NULL))
+				goto ok;
+		}
+	}
+	sch->qstats.overlimits++;
+	/* next_event <= now can happen when htb_do_events() returned q->now
+	 * (too many events); defer to the work item instead of the timer
+	 */
+	if (likely(next_event > q->now))
+		qdisc_watchdog_schedule(&q->watchdog, next_event);
+	else
+		schedule_work(&q->work);
+fin:
+	return skb;
+}
+
+/* Qdisc ->drop handler: walk the per-priority drop lists starting at index
+ * TC_HTB_NUMPRIO - 1 and ask each leaf qdisc to drop one packet. The first
+ * leaf that reports a non-zero dropped length wins; that length is
+ * returned. Returns 0 if no leaf dropped anything.
+ */
+static unsigned int htb_drop(struct Qdisc *sch)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	int prio = TC_HTB_NUMPRIO;
+
+	while (--prio >= 0) {
+		struct list_head *pos;
+
+		list_for_each(pos, q->drops + prio) {
+			struct htb_class *cl = list_entry(pos, struct htb_class,
+							  un.leaf.drop_list);
+			unsigned int len;
+
+			/* leaf qdisc without a ->drop method: skip it */
+			if (!cl->un.leaf.q->ops->drop)
+				continue;
+
+			len = cl->un.leaf.q->ops->drop(cl->un.leaf.q);
+			if (!len)
+				continue;
+
+			sch->q.qlen--;
+			/* leaf ran empty: take the class out of the active set */
+			if (!cl->un.leaf.q->q.qlen)
+				htb_deactivate(q, cl);
+			return len;
+		}
+	}
+	return 0;
+}
+
+/* Qdisc ->reset handler: reset all classes and the scheduler state.
+ * Always called under BH & queue lock.
+ */
+static void htb_reset(struct Qdisc *sch)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct hlist_node *pos;
+	struct htb_class *cl;
+	unsigned int i;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[i], common.hnode) {
+			if (!cl->level) {
+				/* leaf: flush its qdisc, detach from drop list */
+				if (cl->un.leaf.q)
+					qdisc_reset(cl->un.leaf.q);
+				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
+			} else {
+				/* inner class: forget feeds and pointers */
+				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+			}
+			cl->prio_activity = 0;
+			cl->cmode = HTB_CAN_SEND;
+		}
+	}
+
+	qdisc_watchdog_cancel(&q->watchdog);
+	__skb_queue_purge(&q->direct_queue);
+	sch->q.qlen = 0;
+
+	/* wipe the per-level rows, masks, wait queues and DRR pointers */
+	memset(q->row, 0, sizeof(q->row));
+	memset(q->row_mask, 0, sizeof(q->row_mask));
+	memset(q->wait_pq, 0, sizeof(q->wait_pq));
+	memset(q->ptr, 0, sizeof(q->ptr));
+
+	for (i = 0; i < TC_HTB_NUMPRIO; i++)
+		INIT_LIST_HEAD(&q->drops[i]);
+}
+
+/* Netlink attribute policy passed to nla_parse_nested() by htb_init() and
+ * htb_change_class(): PARMS and INIT carry fixed-size structs, the two
+ * rate tables are binary blobs limited to TC_RTAB_SIZE bytes.
+ */
+static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
+	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
+	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
+	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+};
+
+/* Work item scheduled from htb_dequeue(): reschedules the root qdisc of
+ * the qdisc recorded in the watchdog.
+ */
+static void htb_work_func(struct work_struct *work)
+{
+	struct htb_sched *q = container_of(work, struct htb_sched, work);
+
+	__netif_schedule(qdisc_root(q->watchdog.qdisc));
+}
+
+/* Qdisc ->init handler.
+ *
+ * Parses the mandatory TCA_HTB_INIT attribute, checks that the major
+ * version supplied by userspace matches HTB_VER >> 16 and sets up the
+ * class hash, drop lists, watchdog, work item and direct queue.
+ * Returns 0 on success, -EINVAL for missing/mismatching options, or the
+ * error from nla_parse_nested() / qdisc_class_hash_init().
+ */
+static int htb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_HTB_INIT + 1];
+	struct tc_htb_glob *gopt;
+	int prio;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_HTB_INIT]) {
+		pr_err("HTB: hey probably you have bad tc tool ?\n");
+		return -EINVAL;
+	}
+
+	gopt = nla_data(tb[TCA_HTB_INIT]);
+	if (gopt->version != HTB_VER >> 16) {
+		pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
+		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
+		return -EINVAL;
+	}
+
+	err = qdisc_class_hash_init(&q->clhash);
+	if (err < 0)
+		return err;
+
+	for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
+		INIT_LIST_HEAD(&q->drops[prio]);
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+	INIT_WORK(&q->work, htb_work_func);
+	skb_queue_head_init(&q->direct_queue);
+
+	/* some devices have zero tx_queue_len; use at least 2 */
+	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
+	if (q->direct_qlen < 2)
+		q->direct_qlen = 2;
+
+	/* rate2quantum is used as a divisor later on; never let it be < 1 */
+	q->rate2quantum = gopt->rate2quantum;
+	if (q->rate2quantum < 1)
+		q->rate2quantum = 1;
+	q->defcls = gopt->defcls;
+
+	return 0;
+}
+
+/* Qdisc ->dump handler: emit the global HTB parameters as a
+ * TCA_HTB_INIT attribute nested inside TCA_OPTIONS. The values are read
+ * under the root sleeping lock. Returns skb->len on success, -1 if the
+ * message could not be built.
+ */
+static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
+	struct htb_sched *q = qdisc_priv(sch);
+	struct nlattr *nest;
+	struct tc_htb_glob gopt;
+
+	spin_lock_bh(root_lock);
+
+	/* every field of gopt is assigned before it is copied out */
+	gopt.direct_pkts = q->direct_pkts;
+	gopt.version = HTB_VER;
+	gopt.rate2quantum = q->rate2quantum;
+	gopt.defcls = q->defcls;
+	gopt.debug = 0;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	/* NOTE(review): NLA_PUT presumably jumps to nla_put_failure when the
+	 * attribute does not fit - confirm against net/netlink.h
+	 */
+	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
+	nla_nest_end(skb, nest);
+
+	spin_unlock_bh(root_lock);
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(root_lock);
+	/* nest may be NULL here when nla_nest_start() itself failed */
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+/* Class ->dump handler: fill @tcm with the class' parent, handle and (for
+ * leaves) the handle of the attached qdisc, then emit the class
+ * parameters as TCA_HTB_PARMS nested inside TCA_OPTIONS. Runs under the
+ * root sleeping lock. Returns skb->len on success, -1 on failure.
+ */
+static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
+	struct nlattr *nest;
+	struct tc_htb_opt opt;
+
+	spin_lock_bh(root_lock);
+	/* a class without parent is reported as child of TC_H_ROOT */
+	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
+	tcm->tcm_handle = cl->common.classid;
+	if (!cl->level && cl->un.leaf.q)
+		tcm->tcm_info = cl->un.leaf.q->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	/* zero first so fields not set below are not copied out uninitialised */
+	memset(&opt, 0, sizeof(opt));
+
+	opt.rate = cl->rate->rate;
+	opt.buffer = cl->buffer;
+	opt.ceil = cl->ceil->rate;
+	opt.cbuffer = cl->cbuffer;
+	opt.quantum = cl->quantum;
+	opt.prio = cl->prio;
+	opt.level = cl->level;
+	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
+
+	nla_nest_end(skb, nest);
+	spin_unlock_bh(root_lock);
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(root_lock);
+	/* nest may be NULL here when nla_nest_start() itself failed */
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+/* Class ->dump_stats handler: refresh the snapshot fields (leaf queue
+ * length, tokens, ctokens) and copy basic, rate-estimator, queue and
+ * HTB-specific statistics into @d. Returns -1 as soon as one of the first
+ * three copies fails, otherwise the result of gnet_stats_copy_app().
+ */
+static int
+htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (!cl->level && cl->un.leaf.q)
+		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
+	cl->xstats.tokens = cl->tokens;
+	cl->xstats.ctokens = cl->ctokens;
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0)
+		return -1;
+	if (gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0)
+		return -1;
+	if (gnet_stats_copy_queue(d, &cl->qstats) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
+}
+
+/* Class ->graft handler: attach @new as the leaf qdisc of the class and
+ * hand the previous one back through @old. Only leaf classes accept a
+ * qdisc (-EINVAL otherwise). A NULL @new is replaced by a default pfifo;
+ * -ENOBUFS is returned if that cannot be created.
+ */
+static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (cl->level)
+		return -EINVAL;
+
+	if (new == NULL) {
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					cl->common.classid);
+		if (new == NULL)
+			return -ENOBUFS;
+	}
+
+	sch_tree_lock(sch);
+	*old = cl->un.leaf.q;
+	cl->un.leaf.q = new;
+	if (*old) {
+		/* remove the old qdisc's packets from the ancestors' queue
+		 * length before flushing it
+		 */
+		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+		qdisc_reset(*old);
+	}
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+/* Class ->leaf handler: return the qdisc attached to a leaf class, or
+ * NULL for an inner class.
+ */
+static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (cl->level)
+		return NULL;
+	return cl->un.leaf.q;
+}
+
+/* Class ->qlen_notify handler: called when the queue length of the leaf
+ * qdisc of class @arg changed. If the leaf is now empty the class is
+ * taken out of the active set.
+ *
+ * The class is only deactivated when it is currently active
+ * (prio_activity != 0), matching the guard used by htb_delete() and
+ * htb_change_class() before they call htb_deactivate(). This makes the
+ * notification idempotent: a second notification for an already inactive
+ * class is a no-op.
+ * NOTE(review): htb_deactivate() presumably expects an active class -
+ * confirm against its definition.
+ */
+static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (!cl->prio_activity)
+		return;
+
+	if (cl->un.leaf.q->q.qlen == 0)
+		htb_deactivate(qdisc_priv(sch), cl);
+}
+
+/* Class ->get handler: look the class up by @classid and take a reference
+ * on it. Returns the class pointer as an opaque handle, or 0 if there is
+ * no such class.
+ */
+static unsigned long htb_get(struct Qdisc *sch, u32 classid)
+{
+	struct htb_class *cl = htb_find(classid, sch);
+
+	if (!cl)
+		return 0;
+
+	cl->refcnt++;
+	return (unsigned long)cl;
+}
+
+/* Return 1 if @cl has a parent and that parent has at most one child
+ * (i.e. @cl is its last child), 0 otherwise - in particular for a root
+ * class, which has no parent.
+ */
+static inline int htb_parent_last_child(struct htb_class *cl)
+{
+	return cl->parent && !(cl->parent->children > 1);
+}
+
+/* Turn the parent of @cl back into a leaf class.
+ *
+ * Called by htb_delete() when @cl is the last child being removed.
+ * @new_q becomes the parent's leaf qdisc; &noop_qdisc is used if it is
+ * NULL. The parent's token buckets are refilled and its mode reset to
+ * HTB_CAN_SEND.
+ */
+static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
+			       struct Qdisc *new_q)
+{
+	struct htb_class *parent = cl->parent;
+
+	/* @cl must be an inactive leaf with a qdisc attached */
+	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
+
+	/* parent is waiting for an event at its old level: dequeue it from
+	 * that wait queue before the level changes
+	 */
+	if (parent->cmode != HTB_CAN_SEND)
+		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
+
+	parent->level = 0;
+	/* clear the inner-class data first, then initialise the leaf members
+	 * (both are accessed through parent->un)
+	 */
+	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
+	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
+	parent->tokens = parent->buffer;
+	parent->ctokens = parent->cbuffer;
+	parent->t_c = psched_get_time();
+	parent->cmode = HTB_CAN_SEND;
+}
+
+/* Release everything owned by @cl and free it: the leaf qdisc (leaf
+ * classes only), the rate estimator, both rate tables and the class'
+ * filter chain.
+ */
+static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
+{
+	if (!cl->level) {
+		/* a leaf is expected to always have a qdisc attached */
+		WARN_ON(!cl->un.leaf.q);
+		qdisc_destroy(cl->un.leaf.q);
+	}
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	qdisc_put_rtab(cl->rate);
+	qdisc_put_rtab(cl->ceil);
+
+	tcf_destroy_chain(&cl->filter_list);
+	kfree(cl);
+}
+
+/* Qdisc ->destroy handler: stop the deferred work and the watchdog, tear
+ * down all filter chains, destroy every class in the hash, then free the
+ * hash itself and purge the direct queue.
+ */
+static void htb_destroy(struct Qdisc *sch)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct hlist_node *n, *next;
+	struct htb_class *cl;
+	unsigned int i;
+
+	cancel_work_sync(&q->work);
+	qdisc_watchdog_cancel(&q->watchdog);
+	/* This line used to be after htb_destroy_class call below
+	 * and surprisingly it worked in 2.4. But it must precede it
+	 * because filter need its target class alive to be able to call
+	 * unbind_filter on it (without Oops).
+	 */
+	tcf_destroy_chain(&q->filter_list);
+
+	/* first pass: drop all per-class filter chains while every class
+	 * still exists
+	 */
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
+			tcf_destroy_chain(&cl->filter_list);
+	}
+	/* second pass: free the classes; the _safe iterator is needed
+	 * because htb_destroy_class() frees the entry being visited
+	 */
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+					  common.hnode)
+			htb_destroy_class(sch, cl);
+	}
+	qdisc_class_hash_destroy(&q->clhash);
+	__skb_queue_purge(&q->direct_queue);
+}
+
+/* Class ->delete handler.
+ *
+ * Unlinks the class from the hash, the active set and the wait queue and
+ * drops one reference; the class memory itself is released later from
+ * htb_put(). If the class was the last child of its parent, the parent is
+ * converted back into a leaf. Returns -EBUSY if the class still has
+ * children or bound filters, 0 otherwise.
+ */
+static int htb_delete(struct Qdisc *sch, unsigned long arg)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct htb_class *cl = (struct htb_class *)arg;
+	unsigned int qlen;
+	struct Qdisc *new_q = NULL;
+	int last_child = 0;
+
+	/* TODO: why don't allow to delete subtree ? references ? does
+	 * tc subsys guarantee us that in htb_destroy it holds no class
+	 * refs so that we can remove children safely there ?
+	 */
+	if (cl->children || cl->filter_cnt)
+		return -EBUSY;
+
+	/* the replacement leaf qdisc for the parent is created before the
+	 * tree lock is taken; new_q may be NULL if creation failed, in which
+	 * case htb_parent_to_leaf() falls back to noop_qdisc
+	 */
+	if (!cl->level && htb_parent_last_child(cl)) {
+		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+					  cl->parent->common.classid);
+		last_child = 1;
+	}
+
+	sch_tree_lock(sch);
+
+	if (!cl->level) {
+		/* flush the leaf and remove its packets from the ancestors'
+		 * queue length
+		 */
+		qlen = cl->un.leaf.q->q.qlen;
+		qdisc_reset(cl->un.leaf.q);
+		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
+	}
+
+	/* delete from hash and active; remainder in destroy_class */
+	qdisc_class_hash_remove(&q->clhash, &cl->common);
+	if (cl->parent)
+		cl->parent->children--;
+
+	if (cl->prio_activity)
+		htb_deactivate(q, cl);
+
+	if (cl->cmode != HTB_CAN_SEND)
+		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
+
+	if (last_child)
+		htb_parent_to_leaf(q, cl, new_q);
+
+	/* note: the decrement is the intended side effect of this check */
+	BUG_ON(--cl->refcnt == 0);
+	/*
+	 * This shouldn't happen: we "hold" one cops->get() when called
+	 * from tc_ctl_tclass; the destroy method is done from cops->put().
+	 */
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Class ->put handler: drop one reference on the class and destroy it
+ * when that was the last one.
+ */
+static void htb_put(struct Qdisc *sch, unsigned long arg)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (--cl->refcnt != 0)
+		return;
+
+	htb_destroy_class(sch, cl);
+}
+
+/* Class ->change handler: create a new class or modify an existing one.
+ *
+ * @sch:      the HTB qdisc
+ * @classid:  id of the class to create (ignored when *@arg is set)
+ * @parentid: id of the parent class, or TC_H_ROOT
+ * @tca:      netlink attributes; TCA_OPTIONS must carry TCA_HTB_PARMS
+ * @arg:      in: existing class (0 for "create"); out: the class
+ *
+ * References on the rate and ceil tables are taken early with
+ * qdisc_get_rtab(). On success they are stored in the class; on every
+ * error path they must be released again, which is what the "failure"
+ * label does. Returns 0 on success or a negative errno.
+ */
+static int htb_change_class(struct Qdisc *sch, u32 classid,
+			    u32 parentid, struct nlattr **tca,
+			    unsigned long *arg)
+{
+	int err = -EINVAL;
+	struct htb_sched *q = qdisc_priv(sch);
+	struct htb_class *cl = (struct htb_class *)*arg, *parent;
+	struct nlattr *opt = tca[TCA_OPTIONS];
+	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
+	struct nlattr *tb[__TCA_HTB_MAX];
+	struct tc_htb_opt *hopt;
+
+	/* extract all subattrs from opt attr */
+	if (!opt)
+		goto failure;
+
+	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
+	if (err < 0)
+		goto failure;
+
+	err = -EINVAL;
+	if (tb[TCA_HTB_PARMS] == NULL)
+		goto failure;
+
+	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
+
+	hopt = nla_data(tb[TCA_HTB_PARMS]);
+
+	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
+	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
+	if (!rtab || !ctab)
+		goto failure;
+
+	if (!cl) {		/* new class */
+		struct Qdisc *new_q;
+		int prio;
+		/* default rate estimator config, used when userspace did
+		 * not supply TCA_RATE
+		 */
+		struct {
+			struct nlattr		nla;
+			struct gnet_estimator	opt;
+		} est = {
+			.nla = {
+				.nla_len	= nla_attr_size(sizeof(est.opt)),
+				.nla_type	= TCA_RATE,
+			},
+			.opt = {
+				/* 4s interval, 16s averaging constant */
+				.interval	= 2,
+				.ewma_log	= 2,
+			},
+		};
+
+		/* check for valid classid */
+		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
+		    htb_find(classid, sch))
+			goto failure;
+
+		/* check maximal depth */
+		if (parent && parent->parent && parent->parent->level < 2) {
+			pr_err("htb: tree is too deep\n");
+			goto failure;
+		}
+		err = -ENOBUFS;
+		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+		if (!cl)
+			goto failure;
+
+		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+					qdisc_root_sleeping_lock(sch),
+					tca[TCA_RATE] ? : &est.nla);
+		if (err) {
+			kfree(cl);
+			goto failure;
+		}
+
+		cl->refcnt = 1;
+		cl->children = 0;
+		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
+		RB_CLEAR_NODE(&cl->pq_node);
+
+		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
+			RB_CLEAR_NODE(&cl->node[prio]);
+
+		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
+		 * so that can't be used inside of sch_tree_lock
+		 * -- thanks to Karlis Peisenieks
+		 */
+		new_q = qdisc_create_dflt(sch->dev_queue,
+					  &pfifo_qdisc_ops, classid);
+		sch_tree_lock(sch);
+		if (parent && !parent->level) {
+			unsigned int qlen = parent->un.leaf.q->q.qlen;
+
+			/* turn parent into inner node */
+			qdisc_reset(parent->un.leaf.q);
+			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
+			qdisc_destroy(parent->un.leaf.q);
+			if (parent->prio_activity)
+				htb_deactivate(q, parent);
+
+			/* remove from evt list because of level change */
+			if (parent->cmode != HTB_CAN_SEND) {
+				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
+				parent->cmode = HTB_CAN_SEND;
+			}
+			parent->level = (parent->parent ? parent->parent->level
+					 : TC_HTB_MAXDEPTH) - 1;
+			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+		}
+		/* leaf (we) needs elementary qdisc */
+		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
+
+		cl->common.classid = classid;
+		cl->parent = parent;
+
+		/* set class to be in HTB_CAN_SEND state */
+		cl->tokens = hopt->buffer;
+		cl->ctokens = hopt->cbuffer;
+		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
+		cl->t_c = psched_get_time();
+		cl->cmode = HTB_CAN_SEND;
+
+		/* attach to the hash list and parent's family */
+		qdisc_class_hash_insert(&q->clhash, &cl->common);
+		if (parent)
+			parent->children++;
+	} else {
+		if (tca[TCA_RATE]) {
+			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+						    qdisc_root_sleeping_lock(sch),
+						    tca[TCA_RATE]);
+			/* the tree lock is not held yet, so the common
+			 * failure path can be used; it drops the rtab/ctab
+			 * references taken above, which a plain return
+			 * would leak
+			 */
+			if (err)
+				goto failure;
+		}
+		sch_tree_lock(sch);
+	}
+
+	/* it used to be a nasty bug here, we have to check that node
+	 * is really leaf before changing cl->un.leaf !
+	 */
+	if (!cl->level) {
+		/* derive the quantum from the rate unless userspace set one;
+		 * the derived value is clamped to [1000, 200000]
+		 */
+		cl->quantum = rtab->rate.rate / q->rate2quantum;
+		if (!hopt->quantum && cl->quantum < 1000) {
+			pr_warning(
+			       "HTB: quantum of class %X is small. Consider r2q change.\n",
+			       cl->common.classid);
+			cl->quantum = 1000;
+		}
+		if (!hopt->quantum && cl->quantum > 200000) {
+			pr_warning(
+			       "HTB: quantum of class %X is big. Consider r2q change.\n",
+			       cl->common.classid);
+			cl->quantum = 200000;
+		}
+		if (hopt->quantum)
+			cl->quantum = hopt->quantum;
+		/* out-of-range priorities are clamped to the last valid one */
+		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
+			cl->prio = TC_HTB_NUMPRIO - 1;
+	}
+
+	cl->buffer = hopt->buffer;
+	cl->cbuffer = hopt->cbuffer;
+	/* swap in the new tables, releasing the ones held before (if any) */
+	if (cl->rate)
+		qdisc_put_rtab(cl->rate);
+	cl->rate = rtab;
+	if (cl->ceil)
+		qdisc_put_rtab(cl->ceil);
+	cl->ceil = ctab;
+	sch_tree_unlock(sch);
+
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	*arg = (unsigned long)cl;
+	return 0;
+
+failure:
+	/* release whichever table references were obtained */
+	if (rtab)
+		qdisc_put_rtab(rtab);
+	if (ctab)
+		qdisc_put_rtab(ctab);
+	return err;
+}
+
+/* Class ->tcf_chain handler: return the filter chain of class @arg, or the
+ * qdisc-wide chain when @arg is 0.
+ */
+static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (cl)
+		return &cl->filter_list;
+	return &q->filter_list;
+}
+
+/* Class ->bind_tcf handler: look up @classid and count one more filter
+ * bound to it. Returns the class as an opaque handle, or 0 if it does not
+ * exist.
+ */
+static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
+				     u32 classid)
+{
+	struct htb_class *cl = htb_find(classid, sch);
+
+	/* There used to be a check here,
+	 *	if (cl && !cl->level) return 0;
+	 * to prevent attaching filters to leaves. But at least the
+	 * tc_index filter uses this just to get the class for other
+	 * reasons, so we have to allow it.
+	 * ----
+	 * 19.6.2002 As Werner explained it is ok - bind filter is just
+	 * another way to "lock" the class - unlike "get" this lock can
+	 * be broken by class during destroy IIUC.
+	 */
+	if (!cl)
+		return 0;
+
+	cl->filter_cnt++;
+	return (unsigned long)cl;
+}
+
+/* Class ->unbind_tcf handler: undo one htb_bind_filter() on class @arg;
+ * a zero handle is ignored.
+ */
+static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
+{
+	struct htb_class *cl = (struct htb_class *)arg;
+
+	if (!cl)
+		return;
+
+	cl->filter_cnt--;
+}
+
+/* Class ->walk handler: call arg->fn for every class in the hash, skipping
+ * the first arg->skip entries. arg->count is advanced for every class
+ * visited or skipped; the walk ends early, with arg->stop set, when the
+ * callback returns a negative value.
+ */
+static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct htb_sched *q = qdisc_priv(sch);
+	struct hlist_node *pos;
+	struct htb_class *cl;
+	unsigned int bucket;
+
+	if (arg->stop)
+		return;
+
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[bucket],
+				     common.hnode) {
+			/* the callback only runs once the skip count is used up */
+			if (arg->count >= arg->skip &&
+			    arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/* Class operations of the HTB qdisc; referenced from htb_qdisc_ops.cl_ops. */
+static const struct Qdisc_class_ops htb_class_ops = {
+	.graft		=	htb_graft,
+	.leaf		=	htb_leaf,
+	.qlen_notify	=	htb_qlen_notify,
+	.get		=	htb_get,
+	.put		=	htb_put,
+	.change		=	htb_change_class,
+	.delete		=	htb_delete,
+	.walk		=	htb_walk,
+	.tcf_chain	=	htb_find_tcf,
+	.bind_tcf	=	htb_bind_filter,
+	.unbind_tcf	=	htb_unbind_filter,
+	.dump		=	htb_dump_class,
+	.dump_stats	=	htb_dump_class_stats,
+};
+
+/* Qdisc operations registered under the id "htb" by htb_module_init(). */
+static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
+	.cl_ops		=	&htb_class_ops,
+	.id		=	"htb",
+	.priv_size	=	sizeof(struct htb_sched),
+	.enqueue	=	htb_enqueue,
+	.dequeue	=	htb_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	htb_drop,
+	.init		=	htb_init,
+	.reset		=	htb_reset,
+	.destroy	=	htb_destroy,
+	.dump		=	htb_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module entry point: register the "htb" qdisc; returns the result of
+ * register_qdisc().
+ */
+static int __init htb_module_init(void)
+{
+	return register_qdisc(&htb_qdisc_ops);
+}
+/* Module exit point: unregister the "htb" qdisc. */
+static void __exit htb_module_exit(void)
+{
+	unregister_qdisc(&htb_qdisc_ops);
+}
+
+module_init(htb_module_init)
+module_exit(htb_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
new file mode 100644
index 00000000..bce16652
--- /dev/null
+++ b/net/sched/sch_ingress.c
@@ -0,0 +1,143 @@
+/* net/sched/sch_ingress.c - Ingress qdisc
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Jamal Hadi Salim 1999
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+/* Private data of the ingress qdisc. */
+struct ingress_qdisc_data {
+	struct tcf_proto	*filter_list;	/* classifier chain run by ingress_enqueue() */
+};
+
+/* ------------------------- Class/flow operations ------------------------- */
+
+/* Class ->leaf handler: always returns NULL. */
+static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+/* Class ->get handler: the handle is the minor part of @classid plus one.
+ * NOTE(review): the +1 presumably keeps the handle non-zero - confirm
+ * against the callers in the tc core.
+ */
+static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
+{
+	return TC_H_MIN(classid) + 1;
+}
+
+/* Class ->bind_tcf handler: same as ingress_get(); @parent is unused. */
+static unsigned long ingress_bind_filter(struct Qdisc *sch,
+					 unsigned long parent, u32 classid)
+{
+	return ingress_get(sch, classid);
+}
+
+/* Class ->put and ->unbind_tcf handler: intentionally empty, there is
+ * nothing to release.
+ */
+static void ingress_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+/* Class ->walk handler: intentionally empty, no classes are reported. */
+static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+}
+
+/* Class ->tcf_chain handler: there is a single filter chain for the whole
+ * qdisc; @cl is ignored.
+ */
+static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct ingress_qdisc_data *priv = qdisc_priv(sch);
+	struct tcf_proto **chain = &priv->filter_list;
+
+	return chain;
+}
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+/* Qdisc ->enqueue handler: classify @skb with the qdisc's filter chain and
+ * map the classifier verdict to one of TC_ACT_SHOT, TC_ACT_STOLEN or
+ * TC_ACT_OK, which is returned. Byte/packet statistics are updated for
+ * every packet regardless of the verdict.
+ */
+static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct ingress_qdisc_data *p = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	result = tc_classify(skb, p->filter_list, &res);
+
+	qdisc_bstats_update(sch, skb);
+	switch (result) {
+	case TC_ACT_SHOT:
+		result = TC_ACT_SHOT;
+		sch->qstats.drops++;
+		break;
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		/* both are reported as "stolen" */
+		result = TC_ACT_STOLEN;
+		break;
+	case TC_ACT_RECLASSIFY:
+	case TC_ACT_OK:
+		/* record the minor part of the matched classid in the skb */
+		skb->tc_index = TC_H_MIN(res.classid);
+		/* fall through */
+	default:
+		/* any other verdict lets the packet pass */
+		result = TC_ACT_OK;
+		break;
+	}
+
+	return result;
+}
+
+/* ------------------------------------------------------------- */
+
+/* Qdisc ->destroy handler: release the filter chain. */
+static void ingress_destroy(struct Qdisc *sch)
+{
+	struct ingress_qdisc_data *priv = qdisc_priv(sch);
+
+	tcf_destroy_chain(&priv->filter_list);
+}
+
+/* Qdisc ->dump handler: emit an empty TCA_OPTIONS nest. Returns skb->len
+ * on success, -1 if the nest could not be started.
+ */
+static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct nlattr *nest = nla_nest_start(skb, TCA_OPTIONS);
+
+	if (nest == NULL) {
+		nla_nest_cancel(skb, nest);
+		return -1;
+	}
+
+	nla_nest_end(skb, nest);
+	return skb->len;
+}
+
+/* Class operations of the ingress qdisc; ingress_put() doubles as the
+ * ->unbind_tcf handler.
+ */
+static const struct Qdisc_class_ops ingress_class_ops = {
+	.leaf		=	ingress_leaf,
+	.get		=	ingress_get,
+	.put		=	ingress_put,
+	.walk		=	ingress_walk,
+	.tcf_chain	=	ingress_find_tcf,
+	.bind_tcf	=	ingress_bind_filter,
+	.unbind_tcf	=	ingress_put,
+};
+
+/* Qdisc operations registered under the id "ingress"; no ->dequeue is
+ * provided.
+ */
+static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
+	.cl_ops		=	&ingress_class_ops,
+	.id		=	"ingress",
+	.priv_size	=	sizeof(struct ingress_qdisc_data),
+	.enqueue	=	ingress_enqueue,
+	.destroy	=	ingress_destroy,
+	.dump		=	ingress_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module entry point: register the "ingress" qdisc; returns the result of
+ * register_qdisc().
+ */
+static int __init ingress_module_init(void)
+{
+	return register_qdisc(&ingress_qdisc_ops);
+}
+
+/* Module exit point: unregister the "ingress" qdisc. */
+static void __exit ingress_module_exit(void)
+{
+	unregister_qdisc(&ingress_qdisc_ops);
+}
+
+module_init(ingress_module_init)
+module_exit(ingress_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
new file mode 100644
index 00000000..ec5cbc84
--- /dev/null
+++ b/net/sched/sch_mq.c
@@ -0,0 +1,240 @@
+/*
+ * net/sched/sch_mq.c		Classful multiqueue dummy scheduler
+ *
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+/* Private data of the mq qdisc. */
+struct mq_sched {
+	/* per-tx-queue child qdiscs pre-allocated by mq_init(); the array is
+	 * freed and the pointer set to NULL once mq_attach() has grafted them
+	 */
+	struct Qdisc		**qdiscs;
+};
+
+static void mq_destroy(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mq_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	if (!priv->qdiscs)
+		return;
+	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
+		qdisc_destroy(priv->qdiscs[ntx]);
+	kfree(priv->qdiscs);
+}
+
+/* Qdisc ->init handler.
+ *
+ * mq can only be the root qdisc of a multiqueue device (-EOPNOTSUPP
+ * otherwise). One pfifo_fast child is pre-allocated per tx queue; they
+ * are attached later by mq_attach(). Returns -ENOMEM if the array or any
+ * child cannot be allocated, after releasing what was allocated so far
+ * through mq_destroy().
+ */
+static int mq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mq_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	if (sch->parent != TC_H_ROOT || !netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	/* pre-allocate qdiscs, attachment can't fail */
+	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+			       GFP_KERNEL);
+	if (!priv->qdiscs)
+		return -ENOMEM;
+
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, ntx);
+		struct Qdisc *child;
+
+		/* child handle: our major number, minor = queue index + 1 */
+		child = qdisc_create_dflt(txq, &pfifo_fast_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(ntx + 1)));
+		if (!child) {
+			mq_destroy(sch);
+			return -ENOMEM;
+		}
+		priv->qdiscs[ntx] = child;
+	}
+
+	sch->flags |= TCQ_F_MQROOT;
+	return 0;
+}
+
+/* Qdisc ->attach handler: graft every pre-allocated child onto its tx
+ * queue, destroy whatever qdisc was there before, then drop the now
+ * unneeded array.
+ */
+static void mq_attach(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mq_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		struct Qdisc *child = priv->qdiscs[ntx];
+		struct Qdisc *prev = dev_graft_qdisc(child->dev_queue, child);
+
+		if (prev)
+			qdisc_destroy(prev);
+	}
+
+	kfree(priv->qdiscs);
+	priv->qdiscs = NULL;
+}
+
+/* Qdisc ->dump handler: mq has no options of its own; it recomputes its
+ * queue length and statistics as the sum over the sleeping qdiscs of all
+ * tx queues. Each child is sampled under its own qdisc lock. Always
+ * returns 0.
+ */
+static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx;
+
+	sch->q.qlen = 0;
+	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		struct Qdisc *child = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+		spinlock_t *lock = qdisc_lock(child);
+
+		spin_lock_bh(lock);
+		sch->q.qlen		+= child->q.qlen;
+		sch->bstats.bytes	+= child->bstats.bytes;
+		sch->bstats.packets	+= child->bstats.packets;
+		sch->qstats.qlen	+= child->qstats.qlen;
+		sch->qstats.backlog	+= child->qstats.backlog;
+		sch->qstats.drops	+= child->qstats.drops;
+		sch->qstats.requeues	+= child->qstats.requeues;
+		sch->qstats.overlimits	+= child->qstats.overlimits;
+		spin_unlock_bh(lock);
+	}
+
+	return 0;
+}
+
+/* Map class handle @cl (1-based) to the device's tx queue. Returns NULL if
+ * the handle is out of range; a handle of 0 wraps around in the unsigned
+ * subtraction and is therefore rejected as well.
+ */
+static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1;
+
+	return ntx < dev->num_tx_queues ? netdev_get_tx_queue(dev, ntx) : NULL;
+}
+
+/* Class ->select_queue handler: pick the tx queue named by the minor part
+ * of tcm->tcm_parent, falling back to queue 0 when that does not name a
+ * valid queue.
+ */
+static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
+					    struct tcmsg *tcm)
+{
+	unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
+	struct netdev_queue *txq = mq_queue_get(sch, ntx);
+
+	if (txq)
+		return txq;
+
+	return netdev_get_tx_queue(qdisc_dev(sch), 0);
+}
+
+/* Class ->graft handler: replace the qdisc of the tx queue behind class
+ * @cl with @new and return the previous one through @old. A running
+ * device is deactivated around the switch and re-activated afterwards.
+ * Always returns 0.
+ */
+static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+		    struct Qdisc **old)
+{
+	/* NOTE(review): assumes @cl is a valid handle, i.e. mq_queue_get()
+	 * does not return NULL here - confirm against the callers
+	 */
+	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	*old = dev_graft_qdisc(dev_queue, new);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+	return 0;
+}
+
+/* Class ->leaf handler: return the sleeping qdisc of the tx queue behind
+ * class @cl. The queue lookup result is dereferenced without a NULL check.
+ */
+static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	return mq_queue_get(sch, cl)->qdisc_sleeping;
+}
+
+/* Class ->get handler: the handle is the minor part of @classid if it
+ * names an existing tx queue, 0 otherwise.
+ */
+static unsigned long mq_get(struct Qdisc *sch, u32 classid)
+{
+	unsigned int ntx = TC_H_MIN(classid);
+
+	return mq_queue_get(sch, ntx) ? ntx : 0;
+}
+
+/* Class ->put handler: intentionally empty, there is nothing to release. */
+static void mq_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+/* Class ->dump handler: every mq class is a direct child of the root; the
+ * minor part of @cl is OR-ed into the handle already present in @tcm and
+ * the handle of the queue's sleeping qdisc is reported in tcm_info.
+ * Always returns 0.
+ */
+static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{
+	/* NOTE(review): assumes @cl is a valid handle, dev_queue is
+	 * dereferenced without a NULL check
+	 */
+	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+	tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+	return 0;
+}
+
+/* Class ->dump_stats handler: report the basic and queue statistics of the
+ * sleeping qdisc attached to the tx queue behind class @cl. Returns 0 on
+ * success, -1 if either copy fails.
+ */
+static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+			       struct gnet_dump *d)
+{
+	struct Qdisc *child = mq_queue_get(sch, cl)->qdisc_sleeping;
+
+	/* refresh the queue length snapshot before copying it out */
+	child->qstats.qlen = child->q.qlen;
+
+	if (gnet_stats_copy_basic(d, &child->bstats) < 0)
+		return -1;
+	if (gnet_stats_copy_queue(d, &child->qstats) < 0)
+		return -1;
+	return 0;
+}
+
+/* Class ->walk handler: call arg->fn once per tx queue, starting at index
+ * arg->skip, with the 1-based class handle. arg->count starts at
+ * arg->skip and is advanced after each successful callback; a negative
+ * callback result sets arg->stop and ends the walk.
+ */
+static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx;
+
+	if (arg->stop)
+		return;
+
+	arg->count = arg->skip;
+	ntx = arg->skip;
+	while (ntx < dev->num_tx_queues) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+		ntx++;
+	}
+}
+
+/* Class operations of the mq qdisc; referenced from mq_qdisc_ops.cl_ops. */
+static const struct Qdisc_class_ops mq_class_ops = {
+	.select_queue	= mq_select_queue,
+	.graft		= mq_graft,
+	.leaf		= mq_leaf,
+	.get		= mq_get,
+	.put		= mq_put,
+	.walk		= mq_walk,
+	.dump		= mq_dump_class,
+	.dump_stats	= mq_dump_class_stats,
+};
+
+/* Qdisc operations of the "mq" scheduler. It has no ->enqueue/->dequeue of
+ * its own, and unlike the other ops tables in this directory it is not
+ * static.
+ */
+struct Qdisc_ops mq_qdisc_ops __read_mostly = {
+	.cl_ops		= &mq_class_ops,
+	.id		= "mq",
+	.priv_size	= sizeof(struct mq_sched),
+	.init		= mq_init,
+	.destroy	= mq_destroy,
+	.attach		= mq_attach,
+	.dump		= mq_dump,
+	.owner		= THIS_MODULE,
+};
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 00000000..59b26b8f
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,418 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+	struct Qdisc		**qdiscs;	/* built by init, freed by attach */
+	int hw_owned;	/* set to 1 by init when qopt->hw is given */
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{	/* Also used as the error path of mqprio_init(). */
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	if (priv->qdiscs) {	/* NULL once mqprio_attach() has run */
+		for (ntx = 0;
+		     ntx < dev->num_tx_queues && priv->qdiscs[ntx];
+		     ntx++)	/* stops at the first slot that was never filled */
+			qdisc_destroy(priv->qdiscs[ntx]);
+		kfree(priv->qdiscs);
+	}
+
+	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+		dev->netdev_ops->ndo_setup_tc(dev, 0);	/* request 0 tcs */
+	else
+		netdev_set_num_tc(dev, 0);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{	/* Returns 0 if qopt is acceptable for dev, -EINVAL otherwise. */
+	int i, j;
+
+	/* Verify num_tc is not out of max range */
+	if (qopt->num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	/* Verify priority mapping uses valid tcs */
+	for (i = 0; i < TC_BITMASK + 1; i++) {
+		if (qopt->prio_tc_map[i] >= qopt->num_tc)
+			return -EINVAL;
+	}
+
+	/* Hardware offload requested but the driver has no ndo_setup_tc */
+	if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+		return -EINVAL;
+
+	/* If hw owned, count and offset are taken from the LLD, so
+	 * there is no reason to verify them here.
+	 */
+	if (qopt->hw)
+		return 0;
+
+	for (i = 0; i < qopt->num_tc; i++) {
+		unsigned int last = qopt->offset[i] + qopt->count[i];
+
+		/* Verify the queue range lies within the real tx queues; last
+		 * equal to real_num_tx_queues means the last queue is in use.
+		 */
+		if (qopt->offset[i] >= dev->real_num_tx_queues ||
+		    !qopt->count[i] ||
+		    last > dev->real_num_tx_queues)
+			return -EINVAL;
+
+		/* No overlap: every later tc must start at or after "last" */
+		for (j = i + 1; j < qopt->num_tc; j++) {
+			if (last > qopt->offset[j])
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{	/* Validate opt and prepare per-queue qdiscs; 0 or a negative errno. */
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct netdev_queue *dev_queue;
+	struct Qdisc *qdisc;
+	int i, err = -EOPNOTSUPP;
+	struct tc_mqprio_qopt *qopt = NULL;
+
+	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+	if (sch->parent != TC_H_ROOT)	/* only usable as root qdisc */
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	if (!opt || nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+	if (mqprio_parse_opt(dev, qopt))
+		return -EINVAL;
+
+	/* Pre-allocate the qdiscs here so that attachment can't fail */
+	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+			       GFP_KERNEL);
+	if (priv->qdiscs == NULL) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		dev_queue = netdev_get_tx_queue(dev, i);
+		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)));
+		if (qdisc == NULL) {
+			err = -ENOMEM;
+			goto err;	/* earlier slots are freed by destroy */
+		}
+		priv->qdiscs[i] = qdisc;
+	}
+
+	/* If the mqprio options indicate that hardware should own
+	 * the queue mapping then run ndo_setup_tc, otherwise use the
+	 * supplied and verified mapping.
+	 */
+	if (qopt->hw) {
+		priv->hw_owned = 1;	/* set before the call: destroy reads it */
+		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+		if (err)
+			goto err;
+	} else {
+		netdev_set_num_tc(dev, qopt->num_tc);
+		for (i = 0; i < qopt->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    qopt->count[i], qopt->offset[i]);
+	}
+
+	/* Always use supplied priority mappings */
+	for (i = 0; i < TC_BITMASK + 1; i++)
+		netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+	sch->flags |= TCQ_F_MQROOT;
+	return 0;
+
+err:	/* mqprio_destroy() frees the qdiscs and sets the tc count to 0 */
+	mqprio_destroy(sch);
+	return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{	/* Hand every qdisc prepared by mqprio_init() to its tx queue. */
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	unsigned int ntx;
+
+	/* Attach underlying qdisc */
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = priv->qdiscs[ntx];
+		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		if (qdisc)	/* the qdisc handed back by the graft */
+			qdisc_destroy(qdisc);
+	}
+	kfree(priv->qdiscs);
+	priv->qdiscs = NULL;	/* makes mqprio_destroy() skip the array */
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+					     unsigned long cl)
+{	/* Map class id cl to a tx queue; NULL if cl does not name one. */
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+	if (ntx >= dev->num_tx_queues)	/* cl <= num_tc wraps and lands here */
+		return NULL;
+	return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+		    struct Qdisc **old)
+{	/* Replace the qdisc of class cl by new; 0 or -EINVAL. */
+	struct net_device *dev = qdisc_dev(sch);
+	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+	if (!dev_queue)		/* cl does not map to a tx queue */
+		return -EINVAL;
+
+	if (dev->flags & IFF_UP)	/* device is deactivated around the swap */
+		dev_deactivate(dev);
+
+	*old = dev_graft_qdisc(dev_queue, new);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{	/* Emit TCA_OPTIONS into skb; returns skb->len, or -1 on failure. */
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_mqprio_qopt opt = { 0 };
+	struct Qdisc *qdisc;
+	unsigned int i;
+
+	sch->q.qlen = 0;	/* root totals are rebuilt on every dump */
+	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		spin_lock_bh(qdisc_lock(qdisc));	/* per-queue lock */
+		sch->q.qlen		+= qdisc->q.qlen;
+		sch->bstats.bytes	+= qdisc->bstats.bytes;
+		sch->bstats.packets	+= qdisc->bstats.packets;
+		sch->qstats.qlen	+= qdisc->qstats.qlen;
+		sch->qstats.backlog	+= qdisc->qstats.backlog;
+		sch->qstats.drops	+= qdisc->qstats.drops;
+		sch->qstats.requeues	+= qdisc->qstats.requeues;
+		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+
+	opt.num_tc = netdev_get_num_tc(dev);	/* options come from dev state */
+	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+	opt.hw = priv->hw_owned;
+
+	for (i = 0; i < netdev_get_num_tc(dev); i++) {
+		opt.count[i] = dev->tc_to_txq[i].count;
+		opt.offset[i] = dev->tc_to_txq[i].offset;
+	}
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+nla_put_failure:	/* presumably the jump target used by NLA_PUT */
+	nlmsg_trim(skb, b);	/* cut skb back to the tail saved on entry */
+	return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *txq = mqprio_queue_get(sch, cl);
+
+	/* Only classes that map to a tx queue carry a leaf qdisc; for any
+	 * other class id mqprio_queue_get() yields NULL and so do we.
+	 */
+	return txq ? txq->qdisc_sleeping : NULL;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int minor = TC_H_MIN(classid);
+	unsigned int nclasses = dev->num_tx_queues + netdev_get_num_tc(dev);
+
+	/* One class per traffic class plus one per tx queue; 0 means none. */
+	return minor > nclasses ? 0 : minor;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{	/* Intentionally empty: mqprio_get() acquires nothing to release. */
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{	/* Fill tcm for class cl; skb is not used here.  Always returns 0. */
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (cl <= netdev_get_num_tc(dev)) {	/* virtual per-tc class */
+		tcm->tcm_parent = TC_H_ROOT;
+		tcm->tcm_info = 0;
+	} else {	/* per-queue class: parent is the tc that covers it */
+		int i;
+		struct netdev_queue *dev_queue;
+
+		dev_queue = mqprio_queue_get(sch, cl);	/* no NULL check */
+		tcm->tcm_parent = 0;	/* stays 0 if no tc covers the queue */
+		for (i = 0; i < netdev_get_num_tc(dev); i++) {
+			struct netdev_tc_txq tc = dev->tc_to_txq[i];
+			int q_idx = cl - netdev_get_num_tc(dev); /* 1-based */
+
+			if (q_idx > tc.offset &&
+			    q_idx <= tc.offset + tc.count) {
+				tcm->tcm_parent =
+					TC_H_MAKE(TC_H_MAJ(sch->handle),
+						  TC_H_MIN(i + 1));
+				break;
+			}
+		}
+		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+	}
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				   struct gnet_dump *d)
+	__releases(d->lock)
+	__acquires(d->lock)
+{	/* Copy the stats of class cl into d; returns 0, or -1 on failure. */
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (cl <= netdev_get_num_tc(dev)) {	/* virtual class: sum its queues */
+		int i;
+		struct Qdisc *qdisc;
+		struct gnet_stats_queue qstats = {0};
+		struct gnet_stats_basic_packed bstats = {0};
+		struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+		/* Drop the lock here; it is reclaimed before the statistics
+		 * are completed.  This is required because the d->lock we
+		 * hold here is the lock on dev_queue->qdisc_sleeping, which
+		 * is also acquired below.
+		 */
+		spin_unlock_bh(d->lock);
+
+		for (i = tc.offset; i < tc.offset + tc.count; i++) {
+			qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+			spin_lock_bh(qdisc_lock(qdisc));
+			bstats.bytes      += qdisc->bstats.bytes;
+			bstats.packets    += qdisc->bstats.packets;
+			qstats.qlen       += qdisc->qstats.qlen;
+			qstats.backlog    += qdisc->qstats.backlog;
+			qstats.drops      += qdisc->qstats.drops;
+			qstats.requeues   += qdisc->qstats.requeues;
+			qstats.overlimits += qdisc->qstats.overlimits;
+			spin_unlock_bh(qdisc_lock(qdisc));
+		}
+		/* Reclaim root sleeping lock before completing stats */
+		spin_lock_bh(d->lock);
+		if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+		    gnet_stats_copy_queue(d, &qstats) < 0)
+			return -1;
+	} else {	/* per-queue class: report that queue's qdisc */
+		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+		sch = dev_queue->qdisc_sleeping;	/* no NULL check */
+		sch->qstats.qlen = sch->q.qlen;		/* refresh first */
+		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+		    gnet_stats_copy_queue(d, &sch->qstats) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{	/* Call arg->fn for every class id, starting after arg->skip. */
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx;
+
+	if (arg->stop)
+		return;
+
+	/* Walk hierarchy with a virtual class per tc */
+	arg->count = arg->skip;
+	for (ntx = arg->skip;
+	     ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+	     ntx++) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {	/* ids are 1-based */
+			arg->stop = 1;	/* callback asked to end the walk */
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {	/* class hooks */
+	.graft		= mqprio_graft,
+	.leaf		= mqprio_leaf,
+	.get		= mqprio_get,
+	.put		= mqprio_put,	/* empty function */
+	.walk		= mqprio_walk,
+	.dump		= mqprio_dump_class,
+	.dump_stats	= mqprio_dump_class_stats,
+};
+
+static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+	.cl_ops		= &mqprio_class_ops,
+	.id		= "mqprio",
+	.priv_size	= sizeof(struct mqprio_sched),
+	.init		= mqprio_init,
+	.destroy	= mqprio_destroy,	/* also the error path of init */
+	.attach		= mqprio_attach,
+	.dump		= mqprio_dump,
+	.owner		= THIS_MODULE,	/* no enqueue/dequeue hook is set here */
+};
+
+static int __init mqprio_module_init(void)
+{	/* Module entry point: the result of register_qdisc() is passed on. */
+	return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{	/* Module exit point: undoes the registration done at init time. */
+	unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
new file mode 100644
index 00000000..edc1950e
--- /dev/null
+++ b/net/sched/sch_multiq.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Alexander Duyck <alexander.h.duyck@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct multiq_sched_data {
+	u16 bands;	/* bands in use; set from real_num_tx_queues in tune */
+	u16 max_bands;	/* length of queues[]; set from num_tx_queues in init */
+	u16 curband;	/* band served last by multiq_dequeue() */
+	struct tcf_proto *filter_list;	/* chain passed to tc_classify() */
+	struct Qdisc **queues;	/* one child per band, &noop_qdisc if unused */
+};
+
+
+static struct Qdisc *
+multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{	/* Pick the child qdisc for skb; NULL (with *qerr set) to not queue. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	u32 band;
+	struct tcf_result res;
+	int err;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	err = tc_classify(skb, q->filter_list, &res);
+#ifdef CONFIG_NET_CLS_ACT
+	switch (err) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;	/* fall through */
+	case TC_ACT_SHOT:
+		return NULL;
+	}
+#endif
+	band = skb_get_queue_mapping(skb);	/* band comes from skb, not res */
+
+	if (band >= q->bands)	/* out-of-range mappings use band 0 */
+		return q->queues[0];
+
+	return q->queues[band];
+}
+
+static int
+multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{	/* Queue skb on the child chosen by multiq_classify(). */
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = multiq_classify(skb, sch, &ret);
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+
+		if (ret & __NET_XMIT_BYPASS)	/* only then counted as a drop */
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc_enqueue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;	/* qlen here counts packets held by children */
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
+{	/* Return the next skb, trying each band at most once, or NULL. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	struct sk_buff *skb;
+	int band;
+
+	for (band = 0; band < q->bands; band++) {
+		/* cycle through bands to ensure fairness */
+		q->curband++;	/* the search starts after the last served band */
+		if (q->curband >= q->bands)
+			q->curband = 0;
+
+		/* Check that target subqueue is available before
+		 * pulling an skb to avoid head-of-line blocking.
+		 */
+		if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+			qdisc = q->queues[q->curband];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				qdisc_bstats_update(sch, skb);
+				sch->q.qlen--;
+				return skb;
+			}
+		}
+	}
+	return NULL;
+
+}
+
+static struct sk_buff *multiq_peek(struct Qdisc *sch)
+{	/* Same band order as multiq_dequeue(), but q->curband is not changed. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned int curband = q->curband;	/* local copy only */
+	struct Qdisc *qdisc;
+	struct sk_buff *skb;
+	int band;
+
+	for (band = 0; band < q->bands; band++) {
+		/* cycle through bands to ensure fairness */
+		curband++;
+		if (curband >= q->bands)
+			curband = 0;
+
+		/* Check that target subqueue is available before
+		 * peeking at an skb to avoid head-of-line blocking.
+		 */
+		if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) {
+			qdisc = q->queues[curband];
+			skb = qdisc->ops->peek(qdisc);
+			if (skb)
+				return skb;
+		}
+	}
+	return NULL;
+
+}
+
+static unsigned int multiq_drop(struct Qdisc *sch)
+{	/* Drop one packet; returns what the child's drop() returned, or 0. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+	unsigned int len;
+	struct Qdisc *qdisc;
+
+	for (band = q->bands - 1; band >= 0; band--) {	/* highest band first */
+		qdisc = q->queues[band];
+		if (qdisc->ops->drop) {	/* children without drop() are skipped */
+			len = qdisc->ops->drop(qdisc);
+			if (len != 0) {
+				sch->q.qlen--;
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
+
+static void
+multiq_reset(struct Qdisc *sch)
+{	/* Reset every active band and the round-robin position. */
+	u16 band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	for (band = 0; band < q->bands; band++)
+		qdisc_reset(q->queues[band]);
+	sch->q.qlen = 0;
+	q->curband = 0;
+}
+
+static void
+multiq_destroy(struct Qdisc *sch)
+{	/* Release the filter chain, the active children and the array. */
+	int band;
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	for (band = 0; band < q->bands; band++)	/* bands, not max_bands */
+		qdisc_destroy(q->queues[band]);
+
+	kfree(q->queues);
+}
+
+static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
+{	/* Size the band set to the device's real tx queues; 0 or -errno. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct tc_multiq_qopt *qopt;
+	int i;
+
+	if (!netif_is_multiqueue(qdisc_dev(sch)))
+		return -EOPNOTSUPP;
+	if (nla_len(opt) < sizeof(*qopt))	/* opt is not NULL-checked here */
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+
+	qopt->bands = qdisc_dev(sch)->real_num_tx_queues; /* overrides input */
+
+	sch_tree_lock(sch);
+	q->bands = qopt->bands;
+	for (i = q->bands; i < q->max_bands; i++) {	/* retire unused bands */
+		if (q->queues[i] != &noop_qdisc) {
+			struct Qdisc *child = q->queues[i];
+			q->queues[i] = &noop_qdisc;
+			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_destroy(child);
+		}
+	}
+
+	sch_tree_unlock(sch);
+
+	for (i = 0; i < q->bands; i++) {	/* fill still-empty active bands */
+		if (q->queues[i] == &noop_qdisc) {
+			struct Qdisc *child, *old;
+			child = qdisc_create_dflt(sch->dev_queue,
+						  &pfifo_qdisc_ops,
+						  TC_H_MAKE(sch->handle,
+							    i + 1));
+			if (child) {	/* on failure the band stays noop */
+				sch_tree_lock(sch);
+				old = q->queues[i];
+				q->queues[i] = child;
+
+				if (old != &noop_qdisc) {
+					qdisc_tree_decrease_qlen(old,
+								 old->q.qlen);
+					qdisc_destroy(old);
+				}
+				sch_tree_unlock(sch);
+			}
+		}
+	}
+	return 0;
+}
+
+static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
+{	/* Allocate the band array, then let multiq_tune() populate it. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int i, err;
+
+	q->queues = NULL;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	q->max_bands = qdisc_dev(sch)->num_tx_queues;
+
+	q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
+	if (!q->queues)
+		return -ENOBUFS;
+	for (i = 0; i < q->max_bands; i++)
+		q->queues[i] = &noop_qdisc;	/* marks the band as unused */
+
+	err = multiq_tune(sch, opt);
+
+	if (err)	/* NOTE(review): q->queues keeps the freed pointer */
+		kfree(q->queues);
+
+	return err;
+}
+
+static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{	/* Emit TCA_OPTIONS into skb; returns skb->len, or -1 on failure. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_multiq_qopt opt;
+
+	opt.bands = q->bands;
+	opt.max_bands = q->max_bands;
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+
+nla_put_failure:	/* presumably the jump target used by NLA_PUT */
+	nlmsg_trim(skb, b);	/* cut skb back to the tail saved on entry */
+	return -1;
+}
+
+static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		      struct Qdisc **old)
+{	/* Swap the child of class arg for new; always returns 0. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;	/* not range-checked in this function */
+
+	if (new == NULL)	/* no replacement given: park the band on noop */
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->queues[band];
+	q->queues[band] = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);	/* the old child is reset, not destroyed, here */
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *
+multiq_leaf(struct Qdisc *sch, unsigned long arg)
+{	/* Child qdisc of class arg (1-based). */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;	/* not range-checked in this function */
+
+	return q->queues[band];
+}
+
+static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	unsigned long minor = TC_H_MIN(classid);
+
+	if (minor - 1 < q->bands)	/* minor 0 wraps on "- 1" and fails too */
+		return minor;
+	return 0;
+}
+
+static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
+				 u32 classid)
+{	/* Binding is a plain lookup; parent is ignored. */
+	return multiq_get(sch, classid);
+}
+
+
+static void multiq_put(struct Qdisc *q, unsigned long cl)
+{	/* Intentionally empty; also installed as .unbind_tcf. */
+}
+
+static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
+			     struct sk_buff *skb, struct tcmsg *tcm)
+{	/* Fill tcm for class cl; skb is not used here.  Always returns 0. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = q->queues[cl - 1]->handle;	/* cl is 1-based */
+	return 0;
+}
+
+static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				 struct gnet_dump *d)
+{	/* Copy the stats of class cl's child into d; 0 on success, else -1. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *cl_q;
+
+	cl_q = q->queues[cl - 1];	/* cl is 1-based */
+	cl_q->qstats.qlen = cl_q->q.qlen;	/* refresh before copying */
+	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{	/* Call arg->fn for every band once arg->skip entries were passed. */
+	struct multiq_sched_data *q = qdisc_priv(sch);
+	int band;
+
+	if (arg->stop)
+		return;
+
+	for (band = 0; band < q->bands; band++) {
+		if (arg->count < arg->skip) {	/* still inside the skipped part */
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, band + 1, arg) < 0) {	/* ids are 1-based */
+			arg->stop = 1;	/* callback asked to end the walk */
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct multiq_sched_data *q = qdisc_priv(sch);
+
+	if (!cl)	/* a filter chain exists for cl == 0 only */
+		return &q->filter_list;
+	return NULL;
+}
+
+static const struct Qdisc_class_ops multiq_class_ops = {	/* class hooks */
+	.graft		=	multiq_graft,
+	.leaf		=	multiq_leaf,
+	.get		=	multiq_get,
+	.put		=	multiq_put,
+	.walk		=	multiq_walk,
+	.tcf_chain	=	multiq_find_tcf,
+	.bind_tcf	=	multiq_bind,
+	.unbind_tcf	=	multiq_put,	/* same empty function as .put */
+	.dump		=	multiq_dump_class,
+	.dump_stats	=	multiq_dump_class_stats,
+};
+
+static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&multiq_class_ops,
+	.id		=	"multiq",
+	.priv_size	=	sizeof(struct multiq_sched_data),
+	.enqueue	=	multiq_enqueue,
+	.dequeue	=	multiq_dequeue,
+	.peek		=	multiq_peek,
+	.drop		=	multiq_drop,
+	.init		=	multiq_init,
+	.reset		=	multiq_reset,
+	.destroy	=	multiq_destroy,
+	.change		=	multiq_tune,	/* also called by multiq_init() */
+	.dump		=	multiq_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init multiq_module_init(void)
+{	/* Module entry point: the result of register_qdisc() is passed on. */
+	return register_qdisc(&multiq_qdisc_ops);
+}
+
+static void __exit multiq_module_exit(void)
+{	/* Module exit point: undoes the registration done at init time. */
+	unregister_qdisc(&multiq_qdisc_ops);
+}
+
+module_init(multiq_module_init)
+module_exit(multiq_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
new file mode 100644
index 00000000..f0913ffc
--- /dev/null
+++ b/net/sched/sch_netem.c
@@ -0,0 +1,974 @@
+/*
+ * net/sched/sch_netem.c	Network emulator
+ *
+ * 		This program is free software; you can redistribute it and/or
+ * 		modify it under the terms of the GNU General Public License
+ * 		as published by the Free Software Foundation; either version
+ * 		2 of the License.
+ *
+ *  		Many of the algorithms and ideas for this came from
+ *		NIST Net which is not copyrighted.
+ *
+ * Authors:	Stephen Hemminger <shemminger@osdl.org>
+ *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#define VERSION "1.3"
+
+/*	Network Emulation Queuing algorithm.
+	====================================
+
+	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
+		 Network Emulation Tool
+		 [2] Luigi Rizzo, DummyNet for FreeBSD
+
+	 ----------------------------------------------------------------
+
+	 This started out as a simple way to delay outgoing packets to
+	 test TCP but has grown to include most of the functionality
+	 of a full blown network emulator like NISTnet. It can delay
+	 packets and add random jitter (and correlation). The random
+	 distribution can be loaded from a table as well to provide
+	 normal, Pareto, or experimental curves. Packet loss,
+	 duplication, and reordering can also be emulated.
+
+	 This qdisc does not do classification that can be handled in
+	 layering other disciplines.  It does not need to do bandwidth
+	 control either since that can be handled by using token
+	 bucket or other rate control.
+
+     Correlated Loss Generator models
+
+	Added generation of correlated loss according to the
+	"Gilbert-Elliot" model, a 4-state markov model.
+
+	References:
+	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
+	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
+	and intuitive loss model for packet networks and its implementation
+	in the Netem module in the Linux kernel", available in [1]
+
+	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
+		 Fabio Ludovici <fabio.ludovici at yahoo.it>
+*/
+
+struct netem_sched_data {
+	struct Qdisc	*qdisc;	/* inner qdisc holding the delayed packets */
+	struct qdisc_watchdog watchdog;	/* armed for the head's time_to_send */
+
+	psched_tdiff_t latency;	/* passed to tabledist() as mu */
+	psched_tdiff_t jitter;	/* passed to tabledist() as sigma */
+
+	u32 loss;	/* CLG_RANDOM threshold: 0 => none, ~0 => all */
+	u32 limit;
+	u32 counter;	/* packets queued since the last reordered one */
+	u32 gap;	/* 0 disables reordering */
+	u32 duplicate;	/* threshold compared with get_crandom(&dup_cor) */
+	u32 reorder;	/* threshold compared with get_crandom(&reorder_cor) */
+	u32 corrupt;	/* threshold compared with get_crandom(&corrupt_cor) */
+
+	struct crndstate {
+		u32 last;	/* previous output of get_crandom() */
+		u32 rho;	/* correlation factor; 0 means uncorrelated */
+	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
+
+	struct disttable {
+		u32  size;	/* number of entries in table[] */
+		s16 table[0];
+	} *delay_dist;	/* NULL selects the uniform distribution */
+
+	enum  {
+		CLG_RANDOM,
+		CLG_4_STATES,
+		CLG_GILB_ELL,
+	} loss_model;	/* selects the generator used by loss_event() */
+
+	/* Correlated Loss Generation models */
+	struct clgstate {
+		/* state of the Markov chain */
+		u8 state;
+
+		/* 4-states and Gilbert-Elliot models */
+		u32 a1;	/* p13 for 4-states or p for GE */
+		u32 a2;	/* p31 for 4-states or r for GE */
+		u32 a3;	/* p32 for 4-states or h for GE */
+		u32 a4;	/* p14 for 4-states or 1-k for GE */
+		u32 a5; /* p23 used only in 4-states */
+	} clg;
+
+};
+
+/* Time stamp put into socket buffer control block */
+struct netem_skb_cb {
+	psched_time_t	time_to_send;	/* earliest time dequeue releases the skb */
+};
+
+static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
+{	/* View the qdisc private area of skb's control block as netem's cb. */
+	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
+	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/* init_crandom - initialize correlated random number generator
+ * Stores rho and seeds state->last with a value from net_random().
+ */
+static void init_crandom(struct crndstate *state, unsigned long rho)
+{
+	state->rho = rho;	/* the field is u32: rho is narrowed here */
+	state->last = net_random();
+}
+
+/* get_crandom - correlated random number generator
+ * Next number depends on last value.
+ * rho is scaled to avoid floating point.
+ */
+static u32 get_crandom(struct crndstate *state)
+{
+	u64 value, rho;
+	unsigned long answer;
+
+	if (state->rho == 0)	/* no correlation */
+		return net_random();
+
+	value = net_random();
+	rho = (u64)state->rho + 1;	/* weight of the previous output */
+	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
+	state->last = answer;	/* remembered for the next call */
+	return answer;
+}
+
+/* loss_4state - 4-state Markov loss generator of the GI (General and
+ * Intuitive) loss model.  Draws one random number, advances q->clg.state
+ * and returns true if the current packet has to be dropped, else false.
+ */
+static bool loss_4state(struct netem_sched_data *q)
+{
+	struct clgstate *clg = &q->clg;
+	u32 rnd = net_random();
+
+	/*
+	 * rnd is compared with the transition probabilities leaving the
+	 * current state; every exit owns its own slice of the random range
+	 * (sums are u32; assumes a1 + a4 and a2 + a3 do not wrap - TODO confirm).
+	 * The four states correspond to:
+	 *   1 => successfully transmitted packets within a gap period
+	 *   4 => isolated losses within a gap period
+	 *   3 => lost packets within a burst period
+	 *   2 => successfully transmitted packets within a burst period
+	 */
+	switch (clg->state) {
+	case 1:
+		if (rnd < clg->a4) {
+			clg->state = 4;
+			return true;
+		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+			clg->state = 3;
+			return true;
+		} else if (clg->a1 + clg->a4 < rnd)
+			clg->state = 1;
+
+		break;
+	case 2:
+		if (rnd < clg->a5) {
+			clg->state = 3;
+			return true;
+		} else
+			clg->state = 2;
+
+		break;
+	case 3:
+		if (rnd < clg->a3)
+			clg->state = 2;
+		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+			clg->state = 1;
+			/* state 1 is a transmitting state: no loss here */
+		} else if (clg->a2 + clg->a3 < rnd) {
+			clg->state = 3;
+			return true;
+		}
+		break;
+	case 4:
+		clg->state = 1;
+		break;
+	}
+
+	return false;
+}
+
+/* loss_gilb_ell - Gilbert-Elliot model loss generator
+ * Generates losses according to the Gilbert-Elliot loss model or
+ * its special cases (Gilbert or Simple Gilbert).
+ *
+ * The first random number is compared with the transition probability
+ * leaving the current state (a1 in state 1, a2 in state 2) to pick the
+ * next state; the second one decides whether the packet is lost.
+ * Returns true if the packet has to be dropped, false otherwise.
+ */
+static bool loss_gilb_ell(struct netem_sched_data *q)
+{
+	struct clgstate *clg = &q->clg;
+
+	switch (clg->state) {
+	case 1:
+		if (net_random() < clg->a1)
+			clg->state = 2;
+		if (net_random() < clg->a4)	/* a4 is 1-k: loss probability */
+			return true;
+		break;	/* one call handles one state only */
+	case 2:
+		if (net_random() < clg->a2)
+			clg->state = 1;
+		if (net_random() > clg->a3)	/* a3 is h: lose with 1-h */
+			return true;
+	}
+
+	return false;
+}
+
+static bool loss_event(struct netem_sched_data *q)
+{	/* Returns true if the configured loss model drops this packet. */
+	switch (q->loss_model) {
+	case CLG_RANDOM:
+		/* Random packet drop 0 => none, ~0 => all */
+		return q->loss && q->loss >= get_crandom(&q->loss_cor);
+
+	case CLG_4_STATES:
+		/* 4-state loss model algorithm (used also for GI model).
+		* Asks the 4-state Markov loss generator; a true result
+		* means the packet is dropped.  Nothing is logged in this
+		* function.
+		*/
+		return loss_4state(q);
+
+	case CLG_GILB_ELL:
+		/* Gilbert-Elliot loss model algorithm.
+		* Asks the Gilbert-Elliot loss generator; a true result
+		* means the packet is dropped.  Nothing is logged in this
+		* function.
+		*/
+		return loss_gilb_ell(q);
+	}
+
+	return false;	/* not reached */
+}
+
+
+/* tabledist - return a pseudo-randomly distributed value with mean mu and
+ * std deviation sigma.  Uses table lookup to approximate the desired
+ * distribution, and a uniformly-distributed pseudo-random source.
+ */
+static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
+				struct crndstate *state,
+				const struct disttable *dist)
+{
+	psched_tdiff_t x;
+	long t;
+	u32 rnd;
+
+	if (sigma == 0)	/* no jitter; also keeps the modulo below non-zero */
+		return mu;
+
+	rnd = get_crandom(state);
+
+	/* default uniform distribution */
+	if (dist == NULL)
+		return (rnd % (2*sigma)) - sigma + mu;
+
+	t = dist->table[rnd % dist->size];	/* assumes size != 0 - TODO confirm */
+	x = (sigma % NETEM_DIST_SCALE) * t;	/* remainder part of sigma * t */
+	if (x >= 0)	/* add half a unit so the division below rounds */
+		x += NETEM_DIST_SCALE/2;
+	else
+		x -= NETEM_DIST_SCALE/2;
+
+	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
+}
+
+/*
+ * Insert one skb into qdisc.
+ * Note: parent depends on return value to account for queue length.
+ * 	NET_XMIT_DROP: queue length didn't change.
+ *      NET_XMIT_SUCCESS: one skb was queued.
+ */
+static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	/* We don't fill cb now as skb_unshare() may invalidate it */
+	struct netem_skb_cb *cb;
+	struct sk_buff *skb2;
+	int ret;
+	int count = 1;	/* copies to queue: 0 = drop, 2 = duplicate */
+
+	/* Random duplication */
+	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+		++count;
+
+	/* Drop packet? */
+	if (loss_event(q))
+		--count;
+
+	if (count == 0) {
+		sch->qstats.drops++;
+		kfree_skb(skb);
+		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	}
+
+	skb_orphan(skb);
+
+	/*
+	 * If we need to duplicate packet, then re-insert at top of the
+	 * qdisc tree, since parent queuer expects that only one
+	 * skb will be queued.
+	 */
+	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+		struct Qdisc *rootq = qdisc_root(sch);
+		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+		q->duplicate = 0;
+
+		qdisc_enqueue_root(skb2, rootq);
+		q->duplicate = dupsave;
+	}
+
+	/*
+	 * Randomized packet corruption.
+	 * Make a copy if needed since we are modifying the data.
+	 * If packet is going to be hardware checksummed, then
+	 * do it now in software before we mangle it.
+	 */
+	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
+		    (skb->ip_summed == CHECKSUM_PARTIAL &&
+		     skb_checksum_help(skb)))
+			return qdisc_drop(skb, sch);	/* NOTE(review): skb may be NULL */
+
+		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); /* NOTE(review): headlen 0? */
+	}
+
+	cb = netem_skb_cb(skb);
+	if (q->gap == 0 ||		/* not doing reordering */
+	    q->counter < q->gap ||	/* inside last reordering gap */
+	    q->reorder < get_crandom(&q->reorder_cor)) {
+		psched_time_t now;
+		psched_tdiff_t delay;
+
+		delay = tabledist(q->latency, q->jitter,
+				  &q->delay_cor, q->delay_dist);
+
+		now = psched_get_time();
+		cb->time_to_send = now + delay;
+		++q->counter;
+		ret = qdisc_enqueue(skb, q->qdisc);
+	} else {
+		/*
+		 * Do re-ordering by putting one out of N packets at the front
+		 * of the queue.
+		 */
+		cb->time_to_send = psched_get_time();	/* due at once */
+		q->counter = 0;
+
+		__skb_queue_head(&q->qdisc->q, skb);	/* bypasses the child's enqueue */
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		sch->qstats.requeues++;
+		ret = NET_XMIT_SUCCESS;
+	}
+
+	if (ret != NET_XMIT_SUCCESS) {
+		if (net_xmit_drop_count(ret)) {
+			sch->qstats.drops++;
+			return ret;
+		}
+	}
+
+	sch->q.qlen++;
+	return NET_XMIT_SUCCESS;
+}
+
+static unsigned int netem_drop(struct Qdisc *sch)
+{	/* Drop via the inner qdisc's drop(); returns its result, or 0. */
+	struct netem_sched_data *q = qdisc_priv(sch);
+	unsigned int len = 0;	/* stays 0 if the child has no drop() */
+
+	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
+		sch->q.qlen--;
+		sch->qstats.drops++;
+	}
+	return len;
+}
+
+static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (qdisc_is_throttled(sch))	/* throttled: hand out nothing */
+		return NULL;
+
+	skb = q->qdisc->ops->peek(q->qdisc);	/* peek, don't remove yet */
+	if (skb) {
+		const struct netem_skb_cb *cb = netem_skb_cb(skb);
+		psched_time_t now = psched_get_time();
+
+		/* has this packet's scheduled send time been reached? */
+		if (cb->time_to_send <= now) {
+			skb = qdisc_dequeue_peeked(q->qdisc);
+			if (unlikely(!skb))
+				return NULL;
+
+#ifdef CONFIG_NET_CLS_ACT
+			/*
+			 * If it's at ingress let's pretend the delay is
+			 * from the network (tstamp will be updated).
+			 */
+			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
+				skb->tstamp.tv64 = 0;
+#endif
+
+			sch->q.qlen--;
+			qdisc_unthrottled(sch);
+			qdisc_bstats_update(sch, skb);
+			return skb;
+		}
+		/* not due yet: arm the watchdog for the head's send time */
+		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+	}
+
+	return NULL;
+}
+
+static void netem_reset(struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset(q->qdisc);		/* reset the inner queue */
+	sch->q.qlen = 0;		/* and our own packet count */
+	qdisc_watchdog_cancel(&q->watchdog);	/* cancel any pending wakeup */
+}
+
+static void dist_free(struct disttable *d)
+{
+	if (!d)			/* no table installed */
+		return;
+	if (is_vmalloc_addr(d))	/* pick the release matching the address */
+		vfree(d);
+	else
+		kfree(d);
+}
+
+/*
+ * Install a delay distribution table (payload: signed 16 bit values).
+ * Returns 0, -EINVAL for an empty or oversized table, or -ENOMEM.
+ */
+static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	size_t n = nla_len(attr)/sizeof(__s16);
+	const __s16 *data = nla_data(attr);
+	spinlock_t *root_lock;
+	struct disttable *d;
+	size_t i, s;
+
+	/* an empty table has nothing to sample; refuse it like an oversized one */
+	if (!n || n > NETEM_DIST_MAX)
+		return -EINVAL;
+
+	s = sizeof(struct disttable) + n * sizeof(s16);
+	d = kmalloc(s, GFP_KERNEL);
+	if (!d)			/* kmalloc() failed: try vmalloc() instead */
+		d = vmalloc(s);
+	if (!d)
+		return -ENOMEM;
+
+	d->size = n;
+	for (i = 0; i < n; i++)
+		d->table[i] = data[i];
+
+	/* replace the old table while holding the root qdisc lock */
+	root_lock = qdisc_root_sleeping_lock(sch);
+	spin_lock_bh(root_lock);
+	dist_free(q->delay_dist);
+	q->delay_dist = d;
+	spin_unlock_bh(root_lock);
+	return 0;
+}
+
+static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_corr *c = nla_data(attr);
+
+	init_crandom(&q->delay_cor, c->delay_corr);	/* delay */
+	init_crandom(&q->loss_cor, c->loss_corr);	/* loss */
+	init_crandom(&q->dup_cor, c->dup_corr);		/* duplication */
+}
+
+static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_reorder *r = nla_data(attr);
+
+	q->reorder = r->probability;	/* compared against get_crandom() */
+	init_crandom(&q->reorder_cor, r->correlation);
+}
+
+static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_corrupt *r = nla_data(attr);
+
+	q->corrupt = r->probability;	/* 0 leaves corruption disabled */
+	init_crandom(&q->corrupt_cor, r->correlation);
+}
+
+/* Load the loss model nested in @attr into q->clg; returns 0 or -EINVAL. */
+static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct nlattr *la;
+	int rem;
+
+	nla_for_each_nested(la, attr, rem) {
+		u16 type = nla_type(la);
+
+		switch(type) {
+		case NETEM_LOSS_GI: {
+			const struct tc_netem_gimodel *gi = nla_data(la);
+
+			if (nla_len(la) != sizeof(struct tc_netem_gimodel)) {
+				pr_info("netem: incorrect gi model size\n");
+				return -EINVAL;
+			}
+
+			q->loss_model = CLG_4_STATES;
+			q->clg.state = 1;
+			q->clg.a1 = gi->p13;
+			q->clg.a2 = gi->p31;
+			q->clg.a3 = gi->p32;
+			q->clg.a4 = gi->p14;
+			q->clg.a5 = gi->p23;
+			break;
+		}
+
+		case NETEM_LOSS_GE: {
+			const struct tc_netem_gemodel *ge = nla_data(la);
+
+			if (nla_len(la) != sizeof(struct tc_netem_gemodel)) {
+				pr_info("netem: incorrect ge model size\n");
+				return -EINVAL;
+			}
+
+			q->loss_model = CLG_GILB_ELL;
+			q->clg.state = 1;
+			q->clg.a1 = ge->p;
+			q->clg.a2 = ge->r;
+			q->clg.a3 = ge->h;
+			q->clg.a4 = ge->k1;
+			break;
+		}
+
+		default:
+			pr_info("netem: unknown loss type %u\n", type);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
+	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
+	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
+	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
+	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
+};
+
+static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+		      const struct nla_policy *policy, int len)
+{
+	int nested_len = nla_len(nla) - NLA_ALIGN(len);	/* trailing attrs */
+
+	if (nested_len < 0) {	/* payload shorter than the fixed part @len */
+		pr_info("netem: invalid attributes len %d\n", nested_len);
+		return -EINVAL;
+	}
+
+	if (nested_len >= nla_attr_size(0))	/* at least one attr header */
+		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
+				 nested_len, policy);
+	/* no trailing attributes: report an all-NULL table */
+	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+	return 0;
+}
+
+/* Apply the netlink options in @opt; returns 0 or a negative errno. */
+static int netem_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_NETEM_MAX + 1];
+	struct tc_netem_qopt *qopt;
+	int ret;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	qopt = nla_data(opt);	/* fixed part; optional attributes follow it */
+	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
+	if (ret < 0)
+		return ret;
+
+	ret = fifo_set_limit(q->qdisc, qopt->limit);
+	if (ret) {
+		pr_info("netem: can't set fifo limit\n");
+		return ret;
+	}
+
+	q->latency = qopt->latency;
+	q->jitter = qopt->jitter;
+	q->limit = qopt->limit;
+	q->gap = qopt->gap;
+	q->counter = 0;
+	q->loss = qopt->loss;
+	q->duplicate = qopt->duplicate;
+
+	/* for compatibility with earlier versions.
+	 * if gap is set, need to assume 100% probability
+	 */
+	if (q->gap)
+		q->reorder = ~0;
+
+	if (tb[TCA_NETEM_CORR])
+		get_correlation(sch, tb[TCA_NETEM_CORR]);
+
+	if (tb[TCA_NETEM_DELAY_DIST]) {
+		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
+		if (ret)	/* note: the fields set above stay applied */
+			return ret;
+	}
+
+	if (tb[TCA_NETEM_REORDER])
+		get_reorder(sch, tb[TCA_NETEM_REORDER]);
+
+	if (tb[TCA_NETEM_CORRUPT])
+		get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
+
+	q->loss_model = CLG_RANDOM;	/* unless TCA_NETEM_LOSS overrides it */
+	if (tb[TCA_NETEM_LOSS])
+		ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
+
+	return ret;
+}
+
+/*
+ * Special case version of FIFO queue for use by netem.
+ * It queues in order based on the timestamps (time_to_send) in skb's.
+ */
+struct fifo_sched_data {
+	u32 limit;		/* maximum number of queued packets */
+	psched_time_t oldest;	/* time_to_send of the last tail insertion */
+};
+
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *list = &sch->q;
+	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
+	struct sk_buff *skb;
+
+	if (likely(skb_queue_len(list) < q->limit)) {
+		/* Optimize for add at tail */
+		if (likely(skb_queue_empty(list) || tnext >= q->oldest)) {
+			q->oldest = tnext;
+			return qdisc_enqueue_tail(nskb, sch);
+		}
+
+		/* walk back from the tail to the last skb not later than us */
+		skb_queue_reverse_walk(list, skb) {
+			const struct netem_skb_cb *cb = netem_skb_cb(skb);
+
+			if (tnext >= cb->time_to_send)
+				break;
+		}
+		/* insert after that skb, keeping the list ordered by time */
+		__skb_queue_after(list, skb, nskb);
+
+		sch->qstats.backlog += qdisc_pkt_len(nskb);
+
+		return NET_XMIT_SUCCESS;
+	}
+	/* queue already holds q->limit packets */
+	return qdisc_reshape_fail(nskb, sch);
+}
+
+static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+
+	if (opt) {	/* explicit limit supplied */
+		struct tc_fifo_qopt *ctl = nla_data(opt);
+		if (nla_len(opt) < sizeof(*ctl))
+			return -EINVAL;
+
+		q->limit = ctl->limit;
+	} else		/* default: device tx_queue_len, but never below 1 */
+		q->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+
+	q->oldest = PSCHED_PASTPERFECT;	/* also reached via ->change */
+	return 0;
+}
+
+static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct tc_fifo_qopt opt = { .limit = q->limit };
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
+	.id		=	"tfifo",
+	.priv_size	=	sizeof(struct fifo_sched_data),
+	.enqueue	=	tfifo_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.peek		=	qdisc_peek_head,
+	.drop		=	qdisc_queue_drop,
+	.init		=	tfifo_init,
+	.reset		=	qdisc_reset_queue,
+	.change		=	tfifo_init,
+	.dump		=	tfifo_dump,
+};
+
+static int netem_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	int ret;
+
+	if (!opt)	/* netem cannot be created without options */
+		return -EINVAL;
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	q->loss_model = CLG_RANDOM;
+	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
+				     TC_H_MAKE(sch->handle, 1));
+	if (!q->qdisc) {
+		pr_notice("netem: qdisc create tfifo qdisc failed\n");
+		return -ENOMEM;
+	}
+
+	ret = netem_change(sch, opt);	/* parse and apply the options */
+	if (ret) {
+		pr_info("netem: change failed\n");
+		qdisc_destroy(q->qdisc);	/* undo the tfifo creation */
+	}
+	return ret;
+}
+
+static void netem_destroy(struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+
+	qdisc_watchdog_cancel(&q->watchdog);	/* cancel any pending wakeup */
+	qdisc_destroy(q->qdisc);		/* the inner queue */
+	dist_free(q->delay_dist);		/* NULL-safe */
+}
+
+static int dump_loss_model(const struct netem_sched_data *q,
+			   struct sk_buff *skb)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	switch (q->loss_model) {
+	case CLG_RANDOM:
+		/* legacy loss model */
+		nla_nest_cancel(skb, nest);
+		return 0;	/* no data */
+
+	case CLG_4_STATES: {	/* inverse of the NETEM_LOSS_GI parsing */
+		struct tc_netem_gimodel gi = {
+			.p13 = q->clg.a1,
+			.p31 = q->clg.a2,
+			.p32 = q->clg.a3,
+			.p14 = q->clg.a4,
+			.p23 = q->clg.a5,
+		};
+
+		NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
+		break;
+	}
+	case CLG_GILB_ELL: {	/* inverse of the NETEM_LOSS_GE parsing */
+		struct tc_netem_gemodel ge = {
+			.p = q->clg.a1,
+			.r = q->clg.a2,
+			.h = q->clg.a3,
+			.k1 = q->clg.a4,
+		};
+
+		NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
+		break;
+	}
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:	/* also the target of the NLA_PUT() uses above */
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	const struct netem_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
+	struct tc_netem_qopt qopt;
+	struct tc_netem_corr cor;
+	struct tc_netem_reorder reorder;
+	struct tc_netem_corrupt corrupt;
+
+	qopt.latency = q->latency;
+	qopt.jitter = q->jitter;
+	qopt.limit = q->limit;
+	qopt.loss = q->loss;
+	qopt.gap = q->gap;
+	qopt.duplicate = q->duplicate;
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);	/* starts at nla */
+
+	cor.delay_corr = q->delay_cor.rho;
+	cor.loss_corr = q->loss_cor.rho;
+	cor.dup_corr = q->dup_cor.rho;
+	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+
+	reorder.probability = q->reorder;
+	reorder.correlation = q->reorder_cor.rho;
+	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+
+	corrupt.probability = q->corrupt;
+	corrupt.correlation = q->corrupt_cor.rho;
+	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+
+	if (dump_loss_model(q, skb) != 0)
+		goto nla_put_failure;
+	/* close TCA_OPTIONS so that it spans everything put since nla */
+	return nla_nest_end(skb, nla);
+
+nla_put_failure:
+	nlmsg_trim(skb, nla);	/* discard everything added since nla */
+	return -1;
+}
+
+static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+
+	if (cl != 1) 	/* only one class */
+		return -ENOENT;
+
+	tcm->tcm_handle |= TC_H_MIN(1);
+	tcm->tcm_info = q->qdisc->handle;
+
+	return 0;
+}
+
+static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+
+	if (new == NULL)	/* no replacement given: use noop_qdisc */
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->qdisc;	/* hand the previous child back to the caller */
+	q->qdisc = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	return q->qdisc;
+}
+
+static unsigned long netem_get(struct Qdisc *sch, u32 classid)
+{
+	return 1;
+}
+
+static void netem_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+	if (walker->stop)	/* walk already terminated */
+		return;
+	/* the single class is reported to the callback as id 1 */
+	if (walker->count >= walker->skip && walker->fn(sch, 1, walker) < 0) {
+		walker->stop = 1;
+		return;
+	}
+	walker->count++;
+}
+
+static const struct Qdisc_class_ops netem_class_ops = {
+	.graft		=	netem_graft,
+	.leaf		=	netem_leaf,
+	.get		=	netem_get,
+	.put		=	netem_put,
+	.walk		=	netem_walk,
+	.dump		=	netem_dump_class,
+};
+
+static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
+	.id		=	"netem",
+	.cl_ops		=	&netem_class_ops,
+	.priv_size	=	sizeof(struct netem_sched_data),
+	.enqueue	=	netem_enqueue,
+	.dequeue	=	netem_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	netem_drop,
+	.init		=	netem_init,
+	.reset		=	netem_reset,
+	.destroy	=	netem_destroy,
+	.change		=	netem_change,
+	.dump		=	netem_dump,
+	.owner		=	THIS_MODULE,
+};
+
+
+static int __init netem_module_init(void)
+{
+	pr_info("netem: version " VERSION "\n");
+	return register_qdisc(&netem_qdisc_ops);
+}
+static void __exit netem_module_exit(void)
+{
+	unregister_qdisc(&netem_qdisc_ops);
+}
+module_init(netem_module_init)
+module_exit(netem_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
new file mode 100644
index 00000000..b5d56a22
--- /dev/null
+++ b/net/sched/sch_prio.c
@@ -0,0 +1,405 @@
+/*
+ * net/sched/sch_prio.c	Simple 3-band priority "scheduler".
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * Fixes:       19990609: J Hadi Salim <hadi@nortelnetworks.com>:
+ *              Init --  EINVAL when opt undefined
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+struct prio_sched_data {
+	int bands;				/* number of bands in use */
+	struct tcf_proto *filter_list;		/* classifiers, may be NULL */
+	u8  prio2band[TC_PRIO_MAX+1];		/* skb priority -> band index */
+	struct Qdisc *queues[TCQ_PRIO_BANDS];	/* per band; unused = noop */
+};
+
+
+static struct Qdisc *
+prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	u32 band = skb->priority;
+	struct tcf_result res;
+	int err;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	if (TC_H_MAJ(skb->priority) != sch->handle) {	/* not addressed to us */
+		err = tc_classify(skb, q->filter_list, &res);
+#ifdef CONFIG_NET_CLS_ACT
+		switch (err) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; /* fall through */
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#endif
+		if (!q->filter_list || err < 0) {	/* use the priomap */
+			if (TC_H_MAJ(band))
+				band = 0;
+			return q->queues[q->prio2band[band & TC_PRIO_MAX]];
+		}
+		band = res.classid;
+	}
+	band = TC_H_MIN(band) - 1;	/* minor 0 wraps and fails the test below */
+	if (band >= q->bands)
+		return q->queues[q->prio2band[0]];
+
+	return q->queues[band];
+}
+
+static int
+prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct Qdisc *qdisc;
+	int ret;
+
+	qdisc = prio_classify(skb, sch, &ret);	/* ret receives *qerr */
+#ifdef CONFIG_NET_CLS_ACT
+	if (qdisc == NULL) {
+		/* count a drop only when the packet was not stolen/queued */
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = qdisc_enqueue(skb, qdisc);
+	if (ret == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		return NET_XMIT_SUCCESS;
+	}
+	if (net_xmit_drop_count(ret))
+		sch->qstats.drops++;
+	return ret;
+}
+
+static struct sk_buff *prio_peek(struct Qdisc *sch)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < q->bands; prio++) {	/* band 0 first */
+		struct Qdisc *qdisc = q->queues[prio];
+		struct sk_buff *skb = qdisc->ops->peek(qdisc);
+		if (skb)	/* first band with a packet wins */
+			return skb;
+	}
+	return NULL;
+}
+
+static struct sk_buff *prio_dequeue(struct Qdisc *sch)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	int prio;
+
+	for (prio = 0; prio < q->bands; prio++) {	/* band 0 first */
+		struct Qdisc *qdisc = q->queues[prio];
+		struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
+		if (skb) {
+			qdisc_bstats_update(sch, skb);
+			sch->q.qlen--;
+			return skb;
+		}
+	}
+	return NULL;	/* every band was empty */
+
+}
+
+static unsigned int prio_drop(struct Qdisc *sch)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	int prio;
+	unsigned int len;
+	struct Qdisc *qdisc;
+
+	for (prio = q->bands-1; prio >= 0; prio--) {	/* highest band first */
+		qdisc = q->queues[prio];
+		if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
+			sch->q.qlen--;
+			return len;	/* length reported by the child */
+		}
+	}
+	return 0;	/* nothing could be dropped */
+}
+
+
+static void
+prio_reset(struct Qdisc *sch)
+{
+	int prio;
+	struct prio_sched_data *q = qdisc_priv(sch);
+
+	for (prio = 0; prio < q->bands; prio++)
+		qdisc_reset(q->queues[prio]);
+	sch->q.qlen = 0;
+}
+
+static void
+prio_destroy(struct Qdisc *sch)
+{
+	int prio;
+	struct prio_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	for (prio = 0; prio < q->bands; prio++)
+		qdisc_destroy(q->queues[prio]);
+}
+
+static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	struct tc_prio_qopt *qopt;
+	int i;
+
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+	qopt = nla_data(opt);
+
+	if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
+		return -EINVAL;
+
+	for (i = 0; i <= TC_PRIO_MAX; i++) {	/* map must stay within bands */
+		if (qopt->priomap[i] >= qopt->bands)
+			return -EINVAL;
+	}
+
+	sch_tree_lock(sch);
+	q->bands = qopt->bands;
+	memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
+	/* tear down the children of bands that are no longer in use */
+	for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
+		struct Qdisc *child = q->queues[i];
+		q->queues[i] = &noop_qdisc;
+		if (child != &noop_qdisc) {
+			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_destroy(child);
+		}
+	}
+	sch_tree_unlock(sch);
+	/* give every active band still on noop_qdisc a default pfifo */
+	for (i = 0; i < q->bands; i++) {
+		if (q->queues[i] == &noop_qdisc) {
+			struct Qdisc *child, *old;
+
+			child = qdisc_create_dflt(sch->dev_queue,
+						  &pfifo_qdisc_ops,
+						  TC_H_MAKE(sch->handle, i + 1));
+			if (child) {	/* on failure the band keeps noop */
+				sch_tree_lock(sch);
+				old = q->queues[i];
+				q->queues[i] = child;
+
+				if (old != &noop_qdisc) {
+					qdisc_tree_decrease_qlen(old,
+								 old->q.qlen);
+					qdisc_destroy(old);
+				}
+				sch_tree_unlock(sch);
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * Set up a new prio qdisc: park every band on noop_qdisc, then let
+ * prio_tune() apply the mandatory options.  Returns 0 or a negative
+ * errno (-EINVAL when no options were supplied).
+ */
+static int prio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct prio_sched_data *priv = qdisc_priv(sch);
+	int band = 0;
+
+	while (band < TCQ_PRIO_BANDS)
+		priv->queues[band++] = &noop_qdisc;
+
+	if (!opt)
+		return -EINVAL;
+
+	return prio_tune(sch, opt);
+}
+
+static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_prio_qopt opt;
+
+	opt.bands = q->bands;
+	memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		      struct Qdisc **old)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;	/* class handles are band + 1 */
+
+	if (new == NULL)	/* no replacement given: use noop_qdisc */
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->queues[band];	/* hand the previous child back to the caller */
+	q->queues[band] = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *
+prio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	return q->queues[band];
+}
+
+/* Map @classid to a class handle (its 1-based band); 0 if out of range. */
+static unsigned long prio_get(struct Qdisc *sch, u32 classid)
+{
+	struct prio_sched_data *priv = qdisc_priv(sch);
+	unsigned long minor = TC_H_MIN(classid);
+
+	/* unsigned arithmetic: minor 0 wraps around and is rejected too */
+	return (minor - 1 >= priv->bands) ? 0 : minor;
+}
+
+static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
+{
+	return prio_get(sch, classid);
+}
+
+
+static void prio_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
+			   struct tcmsg *tcm)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = q->queues[cl-1]->handle;
+	return 0;
+}
+
+static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				 struct gnet_dump *d)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *cl_q;
+
+	cl_q = q->queues[cl - 1];	/* class handle is band + 1 */
+	cl_q->qstats.qlen = cl_q->q.qlen;	/* refresh before copying */
+	if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &cl_q->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Invoke the walker callback on every band, honouring arg->skip and arg->stop.
+ */
+static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+	int band;
+
+	if (arg->stop)
+		return;
+
+	for (band = 1; band <= q->bands; band++, arg->count++) {
+		if (arg->count < arg->skip)
+			continue;
+		if (arg->fn(sch, band, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+	}
+}
+
+static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct prio_sched_data *q = qdisc_priv(sch);
+
+	if (cl)		/* filters hang off the qdisc only, not off a class */
+		return NULL;
+	return &q->filter_list;
+}
+
+static const struct Qdisc_class_ops prio_class_ops = {
+	.graft		=	prio_graft,
+	.leaf		=	prio_leaf,
+	.get		=	prio_get,
+	.put		=	prio_put,
+	.walk		=	prio_walk,
+	.tcf_chain	=	prio_find_tcf,
+	.bind_tcf	=	prio_bind,
+	.unbind_tcf	=	prio_put,
+	.dump		=	prio_dump_class,
+	.dump_stats	=	prio_dump_class_stats,
+};
+
+static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&prio_class_ops,
+	.id		=	"prio",
+	.priv_size	=	sizeof(struct prio_sched_data),
+	.enqueue	=	prio_enqueue,
+	.dequeue	=	prio_dequeue,
+	.peek		=	prio_peek,
+	.drop		=	prio_drop,
+	.init		=	prio_init,
+	.reset		=	prio_reset,
+	.destroy	=	prio_destroy,
+	.change		=	prio_tune,
+	.dump		=	prio_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init prio_module_init(void)
+{
+	return register_qdisc(&prio_qdisc_ops);
+}
+
+static void __exit prio_module_exit(void)
+{
+	unregister_qdisc(&prio_qdisc_ops);
+}
+
+module_init(prio_module_init)
+module_exit(prio_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
new file mode 100644
index 00000000..10334340
--- /dev/null
+++ b/net/sched/sch_qfq.c
@@ -0,0 +1,1137 @@
+/*
+ * net/sched/sch_qfq.c         Quick Fair Queueing Scheduler.
+ *
+ * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+
+/*  Quick Fair Queueing
+    ===================
+
+    Sources:
+
+    Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
+    Packet Scheduling with Tight Bandwidth Distribution Guarantees."
+
+    See also:
+    http://retis.sssup.it/~fabio/linux/qfq/
+ */
+
+/*
+
+  Virtual time computations.
+
+  S, F and V are all computed in fixed point arithmetic with
+  FRAC_BITS fractional bits.
+
+  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+	one bit per index.
+  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+
+  The layout of the bits is as below:
+
+                   [ MTU_SHIFT ][      FRAC_BITS    ]
+                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
+				 ^.__grp->index = 0
+				 *.__grp->slot_shift
+
+  where MIN_SLOT_SHIFT is derived by difference from the others.
+
+  The max group index corresponds to Lmax/w_min, where
+  Lmax=1<<MTU_SHIFT, w_min = 1 .
+  From this, and knowing how many groups (MAX_INDEX) we want,
+  we can derive the shift corresponding to each group.
+
+  Because we often need to compute
+	F = S + len/w_i  and V = V + len/wsum
+  instead of storing w_i store the value
+	inv_w = (1<<FRAC_BITS)/w_i
+  so we can do F = S + len * inv_w * wsum.
+  We use W_TOT in the formulas so we can easily move between
+  static and adaptive weight sum.
+
+  The per-scheduler-instance data contain all the data structures
+  for the scheduler: bitmaps and bucket lists.
+
+ */
+
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group.
+ */
+#define QFQ_MAX_SLOTS	32
+
+/*
+ * Shifts used for class<->group mapping.  We allow class weights that are
+ * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the
+ * group with the smallest index that can support the L_i / r_i configured
+ * for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ */
+#define QFQ_MAX_INDEX		19
+#define QFQ_MAX_WSHIFT		16
+
+#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
+#define QFQ_MAX_WSUM		(2*QFQ_MAX_WEIGHT)
+
+#define FRAC_BITS		30	/* fixed point arithmetic */
+#define ONE_FP			(1UL << FRAC_BITS)
+#define IWSUM			(ONE_FP/QFQ_MAX_WSUM)
+
+#define QFQ_MTU_SHIFT		11
+#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
+
+/*
+ * Possible group states.  These values are used as indexes for the bitmaps
+ * array of struct qfq_queue.
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+
+struct qfq_class {
+	struct Qdisc_class_common common;
+
+	unsigned int refcnt;
+	unsigned int filter_cnt;
+
+	struct gnet_stats_basic_packed bstats;
+	struct gnet_stats_queue qstats;
+	struct gnet_stats_rate_est rate_est;
+	struct Qdisc *qdisc;
+
+	struct hlist_node next;	/* Link for the slot list. */
+	u64 S, F;		/* flow timestamps (exact) */
+
+	/* group we belong to. In principle we would need the index,
+	 * which is log_2(lmax/weight), but we never reference it
+	 * directly, only the group.
+	 */
+	struct qfq_group *grp;
+
+	/* these are copied from the flowset. */
+	u32	inv_w;		/* ONE_FP/weight */
+	u32	lmax;		/* Max packet size for this flow. */
+};
+
+struct qfq_group {
+	u64 S, F;			/* group timestamps (approx). */
+	unsigned int slot_shift;	/* Slot shift. */
+	unsigned int index;		/* Group index. */
+	unsigned int front;		/* Index of the front slot. */
+	unsigned long full_slots;	/* non-empty slots */
+
+	/* Array of RR lists of active classes. */
+	struct hlist_head slots[QFQ_MAX_SLOTS];
+};
+
+struct qfq_sched {
+	struct tcf_proto *filter_list;
+	struct Qdisc_class_hash clhash;
+
+	u64		V;		/* Precise virtual time. */
+	u32		wsum;		/* weight sum */
+
+	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
+	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/* Look up @classid in the class hash; NULL when no such class exists. */
+static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
+{
+	struct qfq_sched *priv = qdisc_priv(sch);
+	struct Qdisc_class_common *hit;
+
+	hit = qdisc_class_find(&priv->clhash, classid);
+
+	return hit ? container_of(hit, struct qfq_class, common) : NULL;
+}
+
+static void qfq_purge_queue(struct qfq_class *cl)
+{
+	unsigned int len = cl->qdisc->q.qlen;	/* sample before the reset */
+
+	qdisc_reset(cl->qdisc);
+	qdisc_tree_decrease_qlen(cl->qdisc, len);	/* pass on the old count */
+}
+
+static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
+	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
+	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
+};
+
+/*
+ * Calculate a flow index, given its inverse weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(u32 inv_w, unsigned int maxlen)
+{
+	u64 slot_size = (u64)maxlen * inv_w;	/* 64 bit: product can exceed u32 */
+	unsigned long size_map;
+	int index = 0;
+
+	size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
+	if (!size_map)	/* below the smallest slot size: index 0 */
+		goto out;
+
+	index = __fls(size_map) + 1;	/* basically a log_2 */
+	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+	/* the line above takes one off when slot_size is an exact power of two */
+	if (index < 0)
+		index = 0;
+out:
+	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
+		 (unsigned long) ONE_FP/inv_w, maxlen, index);
+
+	return index;
+}
+
+/* Create class @classid, or change the weight of the existing class *arg. */
+static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+			    struct nlattr **tca, unsigned long *arg)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_class *cl = (struct qfq_class *)*arg;
+	struct nlattr *tb[TCA_QFQ_MAX + 1];
+	u32 weight, lmax, inv_w;
+	int i, err;
+
+	if (tca[TCA_OPTIONS] == NULL) {
+		pr_notice("qfq: no options\n");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_QFQ_WEIGHT]) {
+		weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
+		if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
+			pr_notice("qfq: invalid weight %u\n", weight);
+			return -EINVAL;
+		}
+	} else
+		weight = 1;
+	/* round the weight to the value inv_w can actually represent */
+	inv_w = ONE_FP / weight;
+	weight = ONE_FP / inv_w;
+	/* an existing class already contributes its old weight to wsum */
+	if (q->wsum + weight - (cl ? ONE_FP / cl->inv_w : 0) > QFQ_MAX_WSUM) {
+		pr_notice("qfq: total weight out of range (%u + %u)\n",
+			  weight, q->wsum);
+		return -EINVAL;
+	}
+
+	if (tb[TCA_QFQ_LMAX]) {
+		lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
+		if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) {
+			pr_notice("qfq: invalid max length %u\n", lmax);
+			return -EINVAL;
+		}
+	} else
+		lmax = 1UL << QFQ_MTU_SHIFT;
+
+	if (cl != NULL) {	/* existing class: lmax is checked, not stored */
+		if (tca[TCA_RATE]) {
+			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
+						    qdisc_root_sleeping_lock(sch),
+						    tca[TCA_RATE]);
+			if (err)
+				return err;
+		}
+
+		sch_tree_lock(sch);
+		if (tb[TCA_QFQ_WEIGHT]) {
+			q->wsum += weight - ONE_FP / cl->inv_w;	/* new - old */
+			cl->inv_w = inv_w;
+		}
+		sch_tree_unlock(sch);
+
+		return 0;
+	}
+
+	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
+	if (cl == NULL)
+		return -ENOBUFS;
+
+	cl->refcnt = 1;
+	cl->common.classid = classid;
+	cl->lmax = lmax;
+	cl->inv_w = inv_w;
+	i = qfq_calc_index(cl->inv_w, cl->lmax);
+	cl->grp = &q->groups[i];
+
+	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
+				      &pfifo_qdisc_ops, classid);
+	if (cl->qdisc == NULL)
+		cl->qdisc = &noop_qdisc;
+
+	if (tca[TCA_RATE]) {
+		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
+					qdisc_root_sleeping_lock(sch),
+					tca[TCA_RATE]);
+		if (err) {
+			qdisc_destroy(cl->qdisc);
+			kfree(cl);
+			return err;
+		}
+	}
+	/* no failure path remains: account the weight and publish the class */
+	sch_tree_lock(sch);
+	q->wsum += weight;
+	qdisc_class_hash_insert(&q->clhash, &cl->common);
+	sch_tree_unlock(sch);
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	*arg = (unsigned long)cl;
+	return 0;
+}
+
+static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+
+	if (cl->inv_w) {	/* weight still accounted in q->wsum */
+		q->wsum -= ONE_FP / cl->inv_w;
+		cl->inv_w = 0;	/* guards against subtracting twice */
+	}
+
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	qdisc_destroy(cl->qdisc);
+	kfree(cl);
+}
+
+static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_class *cl = (struct qfq_class *)arg;
+
+	if (cl->filter_cnt > 0)	/* filters are still bound to this class */
+		return -EBUSY;
+
+	sch_tree_lock(sch);
+
+	qfq_purge_queue(cl);
+	qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+	BUG_ON(--cl->refcnt == 0);	/* note: the decrement is the side effect */
+	/*
+	 * This shouldn't happen: we "hold" one cops->get() when called
+	 * from tc_ctl_tclass; the destroy method is done from cops->put().
+	 */
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
+{
+	struct qfq_class *cl = qfq_find_class(sch, classid);
+
+	if (cl != NULL)
+		cl->refcnt++;	/* released again by qfq_put_class() */
+
+	return (unsigned long)cl;	/* 0 when the class does not exist */
+}
+
+static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct qfq_class *cl = (struct qfq_class *)arg;
+
+	if (--cl->refcnt == 0)	/* last reference gone: free the class */
+		qfq_destroy_class(sch, cl);
+}
+
+static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+
+	return &q->filter_list;
+}
+
+/*
+ * Bind a filter to the class identified by classid: bump the class
+ * filter count.  Returns the class pointer as an unsigned long, or 0
+ * if no such class exists.  The parent argument is not used.
+ */
+static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
+				  u32 classid)
+{
+	struct qfq_class *target = qfq_find_class(sch, classid);
+
+	if (target == NULL)
+		return (unsigned long)target;
+
+	target->filter_cnt++;
+	return (unsigned long)target;
+}
+
+/* Undo qfq_bind_tcf(): one filter less refers to this class. */
+static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	((struct qfq_class *)arg)->filter_cnt--;
+}
+
+/*
+ * Replace the leaf qdisc of a class.
+ *
+ * When no replacement is supplied a default pfifo is created, falling
+ * back to noop_qdisc if that allocation fails.  Under the tree lock the
+ * class queue is purged and the leaf is swapped; the previous leaf is
+ * handed back through *old.  Always returns 0.
+ */
+static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
+			   struct Qdisc *new, struct Qdisc **old)
+{
+	struct qfq_class *cl = (struct qfq_class *)arg;
+
+	if (new == NULL) {
+		new = qdisc_create_dflt(sch->dev_queue,
+					&pfifo_qdisc_ops, cl->common.classid);
+		if (new == NULL)
+			new = &noop_qdisc;
+	}
+
+	sch_tree_lock(sch);
+	qfq_purge_queue(cl);
+	*old = cl->qdisc;
+	cl->qdisc = new;
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Return the leaf qdisc attached to the class passed in arg. */
+static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return ((struct qfq_class *)arg)->qdisc;
+}
+
+/*
+ * Dump one class to netlink: fill the tcmsg header and emit a nested
+ * TCA_OPTIONS attribute carrying the class weight (ONE_FP / inv_w)
+ * and lmax.  Returns the value of nla_nest_end() on success, or
+ * -EMSGSIZE after cancelling the nest on failure.
+ */
+static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct qfq_class *cl = (struct qfq_class *)arg;
+	struct nlattr *nest;
+
+	tcm->tcm_parent	= TC_H_ROOT;
+	tcm->tcm_handle	= cl->common.classid;
+	tcm->tcm_info	= cl->qdisc->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	/* NOTE(review): the NLA_PUT_* macros presumably jump to
+	 * nla_put_failure themselves when the skb is full - confirm. */
+	NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w);
+	NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax);
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump per-class statistics: basic counters, rate estimate, the leaf
+ * qdisc queue stats and the QFQ-specific xstats (weight and lmax).
+ * Returns -1 if any of the generic copies fails, otherwise the result
+ * of gnet_stats_copy_app().
+ */
+static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+				struct gnet_dump *d)
+{
+	struct qfq_class *cl = (struct qfq_class *)arg;
+	struct tc_qfq_stats xstats;
+
+	memset(&xstats, 0, sizeof(xstats));
+	/* Refresh the leaf's reported queue length before copying it out. */
+	cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
+
+	xstats.weight = ONE_FP/cl->inv_w;
+	xstats.lmax = cl->lmax;
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+/*
+ * Iterate over every class in the class hash, invoking arg->fn on each
+ * one past the first arg->skip entries.  A negative return from the
+ * callback sets arg->stop and ends the walk; arg->count is advanced
+ * for every class that was skipped or successfully visited.
+ */
+static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_class *cl;
+	struct hlist_node *pos;
+	unsigned int bucket;
+
+	if (arg->stop)
+		return;
+
+	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
+		hlist_for_each_entry(cl, pos, &q->clhash.hash[bucket],
+				     common.hnode) {
+			if (arg->count >= arg->skip &&
+			    arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+/*
+ * Select the class for a packet.
+ *
+ * If the major part of skb->priority matches this qdisc's handle, the
+ * priority is tried directly as a classid.  Otherwise (or if no such
+ * class exists) the filter chain is consulted.  Returns the class, or
+ * NULL when the packet cannot be classified; *qerr is set before the
+ * filters run and tells the caller how to account for a NULL result.
+ */
+static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_class *cl;
+	struct tcf_result res;
+	int result;
+
+	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+		pr_debug("qfq_classify: found %d\n", skb->priority);
+		cl = qfq_find_class(sch, skb->priority);
+		if (cl != NULL)
+			return cl;
+	}
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+			/* fall through */
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#endif
+		/* Prefer the class cached in the result, else look it up. */
+		cl = (struct qfq_class *)res.class;
+		if (cl == NULL)
+			cl = qfq_find_class(sch, res.classid);
+		return cl;
+	}
+
+	return NULL;
+}
+
+/*
+ * Wraparound-safe "a is after b" test: the unsigned difference is
+ * reinterpreted as signed, so the result is its sign.
+ */
+static inline int qfq_gt(u64 a, u64 b)
+{
+	s64 delta = (s64)(a - b);
+
+	return delta > 0;
+}
+
+/* Clear the low `shift` bits of ts, i.e. round it down to its slot. */
+static inline u64 qfq_round_down(u64 ts, unsigned int shift)
+{
+	u64 low_bits = (1ULL << shift) - 1;
+
+	return ts & ~low_bits;
+}
+
+/*
+ * Map the lowest set bit of bitmap to its group.
+ * The caller must pass a non-zero bitmap.
+ */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+					unsigned long bitmap)
+{
+	int lowest = __ffs(bitmap);
+
+	return q->groups + lowest;
+}
+/*
+ * Keep only the bits of bitmap at position `from` and above, so that a
+ * following ffs acts like a hypothetical ffs_from().
+ */
+static inline unsigned long mask_from(unsigned long bitmap, int from)
+{
+	unsigned long below = (1UL << from) - 1;
+
+	return bitmap & ~below;
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ *
+ * Returns the index of the bitmap the group belongs in.
+ */
+static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
+{
+	/* if S > V we are not eligible */
+	unsigned int state = qfq_gt(grp->S, q->V);
+	/* ER groups with index >= ours are the candidates for blocking us */
+	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+	struct qfq_group *next;
+
+	if (mask) {
+		next = qfq_ffs(q, mask);
+		/* blocked when that group finishes before we do */
+		if (qfq_gt(grp->F, next->F))
+			state |= EB;
+	}
+
+	return state;
+}
+
+
+/*
+ * Move the groups selected by mask from bitmap src to bitmap dst:
+ *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ *	q->bitmaps[src] &= ~mask;
+ * Callers must pass src != dst: with src == dst the second statement
+ * would clear the very bits the first one just kept.
+ */
+static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
+				   int src, int dst)
+{
+	q->bitmaps[dst] |= q->bitmaps[src] & mask;
+	q->bitmaps[src] &= ~mask;
+}
+
+/*
+ * Unblock the groups with index lower than `index` after the finish
+ * time of the group at `index` changed from old_F.
+ *
+ * If an ER group with a higher index exists and its F is not greater
+ * than old_F, nothing is done.  Otherwise every group below `index`
+ * moves EB -> ER and IB -> IR.
+ */
+static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
+{
+	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+	struct qfq_group *next;
+
+	if (mask) {
+		next = qfq_ffs(q, mask);
+		if (!qfq_gt(next->F, old_F))
+			return;
+	}
+
+	/* all groups with index strictly below `index` */
+	mask = (1UL << index) - 1;
+	qfq_move_groups(q, mask, EB, ER);
+	qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+	old_V ^= q->V;
+	old_V >>= QFQ_MIN_SLOT_SHIFT;
+	if (old_V) {
+		...
+	}
+ *
+ */
+/*
+ * Move groups from the ineligible sets to the eligible ones after the
+ * virtual time advanced from old_V to q->V.
+ *
+ * The highest bit that differs between the old and new slot numbers
+ * selects how many low-index groups become eligible: they move
+ * IR -> ER and IB -> EB.
+ */
+static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
+{
+	unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
+	unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+
+	if (vslot != old_vslot) {
+		unsigned long mask;
+		int last_flip_pos = fls(vslot ^ old_vslot);
+
+		/*
+		 * fls() may return 32, and shifting a 32-bit unsigned long
+		 * by 32 is undefined.  A flip that high makes every group
+		 * eligible, so use an all-ones mask instead.
+		 */
+		if (last_flip_pos > 31)
+			mask = ~0UL;
+		else
+			mask = (1UL << last_flip_pos) - 1;
+
+		qfq_move_groups(q, mask, IR, ER);
+		qfq_move_groups(q, mask, IB, EB);
+	}
+}
+
+
+/*
+ * Insert a class in the slot of its group that matches roundedS.
+ *
+ * XXX we should make sure that slot becomes less than 32.
+ * This is guaranteed by the input values.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ */
+static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
+			    u64 roundedS)
+{
+	/* distance, in slots, from the group's start time */
+	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
+	/* physical bucket: the slot array is circular, starting at front */
+	unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+	hlist_add_head(&cl->next, &grp->slots[i]);
+	/* full_slots is indexed relative to front, not by bucket */
+	__set_bit(slot, &grp->full_slots);
+}
+
+/*
+ * Return the first class queued in the front slot of the group.
+ * The front slot is expected to be non-empty; the result is derived
+ * from the list's first node without any emptiness check.
+ * (Maybe introduce hlist_first_entry??)
+ */
+static struct qfq_class *qfq_slot_head(struct qfq_group *grp)
+{
+	struct hlist_node *first = grp->slots[grp->front].first;
+
+	return hlist_entry(first, struct qfq_class, next);
+}
+
+/*
+ * remove the entry from the slot
+ *
+ * Unlinks the first class of the group's front slot and clears bit 0
+ * of full_slots if the slot became empty.  The front slot must not be
+ * empty on entry.
+ */
+static void qfq_front_slot_remove(struct qfq_group *grp)
+{
+	struct qfq_class *cl;
+
+	/*
+	 * Check the list itself: the pointer produced by hlist_entry() is
+	 * only NULL for an empty list when the node is the first member
+	 * of the containing structure, so testing it is not reliable.
+	 */
+	BUG_ON(hlist_empty(&grp->slots[grp->front]));
+
+	cl = qfq_slot_head(grp);
+	hlist_del(&cl->next);
+	if (hlist_empty(&grp->slots[grp->front]))
+		__clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first full queue in a group. As a side effect,
+ * adjust the bucket list so the first non-empty bucket is at
+ * position 0 in full_slots.
+ * Returns NULL when the group has no occupied slot.
+ */
+static struct qfq_class *qfq_slot_scan(struct qfq_group *grp)
+{
+	unsigned int i;
+
+	pr_debug("qfq slot_scan: grp %u full %#lx\n",
+		 grp->index, grp->full_slots);
+
+	if (grp->full_slots == 0)
+		return NULL;
+
+	i = __ffs(grp->full_slots);  /* zero based */
+	if (i > 0) {
+		/* skip the i leading empty slots */
+		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+		grp->full_slots >>= i;
+	}
+
+	return qfq_slot_head(grp);
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
+{
+	/* number of slots the group start moves back by */
+	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+	grp->full_slots <<= i;
+	/* NOTE(review): front - i is unsigned and may wrap; the modulo
+	 * presumably relies on QFQ_MAX_SLOTS being a power of two -
+	 * confirm. */
+	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+/*
+ * Re-evaluate eligibility after the virtual time changed from old_V.
+ *
+ * Nothing happens unless some group is ineligible (IR or IB).  If in
+ * addition no group is in ER, V is first pushed forward to the start
+ * time of the lowest-index ineligible group when that lies ahead of V.
+ * Then qfq_make_eligible() moves the groups that became eligible.
+ */
+static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
+{
+	struct qfq_group *grp;
+	unsigned long ineligible;
+
+	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+	if (ineligible) {
+		if (!q->bitmaps[ER]) {
+			grp = qfq_ffs(q, ineligible);
+			if (qfq_gt(grp->S, q->V))
+				q->V = grp->S;
+		}
+		qfq_make_eligible(q, old_V);
+	}
+}
+
+/*
+ * Length of the packet at the head of the given qdisc, obtained by
+ * peeking; 0 when peek returns no packet.
+ */
+static unsigned int qdisc_peek_len(struct Qdisc *sch)
+{
+	struct sk_buff *head = sch->ops->peek(sch);
+
+	if (!head)
+		return 0;
+
+	return qdisc_pkt_len(head);
+}
+
+/*
+ * Updates the class, returns true if also the group needs to be updated.
+ *
+ * Called for the class at the head of the group's front slot.  The
+ * new start time is the old finish time.  If the class queue is now
+ * empty the class leaves the slot; otherwise a new finish time is
+ * computed and the class is re-slotted unless its rounded start still
+ * equals the group's start (in which case false is returned).
+ */
+static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
+{
+	unsigned int len = qdisc_peek_len(cl->qdisc);
+
+	cl->S = cl->F;
+	if (!len)
+		qfq_front_slot_remove(grp);	/* queue is empty */
+	else {
+		u64 roundedS;
+
+		cl->F = cl->S + (u64)len * cl->inv_w;
+		roundedS = qfq_round_down(cl->S, grp->slot_shift);
+		if (roundedS == grp->S)
+			return false;
+
+		/* move the class from the front slot to its new slot */
+		qfq_front_slot_remove(grp);
+		qfq_slot_insert(grp, cl, roundedS);
+	}
+
+	return true;
+}
+
+/*
+ * Dequeue the next packet.
+ *
+ * The packet comes from the head class of the lowest-index group in
+ * ER.  After it is taken, the virtual time advances by len * IWSUM,
+ * the class and (if needed) its group timestamps/state are updated,
+ * lower-index groups may be unblocked, and eligibility is refreshed.
+ * Returns NULL when no group is in ER or the leaf yields no packet.
+ */
+static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_group *grp;
+	struct qfq_class *cl;
+	struct sk_buff *skb;
+	unsigned int len;
+	u64 old_V;
+
+	if (!q->bitmaps[ER])
+		return NULL;
+
+	grp = qfq_ffs(q, q->bitmaps[ER]);
+
+	cl = qfq_slot_head(grp);
+	skb = qdisc_dequeue_peeked(cl->qdisc);
+	if (!skb) {
+		WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
+		return NULL;
+	}
+
+	sch->q.qlen--;
+	qdisc_bstats_update(sch, skb);
+
+	old_V = q->V;
+	len = qdisc_pkt_len(skb);
+	q->V += (u64)len * IWSUM;
+	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
+		 len, (unsigned long long) cl->F, (unsigned long long) q->V);
+
+	if (qfq_update_class(grp, cl)) {
+		/* remember the group's finish time before it may change */
+		u64 old_F = grp->F;
+
+		cl = qfq_slot_scan(grp);
+		if (!cl)
+			/* group is empty now */
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+		else {
+			u64 roundedS = qfq_round_down(cl->S, grp->slot_shift);
+			unsigned int s;
+
+			/* group timestamps unchanged: nobody to unblock */
+			if (grp->S == roundedS)
+				goto skip_unblock;
+			grp->S = roundedS;
+			grp->F = roundedS + (2ULL << grp->slot_shift);
+			/* re-file the group under its recomputed state */
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+			s = qfq_calc_state(q, grp);
+			__set_bit(grp->index, &q->bitmaps[s]);
+		}
+
+		qfq_unblock_groups(q, grp->index, old_F);
+	}
+
+skip_unblock:
+	qfq_update_eligible(q, old_V);
+
+	return skb;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ *
+ * roundedF and limit are derived from the 64-bit timestamps cl->F and
+ * q->V, so they are kept in 64-bit variables; narrower ones would drop
+ * the high bits before the wraparound-aware comparisons below.
+ */
+static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+	unsigned long mask;
+	u64 limit, roundedF;
+	int slot_shift = cl->grp->slot_shift;
+
+	roundedF = qfq_round_down(cl->F, slot_shift);
+	/* 1ULL: the shift must be done in 64 bits, also on 32-bit longs */
+	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+		/* timestamp was stale */
+		mask = mask_from(q->bitmaps[ER], cl->grp->index);
+		if (mask) {
+			struct qfq_group *next = qfq_ffs(q, mask);
+			if (qfq_gt(roundedF, next->F)) {
+				cl->S = next->F;
+				return;
+			}
+		}
+		cl->S = q->V;
+	} else  /* timestamp is not stale */
+		cl->S = cl->F;
+}
+
+/*
+ * Enqueue a packet.
+ *
+ * The packet is classified and handed to the leaf qdisc of its class.
+ * If that made the class backlogged (it is the only packet in the
+ * leaf), the class gets fresh start/finish times and is inserted in
+ * its group; the group's timestamps and state are updated when the
+ * group was empty or its start time moved back.
+ * Returns the leaf's enqueue result, or the classification error code
+ * when the packet is dropped for lack of a class.
+ */
+static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_group *grp;
+	struct qfq_class *cl;
+	int err;
+	u64 roundedS;
+	int s;
+
+	cl = qfq_classify(skb, sch, &err);
+	if (cl == NULL) {
+		/* only count a drop if no filter action took the packet */
+		if (err & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return err;
+	}
+	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
+
+	err = qdisc_enqueue(skb, cl->qdisc);
+	if (unlikely(err != NET_XMIT_SUCCESS)) {
+		pr_debug("qfq_enqueue: enqueue failed %d\n", err);
+		if (net_xmit_drop_count(err)) {
+			cl->qstats.drops++;
+			sch->qstats.drops++;
+		}
+		return err;
+	}
+
+	bstats_update(&cl->bstats, skb);
+	++sch->q.qlen;
+
+	/* If the new skb is not the head of queue, then done here. */
+	if (cl->qdisc->q.qlen != 1)
+		return err;
+
+	/* If reach this point, queue q was idle */
+	grp = cl->grp;
+	qfq_update_start(q, cl);
+
+	/* compute new finish time and rounded start. */
+	cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w;
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+	/*
+	 * insert cl in the correct bucket.
+	 * If cl->S >= grp->S we don't need to adjust the
+	 * bucket list and simply go to the insertion phase.
+	 * Otherwise grp->S is decreasing, we must make room
+	 * in the bucket list, and also recompute the group state.
+	 * Finally, if there were no flows in this group and nobody
+	 * was in ER make sure to adjust V.
+	 */
+	if (grp->full_slots) {
+		if (!qfq_gt(grp->S, cl->S))
+			goto skip_update;
+
+		/* create a slot for this cl->S */
+		qfq_slot_rotate(grp, roundedS);
+		/* group was surely ineligible, remove */
+		__clear_bit(grp->index, &q->bitmaps[IR]);
+		__clear_bit(grp->index, &q->bitmaps[IB]);
+	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+		q->V = roundedS;
+
+	/* new group timestamps, then file the group under its state */
+	grp->S = roundedS;
+	grp->F = roundedS + (2ULL << grp->slot_shift);
+	s = qfq_calc_state(q, grp);
+	__set_bit(grp->index, &q->bitmaps[s]);
+
+	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
+		 s, q->bitmaps[s],
+		 (unsigned long long) cl->S,
+		 (unsigned long long) cl->F,
+		 (unsigned long long) q->V);
+
+skip_update:
+	qfq_slot_insert(grp, cl, roundedS);
+
+	return err;
+}
+
+
+/*
+ * Remove a class from whichever slot of its group it sits in, located
+ * from the class start time, and clear the slot's full_slots bit if
+ * the slot became empty.  The q argument is not used.
+ */
+static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+			    struct qfq_class *cl)
+{
+	unsigned int i, offset;
+	u64 roundedS;
+
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+	/* offset: slot position relative to front; i: physical bucket */
+	offset = (roundedS - grp->S) >> grp->slot_shift;
+	i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+	hlist_del(&cl->next);
+	if (hlist_empty(&grp->slots[i]))
+		__clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ */
+static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl)
+{
+	struct qfq_group *grp = cl->grp;
+	unsigned long mask;
+	u64 roundedS;
+	int s;
+
+	cl->F = cl->S;
+	qfq_slot_remove(q, grp, cl);
+
+	if (!grp->full_slots) {
+		/* the group became empty: take it out of every set */
+		__clear_bit(grp->index, &q->bitmaps[IR]);
+		__clear_bit(grp->index, &q->bitmaps[EB]);
+		__clear_bit(grp->index, &q->bitmaps[IB]);
+
+		if (test_bit(grp->index, &q->bitmaps[ER]) &&
+		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+			/*
+			 * Build the mask from the highest lower-index ER
+			 * group upward (or all groups if there is none)
+			 * and move those groups out of the blocked sets.
+			 */
+			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+			if (mask)
+				mask = ~((1UL << __fls(mask)) - 1);
+			else
+				mask = ~0UL;
+			qfq_move_groups(q, mask, EB, ER);
+			qfq_move_groups(q, mask, IB, IR);
+		}
+		__clear_bit(grp->index, &q->bitmaps[ER]);
+	} else if (hlist_empty(&grp->slots[grp->front])) {
+		/* front slot emptied: advance to the next occupied slot */
+		cl = qfq_slot_scan(grp);
+		roundedS = qfq_round_down(cl->S, grp->slot_shift);
+		if (grp->S != roundedS) {
+			/* group start changed: recompute its state */
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+			__clear_bit(grp->index, &q->bitmaps[IR]);
+			__clear_bit(grp->index, &q->bitmaps[EB]);
+			__clear_bit(grp->index, &q->bitmaps[IB]);
+			grp->S = roundedS;
+			grp->F = roundedS + (2ULL << grp->slot_shift);
+			s = qfq_calc_state(q, grp);
+			__set_bit(grp->index, &q->bitmaps[s]);
+		}
+	}
+
+	qfq_update_eligible(q, q->V);
+}
+
+/*
+ * Queue-length change notification for a class: once its leaf qdisc
+ * holds no packets, take the class out of the scheduler.
+ */
+static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+	struct qfq_class *cl = (struct qfq_class *)arg;
+	struct qfq_sched *q = qdisc_priv(sch);
+
+	if (cl->qdisc->q.qlen != 0)
+		return;
+
+	qfq_deactivate_class(q, cl);
+}
+
+/*
+ * Drop one packet from the first active class whose leaf qdisc
+ * supports ->drop and actually drops something.  Groups, slots and
+ * classes are scanned in index order.  Returns the length reported by
+ * the leaf's ->drop, or 0 if nothing could be dropped.
+ */
+static unsigned int qfq_drop(struct Qdisc *sch)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_group *grp;
+	unsigned int i, j, len;
+
+	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+		grp = &q->groups[i];
+		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+			struct qfq_class *cl;
+			struct hlist_node *n;
+
+			hlist_for_each_entry(cl, n, &grp->slots[j], next) {
+
+				if (!cl->qdisc->ops->drop)
+					continue;
+
+				len = cl->qdisc->ops->drop(cl->qdisc);
+				if (len > 0) {
+					sch->q.qlen--;
+					if (!cl->qdisc->q.qlen)
+						qfq_deactivate_class(q, cl);
+
+					/*
+					 * Return right away: the class may
+					 * just have been unlinked from the
+					 * list being iterated.
+					 */
+					return len;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Initialise a new QFQ qdisc: set up the class hash and give every
+ * group its index, slot shift and empty slot lists.  The opt argument
+ * is not used.  Returns 0, or the negative error from the class hash
+ * initialisation.
+ */
+static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	int i, j, err;
+
+	err = qdisc_class_hash_init(&q->clhash);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+		struct qfq_group *grp = &q->groups[i];
+
+		grp->index = i;
+		/* slot_shift grows by one per group index */
+		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS
+				   - (QFQ_MAX_INDEX - i);
+
+		for (j = 0; j < QFQ_MAX_SLOTS; j++)
+			INIT_HLIST_HEAD(&grp->slots[j]);
+	}
+
+	return 0;
+}
+
+/*
+ * Reset the qdisc: deactivate every class still linked in a group
+ * slot, reset the leaf qdisc of every class in the hash, and zero the
+ * qdisc's queue length.
+ */
+static void qfq_reset_qdisc(struct Qdisc *sch)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_group *grp;
+	struct qfq_class *cl;
+	struct hlist_node *n, *tmp;
+	unsigned int i, j;
+
+	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+		grp = &q->groups[i];
+		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+			/* _safe: deactivation unlinks cl from the slot */
+			hlist_for_each_entry_safe(cl, n, tmp,
+						  &grp->slots[j], next) {
+				qfq_deactivate_class(q, cl);
+			}
+		}
+	}
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
+			qdisc_reset(cl->qdisc);
+	}
+	sch->q.qlen = 0;
+}
+
+/*
+ * Destroy the qdisc: tear down the filter chain, destroy every class
+ * in the class hash, then release the hash itself.
+ */
+static void qfq_destroy_qdisc(struct Qdisc *sch)
+{
+	struct qfq_sched *q = qdisc_priv(sch);
+	struct qfq_class *cl;
+	struct hlist_node *n, *next;
+	unsigned int i;
+
+	tcf_destroy_chain(&q->filter_list);
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		/* _safe: qfq_destroy_class() frees cl */
+		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+					  common.hnode) {
+			qfq_destroy_class(sch, cl);
+		}
+	}
+	qdisc_class_hash_destroy(&q->clhash);
+}
+
+/* Class-level operations of the QFQ qdisc. */
+static const struct Qdisc_class_ops qfq_class_ops = {
+	.change		= qfq_change_class,
+	.delete		= qfq_delete_class,
+	.get		= qfq_get_class,
+	.put		= qfq_put_class,
+	.tcf_chain	= qfq_tcf_chain,
+	.bind_tcf	= qfq_bind_tcf,
+	.unbind_tcf	= qfq_unbind_tcf,
+	.graft		= qfq_graft_class,
+	.leaf		= qfq_class_leaf,
+	.qlen_notify	= qfq_qlen_notify,
+	.dump		= qfq_dump_class,
+	.dump_stats	= qfq_dump_class_stats,
+	.walk		= qfq_walk,
+};
+
+/*
+ * Qdisc-level operations, registered under the id "qfq".
+ * peek uses the generic qdisc_peek_dequeued helper.
+ */
+static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
+	.cl_ops		= &qfq_class_ops,
+	.id		= "qfq",
+	.priv_size	= sizeof(struct qfq_sched),
+	.enqueue	= qfq_enqueue,
+	.dequeue	= qfq_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.drop		= qfq_drop,
+	.init		= qfq_init_qdisc,
+	.reset		= qfq_reset_qdisc,
+	.destroy	= qfq_destroy_qdisc,
+	.owner		= THIS_MODULE,
+};
+
+/* Module load: register the QFQ qdisc; returns register_qdisc()'s result. */
+static int __init qfq_init(void)
+{
+	return register_qdisc(&qfq_qdisc_ops);
+}
+
+/* Module unload: unregister the QFQ qdisc. */
+static void __exit qfq_exit(void)
+{
+	unregister_qdisc(&qfq_qdisc_ops);
+}
+
+module_init(qfq_init);
+module_exit(qfq_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
new file mode 100644
index 00000000..6649463d
--- /dev/null
+++ b/net/sched/sch_red.c
@@ -0,0 +1,359 @@
+/*
+ * net/sched/sch_red.c	Random Early Detection queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * J Hadi Salim 980914:	computation fixes
+ * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
+ * J Hadi Salim 980816:  ECN support
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+
+/*	Parameters, settable by user:
+	-----------------------------
+
+	limit		- bytes (must be > qth_max + burst)
+
+	Hard limit on queue length, should be chosen >qth_max
+	to allow packet bursts. This parameter does not
+	affect the algorithm's behaviour and can be chosen
+	arbitrarily high (well, less than ram size)
+	Really, this limit will never be reached
+	if RED works correctly.
+ */
+
+/* Private data of a RED qdisc instance. */
+struct red_sched_data {
+	u32			limit;		/* HARD maximal queue length */
+	unsigned char		flags;		/* TC_RED_* flags (ECN, HARDDROP) */
+	struct red_parms	parms;		/* RED parameters and running state */
+	struct red_stats	stats;		/* mark/drop counters */
+	struct Qdisc		*qdisc;		/* child qdisc holding the packets */
+};
+
+/* Non-zero when the TC_RED_ECN flag is set on this qdisc. */
+static inline int red_use_ecn(struct red_sched_data *q)
+{
+	int ecn = q->flags & TC_RED_ECN;
+
+	return ecn;
+}
+
+/* Non-zero when the TC_RED_HARDDROP flag is set on this qdisc. */
+static inline int red_use_harddrop(struct red_sched_data *q)
+{
+	int harddrop = q->flags & TC_RED_HARDDROP;
+
+	return harddrop;
+}
+
+/*
+ * Enqueue a packet through RED.
+ *
+ * The average queue length is recomputed from the child's backlog and
+ * any idle period is ended.  Depending on red_action() the packet is
+ * passed on untouched, ECN-marked, or dropped.  Surviving packets go
+ * to the child qdisc.  Returns the child's enqueue result, or
+ * NET_XMIT_CN when RED itself drops the packet.
+ */
+static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+	int ret;
+
+	q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog);
+
+	if (red_is_idling(&q->parms))
+		red_end_of_idle_period(&q->parms);
+
+	switch (red_action(&q->parms, q->parms.qavg)) {
+	case RED_DONT_MARK:
+		break;
+
+	case RED_PROB_MARK:
+		sch->qstats.overlimits++;
+		/* drop unless ECN is enabled and the packet could be marked */
+		if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+			q->stats.prob_drop++;
+			goto congestion_drop;
+		}
+
+		q->stats.prob_mark++;
+		break;
+
+	case RED_HARD_MARK:
+		sch->qstats.overlimits++;
+		/* harddrop forces a drop even when ECN marking is possible */
+		if (red_use_harddrop(q) || !red_use_ecn(q) ||
+		    !INET_ECN_set_ce(skb)) {
+			q->stats.forced_drop++;
+			goto congestion_drop;
+		}
+
+		q->stats.forced_mark++;
+		break;
+	}
+
+	ret = qdisc_enqueue(skb, child);
+	if (likely(ret == NET_XMIT_SUCCESS)) {
+		sch->q.qlen++;
+	} else if (net_xmit_drop_count(ret)) {
+		q->stats.pdrop++;
+		sch->qstats.drops++;
+	}
+	return ret;
+
+congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+}
+
+/*
+ * Dequeue from the child qdisc.  A successful dequeue updates the byte
+ * stats and queue length; an empty child starts a RED idle period
+ * unless one is already running.
+ */
+static struct sk_buff *red_dequeue(struct Qdisc *sch)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+	struct sk_buff *skb = child->dequeue(child);
+
+	if (!skb) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return skb;
+	}
+
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+	return skb;
+}
+
+/* Peek at the next packet by delegating to the child qdisc's ->peek. */
+static struct sk_buff *red_peek(struct Qdisc *sch)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+
+	return q->qdisc->ops->peek(q->qdisc);
+}
+
+/*
+ * Ask the child qdisc to drop one packet.  Returns the dropped length
+ * and accounts for it; if the child cannot drop or drops nothing, an
+ * idle period is started (unless already idling) and 0 is returned.
+ */
+static unsigned int red_drop(struct Qdisc *sch)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+	unsigned int len = 0;
+
+	if (child->ops->drop)
+		len = child->ops->drop(child);
+
+	if (len > 0) {
+		q->stats.other++;
+		sch->qstats.drops++;
+		sch->q.qlen--;
+		return len;
+	}
+
+	if (!red_is_idling(&q->parms))
+		red_start_of_idle_period(&q->parms);
+
+	return 0;
+}
+
+/* Reset the child qdisc, zero our queue length and restart RED state. */
+static void red_reset(struct Qdisc *sch)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset(q->qdisc);
+	sch->q.qlen = 0;
+	red_restart(&q->parms);
+}
+
+/* Destroy the qdisc by destroying its child qdisc. */
+static void red_destroy(struct Qdisc *sch)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+
+	qdisc_destroy(child);
+}
+
+/* Netlink attribute policy: lengths for the RED parameters and table. */
+static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
+	[TCA_RED_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_RED_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+/*
+ * Apply a new configuration to a RED qdisc.
+ *
+ * opt must carry nested TCA_RED_PARMS and TCA_RED_STAB attributes.
+ * When the new limit is non-zero a fresh bfifo child with that limit
+ * replaces the current child (whose packets are discarded).  Flags,
+ * limit and RED parameters are updated under the tree lock.
+ *
+ * Returns 0 on success, -EINVAL on missing attributes, or the error
+ * from attribute parsing / child creation.
+ */
+static int red_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_RED_MAX + 1];
+	struct tc_red_qopt *ctl;
+	struct Qdisc *child = NULL;
+	int err;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_RED_PARMS] == NULL ||
+	    tb[TCA_RED_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_RED_PARMS]);
+
+	/* Create the new child before taking the lock. */
+	if (ctl->limit > 0) {
+		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
+		if (IS_ERR(child))
+			return PTR_ERR(child);
+	}
+
+	sch_tree_lock(sch);
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+	if (child) {
+		/* propagate the loss of the old child's packets upward */
+		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+		qdisc_destroy(q->qdisc);
+		q->qdisc = child;
+	}
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+				 ctl->Plog, ctl->Scell_log,
+				 nla_data(tb[TCA_RED_STAB]));
+
+	/*
+	 * Packets live in the child qdisc, so the child's queue length
+	 * is what tells whether we are empty.  An empty queue means the
+	 * link is idle from now on: start an idle period rather than
+	 * ending one.
+	 */
+	if (!q->qdisc->q.qlen)
+		red_start_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/*
+ * Initialise a RED qdisc: start with noop_qdisc as the child, then let
+ * red_change() apply the supplied options.  Returns red_change()'s
+ * result.
+ */
+static int red_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+
+	q->qdisc = &noop_qdisc;
+	return red_change(sch, opt);
+}
+
+/*
+ * Dump the qdisc configuration as a nested TCA_OPTIONS attribute
+ * holding a tc_red_qopt.  The thresholds are shifted right by Wlog
+ * before being reported.  Returns nla_nest_end()'s result, or
+ * -EMSGSIZE after cancelling the nest on failure.
+ */
+static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	/* our backlog is whatever the child currently holds */
+	sch->qstats.backlog = q->qdisc->qstats.backlog;
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+	/* NOTE(review): NLA_PUT presumably jumps to nla_put_failure when
+	 * the attribute does not fit - confirm. */
+	NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump RED-specific statistics.  "early" sums probabilistic and forced
+ * drops; "marked" sums probabilistic and forced ECN marks.
+ */
+static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+	struct tc_red_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+/*
+ * Dump the single pseudo-class of RED: minor number 1 is OR-ed into
+ * the handle and the child qdisc's handle is reported in tcm_info.
+ */
+static int red_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+
+	tcm->tcm_handle |= TC_H_MIN(1);
+	tcm->tcm_info = q->qdisc->handle;
+	return 0;
+}
+
+/*
+ * Replace the child qdisc (noop_qdisc when new is NULL).  Under the
+ * tree lock the old child is returned through *old, its queued packets
+ * are subtracted from the ancestors' queue lengths, and it is reset.
+ * Always returns 0.
+ */
+static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct red_sched_data *q = qdisc_priv(sch);
+
+	if (new == NULL)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->qdisc;
+	q->qdisc = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Return the child qdisc; the class argument is ignored. */
+static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct red_sched_data *priv = qdisc_priv(sch);
+	struct Qdisc *child = priv->qdisc;
+
+	return child;
+}
+
+/* RED has a single pseudo-class: every classid maps to handle 1. */
+static unsigned long red_get(struct Qdisc *sch, u32 classid)
+{
+	return 1;
+}
+
+/* Nothing to release: red_get() takes no reference. */
+static void red_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+/*
+ * Walk the single pseudo-class (handle 1).  The callback runs only
+ * once the skip count has been reached; a negative return sets
+ * walker->stop without bumping the count.
+ */
+static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+	if (walker->stop)
+		return;
+
+	if (walker->count >= walker->skip &&
+	    walker->fn(sch, 1, walker) < 0) {
+		walker->stop = 1;
+		return;
+	}
+
+	walker->count++;
+}
+
+/* Class operations exposing the child qdisc as one pseudo-class. */
+static const struct Qdisc_class_ops red_class_ops = {
+	.graft		=	red_graft,
+	.leaf		=	red_leaf,
+	.get		=	red_get,
+	.put		=	red_put,
+	.walk		=	red_walk,
+	.dump		=	red_dump_class,
+};
+
+/* Qdisc operations, registered under the id "red". */
+static struct Qdisc_ops red_qdisc_ops __read_mostly = {
+	.id		=	"red",
+	.priv_size	=	sizeof(struct red_sched_data),
+	.cl_ops		=	&red_class_ops,
+	.enqueue	=	red_enqueue,
+	.dequeue	=	red_dequeue,
+	.peek		=	red_peek,
+	.drop		=	red_drop,
+	.init		=	red_init,
+	.reset		=	red_reset,
+	.destroy	=	red_destroy,
+	.change		=	red_change,
+	.dump		=	red_dump,
+	.dump_stats	=	red_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module load: register the RED qdisc; returns register_qdisc()'s result. */
+static int __init red_module_init(void)
+{
+	return register_qdisc(&red_qdisc_ops);
+}
+
+/* Module unload: unregister the RED qdisc. */
+static void __exit red_module_exit(void)
+{
+	unregister_qdisc(&red_qdisc_ops);
+}
+
+module_init(red_module_init)
+module_exit(red_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
new file mode 100644
index 00000000..47ee29fa
--- /dev/null
+++ b/net/sched/sch_sfb.c
@@ -0,0 +1,708 @@
+/*
+ * net/sched/sch_sfb.c	  Stochastic Fair Blue
+ *
+ * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
+ * A New Class of Active Queue Management Algorithms.
+ * U. Michigan CSE-TR-387-99, April 1999.
+ *
+ * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+
+/*
+ * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
+ * This implementation uses L = 8 and N = 16
+ * This permits us to split one 32bit hash (provided per packet by rxhash or
+ * external classifier) into 8 subhashes of 4 bits.
+ */
+#define SFB_BUCKET_SHIFT 4
+#define SFB_NUMBUCKETS	(1 << SFB_BUCKET_SHIFT) /* N bins per Level */
+#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
+#define SFB_LEVELS	(32 / SFB_BUCKET_SHIFT) /* L */
+
+/* SFB algo uses a virtual queue, named "bin" */
+struct sfb_bucket {
+	u16		qlen; /* length of virtual queue */
+	u16		p_mark; /* marking probability */
+};
+
+/* We use a double buffering right before hash change
+ * (Section 4.4 of SFB reference : moving hash functions)
+ */
+struct sfb_bins {
+	u32		  perturbation; /* jhash perturbation */
+	struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
+};
+
+struct sfb_sched_data {
+	struct Qdisc	*qdisc;
+	struct tcf_proto *filter_list;
+	unsigned long	rehash_interval;
+	unsigned long	warmup_time;	/* double buffering warmup time in jiffies */
+	u32		max;
+	u32		bin_size;	/* maximum queue length per bin */
+	u32		increment;	/* d1 */
+	u32		decrement;	/* d2 */
+	u32		limit;		/* HARD maximal queue length */
+	u32		penalty_rate;
+	u32		penalty_burst;
+	u32		tokens_avail;
+	unsigned long	rehash_time;
+	unsigned long	token_time;
+
+	u8		slot;		/* current active bins (0 or 1) */
+	bool		double_buffering;
+	struct sfb_bins bins[2];
+
+	struct {
+		u32	earlydrop;
+		u32	penaltydrop;
+		u32	bucketdrop;
+		u32	queuedrop;
+		u32	childdrop;	/* drops in child qdisc */
+		u32	marked;		/* ECN mark */
+	} stats;
+};
+
+/*
+ * Each queued skb might be hashed on one or two bins
+ * We store in skb_cb the two hash values.
+ * (A zero value means double buffering was not used)
+ */
+struct sfb_skb_cb {
+	u32 hashes[2];
+};
+
+static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
+	return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+/*
+ * If using 'internal' SFB flow classifier, hash comes from skb rxhash
+ * If using external classifier, hash comes from the classid.
+ */
+static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
+{
+	return sfb_skb_cb(skb)->hashes[slot];
+}
+
+/* Probabilities are coded as Q0.16 fixed-point values,
+ * with 0xFFFF representing 65535/65536 (almost 1.0)
+ * Addition and subtraction are saturating in [0, 65535]
+ */
+/* Saturating add: returns p1 + p2, capped at SFB_MAX_PROB. */
+static u32 prob_plus(u32 p1, u32 p2)
+{
+	const u32 cap = SFB_MAX_PROB;
+	u32 sum = p1 + p2;
+
+	if (sum > cap)
+		return cap;
+	return sum;
+}
+
+/* Saturating subtract: returns p1 - p2, or 0 when p2 is not smaller than p1. */
+static u32 prob_minus(u32 p1, u32 p2)
+{
+	if (p1 <= p2)
+		return 0;
+	return p1 - p2;
+}
+
+/*
+ * Count one more packet in the bins of @slot selected by @sfbhash.
+ * The hash is consumed SFB_BUCKET_SHIFT bits at a time, one chunk per
+ * level, and each counter stops growing at 0xFFFF.
+ */
+static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
+{
+	int level;
+
+	for (level = 0; level < SFB_LEVELS; level++) {
+		struct sfb_bucket *b;
+
+		b = &q->bins[slot].bins[level][sfbhash & SFB_BUCKET_MASK];
+		sfbhash >>= SFB_BUCKET_SHIFT;
+		if (b->qlen < 0xFFFF)
+			b->qlen++;
+	}
+}
+
+/*
+ * Account @skb in every slot for which a non-zero hash was stored in its
+ * control block; a zero hash means that slot is not used by this packet.
+ */
+static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+	u32 slot;
+
+	for (slot = 0; slot < 2; slot++) {
+		u32 sfbhash = sfb_hash(skb, slot);
+
+		if (sfbhash)
+			increment_one_qlen(sfbhash, slot, q);
+	}
+}
+
+/*
+ * Remove one packet from the bins of @slot selected by @sfbhash.
+ * The hash is consumed SFB_BUCKET_SHIFT bits at a time, one chunk per
+ * level; counters already at zero are left alone.
+ */
+static void decrement_one_qlen(u32 sfbhash, u32 slot,
+			       struct sfb_sched_data *q)
+{
+	int level;
+
+	for (level = 0; level < SFB_LEVELS; level++) {
+		struct sfb_bucket *b;
+
+		b = &q->bins[slot].bins[level][sfbhash & SFB_BUCKET_MASK];
+		sfbhash >>= SFB_BUCKET_SHIFT;
+		if (b->qlen)
+			b->qlen--;
+	}
+}
+
+/*
+ * Undo the accounting of @skb in every slot for which a non-zero hash was
+ * stored in its control block.
+ */
+static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
+{
+	u32 slot;
+
+	for (slot = 0; slot < 2; slot++) {
+		u32 sfbhash = sfb_hash(skb, slot);
+
+		if (sfbhash)
+			decrement_one_qlen(sfbhash, slot, q);
+	}
+}
+
+static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+	b->p_mark = prob_minus(b->p_mark, q->decrement);
+}
+
+static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
+{
+	b->p_mark = prob_plus(b->p_mark, q->increment);
+}
+
+static void sfb_zero_all_buckets(struct sfb_sched_data *q)
+{
+	memset(&q->bins, 0, sizeof(q->bins));
+}
+
+/*
+ * Scan every bin of the active slot.  Returns the largest virtual queue
+ * length, stores the largest p_mark in *prob_r and the mean p_mark (sum
+ * divided by the number of bins) in *avgpm_r.
+ */
+static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
+{
+	const struct sfb_bins *active = &q->bins[q->slot];
+	u32 maxqlen = 0, maxpm = 0, sumpm = 0;
+	int level, n;
+
+	for (level = 0; level < SFB_LEVELS; level++) {
+		for (n = 0; n < SFB_NUMBUCKETS; n++) {
+			const struct sfb_bucket *b = &active->bins[level][n];
+
+			if (b->qlen > maxqlen)
+				maxqlen = b->qlen;
+			if (b->p_mark > maxpm)
+				maxpm = b->p_mark;
+			sumpm += b->p_mark;
+		}
+	}
+	*prob_r = maxpm;
+	*avgpm_r = sumpm / (SFB_LEVELS * SFB_NUMBUCKETS);
+	return maxqlen;
+}
+
+
+static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
+{
+	q->bins[slot].perturbation = net_random();
+}
+
+/*
+ * Make the other set of bins the active one.  The slot being retired is
+ * given a fresh perturbation first, and double buffering is switched off.
+ */
+static void sfb_swap_slot(struct sfb_sched_data *q)
+{
+	u8 retired = q->slot;
+
+	sfb_init_perturbation(retired, q);
+	q->slot = retired ^ 1;
+	q->double_buffering = false;
+}
+
+/* Non elastic flows are allowed to use part of the bandwidth, expressed
+ * in "penalty_rate" packets per second, with "penalty_burst" burst.
+ *
+ * Returns true when no token can be granted (either penalty parameter is
+ * zero, or the bucket is still empty after a refill attempt) and false
+ * after consuming one token.  A refill credits the time elapsed since
+ * token_time, clamped to ten seconds, and restarts token_time.
+ */
+static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
+{
+	if (!q->penalty_rate || !q->penalty_burst)
+		return true;
+
+	if (q->tokens_avail == 0) {
+		unsigned long elapsed = jiffies - q->token_time;
+		u32 refill;
+
+		if (elapsed > 10UL * HZ)
+			elapsed = 10UL * HZ;
+
+		refill = (elapsed * q->penalty_rate) / HZ;
+		if (refill > q->penalty_burst)
+			refill = q->penalty_burst;
+
+		q->tokens_avail = refill;
+		q->token_time = jiffies;
+		if (refill == 0)
+			return true;
+	}
+
+	q->tokens_avail--;
+	return false;
+}
+
+/*
+ * Run the attached classifiers on @skb.
+ *
+ * Returns true and stores the minor part of the resulting classid in *salt
+ * when the packet is to be queued.  Returns false when classification
+ * fails or, with CONFIG_NET_CLS_ACT, when an action shot, stole or queued
+ * the packet; *qerr is only rewritten in the stolen/queued case.
+ */
+static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+			 int *qerr, u32 *salt)
+{
+	struct tcf_result res;
+	int verdict = tc_classify(skb, q->filter_list, &res);
+
+	if (verdict < 0)
+		return false;
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (verdict == TC_ACT_STOLEN || verdict == TC_ACT_QUEUED) {
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		return false;
+	}
+	if (verdict == TC_ACT_SHOT)
+		return false;
+#endif
+	*salt = TC_H_MIN(res.classid);
+	return true;
+}
+
+static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+	int i;
+	u32 p_min = ~0;		/* smallest p_mark among the bins this packet hits */
+	u32 minqlen = ~0;	/* smallest qlen among the bins this packet hits */
+	u32 r, slot, salt, sfbhash;
+	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (q->rehash_interval > 0) {
+		unsigned long limit = q->rehash_time + q->rehash_interval;
+
+		if (unlikely(time_after(jiffies, limit))) {
+			sfb_swap_slot(q);	/* interval elapsed: switch bins */
+			q->rehash_time = jiffies;
+		} else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
+				    time_after(jiffies, limit - q->warmup_time))) {
+			q->double_buffering = true;	/* inside the warmup window */
+		}
+	}
+
+	if (q->filter_list) {
+		/* If using external classifiers, get result and record it. */
+		if (!sfb_classify(skb, q, &ret, &salt))
+			goto other_drop;
+	} else {
+		salt = skb_get_rxhash(skb);
+	}
+
+	slot = q->slot;
+
+	sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+	if (!sfbhash)
+		sfbhash = 1;	/* a stored hash of 0 means "slot not used" */
+	sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+	for (i = 0; i < SFB_LEVELS; i++) {	/* one bin per level, active slot */
+		u32 hash = sfbhash & SFB_BUCKET_MASK;
+		struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+		sfbhash >>= SFB_BUCKET_SHIFT;
+		if (b->qlen == 0)
+			decrement_prob(b, q);
+		else if (b->qlen >= q->bin_size)
+			increment_prob(b, q);
+		if (minqlen > b->qlen)
+			minqlen = b->qlen;
+		if (p_min > b->p_mark)
+			p_min = b->p_mark;
+	}
+
+	slot ^= 1;	/* from here on, slot names the inactive set of bins */
+	sfb_skb_cb(skb)->hashes[slot] = 0;
+
+	if (unlikely(minqlen >= q->max || sch->q.qlen >= q->limit)) {
+		sch->qstats.overlimits++;
+		if (minqlen >= q->max)
+			q->stats.bucketdrop++;
+		else
+			q->stats.queuedrop++;
+		goto drop;
+	}
+
+	if (unlikely(p_min >= SFB_MAX_PROB)) {
+		/* Inelastic flow */
+		if (q->double_buffering) {	/* also update the inactive bins */
+			sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+			if (!sfbhash)
+				sfbhash = 1;
+			sfb_skb_cb(skb)->hashes[slot] = sfbhash;
+
+			for (i = 0; i < SFB_LEVELS; i++) {
+				u32 hash = sfbhash & SFB_BUCKET_MASK;
+				struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
+
+				sfbhash >>= SFB_BUCKET_SHIFT;
+				if (b->qlen == 0)
+					decrement_prob(b, q);
+				else if (b->qlen >= q->bin_size)
+					increment_prob(b, q);
+			}
+		}
+		if (sfb_rate_limit(skb, q)) {
+			sch->qstats.overlimits++;
+			q->stats.penaltydrop++;
+			goto drop;
+		}
+		goto enqueue;	/* a penalty token was granted: skip marking */
+	}
+
+	r = net_random() & SFB_MAX_PROB;
+
+	if (unlikely(r < p_min)) {
+		if (unlikely(p_min > SFB_MAX_PROB / 2)) {
+			/* If we're marking that many packets, then either
+			 * this flow is unresponsive, or we're badly congested.
+			 * In either case, we want to start dropping packets.
+			 */
+			if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
+				q->stats.earlydrop++;
+				goto drop;
+			}
+		}
+		if (INET_ECN_set_ce(skb)) {
+			q->stats.marked++;
+		} else {
+			q->stats.earlydrop++;	/* could not ECN-mark: drop instead */
+			goto drop;
+		}
+	}
+
+enqueue:
+	ret = qdisc_enqueue(skb, child);
+	if (likely(ret == NET_XMIT_SUCCESS)) {
+		sch->q.qlen++;
+		increment_qlen(skb, q);	/* uses the hashes stored in the skb cb */
+	} else if (net_xmit_drop_count(ret)) {
+		q->stats.childdrop++;
+		sch->qstats.drops++;
+	}
+	return ret;
+
+drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+other_drop:	/* classifier rejected the packet; ret may carry the stolen flag */
+	if (ret & __NET_XMIT_BYPASS)
+		sch->qstats.drops++;
+	kfree_skb(skb);
+	return ret;
+}
+
+/*
+ * Take one packet from the child qdisc.  When a packet is returned, the
+ * byte/packet statistics and queue length of @sch are updated and the
+ * packet is removed from the bin accounting.  Returns NULL when the child
+ * has nothing to hand out.
+ */
+static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+	struct sk_buff *skb = child->dequeue(child);
+
+	if (!skb)
+		return NULL;
+
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+	decrement_qlen(skb, q);
+	return skb;
+}
+
+static struct sk_buff *sfb_peek(struct Qdisc *sch)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->qdisc;
+
+	return child->ops->peek(child);
+}
+
+/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
+
+static void sfb_reset(struct Qdisc *sch)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset(q->qdisc);
+	sch->q.qlen = 0;
+	q->slot = 0;
+	q->double_buffering = false;
+	sfb_zero_all_buckets(q);
+	sfb_init_perturbation(0, q);
+}
+
+static void sfb_destroy(struct Qdisc *sch)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	qdisc_destroy(q->qdisc);
+}
+
+static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
+	[TCA_SFB_PARMS]	= { .len = sizeof(struct tc_sfb_qopt) },
+};
+
+static const struct tc_sfb_qopt sfb_default_ops = {
+	.rehash_interval = 600 * MSEC_PER_SEC,
+	.warmup_time = 60 * MSEC_PER_SEC,
+	.limit = 0,
+	.max = 25,
+	.bin_size = 20,
+	.increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
+	.decrement = (SFB_MAX_PROB + 3000) / 6000,
+	.penalty_rate = 10,
+	.penalty_burst = 20,
+};
+
+static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child;
+	struct nlattr *tb[TCA_SFB_MAX + 1];
+	const struct tc_sfb_qopt *ctl = &sfb_default_ops;
+	u32 limit;
+	int err;
+
+	if (opt) {
+		err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
+		if (err < 0)
+			return -EINVAL;
+
+		if (tb[TCA_SFB_PARMS] == NULL)
+			return -EINVAL;
+
+		ctl = nla_data(tb[TCA_SFB_PARMS]);
+	}
+
+	limit = ctl->limit;
+	if (limit == 0)
+		limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
+
+	child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	sch_tree_lock(sch);
+
+	qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+	qdisc_destroy(q->qdisc);
+	q->qdisc = child;
+
+	q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
+	q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
+	q->rehash_time = jiffies;
+	q->limit = limit;
+	q->increment = ctl->increment;
+	q->decrement = ctl->decrement;
+	q->max = ctl->max;
+	q->bin_size = ctl->bin_size;
+	q->penalty_rate = ctl->penalty_rate;
+	q->penalty_burst = ctl->penalty_burst;
+	q->tokens_avail = ctl->penalty_burst;
+	q->token_time = jiffies;
+
+	q->slot = 0;
+	q->double_buffering = false;
+	sfb_zero_all_buckets(q);
+	sfb_init_perturbation(0, q);
+	sfb_init_perturbation(1, q);
+
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+
+	q->qdisc = &noop_qdisc;
+	return sfb_change(sch, opt);
+}
+
+/*
+ * Dump the current configuration as a TCA_SFB_PARMS attribute nested in
+ * TCA_OPTIONS.  Intervals are converted from jiffies back to milliseconds
+ * and the backlog of @sch is refreshed from the child qdisc.
+ *
+ * Returns the value of nla_nest_end() on success, or -EMSGSIZE when the
+ * nest could not be opened or the attribute did not fit into @skb.
+ */
+static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+	struct tc_sfb_qopt opt = {
+		.rehash_interval = jiffies_to_msecs(q->rehash_interval),
+		.warmup_time = jiffies_to_msecs(q->warmup_time),
+		.limit = q->limit,
+		.max = q->max,
+		.bin_size = q->bin_size,
+		.increment = q->increment,
+		.decrement = q->decrement,
+		.penalty_rate = q->penalty_rate,
+		.penalty_burst = q->penalty_burst,
+	};
+
+	sch->qstats.backlog = q->qdisc->qstats.backlog;
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	/* Never hand a NULL nest to nla_nest_end(); bail out instead. */
+	if (opts == NULL)
+		goto nla_put_failure;
+	NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct tc_sfb_xstats st = {
+		.earlydrop = q->stats.earlydrop,
+		.penaltydrop = q->stats.penaltydrop,
+		.bucketdrop = q->stats.bucketdrop,
+		.queuedrop = q->stats.queuedrop,
+		.childdrop = q->stats.childdrop,
+		.marked = q->stats.marked,
+	};
+
+	st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	return -ENOSYS;
+}
+
+/*
+ * Install @new as the child qdisc (the noop qdisc when @new is NULL) and
+ * return the previous child through @old.  Under the tree lock the previous
+ * child is passed to qdisc_tree_decrease_qlen() with its current queue
+ * length and is then reset.  Always returns 0.
+ */
+static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *prev;
+
+	if (!new)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	prev = q->qdisc;
+	*old = prev;
+	q->qdisc = new;
+	qdisc_tree_decrease_qlen(prev, prev->q.qlen);
+	qdisc_reset(prev);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+
+	return q->qdisc;
+}
+
+static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
+{
+	return 1;
+}
+
+static void sfb_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+			    struct nlattr **tca, unsigned long *arg)
+{
+	return -ENOSYS;
+}
+
+static int sfb_delete(struct Qdisc *sch, unsigned long cl)
+{
+	return -ENOSYS;
+}
+
+/*
+ * Class walker.  A single class with id 1 is offered to walker->fn once
+ * walker->count has reached walker->skip.  A negative return from the
+ * callback sets walker->stop and ends the walk without bumping the count.
+ */
+static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+	if (walker->stop)
+		return;
+
+	if (walker->count >= walker->skip && walker->fn(sch, 1, walker) < 0) {
+		walker->stop = 1;
+		return;
+	}
+	walker->count++;
+}
+
+static struct tcf_proto **sfb_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct sfb_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
+			      u32 classid)
+{
+	return 0;
+}
+
+
+static const struct Qdisc_class_ops sfb_class_ops = {
+	.graft		=	sfb_graft,
+	.leaf		=	sfb_leaf,
+	.get		=	sfb_get,
+	.put		=	sfb_put,
+	.change		=	sfb_change_class,
+	.delete		=	sfb_delete,
+	.walk		=	sfb_walk,
+	.tcf_chain	=	sfb_find_tcf,
+	.bind_tcf	=	sfb_bind,
+	.unbind_tcf	=	sfb_put,
+	.dump		=	sfb_dump_class,
+};
+
+static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
+	.id		=	"sfb",
+	.priv_size	=	sizeof(struct sfb_sched_data),
+	.cl_ops		=	&sfb_class_ops,
+	.enqueue	=	sfb_enqueue,
+	.dequeue	=	sfb_dequeue,
+	.peek		=	sfb_peek,
+	.init		=	sfb_init,
+	.reset		=	sfb_reset,
+	.destroy	=	sfb_destroy,
+	.change		=	sfb_change,
+	.dump		=	sfb_dump,
+	.dump_stats	=	sfb_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init sfb_module_init(void)
+{
+	return register_qdisc(&sfb_qdisc_ops);
+}
+
+static void __exit sfb_module_exit(void)
+{
+	unregister_qdisc(&sfb_qdisc_ops);
+}
+
+module_init(sfb_module_init)
+module_exit(sfb_module_exit)
+
+MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
+MODULE_AUTHOR("Juliusz Chroboczek");
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
new file mode 100644
index 00000000..69400e3c
--- /dev/null
+++ b/net/sched/sch_sfq.c
@@ -0,0 +1,722 @@
+/*
+ * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/ipv6.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+/*	Stochastic Fairness Queuing algorithm.
+	=======================================
+
+	Source:
+	Paul E. McKenney "Stochastic Fairness Queuing",
+	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.
+
+	Paul E. McKenney "Stochastic Fairness Queuing",
+	"Interworking: Research and Experience", v.2, 1991, p.113-131.
+
+
+	See also:
+	M. Shreedhar and George Varghese "Efficient Fair
+	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
+
+
+	This is not the thing that is usually called (W)FQ nowadays.
+	It does not use any timestamp mechanism, but instead
+	processes queues in round-robin order.
+
+	ADVANTAGE:
+
+	- It is very cheap. Both CPU and memory requirements are minimal.
+
+	DRAWBACKS:
+
+	- "Stochastic" -> It is not 100% fair.
+	When hash collisions occur, several flows are considered as one.
+
+	- "Round-robin" -> It introduces larger delays than virtual clock
+	based schemes, and should not be used for isolating interactive
+	traffic	from non-interactive. It means, that this scheduler
+	should be used as leaf of CBQ or P3, which put interactive traffic
+	to higher priority band.
+
+	We still need true WFQ for top level CSZ, but using WFQ
+	for the best effort traffic is absolutely pointless:
+	SFQ is superior for this purpose.
+
+	IMPLEMENTATION:
+	This implementation limits maximal queue length to 128;
+	max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024.
+	The only goal of these restrictions was that all data
+	fit into one 4K page on 32bit arches.
+
+	It is easy to increase these values, but not in flight.  */
+
+#define SFQ_DEPTH		128 /* max number of packets per flow */
+#define SFQ_SLOTS		128 /* max number of flows */
+#define SFQ_EMPTY_SLOT		255
+#define SFQ_DEFAULT_HASH_DIVISOR 1024
+
+/* We use 16 bits to store allot, and want to handle packets up to 64K
+ * Scale allot by 8 (1<<3) so that no overflow occurs.
+ */
+#define SFQ_ALLOT_SHIFT		3
+#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
+
+/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */
+typedef unsigned char sfq_index;
+
+/*
+ * We don't use pointers, to save space.
+ * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
+ * are 'pointers' to dep[] array
+ */
+struct sfq_head {
+	sfq_index	next;
+	sfq_index	prev;
+};
+
+struct sfq_slot {
+	struct sk_buff	*skblist_next;
+	struct sk_buff	*skblist_prev;
+	sfq_index	qlen; /* number of skbs in skblist */
+	sfq_index	next; /* next slot in sfq chain */
+	struct sfq_head dep; /* anchor in dep[] chains */
+	unsigned short	hash; /* hash value (index in ht[]) */
+	short		allot; /* credit for this slot */
+};
+
+struct sfq_sched_data {
+/* Parameters */
+	int		perturb_period;
+	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
+	int		limit;
+	unsigned int	divisor;	/* number of slots in hash table */
+/* Variables */
+	struct tcf_proto *filter_list;
+	struct timer_list perturb_timer;
+	u32		perturbation;
+	sfq_index	cur_depth;	/* depth of longest slot */
+	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
+	struct sfq_slot *tail;		/* current slot in round */
+	sfq_index	*ht;		/* Hash table (divisor slots) */
+	struct sfq_slot	slots[SFQ_SLOTS];
+	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
+};
+
+/*
+ * Translate a chain index into its sfq_head: values below SFQ_SLOTS refer
+ * to the dep anchor embedded in slots[], all other values to dep[].
+ */
+static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
+{
+	return val < SFQ_SLOTS ? &q->slots[val].dep
+			       : &q->dep[val - SFQ_SLOTS];
+}
+
+static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+{
+	return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
+}
+
+static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
+{
+	u32 h, h2;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+	{
+		const struct iphdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			goto err;
+		iph = ip_hdr(skb);
+		h = (__force u32)iph->daddr;
+		h2 = (__force u32)iph->saddr ^ iph->protocol;
+		if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+			iph = ip_hdr(skb);
+			h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
+		}
+		break;
+	}
+	case htons(ETH_P_IPV6):
+	{
+		const struct ipv6hdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			goto err;
+		iph = ipv6_hdr(skb);
+		h = (__force u32)iph->daddr.s6_addr32[3];
+		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
+			iph = ipv6_hdr(skb);
+			h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
+		}
+		break;
+	}
+	default:
+err:
+		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
+		h2 = (unsigned long)skb->sk;
+	}
+
+	return sfq_fold_hash(q, h, h2);
+}
+
+/*
+ * Pick the hash bucket for @skb, expressed as bucket number + 1.
+ *
+ * A priority whose major part matches this qdisc and whose minor part lies
+ * in [1, divisor] is used directly.  Without attached filters the flow hash
+ * is used.  Otherwise the classifiers decide; 0 is returned when they fail,
+ * when (with CONFIG_NET_CLS_ACT) an action shot, stole or queued the packet,
+ * or when the resulting minor id exceeds the divisor.  *qerr is only written
+ * on the classifier path.
+ */
+static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
+				 int *qerr)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	u32 minor = TC_H_MIN(skb->priority);
+	int verdict;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    minor > 0 && minor <= q->divisor)
+		return minor;
+
+	if (!q->filter_list)
+		return sfq_hash(q, skb) + 1;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	verdict = tc_classify(skb, q->filter_list, &res);
+	if (verdict < 0)
+		return 0;
+
+#ifdef CONFIG_NET_CLS_ACT
+	switch (verdict) {
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		/* fall through */
+	case TC_ACT_SHOT:
+		return 0;
+	}
+#endif
+	if (TC_H_MIN(res.classid) > q->divisor)
+		return 0;
+	return TC_H_MIN(res.classid);
+}
+
+/*
+ * x : slot number [0 .. SFQ_SLOTS - 1]
+ *
+ * Insert slot x right behind the dep[] anchor that matches its current
+ * queue length.
+ */
+static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
+{
+	struct sfq_slot *slot = &q->slots[x];
+	int depth = slot->qlen;
+	struct sfq_head *anchor = &q->dep[depth];
+	sfq_index first = anchor->next;
+
+	slot->dep.next = first;
+	slot->dep.prev = depth + SFQ_SLOTS;	/* index of the anchor itself */
+
+	anchor->next = x;
+	sfq_dep_head(q, first)->prev = x;
+}
+
+#define sfq_unlink(q, x, n, p)			\
+	n = q->slots[x].dep.next;		\
+	p = q->slots[x].dep.prev;		\
+	sfq_dep_head(q, p)->next = n;		\
+	sfq_dep_head(q, n)->prev = p
+
+
+static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
+{
+	sfq_index p, n;
+	int d;
+
+	sfq_unlink(q, x, n, p);
+
+	d = q->slots[x].qlen--;
+	if (n == p && q->cur_depth == d)
+		q->cur_depth--;
+	sfq_link(q, x);
+}
+
+static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
+{
+	sfq_index p, n;
+	int d;
+
+	sfq_unlink(q, x, n, p);
+
+	d = ++q->slots[x].qlen;
+	if (q->cur_depth < d)
+		q->cur_depth = d;
+	sfq_link(q, x);
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from tail of slot queue */
+static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
+{
+	struct sk_buff *skb = slot->skblist_prev;
+
+	slot->skblist_prev = skb->prev;
+	skb->prev->next = (struct sk_buff *)slot;
+	skb->next = skb->prev = NULL;
+	return skb;
+}
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
+{
+	struct sk_buff *skb = slot->skblist_next;
+
+	slot->skblist_next = skb->next;
+	skb->next->prev = (struct sk_buff *)slot;
+	skb->next = skb->prev = NULL;
+	return skb;
+}
+
+static inline void slot_queue_init(struct sfq_slot *slot)
+{
+	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
+}
+
+/* add skb to slot queue (tail add) */
+static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
+{
+	skb->prev = slot->skblist_prev;
+	skb->next = (struct sk_buff *)slot;
+	slot->skblist_prev->next = skb;
+	slot->skblist_prev = skb;
+}
+
+#define	slot_queue_walk(slot, skb)		\
+	for (skb = slot->skblist_next;		\
+	     skb != (struct sk_buff *)slot;	\
+	     skb = skb->next)
+
+/*
+ * Drop one packet: the tail packet of a flow of maximal depth or, when
+ * every active flow holds exactly one packet, the packet of the flow that
+ * sfq_dequeue() would visit next (that flow then leaves the round).
+ *
+ * Returns the length of the dropped packet, or 0 when nothing is queued.
+ */
+static unsigned int sfq_drop(struct Qdisc *sch)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	sfq_index x, d = q->cur_depth;
+	struct sk_buff *skb;
+	unsigned int len;
+	struct sfq_slot *slot;
+
+	/* Queue is full! Find the longest slot and drop tail packet from it */
+	if (d > 1) {
+		x = q->dep[d].next;
+		slot = &q->slots[x];
+drop:
+		skb = slot_dequeue_tail(slot);
+		len = qdisc_pkt_len(skb);
+		sfq_dec(q, x);
+		kfree_skb(skb);
+		sch->q.qlen--;
+		sch->qstats.drops++;
+		sch->qstats.backlog -= len;
+		return len;
+	}
+
+	if (d == 1) {
+		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
+		x = q->tail->next;
+		slot = &q->slots[x];
+		if (slot == q->tail) {
+			/* This was the only active flow: the round is now
+			 * empty, so clear the tail as sfq_dequeue() does
+			 * instead of leaving it on an empty slot.
+			 */
+			q->tail = NULL;
+		} else {
+			q->tail->next = slot->next;
+		}
+		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+		goto drop;
+	}
+
+	return 0;
+}
+
+static int
+sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	unsigned int hash;
+	sfq_index x, qlen;
+	struct sfq_slot *slot;
+	int uninitialized_var(ret);
+
+	hash = sfq_classify(skb, sch, &ret);
+	if (hash == 0) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+	hash--;
+
+	x = q->ht[hash];
+	slot = &q->slots[x];
+	if (x == SFQ_EMPTY_SLOT) {
+		x = q->dep[0].next; /* get a free slot */
+		q->ht[hash] = x;
+		slot = &q->slots[x];
+		slot->hash = hash;
+	}
+
+	/* If selected queue has length q->limit, do simple tail drop,
+	 * i.e. drop _this_ packet.
+	 */
+	if (slot->qlen >= q->limit)
+		return qdisc_drop(skb, sch);
+
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+	slot_queue_add(slot, skb);
+	sfq_inc(q, x);
+	if (slot->qlen == 1) {		/* The flow is new */
+		if (q->tail == NULL) {	/* It is the first flow */
+			slot->next = x;
+		} else {
+			slot->next = q->tail->next;
+			q->tail->next = x;
+		}
+		q->tail = slot;
+		slot->allot = q->scaled_quantum;
+	}
+	if (++sch->q.qlen <= q->limit)
+		return NET_XMIT_SUCCESS;
+
+	qlen = slot->qlen;
+	sfq_drop(sch);
+	/* Return Congestion Notification only if we dropped a packet
+	 * from this flow.
+	 */
+	if (qlen != slot->qlen)
+		return NET_XMIT_CN;
+
+	/* As we dropped a packet, better let upper stack know this */
+	qdisc_tree_decrease_qlen(sch, 1);
+	return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+sfq_dequeue(struct Qdisc *sch)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	sfq_index a, next_a;
+	struct sfq_slot *slot;
+
+	/* No active slots */
+	if (q->tail == NULL)
+		return NULL;
+
+next_slot:
+	a = q->tail->next;	/* the slot after the tail is served next */
+	slot = &q->slots[a];
+	if (slot->allot <= 0) {	/* credit used up: refill and move on */
+		q->tail = slot;
+		slot->allot += q->scaled_quantum;
+		goto next_slot;
+	}
+	skb = slot_dequeue_head(slot);
+	sfq_dec(q, a);
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+
+	/* Is the slot empty? */
+	if (slot->qlen == 0) {
+		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+		next_a = slot->next;
+		if (a == next_a) {	/* the slot was linked to itself */
+			q->tail = NULL; /* no more active slots */
+			return skb;
+		}
+		q->tail->next = next_a;	/* unlink the empty slot from the round */
+	} else {
+		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));	/* charge in scaled units */
+	}
+	return skb;
+}
+
+/* Empty the qdisc by dequeueing and freeing every queued packet. */
+static void
+sfq_reset(struct Qdisc *sch)
+{
+	for (;;) {
+		struct sk_buff *skb = sfq_dequeue(sch);
+
+		if (skb == NULL)
+			break;
+		kfree_skb(skb);
+	}
+}
+
+/*
+ * Timer callback; @arg carries the Qdisc pointer.  Picks a new random
+ * hash perturbation and re-arms the timer as long as a perturbation
+ * period is configured.
+ */
+static void sfq_perturbation(unsigned long arg)
+{
+	struct Qdisc *sch = (struct Qdisc *)arg;
+	struct sfq_sched_data *q = qdisc_priv(sch);
+
+	q->perturbation = net_random();
+	if (!q->perturb_period)
+		return;
+
+	mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+}
+
+static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	struct tc_sfq_qopt *ctl = nla_data(opt);
+	unsigned int qlen;
+
+	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
+		return -EINVAL;
+
+	if (ctl->divisor &&
+	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
+		return -EINVAL;
+
+	sch_tree_lock(sch);
+	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
+	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+	q->perturb_period = ctl->perturb_period * HZ;
+	if (ctl->limit)
+		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
+	if (ctl->divisor)
+		q->divisor = ctl->divisor;
+	qlen = sch->q.qlen;
+	while (sch->q.qlen > q->limit)
+		sfq_drop(sch);
+	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+
+	del_timer(&q->perturb_timer);
+	if (q->perturb_period) {
+		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
+		q->perturbation = net_random();
+	}
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/*
+ * Set up a new SFQ instance.
+ *
+ * Prepares the perturbation timer, makes every dep[] anchor point at
+ * itself, applies the defaults and then either the built-in parameters
+ * (no @opt) or the user supplied ones through sfq_change().  The hash
+ * table is allocated with kmalloc(), falling back to vmalloc() for tables
+ * larger than a page, every bucket is marked empty and all slots are
+ * linked into the chain matching their (zero) queue length.
+ *
+ * Returns 0 on success, the error from sfq_change(), or -ENOMEM when the
+ * hash table cannot be allocated.
+ */
+static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	size_t sz;
+	int i;
+
+	q->perturb_timer.function = sfq_perturbation;
+	q->perturb_timer.data = (unsigned long)sch;
+	init_timer_deferrable(&q->perturb_timer);
+
+	for (i = 0; i < SFQ_DEPTH; i++) {
+		q->dep[i].next = i + SFQ_SLOTS;
+		q->dep[i].prev = i + SFQ_SLOTS;
+	}
+
+	q->limit = SFQ_DEPTH - 1;
+	q->cur_depth = 0;
+	q->tail = NULL;
+	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
+	if (opt == NULL) {
+		q->quantum = psched_mtu(qdisc_dev(sch));
+		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
+		q->perturb_period = 0;
+		q->perturbation = net_random();
+	} else {
+		int err = sfq_change(sch, opt);
+		if (err)
+			return err;
+	}
+
+	sz = sizeof(q->ht[0]) * q->divisor;
+	q->ht = kmalloc(sz, GFP_KERNEL);
+	if (!q->ht && sz > PAGE_SIZE)
+		q->ht = vmalloc(sz);
+	if (!q->ht) {
+		/* sfq_change() may already have armed the perturbation
+		 * timer.  Stop it here: nothing else in this function
+		 * cancels it on the error path.
+		 */
+		q->perturb_period = 0;
+		del_timer_sync(&q->perturb_timer);
+		return -ENOMEM;
+	}
+	for (i = 0; i < q->divisor; i++)
+		q->ht[i] = SFQ_EMPTY_SLOT;
+
+	for (i = 0; i < SFQ_SLOTS; i++) {
+		slot_queue_init(&q->slots[i]);
+		sfq_link(q, i);
+	}
+	if (q->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+	return 0;
+}
+
+static void sfq_destroy(struct Qdisc *sch)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	q->perturb_period = 0;
+	del_timer_sync(&q->perturb_timer);
+	if (is_vmalloc_addr(q->ht))
+		vfree(q->ht);
+	else
+		kfree(q->ht);
+}
+
+static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_sfq_qopt opt;
+
+	opt.quantum = q->quantum;
+	opt.perturb_period = q->perturb_period / HZ;
+
+	opt.limit = q->limit;
+	opt.divisor = q->divisor;
+	opt.flows = q->limit;
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
+			      u32 classid)
+{
+	/* we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
+	return 0;
+}
+
+static void sfq_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				struct gnet_dump *d)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	sfq_index idx = q->ht[cl - 1];
+	struct gnet_stats_queue qs = { 0 };
+	struct tc_sfq_xstats xstats = { 0 };
+	struct sk_buff *skb;
+
+	if (idx != SFQ_EMPTY_SLOT) {
+		const struct sfq_slot *slot = &q->slots[idx];
+
+		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
+		qs.qlen = slot->qlen;
+		slot_queue_walk(slot, skb)
+			qs.backlog += qdisc_pkt_len(skb);
+	}
+	if (gnet_stats_copy_queue(d, &qs) < 0)
+		return -1;
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+/*
+ * Class walker: every non-empty hash bucket i is reported as class i + 1
+ * once arg->count has reached arg->skip.  arg->count is advanced for every
+ * bucket visited, empty ones included; a negative return from the callback
+ * sets arg->stop and ends the walk without counting that bucket.
+ */
+static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct sfq_sched_data *q = qdisc_priv(sch);
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->divisor; i++) {
+		bool report = q->ht[i] != SFQ_EMPTY_SLOT &&
+			      arg->count >= arg->skip;
+
+		if (report && arg->fn(sch, i + 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops sfq_class_ops = {
+	.leaf		=	sfq_leaf,
+	.get		=	sfq_get,
+	.put		=	sfq_put,
+	.tcf_chain	=	sfq_find_tcf,
+	.bind_tcf	=	sfq_bind,
+	.unbind_tcf	=	sfq_put,
+	.dump		=	sfq_dump_class,
+	.dump_stats	=	sfq_dump_class_stats,
+	.walk		=	sfq_walk,
+};
+
+static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
+	.cl_ops		=	&sfq_class_ops,
+	.id		=	"sfq",
+	.priv_size	=	sizeof(struct sfq_sched_data),
+	.enqueue	=	sfq_enqueue,
+	.dequeue	=	sfq_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	sfq_drop,
+	.init		=	sfq_init,
+	.reset		=	sfq_reset,
+	.destroy	=	sfq_destroy,
+	.change		=	NULL,
+	.dump		=	sfq_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init sfq_module_init(void)
+{
+	return register_qdisc(&sfq_qdisc_ops);
+}
+static void __exit sfq_module_exit(void)
+{
+	unregister_qdisc(&sfq_qdisc_ops);
+}
+module_init(sfq_module_init)
+module_exit(sfq_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
new file mode 100644
index 00000000..1dcfb522
--- /dev/null
+++ b/net/sched/sch_tbf.c
@@ -0,0 +1,464 @@
+/*
+ * net/sched/sch_tbf.c	Token Bucket Filter queue.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
+ *						 original idea by Martin Devera
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+
+/*	Simple Token Bucket Filter.
+	=======================================
+
+	SOURCE.
+	-------
+
+	None.
+
+	Description.
+	------------
+
+	A data flow obeys TBF with rate R and depth B, if for any
+	time interval t_i...t_f the number of transmitted bits
+	does not exceed B + R*(t_f-t_i).
+
+	Packetized version of this definition:
+	The sequence of packets of sizes s_i served at moments t_i
+	obeys TBF, if for any i<=k:
+
+	s_i+....+s_k <= B + R*(t_k - t_i)
+
+	Algorithm.
+	----------
+
+	Let N(t_i) be B/R initially and N(t) grow continuously with time as:
+
+	N(t+delta) = min{B/R, N(t) + delta}
+
+	If the first packet in queue has length S, it may be
+	transmitted only at the time t_* when S/R <= N(t_*),
+	and in this case N(t) jumps:
+
+	N(t_* + 0) = N(t_* - 0) - S/R.
+
+
+
+	Actually, QoS requires two TBF to be applied to a data stream.
+	One of them controls steady state burst size, another
+	one with rate P (peak rate) and depth M (equal to link MTU)
+	limits bursts at a smaller time scale.
+
+	It is easy to see that P>R, and B>M. If P is infinity, this double
+	TBF is equivalent to a single one.
+
+	When TBF works in reshaping mode, latency is estimated as:
+
+	lat = max ((L-B)/R, (L-M)/P)
+
+
+	NOTES.
+	------
+
+	If TBF throttles, it starts a watchdog timer, which will wake it up
+	when it is ready to transmit.
+	Note that the minimal timer resolution is 1/HZ.
+	If no new packets arrive during this period,
+	or if the device is not awakened by EOI for some previous packet,
+	TBF can stop its activity for 1/HZ.
+
+
+	This means, that with depth B, the maximal rate is
+
+	R_crit = B*HZ
+
+	F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
+
+	Note that the peak rate TBF is much tougher: with MTU 1500
+	P_crit = 150Kbytes/sec. So, if you need greater peak
+	rates, use alpha with HZ=1000 :-)
+
+	With classful TBF, limit is just kept for backwards compatibility.
+	It is passed to the default bfifo qdisc - if the inner qdisc is
+	changed the limit is not effective anymore.
+*/
+
+/* Per-qdisc private state of a TBF instance (see qdisc_priv() users below). */
+struct tbf_sched_data {
+/* Parameters */
+	u32		limit;		/* Maximal length of backlog: bytes */
+	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
+	u32		mtu;		/* Upper bound applied to ptokens */
+	u32		max_size;	/* Longer packets are refused by tbf_enqueue() */
+	struct qdisc_rate_table	*R_tab;	/* Rate table, used by L2T() */
+	struct qdisc_rate_table	*P_tab;	/* Peak-rate table, used by L2T_P(); may be NULL */
+
+/* Variables */
+	long	tokens;			/* Current number of B tokens */
+	long	ptokens;		/* Current number of P tokens */
+	psched_time_t	t_c;		/* Time check-point */
+	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
+	struct qdisc_watchdog watchdog;	/* Watchdog timer */
+};
+
+/* Look up packet length L in the rate table (L2T) or peak-rate table (L2T_P). */
+#define L2T(q, L)   qdisc_l2t((q)->R_tab, L)
+#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L)
+
+/*
+ * Hand a packet to the inner qdisc.  Packets longer than max_size are
+ * passed to qdisc_reshape_fail() instead.  On success the outer queue
+ * length is bumped; on failure the inner qdisc's return code is passed
+ * through and, when it counts as a drop, accounted in qstats.drops.
+ */
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct tbf_sched_data *priv = qdisc_priv(sch);
+	int status;
+
+	if (qdisc_pkt_len(skb) > priv->max_size)
+		return qdisc_reshape_fail(skb, sch);
+
+	status = qdisc_enqueue(skb, priv->qdisc);
+	if (status == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		return status;
+	}
+
+	if (net_xmit_drop_count(status))
+		sch->qstats.drops++;
+	return status;
+}
+
+/*
+ * Ask the inner qdisc to drop one packet.  Returns whatever the inner
+ * ->drop() reported (0 when the inner qdisc has no ->drop() or dropped
+ * nothing); a non-zero result is reflected in qlen and the drop counter.
+ */
+static unsigned int tbf_drop(struct Qdisc *sch)
+{
+	struct tbf_sched_data *priv = qdisc_priv(sch);
+	struct Qdisc *inner = priv->qdisc;
+	unsigned int dropped;
+
+	if (!inner->ops->drop)
+		return 0;
+
+	dropped = inner->ops->drop(inner);
+	if (dropped != 0) {
+		sch->q.qlen--;
+		sch->qstats.drops++;
+	}
+	return dropped;
+}
+
+/*
+ * Release the head packet of the inner qdisc if both token buckets can
+ * pay for it.
+ *
+ * Returns the dequeued skb, or NULL when the inner qdisc is empty or the
+ * packet has to wait.  In the latter case the watchdog is armed for the
+ * moment the larger of the two deficits is covered and the overlimits
+ * counter is incremented.
+ */
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	/* Only look at the head packet; it is removed further down. */
+	skb = q->qdisc->ops->peek(q->qdisc);
+
+	if (skb) {
+		psched_time_t now;
+		long toks;
+		long ptoks = 0;
+		unsigned int len = qdisc_pkt_len(skb);
+
+		now = psched_get_time();
+		/* Time elapsed since the last check-point, bounded by q->buffer. */
+		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
+
+		/* Peak-rate bucket: only when a peak-rate table is configured. */
+		if (q->P_tab) {
+			ptoks = toks + q->ptokens;
+			if (ptoks > (long)q->mtu)
+				ptoks = q->mtu;
+			ptoks -= L2T_P(q, len);
+		}
+		/* Main bucket: refill, cap at q->buffer, then charge the packet. */
+		toks += q->tokens;
+		if (toks > (long)q->buffer)
+			toks = q->buffer;
+		toks -= L2T(q, len);
+
+		/* The OR is negative iff at least one of the balances is negative. */
+		if ((toks|ptoks) >= 0) {
+			skb = qdisc_dequeue_peeked(q->qdisc);
+			if (unlikely(!skb))
+				return NULL;
+
+			/* Commit the new check-point and balances only on success. */
+			q->t_c = now;
+			q->tokens = toks;
+			q->ptokens = ptoks;
+			sch->q.qlen--;
+			qdisc_unthrottled(sch);
+			qdisc_bstats_update(sch, skb);
+			return skb;
+		}
+
+		/* Not enough tokens: wake up once the larger deficit has elapsed. */
+		qdisc_watchdog_schedule(&q->watchdog,
+					now + max_t(long, -toks, -ptoks));
+
+		/* Maybe we have a shorter packet in the queue,
+		   which can be sent now. It sounds cool,
+		   but, however, this is wrong in principle.
+		   We MUST NOT reorder packets under these circumstances.
+
+		   Really, if we split the flow into independent
+		   subflows, it would be a very good solution.
+		   This is the main idea of all FQ algorithms
+		   (cf. CSZ, HPFQ, HFSC)
+		 */
+
+		sch->qstats.overlimits++;
+	}
+	return NULL;
+}
+
+/*
+ * Return the qdisc to its initial state: reset the inner qdisc, clear the
+ * queue length, restart the time check-point, refill both buckets to
+ * their configured depth (buffer / mtu) and cancel a pending watchdog.
+ */
+static void tbf_reset(struct Qdisc *sch)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset(q->qdisc);
+	sch->q.qlen = 0;
+	q->t_c = psched_get_time();
+	q->tokens = q->buffer;
+	q->ptokens = q->mtu;
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+/*
+ * Netlink attribute policy used by tbf_change(): TCA_TBF_PARMS carries a
+ * struct tc_tbf_qopt, the two table attributes are binary blobs bounded
+ * by TC_RTAB_SIZE.
+ */
+static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
+	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
+	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
+};
+
+/*
+ * tbf_change - parse TCA_OPTIONS and (re)configure the token bucket filter.
+ * @sch: the TBF qdisc being configured
+ * @opt: nested netlink attribute holding TCA_TBF_PARMS and the rate tables
+ *
+ * Returns 0 on success or a negative errno.  The qdisc private data is
+ * only written, under the tree lock, once every check has passed.
+ */
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	int err;
+	struct tbf_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_TBF_MAX + 1];
+	struct tc_tbf_qopt *qopt;
+	struct qdisc_rate_table *rtab = NULL;
+	struct qdisc_rate_table *ptab = NULL;
+	struct Qdisc *child = NULL;
+	int max_size, n;
+
+	/* Size the attribute table like tbf_policy[] so the two stay in step. */
+	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
+	if (err < 0)
+		return err;
+
+	err = -EINVAL;
+	if (tb[TCA_TBF_PARMS] == NULL)
+		goto done;
+
+	qopt = nla_data(tb[TCA_TBF_PARMS]);
+
+	/*
+	 * cell_log is supplied by user space and is used as a shift count
+	 * on an int below; a count as wide as the type or wider is
+	 * undefined behaviour, so refuse it up front.
+	 */
+	if (qopt->rate.cell_log >= 8 * sizeof(n))
+		goto done;
+
+	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
+	if (rtab == NULL)
+		goto done;
+
+	/* A peak rate is optional, but if given it must exceed the rate. */
+	if (qopt->peakrate.rate) {
+		if (qopt->peakrate.cell_log >= 8 * sizeof(n))
+			goto done;
+		if (qopt->peakrate.rate > qopt->rate.rate)
+			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
+		if (ptab == NULL)
+			goto done;
+	}
+
+	/* First table slot whose cost exceeds the bucket bounds the packet size. */
+	for (n = 0; n < 256; n++)
+		if (rtab->data[n] > qopt->buffer)
+			break;
+	max_size = (n << qopt->rate.cell_log) - 1;
+	if (ptab) {
+		int size;
+
+		for (n = 0; n < 256; n++)
+			if (ptab->data[n] > qopt->mtu)
+				break;
+		size = (n << qopt->peakrate.cell_log) - 1;
+		if (size < max_size)
+			max_size = size;
+	}
+	/* n == 0 above means not even the smallest slot fits. */
+	if (max_size < 0)
+		goto done;
+
+	if (q->qdisc != &noop_qdisc) {
+		/* An inner qdisc is already attached: just pass the limit on. */
+		err = fifo_set_limit(q->qdisc, qopt->limit);
+		if (err)
+			goto done;
+	} else if (qopt->limit > 0) {
+		/* First configuration: create the default bfifo child. */
+		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
+		if (IS_ERR(child)) {
+			err = PTR_ERR(child);
+			goto done;
+		}
+	}
+
+	sch_tree_lock(sch);
+	if (child) {
+		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+		qdisc_destroy(q->qdisc);
+		q->qdisc = child;
+	}
+	q->limit = qopt->limit;
+	q->mtu = qopt->mtu;
+	q->max_size = max_size;
+	q->buffer = qopt->buffer;
+	q->tokens = q->buffer;
+	q->ptokens = q->mtu;
+
+	/* After the swap rtab/ptab hold the old tables, released below. */
+	swap(q->R_tab, rtab);
+	swap(q->P_tab, ptab);
+
+	sch_tree_unlock(sch);
+	err = 0;
+done:
+	if (rtab)
+		qdisc_put_rtab(rtab);
+	if (ptab)
+		qdisc_put_rtab(ptab);
+	return err;
+}
+
+/*
+ * Set up a fresh TBF instance: options are mandatory (-EINVAL without
+ * them).  The private state starts with the current time as check-point,
+ * an initialised watchdog and the noop qdisc as placeholder child; the
+ * actual parameters are then applied through tbf_change().
+ */
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct tbf_sched_data *priv = qdisc_priv(sch);
+
+	if (!opt)
+		return -EINVAL;
+
+	priv->t_c = psched_get_time();
+	qdisc_watchdog_init(&priv->watchdog, sch);
+	priv->qdisc = &noop_qdisc;
+
+	return tbf_change(sch, opt);
+}
+
+/*
+ * Tear down a TBF instance: stop the watchdog, drop the references on
+ * whichever rate tables are present and destroy the inner qdisc.
+ */
+static void tbf_destroy(struct Qdisc *sch)
+{
+	struct tbf_sched_data *priv = qdisc_priv(sch);
+
+	qdisc_watchdog_cancel(&priv->watchdog);
+
+	if (priv->P_tab != NULL)
+		qdisc_put_rtab(priv->P_tab);
+	if (priv->R_tab != NULL)
+		qdisc_put_rtab(priv->R_tab);
+
+	qdisc_destroy(priv->qdisc);
+}
+
+/*
+ * Dump the current configuration as a TCA_TBF_PARMS attribute nested
+ * inside TCA_OPTIONS.  Returns skb->len on success, -1 on failure (after
+ * cancelling the nest).
+ */
+static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest;
+	struct tc_tbf_qopt opt;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	opt.limit = q->limit;
+	opt.rate = q->R_tab->rate;
+	/* Without a peak-rate table the peakrate spec is reported as all zeroes. */
+	if (q->P_tab)
+		opt.peakrate = q->P_tab->rate;
+	else
+		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
+	opt.mtu = q->mtu;
+	opt.buffer = q->buffer;
+	/* NOTE(review): NLA_PUT is expected to branch to nla_put_failure on error. */
+	NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
+
+	nla_nest_end(skb, nest);
+	return skb->len;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+/*
+ * Describe the single TBF class: minor number 1 is OR-ed into the handle
+ * and tcm_info reports the handle of the inner qdisc.  Always returns 0.
+ */
+static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct tbf_sched_data *priv = qdisc_priv(sch);
+	const struct Qdisc *inner = priv->qdisc;
+
+	tcm->tcm_handle |= TC_H_MIN(1);
+	tcm->tcm_info = inner->handle;
+	return 0;
+}
+
+/*
+ * Replace the inner qdisc with @new (the noop qdisc when @new is NULL).
+ * The previous child is handed back through @old after its queue length
+ * has been subtracted from the tree and it has been reset; all of this
+ * happens under the tree lock.  Always returns 0.
+ */
+static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		     struct Qdisc **old)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+
+	if (new == NULL)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	*old = q->qdisc;
+	q->qdisc = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+/* Return the inner qdisc; @arg is not consulted. */
+static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return ((struct tbf_sched_data *)qdisc_priv(sch))->qdisc;
+}
+
+/* Class lookup: every classid maps to the constant class handle 1. */
+static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
+{
+	return 1;
+}
+
+/* Counterpart of tbf_get(); intentionally empty, nothing is released. */
+static void tbf_put(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+/*
+ * Walk the (single) class with handle 1.  The callback is invoked only
+ * when the walker has consumed its skip count; a negative callback result
+ * sets walker->stop and leaves walker->count untouched.
+ */
+static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+	if (walker->stop)
+		return;
+
+	if (walker->count >= walker->skip &&
+	    walker->fn(sch, 1, walker) < 0) {
+		walker->stop = 1;
+		return;
+	}
+	walker->count++;
+}
+
+/* Class operations of TBF (referenced from tbf_qdisc_ops.cl_ops). */
+static const struct Qdisc_class_ops tbf_class_ops = {
+	.graft		=	tbf_graft,
+	.leaf		=	tbf_leaf,
+	.get		=	tbf_get,
+	.put		=	tbf_put,
+	.walk		=	tbf_walk,
+	.dump		=	tbf_dump_class,
+};
+
+/*
+ * Qdisc operations for the "tbf" scheduler.  .peek uses the generic
+ * qdisc_peek_dequeued helper; .init and .change both end up in
+ * tbf_change().
+ */
+static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
+	.next		=	NULL,
+	.cl_ops		=	&tbf_class_ops,
+	.id		=	"tbf",
+	.priv_size	=	sizeof(struct tbf_sched_data),
+	.enqueue	=	tbf_enqueue,
+	.dequeue	=	tbf_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	tbf_drop,
+	.init		=	tbf_init,
+	.reset		=	tbf_reset,
+	.destroy	=	tbf_destroy,
+	.change		=	tbf_change,
+	.dump		=	tbf_dump,
+	.owner		=	THIS_MODULE,
+};
+
+/* Module load: register the "tbf" qdisc; returns register_qdisc()'s result. */
+static int __init tbf_module_init(void)
+{
+	return register_qdisc(&tbf_qdisc_ops);
+}
+
+/* Module unload: remove the "tbf" qdisc registration again. */
+static void __exit tbf_module_exit(void)
+{
+	unregister_qdisc(&tbf_qdisc_ops);
+}
+module_init(tbf_module_init)
+module_exit(tbf_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
new file mode 100644
index 00000000..4f4c52c0
--- /dev/null
+++ b/net/sched/sch_teql.c
@@ -0,0 +1,538 @@
+/* net/sched/sch_teql.c	"True" (or "trivial") link equalizer.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/moduleparam.h>
+#include <net/dst.h>
+#include <net/neighbour.h>
+#include <net/pkt_sched.h>
+
+/*
+   How to set it up.
+   ----------------
+
+   After loading this module you will find a new device teqlN
+   and new qdisc with the same name. To join a slave to the equalizer
+   you should just set this qdisc on a device f.e.
+
+   # tc qdisc add dev eth0 root teql0
+   # tc qdisc add dev eth1 root teql0
+
+   That's all. Full PnP 8)
+
+   Applicability.
+   --------------
+
+   1. Slave devices MUST be active devices, i.e., they must raise the tbusy
+      signal and generate EOI events. If you want to equalize virtual devices
+      like tunnels, use a normal eql device.
+   2. This device puts no limitations on physical slave characteristics
+      f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
+      Certainly, large difference in link speeds will make the resulting
+      equalized link unusable, because of huge packet reordering.
+      I estimate an upper useful difference as ~10 times.
+   3. If the slave requires address resolution, only protocols using
+      neighbour cache (IPv4/IPv6) will work over the equalized link.
+      Other protocols are still allowed to use the slave device directly,
+      which will not break load balancing, though native slave
+      traffic will have the highest priority.  */
+
+/*
+ * Private data of one teqlN master device.  The embedded Qdisc_ops is
+ * what gets registered as the "teqlN" qdisc; teql_qdisc_init() casts
+ * sch->ops back to this structure, which relies on qops being the first
+ * member.
+ */
+struct teql_master {
+	struct Qdisc_ops qops;		/* must stay first, see above */
+	struct net_device *dev;		/* the master net device */
+	struct Qdisc *slaves;		/* current position in the circular slave list */
+	struct list_head master_list;	/* link in master_dev_list */
+	unsigned long	tx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_errors;
+	unsigned long	tx_dropped;
+};
+
+/* Per-slave qdisc private data. */
+struct teql_sched_data {
+	struct Qdisc *next;		/* next slave in the circular list (NEXT_SLAVE) */
+	struct teql_master *m;		/* owning master, set in teql_qdisc_init() */
+	struct neighbour *ncache;	/* neighbour cached by __teql_resolve() */
+	struct sk_buff_head q;		/* packets queued on this slave */
+};
+
+/* Successor of slave qdisc q in the circular list; usable as an lvalue. */
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
+
+/* Interface flags the master takes over from its slaves. */
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
+
+/* "teql*" qdisc routines */
+
+/*
+ * Queue a packet on this slave.  Once the private queue holds
+ * dev->tx_queue_len packets, further packets are freed, counted as drops
+ * and NET_XMIT_DROP is returned.
+ */
+static int
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct teql_sched_data *priv = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (priv->q.qlen >= dev->tx_queue_len) {
+		kfree_skb(skb);
+		sch->qstats.drops++;
+		return NET_XMIT_DROP;
+	}
+
+	__skb_queue_tail(&priv->q, skb);
+	return NET_XMIT_SUCCESS;
+}
+
+/*
+ * Take the next packet from this slave's private queue.  When that queue
+ * is empty, this slave becomes the master's current slave and the
+ * master's queue is woken.  The reported queue length is the sum of the
+ * private queue and the qdisc on the master's tx queue 0.
+ */
+static struct sk_buff *
+teql_dequeue(struct Qdisc *sch)
+{
+	struct teql_sched_data *dat = qdisc_priv(sch);
+	struct netdev_queue *dat_queue;
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&dat->q);
+	dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
+	if (skb == NULL) {
+		/* m is the device owning the master's qdisc, not a teql_master. */
+		struct net_device *m = qdisc_dev(dat_queue->qdisc);
+		if (m) {
+			dat->m->slaves = sch;
+			netif_wake_queue(m);
+		}
+	} else {
+		qdisc_bstats_update(sch, skb);
+	}
+	sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
+	return skb;
+}
+
+/* Peeking is not supported: always reports "no packet". */
+static struct sk_buff *
+teql_peek(struct Qdisc *sch)
+{
+	/* teql is meant to be used as root qdisc */
+	return NULL;
+}
+
+/* neigh_release() wrapper that tolerates a NULL neighbour. */
+static inline void
+teql_neigh_release(struct neighbour *n)
+{
+	if (!n)
+		return;
+	neigh_release(n);
+}
+
+/*
+ * Flush this slave: free all privately queued packets, zero the queue
+ * length and drop the cached neighbour (the cache slot is cleared
+ * atomically via xchg before the reference is released).
+ */
+static void
+teql_reset(struct Qdisc *sch)
+{
+	struct teql_sched_data *dat = qdisc_priv(sch);
+
+	skb_queue_purge(&dat->q);
+	sch->q.qlen = 0;
+	teql_neigh_release(xchg(&dat->ncache, NULL));
+}
+
+/*
+ * Unlink this slave from its master's circular slave list.
+ *
+ * The list is walked starting at master->slaves, looking for the
+ * predecessor of @sch.  If @sch was the master's current slave, the
+ * master moves on to the successor; if @sch was the only slave, the list
+ * becomes empty and the qdisc on the master's tx queue 0 is reset under
+ * its root lock.  Finally the private queue is purged and the cached
+ * neighbour released.  Nothing happens if @sch is not found in the list.
+ */
+static void
+teql_destroy(struct Qdisc *sch)
+{
+	struct Qdisc *q, *prev;
+	struct teql_sched_data *dat = qdisc_priv(sch);
+	struct teql_master *master = dat->m;
+
+	prev = master->slaves;
+	if (prev) {
+		do {
+			q = NEXT_SLAVE(prev);
+			if (q == sch) {
+				/* Bypass sch in the ring. */
+				NEXT_SLAVE(prev) = NEXT_SLAVE(q);
+				if (q == master->slaves) {
+					master->slaves = NEXT_SLAVE(q);
+					/* Still pointing at sch: it was the only slave. */
+					if (q == master->slaves) {
+						struct netdev_queue *txq;
+						spinlock_t *root_lock;
+
+						txq = netdev_get_tx_queue(master->dev, 0);
+						master->slaves = NULL;
+
+						root_lock = qdisc_root_sleeping_lock(txq->qdisc);
+						spin_lock_bh(root_lock);
+						qdisc_reset(txq->qdisc);
+						spin_unlock_bh(root_lock);
+					}
+				}
+				skb_queue_purge(&dat->q);
+				teql_neigh_release(xchg(&dat->ncache, NULL));
+				break;
+			}
+
+		} while ((prev = q) != master->slaves);
+	}
+}
+
+/*
+ * Attach the device owning @sch as a slave of the master that registered
+ * this qdisc type.
+ *
+ * Returns -EINVAL if the slave needs more header room than the master
+ * provides or, with the master up, if the slave lacks a flag the master
+ * has or has a smaller MTU; -ELOOP if the master would enslave itself;
+ * 0 on success.  @opt is not used.
+ */
+static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	/* sch->ops is the Qdisc_ops embedded at the start of the master. */
+	struct teql_master *m = (struct teql_master *)sch->ops;
+	struct teql_sched_data *q = qdisc_priv(sch);
+
+	if (dev->hard_header_len > m->dev->hard_header_len)
+		return -EINVAL;
+
+	if (m->dev == dev)
+		return -ELOOP;
+
+	q->m = m;
+
+	skb_queue_head_init(&q->q);
+
+	if (m->slaves) {
+		if (m->dev->flags & IFF_UP) {
+			/* Master is running: the slave must fit its current settings. */
+			if ((m->dev->flags & IFF_POINTOPOINT &&
+			     !(dev->flags & IFF_POINTOPOINT)) ||
+			    (m->dev->flags & IFF_BROADCAST &&
+			     !(dev->flags & IFF_BROADCAST)) ||
+			    (m->dev->flags & IFF_MULTICAST &&
+			     !(dev->flags & IFF_MULTICAST)) ||
+			    dev->mtu < m->dev->mtu)
+				return -EINVAL;
+		} else {
+			/* Master is down: narrow its settings to what the slave offers. */
+			if (!(dev->flags&IFF_POINTOPOINT))
+				m->dev->flags &= ~IFF_POINTOPOINT;
+			if (!(dev->flags&IFF_BROADCAST))
+				m->dev->flags &= ~IFF_BROADCAST;
+			if (!(dev->flags&IFF_MULTICAST))
+				m->dev->flags &= ~IFF_MULTICAST;
+			if (dev->mtu < m->dev->mtu)
+				m->dev->mtu = dev->mtu;
+		}
+		/* Insert right after the master's current slave. */
+		q->next = NEXT_SLAVE(m->slaves);
+		NEXT_SLAVE(m->slaves) = sch;
+	} else {
+		/* First slave: one-element ring, master inherits MTU and FMASK flags. */
+		q->next = sch;
+		m->slaves = sch;
+		m->dev->mtu = dev->mtu;
+		m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
+	}
+	return 0;
+}
+
+
+/*
+ * Resolve the link-layer address for @skb on slave @dev, using the
+ * master-side neighbour @mn as the lookup key, and build the hard header.
+ *
+ * Return values:
+ *   0        header built; the neighbour is now cached in q->ncache
+ *   1        resolution pending and @skb_res was passed to neigh_event_send()
+ *   -EAGAIN  resolution pending and no @skb_res was supplied
+ *   -EINVAL  @mn has no table, or dev_hard_header() failed
+ *   other    error from __neigh_lookup_errno()
+ */
+static int
+__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
+	       struct net_device *dev, struct netdev_queue *txq,
+	       struct neighbour *mn)
+{
+	struct teql_sched_data *q = qdisc_priv(txq->qdisc);
+	struct neighbour *n = q->ncache;
+
+	if (mn->tbl == NULL)
+		return -EINVAL;
+	/* Reuse the cached neighbour when table and key match, else look it up. */
+	if (n && n->tbl == mn->tbl &&
+	    memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
+		atomic_inc(&n->refcnt);
+	} else {
+		n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
+		if (IS_ERR(n))
+			return PTR_ERR(n);
+	}
+	/* From here on we hold one reference on n. */
+	if (neigh_event_send(n, skb_res) == 0) {
+		int err;
+		char haddr[MAX_ADDR_LEN];
+
+		neigh_ha_snapshot(haddr, n, dev);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
+				      NULL, skb->len);
+
+		if (err < 0) {
+			neigh_release(n);
+			return -EINVAL;
+		}
+		/* Our reference moves into the cache; the old entry is released. */
+		teql_neigh_release(xchg(&q->ncache, n));
+		return 0;
+	}
+	neigh_release(n);
+	return (skb_res == NULL) ? -EAGAIN : 1;
+}
+
+/*
+ * Front end for __teql_resolve().  Returns -ENODEV when the slave queue
+ * carries the noop qdisc, 0 when no resolution is needed (device without
+ * header_ops, packet without dst, or dst without neighbour), otherwise
+ * the result of __teql_resolve(), which runs under rcu_read_lock().
+ */
+static inline int teql_resolve(struct sk_buff *skb,
+			       struct sk_buff *skb_res,
+			       struct net_device *dev,
+			       struct netdev_queue *txq)
+{
+	struct dst_entry *route = skb_dst(skb);
+	struct neighbour *mn;
+	int res = 0;
+
+	if (txq->qdisc == &noop_qdisc)
+		return -ENODEV;
+
+	if (!dev->header_ops || !route)
+		return 0;
+
+	rcu_read_lock();
+	mn = dst_get_neighbour(route);
+	if (mn)
+		res = __teql_resolve(skb, skb_res, dev, txq, mn);
+	rcu_read_unlock();
+
+	return res;
+}
+
+/*
+ * Transmit handler of the master device: try the slaves round-robin,
+ * starting at master->slaves, until one of them accepts the packet.
+ *
+ * The ring is scanned at most twice.  On the first pass skb_res is NULL,
+ * so a slave whose neighbour is unresolved only sets "nores"; if no slave
+ * took the packet, the second pass hands the skb itself to the neighbour
+ * code as skb_res.
+ *
+ * Returns NETDEV_TX_OK when the packet was sent, handed to neighbour
+ * resolution, or dropped; NETDEV_TX_BUSY (with the master queue stopped)
+ * when some slave was merely busy.
+ */
+static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct teql_master *master = netdev_priv(dev);
+	struct Qdisc *start, *q;
+	int busy;
+	int nores;
+	int subq = skb_get_queue_mapping(skb);
+	struct sk_buff *skb_res = NULL;
+
+	start = master->slaves;
+
+restart:
+	nores = 0;
+	busy = 0;
+
+	q = start;
+	if (!q)
+		goto drop;
+
+	do {
+		struct net_device *slave = qdisc_dev(q);
+		struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
+		const struct net_device_ops *slave_ops = slave->netdev_ops;
+
+		/*
+		 * Note: "continue" in a do-while jumps to the loop condition,
+		 * so the walk still advances to the next slave.
+		 */
+		if (slave_txq->qdisc_sleeping != q)
+			continue;
+		if (__netif_subqueue_stopped(slave, subq) ||
+		    !netif_running(slave)) {
+			busy = 1;
+			continue;
+		}
+
+		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
+		case 0:
+			/* Header is in place: try to push the packet out directly. */
+			if (__netif_tx_trylock(slave_txq)) {
+				unsigned int length = qdisc_pkt_len(skb);
+
+				if (!netif_tx_queue_frozen_or_stopped(slave_txq) &&
+				    slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
+					txq_trans_update(slave_txq);
+					__netif_tx_unlock(slave_txq);
+					/* Next packet starts with the following slave. */
+					master->slaves = NEXT_SLAVE(q);
+					netif_wake_queue(dev);
+					master->tx_packets++;
+					master->tx_bytes += length;
+					return NETDEV_TX_OK;
+				}
+				__netif_tx_unlock(slave_txq);
+			}
+			if (netif_queue_stopped(dev))
+				busy = 1;
+			break;
+		case 1:
+			/* skb was handed over as skb_res; it is not freed here. */
+			master->slaves = NEXT_SLAVE(q);
+			return NETDEV_TX_OK;
+		default:
+			nores = 1;
+			break;
+		}
+		/* Strip whatever lies in front of the network header before retrying. */
+		__skb_pull(skb, skb_network_offset(skb));
+	} while ((q = NEXT_SLAVE(q)) != start);
+
+	if (nores && skb_res == NULL) {
+		skb_res = skb;
+		goto restart;
+	}
+
+	if (busy) {
+		netif_stop_queue(dev);
+		return NETDEV_TX_BUSY;
+	}
+	master->tx_errors++;
+
+drop:
+	master->tx_dropped++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/*
+ * Bring the master up.  Requires at least one slave (-EUNATCH otherwise,
+ * also when a slave qdisc has no device) and rejects slaves whose header
+ * is longer than LL_MAX_HEADER (-EINVAL).  The master's MTU becomes the
+ * smallest slave MTU (capped at 0xFFFE) and its FMASK flags become the
+ * intersection of the slaves' flags.
+ */
+static int teql_master_open(struct net_device *dev)
+{
+	struct teql_master *m = netdev_priv(dev);
+	unsigned int flags = FMASK;
+	int mtu = 0xFFFE;
+	struct Qdisc *q;
+
+	if (m->slaves == NULL)
+		return -EUNATCH;
+
+	q = m->slaves;
+	do {
+		struct net_device *slave = qdisc_dev(q);
+
+		if (slave == NULL)
+			return -EUNATCH;
+		if (slave->hard_header_len > LL_MAX_HEADER)
+			return -EINVAL;
+
+		if (slave->mtu < mtu)
+			mtu = slave->mtu;
+
+		/*
+		 * A flag survives only if every slave has it: all BROADCAST
+		 * slaves give a BROADCAST master, all PtP slaves a PtP
+		 * master, anything else leaves the master NBMA.
+		 */
+		if (!(slave->flags & IFF_POINTOPOINT))
+			flags &= ~IFF_POINTOPOINT;
+		if (!(slave->flags & IFF_BROADCAST))
+			flags &= ~IFF_BROADCAST;
+		if (!(slave->flags & IFF_MULTICAST))
+			flags &= ~IFF_MULTICAST;
+
+		q = NEXT_SLAVE(q);
+	} while (q != m->slaves);
+
+	m->dev->mtu = mtu;
+	m->dev->flags = (m->dev->flags & ~FMASK) | flags;
+	netif_start_queue(m->dev);
+	return 0;
+}
+
+/* Take the master down: just stop its transmit queue.  Always returns 0. */
+static int teql_master_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	return 0;
+}
+
+/*
+ * Copy the four transmit counters kept in the master's private data into
+ * @stats and return @stats; no other field of @stats is written.
+ */
+static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
+						     struct rtnl_link_stats64 *stats)
+{
+	struct teql_master *m = netdev_priv(dev);
+
+	stats->tx_packets	= m->tx_packets;
+	stats->tx_bytes		= m->tx_bytes;
+	stats->tx_errors	= m->tx_errors;
+	stats->tx_dropped	= m->tx_dropped;
+	return stats;
+}
+
+/*
+ * Change the master's MTU.  Values below 68 and values larger than the
+ * MTU of any slave are refused with -EINVAL.
+ */
+static int teql_master_mtu(struct net_device *dev, int new_mtu)
+{
+	struct teql_master *m = netdev_priv(dev);
+	struct Qdisc *q = m->slaves;
+
+	if (new_mtu < 68)
+		return -EINVAL;
+
+	if (q) {
+		/* One full turn around the circular slave list. */
+		for (;;) {
+			if (new_mtu > qdisc_dev(q)->mtu)
+				return -EINVAL;
+			q = NEXT_SLAVE(q);
+			if (q == m->slaves)
+				break;
+		}
+	}
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/* Net device operations of the teqlN master device. */
+static const struct net_device_ops teql_netdev_ops = {
+	.ndo_open	= teql_master_open,
+	.ndo_stop	= teql_master_close,
+	.ndo_start_xmit	= teql_master_xmit,
+	.ndo_get_stats64 = teql_master_stats64,
+	.ndo_change_mtu	= teql_master_mtu,
+};
+
+/*
+ * alloc_netdev() setup callback: fills in the Qdisc_ops embedded in the
+ * master's private data and gives the net device its defaults.  The
+ * qdisc id is not set here; teql_init() copies the device name into it
+ * after registration.
+ */
+static __init void teql_master_setup(struct net_device *dev)
+{
+	struct teql_master *master = netdev_priv(dev);
+	struct Qdisc_ops *ops = &master->qops;
+
+	master->dev	= dev;
+	ops->priv_size  = sizeof(struct teql_sched_data);
+
+	ops->enqueue	=	teql_enqueue;
+	ops->dequeue	=	teql_dequeue;
+	ops->peek	=	teql_peek;
+	ops->init	=	teql_qdisc_init;
+	ops->reset	=	teql_reset;
+	ops->destroy	=	teql_destroy;
+	ops->owner	=	THIS_MODULE;
+
+	dev->netdev_ops =       &teql_netdev_ops;
+	dev->type		= ARPHRD_VOID;
+	dev->mtu		= 1500;
+	dev->tx_queue_len	= 100;
+	dev->flags		= IFF_NOARP;
+	dev->hard_header_len	= LL_MAX_HEADER;
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
+}
+
+/* All successfully registered masters; filled by teql_init(), drained by teql_exit(). */
+static LIST_HEAD(master_dev_list);
+/* Number of teqlN devices teql_init() tries to create; permission 0 on the parameter. */
+static int max_equalizers = 1;
+module_param(max_equalizers, int, 0);
+MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
+
+/*
+ * Create up to max_equalizers master devices, each with a qdisc type
+ * named after the device.  Creation stops at the first failure; the
+ * module still loads (returns 0) if at least one master was set up,
+ * otherwise the error of the failed step is returned (-ENODEV when the
+ * loop never ran).
+ */
+static int __init teql_init(void)
+{
+	int err = -ENODEV;
+	int i;
+
+	for (i = 0; i < max_equalizers; i++) {
+		struct teql_master *master;
+		struct net_device *dev;
+
+		dev = alloc_netdev(sizeof(struct teql_master),
+				  "teql%d", teql_master_setup);
+		if (dev == NULL) {
+			err = -ENOMEM;
+			break;
+		}
+
+		err = register_netdev(dev);
+		if (err) {
+			free_netdev(dev);
+			break;
+		}
+
+		/* The qdisc is named after the final device name. */
+		master = netdev_priv(dev);
+		strlcpy(master->qops.id, dev->name, IFNAMSIZ);
+
+		err = register_qdisc(&master->qops);
+		if (err) {
+			unregister_netdev(dev);
+			free_netdev(dev);
+			break;
+		}
+
+		list_add_tail(&master->master_list, &master_dev_list);
+	}
+
+	if (i != 0)
+		return 0;
+	return err;
+}
+
+/*
+ * Module unload: for every registered master, remove it from the list,
+ * unregister its qdisc type, then unregister and free its net device.
+ */
+static void __exit teql_exit(void)
+{
+	struct teql_master *master, *nxt;
+
+	list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
+		/* master lives inside the netdev, so fetch dev before tearing down. */
+		struct net_device *dev = master->dev;
+
+		list_del(&master->master_list);
+
+		unregister_qdisc(&master->qops);
+		unregister_netdev(dev);
+		free_netdev(dev);
+	}
+}
+
+module_init(teql_init);
+module_exit(teql_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
new file mode 100644
index 00000000..126b014e
--- /dev/null
+++ b/net/sctp/Kconfig
@@ -0,0 +1,100 @@
+#
+# SCTP configuration
+#
+
+menuconfig IP_SCTP
+	tristate "The SCTP Protocol (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	depends on IPV6 || IPV6=n
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA1
+	select CRYPTO_MD5 if SCTP_HMAC_MD5
+	select LIBCRC32C
+	---help---
+	  Stream Control Transmission Protocol
+
+	  From RFC 2960 <http://www.ietf.org/rfc/rfc2960.txt>.
+
+	  "SCTP is a reliable transport protocol operating on top of a
+	  connectionless packet network such as IP.  It offers the following
+	  services to its users:
+
+	  -- acknowledged error-free non-duplicated transfer of user data,
+	  -- data fragmentation to conform to discovered path MTU size,
+	  -- sequenced delivery of user messages within multiple streams,
+	  with an option for order-of-arrival delivery of individual user
+	  messages,
+	  -- optional bundling of multiple user messages into a single SCTP
+	  packet, and
+	  -- network-level fault tolerance through supporting of multi-
+	  homing at either or both ends of an association."
+
+	  To compile this protocol support as a module, choose M here: the
+	  module will be called sctp.
+
+	  If in doubt, say N.
+
+if IP_SCTP
+
+config NET_SCTPPROBE
+	tristate "SCTP: Association probing"
+        depends on PROC_FS && KPROBES
+        ---help---
+        This module allows for capturing the changes to SCTP association
+        state in response to incoming packets. It is used for debugging
+        SCTP congestion control algorithms. If you don't understand
+        what was just said, you don't need it: say N.
+
+        To compile this code as a module, choose M here: the
+        module will be called sctp_probe.
+
+config SCTP_DBG_MSG
+	bool "SCTP: Debug messages"
+	help
+	  If you say Y, this will enable verbose debugging messages. 
+
+	  If unsure, say N.  However, if you are running into problems, use 
+	  this option to gather detailed trace information
+
+config SCTP_DBG_OBJCNT
+	bool "SCTP: Debug object counts"
+	depends on PROC_FS
+	help
+	  If you say Y, this will enable debugging support for counting the 
+	  type of objects that are currently allocated.  This is useful for 
+	  identifying memory leaks. This debug information can be viewed by
+	  'cat /proc/net/sctp/sctp_dbg_objcnt'
+
+	  If unsure, say N
+
+choice
+	prompt "SCTP: Cookie HMAC Algorithm"
+	default SCTP_HMAC_MD5
+	help
+	  HMAC algorithm to be used during association initialization.  It
+	  is strongly recommended to use HMAC-SHA1 or HMAC-MD5.  See 
+	  configuration for Cryptographic API and enable those algorithms
+	  to make them usable by SCTP.
+
+config SCTP_HMAC_NONE
+	bool "None"
+	help 
+	  Choosing this disables the use of an HMAC during association 
+	  establishment.  It is advised to use either HMAC-MD5 or HMAC-SHA1.
+
+config SCTP_HMAC_SHA1
+	bool "HMAC-SHA1"
+	help 
+	  Enable the use of HMAC-SHA1 during association establishment.  It 
+	  is advised to use either HMAC-MD5 or HMAC-SHA1.
+
+config SCTP_HMAC_MD5
+	bool "HMAC-MD5"
+	help
+	  Enable the use of HMAC-MD5 during association establishment.  It is 
+	  advised to use either HMAC-MD5 or HMAC-SHA1.
+
+endchoice
+
+endif # IP_SCTP
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
new file mode 100644
index 00000000..5c30b7a8
--- /dev/null
+++ b/net/sctp/Makefile
@@ -0,0 +1,21 @@
+#
+# Makefile for SCTP support code.
+#
+
+obj-$(CONFIG_IP_SCTP) += sctp.o
+obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
+
+sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
+	  protocol.o endpointola.o associola.o \
+	  transport.o chunk.o sm_make_chunk.o ulpevent.o \
+	  inqueue.o outqueue.o ulpqueue.o command.o \
+	  tsnmap.o bind_addr.o socket.o primitive.o \
+	  output.o input.o debug.o ssnmap.o auth.o
+
+sctp_probe-y := probe.o
+
+sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
+sctp-$(CONFIG_PROC_FS) += proc.o
+sctp-$(CONFIG_SYSCTL) += sysctl.o
+
+sctp-$(subst m,y,$(CONFIG_IPV6))	+= ipv6.o
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
new file mode 100644
index 00000000..17a6e658
--- /dev/null
+++ b/net/sctp/associola.c
@@ -0,0 +1,1653 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This module provides the abstraction for an SCTP association.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Xingang Guo           <xingang.guo@intel.com>
+ *    Hui Huang             <hui.huang@nokia.com>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *    Daisy Chang	    <daisyc@us.ibm.com>
+ *    Ryan Layer	    <rmlayer@us.ibm.com>
+ *    Kevin Gao             <kevin.gao@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal functions. */
+static void sctp_assoc_bh_rcv(struct work_struct *work);
+static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc);
+static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc);
+
+/* Keep track of the new idr low so that we don't re-use association id
+ * numbers too fast.  It is protected by the idr spin lock and is in the
+ * range of 1 - INT_MAX.
+ */
+static u32 idr_low = 1;
+
+
+/* 1st Level Abstractions. */
+
+/* Initialize a new association from provided memory; NULL on failure. */
+static struct sctp_association *sctp_association_init(struct sctp_association *asoc,
+					  const struct sctp_endpoint *ep,
+					  const struct sock *sk,
+					  sctp_scope_t scope,
+					  gfp_t gfp)
+{
+	struct sctp_sock *sp;
+	int i;
+	sctp_paramhdr_t *p;
+	int err;
+
+	/* Retrieve the SCTP per socket area.  */
+	sp = sctp_sk((struct sock *)sk);
+
+	/* Discarding const is appropriate here.  */
+	asoc->ep = (struct sctp_endpoint *)ep;
+	sctp_endpoint_hold(asoc->ep);
+
+	/* Hold the sock.  */
+	asoc->base.sk = (struct sock *)sk;
+	sock_hold(asoc->base.sk);
+
+	/* Initialize the common base substructure.  */
+	asoc->base.type = SCTP_EP_TYPE_ASSOCIATION;
+
+	/* Initialize the object handling fields.  */
+	atomic_set(&asoc->base.refcnt, 1);
+	asoc->base.dead = 0;
+	asoc->base.malloced = 0;
+
+	/* Initialize the bind addr area.  */
+	sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port);
+
+	asoc->state = SCTP_STATE_CLOSED;
+
+	/* Set these values from the socket values, a conversion between
+	 * milliseconds to seconds/microseconds must also be done.
+	 */
+	asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000;
+	asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000)
+					* 1000;
+	asoc->frag_point = 0;
+	asoc->user_frag = sp->user_frag;
+
+	/* Set the association max_retrans and RTO values from the
+	 * socket values.
+	 */
+	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
+	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
+	asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
+
+	asoc->overall_error_count = 0;
+
+	/* Initialize the association's heartbeat interval based on the
+	 * sock configured value.
+	 */
+	asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
+
+	/* Initialize path max retrans value. */
+	asoc->pathmaxrxt = sp->pathmaxrxt;
+
+	/* Initialize default path MTU. */
+	asoc->pathmtu = sp->pathmtu;
+
+	/* Set association default SACK delay */
+	asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
+	asoc->sackfreq = sp->sackfreq;
+
+	/* Set the association default flags controlling
+	 * Heartbeat, SACK delay, and Path MTU Discovery.
+	 */
+	asoc->param_flags = sp->param_flags;
+
+	/* Initialize the maximum number of new data packets that can be sent
+	 * in a burst.
+	 */
+	asoc->max_burst = sp->max_burst;
+
+	/* Initialize the association timeout values. */
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
+
+	/* sctpimpguide Section 2.12.2
+	 * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
+	 * recommended value of 5 times 'RTO.Max'.
+	 */
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
+		= 5 * asoc->rto_max;
+
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
+		min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ;
+
+	/* Set up one timer per timeout type, each carrying asoc as its data. */
+	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
+		setup_timer(&asoc->timers[i], sctp_timer_events[i],
+				(unsigned long)asoc);
+
+	/* Pull default initialization values from the sock options.
+	 * Note: This assumes that the values have already been
+	 * validated in the sock.
+	 */
+	asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams;
+	asoc->c.sinit_num_ostreams  = sp->initmsg.sinit_num_ostreams;
+	asoc->max_init_attempts	= sp->initmsg.sinit_max_attempts;
+
+	asoc->max_init_timeo =
+		 msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo);
+
+	/* Allocate storage for the ssnmap after the inbound and outbound
+	 * streams have been negotiated during Init.
+	 */
+	asoc->ssnmap = NULL;
+
+	/* Set the local window size for receive.
+	 * This is also the rcvbuf space per association.
+	 * RFC 2960 Section 6 - An SCTP receiver MUST be able to receive a
+	 * minimum of 1500 bytes in one SCTP packet.
+	 */
+	if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
+		asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
+	else
+		asoc->rwnd = sk->sk_rcvbuf/2;
+
+	asoc->a_rwnd = asoc->rwnd;
+
+	asoc->rwnd_over = 0;
+	asoc->rwnd_press = 0;
+
+	/* Use my own max window until I learn something better.  */
+	asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW;
+
+	/* Set the sndbuf size for transmit.  */
+	asoc->sndbuf_used = 0;
+
+	/* Initialize the receive memory counter */
+	atomic_set(&asoc->rmem_alloc, 0);
+
+	init_waitqueue_head(&asoc->wait);
+
+	asoc->c.my_vtag = sctp_generate_tag(ep);
+	asoc->peer.i.init_tag = 0;     /* INIT needs a vtag of 0. */
+	asoc->c.peer_vtag = 0;
+	asoc->c.my_ttag   = 0;
+	asoc->c.peer_ttag = 0;
+	asoc->c.my_port = ep->base.bind_addr.port;
+
+	asoc->c.initial_tsn = sctp_generate_tsn(ep);
+
+	asoc->next_tsn = asoc->c.initial_tsn;
+
+	asoc->ctsn_ack_point = asoc->next_tsn - 1;
+	asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
+	asoc->highest_sacked = asoc->ctsn_ack_point;
+	asoc->last_cwr_tsn = asoc->ctsn_ack_point;
+	asoc->unack_data = 0;
+
+	/* ADDIP Section 4.1 Asconf Chunk Procedures
+	 *
+	 * When an endpoint has an ASCONF signaled change to be sent to the
+	 * remote endpoint it should do the following:
+	 * ...
+	 * A2) a serial number should be assigned to the chunk. The serial
+	 * number SHOULD be a monotonically increasing number. The serial
+	 * numbers SHOULD be initialized at the start of the
+	 * association to the same value as the initial TSN.
+	 */
+	asoc->addip_serial = asoc->c.initial_tsn;
+
+	INIT_LIST_HEAD(&asoc->addip_chunk_list);
+	INIT_LIST_HEAD(&asoc->asconf_ack_list);
+
+	/* Make an empty list of remote transport addresses.  */
+	INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
+	asoc->peer.transport_count = 0;
+
+	/* RFC 2960 5.1 Normal Establishment of an Association
+	 *
+	 * After the reception of the first data chunk in an
+	 * association the endpoint must immediately respond with a
+	 * sack to acknowledge the data chunk.  Subsequent
+	 * acknowledgements should be done as described in Section
+	 * 6.2.
+	 *
+	 * [We implement this by telling a new association that it
+	 * already received one packet.]
+	 */
+	asoc->peer.sack_needed = 1;
+	asoc->peer.sack_cnt = 0;
+
+	/* Assume that the peer will tell us if he recognizes ASCONF
+	 * as part of INIT exchange.
+	 * The sctp_addip_noauth option is there for backward compatibility
+	 * and will revert to the old behavior.
+	 */
+	asoc->peer.asconf_capable = 0;
+	if (sctp_addip_noauth)
+		asoc->peer.asconf_capable = 1;
+
+	/* Create an input queue.  */
+	sctp_inq_init(&asoc->base.inqueue);
+	sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv);
+
+	/* Create an output queue.  */
+	sctp_outq_init(asoc, &asoc->outqueue);
+
+	if (!sctp_ulpq_init(&asoc->ulpq, asoc))
+		goto fail_init;
+
+	memset(&asoc->peer.tsn_map, 0, sizeof(struct sctp_tsnmap));
+
+	asoc->need_ecne = 0;
+
+	asoc->assoc_id = 0;
+
+	/* Assume that peer would support both address types unless we are
+	 * told otherwise.
+	 */
+	asoc->peer.ipv4_address = 1;
+	if (asoc->base.sk->sk_family == PF_INET6)
+		asoc->peer.ipv6_address = 1;
+	INIT_LIST_HEAD(&asoc->asocs);
+
+	asoc->autoclose = sp->autoclose;
+
+	asoc->default_stream = sp->default_stream;
+	asoc->default_ppid = sp->default_ppid;
+	asoc->default_flags = sp->default_flags;
+	asoc->default_context = sp->default_context;
+	asoc->default_timetolive = sp->default_timetolive;
+	asoc->default_rcv_context = sp->default_rcv_context;
+
+	/* AUTH related initializations */
+	INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
+	err = sctp_auth_asoc_copy_shkeys(ep, asoc, gfp);
+	if (err)
+		goto fail_init;
+
+	asoc->active_key_id = ep->active_key_id;
+	asoc->asoc_shared_key = NULL;
+
+	asoc->default_hmac_id = 0;
+	/* Save the hmacs and chunks list into this association */
+	if (ep->auth_hmacs_list)
+		memcpy(asoc->c.auth_hmacs, ep->auth_hmacs_list,
+			ntohs(ep->auth_hmacs_list->param_hdr.length));
+	if (ep->auth_chunk_list)
+		memcpy(asoc->c.auth_chunks, ep->auth_chunk_list,
+			ntohs(ep->auth_chunk_list->param_hdr.length));
+
+	/* Get the AUTH random number for this association */
+	p = (sctp_paramhdr_t *)asoc->c.auth_random;
+	p->type = SCTP_PARAM_RANDOM;
+	p->length = htons(sizeof(sctp_paramhdr_t) + SCTP_AUTH_RANDOM_LENGTH);
+	get_random_bytes(p+1, SCTP_AUTH_RANDOM_LENGTH);
+
+	return asoc;
+
+fail_init:
+	sctp_endpoint_put(asoc->ep);
+	sock_put(asoc->base.sk);
+	return NULL;
+}
+
+/* Allocate a new association and initialize it.
+ * Returns the association, or NULL if allocation or initialization failed.
+ */
+struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
+					 const struct sock *sk,
+					 sctp_scope_t scope,
+					 gfp_t gfp)
+{
+	struct sctp_association *asoc = t_new(struct sctp_association, gfp);
+
+	if (!asoc)
+		return NULL;
+
+	if (!sctp_association_init(asoc, ep, sk, scope, gfp)) {
+		/* Initialization failed; release the allocation. */
+		kfree(asoc);
+		return NULL;
+	}
+
+	/* Flag the association as heap allocated. */
+	asoc->base.malloced = 1;
+	SCTP_DBG_OBJCNT_INC(assoc);
+	SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc);
+
+	return asoc;
+}
+
+/* Free this association if possible.  There may still be users, so
+ * the actual deallocation may be delayed.
+ */
+void sctp_association_free(struct sctp_association *asoc)
+{
+	struct sock *sk = asoc->base.sk;
+	struct sctp_transport *transport;
+	struct list_head *pos, *temp;
+	int i;
+
+	/* Only real associations count against the endpoint, so
+	 * don't bother if this is a temporary association.
+	 */
+	if (!asoc->temp) {
+		list_del(&asoc->asocs);
+
+		/* Decrement the backlog value for a TCP-style listening
+		 * socket.
+		 */
+		if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+			sk->sk_ack_backlog--;
+	}
+
+	/* Mark as dead, so other users can know this structure is
+	 * going away.
+	 */
+	asoc->base.dead = 1;
+
+	/* Dispose of any data lying around in the outqueue. */
+	sctp_outq_free(&asoc->outqueue);
+
+	/* Dispose of any pending messages for the upper layer. */
+	sctp_ulpq_free(&asoc->ulpq);
+
+	/* Dispose of any pending chunks on the inqueue. */
+	sctp_inq_free(&asoc->base.inqueue);
+
+	sctp_tsnmap_free(&asoc->peer.tsn_map);
+
+	/* Free ssnmap storage. */
+	sctp_ssnmap_free(asoc->ssnmap);
+
+	/* Clean up the bound address list. */
+	sctp_bind_addr_free(&asoc->base.bind_addr);
+
+	/* Try to delete every timer rather than guessing from our state
+	 * which ones are running.  For each pending timer that is
+	 * successfully deleted, sctp_association_put() is called to
+	 * drop a reference on the association.
+	 */
+	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
+		if (timer_pending(&asoc->timers[i]) &&
+		    del_timer(&asoc->timers[i]))
+			sctp_association_put(asoc);
+	}
+
+	/* Free the peer's cached cookie, random, chunks and hmacs. */
+	kfree(asoc->peer.cookie);
+	kfree(asoc->peer.peer_random);
+	kfree(asoc->peer.peer_chunks);
+	kfree(asoc->peer.peer_hmacs);
+
+	/* Release the transport structures. */
+	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+		transport = list_entry(pos, struct sctp_transport, transports);
+		list_del(pos);
+		sctp_transport_free(transport);
+	}
+
+	asoc->peer.transport_count = 0;
+
+	sctp_asconf_queue_teardown(asoc);
+
+	/* AUTH - Free the endpoint shared keys */
+	sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);
+
+	/* AUTH - Free the association shared key */
+	sctp_auth_key_put(asoc->asoc_shared_key);
+
+	sctp_association_put(asoc);
+}
+
+/* Final cleanup of a dead association; frees it if it was malloced. */
+static void sctp_association_destroy(struct sctp_association *asoc)
+{
+	SCTP_ASSERT(asoc->base.dead, "Assoc is not dead", return);
+
+	sctp_endpoint_put(asoc->ep);
+	sock_put(asoc->base.sk);
+
+	if (asoc->assoc_id != 0) {
+		spin_lock_bh(&sctp_assocs_id_lock);
+		idr_remove(&sctp_assocs_id, asoc->assoc_id);
+		spin_unlock_bh(&sctp_assocs_id_lock);
+	}
+
+	WARN_ON(atomic_read(&asoc->rmem_alloc));
+
+	if (asoc->base.malloced) {
+		kfree(asoc);
+		SCTP_DBG_OBJCNT_DEC(assoc);
+	}
+}
+
+/* Make @transport the primary destination address for the peer. */
+void sctp_assoc_set_primary(struct sctp_association *asoc,
+			    struct sctp_transport *transport)
+{
+	struct sctp_transport *old_primary = asoc->peer.primary_path;
+	int changeover;
+
+	/* This is a changeover only when a primary path was already set
+	 * and it differs from the transport being installed.
+	 */
+	changeover = (old_primary != NULL && old_primary != transport);
+
+	asoc->peer.primary_path = transport;
+
+	/* Set a default msg_name for events. */
+	memcpy(&asoc->peer.primary_addr, &transport->ipaddr,
+	       sizeof(union sctp_addr));
+
+	/* If the primary path is changing, assume that the user wants to
+	 * use this new path: an ACTIVE or UNKNOWN transport also becomes
+	 * the active path.
+	 */
+	if (transport->state == SCTP_ACTIVE ||
+	    transport->state == SCTP_UNKNOWN)
+		asoc->peer.active_path = transport;
+
+	/* Only bother with the changeover bookkeeping below if we have
+	 * data queued or outstanding on the association.
+	 */
+	if (!asoc->outqueue.outstanding_bytes && !asoc->outqueue.out_qlen)
+		return;
+
+	/*
+	 * SFR-CACC algorithm:
+	 * Upon the receipt of a request to change the primary
+	 * destination address, on the data structure for the new
+	 * primary destination, the sender MUST do the following:
+	 *
+	 * 1) If CHANGEOVER_ACTIVE is set, then there was a switch
+	 * to this destination address earlier. The sender MUST set
+	 * CYCLING_CHANGEOVER to indicate that this switch is a
+	 * double switch to the same destination address.
+	 */
+	if (transport->cacc.changeover_active)
+		transport->cacc.cycling_changeover = changeover;
+
+	/* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that
+	 * a changeover has occurred.
+	 */
+	transport->cacc.changeover_active = changeover;
+
+	/* 3) The sender MUST store the next TSN to be sent in
+	 * next_tsn_at_change.
+	 */
+	transport->cacc.next_tsn_at_change = asoc->next_tsn;
+}
+
+/* Remove a transport from an association and free the transport.  */
+void sctp_assoc_rm_peer(struct sctp_association *asoc,
+			struct sctp_transport *peer)
+{
+	struct list_head	*pos;
+	struct sctp_transport	*transport;
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ",
+				 " port: %d\n",
+				 asoc,
+				 (&peer->ipaddr),
+				 ntohs(peer->ipaddr.v4.sin_port));
+
+	/* If we are to remove the current retran_path, update it
+	 * to the next peer before removing this peer from the list.
+	 */
+	if (asoc->peer.retran_path == peer)
+		sctp_assoc_update_retran_path(asoc);
+
+	/* Remove this peer from the list. */
+	list_del(&peer->transports);
+
+	/* Get the first remaining transport of asoc (list assumed non-empty). */
+	pos = asoc->peer.transport_addr_list.next;
+	transport = list_entry(pos, struct sctp_transport, transports);
+
+	/* Update any entries that match the peer to be deleted. */
+	if (asoc->peer.primary_path == peer)
+		sctp_assoc_set_primary(asoc, transport);
+	if (asoc->peer.active_path == peer)
+		asoc->peer.active_path = transport;
+	if (asoc->peer.retran_path == peer)
+		asoc->peer.retran_path = transport;
+	if (asoc->peer.last_data_from == peer)
+		asoc->peer.last_data_from = transport;
+
+	/* If we remove the transport an INIT was last sent to, set it to
+	 * NULL. Combined with the update of the retran path above, this
+	 * will cause the next INIT to be sent to the next available
+	 * transport, maintaining the cycle.
+	 */
+	if (asoc->init_last_sent_to == peer)
+		asoc->init_last_sent_to = NULL;
+
+	/* If we remove the transport a SHUTDOWN was last sent to, set it
+	 * to NULL. Combined with the update of the retran path above, this
+	 * will cause the next SHUTDOWN to be sent to the next available
+	 * transport, maintaining the cycle.
+	 */
+	if (asoc->shutdown_last_sent_to == peer)
+		asoc->shutdown_last_sent_to = NULL;
+
+	/* If we remove the transport an ASCONF was last sent to, set it to
+	 * NULL.
+	 */
+	if (asoc->addip_last_asconf &&
+	    asoc->addip_last_asconf->transport == peer)
+		asoc->addip_last_asconf->transport = NULL;
+
+	/* If we have something on the transmitted list, we have to
+	 * save it off.  The best place is the active path.
+	 */
+	if (!list_empty(&peer->transmitted)) {
+		struct sctp_transport *active = asoc->peer.active_path;
+		struct sctp_chunk *ch;
+
+		/* Reset the transport of each chunk on this list */
+		list_for_each_entry(ch, &peer->transmitted,
+					transmitted_list) {
+			ch->transport = NULL;
+			ch->rtt_in_progress = 0;
+		}
+
+		list_splice_tail_init(&peer->transmitted,
+					&active->transmitted);
+
+		/* Start a T3 timer here in case it wasn't running so
+		 * that these migrated packets have a chance to get
+		 * retransmitted.
+		 */
+		if (!timer_pending(&active->T3_rtx_timer))
+			if (!mod_timer(&active->T3_rtx_timer,
+					jiffies + active->rto))
+				sctp_transport_hold(active);
+	}
+
+	asoc->peer.transport_count--;
+
+	sctp_transport_free(peer);
+}
+
+/* Add (or find) the transport for @addr; NULL if it cannot be created.  */
+struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
+					   const union sctp_addr *addr,
+					   const gfp_t gfp,
+					   const int peer_state)
+{
+	struct sctp_transport *peer;
+	struct sctp_sock *sp;
+	unsigned short port;
+
+	sp = sctp_sk(asoc->base.sk);
+
+	/* AF_INET and AF_INET6 share common port field. */
+	port = ntohs(addr->v4.sin_port);
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ",
+				 " port: %d state:%d\n",
+				 asoc,
+				 addr,
+				 port,
+				 peer_state);
+
+	/* Set the port if it has not been set yet.  */
+	if (0 == asoc->peer.port)
+		asoc->peer.port = port;
+
+	/* Check to see if this is a duplicate. */
+	peer = sctp_assoc_lookup_paddr(asoc, addr);
+	if (peer) {
+		/* An UNKNOWN state is only set on transports added by
+		 * user in sctp_connectx() call.  Such transports should be
+		 * considered CONFIRMED per RFC 4960, Section 5.4.
+		 */
+		if (peer->state == SCTP_UNKNOWN) {
+			peer->state = SCTP_ACTIVE;
+		}
+		return peer;
+	}
+
+	peer = sctp_transport_new(addr, gfp);
+	if (!peer)
+		return NULL;
+
+	sctp_transport_set_owner(peer, asoc);
+
+	/* Initialize the peer's heartbeat interval based on the
+	 * association configured value.
+	 */
+	peer->hbinterval = asoc->hbinterval;
+
+	/* Set the path max_retrans.  */
+	peer->pathmaxrxt = asoc->pathmaxrxt;
+
+	/* Initialize the peer's SACK delay timeout based on the
+	 * association configured value.
+	 */
+	peer->sackdelay = asoc->sackdelay;
+	peer->sackfreq = asoc->sackfreq;
+
+	/* Enable/disable heartbeat, SACK delay, and path MTU discovery
+	 * based on association setting.
+	 */
+	peer->param_flags = asoc->param_flags;
+
+	sctp_transport_route(peer, NULL, sp);
+
+	/* Initialize the pmtu of the transport. */
+	if (peer->param_flags & SPP_PMTUD_DISABLE) {
+		if (asoc->pathmtu)
+			peer->pathmtu = asoc->pathmtu;
+		else
+			peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+	}
+
+	/* If this is the first transport addr on this association,
+	 * initialize the association PMTU to the peer's PMTU.
+	 * If not and the current association PMTU is higher than the new
+	 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
+	 */
+	if (asoc->pathmtu)
+		asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
+	else
+		asoc->pathmtu = peer->pathmtu;
+
+	SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to "
+			  "%d\n", asoc, asoc->pathmtu);
+	peer->pmtu_pending = 0;
+
+	asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+
+	/* The asoc->peer.port might not be meaningful yet, but
+	 * initialize the packet structure anyway.
+	 */
+	sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port,
+			 asoc->peer.port);
+
+	/* 7.2.1 Slow-Start
+	 *
+	 * o The initial cwnd before DATA transmission or after a sufficiently
+	 *   long idle period MUST be set to
+	 *      min(4*MTU, max(2*MTU, 4380 bytes))
+	 *
+	 * o The initial value of ssthresh MAY be arbitrarily high
+	 *   (for example, implementations MAY use the size of the
+	 *   receiver advertised window).
+	 */
+	peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
+
+	/* At this point, we may not have the receiver's advertised window,
+	 * so initialize ssthresh to the default value and it will be set
+	 * later when we process the INIT.
+	 */
+	peer->ssthresh = SCTP_DEFAULT_MAXWINDOW;
+
+	peer->partial_bytes_acked = 0;
+	peer->flight_size = 0;
+	peer->burst_limited = 0;
+
+	/* Set the transport's RTO.initial value */
+	peer->rto = asoc->rto_initial;
+
+	/* Set the peer's active state. */
+	peer->state = peer_state;
+
+	/* Attach the remote transport to our asoc.  */
+	list_add_tail(&peer->transports, &asoc->peer.transport_addr_list);
+	asoc->peer.transport_count++;
+
+	/* If we do not yet have a primary path, set one.  */
+	if (!asoc->peer.primary_path) {
+		sctp_assoc_set_primary(asoc, peer);
+		asoc->peer.retran_path = peer;
+	}
+
+	if (asoc->peer.active_path == asoc->peer.retran_path &&
+	    peer->state != SCTP_UNCONFIRMED) {
+		asoc->peer.retran_path = peer;
+	}
+
+	return peer;
+}
+
+/* Delete the transport matching @addr, if any, from an association.  */
+void sctp_assoc_del_peer(struct sctp_association *asoc,
+			 const union sctp_addr *addr)
+{
+	struct sctp_transport	*t;
+	struct sctp_transport	*next;
+
+	list_for_each_entry_safe(t, next, &asoc->peer.transport_addr_list,
+				 transports) {
+		if (!sctp_cmp_addr_exact(addr, &t->ipaddr))
+			continue;
+
+		/* Do book keeping for removing the peer and free it. */
+		sctp_assoc_rm_peer(asoc, t);
+		break;
+	}
+}
+
+/* Find the first transport of @asoc whose address matches @address. */
+struct sctp_transport *sctp_assoc_lookup_paddr(
+					const struct sctp_association *asoc,
+					const union sctp_addr *address)
+{
+	struct sctp_transport *found = NULL;
+	struct sctp_transport *cur;
+
+	list_for_each_entry(cur, &asoc->peer.transport_addr_list,
+			transports) {
+		if (sctp_cmp_addr_exact(address, &cur->ipaddr)) {
+			found = cur;
+			break;
+		}
+	}
+	return found;
+}
+
+/* Remove every transport from the association except @primary */
+void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc,
+				     struct sctp_transport *primary)
+{
+	struct sctp_transport	*cur, *next;
+
+	list_for_each_entry_safe(cur, next, &asoc->peer.transport_addr_list,
+				 transports) {
+		/* The primary transport is the only one that is kept. */
+		if (cur == primary)
+			continue;
+		sctp_assoc_rm_peer(asoc, cur);
+	}
+}
+
+/* Engage in transport control operations.
+ * Mark the transport up or down and send a notification to the user.
+ * Select and update the new active and retran paths.
+ */
+void sctp_assoc_control_transport(struct sctp_association *asoc,
+				  struct sctp_transport *transport,
+				  sctp_transport_cmd_t command,
+				  sctp_sn_error_t error)
+{
+	struct sctp_transport *t = NULL;
+	struct sctp_transport *first;
+	struct sctp_transport *second;
+	struct sctp_ulpevent *event;
+	struct sockaddr_storage addr;
+	int spc_state = 0;
+
+	/* Record the transition on the transport.  */
+	switch (command) {
+	case SCTP_TRANSPORT_UP:
+		/* If we are moving from UNCONFIRMED state due
+		 * to heartbeat success, report the SCTP_ADDR_CONFIRMED
+		 * state to the user, otherwise report SCTP_ADDR_AVAILABLE.
+		 */
+		if (SCTP_UNCONFIRMED == transport->state &&
+		    SCTP_HEARTBEAT_SUCCESS == error)
+			spc_state = SCTP_ADDR_CONFIRMED;
+		else
+			spc_state = SCTP_ADDR_AVAILABLE;
+		transport->state = SCTP_ACTIVE;
+		break;
+
+	case SCTP_TRANSPORT_DOWN:
+		/* If the transport was never confirmed, do not transition it
+		 * to inactive state.  Also, release the cached route since
+		 * there may be a better route next time.
+		 */
+		if (transport->state != SCTP_UNCONFIRMED)
+			transport->state = SCTP_INACTIVE;
+		else {
+			dst_release(transport->dst);
+			transport->dst = NULL;
+		}
+
+		spc_state = SCTP_ADDR_UNREACHABLE;
+		break;
+
+	default:
+		return;
+	}
+
+	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
+	 * user.
+	 */
+	memset(&addr, 0, sizeof(struct sockaddr_storage));
+	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
+	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+				0, spc_state, error, GFP_ATOMIC);
+	if (event)
+		sctp_ulpq_tail_event(&asoc->ulpq, event);
+
+	/* Select new active and retran paths. */
+
+	/* Look for the two most recently used active transports.
+	 *
+	 * This code produces the wrong ordering whenever jiffies
+	 * rolls over, but we still get usable transports, so we don't
+	 * worry about it.
+	 */
+	first = NULL; second = NULL;
+
+	list_for_each_entry(t, &asoc->peer.transport_addr_list,
+			transports) {
+		if ((t->state == SCTP_INACTIVE) ||
+		    (t->state == SCTP_UNCONFIRMED))
+			continue;
+		if (!first || t->last_time_heard > first->last_time_heard) {
+			second = first;
+			first = t;
+		} else if (!second ||
+			   t->last_time_heard > second->last_time_heard) {
+			second = t;
+		}
+	}
+
+	/* RFC 2960 6.4 Multi-Homed SCTP Endpoints
+	 *
+	 * By default, an endpoint should always transmit to the
+	 * primary path, unless the SCTP user explicitly specifies the
+	 * destination transport address (and possibly source
+	 * transport address) to use.
+	 *
+	 * [If the primary is active but not most recent, bump the most
+	 * recently used transport.]
+	 */
+	if (((asoc->peer.primary_path->state == SCTP_ACTIVE) ||
+	     (asoc->peer.primary_path->state == SCTP_UNKNOWN)) &&
+	    first != asoc->peer.primary_path) {
+		second = first;
+		first = asoc->peer.primary_path;
+	}
+
+	/* If no usable transport was found, camp on the primary even if it
+	 * is inactive; with no second choice, retransmit on the first.
+	 */
+	if (!first)
+		first = asoc->peer.primary_path;
+	if (!second)
+		second = first;
+
+	/* Set the active and retran transports.  */
+	asoc->peer.active_path = first;
+	asoc->peer.retran_path = second;
+}
+
+/* Take a reference on an association by incrementing its refcnt. */
+void sctp_association_hold(struct sctp_association *asoc)
+{
+	atomic_inc(&asoc->base.refcnt);
+}
+
+/* Drop a reference; destroy the association once none remain. */
+void sctp_association_put(struct sctp_association *asoc)
+{
+	if (!atomic_dec_and_test(&asoc->base.refcnt))
+		return;
+
+	sctp_association_destroy(asoc);
+}
+
+/* Hand out the next TSN, Transmission Sequence Number, for the given
+ * association and bump the unack_data counter.
+ */
+__u32 sctp_association_get_next_tsn(struct sctp_association *asoc)
+{
+	/* From Section 1.6 Serial Number Arithmetic:
+	 * Transmission Sequence Numbers wrap around when they reach
+	 * 2**32 - 1.  That is, the next TSN a DATA chunk MUST use
+	 * after transmitting TSN = 2**32 - 1 is TSN = 0.
+	 *
+	 * The post-increment returns the TSN to use and advances next_tsn.
+	 */
+	asoc->unack_data++;
+
+	return asoc->next_tsn++;
+}
+
+/* Compare two addresses for an exact match using the address family
+ * handler of @ss1.  Wildcard addresses only match themselves.
+ */
+int sctp_cmp_addr_exact(const union sctp_addr *ss1,
+			const union sctp_addr *ss2)
+{
+	struct sctp_af *handler = sctp_get_af_specific(ss1->sa.sa_family);
+
+	/* No handler for this family: report a mismatch. */
+	if (likely(handler))
+		return handler->cmp_addr(ss1, ss2);
+
+	return 0;
+}
+
+/* Return an ecne chunk to get prepended to a packet, or NULL when no
+ * ECNE is needed or none could be made.
+ * Note:  We are sly and return a shared, prealloced chunk.  FIXME:
+ * No we don't, but we could/should.
+ */
+struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc)
+{
+	struct sctp_chunk *ecne = NULL;
+
+	/* Send ECNE if needed.
+	 * Not being able to allocate a chunk here is not deadly; the
+	 * result of sctp_make_ecne() is handed back as is.
+	 */
+	if (asoc->need_ecne)
+		ecne = sctp_make_ecne(asoc, asoc->last_ecne_tsn);
+
+	return ecne;
+}
+
+/*
+ * Find which transport this TSN was sent on; returns NULL if none.
+ */
+struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc,
+					     __u32 tsn)
+{
+	struct sctp_transport *active;
+	struct sctp_transport *match;
+	struct sctp_transport *transport;
+	struct sctp_chunk *chunk;
+	__be32 key = htonl(tsn);
+
+	match = NULL;
+
+	/*
+	 * FIXME: In general, find a more efficient data structure for
+	 * searching.
+	 */
+
+	/*
+	 * The general strategy is to search each transport's transmitted
+	 * list.   Return which transport this TSN lives on.
+	 *
+	 * Let's be hopeful and check the active_path first.
+	 * Another optimization would be to know if there is only one
+	 * outbound path and not have to look for the TSN at all.
+	 *
+	 */
+
+	active = asoc->peer.active_path;
+
+	list_for_each_entry(chunk, &active->transmitted,
+			transmitted_list) {
+
+		if (key == chunk->subh.data_hdr->tsn) {
+			match = active;
+			goto out;
+		}
+	}
+
+	/* If not found, search all the other transports; skip the active one. */
+	list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+			transports) {
+
+		if (transport == active)
+			continue;
+		list_for_each_entry(chunk, &transport->transmitted,
+				transmitted_list) {
+			if (key == chunk->subh.data_hdr->tsn) {
+				match = transport;
+				goto out;
+			}
+		}
+	}
+out:
+	return match;
+}
+
+/* Return the peer transport if this is the association we look for. */
+struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
+					   const union sctp_addr *laddr,
+					   const union sctp_addr *paddr)
+{
+	struct sctp_transport *transport;
+
+	/* Both port numbers have to agree first. */
+	if (htons(asoc->base.bind_addr.port) != laddr->v4.sin_port ||
+	    htons(asoc->peer.port) != paddr->v4.sin_port)
+		return NULL;
+
+	transport = sctp_assoc_lookup_paddr(asoc, paddr);
+	if (!transport)
+		return NULL;
+
+	if (!sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
+				  sctp_sk(asoc->base.sk)))
+		return NULL;
+
+	return transport;
+}
+
+/* Do delayed input processing.  This is scheduled by sctp_rcv().
+ *
+ * @work is the 'immediate' work item embedded in the association's
+ * inqueue; the owning association is recovered from it.  Every chunk
+ * popped from the inqueue is fed to the state machine until the queue
+ * is empty or the association is marked dead.
+ */
+static void sctp_assoc_bh_rcv(struct work_struct *work)
+{
+	struct sctp_association *asoc =
+		container_of(work, struct sctp_association,
+			     base.inqueue.immediate);
+	struct sctp_endpoint *ep;
+	struct sctp_chunk *chunk;
+	struct sctp_inq *inqueue;
+	int state;
+	sctp_subtype_t subtype;
+	int error = 0;
+
+	/* The association should be held so we should be safe. */
+	ep = asoc->ep;
+
+	inqueue = &asoc->base.inqueue;
+	/* Take our own reference for the duration of the loop; it is
+	 * dropped again right after the loop.
+	 */
+	sctp_association_hold(asoc);
+	while (NULL != (chunk = sctp_inq_pop(inqueue))) {
+		/* Sample the state before the chunk is processed. */
+		state = asoc->state;
+		subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);
+
+		/* SCTP-AUTH, Section 6.3:
+		 *    The receiver has a list of chunk types which it expects
+		 *    to be received only after an AUTH-chunk.  This list has
+		 *    been sent to the peer during the association setup.  It
+		 *    MUST silently discard these chunks if they are not placed
+		 *    after an AUTH chunk in the packet.
+		 */
+		if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
+			continue;
+
+		/* Remember where the last DATA chunk came from so we
+		 * know where to send the SACK.
+		 */
+		if (sctp_chunk_is_data(chunk))
+			asoc->peer.last_data_from = chunk->transport;
+		else
+			SCTP_INC_STATS(SCTP_MIB_INCTRLCHUNKS);
+
+		/* Record when this transport was last heard from. */
+		if (chunk->transport)
+			chunk->transport->last_time_heard = jiffies;
+
+		/* Run through the state machine. */
+		error = sctp_do_sm(SCTP_EVENT_T_CHUNK, subtype,
+				   state, ep, asoc, chunk, GFP_ATOMIC);
+
+		/* Check to see if the association is freed in response to
+		 * the incoming chunk.  If so, get out of the while loop.
+		 */
+		if (asoc->base.dead)
+			break;
+
+		/* If there is an error on chunk, discard this packet.
+		 * (chunk is non-NULL here by the loop condition.)
+		 */
+		if (error && chunk)
+			chunk->pdiscard = 1;
+	}
+	sctp_association_put(asoc);
+}
+
+/* This routine moves an association from its old sk to a new sk.
+ *
+ * The association is unlinked from its current endpoint list, the
+ * references it holds on the old endpoint and old sock are released,
+ * and fresh references are taken on the endpoint of newsk and on newsk
+ * itself before the association is added to the new endpoint.
+ */
+void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk)
+{
+	struct sctp_sock *newsp = sctp_sk(newsk);
+	struct sock *oldsk = assoc->base.sk;
+
+	/* Delete the association from the old endpoint's list of
+	 * associations.
+	 */
+	list_del_init(&assoc->asocs);
+
+	/* Decrement the backlog value for a TCP-style socket. */
+	if (sctp_style(oldsk, TCP))
+		oldsk->sk_ack_backlog--;
+
+	/* Release references to the old endpoint and the sock.  */
+	sctp_endpoint_put(assoc->ep);
+	sock_put(assoc->base.sk);
+
+	/* Get a reference to the new endpoint.  */
+	assoc->ep = newsp->ep;
+	sctp_endpoint_hold(assoc->ep);
+
+	/* Get a reference to the new sock.  */
+	assoc->base.sk = newsk;
+	sock_hold(assoc->base.sk);
+
+	/* Add the association to the new endpoint's list of associations.  */
+	sctp_endpoint_add_asoc(newsp->ep, assoc);
+}
+
+/* Update an association (possibly from unexpected COOKIE-ECHO processing).
+ *
+ * Copies the peer parameters of @new into @asoc, drops peer transports
+ * that @new does not know about and, depending on whether @asoc has
+ * already reached ESTABLISHED, either resets the TSN/SSN state (restart)
+ * or imports the peer addresses, ssnmap and association id.  Finally the
+ * SCTP-AUTH peer parameters are moved from @new to @asoc (ownership is
+ * transferred, @new's pointers are cleared) and the active association
+ * shared key is recomputed.
+ */
+void sctp_assoc_update(struct sctp_association *asoc,
+		       struct sctp_association *new)
+{
+	struct sctp_transport *trans;
+	struct list_head *pos, *temp;
+
+	/* Copy in new parameters of peer. */
+	asoc->c = new->c;
+	asoc->peer.rwnd = new->peer.rwnd;
+	asoc->peer.sack_needed = new->peer.sack_needed;
+	asoc->peer.i = new->peer.i;
+	sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
+			 asoc->peer.i.initial_tsn, GFP_ATOMIC);
+
+	/* Remove any peer addresses not present in the new association. */
+	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+		trans = list_entry(pos, struct sctp_transport, transports);
+		if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) {
+			sctp_assoc_rm_peer(asoc, trans);
+			continue;
+		}
+
+		if (asoc->state >= SCTP_STATE_ESTABLISHED)
+			sctp_transport_reset(trans);
+	}
+
+	/* If the case is A (association restart), use
+	 * initial_tsn as next_tsn. If the case is B, use
+	 * current next_tsn in case data sent to peer
+	 * has been discarded and needs retransmission.
+	 */
+	if (asoc->state >= SCTP_STATE_ESTABLISHED) {
+		asoc->next_tsn = new->next_tsn;
+		asoc->ctsn_ack_point = new->ctsn_ack_point;
+		asoc->adv_peer_ack_point = new->adv_peer_ack_point;
+
+		/* Reinitialize SSN for both local streams
+		 * and peer's streams.
+		 */
+		sctp_ssnmap_clear(asoc->ssnmap);
+
+		/* Flush the ULP reassembly and ordered queue.
+		 * Any data there will now be stale and will
+		 * cause problems.
+		 */
+		sctp_ulpq_flush(&asoc->ulpq);
+
+		/* reset the overall association error count so
+		 * that the restarted association doesn't get torn
+		 * down on the next retransmission timer.
+		 */
+		asoc->overall_error_count = 0;
+
+	} else {
+		/* Add any peer addresses from the new association. */
+		list_for_each_entry(trans, &new->peer.transport_addr_list,
+				transports) {
+			if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
+				sctp_assoc_add_peer(asoc, &trans->ipaddr,
+						    GFP_ATOMIC, trans->state);
+		}
+
+		asoc->ctsn_ack_point = asoc->next_tsn - 1;
+		asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
+		if (!asoc->ssnmap) {
+			/* Move the ssnmap. */
+			asoc->ssnmap = new->ssnmap;
+			new->ssnmap = NULL;
+		}
+
+		if (!asoc->assoc_id) {
+			/* get a new association id since we don't have one
+			 * yet.
+			 */
+			sctp_assoc_set_id(asoc, GFP_ATOMIC);
+		}
+	}
+
+	/* SCTP-AUTH: Save the peer parameters from the new associations
+	 * and also move the association shared keys over
+	 */
+	kfree(asoc->peer.peer_random);
+	asoc->peer.peer_random = new->peer.peer_random;
+	new->peer.peer_random = NULL;
+
+	kfree(asoc->peer.peer_chunks);
+	asoc->peer.peer_chunks = new->peer.peer_chunks;
+	new->peer.peer_chunks = NULL;
+
+	kfree(asoc->peer.peer_hmacs);
+	asoc->peer.peer_hmacs = new->peer.peer_hmacs;
+	new->peer.peer_hmacs = NULL;
+
+	/* Do not drop asoc->asoc_shared_key here.
+	 * sctp_auth_asoc_init_active_key() releases the old key itself
+	 * once the replacement has been created; releasing it here as
+	 * well would put the key twice on success and leave a stale
+	 * pointer behind when that function returns early or fails.
+	 */
+	sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC);
+}
+
+/* Update the retran path for sending a retransmitted packet.
+ * Round-robin through the active transports, else round-robin
+ * through the inactive transports as this is the next best thing
+ * we can try.
+ *
+ * Nothing is done when the association has exactly one transport.
+ * If no candidate is found the current retran path is kept.
+ */
+void sctp_assoc_update_retran_path(struct sctp_association *asoc)
+{
+	struct sctp_transport *t, *next;
+	struct list_head *head = &asoc->peer.transport_addr_list;
+	struct list_head *pos;
+
+	if (asoc->peer.transport_count == 1)
+		return;
+
+	/* Find the next transport in a round-robin fashion. */
+	t = asoc->peer.retran_path;
+	pos = &t->transports;
+	next = NULL;
+
+	while (1) {
+		/* Skip the head. */
+		if (pos->next == head)
+			pos = head->next;
+		else
+			pos = pos->next;
+
+		t = list_entry(pos, struct sctp_transport, transports);
+
+		/* We have exhausted the list, but didn't find any
+		 * other active transports.  If so, use the next
+		 * transport.
+		 */
+		if (t == asoc->peer.retran_path) {
+			/* 'next' may still be NULL; handled after the loop. */
+			t = next;
+			break;
+		}
+
+		/* Try to find an active transport. */
+
+		if ((t->state == SCTP_ACTIVE) ||
+		    (t->state == SCTP_UNKNOWN)) {
+			break;
+		} else {
+			/* Keep track of the next transport in case
+			 * we don't find any active transport.
+			 * Only the first transport that is not
+			 * SCTP_UNCONFIRMED is remembered.
+			 */
+			if (t->state != SCTP_UNCONFIRMED && !next)
+				next = t;
+		}
+	}
+
+	/* Commit the choice, or fall back to the unchanged retran path so
+	 * that 't' is valid for the debug output below.
+	 */
+	if (t)
+		asoc->peer.retran_path = t;
+	else
+		t = asoc->peer.retran_path;
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
+				 " %p addr: ",
+				 " port: %d\n",
+				 asoc,
+				 (&t->ipaddr),
+				 ntohs(t->ipaddr.v4.sin_port));
+}
+
+/* Choose the transport for sending retransmit packet.
+ *
+ * With no previous transport the active path is returned.  Otherwise
+ * the retran path is returned, after rotating it first when the last
+ * packet already went out over the retran path.
+ */
+struct sctp_transport *sctp_assoc_choose_alter_transport(
+	struct sctp_association *asoc, struct sctp_transport *last_sent_to)
+{
+	/* First transmission: use the active path. */
+	if (!last_sent_to)
+		return asoc->peer.active_path;
+
+	/* The previous attempt used the retran path, so move it on. */
+	if (asoc->peer.retran_path == last_sent_to)
+		sctp_assoc_update_retran_path(asoc);
+
+	return asoc->peer.retran_path;
+}
+
+/* Recompute the association's pathmtu and frag_point from its transports.
+ * This routine is called when a transport's PMTU has changed.
+ * A NULL association is ignored.
+ */
+void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
+{
+	struct sctp_transport *trans;
+	__u32 lowest = 0;
+
+	if (!asoc)
+		return;
+
+	list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+			    transports) {
+		/* The transport is flagged pmtu_pending and has a dst:
+		 * refresh its PMTU from the dst MTU and clear the flag.
+		 */
+		if (trans->pmtu_pending && trans->dst) {
+			sctp_transport_update_pmtu(trans, dst_mtu(trans->dst));
+			trans->pmtu_pending = 0;
+		}
+
+		/* Track the smallest path MTU seen so far. */
+		if (lowest == 0 || trans->pathmtu < lowest)
+			lowest = trans->pathmtu;
+	}
+
+	/* Leave the association untouched when nothing was collected. */
+	if (lowest != 0) {
+		asoc->pathmtu = lowest;
+		asoc->frag_point = sctp_frag_point(asoc, lowest);
+	}
+
+	SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n",
+			  __func__, asoc, asoc->pathmtu, asoc->frag_point);
+}
+
+/* Should we send a SACK to update our peer?
+ *
+ * Returns 1 when the association is in ESTABLISHED or one of the
+ * SHUTDOWN_PENDING/RECEIVED/SENT states and rwnd exceeds the advertised
+ * a_rwnd by at least the larger of (sk_rcvbuf >> sctp_rwnd_upd_shift)
+ * and the path MTU; 0 otherwise.
+ */
+static inline int sctp_peer_needs_update(struct sctp_association *asoc)
+{
+	__u32 threshold;
+
+	switch (asoc->state) {
+	case SCTP_STATE_ESTABLISHED:
+	case SCTP_STATE_SHUTDOWN_PENDING:
+	case SCTP_STATE_SHUTDOWN_RECEIVED:
+	case SCTP_STATE_SHUTDOWN_SENT:
+		break;
+	default:
+		return 0;
+	}
+
+	/* The window has not grown beyond what the peer already knows. */
+	if (asoc->rwnd <= asoc->a_rwnd)
+		return 0;
+
+	threshold = max_t(__u32,
+			  (asoc->base.sk->sk_rcvbuf >> sctp_rwnd_upd_shift),
+			  asoc->pathmtu);
+
+	return (asoc->rwnd - asoc->a_rwnd) >= threshold;
+}
+
+/* Increase asoc's rwnd by len and send any window update SACK if needed.
+ *
+ * Any outstanding rwnd_over debt is paid off first; only the remainder
+ * of len is added to rwnd.
+ */
+void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned len)
+{
+	struct sctp_chunk *sack;
+	struct timer_list *timer;
+
+	if (asoc->rwnd_over) {
+		if (asoc->rwnd_over >= len) {
+			asoc->rwnd_over -= len;
+		} else {
+			asoc->rwnd += (len - asoc->rwnd_over);
+			asoc->rwnd_over = 0;
+		}
+	} else {
+		asoc->rwnd += len;
+	}
+
+	/* If we had window pressure, start recovering it
+	 * once our rwnd had reached the accumulated pressure
+	 * threshold.  The idea is to recover slowly, but up
+	 * to the initial advertised window.
+	 */
+	if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
+		/* Give back at most one path MTU per call. */
+		int change = min(asoc->pathmtu, asoc->rwnd_press);
+		asoc->rwnd += change;
+		asoc->rwnd_press -= change;
+	}
+
+	SCTP_DEBUG_PRINTK("%s: asoc %p rwnd increased by %d to (%u, %u) "
+			  "- %u\n", __func__, asoc, len, asoc->rwnd,
+			  asoc->rwnd_over, asoc->a_rwnd);
+
+	/* Send a window update SACK once sctp_peer_needs_update() reports
+	 * that rwnd has grown past a_rwnd by at least the larger of the
+	 * association's PMTU and (sk_rcvbuf >> sctp_rwnd_upd_shift).
+	 * The algorithm used is similar to the one described in
+	 * Section 4.2.3.3 of RFC 1122.
+	 */
+	if (sctp_peer_needs_update(asoc)) {
+		asoc->a_rwnd = asoc->rwnd;
+		SCTP_DEBUG_PRINTK("%s: Sending window update SACK- asoc: %p "
+				  "rwnd: %u a_rwnd: %u\n", __func__,
+				  asoc, asoc->rwnd, asoc->a_rwnd);
+		sack = sctp_make_sack(asoc);
+		/* Note: a_rwnd has already been updated at this point even
+		 * though no SACK gets queued.
+		 */
+		if (!sack)
+			return;
+
+		asoc->peer.sack_needed = 0;
+
+		sctp_outq_tail(&asoc->outqueue, sack);
+
+		/* Stop the SACK timer.  A successfully deleted pending
+		 * timer gives up an association reference.
+		 */
+		timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
+		if (timer_pending(timer) && del_timer(timer))
+			sctp_association_put(asoc);
+	}
+}
+
+/* Decrease asoc's rwnd by len.
+ *
+ * When len exceeds the current rwnd the shortfall is recorded in
+ * rwnd_over and rwnd becomes 0.
+ */
+void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
+{
+	int rx_count;
+	int over = 0;
+
+	SCTP_ASSERT(asoc->rwnd, "rwnd zero", return);
+	SCTP_ASSERT(!asoc->rwnd_over, "rwnd_over not zero", return);
+
+	/* Pick the receive memory counter according to the endpoint's
+	 * rcvbuf_policy: the association's own counter or the socket's.
+	 */
+	if (asoc->ep->rcvbuf_policy)
+		rx_count = atomic_read(&asoc->rmem_alloc);
+	else
+		rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+
+	/* If we've reached or overflowed our receive buffer, announce
+	 * a 0 rwnd if rwnd would still be positive.  Store the
+	 * potential pressure overflow so that the window can be restored
+	 * back to its original value.
+	 */
+	if (rx_count >= asoc->base.sk->sk_rcvbuf)
+		over = 1;
+
+	if (asoc->rwnd >= len) {
+		asoc->rwnd -= len;
+		if (over) {
+			/* Park what is left of the window in rwnd_press. */
+			asoc->rwnd_press += asoc->rwnd;
+			asoc->rwnd = 0;
+		}
+	} else {
+		asoc->rwnd_over = len - asoc->rwnd;
+		asoc->rwnd = 0;
+	}
+	SCTP_DEBUG_PRINTK("%s: asoc %p rwnd decreased by %d to (%u, %u, %u)\n",
+			  __func__, asoc, len, asoc->rwnd,
+			  asoc->rwnd_over, asoc->rwnd_press);
+}
+
+/* Build the bind address list for the association based on info from the
+ * local endpoint and the remote peer.
+ *
+ * Returns the result of sctp_bind_addr_copy().
+ */
+int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
+				     sctp_scope_t scope, gfp_t gfp)
+{
+	int flags = 0;
+
+	/* The scoping flags select which subset of the endpoint's
+	 * addresses is copied.
+	 */
+	if (asoc->base.sk->sk_family == PF_INET6)
+		flags = SCTP_ADDR6_ALLOWED;
+	if (asoc->peer.ipv4_address)
+		flags |= SCTP_ADDR4_PEERSUPP;
+	if (asoc->peer.ipv6_address)
+		flags |= SCTP_ADDR6_PEERSUPP;
+
+	return sctp_bind_addr_copy(&asoc->base.bind_addr,
+				   &asoc->ep->base.bind_addr,
+				   scope, gfp, flags);
+}
+
+/* Build the association's bind address list from the cookie.
+ *
+ * The raw address list is read from the bytes that follow the peer INIT
+ * chunk stored in the cookie.
+ */
+int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
+					 struct sctp_cookie *cookie,
+					 gfp_t gfp)
+{
+	int init_len = ntohs(cookie->peer_init->chunk_hdr.length);
+	int addrs_len = cookie->raw_addr_list_len;
+	__u8 *raw_addrs = (__u8 *)cookie->peer_init + init_len;
+
+	return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw_addrs,
+				      addrs_len,
+				      asoc->ep->base.bind_addr.port, gfp);
+}
+
+/* Lookup laddr in the bind address list of an association.
+ * Returns 1 when the port and the address both match, 0 otherwise.
+ */
+int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
+			    const union sctp_addr *laddr)
+{
+	/* A different port can never match. */
+	if (asoc->base.bind_addr.port != ntohs(laddr->v4.sin_port))
+		return 0;
+
+	if (!sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
+				  sctp_sk(asoc->base.sk)))
+		return 0;
+
+	return 1;
+}
+
+/* Set an association id for a given association.
+ *
+ * Returns 0 when an id is already assigned or a new one was obtained,
+ * -ENOMEM when idr_pre_get() fails, or the error reported by
+ * idr_get_new_above() (other than -EAGAIN, which is retried).
+ */
+int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
+{
+	int assoc_id;
+	int error = 0;
+
+	/* If the id is already assigned, keep it. */
+	if (asoc->assoc_id)
+		return error;
+retry:
+	/* Preallocation happens outside the spinlock. */
+	if (unlikely(!idr_pre_get(&sctp_assocs_id, gfp)))
+		return -ENOMEM;
+
+	spin_lock_bh(&sctp_assocs_id_lock);
+	error = idr_get_new_above(&sctp_assocs_id, (void *)asoc,
+				    idr_low, &assoc_id);
+	if (!error) {
+		/* Start the next search above the id just handed out and
+		 * restart from 1 once INT_MAX is reached.
+		 */
+		idr_low = assoc_id + 1;
+		if (idr_low == INT_MAX)
+			idr_low = 1;
+	}
+	spin_unlock_bh(&sctp_assocs_id_lock);
+	/* -EAGAIN: go back and preallocate again. */
+	if (error == -EAGAIN)
+		goto retry;
+	else if (error)
+		return error;
+
+	asoc->assoc_id = (sctp_assoc_t) assoc_id;
+	return error;
+}
+
+/* Free the ASCONF queue: unlink and free every chunk on
+ * asoc->addip_chunk_list.
+ */
+static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc)
+{
+	struct list_head *queue = &asoc->addip_chunk_list;
+	struct sctp_chunk *chunk;
+
+	/* Keep taking the first entry until nothing is left. */
+	while (!list_empty(queue)) {
+		chunk = list_entry(queue->next, struct sctp_chunk, list);
+		list_del_init(&chunk->list);
+		sctp_chunk_free(chunk);
+	}
+}
+
+/* Free asconf_ack cache: unlink and free every chunk on
+ * asoc->asconf_ack_list.
+ */
+static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc)
+{
+	struct list_head *cache = &asoc->asconf_ack_list;
+	struct sctp_chunk *cached;
+
+	/* Keep taking the first entry until nothing is left. */
+	while (!list_empty(cache)) {
+		cached = list_entry(cache->next, struct sctp_chunk,
+				    transmitted_list);
+		list_del_init(&cached->transmitted_list);
+		sctp_chunk_free(cached);
+	}
+}
+
+/* Clean up the ASCONF_ACK queue.
+ *
+ * Entries are dropped from the front of the cache until one carrying
+ * the peer's current serial number ("Peer-Sequence-Number") is met;
+ * that entry and everything after it are kept.
+ */
+void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc)
+{
+	__be32 peer_serial = htonl(asoc->peer.addip_serial);
+	struct sctp_chunk *cached;
+	struct sctp_chunk *following;
+
+	list_for_each_entry_safe(cached, following, &asoc->asconf_ack_list,
+				 transmitted_list) {
+		/* Stop at the first ack for the current serial. */
+		if (cached->subh.addip_hdr->serial == peer_serial)
+			break;
+
+		list_del_init(&cached->transmitted_list);
+		sctp_chunk_free(cached);
+	}
+}
+
+/* Find the ASCONF_ACK whose serial number matches ASCONF.
+ *
+ * @serial is compared as-is against the serial stored in each cached
+ * ack's ADDIP header.  A hold is taken on the chunk that is returned;
+ * NULL is returned when no cached ack matches.
+ */
+struct sctp_chunk *sctp_assoc_lookup_asconf_ack(
+					const struct sctp_association *asoc,
+					__be32 serial)
+{
+	struct sctp_chunk *cached;
+	struct sctp_chunk *found = NULL;
+
+	list_for_each_entry(cached, &asoc->asconf_ack_list, transmitted_list) {
+		if (cached->subh.addip_hdr->serial != serial)
+			continue;
+
+		sctp_chunk_hold(cached);
+		found = cached;
+		break;
+	}
+
+	return found;
+}
+
+/* Release everything ASCONF related that the association still holds:
+ * the cached ASCONF_ACKs, the queued ASCONF chunks and the last ASCONF
+ * chunk sent.
+ */
+void sctp_asconf_queue_teardown(struct sctp_association *asoc)
+{
+	/* Free any cached ASCONF_ACK chunk. */
+	sctp_assoc_free_asconf_acks(asoc);
+
+	/* Free the ASCONF queue. */
+	sctp_assoc_free_asconf_queue(asoc);
+
+	/* Free any cached ASCONF chunk.
+	 * NOTE(review): addip_last_asconf is not reset to NULL here;
+	 * confirm no caller touches it afterwards.
+	 */
+	if (asoc->addip_last_asconf)
+		sctp_chunk_free(asoc->addip_last_asconf);
+}
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
new file mode 100644
index 00000000..865e68fe
--- /dev/null
+++ b/net/sctp/auth.c
@@ -0,0 +1,950 @@
+/* SCTP kernel implementation
+ * (C) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *   Vlad Yasevich     <vladislav.yasevich@hp.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/auth.h>
+
+/* Table of HMAC algorithm descriptors, indexed by HMAC identifier.
+ * Entries without an hmac_name are treated as unsupported by the
+ * lookups in this file.  The SHA-256 entry is only compiled in when
+ * CONFIG_CRYPTO_SHA256 or CONFIG_CRYPTO_SHA256_MODULE is defined.
+ */
+static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
+	{
+		/* id 0 is reserved.  as all 0 */
+		.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0,
+	},
+	{
+		.hmac_id = SCTP_AUTH_HMAC_ID_SHA1,
+		.hmac_name="hmac(sha1)",
+		.hmac_len = SCTP_SHA1_SIG_SIZE,
+	},
+	{
+		/* id 2 is reserved as well */
+		.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
+	},
+#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE)
+	{
+		.hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
+		.hmac_name="hmac(sha256)",
+		.hmac_len = SCTP_SHA256_SIG_SIZE,
+	}
+#endif
+};
+
+
+/* Drop one reference on a key.  A NULL key is ignored.
+ *
+ * When the last reference goes away the key is released with kzfree()
+ * so that the secret bytes are wiped before the memory goes back to
+ * the allocator.
+ */
+void sctp_auth_key_put(struct sctp_auth_bytes *key)
+{
+	if (!key)
+		return;
+
+	if (atomic_dec_and_test(&key->refcnt)) {
+		kzfree(key);
+		SCTP_DBG_OBJCNT_DEC(keys);
+	}
+}
+
+/* Create a new key structure of a given length.
+ *
+ * Returns a key with len set to key_len and a reference count of one,
+ * or NULL when the total size would overflow or allocation fails.  The
+ * key data itself is left uninitialised.
+ */
+static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp)
+{
+	struct sctp_auth_bytes *key;
+
+	/* Header plus payload must not go past INT_MAX. */
+	if ((INT_MAX - key_len) < sizeof(*key))
+		return NULL;
+
+	key = kmalloc(sizeof(*key) + key_len, gfp);
+	if (key) {
+		atomic_set(&key->refcnt, 1);
+		key->len = key_len;
+		SCTP_DBG_OBJCNT_INC(keys);
+	}
+
+	return key;
+}
+
+/* Create a new shared key container with a given key id.
+ *
+ * The container is zero-initialised, so it carries no key yet.
+ * Returns NULL when allocation fails.
+ */
+struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp)
+{
+	struct sctp_shared_key *shkey = kzalloc(sizeof(*shkey), gfp);
+
+	if (shkey) {
+		shkey->key_id = key_id;
+		INIT_LIST_HEAD(&shkey->key_list);
+	}
+
+	return shkey;
+}
+
+/* Free the shared key structure.
+ *
+ * The container must already be unlinked from any key list (enforced
+ * with BUG_ON).  The reference it holds on its key is dropped before
+ * the container itself is freed.
+ */
+static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key)
+{
+	BUG_ON(!list_empty(&sh_key->key_list));
+	sctp_auth_key_put(sh_key->key);
+	sh_key->key = NULL;
+	kfree(sh_key);
+}
+
+/* Destroy the entire key list.  This is done during the
+ * association and endpoint free process.
+ */
+void sctp_auth_destroy_keys(struct list_head *keys)
+{
+	struct sctp_shared_key *shkey;
+	struct sctp_shared_key *following;
+
+	if (!list_empty(keys)) {
+		/* Unlink each container first; sctp_auth_shkey_free()
+		 * insists on an unlinked entry.
+		 */
+		key_for_each_safe(shkey, following, keys) {
+			list_del_init(&shkey->key_list);
+			sctp_auth_shkey_free(shkey);
+		}
+	}
+}
+
+/* Compare two byte vectors as numbers.  Return values
+ * are:
+ * 	  0 - vectors are equal
+ * 	< 0 - vector 1 is smaller than vector2
+ * 	> 0 - vector 1 is greater than vector2
+ *
+ * Algorithm is:
+ * 	This is performed by selecting the numerically smaller key vector...
+ *	If the key vectors are equal as numbers but differ in length ...
+ *	the shorter vector is considered smaller
+ *
+ * Examples (with small values):
+ * 	000123456789 > 123456789 (first number is longer)
+ * 	000123456789 < 234567891 (second number is larger numerically)
+ * 	123456789 > 2345678 	 (first number is both larger & longer)
+ *
+ * When the lengths differ and the longer vector is zero padded, only
+ * the bytes that line up with the shorter vector are compared, so the
+ * comparison never reads past the end of either vector.
+ */
+static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1,
+			      struct sctp_auth_bytes *vector2)
+{
+	const __u8 *data1 = vector1->data;
+	const __u8 *data2 = vector2->data;
+	const __u8 *longer;
+	__u32 len = vector1->len;
+	int diff;
+	int pad;
+	int cmp;
+	int i;
+
+	diff = vector1->len - vector2->len;
+	if (diff) {
+		longer = (diff > 0) ? data1 : data2;
+		pad = abs(diff);
+
+		/* Check to see if the longer number is
+		 * lead-zero padded.  If it is not, it
+		 * is automatically larger numerically.
+		 */
+		for (i = 0; i < pad; i++) {
+			if (longer[i] != 0)
+				return diff;
+		}
+
+		/* Skip the zero padding so that both vectors are aligned
+		 * on their least significant byte, and compare only the
+		 * length of the shorter one.
+		 */
+		if (diff > 0) {
+			data1 += pad;
+			len = vector2->len;
+		} else {
+			data2 += pad;
+		}
+	}
+
+	cmp = memcmp(data1, data2, len);
+
+	/* Equal as numbers: the shorter vector is considered smaller
+	 * (diff is 0 when the lengths are the same).
+	 */
+	return cmp ? cmp : diff;
+}
+
+/*
+ * Create a key vector as described in SCTP-AUTH, Section 6.1
+ *    The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO
+ *    parameter sent by each endpoint are concatenated as byte vectors.
+ *    These parameters include the parameter type, parameter length, and
+ *    the parameter value, but padding is omitted; all padding MUST be
+ *    removed from this concatenation before proceeding with further
+ *    computation of keys.  Parameters which were not sent are simply
+ *    omitted from the concatenation process.  The resulting two vectors
+ *    are called the two key vectors.
+ *
+ * @chunks may be NULL, in which case it is left out.  Each parameter
+ * contributes exactly the number of bytes stated in its header length.
+ * Returns NULL when allocation fails.  Only the len and data fields of
+ * the returned vector are set.
+ */
+static struct sctp_auth_bytes *sctp_auth_make_key_vector(
+			sctp_random_param_t *random,
+			sctp_chunks_param_t *chunks,
+			sctp_hmac_algo_param_t *hmacs,
+			gfp_t gfp)
+{
+	struct sctp_auth_bytes *vec;
+	__u16 random_len = ntohs(random->param_hdr.length);
+	__u16 hmacs_len = ntohs(hmacs->param_hdr.length);
+	__u16 chunks_len = 0;
+	__u32 total;
+	__u8 *cursor;
+
+	if (chunks)
+		chunks_len = ntohs(chunks->param_hdr.length);
+
+	total = random_len + hmacs_len;
+	total += chunks_len;
+
+	vec = kmalloc(sizeof(struct sctp_auth_bytes) + total, gfp);
+	if (!vec)
+		return NULL;
+
+	vec->len = total;
+
+	/* Lay the parameters out back to back: RANDOM, CHUNKS, HMAC-ALGO. */
+	cursor = vec->data;
+	memcpy(cursor, random, random_len);
+	cursor += random_len;
+
+	if (chunks) {
+		memcpy(cursor, chunks, chunks_len);
+		cursor += chunks_len;
+	}
+
+	memcpy(cursor, hmacs, hmacs_len);
+
+	return vec;
+}
+
+
+/* Make a key vector based on our local parameters, i.e. the
+ * auth_random, auth_chunks and auth_hmacs stored in asoc->c.
+ */
+static struct sctp_auth_bytes *sctp_auth_make_local_vector(
+				    const struct sctp_association *asoc,
+				    gfp_t gfp)
+{
+	sctp_random_param_t *random;
+	sctp_chunks_param_t *chunks;
+	sctp_hmac_algo_param_t *hmacs;
+
+	random = (sctp_random_param_t*)asoc->c.auth_random;
+	chunks = (sctp_chunks_param_t*)asoc->c.auth_chunks;
+	hmacs = (sctp_hmac_algo_param_t*)asoc->c.auth_hmacs;
+
+	return sctp_auth_make_key_vector(random, chunks, hmacs, gfp);
+}
+
+/* Make a key vector based on peer's parameters, i.e. the peer_random,
+ * peer_chunks and peer_hmacs stored in asoc->peer.
+ */
+static struct sctp_auth_bytes *sctp_auth_make_peer_vector(
+				    const struct sctp_association *asoc,
+				    gfp_t gfp)
+{
+	return sctp_auth_make_key_vector(asoc->peer.peer_random,
+			asoc->peer.peer_chunks, asoc->peer.peer_hmacs, gfp);
+}
+
+
+/* Set the value of the association shared key based on the parameters
+ * given.  The algorithm is:
+ *    From the endpoint pair shared keys and the key vectors the
+ *    association shared keys are computed.  This is performed by selecting
+ *    the numerically smaller key vector and concatenating it to the
+ *    endpoint pair shared key, and then concatenating the numerically
+ *    larger key vector to that.  The result of the concatenation is the
+ *    association shared key.
+ *
+ * The caller decides the order by what it passes as first_vector and
+ * last_vector.  ep_key->key may be NULL and then contributes nothing.
+ * Returns NULL when the key cannot be created.
+ */
+static struct sctp_auth_bytes *sctp_auth_asoc_set_secret(
+			struct sctp_shared_key *ep_key,
+			struct sctp_auth_bytes *first_vector,
+			struct sctp_auth_bytes *last_vector,
+			gfp_t gfp)
+{
+	struct sctp_auth_bytes *secret;
+	__u32 total;
+	__u8 *cursor;
+
+	total = first_vector->len + last_vector->len;
+	if (ep_key->key)
+		total += ep_key->key->len;
+
+	secret = sctp_auth_create_key(total, gfp);
+	if (!secret)
+		return NULL;
+
+	/* Endpoint pair key first, then the two vectors in order. */
+	cursor = secret->data;
+	if (ep_key->key) {
+		memcpy(cursor, ep_key->key->data, ep_key->key->len);
+		cursor += ep_key->key->len;
+	}
+
+	memcpy(cursor, first_vector->data, first_vector->len);
+	cursor += first_vector->len;
+
+	memcpy(cursor, last_vector->data, last_vector->len);
+
+	return secret;
+}
+
+/* Create an association shared key.  Follow the algorithm
+ * described in SCTP-AUTH, Section 6.1
+ *
+ * The local and the peer key vector are built, ordered and then
+ * concatenated to the endpoint pair shared key.  Both temporary vectors
+ * are freed before returning.  Returns NULL when either vector or the
+ * secret itself cannot be allocated.
+ */
+static struct sctp_auth_bytes *sctp_auth_asoc_create_secret(
+				 const struct sctp_association *asoc,
+				 struct sctp_shared_key *ep_key,
+				 gfp_t gfp)
+{
+	struct sctp_auth_bytes *local_vec;
+	struct sctp_auth_bytes *peer_vec;
+	struct sctp_auth_bytes *secret = NULL;
+
+	/* SCTP-AUTH, Section 6.1: the RANDOM, CHUNKS and HMAC-ALGO
+	 * parameters sent by each endpoint are concatenated (without
+	 * padding) into the two key vectors.
+	 */
+	local_vec = sctp_auth_make_local_vector(asoc, gfp);
+	peer_vec = sctp_auth_make_peer_vector(asoc, gfp);
+
+	if (local_vec && peer_vec) {
+		/* SCTP-AUTH, Section 6.1: the numerically smaller key
+		 * vector follows the endpoint pair shared key, the larger
+		 * one comes last.  Vectors that are equal as numbers but
+		 * differ in length go shorter first; identical vectors may
+		 * go in any order.
+		 */
+		if (sctp_auth_compare_vectors(local_vec, peer_vec) < 0)
+			secret = sctp_auth_asoc_set_secret(ep_key, local_vec,
+							   peer_vec, gfp);
+		else
+			secret = sctp_auth_asoc_set_secret(ep_key, peer_vec,
+							   local_vec, gfp);
+	}
+
+	kfree(local_vec);
+	kfree(peer_vec);
+
+	return secret;
+}
+
+/*
+ * Populate the association overlay list with the list
+ * from the endpoint.
+ *
+ * Every endpoint entry gets a new container on the association that
+ * shares the endpoint's key (an extra reference is taken).  The
+ * association list must be empty on entry.  Returns 0 on success; on
+ * allocation failure everything added so far is destroyed again and
+ * -ENOMEM is returned.
+ */
+int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
+				struct sctp_association *asoc,
+				gfp_t gfp)
+{
+	struct list_head *dst = &asoc->endpoint_shared_keys;
+	struct sctp_shared_key *src_key;
+	struct sctp_shared_key *copy;
+
+	BUG_ON(!list_empty(dst));
+
+	key_for_each(src_key, &ep->endpoint_shared_keys) {
+		copy = sctp_auth_shkey_create(src_key->key_id, gfp);
+		if (!copy) {
+			/* Undo the partial copy. */
+			sctp_auth_destroy_keys(dst);
+			return -ENOMEM;
+		}
+
+		copy->key = src_key->key;
+		sctp_auth_key_hold(copy->key);
+		list_add(&copy->key_list, dst);
+	}
+
+	return 0;
+}
+
+
+/* Public interface to create the association shared key.
+ * See code above for the algorithm.
+ *
+ * Returns 0 when AUTH is disabled, the peer is not auth capable, or the
+ * new key was installed; -ENOMEM when the secret cannot be created, in
+ * which case the previous asoc_shared_key is left in place.
+ */
+int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
+{
+	struct sctp_auth_bytes	*secret;
+	struct sctp_shared_key *ep_key;
+
+	/* If we don't support AUTH, or peer is not capable
+	 * we don't need to do anything.
+	 */
+	if (!sctp_auth_enable || !asoc->peer.auth_capable)
+		return 0;
+
+	/* If the key_id is non-zero and we couldn't find an
+	 * endpoint pair shared key, we can't compute the
+	 * secret.
+	 * For key_id 0, endpoint pair shared key is a NULL key.
+	 */
+	ep_key = sctp_auth_get_shkey(asoc, asoc->active_key_id);
+	BUG_ON(!ep_key);
+
+	secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
+	if (!secret)
+		return -ENOMEM;
+
+	/* Release the previous key only now that its replacement exists. */
+	sctp_auth_key_put(asoc->asoc_shared_key);
+	asoc->asoc_shared_key = secret;
+
+	return 0;
+}
+
+
+/* Find the endpoint pair shared key based on the key_id.
+ *
+ * Only the association's own list of endpoint pair shared keys is
+ * searched.  Returns NULL when no entry carries key_id.  No reference
+ * is taken on the returned entry.
+ */
+struct sctp_shared_key *sctp_auth_get_shkey(
+				const struct sctp_association *asoc,
+				__u16 key_id)
+{
+	struct sctp_shared_key *shkey;
+	struct sctp_shared_key *found = NULL;
+
+	key_for_each(shkey, &asoc->endpoint_shared_keys) {
+		if (shkey->key_id != key_id)
+			continue;
+
+		found = shkey;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * Initialize all the possible digest transforms that we can use.  Right
+ * now, the supported digests are SHA1 and SHA256.  We do this here once
+ * because of the restriction that transforms may only be allocated in
+ * user context.  This forces us to pre-allocate all possible transforms
+ * at the endpoint init time.
+ *
+ * Returns 0 when AUTH is disabled, when the transforms already exist or
+ * when all of them were allocated; -ENOMEM otherwise.  On failure
+ * ep->auth_hmacs is left NULL.
+ */
+int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
+{
+	struct crypto_hash *tfm = NULL;
+	__u16   id;
+
+	/* With AUTH disabled there is nothing to allocate */
+	if (!sctp_auth_enable) {
+		ep->auth_hmacs = NULL;
+		return 0;
+	}
+
+	/* if the transforms are already allocated, we are done */
+	if (ep->auth_hmacs)
+		return 0;
+
+	/* Allocate the array of pointers to transforms */
+	ep->auth_hmacs = kzalloc(
+			    sizeof(struct crypto_hash *) * SCTP_AUTH_NUM_HMACS,
+			    gfp);
+	if (!ep->auth_hmacs)
+		return -ENOMEM;
+
+	for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) {
+
+		/* See if we support the id.  Supported IDs have name and
+		 * length fields set, so that we can allocate and use
+		 * them.  We can safely just check for name, for without the
+		 * name, we can't allocate the TFM.
+		 */
+		if (!sctp_hmac_list[id].hmac_name)
+			continue;
+
+		/* If this TFM has been allocated, we are all set */
+		if (ep->auth_hmacs[id])
+			continue;
+
+		/* Allocate the ID */
+		tfm = crypto_alloc_hash(sctp_hmac_list[id].hmac_name, 0,
+					CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm))
+			goto out_err;
+
+		ep->auth_hmacs[id] = tfm;
+	}
+
+	return 0;
+
+out_err:
+	/* Clean up any successful allocations.  The array itself is freed
+	 * as well, so the pointer must be cleared: otherwise a later call
+	 * to sctp_auth_destroy_hmacs() on ep->auth_hmacs would free it a
+	 * second time.
+	 */
+	sctp_auth_destroy_hmacs(ep->auth_hmacs);
+	ep->auth_hmacs = NULL;
+	return -ENOMEM;
+}
+
+/* Destroy the hmac tfm array: free every allocated transform and then
+ * the array itself.  A NULL array is ignored.
+ */
+void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[])
+{
+	int id;
+
+	if (auth_hmacs == NULL)
+		return;
+
+	for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) {
+		/* Slots without a transform have nothing to free. */
+		if (!auth_hmacs[id])
+			continue;
+
+		crypto_free_hash(auth_hmacs[id]);
+	}
+
+	kfree(auth_hmacs);
+}
+
+
+/* Return the descriptor stored in sctp_hmac_list for hmac_id.
+ * No range check is done here.
+ * NOTE(review): callers presumably validate hmac_id against the table
+ * size before calling - confirm.
+ */
+struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
+{
+	return &sctp_hmac_list[hmac_id];
+}
+
+/* Get an hmac description information that we can use to build
+ * the AUTH chunk
+ *
+ * The association's default hmac id wins when it is set.  Otherwise
+ * the peer's HMAC-ALGO list is walked in order and the first id that
+ * is in range and has a named entry in sctp_hmac_list is used; that id
+ * is not cached.  Returns NULL when the peer sent no list or nothing
+ * in it is supported.
+ */
+struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
+{
+	struct sctp_hmac_algo_param *peer_algos;
+	__u16 count;
+	__u16 id;
+	int idx;
+
+	/* If we have a default entry, use it */
+	if (asoc->default_hmac_id)
+		return &sctp_hmac_list[asoc->default_hmac_id];
+
+	peer_algos = asoc->peer.peer_hmacs;
+	if (!peer_algos)
+		return NULL;
+
+	/* Each identifier in the parameter body is two bytes wide. */
+	count = (ntohs(peer_algos->param_hdr.length) -
+		 sizeof(sctp_paramhdr_t)) / 2;
+
+	for (idx = 0; idx < count; idx++) {
+		id = ntohs(peer_algos->hmac_ids[idx]);
+
+		/* Ids beyond the supported range are skipped. */
+		if (id > SCTP_AUTH_HMAC_ID_MAX)
+			continue;
+
+		/* Supported ids have a name; without one no transform
+		 * can be allocated for them.  The reserved id 0 has no
+		 * name, so it can never be picked here.
+		 */
+		if (!sctp_hmac_list[id].hmac_name)
+			continue;
+
+		return &sctp_hmac_list[id];
+	}
+
+	return NULL;
+}
+
+/* Linear search of 'hmac_id' in the first 'n_elts' entries of 'hmacs'.
+ * Both the array and the id are compared as stored (big endian).
+ * Returns 1 when found, 0 otherwise.
+ */
+static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id)
+{
+	int idx = 0;
+
+	while (idx < n_elts) {
+		if (hmacs[idx] == hmac_id)
+			return 1;
+		idx++;
+	}
+
+	return 0;
+}
+
+/* Check whether 'hmac_id' appears in the HMAC list this association
+ * advertises (asoc->c.auth_hmacs).  Returns 1 if it does, 0 if it does
+ * not or if there is no association.
+ */
+int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
+				    __be16 hmac_id)
+{
+	struct sctp_hmac_algo_param *local;
+	__u16 count;
+
+	if (!asoc)
+		return 0;
+
+	local = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs;
+
+	/* Payload length divided by the size of one 16-bit identifier */
+	count = (ntohs(local->param_hdr.length) -
+		 sizeof(sctp_paramhdr_t)) >> 1;
+
+	return __sctp_auth_find_hmacid(local->hmac_ids, count, hmac_id);
+}
+
+
+/* Cache the default HMAC id.  This is to follow this text from SCTP-AUTH:
+ * Section 6.1:
+ *   The receiver of a HMAC-ALGO parameter SHOULD use the first listed
+ *   algorithm it supports.
+ *
+ * An already cached id is left untouched.  Otherwise the first id in
+ * 'hmacs' that is in range and has a transform allocated on the endpoint
+ * is stored in asoc->default_hmac_id.
+ */
+void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
+				     struct sctp_hmac_algo_param *hmacs)
+{
+	struct sctp_endpoint *ep;
+	int	n_params;
+	int	i;
+
+	if (asoc->default_hmac_id)
+		return;
+
+	n_params = (ntohs(hmacs->param_hdr.length)
+				- sizeof(sctp_paramhdr_t)) >> 1;
+	ep = asoc->ep;
+
+	for (i = 0; i < n_params; i++) {
+		__u16 id = ntohs(hmacs->hmac_ids[i]);
+
+		/* Usable only if in range and backed by an allocated TFM */
+		if (id <= SCTP_AUTH_HMAC_ID_MAX && ep->auth_hmacs[id]) {
+			asoc->default_hmac_id = id;
+			return;
+		}
+	}
+}
+
+
+/* Check whether 'chunk' is listed in the CHUNKS parameter 'param'.
+ * Returns 1 if it is, 0 if not, or if 'param' is NULL or has a zero
+ * length field.
+ */
+static int __sctp_auth_cid(sctp_cid_t chunk, struct sctp_chunks_param *param)
+{
+	unsigned short len;
+	int i;
+
+	if (!param || param->param_hdr.length == 0)
+		return 0;
+
+	len = ntohs(param->param_hdr.length) - sizeof(sctp_paramhdr_t);
+
+	/* SCTP-AUTH, Section 3.2
+	 *    The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH
+	 *    chunks MUST NOT be listed in the CHUNKS parameter.  However, if
+	 *    a CHUNKS parameter is received then the types for INIT, INIT-ACK,
+	 *    SHUTDOWN-COMPLETE and AUTH chunks MUST be ignored.
+	 */
+	for (i = 0; i < len; i++) {
+		if (param->chunks[i] == SCTP_CID_INIT ||
+		    param->chunks[i] == SCTP_CID_INIT_ACK ||
+		    param->chunks[i] == SCTP_CID_SHUTDOWN_COMPLETE ||
+		    param->chunks[i] == SCTP_CID_AUTH)
+			continue;
+
+		if (param->chunks[i] == chunk)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Check if the peer requested that this chunk type be authenticated.
+ * Always 0 when authentication is disabled, there is no association, or
+ * the peer is not auth capable.
+ */
+int sctp_auth_send_cid(sctp_cid_t chunk, const struct sctp_association *asoc)
+{
+	if (sctp_auth_enable && asoc && asoc->peer.auth_capable)
+		return __sctp_auth_cid(chunk, asoc->peer.peer_chunks);
+
+	return 0;
+}
+
+/* Check if we requested that the peer authenticate this chunk type.
+ * Always 0 when authentication is disabled or there is no association.
+ */
+int sctp_auth_recv_cid(sctp_cid_t chunk, const struct sctp_association *asoc)
+{
+	struct sctp_chunks_param *ours;
+
+	if (sctp_auth_enable && asoc) {
+		ours = (struct sctp_chunks_param *)asoc->c.auth_chunks;
+		return __sctp_auth_cid(chunk, ours);
+	}
+
+	return 0;
+}
+
+/* SCTP-AUTH: Section 6.2:
+ *    The sender MUST calculate the MAC as described in RFC2104 [2] using
+ *    the hash function H as described by the MAC Identifier and the shared
+ *    association key K based on the endpoint pair shared key described by
+ *    the shared key identifier.  The 'data' used for the computation of
+ *    the AUTH-chunk is given by the AUTH chunk with its HMAC field set to
+ *    zero (as shown in Figure 6) followed by all chunks that are placed
+ *    after the AUTH chunk in the SCTP packet.
+ *
+ * The digest is written into auth->auth_hdr.hmac.  The function returns
+ * nothing: if the key cannot be found or created, or setting the key on
+ * the transform fails, it returns without writing a digest.
+ */
+void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
+			      struct sk_buff *skb,
+			      struct sctp_auth_chunk *auth,
+			      gfp_t gfp)
+{
+	struct scatterlist sg;
+	struct hash_desc desc;
+	struct sctp_auth_bytes *asoc_key;
+	__u16 key_id, hmac_id;
+	__u8 *digest;
+	unsigned char *end;
+	int free_key = 0;
+
+	/* Extract the info we need:
+	 * - hmac id
+	 * - key id
+	 */
+	key_id = ntohs(auth->auth_hdr.shkey_id);
+	hmac_id = ntohs(auth->auth_hdr.hmac_id);
+
+	/* The active key already has an association key cached; any other
+	 * key id needs a temporary association key that is dropped below.
+	 */
+	if (key_id == asoc->active_key_id)
+		asoc_key = asoc->asoc_shared_key;
+	else {
+		struct sctp_shared_key *ep_key;
+
+		ep_key = sctp_auth_get_shkey(asoc, key_id);
+		if (!ep_key)
+			return;
+
+		asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
+		if (!asoc_key)
+			return;
+
+		free_key = 1;
+	}
+
+	/* set up scatter list: from the AUTH chunk to the tail of the skb */
+	end = skb_tail_pointer(skb);
+	sg_init_one(&sg, auth, end - (unsigned char *)auth);
+
+	/* hmac_id indexes the endpoint's transform array without a range
+	 * check here.
+	 */
+	desc.tfm = asoc->ep->auth_hmacs[hmac_id];
+	desc.flags = 0;
+
+	digest = auth->auth_hdr.hmac;
+	if (crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len))
+		goto free;
+
+	/* The result of the digest computation is not checked */
+	crypto_hash_digest(&desc, &sg, sg.length, digest);
+
+free:
+	/* Only a temporary key created above is released here */
+	if (free_key)
+		sctp_auth_key_put(asoc_key);
+}
+
+/* API Helpers */
+
+/* Append 'chunk_id' to the endpoint's list of chunk types to be
+ * authenticated.  Returns 0 when the id was added or was already
+ * present, -EINVAL when the list already holds SCTP_NUM_CHUNK_TYPES
+ * entries.
+ */
+int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id)
+{
+	struct sctp_chunks_param *list = ep->auth_chunk_list;
+	__u16 hdr_len;
+	__u16 used;
+
+	/* Nothing to do if the chunk id is already listed */
+	if (__sctp_auth_cid(chunk_id, list))
+		return 0;
+
+	/* The parameter length minus its header is the number of ids
+	 * currently stored, one byte each.
+	 */
+	hdr_len = ntohs(list->param_hdr.length);
+	used = hdr_len - sizeof(sctp_paramhdr_t);
+	if (used == SCTP_NUM_CHUNK_TYPES)
+		return -EINVAL;
+
+	list->chunks[used] = chunk_id;
+	list->param_hdr.length = htons(hdr_len + 1);
+	return 0;
+}
+
+/* Add hmac identifiers to the endpoint list of supported hmac ids.
+ *
+ * 'hmacs' carries the identifiers in host byte order.  Every identifier
+ * must be in range and have a name in sctp_hmac_list, and SHA1 must be
+ * among them.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if an identifier is out of range or
+ * unsupported, -EINVAL if SHA1 is missing.  The number of identifiers is
+ * not bounded here.
+ */
+int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
+			   struct sctp_hmacalgo *hmacs)
+{
+	int has_sha1 = 0;
+	__u16 id;
+	int i;
+
+	/* Scan the list looking for unsupported id.  Also make sure that
+	 * SHA1 is specified.
+	 */
+	for (i = 0; i < hmacs->shmac_num_idents; i++) {
+		id = hmacs->shmac_idents[i];
+
+		if (id > SCTP_AUTH_HMAC_ID_MAX)
+			return -EOPNOTSUPP;
+
+		if (SCTP_AUTH_HMAC_ID_SHA1 == id)
+			has_sha1 = 1;
+
+		if (!sctp_hmac_list[id].hmac_name)
+			return -EOPNOTSUPP;
+	}
+
+	if (!has_sha1)
+		return -EINVAL;
+
+	/* The endpoint list is kept in network byte order: the rest of
+	 * this file reads hmac_ids[] with ntohs() or compares it against
+	 * __be16 values, so the host order ids validated above must be
+	 * converted rather than copied verbatim.
+	 */
+	for (i = 0; i < hmacs->shmac_num_idents; i++)
+		ep->auth_hmacs_list->hmac_ids[i] =
+				htons(hmacs->shmac_idents[i]);
+
+	ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) +
+				hmacs->shmac_num_idents * sizeof(__u16));
+	return 0;
+}
+
+/* Set a new shared key on either endpoint or association.  If the
+ * key with the same ID already exists, replace the key (remove the
+ * old key and add a new one).
+ *
+ * The key list of 'asoc' is used when 'asoc' is non-NULL, otherwise the
+ * key list of 'ep'.  Returns 0 on success and -ENOMEM if either the
+ * shared key entry or the key data cannot be allocated.
+ */
+int sctp_auth_set_key(struct sctp_endpoint *ep,
+		      struct sctp_association *asoc,
+		      struct sctp_authkey *auth_key)
+{
+	struct sctp_shared_key *cur_key = NULL;
+	struct sctp_auth_bytes *key;
+	struct list_head *sh_keys;
+	int replace = 0;
+
+	/* Try to find the given key id to see if
+	 * we are doing a replace, or adding a new key
+	 */
+	if (asoc)
+		sh_keys = &asoc->endpoint_shared_keys;
+	else
+		sh_keys = &ep->endpoint_shared_keys;
+
+	key_for_each(cur_key, sh_keys) {
+		if (cur_key->key_id == auth_key->sca_keynumber) {
+			replace = 1;
+			break;
+		}
+	}
+
+	/* If we are not replacing a key id, we need to allocate
+	 * a shared key.
+	 */
+	if (!replace) {
+		cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber,
+						 GFP_KERNEL);
+		if (!cur_key)
+			return -ENOMEM;
+	}
+
+	/* Create a new key data based on the info passed in */
+	key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL);
+	if (!key)
+		goto nomem;
+
+	memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength);
+
+	/* If we are replacing, remove the old keys data from the
+	 * key id.  If we are adding new key id, add it to the
+	 * list.
+	 */
+	if (replace)
+		sctp_auth_key_put(cur_key->key);
+	else
+		list_add(&cur_key->key_list, sh_keys);
+
+	/* Hand the reference obtained at creation over to the shared key.
+	 * Taking an additional reference here would never be dropped and
+	 * the key data would leak when the shared key is freed.
+	 * NOTE(review): relies on sctp_auth_create_key() returning the key
+	 * with a single initial reference; confirm against its definition.
+	 */
+	cur_key->key = key;
+
+	return 0;
+nomem:
+	/* Only a shared key entry allocated above is ours to free */
+	if (!replace)
+		sctp_auth_shkey_free(cur_key);
+
+	return -ENOMEM;
+}
+
+/* Make 'key_id' the active key of the association when 'asoc' is given,
+ * otherwise of the endpoint.  The key identifier MUST correspond to an
+ * existing key on the matching list; -EINVAL is returned if it does not,
+ * 0 otherwise.
+ */
+int sctp_auth_set_active_key(struct sctp_endpoint *ep,
+			     struct sctp_association *asoc,
+			     __u16  key_id)
+{
+	struct sctp_shared_key *key;
+	struct list_head *sh_keys;
+
+	sh_keys = asoc ? &asoc->endpoint_shared_keys :
+			 &ep->endpoint_shared_keys;
+
+	key_for_each(key, sh_keys) {
+		if (key->key_id != key_id)
+			continue;
+
+		/* Found it: record the id, and for an association also
+		 * recompute its active key.
+		 */
+		if (asoc) {
+			asoc->active_key_id = key_id;
+			sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL);
+		} else {
+			ep->active_key_id = key_id;
+		}
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Delete the shared key 'key_id' from the association when 'asoc' is
+ * given, otherwise from the endpoint.
+ *
+ * The key identifier MUST NOT be the current active key and MUST
+ * correspond to an existing key; -EINVAL is returned in either case.
+ * Returns 0 once the key has been unlinked and freed.
+ */
+int sctp_auth_del_key_id(struct sctp_endpoint *ep,
+			 struct sctp_association *asoc,
+			 __u16  key_id)
+{
+	struct sctp_shared_key *key;
+	struct list_head *sh_keys;
+
+	if (asoc) {
+		if (key_id == asoc->active_key_id)
+			return -EINVAL;
+
+		sh_keys = &asoc->endpoint_shared_keys;
+	} else {
+		if (key_id == ep->active_key_id)
+			return -EINVAL;
+
+		sh_keys = &ep->endpoint_shared_keys;
+	}
+
+	key_for_each(key, sh_keys) {
+		if (key->key_id != key_id)
+			continue;
+
+		/* Delete the shared key; we return right away, so the
+		 * iteration never touches the removed entry again.
+		 */
+		list_del_init(&key->key_list);
+		sctp_auth_shkey_free(key);
+		return 0;
+	}
+
+	return -EINVAL;
+}
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
new file mode 100644
index 00000000..83e3011c
--- /dev/null
+++ b/net/sctp/bind_addr.c
@@ -0,0 +1,551 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2003
+ * Copyright (c) Cisco 1999,2000
+ * Copyright (c) Motorola 1999,2000,2001
+ * Copyright (c) La Monte H.P. Yarroll 2001
+ *
+ * This file is part of the SCTP kernel implementation.
+ *
+ * A collection class to handle the storage of transport addresses.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Daisy Chang           <daisyc@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helpers. */
+
+/* Copy a single address into a bind address list, subject to the given
+ * scope and address family flags (defined near the end of this file).
+ */
+static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
+			      sctp_scope_t scope, gfp_t gfp,
+			      int flags);
+/* Remove and free every entry of a bind address list. */
+static void sctp_bind_addr_clean(struct sctp_bind_addr *);
+
+/* First Level Abstractions. */
+
+/* Copy 'src' to 'dest' taking 'scope' into account.  Omit addresses
+ * in 'src' which have a broader scope than 'scope'.
+ *
+ * Returns the result of the last copy attempt.  Whenever that result is
+ * non-zero, everything added to 'dest' is removed again.
+ */
+int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
+			const struct sctp_bind_addr *src,
+			sctp_scope_t scope, gfp_t gfp,
+			int flags)
+{
+	struct sctp_sockaddr_entry *entry;
+	int err = 0;
+
+	/* All addresses share the same port.  */
+	dest->port = src->port;
+
+	/* First pass: the addresses which are relevant for this scope.  */
+	list_for_each_entry(entry, &src->address_list, list) {
+		err = sctp_copy_one_addr(dest, &entry->a, scope, gfp, flags);
+		if (err < 0)
+			goto done;
+	}
+
+	/* The fallback below only applies when nothing matched and the
+	 * requested scope is global.
+	 */
+	if (!list_empty(&dest->address_list) || SCTP_SCOPE_GLOBAL != scope)
+		goto done;
+
+	/* Second pass: try to get a link scope address, with the
+	 * assumption that we must be sitting behind a NAT.
+	 */
+	list_for_each_entry(entry, &src->address_list, list) {
+		err = sctp_copy_one_addr(dest, &entry->a, SCTP_SCOPE_LINK,
+					 gfp, flags);
+		if (err < 0)
+			break;
+	}
+
+done:
+	if (err)
+		sctp_bind_addr_clean(dest);
+
+	return err;
+}
+
+/* Exactly duplicate the address lists.  This is necessary when doing
+ * peel-offs and accepts.  We don't want to put all the current system
+ * addresses into the endpoint.  That's useless.  But we do want to
+ * duplicate the list of bound addresses that the older endpoint used.
+ *
+ * Stops at the first address that cannot be added and returns that
+ * error; addresses added before the failure stay in 'dest'.
+ */
+int sctp_bind_addr_dup(struct sctp_bind_addr *dest,
+			const struct sctp_bind_addr *src,
+			gfp_t gfp)
+{
+	struct sctp_sockaddr_entry *entry;
+	int err = 0;
+
+	/* All addresses share the same port.  */
+	dest->port = src->port;
+
+	list_for_each_entry(entry, &src->address_list, list) {
+		err = sctp_add_bind_addr(dest, &entry->a, 1, gfp);
+		if (err < 0)
+			return err;
+	}
+
+	return err;
+}
+
+/* Set up an empty SCTP_bind_addr structure, for either an endpoint or
+ * an association, bound to 'port' and marked as not malloced.
+ */
+void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port)
+{
+	INIT_LIST_HEAD(&bp->address_list);
+
+	bp->port = port;
+	bp->malloced = 0;
+}
+
+/* Dispose of the address list: unlink every entry and free it through
+ * RCU.
+ */
+static void sctp_bind_addr_clean(struct sctp_bind_addr *bp)
+{
+	struct sctp_sockaddr_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &bp->address_list, list) {
+		list_del_rcu(&entry->list);
+		SCTP_DBG_OBJCNT_DEC(addr);
+		kfree_rcu(entry, rcu);
+	}
+}
+
+/* Dispose of an SCTP_bind_addr structure: empty its address list and,
+ * if the structure itself was malloced, free it too.
+ */
+void sctp_bind_addr_free(struct sctp_bind_addr *bp)
+{
+	sctp_bind_addr_clean(bp);
+
+	if (!bp->malloced)
+		return;
+
+	kfree(bp);
+	SCTP_DBG_OBJCNT_DEC(bind_addr);
+}
+
+/* Append a copy of 'new' to the bind address list of 'bp', with state
+ * 'addr_state' and marked valid.  Returns 0 on success or -ENOMEM if
+ * the list entry cannot be allocated.
+ */
+int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
+		       __u8 addr_state, gfp_t gfp)
+{
+	struct sctp_sockaddr_entry *entry;
+
+	entry = t_new(struct sctp_sockaddr_entry, gfp);
+	if (!entry)
+		return -ENOMEM;
+
+	memcpy(&entry->a, new, sizeof(entry->a));
+
+	/* Fix up the port if it has not yet been set.
+	 * Both v4 and v6 have the port at the same offset.
+	 */
+	if (!entry->a.v4.sin_port)
+		entry->a.v4.sin_port = htons(bp->port);
+
+	entry->valid = 1;
+	entry->state = addr_state;
+
+	INIT_LIST_HEAD(&entry->list);
+
+	/* We always hold a socket lock when calling this function,
+	 * and that acts as a writer synchronizing lock.
+	 */
+	list_add_tail_rcu(&entry->list, &bp->address_list);
+	SCTP_DBG_OBJCNT_INC(addr);
+
+	return 0;
+}
+
+/* Delete the first entry that exactly matches 'del_addr' from the bind
+ * address list of 'bp'.  Returns 0 if an entry was removed, -EINVAL if
+ * no entry matched.
+ */
+int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
+{
+	struct sctp_sockaddr_entry *entry, *next;
+
+	/* We hold the socket lock when calling this function,
+	 * and that acts as a writer synchronizing lock.
+	 */
+	list_for_each_entry_safe(entry, next, &bp->address_list, list) {
+		if (!sctp_cmp_addr_exact(&entry->a, del_addr))
+			continue;
+
+		/* Exact match: invalidate the entry, unlink it and free
+		 * it through RCU.
+		 */
+		entry->valid = 0;
+		list_del_rcu(&entry->list);
+		kfree_rcu(entry, rcu);
+		SCTP_DBG_OBJCNT_DEC(addr);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Create a network byte-order representation of all the addresses
+ * formatted as SCTP parameters.
+ *
+ * The second argument is the return value for the length: the number of
+ * bytes written into the returned buffer.  When the list holds exactly
+ * one address, or the allocation fails, the returned pointer is NULL and
+ * the length is 0.
+ */
+union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
+					 int *addrs_len,
+					 gfp_t gfp)
+{
+	union sctp_params out;
+	union sctp_params cursor;
+	union sctp_addr_param raw;
+	struct sctp_sockaddr_entry *entry;
+	struct list_head *pos;
+	struct sctp_af *af;
+	int written = 0;
+	int total = 0;
+	int n;
+
+	/* Size the buffer for the largest parameter per address, so that
+	 * one allocation is enough.
+	 */
+	list_for_each(pos, &bp->address_list)
+		total += sizeof(union sctp_addr_param);
+
+	if (total == sizeof(union sctp_addr_param)) {
+		/* Don't even bother embedding an address if there
+		 * is only one.
+		 */
+		out.v = NULL;
+	} else {
+		out.v = kmalloc(total, gfp);
+		if (out.v) {
+			cursor = out;
+
+			list_for_each_entry(entry, &bp->address_list, list) {
+				af = sctp_get_af_specific(
+						entry->a.v4.sin_family);
+				n = af->to_addr_param(&entry->a, &raw);
+				memcpy(cursor.v, &raw, n);
+				cursor.v += n;
+				written += n;
+			}
+		}
+	}
+
+	*addrs_len = written;
+	return out;
+}
+
+/*
+ * Create an address list out of the raw address list format (IPv4 and IPv6
+ * address parameters).
+ *
+ * Returns 0 on success.  On an unknown address type (-EINVAL) or a
+ * failure to add an address, the whole list of 'bp' is emptied and the
+ * error is returned.
+ */
+int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
+			   int addrs_len, __u16 port, gfp_t gfp)
+{
+	union sctp_addr_param *rawaddr;
+	struct sctp_paramhdr *param;
+	union sctp_addr addr;
+	struct sctp_af *af;
+	int retval = 0;
+	int len;
+
+	/* Convert the raw address to standard address format */
+	while (addrs_len) {
+		param = (struct sctp_paramhdr *)raw_addr_list;
+		rawaddr = (union sctp_addr_param *)raw_addr_list;
+
+		af = sctp_get_af_specific(param_type2af(param->type));
+		if (unlikely(!af)) {
+			retval = -EINVAL;
+			goto fail;
+		}
+
+		af->from_addr_param(&addr, rawaddr, htons(port), 0);
+		retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp);
+		if (retval)
+			goto fail;
+
+		/* Step over the parameter just consumed */
+		len = ntohs(param->length);
+		addrs_len -= len;
+		raw_addr_list += len;
+	}
+
+	return retval;
+
+fail:
+	/* Can't finish building the list, clean up. */
+	sctp_bind_addr_clean(bp);
+	return retval;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Does this contain a specified address?  Allow wildcarding.
+ *
+ * Walks the list under rcu_read_lock(), skipping entries that are no
+ * longer valid, and compares with the protocol family's cmp_addr().
+ * Returns 1 on the first match, 0 if there is none.
+ */
+int sctp_bind_addr_match(struct sctp_bind_addr *bp,
+			 const union sctp_addr *addr,
+			 struct sctp_sock *opt)
+{
+	struct sctp_sockaddr_entry *entry;
+	int found = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &bp->address_list, list) {
+		if (entry->valid &&
+		    opt->pf->cmp_addr(&entry->a, addr, opt)) {
+			found = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Does the address 'addr' conflict with any addresses in
+ * the bp.
+ *
+ * Returns the result of the first non-zero cmp_addr() comparison against
+ * a valid entry, or 0 if no entry conflicts.
+ */
+int sctp_bind_addr_conflict(struct sctp_bind_addr *bp,
+			    const union sctp_addr *addr,
+			    struct sctp_sock *bp_sp,
+			    struct sctp_sock *addr_sp)
+{
+	struct sctp_sockaddr_entry *entry;
+	struct sctp_sock *basis = bp_sp;
+	int conflict = 0;
+
+	/* Pick the IPv6 socket as the basis of comparison
+	 * since it's usually a superset of the IPv4.
+	 * If there is no IPv6 socket, then default to bind_addr.
+	 */
+	if (sctp_opt2sk(bp_sp)->sk_family != AF_INET6 &&
+	    sctp_opt2sk(addr_sp)->sk_family == AF_INET6)
+		basis = addr_sp;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &bp->address_list, list) {
+		if (!entry->valid)
+			continue;
+
+		conflict = basis->pf->cmp_addr(&entry->a, addr, basis);
+		if (conflict)
+			break;
+	}
+	rcu_read_unlock();
+
+	return conflict;
+}
+
+/* Get the state of the entry in the bind_addr_list that matches 'addr'.
+ * Returns -1 if the address family is not supported or no valid entry
+ * matches.
+ */
+int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
+			 const union sctp_addr *addr)
+{
+	struct sctp_sockaddr_entry *entry;
+	struct sctp_af *af;
+	int result = -1;
+
+	af = sctp_get_af_specific(addr->sa.sa_family);
+	if (unlikely(!af))
+		return result;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &bp->address_list, list) {
+		if (entry->valid && af->cmp_addr(&entry->a, addr)) {
+			result = entry->state;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return result;
+}
+
+/* Find the first address in the bind address list that is not present in
+ * the addrs packed array.  Returns NULL if every bound address is matched.
+ *
+ * 'addrs' is walked as a packed sequence of sockaddrs whose individual
+ * sizes come from the address family of each element.  An element with an
+ * unsupported family stops the walk for the current bound address, which
+ * is then not reported as unmatched.
+ */
+union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
+					const union sctp_addr	*addrs,
+					int			addrcnt,
+					struct sctp_sock	*opt)
+{
+	struct sctp_sockaddr_entry	*laddr;
+	union sctp_addr			*candidate;
+	struct sctp_af			*af;
+	void				*cursor;
+	int				i;
+
+	/* This is only called sctp_send_asconf_del_ip() and we hold
+	 * the socket lock in that code path, so that address list
+	 * can't change.
+	 */
+	list_for_each_entry(laddr, &bp->address_list, list) {
+		cursor = (union sctp_addr *)addrs;
+		i = 0;
+
+		while (i < addrcnt) {
+			candidate = (union sctp_addr *)cursor;
+			af = sctp_get_af_specific(candidate->v4.sin_family);
+			if (!af ||
+			    opt->pf->cmp_addr(&laddr->a, candidate, opt))
+				break;
+
+			cursor += af->sockaddr_len;
+			i++;
+		}
+
+		/* The walk ran to completion: nothing matched laddr */
+		if (i == addrcnt)
+			return &laddr->a;
+	}
+
+	return NULL;
+}
+
+/* Copy one address into 'dest'.
+ *
+ * A wildcard address expands to the local address list.  Any other
+ * address is added only if it is in scope and its family is permitted by
+ * 'flags': IPv4 needs SCTP_ADDR4_PEERSUPP, IPv6 needs both
+ * SCTP_ADDR6_ALLOWED and SCTP_ADDR6_PEERSUPP.  Addresses filtered out
+ * are not an error; 0 is returned for them.
+ */
+static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
+			      union sctp_addr *addr,
+			      sctp_scope_t scope, gfp_t gfp,
+			      int flags)
+{
+	if (sctp_is_any(NULL, addr))
+		return sctp_copy_local_addr_list(dest, scope, gfp, flags);
+
+	if (!sctp_in_scope(addr, scope))
+		return 0;
+
+	/* Now that the address is in scope, check to see if
+	 * the address type is supported by local sock as
+	 * well as the remote peer.
+	 */
+	switch (addr->sa.sa_family) {
+	case AF_INET:
+		if (!(flags & SCTP_ADDR4_PEERSUPP))
+			return 0;
+		break;
+	case AF_INET6:
+		if (!(flags & SCTP_ADDR6_ALLOWED) ||
+		    !(flags & SCTP_ADDR6_PEERSUPP))
+			return 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return sctp_add_bind_addr(dest, addr, SCTP_ADDR_SRC, gfp);
+}
+
+/* Is this a wildcard address?
+ *
+ * The address family is taken from 'addr' unless it is AF_UNSPEC, in
+ * which case the family of 'sk' is used when a socket is given.  Returns
+ * 0 if no handler exists for the resulting family.
+ */
+int sctp_is_any(struct sock *sk, const union sctp_addr *addr)
+{
+	unsigned short fam = addr->sa.sa_family;
+	struct sctp_af *af;
+
+	/* Fall back to the socket's family for unspecified addresses */
+	if (fam == AF_UNSPEC)
+		fam = sk ? sk->sk_family : 0;
+
+	af = sctp_get_af_specific(fam);
+
+	return af ? af->is_any(addr) : 0;
+}
+
+/* Is 'addr' valid for 'scope'?  Returns 1 if it is, 0 otherwise. */
+int sctp_in_scope(const union sctp_addr *addr, sctp_scope_t scope)
+{
+	sctp_scope_t addr_scope = sctp_scope(addr);
+
+	/* The unusable SCTP addresses will not be considered with
+	 * any defined scopes.
+	 */
+	if (SCTP_SCOPE_UNUSABLE == addr_scope)
+		return 0;
+
+	/*
+	 * For INIT and INIT-ACK address list, let L be the level of
+	 * requested destination address, sender and receiver
+	 * SHOULD include all of its addresses with level greater
+	 * than or equal to L.
+	 *
+	 * Address scoping can be selectively controlled via sysctl
+	 * option
+	 */
+	switch (sctp_scope_policy) {
+	case SCTP_SCOPE_POLICY_DISABLE:
+		return 1;
+	case SCTP_SCOPE_POLICY_ENABLE:
+		return addr_scope <= scope;
+	case SCTP_SCOPE_POLICY_PRIVATE:
+		return addr_scope <= scope ||
+		       SCTP_SCOPE_PRIVATE == addr_scope;
+	case SCTP_SCOPE_POLICY_LINK:
+		return addr_scope <= scope ||
+		       SCTP_SCOPE_LINK == addr_scope;
+	default:
+		return 0;
+	}
+}
+
+/********************************************************************
+ * 3rd Level Abstractions
+ ********************************************************************/
+
+/* What is the scope of 'addr'?  Addresses of a family without a
+ * registered handler are reported as SCTP_SCOPE_UNUSABLE.
+ */
+sctp_scope_t sctp_scope(const union sctp_addr *addr)
+{
+	struct sctp_af *af = sctp_get_af_specific(addr->sa.sa_family);
+
+	if (af)
+		return af->scope((union sctp_addr *)addr);
+
+	return SCTP_SCOPE_UNUSABLE;
+}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
new file mode 100644
index 00000000..6c855645
--- /dev/null
+++ b/net/sctp/chunk.c
@@ -0,0 +1,364 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2003, 2004
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This file contains the code relating the chunk abstraction.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* This file is mostly in anticipation of future work, but is initially
+ * populated with fragment tracking for an outbound message.
+ */
+
+/* Initialize a datamsg in place: one reference, empty chunk list, no
+ * failure recorded, no expiry, not abandonable, delay allowed.
+ */
+static void sctp_datamsg_init(struct sctp_datamsg *msg)
+{
+	INIT_LIST_HEAD(&msg->chunks);
+	atomic_set(&msg->refcnt, 1);
+
+	msg->expires_at = 0;
+	msg->can_abandon = 0;
+	msg->can_delay = 1;
+
+	msg->send_error = 0;
+	msg->send_failed = 0;
+}
+
+/* Allocate and initialize a datamsg.  Returns NULL if the allocation
+ * fails.
+ */
+SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
+{
+	struct sctp_datamsg *msg = kmalloc(sizeof(*msg), gfp);
+
+	if (!msg)
+		return NULL;
+
+	sctp_datamsg_init(msg);
+	SCTP_DBG_OBJCNT_INC(datamsg);
+
+	return msg;
+}
+
+/* Free every chunk on the message's chunk list and drop one reference
+ * on the message itself.
+ */
+void sctp_datamsg_free(struct sctp_datamsg *msg)
+{
+	struct list_head *pos;
+
+	/* This doesn't have to be a _safe variant because
+	 * sctp_chunk_free() only drops the refs.
+	 */
+	list_for_each(pos, &msg->chunks)
+		sctp_chunk_free(list_entry(pos, struct sctp_chunk, frag_list));
+
+	sctp_datamsg_put(msg);
+}
+
+/* Final destruction of datamsg memory.
+ *
+ * Every chunk is unlinked and its reference dropped.  If the message was
+ * marked as failed and the socket subscribed to SCTP_SEND_FAILED, one
+ * such event is queued per chunk before the chunk is released.
+ */
+static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
+{
+	struct sctp_chunk *chunk, *next;
+	struct sctp_association *asoc = NULL;
+	struct sctp_ulpevent *ev;
+	struct sctp_sock *sp;
+	int error = 0;
+	int notify;
+
+	/* If we failed, we may need to notify: -1 means "not decided
+	 * yet", resolved on the first chunk below.
+	 */
+	notify = msg->send_failed ? -1 : 0;
+
+	/* Release all references. */
+	list_for_each_entry_safe(chunk, next, &msg->chunks, frag_list) {
+		list_del_init(&chunk->frag_list);
+
+		/* Check whether we _really_ need to notify. */
+		if (notify < 0) {
+			asoc = chunk->asoc;
+			error = msg->send_error ? msg->send_error :
+						  asoc->outqueue.error;
+
+			sp = sctp_sk(asoc->base.sk);
+			notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED,
+							    &sp->subscribe);
+		}
+
+		/* Generate a SEND FAILED event only if enabled. */
+		if (notify > 0) {
+			int sent = chunk->has_tsn ? SCTP_DATA_SENT :
+						    SCTP_DATA_UNSENT;
+
+			ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
+							    error, GFP_ATOMIC);
+			if (ev)
+				sctp_ulpq_tail_event(&asoc->ulpq, ev);
+		}
+
+		sctp_chunk_put(chunk);
+	}
+
+	SCTP_DBG_OBJCNT_DEC(datamsg);
+	kfree(msg);
+}
+
+/* Take one more reference on the message. */
+static void sctp_datamsg_hold(struct sctp_datamsg *msg)
+{
+	atomic_inc(&msg->refcnt);
+}
+
+/* Drop one reference on the message; the last reference destroys it. */
+void sctp_datamsg_put(struct sctp_datamsg *msg)
+{
+	if (!atomic_dec_and_test(&msg->refcnt))
+		return;
+
+	sctp_datamsg_destroy(msg);
+}
+
+/* Attach a chunk to this datamsg; the chunk's back pointer holds its
+ * own reference on the message.
+ */
+static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk)
+{
+	chunk->msg = msg;
+	sctp_datamsg_hold(msg);
+}
+
+
+/* A data chunk can have a maximum payload of (2^16 - 20).  Break
+ * down any such message into smaller chunks.  Opportunistically, fragment
+ * the chunks down to the current MTU constraints.  We may get refragmented
+ * later if the PMTU changes, but it is _much better_ to fragment immediately
+ * with a reasonable guess than always doing our fragmentation on the
+ * soft-interrupt.
+ *
+ * Returns the new datamsg holding the fragment chunks, or NULL if an
+ * allocation or the copy of the user data fails.  On failure every chunk
+ * created so far, including the one that was being filled in, is
+ * released together with the datamsg.
+ */
+struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
+					    struct sctp_sndrcvinfo *sinfo,
+					    struct msghdr *msgh, int msg_len)
+{
+	int max, whole, i, offset, over, err;
+	int len, first_len;
+	int max_data;
+	struct sctp_chunk *chunk;
+	struct sctp_datamsg *msg;
+	struct list_head *pos, *temp;
+	__u8 frag;
+
+	msg = sctp_datamsg_new(GFP_KERNEL);
+	if (!msg)
+		return NULL;
+
+	/* Note: Calculate this outside of the loop, so that all fragments
+	 * have the same expiration.
+	 */
+	if (sinfo->sinfo_timetolive) {
+		/* sinfo_timetolive is in milliseconds */
+		msg->expires_at = jiffies +
+				    msecs_to_jiffies(sinfo->sinfo_timetolive);
+		msg->can_abandon = 1;
+		SCTP_DEBUG_PRINTK("%s: msg:%p expires_at: %ld jiffies:%ld\n",
+				  __func__, msg, msg->expires_at, jiffies);
+	}
+
+	/* This is the biggest possible DATA chunk that can fit into
+	 * the packet
+	 */
+	max_data = asoc->pathmtu -
+		sctp_sk(asoc->base.sk)->pf->af->net_header_len -
+		sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
+
+	max = asoc->frag_point;
+	/* If the peer requested that we authenticate DATA chunks
+	 * we need to account for bundling of the AUTH chunks along with
+	 * DATA.
+	 */
+	if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) {
+		struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
+
+		if (hmac_desc)
+			max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) +
+					    hmac_desc->hmac_len);
+	}
+
+	/* Now, check if we need to reduce our max */
+	if (max > max_data)
+		max = max_data;
+
+	whole = 0;
+	first_len = max;
+
+	/* Check to see if we have a pending SACK and try to let it be bundled
+	 * with this message.  Do this if we don't have any data queued already.
+	 * To check that, look at out_qlen and retransmit list.
+	 * NOTE: we will not reduce to account for SACK, if the message would
+	 * not have been fragmented.
+	 */
+	if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) &&
+	    asoc->outqueue.out_qlen == 0 &&
+	    list_empty(&asoc->outqueue.retransmit) &&
+	    msg_len > max)
+		max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t));
+
+	/* Encourage Cookie-ECHO bundling. */
+	if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
+		max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN;
+
+	/* Now that we adjusted completely, reset first_len */
+	if (first_len > max_data)
+		first_len = max_data;
+
+	/* Account for a different sized first fragment */
+	if (msg_len >= first_len) {
+		msg_len -= first_len;
+		whole = 1;
+		msg->can_delay = 0;
+	}
+
+	/* How many full sized?  How many bytes leftover? */
+	whole += msg_len / max;
+	over = msg_len % max;
+	offset = 0;
+
+	if ((whole > 1) || (whole && over))
+		SCTP_INC_STATS_USER(SCTP_MIB_FRAGUSRMSGS);
+
+	/* Create chunks for all the full sized DATA chunks. */
+	for (i=0, len=first_len; i < whole; i++) {
+		frag = SCTP_DATA_MIDDLE_FRAG;
+
+		if (0 == i)
+			frag |= SCTP_DATA_FIRST_FRAG;
+
+		if ((i == (whole - 1)) && !over) {
+			frag |= SCTP_DATA_LAST_FRAG;
+
+			/* The application requests to set the I-bit of the
+			 * last DATA chunk of a user message when providing
+			 * the user message to the SCTP implementation.
+			 */
+			if ((sinfo->sinfo_flags & SCTP_EOF) ||
+			    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
+				frag |= SCTP_DATA_SACK_IMM;
+		}
+
+		chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0);
+
+		if (!chunk)
+			goto errout;
+
+		/* The chunk is not on msg->chunks yet, so a failed copy
+		 * must release it explicitly.
+		 */
+		err = sctp_user_addto_chunk(chunk, offset, len, msgh->msg_iov);
+		if (err < 0)
+			goto errout_chunk_free;
+
+		offset += len;
+
+		/* Put the chunk->skb back into the form expected by send.  */
+		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
+			   - (__u8 *)chunk->skb->data);
+
+		sctp_datamsg_assign(msg, chunk);
+		list_add_tail(&chunk->frag_list, &msg->chunks);
+
+		/* The first chunk was likely short to allow bundling,
+		 * so reset to full size.
+		 */
+		if (0 == i)
+			len = max;
+	}
+
+	/* .. now the leftover bytes. */
+	if (over) {
+		if (!whole)
+			frag = SCTP_DATA_NOT_FRAG;
+		else
+			frag = SCTP_DATA_LAST_FRAG;
+
+		if ((sinfo->sinfo_flags & SCTP_EOF) ||
+		    (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
+			frag |= SCTP_DATA_SACK_IMM;
+
+		chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0);
+
+		if (!chunk)
+			goto errout;
+
+		err = sctp_user_addto_chunk(chunk, offset, over,msgh->msg_iov);
+
+		/* Put the chunk->skb back into the form expected by send.  */
+		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr
+			   - (__u8 *)chunk->skb->data);
+
+		/* As above: this chunk has not been queued on the message,
+		 * so free it here before the common cleanup.
+		 */
+		if (err < 0)
+			goto errout_chunk_free;
+
+		sctp_datamsg_assign(msg, chunk);
+		list_add_tail(&chunk->frag_list, &msg->chunks);
+	}
+
+	return msg;
+
+errout_chunk_free:
+	/* Release the chunk that never made it onto msg->chunks */
+	sctp_chunk_free(chunk);
+
+errout:
+	list_for_each_safe(pos, temp, &msg->chunks) {
+		list_del_init(pos);
+		chunk = list_entry(pos, struct sctp_chunk, frag_list);
+		sctp_chunk_free(chunk);
+	}
+	sctp_datamsg_put(msg);
+	return NULL;
+}
+
+/* Check whether the message this chunk belongs to has expired.
+ * Returns 1 only if the message may be abandoned and its expiry time
+ * has passed, 0 otherwise.
+ */
+int sctp_chunk_abandoned(struct sctp_chunk *chunk)
+{
+	struct sctp_datamsg *msg = chunk->msg;
+
+	if (!msg->can_abandon)
+		return 0;
+
+	return time_after(jiffies, msg->expires_at) ? 1 : 0;
+}
+
+/* This chunk (and consequently entire message) has failed in its sending.
+ * The failure and its error code are recorded on the owning message.
+ */
+void sctp_chunk_fail(struct sctp_chunk *chunk, int error)
+{
+	struct sctp_datamsg *msg = chunk->msg;
+
+	msg->send_error = error;
+	msg->send_failed = 1;
+}
diff --git a/net/sctp/command.c b/net/sctp/command.c
new file mode 100644
index 00000000..c0044019
--- /dev/null
+++ b/net/sctp/command.c
@@ -0,0 +1,75 @@
+/* SCTP kernel implementation Copyright (C) 1999-2001
+ * Cisco, Motorola, and IBM
+ * Copyright 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp command sequences.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson <karl@athena.chicago.il.us>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Zero a caller-provided block of memory so it is an empty sequence. */
+int sctp_init_cmd_seq(sctp_cmd_seq_t *seq)
+{
+	memset(seq, 0, sizeof(*seq));
+	return 1;		/* Unconditional: nothing here can fail.  */
+}
+
+/* Append a command to a sctp_cmd_seq_t; BUG()s when it is already full. */
+void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, sctp_arg_t obj)
+{
+	sctp_cmd_t *slot;
+
+	BUG_ON(seq->next_free_slot >= SCTP_MAX_NUM_COMMANDS);
+	slot = &seq->cmds[seq->next_free_slot++];
+	slot->verb = verb;
+	slot->obj = obj;
+}
+
+/* Hand out the next unread command of a sctp_cmd_seq.
+ * Yields NULL once every queued command has been consumed.
+ */
+sctp_cmd_t *sctp_next_cmd(sctp_cmd_seq_t *seq)
+{
+	/* Nothing left between the read index and the next free slot. */
+	if (seq->next_cmd >= seq->next_free_slot)
+		return NULL;
+
+	/* Return the current command and advance the read index. */
+	return &seq->cmds[seq->next_cmd++];
+}
+
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
new file mode 100644
index 00000000..ec997cfe
--- /dev/null
+++ b/net/sctp/debug.c
@@ -0,0 +1,183 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This file converts numerical ID value to alphabetical names for SCTP
+ * terms such as chunk type, parameter time, event type, etc.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Xingang Guo           <xingang.guo@intel.com>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Daisy Chang	    <daisyc@us.ibm.com>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <net/sctp/sctp.h>
+
+#if SCTP_DEBUG
+int sctp_debug_flag = 1;	/* Initially enable DEBUG */
+#endif	/* SCTP_DEBUG */
+
+/* These are printable forms of Chunk ID's from section 3.1.  */
+static const char *const sctp_cid_tbl[SCTP_NUM_BASE_CHUNK_TYPES] = {
+	"DATA",
+	"INIT",
+	"INIT_ACK",
+	"SACK",
+	"HEARTBEAT",
+	"HEARTBEAT_ACK",
+	"ABORT",
+	"SHUTDOWN",
+	"SHUTDOWN_ACK",
+	"ERROR",
+	"COOKIE_ECHO",
+	"COOKIE_ACK",
+	"ECN_ECNE",
+	"ECN_CWR",
+	"SHUTDOWN_COMPLETE",
+};
+
+/* Look up the printable debug name of a chunk type.  Base chunk IDs
+ * come from sctp_cid_tbl; a few chunk types outside the base range are
+ * named explicitly and anything else reads "unknown chunk".
+ */
+const char *sctp_cname(const sctp_subtype_t cid)
+{
+	const char *name = "unknown chunk";
+
+	/* Base chunk IDs index straight into the table. */
+	if (cid.chunk <= SCTP_CID_BASE_MAX)
+		return sctp_cid_tbl[cid.chunk];
+
+	/* Chunk types handled outside the table. */
+	if (cid.chunk == SCTP_CID_ASCONF)
+		name = "ASCONF";
+	else if (cid.chunk == SCTP_CID_ASCONF_ACK)
+		name = "ASCONF_ACK";
+	else if (cid.chunk == SCTP_CID_FWD_TSN)
+		name = "FWD_TSN";
+	else if (cid.chunk == SCTP_CID_AUTH)
+		name = "AUTH";
+
+	/* Still the fallback string when nothing above matched. */
+	return name;
+}
+
+/* Printable forms of the states; sized by SCTP_STATE_NUM_STATES.  */
+const char *const sctp_state_tbl[SCTP_STATE_NUM_STATES] = {
+	"STATE_CLOSED",
+	"STATE_COOKIE_WAIT",
+	"STATE_COOKIE_ECHOED",
+	"STATE_ESTABLISHED",
+	"STATE_SHUTDOWN_PENDING",
+	"STATE_SHUTDOWN_SENT",
+	"STATE_SHUTDOWN_RECEIVED",
+	"STATE_SHUTDOWN_ACK_SENT",
+};
+
+/* Printable names of the event types that could change an association.  */
+const char *const sctp_evttype_tbl[] = {
+	"EVENT_T_unknown",
+	"EVENT_T_CHUNK",
+	"EVENT_T_TIMEOUT",
+	"EVENT_T_OTHER",
+	"EVENT_T_PRIMITIVE"
+};
+
+/* Printable names for the return value (disposition) of a state function */
+const char *const sctp_status_tbl[] = {
+	"DISPOSITION_DISCARD",
+	"DISPOSITION_CONSUME",
+	"DISPOSITION_NOMEM",
+	"DISPOSITION_DELETE_TCB",
+	"DISPOSITION_ABORT",
+	"DISPOSITION_VIOLATION",
+	"DISPOSITION_NOT_IMPL",
+	"DISPOSITION_ERROR",
+	"DISPOSITION_BUG"
+};
+
+/* Printable forms of primitives */
+static const char *const sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = {
+	"PRIMITIVE_ASSOCIATE",
+	"PRIMITIVE_SHUTDOWN",
+	"PRIMITIVE_ABORT",
+	"PRIMITIVE_SEND",
+	"PRIMITIVE_REQUESTHEARTBEAT",
+	"PRIMITIVE_ASCONF",
+};
+
+/* Return the printable name of a primitive, or a fallback string. */
+const char *sctp_pname(const sctp_subtype_t id)
+{
+	if (id.primitive > SCTP_EVENT_PRIMITIVE_MAX)
+		return "unknown_primitive";
+	return sctp_primitive_tbl[id.primitive];
+}
+
+static const char *const sctp_other_tbl[] = {
+	"NO_PENDING_TSN",
+	"ICMP_PROTO_UNREACH",
+};
+
+/* Return the printable name of an "other" event, or a fallback string. */
+const char *sctp_oname(const sctp_subtype_t id)
+{
+	if (id.other > SCTP_EVENT_OTHER_MAX)
+		return "unknown 'other' event";
+	return sctp_other_tbl[id.other];
+}
+
+static const char *const sctp_timer_tbl[] = {
+	"TIMEOUT_NONE",
+	"TIMEOUT_T1_COOKIE",
+	"TIMEOUT_T1_INIT",
+	"TIMEOUT_T2_SHUTDOWN",
+	"TIMEOUT_T3_RTX",
+	"TIMEOUT_T4_RTO",
+	"TIMEOUT_T5_SHUTDOWN_GUARD",
+	"TIMEOUT_HEARTBEAT",
+	"TIMEOUT_SACK",
+	"TIMEOUT_AUTOCLOSE",
+};
+
+/* Return the printable name of a timeout event, or a fallback string. */
+const char *sctp_tname(const sctp_subtype_t id)
+{
+	if (id.timeout > SCTP_EVENT_TIMEOUT_MAX)
+		return "unknown_timer";
+	return sctp_timer_tbl[id.timeout];
+}
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
new file mode 100644
index 00000000..c8cc24e2
--- /dev/null
+++ b/net/sctp/endpointola.c
@@ -0,0 +1,497 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2002 International Business Machines, Corp.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This abstraction represents an SCTP endpoint.
+ *
+ * The SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * The SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson <karl@athena.chicago.il.us>
+ *    Jon Grimm <jgrimm@austin.ibm.com>
+ *    Daisy Chang <daisyc@us.ibm.com>
+ *    Dajiang Zhang <dajiang.zhang@nokia.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/random.h>	/* get_random_bytes() */
+#include <linux/crypto.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helpers. */
+static void sctp_endpoint_bh_rcv(struct work_struct *work);
+
+/*
+ * Initialize the base fields of the endpoint structure.
+ * Returns ep on success.  On allocation failure returns NULL with the
+ * buffers allocated here freed again and no reference held on sk.
+ */
+static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
+						struct sock *sk,
+						gfp_t gfp)
+{
+	struct sctp_hmac_algo_param *auth_hmacs = NULL;
+	struct sctp_chunks_param *auth_chunks = NULL;
+	struct sctp_shared_key *null_key;
+	int err;
+
+	ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
+	if (!ep->digest)
+		return NULL;
+
+	if (sctp_auth_enable) {
+		/* Allocate space for HMACS and CHUNKS authentication
+		 * variables.  There are arrays that we encode directly
+		 * into parameters to make the rest of the operations easier.
+		 */
+		auth_hmacs = kzalloc(sizeof(sctp_hmac_algo_param_t) +
+				sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp);
+		if (!auth_hmacs)
+			goto nomem;
+
+		auth_chunks = kzalloc(sizeof(sctp_chunks_param_t) +
+					SCTP_NUM_CHUNK_TYPES, gfp);
+		if (!auth_chunks)
+			goto nomem;
+
+		/* Initialize the HMACS parameter.
+		 * SCTP-AUTH: Section 3.3
+		 *    Every endpoint supporting SCTP chunk authentication MUST
+		 *    support the HMAC based on the SHA-1 algorithm.
+		 */
+		auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO;
+		auth_hmacs->param_hdr.length =
+					htons(sizeof(sctp_paramhdr_t) + 2);
+		auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1);
+
+		/* Initialize the CHUNKS parameter */
+		auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS;
+		auth_chunks->param_hdr.length = htons(sizeof(sctp_paramhdr_t));
+
+		/* If the Add-IP functionality is enabled, we must
+		 * authenticate, ASCONF and ASCONF-ACK chunks
+		 */
+		if (sctp_addip_enable) {
+			auth_chunks->chunks[0] = SCTP_CID_ASCONF;
+			auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK;
+			auth_chunks->param_hdr.length =
+					htons(sizeof(sctp_paramhdr_t) + 2);
+		}
+	}
+
+	/* Initialize the base structure.  What type of endpoint are we?  */
+	ep->base.type = SCTP_EP_TYPE_SOCKET;
+
+	/* Initialize the basic object fields. */
+	atomic_set(&ep->base.refcnt, 1);
+	ep->base.dead = 0;
+	ep->base.malloced = 1;
+
+	/* Create an input queue.  */
+	sctp_inq_init(&ep->base.inqueue);
+
+	/* Set its top-half handler */
+	sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv);
+
+	/* Initialize the bind addr area */
+	sctp_bind_addr_init(&ep->base.bind_addr, 0);
+
+	/* Remember who we are attached to; the hold is taken at the end. */
+	ep->base.sk = sk;
+
+	/* Create the lists of associations.  */
+	INIT_LIST_HEAD(&ep->asocs);
+
+	/* Use SCTP specific send buffer space queues.  */
+	ep->sndbuf_policy = sctp_sndbuf_policy;
+
+	sk->sk_data_ready = sctp_data_ready;
+	sk->sk_write_space = sctp_write_space;
+	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+	/* Get the receive buffer policy for this endpoint */
+	ep->rcvbuf_policy = sctp_rcvbuf_policy;
+
+	/* Initialize the secret key used with cookie. */
+	get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE);
+	ep->last_key = ep->current_key = 0;
+	ep->key_changed_at = jiffies;
+
+	/* SCTP-AUTH extensions: the null key uses the caller's gfp mask too. */
+	INIT_LIST_HEAD(&ep->endpoint_shared_keys);
+	null_key = sctp_auth_shkey_create(0, gfp);
+	if (!null_key)
+		goto nomem;
+
+	list_add(&null_key->key_list, &ep->endpoint_shared_keys);
+
+	/* Allocate and initialize transforms arrays for supported HMACs. */
+	err = sctp_auth_init_hmacs(ep, gfp);
+	if (err)
+		goto nomem_hmacs;
+
+	/* Set the hmacs and chunks pointers. */
+	ep->auth_hmacs_list = auth_hmacs;
+	ep->auth_chunk_list = auth_chunks;
+
+	/* Nothing can fail any more, so no error path can leak this hold. */
+	sock_hold(ep->base.sk);
+
+	return ep;
+
+nomem_hmacs:
+	sctp_auth_destroy_keys(&ep->endpoint_shared_keys);
+nomem:
+	/* Free all allocations */
+	kfree(auth_hmacs);
+	kfree(auth_chunks);
+	kfree(ep->digest);
+	return NULL;
+}
+
+/* Allocate an sctp_endpoint for sk and run sctp_endpoint_init() on it.
+ * Returns the new endpoint, or NULL when either step fails.
+ */
+struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp)
+{
+	struct sctp_endpoint *ep;
+
+	/* Build a local endpoint. */
+	ep = t_new(struct sctp_endpoint, gfp);
+	if (!ep)
+		return NULL;
+
+	/* Init failed: release the allocation made above. */
+	if (!sctp_endpoint_init(ep, sk, gfp)) {
+		kfree(ep);
+		return NULL;
+	}
+
+	ep->base.malloced = 1;
+	SCTP_DBG_OBJCNT_INC(ep);
+	return ep;
+}
+
+/* Link a non-temporary association into the endpoint's asocs list.  */
+void sctp_endpoint_add_asoc(struct sctp_endpoint *ep,
+			    struct sctp_association *asoc)
+{
+	struct sock *sk = ep->base.sk;
+
+	/* Temporary associations are removed again shortly and must not
+	 * be found by anyone, so only the others are queued and counted.
+	 */
+	if (!asoc->temp) {
+		/* Append it to our list of asocs. */
+		list_add_tail(&asoc->asocs, &ep->asocs);
+
+		/* A TCP-style listening socket also bumps its backlog
+		 * counter for the new association.
+		 */
+		if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+			sk->sk_ack_backlog++;
+	}
+}
+
+/* Mark the endpoint dead, close its socket state, unhash it and drop a
+ * reference.  Cleanup is delayed until all users have released theirs.
+ */
+void sctp_endpoint_free(struct sctp_endpoint *ep)
+{
+	ep->base.dead = 1;	/* sctp_endpoint_destroy() asserts this */
+
+	ep->base.sk->sk_state = SCTP_SS_CLOSED;
+
+	/* Unlink this endpoint, so we can't find it again! */
+	sctp_unhash_endpoint(ep);
+
+	sctp_endpoint_put(ep);	/* destroys ep if this was the last ref */
+}
+
+/* Final destructor for endpoint; reached from sctp_endpoint_put().  */
+static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
+{
+	SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return);
+
+	/* Free up the HMAC transform. */
+	crypto_free_hash(sctp_sk(ep->base.sk)->hmac);
+
+	/* Free the digest buffer */
+	kfree(ep->digest);
+
+	/* SCTP-AUTH: Free up AUTH related data such as shared keys,
+	 * chunks and hmacs arrays that were allocated
+	 */
+	sctp_auth_destroy_keys(&ep->endpoint_shared_keys);
+	kfree(ep->auth_hmacs_list);
+	kfree(ep->auth_chunk_list);
+
+	/* AUTH - Free any allocated HMAC transform containers */
+	sctp_auth_destroy_hmacs(ep->auth_hmacs);
+
+	/* Cleanup. */
+	sctp_inq_free(&ep->base.inqueue);
+	sctp_bind_addr_free(&ep->base.bind_addr);
+
+	/* Remove and free the port */
+	if (sctp_sk(ep->base.sk)->bind_hash)
+		sctp_put_port(ep->base.sk);
+
+	/* Give up our hold on the sock. */
+	if (ep->base.sk)	/* NOTE(review): already dereferenced above */
+		sock_put(ep->base.sk);
+
+	/* Finally, free up our memory. */
+	if (ep->base.malloced) {
+		kfree(ep);
+		SCTP_DBG_OBJCNT_DEC(ep);	/* presumably a counter name */
+	}
+}
+
+/* Take an extra reference on an endpoint; pair with sctp_endpoint_put(). */
+void sctp_endpoint_hold(struct sctp_endpoint *ep)
+{
+	atomic_inc(&ep->base.refcnt);
+}
+
+/* Drop one reference; the last one out runs sctp_endpoint_destroy(). */
+void sctp_endpoint_put(struct sctp_endpoint *ep)
+{
+	/* Other holders remain, so there is nothing to tear down yet. */
+	if (!atomic_dec_and_test(&ep->base.refcnt))
+		return;
+	sctp_endpoint_destroy(ep);
+}
+
+/* Return ep when its port equals laddr's and its bind address matches.  */
+struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
+					       const union sctp_addr *laddr)
+{
+	/* The port comparison is done on the htons() form of our port. */
+	if (htons(ep->base.bind_addr.port) != laddr->v4.sin_port)
+		return NULL;
+
+	/* Same port: let sctp_bind_addr_match() decide on the address. */
+	if (!sctp_bind_addr_match(&ep->base.bind_addr, laddr,
+				  sctp_sk(ep->base.sk)))
+		return NULL;
+	return ep;
+}
+
+/* Find the association of this endpoint that has paddr as a peer address.
+ * We do a linear search of the hash chain chosen by the local/peer ports.
+ * The matching transport is returned in *transport (NULL if none found).
+ */
+static struct sctp_association *__sctp_endpoint_lookup_assoc(
+	const struct sctp_endpoint *ep,
+	const union sctp_addr *paddr,
+	struct sctp_transport **transport)
+{
+	struct sctp_association *asoc = NULL;
+	struct sctp_association *tmp;
+	struct sctp_transport *t = NULL;
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+	struct hlist_node *node;
+	int hash;
+	int rport;
+
+	*transport = NULL;
+
+	/* If the local port is not set, there can't be any associations
+	 * on this endpoint.
+	 */
+	if (!ep->base.bind_addr.port)
+		goto out;
+
+	rport = ntohs(paddr->v4.sin_port);	/* ntohs() form of peer port */
+
+	hash = sctp_assoc_hashfn(ep->base.bind_addr.port, rport);
+	head = &sctp_assoc_hashtable[hash];
+	read_lock(&head->lock);
+	sctp_for_each_hentry(epb, node, &head->chain) {
+		tmp = sctp_assoc(epb);
+		if (tmp->ep != ep || rport != tmp->peer.port)
+			continue;	/* other endpoint or other peer port */
+
+		t = sctp_assoc_lookup_paddr(tmp, paddr);
+		if (t) {
+			asoc = tmp;
+			*transport = t;
+			break;
+		}
+	}
+	read_unlock(&head->lock);
+out:
+	return asoc;
+}
+
+/* Lookup association on an endpoint based on a peer address.  BH-safe.  */
+struct sctp_association *sctp_endpoint_lookup_assoc(
+	const struct sctp_endpoint *ep,
+	const union sctp_addr *paddr,
+	struct sctp_transport **transport)
+{
+	struct sctp_association *asoc;
+
+	sctp_local_bh_disable();	/* kept across the whole lookup */
+	asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport);
+	sctp_local_bh_enable();
+
+	return asoc;	/* NULL when no association matched */
+}
+
+/* Look for any peeled off association from the endpoint that matches the
+ * given peer address.  Returns 1 if one is found and 0 otherwise.
+ */
+int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
+				const union sctp_addr *paddr)
+{
+	struct sctp_bind_addr *bound = &ep->base.bind_addr;
+	struct sctp_sockaddr_entry *entry;
+	int found = 0;
+
+	/* This function is called with the socket lock held,
+	 * so the address_list can not change.
+	 */
+	list_for_each_entry(entry, &bound->address_list, list) {
+		found = sctp_has_association(&entry->a, paddr) ? 1 : 0;
+		if (found)
+			break;
+	}
+	return found;
+}
+
+/* Do delayed input processing: feed every chunk on the endpoint's inqueue
+ * to the state machine.  Scheduled by sctp_rcv(); may run on BH or task time.
+ */
+static void sctp_endpoint_bh_rcv(struct work_struct *work)
+{
+	struct sctp_endpoint *ep =
+		container_of(work, struct sctp_endpoint,
+			     base.inqueue.immediate);
+	struct sctp_association *asoc;
+	struct sock *sk;
+	struct sctp_transport *transport;
+	struct sctp_chunk *chunk;
+	struct sctp_inq *inqueue;
+	sctp_subtype_t subtype;
+	sctp_state_t state;
+	int error = 0;
+	int first_time = 1;	/* is this the first time through the loop */
+
+	if (ep->base.dead)
+		return;
+
+	asoc = NULL;
+	inqueue = &ep->base.inqueue;
+	sk = ep->base.sk;
+
+	while (NULL != (chunk = sctp_inq_pop(inqueue))) {
+		subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);
+
+		/* If the first chunk in the packet is AUTH, do special
+		 * processing specified in Section 6.3 of SCTP-AUTH spec
+		 */
+		if (first_time && (subtype.chunk == SCTP_CID_AUTH)) {
+			struct sctp_chunkhdr *next_hdr;
+
+			next_hdr = sctp_inq_peek(inqueue);
+			if (!next_hdr)
+				goto normal;
+
+			/* If the next chunk is COOKIE-ECHO, skip the AUTH
+			 * chunk while saving a clone of its skb so we can do
+			 * Authentication later (during cookie-echo
+			 * processing).  NOTE(review): clone is not checked.
+			 */
+			if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
+				chunk->auth_chunk = skb_clone(chunk->skb,
+								GFP_ATOMIC);
+				chunk->auth = 1;
+				continue;
+			}
+		}
+normal:
+		/* We might have grown an association since last we
+		 * looked, so try again.
+		 *
+		 * This happens when we've just processed our
+		 * COOKIE-ECHO chunk.
+		 */
+		if (NULL == chunk->asoc) {
+			asoc = sctp_endpoint_lookup_assoc(ep,
+							  sctp_source(chunk),
+							  &transport);
+			chunk->asoc = asoc;
+			chunk->transport = transport;
+		}
+
+		state = asoc ? asoc->state : SCTP_STATE_CLOSED;
+		if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
+			continue;	/* presumably: AUTH required -- verify */
+
+		/* Remember where the last DATA chunk came from so we
+		 * know where to send the SACK.
+		 */
+		if (asoc && sctp_chunk_is_data(chunk))
+			asoc->peer.last_data_from = chunk->transport;
+		else
+			SCTP_INC_STATS(SCTP_MIB_INCTRLCHUNKS);
+
+		if (chunk->transport)
+			chunk->transport->last_time_heard = jiffies;
+
+		error = sctp_do_sm(SCTP_EVENT_T_CHUNK, subtype, state,
+				   ep, asoc, chunk, GFP_ATOMIC);
+
+		if (error && chunk)
+			chunk->pdiscard = 1;
+
+		/* Check to see if the endpoint is freed in response to
+		 * the incoming chunk. If so, get out of the while loop.
+		 */
+		if (!sctp_sk(sk)->ep)
+			break;
+
+		if (first_time)
+			first_time = 0;
+	}
+}
diff --git a/net/sctp/input.c b/net/sctp/input.c
new file mode 100644
index 00000000..741ed164
--- /dev/null
+++ b/net/sctp/input.c
@@ -0,0 +1,1139 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 International Business Machines, Corp.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions handle all input from the IP layer into SCTP.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson <karl@athena.chicago.il.us>
+ *    Xingang Guo <xingang.guo@intel.com>
+ *    Jon Grimm <jgrimm@us.ibm.com>
+ *    Hui Huang <hui.huang@nokia.com>
+ *    Daisy Chang <daisyc@us.ibm.com>
+ *    Sridhar Samudrala <sri@us.ibm.com>
+ *    Ardelle Fan <ardelle.fan@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <linux/list.h> /* For struct list_head */
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/time.h> /* For struct timeval */
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/checksum.h>
+#include <net/net_namespace.h>
+
+/* Forward declarations for internal helpers. */
+static int sctp_rcv_ootb(struct sk_buff *);
+static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb,
+				      const union sctp_addr *laddr,
+				      const union sctp_addr *paddr,
+				      struct sctp_transport **transportp);
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(const union sctp_addr *laddr);
+static struct sctp_association *__sctp_lookup_association(
+					const union sctp_addr *local,
+					const union sctp_addr *peer,
+					struct sctp_transport **pt);
+
+static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb);
+
+
+/* Calculate the SCTP checksum of an SCTP packet and compare it with the
+ * one carried in the header.  Returns 0 on a match, -1 on a mismatch.
+ */
+static inline int sctp_rcv_checksum(struct sk_buff *skb)
+{
+	struct sctphdr *hdr = sctp_hdr(skb);
+	__le32 wire_sum = hdr->checksum;
+	struct sk_buff *frag;
+	__u32 crc;
+
+	crc = sctp_start_cksum((__u8 *)hdr, skb_headlen(skb));
+	skb_walk_frags(skb, frag)
+		crc = sctp_update_cksum((__u8 *)frag->data,
+					skb_headlen(frag), crc);
+
+	if (sctp_end_cksum(crc) == wire_sum)
+		return 0;
+
+	/* CRC failure, dump it. */
+	SCTP_INC_STATS_BH(SCTP_MIB_CHECKSUMERRORS);
+	return -1;
+}
+
+struct sctp_input_cb {	/* overlaid on skb->cb, see SCTP_INPUT_CB() */
+	union {
+		struct inet_skb_parm	h4;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		struct inet6_skb_parm	h6;
+#endif
+	} header;
+	struct sctp_chunk *chunk;	/* set by sctp_rcv() for this skb */
+};
+#define SCTP_INPUT_CB(__skb)	((struct sctp_input_cb *)&((__skb)->cb[0]))
+
+/* This is the routine which IP calls when receiving an SCTP packet.
+ * It always returns 0; rejected packets are freed under discard_it.
+ */
+int sctp_rcv(struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct sctp_association *asoc;
+	struct sctp_endpoint *ep = NULL;
+	struct sctp_ep_common *rcvr;
+	struct sctp_transport *transport = NULL;
+	struct sctp_chunk *chunk;
+	struct sctphdr *sh;
+	union sctp_addr src;
+	union sctp_addr dest;
+	int family;
+	struct sctp_af *af;
+
+	if (skb->pkt_type!=PACKET_HOST)
+		goto discard_it;
+
+	SCTP_INC_STATS_BH(SCTP_MIB_INSCTPPACKS);
+
+	if (skb_linearize(skb))
+		goto discard_it;
+
+	sh = sctp_hdr(skb);
+
+	/* Pull up the IP and SCTP headers. */
+	__skb_pull(skb, skb_transport_offset(skb));
+	if (skb->len < sizeof(struct sctphdr))
+		goto discard_it;
+	if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) &&
+		  sctp_rcv_checksum(skb) < 0)
+		goto discard_it;
+
+	skb_pull(skb, sizeof(struct sctphdr));
+
+	/* Make sure we at least have chunk headers worth of data left. */
+	if (skb->len < sizeof(struct sctp_chunkhdr))
+		goto discard_it;
+
+	family = ipver2af(ip_hdr(skb)->version);
+	af = sctp_get_af_specific(family);
+	if (unlikely(!af))
+		goto discard_it;
+
+	/* Initialize local addresses for lookups. */
+	af->from_skb(&src, skb, 1);
+	af->from_skb(&dest, skb, 0);
+
+	/* If the packet is to or from a non-unicast address,
+	 * silently discard the packet.
+	 *
+	 * This is not clearly defined in the RFC except in section
+	 * 8.4 - OOTB handling.  However, based on the book "Stream Control
+	 * Transmission Protocol" 2.1, "It is important to note that the
+	 * IP address of an SCTP transport address must be a routable
+	 * unicast address.  In other words, IP multicast addresses and
+	 * IP broadcast addresses cannot be used in an SCTP transport
+	 * address."
+	 */
+	if (!af->addr_valid(&src, NULL, skb) ||
+	    !af->addr_valid(&dest, NULL, skb))
+		goto discard_it;
+
+	asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
+
+	if (!asoc)
+		ep = __sctp_rcv_lookup_endpoint(&dest);
+
+	/* Pick the common input substructure; assumes ep != NULL if !asoc. */
+	rcvr = asoc ? &asoc->base : &ep->base;
+	sk = rcvr->sk;
+
+	/*
+	 * If a frame arrives on an interface and the receiving socket is
+	 * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
+	 */
+	if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb)))
+	{
+		if (asoc) {
+			sctp_association_put(asoc);
+			asoc = NULL;
+		} else {
+			sctp_endpoint_put(ep);
+			ep = NULL;
+		}
+		sk = sctp_get_ctl_sock();
+		ep = sctp_sk(sk)->ep;
+		sctp_endpoint_hold(ep);
+		rcvr = &ep->base;
+	}
+
+	/*
+	 * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
+	 * An SCTP packet is called an "out of the blue" (OOTB)
+	 * packet if it is correctly formed, i.e., passed the
+	 * receiver's checksum check, but the receiver is not
+	 * able to identify the association to which this
+	 * packet belongs.
+	 */
+	if (!asoc) {
+		if (sctp_rcv_ootb(skb)) {
+			SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES);
+			goto discard_release;
+		}
+	}
+
+	if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family))
+		goto discard_release;
+	nf_reset(skb);
+
+	if (sk_filter(sk, skb))
+		goto discard_release;
+
+	/* Create an SCTP packet structure. */
+	chunk = sctp_chunkify(skb, asoc, sk);
+	if (!chunk)
+		goto discard_release;
+	SCTP_INPUT_CB(skb)->chunk = chunk;
+
+	/* Remember what endpoint is to handle this packet. */
+	chunk->rcvr = rcvr;
+
+	/* Remember the SCTP header. */
+	chunk->sctp_hdr = sh;
+
+	/* Set the source and destination addresses of the incoming chunk.  */
+	sctp_init_addrs(chunk, &src, &dest);
+
+	/* Remember where we came from.  */
+	chunk->transport = transport;
+
+	/* Acquire access to the sock lock. Note: We are safe from other
+	 * bottom halves on this lock, but a user may be in the lock too,
+	 * so check if it is busy.
+	 */
+	sctp_bh_lock_sock(sk);
+
+	if (sk != rcvr->sk) {
+		/* Our cached sk is different from the rcvr->sk.  This is
+		 * because migrate()/accept() may have moved the association
+		 * to a new socket and released all the sockets.  So now we
+		 * are holding a lock on the old socket while the user may
+		 * be doing something with the new socket.  Switch our view
+		 * of the current sk.
+		 */
+		sctp_bh_unlock_sock(sk);
+		sk = rcvr->sk;
+		sctp_bh_lock_sock(sk);
+	}
+
+	if (sock_owned_by_user(sk)) {
+		if (sctp_add_backlog(sk, skb)) {
+			sctp_bh_unlock_sock(sk);
+			sctp_chunk_free(chunk);
+			skb = NULL; /* sctp_chunk_free already freed the skb */
+			goto discard_release;
+		}
+		SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_BACKLOG);
+	} else {
+		SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_SOFTIRQ);
+		sctp_inq_push(&chunk->rcvr->inqueue, chunk);
+	}
+
+	sctp_bh_unlock_sock(sk);
+
+	/* Release the asoc/ep ref we took in the lookup calls. */
+	if (asoc)
+		sctp_association_put(asoc);
+	else
+		sctp_endpoint_put(ep);
+
+	return 0;
+
+discard_it:
+	SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_DISCARDS);
+	kfree_skb(skb);
+	return 0;
+
+discard_release:
+	/* Release the asoc/ep ref we took in the lookup calls. */
+	if (asoc)
+		sctp_association_put(asoc);
+	else
+		sctp_endpoint_put(ep);
+
+	goto discard_it;
+}
+
+/* Process the backlog queue of the socket.  Every skb on
+ * the backlog holds a ref on an association or endpoint.
+ * We hold this ref throughout the state machine to make
+ * sure that the structure we need is still around.
+ */
+int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+	struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
+	struct sctp_ep_common *rcvr = NULL;
+	int backloged = 0;	/* set when the skb is queued again below */
+
+	rcvr = chunk->rcvr;
+
+	/* If the rcvr is dead then the association or endpoint
+	 * has been deleted and we can safely drop the chunk
+	 * and refs that we are holding.
+	 */
+	if (rcvr->dead) {
+		sctp_chunk_free(chunk);
+		goto done;
+	}
+
+	if (unlikely(rcvr->sk != sk)) {
+		/* In this case, the association moved from one socket to
+		 * another.  We are currently sitting on the backlog of the
+		 * old socket, so we need to move.
+		 * However, since we are here in the process context we
+		 * need to make sure that the user doesn't own
+		 * the new socket when we process the packet.
+		 * If the new socket is user-owned, queue the chunk to the
+		 * backlog of the new socket without dropping any refs.
+		 * Otherwise, we can safely push the chunk on the inqueue.
+		 */
+
+		sk = rcvr->sk;
+		sctp_bh_lock_sock(sk);
+
+		if (sock_owned_by_user(sk)) {
+			if (sk_add_backlog(sk, skb))
+				sctp_chunk_free(chunk);
+			else
+				backloged = 1;
+		} else
+			sctp_inq_push(inqueue, chunk);
+
+		sctp_bh_unlock_sock(sk);
+
+		/* If the chunk was backloged again, don't drop refs */
+		if (backloged)
+			return 0;
+	} else {
+		sctp_inq_push(inqueue, chunk);
+	}
+
+done:
+	/* Release the refs we took in sctp_add_backlog */
+	if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type)
+		sctp_association_put(sctp_assoc(rcvr));
+	else if (SCTP_EP_TYPE_SOCKET == rcvr->type)
+		sctp_endpoint_put(sctp_ep(rcvr));
+	else
+		BUG();
+
+	return 0;	/* every path reports success to the caller */
+}
+
+/* Queue an skb on the socket backlog and, on success, pin its receiver.
+ *
+ * Returns the result of sk_add_backlog(); a non-zero value means the skb
+ * was not queued and no reference was taken.
+ */
+static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb)
+{
+	struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+	struct sctp_ep_common *rcvr = chunk->rcvr;
+	int err = sk_add_backlog(sk, skb);
+
+	if (err)
+		return err;
+
+	/* Hold the assoc/ep while the skb hangs on the backlog queue so
+	 * that the structures we need cannot disappear underneath us.
+	 */
+	switch (rcvr->type) {
+	case SCTP_EP_TYPE_ASSOCIATION:
+		sctp_association_hold(sctp_assoc(rcvr));
+		break;
+	case SCTP_EP_TYPE_SOCKET:
+		sctp_endpoint_hold(sctp_ep(rcvr));
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+/* Handle icmp frag needed error. */
+void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
+			   struct sctp_transport *t, __u32 pmtu)
+{
+	if (!t || (t->pathmtu <= pmtu))
+		return;
+
+	if (sock_owned_by_user(sk)) {
+		asoc->pmtu_pending = 1;
+		t->pmtu_pending = 1;
+		return;
+	}
+
+	if (t->param_flags & SPP_PMTUD_ENABLE) {
+		/* Update transports view of the MTU */
+		sctp_transport_update_pmtu(t, pmtu);
+
+		/* Update association pmtu. */
+		sctp_assoc_sync_pmtu(asoc);
+	}
+
+	/* Retransmit with the new pmtu setting.
+	 * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
+	 * Needed will never be sent, but if a message was sent before
+	 * PMTU discovery was disabled that was larger than the PMTU, it
+	 * would not be fragmented, so it must be re-transmitted fragmented.
+	 */
+	sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
+}
+
+/*
+ * SCTP Implementer's Guide, 2.37 ICMP handling procedures
+ *
+ * ICMP8) If the ICMP code is a "Unrecognized next header type encountered"
+ *        or a "Protocol Unreachable" treat this message as an abort
+ *        with the T bit set.
+ *
+ * This function sends an event to the state machine, which will abort the
+ * association.
+ *
+ */
+void sctp_icmp_proto_unreachable(struct sock *sk,
+			   struct sctp_association *asoc,
+			   struct sctp_transport *t)
+{
+	SCTP_DEBUG_PRINTK("%s\n",  __func__);
+
+	if (sock_owned_by_user(sk)) {
+		/* The user owns the socket, so the event cannot be fed to
+		 * the state machine now.  Defer it through the transport's
+		 * proto_unreach_timer unless that timer is already pending.
+		 */
+		if (timer_pending(&t->proto_unreach_timer))
+			return;
+
+		/* A zero return from mod_timer() means the timer was not
+		 * pending before; in that case take an association ref.
+		 */
+		if (!mod_timer(&t->proto_unreach_timer,
+			       jiffies + (HZ/20)))
+			sctp_association_hold(asoc);
+		return;
+	}
+
+	/* Cancel a deferred run, if any, and drop the ref it was holding. */
+	if (timer_pending(&t->proto_unreach_timer) &&
+	    del_timer(&t->proto_unreach_timer))
+		sctp_association_put(asoc);
+
+	sctp_do_sm(SCTP_EVENT_T_OTHER,
+		   SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
+		   asoc->state, asoc->ep, asoc, t,
+		   GFP_ATOMIC);
+}
+
+/* Common lookup code for icmp/icmpv6 error handler. */
+struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
+			     struct sctphdr *sctphdr,
+			     struct sctp_association **app,
+			     struct sctp_transport **tpp)
+{
+	union sctp_addr saddr;
+	union sctp_addr daddr;
+	struct sctp_af *af;
+	struct sock *sk = NULL;
+	struct sctp_association *asoc;
+	struct sctp_transport *transport = NULL;
+	struct sctp_init_chunk *chunkhdr;
+	__u32 vtag = ntohl(sctphdr->vtag);
+	int len = skb->len - ((void *)sctphdr - (void *)skb->data);
+
+	*app = NULL; *tpp = NULL;
+
+	af = sctp_get_af_specific(family);
+	if (unlikely(!af)) {
+		return NULL;
+	}
+
+	/* Initialize local addresses for lookups. */
+	af->from_skb(&saddr, skb, 1);
+	af->from_skb(&daddr, skb, 0);
+
+	/* Look for an association that matches the incoming ICMP error
+	 * packet.
+	 */
+	asoc = __sctp_lookup_association(&saddr, &daddr, &transport);
+	if (!asoc)
+		return NULL;
+
+	sk = asoc->base.sk;
+
+	/* RFC 4960, Appendix C. ICMP Handling
+	 *
+	 * ICMP6) An implementation MUST validate that the Verification Tag
+	 * contained in the ICMP message matches the Verification Tag of
+	 * the peer.  If the Verification Tag is not 0 and does NOT
+	 * match, discard the ICMP message.  If it is 0 and the ICMP
+	 * message contains enough bytes to verify that the chunk type is
+	 * an INIT chunk and that the Initiate Tag matches the tag of the
+	 * peer, continue with ICMP7.  If the ICMP message is too short
+	 * or the chunk type or the Initiate Tag does not match, silently
+	 * discard the packet.
+	 */
+	if (vtag == 0) {
+		chunkhdr = (struct sctp_init_chunk *)((void *)sctphdr
+				+ sizeof(struct sctphdr));
+		if (len < sizeof(struct sctphdr) + sizeof(sctp_chunkhdr_t)
+			  + sizeof(__be32) ||
+		    chunkhdr->chunk_hdr.type != SCTP_CID_INIT ||
+		    ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) {
+			goto out;
+		}
+	} else if (vtag != asoc->c.peer_vtag) {
+		goto out;
+	}
+
+	sctp_bh_lock_sock(sk);
+
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 */
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(&init_net, LINUX_MIB_LOCKDROPPEDICMPS);
+
+	*app = asoc;
+	*tpp = transport;
+	return sk;
+
+out:
+	if (asoc)
+		sctp_association_put(asoc);
+	return NULL;
+}
+
+/* Common cleanup code for the icmp/icmpv6 error handlers.
+ *
+ * Releases the socket bh lock and, when @asoc is not NULL, drops one
+ * reference on the association.  This mirrors what sctp_err_lookup()
+ * leaves held when it returns a socket.
+ */
+void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
+{
+	sctp_bh_unlock_sock(sk);
+	if (asoc)
+		sctp_association_put(asoc);
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  If err < 0 then the socket should
+ * be closed and the error returned to the user.  If err > 0
+ * it's just the icmp type << 8 | icmp code.  After adjustment
+ * header points to the first 8 bytes of the sctp header.  We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+void sctp_v4_err(struct sk_buff *skb, __u32 info)
+{
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const int ihlen = iph->ihl * 4;
+	const int type = icmp_hdr(skb)->type;
+	const int code = icmp_hdr(skb)->code;
+	struct sctp_association *asoc = NULL;
+	struct sctp_transport *transport;
+	sk_buff_data_t saved_nh, saved_th;
+	struct sock *sk;
+	int err;
+
+	/* The embedded IP header must be followed by at least 8 bytes. */
+	if (skb->len < ihlen + 8) {
+		ICMP_INC_STATS_BH(&init_net, ICMP_MIB_INERRORS);
+		return;
+	}
+
+	/* Temporarily make the skb header offsets describe the embedded
+	 * packet for the lookup, then restore the original values.
+	 */
+	saved_nh = skb->network_header;
+	saved_th = skb->transport_header;
+	skb_reset_network_header(skb);
+	skb_set_transport_header(skb, ihlen);
+	sk = sctp_err_lookup(AF_INET, skb, sctp_hdr(skb), &asoc, &transport);
+	skb->network_header = saved_nh;
+	skb->transport_header = saved_th;
+
+	if (!sk) {
+		ICMP_INC_STATS_BH(&init_net, ICMP_MIB_INERRORS);
+		return;
+	}
+	/* Warning:  The sock lock is held.  Every path below must leave
+	 * through out_unlock so that sctp_err_finish() runs.
+	 */
+
+	switch (type) {
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out_unlock;
+
+		/* PMTU discovery (RFC1191) */
+		if (code == ICMP_FRAG_NEEDED) {
+			sctp_icmp_frag_needed(sk, asoc, transport, info);
+			goto out_unlock;
+		}
+		if (code == ICMP_PROT_UNREACH) {
+			sctp_icmp_proto_unreachable(sk, asoc, transport);
+			goto out_unlock;
+		}
+		err = icmp_err_convert[code].errno;
+		break;
+	case ICMP_TIME_EXCEEDED:
+		/* Ignore any time exceeded errors due to fragment reassembly
+		 * timeouts.
+		 */
+		if (code == ICMP_EXC_FRAGTIME)
+			goto out_unlock;
+
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out_unlock;
+	}
+
+	if (sock_owned_by_user(sk) || !inet_sk(sk)->recverr) {
+		/* Only an error on timeout */
+		sk->sk_err_soft = err;
+	} else {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	}
+
+out_unlock:
+	sctp_err_finish(sk, asoc);
+}
+
+/*
+ * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
+ *
+ * This function scans all the chunks in the OOTB packet to determine if
+ * the packet should be discarded right away.  If a response might be needed
+ * for this packet, or, if further processing is possible, the packet will
+ * be queued to a proper inqueue for the next phase of handling.
+ *
+ * Output:
+ * Return 0 - If further processing is needed.
+ * Return 1 - If the packet can be discarded right away.
+ */
+static int sctp_rcv_ootb(struct sk_buff *skb)
+{
+	sctp_chunkhdr_t *ch;
+	__u8 *ch_end;
+
+	ch = (sctp_chunkhdr_t *) skb->data;
+
+	/* Scan through all the chunks in the packet.  */
+	do {
+		/* Break out if chunk length is less than minimal. */
+		if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
+			break;
+
+		/* Break out if the (padded) chunk runs past the buffer. */
+		ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+		if (ch_end > skb_tail_pointer(skb))
+			break;
+
+		/* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the
+		 * receiver MUST silently discard the OOTB packet and take no
+		 * further action.
+		 */
+		if (SCTP_CID_ABORT == ch->type)
+			goto discard;
+
+		/* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE
+		 * chunk, the receiver should silently discard the packet
+		 * and take no further action.
+		 */
+		if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type)
+			goto discard;
+
+		/* RFC 4460, 2.11.2
+		 * This will discard packets with INIT chunk bundled as
+		 * subsequent chunks in the packet.  When INIT is first,
+		 * the normal INIT processing will discard the chunk.
+		 */
+		if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data)
+			goto discard;
+
+		ch = (sctp_chunkhdr_t *) ch_end;
+
+		/* Only look at the next chunk when a complete chunk header
+		 * fits before the tail.  Testing just "ch_end < tail" would
+		 * let the next iteration read ch->length from beyond the
+		 * data when 1-3 stray bytes trail the last chunk.
+		 */
+	} while (ch_end + sizeof(sctp_chunkhdr_t) <= skb_tail_pointer(skb));
+
+	return 0;
+
+discard:
+	return 1;
+}
+
+/* Insert an endpoint into the endpoint hash table.
+ *
+ * The bucket index is derived from the endpoint's bound port and is
+ * remembered in epb->hashent.  The insertion is done under the bucket's
+ * write lock.
+ */
+static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
+{
+	struct sctp_ep_common *epb = &ep->base;
+	struct sctp_hashbucket *bucket;
+
+	epb->hashent = sctp_ep_hashfn(epb->bind_addr.port);
+	bucket = &sctp_ep_hashtable[epb->hashent];
+
+	sctp_write_lock(&bucket->lock);
+	hlist_add_head(&epb->node, &bucket->chain);
+	sctp_write_unlock(&bucket->lock);
+}
+
+/* Add an endpoint to the hash.  Local BH-safe.
+ *
+ * Calls __sctp_hash_endpoint() between sctp_local_bh_disable() and
+ * sctp_local_bh_enable().
+ */
+void sctp_hash_endpoint(struct sctp_endpoint *ep)
+{
+	sctp_local_bh_disable();
+	__sctp_hash_endpoint(ep);
+	sctp_local_bh_enable();
+}
+
+/* Remove an endpoint from the endpoint hash table.
+ *
+ * Does nothing when the endpoint is not currently hashed.  The node is
+ * removed with hlist_del_init() so that it is reset to the "unhashed"
+ * state afterwards: a plain __hlist_del() leaves the node's pprev
+ * pointer set, which would make the hlist_unhashed() test below report
+ * "still hashed" on a later call and unlink stale pointers a second
+ * time.
+ */
+static void __sctp_unhash_endpoint(struct sctp_endpoint *ep)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+
+	epb = &ep->base;
+
+	if (hlist_unhashed(&epb->node))
+		return;
+
+	epb->hashent = sctp_ep_hashfn(epb->bind_addr.port);
+
+	head = &sctp_ep_hashtable[epb->hashent];
+
+	sctp_write_lock(&head->lock);
+	hlist_del_init(&epb->node);
+	sctp_write_unlock(&head->lock);
+}
+
+/* Remove an endpoint from the hash.  Local BH-safe.
+ *
+ * Calls __sctp_unhash_endpoint() between sctp_local_bh_disable() and
+ * sctp_local_bh_enable().
+ */
+void sctp_unhash_endpoint(struct sctp_endpoint *ep)
+{
+	sctp_local_bh_disable();
+	__sctp_unhash_endpoint(ep);
+	sctp_local_bh_enable();
+}
+
+/* Look up the endpoint that should receive a packet sent to @laddr.
+ *
+ * Walks the hash chain selected by the local port.  When no hashed
+ * endpoint matches, the endpoint of the SCTP control socket is used
+ * instead, so the result is never NULL.  The returned endpoint carries
+ * a reference taken on behalf of the caller.
+ */
+static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(const union sctp_addr *laddr)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+	struct sctp_endpoint *ep;
+	struct hlist_node *node;
+	int bucket;
+	int found = 0;
+
+	bucket = sctp_ep_hashfn(ntohs(laddr->v4.sin_port));
+	head = &sctp_ep_hashtable[bucket];
+
+	read_lock(&head->lock);
+	sctp_for_each_hentry(epb, node, &head->chain) {
+		ep = sctp_ep(epb);
+		if (sctp_endpoint_is_match(ep, laddr)) {
+			found = 1;
+			break;
+		}
+	}
+
+	/* No match: fall back to the control socket's endpoint. */
+	if (!found)
+		ep = sctp_sk((sctp_get_ctl_sock()))->ep;
+
+	sctp_endpoint_hold(ep);
+	read_unlock(&head->lock);
+	return ep;
+}
+
+/* Insert an association into the association hash table.
+ *
+ * The bucket is selected from the local and peer ports and remembered
+ * in epb->hashent.  The insertion is done under the bucket's write lock.
+ */
+static void __sctp_hash_established(struct sctp_association *asoc)
+{
+	struct sctp_ep_common *epb = &asoc->base;
+	struct sctp_hashbucket *bucket;
+
+	/* Calculate which chain this entry will belong to. */
+	epb->hashent = sctp_assoc_hashfn(epb->bind_addr.port, asoc->peer.port);
+	bucket = &sctp_assoc_hashtable[epb->hashent];
+
+	sctp_write_lock(&bucket->lock);
+	hlist_add_head(&epb->node, &bucket->chain);
+	sctp_write_unlock(&bucket->lock);
+}
+
+/* Add an association to the hash.  Local BH-safe.
+ *
+ * Associations flagged as temporary (asoc->temp) are never hashed.
+ */
+void sctp_hash_established(struct sctp_association *asoc)
+{
+	if (!asoc->temp) {
+		sctp_local_bh_disable();
+		__sctp_hash_established(asoc);
+		sctp_local_bh_enable();
+	}
+}
+
+/* Remove an association from the association hash table.
+ *
+ * hlist_del_init() is used instead of __hlist_del() so that the node is
+ * reset to the "unhashed" state once it has been unlinked.  It is also a
+ * no-op on a node that is already unhashed, so unhashing an association
+ * that is not (or no longer) on a chain does not follow stale or NULL
+ * list pointers.
+ */
+static void __sctp_unhash_established(struct sctp_association *asoc)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+
+	epb = &asoc->base;
+
+	/* Recompute the chain this entry was placed on. */
+	epb->hashent = sctp_assoc_hashfn(epb->bind_addr.port,
+					 asoc->peer.port);
+
+	head = &sctp_assoc_hashtable[epb->hashent];
+
+	sctp_write_lock(&head->lock);
+	hlist_del_init(&epb->node);
+	sctp_write_unlock(&head->lock);
+}
+
+/* Remove an association from the hash table.  Local BH-safe.
+ *
+ * Associations flagged as temporary (asoc->temp) are skipped, matching
+ * sctp_hash_established() which never hashes them.
+ */
+void sctp_unhash_established(struct sctp_association *asoc)
+{
+	if (!asoc->temp) {
+		sctp_local_bh_disable();
+		__sctp_unhash_established(asoc);
+		sctp_local_bh_enable();
+	}
+}
+
+/* Look up an association by local and peer address.
+ *
+ * On a hit the association is returned with a reference held for the
+ * caller and *pt is set to the matching transport.  On a miss NULL is
+ * returned and *pt is left untouched.
+ */
+static struct sctp_association *__sctp_lookup_association(
+					const union sctp_addr *local,
+					const union sctp_addr *peer,
+					struct sctp_transport **pt)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+	struct sctp_association *asoc;
+	struct sctp_transport *transport = NULL;
+	struct hlist_node *node;
+	int bucket;
+
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	bucket = sctp_assoc_hashfn(ntohs(local->v4.sin_port),
+				   ntohs(peer->v4.sin_port));
+	head = &sctp_assoc_hashtable[bucket];
+
+	read_lock(&head->lock);
+	sctp_for_each_hentry(epb, node, &head->chain) {
+		asoc = sctp_assoc(epb);
+		transport = sctp_assoc_is_match(asoc, local, peer);
+		if (transport)
+			break;
+	}
+
+	if (transport) {
+		*pt = transport;
+		sctp_association_hold(asoc);
+	} else {
+		asoc = NULL;
+	}
+	read_unlock(&head->lock);
+
+	return asoc;
+}
+
+/* Look up an association.  BH-safe.
+ *
+ * Calls __sctp_lookup_association() between sctp_local_bh_disable() and
+ * sctp_local_bh_enable() and returns its result unchanged: an
+ * association with a reference held (and *transportp set), or NULL.
+ */
+SCTP_STATIC
+struct sctp_association *sctp_lookup_association(const union sctp_addr *laddr,
+						 const union sctp_addr *paddr,
+					    struct sctp_transport **transportp)
+{
+	struct sctp_association *asoc;
+
+	sctp_local_bh_disable();
+	asoc = __sctp_lookup_association(laddr, paddr, transportp);
+	sctp_local_bh_enable();
+
+	return asoc;
+}
+
+/* Is there an association matching the given local and peer addresses?
+ *
+ * Returns 1 when one exists and 0 otherwise.  The reference obtained by
+ * the lookup is released again before returning.
+ */
+int sctp_has_association(const union sctp_addr *laddr,
+			 const union sctp_addr *paddr)
+{
+	struct sctp_transport *transport;
+	struct sctp_association *asoc;
+
+	asoc = sctp_lookup_association(laddr, paddr, &transport);
+	if (!asoc)
+		return 0;
+
+	sctp_association_put(asoc);
+	return 1;
+}
+
+/*
+ * SCTP Implementors Guide, 2.18 Handling of address
+ * parameters within the INIT or INIT-ACK.
+ *
+ * D) When searching for a matching TCB upon reception of an INIT
+ *    or INIT-ACK chunk the receiver SHOULD use not only the
+ *    source address of the packet (containing the INIT or
+ *    INIT-ACK) but the receiver SHOULD also use all valid
+ *    address parameters contained within the chunk.
+ *
+ * 2.18.3 Solution description
+ *
+ * This new text clearly specifies to an implementor the need
+ * to look within the INIT or INIT-ACK. Any implementation that
+ * does not do this, may not be able to establish associations
+ * in certain circumstances.
+ *
+ */
+static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb,
+	const union sctp_addr *laddr, struct sctp_transport **transportp)
+{
+	struct sctp_association *asoc;
+	union sctp_addr addr;
+	union sctp_addr *paddr = &addr;
+	struct sctphdr *sh = sctp_hdr(skb);
+	union sctp_params params;
+	sctp_init_chunk_t *init;
+	struct sctp_af *af;
+
+	/*
+	 * This code will NOT touch anything inside the chunk--it is
+	 * strictly READ-ONLY.
+	 *
+	 * RFC 2960 3  SCTP packet Format
+	 *
+	 * Multiple chunks can be bundled into one SCTP packet up to
+	 * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN
+	 * COMPLETE chunks.  These chunks MUST NOT be bundled with any
+	 * other chunk in a packet.  See Section 6.10 for more details
+	 * on chunk bundling.
+	 */
+
+	/* Find the start of the TLVs and the end of the chunk.  This is
+	 * the region we search for address parameters.
+	 */
+	init = (sctp_init_chunk_t *)skb->data;
+
+	/* Walk the parameters looking for embedded addresses. */
+	sctp_walk_params(params, init, init_hdr.params) {
+
+		/* Note: Ignoring hostname addresses. */
+		af = sctp_get_af_specific(param_type2af(params.p->type));
+		if (!af)
+			continue;
+
+		af->from_addr_param(paddr, params.addr, sh->source, 0);
+
+		/* Report the matching transport through the caller's
+		 * pointer.  Storing it in a local instead would discard
+		 * it and leave *transportp unset even though an
+		 * association is returned.
+		 */
+		asoc = __sctp_lookup_association(laddr, paddr, transportp);
+		if (asoc)
+			return asoc;
+	}
+
+	return NULL;
+}
+
+/* ADD-IP, Section 5.2
+ * When an endpoint receives an ASCONF Chunk from the remote peer
+ * special procedures may be needed to identify the association the
+ * ASCONF Chunk is associated with. To properly find the association
+ * the following procedures SHOULD be followed:
+ *
+ * D2) If the association is not found, use the address found in the
+ * Address Parameter TLV combined with the port number found in the
+ * SCTP common header. If found proceed to rule D4.
+ *
+ * D2-ext) If more than one ASCONF Chunks are packed together, use the
+ * address found in the ASCONF Address Parameter TLV of each of the
+ * subsequent ASCONF Chunks. If found, proceed to rule D4.
+ */
+static struct sctp_association *__sctp_rcv_asconf_lookup(
+					sctp_chunkhdr_t *ch,
+					const union sctp_addr *laddr,
+					__be16 peer_port,
+					struct sctp_transport **transportp)
+{
+	sctp_addip_chunk_t *asconf = (struct sctp_addip_chunk *)ch;
+	struct sctp_af *af;
+	union sctp_addr_param *param;
+	union sctp_addr paddr;
+
+	/* The chunk has to be long enough to carry the ADDIP header and
+	 * at least the header of the Address Parameter behind it.
+	 * Without this check a short ASCONF chunk would make the
+	 * param->p.type read below run past the end of the chunk.
+	 * NOTE(review): the length of the address payload itself is not
+	 * validated here - confirm whether from_addr_param() needs it.
+	 */
+	if (ntohs(ch->length) < sizeof(*asconf) + sizeof(param->p))
+		return NULL;
+
+	/* Skip over the ADDIP header and find the Address parameter */
+	param = (union sctp_addr_param *)(asconf + 1);
+
+	af = sctp_get_af_specific(param_type2af(param->p.type));
+	if (unlikely(!af))
+		return NULL;
+
+	/* Combine the embedded address with the port taken from the
+	 * SCTP common header (rule D2 above).
+	 */
+	af->from_addr_param(&paddr, param, peer_port, 0);
+
+	return __sctp_lookup_association(laddr, &paddr, transportp);
+}
+
+
+/* SCTP-AUTH, Section 6.3:
+*    If the receiver does not find a STCB for a packet containing an AUTH
+*    chunk as the first chunk and not a COOKIE-ECHO chunk as the second
+*    chunk, it MUST use the chunks after the AUTH chunk to look up an existing
+*    association.
+*
+* This means that any chunks that can help us identify the association need
+* to be looked at to find this association.
+*/
+static struct sctp_association *__sctp_rcv_walk_lookup(struct sk_buff *skb,
+				      const union sctp_addr *laddr,
+				      struct sctp_transport **transportp)
+{
+	struct sctp_association *asoc = NULL;
+	sctp_chunkhdr_t *ch;
+	int have_auth = 0;	/* position of the last AUTH chunk, 0 if none */
+	unsigned int chunk_num = 1;
+	__u8 *ch_end;
+
+	/* Walk through the chunks looking for AUTH or ASCONF chunks
+	 * to help us find the association.
+	 */
+	ch = (sctp_chunkhdr_t *) skb->data;
+	do {
+		/* Break out if chunk length is less than minimal. */
+		if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
+			break;
+
+		/* Break out if the (padded) chunk runs past the buffer. */
+		ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+		if (ch_end > skb_tail_pointer(skb))
+			break;
+
+		switch (ch->type) {
+		case SCTP_CID_AUTH:
+			have_auth = chunk_num;
+			break;
+
+		case SCTP_CID_COOKIE_ECHO:
+			/* If a packet arrives containing an AUTH chunk as
+			 * a first chunk, a COOKIE-ECHO chunk as the second
+			 * chunk, and possibly more chunks after them, and
+			 * the receiver does not have an STCB for that
+			 * packet, then authentication is based on
+			 * the contents of the COOKIE- ECHO chunk.
+			 */
+			if (have_auth == 1 && chunk_num == 2)
+				return NULL;
+			break;
+
+		case SCTP_CID_ASCONF:
+			if (have_auth || sctp_addip_noauth)
+				asoc = __sctp_rcv_asconf_lookup(ch, laddr,
+							sctp_hdr(skb)->source,
+							transportp);
+			break;
+
+		default:
+			break;
+		}
+
+		if (asoc)
+			break;
+
+		ch = (sctp_chunkhdr_t *) ch_end;
+		chunk_num++;
+
+		/* Only look at the next chunk when a complete chunk header
+		 * fits before the tail.  Testing just "ch_end < tail" would
+		 * let the next iteration read ch->length from beyond the
+		 * data when 1-3 stray bytes trail the last chunk.
+		 */
+	} while (ch_end + sizeof(sctp_chunkhdr_t) <= skb_tail_pointer(skb));
+
+	return asoc;
+}
+
+/*
+ * There are circumstances when we need to look inside the SCTP packet
+ * for information to help us find the association.   Examples
+ * include looking inside of INIT/INIT-ACK chunks or after the AUTH
+ * chunks.
+ */
+static struct sctp_association *__sctp_rcv_lookup_harder(struct sk_buff *skb,
+				      const union sctp_addr *laddr,
+				      struct sctp_transport **transportp)
+{
+	sctp_chunkhdr_t *first = (sctp_chunkhdr_t *) skb->data;
+
+	/* The helpers below walk the chunk and extract parameter
+	 * information.  Refuse to do so when the padded length claimed by
+	 * the first chunk exceeds what the skb holds, otherwise they would
+	 * walk off the end.
+	 */
+	if (WORD_ROUND(ntohs(first->length)) > skb->len)
+		return NULL;
+
+	/* If this is INIT/INIT-ACK look inside the chunk too. */
+	if (first->type == SCTP_CID_INIT || first->type == SCTP_CID_INIT_ACK)
+		return __sctp_rcv_init_lookup(skb, laddr, transportp);
+
+	/* Anything else: scan the bundled chunks. */
+	return __sctp_rcv_walk_lookup(skb, laddr, transportp);
+}
+
+/* Look up the association for an inbound skb.
+ *
+ * First tries a direct match on the packet's local and peer addresses.
+ * If that fails, the packet contents are inspected for more addresses.
+ * Returns the association or NULL.
+ */
+static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb,
+				      const union sctp_addr *paddr,
+				      const union sctp_addr *laddr,
+				      struct sctp_transport **transportp)
+{
+	struct sctp_association *asoc;
+
+	asoc = __sctp_lookup_association(laddr, paddr, transportp);
+	if (asoc)
+		return asoc;
+
+	/* Further lookup for INIT/INIT-ACK packets.
+	 * SCTP Implementors Guide, 2.18 Handling of address
+	 * parameters within the INIT or INIT-ACK.
+	 */
+	return __sctp_rcv_lookup_harder(skb, laddr, transportp);
+}
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
new file mode 100644
index 00000000..397296fb
--- /dev/null
+++ b/net/sctp/inqueue.c
@@ -0,0 +1,246 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2002 International Business Machines, Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions are the methods for accessing the SCTP inqueue.
+ *
+ * An SCTP inqueue is a queue into which you push SCTP packets
+ * (which might be bundles or fragments of chunks) and out of which you
+ * pop SCTP whole chunks.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson <karl@athena.chicago.il.us>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+/* Initialize an SCTP inqueue.
+ *
+ * Leaves the queue empty, with no packet in progress, no handler
+ * installed and marked as not separately allocated.
+ */
+void sctp_inq_init(struct sctp_inq *queue)
+{
+	queue->malloced = 0;
+	queue->in_progress = NULL;
+	INIT_LIST_HEAD(&queue->in_chunk_list);
+
+	/* The work item delivers data; its handler is installed later
+	 * through sctp_inq_set_th_handler().
+	 */
+	INIT_WORK(&queue->immediate, NULL);
+}
+
+/* Release the memory associated with an SCTP inqueue.
+ *
+ * Frees every chunk still queued and the packet currently being worked
+ * on, if any.  The queue structure itself is freed only when it is
+ * flagged as malloced.
+ */
+void sctp_inq_free(struct sctp_inq *queue)
+{
+	struct sctp_chunk *chunk;
+
+	/* Empty the queue, always taking the chunk at the head. */
+	while (!list_empty(&queue->in_chunk_list)) {
+		chunk = list_entry(queue->in_chunk_list.next,
+				   struct sctp_chunk, list);
+		list_del_init(&chunk->list);
+		sctp_chunk_free(chunk);
+	}
+
+	/* If there is a packet which is currently being worked on,
+	 * free it as well.
+	 */
+	chunk = queue->in_progress;
+	if (chunk) {
+		sctp_chunk_free(chunk);
+		queue->in_progress = NULL;
+	}
+
+	/* Dump the master memory segment.  */
+	if (queue->malloced)
+		kfree(queue);
+}
+
+/* Put a new packet in an SCTP inqueue and run the queue's handler.
+ *
+ * A chunk whose receiver is already dead is freed instead of queued.
+ * NOTE(review): the original comment states that packet->sctp_hdr is
+ * assumed to be set and in host byte order; nothing here checks that.
+ */
+void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk)
+{
+	if (!chunk->rcvr->dead) {
+		/* We are now calling this either from the soft interrupt
+		 * or from the backlog processing.
+		 * Eventually, we should clean up inqueue to not rely
+		 * on the BH related data structures.
+		 */
+		list_add_tail(&chunk->list, &q->in_chunk_list);
+
+		/* Directly call the packet handling routine. */
+		q->immediate.func(&q->immediate);
+		return;
+	}
+
+	/* Receiver is gone: nobody will consume this chunk. */
+	sctp_chunk_free(chunk);
+}
+
+/* Peek at the next chunk on the inqueue.
+ *
+ * Returns a pointer to the header that follows the chunk currently in
+ * progress, or NULL when that chunk is the last one to process in its
+ * packet.  Reads queue->in_progress without checking it for NULL.
+ */
+struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue)
+{
+	struct sctp_chunk *cur = queue->in_progress;
+
+	/* If there are no more chunks in this packet, say so */
+	if (cur->singleton || cur->end_of_packet || cur->pdiscard)
+		return NULL;
+
+	return (sctp_chunkhdr_t *)cur->chunk_end;
+}
+
+
+/* Extract a chunk from an SCTP inqueue.
+ *
+ * Continues with the packet in progress when it still has chunks left,
+ * otherwise takes the next packet off the queue.  On return the chunk's
+ * chunk_hdr points at the chunk header, chunk_end at the end of the
+ * (padded) chunk, and the skb data pointer has been advanced past the
+ * chunk header.
+ *
+ * Returns NULL when there is nothing left to process, or when the
+ * chunk extends past the end of the buffer (the whole packet is then
+ * freed).
+ *
+ * WARNING:  If you need to put the chunk on another queue, you need to
+ * make a shallow copy (clone) of it.
+ */
+struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
+{
+	struct sctp_chunk *chunk;
+	sctp_chunkhdr_t *ch = NULL;
+
+	/* The assumption is that we are safe to process the chunks
+	 * at this time.
+	 */
+
+	if ((chunk = queue->in_progress)) {
+		/* There is a packet that we have been working on.
+		 * Any post processing work to do before we move on?
+		 */
+		if (chunk->singleton ||
+		    chunk->end_of_packet ||
+		    chunk->pdiscard) {
+			/* The packet is finished: release it. */
+			sctp_chunk_free(chunk);
+			chunk = queue->in_progress = NULL;
+		} else {
+			/* Nothing to do. Next chunk in the packet, please. */
+			ch = (sctp_chunkhdr_t *) chunk->chunk_end;
+
+			/* Force chunk->skb->data to chunk->chunk_end.  */
+			skb_pull(chunk->skb,
+				 chunk->chunk_end - chunk->skb->data);
+
+			/* Verify that we have at least chunk headers
+			 * worth of buffer left.
+			 */
+			if (skb_headlen(chunk->skb) < sizeof(sctp_chunkhdr_t)) {
+				sctp_chunk_free(chunk);
+				chunk = queue->in_progress = NULL;
+			}
+		}
+	}
+
+	/* Do we need to take the next packet out of the queue to process? */
+	if (!chunk) {
+		struct list_head *entry;
+
+		/* Is the queue empty?  */
+		if (list_empty(&queue->in_chunk_list))
+			return NULL;
+
+		/* Detach the head of the queue and make it the packet in
+		 * progress.
+		 */
+		entry = queue->in_chunk_list.next;
+		chunk = queue->in_progress =
+			list_entry(entry, struct sctp_chunk, list);
+		list_del_init(entry);
+
+		/* This is the first chunk in the packet.  It is assumed to
+		 * be a singleton until the size checks below show otherwise.
+		 */
+		chunk->singleton = 1;
+		ch = (sctp_chunkhdr_t *) chunk->skb->data;
+		chunk->data_accepted = 0;
+	}
+
+	/* Record where this chunk starts and where its padded end lies. */
+	chunk->chunk_hdr = ch;
+	chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+	/* In the unlikely case of an IP reassembly, the skb could be
+	 * non-linear. If so, update chunk_end so that it doesn't go past
+	 * the skb->tail.
+	 */
+	if (unlikely(skb_is_nonlinear(chunk->skb))) {
+		if (chunk->chunk_end > skb_tail_pointer(chunk->skb))
+			chunk->chunk_end = skb_tail_pointer(chunk->skb);
+	}
+	/* Step over the chunk header. */
+	skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t));
+	chunk->subh.v = NULL; /* Subheader is no longer valid.  */
+
+	if (chunk->chunk_end < skb_tail_pointer(chunk->skb)) {
+		/* This is not a singleton */
+		chunk->singleton = 0;
+	} else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) {
+		/* RFC 2960, Section 6.10  Bundling
+		 *
+		 * Partial chunks MUST NOT be placed in an SCTP packet.
+		 * If the receiver detects a partial chunk, it MUST drop
+		 * the chunk.
+		 *
+		 * Since the end of the chunk is past the end of our buffer
+		 * (which contains the whole packet), we can freely discard
+		 * the whole packet.
+		 */
+		sctp_chunk_free(chunk);
+		chunk = queue->in_progress = NULL;
+
+		return NULL;
+	} else {
+		/* We are at the end of the packet, so mark the chunk
+		 * in case we need to send a SACK.
+		 */
+		chunk->end_of_packet = 1;
+	}
+
+	SCTP_DEBUG_PRINTK("+++sctp_inq_pop+++ chunk %p[%s],"
+			  " length %d, skb->len %d\n",chunk,
+			  sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
+			  ntohs(chunk->chunk_hdr->length), chunk->skb->len);
+	return chunk;
+}
+
+/* Set a top-half handler.
+ *
+ * Originally, the top-half handler was scheduled as a BH.  We now
+ * call the handler directly in sctp_inq_push() at a time that
+ * we know we are lock safe.
+ * The intent is that this routine will pull stuff out of the
+ * inqueue and process it.
+ *
+ * @callback is installed as the function of the queue's "immediate"
+ * work item, which sctp_inq_push() invokes through q->immediate.func.
+ */
+void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback)
+{
+	INIT_WORK(&q->immediate, callback);
+}
+
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
new file mode 100644
index 00000000..0bb0d7cb
--- /dev/null
+++ b/net/sctp/ipv6.c
@@ -0,0 +1,1079 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2002, 2004
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ * Copyright (c) 2002-2003 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * SCTP over IPv6.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *		   ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Le Yanqun		    <yanqun.le@nokia.com>
+ *    Hui Huang		    <hui.huang@nokia.com>
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *    Jon Grimm		    <jgrimm@us.ibm.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *
+ * Based on:
+ *	linux/net/ipv6/tcp_ipv6.c
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <linux/ipsec.h>
+#include <linux/slab.h>
+
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include <net/protocol.h>
+#include <net/ndisc.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/transp_v6.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/inet_common.h>
+#include <net/inet_ecn.h>
+#include <net/sctp/sctp.h>
+
+#include <asm/uaccess.h>
+
+/* Prototypes for helpers that are used before they are defined below. */
+static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
+			    const union sctp_addr *addr2);
+static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
+			    __be16 port);
+static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
+					 union sctp_addr *s2);
+
+/* Notifier callback for inet6 address addition/deletion events.
+ *
+ * NETDEV_UP appends a copy of the new address to sctp_local_addr_list;
+ * NETDEV_DOWN unlinks the first AF_INET6 entry equal to the removed
+ * address.  The list needs to be protected by a spin lock on the writer
+ * side since multiple notifiers (say IPv4 and IPv6) may be running at the
+ * same time and would otherwise corrupt it.  The reader side is protected
+ * with RCU, which is why an unlinked entry is released via kfree_rcu().
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
+				void *ptr)
+{
+	struct inet6_ifaddr *ifa = ptr;
+	struct sctp_sockaddr_entry *entry, *next;
+	struct sctp_sockaddr_entry *victim = NULL;
+
+	if (ev == NETDEV_UP) {
+		entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+		if (!entry)
+			return NOTIFY_DONE;
+
+		entry->a.v6.sin6_family = AF_INET6;
+		entry->a.v6.sin6_port = 0;
+		ipv6_addr_copy(&entry->a.v6.sin6_addr, &ifa->addr);
+		entry->a.v6.sin6_scope_id = ifa->idev->dev->ifindex;
+		entry->valid = 1;
+
+		spin_lock_bh(&sctp_local_addr_lock);
+		list_add_tail_rcu(&entry->list, &sctp_local_addr_list);
+		spin_unlock_bh(&sctp_local_addr_lock);
+
+		return NOTIFY_DONE;
+	}
+
+	if (ev != NETDEV_DOWN)
+		return NOTIFY_DONE;
+
+	spin_lock_bh(&sctp_local_addr_lock);
+	list_for_each_entry_safe(entry, next, &sctp_local_addr_list, list) {
+		if (entry->a.sa.sa_family != AF_INET6)
+			continue;
+		if (!ipv6_addr_equal(&entry->a.v6.sin6_addr, &ifa->addr))
+			continue;
+
+		/* Only the first matching entry is removed. */
+		entry->valid = 0;
+		list_del_rcu(&entry->list);
+		victim = entry;
+		break;
+	}
+	spin_unlock_bh(&sctp_local_addr_lock);
+
+	if (victim)
+		kfree_rcu(victim, rcu);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block sctp_inet6addr_notifier = {
+	.notifier_call = sctp_inet6addr_event,	/* inet6 address add/delete events */
+};
+
+/* ICMP error handler.
+ *
+ * Looks up the socket, association and transport that the SCTP header
+ * found at 'offset' in the embedded packet belongs to and reports the
+ * error to them.  'type', 'code' and 'info' describe the ICMPv6 error;
+ * 'info' is in network byte order.
+ */
+SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			     u8 type, u8 code, int offset, __be32 info)
+{
+	struct inet6_dev *idev;
+	struct sock *sk;
+	struct sctp_association *asoc;
+	struct sctp_transport *transport;
+	struct ipv6_pinfo *np;
+	sk_buff_data_t saveip, savesctp;
+	int err;
+
+	idev = in6_dev_get(skb->dev);
+
+	/* Fix up skb to look at the embedded net header: remember the
+	 * current header offsets, then point the network header at the
+	 * current data and the transport header 'offset' bytes into it.
+	 */
+	saveip	 = skb->network_header;
+	savesctp = skb->transport_header;
+	skb_reset_network_header(skb);
+	skb_set_transport_header(skb, offset);
+	sk = sctp_err_lookup(AF_INET6, skb, sctp_hdr(skb), &asoc, &transport);
+	/* Put back the original pointers. */
+	skb->network_header   = saveip;
+	skb->transport_header = savesctp;
+	if (!sk) {
+		ICMP6_INC_STATS_BH(dev_net(skb->dev), idev, ICMP6_MIB_INERRORS);
+		goto out;
+	}
+
+	/* Warning:  The sock lock is held.  Remember to call
+	 * sctp_err_finish!
+	 */
+
+	switch (type) {
+	case ICMPV6_PKT_TOOBIG:
+		sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info));
+		goto out_unlock;
+	case ICMPV6_PARAMPROB:
+		if (ICMPV6_UNK_NEXTHDR == code) {
+			sctp_icmp_proto_unreachable(sk, asoc, transport);
+			goto out_unlock;
+		}
+		break;
+	default:
+		break;
+	}
+
+	np = inet6_sk(sk);
+	icmpv6_err_convert(type, code, &err);
+	if (!sock_owned_by_user(sk) && np->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else {  /* Only an error on timeout */
+		sk->sk_err_soft = err;
+	}
+
+out_unlock:
+	sctp_err_finish(sk, asoc);
+out:
+	if (likely(idev != NULL))
+		in6_dev_put(idev);	/* drop the reference taken by in6_dev_get() */
+}
+
+/* Transmit 'skb' over IPv6 for the given transport.
+ * Based on tcp_v6_xmit() in tcp_ipv6.c.
+ *
+ * Builds the flow from the transport's source and peer addresses and the
+ * socket's IPv6 settings, then hands the packet to ip6_xmit() and returns
+ * its result.
+ */
+static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
+{
+	struct sock *sk = skb->sk;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *src = &transport->saddr.v6;
+	struct sockaddr_in6 *peer = &transport->ipaddr.v6;
+	struct flowi6 fl6;
+
+	memset(&fl6, 0, sizeof(fl6));
+	fl6.flowi6_proto = sk->sk_protocol;
+	ipv6_addr_copy(&fl6.daddr, &peer->sin6_addr);
+	ipv6_addr_copy(&fl6.saddr, &src->sin6_addr);
+
+	fl6.flowlabel = np->flow_label;
+	IP6_ECN_flow_xmit(sk, fl6.flowlabel);
+
+	/* A link-local source pins the outgoing interface to its scope id;
+	 * anything else follows the socket's bound device.
+	 */
+	if (!(ipv6_addr_type(&fl6.saddr) & IPV6_ADDR_LINKLOCAL))
+		fl6.flowi6_oif = sk->sk_bound_dev_if;
+	else
+		fl6.flowi6_oif = src->sin6_scope_id;
+
+	/* With a source route option, the flow's destination becomes the
+	 * first address listed in the routing header.
+	 */
+	if (np->opt && np->opt->srcrt) {
+		struct rt0_hdr *route = (struct rt0_hdr *) np->opt->srcrt;
+
+		ipv6_addr_copy(&fl6.daddr, route->addr);
+	}
+
+	SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n",
+			  __func__, skb, skb->len,
+			  &fl6.saddr, &fl6.daddr);
+
+	SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS);
+
+	/* Without PMTU discovery on this transport, mark the skb local_df. */
+	if (!(transport->param_flags & SPP_PMTUD_ENABLE))
+		skb->local_df = 1;
+
+	return ip6_xmit(sk, skb, &fl6, np->opt);
+}
+
+/* Look up a route to the transport's peer address and cache it in t->dst.
+ *
+ * @t:     transport whose ipaddr is the destination; t->dst receives the
+ *         resulting dst entry, or NULL when no usable route was found
+ * @saddr: optional source address to force; when NULL a source is chosen
+ * @fl:    flow description; its ip6 member is cleared and refilled here
+ * @sk:    socket handed to the route lookup
+ *
+ * When no source is forced and the transport belongs to an association,
+ * the source picked by the route lookup is accepted only if it is one of
+ * the association's usable bind addresses.  Otherwise the usable AF_INET6
+ * bind address of sufficient scope that shares the most leading bits with
+ * the destination is selected and the lookup is repeated with it.
+ */
+static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+			    struct flowi *fl, struct sock *sk)
+{
+	struct sctp_association *asoc = t->asoc;
+	struct dst_entry *dst = NULL;
+	struct flowi6 *fl6 = &fl->u.ip6;
+	struct sctp_bind_addr *bp;
+	struct sctp_sockaddr_entry *laddr;
+	union sctp_addr *baddr = NULL;
+	union sctp_addr *daddr = &t->ipaddr;
+	union sctp_addr dst_saddr;
+	__u8 matchlen = 0;
+	__u8 bmatchlen;
+	sctp_scope_t scope;
+
+	memset(fl6, 0, sizeof(struct flowi6));
+	ipv6_addr_copy(&fl6->daddr, &daddr->v6.sin6_addr);
+	fl6->fl6_dport = daddr->v6.sin6_port;
+	fl6->flowi6_proto = IPPROTO_SCTP;
+	if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		fl6->flowi6_oif = daddr->v6.sin6_scope_id;
+
+	SCTP_DEBUG_PRINTK("%s: DST=%pI6 ", __func__, &fl6->daddr);
+
+	if (asoc)
+		fl6->fl6_sport = htons(asoc->base.bind_addr.port);
+
+	if (saddr) {
+		ipv6_addr_copy(&fl6->saddr, &saddr->v6.sin6_addr);
+		fl6->fl6_sport = saddr->v6.sin6_port;
+		SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl6->saddr);
+	}
+
+	dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
+	if (!asoc || saddr)
+		goto out;
+
+	bp = &asoc->base.bind_addr;
+	scope = sctp_scope(daddr);
+	/* ip6_dst_lookup has filled in the fl6->saddr for us.  Check
+	 * to see if we can use it.
+	 */
+	if (!IS_ERR(dst)) {
+		/* Walk through the bind address list and look for a bind
+		 * address that matches the source address of the returned dst.
+		 */
+		sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port));
+		rcu_read_lock();
+		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
+				continue;
+
+			/* Do not compare against v4 addrs */
+			if ((laddr->a.sa.sa_family == AF_INET6) &&
+			    (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) {
+				rcu_read_unlock();
+				goto out;
+			}
+		}
+		rcu_read_unlock();
+		/* None of the bound addresses match the source address of the
+		 * dst. So release it.
+		 */
+		dst_release(dst);
+		dst = NULL;
+	}
+
+	/* Walk through the bind address list and try to get the
+	 * best source address for a given destination.
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+		/* Skip entries that are invalid or not usable as a source,
+		 * using the same filter as the matching loop above.
+		 */
+		if (!laddr->valid || laddr->state != SCTP_ADDR_SRC)
+			continue;
+		if ((laddr->a.sa.sa_family == AF_INET6) &&
+		    (scope <= sctp_scope(&laddr->a))) {
+			bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+			if (!baddr || (matchlen < bmatchlen)) {
+				baddr = &laddr->a;
+				matchlen = bmatchlen;
+			}
+		}
+	}
+	if (baddr) {
+		/* baddr points into an RCU-protected list entry, so copy
+		 * what we need before leaving the read-side section.
+		 */
+		ipv6_addr_copy(&fl6->saddr, &baddr->v6.sin6_addr);
+		fl6->fl6_sport = baddr->v6.sin6_port;
+	}
+	rcu_read_unlock();
+	/* From here on baddr is only tested, never dereferenced. */
+	if (baddr)
+		dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
+
+out:
+	/* dst is NULL when the first route was released and no bind address
+	 * qualified, so NULL must take the "no route" branch as well.
+	 */
+	if (!IS_ERR_OR_NULL(dst)) {
+		struct rt6_info *rt;
+		rt = (struct rt6_info *)dst;
+		t->dst = dst;
+		SCTP_DEBUG_PRINTK("rt6_dst:%pI6 rt6_src:%pI6\n",
+			&rt->rt6i_dst.addr, &fl6->saddr);
+	} else {
+		t->dst = NULL;
+		SCTP_DEBUG_PRINTK("NO ROUTE\n");
+	}
+}
+
+/* Returns the number of consecutive initial bits that match in the 2 ipv6
+ * addresses, as computed by ipv6_addr_diff() on the sin6_addr members of
+ * s1 and s2.
+ */
+static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
+					 union sctp_addr *s2)
+{
+	return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr);
+}
+
+/* Record the flow's source address in the transport.
+ *
+ * When the transport has a cached route (t->dst), t->saddr is set to an
+ * AF_INET6 address copied from the ip6 flow's saddr.  Without a route
+ * nothing is written.  The 'sk' argument is not used here.
+ */
+static void sctp_v6_get_saddr(struct sctp_sock *sk,
+			      struct sctp_transport *t,
+			      struct flowi *fl)
+{
+	SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst);
+
+	if (!t->dst)
+		return;
+
+	t->saddr.v6.sin6_family = AF_INET6;
+	ipv6_addr_copy(&t->saddr.v6.sin6_addr, &fl->u.ip6.saddr);
+}
+
+/* Make a copy of all potential local addresses.
+ *
+ * Appends one sctp_sockaddr_entry to 'addrlist' for every IPv6 address
+ * configured on 'dev'.  Addresses for which the atomic allocation fails
+ * are silently left out.  Does nothing if the device has no inet6_dev.
+ */
+static void sctp_v6_copy_addrlist(struct list_head *addrlist,
+				  struct net_device *dev)
+{
+	struct inet6_dev *idev;
+	struct inet6_ifaddr *ifa;
+	struct sctp_sockaddr_entry *entry;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		goto unlock;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		entry = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC);
+		if (!entry)
+			continue;
+
+		/* Add the address to the local list.  */
+		entry->a.v6.sin6_family = AF_INET6;
+		entry->a.v6.sin6_port = 0;
+		ipv6_addr_copy(&entry->a.v6.sin6_addr, &ifa->addr);
+		entry->a.v6.sin6_scope_id = dev->ifindex;
+		entry->valid = 1;
+		INIT_LIST_HEAD(&entry->list);
+		list_add_tail(&entry->list, addrlist);
+	}
+	read_unlock_bh(&idev->lock);
+
+unlock:
+	rcu_read_unlock();
+}
+
+/* Initialize a sctp_addr from an incoming skb.
+ *
+ * With 'is_saddr' set, the packet's source address and SCTP source port
+ * are used; otherwise its destination address and destination port.
+ * The scope id is taken from the incoming interface stored in skb->cb.
+ */
+static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb,
+			     int is_saddr)
+{
+	struct sockaddr_in6 *sin6 = &addr->v6;
+	const struct in6_addr *from;
+	struct sctphdr *sh;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_flowinfo = 0; /* FIXME */
+	sin6->sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
+
+	sh = sctp_hdr(skb);
+	if (is_saddr) {
+		sin6->sin6_port = sh->source;
+		from = &ipv6_hdr(skb)->saddr;
+	} else {
+		sin6->sin6_port = sh->dest;
+		from = &ipv6_hdr(skb)->daddr;
+	}
+	ipv6_addr_copy(&sin6->sin6_addr, from);
+}
+
+/* Initialize an sctp_addr from a socket: AF_INET6, port 0 and the
+ * socket's IPv6 rcv_saddr.
+ */
+static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk)
+{
+	struct sockaddr_in6 *sin6 = &addr->v6;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = 0;
+	ipv6_addr_copy(&sin6->sin6_addr, &inet6_sk(sk)->rcv_saddr);
+}
+
+/* Initialize the socket's IPv6 rcv_saddr from a sctp_addr.
+ *
+ * An AF_INET address on a v4mapped socket is stored in v4-mapped form
+ * (::ffff:a.b.c.d); in every other case the v6 address is copied as is.
+ */
+static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
+{
+	struct in6_addr *rcv;
+
+	if (addr->sa.sa_family != AF_INET || !sctp_sk(sk)->v4mapped) {
+		ipv6_addr_copy(&inet6_sk(sk)->rcv_saddr, &addr->v6.sin6_addr);
+		return;
+	}
+
+	rcv = &inet6_sk(sk)->rcv_saddr;
+	rcv->s6_addr32[0] = 0;
+	rcv->s6_addr32[1] = 0;
+	rcv->s6_addr32[2] = htonl(0x0000ffff);
+	rcv->s6_addr32[3] = addr->v4.sin_addr.s_addr;
+}
+
+/* Initialize the socket's IPv6 daddr from a sctp_addr.
+ *
+ * An AF_INET address on a v4mapped socket is stored in v4-mapped form
+ * (::ffff:a.b.c.d); in every other case the v6 address is copied as is.
+ */
+static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
+{
+	struct in6_addr *peer;
+
+	if (addr->sa.sa_family != AF_INET || !sctp_sk(sk)->v4mapped) {
+		ipv6_addr_copy(&inet6_sk(sk)->daddr, &addr->v6.sin6_addr);
+		return;
+	}
+
+	peer = &inet6_sk(sk)->daddr;
+	peer->s6_addr32[0] = 0;
+	peer->s6_addr32[1] = 0;
+	peer->s6_addr32[2] = htonl(0x0000ffff);
+	peer->s6_addr32[3] = addr->v4.sin_addr.s_addr;
+}
+
+/* Initialize a sctp_addr from an IPv6 address parameter.
+ *
+ * The address comes from 'param'; 'port' and 'iif' supply the port and
+ * scope id.  The flow info is always cleared.
+ */
+static void sctp_v6_from_addr_param(union sctp_addr *addr,
+				    union sctp_addr_param *param,
+				    __be16 port, int iif)
+{
+	struct sockaddr_in6 *sin6 = &addr->v6;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = port;
+	sin6->sin6_flowinfo = 0; /* BUG */
+	ipv6_addr_copy(&sin6->sin6_addr, &param->v6.addr);
+	sin6->sin6_scope_id = iif;
+}
+
+/* Build an IPv6 address parameter from a sctp_addr.
+ *
+ * Fills in the parameter header (type and network-order length) and the
+ * address itself, and returns the parameter length in bytes.
+ */
+static int sctp_v6_to_addr_param(const union sctp_addr *addr,
+				 union sctp_addr_param *param)
+{
+	const int len = sizeof(sctp_ipv6addr_param_t);
+
+	param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS;
+	param->v6.param_hdr.length = htons(len);
+	ipv6_addr_copy(&param->v6.addr, &addr->v6.sin6_addr);
+
+	return len;
+}
+
+/* Initialize a sctp_addr from struct in6_addr.
+ *
+ * Sets the family to AF_INET6 and stores 'port' and a copy of 'saddr'.
+ * No other field of the address is written.
+ */
+static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
+			      __be16 port)
+{
+	addr->sa.sa_family = AF_INET6;
+	addr->v6.sin6_port = port;
+	ipv6_addr_copy(&addr->v6.sin6_addr, saddr);
+}
+
+/* Does the AF_INET address 'v4' equal the v4-mapped AF_INET6 address 'v6'?
+ * Both the port and the embedded IPv4 address have to match.
+ */
+static int sctp_v6_match_v4mapped(const union sctp_addr *v4,
+				  const union sctp_addr *v6)
+{
+	if (v4->sa.sa_family != AF_INET || v6->sa.sa_family != AF_INET6)
+		return 0;
+	if (!ipv6_addr_v4mapped(&v6->v6.sin6_addr))
+		return 0;
+
+	return v6->v6.sin6_port == v4->v4.sin_port &&
+	       v6->v6.sin6_addr.s6_addr32[3] == v4->v4.sin_addr.s_addr;
+}
+
+/* Compare addresses exactly.
+ * v4-mapped-v6 is also in consideration.
+ *
+ * Returns 1 when the addresses are equal and 0 otherwise.  Addresses of
+ * different families are equal only when one is AF_INET and the other is
+ * its v4-mapped AF_INET6 form with the same port.  Link-local addresses
+ * additionally differ when both carry a non-zero scope id and the ids
+ * are not the same.
+ */
+static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
+			    const union sctp_addr *addr2)
+{
+	if (addr1->sa.sa_family != addr2->sa.sa_family)
+		return sctp_v6_match_v4mapped(addr1, addr2) ||
+		       sctp_v6_match_v4mapped(addr2, addr1);
+
+	if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
+		return 0;
+
+	/* If this is a linklocal address, compare the scope_id. */
+	if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+	    addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id &&
+	    addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)
+		return 0;
+
+	return 1;
+}
+
+/* Initialize addr to the all-zero (wildcard) IPv6 address with the given
+ * port.  The whole union is cleared first.
+ */
+static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port)
+{
+	struct sockaddr_in6 *sin6 = &addr->v6;
+
+	memset(addr, 0, sizeof(*addr));
+	sin6->sin6_port = port;
+	sin6->sin6_family = AF_INET6;
+}
+
+/* Is this a wildcard address?  Returns the result of ipv6_addr_any() on
+ * the address's sin6_addr.
+ */
+static int sctp_v6_is_any(const union sctp_addr *addr)
+{
+	return ipv6_addr_any(&addr->v6.sin6_addr);
+}
+
+/* Should this be available for binding?
+ *
+ * Returns non-zero when 'addr' may be bound:
+ *  - the wildcard address always may;
+ *  - a v4-mapped address is refused if the socket (when given) has
+ *    v4mapped disabled or is IPv6-only; otherwise the address is converted
+ *    to AF_INET in place and the AF_INET check decides;
+ *  - any other address must be unicast and pass ipv6_chk_addr().
+ */
+static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
+{
+	const struct in6_addr *in6 = &addr->v6.sin6_addr;
+	int type = ipv6_addr_type(in6);
+
+	if (type == IPV6_ADDR_ANY)
+		return 1;
+
+	if (type == IPV6_ADDR_MAPPED) {
+		if (sp && (!sp->v4mapped || ipv6_only_sock(sctp_opt2sk(sp))))
+			return 0;
+		sctp_v6_map_v4(addr);
+		return sctp_get_af_specific(AF_INET)->available(addr, sp);
+	}
+
+	if (type & IPV6_ADDR_UNICAST)
+		return ipv6_chk_addr(&init_net, in6, NULL, 0);
+
+	return 0;
+}
+
+/* This function checks if the address is a valid address to be used for
+ * SCTP.
+ *
+ * A v4-mapped address is converted to AF_INET in place and validated by
+ * the AF_INET handler, provided a socket is given that has v4mapped set
+ * and is not IPv6-only.
+ *
+ * Output:
+ * Return 0 - If the address is a non-unicast or an illegal address.
+ * Return 1 - If the address is a unicast.
+ */
+static int sctp_v6_addr_valid(union sctp_addr *addr,
+			      struct sctp_sock *sp,
+			      const struct sk_buff *skb)
+{
+	int type = ipv6_addr_type(&addr->v6.sin6_addr);
+
+	/* Support v4-mapped-v6 address. */
+	if (type == IPV6_ADDR_MAPPED) {
+		/* Note: This routine is used in input, so v4-mapped-v6
+		 * are disallowed here when there is no sctp_sock.
+		 */
+		if (!sp || !sp->v4mapped || ipv6_only_sock(sctp_opt2sk(sp)))
+			return 0;
+		sctp_v6_map_v4(addr);
+		return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
+	}
+
+	/* Only unicast addresses are acceptable. */
+	return (type & IPV6_ADDR_UNICAST) ? 1 : 0;
+}
+
+/* What is the scope of 'addr'?
+ *
+ * The IPv6 scope is really a set of bit fields.
+ * See IFA_* in <net/if_inet6.h>.  Map to a generic SCTP scope; anything
+ * that is not host, link or site scope is treated as global.
+ */
+static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
+{
+	switch (ipv6_addr_scope(&addr->v6.sin6_addr)) {
+	case IFA_HOST:
+		return SCTP_SCOPE_LOOPBACK;
+	case IFA_LINK:
+		return SCTP_SCOPE_LINK;
+	case IFA_SITE:
+		return SCTP_SCOPE_PRIVATE;
+	default:
+		return SCTP_SCOPE_GLOBAL;
+	}
+}
+
+/* Create and initialize a new sk for the socket to be returned by accept().
+ *
+ * The new socket is allocated with the parent's protocol, inherits the
+ * parent's settings through sctp_copy_sock() and a byte copy of the
+ * parent's ipv6_pinfo, and gets its daddr from the association's primary
+ * peer address.  Returns the new sock, or NULL if allocation or the
+ * protocol's init hook fails.
+ */
+static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
+					     struct sctp_association *asoc)
+{
+	struct sock *newsk;
+	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+	struct sctp6_sock *newsctp6sk;
+
+	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot);
+	if (!newsk)
+		goto out;
+
+	sock_init_data(NULL, newsk);
+
+	sctp_copy_sock(newsk, sk, asoc);
+	sock_reset_flag(sk, SOCK_ZAPPED); /* NOTE(review): clears the flag on the parent 'sk', not 'newsk' - confirm this is intended */
+
+	newsctp6sk = (struct sctp6_sock *)newsk;
+	inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; /* must be set before inet6_sk(newsk) is used below */
+
+	sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped;
+
+	newnp = inet6_sk(newsk);
+
+	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+	/* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
+	 * and getpeername().
+	 */
+	sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk);
+
+	sk_refcnt_debug_inc(newsk);
+
+	if (newsk->sk_prot->init(newsk)) {
+		sk_common_release(newsk);
+		newsk = NULL;
+	}
+
+out:
+	return newsk;
+}
+
+/* Map v4 address to mapped v6 address.
+ * Only done when the socket has v4mapped set and 'addr' is AF_INET;
+ * otherwise 'addr' is left unchanged.
+ */
+static void sctp_v6_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr)
+{
+	if (sp->v4mapped && AF_INET == addr->sa.sa_family)
+		sctp_v4_map_v6(addr);
+}
+
+/* Where did this skb come from?  Returns the 'iif' recorded in the
+ * inet6_skb_parm kept in the skb's control buffer.
+ */
+static int sctp_v6_skb_iif(const struct sk_buff *skb)
+{
+	return ((struct inet6_skb_parm *) skb->cb)->iif;
+}
+
+/* Was this packet marked by Explicit Congestion Notification?
+ *
+ * Returns non-zero only when both ECN bits of the traffic class are set,
+ * i.e. the Congestion Experienced codepoint.  Testing just the low ECN
+ * bit would also report ECT(1) packets, which carry no congestion mark.
+ */
+static int sctp_v6_is_ce(const struct sk_buff *skb)
+{
+	return INET_ECN_is_ce(ipv6_get_dsfield(ipv6_hdr(skb)));
+}
+
+/* Dump the v6 addr to the seq file, printed with %pI6 and followed by a
+ * single space.
+ */
+static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
+{
+	seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr);
+}
+
+/* Flag the socket as ECN capable by setting ECT(0) in its traffic class. */
+static void sctp_v6_ecn_capable(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	np->tclass |= INET_ECN_ECT_0;
+}
+
+/* Initialize a PF_INET6 socket msg_name.
+ *
+ * Sets the family and clears flow info and scope id in the sockaddr_in6
+ * at 'msgname', and reports its size through 'addr_len'.  Port and
+ * address are left for the caller to fill in.
+ */
+static void sctp_inet6_msgname(char *msgname, int *addr_len)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msgname;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_flowinfo = 0;
+	sin6->sin6_scope_id = 0; /*FIXME */
+	*addr_len = sizeof(*sin6);
+}
+
+/* Initialize a PF_INET6 msgname from a ulpevent.
+ *
+ * Fills 'msgname' with the port and primary address of the peer of the
+ * event's association.  An AF_INET primary address is reported in
+ * v4-mapped form when the socket has v4mapped set.  Does nothing when
+ * 'msgname' is NULL.
+ */
+static void sctp_inet6_event_msgname(struct sctp_ulpevent *event,
+				     char *msgname, int *addrlen)
+{
+	struct sctp_association *asoc;
+	struct sockaddr_in6 *sin6;
+	union sctp_addr *peer;
+
+	if (!msgname)
+		return;
+
+	asoc = event->asoc;
+	sctp_inet6_msgname(msgname, addrlen);
+	sin6 = (struct sockaddr_in6 *)msgname;
+	sin6->sin6_port = htons(asoc->peer.port);
+	peer = &asoc->peer.primary_addr;
+
+	/* Note: If we go to a common v6 format, this code
+	 * will change.
+	 */
+
+	/* Map ipv4 address into v4-mapped-on-v6 address.  */
+	if (sctp_sk(asoc->base.sk)->v4mapped &&
+	    AF_INET == peer->sa.sa_family) {
+		sctp_v4_map_v6((union sctp_addr *)sin6);
+		sin6->sin6_addr.s6_addr32[3] = peer->v4.sin_addr.s_addr;
+		return;
+	}
+
+	/* Plain v6 peer: copy it, keeping the scope id for link-local. */
+	ipv6_addr_copy(&sin6->sin6_addr, &peer->v6.sin6_addr);
+	if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		sin6->sin6_scope_id = peer->v6.sin6_scope_id;
+}
+
+/* Initialize a msg_name from an inbound skb.
+ *
+ * Fills 'msgname' with the packet's SCTP source port and IP source
+ * address.  An IPv4 packet is reported in v4-mapped form when the socket
+ * has v4mapped set.  For a link-local IPv6 source the scope id is taken
+ * from the event's 'iif'.  Does nothing when 'msgname' is NULL.
+ */
+static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname,
+				   int *addr_len)
+{
+	struct sockaddr_in6 *sin6;
+
+	if (!msgname)
+		return;
+
+	sctp_inet6_msgname(msgname, addr_len);
+	sin6 = (struct sockaddr_in6 *)msgname;
+	sin6->sin6_port = sctp_hdr(skb)->source;
+
+	/* Map ipv4 address into v4-mapped-on-v6 address. */
+	if (sctp_sk(skb->sk)->v4mapped &&
+	    ip_hdr(skb)->version == 4) {
+		sctp_v4_map_v6((union sctp_addr *)sin6);
+		sin6->sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr;
+		return;
+	}
+
+	/* Otherwise, just copy the v6 address. */
+	ipv6_addr_copy(&sin6->sin6_addr, &ipv6_hdr(skb)->saddr);
+	if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		sin6->sin6_scope_id = sctp_skb2event(skb)->iif;
+}
+
+/* Do we support this AF?
+ *
+ * AF_INET6 is always supported.  AF_INET (v4-mapped-v6 addresses) is
+ * supported unless the socket is IPv6-only.  Everything else is not.
+ */
+static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp)
+{
+	if (family == AF_INET6)
+		return 1;
+
+	if (family == AF_INET && !__ipv6_only_sock(sctp_opt2sk(sp)))
+		return 1;
+
+	return 0;
+}
+
+/* Address matching with wildcards allowed.  This extra level
+ * of indirection lets us choose whether a PF_INET6 should
+ * disallow any v4 addresses if we so choose.
+ *
+ * Returns 0 when either family has no sctp_af, when the socket is
+ * IPv6-only and the families differ, or when the families differ and
+ * neither address is a wildcard.  Returns 1 when either address is a
+ * wildcard.  Otherwise returns the result of the family's cmp_addr().
+ */
+static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
+			       const union sctp_addr *addr2,
+			       struct sctp_sock *opt)
+{
+	struct sctp_af *af1, *af2;
+	struct sock *sk = sctp_opt2sk(opt);
+
+	af1 = sctp_get_af_specific(addr1->sa.sa_family);
+	af2 = sctp_get_af_specific(addr2->sa.sa_family);
+
+	if (!af1 || !af2)
+		return 0;
+
+	/* If the socket is IPv6 only, v4 addrs will not match */
+	if (__ipv6_only_sock(sk) && af1 != af2)
+		return 0;
+
+	/* Today, wildcard AF_INET/AF_INET6. */
+	if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2))
+		return 1;
+
+	if (addr1->sa.sa_family != addr2->sa.sa_family)
+		return 0;
+
+	return af1->cmp_addr(addr1, addr2);
+}
+
+/* Verify that the provided sockaddr looks bindable.   Common verification,
+ * has already been taken care of.
+ *
+ * Non-AF_INET6 addresses are handed to their own family's available()
+ * check.  For AF_INET6, a link-local address must carry a scope id that
+ * names an existing device owning the address, and a v4-mapped address
+ * requires v4mapped on the socket; the socket's af then decides.
+ */
+static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+	struct net_device *dev;
+	struct sctp_af *af;
+	int type;
+	int usable;
+
+	/* ASSERT: address family has already been verified. */
+	if (addr->sa.sa_family != AF_INET6) {
+		af = sctp_get_af_specific(addr->sa.sa_family);
+		return af->available(addr, opt);
+	}
+
+	type = ipv6_addr_type(&addr->v6.sin6_addr);
+	if (type & IPV6_ADDR_LINKLOCAL) {
+		if (!addr->v6.sin6_scope_id)
+			return 0;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(&init_net, addr->v6.sin6_scope_id);
+		usable = dev && ipv6_chk_addr(&init_net, &addr->v6.sin6_addr,
+					      dev, 0);
+		rcu_read_unlock();
+
+		if (!usable)
+			return 0;
+	} else if (type == IPV6_ADDR_MAPPED && !opt->v4mapped) {
+		return 0;
+	}
+
+	af = opt->pf->af;
+	return af->available(addr, opt);
+}
+
+/* Verify that the provided sockaddr looks sendable.   Common verification,
+ * has already been taken care of.
+ *
+ * Returns non-zero when an sctp_af exists for the address.  An AF_INET6
+ * link-local destination additionally needs a scope id that names an
+ * existing device.
+ */
+static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+	struct net_device *dev;
+
+	/* ASSERT: address family has already been verified. */
+	if (addr->sa.sa_family != AF_INET6)
+		return sctp_get_af_specific(addr->sa.sa_family) != NULL;
+
+	if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
+		if (!addr->v6.sin6_scope_id)
+			return 0;
+
+		/* Only the existence of the device matters here. */
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(&init_net, addr->v6.sin6_scope_id);
+		rcu_read_unlock();
+		if (!dev)
+			return 0;
+	}
+
+	return opt->pf->af != NULL;
+}
+
+/* Fill in Supported Address Type information for INIT and INIT-ACK
+ * chunks.   Note: In the future, we may want to look at sock options
+ * to determine whether a PF_INET6 socket really wants to have IPV4
+ * addresses.
+ *
+ * IPv6 is always listed first; IPv4 is added unless a socket is given
+ * and it is IPv6-only.
+ * Returns number of addresses supported.
+ */
+static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
+				      __be16 *types)
+{
+	types[0] = SCTP_PARAM_IPV6_ADDRESS;
+
+	if (opt && ipv6_only_sock(sctp_opt2sk(opt)))
+		return 1;
+
+	types[1] = SCTP_PARAM_IPV4_ADDRESS;
+	return 2;
+}
+
+static const struct proto_ops inet6_seqpacket_ops = {	/* socket ops used by both SCTP PF_INET6 protosw entries below */
+	.family		   = PF_INET6,
+	.owner		   = THIS_MODULE,
+	.release	   = inet6_release,
+	.bind		   = inet6_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = inet_accept,
+	.getname	   = inet6_getname,
+	.poll		   = sctp_poll,
+	.ioctl		   = inet6_ioctl,
+	.listen		   = sctp_inet_listen,
+	.shutdown	   = inet_shutdown,
+	.setsockopt	   = sock_common_setsockopt,
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw sctpv6_seqpacket_protosw = {	/* SOCK_SEQPACKET flavour of SCTP over IPv6 */
+	.type          = SOCK_SEQPACKET,
+	.protocol      = IPPROTO_SCTP,
+	.prot 	       = &sctpv6_prot,
+	.ops           = &inet6_seqpacket_ops,
+	.no_check      = 0,
+	.flags         = SCTP_PROTOSW_FLAG
+};
+static struct inet_protosw sctpv6_stream_protosw = {	/* SOCK_STREAM flavour; shares inet6_seqpacket_ops */
+	.type          = SOCK_STREAM,
+	.protocol      = IPPROTO_SCTP,
+	.prot 	       = &sctpv6_prot,
+	.ops           = &inet6_seqpacket_ops,
+	.no_check      = 0,
+	.flags         = SCTP_PROTOSW_FLAG,
+};
+
+/* inet6 receive hook: hand the packet to sctp_rcv() and collapse its
+ * result to -1 (non-zero result) or 0.
+ */
+static int sctp6_rcv(struct sk_buff *skb)
+{
+	if (sctp_rcv(skb))
+		return -1;
+
+	return 0;
+}
+
+static const struct inet6_protocol sctpv6_protocol = {	/* registered for IPPROTO_SCTP by sctp_v6_add_protocol() */
+	.handler      = sctp6_rcv,
+	.err_handler  = sctp_v6_err,
+	.flags        = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+static struct sctp_af sctp_af_inet6 = {	/* AF_INET6 address-family operations, registered by sctp_v6_pf_init() */
+	.sa_family	   = AF_INET6,
+	.sctp_xmit	   = sctp_v6_xmit,
+	.setsockopt	   = ipv6_setsockopt,
+	.getsockopt	   = ipv6_getsockopt,
+	.get_dst	   = sctp_v6_get_dst,
+	.get_saddr	   = sctp_v6_get_saddr,
+	.copy_addrlist	   = sctp_v6_copy_addrlist,
+	.from_skb	   = sctp_v6_from_skb,
+	.from_sk	   = sctp_v6_from_sk,
+	.to_sk_saddr	   = sctp_v6_to_sk_saddr,
+	.to_sk_daddr	   = sctp_v6_to_sk_daddr,
+	.from_addr_param   = sctp_v6_from_addr_param,
+	.to_addr_param	   = sctp_v6_to_addr_param,
+	.cmp_addr	   = sctp_v6_cmp_addr,
+	.scope		   = sctp_v6_scope,
+	.addr_valid	   = sctp_v6_addr_valid,
+	.inaddr_any	   = sctp_v6_inaddr_any,
+	.is_any		   = sctp_v6_is_any,
+	.available	   = sctp_v6_available,
+	.skb_iif	   = sctp_v6_skb_iif,
+	.is_ce		   = sctp_v6_is_ce,
+	.seq_dump_addr	   = sctp_v6_seq_dump_addr,
+	.ecn_capable	   = sctp_v6_ecn_capable,
+	.net_header_len	   = sizeof(struct ipv6hdr),
+	.sockaddr_len	   = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ipv6_setsockopt,
+	.compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+};
+
+static struct sctp_pf sctp_pf_inet6 = {	/* PF_INET6 protocol-family operations, registered by sctp_v6_pf_init() */
+	.event_msgname = sctp_inet6_event_msgname,
+	.skb_msgname   = sctp_inet6_skb_msgname,
+	.af_supported  = sctp_inet6_af_supported,
+	.cmp_addr      = sctp_inet6_cmp_addr,
+	.bind_verify   = sctp_inet6_bind_verify,
+	.send_verify   = sctp_inet6_send_verify,
+	.supported_addrs = sctp_inet6_supported_addrs,
+	.create_accept_sk = sctp_v6_create_accept_sk,
+	.addr_v4map    = sctp_v6_addr_v4map,
+	.af            = &sctp_af_inet6,
+};
+
+/* Register the SCTP IPv6 protocol-family (sctp_pf_inet6) and
+ * address-family (sctp_af_inet6) operation tables.
+ */
+void sctp_v6_pf_init(void)
+{
+	/* Register the SCTP specific PF_INET6 functions. */
+	sctp_register_pf(&sctp_pf_inet6, PF_INET6);
+
+	/* Register the SCTP specific AF_INET6 functions. */
+	sctp_register_af(&sctp_af_inet6);
+}
+
+void sctp_v6_pf_exit(void)
+{
+	list_del(&sctp_af_inet6.list);	/* unlink the AF_INET6 sctp_af; NOTE(review): sctp_pf_inet6 is not touched here - confirm nothing else is needed */
+}
+
+/* Register the SCTPv6 proto and its two socket-type switch entries.
+ *
+ * Returns 0 on success or the error from proto_register(), in which case
+ * no protosw entry is registered.  The results of the protosw
+ * registrations themselves are not checked.
+ */
+int sctp_v6_protosw_init(void)
+{
+	int rc = proto_register(&sctpv6_prot, 1);
+
+	if (!rc) {
+		/* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */
+		inet6_register_protosw(&sctpv6_seqpacket_protosw);
+		inet6_register_protosw(&sctpv6_stream_protosw);
+	}
+
+	return rc;
+}
+
+void sctp_v6_protosw_exit(void)
+{
+	inet6_unregister_protosw(&sctpv6_seqpacket_protosw);	/* undo sctp_v6_protosw_init() */
+	inet6_unregister_protosw(&sctpv6_stream_protosw);
+	proto_unregister(&sctpv6_prot);
+}
+
+
+/* Register with inet6 layer.
+ *
+ * Installs the inet6 address notifier and then the IPPROTO_SCTP protocol
+ * handler.  Returns 0 on success or -EAGAIN if the handler could not be
+ * added; the notifier stays registered in that case.
+ */
+int sctp_v6_add_protocol(void)
+{
+	int err;
+
+	/* Register notifier for inet6 address additions/deletions. */
+	register_inet6addr_notifier(&sctp_inet6addr_notifier);
+
+	err = inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP);
+
+	return (err < 0) ? -EAGAIN : 0;
+}
+
+/* Unregister with inet6 layer: remove the IPPROTO_SCTP protocol handler
+ * and then the inet6 address notifier.
+ */
+void sctp_v6_del_protocol(void)
+{
+	inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP);
+	unregister_inet6addr_notifier(&sctp_inet6addr_notifier);
+}
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
new file mode 100644
index 00000000..8ef8e7d9
--- /dev/null
+++ b/net/sctp/objcnt.c
@@ -0,0 +1,148 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * Support for memory object debugging.  This allows one to monitor the
+ * object allocations/deallocations for types instrumented for this
+ * via the proc fs.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <net/sctp/sctp.h>
+
+/*
+ * Global counters to count raw object allocation counts.
+ * To add new counters, choose a unique suffix for the variable
+ * name as the helper macros key off this suffix to make
+ * life easier for the programmer.
+ *
+ * A counter is only reported through procfs if its suffix is also
+ * listed in the sctp_dbg_objcnt[] table that follows.
+ */
+
+SCTP_DBG_OBJCNT(sock);
+SCTP_DBG_OBJCNT(ep);
+SCTP_DBG_OBJCNT(transport);
+SCTP_DBG_OBJCNT(assoc);
+SCTP_DBG_OBJCNT(bind_addr);
+SCTP_DBG_OBJCNT(bind_bucket);
+SCTP_DBG_OBJCNT(chunk);
+SCTP_DBG_OBJCNT(addr);
+SCTP_DBG_OBJCNT(ssnmap);
+SCTP_DBG_OBJCNT(datamsg);
+SCTP_DBG_OBJCNT(keys);
+
+/* An array to make it easy to pretty print the debug information
+ * to the proc fs.
+ *
+ * sctp_objcnt_seq_show() indexes this table by sequence position, so
+ * the order of the entries here is the order of the lines in the
+ * output (it differs from the order the counters are defined in).
+ */
+static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
+	SCTP_DBG_OBJCNT_ENTRY(sock),
+	SCTP_DBG_OBJCNT_ENTRY(ep),
+	SCTP_DBG_OBJCNT_ENTRY(assoc),
+	SCTP_DBG_OBJCNT_ENTRY(transport),
+	SCTP_DBG_OBJCNT_ENTRY(chunk),
+	SCTP_DBG_OBJCNT_ENTRY(bind_addr),
+	SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
+	SCTP_DBG_OBJCNT_ENTRY(addr),
+	SCTP_DBG_OBJCNT_ENTRY(ssnmap),
+	SCTP_DBG_OBJCNT_ENTRY(datamsg),
+	SCTP_DBG_OBJCNT_ENTRY(keys),
+};
+
+/* Callback from procfs to read out objcount information.
+ *
+ * @v is the position pointer handed back by the start/next callbacks;
+ * it selects one entry of sctp_dbg_objcnt[].  The entry is printed as
+ * "<label>: <count>" and then padded with spaces so that label, count
+ * and padding together occupy 127 characters, followed by a newline.
+ *
+ * Always returns 0.
+ */
+static int sctp_objcnt_seq_show(struct seq_file *seq, void *v)
+{
+	/* len is filled in through %n.  Start it at 0 so the padding width
+	 * below never reads an uninitialised value should the first
+	 * seq_printf() not get as far as processing the format.
+	 */
+	int i, len = 0;
+
+	i = (int)*(loff_t *)v;
+	seq_printf(seq, "%s: %d%n", sctp_dbg_objcnt[i].label,
+				atomic_read(sctp_dbg_objcnt[i].counter), &len);
+	seq_printf(seq, "%*s\n", 127 - len, "");
+	return 0;
+}
+
+/* seq_file start callback: the iterator is simply the position pointer,
+ * valid for as long as *pos indexes an entry of sctp_dbg_objcnt[].
+ */
+static void *sctp_objcnt_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos < ARRAY_SIZE(sctp_dbg_objcnt))
+		return pos;
+
+	return NULL;
+}
+
+/* seq_file stop callback.  Intentionally empty: the start/next callbacks
+ * in this file only hand out the position pointer, so there is nothing
+ * to release here.
+ */
+static void sctp_objcnt_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/* seq_file next callback: advance the position by one and return it as
+ * the iterator, or NULL once it runs past the end of sctp_dbg_objcnt[].
+ */
+static void *sctp_objcnt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	*pos += 1;
+
+	if (*pos < ARRAY_SIZE(sctp_dbg_objcnt))
+		return pos;
+
+	return NULL;
+}
+
+/* Iterator callbacks passed to seq_open() by sctp_objcnt_seq_open(). */
+static const struct seq_operations sctp_objcnt_seq_ops = {
+	.start = sctp_objcnt_seq_start,
+	.next  = sctp_objcnt_seq_next,
+	.stop  = sctp_objcnt_seq_stop,
+	.show  = sctp_objcnt_seq_show,
+};
+
+/* open() handler for the proc entry: attach the objcnt seq_file iterator
+ * to @file and return seq_open()'s result unchanged.
+ */
+static int sctp_objcnt_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sctp_objcnt_seq_ops);
+}
+
+/* File operations for the "sctp_dbg_objcnt" proc entry; everything but
+ * open is delegated to the generic seq_file helpers.
+ */
+static const struct file_operations sctp_objcnt_ops = {
+	.open	 = sctp_objcnt_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+/* Create the "sctp_dbg_objcnt" entry under proc_net_sctp.  Failure is
+ * not fatal: it is only reported with a warning.
+ */
+void sctp_dbg_objcnt_init(void)
+{
+	if (!proc_create("sctp_dbg_objcnt", 0, proc_net_sctp,
+			 &sctp_objcnt_ops))
+		pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
+}
+
+/* Remove the "sctp_dbg_objcnt" entry from proc_net_sctp again. */
+void sctp_dbg_objcnt_exit(void)
+{
+	remove_proc_entry("sctp_dbg_objcnt", proc_net_sctp);
+}
+
+
diff --git a/net/sctp/output.c b/net/sctp/output.c
new file mode 100644
index 00000000..8fc4dcd2
--- /dev/null
+++ b/net/sctp/output.c
@@ -0,0 +1,751 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions handle output processing.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@austin.ibm.com>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/net_namespace.h>
+
+#include <linux/socket.h> /* for sa_family_t */
+#include <net/sock.h>
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/checksum.h>
+
+/* Forward declarations for private helpers.  They are defined after
+ * sctp_packet_transmit(), near the end of this file, but are used by
+ * sctp_packet_append_chunk() before that point.
+ */
+static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
+					   struct sctp_chunk *chunk);
+static void sctp_packet_append_data(struct sctp_packet *packet,
+					   struct sctp_chunk *chunk);
+static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
+					struct sctp_chunk *chunk,
+					u16 chunk_len);
+
+/* Put a packet back into its "nothing bundled yet" state: all bundling
+ * flags and the AUTH pointer are cleared and the size drops back to the
+ * fixed header overhead.  The chunk list itself is not touched.
+ */
+static void sctp_packet_reset(struct sctp_packet *packet)
+{
+	packet->auth = NULL;
+	packet->ipfragok = 0;
+	packet->has_auth = 0;
+	packet->has_data = 0;
+	packet->has_sack = 0;
+	packet->has_cookie_echo = 0;
+	packet->size = packet->overhead;
+}
+
+/* Configure a packet for sending.
+ *
+ * Stores the verification tag and, when the caller reports the
+ * association as ECN capable and the packet is still empty, asks the
+ * association for a chunk to prepend and appends it before anything
+ * else gets added.  Returns @packet.
+ */
+struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
+				       __u32 vtag, int ecn_capable)
+{
+	struct sctp_chunk *ecne;
+
+	SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__,
+			  packet, vtag);
+
+	packet->vtag = vtag;
+
+	/* Nothing to prepend unless ECN is in use and the packet is empty. */
+	if (!ecn_capable || !sctp_packet_empty(packet))
+		return packet;
+
+	ecne = sctp_get_ecne_prepend(packet->transport->asoc);
+	if (ecne)
+		sctp_packet_append_chunk(packet, ecne);
+
+	return packet;
+}
+
+/* Initialize the packet structure.
+ *
+ * Binds the packet to @transport and the given ports, empties its chunk
+ * list and computes the fixed per-packet overhead: the SCTP common
+ * header plus the network header, whose length comes from the socket's
+ * address family when the transport has an association and is taken as
+ * an IPv6 header otherwise.  Returns @packet.
+ */
+struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
+				     struct sctp_transport *transport,
+				     __u16 sport, __u16 dport)
+{
+	struct sctp_association *asoc = transport->asoc;
+	size_t net_hdr_len;
+
+	SCTP_DEBUG_PRINTK("%s: packet:%p transport:%p\n", __func__,
+			  packet, transport);
+
+	packet->transport = transport;
+	packet->source_port = sport;
+	packet->destination_port = dport;
+	INIT_LIST_HEAD(&packet->chunk_list);
+
+	if (asoc)
+		net_hdr_len = sctp_sk(asoc->base.sk)->pf->af->net_header_len;
+	else
+		net_hdr_len = sizeof(struct ipv6hdr);
+
+	/* sctp_packet_reset() seeds packet->size from the overhead, so the
+	 * overhead has to be in place before it runs.
+	 */
+	packet->overhead = net_hdr_len + sizeof(struct sctphdr);
+	sctp_packet_reset(packet);
+	packet->vtag = 0;
+	packet->malloced = 0;
+
+	return packet;
+}
+
+/* Free a packet: every chunk still queued on it is unlinked and freed,
+ * and the packet itself is kfree()d if it is marked as malloced.
+ */
+void sctp_packet_free(struct sctp_packet *packet)
+{
+	struct sctp_chunk *victim;
+
+	SCTP_DEBUG_PRINTK("%s: packet:%p\n", __func__, packet);
+
+	/* Drain the list from the front until nothing is left. */
+	while (!list_empty(&packet->chunk_list)) {
+		victim = list_entry(packet->chunk_list.next,
+				    struct sctp_chunk, list);
+		list_del_init(&victim->list);
+		sctp_chunk_free(victim);
+	}
+
+	if (packet->malloced)
+		kfree(packet);
+}
+
+/* Try to append @chunk to @packet, flushing the packet first if needed.
+ *
+ * If the chunk does not fit within the path MTU and the packet holds no
+ * COOKIE_ECHO, the packet is transmitted as it stands; a transmit error
+ * is recorded in sk_err of the socket owning the chunk's skb.  Unless
+ * @one_packet is set, the append is then retried on the now empty
+ * packet.  A packet that carries a COOKIE_ECHO is never flushed here:
+ * data that does not fit has to wait until the COOKIE_ACK has arrived.
+ *
+ * Returns the status of the last append attempt.
+ */
+sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
+				       struct sctp_chunk *chunk,
+				       int one_packet)
+{
+	sctp_xmit_t status;
+	int xmit_err;
+
+	SCTP_DEBUG_PRINTK("%s: packet:%p chunk:%p\n", __func__,
+			  packet, chunk);
+
+	status = sctp_packet_append_chunk(packet, chunk);
+
+	/* Only a full packet without COOKIE_ECHO needs further work. */
+	if (status != SCTP_XMIT_PMTU_FULL || packet->has_cookie_echo)
+		return status;
+
+	xmit_err = sctp_packet_transmit(packet);
+	if (xmit_err < 0)
+		chunk->skb->sk->sk_err = -xmit_err;
+
+	/* If we have an empty packet, then we can NOT ever
+	 * return PMTU_FULL.
+	 */
+	if (!one_packet)
+		status = sctp_packet_append_chunk(packet, chunk);
+
+	return status;
+}
+
+/* Try to bundle an auth chunk into the packet.
+ *
+ * Called while @chunk is being appended to @pkt.  If @chunk has to be
+ * authenticated and the packet does not carry an AUTH chunk yet, a new
+ * AUTH chunk is created and appended ahead of it.
+ *
+ * Returns SCTP_XMIT_OK when nothing had to be done, when no AUTH chunk
+ * could be allocated, or when the AUTH chunk was appended; otherwise the
+ * status reported by sctp_packet_append_chunk() for the AUTH chunk.
+ */
+static sctp_xmit_t sctp_packet_bundle_auth(struct sctp_packet *pkt,
+					   struct sctp_chunk *chunk)
+{
+	struct sctp_association *asoc = pkt->transport->asoc;
+	struct sctp_chunk *auth;
+	sctp_xmit_t retval = SCTP_XMIT_OK;
+
+	/* if we don't have an association, we can't do authentication */
+	if (!asoc)
+		return retval;
+
+	/* See if this is an auth chunk we are bundling or if
+	 * auth is already bundled.
+	 */
+	if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth)
+		return retval;
+
+	/* if the peer did not request this chunk to be authenticated,
+	 * don't do it
+	 */
+	if (!chunk->auth)
+		return retval;
+
+	auth = sctp_make_auth(asoc);
+	if (!auth)
+		return retval;
+
+	retval = sctp_packet_append_chunk(pkt, auth);
+
+	/* The packet only takes the chunk over when the append succeeds.
+	 * On any other status it was never linked into the chunk list, so
+	 * it has to be released here or it would be leaked.
+	 */
+	if (retval != SCTP_XMIT_OK)
+		sctp_chunk_free(auth);
+
+	return retval;
+}
+
+/* Try to bundle a SACK with the packet.
+ *
+ * Only acts when @chunk is DATA and the packet holds neither a SACK nor
+ * a COOKIE_ECHO.  If the association's SACK timer is pending, a SACK is
+ * built and appended; once it is in the packet the pending-SACK state
+ * is cleared and the timer is cancelled (dropping the association
+ * reference the timer held).
+ *
+ * Returns SCTP_XMIT_OK when nothing had to be done, when no SACK could
+ * be allocated, or when the SACK was appended; otherwise the status
+ * reported by sctp_packet_append_chunk() for the SACK.
+ */
+static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt,
+					   struct sctp_chunk *chunk)
+{
+	sctp_xmit_t retval = SCTP_XMIT_OK;
+
+	/* If sending DATA and haven't already bundled a SACK, try to
+	 * bundle one in to the packet.
+	 */
+	if (sctp_chunk_is_data(chunk) && !pkt->has_sack &&
+	    !pkt->has_cookie_echo) {
+		struct sctp_association *asoc;
+		struct timer_list *timer;
+		asoc = pkt->transport->asoc;
+		timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
+
+		/* If the SACK timer is running, we have a pending SACK */
+		if (timer_pending(timer)) {
+			struct sctp_chunk *sack;
+			asoc->a_rwnd = asoc->rwnd;
+			sack = sctp_make_sack(asoc);
+			if (sack) {
+				retval = sctp_packet_append_chunk(pkt, sack);
+				if (retval != SCTP_XMIT_OK) {
+					/* The SACK never made it into the
+					 * packet: release it and leave the
+					 * pending-SACK state and the timer
+					 * alone so the SACK is still sent
+					 * later.
+					 */
+					sctp_chunk_free(sack);
+					return retval;
+				}
+				asoc->peer.sack_needed = 0;
+				if (del_timer(timer))
+					sctp_association_put(asoc);
+			}
+		}
+	}
+	return retval;
+}
+
+/* Append a chunk to the offered packet reporting back any inability to do
+ * so.
+ *
+ * The checks run in a fixed order: DATA admission (DATA chunks only),
+ * AUTH bundling, SACK bundling, and finally the size check.  The first
+ * one that does not report SCTP_XMIT_OK ends the call with that status
+ * and @chunk is not added.
+ *
+ * On SCTP_XMIT_OK the chunk has been linked to the tail of the packet's
+ * chunk list, the packet size has grown by the chunk length rounded up
+ * with WORD_ROUND(), and chunk->transport points at the packet's
+ * transport.
+ */
+sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
+				     struct sctp_chunk *chunk)
+{
+	sctp_xmit_t retval = SCTP_XMIT_OK;
+	__u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length));
+
+	SCTP_DEBUG_PRINTK("%s: packet:%p chunk:%p\n", __func__, packet,
+			  chunk);
+
+	/* Data chunks are special.  Before seeing what else we can
+	 * bundle into this packet, check to see if we are allowed to
+	 * send this DATA.
+	 */
+	if (sctp_chunk_is_data(chunk)) {
+		retval = sctp_packet_can_append_data(packet, chunk);
+		if (retval != SCTP_XMIT_OK)
+			goto finish;
+	}
+
+	/* Try to bundle AUTH chunk */
+	retval = sctp_packet_bundle_auth(packet, chunk);
+	if (retval != SCTP_XMIT_OK)
+		goto finish;
+
+	/* Try to bundle SACK chunk */
+	retval = sctp_packet_bundle_sack(packet, chunk);
+	if (retval != SCTP_XMIT_OK)
+		goto finish;
+
+	/* Check to see if this chunk will fit into the packet */
+	retval = sctp_packet_will_fit(packet, chunk, chunk_len);
+	if (retval != SCTP_XMIT_OK)
+		goto finish;
+
+	/* We believe that this chunk is OK to add to the packet */
+	switch (chunk->chunk_hdr->type) {
+	    case SCTP_CID_DATA:
+		/* Account for the data being in the packet */
+		sctp_packet_append_data(packet, chunk);
+		/* Disallow SACK bundling after DATA. */
+		packet->has_sack = 1;
+		/* Disallow AUTH bundling after DATA */
+		packet->has_auth = 1;
+		/* Let it be known that packet has DATA in it */
+		packet->has_data = 1;
+		/* timestamp the chunk for rtx purposes */
+		chunk->sent_at = jiffies;
+		break;
+	    case SCTP_CID_COOKIE_ECHO:
+		packet->has_cookie_echo = 1;
+		break;
+
+	    case SCTP_CID_SACK:
+		packet->has_sack = 1;
+		break;
+
+	    case SCTP_CID_AUTH:
+		packet->has_auth = 1;
+		/* Remembered so sctp_packet_transmit() can find the AUTH
+		 * chunk's position in the outgoing skb.
+		 */
+		packet->auth = chunk;
+		break;
+	}
+
+	/* It is OK to send this chunk.  */
+	list_add_tail(&chunk->list, &packet->chunk_list);
+	packet->size += chunk_len;
+	chunk->transport = packet->transport;
+finish:
+	return retval;
+}
+
+/* All packets are sent to the network through this function from
+ * sctp_outq_tail().
+ *
+ * Builds one skb out of every chunk queued on @packet (SCTP common
+ * header first, then the chunks in list order, each padded to a
+ * multiple of 4 bytes), fills in the AUTH HMAC and the checksum where
+ * required, and passes the skb to the transport's address-family
+ * sctp_xmit handler.
+ *
+ * Chunk ownership: every chunk is unlinked from the packet.  Control
+ * chunks are freed here; DATA chunks are left alone so that they can be
+ * released once acknowledged or failed.
+ *
+ * The return value is a normal kernel error return value: 0 when the
+ * packet had no chunks, when the skb was handed to sctp_xmit (whose own
+ * result is not inspected), and also when no route was available (see
+ * the FIXME at no_route); -ENOMEM when the skb could not be allocated.
+ * Except for the empty-packet early return, the packet is reset before
+ * returning.
+ */
+int sctp_packet_transmit(struct sctp_packet *packet)
+{
+	struct sctp_transport *tp = packet->transport;
+	struct sctp_association *asoc = tp->asoc;
+	struct sctphdr *sh;
+	struct sk_buff *nskb;
+	struct sctp_chunk *chunk, *tmp;
+	struct sock *sk;
+	int err = 0;
+	int padding;		/* How much padding do we need?  */
+	__u8 has_data = 0;
+	struct dst_entry *dst = tp->dst;
+	unsigned char *auth = NULL;	/* pointer to auth in skb data */
+	__u32 cksum_buf_len = sizeof(struct sctphdr);
+
+	SCTP_DEBUG_PRINTK("%s: packet:%p\n", __func__, packet);
+
+	/* Do NOT generate a chunkless packet. */
+	if (list_empty(&packet->chunk_list))
+		return err;
+
+	/* Set up convenience variables... */
+	chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
+	sk = chunk->skb->sk;
+
+	/* Allocate the new skb.  packet->size already contains
+	 * packet->overhead (see sctp_packet_reset()), so this covers the
+	 * headers, all padded chunks and the link-layer headroom.
+	 */
+	nskb = alloc_skb(packet->size + LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		goto nomem;
+
+	/* Make sure the outbound skb has enough header room reserved. */
+	skb_reserve(nskb, packet->overhead + LL_MAX_HEADER);
+
+	/* Set the owning socket so that we know where to get the
+	 * destination IP address.
+	 */
+	skb_set_owner_w(nskb, sk);
+
+	/* Re-route if the cached dst is no longer valid. */
+	if (!sctp_transport_dst_check(tp)) {
+		sctp_transport_route(tp, NULL, sctp_sk(sk));
+		if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
+			sctp_assoc_sync_pmtu(asoc);
+		}
+	}
+	dst = dst_clone(tp->dst);
+	skb_dst_set(nskb, dst);
+	if (!dst)
+		goto no_route;
+
+	/* Build the SCTP header.  */
+	sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr));
+	skb_reset_transport_header(nskb);
+	sh->source = htons(packet->source_port);
+	sh->dest   = htons(packet->destination_port);
+
+	/* From 6.8 Adler-32 Checksum Calculation:
+	 * After the packet is constructed (containing the SCTP common
+	 * header and one or more control or DATA chunks), the
+	 * transmitter shall:
+	 *
+	 * 1) Fill in the proper Verification Tag in the SCTP common
+	 *    header and initialize the checksum field to 0's.
+	 */
+	sh->vtag     = htonl(packet->vtag);
+	sh->checksum = 0;
+
+	/**
+	 * 6.10 Bundling
+	 *
+	 *    An endpoint bundles chunks by simply including multiple
+	 *    chunks in one outbound SCTP packet.  ...
+	 */
+
+	/**
+	 * 3.2  Chunk Field Descriptions
+	 *
+	 * The total length of a chunk (including Type, Length and
+	 * Value fields) MUST be a multiple of 4 bytes.  If the length
+	 * of the chunk is not a multiple of 4 bytes, the sender MUST
+	 * pad the chunk with all zero bytes and this padding is not
+	 * included in the chunk length field.  The sender should
+	 * never pad with more than 3 bytes.
+	 *
+	 * [This whole comment explains WORD_ROUND() below.]
+	 */
+	SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
+	list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+		list_del_init(&chunk->list);
+		if (sctp_chunk_is_data(chunk)) {
+			/* 6.3.1 C4) When data is in flight and when allowed
+			 * by rule C5, a new RTT measurement MUST be made each
+			 * round trip.  Furthermore, new RTT measurements
+			 * SHOULD be made no more than once per round-trip
+			 * for a given destination transport address.
+			 */
+
+			if (!tp->rto_pending) {
+				chunk->rtt_in_progress = 1;
+				tp->rto_pending = 1;
+			}
+			has_data = 1;
+		}
+
+		/* Zero-pad the chunk's own skb up to a 4-byte boundary
+		 * before it is copied.
+		 */
+		padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len;
+		if (padding)
+			memset(skb_put(chunk->skb, padding), 0, padding);
+
+		/* if this is the auth chunk that we are adding,
+		 * store pointer where it will be added and put
+		 * the auth into the packet.
+		 */
+		if (chunk == packet->auth)
+			auth = skb_tail_pointer(nskb);
+
+		cksum_buf_len += chunk->skb->len;
+		memcpy(skb_put(nskb, chunk->skb->len),
+			       chunk->skb->data, chunk->skb->len);
+
+		SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n",
+				  "*** Chunk", chunk,
+				  sctp_cname(SCTP_ST_CHUNK(
+					  chunk->chunk_hdr->type)),
+				  chunk->has_tsn ? "TSN" : "No TSN",
+				  chunk->has_tsn ?
+				  ntohl(chunk->subh.data_hdr->tsn) : 0,
+				  "length", ntohs(chunk->chunk_hdr->length),
+				  "chunk->skb->len", chunk->skb->len,
+				  "rtt_in_progress", chunk->rtt_in_progress);
+
+		/*
+		 * If this is a control chunk, this is our last
+		 * reference. Free data chunks after they've been
+		 * acknowledged or have failed.
+		 */
+		if (!sctp_chunk_is_data(chunk))
+			sctp_chunk_free(chunk);
+	}
+
+	/* SCTP-AUTH, Section 6.2
+	 *    The sender MUST calculate the MAC as described in RFC2104 [2]
+	 *    using the hash function H as described by the MAC Identifier and
+	 *    the shared association key K based on the endpoint pair shared key
+	 *    described by the shared key identifier.  The 'data' used for the
+	 *    computation of the AUTH-chunk is given by the AUTH chunk with its
+	 *    HMAC field set to zero (as shown in Figure 6) followed by all
+	 *    chunks that are placed after the AUTH chunk in the SCTP packet.
+	 */
+	if (auth)
+		sctp_auth_calculate_hmac(asoc, nskb,
+					(struct sctp_auth_chunk *)auth,
+					GFP_ATOMIC);
+
+	/* 2) Calculate the Adler-32 checksum of the whole packet,
+	 *    including the SCTP common header and all the
+	 *    chunks.
+	 *
+	 * Note: Adler-32 is no longer applicable, as it has been replaced
+	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
+	 */
+	if (!sctp_checksum_disable) {
+		if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) {
+			__u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
+
+			/* 3) Put the resultant value into the checksum field in the
+			 *    common header, and leave the rest of the bits unchanged.
+			 */
+			sh->checksum = sctp_end_cksum(crc32);
+		} else {
+			/* no need to seed pseudo checksum for SCTP */
+			nskb->ip_summed = CHECKSUM_PARTIAL;
+			nskb->csum_start = (skb_transport_header(nskb) -
+			                    nskb->head);
+			nskb->csum_offset = offsetof(struct sctphdr, checksum);
+		}
+	}
+
+	/* IP layer ECN support
+	 * From RFC 2481
+	 *  "The ECN-Capable Transport (ECT) bit would be set by the
+	 *   data sender to indicate that the end-points of the
+	 *   transport protocol are ECN-capable."
+	 *
+	 * Now setting the ECT bit all the time, as it should not cause
+	 * any problems protocol-wise even if our peer ignores it.
+	 *
+	 * Note: The IPv6 layer checks this bit too, later in
+	 * transmission.  See IP6_ECN_flow_xmit().
+	 */
+	(*tp->af_specific->ecn_capable)(nskb->sk);
+
+	/* Set up the IP options.  */
+	/* BUG: not implemented
+	 * For v4 this all lives somewhere in sk->sk_opt...
+	 */
+
+	/* Dump that on IP!  */
+	if (asoc && asoc->peer.last_sent_to != tp) {
+		/* Considering the multiple CPU scenario, this is a
+		 * "correcter" place for last_sent_to.  --xguo
+		 */
+		asoc->peer.last_sent_to = tp;
+	}
+
+	if (has_data) {
+		struct timer_list *timer;
+		unsigned long timeout;
+
+		/* Restart the AUTOCLOSE timer when sending data. */
+		if (sctp_state(asoc, ESTABLISHED) && asoc->autoclose) {
+			timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+			timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
+
+			/* A timer that was not pending before takes a new
+			 * reference on the association.
+			 */
+			if (!mod_timer(timer, jiffies + timeout))
+				sctp_association_hold(asoc);
+		}
+	}
+
+	SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
+			  nskb->len);
+
+	nskb->local_df = packet->ipfragok;
+	(*tp->af_specific->sctp_xmit)(nskb, tp);
+
+out:
+	sctp_packet_reset(packet);
+	return err;
+no_route:
+	kfree_skb(nskb);
+	IP_INC_STATS_BH(&init_net, IPSTATS_MIB_OUTNOROUTES);
+
+	/* FIXME: Returning the 'err' will affect all the associations
+	 * associated with a socket, although only one of the paths of the
+	 * association is unreachable.
+	 * The real failure of a transport or association can be passed on
+	 * to the user via notifications. So setting this error may not be
+	 * required.
+	 */
+	 /* err = -EHOSTUNREACH; */
+err:
+	/* Control chunks are unreliable so just drop them.  DATA chunks
+	 * will get resent or dropped later.
+	 */
+
+	list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+		list_del_init(&chunk->list);
+		if (!sctp_chunk_is_data(chunk))
+			sctp_chunk_free(chunk);
+	}
+	goto out;
+nomem:
+	err = -ENOMEM;
+	goto err;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Decide whether a DATA chunk may be added to the packet right now.
+ *
+ * Returns SCTP_XMIT_OK when the chunk may go out, SCTP_XMIT_RWND_FULL
+ * when the peer's receive window or this transport's congestion window
+ * forbids it, and SCTP_XMIT_NAGLE_DELAY when the chunk should wait so
+ * that it can be bundled with more data.
+ */
+static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
+					   struct sctp_chunk *chunk)
+{
+	struct sctp_transport *tp = packet->transport;
+	struct sctp_association *asoc = tp->asoc;
+	struct sctp_outq *outq = &asoc->outqueue;
+	size_t peer_rwnd = asoc->peer.rwnd;
+	size_t outstanding = outq->outstanding_bytes;
+	size_t on_transport = tp->flight_size;
+	size_t payload = sctp_data_size(chunk);
+
+	/* RFC 2960 6.1 rule A: no new data once the peer's rwnd is used
+	 * up, except that one DATA chunk may always be in flight so that a
+	 * window update lost with a SACK can still be discovered.  So only
+	 * refuse when something is already outstanding.
+	 */
+	if (payload > peer_rwnd && outstanding > 0)
+		return SCTP_XMIT_RWND_FULL;
+
+	/* RFC 2960 6.1 rule B: no new data to a transport that already has
+	 * cwnd or more bytes outstanding.  RFC 7.2.4 and the Implementers
+	 * Guide 2.8: a fast retransmit SHOULD ignore cwnd and SHOULD NOT be
+	 * delayed, so it skips this check.
+	 */
+	if (chunk->fast_retransmit != SCTP_NEED_FRTX &&
+	    on_transport >= tp->cwnd)
+		return SCTP_XMIT_RWND_FULL;
+
+	/* Nagle: while earlier data is still unacknowledged, hold back a
+	 * chunk that would start a new packet unless it, together with the
+	 * rest of the queued data, fills a packet.  Messages flagged as not
+	 * delayable (e.g. large writes that were fragmented into small
+	 * pieces) are never held back.
+	 */
+	if (!sctp_sk(asoc->base.sk)->nodelay && sctp_packet_empty(packet) &&
+	    outstanding && sctp_state(asoc, ESTABLISHED)) {
+		unsigned room = tp->pathmtu - packet->overhead;
+		unsigned pending = chunk->skb->len + outq->out_qlen;
+
+		if (pending < room && chunk->msg->can_delay)
+			return SCTP_XMIT_NAGLE_DELAY;
+	}
+
+	return SCTP_XMIT_OK;
+}
+
+/* Bookkeeping for a DATA chunk that has been accepted into a packet:
+ * charge its size to the transport and the association, shrink our view
+ * of the peer's receive window, and give the chunk its TSN and SSN.
+ */
+static void sctp_packet_append_data(struct sctp_packet *packet,
+				struct sctp_chunk *chunk)
+{
+	struct sctp_transport *tp = packet->transport;
+	struct sctp_association *asoc = tp->asoc;
+	size_t payload = sctp_data_size(chunk);
+	u32 peer_rwnd = asoc->peer.rwnd;
+
+	/* Bytes in flight on this transport and to the receiver overall. */
+	tp->flight_size += payload;
+	asoc->outqueue.outstanding_bytes += payload;
+
+	/* The peer's window shrinks by the payload but never below zero. */
+	asoc->peer.rwnd = (payload < peer_rwnd) ? peer_rwnd - payload : 0;
+
+	/* Has been accepted for transmission. */
+	if (!asoc->peer.prsctp_capable)
+		chunk->msg->can_abandon = 0;
+
+	sctp_chunk_assign_tsn(chunk);
+	sctp_chunk_assign_ssn(chunk);
+}
+
+/* Check whether a chunk of @chunk_len bytes still fits into the packet.
+ *
+ * The limit is the association's path MTU when the transport has an
+ * association, and the transport's own path MTU otherwise.
+ *
+ * Returns SCTP_XMIT_OK when the chunk fits, or when it does not fit but
+ * IP-level fragmentation is acceptable (packet->ipfragok is set in that
+ * case); returns SCTP_XMIT_PMTU_FULL when the packet has to be sent
+ * before this chunk can be added.
+ */
+static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet,
+					struct sctp_chunk *chunk,
+					u16 chunk_len)
+{
+	struct sctp_association *asoc = packet->transport->asoc;
+	size_t used = packet->size;
+	size_t limit;
+
+	if (asoc)
+		limit = asoc->pathmtu;
+	else
+		limit = packet->transport->pathmtu;
+
+	if (used + chunk_len <= limit)
+		return SCTP_XMIT_OK;
+
+	/* Too big.  Fragmenting at the IP level is fine if any one of the
+	 * following is true:
+	 *	1. The packet is empty (the chunk alone exceeds the MTU)
+	 *	2. The chunk we are adding is a control chunk
+	 *	3. The packet doesn't have any data in it yet and the data
+	 *	   requires authentication.
+	 * There is no re-fragmentation any more; in those cases the IP
+	 * layer is left to fragment.  Anything else has to wait for the
+	 * next packet.
+	 */
+	if (!sctp_packet_empty(packet) && sctp_chunk_is_data(chunk) &&
+	    (packet->has_data || !chunk->auth))
+		return SCTP_XMIT_PMTU_FULL;
+
+	packet->ipfragok = 1;
+	return SCTP_XMIT_OK;
+}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
new file mode 100644
index 00000000..1f2938fb
--- /dev/null
+++ b/net/sctp/outqueue.c
@@ -0,0 +1,1900 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions implement the sctp_outq class.   The outqueue handles
+ * bundling and queueing of outgoing SCTP chunks.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Perry Melange         <pmelange@null.cc.uic.edu>
+ *    Xingang Guo           <xingang.guo@intel.com>
+ *    Hui Huang 	    <hui.huang@nokia.com>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/list.h>   /* For struct list_head */
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <net/sock.h>	  /* For skb_set_owner_w */
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Declare internal functions here.  */
+static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
+static void sctp_check_transmitted(struct sctp_outq *q,
+				   struct list_head *transmitted_queue,
+				   struct sctp_transport *transport,
+				   struct sctp_sackhdr *sack,
+				   __u32 *highest_new_tsn);
+
+static void sctp_mark_missing(struct sctp_outq *q,
+			      struct list_head *transmitted_queue,
+			      struct sctp_transport *transport,
+			      __u32 highest_new_tsn,
+			      int count_of_newacks);
+
+static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
+
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout);
+
+/* Add 'ch' at the front of the data queue and add its skb length to out_qlen. */
+static inline void sctp_outq_head_data(struct sctp_outq *q,
+					struct sctp_chunk *ch)
+{
+	q->out_qlen += ch->skb->len;
+	list_add(&ch->list, &q->out_chunk_list);
+}
+
+/* Detach and return the first chunk on the pending-data list, deducting its
+ * skb length from out_qlen; returns NULL when the list is empty.
+ */
+static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
+{
+	struct sctp_chunk *first;
+
+	if (list_empty(&q->out_chunk_list))
+		return NULL;
+	first = list_entry(q->out_chunk_list.next, struct sctp_chunk, list);
+	list_del_init(&first->list);
+	q->out_qlen -= first->skb->len;
+	return first;
+}
+/* Append 'ch' to the data queue and add its skb length to out_qlen. */
+static inline void sctp_outq_tail_data(struct sctp_outq *q,
+				       struct sctp_chunk *ch)
+{
+	q->out_qlen += ch->skb->len;
+	list_add_tail(&ch->list, &q->out_chunk_list);
+}
+
+/*
+ * SFR-CACC algorithm:
+ * D) If count_of_newacks is greater than or equal to 2
+ * and t was not sent to the current primary then the
+ * sender MUST NOT increment missing report count for t.
+ *
+ * Returns 1 when the increment must be skipped, 0 otherwise.
+ */
+static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary,
+				       struct sctp_transport *transport,
+				       int count_of_newacks)
+{
+	return transport != primary && count_of_newacks >= 2;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * F) If count_of_newacks is less than 2, let d be the
+ * destination to which t was sent. If cacc_saw_newack
+ * is 0 for destination d, then the sender MUST NOT
+ * increment missing report count for t.
+ * Returns 1 to skip the increment; a NULL transport is never skipped.
+ */
+static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport,
+				       int count_of_newacks)
+{
+	if (count_of_newacks >= 2 || !transport)
+		return 0;
+	return !transport->cacc.cacc_saw_newack;
+}
+
+/*
+ * SFR-CACC algorithm:
+ * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD
+ * execute steps C, D, F.
+ *
+ * C has been implemented in sctp_outq_sack
+ *
+ * Returns 1 if step D or F forbids the increment, 0 otherwise.
+ */
+static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary,
+				     struct sctp_transport *transport,
+				     int count_of_newacks)
+{
+	/* Steps D and F apply only while CYCLING_CHANGEOVER is 0. */
+	if (primary->cacc.cycling_changeover)
+		return 0;
+
+	return sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks) ||
+	       sctp_cacc_skip_3_1_f(transport, count_of_newacks);
+}
+
+/*
+ * SFR-CACC algorithm:
+ * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less
+ * than next_tsn_at_change of the current primary, then
+ * the sender MUST NOT increment missing report count
+ * for t.
+ *
+ * Returns 1 when the increment must be skipped, 0 otherwise.
+ */
+static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn)
+{
+	return primary->cacc.cycling_changeover &&
+	       TSN_lt(tsn, primary->cacc.next_tsn_at_change);
+}
+
+/*
+ * SFR-CACC algorithm:
+ * 3) If the missing report count for TSN t is to be
+ * incremented according to [RFC2960] and
+ * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set,
+ * then the sender MUST further execute steps 3.1 and
+ * 3.2 to determine if the missing report count for
+ * TSN t SHOULD NOT be incremented.
+ *
+ * 3.3) If 3.1 and 3.2 do not dictate that the missing
+ * report count for t should not be incremented, then
+ * the sender SHOULD increment missing report count for
+ * t (according to [RFC2960] and [SCTP_STEWART_2002]).
+ * Returns 1 when the increment is to be skipped, 0 otherwise.
+ */
+static inline int sctp_cacc_skip(struct sctp_transport *primary,
+				 struct sctp_transport *transport,
+				 int count_of_newacks,
+				 __u32 tsn)
+{
+	if (!primary->cacc.changeover_active)
+		return 0;
+	return sctp_cacc_skip_3_1(primary, transport, count_of_newacks) ||
+	       sctp_cacc_skip_3_2(primary, tsn);
+}
+
+/* Initialize an existing sctp_outq: bind it to 'asoc', empty every chunk
+ * list and reset all counters and flags, including q->error, which
+ * sctp_outq_teardown() reads when failing chunks.
+ */
+void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
+{
+	q->asoc = asoc;
+	INIT_LIST_HEAD(&q->out_chunk_list);
+	INIT_LIST_HEAD(&q->control_chunk_list);
+	INIT_LIST_HEAD(&q->retransmit);
+	INIT_LIST_HEAD(&q->sacked);
+	INIT_LIST_HEAD(&q->abandoned);
+
+	q->fast_rtx = 0;
+	q->outstanding_bytes = 0;
+	q->empty = 1;
+	q->cork  = 0;
+	q->malloced = 0;
+	q->out_qlen = 0;
+	q->error = 0;
+}
+
+/* Discard every chunk still held by the outqueue.
+ *
+ * Data chunks on the per-transport transmitted lists and on the sacked,
+ * retransmit, abandoned and pending-data lists are failed with q->error
+ * and then freed; control chunks are only freed.  q->error is reset to 0
+ * once the data chunks are gone.  The sctp_outq structure itself is not
+ * freed here.
+ */
+void sctp_outq_teardown(struct sctp_outq *q)
+{
+	struct sctp_transport *transport;
+	struct sctp_chunk *chunk, *tmp;
+	struct list_head *lchunk;
+
+	/* Throw away unacknowledged chunks. */
+	list_for_each_entry(transport, &q->asoc->peer.transport_addr_list,
+			transports) {
+		while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) {
+			chunk = list_entry(lchunk, struct sctp_chunk,
+					   transmitted_list);
+			/* Mark as part of a failed message. */
+			sctp_chunk_fail(chunk, q->error);
+			sctp_chunk_free(chunk);
+		}
+	}
+
+	/* Throw away chunks that have been gap ACKed.  */
+	list_for_each_entry_safe(chunk, tmp, &q->sacked, transmitted_list) {
+		list_del_init(&chunk->transmitted_list);
+		sctp_chunk_fail(chunk, q->error);
+		sctp_chunk_free(chunk);
+	}
+
+	/* Throw away any chunks in the retransmit queue. */
+	list_for_each_entry_safe(chunk, tmp, &q->retransmit, transmitted_list) {
+		list_del_init(&chunk->transmitted_list);
+		sctp_chunk_fail(chunk, q->error);
+		sctp_chunk_free(chunk);
+	}
+
+	/* Throw away any chunks that are in the abandoned queue. */
+	list_for_each_entry_safe(chunk, tmp, &q->abandoned, transmitted_list) {
+		list_del_init(&chunk->transmitted_list);
+		sctp_chunk_fail(chunk, q->error);
+		sctp_chunk_free(chunk);
+	}
+
+	/* Throw away any leftover data chunks. */
+	while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+		/* Mark as send failure. */
+		sctp_chunk_fail(chunk, q->error);
+		sctp_chunk_free(chunk);
+	}
+
+	/* All data chunks above were failed with q->error; clear it now. */
+	q->error = 0;
+
+	/* Throw away any leftover control chunks. */
+	list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+		list_del_init(&chunk->list);
+		sctp_chunk_free(chunk);
+	}
+}
+
+/* Release an outqueue: drop all pending chunks, then free the structure
+ * itself if (and only if) it is flagged as kmalloc()'d.
+ */
+void sctp_outq_free(struct sctp_outq *q)
+{
+	sctp_outq_teardown(q);
+	if (!q->malloced)
+		return;
+	kfree(q);
+}
+
+/* Put a new chunk in an sctp_outq.
+ *
+ * A DATA chunk is appended to the data queue, unless the association
+ * state no longer accepts new data, in which case -ESHUTDOWN is returned
+ * and nothing is queued.  Any other chunk goes on the control chunk list.
+ * Unless the queue is corked it is then flushed, and the result of that
+ * flush is the return value.
+ */
+int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
+{
+	SCTP_DEBUG_PRINTK("sctp_outq_tail(%p, %p[%s])\n",
+			  q, chunk, chunk && chunk->chunk_hdr ?
+			  sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type))
+			  : "Illegal Chunk");
+
+	/* If it is data, queue it up, otherwise, send it immediately. */
+	if (!sctp_chunk_is_data(chunk)) {
+		/* Control chunks are queued whatever the association state. */
+		list_add_tail(&chunk->list, &q->control_chunk_list);
+		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+	} else {
+		/* Is it OK to queue data chunks?  */
+		/* From 9. Termination of Association
+		 *
+		 * When either endpoint performs a shutdown, the
+		 * association on each peer will stop accepting new
+		 * data from its user and only deliver data in queue
+		 * at the time of sending or receiving the SHUTDOWN
+		 * chunk.
+		 */
+		switch (q->asoc->state) {
+		case SCTP_STATE_CLOSED:
+		case SCTP_STATE_SHUTDOWN_PENDING:
+		case SCTP_STATE_SHUTDOWN_SENT:
+		case SCTP_STATE_SHUTDOWN_RECEIVED:
+		case SCTP_STATE_SHUTDOWN_ACK_SENT:
+			/* Cannot send after transport endpoint shutdown */
+			return -ESHUTDOWN;
+		default:
+			break;
+		}
+
+		SCTP_DEBUG_PRINTK("outqueueing (%p, %p[%s])\n",
+		  q, chunk, chunk && chunk->chunk_hdr ?
+		  sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type))
+		  : "Illegal Chunk");
+
+		sctp_outq_tail_data(q, chunk);
+		if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+			SCTP_INC_STATS(SCTP_MIB_OUTUNORDERCHUNKS);
+		else
+			SCTP_INC_STATS(SCTP_MIB_OUTORDERCHUNKS);
+		q->empty = 0;
+	}
+
+	/* A corked queue is not flushed here; report success. */
+	if (q->cork)
+		return 0;
+	return sctp_outq_flush(q, 0);
+}
+
+/* Insert a chunk into the sorted list based on the TSNs.  The retransmit list
+ * and the abandoned list are in ascending order.  'new', the chunk's
+ * transmitted_list node, is linked in front of the first entry with a
+ * larger TSN, or at the tail if there is none.
+ */
+static void sctp_insert_list(struct list_head *head, struct list_head *new)
+{
+	struct sctp_chunk *cur;
+	struct list_head *pos;
+	__u32 ntsn;
+
+	cur = list_entry(new, struct sctp_chunk, transmitted_list);
+	ntsn = ntohl(cur->subh.data_hdr->tsn);
+
+	list_for_each(pos, head) {
+		cur = list_entry(pos, struct sctp_chunk, transmitted_list);
+		if (TSN_lt(ntsn, ntohl(cur->subh.data_hdr->tsn)))
+			break;
+	}
+
+	/* If the walk ran to completion 'pos' is 'head' itself, so linking
+	 * 'new' just before 'pos' covers the append case as well.
+	 */
+	list_add_tail(new, pos);
+}
+
+/* Mark all the eligible packets on a transport for retransmission.  Abandoned
+ * chunks found on transport->transmitted go to q->abandoned; chunks selected
+ * by 'reason' go to q->retransmit with the byte accounting rolled back.
+ */
+void sctp_retransmit_mark(struct sctp_outq *q,
+			  struct sctp_transport *transport,
+			  __u8 reason)
+{
+	struct sctp_chunk *chunk, *next;
+	int eligible;
+
+	/* Walk through the specified transmitted queue.  */
+	list_for_each_entry_safe(chunk, next, &transport->transmitted,
+				 transmitted_list) {
+		/* If the chunk is abandoned, move it to abandoned list. */
+		if (sctp_chunk_abandoned(chunk)) {
+			list_del_init(&chunk->transmitted_list);
+			sctp_insert_list(&q->abandoned,
+					 &chunk->transmitted_list);
+
+			/* If this chunk has not been previously acked, stop
+			 * considering it 'outstanding'.  Our peer will most
+			 * likely never see it since it will not be retransmitted
+			 */
+			if (!chunk->tsn_gap_acked) {
+				if (chunk->transport)
+					chunk->transport->flight_size -=
+							sctp_data_size(chunk);
+				q->outstanding_bytes -= sctp_data_size(chunk);
+				q->asoc->peer.rwnd += sctp_data_size(chunk);
+			}
+			continue;
+		}
+
+		/* A fast retransmit only takes the chunks flagged for it.  For
+		 * a timeout or pmtu discovery, only the chunks that are not
+		 * yet acked should be added to the retransmit queue.
+		 */
+		if (reason == SCTP_RTXR_FAST_RTX)
+			eligible = (chunk->fast_retransmit == SCTP_NEED_FRTX);
+		else
+			eligible = !chunk->tsn_gap_acked;
+		if (!eligible)
+			continue;
+
+		/* RFC 2960 6.2.1 Processing a Received SACK
+		 *
+		 * C) Any time a DATA chunk is marked for retransmission (via
+		 * either T3-rtx timer expiration (Section 6.3.3) or via fast
+		 * retransmit (Section 7.2.4)), add the data size of those
+		 * chunks to the rwnd.
+		 */
+		q->asoc->peer.rwnd += sctp_data_size(chunk);
+		q->outstanding_bytes -= sctp_data_size(chunk);
+		if (chunk->transport)
+			transport->flight_size -= sctp_data_size(chunk);
+
+		/* sctpimpguide-05 Section 2.8.2
+		 * M5) If a T3-rtx timer expires, the 'TSN.Missing.Report'
+		 * of all affected TSNs is set to 0.
+		 */
+		chunk->tsn_missing_report = 0;
+
+		/* If a chunk that is being used for RTT measurement has to
+		 * be retransmitted, we cannot use this chunk anymore for RTT
+		 * measurements. Reset rto_pending so that a new RTT
+		 * measurement is started when a new data chunk is sent.
+		 */
+		if (chunk->rtt_in_progress) {
+			chunk->rtt_in_progress = 0;
+			transport->rto_pending = 0;
+		}
+
+		/* Move the chunk to the retransmit queue. The chunks
+		 * on the retransmit queue are always kept in order.
+		 */
+		list_del_init(&chunk->transmitted_list);
+		sctp_insert_list(&q->retransmit, &chunk->transmitted_list);
+	}
+
+	SCTP_DEBUG_PRINTK("%s: transport: %p, reason: %d, "
+			  "cwnd: %d, ssthresh: %d, flight_size: %d, "
+			  "pba: %d\n", __func__,
+			  transport, reason,
+			  transport->cwnd, transport->ssthresh,
+			  transport->flight_size,
+			  transport->partial_bytes_acked);
+}
+
+/* Mark all the eligible packets on a transport for retransmission and force
+ * one packet out.  A flush error is stored, negated, in the socket's sk_err.
+ */
+void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
+		     sctp_retransmit_reason_t reason)
+{
+	struct sctp_association *asoc = transport->asoc;
+	int error;
+
+	switch (reason) {
+	case SCTP_RTXR_T3_RTX:
+		SCTP_INC_STATS(SCTP_MIB_T3_RETRANSMITS);
+		sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX);
+		/* Update the retran path if the T3-rtx timer has expired for
+		 * the current retran path.
+		 */
+		if (transport == asoc->peer.retran_path)
+			sctp_assoc_update_retran_path(asoc);
+		asoc->rtx_data_chunks += asoc->unack_data;
+		break;
+	case SCTP_RTXR_FAST_RTX:
+		SCTP_INC_STATS(SCTP_MIB_FAST_RETRANSMITS);
+		sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX);
+		q->fast_rtx = 1;
+		break;
+	case SCTP_RTXR_PMTUD:
+		SCTP_INC_STATS(SCTP_MIB_PMTUD_RETRANSMITS);
+		break;
+	case SCTP_RTXR_T1_RTX:
+		SCTP_INC_STATS(SCTP_MIB_T1_RETRANSMITS);
+		asoc->init_retries++;
+		break;
+	default:
+		BUG();
+	}
+
+	sctp_retransmit_mark(q, transport, reason);
+
+	/* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination,
+	 * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by
+	 * following the procedures outlined in C1 - C5.
+	 */
+	if (reason == SCTP_RTXR_T3_RTX)
+		sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point);
+
+	/* Flush the queues only on timeout, since fast_rtx is only triggered
+	 * during sack processing and the queue will be flushed at the end.
+	 */
+	if (reason == SCTP_RTXR_FAST_RTX)
+		return;
+
+	error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+	if (error)
+		q->asoc->base.sk->sk_err = -error;
+}
+
+/*
+ * Transmit DATA chunks on the retransmit queue.  Upon return from
+ * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
+ * need to be transmitted by the caller.  pkt->transport must be set.
+ *
+ * *start_timer is set to 1 once a chunk has been processed with no error
+ * recorded.  Returns 0 or the last recorded sctp_packet_transmit() result.
+ */
+static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
+			       int rtx_timeout, int *start_timer)
+{
+	struct list_head *lqueue;
+	struct sctp_transport *transport = pkt->transport;
+	sctp_xmit_t status;
+	struct sctp_chunk *chunk, *chunk1;
+	int fast_rtx;
+	int error = 0;
+	int timer = 0;
+	int done = 0;
+
+	lqueue = &q->retransmit;
+	fast_rtx = q->fast_rtx;
+
+	/* This loop handles time-out retransmissions, fast retransmissions,
+	 * and retransmissions due to opening of window.
+	 *
+	 * RFC 2960 6.3.3 Handle T3-rtx Expiration
+	 *
+	 * E3) Determine how many of the earliest (i.e., lowest TSN)
+	 * outstanding DATA chunks for the address for which the
+	 * T3-rtx has expired will fit into a single packet, subject
+	 * to the MTU constraint for the path corresponding to the
+	 * destination transport address to which the retransmission
+	 * is being sent (this may be different from the address for
+	 * which the timer expires [see Section 6.4]). Call this value
+	 * K. Bundle and retransmit those K DATA chunks in a single
+	 * packet to the destination endpoint.
+	 *
+	 * [Just to be painfully clear, if we are retransmitting
+	 * because a timeout just happened, we should send only ONE
+	 * packet of retransmitted data.]
+	 *
+	 * For fast retransmissions we also send only ONE packet.  However,
+	 * if we are just flushing the queue due to open window, we'll
+	 * try to send as much as possible.
+	 */
+	list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) {
+		/* If the chunk is abandoned, move it to abandoned list. */
+		if (sctp_chunk_abandoned(chunk)) {
+			list_del_init(&chunk->transmitted_list);
+			sctp_insert_list(&q->abandoned,
+					 &chunk->transmitted_list);
+			continue;
+		}
+
+		/* Make sure that Gap Acked TSNs are not retransmitted.  A
+		 * simple approach is just to move such TSNs out of the
+		 * way and into a 'transmitted' queue and skip to the
+		 * next chunk.
+		 */
+		if (chunk->tsn_gap_acked) {
+			list_del(&chunk->transmitted_list);
+			list_add_tail(&chunk->transmitted_list,
+					&transport->transmitted);
+			continue;
+		}
+
+		/* If we are doing fast retransmit, ignore non-fast_retransmit
+		 * chunks
+		 */
+		if (fast_rtx && !chunk->fast_retransmit)
+			continue;
+
+redo:
+		/* Attempt to append this chunk to the packet. */
+		status = sctp_packet_append_chunk(pkt, chunk);
+
+		switch (status) {
+		case SCTP_XMIT_PMTU_FULL:
+			if (!pkt->has_data && !pkt->has_cookie_echo) {
+				/* If this packet did not contain DATA then
+				 * retransmission did not happen, so do it
+				 * again.  We'll ignore the error here since
+				 * control chunks are already freed so there
+				 * is nothing we can do.
+				 */
+				sctp_packet_transmit(pkt);
+				goto redo;
+			}
+
+			/* Send this packet.  */
+			error = sctp_packet_transmit(pkt);
+
+			/* If we are retransmitting, we should only
+			 * send a single packet.
+			 * Otherwise, try appending this chunk again.
+			 */
+			if (rtx_timeout || fast_rtx)
+				done = 1;
+			else
+				goto redo;
+
+			/* Bundle next chunk in the next round.  */
+			break;
+
+		case SCTP_XMIT_RWND_FULL:
+			/* Send this packet. */
+			error = sctp_packet_transmit(pkt);
+
+			/* Stop sending DATA as there is no more room
+			 * at the receiver.
+			 */
+			done = 1;
+			break;
+
+		case SCTP_XMIT_NAGLE_DELAY:
+			/* Send this packet. */
+			error = sctp_packet_transmit(pkt);
+
+			/* Stop sending DATA because of nagle delay. */
+			done = 1;
+			break;
+
+		default:
+			/* The append was successful, so add this chunk to
+			 * the transmitted list.
+			 */
+			list_del(&chunk->transmitted_list);
+			list_add_tail(&chunk->transmitted_list,
+					&transport->transmitted);
+
+			/* Mark the chunk as ineligible for fast retransmit
+			 * after it is retransmitted.
+			 */
+			if (chunk->fast_retransmit == SCTP_NEED_FRTX)
+				chunk->fast_retransmit = SCTP_DONT_FRTX;
+
+			q->empty = 0;
+			break;
+		}
+
+		/* Set the timer if there were no errors */
+		if (!error && !timer)
+			timer = 1;
+
+		if (done)
+			break;
+	}
+
+	/* If we are here due to a retransmit timeout or a fast
+	 * retransmit and if there are any chunks left in the retransmit
+	 * queue that could not fit in the PMTU sized packet, they need
+	 * to be marked as ineligible for a subsequent fast retransmit.
+	 */
+	if (rtx_timeout || fast_rtx) {
+		list_for_each_entry(chunk1, lqueue, transmitted_list) {
+			if (chunk1->fast_retransmit == SCTP_NEED_FRTX)
+				chunk1->fast_retransmit = SCTP_DONT_FRTX;
+		}
+	}
+
+	*start_timer = timer;
+
+	/* Clear fast retransmit hint */
+	if (fast_rtx)
+		q->fast_rtx = 0;
+
+	return error;
+}
+
+/* Uncork the outqueue: clear the cork flag if it is set, then flush the
+ * queue and return the result of that flush.
+ */
+int sctp_outq_uncork(struct sctp_outq *q)
+{
+	if (q->cork)
+		q->cork = 0;
+	return sctp_outq_flush(q, 0);
+}
+
+
+/*
+ * Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.  'rtx_timeout' is handed to sctp_outq_flush_rtx().
+ * Returns the most recently recorded transmit result, 0 if there was none.
+ * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made.  Today we use the sock lock to protect it.
+ */
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
+{
+	struct sctp_packet *packet;
+	struct sctp_packet singleton;
+	struct sctp_association *asoc = q->asoc;
+	__u16 sport = asoc->base.bind_addr.port;
+	__u16 dport = asoc->peer.port;
+	__u32 vtag = asoc->peer.i.init_tag;
+	struct sctp_transport *transport = NULL;
+	struct sctp_transport *new_transport;
+	struct sctp_chunk *chunk, *tmp;
+	sctp_xmit_t status;
+	int error = 0;
+	int start_timer = 0;
+	int one_packet = 0;
+
+	/* These transports have chunks to send. */
+	struct list_head transport_list;
+	struct list_head *ltransport;
+
+	INIT_LIST_HEAD(&transport_list);
+	packet = NULL;
+
+	/*
+	 * 6.10 Bundling
+	 *   ...
+	 *   When bundling control chunks with DATA chunks, an
+	 *   endpoint MUST place control chunks first in the outbound
+	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
+	 *   within a SCTP packet in increasing order of TSN.
+	 *   ...
+	 */
+
+	list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+		list_del_init(&chunk->list);
+
+		/* Pick the right transport to use. */
+		new_transport = chunk->transport;
+
+		if (!new_transport) {
+			/*
+			 * If we have a prior transport pointer, see if
+			 * the destination address of the chunk
+			 * matches the destination address of the
+			 * current transport.  If not a match, then
+			 * try to look up the transport with a given
+			 * destination address.  We do this because
+			 * after processing ASCONFs, we may have new
+			 * transports created.
+			 */
+			if (transport &&
+			    sctp_cmp_addr_exact(&chunk->dest,
+						&transport->ipaddr))
+					new_transport = transport;
+			else
+				new_transport = sctp_assoc_lookup_paddr(asoc,
+								&chunk->dest);
+
+			/* if we still don't have a new transport, then
+			 * use the current active path.
+			 */
+			if (!new_transport)
+				new_transport = asoc->peer.active_path;
+		} else if ((new_transport->state == SCTP_INACTIVE) ||
+			   (new_transport->state == SCTP_UNCONFIRMED)) {
+			/* If the chunk is Heartbeat or Heartbeat Ack,
+			 * send it to chunk->transport, even if it's
+			 * inactive.
+			 *
+			 * 3.3.6 Heartbeat Acknowledgement:
+			 * ...
+			 * A HEARTBEAT ACK is always sent to the source IP
+			 * address of the IP datagram containing the
+			 * HEARTBEAT chunk to which this ack is responding.
+			 * ...
+			 *
+			 * ASCONF_ACKs also must be sent to the source.
+			 */
+			if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT &&
+			    chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK &&
+			    chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK)
+				new_transport = asoc->peer.active_path;
+		}
+
+		/* Are we switching transports?
+		 * Take care of transport locks.
+		 */
+		if (new_transport != transport) {
+			transport = new_transport;
+			if (list_empty(&transport->send_ready)) {
+				list_add_tail(&transport->send_ready,
+					      &transport_list);
+			}
+			packet = &transport->packet;
+			sctp_packet_config(packet, vtag,
+					   asoc->peer.ecn_capable);
+		}
+
+		switch (chunk->chunk_hdr->type) {
+		/*
+		 * 6.10 Bundling
+		 *   ...
+		 *   An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN
+		 *   COMPLETE with any other chunks.  [Send them immediately.]
+		 */
+		case SCTP_CID_INIT:
+		case SCTP_CID_INIT_ACK:
+		case SCTP_CID_SHUTDOWN_COMPLETE:
+			sctp_packet_init(&singleton, transport, sport, dport);
+			sctp_packet_config(&singleton, vtag, 0);
+			sctp_packet_append_chunk(&singleton, chunk);
+			error = sctp_packet_transmit(&singleton);
+			/* Must unlink send_ready from the on-stack list. */
+			if (error < 0)
+				goto sctp_flush_out;
+			break;
+
+		case SCTP_CID_ABORT:
+			if (sctp_test_T_bit(chunk)) {
+				packet->vtag = asoc->c.my_vtag;
+			}
+		/* The following chunks are "response" chunks, i.e.
+		 * they are generated in response to something we
+		 * received.  If we are sending these, then we can
+		 * send only 1 packet containing these chunks.
+		 */
+		case SCTP_CID_HEARTBEAT_ACK:
+		case SCTP_CID_SHUTDOWN_ACK:
+		case SCTP_CID_COOKIE_ACK:
+		case SCTP_CID_COOKIE_ECHO:
+		case SCTP_CID_ERROR:
+		case SCTP_CID_ECN_CWR:
+		case SCTP_CID_ASCONF_ACK:
+			one_packet = 1;
+			/* Fall through */
+
+		case SCTP_CID_SACK:
+		case SCTP_CID_HEARTBEAT:
+		case SCTP_CID_SHUTDOWN:
+		case SCTP_CID_ECN_ECNE:
+		case SCTP_CID_ASCONF:
+		case SCTP_CID_FWD_TSN:
+			status = sctp_packet_transmit_chunk(packet, chunk,
+							    one_packet);
+			if (status  != SCTP_XMIT_OK) {
+				/* put the chunk back */
+				list_add(&chunk->list, &q->control_chunk_list);
+			} else if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+				/* PR-SCTP C5) If a FORWARD TSN is sent, the
+				 * sender MUST assure that at least one T3-rtx
+				 * timer is running.
+				 */
+				sctp_transport_reset_timers(transport);
+			}
+			break;
+
+		default:
+			/* We built a chunk with an illegal type! */
+			BUG();
+		}
+	}
+
+	/* Is it OK to send data chunks?  */
+	switch (asoc->state) {
+	case SCTP_STATE_COOKIE_ECHOED:
+		/* Only allow bundling when this packet has a COOKIE-ECHO
+		 * chunk.
+		 */
+		if (!packet || !packet->has_cookie_echo)
+			break;
+
+		/* fallthru */
+	case SCTP_STATE_ESTABLISHED:
+	case SCTP_STATE_SHUTDOWN_PENDING:
+	case SCTP_STATE_SHUTDOWN_RECEIVED:
+		/*
+		 * RFC 2960 6.1  Transmission of DATA Chunks
+		 *
+		 * C) When the time comes for the sender to transmit,
+		 * before sending new DATA chunks, the sender MUST
+		 * first transmit any outstanding DATA chunks which
+		 * are marked for retransmission (limited by the
+		 * current cwnd).
+		 */
+		if (!list_empty(&q->retransmit)) {
+			if (transport == asoc->peer.retran_path)
+				goto retran;
+
+			/* Switch transports & prepare the packet.  */
+
+			transport = asoc->peer.retran_path;
+
+			if (list_empty(&transport->send_ready)) {
+				list_add_tail(&transport->send_ready,
+					      &transport_list);
+			}
+
+			packet = &transport->packet;
+			sctp_packet_config(packet, vtag,
+					   asoc->peer.ecn_capable);
+		retran:
+			error = sctp_outq_flush_rtx(q, packet,
+						    rtx_timeout, &start_timer);
+
+			if (start_timer)
+				sctp_transport_reset_timers(transport);
+
+			/* This can happen on COOKIE-ECHO resend.  Only
+			 * one chunk can get bundled with a COOKIE-ECHO.
+			 */
+			if (packet->has_cookie_echo)
+				goto sctp_flush_out;
+
+			/* Don't send new data if there is still data
+			 * waiting to retransmit.
+			 */
+			if (!list_empty(&q->retransmit))
+				goto sctp_flush_out;
+		}
+
+		/* Apply Max.Burst limitation to the current transport in
+		 * case it will be used for new data.  We are going to
+		 * reset it before we return, but we want to apply the limit
+		 * to the currently queued data.
+		 */
+		if (transport)
+			sctp_transport_burst_limited(transport);
+
+		/* Finally, transmit new packets.  */
+		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+			/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
+			 * stream identifier.
+			 */
+			if (chunk->sinfo.sinfo_stream >=
+			    asoc->c.sinit_num_ostreams) {
+
+				/* Mark as failed send. */
+				sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
+				sctp_chunk_free(chunk);
+				continue;
+			}
+
+			/* Has this chunk expired? */
+			if (sctp_chunk_abandoned(chunk)) {
+				sctp_chunk_fail(chunk, 0);
+				sctp_chunk_free(chunk);
+				continue;
+			}
+
+			/* If there is a specified transport, use it.
+			 * Otherwise, we want to use the active path.
+			 */
+			new_transport = chunk->transport;
+			if (!new_transport ||
+			    ((new_transport->state == SCTP_INACTIVE) ||
+			     (new_transport->state == SCTP_UNCONFIRMED)))
+				new_transport = asoc->peer.active_path;
+
+			/* Change packets if necessary.  */
+			if (new_transport != transport) {
+				transport = new_transport;
+
+				/* Schedule to have this transport's
+				 * packet flushed.
+				 */
+				if (list_empty(&transport->send_ready)) {
+					list_add_tail(&transport->send_ready,
+						      &transport_list);
+				}
+
+				packet = &transport->packet;
+				sctp_packet_config(packet, vtag,
+						   asoc->peer.ecn_capable);
+				/* We've switched transports, so apply the
+				 * Burst limit to the new transport.
+				 */
+				sctp_transport_burst_limited(transport);
+			}
+
+			SCTP_DEBUG_PRINTK("sctp_outq_flush(%p, %p[%s]), ",
+					  q, chunk,
+					  chunk && chunk->chunk_hdr ?
+					  sctp_cname(SCTP_ST_CHUNK(
+						  chunk->chunk_hdr->type))
+					  : "Illegal Chunk");
+
+			SCTP_DEBUG_PRINTK("TX TSN 0x%x skb->head "
+					"%p skb->users %d.\n",
+					ntohl(chunk->subh.data_hdr->tsn),
+					chunk->skb ?chunk->skb->head : NULL,
+					chunk->skb ?
+					atomic_read(&chunk->skb->users) : -1);
+
+			/* Add the chunk to the packet.  */
+			status = sctp_packet_transmit_chunk(packet, chunk, 0);
+
+			switch (status) {
+			case SCTP_XMIT_PMTU_FULL:
+			case SCTP_XMIT_RWND_FULL:
+			case SCTP_XMIT_NAGLE_DELAY:
+				/* We could not append this chunk, so put
+				 * the chunk back on the output queue.
+				 */
+				SCTP_DEBUG_PRINTK("sctp_outq_flush: could "
+					"not transmit TSN: 0x%x, status: %d\n",
+					ntohl(chunk->subh.data_hdr->tsn),
+					status);
+				sctp_outq_head_data(q, chunk);
+				goto sctp_flush_out;
+
+			case SCTP_XMIT_OK:
+				/* The sender is in the SHUTDOWN-PENDING state,
+				 * The sender MAY set the I-bit in the DATA
+				 * chunk header.
+				 */
+				if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+					chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+
+				break;
+
+			default:
+				BUG();
+			}
+
+			/* BUG: We assume that the sctp_packet_transmit()
+			 * call below will succeed all the time and add the
+			 * chunk to the transmitted list and restart the
+			 * timers.
+			 * It is possible that the call can fail under OOM
+			 * conditions.
+			 *
+			 * Is this really a problem?  Won't this behave
+			 * like a lost TSN?
+			 */
+			list_add_tail(&chunk->transmitted_list,
+				      &transport->transmitted);
+
+			sctp_transport_reset_timers(transport);
+
+			q->empty = 0;
+
+			/* Only let one DATA chunk get bundled with a
+			 * COOKIE-ECHO chunk.
+			 */
+			if (packet->has_cookie_echo)
+				goto sctp_flush_out;
+		}
+		break;
+
+	default:
+		/* Do nothing.  */
+		break;
+	}
+
+sctp_flush_out:
+
+	/* Before returning, examine all the transports touched in
+	 * this call.  Right now, we bluntly force clear all the
+	 * transports.  Things might change after we implement Nagle.
+	 * But such an examination is still required.
+	 *
+	 * --xguo
+	 */
+	while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL ) {
+		struct sctp_transport *t = list_entry(ltransport,
+						      struct sctp_transport,
+						      send_ready);
+		packet = &t->packet;
+		if (!sctp_packet_empty(packet))
+			error = sctp_packet_transmit(packet);
+
+		/* Clear the burst limited state, if any */
+		sctp_transport_burst_reset(t);
+	}
+
+	return error;
+}
+
+/* Recompute assoc->unack_data from an incoming SACK.
+ *
+ * The starting value is the count of TSNs between the association's
+ * cumulative ack point and next_tsn (exclusive on both ends); the span
+ * of every Gap Ack Block listed in the SACK is then subtracted from it.
+ * The arithmetic is deliberately carried out in a 16-bit counter.
+ */
+static void sctp_sack_update_unack_data(struct sctp_association *assoc,
+					struct sctp_sackhdr *sack)
+{
+	sctp_sack_variable_t *block = sack->variable;
+	__u16 remaining = assoc->next_tsn - assoc->ctsn_ack_point - 1;
+	int idx = 0;
+
+	while (idx < ntohs(sack->num_gap_ack_blocks)) {
+		__u16 first = ntohs(block[idx].gab.start);
+		__u16 last = ntohs(block[idx].gab.end);
+
+		/* Both ends of a block are inclusive, hence the +1. */
+		remaining -= last - first + 1;
+		idx++;
+	}
+
+	assoc->unack_data = remaining;
+}
+
+/* This is where we REALLY process a SACK.
+ *
+ * Process the SACK against the outqueue.  Mostly, this just frees
+ * things off the transmitted queue.
+ *
+ * @q:    the outqueue whose retransmit, per-transport transmitted,
+ *        sacked and abandoned lists are examined.
+ * @sack: the SACK header being processed; its multi-byte fields are
+ *        converted with ntohl()/ntohs() before use.
+ *
+ * Steps, in order: SFR-CACC flag maintenance, update of
+ * asoc->highest_sacked, crediting of acked chunks on the retransmit
+ * list and on every transport's transmitted list, advance of
+ * asoc->ctsn_ack_point, missing-report marking (only when the SACK
+ * carries gap ack blocks), unack_data update, release of chunks on
+ * q->sacked that are covered by the cumulative ack, peer rwnd update
+ * and FORWARD TSN generation.
+ *
+ * Returns the new value of q->empty: non-zero only when
+ * q->out_chunk_list, q->retransmit and every transport's transmitted
+ * list are all empty.
+ */
+int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
+{
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_transport *transport;
+	struct sctp_chunk *tchunk = NULL;
+	struct list_head *lchunk, *transport_list, *temp;
+	sctp_sack_variable_t *frags = sack->variable;
+	__u32 sack_ctsn, ctsn, tsn;
+	__u32 highest_tsn, highest_new_tsn;
+	__u32 sack_a_rwnd;
+	unsigned outstanding;
+	struct sctp_transport *primary = asoc->peer.primary_path;
+	int count_of_newacks = 0;
+	int gap_ack_blocks;
+	u8 accum_moved = 0;
+
+	/* Grab the association's destination address list. */
+	transport_list = &asoc->peer.transport_addr_list;
+
+	sack_ctsn = ntohl(sack->cum_tsn_ack);
+	gap_ack_blocks = ntohs(sack->num_gap_ack_blocks);
+	/*
+	 * SFR-CACC algorithm:
+	 * On receipt of a SACK the sender SHOULD execute the
+	 * following statements.
+	 *
+	 * 1) If the cumulative ack in the SACK passes next tsn_at_change
+	 * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be
+	 * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for
+	 * all destinations.
+	 * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE
+	 * is set the receiver of the SACK MUST take the following actions:
+	 *
+	 * A) Initialize the cacc_saw_newack to 0 for all destination
+	 * addresses.
+	 *
+	 * Only bother if changeover_active is set. Otherwise, this is
+	 * totally suboptimal to do on every SACK.
+	 */
+	if (primary->cacc.changeover_active) {
+		u8 clear_cycling = 0;
+
+		if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) {
+			primary->cacc.changeover_active = 0;
+			clear_cycling = 1;
+		}
+
+		/* One pass over the transports handles both step 1 and
+		 * step 2A, whichever of them applies.
+		 */
+		if (clear_cycling || gap_ack_blocks) {
+			list_for_each_entry(transport, transport_list,
+					transports) {
+				if (clear_cycling)
+					transport->cacc.cycling_changeover = 0;
+				if (gap_ack_blocks)
+					transport->cacc.cacc_saw_newack = 0;
+			}
+		}
+	}
+
+	/* Get the highest TSN in the sack: the cumulative ack plus the
+	 * end offset of the last gap ack block, if there is one.
+	 */
+	highest_tsn = sack_ctsn;
+	if (gap_ack_blocks)
+		highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
+
+	if (TSN_lt(asoc->highest_sacked, highest_tsn))
+		asoc->highest_sacked = highest_tsn;
+
+	/* Seed for the "highest newly acked TSN" accumulator that the
+	 * sctp_check_transmitted() calls below update through a pointer.
+	 */
+	highest_new_tsn = sack_ctsn;
+
+	/* Run through the retransmit queue.  Credit bytes received
+	 * and free those chunks that we can.
+	 */
+	sctp_check_transmitted(q, &q->retransmit, NULL, sack, &highest_new_tsn);
+
+	/* Run through the transmitted queue.
+	 * Credit bytes received and free those chunks which we can.
+	 *
+	 * This is a MASSIVE candidate for optimization.
+	 */
+	list_for_each_entry(transport, transport_list, transports) {
+		sctp_check_transmitted(q, &transport->transmitted,
+				       transport, sack, &highest_new_tsn);
+		/*
+		 * SFR-CACC algorithm:
+		 * C) Let count_of_newacks be the number of
+		 * destinations for which cacc_saw_newack is set.
+		 */
+		if (transport->cacc.cacc_saw_newack)
+			count_of_newacks ++;
+	}
+
+	/* Move the Cumulative TSN Ack Point if appropriate.  */
+	if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) {
+		asoc->ctsn_ack_point = sack_ctsn;
+		accum_moved = 1;
+	}
+
+	if (gap_ack_blocks) {
+
+		/* While in fast recovery, a SACK that moved the cumulative
+		 * ack point uses the highest TSN in the SACK as the
+		 * threshold for missing reports.
+		 */
+		if (asoc->fast_recovery && accum_moved)
+			highest_new_tsn = highest_tsn;
+
+		list_for_each_entry(transport, transport_list, transports)
+			sctp_mark_missing(q, &transport->transmitted, transport,
+					  highest_new_tsn, count_of_newacks);
+	}
+
+	/* Update unack_data field in the assoc. */
+	sctp_sack_update_unack_data(asoc, sack);
+
+	ctsn = asoc->ctsn_ack_point;
+
+	/* Throw away stuff rotting on the sack queue.  The _safe iterator
+	 * is needed because entries are unlinked and freed while walking.
+	 */
+	list_for_each_safe(lchunk, temp, &q->sacked) {
+		tchunk = list_entry(lchunk, struct sctp_chunk,
+				    transmitted_list);
+		tsn = ntohl(tchunk->subh.data_hdr->tsn);
+		if (TSN_lte(tsn, ctsn)) {
+			list_del_init(&tchunk->transmitted_list);
+			sctp_chunk_free(tchunk);
+		}
+	}
+
+	/* ii) Set rwnd equal to the newly received a_rwnd minus the
+	 *     number of bytes still outstanding after processing the
+	 *     Cumulative TSN Ack and the Gap Ack Blocks.
+	 */
+
+	sack_a_rwnd = ntohl(sack->a_rwnd);
+	outstanding = q->outstanding_bytes;
+
+	/* Clamp at zero instead of letting the unsigned subtraction wrap. */
+	if (outstanding < sack_a_rwnd)
+		sack_a_rwnd -= outstanding;
+	else
+		sack_a_rwnd = 0;
+
+	asoc->peer.rwnd = sack_a_rwnd;
+
+	sctp_generate_fwdtsn(q, sack_ctsn);
+
+	SCTP_DEBUG_PRINTK("%s: sack Cumulative TSN Ack is 0x%x.\n",
+			  __func__, sack_ctsn);
+	SCTP_DEBUG_PRINTK("%s: Cumulative TSN Ack of association, "
+			  "%p is 0x%x. Adv peer ack point: 0x%x\n",
+			  __func__, asoc, ctsn, asoc->adv_peer_ack_point);
+
+	/* See if all chunks are acked.
+	 * Make sure the empty queue handler will get run later.
+	 */
+	q->empty = (list_empty(&q->out_chunk_list) &&
+		    list_empty(&q->retransmit));
+	if (!q->empty)
+		goto finish;
+
+	/* Any transport with chunks still on its transmitted list makes
+	 * the queue non-empty; stop at the first one found.
+	 */
+	list_for_each_entry(transport, transport_list, transports) {
+		q->empty = q->empty && list_empty(&transport->transmitted);
+		if (!q->empty)
+			goto finish;
+	}
+
+	SCTP_DEBUG_PRINTK("sack queue is empty.\n");
+finish:
+	return q->empty;
+}
+
+/* Report whether the outqueue is empty.
+ *
+ * No list is inspected here; the answer is whatever is currently stored
+ * in q->empty.
+ */
+int sctp_outq_is_empty(const struct sctp_outq *q)
+{
+	int is_empty = q->empty;
+
+	return is_empty;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Go through a transport's transmitted list or the association's retransmit
+ * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked.
+ * The retransmit list will not have an associated transport.
+ *
+ * @q:                       outqueue owning the sacked/abandoned lists and
+ *                           the outstanding_bytes counter.
+ * @transmitted_queue:       list to examine; it is drained chunk by chunk
+ *                           and the chunks that stay are spliced back.
+ * @transport:               transport owning @transmitted_queue, or NULL
+ *                           when the retransmit list is being processed.
+ * @sack:                    the SACK header being applied.
+ * @highest_new_tsn_in_sack: in/out accumulator for the highest TSN newly
+ *                           acknowledged by this SACK.  The caller passes
+ *                           the same accumulator for every list it scans,
+ *                           so the value is only ever raised here.
+ *
+ * Abandoned chunks go to q->abandoned, chunks covered by the cumulative
+ * ack go to q->sacked, everything else is kept.  When @transport is set,
+ * its RTO, error counters, cwnd, flight size and T3-rtx timer are updated
+ * according to what was acknowledged.
+ *
+ * I added coherent debug information output.	--xguo
+ *
+ * Instead of printing 'sacked' or 'kept' for each TSN on the
+ * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5.
+ * KEPT TSN6-TSN7, etc.
+ */
+static void sctp_check_transmitted(struct sctp_outq *q,
+				   struct list_head *transmitted_queue,
+				   struct sctp_transport *transport,
+				   struct sctp_sackhdr *sack,
+				   __u32 *highest_new_tsn_in_sack)
+{
+	struct list_head *lchunk;
+	struct sctp_chunk *tchunk;
+	struct list_head tlist;
+	__u32 tsn;
+	__u32 sack_ctsn;
+	__u32 rtt;
+	__u8 restart_timer = 0;
+	int bytes_acked = 0;
+	int migrate_bytes = 0;
+
+	/* These state variables are for coherent debug output. --xguo */
+
+#if SCTP_DEBUG
+	__u32 dbg_ack_tsn = 0;	/* An ACKed TSN range starts here... */
+	__u32 dbg_last_ack_tsn = 0;  /* ...and finishes here.	     */
+	__u32 dbg_kept_tsn = 0;	/* An un-ACKed range starts here...  */
+	__u32 dbg_last_kept_tsn = 0; /* ...and finishes here.	     */
+
+	/* 0 : The last TSN was ACKed.
+	 * 1 : The last TSN was NOT ACKed (i.e. KEPT).
+	 * -1: We need to initialize.
+	 */
+	int dbg_prt_state = -1;
+#endif /* SCTP_DEBUG */
+
+	sack_ctsn = ntohl(sack->cum_tsn_ack);
+
+	/* Chunks that remain outstanding are parked here and spliced back
+	 * onto transmitted_queue at the end.
+	 */
+	INIT_LIST_HEAD(&tlist);
+
+	/* The while loop will skip empty transmitted queues. */
+	while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) {
+		tchunk = list_entry(lchunk, struct sctp_chunk,
+				    transmitted_list);
+
+		if (sctp_chunk_abandoned(tchunk)) {
+			/* Move the chunk to abandoned list. */
+			sctp_insert_list(&q->abandoned, lchunk);
+
+			/* If this chunk has not been acked, stop
+			 * considering it as 'outstanding'.
+			 */
+			if (!tchunk->tsn_gap_acked) {
+				if (tchunk->transport)
+					tchunk->transport->flight_size -=
+							sctp_data_size(tchunk);
+				q->outstanding_bytes -= sctp_data_size(tchunk);
+			}
+			continue;
+		}
+
+		tsn = ntohl(tchunk->subh.data_hdr->tsn);
+		if (sctp_acked(sack, tsn)) {
+			/* If this queue is the retransmit queue, the
+			 * retransmit timer has already reclaimed
+			 * the outstanding bytes for this chunk, so only
+			 * count bytes associated with a transport.
+			 */
+			if (transport) {
+				/* If this chunk is being used for RTT
+				 * measurement, calculate the RTT and update
+				 * the RTO using this value.
+				 *
+				 * 6.3.1 C5) Karn's algorithm: RTT measurements
+				 * MUST NOT be made using packets that were
+				 * retransmitted (and thus for which it is
+				 * ambiguous whether the reply was for the
+				 * first instance of the packet or a later
+				 * instance).
+				 */
+				if (!tchunk->tsn_gap_acked &&
+				    tchunk->rtt_in_progress) {
+					tchunk->rtt_in_progress = 0;
+					rtt = jiffies - tchunk->sent_at;
+					sctp_transport_update_rto(transport,
+								  rtt);
+				}
+			}
+
+			/* If the chunk hasn't been marked as ACKED,
+			 * mark it and account bytes_acked if the
+			 * chunk had a valid transport (it will not
+			 * have a transport if ASCONF had deleted it
+			 * while DATA was outstanding).
+			 */
+			if (!tchunk->tsn_gap_acked) {
+				tchunk->tsn_gap_acked = 1;
+				/* The accumulator is shared by every list
+				 * the caller scans for this SACK, and a
+				 * later list may hold lower TSNs than an
+				 * earlier one, so only ever raise it.
+				 */
+				if (TSN_lt(*highest_new_tsn_in_sack, tsn))
+					*highest_new_tsn_in_sack = tsn;
+				bytes_acked += sctp_data_size(tchunk);
+				if (!tchunk->transport)
+					migrate_bytes += sctp_data_size(tchunk);
+			}
+
+			if (TSN_lte(tsn, sack_ctsn)) {
+				/* RFC 2960  6.3.2 Retransmission Timer Rules
+				 *
+				 * R3) Whenever a SACK is received
+				 * that acknowledges the DATA chunk
+				 * with the earliest outstanding TSN
+				 * for that address, restart T3-rtx
+				 * timer for that address with its
+				 * current RTO.
+				 */
+				restart_timer = 1;
+
+				/* NOTE(review): tsn_gap_acked was set to 1
+				 * just above whenever it was clear, so this
+				 * condition cannot be true at this point and
+				 * cacc_saw_newack is never set from here.
+				 * Left unchanged pending confirmation of the
+				 * intended SFR-CACC behaviour.
+				 */
+				if (!tchunk->tsn_gap_acked) {
+					/*
+					 * SFR-CACC algorithm:
+					 * 2) If the SACK contains gap acks
+					 * and the flag CHANGEOVER_ACTIVE is
+					 * set the receiver of the SACK MUST
+					 * take the following action:
+					 *
+					 * B) For each TSN t being acked that
+					 * has not been acked in any SACK so
+					 * far, set cacc_saw_newack to 1 for
+					 * the destination that the TSN was
+					 * sent to.
+					 */
+					if (transport &&
+					    sack->num_gap_ack_blocks &&
+					    q->asoc->peer.primary_path->cacc.
+					    changeover_active)
+						transport->cacc.cacc_saw_newack
+							= 1;
+				}
+
+				list_add_tail(&tchunk->transmitted_list,
+					      &q->sacked);
+			} else {
+				/* RFC2960 7.2.4, sctpimpguide-05 2.8.2
+				 * M2) Each time a SACK arrives reporting
+				 * 'Stray DATA chunk(s)' record the highest TSN
+				 * reported as newly acknowledged, call this
+				 * value 'HighestTSNinSack'. A newly
+				 * acknowledged DATA chunk is one not
+				 * previously acknowledged in a SACK.
+				 *
+				 * When the SCTP sender of data receives a SACK
+				 * chunk that acknowledges, for the first time,
+				 * the receipt of a DATA chunk, all the still
+				 * unacknowledged DATA chunks whose TSN is
+				 * older than that newly acknowledged DATA
+				 * chunk, are qualified as 'Stray DATA chunks'.
+				 */
+				list_add_tail(lchunk, &tlist);
+			}
+
+#if SCTP_DEBUG
+			switch (dbg_prt_state) {
+			case 0:	/* last TSN was ACKed */
+				if (dbg_last_ack_tsn + 1 == tsn) {
+					/* This TSN belongs to the
+					 * current ACK range.
+					 */
+					break;
+				}
+
+				if (dbg_last_ack_tsn != dbg_ack_tsn) {
+					/* Display the end of the
+					 * current range.
+					 */
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_ack_tsn);
+				}
+
+				/* Start a new range.  */
+				SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
+				dbg_ack_tsn = tsn;
+				break;
+
+			case 1:	/* The last TSN was NOT ACKed. */
+				if (dbg_last_kept_tsn != dbg_kept_tsn) {
+					/* Display the end of current range. */
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_kept_tsn);
+				}
+
+				SCTP_DEBUG_PRINTK_CONT("\n");
+
+				/* FALL THROUGH... */
+			default:
+				/* This is the first-ever TSN we examined.  */
+				/* Start a new range of ACK-ed TSNs.  */
+				SCTP_DEBUG_PRINTK("ACKed: %08x", tsn);
+				dbg_prt_state = 0;
+				dbg_ack_tsn = tsn;
+			}
+
+			dbg_last_ack_tsn = tsn;
+#endif /* SCTP_DEBUG */
+
+		} else {
+			if (tchunk->tsn_gap_acked) {
+				SCTP_DEBUG_PRINTK("%s: Receiver reneged on "
+						  "data TSN: 0x%x\n",
+						  __func__,
+						  tsn);
+				tchunk->tsn_gap_acked = 0;
+
+				if (tchunk->transport)
+					bytes_acked -= sctp_data_size(tchunk);
+
+				/* RFC 2960 6.3.2 Retransmission Timer Rules
+				 *
+				 * R4) Whenever a SACK is received missing a
+				 * TSN that was previously acknowledged via a
+				 * Gap Ack Block, start T3-rtx for the
+				 * destination address to which the DATA
+				 * chunk was originally
+				 * transmitted if it is not already running.
+				 */
+				restart_timer = 1;
+			}
+
+			list_add_tail(lchunk, &tlist);
+
+#if SCTP_DEBUG
+			/* See the above comments on ACK-ed TSNs. */
+			switch (dbg_prt_state) {
+			case 1:
+				if (dbg_last_kept_tsn + 1 == tsn)
+					break;
+
+				if (dbg_last_kept_tsn != dbg_kept_tsn)
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_kept_tsn);
+
+				SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
+				dbg_kept_tsn = tsn;
+				break;
+
+			case 0:
+				if (dbg_last_ack_tsn != dbg_ack_tsn)
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_ack_tsn);
+				SCTP_DEBUG_PRINTK_CONT("\n");
+
+				/* FALL THROUGH... */
+			default:
+				SCTP_DEBUG_PRINTK("KEPT: %08x",tsn);
+				dbg_prt_state = 1;
+				dbg_kept_tsn = tsn;
+			}
+
+			dbg_last_kept_tsn = tsn;
+#endif /* SCTP_DEBUG */
+		}
+	}
+
+#if SCTP_DEBUG
+	/* Finish off the last range, displaying its ending TSN.  */
+	switch (dbg_prt_state) {
+	case 0:
+		if (dbg_last_ack_tsn != dbg_ack_tsn) {
+			SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn);
+		} else {
+			SCTP_DEBUG_PRINTK_CONT("\n");
+		}
+		break;
+
+	case 1:
+		if (dbg_last_kept_tsn != dbg_kept_tsn) {
+			SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn);
+		} else {
+			SCTP_DEBUG_PRINTK_CONT("\n");
+		}
+	}
+#endif /* SCTP_DEBUG */
+	if (transport) {
+		if (bytes_acked) {
+			struct sctp_association *asoc = transport->asoc;
+
+			/* We may have counted DATA that was migrated
+			 * to this transport due to DEL-IP operation.
+			 * Subtract those bytes, since they were never
+			 * sent on this transport and shouldn't be
+			 * credited to this transport.
+			 */
+			bytes_acked -= migrate_bytes;
+
+			/* 8.2. When an outstanding TSN is acknowledged,
+			 * the endpoint shall clear the error counter of
+			 * the destination transport address to which the
+			 * DATA chunk was last sent.
+			 * The association's overall error counter is
+			 * also cleared.
+			 */
+			transport->error_count = 0;
+			transport->asoc->overall_error_count = 0;
+
+			/*
+			 * While in SHUTDOWN PENDING, we may have started
+			 * the T5 shutdown guard timer after reaching the
+			 * retransmission limit. Stop that timer as soon
+			 * as the receiver acknowledged any data.
+			 */
+			if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING &&
+			    del_timer(&asoc->timers
+				[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]))
+					sctp_association_put(asoc);
+
+			/* Mark the destination transport address as
+			 * active if it is not so marked.
+			 */
+			if ((transport->state == SCTP_INACTIVE) ||
+			    (transport->state == SCTP_UNCONFIRMED)) {
+				sctp_assoc_control_transport(
+					transport->asoc,
+					transport,
+					SCTP_TRANSPORT_UP,
+					SCTP_RECEIVED_SACK);
+			}
+
+			sctp_transport_raise_cwnd(transport, sack_ctsn,
+						  bytes_acked);
+
+			transport->flight_size -= bytes_acked;
+			if (transport->flight_size == 0)
+				transport->partial_bytes_acked = 0;
+			/* Migrated bytes were removed from bytes_acked
+			 * above but are still no longer outstanding.
+			 */
+			q->outstanding_bytes -= bytes_acked + migrate_bytes;
+		} else {
+			/* RFC 2960 6.1, sctpimpguide-06 2.15.2
+			 * When a sender is doing zero window probing, it
+			 * should not timeout the association if it continues
+			 * to receive new packets from the receiver. The
+			 * reason is that the receiver MAY keep its window
+			 * closed for an indefinite time.
+			 * A sender is doing zero window probing when the
+			 * receiver's advertised window is zero, and there is
+			 * only one data chunk in flight to the receiver.
+			 *
+			 * Allow the association to timeout while in SHUTDOWN
+			 * PENDING or SHUTDOWN RECEIVED in case the receiver
+			 * stays in zero window mode forever.
+			 */
+			if (!q->asoc->peer.rwnd &&
+			    !list_empty(&tlist) &&
+			    (sack_ctsn+2 == q->asoc->next_tsn) &&
+			    q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) {
+				SCTP_DEBUG_PRINTK("%s: SACK received for zero "
+						  "window probe: %u\n",
+						  __func__, sack_ctsn);
+				q->asoc->overall_error_count = 0;
+				transport->error_count = 0;
+			}
+		}
+
+		/* RFC 2960 6.3.2 Retransmission Timer Rules
+		 *
+		 * R2) Whenever all outstanding data sent to an address have
+		 * been acknowledged, turn off the T3-rtx timer of that
+		 * address.
+		 */
+		if (!transport->flight_size) {
+			/* A successfully deleted timer drops the transport
+			 * reference that was taken when it was armed.
+			 */
+			if (timer_pending(&transport->T3_rtx_timer) &&
+			    del_timer(&transport->T3_rtx_timer)) {
+				sctp_transport_put(transport);
+			}
+		} else if (restart_timer) {
+			/* mod_timer() returns 0 when the timer was not
+			 * pending, in which case a new reference is taken.
+			 */
+			if (!mod_timer(&transport->T3_rtx_timer,
+				       jiffies + transport->rto))
+				sctp_transport_hold(transport);
+		}
+	}
+
+	/* Put the chunks that are still outstanding back on the queue. */
+	list_splice(&tlist, transmitted_queue);
+}
+
+/* Count missing reports for the chunks on one transmitted list and kick
+ * off fast retransmit when any chunk reaches the threshold.
+ *
+ * @q:                       outqueue being processed.
+ * @transmitted_queue:       list of chunks to examine.
+ * @transport:               transport owning the list, may be NULL.
+ * @highest_new_tsn_in_sack: only chunks with a TSN below this value are
+ *                           eligible for a missing report.
+ * @count_of_newacks:        handed through to sctp_cacc_skip().
+ */
+static void sctp_mark_missing(struct sctp_outq *q,
+			      struct list_head *transmitted_queue,
+			      struct sctp_transport *transport,
+			      __u32 highest_new_tsn_in_sack,
+			      int count_of_newacks)
+{
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_transport *primary = asoc->peer.primary_path;
+	struct sctp_chunk *cur;
+	char need_frtx = 0;
+	__u32 cur_tsn;
+
+	list_for_each_entry(cur, transmitted_queue, transmitted_list) {
+		int eligible;
+
+		cur_tsn = ntohl(cur->subh.data_hdr->tsn);
+
+		/* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3): an
+		 * unacknowledged TSN below 'HighestTSNinSack' gets its
+		 * 'TSN.Missing.Report' count bumped, unless the chunk was
+		 * already fast retransmitted or is marked for it.
+		 */
+		eligible = cur->fast_retransmit == SCTP_CAN_FRTX &&
+			   !cur->tsn_gap_acked &&
+			   TSN_lt(cur_tsn, highest_new_tsn_in_sack);
+
+		/* SFR-CACC may veto the report; it is consulted only when
+		 * the list belongs to a transport.
+		 */
+		if (eligible &&
+		    (!transport ||
+		     !sctp_cacc_skip(primary, cur->transport,
+				     count_of_newacks, cur_tsn))) {
+			cur->tsn_missing_report++;
+
+			SCTP_DEBUG_PRINTK(
+				"%s: TSN 0x%x missing counter: %d\n",
+				__func__, cur_tsn,
+				cur->tsn_missing_report);
+		}
+
+		/* M4) A chunk reported missing three or more times is
+		 * marked for retransmission and triggers the fast
+		 * retransmit procedure.
+		 */
+		if (cur->tsn_missing_report >= 3) {
+			cur->fast_retransmit = SCTP_NEED_FRTX;
+			need_frtx = 1;
+		}
+	}
+
+	/* Nothing further to do for a list without a transport. */
+	if (!transport)
+		return;
+
+	if (need_frtx)
+		sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX);
+
+	SCTP_DEBUG_PRINTK("%s: transport: %p, cwnd: %d, "
+			  "ssthresh: %d, flight_size: %d, pba: %d\n",
+			  __func__, transport, transport->cwnd,
+			  transport->ssthresh, transport->flight_size,
+			  transport->partial_bytes_acked);
+}
+
+/* Is the given TSN acked by this packet?
+ *
+ * @sack: SACK header (fields in network byte order).
+ * @tsn:  TSN to test, in host byte order.
+ *
+ * Returns 1 when @tsn is covered either by the Cumulative TSN Ack or by
+ * one of the Gap Ack Blocks, 0 otherwise.
+ */
+static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
+{
+	int i;
+	sctp_sack_variable_t *frags;
+	__u32 gap;
+	__u16 blocks;
+	__u32 ctsn = ntohl(sack->cum_tsn_ack);
+
+	if (TSN_lte(tsn, ctsn))
+		goto pass;
+
+	/* 3.3.4 Selective Acknowledgement (SACK) (3):
+	 *
+	 * Gap Ack Blocks:
+	 *  These fields contain the Gap Ack Blocks. They are repeated
+	 *  for each Gap Ack Block up to the number of Gap Ack Blocks
+	 *  defined in the Number of Gap Ack Blocks field. All DATA
+	 *  chunks with TSNs greater than or equal to (Cumulative TSN
+	 *  Ack + Gap Ack Block Start) and less than or equal to
+	 *  (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack
+	 *  Block are assumed to have been received correctly.
+	 */
+
+	frags = sack->variable;
+	blocks = ntohs(sack->num_gap_ack_blocks);
+
+	/* Keep the full 32-bit distance from the cumulative ack.  Storing
+	 * it in 16 bits would let a TSN more than 65535 ahead wrap around
+	 * and alias into a Gap Ack Block that does not cover it.  The
+	 * distance is a plain offset (tsn is known to be ahead of ctsn at
+	 * this point), so it is compared directly against the 16-bit
+	 * block bounds.
+	 */
+	gap = tsn - ctsn;
+	for (i = 0; i < blocks; ++i) {
+		__u32 start = ntohs(frags[i].gab.start);
+		__u32 end = ntohs(frags[i].gab.end);
+
+		if (start <= gap && gap <= end)
+			goto pass;
+	}
+
+	return 0;
+pass:
+	return 1;
+}
+
+/* Find the entry for 'stream' among the first nskips entries of skiplist.
+ *
+ * Returns the index of the first entry whose stream field matches, or
+ * the index at which the scan stopped when there is no match (nskips
+ * for a non-negative count), i.e. the slot a new entry would occupy.
+ */
+static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist,
+				    int nskips, __be16 stream)
+{
+	int pos = 0;
+
+	while (pos < nskips && skiplist[pos].stream != stream)
+		pos++;
+
+	return pos;
+}
+
+/* Create and add a fwdtsn chunk to the outq's control queue if needed.
+ *
+ * @q:    outqueue whose abandoned list is examined.
+ * @ctsn: Cumulative TSN Ack carried in the SACK being processed
+ *        (host byte order).
+ *
+ * Does nothing unless the peer is PR-SCTP capable.  Otherwise it
+ * advances asoc->adv_peer_ack_point (steps C1/C2), frees abandoned
+ * chunks already covered by @ctsn, collects up to 10 per-stream skip
+ * entries and, if the ack point ended up ahead of @ctsn, queues a
+ * FORWARD TSN chunk on q->control_chunk_list (steps C3/C4).
+ */
+static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
+{
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_chunk *ftsn_chunk = NULL;
+	struct sctp_fwdtsn_skip ftsn_skip_arr[10];
+	int nskips = 0;
+	int skip_pos = 0;
+	__u32 tsn;
+	struct sctp_chunk *chunk;
+	struct list_head *lchunk, *temp;
+
+	if (!asoc->peer.prsctp_capable)
+		return;
+
+	/* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the
+	 * received SACK.
+	 *
+	 * If (Advanced.Peer.Ack.Point < SackCumAck), then update
+	 * Advanced.Peer.Ack.Point to be equal to SackCumAck.
+	 */
+	if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
+		asoc->adv_peer_ack_point = ctsn;
+
+	/* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point"
+	 * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as
+	 * the chunk next in the out-queue space is marked as "abandoned" as
+	 * shown in the following example:
+	 *
+	 * Assuming that a SACK arrived with the Cumulative TSN ACK 102
+	 * and the Advanced.Peer.Ack.Point is updated to this value:
+	 *
+	 *   out-queue at the end of  ==>   out-queue after Adv.Ack.Point
+	 *   normal SACK processing           local advancement
+	 *                ...                           ...
+	 *   Adv.Ack.Pt-> 102 acked                     102 acked
+	 *                103 abandoned                 103 abandoned
+	 *                104 abandoned     Adv.Ack.P-> 104 abandoned
+	 *                105                           105
+	 *                106 acked                     106 acked
+	 *                ...                           ...
+	 *
+	 * In this example, the data sender successfully advanced the
+	 * "Advanced.Peer.Ack.Point" from 102 to 104 locally.
+	 */
+	list_for_each_safe(lchunk, temp, &q->abandoned) {
+		chunk = list_entry(lchunk, struct sctp_chunk,
+					transmitted_list);
+		tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+		/* Remove any chunks in the abandoned queue that are acked by
+		 * the ctsn.
+		 */
+		if (TSN_lte(tsn, ctsn)) {
+			list_del_init(lchunk);
+			sctp_chunk_free(chunk);
+		} else {
+			if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) {
+				asoc->adv_peer_ack_point = tsn;
+				/* Unordered chunks carry no stream sequence
+				 * to report, so no skip entry is recorded.
+				 */
+				if (chunk->chunk_hdr->flags &
+					 SCTP_DATA_UNORDERED)
+					continue;
+				/* Reuse the slot of a stream that is already
+				 * listed (keeping only its latest ssn), or
+				 * append a new slot at index nskips.
+				 */
+				skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0],
+						nskips,
+						chunk->subh.data_hdr->stream);
+				ftsn_skip_arr[skip_pos].stream =
+					chunk->subh.data_hdr->stream;
+				ftsn_skip_arr[skip_pos].ssn =
+					 chunk->subh.data_hdr->ssn;
+				if (skip_pos == nskips)
+					nskips++;
+				/* The skip array is full; stop advancing. */
+				if (nskips == 10)
+					break;
+			} else
+				break;
+		}
+	}
+
+	/* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point"
+	 * is greater than the Cumulative TSN ACK carried in the received
+	 * SACK, the data sender MUST send the data receiver a FORWARD TSN
+	 * chunk containing the latest value of the
+	 * "Advanced.Peer.Ack.Point".
+	 *
+	 * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD
+	 * list each stream and sequence number in the forwarded TSN. This
+	 * information will enable the receiver to easily find any
+	 * stranded TSN's waiting on stream reorder queues. Each stream
+	 * SHOULD only be reported once; this means that if multiple
+	 * abandoned messages occur in the same stream then only the
+	 * highest abandoned stream sequence number is reported. If the
+	 * total size of the FORWARD TSN does NOT fit in a single MTU then
+	 * the sender of the FORWARD TSN SHOULD lower the
+	 * Advanced.Peer.Ack.Point to the last TSN that will fit in a
+	 * single MTU.
+	 *
+	 * TSNs wrap around, so "greater than" has to be evaluated with
+	 * serial number arithmetic; a plain integer comparison would
+	 * suppress the FORWARD TSN when the ack point has advanced past
+	 * the 32-bit wrap while ctsn has not.
+	 */
+	if (TSN_lt(ctsn, asoc->adv_peer_ack_point))
+		ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point,
+					      nskips, &ftsn_skip_arr[0]);
+
+	/* sctp_make_fwdtsn() may return NULL; nothing is queued then. */
+	if (ftsn_chunk) {
+		list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
+		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+	}
+}
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
new file mode 100644
index 00000000..534c7eae
--- /dev/null
+++ b/net/sctp/primitive.c
@@ -0,0 +1,220 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions implement the SCTP primitive functions from Section 10.
+ *
+ * Note that the descriptions from the specification are USER level
+ * functions--this file is the functions which populate the struct proto
+ * for SCTP which is the BOTTOM of the sockets interface.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Narasimha Budihal     <narasimha@refcode.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *    Kevin Gao             <kevin.gao@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <linux/list.h> /* For struct list_head */
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/time.h> /* For struct timeval */
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* DECLARE_PRIMITIVE(name) expands to the complete definition (not just a
+ * declaration) of
+ *
+ *	int sctp_primitive_<name>(struct sctp_association *asoc, void *arg)
+ *
+ * The generated function builds a SCTP_EVENT_T_PRIMITIVE event whose
+ * subtype is SCTP_PRIMITIVE_<name> and feeds it to sctp_do_sm() with
+ * GFP_KERNEL.  When asoc is NULL the state passed is SCTP_STATE_CLOSED
+ * and the endpoint is NULL; otherwise both are taken from asoc.  The
+ * return value is whatever sctp_do_sm() returns.
+ */
+#define DECLARE_PRIMITIVE(name) \
+/* This is called in the code as sctp_primitive_ ## name.  */ \
+int sctp_primitive_ ## name(struct sctp_association *asoc, \
+			    void *arg) { \
+	int error = 0; \
+	sctp_event_t event_type; sctp_subtype_t subtype; \
+	sctp_state_t state; \
+	struct sctp_endpoint *ep; \
+	\
+	event_type = SCTP_EVENT_T_PRIMITIVE; \
+	subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \
+	state = asoc ? asoc->state : SCTP_STATE_CLOSED; \
+	ep = asoc ? asoc->ep : NULL; \
+	\
+	error = sctp_do_sm(event_type, subtype, state, ep, asoc, \
+			   arg, GFP_KERNEL); \
+	return error; \
+}
+
+/* 10.1 ULP-to-SCTP
+ * B) Associate
+ *
+ * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
+ *         outbound stream count)
+ * -> association id [,destination transport addr list] [,outbound stream
+ *    count]
+ *
+ * This primitive allows the upper layer to initiate an association to a
+ * specific peer endpoint.
+ *
+ * This version assumes that asoc is fully populated with the initial
+ * parameters.  We then return a traditional kernel indicator of
+ * success or failure.
+ */
+
+/* This is called in the code as sctp_primitive_ASSOCIATE.  */
+
+DECLARE_PRIMITIVE(ASSOCIATE)
+
+/* 10.1 ULP-to-SCTP
+ * C) Shutdown
+ *
+ * Format: SHUTDOWN(association id)
+ * -> result
+ *
+ * Gracefully closes an association. Any locally queued user data
+ * will be delivered to the peer. The association will be terminated only
+ * after the peer acknowledges all the SCTP packets sent.  A success code
+ * will be returned on successful termination of the association. If
+ * attempting to terminate the association results in a failure, an error
+ * code shall be returned.
+ */
+
+DECLARE_PRIMITIVE(SHUTDOWN);
+
+/* 10.1 ULP-to-SCTP
+ * D) Abort
+ *
+ * Format: Abort(association id [, cause code])
+ * -> result
+ *
+ * Ungracefully closes an association. Any locally queued user data
+ * will be discarded and an ABORT chunk is sent to the peer. A success
+ * code will be returned on successful abortion of the association. If
+ * attempting to abort the association results in a failure, an error
+ * code shall be returned.
+ */
+
+DECLARE_PRIMITIVE(ABORT);
+
+/* 10.1 ULP-to-SCTP
+ * E) Send
+ *
+ * Format: SEND(association id, buffer address, byte count [,context]
+ *         [,stream id] [,life time] [,destination transport address]
+ *         [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
+ * -> result
+ *
+ * This is the main method to send user data via SCTP.
+ *
+ * Mandatory attributes:
+ *
+ *  o association id - local handle to the SCTP association
+ *
+ *  o buffer address - the location where the user message to be
+ *    transmitted is stored;
+ *
+ *  o byte count - The size of the user data in number of bytes;
+ *
+ * Optional attributes:
+ *
+ *  o context - an optional 32 bit integer that will be carried in the
+ *    sending failure notification to the ULP if the transportation of
+ *    this User Message fails.
+ *
+ *  o stream id - to indicate which stream to send the data on. If not
+ *    specified, stream 0 will be used.
+ *
+ *  o life time - specifies the life time of the user data. The user data
+ *    will not be sent by SCTP after the life time expires. This
+ *    parameter can be used to avoid efforts to transmit stale
+ *    user messages. SCTP notifies the ULP if the data cannot be
+ *    initiated to transport (i.e. sent to the destination via SCTP's
+ *    send primitive) within the life time variable. However, the
+ *    user data will be transmitted if SCTP has attempted to transmit a
+ *    chunk before the life time expired.
+ *
+ *  o destination transport address - specified as one of the destination
+ *    transport addresses of the peer endpoint to which this packet
+ *    should be sent. Whenever possible, SCTP should use this destination
+ *    transport address for sending the packets, instead of the current
+ *    primary path.
+ *
+ *  o unorder flag - this flag, if present, indicates that the user
+ *    would like the data delivered in an unordered fashion to the peer
+ *    (i.e., the U flag is set to 1 on all DATA chunks carrying this
+ *    message).
+ *
+ *  o no-bundle flag - instructs SCTP not to bundle this user data with
+ *    other outbound DATA chunks. SCTP MAY still bundle even when
+ *    this flag is present, when faced with network congestion.
+ *
+ *  o payload protocol-id - A 32 bit unsigned integer that is to be
+ *    passed to the peer indicating the type of payload protocol data
+ *    being transmitted. This value is passed as opaque data by SCTP.
+ */
+
+DECLARE_PRIMITIVE(SEND);
+
+/* 10.1 ULP-to-SCTP
+ * J) Request Heartbeat
+ *
+ * Format: REQUESTHEARTBEAT(association id, destination transport address)
+ *
+ * -> result
+ *
+ * Instructs the local endpoint to perform a HeartBeat on the specified
+ * destination transport address of the given association. The returned
+ * result should indicate whether the transmission of the HEARTBEAT
+ * chunk to the destination address is successful.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * o destination transport address - the transport address of the
+ *   association on which a heartbeat should be issued.
+ */
+
+DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
+
+/* ADDIP
+* 3.1.1 Address Configuration Change Chunk (ASCONF)
+*
+* This chunk is used to communicate to the remote endpoint one of the
+* configuration change requests that MUST be acknowledged.  The
+* information carried in the ASCONF Chunk uses the form of a
+* Type-Length-Value (TLV), as described in "3.2.1 Optional/
+* Variable-length Parameter Format" in RFC2960 [5], for all variable
+* parameters.
+*/
+
+DECLARE_PRIMITIVE(ASCONF);
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
new file mode 100644
index 00000000..bc6cd75c
--- /dev/null
+++ b/net/sctp/probe.c
@@ -0,0 +1,217 @@
+/*
+ * sctp_probe - Observe the SCTP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * Modified for SCTP from Stephen Hemminger's code
+ * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/time.h>
+#include <net/net_namespace.h>
+
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>");
+MODULE_DESCRIPTION("SCTP snooper");
+MODULE_LICENSE("GPL");
+
+static int port __read_mostly = 0;
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+static int bufsize __read_mostly = 64 * 1024;
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+
+static int full __read_mostly = 1;
+MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
+module_param(full, int, 0);
+
+static const char procname[] = "sctpprobe";
+
+static struct {
+	struct kfifo	  fifo;		/* log text: filled by printl(), drained by read() */
+	spinlock_t	  lock;		/* handed to kfifo_in_locked()/kfifo_out_locked() */
+	wait_queue_head_t wait;		/* readers sleep here until the fifo is non-empty */
+	struct timespec	  tstart;	/* set at open(); log timestamps are relative to it */
+} sctpw;
+
+static void printl(const char *fmt, ...)
+{
+	char line[256];		/* longer output is cut short by vscnprintf() */
+	va_list ap;
+	int n;
+
+	va_start(ap, fmt);
+	n = vscnprintf(line, sizeof(line), fmt, ap);
+	va_end(ap);
+	/* Queue the formatted text and wake any reader waiting for data. */
+	kfifo_in_locked(&sctpw.fifo, line, n, &sctpw.lock);
+	wake_up(&sctpw.wait);
+}
+
+static int sctpprobe_open(struct inode *inode, struct file *file)
+{
+	kfifo_reset(&sctpw.fifo);	/* discard anything logged before this open */
+	getnstimeofday(&sctpw.tstart);	/* log timestamps are relative to this instant */
+
+	return 0;
+}
+
+static ssize_t sctpprobe_read(struct file *file, char __user *buf,
+			      size_t len, loff_t *ppos)
+{
+	int error = 0, cnt = 0;
+	unsigned char *tbuf;	/* kernel bounce buffer between fifo and user */
+
+	if (!buf)
+		return -EINVAL;
+	/* A read never returns more than the fifo holds: bound the buffer. */
+	len = min_t(size_t, len, kfifo_size(&sctpw.fifo));
+	if (len == 0)
+		return 0;
+	tbuf = vmalloc(len);
+	if (!tbuf)
+		return -ENOMEM;
+	/* Sleep until something has been logged or the wait is interrupted. */
+	error = wait_event_interruptible(sctpw.wait,
+					 kfifo_len(&sctpw.fifo) != 0);
+	if (error)
+		goto out_free;
+
+	cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
+	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
+
+out_free:
+	vfree(tbuf);
+
+	return error ? error : cnt;
+}
+
+static const struct file_operations sctpprobe_fops = {
+	.owner	= THIS_MODULE,
+	.open	= sctpprobe_open,	/* resets the log and the time base */
+	.read	= sctpprobe_read,	/* blocking read of queued log text */
+	.llseek = noop_llseek,
+};
+
+static sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep,
+					    const struct sctp_association *asoc,
+					    const sctp_subtype_t type,
+					    void *arg,
+					    sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *sp;
+	static __u32 lcwnd = 0;		/* primary-path cwnd when last logged */
+	struct timespec now;
+
+	sp = asoc->peer.primary_path;
+	/* 'full' logs every call, else only cwnd changes; 'port' filters. */
+	if ((full || sp->cwnd != lcwnd) &&
+	    (!port || asoc->peer.port == port ||
+	     ep->base.bind_addr.port == port)) {
+		lcwnd = sp->cwnd;
+
+		getnstimeofday(&now);
+		now = timespec_sub(now, sctpw.tstart);
+		/* Time since open(), then association-wide state. */
+		printl("%lu.%06lu ", (unsigned long) now.tv_sec,
+		       (unsigned long) now.tv_nsec / NSEC_PER_USEC);
+
+		printl("%p %5d %5d %5d %8d %5d ", asoc,
+		       ep->base.bind_addr.port, asoc->peer.port,
+		       asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data);
+		/* One group per peer transport; '*' marks the primary path. */
+		list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+					transports) {
+			if (sp == asoc->peer.primary_path)
+				printl("*");
+
+			if (sp->ipaddr.sa.sa_family == AF_INET)
+				printl("%pI4 ", &sp->ipaddr.v4.sin_addr);
+			else
+				printl("%pI6 ", &sp->ipaddr.v6.sin6_addr);
+
+			printl("%2u %8u %8u %8u %8u %8u ",
+			       sp->state, sp->cwnd, sp->ssthresh,
+			       sp->flight_size, sp->partial_bytes_acked,
+			       sp->pathmtu);
+		}
+		printl("\n");
+	}
+	/* Always end a jprobe handler with a call to jprobe_return(). */
+	jprobe_return();
+	return 0;
+}
+
+static struct jprobe sctp_recv_probe = {
+	.kp	= {
+		.symbol_name = "sctp_sf_eat_sack_6_2",	/* kernel function to hook */
+	},
+	.entry	= jsctp_sf_eat_sack,	/* handler; its arguments mirror the hooked function's */
+};
+
+static __init int sctpprobe_init(void)
+{
+	int ret = -ENOMEM;	/* returned if the fifo or proc entry cannot be set up */
+
+	init_waitqueue_head(&sctpw.wait);
+	spin_lock_init(&sctpw.lock);
+	if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL))
+		return ret;
+	/* Owner-readable proc entry (in init_net) through which the log is read. */
+	if (!proc_net_fops_create(&init_net, procname, S_IRUSR,
+				  &sctpprobe_fops))
+		goto free_kfifo;
+	/* Hook the SACK handler last, once the log can be stored and read. */
+	ret = register_jprobe(&sctp_recv_probe);
+	if (ret)
+		goto remove_proc;
+
+	pr_info("probe registered (port=%d)\n", port);
+
+	return 0;
+	/* Error unwinding: undo the completed steps in reverse order. */
+remove_proc:
+	proc_net_remove(&init_net, procname);
+free_kfifo:
+	kfifo_free(&sctpw.fifo);
+	return ret;
+}
+
+static __exit void sctpprobe_exit(void)
+{
+	proc_net_remove(&init_net, procname);	/* stop readers of the log */
+	unregister_jprobe(&sctp_recv_probe);	/* stop the writer of the log */
+	kfifo_free(&sctpw.fifo);		/* last: both of the above use it */
+}
+
+module_init(sctpprobe_init);
+module_exit(sctpprobe_exit);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
new file mode 100644
index 00000000..05a6ce21
--- /dev/null
+++ b/net/sctp/proc.c
@@ -0,0 +1,516 @@
+/* SCTP kernel implementation
+ * Copyright (c) 2003 International Business Machines, Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Sridhar Samudrala <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <net/sctp/sctp.h>
+#include <net/ip.h> /* for snmp_fold_field */
+
+static const struct snmp_mib sctp_snmp_list[] = {
+	SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB),
+	SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS),
+	SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS),
+	SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS),
+	SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS),
+	SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES),
+	SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS),
+	SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS),
+	SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS),
+	SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS),
+	SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS),
+	SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS),
+	SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS),
+	SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS),
+	SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
+	SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
+	SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
+	SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS),
+	SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS),
+	SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS),
+	SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS),
+	SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS),
+	SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ),
+	SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG),
+	SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS),
+	SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS),
+	SNMP_MIB_SENTINEL
+};
+
+/* Emit one "name<TAB>value" line per SCTP MIB counter (/proc/net/sctp/snmp). */
+static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	/* snmp_fold_field() yields an unsigned long, hence %lu. */
+	for (i = 0; sctp_snmp_list[i].name != NULL; i++)
+		seq_printf(seq, "%-32s\t%lu\n", sctp_snmp_list[i].name,
+			   snmp_fold_field((void __percpu **)sctp_statistics,
+				      sctp_snmp_list[i].entry));
+	return 0;
+}
+
+/* open() for 'snmp': a single_open() seq_file rendered by sctp_snmp_seq_show(). */
+static int sctp_snmp_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sctp_snmp_seq_show, NULL);
+}
+
+static const struct file_operations sctp_snmp_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = sctp_snmp_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+/* Create the world-readable 'snmp' entry in the SCTP proc directory.
+ * Returns 0 on success, -ENOMEM if the entry could not be created.
+ */
+int __init sctp_snmp_proc_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("snmp", S_IRUGO, proc_net_sctp, &sctp_snmp_seq_fops);
+
+	return entry ? 0 : -ENOMEM;
+}
+
+/* Remove the 'snmp' entry created by sctp_snmp_proc_init() from proc_net_sctp. */
+void sctp_snmp_proc_exit(void)
+{
+	remove_proc_entry("snmp", proc_net_sctp);
+}
+
+/* Dump local addresses of an association/endpoint; '*' marks the primary. */
+static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
+{
+	struct sctp_association *asoc;
+	struct sctp_sockaddr_entry *laddr;
+	struct sctp_transport *peer;
+	union sctp_addr *addr, *primary = NULL;
+	struct sctp_af *af;
+
+	if (epb->type == SCTP_EP_TYPE_ASSOCIATION) {
+		asoc = sctp_assoc(epb);
+		peer = asoc->peer.primary_path;
+		/* Without a primary path there is no source address to match. */
+		primary = peer ? &peer->saddr : NULL;
+	}
+
+	list_for_each_entry(laddr, &epb->bind_addr.address_list, list) {
+		addr = &laddr->a;
+		af = sctp_get_af_specific(addr->sa.sa_family);
+		if (primary && af->cmp_addr(addr, primary))
+			seq_printf(seq, "*");
+		af->seq_dump_addr(seq, addr);
+	}
+}
+
+/* Dump remote addresses of an association; '*' marks the peer's primary. */
+static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc)
+{
+	union sctp_addr *primary = &assoc->peer.primary_addr;
+	struct sctp_transport *tp;
+	struct sctp_af *af;
+
+	/* Each transport's family-specific ops do the compare and the print. */
+	list_for_each_entry(tp, &assoc->peer.transport_addr_list,
+			transports) {
+		af = sctp_get_af_specific(tp->ipaddr.sa.sa_family);
+
+		if (af->cmp_addr(&tp->ipaddr, primary))
+			seq_printf(seq, "*");
+
+		af->seq_dump_addr(seq, &tp->ipaddr);
+	}
+}
+
+static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* The position is a hash bucket index; past the table means done. */
+	if (*pos >= sctp_ep_hashsize)
+		return NULL;
+	if (*pos <= 0) {
+		/* Clamp to bucket 0, which is preceded by the column header. */
+		*pos = 0;
+		seq_printf(seq, " ENDPT     SOCK   STY SST HBKT LPORT   UID INODE LADDRS\n");
+	}
+
+	return (void *)pos;
+}
+
+/* Nothing to undo: sctp_eps_seq_start() takes no lock or reference. */
+static void sctp_eps_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	/* Advance to the next hash bucket; NULL ends the walk. */
+	++*pos;
+
+	return (*pos < sctp_ep_hashsize) ? pos : NULL;
+}
+
+
+/* Display the sctp endpoints of one hash bucket (/proc/net/sctp/eps). */
+static int sctp_eps_seq_show(struct seq_file *seq, void *v)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+	struct sctp_endpoint *ep;
+	struct sock *sk;
+	struct hlist_node *node;
+	int    hash = *(loff_t *)v;	/* bucket index handed over by seq start/next */
+
+	if (hash >= sctp_ep_hashsize)
+		return -ENOMEM;
+	/* Walk the bucket's chain while holding its read lock. */
+	head = &sctp_ep_hashtable[hash];
+	sctp_local_bh_disable();
+	read_lock(&head->lock);
+	sctp_for_each_hentry(epb, node, &head->chain) {
+		ep = sctp_ep(epb);
+		sk = epb->sk;
+		seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk,
+			   sctp_sk(sk)->type, sk->sk_state, hash,
+			   epb->bind_addr.port,
+			   sock_i_uid(sk), sock_i_ino(sk));
+		/* Local addresses follow the fixed columns on the same line. */
+		sctp_seq_dump_local_addrs(seq, epb);
+		seq_printf(seq, "\n");
+	}
+	read_unlock(&head->lock);
+	sctp_local_bh_enable();
+
+	return 0;
+}
+
+static const struct seq_operations sctp_eps_ops = {
+	.start = sctp_eps_seq_start,
+	.next  = sctp_eps_seq_next,
+	.stop  = sctp_eps_seq_stop,
+	.show  = sctp_eps_seq_show,
+};
+
+
+/* open() for 'eps': attach the sctp_eps_ops iterator to the file. */
+static int sctp_eps_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sctp_eps_ops);
+}
+
+static const struct file_operations sctp_eps_seq_fops = {
+	.open	 = sctp_eps_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+/* Create the world-readable 'eps' entry in the SCTP proc directory.
+ * Returns 0 on success, -ENOMEM if the entry could not be created.
+ */
+int __init sctp_eps_proc_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("eps", S_IRUGO, proc_net_sctp, &sctp_eps_seq_fops);
+
+	return entry ? 0 : -ENOMEM;
+}
+
+/* Remove the 'eps' entry created by sctp_eps_proc_init() from proc_net_sctp. */
+void sctp_eps_proc_exit(void)
+{
+	remove_proc_entry("eps", proc_net_sctp);
+}
+
+
+static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* The position is a hash bucket index; past the table means done. */
+	if (*pos >= sctp_assoc_hashsize)
+		return NULL;
+	if (*pos <= 0) {
+		/* Clamp to bucket 0, which is preceded by the column header. */
+		*pos = 0;
+		seq_printf(seq, " ASSOC     SOCK   STY SST ST HBKT "
+				"ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
+				"RPORT LADDRS <-> RADDRS "
+				"HBINT INS OUTS MAXRT T1X T2X RTXC\n");
+	}
+
+	return (void *)pos;
+}
+
+/* Nothing to undo: sctp_assocs_seq_start() takes no lock or reference. */
+static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	/* Advance to the next hash bucket; NULL ends the walk. */
+	++*pos;
+
+	return (*pos < sctp_assoc_hashsize) ? pos : NULL;
+}
+
+/* Display the sctp associations of one hash bucket (/proc/net/sctp/assocs). */
+static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+	struct sctp_association *assoc;
+	struct sock *sk;
+	struct hlist_node *node;
+	int    hash = *(loff_t *)v;	/* bucket index handed over by seq start/next */
+
+	if (hash >= sctp_assoc_hashsize)
+		return -ENOMEM;
+	/* Walk the bucket's chain while holding its read lock. */
+	head = &sctp_assoc_hashtable[hash];
+	sctp_local_bh_disable();
+	read_lock(&head->lock);
+	sctp_for_each_hentry(epb, node, &head->chain) {
+		assoc = sctp_assoc(epb);
+		sk = epb->sk;
+		seq_printf(seq,
+			   "%8pK %8pK %-3d %-3d %-2d %-4d "
+			   "%4d %8d %8d %7d %5lu %-5d %5d ",
+			   assoc, sk, sctp_sk(sk)->type, sk->sk_state,
+			   assoc->state, hash,
+			   assoc->assoc_id,
+			   assoc->sndbuf_used,			/* TX_QUEUE */
+			   atomic_read(&assoc->rmem_alloc),	/* RX_QUEUE */
+			   sock_i_uid(sk), sock_i_ino(sk),
+			   epb->bind_addr.port,			/* LPORT */
+			   assoc->peer.port);			/* RPORT */
+		seq_printf(seq, " ");
+		sctp_seq_dump_local_addrs(seq, epb);
+		seq_printf(seq, "<-> ");
+		sctp_seq_dump_remote_addrs(seq, assoc);
+		seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d ",
+			assoc->hbinterval, assoc->c.sinit_max_instreams,
+			assoc->c.sinit_num_ostreams, assoc->max_retrans,
+			assoc->init_retries, assoc->shutdown_retries,	/* T1X, T2X */
+			assoc->rtx_data_chunks);			/* RTXC */
+		seq_printf(seq, "\n");
+	}
+	read_unlock(&head->lock);
+	sctp_local_bh_enable();
+
+	return 0;
+}
+
+static const struct seq_operations sctp_assoc_ops = {
+	.start = sctp_assocs_seq_start,
+	.next  = sctp_assocs_seq_next,
+	.stop  = sctp_assocs_seq_stop,
+	.show  = sctp_assocs_seq_show,
+};
+
+/* open() for 'assocs': attach the sctp_assoc_ops iterator to the file. */
+static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sctp_assoc_ops);
+}
+
+static const struct file_operations sctp_assocs_seq_fops = {
+	.open	 = sctp_assocs_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+/* Create the world-readable 'assocs' entry in the SCTP proc directory.
+ * Returns 0 on success, -ENOMEM if the entry could not be created.
+ */
+int __init sctp_assocs_proc_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("assocs", S_IRUGO, proc_net_sctp,
+			    &sctp_assocs_seq_fops);
+
+	return entry ? 0 : -ENOMEM;
+}
+
+/* Remove the 'assocs' entry created by sctp_assocs_proc_init() from proc_net_sctp. */
+void sctp_assocs_proc_exit(void)
+{
+	remove_proc_entry("assocs", proc_net_sctp);
+}
+
+static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* The position is a hash bucket index; past the table means done. */
+	if (*pos >= sctp_assoc_hashsize)
+		return NULL;
+	if (*pos <= 0) {
+		/* Clamp to bucket 0, which is preceded by the column header. */
+		*pos = 0;
+		seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
+				"REM_ADDR_RTX  START\n");
+	}
+
+	return (void *)pos;
+}
+
+static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	/* Advance to the next hash bucket; NULL ends the walk. */
+	++*pos;
+
+	return (*pos < sctp_assoc_hashsize) ? pos : NULL;
+}
+
+static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
+{
+	struct sctp_hashbucket *head;
+	struct sctp_ep_common *epb;
+	struct sctp_association *assoc;
+	struct hlist_node *node;
+	struct sctp_transport *tsp;
+	int    hash = *(loff_t *)v;	/* bucket index handed over by seq start/next */
+
+	if (hash >= sctp_assoc_hashsize)
+		return -ENOMEM;
+	/* One output line per peer transport of each association in the bucket. */
+	head = &sctp_assoc_hashtable[hash];
+	sctp_local_bh_disable();
+	read_lock(&head->lock);
+	sctp_for_each_hentry(epb, node, &head->chain) {
+		assoc = sctp_assoc(epb);
+		list_for_each_entry(tsp, &assoc->peer.transport_addr_list,
+					transports) {
+			/*
+			 * The remote address (ADDR)
+			 */
+			tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr);
+			seq_printf(seq, " ");
+
+			/*
+			 * The association ID (ASSOC_ID)
+			 */
+			seq_printf(seq, "%d ", tsp->asoc->assoc_id);
+
+			/*
+			 * If the Heartbeat is active (HB_ACT)
+			 * Note: 1 = Active, 0 = Inactive
+			 */
+			seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer));
+
+			/*
+			 * Retransmit time out (RTO)
+			 */
+			seq_printf(seq, "%lu ", tsp->rto);
+
+			/*
+			 * Maximum path retransmit count (MAX_PATH_RTX)
+			 */
+			seq_printf(seq, "%d ", tsp->pathmaxrxt);
+
+			/*
+			 * Remote address retransmit count (REM_ADDR_RTX)
+			 * Note: We don't have a way to tally this at the moment,
+			 * so let's just leave it as zero for now
+			 */
+			seq_printf(seq, "0 ");
+
+			/*
+			 * Remote address start time (START).  This is also not
+			 * currently implemented, but we can record it with a
+			 * jiffies marker in a subsequent patch
+			 */
+			seq_printf(seq, "0");
+
+			seq_printf(seq, "\n");
+		}
+	}
+
+	read_unlock(&head->lock);
+	sctp_local_bh_enable();
+
+	return 0;
+
+}
+
+static const struct seq_operations sctp_remaddr_ops = {
+	.start = sctp_remaddr_seq_start,
+	.next  = sctp_remaddr_seq_next,
+	.stop  = sctp_remaddr_seq_stop,
+	.show  = sctp_remaddr_seq_show,
+};
+
+/* Remove the 'remaddr' entry created by sctp_remaddr_proc_init() from proc_net_sctp. */
+void sctp_remaddr_proc_exit(void)
+{
+	remove_proc_entry("remaddr", proc_net_sctp);
+}
+
+static int sctp_remaddr_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sctp_remaddr_ops);
+}
+
+static const struct file_operations sctp_remaddr_seq_fops = {
+	.open = sctp_remaddr_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+/* Create the 'remaddr' proc entry; 0 on success, -ENOMEM on failure. */
+int __init sctp_remaddr_proc_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("remaddr", S_IRUGO, proc_net_sctp,
+			    &sctp_remaddr_seq_fops);
+	return entry ? 0 : -ENOMEM;
+}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
new file mode 100644
index 00000000..946afd60
--- /dev/null
+++ b/net/sctp/protocol.c
@@ -0,0 +1,1381 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * Initialization/cleanup for SCTP protocol support.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson <karl@athena.chicago.il.us>
+ *    Jon Grimm <jgrimm@us.ibm.com>
+ *    Sridhar Samudrala <sri@us.ibm.com>
+ *    Daisy Chang <daisyc@us.ibm.com>
+ *    Ardelle Fan <ardelle.fan@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/route.h>
+#include <net/sctp/sctp.h>
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_ecn.h>
+
+/* Global data structures. */
+struct sctp_globals sctp_globals __read_mostly;
+DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly;
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry	*proc_net_sctp;
+#endif
+
+struct idr sctp_assocs_id;
+DEFINE_SPINLOCK(sctp_assocs_id_lock);
+
+/* This is the global socket data structure used for responding to
+ * the Out-of-the-blue (OOTB) packets.  A control sock will be created
+ * for this socket at the initialization time.
+ */
+static struct sock *sctp_ctl_sock;
+
+static struct sctp_pf *sctp_pf_inet6_specific;
+static struct sctp_pf *sctp_pf_inet_specific;
+static struct sctp_af *sctp_af_v4_specific;
+static struct sctp_af *sctp_af_v6_specific;
+
+struct kmem_cache *sctp_chunk_cachep __read_mostly;
+struct kmem_cache *sctp_bucket_cachep __read_mostly;
+
+long sysctl_sctp_mem[3];
+int sysctl_sctp_rmem[3];
+int sysctl_sctp_wmem[3];
+
+/* Return the file-private control sock (sctp_ctl_sock) to other SCTP code. */
+struct sock *sctp_get_ctl_sock(void)
+{
+	return sctp_ctl_sock;
+}
+
+/* Set up the socket counter and, with CONFIG_PROC_FS, the SCTP proc entries. */
+static __init int sctp_proc_init(void)
+{
+	if (percpu_counter_init(&sctp_sockets_allocated, 0))
+		goto out_nomem;
+#ifdef CONFIG_PROC_FS
+	if (!proc_net_sctp) {
+		proc_net_sctp = proc_mkdir("sctp", init_net.proc_net);
+		if (!proc_net_sctp)
+			goto out_free_percpu;
+	}
+
+	if (sctp_snmp_proc_init())
+		goto out_snmp_proc_init;
+	if (sctp_eps_proc_init())
+		goto out_eps_proc_init;
+	if (sctp_assocs_proc_init())
+		goto out_assocs_proc_init;
+	if (sctp_remaddr_proc_init())
+		goto out_remaddr_proc_init;
+
+	return 0;
+	/* Each label undoes the steps that completed before the failing one. */
+out_remaddr_proc_init:
+	sctp_assocs_proc_exit();
+out_assocs_proc_init:
+	sctp_eps_proc_exit();
+out_eps_proc_init:
+	sctp_snmp_proc_exit();
+out_snmp_proc_init:
+	if (proc_net_sctp) {
+		proc_net_sctp = NULL;
+		remove_proc_entry("sctp", init_net.proc_net);
+	}
+out_free_percpu:
+	percpu_counter_destroy(&sctp_sockets_allocated);
+#else
+	return 0;
+#endif /* CONFIG_PROC_FS */
+
+out_nomem:
+	return -ENOMEM;		/* every failure path reports -ENOMEM */
+}
+
+/* Clean up the proc fs entry for the SCTP protocol.
+ * Note: Do not make this __exit as it is used in the init error
+ * path.
+ */
+static void sctp_proc_exit(void)
+{
+#ifdef CONFIG_PROC_FS
+	sctp_snmp_proc_exit();
+	sctp_eps_proc_exit();
+	sctp_assocs_proc_exit();
+	sctp_remaddr_proc_exit();
+
+	if (proc_net_sctp) {
+		proc_net_sctp = NULL;	/* cleared so a repeated call skips the removal */
+		remove_proc_entry("sctp", init_net.proc_net);
+	}
+#endif
+	percpu_counter_destroy(&sctp_sockets_allocated);	/* set up in sctp_proc_init() */
+}
+
+/* Private helper: append one sctp_sockaddr_entry per IPv4 address
+ * configured on 'dev' to 'addrlist'.  Entries whose allocation fails
+ * are silently skipped.
+ */
+static void sctp_v4_copy_addrlist(struct list_head *addrlist,
+				  struct net_device *dev)
+{
+	struct sctp_sockaddr_entry *entry;
+	struct in_device *idev;
+	struct in_ifaddr *ifa;
+
+	rcu_read_lock();
+	idev = __in_dev_get_rcu(dev);
+	if (!idev)
+		goto unlock;
+
+	for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) {
+		entry = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC);
+		if (!entry)
+			continue;
+
+		entry->a.v4.sin_family = AF_INET;
+		entry->a.v4.sin_port = 0;
+		entry->a.v4.sin_addr.s_addr = ifa->ifa_local;
+		entry->valid = 1;
+		INIT_LIST_HEAD(&entry->list);
+		list_add_tail(&entry->list, addrlist);
+	}
+unlock:
+	rcu_read_unlock();
+}
+
+/* Extract our IP addresses from the system and stash them in the
+ * protocol structure.
+ */
+static void sctp_get_local_addr_list(void)
+{
+	struct net_device *netdev;
+	struct sctp_af *family;
+
+	/* Every registered address family gets to contribute the
+	 * addresses it finds on every device in init_net.
+	 */
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, netdev) {
+		list_for_each_entry(family, &sctp_address_families, list)
+			family->copy_addrlist(&sctp_local_addr_list, netdev);
+	}
+	rcu_read_unlock();
+}
+
+/* Free the existing local addresses: unlink every entry from
+ * sctp_local_addr_list and release its memory.
+ */
+static void sctp_free_local_addr_list(void)
+{
+	struct sctp_sockaddr_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &sctp_local_addr_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/* Copy the local addresses which are valid for 'scope' into 'bp'.  */
+int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
+			      gfp_t gfp, int copy_flags)
+{
+	struct sctp_sockaddr_entry *entry;
+	int family, err = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &sctp_local_addr_list, list) {
+		if (!entry->valid || !sctp_in_scope(&entry->a, scope))
+			continue;
+		/* In scope: now require that the address family is usable
+		 * by the local sock as well as by the remote peer.
+		 */
+		family = entry->a.sa.sa_family;
+		if (family == AF_INET) {
+			if (!(copy_flags & SCTP_ADDR4_PEERSUPP))
+				continue;
+		} else if (family != AF_INET6 ||
+			   !(copy_flags & SCTP_ADDR6_ALLOWED) ||
+			   !(copy_flags & SCTP_ADDR6_PEERSUPP)) {
+			continue;
+		}
+		/* GFP_ATOMIC, not 'gfp' - presumably as we hold rcu_read_lock(). */
+		err = sctp_add_bind_addr(bp, &entry->a, SCTP_ADDR_SRC,
+					 GFP_ATOMIC);
+		if (err)
+			break;
+	}
+	rcu_read_unlock();
+
+	return err;
+}
+
+/* Initialize a sctp_addr from an incoming skb.  */
+static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
+			     int is_saddr)
+{
+	const struct sctphdr *sh = sctp_hdr(skb);
+	const struct iphdr *iph = ip_hdr(skb);
+
+	/* A non-zero is_saddr picks the packet's source port and address,
+	 * zero its destination pair; the port is copied without conversion.
+	 */
+	addr->v4.sin_family = AF_INET;
+	if (is_saddr) {
+		addr->v4.sin_port = sh->source;
+		memcpy(&addr->v4.sin_addr.s_addr, &iph->saddr,
+		       sizeof(struct in_addr));
+	} else {
+		addr->v4.sin_port = sh->dest;
+		memcpy(&addr->v4.sin_addr.s_addr, &iph->daddr,
+		       sizeof(struct in_addr));
+	}
+}
+
+/* Initialize an sctp_addr from a socket's inet_rcv_saddr; the port is zeroed. */
+static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk)
+{
+	addr->v4.sin_family = AF_INET;
+	addr->v4.sin_port = 0;
+	addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr;
+}
+
+/* Initialize inet_sk(sk)->inet_rcv_saddr from the IPv4 address in sctp_addr. */
+static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
+{
+	inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr;
+}
+
+/* Initialize inet_sk(sk)->inet_daddr from the IPv4 address in sctp_addr. */
+static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
+{
+	inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr;
+}
+
+/* Initialize a sctp_addr from an address parameter; 'iif' is not used here. */
+static void sctp_v4_from_addr_param(union sctp_addr *addr,
+				    union sctp_addr_param *param,
+				    __be16 port, int iif)
+{
+	addr->v4.sin_family = AF_INET;
+	addr->v4.sin_port = port;
+	addr->v4.sin_addr.s_addr = param->v4.addr.s_addr;
+}
+
+/* Initialize an address parameter from a sctp_addr and return the length
+ * of the address parameter (in host byte order).
+ */
+static int sctp_v4_to_addr_param(const union sctp_addr *addr,
+				 union sctp_addr_param *param)
+{
+	int length = sizeof(sctp_ipv4addr_param_t);
+
+	param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS;
+	param->v4.param_hdr.length = htons(length);	/* stored in network byte order */
+	param->v4.addr.s_addr = addr->v4.sin_addr.s_addr;
+
+	return length;
+}
+
+/* Initialize a sctp_addr from the source address of a flowi4 and 'port'. */
+static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4,
+			      __be16 port)
+{
+	saddr->v4.sin_family = AF_INET;
+	saddr->v4.sin_port = port;
+	saddr->v4.sin_addr.s_addr = fl4->saddr;
+}
+
+/* Compare two addresses exactly: family, port and IPv4 address must all
+ * match.  Returns 1 when they are equal and 0 otherwise.
+ */
+static int sctp_v4_cmp_addr(const union sctp_addr *addr1,
+			    const union sctp_addr *addr2)
+{
+	const struct sockaddr_in *a = &addr1->v4;
+	const struct sockaddr_in *b = &addr2->v4;
+
+	return addr1->sa.sa_family == addr2->sa.sa_family &&
+	       a->sin_port == b->sin_port &&
+	       a->sin_addr.s_addr == b->sin_addr.s_addr;
+}
+
+/* Initialize addr struct to the IPv4 wildcard INADDR_ANY with the given port. */
+static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port)
+{
+	addr->v4.sin_family = AF_INET;
+	addr->v4.sin_addr.s_addr = htonl(INADDR_ANY);
+	addr->v4.sin_port = port;
+}
+
+/* Is this a wildcard address?  Only the IPv4 address field is examined. */
+static int sctp_v4_is_any(const union sctp_addr *addr)
+{
+	return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr;
+}
+
+/* Check whether 'addr' may be used as an SCTP address on this socket.
+ * 'sp' and 'skb' are optional and only consulted when non-NULL.
+ *
+ * Return 0 - the socket is IPv6-only, the address is non-unicast or
+ *            otherwise unusable, or 'skb' was routed as a broadcast.
+ * Return 1 - otherwise.
+ */
+static int sctp_v4_addr_valid(union sctp_addr *addr,
+			      struct sctp_sock *sp,
+			      const struct sk_buff *skb)
+{
+	/* IPv4 addresses not allowed */
+	if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
+		return 0;
+
+	/* Is this a non-unicast address or an unusable SCTP address? */
+	if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr))
+		return 0;
+
+	/* Is this a broadcast address? */
+	if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST)
+		return 0;
+
+	return 1;
+}
+
+/* Should this be available for binding?  Returns 1 if so, 0 if not. */
+static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
+{
+	int ret = inet_addr_type(&init_net, addr->v4.sin_addr.s_addr);
+
+	/* Non-wildcard, non-local addresses need freebind or ip_nonlocal_bind. */
+	if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) &&
+	   ret != RTN_LOCAL &&
+	   !sp->inet.freebind &&
+	   !sysctl_ip_nonlocal_bind)
+		return 0;
+
+	if (ipv6_only_sock(sctp_opt2sk(sp)))	/* IPv6-only socket: no IPv4 binds */
+		return 0;
+
+	return 1;
+}
+
+/* Checking the loopback, private and other address scopes as defined in
+ * RFC 1918.   The IPv4 scoping is based on the draft for SCTP IPv4
+ * scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>.
+ *
+ * Level 0 - unusable SCTP addresses
+ * Level 1 - loopback address
+ * Level 2 - link-local addresses
+ * Level 3 - private addresses.
+ * Level 4 - global addresses
+ * For INIT and INIT-ACK address list, let L be the level of
+ * the requested destination address; sender and receiver
+ * SHOULD include all of their addresses with level greater
+ * than or equal to L.
+ *
+ * IPv4 scoping can be controlled through sysctl option
+ * net.sctp.addr_scope_policy
+ */
+static sctp_scope_t sctp_v4_scope(union sctp_addr *addr)
+{
+	__be32 ip = addr->v4.sin_addr.s_addr;
+
+	/* Tested in ascending level order; the first match decides. */
+	if (IS_IPV4_UNUSABLE_ADDRESS(ip))
+		return SCTP_SCOPE_UNUSABLE;
+
+	if (ipv4_is_loopback(ip))
+		return SCTP_SCOPE_LOOPBACK;
+
+	if (ipv4_is_linklocal_169(ip))
+		return SCTP_SCOPE_LINK;
+
+	if (ipv4_is_private_10(ip) ||
+	    ipv4_is_private_172(ip) ||
+	    ipv4_is_private_192(ip))
+		return SCTP_SCOPE_PRIVATE;
+
+	return SCTP_SCOPE_GLOBAL;
+}
+
+/* Look up a route to the transport's peer address and cache it in t->dst
+ * (NULL when no route is found).  When the transport has an association
+ * and no saddr is given, the route's source must be a bound SRC address.
+ */
+static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
+				struct flowi *fl, struct sock *sk)
+{
+	struct sctp_association *asoc = t->asoc;
+	struct rtable *rt;
+	struct flowi4 *fl4 = &fl->u.ip4;
+	struct sctp_bind_addr *bp;
+	struct sctp_sockaddr_entry *laddr;
+	struct dst_entry *dst = NULL;
+	union sctp_addr *daddr = &t->ipaddr;
+	union sctp_addr dst_saddr;
+
+	memset(fl4, 0x0, sizeof(struct flowi4));
+	fl4->daddr  = daddr->v4.sin_addr.s_addr;
+	fl4->fl4_dport = daddr->v4.sin_port;
+	fl4->flowi4_proto = IPPROTO_SCTP;
+	if (asoc) {
+		fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+		fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
+		fl4->fl4_sport = htons(asoc->base.bind_addr.port);
+	}
+	if (saddr) {	/* an explicit source overrides the bound port too */
+		fl4->saddr = saddr->v4.sin_addr.s_addr;
+		fl4->fl4_sport = saddr->v4.sin_port;
+	}
+
+	SCTP_DEBUG_PRINTK("%s: DST:%pI4, SRC:%pI4 - ",
+			  __func__, &fl4->daddr, &fl4->saddr);
+
+	rt = ip_route_output_key(&init_net, fl4);
+	if (!IS_ERR(rt))	/* on error dst simply stays NULL */
+		dst = &rt->dst;
+
+	/* If there is no association or if a source address is passed, no
+	 * more validation is required.
+	 */
+	if (!asoc || saddr)
+		goto out;
+
+	bp = &asoc->base.bind_addr;
+
+	if (dst) {
+		/* Walk through the bind address list and look for a bind
+		 * address that matches the source address of the returned dst.
+		 */
+		sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port));
+		rcu_read_lock();
+		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
+				continue;
+			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
+				goto out_unlock;	/* keep this dst */
+		}
+		rcu_read_unlock();
+
+		/* None of the bound addresses match the source address of the
+		 * dst. So release it.
+		 */
+		dst_release(dst);
+		dst = NULL;
+	}
+
+	/* Walk through the bind address list and try to get a dst that
+	 * matches a bind address as the source address.
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
+		if (!laddr->valid)
+			continue;
+		if ((laddr->state == SCTP_ADDR_SRC) &&
+		    (AF_INET == laddr->a.sa.sa_family)) {
+			fl4->saddr = laddr->a.v4.sin_addr.s_addr;
+			fl4->fl4_sport = laddr->a.v4.sin_port;
+			rt = ip_route_output_key(&init_net, fl4);
+			if (!IS_ERR(rt)) {
+				dst = &rt->dst;
+				goto out_unlock;	/* first hit wins */
+			}
+		}
+	}
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	t->dst = dst;	/* may be NULL: the transport then has no route */
+	if (dst)
+		SCTP_DEBUG_PRINTK("rt_dst:%pI4, rt_src:%pI4\n",
+				  &fl4->daddr, &fl4->saddr);
+	else
+		SCTP_DEBUG_PRINTK("NO ROUTE\n");
+}
+
+/* For v4 the source address picked by routing is available in the flow
+ * key, so it is copied into the transport only when a dst is cached.
+ * @sk is unused.
+ */
+static void sctp_v4_get_saddr(struct sctp_sock *sk,
+			      struct sctp_transport *t,
+			      struct flowi *fl)
+{
+	/* No cached route means there is no source address to record. */
+	if (!t->dst)
+		return;
+
+	t->saddr.v4.sin_family = AF_INET;
+	t->saddr.v4.sin_addr.s_addr = fl->u.ip4.saddr;
+}
+
+/* Input interface recorded in the skb's route entry (rt_iif). */
+static int sctp_v4_skb_iif(const struct sk_buff *skb)
+{
+	return skb_rtable(skb)->rt_iif;
+}
+
+/* Does the TOS byte of this packet's IPv4 header carry the ECN CE mark? */
+static int sctp_v4_is_ce(const struct sk_buff *skb)
+{
+	return INET_ECN_is_ce(ip_hdr(skb)->tos);
+}
+
+/* Build the sock handed back by accept() for an IPv4 association.
+ * Returns the new sock, or NULL if allocation or protocol init fails.
+ */
+static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
+					     struct sctp_association *asoc)
+{
+	struct sock *child;
+
+	child = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, sk->sk_prot);
+	if (!child)
+		return NULL;
+
+	sock_init_data(NULL, child);
+
+	sctp_copy_sock(child, sk, asoc);
+	sock_reset_flag(child, SOCK_ZAPPED);
+
+	/* The peer's primary address becomes the destination address. */
+	inet_sk(child)->inet_daddr =
+		asoc->peer.primary_addr.v4.sin_addr.s_addr;
+
+	sk_refcnt_debug_inc(child);
+
+	if (child->sk_prot->init(child)) {
+		sk_common_release(child);
+		return NULL;
+	}
+
+	return child;
+}
+
+/* Address mapping hook; deliberately a no-op for the v4 family. */
+static void sctp_v4_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr)
+{
+	/* Empty */
+}
+
+/* Print the v4 address (%pI4) followed by a space to the seq file. */
+static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
+{
+	seq_printf(seq, "%pI4 ", &addr->v4.sin_addr);
+}
+
+static void sctp_v4_ecn_capable(struct sock *sk)
+{
+	INET_ECN_xmit(sk);	/* .ecn_capable hook: defer to the inet helper */
+}
+
+/* Event handler for inet address addition/deletion events.
+ * The sctp_local_addr_list needs to be protected by a spin lock since
+ * multiple notifiers (say IPv4 and IPv6) may be running at the same
+ * time and thus corrupt the list.
+ * The reader side is protected with RCU.
+ */
+static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
+			       void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+	struct sctp_sockaddr_entry *addr = NULL;
+	struct sctp_sockaddr_entry *temp;
+	int found = 0;
+
+	if (!net_eq(dev_net(ifa->ifa_dev->dev), &init_net))
+		return NOTIFY_DONE;
+
+	switch (ev) {
+	case NETDEV_UP:
+		/* Zeroed so sin_port, sin_zero and unset fields are defined. */
+		addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+		if (addr) {
+			addr->a.v4.sin_family = AF_INET;
+			addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
+			addr->valid = 1;
+			spin_lock_bh(&sctp_local_addr_lock);
+			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
+			spin_unlock_bh(&sctp_local_addr_lock);
+		}
+		break;
+	case NETDEV_DOWN:
+		spin_lock_bh(&sctp_local_addr_lock);
+		list_for_each_entry_safe(addr, temp,
+					&sctp_local_addr_list, list) {
+			if (addr->a.sa.sa_family == AF_INET &&
+					addr->a.v4.sin_addr.s_addr ==
+					ifa->ifa_local) {
+				found = 1;
+				addr->valid = 0;
+				list_del_rcu(&addr->list);
+				break;
+			}
+		}
+		spin_unlock_bh(&sctp_local_addr_lock);
+		if (found)
+			kfree_rcu(addr, rcu);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Create the control socket whose endpoint is reserved exclusively for
+ * OOTB processing, preferring IPv6 and falling back to IPv4.  Returns 0
+ * on success or the negative error from socket creation.
+ */
+static int sctp_ctl_sock_init(void)
+{
+	/* IPv6 is preferred whenever its PF table is registered. */
+	sa_family_t family = sctp_get_pf_specific(PF_INET6) ?
+			     PF_INET6 : PF_INET;
+	int err;
+
+	err = inet_ctl_sock_create(&sctp_ctl_sock, family,
+				   SOCK_SEQPACKET, IPPROTO_SCTP, &init_net);
+
+	/* If IPv6 socket could not be created, try the IPv4 socket */
+	if (family == PF_INET6 && err < 0)
+		err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET,
+					   SOCK_SEQPACKET, IPPROTO_SCTP,
+					   &init_net);
+
+	if (err >= 0)
+		return 0;
+
+	pr_err("Failed to create the SCTP control socket\n");
+	return err;
+}
+
+/* Register address family specific functions; returns 1 if added, else 0. */
+int sctp_register_af(struct sctp_af *af)
+{
+	struct sctp_af **slot;
+
+	if (af->sa_family == AF_INET)
+		slot = &sctp_af_v4_specific;
+	else if (af->sa_family == AF_INET6)
+		slot = &sctp_af_v6_specific;
+	else
+		return 0;
+
+	/* Only the first table registered for a family is kept. */
+	if (*slot)
+		return 0;
+	*slot = af;
+
+	/* Make the new table reachable via the global family list. */
+	INIT_LIST_HEAD(&af->list);
+	list_add_tail(&af->list, &sctp_address_families);
+	return 1;
+}
+
+/* Look up the table of address-manipulation functions registered for
+ * @family; NULL when the family is neither AF_INET nor AF_INET6.
+ */
+struct sctp_af *sctp_get_af_specific(sa_family_t family)
+{
+	if (family == AF_INET)
+		return sctp_af_v4_specific;
+
+	if (family == AF_INET6)
+		return sctp_af_v6_specific;
+
+	/* Any other family is unsupported. */
+	return NULL;
+}
+
+/* Shared setup of an AF_INET msg_name: family, padding and length. */
+static void sctp_inet_msgname(char *msgname, int *addr_len)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)msgname;
+
+	sin->sin_family = AF_INET;
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	/* Port and address are filled in by the caller. */
+	*addr_len = sizeof(*sin);
+}
+
+/* Fill msgname with the association's peer primary address and peer port. */
+static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname,
+				    int *addr_len)
+{
+	struct sctp_association *asoc;
+	struct sockaddr_in *sin;
+
+	if (!msgname)
+		return;
+
+	asoc = event->asoc;
+	sctp_inet_msgname(msgname, addr_len);
+
+	sin = (struct sockaddr_in *)msgname;
+	sin->sin_port = htons(asoc->peer.port);
+	sin->sin_addr.s_addr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
+}
+
+/* Fill msgname with the source port and address of an inbound skb. */
+static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)msgname;
+
+	if (!msgname)
+		return;
+
+	sctp_inet_msgname(msgname, len);
+	sin->sin_port = sctp_hdr(skb)->source;
+	sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+}
+
+/* Tell whether an address family can be used on a PF_INET SCTP socket. */
+static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
+{
+	/* Only AF_INET qualifies; @sp plays no part in the decision. */
+	return family == AF_INET;
+}
+
+/* Compare two addresses; a wildcard on either side matches anything. */
+static int sctp_inet_cmp_addr(const union sctp_addr *addr1,
+			      const union sctp_addr *addr2,
+			      struct sctp_sock *opt)
+{
+	const __be32 any = htonl(INADDR_ANY);
+	__be32 a, b;
+
+	/* PF_INET only supports AF_INET addresses. */
+	if (addr1->sa.sa_family != addr2->sa.sa_family)
+		return 0;
+
+	a = addr1->v4.sin_addr.s_addr;
+	b = addr2->v4.sin_addr.s_addr;
+	return a == any || b == any || a == b;
+}
+
+/* Verify that the provided sockaddr looks bindable by deferring to
+ * sctp_v4_available().  Common verification has already been done.
+ */
+static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+	return sctp_v4_available(addr, opt);
+}
+
+/* Verify that sockaddr looks sendable.  Common verification has already
+ * been taken care of, so every address is accepted here.
+ */
+static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
+{
+	return 1;
+}
+
+/* Fill in Supported Address Type information for INIT and INIT-ACK
+ * chunks.  Returns the number of entries written to @types (always 1).
+ */
+static int sctp_inet_supported_addrs(const struct sctp_sock *opt,
+				     __be16 *types)
+{
+	types[0] = SCTP_PARAM_IPV4_ADDRESS;	/* the only type for PF_INET */
+	return 1;
+}
+
+/* Hand an SCTP packet to IPv4, applying the transport's PMTUD setting. */
+static inline int sctp_v4_xmit(struct sk_buff *skb,
+			       struct sctp_transport *transport)
+{
+	struct flowi *fl = &transport->fl;
+
+	SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n",
+			  __func__, skb, skb->len,
+			  &fl->u.ip4.saddr, &fl->u.ip4.daddr);
+
+	inet_sk(skb->sk)->pmtudisc =
+		(transport->param_flags & SPP_PMTUD_ENABLE) ?
+		IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
+
+	SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS);
+	return ip_queue_xmit(skb, fl);
+}
+
+static struct sctp_af sctp_af_inet;
+/* PF_INET hooks; .af refers to the table forward-declared above. */
+static struct sctp_pf sctp_pf_inet = {
+	.event_msgname = sctp_inet_event_msgname,
+	.skb_msgname   = sctp_inet_skb_msgname,
+	.af_supported  = sctp_inet_af_supported,
+	.cmp_addr      = sctp_inet_cmp_addr,
+	.bind_verify   = sctp_inet_bind_verify,
+	.send_verify   = sctp_inet_send_verify,
+	.supported_addrs = sctp_inet_supported_addrs,
+	.create_accept_sk = sctp_v4_create_accept_sk,
+	.addr_v4map	= sctp_v4_addr_v4map,
+	.af            = &sctp_af_inet
+};
+
+/* Notifier block that routes inetaddr add/delete events to SCTP. */
+static struct notifier_block sctp_inetaddr_notifier = {
+	.notifier_call = sctp_inetaddr_event,
+};
+
+/* Socket-layer operations shared by both SCTP IPv4 protosw entries. */
+static const struct proto_ops inet_seqpacket_ops = {
+	.family		   = PF_INET,
+	.owner		   = THIS_MODULE,
+	.release	   = inet_release,	/* Needs to be wrapped... */
+	.bind		   = inet_bind,
+	.connect	   = inet_dgram_connect,
+	.socketpair	   = sock_no_socketpair,
+	.accept		   = inet_accept,
+	.getname	   = inet_getname,	/* Semantics are different.  */
+	.poll		   = sctp_poll,
+	.ioctl		   = inet_ioctl,
+	.listen		   = sctp_inet_listen,
+	.shutdown	   = inet_shutdown,	/* Looks harmless.  */
+	.setsockopt	   = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */
+	.getsockopt	   = sock_common_getsockopt,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = sock_common_recvmsg,
+	.mmap		   = sock_no_mmap,
+	.sendpage	   = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_sock_common_setsockopt,
+	.compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+/* AF_INET socket-switch entries: one per socket type, sharing prot/ops. */
+static struct inet_protosw sctp_seqpacket_protosw = {
+	.type       = SOCK_SEQPACKET,
+	.protocol   = IPPROTO_SCTP,
+	.prot       = &sctp_prot,
+	.ops        = &inet_seqpacket_ops,
+	.no_check   = 0,
+	.flags      = SCTP_PROTOSW_FLAG
+};
+static struct inet_protosw sctp_stream_protosw = {
+	.type       = SOCK_STREAM,	/* differs from the entry above only here */
+	.protocol   = IPPROTO_SCTP,
+	.prot       = &sctp_prot,
+	.ops        = &inet_seqpacket_ops,
+	.no_check   = 0,
+	.flags      = SCTP_PROTOSW_FLAG
+};
+
+/* Receive and error handlers registered with IPv4 for IPPROTO_SCTP. */
+static const struct net_protocol sctp_protocol = {
+	.handler     = sctp_rcv,
+	.err_handler = sctp_v4_err,
+	.no_policy   = 1,
+};
+
+/* IPv4 address-family operations, registered by sctp_v4_pf_init(). */
+static struct sctp_af sctp_af_inet = {
+	.sa_family	   = AF_INET,
+	.sctp_xmit	   = sctp_v4_xmit,
+	.setsockopt	   = ip_setsockopt,
+	.getsockopt	   = ip_getsockopt,
+	.get_dst	   = sctp_v4_get_dst,
+	.get_saddr	   = sctp_v4_get_saddr,
+	.copy_addrlist	   = sctp_v4_copy_addrlist,
+	.from_skb	   = sctp_v4_from_skb,
+	.from_sk	   = sctp_v4_from_sk,
+	.to_sk_saddr	   = sctp_v4_to_sk_saddr,
+	.to_sk_daddr	   = sctp_v4_to_sk_daddr,
+	.from_addr_param   = sctp_v4_from_addr_param,
+	.to_addr_param	   = sctp_v4_to_addr_param,
+	.cmp_addr	   = sctp_v4_cmp_addr,
+	.addr_valid	   = sctp_v4_addr_valid,
+	.inaddr_any	   = sctp_v4_inaddr_any,
+	.is_any		   = sctp_v4_is_any,
+	.available	   = sctp_v4_available,
+	.scope		   = sctp_v4_scope,
+	.skb_iif	   = sctp_v4_skb_iif,
+	.is_ce		   = sctp_v4_is_ce,
+	.seq_dump_addr	   = sctp_v4_seq_dump_addr,
+	.ecn_capable	   = sctp_v4_ecn_capable,
+	.net_header_len	   = sizeof(struct iphdr),
+	.sockaddr_len	   = sizeof(struct sockaddr_in),
+#ifdef CONFIG_COMPAT
+	.compat_setsockopt = compat_ip_setsockopt,
+	.compat_getsockopt = compat_ip_getsockopt,
+#endif
+};
+
+/* Return the PF table stored for @family; NULL if none or unknown. */
+struct sctp_pf *sctp_get_pf_specific(sa_family_t family)
+{
+	if (family == PF_INET)
+		return sctp_pf_inet_specific;
+
+	if (family == PF_INET6)
+		return sctp_pf_inet6_specific;
+
+	return NULL;
+}
+
+/* Register the PF specific function table; returns 1 if stored, else 0. */
+int sctp_register_pf(struct sctp_pf *pf, sa_family_t family)
+{
+	struct sctp_pf **slot;
+
+	if (family == PF_INET)
+		slot = &sctp_pf_inet_specific;
+	else if (family == PF_INET6)
+		slot = &sctp_pf_inet6_specific;
+	else
+		return 0;
+
+	/* Refuse to replace a table that is already registered. */
+	if (*slot)
+		return 0;
+
+	*slot = pf;
+	return 1;
+}
+
+/* Set up the per-cpu SCTP MIB storage; returns snmp_mib_init()'s result. */
+static inline int init_sctp_mibs(void)
+{
+	return snmp_mib_init((void __percpu **)sctp_statistics,
+		sizeof(struct sctp_mib), __alignof__(struct sctp_mib));
+}
+
+static inline void cleanup_sctp_mibs(void)
+{
+	snmp_mib_free((void __percpu **)sctp_statistics); /* undo init_sctp_mibs() */
+}
+
+static void sctp_v4_pf_init(void)
+{
+	/* Register the IPv4 PF and AF tables; return values are ignored. */
+	sctp_register_pf(&sctp_pf_inet, PF_INET);
+	sctp_register_af(&sctp_af_inet);
+}
+
+static void sctp_v4_pf_exit(void)
+{
+	list_del(&sctp_af_inet.list);	/* drop it from sctp_address_families */
+}
+
+/* Register the SCTP proto, then both of its IPv4 socket-switch entries. */
+static int sctp_v4_protosw_init(void)
+{
+	int err = proto_register(&sctp_prot, 1);
+
+	if (err)
+		return err;
+
+	/* Register SCTP(UDP and TCP style) with socket layer.  */
+	inet_register_protosw(&sctp_seqpacket_protosw);
+	inet_register_protosw(&sctp_stream_protosw);
+
+	return 0;
+}
+
+static void sctp_v4_protosw_exit(void)
+{
+	inet_unregister_protosw(&sctp_stream_protosw);	/* reverse of init */
+	inet_unregister_protosw(&sctp_seqpacket_protosw);
+	proto_unregister(&sctp_prot);
+}
+
+/* Register the inetaddr notifier and IPv4 handler; all or nothing. */
+static int sctp_v4_add_protocol(void)
+{
+	register_inetaddr_notifier(&sctp_inetaddr_notifier);
+
+	if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) {
+		unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static void sctp_v4_del_protocol(void)
+{
+	inet_del_protocol(&sctp_protocol, IPPROTO_SCTP);	/* handler first */
+	unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
+}
+
+/* Module init: set defaults, allocate hash tables, register with inet. */
+SCTP_STATIC __init int sctp_init(void)
+{
+	int i;
+	int status = -EINVAL;
+	unsigned long goal;
+	unsigned long limit;
+	int max_share;
+	int order;
+
+	/* SCTP_DEBUG sanity check. */
+	if (!sctp_sanity_check())
+		goto out;
+
+	/* Allocate bind_bucket and chunk caches. */
+	status = -ENOBUFS;
+	sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket",
+					       sizeof(struct sctp_bind_bucket),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL);
+	if (!sctp_bucket_cachep)
+		goto out;
+
+	sctp_chunk_cachep = kmem_cache_create("sctp_chunk",
+					       sizeof(struct sctp_chunk),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL);
+	if (!sctp_chunk_cachep)
+		goto err_chunk_cachep;
+
+	/* Allocate and initialise sctp mibs.  */
+	status = init_sctp_mibs();
+	if (status)
+		goto err_init_mibs;
+
+	/* Initialize proc fs directory.  */
+	status = sctp_proc_init();
+	if (status)
+		goto err_init_proc;
+
+	/* Initialize object count debugging.  */
+	sctp_dbg_objcnt_init();
+
+	/*
+	 * 14. Suggested SCTP Protocol Parameter Values
+	 */
+	/* The following protocol parameters are RECOMMENDED:  */
+	/* RTO.Initial              - 3  seconds */
+	sctp_rto_initial		= SCTP_RTO_INITIAL;
+	/* RTO.Min                  - 1  second */
+	sctp_rto_min	 		= SCTP_RTO_MIN;
+	/* RTO.Max                 -  60 seconds */
+	sctp_rto_max 			= SCTP_RTO_MAX;
+	/* RTO.Alpha                - 1/8 */
+	sctp_rto_alpha	        	= SCTP_RTO_ALPHA;
+	/* RTO.Beta                 - 1/4 */
+	sctp_rto_beta			= SCTP_RTO_BETA;
+
+	/* Valid.Cookie.Life        - 60  seconds */
+	sctp_valid_cookie_life		= SCTP_DEFAULT_COOKIE_LIFE;
+
+	/* Whether Cookie Preservative is enabled(1) or not(0) */
+	sctp_cookie_preserve_enable 	= 1;
+
+	/* Max.Burst		    - 4 */
+	sctp_max_burst 			= SCTP_DEFAULT_MAX_BURST;
+
+	/* Association.Max.Retrans  - 10 attempts
+	 * Path.Max.Retrans         - 5  attempts (per destination address)
+	 * Max.Init.Retransmits     - 8  attempts
+	 */
+	sctp_max_retrans_association 	= 10;
+	sctp_max_retrans_path		= 5;
+	sctp_max_retrans_init		= 8;
+
+	/* Sendbuffer growth	    - do per-socket accounting */
+	sctp_sndbuf_policy		= 0;
+
+	/* Rcvbuffer growth	    - do per-socket accounting */
+	sctp_rcvbuf_policy		= 0;
+
+	/* HB.interval              - 30 seconds */
+	sctp_hb_interval		= SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
+
+	/* delayed SACK timeout */
+	sctp_sack_timeout		= SCTP_DEFAULT_TIMEOUT_SACK;
+
+	/* Implementation specific variables. */
+
+	/* Initialize default stream count setup information. */
+	sctp_max_instreams    		= SCTP_DEFAULT_INSTREAMS;
+	sctp_max_outstreams   		= SCTP_DEFAULT_OUTSTREAMS;
+
+	/* Initialize maximum autoclose timeout. */
+	sctp_max_autoclose		= INT_MAX / HZ;
+
+	/* Initialize handle used for association ids. */
+	idr_init(&sctp_assocs_id);
+
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	sysctl_sctp_mem[0] = limit / 4 * 3;
+	sysctl_sctp_mem[1] = limit;
+	sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2;
+
+	/* Set per-socket limits to no more than 1/128 the pressure threshold */
+	limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7);
+	max_share = min(4UL*1024*1024, limit);
+
+	sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */
+	sysctl_sctp_rmem[1] = (1500 *(sizeof(struct sk_buff) + 1));
+	sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share);
+
+	sysctl_sctp_wmem[0] = SK_MEM_QUANTUM;
+	sysctl_sctp_wmem[1] = 16*1024;
+	sysctl_sctp_wmem[2] = max(64*1024, max_share);
+
+	/* Size and allocate the association hash table.
+	 * The methodology is similar to that of the tcp hash tables.
+	 */
+	if (totalram_pages >= (128 * 1024))
+		goal = totalram_pages >> (22 - PAGE_SHIFT);
+	else
+		goal = totalram_pages >> (24 - PAGE_SHIFT);
+
+	for (order = 0; (1UL << order) < goal; order++)
+		;
+
+	do {
+		sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE /
+					sizeof(struct sctp_hashbucket);
+		if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0)
+			continue;	/* too big: loop test lowers order */
+		sctp_assoc_hashtable = (struct sctp_hashbucket *)
+			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
+	} while (!sctp_assoc_hashtable && --order > 0);
+	if (!sctp_assoc_hashtable) {
+		pr_err("Failed association hash alloc\n");
+		status = -ENOMEM;
+		goto err_ahash_alloc;
+	}
+	for (i = 0; i < sctp_assoc_hashsize; i++) {
+		rwlock_init(&sctp_assoc_hashtable[i].lock);
+		INIT_HLIST_HEAD(&sctp_assoc_hashtable[i].chain);
+	}
+
+	/* Allocate and initialize the endpoint hash table.  */
+	sctp_ep_hashsize = 64;
+	sctp_ep_hashtable = (struct sctp_hashbucket *)
+		kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL);
+	if (!sctp_ep_hashtable) {
+		pr_err("Failed endpoint_hash alloc\n");
+		status = -ENOMEM;
+		goto err_ehash_alloc;
+	}
+	for (i = 0; i < sctp_ep_hashsize; i++) {
+		rwlock_init(&sctp_ep_hashtable[i].lock);
+		INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain);
+	}
+
+	/* Allocate and initialize the SCTP port hash table.  */
+	do {
+		sctp_port_hashsize = (1UL << order) * PAGE_SIZE /
+					sizeof(struct sctp_bind_hashbucket);
+		if ((sctp_port_hashsize > (64 * 1024)) && order > 0)
+			continue;	/* too big: loop test lowers order */
+		sctp_port_hashtable = (struct sctp_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
+	} while (!sctp_port_hashtable && --order > 0);
+	if (!sctp_port_hashtable) {
+		pr_err("Failed bind hash alloc\n");
+		status = -ENOMEM;
+		goto err_bhash_alloc;
+	}
+	for (i = 0; i < sctp_port_hashsize; i++) {
+		spin_lock_init(&sctp_port_hashtable[i].lock);
+		INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
+	}
+
+	pr_info("Hash tables configured (established %d bind %d)\n",
+		sctp_assoc_hashsize, sctp_port_hashsize);
+
+	/* Disable ADDIP by default. */
+	sctp_addip_enable = 0;
+	sctp_addip_noauth = 0;
+
+	/* Enable PR-SCTP by default. */
+	sctp_prsctp_enable = 1;
+
+	/* Disable AUTH by default. */
+	sctp_auth_enable = 0;
+
+	/* Set SCOPE policy to enabled */
+	sctp_scope_policy = SCTP_SCOPE_POLICY_ENABLE;
+
+	/* Set the default rwnd update threshold */
+	sctp_rwnd_upd_shift		= SCTP_DEFAULT_RWND_SHIFT;
+
+	sctp_sysctl_register();
+
+	INIT_LIST_HEAD(&sctp_address_families);
+	sctp_v4_pf_init();
+	sctp_v6_pf_init();
+
+	/* Initialize the local address list. */
+	INIT_LIST_HEAD(&sctp_local_addr_list);
+	spin_lock_init(&sctp_local_addr_lock);
+	sctp_get_local_addr_list();
+
+	status = sctp_v4_protosw_init();
+
+	if (status)
+		goto err_protosw_init;
+
+	status = sctp_v6_protosw_init();
+	if (status)
+		goto err_v6_protosw_init;
+
+	/* Initialize the control inode/socket for handling OOTB packets.  */
+	if ((status = sctp_ctl_sock_init())) {
+		pr_err("Failed to initialize the SCTP control sock\n");
+		goto err_ctl_sock_init;
+	}
+
+	status = sctp_v4_add_protocol();
+	if (status)
+		goto err_add_protocol;
+
+	/* Register SCTP with inet6 layer.  */
+	status = sctp_v6_add_protocol();
+	if (status)
+		goto err_v6_add_protocol;
+
+	status = 0;
+out:
+	return status;
+err_v6_add_protocol:	/* labels below unwind in reverse order of setup */
+	sctp_v4_del_protocol();
+err_add_protocol:
+	inet_ctl_sock_destroy(sctp_ctl_sock);
+err_ctl_sock_init:
+	sctp_v6_protosw_exit();
+err_v6_protosw_init:
+	sctp_v4_protosw_exit();
+err_protosw_init:
+	sctp_free_local_addr_list();
+	sctp_v4_pf_exit();
+	sctp_v6_pf_exit();
+	sctp_sysctl_unregister();
+	free_pages((unsigned long)sctp_port_hashtable,
+		   get_order(sctp_port_hashsize *
+			     sizeof(struct sctp_bind_hashbucket)));
+err_bhash_alloc:
+	kfree(sctp_ep_hashtable);
+err_ehash_alloc:
+	free_pages((unsigned long)sctp_assoc_hashtable,
+		   get_order(sctp_assoc_hashsize *
+			     sizeof(struct sctp_hashbucket)));
+err_ahash_alloc:
+	sctp_dbg_objcnt_exit();
+	sctp_proc_exit();
+err_init_proc:
+	cleanup_sctp_mibs();
+err_init_mibs:
+	kmem_cache_destroy(sctp_chunk_cachep);
+err_chunk_cachep:
+	kmem_cache_destroy(sctp_bucket_cachep);
+	goto out;	/* status still holds the failure code */
+}
+
+/* Exit handler for the SCTP protocol: undo sctp_init()'s registrations. */
+SCTP_STATIC __exit void sctp_exit(void)
+{
+	/* BUG.  This should probably do something useful like clean
+	 * up all the remaining associations and all that memory.
+	 */
+
+	/* Unregister with inet6/inet layers. */
+	sctp_v6_del_protocol();
+	sctp_v4_del_protocol();
+
+	/* Free the control endpoint.  */
+	inet_ctl_sock_destroy(sctp_ctl_sock);
+
+	/* Free protosw registrations */
+	sctp_v6_protosw_exit();
+	sctp_v4_protosw_exit();
+
+	/* Free the local address list.  */
+	sctp_free_local_addr_list();
+
+	/* Remove the address family tables from the global list. */
+	sctp_v6_pf_exit();
+	sctp_v4_pf_exit();
+
+	sctp_sysctl_unregister();
+
+	free_pages((unsigned long)sctp_assoc_hashtable,
+		   get_order(sctp_assoc_hashsize *
+			     sizeof(struct sctp_hashbucket)));
+	kfree(sctp_ep_hashtable);
+	free_pages((unsigned long)sctp_port_hashtable,
+		   get_order(sctp_port_hashsize *
+			     sizeof(struct sctp_bind_hashbucket)));
+
+	sctp_dbg_objcnt_exit();
+	sctp_proc_exit();
+	cleanup_sctp_mibs();
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+	kmem_cache_destroy(sctp_chunk_cachep);
+	kmem_cache_destroy(sctp_bucket_cachep);
+}
+
+module_init(sctp_init);
+module_exit(sctp_exit);
+
+/*
+ * __stringify doesn't like enums, so the IPPROTO_SCTP value (132) is used directly.
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
+MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
+MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
+module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
+MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
+MODULE_LICENSE("GPL");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
new file mode 100644
index 00000000..58eb27fe
--- /dev/null
+++ b/net/sctp/sm_make_chunk.c
@@ -0,0 +1,3421 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2002 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions work with the state functions in sctp_sm_statefuns.c
+ * to implement the state operations.  These functions implement the
+ * steps which require modifying existing data structures.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    C. Robin              <chris@hundredacre.ac.uk>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Xingang Guo           <xingang.guo@intel.com>
+ *    Dajiang Zhang	    <dajiang.zhang@nokia.com>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *    Daisy Chang	    <daisyc@us.ibm.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *    Kevin Gao             <kevin.gao@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+
+#include <linux/skbuff.h>
+#include <linux/random.h>	/* for get_random_bytes */
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+SCTP_STATIC
+struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc,
+				   __u8 type, __u8 flags, int paylen);
+static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const struct sctp_chunk *init_chunk,
+					int *cookie_len,
+					const __u8 *raw_addrs, int addrs_len);
+static int sctp_process_param(struct sctp_association *asoc,
+			      union sctp_params param,
+			      const union sctp_addr *peer_addr,
+			      gfp_t gfp);
+static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
+			      const void *data);
+
+/* Report the inbound interface of the skb that carried this chunk. */
+int sctp_chunk_iif(const struct sctp_chunk *chunk)
+{
+	struct sk_buff *skb = chunk->skb;
+	struct sctp_af *af;
+
+	af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version));
+	if (!af)
+		return 0;	/* no handler for this IP version */
+
+	return af->skb_iif(skb);
+}
+
+/* Header-only parameters: each length covers just the parameter header.
+ *
+ * RFC 2960 3.3.2 Initiation (INIT) (1), Note 2: The ECN capable field
+ * is reserved for future use of Explicit Congestion Notification.
+ */
+static const struct sctp_paramhdr ecap_param = {
+	.type	= SCTP_PARAM_ECN_CAPABLE,
+	.length	= cpu_to_be16(sizeof(struct sctp_paramhdr)),
+};
+static const struct sctp_paramhdr prsctp_param = {
+	.type	= SCTP_PARAM_FWD_TSN_SUPPORT,
+	.length	= cpu_to_be16(sizeof(struct sctp_paramhdr)),
+};
+
+/* Start an error cause inside 'chunk': append a cause header carrying
+ * 'cause_code' and a length that already accounts for 'paylen' bytes of
+ * cause data.  Only the header is written; the data is not added here.
+ */
+void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
+		     size_t paylen)
+{
+	sctp_errhdr_t hdr;
+	__u16 total = sizeof(hdr) + paylen;
+
+	/* cause_code is already in network byte order; the length is not. */
+	hdr.cause = cause_code;
+	hdr.length = htons(total);
+
+	chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(hdr), &hdr);
+}
+
+/* Variant of sctp_init_cause() that does not oops on a full chunk: if
+ * the skb lacks tailroom for the cause header plus 'paylen' bytes,
+ * nothing is written and -ENOSPC is returned.  Otherwise the header is
+ * appended via sctp_addto_chunk_fixed() and 0 is returned.
+ */
+int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
+		      size_t paylen)
+{
+	sctp_errhdr_t hdr;
+	__u16 total = sizeof(hdr) + paylen;
+
+	/* Refuse up front unless header and payload both fit. */
+	if (skb_tailroom(chunk->skb) < total)
+		return -ENOSPC;
+
+	/* cause_code arrives in network byte order already. */
+	hdr.cause = cause_code;
+	hdr.length = htons(total);
+	chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(hdr),
+						     &hdr);
+
+	return 0;
+}
+/* 3.3.2 Initiation (INIT) (1)
+ *
+ * This chunk is used to initiate a SCTP association between two
+ * endpoints. The format of the INIT chunk is shown below:
+ *
+ *     0                   1                   2                   3
+ *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    |   Type = 1    |  Chunk Flags  |      Chunk Length             |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    |                         Initiate Tag                          |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    |           Advertised Receiver Window Credit (a_rwnd)          |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    |  Number of Outbound Streams   |  Number of Inbound Streams    |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    |                          Initial TSN                          |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    \                                                               \
+ *    /              Optional/Variable-Length Parameters              /
+ *    \                                                               \
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *
+ * The INIT chunk contains the following parameters. Unless otherwise
+ * noted, each parameter MUST only be included once in the INIT chunk.
+ *
+ * Fixed Parameters                     Status
+ * ----------------------------------------------
+ * Initiate Tag                        Mandatory
+ * Advertised Receiver Window Credit   Mandatory
+ * Number of Outbound Streams          Mandatory
+ * Number of Inbound Streams           Mandatory
+ * Initial TSN                         Mandatory
+ *
+ * Variable Parameters                  Status     Type Value
+ * -------------------------------------------------------------
+ * IPv4 Address (Note 1)               Optional    5
+ * IPv6 Address (Note 1)               Optional    6
+ * Cookie Preservative                 Optional    9
+ * Reserved for ECN Capable (Note 2)   Optional    32768 (0x8000)
+ * Host Name Address (Note 3)          Optional    11
+ * Supported Address Types (Note 4)    Optional    12
+ */
+struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
+			     const struct sctp_bind_addr *bp,
+			     gfp_t gfp, int vparam_len)
+{
+	sctp_inithdr_t init;
+	union sctp_params addrs;
+	size_t chunksize;
+	struct sctp_chunk *retval = NULL;
+	int num_types, addrs_len = 0;
+	struct sctp_sock *sp;
+	sctp_supported_addrs_param_t sat;
+	__be16 types[2];
+	sctp_adaptation_ind_param_t aiparam;
+	sctp_supported_ext_param_t ext_param;
+	int num_ext = 0;
+	__u8 extensions[3];	/* ASCONF, ASCONF-ACK and AUTH at most */
+	sctp_paramhdr_t *auth_chunks = NULL,
+			*auth_hmacs = NULL;
+
+	/* RFC 2960 3.3.2 Initiation (INIT) (1)
+	 *
+	 * Note 1: The INIT chunks can contain multiple addresses that
+	 * can be IPv4 and/or IPv6 in any combination.
+	 */
+	retval = NULL;
+
+	/* Convert the provided bind address list to raw format. */
+	addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp);
+
+	init.init_tag		   = htonl(asoc->c.my_vtag);
+	init.a_rwnd		   = htonl(asoc->rwnd);
+	init.num_outbound_streams  = htons(asoc->c.sinit_num_ostreams);
+	init.num_inbound_streams   = htons(asoc->c.sinit_max_instreams);
+	init.initial_tsn	   = htonl(asoc->c.initial_tsn);
+
+	/* How many address types are needed? */
+	sp = sctp_sk(asoc->base.sk);
+	num_types = sp->pf->supported_addrs(sp, types);
+
+	chunksize = sizeof(init) + addrs_len;
+	chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
+	chunksize += sizeof(ecap_param);
+
+	if (sctp_prsctp_enable)
+		chunksize += sizeof(prsctp_param);
+
+	/* ADDIP: Section 4.2.7:
+	 *  An implementation supporting this extension [ADDIP] MUST list
+	 *  the ASCONF,the ASCONF-ACK, and the AUTH  chunks in its INIT and
+	 *  INIT-ACK parameters.
+	 */
+	if (sctp_addip_enable) {
+		extensions[num_ext] = SCTP_CID_ASCONF;
+		extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
+		num_ext += 2;
+	}
+
+	if (sp->adaptation_ind)
+		chunksize += sizeof(aiparam);
+
+	chunksize += vparam_len;
+
+	/* Account for AUTH related parameters */
+	if (sctp_auth_enable) {
+		/* Add random parameter length */
+		chunksize += sizeof(asoc->c.auth_random);
+
+		/* Add HMACS parameter length if any were defined */
+		auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
+		if (auth_hmacs->length)
+			chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+		else
+			auth_hmacs = NULL;
+
+		/* Add CHUNKS parameter length */
+		auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
+		if (auth_chunks->length)
+			chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+		else
+			auth_chunks = NULL;
+
+		extensions[num_ext] = SCTP_CID_AUTH;
+		num_ext += 1;
+	}
+
+	/* If we have any extensions to report, account for that */
+	if (num_ext)
+		chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
+					num_ext);
+
+	/* RFC 2960 3.3.2 Initiation (INIT) (1)
+	 *
+	 * Note 3: An INIT chunk MUST NOT contain more than one Host
+	 * Name address parameter. Moreover, the sender of the INIT
+	 * MUST NOT combine any other address types with the Host Name
+	 * address in the INIT. The receiver of INIT MUST ignore any
+	 * other address types if the Host Name address parameter is
+	 * present in the received INIT chunk.
+	 *
+	 * PLEASE DO NOT FIXME [This version does not support Host Name.]
+	 */
+
+	retval = sctp_make_chunk(asoc, SCTP_CID_INIT, 0, chunksize);
+	if (!retval)
+		goto nodata;
+
+	retval->subh.init_hdr =
+		sctp_addto_chunk(retval, sizeof(init), &init);
+	retval->param_hdr.v =
+		sctp_addto_chunk(retval, addrs_len, addrs.v);
+
+	/* RFC 2960 3.3.2 Initiation (INIT) (1)
+	 *
+	 * Note 4: This parameter, when present, specifies all the
+	 * address types the sending endpoint can support. The absence
+	 * of this parameter indicates that the sending endpoint can
+	 * support any address type.
+	 */
+	sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES;
+	sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types));
+	sctp_addto_chunk(retval, sizeof(sat), &sat);
+	sctp_addto_chunk(retval, num_types * sizeof(__u16), &types);
+
+	sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
+
+	/* Add the supported extensions parameter.  Be nice and add this
+	 * first, before adding the parameters for the extensions themselves.
+	 */
+	if (num_ext) {
+		ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
+		ext_param.param_hdr.length =
+			    htons(sizeof(sctp_supported_ext_param_t) + num_ext);
+		sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t),
+				&ext_param);
+		sctp_addto_param(retval, num_ext, extensions);
+	}
+
+	if (sctp_prsctp_enable)
+		sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
+
+	if (sp->adaptation_ind) {
+		aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+		aiparam.param_hdr.length = htons(sizeof(aiparam));
+		aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+		sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+	}
+
+	/* Add SCTP-AUTH chunks to the parameter list */
+	if (sctp_auth_enable) {
+		sctp_addto_chunk(retval, sizeof(asoc->c.auth_random),
+				 asoc->c.auth_random);
+		if (auth_hmacs)
+			sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
+					auth_hmacs);
+		if (auth_chunks)
+			sctp_addto_chunk(retval, ntohs(auth_chunks->length),
+					auth_chunks);
+	}
+nodata:
+	kfree(addrs.v);	/* raw address buffer from sctp_bind_addrs_to_raw() */
+	return retval;
+}
+
+struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
+				 const struct sctp_chunk *chunk,
+				 gfp_t gfp, int unkparam_len)
+{
+	sctp_inithdr_t initack;
+	struct sctp_chunk *retval;
+	union sctp_params addrs;
+	struct sctp_sock *sp;
+	int addrs_len;
+	sctp_cookie_param_t *cookie;
+	int cookie_len;
+	size_t chunksize;
+	sctp_adaptation_ind_param_t aiparam;
+	sctp_supported_ext_param_t ext_param;
+	int num_ext = 0;
+	__u8 extensions[3];	/* ASCONF, ASCONF-ACK and AUTH at most */
+	sctp_paramhdr_t *auth_chunks = NULL,
+			*auth_hmacs = NULL,
+			*auth_random = NULL;
+
+	retval = NULL;
+
+	/* Note: there may be no addresses to embed. */
+	addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp);
+
+	initack.init_tag	        = htonl(asoc->c.my_vtag);
+	initack.a_rwnd			= htonl(asoc->rwnd);
+	initack.num_outbound_streams	= htons(asoc->c.sinit_num_ostreams);
+	initack.num_inbound_streams	= htons(asoc->c.sinit_max_instreams);
+	initack.initial_tsn		= htonl(asoc->c.initial_tsn);
+
+	/* FIXME:  We really ought to build the cookie right
+	 * into the packet instead of allocating more fresh memory.
+	 */
+	cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len,
+				  addrs.v, addrs_len);
+	if (!cookie)
+		goto nomem_cookie;
+
+	/* Calculate the total size of allocation, include the reserved
+	 * space for reporting unknown parameters if it is specified.
+	 */
+	sp = sctp_sk(asoc->base.sk);
+	chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;
+
+	/* Tell peer that we'll do ECN only if peer advertised such cap.  */
+	if (asoc->peer.ecn_capable)
+		chunksize += sizeof(ecap_param);
+
+	if (asoc->peer.prsctp_capable)
+		chunksize += sizeof(prsctp_param);
+
+	if (asoc->peer.asconf_capable) {
+		extensions[num_ext] = SCTP_CID_ASCONF;
+		extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
+		num_ext += 2;
+	}
+
+	if (sp->adaptation_ind)
+		chunksize += sizeof(aiparam);
+
+	if (asoc->peer.auth_capable) {
+		auth_random = (sctp_paramhdr_t *)asoc->c.auth_random;
+		chunksize += ntohs(auth_random->length);
+
+		auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
+		if (auth_hmacs->length)
+			chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
+		else
+			auth_hmacs = NULL;
+
+		auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
+		if (auth_chunks->length)
+			chunksize += WORD_ROUND(ntohs(auth_chunks->length));
+		else
+			auth_chunks = NULL;
+
+		extensions[num_ext] = SCTP_CID_AUTH;
+		num_ext += 1;
+	}
+
+	if (num_ext)
+		chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
+					num_ext);
+
+	/* Now allocate and fill out the chunk.  */
+	retval = sctp_make_chunk(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
+	if (!retval)
+		goto nomem_chunk;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk
+	 * to which it is replying.
+	 *
+	 * [INIT ACK back to where the INIT came from.]
+	 */
+	retval->transport = chunk->transport;
+
+	retval->subh.init_hdr =
+		sctp_addto_chunk(retval, sizeof(initack), &initack);
+	retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v);
+	sctp_addto_chunk(retval, cookie_len, cookie);
+	if (asoc->peer.ecn_capable)
+		sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
+	if (num_ext) {
+		ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
+		ext_param.param_hdr.length =
+			    htons(sizeof(sctp_supported_ext_param_t) + num_ext);
+		sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t),
+				 &ext_param);
+		sctp_addto_param(retval, num_ext, extensions);
+	}
+	if (asoc->peer.prsctp_capable)
+		sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
+
+	if (sp->adaptation_ind) {
+		aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
+		aiparam.param_hdr.length = htons(sizeof(aiparam));
+		aiparam.adaptation_ind = htonl(sp->adaptation_ind);
+		sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
+	}
+
+	if (asoc->peer.auth_capable) {
+		sctp_addto_chunk(retval, ntohs(auth_random->length),
+				 auth_random);
+		if (auth_hmacs)
+			sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
+					auth_hmacs);
+		if (auth_chunks)
+			sctp_addto_chunk(retval, ntohs(auth_chunks->length),
+					auth_chunks);
+	}
+
+	/* We need to remove the const qualifier at this point.  */
+	retval->asoc = (struct sctp_association *) asoc;
+
+nomem_chunk:
+	kfree(cookie);	/* temporary buffer from sctp_pack_cookie() */
+nomem_cookie:
+	kfree(addrs.v);	/* raw address buffer from sctp_bind_addrs_to_raw() */
+	return retval;
+}
+
+/* 3.3.11 Cookie Echo (COOKIE ECHO) (10):
+ *
+ * This chunk is used only during the initialization of an association.
+ * It is sent by the initiator of an association to its peer to complete
+ * the initialization process. This chunk MUST precede any DATA chunk
+ * sent within the association, but MAY be bundled with one or more DATA
+ * chunks in the same packet.
+ *
+ *      0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |   Type = 10   |Chunk  Flags   |         Length                |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     /                     Cookie                                    /
+ *     \                                                               \
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Chunk Flags: 8 bit
+ *
+ *   Set to zero on transmit and ignored on receipt.
+ *
+ * Length: 16 bits (unsigned integer)
+ *
+ *   Set to the size of the chunk in bytes, including the 4 bytes of
+ *   the chunk header and the size of the Cookie.
+ *
+ * Cookie: variable size
+ *
+ *   This field must contain the exact cookie received in the
+ *   State Cookie parameter from the previous INIT ACK.
+ *
+ *   An implementation SHOULD make the cookie as small as possible
+ *   to insure interoperability.
+ */
+struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
+				    const struct sctp_chunk *chunk)
+{
+	struct sctp_chunk *echo;
+	void *cookie = asoc->peer.cookie;
+	int cookie_len = asoc->peer.cookie_len;
+
+	/* The chunk body is the cookie kept for the peer on the
+	 * association, copied as-is.
+	 */
+	echo = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
+	if (!echo)
+		return NULL;
+
+	echo->subh.cookie_hdr =
+		sctp_addto_chunk(echo, cookie_len, cookie);
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk
+	 * to which it is replying.
+	 *
+	 * So when the triggering chunk is known, the COOKIE ECHO goes
+	 * back over the transport the INIT ACK arrived on.
+	 */
+	if (chunk)
+		echo->transport = chunk->transport;
+
+	return echo;
+}
+
+/* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11):
+ *
+ * This chunk is used only during the initialization of an
+ * association.  It is used to acknowledge the receipt of a COOKIE
+ * ECHO chunk.  This chunk MUST precede any DATA or SACK chunk sent
+ * within the association, but MAY be bundled with one or more DATA
+ * chunks or SACK chunk in the same SCTP packet.
+ *
+ *      0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |   Type = 11   |Chunk  Flags   |     Length = 4                |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Chunk Flags: 8 bits
+ *
+ *   Set to zero on transmit and ignored on receipt.
+ */
+struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
+				   const struct sctp_chunk *chunk)
+{
+	struct sctp_chunk *ack;
+
+	ack = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
+	if (!ack)
+		return NULL;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * Reply chunks SHOULD go to the destination transport address
+	 * from which the chunk being answered was received.
+	 *
+	 * [COOKIE ACK back to where the COOKIE ECHO came from.]
+	 */
+	if (chunk)
+		ack->transport = chunk->transport;
+
+	return ack;
+}
+
+/*
+ *  Appendix A: Explicit Congestion Notification:
+ *  CWR:
+ *
+ *  RFC 2481 details a specific bit for a sender to send in the header of
+ *  its next outbound TCP segment to indicate to its peer that it has
+ *  reduced its congestion window.  This is termed the CWR bit.  For
+ *  SCTP the same indication is made by including the CWR chunk.
+ *  This chunk contains one data element, i.e. the TSN number that
+ *  was sent in the ECNE chunk.  This element represents the lowest
+ *  TSN number in the datagram that was originally marked with the
+ *  CE bit.
+ *
+ *     0                   1                   2                   3
+ *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    | Chunk Type=13 | Flags=00000000|    Chunk Length = 8           |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *    |                      Lowest TSN Number                        |
+ *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *     Note: The CWR is considered a Control chunk.
+ */
+struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
+			    const __u32 lowest_tsn,
+			    const struct sctp_chunk *chunk)
+{
+	struct sctp_chunk *cwr_chunk;
+	sctp_cwrhdr_t hdr;
+
+	cwr_chunk = sctp_make_chunk(asoc, SCTP_CID_ECN_CWR, 0,
+				    sizeof(hdr));
+	if (!cwr_chunk)
+		return NULL;
+
+	/* The only payload is the lowest TSN, in network byte order. */
+	hdr.lowest_tsn = htonl(lowest_tsn);
+	cwr_chunk->subh.ecn_cwr_hdr =
+		sctp_addto_chunk(cwr_chunk, sizeof(hdr), &hdr);
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk
+	 * to which it is replying.
+	 *
+	 * [Report a reduced congestion window back to where the ECNE
+	 * came from; without a triggering chunk the transport is left
+	 * as sctp_make_chunk() set it.]
+	 */
+	if (chunk)
+		cwr_chunk->transport = chunk->transport;
+
+	return cwr_chunk;
+}
+
+/* Build an ECNE chunk: a congestion experienced report for 'lowest_tsn'. */
+struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
+			     const __u32 lowest_tsn)
+{
+	struct sctp_chunk *ecne_chunk;
+	sctp_ecnehdr_t hdr;
+
+	ecne_chunk = sctp_make_chunk(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(hdr));
+	if (!ecne_chunk)
+		return NULL;
+
+	/* The TSN goes into the header in network byte order. */
+	hdr.lowest_tsn = htonl(lowest_tsn);
+	ecne_chunk->subh.ecne_hdr =
+		sctp_addto_chunk(ecne_chunk, sizeof(hdr), &hdr);
+
+	return ecne_chunk;
+}
+
+/* Build a DATA chunk for 'asoc' holding only the DATA header; room for
+ * 'data_len' payload bytes is requested but no payload is copied in.
+ */
+struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
+				       const struct sctp_sndrcvinfo *sinfo,
+				       int data_len, __u8 flags, __u16 ssn)
+{
+	struct sctp_chunk *data;
+	struct sctp_datahdr hdr;
+
+	/* The TSN is assigned as LATE as possible, so it is left at
+	 * zero while the chunk is being created.
+	 */
+	hdr.tsn = 0;
+	hdr.stream = htons(sinfo->sinfo_stream);
+	hdr.ppid = sinfo->sinfo_ppid;
+
+	/* An unordered send gets the unordered flag and a zero SSN. */
+	if (sinfo->sinfo_flags & SCTP_UNORDERED) {
+		flags |= SCTP_DATA_UNORDERED;
+		hdr.ssn = 0;
+	} else {
+		hdr.ssn = htons(ssn);
+	}
+
+	data = sctp_make_chunk(asoc, SCTP_CID_DATA, flags,
+			       sizeof(hdr) + data_len);
+	if (!data)
+		return NULL;
+
+	data->subh.data_hdr = sctp_addto_chunk(data, sizeof(hdr), &hdr);
+	/* Keep a copy of the caller's send info with the chunk. */
+	memcpy(&data->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
+
+	return data;
+}
+
+/* Build a selective acknowledgement (SACK) for the given association.
+ * It reports the cumulative TSN seen so far together with any gap ack
+ * blocks and duplicate TSNs recorded in the peer's TSN map.
+ */
+struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
+{
+	struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
+	struct sctp_chunk *sack_chunk;
+	struct sctp_tsnmap *map;
+	struct sctp_sackhdr hdr;
+	__u16 gap_count, dup_count;
+	__u32 ctsn;
+	int len;
+
+	/* The map is only reachable through a const asoc; drop the const. */
+	map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
+
+	memset(gabs, 0, sizeof(gabs));
+	ctsn = sctp_tsnmap_get_ctsn(map);
+	SCTP_DEBUG_PRINTK("sackCTSNAck sent:  0x%x.\n", ctsn);
+
+	/* Gather gap blocks and the duplicate count to size the chunk. */
+	gap_count = sctp_tsnmap_num_gabs(map, gabs);
+	dup_count = sctp_tsnmap_num_dups(map);
+	len = sizeof(hdr)
+		+ sizeof(struct sctp_gap_ack_block) * gap_count
+		+ sizeof(__u32) * dup_count;
+
+	/* Create the chunk.  */
+	sack_chunk = sctp_make_chunk(asoc, SCTP_CID_SACK, 0, len);
+	if (!sack_chunk)
+		return NULL;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk to
+	 * which it is replying.  This rule should also be followed if
+	 * the endpoint is bundling DATA chunks together with the
+	 * reply chunk.
+	 *
+	 * However, when acknowledging multiple DATA chunks received
+	 * in packets from different source addresses in a single
+	 * SACK, the SACK chunk may be transmitted to one of the
+	 * destination transport addresses from which the DATA or
+	 * control chunks being acknowledged were received.
+	 *
+	 * [BUG:  We do not implement the following paragraph.
+	 * Perhaps we should remember the last transport we used for a
+	 * SACK and avoid that (if possible) if we have seen any
+	 * duplicates. --piggy]
+	 *
+	 * When a receiver of a duplicate DATA chunk sends a SACK to a
+	 * multi-homed endpoint it MAY be beneficial to vary the
+	 * destination address and not use the source address of the
+	 * DATA chunk.  The reason being that receiving a duplicate
+	 * from a multi-homed endpoint might indicate that the return
+	 * path (as specified in the source address of the DATA chunk)
+	 * for the SACK is broken.
+	 *
+	 * [Send to the address from which we last received a DATA chunk.]
+	 */
+	sack_chunk->transport = asoc->peer.last_data_from;
+
+	/* Fixed SACK header first, every field in network byte order. */
+	hdr.cum_tsn_ack = htonl(ctsn);
+	hdr.a_rwnd = htonl(asoc->a_rwnd);
+	hdr.num_gap_ack_blocks = htons(gap_count);
+	hdr.num_dup_tsns = htons(dup_count);
+	sack_chunk->subh.sack_hdr =
+		sctp_addto_chunk(sack_chunk, sizeof(hdr), &hdr);
+
+	/* Then the gap ack blocks, if there are any ... */
+	if (gap_count)
+		sctp_addto_chunk(sack_chunk, sizeof(__u32) * gap_count,
+				 gabs);
+
+	/* ... followed by the duplicate TSNs, if there are any. */
+	if (dup_count)
+		sctp_addto_chunk(sack_chunk, sizeof(__u32) * dup_count,
+				 sctp_tsnmap_get_dups(map));
+
+	return sack_chunk;
+}
+
+/* Build a SHUTDOWN chunk carrying the peer TSN map's cumulative TSN. */
+struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
+				      const struct sctp_chunk *chunk)
+{
+	struct sctp_chunk *shutdown;
+	sctp_shutdownhdr_t hdr;
+	__u32 ctsn;
+
+	ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
+
+	shutdown = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(hdr));
+	if (!shutdown)
+		return NULL;
+
+	hdr.cum_tsn_ack = htonl(ctsn);
+	shutdown->subh.shutdown_hdr =
+		sctp_addto_chunk(shutdown, sizeof(hdr), &hdr);
+
+	/* Use the triggering chunk's transport when one was supplied. */
+	if (chunk)
+		shutdown->transport = chunk->transport;
+
+	return shutdown;
+}
+
+struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
+				     const struct sctp_chunk *chunk)
+{
+	struct sctp_chunk *ack;
+
+	ack = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
+	if (!ack)
+		return NULL;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * Reply chunks SHOULD go to the destination transport address
+	 * from which the chunk being answered was received.
+	 *
+	 * [ACK back to where the SHUTDOWN came from.]
+	 */
+	if (chunk)
+		ack->transport = chunk->transport;
+
+	return ack;
+}
+
+struct sctp_chunk *sctp_make_shutdown_complete(
+	const struct sctp_association *asoc,
+	const struct sctp_chunk *chunk)
+{
+	struct sctp_chunk *complete;
+	__u8 flags;
+
+	/* With no association the vtag will be reflected, which is
+	 * what the T-bit signals.
+	 */
+	flags = asoc ? 0 : SCTP_CHUNK_FLAG_T;
+
+	complete = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
+	if (!complete)
+		return NULL;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * Reply chunks SHOULD go to the destination transport address
+	 * from which the chunk being answered was received.
+	 *
+	 * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK
+	 * came from.]
+	 */
+	if (chunk)
+		complete->transport = chunk->transport;
+
+	return complete;
+}
+
+/* Build an ABORT sized for 'hint' payload bytes.  With no association the
+ * T bit is set, except when responding to an INIT (sctpimpguide 2.41).
+ */
+struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
+			      const struct sctp_chunk *chunk,
+			      const size_t hint)
+{
+	struct sctp_chunk *abort_chunk;
+	__u8 flags = 0;
+
+	/* The T-bit (vtag will be reflected) is wanted only when there
+	 * is no association and 'chunk' is not an INIT.
+	 */
+	if (!asoc) {
+		const int answers_init = chunk && chunk->chunk_hdr &&
+			chunk->chunk_hdr->type == SCTP_CID_INIT;
+
+		if (!answers_init)
+			flags = SCTP_CHUNK_FLAG_T;
+	}
+
+	abort_chunk = sctp_make_chunk(asoc, SCTP_CID_ABORT, flags, hint);
+	if (!abort_chunk)
+		return NULL;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * Reply chunks SHOULD go to the destination transport address
+	 * from which the chunk being answered was received.
+	 *
+	 * [ABORT back to where the offender came from.]
+	 */
+	if (chunk)
+		abort_chunk->transport = chunk->transport;
+
+	return abort_chunk;
+}
+
+/* Build an ABORT whose NO_USER_DATA error cause carries 'tsn'. */
+struct sctp_chunk *sctp_make_abort_no_data(
+	const struct sctp_association *asoc,
+	const struct sctp_chunk *chunk, __u32 tsn)
+{
+	struct sctp_chunk *abort_chunk;
+	__be32 net_tsn;
+
+	abort_chunk = sctp_make_abort(asoc, chunk,
+				      sizeof(sctp_errhdr_t) + sizeof(tsn));
+	if (!abort_chunk)
+		return NULL;
+
+	/* The cause data is the TSN, put back into network byte order. */
+	net_tsn = htonl(tsn);
+	sctp_init_cause(abort_chunk, SCTP_ERROR_NO_DATA, sizeof(net_tsn));
+	sctp_addto_chunk(abort_chunk, sizeof(net_tsn),
+			 (const void *)&net_tsn);
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk
+	 * to which it is replying.
+	 *
+	 * [ABORT back to where the offender came from.  sctp_make_abort()
+	 * was handed the same chunk, so this repeats its assignment.]
+	 */
+	if (chunk)
+		abort_chunk->transport = chunk->transport;
+
+	return abort_chunk;
+}
+
+/* Build an ABORT with a SCTP_ERROR_USER_ABORT cause taken from 'msg'. */
+struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
+					const struct msghdr *msg,
+					size_t paylen)
+{
+	struct sctp_chunk *abort_chunk;
+	void *reason = NULL;
+	int err;
+
+	abort_chunk = sctp_make_abort(asoc, NULL,
+				      sizeof(sctp_errhdr_t) + paylen);
+	if (!abort_chunk)
+		return NULL;
+
+	if (paylen) {
+		/* Gather the msg_iov pieces into one contiguous buffer. */
+		reason = kmalloc(paylen, GFP_KERNEL);
+		if (!reason)
+			goto drop_chunk;
+
+		err = memcpy_fromiovec(reason, msg->msg_iov, paylen);
+		if (err < 0)
+			goto drop_chunk;
+	}
+
+	sctp_init_cause(abort_chunk, SCTP_ERROR_USER_ABORT, paylen);
+	sctp_addto_chunk(abort_chunk, paylen, reason);
+
+	kfree(reason);
+
+	return abort_chunk;
+
+drop_chunk:
+	/* Give back the partially built state; kfree(NULL) is a no-op,
+	 * so this also covers a failed kmalloc().
+	 */
+	kfree(reason);
+	sctp_chunk_free(abort_chunk);
+	return NULL;
+}
+
+/* Append 'len' bytes to the tail of the chunk without first padding it to
+ * a word boundary (contrast sctp_addto_chunk()).  A NULL 'data' zero-fills
+ * the new bytes instead of copying.  Will panic if chunk is not big
+ * enough.
+ */
+static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
+			      const void *data)
+{
+	const int old_len = ntohs(chunk->chunk_hdr->length);
+	void *dst = skb_put(chunk->skb, len);
+
+	if (!data)
+		memset(dst, 0, len);
+	else
+		memcpy(dst, data, len);
+
+	/* Grow the header's length field and move the end marker.  */
+	chunk->chunk_hdr->length = htons(old_len + len);
+	chunk->chunk_end = skb_tail_pointer(chunk->skb);
+
+	return dst;
+}
+
+/* Make an ABORT chunk with a PROTOCOL VIOLATION cause code: 'payload'
+ * followed by a parameter header echoing the type and length of 'chunk'.
+ */
+struct sctp_chunk *sctp_make_abort_violation(
+	const struct sctp_association *asoc,
+	const struct sctp_chunk *chunk,
+	const __u8   *payload,
+	const size_t paylen)
+{
+	const size_t cause_len = paylen + sizeof(sctp_paramhdr_t);
+	struct sctp_chunk *abort_chunk;
+	struct sctp_paramhdr echo;
+
+	abort_chunk = sctp_make_abort(asoc, chunk,
+				      sizeof(sctp_errhdr_t) + cause_len);
+	if (!abort_chunk)
+		return NULL;
+
+	sctp_init_cause(abort_chunk, SCTP_ERROR_PROTO_VIOLATION, cause_len);
+	sctp_addto_chunk(abort_chunk, paylen, payload);
+
+	echo.type = htons(chunk->chunk_hdr->type);
+	echo.length = chunk->chunk_hdr->length;
+	sctp_addto_param(abort_chunk, sizeof(sctp_paramhdr_t), &echo);
+	return abort_chunk;
+}
+
+/* Make an ABORT chunk with a PROTOCOL VIOLATION cause that holds a fixed
+ * explanatory string (NUL included) followed by the header of 'param'.
+ */
+struct sctp_chunk *sctp_make_violation_paramlen(
+	const struct sctp_association *asoc,
+	const struct sctp_chunk *chunk,
+	struct sctp_paramhdr *param)
+{
+	static const char error[] = "The following parameter had invalid length:";
+	const size_t cause_len = sizeof(error) + sizeof(sctp_paramhdr_t);
+	struct sctp_chunk *abort_chunk;
+
+	abort_chunk = sctp_make_abort(asoc, chunk,
+				      cause_len + sizeof(sctp_errhdr_t));
+	if (!abort_chunk)
+		return NULL;
+
+	sctp_init_cause(abort_chunk, SCTP_ERROR_PROTO_VIOLATION, cause_len);
+	sctp_addto_chunk(abort_chunk, sizeof(error), error);
+	sctp_addto_param(abort_chunk, sizeof(sctp_paramhdr_t), param);
+	return abort_chunk;
+}
+
+/* Make a HEARTBEAT chunk for 'transport'.  The sender-specific info records
+ * the transport's address, the current jiffies and the transport's nonce.
+ */
+struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
+				  const struct sctp_transport *transport)
+{
+	struct sctp_chunk *hb;
+	sctp_sender_hb_info_t info;
+
+	hb = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(info));
+	if (!hb)
+		return NULL;
+
+	/* Cast away the 'const', as this is just telling the chunk
+	 * what transport it belongs to.
+	 */
+	hb->transport = (struct sctp_transport *) transport;
+
+	info.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO;
+	info.param_hdr.length = htons(sizeof(sctp_sender_hb_info_t));
+	info.daddr = transport->ipaddr;
+	info.sent_at = jiffies;
+	info.hb_nonce = transport->hb_nonce;
+
+	hb->subh.hbs_hdr = sctp_addto_chunk(hb, sizeof(info), &info);
+
+	return hb;
+}
+
+/* Make a HEARTBEAT ACK chunk that carries 'paylen' bytes of 'payload'. */
+struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
+				      const struct sctp_chunk *chunk,
+				      const void *payload, const size_t paylen)
+{
+	struct sctp_chunk *hb_ack;
+
+	hb_ack = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
+	if (!hb_ack)
+		return NULL;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk
+	 * to which it is replying.
+	 *
+	 * [HBACK back to where the HEARTBEAT came from.]
+	 */
+	if (chunk)
+		hb_ack->transport = chunk->transport;
+
+	hb_ack->subh.hbs_hdr = sctp_addto_chunk(hb_ack, paylen, payload);
+
+	return hb_ack;
+}
+
+/* Create an Operation Error chunk with the specified space reserved.
+ * This routine can be used for containing multiple causes in the chunk.
+ */
+static struct sctp_chunk *sctp_make_op_error_space(
+	const struct sctp_association *asoc,
+	const struct sctp_chunk *chunk,
+	size_t size)
+{
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_chunk(asoc, SCTP_CID_ERROR, 0,
+				 sizeof(sctp_errhdr_t) + size);
+	if (!retval)
+		goto nodata;
+
+	/* RFC 2960 6.4 Multi-homed SCTP Endpoints
+	 *
+	 * An endpoint SHOULD transmit reply chunks (e.g., SACK,
+	 * HEARTBEAT ACK, etc.) to the same destination transport
+	 * address from which it received the DATA or control chunk
+	 * to which it is replying.
+	 *
+	 */
+	if (chunk)
+		retval->transport = chunk->transport;
+
+nodata:
+	return retval;
+}
+
+/* Create an Operation Error chunk of a fixed size,
+ * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
+ * This is a helper function to allocate an error chunk for
+ * for those invalid parameter codes in which we may not want
+ * to report all the errors, if the incomming chunk is large
+ */
+static inline struct sctp_chunk *sctp_make_op_error_fixed(
+	const struct sctp_association *asoc,
+	const struct sctp_chunk *chunk)
+{
+	size_t size = asoc ? asoc->pathmtu : 0;
+
+	if (!size)
+		size = SCTP_DEFAULT_MAXSEGMENT;
+
+	return sctp_make_op_error_space(asoc, chunk, size);
+}
+
+/* Create an Operation Error chunk.  */
+struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
+				 const struct sctp_chunk *chunk,
+				 __be16 cause_code, const void *payload,
+				 size_t paylen, size_t reserve_tail)
+{
+	struct sctp_chunk *retval;
+
+	retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail);
+	if (!retval)
+		goto nodata;
+
+	sctp_init_cause(retval, cause_code, paylen + reserve_tail);
+	sctp_addto_chunk(retval, paylen, payload);
+	if (reserve_tail)
+		sctp_addto_param(retval, reserve_tail, NULL);
+
+nodata:
+	return retval;
+}
+
+/* Make an AUTH chunk for 'asoc' using the HMAC returned by
+ * sctp_auth_asoc_get_hmac() and the association's active key id.  The MAC
+ * field is appended zero-filled.  Returns NULL if no HMAC is available or
+ * the chunk cannot be allocated.
+ */
+struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
+{
+	struct sctp_authhdr hdr;
+	struct sctp_hmac *desc;
+	struct sctp_chunk *auth;
+
+	/* Get the first hmac that the peer told us to use */
+	desc = sctp_auth_asoc_get_hmac(asoc);
+	if (unlikely(!desc))
+		return NULL;
+
+	auth = sctp_make_chunk(asoc, SCTP_CID_AUTH, 0,
+			       sizeof(sctp_authhdr_t) + desc->hmac_len);
+	if (!auth)
+		return NULL;
+
+	hdr.hmac_id = htons(desc->hmac_id);
+	hdr.shkey_id = htons(asoc->active_key_id);
+	auth->subh.auth_hdr = sctp_addto_chunk(auth, sizeof(sctp_authhdr_t),
+					       &hdr);
+
+	/* Append the empty (all-zero) MAC; this also grows the chunk
+	 * header length and moves chunk_end past it.
+	 */
+	sctp_addto_param(auth, desc->hmac_len, NULL);
+
+	return auth;
+}
+
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Turn an skb into a chunk: wrap 'skb' in a freshly allocated, zeroed
+ * sctp_chunk tied to 'asoc' and holding one reference.  Returns NULL if
+ * the allocation fails.  'sk' is only checked for a debug message.
+ * FIXME: Eventually move the structure directly inside the skb->cb[].
+ */
+struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
+			    const struct sctp_association *asoc,
+			    struct sock *sk)
+{
+	struct sctp_chunk *new_chunk;
+
+	new_chunk = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC);
+	if (!new_chunk)
+		return NULL;
+
+	if (!sk) {
+		SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
+	}
+
+	/* Polish the bead hole.  */
+	INIT_LIST_HEAD(&new_chunk->list);
+	INIT_LIST_HEAD(&new_chunk->transmitted_list);
+	INIT_LIST_HEAD(&new_chunk->frag_list);
+	new_chunk->skb = skb;
+	new_chunk->asoc = (struct sctp_association *)asoc;
+
+	/* If this is a fragmented message, track all fragments
+	 * of the message (for SEND_FAILED).
+	 */
+	new_chunk->msg = NULL;
+
+	new_chunk->singleton = 1;
+	new_chunk->has_tsn = 0;
+	new_chunk->has_ssn = 0;
+	new_chunk->rtt_in_progress = 0;
+	new_chunk->sent_at = 0;
+	new_chunk->end_of_packet = 0;
+	new_chunk->ecn_ce_done = 0;
+	new_chunk->pdiscard = 0;
+
+	/* sctpimpguide-05.txt Section 2.8.2
+	 * M1) Each time a new DATA chunk is transmitted
+	 * set the 'TSN.Missing.Report' count for that TSN to 0. The
+	 * 'TSN.Missing.Report' count will be used to determine missing chunks
+	 * and when to fast retransmit.
+	 */
+	new_chunk->tsn_missing_report = 0;
+	new_chunk->tsn_gap_acked = 0;
+	new_chunk->fast_retransmit = SCTP_CAN_FRTX;
+
+	SCTP_DBG_OBJCNT_INC(chunk);
+	atomic_set(&new_chunk->refcnt, 1);
+
+	return new_chunk;
+}
+
+/* Copy the caller-supplied addresses into chunk->source and chunk->dest.  */
+void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src,
+		     union sctp_addr *dest)
+{
+	memcpy(&chunk->source, src, sizeof(*src));
+	memcpy(&chunk->dest, dest, sizeof(*dest));
+}
+
+/* Extract the source address from a chunk: the address of its transport
+ * when one is known, otherwise the address saved in chunk->source.
+ */
+const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
+{
+	const struct sctp_transport *peer = chunk->transport;
+
+	if (!peer)
+		return &chunk->source;
+	return &peer->ipaddr;
+}
+
+/* Create a chunk with the given type and flags and room for 'paylen' payload
+ * bytes.  Returns NULL if allocation fails or the size is not representable.
+ */
+SCTP_STATIC
+struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc,
+				   __u8 type, __u8 flags, int paylen)
+{
+	struct sctp_chunk *retval;
+	sctp_chunkhdr_t *chunk_hdr;
+	struct sk_buff *skb;
+	struct sock *sk;
+	size_t chunklen = WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen);
+
+	/* The padded chunk must fit the 16-bit chunk length field. */
+	if (paylen < 0 || chunklen > SCTP_MAX_CHUNK_LEN)
+		return NULL;
+	/* No need to allocate LL here, as this is only a chunk. */
+	skb = alloc_skb(chunklen, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	/* Make room for the chunk header.  */
+	chunk_hdr = (sctp_chunkhdr_t *)skb_put(skb, sizeof(sctp_chunkhdr_t));
+	chunk_hdr->type	  = type;
+	chunk_hdr->flags  = flags;
+	chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t));
+
+	sk = asoc ? asoc->base.sk : NULL;
+	retval = sctp_chunkify(skb, asoc, sk);
+	if (!retval) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	retval->chunk_hdr = chunk_hdr;
+	retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(struct sctp_chunkhdr);
+
+	/* Determine if the chunk needs to be authenticated */
+	if (sctp_auth_send_cid(type, asoc))
+		retval->auth = 1;
+
+	/* Set the skb to the belonging sock for accounting.  */
+	skb->sk = sk;
+	return retval;
+}
+
+
+/* Release the memory occupied by a chunk: its skb and the chunk itself.  */
+static void sctp_chunk_destroy(struct sctp_chunk *chunk)
+{
+	BUG_ON(!list_empty(&chunk->list)); /* must already be off ->list */
+	list_del_init(&chunk->transmitted_list);
+
+	/* Free the chunk skb data and the SCTP_chunk stub itself. */
+	dev_kfree_skb(chunk->skb);
+
+	SCTP_DBG_OBJCNT_DEC(chunk);
+	kmem_cache_free(sctp_chunk_cachep, chunk);
+}
+
+/* Put the chunk's datamsg reference (if any), then put the chunk itself.  */
+void sctp_chunk_free(struct sctp_chunk *chunk)
+{
+	/* Release our reference on the message tracker. */
+	if (chunk->msg)
+		sctp_datamsg_put(chunk->msg);
+
+	sctp_chunk_put(chunk); /* destroys the chunk if this was the last ref */
+}
+
+/* Grab a reference to the chunk; release it with sctp_chunk_put().  */
+void sctp_chunk_hold(struct sctp_chunk *ch)
+{
+	atomic_inc(&ch->refcnt);
+}
+
+/* Release a reference to the chunk, destroying it when the count hits 0.  */
+void sctp_chunk_put(struct sctp_chunk *ch)
+{
+	if (atomic_dec_and_test(&ch->refcnt))
+		sctp_chunk_destroy(ch);
+}
+
+/* Append bytes to the end of a chunk, first zero-padding the current
+ * contents up to a word boundary.  Will panic if chunk is not big enough.
+ */
+void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
+{
+	const int old_len = ntohs(chunk->chunk_hdr->length);
+	const int aligned_len = WORD_ROUND(old_len);
+	void *dst;
+
+	/* Zero the alignment gap left by whatever was added before.  */
+	memset(skb_put(chunk->skb, aligned_len - old_len), 0,
+	       aligned_len - old_len);
+
+	dst = skb_put(chunk->skb, len);
+	memcpy(dst, data, len);
+
+	/* Adjust the chunk length field.  */
+	chunk->chunk_hdr->length = htons(aligned_len + len);
+	chunk->chunk_end = skb_tail_pointer(chunk->skb);
+
+	return dst;
+}
+
+/* Append bytes to the end of a chunk.  Returns NULL if the skb has no room
+ * for 'len' bytes plus the alignment padding sctp_addto_chunk() adds first.
+ */
+void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
+			     int len, const void *data)
+{
+	int clen = ntohs(chunk->chunk_hdr->length);
+
+	return skb_tailroom(chunk->skb) < WORD_ROUND(clen) - clen + len ?
+		NULL : sctp_addto_chunk(chunk, len, data);
+}
+
+/* Append 'len' bytes, taken from the iovec 'data' starting at offset
+ * 'off', to the end of a chunk.  Will panic if chunk is not big enough.
+ * Returns 0, or the error from memcpy_fromiovecend(); on error the chunk
+ * length field is left unchanged.
+ */
+int sctp_user_addto_chunk(struct sctp_chunk *chunk, int off, int len,
+			  struct iovec *data)
+{
+	__u8 *dst;
+	int rc;
+
+	/* Make room in chunk for data.  */
+	dst = skb_put(chunk->skb, len);
+
+	/* Copy data (whole iovec) into chunk */
+	rc = memcpy_fromiovecend(dst, data, off, len);
+	if (rc)
+		return rc;
+
+	/* Adjust the chunk length field.  */
+	chunk->chunk_hdr->length =
+		htons(ntohs(chunk->chunk_hdr->length) + len);
+	chunk->chunk_end = skb_tail_pointer(chunk->skb);
+	return 0;
+}
+
+/* Helper function to assign an SSN if needed.  This assumes that both
+ * the data_hdr and association have already been assigned.
+ */
+void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
+{
+	struct sctp_stream *out;
+	struct sctp_chunk *frag;
+	__u16 sid;
+	__u16 ssn;
+
+	if (chunk->has_ssn)
+		return;
+
+	/* All fragments will be on the same stream */
+	out = &chunk->asoc->ssnmap->out;
+	sid = ntohs(chunk->subh.data_hdr->stream);
+
+	/* Now assign the sequence number to the entire message.
+	 * All fragments must have the same stream sequence number:
+	 * unordered fragments get 0, ordered ones use sctp_ssn_peek(),
+	 * except the LAST fragment, which uses sctp_ssn_next().
+	 */
+	list_for_each_entry(frag, &chunk->msg->chunks, frag_list) {
+		const __u8 flags = frag->chunk_hdr->flags;
+
+		if (flags & SCTP_DATA_UNORDERED)
+			ssn = 0;
+		else if (flags & SCTP_DATA_LAST_FRAG)
+			ssn = sctp_ssn_next(out, sid);
+		else
+			ssn = sctp_ssn_peek(out, sid);
+
+		frag->subh.data_hdr->ssn = htons(ssn);
+		frag->has_ssn = 1;
+	}
+}
+
+/* Helper function to assign a TSN if needed; a chunk that already has
+ * one is left untouched.  This assumes that both the data_hdr and
+ * association have already been assigned.
+ */
+void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
+{
+	if (chunk->has_tsn)
+		return;
+
+	/* This is the last possible instant to assign a TSN.  */
+	chunk->subh.data_hdr->tsn =
+		htonl(sctp_association_get_next_tsn(chunk->asoc));
+	chunk->has_tsn = 1;
+}
+
+/* Create a CLOSED association to use with an incoming packet.  The new
+ * association is marked temporary and its peer address is taken from the
+ * packet in chunk->skb.  Returns NULL on failure.
+ */
+struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
+					struct sctp_chunk *chunk,
+					gfp_t gfp)
+{
+	struct sk_buff *skb = chunk->skb;
+	struct sctp_association *asoc;
+	struct sctp_af *af;
+
+	/* Create the bare association.  */
+	asoc = sctp_association_new(ep, ep->base.sk,
+				    sctp_scope(sctp_source(chunk)), gfp);
+	if (!asoc)
+		return NULL;
+	asoc->temp = 1;
+
+	/* Create an entry for the source address of the packet.  */
+	af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version));
+	if (unlikely(!af)) {
+		sctp_association_free(asoc);
+		return NULL;
+	}
+
+	af->from_skb(&asoc->c.peer_addr, skb, 1);
+	return asoc;
+}
+
+/* Build a cookie representing asoc, INCLUDING the param header needed to put
+ * it in the INIT ACK.  Returns a kzalloc'ed buffer with its size in
+ * *cookie_len, or NULL (and *cookie_len = 0) if allocation or signing fails.
+ */
+static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
+				      const struct sctp_association *asoc,
+				      const struct sctp_chunk *init_chunk,
+				      int *cookie_len,
+				      const __u8 *raw_addrs, int addrs_len)
+{
+	sctp_cookie_param_t *retval;
+	struct sctp_signed_cookie *cookie;
+	struct scatterlist sg;
+	int headersize, bodysize;
+	unsigned int keylen;
+	char *key;
+
+	/* Header size is static data prior to the actual cookie, including
+	 * any padding.
+	 */
+	headersize = sizeof(sctp_paramhdr_t) +
+		     (sizeof(struct sctp_signed_cookie) -
+		      sizeof(struct sctp_cookie));
+	bodysize = sizeof(struct sctp_cookie)
+		+ ntohs(init_chunk->chunk_hdr->length) + addrs_len;
+
+	/* Pad out the cookie body to a multiple of SCTP_COOKIE_MULTIPLE to
+	 * make the signature functions simpler to write.
+	 */
+	if (bodysize % SCTP_COOKIE_MULTIPLE)
+		bodysize += SCTP_COOKIE_MULTIPLE
+			- (bodysize % SCTP_COOKIE_MULTIPLE);
+	*cookie_len = headersize + bodysize;
+
+	/* Clear this memory since we are sending this data structure
+	 * out on the network.
+	 */
+	retval = kzalloc(*cookie_len, GFP_ATOMIC);
+	if (!retval)
+		goto nodata;
+
+	cookie = (struct sctp_signed_cookie *) retval->body;
+
+	/* Set up the parameter header.  */
+	retval->p.type = SCTP_PARAM_STATE_COOKIE;
+	retval->p.length = htons(*cookie_len);
+
+	/* Copy the cookie part of the association itself.  */
+	cookie->c = asoc->c;
+	/* Save the raw address list length in the cookie. */
+	cookie->c.raw_addr_list_len = addrs_len;
+
+	/* Remember PR-SCTP capability. */
+	cookie->c.prsctp_capable = asoc->peer.prsctp_capable;
+
+	/* Save adaptation indication in the cookie. */
+	cookie->c.adaptation_ind = asoc->peer.adaptation_ind;
+
+	/* Set an expiration time for the cookie: now + asoc->cookie_life.  */
+	do_gettimeofday(&cookie->c.expiration);
+	TIMEVAL_ADD(asoc->cookie_life, cookie->c.expiration);
+
+	/* Copy the peer's INIT chunk, chunk header included.  */
+	memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
+	       ntohs(init_chunk->chunk_hdr->length));
+
+	/* Copy the raw local address list right behind the INIT chunk. */
+	memcpy((__u8 *)&cookie->c.peer_init[0] +
+	       ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
+
+	if (sctp_sk(ep->base.sk)->hmac) {
+		struct hash_desc desc;
+
+		/* Sign the padded body with the endpoint's current key.  */
+		sg_init_one(&sg, &cookie->c, bodysize);
+		keylen = SCTP_SECRET_SIZE;
+		key = (char *)ep->secret_key[ep->current_key];
+		desc.tfm = sctp_sk(ep->base.sk)->hmac;
+		desc.flags = 0;
+
+		if (crypto_hash_setkey(desc.tfm, key, keylen) ||
+		    crypto_hash_digest(&desc, &sg, bodysize, cookie->signature))
+			goto free_cookie;
+	}
+
+	return retval;
+
+free_cookie:
+	kfree(retval);
+nodata:
+	*cookie_len = 0;
+	return NULL;
+}
+
+/* Verify and unpack a COOKIE ECHO chunk's cookie into a new association.  */
+struct sctp_association *sctp_unpack_cookie(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	struct sctp_chunk *chunk, gfp_t gfp,
+	int *error, struct sctp_chunk **errp)
+{
+	struct sctp_association *retval = NULL;
+	struct sctp_signed_cookie *cookie;
+	struct sctp_cookie *bear_cookie;
+	int headersize, bodysize, fixed_size;
+	__u8 *digest = ep->digest;
+	struct scatterlist sg;
+	unsigned int keylen, len;
+	char *key;
+	sctp_scope_t scope;
+	struct sk_buff *skb = chunk->skb;
+	struct timeval tv;
+	struct hash_desc desc;
+
+	/* Header size is static data prior to the actual cookie, including
+	 * any padding.
+	 */
+	headersize = sizeof(sctp_chunkhdr_t) +
+		     (sizeof(struct sctp_signed_cookie) -
+		      sizeof(struct sctp_cookie));
+	bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
+	fixed_size = headersize + sizeof(struct sctp_cookie);
+
+	/* Verify that the chunk looks like it even has a cookie.
+	 * There must be enough room for our cookie and our peer's
+	 * INIT chunk.
+	 */
+	len = ntohs(chunk->chunk_hdr->length);
+	if (len < fixed_size + sizeof(struct sctp_chunkhdr))
+		goto malformed;
+
+	/* Verify that the cookie has been padded out. */
+	if (bodysize % SCTP_COOKIE_MULTIPLE)
+		goto malformed;
+
+	/* Process the cookie.  */
+	cookie = chunk->subh.cookie_hdr;
+	bear_cookie = &cookie->c;
+
+	if (!sctp_sk(ep->base.sk)->hmac)
+		goto no_hmac; /* no hmac on this socket: skip signature check */
+
+	/* Check the signature.  */
+	keylen = SCTP_SECRET_SIZE;
+	sg_init_one(&sg, bear_cookie, bodysize);
+	key = (char *)ep->secret_key[ep->current_key];
+	desc.tfm = sctp_sk(ep->base.sk)->hmac;
+	desc.flags = 0;
+
+	memset(digest, 0x00, SCTP_SIGNATURE_SIZE);
+	if (crypto_hash_setkey(desc.tfm, key, keylen) ||
+	    crypto_hash_digest(&desc, &sg, bodysize, digest)) {
+		*error = -SCTP_IERROR_NOMEM;
+		goto fail;
+	}
+
+	if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
+		/* Try the previous key. */
+		key = (char *)ep->secret_key[ep->last_key];
+		memset(digest, 0x00, SCTP_SIGNATURE_SIZE);
+		if (crypto_hash_setkey(desc.tfm, key, keylen) ||
+		    crypto_hash_digest(&desc, &sg, bodysize, digest)) {
+			*error = -SCTP_IERROR_NOMEM;
+			goto fail;
+		}
+
+		if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
+			/* Yikes!  Still bad signature! */
+			*error = -SCTP_IERROR_BAD_SIG;
+			goto fail;
+		}
+	}
+
+no_hmac:
+	/* IG Section 2.35.2:
+	 *  3) Compare the port numbers and the verification tag contained
+	 *     within the COOKIE ECHO chunk to the actual port numbers and the
+	 *     verification tag within the SCTP common header of the received
+	 *     packet. If these values do not match the packet MUST be silently
+	 *     discarded,
+	 */
+	if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) {
+		*error = -SCTP_IERROR_BAD_TAG;
+		goto fail;
+	}
+
+	if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port ||
+	    ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) {
+		*error = -SCTP_IERROR_BAD_PORTS;
+		goto fail;
+	}
+
+	/* Check to see if the cookie is stale.  If there is already
+	 * an association, there is no need to check cookie's expiration
+	 * for init collision case of lost COOKIE ACK.
+	 * If skb has been timestamped, then use the stamp, otherwise
+	 * use current time.  This introduces a small possibility
+	 * that a cookie may be considered expired, but this would only slow
+	 * down the new association establishment instead of every packet.
+	 */
+	if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
+		skb_get_timestamp(skb, &tv);
+	else
+		do_gettimeofday(&tv);
+
+	if (!asoc && tv_lt(bear_cookie->expiration, tv)) {
+		/*
+		 * Section 3.3.10.3 Stale Cookie Error (3)
+		 *
+		 * Cause of error
+		 * ---------------
+		 * Stale Cookie Error:  Indicates the receipt of a valid State
+		 * Cookie that has expired.
+		 */
+		len = ntohs(chunk->chunk_hdr->length);
+		*errp = sctp_make_op_error_space(asoc, chunk, len);
+		if (*errp) {
+			suseconds_t usecs = (tv.tv_sec -
+				bear_cookie->expiration.tv_sec) * 1000000L +
+				tv.tv_usec - bear_cookie->expiration.tv_usec;
+			__be32 n = htonl(usecs); /* staleness in microseconds */
+
+			sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
+					sizeof(n));
+			sctp_addto_chunk(*errp, sizeof(n), &n);
+			*error = -SCTP_IERROR_STALE_COOKIE;
+		} else
+			*error = -SCTP_IERROR_NOMEM;
+
+		goto fail;
+	}
+
+	/* Make a new base association.  */
+	scope = sctp_scope(sctp_source(chunk));
+	retval = sctp_association_new(ep, ep->base.sk, scope, gfp);
+	if (!retval) {
+		*error = -SCTP_IERROR_NOMEM;
+		goto fail;
+	}
+
+	/* Set up our peer's port number.  */
+	retval->peer.port = ntohs(chunk->sctp_hdr->source);
+
+	/* Populate the association from the cookie.  */
+	memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie));
+
+	if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie,
+						 GFP_ATOMIC) < 0) {
+		*error = -SCTP_IERROR_NOMEM;
+		goto fail;
+	}
+
+	/* Also, add the destination address. */
+	if (list_empty(&retval->base.bind_addr.address_list)) {
+		sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest,
+				SCTP_ADDR_SRC, GFP_ATOMIC);
+	}
+
+	retval->next_tsn = retval->c.initial_tsn;
+	retval->ctsn_ack_point = retval->next_tsn - 1;
+	retval->addip_serial = retval->c.initial_tsn;
+	retval->adv_peer_ack_point = retval->ctsn_ack_point;
+	retval->peer.prsctp_capable = retval->c.prsctp_capable;
+	retval->peer.adaptation_ind = retval->c.adaptation_ind;
+
+	/* The INIT stuff will be done by the side effects.  */
+	return retval;
+
+fail:
+	if (retval)
+		sctp_association_free(retval);
+
+	return NULL; /* every path to 'fail' has already set *error */
+
+malformed:
+	/* Yikes!  The packet is either corrupt or deliberately
+	 * malformed.
+	 */
+	*error = -SCTP_IERROR_MALFORMED;
+	goto fail;
+}
+
+/********************************************************************
+ * 3rd Level Abstractions
+ ********************************************************************/
+
+struct __sctp_missing {
+	__be32 num_missing;	/* number of missing parameters reported */
+	__be16 type;		/* type of the missing parameter */
+}  __packed;
+
+/*
+ * Report a missing mandatory parameter: append an SCTP_ERROR_MISS_PARAM
+ * cause naming 'paramtype' to *errp, creating the ERROR chunk first if
+ * the caller has none yet.  Always returns 0.
+ */
+static int sctp_process_missing_param(const struct sctp_association *asoc,
+				      sctp_param_t paramtype,
+				      struct sctp_chunk *chunk,
+				      struct sctp_chunk **errp)
+{
+	struct __sctp_missing report = {
+		.num_missing = htonl(1),
+		.type = paramtype,
+	};
+
+	/* Make an ERROR chunk, preparing enough room for
+	 * returning multiple unknown parameters.
+	 */
+	if (!*errp)
+		*errp = sctp_make_op_error_space(asoc, chunk,
+						 WORD_ROUND(sizeof(report)));
+
+	if (*errp) {
+		sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, sizeof(report));
+		sctp_addto_chunk(*errp, sizeof(report), &report);
+	}
+
+	/* Stop processing this chunk. */
+	return 0;
+}
+
+/* Report an Invalid Mandatory Parameter.  The cause has no payload, so the
+ * ERROR chunk (created here unless the caller already has one in *errp)
+ * reserves no space beyond the error header.  Always returns 0.
+ */
+static int sctp_process_inv_mandatory(const struct sctp_association *asoc,
+				      struct sctp_chunk *chunk,
+				      struct sctp_chunk **errp)
+{
+	if (!*errp)
+		*errp = sctp_make_op_error_space(asoc, chunk, 0);
+
+	/* Either way, the caller stops processing this chunk. */
+	if (*errp)
+		sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0);
+	return 0;
+}
+
+static int sctp_process_inv_paramlength(const struct sctp_association *asoc,
+					struct sctp_paramhdr *param,
+					const struct sctp_chunk *chunk,
+					struct sctp_chunk **errp)
+{
+	/* This is a fatal error.  Any accumulated non-fatal errors are
+	 * not reported.
+	 */
+	if (*errp)
+		sctp_chunk_free(*errp);
+
+	/* Create an error chunk and fill it in with our payload. */
+	*errp = sctp_make_violation_paramlen(asoc, chunk, param);
+
+	return 0;
+}
+
+
+/* Do not attempt to handle the HOST_NAME parm.  However, do send back an
+ * indicator to the peer: an SCTP_ERROR_DNS_FAILED cause echoing the param.
+ */
+static int sctp_process_hn_param(const struct sctp_association *asoc,
+				 union sctp_params param,
+				 struct sctp_chunk *chunk,
+				 struct sctp_chunk **errp)
+{
+	const __u16 param_len = ntohs(param.p->length);
+	struct sctp_chunk *err;
+
+	/* Processing of the HOST_NAME parameter will generate an
+	 * ABORT.  If we've accumulated any non-fatal errors, they
+	 * would be unrecognized parameters and we should not include
+	 * them in the ABORT.
+	 */
+	if (*errp)
+		sctp_chunk_free(*errp);
+	err = sctp_make_op_error_space(asoc, chunk, param_len);
+	if (err) {
+		sctp_init_cause(err, SCTP_ERROR_DNS_FAILED, param_len);
+		sctp_addto_chunk(err, param_len, param.v);
+	}
+	*errp = err;
+
+	/* Stop processing this chunk. */
+	return 0;
+}
+
+/* Check a Supported Extensions parameter.  Returns 0 only when ADD-IP is
+ * enabled, sctp_addip_noauth is off, and the peer lists ASCONF or
+ * ASCONF-ACK without also listing AUTH; returns 1 otherwise.
+ */
+static int sctp_verify_ext_param(union sctp_params param)
+{
+	__u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+	int have_auth = 0;
+	int have_asconf = 0;
+	int i;
+
+	for (i = 0; i < num_ext; i++) {
+		const __u8 cid = param.ext->chunks[i];
+
+		if (cid == SCTP_CID_AUTH)
+			have_auth = 1;
+		else if (cid == SCTP_CID_ASCONF || cid == SCTP_CID_ASCONF_ACK)
+			have_asconf = 1;
+	}
+
+	/* ADD-IP Security: The draft requires us to ABORT or ignore the
+	 * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not.  Do this
+	 * only if ADD-IP is turned on and we are not backward-compatible
+	 * mode.
+	 */
+	if (sctp_addip_noauth)
+		return 1;
+
+	if (sctp_addip_enable && have_asconf && !have_auth)
+		return 0;
+	return 1;
+}
+
+/* Record in asoc->peer the optional features that the peer's Supported
+ * Extensions parameter lists, for each feature that is enabled locally.
+ */
+static void sctp_process_ext_param(struct sctp_association *asoc,
+				    union sctp_params param)
+{
+	__u16 num_ext = ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+	int idx;
+
+	for (idx = 0; idx < num_ext; idx++) {
+		switch (param.ext->chunks[idx]) {
+		case SCTP_CID_FWD_TSN:
+			if (sctp_prsctp_enable && !asoc->peer.prsctp_capable)
+				asoc->peer.prsctp_capable = 1;
+			break;
+		case SCTP_CID_AUTH:
+			/* A peer that reports AUTH is assumed to support it. */
+			if (sctp_auth_enable)
+				asoc->peer.auth_capable = 1;
+			break;
+		case SCTP_CID_ASCONF:
+		case SCTP_CID_ASCONF_ACK:
+			if (sctp_addip_enable)
+				asoc->peer.asconf_capable = 1;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/* RFC 3.2.1 & the Implementers Guide 2.2.
+ *
+ * The Parameter Types are encoded such that the
+ * highest-order two bits specify the action that must be
+ * taken if the processing endpoint does not recognize the
+ * Parameter Type.
+ *
+ * 00 - Stop processing this parameter; do not process any further
+ *	parameters within this chunk
+ *
+ * 01 - Stop processing this parameter, do not process any further
+ *	parameters within this chunk, and report the unrecognized
+ *	parameter in an 'Unrecognized Parameter' ERROR chunk.
+ *
+ * 10 - Skip this parameter and continue processing.
+ *
+ * 11 - Skip this parameter and continue processing but
+ *	report the unrecognized parameter in an
+ *	'Unrecognized Parameter' ERROR chunk.
+ *
+ * Return value:
+ *	SCTP_IERROR_NO_ERROR - continue with the chunk
+ *	SCTP_IERROR_ERROR    - stop and report an error.
+ *	SCTP_IERROR_NOMEM    - out of memory.
+ */
+static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc,
+					    union sctp_params param,
+					    struct sctp_chunk *chunk,
+					    struct sctp_chunk **errp)
+{
+	int retval = SCTP_IERROR_NO_ERROR;
+
+	switch (param.p->type & SCTP_PARAM_ACTION_MASK) {
+	case SCTP_PARAM_ACTION_DISCARD:		/* stop, nothing reported */
+		retval =  SCTP_IERROR_ERROR;
+		break;
+	case SCTP_PARAM_ACTION_SKIP:		/* continue, nothing reported */
+		break;
+	case SCTP_PARAM_ACTION_DISCARD_ERR:	/* stop, and report below */
+		retval =  SCTP_IERROR_ERROR;
+		/* Fall through */
+	case SCTP_PARAM_ACTION_SKIP_ERR:
+		/* Make an ERROR chunk, preparing enough room for
+		 * returning multiple unknown parameters.
+		 */
+		if (NULL == *errp)
+			*errp = sctp_make_op_error_fixed(asoc, chunk);
+
+		if (*errp) {
+			if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+					WORD_ROUND(ntohs(param.p->length))))
+				sctp_addto_chunk_fixed(*errp,
+						WORD_ROUND(ntohs(param.p->length)),
+						param.v);
+		} else {
+			/* If there is no memory for generating the ERROR
+			 * report as specified, an ABORT will be triggered
+			 * to the peer and the association won't be
+			 * established.
+			 */
+			retval = SCTP_IERROR_NOMEM;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return retval;
+}
+
+/* Verify one variable length parameter; *err_chunk collects error causes.
+ * Return values:
+ *	SCTP_IERROR_ABORT - trigger an ABORT
+ *	SCTP_IERROR_NOMEM - out of memory (abort)
+ *	SCTP_IERROR_ERROR - stop processing, trigger an ERROR
+ *	SCTP_IERROR_NO_ERROR - continue with the chunk
+ */
+static sctp_ierror_t sctp_verify_param(const struct sctp_association *asoc,
+					union sctp_params param,
+					sctp_cid_t cid,
+					struct sctp_chunk *chunk,
+					struct sctp_chunk **err_chunk)
+{
+	struct sctp_hmac_algo_param *hmacs;
+	int retval = SCTP_IERROR_NO_ERROR;
+	__u16 n_elt, id = 0;
+	int i;
+
+	/* FIXME - This routine is not looking at each parameter per the
+	 * chunk type, i.e., unrecognized parameters should be further
+	 * identified based on the chunk id.
+	 */
+
+	switch (param.p->type) {
+	case SCTP_PARAM_IPV4_ADDRESS:
+	case SCTP_PARAM_IPV6_ADDRESS:
+	case SCTP_PARAM_COOKIE_PRESERVATIVE:
+	case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
+	case SCTP_PARAM_STATE_COOKIE:
+	case SCTP_PARAM_HEARTBEAT_INFO:
+	case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
+	case SCTP_PARAM_ECN_CAPABLE:
+	case SCTP_PARAM_ADAPTATION_LAYER_IND:
+		break;	/* accepted here without further checks */
+
+	case SCTP_PARAM_SUPPORTED_EXT:
+		if (!sctp_verify_ext_param(param))
+			return SCTP_IERROR_ABORT;
+		break;
+
+	case SCTP_PARAM_SET_PRIMARY:
+		if (sctp_addip_enable)
+			break;
+		goto fallthrough; /* ADD-IP off: handle as unrecognized */
+
+	case SCTP_PARAM_HOST_NAME_ADDRESS:
+		/* Tell the peer, we won't support this param.  */
+		sctp_process_hn_param(asoc, param, chunk, err_chunk);
+		retval = SCTP_IERROR_ABORT;
+		break;
+
+	case SCTP_PARAM_FWD_TSN_SUPPORT:
+		if (sctp_prsctp_enable)
+			break;
+		goto fallthrough; /* PR-SCTP off: handle as unrecognized */
+
+	case SCTP_PARAM_RANDOM:
+		if (!sctp_auth_enable)
+			goto fallthrough;
+
+		/* SCTP-AUTH: Section 6.1
+		 * If the random number is not 32 byte long the association
+		 * MUST be aborted.  The ABORT chunk SHOULD contain the error
+		 * cause 'Protocol Violation'.
+		 */
+		if (SCTP_AUTH_RANDOM_LENGTH !=
+			ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) {
+			sctp_process_inv_paramlength(asoc, param.p,
+							chunk, err_chunk);
+			retval = SCTP_IERROR_ABORT;
+		}
+		break;
+
+	case SCTP_PARAM_CHUNKS:
+		if (!sctp_auth_enable)
+			goto fallthrough;
+
+		/* SCTP-AUTH: Section 3.2
+		 * The CHUNKS parameter MUST be included once in the INIT or
+		 *  INIT-ACK chunk if the sender wants to receive authenticated
+		 *  chunks.  Its maximum length is 260 bytes.
+		 */
+		if (260 < ntohs(param.p->length)) {
+			sctp_process_inv_paramlength(asoc, param.p,
+						     chunk, err_chunk);
+			retval = SCTP_IERROR_ABORT;
+		}
+		break;
+
+	case SCTP_PARAM_HMAC_ALGO:
+		if (!sctp_auth_enable)
+			goto fallthrough;
+
+		hmacs = (struct sctp_hmac_algo_param *)param.p;
+		n_elt = (ntohs(param.p->length) - sizeof(sctp_paramhdr_t)) >> 1;
+
+		/* SCTP-AUTH: Section 6.1
+		 * The HMAC algorithm based on SHA-1 MUST be supported and
+		 * included in the HMAC-ALGO parameter.
+		 */
+		for (i = 0; i < n_elt; i++) {
+			id = ntohs(hmacs->hmac_ids[i]);
+
+			if (id == SCTP_AUTH_HMAC_ID_SHA1)
+				break;
+		}
+
+		if (id != SCTP_AUTH_HMAC_ID_SHA1) {
+			sctp_process_inv_paramlength(asoc, param.p, chunk,
+						     err_chunk);
+			retval = SCTP_IERROR_ABORT;
+		}
+		break;
+fallthrough:
+	default:
+		SCTP_DEBUG_PRINTK("Unrecognized param: %d for chunk %d.\n",
+				ntohs(param.p->type), cid);
+		retval = sctp_process_unk_param(asoc, param, chunk, err_chunk);
+		break;
+	}
+	return retval;
+}
+
+/* Sanity-check a received INIT or INIT-ACK before it is processed.
+ *
+ * The checks run in this order: the fixed header fields, the layout of
+ * the parameter list, the presence of mandatory parameters, and finally
+ * every individual parameter.
+ *
+ * Returns 1 when processing of the chunk may go on and 0 when a
+ * parameter check asked for an ABORT or ran out of memory.  For the
+ * header, layout and mandatory-parameter failures the value of the
+ * respective error helper is returned.
+ */
+int sctp_verify_init(const struct sctp_association *asoc,
+		     sctp_cid_t cid,
+		     sctp_init_chunk_t *peer_init,
+		     struct sctp_chunk *chunk,
+		     struct sctp_chunk **errp)
+{
+	union sctp_params param;
+	int cookie_seen = 0;
+
+	/* Stream counts and the initiate tag must be non-zero and the
+	 * advertised window must not be below the minimum.
+	 */
+	if (!peer_init->init_hdr.num_outbound_streams ||
+	    !peer_init->init_hdr.num_inbound_streams ||
+	    !peer_init->init_hdr.init_tag ||
+	    ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
+		return sctp_process_inv_mandatory(asoc, chunk, errp);
+
+	/* First pass: only note whether a state cookie is present. */
+	sctp_walk_params(param, peer_init, init_hdr.params) {
+		if (param.p->type == SCTP_PARAM_STATE_COOKIE)
+			cookie_seen = 1;
+	}
+
+	/* A bad parameter length stops the walk early, leaving param.p
+	 * on the offending parameter instead of at the end of the chunk.
+	 * The consensus on the mailing list is to answer that with a
+	 * PROTOCOL VIOLATION error; the ERROR chunk is built here and the
+	 * normal error handling code builds and sends the packet.
+	 */
+	if (param.v != (void *)chunk->chunk_end)
+		return sctp_process_inv_paramlength(asoc, param.p, chunk, errp);
+
+	/* The state cookie of an INIT-ACK is the only mandatory
+	 * parameter that can be missing today.
+	 */
+	if (cid == SCTP_CID_INIT_ACK && !cookie_seen)
+		return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE,
+						  chunk, errp);
+
+	/* Second pass: verify each variable length parameter. */
+	sctp_walk_params(param, peer_init, init_hdr.params) {
+		int verdict = sctp_verify_param(asoc, param, cid, chunk, errp);
+
+		if (verdict == SCTP_IERROR_ABORT ||
+		    verdict == SCTP_IERROR_NOMEM)
+			return 0;
+
+		if (verdict == SCTP_IERROR_ERROR)
+			return 1;
+
+		/* SCTP_IERROR_NO_ERROR and anything else: keep going. */
+	}
+
+	return 1;
+}
+
+/* Unpack the parameters in an INIT packet into an association.
+ *
+ * asoc      - association to fill in
+ * chunk     - the received chunk; its source address must match either
+ *             peer_addr or one of the addresses listed in the chunk
+ * peer_addr - address to add as the first peer transport
+ * peer_init - the INIT/INIT-ACK chunk contents to unpack
+ * gfp       - allocation flags
+ *
+ * Returns 0 on failure, else success.
+ * FIXME:  This is an association method.
+ */
+int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
+		      const union sctp_addr *peer_addr,
+		      sctp_init_chunk_t *peer_init, gfp_t gfp)
+{
+	union sctp_params param;
+	struct sctp_transport *transport;
+	struct list_head *pos, *temp;
+	struct sctp_af *af;
+	union sctp_addr addr;
+	char *cookie;
+	int src_match = 0;
+
+	/* We must include the address that the INIT packet came from.
+	 * This is the only address that matters for an INIT packet.
+	 * When processing a COOKIE ECHO, we retrieve the from address
+	 * of the INIT from the cookie.
+	 */
+
+	/* This implementation defaults to making the first transport
+	 * added as the primary transport.  The source address seems to
+	 * be a better choice than any of the embedded addresses.
+	 */
+	if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
+		goto nomem;
+
+	if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr))
+		src_match = 1;
+
+	/* Process the initialization parameters.  */
+	sctp_walk_params(param, peer_init, init_hdr.params) {
+		if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
+		    param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
+			af = sctp_get_af_specific(param_type2af(param.p->type));
+			/* The address family may not be available (the
+			 * ASCONF parameter code checks for NULL as well);
+			 * such an address can never match the source, so
+			 * skip the comparison instead of dereferencing a
+			 * NULL af.
+			 */
+			if (af) {
+				af->from_addr_param(&addr, param.addr,
+						    chunk->sctp_hdr->source, 0);
+				if (sctp_cmp_addr_exact(sctp_source(chunk),
+							&addr))
+					src_match = 1;
+			}
+		}
+
+		if (!sctp_process_param(asoc, param, peer_addr, gfp))
+			goto clean_up;
+	}
+
+	/* source address of chunk may not match any valid address */
+	if (!src_match)
+		goto clean_up;
+
+	/* AUTH: After processing the parameters, make sure that we
+	 * have all the required info to potentially do authentications.
+	 */
+	if (asoc->peer.auth_capable && (!asoc->peer.peer_random ||
+					!asoc->peer.peer_hmacs))
+		asoc->peer.auth_capable = 0;
+
+	/* In a non-backward compatible mode, if the peer claims
+	 * support for ADD-IP but not AUTH,  the ADD-IP spec states
+	 * that we MUST ABORT the association. Section 6.  The section
+	 * also gives us an option to silently ignore the packet, which
+	 * is what we'll do here.
+	 */
+	if (!sctp_addip_noauth &&
+	     (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) {
+		asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP |
+						  SCTP_PARAM_DEL_IP |
+						  SCTP_PARAM_SET_PRIMARY);
+		asoc->peer.asconf_capable = 0;
+		goto clean_up;
+	}
+
+	/* Walk list of transports, removing transports in the UNKNOWN state. */
+	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+		transport = list_entry(pos, struct sctp_transport, transports);
+		if (transport->state == SCTP_UNKNOWN) {
+			sctp_assoc_rm_peer(asoc, transport);
+		}
+	}
+
+	/* The fixed INIT headers are always in network byte
+	 * order.
+	 */
+	asoc->peer.i.init_tag =
+		ntohl(peer_init->init_hdr.init_tag);
+	asoc->peer.i.a_rwnd =
+		ntohl(peer_init->init_hdr.a_rwnd);
+	asoc->peer.i.num_outbound_streams =
+		ntohs(peer_init->init_hdr.num_outbound_streams);
+	asoc->peer.i.num_inbound_streams =
+		ntohs(peer_init->init_hdr.num_inbound_streams);
+	asoc->peer.i.initial_tsn =
+		ntohl(peer_init->init_hdr.initial_tsn);
+
+	/* Apply the upper bounds for output streams based on peer's
+	 * number of inbound streams.
+	 */
+	if (asoc->c.sinit_num_ostreams  >
+	    ntohs(peer_init->init_hdr.num_inbound_streams)) {
+		asoc->c.sinit_num_ostreams =
+			ntohs(peer_init->init_hdr.num_inbound_streams);
+	}
+
+	if (asoc->c.sinit_max_instreams >
+	    ntohs(peer_init->init_hdr.num_outbound_streams)) {
+		asoc->c.sinit_max_instreams =
+			ntohs(peer_init->init_hdr.num_outbound_streams);
+	}
+
+	/* Copy Initiation tag from INIT to VT_peer in cookie.   */
+	asoc->c.peer_vtag = asoc->peer.i.init_tag;
+
+	/* Peer Rwnd   : Current calculated value of the peer's rwnd.  */
+	asoc->peer.rwnd = asoc->peer.i.a_rwnd;
+
+	/* Copy cookie in case we need to resend COOKIE-ECHO.  Up to here
+	 * asoc->peer.cookie points at the cookie body inside the received
+	 * parameter (set by sctp_process_param()); replace it with a
+	 * private copy.
+	 */
+	cookie = asoc->peer.cookie;
+	if (cookie) {
+		asoc->peer.cookie = kmemdup(cookie, asoc->peer.cookie_len, gfp);
+		if (!asoc->peer.cookie)
+			goto clean_up;
+	}
+
+	/* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily
+	 * high (for example, implementations MAY use the size of the receiver
+	 * advertised window).
+	 */
+	list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+			transports) {
+		transport->ssthresh = asoc->peer.i.a_rwnd;
+	}
+
+	/* Set up the TSN tracking pieces.  */
+	if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
+				asoc->peer.i.initial_tsn, gfp))
+		goto clean_up;
+
+	/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
+	 *
+	 * The stream sequence number in all the streams shall start
+	 * from 0 when the association is established.  Also, when the
+	 * stream sequence number reaches the value 65535 the next
+	 * stream sequence number shall be set to 0.
+	 */
+
+	/* Allocate storage for the negotiated streams if it is not a temporary
+	 * association.
+	 */
+	if (!asoc->temp) {
+		int error;
+
+		asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
+					       asoc->c.sinit_num_ostreams, gfp);
+		if (!asoc->ssnmap)
+			goto clean_up;
+
+		error = sctp_assoc_set_id(asoc, gfp);
+		if (error)
+			goto clean_up;
+	}
+
+	/* ADDIP Section 4.1 ASCONF Chunk Procedures
+	 *
+	 * When an endpoint has an ASCONF signaled change to be sent to the
+	 * remote endpoint it should do the following:
+	 * ...
+	 * A2) A serial number should be assigned to the Chunk. The serial
+	 * number should be a monotonically increasing number. All serial
+	 * numbers are defined to be initialized at the start of the
+	 * association to the same value as the Initial TSN.
+	 */
+	asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1;
+	return 1;
+
+clean_up:
+	/* Release the transport structures that are not ACTIVE; the
+	 * transport added for peer_addr above was added as ACTIVE.
+	 */
+	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+		transport = list_entry(pos, struct sctp_transport, transports);
+		if (transport->state != SCTP_ACTIVE)
+			sctp_assoc_rm_peer(asoc, transport);
+	}
+
+nomem:
+	return 0;
+}
+
+
+/* Update asoc with the option described in param.
+ *
+ * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT
+ *
+ * asoc      - the association to update.
+ * param     - the variable length parameter to use for the update.
+ * peer_addr - the peer's address; used to derive the scope for embedded
+ *             addresses and to pre-select the peer's address family.
+ * gfp       - allocation flags for new transports and saved parameters.
+ *
+ * Returns 1 on success and 0 when a transport could not be added or a
+ * copy of a parameter could not be allocated.  Parameters that are not
+ * understood, or whose feature is disabled, are only logged and count as
+ * success: sctp_verify_param() is expected to have handled them already.
+ */
+static int sctp_process_param(struct sctp_association *asoc,
+			      union sctp_params param,
+			      const union sctp_addr *peer_addr,
+			      gfp_t gfp)
+{
+	union sctp_addr addr;
+	int i;
+	__u16 sat;
+	int retval = 1;
+	sctp_scope_t scope;
+	time_t stale;
+	struct sctp_af *af;
+	union sctp_addr_param *addr_param;
+	struct sctp_transport *t;
+
+	/* We maintain all INIT parameters in network byte order all the
+	 * time.  This allows us to not worry about whether the parameters
+	 * came from a fresh INIT, and INIT ACK, or were stored in a cookie.
+	 */
+	switch (param.p->type) {
+	case SCTP_PARAM_IPV6_ADDRESS:
+		/* v6 addresses are only used on v6 sockets */
+		if (PF_INET6 != asoc->base.sk->sk_family)
+			break;
+		goto do_addr_param;
+
+	case SCTP_PARAM_IPV4_ADDRESS:
+		/* v4 addresses are not allowed on v6-only socket */
+		if (ipv6_only_sock(asoc->base.sk))
+			break;
+do_addr_param:
+		af = sctp_get_af_specific(param_type2af(param.p->type));
+		af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0);
+		scope = sctp_scope(peer_addr);
+		if (sctp_in_scope(&addr, scope))
+			if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED))
+				return 0;
+		break;
+
+	case SCTP_PARAM_COOKIE_PRESERVATIVE:
+		if (!sctp_cookie_preserve_enable)
+			break;
+
+		stale = ntohl(param.life->lifespan_increment);
+
+		/* Suggested Cookie Life span increment's unit is msec,
+		 * (1/1000sec).
+		 */
+		asoc->cookie_life.tv_sec += stale / 1000;
+		asoc->cookie_life.tv_usec += (stale % 1000) * 1000;
+		break;
+
+	case SCTP_PARAM_HOST_NAME_ADDRESS:
+		SCTP_DEBUG_PRINTK("unimplemented SCTP_HOST_NAME_ADDRESS\n");
+		break;
+
+	case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
+		/* Turn off the default values first so we'll know which
+		 * ones are really set by the peer.
+		 */
+		asoc->peer.ipv4_address = 0;
+		asoc->peer.ipv6_address = 0;
+
+		/* Assume that peer supports the address family
+		 * by which it sends a packet.
+		 */
+		if (peer_addr->sa.sa_family == AF_INET6)
+			asoc->peer.ipv6_address = 1;
+		else if (peer_addr->sa.sa_family == AF_INET)
+			asoc->peer.ipv4_address = 1;
+
+		/* Cycle through address types; avoid divide by 0. */
+		sat = ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+		if (sat)
+			sat /= sizeof(__u16);
+
+		for (i = 0; i < sat; ++i) {
+			switch (param.sat->types[i]) {
+			case SCTP_PARAM_IPV4_ADDRESS:
+				asoc->peer.ipv4_address = 1;
+				break;
+
+			case SCTP_PARAM_IPV6_ADDRESS:
+				if (PF_INET6 == asoc->base.sk->sk_family)
+					asoc->peer.ipv6_address = 1;
+				break;
+
+			case SCTP_PARAM_HOST_NAME_ADDRESS:
+				asoc->peer.hostname_address = 1;
+				break;
+
+			default: /* Just ignore anything else.  */
+				break;
+			}
+		}
+		break;
+
+	case SCTP_PARAM_STATE_COOKIE:
+		/* Only a pointer into the received parameter is recorded
+		 * here; no copy is made in this function.
+		 */
+		asoc->peer.cookie_len =
+			ntohs(param.p->length) - sizeof(sctp_paramhdr_t);
+		asoc->peer.cookie = param.cookie->body;
+		break;
+
+	case SCTP_PARAM_HEARTBEAT_INFO:
+		/* Would be odd to receive, but it causes no problems. */
+		break;
+
+	case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
+		/* Rejected during verify stage. */
+		break;
+
+	case SCTP_PARAM_ECN_CAPABLE:
+		asoc->peer.ecn_capable = 1;
+		break;
+
+	case SCTP_PARAM_ADAPTATION_LAYER_IND:
+		asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind);
+		break;
+
+	case SCTP_PARAM_SET_PRIMARY:
+		if (!sctp_addip_enable)
+			goto fall_through;
+
+		/* The address parameter sits right behind the ADD-IP
+		 * parameter header.
+		 */
+		addr_param = param.v + sizeof(sctp_addip_param_t);
+
+		/* The address family must come from the embedded address
+		 * parameter, not from the SET_PRIMARY parameter itself
+		 * (whose type is never an address type).  The peer controls
+		 * the embedded type, so the lookup can fail; ignore the
+		 * request in that case rather than dereference a NULL af.
+		 */
+		af = sctp_get_af_specific(param_type2af(addr_param->p.type));
+		if (!af)
+			break;
+
+		af->from_addr_param(&addr, addr_param,
+				    htons(asoc->peer.port), 0);
+
+		/* if the address is invalid, we can't process it.
+		 * XXX: see spec for what to do.
+		 */
+		if (!af->addr_valid(&addr, NULL, NULL))
+			break;
+
+		/* Only an address that is already a peer transport can
+		 * become the primary.
+		 */
+		t = sctp_assoc_lookup_paddr(asoc, &addr);
+		if (!t)
+			break;
+
+		sctp_assoc_set_primary(asoc, t);
+		break;
+
+	case SCTP_PARAM_SUPPORTED_EXT:
+		sctp_process_ext_param(asoc, param);
+		break;
+
+	case SCTP_PARAM_FWD_TSN_SUPPORT:
+		if (sctp_prsctp_enable) {
+			asoc->peer.prsctp_capable = 1;
+			break;
+		}
+		/* PR-SCTP disabled: treat as an ignored parameter. */
+		goto fall_through;
+
+	case SCTP_PARAM_RANDOM:
+		if (!sctp_auth_enable)
+			goto fall_through;
+
+		/* Save peer's random parameter */
+		asoc->peer.peer_random = kmemdup(param.p,
+					    ntohs(param.p->length), gfp);
+		if (!asoc->peer.peer_random)
+			retval = 0;
+		break;
+
+	case SCTP_PARAM_HMAC_ALGO:
+		if (!sctp_auth_enable)
+			goto fall_through;
+
+		/* Save peer's HMAC list */
+		asoc->peer.peer_hmacs = kmemdup(param.p,
+					    ntohs(param.p->length), gfp);
+		if (!asoc->peer.peer_hmacs) {
+			retval = 0;
+			break;
+		}
+
+		/* Set the default HMAC the peer requested*/
+		sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo);
+		break;
+
+	case SCTP_PARAM_CHUNKS:
+		if (!sctp_auth_enable)
+			goto fall_through;
+
+		/* Save peer's list of chunks to authenticate */
+		asoc->peer.peer_chunks = kmemdup(param.p,
+					    ntohs(param.p->length), gfp);
+		if (!asoc->peer.peer_chunks)
+			retval = 0;
+		break;
+fall_through:
+	default:
+		/* Any unrecognized parameters should have been caught
+		 * and handled by sctp_verify_param() which should be
+		 * called prior to this routine.  Simply log the error
+		 * here.
+		 */
+		SCTP_DEBUG_PRINTK("Ignoring param: %d for association %p.\n",
+				  ntohs(param.p->type), asoc);
+		break;
+	}
+
+	return retval;
+}
+
+/* Pick a fresh, random, non-zero verification tag.
+ *
+ * The original author believed this generator to comply with RFC1750.
+ * Zero is never returned because a tag of 0 is reserved for special
+ * cases (e.g. INIT).  The ep argument is not used.
+ */
+__u32 sctp_generate_tag(const struct sctp_endpoint *ep)
+{
+	__u32 tag = 0;
+
+	/* Draw at least once, and again for as long as we hit zero. */
+	while (!tag)
+		get_random_bytes(&tag, sizeof(tag));
+
+	return tag;
+}
+
+/* Pick a random initial TSN for use during startup.  Any 32-bit value,
+ * including zero, may be returned.  The ep argument is not used.
+ */
+__u32 sctp_generate_tsn(const struct sctp_endpoint *ep)
+{
+	__u32 tsn;
+
+	get_random_bytes(&tsn, sizeof(tsn));
+
+	return tsn;
+}
+
+/*
+ * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF)
+ *      0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     | Type = 0xC1   |  Chunk Flags  |      Chunk Length             |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                       Serial Number                           |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                    Address Parameter                          |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                     ASCONF Parameter #1                       |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     \                                                               \
+ *     /                             ....                              /
+ *     \                                                               \
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                     ASCONF Parameter #N                       |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Build the front part of an ASCONF chunk: the serial number followed by
+ * the Address Parameter derived from addr.  The chunk is requested with
+ * vparam_len additional bytes for the ASCONF parameters, which are not
+ * added here.  Returns NULL when addr yields no address parameter or the
+ * chunk cannot be created; the association's serial number is advanced
+ * only when a chunk was created.
+ */
+static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
+					   union sctp_addr *addr,
+					   int vparam_len)
+{
+	struct sctp_af *family = sctp_get_af_specific(addr->v4.sin_family);
+	union sctp_addr_param addr_tlv;
+	sctp_addiphdr_t hdr;
+	struct sctp_chunk *chunk;
+	int addr_tlv_len;
+	int total;
+
+	/* Convert the address first; without it there is nothing to send. */
+	addr_tlv_len = family->to_addr_param(addr, &addr_tlv);
+	if (addr_tlv_len == 0)
+		return NULL;
+
+	total = sizeof(hdr) + vparam_len;
+	total += addr_tlv_len;
+
+	chunk = sctp_make_chunk(asoc, SCTP_CID_ASCONF, 0, total);
+	if (chunk == NULL)
+		return NULL;
+
+	hdr.serial = htonl(asoc->addip_serial++);
+
+	/* Remember where the ADDIP header and the address parameter
+	 * ended up inside the chunk.
+	 */
+	chunk->subh.addip_hdr = sctp_addto_chunk(chunk, sizeof(hdr), &hdr);
+	chunk->param_hdr.v = sctp_addto_chunk(chunk, addr_tlv_len, &addr_tlv);
+
+	return chunk;
+}
+
+/* ADDIP
+ * 3.2.1 Add IP Address
+ * 	0                   1                   2                   3
+ * 	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |        Type = 0xC001          |    Length = Variable          |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |               ASCONF-Request Correlation ID                   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                       Address Parameter                       |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * 3.2.2 Delete IP Address
+ * 	0                   1                   2                   3
+ * 	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |        Type = 0xC002          |    Length = Variable          |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |               ASCONF-Request Correlation ID                   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                       Address Parameter                       |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Build an ASCONF chunk carrying one request per entry of addrs.
+ *
+ * laddr   - address placed in the chunk's Address Parameter
+ * addrs   - addrcnt socket addresses stored back to back; each entry
+ *           is as long as its address family's sockaddr_len
+ * flags   - parameter type written into every request header
+ *           (presumably SCTP_PARAM_ADD_IP or SCTP_PARAM_DEL_IP, as in
+ *           the layouts above)
+ *
+ * The index of an address within addrs is used as its correlation id.
+ * Returns NULL when the ASCONF chunk cannot be created.
+ */
+struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
+					      union sctp_addr	      *laddr,
+					      struct sockaddr	      *addrs,
+					      int		      addrcnt,
+					      __be16		      flags)
+{
+	sctp_addip_param_t	req;
+	union sctp_addr_param	addr_tlv;
+	struct sctp_chunk	*chunk;
+	struct sctp_af		*family;
+	union sctp_addr		*cur;
+	void			*cursor;
+	int			req_len = sizeof(req);
+	int			addr_tlv_len = 0;
+	int			payload_len = 0;
+	int			idx;
+
+	/* Pass 1: add up the space needed by all requests. */
+	for (idx = 0, cursor = addrs; idx < addrcnt; idx++) {
+		cur = cursor;
+		family = sctp_get_af_specific(cur->v4.sin_family);
+		addr_tlv_len = family->to_addr_param(cur, &addr_tlv);
+
+		payload_len += req_len + addr_tlv_len;
+
+		cursor += family->sockaddr_len;
+	}
+
+	chunk = sctp_make_asconf(asoc, laddr, payload_len);
+	if (chunk == NULL)
+		return NULL;
+
+	/* Pass 2: emit a request header plus address parameter for
+	 * every address.
+	 */
+	for (idx = 0, cursor = addrs; idx < addrcnt; idx++) {
+		cur = cursor;
+		family = sctp_get_af_specific(cur->v4.sin_family);
+		addr_tlv_len = family->to_addr_param(cur, &addr_tlv);
+
+		req.param_hdr.type = flags;
+		req.param_hdr.length = htons(req_len + addr_tlv_len);
+		req.crr_id = idx;
+
+		sctp_addto_chunk(chunk, req_len, &req);
+		sctp_addto_chunk(chunk, addr_tlv_len, &addr_tlv);
+
+		cursor += family->sockaddr_len;
+	}
+
+	return chunk;
+}
+
+/* ADDIP
+ * 3.2.4 Set Primary IP Address
+ *	0                   1                   2                   3
+ *	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |        Type =0xC004           |    Length = Variable          |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |               ASCONF-Request Correlation ID                   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                       Address Parameter                       |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Build an ASCONF chunk holding a single Set Primary IP Address request
+ * for addr.  The same address is also used for the chunk's Address
+ * Parameter.  The correlation id is 0.  Returns NULL when addr yields no
+ * address parameter or the chunk cannot be created.
+ */
+struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc,
+					     union sctp_addr *addr)
+{
+	struct sctp_af		*family = sctp_get_af_specific(addr->v4.sin_family);
+	sctp_addip_param_t	req;
+	union sctp_addr_param	addr_tlv;
+	struct sctp_chunk 	*chunk;
+	int			addr_tlv_len;
+	int 			req_total = sizeof(req);
+
+	addr_tlv_len = family->to_addr_param(addr, &addr_tlv);
+	if (addr_tlv_len == 0)
+		return NULL;
+
+	/* Request header plus the address it refers to. */
+	req_total += addr_tlv_len;
+
+	/* sctp_make_asconf() supplies the chunk and the ASCONF header. */
+	chunk = sctp_make_asconf(asoc, addr, req_total);
+	if (chunk == NULL)
+		return NULL;
+
+	req.param_hdr.type = SCTP_PARAM_SET_PRIMARY;
+	req.param_hdr.length = htons(req_total);
+	req.crr_id = 0;
+
+	sctp_addto_chunk(chunk, sizeof(req), &req);
+	sctp_addto_chunk(chunk, addr_tlv_len, &addr_tlv);
+
+	return chunk;
+}
+
+/* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK)
+ *      0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     | Type = 0x80   |  Chunk Flags  |      Chunk Length             |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                       Serial Number                           |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                 ASCONF Parameter Response#1                   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     \                                                               \
+ *     /                             ....                              /
+ *     \                                                               \
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |                 ASCONF Parameter Response#N                   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Build an ASCONF_ACK chunk that carries the given serial number (passed
+ * in host byte order) and is requested with vparam_len additional bytes
+ * for the parameter responses.  Returns NULL when the chunk cannot be
+ * created.
+ */
+static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc,
+					       __u32 serial, int vparam_len)
+{
+	sctp_addiphdr_t		hdr;
+	struct sctp_chunk	*ack;
+	int			total = sizeof(hdr) + vparam_len;
+
+	ack = sctp_make_chunk(asoc, SCTP_CID_ASCONF_ACK, 0, total);
+	if (ack == NULL)
+		return NULL;
+
+	hdr.serial = htonl(serial);
+
+	/* Keep a pointer to the ADDIP header inside the chunk. */
+	ack->subh.addip_hdr = sctp_addto_chunk(ack, sizeof(hdr), &hdr);
+
+	return ack;
+}
+
+/* Append the response for one ASCONF parameter to an ASCONF_ACK chunk.
+ *
+ * chunk        - the ASCONF_ACK being built
+ * crr_id       - correlation id copied into the response
+ * err_code     - SCTP_ERROR_NO_ERROR produces a Success Indication;
+ *                any other value produces an Error Cause Indication
+ *                holding that cause
+ * asconf_param - the request being answered; when non-NULL and an error
+ *                is reported, the request TLV is echoed after the cause
+ *                header
+ */
+static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id,
+			      __be16 err_code, sctp_addip_param_t *asconf_param)
+{
+	sctp_addip_param_t 	reply;
+	sctp_errhdr_t		cause;
+	int			cause_hdr_len = 0;
+	int			echoed_len = 0;
+
+	/* Both extra lengths stay 0 for a success report. */
+	if (err_code == SCTP_ERROR_NO_ERROR) {
+		reply.param_hdr.type = SCTP_PARAM_SUCCESS_REPORT;
+	} else {
+		reply.param_hdr.type = SCTP_PARAM_ERR_CAUSE;
+		cause_hdr_len = sizeof(cause);
+		if (asconf_param)
+			echoed_len = ntohs(asconf_param->param_hdr.length);
+	}
+
+	/* Success Indication or Error Cause Indication header. */
+	reply.param_hdr.length = htons(sizeof(reply) +
+				       cause_hdr_len +
+				       echoed_len);
+	reply.crr_id = crr_id;
+	sctp_addto_chunk(chunk, sizeof(reply), &reply);
+
+	if (err_code != SCTP_ERROR_NO_ERROR) {
+		/* Error cause header ... */
+		cause.cause = err_code;
+		cause.length = htons(cause_hdr_len + echoed_len);
+		sctp_addto_chunk(chunk, cause_hdr_len, &cause);
+
+		/* ... followed by the failed TLV from the ASCONF chunk. */
+		if (asconf_param)
+			sctp_addto_chunk(chunk, echoed_len, asconf_param);
+	}
+}
+
+/* Process a single ASCONF parameter (ADD_IP, DEL_IP or SET_PRIMARY).
+ *
+ * asoc         - association the request applies to
+ * asconf       - the received ASCONF chunk that carries the parameter
+ * asconf_param - the request TLV; its address parameter is read from
+ *                directly behind the sctp_addip_param_t header
+ *
+ * Returns SCTP_ERROR_NO_ERROR when the request was applied, otherwise
+ * the error cause for this parameter (the caller passes it on to
+ * sctp_add_asconf_response()).
+ */
+static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
+				       struct sctp_chunk *asconf,
+				       sctp_addip_param_t *asconf_param)
+{
+	struct sctp_transport *peer;
+	struct sctp_af *af;
+	union sctp_addr	addr;
+	union sctp_addr_param *addr_param;
+
+	/* The address parameter follows the request header. */
+	addr_param = (union sctp_addr_param *)
+			((void *)asconf_param + sizeof(sctp_addip_param_t));
+
+	/* Only these three request types are handled here. */
+	if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP &&
+	    asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP &&
+	    asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY)
+		return SCTP_ERROR_UNKNOWN_PARAM;
+
+	/* The embedded address must be IPv4 or IPv6, and of a family the
+	 * peer is recorded as supporting.
+	 */
+	switch (addr_param->p.type) {
+	case SCTP_PARAM_IPV6_ADDRESS:
+		if (!asoc->peer.ipv6_address)
+			return SCTP_ERROR_DNS_FAILED;
+		break;
+	case SCTP_PARAM_IPV4_ADDRESS:
+		if (!asoc->peer.ipv4_address)
+			return SCTP_ERROR_DNS_FAILED;
+		break;
+	default:
+		return SCTP_ERROR_DNS_FAILED;
+	}
+
+	/* The family lookup may fail; never dereference a NULL af. */
+	af = sctp_get_af_specific(param_type2af(addr_param->p.type));
+	if (unlikely(!af))
+		return SCTP_ERROR_DNS_FAILED;
+
+	af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0);
+
+	/* ADDIP 4.2.1  This parameter MUST NOT contain a broadcast
+	 * or multicast address.
+	 * (note: wildcard is permitted and requires special handling so
+	 *  make sure we check for that)
+	 */
+	if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb))
+		return SCTP_ERROR_DNS_FAILED;
+
+	switch (asconf_param->param_hdr.type) {
+	case SCTP_PARAM_ADD_IP:
+		/* Section 4.2.1:
+		 * If the address 0.0.0.0 or ::0 is provided, the source
+		 * address of the packet MUST be added.
+		 */
+		if (af->is_any(&addr))
+			memcpy(&addr, &asconf->source, sizeof(addr));
+
+		/* ADDIP 4.3 D9) If an endpoint receives an ADD IP address
+		 * request and does not have the local resources to add this
+		 * new address to the association, it MUST return an Error
+		 * Cause TLV set to the new error code 'Operation Refused
+		 * Due to Resource Shortage'.
+		 */
+
+		peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED);
+		if (!peer)
+			return SCTP_ERROR_RSRC_LOW;
+
+		/* Start the heartbeat timer.  A transport reference is
+		 * taken when mod_timer() returns 0 (presumably meaning the
+		 * timer was not pending before -- TODO confirm).
+		 */
+		if (!mod_timer(&peer->hb_timer, sctp_transport_timeout(peer)))
+			sctp_transport_hold(peer);
+		break;
+	case SCTP_PARAM_DEL_IP:
+		/* ADDIP 4.3 D7) If a request is received to delete the
+		 * last remaining IP address of a peer endpoint, the receiver
+		 * MUST send an Error Cause TLV with the error cause set to the
+		 * new error code 'Request to Delete Last Remaining IP Address'.
+		 */
+		if (asoc->peer.transport_count == 1)
+			return SCTP_ERROR_DEL_LAST_IP;
+
+		/* ADDIP 4.3 D8) If a request is received to delete an IP
+		 * address which is also the source address of the IP packet
+		 * which contained the ASCONF chunk, the receiver MUST reject
+		 * this request. To reject the request the receiver MUST send
+		 * an Error Cause TLV set to the new error code 'Request to
+		 * Delete Source IP Address'
+		 */
+		if (sctp_cmp_addr_exact(sctp_source(asconf), &addr))
+			return SCTP_ERROR_DEL_SRC_IP;
+
+		/* Section 4.2.2
+		 * If the address 0.0.0.0 or ::0 is provided, all
+		 * addresses of the peer except	the source address of the
+		 * packet MUST be deleted.
+		 *
+		 * This is done by making the transport the chunk arrived
+		 * on the primary and deleting every other peer.
+		 */
+		if (af->is_any(&addr)) {
+			sctp_assoc_set_primary(asoc, asconf->transport);
+			sctp_assoc_del_nonprimary_peers(asoc,
+							asconf->transport);
+		} else
+			sctp_assoc_del_peer(asoc, &addr);
+		break;
+	case SCTP_PARAM_SET_PRIMARY:
+		/* ADDIP Section 4.2.4
+		 * If the address 0.0.0.0 or ::0 is provided, the receiver
+		 * MAY mark the source address of the packet as its
+		 * primary.
+		 *
+		 * The copy below covers the whole union: &addr.v4 is the
+		 * address of a union member and therefore of addr itself.
+		 */
+		if (af->is_any(&addr))
+			memcpy(&addr.v4, sctp_source(asconf), sizeof(addr));
+
+		/* The new primary must already be one of the peer's
+		 * transports.
+		 */
+		peer = sctp_assoc_lookup_paddr(asoc, &addr);
+		if (!peer)
+			return SCTP_ERROR_DNS_FAILED;
+
+		sctp_assoc_set_primary(asoc, peer);
+		break;
+	}
+
+	return SCTP_ERROR_NO_ERROR;
+}
+
+/* Check the parameter layout of an ASCONF packet before it is processed.
+ *
+ * Walks the parameters from param_hdr up to chunk_end.  Every parameter
+ * must fit into the remaining space and be at least as long as a
+ * parameter header; ADD_IP, DEL_IP and SET_PRIMARY must additionally be
+ * long enough for the request header plus one more parameter header,
+ * and SUCCESS_REPORT and ADAPTATION_LAYER_IND must be exactly one
+ * request header long.  The walk has to end exactly at chunk_end.
+ *
+ * Returns 1 if the layout is acceptable, else 0.  *errp is set to each
+ * parameter as it is examined, so on failure inside the walk it refers
+ * to the offending parameter.
+ */
+int sctp_verify_asconf(const struct sctp_association *asoc,
+		       struct sctp_paramhdr *param_hdr, void *chunk_end,
+		       struct sctp_paramhdr **errp)
+{
+	union sctp_params cur;
+	sctp_addip_param_t *req;
+	int len, req_len;
+
+	for (cur.v = (sctp_paramhdr_t *) param_hdr;
+	     cur.v <= chunk_end - sizeof(sctp_paramhdr_t);
+	     cur.v += WORD_ROUND(len)) {
+		len = ntohs(cur.p->length);
+		*errp = cur.p;
+
+		/* The parameter must lie inside the chunk and cover at
+		 * least its own header.
+		 */
+		if (cur.v > chunk_end - len ||
+		    len < sizeof(sctp_paramhdr_t))
+			return 0;
+
+		if (cur.p->type == SCTP_PARAM_ADD_IP ||
+		    cur.p->type == SCTP_PARAM_DEL_IP ||
+		    cur.p->type == SCTP_PARAM_SET_PRIMARY) {
+			req = (sctp_addip_param_t *)cur.v;
+			req_len = ntohs(req->param_hdr.length);
+			if (req_len < sizeof(sctp_addip_param_t) +
+			    sizeof(sctp_paramhdr_t))
+				return 0;
+		} else if (cur.p->type == SCTP_PARAM_SUCCESS_REPORT ||
+			   cur.p->type == SCTP_PARAM_ADAPTATION_LAYER_IND) {
+			if (len != sizeof(sctp_addip_param_t))
+				return 0;
+		}
+		/* All other types pass with the generic checks above. */
+	}
+
+	/* Leftover bytes that cannot hold a header are an error. */
+	return cur.v == chunk_end;
+}
+
+/* Process an incoming ASCONF chunk with the next expected serial no. and
+ * return an ASCONF_ACK chunk to be sent in response.
+ *
+ * asoc->peer.addip_serial is advanced on every call, including when the
+ * ASCONF_ACK could not be created.  When an ASCONF_ACK is produced, an
+ * extra reference is taken on it and it is queued on
+ * asoc->asconf_ack_list before being returned.  Returns NULL if the
+ * ASCONF_ACK could not be created.
+ */
+struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
+				       struct sctp_chunk *asconf)
+{
+	sctp_addiphdr_t		*hdr;
+	union sctp_addr_param	*addr_param;
+	sctp_addip_param_t	*asconf_param;
+	struct sctp_chunk	*asconf_ack;
+
+	__be16	err_code;
+	int	length = 0;
+	int	chunk_len;
+	__u32	serial;
+	int	all_param_pass = 1;
+
+	chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(sctp_chunkhdr_t);
+	hdr = (sctp_addiphdr_t *)asconf->skb->data;
+	serial = ntohl(hdr->serial);
+
+	/* Skip the addiphdr and store a pointer to address parameter.  */
+	length = sizeof(sctp_addiphdr_t);
+	addr_param = (union sctp_addr_param *)(asconf->skb->data + length);
+	chunk_len -= length;
+
+	/* Skip the address parameter and store a pointer to the first
+	 * asconf parameter.
+	 */
+	length = ntohs(addr_param->p.length);
+	asconf_param = (sctp_addip_param_t *)((void *)addr_param + length);
+	chunk_len -= length;
+
+	/* create an ASCONF_ACK chunk.
+	 * Based on the definitions of parameters, we know that the size of
+	 * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF
+	 * parameters.
+	 */
+	asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4);
+	if (!asconf_ack)
+		goto done;
+
+	/* Process the TLVs contained within the ASCONF chunk. */
+	while (chunk_len > 0) {
+		err_code = sctp_process_asconf_param(asoc, asconf,
+						     asconf_param);
+		/* ADDIP 4.1 A7)
+		 * If an error response is received for a TLV parameter,
+		 * all TLVs with no response before the failed TLV are
+		 * considered successful if not reported.  All TLVs after
+		 * the failed response are considered unsuccessful unless
+		 * a specific success indication is present for the parameter.
+		 */
+		if (SCTP_ERROR_NO_ERROR != err_code)
+			all_param_pass = 0;
+
+		if (!all_param_pass)
+			sctp_add_asconf_response(asconf_ack,
+						 asconf_param->crr_id, err_code,
+						 asconf_param);
+
+		/* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add
+		 * an IP address sends an 'Out of Resource' in its response, it
+		 * MUST also fail any subsequent add or delete requests bundled
+		 * in the ASCONF.
+		 */
+		if (SCTP_ERROR_RSRC_LOW == err_code)
+			goto done;
+
+		/* Move to the next ASCONF param.  sctp_verify_asconf() steps
+		 * over each parameter by its WORD_ROUND()ed length; use the
+		 * same stride here so that the parameters we act on start at
+		 * exactly the boundaries that were validated, even when a
+		 * parameter's length field is not a multiple of the padding
+		 * unit.
+		 */
+		length = WORD_ROUND(ntohs(asconf_param->param_hdr.length));
+		asconf_param = (sctp_addip_param_t *)((void *)asconf_param +
+						      length);
+		chunk_len -= length;
+	}
+
+done:
+	asoc->peer.addip_serial++;
+
+	/* If we are sending a new ASCONF_ACK hold a reference to it in assoc
+	 * after freeing the reference to old asconf ack if any.
+	 */
+	if (asconf_ack) {
+		sctp_chunk_hold(asconf_ack);
+		list_add_tail(&asconf_ack->transmitted_list,
+			      &asoc->asconf_ack_list);
+	}
+
+	return asconf_ack;
+}
+
+/* Apply the local side effects of an ASCONF parameter that was
+ * successfully acked.
+ *
+ * The address that follows the ASCONF parameter header is decoded and then,
+ * depending on the parameter type:
+ *   ADD_IP - every exactly-matching entry in the association's bind
+ *            address list is switched to SCTP_ADDR_SRC;
+ *   DEL_IP - the address is removed from the bind address list.
+ * In both cases the cached dst of every peer transport is released and
+ * cleared.  Any other parameter type is left alone.
+ */
+static void sctp_asconf_param_success(struct sctp_association *asoc,
+				     sctp_addip_param_t *asconf_param)
+{
+	struct sctp_bind_addr *bind = &asoc->base.bind_addr;
+	union sctp_addr_param *raw_addr;
+	struct sctp_sockaddr_entry *entry;
+	struct sctp_transport *trans;
+	union sctp_addr	addr;
+	struct sctp_af *af;
+
+	raw_addr = (union sctp_addr_param *)
+			((void *)asconf_param + sizeof(sctp_addip_param_t));
+
+	/* We have checked the packet before, so we do not check again.	*/
+	af = sctp_get_af_specific(param_type2af(raw_addr->p.type));
+	af->from_addr_param(&addr, raw_addr, htons(bind->port), 0);
+
+	switch (asconf_param->param_hdr.type) {
+	case SCTP_PARAM_ADD_IP:
+		/* This is always done in BH context with a socket lock
+		 * held, so the list can not change.
+		 */
+		local_bh_disable();
+		list_for_each_entry(entry, &bind->address_list, list) {
+			if (sctp_cmp_addr_exact(&entry->a, &addr))
+				entry->state = SCTP_ADDR_SRC;
+		}
+		local_bh_enable();
+		break;
+	case SCTP_PARAM_DEL_IP:
+		local_bh_disable();
+		sctp_del_bind_addr(bind, &addr);
+		local_bh_enable();
+		break;
+	default:
+		/* No local state depends on any other parameter type. */
+		return;
+	}
+
+	/* Shared by ADD_IP and DEL_IP: forget the cached dst of every
+	 * peer transport.
+	 */
+	list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+			transports) {
+		dst_release(trans->dst);
+		trans->dst = NULL;
+	}
+}
+
+/* Look up, in an ASCONF_ACK chunk, the response that carries the same
+ * correlation id as the given asconf parameter and return its error code.
+ *
+ * A SUCCESS_REPORT yields SCTP_ERROR_NO_ERROR.  An ERR_CAUSE yields the
+ * cause found right behind the response header, or SCTP_ERROR_INV_PARAM
+ * when nothing is left in the chunk at that point.  Any other response
+ * type yields SCTP_ERROR_INV_PARAM.
+ *
+ * If there is no response for this parameter, the result depends on
+ * 'no_err': SCTP_ERROR_NO_ERROR when it is set, SCTP_ERROR_REQ_REFUSED
+ * otherwise.
+ * ADDIP 4.1
+ * A7) If an error response is received for a TLV parameter, all TLVs with no
+ * response before the failed TLV are considered successful if not reported.
+ * All TLVs after the failed response are considered unsuccessful unless a
+ * specific success indication is present for the parameter.
+ */
+static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack,
+				      sctp_addip_param_t *asconf_param,
+				      int no_err)
+{
+	sctp_addip_param_t	*ack_param;
+	sctp_errhdr_t		*cause;
+	int			step;
+	int			remaining;
+
+	remaining = ntohs(asconf_ack->chunk_hdr->length) -
+			sizeof(sctp_chunkhdr_t);
+
+	/* Skip the addiphdr from the asconf_ack chunk and store a pointer to
+	 * the first asconf_ack parameter.
+	 */
+	step = sizeof(sctp_addiphdr_t);
+	ack_param = (sctp_addip_param_t *)(asconf_ack->skb->data + step);
+	remaining -= step;
+
+	while (remaining > 0) {
+		/* Not ours: step over this response and keep looking. */
+		if (ack_param->crr_id != asconf_param->crr_id) {
+			step = ntohs(ack_param->param_hdr.length);
+			ack_param = (sctp_addip_param_t *)
+					((void *)ack_param + step);
+			remaining -= step;
+			continue;
+		}
+
+		if (ack_param->param_hdr.type == SCTP_PARAM_SUCCESS_REPORT)
+			return SCTP_ERROR_NO_ERROR;
+		if (ack_param->param_hdr.type != SCTP_PARAM_ERR_CAUSE)
+			return SCTP_ERROR_INV_PARAM;
+
+		/* The error cause sits right behind the response header. */
+		step = sizeof(sctp_addip_param_t);
+		cause = (sctp_errhdr_t *)((void *)ack_param + step);
+		remaining -= step;
+		if (remaining > 0)
+			return cause->cause;
+		return SCTP_ERROR_INV_PARAM;
+	}
+
+	return no_err ? SCTP_ERROR_NO_ERROR : SCTP_ERROR_REQ_REFUSED;
+}
+
+/* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk.
+ *
+ * Every parameter of the last ASCONF we sent (asoc->addip_last_asconf) is
+ * matched with its response and acted upon.  Afterwards the cached ASCONF
+ * is unlinked, freed and the pointer in the association is cleared.
+ *
+ * Returns 1 if at least one parameter was answered with
+ * SCTP_ERROR_RSRC_LOW, 0 otherwise.
+ */
+int sctp_process_asconf_ack(struct sctp_association *asoc,
+			    struct sctp_chunk *asconf_ack)
+{
+	struct sctp_chunk	*last = asoc->addip_last_asconf;
+	union sctp_addr_param	*addr_param;
+	sctp_addip_param_t	*cur;
+	int	step;
+	int	remaining = last->skb->len;
+	int	all_param_pass;
+	int	no_err = 1;
+	int	retval = 0;
+	__be16	err_code;
+
+	/* Skip the chunkhdr and addiphdr from the last asconf sent and store
+	 * a pointer to address parameter.
+	 */
+	step = sizeof(sctp_addip_chunk_t);
+	addr_param = (union sctp_addr_param *)(last->skb->data + step);
+	remaining -= step;
+
+	/* Skip the address parameter in the last asconf sent and store a
+	 * pointer to the first asconf parameter.
+	 */
+	step = ntohs(addr_param->p.length);
+	cur = (sctp_addip_param_t *)((void *)addr_param + step);
+	remaining -= step;
+
+	/* ADDIP 4.1
+	 * A8) If there is no response(s) to specific TLV parameter(s), and no
+	 * failures are indicated, then all request(s) are considered
+	 * successful.
+	 */
+	all_param_pass = (asconf_ack->skb->len == sizeof(sctp_addiphdr_t));
+
+	/* Process the TLVs contained in the last sent ASCONF chunk. */
+	while (remaining > 0) {
+		err_code = SCTP_ERROR_NO_ERROR;
+		if (!all_param_pass) {
+			err_code = sctp_get_asconf_response(asconf_ack, cur,
+							    no_err);
+			/* Once an error has been seen, parameters without
+			 * a response are no longer presumed successful.
+			 */
+			if (err_code != SCTP_ERROR_NO_ERROR)
+				no_err = 0;
+		}
+
+		switch (err_code) {
+		case SCTP_ERROR_NO_ERROR:
+			sctp_asconf_param_success(asoc, cur);
+			break;
+
+		case SCTP_ERROR_RSRC_LOW:
+			retval = 1;
+			break;
+
+		case SCTP_ERROR_UNKNOWN_PARAM:
+			/* Disable sending this type of asconf parameter in
+			 * future.
+			 */
+			asoc->peer.addip_disabled_mask |= cur->param_hdr.type;
+			break;
+
+		default:
+			/* Every other error code needs no local action. */
+			break;
+		}
+
+		/* Skip the processed asconf parameter and move to the next
+		 * one.
+		 */
+		step = ntohs(cur->param_hdr.length);
+		cur = (sctp_addip_param_t *)((void *)cur + step);
+		remaining -= step;
+	}
+
+	/* Free the cached last sent asconf chunk. */
+	list_del_init(&last->transmitted_list);
+	sctp_chunk_free(last);
+	asoc->addip_last_asconf = NULL;
+
+	return retval;
+}
+
+/* Make a FWD TSN chunk.
+ *
+ * The chunk carries new_cum_tsn (converted to network byte order) followed
+ * by one stream/ssn entry for each of the first nstreams elements of
+ * skiplist.  Returns NULL if the chunk cannot be created.
+ */
+struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
+				    __u32 new_cum_tsn, size_t nstreams,
+				    struct sctp_fwdtsn_skip *skiplist)
+{
+	struct sctp_fwdtsn_hdr hdr;
+	struct sctp_fwdtsn_skip entry;
+	struct sctp_chunk *chunk;
+	size_t payload_hint = (nstreams + 1) * sizeof(__u32);
+	int i;
+
+	chunk = sctp_make_chunk(asoc, SCTP_CID_FWD_TSN, 0, payload_hint);
+	if (!chunk)
+		return NULL;
+
+	/* Header first; remember where it landed inside the chunk. */
+	hdr.new_cum_tsn = htonl(new_cum_tsn);
+	chunk->subh.fwdtsn_hdr = sctp_addto_chunk(chunk, sizeof(hdr), &hdr);
+
+	/* Then the skip entries, copied over field by field. */
+	for (i = 0; i < nstreams; i++) {
+		entry.stream = skiplist[i].stream;
+		entry.ssn = skiplist[i].ssn;
+		sctp_addto_chunk(chunk, sizeof(entry), &entry);
+	}
+
+	return chunk;
+}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
new file mode 100644
index 00000000..6e0f8829
--- /dev/null
+++ b/net/sctp/sm_sideeffect.c
@@ -0,0 +1,1717 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions work with the state functions in sctp_sm_statefuns.c
+ * to implement the state operations.  These functions implement the
+ * steps which require modifying existing data structures.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@austin.ibm.com>
+ *    Hui Huang		    <hui.huang@nokia.com>
+ *    Dajiang Zhang	    <dajiang.zhang@nokia.com>
+ *    Daisy Chang	    <daisyc@us.ibm.com>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations of two file-local routines that are defined
+ * elsewhere in this file.  Both take the same argument list: the event
+ * type and subtype, the association state, the endpoint and association,
+ * the opaque event argument, a disposition, the command sequence and the
+ * allocation flags.
+ */
+static int sctp_cmd_interpreter(sctp_event_t event_type,
+				sctp_subtype_t subtype,
+				sctp_state_t state,
+				struct sctp_endpoint *ep,
+				struct sctp_association *asoc,
+				void *event_arg,
+				sctp_disposition_t status,
+				sctp_cmd_seq_t *commands,
+				gfp_t gfp);
+static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
+			     sctp_state_t state,
+			     struct sctp_endpoint *ep,
+			     struct sctp_association *asoc,
+			     void *event_arg,
+			     sctp_disposition_t status,
+			     sctp_cmd_seq_t *commands,
+			     gfp_t gfp);
+
+/********************************************************************
+ * Helper functions
+ ********************************************************************/
+
+/* A helper function for delayed processing of INET ECN CE bit.
+ * Flags the association as needing an ECNE and records lowest_tsn in it.
+ */
+static void sctp_do_ecn_ce_work(struct sctp_association *asoc,
+				__u32 lowest_tsn)
+{
+	asoc->need_ecne = 1;
+
+	/* Save the TSN away for comparison when we receive CWR */
+	asoc->last_ecne_tsn = lowest_tsn;
+}
+
+/* Helper function for delayed processing of SCTP ECNE chunk.  */
+/* RFC 2960 Appendix A
+ *
+ * RFC 2481 details a specific bit for a sender to send in
+ * the header of its next outbound TCP segment to indicate to
+ * its peer that it has reduced its congestion window.  This
+ * is termed the CWR bit.  For SCTP the same indication is made
+ * by including the CWR chunk.  This chunk contains one data
+ * element, i.e. the TSN number that was sent in the ECNE chunk.
+ * This element represents the lowest TSN number in the datagram
+ * that was originally marked with the CE bit.
+ *
+ * Returns the CWR chunk to send back, or whatever sctp_make_cwr() hands
+ * back when it cannot build one.
+ */
+static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc,
+					   __u32 lowest_tsn,
+					   struct sctp_chunk *chunk)
+{
+	struct sctp_transport *congested;
+
+	/* Our previously transmitted packet ran into some congestion
+	 * so we should take action by reducing cwnd and ssthresh
+	 * and then ACK our peer that we've done so by sending a CWR.
+	 *
+	 * Only lower the congestion variables if the ECNE looks more
+	 * recent than the last response.
+	 */
+	if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) {
+		/* Find which transport's congestion variables need to be
+		 * adjusted, and adjust them if there is one.
+		 */
+		congested = sctp_assoc_lookup_tsn(asoc, lowest_tsn);
+		if (congested)
+			sctp_transport_lower_cwnd(congested,
+						  SCTP_LOWER_CWND_ECNE);
+
+		asoc->last_cwr_tsn = lowest_tsn;
+	}
+
+	/* Always try to quiet the other end.  In case of lost CWR,
+	 * resend last_cwr_tsn.
+	 *
+	 * If we run out of memory, it will look like a lost CWR.  We'll
+	 * get back in sync eventually.
+	 */
+	return sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk);
+}
+
+/* Helper function to do delayed processing of ECN CWR chunk.
+ * Clears the association's need_ecne flag; lowest_tsn is accepted but not
+ * used.
+ */
+static void sctp_do_ecn_cwr_work(struct sctp_association *asoc,
+				 __u32 lowest_tsn)
+{
+	/* Turn off ECNE getting auto-prepended to every outgoing
+	 * packet
+	 */
+	asoc->need_ecne = 0;
+}
+
+/* Generate SACK if necessary.  We call this at the end of a packet.
+ *
+ * Either a SACK is built and queued as a reply (and the SACK timer is
+ * stopped), or the packet is merely counted and the SACK timer is
+ * restarted with the delay of the transport data was last received from,
+ * falling back to the association's delay when there is no such transport.
+ *
+ * Returns 0 on success and -ENOMEM when the SACK cannot be built.
+ */
+static int sctp_gen_sack(struct sctp_association *asoc, int force,
+			 sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *trans = asoc->peer.last_data_from;
+	struct sctp_chunk *sack;
+	__u32 ctsn, max_tsn_seen;
+
+	/* A forced SACK, or SACK delay being disabled on whichever of
+	 * transport/association applies, means: SACK right away.
+	 */
+	if (force ||
+	    ((trans ? trans->param_flags : asoc->param_flags) &
+	     SPP_SACKDELAY_DISABLE))
+		asoc->peer.sack_needed = 1;
+
+	ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
+	max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
+
+	/* From 12.2 Parameters necessary per association (i.e. the TCB):
+	 *
+	 * Ack State : This flag indicates if the next received packet
+	 * 	     : is to be responded to with a SACK. ...
+	 *	     : When DATA chunks are out of order, SACK's
+	 *           : are not delayed (see Section 6).
+	 *
+	 * [This is actually not mentioned in Section 6, but we
+	 * implement it here anyway. --piggy]
+	 */
+	if (max_tsn_seen != ctsn)
+		asoc->peer.sack_needed = 1;
+
+	if (asoc->peer.sack_needed) {
+		/* Send the SACK now. */
+		asoc->a_rwnd = asoc->rwnd;
+		sack = sctp_make_sack(asoc);
+		if (!sack)
+			return -ENOMEM;
+
+		asoc->peer.sack_needed = 0;
+		asoc->peer.sack_cnt = 0;
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack));
+
+		/* Stop the SACK timer.  */
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+		return 0;
+	}
+
+	/* From 6.2  Acknowledgement on Reception of DATA Chunks:
+	 *
+	 * Section 4.2 of [RFC2581] SHOULD be followed. Specifically,
+	 * an acknowledgement SHOULD be generated for at least every
+	 * second packet (not every second DATA chunk) received, and
+	 * SHOULD be generated within 200 ms of the arrival of any
+	 * unacknowledged DATA chunk. ...
+	 */
+	asoc->peer.sack_cnt++;
+
+	/* Set the SACK delay timeout based on the SACK delay for the last
+	 * transport data was received from, or the default for the
+	 * association.  Reaching the SACK frequency means that we will
+	 * need a SACK for the next packet.
+	 */
+	if (trans) {
+		if (asoc->peer.sack_cnt >= trans->sackfreq - 1)
+			asoc->peer.sack_needed = 1;
+
+		asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = trans->sackdelay;
+	} else {
+		if (asoc->peer.sack_cnt >= asoc->sackfreq - 1)
+			asoc->peer.sack_needed = 1;
+
+		asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
+	}
+
+	/* Restart the SACK timer. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+
+	return 0;
+}
+
+/* When the T3-RTX timer expires, it calls this function to create the
+ * relevant state machine event.
+ *
+ * 'peer' is the transport the timer belongs to, passed as an unsigned long.
+ * The transport reference is dropped on the way out in every case.
+ */
+void sctp_generate_t3_rtx_event(unsigned long peer)
+{
+	struct sctp_transport *transport = (struct sctp_transport *) peer;
+	struct sctp_association *asoc = transport->asoc;
+	int error;
+
+	sctp_bh_lock_sock(asoc->base.sk);
+
+	if (sock_owned_by_user(asoc->base.sk)) {
+		/* A task is in the sock.  */
+		SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__);
+
+		/* Try again later; take a reference for the re-armed timer
+		 * when mod_timer() returns 0.
+		 */
+		if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20)))
+			sctp_transport_hold(transport);
+	} else if (!transport->dead) {
+		/* The transport is not merely waiting around for the timer
+		 * to let go of the reference: run through the state machine.
+		 */
+		error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT,
+				   SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX),
+				   asoc->state,
+				   asoc->ep, asoc,
+				   transport, GFP_ATOMIC);
+
+		if (error)
+			asoc->base.sk->sk_err = -error;
+	}
+
+	sctp_bh_unlock_sock(asoc->base.sk);
+	sctp_transport_put(transport);
+}
+
+/* Common interface for producing timeout events.  It works for timeouts
+ * which use the association as their parameter.
+ *
+ * The association reference is dropped on the way out in every case.
+ */
+static void sctp_generate_timeout_event(struct sctp_association *asoc,
+					sctp_event_timeout_t timeout_type)
+{
+	int error;
+
+	sctp_bh_lock_sock(asoc->base.sk);
+
+	if (sock_owned_by_user(asoc->base.sk)) {
+		SCTP_DEBUG_PRINTK("%s:Sock is busy: timer %d\n",
+				  __func__,
+				  timeout_type);
+
+		/* Try again later; take a reference for the re-armed timer
+		 * when mod_timer() returns 0.
+		 */
+		if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20)))
+			sctp_association_hold(asoc);
+	} else if (!asoc->base.dead) {
+		/* The association is not merely waiting around for the
+		 * timer to let go of the reference: run through the state
+		 * machine.
+		 */
+		error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT,
+				   SCTP_ST_TIMEOUT(timeout_type),
+				   asoc->state, asoc->ep, asoc,
+				   (void *)timeout_type, GFP_ATOMIC);
+
+		if (error)
+			asoc->base.sk->sk_err = -error;
+	}
+
+	sctp_bh_unlock_sock(asoc->base.sk);
+	sctp_association_put(asoc);
+}
+
+/* T1-cookie timer callback; 'data' is the association. */
+static void sctp_generate_t1_cookie_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *) data,
+				    SCTP_EVENT_TIMEOUT_T1_COOKIE);
+}
+
+/* T1-init timer callback; 'data' is the association. */
+static void sctp_generate_t1_init_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *) data,
+				    SCTP_EVENT_TIMEOUT_T1_INIT);
+}
+
+/* T2-shutdown timer callback; 'data' is the association. */
+static void sctp_generate_t2_shutdown_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *) data,
+				    SCTP_EVENT_TIMEOUT_T2_SHUTDOWN);
+}
+
+/* T4-rto timer callback; 'data' is the association. */
+static void sctp_generate_t4_rto_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *) data,
+				    SCTP_EVENT_TIMEOUT_T4_RTO);
+}
+
+/* T5-shutdown-guard timer callback; 'data' is the association. */
+static void sctp_generate_t5_shutdown_guard_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *)data,
+				    SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD);
+}
+
+/* Autoclose timer callback; 'data' is the association. */
+static void sctp_generate_autoclose_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *) data,
+				    SCTP_EVENT_TIMEOUT_AUTOCLOSE);
+}
+
+/* Generate a heart beat event.  If the sock is busy, reschedule.   Make
+ * sure that the transport is still valid.
+ *
+ * 'data' is the transport the heartbeat timer belongs to.  The transport
+ * reference is dropped on the way out in every case.
+ */
+void sctp_generate_heartbeat_event(unsigned long data)
+{
+	struct sctp_transport *transport = (struct sctp_transport *) data;
+	struct sctp_association *asoc = transport->asoc;
+	int error;
+
+	sctp_bh_lock_sock(asoc->base.sk);
+
+	if (sock_owned_by_user(asoc->base.sk)) {
+		SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__);
+
+		/* Try again later; take a reference for the re-armed timer
+		 * when mod_timer() returns 0.
+		 */
+		if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20)))
+			sctp_transport_hold(transport);
+	} else if (!transport->dead) {
+		/* The structure is not just waiting around for us to
+		 * actually get destroyed: run the state machine.
+		 */
+		error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT,
+				   SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT),
+				   asoc->state, asoc->ep, asoc,
+				   transport, GFP_ATOMIC);
+
+		if (error)
+			asoc->base.sk->sk_err = -error;
+	}
+
+	sctp_bh_unlock_sock(asoc->base.sk);
+	sctp_transport_put(transport);
+}
+
+/* Handle the timeout of the ICMP protocol unreachable timer.  Trigger
+ * the correct state machine transition that will close the association.
+ *
+ * 'data' is the transport that owns the timer.  Note that the reference
+ * taken on reschedule and dropped on the way out is on the association,
+ * not on the transport.
+ */
+void sctp_generate_proto_unreach_event(unsigned long data)
+{
+	struct sctp_transport *transport = (struct sctp_transport *) data;
+	struct sctp_association *asoc = transport->asoc;
+
+	sctp_bh_lock_sock(asoc->base.sk);
+
+	if (sock_owned_by_user(asoc->base.sk)) {
+		SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__);
+
+		/* Try again later; take a reference for the re-armed timer
+		 * when mod_timer() returns 0.
+		 */
+		if (!mod_timer(&transport->proto_unreach_timer,
+				jiffies + (HZ/20)))
+			sctp_association_hold(asoc);
+	} else if (!asoc->base.dead) {
+		/* The structure is not just waiting around for us to
+		 * actually get destroyed.  The state machine result is
+		 * not inspected here.
+		 */
+		sctp_do_sm(SCTP_EVENT_T_OTHER,
+			   SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
+			   asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC);
+	}
+
+	sctp_bh_unlock_sock(asoc->base.sk);
+	sctp_association_put(asoc);
+}
+
+
+/* Inject a SACK Timeout event into the state machine; 'data' is the
+ * association.
+ */
+static void sctp_generate_sack_event(unsigned long data)
+{
+	sctp_generate_timeout_event((struct sctp_association *) data,
+				    SCTP_EVENT_TIMEOUT_SACK);
+}
+
+/* Timer callbacks, one slot per timeout type (SCTP_NUM_TIMEOUT_TYPES
+ * slots).  Every non-NULL handler listed here takes the association as
+ * its argument and funnels into sctp_generate_timeout_event().
+ *
+ * NOTE(review): the slot order presumably follows sctp_event_timeout_t,
+ * with the NULL slots belonging to timeout types that have no handler in
+ * this table (T3-RTX and HEARTBEAT have their own per-transport handlers,
+ * sctp_generate_t3_rtx_event() and sctp_generate_heartbeat_event()) -
+ * confirm against the enum definition.
+ */
+sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
+	NULL,
+	sctp_generate_t1_cookie_event,
+	sctp_generate_t1_init_event,
+	sctp_generate_t2_shutdown_event,
+	NULL,
+	sctp_generate_t4_rto_event,
+	sctp_generate_t5_shutdown_guard_event,
+	NULL,
+	sctp_generate_sack_event,
+	sctp_generate_autoclose_event,
+};
+
+
+/* RFC 2960 8.2 Path Failure Detection
+ *
+ * When its peer endpoint is multi-homed, an endpoint should keep a
+ * error counter for each of the destination transport addresses of the
+ * peer endpoint.
+ *
+ * Each time the T3-rtx timer expires on any address, or when a
+ * HEARTBEAT sent to an idle address is not acknowledged within a RTO,
+ * the error counter of that destination address will be incremented.
+ * When the value in the error counter exceeds the protocol parameter
+ * 'Path.Max.Retrans' of that destination address, the endpoint should
+ * mark the destination transport address as inactive, and a
+ * notification SHOULD be sent to the upper layer.
+ *
+ * 'is_hb' is non-zero when the expired timer was a HEARTBEAT timer.
+ */
+static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+					 struct sctp_transport *transport,
+					 int is_hb)
+{
+	/* The check for association's overall error counter exceeding the
+	 * threshold is done in the state function.
+	 *
+	 * We are here due to a timer expiration.  A non-HEARTBEAT timer
+	 * always counts; a HEARTBEAT timer only counts when a previous
+	 * HEARTBEAT is still outstanding.  Additionally, some transport
+	 * states inhibit error increments: UNCONFIRMED keeps a HEARTBEAT
+	 * strike off the association's counter and INACTIVE keeps any
+	 * strike off the transport's counter.
+	 */
+	if (!is_hb || transport->hb_sent) {
+		if (!is_hb || transport->state != SCTP_UNCONFIRMED)
+			asoc->overall_error_count++;
+		if (transport->state != SCTP_INACTIVE)
+			transport->error_count++;
+	}
+
+	/* Too many strikes on a transport that is not inactive yet. */
+	if (transport->state != SCTP_INACTIVE &&
+	    (transport->error_count > transport->pathmaxrxt)) {
+		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
+					 " transport IP: port:%d failed.\n",
+					 asoc,
+					 (&transport->ipaddr),
+					 ntohs(transport->ipaddr.v4.sin_port));
+		sctp_assoc_control_transport(asoc, transport,
+					     SCTP_TRANSPORT_DOWN,
+					     SCTP_FAILED_THRESHOLD);
+	}
+
+	/* E2) For the destination address for which the timer
+	 * expires, set RTO <- RTO * 2 ("back off the timer").  The
+	 * maximum value discussed in rule C7 above (RTO.max) may be
+	 * used to provide an upper bound to this doubling operation.
+	 *
+	 * Special Case:  the first HB doesn't trigger exponential backoff.
+	 * The first unacknowledged HB triggers it.  We do this with a flag
+	 * that indicates that we have an outstanding HB.
+	 */
+	if (!is_hb || transport->hb_sent)
+		transport->rto = min((transport->rto * 2), transport->asoc->rto_max);
+}
+
+/* Worker routine to handle INIT command failure.
+ *
+ * Queues, in this order: a CANT_STR_ASSOC notification for the ULP (only
+ * if the event could be built), the transition to CLOSED and the deletion
+ * of the TCB.  'error' is also recorded in the association's outqueue.
+ */
+static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands,
+				 struct sctp_association *asoc,
+				 unsigned error)
+{
+	struct sctp_ulpevent *notice;
+
+	notice = sctp_ulpevent_make_assoc_change(asoc,0, SCTP_CANT_STR_ASSOC,
+						 (__u16)error, 0, 0, NULL,
+						 GFP_ATOMIC);
+	if (notice)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(notice));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+
+	/* SEND_FAILED sent later when cleaning up the association. */
+	asoc->outqueue.error = error;
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+}
+
+/* Worker routine to handle SCTP_CMD_ASSOC_FAILED.
+ *
+ * Aborts any partial delivery, then queues, in this order: a COMM_LOST
+ * notification for the ULP (only if the event could be built), the
+ * transition to CLOSED and the deletion of the TCB.  'error' is also
+ * recorded in the association's outqueue.
+ */
+static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
+				  struct sctp_association *asoc,
+				  sctp_event_t event_type,
+				  sctp_subtype_t subtype,
+				  struct sctp_chunk *chunk,
+				  unsigned error)
+{
+	struct sctp_ulpevent *notice;
+	struct sctp_chunk *abort_chunk = NULL;
+
+	/* Cancel any partial delivery in progress. */
+	sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+
+	/* The chunk is only handed to the notification when the failure
+	 * was caused by a received ABORT chunk.
+	 */
+	if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
+		abort_chunk = chunk;
+
+	notice = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
+						 (__u16)error, 0, 0,
+						 abort_chunk, GFP_ATOMIC);
+	if (notice)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(notice));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+
+	/* SEND_FAILED sent later when cleaning up the association. */
+	asoc->outqueue.error = error;
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+}
+
+/* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT
+ * inside the cookie).  In reality, this is only used for INIT-ACK processing
+ * since all other cases use "temporary" associations and can do all
+ * their work in statefuns directly.
+ *
+ * Returns 0 on success and -ENOMEM when sctp_process_init() fails.
+ */
+static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
+				 struct sctp_association *asoc,
+				 struct sctp_chunk *chunk,
+				 sctp_init_chunk_t *peer_init,
+				 gfp_t gfp)
+{
+	/* We only process the init as a sideeffect in a single
+	 * case.   This is when we process the INIT-ACK.   If we
+	 * fail during INIT processing (due to malloc problems),
+	 * just return the error and stop processing the stack.
+	 */
+	if (sctp_process_init(asoc, chunk, sctp_source(chunk), peer_init, gfp))
+		return 0;
+
+	return -ENOMEM;
+}
+
+/* Helper function to break out starting up of heartbeat timers.
+ * 'cmds' is not used.
+ */
+static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds,
+				     struct sctp_association *asoc)
+{
+	struct sctp_transport *trans;
+
+	/* Start a heartbeat timer for each transport on the association.
+	 * Hold a reference on the transport, whenever mod_timer() returns
+	 * 0, to make sure none of the needed data structures go away.
+	 */
+	list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) {
+		if (mod_timer(&trans->hb_timer, sctp_transport_timeout(trans)))
+			continue;
+
+		sctp_transport_hold(trans);
+	}
+}
+
+/* Helper function to stop all heartbeat timers of an association.
+ * 'cmds' is not used.
+ */
+static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds,
+				    struct sctp_association *asoc)
+{
+	struct sctp_transport *trans;
+
+	list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+			transports) {
+		if (!del_timer(&trans->hb_timer))
+			continue;
+
+		/* del_timer() returned non-zero: drop the transport
+		 * reference that went with the timer.
+		 */
+		sctp_transport_put(trans);
+	}
+}
+
+/* Helper function to stop any pending T3-RTX timers.  'cmds' is not used. */
+static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds,
+					struct sctp_association *asoc)
+{
+	struct sctp_transport *trans;
+
+	list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+			transports) {
+		if (!timer_pending(&trans->T3_rtx_timer))
+			continue;
+
+		/* Only drop the transport reference when del_timer()
+		 * returned non-zero.
+		 */
+		if (del_timer(&trans->T3_rtx_timer))
+			sctp_transport_put(trans);
+	}
+}
+
+
+/* Helper function to update the heartbeat timer of transport 't'.
+ * 'cmds' is not used.
+ */
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+				     struct sctp_transport *t)
+{
+	/* Update the heartbeat timer; a transport reference is taken when
+	 * mod_timer() returns 0.
+	 */
+	if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
+		sctp_transport_hold(t);
+}
+
+/* Helper function to handle the reception of an HEARTBEAT ACK.
+ *
+ * Resets the error accounting of transport 't', brings the transport back
+ * up if needed, feeds the round trip time taken from the heartbeat info in
+ * 'chunk' into the RTO calculation and re-arms the heartbeat timer.
+ * 'cmds' is not used.
+ */
+static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
+				  struct sctp_association *asoc,
+				  struct sctp_transport *t,
+				  struct sctp_chunk *chunk)
+{
+	sctp_sender_hb_info_t *info;
+
+	/* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the
+	 * HEARTBEAT should clear the error counter of the destination
+	 * transport address to which the HEARTBEAT was sent.
+	 */
+	t->error_count = 0;
+
+	/*
+	 * Although RFC4960 specifies that the overall error count must
+	 * be cleared when a HEARTBEAT ACK is received, we make an
+	 * exception while in SHUTDOWN PENDING. If the peer keeps its
+	 * window shut forever, we may never be able to transmit our
+	 * outstanding data and rely on the retransmission limit being
+	 * reached to shut down the association.
+	 */
+	if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING)
+		t->asoc->overall_error_count = 0;
+
+	/* Clear the hb_sent flag to signal that we had a good
+	 * acknowledgement.
+	 */
+	t->hb_sent = 0;
+
+	/* Mark the destination transport address as active if it is not so
+	 * marked.
+	 */
+	switch (t->state) {
+	case SCTP_INACTIVE:
+	case SCTP_UNCONFIRMED:
+		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+					     SCTP_HEARTBEAT_SUCCESS);
+		break;
+	default:
+		break;
+	}
+
+	/* The receiver of the HEARTBEAT ACK should also perform an
+	 * RTT measurement for that destination transport address
+	 * using the time value carried in the HEARTBEAT ACK chunk.
+	 * If the transport's rto_pending variable has been cleared,
+	 * it was most likely due to a retransmit.  However, we want
+	 * to re-enable it to properly update the rto.
+	 */
+	if (!t->rto_pending)
+		t->rto_pending = 1;
+
+	info = (sctp_sender_hb_info_t *) chunk->skb->data;
+	sctp_transport_update_rto(t, (jiffies - info->sent_at));
+
+	/* Update the heartbeat timer; a transport reference is taken when
+	 * mod_timer() returns 0.
+	 */
+	if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t)))
+		sctp_transport_hold(t);
+}
+
+
+/* Helper function to process the SCTP_CMD_PROCESS_SACK command.  */
+static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds,
+				 struct sctp_association *asoc,
+				 struct sctp_sackhdr *sackh)
+{
+	int err = 0;
+
+	if (sctp_outq_sack(&asoc->outqueue, sackh)) {
+		/* There are no more TSNs awaiting SACK.  */
+		err = sctp_do_sm(SCTP_EVENT_T_OTHER,
+				 SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN),
+				 asoc->state, asoc->ep, asoc, NULL,
+				 GFP_ATOMIC);
+	}
+
+	return err;
+}
+
+/* Helper function to set the T2-SHUTDOWN timeout to the chosen transport's
+ * rto and to pick a transport for a shutdown chunk that has none assigned.
+ */
+static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds,
+			      struct sctp_association *asoc,
+			      struct sctp_chunk *chunk)
+{
+	struct sctp_transport *t;
+
+	if (chunk->transport)
+		t = chunk->transport;
+	else {
+		t = sctp_assoc_choose_alter_transport(asoc,
+					      asoc->shutdown_last_sent_to);
+		chunk->transport = t;
+	}
+	asoc->shutdown_last_sent_to = t;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto;
+}
+
+/* Helper function to change the state of an association and wake waiters. */
+static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
+			       struct sctp_association *asoc,
+			       sctp_state_t state)
+{
+	struct sock *sk = asoc->base.sk;
+
+	asoc->state = state;
+
+	SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n",
+			  asoc, sctp_state_tbl[state]);
+
+	if (sctp_style(sk, TCP)) {
+		/* Change the sk->sk_state of a TCP-style socket that has
+		 * successfully completed a connect() call.
+		 */
+		if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
+			sk->sk_state = SCTP_SS_ESTABLISHED;
+
+		/* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */
+		if (sctp_state(asoc, SHUTDOWN_RECEIVED) &&
+		    sctp_sstate(sk, ESTABLISHED))
+			sk->sk_shutdown |= RCV_SHUTDOWN;
+	}
+
+	if (sctp_state(asoc, COOKIE_WAIT)) {
+		/* Reset init timeouts since they may have been
+		 * increased due to timer expirations.
+		 */
+		asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
+						asoc->rto_initial;
+		asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
+						asoc->rto_initial;
+	}
+
+	if (sctp_state(asoc, ESTABLISHED) ||
+	    sctp_state(asoc, CLOSED) ||
+	    sctp_state(asoc, SHUTDOWN_RECEIVED)) {
+		/* Wake up any processes waiting in the asoc's wait queue in
+		 * sctp_wait_for_connect() or sctp_wait_for_sndbuf().
+		 */
+		if (waitqueue_active(&asoc->wait))
+			wake_up_interruptible(&asoc->wait);
+
+		/* Wake up any processes waiting in the sk's sleep queue of
+		 * a TCP-style or UDP-style peeled-off socket in
+		 * sctp_wait_for_accept() or sctp_wait_for_packet().
+		 * For a UDP-style socket, the waiters are woken up by the
+		 * notifications.
+		 */
+		if (!sctp_style(sk, UDP))
+			sk->sk_state_change(sk);
+	}
+}
+
+/* Helper function to unhash and free an association. */
+static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds,
+				struct sctp_association *asoc)
+{
+	struct sock *sk = asoc->base.sk;
+
+	/* If it is a non-temporary association belonging to a TCP-style
+	 * listening socket that is not fully shut down, do not free it so
+	 * that accept() can pick it up later.
+	 */
+	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) &&
+	    (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK))
+		return;
+
+	sctp_unhash_established(asoc);
+	sctp_association_free(asoc);
+}
+
+/*
+ * ADDIP Section 4.1 ASCONF Chunk Procedures
+ * A4) Start a T-4 RTO timer, using the RTO value of the selected
+ * destination address (we use the active path instead of the primary path
+ * because the primary path may be inactive).
+ */
+static void sctp_cmd_setup_t4(sctp_cmd_seq_t *cmds,
+				struct sctp_association *asoc,
+				struct sctp_chunk *chunk)
+{
+	struct sctp_transport *t;
+
+	t = sctp_assoc_choose_alter_transport(asoc, chunk->transport);
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto;
+	chunk->transport = t;
+}
+
+/* Process an incoming Operation Error Chunk. */
+static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds,
+				   struct sctp_association *asoc,
+				   struct sctp_chunk *chunk)
+{
+	struct sctp_errhdr *err_hdr;
+	struct sctp_ulpevent *ev;
+
+	while (chunk->chunk_end > chunk->skb->data) {
+		err_hdr = (struct sctp_errhdr *)(chunk->skb->data);
+
+		ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
+						     GFP_ATOMIC);
+		if (!ev)
+			return;
+
+		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+
+		switch (err_hdr->cause) {
+		case SCTP_ERROR_UNKNOWN_CHUNK:
+		{
+			sctp_chunkhdr_t *unk_chunk_hdr;
+
+			unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable;
+			switch (unk_chunk_hdr->type) {
+			/* ADDIP 4.1 A9) If the peer responds to an ASCONF with
+			 * an ERROR chunk reporting that it did not recognize
+			 * the ASCONF chunk type, the sender of the ASCONF MUST
+			 * NOT send any further ASCONF chunks and MUST stop its
+			 * T-4 timer.
+			 */
+			case SCTP_CID_ASCONF:
+				if (asoc->peer.asconf_capable == 0)
+					break;
+
+				asoc->peer.asconf_capable = 0;
+				sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
+					SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+				break;
+			default:
+				break;
+			}
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+/* Process variable FWDTSN chunk information: skip each listed stream/SSN. */
+static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq,
+				    struct sctp_chunk *chunk)
+{
+	struct sctp_fwdtsn_skip *skip;
+	/* Walk through all the skipped SSNs. */
+	sctp_walk_fwdtsn(skip, chunk) {
+		sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
+	}
+}
+
+/* Helper function to remove the association's non-primary peer
+ * transports.
+ */
+static void sctp_cmd_del_non_primary(struct sctp_association *asoc)
+{
+	struct sctp_transport *t;
+	struct list_head *pos;
+	struct list_head *temp;
+
+	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+		t = list_entry(pos, struct sctp_transport, transports);
+		if (!sctp_cmp_addr_exact(&t->ipaddr,
+					 &asoc->peer.primary_addr)) {
+			sctp_assoc_del_peer(asoc, &t->ipaddr);
+		}
+	}
+}
+
+/* Helper function to set sk_err on any socket that is not UDP-style. */
+static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error)
+{
+	struct sock *sk = asoc->base.sk;
+
+	if (!sctp_style(sk, UDP))
+		sk->sk_err = error;
+}
+
+/* Helper function to generate and queue an association change event. */
+static void sctp_cmd_assoc_change(sctp_cmd_seq_t *commands,
+				 struct sctp_association *asoc,
+				 u8 state)
+{
+	struct sctp_ulpevent *ev;
+
+	ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0,
+					    asoc->c.sinit_num_ostreams,
+					    asoc->c.sinit_max_instreams,
+					    NULL, GFP_ATOMIC);
+	if (ev)
+		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+}
+
+/* Helper function to generate and queue an adaptation indication event. */
+static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands,
+				    struct sctp_association *asoc)
+{
+	struct sctp_ulpevent *ev;
+
+	ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
+
+	if (ev)
+		sctp_ulpq_tail_event(&asoc->ulpq, ev);
+}
+
+/* Bump init_err_counter and, if due, double the T1 timeout up to the cap. */
+static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
+				    sctp_event_timeout_t timer,
+				    char *name)
+{
+	struct sctp_transport *t;
+
+	t = asoc->init_last_sent_to;
+	asoc->init_err_counter++;
+
+	if (t->init_sent_count > (asoc->init_cycle + 1)) {
+		asoc->timeouts[timer] *= 2;
+		if (asoc->timeouts[timer] > asoc->max_init_timeo) {
+			asoc->timeouts[timer] = asoc->max_init_timeo;
+		}
+		asoc->init_cycle++;
+		SCTP_DEBUG_PRINTK(
+			"T1 %s Timeout adjustment"
+			" init_err_counter: %d"
+			" cycle: %d"
+			" timeout: %ld\n",
+			name,
+			asoc->init_err_counter,
+			asoc->init_cycle,
+			asoc->timeouts[timer]);
+	}
+
+}
+
+/* Send the whole message, chunk by chunk, to the outqueue.
+ * This way the whole message is queued up and bundling is
+ * encouraged for small fragments.  Stops at the first queueing error.
+ */
+static int sctp_cmd_send_msg(struct sctp_association *asoc,
+				struct sctp_datamsg *msg)
+{
+	struct sctp_chunk *chunk;
+	int error = 0;
+
+	list_for_each_entry(chunk, &msg->chunks, frag_list) {
+		error = sctp_outq_tail(&asoc->outqueue, chunk);
+		if (error)
+			break;
+	}
+
+	return error;
+}
+
+
+/* Send the next ASCONF packet currently stored in the association.
+ * This happens after the ASCONF_ACK was successfully processed.
+ */
+static void sctp_cmd_send_asconf(struct sctp_association *asoc)
+{
+	/* Send the next asconf chunk from the addip chunk
+	 * queue.
+	 */
+	if (!list_empty(&asoc->addip_chunk_list)) {
+		struct list_head *entry = asoc->addip_chunk_list.next;
+		struct sctp_chunk *asconf = list_entry(entry,
+						struct sctp_chunk, list);
+		list_del_init(entry);
+
+		/* Hold the chunk until an ASCONF_ACK is received. */
+		sctp_chunk_hold(asconf);
+		if (sctp_primitive_ASCONF(asoc, asconf))
+			sctp_chunk_free(asconf);
+		else
+			asoc->addip_last_asconf = asconf;
+	}
+}
+
+
+/* These three macros pull the debugging code out of the main flow of
+ * sctp_do_sm(); they expand using its locals (ep, asoc, state_fn, ...).
+ * NOTE(review): DEBUG_POST_SFX is not #undef'd after sctp_do_sm().
+ */
+#define DEBUG_PRE \
+	SCTP_DEBUG_PRINTK("sctp_do_sm prefn: " \
+			  "ep %p, %s, %s, asoc %p[%s], %s\n", \
+			  ep, sctp_evttype_tbl[event_type], \
+			  (*debug_fn)(subtype), asoc, \
+			  sctp_state_tbl[state], state_fn->name)
+
+#define DEBUG_POST \
+	SCTP_DEBUG_PRINTK("sctp_do_sm postfn: " \
+			  "asoc %p, status: %s\n", \
+			  asoc, sctp_status_tbl[status])
+
+#define DEBUG_POST_SFX \
+	SCTP_DEBUG_PRINTK("sctp_do_sm post sfx: error %d, asoc %p[%s]\n", \
+			  error, asoc, \
+			  sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \
+			  sctp_assoc2id(asoc)))?asoc->state:SCTP_STATE_CLOSED])
+
+/*
+ * This is the master state machine processing function: it looks up the
+ * state function for (event_type, state, subtype), runs it, and hands the
+ * resulting command sequence to sctp_side_effects(), whose error it returns.
+ * If you want to understand all of lksctp, this is a good place to start.
+ */
+int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
+	       sctp_state_t state,
+	       struct sctp_endpoint *ep,
+	       struct sctp_association *asoc,
+	       void *event_arg,
+	       gfp_t gfp)
+{
+	sctp_cmd_seq_t commands;
+	const sctp_sm_table_entry_t *state_fn;
+	sctp_disposition_t status;
+	int error = 0;
+	typedef const char *(printfn_t)(sctp_subtype_t);
+
+	static printfn_t *table[] = {
+		NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname,
+	};
+	printfn_t *debug_fn  __attribute__ ((unused)) = table[event_type];
+
+	/* Look up the state function, run it, and then process the
+	 * side effects.  These three steps are the heart of lksctp.
+	 */
+	state_fn = sctp_sm_lookup_event(event_type, state, subtype);
+
+	sctp_init_cmd_seq(&commands);
+
+	DEBUG_PRE;
+	status = (*state_fn->fn)(ep, asoc, subtype, event_arg, &commands);
+	DEBUG_POST;
+
+	error = sctp_side_effects(event_type, subtype, state,
+				  ep, asoc, event_arg, status,
+				  &commands, gfp);
+	DEBUG_POST_SFX;
+
+	return error;
+}
+
+#undef DEBUG_PRE
+#undef DEBUG_POST
+
+/*****************************************************************
+ * This is the master state function side effect processing function.
+ *****************************************************************/
+static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
+			     sctp_state_t state,
+			     struct sctp_endpoint *ep,
+			     struct sctp_association *asoc,
+			     void *event_arg,
+			     sctp_disposition_t status,
+			     sctp_cmd_seq_t *commands,
+			     gfp_t gfp)
+{
+	int error;
+
+	/* FIXME - Most of the dispositions left today would be categorized
+	 * as "exceptional" dispositions.  For those dispositions, it
+	 * may not be proper to run through any of the commands at all.
+	 * For example, the command interpreter might be run only with
+	 * disposition SCTP_DISPOSITION_CONSUME.
+	 */
+	if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state,
+					       ep, asoc,
+					       event_arg, status,
+					       commands, gfp)))
+		goto bail;
+
+	switch (status) {
+	case SCTP_DISPOSITION_DISCARD:
+		SCTP_DEBUG_PRINTK("Ignored sctp protocol event - state %d, "
+				  "event_type %d, event_id %d\n",
+				  state, event_type, subtype.chunk);
+		break;
+
+	case SCTP_DISPOSITION_NOMEM:
+		/* We ran out of memory, so we need to discard this
+		 * packet.
+		 */
+		/* BUG--we should now recover some memory, probably by
+		 * reneging...
+		 */
+		error = -ENOMEM;
+		break;
+
+	case SCTP_DISPOSITION_DELETE_TCB:
+		/* This should now be a command. */
+		break;
+
+	case SCTP_DISPOSITION_CONSUME:
+	case SCTP_DISPOSITION_ABORT:
+		/*
+		 * We should no longer have much work to do here as the
+		 * real work has been done as explicit commands above.
+		 */
+		break;
+
+	case SCTP_DISPOSITION_VIOLATION:
+		if (net_ratelimit())
+			pr_err("protocol violation state %d chunkid %d\n",
+			       state, subtype.chunk);
+		break;
+
+	case SCTP_DISPOSITION_NOT_IMPL:
+		pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n",
+			state, event_type, subtype.chunk);
+		break;
+
+	case SCTP_DISPOSITION_BUG:
+		pr_err("bug in state %d, event_type %d, event_id %d\n",
+		       state, event_type, subtype.chunk);
+		BUG();
+		break;
+
+	default:
+		pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n",
+		       status, state, event_type, subtype.chunk);
+		BUG();
+		break;
+	}
+
+bail:
+	return error;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* This is the side-effect interpreter.  */
+static int sctp_cmd_interpreter(sctp_event_t event_type,
+				sctp_subtype_t subtype,
+				sctp_state_t state,
+				struct sctp_endpoint *ep,
+				struct sctp_association *asoc,
+				void *event_arg,
+				sctp_disposition_t status,
+				sctp_cmd_seq_t *commands,
+				gfp_t gfp)
+{
+	int error = 0;
+	int force;
+	sctp_cmd_t *cmd;
+	struct sctp_chunk *new_obj;
+	struct sctp_chunk *chunk = NULL;
+	struct sctp_packet *packet;
+	struct timer_list *timer;
+	unsigned long timeout;
+	struct sctp_transport *t;
+	struct sctp_sackhdr sackh;
+	int local_cork = 0;
+
+	if (SCTP_EVENT_T_TIMEOUT != event_type)
+		chunk = (struct sctp_chunk *) event_arg;
+
+	/* Note:  This whole file is a huge candidate for rework.
+	 * For example, each command could either have its own handler, so
+	 * the loop would look like:
+	 *     while (cmds)
+	 *         cmd->handle(x, y, z)
+	 * --jgrimm
+	 */
+	while (NULL != (cmd = sctp_next_cmd(commands))) {
+		switch (cmd->verb) {
+		case SCTP_CMD_NOP:
+			/* Do nothing. */
+			break;
+
+		case SCTP_CMD_NEW_ASOC:
+			/* Register a new association.  */
+			if (local_cork) {
+				sctp_outq_uncork(&asoc->outqueue);
+				local_cork = 0;
+			}
+			asoc = cmd->obj.ptr;
+			/* Register with the endpoint.  */
+			sctp_endpoint_add_asoc(ep, asoc);
+			sctp_hash_established(asoc);
+			break;
+
+		case SCTP_CMD_UPDATE_ASSOC:
+		       sctp_assoc_update(asoc, cmd->obj.ptr);
+		       break;
+
+		case SCTP_CMD_PURGE_OUTQUEUE:
+		       sctp_outq_teardown(&asoc->outqueue);
+		       break;
+
+		case SCTP_CMD_DELETE_TCB:
+			if (local_cork) {
+				sctp_outq_uncork(&asoc->outqueue);
+				local_cork = 0;
+			}
+			/* Delete the current association.  */
+			sctp_cmd_delete_tcb(commands, asoc);
+			asoc = NULL;
+			break;
+
+		case SCTP_CMD_NEW_STATE:
+			/* Enter a new state.  */
+			sctp_cmd_new_state(commands, asoc, cmd->obj.state);
+			break;
+
+		case SCTP_CMD_REPORT_TSN:
+			/* Record the arrival of a TSN.  */
+			error = sctp_tsnmap_mark(&asoc->peer.tsn_map,
+						 cmd->obj.u32);
+			break;
+
+		case SCTP_CMD_REPORT_FWDTSN:
+			/* Move the Cumulative TSN Ack ahead. */
+			sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32);
+
+			/* purge the fragmentation queue */
+			sctp_ulpq_reasm_flushtsn(&asoc->ulpq, cmd->obj.u32);
+
+			/* Abort any in progress partial delivery. */
+			sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
+			break;
+
+		case SCTP_CMD_PROCESS_FWDTSN:
+			sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.ptr);
+			break;
+
+		case SCTP_CMD_GEN_SACK:
+			/* Generate a Selective ACK.
+			 * The argument tells us whether to just count
+			 * the packet and MAYBE generate a SACK, or
+			 * force a SACK out.
+			 */
+			force = cmd->obj.i32;
+			error = sctp_gen_sack(asoc, force, commands);
+			break;
+
+		case SCTP_CMD_PROCESS_SACK:
+			/* Process an inbound SACK.  */
+			error = sctp_cmd_process_sack(commands, asoc,
+						      cmd->obj.ptr);
+			break;
+
+		case SCTP_CMD_GEN_INIT_ACK:
+			/* Generate an INIT ACK chunk.  */
+			new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC,
+						     0);
+			if (!new_obj)
+				goto nomem;
+
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(new_obj));
+			break;
+
+		case SCTP_CMD_PEER_INIT:
+			/* Process a unified INIT from the peer.
+			 * Note: Only used during INIT-ACK processing.  If
+			 * there is an error just return to the outer
+			 * layer which will bail.
+			 */
+			error = sctp_cmd_process_init(commands, asoc, chunk,
+						      cmd->obj.ptr, gfp);
+			break;
+
+		case SCTP_CMD_GEN_COOKIE_ECHO:
+			/* Generate a COOKIE ECHO chunk.  */
+			new_obj = sctp_make_cookie_echo(asoc, chunk);
+			if (!new_obj) {
+				if (cmd->obj.ptr)
+					sctp_chunk_free(cmd->obj.ptr);
+				goto nomem;
+			}
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(new_obj));
+
+			/* If there is an ERROR chunk to be sent along with
+			 * the COOKIE_ECHO, send it, too.
+			 */
+			if (cmd->obj.ptr)
+				sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+						SCTP_CHUNK(cmd->obj.ptr));
+
+			if (new_obj->transport) {
+				new_obj->transport->init_sent_count++;
+				asoc->init_last_sent_to = new_obj->transport;
+			}
+
+			/* FIXME - Eventually come up with a cleaner way to
+			 * enabling COOKIE-ECHO + DATA bundling during
+			 * multihoming stale cookie scenarios, the following
+			 * command plays with asoc->peer.retran_path to
+			 * avoid the problem of sending the COOKIE-ECHO and
+			 * DATA in different paths, which could result
+			 * in the association being ABORTed if the DATA chunk
+			 * is processed first by the server.  Checking the
+			 * init error counter simply causes this command
+			 * to be executed only during failed attempts of
+			 * association establishment.
+			 */
+			if ((asoc->peer.retran_path !=
+			     asoc->peer.primary_path) &&
+			    (asoc->init_err_counter > 0)) {
+				sctp_add_cmd_sf(commands,
+						SCTP_CMD_FORCE_PRIM_RETRAN,
+						SCTP_NULL());
+			}
+
+			break;
+
+		case SCTP_CMD_GEN_SHUTDOWN:
+			/* Generate SHUTDOWN when in SHUTDOWN_SENT state.
+			 * Reset error counts.
+			 */
+			asoc->overall_error_count = 0;
+
+			/* Generate a SHUTDOWN chunk.  */
+			new_obj = sctp_make_shutdown(asoc, chunk);
+			if (!new_obj)
+				goto nomem;
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(new_obj));
+			break;
+
+		case SCTP_CMD_CHUNK_ULP:
+			/* Send a chunk to the sockets layer.  */
+			SCTP_DEBUG_PRINTK("sm_sideff: %s %p, %s %p.\n",
+					  "chunk_up:", cmd->obj.ptr,
+					  "ulpq:", &asoc->ulpq);
+			sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.ptr,
+					    GFP_ATOMIC);
+			break;
+
+		case SCTP_CMD_EVENT_ULP:
+			/* Send a notification to the sockets layer.  */
+			SCTP_DEBUG_PRINTK("sm_sideff: %s %p, %s %p.\n",
+					  "event_up:",cmd->obj.ptr,
+					  "ulpq:",&asoc->ulpq);
+			sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ptr);
+			break;
+
+		case SCTP_CMD_REPLY:
+			/* If a caller has not already corked, do cork. */
+			if (!asoc->outqueue.cork) {
+				sctp_outq_cork(&asoc->outqueue);
+				local_cork = 1;
+			}
+			/* Send a chunk to our peer.  */
+			error = sctp_outq_tail(&asoc->outqueue, cmd->obj.ptr);
+			break;
+
+		case SCTP_CMD_SEND_PKT:
+			/* Send a full packet to our peer.  */
+			packet = cmd->obj.ptr;
+			sctp_packet_transmit(packet);
+			sctp_ootb_pkt_free(packet);
+			break;
+
+		case SCTP_CMD_T1_RETRAN:
+			/* Mark a transport for retransmission.  */
+			sctp_retransmit(&asoc->outqueue, cmd->obj.transport,
+					SCTP_RTXR_T1_RTX);
+			break;
+
+		case SCTP_CMD_RETRAN:
+			/* Mark a transport for retransmission.  */
+			sctp_retransmit(&asoc->outqueue, cmd->obj.transport,
+					SCTP_RTXR_T3_RTX);
+			break;
+
+		case SCTP_CMD_ECN_CE:
+			/* Do delayed CE processing.   */
+			sctp_do_ecn_ce_work(asoc, cmd->obj.u32);
+			break;
+
+		case SCTP_CMD_ECN_ECNE:
+			/* Do delayed ECNE processing. */
+			new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32,
+							chunk);
+			if (new_obj)
+				sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+						SCTP_CHUNK(new_obj));
+			break;
+
+		case SCTP_CMD_ECN_CWR:
+			/* Do delayed CWR processing.  */
+			sctp_do_ecn_cwr_work(asoc, cmd->obj.u32);
+			break;
+
+		case SCTP_CMD_SETUP_T2:
+			sctp_cmd_setup_t2(commands, asoc, cmd->obj.ptr);
+			break;
+
+		case SCTP_CMD_TIMER_START_ONCE:
+			timer = &asoc->timers[cmd->obj.to];
+
+			if (timer_pending(timer))
+				break;
+			/* fall through */
+
+		case SCTP_CMD_TIMER_START:
+			timer = &asoc->timers[cmd->obj.to];
+			timeout = asoc->timeouts[cmd->obj.to];
+			BUG_ON(!timeout);
+
+			timer->expires = jiffies + timeout;
+			sctp_association_hold(asoc);
+			add_timer(timer);
+			break;
+
+		case SCTP_CMD_TIMER_RESTART:
+			timer = &asoc->timers[cmd->obj.to];
+			timeout = asoc->timeouts[cmd->obj.to];
+			if (!mod_timer(timer, jiffies + timeout))
+				sctp_association_hold(asoc);
+			break;
+
+		case SCTP_CMD_TIMER_STOP:
+			timer = &asoc->timers[cmd->obj.to];
+			if (timer_pending(timer) && del_timer(timer))
+				sctp_association_put(asoc);
+			break;
+
+		case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
+			chunk = cmd->obj.ptr;
+			t = sctp_assoc_choose_alter_transport(asoc,
+						asoc->init_last_sent_to);
+			asoc->init_last_sent_to = t;
+			chunk->transport = t;
+			t->init_sent_count++;
+			/* Set the new transport as primary */
+			sctp_assoc_set_primary(asoc, t);
+			break;
+
+		case SCTP_CMD_INIT_RESTART:
+			/* Do the needed accounting and updates
+			 * associated with restarting an initialization
+			 * timer. Only multiply the timeout by two if
+			 * all transports have been tried at the current
+			 * timeout.
+			 */
+			sctp_cmd_t1_timer_update(asoc,
+						SCTP_EVENT_TIMEOUT_T1_INIT,
+						"INIT");
+
+			sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+					SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+			break;
+
+		case SCTP_CMD_COOKIEECHO_RESTART:
+			/* Do the needed accounting and updates
+			 * associated with restarting an initialization
+			 * timer. Only multiply the timeout by two if
+			 * all transports have been tried at the current
+			 * timeout.
+			 */
+			sctp_cmd_t1_timer_update(asoc,
+						SCTP_EVENT_TIMEOUT_T1_COOKIE,
+						"COOKIE");
+
+			/* If we've sent any data bundled with
+			 * COOKIE-ECHO we need to resend.
+			 */
+			list_for_each_entry(t, &asoc->peer.transport_addr_list,
+					transports) {
+				sctp_retransmit_mark(&asoc->outqueue, t,
+					    SCTP_RTXR_T1_RTX);
+			}
+
+			sctp_add_cmd_sf(commands,
+					SCTP_CMD_TIMER_RESTART,
+					SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+			break;
+
+		case SCTP_CMD_INIT_FAILED:
+			sctp_cmd_init_failed(commands, asoc, cmd->obj.err);
+			break;
+
+		case SCTP_CMD_ASSOC_FAILED:
+			sctp_cmd_assoc_failed(commands, asoc, event_type,
+					      subtype, chunk, cmd->obj.err);
+			break;
+
+		case SCTP_CMD_INIT_COUNTER_INC:
+			asoc->init_err_counter++;
+			break;
+
+		case SCTP_CMD_INIT_COUNTER_RESET:
+			asoc->init_err_counter = 0;
+			asoc->init_cycle = 0;
+			list_for_each_entry(t, &asoc->peer.transport_addr_list,
+					    transports) {
+				t->init_sent_count = 0;
+			}
+			break;
+
+		case SCTP_CMD_REPORT_DUP:
+			sctp_tsnmap_mark_dup(&asoc->peer.tsn_map,
+					     cmd->obj.u32);
+			break;
+
+		case SCTP_CMD_REPORT_BAD_TAG:
+			SCTP_DEBUG_PRINTK("vtag mismatch!\n");
+			break;
+
+		case SCTP_CMD_STRIKE:
+			/* Mark one strike against a transport.  */
+			sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
+						    0);
+			break;
+
+		case SCTP_CMD_TRANSPORT_IDLE:
+			t = cmd->obj.transport;
+			sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
+			break;
+
+		case SCTP_CMD_TRANSPORT_HB_SENT:
+			t = cmd->obj.transport;
+			sctp_do_8_2_transport_strike(asoc, t, 1);
+			t->hb_sent = 1;
+			break;
+
+		case SCTP_CMD_TRANSPORT_ON:
+			t = cmd->obj.transport;
+			sctp_cmd_transport_on(commands, asoc, t, chunk);
+			break;
+
+		case SCTP_CMD_HB_TIMERS_START:
+			sctp_cmd_hb_timers_start(commands, asoc);
+			break;
+
+		case SCTP_CMD_HB_TIMER_UPDATE:
+			t = cmd->obj.transport;
+			sctp_cmd_hb_timer_update(commands, t);
+			break;
+
+		case SCTP_CMD_HB_TIMERS_STOP:
+			sctp_cmd_hb_timers_stop(commands, asoc);
+			break;
+
+		case SCTP_CMD_REPORT_ERROR:
+			error = cmd->obj.error;
+			break;
+
+		case SCTP_CMD_PROCESS_CTSN:
+			/* Dummy up a SACK for processing. */
+			sackh.cum_tsn_ack = cmd->obj.be32;
+			sackh.a_rwnd = asoc->peer.rwnd +
+					asoc->outqueue.outstanding_bytes;
+			sackh.num_gap_ack_blocks = 0;
+			sackh.num_dup_tsns = 0;
+			sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK,
+					SCTP_SACKH(&sackh));
+			break;
+
+		case SCTP_CMD_DISCARD_PACKET:
+			/* We need to discard the whole packet.  Uncork the
+			 * queue since there might be responses pending.
+			 * NOTE(review): chunk may be NULL for timeout events.
+			 */
+			chunk->pdiscard = 1;
+			if (asoc) {
+				sctp_outq_uncork(&asoc->outqueue);
+				local_cork = 0;
+			}
+			break;
+
+		case SCTP_CMD_RTO_PENDING:
+			t = cmd->obj.transport;
+			t->rto_pending = 1;
+			break;
+
+		case SCTP_CMD_PART_DELIVER:
+			sctp_ulpq_partial_delivery(&asoc->ulpq, cmd->obj.ptr,
+						   GFP_ATOMIC);
+			break;
+
+		case SCTP_CMD_RENEGE:
+			sctp_ulpq_renege(&asoc->ulpq, cmd->obj.ptr,
+					 GFP_ATOMIC);
+			break;
+
+		case SCTP_CMD_SETUP_T4:
+			sctp_cmd_setup_t4(commands, asoc, cmd->obj.ptr);
+			break;
+
+		case SCTP_CMD_PROCESS_OPERR:
+			sctp_cmd_process_operr(commands, asoc, chunk);
+			break;
+		case SCTP_CMD_CLEAR_INIT_TAG:
+			asoc->peer.i.init_tag = 0;
+			break;
+		case SCTP_CMD_DEL_NON_PRIMARY:
+			sctp_cmd_del_non_primary(asoc);
+			break;
+		case SCTP_CMD_T3_RTX_TIMERS_STOP:
+			sctp_cmd_t3_rtx_timers_stop(commands, asoc);
+			break;
+		case SCTP_CMD_FORCE_PRIM_RETRAN:
+			t = asoc->peer.retran_path;
+			asoc->peer.retran_path = asoc->peer.primary_path;
+			error = sctp_outq_uncork(&asoc->outqueue);
+			local_cork = 0;
+			asoc->peer.retran_path = t;
+			break;
+		case SCTP_CMD_SET_SK_ERR:
+			sctp_cmd_set_sk_err(asoc, cmd->obj.error);
+			break;
+		case SCTP_CMD_ASSOC_CHANGE:
+			sctp_cmd_assoc_change(commands, asoc,
+					      cmd->obj.u8);
+			break;
+		case SCTP_CMD_ADAPTATION_IND:
+			sctp_cmd_adaptation_ind(commands, asoc);
+			break;
+
+		case SCTP_CMD_ASSOC_SHKEY:
+			error = sctp_auth_asoc_init_active_key(asoc,
+						GFP_ATOMIC);
+			break;
+		case SCTP_CMD_UPDATE_INITTAG:
+			asoc->peer.i.init_tag = cmd->obj.u32;
+			break;
+		case SCTP_CMD_SEND_MSG:
+			if (!asoc->outqueue.cork) {
+				sctp_outq_cork(&asoc->outqueue);
+				local_cork = 1;
+			}
+			error = sctp_cmd_send_msg(asoc, cmd->obj.msg);
+			break;
+		case SCTP_CMD_SEND_NEXT_ASCONF:
+			sctp_cmd_send_asconf(asoc);
+			break;
+		case SCTP_CMD_PURGE_ASCONF_QUEUE:
+			sctp_asconf_queue_teardown(asoc);
+			break;
+		default:
+			pr_warn("Impossible command: %u, %p\n",
+				cmd->verb, cmd->obj.ptr);
+			break;
+		}
+
+		if (error)
+			break;
+	}
+
+out:
+	/* If this is in response to a received chunk, wait until we are done
+	 * with the packet to uncork, so one request yields one packet.
+	 * NOTE(review): uncorking overwrites any earlier error (-ENOMEM too).
+	 */
+	if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
+		if (chunk->end_of_packet || chunk->singleton)
+			error = sctp_outq_uncork(&asoc->outqueue);
+	} else if (local_cork)
+		error = sctp_outq_uncork(&asoc->outqueue);
+	return error;
+nomem:
+	error = -ENOMEM;
+	goto out;
+}
+
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
new file mode 100644
index 00000000..24611714
--- /dev/null
+++ b/net/sctp/sm_statefuns.c
@@ -0,0 +1,6161 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2002 Intel Corp.
+ * Copyright (c) 2002      Nokia Corp.
+ *
+ * This is part of the SCTP Linux Kernel Implementation.
+ *
+ * These are the state functions for the state machine.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Mathew Kotowsky       <kotowsky@sctp.org>
+ *    Sridhar Samudrala     <samudrala@us.ibm.com>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Hui Huang 	    <hui.huang@nokia.com>
+ *    Dajiang Zhang 	    <dajiang.zhang@nokia.com>
+ *    Daisy Chang	    <daisyc@us.ibm.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *    Ryan Layer	    <rmlayer@us.ibm.com>
+ *    Kevin Gao		    <kevin.gao@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/net.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/inet_ecn.h>
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/structs.h>
+
+static struct sctp_packet *sctp_abort_pkt_new(const struct sctp_endpoint *ep,
+				  const struct sctp_association *asoc,
+				  struct sctp_chunk *chunk,
+				  const void *payload,
+				  size_t paylen);
+static int sctp_eat_data(const struct sctp_association *asoc,
+			 struct sctp_chunk *chunk,
+			 sctp_cmd_seq_t *commands);
+static struct sctp_packet *sctp_ootb_pkt_new(const struct sctp_association *asoc,
+					     const struct sctp_chunk *chunk);
+static void sctp_send_stale_cookie_err(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const struct sctp_chunk *chunk,
+				       sctp_cmd_seq_t *commands,
+				       struct sctp_chunk *err_chunk);
+static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
+						 const struct sctp_association *asoc,
+						 const sctp_subtype_t type,
+						 void *arg,
+						 sctp_cmd_seq_t *commands);
+static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
+					     const struct sctp_association *asoc,
+					     const sctp_subtype_t type,
+					     void *arg,
+					     sctp_cmd_seq_t *commands);
+static sctp_disposition_t sctp_sf_tabort_8_4_8(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands);
+static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
+
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+					   __be16 error, int sk_err,
+					   const struct sctp_association *asoc,
+					   struct sctp_transport *transport);
+
+static sctp_disposition_t sctp_sf_abort_violation(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     void *arg,
+				     sctp_cmd_seq_t *commands,
+				     const __u8 *payload,
+				     const size_t paylen);
+
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands);
+
+static sctp_disposition_t sctp_sf_violation_paramlen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg, void *ext,
+				     sctp_cmd_seq_t *commands);
+
+static sctp_disposition_t sctp_sf_violation_ctsn(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands);
+
+static sctp_disposition_t sctp_sf_violation_chunk(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands);
+
+static sctp_ierror_t sctp_sf_authenticate(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    struct sctp_chunk *chunk);
+
+static sctp_disposition_t __sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands);
+
+/* Small helper function that checks that the length field carried in
+ * the chunk header is at least 'required_length', i.e. the size of the
+ * specific chunk type being tested.  Longer chunks are accepted.
+ * Return Values:  1 = Valid length   (chunk length >= required_length)
+ *		   0 = Invalid length (chunk length <  required_length)
+ *
+ */
+static inline int
+sctp_chunk_length_valid(struct sctp_chunk *chunk,
+			   __u16 required_length)
+{
+	__u16 chunk_length = ntohs(chunk->chunk_hdr->length);
+
+	if (unlikely(chunk_length < required_length))
+		return 0;
+
+	return 1;
+}
+
+/**********************************************************
+ * These are the state functions for handling chunk events.
+ **********************************************************/
+
+/*
+ * Process the final SHUTDOWN COMPLETE.
+ *
+ * Section: 4 (C) (diagram), 9.2
+ * Upon reception of the SHUTDOWN COMPLETE chunk the endpoint will verify
+ * that it is in SHUTDOWN-ACK-SENT state, if it is not the chunk should be
+ * discarded. If the endpoint is in the SHUTDOWN-ACK-SENT state the endpoint
+ * should stop the T2-shutdown timer and remove all knowledge of the
+ * association (and thus the association enters the CLOSED state).
+ *
+ * Verification Tag: 8.5.1(C), sctpimpguide 2.41.
+ * C) Rules for packet carrying SHUTDOWN COMPLETE:
+ * ...
+ * - The receiver of a SHUTDOWN COMPLETE shall accept the packet
+ *   if the Verification Tag field of the packet matches its own tag and
+ *   the T bit is not set
+ *   OR
+ *   it is set to its peer's tag and the T bit is set in the Chunk
+ *   Flags.
+ *   Otherwise, the receiver MUST silently discard the packet
+ *   and take no further action.  An endpoint MUST ignore the
+ *   SHUTDOWN COMPLETE if it is not in the SHUTDOWN-ACK-SENT state.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk) -- 'arg' is the received struct sctp_chunk.
+ *
+ * Outputs
+ * Queued commands: SHUTDOWN_COMP ULP event (skipped if its allocation
+ * fails), stop of the T2 and T5 timers, CLOSED state, DELETE_TCB.
+ * Returns SCTP_DISPOSITION_DELETE_TCB unless a validity check fails.
+ */
+sctp_disposition_t sctp_sf_do_4_C(const struct sctp_endpoint *ep,
+				  const struct sctp_association *asoc,
+				  const sctp_subtype_t type,
+				  void *arg,
+				  sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_ulpevent *ev;
+
+	if (!sctp_vtag_verify_either(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* RFC 2960 6.10 Bundling
+	 *
+	 * An endpoint MUST NOT bundle INIT, INIT ACK or
+	 * SHUTDOWN COMPLETE with any other chunks.
+	 */
+	if (!chunk->singleton)
+		return sctp_sf_violation_chunk(ep, asoc, type, arg, commands);
+
+	/* A SHUTDOWN_COMPLETE must be at least a bare chunk header long. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* RFC 2960 10.2 SCTP-to-ULP
+	 *
+	 * H) SHUTDOWN COMPLETE notification
+	 *
+	 * When SCTP completes the shutdown procedures (section 9.2) this
+	 * notification is passed to the upper layer.
+	 */
+	ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP,
+					     0, 0, 0, NULL, GFP_ATOMIC);
+	if (ev)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(ev));
+
+	/* Upon reception of the SHUTDOWN COMPLETE chunk the endpoint
+	 * will verify that it is in SHUTDOWN-ACK-SENT state, if it is
+	 * not the chunk should be discarded. If the endpoint is in
+	 * the SHUTDOWN-ACK-SENT state the endpoint should stop the
+	 * T2-shutdown timer and remove all knowledge of the
+	 * association (and thus the association enters the CLOSED
+	 * state).
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+
+	SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS);
+	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+	return SCTP_DISPOSITION_DELETE_TCB;
+}
+
+/*
+ * Respond to a normal INIT chunk.
+ * We are the side that is being asked for an association.
+ *
+ * Section: 5.1 Normal Establishment of an Association, B
+ * B) "Z" shall respond immediately with an INIT ACK chunk.  The
+ *    destination IP address of the INIT ACK MUST be set to the source
+ *    IP address of the INIT to which this INIT ACK is responding.  In
+ *    the response, besides filling in other parameters, "Z" must set the
+ *    Verification Tag field to Tag_A, and also provide its own
+ *    Verification Tag (Tag_Z) in the Initiate Tag field.
+ *
+ * Verification Tag: Must be 0.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk) -- 'arg' is the received INIT chunk.
+ *
+ * Outputs
+ * On success queues NEW_ASOC (from sctp_make_temp_asoc()), REPLY (the
+ * INIT ACK) and DELETE_TCB and returns SCTP_DISPOSITION_DELETE_TCB.
+ * Allocation failures return _NOMEM; bad INITs are discarded or aborted.
+ */
+sctp_disposition_t sctp_sf_do_5_1B_init(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_chunk *repl;
+	struct sctp_association *new_asoc;
+	struct sctp_chunk *err_chunk;
+	struct sctp_packet *packet;
+	sctp_unrecognized_param_t *unk_param;
+	int len;
+
+	/* 6.10 Bundling
+	 * An endpoint MUST NOT bundle INIT, INIT ACK or
+	 * SHUTDOWN COMPLETE with any other chunks.
+	 *
+	 * IG Section 2.11.2
+	 * Furthermore, we require that the receiver of an INIT chunk MUST
+	 * enforce these rules by silently discarding an arriving packet
+	 * with an INIT chunk that is bundled with other chunks.
+	 */
+	if (!chunk->singleton)
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* If the packet is an OOTB packet which is temporarily on the
+	 * control endpoint, respond with an ABORT.
+	 */
+	if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) {
+		SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+	}
+
+	/* 3.1 A packet containing an INIT chunk MUST have a zero Verification
+	 * Tag.
+	 */
+	if (chunk->sctp_hdr->vtag != 0)
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+
+	/* Make sure that the INIT chunk has a valid length.
+	 * Normally, this would cause an ABORT with a Protocol Violation
+	 * error, but since we don't have an association, we'll
+	 * just discard the packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* If the INIT is coming toward a closing socket, we'll send back
+	 * an ABORT.  Essentially, this catches the race of INIT being
+	 * backlogged to the socket at the same time as the user issues
+	 * close().  Since the socket and all its associations are going
+	 * away, we can treat this as OOTB.
+	 */
+	if (sctp_sstate(ep->base.sk, CLOSING))
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+
+	/* Verify the INIT chunk before processing it. */
+	err_chunk = NULL;
+	if (!sctp_verify_init(asoc, chunk->chunk_hdr->type,
+			      (sctp_init_chunk_t *)chunk->chunk_hdr, chunk,
+			      &err_chunk)) {
+		/* This chunk contains a fatal error. It is to be discarded.
+		 * Send an ABORT, with causes if there are any.
+		 */
+		if (err_chunk) {
+			packet = sctp_abort_pkt_new(ep, asoc, arg,
+					(__u8 *)(err_chunk->chunk_hdr) +
+					sizeof(sctp_chunkhdr_t),
+					ntohs(err_chunk->chunk_hdr->length) -
+					sizeof(sctp_chunkhdr_t));
+
+			sctp_chunk_free(err_chunk);
+
+			if (packet) {
+				sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+						SCTP_PACKET(packet));
+				SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+				return SCTP_DISPOSITION_CONSUME;
+			} else {
+				return SCTP_DISPOSITION_NOMEM;
+			}
+		} else {
+			return sctp_sf_tabort_8_4_8(ep, asoc, type, arg,
+						    commands);
+		}
+	}
+
+	/* Grab the INIT header.  */
+	chunk->subh.init_hdr = (sctp_inithdr_t *)chunk->skb->data;
+
+	/* Tag the variable length parameters.  */
+	chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t));
+
+	new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC);
+	if (!new_asoc)
+		goto nomem;
+
+	if (sctp_assoc_set_bind_addr_from_ep(new_asoc,
+					     sctp_scope(sctp_source(chunk)),
+					     GFP_ATOMIC) < 0)
+		goto nomem_init;
+
+	/* The call, sctp_process_init(), can fail on memory allocation.  */
+	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk),
+			       (sctp_init_chunk_t *)chunk->chunk_hdr,
+			       GFP_ATOMIC))
+		goto nomem_init;
+
+	/* B) "Z" shall respond immediately with an INIT ACK chunk.  */
+
+	/* If there are errors that need to be reported for unknown
+	 * parameters, make sure to reserve enough room in the INIT ACK.
+	 */
+	len = 0;
+	if (err_chunk)
+		len = ntohs(err_chunk->chunk_hdr->length) -
+			sizeof(sctp_chunkhdr_t);
+
+	repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len);
+	if (!repl)
+		goto nomem_init;
+
+	/* If there are errors that need to be reported for unknown
+	 * parameters, include them in the outgoing INIT ACK as
+	 * "Unrecognized parameter" parameters.
+	 */
+	if (err_chunk) {
+		/* Get the "Unrecognized parameter" parameter(s) out of the
+		 * ERROR chunk generated by sctp_verify_init(). Since the
+		 * error cause code for "unknown parameter" and the
+		 * "Unrecognized parameter" type is the same, we can
+		 * construct the parameters in INIT ACK by copying the
+		 * ERROR causes over.
+		 */
+		unk_param = (sctp_unrecognized_param_t *)
+			    ((__u8 *)(err_chunk->chunk_hdr) +
+			    sizeof(sctp_chunkhdr_t));
+		/* Append the causes to the INIT ACK unchanged; 'len' is the
+		 * ERROR chunk length minus its chunk header, as set above.
+		 */
+		sctp_addto_chunk(repl, len, unk_param);
+		sctp_chunk_free(err_chunk);
+	}
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+	/*
+	 * Note:  After sending out INIT ACK with the State Cookie parameter,
+	 * "Z" MUST NOT allocate any resources, nor keep any states for the
+	 * new association.  Otherwise, "Z" will be vulnerable to resource
+	 * attacks.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+	return SCTP_DISPOSITION_DELETE_TCB;
+
+nomem_init:
+	sctp_association_free(new_asoc);
+nomem:
+	if (err_chunk)
+		sctp_chunk_free(err_chunk);
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Respond to a normal INIT ACK chunk.
+ * We are the side that is initiating the association.
+ *
+ * Section: 5.1 Normal Establishment of an Association, C
+ * C) Upon reception of the INIT ACK from "Z", "A" shall stop the T1-init
+ *    timer and leave COOKIE-WAIT state. "A" shall then send the State
+ *    Cookie received in the INIT ACK chunk in a COOKIE ECHO chunk, start
+ *    the T1-cookie timer, and enter the COOKIE-ECHOED state.
+ *
+ *    Note: The COOKIE ECHO chunk can be bundled with any pending outbound
+ *    DATA chunks, but it MUST be the first chunk in the packet and
+ *    until the COOKIE ACK is returned the sender MUST NOT send any
+ *    other packets to the peer.
+ *
+ * Verification Tag: 3.3.3
+ *   If the value of the Initiate Tag in a received INIT ACK chunk is
+ *   found to be 0, the receiver MUST treat it as an error and close the
+ *   association by transmitting an ABORT.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk) -- 'arg' is the received INIT ACK chunk.
+ *
+ * Outputs
+ * On success queues PEER_INIT, the init counter reset, T1-init stop,
+ * T1-cookie start, COOKIE_ECHOED state, ASSOC_SHKEY and GEN_COOKIE_ECHO
+ * and returns SCTP_DISPOSITION_CONSUME; bad chunks are discarded/aborted.
+ */
+sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type,
+				       void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_init_chunk_t *initchunk;
+	struct sctp_chunk *err_chunk;
+	struct sctp_packet *packet;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* 6.10 Bundling
+	 * An endpoint MUST NOT bundle INIT, INIT ACK or
+	 * SHUTDOWN COMPLETE with any other chunks.
+	 */
+	if (!chunk->singleton)
+		return sctp_sf_violation_chunk(ep, asoc, type, arg, commands);
+
+	/* Make sure that the INIT-ACK chunk has a valid length */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_initack_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+	/* Grab the INIT header.  */
+	chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data;
+
+	/* Verify the INIT ACK chunk before processing it. */
+	err_chunk = NULL;
+	if (!sctp_verify_init(asoc, chunk->chunk_hdr->type,
+			      (sctp_init_chunk_t *)chunk->chunk_hdr, chunk,
+			      &err_chunk)) {
+
+		sctp_error_t error = SCTP_ERROR_NO_RESOURCE;
+
+		/* This chunk contains a fatal error. It is to be discarded.
+		 * Send an ABORT, with causes.  If there are no causes,
+		 * then there wasn't enough memory.  Just terminate
+		 * the association.
+		 */
+		if (err_chunk) {
+			packet = sctp_abort_pkt_new(ep, asoc, arg,
+					(__u8 *)(err_chunk->chunk_hdr) +
+					sizeof(sctp_chunkhdr_t),
+					ntohs(err_chunk->chunk_hdr->length) -
+					sizeof(sctp_chunkhdr_t));
+
+			sctp_chunk_free(err_chunk);
+
+			if (packet) {
+				sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+						SCTP_PACKET(packet));
+				SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+				error = SCTP_ERROR_INV_PARAM;
+			}
+		}
+
+		/* SCTP-AUTH, Section 6.3:
+		 *    It should be noted that if the receiver wants to tear
+		 *    down an association in an authenticated way only, the
+		 *    handling of malformed packets should not result in
+		 *    tearing down the association.
+		 *
+		 * This means that if we only want to abort associations
+		 * in an authenticated way (i.e AUTH+ABORT), then we
+		 * can't destroy this association just because the packet
+		 * was malformed.
+		 */
+		if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		return sctp_stop_t1_and_abort(commands, error, ECONNREFUSED,
+						asoc, chunk->transport);
+	}
+
+	/* Tag the variable length parameters.  Note that we never
+	 * convert the parameters in an INIT chunk.
+	 */
+	chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t));
+
+	initchunk = (sctp_init_chunk_t *) chunk->chunk_hdr;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
+			SCTP_PEER_INIT(initchunk));
+
+	/* Reset init error count upon receipt of INIT-ACK.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+
+	/* 5.1 C) "A" shall stop the T1-init timer and leave
+	 * COOKIE-WAIT state.  "A" shall then ... start the T1-cookie
+	 * timer, and enter the COOKIE-ECHOED state.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_COOKIE_ECHOED));
+
+	/* SCTP-AUTH: generate the association shared keys so that
+	 * we can potentially sign the COOKIE-ECHO.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL());
+
+	/* 5.1 C) "A" shall then send the State Cookie received in the
+	 * INIT ACK chunk in a COOKIE ECHO chunk, ...
+	 */
+	/* If there are any errors to report, send the ERROR chunk generated
+	 * for unknown parameters as well.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_COOKIE_ECHO,
+			SCTP_CHUNK(err_chunk));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Respond to a normal COOKIE ECHO chunk.
+ * We are the side that is being asked for an association.
+ *
+ * Section: 5.1 Normal Establishment of an Association, D
+ * D) Upon reception of the COOKIE ECHO chunk, Endpoint "Z" will reply
+ *    with a COOKIE ACK chunk after building a TCB and moving to
+ *    the ESTABLISHED state. A COOKIE ACK chunk may be bundled with
+ *    any pending DATA chunks (and/or SACK chunks), but the COOKIE ACK
+ *    chunk MUST be the first chunk in the packet.
+ *
+ *   IMPLEMENTATION NOTE: An implementation may choose to send the
+ *   Communication Up notification to the SCTP user upon reception
+ *   of a valid COOKIE ECHO chunk.
+ *
+ * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules
+ * D) Rules for packet carrying a COOKIE ECHO
+ *
+ * - When sending a COOKIE ECHO, the endpoint MUST use the value of the
+ *   Initial Tag received in the INIT ACK.
+ *
+ * - The receiver of a COOKIE ECHO follows the procedures in Section 5.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk) -- 'arg' is the received COOKIE ECHO chunk.
+ *
+ * Outputs
+ * On success queues NEW_ASOC, ESTABLISHED state, HB timers, the COOKIE
+ * ACK reply and the ULP events and returns SCTP_DISPOSITION_CONSUME.
+ * Allocation failures free what was built and return _NOMEM.
+ */
+sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep,
+				      const struct sctp_association *asoc,
+				      const sctp_subtype_t type, void *arg,
+				      sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_association *new_asoc;
+	sctp_init_chunk_t *peer_init;
+	struct sctp_chunk *repl;
+	struct sctp_ulpevent *ev, *ai_ev = NULL;
+	int error = 0;
+	struct sctp_chunk *err_chk_p;
+	struct sock *sk;
+
+	/* If the packet is an OOTB packet which is temporarily on the
+	 * control endpoint, respond with an ABORT.
+	 */
+	if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) {
+		SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+	}
+
+	/* Make sure that the COOKIE_ECHO chunk has a valid length.
+	 * In this case, we check that we have enough for at least a
+	 * chunk header.  More detailed verification is done
+	 * in sctp_unpack_cookie().
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* If the endpoint is not listening or if the number of associations
+	 * on the TCP-style socket exceed the max backlog, respond with an
+	 * ABORT.
+	 */
+	sk = ep->base.sk;
+	if (!sctp_sstate(sk, LISTENING) ||
+	    (sctp_style(sk, TCP) && sk_acceptq_is_full(sk)))
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+
+	/* "Decode" the chunk.  We have no optional parameters so we
+	 * are in good shape.
+	 */
+	chunk->subh.cookie_hdr =
+		(struct sctp_signed_cookie *)chunk->skb->data;
+	if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
+					 sizeof(sctp_chunkhdr_t)))
+		goto nomem;
+
+	/* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint
+	 * "Z" will reply with a COOKIE ACK chunk after building a TCB
+	 * and moving to the ESTABLISHED state.
+	 */
+	new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error,
+				      &err_chk_p);
+
+	/* FIXME: If the re-build failed, what is the proper error path
+	 * from here?
+	 * [We should abort the association. --piggy]
+	 */
+	if (!new_asoc) {
+		/* FIXME: Several errors are possible.  A bad cookie should
+		 * be silently discarded, but think about logging it too.
+		 */
+		switch (error) {
+		case -SCTP_IERROR_NOMEM:
+			goto nomem;
+
+		case -SCTP_IERROR_STALE_COOKIE:
+			sctp_send_stale_cookie_err(ep, asoc, chunk, commands,
+						   err_chk_p);
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+		case -SCTP_IERROR_BAD_SIG:
+		default:
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+		}
+	}
+
+	/* Delay state machine commands until later.  Re-building the bind
+	 * address for the association is already done in
+	 * sctp_unpack_cookie().  This is a brand-new association, so the
+	 * steps below are not yet side effects--it is safe to run them here.
+	 */
+	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+
+	if (!sctp_process_init(new_asoc, chunk,
+			       &chunk->subh.cookie_hdr->c.peer_addr,
+			       peer_init, GFP_ATOMIC))
+		goto nomem_init;
+
+	/* SCTP-AUTH:  Now that we've populated the required fields in
+	 * sctp_process_init, set up the association shared keys as
+	 * necessary so that we can potentially authenticate the ACK
+	 */
+	error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC);
+	if (error)
+		goto nomem_init;
+
+	/* SCTP-AUTH:  auth_chunk pointer is only set when the cookie-echo
+	 * is supposed to be authenticated and we have to do delayed
+	 * authentication on the association just recreated from the
+	 * cookie.  The peer controls whether an AUTH chunk is present, so
+	 * never authenticate unless AUTH is enabled and was negotiated.
+	 */
+	if (chunk->auth_chunk) {
+		struct sctp_chunk auth;
+		sctp_ierror_t ret;
+
+		/* Make sure that we and the peer are AUTH capable */
+		if (!sctp_auth_enable || !new_asoc->peer.auth_capable) {
+			kfree_skb(chunk->auth_chunk);
+			sctp_association_free(new_asoc);
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+		}
+
+		/* set-up our fake chunk so that we can process it */
+		auth.skb = chunk->auth_chunk;
+		auth.asoc = chunk->asoc;
+		auth.sctp_hdr = chunk->sctp_hdr;
+		auth.chunk_hdr = (sctp_chunkhdr_t *)skb_push(chunk->auth_chunk,
+					    sizeof(sctp_chunkhdr_t));
+		skb_pull(chunk->auth_chunk, sizeof(sctp_chunkhdr_t));
+		auth.transport = chunk->transport;
+
+		ret = sctp_sf_authenticate(ep, new_asoc, type, &auth);
+		/* We can now safely free the auth_chunk clone */
+		kfree_skb(chunk->auth_chunk);
+
+		if (ret != SCTP_IERROR_NO_ERROR) {
+			sctp_association_free(new_asoc);
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+		}
+	}
+
+	repl = sctp_make_cookie_ack(new_asoc, chunk);
+	if (!repl)
+		goto nomem_init;
+
+	/* RFC 2960 5.1 Normal Establishment of an Association
+	 *
+	 * D) IMPLEMENTATION NOTE: An implementation may choose to
+	 * send the Communication Up notification to the SCTP user
+	 * upon reception of a valid COOKIE ECHO chunk.
+	 */
+	ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, SCTP_COMM_UP, 0,
+					     new_asoc->c.sinit_num_ostreams,
+					     new_asoc->c.sinit_max_instreams,
+					     NULL, GFP_ATOMIC);
+	if (!ev)
+		goto nomem_ev;
+
+	/* Sockets API Draft Section 5.3.1.6
+	 * When a peer sends an Adaptation Layer Indication parameter, SCTP
+	 * delivers this notification to inform the application of the
+	 * peer's requested adaptation layer.
+	 */
+	if (new_asoc->peer.adaptation_ind) {
+		ai_ev = sctp_ulpevent_make_adaptation_indication(new_asoc,
+							    GFP_ATOMIC);
+		if (!ai_ev)
+			goto nomem_aiev;
+	}
+
+	/* Add all the state machine commands now since we've created
+	 * everything.  This way we don't introduce memory corruptions
+	 * during side-effect processing and correctly count established
+	 * associations.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_ESTABLISHED));
+	SCTP_INC_STATS(SCTP_MIB_CURRESTAB);
+	SCTP_INC_STATS(SCTP_MIB_PASSIVEESTABS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
+
+	if (new_asoc->autoclose)
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+	/* This will send the COOKIE ACK */
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+	/* Queue the ASSOC_CHANGE event */
+	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+	/* Send up the Adaptation Layer Indication event */
+	if (ai_ev)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(ai_ev));
+
+	return SCTP_DISPOSITION_CONSUME;
+
+nomem_aiev:
+	sctp_ulpevent_free(ev);
+nomem_ev:
+	sctp_chunk_free(repl);
+nomem_init:
+	sctp_association_free(new_asoc);
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Respond to a normal COOKIE ACK chunk.
+ * We are the side that is initiating the association.
+ *
+ * RFC 2960 5.1 Normal Establishment of an Association
+ *
+ * E) Upon reception of the COOKIE ACK, endpoint "A" will move from the
+ *    COOKIE-ECHOED state to the ESTABLISHED state, stopping the T1-cookie
+ *    timer. It may also notify its ULP about the successful
+ *    establishment of the association with a Communication Up
+ *    notification (see Section 10).
+ *
+ * Verification Tag: checked with sctp_vtag_verify(); discard on mismatch.
+ * Inputs
+ * (endpoint, asoc, chunk) -- 'arg' is the received COOKIE ACK chunk.
+ *
+ * Outputs
+ * Queues the counter reset, T1-cookie stop, ESTABLISHED state, HB timers,
+ * optional autoclose timer and the COMM_UP/adaptation ULP events.
+ * Returns SCTP_DISPOSITION_CONSUME, or _NOMEM if an event allocation fails.
+ */
+sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep,
+				      const struct sctp_association *asoc,
+				      const sctp_subtype_t type, void *arg,
+				      sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_ulpevent *ev;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Verify that the chunk length for the COOKIE-ACK is OK.
+	 * If we don't do this, any bundled chunks may be junked.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Reset init error count upon receipt of COOKIE-ACK,
+	 * to avoid problems with the management of this
+	 * counter in stale cookie situations when a transition back
+	 * from the COOKIE-ECHOED state to the COOKIE-WAIT
+	 * state is performed.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+
+	/* RFC 2960 5.1 Normal Establishment of an Association
+	 *
+	 * E) Upon reception of the COOKIE ACK, endpoint "A" will move
+	 * from the COOKIE-ECHOED state to the ESTABLISHED state,
+	 * stopping the T1-cookie timer.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_ESTABLISHED));
+	SCTP_INC_STATS(SCTP_MIB_CURRESTAB);
+	SCTP_INC_STATS(SCTP_MIB_ACTIVEESTABS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
+	if (asoc->autoclose)
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+	/* It may also notify its ULP about the successful
+	 * establishment of the association with a Communication Up
+	 * notification (see Section 10).
+	 */
+	ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP,
+					     0, asoc->c.sinit_num_ostreams,
+					     asoc->c.sinit_max_instreams,
+					     NULL, GFP_ATOMIC);
+
+	if (!ev)
+		goto nomem;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+	/* Sockets API Draft Section 5.3.1.6
+	 * When a peer sends an Adaptation Layer Indication parameter, SCTP
+	 * delivers this notification to inform the application of the
+	 * peer's requested adaptation layer.
+	 */
+	if (asoc->peer.adaptation_ind) {
+		ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
+		if (!ev)
+			goto nomem;
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(ev));
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Build a HEARTBEAT chunk for the transport handed in through @arg and
+ * queue it for transmission.
+ *
+ * Returns SCTP_DISPOSITION_NOMEM when the chunk cannot be built,
+ * SCTP_DISPOSITION_CONSUME otherwise.
+ */
+static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep,
+					    const struct sctp_association *asoc,
+					    const sctp_subtype_t type,
+					    void *arg,
+					    sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *dest = arg;
+	struct sctp_chunk *hb;
+
+	hb = sctp_make_heartbeat(asoc, dest);
+	if (NULL == hb)
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* An RTT measurement starts with this heartbeat, so rto_pending
+	 * is raised on the transport before the chunk itself is queued.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_RTO_PENDING, SCTP_TRANSPORT(dest));
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(hb));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Heartbeat timer handler: send a HEARTBEAT on the transport given in
+ * @arg, or fail the association once the overall error count has
+ * reached max_retrans.
+ *
+ * Returns SCTP_DISPOSITION_DELETE_TCB when the association is failed,
+ * SCTP_DISPOSITION_NOMEM when the heartbeat cannot be built, and
+ * SCTP_DISPOSITION_CONSUME otherwise.
+ */
+sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *dest = arg;
+	sctp_disposition_t hb_status;
+
+	/* Too many errors on the association as a whole: give up. */
+	if (asoc->overall_error_count >= asoc->max_retrans) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		/* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	/* Section 3.3.5.
+	 * The Sender-specific Heartbeat Info field should normally include
+	 * information about the sender's current time when this HEARTBEAT
+	 * chunk is sent and the destination transport address to which this
+	 * HEARTBEAT is sent (see Section 8.3).
+	 */
+	if (dest->param_flags & SPP_HB_ENABLE) {
+		hb_status = sctp_sf_heartbeat(ep, asoc, type, arg, commands);
+		if (SCTP_DISPOSITION_NOMEM == hb_status)
+			return SCTP_DISPOSITION_NOMEM;
+
+		/* The transport and association error counters are
+		 * adjusted whenever a heartbeat goes out.
+		 */
+		sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
+				SCTP_TRANSPORT(dest));
+	}
+
+	/* These two are queued whether or not heartbeats are enabled. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE,
+			SCTP_TRANSPORT(dest));
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE,
+			SCTP_TRANSPORT(dest));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process an heartbeat request.
+ *
+ * Section: 8.3 Path Heartbeat
+ * The receiver of the HEARTBEAT should immediately respond with a
+ * HEARTBEAT ACK that contains the Heartbeat Information field copied
+ * from the received HEARTBEAT chunk.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ * When receiving an SCTP packet, the endpoint MUST ensure that the
+ * value in the Verification Tag field of the received SCTP packet
+ * matches its own Tag. If the received Verification Tag value does not
+ * match the receiver's own tag value, the receiver shall silently
+ * discard the packet and shall not process it any further except for
+ * those cases listed in Section 8.5.1 below.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_beat_8_3(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    void *arg,
+				    sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *hb = arg;
+	struct sctp_chunk *ack;
+	size_t info_len;
+
+	/* Wrong verification tag: drop the whole packet. */
+	if (!sctp_vtag_verify(hb, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* A HEARTBEAT shorter than its fixed part is a protocol violation. */
+	if (!sctp_chunk_length_valid(hb, sizeof(sctp_heartbeat_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* 8.3 The receiver of the HEARTBEAT should immediately
+	 * respond with a HEARTBEAT ACK that contains the Heartbeat
+	 * Information field copied from the received HEARTBEAT chunk.
+	 *
+	 * Everything after the chunk header is echoed back verbatim.
+	 */
+	hb->subh.hb_hdr = (sctp_heartbeathdr_t *) hb->skb->data;
+	info_len = ntohs(hb->chunk_hdr->length) - sizeof(sctp_chunkhdr_t);
+	if (!pskb_pull(hb->skb, info_len))
+		return SCTP_DISPOSITION_NOMEM;
+
+	ack = sctp_make_heartbeat_ack(asoc, hb, hb->subh.hb_hdr, info_len);
+	if (!ack)
+		return SCTP_DISPOSITION_NOMEM;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(ack));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process the returning HEARTBEAT ACK.
+ *
+ * Section: 8.3 Path Heartbeat
+ * Upon the receipt of the HEARTBEAT ACK, the sender of the HEARTBEAT
+ * should clear the error counter of the destination transport
+ * address to which the HEARTBEAT was sent, and mark the destination
+ * transport address as active if it is not so marked. The endpoint may
+ * optionally report to the upper layer when an inactive destination
+ * address is marked as active due to the reception of the latest
+ * HEARTBEAT ACK. The receiver of the HEARTBEAT ACK must also
+ * clear the association overall error count as well (as defined
+ * in section 8.1).
+ *
+ * The receiver of the HEARTBEAT ACK should also perform an RTT
+ * measurement for that destination transport address using the time
+ * value carried in the HEARTBEAT ACK chunk.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *ack = arg;
+	sctp_sender_hb_info_t *info;
+	struct sctp_transport *path;
+	union sctp_addr peer_addr;
+	unsigned long window;
+
+	if (!sctp_vtag_verify(ack, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* The HEARTBEAT-ACK must be large enough to hold the chunk header
+	 * plus the sender heartbeat info.
+	 */
+	if (!sctp_chunk_length_valid(ack, sizeof(sctp_chunkhdr_t) +
+					  sizeof(sctp_sender_hb_info_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	info = (sctp_sender_hb_info_t *) ack->skb->data;
+
+	/* The parameter has to be exactly the size of the info block. */
+	if (sizeof(sctp_sender_hb_info_t) != ntohs(info->param_hdr.length))
+		return SCTP_DISPOSITION_DISCARD;
+
+	/* Find the transport named by the destination address in the
+	 * echoed heartbeat info.
+	 */
+	peer_addr = info->daddr;
+	path = sctp_assoc_lookup_paddr(asoc, &peer_addr);
+
+	/* This should never happen, but lets log it if so.  */
+	if (unlikely(!path)) {
+		if (net_ratelimit()) {
+			if (peer_addr.sa.sa_family == AF_INET6)
+				pr_warn("%s association %p could not find address %pI6\n",
+					__func__,
+					asoc,
+					&peer_addr.v6.sin6_addr);
+			else
+				pr_warn("%s association %p could not find address %pI4\n",
+					__func__,
+					asoc,
+					&peer_addr.v4.sin_addr.s_addr);
+		}
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* The echoed 64-bit nonce must match the one held by the transport. */
+	if (path->hb_nonce != info->hb_nonce)
+		return SCTP_DISPOSITION_DISCARD;
+
+	/* A timestamp is accepted only if it is not in the future and not
+	 * older than one heartbeat interval plus one RTO.
+	 */
+	window = path->hbinterval + path->rto;
+
+	if (time_after(info->sent_at, jiffies) ||
+	    time_after(jiffies, info->sent_at + window)) {
+		SCTP_DEBUG_PRINTK("%s: HEARTBEAT ACK with invalid timestamp "
+				  "received for transport: %p\n",
+				   __func__, path);
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of
+	 * the HEARTBEAT should clear the error counter of the
+	 * destination transport address to which the HEARTBEAT was
+	 * sent and mark the destination transport address as active if
+	 * it is not so marked.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_ON, SCTP_TRANSPORT(path));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Helper for the restart condition: build an ABORT carrying a
+ * "restart" error cause for address @ssa and queue it for sending,
+ * then ask for the rest of the inbound packet to be discarded.
+ *
+ * Always returns 0, including when the ABORT packet cannot be
+ * allocated, so that the caller drops the packet either way.
+ */
+static int sctp_sf_send_restart_abort(union sctp_addr *ssa,
+				      struct sctp_chunk *init,
+				      sctp_cmd_seq_t *commands)
+{
+	/* The error cause is built on the stack.  We are way too malloc
+	 * crazy throughout the code today.
+	 */
+	char buffer[sizeof(struct sctp_errhdr)+sizeof(union sctp_addr_param)];
+	struct sctp_errhdr *cause = (struct sctp_errhdr *)buffer;
+	union sctp_addr_param *addr_parm =
+				(union sctp_addr_param *)cause->variable;
+	struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family);
+	struct sctp_endpoint *ctl_ep;
+	struct sctp_packet *abort_pkt;
+	int cause_len;
+
+	/* Convert the address to parameter format; the cause length is
+	 * the parameter plus the error header in front of it.
+	 */
+	cause_len = af->to_addr_param(ssa, addr_parm);
+	cause_len += sizeof(sctp_errhdr_t);
+
+	cause->cause = SCTP_ERROR_RESTART;
+	cause->length = htons(cause_len);
+
+	/* The ABORT is sent from the control socket's endpoint. */
+	ctl_ep = sctp_sk((sctp_get_ctl_sock()))->ep;
+
+	/* Association is NULL since this may be a restart attack and we
+	 * want to send back the attacker's vtag.
+	 */
+	abort_pkt = sctp_abort_pkt_new(ctl_ep, NULL, init, cause, cause_len);
+	if (abort_pkt) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+				SCTP_PACKET(abort_pkt));
+
+		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+
+		/* Discard the rest of the inbound packet. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,
+				SCTP_NULL());
+	}
+
+	/* Even if there is no memory, treat as a failure so
+	 * the packet will get dropped.
+	 */
+	return 0;
+}
+
+/* Report whether any transport on @list has an address that matches
+ * @ipaddr exactly.
+ */
+static bool list_has_sctp_addr(const struct list_head *list,
+			       union sctp_addr *ipaddr)
+{
+	struct sctp_transport *t;
+	bool found = false;
+
+	list_for_each_entry(t, list, transports) {
+		if (sctp_cmp_addr_exact(ipaddr, &t->ipaddr)) {
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+/* A restart is occurring, check to make sure no new addresses
+ * are being added as we may be under a takeover attack.
+ *
+ * Returns 1 when every peer address of @new_asoc is already a peer
+ * address of @asoc.  Returns 0 as soon as an unknown address is found,
+ * after sending a restart ABORT for that address.
+ */
+static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
+				       const struct sctp_association *asoc,
+				       struct sctp_chunk *init,
+				       sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *cand;
+
+	/* Implementor's Guide - Section 5.2.2
+	 * ...
+	 * Before responding the endpoint MUST check to see if the
+	 * unexpected INIT adds new addresses to the association. If new
+	 * addresses are added to the association, the endpoint MUST respond
+	 * with an ABORT..
+	 */
+	list_for_each_entry(cand, &new_asoc->peer.transport_addr_list,
+			    transports) {
+		if (list_has_sctp_addr(&asoc->peer.transport_addr_list,
+				       &cand->ipaddr))
+			continue;
+
+		/* First unknown address: abort and stop looking. */
+		sctp_sf_send_restart_abort(&cand->ipaddr, init, commands);
+		return 0;
+	}
+
+	/* Every address was already known. */
+	return 1;
+}
+
+/* Populate the verification/tie tags of @new_asoc from the existing
+ * association @asoc for an overlapping INIT scenario.
+ *
+ * Note: Do not use in CLOSED or SHUTDOWN-ACK-SENT state.
+ */
+static void sctp_tietags_populate(struct sctp_association *new_asoc,
+				  const struct sctp_association *asoc)
+{
+	/* The local tie-tag is our current verification tag in every
+	 * state handled here.
+	 */
+	new_asoc->c.my_ttag = asoc->c.my_vtag;
+
+	if (asoc->state == SCTP_STATE_COOKIE_WAIT) {
+		/* 5.2.1 INIT received in COOKIE-WAIT State: keep our own
+		 * tag and leave the peer tie-tag at zero.
+		 */
+		new_asoc->c.my_vtag   = asoc->c.my_vtag;
+		new_asoc->c.peer_ttag = 0;
+	} else {
+		/* 5.2.1 INIT received in COOKIE-ECHOED State: our own tag
+		 * is kept as well.
+		 */
+		if (asoc->state == SCTP_STATE_COOKIE_ECHOED)
+			new_asoc->c.my_vtag = asoc->c.my_vtag;
+
+		/* COOKIE-ECHOED, and 5.2.2 Unexpected INIT in States Other
+		 * than CLOSED, COOKIE-ECHOED, COOKIE-WAIT and
+		 * SHUTDOWN-ACK-SENT: the peer tie-tag is the peer's tag.
+		 */
+		new_asoc->c.peer_ttag = asoc->c.peer_vtag;
+	}
+
+	/* Other parameters for the endpoint SHOULD be copied from the
+	 * existing parameters of the association (e.g. number of
+	 * outbound streams) into the INIT ACK and cookie.
+	 */
+	new_asoc->c.initial_tsn         = asoc->c.initial_tsn;
+	new_asoc->c.sinit_max_instreams = asoc->c.sinit_max_instreams;
+	new_asoc->c.sinit_num_ostreams  = asoc->c.sinit_num_ostreams;
+	new_asoc->rwnd                  = asoc->rwnd;
+}
+
+/*
+ * Compare vtag/tietag values to determine unexpected COOKIE-ECHO
+ * handling action.
+ *
+ * RFC 2960 5.2.4 Handle a COOKIE ECHO when a TCB exists.
+ *
+ * Returns one of 'A', 'B', 'C' or 'D', corresponding to the
+ * Action/Description values in RFC 2960, Table 2, or 'E' when no
+ * special case matches and the packet should be discarded.
+ */
+static char sctp_tietags_compare(struct sctp_association *new_asoc,
+				 const struct sctp_association *asoc)
+{
+	const int my_same   = (asoc->c.my_vtag == new_asoc->c.my_vtag);
+	const int peer_same = (asoc->c.peer_vtag == new_asoc->c.peer_vtag);
+
+	if (my_same) {
+		/* Collision case B: the peer's tag differs, or we have
+		 * no peer tag recorded yet.
+		 */
+		if (!peer_same || (0 == asoc->c.peer_vtag))
+			return 'B';
+
+		/* Collision case D: both tags match. */
+		return 'D';
+	}
+
+	if (!peer_same) {
+		/* Case A: both tags differ but the tie-tags match the
+		 * existing tags, so the peer may have restarted.
+		 */
+		if ((asoc->c.my_vtag == new_asoc->c.my_ttag) &&
+		    (asoc->c.peer_vtag == new_asoc->c.peer_ttag))
+			return 'A';
+
+		return 'E';
+	}
+
+	/* Collision case C: only our tag differs and both tie-tags
+	 * are zero.
+	 */
+	if ((0 == new_asoc->c.my_ttag) && (0 == new_asoc->c.peer_ttag))
+		return 'C';
+
+	/* No match to any of the special cases; discard this packet. */
+	return 'E';
+}
+
+/* Common helper routine for both duplicate and simultaneous INIT
+ * chunk handling.
+ *
+ * @ep:       endpoint the INIT arrived on
+ * @asoc:     existing association the INIT collides with
+ * @type:     event subtype, passed through to the discard/abort helpers
+ * @arg:      the received INIT chunk (struct sctp_chunk *)
+ * @commands: side-effect command sequence to append to
+ *
+ * Validates the INIT, builds a temporary association from it and
+ * replies with an INIT ACK; the temporary association is deleted again
+ * once the INIT ACK has been queued.  A fatally broken INIT is answered
+ * with an ABORT instead.
+ *
+ * Returns the disposition of the chunk: SCTP_DISPOSITION_CONSUME when a
+ * reply (INIT ACK or ABORT) was queued, SCTP_DISPOSITION_NOMEM on
+ * allocation failure, or whatever the discard/abort/violation helper
+ * returned for an early rejection.
+ */
+static sctp_disposition_t sctp_sf_do_unexpected_init(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg, sctp_cmd_seq_t *commands)
+{
+	sctp_disposition_t retval;
+	struct sctp_chunk *chunk = arg;
+	struct sctp_chunk *repl;
+	struct sctp_association *new_asoc;
+	struct sctp_chunk *err_chunk;
+	struct sctp_packet *packet;
+	sctp_unrecognized_param_t *unk_param;
+	int len;
+
+	/* 6.10 Bundling
+	 * An endpoint MUST NOT bundle INIT, INIT ACK or
+	 * SHUTDOWN COMPLETE with any other chunks.
+	 *
+	 * IG Section 2.11.2
+	 * Furthermore, we require that the receiver of an INIT chunk MUST
+	 * enforce these rules by silently discarding an arriving packet
+	 * with an INIT chunk that is bundled with other chunks.
+	 */
+	if (!chunk->singleton)
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* 3.1 A packet containing an INIT chunk MUST have a zero Verification
+	 * Tag.
+	 */
+	if (chunk->sctp_hdr->vtag != 0)
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+
+	/* Make sure that the INIT chunk has a valid length.
+	 * In this case, we generate a protocol violation since we have
+	 * an association established.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+	/* Grab the INIT header.  */
+	chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data;
+
+	/* Tag the variable length parameters.  */
+	chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t));
+
+	/* Verify the INIT chunk before processing it.  On return
+	 * err_chunk, if set, holds the error causes to report; it must be
+	 * freed on every exit path below.
+	 */
+	err_chunk = NULL;
+	if (!sctp_verify_init(asoc, chunk->chunk_hdr->type,
+			      (sctp_init_chunk_t *)chunk->chunk_hdr, chunk,
+			      &err_chunk)) {
+		/* This chunk contains fatal error. It is to be discarded.
+		 * Send an ABORT, with causes if there is any.
+		 */
+		if (err_chunk) {
+			packet = sctp_abort_pkt_new(ep, asoc, arg,
+					(__u8 *)(err_chunk->chunk_hdr) +
+					sizeof(sctp_chunkhdr_t),
+					ntohs(err_chunk->chunk_hdr->length) -
+					sizeof(sctp_chunkhdr_t));
+
+			if (packet) {
+				sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+						SCTP_PACKET(packet));
+				SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+				retval = SCTP_DISPOSITION_CONSUME;
+			} else {
+				retval = SCTP_DISPOSITION_NOMEM;
+			}
+			goto cleanup;
+		} else {
+			return sctp_sf_tabort_8_4_8(ep, asoc, type, arg,
+						    commands);
+		}
+	}
+
+	/*
+	 * Other parameters for the endpoint SHOULD be copied from the
+	 * existing parameters of the association (e.g. number of
+	 * outbound streams) into the INIT ACK and cookie.
+	 * FIXME:  We are copying parameters from the endpoint not the
+	 * association.
+	 */
+	new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC);
+	if (!new_asoc)
+		goto nomem;
+
+	if (sctp_assoc_set_bind_addr_from_ep(new_asoc,
+				sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0)
+		goto nomem;
+
+	/* In the outbound INIT ACK the endpoint MUST copy its current
+	 * Verification Tag and Peers Verification tag into a reserved
+	 * place (local tie-tag and peer tie-tag) within the state cookie.
+	 */
+	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk),
+			       (sctp_init_chunk_t *)chunk->chunk_hdr,
+			       GFP_ATOMIC))
+		goto nomem;
+
+	/* Make sure no new addresses are being added during the
+	 * restart.   Do not do this check for COOKIE-WAIT state,
+	 * since there are no peer addresses to check against.
+	 * Upon return an ABORT will have been sent if needed.
+	 */
+	if (!sctp_state(asoc, COOKIE_WAIT)) {
+		if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk,
+						 commands)) {
+			retval = SCTP_DISPOSITION_CONSUME;
+			goto nomem_retval;
+		}
+	}
+
+	sctp_tietags_populate(new_asoc, asoc);
+
+	/* B) "Z" shall respond immediately with an INIT ACK chunk.  */
+
+	/* If there are errors need to be reported for unknown parameters,
+	 * make sure to reserve enough room in the INIT ACK for them.
+	 */
+	len = 0;
+	if (err_chunk) {
+		len = ntohs(err_chunk->chunk_hdr->length) -
+			sizeof(sctp_chunkhdr_t);
+	}
+
+	repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len);
+	if (!repl)
+		goto nomem;
+
+	/* If there are errors need to be reported for unknown parameters,
+	 * include them in the outgoing INIT ACK as "Unrecognized parameter"
+	 * parameter.
+	 */
+	if (err_chunk) {
+		/* Get the "Unrecognized parameter" parameter(s) out of the
+		 * ERROR chunk generated by sctp_verify_init(). Since the
+		 * error cause code for "unknown parameter" and the
+		 * "Unrecognized parameter" type is the same, we can
+		 * construct the parameters in INIT ACK by copying the
+		 * ERROR causes over.
+		 */
+		unk_param = (sctp_unrecognized_param_t *)
+			    ((__u8 *)(err_chunk->chunk_hdr) +
+			    sizeof(sctp_chunkhdr_t));
+		/* Replace the cause code with the "Unrecognized parameter"
+		 * parameter type.
+		 */
+		sctp_addto_chunk(repl, len, unk_param);
+	}
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+
+	/*
+	 * Note: After sending out INIT ACK with the State Cookie parameter,
+	 * "Z" MUST NOT allocate any resources for this new association.
+	 * Otherwise, "Z" will be vulnerable to resource attacks.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+	retval = SCTP_DISPOSITION_CONSUME;
+
+	/* The causes have been copied into the INIT ACK, so err_chunk is
+	 * no longer needed; release it through the common cleanup path
+	 * rather than leaking it.  new_asoc is left alone here because it
+	 * has been handed to the command sequence above.
+	 */
+	goto cleanup;
+
+nomem:
+	retval = SCTP_DISPOSITION_NOMEM;
+nomem_retval:
+	if (new_asoc)
+		sctp_association_free(new_asoc);
+cleanup:
+	if (err_chunk)
+		sctp_chunk_free(err_chunk);
+	return retval;
+}
+
+/*
+ * Handle simultaneous INIT.
+ * This means we started an INIT and then we got an INIT request from
+ * our peer.
+ *
+ * Section: 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State (Item B)
+ * This usually indicates an initialization collision, i.e., each
+ * endpoint is attempting, at about the same time, to establish an
+ * association with the other endpoint.
+ *
+ * Upon receipt of an INIT in the COOKIE-WAIT or COOKIE-ECHOED state, an
+ * endpoint MUST respond with an INIT ACK using the same parameters it
+ * sent in its original INIT chunk (including its Verification Tag,
+ * unchanged). These original parameters are combined with those from the
+ * newly received INIT chunk. The endpoint shall also generate a State
+ * Cookie with the INIT ACK. The endpoint uses the parameters sent in its
+ * INIT to calculate the State Cookie.
+ *
+ * After that, the endpoint MUST NOT change its state, the T1-init
+ * timer shall be left running and the corresponding TCB MUST NOT be
+ * destroyed. The normal procedures for handling State Cookies when
+ * a TCB exists will resolve the duplicate INITs to a single association.
+ *
+ * For an endpoint that is in the COOKIE-ECHOED state it MUST populate
+ * its Tie-Tags with the Tag information of itself and its peer (see
+ * section 5.2.2 for a description of the Tie-Tags).
+ *
+ * Verification Tag: Not explicit, but an INIT can not have a valid
+ * verification tag, so we skip the check.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_2_1_siminit(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    void *arg,
+				    sctp_cmd_seq_t *commands)
+{
+	sctp_disposition_t retval;
+
+	/* Simultaneous and duplicate INIT chunks are handled by the
+	 * same helper, which does the real work.
+	 */
+	retval = sctp_sf_do_unexpected_init(ep, asoc, type, arg, commands);
+
+	return retval;
+}
+
+/*
+ * Handle duplicated INIT messages.  These are usually delayed
+ * retransmissions.
+ *
+ * Section: 5.2.2 Unexpected INIT in States Other than CLOSED,
+ * COOKIE-ECHOED and COOKIE-WAIT
+ *
+ * Unless otherwise stated, upon reception of an unexpected INIT for
+ * this association, the endpoint shall generate an INIT ACK with a
+ * State Cookie.  In the outbound INIT ACK the endpoint MUST copy its
+ * current Verification Tag and peer's Verification Tag into a reserved
+ * place within the state cookie.  We shall refer to these locations as
+ * the Peer's-Tie-Tag and the Local-Tie-Tag.  The outbound SCTP packet
+ * containing this INIT ACK MUST carry a Verification Tag value equal to
+ * the Initiation Tag found in the unexpected INIT.  And the INIT ACK
+ * MUST contain a new Initiation Tag (randomly generated see Section
+ * 5.3.1).  Other parameters for the endpoint SHOULD be copied from the
+ * existing parameters of the association (e.g. number of outbound
+ * streams) into the INIT ACK and cookie.
+ *
+ * After sending out the INIT ACK, the endpoint shall take no further
+ * actions, i.e., the existing association, including its current state,
+ * and the corresponding TCB MUST NOT be changed.
+ *
+ * Note: Only when a TCB exists and the association is not in a COOKIE-
+ * WAIT state are the Tie-Tags populated.  For a normal association INIT
+ * (i.e. the endpoint is in a COOKIE-WAIT state), the Tie-Tags MUST be
+ * set to 0 (indicating that no previous TCB existed).  The INIT ACK and
+ * State Cookie are populated as specified in section 5.2.1.
+ *
+ * Verification Tag: Not specified, but an INIT has no way of knowing
+ * what the verification tag could be, so we ignore it.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_2_2_dupinit(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	sctp_disposition_t retval;
+
+	/* Duplicate and simultaneous INIT chunks are handled by the
+	 * same helper, which does the real work.
+	 */
+	retval = sctp_sf_do_unexpected_init(ep, asoc, type, arg, commands);
+
+	return retval;
+}
+
+
+/*
+ * Unexpected INIT-ACK handler.
+ *
+ * Section 5.2.3
+ * If an INIT ACK received by an endpoint in any state other than the
+ * COOKIE-WAIT state, the endpoint should discard the INIT ACK chunk.
+ * An unexpected INIT ACK usually indicates the processing of an old or
+ * duplicated INIT chunk.
+*/
+sctp_disposition_t sctp_sf_do_5_2_3_initack(const struct sctp_endpoint *ep,
+					    const struct sctp_association *asoc,
+					    const sctp_subtype_t type,
+					    void *arg, sctp_cmd_seq_t *commands)
+{
+	const struct sctp_endpoint *ctl_ep = sctp_sk((sctp_get_ctl_sock()))->ep;
+
+	/* Per section 5.2.3 the chunk is simply discarded when it arrived
+	 * on a real endpoint.
+	 */
+	if (ep != ctl_ep)
+		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
+
+	/* It arrived on the control socket's endpoint, so this is an
+	 * OOTB INIT-ACK; treat it as such.
+	 */
+	return sctp_sf_ootb(ep, asoc, type, arg, commands);
+}
+
+/* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A')
+ *
+ * Section 5.2.4
+ *  A)  In this case, the peer may have restarted.
+ *
+ * Returns SCTP_DISPOSITION_NOMEM on allocation failure and
+ * SCTP_DISPOSITION_CONSUME otherwise.
+ */
+static sctp_disposition_t sctp_sf_do_dupcook_a(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					struct sctp_chunk *chunk,
+					sctp_cmd_seq_t *commands,
+					struct sctp_association *new_asoc)
+{
+	sctp_init_chunk_t *peer_init;
+	struct sctp_ulpevent *restart_ev;
+	struct sctp_chunk *cookie_ack;
+	struct sctp_chunk *op_err;
+
+	/* new_asoc is a brand-new association, so these are not yet
+	 * side effects--it is safe to run them here.
+	 */
+	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+
+	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
+			       GFP_ATOMIC))
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* Make sure no new addresses are being added during the
+	 * restart.  Though this is a pretty complicated attack
+	 * since you'd have to get inside the cookie.
+	 */
+	if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands))
+		return SCTP_DISPOSITION_CONSUME;
+
+	/* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes
+	 * the peer has restarted (Action A), it MUST NOT setup a new
+	 * association but instead resend the SHUTDOWN ACK and send an ERROR
+	 * chunk with a "Cookie Received while Shutting Down" error cause to
+	 * its peer.
+	 */
+	if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) {
+		if (SCTP_DISPOSITION_NOMEM ==
+		    sctp_sf_do_9_2_reshutack(ep, asoc,
+				SCTP_ST_CHUNK(chunk->chunk_hdr->type),
+				chunk, commands))
+			return SCTP_DISPOSITION_NOMEM;
+
+		/* The ERROR chunk is best effort: it is only queued if
+		 * it could be allocated.
+		 */
+		op_err = sctp_make_op_error(asoc, chunk,
+					    SCTP_ERROR_COOKIE_IN_SHUTDOWN,
+					    NULL, 0, 0);
+		if (op_err)
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(op_err));
+
+		return SCTP_DISPOSITION_CONSUME;
+	}
+
+	/* For now, stop pending T3-rtx and SACK timers, fail any unsent/unacked
+	 * data. Consider the optional choice of resending of this data.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+	sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL());
+
+	/* Stop pending T4-rto timer, teardown ASCONF queue, ASCONF-ACK queue
+	 * and ASCONF-ACK cache.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+	sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL());
+
+	cookie_ack = sctp_make_cookie_ack(new_asoc, chunk);
+	if (!cookie_ack)
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* Report association restart to upper layer. */
+	restart_ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0,
+					     new_asoc->c.sinit_num_ostreams,
+					     new_asoc->c.sinit_max_instreams,
+					     NULL, GFP_ATOMIC);
+	if (!restart_ev) {
+		/* The COOKIE ACK has not been queued yet, so it is still
+		 * ours to free.
+		 */
+		sctp_chunk_free(cookie_ack);
+		return SCTP_DISPOSITION_NOMEM;
+	}
+
+	/* Update the content of current association. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cookie_ack));
+	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+			SCTP_ULPEVENT(restart_ev));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'B')
+ *
+ * Section 5.2.4
+ *   B) In this case, both sides may be attempting to start an association
+ *      at about the same time but the peer endpoint started its INIT
+ *      after responding to the local endpoint's INIT
+ *
+ * This case represents an initialization collision.
+ *
+ * Returns SCTP_DISPOSITION_NOMEM on allocation failure and
+ * SCTP_DISPOSITION_CONSUME otherwise.
+ */
+static sctp_disposition_t sctp_sf_do_dupcook_b(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					struct sctp_chunk *chunk,
+					sctp_cmd_seq_t *commands,
+					struct sctp_association *new_asoc)
+{
+	sctp_init_chunk_t *peer_init;
+	struct sctp_chunk *cookie_ack;
+
+	/* new_asoc is a brand-new association, so these are not yet
+	 * side effects--it is safe to run them here.
+	 */
+	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
+			       GFP_ATOMIC))
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* Update the content of current association and move it to
+	 * ESTABLISHED.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc));
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_ESTABLISHED));
+	SCTP_INC_STATS(SCTP_MIB_CURRESTAB);
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
+
+	cookie_ack = sctp_make_cookie_ack(new_asoc, chunk);
+	if (!cookie_ack)
+		return SCTP_DISPOSITION_NOMEM;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cookie_ack));
+
+	/* RFC 2960 5.1 Normal Establishment of an Association
+	 *
+	 * D) IMPLEMENTATION NOTE: An implementation may choose to
+	 * send the Communication Up notification to the SCTP user
+	 * upon reception of a valid COOKIE ECHO chunk.
+	 *
+	 * Sadly, this needs to be implemented as a side-effect, because
+	 * we are not guaranteed to have set the association id of the real
+	 * association and so these notifications need to be delayed until
+	 * the association id is allocated.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_CHANGE, SCTP_U8(SCTP_COMM_UP));
+
+	/* Sockets API Draft Section 5.3.1.6
+	 * When a peer sends an Adaptation Layer Indication parameter, SCTP
+	 * delivers this notification to inform the application of the
+	 * peer's requested adaptation layer.
+	 *
+	 * This also needs to be done as a side effect for the same reason as
+	 * above.
+	 */
+	if (asoc->peer.adaptation_ind)
+		sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL());
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'C')
+ *
+ * Section 5.2.4
+ *  C) In this case, the local endpoint's cookie has arrived late.
+ *     Before it arrived, the local endpoint sent an INIT and received an
+ *     INIT-ACK and finally sent a COOKIE ECHO with the peer's same tag
+ *     but a new tag of its own.
+ *
+ * None of the parameters are used; the function always returns
+ * SCTP_DISPOSITION_DISCARD and queues no commands.
+ */
+/* This case represents an initialization collision.  */
+static sctp_disposition_t sctp_sf_do_dupcook_c(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					struct sctp_chunk *chunk,
+					sctp_cmd_seq_t *commands,
+					struct sctp_association *new_asoc)
+{
+	/* The cookie should be silently discarded.
+	 * The endpoint SHOULD NOT change states and should leave
+	 * any timers running.
+	 */
+	return SCTP_DISPOSITION_DISCARD;
+}
+
+/* Unexpected COOKIE-ECHO handler lost chunk (Table 2, action 'D')
+ *
+ * Section 5.2.4
+ *
+ * D) When both local and remote tags match the endpoint should always
+ *    enter the ESTABLISHED state, if it has not already done so.
+ *
+ * This case represents an initialization collision.
+ *
+ * Returns SCTP_DISPOSITION_NOMEM on allocation failure and
+ * SCTP_DISPOSITION_CONSUME otherwise.
+ */
+static sctp_disposition_t sctp_sf_do_dupcook_d(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					struct sctp_chunk *chunk,
+					sctp_cmd_seq_t *commands,
+					struct sctp_association *new_asoc)
+{
+	struct sctp_ulpevent *up_ev = NULL;
+	struct sctp_ulpevent *adapt_ev = NULL;
+	struct sctp_chunk *cookie_ack;
+
+	/* Clarification from Implementor's Guide:
+	 * D) When both local and remote tags match the endpoint should
+	 * enter the ESTABLISHED state, if it is in the COOKIE-ECHOED state.
+	 * It should stop any cookie timer that may be running and send
+	 * a COOKIE ACK.
+	 */
+
+	/* Don't accidentally move back into established state. */
+	if (asoc->state < SCTP_STATE_ESTABLISHED) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+		sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+				SCTP_STATE(SCTP_STATE_ESTABLISHED));
+		SCTP_INC_STATS(SCTP_MIB_CURRESTAB);
+		sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START,
+				SCTP_NULL());
+
+		/* RFC 2960 5.1 Normal Establishment of an Association
+		 *
+		 * D) IMPLEMENTATION NOTE: An implementation may choose
+		 * to send the Communication Up notification to the
+		 * SCTP user upon reception of a valid COOKIE
+		 * ECHO chunk.
+		 */
+		up_ev = sctp_ulpevent_make_assoc_change(asoc, 0,
+					     SCTP_COMM_UP, 0,
+					     asoc->c.sinit_num_ostreams,
+					     asoc->c.sinit_max_instreams,
+					     NULL, GFP_ATOMIC);
+		if (!up_ev)
+			goto free_events;
+
+		/* Sockets API Draft Section 5.3.1.6
+		 * When a peer sends an Adaptation Layer Indication
+		 * parameter, SCTP delivers this notification to inform
+		 * the application of the peer's requested adaptation layer.
+		 */
+		if (asoc->peer.adaptation_ind) {
+			adapt_ev = sctp_ulpevent_make_adaptation_indication(
+							asoc, GFP_ATOMIC);
+			if (!adapt_ev)
+				goto free_events;
+		}
+	}
+
+	cookie_ack = sctp_make_cookie_ack(new_asoc, chunk);
+	if (!cookie_ack)
+		goto free_events;
+
+	/* The COOKIE ACK is queued first, then whichever notifications
+	 * were built above.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cookie_ack));
+
+	if (up_ev)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(up_ev));
+	if (adapt_ev)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(adapt_ev));
+
+	return SCTP_DISPOSITION_CONSUME;
+
+free_events:
+	/* Nothing built here has been queued yet, so release it. */
+	if (adapt_ev)
+		sctp_ulpevent_free(adapt_ev);
+	if (up_ev)
+		sctp_ulpevent_free(up_ev);
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Handle a duplicate COOKIE-ECHO.  This usually means a cookie-carrying
+ * chunk was retransmitted and then delayed in the network.
+ *
+ * Section: 5.2.4 Handle a COOKIE ECHO when a TCB exists
+ *
+ * Verification Tag: None.  Do cookie validation.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_5_2_4_dupcook(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	sctp_disposition_t retval;
+	struct sctp_chunk *chunk = arg;
+	struct sctp_association *new_asoc;
+	int error = 0;
+	char action;
+	struct sctp_chunk *err_chk_p;
+
+	/* Make sure that the chunk has a valid length from the protocol
+	 * perspective.  In this case check to make sure we have at least
+	 * enough for the chunk header.  Cookie length verification is
+	 * done later.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* "Decode" the chunk.  We have no optional parameters so we
+	 * are in good shape.
+	 */
+	chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data;
+	if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
+					sizeof(sctp_chunkhdr_t)))
+		goto nomem;
+
+	/* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie
+	 * of a duplicate COOKIE ECHO match the Verification Tags of the
+	 * current association, consider the State Cookie valid even if
+	 * the lifespan is exceeded.
+	 */
+	new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error,
+				      &err_chk_p);
+
+	/* FIXME:
+	 * If the re-build failed, what is the proper error path
+	 * from here?
+	 *
+	 * [We should abort the association. --piggy]
+	 */
+	if (!new_asoc) {
+		/* FIXME: Several errors are possible.  A bad cookie should
+		 * be silently discarded, but think about logging it too.
+		 */
+		switch (error) {
+		case -SCTP_IERROR_NOMEM:
+			goto nomem;
+
+		case -SCTP_IERROR_STALE_COOKIE:
+			sctp_send_stale_cookie_err(ep, asoc, chunk, commands,
+						   err_chk_p);
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+		case -SCTP_IERROR_BAD_SIG:
+		default:
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+		}
+	}
+
+	/* Compare the tie_tag in cookie with the verification tag of
+	 * current association.
+	 */
+	action = sctp_tietags_compare(new_asoc, asoc);
+
+	switch (action) {
+	case 'A': /* Association restart. */
+		retval = sctp_sf_do_dupcook_a(ep, asoc, chunk, commands,
+					      new_asoc);
+		break;
+
+	case 'B': /* Collision case B. */
+		retval = sctp_sf_do_dupcook_b(ep, asoc, chunk, commands,
+					      new_asoc);
+		break;
+
+	case 'C': /* Collision case C. */
+		retval = sctp_sf_do_dupcook_c(ep, asoc, chunk, commands,
+					      new_asoc);
+		break;
+
+	case 'D': /* Collision case D. */
+		retval = sctp_sf_do_dupcook_d(ep, asoc, chunk, commands,
+					      new_asoc);
+		break;
+
+	default: /* Discard packet for all others. */
+		retval = sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+		break;
+	}
+
+	/* Delete the temporary new association. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+	return retval;
+
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Process an ABORT.  (SHUTDOWN-PENDING state)
+ *
+ * See sctp_sf_do_9_1_abort().
+ */
+sctp_disposition_t sctp_sf_shutdown_pending_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	if (!sctp_vtag_verify_either(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ABORT chunk has a valid length.
+	 * Since this is an ABORT chunk, we have to discard it
+	 * because of the following text:
+	 * RFC 2960, Section 3.3.7
+	 *    If an endpoint receives an ABORT with a format error or for an
+	 *    association that doesn't exist, it MUST silently discard it.
+	 * Because the length is "invalid", we can't discard just the chunk,
+	 * as we do not know its true length.  So, to be safe, discard the
+	 * packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* ADD-IP: Special case for ABORT chunks
+	 * F4)  One special consideration is that ABORT Chunks arriving
+	 * destined to the IP address being deleted MUST be
+	 * ignored (see Section 5.3.1 for further details).
+	 */
+	if (SCTP_ADDR_DEL ==
+		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
+		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
+
+	return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process an ABORT.  (SHUTDOWN-SENT state)
+ *
+ * See sctp_sf_do_9_1_abort().
+ */
+sctp_disposition_t sctp_sf_shutdown_sent_abort(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	if (!sctp_vtag_verify_either(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ABORT chunk has a valid length.
+	 * Since this is an ABORT chunk, we have to discard it
+	 * because of the following text:
+	 * RFC 2960, Section 3.3.7
+	 *    If an endpoint receives an ABORT with a format error or for an
+	 *    association that doesn't exist, it MUST silently discard it.
+	 * Because the length is "invalid", we can't discard just the chunk,
+	 * as we do not know its true length.  So, to be safe, discard the
+	 * packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* ADD-IP: Special case for ABORT chunks
+	 * F4)  One special consideration is that ABORT Chunks arriving
+	 * destined to the IP address being deleted MUST be
+	 * ignored (see Section 5.3.1 for further details).
+	 */
+	if (SCTP_ADDR_DEL ==
+		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
+		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
+
+	/* Stop the T2-shutdown timer. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	/* Stop the T5-shutdown guard timer.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+	return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process an ABORT.  (SHUTDOWN-ACK-SENT state)
+ *
+ * See sctp_sf_do_9_1_abort().
+ */
+sctp_disposition_t sctp_sf_shutdown_ack_sent_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* This state uses the same T2 timer, so simply delegate to
+	 * the handler for the SHUTDOWN-SENT state.
+	 */
+	return sctp_sf_shutdown_sent_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle an Error received in COOKIE_ECHOED state.
+ *
+ * Only handle the error type of stale COOKIE Error, the other errors will
+ * be ignored.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_err(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_errhdr_t *err;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ERROR chunk has a valid length.
+	 * The parameter walking depends on this as well.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Process the error here */
+	/* FUTURE FIXME:  When PR-SCTP related and other optional
+	 * parms are emitted, this will have to change to handle multiple
+	 * errors.
+	 */
+	sctp_walk_errors(err, chunk->chunk_hdr) {
+		if (SCTP_ERROR_STALE_COOKIE == err->cause)
+			return sctp_sf_do_5_2_6_stale(ep, asoc, type,
+							arg, commands);
+	}
+
+	/* It is possible to have malformed error causes, and that
+	 * will cause us to end the walk early.  However, since
+	 * we are discarding the packet, there should be no adverse
+	 * effects.
+	 */
+	return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle a Stale COOKIE Error
+ *
+ * Section: 5.2.6 Handle Stale COOKIE Error
+ * If the association is in the COOKIE-ECHOED state, the endpoint may elect
+ * one of the following three alternatives.
+ * ...
+ * 3) Send a new INIT chunk to the endpoint, adding a Cookie
+ *    Preservative parameter requesting an extension to the lifetime of
+ *    the State Cookie. When calculating the time extension, an
+ *    implementation SHOULD use the RTT information measured based on the
+ *    previous COOKIE ECHO / ERROR exchange, and should add no more
+ *    than 1 second beyond the measured RTT, due to long State Cookie
+ *    lifetimes making the endpoint more subject to a replay attack.
+ *
+ * Verification Tag:  Not explicit, but safe to ignore.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
+						 const struct sctp_association *asoc,
+						 const sctp_subtype_t type,
+						 void *arg,
+						 sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	time_t stale;
+	sctp_cookie_preserve_param_t bht;
+	sctp_errhdr_t *err;
+	struct sctp_chunk *reply;
+	struct sctp_bind_addr *bp;
+	int attempts = asoc->init_err_counter + 1;
+
+	if (attempts > asoc->max_init_attempts) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+				SCTP_PERR(SCTP_ERROR_STALE_COOKIE));
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	err = (sctp_errhdr_t *)(chunk->skb->data);
+
+	/* When calculating the time extension, an implementation
+	 * SHOULD use the RTT information measured based on the
+	 * previous COOKIE ECHO / ERROR exchange, and should add no
+	 * more than 1 second beyond the measured RTT, due to long
+	 * State Cookie lifetimes making the endpoint more subject to
+	 * a replay attack.
+	 * Measure of Staleness's unit is usec. (1/1000000 sec)
+	 * Suggested Cookie Life-span Increment's unit is msec.
+	 * (1/1000 sec)
+	 * In general, if you use the suggested cookie life, the value
+	 * found in the field of measure of staleness should be doubled
+	 * to give ample time to retransmit the new cookie and thus
+	 * yield a higher probability of success on the reattempt.
+	 */
+	stale = ntohl(*(__be32 *)((u8 *)err + sizeof(sctp_errhdr_t)));
+	stale = (stale * 2) / 1000;
+
+	bht.param_hdr.type = SCTP_PARAM_COOKIE_PRESERVATIVE;
+	bht.param_hdr.length = htons(sizeof(bht));
+	bht.lifespan_increment = htonl(stale);
+
+	/* Build that new INIT chunk; bp casts away const on the bind_addr.  */
+	bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
+	reply = sctp_make_init(asoc, bp, GFP_ATOMIC, sizeof(bht));
+	if (!reply)
+		goto nomem;
+
+	sctp_addto_chunk(reply, sizeof(bht), &bht);
+
+	/* Clear peer's init_tag cached in assoc as we are sending a new INIT */
+	sctp_add_cmd_sf(commands, SCTP_CMD_CLEAR_INIT_TAG, SCTP_NULL());
+
+	/* Stop pending T3-rtx and heartbeat timers */
+	sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());
+
+	/* Delete non-primary peer ip addresses since we are transitioning
+	 * back to the COOKIE-WAIT state
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL());
+
+	/* If we've sent any data bundled with COOKIE-ECHO we will need to
+	 * resend
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_T1_RETRAN,
+			SCTP_TRANSPORT(asoc->peer.primary_path));
+
+	/* asoc is const here, so the counter increment is queued
+	 * as a side-effect command rather than applied directly.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_COOKIE_WAIT));
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+	return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Process an ABORT.
+ *
+ * Section: 9.1
+ * After checking the Verification Tag, the receiving endpoint shall
+ * remove the association from its record, and shall report the
+ * termination to its upper layer.
+ *
+ * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules
+ * B) Rules for packet carrying ABORT:
+ *
+ *  - The endpoint shall always fill in the Verification Tag field of the
+ *    outbound packet with the destination endpoint's tag value if it
+ *    is known.
+ *
+ *  - If the ABORT is sent in response to an OOTB packet, the endpoint
+ *    MUST follow the procedure described in Section 8.4.
+ *
+ *  - The receiver MUST accept the packet if the Verification Tag
+ *    matches either its own tag, OR the tag of its peer. Otherwise, the
+ *    receiver MUST silently discard the packet and take no further
+ *    action.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	if (!sctp_vtag_verify_either(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ABORT chunk has a valid length.
+	 * Since this is an ABORT chunk, we have to discard it
+	 * because of the following text:
+	 * RFC 2960, Section 3.3.7
+	 *    If an endpoint receives an ABORT with a format error or for an
+	 *    association that doesn't exist, it MUST silently discard it.
+	 * Because the length is "invalid", we can't discard just the chunk,
+	 * as we do not know its true length.  So, to be safe, discard the
+	 * packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* ADD-IP: Special case for ABORT chunks
+	 * F4)  One special consideration is that ABORT Chunks arriving
+	 * destined to the IP address being deleted MUST be
+	 * ignored (see Section 5.3.1 for further details).
+	 */
+	if (SCTP_ADDR_DEL ==
+		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
+		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
+
+	return __sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands);
+}
+
+static sctp_disposition_t __sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	unsigned len;
+	__be16 error = SCTP_ERROR_NO_ERROR;
+
+	/* If a cause is present it must parse up to chunk_end, else discard. */
+	len = ntohs(chunk->chunk_hdr->length);
+	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) {
+
+		sctp_errhdr_t *err;
+		sctp_walk_errors(err, chunk->chunk_hdr);
+		if ((void *)err != (void *)chunk->chunk_end)
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+		error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
+	}
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET));
+	/* ASSOC_FAILED will DELETE_TCB. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, SCTP_PERR(error));
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+
+	return SCTP_DISPOSITION_ABORT;
+}
+
+/*
+ * Process an ABORT.  (COOKIE-WAIT state)
+ *
+ * See sctp_sf_do_9_1_abort() above.
+ */
+sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	unsigned len;
+	__be16 error = SCTP_ERROR_NO_ERROR;
+
+	if (!sctp_vtag_verify_either(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ABORT chunk has a valid length.
+	 * Since this is an ABORT chunk, we have to discard it
+	 * because of the following text:
+	 * RFC 2960, Section 3.3.7
+	 *    If an endpoint receives an ABORT with a format error or for an
+	 *    association that doesn't exist, it MUST silently discard it.
+	 * Because the length is "invalid", we can't discard just the chunk,
+	 * as we do not know its true length.  So, to be safe, discard the
+	 * packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* See if we have an error cause code in the chunk.  */
+	len = ntohs(chunk->chunk_hdr->length);
+	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
+		error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
+
+	return sctp_stop_t1_and_abort(commands, error, ECONNREFUSED, asoc,
+				      chunk->transport);
+}
+
+/*
+ * Treat an incoming ICMP as an ABORT; arg is the transport.  (COOKIE-WAIT)
+ */
+sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR,
+				      ENOPROTOOPT, asoc,
+				      (struct sctp_transport *)arg);
+}
+
+/*
+ * Process an ABORT.  (COOKIE-ECHOED state)
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
+					       const struct sctp_association *asoc,
+					       const sctp_subtype_t type,
+					       void *arg,
+					       sctp_cmd_seq_t *commands)
+{
+	/* There is a single T1 timer, so simply delegate to the
+	 * handler for the COOKIE-WAIT state.
+	 */
+	return sctp_sf_cookie_wait_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Stop the T1-init timer and abort the association with "INIT failed".
+ *
+ * This is common code called by several sctp_sf_*_abort() functions above.
+ */
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+					   __be16 error, int sk_err,
+					   const struct sctp_association *asoc,
+					   struct sctp_transport *transport)
+{
+	SCTP_DEBUG_PRINTK("ABORT received (INIT).\n");
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+	sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(sk_err));
+	/* CMD_INIT_FAILED will DELETE_TCB. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+			SCTP_PERR(error));
+	return SCTP_DISPOSITION_ABORT;
+}
+
+/*
+ * sctp_sf_do_9_2_shutdown
+ *
+ * Section: 9.2
+ * Upon the reception of the SHUTDOWN, the peer endpoint shall
+ *  - enter the SHUTDOWN-RECEIVED state,
+ *
+ *  - stop accepting new data from its SCTP user
+ *
+ *  - verify, by checking the Cumulative TSN Ack field of the chunk,
+ *    that all its outstanding DATA chunks have been received by the
+ *    SHUTDOWN sender.
+ *
+ * Once an endpoint has reached the SHUTDOWN-RECEIVED state it MUST NOT
+ * send a SHUTDOWN in response to a ULP request. And should discard
+ * subsequent SHUTDOWN chunks.
+ *
+ * If there are still outstanding DATA chunks left, the SHUTDOWN
+ * receiver shall continue to follow normal data transmission
+ * procedures defined in Section 6 until all outstanding DATA chunks
+ * are acknowledged; however, the SHUTDOWN receiver MUST NOT accept
+ * new data from its SCTP user.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_shutdownhdr_t *sdh;
+	sctp_disposition_t disposition;
+	struct sctp_ulpevent *ev;
+	__u32 ctsn;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the SHUTDOWN chunk has a valid length. */
+	if (!sctp_chunk_length_valid(chunk,
+				      sizeof(struct sctp_shutdown_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Convert the elaborate header.  */
+	sdh = (sctp_shutdownhdr_t *)chunk->skb->data;
+	skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t));
+	chunk->subh.shutdown_hdr = sdh;
+	ctsn = ntohl(sdh->cum_tsn_ack);
+
+	if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
+		SCTP_DEBUG_PRINTK("ctsn %x\n", ctsn);
+		SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point);
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* If the Cumulative TSN Ack is beyond the max TSN currently
+	 * sent, terminate the association and respond to the
+	 * sender with an ABORT.
+	 */
+	if (!TSN_lt(ctsn, asoc->next_tsn))
+		return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands);
+
+	/* API 5.3.1.5 SCTP_SHUTDOWN_EVENT
+	 * When a peer sends a SHUTDOWN, SCTP delivers this notification to
+	 * inform the application that it should cease sending data.
+	 */
+	ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC);
+	if (!ev) {
+		disposition = SCTP_DISPOSITION_NOMEM;
+		goto out;
+	}
+	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+	/* Upon the reception of the SHUTDOWN, the peer endpoint shall
+	 *  - enter the SHUTDOWN-RECEIVED state,
+	 *  - stop accepting new data from its SCTP user
+	 *
+	 * [This is implicit in the new state.]
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_SHUTDOWN_RECEIVED));
+	disposition = SCTP_DISPOSITION_CONSUME;
+
+	if (sctp_outq_is_empty(&asoc->outqueue)) {
+		disposition = sctp_sf_do_9_2_shutdown_ack(ep, asoc, type,
+							  arg, commands);
+	}
+
+	if (SCTP_DISPOSITION_NOMEM == disposition)
+		goto out;
+
+	/*  - verify, by checking the Cumulative TSN Ack field of the
+	 *    chunk, that all its outstanding DATA chunks have been
+	 *    received by the SHUTDOWN sender.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
+			SCTP_BE32(chunk->subh.shutdown_hdr->cum_tsn_ack));
+
+out:
+	return disposition;
+}
+
+/*
+ * sctp_sf_do_9_2_shut_ctsn
+ *
+ * Once an endpoint has reached the SHUTDOWN-RECEIVED state,
+ * it MUST NOT send a SHUTDOWN in response to a ULP request.
+ * The Cumulative TSN Ack of the received SHUTDOWN chunk
+ * MUST be processed.
+ */
+sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_shutdownhdr_t *sdh;
+	__u32 ctsn;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Make sure that the SHUTDOWN chunk has a valid length. */
+	if (!sctp_chunk_length_valid(chunk,
+				      sizeof(struct sctp_shutdown_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	sdh = (sctp_shutdownhdr_t *)chunk->skb->data;
+	ctsn = ntohl(sdh->cum_tsn_ack);
+
+	if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
+		SCTP_DEBUG_PRINTK("ctsn %x\n", ctsn);
+		SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point);
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* If the Cumulative TSN Ack is beyond the max TSN currently
+	 * sent, terminate the association and respond to the
+	 * sender with an ABORT.
+	 */
+	if (!TSN_lt(ctsn, asoc->next_tsn))
+		return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands);
+
+	/* verify, by checking the Cumulative TSN Ack field of the
+	 * chunk, that all its outstanding DATA chunks have been
+	 * received by the SHUTDOWN sender.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
+			SCTP_BE32(sdh->cum_tsn_ack));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* RFC 2960 9.2
+ * If an endpoint is in SHUTDOWN-ACK-SENT state and receives an INIT chunk
+ * (e.g., if the SHUTDOWN COMPLETE was lost) with source and destination
+ * transport addresses (either in the IP addresses or in the INIT chunk)
+ * that belong to this association, it should discard the INIT chunk and
+ * retransmit the SHUTDOWN ACK chunk.
+ */
+sctp_disposition_t sctp_sf_do_9_2_reshutack(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    void *arg,
+				    sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = (struct sctp_chunk *) arg;
+	struct sctp_chunk *reply;
+
+	/* Make sure that the chunk has a valid length */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Since we are not going to really process this INIT, there
+	 * is no point in verifying chunk boundaries.  Just generate
+	 * the SHUTDOWN ACK.
+	 */
+	reply = sctp_make_shutdown_ack(asoc, chunk);
+	if (NULL == reply)
+		goto nomem;
+
+	/* Set the transport for the SHUTDOWN ACK chunk and the timeout for
+	 * the T2-SHUTDOWN timer.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+	/* and restart the T2-shutdown timer. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+	return SCTP_DISPOSITION_CONSUME;
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * sctp_sf_do_ecn_cwr
+ *
+ * Section:  Appendix A: Explicit Congestion Notification
+ *
+ * CWR:
+ *
+ * RFC 2481 details a specific bit for a sender to send in the header of
+ * its next outbound TCP segment to indicate to its peer that it has
+ * reduced its congestion window.  This is termed the CWR bit.  For
+ * SCTP the same indication is made by including the CWR chunk.
+ * This chunk contains one data element, i.e. the TSN number that
+ * was sent in the ECNE chunk.  This element represents the lowest
+ * TSN number in the datagram that was originally marked with the
+ * CE bit.
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_ecn_cwr(const struct sctp_endpoint *ep,
+				      const struct sctp_association *asoc,
+				      const sctp_subtype_t type,
+				      void *arg,
+				      sctp_cmd_seq_t *commands)
+{
+	sctp_cwrhdr_t *cwr;
+	struct sctp_chunk *chunk = arg;
+	u32 lowest_tsn;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	/* NOTE(review): CWR is length-checked against sctp_ecne_chunk_t. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	cwr = (sctp_cwrhdr_t *) chunk->skb->data;
+	skb_pull(chunk->skb, sizeof(sctp_cwrhdr_t));
+
+	lowest_tsn = ntohl(cwr->lowest_tsn);
+
+	/* Does this CWR ack the last sent congestion notification? */
+	if (TSN_lte(asoc->last_ecne_tsn, lowest_tsn)) {
+		/* Stop sending ECNE. */
+		sctp_add_cmd_sf(commands,
+				SCTP_CMD_ECN_CWR,
+				SCTP_U32(lowest_tsn));
+	}
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_do_ecne
+ *
+ * Section:  Appendix A: Explicit Congestion Notification
+ *
+ * ECN-Echo
+ *
+ * RFC 2481 details a specific bit for a receiver to send back in its
+ * TCP acknowledgements to notify the sender of the Congestion
+ * Experienced (CE) bit having arrived from the network.  For SCTP this
+ * same indication is made by including the ECNE chunk.  This chunk
+ * contains one data element, i.e. the lowest TSN associated with the IP
+ * datagram marked with the CE bit.....
+ *
+ * Verification Tag: 8.5 Verification Tag [Normal verification]
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_ecne(const struct sctp_endpoint *ep,
+				   const struct sctp_association *asoc,
+				   const sctp_subtype_t type,
+				   void *arg,
+				   sctp_cmd_seq_t *commands)
+{
+	sctp_ecnehdr_t *ecne;
+	struct sctp_chunk *chunk = arg;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	ecne = (sctp_ecnehdr_t *) chunk->skb->data;
+	skb_pull(chunk->skb, sizeof(sctp_ecnehdr_t));
+
+	/* Hand the lowest TSN on unconditionally; no newer-than-CWR test here */
+	sctp_add_cmd_sf(commands, SCTP_CMD_ECN_ECNE,
+			SCTP_U32(ntohl(ecne->lowest_tsn)));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Section: 6.2  Acknowledgement on Reception of DATA Chunks
+ *
+ * The SCTP endpoint MUST always acknowledge the reception of each valid
+ * DATA chunk.
+ *
+ * The guidelines on delayed acknowledgement algorithm specified in
+ * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an
+ * acknowledgement SHOULD be generated for at least every second packet
+ * (not every second DATA chunk) received, and SHOULD be generated within
+ * 200 ms of the arrival of any unacknowledged DATA chunk. In some
+ * situations it may be beneficial for an SCTP transmitter to be more
+ * conservative than the algorithms detailed in this document allow.
+ * However, an SCTP transmitter MUST NOT be more aggressive than the
+ * following algorithms allow.
+ *
+ * A SCTP receiver MUST NOT generate more than one SACK for every
+ * incoming packet, other than to update the offered window as the
+ * receiving application consumes new data.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_arg_t sack_mode = SCTP_NOFORCE();
+
+	/* A chunk failing the verification tag test is reported and the
+	 * packet goes to the discard path.  We elect NOT to generate a
+	 * SACK in that case.
+	 */
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	switch (sctp_eat_data(asoc, chunk, commands)) {
+	case SCTP_IERROR_NO_ERROR:
+		/* Accepted: continue with the SACK bookkeeping below. */
+		break;
+
+	case SCTP_IERROR_HIGH_TSN:
+	case SCTP_IERROR_BAD_STREAM:
+		/* Discarded chunk.  The packet still counts toward SACK
+		 * generation, but the SACK is not forced.
+		 */
+		SCTP_INC_STATS(SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
+		if (chunk->end_of_packet)
+			sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK,
+					sack_mode);
+		return SCTP_DISPOSITION_DISCARD;
+
+	case SCTP_IERROR_DUP_TSN:
+	case SCTP_IERROR_IGNORE_TSN:
+		/* RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks
+		 *
+		 * A packet with duplicate DATA chunk(s) and no new DATA
+		 * MUST be answered with an immediate SACK; when duplicates
+		 * are bundled with new DATA an immediate SACK is a MAY.
+		 * We split that MAY on whether the last chunk of the
+		 * packet is a duplicate: if it is, the SACK is forced.
+		 */
+		SCTP_INC_STATS(SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
+		if (chunk->end_of_packet)
+			sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK,
+					SCTP_FORCE());
+		return SCTP_DISPOSITION_DISCARD;
+
+	case SCTP_IERROR_NO_DATA:
+		/* Nothing further to do for this chunk. */
+		return SCTP_DISPOSITION_CONSUME;
+
+	case SCTP_IERROR_PROTO_VIOLATION:
+		return sctp_sf_abort_violation(ep, asoc, chunk, commands,
+			(u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t));
+
+	default:
+		BUG();
+	}
+
+	/* The sender asked for an immediate SACK on this chunk. */
+	if (chunk->chunk_hdr->flags & SCTP_DATA_SACK_IMM)
+		sack_mode = SCTP_FORCE();
+
+	/* Received data restarts the autoclose timer when it is in use. */
+	if (asoc->autoclose)
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+	/* RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks
+	 *
+	 * SACK generation is counted per packet, not per DATA chunk
+	 * (an acknowledgement SHOULD be generated for at least every
+	 * second packet, and within 200 ms of any unacknowledged DATA
+	 * chunk), so the SACK command is only queued for the last chunk
+	 * of a packet.
+	 */
+	if (chunk->end_of_packet)
+		sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, sack_mode);
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_eat_data_fast_4_4
+ *
+ * Section: 4 (4)
+ * (4) In SHUTDOWN-SENT state the endpoint MUST acknowledge any received
+ *    DATA chunks without delay.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_data_fast_4_4(const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	switch (sctp_eat_data(asoc, chunk, commands)) {
+	case SCTP_IERROR_NO_ERROR:
+	case SCTP_IERROR_HIGH_TSN:
+	case SCTP_IERROR_DUP_TSN:
+	case SCTP_IERROR_IGNORE_TSN:
+	case SCTP_IERROR_BAD_STREAM:
+		/* All of these are acknowledged the same way below. */
+		break;
+
+	case SCTP_IERROR_NO_DATA:
+		return SCTP_DISPOSITION_CONSUME;
+
+	case SCTP_IERROR_PROTO_VIOLATION:
+		return sctp_sf_abort_violation(ep, asoc, chunk, commands,
+			(u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t));
+
+	default:
+		BUG();
+	}
+
+	/* The response is generated once per packet, on its last chunk. */
+	if (!chunk->end_of_packet)
+		return SCTP_DISPOSITION_CONSUME;
+
+	/* Implementor's Guide.
+	 *
+	 * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST
+	 * immediately respond to each received packet containing one or
+	 * more DATA chunk(s) with a SACK, a SHUTDOWN chunk, and restart
+	 * the T2-shutdown timer.
+	 *
+	 * The SHUTDOWN is queued as a command rather than built here
+	 * because the cumulative TSN has not been updated yet.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL());
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Section: 6.2  Processing a Received SACK
+ * D) Any time a SACK arrives, the endpoint performs the following:
+ *
+ *     i) If Cumulative TSN Ack is less than the Cumulative TSN Ack Point,
+ *     then drop the SACK.   Since Cumulative TSN Ack is monotonically
+ *     increasing, a SACK whose Cumulative TSN Ack is less than the
+ *     Cumulative TSN Ack Point indicates an out-of-order SACK.
+ *
+ *     ii) Set rwnd equal to the newly received a_rwnd minus the number
+ *     of bytes still outstanding after processing the Cumulative TSN Ack
+ *     and the Gap Ack Blocks.
+ *
+ *     iii) If the SACK is missing a TSN that was previously
+ *     acknowledged via a Gap Ack Block (e.g., the data receiver
+ *     reneged on the data), then mark the corresponding DATA chunk
+ *     as available for retransmit:  Mark it as missing for fast
+ *     retransmit as described in Section 7.2.4 and if no retransmit
+ *     timer is running for the destination address to which the DATA
+ *     chunk was originally transmitted, then T3-rtx is started for
+ *     that destination address.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_sack_6_2(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_sackhdr_t *sack_hdr;
+	__u32 cum_tsn;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* The chunk must be at least as long as a complete SACK chunk. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_sack_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Pull the SACK header out of the data buffer; a NULL result
+	 * means the SACK was bogus and the packet is discarded.
+	 */
+	sack_hdr = sctp_sm_pull_sack(chunk);
+	if (!sack_hdr)
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	chunk->subh.sack_hdr = sack_hdr;
+	cum_tsn = ntohl(sack_hdr->cum_tsn_ack);
+
+	/* i) A Cumulative TSN Ack below our Cumulative TSN Ack Point
+	 *    indicates an out-of-order SACK (the value is monotonically
+	 *    increasing), so the SACK is dropped.
+	 */
+	if (TSN_lt(cum_tsn, asoc->ctsn_ack_point)) {
+		SCTP_DEBUG_PRINTK("ctsn %x\n", cum_tsn);
+		SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point);
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* A Cumulative TSN Ack that is not below the next TSN we would
+	 * send acknowledges data never sent: treat it as a violation,
+	 * which terminates the association with an ABORT.
+	 */
+	if (!TSN_lt(cum_tsn, asoc->next_tsn))
+		return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands);
+
+	/* The remaining work happens in the PROCESS_SACK side effect. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK,
+			SCTP_SACKH(sack_hdr));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Generate an ABORT in response to a packet.
+ *
+ * Section: 8.4 Handle "Out of the blue" Packets, sctpimpguide 2.41
+ *
+ * 8) The receiver should respond to the sender of the OOTB packet with
+ *    an ABORT.  When sending the ABORT, the receiver of the OOTB packet
+ *    MUST fill in the Verification Tag field of the outbound packet
+ *    with the value found in the Verification Tag field of the OOTB
+ *    packet and set the T-bit in the Chunk Flags to indicate that the
+ *    Verification Tag is reflected.  After sending this ABORT, the
+ *    receiver of the OOTB packet shall discard the OOTB packet and take
+ *    no further action.
+ *
+ * Verification Tag:
+ *
+ * The return value is the disposition of the chunk.
+*/
+static sctp_disposition_t sctp_sf_tabort_8_4_8(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_packet *packet;
+	struct sctp_chunk *abort;
+
+	packet = sctp_ootb_pkt_new(asoc, chunk);
+	if (!packet)
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* Build the ABORT.  The T bit will be set if the asoc is NULL. */
+	abort = sctp_make_abort(asoc, chunk, 0);
+	if (!abort) {
+		sctp_ootb_pkt_free(packet);
+		return SCTP_DISPOSITION_NOMEM;
+	}
+
+	/* With the T bit set, the outbound packet reflects the
+	 * verification tag of the packet we received.
+	 */
+	if (sctp_test_T_bit(abort))
+		packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+	/* Charge the skb to the endpoint's sock for accounting. */
+	abort->skb->sk = ep->base.sk;
+
+	sctp_packet_append_chunk(packet, abort);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet));
+
+	SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+
+	/* Discard the rest of the OOTB packet; the disposition reported
+	 * to the caller is nevertheless CONSUME.
+	 */
+	sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Received an ERROR chunk from peer.  Generate SCTP_REMOTE_ERROR
+ * event as ULP notification for each cause included in the chunk.
+ *
+ * API 5.3.1.3 - SCTP_REMOTE_ERROR
+ *
+ * The return value is the disposition of the chunk.
+*/
+sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	sctp_errhdr_t *cause;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* The chunk must be at least as long as an ERROR chunk header. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Walk every error cause with an empty loop body.  If the walk
+	 * does not finish exactly at the end of the chunk, report a
+	 * parameter length violation at the cause where it stopped.
+	 */
+	sctp_walk_errors(cause, chunk->chunk_hdr);
+	if ((void *)cause != (void *)chunk->chunk_end)
+		return sctp_sf_violation_paramlen(ep, asoc, type, arg,
+						  (void *)cause, commands);
+
+	/* The notifications themselves are produced by the side effect. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, SCTP_CHUNK(chunk));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process an inbound SHUTDOWN ACK.
+ *
+ * From Section 9.2:
+ * Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall
+ * stop the T2-shutdown timer, send a SHUTDOWN COMPLETE chunk to its
+ * peer, and remove all record of the association.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_final(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_ulpevent *ev;
+	struct sctp_chunk *reply;
+
+	if (!sctp_vtag_verify(chunk, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* The SHUTDOWN_ACK must be at least a chunk header long. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Both allocations are done before any command is queued, so a
+	 * failed allocation leaves the command sequence untouched.
+	 */
+
+	/* 10.2 H) SHUTDOWN COMPLETE notification
+	 *
+	 * When SCTP completes the shutdown procedures (section 9.2) this
+	 * notification is passed to the upper layer.
+	 */
+	ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP,
+					     0, 0, 0, NULL, GFP_ATOMIC);
+	if (!ev)
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* The SHUTDOWN COMPLETE chunk that will be sent to the peer. */
+	reply = sctp_make_shutdown_complete(asoc, chunk);
+	if (!reply) {
+		sctp_ulpevent_free(ev);
+		return SCTP_DISPOSITION_NOMEM;
+	}
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
+
+	/* Stop the T2-shutdown timer and the T5 shutdown guard timer. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+	SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS);
+	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+
+	/* Send the SHUTDOWN COMPLETE ... */
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+	/* ... and remove all record of the association. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+	return SCTP_DISPOSITION_DELETE_TCB;
+}
+
+/*
+ * RFC 2960, 8.4 - Handle "Out of the blue" Packets, sctpimpguide 2.41.
+ *
+ * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should
+ *    respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE.
+ *    When sending the SHUTDOWN COMPLETE, the receiver of the OOTB
+ *    packet must fill in the Verification Tag field of the outbound
+ *    packet with the Verification Tag received in the SHUTDOWN ACK and
+ *    set the T-bit in the Chunk Flags to indicate that the Verification
+ *    Tag is reflected.
+ *
+ * 8) The receiver should respond to the sender of the OOTB packet with
+ *    an ABORT.  When sending the ABORT, the receiver of the OOTB packet
+ *    MUST fill in the Verification Tag field of the outbound packet
+ *    with the value found in the Verification Tag field of the OOTB
+ *    packet and set the T-bit in the Chunk Flags to indicate that the
+ *    Verification Tag is reflected.  After sending this ABORT, the
+ *    receiver of the OOTB packet shall discard the OOTB packet and take
+ *    no further action.
+ *
+ * Every chunk in the packet is visited once.  For each chunk the
+ * declared length is checked against the minimum chunk header size and
+ * against the end of the skb data BEFORE the chunk is inspected, so
+ * the ERROR-cause walk only ever runs over a chunk whose declared
+ * length fits inside the received data.  The packet is then handled
+ * as follows:
+ *   - it contains an ABORT:         discarded as soon as it is seen;
+ *   - it contains a SHUTDOWN ACK:   sctp_sf_shut_8_4_5();
+ *   - it contains a COOKIE ACK or a "Stale cookie" ERROR: discarded;
+ *   - anything else:                sctp_sf_tabort_8_4_8().
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_ootb(const struct sctp_endpoint *ep,
+				const struct sctp_association *asoc,
+				const sctp_subtype_t type,
+				void *arg,
+				sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sk_buff *skb = chunk->skb;
+	sctp_chunkhdr_t *ch;
+	sctp_errhdr_t *err;
+	__u8 *ch_end;
+	int ootb_shut_ack = 0;
+	int ootb_cookie_ack = 0;
+
+	SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
+
+	ch = (sctp_chunkhdr_t *) chunk->chunk_hdr;
+	do {
+		/* Report violation if the chunk is less than minimal */
+		if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t))
+			return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+		/* Report violation if chunk len overflows.  This has to
+		 * happen before the chunk contents are looked at:
+		 * sctp_walk_errors() below iterates according to the
+		 * declared chunk length, so an oversized length would
+		 * otherwise make it read beyond the end of the skb.
+		 */
+		ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length));
+		if (ch_end > skb_tail_pointer(skb))
+			return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+		/* Now that we know we at least have a chunk header,
+		 * do things that are type appropriate.
+		 */
+		if (SCTP_CID_SHUTDOWN_ACK == ch->type)
+			ootb_shut_ack = 1;
+
+		/* RFC 2960, Section 3.3.7
+		 *   Moreover, under any circumstances, an endpoint that
+		 *   receives an ABORT  MUST NOT respond to that ABORT by
+		 *   sending an ABORT of its own.
+		 */
+		if (SCTP_CID_ABORT == ch->type)
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+		/* RFC 8.4, 7) If the packet contains a "Stale cookie" ERROR
+		 * or a COOKIE ACK the SCTP Packet should be silently
+		 * discarded.
+		 */
+
+		if (SCTP_CID_COOKIE_ACK == ch->type)
+			ootb_cookie_ack = 1;
+
+		if (SCTP_CID_ERROR == ch->type) {
+			sctp_walk_errors(err, ch) {
+				if (SCTP_ERROR_STALE_COOKIE == err->cause) {
+					ootb_cookie_ack = 1;
+					break;
+				}
+			}
+		}
+
+		/* Advance to the next chunk in the packet. */
+		ch = (sctp_chunkhdr_t *) ch_end;
+	} while (ch_end < skb_tail_pointer(skb));
+
+	if (ootb_shut_ack)
+		return sctp_sf_shut_8_4_5(ep, asoc, type, arg, commands);
+	else if (ootb_cookie_ack)
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	else
+		return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle an "Out of the blue" SHUTDOWN ACK.
+ *
+ * Section: 8.4 5, sctpimpguide 2.41.
+ *
+ * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should
+ *    respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE.
+ *    When sending the SHUTDOWN COMPLETE, the receiver of the OOTB
+ *    packet must fill in the Verification Tag field of the outbound
+ *    packet with the Verification Tag received in the SHUTDOWN ACK and
+ *    set the T-bit in the Chunk Flags to indicate that the Verification
+ *    Tag is reflected.
+ *
+ * Inputs
+ * (endpoint, asoc, type, arg, commands)
+ *
+ * Outputs
+ * (sctp_disposition_t)
+ *
+ * The return value is the disposition of the chunk.
+ */
+static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
+					     const struct sctp_association *asoc,
+					     const sctp_subtype_t type,
+					     void *arg,
+					     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_packet *packet;
+	struct sctp_chunk *shut;
+
+	packet = sctp_ootb_pkt_new(asoc, chunk);
+	if (!packet)
+		return SCTP_DISPOSITION_NOMEM;
+
+	/* Build the SHUTDOWN_COMPLETE.
+	 * The T bit will be set if the asoc is NULL.
+	 */
+	shut = sctp_make_shutdown_complete(asoc, chunk);
+	if (!shut) {
+		sctp_ootb_pkt_free(packet);
+		return SCTP_DISPOSITION_NOMEM;
+	}
+
+	/* With the T bit set, the outbound packet reflects the
+	 * verification tag of the packet we received.
+	 */
+	if (sctp_test_T_bit(shut))
+		packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+	/* Charge the skb to the endpoint's sock for accounting. */
+	shut->skb->sk = ep->base.sk;
+
+	sctp_packet_append_chunk(packet, shut);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet));
+
+	SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+
+	/* If the chunk length is invalid, we don't want to process
+	 * the rest of the packet.
+	 */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Even for a well-formed chunk the rest of the packet is
+	 * discarded, to prevent potential bombing attacks from
+	 * additional bundled chunks.  This is documented in SCTP
+	 * Threats ID.
+	 */
+	return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Handle SHUTDOWN ACK in COOKIE_ECHOED or COOKIE_WAIT state.
+ *
+ * Verification Tag:  8.5.1 E) Rules for packet carrying a SHUTDOWN ACK
+ *   If the receiver is in COOKIE-ECHOED or COOKIE-WAIT state the
+ *   procedures in section 8.4 SHOULD be followed, in other words it
+ *   should be treated as an Out Of The Blue packet.
+ *   [This means that we do NOT check the Verification Tag on these
+ *   chunks. --piggy ]
+ *
+ */
+sctp_disposition_t sctp_sf_do_8_5_1_E_sa(const struct sctp_endpoint *ep,
+				      const struct sctp_association *asoc,
+				      const sctp_subtype_t type,
+				      void *arg,
+				      sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *shutdown_ack = arg;
+
+	/* The SHUTDOWN_ACK must be at least a chunk header long. */
+	if (!sctp_chunk_length_valid(shutdown_ack, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* No verification tag check is made here.  An association does
+	 * exist in this case, but it corresponds to a restarted
+	 * association, so the packet is counted and handled as OOTB:
+	 * the OOTB SHUTDOWN_ACK handler is invoked with a NULL
+	 * association in place of asoc.
+	 */
+	SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES);
+
+	return sctp_sf_shut_8_4_5(ep, NULL, type, arg, commands);
+}
+
+/* ADDIP Section 4.2 Upon reception of an ASCONF Chunk.
+ *
+ * State function for an inbound ASCONF chunk.  Once the verification
+ * tag, authentication, chunk length and parameter checks below have
+ * passed, the chunk's serial number is compared with
+ * asoc->peer.addip_serial + 1:
+ *   - equal:   the ASCONF is processed and the resulting ASCONF-ACK is
+ *              queued as the reply;
+ *   - smaller: a cached ASCONF-ACK for that serial, if one is found,
+ *              is queued again, otherwise the chunk is discarded;
+ *   - larger:  the chunk is discarded.
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_asconf(const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type, void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk	*chunk = arg;
+	struct sctp_chunk	*asconf_ack = NULL;
+	struct sctp_paramhdr	*err_param = NULL;
+	sctp_addiphdr_t		*hdr;
+	union sctp_addr_param	*addr_param;
+	__u32			serial;
+	int			length;
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	/* ADD-IP: Section 4.1.1
+	 * This chunk MUST be sent in an authenticated way by using
+	 * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
+	 * is received unauthenticated it MUST be silently discarded as
+	 * described in [I-D.ietf-tsvwg-sctp-auth].
+	 *
+	 * The check is skipped when sctp_addip_noauth is set.
+	 */
+	if (!sctp_addip_noauth && !chunk->auth)
+		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ASCONF ADDIP chunk has a valid length.  */
+	if (!sctp_chunk_length_valid(chunk, sizeof(sctp_addip_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	hdr = (sctp_addiphdr_t *)chunk->skb->data;
+	serial = ntohl(hdr->serial);
+
+	addr_param = (union sctp_addr_param *)hdr->params;
+	length = ntohs(addr_param->p.length);
+	if (length < sizeof(sctp_paramhdr_t))
+		return sctp_sf_violation_paramlen(ep, asoc, type, arg,
+			   (void *)addr_param, commands);
+
+	/* Verify the ASCONF chunk before processing it.  Verification
+	 * starts at the parameter following the leading address parameter
+	 * (addr_param + length) and runs up to the end of the chunk.
+	 */
+	if (!sctp_verify_asconf(asoc,
+			    (sctp_paramhdr_t *)((void *)addr_param + length),
+			    (void *)chunk->chunk_end,
+			    &err_param))
+		return sctp_sf_violation_paramlen(ep, asoc, type, arg,
+						  (void *)err_param, commands);
+
+	/* ADDIP 5.2 E1) Compare the value of the serial number to the value
+	 * the endpoint stored in a new association variable
+	 * 'Peer-Serial-Number'.
+	 */
+	if (serial == asoc->peer.addip_serial + 1) {
+		/* If this is the first instance of ASCONF in the packet,
+		 * we can clean our old ASCONF-ACKs.
+		 */
+		if (!chunk->has_asconf)
+			sctp_assoc_clean_asconf_ack_cache(asoc);
+
+		/* ADDIP 5.2 E4) When the Sequence Number matches the next one
+		 * expected, process the ASCONF as described below and after
+		 * processing the ASCONF Chunk, append an ASCONF-ACK Chunk to
+		 * the response packet and cache a copy of it (in the event it
+		 * later needs to be retransmitted).
+		 *
+		 * Essentially, do V1-V5.
+		 */
+		asconf_ack = sctp_process_asconf((struct sctp_association *)
+						 asoc, chunk);
+		if (!asconf_ack)
+			return SCTP_DISPOSITION_NOMEM;
+	} else if (serial < asoc->peer.addip_serial + 1) {
+		/* ADDIP 5.2 E2)
+		 * If the value found in the Sequence Number is less than the
+		 * ('Peer- Sequence-Number' + 1), simply skip to the next
+		 * ASCONF, and include in the outbound response packet
+		 * any previously cached ASCONF-ACK response that was
+		 * sent and saved that matches the Sequence Number of the
+		 * ASCONF.  Note: It is possible that no cached ASCONF-ACK
+		 * Chunk exists.  This will occur when an older ASCONF
+		 * arrives out of order.  In such a case, the receiver
+		 * should skip the ASCONF Chunk and not include ASCONF-ACK
+		 * Chunk for that chunk.
+		 *
+		 * Note that the lookup is keyed on hdr->serial as it
+		 * appears in the chunk, not on the host-order 'serial'.
+		 */
+		asconf_ack = sctp_assoc_lookup_asconf_ack(asoc, hdr->serial);
+		if (!asconf_ack)
+			return SCTP_DISPOSITION_DISCARD;
+
+		/* Reset the transport so that we select the correct one
+		 * this time around.  This is to make sure that we don't
+		 * accidentally use a stale transport that's been removed.
+		 */
+		asconf_ack->transport = NULL;
+	} else {
+		/* ADDIP 5.2 E5) Otherwise, the ASCONF Chunk is discarded since
+		 * it must be either a stale packet or from an attacker.
+		 */
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* ADDIP 5.2 E6)  The destination address of the SCTP packet
+	 * containing the ASCONF-ACK Chunks MUST be the source address of
+	 * the SCTP packet that held the ASCONF Chunks.
+	 *
+	 * To do this properly, we'll set the destination address of the chunk
+	 * and at the transmit time, will try to look up the transport to use.
+	 * Since ASCONFs may be bundled, the correct transport may not be
+	 * created until we process the entire packet, thus this workaround.
+	 */
+	asconf_ack->dest = chunk->source;
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * ADDIP Section 4.3 General rules for address manipulation
+ * When building TLV parameters for the ASCONF Chunk that will add or
+ * delete IP addresses the D0 to D13 rules should be applied:
+ *
+ * State function for an inbound ASCONF-ACK chunk.  After the
+ * verification tag, authentication, chunk length and parameter checks,
+ * the received serial number is compared with the serial of the
+ * outstanding ASCONF (asoc->addip_last_asconf), or with
+ * asoc->addip_serial - 1 when no ASCONF is outstanding:
+ *   - ack for a serial at or beyond the next one to be used while no
+ *     ASCONF is outstanding (rule D0): the association is aborted;
+ *   - ack matching the outstanding ASCONF: the T4-RTO timer is stopped
+ *     and the ack is processed; on success the next ASCONF is released,
+ *     on failure the association is aborted;
+ *   - anything else: the chunk is discarded.
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_asconf_ack(const struct sctp_endpoint *ep,
+					 const struct sctp_association *asoc,
+					 const sctp_subtype_t type, void *arg,
+					 sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk	*asconf_ack = arg;
+	struct sctp_chunk	*last_asconf = asoc->addip_last_asconf;
+	struct sctp_chunk	*abort;
+	struct sctp_paramhdr	*err_param = NULL;
+	sctp_addiphdr_t		*addip_hdr;
+	__u32			sent_serial, rcvd_serial;
+
+	if (!sctp_vtag_verify(asconf_ack, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	/* ADD-IP, Section 4.1.2:
+	 * This chunk MUST be sent in an authenticated way by using
+	 * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
+	 * is received unauthenticated it MUST be silently discarded as
+	 * described in [I-D.ietf-tsvwg-sctp-auth].
+	 *
+	 * The check is skipped when sctp_addip_noauth is set.
+	 */
+	if (!sctp_addip_noauth && !asconf_ack->auth)
+		return sctp_sf_discard_chunk(ep, asoc, type, arg, commands);
+
+	/* Make sure that the ADDIP chunk has a valid length.  */
+	if (!sctp_chunk_length_valid(asconf_ack, sizeof(sctp_addip_chunk_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	addip_hdr = (sctp_addiphdr_t *)asconf_ack->skb->data;
+	rcvd_serial = ntohl(addip_hdr->serial);
+
+	/* Verify the ASCONF-ACK chunk before processing it.  Unlike an
+	 * ASCONF, verification starts at the first parameter after the
+	 * ADDIP header.
+	 */
+	if (!sctp_verify_asconf(asoc,
+	    (sctp_paramhdr_t *)addip_hdr->params,
+	    (void *)asconf_ack->chunk_end,
+	    &err_param))
+		return sctp_sf_violation_paramlen(ep, asoc, type, arg,
+			   (void *)err_param, commands);
+
+	if (last_asconf) {
+		addip_hdr = (sctp_addiphdr_t *)last_asconf->subh.addip_hdr;
+		sent_serial = ntohl(addip_hdr->serial);
+	} else {
+		sent_serial = asoc->addip_serial - 1;
+	}
+
+	/* D0) If an endpoint receives an ASCONF-ACK that is greater than or
+	 * equal to the next serial number to be used but no ASCONF chunk is
+	 * outstanding the endpoint MUST ABORT the association. Note that a
+	 * sequence number is greater than if it is no more than 2^^31-1
+	 * larger than the current sequence number (using serial arithmetic).
+	 */
+	if (ADDIP_SERIAL_gte(rcvd_serial, sent_serial + 1) &&
+	    !(asoc->addip_last_asconf)) {
+		abort = sctp_make_abort(asoc, asconf_ack,
+					sizeof(sctp_errhdr_t));
+		if (abort) {
+			sctp_init_cause(abort, SCTP_ERROR_ASCONF_ACK, 0);
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(abort));
+		}
+		/* We are going to ABORT, so we might as well stop
+		 * processing the rest of the chunks in the packet.
+		 * The association is failed even if the ABORT chunk
+		 * itself could not be allocated.
+		 */
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+		sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL());
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ECONNABORTED));
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_ASCONF_ACK));
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_ABORT;
+	}
+
+	if ((rcvd_serial == sent_serial) && asoc->addip_last_asconf) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+
+		if (!sctp_process_asconf_ack((struct sctp_association *)asoc,
+					     asconf_ack)) {
+			/* Successfully processed ASCONF_ACK.  We can
+			 * release the next asconf if we have one.
+			 */
+			sctp_add_cmd_sf(commands, SCTP_CMD_SEND_NEXT_ASCONF,
+					SCTP_NULL());
+			return SCTP_DISPOSITION_CONSUME;
+		}
+
+		abort = sctp_make_abort(asoc, asconf_ack,
+					sizeof(sctp_errhdr_t));
+		if (abort) {
+			sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(abort));
+		}
+		/* We are going to ABORT, so we might as well stop
+		 * processing the rest of the chunks in the packet.
+		 * The ABORT sent to the peer carries SCTP_ERROR_RSRC_LOW,
+		 * while the local association failure is reported with
+		 * SCTP_ERROR_ASCONF_ACK.
+		 */
+		sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL());
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ECONNABORTED));
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_ASCONF_ACK));
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_ABORT;
+	}
+
+	return SCTP_DISPOSITION_DISCARD;
+}
+
+/*
+ * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
+ *
+ * When a FORWARD TSN chunk arrives, the data receiver MUST first update
+ * its cumulative TSN point to the value carried in the FORWARD TSN
+ * chunk, and then MUST further advance its cumulative TSN point locally
+ * if possible.
+ * After the above processing, the data receiver MUST stop reporting any
+ * missing TSNs earlier than or equal to the new cumulative TSN point.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type,
+				       void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_fwdtsn_hdr *hdr;
+	struct sctp_fwdtsn_skip *skip;
+	__u16 body_len;
+	__u32 new_ctsn;
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	/* The chunk must be at least a complete FORWARD_TSN chunk long. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Record the FORWARD TSN header, then step the skb over the
+	 * whole chunk body (chunk length minus the chunk header).
+	 */
+	hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data;
+	chunk->subh.fwdtsn_hdr = hdr;
+	body_len = ntohs(chunk->chunk_hdr->length) -
+		   sizeof(struct sctp_chunkhdr);
+	skb_pull(chunk->skb, body_len);
+
+	new_ctsn = ntohl(hdr->new_cum_tsn);
+	SCTP_DEBUG_PRINTK("%s: TSN 0x%x.\n", __func__, new_ctsn);
+
+	/* The TSN is too high--silently discard the chunk and count on it
+	 * getting retransmitted later.
+	 */
+	if (sctp_tsnmap_check(&asoc->peer.tsn_map, new_ctsn) < 0)
+		return SCTP_DISPOSITION_DISCARD;
+
+	/* Silently discard the chunk if any skipped stream-id is not
+	 * below the negotiated number of inbound streams.
+	 */
+	sctp_walk_fwdtsn(skip, chunk) {
+		if (ntohs(skip->stream) >= asoc->c.sinit_max_instreams)
+			return SCTP_DISPOSITION_DISCARD;
+	}
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN,
+			SCTP_U32(new_ctsn));
+
+	/* Per-stream processing is only queued when the chunk carries
+	 * something beyond the fixed FORWARD TSN header.
+	 */
+	if (body_len > sizeof(struct sctp_fwdtsn_hdr))
+		sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
+				SCTP_CHUNK(chunk));
+
+	/* Count this as receiving DATA. */
+	if (asoc->autoclose)
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+	/* FIXME: For now send a SACK, but DATA processing may
+	 * send another.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * FORWARD TSN handler that always answers with a SHUTDOWN and a forced
+ * SACK and restarts the T2-shutdown timer, whether or not the FORWARD
+ * TSN contents were usable.  Bad verification tags and short chunks are
+ * rejected before any of that happens.
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_fwd_tsn_fast(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+	struct sctp_fwdtsn_hdr *hdr;
+	struct sctp_fwdtsn_skip *entry;
+	__u16 len;
+	__u32 new_ctsn;
+	int usable;
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	/* The chunk must be at least as long as a bare FORWARD TSN. */
+	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Note the header position, then pull the chunk body off the skb. */
+	hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data;
+	chunk->subh.fwdtsn_hdr = hdr;
+	len = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_chunkhdr);
+	skb_pull(chunk->skb, len);
+
+	new_ctsn = ntohl(hdr->new_cum_tsn);
+	SCTP_DEBUG_PRINTK("%s: TSN 0x%x.\n", __func__, new_ctsn);
+
+	/* The FORWARD TSN is only acted upon when the TSN map accepts the
+	 * new cumulative TSN and every skipped stream id is within our
+	 * inbound stream limit.  Otherwise its contents are ignored, but
+	 * the SHUTDOWN/SACK reply below is still sent.
+	 */
+	usable = sctp_tsnmap_check(&asoc->peer.tsn_map, new_ctsn) >= 0;
+	if (usable) {
+		sctp_walk_fwdtsn(entry, chunk) {
+			if (ntohs(entry->stream) >=
+			    asoc->c.sinit_max_instreams) {
+				usable = 0;
+				break;
+			}
+		}
+	}
+
+	if (usable) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN,
+				SCTP_U32(new_ctsn));
+		if (len > sizeof(struct sctp_fwdtsn_hdr))
+			sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
+					SCTP_CHUNK(chunk));
+	}
+
+	/* Implementor's Guide.
+	 *
+	 * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately
+	 * respond to each received packet containing one or more DATA chunk(s)
+	 * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer
+	 *
+	 * The SACK is forced because we are shutting down.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL());
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * SCTP-AUTH Section 6.3 Receiving authenticated chunks
+ *
+ *    The receiver MUST use the HMAC algorithm indicated in the HMAC
+ *    Identifier field.  If this algorithm was not specified by the
+ *    receiver in the HMAC-ALGO parameter in the INIT or INIT-ACK chunk
+ *    during association setup, the AUTH chunk and all chunks after it MUST
+ *    be discarded and an ERROR chunk SHOULD be sent with the error cause
+ *    defined in Section 4.1.
+ *
+ *    If an endpoint with no shared key receives a Shared Key Identifier
+ *    other than 0, it MUST silently discard all authenticated chunks.  If
+ *    the endpoint has at least one endpoint pair shared key for the peer,
+ *    it MUST use the key specified by the Shared Key Identifier if a
+ *    key has been configured for that Shared Key Identifier.  If no
+ *    endpoint pair shared key has been configured for that Shared Key
+ *    Identifier, all authenticated chunks MUST be silently discarded.
+ *
+ * Verification Tag:  8.5 Verification Tag [Normal verification]
+ *
+ * The return value is the disposition of the chunk.
+ */
+static sctp_ierror_t sctp_sf_authenticate(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    struct sctp_chunk *chunk)
+{
+	struct sctp_authhdr *hdr = (struct sctp_authhdr *)chunk->skb->data;
+	struct sctp_hmac *alg;
+	unsigned int mac_len;
+	__u16 shkey;
+	__u8 *received;
+	__u8 *mac;
+	int mismatch;
+
+	/* Record the AUTH header on the chunk and pull it off the skb so
+	 * the remaining checks can use it.
+	 */
+	chunk->subh.auth_hdr = hdr;
+	skb_pull(chunk->skb, sizeof(struct sctp_authhdr));
+
+	/* The HMAC algorithm named by the peer must be one that is
+	 * acceptable for this association.
+	 */
+	if (!sctp_auth_asoc_verify_hmac_id(asoc, hdr->hmac_id))
+		return SCTP_IERROR_AUTH_BAD_HMAC;
+
+	/* The shared key id must be the active key or another key that
+	 * has been configured.
+	 */
+	shkey = ntohs(hdr->shkey_id);
+	if (shkey != asoc->active_key_id && !sctp_auth_get_shkey(asoc, shkey))
+		return SCTP_IERROR_AUTH_BAD_KEYID;
+
+	/* Whatever follows the fixed AUTH chunk is the signature; its
+	 * size must equal the digest size of the selected HMAC.
+	 */
+	mac_len = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_auth_chunk_t);
+	alg = sctp_auth_get_hmac(ntohs(hdr->hmac_id));
+	if (mac_len != alg->hmac_len)
+		return SCTP_IERROR_PROTO_VIOLATION;
+
+	/* Verification proceeds as follows:
+	 *  1. keep a copy of the digest carried in the chunk,
+	 *  2. zero the digest inside the chunk,
+	 *  3. recompute the digest in place,
+	 *  4. compare the copy against the recomputed value.
+	 */
+	mac = hdr->hmac;
+	skb_pull(chunk->skb, mac_len);
+
+	received = kmemdup(mac, mac_len, GFP_ATOMIC);
+	if (!received)
+		return SCTP_IERROR_NOMEM;
+
+	memset(mac, 0, mac_len);
+
+	sctp_auth_calculate_hmac(asoc, chunk->skb,
+				(struct sctp_auth_chunk *)chunk->chunk_hdr,
+				GFP_ATOMIC);
+
+	/* The saved copy is no longer needed once compared. */
+	mismatch = memcmp(received, mac, mac_len);
+	kfree(received);
+
+	if (mismatch)
+		return SCTP_IERROR_BAD_SIG;
+
+	chunk->auth = 1;
+
+	return SCTP_IERROR_NO_ERROR;
+}
+
+/*
+ * Process a received AUTH chunk.
+ *
+ * If the peer is not AUTH capable the chunk is handled as an unknown
+ * chunk.  Otherwise the verification tag and minimum length are checked
+ * and the chunk is handed to sctp_sf_authenticate().  Its result is
+ * mapped as follows:
+ *
+ *   SCTP_IERROR_AUTH_BAD_HMAC   - reply with an ERROR chunk (unsupported
+ *                                 HMAC) if one can be built, then discard
+ *                                 the packet
+ *   SCTP_IERROR_AUTH_BAD_KEYID,
+ *   SCTP_IERROR_BAD_SIG         - discard the packet
+ *   SCTP_IERROR_PROTO_VIOLATION - treat as a chunk length violation
+ *   SCTP_IERROR_NOMEM           - SCTP_DISPOSITION_NOMEM
+ *
+ * On success, if the chunk was signed with a key other than the active
+ * one, an SCTP_AUTH_NEWKEY event is queued for the upper layer.
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_eat_auth(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    void *arg,
+				    sctp_cmd_seq_t *commands)
+{
+	struct sctp_authhdr *auth_hdr;
+	struct sctp_chunk *chunk = arg;
+	struct sctp_chunk *err_chunk;
+	sctp_ierror_t error;
+
+	/* Make sure that the peer is AUTH capable */
+	if (!asoc->peer.auth_capable)
+		return sctp_sf_unk_chunk(ep, asoc, type, arg, commands);
+
+	if (!sctp_vtag_verify(chunk, asoc)) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
+				SCTP_NULL());
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	}
+
+	/* Make sure that the AUTH chunk has valid length.  */
+	if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_auth_chunk)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* Grab the header pointer before sctp_sf_authenticate() pulls the
+	 * header off the skb.
+	 */
+	auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
+	error = sctp_sf_authenticate(ep, asoc, type, chunk);
+	switch (error) {
+	case SCTP_IERROR_AUTH_BAD_HMAC:
+		/* Generate the ERROR chunk and discard the rest
+		 * of the packet
+		 */
+		err_chunk = sctp_make_op_error(asoc, chunk,
+					       SCTP_ERROR_UNSUP_HMAC,
+					       &auth_hdr->hmac_id,
+					       sizeof(__u16), 0);
+		if (err_chunk) {
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(err_chunk));
+		}
+		/* Fall Through */
+	case SCTP_IERROR_AUTH_BAD_KEYID:
+	case SCTP_IERROR_BAD_SIG:
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+	case SCTP_IERROR_PROTO_VIOLATION:
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+	case SCTP_IERROR_NOMEM:
+		return SCTP_DISPOSITION_NOMEM;
+	default:
+		break;
+	}
+
+	if (asoc->active_key_id != ntohs(auth_hdr->shkey_id)) {
+		struct sctp_ulpevent *ev;
+
+		ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id),
+				    SCTP_AUTH_NEWKEY, GFP_ATOMIC);
+
+		/* This function returns a disposition, not an errno:
+		 * report allocation failure the same way as the
+		 * SCTP_IERROR_NOMEM case above.
+		 */
+		if (!ev)
+			return SCTP_DISPOSITION_NOMEM;
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(ev));
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process an unknown chunk.
+ *
+ * Section: 3.2. Also, 2.1 in the implementor's guide.
+ *
+ * Chunk Types are encoded such that the highest-order two bits specify
+ * the action that must be taken if the processing endpoint does not
+ * recognize the Chunk Type.
+ *
+ * 00 - Stop processing this SCTP packet and discard it, do not process
+ *      any further chunks within it.
+ *
+ * 01 - Stop processing this SCTP packet and discard it, do not process
+ *      any further chunks within it, and report the unrecognized
+ *      chunk in an 'Unrecognized Chunk Type'.
+ *
+ * 10 - Skip this chunk and continue processing.
+ *
+ * 11 - Skip this chunk and continue processing, but report in an ERROR
+ *      Chunk using the 'Unrecognized Chunk Type' cause of error.
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_unk_chunk(const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *unknown = arg;
+	struct sctp_chunk *reply;
+	sctp_chunkhdr_t *unk_hdr;
+	int action;
+	int report;
+
+	SCTP_DEBUG_PRINTK("Processing the unknown chunk id %d.\n", type.chunk);
+
+	if (!sctp_vtag_verify(unknown, asoc))
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* The chunk type is unknown, so the only length we can insist
+	 * on is that of the generic chunk header.
+	 */
+	if (!sctp_chunk_length_valid(unknown, sizeof(sctp_chunkhdr_t)))
+		return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+						  commands);
+
+	/* The action bits of the chunk type select the handling. */
+	action = type.chunk & SCTP_CID_ACTION_MASK;
+	report = (action == SCTP_CID_ACTION_DISCARD_ERR ||
+		  action == SCTP_CID_ACTION_SKIP_ERR);
+
+	/* Both "...and report" actions answer with an ERROR chunk that
+	 * carries the offending chunk, when one can be built.
+	 */
+	if (report) {
+		unk_hdr = unknown->chunk_hdr;
+		reply = sctp_make_op_error(asoc, unknown,
+					   SCTP_ERROR_UNKNOWN_CHUNK, unk_hdr,
+					   WORD_ROUND(ntohs(unk_hdr->length)),
+					   0);
+		if (reply)
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(reply));
+	}
+
+	/* Both "discard" actions drop the whole packet. */
+	if (action == SCTP_CID_ACTION_DISCARD ||
+	    action == SCTP_CID_ACTION_DISCARD_ERR)
+		return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+	/* Only this chunk is skipped; it is consumed when an error was
+	 * reported and discarded otherwise.
+	 */
+	return report ? SCTP_DISPOSITION_CONSUME : SCTP_DISPOSITION_DISCARD;
+}
+
+/*
+ * Discard the chunk.
+ *
+ * Section: 0.2, 5.2.3, 5.2.5, 5.2.6, 6.0, 8.4.6, 8.5.1c, 9.2
+ * [Too numerous to mention...]
+ * Verification Tag: No verification needed.
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_discard_chunk(const struct sctp_endpoint *ep,
+					 const struct sctp_association *asoc,
+					 const sctp_subtype_t type,
+					 void *arg,
+					 sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	/* The chunk type is not examined here, so the generic chunk
+	 * header is the only length we can check against.  A chunk that
+	 * passes is simply dropped.
+	 */
+	if (sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) {
+		SCTP_DEBUG_PRINTK("Chunk %d is discarded\n", type.chunk);
+		return SCTP_DISPOSITION_DISCARD;
+	}
+
+	/* Too short even for a chunk header: protocol violation. */
+	return sctp_sf_violation_chunklen(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Discard the whole packet.
+ *
+ * Section: 8.4 2)
+ *
+ * 2) If the OOTB packet contains an ABORT chunk, the receiver MUST
+ *    silently discard the OOTB packet and take no further action.
+ *
+ * Verification Tag: No verification necessary
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_pdiscard(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    void *arg,
+				    sctp_cmd_seq_t *commands)
+{
+	/* Queue the command that drops the whole packet... */
+	sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
+
+	/* ...and count it as a discarded inbound packet. */
+	SCTP_INC_STATS(SCTP_MIB_IN_PKT_DISCARDS);
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+
+/*
+ * The other end is violating protocol.
+ *
+ * Section: Not specified
+ * Verification Tag: Not specified
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (asoc, reply_msg, msg_up, timers, counters)
+ *
+ * We simply tag the chunk as a violation.  The state machine will log
+ * the violation and continue.
+ */
+sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	/* A chunk that at least holds a full chunk header is tagged
+	 * as a plain violation.
+	 */
+	if (sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t)))
+		return SCTP_DISPOSITION_VIOLATION;
+
+	/* Anything shorter is handled as a chunk length violation. */
+	return sctp_sf_violation_chunklen(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Common function to handle a protocol violation.
+ *
+ * Builds an ABORT chunk carrying the supplied payload and either queues
+ * it as a reply on the association (failing the association or its
+ * setup) or, when there is no association, sends it in an out-of-the-blue
+ * packet.  In both cases the offending packet is discarded.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk, payload, paylen)
+ *   payload/paylen - bytes handed to sctp_make_abort_violation(); the
+ *                    callers in this file pass an explanatory string.
+ *
+ * Returns SCTP_DISPOSITION_ABORT, or SCTP_DISPOSITION_NOMEM when the
+ * ABORT chunk or the out-of-the-blue packet cannot be allocated.
+ */
+static sctp_disposition_t sctp_sf_abort_violation(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     void *arg,
+				     sctp_cmd_seq_t *commands,
+				     const __u8 *payload,
+				     const size_t paylen)
+{
+	struct sctp_packet *packet = NULL;
+	struct sctp_chunk *chunk =  arg;
+	struct sctp_chunk *abort = NULL;
+
+	/* SCTP-AUTH, Section 6.3:
+	 *    It should be noted that if the receiver wants to tear
+	 *    down an association in an authenticated way only, the
+	 *    handling of malformed packets should not result in
+	 *    tearing down the association.
+	 *
+	 * This means that if we only want to abort associations
+	 * in an authenticated way (i.e AUTH+ABORT), then we
+	 * can't destroy this association just because the packet
+	 * was malformed.
+	 */
+	if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
+		goto discard;
+
+	/* Make the abort chunk. */
+	abort = sctp_make_abort_violation(asoc, chunk, payload, paylen);
+	if (!abort)
+		goto nomem;
+
+	if (asoc) {
+		/* Treat INIT-ACK as a special case during COOKIE-WAIT.
+		 * No peer init tag is recorded yet: take it from the
+		 * INIT-ACK if the chunk is long enough to hold one,
+		 * otherwise set the T bit on the ABORT.
+		 */
+		if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK &&
+		    !asoc->peer.i.init_tag) {
+			sctp_initack_chunk_t *initack;
+
+			initack = (sctp_initack_chunk_t *)chunk->chunk_hdr;
+			if (!sctp_chunk_length_valid(chunk,
+						     sizeof(sctp_initack_chunk_t)))
+				abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T;
+			else {
+				unsigned int inittag;
+
+				inittag = ntohl(initack->init_hdr.init_tag);
+				sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_INITTAG,
+						SCTP_U32(inittag));
+			}
+		}
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+
+		/* Up to and including COOKIE-ECHOED the association setup
+		 * fails (ECONNREFUSED); in later states the association
+		 * itself fails (ECONNABORTED).
+		 */
+		if (asoc->state <= SCTP_STATE_COOKIE_ECHOED) {
+			sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+					SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+			sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+					SCTP_ERROR(ECONNREFUSED));
+			sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+					SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
+		} else {
+			sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+					SCTP_ERROR(ECONNABORTED));
+			sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+					SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
+			SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		}
+	} else {
+		/* No association: send the ABORT in a packet of its own. */
+		packet = sctp_ootb_pkt_new(asoc, chunk);
+
+		if (!packet)
+			goto nomem_pkt;
+
+		/* With the T bit set, the ABORT reuses the verification
+		 * tag of the packet that triggered it.
+		 */
+		if (sctp_test_T_bit(abort))
+			packet->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+		abort->skb->sk = ep->base.sk;
+
+		sctp_packet_append_chunk(packet, abort);
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
+			SCTP_PACKET(packet));
+
+		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+	}
+
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+
+discard:
+	/* Drop the offending packet; ABORT is reported even when the
+	 * AUTH check above skipped building the ABORT chunk.
+	 */
+	sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
+	return SCTP_DISPOSITION_ABORT;
+
+nomem_pkt:
+	/* The ABORT chunk was never queued, so release it here. */
+	sctp_chunk_free(abort);
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Handle a protocol violation when the chunk length is invalid.
+ * "Invalid" length is identified as smaller than the minimal length a
+ * given chunk can be.  For example, a SACK chunk has invalid length
+ * if its length is set to be smaller than the size of sctp_sack_chunk_t.
+ *
+ * We inform the other end by sending an ABORT with a Protocol Violation
+ * error code.
+ *
+ * Section: Not specified
+ * Verification Tag:  Nothing to do
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * Outputs
+ * (reply_msg, msg_up, counters)
+ *
+ * Generate an  ABORT chunk and terminate the association.
+ */
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	/* Explanation carried in the ABORT; the terminating NUL is
+	 * included in the length passed on.
+	 */
+	static const char reason[] = "The following chunk had invalid length:";
+	const size_t reason_len = sizeof(reason);
+
+	return sctp_sf_abort_violation(ep, asoc, arg, commands, reason,
+				       reason_len);
+}
+
+/*
+ * Protocol violation handler for a parameter with an invalid length:
+ * either shorter than the minimum for that parameter, or long enough
+ * that the accumulated parameter lengths run past the end of the chunk.
+ *
+ * Unless ABORT chunks must be authenticated for this association, an
+ * ABORT describing the bad parameter is queued and the association is
+ * failed.  The packet is discarded either way.
+ *
+ * Returns SCTP_DISPOSITION_ABORT, or SCTP_DISPOSITION_NOMEM when the
+ * ABORT chunk cannot be allocated.
+ */
+static sctp_disposition_t sctp_sf_violation_paramlen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg, void *ext,
+				     sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *bad_chunk = arg;
+	struct sctp_paramhdr *bad_param = ext;
+	struct sctp_chunk *reply;
+
+	if (!sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) {
+		/* Build the ABORT that names the offending parameter. */
+		reply = sctp_make_violation_paramlen(asoc, bad_chunk,
+						     bad_param);
+		if (!reply)
+			return SCTP_DISPOSITION_NOMEM;
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+
+		/* Fail the association with a protocol violation. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ECONNABORTED));
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+	}
+
+	/* The packet itself is dropped in every non-NOMEM case. */
+	sctp_sf_pdiscard(ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
+	return SCTP_DISPOSITION_ABORT;
+}
+
+/* Protocol violation handler for a peer that tries to advance the
+ * cumulative TSN ack beyond the highest TSN we have sent so far.
+ *
+ * The peer is told via an ABORT carrying a Protocol Violation error
+ * code; the work is done by sctp_sf_abort_violation().
+ */
+static sctp_disposition_t sctp_sf_violation_ctsn(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	/* Explanation carried in the ABORT, terminating NUL included. */
+	static const char reason[] = "The cumulative tsn ack beyond the max tsn currently sent:";
+	const size_t reason_len = sizeof(reason);
+
+	return sctp_sf_abort_violation(ep, asoc, arg, commands, reason,
+				       reason_len);
+}
+
+/* Protocol violation handler for invalid chunk bundling.  For example,
+ * when we have an association and receive a bundled INIT-ACK or
+ * SHUTDOWN-COMPLETE, the peer is clearly violating the "MUST NOT bundle"
+ * statement from the specs.  There might also be an attacker on the
+ * path, in which case we may not want to continue this communication.
+ *
+ * Without an association the chunk is handed to sctp_sf_violation();
+ * with one, the association is aborted.
+ */
+static sctp_disposition_t sctp_sf_violation_chunk(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands)
+{
+	/* Explanation carried in the ABORT, terminating NUL included. */
+	static const char reason[] = "The following chunk violates protocol:";
+
+	if (asoc)
+		return sctp_sf_abort_violation(ep, asoc, arg, commands,
+					       reason, sizeof(reason));
+
+	return sctp_sf_violation(ep, asoc, type, arg, commands);
+}
+/***************************************************************************
+ * These are the state functions for handling primitive (Section 10) events.
+ ***************************************************************************/
+/*
+ * sctp_sf_do_prm_asoc
+ *
+ * Section: 10.1 ULP-to-SCTP
+ * B) Associate
+ *
+ * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
+ * outbound stream count)
+ * -> association id [,destination transport addr list] [,outbound stream
+ * count]
+ *
+ * This primitive allows the upper layer to initiate an association to a
+ * specific peer endpoint.
+ *
+ * The peer endpoint shall be specified by one of the transport addresses
+ * which defines the endpoint (see Section 1.4).  If the local SCTP
+ * instance has not been initialized, the ASSOCIATE is considered an
+ * error.
+ * [This is not relevant for the kernel implementation since we do all
+ * initialization at boot time.  If we hadn't initialized we wouldn't
+ * get anywhere near this code.]
+ *
+ * An association id, which is a local handle to the SCTP association,
+ * will be returned on successful establishment of the association. If
+ * SCTP is not able to open an SCTP association with the peer endpoint,
+ * an error is returned.
+ * [In the kernel implementation, the struct sctp_association needs to
+ * be created BEFORE causing this primitive to run.]
+ *
+ * Other association parameters may be returned, including the
+ * complete destination transport addresses of the peer as well as the
+ * outbound stream count of the local endpoint. One of the transport
+ * address from the returned destination addresses will be selected by
+ * the local endpoint as default primary path for sending SCTP packets
+ * to this peer.  The returned "destination transport addr list" can
+ * be used by the ULP to change the default primary path or to force
+ * sending a packet to a specific transport address.  [All of this
+ * stuff happens when the INIT ACK arrives.  This is a NON-BLOCKING
+ * function.]
+ *
+ * Mandatory attributes:
+ *
+ * o local SCTP instance name - obtained from the INITIALIZE operation.
+ *   [This is the argument asoc.]
+ * o destination transport addr - specified as one of the transport
+ * addresses of the peer endpoint with which the association is to be
+ * established.
+ *  [This is asoc->peer.active_path.]
+ * o outbound stream count - the number of outbound streams the ULP
+ * would like to open towards this peer endpoint.
+ * [BUG: This is not currently implemented.]
+ * Optional attributes:
+ *
+ * None.
+ *
+ * The return value is a disposition.
+ */
+sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type,
+				       void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	/* The const is cast away only so the association can be passed
+	 * back through the command sequence as a side effect.
+	 */
+	struct sctp_association *mutable_asoc =
+		(struct sctp_association *)asoc;
+	struct sctp_chunk *init;
+
+	/* The RFC text below has us enter COOKIE-WAIT after sending the
+	 * INIT, but in this implementation the state change has to be
+	 * queued first.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_COOKIE_WAIT));
+
+	/* RFC 2960 5.1 Normal Establishment of an Association
+	 *
+	 * A) "A" first sends an INIT chunk to "Z".  In the INIT, "A"
+	 * must provide its Verification Tag (Tag_A) in the Initiate
+	 * Tag field.  Tag_A SHOULD be a random number in the range of
+	 * 1 to 4294967295 (see 5.3.1 for Tag value selection). ...
+	 */
+	init = sctp_make_init(asoc, &asoc->base.bind_addr, GFP_ATOMIC, 0);
+	if (!init)
+		return SCTP_DISPOSITION_NOMEM;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(mutable_asoc));
+
+	/* Pick the transport the INIT will go out on. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+			SCTP_CHUNK(init));
+
+	/* After sending the INIT, "A" starts the T1-init timer and
+	 * enters the COOKIE-WAIT state.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(init));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process the SEND primitive.
+ *
+ * Section: 10.1 ULP-to-SCTP
+ * E) Send
+ *
+ * Format: SEND(association id, buffer address, byte count [,context]
+ *         [,stream id] [,life time] [,destination transport address]
+ *         [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
+ * -> result
+ *
+ * This is the main method to send user data via SCTP.
+ *
+ * Mandatory attributes:
+ *
+ *  o association id - local handle to the SCTP association
+ *
+ *  o buffer address - the location where the user message to be
+ *    transmitted is stored;
+ *
+ *  o byte count - The size of the user data in number of bytes;
+ *
+ * Optional attributes:
+ *
+ *  o context - an optional 32 bit integer that will be carried in the
+ *    sending failure notification to the ULP if the transportation of
+ *    this User Message fails.
+ *
+ *  o stream id - to indicate which stream to send the data on. If not
+ *    specified, stream 0 will be used.
+ *
+ *  o life time - specifies the life time of the user data. The user data
+ *    will not be sent by SCTP after the life time expires. This
+ *    parameter can be used to avoid efforts to transmit stale
+ *    user messages. SCTP notifies the ULP if the data cannot be
+ *    initiated to transport (i.e. sent to the destination via SCTP's
+ *    send primitive) within the life time variable. However, the
+ *    user data will be transmitted if SCTP has attempted to transmit a
+ *    chunk before the life time expired.
+ *
+ *  o destination transport address - specified as one of the destination
+ *    transport addresses of the peer endpoint to which this packet
+ *    should be sent. Whenever possible, SCTP should use this destination
+ *    transport address for sending the packets, instead of the current
+ *    primary path.
+ *
+ *  o unorder flag - this flag, if present, indicates that the user
+ *    would like the data delivered in an unordered fashion to the peer
+ *    (i.e., the U flag is set to 1 on all DATA chunks carrying this
+ *    message).
+ *
+ *  o no-bundle flag - instructs SCTP not to bundle this user data with
+ *    other outbound DATA chunks. SCTP MAY still bundle even when
+ *    this flag is present, when faced with network congestion.
+ *
+ *  o payload protocol-id - A 32 bit unsigned integer that is to be
+ *    passed to the peer indicating the type of payload protocol data
+ *    being transmitted. This value is passed as opaque data by SCTP.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_prm_send(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type,
+				       void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	/* The primitive's argument is the data message to transmit. */
+	struct sctp_datamsg *datamsg = arg;
+
+	/* Sending is just a matter of queueing the message. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_MSG,
+			SCTP_DATAMSG(datamsg));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Process the SHUTDOWN primitive.
+ *
+ * Section: 10.1:
+ * C) Shutdown
+ *
+ * Format: SHUTDOWN(association id)
+ * -> result
+ *
+ * Gracefully closes an association. Any locally queued user data
+ * will be delivered to the peer. The association will be terminated only
+ * after the peer acknowledges all the SCTP packets sent.  A success code
+ * will be returned on successful termination of the association. If
+ * attempting to terminate the association results in a failure, an error
+ * code shall be returned.
+ *
+ * Mandatory attributes:
+ *
+ *  o association id - local handle to the SCTP association
+ *
+ * Optional attributes:
+ *
+ * None.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_prm_shutdown(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* From 9.2 Shutdown of an Association
+	 * Upon receipt of the SHUTDOWN primitive from its upper
+	 * layer, the endpoint enters SHUTDOWN-PENDING state and
+	 * remains there until all outstanding data has been
+	 * acknowledged by its peer. The endpoint accepts no new data
+	 * from its upper layer, but retransmits data to the far end
+	 * if necessary to fill gaps.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));
+
+	/* While data is still queued we stay in SHUTDOWN-PENDING. */
+	if (!sctp_outq_is_empty(&asoc->outqueue))
+		return SCTP_DISPOSITION_CONSUME;
+
+	/* The outqueue is already empty, so start the shutdown now. */
+	return sctp_sf_do_9_2_start_shutdown(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process the ABORT primitive.
+ *
+ * Section: 10.1:
+ * D) Abort
+ *
+ * Format: Abort(association id [, cause code])
+ * -> result
+ *
+ * Ungracefully closes an association. Any locally queued user data
+ * will be discarded and an ABORT chunk is sent to the peer.  A success code
+ * will be returned on successful abortion of the association. If
+ * attempting to abort the association results in a failure, an error
+ * code shall be returned.
+ *
+ * Mandatory attributes:
+ *
+ *  o association id - local handle to the SCTP association
+ *
+ * Optional attributes:
+ *
+ *  o cause code - reason of the abort to be passed to the peer
+ *
+ * Inputs
+ * (endpoint, asoc, arg)
+ *   arg - the ABORT chunk to send to the peer; may be NULL when the
+ *         caller could not build one, in which case nothing is sent
+ *         but the association is still torn down.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_1_prm_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* From 9.1 Abort of an Association
+	 * Upon receipt of the ABORT primitive from its upper layer, the
+	 * ABORT chunk supplied by the caller is sent to the peer and the
+	 * association is failed; any outstanding data is given up.
+	 */
+	struct sctp_chunk *abort = arg;
+
+	/* Only queue a reply when there actually is an ABORT chunk;
+	 * queueing a NULL chunk would hand a NULL pointer to the
+	 * command processor.
+	 */
+	if (abort)
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+
+	/* Even if we can't send the ABORT due to low memory delete the
+	 * TCB.  This is a departure from our typical NOMEM handling.
+	 */
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+			SCTP_ERROR(ECONNABORTED));
+	/* Delete the established association. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+			SCTP_PERR(SCTP_ERROR_USER_ABORT));
+
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* We tried an illegal operation on an association which is closed.  */
+sctp_disposition_t sctp_sf_error_closed(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	/* Nothing can be done on a closed association: just have
+	 * -EINVAL reported for the attempted operation.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR,
+			SCTP_ERROR(-EINVAL));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* We tried an illegal operation on an association which is shutting
+ * down.
+ */
+sctp_disposition_t sctp_sf_error_shutdown(const struct sctp_endpoint *ep,
+					  const struct sctp_association *asoc,
+					  const sctp_subtype_t type,
+					  void *arg,
+					  sctp_cmd_seq_t *commands)
+{
+	/* The association is on its way down, so the attempted
+	 * operation is answered with -ESHUTDOWN.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-ESHUTDOWN));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_cookie_wait_prm_shutdown
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues a shutdown while in COOKIE_WAIT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+
+	SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS);
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
+
+	return SCTP_DISPOSITION_DELETE_TCB;
+}
+
+/*
+ * sctp_sf_cookie_echoed_prm_shutdown
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues a shutdown while in COOKIE_ECHOED state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg, sctp_cmd_seq_t *commands)
+{
+	/* There is a single T1 timer, so we should be able to use
+	 * common function with the COOKIE-WAIT state.
+	 */
+	return sctp_sf_cookie_wait_prm_shutdown(ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_cookie_wait_prm_abort
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues an abort while in COOKIE_WAIT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_wait_prm_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *abort = arg;
+	sctp_disposition_t retval;
+
+	/* Stop T1-init timer */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+	retval = SCTP_DISPOSITION_CONSUME;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_CLOSED));
+
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+
+	/* Even if we can't send the ABORT due to low memory delete the
+	 * TCB.  This is a departure from our typical NOMEM handling.
+	 */
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+			SCTP_ERROR(ECONNREFUSED));
+	/* Report the failed association setup with a user-abort cause. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+			SCTP_PERR(SCTP_ERROR_USER_ABORT));
+
+	return retval;
+}
+
+/*
+ * sctp_sf_cookie_echoed_prm_abort
+ *
+ * Section: 4 Note: 3
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues an abort while in COOKIE_ECHOED state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_cookie_echoed_prm_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* There is a single T1 timer, so we should be able to use
+	 * common function with the COOKIE-WAIT state.
+	 */
+	return sctp_sf_cookie_wait_prm_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_shutdown_pending_prm_abort
+ *
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues an abort while in SHUTDOWN-PENDING state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_shutdown_pending_prm_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* Stop the T5-shutdown guard timer.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+	return sctp_sf_do_9_1_prm_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_shutdown_sent_prm_abort
+ *
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues an abort while in SHUTDOWN-SENT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_shutdown_sent_prm_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* Stop the T2-shutdown timer.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	/* Stop the T5-shutdown guard timer.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+	return sctp_sf_do_9_1_prm_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * sctp_sf_shutdown_ack_sent_prm_abort
+ *
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * The RFC does not explicitly address this issue; it is the route through the
+ * state table when someone issues an abort while in SHUTDOWN-ACK-SENT state.
+ *
+ * Outputs
+ * (timers)
+ */
+sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	/* The same T2 timer, so we should be able to use
+	 * common function with the SHUTDOWN-SENT state.
+	 */
+	return sctp_sf_shutdown_sent_prm_abort(ep, asoc, type, arg, commands);
+}
+
+/*
+ * Process the REQUESTHEARTBEAT primitive
+ *
+ * 10.1 ULP-to-SCTP
+ * J) Request Heartbeat
+ *
+ * Format: REQUESTHEARTBEAT(association id, destination transport address)
+ *
+ * -> result
+ *
+ * Instructs the local endpoint to perform a HeartBeat on the specified
+ * destination transport address of the given association. The returned
+ * result should indicate whether the transmission of the HEARTBEAT
+ * chunk to the destination address is successful.
+ *
+ * Mandatory attributes:
+ *
+ * o association id - local handle to the SCTP association
+ *
+ * o destination transport address - the transport address of the
+ *   association on which a heartbeat should be issued ('arg' here).
+ */
+sctp_disposition_t sctp_sf_do_prm_requestheartbeat(
+					const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type,
+				      (struct sctp_transport *)arg, commands))
+		return SCTP_DISPOSITION_NOMEM;
+
+	/*
+	 * RFC 2960 (bis), section 8.3
+	 *
+	 *    D) Request an on-demand HEARTBEAT on a specific destination
+	 *    transport address of a given association.
+	 *
+	 *    The endpoint should increment the respective error counter of
+	 *    the destination transport address each time a HEARTBEAT is sent
+	 *    to that address and not acknowledged within one RTO.
+	 *
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
+			SCTP_TRANSPORT(arg));
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * ADDIP Section 4.1 ASCONF Chunk Procedures
+ * When an endpoint has an ASCONF signaled change to be sent to the
+ * remote endpoint it should do A1 to A9.  'arg' is the ASCONF chunk.
+ */
+sctp_disposition_t sctp_sf_do_prm_asconf(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = arg;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk));
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Ignore the primitive event (its type is logged at debug level).
+ *
+ * The return value is the disposition of the primitive: always DISCARD.
+ */
+sctp_disposition_t sctp_sf_ignore_primitive(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	SCTP_DEBUG_PRINTK("Primitive type %d is ignored.\n", type.primitive);
+	return SCTP_DISPOSITION_DISCARD;
+}
+
+/***************************************************************************
+ * These are the state functions for the OTHER events.
+ ***************************************************************************/
+
+/*
+ * When the SCTP stack has no more user data to send or retransmit, this
+ * notification is given to the user. Also, at the time when a user app
+ * subscribes to this event, if there is no data to be sent or
+ * retransmitted, the stack will immediately send up this notification.
+ */
+sctp_disposition_t sctp_sf_do_no_pending_tsn(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_ulpevent *event;
+
+	event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_ATOMIC);
+	if (!event)
+		return SCTP_DISPOSITION_NOMEM;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Start the shutdown negotiation.
+ *
+ * From Section 9.2:
+ * Once all its outstanding data has been acknowledged, the endpoint
+ * shall send a SHUTDOWN chunk to its peer including in the Cumulative
+ * TSN Ack field the last sequential TSN it has received from the peer.
+ * It shall then start the T2-shutdown timer and enter the SHUTDOWN-SENT
+ * state. If the timer expires, the endpoint must re-send the SHUTDOWN
+ * with the updated last sequential TSN received from its peer.
+ *
+ * The return value is the disposition (NOMEM if no SHUTDOWN can be built).
+ */
+sctp_disposition_t sctp_sf_do_9_2_start_shutdown(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *reply;
+
+	/* Once all its outstanding data has been acknowledged, the
+	 * endpoint shall send a SHUTDOWN chunk to its peer including
+	 * in the Cumulative TSN Ack field the last sequential TSN it
+	 * has received from the peer.
+	 */
+	reply = sctp_make_shutdown(asoc, NULL);
+	if (!reply)
+		goto nomem;
+
+	/* Set the transport for the SHUTDOWN chunk and the timeout for the
+	 * T2-shutdown timer.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+	/* It shall then start the T2-shutdown timer */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	/* RFC 4960 Section 9.2
+	 * The sender of the SHUTDOWN MAY also start an overall guard timer
+	 * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+
+	if (asoc->autoclose)
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+	/* and enter the SHUTDOWN-SENT state.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_SHUTDOWN_SENT));
+
+	/* sctp-implguide 2.10 Issues with Heartbeating and failover
+	 *
+	 * HEARTBEAT ... is discontinued after sending either SHUTDOWN
+	 * or SHUTDOWN-ACK.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+	return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Generate a SHUTDOWN ACK now that everything is SACK'd.
+ *
+ * From Section 9.2:
+ *
+ * If it has no more outstanding DATA chunks, the SHUTDOWN receiver
+ * shall send a SHUTDOWN ACK and start a T2-shutdown timer of its own,
+ * entering the SHUTDOWN-ACK-SENT state. If the timer expires, the
+ * endpoint must re-send the SHUTDOWN ACK.
+ *
+ * The return value is the disposition.
+ */
+sctp_disposition_t sctp_sf_do_9_2_shutdown_ack(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = (struct sctp_chunk *) arg;
+	struct sctp_chunk *reply;
+
+	/* There are 2 ways of getting here:
+	 *    1) called in response to a SHUTDOWN chunk
+	 *    2) called when SCTP_EVENT_NO_PENDING_TSN event is issued.
+	 *
+	 * For the case (2), the arg parameter is set to NULL.  We need
+	 * to check that we have a chunk before accessing its fields.
+	 */
+	if (chunk) {
+		if (!sctp_vtag_verify(chunk, asoc))
+			return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
+
+		/* Make sure that the SHUTDOWN chunk has a valid length. */
+		if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk_t)))
+			return sctp_sf_violation_chunklen(ep, asoc, type, arg,
+							  commands);
+	}
+
+	/* If it has no more outstanding DATA chunks, the SHUTDOWN receiver
+	 * shall send a SHUTDOWN ACK ...
+	 */
+	reply = sctp_make_shutdown_ack(asoc, chunk);
+	if (!reply)
+		goto nomem;
+
+	/* Set the transport for the SHUTDOWN ACK chunk and the timeout for
+	 * the T2-shutdown timer.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+	/* and start/restart a T2-shutdown timer of its own, */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+
+	if (asoc->autoclose)
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
+
+	/* Enter the SHUTDOWN-ACK-SENT state.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_SHUTDOWN_ACK_SENT));
+
+	/* sctp-implguide 2.10 Issues with Heartbeating and failover
+	 *
+	 * HEARTBEAT ... is discontinued after sending either SHUTDOWN
+	 * or SHUTDOWN-ACK.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+
+	return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * Ignore the event defined as other (its type is logged at debug level).
+ *
+ * The return value is the disposition of the event: always DISCARD.
+ */
+sctp_disposition_t sctp_sf_ignore_other(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	SCTP_DEBUG_PRINTK("The event other type %d is ignored\n", type.other);
+	return SCTP_DISPOSITION_DISCARD;
+}
+
+/************************************************************
+ * These are the state functions for handling timeout events.
+ ************************************************************/
+
+/*
+ * RTX Timeout
+ *
+ * Section: 6.3.3 Handle T3-rtx Expiration
+ *
+ * Whenever the retransmission timer T3-rtx expires for a destination
+ * address, do the following:
+ * [See rules E1-E3 quoted in the function body]
+ *
+ * The return value is the disposition of the chunk.
+ */
+sctp_disposition_t sctp_sf_do_6_3_3_rtx(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	struct sctp_transport *transport = arg;
+
+	SCTP_INC_STATS(SCTP_MIB_T3_RTX_EXPIREDS);
+
+	if (asoc->overall_error_count >= asoc->max_retrans) {
+		if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
+			/*
+			 * We are here likely because the receiver had its rwnd
+			 * closed for a while and we have not been able to
+			 * transmit the locally queued data within the maximum
+			 * retransmission attempts limit.  Start the T5
+			 * shutdown guard timer to give the receiver one last
+			 * chance and some additional time to recover before
+			 * aborting.
+			 */
+			sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START_ONCE,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
+		} else {
+			sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+					SCTP_ERROR(ETIMEDOUT));
+			/* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+			sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+					SCTP_PERR(SCTP_ERROR_NO_ERROR));
+			SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+			SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+			return SCTP_DISPOSITION_DELETE_TCB;
+		}
+	}
+
+	/* E1) For the destination address for which the timer
+	 * expires, adjust its ssthresh with rules defined in Section
+	 * 7.2.3 and set the cwnd <- MTU.
+	 */
+
+	/* E2) For the destination address for which the timer
+	 * expires, set RTO <- RTO * 2 ("back off the timer").  The
+	 * maximum value discussed in rule C7 above (RTO.max) may be
+	 * used to provide an upper bound to this doubling operation.
+	 */
+
+	/* E3) Determine how many of the earliest (i.e., lowest TSN)
+	 * outstanding DATA chunks for the address for which the
+	 * T3-rtx has expired will fit into a single packet, subject
+	 * to the MTU constraint for the path corresponding to the
+	 * destination transport address to which the retransmission
+	 * is being sent (this may be different from the address for
+	 * which the timer expires [see Section 6.4]).  Call this
+	 * value K. Bundle and retransmit those K DATA chunks in a
+	 * single packet to the destination endpoint.
+	 *
+	 * Note: Any DATA chunks that were sent to the address for
+	 * which the T3-rtx timer expired but did not fit in one MTU
+	 * (rule E3 above), should be marked for retransmission and
+	 * sent as soon as cwnd allows (normally when a SACK arrives).
+	 */
+
+	/* Do some failure management (Section 8.2). */
+	sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport));
+
+	/* NB: Rules E4 and F1 are implicit in R1.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_RETRAN, SCTP_TRANSPORT(transport));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * Generate delayed SACK on timeout (queues a forced GEN_SACK command)
+ *
+ * Section: 6.2  Acknowledgement on Reception of DATA Chunks
+ *
+ * The guidelines on delayed acknowledgement algorithm specified in
+ * Section 4.2 of [RFC2581] SHOULD be followed.  Specifically, an
+ * acknowledgement SHOULD be generated for at least every second packet
+ * (not every second DATA chunk) received, and SHOULD be generated
+ * within 200 ms of the arrival of any unacknowledged DATA chunk.  In
+ * some situations it may be beneficial for an SCTP transmitter to be
+ * more conservative than the algorithms detailed in this document
+ * allow. However, an SCTP transmitter MUST NOT be more aggressive than
+ * the following algorithms allow.
+ */
+sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const sctp_subtype_t type,
+				       void *arg,
+				       sctp_cmd_seq_t *commands)
+{
+	SCTP_INC_STATS(SCTP_MIB_DELAY_SACK_EXPIREDS);
+	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_t1_init_timer_expire
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ *  RFC 2960 Section 4 Notes
+ *  2) If the T1-init timer expires, the endpoint MUST retransmit INIT
+ *     and re-start the T1-init timer without changing state.  This MUST
+ *     be repeated up to 'Max.Init.Retransmits' times.  After that, the
+ *     endpoint MUST abort the initialization process and report the
+ *     error to SCTP user.
+ *
+ * Outputs
+ * (timers, events)
+ *
+ */
+sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *repl = NULL;
+	struct sctp_bind_addr *bp;
+	int attempts = asoc->init_err_counter + 1;
+
+	SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
+	SCTP_INC_STATS(SCTP_MIB_T1_INIT_EXPIREDS);
+
+	if (attempts <= asoc->max_init_attempts) {
+		bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
+		repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
+		if (!repl)
+			return SCTP_DISPOSITION_NOMEM;
+
+		/* Choose transport for INIT. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+				SCTP_CHUNK(repl));
+
+		/* Issue a side effect to do the needed accounting. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+	} else {
+		SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d"
+				  " max_init_attempts: %d\n",
+				  attempts, asoc->max_init_attempts);
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_t1_cookie_timer_expire
+ *
+ * Section: 4 Note: 3
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ *  RFC 2960 Section 4 Notes
+ *  3) If the T1-cookie timer expires, the endpoint MUST retransmit
+ *     COOKIE ECHO and re-start the T1-cookie timer without changing
+ *     state.  This MUST be repeated up to 'Max.Init.Retransmits' times.
+ *     After that, the endpoint MUST abort the initialization process and
+ *     report the error to SCTP user.
+ *
+ * Outputs
+ * (timers, events)
+ *
+ */
+sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *repl = NULL;
+	int attempts = asoc->init_err_counter + 1;
+
+	SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
+	SCTP_INC_STATS(SCTP_MIB_T1_COOKIE_EXPIREDS);
+
+	if (attempts <= asoc->max_init_attempts) {
+		repl = sctp_make_cookie_echo(asoc, NULL);
+		if (!repl)
+			return SCTP_DISPOSITION_NOMEM;
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+				SCTP_CHUNK(repl));
+		/* Issue a side effect to do the needed accounting. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+	} else {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
+ * with the updated last sequential TSN received from its peer.
+ *
+ * An endpoint should limit the number of retransmissions of the
+ * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'.
+ * If this threshold is exceeded the endpoint should destroy the TCB and
+ * MUST report the peer endpoint unreachable to the upper layer (and
+ * thus the association enters the CLOSED state).  The reception of any
+ * packet from its peer (i.e. as the peer sends all of its queued DATA
+ * chunks) should clear the endpoint's retransmission count and restart
+ * the T2-Shutdown timer, giving its peer ample opportunity to transmit
+ * all of its queued DATA chunks that have not yet been sent.
+ */
+sctp_disposition_t sctp_sf_t2_timer_expire(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *reply = NULL;
+
+	SCTP_DEBUG_PRINTK("Timer T2 expired.\n");
+	SCTP_INC_STATS(SCTP_MIB_T2_SHUTDOWN_EXPIREDS);
+
+	((struct sctp_association *)asoc)->shutdown_retries++;
+
+	if (asoc->overall_error_count >= asoc->max_retrans) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		/* Note:  CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	switch (asoc->state) {
+	case SCTP_STATE_SHUTDOWN_SENT:
+		reply = sctp_make_shutdown(asoc, NULL);
+		break;
+
+	case SCTP_STATE_SHUTDOWN_ACK_SENT:
+		reply = sctp_make_shutdown_ack(asoc, NULL);
+		break;
+
+	default:
+		BUG();
+		break;
+	}
+
+	if (!reply)
+		goto nomem;
+
+	/* Do some failure management (Section 8.2).
+	 * If the transport the SHUTDOWN was last sent to has been removed
+	 * (shutdown_last_sent_to is NULL), skip failure management.
+	 */
+	if (asoc->shutdown_last_sent_to)
+		sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE,
+				SCTP_TRANSPORT(asoc->shutdown_last_sent_to));
+
+	/* Set the transport for the SHUTDOWN/ACK chunk and the timeout for
+	 * the T2-shutdown timer.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));
+
+	/* Restart the T2-shutdown timer.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+	return SCTP_DISPOSITION_CONSUME;
+
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/*
+ * ADDIP Section 4.1 ASCONF Chunk Procedures: on T4 RTO expiry do B1 to B5.
+ * NOTE(review): addip_last_asconf is dereferenced unchecked; confirm non-NULL.
+ */
+sctp_disposition_t sctp_sf_t4_timer_expire(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *chunk = asoc->addip_last_asconf;
+	struct sctp_transport *transport = chunk->transport;
+
+	SCTP_INC_STATS(SCTP_MIB_T4_RTO_EXPIREDS);
+
+	/* ADDIP 4.1 B1) Increment the error counters and perform path failure
+	 * detection on the appropriate destination address as defined in
+	 * RFC2960 [5] section 8.1 and 8.2.
+	 */
+	if (transport)
+		sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE,
+				SCTP_TRANSPORT(transport));
+
+	/* Reconfig T4 timer and transport. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk));
+
+	/* ADDIP 4.1 B2) Increment the association error counters and perform
+	 * endpoint failure detection on the association as defined in
+	 * RFC2960 [5] section 8.1 and 8.2.
+	 * association error counter is incremented in SCTP_CMD_STRIKE.
+	 */
+	if (asoc->overall_error_count >= asoc->max_retrans) {
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ETIMEDOUT));
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_ERROR));
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		return SCTP_DISPOSITION_ABORT;
+	}
+
+	/* ADDIP 4.1 B3) Back-off the destination address RTO value to which
+	 * the ASCONF chunk was sent by doubling the RTO timer value.
+	 * This is done in SCTP_CMD_STRIKE.
+	 */
+
+	/* ADDIP 4.1 B4) Re-transmit the ASCONF Chunk last sent and if possible
+	 * choose an alternate destination address (please refer to RFC2960
+	 * [5] section 6.4.1). An endpoint MUST NOT add new parameters to this
+	 * chunk, it MUST be the same (including its serial number) as the last
+	 * ASCONF sent.
+	 */
+	sctp_chunk_hold(asoc->addip_last_asconf);
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+			SCTP_CHUNK(asoc->addip_last_asconf));
+
+	/* ADDIP 4.1 B5) Restart the T-4 RTO timer. Note that if a different
+	 * destination is selected, then the RTO used will be that of the new
+	 * destination address.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+			SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/* sctpimpguide-05 Section 2.12.2
+ * The sender of the SHUTDOWN MAY also start an overall guard timer
+ * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
+ * At the expiration of this timer the sender SHOULD abort the association
+ * by sending an ABORT chunk.  Returns NOMEM if the ABORT cannot be built.
+ */
+sctp_disposition_t sctp_sf_t5_timer_expire(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *reply = NULL;
+
+	SCTP_DEBUG_PRINTK("Timer T5 expired.\n");
+	SCTP_INC_STATS(SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS);
+
+	reply = sctp_make_abort(asoc, NULL, 0);
+	if (!reply)
+		goto nomem;
+
+	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+	sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+			SCTP_ERROR(ETIMEDOUT));
+	sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+			SCTP_PERR(SCTP_ERROR_NO_ERROR));
+
+	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+	SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+
+	return SCTP_DISPOSITION_DELETE_TCB;
+nomem:
+	return SCTP_DISPOSITION_NOMEM;
+}
+
+/* Handle expiration of AUTOCLOSE timer.  When the autoclose timer expires,
+ * the association is automatically closed by starting the shutdown process.
+ * The work that needs to be done is the same as when SHUTDOWN is initiated by
+ * the user.  So this routine looks like sctp_sf_do_9_2_prm_shutdown().
+ */
+sctp_disposition_t sctp_sf_autoclose_timer_expire(
+	const struct sctp_endpoint *ep,
+	const struct sctp_association *asoc,
+	const sctp_subtype_t type,
+	void *arg,
+	sctp_cmd_seq_t *commands)
+{
+	int disposition;
+
+	SCTP_INC_STATS(SCTP_MIB_AUTOCLOSE_EXPIREDS);
+
+	/* From 9.2 Shutdown of an Association
+	 * Upon receipt of the SHUTDOWN primitive from its upper
+	 * layer, the endpoint enters SHUTDOWN-PENDING state and
+	 * remains there until all outstanding data has been
+	 * acknowledged by its peer. The endpoint accepts no new data
+	 * from its upper layer, but retransmits data to the far end
+	 * if necessary to fill gaps.
+	 */
+	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
+			SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));
+
+	disposition = SCTP_DISPOSITION_CONSUME;
+	if (sctp_outq_is_empty(&asoc->outqueue)) {
+		disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type,
+							    arg, commands);
+	}
+	return disposition;
+}
+
+/*****************************************************************************
+ * These are state functions which could apply to all types of events.
+ ****************************************************************************/
+
+/*
+ * This table entry is not implemented.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * The return value is always SCTP_DISPOSITION_NOT_IMPL.
+ */
+sctp_disposition_t sctp_sf_not_impl(const struct sctp_endpoint *ep,
+				    const struct sctp_association *asoc,
+				    const sctp_subtype_t type,
+				    void *arg,
+				    sctp_cmd_seq_t *commands)
+{
+	return SCTP_DISPOSITION_NOT_IMPL;
+}
+
+/*
+ * This table entry represents a bug.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * The return value is always SCTP_DISPOSITION_BUG.
+ */
+sctp_disposition_t sctp_sf_bug(const struct sctp_endpoint *ep,
+			       const struct sctp_association *asoc,
+			       const sctp_subtype_t type,
+			       void *arg,
+			       sctp_cmd_seq_t *commands)
+{
+	return SCTP_DISPOSITION_BUG;
+}
+
+/*
+ * This table entry represents the firing of a timer in the wrong state.
+ * Since timer deletion cannot be guaranteed, a timer 'may' end up firing
+ * when the association is in the wrong state.  This event should
+ * be ignored, so as to prevent any rearming of the timer.
+ *
+ * Inputs
+ * (endpoint, asoc, chunk)
+ *
+ * The return value is always SCTP_DISPOSITION_CONSUME.
+ */
+sctp_disposition_t sctp_sf_timer_ignore(const struct sctp_endpoint *ep,
+					const struct sctp_association *asoc,
+					const sctp_subtype_t type,
+					void *arg,
+					sctp_cmd_seq_t *commands)
+{
+	SCTP_DEBUG_PRINTK("Timer %d ignored.\n", type.chunk);
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Pull the SACK header plus its gap-ack blocks and dup TSNs off the skb. */
+static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk)
+{
+	struct sctp_sackhdr *sack;
+	unsigned int len;
+	__u16 num_blocks;
+	__u16 num_dup_tsns;
+
+	/* Guard against a bogus sender: NULL is returned below if the
+	 * advertised block/dup-TSN counts exceed the data left in the skb.
+	 */
+	sack = (struct sctp_sackhdr *) chunk->skb->data;
+
+	num_blocks = ntohs(sack->num_gap_ack_blocks);
+	num_dup_tsns = ntohs(sack->num_dup_tsns);
+	len = sizeof(struct sctp_sackhdr);
+	len += (num_blocks + num_dup_tsns) * sizeof(__u32);
+	if (len > chunk->skb->len)
+		return NULL;
+
+	skb_pull(chunk->skb, len);
+
+	return sack;
+}
+
+/* Build a response packet that carries a single ABORT chunk with the
+ * given error causes.  Returns NULL if the packet or the chunk cannot
+ * be created.
+ */
+static struct sctp_packet *sctp_abort_pkt_new(const struct sctp_endpoint *ep,
+				  const struct sctp_association *asoc,
+				  struct sctp_chunk *chunk,
+				  const void *payload,
+				  size_t paylen)
+{
+	struct sctp_packet *pkt;
+	struct sctp_chunk *abort_chunk;
+
+	pkt = sctp_ootb_pkt_new(asoc, chunk);
+	if (!pkt)
+		return NULL;
+
+	/* Make an ABORT.
+	 * The T bit will be set if the asoc is NULL.
+	 */
+	abort_chunk = sctp_make_abort(asoc, chunk, paylen);
+	if (!abort_chunk) {
+		/* Nothing to send after all; give the packet back. */
+		sctp_ootb_pkt_free(pkt);
+		return NULL;
+	}
+
+	/* Reflect vtag if T-Bit is set */
+	if (sctp_test_T_bit(abort_chunk))
+		pkt->vtag = ntohl(chunk->sctp_hdr->vtag);
+
+	/* Add specified error causes, i.e., payload, to the
+	 * end of the chunk.
+	 */
+	sctp_addto_chunk(abort_chunk, paylen, payload);
+
+	/* Set the skb to the belonging sock for accounting.  */
+	abort_chunk->skb->sk = ep->base.sk;
+
+	sctp_packet_append_chunk(pkt, abort_chunk);
+	return pkt;
+}
+
+/* Allocate a packet for responding in the OOTB conditions.
+ *
+ * A new transport is created from sctp_source(chunk) and the packet
+ * embedded in that transport is set up with the inbound ports reversed
+ * and a vtag chosen as described in the body.  Returns NULL if the
+ * transport cannot be allocated.
+ */
+static struct sctp_packet *sctp_ootb_pkt_new(const struct sctp_association *asoc,
+					     const struct sctp_chunk *chunk)
+{
+	struct sctp_transport *tp;
+	struct sctp_packet *pkt;
+	__u16 local_port;
+	__u16 peer_port;
+	__u32 tag;
+
+	/* The response travels back to the sender, so the inbound
+	 * destination port is our source port and vice versa.
+	 */
+	local_port = ntohs(chunk->sctp_hdr->dest);
+	peer_port = ntohs(chunk->sctp_hdr->source);
+
+	/* The V-tag is going to be the same as the inbound packet if no
+	 * association exists, otherwise, use the peer's vtag.
+	 */
+	if (asoc) {
+		if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK) {
+			/* Special case the INIT-ACK as there is no peer's
+			 * vtag yet.
+			 */
+			sctp_initack_chunk_t *initack;
+
+			initack = (sctp_initack_chunk_t *)chunk->chunk_hdr;
+			tag = ntohl(initack->init_hdr.init_tag);
+		} else {
+			/* Otherwise the init_tag recorded for the peer. */
+			tag = asoc->peer.i.init_tag;
+		}
+	} else {
+		if (chunk->chunk_hdr->type == SCTP_CID_INIT) {
+			/* Special case the INIT and stale COOKIE_ECHO as
+			 * there is no vtag yet.
+			 */
+			sctp_init_chunk_t *init;
+
+			init = (sctp_init_chunk_t *)chunk->chunk_hdr;
+			tag = ntohl(init->init_hdr.init_tag);
+		} else {
+			/* Otherwise echo the inbound packet's vtag. */
+			tag = ntohl(chunk->sctp_hdr->vtag);
+		}
+	}
+
+	/* Make a transport for the bucket, Eliza... */
+	tp = sctp_transport_new(sctp_source(chunk), GFP_ATOMIC);
+	if (!tp)
+		return NULL;
+
+	/* Cache a route for the transport with the chunk's destination as
+	 * the source address.
+	 */
+	sctp_transport_route(tp, (union sctp_addr *)&chunk->dest,
+			     sctp_sk(sctp_get_ctl_sock()));
+
+	/* Initialise the packet embedded in the transport (&tp->packet)
+	 * with the reversed ports, then stamp it with the chosen vtag.
+	 */
+	pkt = sctp_packet_init(&tp->packet, tp, local_port, peer_port);
+	pkt = sctp_packet_config(pkt, tag, 0);
+
+	return pkt;
+}
+
+/* Free an OOTB response packet by freeing the transport it refers to.  */
+void sctp_ootb_pkt_free(struct sctp_packet *packet)
+{
+	sctp_transport_free(packet->transport);
+}
+
+/* Send a stale cookie error when an invalid COOKIE ECHO chunk is found.  */
+static void sctp_send_stale_cookie_err(const struct sctp_endpoint *ep,
+				       const struct sctp_association *asoc,
+				       const struct sctp_chunk *chunk,
+				       sctp_cmd_seq_t *commands,
+				       struct sctp_chunk *err_chunk)
+{
+	struct sctp_signed_cookie *cookie;
+	struct sctp_packet *pkt;
+
+	if (!err_chunk)
+		return;
+
+	pkt = sctp_ootb_pkt_new(asoc, chunk);
+	if (!pkt) {
+		sctp_chunk_free(err_chunk);
+		return;
+	}
+
+	/* Override the OOTB vtag from the cookie. */
+	cookie = chunk->subh.cookie_hdr;
+	pkt->vtag = cookie->c.peer_vtag;
+
+	/* Set the skb to the belonging sock for accounting, then send. */
+	err_chunk->skb->sk = ep->base.sk;
+	sctp_packet_append_chunk(pkt, err_chunk);
+	sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt));
+	SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
+}
+
+
+/* Process a DATA chunk; the int result is one of the SCTP_IERROR_* codes. */
+static int sctp_eat_data(const struct sctp_association *asoc,
+			 struct sctp_chunk *chunk,
+			 sctp_cmd_seq_t *commands)
+{
+	sctp_datahdr_t *data_hdr;
+	struct sctp_chunk *err;
+	size_t datalen;
+	sctp_verb_t deliver;
+	int tmp;
+	__u32 tsn;
+	struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
+	struct sock *sk = asoc->base.sk;
+	u16 ssn;
+	u16 sid;
+	u8 ordered = 0;
+
+	data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data;
+	skb_pull(chunk->skb, sizeof(sctp_datahdr_t));
+
+	tsn = ntohl(data_hdr->tsn);
+	SCTP_DEBUG_PRINTK("eat_data: TSN 0x%x.\n", tsn);
+
+	/* ASSERT:  Now skb->data is really the user data.  */
+
+	/* Process ECN based congestion.
+	 *
+	 * Since the chunk structure is reused for all chunks within
+	 * a packet, we use ecn_ce_done to track if we've already
+	 * done CE processing for this packet.
+	 *
+	 * We need to do ECN processing even if we plan to discard the
+	 * chunk later.
+	 */
+
+	if (!chunk->ecn_ce_done) {
+		struct sctp_af *af;
+		chunk->ecn_ce_done = 1;
+
+		af = sctp_get_af_specific(
+			ipver2af(ip_hdr(chunk->skb)->version));
+
+		if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) {
+			/* Do real work as side effect. */
+			sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
+					SCTP_U32(tsn));
+		}
+	}
+
+	tmp = sctp_tsnmap_check(&asoc->peer.tsn_map, tsn);
+	if (tmp < 0) {
+		/* The TSN is too high--silently discard the chunk and
+		 * count on it getting retransmitted later.
+		 */
+		return SCTP_IERROR_HIGH_TSN;
+	} else if (tmp > 0) {
+		/* This is a duplicate.  Record it.  */
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_DUP, SCTP_U32(tsn));
+		return SCTP_IERROR_DUP_TSN;
+	}
+
+	/* This is a new TSN.  */
+
+	/* Discard if there is no room in the receive window.
+	 * Actually, allow a little bit of overflow (up to an MTU).
+	 */
+	datalen = ntohs(chunk->chunk_hdr->length);
+	datalen -= sizeof(sctp_data_chunk_t);
+
+	deliver = SCTP_CMD_CHUNK_ULP;
+
+	/* Think about partial delivery. */
+	if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) {
+
+		/* Even if we don't accept this chunk there is
+		 * memory pressure.
+		 */
+		sctp_add_cmd_sf(commands, SCTP_CMD_PART_DELIVER, SCTP_NULL());
+	}
+
+	/* Spill over rwnd a little bit.  Note: While allowed, this spill over
+	 * seems a bit troublesome in that frag_point varies based on
+	 * PMTU.  In cases, such as loopback, this might be a rather
+	 * large spill over.
+	 */
+	if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
+	    (datalen > asoc->rwnd + asoc->frag_point))) {
+
+		/* If this is the next TSN, consider reneging to make
+		 * room.   Note: Playing nice with a confused sender.  A
+		 * malicious sender can still eat up all our buffer
+		 * space and in the future we may want to detect and
+		 * do more drastic reneging.
+		 */
+		if (sctp_tsnmap_has_gap(map) &&
+		    (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
+			SCTP_DEBUG_PRINTK("Reneging for tsn:%u\n", tsn);
+			deliver = SCTP_CMD_RENEGE;
+		} else {
+			SCTP_DEBUG_PRINTK("Discard tsn: %u len: %Zd, "
+					  "rwnd: %d\n", tsn, datalen,
+					  asoc->rwnd);
+			return SCTP_IERROR_IGNORE_TSN;
+		}
+	}
+
+	/*
+	 * Also try to renege to limit our memory usage in the event that
+	 * we are under memory pressure.
+	 * If we can't renege, don't worry about it, the sk_rmem_schedule
+	 * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our
+	 * memory usage too much.
+	 */
+	if (*sk->sk_prot_creator->memory_pressure) {
+		if (sctp_tsnmap_has_gap(map) &&
+	           (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
+			SCTP_DEBUG_PRINTK("Under Pressure! Reneging for tsn:%u\n", tsn);
+			deliver = SCTP_CMD_RENEGE;
+		 }
+	}
+
+	/*
+	 * Section 3.3.10.9 No User Data (9)
+	 *
+	 * Cause of error
+	 * ---------------
+	 * No User Data:  This error cause is returned to the originator of a
+	 * DATA chunk if a received DATA chunk has no user data.
+	 */
+	if (unlikely(0 == datalen)) {
+		err = sctp_make_abort_no_data(asoc, chunk, tsn);
+		if (err) {
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(err));
+		}
+		/* We are going to ABORT, so we might as well stop
+		 * processing the rest of the chunks in the packet.
+		 */
+		sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL());
+		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
+				SCTP_ERROR(ECONNABORTED));
+		sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
+				SCTP_PERR(SCTP_ERROR_NO_DATA));
+		SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
+		SCTP_DEC_STATS(SCTP_MIB_CURRESTAB);
+		return SCTP_IERROR_NO_DATA;
+	}
+
+	chunk->data_accepted = 1;
+
+	/* Note: Some chunks may get overcounted, either because we drop
+	 * them below or because we renege and the chunk arrives again.
+	 */
+	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+		SCTP_INC_STATS(SCTP_MIB_INUNORDERCHUNKS);
+	else {
+		SCTP_INC_STATS(SCTP_MIB_INORDERCHUNKS);
+		ordered = 1;
+	}
+
+	/* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
+	 *
+	 * If an endpoint receives a DATA chunk with an invalid stream
+	 * identifier, it shall acknowledge the reception of the DATA chunk
+	 * following the normal procedure, immediately send an ERROR chunk
+	 * with cause set to "Invalid Stream Identifier" (See Section 3.3.10)
+	 * and discard the DATA chunk.
+	 */
+	sid = ntohs(data_hdr->stream);
+	if (sid >= asoc->c.sinit_max_instreams) {
+		/* Mark tsn as received even though we drop it */
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));
+
+		err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM,
+					 &data_hdr->stream,
+					 sizeof(data_hdr->stream),
+					 sizeof(u16));
+		if (err)
+			sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
+					SCTP_CHUNK(err));
+		return SCTP_IERROR_BAD_STREAM;
+	}
+
+	/* Check to see if the SSN is possible for this TSN.
+	 * The biggest gap we can record is 4K wide.  Since SSNs wrap
+	 * at an unsigned short, there is no way that an SSN can
+	 * wrap and still belong to a valid TSN.  We can simply check
+	 * if the current SSN is smaller than the next expected one.
+	 * If it is, it wrapped and is invalid.
+	 */
+	ssn = ntohs(data_hdr->ssn);
+	if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) {
+		return SCTP_IERROR_PROTO_VIOLATION;
+	}
+
+	/* Send the data up to the user.  Note:  Schedule  the
+	 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
+	 * chunk needs the updated rwnd.
+	 */
+	sctp_add_cmd_sf(commands, deliver, SCTP_CHUNK(chunk));
+
+	return SCTP_IERROR_NO_ERROR;
+}
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
new file mode 100644
index 00000000..7c211a7f
--- /dev/null
+++ b/net/sctp/sm_statetable.c
@@ -0,0 +1,937 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These are the state tables for the SCTP state machine.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Hui Huang		    <hui.huang@nokia.com>
+ *    Daisy Chang	    <daisyc@us.ibm.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static const sctp_sm_table_entry_t
+primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES];
+static const sctp_sm_table_entry_t
+other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES];
+static const sctp_sm_table_entry_t
+timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES];
+
+static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(sctp_cid_t cid,
+							    sctp_state_t state);
+
+/* Fallback entry handed out for out-of-range or illegal event lookups. */
+static const sctp_sm_table_entry_t bug = {
+	.fn = sctp_sf_bug,
+	.name = "sctp_sf_bug"
+};
+
+#define DO_LOOKUP(_max, _type, _table)					\
+({									\
+	const sctp_sm_table_entry_t *entry;				\
+	/* event_subtype and state come from the expanding scope */	\
+	if (event_subtype._type <= (_max)) {				\
+		entry = &_table[event_subtype._type][(int)state];	\
+	} else {							\
+		pr_warn("table %p possible attack: event %d exceeds max %d\n", \
+			_table, event_subtype._type, _max);		\
+		entry = &bug;						\
+	}								\
+	entry;								\
+})
+
+const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
+						  sctp_state_t state,
+						  sctp_subtype_t event_subtype)
+{
+	/* Chunk events have their own lookup routine. */
+	if (event_type == SCTP_EVENT_T_CHUNK)
+		return sctp_chunk_event_lookup(event_subtype.chunk, state);
+	/* The other classes go through the bounds-checked DO_LOOKUP(). */
+	if (event_type == SCTP_EVENT_T_TIMEOUT)
+		return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
+				 timeout_event_table);
+	if (event_type == SCTP_EVENT_T_OTHER)
+		return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other,
+				 other_event_table);
+	if (event_type == SCTP_EVENT_T_PRIMITIVE)
+		return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
+				 primitive_event_table);
+
+	/* Yikes!  We got an illegal event type.  */
+	return &bug;
+}
+
+#define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func}
+
+#define TYPE_SCTP_DATA { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_DATA */
+
+#define TYPE_SCTP_INIT { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \
+} /* TYPE_SCTP_INIT */
+
+#define TYPE_SCTP_INIT_ACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_INIT_ACK */
+
+#define TYPE_SCTP_SACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_SACK */
+
+#define TYPE_SCTP_HEARTBEAT { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	/* This should not happen, but we are nice.  */ \
+	TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
+} /* TYPE_SCTP_HEARTBEAT */
+
+#define TYPE_SCTP_HEARTBEAT_ACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_violation), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_HEARTBEAT_ACK */
+
+#define TYPE_SCTP_ABORT { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_pdiscard), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \
+} /* TYPE_SCTP_ABORT */
+
+#define TYPE_SCTP_SHUTDOWN { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_SHUTDOWN */
+
+#define TYPE_SCTP_SHUTDOWN_ACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_violation), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_violation), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_violation), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \
+} /* TYPE_SCTP_SHUTDOWN_ACK */
+
+#define TYPE_SCTP_ERROR { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ERROR */
+
+#define TYPE_SCTP_COOKIE_ECHO { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
+} /* TYPE_SCTP_COOKIE_ECHO */
+
+#define TYPE_SCTP_COOKIE_ACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_COOKIE_ACK */
+
+#define TYPE_SCTP_ECN_ECNE { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ECN_ECNE */
+
+#define TYPE_SCTP_ECN_CWR { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ECN_CWR */
+
+#define TYPE_SCTP_SHUTDOWN_COMPLETE { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_4_C), \
+} /* TYPE_SCTP_SHUTDOWN_COMPLETE */
+
+/* State functions for the base protocol (RFC 2960) chunk types.
+ *
+ * The primary index for this table is the chunk type, so the rows
+ * below are positional; the secondary index is the state.
+ */
+static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_DATA,
+	TYPE_SCTP_INIT,
+	TYPE_SCTP_INIT_ACK,
+	TYPE_SCTP_SACK,
+	TYPE_SCTP_HEARTBEAT,
+	TYPE_SCTP_HEARTBEAT_ACK,
+	TYPE_SCTP_ABORT,
+	TYPE_SCTP_SHUTDOWN,
+	TYPE_SCTP_SHUTDOWN_ACK,
+	TYPE_SCTP_ERROR,
+	TYPE_SCTP_COOKIE_ECHO,
+	TYPE_SCTP_COOKIE_ACK,
+	TYPE_SCTP_ECN_ECNE,
+	TYPE_SCTP_ECN_CWR,
+	TYPE_SCTP_SHUTDOWN_COMPLETE,
+}; /* chunk_event_table[][] */
+
+#define TYPE_SCTP_ASCONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ASCONF */
+
+#define TYPE_SCTP_ASCONF_ACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_ASCONF_ACK */
+
+/* State functions for the ASCONF and ASCONF_ACK chunks.
+ * The primary index is the chunk type, the secondary index the state.
+ */
+static const sctp_sm_table_entry_t addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_ASCONF,
+	TYPE_SCTP_ASCONF_ACK,
+}; /* addip_chunk_event_table[][] */
+
+#define TYPE_SCTP_FWD_TSN { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+} /* TYPE_SCTP_FWD_TSN */
+
+/* State functions for the FWD_TSN chunk.
+ * The primary index is the chunk type, the secondary index the state.
+ */
+static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_FWD_TSN,
+}; /* prsctp_chunk_event_table[][] */
+
+#define TYPE_SCTP_AUTH { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ootb), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
+} /* TYPE_SCTP_AUTH */
+
+/* State functions for the AUTH chunk.
+ * The primary index is the chunk type, the secondary index the state.
+ */
+static const sctp_sm_table_entry_t auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_AUTH,
+}; /* auth_chunk_event_table[][] */
+
+static const sctp_sm_table_entry_t
+chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
+	/* SCTP_STATE_CLOSED */
+	TYPE_SCTP_FUNC(sctp_sf_ootb),
+	/* SCTP_STATE_COOKIE_WAIT */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+	/* SCTP_STATE_COOKIE_ECHOED */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+	/* SCTP_STATE_ESTABLISHED */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+	/* SCTP_STATE_SHUTDOWN_PENDING */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+	/* SCTP_STATE_SHUTDOWN_SENT */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */
+	TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
+};	/* chunk unknown: one entry per state */
+
+
+#define TYPE_SCTP_PRIMITIVE_ASSOCIATE  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_not_impl), \
+} /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */
+
+#define TYPE_SCTP_PRIMITIVE_SHUTDOWN  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown),\
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
+} /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */
+
+#define TYPE_SCTP_PRIMITIVE_ABORT  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \
+} /* TYPE_SCTP_PRIMITIVE_ABORT */
+
+#define TYPE_SCTP_PRIMITIVE_SEND  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_SEND */
+
+#define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat),          \
+} /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */
+
+#define TYPE_SCTP_PRIMITIVE_ASCONF { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_ASCONF */
+
+/* The primary index for this table is the primitive type.
+ * The secondary index for this table is the state.
+ */
+static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_PRIMITIVE_ASSOCIATE,
+	TYPE_SCTP_PRIMITIVE_SHUTDOWN,
+	TYPE_SCTP_PRIMITIVE_ABORT,
+	TYPE_SCTP_PRIMITIVE_SEND,
+	TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
+	TYPE_SCTP_PRIMITIVE_ASCONF,
+};
+
+#define TYPE_SCTP_OTHER_NO_PENDING_TSN  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+} /* TYPE_SCTP_OTHER_NO_PENDING_TSN */
+
+#define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH  { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
+} /* TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH */
+
+static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_OTHER_NO_PENDING_TSN,
+	TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH,
+};	/* primary index: "other" event type; secondary index: state */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_NONE { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_NONE */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_bug), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_T1_INIT */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_T3_RTX */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_T4_RTO */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_SACK { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_SACK */
+
+#define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \
+	/* SCTP_STATE_CLOSED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_WAIT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_COOKIE_ECHOED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_ESTABLISHED */ \
+	TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \
+	/* SCTP_STATE_SHUTDOWN_PENDING */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+	/* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+	TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
+} /* TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE */
+
+static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
+	TYPE_SCTP_EVENT_TIMEOUT_NONE,
+	TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
+	TYPE_SCTP_EVENT_TIMEOUT_T1_INIT,
+	TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN,
+	TYPE_SCTP_EVENT_TIMEOUT_T3_RTX,
+	TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
+	TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
+	TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
+	TYPE_SCTP_EVENT_TIMEOUT_SACK,
+	TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
+};	/* primary index: timeout type; secondary index: state */
+
+static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(sctp_cid_t cid,
+							    sctp_state_t state)
+{
+	/* A state beyond SCTP_STATE_MAX has no column in any table. */
+	if (state > SCTP_STATE_MAX)
+		return &bug;
+
+	/* Base chunk ids index the main table directly. */
+	if (cid <= SCTP_CID_BASE_MAX)
+		return &chunk_event_table[cid][state];
+
+	/* Extension chunks only resolve while their feature flag is set. */
+	if (sctp_prsctp_enable && cid == SCTP_CID_FWD_TSN)
+		return &prsctp_chunk_event_table[0][state];
+
+	/* ASCONF is row 0 and ASCONF-ACK is row 1 of the ADDIP table. */
+	if (sctp_addip_enable && cid == SCTP_CID_ASCONF)
+		return &addip_chunk_event_table[0][state];
+
+	if (sctp_addip_enable && cid == SCTP_CID_ASCONF_ACK)
+		return &addip_chunk_event_table[1][state];
+
+	/* AUTH chunks likewise depend on sctp_auth_enable. */
+	if (sctp_auth_enable && cid == SCTP_CID_AUTH)
+		return &auth_chunk_event_table[0][state];
+
+	/* Everything else falls through to chunk_event_table_unknown. */
+	return &chunk_event_table_unknown[state];
+}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
new file mode 100644
index 00000000..4434853a
--- /dev/null
+++ b/net/sctp/socket.c
@@ -0,0 +1,6719 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 Intel Corp.
+ * Copyright (c) 2001-2002 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions interface with the sockets layer to implement the
+ * SCTP Extensions for the Sockets API.
+ *
+ * Note that the descriptions from the specification are USER level
+ * functions--this file is the functions which populate the struct proto
+ * for SCTP which is the BOTTOM of the sockets interface.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Narasimha Budihal     <narsi@refcode.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Xingang Guo           <xingang.guo@intel.com>
+ *    Daisy Chang           <daisyc@us.ibm.com>
+ *    Sridhar Samudrala     <samudrala@us.ibm.com>
+ *    Inaky Perez-Gonzalez  <inaky.gonzalez@intel.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *    Ryan Layer	    <rmlayer@us.ibm.com>
+ *    Anup Pemmaiah         <pemmaiah@cc.usu.edu>
+ *    Kevin Gao             <kevin.gao@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/ip.h>
+#include <linux/capability.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/crypto.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+
+#include <linux/socket.h> /* for sa_family_t */
+#include <net/sock.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* WARNING:  Please do not remove the SCTP_STATIC attribute to
+ * any of the functions below as they are used to export functions
+ * used by a project regression testsuite.
+ */
+
+/* Forward declarations for internal helper functions. */
+static int sctp_writeable(struct sock *sk);
+static void sctp_wfree(struct sk_buff *skb);
+static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p,
+				size_t msg_len);
+static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p);
+static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
+static int sctp_wait_for_accept(struct sock *sk, long timeo);
+static void sctp_wait_for_close(struct sock *sk, long timeo);
+static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
+					union sctp_addr *addr, int len);
+static int sctp_bindx_add(struct sock *, struct sockaddr *, int);
+static int sctp_bindx_rem(struct sock *, struct sockaddr *, int);
+static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int);
+static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int);
+static int sctp_send_asconf(struct sctp_association *asoc,
+			    struct sctp_chunk *chunk);
+static int sctp_do_bind(struct sock *, union sctp_addr *, int);
+static int sctp_autobind(struct sock *sk);
+static void sctp_sock_migrate(struct sock *, struct sock *,
+			      struct sctp_association *, sctp_socket_type_t);
+static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG;
+
+extern struct kmem_cache *sctp_bucket_cachep;
+extern long sysctl_sctp_mem[3];
+extern int sysctl_sctp_rmem[3];
+extern int sysctl_sctp_wmem[3];
+
+static int sctp_memory_pressure;
+static atomic_long_t sctp_memory_allocated;
+struct percpu_counter sctp_sockets_allocated;
+
+static void sctp_enter_memory_pressure(struct sock *sk)
+{
+	sctp_memory_pressure = 1;	/* file-wide flag; sk is not consulted */
+}
+
+
+/* Report how much send buffer space the association may still consume. */
+static inline int sctp_wspace(struct sctp_association *asoc)
+{
+	struct sock *sk = asoc->base.sk;
+	int used;
+
+	/* sndbuf_policy selects per-association over per-socket accounting. */
+	if (asoc->ep->sndbuf_policy)
+		used = asoc->sndbuf_used;
+	else
+		used = sk_wmem_alloc_get(sk);
+
+	if (used < sk->sk_sndbuf)
+		return sk->sk_sndbuf - used;
+
+	/* At or over the limit: a user-locked sndbuf leaves nothing. */
+	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		return 0;
+
+	/* Otherwise use the stream write-space figure, clamped at zero. */
+	used = sk_stream_wspace(sk);
+	return used < 0 ? 0 : used;
+}
+
+/* Increment the used sndbuf space count of the corresponding association by
+ * the size of the outgoing data chunk.
+ * Also, set the skb destructor for sndbuf accounting later.
+ *
+ * Since it is always 1-1 between chunk and skb, and also a new skb is always
+ * allocated for chunk bundling in sctp_packet_transmit(), we can use the
+ * destructor in the data chunk skb for the purpose of the sndbuf space
+ * tracking.
+ */
+static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
+{
+	struct sctp_association *asoc = chunk->asoc;
+	struct sock *sk = asoc->base.sk;
+
+	/* The sndbuf space is tracked per association, so take a hold.  */
+	sctp_association_hold(asoc);
+
+	skb_set_owner_w(chunk->skb, sk);
+
+	chunk->skb->destructor = sctp_wfree;	/* assigned after set_owner_w */
+	/* Save the chunk pointer in skb->cb for sctp_wfree to use later.  */
+	*((struct sctp_chunk **)(chunk->skb->cb)) = chunk;
+
+	asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
+				sizeof(struct sk_buff) +
+				sizeof(struct sctp_chunk);
+
+	atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
+	sk->sk_wmem_queued += chunk->skb->truesize;
+	sk_mem_charge(sk, chunk->skb->truesize);
+}
+
+/* Check that addr/len form an address this socket may send to. */
+static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
+				   int len)
+{
+	struct sctp_sock *sp;
+	struct sctp_af *af;
+
+	sp = sctp_sk(sk);
+
+	/* Generic sockaddr checks come first. */
+	af = sctp_sockaddr_af(sp, addr, len);
+	if (!af)
+		return -EINVAL;
+
+	/* The address family, then the protocol family, must accept it. */
+	if (!af->addr_valid(addr, sp, NULL) ||
+	    !sp->pf->send_verify(sp, addr))
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Resolve an association id on this socket.  Only UDP-style sockets
+ * look at the id; every other style ignores it.
+ */
+struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
+{
+	struct sctp_association *asoc;
+	struct list_head *head;
+
+	if (sctp_style(sk, UDP)) {
+		/* Zero and all-ones are never looked up. */
+		if (!id || (id == (sctp_assoc_t)-1))
+			return NULL;
+
+		spin_lock_bh(&sctp_assocs_id_lock);
+		asoc = idr_find(&sctp_assocs_id, (int)id);
+		spin_unlock_bh(&sctp_assocs_id_lock);
+
+		/* Must belong to this socket and not be marked dead. */
+		if (asoc && (asoc->base.sk == sk) && !asoc->base.dead)
+			return asoc;
+		return NULL;
+	}
+
+	/* Not UDP-style: return NULL unless the socket is ESTABLISHED.  It
+	 * could be a TCP-style listening socket or a socket which hasn't
+	 * yet called connect() to establish an association.
+	 */
+	if (!sctp_sstate(sk, ESTABLISHED))
+		return NULL;
+
+	/* Get the first and the only association from the list, if any. */
+	head = &sctp_sk(sk)->ep->asocs;
+	if (list_empty(head))
+		return NULL;
+	return list_entry(head->next, struct sctp_association, asocs);
+}
+
+/* Find the transport for an address, optionally cross-checked against an
+ * assoc id.  When the id resolves to an association, it must be the same
+ * association the address resolves to; otherwise NULL is returned.
+ */
+static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
+					      struct sockaddr_storage *addr,
+					      sctp_assoc_t id)
+{
+	union sctp_addr *paddr = (union sctp_addr *)addr;
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *by_addr, *by_id;
+	struct sctp_transport *transport;
+
+	/* The address alone has to resolve to an association. */
+	by_addr = sctp_endpoint_lookup_assoc(sp->ep, paddr, &transport);
+	if (!by_addr)
+		return NULL;
+
+	/* An id that resolves must name that same association. */
+	by_id = sctp_id2assoc(sk, id);
+	if (by_id && (by_id != by_addr))
+		return NULL;
+
+	/* Run the protocol family's addr_v4map hook on the caller's address. */
+	sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, paddr);
+
+	return transport;
+}
+
+/* API 3.1.2 bind() - UDP Style Syntax
+ * The syntax of bind() is,
+ *
+ *   ret = bind(int sd, struct sockaddr *addr, int addrlen);
+ *
+ *   sd       - the socket descriptor returned by socket().
+ *   addr     - the address structure (struct sockaddr_in or struct
+ *              sockaddr_in6 [RFC 2553]),
+ *   addr_len - the size of the address structure.
+ */
+SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
+{
+	int err = -EINVAL;
+
+	sctp_lock_sock(sk);
+
+	SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n",
+			  sk, addr, addr_len);
+
+	/* Disallow binding twice: once the endpoint has a non-zero bound
+	 * port, the -EINVAL default is what the caller gets.
+	 */
+	if (!sctp_sk(sk)->ep->base.bind_addr.port)
+		err = sctp_do_bind(sk, (union sctp_addr *)addr,
+				   addr_len);
+
+	sctp_release_sock(sk);
+
+	return err;
+}
+
+static long sctp_get_port_local(struct sock *, union sctp_addr *);
+
+/* Verify this is a valid sockaddr.  Returns the per-family operations, or NULL
+ * if len is too short, the family is unknown, or the PF rejects the family.
+ */
+static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
+					union sctp_addr *addr, int len)
+{
+	struct sctp_af *af;
+	sa_family_t family;
+
+	/* Check minimum size; the cast keeps a negative len from passing. */
+	if (len < (int)sizeof(struct sockaddr))
+		return NULL;
+
+	/* Check the family's full length before touching sin6_addr below. */
+	af = sctp_get_af_specific(addr->sa.sa_family);
+	if (!af || len < af->sockaddr_len)
+		return NULL;
+
+	/* V4 mapped address are really of AF_INET family */
+	family = addr->sa.sa_family;
+	if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->v6.sin6_addr))
+		family = AF_INET;
+
+	/* Does this PF support this AF? */
+	if (!opt->pf->af_supported(family, opt))
+		return NULL;
+
+	return af;
+}
+
+/* Bind a local address either to an endpoint or to an association.  */
+SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_endpoint *ep = sp->ep;
+	struct sctp_bind_addr *bp = &ep->base.bind_addr;
+	struct sctp_af *af;
+	unsigned short snum;
+	int ret = 0;
+
+	/* Common sockaddr verification. */
+	af = sctp_sockaddr_af(sp, addr, len);
+	if (!af) {
+		SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n",
+				  sk, addr, len);
+		return -EINVAL;
+	}
+
+	snum = ntohs(addr->v4.sin_port);	/* port read through the v4 view */
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ",
+				 ", port: %d, new port: %d, len: %d)\n",
+				 sk,
+				 addr,
+				 bp->port, snum,
+				 len);
+
+	/* PF specific bind() address verification. */
+	if (!sp->pf->bind_verify(sp, addr))
+		return -EADDRNOTAVAIL;
+
+	/* We must either be unbound, or bind to the same port.
+	 * It's OK to allow 0 ports if we are already bound.
+	 * We'll just inherit the already bound port in this case.
+	 */
+	if (bp->port) {
+		if (!snum)
+			snum = bp->port;
+		else if (snum != bp->port) {
+			SCTP_DEBUG_PRINTK("sctp_do_bind:"
+				  " New port %d does not match existing port "
+				  "%d.\n", snum, bp->port);
+			return -EINVAL;
+		}
+	}
+
+	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+
+	/* See if the address matches any of the addresses we may have
+	 * already bound before checking against other endpoints.
+	 */
+	if (sctp_bind_addr_match(bp, addr, sp))
+		return -EINVAL;
+
+	/* Make sure we are allowed to bind here.
+	 * sctp_get_port_local() does duplicate address detection; any
+	 * non-zero result is reported to the caller as -EADDRINUSE.
+	 */
+	addr->v4.sin_port = htons(snum);
+	if ((ret = sctp_get_port_local(sk, addr))) {
+		return -EADDRINUSE;
+	}
+
+	/* Refresh ephemeral port.  */
+	if (!bp->port)
+		bp->port = inet_sk(sk)->inet_num;
+
+	/* Add the address to the bind address list.
+	 * Use GFP_ATOMIC since BHs will be disabled.
+	 */
+	ret = sctp_add_bind_addr(bp, addr, SCTP_ADDR_SRC, GFP_ATOMIC);
+
+	/* Copy back into socket for getsockname() use. */
+	if (!ret) {
+		inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
+		af->to_sk_saddr(addr, sk);
+	}
+
+	return ret;
+}
+
+/* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks
+ *
+ * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged
+ * at any one time.  If a sender, after sending an ASCONF chunk, decides
+ * it needs to transfer another ASCONF Chunk, it MUST wait until the
+ * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a
+ * subsequent ASCONF. Note this restriction binds each side, so at any
+ * time two ASCONF may be in-transit on any given association (one sent
+ * from each endpoint).
+ */
+static int sctp_send_asconf(struct sctp_association *asoc,
+			    struct sctp_chunk *chunk)
+{
+	int err;
+
+	/* With an ASCONF still outstanding, park this one on the
+	 * association's queue for later transmission.
+	 */
+	if (asoc->addip_last_asconf) {
+		list_add_tail(&chunk->list, &asoc->addip_chunk_list);
+		return 0;
+	}
+
+	/* Hold the chunk until an ASCONF_ACK is received. */
+	sctp_chunk_hold(chunk);
+	err = sctp_primitive_ASCONF(asoc, chunk);
+	if (err) {
+		sctp_chunk_free(chunk);
+		return err;
+	}
+
+	asoc->addip_last_asconf = chunk;
+	return 0;
+}
+
+/* Add a list of addresses as bind addresses to local endpoint or
+ * association.
+ *
+ * Walk the packed addrs/addrcnt array, work out from each entry's
+ * family whether it is IPv4 or IPv6, and hand it to sctp_do_bind().
+ *
+ * If any of them fails, then the operation will be reversed and the
+ * ones that were added will be removed; the failing error code is
+ * returned.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt)
+{
+	struct sockaddr *cur;
+	struct sctp_af *af;
+	void *pos;
+	int done;
+	int err = 0;
+
+	SCTP_DEBUG_PRINTK("sctp_bindx_add (sk: %p, addrs: %p, addrcnt: %d)\n",
+			  sk, addrs, addrcnt);
+
+	pos = addrs;
+	for (done = 0; done < addrcnt; done++) {
+		/* The packed list may mix IPv4 and IPv6 entries; the
+		 * family of each entry tells how far to step to the next.
+		 */
+		cur = (struct sockaddr *)pos;
+		af = sctp_get_af_specific(cur->sa_family);
+		if (af) {
+			err = sctp_do_bind(sk, (union sctp_addr *)cur,
+					   af->sockaddr_len);
+			pos += af->sockaddr_len;
+		} else {
+			/* Unknown family: fail without attempting a bind. */
+			err = -EINVAL;
+		}
+
+		if (err >= 0)
+			continue;
+
+		/* Failed. Cleanup the ones that have been added */
+		if (done > 0)
+			sctp_bindx_rem(sk, addrs, done);
+		return err;
+	}
+
+	/* No entry failed (or addrcnt was not positive). */
+	return err;
+}
+
+/* Send an ASCONF chunk with Add IP address parameters to all the peers of the
+ * associations that are part of the endpoint indicating that a list of local
+ * addresses are added to the endpoint.
+ *
+ * If any of the addresses is already in the bind address list of the
+ * association, we do not send the chunk for that association.  But it will not
+ * affect other associations.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_send_asconf_add_ip(struct sock		*sk,
+				   struct sockaddr	*addrs,
+				   int 			addrcnt)
+{
+	struct sctp_sock		*sp;
+	struct sctp_endpoint		*ep;
+	struct sctp_association		*asoc;
+	struct sctp_bind_addr		*bp;
+	struct sctp_chunk		*chunk;
+	struct sctp_sockaddr_entry	*laddr;
+	union sctp_addr			*addr;
+	union sctp_addr			saveaddr;
+	void				*addr_buf;
+	struct sctp_af			*af;
+	struct list_head		*p;
+	int 				i;
+	int 				retval = 0;
+
+	if (!sctp_addip_enable)
+		return retval;
+
+	sp = sctp_sk(sk);
+	ep = sp->ep;
+
+	SCTP_DEBUG_PRINTK("%s: (sk: %p, addrs: %p, addrcnt: %d)\n",
+			  __func__, sk, addrs, addrcnt);
+
+	list_for_each_entry(asoc, &ep->asocs, asocs) {
+
+		if (!asoc->peer.asconf_capable)
+			continue;
+
+		if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP)
+			continue;
+
+		if (!sctp_state(asoc, ESTABLISHED))
+			continue;
+
+		/* Check if any address in the packed array of addresses is
+		 * in the bind address list of the association. If so,
+		 * do not send the asconf chunk to its peer, but continue with
+		 * other associations.
+		 */
+		addr_buf = addrs;
+		for (i = 0; i < addrcnt; i++) {
+			addr = (union sctp_addr *)addr_buf;
+			af = sctp_get_af_specific(addr->v4.sin_family);
+			if (!af) {
+				retval = -EINVAL;
+				goto out;
+			}
+
+			if (sctp_assoc_lookup_laddr(asoc, addr))
+				break;
+
+			addr_buf += af->sockaddr_len;
+		}
+		if (i < addrcnt)
+			continue;
+
+		/* Use the first entry in the bind addr list of the
+		 * association as Address Parameter of ASCONF CHUNK.
+		 */
+		bp = &asoc->base.bind_addr;
+		p = bp->address_list.next;
+		laddr = list_entry(p, struct sctp_sockaddr_entry, list);
+		chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs,
+						   addrcnt, SCTP_PARAM_ADD_IP);
+		if (!chunk) {
+			retval = -ENOMEM;
+			goto out;
+		}
+
+		retval = sctp_send_asconf(asoc, chunk);
+		if (retval)
+			goto out;
+
+		/* Add the new addresses to the bind address list as
+		 * SCTP_ADDR_NEW; only the last add's result stays in retval.
+		 */
+		addr_buf = addrs;
+		for (i = 0; i < addrcnt; i++) {
+			addr = (union sctp_addr *)addr_buf;
+			af = sctp_get_af_specific(addr->v4.sin_family);
+			memcpy(&saveaddr, addr, af->sockaddr_len);
+			retval = sctp_add_bind_addr(bp, &saveaddr,
+						    SCTP_ADDR_NEW, GFP_ATOMIC);
+			addr_buf += af->sockaddr_len;
+		}
+	}
+
+out:
+	return retval;
+}
+
+/* Remove a list of addresses from bind addresses list.  Do not remove the
+ * last address.
+ *
+ * Basically run through each address specified in the addrs/addrcnt
+ * array/length pair, determine if it is IPv6 or IPv4 and call
+ * sctp_del_bind_addr() on it.
+ *
+ * If any of them fails, then the operation will be reversed and the
+ * ones that were removed will be added back.
+ *
+ * At least one address has to be left; if only one address is
+ * available, the operation will return -EBUSY.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_endpoint *ep = sp->ep;
+	int cnt;
+	struct sctp_bind_addr *bp = &ep->base.bind_addr;
+	int retval = 0;
+	void *addr_buf;
+	union sctp_addr *sa_addr;
+	struct sctp_af *af;
+
+	SCTP_DEBUG_PRINTK("sctp_bindx_rem (sk: %p, addrs: %p, addrcnt: %d)\n",
+			  sk, addrs, addrcnt);
+
+	addr_buf = addrs;
+	for (cnt = 0; cnt < addrcnt; cnt++) {
+		/* If the bind address list is empty or if there is only one
+		 * bind address, there is nothing more to be removed (we need
+		 * at least one address here).
+		 */
+		if (list_empty(&bp->address_list) ||
+		    (sctp_list_single_entry(&bp->address_list))) {
+			retval = -EBUSY;
+			goto err_bindx_rem;
+		}
+
+		sa_addr = (union sctp_addr *)addr_buf;
+		af = sctp_get_af_specific(sa_addr->sa.sa_family);
+		if (!af) {
+			retval = -EINVAL;
+			goto err_bindx_rem;
+		}
+
+		if (!af->addr_valid(sa_addr, sp, NULL)) {
+			retval = -EADDRNOTAVAIL;
+			goto err_bindx_rem;
+		}
+
+		if (sa_addr->v4.sin_port &&
+		    sa_addr->v4.sin_port != htons(bp->port)) {
+			retval = -EINVAL;
+			goto err_bindx_rem;
+		}
+
+		if (!sa_addr->v4.sin_port)
+			sa_addr->v4.sin_port = htons(bp->port);
+
+		/* FIXME - There is probably a need to check if sk->sk_saddr and
+		 * sk->sk_rcv_addr are currently set to one of the addresses to
+		 * be removed. This is something which needs to be looked into
+		 * when we are fixing the outstanding issues with multi-homing
+		 * socket routing and failover schemes. Refer to comments in
+		 * sctp_do_bind(). -daisy
+		 */
+		retval = sctp_del_bind_addr(bp, sa_addr);
+
+		addr_buf += af->sockaddr_len;
+err_bindx_rem:
+		if (retval < 0) {
+			/* Failed. Add back the ones that have been removed. */
+			if (cnt > 0)
+				sctp_bindx_add(sk, addrs, cnt);
+			return retval;
+		}
+	}
+
+	return retval;
+}
+
+/* Send an ASCONF chunk with Delete IP address parameters to all the peers of
+ * the associations that are part of the endpoint indicating that a list of
+ * local addresses are removed from the endpoint.
+ *
+ * If any of the addresses is not in the bind address list of an
+ * association, we do not send the chunk for that association.  But it will
+ * not affect other associations.
+ *
+ * Only sctp_setsockopt_bindx() is supposed to call this function.
+ */
+static int sctp_send_asconf_del_ip(struct sock		*sk,
+				   struct sockaddr	*addrs,
+				   int			addrcnt)
+{
+	struct sctp_sock	*sp;
+	struct sctp_endpoint	*ep;
+	struct sctp_association	*asoc;
+	struct sctp_transport	*transport;
+	struct sctp_bind_addr	*bp;
+	struct sctp_chunk	*chunk;
+	union sctp_addr		*laddr;
+	void			*addr_buf;
+	struct sctp_af		*af;
+	struct sctp_sockaddr_entry *saddr;
+	int 			i;
+	int 			retval = 0;
+
+	if (!sctp_addip_enable)
+		return retval;
+
+	sp = sctp_sk(sk);
+	ep = sp->ep;
+
+	SCTP_DEBUG_PRINTK("%s: (sk: %p, addrs: %p, addrcnt: %d)\n",
+			  __func__, sk, addrs, addrcnt);
+
+	list_for_each_entry(asoc, &ep->asocs, asocs) {
+		/* Need ASCONF-capable peer, DEL_IP allowed and ESTABLISHED. */
+		if (!asoc->peer.asconf_capable)
+			continue;
+
+		if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP)
+			continue;
+
+		if (!sctp_state(asoc, ESTABLISHED))
+			continue;
+
+		/* Check if any address in the packed array of addresses is
+		 * not present in the bind address list of the association.
+		 * If so, do not send the asconf chunk to its peer, but
+		 * continue with other associations.
+		 */
+		addr_buf = addrs;
+		for (i = 0; i < addrcnt; i++) {
+			laddr = (union sctp_addr *)addr_buf;
+			af = sctp_get_af_specific(laddr->v4.sin_family);
+			if (!af) {
+				retval = -EINVAL;
+				goto out;
+			}
+
+			if (!sctp_assoc_lookup_laddr(asoc, laddr))
+				break;
+
+			addr_buf += af->sockaddr_len;
+		}
+		if (i < addrcnt)
+			continue;
+
+		/* Find one address in the association's bind address list
+		 * that is not in the packed array of addresses. This is to
+		 * make sure that we do not delete all the addresses in the
+		 * association.
+		 */
+		bp = &asoc->base.bind_addr;
+		laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
+					       addrcnt, sp);
+		if (!laddr)
+			continue;
+
+		/* We do not need RCU protection throughout this loop
+		 * because this is done under a socket lock from the
+		 * setsockopt call.
+		 */
+		chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt,
+						   SCTP_PARAM_DEL_IP);
+		if (!chunk) {
+			retval = -ENOMEM;
+			goto out;
+		}
+
+		/* Mark the entries of the bind address list that are to be
+		 * deleted as SCTP_ADDR_DEL.
+		 */
+		addr_buf = addrs;
+		for (i = 0; i < addrcnt; i++) {
+			laddr = (union sctp_addr *)addr_buf;
+			af = sctp_get_af_specific(laddr->v4.sin_family);
+			list_for_each_entry(saddr, &bp->address_list, list) {
+				if (sctp_cmp_addr_exact(&saddr->a, laddr))
+					saddr->state = SCTP_ADDR_DEL;
+			}
+			addr_buf += af->sockaddr_len;
+		}
+
+		/* Update the route and saddr entries for all the transports
+		 * as some of the addresses in the bind address list are
+		 * about to be deleted and cannot be used as source addresses.
+		 */
+		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
+					transports) {
+			dst_release(transport->dst);
+			sctp_transport_route(transport, NULL,
+					     sctp_sk(asoc->base.sk));
+		}
+
+		retval = sctp_send_asconf(asoc, chunk);
+	}
+out:
+	return retval;
+}
+
+/* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
+ *
+ * API 8.1
+ * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt,
+ *                int flags);
+ *
+ * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
+ * If the sd is an IPv6 socket, the addresses passed can either be IPv4
+ * or IPv6 addresses.
+ *
+ * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
+ * Section 3.1.2 for this usage.
+ *
+ * addrs is a pointer to an array of one or more socket addresses. Each
+ * address is contained in its appropriate structure (i.e. struct
+ * sockaddr_in or struct sockaddr_in6) the family of the address type
+ * must be used to distinguish the address length (note that this
+ * representation is termed a "packed array" of addresses). The caller
+ * specifies the number of addresses in the array with addrcnt.
+ *
+ * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns
+ * -1, and sets errno to the appropriate error code.
+ *
+ * For SCTP, the port given in each socket address must be the same, or
+ * sctp_bindx() will fail, setting errno to EINVAL.
+ *
+ * The flags parameter is formed from the bitwise OR of zero or more of
+ * the following currently defined flags:
+ *
+ * SCTP_BINDX_ADD_ADDR
+ *
+ * SCTP_BINDX_REM_ADDR
+ *
+ * SCTP_BINDX_ADD_ADDR directs SCTP to add the given addresses to the
+ * association, and SCTP_BINDX_REM_ADDR directs SCTP to remove the given
+ * addresses from the association. The two flags are mutually exclusive;
+ * if both are given, sctp_bindx() will fail with EINVAL. A caller may
+ * not remove all addresses from an association; sctp_bindx() will
+ * reject such an attempt with EINVAL.
+ *
+ * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate
+ * additional addresses with an endpoint after calling bind().  Or use
+ * sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a listening
+ * socket is associated with so that no new association accepted will be
+ * associated with those addresses. If the endpoint supports dynamic
+ * address reconfiguration, SCTP_BINDX_REM_ADDR or SCTP_BINDX_ADD_ADDR may
+ * cause the endpoint to send the appropriate message to the peer to
+ * change the peer's address lists.
+ *
+ * Adding and removing addresses from a connected association is
+ * optional functionality. Implementations that do not support this
+ * functionality should return EOPNOTSUPP.
+ *
+ * Basically do nothing but copying the addresses from user to kernel
+ * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
+ * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
+ * from userspace.
+ *
+ * We don't use copy_from_user() for optimization: we first do the
+ * sanity checks (buffer size -fast- and access check-healthy
+ * pointer); if all of those succeed, then we can alloc the memory
+ * (expensive operation) needed to copy the data to kernel. Then we do
+ * the copying without checking the user space area
+ * (__copy_from_user()).
+ *
+ * On exit there is no need to do sockfd_put(), sys_setsockopt() does
+ * it.
+ *
+ * sk         The sk of the socket
+ * addrs      The pointer to the addresses in user land
+ * addrs_size Size of the addrs buffer
+ * op         Operation to perform (add or remove, see the flags of
+ *            sctp_bindx)
+ *
+ * Returns 0 if ok, <0 errno code on error.
+ */
+SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk,
+				      struct sockaddr __user *addrs,
+				      int addrs_size, int op)
+{
+	struct sockaddr *kaddrs;
+	struct sockaddr *entry;
+	struct sctp_af *af;
+	void *cursor;
+	int consumed = 0;
+	int addrcnt = 0;
+	int err;
+
+	SCTP_DEBUG_PRINTK("sctp_setsocktopt_bindx: sk %p addrs %p"
+			  " addrs_size %d opt %d\n", sk, addrs, addrs_size, op);
+
+	if (unlikely(addrs_size <= 0))
+		return -EINVAL;
+
+	/* Cheap checks first: refuse a user range that is not readable
+	 * before paying for the allocation.
+	 */
+	if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
+		return -EFAULT;
+
+	/* Kernel copy of the packed address array; freed at "out". */
+	kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+	if (unlikely(!kaddrs))
+		return -ENOMEM;
+
+	if (__copy_from_user(kaddrs, addrs, addrs_size)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	/* Count the addresses.  Any early exit from this walk means a
+	 * malformed array, so preload the error code.
+	 */
+	err = -EINVAL;
+	cursor = kaddrs;
+	while (consumed < addrs_size) {
+		/* The family field itself must fit in what is left. */
+		if (consumed + sizeof(sa_family_t) > addrs_size)
+			goto out;
+
+		entry = cursor;
+		af = sctp_get_af_specific(entry->sa_family);
+
+		/* Unsupported family, or the full sockaddr for that
+		 * family would overflow the buffer.
+		 */
+		if (!af || (consumed + af->sockaddr_len) > addrs_size)
+			goto out;
+
+		addrcnt++;
+		cursor += af->sockaddr_len;
+		consumed += af->sockaddr_len;
+	}
+
+	/* Update the bind list, then tell the peers via ASCONF. */
+	switch (op) {
+	case SCTP_BINDX_ADD_ADDR:
+		err = sctp_bindx_add(sk, kaddrs, addrcnt);
+		if (!err)
+			err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt);
+		break;
+
+	case SCTP_BINDX_REM_ADDR:
+		err = sctp_bindx_rem(sk, kaddrs, addrcnt);
+		if (!err)
+			err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	kfree(kaddrs);
+	return err;
+}
+
+/* Common routine for handling connect() and sctp_connectx().
+ * Connect will come in with just a single address.
+ *
+ * kaddrs is a packed array of addrs_size bytes.  Every address in it is
+ * added as a peer transport of one new association; all of them must carry
+ * the same port, otherwise -EINVAL is returned.  If assoc_id is non-NULL
+ * the id of the new association is stored through it.  Returns the
+ * result of sctp_wait_for_connect(), or <0 errno if setup failed.
+ */
+static int __sctp_connect(struct sock* sk,
+			  struct sockaddr *kaddrs,
+			  int addrs_size,
+			  sctp_assoc_t *assoc_id)
+{
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+	struct sctp_association *asoc = NULL;
+	struct sctp_association *asoc2;
+	struct sctp_transport *transport;
+	union sctp_addr to;
+	struct sctp_af *af;
+	sctp_scope_t scope;
+	long timeo;
+	int err = 0;
+	int walk_size = 0;
+	union sctp_addr *sa_addr = NULL;
+	void *addr_buf;
+	unsigned short port;
+	unsigned int f_flags = 0;
+
+	/* connect() cannot be done on a socket that is already in ESTABLISHED
+	 * state - UDP-style peeled off socket or a TCP-style socket that
+	 * is already connected.
+	 * It cannot be done even on a TCP-style listening socket.
+	 */
+	if (sctp_sstate(sk, ESTABLISHED) ||
+	    (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
+		err = -EISCONN;
+		goto out_free;
+	}
+
+	/* Walk through the addrs buffer, adding each address as a peer. */
+	addr_buf = kaddrs;
+	while (walk_size < addrs_size) {
+		if (walk_size + sizeof(sa_family_t) > addrs_size) {
+			err = -EINVAL;
+			goto out_free;
+		}
+
+		sa_addr = (union sctp_addr *)addr_buf;
+		af = sctp_get_af_specific(sa_addr->sa.sa_family);
+
+		/* If the address family is not supported or if this address
+		 * causes the address buffer to overflow return EINVAL.
+		 */
+		if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
+			err = -EINVAL;
+			goto out_free;
+		}
+
+		port = ntohs(sa_addr->v4.sin_port);
+
+		/* Save current address so we can work with it */
+		memcpy(&to, sa_addr, af->sockaddr_len);
+
+		err = sctp_verify_addr(sk, &to, af->sockaddr_len);
+		if (err)
+			goto out_free;
+
+		/* Make sure the destination port is correctly set
+		 * in all addresses; a mismatch must fail with -EINVAL.
+		 */
+		if (asoc && asoc->peer.port && asoc->peer.port != port) {
+			err = -EINVAL;
+			goto out_free;
+		}
+
+		/* Check if there already is a matching association on the
+		 * endpoint (other than the one created here).
+		 */
+		asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+		if (asoc2 && asoc2 != asoc) {
+			if (asoc2->state >= SCTP_STATE_ESTABLISHED)
+				err = -EISCONN;
+			else
+				err = -EALREADY;
+			goto out_free;
+		}
+
+		/* If we could not find a matching association on the endpoint,
+		 * make sure that there is no peeled-off association matching
+		 * the peer address even on another socket.
+		 */
+		if (sctp_endpoint_is_peeled_off(ep, &to)) {
+			err = -EADDRNOTAVAIL;
+			goto out_free;
+		}
+
+		if (!asoc) {
+			/* If a bind() or sctp_bindx() is not called prior to
+			 * an sctp_connectx() call, the system picks an
+			 * ephemeral port and will choose an address set
+			 * equivalent to binding with a wildcard address.
+			 */
+			if (!ep->base.bind_addr.port) {
+				if (sctp_autobind(sk)) {
+					err = -EAGAIN;
+					goto out_free;
+				}
+			} else {
+				/*
+				 * If an unprivileged user inherits a 1-many
+				 * style socket with open associations on a
+				 * privileged port, it MAY be permitted to
+				 * accept new associations, but it SHOULD NOT
+				 * be permitted to open new associations.
+				 */
+				if (ep->base.bind_addr.port < PROT_SOCK &&
+				    !capable(CAP_NET_BIND_SERVICE)) {
+					err = -EACCES;
+					goto out_free;
+				}
+			}
+
+			scope = sctp_scope(&to);
+			asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+			if (!asoc) {
+				err = -ENOMEM;
+				goto out_free;
+			}
+
+			err = sctp_assoc_set_bind_addr_from_ep(asoc, scope,
+							      GFP_KERNEL);
+			if (err < 0)
+				goto out_free;
+		}
+
+		/* Prime the peer's transport structures.  */
+		transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
+						SCTP_UNKNOWN);
+		if (!transport) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+
+		addr_buf += af->sockaddr_len;
+		walk_size += af->sockaddr_len;
+	}
+
+	/* In case the user of sctp_connectx() wants an association
+	 * id back, assign one now.
+	 */
+	if (assoc_id) {
+		err = sctp_assoc_set_id(asoc, GFP_KERNEL);
+		if (err < 0)
+			goto out_free;
+	}
+
+	err = sctp_primitive_ASSOCIATE(asoc, NULL);
+	if (err < 0)
+		goto out_free;
+
+	/* Initialize sk's dport and daddr for getpeername() */
+	inet_sk(sk)->inet_dport = htons(asoc->peer.port);
+	af = sctp_get_af_specific(sa_addr->sa.sa_family);
+	af->to_sk_daddr(sa_addr, sk);
+	sk->sk_err = 0;
+
+	/* in-kernel sockets don't generally have a file allocated to them
+	 * if all they do is call sock_create_kern().
+	 */
+	if (sk->sk_socket->file)
+		f_flags = sk->sk_socket->file->f_flags;
+
+	timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
+
+	/* Hand the id back before waiting: sctp_wait_for_connect() can
+	 * sleep and the association may have been freed by the time it
+	 * returns, so asoc must not be dereferenced afterwards.
+	 */
+	if (assoc_id)
+		*assoc_id = asoc->assoc_id;
+
+	err = sctp_wait_for_connect(asoc, &timeo);
+
+	/* Don't free association on exit. */
+	asoc = NULL;
+
+out_free:
+	SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p"
+			  " kaddrs: %p err: %d\n",
+			  asoc, kaddrs, err);
+	if (asoc)
+		sctp_association_free(asoc);
+	return err;
+}
+
+/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
+ *
+ * API 8.9
+ * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt,
+ * 			sctp_assoc_t *asoc);
+ *
+ * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
+ * If the sd is an IPv6 socket, the addresses passed can either be IPv4
+ * or IPv6 addresses.
+ *
+ * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
+ * Section 3.1.2 for this usage.
+ *
+ * addrs is a pointer to an array of one or more socket addresses. Each
+ * address is contained in its appropriate structure (i.e. struct
+ * sockaddr_in or struct sockaddr_in6) the family of the address type
+ * must be used to distinguish the address length (note that this
+ * representation is termed a "packed array" of addresses). The caller
+ * specifies the number of addresses in the array with addrcnt.
+ *
+ * On success, sctp_connectx() returns 0. It also sets the assoc_id to
+ * the association id of the new association.  On failure, sctp_connectx()
+ * returns -1, and sets errno to the appropriate error code.  The assoc_id
+ * is not touched by the kernel.
+ *
+ * For SCTP, the port given in each socket address must be the same, or
+ * sctp_connectx() will fail, setting errno to EINVAL.
+ *
+ * An application can use sctp_connectx to initiate an association with
+ * an endpoint that is multi-homed.  Much like sctp_bindx() this call
+ * allows a caller to specify multiple addresses at which a peer can be
+ * reached.  The way the SCTP stack uses the list of addresses to set up
+ * the association is implementation dependent.  This function only
+ * specifies that the stack will try to make use of all the addresses in
+ * the list when needed.
+ *
+ * Note that the list of addresses passed in is only used for setting up
+ * the association.  It does not necessarily equal the set of addresses
+ * the peer uses for the resulting association.  If the caller wants to
+ * find out the set of peer addresses, it must use sctp_getpaddrs() to
+ * retrieve them after the association has been set up.
+ *
+ * Basically do nothing but copying the addresses from user to kernel
+ * land and invoking __sctp_connect(). This is used for tunneling
+ * the sctp_connectx() request through sctp_setsockopt() from userspace.
+ *
+ * We don't use copy_from_user() for optimization: we first do the
+ * sanity checks (buffer size -fast- and access check-healthy
+ * pointer); if all of those succeed, then we can alloc the memory
+ * (expensive operation) needed to copy the data to kernel. Then we do
+ * the copying without checking the user space area
+ * (__copy_from_user()).
+ *
+ * On exit there is no need to do sockfd_put(), sys_setsockopt() does
+ * it.
+ *
+ * sk         The sk of the socket
+ * addrs      The pointer to the addresses in user land
+ * addrs_size Size of the addrs buffer
+ * assoc_id   Where __sctp_connect() stores the association id; may be NULL
+ * Returns >=0 if ok, <0 errno code on error.
+ */
+SCTP_STATIC int __sctp_setsockopt_connectx(struct sock* sk,
+				      struct sockaddr __user *addrs,
+				      int addrs_size,
+				      sctp_assoc_t *assoc_id)
+{
+	struct sockaddr *kbuf;
+	int ret;
+
+	SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n",
+			  __func__, sk, addrs, addrs_size);
+
+	if (unlikely(addrs_size <= 0))
+		return -EINVAL;
+
+	/* Reject an unreadable user range before allocating anything. */
+	if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
+		return -EFAULT;
+
+	kbuf = kmalloc(addrs_size, GFP_KERNEL);
+	if (unlikely(!kbuf))
+		return -ENOMEM;
+
+	if (__copy_from_user(kbuf, addrs, addrs_size)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	ret = __sctp_connect(sk, kbuf, addrs_size, assoc_id);
+
+out_free:
+	kfree(kbuf);
+	return ret;
+}
+
+/*
+ * Legacy connectx entry point, kept for backward compatibility with the
+ * socket option that does not report an association id: passes NULL.
+ */
+SCTP_STATIC int sctp_setsockopt_connectx_old(struct sock* sk,
+				      struct sockaddr __user *addrs,
+				      int addrs_size)
+{
+	return __sctp_setsockopt_connectx(sk, addrs, addrs_size, NULL);
+}
+
+/*
+ * New interface for the API.  Since the API is done with a socket
+ * option, to keep it simple the association id is fed back as the
+ * return value of the call.  An error is always negative and an
+ * association id is always positive.
+ *
+ * Any non-zero result of the connect (-EINPROGRESS included) is
+ * returned as is, so the id is only reported for a result of 0.
+ */
+SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk,
+				      struct sockaddr __user *addrs,
+				      int addrs_size)
+{
+	sctp_assoc_t new_id = 0;
+	int rc;
+
+	rc = __sctp_setsockopt_connectx(sk, addrs, addrs_size, &new_id);
+
+	return rc ? rc : new_id;
+}
+
+/*
+ * New (hopefully final) interface for the API.
+ * We use the sctp_getaddrs_old structure so that the user-space library
+ * can avoid any unnecessary allocations.  The only different part is
+ * that we store the actual length of the address buffer into the
+ * addr_num structure member.  That way we can re-use the existing code.
+ */
+SCTP_STATIC int sctp_getsockopt_connectx3(struct sock* sk, int len,
+					char __user *optval,
+					int __user *optlen)
+{
+	struct sctp_getaddrs_old req;
+	sctp_assoc_t new_id = 0;
+	int rc;
+
+	if (len < sizeof(req))
+		return -EINVAL;
+
+	if (copy_from_user(&req, optval, sizeof(req)))
+		return -EFAULT;
+
+	rc = __sctp_setsockopt_connectx(sk,
+			(struct sockaddr __user *)req.addrs,
+			req.addr_num, &new_id);
+
+	/* Nothing to hand back unless the connect succeeded or is pending. */
+	if (rc != 0 && rc != -EINPROGRESS)
+		return rc;
+
+	if (copy_to_user(optval, &new_id, sizeof(new_id)) ||
+	    put_user(sizeof(new_id), optlen))
+		return -EFAULT;
+
+	return rc;
+}
+
+/* API 3.1.4 close() - UDP Style Syntax
+ * Applications use close() to perform graceful shutdown (as described in
+ * Section 10.1 of [SCTP]) on ALL the associations currently represented
+ * by a UDP-style socket.
+ *
+ * The syntax is
+ *
+ *   ret = close(int sd);
+ *
+ *   sd      - the socket descriptor of the associations to be closed.
+ *
+ * To gracefully shutdown a specific association represented by the
+ * UDP-style socket, an application should use the sendmsg() call,
+ * passing no user data, but including the appropriate flag in the
+ * ancillary data (see Section xxxx).
+ *
+ * If sd in the close() call is a branched-off socket representing only
+ * one association, the shutdown is performed on that association only.
+ *
+ * 4.1.6 close() - TCP Style Syntax
+ *
+ * Applications use close() to gracefully close down an association.
+ *
+ * The syntax is:
+ *
+ *    int close(int sd);
+ *
+ *      sd      - the socket descriptor of the association to be closed.
+ *
+ * After an application calls close() on a socket descriptor, no further
+ * socket operations will succeed on that descriptor.
+ *
+ * API 7.1.4 SO_LINGER
+ *
+ * An application using the TCP-style socket can use this option to
+ * perform the SCTP ABORT primitive.  The linger option structure is:
+ *
+ *  struct  linger {
+ *     int     l_onoff;                // option on/off
+ *     int     l_linger;               // linger time
+ * };
+ *
+ * To enable the option, set l_onoff to 1.  If the l_linger value is set
+ * to 0, calling close() is the same as the ABORT primitive.  If the
+ * value is set to a negative value, the setsockopt() call will return
+ * an error.  If the value is set to a positive value linger_time, the
+ * close() can be blocked for at most linger_time ms.  If the graceful
+ * shutdown phase does not finish during this period, close() will
+ * return but the graceful shutdown phase continues in the system.
+ */
+SCTP_STATIC void sctp_close(struct sock *sk, long timeout)
+{
+	struct sctp_endpoint *ep;
+	struct sctp_association *asoc;
+	struct list_head *pos, *temp;
+	unsigned int data_was_unread;
+
+	SCTP_DEBUG_PRINTK("sctp_close(sk: 0x%p, timeout:%ld)\n", sk, timeout);
+
+	sctp_lock_sock(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	sk->sk_state = SCTP_SS_CLOSING;
+
+	ep = sctp_sk(sk)->ep;
+
+	/* Clean up any skbs sitting on the receive queue and pd_lobby.  */
+	data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue);
+	data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby);
+
+	/* Walk all associations on an endpoint.  */
+	list_for_each_safe(pos, temp, &ep->asocs) {
+		asoc = list_entry(pos, struct sctp_association, asocs);
+
+		if (sctp_style(sk, TCP)) {
+			/* A closed association can still be in the list if
+			 * it belongs to a TCP-style listening socket that is
+			 * not yet accepted. If so, free it. If not, send an
+			 * ABORT or SHUTDOWN based on the linger options.
+			 */
+			if (sctp_state(asoc, CLOSED)) {
+				sctp_unhash_established(asoc);
+				sctp_association_free(asoc);
+				continue;
+			}
+		}
+		/* ABORT if queued data is discarded or linger time is zero. */
+		if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
+		    !skb_queue_empty(&asoc->ulpq.reasm) ||
+		    (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
+			struct sctp_chunk *chunk;
+
+			chunk = sctp_make_abort_user(asoc, NULL, 0);
+			if (chunk)
+				sctp_primitive_ABORT(asoc, chunk);
+		} else
+			sctp_primitive_SHUTDOWN(asoc, NULL);
+	}
+
+	/* On a TCP-style socket, block for at most linger_time if set. */
+	if (sctp_style(sk, TCP) && timeout)
+		sctp_wait_for_close(sk, timeout);
+
+	/* This will run the backlog queue.  */
+	sctp_release_sock(sk);
+
+	/* Supposedly, no process has access to the socket, but
+	 * the net layers still may.
+	 */
+	sctp_local_bh_disable();
+	sctp_bh_lock_sock(sk);
+
+	/* Hold the sock, since sk_common_release() will call sock_put()
+	 * and we have just a little more cleanup.
+	 */
+	sock_hold(sk);
+	sk_common_release(sk);
+
+	sctp_bh_unlock_sock(sk);
+	sctp_local_bh_enable();
+
+	sock_put(sk);
+
+	SCTP_DBG_OBJCNT_DEC(sock);
+}
+
+/* Handle EPIPE: report a pending socket error instead, else signal. */
+static int sctp_error(struct sock *sk, int flags, int err)
+{
+	int pending = (err == -EPIPE) ? sock_error(sk) : 0;
+	int result = pending ? pending : err;
+	if (result == -EPIPE && !(flags & MSG_NOSIGNAL))
+		send_sig(SIGPIPE, current, 0);
+	return result;
+}
+
+/* API 3.1.3 sendmsg() - UDP Style Syntax
+ *
+ * An application uses sendmsg() and recvmsg() calls to transmit data to
+ * and receive data from its peer.
+ *
+ *  ssize_t sendmsg(int socket, const struct msghdr *message,
+ *                  int flags);
+ *
+ *  socket  - the socket descriptor of the endpoint.
+ *  message - pointer to the msghdr structure which contains a single
+ *            user message and possibly some ancillary data.
+ *
+ *            See Section 5 for complete description of the data
+ *            structures.
+ *
+ *  flags   - flags sent or received with the user message, see Section
+ *            5 for complete description of the flags.
+ *
+ * Note:  This function could use a rewrite especially when explicit
+ * connect support comes in.
+ */
+/* BUG:  We do not implement the equivalent of sk_stream_wait_memory(). */
+
+SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *);
+
+SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
+			     struct msghdr *msg, size_t msg_len)
+{
+	struct sctp_sock *sp;
+	struct sctp_endpoint *ep;
+	struct sctp_association *new_asoc=NULL, *asoc=NULL;
+	struct sctp_transport *transport, *chunk_tp;
+	struct sctp_chunk *chunk;
+	union sctp_addr to;
+	struct sockaddr *msg_name = NULL;
+	struct sctp_sndrcvinfo default_sinfo;
+	struct sctp_sndrcvinfo *sinfo;
+	struct sctp_initmsg *sinit;
+	sctp_assoc_t associd = 0;
+	sctp_cmsgs_t cmsgs = { NULL };
+	int err;
+	sctp_scope_t scope;
+	long timeo;
+	__u16 sinfo_flags = 0;
+	struct sctp_datamsg *datamsg;
+	int msg_flags = msg->msg_flags;
+
+	SCTP_DEBUG_PRINTK("sctp_sendmsg(sk: %p, msg: %p, msg_len: %zu)\n",
+			  sk, msg, msg_len);
+
+	err = 0;
+	sp = sctp_sk(sk);
+	ep = sp->ep;
+
+	SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep);
+
+	/* We cannot send a message over a TCP-style listening socket. */
+	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
+		err = -EPIPE;
+		goto out_nounlock;
+	}
+
+	/* Parse out the SCTP CMSGs.  */
+	err = sctp_msghdr_parse(msg, &cmsgs);
+
+	if (err) {
+		SCTP_DEBUG_PRINTK("msghdr parse err = %x\n", err);
+		goto out_nounlock;
+	}
+
+	/* Fetch the destination address for this packet.  This
+	 * address only selects the association--it is not necessarily
+	 * the address we will send to.
+	 * For a peeled-off socket, msg_name is ignored.
+	 */
+	if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) {
+		int msg_namelen = msg->msg_namelen;
+
+		err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name,
+				       msg_namelen);
+		if (err)
+			return err;
+
+		if (msg_namelen > sizeof(to))
+			msg_namelen = sizeof(to);
+		memcpy(&to, msg->msg_name, msg_namelen);
+		msg_name = msg->msg_name;
+	}
+
+	sinfo = cmsgs.info;
+	sinit = cmsgs.init;
+
+	/* Did the user specify SNDRCVINFO?  */
+	if (sinfo) {
+		sinfo_flags = sinfo->sinfo_flags;
+		associd = sinfo->sinfo_assoc_id;
+	}
+
+	SCTP_DEBUG_PRINTK("msg_len: %zu, sinfo_flags: 0x%x\n",
+			  msg_len, sinfo_flags);
+
+	/* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */
+	if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) {
+		err = -EINVAL;
+		goto out_nounlock;
+	}
+
+	/* If SCTP_EOF is set, no data can be sent. Disallow sending zero
+	 * length messages when SCTP_EOF|SCTP_ABORT is not set.
+	 * If SCTP_ABORT is set, the message length could be non zero with
+	 * the msg_iov set to the user abort reason.
+	 */
+	if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) ||
+	    (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) {
+		err = -EINVAL;
+		goto out_nounlock;
+	}
+
+	/* If SCTP_ADDR_OVER is set, there must be an address
+	 * specified in msg_name.
+	 */
+	if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) {
+		err = -EINVAL;
+		goto out_nounlock;
+	}
+
+	transport = NULL;
+
+	SCTP_DEBUG_PRINTK("About to look up association.\n");
+
+	sctp_lock_sock(sk);
+
+	/* If a msg_name has been specified, assume this is to be used.  */
+	if (msg_name) {
+		/* Look for a matching association on the endpoint. */
+		asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+		if (!asoc) {
+			/* If we could not find a matching association on the
+			 * endpoint, make sure that it is not a TCP-style
+			 * socket that already has an association or there is
+			 * no peeled-off association on another socket.
+			 */
+			if ((sctp_style(sk, TCP) &&
+			     sctp_sstate(sk, ESTABLISHED)) ||
+			    sctp_endpoint_is_peeled_off(ep, &to)) {
+				err = -EADDRNOTAVAIL;
+				goto out_unlock;
+			}
+		}
+	} else {
+		asoc = sctp_id2assoc(sk, associd);
+		if (!asoc) {
+			err = -EPIPE;
+			goto out_unlock;
+		}
+	}
+
+	if (asoc) {
+		SCTP_DEBUG_PRINTK("Just looked up association: %p.\n", asoc);
+
+		/* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED
+		 * socket that has an association in CLOSED state. This can
+		 * happen when an accepted socket has an association that is
+		 * already CLOSED.
+		 */
+		if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) {
+			err = -EPIPE;
+			goto out_unlock;
+		}
+
+		if (sinfo_flags & SCTP_EOF) {
+			SCTP_DEBUG_PRINTK("Shutting down association: %p\n",
+					  asoc);
+			sctp_primitive_SHUTDOWN(asoc, NULL);
+			err = 0;
+			goto out_unlock;
+		}
+		if (sinfo_flags & SCTP_ABORT) {
+
+			chunk = sctp_make_abort_user(asoc, msg, msg_len);
+			if (!chunk) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+
+			SCTP_DEBUG_PRINTK("Aborting association: %p\n", asoc);
+			sctp_primitive_ABORT(asoc, chunk);
+			err = 0;
+			goto out_unlock;
+		}
+	}
+
+	/* Do we need to create the association?  */
+	if (!asoc) {
+		SCTP_DEBUG_PRINTK("There is no association yet.\n");
+
+		if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* Check for invalid stream against the stream counts,
+		 * either the default or the user specified stream counts.
+		 */
+		if (sinfo) {
+			if (!sinit || (sinit && !sinit->sinit_num_ostreams)) {
+				/* Check against the defaults. */
+				if (sinfo->sinfo_stream >=
+				    sp->initmsg.sinit_num_ostreams) {
+					err = -EINVAL;
+					goto out_unlock;
+				}
+			} else {
+				/* Check against the requested.  */
+				if (sinfo->sinfo_stream >=
+				    sinit->sinit_num_ostreams) {
+					err = -EINVAL;
+					goto out_unlock;
+				}
+			}
+		}
+
+		/*
+		 * API 3.1.2 bind() - UDP Style Syntax
+		 * If a bind() or sctp_bindx() is not called prior to a
+		 * sendmsg() call that initiates a new association, the
+		 * system picks an ephemeral port and will choose an address
+		 * set equivalent to binding with a wildcard address.
+		 */
+		if (!ep->base.bind_addr.port) {
+			if (sctp_autobind(sk)) {
+				err = -EAGAIN;
+				goto out_unlock;
+			}
+		} else {
+			/*
+			 * If an unprivileged user inherits a one-to-many
+			 * style socket with open associations on a privileged
+			 * port, it MAY be permitted to accept new associations,
+			 * but it SHOULD NOT be permitted to open new
+			 * associations.
+			 */
+			if (ep->base.bind_addr.port < PROT_SOCK &&
+			    !capable(CAP_NET_BIND_SERVICE)) {
+				err = -EACCES;
+				goto out_unlock;
+			}
+		}
+
+		scope = sctp_scope(&to);
+		new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+		if (!new_asoc) {
+			err = -ENOMEM;
+			goto out_unlock;
+		}
+		asoc = new_asoc;
+		err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+		if (err < 0) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+
+		/* If the SCTP_INIT ancillary data is specified, set all
+		 * the association init values accordingly.
+		 */
+		if (sinit) {
+			if (sinit->sinit_num_ostreams) {
+				asoc->c.sinit_num_ostreams =
+					sinit->sinit_num_ostreams;
+			}
+			if (sinit->sinit_max_instreams) {
+				asoc->c.sinit_max_instreams =
+					sinit->sinit_max_instreams;
+			}
+			if (sinit->sinit_max_attempts) {
+				asoc->max_init_attempts
+					= sinit->sinit_max_attempts;
+			}
+			if (sinit->sinit_max_init_timeo) {
+				asoc->max_init_timeo =
+				 msecs_to_jiffies(sinit->sinit_max_init_timeo);
+			}
+		}
+
+		/* Prime the peer's transport structures.  */
+		transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
+		if (!transport) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+	}
+
+	/* ASSERT: we have a valid association at this point.  */
+	SCTP_DEBUG_PRINTK("We have a valid association.\n");
+
+	if (!sinfo) {
+		/* If the user didn't specify SNDRCVINFO, make up one with
+		 * some defaults.
+		 */
+		memset(&default_sinfo, 0, sizeof(default_sinfo));
+		default_sinfo.sinfo_stream = asoc->default_stream;
+		default_sinfo.sinfo_flags = asoc->default_flags;
+		default_sinfo.sinfo_ppid = asoc->default_ppid;
+		default_sinfo.sinfo_context = asoc->default_context;
+		default_sinfo.sinfo_timetolive = asoc->default_timetolive;
+		default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc);
+		sinfo = &default_sinfo;
+	}
+
+	/* API 7.1.7, the sndbuf size per association bounds the
+	 * maximum size of data that can be sent in a single send call.
+	 */
+	if (msg_len > sk->sk_sndbuf) {
+		err = -EMSGSIZE;
+		goto out_free;
+	}
+
+	if (asoc->pmtu_pending)
+		sctp_assoc_pending_pmtu(asoc);
+
+	/* If fragmentation is disabled and the message length exceeds the
+	 * association fragmentation point, return EMSGSIZE.  The I-D
+	 * does not specify what this error is, but this looks like
+	 * a great fit.
+	 */
+	if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) {
+		err = -EMSGSIZE;
+		goto out_free;
+	}
+
+	/* Check for invalid stream. */
+	if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	if (!sctp_wspace(asoc)) {
+		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+		if (err)
+			goto out_free;
+	}
+
+	/* If an address is passed with the sendto/sendmsg call, it is used
+	 * to override the primary destination address in the TCP model, or
+	 * when SCTP_ADDR_OVER flag is set in the UDP model.
+	 */
+	if ((sctp_style(sk, TCP) && msg_name) ||
+	    (sinfo_flags & SCTP_ADDR_OVER)) {
+		chunk_tp = sctp_assoc_lookup_paddr(asoc, &to);
+		if (!chunk_tp) {
+			err = -EINVAL;
+			goto out_free;
+		}
+	} else
+		chunk_tp = NULL;
+
+	/* Auto-connect, if we aren't connected already. */
+	if (sctp_state(asoc, CLOSED)) {
+		err = sctp_primitive_ASSOCIATE(asoc, NULL);
+		if (err < 0)
+			goto out_free;
+		SCTP_DEBUG_PRINTK("We associated primitively.\n");
+	}
+
+	/* Break the message into multiple chunks of maximum size. */
+	datamsg = sctp_datamsg_from_user(asoc, sinfo, msg, msg_len);
+	if (!datamsg) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	/* Now send the (possibly) fragmented message. */
+	list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
+		sctp_chunk_hold(chunk);
+
+		/* Do accounting for the write space.  */
+		sctp_set_owner_w(chunk);
+
+		chunk->transport = chunk_tp;
+	}
+
+	/* Send it to the lower layers.  Note:  all chunks
+	 * must either fail or succeed.   The lower layer
+	 * works that way today.  Keep it that way or this
+	 * breaks.
+	 */
+	err = sctp_primitive_SEND(asoc, datamsg);
+	/* Did the lower layer accept the chunk? */
+	if (err)
+		sctp_datamsg_free(datamsg);
+	else
+		sctp_datamsg_put(datamsg);
+
+	SCTP_DEBUG_PRINTK("We sent primitively.\n");
+
+	if (err)
+		goto out_free;
+	else
+		err = msg_len;
+
+	/* If we are already past ASSOCIATE, the lower
+	 * layers are responsible for association cleanup.
+	 */
+	goto out_unlock;
+
+out_free:
+	if (new_asoc)
+		sctp_association_free(asoc);
+out_unlock:
+	sctp_release_sock(sk);
+
+out_nounlock:
+	return sctp_error(sk, msg_flags, err);
+
+#if 0
+do_sock_err:
+	if (msg_len)
+		err = msg_len;
+	else
+		err = sock_error(sk);
+	goto out;
+
+do_interrupted:
+	if (msg_len)
+		err = msg_len;
+	goto out;
+#endif /* 0 */
+}
+
+/* Strip 'len' bytes from the front of an skb, continuing into the skbs
+ * chained on its frag_list once the head of 'skb' itself is used up.
+ *
+ * Unlike skb_pull(), this therefore works when the data is spread over
+ * several skbs.  After each fragment is trimmed, the parent's len and
+ * data_len are reduced by the number of bytes taken from that fragment.
+ *
+ * Returns 0 when all 'len' bytes were removed, otherwise the number of
+ * bytes that could not be removed because the skb ran out of data.
+ */
+static int sctp_skb_pull(struct sk_buff *skb, int len)
+{
+	int head = skb_headlen(skb);
+	struct sk_buff *frag;
+
+	/* Everything requested is available in the head of this skb. */
+	if (len <= head) {
+		__skb_pull(skb, len);
+		return 0;
+	}
+
+	/* Use up the head completely, then take the rest from the frags. */
+	__skb_pull(skb, head);
+	len -= head;
+
+	skb_walk_frags(skb, frag) {
+		int left = sctp_skb_pull(frag, len);
+		int taken = len - left;
+
+		/* Account in the parent for what the fragment gave up. */
+		skb->len -= taken;
+		skb->data_len -= taken;
+		len = left;
+
+		if (!left)
+			break;
+	}
+
+	return len;
+}
+
+/* API 3.1.3  recvmsg() - UDP Style Syntax
+ *
+ *  ssize_t recvmsg(int socket, struct msghdr *message,
+ *                    int flags);
+ *
+ *  socket  - the socket descriptor of the endpoint.
+ *  message - pointer to the msghdr structure which contains a single
+ *            user message and possibly some ancillary data.
+ *
+ *            See Section 5 for complete description of the data
+ *            structures.
+ *
+ *  flags   - flags sent or received with the user message, see Section
+ *            5 for complete description of the flags.
+ */
+static struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
+
+/* Receive one queued item (user data or a notification) from an SCTP
+ * socket and copy it to the caller's iovec.
+ *
+ *  sk       - socket to read from; locked with sctp_lock_sock() for the
+ *             whole call and released on every exit path.
+ *  msg      - destination msghdr: msg_iov receives the payload, msg_name
+ *             is filled by the per-family helpers, and msg_flags gets
+ *             MSG_NOTIFICATION / MSG_EOR updated.
+ *  len      - size of the user buffer; at most this many bytes are copied.
+ *  noblock  - handed to sctp_skb_recv_datagram().
+ *  flags    - receive flags; MSG_PEEK is handled explicitly here.
+ *  addr_len - handed to the msgname helpers.
+ *  iocb     - not referenced by this function.
+ *
+ * Returns the number of bytes copied on success.  On failure it returns
+ * -ENOTCONN, or whatever error sctp_skb_recv_datagram() left in 'err' or
+ * skb_copy_datagram_iovec() returned.
+ */
+SCTP_STATIC int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
+			     struct msghdr *msg, size_t len, int noblock,
+			     int flags, int *addr_len)
+{
+	struct sctp_ulpevent *event = NULL;
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sk_buff *skb;
+	int copied;
+	int err = 0;
+	int skb_len;
+
+	SCTP_DEBUG_PRINTK("sctp_recvmsg(%s: %p, %s: %p, %s: %zd, %s: %d, %s: "
+			  "0x%x, %s: %p)\n", "sk", sk, "msghdr", msg,
+			  "len", len, "knoblauch", noblock,
+			  "flags", flags, "addr_len", addr_len);
+
+	sctp_lock_sock(sk);
+
+	/* A TCP-style socket can only be read once it is ESTABLISHED. */
+	if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) {
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	/* No skb: return whatever the helper stored in 'err'. */
+	skb = sctp_skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	/* Get the total length of the skb including any skb's in the
+	 * frag_list.
+	 */
+	skb_len = skb->len;
+
+	/* Copy no more than the user buffer can hold. */
+	copied = skb_len;
+	if (copied > len)
+		copied = len;
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+
+	/* Look up the event before checking 'err': the out_free path below
+	 * needs it in order to release the skb.
+	 */
+	event = sctp_skb2event(skb);
+
+	if (err)
+		goto out_free;
+
+	sock_recv_ts_and_drops(msg, sk, skb);
+	/* Notifications and data use different helpers to fill msg_name. */
+	if (sctp_ulpevent_is_notification(event)) {
+		msg->msg_flags |= MSG_NOTIFICATION;
+		sp->pf->event_msgname(event, msg->msg_name, addr_len);
+	} else {
+		sp->pf->skb_msgname(skb, msg->msg_name, addr_len);
+	}
+
+	/* Check if we allow SCTP_SNDRCVINFO. */
+	if (sp->subscribe.sctp_data_io_event)
+		sctp_ulpevent_read_sndrcvinfo(event, msg);
+#if 0
+	/* FIXME: we should be calling IP/IPv6 layers.  */
+	if (sk->sk_protinfo.af_inet.cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+#endif
+
+	/* From here on the return value is the byte count. */
+	err = copied;
+
+	/* If skb's length exceeds the user's buffer, update the skb and
+	 * push it back to the receive_queue so that the next call to
+	 * recvmsg() will return the remaining data. Don't set MSG_EOR.
+	 */
+	if (skb_len > copied) {
+		msg->msg_flags &= ~MSG_EOR;
+		/* When peeking, nothing is consumed: just drop our reference. */
+		if (flags & MSG_PEEK)
+			goto out_free;
+		sctp_skb_pull(skb, copied);
+		skb_queue_head(&sk->sk_receive_queue, skb);
+
+		/* When only partial message is copied to the user, increase
+		 * rwnd by that amount. If all the data in the skb is read,
+		 * rwnd is updated when the event is freed.
+		 */
+		if (!sctp_ulpevent_is_notification(event))
+			sctp_assoc_rwnd_increase(event->asoc, copied);
+		/* The skb is back on the queue, so it must not be freed. */
+		goto out;
+	} else if ((event->msg_flags & MSG_NOTIFICATION) ||
+		   (event->msg_flags & MSG_EOR))
+		msg->msg_flags |= MSG_EOR;
+	else
+		msg->msg_flags &= ~MSG_EOR;
+
+out_free:
+	if (flags & MSG_PEEK) {
+		/* Release the skb reference acquired after peeking the skb in
+		 * sctp_skb_recv_datagram().
+		 */
+		kfree_skb(skb);
+	} else {
+		/* Free the event which includes releasing the reference to
+		 * the owner of the skb, freeing the skb and updating the
+		 * rwnd.
+		 */
+		sctp_ulpevent_free(event);
+	}
+out:
+	sctp_release_sock(sk);
+	return err;
+}
+
+/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
+ *
+ * On/off socket option.  When it is on, SCTP does not fragment messages:
+ * a message that exceeds the current PMTU size is NOT sent and an error
+ * is indicated to the user instead.
+ *
+ * optval must point to at least sizeof(int) bytes.  Any non-zero value
+ * turns the option on; zero turns it off.
+ *
+ * Returns 0, -EINVAL (optlen too short) or -EFAULT (optval unreadable).
+ */
+static int sctp_setsockopt_disable_fragments(struct sock *sk,
+					     char __user *optval,
+					     unsigned int optlen)
+{
+	int on;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(on, (int __user *)optval))
+		return -EFAULT;
+
+	/* Store a strict 0/1 regardless of the integer passed in. */
+	sctp_sk(sk)->disable_fragments = !!on;
+	return 0;
+}
+
+/* Update the socket's event subscription from user space.
+ *
+ * Up to sizeof(struct sctp_event_subscribe) bytes are copied from optval
+ * directly into the socket's subscribe structure, so a shorter optlen
+ * only overwrites the leading part of it.
+ *
+ * If SCTP_SENDER_DRY_EVENT is enabled afterwards and the association
+ * returned by sctp_id2assoc(sk, 0) has an empty out-queue, a sender-dry
+ * notification is queued right away.
+ *
+ * Returns 0, -EINVAL (optlen too large), -EFAULT (copy failed) or
+ * -ENOMEM (the notification could not be allocated).
+ */
+static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
+				  unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_ulpevent *dry;
+
+	if (optlen > sizeof(struct sctp_event_subscribe))
+		return -EINVAL;
+	if (copy_from_user(&sp->subscribe, optval, optlen))
+		return -EFAULT;
+
+	/*
+	 * At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
+	 * if there is no data to be sent or retransmit, the stack will
+	 * immediately send up this notification.
+	 */
+	if (!sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT, &sp->subscribe))
+		return 0;
+
+	asoc = sctp_id2assoc(sk, 0);
+	if (!asoc || !sctp_outq_is_empty(&asoc->outqueue))
+		return 0;
+
+	dry = sctp_ulpevent_make_sender_dry_event(asoc, GFP_ATOMIC);
+	if (!dry)
+		return -ENOMEM;
+
+	sctp_ulpq_tail_event(&asoc->ulpq, dry);
+	return 0;
+}
+
+/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE)
+ *
+ * Only applicable to UDP-style sockets.  When set, associations that
+ * have been idle (no user data sent or received) for more than the given
+ * number of seconds are closed automatically.  The special value '0'
+ * means that no automatic close is performed.
+ *
+ * optval must point to exactly one int holding the idle time in seconds;
+ * it is copied as-is into the socket's autoclose field.
+ *
+ * Returns 0, -EOPNOTSUPP (TCP-style socket), -EINVAL (optlen is not
+ * sizeof(int)) or -EFAULT (optval unreadable).
+ */
+static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
+				     unsigned int optlen)
+{
+	/* Applicable to UDP-style socket only */
+	if (sctp_style(sk, TCP))
+		return -EOPNOTSUPP;
+
+	if (optlen != sizeof(int))
+		return -EINVAL;
+
+	/* optlen is known to equal sizeof(int) at this point. */
+	if (copy_from_user(&sctp_sk(sk)->autoclose, optval, sizeof(int)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS)
+ *
+ * Applications can enable or disable heartbeats for any peer address of
+ * an association, modify an address's heartbeat interval, force a
+ * heartbeat to be sent immediately, and adjust the address's maximum
+ * number of retransmissions sent before an address is considered
+ * unreachable.  The following structure is used to access and modify an
+ * address's parameters:
+ *
+ *  struct sctp_paddrparams {
+ *     sctp_assoc_t            spp_assoc_id;
+ *     struct sockaddr_storage spp_address;
+ *     uint32_t                spp_hbinterval;
+ *     uint16_t                spp_pathmaxrxt;
+ *     uint32_t                spp_pathmtu;
+ *     uint32_t                spp_sackdelay;
+ *     uint32_t                spp_flags;
+ * };
+ *
+ *   spp_assoc_id    - (one-to-many style socket) This is filled in the
+ *                     application, and identifies the association for
+ *                     this query.
+ *   spp_address     - This specifies which address is of interest.
+ *   spp_hbinterval  - This contains the value of the heartbeat interval,
+ *                     in milliseconds.  If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
+ *   spp_pathmaxrxt  - This contains the maximum number of
+ *                     retransmissions before this address shall be
+ *                     considered unreachable. If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
+ *   spp_pathmtu     - When Path MTU discovery is disabled the value
+ *                     specified here will be the "fixed" path mtu.
+ *                     Note that if the spp_address field is empty
+ *                     then all associations on this address will
+ *                     have this fixed path mtu set upon them.
+ *
+ *   spp_sackdelay   - When delayed sack is enabled, this value specifies
+ *                     the number of milliseconds that sacks will be delayed
+ *                     for. This value will apply to all addresses of an
+ *                     association if the spp_address field is empty. Note
+ *                     also, that if delayed sack is enabled and this
+ *                     value is set to 0, no change is made to the last
+ *                     recorded delayed sack timer value.
+ *
+ *   spp_flags       - These flags are used to control various features
+ *                     on an association. The flag field may contain
+ *                     zero or more of the following options.
+ *
+ *                     SPP_HB_ENABLE  - Enable heartbeats on the
+ *                     specified address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     have heartbeats enabled upon them.
+ *
+ *                     SPP_HB_DISABLE - Disable heartbeats on the
+ *                     specified address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     will have their heartbeats disabled. Note also
+ *                     that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ *                     mutually exclusive, only one of these two should
+ *                     be specified. Enabling both fields will have
+ *                     undetermined results.
+ *
+ *                     SPP_HB_DEMAND - Request a user initiated heartbeat
+ *                     to be made immediately.
+ *
+ *                     SPP_HB_TIME_IS_ZERO - Specifies that the time for
+ *                     heartbeat delay is to be set to the value of 0
+ *                     milliseconds.
+ *
+ *                     SPP_PMTUD_ENABLE - This field will enable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address field is empty then all addresses
+ *                     on the association are affected.
+ *
+ *                     SPP_PMTUD_DISABLE - This field will disable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address field is empty then all addresses
+ *                     on the association are affected. Note also that
+ *                     SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ *                     exclusive. Enabling both will have undetermined
+ *                     results.
+ *
+ *                     SPP_SACKDELAY_ENABLE - Setting this flag turns
+ *                     on delayed sack. The time specified in spp_sackdelay
+ *                     is used to specify the sack delay for this address. Note
+ *                     that if spp_address is empty then all addresses will
+ *                     enable delayed sack and take on the sack delay
+ *                     value specified in spp_sackdelay.
+ *                     SPP_SACKDELAY_DISABLE - Setting this flag turns
+ *                     off delayed sack. If the spp_address field is blank then
+ *                     delayed sack is disabled for the entire association. Note
+ *                     also that this field is mutually exclusive to
+ *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
+ *                     results.
+ */
+/* Apply one peer-address-parameters request to a single target.
+ *
+ * The target is the most specific object supplied: 'trans' if it is
+ * non-NULL, otherwise 'asoc' if it is non-NULL, otherwise the socket
+ * defaults held in 'sp'.
+ *
+ * hb_change, pmtud_change and sackdelay_change carry the SPP_HB,
+ * SPP_PMTUD and SPP_SACKDELAY bits taken from params->spp_flags; a value
+ * of zero leaves the corresponding flag group of the target untouched.
+ *
+ * Heartbeat interval and sack delay are converted with
+ * msecs_to_jiffies() when stored in a transport or an association, but
+ * are stored unconverted in the socket.
+ *
+ * Note that params->spp_hbinterval is overwritten with 0 when both
+ * SPP_HB_ENABLE and SPP_HB_TIME_IS_ZERO are set.
+ *
+ * Returns 0, or the error from sctp_primitive_REQUESTHEARTBEAT(), in
+ * which case nothing else has been changed.
+ */
+static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
+				       struct sctp_transport   *trans,
+				       struct sctp_association *asoc,
+				       struct sctp_sock        *sp,
+				       int                      hb_change,
+				       int                      pmtud_change,
+				       int                      sackdelay_change)
+{
+	int error;
+
+	/* An on-demand heartbeat is only requested when a transport is given. */
+	if (params->spp_flags & SPP_HB_DEMAND && trans) {
+		error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
+		if (error)
+			return error;
+	}
+
+	/* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of
+	 * this field is ignored.  Note also that a value of zero indicates
+	 * the current setting should be left unchanged.
+	 */
+	if (params->spp_flags & SPP_HB_ENABLE) {
+
+		/* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is
+		 * set.  This lets us use 0 value when this flag
+		 * is set.
+		 */
+		if (params->spp_flags & SPP_HB_TIME_IS_ZERO)
+			params->spp_hbinterval = 0;
+
+		if (params->spp_hbinterval ||
+		    (params->spp_flags & SPP_HB_TIME_IS_ZERO)) {
+			if (trans) {
+				trans->hbinterval =
+				    msecs_to_jiffies(params->spp_hbinterval);
+			} else if (asoc) {
+				asoc->hbinterval =
+				    msecs_to_jiffies(params->spp_hbinterval);
+			} else {
+				/* Socket default is kept unconverted. */
+				sp->hbinterval = params->spp_hbinterval;
+			}
+		}
+	}
+
+	/* Replace the SPP_HB bits of the target with the requested ones. */
+	if (hb_change) {
+		if (trans) {
+			trans->param_flags =
+				(trans->param_flags & ~SPP_HB) | hb_change;
+		} else if (asoc) {
+			asoc->param_flags =
+				(asoc->param_flags & ~SPP_HB) | hb_change;
+		} else {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_HB) | hb_change;
+		}
+	}
+
+	/* When Path MTU discovery is disabled the value specified here will
+	 * be the "fixed" path mtu (i.e. the value of the spp_flags field must
+	 * include the flag SPP_PMTUD_DISABLE for this field to have any
+	 * effect).
+	 */
+	if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
+		if (trans) {
+			trans->pathmtu = params->spp_pathmtu;
+			/* NOTE(review): 'asoc' may be NULL here when only a
+			 * transport was resolved - confirm that
+			 * sctp_assoc_sync_pmtu() tolerates that.
+			 */
+			sctp_assoc_sync_pmtu(asoc);
+		} else if (asoc) {
+			asoc->pathmtu = params->spp_pathmtu;
+			sctp_frag_point(asoc, params->spp_pathmtu);
+		} else {
+			sp->pathmtu = params->spp_pathmtu;
+		}
+	}
+
+	if (pmtud_change) {
+		if (trans) {
+			/* Must be evaluated before param_flags is rewritten:
+			 * it detects a disabled -> enabled transition.
+			 */
+			int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
+				(params->spp_flags & SPP_PMTUD_ENABLE);
+			trans->param_flags =
+				(trans->param_flags & ~SPP_PMTUD) | pmtud_change;
+			if (update) {
+				sctp_transport_pmtu(trans, sctp_opt2sk(sp));
+				sctp_assoc_sync_pmtu(asoc);
+			}
+		} else if (asoc) {
+			asoc->param_flags =
+				(asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
+		} else {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_PMTUD) | pmtud_change;
+		}
+	}
+
+	/* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the
+	 * value of this field is ignored.  Note also that a value of zero
+	 * indicates the current setting should be left unchanged.
+	 */
+	if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) {
+		if (trans) {
+			trans->sackdelay =
+				msecs_to_jiffies(params->spp_sackdelay);
+		} else if (asoc) {
+			asoc->sackdelay =
+				msecs_to_jiffies(params->spp_sackdelay);
+		} else {
+			/* Socket default is kept unconverted. */
+			sp->sackdelay = params->spp_sackdelay;
+		}
+	}
+
+	/* Replace the SPP_SACKDELAY bits of the target with the requested ones. */
+	if (sackdelay_change) {
+		if (trans) {
+			trans->param_flags =
+				(trans->param_flags & ~SPP_SACKDELAY) |
+				sackdelay_change;
+		} else if (asoc) {
+			asoc->param_flags =
+				(asoc->param_flags & ~SPP_SACKDELAY) |
+				sackdelay_change;
+		} else {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				sackdelay_change;
+		}
+	}
+
+	/* Note that a value of zero indicates the current setting should be
+	   left unchanged.
+	 */
+	if (params->spp_pathmaxrxt) {
+		if (trans) {
+			trans->pathmaxrxt = params->spp_pathmaxrxt;
+		} else if (asoc) {
+			asoc->pathmaxrxt = params->spp_pathmaxrxt;
+		} else {
+			sp->pathmaxrxt = params->spp_pathmaxrxt;
+		}
+	}
+
+	return 0;
+}
+
+/* setsockopt() handler taking a struct sctp_paddrparams.
+ *
+ * Validates the request, resolves the transport and/or association it
+ * refers to, and hands it to sctp_apply_peer_addr_params().  When the
+ * request names an association but no specific address, the parameters
+ * are additionally applied to every transport of that association (the
+ * result of those per-transport calls is not checked).
+ *
+ * Returns 0 on success, -EINVAL for a bad length, conflicting flags,
+ * out-of-range values or an unknown address/association, -EFAULT if
+ * optval cannot be read, or the error of the first apply call.
+ */
+static int sctp_setsockopt_peer_addr_params(struct sock *sk,
+					    char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_sock        *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_transport   *trans = NULL;
+	struct sctp_transport   *peer;
+	struct sctp_paddrparams  params;
+	int hb_bits, pmtud_bits, sack_bits;
+	int err;
+
+	if (optlen != sizeof(struct sctp_paddrparams))
+		return -EINVAL;
+
+	if (copy_from_user(&params, optval, sizeof(params)))
+		return -EFAULT;
+
+	hb_bits    = params.spp_flags & SPP_HB;
+	pmtud_bits = params.spp_flags & SPP_PMTUD;
+	sack_bits  = params.spp_flags & SPP_SACKDELAY;
+
+	/* Each group holds mutually exclusive flags: having every bit of a
+	 * group set at once is a contradiction.
+	 */
+	if (hb_bits == SPP_HB)
+		return -EINVAL;
+	if (pmtud_bits == SPP_PMTUD)
+		return -EINVAL;
+	if (sack_bits == SPP_SACKDELAY)
+		return -EINVAL;
+
+	/* Range checks on the values themselves. */
+	if (params.spp_sackdelay > 500)
+		return -EINVAL;
+	if (params.spp_pathmtu && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)
+		return -EINVAL;
+
+	/* If an address other than INADDR_ANY is specified, and
+	 * no transport is found, then the request is invalid.
+	 */
+	if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) {
+		trans = sctp_addr_id2transport(sk, &params.spp_address,
+					       params.spp_assoc_id);
+		if (!trans)
+			return -EINVAL;
+	}
+
+	/* A non-zero association id that cannot be resolved on a
+	 * one-to-many style socket is an invalid id.
+	 */
+	asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+	if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	/* Heartbeat demand can only be sent on a transport or
+	 * association, but not a socket.
+	 */
+	if ((params.spp_flags & SPP_HB_DEMAND) && !trans && !asoc)
+		return -EINVAL;
+
+	err = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+					  hb_bits, pmtud_bits, sack_bits);
+	if (err)
+		return err;
+
+	/* A specific transport was addressed, or only the socket defaults
+	 * were changed: nothing left to propagate.
+	 */
+	if (trans || !asoc)
+		return 0;
+
+	/* The change was for the association as a whole, so push it down
+	 * to each of its transports as well.
+	 */
+	list_for_each_entry(peer, &asoc->peer.transport_addr_list,
+			    transports) {
+		sctp_apply_peer_addr_params(&params, peer, asoc, sp,
+					    hb_bits, pmtud_bits, sack_bits);
+	}
+
+	return 0;
+}
+
+/*
+ * 7.1.23.  Get or set delayed ack timer (SCTP_DELAYED_SACK)
+ *
+ * This option will affect the way delayed acks are performed.  This
+ * option allows you to get or set the delayed ack time, in
+ * milliseconds.  It also allows changing the delayed ack frequency.
+ * Changing the frequency to 1 disables the delayed sack algorithm.  If
+ * the assoc_id is 0, then this sets or gets the endpoint's default
+ * values.  If the assoc_id field is non-zero, then the set or get
+ * affects the specified association for the one to many model (the
+ * assoc_id field is ignored by the one to one model).  Note that if
+ * sack_delay or sack_freq are 0 when setting this option, then the
+ * current values will remain unchanged.
+ *
+ * struct sctp_sack_info {
+ *     sctp_assoc_t            sack_assoc_id;
+ *     uint32_t                sack_delay;
+ *     uint32_t                sack_freq;
+ * };
+ *
+ * sack_assoc_id -  This parameter indicates which association the user
+ *    is performing an action upon.  Note that if this field's value is
+ *    zero then the endpoint's default value is changed (affecting future
+ *    associations only).
+ *
+ * sack_delay -  This parameter contains the number of milliseconds that
+ *    the user is requesting the delayed ACK timer be set to.  Note that
+ *    this value is defined in the standard to be between 200 and 500
+ *    milliseconds.
+ *
+ * sack_freq -  This parameter contains the number of packets that must
+ *    be received before a sack is sent without waiting for the delay
+ *    timer to expire.  The default value for this is 2, setting this
+ *    value to 1 will disable the delayed sack algorithm.
+ */
+
+/* setsockopt() handler for the delayed-sack settings.
+ *
+ * Two input layouts are accepted, told apart by optlen:
+ *   - struct sctp_sack_info: delay and frequency are both supplied; if
+ *     both are zero the call is a no-op.
+ *   - struct sctp_assoc_value (deprecated, logs a warning): the bytes
+ *     are copied into the same local structure and sack_freq is then
+ *     derived from sack_delay (1 when the delay is 0, otherwise 0).
+ *
+ * The settings are written either to the socket defaults (no
+ * association resolved) or to the association and every one of its
+ * transports.  For each target: a non-zero delay is stored and enables
+ * delayed sack; a frequency of 1 disables delayed sack; a frequency
+ * above 1 is stored and enables delayed sack.
+ *
+ * Returns 0, -EINVAL (bad optlen, delay above 500, or an unknown
+ * association id on a one-to-many socket) or -EFAULT.
+ */
+static int sctp_setsockopt_delayed_ack(struct sock *sk,
+				       char __user *optval, unsigned int optlen)
+{
+	struct sctp_sock        *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_transport   *peer;
+	struct sctp_sack_info    info;
+
+	if (optlen == sizeof(struct sctp_sack_info)) {
+		if (copy_from_user(&info, optval, optlen))
+			return -EFAULT;
+
+		/* Neither value given: nothing to change. */
+		if (!info.sack_delay && !info.sack_freq)
+			return 0;
+	} else if (optlen == sizeof(struct sctp_assoc_value)) {
+		pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
+		pr_warn("Use struct sctp_sack_info instead\n");
+		if (copy_from_user(&info, optval, optlen))
+			return -EFAULT;
+
+		/* The short form carries no frequency; derive it. */
+		info.sack_freq = info.sack_delay ? 0 : 1;
+	} else {
+		return -EINVAL;
+	}
+
+	/* Validate value parameter. */
+	if (info.sack_delay > 500)
+		return -EINVAL;
+
+	/* A non-zero association id that cannot be resolved on a
+	 * one-to-many style socket is an invalid id.
+	 */
+	asoc = sctp_id2assoc(sk, info.sack_assoc_id);
+	if (!asoc && info.sack_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	if (!asoc) {
+		/* No association: update the socket-wide defaults. */
+		if (info.sack_delay) {
+			sp->sackdelay = info.sack_delay;
+			sp->param_flags =
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_ENABLE;
+		}
+
+		if (info.sack_freq == 1) {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_DISABLE;
+		} else if (info.sack_freq > 1) {
+			sp->sackfreq = info.sack_freq;
+			sp->param_flags =
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_ENABLE;
+		}
+
+		return 0;
+	}
+
+	/* Update the association itself ... */
+	if (info.sack_delay) {
+		asoc->sackdelay = msecs_to_jiffies(info.sack_delay);
+		asoc->param_flags =
+			(asoc->param_flags & ~SPP_SACKDELAY) |
+			SPP_SACKDELAY_ENABLE;
+	}
+
+	if (info.sack_freq == 1) {
+		asoc->param_flags =
+			(asoc->param_flags & ~SPP_SACKDELAY) |
+			SPP_SACKDELAY_DISABLE;
+	} else if (info.sack_freq > 1) {
+		asoc->sackfreq = info.sack_freq;
+		asoc->param_flags =
+			(asoc->param_flags & ~SPP_SACKDELAY) |
+			SPP_SACKDELAY_ENABLE;
+	}
+
+	/* ... and then each of its transports in the same way. */
+	list_for_each_entry(peer, &asoc->peer.transport_addr_list,
+			    transports) {
+		if (info.sack_delay) {
+			peer->sackdelay = msecs_to_jiffies(info.sack_delay);
+			peer->param_flags =
+				(peer->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_ENABLE;
+		}
+
+		if (info.sack_freq == 1) {
+			peer->param_flags =
+				(peer->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_DISABLE;
+		} else if (info.sack_freq > 1) {
+			peer->sackfreq = info.sack_freq;
+			peer->param_flags =
+				(peer->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_ENABLE;
+		}
+	}
+
+	return 0;
+}
+
+/* 7.1.3 Initialization Parameters (SCTP_INITMSG)
+ *
+ * Applications can specify protocol parameters for the default association
+ * initialization.  The option name argument to setsockopt() and getsockopt()
+ * is SCTP_INITMSG.
+ *
+ * Setting initialization parameters is effective only on an unconnected
+ * socket (for UDP-style sockets only future associations are affected
+ * by the change).  With TCP-style sockets, this option is inherited by
+ * sockets derived from a listener socket.
+ *
+ * optval must be exactly one struct sctp_initmsg.  Every non-zero field
+ * replaces the corresponding socket default; a zero field leaves the
+ * current default in place.
+ *
+ * Returns 0, -EINVAL (wrong optlen) or -EFAULT (optval unreadable).
+ */
+static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_initmsg req;
+
+	if (optlen != sizeof(struct sctp_initmsg))
+		return -EINVAL;
+
+	if (copy_from_user(&req, optval, sizeof(req)))
+		return -EFAULT;
+
+	/* Stream counts. */
+	if (req.sinit_num_ostreams)
+		sp->initmsg.sinit_num_ostreams = req.sinit_num_ostreams;
+	if (req.sinit_max_instreams)
+		sp->initmsg.sinit_max_instreams = req.sinit_max_instreams;
+
+	/* INIT retry limit and timeout. */
+	if (req.sinit_max_attempts)
+		sp->initmsg.sinit_max_attempts = req.sinit_max_attempts;
+	if (req.sinit_max_init_timeo)
+		sp->initmsg.sinit_max_init_timeo = req.sinit_max_init_timeo;
+
+	return 0;
+}
+
+/*
+ * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM)
+ *
+ *   Applications that wish to use the sendto() system call may wish to
+ *   specify a default set of parameters that would normally be supplied
+ *   through the inclusion of ancillary data.  This socket option allows
+ *   such an application to set the default sctp_sndrcvinfo structure.
+ *   The accepted input fields are sinfo_stream, sinfo_flags, sinfo_ppid,
+ *   sinfo_context and sinfo_timetolive.  The user must provide the
+ *   sinfo_assoc_id field if the caller is using the UDP model.
+ *
+ *   The five fields are stored in the association selected by
+ *   sinfo_assoc_id when one is found, otherwise in the socket itself.
+ *
+ *   Returns 0, -EINVAL (wrong optlen, or a non-zero association id that
+ *   does not resolve on a UDP-style socket) or -EFAULT.
+ */
+static int sctp_setsockopt_default_send_param(struct sock *sk,
+					      char __user *optval,
+					      unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_sndrcvinfo req;
+
+	if (optlen != sizeof(struct sctp_sndrcvinfo))
+		return -EINVAL;
+
+	if (copy_from_user(&req, optval, sizeof(req)))
+		return -EFAULT;
+
+	asoc = sctp_id2assoc(sk, req.sinfo_assoc_id);
+	if (asoc) {
+		/* Per-association defaults. */
+		asoc->default_stream = req.sinfo_stream;
+		asoc->default_flags = req.sinfo_flags;
+		asoc->default_ppid = req.sinfo_ppid;
+		asoc->default_context = req.sinfo_context;
+		asoc->default_timetolive = req.sinfo_timetolive;
+		return 0;
+	}
+
+	/* An explicit id that did not resolve is an error on UDP style. */
+	if (req.sinfo_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	/* Socket-wide defaults. */
+	sp->default_stream = req.sinfo_stream;
+	sp->default_flags = req.sinfo_flags;
+	sp->default_ppid = req.sinfo_ppid;
+	sp->default_context = req.sinfo_context;
+	sp->default_timetolive = req.sinfo_timetolive;
+
+	return 0;
+}
+
+/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
+ *
+ * Requests that the local SCTP stack use the enclosed peer address as
+ * the association primary.  The enclosed address must be one of the
+ * association peer's addresses.
+ *
+ * optval must be exactly one struct sctp_prim.  The address and
+ * association id in it are resolved to a transport, which is then made
+ * the primary of the association it belongs to.
+ *
+ * Returns 0, -EINVAL (wrong optlen or no matching transport) or -EFAULT.
+ */
+static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval,
+					unsigned int optlen)
+{
+	struct sctp_transport *primary;
+	struct sctp_prim req;
+
+	if (optlen != sizeof(req))
+		return -EINVAL;
+
+	if (copy_from_user(&req, optval, sizeof(req)))
+		return -EFAULT;
+
+	primary = sctp_addr_id2transport(sk, &req.ssp_addr, req.ssp_assoc_id);
+	if (!primary)
+		return -EINVAL;
+
+	sctp_assoc_set_primary(primary->asoc, primary);
+	return 0;
+}
+
+/*
+ * 7.1.5 SCTP_NODELAY
+ *
+ * Turn on/off any Nagle-like algorithm.  This means that packets are
+ * generally sent as soon as possible and no unnecessary delays are
+ * introduced, at the cost of more packets in the network.  Expects an
+ *  integer boolean flag.
+ */
+static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
+				   unsigned int optlen)
+{
+	int flag;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(flag, (int __user *)optval))
+		return -EFAULT;
+
+	/* Any non-zero value is stored as 1. */
+	sctp_sk(sk)->nodelay = !!flag;
+
+	return 0;
+}
+
+/*
+ *
+ * 7.1.1 SCTP_RTOINFO
+ *
+ * The protocol parameters used to initialize and bound retransmission
+ * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access
+ * and modify these parameters.
+ * All parameters are time values, in milliseconds.  A value of 0, when
+ * modifying the parameters, indicates that the current value should not
+ * be changed.
+ *
+ */
+static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+	struct sctp_rtoinfo rtoinfo;
+	struct sctp_association *asoc;
+	struct sctp_sock *sp = sctp_sk(sk);
+	unsigned long rto_min, rto_max;
+
+	if (optlen != sizeof (struct sctp_rtoinfo))
+		return -EINVAL;
+
+	if (copy_from_user(&rtoinfo, optval, optlen))
+		return -EFAULT;
+
+	asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id);
+
+	/* A non-zero id that matches no association is an error on
+	 * UDP-style sockets.
+	 */
+	if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	/* Compute the bounds that would be in effect after this call.
+	 * A value of 0 from the user means "keep the current value".
+	 * The association stores its bounds in jiffies, the endpoint
+	 * stores them in milliseconds, so compare in the unit of the
+	 * object being modified.
+	 */
+	rto_max = rtoinfo.srto_max;
+	rto_min = rtoinfo.srto_min;
+
+	if (rto_max)
+		rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max;
+	else
+		rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max;
+
+	if (rto_min)
+		rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min;
+	else
+		rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min;
+
+	/* Refuse an inverted range rather than installing bounds in
+	 * which the minimum exceeds the maximum.
+	 */
+	if (rto_min > rto_max)
+		return -EINVAL;
+
+	if (asoc) {
+		/* Set the values to the specific association */
+		if (rtoinfo.srto_initial != 0)
+			asoc->rto_initial =
+				msecs_to_jiffies(rtoinfo.srto_initial);
+		asoc->rto_max = rto_max;
+		asoc->rto_min = rto_min;
+	} else {
+		/* If there is no association or the association-id = 0
+		 * set the values to the endpoint.
+		 */
+		if (rtoinfo.srto_initial != 0)
+			sp->rtoinfo.srto_initial = rtoinfo.srto_initial;
+		sp->rtoinfo.srto_max = rto_max;
+		sp->rtoinfo.srto_min = rto_min;
+	}
+
+	return 0;
+}
+
+/*
+ *
+ * 7.1.2 SCTP_ASSOCINFO
+ *
+ * This option is used to tune the maximum retransmission attempts
+ * of the association.
+ * Returns an error if the new association retransmission value is
+ * greater than the sum of the retransmission value  of the peer.
+ * See [SCTP] for more information.
+ *
+ */
+static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+	struct sctp_assocparams params;
+	struct sctp_association *asoc;
+	struct sctp_transport *t;
+	struct sctp_sock *sp;
+	__u32 rxt_total;
+	int npaths;
+
+	if (optlen != sizeof(params))
+		return -EINVAL;
+	if (copy_from_user(&params, optval, optlen))
+		return -EFAULT;
+
+	asoc = sctp_id2assoc(sk, params.sasoc_assoc_id);
+	if (!asoc) {
+		if (params.sasoc_assoc_id && sctp_style(sk, UDP))
+			return -EINVAL;
+
+		/* No association: the endpoint values are updated.
+		 * Zero fields leave the current value untouched.
+		 */
+		sp = sctp_sk(sk);
+		if (params.sasoc_asocmaxrxt != 0)
+			sp->assocparams.sasoc_asocmaxrxt =
+						params.sasoc_asocmaxrxt;
+		if (params.sasoc_cookie_life != 0)
+			sp->assocparams.sasoc_cookie_life =
+						params.sasoc_cookie_life;
+		return 0;
+	}
+
+	/* From here on the values apply to the matched association. */
+	if (params.sasoc_asocmaxrxt != 0) {
+		rxt_total = 0;
+		npaths = 0;
+
+		list_for_each_entry(t, &asoc->peer.transport_addr_list,
+				    transports) {
+			rxt_total += t->pathmaxrxt;
+			npaths++;
+		}
+
+		/* The new limit is only checked against the sum of the
+		 * per-path limits when there is more than one path.
+		 */
+		if (npaths > 1 && params.sasoc_asocmaxrxt > rxt_total)
+			return -EINVAL;
+
+		asoc->max_retrans = params.sasoc_asocmaxrxt;
+	}
+
+	if (params.sasoc_cookie_life != 0) {
+		/* Split the value into whole thousands and a remainder
+		 * scaled by 1000 for the tv_sec/tv_usec pair.
+		 */
+		asoc->cookie_life.tv_sec = params.sasoc_cookie_life / 1000;
+		asoc->cookie_life.tv_usec =
+				(params.sasoc_cookie_life % 1000) * 1000;
+	}
+
+	return 0;
+}
+
+/*
+ * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR)
+ *
+ * This socket option is a boolean flag which turns on or off mapped V4
+ * addresses.  If this option is turned on and the socket is type
+ * PF_INET6, then IPv4 addresses will be mapped to V6 representation.
+ * If this option is turned off, then no mapping will be done of V4
+ * addresses and a user will receive both PF_INET6 and PF_INET type
+ * addresses on the socket.
+ */
+static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+	int enable;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(enable, (int __user *)optval))
+		return -EFAULT;
+
+	/* Stored as a strict 0/1 flag. */
+	sctp_sk(sk)->v4mapped = enable ? 1 : 0;
+
+	return 0;
+}
+
+/*
+ * 8.1.16.  Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG)
+ * This option will get or set the maximum size to put in any outgoing
+ * SCTP DATA chunk.  If a message is larger than this size it will be
+ * fragmented by SCTP into the specified size.  Note that the underlying
+ * SCTP implementation may fragment into smaller sized chunks when the
+ * PMTU of the underlying association is smaller than the value set by
+ * the user.  The default value for this option is '0' which indicates
+ * the user is NOT limiting fragmentation and only the PMTU will effect
+ * SCTP's choice of DATA chunk size.  Note also that values set larger
+ * than the maximum size of an IP datagram will effectively let SCTP
+ * control fragmentation (i.e. the same as setting this option to 0).
+ *
+ * The following structure is used to access and modify this parameter:
+ *
+ * struct sctp_assoc_value {
+ *   sctp_assoc_t assoc_id;
+ *   uint32_t assoc_value;
+ * };
+ *
+ * assoc_id:  This parameter is ignored for one-to-one style sockets.
+ *    For one-to-many style sockets this parameter indicates which
+ *    association the user is performing an action upon.  Note that if
+ *    this field's value is zero then the endpoints default value is
+ *    changed (effecting future associations only).
+ * assoc_value:  This parameter specifies the maximum size in bytes.
+ */
+static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int frag;
+
+	if (optlen == sizeof(struct sctp_assoc_value)) {
+		if (copy_from_user(&params, optval, optlen))
+			return -EFAULT;
+		frag = params.assoc_value;
+	} else if (optlen == sizeof(int)) {
+		/* Legacy form: a bare int, applied as association id 0. */
+		pr_warn("Use of int in maxseg socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
+		if (copy_from_user(&frag, optval, optlen))
+			return -EFAULT;
+		params.assoc_id = 0;
+	} else {
+		return -EINVAL;
+	}
+
+	/* Zero is always accepted; any other value must lie within
+	 * [8, SCTP_MAX_CHUNK_LEN].
+	 */
+	if ((frag != 0) && ((frag < 8) || (frag > SCTP_MAX_CHUNK_LEN)))
+		return -EINVAL;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc) {
+		if (params.assoc_id && sctp_style(sk, UDP))
+			return -EINVAL;
+
+		sp->user_frag = frag;
+		return 0;
+	}
+
+	if (frag == 0) {
+		/* Derive the value from the path MTU minus the network
+		 * header, the SCTP header and the DATA chunk header.
+		 */
+		frag = asoc->pathmtu;
+		frag -= sp->pf->af->net_header_len;
+		frag -= sizeof(struct sctphdr) +
+				sizeof(struct sctp_data_chunk);
+	}
+	asoc->user_frag = frag;
+	asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+
+	return 0;
+}
+
+
+/*
+ *  7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR)
+ *
+ *   Requests that the peer mark the enclosed address as the association
+ *   primary. The enclosed address must be one of the association's
+ *   locally bound addresses. The following structure is used to make a
+ *   set primary request:
+ */
+static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval,
+					     unsigned int optlen)
+{
+	struct sctp_sock	*sp = sctp_sk(sk);
+	struct sctp_setpeerprim	req;
+	union sctp_addr		*laddr = (union sctp_addr *)&req.sspp_addr;
+	struct sctp_association	*asoc;
+	struct sctp_chunk	*asconf;
+	struct sctp_af		*af;
+	int			rc;
+
+	if (!sctp_addip_enable)
+		return -EPERM;
+
+	if (optlen != sizeof(req))
+		return -EINVAL;
+
+	if (copy_from_user(&req, optval, optlen))
+		return -EFAULT;
+
+	asoc = sctp_id2assoc(sk, req.sspp_assoc_id);
+	if (asoc == NULL)
+		return -EINVAL;
+
+	/* The peer must be ASCONF capable and must not have the
+	 * SET_PRIMARY parameter disabled.
+	 */
+	if (!asoc->peer.asconf_capable ||
+	    (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY))
+		return -EPERM;
+
+	if (!sctp_state(asoc, ESTABLISHED))
+		return -ENOTCONN;
+
+	af = sctp_get_af_specific(req.sspp_addr.ss_family);
+	if (af == NULL)
+		return -EINVAL;
+
+	/* The address has to be valid for this socket and has to be
+	 * found among the association's local addresses.
+	 */
+	if (!af->addr_valid(laddr, sp, NULL) ||
+	    !sctp_assoc_lookup_laddr(asoc, laddr))
+		return -EADDRNOTAVAIL;
+
+	/* Build the ASCONF chunk carrying the SET_PRIMARY parameter. */
+	asconf = sctp_make_asconf_set_prim(asoc, laddr);
+	if (asconf == NULL)
+		return -ENOMEM;
+
+	rc = sctp_send_asconf(asoc, asconf);
+
+	SCTP_DEBUG_PRINTK("We set peer primary addr primitively.\n");
+
+	return rc;
+}
+
+static int sctp_setsockopt_adaptation_layer(struct sock *sk, char __user *optval,
+					    unsigned int optlen)
+{
+	struct sctp_setadaptation req;
+
+	/* Exactly one struct sctp_setadaptation is expected. */
+	if (optlen != sizeof(req))
+		return -EINVAL;
+	if (copy_from_user(&req, optval, optlen))
+		return -EFAULT;
+
+	/* Record the indication value on the socket. */
+	sctp_sk(sk)->adaptation_ind = req.ssb_adaptation_ind;
+
+	return 0;
+}
+
+/*
+ * 7.1.29.  Set or Get the default context (SCTP_CONTEXT)
+ *
+ * The context field in the sctp_sndrcvinfo structure is normally only
+ * used when a failed message is retrieved holding the value that was
+ * sent down on the actual send call.  This option allows the setting of
+ * a default context on an association basis that will be received on
+ * reading messages from the peer.  This is especially helpful in the
+ * one-2-many model for an application to keep some reference to an
+ * internal state machine that is processing messages on the
+ * association.  Note that the setting of this value only effects
+ * received messages from the peer and does not effect the value that is
+ * saved with outbound messages.
+ */
+static int sctp_setsockopt_context(struct sock *sk, char __user *optval,
+				   unsigned int optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+
+	if (optlen != sizeof(params))
+		return -EINVAL;
+	if (copy_from_user(&params, optval, optlen))
+		return -EFAULT;
+
+	/* Association id 0 addresses the socket-wide default. */
+	if (params.assoc_id == 0) {
+		sctp_sk(sk)->default_rcv_context = params.assoc_value;
+		return 0;
+	}
+
+	/* Any other id must resolve to an association. */
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (asoc == NULL)
+		return -EINVAL;
+
+	asoc->default_rcv_context = params.assoc_value;
+
+	return 0;
+}
+
+/*
+ * 7.1.24.  Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE)
+ *
+ * This options will at a minimum specify if the implementation is doing
+ * fragmented interleave.  Fragmented interleave, for a one to many
+ * socket, is when subsequent calls to receive a message may return
+ * parts of messages from different associations.  Some implementations
+ * may allow you to turn this value on or off.  If so, when turned off,
+ * no fragment interleave will occur (which will cause a head of line
+ * blocking amongst multiple associations sharing the same one to many
+ * socket).  When this option is turned on, then each receive call may
+ * come from a different association (thus the user must receive data
+ * with the extended calls (e.g. sctp_recvmsg) to keep track of which
+ * association each receive belongs to.
+ *
+ * This option takes a boolean value.  A non-zero value indicates that
+ * fragmented interleave is on.  A value of zero indicates that
+ * fragmented interleave is off.
+ *
+ * Note that it is important that an implementation that allows this
+ * option to be turned on, have it off by default.  Otherwise an unaware
+ * application using the one to many model may become confused and act
+ * incorrectly.
+ */
+static int sctp_setsockopt_fragment_interleave(struct sock *sk,
+					       char __user *optval,
+					       unsigned int optlen)
+{
+	int flag;
+
+	/* Unlike some other boolean options, the length must match
+	 * sizeof(int) exactly.
+	 */
+	if (optlen != sizeof(int))
+		return -EINVAL;
+	if (get_user(flag, (int __user *)optval))
+		return -EFAULT;
+
+	sctp_sk(sk)->frag_interleave = !!flag;
+
+	return 0;
+}
+
+/*
+ * 8.1.21.  Set or Get the SCTP Partial Delivery Point
+ *       (SCTP_PARTIAL_DELIVERY_POINT)
+ *
+ * This option will set or get the SCTP partial delivery point.  This
+ * point is the size of a message where the partial delivery API will be
+ * invoked to help free up rwnd space for the peer.  Setting this to a
+ * lower value will cause partial deliveries to happen more often.  The
+ * calls argument is an integer that sets or gets the partial delivery
+ * point.  Note also that the call will fail if the user attempts to set
+ * this value larger than the socket receive buffer size.
+ *
+ * Note that any single message having a length smaller than or equal to
+ * the SCTP partial delivery point will be delivered in one single read
+ * call as long as the user provided buffer is large enough to hold the
+ * message.
+ */
+static int sctp_setsockopt_partial_delivery_point(struct sock *sk,
+						  char __user *optval,
+						  unsigned int optlen)
+{
+	u32 point;
+
+	if (optlen != sizeof(u32))
+		return -EINVAL;
+	if (get_user(point, (int __user *)optval))
+		return -EFAULT;
+
+	/* The limit is half of sk_rcvbuf: the receive buffer is kept at
+	 * double what the user sets, and the initial rwnd is based on
+	 * rcvbuf/2.
+	 */
+	if (point > (sk->sk_rcvbuf >> 1))
+		return -EINVAL;
+
+	sctp_sk(sk)->pd_point = point;
+
+	return 0;
+}
+
+/*
+ * 7.1.28.  Set or Get the maximum burst (SCTP_MAX_BURST)
+ *
+ * This option will allow a user to change the maximum burst of packets
+ * that can be emitted by this association.  Note that the default value
+ * is 4, and some implementations may restrict this setting so that it
+ * can only be lowered.
+ *
+ * NOTE: This text doesn't seem right.  Do this on a socket basis with
+ * future associations inheriting the socket value.
+ */
+static int sctp_setsockopt_maxburst(struct sock *sk,
+				    char __user *optval,
+				    unsigned int optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int burst;
+	int id = 0;
+
+	if (optlen == sizeof(struct sctp_assoc_value)) {
+		if (copy_from_user(&params, optval, optlen))
+			return -EFAULT;
+		burst = params.assoc_value;
+		id = params.assoc_id;
+	} else if (optlen == sizeof(int)) {
+		/* Legacy form: a bare int that applies to the socket. */
+		pr_warn("Use of int in max_burst socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
+		if (copy_from_user(&burst, optval, optlen))
+			return -EFAULT;
+	} else {
+		return -EINVAL;
+	}
+
+	/* Id 0 updates the socket value; any other id must resolve to
+	 * an association.
+	 */
+	if (id == 0) {
+		sctp_sk(sk)->max_burst = burst;
+		return 0;
+	}
+
+	asoc = sctp_id2assoc(sk, id);
+	if (asoc == NULL)
+		return -EINVAL;
+
+	asoc->max_burst = burst;
+
+	return 0;
+}
+
+/*
+ * 7.1.18.  Add a chunk that must be authenticated (SCTP_AUTH_CHUNK)
+ *
+ * This set option adds a chunk type that the user is requesting to be
+ * received only in an authenticated way.  Changes to the list of chunks
+ * will only effect future associations on the socket.
+ */
+static int sctp_setsockopt_auth_chunk(struct sock *sk,
+				      char __user *optval,
+				      unsigned int optlen)
+{
+	struct sctp_authchunk req;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (optlen != sizeof(req))
+		return -EINVAL;
+	if (copy_from_user(&req, optval, optlen))
+		return -EFAULT;
+
+	/* These four chunk types are refused. */
+	if (req.sauth_chunk == SCTP_CID_INIT ||
+	    req.sauth_chunk == SCTP_CID_INIT_ACK ||
+	    req.sauth_chunk == SCTP_CID_SHUTDOWN_COMPLETE ||
+	    req.sauth_chunk == SCTP_CID_AUTH)
+		return -EINVAL;
+
+	/* Hand the chunk id to the endpoint's list. */
+	return sctp_auth_ep_add_chunkid(sctp_sk(sk)->ep, req.sauth_chunk);
+}
+
+/*
+ * 7.1.19.  Get or set the list of supported HMAC Identifiers (SCTP_HMAC_IDENT)
+ *
+ * This option gets or sets the list of HMAC algorithms that the local
+ * endpoint requires the peer to use.
+ */
+static int sctp_setsockopt_hmac_ident(struct sock *sk,
+				      char __user *optval,
+				      unsigned int optlen)
+{
+	/* Largest request that can be valid: the fixed header followed
+	 * by SCTP_AUTH_NUM_HMACS 16-bit identifiers.
+	 */
+	const unsigned int max_len = sizeof(struct sctp_hmacalgo) +
+				     SCTP_AUTH_NUM_HMACS * sizeof(u16);
+	struct sctp_hmacalgo *hmacs;
+	u32 idents;
+	int err;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (optlen < sizeof(struct sctp_hmacalgo))
+		return -EINVAL;
+
+	/* Do not let the caller dictate the size of the kernel
+	 * allocation: bytes beyond max_len can never be part of an
+	 * accepted request (idents is capped at SCTP_AUTH_NUM_HMACS
+	 * below), so copy at most that many.
+	 */
+	if (optlen > max_len)
+		optlen = max_len;
+
+	hmacs = memdup_user(optval, optlen);
+	if (IS_ERR(hmacs))
+		return PTR_ERR(hmacs);
+
+	/* The identifier count must be non-zero, within the supported
+	 * maximum, and must fit in the bytes that were actually copied.
+	 */
+	idents = hmacs->shmac_num_idents;
+	if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS ||
+	    (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = sctp_auth_ep_set_hmacs(sctp_sk(sk)->ep, hmacs);
+out:
+	kfree(hmacs);
+	return err;
+}
+
+/*
+ * 7.1.20.  Set a shared key (SCTP_AUTH_KEY)
+ *
+ * This option will set a shared secret key which is used to build an
+ * association shared key.
+ */
+static int sctp_setsockopt_auth_key(struct sock *sk,
+				    char __user *optval,
+				    unsigned int optlen)
+{
+	struct sctp_authkey *authkey;
+	struct sctp_association *asoc;
+	int ret;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	/* The fixed header must be followed by at least one key byte. */
+	if (optlen <= sizeof(struct sctp_authkey))
+		return -EINVAL;
+
+	authkey = memdup_user(optval, optlen);
+	if (IS_ERR(authkey))
+		return PTR_ERR(authkey);
+
+	/* The declared key length must fit in what was copied. */
+	if (authkey->sca_keylength > optlen - sizeof(struct sctp_authkey)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* A non-zero id that matches no association is an error on
+	 * UDP-style sockets; otherwise asoc may be NULL here.
+	 */
+	asoc = sctp_id2assoc(sk, authkey->sca_assoc_id);
+	if (!asoc && authkey->sca_assoc_id && sctp_style(sk, UDP)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = sctp_auth_set_key(sctp_sk(sk)->ep, asoc, authkey);
+out:
+	/* The buffer holds secret key material: wipe it before it goes
+	 * back to the allocator instead of leaving it in freed memory.
+	 */
+	kzfree(authkey);
+	return ret;
+}
+
+/*
+ * 7.1.21.  Get or set the active shared key (SCTP_AUTH_ACTIVE_KEY)
+ *
+ * This option will get or set the active shared key to be used to build
+ * the association shared key.
+ */
+static int sctp_setsockopt_active_key(struct sock *sk,
+				      char __user *optval,
+				      unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_authkeyid keyid;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (optlen != sizeof(keyid))
+		return -EINVAL;
+	if (copy_from_user(&keyid, optval, optlen))
+		return -EFAULT;
+
+	/* A non-zero id that matches no association is an error on
+	 * UDP-style sockets; otherwise asoc may be NULL below.
+	 */
+	asoc = sctp_id2assoc(sk, keyid.scact_assoc_id);
+	if (asoc == NULL && keyid.scact_assoc_id != 0 && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	return sctp_auth_set_active_key(sctp_sk(sk)->ep, asoc,
+					keyid.scact_keynumber);
+}
+
+/*
+ * 7.1.22.  Delete a shared key (SCTP_AUTH_DELETE_KEY)
+ *
+ * This set option will delete a shared secret key from use.
+ */
+static int sctp_setsockopt_del_key(struct sock *sk,
+				   char __user *optval,
+				   unsigned int optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_authkeyid keyid;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (optlen != sizeof(keyid))
+		return -EINVAL;
+	if (copy_from_user(&keyid, optval, optlen))
+		return -EFAULT;
+
+	/* A non-zero id that matches no association is an error on
+	 * UDP-style sockets; otherwise asoc may be NULL below.
+	 */
+	asoc = sctp_id2assoc(sk, keyid.scact_assoc_id);
+	if (asoc == NULL && keyid.scact_assoc_id != 0 && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	return sctp_auth_del_key_id(sctp_sk(sk)->ep, asoc,
+				    keyid.scact_keynumber);
+}
+
+
+/* API 6.2 setsockopt(), getsockopt()
+ *
+ * Applications use setsockopt() and getsockopt() to set or retrieve
+ * socket options.  Socket options are used to change the default
+ * behavior of sockets calls.  They are described in Section 7.
+ *
+ * The syntax is:
+ *
+ *   ret = getsockopt(int sd, int level, int optname, void __user *optval,
+ *                    int __user *optlen);
+ *   ret = setsockopt(int sd, int level, int optname, const void __user *optval,
+ *                    int optlen);
+ *
+ *   sd      - the socket descriptor.
+ *   level   - set to IPPROTO_SCTP for all SCTP options.
+ *   optname - the option name.
+ *   optval  - the buffer to store the value of the option.
+ *   optlen  - the size of the buffer.
+ */
+SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
+				char __user *optval, unsigned int optlen)
+{
+	int rc;
+
+	SCTP_DEBUG_PRINTK("sctp_setsockopt(sk: %p... optname: %d)\n",
+			  sk, optname);
+
+	/* Any level other than SOL_SCTP is passed straight to the
+	 * address family's setsockopt handler, without taking the
+	 * socket lock here.
+	 */
+	if (level != SOL_SCTP)
+		return sctp_sk(sk)->pf->af->setsockopt(sk, level, optname,
+						       optval, optlen);
+
+	/* Every SOL_SCTP option handler runs with the socket locked. */
+	sctp_lock_sock(sk);
+
+	switch (optname) {
+	/* For the bindx/connectx options, optlen is the size of the
+	 * buffer of addresses.
+	 */
+	case SCTP_SOCKOPT_BINDX_ADD:
+		rc = sctp_setsockopt_bindx(sk,
+					   (struct sockaddr __user *)optval,
+					   optlen, SCTP_BINDX_ADD_ADDR);
+		break;
+	case SCTP_SOCKOPT_BINDX_REM:
+		rc = sctp_setsockopt_bindx(sk,
+					   (struct sockaddr __user *)optval,
+					   optlen, SCTP_BINDX_REM_ADDR);
+		break;
+	case SCTP_SOCKOPT_CONNECTX_OLD:
+		rc = sctp_setsockopt_connectx_old(sk,
+					(struct sockaddr __user *)optval,
+					optlen);
+		break;
+	case SCTP_SOCKOPT_CONNECTX:
+		rc = sctp_setsockopt_connectx(sk,
+					(struct sockaddr __user *)optval,
+					optlen);
+		break;
+
+	/* All remaining options share the (sk, optval, optlen) shape. */
+	case SCTP_DISABLE_FRAGMENTS:
+		rc = sctp_setsockopt_disable_fragments(sk, optval, optlen);
+		break;
+	case SCTP_EVENTS:
+		rc = sctp_setsockopt_events(sk, optval, optlen);
+		break;
+	case SCTP_AUTOCLOSE:
+		rc = sctp_setsockopt_autoclose(sk, optval, optlen);
+		break;
+	case SCTP_PEER_ADDR_PARAMS:
+		rc = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
+		break;
+	case SCTP_DELAYED_SACK:
+		rc = sctp_setsockopt_delayed_ack(sk, optval, optlen);
+		break;
+	case SCTP_PARTIAL_DELIVERY_POINT:
+		rc = sctp_setsockopt_partial_delivery_point(sk, optval,
+							    optlen);
+		break;
+	case SCTP_INITMSG:
+		rc = sctp_setsockopt_initmsg(sk, optval, optlen);
+		break;
+	case SCTP_DEFAULT_SEND_PARAM:
+		rc = sctp_setsockopt_default_send_param(sk, optval, optlen);
+		break;
+	case SCTP_PRIMARY_ADDR:
+		rc = sctp_setsockopt_primary_addr(sk, optval, optlen);
+		break;
+	case SCTP_SET_PEER_PRIMARY_ADDR:
+		rc = sctp_setsockopt_peer_primary_addr(sk, optval, optlen);
+		break;
+	case SCTP_NODELAY:
+		rc = sctp_setsockopt_nodelay(sk, optval, optlen);
+		break;
+	case SCTP_RTOINFO:
+		rc = sctp_setsockopt_rtoinfo(sk, optval, optlen);
+		break;
+	case SCTP_ASSOCINFO:
+		rc = sctp_setsockopt_associnfo(sk, optval, optlen);
+		break;
+	case SCTP_I_WANT_MAPPED_V4_ADDR:
+		rc = sctp_setsockopt_mappedv4(sk, optval, optlen);
+		break;
+	case SCTP_MAXSEG:
+		rc = sctp_setsockopt_maxseg(sk, optval, optlen);
+		break;
+	case SCTP_ADAPTATION_LAYER:
+		rc = sctp_setsockopt_adaptation_layer(sk, optval, optlen);
+		break;
+	case SCTP_CONTEXT:
+		rc = sctp_setsockopt_context(sk, optval, optlen);
+		break;
+	case SCTP_FRAGMENT_INTERLEAVE:
+		rc = sctp_setsockopt_fragment_interleave(sk, optval, optlen);
+		break;
+	case SCTP_MAX_BURST:
+		rc = sctp_setsockopt_maxburst(sk, optval, optlen);
+		break;
+	case SCTP_AUTH_CHUNK:
+		rc = sctp_setsockopt_auth_chunk(sk, optval, optlen);
+		break;
+	case SCTP_HMAC_IDENT:
+		rc = sctp_setsockopt_hmac_ident(sk, optval, optlen);
+		break;
+	case SCTP_AUTH_KEY:
+		rc = sctp_setsockopt_auth_key(sk, optval, optlen);
+		break;
+	case SCTP_AUTH_ACTIVE_KEY:
+		rc = sctp_setsockopt_active_key(sk, optval, optlen);
+		break;
+	case SCTP_AUTH_DELETE_KEY:
+		rc = sctp_setsockopt_del_key(sk, optval, optlen);
+		break;
+	default:
+		/* Unknown option name. */
+		rc = -ENOPROTOOPT;
+		break;
+	}
+
+	sctp_release_sock(sk);
+
+	return rc;
+}
+
+/* API 3.1.6 connect() - UDP Style Syntax
+ *
+ * An application may use the connect() call in the UDP model to initiate an
+ * association without sending data.
+ *
+ * The syntax is:
+ *
+ * ret = connect(int sd, const struct sockaddr *nam, socklen_t len);
+ *
+ * sd: the socket descriptor to have a new association added to.
+ *
+ * nam: the address structure (either struct sockaddr_in or struct
+ *    sockaddr_in6 defined in RFC2553 [7]).
+ *
+ * len: the size of the address.
+ */
+SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr,
+			     int addr_len)
+{
+	struct sctp_af *af;
+	int err;
+
+	sctp_lock_sock(sk);
+
+	SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n",
+			  __func__, sk, addr, addr_len);
+
+	/* The family must be known and the caller's buffer must be at
+	 * least as long as that family's sockaddr.  The common routine
+	 * is then given the family's exact sockaddr length, so it sees
+	 * a single address.
+	 */
+	af = sctp_get_af_specific(addr->sa_family);
+	if (af && addr_len >= af->sockaddr_len)
+		err = __sctp_connect(sk, addr, af->sockaddr_len, NULL);
+	else
+		err = -EINVAL;
+
+	sctp_release_sock(sk);
+	return err;
+}
+
+/* Disconnect is not implemented: this stub ignores both arguments and
+ * always reports -EOPNOTSUPP.
+ */
+SCTP_STATIC int sctp_disconnect(struct sock *sk, int flags)
+{
+	return -EOPNOTSUPP; /* STUB */
+}
+
+/* 4.1.4 accept() - TCP Style Syntax
+ *
+ * Applications use accept() call to remove an established SCTP
+ * association from the accept queue of the endpoint.  A new socket
+ * descriptor will be returned from accept() to represent the newly
+ * formed association.
+ */
+SCTP_STATIC struct sock *sctp_accept(struct sock *sk, int flags, int *err)
+{
+	struct sctp_association *first;
+	struct sctp_endpoint *ep;
+	struct sock *child = NULL;
+	struct sctp_sock *sp;
+	long timeout;
+	int rc;
+
+	sctp_lock_sock(sk);
+
+	sp = sctp_sk(sk);
+	ep = sp->ep;
+
+	/* Only TCP-style sockets in the LISTENING state may accept. */
+	if (!sctp_style(sk, TCP)) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+	if (!sctp_sstate(sk, LISTENING)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	timeout = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+	rc = sctp_wait_for_accept(sk, timeout);
+	if (rc)
+		goto out;
+
+	/* The endpoint's association list serves as the accept queue;
+	 * take its first entry.
+	 */
+	first = list_entry(ep->asocs.next, struct sctp_association, asocs);
+
+	child = sp->pf->create_accept_sk(sk, first);
+	if (child == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Fill in the new socket from the listening one and move the
+	 * association over to it.
+	 */
+	sctp_sock_migrate(sk, child, first, SCTP_SOCKET_TCP);
+
+out:
+	sctp_release_sock(sk);
+	*err = rc;
+	return child;
+}
+
+/* The SCTP ioctl handler. */
+SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct sk_buff *head;
+	unsigned int amount;
+	int rc = -ENOTCONN;
+
+	sctp_lock_sock(sk);
+
+	/* Only TCP-style sockets are refused while LISTENING; other
+	 * styles fall through to the command handling below.
+	 */
+	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+		goto out;
+
+	if (cmd != SIOCINQ) {
+		rc = -ENOIOCTLCMD;
+		goto out;
+	}
+
+	/* SIOCINQ: report the length of the first queued skb only, or
+	 * 0 when the receive queue is empty.
+	 */
+	head = skb_peek(&sk->sk_receive_queue);
+	amount = (head != NULL) ? head->len : 0;
+	rc = put_user(amount, (int __user *)arg);
+
+out:
+	sctp_release_sock(sk);
+	return rc;
+}
+
+/* This is the function which gets called during socket creation to
+ * initialize the SCTP-specific portion of the sock.
+ * The sock structure should already be zero-filled memory.
+ */
+SCTP_STATIC int sctp_init_sock(struct sock *sk)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_endpoint *ep;
+
+	SCTP_DEBUG_PRINTK("sctp_init_sock(sk: %p)\n", sk);
+
+	/* Map the socket type onto an SCTP socket style; any other
+	 * type is rejected before anything else is touched.
+	 */
+	if (sk->sk_type == SOCK_SEQPACKET)
+		sp->type = SCTP_SOCKET_UDP;
+	else if (sk->sk_type == SOCK_STREAM)
+		sp->type = SCTP_SOCKET_TCP;
+	else
+		return -ESOCKTNOSUPPORT;
+
+	/* Default send parameters (SCTP_DEFAULT_SEND_PARAM). */
+	sp->default_stream     = 0;
+	sp->default_ppid       = 0;
+	sp->default_flags      = 0;
+	sp->default_context    = 0;
+	sp->default_timetolive = 0;
+
+	sp->default_rcv_context = 0;
+	sp->max_burst = sctp_max_burst;
+
+	/* Default setup parameters (SCTP_INITMSG option, or the
+	 * SCTP_INIT CMSG), seeded from the global settings.
+	 */
+	sp->initmsg.sinit_num_ostreams   = sctp_max_outstreams;
+	sp->initmsg.sinit_max_instreams  = sctp_max_instreams;
+	sp->initmsg.sinit_max_attempts   = sctp_max_retrans_init;
+	sp->initmsg.sinit_max_init_timeo = sctp_rto_max;
+
+	/* Default RTO parameters (SCTP_RTOINFO). */
+	sp->rtoinfo.srto_initial = sctp_rto_initial;
+	sp->rtoinfo.srto_max     = sctp_rto_max;
+	sp->rtoinfo.srto_min     = sctp_rto_min;
+
+	/* Default association parameters (SCTP_ASSOCINFO). */
+	sp->assocparams.sasoc_asocmaxrxt = sctp_max_retrans_association;
+	sp->assocparams.sasoc_number_peer_destinations = 0;
+	sp->assocparams.sasoc_peer_rwnd = 0;
+	sp->assocparams.sasoc_local_rwnd = 0;
+	sp->assocparams.sasoc_cookie_life = sctp_valid_cookie_life;
+
+	/* No event subscriptions to start with. */
+	memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe));
+
+	/* Default peer address parameters (SCTP_PEER_ADDR_PARAMS). */
+	sp->hbinterval  = sctp_hb_interval;
+	sp->pathmaxrxt  = sctp_max_retrans_path;
+	sp->pathmtu     = 0;	/* allow default discovery */
+	sp->sackdelay   = sctp_sack_timeout;
+	sp->sackfreq    = 2;
+	sp->param_flags = SPP_HB_ENABLE |
+			  SPP_PMTUD_ENABLE |
+			  SPP_SACKDELAY_ENABLE;
+
+	/* Fragmentation stays enabled (SCTP_DISABLE_FRAGMENTS). */
+	sp->disable_fragments = 0;
+
+	/* Nagle-like delay is on by default. */
+	sp->nodelay = 0;
+
+	/* V4-mapped addresses are on by default. */
+	sp->v4mapped = 1;
+
+	/* Autoclose of idle associations is off; 0 disables it
+	 * (SCTP_AUTOCLOSE, UDP-style sockets only).
+	 */
+	sp->autoclose = 0;
+
+	/* No user specified fragmentation limit. */
+	sp->user_frag = 0;
+
+	sp->adaptation_ind = 0;
+
+	sp->pf = sctp_get_pf_specific(sk->sk_family);
+
+	/* Partial data delivery control state. */
+	atomic_set(&sp->pd_mode, 0);
+	skb_queue_head_init(&sp->pd_lobby);
+	sp->frag_interleave = 0;
+
+	/* Each socket gets its own endpoint structure. */
+	ep = sctp_endpoint_new(sk, GFP_KERNEL);
+	if (ep == NULL)
+		return -ENOMEM;
+
+	sp->ep = ep;
+	sp->hmac = NULL;
+
+	SCTP_DBG_OBJCNT_INC(sock);
+
+	/* Account for the new socket with bottom halves disabled. */
+	local_bh_disable();
+	percpu_counter_inc(&sctp_sockets_allocated);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	local_bh_enable();
+
+	return 0;
+}
+
+/* Cleanup any SCTP per socket resources.
+ *
+ * This undoes what sctp_init_sock() set up: it frees the socket's
+ * endpoint and reverses the socket accounting.  It may also be called
+ * for a socket whose sctp_init_sock() failed, in which case there is
+ * nothing to undo.
+ */
+SCTP_STATIC void sctp_destroy_sock(struct sock *sk)
+{
+	struct sctp_endpoint *ep;
+
+	SCTP_DEBUG_PRINTK("sctp_destroy_sock(sk: %p)\n", sk);
+
+	/* sctp_init_sock() can fail before it assigns sp->ep (bad
+	 * socket type, or endpoint allocation failure).  On that path
+	 * the accounting below was never incremented either, so bail
+	 * out instead of freeing a NULL endpoint and unbalancing the
+	 * counters.
+	 */
+	ep = sctp_sk(sk)->ep;
+	if (ep == NULL)
+		return;
+
+	/* Release our hold on the endpoint. */
+	sctp_endpoint_free(ep);
+	local_bh_disable();
+	percpu_counter_dec(&sctp_sockets_allocated);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	local_bh_enable();
+}
+
+/* API 4.1.7 shutdown() - TCP Style Syntax
+ *     int shutdown(int socket, int how);
+ *
+ *     sd      - the socket descriptor of the association to be closed.
+ *     how     - Specifies the type of shutdown.  The  values  are
+ *               as follows:
+ *               SHUT_RD
+ *                     Disables further receive operations. No SCTP
+ *                     protocol action is taken.
+ *               SHUT_WR
+ *                     Disables further send operations, and initiates
+ *                     the SCTP shutdown sequence.
+ *               SHUT_RDWR
+ *                     Disables further send  and  receive  operations
+ *                     and initiates the SCTP shutdown sequence.
+ */
+SCTP_STATIC void sctp_shutdown(struct sock *sk, int how)
+{
+	struct sctp_association *first;
+	struct list_head *assocs;
+
+	/* Only TCP-style sockets act on shutdown(). */
+	if (!sctp_style(sk, TCP))
+		return;
+
+	/* Nothing to do unless the send side is being shut down. */
+	if (!(how & SEND_SHUTDOWN))
+		return;
+
+	assocs = &sctp_sk(sk)->ep->asocs;
+	if (list_empty(assocs))
+		return;
+
+	/* Issue the SHUTDOWN primitive on the first association linked
+	 * on the endpoint.
+	 */
+	first = list_entry(assocs->next, struct sctp_association, asocs);
+	sctp_primitive_SHUTDOWN(first, NULL);
+}
+
+/* 7.2.1 Association Status (SCTP_STATUS)
+
+ * Applications can retrieve current status information about an
+ * association, including association state, peer receiver window size,
+ * number of unacked data chunks, and number of data chunks pending
+ * receipt.  This information is read-only.
+ */
+static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
+				       char __user *optval,
+				       int __user *optlen)
+{
+	struct sctp_association *asoc;
+	struct sctp_transport *transport;
+	struct sctp_status status;
+
+	/* The user buffer must be able to hold a whole sctp_status. */
+	if (len < sizeof(status))
+		return -EINVAL;
+	len = sizeof(status);
+
+	/* Fetch the request; sstat_assoc_id selects the association. */
+	if (copy_from_user(&status, optval, len))
+		return -EFAULT;
+
+	asoc = sctp_id2assoc(sk, status.sstat_assoc_id);
+	if (!asoc)
+		return -EINVAL;
+
+	transport = asoc->peer.primary_path;
+
+	/* Association-wide figures. */
+	status.sstat_assoc_id = sctp_assoc2id(asoc);
+	status.sstat_state = asoc->state;
+	status.sstat_rwnd = asoc->peer.rwnd;
+	status.sstat_unackdata = asoc->unack_data;
+	status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+	status.sstat_instrms = asoc->c.sinit_max_instreams;
+	status.sstat_outstrms = asoc->c.sinit_num_ostreams;
+	status.sstat_fragmentation_point = asoc->frag_point;
+
+	/* Figures describing the primary path. */
+	status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
+	memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr,
+	       transport->af_specific->sockaddr_len);
+	/* Map ipv4 address into v4-mapped-on-v6 address.  */
+	sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
+		(union sctp_addr *)&status.sstat_primary.spinfo_address);
+	status.sstat_primary.spinfo_state = transport->state;
+	status.sstat_primary.spinfo_cwnd = transport->cwnd;
+	status.sstat_primary.spinfo_srtt = transport->srtt;
+	status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
+	status.sstat_primary.spinfo_mtu = transport->pathmtu;
+
+	/* A path still in SCTP_UNKNOWN is reported as SCTP_ACTIVE. */
+	if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
+		status.sstat_primary.spinfo_state = SCTP_ACTIVE;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	SCTP_DEBUG_PRINTK("sctp_getsockopt_sctp_status(%d): %d %d %d\n",
+			  len, status.sstat_state, status.sstat_rwnd,
+			  status.sstat_assoc_id);
+
+	if (copy_to_user(optval, &status, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+
+/* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO)
+ *
+ * Applications can retrieve information about a specific peer address
+ * of an association, including its reachability state, congestion
+ * window, and retransmission timer values.  This information is
+ * read-only.
+ */
+static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
+					  char __user *optval,
+					  int __user *optlen)
+{
+	struct sctp_transport *transport;
+	struct sctp_paddrinfo pinfo;
+
+	/* The user buffer must be able to hold a whole sctp_paddrinfo. */
+	if (len < sizeof(pinfo))
+		return -EINVAL;
+	len = sizeof(pinfo);
+
+	if (copy_from_user(&pinfo, optval, len))
+		return -EFAULT;
+
+	/* The request names the peer address and the association id. */
+	transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address,
+					   pinfo.spinfo_assoc_id);
+	if (!transport)
+		return -EINVAL;
+
+	pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
+	pinfo.spinfo_state = transport->state;
+	pinfo.spinfo_cwnd = transport->cwnd;
+	pinfo.spinfo_srtt = transport->srtt;
+	pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
+	pinfo.spinfo_mtu = transport->pathmtu;
+
+	/* A path still in SCTP_UNKNOWN is reported as SCTP_ACTIVE. */
+	if (pinfo.spinfo_state == SCTP_UNKNOWN)
+		pinfo.spinfo_state = SCTP_ACTIVE;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &pinfo, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
+ *
+ * This option is an on/off flag.  If enabled no SCTP message
+ * fragmentation will be performed.  Instead if a message being sent
+ * exceeds the current PMTU size, the message will NOT be sent and
+ * instead an error will be indicated to the user.
+ */
+static int sctp_getsockopt_disable_fragments(struct sock *sk, int len,
+					char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	int enabled;
+
+	/* The option value is a single int. */
+	if (len < sizeof(int))
+		return -EINVAL;
+	len = sizeof(int);
+
+	/* Report exactly 1 or 0. */
+	enabled = (sp->disable_fragments == 1) ? 1 : 0;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &enabled, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* 7.1.15 Set notification and ancillary events (SCTP_EVENTS)
+ *
+ * This socket option is used to specify various notifications and
+ * ancillary data the user wishes to receive.
+ */
+static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval,
+				  int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+
+	if (len <= 0)
+		return -EINVAL;
+
+	/* A short buffer receives a truncated copy; never copy more
+	 * than the subscription structure holds.
+	 */
+	if (len > sizeof(struct sctp_event_subscribe))
+		len = sizeof(struct sctp_event_subscribe);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &sp->subscribe, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE)
+ *
+ * This socket option is applicable to the UDP-style socket only.  When
+ * set it will cause associations that are idle for more than the
+ * specified number of seconds to automatically close.  An association
+ * being idle is defined as an association that has NOT sent or received
+ * user data.  The special value of '0' indicates that no automatic
+ * close of any associations should be performed.  The option expects an
+ * integer defining the number of seconds of idle time before an
+ * association is closed.
+ */
+static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+
+	/* Applicable to UDP-style socket only */
+	if (sctp_style(sk, TCP))
+		return -EOPNOTSUPP;
+
+	/* The option value is a single int. */
+	if (len < sizeof(int))
+		return -EINVAL;
+	len = sizeof(int);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &sp->autoclose, sizeof(int)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Helper routine to branch off an association to a new socket.
+ *
+ * @asoc:  association to move; its current socket must be UDP-style
+ * @sockp: on success receives the newly created socket that now owns
+ *         the association
+ *
+ * Returns -EINVAL if the owning socket is not UDP-style, the negative
+ * error from sock_create() if the new socket cannot be created, and
+ * otherwise the (non-negative) result of sock_create().
+ */
+SCTP_STATIC int sctp_do_peeloff(struct sctp_association *asoc,
+				struct socket **sockp)
+{
+	struct sock *sk = asoc->base.sk;
+	struct socket *sock;
+	struct sctp_af *af;
+	int err = 0;
+
+	/* An association cannot be branched off from an already peeled-off
+	 * socket, nor is this supported for tcp style sockets.
+	 */
+	if (!sctp_style(sk, UDP))
+		return -EINVAL;
+
+	/* Create a new socket.  */
+	err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock);
+	if (err < 0)
+		return err;
+
+	sctp_copy_sock(sock->sk, sk, asoc);
+
+	/* Make peeled-off sockets more like 1-1 accepted sockets.
+	 * Set the daddr and initialize id to something more random.
+	 * The peer address belongs on the new socket; writing it to the
+	 * parent socket would stamp one association's peer on it instead.
+	 */
+	af = sctp_get_af_specific(asoc->peer.primary_addr.sa.sa_family);
+	af->to_sk_daddr(&asoc->peer.primary_addr, sock->sk);
+
+	/* Populate the fields of the newsk from the oldsk and migrate the
+	 * asoc to the newsk.
+	 */
+	sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH);
+
+	*sockp = sock;
+
+	return err;
+}
+
+/* Peel an association off into its own socket and hand the caller a
+ * file descriptor for it.  On success the return value is that
+ * descriptor, which is also stored in the sd member of the user's
+ * sctp_peeloff_arg_t.
+ */
+static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen)
+{
+	struct sctp_association *asoc;
+	struct socket *newsock;
+	sctp_peeloff_arg_t peeloff;
+	int err;
+	int fd;
+
+	if (len < sizeof(sctp_peeloff_arg_t))
+		return -EINVAL;
+	len = sizeof(sctp_peeloff_arg_t);
+	if (copy_from_user(&peeloff, optval, len))
+		return -EFAULT;
+
+	/* The request names the association to branch off. */
+	asoc = sctp_id2assoc(sk, peeloff.associd);
+	if (!asoc)
+		return -EINVAL;
+
+	SCTP_DEBUG_PRINTK("%s: sk: %p asoc: %p\n", __func__, sk, asoc);
+
+	err = sctp_do_peeloff(asoc, &newsock);
+	if (err < 0)
+		return err;
+
+	/* Map the socket to an unused fd that can be returned to the user.  */
+	fd = sock_map_fd(newsock, 0);
+	if (fd < 0) {
+		sock_release(newsock);
+		return fd;
+	}
+
+	SCTP_DEBUG_PRINTK("%s: sk: %p asoc: %p newsk: %p sd: %d\n",
+			  __func__, sk, asoc, newsock->sk, fd);
+
+	/* Return the fd mapped to the new socket.
+	 * NOTE(review): if either user copy below fails, -EFAULT is
+	 * returned without releasing the descriptor obtained above -
+	 * presumably it stays installed in the caller's fd table; confirm.
+	 */
+	peeloff.sd = fd;
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &peeloff, len))
+		return -EFAULT;
+
+	return fd;
+}
+
+/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS)
+ *
+ * Applications can enable or disable heartbeats for any peer address of
+ * an association, modify an address's heartbeat interval, force a
+ * heartbeat to be sent immediately, and adjust the address's maximum
+ * number of retransmissions sent before an address is considered
+ * unreachable.  The following structure is used to access and modify an
+ * address's parameters:
+ *
+ *  struct sctp_paddrparams {
+ *     sctp_assoc_t            spp_assoc_id;
+ *     struct sockaddr_storage spp_address;
+ *     uint32_t                spp_hbinterval;
+ *     uint16_t                spp_pathmaxrxt;
+ *     uint32_t                spp_pathmtu;
+ *     uint32_t                spp_sackdelay;
+ *     uint32_t                spp_flags;
+ * };
+ *
+ *   spp_assoc_id    - (one-to-many style socket) This is filled in the
+ *                     application, and identifies the association for
+ *                     this query.
+ *   spp_address     - This specifies which address is of interest.
+ *   spp_hbinterval  - This contains the value of the heartbeat interval,
+ *                     in milliseconds.  If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
+ *   spp_pathmaxrxt  - This contains the maximum number of
+ *                     retransmissions before this address shall be
+ *                     considered unreachable. If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
+ *   spp_pathmtu     - When Path MTU discovery is disabled the value
+ *                     specified here will be the "fixed" path mtu.
+ *                     Note that if the spp_address field is empty
+ *                     then all associations on this address will
+ *                     have this fixed path mtu set upon them.
+ *
+ *   spp_sackdelay   - When delayed sack is enabled, this value specifies
+ *                     the number of milliseconds that sacks will be delayed
+ *                     for. This value will apply to all addresses of an
+ *                     association if the spp_address field is empty. Note
+ *                     also, that if delayed sack is enabled and this
+ *                     value is set to 0, no change is made to the last
+ *                     recorded delayed sack timer value.
+ *
+ *   spp_flags       - These flags are used to control various features
+ *                     on an association. The flag field may contain
+ *                     zero or more of the following options.
+ *
+ *                     SPP_HB_ENABLE  - Enable heartbeats on the
+ *                     specified address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     have heartbeats enabled upon them.
+ *
+ *                     SPP_HB_DISABLE - Disable heartbeats on the
+ *                     specified address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     will have their heartbeats disabled. Note also
+ *                     that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ *                     mutually exclusive, only one of these two should
+ *                     be specified. Enabling both fields will have
+ *                     undetermined results.
+ *
+ *                     SPP_HB_DEMAND - Request a user initiated heartbeat
+ *                     to be made immediately.
+ *
+ *                     SPP_PMTUD_ENABLE - This field will enable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address field is empty then all addresses
+ *                     on the association are affected.
+ *
+ *                     SPP_PMTUD_DISABLE - This field will disable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address field is empty then all addresses
+ *                     on the association are affected. Note also that
+ *                     SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ *                     exclusive. Enabling both will have undetermined
+ *                     results.
+ *
+ *                     SPP_SACKDELAY_ENABLE - Setting this flag turns
+ *                     on delayed sack. The time specified in spp_sackdelay
+ *                     is used to specify the sack delay for this address. Note
+ *                     that if spp_address is empty then all addresses will
+ *                     enable delayed sack and take on the sack delay
+ *                     value specified in spp_sackdelay.
+ *                     SPP_SACKDELAY_DISABLE - Setting this flag turns
+ *                     off delayed sack. If the spp_address field is blank then
+ *                     delayed sack is disabled for the entire association. Note
+ *                     also that this field is mutually exclusive to
+ *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
+ *                     results.
+ */
+static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
+					    char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_transport *trans = NULL;
+	struct sctp_paddrparams params;
+
+	if (len < sizeof(params))
+		return -EINVAL;
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		return -EFAULT;
+
+	/* If an address other than INADDR_ANY is specified, and
+	 * no transport is found, then the request is invalid.
+	 */
+	if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) {
+		trans = sctp_addr_id2transport(sk, &params.spp_address,
+					       params.spp_assoc_id);
+		if (!trans) {
+			SCTP_DEBUG_PRINTK("Failed no transport\n");
+			return -EINVAL;
+		}
+	}
+
+	/* A non-zero association id on a one-to-many style socket must
+	 * resolve to an association.
+	 */
+	asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+	if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
+		SCTP_DEBUG_PRINTK("Failed no association\n");
+		return -EINVAL;
+	}
+
+	/* Most specific source wins: transport, then association, then
+	 * the socket-level values.  The draft does not say what to return
+	 * in spp_flags, so the stored parameter flags are reported.
+	 */
+	if (trans) {
+		params.spp_flags      = trans->param_flags;
+		params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
+		params.spp_sackdelay  = jiffies_to_msecs(trans->sackdelay);
+		params.spp_pathmtu    = trans->pathmtu;
+		params.spp_pathmaxrxt = trans->pathmaxrxt;
+	} else if (asoc) {
+		params.spp_flags      = asoc->param_flags;
+		params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
+		params.spp_sackdelay  = jiffies_to_msecs(asoc->sackdelay);
+		params.spp_pathmtu    = asoc->pathmtu;
+		params.spp_pathmaxrxt = asoc->pathmaxrxt;
+	} else {
+		/* Socket values are copied without jiffies conversion. */
+		params.spp_flags      = sp->param_flags;
+		params.spp_hbinterval = sp->hbinterval;
+		params.spp_sackdelay  = sp->sackdelay;
+		params.spp_pathmtu    = sp->pathmtu;
+		params.spp_pathmaxrxt = sp->pathmaxrxt;
+	}
+
+	if (copy_to_user(optval, &params, len))
+		return -EFAULT;
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.23.  Get or set delayed ack timer (SCTP_DELAYED_SACK)
+ *
+ * This option will affect the way delayed acks are performed.  This
+ * option allows you to get or set the delayed ack time, in
+ * milliseconds.  It also allows changing the delayed ack frequency.
+ * Changing the frequency to 1 disables the delayed sack algorithm.  If
+ * the assoc_id is 0, then this sets or gets the endpoints default
+ * values.  If the assoc_id field is non-zero, then the set or get
+ * affects the specified association for the one to many model (the
+ * assoc_id field is ignored by the one to one model).  Note that if
+ * sack_delay or sack_freq are 0 when setting this option, then the
+ * current values will remain unchanged.
+ *
+ * struct sctp_sack_info {
+ *     sctp_assoc_t            sack_assoc_id;
+ *     uint32_t                sack_delay;
+ *     uint32_t                sack_freq;
+ * };
+ *
+ * sack_assoc_id -  This parameter, indicates which association the user
+ *    is performing an action upon.  Note that if this field's value is
+ *    zero then the endpoint's default value is changed (affecting future
+ *    associations only).
+ *
+ * sack_delay -  This parameter contains the number of milliseconds that
+ *    the user is requesting the delayed ACK timer be set to.  Note that
+ *    this value is defined in the standard to be between 200 and 500
+ *    milliseconds.
+ *
+ * sack_freq -  This parameter contains the number of packets that must
+ *    be received before a sack is sent without waiting for the delay
+ *    timer to expire.  The default value for this is 2, setting this
+ *    value to 1 will disable the delayed sack algorithm.
+ */
+static int sctp_getsockopt_delayed_ack(struct sock *sk, int len,
+					    char __user *optval,
+					    int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_sack_info params;
+
+	/* Two request layouts are accepted: the current sctp_sack_info
+	 * (larger buffers are clamped to it) and the deprecated
+	 * sctp_assoc_value, which must match in size exactly.
+	 */
+	if (len >= sizeof(struct sctp_sack_info)) {
+		len = sizeof(struct sctp_sack_info);
+	} else if (len == sizeof(struct sctp_assoc_value)) {
+		pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
+		pr_warn("Use struct sctp_sack_info instead\n");
+	} else {
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&params, optval, len))
+		return -EFAULT;
+
+	/* Get association, if sack_assoc_id != 0 and the socket is a one
+	 * to many style socket, and an association was not found, then
+	 * the id was invalid.
+	 */
+	asoc = sctp_id2assoc(sk, params.sack_assoc_id);
+	if (!asoc && params.sack_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	/* Start from the "delayed sack disabled" answer and override it
+	 * when the selected source has SPP_SACKDELAY_ENABLE set.
+	 */
+	params.sack_delay = 0;
+	params.sack_freq = 1;
+
+	if (asoc) {
+		/* Association values; the delay is kept in jiffies. */
+		if (asoc->param_flags & SPP_SACKDELAY_ENABLE) {
+			params.sack_delay = jiffies_to_msecs(asoc->sackdelay);
+			params.sack_freq = asoc->sackfreq;
+		}
+	} else if (sp->param_flags & SPP_SACKDELAY_ENABLE) {
+		/* Socket values are copied without conversion. */
+		params.sack_delay = sp->sackdelay;
+		params.sack_freq = sp->sackfreq;
+	}
+
+	if (copy_to_user(optval, &params, len))
+		return -EFAULT;
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* 7.1.3 Initialization Parameters (SCTP_INITMSG)
+ *
+ * Applications can specify protocol parameters for the default association
+ * initialization.  The option name argument to setsockopt() and getsockopt()
+ * is SCTP_INITMSG.
+ *
+ * Setting initialization parameters is effective only on an unconnected
+ * socket (for UDP-style sockets only future associations are affected
+ * by the change).  With TCP-style sockets, this option is inherited by
+ * sockets derived from a listener socket.
+ */
+static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+
+	/* The user buffer must be able to hold a whole sctp_initmsg. */
+	if (len < sizeof(struct sctp_initmsg))
+		return -EINVAL;
+	len = sizeof(struct sctp_initmsg);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &sp->initmsg, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+
+/* Copy the peer transport addresses of one association to user space.
+ *
+ * The user buffer starts with a struct sctp_getaddrs whose assoc_id
+ * selects the association.  Addresses are written back to back from
+ * the addrs member onwards, addr_num receives their count, and *optlen
+ * receives the offset just past the last address written.
+ */
+static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
+				      char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_transport *peer;
+	struct sctp_getaddrs hdr;
+	union sctp_addr addr;
+	void __user *dst;
+	size_t room;
+	int naddrs = 0;
+	int written;
+	int alen;
+
+	if (len < sizeof(struct sctp_getaddrs))
+		return -EINVAL;
+	if (copy_from_user(&hdr, optval, sizeof(struct sctp_getaddrs)))
+		return -EFAULT;
+
+	/* For UDP-style sockets, id specifies the association to query.  */
+	asoc = sctp_id2assoc(sk, hdr.assoc_id);
+	if (!asoc)
+		return -EINVAL;
+
+	dst = optval + offsetof(struct sctp_getaddrs, addrs);
+	room = len - offsetof(struct sctp_getaddrs, addrs);
+
+	list_for_each_entry(peer, &asoc->peer.transport_addr_list,
+			    transports) {
+		/* Work on a copy so the v4-mapping never touches the
+		 * transport's own address.
+		 */
+		memcpy(&addr, &peer->ipaddr, sizeof(addr));
+		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &addr);
+		alen = sctp_get_af_specific(addr.sa.sa_family)->sockaddr_len;
+
+		if (room < alen)
+			return -ENOMEM;
+		if (copy_to_user(dst, &addr, alen))
+			return -EFAULT;
+
+		dst += alen;
+		room -= alen;
+		naddrs++;
+	}
+
+	if (put_user(naddrs, &((struct sctp_getaddrs __user *)optval)->addr_num))
+		return -EFAULT;
+
+	written = ((char __user *)dst) - optval;
+	if (put_user(written, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Copy the valid entries of the global local address list into the
+ * kernel buffer @to, skipping entries whose family does not suit @sk.
+ *
+ * Entries with a zero port are given @port.  *bytes_copied is advanced
+ * by the size of every address stored.  Returns the number of addresses
+ * stored, or -ENOMEM when @space_left cannot hold the next address.
+ */
+static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to,
+			    size_t space_left, int *bytes_copied)
+{
+	struct sctp_sockaddr_entry *entry;
+	union sctp_addr scratch;
+	int stored = 0;
+	int alen;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &sctp_local_addr_list, list) {
+		if (!entry->valid)
+			continue;
+
+		/* An IPv4 socket never reports IPv6 addresses. */
+		if (sk->sk_family == PF_INET &&
+		    entry->a.sa.sa_family == AF_INET6)
+			continue;
+
+		/* A v6-only IPv6 socket never reports IPv4 addresses. */
+		if (sk->sk_family == PF_INET6 &&
+		    inet_v6_ipv6only(sk) &&
+		    entry->a.sa.sa_family == AF_INET)
+			continue;
+
+		memcpy(&scratch, &entry->a, sizeof(scratch));
+		if (!scratch.v4.sin_port)
+			scratch.v4.sin_port = htons(port);
+
+		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
+								&scratch);
+		alen = sctp_get_af_specific(scratch.sa.sa_family)->sockaddr_len;
+		if (space_left < alen) {
+			stored = -ENOMEM;
+			break;
+		}
+
+		memcpy(to, &scratch, alen);
+		to += alen;
+		space_left -= alen;
+		*bytes_copied += alen;
+		stored++;
+	}
+	rcu_read_unlock();
+
+	return stored;
+}
+
+
+/* Copy the local addresses of the socket, or of one of its associations,
+ * to user space.
+ *
+ * @sk:     socket being queried
+ * @len:    size of the user buffer at @optval
+ * @optval: user buffer; starts with a struct sctp_getaddrs whose assoc_id
+ *          selects what to report (0 = the endpoint's bound addresses).
+ *          Addresses are written from its addrs member onwards and
+ *          addr_num receives their count.
+ * @optlen: receives the number of address bytes written
+ *
+ * Returns 0 on success, -EINVAL for a short buffer or unknown
+ * association id, -ENOMEM when the staging buffer cannot be allocated
+ * or the user buffer is too small for all addresses, and -EFAULT when
+ * a user copy fails.
+ */
+static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
+				       char __user *optval, int __user *optlen)
+{
+	struct sctp_bind_addr *bp;
+	struct sctp_association *asoc;
+	int cnt = 0;
+	struct sctp_getaddrs getaddrs;
+	struct sctp_sockaddr_entry *addr;
+	void __user *to;
+	union sctp_addr temp;
+	struct sctp_sock *sp = sctp_sk(sk);
+	int addrlen;
+	int err = 0;
+	size_t space_left;
+	int bytes_copied = 0;
+	void *addrs;
+	void *buf;
+
+	if (len < sizeof(struct sctp_getaddrs))
+		return -EINVAL;
+
+	if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs)))
+		return -EFAULT;
+
+	/*
+	 *  For UDP-style sockets, id specifies the association to query.
+	 *  If the id field is set to the value '0' then the locally bound
+	 *  addresses are returned without regard to any particular
+	 *  association.
+	 */
+	if (0 == getaddrs.assoc_id) {
+		bp = &sctp_sk(sk)->ep->base.bind_addr;
+	} else {
+		asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
+		if (!asoc)
+			return -EINVAL;
+		bp = &asoc->base.bind_addr;
+	}
+
+	to = optval + offsetof(struct sctp_getaddrs,addrs);
+	space_left = len - offsetof(struct sctp_getaddrs,addrs);
+
+	/* The staging buffer is sized directly from the user-supplied
+	 * length, so an oversized request must fail quietly with -ENOMEM
+	 * instead of triggering an allocation-failure warning.
+	 */
+	addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN);
+	if (!addrs)
+		return -ENOMEM;
+
+	/* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
+	 * addresses from the global local address list.
+	 */
+	if (sctp_list_single_entry(&bp->address_list)) {
+		addr = list_entry(bp->address_list.next,
+				  struct sctp_sockaddr_entry, list);
+		if (sctp_is_any(sk, &addr->a)) {
+			cnt = sctp_copy_laddrs(sk, bp->port, addrs,
+						space_left, &bytes_copied);
+			if (cnt < 0) {
+				err = cnt;
+				goto out;
+			}
+			goto copy_getaddrs;
+		}
+	}
+
+	buf = addrs;
+	/* Protection on the bound address list is not needed since
+	 * in the socket option context we hold a socket lock and
+	 * thus the bound address list can't change.
+	 */
+	list_for_each_entry(addr, &bp->address_list, list) {
+		/* Map a private copy so the bound entry stays untouched. */
+		memcpy(&temp, &addr->a, sizeof(temp));
+		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
+		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
+		if (space_left < addrlen) {
+			err =  -ENOMEM; /*fixme: right error?*/
+			goto out;
+		}
+		memcpy(buf, &temp, addrlen);
+		buf += addrlen;
+		bytes_copied += addrlen;
+		cnt ++;
+		space_left -= addrlen;
+	}
+
+copy_getaddrs:
+	/* Only the bytes actually filled in are handed to user space. */
+	if (copy_to_user(to, addrs, bytes_copied)) {
+		err = -EFAULT;
+		goto out;
+	}
+	if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) {
+		err = -EFAULT;
+		goto out;
+	}
+	if (put_user(bytes_copied, optlen))
+		err = -EFAULT;
+out:
+	kfree(addrs);
+	return err;
+}
+
+/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
+ *
+ * Requests that the local SCTP stack use the enclosed peer address as
+ * the association primary.  The enclosed address must be one of the
+ * association peer's addresses.
+ */
+static int sctp_getsockopt_primary_addr(struct sock *sk, int len,
+					char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_transport *primary;
+	struct sctp_prim prim;
+
+	if (len < sizeof(struct sctp_prim))
+		return -EINVAL;
+	len = sizeof(struct sctp_prim);
+
+	if (copy_from_user(&prim, optval, len))
+		return -EFAULT;
+
+	/* The request names the association to query. */
+	asoc = sctp_id2assoc(sk, prim.ssp_assoc_id);
+	if (!asoc)
+		return -EINVAL;
+
+	primary = asoc->peer.primary_path;
+	if (!primary)
+		return -ENOTCONN;
+
+	/* Report the primary path's address, v4-mapped when the socket
+	 * family's mapping hook decides so.
+	 */
+	memcpy(&prim.ssp_addr, &primary->ipaddr,
+	       primary->af_specific->sockaddr_len);
+	sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp,
+			(union sctp_addr *)&prim.ssp_addr);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &prim, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.11  Set Adaptation Layer Indicator (SCTP_ADAPTATION_LAYER)
+ *
+ * Requests that the local endpoint set the specified Adaptation Layer
+ * Indication parameter for all future INIT and INIT-ACK exchanges.
+ */
+static int sctp_getsockopt_adaptation_layer(struct sock *sk, int len,
+				  char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_setadaptation adaptation;
+
+	if (len < sizeof(struct sctp_setadaptation))
+		return -EINVAL;
+	len = sizeof(struct sctp_setadaptation);
+
+	/* Report the indication currently stored on the socket. */
+	adaptation.ssb_adaptation_ind = sp->adaptation_ind;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &adaptation, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *
+ * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM)
+ *
+ *   Applications that wish to use the sendto() system call may wish to
+ *   specify a default set of parameters that would normally be supplied
+ *   through the inclusion of ancillary data.  This socket option allows
+ *   such an application to set the default sctp_sndrcvinfo structure.
+
+
+ *   The application that wishes to use this socket option simply passes
+ *   in to this call the sctp_sndrcvinfo structure defined in Section
+ *   5.2.2) The input parameters accepted by this call include
+ *   sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context,
+ *   sinfo_timetolive.  The user must provide the sinfo_assoc_id field in
+ *   to this call if the caller is using the UDP model.
+ *
+ *   For getsockopt, it gets the default sctp_sndrcvinfo structure.
+ */
+static int sctp_getsockopt_default_send_param(struct sock *sk,
+					int len, char __user *optval,
+					int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_sndrcvinfo info;
+
+	if (len < sizeof(struct sctp_sndrcvinfo))
+		return -EINVAL;
+	len = sizeof(struct sctp_sndrcvinfo);
+
+	if (copy_from_user(&info, optval, len))
+		return -EFAULT;
+
+	/* A non-zero association id on a UDP-style socket must resolve
+	 * to an association.
+	 */
+	asoc = sctp_id2assoc(sk, info.sinfo_assoc_id);
+	if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	if (!asoc) {
+		/* No association: report the socket-level defaults. */
+		info.sinfo_stream = sp->default_stream;
+		info.sinfo_flags = sp->default_flags;
+		info.sinfo_ppid = sp->default_ppid;
+		info.sinfo_context = sp->default_context;
+		info.sinfo_timetolive = sp->default_timetolive;
+	} else {
+		/* Report the defaults stored on the association. */
+		info.sinfo_stream = asoc->default_stream;
+		info.sinfo_flags = asoc->default_flags;
+		info.sinfo_ppid = asoc->default_ppid;
+		info.sinfo_context = asoc->default_context;
+		info.sinfo_timetolive = asoc->default_timetolive;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &info, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *
+ * 7.1.5 SCTP_NODELAY
+ *
+ * Turn on/off any Nagle-like algorithm.  This means that packets are
+ * generally sent as soon as possible and no unnecessary delays are
+ * introduced, at the cost of more packets in the network.  Expects an
+ * integer boolean flag.
+ */
+
+static int sctp_getsockopt_nodelay(struct sock *sk, int len,
+				   char __user *optval, int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	int enabled;
+
+	/* The option value is a single int. */
+	if (len < sizeof(int))
+		return -EINVAL;
+	len = sizeof(int);
+
+	/* Report exactly 1 or 0. */
+	enabled = (sp->nodelay == 1) ? 1 : 0;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &enabled, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *
+ * 7.1.1 SCTP_RTOINFO
+ *
+ * The protocol parameters used to initialize and bound retransmission
+ * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access
+ * and modify these parameters.
+ * All parameters are time values, in milliseconds.  A value of 0, when
+ * modifying the parameters, indicates that the current value should not
+ * be changed.
+ *
+ */
+static int sctp_getsockopt_rtoinfo(struct sock *sk, int len,
+				char __user *optval,
+				int __user *optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_association *asoc;
+	struct sctp_rtoinfo rtoinfo;
+
+	if (len < sizeof (struct sctp_rtoinfo))
+		return -EINVAL;
+	len = sizeof(struct sctp_rtoinfo);
+
+	if (copy_from_user(&rtoinfo, optval, len))
+		return -EFAULT;
+
+	/* A non-zero association id on a UDP-style socket must resolve
+	 * to an association.
+	 */
+	asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id);
+	if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	if (!asoc) {
+		/* Values corresponding to the endpoint, copied as stored. */
+		rtoinfo.srto_initial = sp->rtoinfo.srto_initial;
+		rtoinfo.srto_max = sp->rtoinfo.srto_max;
+		rtoinfo.srto_min = sp->rtoinfo.srto_min;
+	} else {
+		/* Values corresponding to the specific association,
+		 * converted from jiffies to milliseconds.
+		 */
+		rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial);
+		rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max);
+		rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min);
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &rtoinfo, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *
+ * 7.1.2 SCTP_ASSOCINFO
+ *
+ * This option is used to tune the maximum retransmission attempts
+ * of the association.
+ * Returns an error if the new association retransmission value is
+ * greater than the sum of the retransmission value of the peer.
+ * See [SCTP] for more information.
+ *
+ */
+static int sctp_getsockopt_associnfo(struct sock *sk, int len,
+				     char __user *optval,
+				     int __user *optlen)
+{
+	struct sctp_assocparams params;
+	struct sctp_association *assoc;
+
+	if (len < sizeof(params))
+		return -EINVAL;
+	len = sizeof(params);
+
+	/* sasoc_assoc_id is read from the caller to pick the association. */
+	if (copy_from_user(&params, optval, len))
+		return -EFAULT;
+
+	assoc = sctp_id2assoc(sk, params.sasoc_assoc_id);
+	if (assoc) {
+		/* Values corresponding to the specific association. */
+		struct list_head *entry;
+		int peers = 0;
+
+		list_for_each(entry, &assoc->peer.transport_addr_list)
+			peers++;
+
+		params.sasoc_asocmaxrxt = assoc->max_retrans;
+		params.sasoc_peer_rwnd = assoc->peer.rwnd;
+		params.sasoc_local_rwnd = assoc->a_rwnd;
+		/* cookie_life carries tv_sec/tv_usec; report milliseconds. */
+		params.sasoc_cookie_life =
+			(assoc->cookie_life.tv_sec * 1000) +
+			(assoc->cookie_life.tv_usec / 1000);
+		params.sasoc_number_peer_destinations = peers;
+	} else if (params.sasoc_assoc_id && sctp_style(sk, UDP)) {
+		/* Unknown non-zero id on a UDP-style socket. */
+		return -EINVAL;
+	} else {
+		/* Values corresponding to the endpoint. */
+		struct sctp_sock *sp = sctp_sk(sk);
+
+		params.sasoc_asocmaxrxt = sp->assocparams.sasoc_asocmaxrxt;
+		params.sasoc_peer_rwnd = sp->assocparams.sasoc_peer_rwnd;
+		params.sasoc_local_rwnd = sp->assocparams.sasoc_local_rwnd;
+		params.sasoc_cookie_life = sp->assocparams.sasoc_cookie_life;
+		params.sasoc_number_peer_destinations =
+			sp->assocparams.sasoc_number_peer_destinations;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, &params, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR)
+ *
+ * This socket option is a boolean flag which turns on or off mapped V4
+ * addresses.  If this option is turned on and the socket is type
+ * PF_INET6, then IPv4 addresses will be mapped to V6 representation.
+ * If this option is turned off, then no mapping will be done of V4
+ * addresses and a user will receive both PF_INET6 and PF_INET type
+ * addresses on the socket.
+ */
+static int sctp_getsockopt_mappedv4(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	int mapped;
+
+	/* Need room for one int in the caller's buffer. */
+	if (len < sizeof(mapped))
+		return -EINVAL;
+	len = sizeof(mapped);
+
+	/* Hand back the socket's v4mapped setting as stored. */
+	mapped = sctp_sk(sk)->v4mapped;
+
+	if (put_user(len, optlen) || copy_to_user(optval, &mapped, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.29.  Set or Get the default context (SCTP_CONTEXT)
+ * (chapter and verse is quoted at sctp_setsockopt_context())
+ */
+static int sctp_getsockopt_context(struct sock *sk, int len,
+				   char __user *optval, int __user *optlen)
+{
+	struct sctp_assoc_value ctx;
+
+	if (len < sizeof(ctx))
+		return -EINVAL;
+	len = sizeof(ctx);
+
+	/* assoc_id comes from the caller; assoc_value is filled in below. */
+	if (copy_from_user(&ctx, optval, len))
+		return -EFAULT;
+
+	if (!ctx.assoc_id) {
+		/* Id zero selects the socket-wide default. */
+		ctx.assoc_value = sctp_sk(sk)->default_rcv_context;
+	} else {
+		struct sctp_association *assoc;
+
+		/* A non-zero id must name an existing association. */
+		assoc = sctp_id2assoc(sk, ctx.assoc_id);
+		if (!assoc)
+			return -EINVAL;
+		ctx.assoc_value = assoc->default_rcv_context;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, &ctx, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 8.1.16.  Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG)
+ * This option will get or set the maximum size to put in any outgoing
+ * SCTP DATA chunk.  If a message is larger than this size it will be
+ * fragmented by SCTP into the specified size.  Note that the underlying
+ * SCTP implementation may fragment into smaller sized chunks when the
+ * PMTU of the underlying association is smaller than the value set by
+ * the user.  The default value for this option is '0' which indicates
+ * the user is NOT limiting fragmentation and only the PMTU will affect
+ * SCTP's choice of DATA chunk size.  Note also that values set larger
+ * than the maximum size of an IP datagram will effectively let SCTP
+ * control fragmentation (i.e. the same as setting this option to 0).
+ *
+ * The following structure is used to access and modify this parameter:
+ *
+ * struct sctp_assoc_value {
+ *   sctp_assoc_t assoc_id;
+ *   uint32_t assoc_value;
+ * };
+ *
+ * assoc_id:  This parameter is ignored for one-to-one style sockets.
+ *    For one-to-many style sockets this parameter indicates which
+ *    association the user is performing an action upon.  Note that if
+ *    this field's value is zero then the endpoint's default value is
+ *    changed (affecting future associations only).
+ * assoc_value:  This parameter specifies the maximum size in bytes.
+ */
+static int sctp_getsockopt_maxseg(struct sock *sk, int len,
+				  char __user *optval, int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *assoc;
+	const void *reply;
+
+	if (len == sizeof(int)) {
+		/* Deprecated int-sized form: always answers for id 0. */
+		pr_warn("Use of int in maxseg socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
+		params.assoc_id = 0;
+	} else {
+		/* Otherwise a full sctp_assoc_value must be supplied. */
+		if (len < sizeof(params))
+			return -EINVAL;
+		len = sizeof(params);
+		if (copy_from_user(&params, optval, sizeof(params)))
+			return -EFAULT;
+	}
+
+	assoc = sctp_id2assoc(sk, params.assoc_id);
+	if (assoc)
+		params.assoc_value = assoc->frag_point;
+	else if (params.assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+	else
+		params.assoc_value = sctp_sk(sk)->user_frag;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	/* The int form gets only the value; the struct form gets it all. */
+	if (len == sizeof(int))
+		reply = &params.assoc_value;
+	else
+		reply = &params;
+
+	if (copy_to_user(optval, reply, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.24.  Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE)
+ * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave())
+ */
+static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len,
+					       char __user *optval, int __user *optlen)
+{
+	int interleave;
+
+	/* One int is returned; a shorter buffer is rejected. */
+	if (len < sizeof(interleave))
+		return -EINVAL;
+	len = sizeof(interleave);
+
+	interleave = sctp_sk(sk)->frag_interleave;
+
+	if (put_user(len, optlen) || copy_to_user(optval, &interleave, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.25.  Set or Get the sctp partial delivery point
+ * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point())
+ */
+static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len,
+						  char __user *optval,
+						  int __user *optlen)
+{
+	u32 point;
+
+	/* The answer is a single u32. */
+	if (len < sizeof(point))
+		return -EINVAL;
+	len = sizeof(point);
+
+	point = sctp_sk(sk)->pd_point;
+
+	if (put_user(len, optlen) || copy_to_user(optval, &point, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 7.1.28.  Set or Get the maximum burst (SCTP_MAX_BURST)
+ * (chapter and verse is quoted at sctp_setsockopt_maxburst())
+ */
+static int sctp_getsockopt_maxburst(struct sock *sk, int len,
+				    char __user *optval,
+				    int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	const void *reply;
+
+	if (len == sizeof(int)) {
+		/* Deprecated int-sized form: always answers for id 0. */
+		pr_warn("Use of int in max_burst socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
+		params.assoc_id = 0;
+	} else {
+		if (len < sizeof(params))
+			return -EINVAL;
+		len = sizeof(params);
+		if (copy_from_user(&params, optval, len))
+			return -EFAULT;
+	}
+
+	if (!params.assoc_id) {
+		/* Id zero reports the socket-wide setting. */
+		params.assoc_value = sctp_sk(sk)->max_burst;
+	} else {
+		struct sctp_association *assoc;
+
+		assoc = sctp_id2assoc(sk, params.assoc_id);
+		if (!assoc)
+			return -EINVAL;
+		params.assoc_value = assoc->max_burst;
+	}
+
+	/* NOTE(review): unlike the maxseg handler, *optlen is not written
+	 * back here; confirm whether that is intended.
+	 */
+	if (len == sizeof(int))
+		reply = &params.assoc_value;
+	else
+		reply = &params;
+
+	if (copy_to_user(optval, reply, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * SCTP_HMAC_IDENT: copy the endpoint's HMAC identifier list
+ * (ep->auth_hmacs_list) into the caller's struct sctp_hmacalgo.
+ */
+static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	struct sctp_hmacalgo __user *uhmac = (void __user *)optval;
+	struct sctp_hmac_algo_param *hmacs;
+	__u16 ids_len;
+	u32 num_idents;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	/* Payload length is the parameter length minus its header. */
+	hmacs = sctp_sk(sk)->ep->auth_hmacs_list;
+	ids_len = ntohs(hmacs->param_hdr.length) - sizeof(sctp_paramhdr_t);
+
+	/* The buffer must hold the fixed header plus every identifier. */
+	if (len < sizeof(struct sctp_hmacalgo) + ids_len)
+		return -EINVAL;
+	len = sizeof(struct sctp_hmacalgo) + ids_len;
+	num_idents = ids_len / sizeof(u16);
+
+	/* NOTE(review): identifiers are copied exactly as stored in
+	 * hmac_ids; confirm the byte order user space expects.
+	 */
+	if (put_user(len, optlen) ||
+	    put_user(num_idents, &uhmac->shmac_num_idents) ||
+	    copy_to_user(uhmac->shmac_idents, hmacs->hmac_ids, ids_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * SCTP_AUTH_ACTIVE_KEY: report the active key id of an association, or
+ * of the endpoint when no association is selected.
+ */
+static int sctp_getsockopt_active_key(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	struct sctp_authkeyid key;
+	struct sctp_association *assoc;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (len < sizeof(key))
+		return -EINVAL;
+	/* scact_assoc_id is supplied by the caller. */
+	if (copy_from_user(&key, optval, sizeof(key)))
+		return -EFAULT;
+
+	assoc = sctp_id2assoc(sk, key.scact_assoc_id);
+	if (assoc)
+		key.scact_keynumber = assoc->active_key_id;
+	else if (key.scact_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+	else
+		key.scact_keynumber = sctp_sk(sk)->ep->active_key_id;
+
+	len = sizeof(key);
+	if (put_user(len, optlen) || copy_to_user(optval, &key, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * SCTP_PEER_AUTH_CHUNKS: copy the chunk list recorded for the peer of an
+ * association (asoc->peer.peer_chunks) to user space.
+ *
+ * optval points at a struct sctp_authchunks.  gauth_assoc_id is read to
+ * select the association; the chunk bytes are written to gauth_chunks[]
+ * and their count to gauth_number_of_chunks.  *optlen receives the total
+ * number of bytes the reply occupies.
+ *
+ * Returns 0 on success, -EACCES when sctp_auth_enable is off, -EINVAL for
+ * a buffer that is too short or an unknown association, and -EFAULT when
+ * a user-space access fails.
+ */
+static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	struct sctp_authchunks __user *p = (void __user *)optval;
+	struct sctp_authchunks val;
+	struct sctp_association *asoc;
+	struct sctp_chunks_param *ch;
+	u32    num_chunks = 0;
+	char __user *to;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (len < sizeof(struct sctp_authchunks))
+		return -EINVAL;
+
+	if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks)))
+		return -EFAULT;
+
+	to = p->gauth_chunks;
+	asoc = sctp_id2assoc(sk, val.gauth_assoc_id);
+	if (!asoc)
+		return -EINVAL;
+
+	/* No chunk list recorded: report zero chunks. */
+	ch = asoc->peer.peer_chunks;
+	if (!ch)
+		goto num;
+
+	/* See if the user provided enough room for all the data.  The
+	 * chunks are written after the fixed header, so the header size
+	 * has to be counted as well (as the local-chunks handler does);
+	 * otherwise the copy could run past the length the caller gave.
+	 */
+	num_chunks = ntohs(ch->param_hdr.length) - sizeof(sctp_paramhdr_t);
+	if (len < sizeof(struct sctp_authchunks) + num_chunks)
+		return -EINVAL;
+
+	if (copy_to_user(to, ch->chunks, num_chunks))
+		return -EFAULT;
+num:
+	len = sizeof(struct sctp_authchunks) + num_chunks;
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (put_user(num_chunks, &p->gauth_number_of_chunks))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * SCTP_LOCAL_AUTH_CHUNKS: copy the local chunk list of an association
+ * (asoc->c.auth_chunks), or of the endpoint (ep->auth_chunk_list) when
+ * no association is selected, into the caller's struct sctp_authchunks.
+ */
+static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	struct sctp_authchunks __user *uchunks = (void __user *)optval;
+	struct sctp_authchunks req;
+	struct sctp_association *assoc;
+	struct sctp_chunks_param *list;
+	char __user *dst;
+	u32 nbytes = 0;
+
+	if (!sctp_auth_enable)
+		return -EACCES;
+
+	if (len < sizeof(req))
+		return -EINVAL;
+
+	if (copy_from_user(&req, optval, sizeof(req)))
+		return -EFAULT;
+
+	dst = uchunks->gauth_chunks;
+	assoc = sctp_id2assoc(sk, req.gauth_assoc_id);
+	if (assoc)
+		list = (struct sctp_chunks_param*)assoc->c.auth_chunks;
+	else if (req.gauth_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+	else
+		list = sctp_sk(sk)->ep->auth_chunk_list;
+
+	if (list) {
+		/* Payload length is the parameter length minus its header;
+		 * the buffer must hold the fixed struct plus that payload.
+		 */
+		nbytes = ntohs(list->param_hdr.length) -
+			 sizeof(sctp_paramhdr_t);
+		if (len < sizeof(struct sctp_authchunks) + nbytes)
+			return -EINVAL;
+
+		if (copy_to_user(dst, list->chunks, nbytes))
+			return -EFAULT;
+	}
+
+	/* With no list at all, zero chunks are reported. */
+	len = sizeof(struct sctp_authchunks) + nbytes;
+	if (put_user(len, optlen) ||
+	    put_user(nbytes, &uchunks->gauth_number_of_chunks))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 8.2.5.  Get the Current Number of Associations (SCTP_GET_ASSOC_NUMBER)
+ * This option gets the current number of associations that are attached
+ * to a one-to-many style socket.  The option value is an uint32_t.
+ */
+static int sctp_getsockopt_assoc_number(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	struct sctp_association *assoc;
+	u32 count = 0;
+
+	/* Not offered on TCP-style sockets. */
+	if (sctp_style(sk, TCP))
+		return -EOPNOTSUPP;
+
+	if (len < sizeof(count))
+		return -EINVAL;
+	len = sizeof(count);
+
+	/* Count every association hanging off the endpoint. */
+	list_for_each_entry(assoc, &(sctp_sk(sk)->ep->asocs), asocs)
+		count++;
+
+	if (put_user(len, optlen) || copy_to_user(optval, &count, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * 8.2.6. Get the Current Identifiers of Associations
+ *        (SCTP_GET_ASSOC_ID_LIST)
+ *
+ * This option gets the current list of SCTP association identifiers of
+ * the SCTP associations handled by a one-to-many style socket.
+ */
+static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+	struct sctp_association *assoc;
+	struct sctp_assoc_ids *reply;
+	u32 total = 0;
+	u32 slot = 0;
+	int err = 0;
+
+	/* Not offered on TCP-style sockets. */
+	if (sctp_style(sk, TCP))
+		return -EOPNOTSUPP;
+
+	if (len < sizeof(struct sctp_assoc_ids))
+		return -EINVAL;
+
+	/* First pass: find out how many ids have to be returned. */
+	list_for_each_entry(assoc, &(ep->asocs), asocs)
+		total++;
+
+	if (len < sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * total)
+		return -EINVAL;
+	len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * total;
+
+	/* The reply is assembled in a kernel buffer and copied out once. */
+	reply = kmalloc(len, GFP_KERNEL);
+	if (unlikely(!reply))
+		return -ENOMEM;
+
+	/* Second pass: record the count and each association id. */
+	reply->gaids_number_of_ids = total;
+	list_for_each_entry(assoc, &(ep->asocs), asocs)
+		reply->gaids_assoc_id[slot++] = assoc->assoc_id;
+
+	if (put_user(len, optlen) || copy_to_user(optval, reply, len))
+		err = -EFAULT;
+
+	kfree(reply);
+	return err;
+}
+
+/*
+ * getsockopt() entry point for SCTP sockets.
+ *
+ * Requests whose level is not SOL_SCTP are handed to the address
+ * family's getsockopt handler.  For SOL_SCTP the caller's length is read
+ * from *optlen, the socket is locked, and the request is dispatched on
+ * optname to the matching sctp_getsockopt_*() helper.
+ *
+ * Returns the helper's result, -EFAULT if *optlen cannot be read,
+ * -EOPNOTSUPP for the set-only AUTH options listed below, or
+ * -ENOPROTOOPT for an unrecognised optname.
+ */
+SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
+				char __user *optval, int __user *optlen)
+{
+	int retval = 0;
+	int len;
+
+	SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... optname: %d)\n",
+			  sk, optname);
+
+	/* I can hardly begin to describe how wrong this is.  This is
+	 * so broken as to be worse than useless.  The API draft
+	 * REALLY is NOT helpful here...  I am not convinced that the
+	 * semantics of getsockopt() with a level OTHER THAN SOL_SCTP
+	 * are at all well-founded.
+	 */
+	if (level != SOL_SCTP) {
+		struct sctp_af *af = sctp_sk(sk)->pf->af;
+
+		retval = af->getsockopt(sk, level, optname, optval, optlen);
+		return retval;
+	}
+
+	/* Fetch the caller's buffer length before taking the socket lock. */
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	sctp_lock_sock(sk);
+
+	/* Every case below runs with the socket locked. */
+	switch (optname) {
+	case SCTP_STATUS:
+		retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen);
+		break;
+	case SCTP_DISABLE_FRAGMENTS:
+		retval = sctp_getsockopt_disable_fragments(sk, len, optval,
+							   optlen);
+		break;
+	case SCTP_EVENTS:
+		retval = sctp_getsockopt_events(sk, len, optval, optlen);
+		break;
+	case SCTP_AUTOCLOSE:
+		retval = sctp_getsockopt_autoclose(sk, len, optval, optlen);
+		break;
+	case SCTP_SOCKOPT_PEELOFF:
+		retval = sctp_getsockopt_peeloff(sk, len, optval, optlen);
+		break;
+	case SCTP_PEER_ADDR_PARAMS:
+		retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
+							  optlen);
+		break;
+	case SCTP_DELAYED_SACK:
+		retval = sctp_getsockopt_delayed_ack(sk, len, optval,
+							  optlen);
+		break;
+	case SCTP_INITMSG:
+		retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
+		break;
+	case SCTP_GET_PEER_ADDRS:
+		retval = sctp_getsockopt_peer_addrs(sk, len, optval,
+						    optlen);
+		break;
+	case SCTP_GET_LOCAL_ADDRS:
+		retval = sctp_getsockopt_local_addrs(sk, len, optval,
+						     optlen);
+		break;
+	case SCTP_SOCKOPT_CONNECTX3:
+		retval = sctp_getsockopt_connectx3(sk, len, optval, optlen);
+		break;
+	case SCTP_DEFAULT_SEND_PARAM:
+		retval = sctp_getsockopt_default_send_param(sk, len,
+							    optval, optlen);
+		break;
+	case SCTP_PRIMARY_ADDR:
+		retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen);
+		break;
+	case SCTP_NODELAY:
+		retval = sctp_getsockopt_nodelay(sk, len, optval, optlen);
+		break;
+	case SCTP_RTOINFO:
+		retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen);
+		break;
+	case SCTP_ASSOCINFO:
+		retval = sctp_getsockopt_associnfo(sk, len, optval, optlen);
+		break;
+	case SCTP_I_WANT_MAPPED_V4_ADDR:
+		retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen);
+		break;
+	case SCTP_MAXSEG:
+		retval = sctp_getsockopt_maxseg(sk, len, optval, optlen);
+		break;
+	case SCTP_GET_PEER_ADDR_INFO:
+		retval = sctp_getsockopt_peer_addr_info(sk, len, optval,
+							optlen);
+		break;
+	case SCTP_ADAPTATION_LAYER:
+		retval = sctp_getsockopt_adaptation_layer(sk, len, optval,
+							optlen);
+		break;
+	case SCTP_CONTEXT:
+		retval = sctp_getsockopt_context(sk, len, optval, optlen);
+		break;
+	case SCTP_FRAGMENT_INTERLEAVE:
+		retval = sctp_getsockopt_fragment_interleave(sk, len, optval,
+							     optlen);
+		break;
+	case SCTP_PARTIAL_DELIVERY_POINT:
+		retval = sctp_getsockopt_partial_delivery_point(sk, len, optval,
+								optlen);
+		break;
+	case SCTP_MAX_BURST:
+		retval = sctp_getsockopt_maxburst(sk, len, optval, optlen);
+		break;
+	/* These AUTH options have no get handler here. */
+	case SCTP_AUTH_KEY:
+	case SCTP_AUTH_CHUNK:
+	case SCTP_AUTH_DELETE_KEY:
+		retval = -EOPNOTSUPP;
+		break;
+	case SCTP_HMAC_IDENT:
+		retval = sctp_getsockopt_hmac_ident(sk, len, optval, optlen);
+		break;
+	case SCTP_AUTH_ACTIVE_KEY:
+		retval = sctp_getsockopt_active_key(sk, len, optval, optlen);
+		break;
+	case SCTP_PEER_AUTH_CHUNKS:
+		retval = sctp_getsockopt_peer_auth_chunks(sk, len, optval,
+							optlen);
+		break;
+	case SCTP_LOCAL_AUTH_CHUNKS:
+		retval = sctp_getsockopt_local_auth_chunks(sk, len, optval,
+							optlen);
+		break;
+	case SCTP_GET_ASSOC_NUMBER:
+		retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen);
+		break;
+	case SCTP_GET_ASSOC_ID_LIST:
+		retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen);
+		break;
+	default:
+		retval = -ENOPROTOOPT;
+		break;
+	}
+
+	sctp_release_sock(sk);
+	return retval;
+}
+
+/* Empty stub: performs no work for @sk. */
+static void sctp_hash(struct sock *sk)
+{
+	/* STUB */
+}
+
+/* Empty stub: performs no work for @sk. */
+static void sctp_unhash(struct sock *sk)
+{
+	/* STUB */
+}
+
+/* Check if port is acceptable.  Possibly find first available port.
+ *
+ * The port hash table (contained in the 'global' SCTP protocol storage
+ * returned by struct sctp_protocol *sctp_get_protocol()). The hash
+ * table is an array of 4096 lists (sctp_bind_hashbucket). Each
+ * list (the list number is the port number hashed out, so as you
+ * would expect from a hash function, all the ports in a given list have
+ * such a number that hashes out to the same list number; you were
+ * expecting that, right?); so each list has a set of ports, with a
+ * link to the socket (struct sock) that uses it, the port number and
+ * a fastreuse flag (FIXME: NPI ipg).
+ */
+static struct sctp_bind_bucket *sctp_bucket_create(
+	struct sctp_bind_hashbucket *head, unsigned short snum);
+
+/*
+ * Bind @sk to the port carried in @addr, or to a free ephemeral port when
+ * that port is zero.
+ *
+ * Runs with local bottom halves disabled and holds the lock of the port
+ * hash bucket being examined.  On success the socket is linked into the
+ * bucket's owner list (unless it already has a bind_hash) and 0 is
+ * returned.  On failure the return value is non-zero: 1 when the
+ * ephemeral range is exhausted or a bucket cannot be allocated, or the
+ * conflicting socket pointer cast to long when another socket already
+ * owns a conflicting address on the port.
+ */
+static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
+{
+	struct sctp_bind_hashbucket *head; /* hash list */
+	struct sctp_bind_bucket *pp; /* hash list port iterator */
+	struct hlist_node *node;
+	unsigned short snum;
+	int ret;
+
+	snum = ntohs(addr->v4.sin_port);
+
+	SCTP_DEBUG_PRINTK("sctp_get_port() begins, snum=%d\n", snum);
+	sctp_local_bh_disable();
+
+	if (snum == 0) {
+		/* Search for an available port. */
+		int low, high, remaining, index;
+		unsigned int rover;
+
+		inet_get_local_port_range(&low, &high);
+		remaining = (high - low) + 1;
+		rover = net_random() % remaining + low;
+
+		/* Walk the range from a random start, wrapping at 'high'.
+		 * Leaving the loop via 'break' keeps head->lock held.
+		 */
+		do {
+			rover++;
+			if ((rover < low) || (rover > high))
+				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
+			index = sctp_phashfn(rover);
+			head = &sctp_port_hashtable[index];
+			sctp_spin_lock(&head->lock);
+			sctp_for_each_hentry(pp, node, &head->chain)
+				if (pp->port == rover)
+					goto next;
+			break;
+		next:
+			sctp_spin_unlock(&head->lock);
+		} while (--remaining > 0);
+
+		/* Exhausted local port range during search? */
+		ret = 1;
+		if (remaining <= 0)
+			goto fail;
+
+		/* OK, here is the one we will use.  HEAD (the port
+		 * hash table list entry) is non-NULL and we hold its
+		 * lock.
+		 */
+		snum = rover;
+	} else {
+		/* We are given a specific port number; we verify
+		 * that it is not being used. If it is not in use, we
+		 * will exhaust the search in the hash list corresponding
+		 * to the port number (snum) - we detect that with the
+		 * port iterator, pp being NULL.
+		 */
+		head = &sctp_port_hashtable[sctp_phashfn(snum)];
+		sctp_spin_lock(&head->lock);
+		sctp_for_each_hentry(pp, node, &head->chain) {
+			if (pp->port == snum)
+				goto pp_found;
+		}
+	}
+	pp = NULL;
+	goto pp_not_found;
+pp_found:
+	if (!hlist_empty(&pp->owner)) {
+		/* We had a port hash table hit - there is an
+		 * available port (pp != NULL) and it is being
+		 * used by other socket (pp->owner not empty); that other
+		 * socket is going to be sk2.
+		 */
+		int reuse = sk->sk_reuse;
+		struct sock *sk2;
+
+		SCTP_DEBUG_PRINTK("sctp_get_port() found a possible match\n");
+		/* Fast path: the bucket and this socket both allow reuse
+		 * and this socket is not listening.
+		 */
+		if (pp->fastreuse && sk->sk_reuse &&
+			sk->sk_state != SCTP_SS_LISTENING)
+			goto success;
+
+		/* Run through the list of sockets bound to the port
+		 * (pp->port) [via the pointers bind_next and
+		 * bind_pprev in the struct sock *sk2 (pp->sk)]. On each one,
+		 * we get the endpoint they describe and run through
+		 * the endpoint's list of IP (v4 or v6) addresses,
+		 * comparing each of the addresses with the address of
+		 * the socket sk. If we find a match, then that means
+		 * that this port/socket (sk) combination are already
+		 * in an endpoint.
+		 */
+		sk_for_each_bound(sk2, node, &pp->owner) {
+			struct sctp_endpoint *ep2;
+			ep2 = sctp_sk(sk2)->ep;
+
+			if (sk == sk2 ||
+			    (reuse && sk2->sk_reuse &&
+			     sk2->sk_state != SCTP_SS_LISTENING))
+				continue;
+
+			if (sctp_bind_addr_conflict(&ep2->base.bind_addr, addr,
+						 sctp_sk(sk2), sctp_sk(sk))) {
+				/* Report the conflicting socket itself. */
+				ret = (long)sk2;
+				goto fail_unlock;
+			}
+		}
+		SCTP_DEBUG_PRINTK("sctp_get_port(): Found a match\n");
+	}
+pp_not_found:
+	/* If there was a hash table miss, create a new port.  */
+	ret = 1;
+	if (!pp && !(pp = sctp_bucket_create(head, snum)))
+		goto fail_unlock;
+
+	/* In either case (hit or miss), make sure fastreuse is 1 only
+	 * if sk->sk_reuse is too (that is, if the caller requested
+	 * SO_REUSEADDR on this socket -sk-).
+	 */
+	if (hlist_empty(&pp->owner)) {
+		if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING)
+			pp->fastreuse = 1;
+		else
+			pp->fastreuse = 0;
+	} else if (pp->fastreuse &&
+		(!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING))
+		pp->fastreuse = 0;
+
+	/* We are set, so fill up all the data in the hash table
+	 * entry, tie the socket list information with the rest of the
+	 * sockets FIXME: Blurry, NPI (ipg).
+	 */
+success:
+	/* Only link the socket in if it is not already bound. */
+	if (!sctp_sk(sk)->bind_hash) {
+		inet_sk(sk)->inet_num = snum;
+		sk_add_bind_node(sk, &pp->owner);
+		sctp_sk(sk)->bind_hash = pp;
+	}
+	ret = 0;
+
+fail_unlock:
+	sctp_spin_unlock(&head->lock);
+
+fail:
+	sctp_local_bh_enable();
+	return ret;
+}
+
+/* Assign a 'snum' port to the socket.  If snum == 0, an ephemeral
+ * port is requested.
+ */
+static int sctp_get_port(struct sock *sk, unsigned short snum)
+{
+	union sctp_addr probe;
+
+	/* Build a throw-away address from the socket, then substitute the
+	 * requested port.
+	 */
+	sctp_sk(sk)->pf->af->from_sk(&probe, sk);
+	probe.v4.sin_port = htons(snum);
+
+	/* Note: inet_sk(sk)->inet_num gets filled in on success.  Any
+	 * non-zero result from the helper is collapsed to 1.
+	 */
+	if (sctp_get_port_local(sk, &probe))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Move a socket to LISTENING state.
+ *
+ * Allocates the cookie HMAC transform on first use (when sctp_hmac_alg is
+ * set), then either autobinds the socket or re-checks its existing port
+ * through sctp_get_port(), records @backlog and hashes the endpoint.
+ *
+ * Returns 0 on success, -ENOSYS if the transform cannot be allocated,
+ * -EAGAIN if autobind fails, or -EADDRINUSE if the port check fails.  On
+ * either bind failure sk_state is put back to SCTP_SS_CLOSED.
+ */
+SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_endpoint *ep = sp->ep;
+	struct crypto_hash *tfm = NULL;
+
+	/* Allocate HMAC for generating cookie. */
+	if (!sp->hmac && sctp_hmac_alg) {
+		tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm)) {
+			if (net_ratelimit()) {
+				pr_info("failed to load transform for %s: %ld\n",
+					sctp_hmac_alg, PTR_ERR(tfm));
+			}
+			return -ENOSYS;
+		}
+		sp->hmac = tfm;
+	}
+
+	/*
+	 * If a bind() or sctp_bindx() is not called prior to a listen()
+	 * call that allows new associations to be accepted, the system
+	 * picks an ephemeral port and will choose an address set equivalent
+	 * to binding with a wildcard address.
+	 *
+	 * This is not currently spelled out in the SCTP sockets
+	 * extensions draft, but follows the practice as seen in TCP
+	 * sockets.
+	 *
+	 */
+	sk->sk_state = SCTP_SS_LISTENING;
+	if (!ep->base.bind_addr.port) {
+		if (sctp_autobind(sk)) {
+			/* Undo the state change.  Otherwise the socket
+			 * stays LISTENING without ever being bound, and
+			 * sctp_inet_listen() with a zero backlog would
+			 * then dereference its bind_hash.
+			 */
+			sk->sk_state = SCTP_SS_CLOSED;
+			return -EAGAIN;
+		}
+	} else {
+		if (sctp_get_port(sk, inet_sk(sk)->inet_num)) {
+			sk->sk_state = SCTP_SS_CLOSED;
+			return -EADDRINUSE;
+		}
+	}
+
+	sk->sk_max_ack_backlog = backlog;
+	sctp_hash_endpoint(ep);
+	return 0;
+}
+
+/*
+ * 4.1.3 / 5.1.3 listen()
+ *
+ *   By default, new associations are not accepted for UDP style sockets.
+ *   An application uses listen() to mark a socket as being able to
+ *   accept new associations.
+ *
+ *   On TCP style sockets, applications use listen() to ready the SCTP
+ *   endpoint for accepting inbound associations.
+ *
+ *   On both types of endpoints a backlog of '0' disables listening.
+ *
+ *  Move a socket to LISTENING state.
+ */
+int sctp_inet_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+	int err;
+
+	/* A negative backlog is rejected before the socket is locked. */
+	if (unlikely(backlog < 0))
+		return -EINVAL;
+
+	sctp_lock_sock(sk);
+	err = -EINVAL;
+
+	/* Peeled-off sockets are not allowed to listen(), and the
+	 * socket must still be unconnected.
+	 */
+	if (sctp_style(sk, UDP_HIGH_BANDWIDTH) ||
+	    sock->state != SS_UNCONNECTED)
+		goto unlock;
+
+	if (backlog == 0) {
+		/* A zero backlog disables listening; it is an error on a
+		 * socket that is already CLOSED.
+		 */
+		if (!sctp_sstate(sk, CLOSED)) {
+			sctp_unhash_endpoint(ep);
+			sk->sk_state = SCTP_SS_CLOSED;
+			if (sk->sk_reuse)
+				sctp_sk(sk)->bind_hash->fastreuse = 1;
+			err = 0;
+		}
+		goto unlock;
+	}
+
+	if (sctp_sstate(sk, LISTENING)) {
+		/* If we are already listening, just update the backlog. */
+		sk->sk_max_ack_backlog = backlog;
+		err = 0;
+	} else {
+		err = sctp_listen_start(sk, backlog);
+	}
+
+unlock:
+	sctp_release_sock(sk);
+	return err;
+}
+
+/*
+ * This function is done by modeling the current datagram_poll() and the
+ * tcp_poll().  Note that, based on these implementations, we don't
+ * lock the socket in this function, even though it seems that,
+ * ideally, locking or some other mechanisms can be used to ensure
+ * the integrity of the counters (sndbuf and wmem_alloc) used
+ * in this place.  We assume that we don't need locks either until proven
+ * otherwise.
+ *
+ * Another thing to note is that we include the Async I/O support
+ * here, again, by modeling the current TCP/UDP code.  We don't have
+ * a good way to test with it yet.
+ */
+unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int events = 0;
+
+	poll_wait(file, sk_sleep(sk), wait);
+
+	/* A TCP-style listening socket becomes readable when the accept
+	 * queue is not empty; nothing else is reported for it.
+	 */
+	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
+		if (list_empty(&sctp_sk(sk)->ep->asocs))
+			return 0;
+		return POLLIN | POLLRDNORM;
+	}
+
+	/* Are there any exceptional events?  */
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		events |= POLLERR;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		events |= POLLRDHUP | POLLIN | POLLRDNORM;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		events |= POLLHUP;
+
+	/* Is it readable?  Reconsider this code with TCP-style support.  */
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		events |= POLLIN | POLLRDNORM;
+
+	/* The association is either gone or not ready.  */
+	if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED))
+		return events;
+
+	/* Is it writable?  */
+	if (!sctp_writeable(sk)) {
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		/*
+		 * Since the socket is not locked, the buffer
+		 * might be made available after the writeable check and
+		 * before the bit is set.  This could cause a lost I/O
+		 * signal.  tcp_poll() has a race breaker for this race
+		 * condition.  Based on their implementation, we check
+		 * once more after setting the bit.
+		 */
+		if (!sctp_writeable(sk))
+			return events;
+	}
+
+	return events | POLLOUT | POLLWRNORM;
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* Allocate a bind bucket for port @snum and link it at the front of
+ * @head's chain.  Returns the new bucket, or NULL if allocation fails.
+ */
+static struct sctp_bind_bucket *sctp_bucket_create(
+	struct sctp_bind_hashbucket *head, unsigned short snum)
+{
+	struct sctp_bind_bucket *bucket;
+
+	bucket = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC);
+	if (!bucket)
+		return NULL;
+
+	SCTP_DBG_OBJCNT_INC(bind_bucket);
+
+	/* A fresh bucket starts with no owners and fastreuse off. */
+	bucket->port = snum;
+	bucket->fastreuse = 0;
+	INIT_HLIST_HEAD(&bucket->owner);
+	hlist_add_head(&bucket->node, &head->chain);
+
+	return bucket;
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+static void sctp_bucket_destroy(struct sctp_bind_bucket *pp)
+{
+	/* Nothing to do without a bucket, or while sockets still own it. */
+	if (!pp || !hlist_empty(&pp->owner))
+		return;
+
+	/* Unlink from the hash chain, then return it to the cache. */
+	__hlist_del(&pp->node);
+	kmem_cache_free(sctp_bucket_cachep, pp);
+	SCTP_DBG_OBJCNT_DEC(bind_bucket);
+}
+
+/* Release this socket's reference to a local port.  */
+static inline void __sctp_put_port(struct sock *sk)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	struct sctp_bind_hashbucket *chain;
+	struct sctp_bind_bucket *bucket;
+
+	/* Pick the hash chain from the port the socket holds right now. */
+	chain = &sctp_port_hashtable[sctp_phashfn(inet_sk(sk)->inet_num)];
+
+	sctp_spin_lock(&chain->lock);
+
+	/* Detach the socket from its bucket and clear its port. */
+	bucket = sp->bind_hash;
+	__sk_del_bind_node(sk);
+	sp->bind_hash = NULL;
+	inet_sk(sk)->inet_num = 0;
+
+	/* The bucket is only freed once it has no owners left. */
+	sctp_bucket_destroy(bucket);
+
+	sctp_spin_unlock(&chain->lock);
+}
+
+/* Run __sctp_put_port() on @sk, bracketed by sctp_local_bh_disable() and
+ * sctp_local_bh_enable().
+ */
+void sctp_put_port(struct sock *sk)
+{
+	sctp_local_bh_disable();
+	__sctp_put_port(sk);
+	sctp_local_bh_enable();
+}
+
+/*
+ * The system picks an ephemeral port and chooses an address set equivalent
+ * to binding with a wildcard address.
+ * One of those addresses will be the primary address for the association.
+ * This automatically enables the multihoming capability of SCTP.
+ */
+static int sctp_autobind(struct sock *sk)
+{
+	struct sctp_af *af = sctp_sk(sk)->pf->af;
+	union sctp_addr wildcard;
+
+	/* Let the address family build its "any" address, carrying over
+	 * whatever port number the socket currently holds.
+	 */
+	af->inaddr_any(&wildcard, htons(inet_sk(sk)->inet_num));
+
+	/* Bind to it exactly as an explicit bind() would. */
+	return sctp_do_bind(sk, &wildcard, af->sockaddr_len);
+}
+
+/* Parse out IPPROTO_SCTP CMSG headers.  Perform only minimal validation.
+ *
+ * From RFC 2292
+ * 4.2 The cmsghdr Structure *
+ *
+ * When ancillary data is sent or received, any number of ancillary data
+ * objects can be specified by the msg_control and msg_controllen members of
+ * the msghdr structure, because each object is preceded by
+ * a cmsghdr structure defining the object's length (the cmsg_len member).
+ * Historically Berkeley-derived implementations have passed only one object
+ * at a time, but this API allows multiple objects to be
+ * passed in a single call to sendmsg() or recvmsg(). The following example
+ * shows two ancillary data objects in a control buffer.
+ *
+ *   |<--------------------------- msg_controllen -------------------------->|
+ *   |                                                                       |
+ *
+ *   |<----- ancillary data object ----->|<----- ancillary data object ----->|
+ *
+ *   |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->|
+ *   |                                   |                                   |
+ *
+ *   |<---------- cmsg_len ---------->|  |<--------- cmsg_len ----------->|  |
+ *
+ *   |<--------- CMSG_LEN() --------->|  |<-------- CMSG_LEN() ---------->|  |
+ *   |                                |  |                                |  |
+ *
+ *   +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+
+ *   |cmsg_|cmsg_|cmsg_|XX|           |XX|cmsg_|cmsg_|cmsg_|XX|           |XX|
+ *
+ *   |len  |level|type |XX|cmsg_data[]|XX|len  |level|type |XX|cmsg_data[]|XX|
+ *
+ *   +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+
+ *    ^
+ *    |
+ *
+ * msg_control
+ * points here
+ */
+SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg,
+				  sctp_cmsgs_t *cmsgs)
+{
+	struct cmsghdr *cmsg;
+	/* Non-const alias of msg, used for CMSG_NXTHDR() and CMSG_OK(). */
+	struct msghdr *my_msg = (struct msghdr *)msg;
+
+	/* Returns 0 when every control message was accepted and -EINVAL on
+	 * the first malformed or unsupported one.
+	 */
+	for (cmsg = CMSG_FIRSTHDR(msg);
+	     cmsg != NULL;
+	     cmsg = CMSG_NXTHDR(my_msg, cmsg)) {
+		if (!CMSG_OK(my_msg, cmsg))
+			return -EINVAL;
+
+		/* Should we parse this header or ignore?  */
+		if (cmsg->cmsg_level != IPPROTO_SCTP)
+			continue;
+
+		/* Strictly check lengths following example in SCM code.  */
+		switch (cmsg->cmsg_type) {
+		case SCTP_INIT:
+			/* SCTP Socket API Extension
+			 * 5.2.1 SCTP Initiation Structure (SCTP_INIT)
+			 *
+			 * This cmsghdr structure provides information for
+			 * initializing new SCTP associations with sendmsg().
+			 * The SCTP_INITMSG socket option uses this same data
+			 * structure.  This structure is not used for
+			 * recvmsg().
+			 *
+			 * cmsg_level    cmsg_type      cmsg_data[]
+			 * ------------  ------------   ----------------------
+			 * IPPROTO_SCTP  SCTP_INIT      struct sctp_initmsg
+			 */
+			if (cmsg->cmsg_len !=
+			    CMSG_LEN(sizeof(struct sctp_initmsg)))
+				return -EINVAL;
+			/* Points into the control buffer; nothing is copied. */
+			cmsgs->init = (struct sctp_initmsg *)CMSG_DATA(cmsg);
+			break;
+
+		case SCTP_SNDRCV:
+			/* SCTP Socket API Extension
+			 * 5.2.2 SCTP Header Information Structure(SCTP_SNDRCV)
+			 *
+			 * This cmsghdr structure specifies SCTP options for
+			 * sendmsg() and describes SCTP header information
+			 * about a received message through recvmsg().
+			 *
+			 * cmsg_level    cmsg_type      cmsg_data[]
+			 * ------------  ------------   ----------------------
+			 * IPPROTO_SCTP  SCTP_SNDRCV    struct sctp_sndrcvinfo
+			 */
+			if (cmsg->cmsg_len !=
+			    CMSG_LEN(sizeof(struct sctp_sndrcvinfo)))
+				return -EINVAL;
+
+			/* Points into the control buffer; nothing is copied. */
+			cmsgs->info =
+				(struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+
+			/* Minimally, validate the sinfo_flags: any bit outside
+			 * the set listed below is rejected.
+			 */
+			if (cmsgs->info->sinfo_flags &
+			    ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+			      SCTP_ABORT | SCTP_EOF))
+				return -EINVAL;
+			break;
+
+		default:
+			/* Unknown IPPROTO_SCTP cmsg type. */
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Wait for a packet..
+ * Note: This function is the same function as in core/datagram.c
+ * with a few modifications to make lksctp work.
+ *
+ * Returns 0 when the receive queue is non-empty or after having slept once
+ * (the socket lock is dropped while sleeping and *timeo_p is updated with
+ * the value returned by schedule_timeout()).  Otherwise the error code is
+ * both returned and stored in *err:
+ *   - the pending socket error reported by sock_error(),
+ *   - -ENOTCONN when there is no association and the socket is not
+ *     listening,
+ *   - sock_intr_errno(*timeo_p) when a signal is pending.
+ * If the socket is shut down for receive, *err is set to 0 and 0 is
+ * returned.
+ */
+static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p)
+{
+	int error;
+	DEFINE_WAIT(wait);
+
+	/* Queue ourselves before testing the conditions below. */
+	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+	/* Socket errors? */
+	error = sock_error(sk);
+	if (error)
+		goto out;
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		goto ready;
+
+	/* Socket shut down?  (error is still 0 here.)  */
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		goto out;
+
+	/* Sequenced packets can come disconnected.  If so we report the
+	 * problem.
+	 */
+	error = -ENOTCONN;
+
+	/* Is there a good reason to think that we may receive some data?  */
+	if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING))
+		goto out;
+
+	/* Handle signals.  */
+	if (signal_pending(current))
+		goto interrupted;
+
+	/* Let another process have a go.  Since we are going to sleep
+	 * anyway.  Note: This may cause odd behaviors if the message
+	 * does not fit in the user's buffer, but this seems to be the
+	 * only way to honor MSG_DONTWAIT realistically.
+	 */
+	sctp_release_sock(sk);
+	*timeo_p = schedule_timeout(*timeo_p);
+	sctp_lock_sock(sk);
+
+ready:
+	finish_wait(sk_sleep(sk), &wait);
+	return 0;
+
+interrupted:
+	error = sock_intr_errno(*timeo_p);
+
+out:
+	finish_wait(sk_sleep(sk), &wait);
+	*err = error;
+	return error;
+}
+
+/* Receive a datagram.
+ * Note: This is pretty much the same routine as in core/datagram.c
+ * with a few changes to make lksctp work.
+ *
+ * With MSG_PEEK in flags the first skb is left on the receive queue and an
+ * extra reference is taken on it; otherwise the skb is dequeued.
+ *
+ * Returns the skb, or NULL when none could be obtained.  On the NULL paths
+ * *err holds the socket error, -EAGAIN when the receive timeout is zero,
+ * or whatever sctp_wait_for_packet() stored.  When the loop is left
+ * because RCV_SHUTDOWN is set, *err is not written by this function.
+ */
+static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
+					      int noblock, int *err)
+{
+	int error;
+	struct sk_buff *skb;
+	long timeo;
+
+	timeo = sock_rcvtimeo(sk, noblock);
+
+	SCTP_DEBUG_PRINTK("Timeout: timeo: %ld, MAX: %ld.\n",
+			  timeo, MAX_SCHEDULE_TIMEOUT);
+
+	do {
+		/* Again only user level code calls this function,
+		 * so nothing interrupt level
+		 * will suddenly eat the receive_queue.
+		 *
+		 *  Look at current nfs client by the way...
+		 *  However, this function was correct in any case. 8)
+		 */
+		if (flags & MSG_PEEK) {
+			/* Peek: keep the skb queued, just pin it. */
+			spin_lock_bh(&sk->sk_receive_queue.lock);
+			skb = skb_peek(&sk->sk_receive_queue);
+			if (skb)
+				atomic_inc(&skb->users);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
+		} else {
+			skb = skb_dequeue(&sk->sk_receive_queue);
+		}
+
+		if (skb)
+			return skb;
+
+		/* Caller is allowed not to check sk->sk_err before calling. */
+		error = sock_error(sk);
+		if (error)
+			goto no_packet;
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+
+		/* User doesn't want to wait.  */
+		error = -EAGAIN;
+		if (!timeo)
+			goto no_packet;
+	} while (sctp_wait_for_packet(sk, err, &timeo) == 0);
+
+	return NULL;
+
+no_packet:
+	*err = error;
+	return NULL;
+}
+
+/* If sndbuf has changed, wake up per association sndbuf waiters.
+ *
+ * Nothing happens unless the association has send buffer space left and
+ * the owning sock still has a struct socket attached.  Waiters on the
+ * association are woken first; the socket-level waiters and the async
+ * notification are only handled when the socket as a whole is writeable.
+ */
+static void __sctp_write_space(struct sctp_association *asoc)
+{
+	struct sock *sk = asoc->base.sk;
+	struct socket *sock = sk->sk_socket;
+	wait_queue_head_t *sleepers;
+
+	if (sctp_wspace(asoc) <= 0 || !sock)
+		return;
+
+	if (waitqueue_active(&asoc->wait))
+		wake_up_interruptible(&asoc->wait);
+
+	if (!sctp_writeable(sk))
+		return;
+
+	sleepers = sk_sleep(sk);
+	if (sleepers && waitqueue_active(sleepers))
+		wake_up_interruptible(sleepers);
+
+	/* Note that we try to include the Async I/O support
+	 * here by modeling from the current TCP/UDP code.
+	 * We have not tested with it yet.
+	 */
+	if (sk->sk_shutdown & SEND_SHUTDOWN)
+		return;
+
+	sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+}
+
+/* Do accounting for the sndbuf space.
+ * Decrement the used sndbuf space of the corresponding association by the
+ * data size which was just transmitted(freed).
+ *
+ * The chunk that owns the skb is recovered from a pointer stored at the
+ * start of skb->cb.  After the accounting is undone, writers waiting on
+ * the association are woken and one association reference is dropped.
+ */
+static void sctp_wfree(struct sk_buff *skb)
+{
+	struct sctp_association *asoc;
+	struct sctp_chunk *chunk;
+	struct sock *sk;
+
+	/* Get the saved chunk pointer.  */
+	chunk = *((struct sctp_chunk **)(skb->cb));
+	asoc = chunk->asoc;
+	sk = asoc->base.sk;
+	/* Per-association accounting: payload plus the skb and chunk
+	 * bookkeeping structures.
+	 */
+	asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
+				sizeof(struct sk_buff) +
+				sizeof(struct sctp_chunk);
+
+	atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
+
+	/*
+	 * This undoes what is done via sctp_set_owner_w and sk_mem_charge
+	 */
+	sk->sk_wmem_queued   -= skb->truesize;
+	sk_mem_uncharge(sk, skb->truesize);
+
+	sock_wfree(skb);
+	__sctp_write_space(asoc);
+
+	/* NOTE(review): presumably pairs with a hold taken when the skb was
+	 * charged to the association (sctp_set_owner_w) - confirm.
+	 */
+	sctp_association_put(asoc);
+}
+
+/* Do accounting for the receive space on the socket.
+ * Accounting for the association is done in ulpevent.c
+ * We set this as a destructor for the cloned data skbs so that
+ * accounting is done at the correct time.
+ *
+ * The amount released is the rmem_len recorded in the ulpevent that is
+ * associated with the skb, not the skb's own size.
+ */
+void sctp_sock_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct sctp_ulpevent *event = sctp_skb2event(skb);
+
+	atomic_sub(event->rmem_len, &sk->sk_rmem_alloc);
+
+	/*
+	 * Mimic the behavior of sock_rfree
+	 */
+	sk_mem_uncharge(sk, event->rmem_len);
+}
+
+
+/* Helper function to wait for space in the sndbuf.
+ *
+ * Sleeps, with the socket lock dropped while asleep, until the association
+ * has at least msg_len bytes of send buffer space.  *timeo_p is updated
+ * with the remaining timeout after every sleep.  A reference on the
+ * association is held for the duration of the wait.
+ *
+ * Returns 0 when enough space is available, -EAGAIN when *timeo_p is zero,
+ * sock_intr_errno(*timeo_p) when a signal is pending, and -EPIPE when the
+ * socket has an error, the association is shutting down or dead, or the
+ * association was moved to another socket while we slept.  The lock state
+ * of sk on return is the same as on entry.
+ */
+static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
+				size_t msg_len)
+{
+	struct sock *sk = asoc->base.sk;
+	int err = 0;
+	long current_timeo = *timeo_p;
+	DEFINE_WAIT(wait);
+
+	SCTP_DEBUG_PRINTK("wait_for_sndbuf: asoc=%p, timeo=%ld, msg_len=%zu\n",
+			  asoc, (long)(*timeo_p), msg_len);
+
+	/* Increment the association's refcnt.  */
+	sctp_association_hold(asoc);
+
+	/* Wait on the association specific sndbuf space. */
+	for (;;) {
+		prepare_to_wait_exclusive(&asoc->wait, &wait,
+					  TASK_INTERRUPTIBLE);
+		if (!*timeo_p)
+			goto do_nonblock;
+		if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING ||
+		    asoc->base.dead)
+			goto do_error;
+		if (signal_pending(current))
+			goto do_interrupted;
+		if (msg_len <= sctp_wspace(asoc))
+			break;
+
+		/* Let another process have a go.  Since we are going
+		 * to sleep anyway.
+		 */
+		sctp_release_sock(sk);
+		current_timeo = schedule_timeout(current_timeo);
+		sctp_lock_sock(sk);
+
+		/* The association may have been migrated to another socket
+		 * (e.g. peeled off) while the lock was dropped.  User space
+		 * can provoke that, so fail the wait instead of taking the
+		 * machine down with BUG_ON().  The test runs after the lock
+		 * is re-taken so that the caller's lock/unlock pairing on sk
+		 * stays balanced.
+		 */
+		if (sk != asoc->base.sk)
+			goto do_error;
+
+		*timeo_p = current_timeo;
+	}
+
+out:
+	finish_wait(&asoc->wait, &wait);
+
+	/* Release the association's refcnt.  */
+	sctp_association_put(asoc);
+
+	return err;
+
+do_error:
+	err = -EPIPE;
+	goto out;
+
+do_interrupted:
+	err = sock_intr_errno(*timeo_p);
+	goto out;
+
+do_nonblock:
+	err = -EAGAIN;
+	goto out;
+}
+
+/* Notify readers that data has arrived on sk.  The len argument is not
+ * used by this function.
+ */
+void sctp_data_ready(struct sock *sk, int len)
+{
+	struct socket_wq *wq;
+
+	/* sk->sk_wq is dereferenced inside an RCU read-side section. */
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	/* Only issue the wakeup when wq_has_sleeper() reports a waiter. */
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
+						POLLRDNORM | POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	rcu_read_unlock();
+}
+
+/* If socket sndbuf has changed, wake up all per association waiters.
+ *
+ * Every association on the socket's endpoint is handed to
+ * __sctp_write_space() in list order.
+ */
+void sctp_write_space(struct sock *sk)
+{
+	struct list_head *assoc_list = &sctp_sk(sk)->ep->asocs;
+	struct sctp_association *cur;
+
+	list_for_each_entry(cur, assoc_list, asocs)
+		__sctp_write_space(cur);
+}
+
+/* Is there any sndbuf space available on the socket?
+ *
+ * Note that sk_wmem_alloc is the sum of the send buffers on all of the
+ * associations on the same socket.  For a UDP-style socket with
+ * multiple associations, it is possible for it to be "unwriteable"
+ * prematurely.  I assume that this is acceptable because
+ * a premature "unwriteable" is better than an accidental "writeable" which
+ * would cause an unwanted block under certain circumstances.  For the 1-1
+ * UDP-style sockets or TCP-style sockets, this code should work.
+ *  - Daisy
+ */
+static int sctp_writeable(struct sock *sk)
+{
+	/* Free space is sk_sndbuf minus what is already allocated. */
+	int room = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+
+	/* Never report a negative amount. */
+	return room < 0 ? 0 : room;
+}
+
+/* Wait for an association to go into ESTABLISHED state. If timeout is 0,
+ * returns immediately with EINPROGRESS.
+ *
+ * The socket lock is dropped while sleeping and *timeo_p is updated with
+ * the remaining timeout after every sleep.  A reference on the association
+ * is held for the duration of the wait.
+ *
+ * Returns 0 once the association is ESTABLISHED or when the socket has
+ * RCV_SHUTDOWN set, -EINPROGRESS when *timeo_p is zero,
+ * sock_intr_errno(*timeo_p) when a signal is pending, and -ETIMEDOUT or
+ * -ECONNREFUSED when the socket has an error or the association is
+ * shutting down or dead.
+ */
+static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p)
+{
+	struct sock *sk = asoc->base.sk;
+	int err = 0;
+	long current_timeo = *timeo_p;
+	DEFINE_WAIT(wait);
+
+	SCTP_DEBUG_PRINTK("%s: asoc=%p, timeo=%ld\n", __func__, asoc,
+			  (long)(*timeo_p));
+
+	/* Increment the association's refcnt.  */
+	sctp_association_hold(asoc);
+
+	for (;;) {
+		prepare_to_wait_exclusive(&asoc->wait, &wait,
+					  TASK_INTERRUPTIBLE);
+		if (!*timeo_p)
+			goto do_nonblock;
+		/* Leaves the loop with err still 0. */
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			break;
+		if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING ||
+		    asoc->base.dead)
+			goto do_error;
+		if (signal_pending(current))
+			goto do_interrupted;
+
+		if (sctp_state(asoc, ESTABLISHED))
+			break;
+
+		/* Let another process have a go.  Since we are going
+		 * to sleep anyway.
+		 */
+		sctp_release_sock(sk);
+		current_timeo = schedule_timeout(current_timeo);
+		sctp_lock_sock(sk);
+
+		*timeo_p = current_timeo;
+	}
+
+out:
+	finish_wait(&asoc->wait, &wait);
+
+	/* Release the association's refcnt.  */
+	sctp_association_put(asoc);
+
+	return err;
+
+do_error:
+	/* Report a timeout once the INIT error counter has reached the
+	 * configured maximum number of attempts, a refusal otherwise.
+	 */
+	if (asoc->init_err_counter + 1 > asoc->max_init_attempts)
+		err = -ETIMEDOUT;
+	else
+		err = -ECONNREFUSED;
+	goto out;
+
+do_interrupted:
+	err = sock_intr_errno(*timeo_p);
+	goto out;
+
+do_nonblock:
+	err = -EINPROGRESS;
+	goto out;
+}
+
+/* Wait until the endpoint of a listening socket has at least one
+ * association.
+ *
+ * The socket lock is dropped while sleeping.  Returns 0 when an association
+ * is present, -EINVAL when the socket is no longer in the LISTENING state,
+ * sock_intr_errno(timeo) when a signal is pending and -EAGAIN when the
+ * timeout has run out.  The conditions are tested in that order.
+ */
+static int sctp_wait_for_accept(struct sock *sk, long timeo)
+{
+	struct sctp_endpoint *ep;
+	int err = 0;
+	DEFINE_WAIT(wait);
+
+	ep = sctp_sk(sk)->ep;
+
+
+	for (;;) {
+		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
+					  TASK_INTERRUPTIBLE);
+
+		/* Sleep only while there is nothing to accept. */
+		if (list_empty(&ep->asocs)) {
+			sctp_release_sock(sk);
+			timeo = schedule_timeout(timeo);
+			sctp_lock_sock(sk);
+		}
+
+		/* err is preset before each test so that a break leaves the
+		 * matching result in place.
+		 */
+		err = -EINVAL;
+		if (!sctp_sstate(sk, LISTENING))
+			break;
+
+		err = 0;
+		if (!list_empty(&ep->asocs))
+			break;
+
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+
+	finish_wait(sk_sleep(sk), &wait);
+
+	return err;
+}
+
+/* Wait until the socket's endpoint has no associations left.
+ *
+ * The wait ends early when a signal is pending or when the timeout has
+ * run out; no status is reported to the caller.  The socket lock is
+ * dropped while sleeping.
+ */
+static void sctp_wait_for_close(struct sock *sk, long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	do {
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		if (list_empty(&sctp_sk(sk)->ep->asocs))
+			break;
+		sctp_release_sock(sk);
+		timeout = schedule_timeout(timeout);
+		sctp_lock_sock(sk);
+	} while (!signal_pending(current) && timeout);
+
+	finish_wait(sk_sleep(sk), &wait);
+}
+
+/* Make sk the receive-side owner of skb and, recursively, of every skb on
+ * its fragment list.  The fragments are handled before skb itself.
+ */
+static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk)
+{
+	struct sk_buff *piece;
+
+	/* Don't forget the fragments; they only exist when data_len is
+	 * non-zero.
+	 */
+	if (skb->data_len) {
+		skb_walk_frags(skb, piece)
+			sctp_skb_set_owner_r_frag(piece, sk);
+	}
+
+	sctp_skb_set_owner_r(skb, sk);
+}
+
+/* Initialise newsk from sk.
+ *
+ * Socket-level settings (type, bound device, flags, buffer sizes, timeouts
+ * and so on) and the local inet addressing are copied from sk.  The peer
+ * port and the seed for the IP id come from asoc.  Multicast settings are
+ * not inherited; they are set to fixed values.
+ */
+void sctp_copy_sock(struct sock *newsk, struct sock *sk,
+		    struct sctp_association *asoc)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_sock *newinet;
+
+	newsk->sk_type = sk->sk_type;
+	newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
+	newsk->sk_flags = sk->sk_flags;
+	newsk->sk_no_check = sk->sk_no_check;
+	newsk->sk_reuse = sk->sk_reuse;
+
+	newsk->sk_shutdown = sk->sk_shutdown;
+	newsk->sk_destruct = inet_sock_destruct;
+	newsk->sk_family = sk->sk_family;
+	newsk->sk_protocol = IPPROTO_SCTP;
+	newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+	newsk->sk_sndbuf = sk->sk_sndbuf;
+	newsk->sk_rcvbuf = sk->sk_rcvbuf;
+	newsk->sk_lingertime = sk->sk_lingertime;
+	newsk->sk_rcvtimeo = sk->sk_rcvtimeo;
+	newsk->sk_sndtimeo = sk->sk_sndtimeo;
+
+	newinet = inet_sk(newsk);
+
+	/* Initialize sk's sport, dport, rcv_saddr and daddr for
+	 * getsockname() and getpeername()
+	 */
+	newinet->inet_sport = inet->inet_sport;
+	newinet->inet_saddr = inet->inet_saddr;
+	newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
+	newinet->inet_dport = htons(asoc->peer.port);
+	newinet->pmtudisc = inet->pmtudisc;
+	/* Seed the IP id from the association's next TSN mixed with jiffies. */
+	newinet->inet_id = asoc->next_tsn ^ jiffies;
+
+	newinet->uc_ttl = inet->uc_ttl;
+	/* Multicast state starts from fixed values rather than sk's. */
+	newinet->mc_loop = 1;
+	newinet->mc_ttl = 1;
+	newinet->mc_index = 0;
+	newinet->mc_list = NULL;
+}
+
+/* Populate the fields of the newsk from the oldsk and migrate the assoc
+ * and its messages to the newsk.
+ *
+ * Steps, in order: copy buffer sizes and the protocol-private socket state,
+ * hook newsk into the same bind bucket as oldsk, duplicate the bind address
+ * list, move queued events that belong to assoc (receive queue and partial
+ * delivery lobby), re-own the association's reassembly and ordering queues,
+ * and finally migrate the association itself while newsk is locked.
+ * type is stored as the new socket's SCTP socket type.
+ */
+static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
+			      struct sctp_association *assoc,
+			      sctp_socket_type_t type)
+{
+	struct sctp_sock *oldsp = sctp_sk(oldsk);
+	struct sctp_sock *newsp = sctp_sk(newsk);
+	struct sctp_bind_bucket *pp; /* hash list port iterator */
+	struct sctp_endpoint *newep = newsp->ep;
+	struct sk_buff *skb, *tmp;
+	struct sctp_ulpevent *event;
+	struct sctp_bind_hashbucket *head;
+
+	/* Migrate socket buffer sizes and all the socket level options to the
+	 * new socket.
+	 */
+	newsk->sk_sndbuf = oldsk->sk_sndbuf;
+	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
+	/* Brute force copy old sctp opt. */
+	inet_sk_copy_descendant(newsk, oldsk);
+
+	/* Restore the ep value that was overwritten with the above structure
+	 * copy.
+	 */
+	newsp->ep = newep;
+	/* The hmac pointer copied from oldsk must not be shared. */
+	newsp->hmac = NULL;
+
+	/* Hook this new socket in to the bind_hash list. */
+	head = &sctp_port_hashtable[sctp_phashfn(inet_sk(oldsk)->inet_num)];
+	sctp_local_bh_disable();
+	sctp_spin_lock(&head->lock);
+	pp = sctp_sk(oldsk)->bind_hash;
+	sk_add_bind_node(newsk, &pp->owner);
+	sctp_sk(newsk)->bind_hash = pp;
+	inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num;
+	sctp_spin_unlock(&head->lock);
+	sctp_local_bh_enable();
+
+	/* Copy the bind_addr list from the original endpoint to the new
+	 * endpoint so that we can handle restarts properly
+	 *
+	 * NOTE(review): the result of sctp_bind_addr_dup() is not checked
+	 * here.  It takes a gfp mask, so it can presumably fail - confirm
+	 * whether a failure needs handling.
+	 */
+	sctp_bind_addr_dup(&newsp->ep->base.bind_addr,
+				&oldsp->ep->base.bind_addr, GFP_KERNEL);
+
+	/* Move any messages in the old socket's receive queue that are for the
+	 * peeled off association to the new socket's receive queue.
+	 */
+	sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
+		event = sctp_skb2event(skb);
+		if (event->asoc == assoc) {
+			__skb_unlink(skb, &oldsk->sk_receive_queue);
+			__skb_queue_tail(&newsk->sk_receive_queue, skb);
+			sctp_skb_set_owner_r_frag(skb, newsk);
+		}
+	}
+
+	/* Clean up any messages pending delivery due to partial
+	 * delivery.   Three cases:
+	 * 1) No partial deliver;  no work.
+	 * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby.
+	 * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue.
+	 */
+	skb_queue_head_init(&newsp->pd_lobby);
+	atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode);
+
+	if (atomic_read(&sctp_sk(oldsk)->pd_mode)) {
+		struct sk_buff_head *queue;
+
+		/* Decide which queue to move pd_lobby skbs to. */
+		if (assoc->ulpq.pd_mode) {
+			queue = &newsp->pd_lobby;
+		} else
+			queue = &newsk->sk_receive_queue;
+
+		/* Walk through the pd_lobby, looking for skbs that
+		 * need moved to the new socket.
+		 */
+		sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
+			event = sctp_skb2event(skb);
+			if (event->asoc == assoc) {
+				__skb_unlink(skb, &oldsp->pd_lobby);
+				__skb_queue_tail(queue, skb);
+				sctp_skb_set_owner_r_frag(skb, newsk);
+			}
+		}
+
+		/* Clear up any skbs waiting for the partial
+		 * delivery to finish.
+		 */
+		if (assoc->ulpq.pd_mode)
+			sctp_clear_pd(oldsk, NULL);
+
+	}
+
+	/* Events still held in the association's reassembly and ordering
+	 * queues stay where they are; only their owning socket changes.
+	 */
+	sctp_skb_for_each(skb, &assoc->ulpq.reasm, tmp)
+		sctp_skb_set_owner_r_frag(skb, newsk);
+
+	sctp_skb_for_each(skb, &assoc->ulpq.lobby, tmp)
+		sctp_skb_set_owner_r_frag(skb, newsk);
+
+	/* Set the type of socket to indicate that it is peeled off from the
+	 * original UDP-style socket or created with the accept() call on a
+	 * TCP-style socket..
+	 */
+	newsp->type = type;
+
+	/* Mark the new socket "in-use" by the user so that any packets
+	 * that may arrive on the association after we've moved it are
+	 * queued to the backlog.  This prevents a potential race between
+	 * backlog processing on the old socket and new-packet processing
+	 * on the new socket.
+	 *
+	 * The caller has just allocated newsk so we can guarantee that other
+	 * paths won't try to lock it and then oldsk.
+	 */
+	lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
+	sctp_assoc_migrate(assoc, newsk);
+
+	/* If the association on the newsk is already closed before accept()
+	 * is called, set RCV_SHUTDOWN flag.
+	 */
+	if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP))
+		newsk->sk_shutdown |= RCV_SHUTDOWN;
+
+	newsk->sk_state = SCTP_SS_ESTABLISHED;
+	sctp_release_sock(newsk);
+}
+
+
+/* This proto struct describes the ULP interface for SCTP.
+ *
+ * It wires the generic socket operations to the sctp_* handlers, sizes
+ * each socket as a struct sctp_sock and points the memory accounting hooks
+ * at the SCTP sysctl limits and counters.
+ */
+struct proto sctp_prot = {
+	.name        =	"SCTP",
+	.owner       =	THIS_MODULE,
+	.close       =	sctp_close,
+	.connect     =	sctp_connect,
+	.disconnect  =	sctp_disconnect,
+	.accept      =	sctp_accept,
+	.ioctl       =	sctp_ioctl,
+	.init        =	sctp_init_sock,
+	.destroy     =	sctp_destroy_sock,
+	.shutdown    =	sctp_shutdown,
+	.setsockopt  =	sctp_setsockopt,
+	.getsockopt  =	sctp_getsockopt,
+	.sendmsg     =	sctp_sendmsg,
+	.recvmsg     =	sctp_recvmsg,
+	.bind        =	sctp_bind,
+	.backlog_rcv =	sctp_backlog_rcv,
+	.hash        =	sctp_hash,
+	.unhash      =	sctp_unhash,
+	.get_port    =	sctp_get_port,
+	.obj_size    =  sizeof(struct sctp_sock),
+	.sysctl_mem  =  sysctl_sctp_mem,
+	.sysctl_rmem =  sysctl_sctp_rmem,
+	.sysctl_wmem =  sysctl_sctp_wmem,
+	.memory_pressure = &sctp_memory_pressure,
+	.enter_memory_pressure = sctp_enter_memory_pressure,
+	.memory_allocated = &sctp_memory_allocated,
+	.sockets_allocated = &sctp_sockets_allocated,
+};
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+/* IPv6 counterpart of sctp_prot.  It uses the same handlers and accounting
+ * hooks; only the name and the per-socket object size (struct sctp6_sock)
+ * differ.
+ */
+struct proto sctpv6_prot = {
+	.name		= "SCTPv6",
+	.owner		= THIS_MODULE,
+	.close		= sctp_close,
+	.connect	= sctp_connect,
+	.disconnect	= sctp_disconnect,
+	.accept		= sctp_accept,
+	.ioctl		= sctp_ioctl,
+	.init		= sctp_init_sock,
+	.destroy	= sctp_destroy_sock,
+	.shutdown	= sctp_shutdown,
+	.setsockopt	= sctp_setsockopt,
+	.getsockopt	= sctp_getsockopt,
+	.sendmsg	= sctp_sendmsg,
+	.recvmsg	= sctp_recvmsg,
+	.bind		= sctp_bind,
+	.backlog_rcv	= sctp_backlog_rcv,
+	.hash		= sctp_hash,
+	.unhash		= sctp_unhash,
+	.get_port	= sctp_get_port,
+	.obj_size	= sizeof(struct sctp6_sock),
+	.sysctl_mem	= sysctl_sctp_mem,
+	.sysctl_rmem	= sysctl_sctp_rmem,
+	.sysctl_wmem	= sysctl_sctp_wmem,
+	.memory_pressure = &sctp_memory_pressure,
+	.enter_memory_pressure = sctp_enter_memory_pressure,
+	.memory_allocated = &sctp_memory_allocated,
+	.sockets_allocated = &sctp_sockets_allocated,
+};
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
new file mode 100644
index 00000000..442ad4ed
--- /dev/null
+++ b/net/sctp/ssnmap.c
@@ -0,0 +1,133 @@
+/* SCTP kernel implementation
+ * Copyright (c) 2003 International Business Machines, Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp SSN tracker.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Largest map, in bytes, that sctp_ssnmap_new() allocates with kmalloc();
+ * anything bigger is taken from the page allocator instead.
+ */
+#define MAX_KMALLOC_SIZE	131072
+
+/* Defined further down; declared here for sctp_ssnmap_new(). */
+static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
+					    __u16 out);
+
+/* Number of bytes a map needs: the sctp_ssnmap structure itself (which
+ * carries the in and out stream headers) followed by one __u16 SSN slot
+ * for every inbound and every outbound stream.
+ */
+static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
+{
+	size_t slots = in + out;
+
+	return sizeof(struct sctp_ssnmap) + slots * sizeof(__u16);
+}
+
+
+/* Create a new sctp_ssnmap.
+ * Allocate room for the map header plus SSN slots for 'in' inbound and
+ * 'out' outbound streams.  Maps of up to MAX_KMALLOC_SIZE bytes come from
+ * kmalloc(), larger ones from the page allocator.
+ *
+ * Returns the initialised map, marked as malloced, or NULL on failure.
+ */
+struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
+				    gfp_t gfp)
+{
+	int bytes = sctp_ssnmap_size(in, out);
+	int use_pages = bytes > MAX_KMALLOC_SIZE;
+	struct sctp_ssnmap *map;
+
+	if (use_pages)
+		map = (struct sctp_ssnmap *)
+			__get_free_pages(gfp, get_order(bytes));
+	else
+		map = kmalloc(bytes, gfp);
+	if (!map)
+		return NULL;
+
+	if (!sctp_ssnmap_init(map, in, out)) {
+		/* Give the memory back to the allocator it came from. */
+		if (use_pages)
+			free_pages((unsigned long)map, get_order(bytes));
+		else
+			kfree(map);
+		return NULL;
+	}
+
+	map->malloced = 1;
+	SCTP_DBG_OBJCNT_INC(ssnmap);
+
+	return map;
+}
+
+
+/* Initialize a block of memory as a ssnmap.
+ *
+ * The whole block (header and SSN slots) is zeroed.  The inbound SSN array
+ * starts right behind the map header and the outbound array follows the
+ * inbound one.  Returns map.
+ */
+static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
+					    __u16 out)
+{
+	__u16 *slots = (__u16 *)(map + 1);
+
+	memset(map, 0x00, sctp_ssnmap_size(in, out));
+
+	map->in.ssn = slots;
+	map->in.len = in;
+
+	map->out.ssn = slots + in;
+	map->out.len = out;
+
+	return map;
+}
+
+/* Clear out the ssnmap streams.
+ *
+ * One memset starting at the inbound array covers the inbound and the
+ * outbound SSN slots; the stream headers themselves are left alone.
+ */
+void sctp_ssnmap_clear(struct sctp_ssnmap *map)
+{
+	memset(map->in.ssn, 0x00,
+	       (map->in.len + map->out.len) * sizeof(__u16));
+}
+
+/* Dispose of a ssnmap.
+ *
+ * Does nothing for a NULL map or one that is not marked as malloced.
+ * The map size is recomputed from the stored stream counts to pick the
+ * allocator the memory has to go back to.
+ */
+void sctp_ssnmap_free(struct sctp_ssnmap *map)
+{
+	int bytes;
+
+	if (!map || !map->malloced)
+		return;
+
+	bytes = sctp_ssnmap_size(map->in.len, map->out.len);
+	if (bytes > MAX_KMALLOC_SIZE)
+		free_pages((unsigned long)map, get_order(bytes));
+	else
+		kfree(map);
+	SCTP_DBG_OBJCNT_DEC(ssnmap);
+}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
new file mode 100644
index 00000000..6752f489
--- /dev/null
+++ b/net/sctp/sysctl.c
@@ -0,0 +1,289 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2002, 2004
+ * Copyright (c) 2002 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * Sysctl related interfaces for SCTP.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Mingqin Liu           <liuming@us.ibm.com>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Ardelle Fan           <ardelle.fan@intel.com>
+ *    Ryan Layer            <rmlayer@us.ibm.com>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
+#include <linux/sysctl.h>
+
+/* Range limits referenced through .extra1/.extra2 by the entries of
+ * sctp_table below.
+ */
+static int zero = 0;
+static int one = 1;
+static int timer_max = 86400000; /* ms in one day */
+static int int_max = INT_MAX;
+static int sack_timer_min = 1;
+static int sack_timer_max = 500;
+static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
+static int rwnd_scale_max = 16;
+static unsigned long max_autoclose_min = 0;
+/* MAX_SCHEDULE_TIMEOUT / HZ, capped at UINT_MAX. */
+static unsigned long max_autoclose_max =
+	(MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX)
+	? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ;
+
+/* Memory limit arrays exported through sctp_table; defined outside this
+ * file.
+ */
+extern long sysctl_sctp_mem[3];
+extern int sysctl_sctp_rmem[3];
+extern int sysctl_sctp_wmem[3];
+
+/* SCTP sysctl entries, registered under the path given by sctp_path.
+ *
+ * Entries handled by proc_dointvec_minmax / proc_doulongvec_minmax that
+ * set .extra1 and .extra2 are range-limited by the values those pointers
+ * refer to.  The two rto_*_exp_divisor entries are read-only (mode 0444).
+ */
+static ctl_table sctp_table[] = {
+	{
+		.procname	= "rto_initial",
+		.data		= &sctp_rto_initial,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &one,
+		.extra2         = &timer_max
+	},
+	{
+		.procname	= "rto_min",
+		.data		= &sctp_rto_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &one,
+		.extra2         = &timer_max
+	},
+	{
+		.procname	= "rto_max",
+		.data		= &sctp_rto_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &one,
+		.extra2         = &timer_max
+	},
+	{
+		.procname	= "valid_cookie_life",
+		.data		= &sctp_valid_cookie_life,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &one,
+		.extra2         = &timer_max
+	},
+	{
+		.procname	= "max_burst",
+		.data		= &sctp_max_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max
+	},
+	{
+		.procname	= "association_max_retrans",
+		.data		= &sctp_max_retrans_association,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &int_max
+	},
+	{
+		.procname	= "sndbuf_policy",
+		.data		= &sctp_sndbuf_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rcvbuf_policy",
+		.data		= &sctp_rcvbuf_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "path_max_retrans",
+		.data		= &sctp_max_retrans_path,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &int_max
+	},
+	{
+		.procname	= "max_init_retransmits",
+		.data		= &sctp_max_retrans_init,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &int_max
+	},
+	{
+		.procname	= "hb_interval",
+		.data		= &sctp_hb_interval,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &one,
+		.extra2         = &timer_max
+	},
+	{
+		.procname	= "cookie_preserve_enable",
+		.data		= &sctp_cookie_preserve_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rto_alpha_exp_divisor",
+		.data		= &sctp_rto_alpha,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rto_beta_exp_divisor",
+		.data		= &sctp_rto_beta,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "addip_enable",
+		.data		= &sctp_addip_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "prsctp_enable",
+		.data		= &sctp_prsctp_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sack_timeout",
+		.data		= &sctp_sack_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &sack_timer_min,
+		.extra2         = &sack_timer_max,
+	},
+	{
+		/* Array of longs, hence the unsigned long vector handler. */
+		.procname	= "sctp_mem",
+		.data		= &sysctl_sctp_mem,
+		.maxlen		= sizeof(sysctl_sctp_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax
+	},
+	{
+		.procname	= "sctp_rmem",
+		.data		= &sysctl_sctp_rmem,
+		.maxlen		= sizeof(sysctl_sctp_rmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sctp_wmem",
+		.data		= &sysctl_sctp_wmem,
+		.maxlen		= sizeof(sysctl_sctp_wmem),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "auth_enable",
+		.data		= &sctp_auth_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "addip_noauth_enable",
+		.data		= &sctp_addip_noauth,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "addr_scope_policy",
+		.data		= &sctp_scope_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &addr_scope_max,
+	},
+	{
+		.procname	= "rwnd_update_shift",
+		.data		= &sctp_rwnd_upd_shift,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &rwnd_scale_max,
+	},
+	{
+		.procname	= "max_autoclose",
+		.data		= &sctp_max_autoclose,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.extra1		= &max_autoclose_min,
+		.extra2		= &max_autoclose_max,
+	},
+
+	{ /* sentinel */ }
+};
+
+/* Path components under which sctp_table is registered: "net", then
+ * "sctp".  The empty entry terminates the list.
+ */
+static struct ctl_path sctp_path[] = {
+	{ .procname = "net", },
+	{ .procname = "sctp", },
+	{ }
+};
+
+/* Handle returned by register_sysctl_paths(); kept for unregistration. */
+static struct ctl_table_header * sctp_sysctl_header;
+
+/* Sysctl registration.
+ * Registers sctp_table under sctp_path and stores the returned header in
+ * sctp_sysctl_header.  The result is not checked here.
+ */
+void sctp_sysctl_register(void)
+{
+	sctp_sysctl_header = register_sysctl_paths(sctp_path, sctp_table);
+}
+
+/* Sysctl deregistration.
+ * Unregisters the table through the header stored by
+ * sctp_sysctl_register().
+ */
+void sctp_sysctl_unregister(void)
+{
+	unregister_sysctl_table(sctp_sysctl_header);
+}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
new file mode 100644
index 00000000..8da4481e
--- /dev/null
+++ b/net/sctp/transport.c
@@ -0,0 +1,626 @@
+/* SCTP kernel implementation
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001-2003 International Business Machines Corp.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * This module provides the abstraction for an SCTP transport representing
+ * a remote transport address.  For local transport addresses, we just use
+ * union sctp_addr.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Xingang Guo           <xingang.guo@intel.com>
+ *    Hui Huang             <hui.huang@nokia.com>
+ *    Sridhar Samudrala	    <sri@us.ibm.com>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* 1st Level Abstractions.  */
+
+/* Set up a transport in caller-provided memory; always returns 'peer'.  */
+static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
+						  const union sctp_addr *addr,
+						  gfp_t gfp)
+{
+	/* Copy in the address; the source address starts out zeroed.  */
+	peer->ipaddr = *addr;
+	peer->af_specific = sctp_get_af_specific(addr->sa.sa_family);
+	memset(&peer->saddr, 0, sizeof(union sctp_addr));
+
+	/* From 6.3.1 RTO Calculation:
+	 *
+	 * C1) Until an RTT measurement has been made for a packet sent to the
+	 * given destination transport address, set RTO to the protocol
+	 * parameter 'RTO.Initial'.
+	 */
+	peer->rto = msecs_to_jiffies(sctp_rto_initial);
+
+	peer->last_time_heard = jiffies;
+	peer->last_time_ecne_reduced = jiffies;
+
+	peer->param_flags = SPP_HB_DISABLE |
+			    SPP_PMTUD_ENABLE |
+			    SPP_SACKDELAY_ENABLE;
+
+	/* Initialize the default path max_retrans.  */
+	peer->pathmaxrxt  = sctp_max_retrans_path;
+
+	INIT_LIST_HEAD(&peer->transmitted);
+	INIT_LIST_HEAD(&peer->send_ready);
+	INIT_LIST_HEAD(&peer->transports);
+
+	setup_timer(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event,
+			(unsigned long)peer);
+	setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
+			(unsigned long)peer);
+	setup_timer(&peer->proto_unreach_timer,
+		    sctp_generate_proto_unreach_event, (unsigned long)peer);
+
+	/* Initialize the 64-bit random nonce sent with heartbeat. */
+	get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce));
+
+	atomic_set(&peer->refcnt, 1);
+
+	return peer;
+}
+
+/* Allocate a transport for 'addr' and initialize it.
+ * Returns the new transport, or NULL if allocation or initialization
+ * fails.
+ */
+struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
+					  gfp_t gfp)
+{
+	struct sctp_transport *transport;
+
+	transport = t_new(struct sctp_transport, gfp);
+	if (!transport)
+		return NULL;
+
+	/* Give the memory back if the transport cannot be initialized.  */
+	if (!sctp_transport_init(transport, addr, gfp)) {
+		kfree(transport);
+		return NULL;
+	}
+
+	transport->malloced = 1;
+	SCTP_DBG_OBJCNT_INC(transport);
+
+	return transport;
+}
+
+/* This transport is no longer needed.  Free up if possible, or
+ * delay until its last reference is dropped.
+ */
+void sctp_transport_free(struct sctp_transport *transport)
+{
+	transport->dead = 1;
+
+	/* Try to delete the heartbeat timer.  */
+	if (del_timer(&transport->hb_timer))
+		sctp_transport_put(transport);
+
+	/* Delete the T3_rtx timer if it's active.
+	 * There is no point in not doing this now and letting
+	 * structure hang around in memory since we know
+	 * the transport is going away.
+	 */
+	if (timer_pending(&transport->T3_rtx_timer) &&
+	    del_timer(&transport->T3_rtx_timer))
+		sctp_transport_put(transport);
+
+	/* Delete the ICMP proto unreachable timer if it's active. */
+	if (timer_pending(&transport->proto_unreach_timer) &&
+	    del_timer(&transport->proto_unreach_timer))
+		sctp_association_put(transport->asoc);
+
+	sctp_transport_put(transport);
+}
+
+/* Destroy the transport data structure; it must already be marked dead.
+ * Assumes there are no more users of this structure.
+ */
+static void sctp_transport_destroy(struct sctp_transport *transport)
+{
+	SCTP_ASSERT(transport->dead, "Transport is not dead", return);
+
+	if (transport->asoc)
+		sctp_association_put(transport->asoc);
+
+	sctp_packet_free(&transport->packet);
+
+	dst_release(transport->dst);
+	kfree(transport);
+	SCTP_DBG_OBJCNT_DEC(transport);
+}
+
+/* Arm the T3_rtx timer unless it is already pending, and push the heartbeat
+ * timer out again.  Meant to run each time a DATA chunk is sent.
+ */
+void sctp_transport_reset_timers(struct sctp_transport *transport)
+{
+	/* RFC 2960 6.3.2 Retransmission Timer Rules
+	 *
+	 * R1) Every time a DATA chunk is sent to any address (including a
+	 * retransmission), if the T3-rtx timer of that address is not running
+	 * start it running so that it will expire after the RTO of that
+	 * address.
+	 *
+	 * A hold is taken whenever mod_timer() returns zero.
+	 */
+	if (!timer_pending(&transport->T3_rtx_timer) &&
+	    !mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto))
+		sctp_transport_hold(transport);
+
+	/* When a data chunk is sent, reset the heartbeat interval.  */
+	if (!mod_timer(&transport->hb_timer,
+		       sctp_transport_timeout(transport)))
+		sctp_transport_hold(transport);
+}
+
+/* Attach a transport to its owning association.
+ * The transport records the association and takes a reference on it via
+ * sctp_association_hold().
+ */
+void sctp_transport_set_owner(struct sctp_transport *transport,
+			      struct sctp_association *asoc)
+{
+	sctp_association_hold(asoc);
+	transport->asoc = asoc;
+}
+
+/* Set transport->pathmtu from its route, refreshing the route if needed. */
+void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
+{
+	struct dst_entry *cached = transport->dst;
+
+	/* A missing or obsolete cached route is released and looked up again. */
+	if (!cached || cached->obsolete > 1) {
+		dst_release(cached);
+		transport->af_specific->get_dst(transport, &transport->saddr,
+						&transport->fl, sk);
+	}
+
+	transport->pathmtu = transport->dst ? dst_mtu(transport->dst) :
+					      SCTP_DEFAULT_MAXSEGMENT;
+}
+
+void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
+{
+	struct dst_entry *dst;
+
+	if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
+		pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
+			__func__, pmtu,
+			SCTP_DEFAULT_MINSEGMENT);
+		/* Clamp the transport's path MTU to the default minimum;
+		 * the dst below is still updated with the reported pmtu.
+		 */
+		t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
+	} else {
+		t->pathmtu = pmtu;
+	}
+
+	dst = sctp_transport_dst_check(t);
+	if (dst)
+		dst->ops->update_pmtu(dst, pmtu);
+}
+
+/* Caches the dst entry and source address for a transport's destination
+ * address, then refreshes pathmtu unless PMTUD is disabled and it is set.
+ */
+void sctp_transport_route(struct sctp_transport *transport,
+			  union sctp_addr *saddr, struct sctp_sock *opt)
+{
+	struct sctp_association *asoc = transport->asoc;
+	struct sctp_af *af = transport->af_specific;
+
+	af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt));
+
+	if (saddr)
+		memcpy(&transport->saddr, saddr, sizeof(union sctp_addr));
+	else
+		af->get_saddr(opt, transport, &transport->fl);
+
+	if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
+		return;
+	}
+	if (transport->dst) {
+		transport->pathmtu = dst_mtu(transport->dst);
+
+		/* Initialize sk->sk_rcv_saddr, if the transport is the
+		 * association's active path for getsockname().
+		 */
+		if (asoc && (!asoc->peer.primary_path ||
+				(transport == asoc->peer.active_path)))
+			opt->pf->af->to_sk_saddr(&transport->saddr,
+						 asoc->base.sk);
+	} else
+		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+}
+
+/* Take one more reference on a transport (increments refcnt).  */
+void sctp_transport_hold(struct sctp_transport *transport)
+{
+	atomic_inc(&transport->refcnt);
+}
+
+/* Drop a reference; the last one out destroys the transport.  */
+void sctp_transport_put(struct sctp_transport *transport)
+{
+	if (!atomic_dec_and_test(&transport->refcnt))
+		return;
+
+	sctp_transport_destroy(transport);
+}
+
+/* Fold a new RTT sample into srtt/rttvar and recompute the clamped RTO. */
+void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
+{
+	/* Check for valid transport.  */
+	SCTP_ASSERT(tp, "NULL transport", return);
+
+	/* We should not be doing any RTO updates unless rto_pending is set.  */
+	SCTP_ASSERT(tp->rto_pending, "rto_pending not set", return);
+
+	if (tp->rttvar || tp->srtt) {
+		/* 6.3.1 C3) When a new RTT measurement R' is made, set
+		 * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'|
+		 * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R'
+		 */
+
+		/* Note:  The above algorithm has been rewritten to
+		 * express rto_beta and rto_alpha as inverse powers
+		 * of two.
+		 * For example, assuming the default value of RTO.Alpha of
+		 * 1/8, rto_alpha would be expressed as 3.
+		 */
+		tp->rttvar = tp->rttvar - (tp->rttvar >> sctp_rto_beta)
+			+ ((abs(tp->srtt - rtt)) >> sctp_rto_beta);
+		tp->srtt = tp->srtt - (tp->srtt >> sctp_rto_alpha)
+			+ (rtt >> sctp_rto_alpha);
+	} else {
+		/* 6.3.1 C2) When the first RTT measurement R is made, set
+		 * SRTT <- R, RTTVAR <- R/2.
+		 */
+		tp->srtt = rtt;
+		tp->rttvar = rtt >> 1;
+	}
+
+	/* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then
+	 * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY.
+	 */
+	if (tp->rttvar == 0)
+		tp->rttvar = SCTP_CLOCK_GRANULARITY;
+
+	/* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */
+	tp->rto = tp->srtt + (tp->rttvar << 2);
+
+	/* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min
+	 * seconds then it is rounded up to RTO.Min seconds.
+	 */
+	if (tp->rto < tp->asoc->rto_min)
+		tp->rto = tp->asoc->rto_min;
+
+	/* 6.3.1 C7) A maximum value may be placed on RTO provided it is
+	 * at least RTO.max seconds.
+	 */
+	if (tp->rto > tp->asoc->rto_max)
+		tp->rto = tp->asoc->rto_max;
+
+	tp->rtt = rtt;
+
+	/* Reset rto_pending so that a new RTT measurement is started when a
+	 * new data chunk is sent.
+	 */
+	tp->rto_pending = 0;
+
+	SCTP_DEBUG_PRINTK("%s: transport: %p, rtt: %d, srtt: %d "
+			  "rttvar: %d, rto: %ld\n", __func__,
+			  tp, rtt, tp->srtt, tp->rttvar, tp->rto);
+}
+
+/* This routine updates the transport's cwnd and partial_bytes_acked
+ * parameters based on the bytes acked in the received SACK.
+ */
+void sctp_transport_raise_cwnd(struct sctp_transport *transport,
+			       __u32 sack_ctsn, __u32 bytes_acked)
+{
+	struct sctp_association *asoc = transport->asoc;
+	__u32 cwnd, ssthresh, flight_size, pba, pmtu;
+
+	cwnd = transport->cwnd;
+	flight_size = transport->flight_size;
+
+	/* See if we need to exit Fast Recovery first */
+	if (asoc->fast_recovery &&
+	    TSN_lte(asoc->fast_recovery_exit, sack_ctsn))
+		asoc->fast_recovery = 0;
+
+	/* The appropriate cwnd increase algorithm is performed if, and only
+	 * if the cumulative TSN has advanced and the congestion window is
+	 * being fully utilized.
+	 */
+	if (TSN_lte(sack_ctsn, transport->asoc->ctsn_ack_point) ||
+	    (flight_size < cwnd))
+		return;
+
+	ssthresh = transport->ssthresh;
+	pba = transport->partial_bytes_acked;
+	pmtu = transport->asoc->pathmtu;
+
+	if (cwnd <= ssthresh) {
+		/* RFC 4960 7.2.1
+		 * o  When cwnd is less than or equal to ssthresh, an SCTP
+		 *    endpoint MUST use the slow-start algorithm to increase
+		 *    cwnd only if the current congestion window is being fully
+		 *    utilized, an incoming SACK advances the Cumulative TSN
+		 *    Ack Point, and the data sender is not in Fast Recovery.
+		 *    Only when these three conditions are met can the cwnd be
+		 *    increased; otherwise, the cwnd MUST not be increased.
+		 *    If these conditions are met, then cwnd MUST be increased
+		 *    by, at most, the lesser of 1) the total size of the
+		 *    previously outstanding DATA chunk(s) acknowledged, and
+		 *    2) the destination's path MTU.  This upper bound protects
+		 *    against the ACK-Splitting attack outlined in [SAVAGE99].
+		 */
+		if (asoc->fast_recovery)
+			return;
+
+		if (bytes_acked > pmtu)
+			cwnd += pmtu;
+		else
+			cwnd += bytes_acked;
+		SCTP_DEBUG_PRINTK("%s: SLOW START: transport: %p, "
+				  "bytes_acked: %d, cwnd: %d, ssthresh: %d, "
+				  "flight_size: %d, pba: %d\n",
+				  __func__,
+				  transport, bytes_acked, cwnd,
+				  ssthresh, flight_size, pba);
+	} else {
+		/* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh,
+		 * upon each SACK arrival that advances the Cumulative TSN Ack
+		 * Point, increase partial_bytes_acked by the total number of
+		 * bytes of all new chunks acknowledged in that SACK including
+		 * chunks acknowledged by the new Cumulative TSN Ack and by
+		 * Gap Ack Blocks.
+		 *
+		 * When partial_bytes_acked is equal to or greater than cwnd
+		 * and before the arrival of the SACK the sender had cwnd or
+		 * more bytes of data outstanding (i.e., before arrival of the
+		 * SACK, flightsize was greater than or equal to cwnd),
+		 * increase cwnd by MTU, and reset partial_bytes_acked to
+		 * (partial_bytes_acked - cwnd).  NOTE: code uses the raised cwnd.
+		 */
+		pba += bytes_acked;
+		if (pba >= cwnd) {
+			cwnd += pmtu;
+			pba = ((cwnd < pba) ? (pba - cwnd) : 0);
+		}
+		SCTP_DEBUG_PRINTK("%s: CONGESTION AVOIDANCE: "
+				  "transport: %p, bytes_acked: %d, cwnd: %d, "
+				  "ssthresh: %d, flight_size: %d, pba: %d\n",
+				  __func__,
+				  transport, bytes_acked, cwnd,
+				  ssthresh, flight_size, pba);
+	}
+
+	transport->cwnd = cwnd;
+	transport->partial_bytes_acked = pba;
+}
+
+/* This routine is used to lower the transport's cwnd when congestion is
+ * detected.
+ */
+void sctp_transport_lower_cwnd(struct sctp_transport *transport,
+			       sctp_lower_cwnd_t reason)
+{
+	struct sctp_association *asoc = transport->asoc;
+
+	switch (reason) {
+	case SCTP_LOWER_CWND_T3_RTX:
+		/* RFC 2960 Section 7.2.3, sctpimpguide
+		 * When the T3-rtx timer expires on an address, SCTP should
+		 * perform slow start by:
+		 *      ssthresh = max(cwnd/2, 4*MTU)
+		 *      cwnd = 1*MTU
+		 *      partial_bytes_acked = 0
+		 */
+		transport->ssthresh = max(transport->cwnd/2,
+					  4*asoc->pathmtu);
+		transport->cwnd = asoc->pathmtu;
+
+		/* T3-rtx also clears fast recovery */
+		asoc->fast_recovery = 0;
+		break;
+
+	case SCTP_LOWER_CWND_FAST_RTX:
+		/* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the
+		 * destination address(es) to which the missing DATA chunks
+		 * were last sent, according to the formula described in
+		 * Section 7.2.3.
+		 *
+		 * RFC 2960 7.2.3, sctpimpguide Upon detection of packet
+		 * losses from SACK (see Section 7.2.4), An endpoint
+		 * should do the following:
+		 *      ssthresh = max(cwnd/2, 4*MTU)
+		 *      cwnd = ssthresh
+		 *      partial_bytes_acked = 0
+		 */
+		if (asoc->fast_recovery)
+			return;
+
+		/* Mark Fast recovery */
+		asoc->fast_recovery = 1;
+		asoc->fast_recovery_exit = asoc->next_tsn - 1;
+
+		transport->ssthresh = max(transport->cwnd/2,
+					  4*asoc->pathmtu);
+		transport->cwnd = transport->ssthresh;
+		break;
+
+	case SCTP_LOWER_CWND_ECNE:
+		/* RFC 2481 Section 6.1.2.
+		 * If the sender receives an ECN-Echo ACK packet
+		 * then the sender knows that congestion was encountered in the
+		 * network on the path from the sender to the receiver. The
+		 * indication of congestion should be treated just as a
+		 * congestion loss in non-ECN Capable TCP. That is, the TCP
+		 * source halves the congestion window "cwnd" and reduces the
+		 * slow start threshold "ssthresh".
+		 * A critical condition is that TCP does not react to
+		 * congestion indications more than once every window of
+		 * data (or more loosely more than once every round-trip time).
+		 */
+		if (time_after(jiffies, transport->last_time_ecne_reduced +
+					transport->rtt)) {
+			transport->ssthresh = max(transport->cwnd/2,
+						  4*asoc->pathmtu);
+			transport->cwnd = transport->ssthresh;
+			transport->last_time_ecne_reduced = jiffies;
+		}
+		break;
+
+	case SCTP_LOWER_CWND_INACTIVE:
+		/* RFC 2960 Section 7.2.1, sctpimpguide
+		 * When the endpoint does not transmit data on a given
+		 * transport address, the cwnd of the transport address
+		 * should be adjusted to max(cwnd/2, 4*MTU) per RTO.
+		 * NOTE: Although the draft recommends that this check needs
+		 * to be done every RTO interval, we do it every heartbeat
+		 * interval.
+		 */
+		transport->cwnd = max(transport->cwnd/2,
+					 4*asoc->pathmtu);
+		break;
+	}
+
+	transport->partial_bytes_acked = 0;
+	SCTP_DEBUG_PRINTK("%s: transport: %p reason: %d cwnd: "
+			  "%d ssthresh: %d\n", __func__,
+			  transport, reason,
+			  transport->cwnd, transport->ssthresh);
+}
+
+/* Apply Max.Burst limit to the congestion window:
+ * sctpimpguide-05 2.14.2
+ * D) When the time comes for the sender to
+ * transmit new DATA chunks, the protocol parameter Max.Burst MUST
+ * first be applied to limit how many new DATA chunks may be sent.
+ * The limit is applied by adjusting cwnd as follows:
+ * 	if ((flightsize+ Max.Burst * MTU) < cwnd)
+ * 		cwnd = flightsize + Max.Burst * MTU
+ */
+
+void sctp_transport_burst_limited(struct sctp_transport *t)
+{
+	struct sctp_association *asoc = t->asoc;
+	u32 limit;
+
+	if (t->burst_limited)
+		return;
+
+	limit = t->flight_size + (asoc->max_burst * asoc->pathmtu);
+	if (limit >= t->cwnd)
+		return;
+	/* Stash the unclamped cwnd in burst_limited before lowering it.  */
+	t->burst_limited = t->cwnd;
+	t->cwnd = limit;
+}
+
+/* Undo the burst limit: put back the cwnd that was stashed in burst_limited,
+ * if any, and clear the stash.
+ */
+void sctp_transport_burst_reset(struct sctp_transport *t)
+{
+	if (!t->burst_limited)
+		return;
+	t->cwnd = t->burst_limited;
+	t->burst_limited = 0;
+}
+
+/* Next expiry: jiffies + rto + jitter (+ hbinterval unless UNCONFIRMED). */
+unsigned long sctp_transport_timeout(struct sctp_transport *t)
+{
+	unsigned long delay = t->rto + sctp_jitter(t->rto);
+
+	if (t->state != SCTP_UNCONFIRMED)
+		delay += t->hbinterval;
+
+	return jiffies + delay;
+}
+
+/* Reset transport variables to their initial values */
+void sctp_transport_reset(struct sctp_transport *t)
+{
+	struct sctp_association *asoc = t->asoc;
+
+	/* RFC 2960 (bis), Section 5.2.4
+	 * All the congestion control parameters (e.g., cwnd, ssthresh)
+	 * related to this peer MUST be reset to their initial values
+	 * (see Section 6.2.1)
+	 */
+	t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
+	t->burst_limited = 0;
+	t->ssthresh = asoc->peer.i.a_rwnd;
+	t->rto = asoc->rto_initial;
+	t->rtt = 0;
+	t->srtt = 0;
+	t->rttvar = 0;
+
+	/* Reset these additional variables so that we have a clean
+	 * slate.
+	 */
+	t->partial_bytes_acked = 0;
+	t->flight_size = 0;
+	t->error_count = 0;
+	t->rto_pending = 0;
+	t->hb_sent = 0;
+
+	/* Initialize the state information for SFR-CACC */
+	t->cacc.changeover_active = 0;
+	t->cacc.cycling_changeover = 0;
+	t->cacc.next_tsn_at_change = 0;
+	t->cacc.cacc_saw_newack = 0;
+}
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
new file mode 100644
index 00000000..f1e40ceb
--- /dev/null
+++ b/net/sctp/tsnmap.c
@@ -0,0 +1,385 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp tsn mapping array.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    Karl Knutson          <karl@athena.chicago.il.us>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/bitmap.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static void sctp_tsnmap_update(struct sctp_tsnmap *map);
+static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
+				     __u16 len, __u16 *start, __u16 *end);
+static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap);
+
+/* Initialize 'map' to start at initial_tsn; NULL if the bitmap alloc fails. */
+struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len,
+				     __u32 initial_tsn, gfp_t gfp)
+{
+	if (!map->tsn_map) {
+		map->tsn_map = kzalloc(len>>3, gfp);
+		if (map->tsn_map == NULL)
+			return NULL;
+
+		map->len = len;
+	} else {
+		bitmap_zero(map->tsn_map, map->len);
+	}
+
+	/* Keep track of TSNs represented by tsn_map.  */
+	map->base_tsn = initial_tsn;
+	map->cumulative_tsn_ack_point = initial_tsn - 1;
+	map->max_tsn_seen = map->cumulative_tsn_ack_point;
+	map->num_dup_tsns = 0;
+
+	return map;
+}
+
+void sctp_tsnmap_free(struct sctp_tsnmap *map)
+{
+	kfree(map->tsn_map);	/* the tsn_map pointer itself is not cleared */
+	map->len = 0;
+}
+
+/* Test the tracking state of this TSN.
+ * Returns:
+ *   0 if the TSN has not yet been seen
+ *  >0 if the TSN has been seen (duplicate)
+ *  <0 if the TSN is invalid (too large to track)
+ */
+int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn)
+{
+	u32 gap;
+
+	/* Check to see if this is an old TSN */
+	if (TSN_lte(tsn, map->cumulative_tsn_ack_point))
+		return 1;
+
+	/* Verify that we can hold this TSN and that it will not
+	 * overflow our map
+	 */
+	if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
+		return -1;
+
+	/* Calculate the index into the mapping arrays.  */
+	gap = tsn - map->base_tsn;
+
+	/* Check to see if TSN has already been recorded.  */
+	if (gap < map->len && test_bit(gap, map->tsn_map))
+		return 1;
+	else
+		return 0;
+}
+
+
+/* Mark this TSN as seen.  Returns 0, or -ENOMEM if the map cannot grow.  */
+int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn)
+{
+	u16 gap;
+
+	if (TSN_lt(tsn, map->base_tsn))
+		return 0;
+
+	gap = tsn - map->base_tsn;
+
+	if (gap >= map->len && !sctp_tsnmap_grow(map, gap))
+		return -ENOMEM;
+
+	if (!sctp_tsnmap_has_gap(map) && gap == 0) {
+		/* In this case the map has no gaps and the tsn we are
+		 * recording is the next expected tsn.  We don't touch
+		 * the map but simply bump the values.
+		 */
+		map->max_tsn_seen++;
+		map->cumulative_tsn_ack_point++;
+		map->base_tsn++;
+	} else {
+		/* Either we already have a gap, or about to record a gap, so
+		 * have work to do.
+		 *
+		 * Bump the max.
+		 */
+		if (TSN_lt(map->max_tsn_seen, tsn))
+			map->max_tsn_seen = tsn;
+
+		/* Mark the TSN as received.  */
+		set_bit(gap, map->tsn_map);
+
+		/* Go fixup any internal TSN mapping variables including
+		 * cumulative_tsn_ack_point.
+		 */
+		sctp_tsnmap_update(map);
+	}
+
+	return 0;
+}
+
+
+/* Initialize a caller-provided Gap Ack Block iterator for this map.  */
+SCTP_STATIC void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map,
+				       struct sctp_tsnmap_iter *iter)
+{
+	/* Only start looking one past the Cumulative TSN Ack Point.  */
+	iter->start = map->cumulative_tsn_ack_point + 1;
+}
+
+/* Get the next Gap Ack Block.  Returns 1 with *start/*end filled in, or
+ * 0 if there was not another block to get.
+ */
+SCTP_STATIC int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map,
+					 struct sctp_tsnmap_iter *iter,
+					 __u16 *start, __u16 *end)
+{
+	int ended = 0;
+	__u16 start_ = 0, end_ = 0, offset;
+
+	/* If there are no more gap acks possible, get out fast.  */
+	if (TSN_lte(map->max_tsn_seen, iter->start))
+		return 0;
+
+	offset = iter->start - map->base_tsn;
+	sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len,
+				 &start_, &end_);
+
+	/* The Gap Ack Block happens to end at the end of the map. */
+	if (start_ && !end_)
+		end_ = map->len - 1;
+
+	/* If we found a Gap Ack Block, return the start and end and
+	 * bump the iterator forward.
+	 */
+	if (end_) {
+		/* Fix up the start and end based on the
+		 * Cumulative TSN Ack which is always 1 behind base.
+		 */
+		*start = start_ + 1;
+		*end = end_ + 1;
+
+		/* Move the iterator forward.  */
+		iter->start = map->cumulative_tsn_ack_point + *end + 1;
+		ended = 1;
+	}
+
+	return ended;
+}
+
+/* Mark this and any lower TSN as seen.  */
+void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn)
+{
+	u32 gap;
+
+	if (TSN_lt(tsn, map->base_tsn))
+		return;
+	if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
+		return;
+
+	/* Bump the max.  */
+	if (TSN_lt(map->max_tsn_seen, tsn))
+		map->max_tsn_seen = tsn;
+
+	gap = tsn - map->base_tsn + 1;
+
+	map->base_tsn += gap;
+	map->cumulative_tsn_ack_point += gap;
+	if (gap >= map->len) {
+		/* If our gap is larger than the map size, just
+		 * zero out the map.
+		 */
+		bitmap_zero(map->tsn_map, map->len);
+	} else {
+		/* If the gap is smaller than the map size,
+		 * shift the map by 'gap' bits and update further.
+		 */
+		bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len);
+		sctp_tsnmap_update(map);
+	}
+}
+
+/********************************************************************
+ * 2nd Level Abstractions
+ ********************************************************************/
+
+/* This private helper advances base_tsn and the Cumulative TSN Ack Point
+ * past the leading run of set bits and shifts the bitmap down to match.
+ */
+static void sctp_tsnmap_update(struct sctp_tsnmap *map)
+{
+	u16 len;
+	unsigned long zero_bit;
+
+
+	len = map->max_tsn_seen - map->cumulative_tsn_ack_point;
+	zero_bit = find_first_zero_bit(map->tsn_map, len);
+	if (!zero_bit)
+		return;		/* The first 0-bit is bit 0.  nothing to do */
+
+	map->base_tsn += zero_bit;
+	map->cumulative_tsn_ack_point += zero_bit;
+
+	bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len);
+}
+
+/* How many TSNs up to max_tsn_seen are still missing from our peer?
+ */
+__u16 sctp_tsnmap_pending(struct sctp_tsnmap *map)
+{
+	__u32 cum_tsn = map->cumulative_tsn_ack_point;
+	__u32 max_tsn = map->max_tsn_seen;
+	__u32 base_tsn = map->base_tsn;
+	__u16 pending_data;
+	u32 gap, i;
+
+	pending_data = max_tsn - cum_tsn;
+	gap = max_tsn - base_tsn;
+
+	if (gap == 0 || gap >= map->len)
+		goto out;
+
+	for (i = 0; i < gap+1; i++) {
+		if (test_bit(i, map->tsn_map))
+			pending_data--;
+	}
+
+out:
+	return pending_data;
+}
+
+/* This is a private helper for finding Gap Ack Blocks.  It searches a
+ * single array for the start and end of a Gap Ack Block.
+ *
+ * The outputs *start and *end are written only if the beginning or
+ * (respectively) the end of a Gap Ack Block was found.
+ */
+static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
+				     __u16 len, __u16 *start, __u16 *end)
+{
+	int i = off;
+
+	/* Look through the entire array, but break out
+	 * early if we have found the end of the Gap Ack Block.
+	 */
+
+	/* The search is bounded by 'len' bits.  */
+
+	/* Look for the start. */
+	i = find_next_bit(map, len, off);
+	if (i < len)
+		*start = i;
+
+	/* Look for the end; relies on the caller zeroing *start beforehand.  */
+	if (*start) {
+		/* We have found the start, let's find the
+		 * end.  If we find the end, break out.
+		 */
+		i = find_next_zero_bit(map, len, i);
+		if (i < len)
+			*end = i - 1;
+	}
+}
+
+/* Renege that we have seen a TSN: clear its bit when the TSN lies inside
+ * the currently mapped window, otherwise leave the map alone.
+ */
+void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn)
+{
+	u32 bit;
+
+	/* Nothing to do for TSNs below the base or at/past base + len.  */
+	if (TSN_lt(tsn, map->base_tsn) ||
+	    !TSN_lt(tsn, map->base_tsn + map->len))
+		return;
+
+	/* Pretend we never saw the TSN.  */
+	bit = tsn - map->base_tsn;
+	clear_bit(bit, map->tsn_map);
+}
+
+/* Fill gabs[] with up to SCTP_MAX_GABS gap ack blocks; returns the count. */
+__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
+			   struct sctp_gap_ack_block *gabs)
+{
+	struct sctp_tsnmap_iter iter;
+	int ngaps = 0;
+
+	/* Refresh the gap ack information. */
+	if (sctp_tsnmap_has_gap(map)) {
+		__u16 start = 0, end = 0;
+		sctp_tsnmap_iter_init(map, &iter);
+		while (sctp_tsnmap_next_gap_ack(map, &iter,
+						&start,
+						&end)) {
+
+			gabs[ngaps].start = htons(start);
+			gabs[ngaps].end = htons(end);
+			ngaps++;
+			if (ngaps >= SCTP_MAX_GABS)
+				break;
+		}
+	}
+	return ngaps;
+}
+
+/* Grow the map to hold bit 'gap'; returns 1 on success, 0 on failure.  */
+static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 gap)
+{
+	unsigned long *new;
+	unsigned long inc;
+	u16 len;
+
+	if (gap >= SCTP_TSN_MAP_SIZE)
+		return 0;
+
+	inc = ALIGN((gap - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT;
+	len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE);
+	new = kzalloc(len>>3, GFP_ATOMIC);
+	if (!new)
+		return 0;
+
+	/* max_tsn_seen is bit (max - base): copy one more bit than that.  */
+	bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->base_tsn + 1);
+	kfree(map->tsn_map);
+	map->tsn_map = new;
+	map->len = len;
+	return 1;
+}
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
new file mode 100644
index 00000000..8a840178
--- /dev/null
+++ b/net/sctp/ulpevent.c
@@ -0,0 +1,1099 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * These functions manipulate an sctp event.   The struct ulpevent is used
+ * to carry notifications and data to the ULP (sockets).
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Ardelle Fan	    <ardelle.fan@intel.com>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
+				       struct sctp_association *asoc);
+static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
+static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);
+
+
+/* Zero a ULP event, then record its msg_flags and receive-memory length.  */
+SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event,
+				    int msg_flags,
+				    unsigned int len)
+{
+	memset(event, 0, sizeof(struct sctp_ulpevent));
+	event->msg_flags = msg_flags;
+	event->rmem_len = len;
+}
+
+/* Allocate an skb and initialize its ulpevent; NULL if allocation fails.  */
+SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
+						    gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(size, gfp);
+	if (!skb)
+		goto fail;
+
+	event = sctp_skb2event(skb);
+	sctp_ulpevent_init(event, msg_flags, skb->truesize);
+
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* Non-zero if the event carries a notification (MSG_NOTIFICATION is set).  */
+int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event)
+{
+	return MSG_NOTIFICATION == (event->msg_flags & MSG_NOTIFICATION);
+}
+
+/* Take a reference on the association (msg_name may later be read out of
+ * it), add rmem_len to its rmem_alloc and set the skb's owner socket.
+ */
+static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
+					   const struct sctp_association *asoc)
+{
+	struct sk_buff *skb;
+
+	/* Cast away the const, as we are just wanting to
+	 * bump the reference count.
+	 */
+	sctp_association_hold((struct sctp_association *)asoc);
+	skb = sctp_event2skb(event);
+	event->asoc = (struct sctp_association *)asoc;
+	atomic_add(event->rmem_len, &event->asoc->rmem_alloc);
+	sctp_skb_set_owner_r(skb, asoc->base.sk);
+}
+
+/* Subtract rmem_len from rmem_alloc and drop the association reference.  */
+static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
+{
+	struct sctp_association *asoc = event->asoc;
+
+	atomic_sub(event->rmem_len, &asoc->rmem_alloc);
+	sctp_association_put(asoc);
+}
+
+/* Create and initialize an SCTP_ASSOC_CHANGE event.
+ *
+ * 5.3.1.1 SCTP_ASSOC_CHANGE
+ *
+ * Communication notifications inform the ULP that an SCTP association
+ * has either begun or ended. The identifier for a new association is
+ * provided by this notification.
+ *
+ * Note: There is no field checking here.  The 'flags' argument is unused:
+ * sac_flags is always zeroed.  Returns NULL if allocation fails.
+ */
+struct sctp_ulpevent  *sctp_ulpevent_make_assoc_change(
+	const struct sctp_association *asoc,
+	__u16 flags, __u16 state, __u16 error, __u16 outbound,
+	__u16 inbound, struct sctp_chunk *chunk, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_assoc_change *sac;
+	struct sk_buff *skb;
+
+	/* If the lower layer passed in the chunk, it will be
+	 * an ABORT, so we need to include it in the sac_info.
+	 */
+	if (chunk) {
+		/* Copy the chunk data to a new skb and reserve enough
+		 * head room to use as notification.
+		 */
+		skb = skb_copy_expand(chunk->skb,
+				      sizeof(struct sctp_assoc_change), 0, gfp);
+
+		if (!skb)
+			goto fail;
+
+		/* Embed the event fields inside the copied skb.  */
+		event = sctp_skb2event(skb);
+		sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+		/* Include the notification structure */
+		sac = (struct sctp_assoc_change *)
+			skb_push(skb, sizeof(struct sctp_assoc_change));
+
+		/* Trim the buffer to the right length.  */
+		skb_trim(skb, sizeof(struct sctp_assoc_change) +
+			 ntohs(chunk->chunk_hdr->length) -
+			 sizeof(sctp_chunkhdr_t));
+	} else {
+		event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change),
+				  MSG_NOTIFICATION, gfp);
+		if (!event)
+			goto fail;
+
+		skb = sctp_event2skb(event);
+		sac = (struct sctp_assoc_change *) skb_put(skb,
+					sizeof(struct sctp_assoc_change));
+	}
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_type:
+	 * It should be SCTP_ASSOC_CHANGE.
+	 */
+	sac->sac_type = SCTP_ASSOC_CHANGE;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_state: 32 bits (signed integer)
+	 * This field holds one of a number of values that communicate the
+	 * event that happened to the association.
+	 */
+	sac->sac_state = state;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_flags: 16 bits (unsigned integer)
+	 * Currently unused.
+	 */
+	sac->sac_flags = 0;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_length: sizeof (__u32)
+	 * This field is the total length of the notification data, including
+	 * the notification header.
+	 */
+	sac->sac_length = skb->len;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_error:  32 bits (signed integer)
+	 *
+	 * If the state was reached due to a error condition (e.g.
+	 * COMMUNICATION_LOST) any relevant error information is available in
+	 * this field. This corresponds to the protocol error codes defined in
+	 * [SCTP].
+	 */
+	sac->sac_error = error;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_outbound_streams:  16 bits (unsigned integer)
+	 * sac_inbound_streams:  16 bits (unsigned integer)
+	 *
+	 * The maximum number of streams allowed in each direction are
+	 * available in sac_outbound_streams and sac_inbound streams.
+	 */
+	sac->sac_outbound_streams = outbound;
+	sac->sac_inbound_streams = inbound;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.1 SCTP_ASSOC_CHANGE
+	 *
+	 * sac_assoc_id: sizeof (sctp_assoc_t)
+	 *
+	 * The association id field, holds the identifier for the association.
+	 * All notifications for a given association have the same association
+	 * identifier.  For TCP style socket, this field is ignored.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	sac->sac_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* Create and initialize an SCTP_PEER_ADDR_CHANGE event.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+ *
+ * When a destination address on a multi-homed peer encounters a change
+ * an interface details event is sent.  Returns NULL if allocation fails.
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
+	const struct sctp_association *asoc,
+	const struct sockaddr_storage *aaddr,
+	int flags, int state, int error, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_paddr_change  *spc;
+	struct sk_buff *skb;
+
+	event = sctp_ulpevent_new(sizeof(struct sctp_paddr_change),
+				  MSG_NOTIFICATION, gfp);
+	if (!event)
+		goto fail;
+
+	skb = sctp_event2skb(event);
+	spc = (struct sctp_paddr_change *)
+		skb_put(skb, sizeof(struct sctp_paddr_change));
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_type:
+	 *
+	 *    It should be SCTP_PEER_ADDR_CHANGE.
+	 */
+	spc->spc_type = SCTP_PEER_ADDR_CHANGE;
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_length: sizeof (__u32)
+	 *
+	 * This field is the total length of the notification data, including
+	 * the notification header.
+	 */
+	spc->spc_length = sizeof(struct sctp_paddr_change);
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_flags: 16 bits (unsigned integer)
+	 * Currently unused; the 'flags' argument is ignored.
+	 */
+	spc->spc_flags = 0;
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_state:  32 bits (signed integer)
+	 *
+	 * This field holds one of a number of values that communicate the
+	 * event that happened to the address.
+	 */
+	spc->spc_state = state;
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_error:  32 bits (signed integer)
+	 *
+	 * If the state was reached due to any error condition (e.g.
+	 * ADDRESS_UNREACHABLE) any relevant error information is available in
+	 * this field.
+	 */
+	spc->spc_error = error;
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_assoc_id: sizeof (sctp_assoc_t)
+	 *
+	 * The association id field, holds the identifier for the association.
+	 * All notifications for a given association have the same association
+	 * identifier.  For TCP style socket, this field is ignored.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	spc->spc_assoc_id = sctp_assoc2id(asoc);
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE
+	 *
+	 * spc_aaddr: sizeof (struct sockaddr_storage)
+	 *
+	 * The affected address field, holds the remote peer's address that is
+	 * encountering the change of state.
+	 */
+	memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage));
+
+	/* Map ipv4 address into v4-mapped-on-v6 address.  */
+	sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_v4map(
+					sctp_sk(asoc->base.sk),
+					(union sctp_addr *)&spc->spc_aaddr);
+
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* Create and initialize an SCTP_REMOTE_ERROR notification, or return NULL.
+ *
+ * Note: This assumes that the chunk->skb->data already points to the
+ * operation error payload; the cause is pulled off chunk->skb here.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.3 SCTP_REMOTE_ERROR
+ *
+ * A remote peer may send an Operational Error message to its peer.
+ * This message indicates a variety of error conditions on an
+ * association. The entire error TLV as it appears on the wire is
+ * included in a SCTP_REMOTE_ERROR event.  Please refer to the SCTP
+ * specification [SCTP] and any extensions for a list of possible
+ * error formats.  The 'flags' argument is not used.
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
+	const struct sctp_association *asoc, struct sctp_chunk *chunk,
+	__u16 flags, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_remote_error *sre;
+	struct sk_buff *skb;
+	sctp_errhdr_t *ch;
+	__be16 cause;
+	int elen;
+
+	ch = (sctp_errhdr_t *)(chunk->skb->data);
+	cause = ch->cause;
+	elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t);
+
+	/* Pull off the ERROR header.  */
+	skb_pull(chunk->skb, sizeof(sctp_errhdr_t));
+
+	/* Copy the skb to a new skb with room for us to prepend
+	 * notification with.
+	 */
+	skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error),
+			      0, gfp);
+
+	/* Pull off the rest of the cause TLV from the chunk.  */
+	skb_pull(chunk->skb, elen);
+	if (!skb)
+		goto fail;
+
+	/* Embed the event fields inside the copied skb.  */
+	event = sctp_skb2event(skb);
+	sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+	sre = (struct sctp_remote_error *)
+		skb_push(skb, sizeof(struct sctp_remote_error));
+
+	/* Trim the buffer to the right length.  */
+	skb_trim(skb, sizeof(struct sctp_remote_error) + elen);
+
+	/* Zero the header so padding in it cannot expose stale memory.  */
+	memset(sre, 0, sizeof(*sre));
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.3 SCTP_REMOTE_ERROR
+	 *
+	 * sre_type: It should be SCTP_REMOTE_ERROR.
+	 */
+	sre->sre_type = SCTP_REMOTE_ERROR;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.3 SCTP_REMOTE_ERROR
+	 *
+	 * sre_flags: 16 bits (unsigned integer), currently unused.
+	 */
+	sre->sre_flags = 0;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.3 SCTP_REMOTE_ERROR
+	 *
+	 * sre_length: sizeof (__u32)
+	 *
+	 * This field is the total length of the notification data,
+	 * including the notification header.
+	 */
+	sre->sre_length = skb->len;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.3 SCTP_REMOTE_ERROR
+	 *
+	 * sre_error: 16 bits (unsigned integer)
+	 * This value represents one of the Operational Error causes defined in
+	 * the SCTP specification, in network byte order.
+	 */
+	sre->sre_error = cause;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.3 SCTP_REMOTE_ERROR
+	 *
+	 * sre_assoc_id: sizeof (sctp_assoc_t)
+	 *
+	 * The association id field, holds the identifier for the association.
+	 * All notifications for a given association have the same association
+	 * identifier.  For TCP style socket, this field is ignored.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	sre->sre_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* Create and initialize a SCTP_SEND_FAILED notification.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.4 SCTP_SEND_FAILED
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
+	const struct sctp_association *asoc, struct sctp_chunk *chunk,
+	__u16 flags, __u32 error, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_send_failed *ssf;
+	struct sk_buff *skb;
+
+	/* Chunk length from the chunk header; used below to trim the skb. */
+	int len = ntohs(chunk->chunk_hdr->length);
+
+	/* Make skb with more room so we can prepend notification.  */
+	skb = skb_copy_expand(chunk->skb,
+			      sizeof(struct sctp_send_failed), /* headroom */
+			      0,                               /* tailroom */
+			      gfp);
+	if (!skb)
+		goto fail;
+
+	/* Pull off the common chunk header and DATA header.  */
+	skb_pull(skb, sizeof(struct sctp_data_chunk));
+	len -= sizeof(struct sctp_data_chunk);
+
+	/* Embed the event fields inside the copied skb.  */
+	event = sctp_skb2event(skb);
+	sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+	ssf = (struct sctp_send_failed *)
+		skb_push(skb, sizeof(struct sctp_send_failed));
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.4 SCTP_SEND_FAILED
+	 *
+	 * ssf_type:
+	 * It should be SCTP_SEND_FAILED.
+	 */
+	ssf->ssf_type = SCTP_SEND_FAILED;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.4 SCTP_SEND_FAILED
+	 *
+	 * ssf_flags: 16 bits (unsigned integer)
+	 * The flag value will take one of the following values
+	 *
+	 * SCTP_DATA_UNSENT - Indicates that the data was never put on
+	 *                    the wire.
+	 *
+	 * SCTP_DATA_SENT   - Indicates that the data was put on the wire.
+	 *                    Note that this does not necessarily mean that the
+	 *                    data was (or was not) successfully delivered.
+	 */
+	ssf->ssf_flags = flags;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.4 SCTP_SEND_FAILED
+	 *
+	 * ssf_length: sizeof (__u32)
+	 * This field is the total length of the notification data, including
+	 * the notification header.
+	 */
+	ssf->ssf_length = sizeof(struct sctp_send_failed) + len;
+	skb_trim(skb, ssf->ssf_length);
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.4 SCTP_SEND_FAILED
+	 *
+	 * ssf_error: 16 bits (unsigned integer)
+	 * This value represents the reason why the send failed, and if set,
+	 * will be a SCTP protocol error code as defined in [SCTP] section
+	 * 3.3.10.
+	 */
+	ssf->ssf_error = error;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.4 SCTP_SEND_FAILED
+	 *
+	 * ssf_info: sizeof (struct sctp_sndrcvinfo)
+	 * The original send information associated with the undelivered
+	 * message.
+	 */
+	memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo));
+
+	/* Per TSVWG discussion with Randy. Allow the application to
+	 * reassemble a fragmented message.
+	 */
+	ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.4 SCTP_SEND_FAILED
+	 *
+	 * ssf_assoc_id: sizeof (sctp_assoc_t)
+	 * The association id field, ssf_assoc_id, holds the identifier for the
+	 * association.  All notifications for a given association have the
+	 * same association identifier.  For TCP style socket, this field is
+	 * ignored.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	ssf->ssf_assoc_id = sctp_assoc2id(asoc);
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* Create and initialize a SCTP_SHUTDOWN_EVENT notification.
+ *
+ * Socket Extensions for SCTP - draft-01
+ * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
+	const struct sctp_association *asoc,
+	__u16 flags, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_shutdown_event *sse;
+	struct sk_buff *skb;
+
+	event = sctp_ulpevent_new(sizeof(struct sctp_shutdown_event),
+				  MSG_NOTIFICATION, gfp);
+	if (!event)
+		goto fail;
+
+	skb = sctp_event2skb(event);
+	sse = (struct sctp_shutdown_event *)
+		skb_put(skb, sizeof(struct sctp_shutdown_event));
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+	 *
+	 * sse_type
+	 * It should be SCTP_SHUTDOWN_EVENT
+	 */
+	sse->sse_type = SCTP_SHUTDOWN_EVENT;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+	 *
+	 * sse_flags: 16 bits (unsigned integer)
+	 * Currently unused; the 'flags' argument is ignored.
+	 */
+	sse->sse_flags = 0;
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+	 *
+	 * sse_length: sizeof (__u32)
+	 * This field is the total length of the notification data, including
+	 * the notification header.
+	 */
+	sse->sse_length = sizeof(struct sctp_shutdown_event);
+
+	/* Socket Extensions for SCTP
+	 * 5.3.1.5 SCTP_SHUTDOWN_EVENT
+	 *
+	 * sse_assoc_id: sizeof (sctp_assoc_t)
+	 * The association id field, holds the identifier for the association.
+	 * All notifications for a given association have the same association
+	 * identifier.  For TCP style socket, this field is ignored.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	sse->sse_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* Create a SCTP_ADAPTATION_INDICATION event carrying peer.adaptation_ind.
+ *
+ * Socket Extensions for SCTP
+ * 5.3.1.6 SCTP_ADAPTATION_INDICATION
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
+	const struct sctp_association *asoc, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_adaptation_event *sai;
+	struct sk_buff *skb;
+
+	event = sctp_ulpevent_new(sizeof(struct sctp_adaptation_event),
+				  MSG_NOTIFICATION, gfp);
+	if (!event)
+		goto fail;
+
+	skb = sctp_event2skb(event);
+	sai = (struct sctp_adaptation_event *)
+		skb_put(skb, sizeof(struct sctp_adaptation_event));
+
+	sai->sai_type = SCTP_ADAPTATION_INDICATION;
+	sai->sai_flags = 0;
+	sai->sai_length = sizeof(struct sctp_adaptation_event);
+	sai->sai_adaptation_ind = asoc->peer.adaptation_ind;
+	sctp_ulpevent_set_owner(event, asoc);
+	sai->sai_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+
+fail:
+	return NULL;
+}
+
+/* A message has been received.  Package this message as a data event
+ * to pass it to the upper layers.  Go ahead and calculate the sndrcvinfo
+ * even if filtered out later.  Returns NULL on failure.
+ *
+ * Socket Extensions for SCTP
+ * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV)
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
+						struct sctp_chunk *chunk,
+						gfp_t gfp)
+{
+	struct sctp_ulpevent *event = NULL;
+	struct sk_buff *skb;
+	size_t padding, len;
+	int rx_count;
+
+	/*
+	 * check to see if we need to make space for this
+	 * new skb, expand the rcvbuffer if needed, or drop
+	 * the frame
+	 */
+	if (asoc->ep->rcvbuf_policy)
+		rx_count = atomic_read(&asoc->rmem_alloc);
+	else
+		rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
+
+	if (rx_count >= asoc->base.sk->sk_rcvbuf) {
+
+		if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+			goto fail;
+	}
+
+	/* Clone the original skb, sharing the data.  */
+	skb = skb_clone(chunk->skb, gfp);
+	if (!skb)
+		goto fail;
+
+	/* Now that all memory allocations for this chunk succeeded, we
+	 * can mark it as received so the tsn_map is updated correctly.
+	 */
+	if (sctp_tsnmap_mark(&asoc->peer.tsn_map,
+			     ntohl(chunk->subh.data_hdr->tsn)))
+		goto fail_mark;
+
+	/* First calculate the padding, so we don't inadvertently
+	 * pass up the wrong length to the user.
+	 *
+	 * RFC 2960 - Section 3.2  Chunk Field Descriptions
+	 *
+	 * The total length of a chunk(including Type, Length and Value fields)
+	 * MUST be a multiple of 4 bytes.  If the length of the chunk is not a
+	 * multiple of 4 bytes, the sender MUST pad the chunk with all zero
+	 * bytes and this padding is not included in the chunk length field.
+	 * The sender should never pad with more than 3 bytes.  The receiver
+	 * MUST ignore the padding bytes.
+	 */
+	len = ntohs(chunk->chunk_hdr->length);
+	padding = WORD_ROUND(len) - len;
+
+	/* Fixup cloned skb with just this chunks data.  */
+	skb_trim(skb, chunk->chunk_end - padding - skb->data);
+
+	/* Embed the event fields inside the cloned skb.  */
+	event = sctp_skb2event(skb);
+
+	/* Initialize event with flags 0  and correct length
+	 * Since this is a clone of the original skb, only account for
+	 * the data of this chunk as other chunks will be accounted separately.
+	 */
+	sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff));
+
+	sctp_ulpevent_receive_data(event, asoc);
+
+	event->stream = ntohs(chunk->subh.data_hdr->stream);
+	event->ssn = ntohs(chunk->subh.data_hdr->ssn);
+	event->ppid = chunk->subh.data_hdr->ppid;
+	if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
+		event->flags |= SCTP_UNORDERED;
+		event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
+	}
+	event->tsn = ntohl(chunk->subh.data_hdr->tsn);
+	event->msg_flags |= chunk->chunk_hdr->flags;
+	event->iif = sctp_chunk_iif(chunk);
+
+	return event;
+
+fail_mark:
+	kfree_skb(skb);
+fail:
+	return NULL;
+}
+
+/* Create a partial delivery related event; NULL if allocation fails.
+ *
+ * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT
+ *
+ *   When a receiver is engaged in a partial delivery of a
+ *   message this notification will be used to indicate
+ *   various events.
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
+	const struct sctp_association *asoc, __u32 indication,
+	gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_pdapi_event *pd;
+	struct sk_buff *skb;
+
+	event = sctp_ulpevent_new(sizeof(struct sctp_pdapi_event),
+				  MSG_NOTIFICATION, gfp);
+	if (!event)
+		goto fail;
+
+	skb = sctp_event2skb(event);
+	pd = (struct sctp_pdapi_event *)
+		skb_put(skb, sizeof(struct sctp_pdapi_event));
+
+	/* pdapi_type
+	 *   It should be SCTP_PARTIAL_DELIVERY_EVENT
+	 *
+	 * pdapi_flags: 16 bits (unsigned integer)
+	 *   Currently unused.
+	 */
+	pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
+	pd->pdapi_flags = 0;
+
+	/* pdapi_length: 32 bits (unsigned integer)
+	 *
+	 * This field is the total length of the notification data, including
+	 * the notification header.  It will generally be sizeof (struct
+	 * sctp_pdapi_event).
+	 */
+	pd->pdapi_length = sizeof(struct sctp_pdapi_event);
+
+	/*  pdapi_indication: 32 bits (unsigned integer)
+	 *
+	 * This field holds the indication being sent to the application.
+	 */
+	pd->pdapi_indication = indication;
+
+	/*  pdapi_assoc_id: sizeof (sctp_assoc_t)
+	 *
+	 * The association id field, holds the identifier for the association.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	pd->pdapi_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+fail:
+	return NULL;
+}
+
+struct sctp_ulpevent *sctp_ulpevent_make_authkey(
+	const struct sctp_association *asoc, __u16 key_id,
+	__u32 indication, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_authkey_event *ak;
+	struct sk_buff *skb;
+
+	event = sctp_ulpevent_new(sizeof(struct sctp_authkey_event),
+				  MSG_NOTIFICATION, gfp);
+	if (!event)
+		goto fail;
+
+	skb = sctp_event2skb(event);
+	ak = (struct sctp_authkey_event *)
+		skb_put(skb, sizeof(struct sctp_authkey_event));
+
+	ak->auth_type = SCTP_AUTHENTICATION_EVENT;
+	ak->auth_flags = 0;
+	ak->auth_length = sizeof(struct sctp_authkey_event);
+
+	ak->auth_keynumber = key_id;
+	ak->auth_altkeynumber = 0;
+	ak->auth_indication = indication;
+
+	/* Take a reference on the association and charge the event to it,
+	 * then report the association's identifier in auth_assoc_id.
+	 */
+	sctp_ulpevent_set_owner(event, asoc);
+	ak->auth_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+fail:
+	return NULL;
+}
+
+/* Create and initialize an SCTP_SENDER_DRY_EVENT notification.
+ * Socket Extensions for SCTP
+ * 6.3.10. SCTP_SENDER_DRY_EVENT
+ */
+struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
+	const struct sctp_association *asoc, gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sctp_sender_dry_event *sdry;
+	struct sk_buff *skb;
+
+	event = sctp_ulpevent_new(sizeof(struct sctp_sender_dry_event),
+				  MSG_NOTIFICATION, gfp);
+	if (!event)
+		return NULL;
+
+	skb = sctp_event2skb(event);
+	sdry = (struct sctp_sender_dry_event *)
+		skb_put(skb, sizeof(struct sctp_sender_dry_event));
+
+	sdry->sender_dry_type = SCTP_SENDER_DRY_EVENT;
+	sdry->sender_dry_flags = 0;
+	sdry->sender_dry_length = sizeof(struct sctp_sender_dry_event);
+	sctp_ulpevent_set_owner(event, asoc);
+	sdry->sender_dry_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+}
+
+/* Return sn_type from the notification header at the start of the skb
+ * data.  Only meaningful if the event is a notification.
+ */
+__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event)
+{
+	union sctp_notification *notification;
+	struct sk_buff *skb;
+
+	skb = sctp_event2skb(event);
+	notification = (union sctp_notification *) skb->data;
+	return notification->sn_header.sn_type;
+}
+
+/* Build the SCTP_SNDRCV cmsg for a data event and put it into the msghdr.  */
+void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
+				   struct msghdr *msghdr)
+{
+	struct sctp_sndrcvinfo sinfo;
+
+	if (sctp_ulpevent_is_notification(event))
+		return;
+
+	/* Clear padding and unused fields: sinfo is copied to user space.  */
+	memset(&sinfo, 0, sizeof(sinfo));
+
+	/* Sockets API Extensions for SCTP
+	 * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV)
+	 *
+	 * sinfo_stream: 16 bits (unsigned integer)
+	 *
+	 * For recvmsg() the SCTP stack places the message's stream number in
+	 * this value.
+	 */
+	sinfo.sinfo_stream = event->stream;
+	/* sinfo_ssn: 16 bits (unsigned integer)
+	 *
+	 * For recvmsg() this value contains the stream sequence number that
+	 * the remote endpoint placed in the DATA chunk.  For fragmented
+	 * messages this is the same number for all deliveries of the message
+	 * (if more than one recvmsg() is needed to read the message).
+	 */
+	sinfo.sinfo_ssn = event->ssn;
+	/* sinfo_ppid: 32 bits (unsigned integer)
+	 *
+	 * In recvmsg() this value is
+	 * the same information that was passed by the upper layer in the peer
+	 * application.  Please note that byte order issues are NOT accounted
+	 * for and this information is passed opaquely by the SCTP stack from
+	 * one end to the other.
+	 */
+	sinfo.sinfo_ppid = event->ppid;
+	/* sinfo_flags: 16 bits (unsigned integer)
+	 *
+	 * This field may contain any of the following flags and is composed of
+	 * a bitwise OR of these values.
+	 *
+	 * recvmsg() flags:
+	 *
+	 * SCTP_UNORDERED - This flag is present when the message was sent
+	 *                 non-ordered.
+	 */
+	sinfo.sinfo_flags = event->flags;
+	/* sinfo_tsn: 32 bit (unsigned integer)
+	 *
+	 * For the receiving side, this field holds a TSN that was
+	 * assigned to one of the SCTP Data Chunks.
+	 */
+	sinfo.sinfo_tsn = event->tsn;
+	/* sinfo_cumtsn: 32 bit (unsigned integer)
+	 *
+	 * This field will hold the current cumulative TSN as
+	 * known by the underlying SCTP layer.  Note this field is
+	 * ignored when sending and only valid for a receive
+	 * operation when sinfo_flags are set to SCTP_UNORDERED.
+	 */
+	sinfo.sinfo_cumtsn = event->cumtsn;
+	/* sinfo_assoc_id: sizeof (sctp_assoc_t)
+	 *
+	 * The association handle field, sinfo_assoc_id, holds the identifier
+	 * for the association announced in the COMMUNICATION_UP notification.
+	 * All notifications for a given association have the same identifier.
+	 * Ignored for one-to-one style sockets.
+	 */
+	sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc);
+
+	/* context value that is set via SCTP_CONTEXT socket option. */
+	sinfo.sinfo_context = event->asoc->default_rcv_context;
+
+	put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV,
+		 sizeof(struct sctp_sndrcvinfo), (void *)&sinfo);
+}
+
+/* Do accounting for bytes received and hold a reference to the association
+ * for each skb.
+ */
+static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
+				       struct sctp_association *asoc)
+{
+	struct sk_buff *skb, *frag;
+
+	skb = sctp_event2skb(event);
+	/* Set the owner and charge rwnd for the linear part of this skb.  */
+	sctp_ulpevent_set_owner(event, asoc);
+	sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));
+
+	if (!skb->data_len)
+		return;
+
+	/* Note:  Not clearing the entire event struct as this is just a
+	 * fragment of the real event.  However, we still need to do rwnd
+	 * accounting.
+	 * In general, the skb passed from IP can have only 1 level of
+	 * fragments. But we allow multiple levels of fragments.
+	 */
+	skb_walk_frags(skb, frag)
+		sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc);
+}
+
+/* Do accounting for bytes just read by user (rwnd grows by skb->len) and
+ * release the references to the association held by the skb and its frags.
+ */
+static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
+{
+	struct sk_buff *skb, *frag;
+	unsigned int	len;
+
+	/* Current stack structures assume that the rcv buffer is
+	 * per socket.   For UDP style sockets this is not true as
+	 * multiple associations may be on a single UDP-style socket.
+	 * Use the local private area of the skb to track the owning
+	 * association.
+	 */
+
+	skb = sctp_event2skb(event);
+	len = skb->len;
+
+	if (!skb->data_len)
+		goto done;
+
+	/* Don't forget the fragments. */
+	skb_walk_frags(skb, frag) {
+		/* NOTE:  skb_shinfos are recursive. Although IP returns
+		 * skb's with only 1 level of fragments, SCTP reassembly can
+		 * increase the levels.
+		 */
+		sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
+	}
+
+done:
+	sctp_assoc_rwnd_increase(event->asoc, len);
+	sctp_ulpevent_release_owner(event);
+}
+
+static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
+{
+	struct sk_buff *skb, *frag;
+
+	skb = sctp_event2skb(event);
+
+	if (!skb->data_len)
+		goto done;
+
+	/* Release nested fragments first; no rwnd update is done for frags.  */
+	skb_walk_frags(skb, frag) {
+		/* NOTE:  skb_shinfos are recursive. Although IP returns
+		 * skb's with only 1 level of fragments, SCTP reassembly can
+		 * increase the levels.
+		 */
+		sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
+	}
+
+done:
+	sctp_ulpevent_release_owner(event);
+}
+
+/* Free a ulpevent that has an owner.  It includes releasing the reference
+ * to the owner, updating the rwnd in case of a DATA event (any event that
+ * is not a notification) and freeing the skb.
+ */
+void sctp_ulpevent_free(struct sctp_ulpevent *event)
+{
+	if (sctp_ulpevent_is_notification(event))
+		sctp_ulpevent_release_owner(event);
+	else
+		sctp_ulpevent_release_data(event);
+
+	kfree_skb(sctp_event2skb(event));
+}
+
+/* Free every ulpevent queued on 'list'; return the unread data bytes.  */
+unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+	unsigned int data_unread = 0;
+
+	while ((skb = skb_dequeue(list)) != NULL) {
+		struct sctp_ulpevent *event = sctp_skb2event(skb);
+
+		if (!sctp_ulpevent_is_notification(event))
+			data_unread += skb->len;
+
+		sctp_ulpevent_free(event);
+	}
+
+	return data_unread;
+}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
new file mode 100644
index 00000000..f2d1de7f
--- /dev/null
+++ b/net/sctp/ulpqueue.c
@@ -0,0 +1,1088 @@
+/* SCTP kernel implementation
+ * (C) Copyright IBM Corp. 2001, 2004
+ * Copyright (c) 1999-2000 Cisco, Inc.
+ * Copyright (c) 1999-2001 Motorola, Inc.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Nokia, Inc.
+ * Copyright (c) 2001 La Monte H.P. Yarroll
+ *
+ * This abstraction carries sctp events to the ULP (sockets).
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *
+ * Or submit a bug report through the following website:
+ *    http://www.sf.net/projects/lksctp
+ *
+ * Written or modified by:
+ *    Jon Grimm             <jgrimm@us.ibm.com>
+ *    La Monte H.P. Yarroll <piggy@acm.org>
+ *    Sridhar Samudrala     <sri@us.ibm.com>
+ *
+ * Any bugs reported given to us we will try to fix... any fixes shared will
+ * be incorporated into the next SCTP release.
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+
+/* Forward declarations for internal helpers.  */
+static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
+					      struct sctp_ulpevent *);
+static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *,
+					      struct sctp_ulpevent *);
+static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq);
+
+/* 1st Level Abstractions */
+
+/* Initialize a ULP queue from a block of memory.  */
+struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq,
+				 struct sctp_association *asoc)
+{
+	/* Start from an all-zero queue, then fill in the fields. */
+	memset(ulpq, 0, sizeof(*ulpq));
+
+	ulpq->malloced = 0;
+	ulpq->pd_mode  = 0;
+	ulpq->asoc = asoc;
+	skb_queue_head_init(&ulpq->lobby);
+	skb_queue_head_init(&ulpq->reasm);
+	return ulpq;
+}
+
+
+/* Flush the reassembly and ordering queues.  */
+void sctp_ulpq_flush(struct sctp_ulpq *ulpq)
+{
+	struct sk_buff *skb;
+
+	/* Free every event still waiting on the ordering queue
+	 * (the lobby) ...
+	 */
+	while ((skb = __skb_dequeue(&ulpq->lobby)) != NULL)
+		sctp_ulpevent_free(sctp_skb2event(skb));
+
+	/* ... and every fragment still waiting on the reassembly
+	 * queue.
+	 */
+	while ((skb = __skb_dequeue(&ulpq->reasm)) != NULL)
+		sctp_ulpevent_free(sctp_skb2event(skb));
+}
+
+/* Dispose of a ulpqueue: flush it, then kfree() it if 'malloced' is set.  */
+void sctp_ulpq_free(struct sctp_ulpq *ulpq)
+{
+	sctp_ulpq_flush(ulpq);
+	if (ulpq->malloced)
+		kfree(ulpq);
+}
+
+/* Process an incoming DATA chunk.  */
+int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+			gfp_t gfp)
+{
+	struct sctp_ulpevent *event;
+	struct sk_buff_head temp;
+
+	/* Wrap the incoming chunk in a receive event. */
+	event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
+	if (!event)
+		return -ENOMEM;
+
+	/* Reassemble; NULL means nothing can be handed up yet. */
+	event = sctp_ulpq_reasm(ulpq, event);
+	if (!event)
+		return 0;
+
+	/* Complete messages (MSG_EOR) are ordered; the skb is first put
+	 * on a temporary list on which further skbs can be collected.
+	 */
+	if (event->msg_flags & MSG_EOR) {
+		skb_queue_head_init(&temp);
+		__skb_queue_tail(&temp, sctp_event2skb(event));
+		event = sctp_ulpq_order(ulpq, event);
+		if (!event)
+			return 0;
+	}
+
+	/* If a 'temp' list was built, 'event' is its very first skb. */
+	sctp_ulpq_tail_event(ulpq, event);
+	return 0;
+}
+
+/* Drop one partial delivery reference on the socket.  When it was the
+ * last one, a non-empty pd_lobby is moved whole to the receive queue and
+ * 1 is returned.  Otherwise only 'asoc's events are moved; returns 0.
+ */
+int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+
+	if (atomic_dec_and_test(&sp->pd_mode)) {
+		/* This means there are no other associations in PD, so
+		 * we can go ahead and clear out the lobby in one shot.
+		 */
+		if (!skb_queue_empty(&sp->pd_lobby)) {
+			struct list_head *list;
+			sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue);
+			list = (struct list_head *)&sctp_sk(sk)->pd_lobby;
+			INIT_LIST_HEAD(list);
+			return 1;
+		}
+	} else {
+		/* There are other associations in PD, so we only need to
+		 * pull stuff out of the lobby that belongs to the
+		 * association that is exiting PD (all of its notifications
+		 * are posted here).  Nothing is moved when 'asoc' is NULL.
+		 */
+		if (!skb_queue_empty(&sp->pd_lobby) && asoc) {
+			struct sk_buff *skb, *tmp;
+			struct sctp_ulpevent *event;
+
+			sctp_skb_for_each(skb, &sp->pd_lobby, tmp) {
+				event = sctp_skb2event(skb);
+				if (event->asoc == asoc) {
+					__skb_unlink(skb, &sp->pd_lobby);
+					__skb_queue_tail(&sk->sk_receive_queue,
+							 skb);
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Set the pd_mode on the socket and ulpq */
+static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq)
+{
+	struct sock *sk = ulpq->asoc->base.sk;
+
+	ulpq->pd_mode = 1;
+	atomic_inc(&sctp_sk(sk)->pd_mode);
+}
+
+/* Clear ulpq pd_mode, drain the reasm queue, then clear PD on the socket. */
+static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
+{
+	ulpq->pd_mode = 0;
+	sctp_ulpq_reasm_drain(ulpq);
+	return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc);
+}
+
+/* Queue 'event' (and any skbs listed behind it; its skb must be the first
+ * member of such a list) to the socket.  Returns 1 if queued, 0 if freed.
+ */
+int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
+{
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sk_buff_head *queue, *skb_list;
+	struct sk_buff *skb = sctp_event2skb(event);
+	int clear_pd = 0;
+
+	skb_list = (struct sk_buff_head *) skb->prev;
+
+	/* If the socket is just going to throw this away, do not
+	 * even try to deliver it.
+	 */
+	if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
+		goto out_free;
+
+	/* Check if the user wishes to receive this event.  */
+	if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
+		goto out_free;
+
+	/* If we are in partial delivery mode, post to the lobby until
+	 * partial delivery is cleared, unless, of course _this_ is
+	 * the association that caused the partial delivery.
+	 */
+
+	if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) {
+		queue = &sk->sk_receive_queue;
+	} else {
+		if (ulpq->pd_mode) {
+			/* If the association is in partial delivery, we
+			 * need to finish delivering the partially processed
+			 * packet before passing any other data.  This is
+			 * because we don't truly support stream interleaving.
+			 */
+			if ((event->msg_flags & MSG_NOTIFICATION) ||
+			    (SCTP_DATA_NOT_FRAG ==
+				    (event->msg_flags & SCTP_DATA_FRAG_MASK)))
+				queue = &sctp_sk(sk)->pd_lobby;
+			else {
+				clear_pd = event->msg_flags & MSG_EOR;
+				queue = &sk->sk_receive_queue;
+			}
+		} else {
+			/*
+			 * If fragment interleave is enabled, we
+			 * can queue this to the receive queue instead
+			 * of the lobby.
+			 */
+			if (sctp_sk(sk)->frag_interleave)
+				queue = &sk->sk_receive_queue;
+			else
+				queue = &sctp_sk(sk)->pd_lobby;
+		}
+	}
+
+	/* If we are harvesting multiple skbs they will be
+	 * collected on a list.
+	 */
+	if (skb_list)
+		sctp_skb_list_tail(skb_list, queue);
+	else
+		__skb_queue_tail(queue, skb);
+
+	/* Did we just complete partial delivery and need to get
+	 * rolling again?  Move pending data to the receive
+	 * queue.
+	 */
+	if (clear_pd)
+		sctp_ulpq_clear_pd(ulpq);
+
+	if (queue == &sk->sk_receive_queue)
+		sk->sk_data_ready(sk, 0);
+	return 1;
+
+out_free:
+	if (skb_list)
+		sctp_queue_purge_ulpevents(skb_list);
+	else
+		sctp_ulpevent_free(event);
+
+	return 0;
+}
+
+/* 2nd Level Abstractions */
+
+/* Helper function to store chunks that need to be reassembled.  */
+static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
+					 struct sctp_ulpevent *event)
+{
+	struct sk_buff_head *reasm = &ulpq->reasm;
+	struct sk_buff *new_skb = sctp_event2skb(event);
+	struct sk_buff *cur;
+	__u32 tsn = event->tsn;
+
+	/* The reassembly queue is kept sorted by TSN; the new fragment
+	 * is linked in at the position its TSN calls for.  Nothing is
+	 * returned: the skb is always queued.
+	 */
+
+	/* Short circuit: with an empty queue, or a tail whose TSN is
+	 * lower than the new one, the fragment simply goes at the
+	 * end.
+	 */
+	cur = skb_peek_tail(reasm);
+	if (!cur || TSN_lt(sctp_skb2event(cur)->tsn, tsn)) {
+		__skb_queue_tail(reasm, new_skb);
+		return;
+	}
+
+	/* Find the right place in this list.  Walk from the front
+	 * and stop at the first queued fragment whose TSN is greater
+	 * than the new one.  If there is none, the walk stops on the
+	 * queue head itself.
+	 */
+	skb_queue_walk(reasm, cur) {
+		if (TSN_lt(tsn, sctp_skb2event(cur)->tsn))
+			break;
+	}
+
+	/* Insert before the position found. */
+	__skb_queue_before(reasm, cur, new_skb);
+
+}
+
+/* Helper function to return an event corresponding to the reassembled
+ * datagram.  Given the first and last skbs of a run stored on the
+ * reassembly 'queue', chain the remaining skbs onto the first skb's
+ * frag_list and unlink the whole run from the queue.  The skbs may be
+ * non-linear if the SCTP payload was fragmented on the way and IP had to
+ * reassemble them.  Returns NULL if a cloned first skb cannot be copied.
+ */
+static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag)
+{
+	struct sk_buff *pos;
+	struct sk_buff *new = NULL;
+	struct sctp_ulpevent *event;
+	struct sk_buff *pnext, *last;
+	struct sk_buff *list = skb_shinfo(f_frag)->frag_list;
+
+	/* Store the pointer to the 2nd skb (NULL for a single-skb run). */
+	if (f_frag == l_frag)
+		pos = NULL;
+	else
+		pos = f_frag->next;
+
+	/* Get the last skb in the f_frag's frag_list if present. */
+	for (last = list; list; last = list, list = list->next);
+
+	/* Add the list of remaining fragments to the first fragment's
+	 * frag_list.
+	 */
+	if (last)
+		last->next = pos;
+	else {
+		if (skb_cloned(f_frag)) {
+			/* This is a cloned skb, we can't just modify
+			 * the frag_list.  We need a new skb to do that.
+			 * Instead of calling skb_unshare(), we'll do it
+			 * ourselves since we need to delay the free.
+			 */
+			new = skb_copy(f_frag, GFP_ATOMIC);
+			if (!new)
+				return NULL;	/* try again later */
+
+			sctp_skb_set_owner_r(new, f_frag->sk);
+
+			skb_shinfo(new)->frag_list = pos;
+		} else
+			skb_shinfo(f_frag)->frag_list = pos;
+	}
+
+	/* Remove the first fragment from the reassembly queue.  */
+	__skb_unlink(f_frag, queue);
+
+	/* If we did unshare, then free the old skb and re-assign. */
+	if (new) {
+		kfree_skb(f_frag);
+		f_frag = new;
+	}
+
+	while (pos) {
+
+		pnext = pos->next;
+
+		/* Update the len and data_len fields of the first fragment. */
+		f_frag->len += pos->len;
+		f_frag->data_len += pos->len;
+
+		/* Remove the fragment from the reassembly queue.  */
+		__skb_unlink(pos, queue);
+
+		/* Break if we have reached the last fragment.  */
+		if (pos == l_frag)
+			break;
+		pos->next = pnext;
+		pos = pnext;
+	}
+
+	event = sctp_skb2event(f_frag);
+	SCTP_INC_STATS(SCTP_MIB_REASMUSRMSGS);
+
+	return event;
+}
+
+
+/* Return the first complete message on the reasm queue (MSG_EOR set), or
+ * else possibly the head fragments as a partial delivery; NULL if neither.
+ */
+static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq)
+{
+	struct sk_buff *pos;
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *first_frag = NULL;
+	__u32 ctsn, next_tsn;
+	struct sctp_ulpevent *retval = NULL;
+	struct sk_buff *pd_first = NULL;
+	struct sk_buff *pd_last = NULL;
+	size_t pd_len = 0;
+	struct sctp_association *asoc;
+	u32 pd_point;
+
+	/* Initialized to 0 just to avoid compiler warning message.  Will
+	 * never be used with this value. It is referenced only after it
+	 * is set when we find the first fragment of a message.
+	 */
+	next_tsn = 0;
+
+	/* The chunks are held in the reasm queue sorted by TSN.
+	 * Walk through the queue sequentially and look for a sequence of
+	 * fragmented chunks that complete a datagram.
+	 * 'first_frag' and next_tsn are reset when we find a chunk which
+	 * is the first fragment of a datagram. Once these 2 fields are set
+	 * we expect to find the remaining middle fragments and the last
+	 * fragment in order. If not, first_frag is reset to NULL and we
+	 * start the next pass when we find another first fragment.
+	 *
+	 * There is a potential to do partial delivery if the user sets
+	 * the SCTP_PARTIAL_DELIVERY_POINT option. Let's count some things
+	 * here to see if we can do PD.
+	 */
+	skb_queue_walk(&ulpq->reasm, pos) {
+		cevent = sctp_skb2event(pos);
+		ctsn = cevent->tsn;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			/* If this "FIRST_FRAG" is the first
+			 * element in the queue, then count it towards
+			 * possible PD.
+			 */
+			if (pos == ulpq->reasm.next) {
+			    pd_first = pos;
+			    pd_last = pos;
+			    pd_len = pos->len;
+			} else {
+			    pd_first = NULL;
+			    pd_last = NULL;
+			    pd_len = 0;
+			}
+
+			first_frag = pos;
+			next_tsn = ctsn + 1;
+			break;
+
+		case SCTP_DATA_MIDDLE_FRAG:
+			if ((first_frag) && (ctsn == next_tsn)) {
+				next_tsn++;
+				if (pd_first) {
+				    pd_last = pos;
+				    pd_len += pos->len;
+				}
+			} else
+				first_frag = NULL;
+			break;
+
+		case SCTP_DATA_LAST_FRAG:
+			if (first_frag && (ctsn == next_tsn))
+				goto found;
+			else
+				first_frag = NULL;
+			break;
+		}
+	}
+
+	asoc = ulpq->asoc;
+	if (pd_first) {
+		/* Make sure we can enter partial delivery.
+		 * We can trigger partial delivery only if fragment
+		 * interleave is set, or the socket is not already
+		 * in partial delivery.
+		 */
+		if (!sctp_sk(asoc->base.sk)->frag_interleave &&
+		    atomic_read(&sctp_sk(asoc->base.sk)->pd_mode))
+			goto done;
+
+		cevent = sctp_skb2event(pd_first);
+		pd_point = sctp_sk(asoc->base.sk)->pd_point;
+		if (pd_point && pd_point <= pd_len) {
+			retval = sctp_make_reassembled_event(&ulpq->reasm,
+							     pd_first,
+							     pd_last);
+			if (retval)
+				sctp_ulpq_set_pd(ulpq);
+		}
+	}
+done:
+	return retval;
+found:
+	retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos);
+	if (retval)
+		retval->msg_flags |= MSG_EOR;
+	goto done;
+}
+
+/* Retrieve the next set of fragments of a partial message. */
+static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq)
+{
+	struct sk_buff *pos, *last_frag, *first_frag;
+	struct sctp_ulpevent *cevent;
+	__u32 ctsn, next_tsn;
+	int is_last = 0;
+	struct sctp_ulpevent *retval;
+
+	/* The chunks are held in the reasm queue sorted by TSN.  Take
+	 * the run of consecutive MIDDLE/LAST fragments at its head;
+	 * returns NULL when the head cannot continue the message.
+	 */
+	if (skb_queue_empty(&ulpq->reasm))
+		return NULL;
+
+	last_frag = first_frag = NULL;
+	next_tsn = 0;
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		cevent = sctp_skb2event(pos);
+		ctsn = cevent->tsn;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			/* A new message starts: deliver the run, if any. */
+			if (!first_frag)
+				return NULL;
+			goto done;
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag) {
+				first_frag = pos;
+				next_tsn = ctsn + 1;
+				last_frag = pos;
+			} else if (next_tsn == ctsn) {
+				next_tsn++;
+				last_frag = pos;
+			} else
+				goto done;
+			break;
+		case SCTP_DATA_LAST_FRAG:
+			if (!first_frag)
+				first_frag = pos;
+			else if (ctsn != next_tsn)
+				goto done;
+			last_frag = pos;
+			is_last = 1;
+			goto done;
+		default:
+			return NULL;
+		}
+	}
+
+	/* Reassemble first_frag..last_frag; MSG_EOR marks the end. */
+done:
+	retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
+	if (retval && is_last)
+		retval->msg_flags |= MSG_EOR;
+	return retval;
+}
+
+
+/* Helper function to reassemble chunks.  Hold chunks on the reasm queue that
+ * need reassembling.
+ */
+static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
+						struct sctp_ulpevent *event)
+{
+	__u32 ctsnap;
+
+	/* Check if this is part of a fragmented message.  An
+	 * unfragmented chunk is complete as it stands.
+	 */
+	if ((event->msg_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_NOT_FRAG) {
+		event->msg_flags |= MSG_EOR;
+		return event;
+	}
+
+	/* Hold the fragment on the reasm queue. */
+	sctp_ulpq_store_reasm(ulpq, event);
+	/* Outside partial delivery, go by the whole reasm queue. */
+	if (!ulpq->pd_mode)
+		return sctp_ulpq_retrieve_reassembled(ulpq);
+
+	/* Do not even bother unless this is the next tsn to
+	 * be delivered.
+	 */
+	ctsnap = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map);
+	if (!TSN_lte(event->tsn, ctsnap))
+		return NULL;
+
+	return sctp_ulpq_retrieve_partial(ulpq);
+}
+
+/* Retrieve the first part (sequential fragments) for partial delivery.  */
+static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq)
+{
+	struct sk_buff *pos, *last_frag, *first_frag;
+	struct sctp_ulpevent *cevent;
+	__u32 ctsn, next_tsn;
+
+	/* The chunks are held in the reasm queue sorted by TSN.
+	 * Walk through the queue sequentially and look for a sequence of
+	 * fragmented chunks that start a datagram.
+	 */
+	if (skb_queue_empty(&ulpq->reasm))
+		return NULL;
+
+	last_frag = first_frag = NULL;
+	next_tsn = 0;
+
+	skb_queue_walk(&ulpq->reasm, pos) {
+		cevent = sctp_skb2event(pos);
+		ctsn = cevent->tsn;
+
+		switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
+		case SCTP_DATA_FIRST_FRAG:
+			if (!first_frag) {
+				first_frag = pos;
+				next_tsn = ctsn + 1;
+				last_frag = pos;
+			} else
+				goto done;
+			break;
+
+		case SCTP_DATA_MIDDLE_FRAG:
+			if (!first_frag)
+				return NULL;
+			if (ctsn == next_tsn) {
+				next_tsn++;
+				last_frag = pos;
+			} else
+				goto done;
+			break;
+
+		case SCTP_DATA_LAST_FRAG:
+			/* The LAST fragment itself is left for a later pass. */
+			if (!first_frag)
+				return NULL;
+			goto done;
+		default:
+			return NULL;
+		}
+	}
+
+	/* Reassemble the run first_frag..last_frag gathered above. */
+done:
+	return sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
+}
+
+/*
+ * Flush out stale fragments from the reassembly queue when processing
+ * a Forward TSN.
+ *
+ * RFC 3758, Section 3.6
+ *
+ * After receiving and processing a FORWARD TSN, the data receiver MUST
+ * take cautions in updating its re-assembly queue.  The receiver MUST
+ * remove any partially reassembled message, which is still missing one
+ * or more TSNs earlier than or equal to the new cumulative TSN point.
+ * In the event that the receiver has invoked the partial delivery API,
+ * a notification SHOULD also be generated to inform the upper layer API
+ * that the message being partially delivered will NOT be completed.
+ */
+void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn)
+{
+	struct sk_buff_head *reasm = &ulpq->reasm;
+	struct sctp_ulpevent *event;
+	struct sk_buff *pos, *tmp;
+
+	if (skb_queue_empty(reasm))
+		return;
+
+	/* Since the entire message must be abandoned by the sender
+	 * (item A3 in Section 3.5, RFC 3758), we can free all
+	 * fragments on the list that are less than or equal to
+	 * fwd_tsn.  The walk starts at the front of the queue and
+	 * stops at the first fragment beyond that point.
+	 */
+	skb_queue_walk_safe(reasm, pos, tmp) {
+		event = sctp_skb2event(pos);
+		/* Nothing at or after this fragment is freed. */
+		if (!TSN_lte(event->tsn, fwd_tsn))
+			break;
+
+		__skb_unlink(pos, reasm);
+		sctp_ulpevent_free(event);
+	}
+}
+
+/*
+ * Drain the reassembly queue.  If we just cleared partial delivery, it
+ * is possible that the reassembly queue will contain complete
+ * messages.  Retrieve any such messages and give them to the user.
+ */
+static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq)
+{
+	struct sctp_ulpevent *event;
+	struct sk_buff_head temp;
+
+	if (skb_queue_empty(&ulpq->reasm))
+		return;
+
+	for (;;) {
+		event = sctp_ulpq_retrieve_reassembled(ulpq);
+		if (!event)
+			break;
+		/* Do ordering if needed.  */
+		if (event->msg_flags & MSG_EOR) {
+			skb_queue_head_init(&temp);
+			__skb_queue_tail(&temp, sctp_event2skb(event));
+			event = sctp_ulpq_order(ulpq, event);
+			if (!event)
+				continue;
+		}
+		/* Send event to the ULP.  */
+		sctp_ulpq_tail_event(ulpq, event);
+	}
+}
+
+
+/* Helper function to gather skbs that have possibly become
+ * ordered by an incoming chunk.
+ */
+static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
+					      struct sctp_ulpevent *event)
+{
+	struct sctp_stream *in = &ulpq->asoc->ssnmap->in;
+	struct sk_buff_head *event_list;
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos, *tmp;
+	__u16 sid = event->stream;
+
+	/* The skb of 'event' is the first member of the list being
+	 * collected, so its 'prev' pointer is used as that list's head.
+	 */
+	event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
+
+	/* We are holding the chunks by stream, by SSN.  */
+	sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
+		cevent = (struct sctp_ulpevent *) pos->cb;
+
+		/* Have we not gone far enough?  */
+		if (cevent->stream < sid)
+			continue;
+
+		/* Have we gone too far?  */
+		if (cevent->stream > sid)
+			break;
+
+		/* Stop at the first SSN that is not the next expected
+		 * one for this stream.
+		 */
+		if (cevent->ssn != sctp_ssn_peek(in, sid))
+			break;
+
+		/* Found it, so mark in the ssnmap. */
+		sctp_ssn_next(in, sid);
+
+		/* Move the skb from the lobby to the event's list.  */
+		__skb_unlink(pos, &ulpq->lobby);
+		__skb_queue_tail(event_list, pos);
+	}
+}
+
+/* Helper function to store chunks needing ordering.  */
+static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
+					   struct sctp_ulpevent *event)
+{
+	struct sk_buff_head *lobby = &ulpq->lobby;
+	struct sk_buff *new_skb = sctp_event2skb(event);
+	struct sctp_ulpevent *cevent;
+	struct sk_buff *pos;
+	__u16 sid, ssn;
+
+	/* An empty lobby needs no searching. */
+	pos = skb_peek_tail(lobby);
+	if (!pos) {
+		__skb_queue_tail(lobby, new_skb);
+		return;
+	}
+
+	sid = event->stream;
+	ssn = event->ssn;
+
+	/* Short circuit: append when the event sorts after the current
+	 * tail, i.e. it has a higher stream ID, or the same stream ID
+	 * and a later SSN.
+	 */
+	cevent = (struct sctp_ulpevent *) pos->cb;
+	if (sid > cevent->stream ||
+	    (sid == cevent->stream && SSN_lt(cevent->ssn, ssn))) {
+		__skb_queue_tail(lobby, new_skb);
+		return;
+	}
+
+	/* Find the right place in this list.  We store them by
+	 * stream ID and then by SSN, so stop at the first entry
+	 * that belongs to a higher stream, or to the same stream
+	 * with a later SSN.
+	 */
+	skb_queue_walk(lobby, pos) {
+		cevent = (struct sctp_ulpevent *) pos->cb;
+
+		if (cevent->stream > sid)
+			break;
+		if (cevent->stream == sid && SSN_lt(ssn, cevent->ssn))
+			break;
+	}
+
+
+	/* Insert before pos. */
+	__skb_queue_before(lobby, pos, new_skb);
+}
+
+static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
+					     struct sctp_ulpevent *event)
+{
+	struct sctp_stream *in;
+	__u16 sid;
+
+	/* Unordered messages are passed straight through.  */
+	if (event->msg_flags & SCTP_DATA_UNORDERED)
+		return event;
+
+	/* Note: The stream ID must be verified before this routine.  */
+	in  = &ulpq->asoc->ssnmap->in;
+	sid = event->stream;
+
+	if (event->ssn == sctp_ssn_peek(in, sid)) {
+		/* This is the expected SSN for the stream: mark that
+		 * the next chunk has been found ...
+		 */
+		sctp_ssn_next(in, sid);
+
+		/* ... and go find any other chunks that were waiting
+		 * for ordering behind it.
+		 */
+		sctp_ulpq_retrieve_ordered(ulpq, event);
+		return event;
+	}
+
+	/* We've received something out of order, so find where it
+	 * needs to be placed.  We order by stream and then by SSN.
+	 * NULL tells the caller that nothing is deliverable.
+	 */
+	sctp_ulpq_store_ordered(ulpq, event);
+	return NULL;
+}
+
+/* Helper function to gather skbs that have possibly become
+ * ordered by forward tsn skipping their dependencies.
+ */
+static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
+{
+	struct sk_buff *pos, *tmp;
+	struct sctp_ulpevent *cevent;
+	struct sctp_ulpevent *event;
+	struct sctp_stream *in;
+	struct sk_buff_head temp;
+	struct sk_buff_head *lobby = &ulpq->lobby;
+	__u16 csid, cssn;
+
+	in  = &ulpq->asoc->ssnmap->in;
+
+	/* We are holding the chunks by stream, by SSN.  */
+	skb_queue_head_init(&temp);
+	event = NULL;
+	sctp_skb_for_each(pos, lobby, tmp) {
+		cevent = (struct sctp_ulpevent *) pos->cb;
+		csid = cevent->stream;
+		cssn = cevent->ssn;
+
+		/* Have we gone too far?  */
+		if (csid > sid)
+			break;
+
+		/* Have we not gone far enough?  */
+		if (csid < sid)
+			continue;
+
+		/* See if this ssn has been marked by skipping. */
+		if (!SSN_lt(cssn, sctp_ssn_peek(in, csid)))
+			break;
+
+		__skb_unlink(pos, lobby);
+		if (!event)
+			/* Remember the event of the first reaped skb.  */
+			event = sctp_skb2event(pos);
+
+		/* Collect all gathered skbs on the temporary list.  */
+		__skb_queue_tail(&temp, pos);
+	}
+
+	/* If we didn't reap any data, see if the next expected SSN
+	 * is next on the queue and if so, use that.
+	 */
+	if (event == NULL && pos != (struct sk_buff *)lobby) {
+		cevent = (struct sctp_ulpevent *) pos->cb;
+		csid = cevent->stream;
+		cssn = cevent->ssn;
+
+		if (csid == sid && cssn == sctp_ssn_peek(in, csid)) {
+			sctp_ssn_next(in, csid);
+			__skb_unlink(pos, lobby);
+			__skb_queue_tail(&temp, pos);
+			event = sctp_skb2event(pos);
+		}
+	}
+
+	/* Send event to the ULP.  'event' is the sctp_ulpevent for
+	 * very first SKB on the 'temp' list.
+	 */
+	if (event) {
+		/* See if we have more ordered data that we can deliver. */
+		sctp_ulpq_retrieve_ordered(ulpq, event);
+		sctp_ulpq_tail_event(ulpq, event);
+	}
+}
+
+/* Skip over an SSN. This is used during the processing of a
+ * Forward TSN chunk to skip over the abandoned ordered data.
+ */
+void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
+{
+	struct sctp_stream *in;
+
+	/* Note: The stream ID must be verified before this routine.  */
+	in = &ulpq->asoc->ssnmap->in;
+
+	/* An SSN older than the next expected one is ignored. */
+	if (!SSN_lt(ssn, sctp_ssn_peek(in, sid))) {
+		/* Stop expecting this SSN or any lower one. */
+		sctp_ssn_skip(in, sid, ssn);
+
+		/* Go find any other chunks that were waiting for
+		 * ordering behind the skipped SSN and deliver them
+		 * if needed.
+		 */
+		sctp_ulpq_reap_ordered(ulpq, sid);
+	}
+}
+
+static __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq,
+		struct sk_buff_head *list, __u16 needed)
+{
+	struct sctp_tsnmap *tsnmap = &ulpq->asoc->peer.tsn_map;
+	struct sk_buff *skb;
+	__u16 freed = 0;
+	__u32 tsn;
+
+	/* Free events from the tail of 'list' until 'needed' is reached. */
+	while ((skb = skb_peek_tail(list)) != NULL) {
+		tsn = sctp_skb2event(skb)->tsn;
+
+		/* Never renege at or below the Cumulative TSN Ack Point. */
+		if (TSN_lte(tsn, sctp_tsnmap_get_ctsn(tsnmap)))
+			break;
+		freed += skb_headlen(skb);
+		__skb_unlink(skb, list);
+		sctp_ulpevent_free(sctp_skb2event(skb));
+		sctp_tsnmap_renege(tsnmap, tsn);
+		if (freed >= needed)
+			break;
+	}
+	return freed;
+}
+
+/* Renege 'needed' bytes from the ordering queue; returns bytes freed. */
+static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed)
+{
+	return sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);
+}
+
+/* Renege 'needed' bytes from the reassembly queue; returns bytes freed. */
+static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
+{
+	return sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed);
+}
+
+/* Partial deliver the first message as there is pressure on rwnd. */
+void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
+				struct sctp_chunk *chunk,
+				gfp_t gfp)
+{
+	struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk);
+	struct sctp_ulpevent *event;
+
+	/* If the association is already in Partial Delivery mode
+	 * we have nothing to do.
+	 */
+	if (ulpq->pd_mode)
+		return;
+
+	/* If the user enabled the fragment interleave socket option,
+	 * multiple associations can enter partial delivery.
+	 * Otherwise, we can only enter partial delivery if the
+	 * socket is not in partial delivery mode.
+	 */
+	if (!sp->frag_interleave && atomic_read(&sp->pd_mode) != 0)
+		return;
+
+	/* Is partial delivery possible?  */
+	event = sctp_ulpq_retrieve_first(ulpq);
+	if (!event)
+		return;
+
+	/* Send the event to the ULP, then record that this
+	 * association is now in partial delivery ('chunk' and
+	 * 'gfp' are not used here).
+	 */
+	sctp_ulpq_tail_event(ulpq, event);
+	sctp_ulpq_set_pd(ulpq);
+}
+
+/* Renege some packets to make room for an incoming chunk.  */
+void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
+		      gfp_t gfp)
+{
+	struct sctp_association *asoc = ulpq->asoc;
+	__u16 needed, freed = 0;
+
+	/* Room wanted: chunk length less sizeof(sctp_data_chunk_t). */
+	if (!chunk) {
+		needed = SCTP_DEFAULT_MAXWINDOW;
+	} else {
+		needed = ntohs(chunk->chunk_hdr->length);
+		needed -= sizeof(sctp_data_chunk_t);
+	}
+
+	/* Renege only while the receive queue is empty: first from
+	 * the ordering queue, then from the reassembly queue.
+	 */
+	if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
+		freed = sctp_ulpq_renege_order(ulpq, needed);
+		if (freed < needed)
+			freed += sctp_ulpq_renege_frags(ulpq, needed - freed);
+	}
+
+	/* If able to free enough room, accept this chunk. */
+	if (chunk && freed >= needed) {
+		__u32 tsn = ntohl(chunk->subh.data_hdr->tsn);
+
+		sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn);
+		sctp_ulpq_tail_data(ulpq, chunk, gfp);
+		sctp_ulpq_partial_delivery(ulpq, chunk, gfp);
+	}
+
+	sk_mem_reclaim(asoc->base.sk);
+}
+
+
+
+/* Notify the application if an association is aborted and in
+ * partial delivery mode.  Send up any pending received messages.
+ */
+void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
+{
+	struct sock *sk = ulpq->asoc->base.sk;
+	struct sctp_ulpevent *ev = NULL;
+
+	if (!ulpq->pd_mode)
+		return;
+
+	if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT,
+				       &sctp_sk(sk)->subscribe)) {
+		ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
+					      SCTP_PARTIAL_DELIVERY_ABORTED, gfp);
+		if (ev)
+			__skb_queue_tail(&sk->sk_receive_queue,
+					 sctp_event2skb(ev));
+	}
+
+	/* If there is data waiting, send it up the socket now. */
+	if (sctp_ulpq_clear_pd(ulpq) || ev)
+		sk->sk_data_ready(sk, 0);
+}
diff --git a/net/socket.c b/net/socket.c
new file mode 100644
index 00000000..cf41afcc
--- /dev/null
+++ b/net/socket.c
@@ -0,0 +1,3384 @@
+/*
+ * NET		An implementation of the SOCKET network access protocol.
+ *
+ * Version:	@(#)socket.c	1.1.93	18/02/95
+ *
+ * Authors:	Orest Zborowski, <obz@Kodak.COM>
+ *		Ross Biro
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
+ *					shutdown()
+ *		Alan Cox	:	verify_area() fixes
+ *		Alan Cox	:	Removed DDI
+ *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
+ *		Alan Cox	:	Moved a load of checks to the very
+ *					top level.
+ *		Alan Cox	:	Move address structures to/from user
+ *					mode above the protocol layers.
+ *		Rob Janssen	:	Allow 0 length sends.
+ *		Alan Cox	:	Asynchronous I/O support (cribbed from the
+ *					tty drivers).
+ *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
+ *		Jeff Uphoff	:	Made max number of sockets command-line
+ *					configurable.
+ *		Matti Aarnio	:	Made the number of sockets dynamic,
+ *					to be allocated when needed, and mr.
+ *					Uphoff's max is used as max to be
+ *					allowed to allocate.
+ *		Linus		:	Argh. removed all the socket allocation
+ *					altogether: it's in the inode now.
+ *		Alan Cox	:	Made sock_alloc()/sock_release() public
+ *					for NetROM and future kernel nfsd type
+ *					stuff.
+ *		Alan Cox	:	sendmsg/recvmsg basics.
+ *		Tom Dyas	:	Export net symbols.
+ *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
+ *		Alan Cox	:	Added thread locking to sys_* calls
+ *					for sockets. May have errors at the
+ *					moment.
+ *		Kevin Buhr	:	Fixed the dumb errors in the above.
+ *		Andi Kleen	:	Some small cleanups, optimizations,
+ *					and fixed a copy_from_user() bug.
+ *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
+ *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
+ *					protocol-independent
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *
+ *	This module is effectively the top level interface to the BSD socket
+ *	paradigm.
+ *
+ *	Based upon Swansea University Computer Society NET3.039
+ */
+
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/thread_info.h>
+#include <linux/rcupdate.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+#include <linux/wanrouter.h>
+#include <linux/if_bridge.h>
+#include <linux/if_frad.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/compat.h>
+#include <linux/kmod.h>
+#include <linux/audit.h>
+#include <linux/wireless.h>
+#include <linux/nsproxy.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#include <net/compat.h>
+#include <net/wext.h>
+#include <net/cls_cgroup.h>
+
+#include <net/sock.h>
+#include <linux/netfilter.h>
+
+#include <linux/if_tun.h>
+#include <linux/ipv6_route.h>
+#include <linux/route.h>
+#include <linux/sockios.h>
+#include <linux/atalk.h>
+
+static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos);
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos);
+static int sock_mmap(struct file *file, struct vm_area_struct *vma);
+
+static int sock_close(struct inode *inode, struct file *file);	/* .release */
+static unsigned int sock_poll(struct file *file,
+			      struct poll_table_struct *wait);
+static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+static long compat_sock_ioctl(struct file *file,
+			      unsigned int cmd, unsigned long arg);
+#endif /* CONFIG_COMPAT */
+static int sock_fasync(int fd, struct file *filp, int on);
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+			     int offset, size_t size, loff_t *ppos, int more);
+static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags);
+
+/*
+ *	Socket files have a set of 'special' operations as well as the generic
+ *	file ones. These don't appear in the operation structures but are done
+ *	directly via the socketcall() multiplexor.
+ *
+ *	socket_file_ops is the file_operations table used for socket files:
+ *	sock_alloc_file() installs it and sock_from_file() compares f_op
+ *	against it to tell sockets from other files.
+ */
+
+static const struct file_operations socket_file_ops = {
+	.owner =	THIS_MODULE,
+	.llseek =	no_llseek,
+	.aio_read =	sock_aio_read,
+	.aio_write =	sock_aio_write,
+	.poll =		sock_poll,
+	.unlocked_ioctl = sock_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = compat_sock_ioctl,
+#endif
+	.mmap =		sock_mmap,
+	.open =		sock_no_open,	/* special open code to disallow open via /proc */
+	.release =	sock_close,
+	.fasync =	sock_fasync,
+	.sendpage =	sock_sendpage,
+	.splice_write = generic_splice_sendpage,
+	.splice_read =	sock_splice_read,
+};
+
+/*
+ *	The protocol list. Each protocol is registered in here.
+ *
+ *	net_families[] has NPROTO slots and is indexed by protocol family
+ *	number; __sock_create() reads it under rcu_read_lock().
+ *	NOTE(review): net_family_lock presumably serialises updates of the
+ *	table; its users are not in this part of the file - confirm.
+ */
+
+static DEFINE_SPINLOCK(net_family_lock);
+static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
+
+/*
+ *	Statistics counters of the socket lists.
+ *
+ *	Per-CPU counter: sock_alloc() adds one, sock_release() subtracts one.
+ */
+
+static DEFINE_PER_CPU(int, sockets_in_use);
+
+/*
+ * Support routines.
+ * Move socket addresses back and forth across the kernel/user
+ * divide and look after the messy bits.
+ */
+
+/**
+ *	move_addr_to_kernel	-	copy a socket address into kernel space
+ *	@uaddr: Address in user space
+ *	@ulen: Length of the address in user space
+ *	@kaddr: Destination buffer in kernel space
+ *
+ *	Returns -EINVAL if @ulen is negative or larger than
+ *	sizeof(struct sockaddr_storage) and -EFAULT if the user memory
+ *	cannot be read. A zero @ulen succeeds without copying anything.
+ *	Otherwise the address is copied and the result of audit_sockaddr()
+ *	is returned.
+ */
+
+int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
+{
+	int err = -EINVAL;
+
+	if (ulen >= 0 && ulen <= sizeof(struct sockaddr_storage)) {
+		if (ulen == 0)
+			err = 0;
+		else if (copy_from_user(kaddr, uaddr, ulen))
+			err = -EFAULT;
+		else
+			err = audit_sockaddr(ulen, kaddr);
+	}
+
+	return err;
+}
+
+/**
+ *	move_addr_to_user	-	copy an address to user space
+ *	@kaddr: kernel space address
+ *	@klen: length of address in kernel
+ *	@uaddr: user space address
+ *	@ulen: pointer to user length field
+ *
+ *	On entry *@ulen holds the size of the user buffer. At most that many
+ *	bytes, and never more than @klen, are copied to @uaddr. Afterwards
+ *	*@ulen is overwritten with @klen, i.e. the length before truncation.
+ *
+ *	Returns 0 on success, -EINVAL if the resulting copy length is
+ *	negative or exceeds sizeof(struct sockaddr_storage), -ENOMEM if
+ *	audit_sockaddr() fails, and -EFAULT if the buffer or the length
+ *	field is not accessible.
+ */
+
+static int move_addr_to_user(struct sockaddr *kaddr, int klen,
+			     void __user *uaddr, int __user *ulen)
+{
+	int room;
+	int ret;
+
+	ret = get_user(room, ulen);
+	if (ret)
+		return ret;
+
+	/* Never hand out more than the kernel address actually holds. */
+	room = (room > klen) ? klen : room;
+	if (room < 0 || room > sizeof(struct sockaddr_storage))
+		return -EINVAL;
+
+	if (room != 0) {
+		if (audit_sockaddr(klen, kaddr))
+			return -ENOMEM;
+		if (copy_to_user(uaddr, kaddr, room))
+			return -EFAULT;
+	}
+
+	/*
+	 *      "fromlen shall refer to the value before truncation.."
+	 *                      1003.1g
+	 */
+	return __put_user(klen, ulen);
+}
+
+static struct kmem_cache *sock_inode_cachep __read_mostly;	/* cache of struct socket_alloc, created by init_inodecache() */
+
+/*
+ * ->alloc_inode for sockfs: allocate a socket_alloc (socket plus inode)
+ * from sock_inode_cachep together with its wait queue, and reset the
+ * socket part to the unconnected, unbound state.  Returns the embedded
+ * VFS inode, or NULL if either allocation fails.
+ */
+static struct inode *sock_alloc_inode(struct super_block *sb)
+{
+	struct socket_alloc *si;
+	struct socket_wq *waitq;
+	struct socket *sock;
+
+	si = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
+	if (si == NULL)
+		goto fail;
+
+	waitq = kmalloc(sizeof(*waitq), GFP_KERNEL);
+	if (waitq == NULL)
+		goto fail_free_inode;
+
+	init_waitqueue_head(&waitq->wait);
+	waitq->fasync_list = NULL;
+
+	sock = &si->socket;
+	RCU_INIT_POINTER(sock->wq, waitq);
+	sock->state = SS_UNCONNECTED;
+	sock->flags = 0;
+	sock->ops = NULL;
+	sock->sk = NULL;
+	sock->file = NULL;
+
+	return &si->vfs_inode;
+
+fail_free_inode:
+	kmem_cache_free(sock_inode_cachep, si);
+fail:
+	return NULL;
+}
+
+/*
+ * ->destroy_inode for sockfs: release what sock_alloc_inode() set up.
+ * The wait queue is freed through kfree_rcu(), the socket_alloc itself
+ * goes straight back to the slab cache.
+ */
+static void sock_destroy_inode(struct inode *inode)
+{
+	struct socket_alloc *si =
+		container_of(inode, struct socket_alloc, vfs_inode);
+	struct socket_wq *waitq = rcu_dereference_protected(si->socket.wq, 1);
+
+	kfree_rcu(waitq, rcu);
+	kmem_cache_free(sock_inode_cachep, si);
+}
+
+/* Slab constructor for sock_inode_cachep: initialise the embedded inode. */
+static void init_once(void *foo)
+{
+	struct socket_alloc *si = foo;
+
+	inode_init_once(&si->vfs_inode);
+}
+
+/*
+ * Create the slab cache that backs socket inodes.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+static int init_inodecache(void)
+{
+	const unsigned long cache_flags = SLAB_HWCACHE_ALIGN |
+					  SLAB_RECLAIM_ACCOUNT |
+					  SLAB_MEM_SPREAD;
+
+	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
+					      sizeof(struct socket_alloc),
+					      0, cache_flags, init_once);
+
+	return sock_inode_cachep ? 0 : -ENOMEM;
+}
+
+static const struct super_operations sockfs_ops = {	/* handed to mount_pseudo() in sockfs_mount() */
+	.alloc_inode	= sock_alloc_inode,	/* inode + socket from sock_inode_cachep */
+	.destroy_inode	= sock_destroy_inode,
+	.statfs		= simple_statfs,
+};
+
+/*
+ * sockfs_dname() is called from d_path().
+ * It names a socket dentry "socket:[<inode number>]".
+ */
+static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	unsigned long ino = dentry->d_inode->i_ino;
+
+	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", ino);
+}
+
+static const struct dentry_operations sockfs_dentry_operations = {
+	.d_dname  = sockfs_dname,	/* builds the "socket:[ino]" name */
+};
+
+/*
+ * ->mount for sockfs: a pseudo filesystem using sockfs_ops and
+ * sockfs_dentry_operations.  flags, dev_name and data are not used.
+ */
+static struct dentry *sockfs_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data)
+{
+	struct dentry *root;
+
+	root = mount_pseudo(fs_type, "socket:", &sockfs_ops,
+			    &sockfs_dentry_operations, SOCKFS_MAGIC);
+	return root;
+}
+
+static struct vfsmount *sock_mnt __read_mostly;	/* the sockfs mount; its superblock owns all socket inodes */
+
+static struct file_system_type sock_fs_type = {
+	.name =		"sockfs",
+	.mount =	sockfs_mount,
+	.kill_sb =	kill_anon_super,
+};
+
+/*
+ *	Obtains the first available file descriptor and sets it up for use.
+ *
+ *	This function creates a file structure for the socket and reserves
+ *	an fd in the fd space of the current process. On success it returns
+ *	the file descriptor; the file struct is stored in sock->file and is
+ *	also handed back through *f. The fd is only reserved here, it is the
+ *	caller that installs the file (see sock_map_fd()). On failure a
+ *	negative errno is returned and the reserved fd is given back.
+ *
+ *	Note that another thread may close file descriptor before we return
+ *	from this function. We use the fact that now we do not refer
+ *	to socket after mapping. If one day we will need it, this
+ *	function will increment ref. count on file by 1.
+ *
+ *	In any case returned fd MAY BE not valid!
+ *	This race condition is unavoidable
+ *	with shared fd spaces, we cannot solve it inside kernel,
+ *	but we take care of internal coherence yet.
+ */
+
+static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
+{
+	struct qstr name = { .name = "" };
+	struct path path;
+	struct file *file;
+	int fd;
+
+	fd = get_unused_fd_flags(flags);
+	if (unlikely(fd < 0))
+		return fd;
+
+	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
+	if (unlikely(!path.dentry)) {
+		put_unused_fd(fd);
+		return -ENOMEM;
+	}
+	path.mnt = mntget(sock_mnt);
+
+	d_instantiate(path.dentry, SOCK_INODE(sock));	/* bind the socket's inode to the new dentry */
+	SOCK_INODE(sock)->i_fop = &socket_file_ops;
+
+	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
+		  &socket_file_ops);
+	if (unlikely(!file)) {
+		/*
+		 * drop dentry, keep inode: ihold() takes an extra inode
+		 * reference before path_put() lets go of the dentry
+		 */
+		ihold(path.dentry->d_inode);
+		path_put(&path);
+		put_unused_fd(fd);
+		return -ENFILE;
+	}
+
+	sock->file = file;
+	file->f_flags = O_RDWR | (flags & O_NONBLOCK);	/* of the caller's flags only O_NONBLOCK is kept */
+	file->f_pos = 0;
+	file->private_data = sock;	/* read back by sock_from_file() */
+
+	*f = file;
+	return fd;
+}
+
+/*
+ * Allocate a file and an fd for @sock and install the file in the fd
+ * table.  Returns the new fd, or the negative errno from
+ * sock_alloc_file().
+ */
+int sock_map_fd(struct socket *sock, int flags)
+{
+	struct file *filp;
+	int fd;
+
+	fd = sock_alloc_file(sock, &filp, flags);
+	if (unlikely(fd < 0))
+		return fd;
+
+	fd_install(fd, filp);
+	return fd;
+}
+EXPORT_SYMBOL(sock_map_fd);
+
+/*
+ * Return the socket behind @file.  A file is a socket exactly when its
+ * f_op is socket_file_ops; for anything else *err is set to -ENOTSOCK and
+ * NULL is returned.  *err is left untouched on success.
+ */
+static struct socket *sock_from_file(struct file *file, int *err)
+{
+	if (file->f_op != &socket_file_ops) {
+		*err = -ENOTSOCK;
+		return NULL;
+	}
+
+	return file->private_data;	/* set in sock_alloc_file */
+}
+
+/**
+ *	sockfd_lookup - Go from a file number to its socket slot
+ *	@fd: file handle
+ *	@err: pointer to an error code return
+ *
+ *	A reference to the file behind @fd is taken with fget() and the
+ *	socket it is bound to is returned. On success that file reference
+ *	is kept. If an error occurs the err pointer is overwritten with a
+ *	negative errno code (-EBADF for an invalid handle, -ENOTSOCK for a
+ *	handle which is not a socket), the reference is dropped again and
+ *	NULL is returned.
+ */
+
+struct socket *sockfd_lookup(int fd, int *err)
+{
+	struct file *file = fget(fd);
+	struct socket *sock;
+
+	if (file == NULL) {
+		*err = -EBADF;
+		return NULL;
+	}
+
+	sock = sock_from_file(file, err);
+	if (sock)
+		return sock;
+
+	fput(file);
+	return NULL;
+}
+EXPORT_SYMBOL(sockfd_lookup);
+
+/*
+ * Like sockfd_lookup(), but based on fget_light(): *fput_needed tells the
+ * caller whether the file must be released with fput_light() afterwards.
+ * Note that *err is set to -EBADF up front and is not cleared on success.
+ */
+static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
+{
+	struct file *file;
+	struct socket *sock;
+
+	*err = -EBADF;
+	file = fget_light(fd, fput_needed);
+	if (!file)
+		return NULL;
+
+	sock = sock_from_file(file, err);
+	if (!sock)
+		fput_light(file, *fput_needed);
+
+	return sock;
+}
+
+/**
+ *	sock_alloc	-	allocate a socket
+ *
+ *	Allocate a new inode on the sockfs superblock; the socket object
+ *	lives in the same allocation. The inode gets a fresh inode number,
+ *	mode S_IFSOCK | S_IRWXUGO and the caller's fsuid/fsgid, and the
+ *	sockets_in_use counter is bumped. Returns the socket, or NULL if
+ *	no inode could be allocated.
+ */
+
+static struct socket *sock_alloc(void)
+{
+	struct inode *inode = new_inode(sock_mnt->mnt_sb);
+	struct socket *sock;
+
+	if (inode == NULL)
+		return NULL;
+
+	sock = SOCKET_I(inode);
+	kmemcheck_annotate_bitfield(sock, type);
+
+	inode->i_mode = S_IFSOCK | S_IRWXUGO;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_ino = get_next_ino();
+
+	percpu_add(sockets_in_use, 1);
+	return sock;
+}
+
+/*
+ *	In theory you can't get an open on this inode, but /proc provides
+ *	a back door. Remember to keep it shut otherwise you'll let the
+ *	creepy crawlies in.
+ *
+ *	Used as the ->open method of socket_file_ops and bad_sock_fops;
+ *	every open attempt fails with -ENXIO.
+ */
+
+static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
+{
+	return -ENXIO;
+}
+
+const struct file_operations bad_sock_fops = {
+	.owner = THIS_MODULE,
+	.open = sock_no_open,	/* always fails with -ENXIO */
+	.llseek = noop_llseek,
+};
+
+/**
+ *	sock_release	-	close a socket
+ *	@sock: socket to close
+ *
+ *	If the socket has protocol ops, their release callback is run, the
+ *	ops pointer is cleared and the owning module's reference is dropped.
+ *	The sockets_in_use counter is decremented. A socket that has no
+ *	file attached gives up its inode here; otherwise only the
+ *	sock->file link is cleared.
+ */
+
+void sock_release(struct socket *sock)
+{
+	struct socket_wq *wq;
+
+	if (sock->ops) {
+		struct module *owner = sock->ops->owner;
+
+		sock->ops->release(sock);
+		sock->ops = NULL;
+		module_put(owner);
+	}
+
+	wq = rcu_dereference_protected(sock->wq, 1);
+	if (wq->fasync_list)
+		printk(KERN_ERR "sock_release: fasync list not empty!\n");
+
+	percpu_sub(sockets_in_use, 1);
+
+	if (sock->file)
+		sock->file = NULL;
+	else
+		iput(SOCK_INODE(sock));
+}
+EXPORT_SYMBOL(sock_release);
+
+/*
+ * Translate the socket's TX time stamping flags into skb tx_flags:
+ * SKBTX_HW_TSTAMP and/or SKBTX_SW_TSTAMP, or 0 if neither is enabled.
+ * Always returns 0.
+ */
+int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags)
+{
+	__u8 flags = 0;
+
+	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
+		flags |= SKBTX_HW_TSTAMP;
+	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
+		flags |= SKBTX_SW_TSTAMP;
+
+	*tx_flags = flags;
+	return 0;
+}
+EXPORT_SYMBOL(sock_tx_timestamp);
+
+/*
+ * Fill in the sock_iocb hanging off @iocb and pass the message to the
+ * protocol's sendmsg method.  No security hook is called on this path.
+ */
+static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
+				       struct msghdr *msg, size_t size)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
+
+	sock_update_classid(sock->sk);
+
+	siocb->sock = sock;
+	siocb->msg = msg;
+	siocb->size = size;
+	siocb->scm = NULL;
+
+	return sock->ops->sendmsg(iocb, sock, msg, size);
+}
+
+/*
+ * Send path with the security check: a non-zero result from
+ * security_socket_sendmsg() is returned as is and nothing is sent.
+ */
+static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
+				 struct msghdr *msg, size_t size)
+{
+	int err;
+
+	err = security_socket_sendmsg(sock, msg, size);
+	if (err)
+		return err;
+
+	return __sock_sendmsg_nosec(iocb, sock, msg, size);
+}
+
+/*
+ * Synchronous send: wrap the call in an on-stack sync kiocb and, if the
+ * protocol queued the request (-EIOCBQUEUED), wait for it to complete.
+ */
+int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+	struct sock_iocb siocb;
+	struct kiocb iocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+
+	ret = __sock_sendmsg(&iocb, sock, msg, size);
+	if (ret != -EIOCBQUEUED)
+		return ret;
+
+	return wait_on_sync_kiocb(&iocb);
+}
+EXPORT_SYMBOL(sock_sendmsg);
+
+/*
+ * Same as sock_sendmsg() but without the security_socket_sendmsg() check.
+ */
+int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size)
+{
+	struct sock_iocb siocb;
+	struct kiocb iocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+
+	ret = __sock_sendmsg_nosec(&iocb, sock, msg, size);
+	if (ret != -EIOCBQUEUED)
+		return ret;
+
+	return wait_on_sync_kiocb(&iocb);
+}
+
+/*
+ * Send a message whose data lives in kernel memory.  @vec/@num are stored
+ * in @msg as its iovec and sock_sendmsg() is called with the address limit
+ * temporarily raised to KERNEL_DS.
+ */
+int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
+		   struct kvec *vec, size_t num, size_t size)
+{
+	mm_segment_t saved_fs;
+	int ret;
+
+	/*
+	 * The cast is safe: as far as the compiler is concerned the
+	 * definitions of kvec and iovec are identical, yielding the same
+	 * in-core layout and alignment.
+	 */
+	msg->msg_iov = (struct iovec *)vec;
+	msg->msg_iovlen = num;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sock_sendmsg(sock, msg, size);
+	set_fs(saved_fs);
+
+	return ret;
+}
+EXPORT_SYMBOL(kernel_sendmsg);
+
+/*
+ * Convert @kt into *@ts.  Returns 1 if @kt was non-zero and *@ts was
+ * written, 0 (leaving *@ts alone) otherwise.
+ */
+static int ktime2ts(ktime_t kt, struct timespec *ts)
+{
+	if (!kt.tv64)
+		return 0;
+
+	*ts = ktime_to_timespec(kt);
+	return 1;
+}
+
+/*
+ * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
+ *
+ * Attaches the receive time stamps of @skb to @msg as control messages:
+ * SCM_TIMESTAMP (timeval) or, with SOCK_RCVTSTAMPNS, SCM_TIMESTAMPNS
+ * (timespec) for the plain software stamp, followed by one
+ * SCM_TIMESTAMPING message carrying three timespecs - ts[0] the skb
+ * stamp, ts[1] shhwtstamps->syststamp, ts[2] shhwtstamps->hwtstamp - if
+ * at least one of them is enabled on the socket and non-zero.
+ */
+void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+	struct sk_buff *skb)
+{
+	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+	struct timespec ts[3];
+	int empty = 1;	/* stays 1 until some ts[] slot has been filled */
+	struct skb_shared_hwtstamps *shhwtstamps =
+		skb_hwtstamps(skb);
+
+	/* Race occurred between timestamp enabling and packet
+	   receiving.  Fill in the current time for now. */
+	if (need_software_tstamp && skb->tstamp.tv64 == 0)
+		__net_timestamp(skb);
+
+	if (need_software_tstamp) {
+		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
+			struct timeval tv;
+			skb_get_timestamp(skb, &tv);
+			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+				 sizeof(tv), &tv);
+		} else {
+			skb_get_timestampns(skb, &ts[0]);	/* ts[0] only borrowed; wiped by the memset below */
+			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
+				 sizeof(ts[0]), &ts[0]);
+		}
+	}
+
+
+	memset(ts, 0, sizeof(ts));	/* slots that are not filled in are reported as zero */
+	if (skb->tstamp.tv64 &&
+	    sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) {
+		skb_get_timestampns(skb, ts + 0);
+		empty = 0;
+	}
+	if (shhwtstamps) {
+		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE) &&
+		    ktime2ts(shhwtstamps->syststamp, ts + 1))
+			empty = 0;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) &&
+		    ktime2ts(shhwtstamps->hwtstamp, ts + 2))
+			empty = 0;
+	}
+	if (!empty)
+		put_cmsg(msg, SOL_SOCKET,
+			 SCM_TIMESTAMPING, sizeof(ts), &ts);	/* all three slots go out in one cmsg */
+}
+EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
+
+/*
+ * Report skb->dropcount to the receiver as an SO_RXQ_OVFL control message.
+ * Nothing is emitted unless SOCK_RXQ_OVFL is set on the socket, @skb is
+ * non-NULL and its dropcount is non-zero.
+ */
+static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	if (!sock_flag(sk, SOCK_RXQ_OVFL))
+		return;
+	if (!skb || !skb->dropcount)
+		return;
+
+	put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
+		 sizeof(__u32), &skb->dropcount);
+}
+
+void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
+	struct sk_buff *skb)
+{
+	sock_recv_timestamp(msg, sk, skb);	/* time stamp cmsgs first ... */
+	sock_recv_drops(msg, sk, skb);	/* ... then the SO_RXQ_OVFL drop count */
+}
+EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
+
+/*
+ * Fill in the sock_iocb hanging off @iocb and pass the request to the
+ * protocol's recvmsg method.  No security hook is called on this path.
+ */
+static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
+				       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
+
+	sock_update_classid(sock->sk);
+
+	siocb->sock = sock;
+	siocb->msg = msg;
+	siocb->size = size;
+	siocb->flags = flags;
+	siocb->scm = NULL;
+
+	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
+}
+
+/*
+ * Receive path with the security check: a non-zero result from
+ * security_socket_recvmsg() is returned as is and nothing is received.
+ */
+static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				 struct msghdr *msg, size_t size, int flags)
+{
+	int err;
+
+	err = security_socket_recvmsg(sock, msg, size, flags);
+	if (err)
+		return err;
+
+	return __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
+}
+
+/*
+ * Synchronous receive: wrap the call in an on-stack sync kiocb and, if the
+ * protocol queued the request (-EIOCBQUEUED), wait for it to complete.
+ */
+int sock_recvmsg(struct socket *sock, struct msghdr *msg,
+		 size_t size, int flags)
+{
+	struct sock_iocb siocb;
+	struct kiocb iocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+
+	ret = __sock_recvmsg(&iocb, sock, msg, size, flags);
+	if (ret != -EIOCBQUEUED)
+		return ret;
+
+	return wait_on_sync_kiocb(&iocb);
+}
+EXPORT_SYMBOL(sock_recvmsg);
+
+/*
+ * Same as sock_recvmsg() but without the security_socket_recvmsg() check.
+ */
+static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
+			      size_t size, int flags)
+{
+	struct sock_iocb siocb;
+	struct kiocb iocb;
+	int ret;
+
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+
+	ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
+	if (ret != -EIOCBQUEUED)
+		return ret;
+
+	return wait_on_sync_kiocb(&iocb);
+}
+
+/**
+ * kernel_recvmsg - Receive a message from a socket (kernel space)
+ * @sock:       The socket to receive the message from
+ * @msg:        Received message
+ * @vec:        Input s/g array for message data
+ * @num:        Size of input s/g array
+ * @size:       Number of bytes to read
+ * @flags:      Message flags (MSG_DONTWAIT, etc...)
+ *
+ * @vec and @num are stored in @msg as its scatter/gather array before
+ * sock_recvmsg() is called with the address limit raised to KERNEL_DS.
+ * On return the array is modified so that it consists of the unfilled
+ * portion of the original array.
+ *
+ * The returned value is the total number of bytes received, or an error.
+ */
+int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
+		   struct kvec *vec, size_t num, size_t size, int flags)
+{
+	mm_segment_t saved_fs;
+	int ret;
+
+	/*
+	 * The cast is safe: as far as the compiler is concerned the
+	 * definitions of kvec and iovec are identical, yielding the same
+	 * in-core layout and alignment.
+	 */
+	msg->msg_iov = (struct iovec *)vec;
+	msg->msg_iovlen = num;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sock_recvmsg(sock, msg, size, flags);
+	set_fs(saved_fs);
+
+	return ret;
+}
+EXPORT_SYMBOL(kernel_recvmsg);
+
+static void sock_aio_dtor(struct kiocb *iocb)
+{
+	kfree(iocb->private);	/* the sock_iocb kmalloc'ed by alloc_sock_iocb() for an async kiocb */
+}
+
+/*
+ * ->sendpage for socket files: forward the page to kernel_sendpage().
+ * The message flags are @more plus MSG_DONTWAIT for a non-blocking file.
+ * @ppos is not used.
+ */
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+			     int offset, size_t size, loff_t *ppos, int more)
+{
+	struct socket *sock = file->private_data;
+	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
+	int flags = more;
+
+	if (file->f_flags & O_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+
+	return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+/*
+ * ->splice_read for socket files: hand over to the protocol's splice_read
+ * method, or fail with -EINVAL if the protocol does not provide one.
+ */
+static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags)
+{
+	struct socket *sock = file->private_data;
+
+	if (likely(sock->ops->splice_read)) {
+		sock_update_classid(sock->sk);
+		return sock->ops->splice_read(sock, ppos, pipe, len, flags);
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Attach a sock_iocb to @iocb.  A synchronous kiocb uses the caller's
+ * @siocb; for an asynchronous one a sock_iocb is kmalloc'ed instead and
+ * sock_aio_dtor() is registered to free it.  Returns the sock_iocb in
+ * use, or NULL if the allocation failed.
+ */
+static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
+					 struct sock_iocb *siocb)
+{
+	if (!is_sync_kiocb(iocb)) {
+		struct sock_iocb *dyn = kmalloc(sizeof(*dyn), GFP_KERNEL);
+
+		if (!dyn)
+			return NULL;
+		iocb->ki_dtor = sock_aio_dtor;
+		siocb = dyn;
+	}
+
+	iocb->private = siocb;
+	siocb->kiocb = iocb;
+	return siocb;
+}
+
+/*
+ * Build a msghdr (no name, no control data) around @iov and receive into
+ * it.  The requested size is the sum of all segment lengths; MSG_DONTWAIT
+ * is set when the file is non-blocking.
+ */
+static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
+		struct file *file, const struct iovec *iov,
+		unsigned long nr_segs)
+{
+	struct socket *sock = file->private_data;
+	size_t total = 0;
+	int seg = 0;
+
+	while (seg < nr_segs) {
+		total += iov[seg].iov_len;
+		seg++;
+	}
+
+	msg->msg_iov = (struct iovec *)iov;
+	msg->msg_iovlen = nr_segs;
+	msg->msg_name = NULL;
+	msg->msg_namelen = 0;
+	msg->msg_control = NULL;
+	msg->msg_controllen = 0;
+
+	msg->msg_flags = 0;
+	if (file->f_flags & O_NONBLOCK)
+		msg->msg_flags = MSG_DONTWAIT;
+
+	return __sock_recvmsg(iocb, sock, msg, total, msg->msg_flags);
+}
+
+/*
+ * ->aio_read for socket files.  Sockets are not seekable, so a non-zero
+ * @pos yields -ESPIPE; a zero-length read returns 0 right away.
+ */
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	struct sock_iocb stack_siocb;
+	struct sock_iocb *siocb;
+
+	if (pos != 0)
+		return -ESPIPE;
+
+	/* Match SYS5 behaviour */
+	if (iocb->ki_left == 0)
+		return 0;
+
+	siocb = alloc_sock_iocb(iocb, &stack_siocb);
+	if (siocb == NULL)
+		return -ENOMEM;
+
+	return do_sock_read(&siocb->async_msg, iocb, iocb->ki_filp,
+			    iov, nr_segs);
+}
+
+/*
+ * Build a msghdr (no name, no control data) around @iov and send it.
+ * The size is the sum of all segment lengths.  MSG_DONTWAIT is set for a
+ * non-blocking file and MSG_EOR for SOCK_SEQPACKET sockets.
+ */
+static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
+			struct file *file, const struct iovec *iov,
+			unsigned long nr_segs)
+{
+	struct socket *sock = file->private_data;
+	size_t total = 0;
+	int seg = 0;
+
+	while (seg < nr_segs) {
+		total += iov[seg].iov_len;
+		seg++;
+	}
+
+	msg->msg_iov = (struct iovec *)iov;
+	msg->msg_iovlen = nr_segs;
+	msg->msg_name = NULL;
+	msg->msg_namelen = 0;
+	msg->msg_control = NULL;
+	msg->msg_controllen = 0;
+
+	msg->msg_flags = 0;
+	if (file->f_flags & O_NONBLOCK)
+		msg->msg_flags |= MSG_DONTWAIT;
+	if (sock->type == SOCK_SEQPACKET)
+		msg->msg_flags |= MSG_EOR;
+
+	return __sock_sendmsg(iocb, sock, msg, total);
+}
+
+/*
+ * ->aio_write for socket files.  Sockets are not seekable, so a non-zero
+ * @pos yields -ESPIPE.
+ */
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
+{
+	struct sock_iocb stack_siocb;
+	struct sock_iocb *siocb;
+
+	if (pos != 0)
+		return -ESPIPE;
+
+	siocb = alloc_sock_iocb(iocb, &stack_siocb);
+	if (siocb == NULL)
+		return -ENOMEM;
+
+	return do_sock_write(&siocb->async_msg, iocb, iocb->ki_filp,
+			     iov, nr_segs);
+}
+
+/*
+ * Atomic setting of ioctl hooks to avoid race
+ * with module unload.
+ *
+ * Each hook has its own mutex: the setter stores the pointer while
+ * holding it, and sock_ioctl() holds it for the duration of the call
+ * through the hook.
+ */
+
+static DEFINE_MUTEX(br_ioctl_mutex);
+static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
+
+void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
+{
+	mutex_lock(&br_ioctl_mutex);
+	br_ioctl_hook = hook;
+	mutex_unlock(&br_ioctl_mutex);
+}
+EXPORT_SYMBOL(brioctl_set);
+
+static DEFINE_MUTEX(vlan_ioctl_mutex);	/* guards vlan_ioctl_hook */
+static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
+
+void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
+{
+	mutex_lock(&vlan_ioctl_mutex);
+	vlan_ioctl_hook = hook;	/* called by sock_ioctl() for SIOCGIFVLAN/SIOCSIFVLAN */
+	mutex_unlock(&vlan_ioctl_mutex);
+}
+EXPORT_SYMBOL(vlan_ioctl_set);
+
+static DEFINE_MUTEX(dlci_ioctl_mutex);	/* guards dlci_ioctl_hook */
+static int (*dlci_ioctl_hook) (unsigned int, void __user *);
+
+void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
+{
+	mutex_lock(&dlci_ioctl_mutex);
+	dlci_ioctl_hook = hook;	/* called by sock_ioctl() for SIOCADDDLCI/SIOCDELDLCI */
+	mutex_unlock(&dlci_ioctl_mutex);
+}
+EXPORT_SYMBOL(dlci_ioctl_set);
+
+/*
+ * Give the ioctl to the protocol first.  Only if the protocol does not
+ * know the command (-ENOIOCTLCMD) is it handed down to the NIC driver
+ * through dev_ioctl().
+ */
+static long sock_do_ioctl(struct net *net, struct socket *sock,
+				 unsigned int cmd, unsigned long arg)
+{
+	int err = sock->ops->ioctl(sock, cmd, arg);
+
+	if (err != -ENOIOCTLCMD)
+		return err;
+
+	return dev_ioctl(net, cmd, (void __user *)arg);
+}
+
+/*
+ *	With an ioctl, arg may well be a user mode pointer, but we don't know
+ *	what to do with it - that's up to the protocol still.
+ *
+ *	Dispatch order: device-private (and, with CONFIG_WEXT_CORE, wireless
+ *	extension) commands go straight to dev_ioctl(); owner get/set and
+ *	the bridge, VLAN and DLCI commands are handled here; everything else
+ *	is passed to sock_do_ioctl().
+ */
+
+static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	struct socket *sock;
+	struct sock *sk;
+	void __user *argp = (void __user *)arg;
+	int pid, err;
+	struct net *net;
+
+	sock = file->private_data;
+	sk = sock->sk;
+	net = sock_net(sk);
+	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
+		err = dev_ioctl(net, cmd, argp);
+	} else	/* this else governs the next if or, without WEXT, the switch */
+#ifdef CONFIG_WEXT_CORE
+	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
+		err = dev_ioctl(net, cmd, argp);
+	} else
+#endif
+		switch (cmd) {
+		case FIOSETOWN:
+		case SIOCSPGRP:
+			err = -EFAULT;
+			if (get_user(pid, (int __user *)argp))
+				break;
+			err = f_setown(sock->file, pid, 1);
+			break;
+		case FIOGETOWN:
+		case SIOCGPGRP:
+			err = put_user(f_getown(sock->file),
+				       (int __user *)argp);
+			break;
+		case SIOCGIFBR:
+		case SIOCSIFBR:
+		case SIOCBRADDBR:
+		case SIOCBRDELBR:
+			err = -ENOPKG;	/* result if no hook is registered even after the module request */
+			if (!br_ioctl_hook)
+				request_module("bridge");
+
+			mutex_lock(&br_ioctl_mutex);	/* hook is re-checked and called under its mutex */
+			if (br_ioctl_hook)
+				err = br_ioctl_hook(net, cmd, argp);
+			mutex_unlock(&br_ioctl_mutex);
+			break;
+		case SIOCGIFVLAN:
+		case SIOCSIFVLAN:
+			err = -ENOPKG;
+			if (!vlan_ioctl_hook)
+				request_module("8021q");
+
+			mutex_lock(&vlan_ioctl_mutex);
+			if (vlan_ioctl_hook)
+				err = vlan_ioctl_hook(net, argp);
+			mutex_unlock(&vlan_ioctl_mutex);
+			break;
+		case SIOCADDDLCI:
+		case SIOCDELDLCI:
+			err = -ENOPKG;
+			if (!dlci_ioctl_hook)
+				request_module("dlci");
+
+			mutex_lock(&dlci_ioctl_mutex);
+			if (dlci_ioctl_hook)
+				err = dlci_ioctl_hook(cmd, argp);
+			mutex_unlock(&dlci_ioctl_mutex);
+			break;
+		default:
+			err = sock_do_ioctl(net, sock, cmd, arg);	/* protocol first, then the device */
+			break;
+		}
+	return err;
+}
+
+/*
+ * Allocate a bare socket of the given type without calling into any
+ * protocol family: only the security hooks and sock_alloc() are involved.
+ * *res is always written - the new socket on success, NULL on failure.
+ * Returns 0, the error from a security hook, or -ENOMEM.
+ */
+int sock_create_lite(int family, int type, int protocol, struct socket **res)
+{
+	struct socket *sock = NULL;
+	int err;
+
+	err = security_socket_create(family, type, protocol, 1);
+	if (!err) {
+		sock = sock_alloc();
+		if (!sock) {
+			err = -ENOMEM;
+		} else {
+			sock->type = type;
+			err = security_socket_post_create(sock, family, type,
+							  protocol, 1);
+			if (err) {
+				sock_release(sock);
+				sock = NULL;
+			}
+		}
+	}
+
+	*res = sock;
+	return err;
+}
+EXPORT_SYMBOL(sock_create_lite);
+
+/*
+ * No kernel lock held - perfect.
+ *
+ * We can't return errors to poll, so it's either yes or no: the answer
+ * comes straight from the protocol's poll method.
+ */
+static unsigned int sock_poll(struct file *file, poll_table *wait)
+{
+	struct socket *sock = file->private_data;
+
+	return sock->ops->poll(file, sock, wait);
+}
+
+/* ->mmap for socket files: delegate to the protocol's mmap method. */
+static int sock_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct socket *sock;
+
+	sock = file->private_data;
+	return sock->ops->mmap(file, sock, vma);
+}
+
+/*
+ * ->release for socket files: release the socket that lives in @inode.
+ * Always returns 0.
+ */
+static int sock_close(struct inode *inode, struct file *filp)
+{
+	if (inode) {
+		sock_release(SOCKET_I(inode));
+		return 0;
+	}
+
+	/*
+	 *      The inode could be NULL if we were closing an
+	 *      unfinished socket.
+	 */
+	printk(KERN_DEBUG "sock_close: NULL inode\n");
+	return 0;
+}
+
+/*
+ *	Update the socket async list
+ *
+ *	Fasync_list locking strategy.
+ *
+ *	1. fasync_list is modified only under process context socket lock
+ *	   i.e. under semaphore.
+ *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
+ *	   or under socket lock
+ *
+ *	Returns -EINVAL if the socket has no sock attached, the negative
+ *	error reported by fasync_helper() if the list could not be updated,
+ *	and 0 otherwise.
+ */
+
+static int sock_fasync(int fd, struct file *filp, int on)
+{
+	struct socket *sock = filp->private_data;
+	struct sock *sk = sock->sk;
+	struct socket_wq *wq;
+	int err;
+
+	if (sk == NULL)
+		return -EINVAL;
+
+	lock_sock(sk);
+	wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+	err = fasync_helper(fd, filp, on, &wq->fasync_list);
+
+	/*
+	 * Keep SOCK_FASYNC in step with the list whatever the helper
+	 * returned: the flag simply mirrors "list is non-empty".
+	 */
+	if (!wq->fasync_list)
+		sock_reset_flag(sk, SOCK_FASYNC);
+	else
+		sock_set_flag(sk, SOCK_FASYNC);
+
+	release_sock(sk);
+
+	/*
+	 * fasync_helper() returns a positive value when it added or
+	 * removed an entry; callers of ->fasync only expect 0 or a
+	 * negative errno, so just pass failures through.
+	 */
+	return err < 0 ? err : 0;
+}
+
+/*
+ * This function may be called only under socket lock or callback_lock or
+ * rcu_lock.
+ *
+ * Signal the processes on the socket's fasync list: SIGURG for
+ * SOCK_WAKE_URG, SIGIO for the other wake-up kinds that are not filtered
+ * out below.  Returns -1 if @sock is NULL or nobody is registered on the
+ * list, 0 otherwise (even if no signal was sent).
+ */
+
+int sock_wake_async(struct socket *sock, int how, int band)
+{
+	struct socket_wq *wq;
+
+	if (!sock)
+		return -1;
+	rcu_read_lock();
+	wq = rcu_dereference(sock->wq);
+	if (!wq || !wq->fasync_list) {
+		rcu_read_unlock();
+		return -1;
+	}
+	switch (how) {
+	case SOCK_WAKE_WAITD:
+		if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))	/* no SIGIO while this bit is set */
+			break;
+		goto call_kill;
+	case SOCK_WAKE_SPACE:
+		if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))	/* signal only if NOSPACE was set; it is cleared here */
+			break;
+		/* fall through */
+	case SOCK_WAKE_IO:
+call_kill:
+		kill_fasync(&wq->fasync_list, SIGIO, band);
+		break;
+	case SOCK_WAKE_URG:
+		kill_fasync(&wq->fasync_list, SIGURG, band);
+	}
+	rcu_read_unlock();
+	return 0;
+}
+EXPORT_SYMBOL(sock_wake_async);
+
+int __sock_create(struct net *net, int family, int type, int protocol,
+			 struct socket **res, int kern)
+{
+	int err;
+	struct socket *sock;
+	const struct net_proto_family *pf;
+
+	/*
+	 *      Create a socket of the given family/type/protocol in @net.
+	 *      Returns 0 and stores the socket in *@res on success; on
+	 *      failure a negative errno is returned and *@res is not
+	 *      written. @kern is passed on to the security hooks and to
+	 *      the family's ->create().
+	 *
+	 *      Check protocol is in range
+	 */
+	if (family < 0 || family >= NPROTO)
+		return -EAFNOSUPPORT;
+	if (type < 0 || type >= SOCK_MAX)
+		return -EINVAL;
+
+	/* Compatibility.
+
+	   This uglymoron is moved from INET layer to here to avoid
+	   deadlock in module load.
+	 */
+	if (family == PF_INET && type == SOCK_PACKET) {
+		static int warned;	/* the notice is printed only once */
+		if (!warned) {
+			warned = 1;
+			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
+			       current->comm);
+		}
+		family = PF_PACKET;
+	}
+
+	err = security_socket_create(family, type, protocol, kern);
+	if (err)
+		return err;
+
+	/*
+	 *	Allocate the socket and allow the family to set things up. if
+	 *	the protocol is 0, the family is instructed to select an appropriate
+	 *	default.
+	 */
+	sock = sock_alloc();
+	if (!sock) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "socket: no more sockets\n");
+		return -ENFILE;	/* Not exactly a match, but its the
+				   closest posix thing */
+	}
+
+	sock->type = type;
+
+#ifdef CONFIG_MODULES
+	/* Attempt to load a protocol module if the find failed.
+	 *
+	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
+	 * requested real, full-featured networking support upon configuration.
+	 * Otherwise module support will break!
+	 */
+	if (rcu_access_pointer(net_families[family]) == NULL)
+		request_module("net-pf-%d", family);
+#endif
+
+	rcu_read_lock();
+	pf = rcu_dereference(net_families[family]);
+	err = -EAFNOSUPPORT;	/* used by both out_release jumps below */
+	if (!pf)
+		goto out_release;
+
+	/*
+	 * We will call the ->create function, that possibly is in a loadable
+	 * module, so we have to bump that loadable module refcnt first.
+	 */
+	if (!try_module_get(pf->owner))
+		goto out_release;
+
+	/* Now protected by module ref count */
+	rcu_read_unlock();
+
+	err = pf->create(net, sock, protocol, kern);
+	if (err < 0)
+		goto out_module_put;
+
+	/*
+	 * Now bump the refcnt of the [loadable] module that owns this
+	 * socket; sock_release() drops it again through sock->ops->owner.
+	 */
+	if (!try_module_get(sock->ops->owner))
+		goto out_module_busy;
+
+	/*
+	 * Now that we're done with the ->create function, the [loadable]
+	 * module can have its refcnt decremented
+	 */
+	module_put(pf->owner);
+	err = security_socket_post_create(sock, family, type, protocol, kern);
+	if (err)
+		goto out_sock_release;	/* sock->ops is still set: sock_release() runs ->release and puts the owner */
+	*res = sock;
+
+	return 0;
+
+out_module_busy:
+	err = -EAFNOSUPPORT;
+out_module_put:
+	sock->ops = NULL;	/* keeps sock_release() from calling ->release and module_put() */
+	module_put(pf->owner);
+out_sock_release:
+	sock_release(sock);
+	return err;
+
+out_release:
+	rcu_read_unlock();	/* reached only with the RCU read lock still held */
+	goto out_sock_release;
+}
+EXPORT_SYMBOL(__sock_create);
+
+/*
+ * Create a socket in the calling task's network namespace on behalf of
+ * user space (kern == 0). See __sock_create() for the return convention.
+ */
+int sock_create(int family, int type, int protocol, struct socket **res)
+{
+	struct net *net = current->nsproxy->net_ns;
+
+	return __sock_create(net, family, type, protocol, res, 0);
+}
+EXPORT_SYMBOL(sock_create);
+
+/*
+ * Create a kernel-internal socket (kern == 1) in the initial network
+ * namespace. See __sock_create() for the return convention.
+ */
+int sock_create_kern(int family, int type, int protocol, struct socket **res)
+{
+	struct net *net = &init_net;
+
+	return __sock_create(net, family, type, protocol, res, 1);
+}
+EXPORT_SYMBOL(sock_create_kern);
+
+/*
+ *	socket(2): create a socket and bind it to a new file descriptor.
+ *
+ *	@type carries the socket type in its SOCK_TYPE_MASK bits and may have
+ *	SOCK_CLOEXEC and/or SOCK_NONBLOCK or-ed in; any other extra bit is
+ *	rejected with -EINVAL. Returns the new descriptor or a negative errno.
+ */
+SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
+{
+	struct socket *sock;
+	int flags, err, fd;
+
+	/* Check the SOCK_* constants for consistency.  */
+	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
+	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
+	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);
+
+	/* Split the combined argument into creation flags and pure type. */
+	flags = type & ~SOCK_TYPE_MASK;
+	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+		return -EINVAL;
+	type &= SOCK_TYPE_MASK;
+
+	/* Translate SOCK_NONBLOCK where it differs from O_NONBLOCK. */
+	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+	err = sock_create(family, type, protocol, &sock);
+	if (err < 0)
+		return err;
+
+	fd = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
+	if (fd < 0)
+		sock_release(sock);
+
+	/* It may be already another descriptor 8) Not kernel problem. */
+	return fd;
+}
+
+/*
+ *	Create a pair of connected sockets.
+ *
+ *	@type may carry SOCK_CLOEXEC / SOCK_NONBLOCK in addition to the socket
+ *	type. On success both descriptors are written to @usockvec[0] and
+ *	@usockvec[1] and 0 is returned; otherwise a negative errno is returned.
+ */
+
+SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
+		int __user *, usockvec)
+{
+	struct socket *sock1, *sock2;
+	int fd1, fd2, err;
+	struct file *newfile1, *newfile2;
+	int flags;
+
+	/* Separate the creation flags from the socket type. */
+	flags = type & ~SOCK_TYPE_MASK;
+	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+		return -EINVAL;
+	type &= SOCK_TYPE_MASK;
+
+	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+	/*
+	 * Obtain the first socket and check if the underlying protocol
+	 * supports the socketpair call.
+	 */
+
+	err = sock_create(family, type, protocol, &sock1);
+	if (err < 0)
+		goto out;
+
+	err = sock_create(family, type, protocol, &sock2);
+	if (err < 0)
+		goto out_release_1;
+
+	err = sock1->ops->socketpair(sock1, sock2);
+	if (err < 0)
+		goto out_release_both;
+
+	fd1 = sock_alloc_file(sock1, &newfile1, flags);
+	if (unlikely(fd1 < 0)) {
+		err = fd1;
+		goto out_release_both;
+	}
+
+	fd2 = sock_alloc_file(sock2, &newfile2, flags);
+	if (unlikely(fd2 < 0)) {
+		err = fd2;
+		/*
+		 * sock1 is torn down through its file (fput) rather than by
+		 * sock_release(); only sock2, which has no file yet, is
+		 * released directly.
+		 */
+		fput(newfile1);
+		put_unused_fd(fd1);
+		sock_release(sock2);
+		goto out;
+	}
+
+	audit_fd_pair(fd1, fd2);
+	fd_install(fd1, newfile1);
+	fd_install(fd2, newfile2);
+	/* fd1 and fd2 may be already another descriptors.
+	 * Not kernel problem.
+	 */
+
+	err = put_user(fd1, &usockvec[0]);
+	if (!err)
+		err = put_user(fd2, &usockvec[1]);
+	if (!err)
+		return 0;
+
+	/*
+	 * The descriptors are already installed, so the only way to undo
+	 * them after a failed copy to user space is to close them.
+	 */
+	sys_close(fd2);
+	sys_close(fd1);
+	return err;
+
+out_release_both:
+	sock_release(sock2);
+out_release_1:
+	sock_release(sock1);
+out:
+	return err;
+}
+
+/*
+ *	bind(2): attach a local address to a socket.
+ *
+ *	The generic layer only copies the address from user space into a
+ *	kernel buffer (which also validates it) and consults the security
+ *	hook; interpreting the address is the protocol's responsibility.
+ */
+
+SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
+{
+	struct sockaddr_storage address;
+	struct sockaddr *kaddr = (struct sockaddr *)&address;
+	struct socket *sock;
+	int err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = move_addr_to_kernel(umyaddr, addrlen, kaddr);
+	if (err < 0)
+		goto out_put;
+
+	err = security_socket_bind(sock, kaddr, addrlen);
+	if (err)
+		goto out_put;
+
+	err = sock->ops->bind(sock, kaddr, addrlen);
+
+out_put:
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	listen(2): let the protocol switch the socket into listening state.
+ *
+ *	The backlog is capped at the per-namespace somaxconn sysctl; because
+ *	the comparison is done unsigned, a negative backlog is capped too.
+ */
+
+SYSCALL_DEFINE2(listen, int, fd, int, backlog)
+{
+	struct socket *sock;
+	int limit;
+	int err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	limit = sock_net(sock->sk)->core.sysctl_somaxconn;
+	if ((unsigned)backlog > limit)
+		backlog = limit;
+
+	err = security_socket_listen(sock, backlog);
+	if (err == 0)
+		err = sock->ops->listen(sock, backlog);
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	For accept, we attempt to create a new socket, set up the link
+ *	with the client, wake up the client, then return the new
+ *	connected fd. We collect the address of the connector in kernel
+ *	space and move it to user at the very end. This is unclean because
+ *	we open the socket then return an error.
+ *
+ *	1003.1g adds the ability to recvmsg() to query connection pending
+ *	status to recvmsg. We need to add that support in a way that's
+ *	clean when we restructure accept also.
+ *
+ *	@flags may contain only SOCK_CLOEXEC and SOCK_NONBLOCK. Returns the
+ *	new descriptor on success or a negative errno.
+ */
+
+SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
+		int __user *, upeer_addrlen, int, flags)
+{
+	struct socket *sock, *newsock;
+	struct file *newfile;
+	int err, len, newfd, fput_needed;
+	struct sockaddr_storage address;
+
+	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+		return -EINVAL;
+
+	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
+		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		goto out;
+
+	err = -ENFILE;
+	newsock = sock_alloc();
+	if (!newsock)
+		goto out_put;
+
+	/* The new socket starts with the listener's type and operations. */
+	newsock->type = sock->type;
+	newsock->ops = sock->ops;
+
+	/*
+	 * We don't need try_module_get here, as the listening socket (sock)
+	 * has the protocol module (sock->ops->owner) held.
+	 */
+	__module_get(newsock->ops->owner);
+
+	newfd = sock_alloc_file(newsock, &newfile, flags);
+	if (unlikely(newfd < 0)) {
+		err = newfd;
+		sock_release(newsock);
+		goto out_put;
+	}
+
+	/*
+	 * From here on failures go through out_fd, which drops newfile with
+	 * fput() instead of calling sock_release(newsock) directly.
+	 */
+	err = security_socket_accept(sock, newsock);
+	if (err)
+		goto out_fd;
+
+	/* The listener's file flags (e.g. O_NONBLOCK) govern the wait. */
+	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
+	if (err < 0)
+		goto out_fd;
+
+	if (upeer_sockaddr) {
+		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
+					  &len, 2) < 0) {
+			err = -ECONNABORTED;
+			goto out_fd;
+		}
+		err = move_addr_to_user((struct sockaddr *)&address,
+					len, upeer_sockaddr, upeer_addrlen);
+		if (err < 0)
+			goto out_fd;
+	}
+
+	/* File flags are not inherited via accept() unlike another OSes. */
+
+	fd_install(newfd, newfile);
+	err = newfd;
+
+out_put:
+	fput_light(sock->file, fput_needed);
+out:
+	return err;
+out_fd:
+	fput(newfile);
+	put_unused_fd(newfd);
+	goto out_put;
+}
+
+/*
+ *	accept(2): identical to accept4() with no flags set.
+ */
+SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
+		int __user *, upeer_addrlen)
+{
+	const int no_flags = 0;
+
+	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, no_flags);
+}
+
+/*
+ *	connect(2): connect a socket to the given server address.
+ *
+ *	The address lives in user space, so it is first copied (and thereby
+ *	checked) into a kernel buffer, then passed to the security hook and
+ *	finally to the protocol together with the file's f_flags.
+ *
+ *	For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
+ *	break bindings.
+ *
+ *	NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
+ *	other SEQPACKET protocols that take time to connect() as it doesn't
+ *	include the -EINPROGRESS status for such sockets.
+ */
+
+SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
+		int, addrlen)
+{
+	struct sockaddr_storage address;
+	struct sockaddr *kaddr = (struct sockaddr *)&address;
+	struct socket *sock;
+	int err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = move_addr_to_kernel(uservaddr, addrlen, kaddr);
+	if (err < 0)
+		goto out_put;
+
+	err = security_socket_connect(sock, kaddr, addrlen);
+	if (!err)
+		err = sock->ops->connect(sock, kaddr, addrlen,
+					 sock->file->f_flags);
+
+out_put:
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	getsockname(2): fetch the socket's local address ('name') from the
+ *	protocol and copy it out to user space.
+ */
+
+SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
+		int __user *, usockaddr_len)
+{
+	struct sockaddr_storage address;
+	struct sockaddr *kaddr = (struct sockaddr *)&address;
+	struct socket *sock;
+	int len, err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = security_socket_getsockname(sock);
+	if (!err)
+		err = sock->ops->getname(sock, kaddr, &len, 0);
+	if (!err)
+		err = move_addr_to_user(kaddr, len, usockaddr, usockaddr_len);
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	getpeername(2): fetch the address ('name') of the socket's remote
+ *	end from the protocol and copy it out to user space.
+ */
+
+SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
+		int __user *, usockaddr_len)
+{
+	struct sockaddr_storage address;
+	struct sockaddr *kaddr = (struct sockaddr *)&address;
+	struct socket *sock;
+	int len, err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = security_socket_getpeername(sock);
+	if (err)
+		goto out_put;
+
+	/* The last argument (1) asks ->getname() for the peer's address. */
+	err = sock->ops->getname(sock, kaddr, &len, 1);
+	if (err)
+		goto out_put;
+
+	err = move_addr_to_user(kaddr, len, usockaddr, usockaddr_len);
+
+out_put:
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	sendto(2): send a datagram, optionally to an explicit address.
+ *
+ *	The destination address, when given, is copied into kernel space
+ *	before the protocol is invoked. @len is silently clamped to INT_MAX.
+ *	A socket whose file is O_NONBLOCK gets MSG_DONTWAIT added.
+ */
+
+SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
+		unsigned, flags, struct sockaddr __user *, addr,
+		int, addr_len)
+{
+	struct sockaddr_storage address;
+	struct msghdr msg;
+	struct iovec iov;
+	struct socket *sock;
+	int err, fput_needed;
+
+	if (len > INT_MAX)
+		len = INT_MAX;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	/* A single-segment message describing the user buffer. */
+	iov.iov_base = buff;
+	iov.iov_len = len;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	/* No ancillary data. */
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+
+	/* No destination unless the caller supplied one. */
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	if (addr) {
+		err = move_addr_to_kernel(addr, addr_len,
+					  (struct sockaddr *)&address);
+		if (err < 0)
+			goto out_put;
+		msg.msg_name = (struct sockaddr *)&address;
+		msg.msg_namelen = addr_len;
+	}
+
+	if (sock->file->f_flags & O_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+	msg.msg_flags = flags;
+
+	err = sock_sendmsg(sock, &msg, len);
+
+out_put:
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	send(2): send a datagram down a socket; sendto() with no address.
+ */
+
+SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
+		unsigned, flags)
+{
+	struct sockaddr __user *no_addr = NULL;
+
+	return sys_sendto(fd, buff, len, flags, no_addr, 0);
+}
+
+/*
+ *	recvfrom(2): receive a frame and optionally report the sender.
+ *
+ *	The sender's address is always collected into a kernel buffer; it is
+ *	copied out to user space only when @addr is non-NULL and the receive
+ *	itself succeeded. @size is silently clamped to INT_MAX.
+ */
+
+SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
+		unsigned, flags, struct sockaddr __user *, addr,
+		int __user *, addr_len)
+{
+	struct sockaddr_storage address;
+	struct msghdr msg;
+	struct iovec iov;
+	struct socket *sock;
+	int err, copy_err;
+	int fput_needed;
+
+	if (size > INT_MAX)
+		size = INT_MAX;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	/* A single-segment message describing the user buffer. */
+	iov.iov_base = ubuf;
+	iov.iov_len = size;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	/* No ancillary data; the sender lands in the on-stack buffer. */
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_name = (struct sockaddr *)&address;
+	msg.msg_namelen = sizeof(address);
+
+	if (sock->file->f_flags & O_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+
+	err = sock_recvmsg(sock, &msg, size, flags);
+	if (err >= 0 && addr != NULL) {
+		copy_err = move_addr_to_user((struct sockaddr *)&address,
+					     msg.msg_namelen, addr, addr_len);
+		if (copy_err < 0)
+			err = copy_err;
+	}
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	recv(2): receive a datagram from a socket; recvfrom() without a
+ *	sender address.
+ *
+ *	Defined through SYSCALL_DEFINE4 like every other entry point in this
+ *	file (compare send()), so the arguments pass through the same
+ *	syscall wrapper machinery instead of a hand-written asmlinkage
+ *	prototype. The resulting symbol is still sys_recv().
+ */
+
+SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
+		unsigned, flags)
+{
+	return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
+}
+
+/*
+ *	setsockopt(2): set a socket option.
+ *
+ *	Option lengths are not known at this level, so the user-space pointer
+ *	is handed to the handler untouched. SOL_SOCKET options are handled by
+ *	sock_setsockopt(); every other level goes to the protocol. A negative
+ *	@optlen is rejected up front.
+ */
+
+SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
+		char __user *, optval, int, optlen)
+{
+	struct socket *sock;
+	int err, fput_needed;
+
+	if (optlen < 0)
+		return -EINVAL;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (sock == NULL)
+		return err;
+
+	err = security_socket_setsockopt(sock, level, optname);
+	if (!err) {
+		if (level == SOL_SOCKET)
+			err = sock_setsockopt(sock, level, optname, optval,
+					      optlen);
+		else
+			err = sock->ops->setsockopt(sock, level, optname,
+						    optval, optlen);
+	}
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	getsockopt(2): read a socket option.
+ *
+ *	Option lengths are not known at this level, so the user-space
+ *	pointers are handed to the handler untouched. SOL_SOCKET options are
+ *	handled by sock_getsockopt(); every other level goes to the protocol.
+ */
+
+SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
+		char __user *, optval, int __user *, optlen)
+{
+	struct socket *sock;
+	int err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (sock == NULL)
+		return err;
+
+	err = security_socket_getsockopt(sock, level, optname);
+	if (!err) {
+		if (level == SOL_SOCKET)
+			err = sock_getsockopt(sock, level, optname, optval,
+					      optlen);
+		else
+			err = sock->ops->getsockopt(sock, level, optname,
+						    optval, optlen);
+	}
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	shutdown(2): shut down a socket, subject to the security hook.
+ */
+
+SYSCALL_DEFINE2(shutdown, int, fd, int, how)
+{
+	struct socket *sock;
+	int err, fput_needed;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (sock == NULL)
+		return err;
+
+	err = security_socket_shutdown(sock, how);
+	if (err == 0)
+		err = sock->ops->shutdown(sock, how);
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/* A couple of helpful macros for getting the address of the 32/64 bit
+ * fields which are the same type (int / unsigned) on our platforms.
+ *
+ * These macros are not self-contained: they read a variable named `flags`
+ * from the expansion site and, via token pasting, a second variable named
+ * <msg>_compat (e.g. `msg_compat` for COMPAT_MSG(msg, ...)). When
+ * MSG_CMSG_COMPAT is set in `flags` the compat structure's member is
+ * addressed, otherwise the native one.
+ */
+#define COMPAT_MSG(msg, member)	((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
+#define COMPAT_NAMELEN(msg)	COMPAT_MSG(msg, msg_namelen)
+#define COMPAT_FLAGS(msg)	COMPAT_MSG(msg, msg_flags)
+
+/*
+ * Destination address of the most recent message that sendmmsg() sent
+ * successfully. __sys_sendmsg() compares the next destination against it
+ * and, on an exact match, sends without asking the LSM again.
+ */
+struct used_address {
+	struct sockaddr_storage name;	/* copy of the last msg_name */
+	unsigned int name_len;		/* its length; __sys_sendmmsg() starts
+					 * it at UINT_MAX so nothing matches */
+};
+
+/*
+ * __sys_sendmsg - common worker for sendmsg() and sendmmsg()
+ * @sock:         socket to send on
+ * @msg:          user-space message header (native or compat layout,
+ *                selected by MSG_CMSG_COMPAT in @flags)
+ * @msg_sys:      kernel copy of the header, filled in here
+ * @flags:        send flags
+ * @used_address: NULL for sendmsg(); for sendmmsg() the cache of the last
+ *                destination that was sent to successfully
+ *
+ * Returns the result of the send (>= 0) or a negative errno.
+ */
+static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned flags,
+			 struct used_address *used_address)
+{
+	struct compat_msghdr __user *msg_compat =
+	    (struct compat_msghdr __user *)msg;
+	struct sockaddr_storage address;
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+	unsigned char ctl[sizeof(struct cmsghdr) + 20]
+	    __attribute__ ((aligned(sizeof(__kernel_size_t))));
+	/* 20 is size of ipv6_pktinfo */
+	unsigned char *ctl_buf = ctl;
+	int err, ctl_len, iov_size, total_len;
+
+	err = -EFAULT;
+	if (MSG_CMSG_COMPAT & flags) {
+		if (get_compat_msghdr(msg_sys, msg_compat))
+			return -EFAULT;
+	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
+		return -EFAULT;
+
+	/* do not move before msg_sys is valid */
+	err = -EMSGSIZE;
+	if (msg_sys->msg_iovlen > UIO_MAXIOV)
+		goto out;
+
+	/* Check whether to allocate the iovec area */
+	/* Up to UIO_FASTIOV segments fit in the on-stack array. */
+	err = -ENOMEM;
+	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
+		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
+		if (!iov)
+			goto out;
+	}
+
+	/* This will also move the address data into kernel space */
+	if (MSG_CMSG_COMPAT & flags) {
+		err = verify_compat_iovec(msg_sys, iov,
+					  (struct sockaddr *)&address,
+					  VERIFY_READ);
+	} else
+		err = verify_iovec(msg_sys, iov,
+				   (struct sockaddr *)&address,
+				   VERIFY_READ);
+	if (err < 0)
+		goto out_freeiov;
+	/* A non-negative result is the total payload length. */
+	total_len = err;
+
+	err = -ENOBUFS;
+
+	/* ctl_len is an int, so larger control lengths cannot be held. */
+	if (msg_sys->msg_controllen > INT_MAX)
+		goto out_freeiov;
+	ctl_len = msg_sys->msg_controllen;
+	if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
+		err =
+		    cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
+						     sizeof(ctl));
+		if (err)
+			goto out_freeiov;
+		/* Pick up whatever buffer/length the conversion left in msg_sys. */
+		ctl_buf = msg_sys->msg_control;
+		ctl_len = msg_sys->msg_controllen;
+	} else if (ctl_len) {
+		/* Small control blocks use the stack buffer, larger ones are allocated. */
+		if (ctl_len > sizeof(ctl)) {
+			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
+			if (ctl_buf == NULL)
+				goto out_freeiov;
+		}
+		err = -EFAULT;
+		/*
+		 * Careful! Before this, msg_sys->msg_control contains a user pointer.
+		 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
+		 * checking falls down on this.
+		 */
+		if (copy_from_user(ctl_buf,
+				   (void __user __force *)msg_sys->msg_control,
+				   ctl_len))
+			goto out_freectl;
+		msg_sys->msg_control = ctl_buf;
+	}
+	msg_sys->msg_flags = flags;
+
+	if (sock->file->f_flags & O_NONBLOCK)
+		msg_sys->msg_flags |= MSG_DONTWAIT;
+	/*
+	 * If this is sendmmsg() and current destination address is same as
+	 * previously succeeded address, omit asking LSM's decision.
+	 * used_address->name_len is initialized to UINT_MAX so that the first
+	 * destination address never matches.
+	 */
+	if (used_address && msg_sys->msg_name &&
+	    used_address->name_len == msg_sys->msg_namelen &&
+	    !memcmp(&used_address->name, msg_sys->msg_name,
+		    used_address->name_len)) {
+		err = sock_sendmsg_nosec(sock, msg_sys, total_len);
+		goto out_freectl;
+	}
+	err = sock_sendmsg(sock, msg_sys, total_len);
+	/*
+	 * If this is sendmmsg() and sending to current destination address was
+	 * successful, remember it.
+	 */
+	if (used_address && err >= 0) {
+		used_address->name_len = msg_sys->msg_namelen;
+		if (msg_sys->msg_name)
+			memcpy(&used_address->name, msg_sys->msg_name,
+			       used_address->name_len);
+	}
+
+out_freectl:
+	/* Only free buffers that did not come from the stack. */
+	if (ctl_buf != ctl)
+		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
+out_freeiov:
+	if (iov != iovstack)
+		sock_kfree_s(sock->sk, iov, iov_size);
+out:
+	return err;
+}
+
+/*
+ *	BSD sendmsg interface: look up the socket and hand the user-space
+ *	header to __sys_sendmsg() (no destination cache).
+ */
+
+SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
+{
+	struct msghdr msg_sys;
+	struct socket *sock;
+	int fput_needed, err;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *	Linux sendmmsg interface
+ *
+ *	Sends up to @vlen messages from the @mmsg array on socket @fd, writing
+ *	each message's send result into its msg_len field. @vlen is capped at
+ *	UIO_MAXIOV. Returns the number of messages sent; an error is returned
+ *	only when not even the first message could be sent.
+ */
+
+int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+		   unsigned int flags)
+{
+	int fput_needed, err, datagrams;
+	struct socket *sock;
+	struct mmsghdr __user *entry;
+	struct compat_mmsghdr __user *compat_entry;
+	struct msghdr msg_sys;
+	struct used_address used_address;
+
+	if (vlen > UIO_MAXIOV)
+		vlen = UIO_MAXIOV;
+
+	datagrams = 0;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	/* UINT_MAX can never equal a real address length: cache starts empty. */
+	used_address.name_len = UINT_MAX;
+	/* The same array is walked with either the native or compat stride. */
+	entry = mmsg;
+	compat_entry = (struct compat_mmsghdr __user *)mmsg;
+	err = 0;
+
+	while (datagrams < vlen) {
+		if (MSG_CMSG_COMPAT & flags) {
+			err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry,
+					    &msg_sys, flags, &used_address);
+			if (err < 0)
+				break;
+			err = __put_user(err, &compat_entry->msg_len);
+			++compat_entry;
+		} else {
+			err = __sys_sendmsg(sock, (struct msghdr __user *)entry,
+					    &msg_sys, flags, &used_address);
+			if (err < 0)
+				break;
+			err = put_user(err, &entry->msg_len);
+			++entry;
+		}
+
+		/* A failed msg_len write-back also stops the batch. */
+		if (err)
+			break;
+		++datagrams;
+	}
+
+	fput_light(sock->file, fput_needed);
+
+	/* We only return an error if no datagrams were able to be sent */
+	if (datagrams != 0)
+		return datagrams;
+
+	return err;
+}
+
+/*
+ *	sendmmsg(2): syscall entry point, a direct call into __sys_sendmmsg().
+ */
+SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags)
+{
+	int sent = __sys_sendmmsg(fd, mmsg, vlen, flags);
+
+	return sent;
+}
+
+/*
+ * __sys_recvmsg - common worker for recvmsg() and recvmmsg()
+ * @sock:    socket to receive from
+ * @msg:     user-space message header (native or compat layout, selected
+ *           by MSG_CMSG_COMPAT in @flags)
+ * @msg_sys: kernel copy of the header, filled in here
+ * @flags:   receive flags
+ * @nosec:   non-zero to receive via sock_recvmsg_nosec() instead of
+ *           sock_recvmsg()
+ *
+ * On success returns the receive result and writes the source address (if
+ * requested), msg_flags and the consumed control length back to @msg.
+ * Returns a negative errno on failure.
+ */
+static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned flags, int nosec)
+{
+	struct compat_msghdr __user *msg_compat =
+	    (struct compat_msghdr __user *)msg;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	unsigned long cmsg_ptr;
+	int err, iov_size, total_len, len;
+
+	/* kernel mode address */
+	struct sockaddr_storage addr;
+
+	/* user mode address pointers */
+	struct sockaddr __user *uaddr;
+	int __user *uaddr_len;
+
+	if (MSG_CMSG_COMPAT & flags) {
+		if (get_compat_msghdr(msg_sys, msg_compat))
+			return -EFAULT;
+	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
+		return -EFAULT;
+
+	err = -EMSGSIZE;
+	if (msg_sys->msg_iovlen > UIO_MAXIOV)
+		goto out;
+
+	/* Check whether to allocate the iovec area */
+	/* Up to UIO_FASTIOV segments fit in the on-stack array. */
+	err = -ENOMEM;
+	iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
+	if (msg_sys->msg_iovlen > UIO_FASTIOV) {
+		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
+		if (!iov)
+			goto out;
+	}
+
+	/*
+	 *      Save the user-mode address (verify_iovec will change the
+	 *      kernel msghdr to use the kernel address space)
+	 */
+
+	uaddr = (__force void __user *)msg_sys->msg_name;
+	uaddr_len = COMPAT_NAMELEN(msg);
+	if (MSG_CMSG_COMPAT & flags) {
+		err = verify_compat_iovec(msg_sys, iov,
+					  (struct sockaddr *)&addr,
+					  VERIFY_WRITE);
+	} else
+		err = verify_iovec(msg_sys, iov,
+				   (struct sockaddr *)&addr,
+				   VERIFY_WRITE);
+	if (err < 0)
+		goto out_freeiov;
+	/* A non-negative result is the total buffer length. */
+	total_len = err;
+
+	/*
+	 * Remember where the control buffer started; the distance msg_control
+	 * has moved by the end is reported back as msg_controllen.
+	 */
+	cmsg_ptr = (unsigned long)msg_sys->msg_control;
+	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+
+	if (sock->file->f_flags & O_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
+							  total_len, flags);
+	if (err < 0)
+		goto out_freeiov;
+	len = err;
+
+	/* Copy the source address out only if the caller supplied a buffer. */
+	if (uaddr != NULL) {
+		err = move_addr_to_user((struct sockaddr *)&addr,
+					msg_sys->msg_namelen, uaddr,
+					uaddr_len);
+		if (err < 0)
+			goto out_freeiov;
+	}
+	/* MSG_CMSG_COMPAT is internal and is masked out of the reported flags. */
+	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
+			 COMPAT_FLAGS(msg));
+	if (err)
+		goto out_freeiov;
+	if (MSG_CMSG_COMPAT & flags)
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
+				 &msg_compat->msg_controllen);
+	else
+		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
+				 &msg->msg_controllen);
+	if (err)
+		goto out_freeiov;
+	err = len;
+
+out_freeiov:
+	/* Only free the iovec array if it did not come from the stack. */
+	if (iov != iovstack)
+		sock_kfree_s(sock->sk, iov, iov_size);
+out:
+	return err;
+}
+
+/*
+ *	BSD recvmsg interface: look up the socket and hand the user-space
+ *	header to __sys_recvmsg() with the security check enabled (nosec 0).
+ */
+
+SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
+		unsigned int, flags)
+{
+	struct msghdr msg_sys;
+	struct socket *sock;
+	int fput_needed, err;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+
+	fput_light(sock->file, fput_needed);
+	return err;
+}
+
+/*
+ *     Linux recvmmsg interface
+ *
+ *     Receives up to @vlen datagrams on socket @fd into the @mmsg array,
+ *     writing each datagram's length into its msg_len field.
+ *
+ *     @timeout, when non-NULL, is a kernel-space relative timeout; after
+ *     each received datagram it is rewritten with the time remaining and
+ *     the loop stops once that reaches zero.
+ *
+ *     Returns the number of datagrams received. A negative errno is
+ *     returned only if nothing was received; an error that follows at
+ *     least one datagram is stored in sk_err (unless it is -EAGAIN) so it
+ *     is reported by the next call or by getsockopt(SO_ERROR).
+ *
+ *     The reference on the socket's file is dropped only after the last
+ *     access to @sock, so the sk_err store cannot touch a freed socket.
+ */
+
+int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
+		   unsigned int flags, struct timespec *timeout)
+{
+	int fput_needed, err, datagrams;
+	struct socket *sock;
+	struct mmsghdr __user *entry;
+	struct compat_mmsghdr __user *compat_entry;
+	struct msghdr msg_sys;
+	struct timespec end_time;
+
+	if (timeout &&
+	    poll_select_set_timeout(&end_time, timeout->tv_sec,
+				    timeout->tv_nsec))
+		return -EINVAL;
+
+	datagrams = 0;
+
+	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	if (!sock)
+		return err;
+
+	/* Report an error left pending on the socket before receiving. */
+	err = sock_error(sock->sk);
+	if (err) {
+		datagrams = err;
+		goto out_put;
+	}
+
+	/* The same array is walked with either the native or compat stride. */
+	entry = mmsg;
+	compat_entry = (struct compat_mmsghdr __user *)mmsg;
+
+	while (datagrams < vlen) {
+		/*
+		 * No need to ask LSM for more than the first datagram.
+		 */
+		if (MSG_CMSG_COMPAT & flags) {
+			err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry,
+					    &msg_sys, flags & ~MSG_WAITFORONE,
+					    datagrams);
+			if (err < 0)
+				break;
+			err = __put_user(err, &compat_entry->msg_len);
+			++compat_entry;
+		} else {
+			err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
+					    &msg_sys, flags & ~MSG_WAITFORONE,
+					    datagrams);
+			if (err < 0)
+				break;
+			err = put_user(err, &entry->msg_len);
+			++entry;
+		}
+
+		if (err)
+			break;
+		++datagrams;
+
+		/* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
+		if (flags & MSG_WAITFORONE)
+			flags |= MSG_DONTWAIT;
+
+		if (timeout) {
+			ktime_get_ts(timeout);
+			*timeout = timespec_sub(end_time, *timeout);
+			if (timeout->tv_sec < 0) {
+				timeout->tv_sec = timeout->tv_nsec = 0;
+				break;
+			}
+
+			/* Timeout, return less than vlen datagrams */
+			if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
+				break;
+		}
+
+		/* Out of band data, return right away */
+		if (msg_sys.msg_flags & MSG_OOB)
+			break;
+	}
+
+	if (err == 0)
+		goto out_put;
+
+	/* Nothing received at all: the error itself is the result. */
+	if (datagrams == 0) {
+		datagrams = err;
+		goto out_put;
+	}
+
+	/*
+	 * We may return less entries than requested (vlen) if the
+	 * sock is non block and there aren't enough datagrams...
+	 */
+	if (err != -EAGAIN) {
+		/*
+		 * ... or  if recvmsg returns an error after we
+		 * received some datagrams, where we record the
+		 * error to return on the next call or if the
+		 * app asks about it using getsockopt(SO_ERROR).
+		 */
+		sock->sk->sk_err = -err;
+	}
+
+out_put:
+	/* Must stay after the last use of sock (see sk_err store above). */
+	fput_light(sock->file, fput_needed);
+
+	return datagrams;
+}
+
+/*
+ *	recvmmsg(2): syscall entry point.
+ *
+ *	Copies the optional timeout in from user space, calls
+ *	__sys_recvmmsg(), and - only when at least one datagram was
+ *	received - copies the updated timeout back out.
+ */
+SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
+		unsigned int, vlen, unsigned int, flags,
+		struct timespec __user *, timeout)
+{
+	struct timespec timeout_sys;
+	int datagrams;
+
+	if (!timeout)
+		return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
+
+	if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+		return -EFAULT;
+
+	datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
+	if (datagrams <= 0)
+		return datagrams;
+
+	if (copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+		return -EFAULT;
+
+	return datagrams;
+}
+
+#ifdef __ARCH_WANT_SYS_SOCKETCALL
+/* Argument list sizes for sys_socketcall */
+/*
+ * nargs[call] is the number of bytes of arguments sys_socketcall() copies
+ * from user space for that call number: AL(n) is n arguments, each the
+ * size of an unsigned long. Entry 0 is unused (call numbers start at 1);
+ * the 21 entries cover call numbers 0..20.
+ */
+#define AL(x) ((x) * sizeof(unsigned long))
+static const unsigned char nargs[21] = {
+	AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+	AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+	AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+	AL(4), AL(5), AL(4)
+};
+
+#undef AL
+
+/*
+ *	System call vectors.
+ *
+ *	Argument checking cleaned up. Saved 20% in size.
+ *  This function doesn't need to set the kernel lock because
+ *  it is set by the callees.
+ *
+ *	@call selects the socket operation (1 .. SYS_SENDMMSG) and @args
+ *	points to a user-space array of unsigned longs holding that
+ *	operation's arguments. The arguments are copied in once, reported to
+ *	audit, and forwarded to the matching sys_*() function. Returns that
+ *	function's result, -EINVAL for an unknown call or -EFAULT if the
+ *	arguments cannot be read.
+ */
+
+SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
+{
+	unsigned long a[6];
+	unsigned long a0, a1;
+	int err;
+	unsigned int len;
+
+	if (call < 1 || call > SYS_SENDMMSG)
+		return -EINVAL;
+
+	/* Byte count to copy; guard against a table entry larger than a[]. */
+	len = nargs[call];
+	if (len > sizeof(a))
+		return -EINVAL;
+
+	/* copy_from_user should be SMP safe. */
+	if (copy_from_user(a, args, len))
+		return -EFAULT;
+
+	audit_socketcall(nargs[call] / sizeof(unsigned long), a);
+
+	a0 = a[0];
+	a1 = a[1];
+
+	switch (call) {
+	case SYS_SOCKET:
+		err = sys_socket(a0, a1, a[2]);
+		break;
+	case SYS_BIND:
+		err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
+		break;
+	case SYS_CONNECT:
+		err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
+		break;
+	case SYS_LISTEN:
+		err = sys_listen(a0, a1);
+		break;
+	case SYS_ACCEPT:
+		/* Plain accept is accept4 with no flags. */
+		err = sys_accept4(a0, (struct sockaddr __user *)a1,
+				  (int __user *)a[2], 0);
+		break;
+	case SYS_GETSOCKNAME:
+		err =
+		    sys_getsockname(a0, (struct sockaddr __user *)a1,
+				    (int __user *)a[2]);
+		break;
+	case SYS_GETPEERNAME:
+		err =
+		    sys_getpeername(a0, (struct sockaddr __user *)a1,
+				    (int __user *)a[2]);
+		break;
+	case SYS_SOCKETPAIR:
+		err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
+		break;
+	case SYS_SEND:
+		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
+		break;
+	case SYS_SENDTO:
+		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
+				 (struct sockaddr __user *)a[4], a[5]);
+		break;
+	case SYS_RECV:
+		err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
+		break;
+	case SYS_RECVFROM:
+		err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
+				   (struct sockaddr __user *)a[4],
+				   (int __user *)a[5]);
+		break;
+	case SYS_SHUTDOWN:
+		err = sys_shutdown(a0, a1);
+		break;
+	case SYS_SETSOCKOPT:
+		err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
+		break;
+	case SYS_GETSOCKOPT:
+		err =
+		    sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
+				   (int __user *)a[4]);
+		break;
+	case SYS_SENDMSG:
+		err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
+		break;
+	case SYS_SENDMMSG:
+		err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
+		break;
+	case SYS_RECVMSG:
+		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
+		break;
+	case SYS_RECVMMSG:
+		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
+				   (struct timespec __user *)a[4]);
+		break;
+	case SYS_ACCEPT4:
+		err = sys_accept4(a0, (struct sockaddr __user *)a1,
+				  (int __user *)a[2], a[3]);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+#endif				/* __ARCH_WANT_SYS_SOCKETCALL */
+
+/**
+ *	sock_register - add a socket protocol handler
+ *	@ops: description of protocol
+ *
+ *	This function is called by a protocol handler that wants to
+ *	advertise its address family, and have it linked into the
+ *	socket interface. The value ops->family corresponds to the
+ *	socket system call protocol family.
+ *
+ *	Returns 0 on success, -ENOBUFS if ops->family is outside
+ *	[0, NPROTO), or -EEXIST if a handler is already registered for
+ *	that family.
+ */
+int sock_register(const struct net_proto_family *ops)
+{
+	int err;
+
+	/*
+	 * The family indexes net_families[], so reject negative values as
+	 * well as too-large ones (sock_unregister() checks both bounds too).
+	 */
+	if (ops->family < 0 || ops->family >= NPROTO) {
+		printk(KERN_CRIT "protocol %d outside [0, NPROTO(%d))\n",
+		       ops->family, NPROTO);
+		return -ENOBUFS;
+	}
+
+	spin_lock(&net_family_lock);
+	if (rcu_dereference_protected(net_families[ops->family],
+				      lockdep_is_held(&net_family_lock)))
+		err = -EEXIST;
+	else {
+		rcu_assign_pointer(net_families[ops->family], ops);
+		err = 0;
+	}
+	spin_unlock(&net_family_lock);
+
+	/* Only announce a registration that actually happened. */
+	if (!err)
+		printk(KERN_INFO "NET: Registered protocol family %d\n",
+		       ops->family);
+	return err;
+}
+EXPORT_SYMBOL(sock_register);
+
+/**
+ *	sock_unregister - remove a protocol handler
+ *	@family: protocol family to remove
+ *
+ *	This function is called by a protocol handler that wants to
+ *	remove its address family, and have it unlinked from the
+ *	new socket creation.
+ *
+ *	If protocol handler is a module, then it can use module reference
+ *	counts to protect against new references. If protocol handler is not
+ *	a module then it needs to provide its own protection in
+ *	the ops->create routine.
+ *
+ *	An out-of-range @family triggers BUG_ON(). The function calls
+ *	synchronize_rcu() before returning.
+ */
+void sock_unregister(int family)
+{
+	BUG_ON(family < 0 || family >= NPROTO);
+
+	/* Unpublish the handler so new lookups see NULL. */
+	spin_lock(&net_family_lock);
+	rcu_assign_pointer(net_families[family], NULL);
+	spin_unlock(&net_family_lock);
+
+	/*
+	 * Wait for RCU readers that may still hold the old pointer (e.g. the
+	 * rcu_read_lock() section in __sock_create()) before returning.
+	 */
+	synchronize_rcu();
+
+	printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
+}
+EXPORT_SYMBOL(sock_unregister);
+
+/*
+ * Early socket-layer setup: create the sock and skbuff slab caches and
+ * the inode cache, then register and mount the socket filesystem.
+ * The real protocol initialization is performed in later initcalls.
+ */
+static int __init sock_init(void)
+{
+	int err;
+
+	/* sock SLAB cache, skbuff SLAB cache, then the inode cache */
+	sk_init();
+	skb_init();
+	init_inodecache();
+
+	err = register_filesystem(&sock_fs_type);
+	if (err)
+		return err;
+
+	sock_mnt = kern_mount(&sock_fs_type);
+	if (IS_ERR(sock_mnt)) {
+		/* Mount failed: take the filesystem type back out. */
+		err = PTR_ERR(sock_mnt);
+		unregister_filesystem(&sock_fs_type);
+		return err;
+	}
+
+#ifdef CONFIG_NETFILTER
+	netfilter_init();
+#endif
+
+#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
+	skb_timestamping_init();
+#endif
+
+	return 0;
+}
+
+core_initcall(sock_init);	/* early initcall */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Print the number of sockets in use, summed over every possible CPU,
+ * as a "sockets: used N" line in @seq.
+ */
+void socket_seq_show(struct seq_file *seq)
+{
+	int total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		total += per_cpu(sockets_in_use, cpu);
+	}
+
+	/* The sum may come out negative; report zero in that case. */
+	seq_printf(seq, "sockets: used %d\n", total < 0 ? 0 : total);
+}
+#endif				/* CONFIG_PROC_FS */
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat handler for a timeval timestamp ioctl: run the native ioctl
+ * into a kernel struct timeval, then store the two fields into the
+ * caller's struct compat_timeval.
+ */
+static int do_siocgstamp(struct net *net, struct socket *sock,
+			 unsigned int cmd, struct compat_timeval __user *up)
+{
+	mm_segment_t saved_fs;
+	struct timeval ktv;
+	int err;
+
+	/* The ioctl writes through a "user" pointer; let it hit &ktv. */
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
+	set_fs(saved_fs);
+	if (err)
+		return err;
+
+	err = put_user(ktv.tv_sec, &up->tv_sec);
+	err |= __put_user(ktv.tv_usec, &up->tv_usec);
+	return err;
+}
+
+/*
+ * Compat handler for a timespec timestamp ioctl: run the native ioctl
+ * into a kernel struct timespec, then store the two fields into the
+ * caller's struct compat_timespec.
+ */
+static int do_siocgstampns(struct net *net, struct socket *sock,
+			 unsigned int cmd, struct compat_timespec __user *up)
+{
+	mm_segment_t saved_fs;
+	struct timespec kts;
+	int err;
+
+	/* The ioctl writes through a "user" pointer; let it hit &kts. */
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
+	set_fs(saved_fs);
+	if (err)
+		return err;
+
+	err = put_user(kts.tv_sec, &up->tv_sec);
+	err |= __put_user(kts.tv_nsec, &up->tv_nsec);
+	return err;
+}
+
+/*
+ * Compat SIOCGIFNAME: copy the 32-bit ifreq into a native-sized ifreq
+ * staged in compat user space, run the native ioctl on it, and copy the
+ * compat-sized result back to the caller.
+ */
+static int dev_ifname32(struct net *net, struct compat_ifreq __user *uifr32)
+{
+	struct ifreq __user *native;
+	int err;
+
+	native = compat_alloc_user_space(sizeof(struct ifreq));
+	if (copy_in_user(native, uifr32, sizeof(struct compat_ifreq)))
+		return -EFAULT;
+
+	err = dev_ioctl(net, SIOCGIFNAME, native);
+	if (!err && copy_in_user(uifr32, native, sizeof(struct compat_ifreq)))
+		err = -EFAULT;
+
+	return err;
+}
+
+/*
+ * Compat SIOCGIFCONF: build a native struct ifconf (plus its ifreq array)
+ * in compat user space from the caller's struct compat_ifconf, run the
+ * native ioctl on it, then copy the entries and the resulting length back
+ * in the 32-bit layout.
+ *
+ * Returns 0 on success, -EFAULT if any user copy fails, or the error
+ * returned by dev_ioctl().
+ */
+static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
+{
+	struct compat_ifconf ifc32;
+	struct ifconf ifc;
+	struct ifconf __user *uifc;
+	struct compat_ifreq __user *ifr32;
+	struct ifreq __user *ifr;
+	unsigned int i, j;
+	int err;
+
+	if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
+		return -EFAULT;
+
+	if (ifc32.ifcbuf == 0) {
+		/* No buffer supplied: pass a zero-length, NULL request down. */
+		ifc32.ifc_len = 0;
+		ifc.ifc_len = 0;
+		ifc.ifc_req = NULL;
+		uifc = compat_alloc_user_space(sizeof(struct ifconf));
+	} else {
+		/* One native ifreq per compat ifreq that fits, plus one. */
+		size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) *
+			sizeof(struct ifreq);
+		uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
+		ifc.ifc_len = len;
+		/* The native ifreq array sits right behind the staged ifconf. */
+		ifr = ifc.ifc_req = (void __user *)(uifc + 1);
+		ifr32 = compat_ptr(ifc32.ifcbuf);
+		for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) {
+			if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq)))
+				return -EFAULT;
+			ifr++;
+			ifr32++;
+		}
+	}
+	if (copy_to_user(uifc, &ifc, sizeof(struct ifconf)))
+		return -EFAULT;
+
+	err = dev_ioctl(net, SIOCGIFCONF, uifc);
+	if (err)
+		return err;
+
+	if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+		return -EFAULT;
+
+	/*
+	 * Copy results back: i counts bytes in the compat buffer, j counts
+	 * bytes in the native buffer; stop when either one is exhausted.
+	 */
+	ifr = ifc.ifc_req;
+	ifr32 = compat_ptr(ifc32.ifcbuf);
+	for (i = 0, j = 0;
+	     i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len;
+	     i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) {
+		if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq)))
+			return -EFAULT;
+		ifr32++;
+		ifr++;
+	}
+
+	if (ifc32.ifcbuf == 0) {
+		/* Translate from 64-bit structure multiple to
+		 * a 32-bit one.
+		 */
+		i = ifc.ifc_len;
+		i = ((i / sizeof(struct ifreq)) * sizeof(struct compat_ifreq));
+		ifc32.ifc_len = i;
+	} else {
+		/* Report the number of compat bytes actually written. */
+		ifc32.ifc_len = i;
+	}
+	if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Compat SIOCETHTOOL: build a native ifreq in compat user space and run
+ * the native ioctl on it.  For the rxnfc commands listed in the switch
+ * below, the struct ethtool_rxnfc argument is additionally converted
+ * between the compat and native layouts (on the way in, and for the
+ * "get" commands also on the way out); every other command passes the
+ * caller's data pointer through unchanged.
+ *
+ * Returns 0 on success, -EFAULT on a failed user access, -ENOMEM if the
+ * requested rule count is too large, or the error from dev_ioctl().
+ */
+static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
+{
+	struct compat_ethtool_rxnfc __user *compat_rxnfc;
+	bool convert_in = false, convert_out = false;
+	size_t buf_size = ALIGN(sizeof(struct ifreq), 8);
+	struct ethtool_rxnfc __user *rxnfc;
+	struct ifreq __user *ifr;
+	u32 rule_cnt = 0, actual_rule_cnt;
+	u32 ethcmd;
+	u32 data;
+	int ret;
+
+	if (get_user(data, &ifr32->ifr_ifru.ifru_data))
+		return -EFAULT;
+
+	compat_rxnfc = compat_ptr(data);
+
+	if (get_user(ethcmd, &compat_rxnfc->cmd))
+		return -EFAULT;
+
+	/* Most ethtool structures are defined without padding.
+	 * Unfortunately struct ethtool_rxnfc is an exception.
+	 */
+	switch (ethcmd) {
+	default:
+		break;
+	case ETHTOOL_GRXCLSRLALL:
+		/* Buffer size is variable */
+		if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
+			return -EFAULT;
+		if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
+			return -ENOMEM;
+		buf_size += rule_cnt * sizeof(u32);
+		/* fall through */
+	case ETHTOOL_GRXRINGS:
+	case ETHTOOL_GRXCLSRLCNT:
+	case ETHTOOL_GRXCLSRULE:
+		convert_out = true;
+		/* fall through */
+	case ETHTOOL_SRXCLSRLDEL:
+	case ETHTOOL_SRXCLSRLINS:
+		buf_size += sizeof(struct ethtool_rxnfc);
+		convert_in = true;
+		break;
+	}
+
+	/* Staging area: the ifreq first, the native rxnfc 8-byte aligned after it. */
+	ifr = compat_alloc_user_space(buf_size);
+	rxnfc = (void *)ifr + ALIGN(sizeof(struct ifreq), 8);
+
+	if (copy_in_user(&ifr->ifr_name, &ifr32->ifr_name, IFNAMSIZ))
+		return -EFAULT;
+
+	/* Point the native ifreq at the staged copy only when converting. */
+	if (put_user(convert_in ? rxnfc : compat_ptr(data),
+		     &ifr->ifr_ifru.ifru_data))
+		return -EFAULT;
+
+	if (convert_in) {
+		/* We expect there to be holes between fs.m_ext and
+		 * fs.ring_cookie and at the end of fs, but nowhere else.
+		 */
+		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+			     sizeof(compat_rxnfc->fs.m_ext) !=
+			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
+			     sizeof(rxnfc->fs.m_ext));
+		BUILD_BUG_ON(
+			offsetof(struct compat_ethtool_rxnfc, fs.location) -
+			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
+			offsetof(struct ethtool_rxnfc, fs.location) -
+			offsetof(struct ethtool_rxnfc, fs.ring_cookie));
+
+		/*
+		 * Three pieces: start of struct through fs.m_ext,
+		 * fs.ring_cookie through fs.location, then rule_cnt.
+		 */
+		if (copy_in_user(rxnfc, compat_rxnfc,
+				 (void *)(&rxnfc->fs.m_ext + 1) -
+				 (void *)rxnfc) ||
+		    copy_in_user(&rxnfc->fs.ring_cookie,
+				 &compat_rxnfc->fs.ring_cookie,
+				 (void *)(&rxnfc->fs.location + 1) -
+				 (void *)&rxnfc->fs.ring_cookie) ||
+		    copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt,
+				 sizeof(rxnfc->rule_cnt)))
+			return -EFAULT;
+	}
+
+	ret = dev_ioctl(net, SIOCETHTOOL, ifr);
+	if (ret)
+		return ret;
+
+	if (convert_out) {
+		/* Same three pieces as above, in the opposite direction. */
+		if (copy_in_user(compat_rxnfc, rxnfc,
+				 (const void *)(&rxnfc->fs.m_ext + 1) -
+				 (const void *)rxnfc) ||
+		    copy_in_user(&compat_rxnfc->fs.ring_cookie,
+				 &rxnfc->fs.ring_cookie,
+				 (const void *)(&rxnfc->fs.location + 1) -
+				 (const void *)&rxnfc->fs.ring_cookie) ||
+		    copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
+				 sizeof(rxnfc->rule_cnt)))
+			return -EFAULT;
+
+		if (ethcmd == ETHTOOL_GRXCLSRLALL) {
+			/* As an optimisation, we only copy the actual
+			 * number of rules that the underlying
+			 * function returned.  Since Mallory might
+			 * change the rule count in user memory, we
+			 * check that it is less than the rule count
+			 * originally given (as the user buffer size),
+			 * which has been range-checked.
+			 */
+			if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
+				return -EFAULT;
+			if (actual_rule_cnt < rule_cnt)
+				rule_cnt = actual_rule_cnt;
+			if (copy_in_user(&compat_rxnfc->rule_locs[0],
+					 &rxnfc->rule_locs[0],
+					 rule_cnt * sizeof(u32)))
+				return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Compat SIOCWANDEV: copy the 32-bit ifreq into a native ifreq staged in
+ * compat user space, widen the embedded settings pointer, and run the
+ * native ioctl on the staged copy.
+ */
+static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
+{
+	struct ifreq __user *native;
+	compat_uptr_t ptr32;
+	void __user *ptr;
+
+	native = compat_alloc_user_space(sizeof(*native));
+	if (copy_in_user(native, uifr32, sizeof(struct compat_ifreq)) ||
+	    get_user(ptr32, &uifr32->ifr_settings.ifs_ifsu))
+		return -EFAULT;
+
+	/* Replace the copied 32-bit pointer with its native-width form. */
+	ptr = compat_ptr(ptr32);
+	if (put_user(ptr, &native->ifr_settings.ifs_ifsu.raw_hdlc))
+		return -EFAULT;
+
+	return dev_ioctl(net, SIOCWANDEV, native);
+}
+
+/*
+ * Compat handler for the bonding ioctls.  The enslave/release/hwaddr/
+ * change-active commands are run on a kernel copy of the ifreq; the two
+ * query commands carry a data pointer, which is widened into a native
+ * ifreq staged in compat user space.  Other commands get -EINVAL.
+ */
+static int bond_ioctl(struct net *net, unsigned int cmd,
+			 struct compat_ifreq __user *ifr32)
+{
+	switch (cmd) {
+	case SIOCBONDENSLAVE:
+	case SIOCBONDRELEASE:
+	case SIOCBONDSETHWADDR:
+	case SIOCBONDCHANGEACTIVE: {
+		struct ifreq kifr;
+		mm_segment_t saved_fs;
+		int err;
+
+		if (copy_from_user(&kifr, ifr32, sizeof(struct compat_ifreq)))
+			return -EFAULT;
+
+		/* dev_ioctl() expects a user pointer; hand it the kernel copy. */
+		saved_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = dev_ioctl(net, cmd,
+				(struct ifreq __user __force *) &kifr);
+		set_fs(saved_fs);
+
+		return err;
+	}
+	case SIOCBONDSLAVEINFOQUERY:
+	case SIOCBONDINFOQUERY: {
+		struct ifreq __user *native;
+		void __user *datap;
+		u32 data;
+
+		native = compat_alloc_user_space(sizeof(*native));
+		if (copy_in_user(&native->ifr_name, &ifr32->ifr_name, IFNAMSIZ) ||
+		    get_user(data, &ifr32->ifr_ifru.ifru_data))
+			return -EFAULT;
+
+		datap = compat_ptr(data);
+		if (put_user(datap, &native->ifr_ifru.ifru_data))
+			return -EFAULT;
+
+		return dev_ioctl(net, cmd, native);
+	}
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Compat handler for the device-private ioctl range: rebuild the request
+ * as a native ifreq in compat user space, carrying over only the
+ * interface name and the (widened) data pointer, then run the native
+ * ioctl on it.
+ */
+static int siocdevprivate_ioctl(struct net *net, unsigned int cmd,
+				 struct compat_ifreq __user *u_ifreq32)
+{
+	struct ifreq __user *native;
+	char name[IFNAMSIZ];
+	void __user *data64;
+	u32 data32;
+
+	/* Pull the two fields we need out of the 32-bit request. */
+	if (copy_from_user(name, u_ifreq32->ifr_ifrn.ifrn_name, IFNAMSIZ))
+		return -EFAULT;
+	if (__get_user(data32, &u_ifreq32->ifr_ifru.ifru_data))
+		return -EFAULT;
+	data64 = compat_ptr(data32);
+
+	native = compat_alloc_user_space(sizeof(*native));
+
+	/* Store them into the staged native request. */
+	if (copy_to_user(native->ifr_ifrn.ifrn_name, name, IFNAMSIZ))
+		return -EFAULT;
+	if (__put_user(data64, &native->ifr_ifru.ifru_data))
+		return -EFAULT;
+
+	return dev_ioctl(net, cmd, native);
+}
+
+/*
+ * Generic compat path for ifreq-based ioctls: stage the 32-bit ifreq as
+ * a native one in compat user space and run the ioctl on it.  For the
+ * "get" commands listed below, the result is copied back to the caller.
+ */
+static int dev_ifsioc(struct net *net, struct socket *sock,
+			 unsigned int cmd, struct compat_ifreq __user *uifr32)
+{
+	struct ifreq __user *native;
+	int err;
+
+	native = compat_alloc_user_space(sizeof(*native));
+	if (copy_in_user(native, uifr32, sizeof(*uifr32)))
+		return -EFAULT;
+
+	err = sock_do_ioctl(net, sock, cmd, (unsigned long)native);
+	if (err)
+		return err;
+
+	switch (cmd) {
+	case SIOCGIFFLAGS:
+	case SIOCGIFMETRIC:
+	case SIOCGIFMTU:
+	case SIOCGIFMEM:
+	case SIOCGIFHWADDR:
+	case SIOCGIFINDEX:
+	case SIOCGIFADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCGIFNETMASK:
+	case SIOCGIFPFLAGS:
+	case SIOCGIFTXQLEN:
+	case SIOCGMIIPHY:
+	case SIOCGMIIREG:
+		if (copy_in_user(uifr32, native, sizeof(*uifr32)))
+			return -EFAULT;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Compat SIOCGIFMAP/SIOCSIFMAP: read the interface name and each ifmap
+ * field from the 32-bit request into a kernel ifreq, run the native
+ * ioctl on it, and for SIOCGIFMAP write the fields back one by one.
+ */
+static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
+			struct compat_ifreq __user *uifr32)
+{
+	struct compat_ifmap __user *map32 = &uifr32->ifr_ifru.ifru_map;
+	mm_segment_t saved_fs;
+	struct ifreq ifr;
+	int err;
+
+	err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
+	err |= __get_user(ifr.ifr_map.mem_start, &map32->mem_start);
+	err |= __get_user(ifr.ifr_map.mem_end, &map32->mem_end);
+	err |= __get_user(ifr.ifr_map.base_addr, &map32->base_addr);
+	err |= __get_user(ifr.ifr_map.irq, &map32->irq);
+	err |= __get_user(ifr.ifr_map.dma, &map32->dma);
+	err |= __get_user(ifr.ifr_map.port, &map32->port);
+	if (err)
+		return -EFAULT;
+
+	/* dev_ioctl() expects a user pointer; hand it the kernel copy. */
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = dev_ioctl(net, cmd, (void  __user __force *)&ifr);
+	set_fs(saved_fs);
+
+	/* Only a successful "get" has anything to hand back. */
+	if (err || cmd != SIOCGIFMAP)
+		return err;
+
+	err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
+	err |= __put_user(ifr.ifr_map.mem_start, &map32->mem_start);
+	err |= __put_user(ifr.ifr_map.mem_end, &map32->mem_end);
+	err |= __put_user(ifr.ifr_map.base_addr, &map32->base_addr);
+	err |= __put_user(ifr.ifr_map.irq, &map32->irq);
+	err |= __put_user(ifr.ifr_map.dma, &map32->dma);
+	err |= __put_user(ifr.ifr_map.port, &map32->port);
+
+	return err ? -EFAULT : 0;
+}
+
+/*
+ * Compat SIOCSHWTSTAMP: copy the 32-bit ifreq into a native ifreq staged
+ * in compat user space, widen its ifr_data pointer, and run the native
+ * ioctl on the staged copy.
+ */
+static int compat_siocshwtstamp(struct net *net, struct compat_ifreq __user *uifr32)
+{
+	struct ifreq __user *native;
+	compat_uptr_t ptr32;
+	void __user *ptr;
+
+	native = compat_alloc_user_space(sizeof(*native));
+	if (copy_in_user(native, uifr32, sizeof(struct compat_ifreq)) ||
+	    get_user(ptr32, &uifr32->ifr_data))
+		return -EFAULT;
+
+	/* Replace the copied 32-bit pointer with its native-width form. */
+	ptr = compat_ptr(ptr32);
+	if (put_user(ptr, &native->ifr_data))
+		return -EFAULT;
+
+	return dev_ioctl(net, SIOCSHWTSTAMP, native);
+}
+
+/*
+ * Layout of the routing request as routing_ioctl() reads it from a
+ * compat caller for non-IPv6 sockets.  Pointers and longs are carried as
+ * u32; rt_dev holds a 32-bit user pointer to the device name.
+ */
+struct rtentry32 {
+	u32		rt_pad1;
+	struct sockaddr rt_dst;         /* target address               */
+	struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
+	struct sockaddr rt_genmask;     /* target network mask (IP)     */
+	unsigned short	rt_flags;
+	short		rt_pad2;
+	u32		rt_pad3;
+	unsigned char	rt_tos;
+	unsigned char	rt_class;
+	short		rt_pad4;
+	short		rt_metric;      /* +1 for binary compatibility! */
+	/* char * */ u32 rt_dev;        /* forcing the device at add    */
+	u32		rt_mtu;         /* per route MTU/Window         */
+	u32		rt_window;      /* Window clamping              */
+	unsigned short  rt_irtt;        /* Initial RTT                  */
+};
+
+/*
+ * Layout of the IPv6 routing request as routing_ioctl() reads it from a
+ * compat caller; each field is copied into the matching field of a
+ * native struct in6_rtmsg.
+ */
+struct in6_rtmsg32 {
+	struct in6_addr		rtmsg_dst;
+	struct in6_addr		rtmsg_src;
+	struct in6_addr		rtmsg_gateway;
+	u32			rtmsg_type;
+	u16			rtmsg_dst_len;
+	u16			rtmsg_src_len;
+	u32			rtmsg_metric;
+	u32			rtmsg_info;
+	u32			rtmsg_flags;
+	s32			rtmsg_ifindex;
+};
+
+/*
+ * Compat SIOCADDRT/SIOCDELRT: convert the caller's 32-bit routing
+ * request (IPv6 or otherwise, chosen by the socket family) into the
+ * native kernel structure and run the native ioctl on that copy.
+ *
+ * Returns -EFAULT if any field could not be read, otherwise the result
+ * of sock_do_ioctl().
+ */
+static int routing_ioctl(struct net *net, struct socket *sock,
+			 unsigned int cmd, void __user *argp)
+{
+	mm_segment_t saved_fs = get_fs();
+	struct in6_rtmsg r6;
+	struct rtentry r4;
+	char devname[16];
+	void *kreq;
+	u32 rtdev;
+	int ret;
+
+	if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */
+		struct in6_rtmsg32 __user *ur6 = argp;
+
+		/* dst, src and gateway are adjacent: one copy covers all three */
+		ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst),
+			3 * sizeof(struct in6_addr));
+		ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type));
+		ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
+		ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
+		ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric));
+		ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info));
+		ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags));
+		ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
+
+		kreq = &r6;
+	} else { /* ipv4 */
+		struct rtentry32 __user *ur4 = argp;
+
+		/* dst, gateway and genmask are adjacent: one copy covers all three */
+		ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst),
+					3 * sizeof(struct sockaddr));
+		ret |= __get_user(r4.rt_flags, &(ur4->rt_flags));
+		ret |= __get_user(r4.rt_metric, &(ur4->rt_metric));
+		ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu));
+		ret |= __get_user(r4.rt_window, &(ur4->rt_window));
+		ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt));
+		ret |= __get_user(rtdev, &(ur4->rt_dev));
+		if (!rtdev) {
+			r4.rt_dev = NULL;
+		} else {
+			/* Bring the device name into a NUL-terminated kernel buffer. */
+			ret |= copy_from_user(devname, compat_ptr(rtdev), 15);
+			r4.rt_dev = (char __user __force *)devname;
+			devname[15] = 0;
+		}
+
+		kreq = &r4;
+	}
+
+	if (ret)
+		return -EFAULT;
+
+	/* The request now lives in kernel memory. */
+	set_fs(KERNEL_DS);
+	ret = sock_do_ioctl(net, sock, cmd, (unsigned long) kreq);
+	set_fs(saved_fs);
+
+	return ret;
+}
+
+/* Old-style bridge ioctls end up using SIOCDEVPRIVATE for some
+ * operations.  Answering a version query with BRCTL_VERSION + 1 and
+ * rejecting everything else forces use of the newer bridge-utils, which
+ * use compatible ioctls.
+ */
+static int old_bridge_ioctl(compat_ulong_t __user *argp)
+{
+	compat_ulong_t op;
+
+	if (get_user(op, argp))
+		return -EFAULT;
+
+	return (op == BRCTL_GET_VERSION) ? BRCTL_VERSION + 1 : -EINVAL;
+}
+
+/*
+ * Dispatch a compat socket ioctl to the handler that knows how to
+ * translate its argument.  Commands whose argument needs conversion go
+ * to a dedicated helper; commands that need none are forwarded to
+ * sock_ioctl() or sock_do_ioctl(); plain ifreq commands go through
+ * dev_ifsioc().
+ *
+ * Returns the handler's result, -EINVAL for the commands listed in the
+ * second switch, or -ENOIOCTLCMD for anything not recognised here.
+ */
+static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
+			 unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = compat_ptr(arg);
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+
+	/* The 16 device-private commands share one translator. */
+	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
+		return siocdevprivate_ioctl(net, cmd, argp);
+
+	switch (cmd) {
+	case SIOCSIFBR:
+	case SIOCGIFBR:
+		return old_bridge_ioctl(argp);
+	case SIOCGIFNAME:
+		return dev_ifname32(net, argp);
+	case SIOCGIFCONF:
+		return dev_ifconf(net, argp);
+	case SIOCETHTOOL:
+		return ethtool_ioctl(net, argp);
+	case SIOCWANDEV:
+		return compat_siocwandev(net, argp);
+	case SIOCGIFMAP:
+	case SIOCSIFMAP:
+		return compat_sioc_ifmap(net, cmd, argp);
+	case SIOCBONDENSLAVE:
+	case SIOCBONDRELEASE:
+	case SIOCBONDSETHWADDR:
+	case SIOCBONDSLAVEINFOQUERY:
+	case SIOCBONDINFOQUERY:
+	case SIOCBONDCHANGEACTIVE:
+		return bond_ioctl(net, cmd, argp);
+	case SIOCADDRT:
+	case SIOCDELRT:
+		return routing_ioctl(net, sock, cmd, argp);
+	case SIOCGSTAMP:
+		return do_siocgstamp(net, sock, cmd, argp);
+	case SIOCGSTAMPNS:
+		return do_siocgstampns(net, sock, cmd, argp);
+	case SIOCSHWTSTAMP:
+		return compat_siocshwtstamp(net, argp);
+
+	/* Forwarded to the native entry point with the raw argument. */
+	case FIOSETOWN:
+	case SIOCSPGRP:
+	case FIOGETOWN:
+	case SIOCGPGRP:
+	case SIOCBRADDBR:
+	case SIOCBRDELBR:
+	case SIOCGIFVLAN:
+	case SIOCSIFVLAN:
+	case SIOCADDDLCI:
+	case SIOCDELDLCI:
+		return sock_ioctl(file, cmd, arg);
+
+	/* Plain ifreq commands: staged and copied back by dev_ifsioc(). */
+	case SIOCGIFFLAGS:
+	case SIOCSIFFLAGS:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+	case SIOCGIFMTU:
+	case SIOCSIFMTU:
+	case SIOCGIFMEM:
+	case SIOCSIFMEM:
+	case SIOCGIFHWADDR:
+	case SIOCSIFHWADDR:
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+	case SIOCGIFINDEX:
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCSIFHWBROADCAST:
+	case SIOCDIFADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCSIFPFLAGS:
+	case SIOCGIFPFLAGS:
+	case SIOCGIFTXQLEN:
+	case SIOCSIFTXQLEN:
+	case SIOCBRADDIF:
+	case SIOCBRDELIF:
+	case SIOCSIFNAME:
+	case SIOCGMIIPHY:
+	case SIOCGMIIREG:
+	case SIOCSMIIREG:
+		return dev_ifsioc(net, sock, cmd, argp);
+
+	case SIOCSARP:
+	case SIOCGARP:
+	case SIOCDARP:
+	case SIOCATMARK:
+		return sock_do_ioctl(net, sock, cmd, arg);
+	}
+
+	/* Prevent warning from compat_sys_ioctl, these always
+	 * result in -EINVAL in the native case anyway. */
+	switch (cmd) {
+	case SIOCRTMSG:
+	case SIOCGIFCOUNT:
+	case SIOCSRARP:
+	case SIOCGRARP:
+	case SIOCDRARP:
+	case SIOCSIFLINK:
+	case SIOCGIFSLAVE:
+	case SIOCSIFSLAVE:
+		return -EINVAL;
+	}
+
+	return -ENOIOCTLCMD;
+}
+
+/*
+ * Compat ioctl entry point for sockets.  Handlers are tried in order
+ * (the protocol's own compat_ioctl, the wireless-extension handler for
+ * commands in its range, then the generic translator), each only while
+ * the previous one left the command unclaimed (-ENOIOCTLCMD).
+ */
+static long compat_sock_ioctl(struct file *file, unsigned cmd,
+			      unsigned long arg)
+{
+	struct socket *sock = file->private_data;
+	struct net *net = sock_net(sock->sk);
+	int ret = -ENOIOCTLCMD;
+
+	if (sock->ops->compat_ioctl)
+		ret = sock->ops->compat_ioctl(sock, cmd, arg);
+
+	if (ret == -ENOIOCTLCMD && cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
+		ret = compat_wext_handle_ioctl(net, cmd, arg);
+
+	if (ret == -ENOIOCTLCMD)
+		ret = compat_sock_ioctl_trans(file, sock, cmd, arg);
+
+	return ret;
+}
+#endif
+
+/* In-kernel bind: forwards to the socket's own bind operation. */
+int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
+{
+	int err = sock->ops->bind(sock, addr, addrlen);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_bind);
+
+/* In-kernel listen: forwards to the socket's own listen operation. */
+int kernel_listen(struct socket *sock, int backlog)
+{
+	int err = sock->ops->listen(sock, backlog);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_listen);
+
+/*
+ * In-kernel accept: create a lightweight socket matching the listener's
+ * family, type and protocol, and accept a connection into it.  On
+ * success the new socket shares the listener's ops and holds a reference
+ * on the owning module.  On accept failure the new socket is released
+ * and *newsock is set to NULL.
+ */
+int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err;
+
+	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+			       newsock);
+	if (err < 0)
+		return err;
+
+	err = sock->ops->accept(sock, *newsock, flags);
+	if (err < 0) {
+		sock_release(*newsock);
+		*newsock = NULL;
+		return err;
+	}
+
+	(*newsock)->ops = sock->ops;
+	__module_get((*newsock)->ops->owner);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_accept);
+
+/* In-kernel connect: forwards to the socket's own connect operation. */
+int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
+		   int flags)
+{
+	int err = sock->ops->connect(sock, addr, addrlen, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_connect);
+
+/* In-kernel getsockname: calls the socket's getname operation with peer = 0. */
+int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
+			 int *addrlen)
+{
+	const int peer = 0;
+
+	return sock->ops->getname(sock, addr, addrlen, peer);
+}
+EXPORT_SYMBOL(kernel_getsockname);
+
+/* In-kernel getpeername: calls the socket's getname operation with peer = 1. */
+int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
+			 int *addrlen)
+{
+	const int peer = 1;
+
+	return sock->ops->getname(sock, addr, addrlen, peer);
+}
+EXPORT_SYMBOL(kernel_getpeername);
+
+/*
+ * In-kernel getsockopt.  @optval and @optlen are kernel pointers; the
+ * address limit is switched to KERNEL_DS around the call so the option
+ * handlers' user-copy routines accept them.  SOL_SOCKET options go to
+ * sock_getsockopt(), everything else to the protocol's handler.
+ */
+int kernel_getsockopt(struct socket *sock, int level, int optname,
+			char *optval, int *optlen)
+{
+	char __user *uoptval = (char __user __force *) optval;
+	int __user *uoptlen = (int __user __force *) optlen;
+	mm_segment_t saved_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = (level == SOL_SOCKET)
+		? sock_getsockopt(sock, level, optname, uoptval, uoptlen)
+		: sock->ops->getsockopt(sock, level, optname, uoptval,
+					uoptlen);
+	set_fs(saved_fs);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_getsockopt);
+
+/*
+ * In-kernel setsockopt.  @optval is a kernel pointer; the address limit
+ * is switched to KERNEL_DS around the call so the option handlers'
+ * user-copy routines accept it.  SOL_SOCKET options go to
+ * sock_setsockopt(), everything else to the protocol's handler.
+ */
+int kernel_setsockopt(struct socket *sock, int level, int optname,
+			char *optval, unsigned int optlen)
+{
+	char __user *uoptval = (char __user __force *) optval;
+	mm_segment_t saved_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = (level == SOL_SOCKET)
+		? sock_setsockopt(sock, level, optname, uoptval, optlen)
+		: sock->ops->setsockopt(sock, level, optname, uoptval,
+					optlen);
+	set_fs(saved_fs);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_setsockopt);
+
+/*
+ * In-kernel sendpage: use the protocol's sendpage operation when it has
+ * one, otherwise fall back to sock_no_sendpage().
+ */
+int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+		    size_t size, int flags)
+{
+	sock_update_classid(sock->sk);
+
+	if (!sock->ops->sendpage)
+		return sock_no_sendpage(sock, page, offset, size, flags);
+
+	return sock->ops->sendpage(sock, page, offset, size, flags);
+}
+EXPORT_SYMBOL(kernel_sendpage);
+
+/*
+ * In-kernel ioctl on a socket: runs the protocol's ioctl operation with
+ * the address limit switched to KERNEL_DS for the duration of the call.
+ */
+int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
+{
+	mm_segment_t saved_fs;
+	int err;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sock->ops->ioctl(sock, cmd, arg);
+	set_fs(saved_fs);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_sock_ioctl);
+
+/* In-kernel shutdown: forwards to the socket's own shutdown operation. */
+int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
+{
+	int err = sock->ops->shutdown(sock, how);
+
+	return err;
+}
+EXPORT_SYMBOL(kernel_sock_shutdown);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
new file mode 100644
index 00000000..b2198e65
--- /dev/null
+++ b/net/sunrpc/Kconfig
@@ -0,0 +1,37 @@
+config SUNRPC
+	tristate
+
+config SUNRPC_GSS
+	tristate
+
+config SUNRPC_XPRT_RDMA
+	tristate
+	depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS && EXPERIMENTAL
+	default SUNRPC && INFINIBAND
+	help
+	  This option allows the NFS client and server to support
+	  an RDMA-enabled transport.
+
+	  To compile RPC client RDMA transport support as a module,
+	  choose M here: the module will be called xprtrdma.
+
+	  If unsure, say N.
+
+config RPCSEC_GSS_KRB5
+	tristate "Secure RPC: Kerberos V mechanism"
+	depends on SUNRPC && CRYPTO
+	depends on CRYPTO_MD5 && CRYPTO_DES && CRYPTO_CBC && CRYPTO_CTS
+	depends on CRYPTO_ECB && CRYPTO_HMAC && CRYPTO_SHA1 && CRYPTO_AES
+	depends on CRYPTO_ARC4
+	default y
+	select SUNRPC_GSS
+	help
+	  Choose Y here to enable Secure RPC using the Kerberos version 5
+	  GSS-API mechanism (RFC 1964).
+
+	  Secure RPC calls with Kerberos require an auxiliary user-space
+	  daemon which may be found in the Linux nfs-utils package
+	  available from http://linux-nfs.org/.  In addition, user-space
+	  Kerberos support should be installed.
+
+	  If unsure, say Y.
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
new file mode 100644
index 00000000..9d2fca5a
--- /dev/null
+++ b/net/sunrpc/Makefile
@@ -0,0 +1,18 @@
+#
+# Makefile for Linux kernel SUN RPC
+#
+
+
+obj-$(CONFIG_SUNRPC) += sunrpc.o
+obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/
+
+sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
+	    auth.o auth_null.o auth_unix.o auth_generic.o \
+	    svc.o svcsock.o svcauth.o svcauth_unix.o \
+	    addr.o rpcb_clnt.o timer.o xdr.o \
+	    sunrpc_syms.o cache.o rpc_pipe.o \
+	    svc_xprt.o
+sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_PROC_FS) += stats.o
+sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
new file mode 100644
index 00000000..4195233c
--- /dev/null
+++ b/net/sunrpc/addr.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright 2009, Oracle.  All rights reserved.
+ *
+ * Convert socket addresses to presentation addresses and universal
+ * addresses, and vice versa.
+ *
+ * Universal addresses are introduced by RFC 1833 and further refined by
+ * recent RFCs describing NFSv4.  The universal address format is part
+ * of the external (network) interface provided by rpcbind version 3
+ * and 4, and by NFSv4.  Such an address is a string containing a
+ * presentation format IP address followed by a port number in
+ * "hibyte.lobyte" format.
+ *
+ * IPv6 addresses can also include a scope ID, typically denoted by
+ * a '%' followed by a device name or a non-negative integer.  Refer to
+ * RFC 4291, Section 2.2 for details on IPv6 presentation formats.
+ */
+
+#include <net/ipv6.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+/*
+ * Format the IPv6 address in @sap into @buf, without any scope ID, and
+ * return what snprintf() reported.  The special shorthand forms come
+ * from RFC 4291: Section 2.2.2 for the ANY and loopback addresses,
+ * Section 2.2.3 for mapped v4 addresses, Section 2.2.1 for the rest.
+ */
+static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
+				  char *buf, const int buflen)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr *addr = &sin6->sin6_addr;
+	int n;
+
+	if (ipv6_addr_any(addr))
+		/* shorthanded ANY address */
+		n = snprintf(buf, buflen, "::");
+	else if (ipv6_addr_loopback(addr))
+		/* shorthanded loopback address */
+		n = snprintf(buf, buflen, "::1");
+	else if (ipv6_addr_v4mapped(addr))
+		/* special presentation format for mapped v4 addresses */
+		n = snprintf(buf, buflen, "::ffff:%pI4",
+					&addr->s6_addr32[3]);
+	else
+		n = snprintf(buf, buflen, "%pI6c", addr);
+
+	return n;
+}
+
+/*
+ * Build the presentation form of the IPv6 address in @sap in @buf.  For
+ * link-local addresses with a non-zero scope ID, the scope delimiter and
+ * the numeric scope ID are appended.
+ *
+ * Returns the length of the string in @buf (excluding the NUL), or zero
+ * if the address could not be formatted or the scope suffix does not
+ * fit in @buflen bytes.
+ */
+static size_t rpc_ntop6(const struct sockaddr *sap,
+			char *buf, const size_t buflen)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	char scopebuf[IPV6_SCOPE_ID_LEN];
+	size_t len;
+	int rc;
+
+	len = rpc_ntop6_noscopeid(sap, buf, buflen);
+	if (unlikely(len == 0))
+		return len;
+
+	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		return len;
+	if (sin6->sin6_scope_id == 0)
+		return len;
+
+	/*
+	 * snprintf() returns the length it wanted to write, not counting
+	 * the NUL, so a result equal to the buffer size already means the
+	 * scope string was truncated.
+	 */
+	rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u",
+			IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id);
+	if (unlikely((size_t)rc >= sizeof(scopebuf)))
+		return 0;
+
+	/* strcat() also writes a terminating NUL, so len must stay < buflen. */
+	len += rc;
+	if (unlikely(len >= buflen))
+		return 0;
+
+	strcat(buf, scopebuf);
+	return len;
+}
+
+#else	/* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */
+
+/* Built without IPv6 support: nothing is formatted and zero is returned. */
+static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap,
+				  char *buf, const int buflen)
+{
+	return 0;
+}
+
+/* Built without IPv6 support: nothing is formatted and zero is returned. */
+static size_t rpc_ntop6(const struct sockaddr *sap,
+			char *buf, const size_t buflen)
+{
+	return 0;
+}
+
+#endif	/* !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) */
+
+/*
+ * Format the IPv4 address in @sap into @buf as dotted quad and return
+ * what snprintf() reported.
+ */
+static int rpc_ntop4(const struct sockaddr *sap,
+		     char *buf, const size_t buflen)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	int n;
+
+	n = snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+	return n;
+}
+
+/**
+ * rpc_ntop - construct a presentation address in @buf
+ * @sap: socket address
+ * @buf: construction area
+ * @buflen: size of @buf, in bytes
+ *
+ * Plants a %NUL-terminated string in @buf and returns the length
+ * of the string, excluding the %NUL.  Zero is returned for address
+ * families other than AF_INET and AF_INET6, or when the per-family
+ * formatter reports zero.
+ */
+size_t rpc_ntop(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+	if (sap->sa_family == AF_INET)
+		return rpc_ntop4(sap, buf, buflen);
+
+	if (sap->sa_family == AF_INET6)
+		return rpc_ntop6(sap, buf, buflen);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_ntop);
+
+/*
+ * Parse a dotted-quad IPv4 address from @buf into @sap.
+ *
+ * Returns sizeof(struct sockaddr_in) on success.  Returns zero if the
+ * string is longer than INET_ADDRSTRLEN, @salen is too small for a
+ * struct sockaddr_in, or the address does not parse.
+ */
+static size_t rpc_pton4(const char *buf, const size_t buflen,
+			struct sockaddr *sap, const size_t salen)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+	if (salen < sizeof(*sin) || buflen > INET_ADDRSTRLEN)
+		return 0;
+
+	/* Start from a clean sockaddr; only address and family get set. */
+	memset(sap, 0, sizeof(*sin));
+
+	if (!in4_pton(buf, buflen, (u8 *)&sin->sin_addr.s_addr, '\0', NULL))
+		return 0;
+
+	sin->sin_family = AF_INET;
+	return sizeof(*sin);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Parse the optional scope ID that follows an IPv6 address in @buf.
+ * @delim is the position at which address parsing stopped.
+ *
+ * A scope ID is only accepted after IPV6_SCOPE_DELIMITER and only for
+ * link-local addresses.  It may be a network device name (looked up in
+ * init_net and translated to its ifindex) or a decimal number.
+ *
+ * Returns 1 if there is no scope ID or it was stored in
+ * sin6->sin6_scope_id; returns 0 if the suffix is malformed or memory
+ * could not be allocated.
+ */
+static int rpc_parse_scope_id(const char *buf, const size_t buflen,
+			      const char *delim, struct sockaddr_in6 *sin6)
+{
+	char *p;
+	size_t len;
+
+	/* The address consumed the whole string: no scope ID present. */
+	if ((buf + buflen) == delim)
+		return 1;
+
+	if (*delim != IPV6_SCOPE_DELIMITER)
+		return 0;
+
+	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		return 0;
+
+	/* NUL-terminated copy of everything after the delimiter. */
+	len = (buf + buflen) - delim - 1;
+	p = kstrndup(delim + 1, len, GFP_KERNEL);
+	if (p) {
+		unsigned long scope_id = 0;
+		struct net_device *dev;
+
+		dev = dev_get_by_name(&init_net, p);
+		if (dev != NULL) {
+			scope_id = dev->ifindex;
+			dev_put(dev);
+		} else {
+			/*
+			 * Not a device name, so it must be a number.
+			 * strict_strtoul() returns zero on success; a
+			 * non-zero result means the text did not parse.
+			 */
+			if (strict_strtoul(p, 10, &scope_id) != 0) {
+				kfree(p);
+				return 0;
+			}
+		}
+
+		kfree(p);
+
+		sin6->sin6_scope_id = scope_id;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse an IPv6 presentation address, optionally followed by a scope
+ * ID, from @buf into @sap.
+ *
+ * Returns sizeof(struct sockaddr_in6) on success.  Returns zero if the
+ * string is too long, @salen is too small for a struct sockaddr_in6, or
+ * either the address or the scope ID does not parse.
+ */
+static size_t rpc_pton6(const char *buf, const size_t buflen,
+			struct sockaddr *sap, const size_t salen)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const char *rest;
+
+	if (salen < sizeof(*sin6) ||
+	    buflen > (INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN))
+		return 0;
+
+	/* Start from a clean sockaddr. */
+	memset(sap, 0, sizeof(*sin6));
+
+	/* Address parsing stops at the scope delimiter; rest points there. */
+	if (!in6_pton(buf, buflen, (u8 *)&sin6->sin6_addr.in6_u,
+		      IPV6_SCOPE_DELIMITER, &rest))
+		return 0;
+
+	if (!rpc_parse_scope_id(buf, buflen, rest, sin6))
+		return 0;
+
+	sin6->sin6_family = AF_INET6;
+	return sizeof(*sin6);
+}
+#else
+/* Built without IPv6 support: nothing is parsed and zero is returned. */
+static size_t rpc_pton6(const char *buf, const size_t buflen,
+			struct sockaddr *sap, const size_t salen)
+{
+	return 0;
+}
+#endif
+
+/**
+ * rpc_pton - Construct a sockaddr in @sap
+ * @buf: C string containing presentation format IP address
+ * @buflen: length of presentation address in bytes
+ * @sap: buffer into which to plant socket address
+ * @salen: size of buffer in bytes
+ *
+ * The string is treated as IPv6 if it contains a ':' anywhere in its
+ * first @buflen bytes, and as IPv4 otherwise.
+ *
+ * Plants a socket address in @sap and returns the size of the
+ * socket address, if successful.  Returns zero if an error
+ * occurred.
+ */
+size_t rpc_pton(const char *buf, const size_t buflen,
+		struct sockaddr *sap, const size_t salen)
+{
+	const char *p = buf;
+	const char *end = buf + buflen;
+
+	while (p < end) {
+		if (*p++ == ':')
+			return rpc_pton6(buf, buflen, sap, salen);
+	}
+
+	return rpc_pton4(buf, buflen, sap, salen);
+}
+EXPORT_SYMBOL_GPL(rpc_pton);
+
+/**
+ * rpc_sockaddr2uaddr - Construct a universal address string from @sap.
+ * @sap: socket address
+ *
+ * The result is the presentation address (without scope ID) followed by
+ * the port as ".hibyte.lobyte".
+ *
+ * Returns a %NUL-terminated string in dynamically allocated memory;
+ * otherwise NULL is returned if an error occurred.  Caller must
+ * free the returned string.
+ */
+char *rpc_sockaddr2uaddr(const struct sockaddr *sap)
+{
+	char portbuf[RPCBIND_MAXUADDRPLEN];
+	char addrbuf[RPCBIND_MAXUADDRLEN];
+	unsigned short port;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		if (rpc_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return NULL;
+		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+		break;
+	case AF_INET6:
+		if (rpc_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+			return NULL;
+		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+		break;
+	default:
+		return NULL;
+	}
+
+	/*
+	 * snprintf() and strlcat() both return the length of the string
+	 * they tried to build, excluding the NUL.  A result equal to the
+	 * buffer size therefore already means the output was truncated.
+	 */
+	if (snprintf(portbuf, sizeof(portbuf),
+		     ".%u.%u", port >> 8, port & 0xff) >= (int)sizeof(portbuf))
+		return NULL;
+
+	if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) >= sizeof(addrbuf))
+		return NULL;
+
+	return kstrdup(addrbuf, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(rpc_sockaddr2uaddr);
+
+/**
+ * rpc_uaddr2sockaddr - convert a universal address to a socket address.
+ * @uaddr: C string containing universal address to convert
+ * @uaddr_len: length of universal address string
+ * @sap: buffer into which to plant socket address
+ * @salen: size of buffer
+ *
+ * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and
+ * rpc_pton() require proper string termination to be successful, so the
+ * string is first copied into a local, terminated buffer.  The two
+ * trailing ".N" components are peeled off as the low and high port
+ * bytes; what remains is parsed as the address.
+ *
+ * Returns the size of the socket address if successful; otherwise
+ * zero is returned.
+ */
+size_t rpc_uaddr2sockaddr(const char *uaddr, const size_t uaddr_len,
+			  struct sockaddr *sap, const size_t salen)
+{
+	char buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
+	unsigned long portlo, porthi;
+	unsigned short port;
+	char *dot;
+
+	if (uaddr_len > RPCBIND_MAXUADDRLEN)
+		return 0;
+
+	memcpy(buf, uaddr, uaddr_len);
+	buf[uaddr_len] = '\0';
+
+	/* Last component: low byte of the port. */
+	dot = strrchr(buf, '.');
+	if (unlikely(dot == NULL))
+		return 0;
+	if (unlikely(strict_strtoul(dot + 1, 10, &portlo) != 0))
+		return 0;
+	if (unlikely(portlo > 255))
+		return 0;
+	*dot = '\0';
+
+	/* Next-to-last component: high byte of the port. */
+	dot = strrchr(buf, '.');
+	if (unlikely(dot == NULL))
+		return 0;
+	if (unlikely(strict_strtoul(dot + 1, 10, &porthi) != 0))
+		return 0;
+	if (unlikely(porthi > 255))
+		return 0;
+	*dot = '\0';
+
+	port = (unsigned short)((porthi << 8) | portlo);
+
+	/* Whatever is left is the presentation address. */
+	if (rpc_pton(buf, strlen(buf), sap, salen) == 0)
+		return 0;
+
+	if (sap->sa_family == AF_INET) {
+		((struct sockaddr_in *)sap)->sin_port = htons(port);
+		return sizeof(struct sockaddr_in);
+	}
+	if (sap->sa_family == AF_INET6) {
+		((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
+		return sizeof(struct sockaddr_in6);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_uaddr2sockaddr);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
new file mode 100644
index 00000000..cd6e4aa1
--- /dev/null
+++ b/net/sunrpc/auth.c
@@ -0,0 +1,688 @@
+/*
+ * linux/net/sunrpc/auth.c
+ *
+ * Generic RPC client authentication API.
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/hash.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/spinlock.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+#define RPC_CREDCACHE_DEFAULT_HASHBITS	(4)
+/*
+ * Per-rpc_auth credential cache: a hash table of rpc_cred entries
+ * (linked through cr_hash) plus the lock that guards it.
+ */
+struct rpc_cred_cache {
+	/* bucket array; allocated with 1 << hashbits entries */
+	struct hlist_head	*hashtable;
+	/* log2 of the number of buckets */
+	unsigned int		hashbits;
+	/* held while creds are added to or removed from hashtable */
+	spinlock_t		lock;
+};
+
+/* Hash bits given to each new credential cache (auth_hashtable_size parameter). */
+static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
+
+/* Taken around updates of auth_flavors[]. */
+static DEFINE_SPINLOCK(rpc_authflavor_lock);
+/* Registered authentication flavors, indexed by flavor number. */
+static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
+	&authnull_ops,		/* AUTH_NULL */
+	&authunix_ops,		/* AUTH_UNIX */
+	NULL,			/* others can be loadable modules */
+};
+
+/*
+ * List of hashed credentials whose reference count has dropped to zero
+ * (linked through cr_lru), and the number of entries on it.
+ */
+static LIST_HEAD(cred_unused);
+static unsigned long number_cred_unused;
+
+#define MAX_HASHTABLE_BITS (14)
+/*
+ * Setter for the "auth_hashtable_size" module parameter.
+ *
+ * @val is the requested number of hash buckets.  It is rounded up to the
+ * next power of two and stored in @kp->arg as a bit count.  The resulting
+ * bit count must lie between 2 and MAX_HASHTABLE_BITS inclusive.
+ *
+ * Returns 0 on success, -EINVAL if @val is missing, unparsable or out of
+ * range.
+ */
+static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
+{
+	unsigned long num;
+	unsigned int nbits;
+
+	if (!val)
+		return -EINVAL;
+	/* Any failure (not only -EINVAL) leaves num without a usable value. */
+	if (strict_strtoul(val, 0, &num) != 0)
+		return -EINVAL;
+	/* Bound num first so it cannot be truncated by fls()'s int argument. */
+	if (num == 0 || num > (1UL << MAX_HASHTABLE_BITS))
+		return -EINVAL;
+	/* Smallest nbits with (1 << nbits) >= num; exact powers of two are kept. */
+	nbits = fls(num - 1);
+	if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
+		return -EINVAL;
+	*(unsigned int *)kp->arg = nbits;
+	return 0;
+}
+
+/*
+ * Getter for the "auth_hashtable_size" module parameter: prints the
+ * bucket count (1 << stored bits) into @buffer and returns the number
+ * of characters written.
+ */
+static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
+{
+	const unsigned int *bits = kp->arg;
+
+	return sprintf(buffer, "%u", 1U << *bits);
+}
+
+/* Type check used by module_param_named() below: the variable must be an unsigned int. */
+#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
+
+/* Accessors for the "hashtbl_sz" parameter type. */
+static struct kernel_param_ops param_ops_hashtbl_sz = {
+	.set = param_set_hashtbl_sz,
+	.get = param_get_hashtbl_sz,
+};
+
+/* Expose auth_hashbits as "auth_hashtable_size" with mode 0644. */
+module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
+MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
+
+/*
+ * Map a pseudoflavor onto an index into auth_flavors[]: values at or
+ * above RPC_AUTH_MAXFLAVOR are all treated as RPC_AUTH_GSS.
+ */
+static u32
+pseudoflavor_to_flavor(u32 flavor)
+{
+	return (flavor < RPC_AUTH_MAXFLAVOR) ? flavor : RPC_AUTH_GSS;
+}
+
+/*
+ * Install @ops in the flavor table.
+ *
+ * Returns 0 on success, -EINVAL if the flavor number is out of range,
+ * or -EPERM if the slot is already occupied.
+ */
+int
+rpcauth_register(const struct rpc_authops *ops)
+{
+	rpc_authflavor_t slot = ops->au_flavor;
+	int status;
+
+	if (slot >= RPC_AUTH_MAXFLAVOR)
+		return -EINVAL;
+
+	spin_lock(&rpc_authflavor_lock);
+	if (auth_flavors[slot] != NULL) {
+		status = -EPERM;
+	} else {
+		auth_flavors[slot] = ops;
+		status = 0;
+	}
+	spin_unlock(&rpc_authflavor_lock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(rpcauth_register);
+
+/*
+ * Remove @ops from the flavor table.
+ *
+ * Returns 0 on success, -EINVAL if the flavor number is out of range,
+ * or -EPERM if the slot does not currently hold @ops.
+ */
+int
+rpcauth_unregister(const struct rpc_authops *ops)
+{
+	rpc_authflavor_t slot = ops->au_flavor;
+	int status;
+
+	if (slot >= RPC_AUTH_MAXFLAVOR)
+		return -EINVAL;
+
+	spin_lock(&rpc_authflavor_lock);
+	if (auth_flavors[slot] != ops) {
+		status = -EPERM;
+	} else {
+		auth_flavors[slot] = NULL;
+		status = 0;
+	}
+	spin_unlock(&rpc_authflavor_lock);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(rpcauth_unregister);
+
+/*
+ * Create an rpc_auth of the given pseudoflavor and attach it to @clnt.
+ *
+ * If no ops are registered for the flavor, a module named
+ * "rpc-auth-<flavor>" is requested before the table is consulted again
+ * under rpc_authflavor_lock.  On success any auth previously attached
+ * to @clnt is released and replaced.
+ *
+ * Returns the new auth, ERR_PTR(-EINVAL) if the flavor is unavailable,
+ * or whatever error pointer the flavor's create method returned.
+ */
+struct rpc_auth *
+rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
+{
+	const struct rpc_authops *ops;
+	struct rpc_auth *auth;
+	u32 flavor = pseudoflavor_to_flavor(pseudoflavor);
+
+	if (flavor >= RPC_AUTH_MAXFLAVOR)
+		return ERR_PTR(-EINVAL);
+
+	if (auth_flavors[flavor] == NULL)
+		request_module("rpc-auth-%u", flavor);
+
+	spin_lock(&rpc_authflavor_lock);
+	ops = auth_flavors[flavor];
+	if (ops == NULL || !try_module_get(ops->owner)) {
+		spin_unlock(&rpc_authflavor_lock);
+		return ERR_PTR(-EINVAL);
+	}
+	spin_unlock(&rpc_authflavor_lock);
+
+	auth = ops->create(clnt, pseudoflavor);
+	module_put(ops->owner);
+	if (IS_ERR(auth))
+		return auth;
+
+	/* Swap the client's auth handle, dropping the old one if any */
+	if (clnt->cl_auth)
+		rpcauth_release(clnt->cl_auth);
+	clnt->cl_auth = auth;
+	return auth;
+}
+EXPORT_SYMBOL_GPL(rpcauth_create);
+
+/*
+ * Drop one reference on @auth; the flavor's destroy method runs when
+ * the count reaches zero.
+ */
+void
+rpcauth_release(struct rpc_auth *auth)
+{
+	if (atomic_dec_and_test(&auth->au_count))
+		auth->au_ops->destroy(auth);
+}
+
+/*
+ * Held in this file while cred_unused and number_cred_unused are
+ * modified (put_rpccred(), rpcauth_clear_credcache(), the shrinker).
+ */
+static DEFINE_SPINLOCK(rpc_credcache_lock);
+
+/*
+ * Take @cred off its hash chain and clear RPCAUTH_CRED_HASHED.
+ * Every caller in this file holds the owning cache's lock.
+ */
+static void
+rpcauth_unhash_cred_locked(struct rpc_cred *cred)
+{
+	hlist_del_rcu(&cred->cr_hash);
+	/* barrier between the list removal and the flag being cleared */
+	smp_mb__before_clear_bit();
+	clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+}
+
+/*
+ * Unhash @cred if nobody holds a reference to it.  The reference count
+ * is sampled under the cache lock.
+ *
+ * Returns 1 if the cred was unhashed, 0 if it was still in use.
+ */
+static int
+rpcauth_unhash_cred(struct rpc_cred *cred)
+{
+	spinlock_t *lock = &cred->cr_auth->au_credcache->lock;
+	int unhashed = 0;
+
+	spin_lock(lock);
+	if (atomic_read(&cred->cr_count) == 0) {
+		rpcauth_unhash_cred_locked(cred);
+		unhashed = 1;
+	}
+	spin_unlock(lock);
+
+	return unhashed;
+}
+
+/*
+ * Allocate an empty credential cache, sized from auth_hashbits, and
+ * attach it to @auth.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+int
+rpcauth_init_credcache(struct rpc_auth *auth)
+{
+	struct rpc_cred_cache *cache;
+
+	cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+	if (cache == NULL)
+		return -ENOMEM;
+
+	cache->hashbits = auth_hashbits;
+	cache->hashtable = kcalloc(1U << cache->hashbits,
+				   sizeof(cache->hashtable[0]), GFP_KERNEL);
+	if (cache->hashtable == NULL) {
+		kfree(cache);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&cache->lock);
+	auth->au_credcache = cache;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
+
+/*
+ * Empty a private list of credentials (linked through cr_lru), calling
+ * put_rpccred() on each entry as it is removed.
+ */
+static inline
+void rpcauth_destroy_credlist(struct list_head *head)
+{
+	struct rpc_cred *victim;
+
+	for (;;) {
+		if (list_empty(head))
+			break;
+		victim = list_entry(head->next, struct rpc_cred, cr_lru);
+		list_del_init(&victim->cr_lru);
+		put_rpccred(victim);
+	}
+}
+
+/*
+ * Clear the RPC credential cache, and delete those credentials
+ * that are not referenced.
+ *
+ * Every hashed cred is unhashed and collected on a private list while
+ * rpc_credcache_lock and then cache->lock are held; the collected creds
+ * are released through rpcauth_destroy_credlist() once both locks have
+ * been dropped.
+ */
+void
+rpcauth_clear_credcache(struct rpc_cred_cache *cache)
+{
+	LIST_HEAD(free);
+	struct hlist_head *head;
+	struct rpc_cred	*cred;
+	unsigned int hashsize = 1U << cache->hashbits;
+	int		i;
+
+	spin_lock(&rpc_credcache_lock);
+	spin_lock(&cache->lock);
+	for (i = 0; i < hashsize; i++) {
+		head = &cache->hashtable[i];
+		/* Each pass unhashes the first entry, so the bucket drains. */
+		while (!hlist_empty(head)) {
+			cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
+			/* balanced by put_rpccred() in rpcauth_destroy_credlist() */
+			get_rpccred(cred);
+			/* cr_lru is about to be reused: take it off cred_unused first */
+			if (!list_empty(&cred->cr_lru)) {
+				list_del(&cred->cr_lru);
+				number_cred_unused--;
+			}
+			list_add_tail(&cred->cr_lru, &free);
+			rpcauth_unhash_cred_locked(cred);
+		}
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&rpc_credcache_lock);
+	rpcauth_destroy_credlist(&free);
+}
+
+/*
+ * Detach the credential cache from @auth, empty it and free it.
+ * Does nothing if @auth has no cache.
+ */
+void
+rpcauth_destroy_credcache(struct rpc_auth *auth)
+{
+	struct rpc_cred_cache *cache = auth->au_credcache;
+
+	if (cache == NULL)
+		return;
+
+	auth->au_credcache = NULL;
+	rpcauth_clear_credcache(cache);
+	kfree(cache->hashtable);
+	kfree(cache);
+}
+EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
+
+
+/* Time (in jiffies) an unused, still-hashed cred is left alone after its cr_expire stamp. */
+#define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ)
+
+/*
+ * Remove stale credentials. Avoid sleeping inside the loop.
+ *
+ * Walks cred_unused, examining at most @nr_to_scan entries, and moves
+ * creds that are still unreferenced onto @free for the caller to
+ * release.  The only caller in this file holds rpc_credcache_lock.
+ *
+ * Returns 0 as soon as an entry inside the moratorium window is found,
+ * otherwise (number_cred_unused / 100) * sysctl_vfs_cache_pressure.
+ */
+static int
+rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
+{
+	spinlock_t *cache_lock;
+	struct rpc_cred *cred, *next;
+	unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
+
+	list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) {
+
+		if (nr_to_scan-- == 0)
+			break;
+		/*
+		 * Enforce a 60 second garbage collection moratorium
+		 * Note that the cred_unused list must be time-ordered.
+		 */
+		if (time_in_range(cred->cr_expire, expired, jiffies) &&
+		    test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
+			return 0;
+
+		list_del_init(&cred->cr_lru);
+		number_cred_unused--;
+		/* Picked up again since it was queued: just forget it. */
+		if (atomic_read(&cred->cr_count) != 0)
+			continue;
+
+		cache_lock = &cred->cr_auth->au_credcache->lock;
+		spin_lock(cache_lock);
+		/* Re-check under the cache lock before unhashing. */
+		if (atomic_read(&cred->cr_count) == 0) {
+			get_rpccred(cred);
+			list_add_tail(&cred->cr_lru, free);
+			rpcauth_unhash_cred_locked(cred);
+		}
+		spin_unlock(cache_lock);
+	}
+	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+}
+
+/*
+ * Memory shrinker callback for the unused-credential list.
+ *
+ * When the allocation context does not include all of GFP_KERNEL the
+ * list is left alone: 0 is returned for a zero-length scan request and
+ * -1 otherwise.  Otherwise stale creds are pruned under
+ * rpc_credcache_lock and released once the lock has been dropped.
+ */
+static int
+rpcauth_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+{
+	LIST_HEAD(reap);
+	int nr_to_scan = sc->nr_to_scan;
+	int remaining;
+
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) {
+		if (nr_to_scan == 0)
+			return 0;
+		return -1;
+	}
+	if (list_empty(&cred_unused))
+		return 0;
+
+	spin_lock(&rpc_credcache_lock);
+	remaining = rpcauth_prune_expired(&reap, nr_to_scan);
+	spin_unlock(&rpc_credcache_lock);
+
+	rpcauth_destroy_credlist(&reap);
+	return remaining;
+}
+
+/*
+ * Look up a process' credentials in the authentication cache
+ *
+ * The bucket selected by hashing acred->uid is first searched under
+ * rcu_read_lock().  On a miss a new cred is created with the flavor's
+ * crcreate method, the bucket is searched again under cache->lock, and
+ * the new cred is either hashed or, if another matching entry is found,
+ * queued for release.
+ *
+ * Returns the cred, or an ERR_PTR() if crcreate or cr_init failed.
+ */
+struct rpc_cred *
+rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
+		int flags)
+{
+	LIST_HEAD(free);
+	struct rpc_cred_cache *cache = auth->au_credcache;
+	struct hlist_node *pos;
+	struct rpc_cred	*cred = NULL,
+			*entry, *new;
+	unsigned int nr;
+
+	nr = hash_long(acred->uid, cache->hashbits);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
+		if (!entry->cr_ops->crmatch(acred, entry, flags))
+			continue;
+		spin_lock(&cache->lock);
+		/* Skip entries that were unhashed while we walked the chain. */
+		if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
+			spin_unlock(&cache->lock);
+			continue;
+		}
+		cred = get_rpccred(entry);
+		spin_unlock(&cache->lock);
+		break;
+	}
+	rcu_read_unlock();
+
+	if (cred != NULL)
+		goto found;
+
+	/* Miss: build a candidate before taking the cache lock. */
+	new = auth->au_ops->crcreate(auth, acred, flags);
+	if (IS_ERR(new)) {
+		cred = new;
+		goto out;
+	}
+
+	spin_lock(&cache->lock);
+	/* Search again: a match may have been hashed in the meantime. */
+	hlist_for_each_entry(entry, pos, &cache->hashtable[nr], cr_hash) {
+		if (!entry->cr_ops->crmatch(acred, entry, flags))
+			continue;
+		cred = get_rpccred(entry);
+		break;
+	}
+	if (cred == NULL) {
+		cred = new;
+		set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
+		hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]);
+	} else
+		/* Lost the race: the unused candidate is released below. */
+		list_add_tail(&new->cr_lru, &free);
+	spin_unlock(&cache->lock);
+found:
+	/* Initialise a still-new cred unless the caller passed RPCAUTH_LOOKUP_NEW. */
+	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
+	    cred->cr_ops->cr_init != NULL &&
+	    !(flags & RPCAUTH_LOOKUP_NEW)) {
+		int res = cred->cr_ops->cr_init(auth, cred);
+		if (res < 0) {
+			put_rpccred(cred);
+			cred = ERR_PTR(res);
+		}
+	}
+	rpcauth_destroy_credlist(&free);
+out:
+	return cred;
+}
+EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache);
+
+/*
+ * Look up an RPC credential for the current task: an auth_cred is
+ * filled in from the task's fsuid, fsgid and group list and passed to
+ * the flavor's lookup_cred method.  The group_info reference taken here
+ * is dropped again before returning.
+ */
+struct rpc_cred *
+rpcauth_lookupcred(struct rpc_auth *auth, int flags)
+{
+	const struct cred *cur = current_cred();
+	struct auth_cred acred;
+	struct rpc_cred *found;
+
+	dprintk("RPC:       looking up %s cred\n",
+		auth->au_ops->au_name);
+
+	memset(&acred, 0, sizeof(acred));
+	acred.uid = cur->fsuid;
+	acred.gid = cur->fsgid;
+	acred.group_info = get_group_info(((struct cred *)cur)->group_info);
+
+	found = auth->au_ops->lookup_cred(auth, &acred, flags);
+
+	put_group_info(acred.group_info);
+	return found;
+}
+
+/*
+ * Initialise the generic part of a freshly allocated credential:
+ * unlinked list nodes, a reference count of one, the owning auth and
+ * ops, the uid from @acred and an expiry stamp of "now".
+ */
+void
+rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
+		  struct rpc_auth *auth, const struct rpc_credops *ops)
+{
+	/* list linkage */
+	INIT_HLIST_NODE(&cred->cr_hash);
+	INIT_LIST_HEAD(&cred->cr_lru);
+
+	/* ownership and identity */
+	cred->cr_auth = auth;
+	cred->cr_ops = ops;
+	cred->cr_uid = acred->uid;
+#ifdef RPC_DEBUG
+	cred->cr_magic = RPCAUTH_CRED_MAGIC;
+#endif
+
+	/* lifetime */
+	cred->cr_expire = jiffies;
+	atomic_set(&cred->cr_count, 1);
+}
+EXPORT_SYMBOL_GPL(rpcauth_init_cred);
+
+/*
+ * Default crbind implementation: the task uses @cred itself, so just
+ * hand back the result of get_rpccred().  @lookupflags is unused.
+ */
+struct rpc_cred *
+rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
+{
+	struct rpc_cred *held;
+
+	dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
+			cred->cr_auth->au_ops->au_name, cred);
+	held = get_rpccred(cred);
+	return held;
+}
+EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
+
+/*
+ * Look up a uid 0 / gid 0 credential through the auth handle of the
+ * task's client.
+ */
+static struct rpc_cred *
+rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
+{
+	struct rpc_auth *auth = task->tk_client->cl_auth;
+	struct auth_cred root_acred = {
+		.uid = 0,
+		.gid = 0,
+	};
+
+	dprintk("RPC: %5u looking up %s cred\n",
+		task->tk_pid, auth->au_ops->au_name);
+
+	return auth->au_ops->lookup_cred(auth, &root_acred, lookupflags);
+}
+
+/*
+ * Look up a credential for the current process through the auth handle
+ * of the task's client.
+ */
+static struct rpc_cred *
+rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags)
+{
+	struct rpc_auth *auth = task->tk_client->cl_auth;
+	struct rpc_cred *found;
+
+	dprintk("RPC: %5u looking up %s cred\n",
+		task->tk_pid, auth->au_ops->au_name);
+
+	found = rpcauth_lookupcred(auth, lookupflags);
+	return found;
+}
+
+/*
+ * Attach a credential to the task's request.
+ *
+ * The cred comes from @cred's crbind method when @cred is given, from
+ * the root lookup when RPC_TASK_ROOTCREDS is set, and from the current
+ * process otherwise.  RPC_TASK_ASYNC adds RPCAUTH_LOOKUP_NEW to the
+ * lookup flags.  A cred already attached to the request is released.
+ *
+ * Returns 0 on success or the negative errno carried by the lookup.
+ */
+static int
+rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_cred *bound;
+	int lookupflags = 0;
+
+	if (flags & RPC_TASK_ASYNC)
+		lookupflags |= RPCAUTH_LOOKUP_NEW;
+
+	if (cred != NULL) {
+		bound = cred->cr_ops->crbind(task, cred, lookupflags);
+	} else if (flags & RPC_TASK_ROOTCREDS) {
+		bound = rpcauth_bind_root_cred(task, lookupflags);
+	} else {
+		bound = rpcauth_bind_new_cred(task, lookupflags);
+	}
+	if (IS_ERR(bound))
+		return PTR_ERR(bound);
+
+	if (req->rq_cred != NULL)
+		put_rpccred(req->rq_cred);
+	req->rq_cred = bound;
+	return 0;
+}
+
+/*
+ * Drop a reference on @cred.
+ *
+ * Unhashed creds are destroyed as soon as the count reaches zero.  A
+ * hashed cred whose count reaches zero is parked on cred_unused when it
+ * is RPCAUTH_CRED_UPTODATE; otherwise it is unhashed and destroyed,
+ * unless rpcauth_unhash_cred() finds that it has been referenced again.
+ */
+void
+put_rpccred(struct rpc_cred *cred)
+{
+	/* Fast path for unhashed credentials */
+	if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) {
+		if (atomic_dec_and_test(&cred->cr_count))
+			cred->cr_ops->crdestroy(cred);
+		return;
+	}
+
+	/* rpc_credcache_lock is only taken when the count drops to zero. */
+	if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock))
+		return;
+	/* Take the cred off whatever cr_lru list it is on before deciding. */
+	if (!list_empty(&cred->cr_lru)) {
+		number_cred_unused--;
+		list_del_init(&cred->cr_lru);
+	}
+	/* Re-test: the flag may have been cleared since the fast-path check. */
+	if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
+		if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) {
+			/* Keep it cached: stamp it and queue it at the LRU tail. */
+			cred->cr_expire = jiffies;
+			list_add_tail(&cred->cr_lru, &cred_unused);
+			number_cred_unused++;
+			goto out_nodestroy;
+		}
+		if (!rpcauth_unhash_cred(cred)) {
+			/* We were hashed and someone looked us up... */
+			goto out_nodestroy;
+		}
+	}
+	spin_unlock(&rpc_credcache_lock);
+	cred->cr_ops->crdestroy(cred);
+	return;
+out_nodestroy:
+	spin_unlock(&rpc_credcache_lock);
+}
+EXPORT_SYMBOL_GPL(put_rpccred);
+
+/*
+ * Marshal the request's credential into the buffer at @p by calling
+ * the cred's crmarshal method, whose return value is passed back.
+ */
+__be32 *
+rpcauth_marshcred(struct rpc_task *task, __be32 *p)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	__be32 *next;
+
+	dprintk("RPC: %5u marshaling %s cred %p\n",
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+
+	next = cred->cr_ops->crmarshal(task, p);
+	return next;
+}
+
+/*
+ * Check the verifier at @p by calling the request cred's crvalidate
+ * method, whose return value is passed back.
+ */
+__be32 *
+rpcauth_checkverf(struct rpc_task *task, __be32 *p)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	__be32 *next;
+
+	dprintk("RPC: %5u validating %s cred %p\n",
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+
+	next = cred->cr_ops->crvalidate(task, p);
+	return next;
+}
+
+/*
+ * Plain argument encoding: set up an XDR stream over the request's
+ * send buffer at @data and run @encode on @obj.
+ */
+static void rpcauth_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
+				   __be32 *data, void *obj)
+{
+	struct xdr_stream stream;
+
+	xdr_init_encode(&stream, &rqstp->rq_snd_buf, data);
+	encode(rqstp, &stream, obj);
+}
+
+/*
+ * Encode the call arguments, letting the credential wrap them if it
+ * provides a crwrap_req method.  Without one the arguments are encoded
+ * normally and 0 is returned.
+ */
+int
+rpcauth_wrap_req(struct rpc_task *task, kxdreproc_t encode, void *rqstp,
+		__be32 *data, void *obj)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+	dprintk("RPC: %5u using %s cred %p to wrap rpc data\n",
+			task->tk_pid, cred->cr_ops->cr_name, cred);
+
+	if (cred->cr_ops->crwrap_req == NULL) {
+		/* No wrapper: plain encoding */
+		rpcauth_wrap_req_encode(encode, rqstp, data, obj);
+		return 0;
+	}
+	return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj);
+}
+
+/*
+ * Plain result decoding: set up an XDR stream over the request's
+ * receive buffer at @data and return what @decode returns for @obj.
+ */
+static int
+rpcauth_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
+			  __be32 *data, void *obj)
+{
+	struct xdr_stream stream;
+
+	xdr_init_decode(&stream, &rqstp->rq_rcv_buf, data);
+	return decode(rqstp, &stream, obj);
+}
+
+/*
+ * Decode the reply, letting the credential unwrap it if it provides a
+ * crunwrap_resp method.  Without one the results are decoded normally.
+ */
+int
+rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp,
+		__be32 *data, void *obj)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+	dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n",
+			task->tk_pid, cred->cr_ops->cr_name, cred);
+
+	if (cred->cr_ops->crunwrap_resp == NULL) {
+		/* No unwrapper: plain decoding */
+		return rpcauth_unwrap_req_decode(decode, rqstp, data, obj);
+	}
+	return cred->cr_ops->crunwrap_resp(task, decode, rqstp,
+					   data, obj);
+}
+
+/*
+ * Refresh the credential of the task's request, binding one first if
+ * the request has none yet.
+ *
+ * Returns the result of the bind or of the cred's crrefresh method; a
+ * negative result is also stored in task->tk_status.
+ */
+int
+rpcauth_refreshcred(struct rpc_task *task)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	int err;
+
+	if (cred == NULL) {
+		err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags);
+		if (err < 0)
+			goto out;
+		cred = task->tk_rqstp->rq_cred;
+	}
+
+	dprintk("RPC: %5u refreshing %s cred %p\n",
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+
+	err = cred->cr_ops->crrefresh(task);
+out:
+	if (err < 0)
+		task->tk_status = err;
+	return err;
+}
+
+/*
+ * Mark the credential of the task's request as no longer up to date by
+ * clearing RPCAUTH_CRED_UPTODATE.  Does nothing if the request has no
+ * credential bound.
+ */
+void
+rpcauth_invalcred(struct rpc_task *task)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+	/* Check before the debug print, which dereferences cred. */
+	if (cred == NULL)
+		return;
+
+	dprintk("RPC: %5u invalidating %s cred %p\n",
+		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
+	clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+}
+
+/*
+ * Returns 1 if the task's request has no credential or if its
+ * credential has RPCAUTH_CRED_UPTODATE set, 0 otherwise.
+ */
+int
+rpcauth_uptodatecred(struct rpc_task *task)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+
+	if (cred == NULL)
+		return 1;
+	return test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
+}
+
+/* Shrinker registered by rpcauth_init_module() to trim the cred_unused list. */
+static struct shrinker rpc_cred_shrinker = {
+	.shrink = rpcauth_cache_shrinker,
+	.seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * Set up the AUTH_UNIX and generic auth support and register the
+ * credential shrinker.  If the generic setup fails the AUTH_UNIX setup
+ * is undone again.
+ *
+ * Returns 0 on success or the negative error of the failing step.
+ */
+int __init rpcauth_init_module(void)
+{
+	int err = rpc_init_authunix();
+
+	if (err < 0)
+		return err;
+
+	err = rpc_init_generic_auth();
+	if (err < 0) {
+		rpc_destroy_authunix();
+		return err;
+	}
+
+	register_shrinker(&rpc_cred_shrinker);
+	return 0;
+}
+
+/*
+ * Undo rpcauth_init_module(): tear down the AUTH_UNIX and generic auth
+ * state and unregister the credential shrinker.
+ */
+void rpcauth_remove_module(void)
+{
+	rpc_destroy_authunix();
+	rpc_destroy_generic_auth();
+	unregister_shrinker(&rpc_cred_shrinker);
+}
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
new file mode 100644
index 00000000..e010a015
--- /dev/null
+++ b/net/sunrpc/auth_generic.c
@@ -0,0 +1,183 @@
+/*
+ * Generic RPC credential
+ *
+ * Copyright (C) 2008, Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/sched.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+/* uid/gid placed in the auth_cred used to look up machine credentials. */
+#define RPC_MACHINE_CRED_USERID		((uid_t)0)
+#define RPC_MACHINE_CRED_GROUPID	((gid_t)0)
+
+/*
+ * A generic credential: the common rpc_cred plus a private copy of the
+ * auth_cred it was created from.
+ */
+struct generic_cred {
+	struct rpc_cred gc_base;
+	struct auth_cred acred;
+};
+
+/* Defined near the end of this file; declared here for the functions in between. */
+static struct rpc_auth generic_auth;
+static const struct rpc_credops generic_credops;
+
+/*
+ * Public call interface: look up the generic credential for the
+ * current process (lookup flags are 0).
+ */
+struct rpc_cred *rpc_lookup_cred(void)
+{
+	struct rpc_cred *found;
+
+	found = rpcauth_lookupcred(&generic_auth, 0);
+	return found;
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+
+/*
+ * Public call interface for looking up machine creds: the lookup uses
+ * the machine uid/gid with the machine_cred flag set.
+ */
+struct rpc_cred *rpc_lookup_machine_cred(void)
+{
+	struct auth_cred machine_acred = {
+		.uid = RPC_MACHINE_CRED_USERID,
+		.gid = RPC_MACHINE_CRED_GROUPID,
+		.machine_cred = 1,
+	};
+	const struct rpc_authops *ops = generic_auth.au_ops;
+
+	dprintk("RPC:       looking up machine cred\n");
+
+	return ops->lookup_cred(&generic_auth, &machine_acred, 0);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
+
+/*
+ * crbind method for generic creds: use the auth_cred saved inside the
+ * generic cred to look up a credential in the auth of the task's client.
+ */
+static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
+		struct rpc_cred *cred, int lookupflags)
+{
+	struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+	struct rpc_auth *auth = task->tk_client->cl_auth;
+
+	return auth->au_ops->lookup_cred(auth, &gcred->acred, lookupflags);
+}
+
+/*
+ * Lookup generic creds for current process
+ *
+ * Note that @auth is not used: the lookup always goes through the
+ * credential cache of generic_auth.
+ */
+static struct rpc_cred *
+generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	return rpcauth_lookup_credcache(&generic_auth, acred, flags);
+}
+
+/*
+ * crcreate method: allocate a generic cred that carries a copy of
+ * @acred.  The cred starts out RPCAUTH_CRED_UPTODATE and takes its own
+ * reference on the group list, if there is one.
+ *
+ * Returns the embedded rpc_cred, or ERR_PTR(-ENOMEM).
+ */
+static struct rpc_cred *
+generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	struct generic_cred *gcred = kmalloc(sizeof(*gcred), GFP_KERNEL);
+	struct auth_cred *copy;
+
+	if (gcred == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops);
+	gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+
+	copy = &gcred->acred;
+	copy->uid = acred->uid;
+	copy->gid = acred->gid;
+	copy->group_info = acred->group_info;
+	if (copy->group_info != NULL)
+		get_group_info(copy->group_info);
+	copy->machine_cred = acred->machine_cred;
+
+	dprintk("RPC:       allocated %s cred %p for uid %d gid %d\n",
+			gcred->acred.machine_cred ? "machine" : "generic",
+			gcred, acred->uid, acred->gid);
+	return &gcred->gc_base;
+}
+
+/*
+ * Free a generic cred, dropping its group list reference if it has one.
+ */
+static void
+generic_free_cred(struct rpc_cred *cred)
+{
+	struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+	struct group_info *groups = gcred->acred.group_info;
+
+	dprintk("RPC:       generic_free_cred %p\n", gcred);
+	if (groups != NULL)
+		put_group_info(groups);
+	kfree(gcred);
+}
+
+/*
+ * RCU callback: recover the cred from its embedded rcu_head and free it.
+ */
+static void
+generic_free_cred_callback(struct rcu_head *head)
+{
+	generic_free_cred(container_of(head, struct rpc_cred, cr_rcu));
+}
+
+/*
+ * crdestroy method: the cred is not freed immediately; the free is
+ * queued with call_rcu() and performed by generic_free_cred_callback().
+ */
+static void
+generic_destroy_cred(struct rpc_cred *cred)
+{
+	call_rcu(&cred->cr_rcu, generic_free_cred_callback);
+}
+
+/*
+ * Match credentials against current process creds.
+ *
+ * Two creds match when uid, gid and the machine_cred flag agree and
+ * their group lists are either the same object or hold the same groups
+ * in the same order.
+ *
+ * Returns 1 on a match, 0 otherwise.
+ */
+static int
+generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags)
+{
+	struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base);
+	int i;
+
+	if (gcred->acred.uid != acred->uid ||
+	    gcred->acred.gid != acred->gid ||
+	    gcred->acred.machine_cred != acred->machine_cred)
+		goto out_nomatch;
+
+	/* Optimisation in the case where pointers are identical... */
+	if (gcred->acred.group_info == acred->group_info)
+		goto out_match;
+
+	/*
+	 * The pointers differ, so if either side has no group list the
+	 * other one does: that is a mismatch, and must not be dereferenced.
+	 */
+	if (gcred->acred.group_info == NULL || acred->group_info == NULL)
+		goto out_nomatch;
+
+	/* Slow path... */
+	if (gcred->acred.group_info->ngroups != acred->group_info->ngroups)
+		goto out_nomatch;
+	for (i = 0; i < gcred->acred.group_info->ngroups; i++) {
+		if (GROUP_AT(gcred->acred.group_info, i) !=
+				GROUP_AT(acred->group_info, i))
+			goto out_nomatch;
+	}
+out_match:
+	return 1;
+out_nomatch:
+	return 0;
+}
+
+/* Allocate the credential cache of generic_auth; returns 0 or a negative errno. */
+int __init rpc_init_generic_auth(void)
+{
+	return rpcauth_init_credcache(&generic_auth);
+}
+
+/* Empty and free the credential cache of generic_auth. */
+void rpc_destroy_generic_auth(void)
+{
+	rpcauth_destroy_credcache(&generic_auth);
+}
+
+/* Auth operations of the generic flavor; only lookup and create are provided. */
+static const struct rpc_authops generic_auth_ops = {
+	.owner = THIS_MODULE,
+	.au_name = "Generic",
+	.lookup_cred = generic_lookup_cred,
+	.crcreate = generic_create_cred,
+};
+
+/* The single, statically allocated auth handle that owns all generic creds. */
+static struct rpc_auth generic_auth = {
+	.au_ops = &generic_auth_ops,
+	.au_count = ATOMIC_INIT(0),
+};
+
+/* Credential operations installed on every generic cred by generic_create_cred(). */
+static const struct rpc_credops generic_credops = {
+	.cr_name = "Generic cred",
+	.crdestroy = generic_destroy_cred,
+	.crbind = generic_bind_cred,
+	.crmatch = generic_match,
+};
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
new file mode 100644
index 00000000..9e4cb59e
--- /dev/null
+++ b/net/sunrpc/auth_gss/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for Linux kernel rpcsec_gss implementation
+#
+
+obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
+
+auth_rpcgss-y := auth_gss.o gss_generic_token.o \
+	gss_mech_switch.o svcauth_gss.o
+
+obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
+
+rpcsec_gss_krb5-y := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
+	gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
new file mode 100644
index 00000000..5daf6cc4
--- /dev/null
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -0,0 +1,1646 @@
+/*
+ * linux/net/sunrpc/auth_gss/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/gss_err.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/gss_api.h>
+#include <asm/uaccess.h>
+
+/* Forward declarations of the ops tables (presumably defined later in this file). */
+static const struct rpc_authops authgss_ops;
+
+static const struct rpc_credops gss_credops;
+static const struct rpc_credops gss_nullops;
+
+/* NOTE(review): unit and use of this delay are not visible here; presumably seconds - confirm. */
+#define GSS_RETRY_EXPIRED 5
+static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED;
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+#define GSS_CRED_SLACK		(RPC_MAX_AUTH_SIZE * 2)
+/* length of a krb5 verifier (48), plus data added before arguments when
+ * using integrity (two 4-byte integers): */
+#define GSS_VERF_SLACK		100
+
+/*
+ * RPCSEC_GSS state wrapped around a generic rpc_auth.
+ * NOTE(review): field roles below are inferred from names and types only.
+ */
+struct gss_auth {
+	struct kref kref;
+	struct rpc_auth rpc_auth;	/* embedded generic auth handle */
+	struct gss_api_mech *mech;
+	enum rpc_gss_svc service;
+	struct rpc_clnt *client;
+	/*
+	 * There are two upcall pipes; dentry[1], named "gssd", is used
+	 * for the new text-based upcall; dentry[0] is named after the
+	 * mechanism (for example, "krb5") and exists for
+	 * backwards-compatibility with older gssd's.
+	 */
+	struct dentry *dentry[2];
+};
+
+/* pipe_version >= 0 if and only if someone has a pipe open. */
+static int pipe_version = -1;
+/* Count of get_pipe_version() callers; pipe_version is reset to -1 when it drops to zero. */
+static atomic_t pipe_users = ATOMIC_INIT(0);
+/* Held while pipe_version is read or changed. */
+static DEFINE_SPINLOCK(pipe_version_lock);
+static struct rpc_wait_queue pipe_version_rpc_waitqueue;
+static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue);
+
+/* Forward declarations for items defined later in the file. */
+static void gss_free_ctx(struct gss_cl_ctx *);
+static const struct rpc_pipe_ops gss_upcall_ops_v0;
+static const struct rpc_pipe_ops gss_upcall_ops_v1;
+
+/* Take a reference on @ctx and return it. */
+static inline struct gss_cl_ctx *
+gss_get_ctx(struct gss_cl_ctx *ctx)
+{
+	atomic_inc(&ctx->count);
+	return ctx;
+}
+
+/* Drop a reference on @ctx; the context is freed with the last one. */
+static inline void
+gss_put_ctx(struct gss_cl_ctx *ctx)
+{
+	if (!atomic_dec_and_test(&ctx->count))
+		return;
+	gss_free_ctx(ctx);
+}
+
+/* gss_cred_set_ctx:
+ * called by gss_upcall_callback and gss_create_upcall in order
+ * to set the gss context. The actual exchange of an old context
+ * and a new one is protected by the inode->i_lock.
+ *
+ * Nothing is done unless the cred is still flagged RPCAUTH_CRED_NEW.
+ * The cred takes its own reference on @ctx.
+ */
+static void
+gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
+{
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+
+	if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
+		return;
+	gss_get_ctx(ctx);
+	rcu_assign_pointer(gss_cred->gc_ctx, ctx);
+	set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	/* barrier between setting UPTODATE and clearing NEW */
+	smp_mb__before_clear_bit();
+	clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
+}
+
+/*
+ * Copy @len bytes from @p into @res, provided they lie within the
+ * buffer that ends at @end.
+ *
+ * Returns the position just past the copied bytes, or ERR_PTR(-EFAULT)
+ * if the range would overrun @end or wrap around.
+ */
+static const void *
+simple_get_bytes(const void *p, const void *end, void *res, size_t len)
+{
+	const void *next = (const void *)((const char *)p + len);
+
+	if (unlikely(next < p || next > end))
+		return ERR_PTR(-EFAULT);
+
+	memcpy(res, p, len);
+	return next;
+}
+
+/*
+ * Read a length-prefixed opaque object from the buffer [@p, @end) and
+ * store a freshly allocated copy of its payload in @dest.
+ *
+ * Returns the position just past the object, ERR_PTR(-EFAULT) if it
+ * does not fit inside the buffer, or ERR_PTR(-ENOMEM) if the copy
+ * cannot be allocated.
+ */
+static inline const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+{
+	const void *next;
+	unsigned int len;
+
+	/* length prefix */
+	p = simple_get_bytes(p, end, &len, sizeof(len));
+	if (IS_ERR(p))
+		return p;
+
+	/* payload bounds */
+	next = (const void *)((const char *)p + len);
+	if (unlikely(next < p || next > end))
+		return ERR_PTR(-EFAULT);
+
+	dest->data = kmemdup(p, len, GFP_NOFS);
+	if (unlikely(dest->data == NULL))
+		return ERR_PTR(-ENOMEM);
+	dest->len = len;
+	return next;
+}
+
+/*
+ * Return a referenced pointer to the GSS context currently attached to
+ * @cred, or NULL if it has none.  The caller must release the context
+ * with gss_put_ctx().
+ */
+static struct gss_cl_ctx *
+gss_cred_get_ctx(struct rpc_cred *cred)
+{
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+	struct gss_cl_ctx *ctx;
+
+	rcu_read_lock();
+	/*
+	 * Load the pointer exactly once: gc_ctx is published with
+	 * rcu_assign_pointer(), so testing it and then reading it again
+	 * could observe two different contexts.
+	 */
+	ctx = rcu_dereference(gss_cred->gc_ctx);
+	if (ctx != NULL)
+		gss_get_ctx(ctx);
+	rcu_read_unlock();
+	return ctx;
+}
+
+/*
+ * Allocate a zeroed GSS client context with one reference held, the
+ * procedure set to RPC_GSS_PROC_DATA and the sequence number at 1.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct gss_cl_ctx *
+gss_alloc_context(void)
+{
+	struct gss_cl_ctx *ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+
+	if (ctx == NULL)
+		return NULL;
+
+	ctx->gc_proc = RPC_GSS_PROC_DATA;
+	ctx->gc_seq = 1;	/* NetApp 6.4R1 doesn't accept seq. no. 0 */
+	spin_lock_init(&ctx->gc_seq_lock);
+	atomic_set(&ctx->count,1);
+	return ctx;
+}
+
+/* Lifetime, in seconds, substituted when the downcall reports a timeout of 0. */
+#define GSSD_MIN_TIMEOUT (60 * 60)
+/*
+ * Fill @ctx from the downcall data in the buffer [@p, @end).
+ *
+ * The fields are consumed in order: lifetime, sequence window, then
+ * either an error code (when the window is 0) or the opaque wire
+ * context followed by the length-prefixed security context, which is
+ * imported for mechanism @gm.
+ *
+ * Returns the position just past the consumed data, or an ERR_PTR() on
+ * a short buffer, a reported error or an import failure.
+ */
+static const void *
+gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm)
+{
+	const void *q;
+	unsigned int seclen;
+	unsigned int timeout;
+	u32 window_size;
+	int ret;
+
+	/* First unsigned int gives the lifetime (in seconds) of the cred */
+	p = simple_get_bytes(p, end, &timeout, sizeof(timeout));
+	if (IS_ERR(p))
+		goto err;
+	if (timeout == 0)
+		timeout = GSSD_MIN_TIMEOUT;
+	/* Expire the context after three quarters of its lifetime. */
+	ctx->gc_expiry = jiffies + (unsigned long)timeout * HZ * 3 / 4;
+	/* Sequence number window. Determines the maximum number of simultaneous requests */
+	p = simple_get_bytes(p, end, &window_size, sizeof(window_size));
+	if (IS_ERR(p))
+		goto err;
+	ctx->gc_win = window_size;
+	/* gssd signals an error by passing ctx->gc_win = 0: */
+	if (ctx->gc_win == 0) {
+		/*
+		 * in which case, p points to an error code. Anything other
+		 * than -EKEYEXPIRED gets converted to -EACCES.
+		 */
+		p = simple_get_bytes(p, end, &ret, sizeof(ret));
+		if (!IS_ERR(p))
+			p = (ret == -EKEYEXPIRED) ? ERR_PTR(-EKEYEXPIRED) :
+						    ERR_PTR(-EACCES);
+		goto err;
+	}
+	/* copy the opaque wire context */
+	p = simple_get_netobj(p, end, &ctx->gc_wire_ctx);
+	if (IS_ERR(p))
+		goto err;
+	/* import the opaque security context */
+	p  = simple_get_bytes(p, end, &seclen, sizeof(seclen));
+	if (IS_ERR(p))
+		goto err;
+	/* The security context must lie entirely inside the buffer. */
+	q = (const void *)((const char *)p + seclen);
+	if (unlikely(q > end || q < p)) {
+		p = ERR_PTR(-EFAULT);
+		goto err;
+	}
+	ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, GFP_NOFS);
+	if (ret < 0) {
+		p = ERR_PTR(ret);
+		goto err;
+	}
+	return q;
+err:
+	dprintk("RPC:       gss_fill_context returning %ld\n", -PTR_ERR(p));
+	return p;
+}
+
+/* Size of the data buffer embedded in each upcall message. */
+#define UPCALL_BUF_LEN 128
+
+/*
+ * One upcall in flight.
+ * NOTE(review): field notes are limited to what the functions nearby show.
+ */
+struct gss_upcall_msg {
+	atomic_t count;			/* reference count, see gss_release_msg() */
+	uid_t	uid;			/* key used by __gss_find_upcall() */
+	struct rpc_pipe_msg msg;	/* msg.errno carries the upcall result */
+	struct list_head list;		/* link on the rpc_inode's in_downcall list */
+	struct gss_auth *auth;
+	struct rpc_inode *inode;
+	struct rpc_wait_queue rpc_waitqueue;
+	wait_queue_head_t waitqueue;
+	struct gss_cl_ctx *ctx;		/* released together with the message, if set */
+	char databuf[UPCALL_BUF_LEN];
+};
+
+/*
+ * Register as a user of the current pipe version.
+ *
+ * Returns the version (>= 0) with pipe_users incremented, or -EAGAIN
+ * if no version is set.
+ */
+static int get_pipe_version(void)
+{
+	int version = -EAGAIN;
+
+	spin_lock(&pipe_version_lock);
+	if (pipe_version >= 0) {
+		atomic_inc(&pipe_users);
+		version = pipe_version;
+	}
+	spin_unlock(&pipe_version_lock);
+
+	return version;
+}
+
+/*
+ * Drop one pipe user; when the last one goes away pipe_version is
+ * reset to -1 under pipe_version_lock.
+ */
+static void put_pipe_version(void)
+{
+	if (!atomic_dec_and_lock(&pipe_users, &pipe_version_lock))
+		return;
+
+	pipe_version = -1;
+	spin_unlock(&pipe_version_lock);
+}
+
+/*
+ * Drop a reference on @gss_msg.  With the last reference the pipe
+ * version user count and the attached context (if any) are released
+ * and the message is freed.
+ */
+static void
+gss_release_msg(struct gss_upcall_msg *gss_msg)
+{
+	if (!atomic_dec_and_test(&gss_msg->count))
+		return;
+	put_pipe_version();
+	/* The message must already be off the in_downcall list. */
+	BUG_ON(!list_empty(&gss_msg->list));
+	if (gss_msg->ctx != NULL)
+		gss_put_ctx(gss_msg->ctx);
+	rpc_destroy_wait_queue(&gss_msg->rpc_waitqueue);
+	kfree(gss_msg);
+}
+
+/*
+ * Search the in_downcall list of @rpci for an upcall owned by @uid.
+ *
+ * Returns the first match with its reference count incremented, or
+ * NULL if there is none.
+ */
+static struct gss_upcall_msg *
+__gss_find_upcall(struct rpc_inode *rpci, uid_t uid)
+{
+	struct gss_upcall_msg *msg;
+
+	list_for_each_entry(msg, &rpci->in_downcall, list) {
+		if (msg->uid == uid) {
+			atomic_inc(&msg->count);
+			dprintk("RPC:       gss_find_upcall found msg %p\n", msg);
+			return msg;
+		}
+	}
+	dprintk("RPC:       gss_find_upcall found nothing\n");
+	return NULL;
+}
+
+/* Try to add an upcall to the pipefs queue.
+ * If an upcall owned by our uid already exists, then we return a reference
+ * to that upcall instead of adding the new upcall.
+ * The search and the insertion both happen under the inode's i_lock.
+ */
+static inline struct gss_upcall_msg *
+gss_add_msg(struct gss_upcall_msg *gss_msg)
+{
+	struct rpc_inode *rpci = gss_msg->inode;
+	struct inode *inode = &rpci->vfs_inode;
+	struct gss_upcall_msg *existing;
+
+	spin_lock(&inode->i_lock);
+	existing = __gss_find_upcall(rpci, gss_msg->uid);
+	if (existing != NULL) {
+		gss_msg = existing;
+	} else {
+		atomic_inc(&gss_msg->count);
+		list_add(&gss_msg->list, &rpci->in_downcall);
+	}
+	spin_unlock(&inode->i_lock);
+
+	return gss_msg;
+}
+
+/*
+ * Take gss_msg off its list, wake every waiter on both of its wait queues
+ * (RPC tasks are woken with the message's errno as their status) and drop
+ * one reference.  The callers in this file hold the pipe inode's i_lock.
+ */
+static void
+__gss_unhash_msg(struct gss_upcall_msg *gss_msg)
+{
+	list_del_init(&gss_msg->list);
+	rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno);
+	wake_up_all(&gss_msg->waitqueue);
+	atomic_dec(&gss_msg->count);
+}
+
+/*
+ * Locked wrapper around __gss_unhash_msg().  The list is tested once
+ * without the lock as a fast path, and again under i_lock before the
+ * message is actually unhashed.
+ */
+static void
+gss_unhash_msg(struct gss_upcall_msg *gss_msg)
+{
+	struct inode *inode = &gss_msg->inode->vfs_inode;
+
+	if (!list_empty(&gss_msg->list)) {
+		spin_lock(&inode->i_lock);
+		if (!list_empty(&gss_msg->list))
+			__gss_unhash_msg(gss_msg);
+		spin_unlock(&inode->i_lock);
+	}
+}
+
+/*
+ * Apply the outcome recorded on gss_msg to gss_cred.
+ *
+ * errno 0 with a context: clear the NEGATIVE flag and install the context.
+ * errno -EKEYEXPIRED: mark the credential NEGATIVE.
+ * In every case the upcall timestamp is refreshed, the credential's pending
+ * upcall pointer is cleared and the RPC tasks waiting on the message are
+ * woken with the message's errno.  Both callers hold the inode's i_lock.
+ */
+static void
+gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss_msg)
+{
+	switch (gss_msg->msg.errno) {
+	case 0:
+		if (gss_msg->ctx == NULL)
+			break;
+		clear_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags);
+		gss_cred_set_ctx(&gss_cred->gc_base, gss_msg->ctx);
+		break;
+	case -EKEYEXPIRED:
+		set_bit(RPCAUTH_CRED_NEGATIVE, &gss_cred->gc_base.cr_flags);
+	}
+	gss_cred->gc_upcall_timestamp = jiffies;
+	gss_cred->gc_upcall = NULL;
+	rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno);
+}
+
+/*
+ * Wake-up callback installed by gss_refresh_upcall() for a task sleeping on
+ * an upcall.  Processes the downcall result under i_lock, propagates the
+ * message's errno into the task status and drops the reference that
+ * gss_refresh_upcall() took on behalf of this callback.
+ */
+static void
+gss_upcall_callback(struct rpc_task *task)
+{
+	struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred,
+			struct gss_cred, gc_base);
+	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
+	struct inode *inode = &gss_msg->inode->vfs_inode;
+
+	spin_lock(&inode->i_lock);
+	gss_handle_downcall_result(gss_cred, gss_msg);
+	spin_unlock(&inode->i_lock);
+	task->tk_status = gss_msg->msg.errno;
+	gss_release_msg(gss_msg);
+}
+
+/*
+ * Version 0 upcall format: the message payload is just the raw uid field
+ * of gss_msg (no copy is made; msg.data points into gss_msg itself).
+ */
+static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg)
+{
+	gss_msg->msg.data = &gss_msg->uid;
+	gss_msg->msg.len = sizeof(gss_msg->uid);
+}
+
+/*
+ * Version 1 upcall format: a single text line of space-terminated
+ * "key=value " fields ending in a newline, built in gss_msg->databuf:
+ *
+ *	mech=<name> uid=<uid> [target=<principal>] [service=*|service=nfs]
+ *	[enctypes=<list>]
+ *
+ * "service=*" is sent for machine credentials, "service=nfs" for the
+ * "nfs4_cb" program.
+ *
+ * Every field is formatted with snprintf() against the space that is still
+ * free, and the result is checked before the write pointer advances.  A
+ * message that does not fit (including its terminating NUL) therefore hits
+ * BUG_ON() without anything having been written past the end of databuf.
+ */
+static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
+				struct rpc_clnt *clnt, int machine_cred)
+{
+	struct gss_api_mech *mech = gss_msg->auth->mech;
+	char *p = gss_msg->databuf;
+	int buflen = sizeof(gss_msg->databuf);
+	int len;
+
+	len = snprintf(p, buflen, "mech=%s uid=%d ",
+		       mech->gm_name,
+		       gss_msg->uid);
+	BUG_ON(len >= buflen);
+	p += len;
+	buflen -= len;
+	if (clnt->cl_principal) {
+		len = snprintf(p, buflen, "target=%s ", clnt->cl_principal);
+		BUG_ON(len >= buflen);
+		p += len;
+		buflen -= len;
+	}
+	if (machine_cred) {
+		len = snprintf(p, buflen, "service=* ");
+		BUG_ON(len >= buflen);
+		p += len;
+		buflen -= len;
+	} else if (!strcmp(clnt->cl_program->name, "nfs4_cb")) {
+		len = snprintf(p, buflen, "service=nfs ");
+		BUG_ON(len >= buflen);
+		p += len;
+		buflen -= len;
+	}
+	if (mech->gm_upcall_enctypes) {
+		len = snprintf(p, buflen, "enctypes=%s ", mech->gm_upcall_enctypes);
+		BUG_ON(len >= buflen);
+		p += len;
+		buflen -= len;
+	}
+	len = snprintf(p, buflen, "\n");
+	BUG_ON(len >= buflen);
+	p += len;
+
+	gss_msg->msg.data = gss_msg->databuf;
+	/* The terminating NUL is not part of the message sent to the daemon. */
+	gss_msg->msg.len = p - gss_msg->databuf;
+}
+
+/*
+ * Fill in the upcall payload using the format that matches the currently
+ * negotiated pipe version: raw uid for version 0, text line otherwise.
+ */
+static void gss_encode_msg(struct gss_upcall_msg *gss_msg,
+				struct rpc_clnt *clnt, int machine_cred)
+{
+	if (pipe_version != 0) {
+		/* pipe_version == 1 */
+		gss_encode_v1_msg(gss_msg, clnt, machine_cred);
+		return;
+	}
+	gss_encode_v0_msg(gss_msg);
+}
+
+/*
+ * Allocate and initialise an upcall message for uid.
+ *
+ * Takes a pipe-version reference (dropped again by the final
+ * gss_release_msg()) and binds the message to the pipe inode of that
+ * version.  The message starts with a reference count of one.
+ *
+ * Returns the message, ERR_PTR(-ENOMEM) on allocation failure, or
+ * ERR_PTR(-EAGAIN) when no pipe version has been established yet.
+ */
+static inline struct gss_upcall_msg *
+gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt,
+		int machine_cred)
+{
+	struct gss_upcall_msg *gss_msg;
+	int vers;
+
+	gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
+	if (gss_msg == NULL)
+		return ERR_PTR(-ENOMEM);
+	vers = get_pipe_version();
+	if (vers < 0) {
+		kfree(gss_msg);
+		return ERR_PTR(vers);
+	}
+	/* dentry[] is indexed by pipe version: [0] old pipe, [1] "gssd" pipe */
+	gss_msg->inode = RPC_I(gss_auth->dentry[vers]->d_inode);
+	INIT_LIST_HEAD(&gss_msg->list);
+	rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq");
+	init_waitqueue_head(&gss_msg->waitqueue);
+	atomic_set(&gss_msg->count, 1);
+	gss_msg->uid = uid;
+	gss_msg->auth = gss_auth;
+	gss_encode_msg(gss_msg, clnt, machine_cred);
+	return gss_msg;
+}
+
+/*
+ * Get an upcall message for cred's uid, queueing a new one on the pipe if
+ * none is pending.
+ *
+ * Returns a referenced message (either the newly queued one or the one that
+ * was already pending for this uid), or an ERR_PTR: -ENOMEM, -EAGAIN when
+ * no pipe version is known yet, or the error from rpc_queue_upcall().
+ */
+static struct gss_upcall_msg *
+gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred)
+{
+	struct gss_cred *gss_cred = container_of(cred,
+			struct gss_cred, gc_base);
+	struct gss_upcall_msg *gss_new, *gss_msg;
+	uid_t uid = cred->cr_uid;
+
+	gss_new = gss_alloc_msg(gss_auth, uid, clnt, gss_cred->gc_machine_cred);
+	if (IS_ERR(gss_new))
+		return gss_new;
+	gss_msg = gss_add_msg(gss_new);
+	if (gss_msg == gss_new) {
+		struct inode *inode = &gss_new->inode->vfs_inode;
+		int res = rpc_queue_upcall(inode, &gss_new->msg);
+		if (res) {
+			gss_unhash_msg(gss_new);
+			/*
+			 * Unhashing only drops the list's reference.  The
+			 * caller gets an ERR_PTR and can never release the
+			 * message, so drop the allocation reference here;
+			 * otherwise the message and the pipe-user count it
+			 * pins are leaked.
+			 */
+			gss_release_msg(gss_new);
+			gss_msg = ERR_PTR(res);
+		}
+	} else
+		gss_release_msg(gss_new);
+	return gss_msg;
+}
+
+/*
+ * Log a warning that the upcall timed out, at most once per 15 seconds.
+ */
+static void warn_gssd(void)
+{
+	static unsigned long ratelimit;
+	unsigned long now = jiffies;
+
+	if (!time_after(now, ratelimit))
+		return;
+
+	printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n"
+			"Please check user daemon is running.\n");
+	ratelimit = now + 15*HZ;
+}
+
+/*
+ * Asynchronous (RPC task) variant of the context upcall.
+ *
+ * Sets up or joins the upcall for the task's credential.  If the result is
+ * not available yet the task is put to sleep on the relevant wait queue and
+ * 0 is returned; if a result is already recorded on the message it is
+ * applied immediately and the message's errno is returned.  When no pipe
+ * version is known the task sleeps on pipe_version_rpc_waitqueue with a
+ * 15 second timeout and -EAGAIN is returned.
+ */
+static inline int
+gss_refresh_upcall(struct rpc_task *task)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	struct gss_auth *gss_auth = container_of(cred->cr_auth,
+			struct gss_auth, rpc_auth);
+	struct gss_cred *gss_cred = container_of(cred,
+			struct gss_cred, gc_base);
+	struct gss_upcall_msg *gss_msg;
+	struct inode *inode;
+	int err = 0;
+
+	dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid,
+								cred->cr_uid);
+	gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred);
+	if (PTR_ERR(gss_msg) == -EAGAIN) {
+		/* XXX: warning on the first, under the assumption we
+		 * shouldn't normally hit this case on a refresh. */
+		warn_gssd();
+		task->tk_timeout = 15*HZ;
+		rpc_sleep_on(&pipe_version_rpc_waitqueue, task, NULL);
+		return -EAGAIN;
+	}
+	if (IS_ERR(gss_msg)) {
+		err = PTR_ERR(gss_msg);
+		goto out;
+	}
+	inode = &gss_msg->inode->vfs_inode;
+	spin_lock(&inode->i_lock);
+	if (gss_cred->gc_upcall != NULL)
+		/* another task already owns the upcall for this cred: wait on it */
+		rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL);
+	else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) {
+		/* no result yet: become the owner and sleep until woken */
+		task->tk_timeout = 0;
+		gss_cred->gc_upcall = gss_msg;
+		/* gss_upcall_callback will release the reference to gss_upcall_msg */
+		atomic_inc(&gss_msg->count);
+		rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback);
+	} else {
+		/* a context or an error is already recorded on the message */
+		gss_handle_downcall_result(gss_cred, gss_msg);
+		err = gss_msg->msg.errno;
+	}
+	spin_unlock(&inode->i_lock);
+	gss_release_msg(gss_msg);
+out:
+	dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n",
+			task->tk_pid, cred->cr_uid, err);
+	return err;
+}
+
+/*
+ * Synchronous (process context) variant of the context upcall.
+ *
+ * Sets up or joins the upcall for the credential's uid and sleeps killably
+ * until a downcall supplies a context or an error is recorded on the
+ * message.  If no pipe version has been negotiated yet, waits up to
+ * 15 seconds for a pipe to be opened and then retries.
+ *
+ * Returns 0 once the context has been installed in the credential, or a
+ * negative errno: -EACCES if no pipe was opened in time, -ERESTARTSYS on a
+ * signal, otherwise the error from setup or the one recorded on the upcall.
+ */
+static inline int
+gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
+{
+	struct inode *inode;
+	struct rpc_cred *cred = &gss_cred->gc_base;
+	struct gss_upcall_msg *gss_msg;
+	DEFINE_WAIT(wait);
+	int err;
+
+	dprintk("RPC:       gss_upcall for uid %u\n", cred->cr_uid);
+retry:
+	/* Forget any "time remaining" value left over from the wait below. */
+	err = 0;
+	gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred);
+	if (PTR_ERR(gss_msg) == -EAGAIN) {
+		err = wait_event_interruptible_timeout(pipe_version_waitqueue,
+				pipe_version >= 0, 15*HZ);
+		if (pipe_version < 0) {
+			warn_gssd();
+			err = -EACCES;
+		}
+		/*
+		 * A positive return means the pipe was opened before the
+		 * timeout expired.  That is success: retry the upcall
+		 * instead of handing the remaining jiffies back to the
+		 * caller as if they were an error code.
+		 */
+		if (err < 0)
+			goto out;
+		goto retry;
+	}
+	if (IS_ERR(gss_msg)) {
+		err = PTR_ERR(gss_msg);
+		goto out;
+	}
+	inode = &gss_msg->inode->vfs_inode;
+	for (;;) {
+		prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE);
+		spin_lock(&inode->i_lock);
+		/* leave the loop with i_lock held once a result is available */
+		if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) {
+			break;
+		}
+		spin_unlock(&inode->i_lock);
+		if (fatal_signal_pending(current)) {
+			err = -ERESTARTSYS;
+			goto out_intr;
+		}
+		schedule();
+	}
+	if (gss_msg->ctx)
+		gss_cred_set_ctx(cred, gss_msg->ctx);
+	else
+		err = gss_msg->msg.errno;
+	spin_unlock(&inode->i_lock);
+out_intr:
+	finish_wait(&gss_msg->waitqueue, &wait);
+	gss_release_msg(gss_msg);
+out:
+	dprintk("RPC:       gss_create_upcall for uid %u result %d\n",
+			cred->cr_uid, err);
+	return err;
+}
+
+/*
+ * Pipe read handler: copy the not-yet-read part of an upcall message to
+ * user space.
+ *
+ * Copies at most buflen bytes starting at offset msg->copied and advances
+ * msg->copied by the number of bytes actually transferred.
+ *
+ * Returns the number of bytes copied, or -EFAULT (also recorded in
+ * msg->errno) if nothing could be copied.
+ */
+static ssize_t
+gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+		char __user *dst, size_t buflen)
+{
+	char *data = (char *)msg->data + msg->copied;
+	/*
+	 * Only the bytes that have not been read yet may be copied.  Bounding
+	 * by msg->len alone would let a second, partial read run past the end
+	 * of the message buffer.
+	 */
+	size_t mlen = min(msg->len - msg->copied, buflen);
+	unsigned long left;
+
+	left = copy_to_user(dst, data, mlen);
+	if (left == mlen) {
+		msg->errno = -EFAULT;
+		return -EFAULT;
+	}
+
+	mlen -= left;
+	msg->copied += mlen;
+	msg->errno = 0;
+	return mlen;
+}
+
+/* Upper bound on the size of a single downcall write. */
+#define MSG_BUF_MAXSIZE 1024
+
+/*
+ * Pipe write handler: process a reply from the user-space daemon.
+ *
+ * The buffer starts with the uid the reply is for, followed by the data
+ * parsed by gss_fill_context().  The pending upcall for that uid is looked
+ * up, given either the new context or an error code, unhashed and its
+ * waiters woken.
+ *
+ * Returns mlen when the downcall was consumed (this includes -EACCES and
+ * -EKEYEXPIRED replies, which are passed on to the waiters through
+ * msg.errno), or a negative errno otherwise.
+ */
+static ssize_t
+gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+	const void *p, *end;
+	void *buf;
+	struct gss_upcall_msg *gss_msg;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct gss_cl_ctx *ctx;
+	uid_t uid;
+	ssize_t err = -EFBIG;
+
+	if (mlen > MSG_BUF_MAXSIZE)
+		goto out;
+	err = -ENOMEM;
+	buf = kmalloc(mlen, GFP_NOFS);
+	if (!buf)
+		goto out;
+
+	err = -EFAULT;
+	if (copy_from_user(buf, src, mlen))
+		goto err;
+
+	end = (const void *)((char *)buf + mlen);
+	p = simple_get_bytes(buf, end, &uid, sizeof(uid));
+	if (IS_ERR(p)) {
+		err = PTR_ERR(p);
+		goto err;
+	}
+
+	err = -ENOMEM;
+	ctx = gss_alloc_context();
+	if (ctx == NULL)
+		goto err;
+
+	err = -ENOENT;
+	/* Find a matching upcall */
+	spin_lock(&inode->i_lock);
+	gss_msg = __gss_find_upcall(RPC_I(inode), uid);
+	if (gss_msg == NULL) {
+		spin_unlock(&inode->i_lock);
+		goto err_put_ctx;
+	}
+	/* off the list while the context is parsed without the lock held */
+	list_del_init(&gss_msg->list);
+	spin_unlock(&inode->i_lock);
+
+	p = gss_fill_context(p, end, ctx, gss_msg->auth->mech);
+	if (IS_ERR(p)) {
+		err = PTR_ERR(p);
+		switch (err) {
+		case -EACCES:
+		case -EKEYEXPIRED:
+			/* forwarded to the waiters; the write itself succeeds */
+			gss_msg->msg.errno = err;
+			err = mlen;
+			break;
+		case -EFAULT:
+		case -ENOMEM:
+		case -EINVAL:
+		case -ENOSYS:
+			/* waiters see -EAGAIN; the writer gets the original error */
+			gss_msg->msg.errno = -EAGAIN;
+			break;
+		default:
+			printk(KERN_CRIT "%s: bad return from "
+				"gss_fill_context: %zd\n", __func__, err);
+			BUG();
+		}
+		goto err_release_msg;
+	}
+	gss_msg->ctx = gss_get_ctx(ctx);
+	err = mlen;
+
+err_release_msg:
+	spin_lock(&inode->i_lock);
+	__gss_unhash_msg(gss_msg);
+	spin_unlock(&inode->i_lock);
+	/* drop the reference taken by __gss_find_upcall() */
+	gss_release_msg(gss_msg);
+err_put_ctx:
+	gss_put_ctx(ctx);
+err:
+	kfree(buf);
+out:
+	dprintk("RPC:       gss_pipe_downcall returning %Zd\n", err);
+	return err;
+}
+
+/*
+ * Common open handler for both gss pipes.
+ *
+ * The first open fixes the pipe version and wakes everybody waiting for a
+ * version to appear.  Later opens must use the same version; opening the
+ * other pipe fails with -EBUSY.  Every successful open counts as a pipe
+ * user.  Returns 0 or -EBUSY.
+ */
+static int gss_pipe_open(struct inode *inode, int new_version)
+{
+	int status = 0;
+
+	spin_lock(&pipe_version_lock);
+	if (pipe_version >= 0 && pipe_version != new_version) {
+		/* Trying to open a pipe of a different version */
+		status = -EBUSY;
+	} else {
+		if (pipe_version < 0) {
+			/* First open of any gss pipe determines the version: */
+			pipe_version = new_version;
+			rpc_wake_up(&pipe_version_rpc_waitqueue);
+			wake_up(&pipe_version_waitqueue);
+		}
+		atomic_inc(&pipe_users);
+	}
+	spin_unlock(&pipe_version_lock);
+	return status;
+}
+
+/* open_pipe handler of the version 0 pipe: open as pipe version 0. */
+static int gss_pipe_open_v0(struct inode *inode)
+{
+	return gss_pipe_open(inode, 0);
+}
+
+/* open_pipe handler of the version 1 pipe: open as pipe version 1. */
+static int gss_pipe_open_v1(struct inode *inode)
+{
+	return gss_pipe_open(inode, 1);
+}
+
+/*
+ * release_pipe handler.
+ *
+ * Fails every upcall on the in_downcall list whose pipe message is no
+ * longer queued (msg.list empty) with -EPIPE, waking its waiters, and then
+ * drops this opener's pipe-version reference.
+ */
+static void
+gss_pipe_release(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	struct gss_upcall_msg *gss_msg;
+
+restart:
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
+
+		if (!list_empty(&gss_msg->msg.list))
+			continue;
+		gss_msg->msg.errno = -EPIPE;
+		/* hold a reference across the unhash and the unlock */
+		atomic_inc(&gss_msg->count);
+		__gss_unhash_msg(gss_msg);
+		spin_unlock(&inode->i_lock);
+		gss_release_msg(gss_msg);
+		/* the list changed and the lock was dropped: rescan from the head */
+		goto restart;
+	}
+	spin_unlock(&inode->i_lock);
+
+	put_pipe_version();
+}
+
+/*
+ * destroy_msg handler.  A pipe message that ends with an error takes its
+ * upcall off the in_downcall list (waking the waiters); a timeout also
+ * triggers the rate-limited "daemon not running" warning.  Messages without
+ * an error are left alone.
+ */
+static void
+gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg);
+
+	if (msg->errno >= 0)
+		return;
+
+	dprintk("RPC:       gss_pipe_destroy_msg releasing msg %p\n",
+			gss_msg);
+	/* keep the message alive while it is being unhashed */
+	atomic_inc(&gss_msg->count);
+	gss_unhash_msg(gss_msg);
+	if (msg->errno == -ETIMEDOUT)
+		warn_gssd();
+	gss_release_msg(gss_msg);
+}
+
+/*
+ * Create the RPCSEC_GSS authenticator for clnt.
+ *
+ * Resolves the pseudoflavor to a mechanism and service, creates both upcall
+ * pipes under the client's rpc_pipefs directory and initialises the
+ * credential cache.  A module reference is held for the lifetime of the
+ * authenticator.
+ *
+ * Returns the new rpc_auth or an ERR_PTR (-ENOMEM, -EINVAL for an unknown
+ * pseudoflavor or service, or the error from pipe / cred cache setup).
+ *
+ * NOTE: we have the opportunity to use different
+ * parameters based on the input flavor (which must be a pseudoflavor)
+ */
+static struct rpc_auth *
+gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+{
+	struct gss_auth *gss_auth;
+	struct rpc_auth * auth;
+	int err = -ENOMEM; /* XXX? */
+
+	dprintk("RPC:       creating GSS authenticator for client %p\n", clnt);
+
+	if (!try_module_get(THIS_MODULE))
+		return ERR_PTR(err);
+	if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
+		goto out_dec;
+	gss_auth->client = clnt;
+	err = -EINVAL;
+	gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
+	if (!gss_auth->mech) {
+		printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n",
+				__func__, flavor);
+		goto err_free;
+	}
+	gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
+	if (gss_auth->service == 0)
+		goto err_put_mech;
+	auth = &gss_auth->rpc_auth;
+	auth->au_cslack = GSS_CRED_SLACK >> 2;
+	auth->au_rslack = GSS_VERF_SLACK >> 2;
+	auth->au_ops = &authgss_ops;
+	auth->au_flavor = flavor;
+	atomic_set(&auth->au_count, 1);
+	kref_init(&gss_auth->kref);
+
+	/*
+	 * Note: if we created the old pipe first, then someone who
+	 * examined the directory at the right moment might conclude
+	 * that we supported only the old pipe.  So we instead create
+	 * the new pipe first.
+	 */
+	gss_auth->dentry[1] = rpc_mkpipe(clnt->cl_path.dentry,
+					 "gssd",
+					 clnt, &gss_upcall_ops_v1,
+					 RPC_PIPE_WAIT_FOR_OPEN);
+	if (IS_ERR(gss_auth->dentry[1])) {
+		err = PTR_ERR(gss_auth->dentry[1]);
+		goto err_put_mech;
+	}
+
+	/* the old (version 0) pipe is named after the mechanism */
+	gss_auth->dentry[0] = rpc_mkpipe(clnt->cl_path.dentry,
+					 gss_auth->mech->gm_name,
+					 clnt, &gss_upcall_ops_v0,
+					 RPC_PIPE_WAIT_FOR_OPEN);
+	if (IS_ERR(gss_auth->dentry[0])) {
+		err = PTR_ERR(gss_auth->dentry[0]);
+		goto err_unlink_pipe_1;
+	}
+	err = rpcauth_init_credcache(auth);
+	if (err)
+		goto err_unlink_pipe_0;
+
+	return auth;
+	/* error unwinding, in reverse order of setup */
+err_unlink_pipe_0:
+	rpc_unlink(gss_auth->dentry[0]);
+err_unlink_pipe_1:
+	rpc_unlink(gss_auth->dentry[1]);
+err_put_mech:
+	gss_mech_put(gss_auth->mech);
+err_free:
+	kfree(gss_auth);
+out_dec:
+	module_put(THIS_MODULE);
+	return ERR_PTR(err);
+}
+
+/*
+ * Final teardown of an authenticator: remove both upcall pipes, drop the
+ * mechanism reference, free the structure and release the module reference
+ * taken in gss_create().
+ */
+static void
+gss_free(struct gss_auth *gss_auth)
+{
+	rpc_unlink(gss_auth->dentry[1]);
+	rpc_unlink(gss_auth->dentry[0]);
+	gss_mech_put(gss_auth->mech);
+
+	kfree(gss_auth);
+	module_put(THIS_MODULE);
+}
+
+/* kref release function: frees the gss_auth that embeds kref. */
+static void
+gss_free_callback(struct kref *kref)
+{
+	gss_free(container_of(kref, struct gss_auth, kref));
+}
+
+/*
+ * rpc_authops->destroy: tear down the credential cache and drop the
+ * authenticator's own kref.  The gss_auth itself is freed once the last
+ * kref holder (see gss_create_cred()) has gone away.
+ */
+static void
+gss_destroy(struct rpc_auth *auth)
+{
+	struct gss_auth *gss_auth;
+
+	dprintk("RPC:       destroying GSS authenticator %p flavor %d\n",
+			auth, auth->au_flavor);
+
+	rpcauth_destroy_credcache(auth);
+
+	gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+	kref_put(&gss_auth->kref, gss_free_callback);
+}
+
+/*
+ * gss_destroying_context will cause the RPCSEC_GSS to send a NULL RPC call
+ * to the server with the GSS control procedure field set to
+ * RPC_GSS_PROC_DESTROY. This should normally cause the server to release
+ * all RPCSEC_GSS state associated with that context.
+ *
+ * Returns 0 if there is nothing to destroy (no context, or the credential
+ * is not up to date), in which case the caller must free the credential
+ * itself.  Returns 1 after the destroy call has been attempted; the
+ * credential has then been switched to gss_nullops, so its final release
+ * goes through gss_destroy_nullcred().
+ */
+static int
+gss_destroying_context(struct rpc_cred *cred)
+{
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+	struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+	struct rpc_task *task;
+
+	if (gss_cred->gc_ctx == NULL ||
+	    test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+		return 0;
+
+	gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+	cred->cr_ops = &gss_nullops;
+
+	/* Take a reference to ensure the cred will be destroyed either
+	 * by the RPC call or by the put_rpccred() below */
+	get_rpccred(cred);
+
+	task = rpc_call_null(gss_auth->client, cred, RPC_TASK_ASYNC|RPC_TASK_SOFT);
+	if (!IS_ERR(task))
+		rpc_put_task(task);
+
+	put_rpccred(cred);
+	return 1;
+}
+
+/* gss_destroy_cred (and gss_free_ctx) are used to clean up after failure
+ * to create a new cred or context, so they check that things have been
+ * allocated before freeing them. */
+/*
+ * Immediately free a client context: delete the mechanism's security
+ * context, then free the wire context data and the structure itself.
+ */
+static void
+gss_do_free_ctx(struct gss_cl_ctx *ctx)
+{
+	dprintk("RPC:       gss_free_ctx\n");
+
+	gss_delete_sec_context(&ctx->gc_gss_ctx);
+	kfree(ctx->gc_wire_ctx.data);
+	kfree(ctx);
+}
+
+/* RCU callback: free the context that embeds head as its gc_rcu member. */
+static void
+gss_free_ctx_callback(struct rcu_head *head)
+{
+	gss_do_free_ctx(container_of(head, struct gss_cl_ctx, gc_rcu));
+}
+
+/* Free ctx through call_rcu(), i.e. only after an RCU grace period. */
+static void
+gss_free_ctx(struct gss_cl_ctx *ctx)
+{
+	call_rcu(&ctx->gc_rcu, gss_free_ctx_callback);
+}
+
+/* Free the memory of a gss_cred (the context is released separately). */
+static void
+gss_free_cred(struct gss_cred *gss_cred)
+{
+	dprintk("RPC:       gss_free_cred %p\n", gss_cred);
+	kfree(gss_cred);
+}
+
+/* RCU callback: free the gss_cred whose gc_base.cr_rcu member is head. */
+static void
+gss_free_cred_callback(struct rcu_head *head)
+{
+	gss_free_cred(container_of(head, struct gss_cred, gc_base.cr_rcu));
+}
+
+/*
+ * Destroy a credential without contacting the server: detach its context,
+ * free the credential after an RCU grace period, drop the context reference
+ * and release the gss_auth reference taken in gss_create_cred().
+ */
+static void
+gss_destroy_nullcred(struct rpc_cred *cred)
+{
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+	struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+	struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+
+	rcu_assign_pointer(gss_cred->gc_ctx, NULL);
+	call_rcu(&cred->cr_rcu, gss_free_cred_callback);
+	if (ctx)
+		gss_put_ctx(ctx);
+	kref_put(&gss_auth->kref, gss_free_callback);
+}
+
+/*
+ * crdestroy for regular credentials.  If a context-destroy call was issued
+ * to the server, the credential is released later through gss_nullops;
+ * otherwise it is torn down right away.
+ */
+static void
+gss_destroy_cred(struct rpc_cred *cred)
+{
+	if (!gss_destroying_context(cred))
+		gss_destroy_nullcred(cred);
+}
+
+/*
+ * Lookup RPCSEC_GSS cred for the current process
+ *
+ * Thin wrapper that delegates to the generic credential cache lookup.
+ */
+static struct rpc_cred *
+gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	return rpcauth_lookup_credcache(auth, acred, flags);
+}
+
+/*
+ * rpc_authops->crcreate: allocate a new, not yet usable credential.
+ *
+ * The credential is flagged NEW only, carries the authenticator's service
+ * and the caller's machine_cred setting, and pins the gss_auth with a kref.
+ * Returns the embedded rpc_cred or ERR_PTR(-ENOMEM).
+ */
+static struct rpc_cred *
+gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+	struct gss_cred	*new;
+	int err = -ENOMEM;
+
+	dprintk("RPC:       gss_create_cred for uid %d, flavor %d\n",
+		acred->uid, auth->au_flavor);
+
+	new = kzalloc(sizeof(*new), GFP_NOFS);
+	if (new == NULL) {
+		dprintk("RPC:       gss_create_cred failed with error %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	rpcauth_init_cred(&new->gc_base, acred, auth, &gss_credops);
+	/*
+	 * RPCAUTH_CRED_UPTODATE is deliberately left clear so that the
+	 * first use of the credential is forced through call_refresh().
+	 */
+	new->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW;
+	new->gc_service = gss_auth->service;
+	new->gc_machine_cred = acred->machine_cred;
+	kref_get(&gss_auth->kref);
+	return &new->gc_base;
+}
+
+/*
+ * cr_init: obtain a context for a new credential through a synchronous
+ * upcall, repeating the upcall for as long as it reports -EAGAIN.
+ * Returns the first result that is not -EAGAIN.
+ */
+static int
+gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
+{
+	struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth);
+	struct gss_cred *gss_cred = container_of(cred,struct gss_cred, gc_base);
+	int status;
+
+	for (;;) {
+		status = gss_create_upcall(gss_auth, gss_cred);
+		if (status != -EAGAIN)
+			return status;
+	}
+}
+
+/*
+ * crmatch: decide whether the cached credential rc can serve acred.
+ *
+ * Credentials still flagged NEW skip the freshness checks; all others must
+ * be unexpired and UPTODATE.  In both cases the machine_cred setting and
+ * the uid must agree.  Returns non-zero on a match.
+ */
+static int
+gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
+{
+	struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+
+	if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
+		goto out;
+	/* Don't match with creds that have expired. */
+	if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+		return 0;
+	if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
+		return 0;
+out:
+	if (acred->machine_cred != gss_cred->gc_machine_cred)
+		return 0;
+	return rc->cr_uid == acred->uid;
+}
+
+/*
+ * Marshal credentials.
+ * Maybe we should keep a cached credential for performance reasons.
+ *
+ * Writes the RPCSEC_GSS credential (flavor, length, version, procedure,
+ * sequence number, service, wire context handle) at p, followed by the
+ * verifier: a MIC computed over the request from the xid up to the end of
+ * the credential.  A fresh sequence number is taken from the context under
+ * gc_seq_lock and remembered in the request.
+ *
+ * Returns the position after the verifier, or NULL if the MIC could not be
+ * computed.  An expired context only clears RPCAUTH_CRED_UPTODATE; the
+ * request is still marshalled.
+ */
+static __be32 *
+gss_marshal(struct rpc_task *task, __be32 *p)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_cred *cred = req->rq_cred;
+	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
+						 gc_base);
+	struct gss_cl_ctx	*ctx = gss_cred_get_ctx(cred);
+	__be32		*cred_len;
+	u32             maj_stat = 0;
+	struct xdr_netobj mic;
+	struct kvec	iov;
+	struct xdr_buf	verf_buf;
+
+	dprintk("RPC: %5u gss_marshal\n", task->tk_pid);
+
+	*p++ = htonl(RPC_AUTH_GSS);
+	/* reserve the length word; it is filled in once the body is known */
+	cred_len = p++;
+
+	spin_lock(&ctx->gc_seq_lock);
+	req->rq_seqno = ctx->gc_seq++;
+	spin_unlock(&ctx->gc_seq_lock);
+
+	*p++ = htonl((u32) RPC_GSS_VERSION);
+	*p++ = htonl((u32) ctx->gc_proc);
+	*p++ = htonl((u32) req->rq_seqno);
+	*p++ = htonl((u32) gss_cred->gc_service);
+	p = xdr_encode_netobj(p, &ctx->gc_wire_ctx);
+	/* p and cred_len are word pointers; << 2 converts words to bytes */
+	*cred_len = htonl((p - (cred_len + 1)) << 2);
+
+	/* We compute the checksum for the verifier over the xdr-encoded bytes
+	 * starting with the xid and ending at the end of the credential: */
+	iov.iov_base = xprt_skip_transport_header(task->tk_xprt,
+					req->rq_snd_buf.head[0].iov_base);
+	iov.iov_len = (u8 *)p - (u8 *)iov.iov_base;
+	xdr_buf_from_iov(&iov, &verf_buf);
+
+	/* set verifier flavor*/
+	*p++ = htonl(RPC_AUTH_GSS);
+
+	/* the MIC is written in place, right after its (still unset) length word */
+	mic.data = (u8 *)(p + 1);
+	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED) {
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	} else if (maj_stat != 0) {
+		printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
+		goto out_put_ctx;
+	}
+	/* NULL source: only the length word and the advance past the MIC are wanted */
+	p = xdr_encode_opaque(p, NULL, mic.len);
+	gss_put_ctx(ctx);
+	return p;
+out_put_ctx:
+	gss_put_ctx(ctx);
+	return NULL;
+}
+
+/*
+ * Replace the task's credential with a new one for the same uid and
+ * machine_cred setting, looked up with RPCAUTH_LOOKUP_NEW.  The old
+ * credential's reference is dropped.  Returns 0 or the lookup error.
+ */
+static int gss_renew_cred(struct rpc_task *task)
+{
+	struct rpc_cred *oldcred = task->tk_rqstp->rq_cred;
+	struct gss_cred *gss_cred = container_of(oldcred,
+						 struct gss_cred,
+						 gc_base);
+	struct rpc_auth *auth = oldcred->cr_auth;
+	struct auth_cred acred = {
+		.uid = oldcred->cr_uid,
+		.machine_cred = gss_cred->gc_machine_cred,
+	};
+	struct rpc_cred *new;
+
+	new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+	task->tk_rqstp->rq_cred = new;
+	put_rpccred(oldcred);
+	return 0;
+}
+
+/*
+ * Return 1 if cred is flagged NEGATIVE and the retry delay
+ * (gss_expired_cred_retry_delay seconds, counted from the last upcall
+ * timestamp) has not yet elapsed; 0 otherwise.
+ */
+static int gss_cred_is_negative_entry(struct rpc_cred *cred)
+{
+	struct gss_cred *gss_cred;
+	unsigned long now, start, end;
+
+	if (!test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags))
+		return 0;
+
+	now = jiffies;
+	gss_cred = container_of(cred, struct gss_cred, gc_base);
+	start = gss_cred->gc_upcall_timestamp;
+	end = start + gss_expired_cred_retry_delay * HZ;
+
+	if (time_in_range_open(now, start, end))
+		return 1;
+	return 0;
+}
+
+/*
+ * Refresh credentials. XXX - finish
+ *
+ * Fails with -EKEYEXPIRED while the credential is a still-valid negative
+ * entry.  A credential that is neither NEW nor UPTODATE is first replaced
+ * by a new one; a NEW credential then gets its context through an upcall.
+ */
+static int
+gss_refresh(struct rpc_task *task)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	int ret = 0;
+
+	if (gss_cred_is_negative_entry(cred))
+		return -EKEYEXPIRED;
+
+	if (!test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
+			!test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags)) {
+		ret = gss_renew_cred(task);
+		if (ret < 0)
+			goto out;
+		/* gss_renew_cred() swapped the request's credential */
+		cred = task->tk_rqstp->rq_cred;
+	}
+
+	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
+		ret = gss_refresh_upcall(task);
+out:
+	return ret;
+}
+
+/*
+ * Dummy refresh routine: used only when destroying the context.
+ * Always fails with -EACCES.
+ */
+static int
+gss_refresh_null(struct rpc_task *task)
+{
+	return -EACCES;
+}
+
+/*
+ * crvalidate: check the verifier of a reply.
+ *
+ * The verifier must be of flavor RPC_AUTH_GSS, no longer than
+ * RPC_MAX_AUTH_SIZE, and contain a valid MIC over the request's sequence
+ * number in network byte order.
+ *
+ * Returns the position after the verifier, or NULL if it is rejected.  An
+ * expired context additionally clears RPCAUTH_CRED_UPTODATE.
+ */
+static __be32 *
+gss_validate(struct rpc_task *task, __be32 *p)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+	__be32		seq;
+	struct kvec	iov;
+	struct xdr_buf	verf_buf;
+	struct xdr_netobj mic;
+	u32		flav,len;
+	u32		maj_stat;
+
+	dprintk("RPC: %5u gss_validate\n", task->tk_pid);
+
+	flav = ntohl(*p++);
+	if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE)
+		goto out_bad;
+	if (flav != RPC_AUTH_GSS)
+		goto out_bad;
+	/* the signed data is just the 32-bit sequence number of the request */
+	seq = htonl(task->tk_rqstp->rq_seqno);
+	iov.iov_base = &seq;
+	iov.iov_len = sizeof(seq);
+	xdr_buf_from_iov(&iov, &verf_buf);
+	mic.data = (u8 *)p;
+	mic.len = len;
+
+	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic);
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	if (maj_stat) {
+		dprintk("RPC: %5u gss_validate: gss_verify_mic returned "
+				"error 0x%08x\n", task->tk_pid, maj_stat);
+		goto out_bad;
+	}
+	/* We leave it to unwrap to calculate au_rslack. For now we just
+	 * calculate the length of the verifier: */
+	cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
+	gss_put_ctx(ctx);
+	dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n",
+			task->tk_pid);
+	return p + XDR_QUADLEN(len);
+out_bad:
+	gss_put_ctx(ctx);
+	dprintk("RPC: %5u gss_validate failed.\n", task->tk_pid);
+	return NULL;
+}
+
+/*
+ * Run the caller-supplied XDR encoder for obj on the request's send
+ * buffer, starting at position p.
+ */
+static void gss_wrap_req_encode(kxdreproc_t encode, struct rpc_rqst *rqstp,
+				__be32 *p, void *obj)
+{
+	struct xdr_stream xdr;
+
+	xdr_init_encode(&xdr, &rqstp->rq_snd_buf, p);
+	encode(rqstp, &xdr, obj);
+}
+
+/*
+ * Encode a request with the integrity service.
+ *
+ * Layout written at p: length of the integrity-protected data, the
+ * sequence number, the encoded arguments, then a MIC computed over
+ * sequence number plus arguments and appended as an opaque.
+ *
+ * Returns 0 on success (an expired context only clears
+ * RPCAUTH_CRED_UPTODATE) or -EIO on failure.
+ */
+static inline int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+		   kxdreproc_t encode, struct rpc_rqst *rqstp,
+		   __be32 *p, void *obj)
+{
+	struct xdr_buf	*snd_buf = &rqstp->rq_snd_buf;
+	struct xdr_buf	integ_buf;
+	__be32          *integ_len = NULL;
+	struct xdr_netobj mic;
+	u32		offset;
+	__be32		*q;
+	struct kvec	*iov;
+	u32             maj_stat = 0;
+	int		status = -EIO;
+
+	/* reserve the length word; filled in after the arguments are encoded */
+	integ_len = p++;
+	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+	*p++ = htonl(rqstp->rq_seqno);
+
+	gss_wrap_req_encode(encode, rqstp, p, obj);
+
+	if (xdr_buf_subsegment(snd_buf, &integ_buf,
+				offset, snd_buf->len - offset))
+		return status;
+	*integ_len = htonl(integ_buf.len);
+
+	/* guess whether we're in the head or the tail: */
+	if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+		iov = snd_buf->tail;
+	else
+		iov = snd_buf->head;
+	p = iov->iov_base + iov->iov_len;
+	/* the MIC is written in place, right after its length word */
+	mic.data = (u8 *)(p + 1);
+
+	maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+	status = -EIO; /* XXX? */
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	else if (maj_stat)
+		return status;
+	q = xdr_encode_opaque(p, NULL, mic.len);
+
+	/* account for the appended MIC in both the kvec and the buffer length */
+	offset = (u8 *)q - (u8 *)p;
+	iov->iov_len += offset;
+	snd_buf->len += offset;
+	return 0;
+}
+
+/*
+ * Free the pages allocated by alloc_enc_pages() and the array holding
+ * them.  Installed as the request's rq_release_snd_buf hook.
+ */
+static void
+priv_release_snd_buf(struct rpc_rqst *rqstp)
+{
+	int i;
+
+	for (i=0; i < rqstp->rq_enc_pages_num; i++)
+		__free_page(rqstp->rq_enc_pages[i]);
+	kfree(rqstp->rq_enc_pages);
+}
+
+/*
+ * Allocate the pages that will receive the encrypted copy of the request's
+ * page data: one per page spanned by the send buffer's page data, plus one
+ * extra page (gss_wrap_req_priv() uses the last page for the tail).
+ *
+ * Nothing is allocated when the send buffer has no page data.
+ * Returns 0 on success or -EAGAIN if memory could not be allocated.
+ */
+static int
+alloc_enc_pages(struct rpc_rqst *rqstp)
+{
+	struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+	int first, last, i;
+
+	if (snd_buf->page_len == 0) {
+		rqstp->rq_enc_pages_num = 0;
+		return 0;
+	}
+
+	first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+	last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
+	rqstp->rq_enc_pages_num = last - first + 1 + 1;
+	rqstp->rq_enc_pages
+		= kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
+				GFP_NOFS);
+	if (!rqstp->rq_enc_pages)
+		goto out;
+	for (i=0; i < rqstp->rq_enc_pages_num; i++) {
+		rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
+		if (rqstp->rq_enc_pages[i] == NULL)
+			goto out_free;
+	}
+	rqstp->rq_release_snd_buf = priv_release_snd_buf;
+	return 0;
+out_free:
+	/* only the first i pages were allocated; free exactly those */
+	rqstp->rq_enc_pages_num = i;
+	priv_release_snd_buf(rqstp);
+out:
+	return -EAGAIN;
+}
+
+/*
+ * Encode a request with the privacy service.
+ *
+ * Layout written at p: length of the wrapped data, then the result of
+ * gss_wrap() over sequence number plus encoded arguments, zero-padded to a
+ * multiple of four bytes.  The send buffer's page list is switched to
+ * freshly allocated pages; the original pages are handed to gss_wrap() as
+ * its input.
+ *
+ * Returns 0 on success (an expired context only clears
+ * RPCAUTH_CRED_UPTODATE), -EAGAIN if pages could not be allocated, or -EIO
+ * if wrapping failed.
+ */
+static inline int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+		  kxdreproc_t encode, struct rpc_rqst *rqstp,
+		  __be32 *p, void *obj)
+{
+	struct xdr_buf	*snd_buf = &rqstp->rq_snd_buf;
+	u32		offset;
+	u32             maj_stat;
+	int		status;
+	__be32		*opaque_len;
+	struct page	**inpages;
+	int		first;
+	int		pad;
+	struct kvec	*iov;
+	char		*tmp;
+
+	/* reserve the length word; filled in after wrapping */
+	opaque_len = p++;
+	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+	*p++ = htonl(rqstp->rq_seqno);
+
+	gss_wrap_req_encode(encode, rqstp, p, obj);
+
+	status = alloc_enc_pages(rqstp);
+	if (status)
+		return status;
+	first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+	/* keep the plaintext pages as input, point the buffer at the new ones */
+	inpages = snd_buf->pages + first;
+	snd_buf->pages = rqstp->rq_enc_pages;
+	snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
+	/*
+	 * Give the tail its own page, in case we need extra space in the
+	 * head when wrapping:
+	 *
+	 * call_allocate() allocates twice the slack space required
+	 * by the authentication flavor to rq_callsize.
+	 * For GSS, slack is GSS_CRED_SLACK.
+	 */
+	if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
+		tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
+		memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
+		snd_buf->tail[0].iov_base = tmp;
+	}
+	maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages);
+	/* slack space should prevent this ever happening: */
+	BUG_ON(snd_buf->len > snd_buf->buflen);
+	status = -EIO;
+	/* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was
+	 * done anyway, so it's safe to put the request on the wire: */
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	else if (maj_stat)
+		return status;
+
+	*opaque_len = htonl(snd_buf->len - offset);
+	/* guess whether we're in the head or the tail: */
+	if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+		iov = snd_buf->tail;
+	else
+		iov = snd_buf->head;
+	p = iov->iov_base + iov->iov_len;
+	/* number of zero bytes (0..3) needed to reach a 4-byte boundary */
+	pad = 3 - ((snd_buf->len - offset - 1) & 3);
+	memset(p, 0, pad);
+	iov->iov_len += pad;
+	snd_buf->len += pad;
+
+	return 0;
+}
+
+/*
+ * crwrap_req: encode the request arguments according to the credential's
+ * service (none, integrity or privacy).  Requests whose context procedure
+ * is not RPC_GSS_PROC_DATA are encoded without any wrapping.
+ *
+ * Returns 0 on success or a negative errno; a service value that matches
+ * none of the cases leaves the initial -EIO in place.
+ */
+static int
+gss_wrap_req(struct rpc_task *task,
+	     kxdreproc_t encode, void *rqstp, __be32 *p, void *obj)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
+			gc_base);
+	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+	int             status = -EIO;
+
+	dprintk("RPC: %5u gss_wrap_req\n", task->tk_pid);
+	if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
+		/* The spec seems a little ambiguous here, but I think that not
+		 * wrapping context destruction requests makes the most sense.
+		 */
+		gss_wrap_req_encode(encode, rqstp, p, obj);
+		status = 0;
+		goto out;
+	}
+	switch (gss_cred->gc_service) {
+		case RPC_GSS_SVC_NONE:
+			gss_wrap_req_encode(encode, rqstp, p, obj);
+			status = 0;
+			break;
+		case RPC_GSS_SVC_INTEGRITY:
+			status = gss_wrap_req_integ(cred, ctx, encode,
+								rqstp, p, obj);
+			break;
+		case RPC_GSS_SVC_PRIVACY:
+			status = gss_wrap_req_priv(cred, ctx, encode,
+					rqstp, p, obj);
+			break;
+	}
+out:
+	gss_put_ctx(ctx);
+	dprintk("RPC: %5u gss_wrap_req returning %d\n", task->tk_pid, status);
+	return status;
+}
+
+/*
+ * Verify a reply protected with the integrity service.
+ *
+ * Expects at *p: the length of the protected data (a multiple of four),
+ * the sequence number (which must match the request), the results, and
+ * then the MIC.  On return *p has been advanced past the length and
+ * sequence number words it consumed.
+ *
+ * Returns 0 if the MIC verifies, -EIO otherwise.  An expired context
+ * additionally clears RPCAUTH_CRED_UPTODATE.
+ */
+static inline int
+gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+		struct rpc_rqst *rqstp, __be32 **p)
+{
+	struct xdr_buf	*rcv_buf = &rqstp->rq_rcv_buf;
+	struct xdr_buf integ_buf;
+	struct xdr_netobj mic;
+	u32 data_offset, mic_offset;
+	u32 integ_len;
+	u32 maj_stat;
+	int status = -EIO;
+
+	integ_len = ntohl(*(*p)++);
+	if (integ_len & 3)
+		return status;
+	data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+	/* the MIC starts right after the protected data */
+	mic_offset = integ_len + data_offset;
+	if (mic_offset > rcv_buf->len)
+		return status;
+	if (ntohl(*(*p)++) != rqstp->rq_seqno)
+		return status;
+
+	if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
+				mic_offset - data_offset))
+		return status;
+
+	if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
+		return status;
+
+	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	if (maj_stat != GSS_S_COMPLETE)
+		return status;
+	return 0;
+}
+
+/*
+ * Unwrap a reply protected with the privacy service.
+ *
+ * Expects at *p the length of the wrapped data followed by the data
+ * itself.  The receive buffer is trimmed to the end of the wrapped data,
+ * unwrapped in place, and the sequence number at the start of the result
+ * is compared with the request's.  On return *p has been advanced past the
+ * words it consumed.
+ *
+ * Returns 0 on success, -EIO otherwise.  An expired context additionally
+ * clears RPCAUTH_CRED_UPTODATE.
+ */
+static inline int
+gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+		struct rpc_rqst *rqstp, __be32 **p)
+{
+	struct xdr_buf  *rcv_buf = &rqstp->rq_rcv_buf;
+	u32 offset;
+	u32 opaque_len;
+	u32 maj_stat;
+	int status = -EIO;
+
+	opaque_len = ntohl(*(*p)++);
+	offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
+	/*
+	 * opaque_len comes off the wire.  Compare it against the space that
+	 * is left instead of adding it to offset, so that a huge value cannot
+	 * wrap the 32-bit sum around and slip past the bounds check.
+	 */
+	if (offset > rcv_buf->len || opaque_len > rcv_buf->len - offset)
+		return status;
+	/* remove padding: */
+	rcv_buf->len = offset + opaque_len;
+
+	maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf);
+	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
+		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
+	if (maj_stat != GSS_S_COMPLETE)
+		return status;
+	if (ntohl(*(*p)++) != rqstp->rq_seqno)
+		return status;
+
+	return 0;
+}
+
+/*
+ * Run the caller-supplied XDR decoder for obj on the request's receive
+ * buffer, starting at position p, and return its result.
+ */
+static int
+gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp,
+		      __be32 *p, void *obj)
+{
+	struct xdr_stream xdr;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	return decode(rqstp, &xdr, obj);
+}
+
+/*
+ * crunwrap_resp: undo the integrity / privacy protection of a reply
+ * according to the credential's service, then run the XDR decoder on the
+ * results.  Replies on a context whose procedure is not RPC_GSS_PROC_DATA
+ * are decoded directly.
+ *
+ * Also updates the authenticator's au_rslack from the amount of data that
+ * unwrapping consumed.  Returns the decoder's result, or the error from the
+ * unwrap step.
+ */
+static int
+gss_unwrap_resp(struct rpc_task *task,
+		kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj)
+{
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
+	struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
+			gc_base);
+	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
+	/* remembered to measure how much the unwrap step consumed */
+	__be32		*savedp = p;
+	struct kvec	*head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head;
+	int		savedlen = head->iov_len;
+	int             status = -EIO;
+
+	if (ctx->gc_proc != RPC_GSS_PROC_DATA)
+		goto out_decode;
+	switch (gss_cred->gc_service) {
+		case RPC_GSS_SVC_NONE:
+			break;
+		case RPC_GSS_SVC_INTEGRITY:
+			status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p);
+			if (status)
+				goto out;
+			break;
+		case RPC_GSS_SVC_PRIVACY:
+			status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p);
+			if (status)
+				goto out;
+			break;
+	}
+	/* take into account extra slack for integrity and privacy cases: */
+	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
+						+ (savedlen - head->iov_len);
+out_decode:
+	status = gss_unwrap_req_decode(decode, rqstp, p, obj);
+out:
+	gss_put_ctx(ctx);
+	dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid,
+			status);
+	return status;
+}
+
+/*
+ * Authenticator operations for the RPC_AUTH_GSS flavor; registered with
+ * rpcauth_register() at module init.
+ */
+static const struct rpc_authops authgss_ops = {
+	.owner		= THIS_MODULE,
+	.au_flavor	= RPC_AUTH_GSS,
+	.au_name	= "RPCSEC_GSS",
+	.create		= gss_create,
+	.destroy	= gss_destroy,
+	.lookup_cred	= gss_lookup_cred,
+	.crcreate	= gss_create_cred
+};
+
+/* Credential operations installed on every credential by gss_create_cred(). */
+static const struct rpc_credops gss_credops = {
+	.cr_name	= "AUTH_GSS",
+	.crdestroy	= gss_destroy_cred,
+	.cr_init	= gss_cred_init,
+	.crbind		= rpcauth_generic_bind_cred,
+	.crmatch	= gss_match,
+	.crmarshal	= gss_marshal,
+	.crrefresh	= gss_refresh,
+	.crvalidate	= gss_validate,
+	.crwrap_req	= gss_wrap_req,
+	.crunwrap_resp	= gss_unwrap_resp,
+};
+
+/*
+ * Credential operations that gss_destroying_context() switches a credential
+ * to while its context is being destroyed.  Compared with gss_credops:
+ * destruction no longer contacts the server, refresh always fails, and
+ * there is no cr_init.
+ */
+static const struct rpc_credops gss_nullops = {
+	.cr_name	= "AUTH_GSS",
+	.crdestroy	= gss_destroy_nullcred,
+	.crbind		= rpcauth_generic_bind_cred,
+	.crmatch	= gss_match,
+	.crmarshal	= gss_marshal,
+	.crrefresh	= gss_refresh_null,
+	.crvalidate	= gss_validate,
+	.crwrap_req	= gss_wrap_req,
+	.crunwrap_resp	= gss_unwrap_resp,
+};
+
+/*
+ * Pipe operations for the version 0 pipe (the one gss_create() names after
+ * the mechanism).  Only the open handler differs from the version 1 table.
+ */
+static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
+	.upcall		= gss_pipe_upcall,
+	.downcall	= gss_pipe_downcall,
+	.destroy_msg	= gss_pipe_destroy_msg,
+	.open_pipe	= gss_pipe_open_v0,
+	.release_pipe	= gss_pipe_release,
+};
+
+/*
+ * Pipe operations for the version 1 pipe (the one gss_create() names
+ * "gssd").  Only the open handler differs from the version 0 table.
+ */
+static const struct rpc_pipe_ops gss_upcall_ops_v1 = {
+	.upcall		= gss_pipe_upcall,
+	.downcall	= gss_pipe_downcall,
+	.destroy_msg	= gss_pipe_destroy_msg,
+	.open_pipe	= gss_pipe_open_v1,
+	.release_pipe	= gss_pipe_release,
+};
+
+/*
+ * Initialize RPCSEC_GSS module
+ *
+ * Registers the client-side authenticator, initialises the server side and
+ * sets up the wait queue used while no pipe version is known.  The
+ * authenticator is unregistered again if server-side setup fails.
+ * Returns 0 or the error of the step that failed.
+ */
+static int __init init_rpcsec_gss(void)
+{
+	int err = 0;
+
+	err = rpcauth_register(&authgss_ops);
+	if (err)
+		goto out;
+	err = gss_svc_init();
+	if (err)
+		goto out_unregister;
+	rpc_init_wait_queue(&pipe_version_rpc_waitqueue, "gss pipe version");
+	return 0;
+out_unregister:
+	rpcauth_unregister(&authgss_ops);
+out:
+	return err;
+}
+
+/*
+ * Module exit: shut down the server side, unregister the authenticator and
+ * wait for outstanding RCU callbacks before the module text goes away.
+ */
+static void __exit exit_rpcsec_gss(void)
+{
+	gss_svc_shutdown();
+	rpcauth_unregister(&authgss_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+
+MODULE_LICENSE("GPL");
+/*
+ * Expose gss_expired_cred_retry_delay as the "expired_cred_retry_delay"
+ * module parameter (unsigned int, mode 0644).
+ */
+module_param_named(expired_cred_retry_delay,
+		   gss_expired_cred_retry_delay,
+		   uint, 0644);
+MODULE_PARM_DESC(expired_cred_retry_delay, "Timeout (in seconds) until "
+		"the RPC engine retries an expired credential");
+
+/* module entry and exit points */
+module_init(init_rpcsec_gss)
+module_exit(exit_rpcsec_gss)
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
new file mode 100644
index 00000000..c586e92b
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -0,0 +1,234 @@
+/*
+ *  linux/net/sunrpc/gss_generic_token.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/gss_asn1.h>
+
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+/* TWRITE_STR from gssapiP_generic.h: copy len bytes to ptr, advance ptr */
+#define TWRITE_STR(ptr, str, len) do { \
+	memcpy((ptr), (char *) (str), (len)); \
+	(ptr) += (len); \
+} while (0)
+
+/* XXXX this code currently makes the assumption that a mech oid will
+   never be longer than 127 bytes.  This assumption is not inherent in
+   the interfaces, so the code can be fixed if the OSI namespace
+   balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60				tag for APPLICATION 0, SEQUENCE
+					(constructed, definite-length)
+	<length>		possible multiple bytes, need to parse/generate
+	0x06			tag for OBJECT IDENTIFIER
+		<moid_length>	compile-time constant string (assume 1 byte)
+		<moid_bytes>	compile-time constant string
+	<inner_bytes>		the ANY containing the application token
+					bytes 0,1 are the token type
+					bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type.  The token
+"body" consists of everything else.
+
+*/
+
+/* Octets needed to DER-encode `length': 1..5 (1..3 when SIZEOF_INT is 2). */
+static int der_length_size(int length)
+{
+	int octets = 1;		/* short form: a single octet */
+
+	if (length >= (1<<7))
+		octets = 2;	/* long form: prefix plus value octets */
+	if (length >= (1<<8))
+		octets = 3;
+#if (SIZEOF_INT != 2)
+	/* only a wider int can hold lengths that need more octets */
+	if (length >= (1<<16))
+		octets = 4;
+	if (length >= (1<<24))
+		octets = 5;
+#endif
+
+	return octets;
+}
+
+/* Emit the DER length octets for `length' at *buf, advancing *buf. */
+static void der_write_length(unsigned char **buf, int length)
+{
+	if (length >= (1<<7)) {
+		/* long form: 0x80 + count of value octets, then the octets */
+		*(*buf)++ = (unsigned char) (der_length_size(length)+127);
+#if (SIZEOF_INT > 2)
+		if (length >= (1<<24))
+			*(*buf)++ = (unsigned char) (length>>24);
+		if (length >= (1<<16))
+			*(*buf)++ = (unsigned char) ((length>>16)&0xff);
+#endif
+		if (length >= (1<<8))
+			*(*buf)++ = (unsigned char) ((length>>8)&0xff);
+	}
+	/* low-order octet; the only one written in short form */
+	*(*buf)++ = (unsigned char) (length&0xff);
+}
+
+/*
+ * Decode a DER length at *buf.  Returns the decoded length, or < 0 on
+ * failure.  *buf is advanced and *bufsize decremented per octet consumed.
+ */
+static int
+der_read_length(unsigned char **buf, int *bufsize)
+{
+	unsigned char first, nbytes;
+	int len = 0;
+
+	if (*bufsize < 1)
+		return -1;
+	first = *(*buf)++;
+	(*bufsize)--;
+
+	/* short form: the octet itself is the length */
+	if (!(first & 0x80))
+		return first;
+
+	/* long form: low seven bits count the length octets that follow */
+	nbytes = first & 0x7f;
+	if (nbytes > ((*bufsize)-1) || nbytes > SIZEOF_INT)
+		return -1;
+	while (nbytes--) {
+		len = (len<<8) + (*(*buf)++);
+		(*bufsize)--;
+	}
+	return len;
+}
+
+/* returns the length of a whole token for the given mech oid and body
+   size: one tag octet, the DER length octets, then the sequence contents */
+int g_token_size(struct xdr_netobj *mech, unsigned int body_size)
+{
+	/* sequence contents: two OID header octets + OID + body */
+	unsigned int seq_len = body_size + 2 + (int) mech->len; /* NEED overflow check */
+
+	return 1 + der_length_size(seq_len) + seq_len;
+}
+
+EXPORT_SYMBOL_GPL(g_token_size);
+
+/* writes the token header at *buf: 0x60, DER length, 0x06, oid length, oid.
+   The buffer is assumed to be the right size; *buf ends up past the header */
+void g_make_token_header(struct xdr_netobj *mech, int body_size,
+			 unsigned char **buf)
+{
+	*(*buf)++ = 0x60;
+	der_write_length(buf, 2 + mech->len + body_size);
+	*(*buf)++ = 0x06;
+	*(*buf)++ = (unsigned char) mech->len;
+	memcpy(*buf, (char *) mech->data, (int) mech->len);
+	*buf += (int) mech->len;
+}
+
+EXPORT_SYMBOL_GPL(g_make_token_header);
+
+/*
+ * Given a buffer containing a token, reads and verifies the token header.
+ * Returns 0 on success, G_BAD_TOK_HEADER for a variety of errors, and
+ * G_WRONG_MECH if the header is well formed but its mechanism does not
+ * match the mech argument.  *buf_in and *body_size are left unmodified
+ * on error.
+ *
+ * On success *buf_in is advanced past the mech OID, so it points at the
+ * two token-type bytes, while *body_size is the number of bytes that
+ * remain after those two bytes.
+ */
+u32
+g_verify_token_header(struct xdr_netobj *mech, int *body_size,
+		      unsigned char **buf_in, int toksize)
+{
+	unsigned char *p = *buf_in;
+	struct xdr_netobj tok_oid;
+	int seqlen;
+	int status = 0;
+
+	/* 0x60: tag for APPLICATION 0, SEQUENCE */
+	if (--toksize < 0 || *p++ != 0x60)
+		return G_BAD_TOK_HEADER;
+
+	/* the encoded length must account for exactly the bytes left */
+	seqlen = der_read_length(&p, &toksize);
+	if (seqlen < 0 || seqlen != toksize)
+		return G_BAD_TOK_HEADER;
+
+	/* 0x06: tag for OBJECT IDENTIFIER */
+	if (--toksize < 0 || *p++ != 0x06)
+		return G_BAD_TOK_HEADER;
+
+	/* single-octet OID length followed by the OID bytes */
+	if (--toksize < 0)
+		return G_BAD_TOK_HEADER;
+	tok_oid.len = *p++;
+	if ((toksize -= tok_oid.len) < 0)
+		return G_BAD_TOK_HEADER;
+
+	tok_oid.data = p;
+	p += tok_oid.len;
+
+	if (!g_OID_equal(&tok_oid, mech))
+		status = G_WRONG_MECH;
+
+	/*
+	 * Account for the two token-type bytes.  This check comes before
+	 * reporting G_WRONG_MECH because it is more important to return
+	 * G_BAD_TOK_HEADER if the token header is in fact bad.
+	 */
+	if ((toksize -= 2) < 0)
+		return G_BAD_TOK_HEADER;
+
+	if (status)
+		return status;
+
+	*buf_in = p;
+	*body_size = toksize;
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(g_verify_token_header);
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
new file mode 100644
index 00000000..9576f35a
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -0,0 +1,990 @@
+/*
+ *  linux/net/sunrpc/gss_krb5_crypto.c
+ *
+ *  Copyright (c) 2000-2008 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/sunrpc/xdr.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+/*
+ * Copy `length' bytes from in to out and encrypt them in place with tfm.
+ * The IV buffer starts zeroed and takes the caller's iv when iv != NULL.
+ * Returns -EINVAL (as u32) if length is not a multiple of the cipher
+ * block size or the cipher IV is larger than GSS_KRB5_MAX_BLOCKSIZE,
+ * otherwise the value returned by crypto_blkcipher_encrypt_iv().
+ */
+u32
+krb5_encrypt(struct crypto_blkcipher *tfm, void *iv, void *in, void *out,
+	     int length)
+{
+	u8 ivbuf[GSS_KRB5_MAX_BLOCKSIZE] = {0};
+	struct blkcipher_desc bdesc = { .tfm = tfm, .info = ivbuf };
+	struct scatterlist sgl[1];
+	u32 status = -EINVAL;
+
+	if (length % crypto_blkcipher_blocksize(tfm) != 0)
+		goto done;
+	if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+		dprintk("RPC:       gss_k5encrypt: tfm iv size too large %d\n",
+			crypto_blkcipher_ivsize(tfm));
+		goto done;
+	}
+	if (iv != NULL)
+		memcpy(ivbuf, iv, crypto_blkcipher_ivsize(tfm));
+
+	memcpy(out, in, length);
+	sg_init_one(sgl, out, length);
+	status = crypto_blkcipher_encrypt_iv(&bdesc, sgl, sgl, length);
+done:
+	dprintk("RPC:       krb5_encrypt returns %d\n", status);
+	return status;
+}
+
+/*
+ * Decrypting counterpart of krb5_encrypt(): copy `length' bytes from in
+ * to out and decrypt them in place with tfm, using a zeroed IV unless
+ * iv != NULL.  Returns -EINVAL (as u32) for a bad length or oversized
+ * cipher IV, otherwise the result of crypto_blkcipher_decrypt_iv().
+ */
+u32
+krb5_decrypt(struct crypto_blkcipher *tfm, void *iv, void *in, void *out,
+	     int length)
+{
+	u8 ivbuf[GSS_KRB5_MAX_BLOCKSIZE] = {0};
+	struct blkcipher_desc bdesc = { .tfm = tfm, .info = ivbuf };
+	struct scatterlist sgl[1];
+	u32 status = -EINVAL;
+
+	if (length % crypto_blkcipher_blocksize(tfm) != 0)
+		goto done;
+	if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+		dprintk("RPC:       gss_k5decrypt: tfm iv size too large %d\n",
+			crypto_blkcipher_ivsize(tfm));
+		goto done;
+	}
+	if (iv != NULL)
+		memcpy(ivbuf, iv, crypto_blkcipher_ivsize(tfm));
+
+	memcpy(out, in, length);
+	sg_init_one(sgl, out, length);
+	status = crypto_blkcipher_decrypt_iv(&bdesc, sgl, sgl, length);
+done:
+	dprintk("RPC:       gss_k5decrypt returns %d\n", status);
+	return status;
+}
+
+/* xdr_process_buf() callback: hash the fragment sg into the hash_desc `data' */
+static int checksummer(struct scatterlist *sg, void *data)
+{
+	struct hash_desc *hash = data;
+
+	return crypto_hash_update(hash, sg, sg->length);
+}
+
+/*
+ * Map KG_USAGE_SIGN to 15 and KG_USAGE_SEAL to 13, and store that number
+ * in salt[] as four bytes, least significant first.  Returns 0, or
+ * -EINVAL (salt left untouched) for any other usage value.
+ */
+static int
+arcfour_hmac_md5_usage_to_salt(unsigned int usage, u8 salt[4])
+{
+	unsigned int ms_usage;
+	int i;
+
+	if (usage == KG_USAGE_SIGN)
+		ms_usage = 15;
+	else if (usage == KG_USAGE_SEAL)
+		ms_usage = 13;
+	else
+		return -EINVAL;
+
+	for (i = 0; i < 4; i++)
+		salt[i] = (ms_usage >> (8 * i)) & 0xff;
+	return 0;
+}
+
+/*
+ * Checksum for CKSUMTYPE_HMAC_MD5_ARCFOUR, reached via make_checksum().
+ * MD5 is computed over a 4-byte usage salt, the header and the body from
+ * body_offset on; that digest is fed to the keyed hash gk5e->cksum_name
+ * and gk5e->cksumlength bytes of the result are copied to cksumout.
+ * Returns 0 on success, GSS_S_FAILURE otherwise.
+ */
+static u32
+make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
+		       struct xdr_buf *body, int body_offset, u8 *cksumkey,
+		       unsigned int usage, struct xdr_netobj *cksumout)
+{
+	struct hash_desc                desc;
+	struct scatterlist              sg[1];
+	int err;
+	u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+	u8 rc4salt[4];
+	struct crypto_hash *md5;
+	struct crypto_hash *hmac_md5;
+
+	if (cksumkey == NULL)
+		return GSS_S_FAILURE;
+	if (cksumout->len < kctx->gk5e->cksumlength) {
+		dprintk("%s: checksum buffer length, %u, too small for %s\n",
+			__func__, cksumout->len, kctx->gk5e->name);
+		return GSS_S_FAILURE;
+	}
+	if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
+		dprintk("%s: invalid usage value %u\n", __func__, usage);
+		return GSS_S_FAILURE;
+	}
+	md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(md5))
+		return GSS_S_FAILURE;
+
+	hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
+				     CRYPTO_ALG_ASYNC);
+	if (IS_ERR(hmac_md5)) {
+		crypto_free_hash(md5);
+		return GSS_S_FAILURE;
+	}
+	/* Stage 1: plain MD5 over salt, header and body */
+	desc.tfm = md5;
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out;
+	sg_init_one(sg, rc4salt, 4);
+	err = crypto_hash_update(&desc, sg, 4);
+	if (err)
+		goto out;
+	sg_init_one(sg, header, hdrlen);
+	err = crypto_hash_update(&desc, sg, hdrlen);
+	if (err)
+		goto out;
+	err = xdr_process_buf(body, body_offset, body->len - body_offset,
+			      checksummer, &desc);
+	if (err)
+		goto out;
+	err = crypto_hash_final(&desc, checksumdata);
+	if (err)
+		goto out;
+	/* Stage 2: hash gk5e->cksum_name, keyed with cksumkey, over that digest */
+	desc.tfm = hmac_md5;
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out;
+	err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
+	if (err)
+		goto out;
+	sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5));
+	err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5),
+				 checksumdata);
+	if (err)
+		goto out;
+
+	memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
+	cksumout->len = kctx->gk5e->cksumlength;
+out:
+	crypto_free_hash(md5);
+	crypto_free_hash(hmac_md5);
+	return err ? GSS_S_FAILURE : 0;
+}
+
+/*
+ * checksum the plaintext data and hdrlen bytes of the token header
+ * The hash named by gk5e->cksum_name (keyed with cksumkey if that is
+ * non-NULL) is run over hdrlen bytes of header and then over the data
+ * body from body_offset on; gk5e->cksumlength bytes go to cksumout.
+ * CKSUMTYPE_HMAC_MD5_ARCFOUR is delegated to make_checksum_hmac_md5().
+ * Returns 0 on success, GSS_S_FAILURE otherwise.
+ */
+u32
+make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
+	      struct xdr_buf *body, int body_offset, u8 *cksumkey,
+	      unsigned int usage, struct xdr_netobj *cksumout)
+{
+	struct hash_desc                desc;
+	struct scatterlist              sg[1];
+	int err;
+	u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+	unsigned int checksumlen;
+
+	if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR)
+		return make_checksum_hmac_md5(kctx, header, hdrlen,
+					      body, body_offset,
+					      cksumkey, usage, cksumout);
+	if (cksumout->len < kctx->gk5e->cksumlength) {
+		dprintk("%s: checksum buffer length, %u, too small for %s\n",
+			__func__, cksumout->len, kctx->gk5e->name);
+		return GSS_S_FAILURE;
+	}
+
+	desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(desc.tfm))
+		return GSS_S_FAILURE;
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	checksumlen = crypto_hash_digestsize(desc.tfm);
+
+	if (cksumkey != NULL) {
+		err = crypto_hash_setkey(desc.tfm, cksumkey,
+					 kctx->gk5e->keylength);
+		if (err)
+			goto out;
+	}
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out;
+	sg_init_one(sg, header, hdrlen);
+	err = crypto_hash_update(&desc, sg, hdrlen);
+	if (err)
+		goto out;
+	err = xdr_process_buf(body, body_offset, body->len - body_offset,
+			      checksummer, &desc);
+	if (err)
+		goto out;
+	err = crypto_hash_final(&desc, checksumdata);
+	if (err)
+		goto out;
+	/* RSA_MD5: encrypt the digest with kctx->seq and keep its last bytes */
+	switch (kctx->gk5e->ctype) {
+	case CKSUMTYPE_RSA_MD5:
+		err = kctx->gk5e->encrypt(kctx->seq, NULL, checksumdata,
+					  checksumdata, checksumlen);
+		if (err)
+			goto out;
+		memcpy(cksumout->data,
+		       checksumdata + checksumlen - kctx->gk5e->cksumlength,
+		       kctx->gk5e->cksumlength);
+		break;
+	case CKSUMTYPE_HMAC_SHA1_DES3:
+		memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	cksumout->len = kctx->gk5e->cksumlength;
+out:
+	crypto_free_hash(desc.tfm);
+	return err ? GSS_S_FAILURE : 0;
+}
+
+/*
+ * Keyed checksum over the plaintext body (from body_offset on) and then,
+ * if header != NULL, over hdrlen bytes of the token header; per rfc4121,
+ * sec. 4.2.4 that is the first 16 octets of the MIC token and including
+ * it is optional.  Requires a keyed checksum type and a non-NULL key.
+ * The hash, truncated to gk5e->cksumlength bytes, is stored in cksumout,
+ * which must be at least that large.  Returns 0 or GSS_S_FAILURE.
+ */
+u32
+make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
+		 struct xdr_buf *body, int body_offset, u8 *cksumkey,
+		 unsigned int usage, struct xdr_netobj *cksumout)
+{
+	struct hash_desc desc;
+	struct scatterlist sg[1];
+	int err;
+	u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+
+	if (kctx->gk5e->keyed_cksum == 0) {
+		dprintk("%s: expected keyed hash for %s\n",
+			__func__, kctx->gk5e->name);
+		return GSS_S_FAILURE;
+	}
+	if (cksumkey == NULL) {
+		dprintk("%s: no key supplied for %s\n",
+			__func__, kctx->gk5e->name);
+		return GSS_S_FAILURE;
+	}
+	if (cksumout->len < kctx->gk5e->cksumlength) {	/* cf. make_checksum() */
+		dprintk("%s: checksum buffer length, %u, too small for %s\n",
+			__func__, cksumout->len, kctx->gk5e->name);
+		return GSS_S_FAILURE;
+	}
+
+	desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
+							CRYPTO_ALG_ASYNC);
+	if (IS_ERR(desc.tfm))
+		return GSS_S_FAILURE;
+	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength);
+	if (err)
+		goto out;
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out;
+	err = xdr_process_buf(body, body_offset, body->len - body_offset,
+			      checksummer, &desc);
+	if (err)
+		goto out;
+	if (header != NULL) {
+		sg_init_one(sg, header, hdrlen);
+		err = crypto_hash_update(&desc, sg, hdrlen);
+		if (err)
+			goto out;
+	}
+	err = crypto_hash_final(&desc, checksumdata);
+	if (err)
+		goto out;
+	switch (kctx->gk5e->ctype) {
+	case CKSUMTYPE_HMAC_SHA1_96_AES128:
+	case CKSUMTYPE_HMAC_SHA1_96_AES256:
+		/* note that this truncates the hash */
+		memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
+		break;
+	default:
+		BUG();
+		break;
+	}
+	cksumout->len = kctx->gk5e->cksumlength;
+out:
+	crypto_free_hash(desc.tfm);
+	return err ? GSS_S_FAILURE : 0;
+}
+
+struct encryptor_desc {
+	u8 iv[GSS_KRB5_MAX_BLOCKSIZE];	/* IV storage; users point desc.info at it */
+	struct blkcipher_desc desc;
+	int pos;			/* offset in outbuf of the next fragment */
+	struct xdr_buf *outbuf;		/* buffer being encrypted */
+	struct page **pages;		/* source pages for outbuf's page section */
+	struct scatterlist infrags[4];	/* queued source fragments */
+	struct scatterlist outfrags[4];	/* queued destination fragments */
+	int fragno;			/* number of fragments currently queued */
+	int fraglen;			/* number of bytes currently queued */
+};
+
+/*
+ * xdr_process_buf() callback: queue sg as a source/destination fragment
+ * pair and encrypt every whole cipher block accumulated so far.
+ */
+static int
+encryptor(struct scatterlist *sg, void *data)
+{
+	struct encryptor_desc *desc = data;
+	struct xdr_buf *outbuf = desc->outbuf;
+	struct page *in_page;
+	int thislen = desc->fraglen + sg->length;
+	int fraglen, ret;
+	int page_pos;
+
+	/* Worst case is 4 fragments: head, end of page 1, start
+	 * of page 2, tail.  Anything more is a bug. */
+	BUG_ON(desc->fragno > 3);
+	page_pos = desc->pos - outbuf->head[0].iov_len;
+	if (page_pos >= 0 && page_pos < outbuf->page_len) {
+		/* pages are not in place: read from desc->pages instead */
+		int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
+		in_page = desc->pages[i];
+	} else {
+		in_page = sg_page(sg);
+	}
+	sg_set_page(&desc->infrags[desc->fragno], in_page, sg->length,
+		    sg->offset);
+	sg_set_page(&desc->outfrags[desc->fragno], sg_page(sg), sg->length,
+		    sg->offset);
+	desc->fragno++;
+	desc->fraglen += sg->length;
+	desc->pos += sg->length;
+	/* thislen: bytes to encrypt now; fraglen: leftover (mask needs 2^n block size) */
+	fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+	thislen -= fraglen;
+	if (thislen == 0)
+		return 0;
+
+	sg_mark_end(&desc->infrags[desc->fragno - 1]);
+	sg_mark_end(&desc->outfrags[desc->fragno - 1]);
+	ret = crypto_blkcipher_encrypt_iv(&desc->desc, desc->outfrags,
+					  desc->infrags, thislen);
+	if (ret)
+		return ret;
+	/* start a new fragment list; a leftover tail becomes fragment 0 */
+	sg_init_table(desc->infrags, 4);
+	sg_init_table(desc->outfrags, 4);
+	if (fraglen) {
+		sg_set_page(&desc->outfrags[0], sg_page(sg), fraglen,
+				sg->offset + sg->length - fraglen);
+		desc->infrags[0] = desc->outfrags[0];
+		sg_assign_page(&desc->infrags[0], in_page);
+		desc->fragno = 1;
+		desc->fraglen = fraglen;
+	} else {
+		desc->fragno = 0;
+		desc->fraglen = 0;
+	}
+	return 0;
+}
+
+/* Encrypt buf from offset to its end with tfm and an all-zero IV by
+ * running encryptor() over it.  Page data is read from pages; output is
+ * written to buf.  BUGs unless the length is a multiple of the block size. */
+int
+gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+		    int offset, struct page **pages)
+{
+	struct encryptor_desc enc;
+
+	BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+
+	enc.outbuf = buf;
+	enc.pages = pages;
+	enc.pos = offset;
+	enc.fragno = 0;
+	enc.fraglen = 0;
+	sg_init_table(enc.infrags, 4);
+	sg_init_table(enc.outfrags, 4);
+	/* cipher state: start from an all-zero IV */
+	memset(enc.iv, 0, sizeof(enc.iv));
+	enc.desc.tfm = tfm;
+	enc.desc.info = enc.iv;
+	enc.desc.flags = 0;
+	return xdr_process_buf(buf, offset, buf->len - offset, encryptor, &enc);
+}
+
+struct decryptor_desc {
+	u8 iv[GSS_KRB5_MAX_BLOCKSIZE];	/* IV storage; users point desc.info at it */
+	struct blkcipher_desc desc;
+	struct scatterlist frags[4];	/* queued fragments, decrypted in place */
+	int fragno;			/* number of fragments currently queued */
+	int fraglen;			/* number of bytes currently queued */
+};
+
+/* xdr_process_buf() callback: queue sg in frags and decrypt, in place,
+ * every whole cipher block accumulated so far; a partial-block tail is
+ * kept as fragment 0 for the next call. */
+static int
+decryptor(struct scatterlist *sg, void *data)
+{
+	struct decryptor_desc *desc = data;
+	int thislen = desc->fraglen + sg->length;
+	int fraglen, ret;
+
+	/* Worst case is 4 fragments: head, end of page 1, start
+	 * of page 2, tail.  Anything more is a bug. */
+	BUG_ON(desc->fragno > 3);
+	sg_set_page(&desc->frags[desc->fragno], sg_page(sg), sg->length,
+		    sg->offset);
+	desc->fragno++;
+	desc->fraglen += sg->length;
+	/* thislen: bytes to decrypt now; fraglen: leftover (mask needs 2^n block size) */
+	fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+	thislen -= fraglen;
+	if (thislen == 0)
+		return 0;
+
+	sg_mark_end(&desc->frags[desc->fragno - 1]);
+	ret = crypto_blkcipher_decrypt_iv(&desc->desc, desc->frags,
+					  desc->frags, thislen);
+	if (ret)
+		return ret;
+	/* start a new fragment list; a leftover tail becomes fragment 0 */
+	sg_init_table(desc->frags, 4);
+	if (fraglen) {
+		sg_set_page(&desc->frags[0], sg_page(sg), fraglen,
+				sg->offset + sg->length - fraglen);
+		desc->fragno = 1;
+		desc->fraglen = fraglen;
+	} else {
+		desc->fragno = 0;
+		desc->fraglen = 0;
+	}
+	return 0;
+}
+
+/* Decrypt buf in place from offset to its end with tfm and an all-zero
+ * IV by running decryptor() over it. */
+int
+gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+		    int offset)
+{
+	struct decryptor_desc dec;
+
+	/* XXXJBF: the length must be a whole number of cipher blocks */
+	BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+
+	dec.fragno = 0;
+	dec.fraglen = 0;
+	sg_init_table(dec.frags, 4);
+	memset(dec.iv, 0, sizeof(dec.iv));
+	dec.desc.tfm = tfm;
+	dec.desc.info = dec.iv;
+	dec.desc.flags = 0;
+	return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &dec);
+}
+
+/*
+ * Open a gap of shiftlen bytes in the head of buf at offset base: the
+ * head data from base onwards is moved up by shiftlen, and both the
+ * head length and buf->len grow by shiftlen.  A shiftlen of 0 is a
+ * no-op.  Always returns 0.
+ *
+ * This function makes the assumption that it was ultimately called
+ * from gss_wrap().
+ *
+ * The client auth_gss code moves any existing tail data into a
+ * separate page before calling gss_wrap.
+ * The server svcauth_gss code ensures that both the head and the
+ * tail have slack space of RPC_MAX_AUTH_SIZE before calling gss_wrap.
+ *
+ * Even with that guarantee, this function may be called more than
+ * once in the processing of gss_wrap().  The best we can do is
+ * verify at compile-time (see GSS_KRB5_SLACK_CHECK) that the
+ * largest expected shift will fit within RPC_MAX_AUTH_SIZE.
+ * At run-time we can verify that a single invocation of this
+ * function doesn't attempt to use more than RPC_MAX_AUTH_SIZE.
+ */
+int
+xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen)
+{
+	u8 *gap;
+
+	BUILD_BUG_ON(GSS_KRB5_MAX_SLACK_NEEDED > RPC_MAX_AUTH_SIZE);
+
+	if (shiftlen != 0) {
+		BUG_ON(shiftlen > RPC_MAX_AUTH_SIZE);
+		gap = buf->head[0].iov_base + base;
+		memmove(gap + shiftlen, gap, buf->head[0].iov_len - base);
+		buf->head[0].iov_len += shiftlen;
+		buf->len += shiftlen;
+	}
+	return 0;
+}
+
+/*
+ * Helper for gss_krb5_aes_encrypt()/gss_krb5_aes_decrypt(): copy the last
+ * buf->len - offset bytes of buf (at most two cipher blocks) into a bounce
+ * buffer, encrypt or decrypt them there with cipher and iv, and write
+ * them back.  When encrypting, the input is read from the caller's pages
+ * instead of buf->pages.  Returns 0 on success, non-zero on failure.
+ */
+static u32
+gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf,
+		   u32 offset, u8 *iv, struct page **pages, int encrypt)
+{
+	u32 ret;
+	struct scatterlist sg[1];
+	struct blkcipher_desc desc = { .tfm = cipher, .info = iv };
+	u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2];	/* fixed size, no VLA */
+	struct page **save_pages;
+	u32 len = buf->len - offset;
+
+	/* fail the request rather than BUG() if the caller got len wrong */
+	if (WARN_ON(len > crypto_blkcipher_blocksize(cipher) * 2 ||
+		    len > sizeof(data)))
+		return -EINVAL;
+
+	/* swap in the cleartext pages just for the read */
+	save_pages = buf->pages;
+	if (encrypt)
+		buf->pages = pages;
+	ret = read_bytes_from_xdr_buf(buf, offset, data, len);
+	buf->pages = save_pages;
+	if (ret)
+		return ret;
+
+	sg_init_one(sg, data, len);
+	if (encrypt)
+		ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+	else
+		ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len);
+	if (ret)
+		return ret;
+
+	return write_bytes_to_xdr_buf(buf, offset, data, len);
+}
+
+/*
+ * AES wrap step for the token whose header starts at offset in buf.
+ * Inserts a gk5e->conflen byte confounder after the token header,
+ * appends ec filler bytes plus a copy of the header to the tail, has
+ * make_checksum_v2() place the checksum right behind that, and then
+ * encrypts everything between the header and the checksum.  pages holds
+ * the plaintext page data.  Returns 0 on success, else GSS_S_FAILURE.
+ */
+u32
+gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
+		     struct xdr_buf *buf, int ec, struct page **pages)
+{
+	u32 err;
+	struct xdr_netobj hmac;
+	u8 *cksumkey;
+	u8 *ecptr;
+	struct crypto_blkcipher *cipher, *aux_cipher;
+	int blocksize;
+	struct page **save_pages;
+	int nblocks, nbytes;
+	struct encryptor_desc desc;
+	u32 cbcbytes;
+	unsigned int usage;
+
+	if (kctx->initiate) {
+		cipher = kctx->initiator_enc;
+		aux_cipher = kctx->initiator_enc_aux;
+		cksumkey = kctx->initiator_integ;
+		usage = KG_USAGE_INITIATOR_SEAL;
+	} else {
+		cipher = kctx->acceptor_enc;
+		aux_cipher = kctx->acceptor_enc_aux;
+		cksumkey = kctx->acceptor_integ;
+		usage = KG_USAGE_ACCEPTOR_SEAL;
+	}
+	blocksize = crypto_blkcipher_blocksize(cipher);
+
+	/* hide the gss token header and insert the confounder */
+	offset += GSS_KRB5_TOK_HDR_LEN;
+	if (xdr_extend_head(buf, offset, kctx->gk5e->conflen))
+		return GSS_S_FAILURE;
+	gss_krb5_make_confounder(buf->head[0].iov_base + offset, kctx->gk5e->conflen);
+	offset -= GSS_KRB5_TOK_HDR_LEN;
+	/* with no tail yet, start one right behind the head data */
+	if (buf->tail[0].iov_base != NULL) {
+		ecptr = buf->tail[0].iov_base + buf->tail[0].iov_len;
+	} else {
+		buf->tail[0].iov_base = buf->head[0].iov_base
+							+ buf->head[0].iov_len;
+		buf->tail[0].iov_len = 0;
+		ecptr = buf->tail[0].iov_base;
+	}
+	/* ec bytes of 'X' filler at the end of the tail */
+	memset(ecptr, 'X', ec);
+	buf->tail[0].iov_len += ec;
+	buf->len += ec;
+	/* copy plaintext gss token header after filler (if any) */
+	memcpy(ecptr + ec, buf->head[0].iov_base + offset,
+						GSS_KRB5_TOK_HDR_LEN);
+	buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
+	buf->len += GSS_KRB5_TOK_HDR_LEN;
+	/* Do the HMAC */
+	hmac.len = GSS_KRB5_MAX_CKSUM_LEN;
+	hmac.data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+	/*
+	 * When we are called, pages points to the real page cache
+	 * data -- which we can't go and encrypt!  buf->pages points
+	 * to scratch pages which we are going to send off to the
+	 * client/server.  Swap in the plaintext pages to calculate
+	 * the hmac.
+	 */
+	save_pages = buf->pages;
+	buf->pages = pages;
+	err = make_checksum_v2(kctx, NULL, 0, buf,
+			       offset + GSS_KRB5_TOK_HDR_LEN,
+			       cksumkey, usage, &hmac);
+	buf->pages = save_pages;
+	if (err)
+		return GSS_S_FAILURE;
+	/* all but the last two blocks go through aux_cipher via encryptor() */
+	nbytes = buf->len - offset - GSS_KRB5_TOK_HDR_LEN;
+	nblocks = (nbytes + blocksize - 1) / blocksize;
+	cbcbytes = 0;
+	if (nblocks > 2)
+		cbcbytes = (nblocks - 2) * blocksize;
+	memset(desc.iv, 0, sizeof(desc.iv));
+	if (cbcbytes) {
+		desc.pos = offset + GSS_KRB5_TOK_HDR_LEN;
+		desc.fragno = 0;
+		desc.fraglen = 0;
+		desc.pages = pages;
+		desc.outbuf = buf;
+		desc.desc.info = desc.iv;
+		desc.desc.flags = 0;
+		desc.desc.tfm = aux_cipher;
+
+		sg_init_table(desc.infrags, 4);
+		sg_init_table(desc.outfrags, 4);
+		err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN,
+				      cbcbytes, encryptor, &desc);
+		if (err)
+			goto out_err;
+	}
+
+	/* Make sure IV carries forward from any CBC results. */
+	err = gss_krb5_cts_crypt(cipher, buf,
+				 offset + GSS_KRB5_TOK_HDR_LEN + cbcbytes,
+				 desc.iv, pages, 1);
+	if (err) {
+		err = GSS_S_FAILURE;
+		goto out_err;
+	}
+
+	/* Now update buf to account for HMAC */
+	buf->tail[0].iov_len += kctx->gk5e->cksumlength;
+	buf->len += kctx->gk5e->cksumlength;
+out_err:
+	if (err)
+		err = GSS_S_FAILURE;
+	return err;
+}
+
+/*
+ * Inverse of gss_krb5_aes_encrypt(): decrypt what lies between the token
+ * header at offset and the trailing checksum, then verify that checksum
+ * against one computed over the plaintext.  On success returns 0 with
+ * *headskip = gk5e->conflen and *tailskip = gk5e->cksumlength.  Returns
+ * GSS_S_BAD_SIG on checksum mismatch, GSS_S_FAILURE on any other error.
+ */
+u32
+gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
+		     u32 *headskip, u32 *tailskip)
+{
+	struct xdr_buf subbuf;
+	u32 ret = 0;
+	u8 *cksum_key;
+	struct crypto_blkcipher *cipher, *aux_cipher;
+	struct xdr_netobj our_hmac_obj;
+	u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN];
+	u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN];
+	int nblocks, blocksize, cbcbytes;
+	struct decryptor_desc desc;
+	unsigned int usage;
+
+	if (kctx->initiate) {
+		cipher = kctx->acceptor_enc;
+		aux_cipher = kctx->acceptor_enc_aux;
+		cksum_key = kctx->acceptor_integ;
+		usage = KG_USAGE_ACCEPTOR_SEAL;
+	} else {
+		cipher = kctx->initiator_enc;
+		aux_cipher = kctx->initiator_enc_aux;
+		cksum_key = kctx->initiator_integ;
+		usage = KG_USAGE_INITIATOR_SEAL;
+	}
+	blocksize = crypto_blkcipher_blocksize(cipher);
+
+	/* a token too short for header + checksum would wrap the length below */
+	if (offset > buf->len ||
+	    buf->len - offset < GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength)
+		return GSS_S_FAILURE;
+
+	/* create a segment skipping the header and leaving out the checksum */
+	xdr_buf_subsegment(buf, &subbuf, offset + GSS_KRB5_TOK_HDR_LEN,
+				    (buf->len - offset - GSS_KRB5_TOK_HDR_LEN -
+				     kctx->gk5e->cksumlength));
+
+	nblocks = (subbuf.len + blocksize - 1) / blocksize;
+	cbcbytes = 0;
+	if (nblocks > 2)
+		cbcbytes = (nblocks - 2) * blocksize;
+	memset(desc.iv, 0, sizeof(desc.iv));
+	if (cbcbytes) {
+		desc.fragno = 0;
+		desc.fraglen = 0;
+		desc.desc.info = desc.iv;
+		desc.desc.flags = 0;
+		desc.desc.tfm = aux_cipher;
+		sg_init_table(desc.frags, 4);
+		ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc);
+		if (ret)
+			goto out_err;
+	}
+	/* Make sure IV carries forward from any CBC results. */
+	ret = gss_krb5_cts_crypt(cipher, &subbuf, cbcbytes, desc.iv, NULL, 0);
+	if (ret)
+		goto out_err;
+	/* Calculate our hmac over the plaintext data */
+	our_hmac_obj.len = sizeof(our_hmac);
+	our_hmac_obj.data = our_hmac;
+	ret = make_checksum_v2(kctx, NULL, 0, &subbuf, 0,
+			       cksum_key, usage, &our_hmac_obj);
+	if (ret)
+		goto out_err;
+	/* Get the packet's hmac value */
+	ret = read_bytes_from_xdr_buf(buf, buf->len - kctx->gk5e->cksumlength,
+				      pkt_hmac, kctx->gk5e->cksumlength);
+	if (ret)
+		goto out_err;
+	if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) {
+		ret = GSS_S_BAD_SIG;
+		goto out_err;
+	}
+	*headskip = kctx->gk5e->conflen;
+	*tailskip = kctx->gk5e->cksumlength;
+out_err:
+	if (ret && ret != GSS_S_BAD_SIG)
+		ret = GSS_S_FAILURE;
+	return ret;
+}
+
+/*
+ * Compute Kseq given the initial session key and the checksum.
+ * Set the key of the given cipher.
+ *
+ * Two passes of the keyed hash gk5e->cksum_name: keyed with kctx->Ksess
+ * over four zero bytes, then keyed with that digest over the first 8
+ * bytes of cksum.  Returns 0, or the error of the step that failed.
+ */
+int
+krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+		       unsigned char *cksum)
+{
+	struct crypto_hash *hmac;
+	struct hash_desc desc;
+	struct scatterlist sg[1];
+	u8 Kseq[GSS_KRB5_MAX_KEYLEN];	/* NOTE(review): key material, not wiped before return */
+	u32 zeroconstant = 0;
+	int err;
+
+	dprintk("%s: entered\n", __func__);
+
+	hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(hmac)) {
+		dprintk("%s: error %ld, allocating hash '%s'\n",
+			__func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
+		return PTR_ERR(hmac);
+	}
+
+	desc.tfm = hmac;
+	desc.flags = 0;
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out_err;
+
+	/* Compute intermediate Kseq from session key */
+	err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
+	if (err)
+		goto out_err;
+
+	sg_init_table(sg, 1);
+	sg_set_buf(sg, &zeroconstant, 4);
+	err = crypto_hash_digest(&desc, sg, 4, Kseq);
+	if (err)
+		goto out_err;
+
+	/* Compute final Kseq from the checksum and intermediate Kseq */
+	err = crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength);
+	if (err)
+		goto out_err;
+
+	sg_set_buf(sg, cksum, 8);
+	err = crypto_hash_digest(&desc, sg, 8, Kseq);
+	if (err)
+		goto out_err;
+	/* Install Kseq as the key of the caller's cipher */
+	err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
+	if (err)
+		goto out_err;
+
+	err = 0;
+out_err:
+	crypto_free_hash(hmac);
+	dprintk("%s: returning %d\n", __func__, err);
+	return err;
+}
+
+/*
+ * Compute Kcrypt given the initial session key and the plaintext seqnum.
+ * Set the key of the given cipher.
+ *
+ * Pass one keys the hash gk5e->cksum_name with Ksess XOR 0xf0 (bytewise)
+ * and hashes four zero bytes; pass two keys it with that digest and
+ * hashes seqnum as four bytes, most significant first.  Returns 0, or
+ * the error of the step that failed.
+ */
+int
+krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+		       s32 seqnum)
+{
+	struct crypto_hash *hmac;
+	struct hash_desc desc;
+	struct scatterlist sg[1];
+	u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];	/* NOTE(review): key material, not wiped before return */
+	u8 zeroconstant[4] = {0};
+	u8 seqnumarray[4];
+	int err, i;
+
+	dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
+
+	hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(hmac)) {
+		dprintk("%s: error %ld, allocating hash '%s'\n",
+			__func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
+		return PTR_ERR(hmac);
+	}
+
+	desc.tfm = hmac;
+	desc.flags = 0;
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out_err;
+
+	/* Compute intermediate Kcrypt from session key */
+	for (i = 0; i < kctx->gk5e->keylength; i++)
+		Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
+	err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+	if (err)
+		goto out_err;
+
+	sg_init_table(sg, 1);
+	sg_set_buf(sg, zeroconstant, 4);
+	err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+	if (err)
+		goto out_err;
+
+	/* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
+	err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+	if (err)
+		goto out_err;
+	/* seqnum as four bytes, most significant first */
+	seqnumarray[0] = (unsigned char) ((seqnum >> 24) & 0xff);
+	seqnumarray[1] = (unsigned char) ((seqnum >> 16) & 0xff);
+	seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
+	seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
+
+	sg_set_buf(sg, seqnumarray, 4);
+	err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+	if (err)
+		goto out_err;
+	/* Install Kcrypt as the key of the caller's cipher */
+	err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
+	if (err)
+		goto out_err;
+
+	err = 0;
+out_err:
+	crypto_free_hash(hmac);
+	dprintk("%s: returning %d\n", __func__, err);
+	return err;
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
new file mode 100644
index 00000000..76e42e6b
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -0,0 +1,336 @@
+/*
+ * COPYRIGHT (c) 2008
+ * The Regents of the University of Michigan
+ * ALL RIGHTS RESERVED
+ *
+ * Permission is granted to use, copy, create derivative works
+ * and redistribute this software and such derivative works
+ * for any purpose, so long as the name of The University of
+ * Michigan is not used in any advertising or publicity
+ * pertaining to the use of distribution of this software
+ * without specific, written prior authorization.  If the
+ * above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any
+ * portion of this software, then the disclaimer below must
+ * also be included.
+ *
+ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION
+ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY
+ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF
+ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE
+ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR
+ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN
+ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGES.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/sunrpc/xdr.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+/*
+ * This is the n-fold function as described in rfc3961, sec 5.1
+ * Taken from MIT Kerberos and modified.
+ *
+ * inbits and outbits are the lengths of in and out in bits; both are
+ * converted to byte counts (>> 3) on entry, so only whole bytes are
+ * processed.  out is zeroed and then receives outbits/8 bytes.
+ *
+ * The loop walks lcm(inbytes, outbytes) byte positions from the last to
+ * the first, adding a rotated byte of the input into out[i % outbytes]
+ * and propagating the carry towards lower indices; a carry left over at
+ * the end is added back in starting from the last output byte.
+ */
+
+static void krb5_nfold(u32 inbits, const u8 *in,
+		       u32 outbits, u8 *out)
+{
+	int a, b, c, lcm;
+	int byte, i, msbit;
+
+	/* the code below is more readable if I make these bytes
+	   instead of bits */
+
+	inbits >>= 3;
+	outbits >>= 3;
+
+	/* first compute lcm(n,k) */
+
+	a = outbits;
+	b = inbits;
+
+	while (b != 0) {	/* Euclid's algorithm: a ends up as the gcd */
+		c = b;
+		b = a%b;
+		a = c;
+	}
+
+	lcm = outbits*inbits/a;
+
+	/* now do the real work */
+
+	memset(out, 0, outbits);
+	byte = 0;	/* running sum; bits above the low 8 are the carry */
+
+	/* this will end up cycling through k lcm(k,n)/k times, which
+	   is correct */
+	for (i = lcm-1; i >= 0; i--) {
+		/* compute the msbit in k which gets added into this byte */
+		msbit = (
+			/* first, start with the msbit in the first,
+			 * unrotated byte */
+			 ((inbits << 3) - 1)
+			 /* then, for each byte, shift to the right
+			  * for each repetition */
+			 + (((inbits << 3) + 13) * (i/inbits))
+			 /* last, pick out the correct byte within
+			  * that shifted repetition */
+			 + ((inbits - (i % inbits)) << 3)
+			 ) % (inbits << 3);
+
+		/* pull out the byte value itself: two neighbouring input
+		 * bytes are joined into 16 bits and shifted so that the
+		 * wanted 8 bits land in the low byte */
+		byte += (((in[((inbits - 1) - (msbit >> 3)) % inbits] << 8)|
+				  (in[((inbits) - (msbit >> 3)) % inbits]))
+				 >> ((msbit & 7) + 1)) & 0xff;
+
+		/* do the addition */
+		byte += out[i % outbits];
+		out[i % outbits] = byte & 0xff;
+
+		/* keep around the carry bit, if any */
+		byte >>= 8;
+
+	}
+
+	/* if there's a carry bit left over, add it back in */
+	if (byte) {
+		for (i = outbits - 1; i >= 0; i--) {
+			/* do the addition */
+			byte += out[i];
+			out[i] = byte & 0xff;
+
+			/* keep around the carry bit, if any */
+			byte >>= 8;
+		}
+	}
+}
+
+/*
+ * This is the DK (derive_key) function as described in rfc3961, sec 5.1
+ * Taken from MIT Kerberos and modified.
+ *
+ * gk5e supplies the cipher name, the block and key sizes and the encrypt
+ * and mk_key callbacks (mk_key must not be NULL).  inkey is the base key
+ * and outkey receives the derived key; both must already have
+ * len == gk5e->keylength, and outkey->data must point at storage
+ * provided by the caller.  in_constant is the usage constant; it is
+ * n-folded to one cipher block unless it already has exactly that
+ * length.  gfp_mask is used for the temporary buffers.
+ *
+ * Returns 0 on success.  Failures are reported as *positive* values:
+ * EINVAL (length mismatch, cipher allocation or setkey failure), ENOMEM
+ * (buffer allocation failure) or whatever gk5e->mk_key returned.
+ */
+
+u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
+		    const struct xdr_netobj *inkey,
+		    struct xdr_netobj *outkey,
+		    const struct xdr_netobj *in_constant,
+		    gfp_t gfp_mask)
+{
+	size_t blocksize, keybytes, keylength, n;
+	unsigned char *inblockdata, *outblockdata, *rawkey;
+	struct xdr_netobj inblock, outblock;
+	struct crypto_blkcipher *cipher;
+	u32 ret = EINVAL;
+
+	blocksize = gk5e->blocksize;
+	keybytes = gk5e->keybytes;
+	keylength = gk5e->keylength;
+
+	if ((inkey->len != keylength) || (outkey->len != keylength))
+		goto err_return;
+
+	cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0,
+					CRYPTO_ALG_ASYNC);
+	if (IS_ERR(cipher))
+		goto err_return;
+	/* the cipher exists from here on, so every exit must release it */
+	if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len))
+		goto err_free_cipher;
+
+	/* allocate and set up buffers */
+
+	ret = ENOMEM;
+	inblockdata = kmalloc(blocksize, gfp_mask);
+	if (inblockdata == NULL)
+		goto err_free_cipher;
+
+	outblockdata = kmalloc(blocksize, gfp_mask);
+	if (outblockdata == NULL)
+		goto err_free_in;
+
+	rawkey = kmalloc(keybytes, gfp_mask);
+	if (rawkey == NULL)
+		goto err_free_out;
+
+	inblock.data = (char *) inblockdata;
+	inblock.len = blocksize;
+
+	outblock.data = (char *) outblockdata;
+	outblock.len = blocksize;
+
+	/* initialize the input block */
+
+	if (in_constant->len == inblock.len) {
+		memcpy(inblock.data, in_constant->data, inblock.len);
+	} else {
+		krb5_nfold(in_constant->len * 8, in_constant->data,
+			   inblock.len * 8, inblock.data);
+	}
+
+	/* loop encrypting the blocks until enough key bytes are generated;
+	 * each output block is fed back as the next input block */
+
+	n = 0;
+	while (n < keybytes) {
+		/* NOTE(review): the result of the encrypt callback is not
+		 * checked here -- confirm whether it can fail */
+		(*(gk5e->encrypt))(cipher, NULL, inblock.data,
+				   outblock.data, inblock.len);
+
+		if ((keybytes - n) <= outblock.len) {
+			/* last (possibly partial) block */
+			memcpy(rawkey + n, outblock.data, (keybytes - n));
+			break;
+		}
+
+		memcpy(rawkey + n, outblock.data, outblock.len);
+		memcpy(inblock.data, outblock.data, outblock.len);
+		n += outblock.len;
+	}
+
+	/* postprocess the key: inblock is reused to describe the raw bytes */
+
+	inblock.data = (char *) rawkey;
+	inblock.len = keybytes;
+
+	BUG_ON(gk5e->mk_key == NULL);
+	ret = (*(gk5e->mk_key))(gk5e, &inblock, outkey);
+	if (ret) {
+		dprintk("%s: got %d from mk_key function for '%s'\n",
+			__func__, ret, gk5e->encrypt_name);
+		goto err_free_raw;
+	}
+
+	/* clean memory, free resources and exit */
+
+	ret = 0;
+
+err_free_raw:
+	memset(rawkey, 0, keybytes);
+	kfree(rawkey);
+err_free_out:
+	memset(outblockdata, 0, blocksize);
+	kfree(outblockdata);
+err_free_in:
+	memset(inblockdata, 0, blocksize);
+	kfree(inblockdata);
+err_free_cipher:
+	crypto_free_blkcipher(cipher);
+err_return:
+	return ret;
+}
+
+/*
+ * parity_char(x) XOR-folds the eight bits of x down to a single bit by
+ * repeatedly XORing the upper half of the remaining bits into the lower
+ * half (steps of 4, 2 and 1 bits).
+ */
+#define smask(step) ((1<<step)-1)
+#define pstep(x, step) (((x)&smask(step))^(((x)>>step)&smask(step)))
+#define parity_char(x) pstep(pstep(pstep((x), 4), 2), 1)
+
+/*
+ * Rewrite the least significant bit of each of the eight key bytes so
+ * that every byte ends up with an odd number of bits set.
+ */
+static void mit_des_fixup_key_parity(u8 key[8])
+{
+	u8 *cur;
+
+	for (cur = key; cur < key + 8; cur++) {
+		u8 upper = *cur & 0xfe;
+
+		*cur = upper | (1^parity_char(upper));
+	}
+}
+
+/*
+ * This is the des3 key derivation postprocess function
+ *
+ * Expands the 21 bytes in randombits into the 24-byte key: each group
+ * of seven input bytes becomes eight key bytes.  The eighth byte
+ * collects the low bit of each of the seven bytes (byte j contributes
+ * bit j+1), after which the low bit of every byte is replaced by a
+ * parity bit.  Returns 0, or (positive) EINVAL if either length is wrong.
+ */
+u32 gss_krb5_des3_make_key(const struct gss_krb5_enctype *gk5e,
+			   struct xdr_netobj *randombits,
+			   struct xdr_netobj *key)
+{
+	int blk, j;
+
+	if (key->len != 24) {
+		dprintk("%s: key->len is %d\n", __func__, key->len);
+		return EINVAL;
+	}
+	if (randombits->len != 21) {
+		dprintk("%s: randombits->len is %d\n",
+			__func__, randombits->len);
+		return EINVAL;
+	}
+
+	for (blk = 0; blk < 3; blk++) {
+		u8 *dst = key->data + blk*8;
+		u8 spare = 0;
+
+		memcpy(dst, randombits->data + blk*7, 7);
+
+		/* gather the bits that the parity fixup is about to clobber */
+		for (j = 0; j < 7; j++)
+			spare |= (dst[j] & 1) << (j + 1);
+		dst[7] = spare;
+
+		mit_des_fixup_key_parity(dst);
+	}
+	return 0;
+}
+
+/*
+ * This is the aes key derivation postprocess function
+ *
+ * The random bits are used as the key unchanged: after checking that
+ * both lengths are 16 or 32 and equal to each other, randombits is
+ * copied into key.  Returns 0, or (positive) EINVAL on a length error.
+ */
+u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
+			  struct xdr_netobj *randombits,
+			  struct xdr_netobj *key)
+{
+	if (!(key->len == 16 || key->len == 32)) {
+		dprintk("%s: key->len is %d\n", __func__, key->len);
+		return EINVAL;
+	}
+	if (!(randombits->len == 16 || randombits->len == 32)) {
+		dprintk("%s: randombits->len is %d\n",
+			__func__, randombits->len);
+		return EINVAL;
+	}
+	if (key->len != randombits->len) {
+		dprintk("%s: randombits->len is %d, key->len is %d\n",
+			__func__, randombits->len, key->len);
+		return EINVAL;
+	}
+
+	memcpy(key->data, randombits->data, key->len);
+	return 0;
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
new file mode 100644
index 00000000..c3b75333
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -0,0 +1,774 @@
+/*
+ *  linux/net/sunrpc/gss_krb5_mech.c
+ *
+ *  Copyright (c) 2001-2008 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/crypto.h>
+#include <linux/sunrpc/gss_krb5_enctypes.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+static struct gss_api_mech gss_kerberos_mech;	/* forward declaration */
+
+static const struct gss_krb5_enctype supported_gss_krb5_enctypes[] = {
+	/*
+	 * Each entry describes one enctype: the crypto API algorithm names
+	 * used for encryption and checksumming, the callbacks, and the
+	 * .keybytes/.keylength/.blocksize/.conflen/.cksumlength sizes.
+	 * Entries are located by a linear search on .etype.
+	 *
+	 * DES (All DES enctypes are mapped to the same gss functionality)
+	 */
+	{
+	  .etype = ENCTYPE_DES_CBC_RAW,
+	  .ctype = CKSUMTYPE_RSA_MD5,
+	  .name = "des-cbc-crc",
+	  .encrypt_name = "cbc(des)",
+	  .cksum_name = "md5",
+	  .encrypt = krb5_encrypt,
+	  .decrypt = krb5_decrypt,
+	  .mk_key = NULL,	/* no key derivation for this enctype */
+	  .signalg = SGN_ALG_DES_MAC_MD5,
+	  .sealalg = SEAL_ALG_DES,
+	  .keybytes = 7,
+	  .keylength = 8,
+	  .blocksize = 8,
+	  .conflen = 8,
+	  .cksumlength = 8,
+	  .keyed_cksum = 0,
+	},
+	/*
+	 * RC4-HMAC
+	 */
+	{
+	  .etype = ENCTYPE_ARCFOUR_HMAC,
+	  .ctype = CKSUMTYPE_HMAC_MD5_ARCFOUR,
+	  .name = "rc4-hmac",
+	  .encrypt_name = "ecb(arc4)",
+	  .cksum_name = "hmac(md5)",
+	  .encrypt = krb5_encrypt,
+	  .decrypt = krb5_decrypt,
+	  .mk_key = NULL,	/* no key derivation for this enctype */
+	  .signalg = SGN_ALG_HMAC_MD5,
+	  .sealalg = SEAL_ALG_MICROSOFT_RC4,
+	  .keybytes = 16,
+	  .keylength = 16,
+	  .blocksize = 1,
+	  .conflen = 8,
+	  .cksumlength = 8,
+	  .keyed_cksum = 1,
+	},
+	/*
+	 * 3DES
+	 */
+	{
+	  .etype = ENCTYPE_DES3_CBC_RAW,
+	  .ctype = CKSUMTYPE_HMAC_SHA1_DES3,
+	  .name = "des3-hmac-sha1",
+	  .encrypt_name = "cbc(des3_ede)",
+	  .cksum_name = "hmac(sha1)",
+	  .encrypt = krb5_encrypt,
+	  .decrypt = krb5_decrypt,
+	  .mk_key = gss_krb5_des3_make_key,
+	  .signalg = SGN_ALG_HMAC_SHA1_DES3_KD,
+	  .sealalg = SEAL_ALG_DES3KD,
+	  .keybytes = 21,	/* random bytes consumed by .mk_key */
+	  .keylength = 24,	/* key bytes produced by .mk_key */
+	  .blocksize = 8,
+	  .conflen = 8,
+	  .cksumlength = 20,
+	  .keyed_cksum = 1,
+	},
+	/*
+	 * AES128
+	 */
+	{
+	  .etype = ENCTYPE_AES128_CTS_HMAC_SHA1_96,
+	  .ctype = CKSUMTYPE_HMAC_SHA1_96_AES128,
+	  .name = "aes128-cts",
+	  .encrypt_name = "cts(cbc(aes))",
+	  .cksum_name = "hmac(sha1)",
+	  .encrypt = krb5_encrypt,
+	  .decrypt = krb5_decrypt,
+	  .mk_key = gss_krb5_aes_make_key,
+	  .encrypt_v2 = gss_krb5_aes_encrypt,
+	  .decrypt_v2 = gss_krb5_aes_decrypt,
+	  .signalg = -1,
+	  .sealalg = -1,
+	  .keybytes = 16,
+	  .keylength = 16,
+	  .blocksize = 16,
+	  .conflen = 16,
+	  .cksumlength = 12,
+	  .keyed_cksum = 1,
+	},
+	/*
+	 * AES256
+	 */
+	{
+	  .etype = ENCTYPE_AES256_CTS_HMAC_SHA1_96,
+	  .ctype = CKSUMTYPE_HMAC_SHA1_96_AES256,
+	  .name = "aes256-cts",
+	  .encrypt_name = "cts(cbc(aes))",
+	  .cksum_name = "hmac(sha1)",
+	  .encrypt = krb5_encrypt,
+	  .decrypt = krb5_decrypt,
+	  .mk_key = gss_krb5_aes_make_key,
+	  .encrypt_v2 = gss_krb5_aes_encrypt,
+	  .decrypt_v2 = gss_krb5_aes_decrypt,
+	  .signalg = -1,
+	  .sealalg = -1,
+	  .keybytes = 32,
+	  .keylength = 32,
+	  .blocksize = 16,
+	  .conflen = 16,
+	  .cksumlength = 12,
+	  .keyed_cksum = 1,
+	},
+};
+
+static const int num_supported_enctypes =
+	ARRAY_SIZE(supported_gss_krb5_enctypes);	/* entry count of the table above */
+
+/*
+ * Return 1 if the enctype table has an entry whose .etype equals etype,
+ * 0 otherwise.
+ */
+static int
+supported_gss_krb5_enctype(int etype)
+{
+	const struct gss_krb5_enctype *entry = supported_gss_krb5_enctypes;
+	const struct gss_krb5_enctype *stop = entry + num_supported_enctypes;
+
+	while (entry < stop) {
+		if (entry->etype == etype)
+			return 1;
+		entry++;
+	}
+	return 0;
+}
+
+/*
+ * Look up the enctype table entry whose .etype equals etype; returns
+ * NULL when there is none.
+ */
+static const struct gss_krb5_enctype *
+get_gss_krb5_enctype(int etype)
+{
+	const struct gss_krb5_enctype *entry = supported_gss_krb5_enctypes;
+	const struct gss_krb5_enctype *stop = entry + num_supported_enctypes;
+
+	while (entry < stop) {
+		if (entry->etype == etype)
+			return entry;
+		entry++;
+	}
+	return NULL;
+}
+
+/*
+ * Copy len bytes from p into res and return the position just past
+ * them.  Returns ERR_PTR(-EFAULT), without copying, if that position
+ * would lie beyond end or wrap around below p.
+ */
+static const void *
+simple_get_bytes(const void *p, const void *end, void *res, int len)
+{
+	const char *from = p;
+	const char *next = from + len;
+
+	if (unlikely((const void *)next > end || next < from))
+		return ERR_PTR(-EFAULT);
+
+	memcpy(res, from, len);
+	return next;
+}
+
+/*
+ * Read an unsigned int length followed by that many bytes from p.  The
+ * bytes are duplicated with kmemdup(GFP_NOFS) into res->data (owned by
+ * the caller afterwards) and res->len is set.  Returns the position
+ * past the object, or ERR_PTR(-EFAULT) if it does not fit before end,
+ * or ERR_PTR(-ENOMEM) if the copy cannot be allocated.
+ */
+static const void *
+simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
+{
+	unsigned int objlen;
+	const char *body;
+	const char *next;
+
+	p = simple_get_bytes(p, end, &objlen, sizeof(objlen));
+	if (IS_ERR(p))
+		return p;
+
+	body = p;
+	next = body + objlen;
+	if (unlikely((const void *)next > end || next < body))
+		return ERR_PTR(-EFAULT);
+
+	res->data = kmemdup(body, objlen, GFP_NOFS);
+	if (unlikely(!res->data))
+		return ERR_PTR(-ENOMEM);
+
+	res->len = objlen;
+	return next;
+}
+
+/*
+ * Read one key (an int algorithm id followed by a length-prefixed key
+ * blob) from p and turn it into a keyed block cipher stored in *res.
+ *
+ * The algorithm id is only validated against the enctype table; the
+ * cipher itself is allocated by ctx->gk5e->encrypt_name, so ctx->gk5e
+ * must already be set.  The temporary copy of the key blob is always
+ * freed before returning.
+ *
+ * Returns the position past the key on success.  On failure returns an
+ * ERR_PTR (the error of the failed read, or -EINVAL), and whenever a
+ * cipher allocation was attempted *res is left NULL, never pointing at
+ * a released transform.
+ */
+static inline const void *
+get_key(const void *p, const void *end,
+	struct krb5_ctx *ctx, struct crypto_blkcipher **res)
+{
+	struct xdr_netobj	key;
+	int			alg;
+
+	p = simple_get_bytes(p, end, &alg, sizeof(alg));
+	if (IS_ERR(p))
+		goto out_err;
+
+	switch (alg) {
+	case ENCTYPE_DES_CBC_CRC:
+	case ENCTYPE_DES_CBC_MD4:
+	case ENCTYPE_DES_CBC_MD5:
+		/* Map all these key types to ENCTYPE_DES_CBC_RAW */
+		alg = ENCTYPE_DES_CBC_RAW;
+		break;
+	}
+
+	if (!supported_gss_krb5_enctype(alg)) {
+		printk(KERN_WARNING "gss_kerberos_mech: unsupported "
+			"encryption key algorithm %d\n", alg);
+		p = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+	p = simple_get_netobj(p, end, &key);
+	if (IS_ERR(p))
+		goto out_err;
+
+	*res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+							CRYPTO_ALG_ASYNC);
+	if (IS_ERR(*res)) {
+		printk(KERN_WARNING "gss_kerberos_mech: unable to initialize "
+			"crypto algorithm %s\n", ctx->gk5e->encrypt_name);
+		*res = NULL;
+		goto out_err_free_key;
+	}
+	if (crypto_blkcipher_setkey(*res, key.data, key.len)) {
+		printk(KERN_WARNING "gss_kerberos_mech: error setting key for "
+			"crypto algorithm %s\n", ctx->gk5e->encrypt_name);
+		goto out_err_free_tfm;
+	}
+
+	kfree(key.data);
+	return p;
+
+out_err_free_tfm:
+	crypto_free_blkcipher(*res);
+	/* do not leave a dangling transform pointer in the caller's context */
+	*res = NULL;
+out_err_free_key:
+	kfree(key.data);
+	p = ERR_PTR(-EINVAL);
+out_err:
+	return p;
+}
+
+/*
+ * Import an old-format context blob, which can only describe a DES
+ * context.  Fields are consumed in order: initiate flag, twenty ignored
+ * bytes, signing algorithm, sealing algorithm, end time, send sequence
+ * number, mechanism OID, encryption key, sequence key; the blob must
+ * end exactly there.
+ *
+ * Returns 0 on success, otherwise a negative errno, in which case the
+ * mechanism OID copy and any ciphers created here have been released.
+ */
+static int
+gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
+{
+	const char *skipped;
+	int alg;
+
+	p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate));
+	if (IS_ERR(p))
+		goto fail;
+
+	/* the old format is DES-only; everything else uses the new format */
+	ctx->enctype = ENCTYPE_DES_CBC_RAW;
+	ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
+	if (!ctx->gk5e) {
+		p = ERR_PTR(-EINVAL);
+		goto fail;
+	}
+
+	/*
+	 * The downcall format predates a full understanding of how the
+	 * context fields are used: some fields only get minimal sanity
+	 * checks and the next twenty bytes are skipped outright.
+	 */
+	skipped = (const char *)p + 20;
+	if (unlikely((const void *)skipped > end ||
+		     skipped < (const char *)p)) {
+		p = ERR_PTR(-EFAULT);
+		goto fail;
+	}
+	p = skipped;
+
+	/* signing algorithm: only DES MAC MD5 is accepted */
+	p = simple_get_bytes(p, end, &alg, sizeof(alg));
+	if (IS_ERR(p))
+		goto fail;
+	if (alg != SGN_ALG_DES_MAC_MD5) {
+		p = ERR_PTR(-ENOSYS);
+		goto fail;
+	}
+
+	/* sealing algorithm: only DES is accepted */
+	p = simple_get_bytes(p, end, &alg, sizeof(alg));
+	if (IS_ERR(p))
+		goto fail;
+	if (alg != SEAL_ALG_DES) {
+		p = ERR_PTR(-ENOSYS);
+		goto fail;
+	}
+
+	p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+	if (IS_ERR(p))
+		goto fail;
+
+	p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send));
+	if (IS_ERR(p))
+		goto fail;
+
+	p = simple_get_netobj(p, end, &ctx->mech_used);
+	if (IS_ERR(p))
+		goto fail;
+
+	p = get_key(p, end, ctx, &ctx->enc);
+	if (IS_ERR(p))
+		goto fail_free_mech;
+
+	p = get_key(p, end, ctx, &ctx->seq);
+	if (IS_ERR(p))
+		goto fail_free_enc;
+
+	/* anything left over means the blob is malformed */
+	if (p == end)
+		return 0;
+	p = ERR_PTR(-EFAULT);
+
+	crypto_free_blkcipher(ctx->seq);
+fail_free_enc:
+	crypto_free_blkcipher(ctx->enc);
+fail_free_mech:
+	kfree(ctx->mech_used.data);
+fail:
+	return PTR_ERR(p);
+}
+
+/*
+ * Allocate the block cipher named cname and key it with the first
+ * ctx->gk5e->keylength bytes of key.  Returns the cipher, or NULL if
+ * either the allocation or the setkey step fails (nothing is leaked).
+ */
+struct crypto_blkcipher *
+context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key)
+{
+	struct crypto_blkcipher *tfm;
+
+	tfm = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm)) {
+		dprintk("gss_kerberos_mech: unable to initialize "
+			"crypto algorithm %s\n", cname);
+		return NULL;
+	}
+
+	if (crypto_blkcipher_setkey(tfm, key, ctx->gk5e->keylength) == 0)
+		return tfm;
+
+	dprintk("gss_kerberos_mech: error setting key for "
+		"crypto algorithm %s\n", cname);
+	crypto_free_blkcipher(tfm);
+	return NULL;
+}
+
+/*
+ * Fill the five-byte derivation constant: usage as four bytes, most
+ * significant byte first, followed by the seed byte.
+ */
+static inline void
+set_cdata(u8 cdata[GSS_KRB5_K5CLENGTH], u32 usage, u8 seed)
+{
+	int i;
+
+	for (i = 0; i < 4; i++)
+		cdata[i] = (usage >> (24 - 8 * i)) & 0xff;
+	cdata[4] = seed;
+}
+
+/*
+ * Set up the des3 context: ctx->seq and ctx->enc are both keyed with
+ * the raw session key, and the checksum key is derived into ctx->cksum
+ * with the KG_USAGE_SIGN / KEY_USAGE_SEED_CHECKSUM constant.
+ *
+ * Returns 0 on success or -EINVAL on any failure, after releasing the
+ * ciphers allocated here.
+ */
+static int
+context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask)
+{
+	u8 constant[GSS_KRB5_K5CLENGTH];
+	struct xdr_netobj usage, base, derived;
+	u32 err;
+
+	/* seq uses the raw key */
+	ctx->seq = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name,
+					   ctx->Ksess);
+	if (!ctx->seq)
+		return -EINVAL;
+
+	ctx->enc = context_v2_alloc_cipher(ctx, ctx->gk5e->encrypt_name,
+					   ctx->Ksess);
+	if (!ctx->enc)
+		goto free_seq;
+
+	/* derive cksum */
+	set_cdata(constant, KG_USAGE_SIGN, KEY_USAGE_SEED_CHECKSUM);
+	usage.data = constant;
+	usage.len = GSS_KRB5_K5CLENGTH;
+	base.data = ctx->Ksess;
+	base.len = ctx->gk5e->keylength;
+	derived.data = ctx->cksum;
+	derived.len = ctx->gk5e->keylength;
+
+	err = krb5_derive_key(ctx->gk5e, &base, &derived, &usage, gfp_mask);
+	if (!err)
+		return 0;
+
+	dprintk("%s: Error %d deriving cksum key\n",
+		__func__, err);
+	crypto_free_blkcipher(ctx->enc);
+free_seq:
+	crypto_free_blkcipher(ctx->seq);
+	return -EINVAL;
+}
+
+/*
+ * Note that RC4 depends on deriving keys using the sequence
+ * number or the checksum of a token.  Therefore, the final keys
+ * cannot be calculated until the token is being constructed!
+ *
+ * What is done here: the checksum key ctx->cksum is computed by keying
+ * the hash named by gk5e->cksum_name with the session key and digesting
+ * the string "signaturekey" including its terminating NUL; then
+ * ctx->enc and ctx->seq are allocated, but no key is set on them.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+context_derive_keys_rc4(struct krb5_ctx *ctx)
+{
+	struct crypto_hash *hmac;
+	char sigkeyconstant[] = "signaturekey";
+	int slen = strlen(sigkeyconstant) + 1;	/* include null terminator */
+	struct hash_desc desc;
+	struct scatterlist sg[1];
+	int err;
+
+	dprintk("RPC:       %s: entered\n", __func__);
+	/*
+	 * derive cksum (aka Ksign) key
+	 */
+	hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(hmac)) {
+		dprintk("%s: error %ld allocating hash '%s'\n",
+			__func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
+		err = PTR_ERR(hmac);
+		goto out_err;	/* no hash to free on this path */
+	}
+
+	err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
+	if (err)
+		goto out_err_free_hmac;
+
+	sg_init_table(sg, 1);
+	sg_set_buf(sg, sigkeyconstant, slen);
+
+	desc.tfm = hmac;
+	desc.flags = 0;
+
+	err = crypto_hash_init(&desc);
+	if (err)
+		goto out_err_free_hmac;
+
+	err = crypto_hash_digest(&desc, sg, slen, ctx->cksum);	/* result is the checksum key */
+	if (err)
+		goto out_err_free_hmac;
+	/*
+	 * allocate blkciphers for data and seqnum encryption; neither is
+	 * keyed in this function
+	 */
+	ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+					  CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ctx->enc)) {
+		err = PTR_ERR(ctx->enc);
+		goto out_err_free_hmac;
+	}
+
+	ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+					  CRYPTO_ALG_ASYNC);
+	if (IS_ERR(ctx->seq)) {
+		crypto_free_blkcipher(ctx->enc);	/* undo the first allocation */
+		err = PTR_ERR(ctx->seq);
+		goto out_err_free_hmac;
+	}
+
+	dprintk("RPC:       %s: returning success\n", __func__);
+
+	err = 0;
+
+out_err_free_hmac:	/* also reached on success: the hash is only needed here */
+	crypto_free_hash(hmac);
+out_err:
+	dprintk("RPC:       %s: returning %d\n", __func__, err);
+	return err;
+}
+
+/*
+ * Derive one key from the session key: build the five-byte constant
+ * from usage and seed and run krb5_derive_key() with out as the
+ * destination (ctx->gk5e->keylength bytes).  Returns what
+ * krb5_derive_key() returned.
+ */
+static u32
+derive_usage_key(struct krb5_ctx *ctx, u32 usage, u8 seed, u8 *out,
+		 gfp_t gfp_mask)
+{
+	u8 cdata[GSS_KRB5_K5CLENGTH];
+	struct xdr_netobj c, keyin, keyout;
+
+	set_cdata(cdata, usage, seed);
+	c.len = GSS_KRB5_K5CLENGTH;
+	c.data = cdata;
+
+	keyin.data = ctx->Ksess;
+	keyin.len = ctx->gk5e->keylength;
+	keyout.data = out;
+	keyout.len = ctx->gk5e->keylength;
+
+	return krb5_derive_key(ctx->gk5e, &keyin, &keyout, &c, gfp_mask);
+}
+
+/*
+ * Derive every key of a "new" (AES) context from the session key, in
+ * this order: initiator seal, acceptor seal, initiator sign, acceptor
+ * sign, initiator integrity, acceptor integrity.  A cipher is created
+ * for each seal key, and for the AES enctypes an additional "cbc(aes)"
+ * cipher is created per direction.
+ *
+ * Returns 0 on success or -EINVAL on any failure, after releasing the
+ * ciphers allocated here.
+ */
+static int
+context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
+{
+	u32 err;
+
+	/* initiator seal encryption */
+	err = derive_usage_key(ctx, KG_USAGE_INITIATOR_SEAL,
+			       KEY_USAGE_SEED_ENCRYPTION,
+			       ctx->initiator_seal, gfp_mask);
+	if (err) {
+		dprintk("%s: Error %d deriving initiator_seal key\n",
+			__func__, err);
+		return -EINVAL;
+	}
+	ctx->initiator_enc = context_v2_alloc_cipher(ctx,
+						     ctx->gk5e->encrypt_name,
+						     ctx->initiator_seal);
+	if (!ctx->initiator_enc)
+		return -EINVAL;
+
+	/* acceptor seal encryption */
+	err = derive_usage_key(ctx, KG_USAGE_ACCEPTOR_SEAL,
+			       KEY_USAGE_SEED_ENCRYPTION,
+			       ctx->acceptor_seal, gfp_mask);
+	if (err) {
+		dprintk("%s: Error %d deriving acceptor_seal key\n",
+			__func__, err);
+		goto free_initiator_enc;
+	}
+	ctx->acceptor_enc = context_v2_alloc_cipher(ctx,
+						    ctx->gk5e->encrypt_name,
+						    ctx->acceptor_seal);
+	if (!ctx->acceptor_enc)
+		goto free_initiator_enc;
+
+	/* initiator sign checksum */
+	err = derive_usage_key(ctx, KG_USAGE_INITIATOR_SIGN,
+			       KEY_USAGE_SEED_CHECKSUM,
+			       ctx->initiator_sign, gfp_mask);
+	if (err) {
+		dprintk("%s: Error %d deriving initiator_sign key\n",
+			__func__, err);
+		goto free_acceptor_enc;
+	}
+
+	/* acceptor sign checksum */
+	err = derive_usage_key(ctx, KG_USAGE_ACCEPTOR_SIGN,
+			       KEY_USAGE_SEED_CHECKSUM,
+			       ctx->acceptor_sign, gfp_mask);
+	if (err) {
+		dprintk("%s: Error %d deriving acceptor_sign key\n",
+			__func__, err);
+		goto free_acceptor_enc;
+	}
+
+	/* initiator seal integrity */
+	err = derive_usage_key(ctx, KG_USAGE_INITIATOR_SEAL,
+			       KEY_USAGE_SEED_INTEGRITY,
+			       ctx->initiator_integ, gfp_mask);
+	if (err) {
+		dprintk("%s: Error %d deriving initiator_integ key\n",
+			__func__, err);
+		goto free_acceptor_enc;
+	}
+
+	/* acceptor seal integrity */
+	err = derive_usage_key(ctx, KG_USAGE_ACCEPTOR_SEAL,
+			       KEY_USAGE_SEED_INTEGRITY,
+			       ctx->acceptor_integ, gfp_mask);
+	if (err) {
+		dprintk("%s: Error %d deriving acceptor_integ key\n",
+			__func__, err);
+		goto free_acceptor_enc;
+	}
+
+	/* the AES enctypes additionally get one plain CBC cipher per side */
+	if (ctx->enctype == ENCTYPE_AES128_CTS_HMAC_SHA1_96 ||
+	    ctx->enctype == ENCTYPE_AES256_CTS_HMAC_SHA1_96) {
+		ctx->initiator_enc_aux =
+			context_v2_alloc_cipher(ctx, "cbc(aes)",
+						ctx->initiator_seal);
+		if (!ctx->initiator_enc_aux)
+			goto free_acceptor_enc;
+
+		ctx->acceptor_enc_aux =
+			context_v2_alloc_cipher(ctx, "cbc(aes)",
+						ctx->acceptor_seal);
+		if (!ctx->acceptor_enc_aux) {
+			crypto_free_blkcipher(ctx->initiator_enc_aux);
+			goto free_acceptor_enc;
+		}
+	}
+
+	return 0;
+
+free_acceptor_enc:
+	crypto_free_blkcipher(ctx->acceptor_enc);
+free_initiator_enc:
+	crypto_free_blkcipher(ctx->initiator_enc);
+	return -EINVAL;
+}
+
+/*
+ * Import a new-format context blob.  Fields are consumed in order:
+ * flags, end time, 64-bit send sequence number, enctype, and a session
+ * key of gk5e->keylength bytes; the blob must end exactly there.  The
+ * mechanism OID is then copied from gss_kerberos_mech and the
+ * per-enctype key derivation is run.
+ *
+ * Returns 0 on success or a negative errno.  On failure the copy of
+ * the mechanism OID is released here, because the caller only
+ * kfree()s the context structure itself.
+ */
+static int
+gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
+		gfp_t gfp_mask)
+{
+	int keylen;
+	int ret;
+
+	p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
+	if (IS_ERR(p))
+		goto out_err;
+	ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR;
+
+	p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+	if (IS_ERR(p))
+		goto out_err;
+	p = simple_get_bytes(p, end, &ctx->seq_send64, sizeof(ctx->seq_send64));
+	if (IS_ERR(p))
+		goto out_err;
+	/* set seq_send for use by "older" enctypes */
+	ctx->seq_send = ctx->seq_send64;
+	/* reject values that do not survive the narrowing assignment */
+	if (ctx->seq_send64 != ctx->seq_send) {
+		dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
+			(long unsigned)ctx->seq_send64, ctx->seq_send);
+		p = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+	p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype));
+	if (IS_ERR(p))
+		goto out_err;
+	/* Map ENCTYPE_DES3_CBC_SHA1 to ENCTYPE_DES3_CBC_RAW */
+	if (ctx->enctype == ENCTYPE_DES3_CBC_SHA1)
+		ctx->enctype = ENCTYPE_DES3_CBC_RAW;
+	ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
+	if (ctx->gk5e == NULL) {
+		dprintk("gss_kerberos_mech: unsupported krb5 enctype %u\n",
+			ctx->enctype);
+		p = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+	keylen = ctx->gk5e->keylength;
+
+	p = simple_get_bytes(p, end, ctx->Ksess, keylen);
+	if (IS_ERR(p))
+		goto out_err;
+
+	if (p != end) {
+		p = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+
+	ctx->mech_used.data = kmemdup(gss_kerberos_mech.gm_oid.data,
+				      gss_kerberos_mech.gm_oid.len, gfp_mask);
+	if (unlikely(ctx->mech_used.data == NULL)) {
+		p = ERR_PTR(-ENOMEM);
+		goto out_err;
+	}
+	ctx->mech_used.len = gss_kerberos_mech.gm_oid.len;
+
+	switch (ctx->enctype) {
+	case ENCTYPE_DES3_CBC_RAW:
+		ret = context_derive_keys_des3(ctx, gfp_mask);
+		break;
+	case ENCTYPE_ARCFOUR_HMAC:
+		ret = context_derive_keys_rc4(ctx);
+		break;
+	case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+	case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+		ret = context_derive_keys_new(ctx, gfp_mask);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	if (ret) {
+		/* do not leak the OID copy made above */
+		kfree(ctx->mech_used.data);
+		ctx->mech_used.data = NULL;
+		ctx->mech_used.len = 0;
+	}
+	return ret;
+
+out_err:
+	return PTR_ERR(p);
+}
+
+/*
+ * Build a krb5 context from the len bytes at p.  A blob of exactly 85
+ * bytes is parsed as the old format, anything else as the new format.
+ * On success the new context is stored in ctx_id->internal_ctx_id; on
+ * failure it is freed.  Returns 0 or a negative errno.
+ */
+static int
+gss_import_sec_context_kerberos(const void *p, size_t len,
+				struct gss_ctx *ctx_id,
+				gfp_t gfp_mask)
+{
+	const void *end = (const char *)p + len;
+	struct krb5_ctx *kctx;
+	int ret;
+
+	kctx = kzalloc(sizeof(*kctx), gfp_mask);
+	if (!kctx)
+		return -ENOMEM;
+
+	ret = (len == 85) ? gss_import_v1_context(p, end, kctx)
+			  : gss_import_v2_context(p, end, kctx, gfp_mask);
+
+	if (ret)
+		kfree(kctx);
+	else
+		ctx_id->internal_ctx_id = kctx;
+
+	dprintk("RPC:       %s: returning %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * Release a context created by the import function: all six cipher
+ * handles, the mechanism OID copy and the context structure itself.
+ */
+static void
+gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+	struct krb5_ctx *kctx = internal_ctx;
+	struct crypto_blkcipher *tfms[] = {
+		kctx->seq,
+		kctx->enc,
+		kctx->acceptor_enc,
+		kctx->initiator_enc,
+		kctx->acceptor_enc_aux,
+		kctx->initiator_enc_aux,
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tfms); i++)
+		crypto_free_blkcipher(tfms[i]);
+
+	kfree(kctx->mech_used.data);
+	kfree(kctx);
+}
+
+static const struct gss_api_ops gss_kerberos_ops = {	/* operations table referenced by gss_kerberos_mech.gm_ops */
+	.gss_import_sec_context	= gss_import_sec_context_kerberos,
+	.gss_get_mic		= gss_get_mic_kerberos,	/* not defined in this file */
+	.gss_verify_mic		= gss_verify_mic_kerberos,	/* not defined in this file */
+	.gss_wrap		= gss_wrap_kerberos,	/* not defined in this file */
+	.gss_unwrap		= gss_unwrap_kerberos,	/* not defined in this file */
+	.gss_delete_sec_context	= gss_delete_sec_context_kerberos,
+};
+
+static struct pf_desc gss_kerberos_pfs[] = {	/* pseudoflavor, service and name, one entry per flavor */
+	[0] = {
+		.pseudoflavor = RPC_AUTH_GSS_KRB5,
+		.service = RPC_GSS_SVC_NONE,
+		.name = "krb5",
+	},
+	[1] = {
+		.pseudoflavor = RPC_AUTH_GSS_KRB5I,
+		.service = RPC_GSS_SVC_INTEGRITY,
+		.name = "krb5i",
+	},
+	[2] = {
+		.pseudoflavor = RPC_AUTH_GSS_KRB5P,
+		.service = RPC_GSS_SVC_PRIVACY,
+		.name = "krb5p",
+	},
+};
+
+static struct gss_api_mech gss_kerberos_mech = {	/* registered at module init, unregistered at module exit */
+	.gm_name	= "krb5",
+	.gm_owner	= THIS_MODULE,
+	.gm_oid		= {9, (void *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02"},	/* length, then the 9 OID bytes; presumably 1.2.840.113554.1.2.2 -- verify */
+	.gm_ops		= &gss_kerberos_ops,
+	.gm_pf_num	= ARRAY_SIZE(gss_kerberos_pfs),
+	.gm_pfs		= gss_kerberos_pfs,
+	.gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES,
+};
+
+/*
+ * Module entry point: register the kerberos mechanism and pass the
+ * registration status back, logging a message if it failed.
+ */
+static int __init init_kerberos_module(void)
+{
+	int status = gss_mech_register(&gss_kerberos_mech);
+
+	if (status != 0)
+		printk("Failed to register kerberos gss mechanism!\n");
+
+	return status;
+}
+
+/* Module exit point: undo the registration done at init time. */
+static void __exit cleanup_kerberos_module(void)
+{
+	gss_mech_unregister(&gss_kerberos_mech);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_kerberos_module);
+module_exit(cleanup_kerberos_module);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
new file mode 100644
index 00000000..d7941eab
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -0,0 +1,223 @@
+/*
+ *  linux/net/sunrpc/gss_krb5_seal.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c
+ *
+ *  Copyright (c) 2000-2008 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson	<andros@umich.edu>
+ *  J. Bruce Fields	<bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/random.h>
+#include <linux/crypto.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+/*
+ * One global lock guarding the send sequence counters of every krb5
+ * context (seq_send and seq_send64); it is held only around the
+ * post-increment that hands out the next sequence number.
+ */
+DEFINE_SPINLOCK(krb5_seq_lock);
+
+/*
+ * Lay out the fixed part of an RFC 1964 (v1) MIC token in token->data.
+ *
+ * token->len is set to the full token size for a body of the 8+8 byte
+ * krb5 header plus this enctype's checksum.  The generic GSS header is
+ * written first; the four 16-bit fields of the krb5 header (rfc 1964,
+ * section 1.2.1) follow it.
+ *
+ * Returns a pointer to the start of the krb5 header inside token->data.
+ * The caller owns the buffer; its capacity is not checked here.
+ */
+static char *
+setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
+{
+	int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
+	__be16 *cursor, *hdr;
+
+	token->len = g_token_size(&ctx->mech_used, body_size);
+
+	/* g_make_token_header() advances the cursor past what it writes */
+	cursor = (__be16 *)token->data;
+	g_make_token_header(&ctx->mech_used, body_size,
+			    (unsigned char **)&cursor);
+
+	hdr = cursor;
+	hdr[0] = KG_TOK_MIC_MSG;			/* token id */
+	hdr[1] = cpu_to_le16(ctx->gk5e->signalg);	/* signing algorithm */
+	hdr[2] = SEAL_ALG_NONE;				/* MIC tokens are not sealed */
+	hdr[3] = 0xffff;				/* filler */
+
+	return (char *)hdr;
+}
+
+/*
+ * Lay out the first eight bytes of an RFC 4121 (v2) MIC token in
+ * token->data and set token->len to header plus checksum length.
+ *
+ * Per rfc 4121, sec 4.2.6.1, there is no generic GSS header in front of
+ * a v2 token, so the token starts at token->data.  Bytes 8..15 (the
+ * sequence number) are left for the caller to fill in.
+ *
+ * Returns a pointer to the start of the token.
+ */
+static void *
+setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
+{
+	__be16 *hdr = (__be16 *)token->data;
+	u8 *octets = (u8 *)&hdr[1];
+	u8 flags = 0x00;
+
+	/* 0x01: sent by acceptor; 0x04: acceptor subkey in use */
+	if (!(ctx->flags & KRB5_CTX_FLAG_INITIATOR))
+		flags |= 0x01;
+	if (ctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY)
+		flags |= 0x04;
+
+	hdr[0] = KG2_TOK_MIC;	/* bytes 0-1: token id */
+	octets[0] = flags;	/* byte 2: flags */
+	octets[1] = 0xff;	/* byte 3: filler */
+	hdr[2] = 0xffff;	/* bytes 4-5: filler */
+	hdr[3] = 0xffff;	/* bytes 6-7: filler */
+
+	token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
+	return hdr;
+}
+
+/*
+ * Produce an RFC 1964 (v1) MIC token over @text in @token.
+ *
+ * The checksum covers the first 8 bytes of the krb5 header followed by
+ * @text, and is stored after the 16-byte header.  The send sequence
+ * number is then encrypted into header bytes 8..15, with the direction
+ * byte 0 for an initiator and 0xff otherwise.
+ *
+ * Returns GSS_S_FAILURE if checksumming or sequence-number encryption
+ * fails, GSS_S_CONTEXT_EXPIRED if the context end time is before the
+ * time sampled on entry, and GSS_S_COMPLETE otherwise.  Note that the
+ * token has been fully built even when GSS_S_CONTEXT_EXPIRED is returned.
+ */
+static u32
+gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
+		struct xdr_netobj *token)
+{
+	char			digest[GSS_KRB5_MAX_CKSUM_LEN];
+	struct xdr_netobj	cksum = {.data = digest,
+					 .len = sizeof(digest)};
+	u8			*key;
+	void			*hdr;
+	u32			seq;
+	s32			now;
+
+	dprintk("RPC:       %s\n", __func__);
+	BUG_ON(ctx == NULL);
+
+	now = get_seconds();
+
+	hdr = setup_token(ctx, token);
+
+	/* unkeyed checksum types take no key at all */
+	key = ctx->gk5e->keyed_cksum ? ctx->cksum : NULL;
+
+	if (make_checksum(ctx, hdr, 8, text, 0, key, KG_USAGE_SIGN, &cksum))
+		return GSS_S_FAILURE;
+
+	memcpy(hdr + GSS_KRB5_TOK_HDR_LEN, cksum.data, cksum.len);
+
+	spin_lock(&krb5_seq_lock);
+	seq = ctx->seq_send++;
+	spin_unlock(&krb5_seq_lock);
+
+	if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq,
+			      hdr + GSS_KRB5_TOK_HDR_LEN, hdr + 8))
+		return GSS_S_FAILURE;
+
+	if (ctx->endtime < now)
+		return GSS_S_CONTEXT_EXPIRED;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Produce an RFC 4121 (v2) MIC token over @text in @token.
+ *
+ * The 64-bit send sequence number is stored big-endian and in the clear
+ * at header offset 8, without a direction indicator.  The checksum is
+ * computed over @text and the 16-byte header with the signing key and
+ * key usage that match this side's role, and is stored after the header.
+ *
+ * Returns GSS_S_FAILURE if checksumming fails, GSS_S_CONTEXT_EXPIRED if
+ * the context end time is before the current time, and GSS_S_COMPLETE
+ * otherwise.
+ */
+u32
+gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
+		struct xdr_netobj *token)
+{
+	char digest[GSS_KRB5_MAX_CKSUM_LEN];
+	struct xdr_netobj cksum = { .data = digest,
+				    .len = sizeof(digest) };
+	unsigned int usage;
+	__be64 *seq_field;
+	void *hdr;
+	u64 seq;
+	u8 *key;
+	s32 now;
+
+	dprintk("RPC:       %s\n", __func__);
+
+	hdr = setup_token_v2(ctx, token);
+
+	spin_lock(&krb5_seq_lock);
+	seq = ctx->seq_send64++;
+	spin_unlock(&krb5_seq_lock);
+
+	seq_field = (__be64 *)(hdr + 8);
+	*seq_field = cpu_to_be64(seq);
+
+	if (ctx->initiate) {
+		usage = KG_USAGE_INITIATOR_SIGN;
+		key = ctx->initiator_sign;
+	} else {
+		usage = KG_USAGE_ACCEPTOR_SIGN;
+		key = ctx->acceptor_sign;
+	}
+
+	if (make_checksum_v2(ctx, hdr, GSS_KRB5_TOK_HDR_LEN,
+			     text, 0, key, usage, &cksum))
+		return GSS_S_FAILURE;
+
+	memcpy(hdr + GSS_KRB5_TOK_HDR_LEN, cksum.data, cksum.len);
+
+	now = get_seconds();
+	if (ctx->endtime < now)
+		return GSS_S_CONTEXT_EXPIRED;
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * gss_get_mic entry point for the Kerberos mechanism: pick the token
+ * format from the context's enctype.  DES, 3DES and RC4 contexts use the
+ * v1 builder, the two AES enctypes use the v2 builder; any other enctype
+ * is a BUG().
+ */
+u32
+gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
+		     struct xdr_netobj *token)
+{
+	struct krb5_ctx		*ctx = gss_ctx->internal_ctx_id;
+
+	switch (ctx->enctype) {
+	case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+	case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+		return gss_get_mic_v2(ctx, text, token);
+	case ENCTYPE_DES_CBC_RAW:
+	case ENCTYPE_DES3_CBC_RAW:
+	case ENCTYPE_ARCFOUR_HMAC:
+		break;
+	default:
+		BUG();
+	}
+
+	return gss_get_mic_v1(ctx, text, token);
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
new file mode 100644
index 00000000..62ac90c6
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -0,0 +1,166 @@
+/*
+ *  linux/net/sunrpc/gss_krb5_seqnum.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/crypto.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+/*
+ * RC4-HMAC variant of krb5_make_seq_num().
+ *
+ * Builds the 8-byte plaintext (sequence number most significant byte
+ * first, then four copies of @direction) and encrypts it into @buf with
+ * a cipher allocated for this call and keyed via
+ * krb5_rc4_setup_seq_key() from @cksum.
+ *
+ * Returns 0 on success, or the error from cipher allocation, key setup
+ * or encryption.
+ */
+static s32
+krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
+		      unsigned char *cksum, unsigned char *buf)
+{
+	struct crypto_blkcipher *tfm;
+	unsigned char block[8];
+	s32 err;
+	int i;
+
+	dprintk("RPC:       %s:\n", __func__);
+	tfm = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+				     CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	for (i = 0; i < 4; i++) {
+		block[i] = (unsigned char) ((seqnum >> (24 - 8 * i)) & 0xff);
+		block[4 + i] = direction;
+	}
+
+	err = krb5_rc4_setup_seq_key(kctx, tfm, cksum);
+	if (!err)
+		err = krb5_encrypt(tfm, cksum, block, buf, 8);
+
+	crypto_free_blkcipher(tfm);
+	return err;
+}
+/*
+ * Encrypt a v1 token sequence number into the 8 bytes at @buf.
+ *
+ * RC4-HMAC contexts are handed to krb5_make_rc4_seq_num().  For all
+ * other enctypes the plaintext is the sequence number least significant
+ * byte first followed by four copies of @direction, encrypted with @key
+ * and with @cksum passed to krb5_encrypt() as its second argument.
+ *
+ * Returns the result of the encryption helper that was used.
+ */
+s32
+krb5_make_seq_num(struct krb5_ctx *kctx,
+		struct crypto_blkcipher *key,
+		int direction,
+		u32 seqnum,
+		unsigned char *cksum, unsigned char *buf)
+{
+	unsigned char block[8];
+	int i;
+
+	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
+		return krb5_make_rc4_seq_num(kctx, direction, seqnum,
+					     cksum, buf);
+
+	for (i = 0; i < 4; i++) {
+		block[i] = (unsigned char) ((seqnum >> (8 * i)) & 0xff);
+		block[4 + i] = direction;
+	}
+
+	return krb5_encrypt(key, cksum, block, buf, 8);
+}
+
+/*
+ * RC4-HMAC variant of krb5_get_seq_num().
+ *
+ * Decrypts the 8 bytes at @buf with a cipher allocated for this call and
+ * keyed via krb5_rc4_setup_seq_key() from @cksum.  The last four
+ * plaintext bytes must all be equal; their value is stored in
+ * *@direction and the first four bytes, most significant first, in
+ * *@seqnum.
+ *
+ * Returns 0 on success, KG_BAD_SEQ if the direction bytes disagree, or
+ * the error from cipher allocation, key setup or decryption.  The
+ * outputs are written only on success.
+ */
+static s32
+krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
+		     unsigned char *buf, int *direction, s32 *seqnum)
+{
+	struct crypto_blkcipher *tfm;
+	unsigned char block[8];
+	s32 err;
+
+	dprintk("RPC:       %s:\n", __func__);
+	tfm = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+				     CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	err = krb5_rc4_setup_seq_key(kctx, tfm, cksum);
+	if (err)
+		goto free_tfm;
+
+	err = krb5_decrypt(tfm, cksum, buf, block, 8);
+	if (err)
+		goto free_tfm;
+
+	if (!(block[4] == block[5] && block[4] == block[6] &&
+	      block[4] == block[7])) {
+		err = (s32)KG_BAD_SEQ;
+		goto free_tfm;
+	}
+
+	*direction = block[4];
+	*seqnum = (block[0] << 24) | (block[1] << 16) |
+		  (block[2] << 8) | block[3];
+
+free_tfm:
+	crypto_free_blkcipher(tfm);
+	return err;
+}
+
+/*
+ * Decrypt the sequence-number field of a v1 token.
+ *
+ * RC4-HMAC contexts are handed to krb5_get_rc4_seq_num().  For all other
+ * enctypes the 8 bytes at @buf are decrypted with the context's seq
+ * cipher; the last four plaintext bytes must all be equal and give
+ * *@direction, the first four (least significant first) give *@seqnum.
+ *
+ * Returns 0 on success, KG_BAD_SEQ if the direction bytes disagree, or
+ * the decryption error.  The outputs are written only on success.
+ */
+s32
+krb5_get_seq_num(struct krb5_ctx *kctx,
+	       unsigned char *cksum,
+	       unsigned char *buf,
+	       int *direction, u32 *seqnum)
+{
+	unsigned char block[8];
+	s32 err;
+
+	dprintk("RPC:       krb5_get_seq_num:\n");
+
+	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC)
+		return krb5_get_rc4_seq_num(kctx, cksum, buf,
+					    direction, seqnum);
+
+	err = krb5_decrypt(kctx->seq, cksum, buf, block, 8);
+	if (err)
+		return err;
+
+	if (block[4] != block[5] || block[4] != block[6] ||
+	    block[4] != block[7])
+		return (s32)KG_BAD_SEQ;
+
+	*direction = block[4];
+	*seqnum = block[0] | (block[1] << 8) |
+		  (block[2] << 16) | (block[3] << 24);
+
+	return 0;
+}
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
new file mode 100644
index 00000000..6cd930f3
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -0,0 +1,226 @@
+/*
+ *  linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c
+ *
+ *  Copyright (c) 2000-2008 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/crypto.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+
+/*
+ * Verify an RFC 1964 (v1) MIC token.
+ *
+ * read_token is a mic token, and message_buffer is the data that the mic
+ * was supposedly taken over.
+ *
+ * Returns:
+ *   GSS_S_DEFECTIVE_TOKEN  - framing, token id, algorithm ids or filler
+ *                            do not match what this context expects
+ *   GSS_S_FAILURE          - the checksum could not be computed or the
+ *                            sequence number could not be decrypted
+ *   GSS_S_BAD_SIG          - checksum mismatch, or the token's direction
+ *                            byte is not the peer's
+ *   GSS_S_CONTEXT_EXPIRED  - the current time is past ctx->endtime
+ *   GSS_S_COMPLETE         - otherwise
+ */
+static u32
+gss_verify_mic_v1(struct krb5_ctx *ctx,
+		struct xdr_buf *message_buffer, struct xdr_netobj *read_token)
+{
+	int			signalg;
+	int			sealalg;
+	char			cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+	struct xdr_netobj	md5cksum = {.len = sizeof(cksumdata),
+					    .data = cksumdata};
+	s32			now;
+	int			direction;
+	u32			seqnum;
+	unsigned char		*ptr = (unsigned char *)read_token->data;
+	int			bodysize;
+	u8			*cksumkey;
+
+	dprintk("RPC:       krb5_read_token\n");
+
+	/* presumably leaves ptr at the krb5 token header on success */
+	if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr,
+					read_token->len))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* bytes 0-1: token id, compared most significant byte first */
+	if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) ||
+	    (ptr[1] !=  (KG_TOK_MIC_MSG & 0xff)))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* XXX sanity-check bodysize?? */
+	/* NOTE(review): bodysize is never read below, so nothing here
+	 * confirms the token is long enough for the header and checksum
+	 * bytes that are accessed - confirm the callers guarantee it. */
+
+	/* bytes 2-3: signing algorithm, little-endian */
+	signalg = ptr[2] + (ptr[3] << 8);
+	if (signalg != ctx->gk5e->signalg)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* bytes 4-5: sealing algorithm; a MIC token must carry "none" */
+	sealalg = ptr[4] + (ptr[5] << 8);
+	if (sealalg != SEAL_ALG_NONE)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* bytes 6-7: filler */
+	if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	if (ctx->gk5e->keyed_cksum)
+		cksumkey = ctx->cksum;
+	else
+		cksumkey = NULL;
+
+	/* recompute the checksum over header bytes 0-7 and the message */
+	if (make_checksum(ctx, ptr, 8, message_buffer, 0,
+			  cksumkey, KG_USAGE_SIGN, &md5cksum))
+		return GSS_S_FAILURE;
+
+	/* the received checksum follows the 16-byte header */
+	if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN,
+					ctx->gk5e->cksumlength))
+		return GSS_S_BAD_SIG;
+
+	/* it got through unscathed.  Make sure the context is unexpired */
+
+	now = get_seconds();
+
+	if (now > ctx->endtime)
+		return GSS_S_CONTEXT_EXPIRED;
+
+	/* do sequencing checks */
+
+	/* only the direction is checked; seqnum is decoded but not used */
+	if (krb5_get_seq_num(ctx, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8,
+			     &direction, &seqnum))
+		return GSS_S_FAILURE;
+
+	/* an initiator must see acceptor tokens (0xff) and vice versa (0) */
+	if ((ctx->initiate && direction != 0xff) ||
+	    (!ctx->initiate && direction != 0))
+		return GSS_S_BAD_SIG;
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Verify an RFC 4121 (v2) MIC token.
+ *
+ * @read_token is the received token, @message_buffer the data the MIC
+ * was supposedly taken over.  The token is the 16-byte header (token id,
+ * flags, five 0xff filler bytes, 64-bit big-endian sequence number)
+ * followed by the checksum.
+ *
+ * Returns:
+ *   GSS_S_DEFECTIVE_TOKEN  - token too short, wrong token id or filler
+ *   GSS_S_BAD_SIG          - sent-by-acceptor flag does not name the
+ *                            peer, or checksum mismatch
+ *   GSS_S_FAILURE          - sealed flag set, or checksum computation
+ *                            failed
+ *   GSS_S_CONTEXT_EXPIRED  - the current time is past ctx->endtime
+ *   GSS_S_COMPLETE         - otherwise
+ */
+static u32
+gss_verify_mic_v2(struct krb5_ctx *ctx,
+		struct xdr_buf *message_buffer, struct xdr_netobj *read_token)
+{
+	char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+	struct xdr_netobj cksumobj = {.len = sizeof(cksumdata),
+				      .data = cksumdata};
+	s32 now;
+	u64 seqnum;
+	u8 *ptr = read_token->data;
+	u8 *cksumkey;
+	u8 flags;
+	int i;
+	unsigned int cksum_usage;
+
+	dprintk("RPC:       %s\n", __func__);
+
+	/* Everything below reads the header and the checksum after it,
+	 * so refuse tokens that cannot hold both. */
+	if (read_token->len < GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* the token must have been produced by the other side */
+	flags = ptr[2];
+	if ((!ctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) ||
+	    (ctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)))
+		return GSS_S_BAD_SIG;
+
+	if (flags & KG2_TOKEN_FLAG_SEALED) {
+		dprintk("%s: token has unexpected sealed flag\n", __func__);
+		return GSS_S_FAILURE;
+	}
+
+	/* bytes 3-7 are filler */
+	for (i = 3; i < 8; i++)
+		if (ptr[i] != 0xff)
+			return GSS_S_DEFECTIVE_TOKEN;
+
+	/* verify with the key and usage the peer signed with */
+	if (ctx->initiate) {
+		cksumkey = ctx->acceptor_sign;
+		cksum_usage = KG_USAGE_ACCEPTOR_SIGN;
+	} else {
+		cksumkey = ctx->initiator_sign;
+		cksum_usage = KG_USAGE_INITIATOR_SIGN;
+	}
+
+	if (make_checksum_v2(ctx, ptr, GSS_KRB5_TOK_HDR_LEN, message_buffer, 0,
+			     cksumkey, cksum_usage, &cksumobj))
+		return GSS_S_FAILURE;
+
+	if (memcmp(cksumobj.data, ptr + GSS_KRB5_TOK_HDR_LEN,
+				ctx->gk5e->cksumlength))
+		return GSS_S_BAD_SIG;
+
+	/* it got through unscathed.  Make sure the context is unexpired */
+	now = get_seconds();
+	if (now > ctx->endtime)
+		return GSS_S_CONTEXT_EXPIRED;
+
+	/* do sequencing checks */
+
+	/* The sequence number sits at byte offset 8.  Add the offset
+	 * before casting: (__be64 *)ptr + 8 would step 8 * 8 bytes and
+	 * read beyond the token.  The value is not checked yet. */
+	seqnum = be64_to_cpup((__be64 *)(ptr + 8));
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * gss_verify_mic entry point for the Kerberos mechanism: pick the token
+ * format from the context's enctype.  DES, 3DES and RC4 contexts use the
+ * v1 verifier, the two AES enctypes use the v2 verifier; any other
+ * enctype is a BUG().
+ */
+u32
+gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
+			struct xdr_buf *message_buffer,
+			struct xdr_netobj *read_token)
+{
+	struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
+
+	switch (ctx->enctype) {
+	case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+	case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+		return gss_verify_mic_v2(ctx, message_buffer, read_token);
+	case ENCTYPE_DES_CBC_RAW:
+	case ENCTYPE_DES3_CBC_RAW:
+	case ENCTYPE_ARCFOUR_HMAC:
+		break;
+	default:
+		BUG();
+	}
+
+	return gss_verify_mic_v1(ctx, message_buffer, read_token);
+}
+
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
new file mode 100644
index 00000000..2763e3e4
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -0,0 +1,587 @@
+/*
+ * COPYRIGHT (c) 2008
+ * The Regents of the University of Michigan
+ * ALL RIGHTS RESERVED
+ *
+ * Permission is granted to use, copy, create derivative works
+ * and redistribute this software and such derivative works
+ * for any purpose, so long as the name of The University of
+ * Michigan is not used in any advertising or publicity
+ * pertaining to the use of distribution of this software
+ * without specific, written prior authorization.  If the
+ * above copyright notice or any other identification of the
+ * University of Michigan is included in any copy of any
+ * portion of this software, then the disclaimer below must
+ * also be included.
+ *
+ * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION
+ * FROM THE UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY
+ * PURPOSE, AND WITHOUT WARRANTY BY THE UNIVERSITY OF
+ * MICHIGAN OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
+ * REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE LIABLE
+ * FOR ANY DAMAGES, INCLUDING SPECIAL, INDIRECT, INCIDENTAL, OR
+ * CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN
+ * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGES.
+ */
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/sunrpc/gss_krb5.h>
+#include <linux/random.h>
+#include <linux/pagemap.h>
+#include <linux/crypto.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+/*
+ * Number of pad bytes needed to bring @length up to a multiple of
+ * @blocksize.  An already aligned length gets a full extra block, so the
+ * result is always in the range 1..@blocksize.
+ */
+static inline int
+gss_krb5_padding(int blocksize, int length)
+{
+	int used = length % blocksize;
+
+	return blocksize - used;
+}
+
+/*
+ * Append block padding to @buf so that the data from @offset onwards is
+ * a multiple of @blocksize.  Every pad byte holds the pad count.
+ *
+ * The padding is written after the current end of the tail if the buffer
+ * has page data or a non-empty tail, and after the end of the head
+ * otherwise; that iovec's length and buf->len grow by the pad count.
+ * The space being written into is not checked here.
+ */
+static inline void
+gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize)
+{
+	struct kvec *last;
+	char *fill;
+	int npad;
+
+	npad = gss_krb5_padding(blocksize, buf->len - offset);
+
+	last = (buf->page_len || buf->tail[0].iov_len) ? &buf->tail[0]
+							: &buf->head[0];
+
+	fill = last->iov_base + last->iov_len;
+	memset(fill, npad, npad);
+
+	last->iov_len += npad;
+	buf->len += npad;
+}
+
+/*
+ * Strip the block padding from the end of @buf.
+ *
+ * The pad count is the value of the last data byte, located in the
+ * head, the pages or the tail depending on buf->len.  On success
+ * buf->len is reduced by the pad count; the head length is reduced as
+ * well when the last byte lies in the head.
+ *
+ * Returns 0 on success and -EINVAL if the buffer is empty, or if the pad
+ * count exceeds @blocksize, the head length (head case) or is not
+ * smaller than buf->len.
+ */
+static inline int
+gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
+{
+	u8 *ptr;
+	u8 pad;
+	size_t len = buf->len;
+
+	/* An empty buffer has no last byte; reading at len - 1 would
+	 * fetch the byte in front of the head buffer. */
+	if (len == 0)
+		return -EINVAL;
+
+	if (len <= buf->head[0].iov_len) {
+		pad = *(u8 *)(buf->head[0].iov_base + len - 1);
+		if (pad > buf->head[0].iov_len)
+			return -EINVAL;
+		buf->head[0].iov_len -= pad;
+		goto out;
+	} else
+		len -= buf->head[0].iov_len;
+	if (len <= buf->page_len) {
+		/* locate the page and in-page offset of the last byte */
+		unsigned int last = (buf->page_base + len - 1)
+					>>PAGE_CACHE_SHIFT;
+		unsigned int offset = (buf->page_base + len - 1)
+					& (PAGE_CACHE_SIZE - 1);
+		ptr = kmap_atomic(buf->pages[last], KM_USER0);
+		pad = *(ptr + offset);
+		kunmap_atomic(ptr, KM_USER0);
+		goto out;
+	} else
+		len -= buf->page_len;
+	BUG_ON(len > buf->tail[0].iov_len);
+	pad = *(u8 *)(buf->tail[0].iov_base + len - 1);
+out:
+	/* XXX: NOTE: we do not adjust the page lengths--they represent
+	 * a range of data in the real filesystem page cache, and we need
+	 * to know that range so the xdr code can properly place read data.
+	 * However adjusting the head length, as we do above, is harmless.
+	 * In the case of a request that fits into a single page, the server
+	 * also uses length and head length together to determine the original
+	 * start of the request to copy the request for deferral; so it's
+	 * easier on the server if we adjust head and tail length in tandem.
+	 * It's not really a problem that we don't fool with the page and
+	 * tail lengths, though--at worst badly formed xdr might lead the
+	 * server to attempt to parse the padding.
+	 * XXX: Document all these weird requirements for gss mechanism
+	 * wrap/unwrap functions. */
+	if (pad > blocksize)
+		return -EINVAL;
+	if (buf->len > pad)
+		buf->len -= pad;
+	else
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Write a @conflen byte confounder at @p.  @conflen must be 8 or 16;
+ * anything else is a BUG().
+ *
+ * The confounder is taken from a static 64-bit counter that is seeded
+ * from random32() while it is zero and incremented once per 8 bytes
+ * produced.  The counter is updated without any locking.
+ */
+void
+gss_krb5_make_confounder(char *p, u32 conflen)
+{
+	static u64 counter = 0;
+	u64 *out = (u64 *)p;
+
+	/* rfc1964 claims this should be "random".  But all that's really
+	 * necessary is that it be unique.  And not even that is necessary
+	 * in our case: this "gssapi" implementation exists only to support
+	 * rpcsec_gss, so every buffer we encrypt already begins with a
+	 * unique sequence number.  This is a half-hearted attempt at
+	 * uniqueness; guaranteeing it would mean worrying about atomicity
+	 * and rollover, which is deliberately not done. */
+
+	/* seed with a random value */
+	if (counter == 0) {
+		counter = random32();
+		counter = (counter << 32) | random32();
+	}
+
+	switch (conflen) {
+	case 8:
+		out[0] = counter++;
+		break;
+	case 16:
+		out[0] = counter++;
+		out[1] = counter++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* Assumptions: the head and tail of inbuf are ours to play with.
+ * The pages, however, may be real pages in the page cache and we replace
+ * them with scratch pages from **pages before writing to them. */
+/* XXX: obviously the above should be documentation of wrap interface,
+ * and shouldn't be in this kerberos-specific file. */
+
+/* XXX factor out common code with seal/unseal. */
+
+/*
+ * Wrap (checksum and encrypt) the data in @buf from @offset onwards as
+ * an RFC 1964 (v1) token.
+ *
+ * The data is padded to the cipher block size, the head is extended to
+ * make room for the GSS and krb5 headers, checksum and confounder, the
+ * checksum is taken over the krb5 header, confounder and padded data,
+ * the sequence number is encrypted into the header, and finally the
+ * confounder and data are encrypted, with @pages passed to
+ * gss_encrypt_xdr_buf().
+ *
+ * Returns GSS_S_FAILURE if any crypto step fails, GSS_S_CONTEXT_EXPIRED
+ * if the context end time is before the time sampled on entry, and
+ * GSS_S_COMPLETE otherwise.  buf->pages is restored on every return
+ * path.
+ */
+static u32
+gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
+		struct xdr_buf *buf, struct page **pages)
+{
+	char			cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+	struct xdr_netobj	md5cksum = {.len = sizeof(cksumdata),
+					    .data = cksumdata};
+	int			blocksize = 0, plainlen;
+	unsigned char		*ptr, *msg_start;
+	s32			now;
+	int			headlen;
+	struct page		**tmp_pages;
+	u32			seq_send;
+	u8			*cksumkey;
+	u32			conflen = kctx->gk5e->conflen;
+	u32			cksum_err;
+	int			err;
+
+	dprintk("RPC:       %s\n", __func__);
+
+	now = get_seconds();
+
+	blocksize = crypto_blkcipher_blocksize(kctx->enc);
+	gss_krb5_add_padding(buf, offset, blocksize);
+	BUG_ON((buf->len - offset) % blocksize);
+	plainlen = conflen + buf->len - offset;
+
+	/* bytes to insert in front of the data: everything in the token
+	 * except the (padded) data itself */
+	headlen = g_token_size(&kctx->mech_used,
+		GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength + plainlen) -
+		(buf->len - offset);
+
+	ptr = buf->head[0].iov_base + offset;
+	/* shift data to make room for header. */
+	xdr_extend_head(buf, offset, headlen);
+
+	/* XXX Would be cleverer to encrypt while copying. */
+	BUG_ON((buf->len - offset - headlen) % blocksize);
+
+	g_make_token_header(&kctx->mech_used,
+				GSS_KRB5_TOK_HDR_LEN +
+				kctx->gk5e->cksumlength + plainlen, &ptr);
+
+
+	/* ptr now at header described in rfc 1964, section 1.2.1: */
+	ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
+	ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
+
+	msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
+
+	/* signalg at 2-3, sealalg at 4-5 (both little-endian), filler
+	 * 0xff at 6-7 */
+	*(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+	memset(ptr + 4, 0xff, 4);
+	*(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+
+	gss_krb5_make_confounder(msg_start, conflen);
+
+	if (kctx->gk5e->keyed_cksum)
+		cksumkey = kctx->cksum;
+	else
+		cksumkey = NULL;
+
+	/* XXXJBF: UGH!: */
+	/* Checksum with the caller's pages swapped in, and put the
+	 * original page array back before looking at the result so a
+	 * failure cannot leave buf pointing at the wrong pages. */
+	tmp_pages = buf->pages;
+	buf->pages = pages;
+	cksum_err = make_checksum(kctx, ptr, 8, buf,
+				  offset + headlen - conflen,
+				  cksumkey, KG_USAGE_SEAL, &md5cksum);
+	buf->pages = tmp_pages;
+	if (cksum_err)
+		return GSS_S_FAILURE;
+
+	memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len);
+
+	spin_lock(&krb5_seq_lock);
+	seq_send = kctx->seq_send++;
+	spin_unlock(&krb5_seq_lock);
+
+	/* XXX would probably be more efficient to compute checksum
+	 * and encrypt at the same time: */
+	if ((krb5_make_seq_num(kctx, kctx->seq, kctx->initiate ? 0 : 0xff,
+			       seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
+		return GSS_S_FAILURE;
+
+	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
+		struct crypto_blkcipher *cipher;
+
+		/* RC4 uses a cipher keyed per message from the sequence
+		 * number */
+		cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+						CRYPTO_ALG_ASYNC);
+		if (IS_ERR(cipher))
+			return GSS_S_FAILURE;
+
+		/* never encrypt with a cipher whose key setup failed */
+		err = krb5_rc4_setup_enc_key(kctx, cipher, seq_send);
+		if (!err)
+			err = gss_encrypt_xdr_buf(cipher, buf,
+						  offset + headlen - conflen,
+						  pages);
+		crypto_free_blkcipher(cipher);
+		if (err)
+			return GSS_S_FAILURE;
+	} else {
+		if (gss_encrypt_xdr_buf(kctx->enc, buf,
+					offset + headlen - conflen, pages))
+			return GSS_S_FAILURE;
+	}
+
+	return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
+}
+
+/*
+ * Unwrap an RFC 1964 (v1) wrap token found at @offset in the head of
+ * @buf, leaving the plaintext at @offset with header, checksum,
+ * confounder and padding removed.
+ *
+ * Returns:
+ *   GSS_S_DEFECTIVE_TOKEN  - framing, token id, algorithm ids or filler
+ *                            are wrong, decryption failed, or the
+ *                            padding is invalid
+ *   GSS_S_BAD_SIG          - the sequence number could not be decrypted,
+ *                            the direction byte is not the peer's, or
+ *                            the checksum does not match
+ *   GSS_S_FAILURE          - cipher allocation, RC4 key setup or
+ *                            checksum computation failed
+ *   GSS_S_CONTEXT_EXPIRED  - the current time is past kctx->endtime
+ *   GSS_S_COMPLETE         - otherwise
+ */
+static u32
+gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
+{
+	int			signalg;
+	int			sealalg;
+	char			cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
+	struct xdr_netobj	md5cksum = {.len = sizeof(cksumdata),
+					    .data = cksumdata};
+	s32			now;
+	int			direction;
+	u32			seqnum;	/* matches krb5_get_seq_num()'s u32 * */
+	unsigned char		*ptr;
+	int			bodysize;
+	void			*data_start, *orig_start;
+	int			data_len;
+	int			blocksize;
+	u32			conflen = kctx->gk5e->conflen;
+	int			crypt_offset;
+	u8			*cksumkey;
+
+	dprintk("RPC:       gss_unwrap_kerberos\n");
+
+	ptr = (u8 *)buf->head[0].iov_base + offset;
+	if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
+					buf->len - offset))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
+	    (ptr[1] !=  (KG_TOK_WRAP_MSG & 0xff)))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* XXX sanity-check bodysize?? */
+
+	/* get the sign and seal algorithms */
+
+	signalg = ptr[2] + (ptr[3] << 8);
+	if (signalg != kctx->gk5e->signalg)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	sealalg = ptr[4] + (ptr[5] << 8);
+	if (sealalg != kctx->gk5e->sealalg)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/*
+	 * Data starts after token header and checksum.  ptr points
+	 * to the beginning of the token header
+	 */
+	crypt_offset = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) -
+					(unsigned char *)buf->head[0].iov_base;
+
+	/*
+	 * Need plaintext seqnum to derive encryption key for arcfour-hmac
+	 */
+	if (krb5_get_seq_num(kctx, ptr + GSS_KRB5_TOK_HDR_LEN,
+			     ptr + 8, &direction, &seqnum))
+		return GSS_S_BAD_SIG;
+
+	/* an initiator must see acceptor tokens (0xff) and vice versa (0) */
+	if ((kctx->initiate && direction != 0xff) ||
+	    (!kctx->initiate && direction != 0))
+		return GSS_S_BAD_SIG;
+
+	if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
+		struct crypto_blkcipher *cipher;
+		int err;
+
+		cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
+						CRYPTO_ALG_ASYNC);
+		if (IS_ERR(cipher))
+			return GSS_S_FAILURE;
+
+		/* never decrypt with a cipher whose key setup failed */
+		err = krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
+		if (err) {
+			crypto_free_blkcipher(cipher);
+			return GSS_S_FAILURE;
+		}
+
+		err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
+		crypto_free_blkcipher(cipher);
+		if (err)
+			return GSS_S_DEFECTIVE_TOKEN;
+	} else {
+		if (gss_decrypt_xdr_buf(kctx->enc, buf, crypt_offset))
+			return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if (kctx->gk5e->keyed_cksum)
+		cksumkey = kctx->cksum;
+	else
+		cksumkey = NULL;
+
+	/* checksum covers header bytes 0-7 plus the decrypted
+	 * confounder and data */
+	if (make_checksum(kctx, ptr, 8, buf, crypt_offset,
+					cksumkey, KG_USAGE_SEAL, &md5cksum))
+		return GSS_S_FAILURE;
+
+	if (memcmp(md5cksum.data, ptr + GSS_KRB5_TOK_HDR_LEN,
+						kctx->gk5e->cksumlength))
+		return GSS_S_BAD_SIG;
+
+	/* it got through unscathed.  Make sure the context is unexpired */
+
+	now = get_seconds();
+
+	if (now > kctx->endtime)
+		return GSS_S_CONTEXT_EXPIRED;
+
+	/* do sequencing checks */
+
+	/* Copy the data back to the right position.  XXX: Would probably be
+	 * better to copy and encrypt at the same time. */
+
+	blocksize = crypto_blkcipher_blocksize(kctx->enc);
+	data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) +
+					conflen;
+	orig_start = buf->head[0].iov_base + offset;
+	data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
+	memmove(orig_start, data_start, data_len);
+	buf->head[0].iov_len -= (data_start - orig_start);
+	buf->len -= (data_start - orig_start);
+
+	if (gss_krb5_remove_padding(buf, blocksize))
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * We cannot currently handle tokens with rotated data.  We need a
+ * generalized routine to rotate the data in place.  It is anticipated
+ * that we won't encounter rotated data in the general case.
+ *
+ * The effective rotation is @rrc modulo the number of bytes that follow
+ * the token header at @offset in @buf.  Returns 0 when that is zero
+ * (nothing to do) and 1 when the token would need a real rotation.
+ * Nothing in @buf is modified and @kctx is unused.
+ */
+static u32
+rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc)
+{
+	unsigned int datalen = buf->len - offset - GSS_KRB5_TOK_HDR_LEN;
+	unsigned int realrrc;
+
+	/* A buffer holding only the token header leaves nothing to take
+	 * the remainder against; avoid dividing by zero and fall back to
+	 * the raw count, so a non-zero rrc is still rejected. */
+	if (datalen == 0)
+		realrrc = rrc;
+	else
+		realrrc = rrc % datalen;
+
+	if (realrrc == 0)
+		return 0;
+
+	dprintk("%s: cannot process token with rotated data: "
+		"rrc %u, realrrc %u\n", __func__, rrc, realrrc);
+	return 1;
+}
+
+/*
+ * Wrap the data in @buf from @offset onwards as an RFC 4121 (v2) token.
+ *
+ * The head is extended by the 16-byte token header, which is filled in
+ * as: token id, flags, 0xff filler, EC (always 0 here), RRC (always 0)
+ * and the 64-bit big-endian send sequence number.  Encryption is then
+ * delegated to the enctype's encrypt_v2 hook.
+ *
+ * Returns GSS_S_FAILURE if the enctype has no encrypt_v2 hook or the
+ * head cannot be extended, the hook's error if it fails,
+ * GSS_S_CONTEXT_EXPIRED if the context end time is before the current
+ * time, and GSS_S_COMPLETE otherwise.
+ */
+static u32
+gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
+		     struct xdr_buf *buf, struct page **pages)
+{
+	u8		*ptr;
+	s32		now;
+	u8		flags = 0x00;
+	__be16		*be16ptr;
+	u16		ec = 0;		/* host order; converted on store */
+	__be64		*be64ptr;
+	u32		err;
+
+	dprintk("RPC:       %s\n", __func__);
+
+	if (kctx->gk5e->encrypt_v2 == NULL)
+		return GSS_S_FAILURE;
+
+	/* make room for gss token header */
+	if (xdr_extend_head(buf, offset, GSS_KRB5_TOK_HDR_LEN))
+		return GSS_S_FAILURE;
+
+	/* construct gss token header */
+	ptr = buf->head[0].iov_base + offset;
+	*ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff);
+	*ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff);
+
+	if ((kctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
+		flags |= KG2_TOKEN_FLAG_SENTBYACCEPTOR;
+	if ((kctx->flags & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) != 0)
+		flags |= KG2_TOKEN_FLAG_ACCEPTORSUBKEY;
+	/* We always do confidentiality in wrap tokens */
+	flags |= KG2_TOKEN_FLAG_SEALED;
+
+	*ptr++ = flags;
+	*ptr++ = 0xff;
+	be16ptr = (__be16 *)ptr;
+
+	*be16ptr++ = cpu_to_be16(ec);
+	/* "inner" token header always uses 0 for RRC */
+	*be16ptr++ = cpu_to_be16(0);
+
+	be64ptr = (__be64 *)be16ptr;
+	spin_lock(&krb5_seq_lock);
+	*be64ptr = cpu_to_be64(kctx->seq_send64++);
+	spin_unlock(&krb5_seq_lock);
+
+	err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages);
+	if (err)
+		return err;
+
+	now = get_seconds();
+	return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
+}
+
+/*
+ * Unwrap a Kerberos "v2" wrap token found at @offset in @buf.
+ *
+ * The cleartext token header is checked (token id, direction flag,
+ * sealed flag, filler byte), the body is decrypted with the mechanism's
+ * decrypt_v2 method, and the copy of the header found in the decrypted
+ * data is compared with the cleartext one.  On success the token header
+ * and the leading "headskip" bytes are removed from the head of @buf.
+ *
+ * Returns GSS_S_COMPLETE on success, otherwise GSS_S_DEFECTIVE_TOKEN,
+ * GSS_S_BAD_SIG, GSS_S_FAILURE or GSS_S_CONTEXT_EXPIRED.
+ */
+static u32
+gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
+{
+	s32		now;
+	u64		seqnum;
+	u8		*ptr;
+	u8		flags = 0x00;
+	u16		ec, rrc;
+	int		err;
+	u32		headskip, tailskip;
+	u8		decrypted_hdr[GSS_KRB5_TOK_HDR_LEN];
+	unsigned int	movelen;
+
+
+	dprintk("RPC:       %s\n", __func__);
+
+	if (kctx->gk5e->decrypt_v2 == NULL)
+		return GSS_S_FAILURE;
+
+	ptr = buf->head[0].iov_base + offset;
+
+	if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_WRAP)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* the sent-by-acceptor flag must be the opposite of our own role */
+	flags = ptr[2];
+	if ((!kctx->initiate && (flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)) ||
+	    (kctx->initiate && !(flags & KG2_TOKEN_FLAG_SENTBYACCEPTOR)))
+		return GSS_S_BAD_SIG;
+
+	if ((flags & KG2_TOKEN_FLAG_SEALED) == 0) {
+		dprintk("%s: token missing expected sealed flag\n", __func__);
+		return GSS_S_DEFECTIVE_TOKEN;
+	}
+
+	if (ptr[3] != 0xff)
+		return GSS_S_DEFECTIVE_TOKEN;
+
+	/* ec and seqnum are parsed here but not used later in this function */
+	ec = be16_to_cpup((__be16 *)(ptr + 4));
+	rrc = be16_to_cpup((__be16 *)(ptr + 6));
+
+	seqnum = be64_to_cpup((__be64 *)(ptr + 8));
+
+	if (rrc != 0) {
+		err = rotate_left(kctx, offset, buf, rrc);
+		if (err)
+			return GSS_S_FAILURE;
+	}
+
+	err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf,
+					&headskip, &tailskip);
+	if (err)
+		return GSS_S_FAILURE;
+
+	/*
+	 * Retrieve the decrypted gss token header and verify
+	 * it against the original
+	 */
+	err = read_bytes_from_xdr_buf(buf,
+				buf->len - GSS_KRB5_TOK_HDR_LEN - tailskip,
+				decrypted_hdr, GSS_KRB5_TOK_HDR_LEN);
+	if (err) {
+		dprintk("%s: error %u getting decrypted_hdr\n", __func__, err);
+		return GSS_S_FAILURE;
+	}
+	/* bytes 6-7 (the rrc field read above) are left out of the compare */
+	if (memcmp(ptr, decrypted_hdr, 6)
+				|| memcmp(ptr + 8, decrypted_hdr + 8, 8)) {
+		dprintk("%s: token hdr, plaintext hdr mismatch!\n", __func__);
+		return GSS_S_FAILURE;
+	}
+
+	/* do sequencing checks */
+
+	/* it got through unscathed.  Make sure the context is unexpired */
+	now = get_seconds();
+	if (now > kctx->endtime)
+		return GSS_S_CONTEXT_EXPIRED;
+
+	/*
+	 * Move the head data back to the right position in xdr_buf.
+	 * We ignore any "ec" data since it might be in the head or
+	 * the tail, and we really don't need to deal with it.
+	 * Note that buf->head[0].iov_len may indicate the available
+	 * head buffer space rather than that actually occupied.
+	 */
+	movelen = min_t(unsigned int, buf->head[0].iov_len, buf->len);
+	movelen -= offset + GSS_KRB5_TOK_HDR_LEN + headskip;
+	BUG_ON(offset + GSS_KRB5_TOK_HDR_LEN + headskip + movelen >
+							buf->head[0].iov_len);
+	memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen);
+	buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip;
+	buf->len -= GSS_KRB5_TOK_HDR_LEN + headskip;
+
+	return GSS_S_COMPLETE;
+}
+
+/*
+ * Wrap @buf at @offset using the token format that matches the context's
+ * encryption type: the DES, DES3 and ARCFOUR enctypes use the v1 routine,
+ * the AES enctypes use the v2 routine.  Any other enctype hits BUG().
+ *
+ * NOTE(review): "default" is listed first, so if BUG() were ever to
+ * return, control would fall through into the v1 cases -- confirm this
+ * is acceptable.
+ */
+u32
+gss_wrap_kerberos(struct gss_ctx *gctx, int offset,
+		  struct xdr_buf *buf, struct page **pages)
+{
+	struct krb5_ctx	*kctx = gctx->internal_ctx_id;
+
+	switch (kctx->enctype) {
+	default:
+		BUG();
+	case ENCTYPE_DES_CBC_RAW:
+	case ENCTYPE_DES3_CBC_RAW:
+	case ENCTYPE_ARCFOUR_HMAC:
+		return gss_wrap_kerberos_v1(kctx, offset, buf, pages);
+	case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+	case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+		return gss_wrap_kerberos_v2(kctx, offset, buf, pages);
+	}
+}
+
+/*
+ * Unwrap the token at @offset in @buf using the routine that matches the
+ * context's encryption type: the DES, DES3 and ARCFOUR enctypes use the
+ * v1 routine, the AES enctypes use the v2 routine.  Any other enctype
+ * hits BUG().
+ *
+ * NOTE(review): "default" is listed first, so if BUG() were ever to
+ * return, control would fall through into the v1 cases -- confirm this
+ * is acceptable.
+ */
+u32
+gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)
+{
+	struct krb5_ctx	*kctx = gctx->internal_ctx_id;
+
+	switch (kctx->enctype) {
+	default:
+		BUG();
+	case ENCTYPE_DES_CBC_RAW:
+	case ENCTYPE_DES3_CBC_RAW:
+	case ENCTYPE_ARCFOUR_HMAC:
+		return gss_unwrap_kerberos_v1(kctx, offset, buf);
+	case ENCTYPE_AES128_CTS_HMAC_SHA1_96:
+	case ENCTYPE_AES256_CTS_HMAC_SHA1_96:
+		return gss_unwrap_kerberos_v2(kctx, offset, buf);
+	}
+}
+
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
new file mode 100644
index 00000000..e3c36a27
--- /dev/null
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -0,0 +1,381 @@
+/*
+ *  linux/net/sunrpc/gss_mech_switch.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  J. Bruce Fields   <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/gss_asn1.h>
+#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/gss_err.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/sunrpc/clnt.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY        RPCDBG_AUTH
+#endif
+
+static LIST_HEAD(registered_mechs);
+static DEFINE_SPINLOCK(registered_mechs_lock);
+
+/*
+ * Release the auth domain name held by each of @gm's pseudoflavor
+ * descriptors and reset the pointers to NULL.
+ */
+static void
+gss_mech_free(struct gss_api_mech *gm)
+{
+	int idx;
+
+	for (idx = 0; idx < gm->gm_pf_num; idx++) {
+		kfree(gm->gm_pfs[idx].auth_domain_name);
+		gm->gm_pfs[idx].auth_domain_name = NULL;
+	}
+}
+
+/*
+ * Build a newly allocated string consisting of "gss/" followed by @name.
+ * Returns NULL if the allocation fails; the caller owns the result.
+ */
+static inline char *
+make_auth_domain_name(char *name)
+{
+	static char	*prefix = "gss/";
+	size_t		prefix_len = strlen(prefix);
+	size_t		name_len = strlen(name);
+	char		*new;
+
+	new = kmalloc(name_len + prefix_len + 1, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	memcpy(new, prefix, prefix_len);
+	/* the "+ 1" copies name's terminating NUL as well */
+	memcpy(new + prefix_len, name, name_len + 1);
+	return new;
+}
+
+/*
+ * For every pseudoflavor of @gm, create its "gss/<name>" auth domain name
+ * and register the pseudoflavor with the server-side gss code.
+ *
+ * Returns 0 on success.  On the first failure (-ENOMEM for the name, or
+ * the registration error) all names allocated so far are freed through
+ * gss_mech_free() and the error is returned.
+ */
+static int
+gss_mech_svc_setup(struct gss_api_mech *gm)
+{
+	int status = 0;
+	int i;
+
+	for (i = 0; !status && i < gm->gm_pf_num; i++) {
+		struct pf_desc *pf = &gm->gm_pfs[i];
+
+		pf->auth_domain_name = make_auth_domain_name(pf->name);
+		if (pf->auth_domain_name)
+			status = svcauth_gss_register_pseudoflavor(
+							pf->pseudoflavor,
+							pf->auth_domain_name);
+		else
+			status = -ENOMEM;
+	}
+
+	if (status)
+		gss_mech_free(gm);
+	return status;
+}
+
+/*
+ * Register mechanism @gm: set up its server-side pseudoflavors and, if
+ * that succeeds, add it to the list of registered mechanisms.
+ * Returns 0 on success or the error from gss_mech_svc_setup().
+ */
+int
+gss_mech_register(struct gss_api_mech *gm)
+{
+	int status = gss_mech_svc_setup(gm);
+
+	if (!status) {
+		spin_lock(&registered_mechs_lock);
+		list_add(&gm->gm_list, &registered_mechs);
+		spin_unlock(&registered_mechs_lock);
+		dprintk("RPC:       registered gss mechanism %s\n",
+			gm->gm_name);
+	}
+	return status;
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_register);
+
+/*
+ * Remove @gm from the list of registered mechanisms (under
+ * registered_mechs_lock) and free its auth domain names.
+ */
+void
+gss_mech_unregister(struct gss_api_mech *gm)
+{
+	spin_lock(&registered_mechs_lock);
+	list_del(&gm->gm_list);
+	spin_unlock(&registered_mechs_lock);
+	dprintk("RPC:       unregistered gss mechanism %s\n", gm->gm_name);
+	gss_mech_free(gm);
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_unregister);
+
+/*
+ * Take an additional reference on the module that owns @gm and return
+ * @gm.  Uses __module_get(), so no failure is reported to the caller.
+ */
+struct gss_api_mech *
+gss_mech_get(struct gss_api_mech *gm)
+{
+	__module_get(gm->gm_owner);
+	return gm;
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_get);
+
+/*
+ * Look up a registered mechanism by its name and take a reference on the
+ * owning module.  The search stops at the first name match; NULL is
+ * returned if there is no match or the module reference cannot be taken.
+ */
+struct gss_api_mech *
+gss_mech_get_by_name(const char *name)
+{
+	struct gss_api_mech	*cur, *found = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(cur, &registered_mechs, gm_list) {
+		if (strcmp(name, cur->gm_name) != 0)
+			continue;
+		if (try_module_get(cur->gm_owner))
+			found = cur;
+		break;
+	}
+	spin_unlock(&registered_mechs_lock);
+
+	return found;
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_get_by_name);
+
+/*
+ * Look up a registered mechanism whose OID equals @obj (same length and
+ * same bytes) and take a reference on the owning module.  The search
+ * stops at the first OID match; NULL is returned if there is no match or
+ * the module reference cannot be taken.
+ */
+struct gss_api_mech *
+gss_mech_get_by_OID(struct xdr_netobj *obj)
+{
+	struct gss_api_mech	*cur, *found = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(cur, &registered_mechs, gm_list) {
+		if (obj->len != cur->gm_oid.len)
+			continue;
+		if (memcmp(obj->data, cur->gm_oid.data, obj->len) != 0)
+			continue;
+		if (try_module_get(cur->gm_owner))
+			found = cur;
+		break;
+	}
+	spin_unlock(&registered_mechs_lock);
+
+	return found;
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_get_by_OID);
+
+/* Return 1 if @gm lists @pseudoflavor among its pseudoflavors, else 0. */
+static inline int
+mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+	int idx = 0;
+
+	while (idx < gm->gm_pf_num) {
+		if (gm->gm_pfs[idx].pseudoflavor == pseudoflavor)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Find the first registered mechanism that supports @pseudoflavor and
+ * take a reference on its owning module.
+ *
+ * Returns the mechanism, or NULL if no registered mechanism supports the
+ * pseudoflavor or if the module reference could not be taken.  A non-NULL
+ * result is released with gss_mech_put().
+ *
+ * Mechanisms that do not match are simply skipped: no reference was taken
+ * on them here, so none may be dropped.
+ */
+struct gss_api_mech *
+gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
+{
+	struct gss_api_mech *pos, *gm = NULL;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(pos, &registered_mechs, gm_list) {
+		if (!mech_supports_pseudoflavor(pos, pseudoflavor))
+			continue;
+		/* only the first supporting mechanism is considered */
+		if (try_module_get(pos->gm_owner))
+			gm = pos;
+		break;
+	}
+	spin_unlock(&registered_mechs_lock);
+	return gm;
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor);
+
+/*
+ * Store into @array_ptr the pseudoflavor of the first gm_pfs entry of
+ * each registered mechanism and return the number of entries written.
+ * No bound on the size of @array_ptr is checked here; the caller must
+ * supply an array large enough for all registered mechanisms.
+ */
+int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr)
+{
+	struct gss_api_mech *mech;
+	int count = 0;
+
+	spin_lock(&registered_mechs_lock);
+	list_for_each_entry(mech, &registered_mechs, gm_list)
+		array_ptr[count++] = mech->gm_pfs->pseudoflavor;
+	spin_unlock(&registered_mechs_lock);
+
+	return count;
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors);
+
+/*
+ * Map @service to the pseudoflavor of the first of @gm's descriptors that
+ * provides it.  Returns RPC_AUTH_MAXFLAVOR when no descriptor matches.
+ */
+u32
+gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service)
+{
+	int idx = 0;
+
+	while (idx < gm->gm_pf_num) {
+		struct pf_desc *pf = &gm->gm_pfs[idx++];
+
+		if (pf->service == service)
+			return pf->pseudoflavor;
+	}
+	return RPC_AUTH_MAXFLAVOR; /* illegal value */
+}
+EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor);
+
+/*
+ * Map @pseudoflavor to the service of the first of @gm's descriptors
+ * that carries it.  Returns 0 when no descriptor matches.
+ */
+u32
+gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+	int idx = 0;
+
+	while (idx < gm->gm_pf_num) {
+		struct pf_desc *pf = &gm->gm_pfs[idx++];
+
+		if (pf->pseudoflavor == pseudoflavor)
+			return pf->service;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gss_pseudoflavor_to_service);
+
+/*
+ * Return the auth domain name stored in the first of @gm's descriptors
+ * that provides @service, or NULL when no descriptor matches.  The string
+ * is the descriptor's own pointer, not a copy.
+ */
+char *
+gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
+{
+	int idx = 0;
+
+	while (idx < gm->gm_pf_num) {
+		struct pf_desc *pf = &gm->gm_pfs[idx++];
+
+		if (pf->service == service)
+			return pf->auth_domain_name;
+	}
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(gss_service_to_auth_domain_name);
+
+/* Drop a reference on the module owning @gm; a NULL @gm is ignored. */
+void
+gss_mech_put(struct gss_api_mech * gm)
+{
+	if (!gm)
+		return;
+	module_put(gm->gm_owner);
+}
+
+EXPORT_SYMBOL_GPL(gss_mech_put);
+
+/*
+ * Allocate a zeroed gss_ctx, store it in *@ctx_id, bind it to @mech
+ * (taking a module reference through gss_mech_get()) and let the
+ * mechanism import its state from @input_token / @bufsize.
+ *
+ * The mech could probably be determined from the token instead, but it's
+ * just as easy for now to pass it in.
+ *
+ * Returns -ENOMEM if the context cannot be allocated, otherwise whatever
+ * the mechanism's import method returns.  The context is not freed here
+ * when the mechanism's import fails; it stays in *@ctx_id.
+ */
+int
+gss_import_sec_context(const void *input_token, size_t bufsize,
+		       struct gss_api_mech	*mech,
+		       struct gss_ctx		**ctx_id,
+		       gfp_t gfp_mask)
+{
+	struct gss_ctx *ctx = kzalloc(sizeof(*ctx), gfp_mask);
+
+	*ctx_id = ctx;
+	if (ctx == NULL)
+		return -ENOMEM;
+
+	ctx->mech_type = gss_mech_get(mech);
+	return mech->gm_ops->gss_import_sec_context(input_token, bufsize,
+						     ctx, gfp_mask);
+}
+
+/*
+ * gss_get_mic: compute a mic over message and return mic_token.
+ *
+ * Dispatches to the gss_get_mic method of the mechanism bound to
+ * @context_handle and returns that method's result unchanged.
+ */
+
+u32
+gss_get_mic(struct gss_ctx	*context_handle,
+	    struct xdr_buf	*message,
+	    struct xdr_netobj	*mic_token)
+{
+	 return context_handle->mech_type->gm_ops
+		->gss_get_mic(context_handle,
+			      message,
+			      mic_token);
+}
+
+/*
+ * gss_verify_mic: check whether the provided mic_token verifies message.
+ *
+ * Dispatches to the gss_verify_mic method of the mechanism bound to
+ * @context_handle and returns that method's result unchanged.
+ */
+
+u32
+gss_verify_mic(struct gss_ctx		*context_handle,
+	       struct xdr_buf		*message,
+	       struct xdr_netobj	*mic_token)
+{
+	return context_handle->mech_type->gm_ops
+		->gss_verify_mic(context_handle,
+				 message,
+				 mic_token);
+}
+
+/*
+ * This function is called from both the client and server code.
+ * Each makes guarantees about how much "slack" space is available
+ * for the underlying function in "buf"'s head and tail while
+ * performing the wrap.
+ *
+ * The client and server code allocate RPC_MAX_AUTH_SIZE extra
+ * space in both the head and tail which is available for use by
+ * the wrap function.
+ *
+ * Underlying functions should verify they do not use more than
+ * RPC_MAX_AUTH_SIZE of extra space in either the head or tail
+ * when performing the wrap.
+ *
+ * The call is dispatched to the gss_wrap method of the mechanism bound
+ * to @ctx_id; that method's result is returned unchanged.
+ */
+u32
+gss_wrap(struct gss_ctx	*ctx_id,
+	 int		offset,
+	 struct xdr_buf	*buf,
+	 struct page	**inpages)
+{
+	return ctx_id->mech_type->gm_ops
+		->gss_wrap(ctx_id, offset, buf, inpages);
+}
+
+/*
+ * Dispatch to the gss_unwrap method of the mechanism bound to @ctx_id,
+ * passing @offset and @buf through, and return that method's result.
+ */
+u32
+gss_unwrap(struct gss_ctx	*ctx_id,
+	   int			offset,
+	   struct xdr_buf	*buf)
+{
+	return ctx_id->mech_type->gm_ops
+		->gss_unwrap(ctx_id, offset, buf);
+}
+
+
+/*
+ * gss_delete_sec_context: free all resources associated with
+ * context_handle.  Note this differs from the RFC 2744-specified
+ * prototype in that we don't bother returning an output token, since it
+ * would never be used anyway.
+ *
+ * Returns GSS_S_NO_CONTEXT if *context_handle is NULL.  Otherwise the
+ * mechanism's private context (if any) is deleted, the mechanism
+ * reference is dropped, the gss_ctx is freed, *context_handle is set to
+ * NULL and GSS_S_COMPLETE is returned.
+ */
+
+u32
+gss_delete_sec_context(struct gss_ctx	**context_handle)
+{
+	struct gss_ctx *ctx = *context_handle;
+
+	dprintk("RPC:       gss_delete_sec_context deleting %p\n",
+			ctx);
+
+	if (ctx == NULL)
+		return GSS_S_NO_CONTEXT;
+
+	if (ctx->internal_ctx_id)
+		ctx->mech_type->gm_ops
+			->gss_delete_sec_context(ctx->internal_ctx_id);
+	gss_mech_put(ctx->mech_type);
+	kfree(ctx);
+	*context_handle = NULL;
+	return GSS_S_COMPLETE;
+}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
new file mode 100644
index 00000000..8d0f7d3c
--- /dev/null
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -0,0 +1,1458 @@
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * The RPCSEC_GSS involves three stages:
+ *  1/ context creation
+ *  2/ data exchange
+ *  3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ *  In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ *  In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ *  GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel.  The window size is currently
+ * a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ *   uid/gidlist - for determining access rights
+ *   mechanism type
+ *   mechanism specific information, such as a key
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+
+#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_err.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/cache.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
+ * into replies.
+ *
+ * Key is context handle (\x if empty) and gss_token.
+ * Content is major_status minor_status (integers) context_handle, reply_token.
+ *
+ */
+
+/* Return 1 if @a and @b have the same length and the same bytes, else 0. */
+static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b)
+{
+	if (a->len != b->len)
+		return 0;
+	return memcmp(a->data, b->data, a->len) == 0;
+}
+
+#define	RSI_HASHBITS	6
+#define	RSI_HASHMAX	(1<<RSI_HASHBITS)
+
+/* One entry of the rpcsec_init cache. */
+struct rsi {
+	struct cache_head	h;
+	/* lookup key: rsi_match() compares in_handle and in_token */
+	struct xdr_netobj	in_handle, in_token;
+	/* reply content, filled in by rsi_parse() */
+	struct xdr_netobj	out_handle, out_token;
+	int			major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old);
+static struct rsi *rsi_lookup(struct rsi *item);
+
+/*
+ * Free the four netobj data buffers owned by @rsii.  The rsi structure
+ * itself is not freed.
+ */
+static void rsi_free(struct rsi *rsii)
+{
+	kfree(rsii->in_handle.data);
+	kfree(rsii->in_token.data);
+	kfree(rsii->out_handle.data);
+	kfree(rsii->out_token.data);
+}
+
+/*
+ * kref release callback (rsi_cache.cache_put): free the rsi that embeds
+ * @ref together with its buffers.
+ */
+static void rsi_put(struct kref *ref)
+{
+	struct rsi *rsii = container_of(ref, struct rsi, h.ref);
+	rsi_free(rsii);
+	kfree(rsii);
+}
+
+/*
+ * Hash an rsi by its key: the XOR of the RSI_HASHBITS-bit hashes of
+ * in_handle and in_token.
+ */
+static inline int rsi_hash(struct rsi *item)
+{
+	return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS)
+	     ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS);
+}
+
+/*
+ * Cache match callback: two rsi entries match when both their in_handle
+ * and their in_token are equal.  Returns 1 on match, 0 otherwise.
+ */
+static int rsi_match(struct cache_head *a, struct cache_head *b)
+{
+	struct rsi *lhs = container_of(a, struct rsi, h);
+	struct rsi *rhs = container_of(b, struct rsi, h);
+
+	if (!netobj_equal(&lhs->in_handle, &rhs->in_handle))
+		return 0;
+	return netobj_equal(&lhs->in_token, &rhs->in_token);
+}
+
+/*
+ * Fill @dst with a kmemdup()'d copy of @len bytes from @src.  A zero
+ * @len gives a NULL data pointer.  Returns 0 on success and -ENOMEM if
+ * the copy cannot be allocated.
+ */
+static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len)
+{
+	dst->len = len;
+	if (!len) {
+		dst->data = NULL;
+		return 0;
+	}
+
+	dst->data = kmemdup(src, len, GFP_KERNEL);
+	return dst->data ? 0 : -ENOMEM;
+}
+
+/* Copy netobj @src into @dst; see dup_to_netobj() for the return value. */
+static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src)
+{
+	return dup_to_netobj(dst, src->data, src->len);
+}
+
+/*
+ * Cache init callback: initialise @cnew from the lookup template @citem.
+ * The key buffers (in_handle, in_token) are moved, not copied: ownership
+ * passes to the new entry and the template's fields are cleared.  The
+ * new entry starts with empty out_handle and out_token.
+ */
+static void rsi_init(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct rsi *dst = container_of(cnew, struct rsi, h);
+	struct rsi *src = container_of(citem, struct rsi, h);
+
+	/* no reply content yet */
+	dst->out_handle.len = 0;
+	dst->out_handle.data = NULL;
+	dst->out_token.len = 0;
+	dst->out_token.data = NULL;
+
+	/* take over the handle from the template */
+	dst->in_handle.len = src->in_handle.len;
+	dst->in_handle.data = src->in_handle.data;
+	src->in_handle.len = 0;
+	src->in_handle.data = NULL;
+
+	/* take over the token from the template */
+	dst->in_token.len = src->in_token.len;
+	dst->in_token.data = src->in_token.data;
+	src->in_token.len = 0;
+	src->in_token.data = NULL;
+}
+
+/*
+ * Cache update callback: move the reply content (out_handle, out_token)
+ * from @citem into @cnew and copy the major/minor status.  The source's
+ * out_* fields are cleared because ownership of the buffers moves.
+ * @cnew must not already hold reply buffers.
+ */
+static void update_rsi(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct rsi *dst = container_of(cnew, struct rsi, h);
+	struct rsi *src = container_of(citem, struct rsi, h);
+
+	BUG_ON(dst->out_handle.data || dst->out_token.data);
+
+	dst->out_handle.len = src->out_handle.len;
+	dst->out_handle.data = src->out_handle.data;
+	src->out_handle.len = 0;
+	src->out_handle.data = NULL;
+
+	dst->out_token.len = src->out_token.len;
+	dst->out_token.data = src->out_token.data;
+	src->out_token.len = 0;
+	src->out_token.data = NULL;
+
+	dst->major_status = src->major_status;
+	dst->minor_status = src->minor_status;
+}
+
+/*
+ * Cache alloc callback: allocate an (uninitialised) rsi and return its
+ * embedded cache_head, or NULL if the allocation fails.
+ */
+static struct cache_head *rsi_alloc(void)
+{
+	struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL);
+
+	return rsii ? &rsii->h : NULL;
+}
+
+/*
+ * Format the upcall request for entry @h into the buffer at *@bpp:
+ * in_handle and in_token are appended as hex words and the character
+ * before the final write position is replaced by a newline.
+ */
+static void rsi_request(struct cache_detail *cd,
+		       struct cache_head *h,
+		       char **bpp, int *blen)
+{
+	struct rsi *rsii = container_of(h, struct rsi, h);
+
+	qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len);
+	qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len);
+	(*bpp)[-1] = '\n';
+}
+
+/* Cache upcall callback: issue a pipe upcall formatted by rsi_request(). */
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	return sunrpc_cache_pipe_upcall(cd, h, rsi_request);
+}
+
+
+/*
+ * Cache parse callback for the rpcsec_init cache.
+ *
+ * Parses the words of @mesg (in_handle, in_token, expiry, major status,
+ * minor status, out_handle, out_token), looks up the entry keyed by the
+ * first two and updates it with the rest.
+ *
+ * Returns 0 on success.  Failures give -EINVAL for malformed input or
+ * -ENOMEM for allocation failures; note that whenever no cache entry is
+ * held at "out" (rsip is NULL) the status is forced to -ENOMEM, whatever
+ * it was before.
+ */
+static int rsi_parse(struct cache_detail *cd,
+		    char *mesg, int mlen)
+{
+	/* context token expiry major minor context token */
+	char *buf = mesg;
+	char *ep;
+	int len;
+	struct rsi rsii, *rsip = NULL;
+	time_t expiry;
+	int status = -EINVAL;
+
+	memset(&rsii, 0, sizeof(rsii));
+	/* handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	status = -ENOMEM;
+	if (dup_to_netobj(&rsii.in_handle, buf, len))
+		goto out;
+
+	/* token */
+	len = qword_get(&mesg, buf, mlen);
+	status = -EINVAL;
+	if (len < 0)
+		goto out;
+	status = -ENOMEM;
+	if (dup_to_netobj(&rsii.in_token, buf, len))
+		goto out;
+
+	rsip = rsi_lookup(&rsii);
+	if (!rsip)
+		goto out;
+
+	rsii.h.flags = 0;
+	/* expiry */
+	expiry = get_expiry(&mesg);
+	status = -EINVAL;
+	if (expiry == 0)
+		goto out;
+
+	/* major/minor */
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0)
+		goto out;
+	rsii.major_status = simple_strtoul(buf, &ep, 10);
+	if (*ep)
+		goto out;
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0)
+		goto out;
+	rsii.minor_status = simple_strtoul(buf, &ep, 10);
+	if (*ep)
+		goto out;
+
+	/* out_handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0)
+		goto out;
+	status = -ENOMEM;
+	if (dup_to_netobj(&rsii.out_handle, buf, len))
+		goto out;
+
+	/* out_token */
+	len = qword_get(&mesg, buf, mlen);
+	status = -EINVAL;
+	if (len < 0)
+		goto out;
+	status = -ENOMEM;
+	if (dup_to_netobj(&rsii.out_token, buf, len))
+		goto out;
+	rsii.h.expiry_time = expiry;
+	rsip = rsi_update(&rsii, rsip);
+	status = 0;
+out:
+	/* frees whatever buffers the on-stack template still owns */
+	rsi_free(&rsii);
+	if (rsip)
+		cache_put(&rsip->h, &rsi_cache);
+	else
+		status = -ENOMEM;
+	return status;
+}
+
+/*
+ * Cache descriptor for the rpcsec_init cache ("auth.rpcsec.init"),
+ * wiring the rsi_* callbacks above to the hash table rsi_table.
+ */
+static struct cache_detail rsi_cache = {
+	.owner		= THIS_MODULE,
+	.hash_size	= RSI_HASHMAX,
+	.hash_table     = rsi_table,
+	.name           = "auth.rpcsec.init",
+	.cache_put      = rsi_put,
+	.cache_upcall   = rsi_upcall,
+	.cache_parse    = rsi_parse,
+	.match		= rsi_match,
+	.init		= rsi_init,
+	.update		= update_rsi,
+	.alloc		= rsi_alloc,
+};
+
+/*
+ * Look @item up in rsi_cache through sunrpc_cache_lookup().  Returns the
+ * rsi embedding the returned cache_head, or NULL if none was returned.
+ */
+static struct rsi *rsi_lookup(struct rsi *item)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_lookup(&rsi_cache, &item->h, rsi_hash(item));
+	return ch ? container_of(ch, struct rsi, h) : NULL;
+}
+
+/*
+ * Update cache entry @old with the content of @new through
+ * sunrpc_cache_update().  Returns the rsi embedding the returned
+ * cache_head, or NULL if none was returned.
+ */
+static struct rsi *rsi_update(struct rsi *new, struct rsi *old)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h,
+				 rsi_hash(new));
+	return ch ? container_of(ch, struct rsi, h) : NULL;
+}
+
+
+/*
+ * The rpcsec_context cache is used to store a context that is
+ * used in data exchange.
+ * The key is a context handle. The content is:
+ *  uid, gidlist, mechanism, service-set, mech-specific-data
+ */
+
+#define	RSC_HASHBITS	10
+#define	RSC_HASHMAX	(1<<RSC_HASHBITS)
+
+#define GSS_SEQ_WIN	128
+
+/* Per-context sequence number window state, see gss_check_seq_num(). */
+struct gss_svc_seq_data {
+	/* highest seq number seen so far: */
+	int			sd_max;
+	/* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of
+	 * sd_win is nonzero iff sequence number i has been seen already: */
+	unsigned long		sd_win[GSS_SEQ_WIN/BITS_PER_LONG];
+	/* held by gss_check_seq_num() while it reads/updates the above */
+	spinlock_t		sd_lock;
+};
+
+/* One entry of the rpcsec_context cache. */
+struct rsc {
+	struct cache_head	h;
+	/* lookup key: rsc_match() compares the context handle */
+	struct xdr_netobj	handle;
+	/* uid, gid and group list parsed by rsc_parse() */
+	struct svc_cred		cred;
+	struct gss_svc_seq_data	seqdata;
+	/* mechanism context imported by rsc_parse(); may be NULL */
+	struct gss_ctx		*mechctx;
+	/* optional, kstrdup()'d by rsc_parse(); may be NULL */
+	char			*client_name;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old);
+static struct rsc *rsc_lookup(struct rsc *item);
+
+/*
+ * Release everything @rsci owns: the handle buffer, the mechanism
+ * context, the group_info reference and the client name.  The rsc
+ * structure itself is not freed.
+ */
+static void rsc_free(struct rsc *rsci)
+{
+	kfree(rsci->handle.data);
+	if (rsci->mechctx)
+		gss_delete_sec_context(&rsci->mechctx);
+	if (rsci->cred.cr_group_info)
+		put_group_info(rsci->cred.cr_group_info);
+	kfree(rsci->client_name);
+}
+
+/*
+ * kref release callback (rsc_cache.cache_put): free the rsc that embeds
+ * @ref together with everything it owns.
+ */
+static void rsc_put(struct kref *ref)
+{
+	struct rsc *rsci = container_of(ref, struct rsc, h.ref);
+
+	rsc_free(rsci);
+	kfree(rsci);
+}
+
+/* Hash an rsc by its context handle into RSC_HASHBITS bits. */
+static inline int
+rsc_hash(struct rsc *rsci)
+{
+	return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS);
+}
+
+/* Cache match callback: two rsc entries match when their handles are equal. */
+static int
+rsc_match(struct cache_head *a, struct cache_head *b)
+{
+	struct rsc *new = container_of(a, struct rsc, h);
+	struct rsc *tmp = container_of(b, struct rsc, h);
+
+	return netobj_equal(&new->handle, &tmp->handle);
+}
+
+/*
+ * Cache init callback: initialise @cnew from the lookup template @ctmp.
+ * The handle buffer is moved (the template's copy is cleared); the
+ * mechanism context, group info and client name start out as NULL.
+ */
+static void
+rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
+{
+	struct rsc *dst = container_of(cnew, struct rsc, h);
+	struct rsc *src = container_of(ctmp, struct rsc, h);
+
+	/* take over the key from the template */
+	dst->handle.len = src->handle.len;
+	dst->handle.data = src->handle.data;
+	src->handle.len = 0;
+	src->handle.data = NULL;
+
+	/* no content yet */
+	dst->mechctx = NULL;
+	dst->cred.cr_group_info = NULL;
+	dst->client_name = NULL;
+}
+
+/*
+ * Cache update callback: move the content of @ctmp into @cnew.  The
+ * mechanism context, group_info reference and client name change owner
+ * (the source pointers are cleared); the sequence window of the new
+ * entry is reset to all-zero and its lock initialised.
+ */
+static void
+update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
+{
+	struct rsc *new = container_of(cnew, struct rsc, h);
+	struct rsc *tmp = container_of(ctmp, struct rsc, h);
+
+	new->mechctx = tmp->mechctx;
+	tmp->mechctx = NULL;
+	memset(&new->seqdata, 0, sizeof(new->seqdata));
+	spin_lock_init(&new->seqdata.sd_lock);
+	new->cred = tmp->cred;
+	tmp->cred.cr_group_info = NULL;
+	new->client_name = tmp->client_name;
+	tmp->client_name = NULL;
+}
+
+/*
+ * Cache alloc callback: allocate an (uninitialised) rsc and return its
+ * embedded cache_head, or NULL if the allocation fails.
+ */
+static struct cache_head *
+rsc_alloc(void)
+{
+	struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL);
+
+	return rsci ? &rsci->h : NULL;
+}
+
+/*
+ * Cache parse callback for the rpcsec_context cache.
+ *
+ * Parses the words of @mesg (context handle, expiry, then either nothing
+ * for a negative entry or uid, gid, gid count, gids, mechanism name,
+ * mechanism data and an optional client name), looks up the entry keyed
+ * by the handle and updates it.
+ *
+ * Returns 0 on success.  Failures give -EINVAL for malformed input
+ * (including a negative gid count), -EOPNOTSUPP for an unknown mechanism
+ * name, the error of gss_import_sec_context(), or -ENOMEM for allocation
+ * failures.  Whenever no cache entry is held at "out" (rscp is NULL) the
+ * status is forced to -ENOMEM.
+ */
+static int rsc_parse(struct cache_detail *cd,
+		     char *mesg, int mlen)
+{
+	/* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+	char *buf = mesg;
+	int len, rv;
+	struct rsc rsci, *rscp = NULL;
+	time_t expiry;
+	int status = -EINVAL;
+	struct gss_api_mech *gm = NULL;
+
+	memset(&rsci, 0, sizeof(rsci));
+	/* context handle */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0) goto out;
+	status = -ENOMEM;
+	if (dup_to_netobj(&rsci.handle, buf, len))
+		goto out;
+
+	rsci.h.flags = 0;
+	/* expiry */
+	expiry = get_expiry(&mesg);
+	status = -EINVAL;
+	if (expiry == 0)
+		goto out;
+
+	rscp = rsc_lookup(&rsci);
+	if (!rscp)
+		goto out;
+
+	/* uid, or NEGATIVE */
+	rv = get_int(&mesg, &rsci.cred.cr_uid);
+	if (rv == -EINVAL)
+		goto out;
+	if (rv == -ENOENT)
+		set_bit(CACHE_NEGATIVE, &rsci.h.flags);
+	else {
+		int N, i;
+
+		/* gid */
+		if (get_int(&mesg, &rsci.cred.cr_gid))
+			goto out;
+
+		/* number of additional gid's */
+		if (get_int(&mesg, &N))
+			goto out;
+		/* a negative count is malformed; don't hand it to groups_alloc() */
+		if (N < 0)
+			goto out;
+		status = -ENOMEM;
+		rsci.cred.cr_group_info = groups_alloc(N);
+		if (rsci.cred.cr_group_info == NULL)
+			goto out;
+
+		/* gid's */
+		status = -EINVAL;
+		for (i=0; i<N; i++) {
+			gid_t gid;
+			if (get_int(&mesg, &gid))
+				goto out;
+			GROUP_AT(rsci.cred.cr_group_info, i) = gid;
+		}
+
+		/* mech name */
+		len = qword_get(&mesg, buf, mlen);
+		if (len < 0)
+			goto out;
+		gm = gss_mech_get_by_name(buf);
+		status = -EOPNOTSUPP;
+		if (!gm)
+			goto out;
+
+		status = -EINVAL;
+		/* mech-specific data: */
+		len = qword_get(&mesg, buf, mlen);
+		if (len < 0)
+			goto out;
+		status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, GFP_KERNEL);
+		if (status)
+			goto out;
+
+		/* get client name */
+		len = qword_get(&mesg, buf, mlen);
+		if (len > 0) {
+			rsci.client_name = kstrdup(buf, GFP_KERNEL);
+			if (!rsci.client_name) {
+				/*
+				 * status is 0 here (the import succeeded);
+				 * report the allocation failure instead of
+				 * returning success without updating.
+				 */
+				status = -ENOMEM;
+				goto out;
+			}
+		}
+
+	}
+	rsci.h.expiry_time = expiry;
+	rscp = rsc_update(&rsci, rscp);
+	status = 0;
+out:
+	/* gm may still be NULL here; gss_mech_put() accepts that */
+	gss_mech_put(gm);
+	rsc_free(&rsci);
+	if (rscp)
+		cache_put(&rscp->h, &rsc_cache);
+	else
+		status = -ENOMEM;
+	return status;
+}
+
+/*
+ * Cache descriptor for the rpcsec_context cache ("auth.rpcsec.context"),
+ * wiring the rsc_* callbacks above to the hash table rsc_table.  No
+ * cache_upcall method is set for this cache.
+ */
+static struct cache_detail rsc_cache = {
+	.owner		= THIS_MODULE,
+	.hash_size	= RSC_HASHMAX,
+	.hash_table	= rsc_table,
+	.name		= "auth.rpcsec.context",
+	.cache_put	= rsc_put,
+	.cache_parse	= rsc_parse,
+	.match		= rsc_match,
+	.init		= rsc_init,
+	.update		= update_rsc,
+	.alloc		= rsc_alloc,
+};
+
+/*
+ * Look @item up in rsc_cache through sunrpc_cache_lookup().  Returns the
+ * rsc embedding the returned cache_head, or NULL if none was returned.
+ */
+static struct rsc *rsc_lookup(struct rsc *item)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_lookup(&rsc_cache, &item->h, rsc_hash(item));
+	return ch ? container_of(ch, struct rsc, h) : NULL;
+}
+
+/*
+ * Update cache entry @old with the content of @new through
+ * sunrpc_cache_update().  Returns the rsc embedding the returned
+ * cache_head, or NULL if none was returned.
+ */
+static struct rsc *rsc_update(struct rsc *new, struct rsc *old)
+{
+	struct cache_head *ch;
+
+	ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h,
+				 rsc_hash(new));
+	return ch ? container_of(ch, struct rsc, h) : NULL;
+}
+
+
+/*
+ * Find the rpcsec_context cache entry for context @handle.
+ *
+ * A temporary key is built from a copy of @handle, looked up and freed
+ * again.  Returns NULL if the copy cannot be allocated, if nothing is
+ * found, or if cache_check() reports a non-zero status for the entry;
+ * otherwise returns the entry.
+ */
+static struct rsc *
+gss_svc_searchbyctx(struct xdr_netobj *handle)
+{
+	struct rsc key;
+	struct rsc *found;
+
+	memset(&key, 0, sizeof(key));
+	if (dup_to_netobj(&key.handle, handle->data, handle->len))
+		return NULL;
+
+	found = rsc_lookup(&key);
+	rsc_free(&key);
+
+	if (found && cache_check(&rsc_cache, &found->h, NULL))
+		found = NULL;
+	return found;
+}
+
+/*
+ * Implements sequence number algorithm as specified in RFC 2203.
+ *
+ * Returns 1 if @seq_num is accepted (and records it as seen), 0 if the
+ * request should be dropped because @seq_num is at or below the bottom
+ * of the window or has already been seen.  The window state is updated
+ * under sd_lock.
+ */
+static int
+gss_check_seq_num(struct rsc *rsci, int seq_num)
+{
+	struct gss_svc_seq_data *sd = &rsci->seqdata;
+
+	spin_lock(&sd->sd_lock);
+	if (seq_num > sd->sd_max) {
+		/* new maximum: slide the window forward */
+		if (seq_num >= sd->sd_max + GSS_SEQ_WIN) {
+			/* jumped past the whole window: nothing seen yet */
+			memset(sd->sd_win,0,sizeof(sd->sd_win));
+			sd->sd_max = seq_num;
+		} else while (sd->sd_max < seq_num) {
+			/* clear the slots the window moves over */
+			sd->sd_max++;
+			__clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win);
+		}
+		__set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win);
+		goto ok;
+	} else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) {
+		goto drop;
+	}
+	/* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */
+	if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win))
+		goto drop;
+ok:
+	spin_unlock(&sd->sd_lock);
+	return 1;
+drop:
+	spin_unlock(&sd->sd_lock);
+	return 0;
+}
+
+/*
+ * Round @i up to the next multiple of 4.  The addition is done in u32,
+ * so values within 3 of the u32 maximum wrap around to 0.
+ */
+static inline u32 round_up_to_quad(u32 i)
+{
+	return (i + 3 ) & ~3;
+}
+
+/*
+ * Pull a counted opaque object off the front of @argv: a length word
+ * read with svc_getnl(), followed by the data padded to a multiple of
+ * 4 bytes.
+ *
+ * On success @o->data points into @argv's buffer (no copy is made),
+ * @argv is advanced past the padded data and 0 is returned.  Returns -1
+ * if @argv is too short for the length word or for the padded data.
+ */
+static inline int
+svc_safe_getnetobj(struct kvec *argv, struct xdr_netobj *o)
+{
+	u32 l;
+
+	if (argv->iov_len < 4)
+		return -1;
+	o->len = svc_getnl(argv);
+	l = round_up_to_quad(o->len);
+	/*
+	 * The length is read from the request buffer.  Rounding a value
+	 * within 3 of the u32 maximum wraps to 0, which would pass the
+	 * size check while leaving a huge o->len for the caller, so a
+	 * rounded length smaller than the length itself is rejected too.
+	 */
+	if (l < o->len || argv->iov_len < l)
+		return -1;
+	o->data = argv->iov_base;
+	argv->iov_base += l;
+	argv->iov_len -= l;
+	return 0;
+}
+
+/*
+ * Append netobj @o to @resv: the length word via svc_putnl(), then the
+ * data, zero-padded to a multiple of 4 bytes.
+ *
+ * Returns 0 on success and -1 if the result would not fit in PAGE_SIZE.
+ * Note that on the second failure path the length word has already been
+ * written and resv->iov_len already advanced.
+ */
+static inline int
+svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o)
+{
+	u8 *p;
+
+	if (resv->iov_len + 4 > PAGE_SIZE)
+		return -1;
+	svc_putnl(resv, o->len);
+	/* p: where the data goes, just past the length word */
+	p = resv->iov_base + resv->iov_len;
+	resv->iov_len += round_up_to_quad(o->len);
+	if (resv->iov_len > PAGE_SIZE)
+		return -1;
+	memcpy(p, o->data, o->len);
+	memset(p + o->len, 0, round_up_to_quad(o->len) - o->len);
+	return 0;
+}
+
+/*
+ * Verify the checksum on the header and return SVC_OK on success.
+ * Otherwise, return SVC_DROP (in the case of a bad sequence number)
+ * or return SVC_DENIED and indicate error in authp.
+ *
+ * The checksum covers the bytes from @rpcstart up to the current
+ * position in the request's head buffer.  The verifier (flavor plus
+ * checksum netobj) is consumed from that buffer.  For a deferred request
+ * being revisited, the MIC and sequence number checks are skipped.
+ * A sequence number above MAXSEQ is denied with rpcsec_gsserr_ctxproblem.
+ */
+static int
+gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
+		  __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+	struct gss_ctx		*ctx_id = rsci->mechctx;
+	struct xdr_buf		rpchdr;
+	struct xdr_netobj	checksum;
+	u32			flavor = 0;
+	struct kvec		*argv = &rqstp->rq_arg.head[0];
+	struct kvec		iov;
+
+	/* data to compute the checksum over: */
+	iov.iov_base = rpcstart;
+	iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart;
+	xdr_buf_from_iov(&iov, &rpchdr);
+
+	/* default error for the early exits below */
+	*authp = rpc_autherr_badverf;
+	if (argv->iov_len < 4)
+		return SVC_DENIED;
+	flavor = svc_getnl(argv);
+	if (flavor != RPC_AUTH_GSS)
+		return SVC_DENIED;
+	if (svc_safe_getnetobj(argv, &checksum))
+		return SVC_DENIED;
+
+	if (rqstp->rq_deferred) /* skip verification of revisited request */
+		return SVC_OK;
+	if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) {
+		*authp = rpcsec_gsserr_credproblem;
+		return SVC_DENIED;
+	}
+
+	if (gc->gc_seq > MAXSEQ) {
+		dprintk("RPC:       svcauth_gss: discarding request with "
+				"large sequence number %d\n", gc->gc_seq);
+		*authp = rpcsec_gsserr_ctxproblem;
+		return SVC_DENIED;
+	}
+	if (!gss_check_seq_num(rsci, gc->gc_seq)) {
+		dprintk("RPC:       svcauth_gss: discarding request with "
+				"old sequence number %d\n", gc->gc_seq);
+		return SVC_DROP;
+	}
+	return SVC_OK;
+}
+
+static int
+gss_write_null_verf(struct svc_rqst *rqstp)
+{
+	struct kvec *resv = rqstp->rq_res.head;
+	__be32 *end;
+
+	/* Emit an AUTH_NULL verifier: flavor word, then a zero length. */
+	svc_putnl(resv, RPC_AUTH_NULL);
+	end = resv->iov_base + resv->iov_len;
+	*end++ = 0;	/* head->iov_len is not checked against PAGE_SIZE first */
+	/* 0 if xdr_ressize_check() accepts the new end pointer, else -1 */
+	return xdr_ressize_check(rqstp, end) ? 0 : -1;
+}
+
+static int	/* 0 on success, -1 if the MIC or the size check fails */
+gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq)
+{
+	__be32			xdr_seq;
+	u32			maj_stat;
+	struct xdr_buf		verf_data;
+	struct xdr_netobj	mic;
+	__be32			*p;
+	struct kvec		iov;
+
+	svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS);	/* verifier flavor */
+	xdr_seq = htonl(seq);
+
+	iov.iov_base = &xdr_seq;	/* the MIC input is just the big-endian seq */
+	iov.iov_len = sizeof(xdr_seq);
+	xdr_buf_from_iov(&iov, &verf_data);
+	p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len;
+	mic.data = (u8 *)(p + 1);	/* output buffer: just past the length word */
+	maj_stat = gss_get_mic(ctx_id, &verf_data, &mic);
+	if (maj_stat != GSS_S_COMPLETE)
+		return -1;
+	*p++ = htonl(mic.len);
+	memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len);	/* zero the padding */
+	p += XDR_QUADLEN(mic.len);
+	if (!xdr_ressize_check(rqstp, p))
+		return -1;
+	return 0;
+}
+
+struct gss_domain {
+	struct auth_domain	h;		/* embedded domain; container_of() anchor */
+	u32			pseudoflavor;	/* value returned by svcauth_gss_flavor() */
+};
+
+static struct auth_domain *
+find_gss_auth_domain(struct gss_ctx *ctx, u32 svc)
+{
+	/* Map (mechanism, service) to a domain name, then look that name up. */
+	char *dom_name = gss_service_to_auth_domain_name(ctx->mech_type, svc);
+
+	if (dom_name == NULL)
+		return NULL;	/* no name for this mechanism/service pair */
+	return auth_domain_find(dom_name);
+}
+
+static struct auth_ops svcauthops_gss;
+
+u32 svcauth_gss_flavor(struct auth_domain *dom)
+{
+	/* Return the pseudoflavor of the gss_domain that embeds dom as
+	 * its 'h' member; dom must really be part of a struct gss_domain. */
+	return container_of(dom, struct gss_domain, h)->pseudoflavor;
+}
+
+EXPORT_SYMBOL_GPL(svcauth_gss_flavor);
+
+int	/* 0 on success or duplicate registration, -ENOMEM on allocation failure */
+svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name)
+{
+	struct gss_domain	*new;
+	struct auth_domain	*test;
+	int			stat = -ENOMEM;
+
+	new = kmalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		goto out;
+	kref_init(&new->h.ref);
+	new->h.name = kstrdup(name, GFP_KERNEL);	/* private copy of name */
+	if (!new->h.name)
+		goto out_free_dom;
+	new->h.flavour = &svcauthops_gss;
+	new->pseudoflavor = pseudoflavor;
+
+	stat = 0;	/* from here on even the duplicate path returns 0 */
+	test = auth_domain_lookup(name, &new->h);
+	if (test != &new->h) { /* Duplicate registration */
+		auth_domain_put(test);
+		kfree(new->h.name);
+		goto out_free_dom;
+	}
+	return 0;
+
+out_free_dom:
+	kfree(new);
+out:
+	return stat;
+}
+
+EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor);
+
+static inline int
+read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+{
+	__be32 wire;	/* big-endian word as stored in the buffer */
+	int err = read_bytes_from_xdr_buf(buf, base, &wire, sizeof(*obj));
+
+	/* Fetch one 32-bit word at byte offset 'base' and convert it to
+	 * host order; *obj is left untouched if the read fails. */
+	if (err == 0)
+		*obj = ntohl(wire);
+	return err;
+}
+
+/* Check the MIC on an rpcsec_gss integrity request and consume the
+ * sequence number; returns 0 on success and -EINVAL otherwise.
+ * It would be nice to share this with the client, but the client
+ * shouldn't malloc(), and the server uses the base of the head iovec
+ * as read pointer where the client uses a separate pointer. */
+static int
+unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+{
+	int stat = -EINVAL;
+	u32 integ_len, maj_stat;
+	struct xdr_netobj mic;
+	struct xdr_buf integ_buf;
+
+	integ_len = svc_getnl(&buf->head[0]);
+	if ((integ_len & 3) || integ_len > buf->len)
+		return stat;
+	/* Lengths come from the peer and buf->len can exceed what is left
+	 * to read, so these lookups can fail: reject the request, no BUG(). */
+	if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len))
+		return stat;
+	/* copy out mic... */
+	if (read_u32_from_xdr_buf(buf, integ_len, &mic.len))
+		return stat;
+	if (mic.len > RPC_MAX_AUTH_SIZE)
+		return stat;
+	mic.data = kmalloc(mic.len, GFP_KERNEL);
+	if (!mic.data)
+		return stat;
+	if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
+		goto out;
+	maj_stat = gss_verify_mic(ctx, &integ_buf, &mic);
+	if (maj_stat != GSS_S_COMPLETE)
+		goto out;
+	if (svc_getnl(&buf->head[0]) != seq)	/* first word of the checked data */
+		goto out;
+	stat = 0;
+out:
+	kfree(mic.data);
+	return stat;
+}
+
+static inline int	/* bytes currently described by head + pages + tail */
+total_buf_len(struct xdr_buf *buf)
+{
+	return buf->head->iov_len + buf->page_len + buf->tail->iov_len;
+}
+
+static void
+fix_priv_head(struct xdr_buf *buf, int pad)
+{
+	if (buf->page_len != 0)
+		return;
+	/* With no page data, head and buf->len must be adjusted in tandem
+	 * so svc_defer() works--it finds the original buffer start using
+	 * buf->len - buf->head[0].iov_len.  Only the head is touched here. */
+	buf->head[0].iov_len -= pad;
+}
+
+static int	/* 0 on success, -EINVAL on any failure */
+unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+{
+	u32 priv_len, maj_stat;
+	int pad, saved_len, remaining_len, offset;
+
+	rqstp->rq_splice_ok = 0;
+
+	priv_len = svc_getnl(&buf->head[0]);
+	if (rqstp->rq_deferred) {
+		/* Already decrypted last time through! The sequence number
+		 * check at out_seq is unnecessary but harmless: */
+		goto out_seq;
+	}
+	/* buf->len is the number of bytes from the original start of the
+	 * request to the end, where head[0].iov_len is just the bytes
+	 * not yet read from the head, so these two values are different: */
+	remaining_len = total_buf_len(buf);
+	if (priv_len > remaining_len)
+		return -EINVAL;
+	pad = remaining_len - priv_len;	/* bytes beyond the wrapped data */
+	buf->len -= pad;
+	fix_priv_head(buf, pad);
+
+	/* Maybe it would be better to give gss_unwrap a length parameter: */
+	saved_len = buf->len;
+	buf->len = priv_len;
+	maj_stat = gss_unwrap(ctx, 0, buf);
+	pad = priv_len - buf->len;	/* how much gss_unwrap() reduced buf->len */
+	buf->len = saved_len;
+	buf->len -= pad;
+	/* The upper layers assume the buffer is aligned on 4-byte boundaries.
+	 * In the krb5p case, at least, the data ends up offset, so we need to
+	 * move it around. */
+	/* XXX: This is very inefficient.  It would be better to either do
+	 * this while we encrypt, or maybe in the receive code, if we can peek
+	 * ahead and work out the service and mechanism there. */
+	offset = buf->head[0].iov_len % 4;
+	if (offset) {
+		buf->buflen = RPCSVC_MAXPAYLOAD;
+		xdr_shift_buf(buf, offset);
+		fix_priv_head(buf, pad);
+	}
+	if (maj_stat != GSS_S_COMPLETE)	/* checked only after the fix-ups above */
+		return -EINVAL;
+out_seq:
+	if (svc_getnl(&buf->head[0]) != seq)
+		return -EINVAL;
+	return 0;
+}
+
+struct gss_svc_data {	/* per-request state, kept in rqstp->rq_auth_data */
+	/* decoded gss client cred: */
+	struct rpc_gss_wire_cred	clcred;
+	/* save a pointer to the beginning of the encoded verifier,
+	 * for use in encryption/checksumming in svcauth_gss_release: */
+	__be32				*verf_start;
+	struct rsc			*rsci;	/* context of the request, or NULL */
+};
+
+char *svc_gss_principal(struct svc_rqst *rqstp)
+{
+	struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+
+	if (!svcdata || !svcdata->rsci)	/* no gss data, or no context attached */
+		return NULL;
+	return svcdata->rsci->client_name;
+}
+EXPORT_SYMBOL_GPL(svc_gss_principal);
+
+static int
+svcauth_gss_set_client(struct svc_rqst *rqstp)
+{
+	struct gss_svc_data *gsd = rqstp->rq_auth_data;
+	struct gss_ctx *mechctx = gsd->rsci->mechctx;
+	int unix_stat;
+
+	/*
+	 * A gss export can be specified either by:
+	 * 	export	*(sec=krb5,rw)
+	 * or by
+	 * 	export gss/krb5(rw)
+	 * The latter is deprecated, but for backwards compatibility nfsd
+	 * still falls back on trying it if the former doesn't work; so we
+	 * try to make both available to nfsd, below.
+	 */
+	rqstp->rq_gssclient = find_gss_auth_domain(mechctx, gsd->clcred.gc_svc);
+	if (!rqstp->rq_gssclient)
+		return SVC_DENIED;
+	unix_stat = svcauth_unix_set_client(rqstp);
+	/* Only drop/close are propagated; any other result counts as OK. */
+	if (unix_stat != SVC_DROP && unix_stat != SVC_CLOSE)
+		return SVC_OK;
+	return unix_stat;
+}
+
+static inline int
+gss_write_init_verf(struct svc_rqst *rqstp, struct rsi *rsip)
+{
+	struct rsc *ctx_entry;
+	int status;
+
+	if (rsip->major_status != GSS_S_COMPLETE)	/* context not complete */
+		return gss_write_null_verf(rqstp);
+	ctx_entry = gss_svc_searchbyctx(&rsip->out_handle);
+	if (!ctx_entry) {	/* lookup by out_handle found nothing */
+		rsip->major_status = GSS_S_NO_CONTEXT;
+		return gss_write_null_verf(rqstp);
+	}
+	status = gss_write_verf(rqstp, ctx_entry->mechctx, GSS_SEQ_WIN);
+	cache_put(&ctx_entry->h, &rsc_cache);	/* done with the looked-up entry */
+	return status;
+}
+
+/*
+ * Having read the cred already and found we're in the context
+ * initiation case, read the verifier and initiate (or check the results
+ * of) upcalls to userspace for help with context initiation.  If
+ * the upcall results are available, write the verifier and result and
+ * return SVC_COMPLETE.  If cache_check() fails, return SVC_CLOSE.
+ */
+static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
+			struct rpc_gss_wire_cred *gc, __be32 *authp)
+{
+	struct kvec *argv = &rqstp->rq_arg.head[0];
+	struct kvec *resv = &rqstp->rq_res.head[0];
+	struct xdr_netobj tmpobj;
+	struct rsi *rsip, rsikey;
+	int ret;
+
+	/* Read the verifier; should be NULL: */
+	*authp = rpc_autherr_badverf;
+	if (argv->iov_len < 2 * 4)
+		return SVC_DENIED;
+	if (svc_getnl(argv) != RPC_AUTH_NULL)
+		return SVC_DENIED;
+	if (svc_getnl(argv) != 0)
+		return SVC_DENIED;
+
+	/* Marshal context handle and token for upcall: */
+	*authp = rpc_autherr_badcred;
+	if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0)
+		return SVC_DENIED;
+	memset(&rsikey, 0, sizeof(rsikey));
+	if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx))
+		return SVC_CLOSE;
+	*authp = rpc_autherr_badverf;
+	if (svc_safe_getnetobj(argv, &tmpobj)) {
+		kfree(rsikey.in_handle.data);
+		return SVC_DENIED;
+	}
+	if (dup_netobj(&rsikey.in_token, &tmpobj)) {
+		kfree(rsikey.in_handle.data);
+		return SVC_CLOSE;
+	}
+
+	/* Perform upcall, or find upcall result: */
+	rsip = rsi_lookup(&rsikey);
+	rsi_free(&rsikey);	/* the key copies are released right after the lookup */
+	if (!rsip)
+		return SVC_CLOSE;
+	if (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0)
+		/* No upcall result: */
+		return SVC_CLOSE;
+
+	ret = SVC_CLOSE;	/* result for every failure while encoding the reply */
+	/* Got an answer to the upcall; use it: */
+	if (gss_write_init_verf(rqstp, rsip))
+		goto out;
+	if (resv->iov_len + 4 > PAGE_SIZE)
+		goto out;
+	svc_putnl(resv, RPC_SUCCESS);
+	if (svc_safe_putnetobj(resv, &rsip->out_handle))
+		goto out;
+	if (resv->iov_len + 3 * 4 > PAGE_SIZE)
+		goto out;
+	svc_putnl(resv, rsip->major_status);
+	svc_putnl(resv, rsip->minor_status);
+	svc_putnl(resv, GSS_SEQ_WIN);
+	if (svc_safe_putnetobj(resv, &rsip->out_token))
+		goto out;
+
+	ret = SVC_COMPLETE;
+out:
+	cache_put(&rsip->h, &rsi_cache);
+	return ret;
+}
+
+/*
+ * Accept an rpcsec packet.
+ * If context establishment, punt to user space
+ * If data exchange, verify/decrypt
+ * If context destruction, handle here
+ * In the context establishment and destruction case we encode
+ * response here and return SVC_COMPLETE.
+ */
+static int
+svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+{
+	struct kvec	*argv = &rqstp->rq_arg.head[0];
+	struct kvec	*resv = &rqstp->rq_res.head[0];
+	u32		crlen;
+	struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+	struct rpc_gss_wire_cred *gc;
+	struct rsc	*rsci = NULL;
+	__be32		*rpcstart;
+	__be32		*reject_stat = resv->iov_base + resv->iov_len;	/* reply position restored on auth_err */
+	int		ret;
+
+	dprintk("RPC:       svcauth_gss: argv->iov_len = %zd\n",
+			argv->iov_len);
+
+	*authp = rpc_autherr_badcred;
+	if (!svcdata)	/* allocate only if the request has no auth data yet */
+		svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL);
+	if (!svcdata)
+		goto auth_err;
+	rqstp->rq_auth_data = svcdata;
+	svcdata->verf_start = NULL;
+	svcdata->rsci = NULL;
+	gc = &svcdata->clcred;
+
+	/* start of rpc packet is 7 u32's back from here:
+	 * xid direction rpcversion prog vers proc flavour
+	 */
+	rpcstart = argv->iov_base;
+	rpcstart -= 7;
+
+	/* credential is:
+	 *   version(==1), proc(0,1,2,3), seq, service (1,2,3), handle
+	 * at least 5 u32s, and is preceded by length, so that makes 6.
+	 */
+
+	if (argv->iov_len < 5 * 4)	/* covers the five svc_getnl() reads below */
+		goto auth_err;
+	crlen = svc_getnl(argv);
+	if (svc_getnl(argv) != RPC_GSS_VERSION)
+		goto auth_err;
+	gc->gc_proc = svc_getnl(argv);
+	gc->gc_seq = svc_getnl(argv);
+	gc->gc_svc = svc_getnl(argv);
+	if (svc_safe_getnetobj(argv, &gc->gc_ctx))
+		goto auth_err;
+	if (crlen != round_up_to_quad(gc->gc_ctx.len) + 5 * 4)
+		goto auth_err;
+
+	if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0))
+		goto auth_err;
+
+	*authp = rpc_autherr_badverf;
+	switch (gc->gc_proc) {
+	case RPC_GSS_PROC_INIT:
+	case RPC_GSS_PROC_CONTINUE_INIT:
+		return svcauth_gss_handle_init(rqstp, gc, authp);
+	case RPC_GSS_PROC_DATA:
+	case RPC_GSS_PROC_DESTROY:
+		/* Look up the context, and check the verifier: */
+		*authp = rpcsec_gsserr_credproblem;
+		rsci = gss_svc_searchbyctx(&gc->gc_ctx);
+		if (!rsci)
+			goto auth_err;
+		switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) {
+		case SVC_OK:
+			break;
+		case SVC_DENIED:
+			goto auth_err;
+		case SVC_DROP:
+			goto drop;
+		}
+		break;
+	default:
+		*authp = rpc_autherr_rejectedcred;
+		goto auth_err;
+	}
+
+	/* now act upon the command: */
+	switch (gc->gc_proc) {
+	case RPC_GSS_PROC_DESTROY:
+		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+			goto auth_err;
+		rsci->h.expiry_time = get_seconds();
+		set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+		if (resv->iov_len + 4 > PAGE_SIZE)
+			goto drop;
+		svc_putnl(resv, RPC_SUCCESS);
+		goto complete;
+	case RPC_GSS_PROC_DATA:
+		*authp = rpcsec_gsserr_ctxproblem;
+		svcdata->verf_start = resv->iov_base + resv->iov_len;
+		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+			goto auth_err;
+		rqstp->rq_cred = rsci->cred;
+		get_group_info(rsci->cred.cr_group_info);
+		*authp = rpc_autherr_badcred;
+		switch (gc->gc_svc) {
+		case RPC_GSS_SVC_NONE:
+			break;
+		case RPC_GSS_SVC_INTEGRITY:
+			/* placeholders for length and seq. number: */
+			svc_putnl(resv, 0);
+			svc_putnl(resv, 0);
+			if (unwrap_integ_data(&rqstp->rq_arg,
+					gc->gc_seq, rsci->mechctx))
+				goto garbage_args;
+			break;
+		case RPC_GSS_SVC_PRIVACY:
+			/* placeholders for length and seq. number: */
+			svc_putnl(resv, 0);
+			svc_putnl(resv, 0);
+			if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
+					gc->gc_seq, rsci->mechctx))
+				goto garbage_args;
+			break;
+		default:
+			goto auth_err;
+		}
+		svcdata->rsci = rsci;
+		cache_get(&rsci->h);	/* extra reference held through svcdata->rsci */
+		rqstp->rq_flavor = gss_svc_to_pseudoflavor(
+					rsci->mechctx->mech_type, gc->gc_svc);
+		ret = SVC_OK;
+		goto out;
+	}
+garbage_args:
+	ret = SVC_GARBAGE;
+	goto out;
+auth_err:
+	/* Restore write pointer to its original value: */
+	xdr_ressize_check(rqstp, reject_stat);
+	ret = SVC_DENIED;
+	goto out;
+complete:
+	ret = SVC_COMPLETE;
+	goto out;
+drop:
+	ret = SVC_DROP;
+out:
+	if (rsci)	/* release the entry obtained from gss_svc_searchbyctx() */
+		cache_put(&rsci->h, &rsc_cache);
+	return ret;
+}
+
+static __be32 *	/* NULL if the reply must not be wrapped, else where to write */
+svcauth_gss_prepare_to_wrap(struct xdr_buf *resbuf, struct gss_svc_data *gsd)
+{
+	__be32 *p;
+	u32 verf_len;
+
+	p = gsd->verf_start;
+	gsd->verf_start = NULL;	/* consumed; svcauth_gss_release() checks for NULL */
+
+	/* If the reply stat is nonzero, don't wrap: */
+	if (*(p-1) != rpc_success)
+		return NULL;
+	/* Skip the verifier: */
+	p += 1;
+	verf_len = ntohl(*p++);
+	p += XDR_QUADLEN(verf_len);
+	/* move accept_stat to right place: */
+	memcpy(p, p + 2, 4);	/* it sits two words further on */
+	/* Also don't wrap if the accept stat is nonzero: */
+	if (*p != rpc_success) {
+		resbuf->head[0].iov_len -= 2 * 4;	/* shrink the head by those two words */
+		return NULL;
+	}
+	p++;
+	return p;	/* first word after the accept stat */
+}
+
+static inline int	/* 0 on success or nothing to wrap, -EINVAL on failure */
+svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
+{
+	struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+	struct rpc_gss_wire_cred *gc = &gsd->clcred;
+	struct xdr_buf *resbuf = &rqstp->rq_res;
+	struct xdr_buf integ_buf;
+	struct xdr_netobj mic;
+	struct kvec *resv;
+	__be32 *p;
+	int integ_offset, integ_len;
+	int stat = -EINVAL;
+
+	p = svcauth_gss_prepare_to_wrap(resbuf, gsd);
+	if (p == NULL)
+		goto out;
+	integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;	/* starts at the seq word */
+	integ_len = resbuf->len - integ_offset;
+	BUG_ON(integ_len % 4);
+	*p++ = htonl(integ_len);
+	*p++ = htonl(gc->gc_seq);
+	if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
+				integ_len))
+		BUG();
+	if (resbuf->tail[0].iov_base == NULL) {
+		if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE)
+			goto out_err;
+		resbuf->tail[0].iov_base = resbuf->head[0].iov_base
+						+ resbuf->head[0].iov_len;
+		resbuf->tail[0].iov_len = 0;
+		resv = &resbuf->tail[0];
+	} else {
+		resv = &resbuf->tail[0];
+	}
+	mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;	/* leave room for the length word */
+	if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic))
+		goto out_err;
+	svc_putnl(resv, mic.len);
+	memset(mic.data + mic.len, 0,
+			round_up_to_quad(mic.len) - mic.len);
+	resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+	/* not strictly required: */
+	resbuf->len += XDR_QUADLEN(mic.len) << 2;
+	BUG_ON(resv->iov_len > PAGE_SIZE);
+out:
+	stat = 0;
+out_err:
+	return stat;
+}
+
+static inline int	/* 0 on success or nothing to wrap, -ENOMEM on failure */
+svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
+{
+	struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+	struct rpc_gss_wire_cred *gc = &gsd->clcred;
+	struct xdr_buf *resbuf = &rqstp->rq_res;
+	struct page **inpages = NULL;
+	__be32 *p, *len;
+	int offset;
+	int pad;
+
+	p = svcauth_gss_prepare_to_wrap(resbuf, gsd);
+	if (p == NULL)
+		return 0;
+	len = p++;	/* length word, filled in after gss_wrap() */
+	offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base;
+	*p++ = htonl(gc->gc_seq);
+	inpages = resbuf->pages;
+	/* XXX: Would be better to write some xdr helper functions for
+	 * nfs{2,3,4}xdr.c that place the data right, instead of copying: */
+
+	/*
+	 * If there is currently tail data, make sure there is
+	 * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in
+	 * the page, and move the current tail data such that
+	 * there is RPC_MAX_AUTH_SIZE slack space available in
+	 * both the head and tail.
+	 */
+	if (resbuf->tail[0].iov_base) {
+		BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base
+							+ PAGE_SIZE);
+		BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base);
+		if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len
+				+ 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE)
+			return -ENOMEM;
+		memmove(resbuf->tail[0].iov_base + RPC_MAX_AUTH_SIZE,
+			resbuf->tail[0].iov_base,
+			resbuf->tail[0].iov_len);
+		resbuf->tail[0].iov_base += RPC_MAX_AUTH_SIZE;
+	}
+	/*
+	 * If there is no current tail data, make sure there is
+	 * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the
+	 * allotted page, and set up tail information such that there
+	 * is RPC_MAX_AUTH_SIZE slack space available in both the
+	 * head and tail.
+	 */
+	if (resbuf->tail[0].iov_base == NULL) {
+		if (resbuf->head[0].iov_len + 2*RPC_MAX_AUTH_SIZE > PAGE_SIZE)
+			return -ENOMEM;
+		resbuf->tail[0].iov_base = resbuf->head[0].iov_base
+			+ resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE;
+		resbuf->tail[0].iov_len = 0;
+	}
+	if (gss_wrap(gsd->rsci->mechctx, offset, resbuf, inpages))
+		return -ENOMEM;
+	*len = htonl(resbuf->len - offset);
+	pad = 3 - ((resbuf->len - offset - 1)&3);	/* bytes up to the next multiple of 4 */
+	p = (__be32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len);
+	memset(p, 0, pad);
+	resbuf->tail[0].iov_len += pad;
+	resbuf->len += pad;
+	return 0;
+}
+
+static int	/* 0, or the error from wrapping the reply */
+svcauth_gss_release(struct svc_rqst *rqstp)
+{
+	struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+	struct rpc_gss_wire_cred *gc = &gsd->clcred;
+	struct xdr_buf *resbuf = &rqstp->rq_res;
+	int stat = -EINVAL;
+
+	if (gc->gc_proc != RPC_GSS_PROC_DATA)
+		goto out;
+	/* Release can be called twice, but we only wrap once. */
+	if (gsd->verf_start == NULL)
+		goto out;
+	/* normally not set till svc_send, but we need it here: */
+	/* XXX: what for?  Do we mess it up the moment we call svc_putu32
+	 * or whatever? */
+	resbuf->len = total_buf_len(resbuf);
+	switch (gc->gc_svc) {
+	case RPC_GSS_SVC_NONE:
+		break;
+	case RPC_GSS_SVC_INTEGRITY:
+		stat = svcauth_gss_wrap_resp_integ(rqstp);
+		if (stat)
+			goto out_err;
+		break;
+	case RPC_GSS_SVC_PRIVACY:
+		stat = svcauth_gss_wrap_resp_priv(rqstp);
+		if (stat)
+			goto out_err;
+		break;
+	/*
+	 * For any other gc_svc value, svcauth_gss_accept() already set
+	 * the auth_error appropriately; just fall through:
+	 */
+	}
+
+out:
+	stat = 0;
+out_err:	/* the references below are dropped on success and failure alike */
+	if (rqstp->rq_client)
+		auth_domain_put(rqstp->rq_client);
+	rqstp->rq_client = NULL;
+	if (rqstp->rq_gssclient)
+		auth_domain_put(rqstp->rq_gssclient);
+	rqstp->rq_gssclient = NULL;
+	if (rqstp->rq_cred.cr_group_info)
+		put_group_info(rqstp->rq_cred.cr_group_info);
+	rqstp->rq_cred.cr_group_info = NULL;
+	if (gsd->rsci)
+		cache_put(&gsd->rsci->h, &rsc_cache);
+	gsd->rsci = NULL;
+
+	return stat;
+}
+
+static void
+svcauth_gss_domain_release(struct auth_domain *dom)
+{
+	/* Free the name string first: dom lives inside the gss_domain
+	 * that the second kfree() releases. */
+	kfree(dom->name);
+	kfree(container_of(dom, struct gss_domain, h));
+}
+
+static struct auth_ops svcauthops_gss = {	/* registered for RPC_AUTH_GSS by gss_svc_init() */
+	.name		= "rpcsec_gss",
+	.owner		= THIS_MODULE,
+	.flavour	= RPC_AUTH_GSS,
+	.accept		= svcauth_gss_accept,
+	.release	= svcauth_gss_release,
+	.domain_release = svcauth_gss_domain_release,
+	.set_client	= svcauth_gss_set_client,
+};
+
+int
+gss_svc_init(void)
+{
+	int err;
+
+	err = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
+	if (err)
+		return err;
+	err = cache_register(&rsc_cache);
+	if (err)
+		goto unregister_auth;
+	err = cache_register(&rsi_cache);
+	if (!err)
+		return 0;
+	cache_unregister(&rsc_cache);	/* rsi failed: undo rsc, then the flavor */
+unregister_auth:
+	svc_auth_unregister(RPC_AUTH_GSS);
+	return err;
+}
+
+void
+gss_svc_shutdown(void)
+{
+	cache_unregister(&rsc_cache);	/* undo the three registrations of gss_svc_init() */
+	cache_unregister(&rsi_cache);
+	svc_auth_unregister(RPC_AUTH_GSS);
+}
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
new file mode 100644
index 00000000..a5c36c01
--- /dev/null
+++ b/net/sunrpc/auth_null.c
@@ -0,0 +1,142 @@
+/*
+ * linux/net/sunrpc/auth_null.c
+ *
+ * AUTH_NULL authentication. Really :-)
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+static struct rpc_auth null_auth;
+static struct rpc_cred null_cred;
+
+static struct rpc_auth *
+nul_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+{
+	atomic_inc(&null_auth.au_count);	/* count one more user of the static instance */
+	return &null_auth;	/* clnt and flavor are not consulted */
+}
+
+static void
+nul_destroy(struct rpc_auth *auth)
+{	/* intentionally empty: this function releases nothing */
+}
+
+/*
+ * Hand back the one static null_cred via get_rpccred(); args are ignored.
+ */
+static struct rpc_cred *
+nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	return get_rpccred(&null_cred);
+}
+
+/*
+ * Destroy cred handle: an empty function, nothing is freed here.
+ */
+static void
+nul_destroy_cred(struct rpc_cred *cred)
+{
+}
+
+/*
+ * Match cred handle against current process: always reports a match.
+ */
+static int
+nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags)
+{
+	return 1;
+}
+
+/*
+ * Marshal an AUTH_NULL credential followed by an AUTH_NULL verifier.
+ */
+static __be32 *
+nul_marshal(struct rpc_task *task, __be32 *p)
+{
+	p[0] = htonl(RPC_AUTH_NULL);	/* credential flavor */
+	p[1] = 0;			/* credential length */
+	p[2] = htonl(RPC_AUTH_NULL);	/* verifier flavor */
+	p[3] = 0;			/* verifier length */
+
+	return p + 4;
+}
+
+/*
+ * Refresh credential: for AUTH_NULL this only sets RPCAUTH_CRED_UPTODATE.
+ */
+static int
+nul_refresh(struct rpc_task *task)
+{
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
+	return 0;
+}
+
+static __be32 *
+nul_validate(struct rpc_task *task, __be32 *p)
+{
+	rpc_authflavor_t verf_flavor = ntohl(p[0]);
+	u32 verf_len;
+
+	/* Accept only an AUTH_NULL verifier with an empty body. */
+	if (verf_flavor != RPC_AUTH_NULL) {
+		printk("RPC: bad verf flavor: %u\n", verf_flavor);
+		return NULL;
+	}
+
+	verf_len = ntohl(p[1]);
+	if (verf_len != 0) {
+		printk("RPC: bad verf size: %u\n", verf_len);
+		return NULL;
+	}
+	/* On success the caller gets the word after the verifier. */
+	return p + 2;
+}
+
+const struct rpc_authops authnull_ops = {	/* operations table for RPC_AUTH_NULL */
+	.owner		= THIS_MODULE,
+	.au_flavor	= RPC_AUTH_NULL,
+	.au_name	= "NULL",
+	.create		= nul_create,
+	.destroy	= nul_destroy,
+	.lookup_cred	= nul_lookup_cred,
+};
+
+static
+struct rpc_auth null_auth = {	/* the single instance nul_create() hands out */
+	.au_cslack	= 4,	/* same count as the words nul_marshal() writes */
+	.au_rslack	= 2,	/* same count as the words nul_validate() reads */
+	.au_ops		= &authnull_ops,
+	.au_flavor	= RPC_AUTH_NULL,
+	.au_count	= ATOMIC_INIT(0),
+};
+
+static
+const struct rpc_credops null_credops = {	/* installed as null_cred.cr_ops */
+	.cr_name	= "AUTH_NULL",
+	.crdestroy	= nul_destroy_cred,
+	.crbind		= rpcauth_generic_bind_cred,
+	.crmatch	= nul_match,
+	.crmarshal	= nul_marshal,
+	.crrefresh	= nul_refresh,
+	.crvalidate	= nul_validate,
+};
+
+static
+struct rpc_cred null_cred = {	/* the credential nul_lookup_cred() returns */
+	.cr_lru		= LIST_HEAD_INIT(null_cred.cr_lru),
+	.cr_auth	= &null_auth,
+	.cr_ops		= &null_credops,
+	.cr_count	= ATOMIC_INIT(1),	/* starts with one reference */
+	.cr_flags	= 1UL << RPCAUTH_CRED_UPTODATE,	/* marked up to date from the start */
+#ifdef RPC_DEBUG
+	.cr_magic	= RPCAUTH_CRED_MAGIC,
+#endif
+};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
new file mode 100644
index 00000000..e50502d8
--- /dev/null
+++ b/net/sunrpc/auth_unix.c
@@ -0,0 +1,246 @@
+/*
+ * linux/net/sunrpc/auth_unix.c
+ *
+ * UNIX-style authentication; no AUTH_SHORT support
+ *
+ * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/auth.h>
+
+#define NFS_NGROUPS	16
+
+struct unx_cred {
+	struct rpc_cred		uc_base;	/* generic cred; container_of() anchor */
+	gid_t			uc_gid;		/* copied from acred->gid */
+	gid_t			uc_gids[NFS_NGROUPS];	/* NOGROUP-terminated when not full */
+};
+#define uc_uid			uc_base.cr_uid
+
+#define UNX_WRITESLACK		(21 + (UNX_MAXNODENAME >> 2))
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_AUTH
+#endif
+
+static struct rpc_auth		unix_auth;
+static const struct rpc_credops	unix_credops;
+
+static struct rpc_auth *
+unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+{
+	dprintk("RPC:       creating UNIX authenticator for client %p\n",
+			clnt);
+	atomic_inc(&unix_auth.au_count);	/* count one more user of the static instance */
+	return &unix_auth;
+}
+
+static void
+unx_destroy(struct rpc_auth *auth)
+{
+	dprintk("RPC:       destroying UNIX authenticator %p\n", auth);
+	rpcauth_clear_credcache(auth->au_credcache);	/* auth itself is not freed here */
+}
+
+/*
+ * Lookup AUTH_UNIX creds: delegates to rpcauth_lookup_credcache().
+ */
+static struct rpc_cred *
+unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	return rpcauth_lookup_credcache(auth, acred, flags);
+}
+
+static struct rpc_cred *
+unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+{
+	struct unx_cred	*ucred;
+	unsigned int ngids = 0;
+	unsigned int n;
+
+	dprintk("RPC:       allocating UNIX cred for uid %d gid %d\n",
+			acred->uid, acred->gid);
+
+	ucred = kmalloc(sizeof(*ucred), GFP_NOFS);
+	if (ucred == NULL)
+		return ERR_PTR(-ENOMEM);
+	rpcauth_init_cred(&ucred->uc_base, acred, auth, &unix_credops);
+	ucred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
+
+	/* Keep at most NFS_NGROUPS supplementary gids from the caller. */
+	if (acred->group_info != NULL)
+		ngids = acred->group_info->ngroups;
+	if (ngids > NFS_NGROUPS)
+		ngids = NFS_NGROUPS;
+
+	ucred->uc_gid = acred->gid;
+	for (n = 0; n < ngids; n++)
+		ucred->uc_gids[n] = GROUP_AT(acred->group_info, n);
+	if (ngids < NFS_NGROUPS)	/* room left: terminate the list */
+		ucred->uc_gids[ngids] = NOGROUP;
+	return &ucred->uc_base;
+}
+
+static void
+unx_free_cred(struct unx_cred *unx_cred)
+{
+	dprintk("RPC:       unx_free_cred %p\n", unx_cred);
+	kfree(unx_cred);	/* releases the whole unx_cred, embedded uc_base included */
+}
+
+static void
+unx_free_cred_callback(struct rcu_head *head)
+{
+	/* RCU callback: head is the cr_rcu member of the cred being freed. */
+	unx_free_cred(container_of(head, struct unx_cred, uc_base.cr_rcu));
+}
+
+static void
+unx_destroy_cred(struct rpc_cred *cred)
+{
+	call_rcu(&cred->cr_rcu, unx_free_cred_callback);	/* the free is deferred through RCU */
+}
+
+/*
+ * Return 1 if rcred carries the same uid, gid and (first NFS_NGROUPS)
+ * supplementary gids as acred, else 0.  The flags argument is unused;
+ * this function has no root_override parameter.
+ */
+static int
+unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
+{
+	struct unx_cred	*ucred = container_of(rcred, struct unx_cred, uc_base);
+	unsigned int ngids = 0;
+	unsigned int n;
+
+	if (ucred->uc_uid != acred->uid)
+		return 0;
+	if (ucred->uc_gid != acred->gid)
+		return 0;
+
+	if (acred->group_info != NULL)
+		ngids = acred->group_info->ngroups;
+	if (ngids > NFS_NGROUPS)
+		ngids = NFS_NGROUPS;
+	for (n = 0; n < ngids; n++) {
+		if (ucred->uc_gids[n] != GROUP_AT(acred->group_info, n))
+			return 0;
+	}
+	/* A shorter list must be terminated at the same position. */
+	return ngids == NFS_NGROUPS || ucred->uc_gids[ngids] == NOGROUP;
+}
+
+/*
+ * Marshal an AUTH_UNIX credential for task at p, then an empty AUTH_NULL
+ * verifier; returns the next free word.  (Maybe cache it for performance.)
+ */
+static __be32 *
+unx_marshal(struct rpc_task *task, __be32 *p)
+{
+	struct rpc_clnt	*clnt = task->tk_client;
+	struct unx_cred	*cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base);
+	__be32		*base, *hold;
+	int		i;
+
+	*p++ = htonl(RPC_AUTH_UNIX);
+	base = p++;			/* cred length slot, filled in below */
+	*p++ = htonl(jiffies/HZ);
+
+	/*
+	 * Copy the UTS nodename captured when the client was created.
+	 */
+	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
+
+	*p++ = htonl((u32) cred->uc_uid);
+	*p++ = htonl((u32) cred->uc_gid);
+	hold = p++;			/* gid count slot, filled in below */
+	for (i = 0; i < NFS_NGROUPS && cred->uc_gids[i] != (gid_t) NOGROUP; i++)
+		*p++ = htonl((u32) cred->uc_gids[i]);	/* bound is the array size */
+	*hold = htonl(p - hold - 1);		/* gid array length */
+	*base = htonl((p - base - 1) << 2);	/* cred length */
+
+	*p++ = htonl(RPC_AUTH_NULL);
+	*p++ = htonl(0);
+
+	return p;
+}
+
+/*
+ * Refresh credentials: for AUTH_UNIX this only sets RPCAUTH_CRED_UPTODATE.
+ */
+static int
+unx_refresh(struct rpc_task *task)
+{
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
+	return 0;
+}
+
+static __be32 *
+unx_validate(struct rpc_task *task, __be32 *p)
+{
+	rpc_authflavor_t verf_flavor = ntohl(*p++);
+	u32 verf_len, verf_words;
+
+	switch (verf_flavor) {
+	case RPC_AUTH_NULL:
+	case RPC_AUTH_UNIX:
+	case RPC_AUTH_SHORT:
+		break;
+	default:
+		printk("RPC: bad verf flavor: %u\n", verf_flavor);
+		return NULL;
+	}
+	verf_len = ntohl(*p++);
+	if (verf_len > RPC_MAX_AUTH_SIZE) {
+		printk("RPC: giant verf size: %u\n", verf_len);
+		return NULL;
+	}
+	verf_words = verf_len >> 2;	/* whole 32-bit words; no round-up */
+	task->tk_rqstp->rq_cred->cr_auth->au_rslack = verf_words + 2;
+	return p + verf_words;
+}
+
+int __init rpc_init_authunix(void)
+{
+	return rpcauth_init_credcache(&unix_auth);	/* result is passed straight back */
+}
+
+void rpc_destroy_authunix(void)
+{
+	rpcauth_destroy_credcache(&unix_auth);	/* counterpart of rpc_init_authunix() */
+}
+
+const struct rpc_authops authunix_ops = {	/* operations table for RPC_AUTH_UNIX */
+	.owner		= THIS_MODULE,
+	.au_flavor	= RPC_AUTH_UNIX,
+	.au_name	= "UNIX",
+	.create		= unx_create,
+	.destroy	= unx_destroy,
+	.lookup_cred	= unx_lookup_cred,
+	.crcreate	= unx_create_cred,
+};
+
+static
+struct rpc_auth		unix_auth = {	/* the single instance unx_create() hands out */
+	.au_cslack	= UNX_WRITESLACK,
+	.au_rslack	= 2,			/* assume AUTH_NULL verf */
+	.au_ops		= &authunix_ops,
+	.au_flavor	= RPC_AUTH_UNIX,
+	.au_count	= ATOMIC_INIT(0),
+};
+
+static
+const struct rpc_credops unix_credops = {	/* passed to rpcauth_init_cred() by unx_create_cred() */
+	.cr_name	= "AUTH_UNIX",
+	.crdestroy	= unx_destroy_cred,
+	.crbind		= rpcauth_generic_bind_cred,
+	.crmatch	= unx_match,
+	.crmarshal	= unx_marshal,
+	.crrefresh	= unx_refresh,
+	.crvalidate	= unx_validate,
+};
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
new file mode 100644
index 00000000..cf06af3b
--- /dev/null
+++ b/net/sunrpc/backchannel_rqst.c
@@ -0,0 +1,282 @@
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc.  All Rights Reserved.
+(c) 2009 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/xprt.h>
+
+#ifdef RPC_DEBUG
+#define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+#if defined(CONFIG_NFS_V4_1)
+
+/*
+ * Helper routines that track the number of preallocation elements
+ * on the transport.
+ */
+static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
+{
+	return xprt->bc_alloc_count > 0;
+}
+
+static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+	xprt->bc_alloc_count += n;
+}
+
+static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
+{
+	return xprt->bc_alloc_count -= n;
+}
+
+/*
+ * Free a preallocated rpc_rqst: its receive page (rq_rcv_buf), its send page
+ * (rq_snd_buf), its rq_bc_pa_list linkage and the struct.  Must not be in use.
+ */
+static void xprt_free_allocation(struct rpc_rqst *req)
+{
+	struct xdr_buf *xbufp;
+
+	dprintk("RPC:        free allocations for req= %p\n", req);
+	BUG_ON(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+	xbufp = &req->rq_rcv_buf;	/* owns the page; rq_private_buf is only a copy */
+	free_page((unsigned long)xbufp->head[0].iov_base);
+	xbufp = &req->rq_snd_buf;
+	free_page((unsigned long)xbufp->head[0].iov_base);
+	list_del(&req->rq_bc_pa_list);
+	kfree(req);
+}
+
+/*
+ * Preallocate up to min_reqs structures and related buffers for use
+ * by the backchannel.  This function can be called multiple times
+ * when creating new sessions that use the same rpc_xprt.  The
+ * preallocated buffers are added to the pool of resources used by
+ * the rpc_xprt.  Any one of these resources may be used by an
+ * incoming callback request.  It's up to the higher levels in the
+ * stack to enforce that the maximum number of session slots is not
+ * being exceeded.
+ *
+ * Some callback arguments can be large.  For example, a pNFS server
+ * using multiple deviceids.  The list can be unbounded, but the client
+ * has the ability to tell the server the maximum size of the callback
+ * requests.  Each deviceID is 16 bytes, so allocate one page
+ * for the arguments to have enough room to receive a number of these
+ * deviceIDs.  The NFS client indicates to the pNFS server that its
+ * callback requests can be up to 4096 bytes in size.
+ */
+int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
+{
+	struct page *page_rcv = NULL, *page_snd = NULL;
+	struct xdr_buf *xbufp = NULL;
+	struct rpc_rqst *req, *tmp;
+	struct list_head tmp_list;
+	int i;
+
+	dprintk("RPC:       setup backchannel transport\n");
+
+	/*
+	 * We use a temporary list to keep track of the preallocated
+	 * buffers.  Once we're done building the list we splice it
+	 * into the backchannel preallocation list off of the rpc_xprt
+	 * struct.  This helps minimize the amount of time the list
+	 * lock is held on the rpc_xprt struct.  It also makes cleanup
+	 * easier in case of memory allocation errors.
+	 */
+	INIT_LIST_HEAD(&tmp_list);
+	for (i = 0; i < min_reqs; i++) {
+		/* Pre-allocate one backchannel rpc_rqst */
+		req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+		if (req == NULL) {
+			printk(KERN_ERR "Failed to create bc rpc_rqst\n");
+			goto out_free;
+		}
+
+		/* Add the allocated buffer to the tmp list */
+		dprintk("RPC:       adding req= %p\n", req);
+		list_add(&req->rq_bc_pa_list, &tmp_list);
+
+		req->rq_xprt = xprt;
+		INIT_LIST_HEAD(&req->rq_list);
+		INIT_LIST_HEAD(&req->rq_bc_list);
+
+		/* Preallocate one XDR receive buffer */
+		page_rcv = alloc_page(GFP_KERNEL);
+		if (page_rcv == NULL) {
+			printk(KERN_ERR "Failed to create bc receive xbuf\n");
+			goto out_free;
+		}
+		xbufp = &req->rq_rcv_buf;
+		xbufp->head[0].iov_base = page_address(page_rcv);
+		xbufp->head[0].iov_len = PAGE_SIZE;
+		xbufp->tail[0].iov_base = NULL;
+		xbufp->tail[0].iov_len = 0;
+		xbufp->page_len = 0;
+		xbufp->len = PAGE_SIZE;
+		xbufp->buflen = PAGE_SIZE;
+
+		/* Preallocate one XDR send buffer */
+		page_snd = alloc_page(GFP_KERNEL);
+		if (page_snd == NULL) {
+			printk(KERN_ERR "Failed to create bc snd xbuf\n");
+			goto out_free;
+		}
+
+		xbufp = &req->rq_snd_buf;
+		xbufp->head[0].iov_base = page_address(page_snd);
+		xbufp->head[0].iov_len = 0;
+		xbufp->tail[0].iov_base = NULL;
+		xbufp->tail[0].iov_len = 0;
+		xbufp->page_len = 0;
+		xbufp->len = 0;
+		xbufp->buflen = PAGE_SIZE;
+	}
+
+	/*
+	 * Add the temporary list to the backchannel preallocation list
+	 */
+	spin_lock_bh(&xprt->bc_pa_lock);
+	list_splice(&tmp_list, &xprt->bc_pa_list);
+	xprt_inc_alloc_count(xprt, min_reqs);
+	spin_unlock_bh(&xprt->bc_pa_lock);
+
+	dprintk("RPC:       setup backchannel transport done\n");
+	return 0;
+
+out_free:
+	/*
+	 * Memory allocation failed, free the temporary list
+	 */
+	list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list)
+		xprt_free_allocation(req);
+
+	dprintk("RPC:       setup backchannel transport failed\n");
+	return -1;
+}
+EXPORT_SYMBOL(xprt_setup_backchannel);
+
+/*
+ * Destroys the backchannel preallocated structures.
+ * Since these structures may have been allocated by multiple calls
+ * to xprt_setup_backchannel, we only destroy up to the maximum number
+ * of reqs specified by the caller.
+ * @xprt:	the transport holding the preallocated structures
+ * @max_reqs:	the maximum number of preallocated structures to destroy
+ */
+void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
+{
+	struct rpc_rqst *req = NULL, *tmp = NULL;
+
+	dprintk("RPC:        destroy backchannel transport\n");
+
+	BUG_ON(max_reqs == 0);
+	spin_lock_bh(&xprt->bc_pa_lock);
+	xprt_dec_alloc_count(xprt, max_reqs);
+	list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+		dprintk("RPC:        req=%p\n", req);
+		xprt_free_allocation(req);
+		if (--max_reqs == 0)
+			break;
+	}
+	spin_unlock_bh(&xprt->bc_pa_lock);
+
+	dprintk("RPC:        backchannel list empty= %s\n",
+		list_empty(&xprt->bc_pa_list) ? "true" : "false");
+}
+EXPORT_SYMBOL(xprt_destroy_backchannel);
+
+/*
+ * One or more rpc_rqst structures have been preallocated during the
+ * backchannel setup.  Buffer space for the send and private XDR buffers
+ * has been preallocated as well.  Use xprt_alloc_bc_request to take one
+ * of these requests.  Use xprt_free_bc_request to return it.
+ *
+ * We know that we're called in soft interrupt context, grab the spin_lock
+ * since there is no need to grab the bottom half spin_lock.
+ *
+ * Return an available rpc_rqst, otherwise NULL if none are available.
+ */
+struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
+{
+	struct rpc_rqst *req;
+
+	dprintk("RPC:       allocate a backchannel request\n");
+	spin_lock(&xprt->bc_pa_lock);
+	if (!list_empty(&xprt->bc_pa_list)) {
+		req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
+				rq_bc_pa_list);
+		list_del_init(&req->rq_bc_pa_list);	/* may be list_del()ed again on free */
+	} else {
+		req = NULL;
+	}
+	spin_unlock(&xprt->bc_pa_lock);
+
+	if (req != NULL) {
+		set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+		req->rq_reply_bytes_recvd = 0;
+		req->rq_bytes_sent = 0;
+		memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+			sizeof(req->rq_private_buf));	/* working copy of the rcv buf */
+	}
+	dprintk("RPC:       backchannel req=%p\n", req);
+	return req;
+}
+
+/*
+ * Release a backchannel rpc_rqst: clear its in-use bit, then requeue it on
+ * xprt->bc_pa_list, or free it if xprt->bc_alloc_count is no longer > 0.
+ */
+void xprt_free_bc_request(struct rpc_rqst *req)
+{
+	struct rpc_xprt *xprt = req->rq_xprt;
+
+	dprintk("RPC:       free backchannel req=%p\n", req);
+
+	smp_mb__before_clear_bit();
+	BUG_ON(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
+	clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+	smp_mb__after_clear_bit();
+
+	if (!xprt_need_to_requeue(xprt)) {
+		/*
+		 * The last remaining session was destroyed while this
+		 * entry was in use.  Free the entry and don't attempt
+		 * to add it back to the list because there is no need
+		 * to have any more preallocated entries.
+		 */
+		dprintk("RPC:       Last session removed req=%p\n", req);
+		xprt_free_allocation(req);
+		return;
+	}
+
+	/*
+	 * Return it to the list of preallocations so that it
+	 * may be reused by a new callback request.
+	 */
+	spin_lock_bh(&xprt->bc_pa_lock);
+	list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+	spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
new file mode 100644
index 00000000..1dd1a689
--- /dev/null
+++ b/net/sunrpc/bc_svc.c
@@ -0,0 +1,66 @@
+/******************************************************************************
+
+(c) 2007 Network Appliance, Inc.  All Rights Reserved.
+(c) 2009 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * The NFSv4.1 callback service helper routines.
+ * They implement the transport level processing required to send the
+ * reply over an existing open connection previously established by the client.
+ */
+
+#if defined(CONFIG_NFS_V4_1)
+
+#include <linux/module.h>
+
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCDSP
+
+/* Empty callback ops */
+static const struct rpc_call_ops nfs41_callback_ops = {
+};
+
+
+/*
+ * Send the callback reply
+ */
+int bc_send(struct rpc_rqst *req)
+{
+	struct rpc_task *task;
+	int ret;
+
+	dprintk("RPC:       bc_send req= %p\n", req);
+	task = rpc_run_bc_task(req, &nfs41_callback_ops);
+	if (IS_ERR(task))
+		ret = PTR_ERR(task);
+	else {
+		BUG_ON(atomic_read(&task->tk_count) != 1);
+		ret = task->tk_status;
+		rpc_put_task(task);
+	}
+	dprintk("RPC:       bc_send ret= %d\n", ret);
+	return ret;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
new file mode 100644
index 00000000..4530a912
--- /dev/null
+++ b/net/sunrpc/cache.c
@@ -0,0 +1,1812 @@
+/*
+ * net/sunrpc/cache.c
+ *
+ * Generic code for various authentication-related caches
+ * used by sunrpc clients and servers.
+ *
+ * Copyright (C) 2002 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * Released under terms in GPL version 2.  See COPYING.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <asm/uaccess.h>
+#include <linux/poll.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <asm/ioctls.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "netns.h"
+
+#define	 RPCDBG_FACILITY RPCDBG_CACHE
+
+static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
+static void cache_revisit_request(struct cache_head *item);
+
+static void cache_init(struct cache_head *h)
+{
+	time_t now = seconds_since_boot();
+	h->next = NULL;
+	h->flags = 0;
+	kref_init(&h->ref);
+	h->expiry_time = now + CACHE_NEW_EXPIRY;
+	h->last_refresh = now;
+}
+
+static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
+{
+	return  (h->expiry_time < seconds_since_boot()) ||
+		(detail->flush_time > h->last_refresh);
+}
+
+struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
+				       struct cache_head *key, int hash)
+{
+	struct cache_head **head,  **hp;
+	struct cache_head *new = NULL, *freeme = NULL;
+
+	head = &detail->hash_table[hash];
+
+	read_lock(&detail->hash_lock);
+
+	for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
+		struct cache_head *tmp = *hp;
+		if (detail->match(tmp, key)) {
+			if (cache_is_expired(detail, tmp))
+				/* Expired: it is unlinked below, under the write lock. */
+				break;
+			cache_get(tmp);
+			read_unlock(&detail->hash_lock);
+			return tmp;
+		}
+	}
+	read_unlock(&detail->hash_lock);
+	/* Didn't find a usable entry, insert an empty one */
+
+	new = detail->alloc();
+	if (!new)
+		return NULL;
+	/* must fully initialise 'new', else
+	 * we might lose out if we need to
+	 * cache_put it soon.
+	 */
+	cache_init(new);
+	detail->init(new, key);
+
+	write_lock(&detail->hash_lock);
+
+	/* check if entry appeared while we slept */
+	for (hp=head; *hp != NULL ; hp = &(*hp)->next) {
+		struct cache_head *tmp = *hp;
+		if (detail->match(tmp, key)) {
+			if (cache_is_expired(detail, tmp)) {
+				*hp = tmp->next;
+				tmp->next = NULL;
+				detail->entries --;
+				freeme = tmp;	/* cache_put once the lock is dropped */
+				break;
+			}
+			cache_get(tmp);
+			write_unlock(&detail->hash_lock);
+			cache_put(new, detail);
+			return tmp;
+		}
+	}
+	new->next = *head;
+	*head = new;
+	detail->entries++;
+	cache_get(new);
+	write_unlock(&detail->hash_lock);
+
+	if (freeme)
+		cache_put(freeme, detail);
+	return new;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
+
+
+static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
+
+static void cache_fresh_locked(struct cache_head *head, time_t expiry)
+{
+	head->expiry_time = expiry;
+	head->last_refresh = seconds_since_boot();
+	smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */
+	set_bit(CACHE_VALID, &head->flags);
+}
+
+static void cache_fresh_unlocked(struct cache_head *head,
+				 struct cache_detail *detail)
+{
+	if (test_and_clear_bit(CACHE_PENDING, &head->flags)) {
+		cache_revisit_request(head);
+		cache_dequeue(detail, head);
+	}
+}
+
+struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
+				       struct cache_head *new, struct cache_head *old, int hash)
+{
+	/* The 'old' entry is to be replaced by 'new'.
+	 * If 'old' is not VALID, we update it directly,
+	 * otherwise we need to replace it
+	 */
+	struct cache_head **head;
+	struct cache_head *tmp;
+
+	if (!test_bit(CACHE_VALID, &old->flags)) {
+		write_lock(&detail->hash_lock);
+		if (!test_bit(CACHE_VALID, &old->flags)) {
+			if (test_bit(CACHE_NEGATIVE, &new->flags))
+				set_bit(CACHE_NEGATIVE, &old->flags);
+			else
+				detail->update(old, new);
+			cache_fresh_locked(old, new->expiry_time);
+			write_unlock(&detail->hash_lock);
+			cache_fresh_unlocked(old, detail);
+			return old;
+		}
+		write_unlock(&detail->hash_lock);
+	}
+	/* We need to insert a new entry */
+	tmp = detail->alloc();
+	if (!tmp) {
+		cache_put(old, detail);
+		return NULL;
+	}
+	cache_init(tmp);
+	detail->init(tmp, old);
+	head = &detail->hash_table[hash];
+
+	write_lock(&detail->hash_lock);
+	if (test_bit(CACHE_NEGATIVE, &new->flags))
+		set_bit(CACHE_NEGATIVE, &tmp->flags);
+	else
+		detail->update(tmp, new);
+	tmp->next = *head;
+	*head = tmp;
+	detail->entries++;
+	cache_get(tmp);
+	cache_fresh_locked(tmp, new->expiry_time);
+	cache_fresh_locked(old, 0);
+	write_unlock(&detail->hash_lock);
+	cache_fresh_unlocked(tmp, detail);
+	cache_fresh_unlocked(old, detail);
+	cache_put(old, detail);
+	return tmp;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_update);
+
+static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	if (!cd->cache_upcall)
+		return -EINVAL;
+	return cd->cache_upcall(cd, h);
+}
+
+static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h)
+{
+	if (!test_bit(CACHE_VALID, &h->flags))
+		return -EAGAIN;
+	else {
+		/* entry is valid */
+		if (test_bit(CACHE_NEGATIVE, &h->flags))
+			return -ENOENT;
+		else {
+			/*
+			 * In combination with the smp_wmb() in
+			 * cache_fresh_locked(), ensures that anyone
+			 * using the cache entry after this sees the
+			 * updated contents:
+			 */
+			smp_rmb();
+			return 0;
+		}
+	}
+}
+
+static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h)
+{
+	int rv;
+
+	write_lock(&detail->hash_lock);
+	rv = cache_is_valid(detail, h);
+	if (rv != -EAGAIN) {
+		write_unlock(&detail->hash_lock);
+		return rv;
+	}
+	set_bit(CACHE_NEGATIVE, &h->flags);
+	cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
+	write_unlock(&detail->hash_lock);
+	cache_fresh_unlocked(h, detail);
+	return -ENOENT;
+}
+
+/*
+ * This is the generic cache management routine for all
+ * the authentication caches.
+ * It checks the currency of a cache item and will (later)
+ * initiate an upcall to fill it if needed.
+ *
+ *
+ * Returns 0 if the cache_head can be used, or cache_puts it and returns
+ * -EAGAIN if upcall is pending and request has been queued
+ * -ETIMEDOUT if upcall failed or request could not be queued or
+ *           upcall completed but item is still invalid (implying that
+ *           the cache item has been replaced with a newer one).
+ * -ENOENT if cache entry was negative
+ */
+int cache_check(struct cache_detail *detail,
+		    struct cache_head *h, struct cache_req *rqstp)
+{
+	int rv;
+	long refresh_age, age;
+
+	/* First decide return status as best we can */
+	rv = cache_is_valid(detail, h);
+
+	/* now see if we want to start an upcall */
+	refresh_age = (h->expiry_time - h->last_refresh);
+	age = seconds_since_boot() - h->last_refresh;
+
+	if (rqstp == NULL) {
+		if (rv == -EAGAIN)
+			rv = -ENOENT;
+	} else if (rv == -EAGAIN || age > refresh_age/2) {
+		dprintk("RPC:       Want update, refage=%ld, age=%ld\n",
+				refresh_age, age);
+		if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
+			switch (cache_make_upcall(detail, h)) {
+			case -EINVAL:
+				clear_bit(CACHE_PENDING, &h->flags);
+				cache_revisit_request(h);
+				rv = try_to_negate_entry(detail, h);
+				break;
+			case -EAGAIN:
+				clear_bit(CACHE_PENDING, &h->flags);
+				cache_revisit_request(h);
+				break;
+			}
+		}
+	}
+
+	if (rv == -EAGAIN) {
+		if (!cache_defer_req(rqstp, h)) {
+			/*
+			 * Request was not deferred; handle it as best
+			 * we can ourselves:
+			 */
+			rv = cache_is_valid(detail, h);
+			if (rv == -EAGAIN)
+				rv = -ETIMEDOUT;
+		}
+	}
+	if (rv)
+		cache_put(h, detail);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(cache_check);
+
+/*
+ * caches need to be periodically cleaned.
+ * For this we maintain a list of cache_detail and
+ * a current pointer into that list and into the table
+ * for that entry.
+ *
+ * Each time cache_clean is called it finds the next non-empty entry
+ * in the current table and walks the list in that entry
+ * looking for entries that can be removed.
+ *
+ * An entry gets removed if:
+ * - The expiry is before current time
+ * - The last_refresh time is before the flush_time for that cache
+ *
+ * later we might drop old entries with non-NEVER expiry if that table
+ * is getting 'full' for some definition of 'full'
+ *
+ * The question of "how often to scan a table" is an interesting one
+ * and is answered in part by the use of the "nextcheck" field in the
+ * cache_detail.
+ * When a scan of a table begins, the nextcheck field is set to a time
+ * that is well into the future.
+ * While scanning, if an expiry time is found that is earlier than the
+ * current nextcheck time, nextcheck is set to that expiry time.
+ * If the flush_time is ever set to a time earlier than the nextcheck
+ * time, the nextcheck time is then set to that flush_time.
+ *
+ * A table is then only scanned if the current time is at least
+ * the nextcheck time.
+ *
+ */
+
+static LIST_HEAD(cache_list);
+static DEFINE_SPINLOCK(cache_list_lock);
+static struct cache_detail *current_detail;
+static int current_index;
+
+static void do_cache_clean(struct work_struct *work);
+static struct delayed_work cache_cleaner;
+
+static void sunrpc_init_cache_detail(struct cache_detail *cd)
+{
+	rwlock_init(&cd->hash_lock);
+	INIT_LIST_HEAD(&cd->queue);
+	spin_lock(&cache_list_lock);
+	cd->nextcheck = 0;
+	cd->entries = 0;
+	atomic_set(&cd->readers, 0);
+	cd->last_close = 0;
+	cd->last_warn = -1;
+	list_add(&cd->others, &cache_list);
+	spin_unlock(&cache_list_lock);
+
+	/* start the cleaning process */
+	schedule_delayed_work(&cache_cleaner, 0);
+}
+
+static void sunrpc_destroy_cache_detail(struct cache_detail *cd)
+{
+	cache_purge(cd);
+	spin_lock(&cache_list_lock);
+	write_lock(&cd->hash_lock);
+	if (cd->entries || atomic_read(&cd->inuse)) {	/* still populated or in use */
+		write_unlock(&cd->hash_lock);
+		spin_unlock(&cache_list_lock);
+		goto out;
+	}
+	if (current_detail == cd)	/* don't leave cache_clean() pointing at cd */
+		current_detail = NULL;
+	list_del_init(&cd->others);
+	write_unlock(&cd->hash_lock);
+	spin_unlock(&cache_list_lock);
+	if (list_empty(&cache_list)) {
+		/* module must be being unloaded so it's safe to kill the worker */
+		cancel_delayed_work_sync(&cache_cleaner);
+	}
+	return;
+out:
+	printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
+}
+
+/* cache_clean tries to find something to clean
+ * and cleans it.
+ * It returns 1 if it cleaned something,
+ *            0 if it didn't find anything this time
+ *           -1 if it fell off the end of the list.
+ */
+static int cache_clean(void)
+{
+	int rv = 0;
+	struct list_head *next;
+
+	spin_lock(&cache_list_lock);
+
+	/* find a suitable table if we don't already have one */
+	while (current_detail == NULL ||
+	    current_index >= current_detail->hash_size) {
+		if (current_detail)
+			next = current_detail->others.next;
+		else
+			next = cache_list.next;
+		if (next == &cache_list) {
+			current_detail = NULL;
+			spin_unlock(&cache_list_lock);
+			return -1;
+		}
+		current_detail = list_entry(next, struct cache_detail, others);
+		if (current_detail->nextcheck > seconds_since_boot())
+			current_index = current_detail->hash_size;
+		else {
+			current_index = 0;
+			current_detail->nextcheck = seconds_since_boot()+30*60;
+		}
+	}
+
+	/* find a non-empty bucket in the table */
+	while (current_detail &&
+	       current_index < current_detail->hash_size &&
+	       current_detail->hash_table[current_index] == NULL)
+		current_index++;
+
+	/* find a cleanable entry in the bucket and clean it, or set to next bucket */
+
+	if (current_detail && current_index < current_detail->hash_size) {
+		struct cache_head *ch, **cp;
+		struct cache_detail *d;
+
+		write_lock(&current_detail->hash_lock);
+
+		/* Ok, now to clean this strand */
+
+		cp = & current_detail->hash_table[current_index];
+		for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) {
+			if (current_detail->nextcheck > ch->expiry_time)
+				current_detail->nextcheck = ch->expiry_time+1;
+			if (!cache_is_expired(current_detail, ch))
+				continue;
+
+			*cp = ch->next;
+			ch->next = NULL;
+			current_detail->entries--;
+			rv = 1;
+			break;
+		}
+
+		write_unlock(&current_detail->hash_lock);
+		d = current_detail;	/* current_detail may change after unlock */
+		if (!ch)
+			current_index ++;
+		spin_unlock(&cache_list_lock);
+		if (ch) {
+			if (test_and_clear_bit(CACHE_PENDING, &ch->flags))
+				cache_dequeue(d, ch);
+			cache_revisit_request(ch);
+			cache_put(ch, d);
+		}
+	} else
+		spin_unlock(&cache_list_lock);
+
+	return rv;
+}
+
+/*
+ * We want to regularly clean the cache, so we need to schedule some work ...
+ */
+static void do_cache_clean(struct work_struct *work)
+{
+	int delay = 5;
+	if (cache_clean() == -1)
+		delay = round_jiffies_relative(30*HZ);
+
+	if (list_empty(&cache_list))
+		delay = 0;
+
+	if (delay)
+		schedule_delayed_work(&cache_cleaner, delay);
+}
+
+
+/*
+ * Clean all caches promptly.  This just calls cache_clean
+ * repeatedly until we are sure that every cache has had a chance to
+ * be fully cleaned
+ */
+void cache_flush(void)
+{
+	while (cache_clean() != -1)
+		cond_resched();
+	while (cache_clean() != -1)
+		cond_resched();
+}
+EXPORT_SYMBOL_GPL(cache_flush);
+
+void cache_purge(struct cache_detail *detail)
+{
+	detail->flush_time = LONG_MAX;
+	detail->nextcheck = seconds_since_boot();
+	cache_flush();
+	detail->flush_time = 1;
+}
+EXPORT_SYMBOL_GPL(cache_purge);
+
+
+/*
+ * Deferral and Revisiting of Requests.
+ *
+ * If a cache lookup finds a pending entry, we
+ * need to defer the request and revisit it later.
+ * All deferred requests are stored in a hash table,
+ * indexed by "struct cache_head *".
+ * As it may be wasteful to store a whole request
+ * structure, we allow the request to provide a
+ * deferred form, which must contain a
+ * 'struct cache_deferred_req'
+ * This cache_deferred_req contains a method to allow
+ * it to be revisited when cache info is available
+ */
+
+#define	DFR_HASHSIZE	(PAGE_SIZE/sizeof(struct list_head))
+#define	DFR_HASH(item)	((((long)item)>>4 ^ (((long)item)>>13)) % DFR_HASHSIZE)
+
+#define	DFR_MAX	300	/* ??? */
+
+static DEFINE_SPINLOCK(cache_defer_lock);
+static LIST_HEAD(cache_defer_list);
+static struct hlist_head cache_defer_hash[DFR_HASHSIZE];
+static int cache_defer_cnt;
+
+static void __unhash_deferred_req(struct cache_deferred_req *dreq)
+{
+	hlist_del_init(&dreq->hash);
+	if (!list_empty(&dreq->recent)) {
+		list_del_init(&dreq->recent);
+		cache_defer_cnt--;
+	}
+}
+
+static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item)
+{
+	int hash = DFR_HASH(item);
+
+	INIT_LIST_HEAD(&dreq->recent);
+	hlist_add_head(&dreq->hash, &cache_defer_hash[hash]);
+}
+
+static void setup_deferral(struct cache_deferred_req *dreq,
+			   struct cache_head *item,
+			   int count_me)
+{
+
+	dreq->item = item;
+
+	spin_lock(&cache_defer_lock);
+
+	__hash_deferred_req(dreq, item);
+
+	if (count_me) {
+		cache_defer_cnt++;
+		list_add(&dreq->recent, &cache_defer_list);
+	}
+
+	spin_unlock(&cache_defer_lock);
+
+}
+
+struct thread_deferred_req {
+	struct cache_deferred_req handle;
+	struct completion completion;
+};
+
+static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many)
+{
+	struct thread_deferred_req *dr =
+		container_of(dreq, struct thread_deferred_req, handle);
+	complete(&dr->completion);
+}
+
+static void cache_wait_req(struct cache_req *req, struct cache_head *item)
+{
+	struct thread_deferred_req sleeper;
+	struct cache_deferred_req *dreq = &sleeper.handle;
+
+	sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion);
+	dreq->revisit = cache_restart_thread;
+
+	setup_deferral(dreq, item, 0);
+
+	if (!test_bit(CACHE_PENDING, &item->flags) ||
+	    wait_for_completion_interruptible_timeout(
+		    &sleeper.completion, req->thread_wait) <= 0) {
+		/* The completion wasn't completed, so we need
+		 * to clean up
+		 */
+		spin_lock(&cache_defer_lock);
+		if (!hlist_unhashed(&sleeper.handle.hash)) {
+			__unhash_deferred_req(&sleeper.handle);
+			spin_unlock(&cache_defer_lock);
+		} else {
+			/* cache_revisit_request already removed
+			 * this from the hash table, but hasn't
+			 * called ->revisit yet.  It will very soon
+			 * and we need to wait for it.
+			 */
+			spin_unlock(&cache_defer_lock);
+			wait_for_completion(&sleeper.completion);
+		}
+	}
+}
+
+static void cache_limit_defers(void)
+{
+	/* Make sure we haven't exceeded the limit of allowed deferred
+	 * requests.
+	 */
+	struct cache_deferred_req *discard = NULL;
+
+	if (cache_defer_cnt <= DFR_MAX)	/* unlocked check; repeated under the lock */
+		return;
+
+	spin_lock(&cache_defer_lock);
+
+	/* Consider removing either the first or the last */
+	if (cache_defer_cnt > DFR_MAX) {
+		if (net_random() & 1)
+			discard = list_entry(cache_defer_list.next,
+					     struct cache_deferred_req, recent);
+		else
+			discard = list_entry(cache_defer_list.prev,
+					     struct cache_deferred_req, recent);
+		__unhash_deferred_req(discard);
+	}
+	spin_unlock(&cache_defer_lock);
+	if (discard)
+		discard->revisit(discard, 1);
+}
+
+/* Return true if and only if a deferred request is queued. */
+static bool cache_defer_req(struct cache_req *req, struct cache_head *item)
+{
+	struct cache_deferred_req *dreq;
+
+	if (req->thread_wait) {
+		cache_wait_req(req, item);
+		if (!test_bit(CACHE_PENDING, &item->flags))
+			return false;
+	}
+	dreq = req->defer(req);
+	if (dreq == NULL)
+		return false;
+	setup_deferral(dreq, item, 1);
+	if (!test_bit(CACHE_PENDING, &item->flags))
+		/* Bit could have been cleared before we managed to
+		 * set up the deferral, so need to revisit just in case
+		 */
+		cache_revisit_request(item);
+
+	cache_limit_defers();
+	return true;
+}
+
+static void cache_revisit_request(struct cache_head *item)
+{
+	struct cache_deferred_req *dreq;
+	struct list_head pending;
+	struct hlist_node *lp, *tmp;
+	int hash = DFR_HASH(item);
+
+	INIT_LIST_HEAD(&pending);
+	spin_lock(&cache_defer_lock);
+
+	hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash)
+		if (dreq->item == item) {
+			__unhash_deferred_req(dreq);
+			list_add(&dreq->recent, &pending);
+		}
+
+	spin_unlock(&cache_defer_lock);
+
+	while (!list_empty(&pending)) {
+		dreq = list_entry(pending.next, struct cache_deferred_req, recent);
+		list_del_init(&dreq->recent);
+		dreq->revisit(dreq, 0);
+	}
+}
+
+void cache_clean_deferred(void *owner)
+{
+	struct cache_deferred_req *dreq, *tmp;
+	struct list_head pending;
+
+
+	INIT_LIST_HEAD(&pending);
+	spin_lock(&cache_defer_lock);
+
+	list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) {
+		if (dreq->owner == owner) {
+			__unhash_deferred_req(dreq);
+			list_add(&dreq->recent, &pending);
+		}
+	}
+	spin_unlock(&cache_defer_lock);
+
+	while (!list_empty(&pending)) {
+		dreq = list_entry(pending.next, struct cache_deferred_req, recent);
+		list_del_init(&dreq->recent);
+		dreq->revisit(dreq, 1);
+	}
+}
+
+/*
+ * communicate with user-space
+ *
+ * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
+ * On read, you get a full request, or block.
+ * On write, an update request is processed.
+ * Poll works if anything to read, and always allows write.
+ *
+ * Implemented by a linked list of requests.  Each open file has
+ * a ->private that also exists in this list.  New requests are added
+ * to the end and may wake up any preceding readers.
+ * New readers are added to the head.  If, on read, an item is found with
+ * CACHE_PENDING clear, we free it from the list.
+ *
+ */
+
+/*
+ * Taken by the channel functions below whenever they walk or modify a
+ * cache_detail's ->queue list or touch cache_request.readers.
+ */
+static DEFINE_SPINLOCK(queue_lock);
+/* Serialises use of the static write_buf in cache_slow_downcall(). */
+static DEFINE_MUTEX(queue_io_mutex);
+
+/*
+ * Common header of everything linked on a cache_detail's ->queue; the
+ * ->reader flag tells whether the entry is embedded in a cache_reader or
+ * in a cache_request.
+ */
+struct cache_queue {
+	struct list_head	list;
+	int			reader;	/* if 0, then request */
+};
+/* One queued upcall waiting to be read by user space. */
+struct cache_request {
+	struct cache_queue	q;
+	struct cache_head	*item;	/* cache entry the upcall is for (reference held) */
+	char			* buf;	/* kmalloc'ed request text */
+	int			len;	/* number of bytes used in buf */
+	int			readers;	/* readers currently holding a count on this request */
+};
+/* Per-open-file cursor, stored in filp->private_data for readable opens. */
+struct cache_reader {
+	struct cache_queue	q;
+	int			offset;	/* if non-0, we have a refcnt on next request */
+};
+
+/*
+ * Read (part of) the next upcall request queued behind this file's reader
+ * entry on cd->queue.
+ *
+ * Returns the number of bytes copied to @buf, 0 when @count is 0 or no
+ * request follows the reader in the queue, or -EFAULT if the copy to user
+ * space fails.  A request is handed out piecemeal: rp->offset remembers how
+ * far this reader has got and is reset to 0 once the whole request has been
+ * consumed.  Requests whose item no longer has CACHE_PENDING set are skipped
+ * (when not already part-read) and freed once their last reader lets go.
+ */
+static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
+			  loff_t *ppos, struct cache_detail *cd)
+{
+	struct cache_reader *rp = filp->private_data;
+	struct cache_request *rq;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int err;
+
+	if (count == 0)
+		return 0;
+
+	mutex_lock(&inode->i_mutex); /* protect against multiple concurrent
+			      * readers on this file */
+ again:
+	spin_lock(&queue_lock);
+	/* need to find next request */
+	/* step our entry over any other readers that sit directly after us */
+	while (rp->q.list.next != &cd->queue &&
+	       list_entry(rp->q.list.next, struct cache_queue, list)
+	       ->reader) {
+		struct list_head *next = rp->q.list.next;
+		list_move(&rp->q.list, next);
+	}
+	/* reached the end of the queue: nothing to read */
+	if (rp->q.list.next == &cd->queue) {
+		spin_unlock(&queue_lock);
+		mutex_unlock(&inode->i_mutex);
+		BUG_ON(rp->offset);
+		return 0;
+	}
+	rq = container_of(rp->q.list.next, struct cache_request, q.list);
+	BUG_ON(rq->q.reader);
+	/* starting a fresh request: take a reader count so it is not freed under us */
+	if (rp->offset == 0)
+		rq->readers++;
+	spin_unlock(&queue_lock);
+
+	if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) {
+		/* item no longer pending: step past this request and retry */
+		err = -EAGAIN;
+		spin_lock(&queue_lock);
+		list_move(&rp->q.list, &rq->q.list);
+		spin_unlock(&queue_lock);
+	} else {
+		/* clamp to what is left of the request */
+		if (rp->offset + count > rq->len)
+			count = rq->len - rp->offset;
+		err = -EFAULT;
+		if (copy_to_user(buf, rq->buf + rp->offset, count))
+			goto out;
+		rp->offset += count;
+		if (rp->offset >= rq->len) {
+			/* request fully consumed: move our entry behind it */
+			rp->offset = 0;
+			spin_lock(&queue_lock);
+			list_move(&rp->q.list, &rq->q.list);
+			spin_unlock(&queue_lock);
+		}
+		err = 0;
+	}
+ out:
+	if (rp->offset == 0) {
+		/* need to release rq */
+		spin_lock(&queue_lock);
+		rq->readers--;
+		/* last reader of a request that is no longer pending frees it */
+		if (rq->readers == 0 &&
+		    !test_bit(CACHE_PENDING, &rq->item->flags)) {
+			list_del(&rq->q.list);
+			spin_unlock(&queue_lock);
+			cache_put(rq->item, cd);
+			kfree(rq->buf);
+			kfree(rq);
+		} else
+			spin_unlock(&queue_lock);
+	}
+	if (err == -EAGAIN)
+		goto again;
+	mutex_unlock(&inode->i_mutex);
+	return err ? err :  count;
+}
+
+/*
+ * Copy @count bytes of a downcall from user space into @kaddr, terminate
+ * them with a NUL and pass the text to the cache's ->cache_parse() method.
+ *
+ * @kaddr must have room for @count + 1 bytes.  Returns @count when the
+ * parser returns 0, the parser's return value otherwise, -EINVAL for an
+ * empty write and -EFAULT when the user buffer cannot be copied.
+ */
+static ssize_t cache_do_downcall(char *kaddr, const char __user *buf,
+				 size_t count, struct cache_detail *cd)
+{
+	ssize_t status;
+
+	if (!count)
+		return -EINVAL;
+	if (copy_from_user(kaddr, buf, count) != 0)
+		return -EFAULT;
+
+	kaddr[count] = '\0';
+	status = cd->cache_parse(cd, kaddr, count);
+	if (status == 0)
+		status = count;
+	return status;
+}
+
+/*
+ * Fallback downcall path that stages the data in a single static buffer
+ * shared by all caches; queue_io_mutex serialises its users.
+ *
+ * Returns -EINVAL when @count does not leave room for the terminating NUL,
+ * otherwise whatever cache_do_downcall() returns.
+ */
+static ssize_t cache_slow_downcall(const char __user *buf,
+				   size_t count, struct cache_detail *cd)
+{
+	static char write_buf[8192]; /* protected by queue_io_mutex */
+	ssize_t status;
+
+	if (count >= sizeof(write_buf))
+		return -EINVAL;
+
+	mutex_lock(&queue_io_mutex);
+	status = cache_do_downcall(write_buf, buf, count, cd);
+	mutex_unlock(&queue_io_mutex);
+
+	return status;
+}
+
+/*
+ * Process a downcall, staging the data in page 0 of @mapping when the
+ * write is shorter than a page and that page can be obtained; otherwise
+ * fall back to cache_slow_downcall().
+ */
+static ssize_t cache_downcall(struct address_space *mapping,
+			      const char __user *buf,
+			      size_t count, struct cache_detail *cd)
+{
+	struct page *pg = NULL;
+	ssize_t status;
+
+	if (count < PAGE_CACHE_SIZE)
+		pg = find_or_create_page(mapping, 0, GFP_KERNEL);
+	if (!pg)
+		return cache_slow_downcall(buf, count, cd);
+
+	status = cache_do_downcall(kmap(pg), buf, count, cd);
+	kunmap(pg);
+	unlock_page(pg);
+	page_cache_release(pg);
+
+	return status;
+}
+
+/*
+ * write() handler shared by the procfs and pipefs channel files.
+ *
+ * Returns -EINVAL if the cache has no ->cache_parse() method; otherwise
+ * the downcall is processed with the inode's i_mutex held and its result
+ * is returned.
+ */
+static ssize_t cache_write(struct file *filp, const char __user *buf,
+			   size_t count, loff_t *ppos,
+			   struct cache_detail *cd)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	ssize_t written;
+
+	if (cd->cache_parse == NULL)
+		return -EINVAL;
+
+	mutex_lock(&inode->i_mutex);
+	written = cache_downcall(filp->f_mapping, buf, count, cd);
+	mutex_unlock(&inode->i_mutex);
+
+	return written;
+}
+
+/* Woken by sunrpc_cache_pipe_upcall() each time a request is queued. */
+static DECLARE_WAIT_QUEUE_HEAD(queue_wait);
+
+/*
+ * poll() handler shared by the procfs and pipefs channel files.
+ *
+ * The file is always reported writable.  It is reported readable when
+ * the file has a reader entry (filp->private_data) and at least one
+ * request is queued at or after that entry on cd->queue.
+ */
+static unsigned int cache_poll(struct file *filp, poll_table *wait,
+			       struct cache_detail *cd)
+{
+	unsigned int mask;
+	struct cache_reader *rp = filp->private_data;
+	struct cache_queue *cq;
+
+	poll_wait(filp, &queue_wait, wait);
+
+	/*
+	 * always allow write.  Note: this must be the poll bit POLLOUT;
+	 * POLL_OUT is a siginfo si_code value, not a poll mask.
+	 */
+	mask = POLLOUT | POLLWRNORM;
+
+	if (!rp)
+		return mask;
+
+	spin_lock(&queue_lock);
+
+	for (cq= &rp->q; &cq->list != &cd->queue;
+	     cq = list_entry(cq->list.next, struct cache_queue, list))
+		if (!cq->reader) {
+			mask |= POLLIN | POLLRDNORM;
+			break;
+		}
+	spin_unlock(&queue_lock);
+	return mask;
+}
+
+/*
+ * ioctl() handler shared by the procfs and pipefs channel files.
+ *
+ * Only FIONREAD on a file that has a reader entry is supported (-EINVAL
+ * otherwise).  It stores, through the user pointer @arg, the length of
+ * the first request found at or after the reader minus the reader's
+ * current offset, or 0 when no request is queued there.
+ */
+static int cache_ioctl(struct inode *ino, struct file *filp,
+		       unsigned int cmd, unsigned long arg,
+		       struct cache_detail *cd)
+{
+	struct cache_reader *rp = filp->private_data;
+	struct list_head *pos;
+	int avail = 0;
+
+	if (cmd != FIONREAD || !rp)
+		return -EINVAL;
+
+	spin_lock(&queue_lock);
+	for (pos = &rp->q.list; pos != &cd->queue; pos = pos->next) {
+		struct cache_queue *cq =
+			list_entry(pos, struct cache_queue, list);
+
+		if (cq->reader)
+			continue;
+		/* only the current (or next) request is accounted for */
+		avail = container_of(cq, struct cache_request, q)->len -
+			rp->offset;
+		break;
+	}
+	spin_unlock(&queue_lock);
+
+	return put_user(avail, (int __user *)arg);
+}
+
+/*
+ * open() handler shared by the procfs and pipefs channel files.
+ *
+ * Takes a reference on the cache's owning module (dropped again in
+ * cache_release()).  For files opened for reading a cache_reader is
+ * allocated, linked at the head of cd->queue and stored in
+ * filp->private_data; write-only opens get a NULL private_data.
+ *
+ * Returns 0 on success, -EACCES if @cd is NULL or the module reference
+ * cannot be obtained, -ENOMEM if the reader cannot be allocated.
+ */
+static int cache_open(struct inode *inode, struct file *filp,
+		      struct cache_detail *cd)
+{
+	struct cache_reader *rp = NULL;
+
+	if (!cd || !try_module_get(cd->owner))
+		return -EACCES;
+	nonseekable_open(inode, filp);
+	if (filp->f_mode & FMODE_READ) {
+		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
+		if (!rp) {
+			/* drop the module reference taken above */
+			module_put(cd->owner);
+			return -ENOMEM;
+		}
+		rp->offset = 0;
+		rp->q.reader = 1;
+		atomic_inc(&cd->readers);
+		spin_lock(&queue_lock);
+		list_add(&rp->q.list, &cd->queue);
+		spin_unlock(&queue_lock);
+	}
+	filp->private_data = rp;
+	return 0;
+}
+
+/*
+ * release() handler shared by the procfs and pipefs channel files.
+ *
+ * If the file has a reader entry it is unlinked from cd->queue and freed;
+ * a reader that was part-way through a request first gives back its count
+ * on that request.  The time of the close is recorded in cd->last_close.
+ * The module reference taken at open time is always dropped.
+ */
+static int cache_release(struct inode *inode, struct file *filp,
+			 struct cache_detail *cd)
+{
+	struct cache_reader *rp = filp->private_data;
+	struct list_head *pos;
+
+	if (rp == NULL)
+		goto put_owner;
+
+	spin_lock(&queue_lock);
+	if (rp->offset) {
+		/* find the request we were reading and release our count */
+		for (pos = &rp->q.list; pos != &cd->queue; pos = pos->next) {
+			struct cache_queue *cq =
+				list_entry(pos, struct cache_queue, list);
+
+			if (cq->reader)
+				continue;
+			container_of(cq, struct cache_request, q)->readers--;
+			break;
+		}
+		rp->offset = 0;
+	}
+	list_del(&rp->q.list);
+	spin_unlock(&queue_lock);
+
+	filp->private_data = NULL;
+	kfree(rp);
+
+	cd->last_close = seconds_since_boot();
+	atomic_dec(&cd->readers);
+
+put_owner:
+	module_put(cd->owner);
+	return 0;
+}
+
+
+
+/*
+ * Remove and free the first queued request for @ch on @detail's queue
+ * that no reader currently holds.  Requests with a non-zero reader count
+ * are left in place.
+ */
+static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch)
+{
+	struct cache_request *victim = NULL;
+	struct cache_queue *cq;
+
+	spin_lock(&queue_lock);
+	list_for_each_entry(cq, &detail->queue, list) {
+		struct cache_request *cr;
+
+		if (cq->reader)
+			continue;
+		cr = container_of(cq, struct cache_request, q);
+		if (cr->item != ch || cr->readers != 0)
+			continue;
+		list_del(&cr->q.list);
+		victim = cr;
+		break;
+	}
+	spin_unlock(&queue_lock);
+
+	/* free outside the spinlock */
+	if (victim) {
+		cache_put(victim->item, detail);
+		kfree(victim->buf);
+		kfree(victim);
+	}
+}
+
+/*
+ * Support routines for text-based upcalls.
+ * Fields are separated by spaces.
+ * Fields are either mangled so that space, tab, newline and backslash are
+ * quoted as \ooo octal escapes, or are hexified with a leading \x.
+ * Record is terminated with newline.
+ *
+ */
+
+/*
+ * Append @str to the upcall buffer as one space-terminated field.
+ *
+ * @bpp: in/out cursor into the output buffer
+ * @lp:  in/out count of bytes still free at *bpp; negative means an
+ *       earlier field already overflowed, in which case nothing is done
+ * @str: NUL-terminated text to add
+ *
+ * Space, tab, newline and backslash are written as a backslash followed
+ * by three octal digits; every other byte is copied unchanged.  If the
+ * field (including its trailing space) does not fit, *lp is set to -1.
+ */
+void qword_add(char **bpp, int *lp, char *str)
+{
+	char *bp = *bpp;
+	int len = *lp;
+	char c;
+
+	if (len < 0) return;
+
+	/*
+	 * Stop as soon as the space is used up.  The test must be "> 0":
+	 * an escape that does not fit drives len negative, and a plain
+	 * non-zero test would keep copying past the end of the buffer.
+	 */
+	while ((c=*str++) && len > 0)
+		switch(c) {
+		case ' ':
+		case '\t':
+		case '\n':
+		case '\\':
+			if (len >= 4) {
+				*bp++ = '\\';
+				*bp++ = '0' + ((c & 0300)>>6);
+				*bp++ = '0' + ((c & 0070)>>3);
+				*bp++ = '0' + ((c & 0007)>>0);
+			}
+			len -= 4;
+			break;
+		default:
+			*bp++ = c;
+			len--;
+		}
+	/* input left over, or no room for the separator: flag overflow */
+	if (c || len <1) len = -1;
+	else {
+		*bp++ = ' ';
+		len--;
+	}
+	*bpp = bp;
+	*lp = len;
+}
+EXPORT_SYMBOL_GPL(qword_add);
+
+/*
+ * Append @blen bytes from @buf to the upcall buffer as one field written
+ * as "\x" followed by two lower-case hex digits per byte and a trailing
+ * space.
+ *
+ * @bpp and @lp are the in/out output cursor and remaining-space counter.
+ * A negative *lp on entry means an earlier overflow and nothing is done;
+ * if this field does not fit, *lp is set to -1.
+ */
+void qword_addhex(char **bpp, int *lp, char *buf, int blen)
+{
+	static const char hexdigits[] = "0123456789abcdef";
+	char *out = *bpp;
+	int room = *lp;
+
+	if (room < 0)
+		return;
+
+	if (room > 2) {
+		*out++ = '\\';
+		*out++ = 'x';
+		room -= 2;
+		for (; blen && room >= 2; blen--, room -= 2) {
+			unsigned char byte = *buf++;
+
+			*out++ = hexdigits[byte >> 4];
+			*out++ = hexdigits[byte & 0x0f];
+		}
+	}
+
+	if (blen || room < 1) {
+		/* bytes left over, or no room for the separator */
+		room = -1;
+	} else {
+		*out++ = ' ';
+		room--;
+	}
+	*bpp = out;
+	*lp = room;
+}
+EXPORT_SYMBOL_GPL(qword_addhex);
+
+/*
+ * Invoke the cache's optional ->warn_no_listener() hook, at most once for
+ * each distinct value of detail->last_close.  The hook's second argument
+ * tells whether the channel has ever been closed (last_close != 0).
+ */
+static void warn_no_listener(struct cache_detail *detail)
+{
+	if (detail->last_warn == detail->last_close)
+		return;
+
+	detail->last_warn = detail->last_close;
+	if (detail->warn_no_listener)
+		detail->warn_no_listener(detail, detail->last_close != 0);
+}
+
+/*
+ * Decide whether it is worth queueing an upcall for @detail: true if the
+ * channel is currently open for reading, or was closed no more than
+ * 30 seconds ago.
+ */
+static bool cache_listeners_exist(struct cache_detail *detail)
+{
+	if (atomic_read(&detail->readers))
+		return true;
+
+	/* This cache was never opened */
+	if (detail->last_close == 0)
+		return false;
+
+	/*
+	 * We allow for the possibility that someone might
+	 * restart a userspace daemon without restarting the
+	 * server; but after 30 seconds, we give up.
+	 */
+	return !(detail->last_close < seconds_since_boot() - 30);
+}
+
+/*
+ * Build an upcall request for cache entry @h and queue it on @detail's
+ * channel so that the user-space daemon can read() it.
+ *
+ * @cache_request formats the request text into a buffer of PAGE_SIZE
+ * bytes, so each request is at most one page long.
+ *
+ * Returns 0 once the request is queued, -EINVAL when no listener is
+ * believed to exist, and -EAGAIN on allocation failure or when the
+ * formatted request does not fit in the buffer.
+ */
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
+		void (*cache_request)(struct cache_detail *,
+				      struct cache_head *,
+				      char **,
+				      int *))
+{
+	struct cache_request *crq;
+	char *buf, *bp;
+	int len = PAGE_SIZE;
+
+	if (!cache_listeners_exist(detail)) {
+		warn_no_listener(detail);
+		return -EINVAL;
+	}
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -EAGAIN;
+
+	crq = kmalloc(sizeof (*crq), GFP_KERNEL);
+	if (!crq)
+		goto out_free_buf;
+
+	bp = buf;
+	cache_request(detail, h, &bp, &len);
+	/* a negative remaining length means the formatter overflowed */
+	if (len < 0)
+		goto out_free_crq;
+
+	crq->q.reader = 0;
+	crq->item = cache_get(h);
+	crq->buf = buf;
+	crq->len = PAGE_SIZE - len;
+	crq->readers = 0;
+
+	spin_lock(&queue_lock);
+	list_add_tail(&crq->q.list, &detail->queue);
+	spin_unlock(&queue_lock);
+	wake_up(&queue_wait);
+	return 0;
+
+out_free_crq:
+	kfree(crq);
+out_free_buf:
+	kfree(buf);
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
+
+/*
+ * parse a message from user-space and pass it
+ * to an appropriate cache
+ * Messages are, like requests, separated into fields by
+ * spaces and dequoted from \xHEXSTRING or embedded \nnn octal
+ *
+ * Message is
+ *   reply cachename expiry key ... content....
+ *
+ * key and content are both parsed by cache
+ */
+
+#define isodigit(c) (isdigit(c) && (c) <= '7')
+/*
+ * Extract the next field of a downcall message.
+ *
+ * @bpp:     in/out cursor into the message; on success it is advanced
+ *           past the field and any spaces that follow it
+ * @dest:    output buffer; always NUL-terminated on success
+ * @bufsize: size of @dest in bytes, including room for the terminator
+ *
+ * A field starting with "\x" is decoded as a hex string; anything else is
+ * copied as text with \nnn octal escapes (nnn <= 377) decoded.
+ *
+ * Returns the number of bytes stored (not counting the terminator), or -1
+ * if the field is not followed by a space, newline or end of string --
+ * which includes the case of a field too long for @dest.
+ */
+int qword_get(char **bpp, char *dest, int bufsize)
+{
+	/* return bytes copied, or -1 on error */
+	char *bp = *bpp;
+	int len = 0;
+
+	while (*bp == ' ') bp++;
+
+	if (bp[0] == '\\' && bp[1] == 'x') {
+		/* HEX STRING */
+		bp += 2;
+		/* stop at bufsize-1 so the terminating NUL stays in bounds */
+		while (len < bufsize - 1) {
+			int h, l;
+
+			h = hex_to_bin(bp[0]);
+			if (h < 0)
+				break;
+
+			l = hex_to_bin(bp[1]);
+			if (l < 0)
+				break;
+
+			*dest++ = (h << 4) | l;
+			bp += 2;
+			len++;
+		}
+	} else {
+		/* text with \nnn octal quoting */
+		while (*bp != ' ' && *bp != '\n' && *bp && len < bufsize-1) {
+			if (*bp == '\\' &&
+			    isodigit(bp[1]) && (bp[1] <= '3') &&
+			    isodigit(bp[2]) &&
+			    isodigit(bp[3])) {
+				int byte = (*++bp -'0');
+				bp++;
+				byte = (byte << 3) | (*bp++ - '0');
+				byte = (byte << 3) | (*bp++ - '0');
+				*dest++ = byte;
+				len++;
+			} else {
+				*dest++ = *bp++;
+				len++;
+			}
+		}
+	}
+
+	/* the field must end here; otherwise it was malformed or too long */
+	if (*bp != ' ' && *bp != '\n' && *bp != '\0')
+		return -1;
+	while (*bp == ' ') bp++;
+	*bpp = bp;
+	*dest = '\0';
+	return len;
+}
+EXPORT_SYMBOL_GPL(qword_get);
+
+
+/*
+ * support /proc/sunrpc/cache/$CACHENAME/content
+ * as a seqfile.
+ * We call ->cache_show passing NULL for the item to
+ * get a header, then pass each real item in the cache
+ */
+
+/* seq_file private data for the "content" file: the cache being listed. */
+struct handle {
+	struct cache_detail *cd;
+};
+
+/*
+ * seq_file ->start: take the cache's hash_lock for reading and return the
+ * element at position *pos.
+ *
+ * Position 0 is the header (SEQ_START_TOKEN).  For any other position,
+ * *pos - 1 holds the hash bucket number in its upper 32 bits and the index
+ * within that bucket's chain in its lower 32 bits.  If the bucket has no
+ * entry at that index, the first entry of the next non-empty bucket is
+ * returned and *pos is updated to match; NULL is returned when there are
+ * no more buckets.  The lock is taken on every path.
+ *
+ * NOTE(review): "hash" is used to index hash_table before it is compared
+ * with hash_size; presumably only positions produced by this iterator are
+ * ever passed in -- confirm.
+ */
+static void *c_start(struct seq_file *m, loff_t *pos)
+	__acquires(cd->hash_lock)
+{
+	loff_t n = *pos;
+	unsigned hash, entry;
+	struct cache_head *ch;
+	struct cache_detail *cd = ((struct handle*)m->private)->cd;
+
+
+	read_lock(&cd->hash_lock);
+	if (!n--)
+		return SEQ_START_TOKEN;
+	hash = n >> 32;
+	entry = n & ((1LL<<32) - 1);
+
+	/* walk the chain to the requested index */
+	for (ch=cd->hash_table[hash]; ch; ch=ch->next)
+		if (!entry--)
+			return ch;
+	/* ran off the chain: move on to the head of the next non-empty bucket */
+	n &= ~((1LL<<32) - 1);
+	do {
+		hash++;
+		n += 1LL<<32;
+	} while(hash < cd->hash_size &&
+		cd->hash_table[hash]==NULL);
+	if (hash >= cd->hash_size)
+		return NULL;
+	*pos = n+1;
+	return cd->hash_table[hash];
+}
+
+/*
+ * seq_file ->next: return the entry following @p and advance *pos using
+ * the same bucket/index encoding as c_start().
+ *
+ * After the header token the scan starts at bucket 0; after the last
+ * entry of a chain it continues with the first entry of the next
+ * non-empty bucket.  Returns NULL when the buckets are exhausted.
+ */
+static void *c_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct cache_head *ch = p;
+	int hash = (*pos >> 32);
+	struct cache_detail *cd = ((struct handle*)m->private)->cd;
+
+	if (p == SEQ_START_TOKEN)
+		hash = 0;
+	else if (ch->next == NULL) {
+		/* end of this chain: step to the following bucket */
+		hash++;
+		*pos += 1LL<<32;
+	} else {
+		/* more entries in the same chain */
+		++*pos;
+		return ch->next;
+	}
+	/* clear the in-bucket index, then skip empty buckets */
+	*pos &= ~((1LL<<32) - 1);
+	while (hash < cd->hash_size &&
+	       cd->hash_table[hash] == NULL) {
+		hash++;
+		*pos += 1LL<<32;
+	}
+	if (hash >= cd->hash_size)
+		return NULL;
+	++*pos;
+	return cd->hash_table[hash];
+}
+
+/* seq_file ->stop: drop the read lock on the cache's hash_lock. */
+static void c_stop(struct seq_file *m, void *p)
+	__releases(cd->hash_lock)
+{
+	struct handle *han = m->private;
+
+	read_unlock(&han->cd->hash_lock);
+}
+
+/*
+ * seq_file ->show: print one line of the "content" file.
+ *
+ * For the header token ->cache_show() is called with a NULL item.  For a
+ * real entry, an optional debug line is printed first, and the entry's
+ * line is prefixed with "# " when cache_check() reports a failure for it.
+ */
+static int c_show(struct seq_file *m, void *p)
+{
+	struct cache_detail *cd = ((struct handle*)m->private)->cd;
+	struct cache_head *entry = p;
+
+	if (p == SEQ_START_TOKEN)
+		return cd->cache_show(m, cd, NULL);
+
+	ifdebug(CACHE)
+		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
+			   convert_to_wallclock(entry->expiry_time),
+			   atomic_read(&entry->ref.refcount), entry->flags);
+
+	cache_get(entry);
+	if (cache_check(cd, entry, NULL) == 0)
+		cache_put(entry, cd);
+	else
+		/* cache_check does a cache_put on failure */
+		seq_printf(m, "# ");
+
+	return cd->cache_show(m, cd, entry);
+}
+
+/* seq_file iterator used by content_open() to list a cache's entries. */
+static const struct seq_operations cache_content_op = {
+	.start	= c_start,
+	.next	= c_next,
+	.stop	= c_stop,
+	.show	= c_show,
+};
+
+/*
+ * open() handler for the "content" file: pin the cache's owning module
+ * and set up a seq_file whose private data points at @cd.
+ *
+ * Returns 0 on success, -EACCES if @cd is NULL or the module reference
+ * cannot be taken, -ENOMEM if the seq_file cannot be set up (the module
+ * reference is dropped again in that case).
+ */
+static int content_open(struct inode *inode, struct file *file,
+			struct cache_detail *cd)
+{
+	struct handle *han;
+
+	if (cd == NULL || !try_module_get(cd->owner))
+		return -EACCES;
+
+	han = __seq_open_private(file, &cache_content_op, sizeof(*han));
+	if (han != NULL) {
+		han->cd = cd;
+		return 0;
+	}
+
+	module_put(cd->owner);
+	return -ENOMEM;
+}
+
+/*
+ * release() handler for the "content" file: tear down the seq_file and
+ * drop the module reference taken in content_open().
+ */
+static int content_release(struct inode *inode, struct file *file,
+		struct cache_detail *cd)
+{
+	int status;
+
+	status = seq_release_private(inode, file);
+	module_put(cd->owner);
+
+	return status;
+}
+
+/*
+ * open() handler for the "flush" file: pin the cache's owning module and
+ * mark the file non-seekable.  Returns -EACCES if @cd is NULL or the
+ * module reference cannot be taken.
+ */
+static int open_flush(struct inode *inode, struct file *file,
+			struct cache_detail *cd)
+{
+	if (cd == NULL)
+		return -EACCES;
+	if (!try_module_get(cd->owner))
+		return -EACCES;
+
+	return nonseekable_open(inode, file);
+}
+
+/*
+ * release() handler for the "flush" file: drop the module reference taken
+ * in open_flush().  Always returns 0.
+ */
+static int release_flush(struct inode *inode, struct file *file,
+			struct cache_detail *cd)
+{
+	module_put(cd->owner);
+	return 0;
+}
+
+/*
+ * read() handler for the "flush" file: return the cache's flush time,
+ * converted with convert_to_wallclock(), as a decimal number followed by
+ * a newline.
+ *
+ * Honours *ppos so that the text can be read in pieces; returns 0 once
+ * the position is at or beyond the end of the text, -EFAULT if the copy
+ * to user space fails, otherwise the number of bytes copied.
+ */
+static ssize_t read_flush(struct file *file, char __user *buf,
+			  size_t count, loff_t *ppos,
+			  struct cache_detail *cd)
+{
+	char tbuf[20];
+	loff_t pos = *ppos;	/* keep full width: no truncation on 32-bit */
+	size_t len;
+
+	/* bounded write: a 20-digit value plus "\n" would overrun tbuf */
+	snprintf(tbuf, sizeof(tbuf), "%lu\n",
+		 convert_to_wallclock(cd->flush_time));
+	len = strlen(tbuf);
+	if (pos < 0 || pos >= (loff_t)len)
+		return 0;
+	len -= pos;
+	if (len > count)
+		len = count;
+	if (copy_to_user(buf, tbuf + pos, len))
+		return -EFAULT;
+	*ppos += len;
+	return len;
+}
+
+/*
+ * write() handler for the "flush" file: set the cache's flush time from a
+ * number written by user space and trigger a cache flush.
+ *
+ * The write must start at offset 0 and be shorter than the 20-byte local
+ * buffer, and the number must be followed by end of input or a newline;
+ * otherwise -EINVAL is returned.  Returns -EFAULT if the user buffer
+ * cannot be copied, or @count on success.
+ */
+static ssize_t write_flush(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos,
+			   struct cache_detail *cd)
+{
+	char tbuf[20];
+	char *cursor, *end;
+
+	if (*ppos != 0 || count >= sizeof(tbuf))
+		return -EINVAL;
+	if (copy_from_user(tbuf, buf, count))
+		return -EFAULT;
+	tbuf[count] = 0;
+
+	/* validation pass only: find where the number stops */
+	simple_strtoul(tbuf, &end, 0);
+	if (*end != '\0' && *end != '\n')
+		return -EINVAL;
+
+	cursor = tbuf;
+	cd->flush_time = get_expiry(&cursor);
+	cd->nextcheck = seconds_since_boot();
+	cache_flush();
+
+	*ppos += count;
+	return count;
+}
+
+/*
+ * procfs glue for the "channel" file.  Each handler fetches the
+ * cache_detail stored in the proc entry's ->data and forwards to the
+ * matching generic cache_*() routine.
+ */
+static ssize_t cache_read_procfs(struct file *filp, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return cache_read(filp, buf, count, ppos,
+			  PDE(filp->f_path.dentry->d_inode)->data);
+}
+
+static ssize_t cache_write_procfs(struct file *filp, const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	return cache_write(filp, buf, count, ppos,
+			   PDE(filp->f_path.dentry->d_inode)->data);
+}
+
+static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
+{
+	return cache_poll(filp, wait,
+			  PDE(filp->f_path.dentry->d_inode)->data);
+}
+
+static long cache_ioctl_procfs(struct file *filp,
+			       unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	return cache_ioctl(inode, filp, cmd, arg, PDE(inode)->data);
+}
+
+static int cache_open_procfs(struct inode *inode, struct file *filp)
+{
+	return cache_open(inode, filp, PDE(inode)->data);
+}
+
+static int cache_release_procfs(struct inode *inode, struct file *filp)
+{
+	return cache_release(inode, filp, PDE(inode)->data);
+}
+
+/* File operations installed on /proc/.../<cachename>/channel. */
+static const struct file_operations cache_file_operations_procfs = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.read		= cache_read_procfs,
+	.write		= cache_write_procfs,
+	.poll		= cache_poll_procfs,
+	.unlocked_ioctl	= cache_ioctl_procfs, /* for FIONREAD */
+	.open		= cache_open_procfs,
+	.release	= cache_release_procfs,
+};
+
+/*
+ * procfs glue for the "content" file: look up the cache_detail in the
+ * proc entry's ->data and forward to content_open()/content_release().
+ */
+static int content_open_procfs(struct inode *inode, struct file *filp)
+{
+	return content_open(inode, filp, PDE(inode)->data);
+}
+
+static int content_release_procfs(struct inode *inode, struct file *filp)
+{
+	return content_release(inode, filp, PDE(inode)->data);
+}
+
+/* File operations installed on /proc/.../<cachename>/content. */
+static const struct file_operations content_file_operations_procfs = {
+	.open		= content_open_procfs,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= content_release_procfs,
+};
+
+/*
+ * procfs glue for the "flush" file: look up the cache_detail in the proc
+ * entry's ->data and forward to the generic *_flush() routines.
+ */
+static int open_flush_procfs(struct inode *inode, struct file *filp)
+{
+	return open_flush(inode, filp, PDE(inode)->data);
+}
+
+static int release_flush_procfs(struct inode *inode, struct file *filp)
+{
+	return release_flush(inode, filp, PDE(inode)->data);
+}
+
+static ssize_t read_flush_procfs(struct file *filp, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	return read_flush(filp, buf, count, ppos,
+			  PDE(filp->f_path.dentry->d_inode)->data);
+}
+
+static ssize_t write_flush_procfs(struct file *filp,
+				  const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	return write_flush(filp, buf, count, ppos,
+			   PDE(filp->f_path.dentry->d_inode)->data);
+}
+
+/* File operations installed on /proc/.../<cachename>/flush. */
+static const struct file_operations cache_flush_operations_procfs = {
+	.open		= open_flush_procfs,
+	.read		= read_flush_procfs,
+	.write		= write_flush_procfs,
+	.release	= release_flush_procfs,
+	.llseek		= no_llseek,
+};
+
+/*
+ * Remove whichever of the "flush", "channel" and "content" entries were
+ * created for @cd, followed by the cache's own proc directory.  Does
+ * nothing when the directory was never created.
+ */
+static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
+{
+	struct proc_dir_entry *dir = cd->u.procfs.proc_ent;
+	struct sunrpc_net *sn;
+
+	if (!dir)
+		return;
+
+	if (cd->u.procfs.flush_ent)
+		remove_proc_entry("flush", dir);
+	if (cd->u.procfs.channel_ent)
+		remove_proc_entry("channel", dir);
+	if (cd->u.procfs.content_ent)
+		remove_proc_entry("content", dir);
+
+	cd->u.procfs.proc_ent = NULL;
+	sn = net_generic(net, sunrpc_net_id);
+	remove_proc_entry(cd->name, sn->proc_net_rpc);
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Create the proc directory for @cd under the per-net sunrpc proc
+ * directory and populate it: "flush" always, "channel" when the cache has
+ * an upcall or parse method, "content" when it has a show method.
+ *
+ * Returns 0 on success.  On any failure everything created so far is
+ * removed again and -ENOMEM is returned.
+ */
+static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+{
+	struct proc_dir_entry *p;
+	struct sunrpc_net *sn;
+
+	sn = net_generic(net, sunrpc_net_id);
+	cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
+	if (cd->u.procfs.proc_ent == NULL)
+		goto out_nomem;
+	/* cleared up front so the error path only removes what was created */
+	cd->u.procfs.channel_ent = NULL;
+	cd->u.procfs.content_ent = NULL;
+
+	p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
+			     cd->u.procfs.proc_ent,
+			     &cache_flush_operations_procfs, cd);
+	cd->u.procfs.flush_ent = p;
+	if (p == NULL)
+		goto out_nomem;
+
+	if (cd->cache_upcall || cd->cache_parse) {
+		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
+				     cd->u.procfs.proc_ent,
+				     &cache_file_operations_procfs, cd);
+		cd->u.procfs.channel_ent = p;
+		if (p == NULL)
+			goto out_nomem;
+	}
+	if (cd->cache_show) {
+		p = proc_create_data("content", S_IFREG|S_IRUSR|S_IWUSR,
+				cd->u.procfs.proc_ent,
+				&content_file_operations_procfs, cd);
+		cd->u.procfs.content_ent = p;
+		if (p == NULL)
+			goto out_nomem;
+	}
+	return 0;
+out_nomem:
+	remove_cache_proc_entries(cd, net);
+	return -ENOMEM;
+}
+#else /* CONFIG_PROC_FS */
+/* Without procfs there is nothing to create; always succeeds. */
+static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
+{
+	return 0;
+}
+#endif
+
+/* Set up the deferrable delayed work item that runs do_cache_clean(). */
+void __init cache_initialize(void)
+{
+	INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean);
+}
+
+/*
+ * Initialise @cd and create its proc entries in @net.  If the proc
+ * entries cannot be created the cache_detail is destroyed again.
+ * Returns 0 or the error from create_cache_proc_entries().
+ */
+int cache_register_net(struct cache_detail *cd, struct net *net)
+{
+	int ret;
+
+	sunrpc_init_cache_detail(cd);
+	ret = create_cache_proc_entries(cd, net);
+	if (ret)
+		sunrpc_destroy_cache_detail(cd);
+	return ret;
+}
+
+/* Register @cd in the initial network namespace (init_net). */
+int cache_register(struct cache_detail *cd)
+{
+	return cache_register_net(cd, &init_net);
+}
+EXPORT_SYMBOL_GPL(cache_register);
+
+/* Undo cache_register_net(): remove the proc entries, then destroy @cd. */
+void cache_unregister_net(struct cache_detail *cd, struct net *net)
+{
+	remove_cache_proc_entries(cd, net);
+	sunrpc_destroy_cache_detail(cd);
+}
+
+/* Unregister @cd from the initial network namespace (init_net). */
+void cache_unregister(struct cache_detail *cd)
+{
+	cache_unregister_net(cd, &init_net);
+}
+EXPORT_SYMBOL_GPL(cache_unregister);
+
+/*
+ * rpc_pipefs glue for the channel file.  Each handler fetches the
+ * cache_detail from the rpc inode's ->private and forwards to the
+ * matching generic cache_*() routine.
+ */
+static ssize_t cache_read_pipefs(struct file *filp, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return cache_read(filp, buf, count, ppos,
+			  RPC_I(filp->f_path.dentry->d_inode)->private);
+}
+
+static ssize_t cache_write_pipefs(struct file *filp, const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	return cache_write(filp, buf, count, ppos,
+			   RPC_I(filp->f_path.dentry->d_inode)->private);
+}
+
+static unsigned int cache_poll_pipefs(struct file *filp, poll_table *wait)
+{
+	return cache_poll(filp, wait,
+			  RPC_I(filp->f_path.dentry->d_inode)->private);
+}
+
+static long cache_ioctl_pipefs(struct file *filp,
+			      unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	return cache_ioctl(inode, filp, cmd, arg, RPC_I(inode)->private);
+}
+
+static int cache_open_pipefs(struct inode *inode, struct file *filp)
+{
+	return cache_open(inode, filp, RPC_I(inode)->private);
+}
+
+static int cache_release_pipefs(struct inode *inode, struct file *filp)
+{
+	return cache_release(inode, filp, RPC_I(inode)->private);
+}
+
+/* Channel file operations for caches registered through rpc_pipefs. */
+const struct file_operations cache_file_operations_pipefs = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.read		= cache_read_pipefs,
+	.write		= cache_write_pipefs,
+	.poll		= cache_poll_pipefs,
+	.unlocked_ioctl	= cache_ioctl_pipefs, /* for FIONREAD */
+	.open		= cache_open_pipefs,
+	.release	= cache_release_pipefs,
+};
+
+/*
+ * rpc_pipefs glue for the content file: look up the cache_detail in the
+ * rpc inode's ->private and forward to content_open()/content_release().
+ */
+static int content_open_pipefs(struct inode *inode, struct file *filp)
+{
+	return content_open(inode, filp, RPC_I(inode)->private);
+}
+
+static int content_release_pipefs(struct inode *inode, struct file *filp)
+{
+	return content_release(inode, filp, RPC_I(inode)->private);
+}
+
+/* Content file operations for caches registered through rpc_pipefs. */
+const struct file_operations content_file_operations_pipefs = {
+	.open		= content_open_pipefs,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= content_release_pipefs,
+};
+
+/*
+ * rpc_pipefs glue for the flush file: look up the cache_detail in the rpc
+ * inode's ->private and forward to the generic *_flush() routines.
+ */
+static int open_flush_pipefs(struct inode *inode, struct file *filp)
+{
+	return open_flush(inode, filp, RPC_I(inode)->private);
+}
+
+static int release_flush_pipefs(struct inode *inode, struct file *filp)
+{
+	return release_flush(inode, filp, RPC_I(inode)->private);
+}
+
+static ssize_t read_flush_pipefs(struct file *filp, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	return read_flush(filp, buf, count, ppos,
+			  RPC_I(filp->f_path.dentry->d_inode)->private);
+}
+
+static ssize_t write_flush_pipefs(struct file *filp,
+				  const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	return write_flush(filp, buf, count, ppos,
+			   RPC_I(filp->f_path.dentry->d_inode)->private);
+}
+
+/* Flush file operations for caches registered through rpc_pipefs. */
+const struct file_operations cache_flush_operations_pipefs = {
+	.open		= open_flush_pipefs,
+	.read		= read_flush_pipefs,
+	.write		= write_flush_pipefs,
+	.release	= release_flush_pipefs,
+	.llseek		= no_llseek,
+};
+
+/*
+ * Initialise @cd and create its cache directory called @name, with mode
+ * @umode, beneath @parent in rpc_pipefs.
+ *
+ * On success the new dentry is remembered in cd->u.pipefs.dir and 0 is
+ * returned.  On failure @cd is destroyed again and the error encoded in
+ * the returned dentry pointer is passed back.
+ */
+int sunrpc_cache_register_pipefs(struct dentry *parent,
+				 const char *name, mode_t umode,
+				 struct cache_detail *cd)
+{
+	struct dentry *dir;
+	struct qstr q;
+
+	sunrpc_init_cache_detail(cd);
+
+	q.name = name;
+	q.len = strlen(name);
+	q.hash = full_name_hash(q.name, q.len);
+
+	dir = rpc_create_cache_dir(parent, &q, umode, cd);
+	if (IS_ERR(dir)) {
+		sunrpc_destroy_cache_detail(cd);
+		return PTR_ERR(dir);
+	}
+
+	cd->u.pipefs.dir = dir;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
+
+/*
+ * Undo sunrpc_cache_register_pipefs(): remove the cache's rpc_pipefs
+ * directory, forget the dentry, and destroy @cd.
+ */
+void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
+{
+	rpc_remove_cache_dir(cd->u.pipefs.dir);
+	cd->u.pipefs.dir = NULL;
+	sunrpc_destroy_cache_detail(cd);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
+
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
new file mode 100644
index 00000000..8c914158
--- /dev/null
+++ b/net/sunrpc/clnt.c
@@ -0,0 +1,1876 @@
+/*
+ *  linux/net/sunrpc/clnt.c
+ *
+ *  This file contains the high-level RPC interface.
+ *  It is modeled as a finite state machine to support both synchronous
+ *  and asynchronous requests.
+ *
+ *  -	RPC header generation and argument serialization.
+ *  -	Credential refresh.
+ *  -	TCP connect handling.
+ *  -	Retry of operation when it is suspected the operation failed because
+ *	of uid squashing on the server, or when the credentials were stale
+ *	and need to be refreshed, or when a packet was damaged in transit.
+ *	This may have to be moved to the VFS layer.
+ *
+ *  Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com>
+ *  Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <asm/system.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kallsyms.h>
+#include <linux/mm.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/utsname.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/un.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include "sunrpc.h"
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_CALL
+#endif
+
+#define dprint_status(t)	/* trace pid, calling function and tk_status */ \
+	dprintk("RPC: %5u %s (status %d)\n", (t)->tk_pid,	\
+			__func__, (t)->tk_status)
+
+/*
+ * All RPC clients are linked into this list (updated under rpc_client_lock)
+ */
+static LIST_HEAD(all_clients);
+static DEFINE_SPINLOCK(rpc_client_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(destroy_wait);	/* rpc_shutdown_client() waits here */
+
+
+static void	call_start(struct rpc_task *task);
+static void	call_reserve(struct rpc_task *task);
+static void	call_reserveresult(struct rpc_task *task);
+static void	call_allocate(struct rpc_task *task);
+static void	call_decode(struct rpc_task *task);
+static void	call_bind(struct rpc_task *task);
+static void	call_bind_status(struct rpc_task *task);
+static void	call_transmit(struct rpc_task *task);
+#if defined(CONFIG_NFS_V4_1)
+static void	call_bc_transmit(struct rpc_task *task);
+#endif /* CONFIG_NFS_V4_1 */
+static void	call_status(struct rpc_task *task);
+static void	call_transmit_status(struct rpc_task *task);
+static void	call_refresh(struct rpc_task *task);
+static void	call_refreshresult(struct rpc_task *task);
+static void	call_timeout(struct rpc_task *task);
+static void	call_connect(struct rpc_task *task);
+static void	call_connect_status(struct rpc_task *task);
+
+static __be32	*rpc_encode_header(struct rpc_task *task);
+static __be32	*rpc_verify_header(struct rpc_task *task);
+static int	rpc_ping(struct rpc_clnt *clnt);
+
+static void rpc_register_client(struct rpc_clnt *clnt)
+{
+	spin_lock(&rpc_client_lock);	/* serialises updates of all_clients */
+	list_add(&clnt->cl_clients, &all_clients);	/* link via clnt->cl_clients */
+	spin_unlock(&rpc_client_lock);
+}
+
+static void rpc_unregister_client(struct rpc_clnt *clnt)
+{
+	spin_lock(&rpc_client_lock);	/* serialises updates of all_clients */
+	list_del(&clnt->cl_clients);	/* unlink from all_clients */
+	spin_unlock(&rpc_client_lock);
+}
+
+static int
+rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
+{
+	static uint32_t clntid;	/* suffix for the next "clnt%x" name; shared by all calls */
+	struct nameidata nd;
+	struct path path;
+	char name[15];
+	struct qstr q = {
+		.name = name,
+	};
+	int error;
+
+	clnt->cl_path.mnt = ERR_PTR(-ENOENT);	/* "no pipefs dir" until we succeed */
+	clnt->cl_path.dentry = ERR_PTR(-ENOENT);
+	if (dir_name == NULL)
+		return 0;	/* nothing to create; cl_path keeps the ERR_PTR markers */
+
+	path.mnt = rpc_get_mount();
+	if (IS_ERR(path.mnt))
+		return PTR_ERR(path.mnt);
+	error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &nd);
+	if (error)
+		goto err;
+
+	for (;;) {	/* retry with the next id while the name is already taken */
+		q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
+		name[sizeof(name) - 1] = '\0';
+		q.hash = full_name_hash(q.name, q.len);
+		path.dentry = rpc_create_client_dir(nd.path.dentry, &q, clnt);
+		if (!IS_ERR(path.dentry))
+			break;
+		error = PTR_ERR(path.dentry);
+		if (error != -EEXIST) {	/* any other error is fatal */
+			printk(KERN_INFO "RPC: Couldn't create pipefs entry"
+					" %s/%s, error %d\n",
+					dir_name, name, error);
+			goto err_path_put;
+		}
+	}
+	path_put(&nd.path);	/* release the path filled in by vfs_path_lookup() */
+	clnt->cl_path = path;	/* mount + new directory now belong to the client */
+	return 0;
+err_path_put:
+	path_put(&nd.path);
+err:
+	rpc_put_mount();	/* pairs with rpc_get_mount() above */
+	return error;
+}
+
+static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt *xprt)
+{
+	struct rpc_program	*program = args->program;
+	struct rpc_version	*version;
+	struct rpc_clnt		*clnt = NULL;
+	struct rpc_auth		*auth;
+	int err;
+	size_t len;
+
+	/* sanity check the name before trying to print it */
+	err = -EINVAL;
+	len = strlen(args->servername);
+	if (len > RPC_MAXNETNAMELEN)
+		goto out_no_rpciod;
+	len++;	/* from here on len includes the terminating NUL */
+
+	dprintk("RPC:       creating %s client for %s (xprt %p)\n",
+			program->name, args->servername, xprt);
+
+	err = rpciod_up();
+	if (err)
+		goto out_no_rpciod;
+	err = -EINVAL;
+	if (!xprt)
+		goto out_err;
+
+	if (args->version >= program->nrvers)
+		goto out_err;
+	version = program->version[args->version];
+	if (version == NULL)
+		goto out_err;
+
+	err = -ENOMEM;
+	clnt = kzalloc(sizeof(*clnt), GFP_KERNEL);
+	if (!clnt)
+		goto out_err;
+	clnt->cl_parent = clnt;	/* a freshly created client is its own parent */
+
+	clnt->cl_server = clnt->cl_inline_name;
+	if (len > sizeof(clnt->cl_inline_name)) {
+		char *buf = kmalloc(len, GFP_KERNEL);
+		if (buf != NULL)
+			clnt->cl_server = buf;
+		else
+			len = sizeof(clnt->cl_inline_name);
+	}
+	strlcpy(clnt->cl_server, args->servername, len);	/* truncates if kmalloc failed */
+
+	clnt->cl_xprt     = xprt;
+	clnt->cl_procinfo = version->procs;
+	clnt->cl_maxproc  = version->nrprocs;
+	clnt->cl_protname = program->name;
+	clnt->cl_prog     = args->prognumber ? : program->number;
+	clnt->cl_vers     = version->number;
+	clnt->cl_stats    = program->stats;
+	clnt->cl_metrics  = rpc_alloc_iostats(clnt);
+	err = -ENOMEM;
+	if (clnt->cl_metrics == NULL)
+		goto out_no_stats;
+	clnt->cl_program  = program;
+	INIT_LIST_HEAD(&clnt->cl_tasks);
+	spin_lock_init(&clnt->cl_lock);
+
+	if (!xprt_bound(clnt->cl_xprt))
+		clnt->cl_autobind = 1;
+
+	clnt->cl_timeout = xprt->timeout;	/* default: the transport's timeout */
+	if (args->timeout != NULL) {
+		memcpy(&clnt->cl_timeout_default, args->timeout,
+				sizeof(clnt->cl_timeout_default));
+		clnt->cl_timeout = &clnt->cl_timeout_default;
+	}
+
+	clnt->cl_rtt = &clnt->cl_rtt_default;
+	rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
+	clnt->cl_principal = NULL;
+	if (args->client_name) {
+		clnt->cl_principal = kstrdup(args->client_name, GFP_KERNEL);
+		if (!clnt->cl_principal)
+			goto out_no_principal;	/* err is still -ENOMEM */
+	}
+
+	atomic_set(&clnt->cl_count, 1);	/* initial reference */
+
+	err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
+	if (err < 0)
+		goto out_no_path;
+
+	auth = rpcauth_create(args->authflavor, clnt);
+	if (IS_ERR(auth)) {
+		printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
+				args->authflavor);
+		err = PTR_ERR(auth);
+		goto out_no_auth;
+	}
+
+	/* save the nodename */
+	clnt->cl_nodelen = strlen(init_utsname()->nodename);
+	if (clnt->cl_nodelen > UNX_MAXNODENAME)
+		clnt->cl_nodelen = UNX_MAXNODENAME;
+	memcpy(clnt->cl_nodename, init_utsname()->nodename, clnt->cl_nodelen);
+	rpc_register_client(clnt);
+	return clnt;
+
+out_no_auth:
+	if (!IS_ERR(clnt->cl_path.dentry)) {
+		rpc_remove_client_dir(clnt->cl_path.dentry);
+		rpc_put_mount();
+	}
+out_no_path:
+	kfree(clnt->cl_principal);
+out_no_principal:
+	rpc_free_iostats(clnt->cl_metrics);
+out_no_stats:
+	if (clnt->cl_server != clnt->cl_inline_name)
+		kfree(clnt->cl_server);
+	kfree(clnt);
+out_err:
+	rpciod_down();	/* pairs with the successful rpciod_up() above */
+out_no_rpciod:
+	if (xprt)	/* every failure path consumes the caller's xprt reference */
+		xprt_put(xprt);
+	return ERR_PTR(err);
+}
+
+/**
+ * rpc_create - create an RPC client and transport with one call
+ * @args: rpc_clnt create argument structure (left unmodified)
+ *
+ * Creates and initializes an RPC transport and an RPC client.  Returns
+ * the new client, or an ERR_PTR() if any step (including the ping) fails.
+ * It can ping the server in order to determine if it is up, and to see if
+ * it supports this program and version.  RPC_CLNT_CREATE_NOPING disables
+ * this behavior so asynchronous tasks can also use rpc_create.
+ */
+struct rpc_clnt *rpc_create(struct rpc_create_args *args)
+{
+	struct rpc_xprt *xprt;
+	struct rpc_clnt *clnt;
+	struct rpc_create_args cargs = *args;	/* private copy handed to rpc_new_client() */
+	struct xprt_create xprtargs = {
+		.net = args->net,
+		.ident = args->protocol,
+		.srcaddr = args->saddress,
+		.dstaddr = args->address,
+		.addrlen = args->addrsize,
+		.bc_xprt = args->bc_xprt,
+	};
+	char servername[48];
+
+	/*
+	 * If the caller did not specify a hostname, build one from the
+	 * address in our private copy; @args must never point at our stack.
+	 */
+	if (args->servername == NULL) {
+		struct sockaddr_un *sun = (struct sockaddr_un *)args->address;
+		struct sockaddr_in *sin =
+				(struct sockaddr_in *)args->address;
+		struct sockaddr_in6 *sin6 =
+				(struct sockaddr_in6 *)args->address;
+
+		servername[0] = '\0';
+		switch (args->address->sa_family) {
+		case AF_LOCAL:
+			snprintf(servername, sizeof(servername), "%s",
+				 sun->sun_path);
+			break;
+		case AF_INET:
+			snprintf(servername, sizeof(servername), "%pI4",
+				 &sin->sin_addr.s_addr);
+			break;
+		case AF_INET6:
+			snprintf(servername, sizeof(servername), "%pI6",
+				 &sin6->sin6_addr);
+			break;
+		default:
+			/* caller wants default server name, but
+			 * address family isn't recognized. */
+			return ERR_PTR(-EINVAL);
+		}
+		cargs.servername = servername;
+	}
+
+	xprt = xprt_create_transport(&xprtargs);
+	if (IS_ERR(xprt))
+		return (struct rpc_clnt *)xprt;
+
+	/*
+	 * By default, kernel RPC client connects from a reserved port.
+	 * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters,
+	 * but it is always enabled for rpciod, which handles the connect
+	 * operation.
+	 */
+	xprt->resvport = 1;
+	if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
+		xprt->resvport = 0;
+
+	clnt = rpc_new_client(&cargs, xprt);
+	if (IS_ERR(clnt))
+		return clnt;
+
+	if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
+		int err = rpc_ping(clnt);
+		if (err != 0) {
+			rpc_shutdown_client(clnt);
+			return ERR_PTR(err);
+		}
+	}
+
+	clnt->cl_softrtry = 1;
+	if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
+		clnt->cl_softrtry = 0;
+
+	if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
+		clnt->cl_autobind = 1;
+	if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
+		clnt->cl_discrtry = 1;
+	if (!(args->flags & RPC_CLNT_CREATE_QUIET))
+		clnt->cl_chatty = 1;
+
+	return clnt;
+}
+EXPORT_SYMBOL_GPL(rpc_create);
+
+/*
+ * Clone an RPC client so that the same transport can be shared while
+ * parameters such as the authentication flavour are varied.  Returns the
+ * new client, or an ERR_PTR() on failure.
+ */
+struct rpc_clnt *
+rpc_clone_client(struct rpc_clnt *clnt)
+{
+	struct rpc_clnt *new;
+	int err = -ENOMEM;
+
+	new = kmemdup(clnt, sizeof(*new), GFP_KERNEL);	/* start from a byte copy */
+	if (!new)
+		goto out_no_clnt;
+	new->cl_parent = clnt;
+	/* Turn off autobind on clones */
+	new->cl_autobind = 0;
+	INIT_LIST_HEAD(&new->cl_tasks);	/* the clone starts with no tasks */
+	spin_lock_init(&new->cl_lock);
+	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval);
+	new->cl_metrics = rpc_alloc_iostats(clnt);
+	if (new->cl_metrics == NULL)
+		goto out_no_stats;
+	if (clnt->cl_principal) {
+		new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL);
+		if (new->cl_principal == NULL)
+			goto out_no_principal;
+	}
+	atomic_set(&new->cl_count, 1);
+	err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
+	if (err != 0)
+		goto out_no_path;
+	if (new->cl_auth)
+		atomic_inc(&new->cl_auth->au_count);	/* auth handle is shared with the parent */
+	xprt_get(clnt->cl_xprt);	/* so is the transport */
+	atomic_inc(&clnt->cl_count);	/* clone pins its parent; dropped in rpc_free_client() */
+	rpc_register_client(new);
+	rpciod_up();	/* NOTE(review): return value is ignored here */
+	return new;
+out_no_path:
+	kfree(new->cl_principal);
+out_no_principal:
+	rpc_free_iostats(new->cl_metrics);
+out_no_stats:
+	kfree(new);
+out_no_clnt:
+	dprintk("RPC:       %s: returned error %d\n", __func__, err);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(rpc_clone_client);
+
+/*
+ * Mark every activated task of the client as killed and exit it with -EIO.
+ * XXX: kill their descendants as well?
+ */
+void rpc_killall_tasks(struct rpc_clnt *clnt)
+{
+	struct rpc_task	*task;
+
+	if (list_empty(&clnt->cl_tasks))
+		return;
+
+	dprintk("RPC:       killing all tasks for client %p\n", clnt);
+	/* cl_lock is held while cl_tasks is walked. */
+	spin_lock(&clnt->cl_lock);
+	list_for_each_entry(task, &clnt->cl_tasks, tk_task) {
+		/* Skip tasks that are not activated or were already killed. */
+		if (!RPC_IS_ACTIVATED(task))
+			continue;
+		if (task->tk_flags & RPC_TASK_KILLED)
+			continue;
+
+		task->tk_flags |= RPC_TASK_KILLED;
+		rpc_exit(task, -EIO);
+		/* Queued tasks are woken explicitly. */
+		if (RPC_IS_QUEUED(task))
+			rpc_wake_up_queued_task(task->tk_waitqueue, task);
+	}
+	spin_unlock(&clnt->cl_lock);
+}
+EXPORT_SYMBOL_GPL(rpc_killall_tasks);
+
+/*
+ * Shut an RPC client down: keep killing its tasks until none are
+ * left on cl_tasks, then drop the caller's reference.
+ */
+void rpc_shutdown_client(struct rpc_clnt *clnt)
+{
+	struct list_head *pending = &clnt->cl_tasks;
+
+	dprintk("RPC:       shutting down %s client for %s\n",
+			clnt->cl_protname, clnt->cl_server);
+	while (!list_empty(pending)) {
+		rpc_killall_tasks(clnt);
+		wait_event_timeout(destroy_wait, list_empty(pending), 1*HZ);
+	}
+
+	rpc_release_client(clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_shutdown_client);
+
+/*
+ * Free an RPC client and the resources it holds
+ */
+static void
+rpc_free_client(struct rpc_clnt *clnt)
+{
+	dprintk("RPC:       destroying %s client for %s\n",
+			clnt->cl_protname, clnt->cl_server);
+	if (!IS_ERR(clnt->cl_path.dentry)) {
+		rpc_remove_client_dir(clnt->cl_path.dentry);
+		rpc_put_mount();
+	}
+	/*
+	 * Clone: drop the parent reference.  Otherwise: free a heap name.
+	 */
+	if (clnt->cl_parent != clnt)
+		rpc_release_client(clnt->cl_parent);
+	else if (clnt->cl_server != clnt->cl_inline_name)
+		kfree(clnt->cl_server);
+	rpc_unregister_client(clnt);
+	rpc_free_iostats(clnt->cl_metrics);
+	kfree(clnt->cl_principal);
+	clnt->cl_metrics = NULL;
+	xprt_put(clnt->cl_xprt);
+	rpciod_down();
+	kfree(clnt);
+}
+
+/*
+ * Release the client's auth handle (if any), then free the client
+ */
+static void
+rpc_free_auth(struct rpc_clnt *clnt)
+{
+	if (clnt->cl_auth == NULL) {
+		rpc_free_client(clnt);	/* no auth handle left: free right away */
+		return;
+	}
+
+	/*
+	 * Note: RPCSEC_GSS may need to send NULL RPC calls in order to
+	 *       release remaining GSS contexts. This mechanism ensures
+	 *       that it can do so safely.
+	 */
+	atomic_inc(&clnt->cl_count);	/* temporary reference across rpcauth_release() */
+	rpcauth_release(clnt->cl_auth);
+	clnt->cl_auth = NULL;
+	if (atomic_dec_and_test(&clnt->cl_count))
+		rpc_free_client(clnt);
+}
+
+/*
+ * Release reference to the RPC client; the last one triggers rpc_free_auth()
+ */
+void
+rpc_release_client(struct rpc_clnt *clnt)
+{
+	dprintk("RPC:       rpc_release_client(%p)\n", clnt);
+
+	if (list_empty(&clnt->cl_tasks))
+		wake_up(&destroy_wait);	/* rpc_shutdown_client() waits for an empty cl_tasks */
+	if (atomic_dec_and_test(&clnt->cl_count))
+		rpc_free_auth(clnt);
+}
+
+/**
+ * rpc_bind_new_program - bind a new RPC program to an existing client
+ * @old: old rpc_client
+ * @program: rpc program to set
+ * @vers: rpc program version
+ *
+ * Clones @old, points the clone at @program version @vers and pings the
+ * server with it, so that different RPC programs can share one transport
+ * (the Sun NFSv2/v3 ACL protocol can do this).  Returns clone or ERR_PTR().
+ */
+struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
+				      struct rpc_program *program,
+				      u32 vers)
+{
+	struct rpc_version *version;
+	struct rpc_clnt *clnt;
+	int err;
+
+	BUG_ON(vers >= program->nrvers || !program->version[vers]);
+	version = program->version[vers];
+	clnt = rpc_clone_client(old);
+	if (IS_ERR(clnt))
+		return clnt;
+
+	clnt->cl_stats    = program->stats;
+	clnt->cl_vers     = version->number;
+	clnt->cl_prog     = program->number;
+	clnt->cl_protname = program->name;
+	clnt->cl_maxproc  = version->nrprocs;
+	clnt->cl_procinfo = version->procs;
+	err = rpc_ping(clnt);
+	if (err != 0) {
+		rpc_shutdown_client(clnt);
+		return ERR_PTR(err);
+	}
+	return clnt;
+}
+EXPORT_SYMBOL_GPL(rpc_bind_new_program);
+
+void rpc_task_release_client(struct rpc_task *task)
+{
+	struct rpc_clnt *clnt = task->tk_client;
+
+	if (clnt == NULL)
+		return;
+
+	/* Unlink the task from the client's cl_tasks list */
+	spin_lock(&clnt->cl_lock);
+	list_del(&task->tk_task);
+	spin_unlock(&clnt->cl_lock);
+	task->tk_client = NULL;
+	rpc_release_client(clnt);
+}
+
+static
+void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+	if (clnt == NULL)
+		return;
+	rpc_task_release_client(task);	/* detach from any previous client */
+	task->tk_client = clnt;
+	atomic_inc(&clnt->cl_count);
+	if (clnt->cl_softrtry)
+		task->tk_flags |= RPC_TASK_SOFT;
+	/* Queue the task on the client's cl_tasks list */
+	spin_lock(&clnt->cl_lock);
+	list_add_tail(&task->tk_task, &clnt->cl_tasks);
+	spin_unlock(&clnt->cl_lock);
+}
+
+void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+	rpc_task_release_client(task);	/* always detach, even when clnt is NULL */
+	rpc_task_set_client(task, clnt);	/* does nothing for a NULL clnt */
+}
+EXPORT_SYMBOL_GPL(rpc_task_reset_client);
+
+
+static void
+rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
+{
+	if (msg == NULL)
+		return;
+	task->tk_msg.rpc_resp = msg->rpc_resp;
+	task->tk_msg.rpc_argp = msg->rpc_argp;
+	task->tk_msg.rpc_proc = msg->rpc_proc;
+	if (msg->rpc_cred != NULL)	/* store what get_rpccred() returns */
+		task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred);
+}
+
+/*
+ * Default rpc_call_done callback: intentionally empty (see rpc_default_ops)
+ */
+static void
+rpc_default_callback(struct rpc_task *task, void *data)
+{
+}
+
+static const struct rpc_call_ops rpc_default_ops = {
+	.rpc_call_done = rpc_default_callback,	/* ops used by rpc_call_sync() */
+};
+
+/**
+ * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
+ * @task_setup_data: pointer to task initialisation data
+ */
+struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
+{
+	struct rpc_task *task = rpc_new_task(task_setup_data);
+
+	if (IS_ERR(task))
+		return task;
+
+	rpc_task_set_client(task, task_setup_data->rpc_client);
+	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
+
+	if (task->tk_action == NULL)
+		rpc_call_start(task);
+
+	/* Extra count: callers in this file drop it with rpc_put_task(). */
+	atomic_inc(&task->tk_count);
+	rpc_execute(task);
+
+	return task;
+}
+EXPORT_SYMBOL_GPL(rpc_run_task);
+
+/**
+ * rpc_call_sync - Perform a synchronous RPC call
+ * @clnt: pointer to RPC client
+ * @msg: RPC call parameters
+ * @flags: RPC call flags; RPC_TASK_ASYNC must not be set (BUG otherwise)
+ */
+int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags)
+{
+	struct rpc_task_setup setup = {
+		.flags = flags,
+		.callback_ops = &rpc_default_ops,
+		.rpc_message = msg,
+		.rpc_client = clnt,
+	};
+	struct rpc_task	*task;
+	int ret;
+
+	BUG_ON(flags & RPC_TASK_ASYNC);
+
+	task = rpc_run_task(&setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	ret = task->tk_status;	/* read the result before dropping the task */
+	rpc_put_task(task);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_call_sync);
+
+/**
+ * rpc_call_async - Perform an asynchronous RPC call
+ * @clnt: pointer to RPC client
+ * @msg: RPC call parameters
+ * @flags: RPC call flags (RPC_TASK_ASYNC is added automatically)
+ * @tk_ops: RPC call ops
+ * @data: user call data
+ */
+int
+rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags,
+	       const struct rpc_call_ops *tk_ops, void *data)
+{
+	struct rpc_task_setup setup = {
+		.flags = flags|RPC_TASK_ASYNC,
+		.callback_data = data,
+		.callback_ops = tk_ops,
+		.rpc_message = msg,
+		.rpc_client = clnt,
+	};
+	struct rpc_task	*task;
+
+	task = rpc_run_task(&setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rpc_call_async);
+
+#if defined(CONFIG_NFS_V4_1)
+/**
+ * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
+ * rpc_execute against it
+ * @req: RPC request
+ * @tk_ops: RPC call ops
+ */
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
+				const struct rpc_call_ops *tk_ops)
+{
+	struct rpc_task *task;
+	struct xdr_buf *xbufp = &req->rq_snd_buf;
+	struct rpc_task_setup task_setup_data = {
+		.callback_ops = tk_ops,
+	};
+
+	dprintk("RPC: rpc_run_bc_task req= %p\n", req);
+	/*
+	 * Create an rpc_task to send the data
+	 */
+	task = rpc_new_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		xprt_free_bc_request(req);	/* @req is given up when no task can be made */
+		goto out;
+	}
+	task->tk_rqstp = req;
+
+	/*
+	 * Set up the xdr_buf length.
+	 * This also indicates that the buffer is XDR encoded already.
+	 */
+	xbufp->len = xbufp->head[0].iov_len + xbufp->page_len +
+			xbufp->tail[0].iov_len;
+
+	task->tk_action = call_bc_transmit;	/* start directly in the backchannel send state */
+	atomic_inc(&task->tk_count);
+	BUG_ON(atomic_read(&task->tk_count) != 2);	/* expects exactly two references here */
+	rpc_execute(task);
+
+out:
+	dprintk("RPC: rpc_run_bc_task: task= %p\n", task);
+	return task;	/* may be an ERR_PTR() from rpc_new_task() */
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+void
+rpc_call_start(struct rpc_task *task)
+{
+	task->tk_action = call_start;	/* next FSM step: the initial call_start state */
+}
+EXPORT_SYMBOL_GPL(rpc_call_start);
+
+/**
+ * rpc_peeraddr - extract remote peer address from clnt's xprt
+ * @clnt: RPC client structure
+ * @buf: target buffer
+ * @bufsize: length of target buffer
+ *
+ * Returns xprt->addrlen, which may differ from the number of bytes copied.
+ */
+size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize)
+{
+	struct rpc_xprt *xprt = clnt->cl_xprt;
+	size_t copylen;
+
+	/* Never write more than the caller's buffer can hold. */
+	copylen = bufsize < sizeof(xprt->addr) ? bufsize : sizeof(xprt->addr);
+	memcpy(buf, &xprt->addr, copylen);
+
+	return xprt->addrlen;
+}
+EXPORT_SYMBOL_GPL(rpc_peeraddr);
+
+/**
+ * rpc_peeraddr2str - return remote peer address in printable format
+ * @clnt: RPC client structure
+ * @format: address format
+ * Returns the transport's string for @format, or "unprintable" if unset.
+ */
+const char *rpc_peeraddr2str(struct rpc_clnt *clnt,
+			     enum rpc_display_format_t format)
+{
+	const char *str = clnt->cl_xprt->address_strings[format];
+
+	/* A NULL slot means the transport has no string for this format. */
+	if (str == NULL)
+		return "unprintable";
+	return str;
+}
+EXPORT_SYMBOL_GPL(rpc_peeraddr2str);
+
+void
+rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize)
+{
+	struct rpc_xprt *xprt = clnt->cl_xprt;
+	if (xprt->ops->set_buffer_size)	/* optional transport op; nothing happens if unset */
+		xprt->ops->set_buffer_size(xprt, sndsize, rcvsize);
+}
+EXPORT_SYMBOL_GPL(rpc_setbufsize);
+
+/*
+ * Return the transport's max_payload: largest payload @clnt supports, in bytes
+ *
+ * For stream transports, this is one RPC record fragment (see RFC
+ * 1831), as we don't support multi-record requests yet.  For datagram
+ * transports, this is the size of an IP packet minus the IP, UDP, and
+ * RPC header sizes.
+ */
+size_t rpc_max_payload(struct rpc_clnt *clnt)
+{
+	return clnt->cl_xprt->max_payload;
+}
+EXPORT_SYMBOL_GPL(rpc_max_payload);
+
+/**
+ * rpc_force_rebind - force transport to check that remote port is unchanged
+ * @clnt: client to rebind
+ *
+ * Calls xprt_clear_bound() on the client's transport if cl_autobind is set.
+ */
+void rpc_force_rebind(struct rpc_clnt *clnt)
+{
+	if (clnt->cl_autobind)
+		xprt_clear_bound(clnt->cl_xprt);
+}
+EXPORT_SYMBOL_GPL(rpc_force_rebind);
+
+/*
+ * Restart an (async) RPC call from the call_prepare state; usually called
+ * from the exit handler.  Returns 0 if RPC_ASSASSINATED(task), else 1.
+ */
+int
+rpc_restart_call_prepare(struct rpc_task *task)
+{
+	if (RPC_ASSASSINATED(task))
+		return 0;	/* tk_action is left untouched */
+	task->tk_action = rpc_prepare_task;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
+
+/*
+ * Restart an (async) RPC call at call_start. Usually called from within
+ * the exit handler.  Returns 0 if RPC_ASSASSINATED(task), else 1.
+ */
+int
+rpc_restart_call(struct rpc_task *task)
+{
+	if (RPC_ASSASSINATED(task))
+		return 0;	/* tk_action is left untouched */
+	task->tk_action = call_start;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(rpc_restart_call);
+
+#ifdef RPC_DEBUG
+static const char *rpc_proc_name(const struct rpc_task *task)
+{
+	const struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+
+	/* A task without a procedure and a procedure without a name */
+	/* are reported with different placeholder strings. */
+	if (proc == NULL)
+		return "no proc";
+	if (proc->p_name == NULL)
+		return "NULL";
+	return proc->p_name;
+}
+#endif
+
+/*
+ * 0.  Initial state
+ *
+ *     Other FSM states can be visited zero or more times, but
+ *     this state is visited exactly once for each RPC.
+ */
+static void
+call_start(struct rpc_task *task)
+{
+	struct rpc_clnt	*clnt = task->tk_client;
+
+	dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid,
+			clnt->cl_protname, clnt->cl_vers,
+			rpc_proc_name(task),
+			(RPC_IS_ASYNC(task) ? "async" : "sync"));
+
+	/* Increment the per-procedure and the per-client-stats call counts */
+	task->tk_msg.rpc_proc->p_count++;
+	clnt->cl_stats->rpccnt++;
+	task->tk_action = call_reserve;	/* next state: reserve a request slot */
+}
+
+/*
+ * 1.	Reserve an RPC call slot
+ */
+static void
+call_reserve(struct rpc_task *task)
+{
+	dprint_status(task);
+
+	task->tk_status  = 0;
+	task->tk_action  = call_reserveresult;	/* set before xprt_reserve() is called */
+	xprt_reserve(task);	/* outcome is examined in call_reserveresult() */
+}
+
+/*
+ * 1b.	Grok the result of xprt_reserve()
+ */
+static void
+call_reserveresult(struct rpc_task *task)
+{
+	int status = task->tk_status;	/* saved: tk_status is cleared below */
+
+	dprint_status(task);
+
+	/*
+	 * After a call to xprt_reserve(), we must have either
+	 * a request slot or else an error status.
+	 */
+	task->tk_status = 0;
+	if (status >= 0) {
+		if (task->tk_rqstp) {
+			task->tk_action = call_refresh;	/* slot obtained: go refresh credentials */
+			return;
+		}
+
+		printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
+				__func__, status);
+		rpc_exit(task, -EIO);
+		return;
+	}
+
+	/*
+	 * Even though there was an error, we may have acquired
+	 * a request slot somehow.  Make sure not to leak it.
+	 */
+	if (task->tk_rqstp) {
+		printk(KERN_ERR "%s: status=%d, request allocated anyway\n",
+				__func__, status);
+		xprt_release(task);
+	}
+
+	switch (status) {
+	case -EAGAIN:	/* woken up; retry */
+		task->tk_action = call_reserve;
+		return;
+	case -EIO:	/* probably a shutdown */
+		break;
+	default:
+		printk(KERN_ERR "%s: unrecognized error %d, exiting\n",
+				__func__, status);
+		break;
+	}
+	rpc_exit(task, status);	/* -EIO and unknown errors both end the task */
+}
+
+/*
+ * 2.	Bind and/or refresh the credentials
+ */
+static void
+call_refresh(struct rpc_task *task)
+{
+	dprint_status(task);
+
+	task->tk_action = call_refreshresult;	/* set before the refresh is started */
+	task->tk_status = 0;
+	task->tk_client->cl_stats->rpcauthrefresh++;	/* count every refresh attempt */
+	rpcauth_refreshcred(task);
+}
+
+/*
+ * 2a.	Process the results of a credential refresh
+ */
+static void
+call_refreshresult(struct rpc_task *task)
+{
+	int status = task->tk_status;	/* saved: tk_status is cleared below */
+
+	dprint_status(task);
+
+	task->tk_status = 0;
+	task->tk_action = call_refresh;	/* default: go round again */
+	switch (status) {
+	case 0:
+		if (rpcauth_uptodatecred(task))
+			task->tk_action = call_allocate;
+		return;
+	case -ETIMEDOUT:
+		rpc_delay(task, 3*HZ);	/* then fall through to the retry accounting */
+	case -EAGAIN:
+		status = -EACCES;	/* reported if no retries are left */
+		if (!task->tk_cred_retry)
+			break;
+		task->tk_cred_retry--;
+		dprintk("RPC: %5u %s: retry refresh creds\n",
+				task->tk_pid, __func__);
+		return;
+	}
+	dprintk("RPC: %5u %s: refresh creds failed with error %d\n",
+				task->tk_pid, __func__, status);
+	rpc_exit(task, status);
+}
+
+/*
+ * 2b.	Allocate the buffer. For details, see sched.c:rpc_malloc.
+ *	(Note: buffer memory is freed in xprt_release).
+ */
+static void
+call_allocate(struct rpc_task *task)
+{
+	unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+
+	dprint_status(task);
+
+	task->tk_status = 0;
+	task->tk_action = call_bind;
+
+	if (req->rq_buffer)
+		return;	/* a buffer is already attached to the request */
+
+	if (proc->p_proc != 0) {
+		BUG_ON(proc->p_arglen == 0);
+		if (proc->p_decode != NULL)
+			BUG_ON(proc->p_replen == 0);
+	}
+
+	/*
+	 * Calculate the size (in quads) of the RPC call
+	 * and reply headers, and convert both values
+	 * to byte sizes.
+	 */
+	req->rq_callsize = RPC_CALLHDRSIZE + (slack << 1) + proc->p_arglen;
+	req->rq_callsize <<= 2;
+	req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
+	req->rq_rcvsize <<= 2;
+
+	req->rq_buffer = xprt->ops->buf_alloc(task,
+					req->rq_callsize + req->rq_rcvsize);	/* one buffer for both */
+	if (req->rq_buffer != NULL)
+		return;
+
+	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
+
+	if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) {
+		task->tk_action = call_allocate;	/* come back here after the delay */
+		rpc_delay(task, HZ>>4);
+		return;
+	}
+
+	rpc_exit(task, -ERESTARTSYS);	/* sync task with a fatal signal pending */
+}
+
+static inline int
+rpc_task_need_encode(struct rpc_task *task)
+{
+	return task->tk_rqstp->rq_snd_buf.len == 0;	/* true while the send buffer length is 0 */
+}
+
+static inline void
+rpc_task_force_reencode(struct rpc_task *task)
+{
+	task->tk_rqstp->rq_snd_buf.len = 0;	/* makes rpc_task_need_encode() true again */
+	task->tk_rqstp->rq_bytes_sent = 0;	/* call_transmit() BUGs unless this is 0 then */
+}
+
+static inline void
+rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
+{
+	buf->flags = 0;
+	buf->len = 0;			/* length is reset to zero */
+	buf->buflen = len;
+	buf->page_len = 0;
+	buf->tail[0].iov_len = 0;
+	buf->head[0].iov_len = len;	/* head covers the whole region */
+	buf->head[0].iov_base = start;
+}
+
+/*
+ * 3.	Encode arguments of an RPC call
+ */
+static void
+rpc_xdr_encode(struct rpc_task *task)
+{
+	struct rpc_rqst	*req = task->tk_rqstp;
+	kxdreproc_t	encode;
+	__be32		*p;
+
+	dprint_status(task);
+
+	rpc_xdr_buf_init(&req->rq_snd_buf,
+			 req->rq_buffer,
+			 req->rq_callsize);	/* send area: first rq_callsize bytes */
+	rpc_xdr_buf_init(&req->rq_rcv_buf,
+			 (char *)req->rq_buffer + req->rq_callsize,
+			 req->rq_rcvsize);	/* receive area follows the send area */
+
+	p = rpc_encode_header(task);
+	if (p == NULL) {
+		printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
+		rpc_exit(task, -EIO);
+		return;
+	}
+
+	encode = task->tk_msg.rpc_proc->p_encode;
+	if (encode == NULL)
+		return;	/* procedure has no encode routine: header only */
+
+	task->tk_status = rpcauth_wrap_req(task, encode, req, p,
+			task->tk_msg.rpc_argp);
+}
+
+/*
+ * 4.	Get the server port number if not yet set
+ */
+static void
+call_bind(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+
+	dprint_status(task);
+
+	task->tk_action = call_connect;	/* default when the transport is already bound */
+	if (!xprt_bound(xprt)) {
+		task->tk_action = call_bind_status;
+		task->tk_timeout = xprt->bind_timeout;
+		xprt->ops->rpcbind(task);	/* result is sorted out in call_bind_status() */
+	}
+}
+
+/*
+ * 4a.	Sort out bind result
+ */
+static void
+call_bind_status(struct rpc_task *task)
+{
+	int status = -EIO;	/* exit code unless a case below overrides it */
+
+	if (task->tk_status >= 0) {
+		dprint_status(task);
+		task->tk_status = 0;
+		task->tk_action = call_connect;
+		return;
+	}
+
+	switch (task->tk_status) {
+	case -ENOMEM:
+		dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
+		rpc_delay(task, HZ >> 2);
+		goto retry_timeout;
+	case -EACCES:
+		dprintk("RPC: %5u remote rpcbind: RPC program/version "
+				"unavailable\n", task->tk_pid);
+		/* fail immediately if this is an RPC ping */
+		if (task->tk_msg.rpc_proc->p_proc == 0) {
+			status = -EOPNOTSUPP;
+			break;
+		}
+		if (task->tk_rebind_retry == 0)
+			break;	/* retries used up: exit with -EIO */
+		task->tk_rebind_retry--;
+		rpc_delay(task, 3*HZ);
+		goto retry_timeout;
+	case -ETIMEDOUT:
+		dprintk("RPC: %5u rpcbind request timed out\n",
+				task->tk_pid);
+		goto retry_timeout;
+	case -EPFNOSUPPORT:
+		/* server doesn't support any rpcbind version we know of */
+		dprintk("RPC: %5u unrecognized remote rpcbind service\n",
+				task->tk_pid);
+		break;
+	case -EPROTONOSUPPORT:
+		dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
+				task->tk_pid);
+		task->tk_status = 0;
+		task->tk_action = call_bind;
+		return;
+	case -ECONNREFUSED:		/* connection problems */
+	case -ECONNRESET:
+	case -ENOTCONN:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -EPIPE:
+		dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
+				task->tk_pid, task->tk_status);
+		if (!RPC_IS_SOFTCONN(task)) {
+			rpc_delay(task, 5*HZ);
+			goto retry_timeout;
+		}
+		status = task->tk_status;	/* SOFTCONN tasks exit with the real error */
+		break;
+	default:
+		dprintk("RPC: %5u unrecognized rpcbind error (%d)\n",
+				task->tk_pid, -task->tk_status);
+	}
+
+	rpc_exit(task, status);
+	return;
+
+retry_timeout:
+	task->tk_action = call_timeout;	/* tk_status is left set for call_timeout() */
+}
+
+/*
+ * 4b.	Connect to the RPC server
+ */
+static void
+call_connect(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+
+	dprintk("RPC: %5u call_connect xprt %p %s connected\n",
+			task->tk_pid, xprt,
+			(xprt_connected(xprt) ? "is" : "is not"));
+
+	task->tk_action = call_transmit;	/* default when already connected */
+	if (!xprt_connected(xprt)) {
+		task->tk_action = call_connect_status;
+		if (task->tk_status < 0)
+			return;	/* leave the pending error to call_connect_status() */
+		xprt_connect(task);
+	}
+}
+
+/*
+ * 4c.	Sort out connect result
+ */
+static void
+call_connect_status(struct rpc_task *task)
+{
+	struct rpc_clnt *clnt = task->tk_client;
+	int status = task->tk_status;	/* saved: tk_status is cleared below */
+
+	dprint_status(task);
+
+	task->tk_status = 0;
+	if (status >= 0 || status == -EAGAIN) {
+		clnt->cl_stats->netreconn++;	/* counted for -EAGAIN as well as success */
+		task->tk_action = call_transmit;
+		return;
+	}
+
+	switch (status) {
+		/* if soft mounted, test if we've timed out */
+	case -ETIMEDOUT:
+		task->tk_action = call_timeout;
+		break;
+	default:
+		rpc_exit(task, -EIO);	/* every other error is reported as -EIO */
+	}
+}
+
+/*
+ * 5.	Transmit the RPC request, and wait for reply
+ */
+static void
+call_transmit(struct rpc_task *task)
+{
+	dprint_status(task);
+
+	task->tk_action = call_status;	/* where any early return below ends up */
+	if (task->tk_status < 0)
+		return;
+	task->tk_status = xprt_prepare_transmit(task);
+	if (task->tk_status != 0)
+		return;
+	task->tk_action = call_transmit_status;
+	/* Encode here so that rpcsec_gss can use correct sequence number. */
+	if (rpc_task_need_encode(task)) {
+		BUG_ON(task->tk_rqstp->rq_bytes_sent != 0);
+		rpc_xdr_encode(task);
+		/* Did the encode result in an error condition? */
+		if (task->tk_status != 0) {
+			/* Was the error nonfatal? */
+			if (task->tk_status == -EAGAIN)
+				rpc_delay(task, HZ >> 4);
+			else
+				rpc_exit(task, task->tk_status);
+			return;
+		}
+	}
+	xprt_transmit(task);
+	if (task->tk_status < 0)
+		return;	/* call_transmit_status() runs as the next action */
+	/*
+	 * On success, ensure that we call xprt_end_transmit() before sleeping
+	 * in order to allow access to the socket to other RPC requests.
+	 */
+	call_transmit_status(task);
+	if (rpc_reply_expected(task))
+		return;
+	task->tk_action = rpc_exit_task;	/* no reply expected: finish the task */
+	rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
+}
+
+/*
+ * 5a.	Handle cleanup after a transmission
+ *
+ *	Always selects call_status as the next state.  What happens to
+ *	the transmit hold depends on tk_status:
+ *	  0 and unlisted errors: xprt_end_transmit() and force a
+ *	    re-encode (unlisted errors are also logged via dprint_status);
+ *	  -EAGAIN: nothing further is done here;
+ *	  -ECONNREFUSED, -EHOSTDOWN, -EHOSTUNREACH, -ENETUNREACH: a
+ *	    soft-connect task ends the transmit and exits with the error,
+ *	    any other task is handled like the next group;
+ *	  -ECONNRESET, -ENOTCONN, -EPIPE: only a re-encode is forced;
+ *	    xprt_end_transmit() is not called from here.
+ */
+static void
+call_transmit_status(struct rpc_task *task)
+{
+	task->tk_action = call_status;
+
+	/*
+	 * Common case: success.  Force the compiler to put this
+	 * test first.
+	 */
+	if (task->tk_status == 0) {
+		xprt_end_transmit(task);
+		rpc_task_force_reencode(task);
+		return;
+	}
+
+	switch (task->tk_status) {
+	case -EAGAIN:
+		break;
+	default:
+		dprint_status(task);
+		xprt_end_transmit(task);
+		rpc_task_force_reencode(task);
+		break;
+		/*
+		 * Special cases: if we've been waiting on the
+		 * socket's write_space() callback, or if the
+		 * socket just returned a connection error,
+		 * then hold onto the transport lock.
+		 */
+	case -ECONNREFUSED:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		if (RPC_IS_SOFTCONN(task)) {
+			xprt_end_transmit(task);
+			rpc_exit(task, task->tk_status);
+			break;
+		}	/* not soft-connect: fall through */
+	case -ECONNRESET:
+	case -ENOTCONN:
+	case -EPIPE:
+		rpc_task_force_reencode(task);
+	}
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * 5b.	Send the backchannel RPC reply.  On error, drop the reply.  In
+ * addition, disconnect on connectivity errors.
+ *
+ *	Must be entered with tk_status == 0 (enforced by BUG_ON).  If
+ *	xprt_prepare_transmit() returns -EAGAIN this state is re-armed to
+ *	run again; any other failure is logged and the task proceeds to
+ *	rpc_exit_task without transmitting.  After the transmit attempt
+ *	the task is woken from the request transport's pending queue.
+ */
+static void
+call_bc_transmit(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	BUG_ON(task->tk_status != 0);
+	task->tk_status = xprt_prepare_transmit(task);
+	if (task->tk_status == -EAGAIN) {
+		/*
+		 * Could not reserve the transport. Try again after the
+		 * transport is released.
+		 */
+		task->tk_status = 0;
+		task->tk_action = call_bc_transmit;
+		return;
+	}
+
+	task->tk_action = rpc_exit_task;
+	if (task->tk_status < 0) {
+		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+			"error: %d\n", task->tk_status);
+		return;
+	}
+
+	xprt_transmit(task);
+	xprt_end_transmit(task);
+	dprint_status(task);
+	switch (task->tk_status) {
+	case 0:
+		/* Success */
+		break;
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+	case -ETIMEDOUT:
+		/*
+		 * Problem reaching the server.  Disconnect and let the
+		 * forechannel reestablish the connection.  The server will
+		 * have to retransmit the backchannel request and we'll
+		 * reprocess it.  Since these ops are idempotent, there's no
+		 * need to cache our reply at this time.
+		 */
+		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+			"error: %d\n", task->tk_status);
+		xprt_conditional_disconnect(task->tk_xprt,
+			req->rq_connect_cookie);
+		break;
+	default:
+		/*
+		 * We were unable to reply and will have to drop the
+		 * request.  The server should reconnect and retransmit.
+		 */
+		BUG_ON(task->tk_status == -EAGAIN);
+		printk(KERN_NOTICE "RPC: Could not send backchannel reply "
+			"error: %d\n", task->tk_status);
+		break;
+	}
+	rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * 6.	Sort out the RPC call status
+ *
+ *	If reply bytes have been received and rq_bytes_sent is zero, the
+ *	received byte count replaces tk_status.  A non-negative status
+ *	proceeds to call_decode.  Otherwise tk_status is cleared and the
+ *	error selects the next state:
+ *	  host/network unreachable: delay 3 seconds, then as -ETIMEDOUT;
+ *	  -ETIMEDOUT: call_timeout, disconnecting first if cl_discrtry;
+ *	  -ECONNRESET, -ECONNREFUSED: force a rebind, delay 3 seconds,
+ *	    then as -EPIPE;
+ *	  -EPIPE, -ENOTCONN: call_bind;
+ *	  -EAGAIN: call_transmit;
+ *	  anything else terminates the task with that error.
+ */
+static void
+call_status(struct rpc_task *task)
+{
+	struct rpc_clnt	*clnt = task->tk_client;
+	struct rpc_rqst	*req = task->tk_rqstp;
+	int		status;
+
+	if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent)
+		task->tk_status = req->rq_reply_bytes_recvd;
+
+	dprint_status(task);
+
+	status = task->tk_status;
+	if (status >= 0) {
+		task->tk_action = call_decode;
+		return;
+	}
+
+	task->tk_status = 0;
+	switch(status) {
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		/*
+		 * Delay any retries for 3 seconds, then handle as if it
+		 * were a timeout.
+		 */
+		rpc_delay(task, 3*HZ);	/* fall through */
+	case -ETIMEDOUT:
+		task->tk_action = call_timeout;
+		if (task->tk_client->cl_discrtry)
+			xprt_conditional_disconnect(task->tk_xprt,
+					req->rq_connect_cookie);
+		break;
+	case -ECONNRESET:
+	case -ECONNREFUSED:
+		rpc_force_rebind(clnt);
+		rpc_delay(task, 3*HZ);	/* fall through */
+	case -EPIPE:
+	case -ENOTCONN:
+		task->tk_action = call_bind;
+		break;
+	case -EAGAIN:
+		task->tk_action = call_transmit;
+		break;
+	case -EIO:
+		/* shutdown or soft timeout */
+		rpc_exit(task, status);
+		break;
+	default:
+		if (clnt->cl_chatty)
+			printk("%s: RPC call returned error %d\n",
+			       clnt->cl_protname, -status);
+		rpc_exit(task, status);
+	}
+}
+
+/*
+ * 6a.	Handle RPC timeout
+ * 	We do not release the request slot, so we keep using the
+ *	same XID for all retransmits.
+ *
+ *	When xprt_adjust_timeout() returns 0 the timeout is "minor" and
+ *	the call is simply retried.  Otherwise it is "major": tk_timeouts
+ *	is incremented, a soft-connect task exits with -ETIMEDOUT, and a
+ *	soft task exits with -ETIMEDOUT if RPC_TASK_TIMEOUT is set or
+ *	-EIO if not.  Any other task logs "still trying" once (tracked
+ *	by RPC_CALL_MAJORSEEN), forces a rebind, invalidates its
+ *	credential and retries.  Every retry bumps rpcretrans and
+ *	restarts at call_bind with tk_status cleared.
+ */
+static void
+call_timeout(struct rpc_task *task)
+{
+	struct rpc_clnt	*clnt = task->tk_client;
+
+	if (xprt_adjust_timeout(task->tk_rqstp) == 0) {
+		dprintk("RPC: %5u call_timeout (minor)\n", task->tk_pid);
+		goto retry;
+	}
+
+	dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
+	task->tk_timeouts++;
+
+	if (RPC_IS_SOFTCONN(task)) {
+		rpc_exit(task, -ETIMEDOUT);
+		return;
+	}
+	if (RPC_IS_SOFT(task)) {
+		if (clnt->cl_chatty)
+			printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
+				clnt->cl_protname, clnt->cl_server);
+		if (task->tk_flags & RPC_TASK_TIMEOUT)
+			rpc_exit(task, -ETIMEDOUT);
+		else
+			rpc_exit(task, -EIO);
+		return;
+	}
+
+	if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) {
+		task->tk_flags |= RPC_CALL_MAJORSEEN;
+		if (clnt->cl_chatty)
+			printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
+			clnt->cl_protname, clnt->cl_server);
+	}
+	rpc_force_rebind(clnt);
+	/*
+	 * Did our request time out due to an RPCSEC_GSS out-of-sequence
+	 * event? RFC2203 requires the server to drop all such requests.
+	 */
+	rpcauth_invalcred(task);
+
+retry:
+	clnt->cl_stats->rpcretrans++;
+	task->tk_action = call_bind;
+	task->tk_status = 0;
+}
+
+/*
+ * 7.	Decode the RPC reply
+ *
+ *	Logs "server OK" and clears RPC_CALL_MAJORSEEN if a major
+ *	timeout had been reported.  A reply shorter than 12 bytes is
+ *	retried: through call_bind (counted in rpcretrans) unless the
+ *	task is soft, through call_timeout if it is.  Otherwise the
+ *	header is checked by rpc_verify_header(); on success the next
+ *	state is rpc_exit_task and, if the procedure has a decoder,
+ *	tk_status receives the result of rpcauth_unwrap_resp().
+ *
+ *	On the retry path tk_status is cleared and, provided the task
+ *	still owns the same request slot, the received-length fields
+ *	are reset and the transport is disconnected when cl_discrtry is
+ *	set.
+ */
+static void
+call_decode(struct rpc_task *task)
+{
+	struct rpc_clnt	*clnt = task->tk_client;
+	struct rpc_rqst	*req = task->tk_rqstp;
+	kxdrdproc_t	decode = task->tk_msg.rpc_proc->p_decode;
+	__be32		*p;
+
+	dprintk("RPC: %5u call_decode (status %d)\n",
+			task->tk_pid, task->tk_status);
+
+	if (task->tk_flags & RPC_CALL_MAJORSEEN) {
+		if (clnt->cl_chatty)
+			printk(KERN_NOTICE "%s: server %s OK\n",
+				clnt->cl_protname, clnt->cl_server);
+		task->tk_flags &= ~RPC_CALL_MAJORSEEN;
+	}
+
+	/*
+	 * Ensure that we see all writes made by xprt_complete_rqst()
+	 * before it changed req->rq_reply_bytes_recvd.
+	 */
+	smp_rmb();
+	req->rq_rcv_buf.len = req->rq_private_buf.len;
+
+	/* Check that the softirq receive buffer is valid */
+	WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
+				sizeof(req->rq_rcv_buf)) != 0);
+
+	if (req->rq_rcv_buf.len < 12) {
+		if (!RPC_IS_SOFT(task)) {
+			task->tk_action = call_bind;
+			clnt->cl_stats->rpcretrans++;
+			goto out_retry;
+		}
+		dprintk("RPC:       %s: too small RPC reply size (%d bytes)\n",
+				clnt->cl_protname, task->tk_status);
+		task->tk_action = call_timeout;
+		goto out_retry;
+	}
+
+	p = rpc_verify_header(task);
+	if (IS_ERR(p)) {
+		if (p == ERR_PTR(-EAGAIN))
+			goto out_retry;
+		return;
+	}
+
+	task->tk_action = rpc_exit_task;
+
+	if (decode) {
+		task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
+						      task->tk_msg.rpc_resp);
+	}
+	dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
+			task->tk_status);
+	return;
+out_retry:
+	task->tk_status = 0;
+	/* Note: rpc_verify_header() may have freed the RPC slot */
+	if (task->tk_rqstp == req) {
+		req->rq_reply_bytes_recvd = req->rq_rcv_buf.len = 0;
+		if (task->tk_client->cl_discrtry)
+			xprt_conditional_disconnect(task->tk_xprt,
+					req->rq_connect_cookie);
+	}
+}
+
+/*
+ * Write the fixed part of an RPC call header (XID, message type, RPC
+ * version, program, program version, procedure) into the first send
+ * kvec, let rpcauth_marshcred() append the credential, and record the
+ * resulting send length in rq_slen.
+ *
+ * Returns the pointer handed back by rpcauth_marshcred().
+ */
+static __be32 *
+rpc_encode_header(struct rpc_task *task)
+{
+	struct rpc_rqst	*rqst = task->tk_rqstp;
+	struct rpc_clnt	*client = task->tk_client;
+	__be32		*hdr;
+
+	/* FIXME: check buffer size? */
+
+	hdr = xprt_skip_transport_header(task->tk_xprt,
+					 rqst->rq_svec[0].iov_base);
+	hdr[0] = rqst->rq_xid;				/* XID */
+	hdr[1] = htonl(RPC_CALL);			/* CALL */
+	hdr[2] = htonl(RPC_VERSION);			/* RPC version */
+	hdr[3] = htonl(client->cl_prog);		/* program number */
+	hdr[4] = htonl(client->cl_vers);		/* program version */
+	hdr[5] = htonl(task->tk_msg.rpc_proc->p_proc);	/* procedure */
+	hdr = rpcauth_marshcred(task, hdr + 6);
+	rqst->rq_slen = xdr_adjust_iovec(&rqst->rq_svec[0], hdr);
+	return hdr;
+}
+/*
+ * Check the header of a received RPC reply.
+ *
+ * Returns a pointer to the word following the accept status when the
+ * reply is an accepted RPC_SUCCESS.  Returns ERR_PTR(-EAGAIN) when the
+ * call should be retried; in that case tk_action has already been set
+ * to the state to resume from (call_reserve after releasing the slot
+ * for stale credentials, call_bind otherwise).  In every other case
+ * the task is terminated via rpc_exit() and ERR_PTR(error) is returned.
+ */
+static __be32 *
+rpc_verify_header(struct rpc_task *task)
+{
+	struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0];
+	int len = task->tk_rqstp->rq_rcv_buf.len >> 2;	/* length in 32-bit words */
+	__be32	*p = iov->iov_base;
+	u32 n;
+	int error = -EACCES;	/* used for rejections that set nothing else */
+
+	if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) {
+		/* RFC-1014 says that the representation of XDR data must be a
+		 * multiple of four bytes
+		 * - if it isn't pointer subtraction in the NFS client may give
+		 *   undefined results
+		 */
+		dprintk("RPC: %5u %s: XDR representation not a multiple of"
+		       " 4 bytes: 0x%x\n", task->tk_pid, __func__,
+		       task->tk_rqstp->rq_rcv_buf.len);
+		goto out_eio;
+	}
+	if ((len -= 3) < 0)	/* XID, message type and reply status */
+		goto out_overflow;
+
+	p += 1; /* skip XID */
+	if ((n = ntohl(*p++)) != RPC_REPLY) {
+		dprintk("RPC: %5u %s: not an RPC reply: %x\n",
+			task->tk_pid, __func__, n);
+		goto out_garbage;
+	}
+
+	if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) {
+		if (--len < 0)
+			goto out_overflow;
+		switch ((n = ntohl(*p++))) {
+			case RPC_AUTH_ERROR:
+				break;
+			case RPC_MISMATCH:
+				dprintk("RPC: %5u %s: RPC call version "
+						"mismatch!\n",
+						task->tk_pid, __func__);
+				error = -EPROTONOSUPPORT;
+				goto out_err;
+			default:
+				dprintk("RPC: %5u %s: RPC call rejected, "
+						"unknown error: %x\n",
+						task->tk_pid, __func__, n);
+				goto out_eio;
+		}
+		if (--len < 0)
+			goto out_overflow;
+		switch ((n = ntohl(*p++))) {
+		case RPC_AUTH_REJECTEDCRED:
+		case RPC_AUTH_REJECTEDVERF:
+		case RPCSEC_GSS_CREDPROBLEM:
+		case RPCSEC_GSS_CTXPROBLEM:
+			if (!task->tk_cred_retry)
+				break;
+			task->tk_cred_retry--;
+			dprintk("RPC: %5u %s: retry stale creds\n",
+					task->tk_pid, __func__);
+			rpcauth_invalcred(task);
+			/* Ensure we obtain a new XID! */
+			xprt_release(task);
+			task->tk_action = call_reserve;
+			goto out_retry;
+		case RPC_AUTH_BADCRED:
+		case RPC_AUTH_BADVERF:
+			/* possibly garbled cred/verf? */
+			if (!task->tk_garb_retry)
+				break;
+			task->tk_garb_retry--;
+			dprintk("RPC: %5u %s: retry garbled creds\n",
+					task->tk_pid, __func__);
+			task->tk_action = call_bind;
+			goto out_retry;
+		case RPC_AUTH_TOOWEAK:
+			printk(KERN_NOTICE "RPC: server %s requires stronger "
+			       "authentication.\n", task->tk_client->cl_server);
+			break;
+		default:
+			dprintk("RPC: %5u %s: unknown auth error: %x\n",
+					task->tk_pid, __func__, n);
+			error = -EIO;
+		}
+		dprintk("RPC: %5u %s: call rejected %d\n",
+				task->tk_pid, __func__, n);
+		goto out_err;
+	}
+	if (!(p = rpcauth_checkverf(task, p))) {
+		dprintk("RPC: %5u %s: auth check failed\n",
+				task->tk_pid, __func__);
+		goto out_garbage;		/* bad verifier, retry */
+	}
+	/* NOTE(review): counts words consumed so far, not words remaining - confirm intent */
+	len = p - (__be32 *)iov->iov_base - 1;
+	if (len < 0)
+		goto out_overflow;
+	switch ((n = ntohl(*p++))) {
+	case RPC_SUCCESS:
+		return p;
+	case RPC_PROG_UNAVAIL:
+		dprintk("RPC: %5u %s: program %u is unsupported by server %s\n",
+				task->tk_pid, __func__,
+				(unsigned int)task->tk_client->cl_prog,
+				task->tk_client->cl_server);
+		error = -EPFNOSUPPORT;
+		goto out_err;
+	case RPC_PROG_MISMATCH:
+		dprintk("RPC: %5u %s: program %u, version %u unsupported by "
+				"server %s\n", task->tk_pid, __func__,
+				(unsigned int)task->tk_client->cl_prog,
+				(unsigned int)task->tk_client->cl_vers,
+				task->tk_client->cl_server);
+		error = -EPROTONOSUPPORT;
+		goto out_err;
+	case RPC_PROC_UNAVAIL:
+		dprintk("RPC: %5u %s: proc %s unsupported by program %u, "
+				"version %u on server %s\n",
+				task->tk_pid, __func__,
+				rpc_proc_name(task),
+				task->tk_client->cl_prog,
+				task->tk_client->cl_vers,
+				task->tk_client->cl_server);
+		error = -EOPNOTSUPP;
+		goto out_err;
+	case RPC_GARBAGE_ARGS:
+		dprintk("RPC: %5u %s: server saw garbage\n",
+				task->tk_pid, __func__);
+		break;			/* retry */
+	default:
+		dprintk("RPC: %5u %s: server accept status: %x\n",
+				task->tk_pid, __func__, n);
+		/* Also retry */
+	}
+
+out_garbage:
+	task->tk_client->cl_stats->rpcgarbage++;
+	if (task->tk_garb_retry) {
+		task->tk_garb_retry--;
+		dprintk("RPC: %5u %s: retrying\n",
+				task->tk_pid, __func__);
+		task->tk_action = call_bind;
+out_retry:	/* also entered directly by the credential retry cases above */
+		return ERR_PTR(-EAGAIN);
+	}
+out_eio:
+	error = -EIO;
+out_err:
+	rpc_exit(task, error);
+	dprintk("RPC: %5u %s: call failed with error %d\n", task->tk_pid,
+			__func__, error);
+	return ERR_PTR(error);
+out_overflow:
+	dprintk("RPC: %5u %s: server reply was truncated.\n", task->tk_pid,
+			__func__);
+	goto out_garbage;
+}
+/*
+ * Argument encoder installed in rpcproc_null: writes nothing.
+ */
+static void rpcproc_encode_null(void *rqstp, struct xdr_stream *xdr, void *obj)
+{
+}
+/*
+ * Result decoder installed in rpcproc_null: reads nothing, returns 0.
+ */
+static int rpcproc_decode_null(void *rqstp, struct xdr_stream *xdr, void *obj)
+{
+	return 0;
+}
+/*
+ * Procedure descriptor used by rpc_ping() and rpc_call_null(); only the
+ * encode/decode hooks are set, every other field is zero-initialised.
+ */
+static struct rpc_procinfo rpcproc_null = {
+	.p_encode = rpcproc_encode_null,
+	.p_decode = rpcproc_decode_null,
+};
+
+/*
+ * Issue a synchronous call of rpcproc_null on @clnt using a credential
+ * obtained from authnull_ops, with RPC_TASK_SOFT | RPC_TASK_SOFTCONN.
+ * The credential reference is dropped again before returning.
+ *
+ * Returns the result of rpc_call_sync().
+ */
+static int rpc_ping(struct rpc_clnt *clnt)
+{
+	struct rpc_cred *null_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	struct rpc_message ping = {
+		.rpc_proc = &rpcproc_null,
+		.rpc_cred = null_cred,
+	};
+	int status;
+
+	status = rpc_call_sync(clnt, &ping, RPC_TASK_SOFT | RPC_TASK_SOFTCONN);
+	put_rpccred(null_cred);
+	return status;
+}
+
+/*
+ * Start a task that calls rpcproc_null on @clnt with credential @cred
+ * and task flags @flags, using rpc_default_ops as callbacks.
+ *
+ * Returns whatever rpc_run_task() returns.
+ */
+struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags)
+{
+	struct rpc_message null_msg = {
+		.rpc_cred = cred,
+		.rpc_proc = &rpcproc_null,
+	};
+	struct rpc_task_setup setup = {
+		.flags = flags,
+		.callback_ops = &rpc_default_ops,
+		.rpc_message = &null_msg,
+		.rpc_client = clnt,
+	};
+
+	return rpc_run_task(&setup);
+}
+EXPORT_SYMBOL_GPL(rpc_call_null);
+
+#ifdef RPC_DEBUG
+/* Print the column heading that precedes the rpc_show_task() lines. */
+static void rpc_show_header(void)
+{
+	printk(KERN_INFO
+		"-pid- flgs status -client- --rqstp- -timeout ---ops--\n");
+}
+
+/*
+ * Print one line describing @task of @clnt: pid, flags, status, the
+ * client and request pointers, timeout, ops, protocol name/version,
+ * procedure name, current action and the wait queue name ("none" when
+ * the task is not queued).
+ */
+static void rpc_show_task(const struct rpc_clnt *clnt,
+			  const struct rpc_task *task)
+{
+	const char *queue_name;
+
+	queue_name = RPC_IS_QUEUED(task) ? rpc_qname(task->tk_waitqueue)
+					 : "none";
+
+	printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n",
+		task->tk_pid, task->tk_flags, task->tk_status,
+		clnt, task->tk_rqstp, task->tk_timeout, task->tk_ops,
+		clnt->cl_protname, clnt->cl_vers, rpc_proc_name(task),
+		task->tk_action, queue_name);
+}
+
+/*
+ * Dump every task of every client on all_clients.  The heading is
+ * printed once, just before the first task line.  rpc_client_lock is
+ * held for the whole walk and each client's cl_lock while its task
+ * list is traversed.
+ */
+void rpc_show_tasks(void)
+{
+	struct rpc_clnt *client;
+	struct rpc_task *cur;
+	int need_header = 1;
+
+	spin_lock(&rpc_client_lock);
+	list_for_each_entry(client, &all_clients, cl_clients) {
+		spin_lock(&client->cl_lock);
+		list_for_each_entry(cur, &client->cl_tasks, tk_task) {
+			if (need_header) {
+				rpc_show_header();
+				need_header = 0;
+			}
+			rpc_show_task(client, cur);
+		}
+		spin_unlock(&client->cl_lock);
+	}
+	spin_unlock(&rpc_client_lock);
+}
+#endif
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
new file mode 100644
index 00000000..d013bf21
--- /dev/null
+++ b/net/sunrpc/netns.h
@@ -0,0 +1,19 @@
+#ifndef __SUNRPC_NETNS_H__
+#define __SUNRPC_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct cache_detail;
+
+struct sunrpc_net {
+	struct proc_dir_entry *proc_net_rpc;	/* presumably this namespace's "rpc" procfs directory - confirm */
+	struct cache_detail *ip_map_cache;	/* presumably managed by ip_map_cache_create()/_destroy() - confirm */
+};
+
+extern int sunrpc_net_id;
+
+int ip_map_cache_create(struct net *);
+void ip_map_cache_destroy(struct net *);
+
+#endif
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
new file mode 100644
index 00000000..72bc5368
--- /dev/null
+++ b/net/sunrpc/rpc_pipe.c
@@ -0,0 +1,1086 @@
+/*
+ * net/sunrpc/rpc_pipe.c
+ *
+ * Userland/kernel interface for rpcauth_gss.
+ * Code shamelessly plagiarized from fs/nfsd/nfsctl.c
+ * and fs/sysfs/inode.c
+ *
+ * Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include <linux/kernel.h>
+
+#include <asm/ioctls.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/seq_file.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/cache.h>
+
+static struct vfsmount *rpc_mnt __read_mostly;
+static int rpc_mount_count;
+
+static struct file_system_type rpc_pipe_fs_type;
+
+
+static struct kmem_cache *rpc_inode_cachep __read_mostly;
+
+#define RPC_UPCALL_TIMEOUT (30*HZ)
+
+/*
+ * Drain @head: each message is unlinked, stamped with @err and passed
+ * to @destroy_msg.  If the list held at least one message, sleepers on
+ * the pipe's wait queue are woken afterwards; an empty list is a no-op.
+ */
+static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
+		void (*destroy_msg)(struct rpc_pipe_msg *), int err)
+{
+	struct rpc_pipe_msg *cur;
+
+	if (list_empty(head))
+		return;
+	while (!list_empty(head)) {
+		cur = list_entry(head->next, struct rpc_pipe_msg, list);
+		list_del_init(&cur->list);
+		cur->errno = err;
+		destroy_msg(cur);
+	}
+	wake_up(&rpci->waitq);
+}
+
+/*
+ * Delayed-work handler for a pipe's queue_timeout.  If the pipe still
+ * has ops and nobody has it open for reading, every queued upcall is
+ * taken off the pipe under i_lock and then destroyed with -ETIMEDOUT
+ * outside the lock.  Does nothing once the ops have been cleared.
+ */
+static void
+rpc_timeout_upcall_queue(struct work_struct *work)
+{
+	struct rpc_inode *rpci =
+		container_of(work, struct rpc_inode, queue_timeout.work);
+	struct inode *inode = &rpci->vfs_inode;
+	void (*destroy_msg)(struct rpc_pipe_msg *) = NULL;
+	LIST_HEAD(expired);
+	int pipe_alive;
+
+	spin_lock(&inode->i_lock);
+	pipe_alive = rpci->ops != NULL;
+	if (pipe_alive) {
+		destroy_msg = rpci->ops->destroy_msg;
+		if (rpci->nreaders == 0) {
+			list_splice_init(&rpci->pipe, &expired);
+			rpci->pipelen = 0;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (pipe_alive)
+		rpc_purge_list(rpci, &expired, destroy_msg, -ETIMEDOUT);
+}
+
+/**
+ * rpc_queue_upcall - queue an upcall message to userspace
+ * @inode: inode of upcall pipe on which to queue given message
+ * @msg: message to queue
+ *
+ * Call with an @inode created by rpc_mkpipe() to queue an upcall.
+ * A userspace process may then later read the upcall by performing a
+ * read on an open file for this inode.  It is up to the caller to
+ * initialize the fields of @msg (other than @msg->list) appropriately.
+ *
+ * The message is accepted when the pipe has a reader, or when it has
+ * none but RPC_PIPE_WAIT_FOR_OPEN is set; in the latter case the
+ * queue timeout is armed if the pipe was empty.  Waiters on the pipe
+ * are woken in every case.
+ *
+ * Returns 0 if the message was queued, -EPIPE otherwise.
+ */
+int
+rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	int status = -EPIPE;
+
+	spin_lock(&inode->i_lock);
+	if (rpci->ops != NULL &&
+	    (rpci->nreaders || (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN))) {
+		/* no reader yet: start the timeout with the first message */
+		if (!rpci->nreaders && list_empty(&rpci->pipe))
+			queue_delayed_work(rpciod_workqueue,
+					&rpci->queue_timeout,
+					RPC_UPCALL_TIMEOUT);
+		list_add_tail(&msg->list, &rpci->pipe);
+		rpci->pipelen += msg->len;
+		status = 0;
+	}
+	spin_unlock(&inode->i_lock);
+	wake_up(&rpci->waitq);
+	return status;
+}
+EXPORT_SYMBOL_GPL(rpc_queue_upcall);
+
+/* Store @private in the private field of the rpc_inode wrapping @inode. */
+static inline void
+rpc_inode_setowner(struct inode *inode, void *private)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+
+	rpci->private = private;
+}
+/*
+ * Shut down the pipe behind @inode.  Runs under i_mutex.  If the pipe
+ * still has ops: under i_lock, note whether anyone had it open, zero
+ * the reader count, move all in-upcall and queued messages to a local
+ * list, zero pipelen and clear ops.  The collected messages are then
+ * destroyed with -EPIPE, the writer count is zeroed, release_pipe is
+ * called if the pipe had been open and the op exists, and the queue
+ * timeout work is cancelled synchronously.  The owner pointer is
+ * cleared in all cases.
+ */
+static void
+rpc_close_pipes(struct inode *inode)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	const struct rpc_pipe_ops *ops;
+	int need_release;
+
+	mutex_lock(&inode->i_mutex);
+	ops = rpci->ops;
+	if (ops != NULL) {
+		LIST_HEAD(free_list);
+		spin_lock(&inode->i_lock);
+		need_release = rpci->nreaders != 0 || rpci->nwriters != 0;
+		rpci->nreaders = 0;
+		list_splice_init(&rpci->in_upcall, &free_list);
+		list_splice_init(&rpci->pipe, &free_list);
+		rpci->pipelen = 0;
+		rpci->ops = NULL;
+		spin_unlock(&inode->i_lock);
+		rpc_purge_list(rpci, &free_list, ops->destroy_msg, -EPIPE);
+		rpci->nwriters = 0;
+		if (need_release && ops->release_pipe)
+			ops->release_pipe(inode);
+		cancel_delayed_work_sync(&rpci->queue_timeout);
+	}
+	rpc_inode_setowner(inode, NULL);
+	mutex_unlock(&inode->i_mutex);
+}
+
+/*
+ * Allocate an rpc_inode from rpc_inode_cachep and return the VFS inode
+ * embedded in it, or NULL if the allocation fails.
+ */
+static struct inode *
+rpc_alloc_inode(struct super_block *sb)
+{
+	struct rpc_inode *rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+
+	return rpci ? &rpci->vfs_inode : NULL;
+}
+
+/*
+ * RCU callback queued by rpc_destroy_inode(): reinitialise the inode's
+ * dentry list head and return the enclosing rpc_inode to its cache.
+ */
+static void
+rpc_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct rpc_inode *rpci = RPC_I(inode);
+
+	INIT_LIST_HEAD(&inode->i_dentry);
+	kmem_cache_free(rpc_inode_cachep, rpci);
+}
+/*
+ * Free @inode via call_rcu(); the actual release happens in
+ * rpc_i_callback().
+ */
+static void
+rpc_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, rpc_i_callback);
+}
+
+/*
+ * open() for an rpc pipe.  Under i_mutex: fails with -ENXIO if the
+ * pipe has no ops.  On the first open (no readers and no writers) the
+ * optional open_pipe op is called and its non-zero result, if any, is
+ * returned without touching the counters.  Otherwise the reader and/or
+ * writer count is incremented according to the file mode and 0 is
+ * returned.
+ */
+static int
+rpc_pipe_open(struct inode *inode, struct file *filp)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	int status = -ENXIO;
+
+	mutex_lock(&inode->i_mutex);
+	if (rpci->ops != NULL) {
+		int no_users = rpci->nreaders == 0 && rpci->nwriters == 0;
+
+		status = 0;
+		if (no_users && rpci->ops->open_pipe)
+			status = rpci->ops->open_pipe(inode);
+		if (status == 0) {
+			if (filp->f_mode & FMODE_READ)
+				rpci->nreaders++;
+			if (filp->f_mode & FMODE_WRITE)
+				rpci->nwriters++;
+		}
+	}
+	mutex_unlock(&inode->i_mutex);
+	return status;
+}
+/*
+ * release() for an rpc pipe.  Under i_mutex, and only while the pipe
+ * still has ops: a message this file was in the middle of reading is
+ * unlinked and destroyed with -EAGAIN; the writer/reader counts are
+ * dropped according to the file mode; when the last reader goes away
+ * all queued messages are purged with -EAGAIN; and on the last close
+ * the optional release_pipe op is called.  Always returns 0.
+ */
+static int
+rpc_pipe_release(struct inode *inode, struct file *filp)
+{
+	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe_msg *msg;
+	int last_close;
+
+	mutex_lock(&inode->i_mutex);
+	if (rpci->ops == NULL)
+		goto out;
+	msg = filp->private_data;	/* partially read upcall, if any */
+	if (msg != NULL) {
+		spin_lock(&inode->i_lock);
+		msg->errno = -EAGAIN;
+		list_del_init(&msg->list);
+		spin_unlock(&inode->i_lock);
+		rpci->ops->destroy_msg(msg);
+	}
+	if (filp->f_mode & FMODE_WRITE)
+		rpci->nwriters --;
+	if (filp->f_mode & FMODE_READ) {
+		rpci->nreaders --;
+		if (rpci->nreaders == 0) {
+			LIST_HEAD(free_list);
+			spin_lock(&inode->i_lock);
+			list_splice_init(&rpci->pipe, &free_list);
+			rpci->pipelen = 0;
+			spin_unlock(&inode->i_lock);
+			rpc_purge_list(rpci, &free_list,
+					rpci->ops->destroy_msg, -EAGAIN);
+		}
+	}
+	last_close = rpci->nwriters == 0 && rpci->nreaders == 0;
+	if (last_close && rpci->ops->release_pipe)
+		rpci->ops->release_pipe(inode);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+/*
+ * read() for an rpc pipe.  Under i_mutex: returns -EPIPE if the pipe
+ * has no ops.  If this file has no message in progress, the first
+ * queued message (if any) is moved to the in_upcall list, its length
+ * is subtracted from pipelen and it becomes the file's current
+ * message; with nothing queued the read returns 0.  The upcall op then
+ * copies data to @buf.  Once that op fails or the whole message has
+ * been copied, the message is unlinked and destroyed.
+ *
+ * Returns the result of the upcall op, 0 or -EPIPE as described.
+ */
+static ssize_t
+rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct rpc_inode *rpci = RPC_I(inode);
+	struct rpc_pipe_msg *msg;
+	int res = 0;
+
+	mutex_lock(&inode->i_mutex);
+	if (rpci->ops == NULL) {
+		res = -EPIPE;
+		goto out_unlock;
+	}
+	msg = filp->private_data;
+	if (msg == NULL) {
+		spin_lock(&inode->i_lock);
+		if (!list_empty(&rpci->pipe)) {
+			msg = list_entry(rpci->pipe.next,
+					struct rpc_pipe_msg,
+					list);
+			list_move(&msg->list, &rpci->in_upcall);
+			rpci->pipelen -= msg->len;
+			filp->private_data = msg;
+			msg->copied = 0;
+		}
+		spin_unlock(&inode->i_lock);
+		if (msg == NULL)
+			goto out_unlock;
+	}
+	/* NOTE: it is up to the callback to update msg->copied */
+	res = rpci->ops->upcall(filp, msg, buf, len);
+	if (res < 0 || msg->len == msg->copied) {
+		filp->private_data = NULL;
+		spin_lock(&inode->i_lock);
+		list_del_init(&msg->list);
+		spin_unlock(&inode->i_lock);
+		rpci->ops->destroy_msg(msg);
+	}
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return res;
+}
+
+/*
+ * write() for an rpc pipe: under i_mutex, pass the user buffer to the
+ * pipe's downcall op and return its result, or -EPIPE when the pipe
+ * has no ops.
+ */
+static ssize_t
+rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct rpc_inode *rpci = RPC_I(inode);
+	int status;
+
+	mutex_lock(&inode->i_mutex);
+	if (rpci->ops == NULL)
+		status = -EPIPE;
+	else
+		status = rpci->ops->downcall(filp, buf, len);
+	mutex_unlock(&inode->i_mutex);
+	return status;
+}
+
+/*
+ * poll() for an rpc pipe.  The pipe is always reported writable;
+ * POLLERR | POLLHUP is added once the ops are gone, and readability
+ * is reported when this file has a message in progress or the pipe
+ * queue is not empty.
+ */
+static unsigned int
+rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+	unsigned int events = POLLOUT | POLLWRNORM;
+
+	poll_wait(filp, &rpci->waitq, wait);
+
+	if (rpci->ops == NULL)
+		events |= POLLERR | POLLHUP;
+	if (filp->private_data || !list_empty(&rpci->pipe))
+		events |= POLLIN | POLLRDNORM;
+	return events;
+}
+
+/*
+ * ioctl() for an rpc pipe.  Only FIONREAD is supported: it stores in
+ * the user int at @arg the queued byte count plus the unread remainder
+ * of this file's current message.  Returns the put_user() result,
+ * -EPIPE if the pipe has no ops, or -EINVAL for any other command.
+ */
+static long
+rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct rpc_inode *rpci = RPC_I(inode);
+	int pending;
+
+	if (cmd != FIONREAD)
+		return -EINVAL;
+
+	spin_lock(&inode->i_lock);
+	if (rpci->ops == NULL) {
+		spin_unlock(&inode->i_lock);
+		return -EPIPE;
+	}
+	pending = rpci->pipelen;
+	if (filp->private_data) {
+		struct rpc_pipe_msg *cur = filp->private_data;
+
+		pending += cur->len - cur->copied;
+	}
+	spin_unlock(&inode->i_lock);
+	return put_user(pending, (int __user *)arg);
+}
+/*
+ * File operations backed by the rpc_pipe_* handlers in this file;
+ * seeking is disabled via no_llseek.
+ */
+static const struct file_operations rpc_pipe_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.read		= rpc_pipe_read,
+	.write		= rpc_pipe_write,
+	.poll		= rpc_pipe_poll,
+	.unlocked_ioctl	= rpc_pipe_ioctl,
+	.open		= rpc_pipe_open,
+	.release	= rpc_pipe_release,
+};
+
+/*
+ * seq_file show routine: prints the server name, service name with
+ * program and version numbers, and the peer address, protocol and
+ * port of the rpc_clnt stored in the seq_file's private pointer.
+ */
+static int
+rpc_show_info(struct seq_file *m, void *v)
+{
+	struct rpc_clnt *client = m->private;
+
+	seq_printf(m, "RPC server: %s\n", client->cl_server);
+	seq_printf(m, "service: %s (%d) version %d\n",
+			client->cl_protname, client->cl_prog, client->cl_vers);
+	seq_printf(m, "address: %s\n",
+			rpc_peeraddr2str(client, RPC_DISPLAY_ADDR));
+	seq_printf(m, "protocol: %s\n",
+			rpc_peeraddr2str(client, RPC_DISPLAY_PROTO));
+	seq_printf(m, "port: %s\n",
+			rpc_peeraddr2str(client, RPC_DISPLAY_PORT));
+	return 0;
+}
+
+/*
+ * open() for the info file.  After single_open() succeeds, the owning
+ * rpc_clnt is looked up under the dentry's d_lock (only while the
+ * dentry is still hashed) and pinned with atomic_inc_not_zero() on
+ * cl_count.  On success the client is stored in the seq_file's private
+ * pointer; otherwise the seq_file is released again.
+ *
+ * Returns 0, the single_open() error, or -EINVAL if no client could be
+ * pinned.
+ */
+static int
+rpc_info_open(struct inode *inode, struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct rpc_clnt *client = NULL;
+	struct seq_file *seq;
+	int pinned;
+	int err;
+
+	err = single_open(file, rpc_show_info, NULL);
+	if (err)
+		return err;
+	seq = file->private_data;
+
+	spin_lock(&dentry->d_lock);
+	if (!d_unhashed(dentry))
+		client = RPC_I(inode)->private;
+	pinned = client != NULL && atomic_inc_not_zero(&client->cl_count);
+	spin_unlock(&dentry->d_lock);
+
+	if (!pinned) {
+		single_release(inode, file);
+		return -EINVAL;
+	}
+	seq->private = client;
+	return 0;
+}
+
+/*
+ * release() for the info file: drop the client reference stored in
+ * the seq_file's private pointer, if any, then release the seq_file.
+ */
+static int
+rpc_info_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct rpc_clnt *client = seq->private;
+
+	if (client != NULL)
+		rpc_release_client(client);
+	return single_release(inode, file);
+}
+/*
+ * File operations for the info file: seq_file based, with open and
+ * release wrapped by rpc_info_open() and rpc_info_release().
+ */
+static const struct file_operations rpc_info_operations = {
+	.owner		= THIS_MODULE,
+	.open		= rpc_info_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= rpc_info_release,
+};
+
+
+/*
+ * Description of fs contents.
+ *
+ * One entry per file or directory handled by rpc_populate() and
+ * __rpc_depopulate().
+ */
+struct rpc_filelist {
+	const char *name;	/* entry name used for lookup/creation */
+	const struct file_operations *i_fop;	/* passed on for regular files only */
+	umode_t mode;		/* S_IFREG or S_IFDIR plus permission bits */
+};
+
+/*
+ * Pin the rpc_pipe filesystem via simple_pin_fs().
+ *
+ * Returns the mount on success or ERR_PTR() of the simple_pin_fs()
+ * error.
+ */
+struct vfsmount *rpc_get_mount(void)
+{
+	int status = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mnt,
+				   &rpc_mount_count);
+
+	return status != 0 ? ERR_PTR(status) : rpc_mnt;
+}
+EXPORT_SYMBOL_GPL(rpc_get_mount);
+/*
+ * Counterpart of rpc_get_mount(): release the pin through
+ * simple_release_fs().
+ */
+void rpc_put_mount(void)
+{
+	simple_release_fs(&rpc_mnt, &rpc_mount_count);
+}
+EXPORT_SYMBOL_GPL(rpc_put_mount);
+/*
+ * d_delete hook: always returns 1.
+ * NOTE(review): presumably this tells the dcache not to keep unused
+ * dentries cached - confirm against the d_delete contract.
+ */
+static int rpc_delete_dentry(const struct dentry *dentry)
+{
+	return 1;
+}
+/*
+ * Dentry operations installed by __rpc_lookup_create() on dentries
+ * that have no inode yet.
+ */
+static const struct dentry_operations rpc_dentry_operations = {
+	.d_delete = rpc_delete_dentry,
+};
+
+/*
+ * Allocate a new inode on @sb with a fresh inode number, the given
+ * @mode and all three timestamps set to the current time.  Directory
+ * inodes additionally get the simple_dir operations and one extra
+ * link.  Returns NULL if new_inode() fails.
+ */
+static struct inode *
+rpc_get_inode(struct super_block *sb, umode_t mode)
+{
+	struct inode *result = new_inode(sb);
+
+	if (result == NULL)
+		return NULL;
+
+	result->i_ino = get_next_ino();
+	result->i_mode = mode;
+	result->i_atime = result->i_mtime = result->i_ctime = CURRENT_TIME;
+	if ((mode & S_IFMT) == S_IFDIR) {
+		result->i_fop = &simple_dir_operations;
+		result->i_op = &simple_dir_inode_operations;
+		inc_nlink(result);
+	}
+	return result;
+}
+
+/*
+ * Shared creation step for files, directories and pipes: unhash
+ * @dentry, allocate an inode of type @mode, give it an inode number
+ * from iunique(), install @i_fop and @private when they are non-NULL,
+ * and attach the inode to @dentry.
+ *
+ * Returns 0 on success.  If no inode can be allocated, a warning is
+ * logged, the dentry reference is dropped and -ENOMEM is returned.
+ */
+static int __rpc_create_common(struct inode *dir, struct dentry *dentry,
+			       umode_t mode,
+			       const struct file_operations *i_fop,
+			       void *private)
+{
+	struct inode *inode;
+
+	d_drop(dentry);
+	inode = rpc_get_inode(dir->i_sb, mode);
+	if (inode == NULL) {
+		printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n",
+				__FILE__, __func__, dentry->d_name.name);
+		dput(dentry);
+		return -ENOMEM;
+	}
+
+	inode->i_ino = iunique(dir->i_sb, 100);
+	if (i_fop != NULL)
+		inode->i_fop = i_fop;
+	if (private != NULL)
+		rpc_inode_setowner(inode, private);
+	d_add(dentry, inode);
+	return 0;
+}
+
+/*
+ * Create a regular file for @dentry in @dir and, on success, send the
+ * fsnotify create event.  Returns 0 or the __rpc_create_common() error.
+ */
+static int __rpc_create(struct inode *dir, struct dentry *dentry,
+			umode_t mode,
+			const struct file_operations *i_fop,
+			void *private)
+{
+	int status = __rpc_create_common(dir, dentry, S_IFREG | mode,
+					 i_fop, private);
+
+	if (status == 0)
+		fsnotify_create(dir, dentry);
+	return status;
+}
+
+/*
+ * Create a directory for @dentry in @dir.  On success the parent's
+ * link count is bumped and the fsnotify mkdir event is sent.  Returns
+ * 0 or the __rpc_create_common() error.
+ */
+static int __rpc_mkdir(struct inode *dir, struct dentry *dentry,
+		       umode_t mode,
+		       const struct file_operations *i_fop,
+		       void *private)
+{
+	int status = __rpc_create_common(dir, dentry, S_IFDIR | mode,
+					 i_fop, private);
+
+	if (status == 0) {
+		inc_nlink(dir);
+		fsnotify_mkdir(dir, dentry);
+	}
+	return status;
+}
+
+/*
+ * Create a FIFO inode for @dentry in @dir and initialise the pipe
+ * state: one kernel reader/writer, the owner pointer, @flags and
+ * @ops.  The fsnotify create event is sent on success.  Returns 0 or
+ * the __rpc_create_common() error.
+ */
+static int __rpc_mkpipe(struct inode *dir, struct dentry *dentry,
+			umode_t mode,
+			const struct file_operations *i_fop,
+			void *private,
+			const struct rpc_pipe_ops *ops,
+			int flags)
+{
+	struct rpc_inode *pipe;
+	int status;
+
+	status = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop,
+				     private);
+	if (status != 0)
+		return status;
+
+	pipe = RPC_I(dentry->d_inode);
+	pipe->nkern_readwriters = 1;
+	pipe->private = private;
+	pipe->flags = flags;
+	pipe->ops = ops;
+	fsnotify_create(dir, dentry);
+	return 0;
+}
+
+/*
+ * Remove the directory @dentry from @dir: simple_rmdir() followed by
+ * d_delete(), with an extra dentry reference held across both calls.
+ * Returns the simple_rmdir() result.
+ */
+static int __rpc_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int status;
+
+	dget(dentry);		/* keep the dentry alive during removal */
+	status = simple_rmdir(dir, dentry);
+	d_delete(dentry);
+	dput(dentry);
+
+	return status;
+}
+
+/*
+ * Remove the file @dentry from @dir: simple_unlink() followed by
+ * d_delete(), with an extra dentry reference held across both calls.
+ * Returns the simple_unlink() result.
+ */
+static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int status;
+
+	dget(dentry);		/* keep the dentry alive during removal */
+	status = simple_unlink(dir, dentry);
+	d_delete(dentry);
+	dput(dentry);
+
+	return status;
+}
+
+/*
+ * Drop one kernel reader/writer reference on the pipe.  While other
+ * references remain nothing else happens and 0 is returned; when the
+ * last one goes the pipe is closed and unlinked, and the result of
+ * __rpc_unlink() is returned.
+ */
+static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct rpc_inode *pipe = RPC_I(inode);
+
+	if (--pipe->nkern_readwriters != 0)
+		return 0;
+
+	rpc_close_pipes(inode);
+	return __rpc_unlink(dir, dentry);
+}
+
+/*
+ * Return the dentry called @name under @parent, allocating a new one
+ * if the lookup finds nothing.  A dentry without an inode gets
+ * rpc_dentry_operations installed.  Returns ERR_PTR(-ENOMEM) if the
+ * allocation fails.
+ */
+static struct dentry *__rpc_lookup_create(struct dentry *parent,
+					  struct qstr *name)
+{
+	struct dentry *found = d_lookup(parent, name);
+
+	if (found == NULL) {
+		found = d_alloc(parent, name);
+		if (found == NULL)
+			return ERR_PTR(-ENOMEM);
+	}
+	if (found->d_inode == NULL)
+		d_set_d_op(found, &rpc_dentry_operations);
+	return found;
+}
+
+/*
+ * Like __rpc_lookup_create(), but an entry that already has an inode
+ * is refused: its reference is dropped and ERR_PTR(-EEXIST) returned.
+ * Errors from __rpc_lookup_create() are passed through unchanged.
+ */
+static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
+					  struct qstr *name)
+{
+	struct dentry *candidate = __rpc_lookup_create(parent, name);
+
+	if (IS_ERR(candidate) || candidate->d_inode == NULL)
+		return candidate;
+
+	dput(candidate);
+	return ERR_PTR(-EEXIST);
+}
+
+/*
+ * FIXME: This probably has races.
+ *
+ * Remove the entries files[start] .. files[eof - 1] from @parent.
+ * Names that are not found are skipped, and dentries without an inode
+ * are only released.  Regular files are unlinked and directories are
+ * removed; any other inode type hits BUG().  This function takes no
+ * lock itself (rpc_depopulate() and rpc_populate() call it with the
+ * directory's i_mutex held).
+ */
+static void __rpc_depopulate(struct dentry *parent,
+			     const struct rpc_filelist *files,
+			     int start, int eof)
+{
+	struct inode *dir = parent->d_inode;
+	struct dentry *dentry;
+	struct qstr name;
+	int i;
+
+	for (i = start; i < eof; i++) {
+		name.name = files[i].name;
+		name.len = strlen(files[i].name);
+		name.hash = full_name_hash(name.name, name.len);
+		dentry = d_lookup(parent, &name);
+
+		if (dentry == NULL)
+			continue;
+		if (dentry->d_inode == NULL)
+			goto next;
+		switch (dentry->d_inode->i_mode & S_IFMT) {
+			default:
+				BUG();
+			case S_IFREG:
+				__rpc_unlink(dir, dentry);
+				break;
+			case S_IFDIR:
+				__rpc_rmdir(dir, dentry);
+		}
+next:
+		dput(dentry);	/* drop the d_lookup() reference */
+	}
+}
+
+/*
+ * Locked wrapper around __rpc_depopulate(): takes the directory's
+ * i_mutex (I_MUTEX_CHILD nesting class) for the duration of the call.
+ */
+static void rpc_depopulate(struct dentry *parent,
+			   const struct rpc_filelist *files,
+			   int start, int eof)
+{
+	struct inode *dir_inode = parent->d_inode;
+
+	mutex_lock_nested(&dir_inode->i_mutex, I_MUTEX_CHILD);
+	__rpc_depopulate(parent, files, start, eof);
+	mutex_unlock(&dir_inode->i_mutex);
+}
+
+/*
+ * Create files[start..eof) under @parent, passing @private to each creation
+ * helper.  On any failure the whole range is depopulated again.
+ */
+static int rpc_populate(struct dentry *parent,
+			const struct rpc_filelist *files,
+			int start, int eof, void *private)
+{
+	struct inode *dir = parent->d_inode;
+	int idx, err;
+
+	mutex_lock(&dir->i_mutex);
+	for (idx = start; idx < eof; idx++) {
+		const struct rpc_filelist *f = &files[idx];
+		struct dentry *child;
+		struct qstr q;
+
+		q.name = f->name;
+		q.len = strlen(f->name);
+		q.hash = full_name_hash(q.name, q.len);
+		child = __rpc_lookup_create_exclusive(parent, &q);
+		err = PTR_ERR(child);
+		if (IS_ERR(child))
+			goto out_bad;
+		switch (f->mode & S_IFMT) {
+		case S_IFREG:
+			err = __rpc_create(dir, child, f->mode, f->i_fop,
+					   private);
+			break;
+		case S_IFDIR:
+			err = __rpc_mkdir(dir, child, f->mode, NULL, private);
+			break;
+		default:
+			BUG();
+		}
+		if (err != 0)
+			goto out_bad;
+	}
+	mutex_unlock(&dir->i_mutex);
+	return 0;
+out_bad:
+	__rpc_depopulate(parent, files, start, eof);
+	mutex_unlock(&dir->i_mutex);
+	printk(KERN_WARNING "%s: %s failed to populate directory %s\n",
+			__FILE__, __func__, parent->d_name.name);
+	return err;
+}
+
+/*
+ * Create directory @name under @parent and, if @populate is non-NULL, call
+ * it on the new directory.  If @populate fails the directory is removed.
+ * Returns the new dentry, or an ERR_PTR() on failure.
+ */
+static struct dentry *rpc_mkdir_populate(struct dentry *parent,
+		struct qstr *name, umode_t mode, void *private,
+		int (*populate)(struct dentry *, void *), void *args_populate)
+{
+	struct inode *dir = parent->d_inode;
+	struct dentry *child;
+	int err;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	child = __rpc_lookup_create_exclusive(parent, name);
+	if (IS_ERR(child))
+		goto out_unlock;
+	err = __rpc_mkdir(dir, child, mode, NULL, private);
+	if (err == 0 && populate != NULL) {
+		err = populate(child, args_populate);
+		if (err != 0)
+			__rpc_rmdir(dir, child);
+	}
+	if (err != 0)
+		child = ERR_PTR(err);
+out_unlock:
+	mutex_unlock(&dir->i_mutex);
+	return child;
+}
+
+/* Run @depopulate (if non-NULL) on @dentry, then remove the directory
+ * @dentry from its parent.  Returns the result of __rpc_rmdir(). */
+static int rpc_rmdir_depopulate(struct dentry *dentry,
+		void (*depopulate)(struct dentry *))
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = parent->d_inode;
+	int err;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (depopulate != NULL)
+		depopulate(dentry);
+	err = __rpc_rmdir(dir, dentry);
+	mutex_unlock(&dir->i_mutex);
+	dput(parent);	/* balance dget_parent() */
+	return err;
+}
+
+/**
+ * rpc_mkpipe - make an rpc_pipefs file for kernel<->userspace communication
+ * @parent: dentry of directory to create new "pipe" in
+ * @name: name of pipe
+ * @private: private data to associate with the pipe, for the caller's use
+ * @ops: operations defining the behavior of the pipe: upcall, downcall,
+ *	release_pipe, open_pipe, and destroy_msg.
+ * @flags: rpc_inode flags
+ *
+ * Data is made available for userspace to read by calls to
+ * rpc_queue_upcall().  The actual reads will result in calls to
+ * @ops->upcall, which will be called with the file pointer,
+ * message, and userspace buffer to copy to.
+ *
+ * Writes can come at any time, and do not necessarily have to be
+ * responses to upcalls.  They will result in calls to @ops->downcall.
+ *
+ * The @private argument passed here will be available to all these methods
+ * from the file pointer, via RPC_I(file->f_dentry->d_inode)->private.
+ */
+struct dentry *rpc_mkpipe(struct dentry *parent, const char *name,
+			  void *private, const struct rpc_pipe_ops *ops,
+			  int flags)
+{
+	struct dentry *dentry;
+	struct inode *dir = parent->d_inode;
+	umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
+	struct qstr q;
+	int err;
+
+	if (ops->upcall == NULL)
+		umode &= ~S_IRUGO;
+	if (ops->downcall == NULL)
+		umode &= ~S_IWUGO;
+
+	q.name = name;
+	q.len = strlen(name);
+	q.hash = full_name_hash(q.name, q.len);
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	dentry = __rpc_lookup_create(parent, &q);
+	if (IS_ERR(dentry))
+		goto out;
+	if (dentry->d_inode) {
+		struct rpc_inode *rpci = RPC_I(dentry->d_inode);
+		/* Existing pipe: share it only if it was set up identically */
+		if (rpci->private != private || rpci->ops != ops ||
+				rpci->flags != flags) {
+			dput(dentry);
+			err = -EBUSY;
+			goto out_err;
+		}
+		rpci->nkern_readwriters++;
+		goto out;
+	}
+
+	err = __rpc_mkpipe(dir, dentry, umode, &rpc_pipe_fops,
+			   private, ops, flags);
+	if (err)
+		goto out_err;
+out:
+	mutex_unlock(&dir->i_mutex);
+	return dentry;
+out_err:
+	dentry = ERR_PTR(err);
+	printk(KERN_WARNING "%s: %s() failed to create pipe %s/%s (errno = %d)\n",
+			__FILE__, __func__, parent->d_name.name, name,
+			err);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(rpc_mkpipe);
+
+/**
+ * rpc_unlink - remove a pipe
+ * @dentry: dentry for the pipe, as returned from rpc_mkpipe
+ *
+ * Drops one kernel reference on the pipe.  Once the last reference is
+ * gone the pipe is closed and unlinked: lookups will no longer find it,
+ * and any attempts to read or write using preexisting opens of the pipe
+ * will return -EPIPE.
+ * Returns zero, or the result of unlinking the pipe.
+ */
+int
+rpc_unlink(struct dentry *dentry)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct inode *dir = parent->d_inode;
+	int err;
+
+	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	err = __rpc_rmpipe(dir, dentry);
+	mutex_unlock(&dir->i_mutex);
+	dput(parent);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rpc_unlink);
+
+enum {
+	RPCAUTH_info,
+	RPCAUTH_EOF	/* end marker: one past the last authfiles[] index */
+};
+
+static const struct rpc_filelist authfiles[] = {
+	[RPCAUTH_info] = {
+		.name = "info",
+		.i_fop = &rpc_info_operations,
+		.mode = S_IFREG | S_IRUSR,	/* regular file, owner read-only */
+	},
+};
+
+/* Populate callback for client dirs: creates the authfiles[] entries. */
+static int rpc_clntdir_populate(struct dentry *dentry, void *private)
+{
+	return rpc_populate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF,
+			    private);
+}
+
+static void rpc_clntdir_depopulate(struct dentry *dentry)
+{	/* inverse of rpc_clntdir_populate(): removes the authfiles[] entries */
+	rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF);
+}
+
+/**
+ * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs
+ * @dentry: dentry of the parent directory in which to create the new one
+ * @name: &struct qstr for the name
+ * @rpc_client: rpc client to associate with this directory
+ *
+ * This creates a directory named @name under @dentry, associated with
+ * @rpc_client, which will contain a file named "info" with some basic
+ * information about the client, together with any "pipes" that may
+ * later be created using rpc_mkpipe().
+ */
+struct dentry *rpc_create_client_dir(struct dentry *dentry,
+				   struct qstr *name,
+				   struct rpc_clnt *rpc_client)
+{
+	return rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL,
+			rpc_clntdir_populate, rpc_client);
+}
+
+/**
+ * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
+ * @dentry: directory to remove; its authfiles[] entries are removed first
+ */
+int rpc_remove_client_dir(struct dentry *dentry)
+{
+	return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate);
+}
+
+static const struct rpc_filelist cache_pipefs_files[3] = {
+	[0] = {
+		.name = "channel",
+		.i_fop = &cache_file_operations_pipefs,
+		.mode = S_IFREG|S_IRUSR|S_IWUSR,	/* owner read/write */
+	},
+	[1] = {
+		.name = "content",
+		.i_fop = &content_file_operations_pipefs,
+		.mode = S_IFREG|S_IRUSR,		/* owner read-only */
+	},
+	[2] = {
+		.name = "flush",
+		.i_fop = &cache_flush_operations_pipefs,
+		.mode = S_IFREG|S_IRUSR|S_IWUSR,	/* owner read/write */
+	},
+};
+
+/* Populate callback for cache directories: creates the three files listed
+ * in cache_pipefs_files[], passing @private on to rpc_populate(). */
+static int rpc_cachedir_populate(struct dentry *dentry, void *private)
+{
+	return rpc_populate(dentry, cache_pipefs_files, 0, 3, private);
+}
+
+static void rpc_cachedir_depopulate(struct dentry *dentry)
+{	/* inverse of rpc_cachedir_populate(): removes cache_pipefs_files[] */
+	rpc_depopulate(dentry, cache_pipefs_files, 0, 3);
+}
+
+struct dentry *rpc_create_cache_dir(struct dentry *parent, struct qstr *name,
+				    mode_t umode, struct cache_detail *cd)
+{	/* @cd is handed to rpc_cachedir_populate() as its private data */
+	return rpc_mkdir_populate(parent, name, umode, NULL,
+			rpc_cachedir_populate, cd);
+}
+
+void rpc_remove_cache_dir(struct dentry *dentry)
+{	/* the result of rpc_rmdir_depopulate() is discarded */
+	rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate);
+}
+
+/*
+ * Superblock operations for rpc_pipefs (installed by rpc_fill_super())
+ */
+static const struct super_operations s_ops = {
+	.alloc_inode	= rpc_alloc_inode,
+	.destroy_inode	= rpc_destroy_inode,
+	.statfs		= simple_statfs,
+};
+
+#define RPCAUTH_GSSMAGIC 0x67596969
+
+/*
+ * Directories created in the rpc_pipefs root by rpc_fill_super().
+ */
+enum {
+	RPCAUTH_lockd,
+	RPCAUTH_mount,
+	RPCAUTH_nfs,
+	RPCAUTH_portmap,
+	RPCAUTH_statd,
+	RPCAUTH_nfsd4_cb,
+	RPCAUTH_cache,
+	RPCAUTH_RootEOF	/* end marker, not a directory */
+};
+
+static const struct rpc_filelist files[] = {
+	[RPCAUTH_lockd] = {
+		.name = "lockd",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+	[RPCAUTH_mount] = {
+		.name = "mount",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+	[RPCAUTH_nfs] = {
+		.name = "nfs",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+	[RPCAUTH_portmap] = {
+		.name = "portmap",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+	[RPCAUTH_statd] = {
+		.name = "statd",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+	[RPCAUTH_nfsd4_cb] = {
+		.name = "nfsd4_cb",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+	[RPCAUTH_cache] = {
+		.name = "cache",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
+};
+
+/* Fill in the rpc_pipefs superblock and create the fixed root directories. */
+static int rpc_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root_inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = RPCAUTH_GSSMAGIC;
+	sb->s_op = &s_ops;
+	sb->s_time_gran = 1;
+
+	root_inode = rpc_get_inode(sb, S_IFDIR | 0755);
+	if (root_inode == NULL)
+		return -ENOMEM;
+	sb->s_root = root = d_alloc_root(root_inode);
+	if (root == NULL) {
+		iput(root_inode);
+		return -ENOMEM;
+	}
+	if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL))
+		return -ENOMEM;
+	return 0;
+}
+
+/* Mount callback: hands rpc_fill_super() to mount_single(). */
+static struct dentry *rpc_mount(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, rpc_fill_super);
+}
+
+static struct file_system_type rpc_pipe_fs_type = {
+	.owner		= THIS_MODULE,	/* see register_rpc_pipefs() */
+	.name		= "rpc_pipefs",
+	.mount		= rpc_mount,
+	.kill_sb	= kill_litter_super,
+};
+
+/* Slab constructor passed to kmem_cache_create() for rpc_inode objects. */
+static void init_once(void *foo)
+{
+	struct rpc_inode *rpci = foo;
+
+	inode_init_once(&rpci->vfs_inode);
+	rpci->private = NULL;
+	rpci->ops = NULL;
+	rpci->nreaders = 0;
+	rpci->nwriters = 0;
+	rpci->pipelen = 0;
+	INIT_LIST_HEAD(&rpci->pipe);
+	INIT_LIST_HEAD(&rpci->in_upcall);
+	INIT_LIST_HEAD(&rpci->in_downcall);
+	init_waitqueue_head(&rpci->waitq);
+	INIT_DELAYED_WORK(&rpci->queue_timeout,
+			    rpc_timeout_upcall_queue);
+}
+
+/* Create the rpc_inode slab cache, then register the rpc_pipefs filesystem.
+ * Returns zero, -ENOMEM, or the error from register_filesystem(). */
+int register_rpc_pipefs(void)
+{
+	int err;
+
+	rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
+				sizeof(struct rpc_inode),
+				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+				init_once);
+	if (!rpc_inode_cachep)
+		return -ENOMEM;
+
+	err = register_filesystem(&rpc_pipe_fs_type);
+	if (err)
+		kmem_cache_destroy(rpc_inode_cachep);	/* undo on failure */
+	return err;
+}
+
+void unregister_rpc_pipefs(void)
+{	/* releases what register_rpc_pipefs() set up */
+	kmem_cache_destroy(rpc_inode_cachep);
+	unregister_filesystem(&rpc_pipe_fs_type);
+}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
new file mode 100644
index 00000000..e45d2fbb
--- /dev/null
+++ b/net/sunrpc/rpcb_clnt.c
@@ -0,0 +1,1074 @@
+/*
+ * In-kernel rpcbind client supporting versions 2, 3, and 4 of the rpcbind
+ * protocol
+ *
+ * Based on RFC 1833: "Binding Protocols for ONC RPC Version 2" and
+ * RFC 3530: "Network File System (NFS) version 4 Protocol"
+ *
+ * Original: Gilles Quillard, Bull Open Source, 2005 <gilles.quillard@bull.net>
+ * Updated: Chuck Lever, Oracle Corporation, 2007 <chuck.lever@oracle.com>
+ *
+ * Descended from net/sunrpc/pmap_clnt.c,
+ *  Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <net/ipv6.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_BIND
+#endif
+
+#define RPCBIND_SOCK_PATHNAME	"/var/run/rpcbind.sock"
+
+#define RPCBIND_PROGRAM		(100000u)
+#define RPCBIND_PORT		(111u)
+
+#define RPCBVERS_2		(2u)
+#define RPCBVERS_3		(3u)
+#define RPCBVERS_4		(4u)
+
+enum {	/* procedure numbers, used to index rpcb_procedures*[] */
+	RPCBPROC_NULL,
+	RPCBPROC_SET,
+	RPCBPROC_UNSET,
+	RPCBPROC_GETPORT,
+	RPCBPROC_GETADDR = 3,		/* alias for GETPORT */
+	RPCBPROC_DUMP,
+	RPCBPROC_CALLIT,
+	RPCBPROC_BCAST = 5,		/* alias for CALLIT */
+	RPCBPROC_GETTIME,
+	RPCBPROC_UADDR2TADDR,
+	RPCBPROC_TADDR2UADDR,
+	RPCBPROC_GETVERSADDR,
+	RPCBPROC_INDIRECT,
+	RPCBPROC_GETADDRLIST,
+	RPCBPROC_GETSTAT,
+};
+
+/*
+ * r_owner
+ *
+ * The "owner" is allowed to unset a service in the rpcbind database.
+ *
+ * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a
+ * UID which it maps to a local user name via a password lookup.
+ * In all other cases it is ignored.
+ *
+ * For SET/UNSET requests, user space provides a value, even for
+ * network requests, and GETADDR uses an empty string.  We follow
+ * those precedents here.
+ */
+#define RPCB_OWNER_STRING	"0"
+#define RPCB_MAXOWNERLEN	sizeof(RPCB_OWNER_STRING)
+
+/*
+ * XDR data type sizes
+ */
+#define RPCB_program_sz		(1)
+#define RPCB_version_sz		(1)
+#define RPCB_protocol_sz	(1)
+#define RPCB_port_sz		(1)
+#define RPCB_boolean_sz		(1)
+
+#define RPCB_netid_sz		(1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN))
+#define RPCB_addr_sz		(1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN))
+#define RPCB_ownerstring_sz	(1 + XDR_QUADLEN(RPCB_MAXOWNERLEN))
+
+/*
+ * XDR argument and result sizes
+ */
+#define RPCB_mappingargs_sz	(RPCB_program_sz + RPCB_version_sz + \
+				RPCB_protocol_sz + RPCB_port_sz)
+#define RPCB_getaddrargs_sz	(RPCB_program_sz + RPCB_version_sz + \
+				RPCB_netid_sz + RPCB_addr_sz + \
+				RPCB_ownerstring_sz)
+
+#define RPCB_getportres_sz	RPCB_port_sz
+#define RPCB_setres_sz		RPCB_boolean_sz
+
+/*
+ * Note that RFC 1833 does not put any size restrictions on the
+ * address string returned by the remote rpcbind database.
+ */
+#define RPCB_getaddrres_sz	RPCB_addr_sz
+
+static void			rpcb_getport_done(struct rpc_task *, void *);
+static void			rpcb_map_release(void *data);
+static struct rpc_program	rpcb_program;
+
+static struct rpc_clnt *	rpcb_local_clnt;
+static struct rpc_clnt *	rpcb_local_clnt4;
+
+struct rpcbind_args {
+	struct rpc_xprt *	r_xprt;		/* transport being bound */
+
+	u32			r_prog;		/* RPC program number */
+	u32			r_vers;		/* RPC version number */
+	u32			r_prot;		/* transport protocol */
+	unsigned short		r_port;		/* port to set, or port found */
+	const char *		r_netid;	/* v3/v4 only */
+	const char *		r_addr;		/* v3/v4 universal address */
+	const char *		r_owner;	/* see r_owner note above */
+
+	int			r_status;	/* passed to binding waiters */
+};
+
+static struct rpc_procinfo rpcb_procedures2[];
+static struct rpc_procinfo rpcb_procedures3[];
+static struct rpc_procinfo rpcb_procedures4[];
+
+struct rpcb_info {
+	u32			rpc_vers;	/* rpcbind protocol version */
+	struct rpc_procinfo *	rpc_proc;	/* procedure for that version */
+};
+
+static struct rpcb_info rpcb_next_version[];
+static struct rpcb_info rpcb_next_version6[];
+
+static const struct rpc_call_ops rpcb_getport_ops = {
+	.rpc_call_done		= rpcb_getport_done,	/* records result */
+	.rpc_release		= rpcb_map_release,	/* frees the args */
+};
+
+static void rpcb_wake_rpcbind_waiters(struct rpc_xprt *xprt, int status)
+{
+	xprt_clear_binding(xprt);	/* allow a new bind attempt */
+	rpc_wake_up_status(&xprt->binding, status);	/* wake queued tasks */
+}
+
+static void rpcb_map_release(void *data)
+{
+	struct rpcbind_args *map = data;
+
+	rpcb_wake_rpcbind_waiters(map->r_xprt, map->r_status);
+	xprt_put(map->r_xprt);	/* ref from rpcb_getport_async() */
+	kfree(map->r_addr);	/* NULL for rpcbind v2 requests */
+	kfree(map);
+}
+
+/*
+ * Create the AF_LOCAL rpcbind client, plus a v4 client when it can be bound.
+ * Returns zero on success, otherwise a negative errno value is returned.
+ */
+static int rpcb_create_local_unix(void)
+{
+	static const struct sockaddr_un rpcb_localaddr_rpcbind = {
+		.sun_family		= AF_LOCAL,
+		.sun_path		= RPCBIND_SOCK_PATHNAME,
+	};
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= XPRT_TRANSPORT_LOCAL,
+		.address	= (struct sockaddr *)&rpcb_localaddr_rpcbind,
+		.addrsize	= sizeof(rpcb_localaddr_rpcbind),
+		.servername	= "localhost",
+		.program	= &rpcb_program,
+		.version	= RPCBVERS_2,
+		.authflavor	= RPC_AUTH_NULL,
+	};
+	struct rpc_clnt *clnt, *clnt4;
+	int result = 0;
+
+	/*
+	 * Because we requested an RPC PING at transport creation time,
+	 * this works only if the user space portmapper is rpcbind, and
+	 * it's listening on AF_LOCAL on the named socket.
+	 */
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("RPC:       failed to create AF_LOCAL rpcbind "
+				"client (errno %ld).\n", PTR_ERR(clnt));
+		result = PTR_ERR(clnt);	/* already a negative errno */
+		goto out;
+	}
+
+	clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
+	if (IS_ERR(clnt4)) {
+		dprintk("RPC:       failed to bind second program to "
+				"rpcbind v4 client (errno %ld).\n",
+				PTR_ERR(clnt4));
+		clnt4 = NULL;
+	}
+
+	/* Protected by rpcb_create_local_mutex */
+	rpcb_local_clnt = clnt;
+	rpcb_local_clnt4 = clnt4;
+
+out:
+	return result;
+}
+
+/*
+ * Create the loopback TCP rpcbind client, plus a v4 client when possible.
+ * Returns zero on success, otherwise a negative errno value is returned.
+ */
+static int rpcb_create_local_net(void)
+{
+	static const struct sockaddr_in rpcb_inaddr_loopback = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
+		.sin_port		= htons(RPCBIND_PORT),
+	};
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= XPRT_TRANSPORT_TCP,
+		.address	= (struct sockaddr *)&rpcb_inaddr_loopback,
+		.addrsize	= sizeof(rpcb_inaddr_loopback),
+		.servername	= "localhost",
+		.program	= &rpcb_program,
+		.version	= RPCBVERS_2,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= RPC_CLNT_CREATE_NOPING,
+	};
+	struct rpc_clnt *clnt, *clnt4;
+	int result = 0;
+
+	clnt = rpc_create(&args);
+	if (IS_ERR(clnt)) {
+		dprintk("RPC:       failed to create local rpcbind "
+				"client (errno %ld).\n", PTR_ERR(clnt));
+		result = PTR_ERR(clnt);	/* already a negative errno */
+		goto out;
+	}
+
+	/*
+	 * This results in an RPC ping.  On systems running portmapper,
+	 * the v4 ping will fail.  Proceed anyway, but disallow rpcb
+	 * v4 upcalls.
+	 */
+	clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
+	if (IS_ERR(clnt4)) {
+		dprintk("RPC:       failed to bind second program to "
+				"rpcbind v4 client (errno %ld).\n",
+				PTR_ERR(clnt4));
+		clnt4 = NULL;
+	}
+
+	/* Protected by rpcb_create_local_mutex */
+	rpcb_local_clnt = clnt;
+	rpcb_local_clnt4 = clnt4;
+
+out:
+	return result;
+}
+
+/*
+ * Create the cached local rpcbind clients if they do not exist yet.
+ * Returns zero, or the value returned by rpcb_create_local_net().
+ */
+static int rpcb_create_local(void)
+{
+	static DEFINE_MUTEX(rpcb_create_local_mutex);
+	int err = 0;
+
+	/* Unlocked check first: nothing to do if a client already exists */
+	if (rpcb_local_clnt != NULL)
+		return 0;
+
+	mutex_lock(&rpcb_create_local_mutex);
+	/* Check again under the mutex; AF_LOCAL is tried before TCP */
+	if (rpcb_local_clnt == NULL) {
+		if (rpcb_create_local_unix() != 0)
+			err = rpcb_create_local_net();
+	}
+	mutex_unlock(&rpcb_create_local_mutex);
+
+	return err;
+}
+
+static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
+				    size_t salen, int proto, u32 version)
+{
+	struct rpc_create_args args = {
+		.net		= &init_net,
+		.protocol	= proto,
+		.address	= srvaddr,
+		.addrsize	= salen,
+		.servername	= hostname,
+		.program	= &rpcb_program,
+		.version	= version,
+		.authflavor	= RPC_AUTH_UNIX,
+		.flags		= (RPC_CLNT_CREATE_NOPING |
+					RPC_CLNT_CREATE_NONPRIVPORT),
+	};
+
+	switch (srvaddr->sa_family) {	/* overwrite the port in @srvaddr */
+	case AF_INET:
+		((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);	/* only IPv4 and IPv6 handled */
+	}
+
+	return rpc_create(&args);
+}
+
+/* Run a SET/UNSET call synchronously; -EACCES if rpcbind answered "false". */
+static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg)
+{
+	int reply;	/* boolean result decoded from the rpcbind answer */
+	int status;
+
+	msg->rpc_resp = &reply;
+
+	status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);
+	if (status < 0) {
+		dprintk("RPC:       failed to contact local rpcbind "
+				"server (errno %d).\n", -status);
+		return status;
+	}
+
+	return reply ? 0 : -EACCES;
+}
+
+/**
+ * rpcb_register - set or unset a port registration with the local rpcbind svc
+ * @prog: RPC program number to bind
+ * @vers: RPC version number to bind
+ * @prot: transport protocol to register
+ * @port: port value to register
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success.  Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
+ *
+ * RPC services invoke this function to advertise their contact
+ * information via the system's rpcbind daemon.  RPC services
+ * invoke this function once for each [program, version, transport]
+ * tuple they wish to advertise.
+ *
+ * Callers may also unregister RPC services that are no longer
+ * available by setting the passed-in port to zero.  This removes
+ * all registered transports for [program, version] from the local
+ * rpcbind database.
+ *
+ * This function uses rpcbind protocol version 2 to contact the
+ * local rpcbind daemon.
+ *
+ * Registration works over both AF_INET and AF_INET6, and services
+ * registered via this function are advertised as available for any
+ * address.  If the local rpcbind daemon is listening on AF_INET6,
+ * services registered via this function will be advertised on
+ * IN6ADDR_ANY (ie available for all AF_INET and AF_INET6
+ * addresses).
+ */
+int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
+{
+	struct rpcbind_args map = {
+		.r_prog		= prog,
+		.r_vers		= vers,
+		.r_prot		= prot,
+		.r_port		= port,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &map,
+	};
+	int status;
+
+	status = rpcb_create_local();
+	if (status != 0)
+		return status;
+
+	dprintk("RPC:       %sregistering (%u, %u, %d, %u) with local "
+			"rpcbind\n", (port ? "" : "un"),
+			prog, vers, prot, port);
+
+	/* A zero @port selects the UNSET procedure, i.e. unregisters;
+	 * any other value registers the mapping with SET. */
+	msg.rpc_proc = &rpcb_procedures2[port ? RPCBPROC_SET : RPCBPROC_UNSET];
+
+	return rpcb_register_call(rpcb_local_clnt, &msg);
+}
+
+/*
+ * Fill in AF_INET family-specific arguments to register.  Fails with
+ * -ENOMEM if the universal address string for @sap cannot be built.
+ */
+static int rpcb_register_inet4(const struct sockaddr *sap,
+			       struct rpc_message *msg)
+{
+	const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
+	struct rpcbind_args *map = msg->rpc_argp;
+	unsigned short port = ntohs(sin->sin_port);
+	int result;
+
+	map->r_addr = rpc_sockaddr2uaddr(sap);
+	if (map->r_addr == NULL)
+		return -ENOMEM;	/* the encoder would strlen() a NULL */
+
+	dprintk("RPC:       %sregistering [%u, %u, %s, '%s'] with "
+		"local rpcbind\n", (port ? "" : "un"),
+			map->r_prog, map->r_vers,
+			map->r_addr, map->r_netid);
+
+	msg->rpc_proc = &rpcb_procedures4[port ? RPCBPROC_SET : RPCBPROC_UNSET];
+	result = rpcb_register_call(rpcb_local_clnt4, msg);
+	kfree(map->r_addr);
+	return result;
+}
+
+/*
+ * Fill in AF_INET6 family-specific arguments to register.  Fails with
+ * -ENOMEM if the universal address string for @sap cannot be built.
+ */
+static int rpcb_register_inet6(const struct sockaddr *sap,
+			       struct rpc_message *msg)
+{
+	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
+	struct rpcbind_args *map = msg->rpc_argp;
+	unsigned short port = ntohs(sin6->sin6_port);
+	int result;
+
+	map->r_addr = rpc_sockaddr2uaddr(sap);
+	if (map->r_addr == NULL)
+		return -ENOMEM;	/* the encoder would strlen() a NULL */
+
+	dprintk("RPC:       %sregistering [%u, %u, %s, '%s'] with "
+		"local rpcbind\n", (port ? "" : "un"),
+			map->r_prog, map->r_vers,
+			map->r_addr, map->r_netid);
+
+	msg->rpc_proc = &rpcb_procedures4[port ? RPCBPROC_SET : RPCBPROC_UNSET];
+	result = rpcb_register_call(rpcb_local_clnt4, msg);
+	kfree(map->r_addr);
+	return result;
+}
+
+static int rpcb_unregister_all_protofamilies(struct rpc_message *msg)
+{
+	struct rpcbind_args *map = msg->rpc_argp;
+
+	dprintk("RPC:       unregistering [%u, %u, '%s'] with "
+		"local rpcbind\n",
+			map->r_prog, map->r_vers, map->r_netid);
+
+	map->r_addr = "";	/* no specific address is given */
+	msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
+
+	return rpcb_register_call(rpcb_local_clnt4, msg);
+}
+
+/**
+ * rpcb_v4_register - set or unset a port registration with the local rpcbind
+ * @program: RPC program number of service to (un)register
+ * @version: RPC version number of service to (un)register
+ * @address: address family, IP address, and port to (un)register
+ * @netid: netid of transport protocol to (un)register
+ *
+ * Returns zero if the registration request was dispatched successfully
+ * and the rpcbind daemon returned success.  Otherwise, returns an errno
+ * value that reflects the nature of the error (request could not be
+ * dispatched, timed out, or rpcbind returned an error).
+ *
+ * RPC services invoke this function to advertise their contact
+ * information via the system's rpcbind daemon.  RPC services
+ * invoke this function once for each [program, version, address,
+ * netid] tuple they wish to advertise.
+ *
+ * Callers may also unregister RPC services that are registered at a
+ * specific address by setting the port number in @address to zero.
+ * They may unregister all registered protocol families at once for
+ * a service by passing a NULL @address argument.  If @netid is ""
+ * then all netids for [program, version, address] are unregistered.
+ *
+ * This function uses rpcbind protocol version 4 to contact the
+ * local rpcbind daemon.  The local rpcbind daemon must support
+ * version 4 of the rpcbind protocol in order for these functions
+ * to register a service successfully.
+ *
+ * Supported netids include "udp" and "tcp" for UDP and TCP over
+ * IPv4, and "udp6" and "tcp6" for UDP and TCP over IPv6,
+ * respectively.
+ *
+ * The contents of @address determine the address family and the
+ * port to be registered.  The usual practice is to pass INADDR_ANY
+ * as the raw address, but specifying a non-zero address is also
+ * supported by this API if the caller wishes to advertise an RPC
+ * service on a specific network interface.
+ *
+ * Note that passing in INADDR_ANY does not create the same service
+ * registration as IN6ADDR_ANY.  The former advertises an RPC
+ * service on any IPv4 address, but not on IPv6.  The latter
+ * advertises the service on all IPv4 and IPv6 addresses.
+ */
+int rpcb_v4_register(const u32 program, const u32 version,
+		     const struct sockaddr *address, const char *netid)
+{
+	struct rpcbind_args map = {
+		.r_prog		= program,
+		.r_vers		= version,
+		.r_netid	= netid,
+		.r_owner	= RPCB_OWNER_STRING,
+	};
+	struct rpc_message msg = {
+		.rpc_argp	= &map,
+	};
+	int status;
+
+	status = rpcb_create_local();
+	if (status != 0)
+		return status;
+	if (!rpcb_local_clnt4)
+		return -EPROTONOSUPPORT;
+	/* A NULL @address unregisters every protocol family at once */
+	if (!address)
+		return rpcb_unregister_all_protofamilies(&msg);
+
+	switch (address->sa_family) {
+	case AF_INET:
+		return rpcb_register_inet4(address, &msg);
+	case AF_INET6:
+		return rpcb_register_inet6(address, &msg);
+	default:
+		return -EAFNOSUPPORT;
+	}
+}
+
+static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc)
+{
+	struct rpc_message msg = {
+		.rpc_proc = proc,
+		.rpc_argp = map,
+		.rpc_resp = map,	/* @map is both argument and result */
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = rpcb_clnt,
+		.rpc_message = &msg,
+		.callback_ops = &rpcb_getport_ops,
+		.callback_data = map,	/* given to the rpcb_getport_ops hooks */
+		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFTCONN,
+	};
+
+	return rpc_run_task(&task_setup_data);
+}
+
+/*
+ * In the case where rpc clients have been cloned, we want to make
+ * sure that we use the program number/version etc of the actual
+ * owner of the xprt. To do so, we walk back up the tree of parents
+ * to find whoever created the transport and/or whoever has the
+ * autobind flag set.
+ */
+static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
+{
+	struct rpc_clnt *parent = clnt->cl_parent;
+
+	while (parent != clnt) {	/* stops at a client that is its own parent */
+		if (parent->cl_xprt != clnt->cl_xprt)
+			break;	/* parent uses a different transport */
+		if (clnt->cl_autobind)
+			break;	/* this client has autobind set */
+		clnt = parent;
+		parent = parent->cl_parent;
+	}
+	return clnt;
+}
+
+/**
+ * rpcb_getport_async - obtain the port for a given RPC service on a given host
+ * @task: task that is waiting for portmapper request
+ *
+ * This one can be called for an ongoing RPC request, and can be used in
+ * an async (rpciod) context.
+ */
+void rpcb_getport_async(struct rpc_task *task)
+{
+	struct rpc_clnt *clnt, *rpcb_clnt;
+	struct rpc_procinfo *proc;
+	u32 bind_version;
+	struct rpc_xprt *xprt;
+	struct rpcbind_args *map;
+	struct rpc_task	*child;
+	struct sockaddr_storage addr;
+	struct sockaddr *sap = (struct sockaddr *)&addr;
+	size_t salen;
+	int status;
+
+	clnt = rpcb_find_transport_owner(task->tk_client);
+	xprt = clnt->cl_xprt;
+	dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", task->tk_pid, __func__,
+		clnt->cl_server, clnt->cl_prog, clnt->cl_vers, xprt->prot);
+
+	/* Put self on the wait queue to ensure we get notified if
+	 * some other task is already attempting to bind the port */
+	rpc_sleep_on(&xprt->binding, task, NULL);
+	if (xprt_test_and_set_binding(xprt)) {
+		dprintk("RPC: %5u %s: waiting for another binder\n",
+			task->tk_pid, __func__);
+		return;
+	}
+
+	/* Someone else may have bound if we slept */
+	if (xprt_bound(xprt)) {
+		status = 0;
+		dprintk("RPC: %5u %s: already bound\n",
+			task->tk_pid, __func__);
+		goto bailout_nofree;
+	}
+
+	/* Parent transport's destination address */
+	salen = rpc_peeraddr(clnt, sap, sizeof(addr));
+	/* Don't ever use rpcbind v2 for AF_INET6 requests */
+	switch (sap->sa_family) {
+	case AF_INET:
+		proc = rpcb_next_version[xprt->bind_index].rpc_proc;
+		bind_version = rpcb_next_version[xprt->bind_index].rpc_vers;
+		break;
+	case AF_INET6:
+		proc = rpcb_next_version6[xprt->bind_index].rpc_proc;
+		bind_version = rpcb_next_version6[xprt->bind_index].rpc_vers;
+		break;
+	default:
+		status = -EAFNOSUPPORT;
+		dprintk("RPC: %5u %s: bad address family\n",
+				task->tk_pid, __func__);
+		goto bailout_nofree;
+	}
+	if (proc == NULL) {
+		xprt->bind_index = 0;
+		status = -EPFNOSUPPORT;
+		dprintk("RPC: %5u %s: no more getport versions available\n",
+			task->tk_pid, __func__);
+		goto bailout_nofree;
+	}
+
+	dprintk("RPC: %5u %s: trying rpcbind version %u\n",
+		task->tk_pid, __func__, bind_version);
+	rpcb_clnt = rpcb_create(clnt->cl_server, sap, salen, xprt->prot,
+				bind_version);
+	if (IS_ERR(rpcb_clnt)) {
+		status = PTR_ERR(rpcb_clnt);
+		dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n",
+			task->tk_pid, __func__, PTR_ERR(rpcb_clnt));
+		goto bailout_nofree;
+	}
+
+	map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC);
+	if (!map) {
+		status = -ENOMEM;
+		dprintk("RPC: %5u %s: no memory available\n",
+			task->tk_pid, __func__);
+		goto bailout_release_client;
+	}
+	map->r_prog = clnt->cl_prog;
+	map->r_vers = clnt->cl_vers;
+	map->r_prot = xprt->prot;
+	map->r_xprt = xprt_get(xprt);
+	map->r_status = -EIO;
+
+	switch (bind_version) {
+	case RPCBVERS_4:
+	case RPCBVERS_3:
+		map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
+		map->r_addr = rpc_sockaddr2uaddr(sap);
+		if (map->r_addr == NULL) {	/* the encoder needs a string */
+			status = -ENOMEM;
+			goto bailout_free_args;
+		}
+		map->r_owner = "";
+		break;
+	case RPCBVERS_2:
+		map->r_addr = NULL;
+		break;
+	default:
+		BUG();
+	}
+
+	child = rpcb_call_async(rpcb_clnt, map, proc);
+	rpc_release_client(rpcb_clnt);
+	if (IS_ERR(child)) {
+		/* rpcb_map_release() has freed the arguments */
+		dprintk("RPC: %5u %s: rpc_run_task failed\n",
+			task->tk_pid, __func__);
+		return;
+	}
+
+	xprt->stat.bind_count++;
+	rpc_put_task(child);
+	return;
+
+bailout_free_args:
+	xprt_put(map->r_xprt);
+	kfree(map);
+bailout_release_client:
+	rpc_release_client(rpcb_clnt);
+bailout_nofree:
+	rpcb_wake_rpcbind_waiters(xprt, status);
+	task->tk_status = status;
+}
+EXPORT_SYMBOL_GPL(rpcb_getport_async);
+
+/*
+ * rpc_call_done callback for the rpcbind child task (see rpcb_getport_ops).
+ */
+static void rpcb_getport_done(struct rpc_task *child, void *data)
+{
+	struct rpcbind_args *map = data;
+	struct rpc_xprt *xprt = map->r_xprt;
+	int status = child->tk_status;
+
+	/* Garbage reply: retry with a lesser rpcbind version */
+	if (status == -EIO)
+		status = -EPROTONOSUPPORT;
+
+	/* rpcbind server doesn't support this rpcbind protocol version */
+	if (status == -EPROTONOSUPPORT)
+		xprt->bind_index++;
+
+	if (status < 0) {
+		/* rpcbind server not available on remote host? */
+		xprt->ops->set_port(xprt, 0);
+	} else if (map->r_port == 0) {
+		/* Requested RPC service wasn't registered on remote host */
+		xprt->ops->set_port(xprt, 0);
+		status = -EACCES;
+	} else {
+		/* Succeeded */
+		xprt->ops->set_port(xprt, map->r_port);
+		xprt_set_bound(xprt);
+		status = 0;
+	}
+
+	dprintk("RPC: %5u rpcb_getport_done(status %d, port %u)\n",
+			child->tk_pid, status, map->r_port);
+
+	map->r_status = status;	/* handed to waiters by rpcb_map_release() */
+}
+
+/*
+ * XDR functions for rpcbind
+ */
+
+static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     const struct rpcbind_args *rpcb)
+{
+	struct rpc_task *task = req->rq_task;
+	__be32 *p;
+
+	dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n",
+			task->tk_pid, task->tk_msg.rpc_proc->p_name,
+			rpcb->r_prog, rpcb->r_vers, rpcb->r_prot, rpcb->r_port);
+
+	/* NOTE(review): size looks like a 32-bit word count (<< 2); result is used unchecked */
+	p = xdr_reserve_space(xdr, RPCB_mappingargs_sz << 2);
+	*p++ = cpu_to_be32(rpcb->r_prog);	/* four big-endian words follow */
+	*p++ = cpu_to_be32(rpcb->r_vers);
+	*p++ = cpu_to_be32(rpcb->r_prot);
+	*p   = cpu_to_be32(rpcb->r_port);
+}
+
+static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr,
+			    struct rpcbind_args *rpcb)
+{
+	struct rpc_task *task = req->rq_task;
+	unsigned long port;
+	__be32 *p;
+
+	rpcb->r_port = 0;	/* value left behind on every -EIO path */
+
+	p = xdr_inline_decode(xdr, 4);	/* the reply is a single 32-bit word */
+	if (unlikely(p == NULL))
+		return -EIO;
+
+	port = be32_to_cpup(p);
+	dprintk("RPC: %5u PMAP_%s result: %lu\n", task->tk_pid,
+			task->tk_msg.rpc_proc->p_name, port);
+	if (unlikely(port > USHRT_MAX))	/* reject values above USHRT_MAX */
+		return -EIO;
+
+	rpcb->r_port = port;
+	return 0;
+}
+
+static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr,
+			unsigned int *boolp)
+{
+	struct rpc_task *task = req->rq_task;
+	__be32 *word;
+
+	word = xdr_inline_decode(xdr, 4);
+	if (unlikely(!word))
+		return -EIO;
+
+	/* The reply is one XDR word: zero means the call failed, */
+	/* anything else is reported to the caller as 1. */
+	*boolp = (*word != xdr_zero) ? 1 : 0;
+
+	dprintk("RPC: %5u RPCB_%s call %s\n",
+			task->tk_pid, task->tk_msg.rpc_proc->p_name,
+			(*boolp ? "succeeded" : "failed"));
+	return 0;
+}
+
+static void encode_rpcb_string(struct xdr_stream *xdr, const char *string,
+			       const u32 maxstrlen)
+{
+	__be32 *p;
+	u32 len;
+
+	len = strlen(string);	/* string must be non-NULL and NUL-terminated */
+	BUG_ON(len > maxstrlen);	/* over-long strings are treated as a caller bug */
+	p = xdr_reserve_space(xdr, 4 + len);	/* length word + bytes; NOTE(review): result unchecked */
+	xdr_encode_opaque(p, string, len);
+}
+
+static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
+			     const struct rpcbind_args *rpcb)
+{
+	struct rpc_task *task = req->rq_task;
+	__be32 *p;
+
+	dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n",
+			task->tk_pid, task->tk_msg.rpc_proc->p_name,
+			rpcb->r_prog, rpcb->r_vers,
+			rpcb->r_netid, rpcb->r_addr);
+
+	/* program and version words first; NOTE(review): reserve result is used unchecked */
+	p = xdr_reserve_space(xdr, (RPCB_program_sz + RPCB_version_sz) << 2);
+	*p++ = cpu_to_be32(rpcb->r_prog);
+	*p = cpu_to_be32(rpcb->r_vers);
+
+	encode_rpcb_string(xdr, rpcb->r_netid, RPCBIND_MAXNETIDLEN);	/* then three strings, */
+	encode_rpcb_string(xdr, rpcb->r_addr, RPCBIND_MAXUADDRLEN);	/* each of which must be */
+	encode_rpcb_string(xdr, rpcb->r_owner, RPCB_MAXOWNERLEN);	/* non-NULL (strlen is used) */
+}
+
+static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr,
+			    struct rpcbind_args *rpcb)
+{
+	struct sockaddr_storage address;
+	struct sockaddr *sap = (struct sockaddr *)&address;
+	struct rpc_task *task = req->rq_task;
+	__be32 *p;
+	u32 len;
+
+	rpcb->r_port = 0;	/* stays 0 unless a usable address is decoded */
+
+	p = xdr_inline_decode(xdr, 4);	/* length word of the universal address */
+	if (unlikely(p == NULL))
+		goto out_fail;
+	len = be32_to_cpup(p);
+
+	/*
+	 * If the returned universal address is a null string,
+	 * the requested RPC service was not registered.
+	 */
+	if (len == 0) {
+		dprintk("RPC: %5u RPCB reply: program not registered\n",
+				task->tk_pid);
+		return 0;	/* success, with r_port left at 0 */
+	}
+
+	if (unlikely(len > RPCBIND_MAXUADDRLEN))
+		goto out_fail;
+
+	p = xdr_inline_decode(xdr, len);	/* raw reply bytes: no NUL terminator guaranteed */
+	if (unlikely(p == NULL))
+		goto out_fail;
+	dprintk("RPC: %5u RPCB_%s reply: %.*s\n", task->tk_pid,
+			task->tk_msg.rpc_proc->p_name, (int)len, (char *)p);
+
+	if (rpc_uaddr2sockaddr((char *)p, len, sap, sizeof(address)) == 0)
+		goto out_fail;
+	rpcb->r_port = rpc_get_port(sap);
+
+	return 0;
+
+out_fail:	/* short, over-long or unparsable reply */
+	dprintk("RPC: %5u malformed RPCB_%s reply\n",
+			task->tk_pid, task->tk_msg.rpc_proc->p_name);
+	return -EIO;
+}
+
+/*
+ * Not all rpcbind procedures described in RFC 1833 are implemented
+ * since the Linux kernel RPC code requires only these.
+ */
+
+static struct rpc_procinfo rpcb_procedures2[] = {
+	[RPCBPROC_SET] = {
+		.p_proc		= RPCBPROC_SET,
+		.p_encode	= (kxdreproc_t)rpcb_enc_mapping,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_set,
+		.p_arglen	= RPCB_mappingargs_sz,
+		.p_replen	= RPCB_setres_sz,
+		.p_statidx	= RPCBPROC_SET,
+		.p_timer	= 0,
+		.p_name		= "SET",
+	},
+	[RPCBPROC_UNSET] = {
+		.p_proc		= RPCBPROC_UNSET,
+		.p_encode	= (kxdreproc_t)rpcb_enc_mapping,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_set,
+		.p_arglen	= RPCB_mappingargs_sz,
+		.p_replen	= RPCB_setres_sz,
+		.p_statidx	= RPCBPROC_UNSET,
+		.p_timer	= 0,
+		.p_name		= "UNSET",
+	},
+	[RPCBPROC_GETPORT] = {
+		.p_proc		= RPCBPROC_GETPORT,
+		.p_encode	= (kxdreproc_t)rpcb_enc_mapping,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_getport,
+		.p_arglen	= RPCB_mappingargs_sz,
+		.p_replen	= RPCB_getportres_sz,
+		.p_statidx	= RPCBPROC_GETPORT,
+		.p_timer	= 0,
+		.p_name		= "GETPORT",
+	},
+};
+
+static struct rpc_procinfo rpcb_procedures3[] = {
+	[RPCBPROC_SET] = {
+		.p_proc		= RPCBPROC_SET,
+		.p_encode	= (kxdreproc_t)rpcb_enc_getaddr,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_set,
+		.p_arglen	= RPCB_getaddrargs_sz,
+		.p_replen	= RPCB_setres_sz,
+		.p_statidx	= RPCBPROC_SET,
+		.p_timer	= 0,
+		.p_name		= "SET",
+	},
+	[RPCBPROC_UNSET] = {
+		.p_proc		= RPCBPROC_UNSET,
+		.p_encode	= (kxdreproc_t)rpcb_enc_getaddr,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_set,
+		.p_arglen	= RPCB_getaddrargs_sz,
+		.p_replen	= RPCB_setres_sz,
+		.p_statidx	= RPCBPROC_UNSET,
+		.p_timer	= 0,
+		.p_name		= "UNSET",
+	},
+	[RPCBPROC_GETADDR] = {
+		.p_proc		= RPCBPROC_GETADDR,
+		.p_encode	= (kxdreproc_t)rpcb_enc_getaddr,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_getaddr,
+		.p_arglen	= RPCB_getaddrargs_sz,
+		.p_replen	= RPCB_getaddrres_sz,
+		.p_statidx	= RPCBPROC_GETADDR,
+		.p_timer	= 0,
+		.p_name		= "GETADDR",
+	},
+};
+
+static struct rpc_procinfo rpcb_procedures4[] = {
+	[RPCBPROC_SET] = {
+		.p_proc		= RPCBPROC_SET,
+		.p_encode	= (kxdreproc_t)rpcb_enc_getaddr,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_set,
+		.p_arglen	= RPCB_getaddrargs_sz,
+		.p_replen	= RPCB_setres_sz,
+		.p_statidx	= RPCBPROC_SET,
+		.p_timer	= 0,
+		.p_name		= "SET",
+	},
+	[RPCBPROC_UNSET] = {
+		.p_proc		= RPCBPROC_UNSET,
+		.p_encode	= (kxdreproc_t)rpcb_enc_getaddr,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_set,
+		.p_arglen	= RPCB_getaddrargs_sz,
+		.p_replen	= RPCB_setres_sz,
+		.p_statidx	= RPCBPROC_UNSET,
+		.p_timer	= 0,
+		.p_name		= "UNSET",
+	},
+	[RPCBPROC_GETADDR] = {
+		.p_proc		= RPCBPROC_GETADDR,
+		.p_encode	= (kxdreproc_t)rpcb_enc_getaddr,
+		.p_decode	= (kxdrdproc_t)rpcb_dec_getaddr,
+		.p_arglen	= RPCB_getaddrargs_sz,
+		.p_replen	= RPCB_getaddrres_sz,
+		.p_statidx	= RPCBPROC_GETADDR,
+		.p_timer	= 0,
+		.p_name		= "GETADDR",
+	},
+};
+
+static struct rpcb_info rpcb_next_version[] = {
+	{
+		.rpc_vers	= RPCBVERS_2,
+		.rpc_proc	= &rpcb_procedures2[RPCBPROC_GETPORT],
+	},
+	{
+		.rpc_proc	= NULL,
+	},
+};
+
+static struct rpcb_info rpcb_next_version6[] = {
+	{
+		.rpc_vers	= RPCBVERS_4,
+		.rpc_proc	= &rpcb_procedures4[RPCBPROC_GETADDR],
+	},
+	{
+		.rpc_vers	= RPCBVERS_3,
+		.rpc_proc	= &rpcb_procedures3[RPCBPROC_GETADDR],
+	},
+	{
+		.rpc_proc	= NULL,
+	},
+};
+
+static struct rpc_version rpcb_version2 = {
+	.number		= RPCBVERS_2,
+	.nrprocs	= ARRAY_SIZE(rpcb_procedures2),
+	.procs		= rpcb_procedures2
+};
+
+static struct rpc_version rpcb_version3 = {
+	.number		= RPCBVERS_3,
+	.nrprocs	= ARRAY_SIZE(rpcb_procedures3),
+	.procs		= rpcb_procedures3
+};
+
+static struct rpc_version rpcb_version4 = {
+	.number		= RPCBVERS_4,
+	.nrprocs	= ARRAY_SIZE(rpcb_procedures4),
+	.procs		= rpcb_procedures4
+};
+
+static struct rpc_version *rpcb_version[] = {
+	NULL,
+	NULL,
+	&rpcb_version2,
+	&rpcb_version3,
+	&rpcb_version4
+};
+
+static struct rpc_stat rpcb_stats;
+
+static struct rpc_program rpcb_program = {
+	.name		= "rpcbind",
+	.number		= RPCBIND_PROGRAM,
+	.nrvers		= ARRAY_SIZE(rpcb_version),
+	.version	= rpcb_version,
+	.stats		= &rpcb_stats,
+};
+
+/**
+ * cleanup_rpcb_clnt - shut down the local rpcbind clients that were created
+ *
+ */
+void cleanup_rpcb_clnt(void)
+{
+	if (rpcb_local_clnt4)	/* each client is shut down only if it is non-NULL */
+		rpc_shutdown_client(rpcb_local_clnt4);
+	if (rpcb_local_clnt)
+		rpc_shutdown_client(rpcb_local_clnt);
+}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
new file mode 100644
index 00000000..b6bb2257
--- /dev/null
+++ b/net/sunrpc/sched.c
@@ -0,0 +1,1016 @@
+/*
+ * linux/net/sunrpc/sched.c
+ *
+ * Scheduling for synchronous and asynchronous RPC requests.
+ *
+ * Copyright (C) 1996 Olaf Kirch, <okir@monad.swb.de>
+ *
+ * TCP NFS related read + write fixes
+ * (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
+ */
+
+#include <linux/module.h>
+
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#include "sunrpc.h"
+
+#ifdef RPC_DEBUG
+#define RPCDBG_FACILITY		RPCDBG_SCHED
+#endif
+
+/*
+ * RPC slabs and memory pools
+ */
+#define RPC_BUFFER_MAXSIZE	(2048)
+#define RPC_BUFFER_POOLSIZE	(8)
+#define RPC_TASK_POOLSIZE	(8)
+static struct kmem_cache	*rpc_task_slabp __read_mostly;
+static struct kmem_cache	*rpc_buffer_slabp __read_mostly;
+static mempool_t	*rpc_task_mempool __read_mostly;
+static mempool_t	*rpc_buffer_mempool __read_mostly;
+
+static void			rpc_async_schedule(struct work_struct *);
+static void			 rpc_release_task(struct rpc_task *task);
+static void __rpc_queue_timer_fn(unsigned long ptr);
+
+/*
+ * RPC tasks sit here while waiting for conditions to improve.
+ */
+static struct rpc_wait_queue delay_queue;
+
+/*
+ * rpciod-related stuff
+ */
+struct workqueue_struct *rpciod_workqueue;
+
+/*
+ * Disable the timer for a given RPC task. Should be called with
+ * queue->lock and bh_disabled in order to avoid races with the
+ * queue's timer handler, __rpc_queue_timer_fn().
+ */
+static void
+__rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	if (task->tk_timeout == 0)	/* task is not on the timer list */
+		return;
+	dprintk("RPC: %5u disabling timer\n", task->tk_pid);
+	task->tk_timeout = 0;
+	list_del(&task->u.tk_wait.timer_list);
+	if (list_empty(&queue->timer_list.list))	/* no timed tasks left on the queue */
+		del_timer(&queue->timer_list.timer);
+}
+
+static void
+rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
+{
+	queue->timer_list.expires = expires;	/* compared against by __rpc_add_timer() */
+	mod_timer(&queue->timer_list.timer, expires);
+}
+
+/*
+ * Set up a timer for the current task; a zero tk_timeout means "no timer".
+ */
+static void
+__rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	if (!task->tk_timeout)
+		return;
+
+	dprintk("RPC: %5u setting alarm for %lu ms\n",
+			task->tk_pid, task->tk_timeout * 1000 / HZ);
+
+	task->u.tk_wait.expires = jiffies + task->tk_timeout;	/* tk_timeout is relative, in jiffies */
+	if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
+		rpc_set_queue_timer(queue, task->u.tk_wait.expires);	/* new earliest deadline */
+	list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
+}
+
+/* Add new request to a priority queue.  A task whose tk_owner matches one
+ * already on the level's list is chained onto that task's links list instead.
+ */
+static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	struct list_head *q;
+	struct rpc_task *t;
+
+	INIT_LIST_HEAD(&task->u.tk_wait.links);
+	q = &queue->tasks[task->tk_priority];
+	if (unlikely(task->tk_priority > queue->maxpriority))	/* clamp to the top level */
+		q = &queue->tasks[queue->maxpriority];
+	list_for_each_entry(t, q, u.tk_wait.list) {
+		if (t->tk_owner == task->tk_owner) {
+			list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
+			return;
+		}
+	}
+	list_add_tail(&task->u.tk_wait.list, q);	/* first task of this owner on the level */
+}
+
+/*
+ * Add new request to wait queue.
+ *
+ * Swapper tasks always get inserted at the head of the queue.
+ * This should avoid many nasty memory deadlocks and hopefully
+ * improve overall performance.
+ * Everyone else gets appended to the queue to ensure proper FIFO behavior.
+ */
+static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	BUG_ON (RPC_IS_QUEUED(task));	/* a task may sit on one queue at a time */
+
+	if (RPC_IS_PRIORITY(queue))
+		__rpc_add_wait_queue_priority(queue, task);
+	else if (RPC_IS_SWAPPER(task))
+		list_add(&task->u.tk_wait.list, &queue->tasks[0]);
+	else
+		list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
+	task->tk_waitqueue = queue;	/* lets wakers check which queue the task is on */
+	queue->qlen++;
+	rpc_set_queued(task);
+
+	dprintk("RPC: %5u added to queue %p \"%s\"\n",
+			task->tk_pid, queue, rpc_qname(queue));
+}
+
+/* Remove request from a priority queue: if tasks are chained on this task's
+ * links list, the first of them is moved onto the queue and inherits the rest.
+ */
+static void __rpc_remove_wait_queue_priority(struct rpc_task *task)
+{
+	struct rpc_task *t;
+
+	if (!list_empty(&task->u.tk_wait.links)) {
+		t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
+		list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);	/* t now follows task on the queue */
+		list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
+	}
+}
+
+/*
+ * Remove request from queue, cancelling its timer first.
+ * Note: must be called with spin lock held.
+ */
+static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	__rpc_disable_timer(queue, task);
+	if (RPC_IS_PRIORITY(queue))
+		__rpc_remove_wait_queue_priority(task);	/* promote a chained same-owner task, if any */
+	list_del(&task->u.tk_wait.list);
+	queue->qlen--;
+	dprintk("RPC: %5u removed from queue %p \"%s\"\n",
+			task->tk_pid, queue, rpc_qname(queue));
+}
+
+static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
+{
+	queue->priority = priority;
+	queue->count = 1 << (priority * 2);	/* 1, 4, 16, ... for priority 0, 1, 2, ... */
+}
+
+static inline void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid)
+{
+	queue->owner = pid;
+	queue->nr = RPC_BATCH_COUNT;	/* counted down in __rpc_wake_up_next_priority() */
+}
+
+static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue)
+{
+	rpc_set_waitqueue_priority(queue, queue->maxpriority);	/* restart at the top level */
+	rpc_set_waitqueue_owner(queue, 0);	/* owner 0, fresh batch count */
+}
+
+static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, unsigned char nr_queues)
+{
+	int i;
+
+	spin_lock_init(&queue->lock);
+	for (i = 0; i < ARRAY_SIZE(queue->tasks); i++)	/* every level, not just nr_queues of them */
+		INIT_LIST_HEAD(&queue->tasks[i]);
+	queue->maxpriority = nr_queues - 1;
+	rpc_reset_waitqueue_priority(queue);
+	queue->qlen = 0;
+	setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue);
+	INIT_LIST_HEAD(&queue->timer_list.list);
+#ifdef RPC_DEBUG
+	queue->name = qname;	/* pointer is stored, the string is not copied */
+#endif
+}
+
+void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname)
+{
+	__rpc_init_priority_wait_queue(queue, qname, RPC_NR_PRIORITY);	/* maxpriority = RPC_NR_PRIORITY - 1 */
+}
+EXPORT_SYMBOL_GPL(rpc_init_priority_wait_queue);
+
+void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname)
+{
+	__rpc_init_priority_wait_queue(queue, qname, 1);	/* single level: maxpriority becomes 0 */
+}
+EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
+
+void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
+{
+	del_timer_sync(&queue->timer_list.timer);	/* the queue timer is the only thing torn down here */
+}
+EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
+
+static int rpc_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))	/* only a fatal signal aborts the wait */
+		return -ERESTARTSYS;
+	schedule();	/* @word itself is not examined here */
+	return 0;
+}
+
+#ifdef RPC_DEBUG
+static void rpc_task_set_debuginfo(struct rpc_task *task)
+{
+	static atomic_t rpc_pid;	/* counter behind the tk_pid values shown in dprintk output */
+
+	task->tk_pid = atomic_inc_return(&rpc_pid);
+}
+#else
+static inline void rpc_task_set_debuginfo(struct rpc_task *task)
+{
+}
+#endif
+
+static void rpc_set_active(struct rpc_task *task)
+{
+	rpc_task_set_debuginfo(task);	/* assigns tk_pid when RPC_DEBUG is set */
+	set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);	/* cleared again by rpc_complete_task() */
+}
+
+/*
+ * Mark an RPC call as having completed by clearing the 'active' bit and
+ * waking up all sleepers. Returns nonzero if tk_count dropped to zero here.
+ */
+static int rpc_complete_task(struct rpc_task *task)
+{
+	void *m = &task->tk_runstate;
+	wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE);
+	struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&wq->lock, flags);	/* bit clear and wake-up happen under wq->lock */
+	clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
+	ret = atomic_dec_and_test(&task->tk_count);
+	if (waitqueue_active(wq))
+		__wake_up_locked_key(wq, TASK_NORMAL, &k);
+	spin_unlock_irqrestore(&wq->lock, flags);
+	return ret;
+}
+
+/*
+ * Allow callers to wait for completion of an RPC call
+ *
+ * Note the use of out_of_line_wait_on_bit() rather than wait_on_bit()
+ * to enforce taking of the wq->lock and hence avoid races with
+ * rpc_complete_task().
+ */
+int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *))
+{
+	if (action == NULL)	/* default to the killable wait */
+		action = rpc_wait_bit_killable;
+	return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
+			action, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
+
+/*
+ * Make an RPC task runnable.
+ *
+ * Note: If the task is ASYNC, this must be called with
+ * the spinlock held to protect the wait queue operation.
+ */
+static void rpc_make_runnable(struct rpc_task *task)
+{
+	rpc_clear_queued(task);
+	if (rpc_test_and_set_running(task))	/* was already marked running */
+		return;
+	if (RPC_IS_ASYNC(task)) {
+		INIT_WORK(&task->u.tk_work, rpc_async_schedule);	/* run __rpc_execute() from rpciod */
+		queue_work(rpciod_workqueue, &task->u.tk_work);
+	} else
+		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);	/* sync tasks wait on this bit in __rpc_execute() */
+}
+
+/*
+ * Prepare for sleeping on a wait queue.
+ * Tasks are queued by __rpc_add_wait_queue() (FIFO except for swapper tasks).
+ * NB: An RPC task will only receive interrupt-driven events as long
+ * as it's on a wait queue.
+ */
+static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+			rpc_action action)
+{
+	dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
+			task->tk_pid, rpc_qname(q), jiffies);
+
+	__rpc_add_wait_queue(q, task);
+
+	BUG_ON(task->tk_callback != NULL);	/* only one pending callback per task */
+	task->tk_callback = action;	/* run by __rpc_execute() before the next tk_action */
+	__rpc_add_timer(q, task);
+}
+
+void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+				rpc_action action)
+{
+	/* We shouldn't ever put an inactive task to sleep */
+	BUG_ON(!RPC_IS_ACTIVATED(task));
+
+	/*
+	 * Protect the queue operations (queue insert, callback, timer).
+	 */
+	spin_lock_bh(&q->lock);
+	__rpc_sleep_on(q, task, action);
+	spin_unlock_bh(&q->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_sleep_on);
+
+/**
+ * __rpc_do_wake_up_task - wake up a single rpc_task
+ * @queue: wait queue
+ * @task: task to be woken up
+ *
+ * Caller must hold queue->lock; the queued flag is cleared by rpc_make_runnable().
+ */
+static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
+			task->tk_pid, jiffies);
+
+	/* Has the task been executed yet? If not, we cannot wake it up! */
+	if (!RPC_IS_ACTIVATED(task)) {
+		printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
+		return;	/* NOTE: the task is left on the queue in this case */
+	}
+
+	__rpc_remove_wait_queue(queue, task);
+
+	rpc_make_runnable(task);
+
+	dprintk("RPC:       __rpc_wake_up_task done\n");
+}
+
+/*
+ * Wake up a queued task while the queue lock is being held; no-op otherwise
+ */
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	if (RPC_IS_QUEUED(task) && task->tk_waitqueue == queue)	/* must be queued on this very queue */
+		__rpc_do_wake_up_task(queue, task);
+}
+
+/*
+ * Report whether @queue currently holds no tasks (qlen sampled under the lock)
+ */
+int rpc_queue_empty(struct rpc_wait_queue *queue)
+{
+	int is_empty;
+
+	spin_lock_bh(&queue->lock);
+	is_empty = (queue->qlen == 0);
+	spin_unlock_bh(&queue->lock);
+	return is_empty;
+}
+EXPORT_SYMBOL_GPL(rpc_queue_empty);
+
+/*
+ * Wake up a task on a specific queue (no-op unless it is queued there)
+ */
+void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+	spin_lock_bh(&queue->lock);
+	rpc_wake_up_task_queue_locked(queue, task);
+	spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
+
+/* Wake up the next task on a priority queue.  Returns the task chosen, or
+ * NULL (after resetting priority and owner) if every level is empty.
+ */
+static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue)
+{
+	struct list_head *q;
+	struct rpc_task *task;
+
+	/*
+	 * Service a batch of tasks from a single owner.
+	 */
+	q = &queue->tasks[queue->priority];
+	if (!list_empty(q)) {
+		task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+		if (queue->owner == task->tk_owner) {
+			if (--queue->nr)	/* batch budget not yet used up */
+				goto out;
+			list_move_tail(&task->u.tk_wait.list, q);
+		}
+		/*
+		 * Check if we need to switch queues.
+		 */
+		if (--queue->count)	/* this level still has turns left */
+			goto new_owner;
+	}
+
+	/*
+	 * Service the next queue.
+	 */
+	do {
+		if (q == &queue->tasks[0])
+			q = &queue->tasks[queue->maxpriority];
+		else
+			q = q - 1;	/* next lower level; level 0 wraps to maxpriority above */
+		if (!list_empty(q)) {
+			task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+			goto new_queue;
+		}
+	} while (q != &queue->tasks[queue->priority]);
+
+	rpc_reset_waitqueue_priority(queue);
+	return NULL;
+
+new_queue:
+	rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0]));
+new_owner:
+	rpc_set_waitqueue_owner(queue, task->tk_owner);
+out:
+	rpc_wake_up_task_queue_locked(queue, task);
+	return task;
+}
+
+/*
+ * Wake up the next task on the wait queue; returns it, or NULL if none.
+ */
+struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue)
+{
+	struct rpc_task	*task = NULL;
+
+	dprintk("RPC:       wake_up_next(%p \"%s\")\n",
+			queue, rpc_qname(queue));
+	spin_lock_bh(&queue->lock);
+	if (RPC_IS_PRIORITY(queue))
+		task = __rpc_wake_up_next_priority(queue);
+	else {
+		/* NOTE(review): task_for_first() is defined elsewhere; presumably picks the list head */
+		task_for_first(task, &queue->tasks[0])
+			rpc_wake_up_task_queue_locked(queue, task);
+	}
+	spin_unlock_bh(&queue->lock);
+
+	return task;
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_next);
+
+/**
+ * rpc_wake_up - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+ * Grabs queue->lock
+ */
+void rpc_wake_up(struct rpc_wait_queue *queue)
+{
+	struct list_head *head;
+
+	spin_lock_bh(&queue->lock);
+	head = &queue->tasks[queue->maxpriority];	/* start at the highest level */
+	for (;;) {
+		while (!list_empty(head)) {	/* relies on each wake-up unlinking the task */
+			struct rpc_task *task;
+			task = list_first_entry(head,
+					struct rpc_task,
+					u.tk_wait.list);
+			rpc_wake_up_task_queue_locked(queue, task);
+		}
+		if (head == &queue->tasks[0])
+			break;
+		head--;	/* next lower priority level */
+	}
+	spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up);
+
+/**
+ * rpc_wake_up_status - wake up all rpc_tasks and set their status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ *
+ * Grabs queue->lock
+ */
+void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
+{
+	struct list_head *head;
+
+	spin_lock_bh(&queue->lock);
+	head = &queue->tasks[queue->maxpriority];	/* start at the highest level */
+	for (;;) {
+		while (!list_empty(head)) {	/* relies on each wake-up unlinking the task */
+			struct rpc_task *task;
+			task = list_first_entry(head,
+					struct rpc_task,
+					u.tk_wait.list);
+			task->tk_status = status;	/* set before the task is made runnable */
+			rpc_wake_up_task_queue_locked(queue, task);
+		}
+		if (head == &queue->tasks[0])
+			break;
+		head--;
+	}
+	spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_status);
+
+static void __rpc_queue_timer_fn(unsigned long ptr)
+{
+	struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;	/* cookie passed to setup_timer() */
+	struct rpc_task *task, *n;
+	unsigned long expires, now, timeo;
+
+	spin_lock(&queue->lock);
+	expires = now = jiffies;	/* expires == now means "no later deadline seen yet" */
+	list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
+		timeo = task->u.tk_wait.expires;
+		if (time_after_eq(now, timeo)) {
+			dprintk("RPC: %5u timeout\n", task->tk_pid);
+			task->tk_status = -ETIMEDOUT;
+			rpc_wake_up_task_queue_locked(queue, task);
+			continue;
+		}
+		if (expires == now || time_after(expires, timeo))
+			expires = timeo;	/* track the earliest remaining deadline */
+	}
+	if (!list_empty(&queue->timer_list.list))
+		rpc_set_queue_timer(queue, expires);	/* re-arm for the remaining tasks */
+	spin_unlock(&queue->lock);
+}
+
+static void __rpc_atrun(struct rpc_task *task)
+{
+	task->tk_status = 0;	/* wake-up callback installed by rpc_delay() */
+}
+
+/*
+ * Run a task at a later time (@delay is a relative timeout in jiffies)
+ */
+void rpc_delay(struct rpc_task *task, unsigned long delay)
+{
+	task->tk_timeout = delay;	/* picked up by __rpc_add_timer() when queued */
+	rpc_sleep_on(&delay_queue, task, __rpc_atrun);
+}
+EXPORT_SYMBOL_GPL(rpc_delay);
+
+/*
+ * Helper to call task->tk_ops->rpc_call_prepare (set as tk_action by rpc_init_task)
+ */
+void rpc_prepare_task(struct rpc_task *task)
+{
+	task->tk_ops->rpc_call_prepare(task, task->tk_calldata);	/* hook is not NULL-checked here */
+}
+
+/*
+ * Helper that calls task->tk_ops->rpc_call_done if it exists
+ */
+void rpc_exit_task(struct rpc_task *task)
+{
+	task->tk_action = NULL;	/* a NULL tk_action ends the __rpc_execute() loop */
+	if (task->tk_ops->rpc_call_done != NULL) {
+		task->tk_ops->rpc_call_done(task, task->tk_calldata);
+		if (task->tk_action != NULL) {	/* the callback installed a new action */
+			WARN_ON(RPC_ASSASSINATED(task));
+			/* Always release the RPC slot and buffer memory */
+			xprt_release(task);
+		}
+	}
+}
+
+void rpc_exit(struct rpc_task *task, int status)
+{
+	task->tk_status = status;
+	task->tk_action = rpc_exit_task;	/* next FSM step becomes the exit handler */
+	if (RPC_IS_QUEUED(task))	/* a sleeping task is woken so that it can run it */
+		rpc_wake_up_queued_task(task->tk_waitqueue, task);
+}
+EXPORT_SYMBOL_GPL(rpc_exit);
+
+void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
+{
+	if (ops->rpc_release != NULL)	/* the release hook is optional */
+		ops->rpc_release(calldata);
+}
+
+/*
+ * This is the RPC `scheduler' (or rather, the finite state machine).
+ */
+static void __rpc_execute(struct rpc_task *task)
+{
+	struct rpc_wait_queue *queue;
+	int task_is_async = RPC_IS_ASYNC(task);	/* sampled up front, see the note below */
+	int status = 0;
+
+	dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
+			task->tk_pid, task->tk_flags);
+
+	BUG_ON(RPC_IS_QUEUED(task));
+
+	for (;;) {
+		void (*do_action)(struct rpc_task *);
+
+		/*
+		 * Execute any pending callback first.
+		 */
+		do_action = task->tk_callback;
+		task->tk_callback = NULL;
+		if (do_action == NULL) {
+			/*
+			 * Perform the next FSM step.
+			 * tk_action may be NULL if the task has been killed.
+			 * In particular, note that rpc_killall_tasks may
+			 * do this at any time, so beware when dereferencing.
+			 */
+			do_action = task->tk_action;
+			if (do_action == NULL)
+				break;
+		}
+		do_action(task);
+
+		/*
+		 * Lockless check for whether task is sleeping or not.
+		 */
+		if (!RPC_IS_QUEUED(task))
+			continue;
+		/*
+		 * The queue->lock protects against races with
+		 * rpc_make_runnable().
+		 *
+		 * Note that once we clear RPC_TASK_RUNNING on an asynchronous
+		 * rpc_task, rpc_make_runnable() can assign it to a
+		 * different workqueue. We therefore cannot assume that the
+		 * rpc_task pointer may still be dereferenced.
+		 */
+		queue = task->tk_waitqueue;
+		spin_lock_bh(&queue->lock);
+		if (!RPC_IS_QUEUED(task)) {	/* woken between the two checks */
+			spin_unlock_bh(&queue->lock);
+			continue;
+		}
+		rpc_clear_running(task);
+		spin_unlock_bh(&queue->lock);
+		if (task_is_async)	/* rpc_make_runnable() re-queues the work on wake-up */
+			return;
+
+		/* sync task: sleep here */
+		dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
+		status = out_of_line_wait_on_bit(&task->tk_runstate,
+				RPC_TASK_QUEUED, rpc_wait_bit_killable,
+				TASK_KILLABLE);
+		if (status == -ERESTARTSYS) {
+			/*
+			 * When a sync task receives a signal, it exits with
+			 * -ERESTARTSYS. In order to catch any callbacks that
+			 * clean up after sleeping on some queue, we don't
+			 * break the loop here, but go around once more.
+			 */
+			dprintk("RPC: %5u got signal\n", task->tk_pid);
+			task->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(task, -ERESTARTSYS);
+		}
+		rpc_set_running(task);
+		dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
+	}
+
+	dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
+			task->tk_status);
+	/* Release all resources associated with the task */
+	rpc_release_task(task);
+}
+
+/*
+ * User-visible entry point to the scheduler.
+ *
+ * This may be called recursively if e.g. an async NFS task updates
+ * the attributes and finds that dirty pages must be flushed.
+ * NOTE: Upon exit of this function the task is guaranteed to be
+ *	 released. In particular note that tk_release() will have
+ *	 been called, so your task memory may have been freed.
+ */
+void rpc_execute(struct rpc_task *task)
+{
+	rpc_set_active(task);
+	rpc_make_runnable(task);	/* async tasks are handed to rpciod here */
+	if (!RPC_IS_ASYNC(task))	/* sync tasks run the state machine in this context */
+		__rpc_execute(task);
+}
+
+static void rpc_async_schedule(struct work_struct *work)
+{
+	__rpc_execute(container_of(work, struct rpc_task, u.tk_work));	/* work item is embedded in the task */
+}
+
+/**
+ * rpc_malloc - allocate an RPC buffer
+ * @task: RPC task that will use this buffer
+ * @size: requested byte size
+ *
+ * To prevent rpciod from hanging, this allocator never sleeps,
+ * returning NULL if the request cannot be serviced immediately.
+ * The caller can arrange to sleep in a way that is safe for rpciod.
+ *
+ * Most requests are 'small' (under 2KiB) and can be serviced from a
+ * mempool, ensuring that NFS reads and writes can always proceed,
+ * and that there is good locality of reference for these buffers.
+ *
+ * In order to avoid memory starvation triggering more writebacks of
+ * NFS requests, we avoid using GFP_KERNEL.
+ */
+void *rpc_malloc(struct rpc_task *task, size_t size)
+{
+	struct rpc_buffer *buf;
+	gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
+
+	size += sizeof(struct rpc_buffer);	/* room for the header holding ->len */
+	if (size <= RPC_BUFFER_MAXSIZE)
+		buf = mempool_alloc(rpc_buffer_mempool, gfp);
+	else
+		buf = kmalloc(size, gfp);
+
+	if (!buf)
+		return NULL;
+
+	buf->len = size;	/* rpc_free() uses this to pick the matching release path */
+	dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
+			task->tk_pid, size, buf);
+	return &buf->data;	/* caller sees the payload, not the header */
+}
+EXPORT_SYMBOL_GPL(rpc_malloc);
+
+/**
+ * rpc_free - release a buffer obtained from rpc_malloc
+ * @buffer: payload pointer returned by rpc_malloc; NULL is ignored
+ *
+ */
+void rpc_free(void *buffer)
+{
+	struct rpc_buffer *hdr;
+	size_t total;
+
+	if (buffer == NULL)
+		return;
+
+	hdr = container_of(buffer, struct rpc_buffer, data);
+	total = hdr->len;
+
+	dprintk("RPC:       freeing buffer of size %zu at %p\n",
+			total, hdr);
+
+	if (total > RPC_BUFFER_MAXSIZE)
+		kfree(hdr);
+	else
+		mempool_free(hdr, rpc_buffer_mempool);
+}
+EXPORT_SYMBOL_GPL(rpc_free);
+
+/*
+ * Creation and deletion of RPC task structures
+ */
+static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
+{
+	memset(task, 0, sizeof(*task));	/* fields not set below start out zero */
+	atomic_set(&task->tk_count, 1);
+	task->tk_flags  = task_setup_data->flags;
+	task->tk_ops = task_setup_data->callback_ops;
+	task->tk_calldata = task_setup_data->callback_data;
+	INIT_LIST_HEAD(&task->tk_task);
+
+	/* Initialize retry counters */
+	task->tk_garb_retry = 2;
+	task->tk_cred_retry = 2;
+	task->tk_rebind_retry = 2;
+
+	task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;	/* index into queue->tasks[] */
+	task->tk_owner = current->tgid;
+
+	/* Initialize workqueue for async tasks */
+	task->tk_workqueue = task_setup_data->workqueue;
+
+	if (task->tk_ops->rpc_call_prepare != NULL)	/* otherwise tk_action stays NULL here */
+		task->tk_action = rpc_prepare_task;
+
+	/* starting timestamp */
+	task->tk_start = ktime_get();
+
+	dprintk("RPC:       new task initialized, procpid %u\n",
+				task_pid_nr(current));
+}
+
+static struct rpc_task *
+rpc_alloc_task(void)
+{
+	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);	/* from the rpc_task mempool */
+}
+
+/*
+ * Create a new task for the specified client; returns ERR_PTR(-ENOMEM) on failure.
+ */
+struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
+{
+	struct rpc_task	*task = setup_data->task;
+	unsigned short flags = 0;
+
+	if (task == NULL) {	/* caller supplied no storage: allocate it */
+		task = rpc_alloc_task();
+		if (task == NULL) {
+			rpc_release_calldata(setup_data->callback_ops,
+					setup_data->callback_data);
+			return ERR_PTR(-ENOMEM);	/* callback data was released above */
+		}
+		flags = RPC_TASK_DYNAMIC;	/* tells rpc_free_task() to return it to the mempool */
+	}
+
+	rpc_init_task(task, setup_data);
+	task->tk_flags |= flags;
+	dprintk("RPC:       allocated task %p\n", task);
+	return task;
+}
+
+static void rpc_free_task(struct rpc_task *task)
+{
+	const struct rpc_call_ops *tk_ops = task->tk_ops;	/* saved: task may be freed below */
+	void *calldata = task->tk_calldata;
+
+	if (task->tk_flags & RPC_TASK_DYNAMIC) {	/* only tasks from rpc_alloc_task() are freed */
+		dprintk("RPC: %5u freeing task\n", task->tk_pid);
+		mempool_free(task, rpc_task_mempool);
+	}
+	rpc_release_calldata(tk_ops, calldata);
+}
+
+static void rpc_async_release(struct work_struct *work)
+{
+	rpc_free_task(container_of(work, struct rpc_task, u.tk_work));	/* work item queued by rpc_final_put_task() */
+}
+
+static void rpc_release_resources_task(struct rpc_task *task)
+{
+	if (task->tk_rqstp)	/* a request is still attached to the task */
+		xprt_release(task);
+	if (task->tk_msg.rpc_cred) {
+		put_rpccred(task->tk_msg.rpc_cred);
+		task->tk_msg.rpc_cred = NULL;	/* cleared so a second call does not put it again */
+	}
+	rpc_task_release_client(task);
+}
+
+static void rpc_final_put_task(struct rpc_task *task,
+		struct workqueue_struct *q)
+{
+	if (q != NULL) {	/* defer the free to workqueue q */
+		INIT_WORK(&task->u.tk_work, rpc_async_release);
+		queue_work(q, &task->u.tk_work);
+	} else
+		rpc_free_task(task);	/* no workqueue: free in the caller's context */
+}
+
+static void rpc_do_put_task(struct rpc_task *task, struct workqueue_struct *q)
+{
+	if (atomic_dec_and_test(&task->tk_count)) {	/* last reference dropped */
+		rpc_release_resources_task(task);
+		rpc_final_put_task(task, q);
+	}
+}
+
+void rpc_put_task(struct rpc_task *task)
+{
+	rpc_do_put_task(task, NULL);	/* NULL queue: a final free happens in this context */
+}
+EXPORT_SYMBOL_GPL(rpc_put_task);
+
+void rpc_put_task_async(struct rpc_task *task)
+{
+	rpc_do_put_task(task, task->tk_workqueue);	/* a final free is deferred to tk_workqueue if set */
+}
+EXPORT_SYMBOL_GPL(rpc_put_task_async);
+
+static void rpc_release_task(struct rpc_task *task)
+{
+	dprintk("RPC: %5u release task\n", task->tk_pid);
+
+	BUG_ON (RPC_IS_QUEUED(task));
+
+	rpc_release_resources_task(task);
+
+	/*
+	 * Note: at this point we have been removed from rpc_clnt->cl_tasks,
+	 * so it should be safe to use task->tk_count as a test for whether
+	 * or not any other processes still hold references to our rpc_task.
+	 */
+	if (atomic_read(&task->tk_count) != 1 + !RPC_IS_ASYNC(task)) {
+		/* Wake up anyone who may be waiting for task completion */
+		if (!rpc_complete_task(task))	/* tk_count is still non-zero after the drop */
+			return;
+	} else {
+		if (!atomic_dec_and_test(&task->tk_count))
+			return;
+	}
+	rpc_final_put_task(task, task->tk_workqueue);	/* count reached zero: free the task */
+}
+
+int rpciod_up(void)
+{
+	return try_module_get(THIS_MODULE) ? 0 : -EINVAL;	/* only takes a reference on this module */
+}
+
+void rpciod_down(void)
+{
+	module_put(THIS_MODULE);	/* drops the reference taken by rpciod_up() */
+}
+
+/*
+ * Start up the rpciod workqueue. Returns nonzero on success, 0 on failure.
+ */
+static int rpciod_start(void)
+{
+	struct workqueue_struct *wq;
+
+	/*
+	 * Allocate the rpciod workqueue and publish it in rpciod_workqueue.
+	 */
+	dprintk("RPC:       creating workqueue rpciod\n");
+	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+	rpciod_workqueue = wq;
+	return rpciod_workqueue != NULL;
+}
+
+static void rpciod_stop(void)
+{
+	struct workqueue_struct *old = rpciod_workqueue;
+
+	if (!old)
+		return;
+	dprintk("RPC:       destroying workqueue rpciod\n");
+
+	/* Clear the global pointer before tearing the workqueue down. */
+	rpciod_workqueue = NULL;
+	destroy_workqueue(old);
+}
+
+void
+rpc_destroy_mempool(void)
+{
+	rpciod_stop();	/* no-op if the workqueue was never created */
+	if (rpc_buffer_mempool)	/* pools are destroyed before the slabs that back them */
+		mempool_destroy(rpc_buffer_mempool);
+	if (rpc_task_mempool)
+		mempool_destroy(rpc_task_mempool);
+	if (rpc_task_slabp)
+		kmem_cache_destroy(rpc_task_slabp);
+	if (rpc_buffer_slabp)
+		kmem_cache_destroy(rpc_buffer_slabp);
+	rpc_destroy_wait_queue(&delay_queue);
+}
+
+int
+rpc_init_mempool(void)
+{
+	/*
+	 * The following is not strictly a mempool initialisation,
+	 * but there is no harm in doing it here
+	 */
+	rpc_init_wait_queue(&delay_queue, "delayq");
+	if (!rpciod_start())	/* rpciod_start() returns nonzero on success */
+		goto err_nomem;
+
+	rpc_task_slabp = kmem_cache_create("rpc_tasks",
+					     sizeof(struct rpc_task),
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (!rpc_task_slabp)
+		goto err_nomem;
+	rpc_buffer_slabp = kmem_cache_create("rpc_buffers",
+					     RPC_BUFFER_MAXSIZE,
+					     0, SLAB_HWCACHE_ALIGN,
+					     NULL);
+	if (!rpc_buffer_slabp)
+		goto err_nomem;
+	rpc_task_mempool = mempool_create_slab_pool(RPC_TASK_POOLSIZE,
+						    rpc_task_slabp);
+	if (!rpc_task_mempool)
+		goto err_nomem;
+	rpc_buffer_mempool = mempool_create_slab_pool(RPC_BUFFER_POOLSIZE,
+						      rpc_buffer_slabp);
+	if (!rpc_buffer_mempool)
+		goto err_nomem;
+	return 0;
+err_nomem:
+	rpc_destroy_mempool();	/* skips whatever was not set up (NULL checks) */
+	return -ENOMEM;
+}
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
new file mode 100644
index 00000000..10b4319e
--- /dev/null
+++ b/net/sunrpc/socklib.c
@@ -0,0 +1,185 @@
+/*
+ * linux/net/sunrpc/socklib.c
+ *
+ * Common socket helper routines for RPC client and server
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/gfp.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/pagemap.h>
+#include <linux/udp.h>
+#include <linux/sunrpc/xdr.h>
+
+
+/**
+ * xdr_skb_read_bits - copy some data bits from skb to internal buffer
+ * @desc: sk_buff copy helper
+ * @to: copy destination
+ * @len: number of bytes to copy
+ *
+ * May be called repeatedly to walk through an sk_buff: each successful
+ * call advances @desc->offset and shrinks @desc->count by the amount
+ * copied.  The request is clamped to @desc->count.  Returns the number
+ * of bytes copied, or 0 if skb_copy_bits() reports an error.
+ */
+size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len)
+{
+	size_t nbytes = len;
+
+	if (nbytes > desc->count)
+		nbytes = desc->count;
+	if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, nbytes) != 0))
+		return 0;
+	desc->offset += nbytes;
+	desc->count -= nbytes;
+	return nbytes;
+}
+EXPORT_SYMBOL_GPL(xdr_skb_read_bits);
+
+/**
+ * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer
+ * @desc: sk_buff copy helper
+ * @to: copy destination
+ * @len: number of bytes to copy
+ *
+ * Same as xdr_skb_read_bits(), but the checksum of the copied bytes is
+ * folded into @desc->csum at the same time.  Returns the number of
+ * bytes copied (the request clamped to @desc->count).
+ */
+static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len)
+{
+	unsigned int start;
+	__wsum partial;
+
+	if (len > desc->count)
+		len = desc->count;
+	start = desc->offset;
+	partial = skb_copy_and_csum_bits(desc->skb, start, to, len, 0);
+	/* the block checksum is combined at the offset it was taken from */
+	desc->csum = csum_block_add(desc->csum, partial, start);
+	desc->offset += len;
+	desc->count -= len;
+	return len;
+}
+
+/**
+ * xdr_partial_copy_from_skb - copy data out of an skb
+ * @xdr: target XDR buffer
+ * @base: starting offset
+ * @desc: sk_buff copy helper
+ * @copy_actor: virtual method for copying data
+ *
+ * Walks the head kvec, the page array and the tail kvec of @xdr in
+ * that order, starting @base bytes into the buffer, and calls
+ * @copy_actor once per contiguous region.  Copying from the head or
+ * the pages stops early when @copy_actor returns fewer bytes than
+ * requested or when @desc->count drops to zero.
+ *
+ * Returns the number of bytes copied, or -ENOMEM when a missing page
+ * could not be allocated before anything had been copied.
+ */
+ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor)
+{
+	struct page	**ppage = xdr->pages;
+	unsigned int	len, pglen = xdr->page_len;
+	ssize_t		copied = 0;
+	size_t		ret;
+
+	/*
+	 * Head: copy from @base to the end of the head kvec, or, when
+	 * @base lies beyond the head, make it relative to the page data.
+	 */
+	len = xdr->head[0].iov_len;
+	if (base < len) {
+		len -= base;
+		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+		copied += ret;
+		if (ret != len || !desc->count)
+			goto out;
+		base = 0;
+	} else
+		base -= len;
+
+	/* Pages: nothing to do if there are none or @base skips them all. */
+	if (unlikely(pglen == 0))
+		goto copy_tail;
+	if (unlikely(base >= pglen)) {
+		base -= pglen;
+		goto copy_tail;
+	}
+	/*
+	 * Skip @base bytes of page data, then convert the starting
+	 * position (including page_base) into a page pointer plus an
+	 * offset within that page.
+	 */
+	if (base || xdr->page_base) {
+		pglen -= base;
+		base += xdr->page_base;
+		ppage += base >> PAGE_CACHE_SHIFT;
+		base &= ~PAGE_CACHE_MASK;
+	}
+	do {
+		char *kaddr;
+
+		/* ACL likes to be lazy in allocating pages - ACLs
+		 * are small by default but can get huge. */
+		if (unlikely(*ppage == NULL)) {
+			*ppage = alloc_page(GFP_ATOMIC);
+			if (unlikely(*ppage == NULL)) {
+				/* report -ENOMEM only if no data was copied at all */
+				if (copied == 0)
+					copied = -ENOMEM;
+				goto out;
+			}
+		}
+
+		/* copy at most to the end of this page and at most pglen bytes */
+		len = PAGE_CACHE_SIZE;
+		kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
+		if (base) {
+			len -= base;
+			if (pglen < len)
+				len = pglen;
+			ret = copy_actor(desc, kaddr + base, len);
+			/* only the first page copied can start at an offset */
+			base = 0;
+		} else {
+			if (pglen < len)
+				len = pglen;
+			ret = copy_actor(desc, kaddr, len);
+		}
+		flush_dcache_page(*ppage);
+		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+		copied += ret;
+		if (ret != len || !desc->count)
+			goto out;
+		ppage++;
+	} while ((pglen -= len) != 0);
+copy_tail:
+	/* Tail: here @base is either 0 or an offset relative to the tail. */
+	len = xdr->tail[0].iov_len;
+	if (base < len)
+		copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+	return copied;
+}
+EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb);
+
+/**
+ * csum_partial_copy_to_xdr - checksum and copy data
+ * @xdr: target XDR buffer
+ * @skb: source skb
+ *
+ * We have set things up such that we perform the checksum of the UDP
+ * packet in parallel with the copies into the RPC client iovec.  -DaveM
+ *
+ * Returns 0 on success.  Returns -1 when the copy fails, when data is
+ * left over in the skb after the copy, or when the checksum does not
+ * verify.
+ */
+int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
+{
+	struct xdr_skb_reader	desc;
+
+	/* The copy starts right after the UDP header and covers the rest of the skb. */
+	desc.skb = skb;
+	desc.offset = sizeof(struct udphdr);
+	desc.count = skb->len - desc.offset;
+
+	if (skb_csum_unnecessary(skb))
+		goto no_checksum;
+
+	/* Seed the running checksum with the header bytes that are not copied. */
+	desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0)
+		return -1;
+	/* Fold in whatever part of the skb the copy did not reach. */
+	if (desc.offset != skb->len) {
+		__wsum csum2;
+		csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
+		desc.csum = csum_block_add(desc.csum, csum2, desc.offset);
+	}
+	if (desc.count)
+		return -1;
+	if (csum_fold(desc.csum))
+		return -1;
+	/* The software checksum verified; report skbs that were marked CHECKSUM_COMPLETE. */
+	if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
+		netdev_rx_csum_fault(skb->dev);
+	return 0;
+no_checksum:
+	/* Plain copy; desc.csum is not used on this path. */
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0)
+		return -1;
+	if (desc.count)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
new file mode 100644
index 00000000..80df89d9
--- /dev/null
+++ b/net/sunrpc/stats.c
@@ -0,0 +1,277 @@
+/*
+ * linux/net/sunrpc/stats.c
+ *
+ * procfs-based user access to generic RPC statistics. The stats files
+ * reside in /proc/net/rpc.
+ *
+ * The read routines assume that the buffer passed in is just big enough.
+ * If you implement an RPC service that has its own stats routine which
+ * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE
+ * limit.
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/metrics.h>
+
+#include "netns.h"
+
+#define RPCDBG_FACILITY	RPCDBG_MISC
+
+/*
+ * Get RPC client stats: one "net" line, one "rpc" line, then one
+ * "proc<version>" line of per-procedure call counts for every
+ * version the program defines.
+ */
+static int rpc_proc_show(struct seq_file *seq, void *v)
+{
+	const struct rpc_stat	*st = seq->private;
+	const struct rpc_program *program = st->program;
+	unsigned int vidx;
+
+	seq_printf(seq, "net %u %u %u %u\n",
+		   st->netcnt, st->netudpcnt, st->nettcpcnt, st->nettcpconn);
+	seq_printf(seq, "rpc %u %u %u\n",
+		   st->rpccnt, st->rpcretrans, st->rpcauthrefresh);
+
+	for (vidx = 0; vidx < program->nrvers; vidx++) {
+		const struct rpc_version *version = program->version[vidx];
+		unsigned int pidx;
+
+		/* the version table may have holes */
+		if (version == NULL)
+			continue;
+		seq_printf(seq, "proc%u %u", version->number, version->nrprocs);
+		for (pidx = 0; pidx < version->nrprocs; pidx++)
+			seq_printf(seq, " %u", version->procs[pidx].p_count);
+		seq_putc(seq, '\n');
+	}
+	return 0;
+}
+
+static int rpc_proc_open(struct inode *inode, struct file *file)
+{
+	/* the proc entry's data pointer becomes seq->private in rpc_proc_show() */
+	void *stats = PDE(inode)->data;
+
+	return single_open(file, rpc_proc_show, stats);
+}
+
+/*
+ * File operations for the client stats file registered by
+ * rpc_proc_register(); opened through single_open() in rpc_proc_open().
+ */
+static const struct file_operations rpc_proc_fops = {
+	.owner = THIS_MODULE,
+	.open = rpc_proc_open,
+	.read  = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/*
+ * Get RPC server stats: one "net" line, one "rpc" line (total calls,
+ * the sum of the three error counters, then each error counter), and
+ * one "proc<index>" line of per-procedure counts for every version
+ * that has a procedure table.
+ */
+void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp)
+{
+	const struct svc_program *program = statp->program;
+	unsigned int vidx;
+
+	seq_printf(seq, "net %u %u %u %u\n",
+		   statp->netcnt, statp->netudpcnt,
+		   statp->nettcpcnt, statp->nettcpconn);
+	seq_printf(seq, "rpc %u %u %u %u %u\n",
+		   statp->rpccnt,
+		   statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt,
+		   statp->rpcbadfmt, statp->rpcbadauth, statp->rpcbadclnt);
+
+	for (vidx = 0; vidx < program->pg_nvers; vidx++) {
+		const struct svc_version *version = program->pg_vers[vidx];
+		const struct svc_procedure *procedure;
+		unsigned int pidx;
+
+		if (version == NULL)
+			continue;
+		procedure = version->vs_proc;
+		if (procedure == NULL)
+			continue;
+		seq_printf(seq, "proc%d %u", vidx, version->vs_nproc);
+		for (pidx = 0; pidx < version->vs_nproc; pidx++)
+			seq_printf(seq, " %u", procedure[pidx].pc_count);
+		seq_putc(seq, '\n');
+	}
+}
+EXPORT_SYMBOL_GPL(svc_seq_show);
+
+/**
+ * rpc_alloc_iostats - allocate an rpc_iostats structure
+ * @clnt: RPC program, version, and xprt
+ *
+ * Allocates a zeroed array with one rpc_iostats entry for each of
+ * the @clnt->cl_maxproc procedures.  Returns NULL when the
+ * allocation fails.
+ */
+struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt)
+{
+	struct rpc_iostats *stats;
+
+	stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL);
+	return stats;
+}
+EXPORT_SYMBOL_GPL(rpc_alloc_iostats);
+
+/**
+ * rpc_free_iostats - release an rpc_iostats structure
+ * @stats: doomed rpc_iostats structure
+ *
+ * Releases the array with kfree(); the counterpart of
+ * rpc_alloc_iostats().
+ */
+void rpc_free_iostats(struct rpc_iostats *stats)
+{
+	kfree(stats);
+}
+EXPORT_SYMBOL_GPL(rpc_free_iostats);
+
+/**
+ * rpc_count_iostats - tally up per-task stats
+ * @task: completed rpc_task
+ *
+ * Adds the counters and timings of @task to the slot of its client's
+ * metrics array selected by the procedure's p_statidx.  Does nothing
+ * when the task has no client, the client keeps no metrics, or the
+ * task has no request.
+ *
+ * Relies on the caller for serialization.
+ */
+void rpc_count_iostats(struct rpc_task *task)
+{
+	struct rpc_clnt *clnt = task->tk_client;
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpc_iostats *slot;
+	ktime_t span;
+
+	if (clnt == NULL || clnt->cl_metrics == NULL || rqst == NULL)
+		return;
+
+	slot = clnt->cl_metrics + task->tk_msg.rpc_proc->p_statidx;
+
+	slot->om_ops++;
+	slot->om_ntrans += rqst->rq_ntrans;
+	slot->om_timeouts += task->tk_timeouts;
+
+	slot->om_bytes_sent += rqst->rq_xmit_bytes_sent;
+	slot->om_bytes_recv += rqst->rq_reply_bytes_recvd;
+
+	/* om_queue accumulates rq_xtime - tk_start */
+	span = ktime_sub(rqst->rq_xtime, task->tk_start);
+	slot->om_queue = ktime_add(slot->om_queue, span);
+
+	slot->om_rtt = ktime_add(slot->om_rtt, rqst->rq_rtt);
+
+	/* om_execute accumulates now - tk_start */
+	span = ktime_sub(ktime_get(), task->tk_start);
+	slot->om_execute = ktime_add(slot->om_execute, span);
+}
+
+/*
+ * Print the row label for procedure @op: its name when the procinfo
+ * table has one, "NULL" for an unnamed procedure 0, otherwise the
+ * procedure number.
+ */
+static void _print_name(struct seq_file *seq, unsigned int op,
+			struct rpc_procinfo *procs)
+{
+	const char *label = procs[op].p_name;
+
+	if (label != NULL) {
+		seq_printf(seq, "\t%12s: ", label);
+		return;
+	}
+	if (op == 0) {
+		seq_printf(seq, "\t        NULL: ");
+		return;
+	}
+	seq_printf(seq, "\t%12u: ", op);
+}
+
+/*
+ * Print the iostats of @clnt to @seq: a version/program header, the
+ * transport's own statistics when the client has a transport, then
+ * one line of counters per procedure.  Prints nothing when the
+ * client keeps no metrics.
+ */
+void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
+{
+	struct rpc_iostats *table = clnt->cl_metrics;
+	struct rpc_xprt *xprt = clnt->cl_xprt;
+	const unsigned int nops = clnt->cl_maxproc;
+	unsigned int op;
+
+	if (table == NULL)
+		return;
+
+	seq_printf(seq, "\tRPC iostats version: %s  ", RPC_IOSTATS_VERS);
+	seq_printf(seq, "p/v: %u/%u (%s)\n",
+		   clnt->cl_prog, clnt->cl_vers, clnt->cl_protname);
+
+	if (xprt != NULL)
+		xprt->ops->print_stats(xprt, seq);
+
+	seq_printf(seq, "\tper-op statistics\n");
+	for (op = 0; op < nops; op++) {
+		struct rpc_iostats *m = table + op;
+
+		_print_name(seq, op, clnt->cl_procinfo);
+		/* the three accumulated times are printed in milliseconds */
+		seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
+			   m->om_ops, m->om_ntrans, m->om_timeouts,
+			   m->om_bytes_sent, m->om_bytes_recv,
+			   ktime_to_ms(m->om_queue),
+			   ktime_to_ms(m->om_rtt),
+			   ktime_to_ms(m->om_execute));
+	}
+}
+EXPORT_SYMBOL_GPL(rpc_print_iostats);
+
+/*
+ * Register/unregister RPC proc files
+ *
+ * do_register() creates the entry @name below init_net's
+ * proc_net_rpc directory, with @fops as its file operations and
+ * @data as its private data, and returns what proc_create_data()
+ * returns.
+ */
+static inline struct proc_dir_entry *
+do_register(const char *name, void *data, const struct file_operations *fops)
+{
+	struct sunrpc_net *rpc_ns;
+	struct proc_dir_entry *entry;
+
+	dprintk("RPC:       registering /proc/net/rpc/%s\n", name);
+	rpc_ns = net_generic(&init_net, sunrpc_net_id);
+	entry = proc_create_data(name, 0, rpc_ns->proc_net_rpc, fops, data);
+	return entry;
+}
+
+/*
+ * Register the client stats file for @statp.  The entry is named
+ * after statp->program->name and served by rpc_proc_fops.
+ * Returns the result of do_register().
+ */
+struct proc_dir_entry *
+rpc_proc_register(struct rpc_stat *statp)
+{
+	return do_register(statp->program->name, statp, &rpc_proc_fops);
+}
+EXPORT_SYMBOL_GPL(rpc_proc_register);
+
+/* Remove the client stats entry @name from init_net's proc_net_rpc directory. */
+void
+rpc_proc_unregister(const char *name)
+{
+	struct sunrpc_net *rpc_ns = net_generic(&init_net, sunrpc_net_id);
+
+	remove_proc_entry(name, rpc_ns->proc_net_rpc);
+}
+EXPORT_SYMBOL_GPL(rpc_proc_unregister);
+
+/*
+ * Register a server stats file for @statp.  The entry is named
+ * after statp->program->pg_name and uses the caller's @fops.
+ * Returns the result of do_register().
+ */
+struct proc_dir_entry *
+svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
+{
+	return do_register(statp->program->pg_name, statp, fops);
+}
+EXPORT_SYMBOL_GPL(svc_proc_register);
+
+/* Remove the server stats entry @name from init_net's proc_net_rpc directory. */
+void
+svc_proc_unregister(const char *name)
+{
+	struct sunrpc_net *rpc_ns = net_generic(&init_net, sunrpc_net_id);
+
+	remove_proc_entry(name, rpc_ns->proc_net_rpc);
+}
+EXPORT_SYMBOL_GPL(svc_proc_unregister);
+
+/*
+ * Create the "rpc" directory below @net's proc_net and remember it in
+ * the per-net sunrpc state.  Returns 0 on success or -ENOMEM when the
+ * directory could not be created.
+ */
+int rpc_proc_init(struct net *net)
+{
+	struct sunrpc_net *rpc_ns;
+
+	dprintk("RPC:       registering /proc/net/rpc\n");
+	rpc_ns = net_generic(net, sunrpc_net_id);
+	rpc_ns->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
+
+	return rpc_ns->proc_net_rpc != NULL ? 0 : -ENOMEM;
+}
+
+/* Remove the "rpc" directory that rpc_proc_init() created below @net's proc_net. */
+void rpc_proc_exit(struct net *net)
+{
+	dprintk("RPC:       unregistering /proc/net/rpc\n");
+	remove_proc_entry("rpc", net->proc_net);
+}
+
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
new file mode 100644
index 00000000..90c292e2
--- /dev/null
+++ b/net/sunrpc/sunrpc.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+
+(c) 2008 NetApp.  All Rights Reserved.
+
+NetApp provides this source code under the GPL v2 License.
+The GPL v2 license is available at
+http://opensource.org/licenses/gpl-license.php.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+******************************************************************************/
+
+/*
+ * Functions and macros used internally by RPC
+ */
+
+#ifndef _NET_SUNRPC_SUNRPC_H
+#define _NET_SUNRPC_SUNRPC_H
+
+#include <linux/net.h>
+
+/*
+ * Header for dynamically allocated rpc buffers.
+ */
+struct rpc_buffer {
+	size_t	len;	/* NOTE(review): presumably the size of the allocation - confirm against the allocator */
+	char	data[];	/* flexible array member: the payload follows the header */
+};
+
+/*
+ * Returns non-zero when @task has a procedure and that procedure
+ * has a decode routine, zero otherwise.
+ */
+static inline int rpc_reply_expected(struct rpc_task *task)
+{
+	if (task->tk_msg.rpc_proc == NULL)
+		return 0;
+	return task->tk_msg.rpc_proc->p_decode != NULL;
+}
+
+int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
+		    struct page *headpage, unsigned long headoffset,
+		    struct page *tailpage, unsigned long tailoffset);
+
+#endif /* _NET_SUNRPC_SUNRPC_H */
+
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
new file mode 100644
index 00000000..9d080916
--- /dev/null
+++ b/net/sunrpc/sunrpc_syms.c
@@ -0,0 +1,120 @@
+/*
+ * linux/net/sunrpc/sunrpc_syms.c
+ *
+ * Symbols exported by the sunrpc module.
+ *
+ * Copyright (C) 1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/auth.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/xprtsock.h>
+
+#include "netns.h"
+
+int sunrpc_net_id;
+
+/*
+ * Per-namespace setup: create the proc directory, then the ip_map
+ * cache.  If the cache cannot be created the proc directory is
+ * removed again.  Returns 0 or the error of the step that failed.
+ */
+static __net_init int sunrpc_init_net(struct net *net)
+{
+	int err;
+
+	err = rpc_proc_init(net);
+	if (err)
+		return err;
+
+	err = ip_map_cache_create(net);
+	if (err)
+		rpc_proc_exit(net);
+
+	return err;
+}
+
+/* Per-namespace teardown: undo sunrpc_init_net() in reverse order. */
+static __net_exit void sunrpc_exit_net(struct net *net)
+{
+	ip_map_cache_destroy(net);
+	rpc_proc_exit(net);
+}
+
+/*
+ * Per-network-namespace hooks for sunrpc.  .id and .size describe the
+ * struct sunrpc_net that the code looks up with
+ * net_generic(net, sunrpc_net_id).
+ */
+static struct pernet_operations sunrpc_net_ops = {
+	.init = sunrpc_init_net,
+	.exit = sunrpc_exit_net,
+	.id = &sunrpc_net_id,
+	.size = sizeof(struct sunrpc_net),
+};
+
+extern struct cache_detail unix_gid_cache;
+
+extern void cleanup_rpcb_clnt(void);
+
+/*
+ * Module initialisation.  In order: rpc_pipefs, the RPC mempools
+ * (rpc_init_mempool), the auth module, cache_initialize(), the
+ * per-net operations, the debug sysctls (RPC_DEBUG only), the
+ * unix_gid cache and the server and client socket transports.
+ * When one of the checked steps fails, the steps before it are
+ * undone in reverse order and the error is returned.
+ */
+static int __init
+init_sunrpc(void)
+{
+	int err = register_rpc_pipefs();
+	if (err)
+		goto out;
+	err = rpc_init_mempool();
+	if (err)
+		goto out2;
+	err = rpcauth_init_module();
+	if (err)
+		goto out3;
+
+	cache_initialize();
+
+	err = register_pernet_subsys(&sunrpc_net_ops);
+	if (err)
+		goto out4;
+#ifdef RPC_DEBUG
+	rpc_register_sysctl();
+#endif
+	/* the return values of the remaining steps are not checked */
+	cache_register(&unix_gid_cache);
+	svc_init_xprt_sock();	/* svc sock transport */
+	init_socket_xprt();	/* clnt sock transport */
+	return 0;
+
+	/* error unwinding: each label undoes the step before the one that failed */
+out4:
+	rpcauth_remove_module();
+out3:
+	rpc_destroy_mempool();
+out2:
+	unregister_rpc_pipefs();
+out:
+	return err;
+}
+
+/*
+ * Module exit: release everything init_sunrpc() registered, after
+ * first cleaning up the rpcbind client via cleanup_rpcb_clnt().
+ */
+static void __exit
+cleanup_sunrpc(void)
+{
+	cleanup_rpcb_clnt();
+	rpcauth_remove_module();
+	cleanup_socket_xprt();
+	svc_cleanup_xprt_sock();
+	unregister_rpc_pipefs();
+	rpc_destroy_mempool();
+	cache_unregister(&unix_gid_cache);
+	unregister_pernet_subsys(&sunrpc_net_ops);
+#ifdef RPC_DEBUG
+	rpc_unregister_sysctl();
+#endif
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+}
+MODULE_LICENSE("GPL");
+fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */
+module_exit(cleanup_sunrpc);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
new file mode 100644
index 00000000..ce5f111f
--- /dev/null
+++ b/net/sunrpc/svc.c
@@ -0,0 +1,1323 @@
+/*
+ * linux/net/sunrpc/svc.c
+ *
+ * High-level RPC service routines
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * Multiple threads pools and NUMAisation
+ * Copyright (c) 2006 Silicon Graphics, Inc.
+ * by Greg Banks <gnb@melbourne.sgi.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCDSP
+
+static void svc_unregister(const struct svc_serv *serv);
+
+#define svc_serv_is_pooled(serv)    ((serv)->sv_function)
+
+/*
+ * Mode for mapping cpus to pools.
+ * The names accepted and reported by the pool_mode module parameter
+ * are "auto", "global", "percpu" and "pernode".
+ */
+enum {
+	SVC_POOL_AUTO = -1,	/* choose one of the others */
+	SVC_POOL_GLOBAL,	/* no mapping, just a single global pool
+				 * (legacy & UP mode) */
+	SVC_POOL_PERCPU,	/* one pool per cpu */
+	SVC_POOL_PERNODE	/* one pool per numa node */
+};
+/* Initial mode of svc_pool_map; svc_pool_map_put() restores it with the last reference. */
+#define SVC_POOL_DEFAULT	SVC_POOL_GLOBAL
+
+/*
+ * Structure for mapping cpus to pools and vice versa.
+ * Setup once during sunrpc initialisation.
+ * The arrays are allocated by svc_pool_map_get() for the first user
+ * and freed by svc_pool_map_put() when the last user goes away.
+ */
+static struct svc_pool_map {
+	int count;			/* How many svc_servs use us */
+	int mode;			/* Note: int not enum to avoid
+					 * warnings about "enumeration value
+					 * not handled in switch" */
+	unsigned int npools;		/* number of pools in the current mode */
+	unsigned int *pool_to;		/* maps pool id to cpu or node */
+	unsigned int *to_pool;		/* maps cpu or node to pool id */
+} svc_pool_map = {
+	.count = 0,
+	.mode = SVC_POOL_DEFAULT
+};
+static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
+
+/*
+ * Setter for the pool_mode module parameter.  Fails with -EBUSY
+ * while any svc_serv holds a reference on the pool map and with
+ * -EINVAL when @val does not start with one of the known mode names.
+ */
+static int
+param_set_pool_mode(const char *val, struct kernel_param *kp)
+{
+	/* only the first .len characters of @val are compared */
+	static const struct {
+		const char *name;
+		size_t len;
+		int mode;
+	} modes[] = {
+		{ "auto",    4, SVC_POOL_AUTO },
+		{ "global",  6, SVC_POOL_GLOBAL },
+		{ "percpu",  6, SVC_POOL_PERCPU },
+		{ "pernode", 7, SVC_POOL_PERNODE },
+	};
+	int *modep = (int *)kp->arg;
+	int err = -EBUSY;
+	unsigned int i;
+
+	mutex_lock(&svc_pool_map_mutex);
+
+	if (svc_pool_map.count != 0)
+		goto out;
+
+	err = -EINVAL;
+	for (i = 0; i < ARRAY_SIZE(modes); i++) {
+		if (strncmp(val, modes[i].name, modes[i].len) == 0) {
+			*modep = modes[i].mode;
+			err = 0;
+			break;
+		}
+	}
+
+out:
+	mutex_unlock(&svc_pool_map_mutex);
+	return err;
+}
+
+/*
+ * Getter for the pool_mode module parameter: writes the name of the
+ * current mode into @buf, or its number when the value is not one of
+ * the known modes.  Returns the length of the text.
+ */
+static int
+param_get_pool_mode(char *buf, struct kernel_param *kp)
+{
+	const int mode = *(int *)kp->arg;
+	const char *name;
+
+	switch (mode) {
+	case SVC_POOL_AUTO:
+		name = "auto";
+		break;
+	case SVC_POOL_GLOBAL:
+		name = "global";
+		break;
+	case SVC_POOL_PERCPU:
+		name = "percpu";
+		break;
+	case SVC_POOL_PERNODE:
+		name = "pernode";
+		break;
+	default:
+		return sprintf(buf, "%d", mode);
+	}
+	return strlcpy(buf, name, 20);
+}
+
+module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
+		 &svc_pool_map.mode, 0644);
+
+/*
+ * Pick a pool mapping mode from the machine's topology:
+ *  - more than one online NUMA node: one pool per node;
+ *  - a single node with more than two cpus (non-trivial SMP, or
+ *    CONFIG_NUMA on non-NUMA hardware, e.g. a generic x86_64
+ *    kernel on Xeons): one pool per cpu;
+ *  - anything else: one global pool.
+ */
+static int
+svc_pool_map_choose_mode(void)
+{
+	if (nr_online_nodes > 1)
+		return SVC_POOL_PERNODE;
+
+	if (nr_cpus_node(first_online_node) > 2)
+		return SVC_POOL_PERCPU;
+
+	return SVC_POOL_GLOBAL;
+}
+
+/*
+ * Allocate the to_pool[] and pool_to[] arrays, each with @maxpools
+ * zeroed entries.  Returns 0 on success or -ENOMEM; on failure no
+ * array allocated here is left behind.
+ */
+static int
+svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
+{
+	m->to_pool = kcalloc(maxpools, sizeof(*m->to_pool), GFP_KERNEL);
+	if (m->to_pool == NULL)
+		return -ENOMEM;
+
+	m->pool_to = kcalloc(maxpools, sizeof(*m->pool_to), GFP_KERNEL);
+	if (m->pool_to == NULL) {
+		/* give back the first array so the map stays empty */
+		kfree(m->to_pool);
+		m->to_pool = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Initialise the pool map for SVC_POOL_PERCPU mode.
+ * Allocates the mapping arrays with nr_cpu_ids entries and hands out
+ * consecutive pool ids to the cpus that are online right now.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_percpu(struct svc_pool_map *m)
+{
+	unsigned int maxpools = nr_cpu_ids;
+	unsigned int pidx = 0;
+	unsigned int cpu;
+	int err;
+
+	err = svc_pool_map_alloc_arrays(m, maxpools);
+	if (err)
+		return err;
+
+	for_each_online_cpu(cpu) {
+		/*
+		 * pool_to[] has maxpools entries, so pidx == maxpools
+		 * is already one past the end.
+		 */
+		BUG_ON(pidx >= maxpools);
+		m->to_pool[cpu] = pidx;
+		m->pool_to[pidx] = cpu;
+		pidx++;
+	}
+	/* cpus brought online later all get mapped to pool0, sorry */
+
+	return pidx;
+}
+
+
+/*
+ * Initialise the pool map for SVC_POOL_PERNODE mode.
+ * Allocates the mapping arrays with nr_node_ids entries and hands out
+ * consecutive pool ids to the nodes that have cpus.
+ * Returns number of pools or <0 on error.
+ */
+static int
+svc_pool_map_init_pernode(struct svc_pool_map *m)
+{
+	unsigned int maxpools = nr_node_ids;
+	unsigned int pidx = 0;
+	unsigned int node;
+	int err;
+
+	err = svc_pool_map_alloc_arrays(m, maxpools);
+	if (err)
+		return err;
+
+	for_each_node_with_cpus(node) {
+		/* some architectures (e.g. SN2) have cpuless nodes */
+		/*
+		 * pool_to[] has maxpools entries, so pidx == maxpools
+		 * is already one past the end.
+		 */
+		BUG_ON(pidx >= maxpools);
+		m->to_pool[node] = pidx;
+		m->pool_to[pidx] = node;
+		pidx++;
+	}
+	/* nodes brought online later all get mapped to pool0, sorry */
+
+	return pidx;
+}
+
+
+/*
+ * Add a reference to the global map of cpus to pools (and
+ * vice versa).  Initialise the map if we're the first user.
+ * Returns the number of pools.
+ *
+ * The first user resolves SVC_POOL_AUTO to a concrete mode and builds
+ * the arrays for the percpu/pernode modes.  If that fails, or the
+ * mode needs no arrays, the map ends up as one global pool.
+ */
+static unsigned int
+svc_pool_map_get(void)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+	int npools = -1;
+
+	mutex_lock(&svc_pool_map_mutex);
+
+	/* Not the first user: the map is already set up, just count the reference. */
+	if (m->count++) {
+		mutex_unlock(&svc_pool_map_mutex);
+		return m->npools;
+	}
+
+	if (m->mode == SVC_POOL_AUTO)
+		m->mode = svc_pool_map_choose_mode();
+
+	/* Any other mode leaves npools at -1 and takes the fallback below. */
+	switch (m->mode) {
+	case SVC_POOL_PERCPU:
+		npools = svc_pool_map_init_percpu(m);
+		break;
+	case SVC_POOL_PERNODE:
+		npools = svc_pool_map_init_pernode(m);
+		break;
+	}
+
+	if (npools < 0) {
+		/* default, or memory allocation failure */
+		npools = 1;
+		m->mode = SVC_POOL_GLOBAL;
+	}
+	m->npools = npools;
+
+	mutex_unlock(&svc_pool_map_mutex);
+	return m->npools;
+}
+
+
+/*
+ * Drop a reference to the global map of cpus to pools.
+ * When the last reference is dropped, the map data is
+ * freed; this allows the sysadmin to change the pool
+ * mode using the pool_mode module option without
+ * rebooting or re-loading sunrpc.ko.
+ */
+static void
+svc_pool_map_put(void)
+{
+	struct svc_pool_map *m = &svc_pool_map;
+
+	mutex_lock(&svc_pool_map_mutex);
+
+	if (!--m->count) {
+		/* last user gone: back to the default mode with no arrays */
+		m->mode = SVC_POOL_DEFAULT;
+		kfree(m->to_pool);
+		m->to_pool = NULL;
+		kfree(m->pool_to);
+		m->pool_to = NULL;
+		m->npools = 0;
+	}
+
+	mutex_unlock(&svc_pool_map_mutex);
+}
+
+
+/*
+ * Restrict @task to the cpus of pool @pidx: a single cpu in percpu
+ * mode, the cpus of one node in pernode mode.  Other modes leave the
+ * task's cpus_allowed mask alone.
+ */
+static inline void
+svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
+{
+	struct svc_pool_map *map = &svc_pool_map;
+	unsigned int target = map->pool_to[pidx];
+	int mode;
+
+	/*
+	 * The caller checks for sv_nrpools > 1, which
+	 * implies that we've been initialized.
+	 */
+	BUG_ON(map->count == 0);
+
+	mode = map->mode;
+	if (mode == SVC_POOL_PERCPU)
+		set_cpus_allowed_ptr(task, cpumask_of(target));
+	else if (mode == SVC_POOL_PERNODE)
+		set_cpus_allowed_ptr(task, cpumask_of_node(target));
+}
+
+/*
+ * Use the mapping mode to choose a pool for a given CPU.
+ * Used when enqueueing an incoming RPC.  Always returns
+ * a non-NULL pool pointer.
+ */
+struct svc_pool *
+svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+{
+	struct svc_pool_map *map = &svc_pool_map;
+	unsigned int pidx = 0;
+
+	/*
+	 * An uninitialised map happens in a pure client when
+	 * lockd is brought up, so silently treat it the
+	 * same as SVC_POOL_GLOBAL.
+	 */
+	if (svc_serv_is_pooled(serv)) {
+		int mode = map->mode;
+
+		if (mode == SVC_POOL_PERCPU)
+			pidx = map->to_pool[cpu];
+		else if (mode == SVC_POOL_PERNODE)
+			pidx = map->to_pool[cpu_to_node(cpu)];
+	}
+
+	/* the modulo keeps the index inside sv_pools[] */
+	return &serv->sv_pools[pidx % serv->sv_nrpools];
+}
+
+
+/*
+ * Create an RPC service
+ *
+ * @prog:     first program of the service; further programs are
+ *            reached through pg_next
+ * @bufsize:  requested maximum payload
+ * @npools:   number of thread pools to allocate
+ * @shutdown: stored in sv_shutdown
+ *
+ * Returns the new svc_serv, or NULL when an allocation fails.
+ */
+static struct svc_serv *
+__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
+	     void (*shutdown)(struct svc_serv *serv))
+{
+	struct svc_serv	*serv;
+	unsigned int vers;
+	unsigned int xdrsize;
+	unsigned int i;
+
+	if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
+		return NULL;
+	serv->sv_name      = prog->pg_name;
+	serv->sv_program   = prog;
+	/* starts at 1; svc_destroy() tears down only when this drops to 0 */
+	serv->sv_nrthreads = 1;
+	serv->sv_stats     = prog->pg_stats;
+	/* the payload is capped at RPCSVC_MAXPAYLOAD; 0 selects 4096 */
+	if (bufsize > RPCSVC_MAXPAYLOAD)
+		bufsize = RPCSVC_MAXPAYLOAD;
+	serv->sv_max_payload = bufsize? bufsize : 4096;
+	/* one extra page on top of the payload, rounded up to whole pages */
+	serv->sv_max_mesg  = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
+	serv->sv_shutdown  = shutdown;
+	xdrsize = 0;
+	/*
+	 * For every program in the chain record the lowest and highest
+	 * version present, and find the largest vs_xdrsize overall.
+	 */
+	while (prog) {
+		prog->pg_lovers = prog->pg_nvers-1;
+		for (vers=0; vers<prog->pg_nvers ; vers++)
+			if (prog->pg_vers[vers]) {
+				prog->pg_hivers = vers;
+				if (prog->pg_lovers > vers)
+					prog->pg_lovers = vers;
+				if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
+					xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+			}
+		prog = prog->pg_next;
+	}
+	serv->sv_xdrsize   = xdrsize;
+	INIT_LIST_HEAD(&serv->sv_tempsocks);
+	INIT_LIST_HEAD(&serv->sv_permsocks);
+	init_timer(&serv->sv_temptimer);
+	spin_lock_init(&serv->sv_lock);
+
+	serv->sv_nrpools = npools;
+	serv->sv_pools =
+		kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
+			GFP_KERNEL);
+	if (!serv->sv_pools) {
+		kfree(serv);
+		return NULL;
+	}
+
+	for (i = 0; i < serv->sv_nrpools; i++) {
+		struct svc_pool *pool = &serv->sv_pools[i];
+
+		dprintk("svc: initialising pool %u for %s\n",
+				i, serv->sv_name);
+
+		pool->sp_id = i;
+		INIT_LIST_HEAD(&pool->sp_threads);
+		INIT_LIST_HEAD(&pool->sp_sockets);
+		INIT_LIST_HEAD(&pool->sp_all_threads);
+		spin_lock_init(&pool->sp_lock);
+	}
+
+	/* Remove any stale portmap registrations */
+	svc_unregister(serv);
+
+	return serv;
+}
+
+/*
+ * Create an RPC service with a single pool.  sv_function is left
+ * unset, so svc_serv_is_pooled() is false for the result.
+ * Returns NULL on allocation failure.
+ */
+struct svc_serv *
+svc_create(struct svc_program *prog, unsigned int bufsize,
+	   void (*shutdown)(struct svc_serv *serv))
+{
+	return __svc_create(prog, bufsize, /*npools*/1, shutdown);
+}
+EXPORT_SYMBOL_GPL(svc_create);
+
+/*
+ * Create an RPC service whose threads are spread over the pools of
+ * the global pool map.
+ *
+ * @func: thread function, stored in sv_function
+ * @mod:  module that owns @func, stored in sv_module
+ *
+ * Takes a reference on the pool map.  On success svc_destroy()
+ * drops it again; when the service cannot be allocated it is
+ * dropped here.  Returns NULL on allocation failure.
+ */
+struct svc_serv *
+svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
+		  void (*shutdown)(struct svc_serv *serv),
+		  svc_thread_fn func, struct module *mod)
+{
+	struct svc_serv *serv;
+	unsigned int npools = svc_pool_map_get();
+
+	serv = __svc_create(prog, bufsize, npools, shutdown);
+	if (serv == NULL) {
+		/* nobody will call svc_destroy(), so drop the map reference now */
+		svc_pool_map_put();
+		return NULL;
+	}
+
+	serv->sv_function = func;
+	serv->sv_module = mod;
+
+	return serv;
+}
+EXPORT_SYMBOL_GPL(svc_create_pooled);
+
+/*
+ * Destroy an RPC service. Should be called with appropriate locking to
+ * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
+ *
+ * Each call drops one count from sv_nrthreads; the service is only
+ * torn down and freed by the call that brings the count to zero.
+ */
+void
+svc_destroy(struct svc_serv *serv)
+{
+	dprintk("svc: svc_destroy(%s, %d)\n",
+				serv->sv_program->pg_name,
+				serv->sv_nrthreads);
+
+	if (serv->sv_nrthreads) {
+		/* other users remain: update the socket buffers and keep the service */
+		if (--(serv->sv_nrthreads) != 0) {
+			svc_sock_update_bufs(serv);
+			return;
+		}
+	} else
+		printk("svc_destroy: no threads for serv=%p!\n", serv);
+
+	del_timer_sync(&serv->sv_temptimer);
+	/*
+	 * The set of xprts (contained in the sv_tempsocks and
+	 * sv_permsocks lists) is now constant, since it is modified
+	 * only by accepting new sockets (done by service threads in
+	 * svc_recv) or aging old ones (done by sv_temptimer), or
+	 * configuration changes (excluded by whatever locking the
+	 * caller is using--nfsd_mutex in the case of nfsd).  So it's
+	 * safe to traverse those lists and shut everything down:
+	 */
+	svc_close_all(serv);
+
+	if (serv->sv_shutdown)
+		serv->sv_shutdown(serv);
+
+	cache_clean_deferred(serv);
+
+	/* pooled services hold a reference on the global pool map */
+	if (svc_serv_is_pooled(serv))
+		svc_pool_map_put();
+
+	svc_unregister(serv);
+	kfree(serv->sv_pools);
+	kfree(serv);
+}
+EXPORT_SYMBOL_GPL(svc_destroy);
+
+/*
+ * Allocate an RPC server's buffer space.
+ * Fills rq_pages[] with size / PAGE_SIZE + 1 freshly allocated pages.
+ * Returns 1 on success and 0 when a page allocation fails; pages
+ * obtained before the failure stay in rq_pages[].
+ */
+static int
+svc_init_buffer(struct svc_rqst *rqstp, unsigned int size)
+{
+	unsigned int npages, idx;
+
+	/* bc_xprt uses fore channel allocated buffers */
+	if (svc_is_backchannel(rqstp))
+		return 1;
+
+	/*
+	 * One extra page, as we hold both request and reply and
+	 * assume that one of them is at most one page.
+	 */
+	npages = size / PAGE_SIZE + 1;
+	BUG_ON(npages > RPCSVC_MAXPAGES);
+
+	for (idx = 0; idx < npages; idx++) {
+		struct page *page = alloc_page(GFP_KERNEL);
+
+		if (page == NULL)
+			return 0;
+		rqstp->rq_pages[idx] = page;
+	}
+	return 1;
+}
+
+/*
+ * Release an RPC server buffer: drop the reference on every page
+ * that is present in rq_pages[].
+ */
+static void
+svc_release_buffer(struct svc_rqst *rqstp)
+{
+	struct page **slot = rqstp->rq_pages;
+	struct page **end = slot + ARRAY_SIZE(rqstp->rq_pages);
+
+	for (; slot < end; slot++) {
+		if (*slot != NULL)
+			put_page(*slot);
+	}
+}
+
+/*
+ * Allocate and set up the per-thread request structure for a new
+ * thread of @serv in @pool: count the thread in the service and the
+ * pool, then allocate the argument/result buffers and the page array.
+ * Returns the new svc_rqst or ERR_PTR(-ENOMEM).
+ */
+struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
+{
+	struct svc_rqst	*rqstp;
+
+	rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
+	if (!rqstp)
+		goto out_enomem;
+
+	init_waitqueue_head(&rqstp->rq_wait);
+
+	/* account for the thread before the allocations that may fail below */
+	serv->sv_nrthreads++;
+	spin_lock_bh(&pool->sp_lock);
+	pool->sp_nrthreads++;
+	list_add(&rqstp->rq_all, &pool->sp_all_threads);
+	spin_unlock_bh(&pool->sp_lock);
+	rqstp->rq_server = serv;
+	rqstp->rq_pool = pool;
+
+	rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+	if (!rqstp->rq_argp)
+		goto out_thread;
+
+	rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+	if (!rqstp->rq_resp)
+		goto out_thread;
+
+	if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
+		goto out_thread;
+
+	return rqstp;
+out_thread:
+	/*
+	 * NOTE(review): svc_exit_thread() is presumably what undoes the
+	 * accounting above and frees the partial allocations - confirm
+	 * against its definition.
+	 */
+	svc_exit_thread(rqstp);
+out_enomem:
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(svc_prepare_thread);
+
+/*
+ * Choose a pool in which to create a new thread, for svc_set_num_threads.
+ * A caller-supplied @pool always wins; otherwise the pools are used
+ * in round-robin order, with *@state as the cursor.
+ */
+static inline struct svc_pool *
+choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+{
+	unsigned int idx;
+
+	if (pool != NULL)
+		return pool;
+
+	idx = *state % serv->sv_nrpools;
+	(*state)++;
+	return &serv->sv_pools[idx];
+}
+
+/*
+ * Choose a thread to kill, for svc_set_num_threads
+ *
+ * With a specific @pool the victim comes from that pool; otherwise
+ * the pools are searched in round-robin order (stepping *@state
+ * backwards) for one that still has threads.  Returns the victim's
+ * task, or NULL when no thread is left to choose.
+ */
+static inline struct task_struct *
+choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state)
+{
+	unsigned int i;
+	struct task_struct *task = NULL;
+
+	if (pool != NULL) {
+		spin_lock_bh(&pool->sp_lock);
+	} else {
+		/* choose a pool in round-robin fashion */
+		for (i = 0; i < serv->sv_nrpools; i++) {
+			pool = &serv->sv_pools[--(*state) % serv->sv_nrpools];
+			spin_lock_bh(&pool->sp_lock);
+			/* jump out with the pool's lock still held */
+			if (!list_empty(&pool->sp_all_threads))
+				goto found_pool;
+			spin_unlock_bh(&pool->sp_lock);
+		}
+		return NULL;
+	}
+
+found_pool:
+	/* pool->sp_lock is held here on both paths */
+	if (!list_empty(&pool->sp_all_threads)) {
+		struct svc_rqst *rqstp;
+
+		/*
+		 * Remove from the pool->sp_all_threads list
+		 * so we don't try to kill it again.
+		 */
+		rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all);
+		list_del_init(&rqstp->rq_all);
+		task = rqstp->rq_task;
+	}
+	spin_unlock_bh(&pool->sp_lock);
+
+	return task;
+}
+
+/*
+ * Create or destroy enough new threads to make the number
+ * of threads the given number.  If `pool' is non-NULL, applies
+ * only to threads in that pool, otherwise round-robins between
+ * all pools.  Must be called with a svc_get() reference and
+ * the BKL or another lock to protect access to svc_serv fields.
+ *
+ * Destroying threads relies on the service threads filling in
+ * rqstp->rq_task, which only the nfs ones do.  Assumes the serv
+ * has been created using svc_create_pooled().
+ *
+ * Based on code that used to be in nfsd_svc() but tweaked
+ * to be pool-aware.
+ *
+ * Returns 0, or the error that stopped thread creation.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+	struct svc_rqst	*rqstp;
+	struct task_struct *task;
+	struct svc_pool *chosen_pool;
+	int error = 0;
+	/* round-robin cursor shared by choose_pool() and choose_victim() */
+	unsigned int state = serv->sv_nrthreads-1;
+
+	/* turn nrservs into a delta: >0 threads to create, <0 threads to kill */
+	if (pool == NULL) {
+		/* The -1 assumes caller has done a svc_get() */
+		nrservs -= (serv->sv_nrthreads-1);
+	} else {
+		spin_lock_bh(&pool->sp_lock);
+		nrservs -= pool->sp_nrthreads;
+		spin_unlock_bh(&pool->sp_lock);
+	}
+
+	/* create new threads */
+	while (nrservs > 0) {
+		nrservs--;
+		chosen_pool = choose_pool(serv, pool, &state);
+
+		rqstp = svc_prepare_thread(serv, chosen_pool);
+		if (IS_ERR(rqstp)) {
+			error = PTR_ERR(rqstp);
+			break;
+		}
+
+		/* a module reference is taken per thread; dropped again if the thread cannot be created */
+		__module_get(serv->sv_module);
+		task = kthread_create(serv->sv_function, rqstp, serv->sv_name);
+		if (IS_ERR(task)) {
+			error = PTR_ERR(task);
+			module_put(serv->sv_module);
+			svc_exit_thread(rqstp);
+			break;
+		}
+
+		rqstp->rq_task = task;
+		if (serv->sv_nrpools > 1)
+			svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
+
+		svc_sock_update_bufs(serv);
+		wake_up_process(task);
+	}
+	/* destroy old threads */
+	while (nrservs < 0 &&
+	       (task = choose_victim(serv, pool, &state)) != NULL) {
+		/* the victim is only signalled here */
+		send_sig(SIGINT, task, 1);
+		nrservs++;
+	}
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(svc_set_num_threads);
+
+/*
+ * Tear down the per-thread state of a server thread that is exiting.
+ * The caller must hold the BKL or the "service mutex", whichever is
+ * appropriate for the service.
+ */
+void
+svc_exit_thread(struct svc_rqst *rqstp)
+{
+	struct svc_pool	*home = rqstp->rq_pool;
+	struct svc_serv	*owner = rqstp->rq_server;
+
+	/* give back the buffers hanging off the request */
+	svc_release_buffer(rqstp);
+	kfree(rqstp->rq_argp);
+	kfree(rqstp->rq_resp);
+	kfree(rqstp->rq_auth_data);
+
+	/* take the thread out of its pool's bookkeeping */
+	spin_lock_bh(&home->sp_lock);
+	list_del(&rqstp->rq_all);
+	home->sp_nrthreads--;
+	spin_unlock_bh(&home->sp_lock);
+
+	kfree(rqstp);
+
+	/* Release the server */
+	if (owner != NULL)
+		svc_destroy(owner);
+}
+EXPORT_SYMBOL_GPL(svc_exit_thread);
+
+/*
+ * Register an "inet" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request, for the wildcard
+ * IPv4 address and the given port.
+ *
+ * The kernel has no netconfig infrastructure, so the IP protocol
+ * number is translated to a netid by hand; anything other than UDP
+ * or TCP yields -ENOPROTOOPT.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register4(const u32 program, const u32 version,
+				const unsigned short protocol,
+				const unsigned short port)
+{
+	const struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_ANY),
+		.sin_port		= htons(port),
+	};
+	const char *netid;
+	int status;
+
+	if (protocol == IPPROTO_UDP)
+		netid = RPCBIND_NETID_UDP;
+	else if (protocol == IPPROTO_TCP)
+		netid = RPCBIND_NETID_TCP;
+	else
+		return -ENOPROTOOPT;
+
+	status = rpcb_v4_register(program, version,
+					(const struct sockaddr *)&sin, netid);
+	if (status != -EPROTONOSUPPORT)
+		return status;
+
+	/*
+	 * User space does not speak rpcbind v4: repeat the request
+	 * with the legacy rpcbind v2 protocol.
+	 */
+	return rpcb_register(program, version, protocol, port);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/*
+ * Register an "inet6" protocol family netid with the local
+ * rpcbind daemon via an rpcbind v4 SET request, for the wildcard
+ * IPv6 address and the given port.
+ *
+ * The kernel has no netconfig infrastructure, so the IP protocol
+ * number is translated to a netid by hand; anything other than UDP
+ * or TCP yields -ENOPROTOOPT.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_rpcb_register6(const u32 program, const u32 version,
+				const unsigned short protocol,
+				const unsigned short port)
+{
+	const struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+		.sin6_port		= htons(port),
+	};
+	const char *netid;
+	int status;
+
+	if (protocol == IPPROTO_UDP)
+		netid = RPCBIND_NETID_UDP6;
+	else if (protocol == IPPROTO_TCP)
+		netid = RPCBIND_NETID_TCP6;
+	else
+		return -ENOPROTOOPT;
+
+	status = rpcb_v4_register(program, version,
+					(const struct sockaddr *)&sin6, netid);
+
+	/*
+	 * There is no v2 fallback here: without rpcbind version 4 in
+	 * user space a PF_INET6 listener is not used, so report the
+	 * address family as unsupported instead.
+	 */
+	if (status == -EPROTONOSUPPORT)
+		return -EAFNOSUPPORT;
+
+	return status;
+}
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+
+/*
+ * Register one [program, version] of a kernel RPC service via
+ * rpcbind version 4, dispatching on the listener's protocol family.
+ * Families other than the ones handled below give -EAFNOSUPPORT.
+ * Any failure is logged with the program name and version.
+ *
+ * Returns zero on success; a negative errno value is returned
+ * if any error occurs.
+ */
+static int __svc_register(const char *progname,
+			  const u32 program, const u32 version,
+			  const int family,
+			  const unsigned short protocol,
+			  const unsigned short port)
+{
+	int status;
+
+	switch (family) {
+	case PF_INET:
+		status = __svc_rpcb_register4(program, version,
+						protocol, port);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case PF_INET6:
+		status = __svc_rpcb_register6(program, version,
+						protocol, port);
+		break;
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+	default:
+		status = -EAFNOSUPPORT;
+		break;
+	}
+
+	if (status >= 0)
+		return status;
+
+	printk(KERN_WARNING "svc: failed to register %sv%u RPC "
+		"service (errno %d).\n", progname, version, -status);
+	return status;
+}
+
+/**
+ * svc_register - register an RPC service with the local portmapper
+ * @serv: svc_serv struct for the service to register
+ * @family: protocol family of service's listener socket
+ * @proto: transport protocol number to advertise
+ * @port: port to advertise
+ *
+ * Service is registered for any address in the passed-in protocol family
+ *
+ * Every version slot that is non-NULL and not marked vs_hidden, of
+ * every program chained off @serv, is passed to __svc_register().
+ * Returns the result of the last __svc_register() call that was made,
+ * or zero if none was made.  Calling with both @proto and @port zero
+ * is a BUG.
+ *
+ * NOTE(review): the "break" on error only leaves the inner per-version
+ * loop, so a later program that registers successfully overwrites an
+ * earlier failure in the return value -- confirm this is intended.
+ */
+int svc_register(const struct svc_serv *serv, const int family,
+		 const unsigned short proto, const unsigned short port)
+{
+	struct svc_program	*progp;
+	unsigned int		i;
+	int			error = 0;
+
+	BUG_ON(proto == 0 && port == 0);
+
+	for (progp = serv->sv_program; progp; progp = progp->pg_next) {
+		for (i = 0; i < progp->pg_nvers; i++) {
+			if (progp->pg_vers[i] == NULL)
+				continue;
+
+			dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
+					progp->pg_name,
+					i,
+					proto == IPPROTO_UDP?  "udp" : "tcp",
+					port,
+					family,
+					progp->pg_vers[i]->vs_hidden?
+						" (but not telling portmap)" : "");
+
+			if (progp->pg_vers[i]->vs_hidden)
+				continue;
+
+			error = __svc_register(progp->pg_name, progp->pg_prog,
+						i, family, proto, port);
+			if (error < 0)
+				break;	/* abandons the remaining versions of this program only */
+		}
+	}
+
+	return error;
+}
+
+/*
+ * Remove every registration for one [program, version].
+ *
+ * If user space is running rpcbind, it should take the v4 UNSET
+ * and clear everything for this [program, version].  If user space
+ * is running portmap, it will reject the v4 UNSET, but won't have
+ * any "inet6" entries anyway.  So a PMAP_UNSET should be sufficient
+ * in this case to clear all existing entries for [program, version].
+ *
+ * The outcome is only reported through dprintk.
+ */
+static void __svc_unregister(const u32 program, const u32 version,
+			     const char *progname)
+{
+	int status = rpcb_v4_register(program, version, NULL, "");
+
+	/* rpcbind v4 not spoken by user space: retry with legacy v2 */
+	if (status == -EPROTONOSUPPORT)
+		status = rpcb_register(program, version, 0, 0);
+
+	dprintk("svc: %s(%sv%u), error %d\n",
+			__func__, progname, version, status);
+}
+
+/*
+ * Drop all netids, bind addresses and ports registered for each
+ * non-hidden [program, version] of the service from the local rpcbind
+ * database, making way for a new instance of the service.
+ *
+ * The result of unregistration is reported via dprintk for those who
+ * want verification of the result, but is otherwise not important.
+ */
+static void svc_unregister(const struct svc_serv *serv)
+{
+	struct svc_program *progp = serv->sv_program;
+	unsigned long flags;
+	unsigned int vers;
+
+	/*
+	 * TIF_SIGPENDING is cleared for the duration of the calls and
+	 * recomputed at the end -- presumably so the rpcbind requests
+	 * are not cut short by an already-pending signal.
+	 */
+	clear_thread_flag(TIF_SIGPENDING);
+
+	while (progp != NULL) {
+		for (vers = 0; vers < progp->pg_nvers; vers++) {
+			if (progp->pg_vers[vers] == NULL ||
+			    progp->pg_vers[vers]->vs_hidden)
+				continue;
+
+			dprintk("svc: attempting to unregister %sv%u\n",
+				progp->pg_name, vers);
+			__svc_unregister(progp->pg_prog, vers, progp->pg_name);
+		}
+		progp = progp->pg_next;
+	}
+
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+/*
+ * Rate-limited printk of an error, prefixed with the address of the
+ * client whose request caused it.  Returns what vprintk() returned,
+ * or 0 when the message was suppressed by the rate limit.
+ */
+static int
+__attribute__ ((format (printf, 2, 3)))
+svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
+{
+	char	addrbuf[RPC_MAX_ADDRBUFLEN];
+	va_list	ap;
+	int	printed = 0;
+
+	if (net_ratelimit()) {
+		printk(KERN_WARNING "svc: %s: ",
+			svc_print_addr(rqstp, addrbuf, sizeof(addrbuf)));
+
+		va_start(ap, fmt);
+		printed = vprintk(fmt, ap);
+		va_end(ap);
+	}
+
+	return printed;
+}
+
+/*
+ * Common routine for processing the RPC request.
+ *
+ * Both callers in this file (svc_process and bc_svc_process) consume
+ * the XID and the call direction from argv before calling, so the
+ * first word read here is the RPC version.  At least six more words
+ * must remain in argv, otherwise the request is dropped.  The reply
+ * is assembled in resv.
+ *
+ * Returns 1 when a reply has been built and the caller should send
+ * it, 0 when the request is to be dropped.
+ */
+static int
+svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+{
+	struct svc_program	*progp;
+	struct svc_version	*versp = NULL;	/* compiler food */
+	struct svc_procedure	*procp = NULL;
+	struct svc_serv		*serv = rqstp->rq_server;
+	kxdrproc_t		xdr;
+	__be32			*statp;
+	u32			prog, vers, proc;
+	__be32			auth_stat, rpc_stat;
+	int			auth_res;
+	__be32			*reply_statp;
+
+	rpc_stat = rpc_success;
+
+	if (argv->iov_len < 6*4)
+		goto err_short_len;
+
+	/* Will be turned off only in gss privacy case: */
+	rqstp->rq_splice_ok = 1;
+	/* Will be turned off only when NFSv4 Sessions are used */
+	rqstp->rq_usedeferral = 1;
+	rqstp->rq_dropme = false;
+
+	/* Setup reply header */
+	rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
+
+	svc_putu32(resv, rqstp->rq_xid);
+
+	vers = svc_getnl(argv);
+
+	/* First words of reply: */
+	svc_putnl(resv, 1);		/* REPLY */
+
+	if (vers != 2)		/* RPC version number */
+		goto err_bad_rpc;
+
+	/* Save position in case we later decide to reject: */
+	reply_statp = resv->iov_base + resv->iov_len;
+
+	svc_putnl(resv, 0);		/* ACCEPT */
+
+	rqstp->rq_prog = prog = svc_getnl(argv);	/* program number */
+	rqstp->rq_vers = vers = svc_getnl(argv);	/* version number */
+	rqstp->rq_proc = proc = svc_getnl(argv);	/* procedure number */
+
+	progp = serv->sv_program;	/* redundant: the loop below starts from the same value */
+
+	for (progp = serv->sv_program; progp; progp = progp->pg_next)
+		if (prog == progp->pg_prog)
+			break;
+
+	/*
+	 * Decode auth data, and add verifier to reply buffer.
+	 * We do this before anything else in order to get a decent
+	 * auth verifier.
+	 */
+	auth_res = svc_authenticate(rqstp, &auth_stat);
+	/* Also give the program a chance to reject this call: */
+	if (auth_res == SVC_OK && progp) {
+		auth_stat = rpc_autherr_badcred;
+		auth_res = progp->pg_authenticate(rqstp);
+	}
+	switch (auth_res) {
+	case SVC_OK:
+		break;
+	case SVC_GARBAGE:
+		goto err_garbage;
+	case SVC_SYSERR:
+		rpc_stat = rpc_system_err;
+		goto err_bad;
+	case SVC_DENIED:
+		goto err_bad_auth;
+	case SVC_CLOSE:	/* closes an XPT_TEMP transport, then falls through to SVC_DROP */
+		if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+			svc_close_xprt(rqstp->rq_xprt);
+	case SVC_DROP:
+		goto dropit;
+	case SVC_COMPLETE:
+		goto sendit;
+	}
+
+	if (progp == NULL)
+		goto err_bad_prog;
+
+	if (vers >= progp->pg_nvers ||
+	  !(versp = progp->pg_vers[vers]))
+		goto err_bad_vers;
+
+	procp = versp->vs_proc + proc;	/* not dereferenced until proc passes the range check below */
+	if (proc >= versp->vs_nproc || !procp->pc_func)
+		goto err_bad_proc;
+	rqstp->rq_procinfo = procp;
+
+	/* Syntactic check complete */
+	serv->sv_stats->rpccnt++;
+
+	/* Build the reply header. */
+	statp = resv->iov_base +resv->iov_len;
+	svc_putnl(resv, RPC_SUCCESS);
+
+	/* Bump per-procedure stats counter */
+	procp->pc_count++;
+
+	/* Initialize storage for argp and resp */
+	memset(rqstp->rq_argp, 0, procp->pc_argsize);
+	memset(rqstp->rq_resp, 0, procp->pc_ressize);
+
+	/* un-reserve some of the out-queue now that we have a
+	 * better idea of reply size
+	 */
+	if (procp->pc_xdrressize)
+		svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
+
+	/* Call the function that processes the request. */
+	if (!versp->vs_dispatch) {
+		/* Decode arguments */
+		xdr = procp->pc_decode;
+		if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp))
+			goto err_garbage;
+
+		*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
+
+		/* Encode reply */
+		if (rqstp->rq_dropme) {
+			if (procp->pc_release)
+				procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+			goto dropit;
+		}
+		if (*statp == rpc_success &&
+		    (xdr = procp->pc_encode) &&
+		    !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
+			dprintk("svc: failed to encode reply\n");
+			/* serv->sv_stats->rpcsystemerr++; */
+			*statp = rpc_system_err;
+		}
+	} else {
+		dprintk("svc: calling dispatcher\n");
+		if (!versp->vs_dispatch(rqstp, statp)) {
+			/* Release reply info */
+			if (procp->pc_release)
+				procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+			goto dropit;
+		}
+	}
+
+	/* Check RPC status result */
+	if (*statp != rpc_success)
+		resv->iov_len = ((void*)statp)  - resv->iov_base + 4;	/* cut the reply off right after the status word */
+
+	/* Release reply info */
+	if (procp->pc_release)
+		procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+
+	if (procp->pc_encode == NULL)	/* procedures without an encoder get no reply */
+		goto dropit;
+
+ sendit:
+	if (svc_authorise(rqstp))
+		goto dropit;
+	return 1;		/* Caller can now send it */
+
+ dropit:
+	svc_authorise(rqstp);	/* doesn't hurt to call this twice */
+	dprintk("svc: svc_process dropit\n");
+	return 0;
+
+err_short_len:
+	svc_printk(rqstp, "short len %Zd, dropping request\n",
+			argv->iov_len);
+
+	goto dropit;			/* drop request */
+
+err_bad_rpc:
+	serv->sv_stats->rpcbadfmt++;
+	svc_putnl(resv, 1);	/* REJECT */
+	svc_putnl(resv, 0);	/* RPC_MISMATCH */
+	svc_putnl(resv, 2);	/* Only RPCv2 supported */
+	svc_putnl(resv, 2);
+	goto sendit;
+
+err_bad_auth:
+	dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
+	serv->sv_stats->rpcbadauth++;
+	/* Restore write pointer to location of accept status: */
+	xdr_ressize_check(rqstp, reply_statp);
+	svc_putnl(resv, 1);	/* REJECT */
+	svc_putnl(resv, 1);	/* AUTH_ERROR */
+	svc_putnl(resv, ntohl(auth_stat));	/* status */
+	goto sendit;
+
+err_bad_prog:
+	dprintk("svc: unknown program %d\n", prog);
+	serv->sv_stats->rpcbadfmt++;
+	svc_putnl(resv, RPC_PROG_UNAVAIL);
+	goto sendit;
+
+err_bad_vers:
+	svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n",
+		       vers, prog, progp->pg_name);
+
+	serv->sv_stats->rpcbadfmt++;
+	svc_putnl(resv, RPC_PROG_MISMATCH);
+	svc_putnl(resv, progp->pg_lovers);
+	svc_putnl(resv, progp->pg_hivers);
+	goto sendit;
+
+err_bad_proc:
+	svc_printk(rqstp, "unknown procedure (%d)\n", proc);
+
+	serv->sv_stats->rpcbadfmt++;
+	svc_putnl(resv, RPC_PROC_UNAVAIL);
+	goto sendit;
+
+err_garbage:
+	svc_printk(rqstp, "failed to decode args\n");
+
+	rpc_stat = rpc_garbage_args;
+err_bad:
+	serv->sv_stats->rpcbadfmt++;
+	svc_putnl(resv, ntohl(rpc_stat));
+	goto sendit;
+}
+
+/*
+ * Process the RPC request held in rqstp: reset the response buffer,
+ * read the XID and direction words, and hand everything else to
+ * svc_process_common().  Requests whose direction is not CALL are
+ * dropped.  Returns the result of svc_send() when a reply was sent,
+ * 0 when the request was dropped.
+ */
+int
+svc_process(struct svc_rqst *rqstp)
+{
+	struct svc_serv		*serv = rqstp->rq_server;
+	struct kvec		*argv = &rqstp->rq_arg.head[0];
+	struct kvec		*resv = &rqstp->rq_res.head[0];
+	u32			dir;
+
+	/*
+	 * Setup response xdr_buf.
+	 * Initially it has just one page
+	 */
+	rqstp->rq_resused = 1;
+	resv->iov_base = page_address(rqstp->rq_respages[0]);
+	resv->iov_len = 0;
+
+	rqstp->rq_res.pages = rqstp->rq_respages + 1;
+	rqstp->rq_res.page_base = 0;
+	rqstp->rq_res.page_len = 0;
+	rqstp->rq_res.len = 0;
+	rqstp->rq_res.buflen = PAGE_SIZE;
+
+	rqstp->rq_res.tail[0].iov_base = NULL;
+	rqstp->rq_res.tail[0].iov_len = 0;
+
+	rqstp->rq_xid = svc_getu32(argv);
+	dir = svc_getnl(argv);
+
+	/* anything but 0 means direction != CALL */
+	if (dir != 0) {
+		svc_printk(rqstp, "bad direction %d, dropping request\n", dir);
+		serv->sv_stats->rpcbadfmt++;
+		svc_drop(rqstp);
+		return 0;
+	}
+
+	/* svc_process_common: 1 for send, 0 for drop */
+	if (!svc_process_common(rqstp, argv, resv)) {
+		svc_drop(rqstp);
+		return 0;
+	}
+
+	return svc_send(rqstp);
+}
+EXPORT_SYMBOL_GPL(svc_process);
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Process a backchannel RPC request that arrived over an existing
+ * outbound connection.  Only TCP is supported; any other protocol
+ * is a BUG.  Returns the result of bc_send() when a reply was built,
+ * 0 when the request is dropped.
+ */
+int
+bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
+	       struct svc_rqst *rqstp)
+{
+	struct kvec	*resv = &rqstp->rq_res.head[0];
+	struct kvec	*argv = &rqstp->rq_arg.head[0];
+
+	/* Fill in the svc_rqst that the common processing routine expects */
+	rqstp->rq_server = serv;
+	rqstp->rq_xprt = serv->sv_bc_xprt;
+	rqstp->rq_xid = req->rq_xid;
+	rqstp->rq_prot = req->rq_xprt->prot;
+
+	rqstp->rq_addrlen = sizeof(req->rq_xprt->addr);
+	memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
+	memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
+	memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
+
+	/* reset result send buffer "put" position */
+	resv->iov_len = 0;
+
+	if (rqstp->rq_prot != IPPROTO_TCP) {
+		printk(KERN_ERR "No support for Non-TCP transports!\n");
+		BUG();
+	}
+
+	/*
+	 * Step over the next two words: the transport has already
+	 * processed them.
+	 */
+	svc_getu32(argv);	/* XID */
+	svc_getnl(argv);	/* CALLDIR */
+
+	/* svc_process_common: 1 for send, 0 for drop */
+	if (!svc_process_common(rqstp, argv, resv))
+		return 0;	/* Nothing to do to drop request */
+
+	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+	return bc_send(req);
+}
+EXPORT_SYMBOL(bc_svc_process);
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Return the limit on the rpc payload for this request: the smaller
+ * of the transport class's maximum and the server's own maximum.
+ */
+u32 svc_max_payload(const struct svc_rqst *rqstp)
+{
+	u32 xprt_limit = rqstp->rq_xprt->xpt_class->xcl_max_payload;
+	u32 serv_limit = rqstp->rq_server->sv_max_payload;
+
+	return serv_limit < xprt_limit ? serv_limit : xprt_limit;
+}
+EXPORT_SYMBOL_GPL(svc_max_payload);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 00000000..9d7ed0b4
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1270 @@
+/*
+ * linux/net/sunrpc/svc_xprt.c
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/xprt.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
+static int svc_deferred_recv(struct svc_rqst *rqstp);
+static struct cache_deferred_req *svc_defer(struct cache_req *req);
+static void svc_age_temp_xprts(unsigned long closure);
+
+/* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ *   http://www.connectathon.org/talks96/nfstcp.pdf
+ *
+ * NOTE(review): the value below is presumably in seconds (6*60 being
+ * the six minutes quoted above) -- confirm where it is consumed.
+ */
+static int svc_conn_age_period = 6*60;
+
+/* List of registered transport classes; the register, unregister,
+ * print and create paths in this file all walk or modify it while
+ * holding svc_xprt_class_lock.
+ */
+static DEFINE_SPINLOCK(svc_xprt_class_lock);
+static LIST_HEAD(svc_xprt_class_list);
+
+/* SMP locking strategy:
+ *
+ *	svc_pool->sp_lock protects most of the fields of that pool.
+ *	svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
+ *	when both need to be taken (rare), svc_serv->sv_lock is first.
+ *	BKL protects svc_serv->sv_nrthreads.
+ *	svc_sock->sk_lock protects the svc_sock->sk_deferred list
+ *             and the ->sk_info_authunix cache.
+ *
+ *	The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
+ *	enqueued multiply. During normal transport processing this bit
+ *	is set by svc_xprt_enqueue and cleared by svc_xprt_received.
+ *	Providers should not manipulate this bit directly.
+ *
+ *	Some flags can be set to certain values at any time
+ *	providing that certain rules are followed:
+ *
+ *	XPT_CONN, XPT_DATA:
+ *		- Can be set or cleared at any time.
+ *		- After a set, svc_xprt_enqueue must be called to enqueue
+ *		  the transport for processing.
+ *		- After a clear, the transport must be read/accepted.
+ *		  If this succeeds, it must be set again.
+ *	XPT_CLOSE:
+ *		- Can be set at any time. It is never cleared.
+ *      XPT_DEAD:
+ *		- Can only be set while XPT_BUSY is held which ensures
+ *		  that no other thread will be using the transport or will
+ *		  try to set XPT_DEAD.
+ */
+
+/*
+ * Add a transport class to the list of registered classes.
+ * Returns 0 on success, or -EEXIST when a class with the same
+ * name is already registered.
+ */
+int svc_reg_xprt_class(struct svc_xprt_class *xcl)
+{
+	struct svc_xprt_class *known;
+	int res = 0;
+
+	dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
+
+	INIT_LIST_HEAD(&xcl->xcl_list);
+	spin_lock(&svc_xprt_class_lock);
+
+	/* names must be unique across registered classes */
+	list_for_each_entry(known, &svc_xprt_class_list, xcl_list) {
+		if (strcmp(xcl->xcl_name, known->xcl_name) == 0) {
+			res = -EEXIST;
+			break;
+		}
+	}
+	if (res == 0)
+		list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
+
+	spin_unlock(&svc_xprt_class_lock);
+	return res;
+}
+EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
+
+/*
+ * Take a transport class off the list of registered classes.  The
+ * list entry is re-initialised, so it is left pointing at itself.
+ */
+void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
+{
+	dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
+
+	spin_lock(&svc_xprt_class_lock);
+	list_del_init(&xcl->xcl_list);
+	spin_unlock(&svc_xprt_class_lock);
+}
+EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
+
+/*
+ * Format the transport list for printing.
+ *
+ * One "<name> <max payload>\n" line per registered transport class is
+ * appended to buf, which is maxlen bytes long including the
+ * terminating NUL.  Formatting stops at the first entry that does not
+ * fit (in the scratch line or in buf), so the list may be cut short
+ * but buf is always NUL-terminated within maxlen bytes.
+ *
+ * Returns the number of characters placed in buf, not counting the NUL.
+ */
+int svc_print_xprts(char *buf, int maxlen)
+{
+	struct svc_xprt_class *xcl;
+	char tmpstr[80];
+	int len = 0;
+	buf[0] = '\0';
+
+	spin_lock(&svc_xprt_class_lock);
+	list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+		int slen;
+
+		/*
+		 * snprintf() never writes past tmpstr and returns the
+		 * length the full line would have had, so truncation
+		 * shows up as slen >= sizeof(tmpstr).
+		 */
+		slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n",
+				xcl->xcl_name, xcl->xcl_max_payload);
+		/* ">=" keeps one byte free for the NUL strcat() appends */
+		if (slen >= (int)sizeof(tmpstr) || len + slen >= maxlen)
+			break;
+		len += slen;
+		strcat(buf, tmpstr);
+	}
+	spin_unlock(&svc_xprt_class_lock);
+
+	return len;
+}
+
+/*
+ * kref release function handed to kref_put() by svc_xprt_put():
+ * drops what the transport holds and lets the provider free it.
+ */
+static void svc_xprt_free(struct kref *kref)
+{
+	struct svc_xprt *dying = container_of(kref, struct svc_xprt, xpt_ref);
+	/* fetched up front: the xprt is not touched again after xpo_free() */
+	struct module *mod = dying->xpt_class->xcl_owner;
+
+	if (test_bit(XPT_CACHE_AUTH, &dying->xpt_flags))
+		svcauth_unix_info_release(dying);
+	put_net(dying->xpt_net);
+
+	/* See comment on corresponding get in xs_setup_bc_tcp(): */
+	if (dying->xpt_bc_xprt != NULL)
+		xprt_put(dying->xpt_bc_xprt);
+
+	dying->xpt_ops->xpo_free(dying);
+	module_put(mod);
+}
+
+/*
+ * Drop one reference on a transport; svc_xprt_free() runs when the
+ * last reference goes away.
+ */
+void svc_xprt_put(struct svc_xprt *xprt)
+{
+	struct kref *ref = &xprt->xpt_ref;
+
+	kref_put(ref, svc_xprt_free);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_put);
+
+/*
+ * Called by transport drivers to set up the transport-independent
+ * part of a transport instance.  The structure is zeroed first,
+ * wiping whatever it held before.  On return the transport has
+ * XPT_BUSY set and holds a reference on init_net.
+ */
+void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
+		   struct svc_serv *serv)
+{
+	memset(xprt, 0, sizeof(*xprt));
+
+	/* who the transport belongs to */
+	xprt->xpt_server = serv;
+	xprt->xpt_class = xcl;
+	xprt->xpt_ops = xcl->xcl_ops;
+	xprt->xpt_net = get_net(&init_net);
+
+	/* reference count and locks */
+	kref_init(&xprt->xpt_ref);
+	spin_lock_init(&xprt->xpt_lock);
+	mutex_init(&xprt->xpt_mutex);
+
+	/* list linkage and wait queue */
+	INIT_LIST_HEAD(&xprt->xpt_list);
+	INIT_LIST_HEAD(&xprt->xpt_ready);
+	INIT_LIST_HEAD(&xprt->xpt_deferred);
+	INIT_LIST_HEAD(&xprt->xpt_users);
+	rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
+
+	set_bit(XPT_BUSY, &xprt->xpt_flags);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_init);
+
+/*
+ * Build the wildcard listening address for the requested family and
+ * port and pass it to the class's xpo_create method.  Returns what
+ * xpo_create returns, or ERR_PTR(-EAFNOSUPPORT) for a family that is
+ * not handled here.
+ */
+static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
+					 struct svc_serv *serv,
+					 struct net *net,
+					 const int family,
+					 const unsigned short port,
+					 int flags)
+{
+	struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_ANY),
+		.sin_port		= htons(port),
+	};
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+		.sin6_port		= htons(port),
+	};
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+	struct sockaddr *sap = NULL;
+	size_t len = 0;
+
+	if (family == PF_INET) {
+		sap = (struct sockaddr *)&sin;
+		len = sizeof(sin);
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	if (family == PF_INET6) {
+		sap = (struct sockaddr *)&sin6;
+		len = sizeof(sin6);
+	}
+#endif	/* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+
+	/* no address was selected: the family is not one we know */
+	if (sap == NULL)
+		return ERR_PTR(-EAFNOSUPPORT);
+
+	return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
+}
+
+/*
+ * Create a permanent transport of the named class for serv and add
+ * it to serv->sv_permsocks.
+ *
+ * Returns the local port of the new transport on success, the error
+ * from the class's create method if that fails, or -EPROTONOSUPPORT
+ * when no class of that name is registered or its module could not
+ * be pinned.
+ */
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+		    struct net *net, const int family,
+		    const unsigned short port, int flags)
+{
+	struct svc_xprt_class *xcl, *match = NULL;
+	struct svc_xprt *newxprt;
+	unsigned short newport;
+
+	dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+
+	/* look the class up by name and pin its module under the lock */
+	spin_lock(&svc_xprt_class_lock);
+	list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+		if (strcmp(xprt_name, xcl->xcl_name) != 0)
+			continue;
+		if (try_module_get(xcl->xcl_owner))
+			match = xcl;
+		break;
+	}
+	spin_unlock(&svc_xprt_class_lock);
+
+	if (match == NULL) {
+		dprintk("svc: transport %s not found\n", xprt_name);
+
+		/* This errno is exposed to user space.  Provide a reasonable
+		 * perror msg for a bad transport. */
+		return -EPROTONOSUPPORT;
+	}
+
+	newxprt = __svc_xpo_create(match, serv, net, family, port, flags);
+	if (IS_ERR(newxprt)) {
+		module_put(match->xcl_owner);
+		return PTR_ERR(newxprt);
+	}
+
+	clear_bit(XPT_TEMP, &newxprt->xpt_flags);
+
+	spin_lock_bh(&serv->sv_lock);
+	list_add(&newxprt->xpt_list, &serv->sv_permsocks);
+	spin_unlock_bh(&serv->sv_lock);
+
+	/* read the port before XPT_BUSY is released */
+	newport = svc_xprt_local_port(newxprt);
+	clear_bit(XPT_BUSY, &newxprt->xpt_flags);
+	return newport;
+}
+EXPORT_SYMBOL_GPL(svc_create_xprt);
+
+/*
+ * Copy the transport's remote address, and the address part of its
+ * local address, into the rqstp structure.
+ */
+void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+	struct sockaddr *local = (struct sockaddr *)&xprt->xpt_local;
+
+	rqstp->rq_addrlen = xprt->xpt_remotelen;
+	memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
+
+	/*
+	 * The destination address of the request is kept so that the
+	 * source address of RPC replies/callbacks can be bound to it
+	 * later.  Other address families leave rq_daddr untouched.
+	 */
+	if (local->sa_family == AF_INET)
+		rqstp->rq_daddr.addr = ((struct sockaddr_in *)local)->sin_addr;
+	else if (local->sa_family == AF_INET6)
+		rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)local)->sin6_addr;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
+
+/**
+ * svc_print_addr - Format rq_addr field for printing
+ * @rqstp: svc_rqst struct containing address to print
+ * @buf: target buffer for formatted address
+ * @len: length of target buffer
+ *
+ * Thin wrapper that passes the request's address to __svc_print_addr()
+ * and returns that function's result.
+ */
+char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
+{
+	const struct sockaddr *peer = svc_addr(rqstp);
+
+	return __svc_print_addr(peer, buf, len);
+}
+EXPORT_SYMBOL_GPL(svc_print_addr);
+
+/*
+ * Park an idle server thread on its pool.  pool->sp_lock must be held.
+ *
+ * The thread goes on the front of sp_threads, so the list behaves as
+ * a stack rather than a queue: only as many different threads as are
+ * needed get used, and the rest don't pollute the cache.
+ */
+static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+	struct list_head *idle = &pool->sp_threads;
+
+	list_add(&rqstp->rq_list, idle);
+}
+
+/*
+ * Take a server thread off the idle list it is parked on.
+ * pool->sp_lock must be held; pool itself is not otherwise used.
+ */
+static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+	struct list_head *entry = &rqstp->rq_list;
+
+	list_del(entry);
+}
+
+/*
+ * Decide whether a transport is worth queueing.  A pending connect
+ * or close always counts; pending data or deferred requests count
+ * only if the provider reports write space.
+ */
+static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
+{
+	if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
+		return true;
+
+	if (!(xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))))
+		return false;
+
+	return xprt->xpt_ops->xpo_has_wspace(xprt);
+}
+
+/*
+ * Queue up a transport with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ * Returns straight away when svc_xprt_has_something_to_do() is false.
+ * Otherwise the pool for the CPU this runs on is locked, and a
+ * transport whose XPT_BUSY bit is already set is left alone.  With an
+ * idle thread on the pool the transport is handed directly to it --
+ * taking a transport reference and reserving sv_max_mesg bytes -- and
+ * the thread is woken; without one the transport is appended to the
+ * pool's sp_sockets list.
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
+{
+	struct svc_serv	*serv = xprt->xpt_server;
+	struct svc_pool *pool;
+	struct svc_rqst	*rqstp;
+	int cpu;
+
+	if (!svc_xprt_has_something_to_do(xprt))
+		return;
+
+	cpu = get_cpu();
+	pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
+	put_cpu();
+
+	spin_lock_bh(&pool->sp_lock);
+
+	if (!list_empty(&pool->sp_threads) &&
+	    !list_empty(&pool->sp_sockets))
+		printk(KERN_ERR
+		       "svc_xprt_enqueue: "
+		       "threads and transports both waiting??\n");
+
+	pool->sp_stats.packets++;	/* counted even if the transport turns out to be busy */
+
+	/* Mark transport as busy. It will remain in this state until
+	 * the provider calls svc_xprt_received. We update XPT_BUSY
+	 * atomically because it also guards against trying to enqueue
+	 * the transport twice.
+	 */
+	if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
+		/* Don't enqueue transport while already enqueued */
+		dprintk("svc: transport %p busy, not enqueued\n", xprt);
+		goto out_unlock;
+	}
+
+	if (!list_empty(&pool->sp_threads)) {
+		rqstp = list_entry(pool->sp_threads.next,
+				   struct svc_rqst,
+				   rq_list);
+		dprintk("svc: transport %p served by daemon %p\n",
+			xprt, rqstp);
+		svc_thread_dequeue(pool, rqstp);
+		if (rqstp->rq_xprt)
+			printk(KERN_ERR
+				"svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
+				rqstp, rqstp->rq_xprt);
+		rqstp->rq_xprt = xprt;
+		svc_xprt_get(xprt);	/* reference held on behalf of rqstp->rq_xprt */
+		rqstp->rq_reserved = serv->sv_max_mesg;
+		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+		pool->sp_stats.threads_woken++;
+		wake_up(&rqstp->rq_wait);
+	} else {
+		dprintk("svc: transport %p put into queue\n", xprt);
+		list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+		pool->sp_stats.sockets_queued++;
+	}
+
+out_unlock:
+	spin_unlock_bh(&pool->sp_lock);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
+
+/*
+ * Take the first transport off the pool's sp_sockets list and return
+ * it, or return NULL when the list is empty.  Must be called with
+ * the pool->sp_lock held.
+ */
+static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
+{
+	struct svc_xprt	*head = NULL;
+
+	if (!list_empty(&pool->sp_sockets)) {
+		head = list_entry(pool->sp_sockets.next,
+				  struct svc_xprt, xpt_ready);
+		list_del_init(&head->xpt_ready);
+
+		dprintk("svc: transport %p dequeued, inuse=%d\n",
+			head, atomic_read(&head->xpt_ref.refcount));
+	}
+
+	return head;
+}
+
+/*
+ * svc_xprt_received conditionally queues the transport for processing
+ * by another thread. The caller must hold the XPT_BUSY bit and must
+ * not thereafter touch transport data.
+ *
+ * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
+ * insufficient) data.
+ *
+ * Calling this without XPT_BUSY set is a BUG.  The bit is cleared
+ * here, and the transport is then offered to svc_xprt_enqueue() under
+ * a temporary reference.
+ */
+void svc_xprt_received(struct svc_xprt *xprt)
+{
+	BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
+	/* As soon as we clear busy, the xprt could be closed and
+	 * 'put', so we need a reference to call svc_xprt_enqueue with:
+	 */
+	svc_xprt_get(xprt);
+	clear_bit(XPT_BUSY, &xprt->xpt_flags);
+	svc_xprt_enqueue(xprt);
+	svc_xprt_put(xprt);	/* drops the temporary reference taken above */
+}
+EXPORT_SYMBOL_GPL(svc_xprt_received);
+
+/**
+ * svc_reserve - change the space reserved for the reply to a request.
+ * @rqstp:  The request in question
+ * @space: new max space to reserve
+ *
+ * Each request reserves some space on the output queue of the transport
+ * to make sure the reply fits.  This function reduces that reserved
+ * space to be the amount of space used already, plus @space.  The
+ * reservation is only ever shrunk; when it does shrink, the transport
+ * is passed to svc_xprt_enqueue().
+ *
+ */
+void svc_reserve(struct svc_rqst *rqstp, int space)
+{
+	struct svc_xprt *xprt;
+
+	/* what the reply head already uses counts towards the total */
+	space += rqstp->rq_res.head[0].iov_len;
+
+	if (space >= rqstp->rq_reserved)
+		return;
+
+	xprt = rqstp->rq_xprt;
+	atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
+	rqstp->rq_reserved = space;
+
+	svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_reserve);
+
+/*
+ * Detach a request from its transport once the request is finished:
+ * let the provider release its per-request state, free the deferred
+ * request and result pages, give back the reply-space reservation
+ * and drop the request's reference on the transport.
+ */
+static void svc_xprt_release(struct svc_rqst *rqstp)
+{
+	struct svc_xprt	*xprt = rqstp->rq_xprt;
+
+	xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+	kfree(rqstp->rq_deferred);
+	rqstp->rq_deferred = NULL;
+
+	svc_free_res_pages(rqstp);
+	rqstp->rq_res.page_base = 0;
+	rqstp->rq_res.page_len = 0;
+
+	/*
+	 * Before the response buffer is reset and the reservation
+	 * released, check that the reply fitted in what was reserved
+	 * for it; anything else is a bug.
+	 */
+	if (rqstp->rq_reserved < (rqstp->rq_res.len))
+		printk(KERN_ERR "RPC request reserved %d but used %d\n",
+		       rqstp->rq_reserved,
+		       rqstp->rq_res.len);
+
+	rqstp->rq_res.head[0].iov_len = 0;
+	svc_reserve(rqstp, 0);
+	rqstp->rq_xprt = NULL;
+
+	svc_xprt_put(xprt);
+}
+
+/*
+ * External function to wake up a server waiting for data
+ * This really only makes sense for services like lockd
+ * which have exactly one thread anyway.
+ */
+void svc_wake_up(struct svc_serv *serv)
+{
+	unsigned int idx;
+
+	/* Wake the first thread queued on sp_threads of every pool. */
+	for (idx = 0; idx < serv->sv_nrpools; idx++) {
+		struct svc_pool *pool = &serv->sv_pools[idx];
+		struct svc_rqst *first;
+
+		spin_lock_bh(&pool->sp_lock);
+		if (!list_empty(&pool->sp_threads)) {
+			first = list_first_entry(&pool->sp_threads,
+						 struct svc_rqst, rq_list);
+			dprintk("svc: daemon %p woken up.\n", first);
+			/*
+			 * The thread is only woken: it is not dequeued
+			 * here and its rq_xprt is left untouched.
+			 */
+			wake_up(&first->rq_wait);
+		}
+		spin_unlock_bh(&pool->sp_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(svc_wake_up);
+
+int svc_port_is_privileged(struct sockaddr *sin)
+{
+	unsigned short port;
+
+	/* Pull the port out of whichever address family we were given. */
+	switch (sin->sa_family) {
+	case AF_INET:
+		port = ntohs(((struct sockaddr_in *)sin)->sin_port);
+		break;
+	case AF_INET6:
+		port = ntohs(((struct sockaddr_in6 *)sin)->sin6_port);
+		break;
+	default:
+		/* Any other family is never reported as privileged. */
+		return 0;
+	}
+
+	return port < PROT_SOCK;
+}
+
+/*
+ * Make sure that we don't have too many active connections. If we have,
+ * something must be dropped. It's not clear what will happen if we allow
+ * "too many" connections, but when dealing with network-facing software,
+ * we have to code defensively. Here we do that by imposing hard limits.
+ *
+ * There's no point in trying to do random drop here for DoS
+ * prevention. An NFS client does 1 reconnect in 15 seconds. An
+ * attacker can easily beat that.
+ *
+ * The only somewhat efficient mechanism would be to drop old
+ * connections from the same IP first. But right now we don't even
+ * record the client IP in svc_sock.
+ *
+ * single-threaded services that expect a lot of clients will probably
+ * need to set sv_maxconn to override the default value which is based
+ * on the number of threads
+ */
+static void svc_check_conn_limits(struct svc_serv *serv)
+{
+	struct svc_xprt *victim = NULL;
+	unsigned int limit;
+
+	/* An explicit sv_maxconn wins; otherwise scale with thread count. */
+	if (serv->sv_maxconn)
+		limit = serv->sv_maxconn;
+	else
+		limit = (serv->sv_nrthreads+3) * 20;
+
+	if (serv->sv_tmpcnt <= limit)
+		return;
+
+	spin_lock_bh(&serv->sv_lock);
+	if (!list_empty(&serv->sv_tempsocks)) {
+		/* Try to help the admin */
+		if (net_ratelimit())
+			printk(KERN_NOTICE "%s: too many open  "
+			       "connections, consider increasing %s\n",
+			       serv->sv_name, serv->sv_maxconn ?
+			       "the max number of connections." :
+			       "the number of threads.");
+		/*
+		 * Always pick the oldest connection, i.e. the tail of
+		 * sv_tempsocks.  Not fair, but simple.
+		 */
+		victim = list_entry(serv->sv_tempsocks.prev,
+				    struct svc_xprt, xpt_list);
+		set_bit(XPT_CLOSE, &victim->xpt_flags);
+		svc_xprt_get(victim);
+	}
+	spin_unlock_bh(&serv->sv_lock);
+
+	if (!victim)
+		return;
+
+	/* Queue the flagged transport, then drop the reference taken above. */
+	svc_xprt_enqueue(victim);
+	svc_xprt_put(victim);
+}
+
+/*
+ * Receive the next request on any transport.  This code is carefully
+ * organised not to touch any cachelines in the shared svc_serv
+ * structure, only cachelines in the local svc_pool.
+ *
+ * @rqstp:   the calling server thread's request structure
+ * @timeout: passed to schedule_timeout() when no transport is ready
+ *
+ * Returns the value produced by the transport's xpo_recvfrom() (or by
+ * svc_deferred_recv() for a revisited request) when that value is
+ * neither 0 nor -EAGAIN.  Returns -EINTR if the thread was signalled or
+ * asked to stop, and -EAGAIN when there is nothing to process (no
+ * transport became ready, the transport was closed, a connection was
+ * accepted, or no complete data was available).
+ */
+int svc_recv(struct svc_rqst *rqstp, long timeout)
+{
+	struct svc_xprt		*xprt = NULL;
+	struct svc_serv		*serv = rqstp->rq_server;
+	struct svc_pool		*pool = rqstp->rq_pool;
+	int			len, i;
+	int			pages;
+	struct xdr_buf		*arg;
+	DECLARE_WAITQUEUE(wait, current);
+	long			time_left;
+
+	dprintk("svc: server %p waiting for data (to = %ld)\n",
+		rqstp, timeout);
+
+	if (rqstp->rq_xprt)
+		printk(KERN_ERR
+			"svc_recv: service %p, transport not NULL!\n",
+			 rqstp);
+	if (waitqueue_active(&rqstp->rq_wait))
+		printk(KERN_ERR
+			"svc_recv: service %p, wait queue active!\n",
+			 rqstp);
+
+	/*
+	 * rq_pages[0..pages] is written below (slot 'pages' receives the
+	 * NULL terminator), so the page count is validated before the
+	 * array is touched rather than afterwards.
+	 */
+	pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
+	BUG_ON(pages >= RPCSVC_MAXPAGES);
+
+	/* now allocate needed pages.  If we get a failure, sleep briefly */
+	for (i = 0; i < pages ; i++)
+		while (rqstp->rq_pages[i] == NULL) {
+			struct page *p = alloc_page(GFP_KERNEL);
+			if (!p) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (signalled() || kthread_should_stop()) {
+					set_current_state(TASK_RUNNING);
+					return -EINTR;
+				}
+				schedule_timeout(msecs_to_jiffies(500));
+			}
+			rqstp->rq_pages[i] = p;
+		}
+	rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+
+	/* Make arg->head point to first page and arg->pages point to rest */
+	arg = &rqstp->rq_arg;
+	arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
+	arg->head[0].iov_len = PAGE_SIZE;
+	arg->pages = rqstp->rq_pages + 1;
+	arg->page_base = 0;
+	/* save at least one page for response */
+	arg->page_len = (pages-2)*PAGE_SIZE;
+	arg->len = (pages-1)*PAGE_SIZE;
+	arg->tail[0].iov_len = 0;
+
+	try_to_freeze();
+	cond_resched();
+	if (signalled() || kthread_should_stop())
+		return -EINTR;
+
+	/* Normally we will wait up to 5 seconds for any required
+	 * cache information to be provided.
+	 */
+	rqstp->rq_chandle.thread_wait = 5*HZ;
+
+	spin_lock_bh(&pool->sp_lock);
+	xprt = svc_xprt_dequeue(pool);
+	if (xprt) {
+		/* A transport was already waiting: bind it to this request. */
+		rqstp->rq_xprt = xprt;
+		svc_xprt_get(xprt);
+		rqstp->rq_reserved = serv->sv_max_mesg;
+		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+
+		/* As there is a shortage of threads and this request
+		 * had to be queued, don't allow the thread to wait so
+		 * long for cache updates.
+		 */
+		rqstp->rq_chandle.thread_wait = 1*HZ;
+	} else {
+		/* No data pending. Go to sleep */
+		svc_thread_enqueue(pool, rqstp);
+
+		/*
+		 * We have to be able to interrupt this wait
+		 * to bring down the daemons ...
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/*
+		 * checking kthread_should_stop() here allows us to avoid
+		 * locking and signalling when stopping kthreads that call
+		 * svc_recv. If the thread has already been woken up, then
+		 * we can exit here without sleeping. If not, then
+		 * it'll be woken up quickly during the schedule_timeout
+		 */
+		if (kthread_should_stop()) {
+			set_current_state(TASK_RUNNING);
+			spin_unlock_bh(&pool->sp_lock);
+			return -EINTR;
+		}
+
+		add_wait_queue(&rqstp->rq_wait, &wait);
+		spin_unlock_bh(&pool->sp_lock);
+
+		time_left = schedule_timeout(timeout);
+
+		try_to_freeze();
+
+		spin_lock_bh(&pool->sp_lock);
+		remove_wait_queue(&rqstp->rq_wait, &wait);
+		if (!time_left)
+			pool->sp_stats.threads_timedout++;
+
+		/* rq_xprt is still NULL if nothing was handed to us. */
+		xprt = rqstp->rq_xprt;
+		if (!xprt) {
+			svc_thread_dequeue(pool, rqstp);
+			spin_unlock_bh(&pool->sp_lock);
+			dprintk("svc: server %p, no data yet\n", rqstp);
+			if (signalled() || kthread_should_stop())
+				return -EINTR;
+			else
+				return -EAGAIN;
+		}
+	}
+	spin_unlock_bh(&pool->sp_lock);
+
+	len = 0;
+	if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
+		dprintk("svc_recv: found XPT_CLOSE\n");
+		svc_delete_xprt(xprt);
+		/* Leave XPT_BUSY set on the dead xprt: */
+		goto out;
+	}
+	if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+		struct svc_xprt *newxpt;
+		newxpt = xprt->xpt_ops->xpo_accept(xprt);
+		if (newxpt) {
+			/*
+			 * We know this module_get will succeed because the
+			 * listener holds a reference too
+			 */
+			__module_get(newxpt->xpt_class->xcl_owner);
+			svc_check_conn_limits(xprt->xpt_server);
+			/* Track the new connection as a temporary transport. */
+			spin_lock_bh(&serv->sv_lock);
+			set_bit(XPT_TEMP, &newxpt->xpt_flags);
+			list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
+			serv->sv_tmpcnt++;
+			if (serv->sv_temptimer.function == NULL) {
+				/* setup timer to age temp transports */
+				setup_timer(&serv->sv_temptimer,
+					    svc_age_temp_xprts,
+					    (unsigned long)serv);
+				mod_timer(&serv->sv_temptimer,
+					  jiffies + svc_conn_age_period * HZ);
+			}
+			spin_unlock_bh(&serv->sv_lock);
+			svc_xprt_received(newxpt);
+		}
+	} else if (xprt->xpt_ops->xpo_has_wspace(xprt)) {
+		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
+			rqstp, pool->sp_id, xprt,
+			atomic_read(&xprt->xpt_ref.refcount));
+		/* A revisited deferred request takes priority over new data. */
+		rqstp->rq_deferred = svc_deferred_dequeue(xprt);
+		if (rqstp->rq_deferred)
+			len = svc_deferred_recv(rqstp);
+		else
+			len = xprt->xpt_ops->xpo_recvfrom(rqstp);
+		dprintk("svc: got len=%d\n", len);
+	}
+	svc_xprt_received(xprt);
+
+	/* No data, incomplete (TCP) read, or accept() */
+	if (len == 0 || len == -EAGAIN)
+		goto out;
+
+	/* The transport is in use again, so it is no longer "old". */
+	clear_bit(XPT_OLD, &xprt->xpt_flags);
+
+	rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
+	rqstp->rq_chandle.defer = svc_defer;
+
+	if (serv->sv_stats)
+		serv->sv_stats->netcnt++;
+	return len;
+out:
+	rqstp->rq_res.len = 0;
+	svc_xprt_release(rqstp);
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(svc_recv);
+
+/*
+ * Drop request
+ */
+void svc_drop(struct svc_rqst *rqstp)
+{
+	dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
+	/* Nothing is sent; just release the request's hold on the transport. */
+	svc_xprt_release(rqstp);
+}
+EXPORT_SYMBOL_GPL(svc_drop);
+
+/*
+ * Return reply to client.
+ */
+int svc_send(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct xdr_buf *res = &rqstp->rq_res;
+	int sent;
+
+	if (xprt == NULL)
+		return -EFAULT;
+
+	/* release the receive skb before sending the reply */
+	xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+	/* Total reply length: head + page data + tail. */
+	res->len = res->head[0].iov_len +
+		res->page_len +
+		res->tail[0].iov_len;
+
+	/* The mutex serializes outgoing data on this transport. */
+	mutex_lock(&xprt->xpt_mutex);
+	if (!test_bit(XPT_DEAD, &xprt->xpt_flags))
+		sent = xprt->xpt_ops->xpo_sendto(rqstp);
+	else
+		sent = -ENOTCONN;
+	mutex_unlock(&xprt->xpt_mutex);
+
+	rpc_wake_up(&xprt->xpt_bc_pending);
+	svc_xprt_release(rqstp);
+
+	/* These three failures are reported to the caller as 0. */
+	switch (sent) {
+	case -ECONNREFUSED:
+	case -ENOTCONN:
+	case -EAGAIN:
+		return 0;
+	default:
+		return sent;
+	}
+}
+
+/*
+ * Timer function to close old temporary transports, using
+ * a mark-and-sweep algorithm.
+ */
+static void svc_age_temp_xprts(unsigned long closure)
+{
+	/* The timer argument is the svc_serv given to setup_timer(). */
+	struct svc_serv *serv = (struct svc_serv *)closure;
+	struct svc_xprt *xprt;
+	struct list_head *le, *next;
+	LIST_HEAD(to_be_aged);
+
+	dprintk("svc_age_temp_xprts\n");
+
+	if (!spin_trylock_bh(&serv->sv_lock)) {
+		/* busy, try again 1 sec later */
+		dprintk("svc_age_temp_xprts: busy\n");
+		mod_timer(&serv->sv_temptimer, jiffies + HZ);
+		return;
+	}
+
+	list_for_each_safe(le, next, &serv->sv_tempsocks) {
+		xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+		/* First time through, just mark it OLD. Second time
+		 * through, close it. */
+		if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
+			continue;
+		/* Leave alone anything with extra references or marked busy. */
+		if (atomic_read(&xprt->xpt_ref.refcount) > 1 ||
+		    test_bit(XPT_BUSY, &xprt->xpt_flags))
+			continue;
+		/* Hold a reference while it sits on the private list. */
+		svc_xprt_get(xprt);
+		list_move(le, &to_be_aged);
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		set_bit(XPT_DETACHED, &xprt->xpt_flags);
+	}
+	spin_unlock_bh(&serv->sv_lock);
+
+	/* Process the collected transports without holding sv_lock. */
+	while (!list_empty(&to_be_aged)) {
+		le = to_be_aged.next;
+		/* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
+		list_del_init(le);
+		xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+		dprintk("queuing xprt %p for closing\n", xprt);
+
+		/* a thread will dequeue and close it soon */
+		svc_xprt_enqueue(xprt);
+		svc_xprt_put(xprt);
+	}
+
+	/* Re-arm the timer for the next sweep. */
+	mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
+}
+
+/*
+ * Unlink every entry on xprt->xpt_users and invoke its callback.
+ * The callbacks run with xprt->xpt_lock held.
+ */
+static void call_xpt_users(struct svc_xprt *xprt)
+{
+	struct svc_xpt_user *u;
+
+	spin_lock(&xprt->xpt_lock);
+	while (!list_empty(&xprt->xpt_users)) {
+		u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list);
+		/* Take the entry off the list before calling into it. */
+		list_del(&u->list);
+		u->callback(u);
+	}
+	spin_unlock(&xprt->xpt_lock);
+}
+
+/*
+ * Remove a dead transport
+ */
+void svc_delete_xprt(struct svc_xprt *xprt)
+{
+	struct svc_serv	*serv = xprt->xpt_server;
+	struct svc_deferred_req *dr;
+
+	/* Only do this once */
+	if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
+		BUG();
+
+	dprintk("svc: svc_delete_xprt(%p)\n", xprt);
+	xprt->xpt_ops->xpo_detach(xprt);
+
+	spin_lock_bh(&serv->sv_lock);
+	/* Unlink from the server's list unless already marked detached. */
+	if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
+		list_del_init(&xprt->xpt_list);
+	/* A transport being deleted must not still sit on a ready queue. */
+	BUG_ON(!list_empty(&xprt->xpt_ready));
+	/* Temporary transports are counted in sv_tmpcnt. */
+	if (test_bit(XPT_TEMP, &xprt->xpt_flags))
+		serv->sv_tmpcnt--;
+	spin_unlock_bh(&serv->sv_lock);
+
+	/* Free any deferred requests still queued on the transport. */
+	while ((dr = svc_deferred_dequeue(xprt)) != NULL)
+		kfree(dr);
+
+	call_xpt_users(xprt);
+	svc_xprt_put(xprt);
+}
+
+void svc_close_xprt(struct svc_xprt *xprt)
+{
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+
+	/*
+	 * If we are the one to take XPT_BUSY, delete the transport right
+	 * here: svc_close_xprt() is expected to work even when no threads
+	 * are running (e.g., while configuring the server before starting
+	 * any threads).  If XPT_BUSY was already set, someone else will
+	 * have to effect the close.
+	 */
+	if (!test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
+		svc_delete_xprt(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_close_xprt);
+
+static void svc_close_list(struct list_head *xprt_list)
+{
+	struct list_head *pos;
+
+	/* Flag every transport on the list as closing and busy. */
+	list_for_each(pos, xprt_list) {
+		struct svc_xprt *xprt = list_entry(pos, struct svc_xprt,
+						   xpt_list);
+
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		set_bit(XPT_BUSY, &xprt->xpt_flags);
+	}
+}
+
+/*
+ * Close and delete every temporary and permanent transport of @serv.
+ * Both lists are expected to be empty on return (enforced by BUG_ON).
+ */
+void svc_close_all(struct svc_serv *serv)
+{
+	struct svc_pool *pool;
+	struct svc_xprt *xprt;
+	struct svc_xprt *tmp;
+	int i;
+
+	/* Mark every transport CLOSE and BUSY before touching the queues. */
+	svc_close_list(&serv->sv_tempsocks);
+	svc_close_list(&serv->sv_permsocks);
+
+	/* Empty each pool's queue of ready transports. */
+	for (i = 0; i < serv->sv_nrpools; i++) {
+		pool = &serv->sv_pools[i];
+
+		spin_lock_bh(&pool->sp_lock);
+		while (!list_empty(&pool->sp_sockets)) {
+			xprt = list_first_entry(&pool->sp_sockets, struct svc_xprt, xpt_ready);
+			list_del_init(&xprt->xpt_ready);
+		}
+		spin_unlock_bh(&pool->sp_lock);
+	}
+	/*
+	 * At this point the sp_sockets lists will stay empty, since
+	 * svc_enqueue will not add new entries without taking the
+	 * sp_lock and checking XPT_BUSY.
+	 */
+	/* _safe iteration: svc_delete_xprt() unlinks xprt from the list. */
+	list_for_each_entry_safe(xprt, tmp, &serv->sv_tempsocks, xpt_list)
+		svc_delete_xprt(xprt);
+	list_for_each_entry_safe(xprt, tmp, &serv->sv_permsocks, xpt_list)
+		svc_delete_xprt(xprt);
+
+	BUG_ON(!list_empty(&serv->sv_permsocks));
+	BUG_ON(!list_empty(&serv->sv_tempsocks));
+}
+
+/*
+ * Handle defer and revisit of requests
+ */
+
+/*
+ * Revisit callback installed by svc_defer().  Either discards the saved
+ * request (when @too_many is set or the transport is dead) or queues it
+ * on the transport's xpt_deferred list and enqueues the transport.
+ */
+static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+{
+	struct svc_deferred_req *dr =
+		container_of(dreq, struct svc_deferred_req, handle);
+	struct svc_xprt *xprt = dr->xprt;
+
+	spin_lock(&xprt->xpt_lock);
+	set_bit(XPT_DEFERRED, &xprt->xpt_flags);
+	if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) {
+		/* Give up: drop the transport reference and the saved copy. */
+		spin_unlock(&xprt->xpt_lock);
+		dprintk("revisit canceled\n");
+		svc_xprt_put(xprt);
+		kfree(dr);
+		return;
+	}
+	dprintk("revisit queued\n");
+	/* Queue the saved request where svc_deferred_dequeue() finds it. */
+	dr->xprt = NULL;
+	list_add(&dr->handle.recent, &xprt->xpt_deferred);
+	spin_unlock(&xprt->xpt_lock);
+	svc_xprt_enqueue(xprt);
+	/* Drop the reference that svc_defer() took for dr->xprt. */
+	svc_xprt_put(xprt);
+}
+
+/*
+ * Save the request off for later processing. The request buffer looks
+ * like this:
+ *
+ * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
+ *
+ * This code can only handle requests that consist of an xprt-header
+ * and rpc-header.
+ */
+static struct cache_deferred_req *svc_defer(struct cache_req *req)
+{
+	struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
+	struct svc_deferred_req *dr;
+
+	/* Returning NULL tells the caller the request was not deferred. */
+	if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral)
+		return NULL; /* if more than a page, give up FIXME */
+	if (rqstp->rq_deferred) {
+		/* Already a revisited request: reuse its saved copy. */
+		dr = rqstp->rq_deferred;
+		rqstp->rq_deferred = NULL;
+	} else {
+		size_t skip;
+		size_t size;
+		/* FIXME maybe discard if size too large */
+		size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
+		dr = kmalloc(size, GFP_KERNEL);
+		if (dr == NULL)
+			return NULL;
+
+		dr->handle.owner = rqstp->rq_server;
+		dr->prot = rqstp->rq_prot;
+		memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
+		dr->addrlen = rqstp->rq_addrlen;
+		dr->daddr = rqstp->rq_daddr;
+		/* argslen is stored as rq_arg.len >> 2 and shifted back on use. */
+		dr->argslen = rqstp->rq_arg.len >> 2;
+		dr->xprt_hlen = rqstp->rq_xprt_hlen;
+
+		/* back up head to the start of the buffer and copy */
+		skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+		memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
+		       dr->argslen << 2);
+	}
+	/* The saved request holds its own transport reference. */
+	svc_xprt_get(rqstp->rq_xprt);
+	dr->xprt = rqstp->rq_xprt;
+	rqstp->rq_dropme = true;
+
+	dr->handle.revisit = svc_revisit;
+	return &dr->handle;
+}
+
+/*
+ * recv data from a deferred request into an active one
+ */
+static int svc_deferred_recv(struct svc_rqst *rqstp)
+{
+	struct svc_deferred_req *dr = rqstp->rq_deferred;
+	struct xdr_buf *arg = &rqstp->rq_arg;
+
+	/* Restore the peer/transport details saved with the request. */
+	rqstp->rq_prot = dr->prot;
+	memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
+	rqstp->rq_addrlen = dr->addrlen;
+	rqstp->rq_daddr = dr->daddr;
+	/* Save off transport header len in case we get deferred again */
+	rqstp->rq_xprt_hlen = dr->xprt_hlen;
+
+	/*
+	 * Point the argument head just past the transport header:
+	 * iov_len excludes the header bytes, arg->len includes them.
+	 */
+	arg->head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
+	arg->head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
+	arg->page_len = 0;
+	arg->len = dr->argslen<<2;
+
+	rqstp->rq_respages = rqstp->rq_pages;
+	return (dr->argslen<<2) - dr->xprt_hlen;
+}
+
+
+/*
+ * Take the first deferred request off @xprt's xpt_deferred list.
+ * Returns NULL if XPT_DEFERRED is clear or the list is empty.
+ */
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
+{
+	struct svc_deferred_req *first = NULL;
+
+	/* Unlocked fast path: the flag says nothing is queued. */
+	if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
+		return NULL;
+
+	spin_lock(&xprt->xpt_lock);
+	if (list_empty(&xprt->xpt_deferred)) {
+		/* List is empty, so clear the flag. */
+		clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
+	} else {
+		first = list_first_entry(&xprt->xpt_deferred,
+					 struct svc_deferred_req,
+					 handle.recent);
+		list_del_init(&first->handle.recent);
+	}
+	spin_unlock(&xprt->xpt_lock);
+	return first;
+}
+
+/**
+ * svc_find_xprt - find an RPC transport instance
+ * @serv: pointer to svc_serv to search
+ * @xcl_name: C string containing transport's class name
+ * @af: Address family of transport's local address
+ * @port: transport's IP port number
+ *
+ * Return the transport instance pointer for the endpoint accepting
+ * connections/peer traffic from the specified transport class,
+ * address family and port.
+ *
+ * Specifying 0 for the address family or port is effectively a
+ * wild-card, and will result in matching the first transport in the
+ * service's list that has a matching class name.
+ */
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+			       const sa_family_t af, const unsigned short port)
+{
+	struct svc_xprt *cur;
+	struct svc_xprt *match = NULL;
+
+	/* Sanity check the args */
+	if (!serv || !xcl_name)
+		return NULL;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(cur, &serv->sv_permsocks, xpt_list) {
+		/* Class name must match; af and port may be wild-cards. */
+		if (strcmp(cur->xpt_class->xcl_name, xcl_name) != 0 ||
+		    (af != AF_UNSPEC && af != cur->xpt_local.ss_family) ||
+		    (port != 0 && port != svc_xprt_local_port(cur)))
+			continue;
+
+		/* The caller receives its own reference on the match. */
+		svc_xprt_get(cur);
+		match = cur;
+		break;
+	}
+	spin_unlock_bh(&serv->sv_lock);
+	return match;
+}
+EXPORT_SYMBOL_GPL(svc_find_xprt);
+
+/*
+ * Format "<class name> <local port>\n" for @xprt into @pos.
+ * Returns the number of characters written, or -ENAMETOOLONG if the
+ * line does not fit in @remaining bytes.
+ */
+static int svc_one_xprt_name(const struct svc_xprt *xprt,
+			     char *pos, int remaining)
+{
+	int written = snprintf(pos, remaining, "%s %u\n",
+			       xprt->xpt_class->xcl_name,
+			       svc_xprt_local_port(xprt));
+
+	/* snprintf() reports the untruncated length; reject truncation. */
+	return (written < remaining) ? written : -ENAMETOOLONG;
+}
+
+/**
+ * svc_xprt_names - format a buffer with a list of transport names
+ * @serv: pointer to an RPC service
+ * @buf: pointer to a buffer to be filled in
+ * @buflen: length of buffer to be filled in
+ *
+ * Fills in @buf with a string containing a list of transport names,
+ * each name terminated with '\n'.
+ *
+ * Returns positive length of the filled-in string on success; otherwise
+ * a negative errno value is returned if an error occurs.
+ */
+int svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen)
+{
+	struct svc_xprt *xprt;
+	char *cursor = buf;
+	int used = 0;
+
+	/* Sanity check args */
+	if (serv == NULL)
+		return 0;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+		int n = svc_one_xprt_name(xprt, cursor, buflen - used);
+
+		if (n < 0) {
+			/* Failure: return an empty string and the errno. */
+			*buf = '\0';
+			used = n;
+			break;
+		}
+		if (n == 0)
+			break;
+
+		cursor += n;
+		used += n;
+	}
+	spin_unlock_bh(&serv->sv_lock);
+
+	return used;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_names);
+
+
+/*----------------------------------------------------------------------------*/
+
+static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos)
+{
+	struct svc_serv *serv = m->private;
+	unsigned int pidx = (unsigned int)*pos;
+
+	dprintk("svc_pool_stats_start, *pidx=%u\n", pidx);
+
+	/* Position 0 is the header line; position N maps to pool N-1. */
+	if (pidx == 0)
+		return SEQ_START_TOKEN;
+	if (pidx > serv->sv_nrpools)
+		return NULL;
+	return &serv->sv_pools[pidx-1];
+}
+
+static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct svc_serv *serv = m->private;
+	struct svc_pool *next;
+
+	dprintk("svc_pool_stats_next, *pos=%llu\n", *pos);
+
+	if (p == SEQ_START_TOKEN) {
+		/* The header is followed by the first pool. */
+		next = &serv->sv_pools[0];
+	} else {
+		struct svc_pool *cur = p;
+		unsigned int pidx = (cur - &serv->sv_pools[0]);
+
+		/* Advance to the following pool, or stop after the last. */
+		next = (pidx < serv->sv_nrpools-1) ?
+			&serv->sv_pools[pidx+1] : NULL;
+	}
+	++*pos;
+	return next;
+}
+
+static void svc_pool_stats_stop(struct seq_file *m, void *p)
+{
+	/* Nothing to undo: the start/next callbacks take no locks or refs. */
+}
+
+static int svc_pool_stats_show(struct seq_file *m, void *p)
+{
+	if (p != SEQ_START_TOKEN) {
+		/* One line of counters per pool. */
+		struct svc_pool *pool = p;
+
+		seq_printf(m, "%u %lu %lu %lu %lu\n",
+			pool->sp_id,
+			pool->sp_stats.packets,
+			pool->sp_stats.sockets_queued,
+			pool->sp_stats.threads_woken,
+			pool->sp_stats.threads_timedout);
+	} else {
+		/* The start token stands for the column-header line. */
+		seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken threads-timedout\n");
+	}
+
+	return 0;
+}
+
+/* seq_file iterator callbacks, installed by svc_pool_stats_open(). */
+static const struct seq_operations svc_pool_stats_seq_ops = {
+	.start	= svc_pool_stats_start,
+	.next	= svc_pool_stats_next,
+	.stop	= svc_pool_stats_stop,
+	.show	= svc_pool_stats_show,
+};
+
+/*
+ * Open @file as a seq_file over the pool statistics of @serv.
+ * Returns 0 on success or the error returned by seq_open().
+ */
+int svc_pool_stats_open(struct svc_serv *serv, struct file *file)
+{
+	struct seq_file *seq;
+	int err = seq_open(file, &svc_pool_stats_seq_ops);
+
+	if (err)
+		return err;
+
+	/* After seq_open(), file->private_data holds the seq_file. */
+	seq = file->private_data;
+	seq->private = serv;
+	return 0;
+}
+EXPORT_SYMBOL(svc_pool_stats_open);
+
+/*----------------------------------------------------------------------------*/
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
new file mode 100644
index 00000000..7963569f
--- /dev/null
+++ b/net/sunrpc/svcauth.c
@@ -0,0 +1,165 @@
+/*
+ * linux/net/sunrpc/svcauth.c
+ *
+ * The generic interface for RPC authentication on the server side.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ *
+ * CHANGES
+ * 19-Apr-2000 Chris Evans      - Security fix
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/err.h>
+#include <linux/hash.h>
+
+#define RPCDBG_FACILITY	RPCDBG_AUTH
+
+
+/*
+ * Table of authenticators
+ */
+/* The two flavours that are present in the table from the start. */
+extern struct auth_ops svcauth_null;
+extern struct auth_ops svcauth_unix;
+
+/* authtab_lock guards every read and write of authtab[] in this file. */
+static DEFINE_SPINLOCK(authtab_lock);
+/* Indexed by flavour number; unset slots are NULL. */
+static struct auth_ops	*authtab[RPC_AUTH_MAXFLAVOR] = {
+	[0] = &svcauth_null,
+	[1] = &svcauth_unix,
+};
+
+/*
+ * Select the authentication flavour named in the request and run its
+ * ->accept() method.  Returns SVC_DENIED with *authp set to
+ * rpc_autherr_badcred when the flavour is unknown or its module cannot
+ * be pinned; otherwise returns whatever ->accept() returns.
+ */
+int
+svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
+{
+	struct auth_ops		*aops = NULL;
+	rpc_authflavor_t	flavor;
+
+	*authp = rpc_auth_ok;
+
+	/* Read the flavour from the request head via svc_getnl(). */
+	flavor = svc_getnl(&rqstp->rq_arg.head[0]);
+
+	dprintk("svc: svc_authenticate (%d)\n", flavor);
+
+	/* Look up the flavour and pin its module under the table lock. */
+	spin_lock(&authtab_lock);
+	if (flavor < RPC_AUTH_MAXFLAVOR)
+		aops = authtab[flavor];
+	if (aops == NULL || !try_module_get(aops->owner)) {
+		spin_unlock(&authtab_lock);
+		*authp = rpc_autherr_badcred;
+		return SVC_DENIED;
+	}
+	spin_unlock(&authtab_lock);
+
+	/* svc_authorise() later drops the module reference taken above. */
+	rqstp->rq_authop = aops;
+	return aops->accept(rqstp, authp);
+}
+EXPORT_SYMBOL_GPL(svc_authenticate);
+
+int svc_set_client(struct svc_rqst *rqstp)
+{
+	/* Delegate to the flavour recorded in rq_authop by svc_authenticate(). */
+	return rqstp->rq_authop->set_client(rqstp);
+}
+EXPORT_SYMBOL_GPL(svc_set_client);
+
+/* A request, which was authenticated, has now executed.
+ * Time to finalise the credentials and verifier
+ * and release any resources.
+ *
+ * Returns the flavour's ->release() result, or 0 if no flavour was
+ * attached to the request.
+ */
+int svc_authorise(struct svc_rqst *rqstp)
+{
+	struct auth_ops *aops = rqstp->rq_authop;
+	int rv;
+
+	/* Detach the flavour from the request before releasing it. */
+	rqstp->rq_authop = NULL;
+	if (aops == NULL)
+		return 0;
+
+	rv = aops->release(rqstp);
+	/* Balances the try_module_get() done in svc_authenticate(). */
+	module_put(aops->owner);
+	return rv;
+}
+
+/*
+ * Install @aops as the handler for @flavor.  Returns 0 on success, or
+ * -EINVAL if @flavor is out of range or the slot is already occupied.
+ */
+int
+svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
+{
+	int rv;
+
+	spin_lock(&authtab_lock);
+	if (flavor >= RPC_AUTH_MAXFLAVOR || authtab[flavor] != NULL) {
+		/* Out of range, or the slot is already taken. */
+		rv = -EINVAL;
+	} else {
+		authtab[flavor] = aops;
+		rv = 0;
+	}
+	spin_unlock(&authtab_lock);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(svc_auth_register);
+
+/* Clear the authtab slot for @flavor; out-of-range values are ignored. */
+void
+svc_auth_unregister(rpc_authflavor_t flavor)
+{
+	spin_lock(&authtab_lock);
+	if (flavor < RPC_AUTH_MAXFLAVOR)
+		authtab[flavor] = NULL;
+	spin_unlock(&authtab_lock);
+}
+EXPORT_SYMBOL_GPL(svc_auth_unregister);
+
+/**************************************************
+ * 'auth_domains' are stored in a hash table indexed by name.
+ * When the last reference to an 'auth_domain' is dropped,
+ * the object is unhashed and freed.
+ * If auth_domain_lookup fails to find an entry, it will return
+ * its second argument 'new'.  If this is non-null, it will
+ * have been atomically linked into the table.
+ */
+
+/* The table has 2^DN_HASHBITS buckets. */
+#define	DN_HASHBITS	6
+#define	DN_HASHMAX	(1<<DN_HASHBITS)
+
+/* Buckets are selected with hash_str(name, DN_HASHBITS). */
+static struct hlist_head	auth_domain_table[DN_HASHMAX];
+/* Held around every lookup, insertion and removal in this file. */
+static spinlock_t	auth_domain_lock =
+	__SPIN_LOCK_UNLOCKED(auth_domain_lock);
+
+void auth_domain_put(struct auth_domain *dom)
+{
+	/*
+	 * atomic_dec_and_lock() returns true, with auth_domain_lock held,
+	 * only when this drop took the count to zero.
+	 */
+	if (!atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock))
+		return;
+
+	/* Last reference: unhash and release while holding the lock. */
+	hlist_del(&dom->hash);
+	dom->flavour->domain_release(dom);
+	spin_unlock(&auth_domain_lock);
+}
+EXPORT_SYMBOL_GPL(auth_domain_put);
+
+struct auth_domain *
+auth_domain_lookup(char *name, struct auth_domain *new)
+{
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+	struct auth_domain *dom;
+	struct auth_domain *result = new;
+
+	bucket = &auth_domain_table[hash_str(name, DN_HASHBITS)];
+
+	spin_lock(&auth_domain_lock);
+	hlist_for_each_entry(dom, node, bucket, hash) {
+		if (strcmp(dom->name, name) != 0)
+			continue;
+		/* Existing entry wins; return it with an extra reference. */
+		kref_get(&dom->ref);
+		result = dom;
+		goto out;
+	}
+	/* Not found: link in the caller's entry, if one was supplied. */
+	if (new)
+		hlist_add_head(&new->hash, bucket);
+out:
+	spin_unlock(&auth_domain_lock);
+	return result;
+}
+EXPORT_SYMBOL_GPL(auth_domain_lookup);
+
+/*
+ * Pure lookup: nothing is inserted.  Returns the matching domain with
+ * an extra reference, or NULL if @name is not in the table.
+ */
+struct auth_domain *auth_domain_find(char *name)
+{
+	return auth_domain_lookup(name, NULL);
+}
+EXPORT_SYMBOL_GPL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
new file mode 100644
index 00000000..c8e10216
--- /dev/null
+++ b/net/sunrpc/svcauth_unix.c
@@ -0,0 +1,981 @@
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/gss_api.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <linux/kernel.h>
+#define RPCDBG_FACILITY	RPCDBG_AUTH
+
+#include <linux/sunrpc/clnt.h>
+
+#include "netns.h"
+
+/*
+ * AUTHUNIX and AUTHNULL credentials are both handled here.
+ * AUTHNULL is treated just like AUTHUNIX except that the uid/gid
+ * are always nobody (-2).  i.e. we do the same IP address checks for
+ * AUTHNULL as for AUTHUNIX, and that is done here.
+ */
+
+
+/*
+ * An AUTH_UNIX authentication domain: the generic auth_domain plus
+ * flavour-private state.  The generic part is embedded so that
+ * container_of() can recover the unix_domain from an auth_domain pointer.
+ */
+struct unix_domain {
+	struct auth_domain	h;	/* generic domain (name, flavour, refcount, hash link) */
+#ifdef CONFIG_NFSD_DEPRECATED
+	/* incremented by auth_unix_forget_old(); compared against ip_map.m_add_change */
+	int	addr_changes;
+#endif /* CONFIG_NFSD_DEPRECATED */
+	/* other stuff later */
+};
+
+extern struct auth_ops svcauth_unix;
+
+/*
+ * Free an AUTH_UNIX domain: first the name string owned by the generic
+ * auth_domain, then the enclosing unix_domain itself.
+ */
+static void svcauth_unix_domain_release(struct auth_domain *dom)
+{
+	struct unix_domain *udom;
+
+	udom = container_of(dom, struct unix_domain, h);
+	kfree(dom->name);
+	kfree(udom);
+}
+
+/*
+ * Return the AUTH_UNIX domain called @name, creating it if necessary.
+ *
+ * The first pass only looks the name up.  On a miss a new unix_domain is
+ * allocated and offered to auth_domain_lookup(); the loop then re-examines
+ * the result, which is either the new entry or an entry that was hashed
+ * in the meantime (in which case the unused allocation is freed).
+ *
+ * Returns NULL on allocation failure, or when the name is already used
+ * by a domain of a different flavour.  Otherwise returns the domain
+ * obtained from auth_domain_lookup().
+ */
+struct auth_domain *unix_domain_find(char *name)
+{
+	struct auth_domain *rv;
+	struct unix_domain *new = NULL;
+
+	rv = auth_domain_lookup(name, NULL);
+	while(1) {
+		if (rv) {
+			/* an existing entry won: discard our unused allocation */
+			if (new && rv != &new->h)
+				svcauth_unix_domain_release(&new->h);
+
+			/* the name belongs to some other flavour: drop it and fail */
+			if (rv->flavour != &svcauth_unix) {
+				auth_domain_put(rv);
+				return NULL;
+			}
+			return rv;
+		}
+
+		/* not found: build a candidate and try to insert it */
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (new == NULL)
+			return NULL;
+		kref_init(&new->h.ref);
+		new->h.name = kstrdup(name, GFP_KERNEL);
+		if (new->h.name == NULL) {
+			kfree(new);
+			return NULL;
+		}
+		new->h.flavour = &svcauth_unix;
+#ifdef CONFIG_NFSD_DEPRECATED
+		new->addr_changes = 0;
+#endif /* CONFIG_NFSD_DEPRECATED */
+		rv = auth_domain_lookup(name, &new->h);
+	}
+}
+EXPORT_SYMBOL_GPL(unix_domain_find);
+
+
+/**************************************************
+ * cache for IP address to unix_domain
+ * as needed by AUTH_UNIX
+ */
+#define	IP_HASHBITS	8
+#define	IP_HASHMAX	(1<<IP_HASHBITS)
+
+/*
+ * One entry of the auth.unix.ip cache: maps a (class, address) pair to
+ * the unix_domain of the client.  IPv4 addresses are stored in
+ * v4-mapped IPv6 form.
+ */
+struct ip_map {
+	struct cache_head	h;	/* generic cache entry (flags, expiry, refcount) */
+	char			m_class[8]; /* e.g. "nfsd" */
+	struct in6_addr		m_addr;	/* client address (lookup key together with m_class) */
+	struct unix_domain	*m_client;	/* referenced domain; only used when VALID and not NEGATIVE */
+#ifdef CONFIG_NFSD_DEPRECATED
+	/* value derived from the domain's addr_changes in __ip_map_update() */
+	int			m_add_change;
+#endif /* CONFIG_NFSD_DEPRECATED */
+};
+
+/*
+ * Release callback for an ip_map cache entry.  Entries that are valid
+ * and not negative hold a reference on their client domain, which is
+ * dropped before the entry itself is freed.
+ */
+static void ip_map_put(struct kref *kref)
+{
+	struct cache_head *head = container_of(kref, struct cache_head, ref);
+	struct ip_map *map = container_of(head, struct ip_map, h);
+	int holds_client;
+
+	holds_client = test_bit(CACHE_VALID, &head->flags) &&
+		       !test_bit(CACHE_NEGATIVE, &head->flags);
+	if (holds_client)
+		auth_domain_put(&map->m_client->h);
+	kfree(map);
+}
+
+#if IP_HASHBITS == 8
+/* hash_long performs very poorly on 64 bit machines for IP addresses
+ * held in reverse-endian form (i.e. on a little-endian machine), so
+ * fold the four address bytes together with plain XORs instead.
+ */
+static inline int hash_ip(__be32 ip)
+{
+	u32 raw = (__force u32)ip;
+	int folded = raw ^ (raw >> 16);
+
+	return (folded ^ (folded >> 8)) & 0xff;
+}
+#endif
+/* Hash an IPv6 address by XORing the hash_ip() of each 32-bit word. */
+static inline int hash_ip6(struct in6_addr ip)
+{
+	int hash = 0;
+	int word;
+
+	for (word = 0; word < 4; word++)
+		hash ^= hash_ip(ip.s6_addr32[word]);
+	return hash;
+}
+/* Cache match callback: two entries are equal when class and address agree. */
+static int ip_map_match(struct cache_head *corig, struct cache_head *cnew)
+{
+	struct ip_map *lhs = container_of(corig, struct ip_map, h);
+	struct ip_map *rhs = container_of(cnew, struct ip_map, h);
+
+	if (strcmp(lhs->m_class, rhs->m_class) != 0)
+		return 0;
+	return ipv6_addr_equal(&lhs->m_addr, &rhs->m_addr) != 0;
+}
+/* Cache init callback: copy the key fields (class, address) into a new entry. */
+static void ip_map_init(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct ip_map *dst = container_of(cnew, struct ip_map, h);
+	struct ip_map *src = container_of(citem, struct ip_map, h);
+
+	ipv6_addr_copy(&dst->m_addr, &src->m_addr);
+	strcpy(dst->m_class, src->m_class);
+}
+/*
+ * Cache update callback: copy the content fields from @citem into @cnew.
+ * The destination takes its own reference on the client domain.
+ */
+static void update(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct ip_map *dst = container_of(cnew, struct ip_map, h);
+	struct ip_map *src = container_of(citem, struct ip_map, h);
+	struct unix_domain *client = src->m_client;
+
+	kref_get(&client->h.ref);
+	dst->m_client = client;
+#ifdef CONFIG_NFSD_DEPRECATED
+	dst->m_add_change = src->m_add_change;
+#endif /* CONFIG_NFSD_DEPRECATED */
+}
+/* Cache alloc callback: allocate an ip_map and return its cache_head, or NULL. */
+static struct cache_head *ip_map_alloc(void)
+{
+	struct ip_map *map = kmalloc(sizeof(*map), GFP_KERNEL);
+
+	return map ? &map->h : NULL;
+}
+
+/*
+ * Format the upcall request line for an ip_map entry:
+ *
+ *	<class> <address>\n
+ *
+ * v4-mapped addresses are printed in dotted-quad IPv4 form, everything
+ * else as a full IPv6 address.
+ */
+static void ip_map_request(struct cache_detail *cd,
+				  struct cache_head *h,
+				  char **bpp, int *blen)
+{
+	char text_addr[40];
+	struct ip_map *im = container_of(h, struct ip_map, h);
+
+	if (ipv6_addr_v4mapped(&(im->m_addr))) {
+		/* the IPv4 address is carried in the last 32-bit word */
+		snprintf(text_addr, 20, "%pI4", &im->m_addr.s6_addr32[3]);
+	} else {
+		snprintf(text_addr, 40, "%pI6", &im->m_addr);
+	}
+	qword_add(bpp, blen, im->m_class);
+	qword_add(bpp, blen, text_addr);
+	/* overwrite the byte just before the write position with the line terminator */
+	(*bpp)[-1] = '\n';
+}
+
+/* Cache upcall callback: send the request through the cache pipe using ip_map_request(). */
+static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	int rc = sunrpc_cache_pipe_upcall(cd, h, ip_map_request);
+
+	return rc;
+}
+
+static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
+
+/*
+ * Parse one downcall line for the auth.unix.ip cache:
+ *
+ *	class ipaddress expiry [domainname]
+ *
+ * The line must end in '\n'.  An empty domainname records a negative
+ * entry.  IPv4 addresses are converted to v4-mapped IPv6 before lookup.
+ *
+ * Returns 0 on success, -EINVAL for malformed input, -ENOENT when the
+ * named domain cannot be found or created, -ENOMEM when the cache entry
+ * cannot be looked up, or the result of __ip_map_update().
+ */
+static int ip_map_parse(struct cache_detail *cd,
+			  char *mesg, int mlen)
+{
+	/* class ipaddress [domainname] */
+	/* should be safe just to use the start of the input buffer
+	 * for scratch: */
+	char *buf = mesg;
+	int len;
+	char class[8];
+	union {
+		struct sockaddr		sa;
+		struct sockaddr_in	s4;
+		struct sockaddr_in6	s6;
+	} address;
+	struct sockaddr_in6 sin6;
+	int err;
+
+	struct ip_map *ipmp;
+	struct auth_domain *dom;
+	time_t expiry;
+
+	/* refuse empty input before indexing mesg[mlen-1], as unix_gid_parse() does */
+	if (mlen <= 0 || mesg[mlen-1] != '\n')
+		return -EINVAL;
+	mesg[mlen-1] = 0;
+
+	/* class */
+	len = qword_get(&mesg, class, sizeof(class));
+	if (len <= 0) return -EINVAL;
+
+	/* ip address */
+	len = qword_get(&mesg, buf, mlen);
+	if (len <= 0) return -EINVAL;
+
+	if (rpc_pton(buf, len, &address.sa, sizeof(address)) == 0)
+		return -EINVAL;
+	switch (address.sa.sa_family) {
+	case AF_INET:
+		/* Form a mapped IPv4 address in sin6 */
+		sin6.sin6_family = AF_INET6;
+		ipv6_addr_set_v4mapped(address.s4.sin_addr.s_addr,
+				&sin6.sin6_addr);
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		memcpy(&sin6, &address.s6, sizeof(sin6));
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	expiry = get_expiry(&mesg);
+	if (expiry ==0)
+		return -EINVAL;
+
+	/* domainname, or empty for NEGATIVE */
+	len = qword_get(&mesg, buf, mlen);
+	if (len < 0) return -EINVAL;
+
+	if (len) {
+		dom = unix_domain_find(buf);
+		if (dom == NULL)
+			return -ENOENT;
+	} else
+		dom = NULL;
+
+	/* IPv6 scope IDs are ignored for now */
+	ipmp = __ip_map_lookup(cd, class, &sin6.sin6_addr);
+	if (ipmp) {
+		/* 'h' is the first member of unix_domain, so a NULL dom stays NULL */
+		err = __ip_map_update(cd, ipmp,
+			     container_of(dom, struct unix_domain, h),
+			     expiry);
+	} else
+		err = -ENOMEM;
+
+	/* drop the reference obtained from unix_domain_find() */
+	if (dom)
+		auth_domain_put(dom);
+
+	cache_flush();
+	return err;
+}
+
+/*
+ * seq_file show callback for the auth.unix.ip cache.
+ *
+ * Called with @h == NULL to emit the header line; otherwise prints
+ * "class address domain" for one entry.  Entries that are not valid, or
+ * that are negative, are shown with the placeholder "-no-domain-".
+ * Always returns 0.
+ */
+static int ip_map_show(struct seq_file *m,
+		       struct cache_detail *cd,
+		       struct cache_head *h)
+{
+	struct ip_map *im;
+	struct in6_addr addr;
+	char *dom = "-no-domain-";
+
+	if (h == NULL) {
+		seq_puts(m, "#class IP domain\n");
+		return 0;
+	}
+	im = container_of(h, struct ip_map, h);
+	/* class addr domain */
+	ipv6_addr_copy(&addr, &im->m_addr);
+
+	/* m_client is only dereferenced for valid, non-negative entries */
+	if (test_bit(CACHE_VALID, &h->flags) &&
+	    !test_bit(CACHE_NEGATIVE, &h->flags))
+		dom = im->m_client->h.name;
+
+	if (ipv6_addr_v4mapped(&addr)) {
+		/* v4-mapped: print only the embedded IPv4 address */
+		seq_printf(m, "%s %pI4 %s\n",
+			im->m_class, &addr.s6_addr32[3], dom);
+	} else {
+		seq_printf(m, "%s %pI6 %s\n", im->m_class, &addr, dom);
+	}
+	return 0;
+}
+
+
+/*
+ * Look up the ip_map entry for (@class, @addr) in cache @cd.
+ *
+ * An on-stack ip_map carrying only the key fields is handed to
+ * sunrpc_cache_lookup().  Returns the entry found there, or NULL when
+ * sunrpc_cache_lookup() returns NULL.
+ */
+static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
+		struct in6_addr *addr)
+{
+	struct ip_map ip;
+	struct cache_head *ch;
+
+	/*
+	 * m_class is a fixed-size field: bound the copy so an over-long
+	 * class name is truncated instead of overrunning the on-stack key.
+	 * The stored (possibly truncated) class is what gets hashed, which
+	 * matches the hash __ip_map_update() computes from ipm->m_class.
+	 */
+	strlcpy(ip.m_class, class, sizeof(ip.m_class));
+	ipv6_addr_copy(&ip.m_addr, addr);
+	ch = sunrpc_cache_lookup(cd, &ip.h,
+				 hash_str(ip.m_class, IP_HASHBITS) ^
+				 hash_ip6(*addr));
+
+	if (ch)
+		return container_of(ch, struct ip_map, h);
+	else
+		return NULL;
+}
+
+/* Per-namespace wrapper: look (@class, @addr) up in @net's ip_map cache. */
+static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
+		struct in6_addr *addr)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	return __ip_map_lookup(sn->ip_map_cache, class, addr);
+}
+
+/*
+ * Set the content of cache entry @ipm in cache @cd.
+ *
+ * @udom is the client domain to record, or NULL to record a negative
+ * entry.  @expiry becomes the entry's expiry time.  The new content is
+ * described by an on-stack ip_map handed to sunrpc_cache_update().
+ *
+ * Returns 0 on success or -ENOMEM when sunrpc_cache_update() fails.
+ */
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
+		struct unix_domain *udom, time_t expiry)
+{
+	struct ip_map ip;
+	struct cache_head *ch;
+
+	ip.m_client = udom;
+	ip.h.flags = 0;
+	if (!udom)
+		set_bit(CACHE_NEGATIVE, &ip.h.flags);
+#ifdef CONFIG_NFSD_DEPRECATED
+	/* note: this 'else' pairs with the 'if (!udom)' above */
+	else {
+		ip.m_add_change = udom->addr_changes;
+		/* if this is from the legacy set_client system call,
+		 * we need m_add_change to be one higher
+		 */
+		if (expiry == NEVER)
+			ip.m_add_change++;
+	}
+#endif /* CONFIG_NFSD_DEPRECATED */
+	ip.h.expiry_time = expiry;
+	ch = sunrpc_cache_update(cd, &ip.h, &ipm->h,
+				 hash_str(ipm->m_class, IP_HASHBITS) ^
+				 hash_ip6(ipm->m_addr));
+	if (!ch)
+		return -ENOMEM;
+	/* the caller does not need the updated entry: release it again */
+	cache_put(ch, cd);
+	return 0;
+}
+
+/* Per-namespace wrapper: update @ipm in @net's ip_map cache. */
+static inline int ip_map_update(struct net *net, struct ip_map *ipm,
+		struct unix_domain *udom, time_t expiry)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry);
+}
+
+#ifdef CONFIG_NFSD_DEPRECATED
+/*
+ * Bind @addr to the AUTH_UNIX domain @dom in the "nfsd" class with an
+ * expiry of NEVER.  Returns -EINVAL when @dom is not an AUTH_UNIX
+ * domain, -ENOMEM when the cache entry cannot be looked up, otherwise
+ * the result of ip_map_update().
+ */
+int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom)
+{
+	struct ip_map *entry;
+
+	if (dom->flavour != &svcauth_unix)
+		return -EINVAL;
+
+	entry = ip_map_lookup(net, "nfsd", addr);
+	if (!entry)
+		return -ENOMEM;
+
+	return ip_map_update(net, entry,
+			     container_of(dom, struct unix_domain, h), NEVER);
+}
+EXPORT_SYMBOL_GPL(auth_unix_add_addr);
+
+/*
+ * Bump the addr_changes counter of the AUTH_UNIX domain @dom.
+ * Returns -EINVAL when @dom has a different flavour, 0 otherwise.
+ */
+int auth_unix_forget_old(struct auth_domain *dom)
+{
+	if (dom->flavour != &svcauth_unix)
+		return -EINVAL;
+
+	container_of(dom, struct unix_domain, h)->addr_changes++;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(auth_unix_forget_old);
+
+/*
+ * Find the AUTH_UNIX domain bound to @addr in the "nfsd" class.
+ *
+ * Returns the domain with an extra reference taken, or NULL when the
+ * entry cannot be looked up, when cache_check() reports a non-zero
+ * status, or when the entry is older than the domain's current
+ * addr_changes count (in which case it is also invalidated).
+ */
+struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr)
+{
+	struct ip_map *ipm;
+	struct auth_domain *rv;
+	struct sunrpc_net *sn;
+
+	sn = net_generic(net, sunrpc_net_id);
+	ipm = ip_map_lookup(net, "nfsd", addr);
+
+	if (!ipm)
+		return NULL;
+	/* NOTE(review): no cache_put() on this path - presumably cache_check() drops the reference on failure; confirm */
+	if (cache_check(sn->ip_map_cache, &ipm->h, NULL))
+		return NULL;
+
+	/* the domain's change counter has moved past this entry: treat it as stale */
+	if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
+		sunrpc_invalidate(&ipm->h, sn->ip_map_cache);
+		rv = NULL;
+	} else {
+		rv = &ipm->m_client->h;
+		kref_get(&rv->ref);
+	}
+	cache_put(&ipm->h, sn->ip_map_cache);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(auth_unix_lookup);
+#endif /* CONFIG_NFSD_DEPRECATED */
+
+/* Purge the ip_map cache of every network namespace. */
+void svcauth_unix_purge(void)
+{
+	struct sunrpc_net *sn;
+	struct net *net;
+
+	for_each_net(net) {
+		sn = net_generic(net, sunrpc_net_id);
+		cache_purge(sn->ip_map_cache);
+	}
+}
+EXPORT_SYMBOL_GPL(svcauth_unix_purge);
+
+/*
+ * Fetch the ip_map remembered on @xprt, if any.
+ *
+ * Only transports with XPT_CACHE_AUTH set are consulted.  A remembered
+ * entry that is still valid is returned with an extra reference.  If it
+ * has been invalidated, it is removed from the transport, the
+ * transport's reference is dropped, and NULL is returned.
+ */
+static inline struct ip_map *
+ip_map_cached_get(struct svc_xprt *xprt)
+{
+	struct ip_map *ipm = NULL;
+	struct sunrpc_net *sn;
+
+	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+		/* xpt_auth_cache is read and modified under xpt_lock */
+		spin_lock(&xprt->xpt_lock);
+		ipm = xprt->xpt_auth_cache;
+		if (ipm != NULL) {
+			if (!cache_valid(&ipm->h)) {
+				/*
+				 * The entry has been invalidated since it was
+				 * remembered, e.g. by a second mount from the
+				 * same IP address.
+				 */
+				sn = net_generic(xprt->xpt_net, sunrpc_net_id);
+				xprt->xpt_auth_cache = NULL;
+				/* unlock first; the put happens outside the spinlock */
+				spin_unlock(&xprt->xpt_lock);
+				cache_put(&ipm->h, sn->ip_map_cache);
+				return NULL;
+			}
+			/* reference for the caller; the transport keeps its own */
+			cache_get(&ipm->h);
+		}
+		spin_unlock(&xprt->xpt_lock);
+	}
+	return ipm;
+}
+
+/*
+ * Give back the caller's reference on @ipm.
+ *
+ * If @xprt has XPT_CACHE_AUTH set and does not yet remember an entry,
+ * the reference is transferred to the transport (xpt_auth_cache).
+ * In every other case the reference is simply dropped.
+ */
+static inline void
+ip_map_cached_put(struct svc_xprt *xprt, struct ip_map *ipm)
+{
+	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+		spin_lock(&xprt->xpt_lock);
+		if (xprt->xpt_auth_cache == NULL) {
+			/* newly cached, keep the reference */
+			xprt->xpt_auth_cache = ipm;
+			ipm = NULL;
+		}
+		spin_unlock(&xprt->xpt_lock);
+	}
+	/* ipm is still set when the reference was not handed to the transport */
+	if (ipm) {
+		struct sunrpc_net *sn;
+
+		sn = net_generic(xprt->xpt_net, sunrpc_net_id);
+		cache_put(&ipm->h, sn->ip_map_cache);
+	}
+}
+
+/* Drop the reference @xpt holds on its remembered ip_map entry, if any. */
+void
+svcauth_unix_info_release(struct svc_xprt *xpt)
+{
+	struct ip_map *cached = xpt->xpt_auth_cache;
+	struct sunrpc_net *sn;
+
+	if (cached == NULL)
+		return;
+
+	sn = net_generic(xpt->xpt_net, sunrpc_net_id);
+	cache_put(&cached->h, sn->ip_map_cache);
+}
+
+/****************************************************************************
+ * auth.unix.gid cache
+ * simple cache to map a UID to a list of GIDs
+ * because AUTH_UNIX aka AUTH_SYS has a max of 16
+ */
+#define	GID_HASHBITS	8
+#define	GID_HASHMAX	(1<<GID_HASHBITS)
+
+/* One entry of the auth.unix.gid cache: the group list recorded for a uid. */
+struct unix_gid {
+	struct cache_head	h;	/* generic cache entry */
+	uid_t			uid;	/* lookup key */
+	struct group_info	*gi;	/* referenced group list; only used when VALID and not NEGATIVE */
+};
+static struct cache_head	*gid_table[GID_HASHMAX];
+
+/*
+ * Release callback for a unix_gid cache entry.  Valid, non-negative
+ * entries hold a reference on their group_info, which is dropped
+ * before the entry is freed.
+ */
+static void unix_gid_put(struct kref *kref)
+{
+	struct cache_head *head = container_of(kref, struct cache_head, ref);
+	struct unix_gid *entry = container_of(head, struct unix_gid, h);
+	int holds_groups;
+
+	holds_groups = test_bit(CACHE_VALID, &head->flags) &&
+		       !test_bit(CACHE_NEGATIVE, &head->flags);
+	if (holds_groups)
+		put_group_info(entry->gi);
+	kfree(entry);
+}
+
+/* Cache match callback: entries are equal when their uids are equal. */
+static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew)
+{
+	struct unix_gid *lhs = container_of(corig, struct unix_gid, h);
+	struct unix_gid *rhs = container_of(cnew, struct unix_gid, h);
+
+	if (lhs->uid != rhs->uid)
+		return 0;
+	return 1;
+}
+/* Cache init callback: copy the key (uid) into a new entry. */
+static void unix_gid_init(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct unix_gid *src = container_of(citem, struct unix_gid, h);
+	struct unix_gid *dst = container_of(cnew, struct unix_gid, h);
+
+	dst->uid = src->uid;
+}
+/*
+ * Cache update callback: share @citem's group list with @cnew, taking
+ * an additional reference on it for the destination.
+ */
+static void unix_gid_update(struct cache_head *cnew, struct cache_head *citem)
+{
+	struct unix_gid *dst = container_of(cnew, struct unix_gid, h);
+	struct unix_gid *src = container_of(citem, struct unix_gid, h);
+	struct group_info *groups = src->gi;
+
+	get_group_info(groups);
+	dst->gi = groups;
+}
+/* Cache alloc callback: allocate a unix_gid and return its cache_head, or NULL. */
+static struct cache_head *unix_gid_alloc(void)
+{
+	struct unix_gid *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	return entry ? &entry->h : NULL;
+}
+
+/* Format the upcall request line for a unix_gid entry: "<uid>\n". */
+static void unix_gid_request(struct cache_detail *cd,
+			     struct cache_head *h,
+			     char **bpp, int *blen)
+{
+	struct unix_gid *entry = container_of(h, struct unix_gid, h);
+	char uid_text[20];
+
+	snprintf(uid_text, sizeof(uid_text), "%u", entry->uid);
+	qword_add(bpp, blen, uid_text);
+	/* replace the byte before the write position with the line terminator */
+	(*bpp)[-1] = '\n';
+}
+
+/* Cache upcall callback: send the request through the cache pipe using unix_gid_request(). */
+static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+	int rc = sunrpc_cache_pipe_upcall(cd, h, unix_gid_request);
+
+	return rc;
+}
+
+static struct unix_gid *unix_gid_lookup(uid_t uid);
+extern struct cache_detail unix_gid_cache;
+
+/*
+ * Parse one downcall line for the auth.unix.gid cache:
+ *
+ *	uid expiry Ngid gid0 gid1 ... gidN-1
+ *
+ * The line must end in '\n' and Ngid must lie in 0..8192.  The parsed
+ * group list is installed in the cache entry for the uid.
+ *
+ * Returns 0 on success, -EINVAL for malformed input, or -ENOMEM when
+ * the group list or the cache entry cannot be obtained.
+ */
+static int unix_gid_parse(struct cache_detail *cd,
+			char *mesg, int mlen)
+{
+	/* uid expiry Ngid gid0 gid1 ... gidN-1 */
+	int uid;
+	int gids;
+	int rv;
+	int i;
+	int err;
+	time_t expiry;
+	struct unix_gid ug, *ugp;
+
+	if (mlen <= 0 || mesg[mlen-1] != '\n')
+		return -EINVAL;
+	/* turn the trailing newline into a string terminator */
+	mesg[mlen-1] = 0;
+
+	rv = get_int(&mesg, &uid);
+	if (rv)
+		return -EINVAL;
+	ug.uid = uid;
+
+	expiry = get_expiry(&mesg);
+	if (expiry == 0)
+		return -EINVAL;
+
+	rv = get_int(&mesg, &gids);
+	if (rv || gids < 0 || gids > 8192)
+		return -EINVAL;
+
+	ug.gi = groups_alloc(gids);
+	if (!ug.gi)
+		return -ENOMEM;
+
+	for (i = 0 ; i < gids ; i++) {
+		int gid;
+		rv = get_int(&mesg, &gid);
+		/* pre-set the error so the 'goto out' below reports bad input */
+		err = -EINVAL;
+		if (rv)
+			goto out;
+		GROUP_AT(ug.gi, i) = gid;
+	}
+
+	ugp = unix_gid_lookup(uid);
+	if (ugp) {
+		struct cache_head *ch;
+		ug.h.flags = 0;
+		ug.h.expiry_time = expiry;
+		ch = sunrpc_cache_update(&unix_gid_cache,
+					 &ug.h, &ugp->h,
+					 hash_long(uid, GID_HASHBITS));
+		if (!ch)
+			err = -ENOMEM;
+		else {
+			err = 0;
+			cache_put(ch, &unix_gid_cache);
+		}
+	} else
+		err = -ENOMEM;
+ out:
+	/* drop the local reference from groups_alloc(); unix_gid_update() takes its own */
+	if (ug.gi)
+		put_group_info(ug.gi);
+	return err;
+}
+
+/*
+ * seq_file show callback for the auth.unix.gid cache.
+ *
+ * Called with @h == NULL to emit the header line; otherwise prints
+ * "uid count: gid..." for one entry.  Entries that are not valid, or
+ * that are negative, are shown with a count of 0 and no gids.
+ * Always returns 0.
+ */
+static int unix_gid_show(struct seq_file *m,
+			 struct cache_detail *cd,
+			 struct cache_head *h)
+{
+	struct unix_gid *ug;
+	int i;
+	int glen;
+
+	if (h == NULL) {
+		seq_puts(m, "#uid cnt: gids...\n");
+		return 0;
+	}
+	ug = container_of(h, struct unix_gid, h);
+	/* ug->gi is only dereferenced for valid, non-negative entries */
+	if (test_bit(CACHE_VALID, &h->flags) &&
+	    !test_bit(CACHE_NEGATIVE, &h->flags))
+		glen = ug->gi->ngroups;
+	else
+		glen = 0;
+
+	seq_printf(m, "%u %d:", ug->uid, glen);
+	for (i = 0; i < glen; i++)
+		seq_printf(m, " %d", GROUP_AT(ug->gi, i));
+	seq_printf(m, "\n");
+	return 0;
+}
+
+/*
+ * Descriptor of the "auth.unix.gid" cache: a statically allocated hash
+ * table of GID_HASHMAX buckets wired to the unix_gid_* callbacks above.
+ */
+struct cache_detail unix_gid_cache = {
+	.owner		= THIS_MODULE,
+	.hash_size	= GID_HASHMAX,
+	.hash_table	= gid_table,
+	.name		= "auth.unix.gid",
+	.cache_put	= unix_gid_put,		/* final release of an entry */
+	.cache_upcall	= unix_gid_upcall,	/* request sent up for an entry */
+	.cache_parse	= unix_gid_parse,	/* downcall line parser */
+	.cache_show	= unix_gid_show,	/* seq_file listing */
+	.match		= unix_gid_match,	/* key comparison */
+	.init		= unix_gid_init,	/* key copy */
+	.update		= unix_gid_update,	/* content copy */
+	.alloc		= unix_gid_alloc,	/* entry allocation */
+};
+
+/*
+ * Look up the unix_gid cache entry for @uid, using an on-stack entry
+ * that carries only the key.  Returns NULL when sunrpc_cache_lookup()
+ * does.
+ */
+static struct unix_gid *unix_gid_lookup(uid_t uid)
+{
+	struct unix_gid key;
+	struct cache_head *found;
+
+	key.uid = uid;
+	found = sunrpc_cache_lookup(&unix_gid_cache, &key.h,
+				    hash_long(uid, GID_HASHBITS));
+	return found ? container_of(found, struct unix_gid, h) : NULL;
+}
+
+/*
+ * Return the cached group list for @uid on behalf of request @rqstp.
+ *
+ * On success the group_info is returned with an extra reference.
+ * Otherwise an ERR_PTR is returned:
+ *   -ENOENT     cache_check() reported -ENOENT
+ *   -ESHUTDOWN  cache_check() reported -ETIMEDOUT
+ *   -EAGAIN     the entry could not be looked up, or any other status
+ */
+static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp)
+{
+	struct unix_gid *ug;
+	struct group_info *gi;
+	int ret;
+
+	ug = unix_gid_lookup(uid);
+	if (!ug)
+		return ERR_PTR(-EAGAIN);
+	ret = cache_check(&unix_gid_cache, &ug->h, &rqstp->rq_chandle);
+	/* NOTE(review): only the success path does cache_put() - presumably cache_check() drops the reference on failure; confirm */
+	switch (ret) {
+	case -ENOENT:
+		return ERR_PTR(-ENOENT);
+	case -ETIMEDOUT:
+		return ERR_PTR(-ESHUTDOWN);
+	case 0:
+		/* take our own reference on the list before releasing the entry */
+		gi = get_group_info(ug->gi);
+		cache_put(&ug->h, &unix_gid_cache);
+		return gi;
+	default:
+		return ERR_PTR(-EAGAIN);
+	}
+}
+
+/*
+ * Determine the client domain (rq_client) for @rqstp from its source
+ * address, and replace the credential's group list with the cached
+ * list for the uid when one is available.
+ *
+ * Returns SVC_OK, SVC_DENIED, SVC_DROP or SVC_CLOSE.  Requests for
+ * procedure 0 return SVC_OK with rq_client left NULL.
+ */
+int
+svcauth_unix_set_client(struct svc_rqst *rqstp)
+{
+	struct sockaddr_in *sin;
+	struct sockaddr_in6 *sin6, sin6_storage;
+	struct ip_map *ipm;
+	struct group_info *gi;
+	struct svc_cred *cred = &rqstp->rq_cred;
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct net *net = xprt->xpt_net;
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	/* normalise the peer address to IPv6 form (IPv4 becomes v4-mapped) */
+	switch (rqstp->rq_addr.ss_family) {
+	case AF_INET:
+		sin = svc_addr_in(rqstp);
+		sin6 = &sin6_storage;
+		/* only sin6_addr of the scratch sockaddr is filled in and used */
+		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &sin6->sin6_addr);
+		break;
+	case AF_INET6:
+		sin6 = svc_addr_in6(rqstp);
+		break;
+	default:
+		BUG();
+	}
+
+	rqstp->rq_client = NULL;
+	if (rqstp->rq_proc == 0)
+		return SVC_OK;
+
+	/* prefer the entry remembered on the transport, else consult the cache */
+	ipm = ip_map_cached_get(xprt);
+	if (ipm == NULL)
+		ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
+				    &sin6->sin6_addr);
+
+	if (ipm == NULL)
+		return SVC_DENIED;
+
+	switch (cache_check(sn->ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
+		default:
+			BUG();
+		case -ETIMEDOUT:
+			return SVC_CLOSE;
+		case -EAGAIN:
+			return SVC_DROP;
+		case -ENOENT:
+			return SVC_DENIED;
+		case 0:
+			/* the request gets its own reference on the domain */
+			rqstp->rq_client = &ipm->m_client->h;
+			kref_get(&rqstp->rq_client->ref);
+			/* hand the entry reference to the transport, or drop it */
+			ip_map_cached_put(xprt, ipm);
+			break;
+	}
+
+	gi = unix_gid_find(cred->cr_uid, rqstp);
+	/* any value other than the three error codes below is a real group_info pointer */
+	switch (PTR_ERR(gi)) {
+	case -EAGAIN:
+		return SVC_DROP;
+	case -ESHUTDOWN:
+		return SVC_CLOSE;
+	case -ENOENT:
+		/* no cached list for this uid: keep the groups from the credential */
+		break;
+	default:
+		put_group_info(cred->cr_group_info);
+		cred->cr_group_info = gi;
+	}
+	return SVC_OK;
+}
+
+EXPORT_SYMBOL_GPL(svcauth_unix_set_client);
+
+/*
+ * Accept callback for AUTH_NULL.
+ *
+ * Checks that the credential body is empty and that the verifier is an
+ * empty AUTH_NULL verifier, fills in the credential with uid/gid -1 and
+ * an empty group list, and appends a NULL verifier to the reply.
+ *
+ * Returns SVC_OK on success, SVC_GARBAGE when the argument buffer is
+ * too short, SVC_DENIED (with *authp set) for a bad credential or
+ * verifier, or SVC_CLOSE when the group list cannot be allocated.
+ */
+static int
+svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
+{
+	struct kvec	*argv = &rqstp->rq_arg.head[0];
+	struct kvec	*resv = &rqstp->rq_res.head[0];
+	struct svc_cred	*cred = &rqstp->rq_cred;
+
+	cred->cr_group_info = NULL;
+	rqstp->rq_client = NULL;
+
+	/* three 32-bit words are consumed below */
+	if (argv->iov_len < 3*4)
+		return SVC_GARBAGE;
+
+	/* credential length must be zero */
+	if (svc_getu32(argv) != 0) {
+		dprintk("svc: bad null cred\n");
+		*authp = rpc_autherr_badcred;
+		return SVC_DENIED;
+	}
+	/* verifier: flavour AUTH_NULL followed by length zero */
+	if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
+		dprintk("svc: bad null verf\n");
+		*authp = rpc_autherr_badverf;
+		return SVC_DENIED;
+	}
+
+	/* Signal that mapping to nobody uid/gid is required */
+	cred->cr_uid = (uid_t) -1;
+	cred->cr_gid = (gid_t) -1;
+	cred->cr_group_info = groups_alloc(0);
+	if (cred->cr_group_info == NULL)
+		return SVC_CLOSE; /* kmalloc failure - client must retry */
+
+	/* Put NULL verifier */
+	svc_putnl(resv, RPC_AUTH_NULL);
+	svc_putnl(resv, 0);
+
+	rqstp->rq_flavor = RPC_AUTH_NULL;
+	return SVC_OK;
+}
+
+/*
+ * Release callback for AUTH_NULL: drop the request's references on its
+ * client domain and group list (when set) and clear both pointers.
+ */
+static int
+svcauth_null_release(struct svc_rqst *rqstp)
+{
+	struct svc_cred *cred = &rqstp->rq_cred;
+
+	if (rqstp->rq_client)
+		auth_domain_put(rqstp->rq_client);
+	rqstp->rq_client = NULL;
+
+	if (cred->cr_group_info)
+		put_group_info(cred->cr_group_info);
+	cred->cr_group_info = NULL;
+
+	/* don't drop */
+	return 0;
+}
+
+
+/*
+ * Operations for the AUTH_NULL flavour.  Client lookup is shared with
+ * AUTH_UNIX (svcauth_unix_set_client); no domain_release is provided.
+ */
+struct auth_ops svcauth_null = {
+	.name		= "null",
+	.owner		= THIS_MODULE,
+	.flavour	= RPC_AUTH_NULL,
+	.accept 	= svcauth_null_accept,
+	.release	= svcauth_null_release,
+	.set_client	= svcauth_unix_set_client,
+};
+
+
+/*
+ * Accept callback for AUTH_UNIX.
+ *
+ * Decodes the credential (length, stamp, machine name, uid, gid and up
+ * to 16 supplementary gids) into rq_cred, checks that the verifier is
+ * an empty AUTH_NULL verifier and appends a NULL verifier to the reply.
+ * 'len' tracks how many bytes of the head buffer remain so that every
+ * variable-length part is bounds-checked before it is consumed.
+ *
+ * Returns SVC_OK on success, SVC_GARBAGE when the buffer is too short
+ * for the fixed header, SVC_DENIED (with *authp set) for a bad
+ * credential or verifier, or SVC_CLOSE when the group list cannot be
+ * allocated.
+ */
+static int
+svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
+{
+	struct kvec	*argv = &rqstp->rq_arg.head[0];
+	struct kvec	*resv = &rqstp->rq_res.head[0];
+	struct svc_cred	*cred = &rqstp->rq_cred;
+	u32		slen, i;
+	int		len   = argv->iov_len;
+
+	cred->cr_group_info = NULL;
+	rqstp->rq_client = NULL;
+
+	/* room for: length, time stamp, machname length */
+	if ((len -= 3*4) < 0)
+		return SVC_GARBAGE;
+
+	svc_getu32(argv);			/* length */
+	svc_getu32(argv);			/* time stamp */
+	slen = XDR_QUADLEN(svc_getnl(argv));	/* machname length */
+	/* at most 64 words of name, plus room for uid, gid and the gid count */
+	if (slen > 64 || (len -= (slen + 3)*4) < 0)
+		goto badcred;
+	argv->iov_base = (void*)((__be32*)argv->iov_base + slen);	/* skip machname */
+	argv->iov_len -= slen*4;
+
+	cred->cr_uid = svc_getnl(argv);		/* uid */
+	cred->cr_gid = svc_getnl(argv);		/* gid */
+	slen = svc_getnl(argv);			/* gids length */
+	/* at most 16 gids, plus room for the two verifier words */
+	if (slen > 16 || (len -= (slen + 2)*4) < 0)
+		goto badcred;
+	cred->cr_group_info = groups_alloc(slen);
+	if (cred->cr_group_info == NULL)
+		return SVC_CLOSE;
+	for (i = 0; i < slen; i++)
+		GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv);
+	/* verifier: flavour AUTH_NULL followed by length zero */
+	if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
+		*authp = rpc_autherr_badverf;
+		return SVC_DENIED;
+	}
+
+	/* Put NULL verifier */
+	svc_putnl(resv, RPC_AUTH_NULL);
+	svc_putnl(resv, 0);
+
+	rqstp->rq_flavor = RPC_AUTH_UNIX;
+	return SVC_OK;
+
+badcred:
+	*authp = rpc_autherr_badcred;
+	return SVC_DENIED;
+}
+
+/*
+ * Release callback for AUTH_UNIX.  The verifier (such as it is) is
+ * already in place, so this only drops the request's references on its
+ * client domain and group list and clears both pointers.
+ */
+static int
+svcauth_unix_release(struct svc_rqst *rqstp)
+{
+	struct auth_domain *client = rqstp->rq_client;
+	struct svc_cred *cred = &rqstp->rq_cred;
+
+	if (client)
+		auth_domain_put(client);
+	rqstp->rq_client = NULL;
+
+	if (cred->cr_group_info)
+		put_group_info(cred->cr_group_info);
+	cred->cr_group_info = NULL;
+
+	return 0;
+}
+
+
+/*
+ * Operations for the AUTH_UNIX flavour.  domain_release is what
+ * auth_domain_put() invokes when the last reference to one of this
+ * flavour's domains is dropped.
+ */
+struct auth_ops svcauth_unix = {
+	.name		= "unix",
+	.owner		= THIS_MODULE,
+	.flavour	= RPC_AUTH_UNIX,
+	.accept 	= svcauth_unix_accept,
+	.release	= svcauth_unix_release,
+	.domain_release	= svcauth_unix_domain_release,
+	.set_client	= svcauth_unix_set_client,
+};
+
+/*
+ * Create and register the "auth.unix.ip" cache for network namespace
+ * @net, storing it in the namespace's sunrpc_net.
+ *
+ * Returns 0 on success, -ENOMEM when an allocation fails, or the error
+ * from cache_register_net().  On failure nothing remains allocated.
+ */
+int ip_map_cache_create(struct net *net)
+{
+	int err = -ENOMEM;
+	struct cache_detail *cd;
+	struct cache_head **tbl;
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+	if (cd == NULL)
+		goto err_cd;
+
+	/* zeroed bucket array; kcalloc checks the size multiplication */
+	tbl = kcalloc(IP_HASHMAX, sizeof(*tbl), GFP_KERNEL);
+	if (tbl == NULL)
+		goto err_tbl;
+
+	/*
+	 * Each field is set by its own statement.  These used to be
+	 * chained with comma operators, which made the whole run (and the
+	 * cache_register_net() call after it) a single expression.
+	 */
+	cd->owner = THIS_MODULE;
+	cd->hash_size = IP_HASHMAX;
+	cd->hash_table = tbl;
+	cd->name = "auth.unix.ip";
+	cd->cache_put = ip_map_put;
+	cd->cache_upcall = ip_map_upcall;
+	cd->cache_parse = ip_map_parse;
+	cd->cache_show = ip_map_show;
+	cd->match = ip_map_match;
+	cd->init = ip_map_init;
+	cd->update = update;
+	cd->alloc = ip_map_alloc;
+
+	err = cache_register_net(cd, net);
+	if (err)
+		goto err_reg;
+
+	sn->ip_map_cache = cd;
+	return 0;
+
+	/* unwind in reverse order of allocation */
+err_reg:
+	kfree(tbl);
+err_tbl:
+	kfree(cd);
+err_cd:
+	return err;
+}
+
+/*
+ * Tear down @net's ip_map cache: purge its entries, unregister it, then
+ * free the bucket array and the descriptor.
+ */
+void ip_map_cache_destroy(struct net *net)
+{
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+	struct cache_detail *cd = sn->ip_map_cache;
+
+	cache_purge(cd);
+	cache_unregister_net(cd, net);
+	kfree(cd->hash_table);
+	kfree(cd);
+}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
new file mode 100644
index 00000000..af04f779
--- /dev/null
+++ b/net/sunrpc/svcsock.c
@@ -0,0 +1,1663 @@
+/*
+ * linux/net/sunrpc/svcsock.c
+ *
+ * These are the RPC server socket internals.
+ *
+ * The server scheduling algorithm does not always distribute the load
+ * evenly when servicing a single client. May need to modify the
+ * svc_xprt_enqueue procedure...
+ *
+ * TCP support is largely untested and may be a little slow. The problem
+ * is that we currently do two separate recvfrom's, one for the 4-byte
+ * record length, and the second for the actual record. This could possibly
+ * be improved by always reading a minimum size of around 100 bytes and
+ * tucking any superfluous bytes away in a temporary store. Still, that
+ * leaves write requests out in the rain. An alternative may be to peek at
+ * the first skb in the queue, and if it matches the next TCP sequence
+ * number, to extract the record marker. Yuck.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/xprt.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+
+static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
+					 int *errp, int flags);
+static void		svc_udp_data_ready(struct sock *, int);
+static int		svc_udp_recvfrom(struct svc_rqst *);
+static int		svc_udp_sendto(struct svc_rqst *);
+static void		svc_sock_detach(struct svc_xprt *);
+static void		svc_tcp_sock_detach(struct svc_xprt *);
+static void		svc_sock_free(struct svc_xprt *);
+
+static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
+					  struct net *, struct sockaddr *,
+					  int, int);
+#if defined(CONFIG_NFS_V4_1)
+static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
+					     struct net *, struct sockaddr *,
+					     int, int);
+static void svc_bc_sock_free(struct svc_xprt *xprt);
+#endif /* CONFIG_NFS_V4_1 */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/* one lock class per address family: index 0 is AF_INET, index 1 is AF_INET6 */
+static struct lock_class_key svc_key[2];
+static struct lock_class_key svc_slock_key[2];
+
+/*
+ * Re-initialise the lock classes and names of @sock's locks with the
+ * NFSD-specific keys above, chosen by the socket's address family.
+ * The socket must not be owned by a user context; any family other
+ * than AF_INET or AF_INET6 is a BUG.
+ */
+static void svc_reclassify_socket(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sock_owned_by_user(sk));
+	switch (sk->sk_family) {
+	case AF_INET:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
+					      &svc_slock_key[0],
+					      "sk_xprt.xpt_lock-AF_INET-NFSD",
+					      &svc_key[0]);
+		break;
+
+	case AF_INET6:
+		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
+					      &svc_slock_key[1],
+					      "sk_xprt.xpt_lock-AF_INET6-NFSD",
+					      &svc_key[1]);
+		break;
+
+	default:
+		BUG();
+	}
+}
+#else
+/* without CONFIG_DEBUG_LOCK_ALLOC there is nothing to reclassify */
+static void svc_reclassify_socket(struct socket *sock)
+{
+}
+#endif
+
+/*
+ * Free the skbuff attached to a request (rq_xprt_ctxt), if there is
+ * one, and detach it from the request.
+ */
+static void svc_release_skb(struct svc_rqst *rqstp)
+{
+	struct sk_buff *skb = rqstp->rq_xprt_ctxt;
+	struct svc_sock *svsk;
+
+	if (!skb)
+		return;
+
+	svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	rqstp->rq_xprt_ctxt = NULL;
+
+	dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+	skb_free_datagram_locked(svsk->sk_sk, skb);
+}
+
+/*
+ * Large enough to hold either family's packet-info control payload;
+ * used only to size the control-message buffer.
+ */
+union svc_pktinfo_u {
+	struct in_pktinfo pkti;
+	struct in6_pktinfo pkti6;
+};
+/* control-buffer space needed for one cmsg carrying the larger pktinfo */
+#define SVC_PKTINFO_SPACE \
+	CMSG_SPACE(sizeof(union svc_pktinfo_u))
+
+/*
+ * Fill in @cmh with a packet-info control message for the reply to
+ * @rqstp, so that it is sent from the address the request was
+ * addressed to (rq_daddr).  The message type is chosen by the socket's
+ * address family; other families leave @cmh untouched.
+ */
+static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
+{
+	struct svc_sock *svsk =
+		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	switch (svsk->sk_sk->sk_family) {
+	case AF_INET: {
+			struct in_pktinfo *pki = CMSG_DATA(cmh);
+
+			cmh->cmsg_level = SOL_IP;
+			cmh->cmsg_type = IP_PKTINFO;
+			/* interface index 0: no specific interface is requested */
+			pki->ipi_ifindex = 0;
+			pki->ipi_spec_dst.s_addr = rqstp->rq_daddr.addr.s_addr;
+			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
+		}
+		break;
+
+	case AF_INET6: {
+			struct in6_pktinfo *pki = CMSG_DATA(cmh);
+
+			cmh->cmsg_level = SOL_IPV6;
+			cmh->cmsg_type = IPV6_PKTINFO;
+			pki->ipi6_ifindex = 0;
+			ipv6_addr_copy(&pki->ipi6_addr,
+					&rqstp->rq_daddr.addr6);
+			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
+		}
+		break;
+	}
+}
+
+/*
+ * send routine intended to be shared by the fore- and back-channel
+ *
+ * Sends the three parts of @xdr in order: the head (found in @headpage
+ * at @headoffset), the page data, and the tail (found in @tailpage at
+ * @tailoffset).  MSG_MORE is set on every piece except the one that
+ * completes xdr->len.
+ *
+ * Returns the number of bytes accepted by the socket.  Sending stops at
+ * the first short or failed kernel_sendpage(); if that happens on the
+ * head, its return value (which may be negative) is returned as is.
+ */
+int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
+		    struct page *headpage, unsigned long headoffset,
+		    struct page *tailpage, unsigned long tailoffset)
+{
+	int		result;
+	int		size;
+	struct page	**ppage = xdr->pages;
+	size_t		base = xdr->page_base;
+	unsigned int	pglen = xdr->page_len;
+	unsigned int	flags = MSG_MORE;
+	int		slen;
+	int		len = 0;
+
+	/* slen counts the bytes still to be sent */
+	slen = xdr->len;
+
+	/* send head */
+	if (slen == xdr->head[0].iov_len)
+		flags = 0;
+	len = kernel_sendpage(sock, headpage, headoffset,
+				  xdr->head[0].iov_len, flags);
+	if (len != xdr->head[0].iov_len)
+		goto out;
+	slen -= xdr->head[0].iov_len;
+	if (slen == 0)
+		goto out;
+
+	/* send page data */
+	/* the first page may start at an offset, so it can hold less than a full page */
+	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
+	while (pglen > 0) {
+		if (slen == size)
+			flags = 0;
+		result = kernel_sendpage(sock, *ppage, base, size, flags);
+		if (result > 0)
+			len += result;
+		if (result != size)
+			goto out;
+		slen -= size;
+		pglen -= size;
+		/* subsequent pages start at offset 0 */
+		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
+		base = 0;
+		ppage++;
+	}
+
+	/* send tail */
+	if (xdr->tail[0].iov_len) {
+		result = kernel_sendpage(sock, tailpage, tailoffset,
+				   xdr->tail[0].iov_len, 0);
+		if (result > 0)
+			len += result;
+	}
+
+out:
+	return len;
+}
+
+
+/*
+ * Generic sendto routine
+ *
+ * For UDP an empty message carrying the destination address and a
+ * packet-info control message is sent first with MSG_MORE; the reply
+ * data then follows via svc_send_common().  Head and tail are both
+ * taken from the first response page.
+ *
+ * Returns the value of svc_send_common(), or 0 when the initial UDP
+ * sock_sendmsg() fails.
+ */
+static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+{
+	struct svc_sock	*svsk =
+		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	struct socket	*sock = svsk->sk_sock;
+	/* control-message buffer, declared as a union with 'long' for alignment */
+	union {
+		struct cmsghdr	hdr;
+		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
+	} buffer;
+	struct cmsghdr *cmh = &buffer.hdr;
+	int		len = 0;
+	unsigned long tailoff;
+	unsigned long headoff;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+	if (rqstp->rq_prot == IPPROTO_UDP) {
+		struct msghdr msg = {
+			.msg_name	= &rqstp->rq_addr,
+			.msg_namelen	= rqstp->rq_addrlen,
+			.msg_control	= cmh,
+			.msg_controllen	= sizeof(buffer),
+			.msg_flags	= MSG_MORE,
+		};
+
+		svc_set_cmsg_data(rqstp, cmh);
+
+		if (sock_sendmsg(sock, &msg, 0) < 0)
+			goto out;
+	}
+
+	/* offset of the tail data within its page */
+	tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
+	headoff = 0;
+	len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
+			       rqstp->rq_respages[0], tailoff);
+
+out:
+	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
+		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
+		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
+
+	return len;
+}
+
+/*
+ * Report socket names for nfsdfs
+ *
+ * Writes one line describing @svsk into @buf (at most @remaining
+ * bytes): "ipv4|ipv6 <udp|tcp> <local address> <port>\n", or
+ * "*unknown-<family>*\n" for other families.
+ *
+ * Returns the length written, or -ENAMETOOLONG when the line does not
+ * fit, in which case @buf is reset to an empty string.
+ */
+static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
+{
+	const struct sock *sk = svsk->sk_sk;
+	/* every protocol other than UDP is reported as "tcp" */
+	const char *proto_name = sk->sk_protocol == IPPROTO_UDP ?
+							"udp" : "tcp";
+	int len;
+
+	switch (sk->sk_family) {
+	case PF_INET:
+		len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n",
+				proto_name,
+				&inet_sk(sk)->inet_rcv_saddr,
+				inet_sk(sk)->inet_num);
+		break;
+	case PF_INET6:
+		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
+				proto_name,
+				&inet6_sk(sk)->rcv_saddr,
+				inet_sk(sk)->inet_num);
+		break;
+	default:
+		len = snprintf(buf, remaining, "*unknown-%d*\n",
+				sk->sk_family);
+	}
+
+	/* snprintf reports the length it wanted: >= remaining means truncation */
+	if (len >= remaining) {
+		*buf = '\0';
+		return -ENAMETOOLONG;
+	}
+	return len;
+}
+
+/**
+ * svc_sock_names - construct a list of listener names in a string
+ * @serv: pointer to RPC service
+ * @buf: pointer to a buffer to fill in with socket names
+ * @buflen: size of the buffer to be filled
+ * @toclose: pointer to '\0'-terminated C string containing the name
+ *		of a listener to be closed
+ *
+ * Walks @serv's permanent sockets under sv_lock and appends one
+ * '\n'-terminated name per socket to @buf.  If @toclose is not NULL,
+ * the socket whose name equals @toclose is closed after the walk and
+ * its name is not counted in the output.
+ *
+ * Returns positive length of the socket name string, a negative
+ * errno value on error, -ENOENT if @toclose named no socket, or zero
+ * when @serv is NULL.
+ */
+int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen,
+		   const char *toclose)
+{
+	struct svc_sock *svsk, *closesk = NULL;
+	int len = 0;
+
+	if (!serv)
+		return 0;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
+		char *slot = buf + len;
+		int onelen = svc_one_sock_name(svsk, slot, buflen - len);
+
+		if (onelen < 0) {
+			len = onelen;
+			break;
+		}
+		if (!toclose || strcmp(toclose, slot) != 0) {
+			len += onelen;
+			continue;
+		}
+		/* pin the victim; it is closed once the lock is dropped */
+		closesk = svsk;
+		svc_xprt_get(&closesk->sk_xprt);
+	}
+	spin_unlock_bh(&serv->sv_lock);
+
+	if (!closesk)
+		return toclose ? -ENOENT : len;
+
+	/* Should unregister with portmap, but you cannot
+	 * unregister just one protocol...
+	 */
+	svc_close_xprt(&closesk->sk_xprt);
+	svc_xprt_put(&closesk->sk_xprt);
+	return len;
+}
+EXPORT_SYMBOL_GPL(svc_sock_names);
+
+/*
+ * Check input queue length.
+ *
+ * Issues a TIOCINQ ioctl on the socket and returns the value it
+ * reports, or the negative error from the ioctl.
+ */
+static int svc_recv_available(struct svc_sock *svsk)
+{
+	int		avail, err;
+
+	err = kernel_sock_ioctl(svsk->sk_sock, TIOCINQ,
+				(unsigned long) &avail);
+	if (err < 0)
+		return err;
+	return avail;
+}
+
+/*
+ * Generic recvfrom routine.
+ *
+ * Does a non-blocking kernel_recvmsg() of up to @buflen bytes into the
+ * @nr entries of @iov, resets rq_xprt_hlen, and returns the
+ * kernel_recvmsg() result.
+ */
+static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
+			int buflen)
+{
+	struct svc_sock *svsk =
+		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	struct msghdr msg = {
+		.msg_flags	= MSG_DONTWAIT,
+	};
+	int received;
+
+	rqstp->rq_xprt_hlen = 0;
+
+	received = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
+				  msg.msg_flags);
+
+	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+		svsk, iov[0].iov_base, iov[0].iov_len, received);
+	return received;
+}
+
+/*
+ * Receive into @iov starting @base bytes into the area it describes.
+ *
+ * With @base == 0 this is plain svc_recvfrom().  Otherwise the kvec
+ * entries wholly covered by @base are skipped, the first partially
+ * covered entry is temporarily advanced by the remaining offset, the
+ * receive is issued on the rest of the array, and that entry is then
+ * restored.  Returns the svc_recvfrom() result.
+ *
+ * The caller must pass a @base smaller than the total length of the
+ * @nr entries; otherwise the entry looked at after the loop lies past
+ * the end of @iov.
+ */
+static int svc_partial_recvfrom(struct svc_rqst *rqstp,
+				struct kvec *iov, int nr,
+				int buflen, unsigned int base)
+{
+	size_t save_iovlen;
+	/* kvec bases are kernel pointers, so no __user annotation here */
+	void *save_iovbase;
+	unsigned int i;
+	int ret;
+
+	if (base == 0)
+		return svc_recvfrom(rqstp, iov, nr, buflen);
+
+	/* skip the entries that were filled completely by earlier reads */
+	for (i = 0; i < nr; i++) {
+		if (iov[i].iov_len > base)
+			break;
+		base -= iov[i].iov_len;
+	}
+	save_iovlen = iov[i].iov_len;
+	save_iovbase = iov[i].iov_base;
+	iov[i].iov_len -= base;
+	iov[i].iov_base += base;
+	ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
+	/* put the entry back the way the caller set it up */
+	iov[i].iov_len = save_iovlen;
+	iov[i].iov_base = save_iovbase;
+	return ret;
+}
+
+/*
+ * Set socket snd and rcv buffer lengths.
+ *
+ * The sk_sndbuf/sk_rcvbuf fields are written directly, to twice the
+ * requested values, because sock_setsockopt limits use to
+ * sysctl_?mem_max, which isn't acceptable.  Until that is made
+ * conditional on not having CAP_SYS_RESOURCE or similar, we go
+ * direct...  DaveM said I could!
+ */
+static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
+				unsigned int rcv)
+{
+	struct sock *sk = sock->sk;
+
+	lock_sock(sk);
+	sk->sk_sndbuf = snd * 2;
+	sk->sk_rcvbuf = rcv * 2;
+	/* invoke the write_space callback after changing the limits */
+	sk->sk_write_space(sk);
+	release_sock(sk);
+}
+/*
+ * INET callback when data has been received on the socket.
+ *
+ * Flags the transport as having data and queues it, then wakes any
+ * sleeper on the socket's wait queue.
+ */
+static void svc_udp_data_ready(struct sock *sk, int count)
+{
+	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
+	wait_queue_head_t *wq = sk_sleep(sk);
+
+	if (svsk != NULL) {
+		struct svc_xprt *xprt = &svsk->sk_xprt;
+
+		dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
+			svsk, sk, count,
+			test_bit(XPT_BUSY, &xprt->xpt_flags));
+		set_bit(XPT_DATA, &xprt->xpt_flags);
+		svc_xprt_enqueue(xprt);
+	}
+
+	if (wq == NULL || !waitqueue_active(wq))
+		return;
+	wake_up_interruptible(wq);
+}
+
+/*
+ * INET callback when space is newly available on the socket.
+ *
+ * Queues the transport (if one is attached to the socket) and wakes
+ * any sleeper on the socket's wait queue.
+ */
+static void svc_write_space(struct sock *sk)
+{
+	struct svc_sock	*svsk = (struct svc_sock *)(sk->sk_user_data);
+	wait_queue_head_t *wq = sk_sleep(sk);
+
+	if (svsk != NULL) {
+		struct svc_xprt *xprt = &svsk->sk_xprt;
+
+		dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
+			svsk, sk, test_bit(XPT_BUSY, &xprt->xpt_flags));
+		svc_xprt_enqueue(xprt);
+	}
+
+	if (wq == NULL || !waitqueue_active(wq))
+		return;
+
+	dprintk("RPC svc_write_space: someone sleeping on %p\n",
+	       svsk);
+	wake_up_interruptible(wq);
+}
+
+/*
+ * TCP flavour of the write_space callback: clears SOCK_NOSPACE on the
+ * owning socket once the stream has at least its minimum write space,
+ * then runs the generic svc_write_space().
+ */
+static void svc_tcp_write_space(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+		if (sock)
+			clear_bit(SOCK_NOSPACE, &sock->flags);
+	}
+	svc_write_space(sk);
+}
+
+/*
+ * See net/ipv4/ip_sockglue.c : ip_cmsg_recv_pktinfo
+ *
+ * Copies ipi_spec_dst from an IP_PKTINFO control message into
+ * rqstp->rq_daddr.  Returns 1 on success, 0 if @cmh is not an
+ * IP_PKTINFO message.
+ */
+static int svc_udp_get_dest_address4(struct svc_rqst *rqstp,
+				     struct cmsghdr *cmh)
+{
+	struct in_pktinfo *pki;
+
+	if (cmh->cmsg_type != IP_PKTINFO)
+		return 0;
+
+	pki = CMSG_DATA(cmh);
+	rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
+	return 1;
+}
+
+/*
+ * See net/ipv6/datagram.c : datagram_recv_ctl
+ *
+ * Copies ipi6_addr from an IPV6_PKTINFO control message into
+ * rqstp->rq_daddr.  Returns 1 on success, 0 if @cmh is not an
+ * IPV6_PKTINFO message.
+ */
+static int svc_udp_get_dest_address6(struct svc_rqst *rqstp,
+				     struct cmsghdr *cmh)
+{
+	struct in6_pktinfo *pki;
+
+	if (cmh->cmsg_type != IPV6_PKTINFO)
+		return 0;
+
+	pki = CMSG_DATA(cmh);
+	ipv6_addr_copy(&rqstp->rq_daddr.addr6, &pki->ipi6_addr);
+	return 1;
+}
+
+/*
+ * Copy the UDP datagram's destination address to the rqstp structure.
+ * The 'destination' address in this case is the address to which the
+ * peer sent the datagram, i.e. our local address. For multihomed
+ * hosts, this can change from msg to msg. Note that only the IP
+ * address changes, the port number should remain the same.
+ *
+ * Dispatches on the control message level; returns 1 if an address
+ * was extracted and 0 otherwise.
+ */
+static int svc_udp_get_dest_address(struct svc_rqst *rqstp,
+				    struct cmsghdr *cmh)
+{
+	if (cmh->cmsg_level == SOL_IP)
+		return svc_udp_get_dest_address4(rqstp, cmh);
+	if (cmh->cmsg_level == SOL_IPV6)
+		return svc_udp_get_dest_address6(rqstp, cmh);
+
+	/* neither IPv4 nor IPv6 ancillary data */
+	return 0;
+}
+
+/*
+ * Receive a datagram from a UDP socket.
+ *
+ * Peeks at the socket to collect the sender address and the packet
+ * info control message, dequeues the skb, records the destination
+ * address, and then either copies the payload into rq_arg (non-linear
+ * skb) or points rq_arg at the skb data in place (the skb is then kept
+ * in rq_xprt_ctxt).
+ *
+ * Returns the payload length on success, 0 when the datagram was
+ * dropped (unknown control message or checksum failure), -EAGAIN when
+ * no datagram could be dequeued, or -EAFNOSUPPORT when the sender's
+ * address family is not supported.  The skb is freed on every drop
+ * path.
+ */
+static int svc_udp_recvfrom(struct svc_rqst *rqstp)
+{
+	struct svc_sock	*svsk =
+		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
+	struct sk_buff	*skb;
+	union {
+		struct cmsghdr	hdr;
+		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
+	} buffer;
+	struct cmsghdr *cmh = &buffer.hdr;
+	struct msghdr msg = {
+		.msg_name = svc_addr(rqstp),
+		.msg_control = cmh,
+		.msg_controllen = sizeof(buffer),
+		.msg_flags = MSG_DONTWAIT,
+	};
+	size_t len;
+	int err;
+
+	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
+		/* udp sockets need large rcvbuf as all pending
+		 * requests are still in that buffer.  sndbuf must
+		 * also be large enough that there is enough space
+		 * for one reply per thread.  We count all threads
+		 * rather than threads in a particular pool, which
+		 * provides an upper bound on the number of threads
+		 * which will access the socket.
+		 */
+		svc_sock_setbufsize(svsk->sk_sock,
+				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
+				    (serv->sv_nrthreads+3) * serv->sv_max_mesg);
+
+	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+	skb = NULL;
+	/* zero-length peek: fills in msg_name and the control message only */
+	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
+			     0, 0, MSG_PEEK | MSG_DONTWAIT);
+	if (err >= 0)
+		skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+
+	if (skb == NULL) {
+		if (err != -EAGAIN) {
+			/* possibly an icmp error */
+			dprintk("svc: recvfrom returned error %d\n", -err);
+			set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+		}
+		return -EAGAIN;
+	}
+	len = svc_addr_len(svc_addr(rqstp));
+	if (len == 0) {
+		/* we own the dequeued skb; release it before bailing out */
+		skb_free_datagram_locked(svsk->sk_sk, skb);
+		return -EAFNOSUPPORT;
+	}
+	rqstp->rq_addrlen = len;
+	if (skb->tstamp.tv64 == 0) {
+		skb->tstamp = ktime_get_real();
+		/* Don't enable netstamp, sunrpc doesn't
+		   need that much accuracy */
+	}
+	svsk->sk_sk->sk_stamp = skb->tstamp;
+	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
+
+	len  = skb->len - sizeof(struct udphdr);
+	rqstp->rq_arg.len = len;
+
+	rqstp->rq_prot = IPPROTO_UDP;
+
+	if (!svc_udp_get_dest_address(rqstp, cmh)) {
+		if (net_ratelimit())
+			printk(KERN_WARNING
+				"svc: received unknown control message %d/%d; "
+				"dropping RPC reply datagram\n",
+					cmh->cmsg_level, cmh->cmsg_type);
+		skb_free_datagram_locked(svsk->sk_sk, skb);
+		return 0;
+	}
+
+	if (skb_is_nonlinear(skb)) {
+		/* we have to copy */
+		local_bh_disable();
+		if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
+			local_bh_enable();
+			/* checksum error */
+			skb_free_datagram_locked(svsk->sk_sk, skb);
+			return 0;
+		}
+		local_bh_enable();
+		skb_free_datagram_locked(svsk->sk_sk, skb);
+	} else {
+		/* we can use it in-place */
+		rqstp->rq_arg.head[0].iov_base = skb->data +
+			sizeof(struct udphdr);
+		rqstp->rq_arg.head[0].iov_len = len;
+		if (skb_checksum_complete(skb)) {
+			skb_free_datagram_locked(svsk->sk_sk, skb);
+			return 0;
+		}
+		/* kept until the transport's release_rqst hook runs */
+		rqstp->rq_xprt_ctxt = skb;
+	}
+
+	rqstp->rq_arg.page_base = 0;
+	if (len <= rqstp->rq_arg.head[0].iov_len) {
+		rqstp->rq_arg.head[0].iov_len = len;
+		rqstp->rq_arg.page_len = 0;
+		rqstp->rq_respages = rqstp->rq_pages+1;
+	} else {
+		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
+		rqstp->rq_respages = rqstp->rq_pages + 1 +
+			DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
+	}
+
+	if (serv->sv_stats)
+		serv->sv_stats->netudpcnt++;
+
+	return len;
+}
+
+/*
+ * Send the reply in rq_res over UDP.  A first attempt that fails with
+ * -ECONNREFUSED (ICMP error on an earlier request) is retried once.
+ */
+static int
+svc_udp_sendto(struct svc_rqst *rqstp)
+{
+	int		status = svc_sendto(rqstp, &rqstp->rq_res);
+
+	if (status != -ECONNREFUSED)
+		return status;
+
+	/* ICMP error on earlier request. */
+	return svc_sendto(rqstp, &rqstp->rq_res);
+}
+
+/*
+ * Reply header hook for UDP: nothing is added to the reply (compare
+ * svc_tcp_prep_reply_hdr(), which reserves a record length word).
+ */
+static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+/*
+ * Report whether the UDP socket has room for another reply: twice the
+ * sum of the space already reserved on the transport and one maximum
+ * sized message must fit in the socket's write space.  Returns 1 if
+ * so (SOCK_NOSPACE cleared), 0 otherwise (SOCK_NOSPACE left set).
+ */
+static int svc_udp_has_wspace(struct svc_xprt *xprt)
+{
+	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct svc_serv	*serv = xprt->xpt_server;
+	unsigned long *sockflags = &svsk->sk_sock->flags;
+	unsigned long needed;
+
+	/*
+	 * Set the SOCK_NOSPACE flag before checking the available
+	 * sock space.
+	 */
+	set_bit(SOCK_NOSPACE, sockflags);
+	needed = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
+	if (needed*2 <= sock_wspace(svsk->sk_sk)) {
+		clear_bit(SOCK_NOSPACE, sockflags);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Accept hook for UDP.  Calling it is treated as a bug: the function
+ * BUG()s and its NULL return only exists to satisfy the prototype.
+ */
+static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
+{
+	BUG();
+	return NULL;
+}
+
+/*
+ * xpo_create hook for the "udp" transport class: forwards all
+ * arguments to svc_create_socket() with IPPROTO_UDP and returns its
+ * result.
+ */
+static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
+				       struct net *net,
+				       struct sockaddr *sa, int salen,
+				       int flags)
+{
+	return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
+}
+
+/* Transport operations for UDP sockets; installed via svc_udp_class. */
+static struct svc_xprt_ops svc_udp_ops = {
+	.xpo_create = svc_udp_create,
+	.xpo_recvfrom = svc_udp_recvfrom,
+	.xpo_sendto = svc_udp_sendto,
+	.xpo_release_rqst = svc_release_skb,
+	.xpo_detach = svc_sock_detach,
+	.xpo_free = svc_sock_free,
+	.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
+	.xpo_has_wspace = svc_udp_has_wspace,
+	.xpo_accept = svc_udp_accept,	/* BUG()s: never valid for UDP */
+};
+
+/*
+ * The "udp" transport class; registered by svc_init_xprt_sock() and
+ * unregistered by svc_cleanup_xprt_sock().
+ */
+static struct svc_xprt_class svc_udp_class = {
+	.xcl_name = "udp",
+	.xcl_owner = THIS_MODULE,
+	.xcl_ops = &svc_udp_ops,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
+};
+
+/*
+ * Set up @svsk as a UDP transport of @serv: initialise the generic
+ * transport, install the data_ready/write_space callbacks, size the
+ * socket buffers, and ask the stack to deliver packet info (the
+ * destination address) with each datagram.
+ */
+static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
+{
+	struct sock *sk = svsk->sk_sk;
+	int err, level, optname, one = 1;
+
+	svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
+	clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+	sk->sk_data_ready = svc_udp_data_ready;
+	sk->sk_write_space = svc_write_space;
+
+	/* initialise setting must have enough space to
+	 * receive and respond to one request.
+	 * svc_udp_recvfrom will re-adjust if necessary
+	 */
+	svc_sock_setbufsize(svsk->sk_sock,
+			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
+
+	/* data might have come in before data_ready set up */
+	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+	set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
+
+	/* make sure we get destination address info */
+	if (sk->sk_family == AF_INET) {
+		level = SOL_IP;
+		optname = IP_PKTINFO;
+	} else if (sk->sk_family == AF_INET6) {
+		level = SOL_IPV6;
+		optname = IPV6_RECVPKTINFO;
+	} else {
+		BUG();
+	}
+	err = kernel_setsockopt(svsk->sk_sock, level, optname,
+					(char *)&one, sizeof(one));
+	dprintk("svc: kernel_setsockopt returned %d\n", err);
+}
+
+/*
+ * A data_ready event on a listening socket means there's a connection
+ * pending. Do not use state_change as a substitute for it.
+ */
+static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+{
+	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
+	wait_queue_head_t *wq;
+
+	dprintk("svc: socket %p TCP (listen) state change %d\n",
+		sk, sk->sk_state);
+
+	/*
+	 * This callback may be called twice when a new connection
+	 * is established, as a child socket inherits everything
+	 * from a parent LISTEN socket.
+	 * 1) data_ready method of the parent socket will be called
+	 *    when one of the child sockets becomes ESTABLISHED.
+	 * 2) data_ready method of the child socket may be called
+	 *    when it receives data before the socket is accepted.
+	 * In case of 2, we should ignore it silently.
+	 */
+	if (sk->sk_state != TCP_LISTEN)
+		goto wake;
+
+	if (svsk == NULL) {
+		printk("svc: socket %p: no user data\n", sk);
+		goto wake;
+	}
+	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+	svc_xprt_enqueue(&svsk->sk_xprt);
+
+wake:
+	wq = sk_sleep(sk);
+	if (wq && waitqueue_active(wq))
+		wake_up_interruptible_all(wq);
+}
+
+/*
+ * A state change on a connected socket means it's dying or dead.
+ *
+ * Marks the transport for close and queues it, then wakes every
+ * sleeper on the socket's wait queue.
+ */
+static void svc_tcp_state_change(struct sock *sk)
+{
+	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
+	wait_queue_head_t *wq = sk_sleep(sk);
+
+	dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
+		sk, sk->sk_state, sk->sk_user_data);
+
+	if (svsk != NULL) {
+		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+		svc_xprt_enqueue(&svsk->sk_xprt);
+	} else {
+		printk("svc: socket %p: no user data\n", sk);
+	}
+
+	if (wq == NULL || !waitqueue_active(wq))
+		return;
+	wake_up_interruptible_all(wq);
+}
+
+/*
+ * data_ready callback for a connected TCP socket: flags the transport
+ * as having data, queues it, and wakes a sleeper on the socket.
+ */
+static void svc_tcp_data_ready(struct sock *sk, int count)
+{
+	struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
+	wait_queue_head_t *wq = sk_sleep(sk);
+
+	dprintk("svc: socket %p TCP data ready (svsk %p)\n",
+		sk, sk->sk_user_data);
+
+	if (svsk != NULL) {
+		struct svc_xprt *xprt = &svsk->sk_xprt;
+
+		set_bit(XPT_DATA, &xprt->xpt_flags);
+		svc_xprt_enqueue(xprt);
+	}
+
+	if (wq == NULL || !waitqueue_active(wq))
+		return;
+	wake_up_interruptible(wq);
+}
+
+/*
+ * Accept a TCP connection.
+ *
+ * Accepts one pending connection on the listener @xprt without
+ * blocking, wraps the new socket in a temporary, anonymous svc_sock
+ * and records its remote and local addresses.  Returns the new
+ * transport, or NULL if nothing could be accepted or set up (the new
+ * socket is released in that case).
+ */
+static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+{
+	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
+	struct socket	*sock = svsk->sk_sock;
+	struct sockaddr_storage addr;
+	struct sockaddr	*sin = (struct sockaddr *) &addr;
+	struct socket	*newsock;
+	struct svc_sock	*newsvsk;
+	int		err, slen;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+	if (sock == NULL)
+		return NULL;
+
+	clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+	err = kernel_accept(sock, &newsock, O_NONBLOCK);
+	if (err < 0) {
+		if (err == -ENOMEM) {
+			printk(KERN_WARNING "%s: no more sockets!\n",
+			       serv->sv_name);
+		} else if (err != -EAGAIN && net_ratelimit()) {
+			printk(KERN_WARNING "%s: accept failed (err %d)!\n",
+				   serv->sv_name, -err);
+		}
+		return NULL;
+	}
+	/* accept succeeded: flag the listener again */
+	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+
+	err = kernel_getpeername(newsock, sin, &slen);
+	if (err < 0) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "%s: peername failed (err %d)!\n",
+				   serv->sv_name, -err);
+		goto release;		/* aborted connection or whatever */
+	}
+
+	/* Ideally, we would want to reject connections from unauthorized
+	 * hosts here, but when we get encryption, the IP of the host won't
+	 * tell us anything.  For now just warn about unpriv connections.
+	 */
+	if (!svc_port_is_privileged(sin)) {
+		dprintk(KERN_WARNING
+			"%s: connect from unprivileged port: %s\n",
+			serv->sv_name,
+			__svc_print_addr(sin, buf, sizeof(buf)));
+	}
+	dprintk("%s: connect from %s\n", serv->sv_name,
+		__svc_print_addr(sin, buf, sizeof(buf)));
+
+	/* make sure that a write doesn't block forever when
+	 * low on memory
+	 */
+	newsock->sk->sk_sndtimeo = HZ*30;
+
+	newsvsk = svc_setup_socket(serv, newsock, &err,
+				   (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY));
+	if (newsvsk == NULL)
+		goto release;
+
+	svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
+
+	/* reuse the address buffer for the local end of the connection */
+	err = kernel_getsockname(newsock, sin, &slen);
+	if (unlikely(err < 0)) {
+		dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
+		slen = offsetof(struct sockaddr, sa_data);
+	}
+	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
+
+	if (serv->sv_stats)
+		serv->sv_stats->nettcpconn++;
+
+	return &newsvsk->sk_xprt;
+
+release:
+	sock_release(newsock);
+	return NULL;
+}
+
+/*
+ * Move the pages holding a partially received record from @svsk back
+ * into @rqstp, dropping whatever pages @rqstp had in those slots, and
+ * point the argument head at the first page.  Returns the number of
+ * record bytes already received (sk_tcplen minus the fragment
+ * header), or 0 if no record data has been received yet.
+ */
+static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	unsigned int idx, saved, count;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		return 0;
+
+	saved = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	count = (saved + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (idx = 0; idx < count; idx++) {
+		struct page **slot = &rqstp->rq_pages[idx];
+
+		if (*slot != NULL)
+			put_page(*slot);
+		BUG_ON(svsk->sk_pages[idx] == NULL);
+		*slot = svsk->sk_pages[idx];
+		svsk->sk_pages[idx] = NULL;
+	}
+	rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]);
+	return saved;
+}
+
+/*
+ * Stash the pages that hold the record data received so far in @svsk,
+ * clearing the matching slots in @rqstp.  Does nothing if no record
+ * data beyond the fragment header has been received.
+ */
+static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	unsigned int idx, received, count;
+
+	if (svsk->sk_tcplen <= sizeof(rpc_fraghdr))
+		return;
+
+	received = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+	count = (received + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (idx = 0; idx < count; idx++) {
+		svsk->sk_pages[idx] = rqstp->rq_pages[idx];
+		rqstp->rq_pages[idx] = NULL;
+	}
+}
+
+/*
+ * Release the pages saved in @svsk for a partially received record
+ * and reset sk_tcplen to zero.
+ */
+static void svc_tcp_clear_pages(struct svc_sock *svsk)
+{
+	if (svsk->sk_tcplen > sizeof(rpc_fraghdr)) {
+		unsigned int received = svsk->sk_tcplen - sizeof(rpc_fraghdr);
+		unsigned int count = (received + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		unsigned int idx;
+
+		for (idx = 0; idx < count; idx++) {
+			BUG_ON(svsk->sk_pages[idx] == NULL);
+			put_page(svsk->sk_pages[idx]);
+			svsk->sk_pages[idx] = NULL;
+		}
+	}
+	svsk->sk_tcplen = 0;
+}
+
+/*
+ * Receive data.
+ * If we haven't gotten the record length yet, get the next four bytes.
+ * Otherwise try to gobble up as much as possible up to the complete
+ * record length.
+ *
+ * Returns the record length once the fragment header is complete and
+ * acceptable, -EAGAIN on a short header read, a negative receive
+ * error, or -EAGAIN with XPT_CLOSE set when the header is rejected.
+ */
+static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
+	unsigned int want;
+	int len;
+
+	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+
+	if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) {
+		struct kvec	iov;
+
+		/* read the missing header bytes straight into sk_reclen */
+		want = sizeof(rpc_fraghdr) - svsk->sk_tcplen;
+		iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
+		iov.iov_len  = want;
+		len = svc_recvfrom(rqstp, &iov, 1, want);
+		if (len < 0) {
+			dprintk("RPC: TCP recv_record got %d\n", len);
+			return len;
+		}
+		svsk->sk_tcplen += len;
+
+		if (len < want) {
+			dprintk("svc: short recvfrom while reading record "
+				"length (%d of %d)\n", len, want);
+			return -EAGAIN;
+		}
+
+		svsk->sk_reclen = ntohl(svsk->sk_reclen);
+		if (!(svsk->sk_reclen & RPC_LAST_STREAM_FRAGMENT)) {
+			/* FIXME: technically, a record can be fragmented,
+			 *  and non-terminal fragments will not have the top
+			 *  bit set in the fragment length header.
+			 *  But apparently no known nfs clients send fragmented
+			 *  records. */
+			if (net_ratelimit())
+				printk(KERN_NOTICE "RPC: multiple fragments "
+					"per record not supported\n");
+			goto close_xprt;
+		}
+
+		svsk->sk_reclen &= RPC_FRAGMENT_SIZE_MASK;
+		dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
+		if (svsk->sk_reclen > serv->sv_max_mesg) {
+			if (net_ratelimit())
+				printk(KERN_NOTICE "RPC: "
+					"fragment too large: 0x%08lx\n",
+					(unsigned long)svsk->sk_reclen);
+			goto close_xprt;
+		}
+	}
+
+	if (svsk->sk_reclen < 8)
+		goto close_xprt; /* client is nuts. */
+
+	len = svsk->sk_reclen;
+	return len;
+
+close_xprt:
+	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+	return -EAGAIN;
+}
+
+/*
+ * Deliver a reply that arrived on a server socket to the backchannel
+ * request that is waiting for it.
+ *
+ * The XID in the received head is looked up on the socket's
+ * backchannel transport; on a match the head is copied into the
+ * request's private buffer and the request is completed.  The lookup,
+ * copy and completion are done under the backchannel transport's
+ * transport_lock, which xprt_complete_rqst() expects its caller to
+ * hold and which keeps the request from disappearing in between.
+ *
+ * Returns 0 when the reply was delivered (rq_arg.len is zeroed), or
+ * -EAGAIN when there is no backchannel, no matching request, or the
+ * request's head buffer is too small.
+ */
+static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
+{
+	struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt;
+	struct rpc_rqst *req = NULL;
+	struct kvec *src, *dst;
+	__be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+	__be32 xid;
+	__be32 calldir;
+	int ret = -EAGAIN;
+
+	xid = *p++;
+	calldir = *p;
+
+	if (!bc_xprt)
+		goto not_found;
+
+	spin_lock_bh(&bc_xprt->transport_lock);
+	req = xprt_lookup_rqst(bc_xprt, xid);
+	if (!req) {
+		spin_unlock_bh(&bc_xprt->transport_lock);
+		goto not_found;
+	}
+
+	memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+	/*
+	 * XXX!: cheating for now!  Only copying HEAD.
+	 * But we know this is good enough for now (in fact, for any
+	 * callback reply in the foreseeable future).
+	 */
+	dst = &req->rq_private_buf.head[0];
+	src = &rqstp->rq_arg.head[0];
+	if (dst->iov_len < src->iov_len)
+		goto unlock; /* whatever; just giving up. */
+	memcpy(dst->iov_base, src->iov_base, src->iov_len);
+	xprt_complete_rqst(req->rq_task, svsk->sk_reclen);
+	rqstp->rq_arg.len = 0;
+	ret = 0;
+unlock:
+	spin_unlock_bh(&bc_xprt->transport_lock);
+	return ret;
+
+not_found:
+	printk(KERN_NOTICE
+		"%s: Got unrecognized reply: "
+		"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
+		__func__, ntohl(calldir),
+		bc_xprt, xid);
+	return -EAGAIN;
+}
+
+/*
+ * Describe enough whole pages from @pages to cover @len bytes, one
+ * PAGE_SIZE kvec per page, in @vec.  Returns the number of kvecs
+ * filled in.
+ */
+static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
+{
+	int used;
+	int covered;
+
+	for (used = 0, covered = 0; covered < len; used++) {
+		vec[used].iov_base = page_address(pages[used]);
+		vec[used].iov_len = PAGE_SIZE;
+		covered += PAGE_SIZE;
+	}
+	return used;
+}
+
+
+/*
+ * Receive data from a TCP socket.
+ *
+ * Reads (or continues reading) one RPC record.  The fragment header is
+ * handled by svc_tcp_recv_record(); the record body is received into
+ * the request's pages, resuming after any bytes saved from an earlier
+ * partial read.  If the record is still incomplete, the pages are
+ * parked in the svc_sock and -EAGAIN is returned.  A complete record
+ * whose second word (the call direction) is non-zero is handed to
+ * receive_cb_reply() instead of being returned as a request.
+ *
+ * Returns the record length on success and -EAGAIN otherwise; errors
+ * other than -EAGAIN additionally mark the transport for close.
+ */
+static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
+{
+	struct svc_sock	*svsk =
+		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
+	int		len;
+	struct kvec *vec;
+	unsigned int want, base;
+	__be32 *p;
+	__be32 calldir;
+	int pnum;
+
+	dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
+		svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
+		test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
+		test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
+
+	len = svc_tcp_recv_record(svsk, rqstp);
+	if (len < 0)
+		goto error;
+
+	/* base: record bytes already received; want: bytes still missing */
+	base = svc_tcp_restore_pages(svsk, rqstp);
+	want = svsk->sk_reclen - base;
+
+	vec = rqstp->rq_vec;
+
+	pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0],
+						svsk->sk_reclen);
+
+	/* response pages start right after those used for the request */
+	rqstp->rq_respages = &rqstp->rq_pages[pnum];
+
+	/* Now receive data */
+	len = svc_partial_recvfrom(rqstp, vec, pnum, want, base);
+	if (len >= 0)
+		svsk->sk_tcplen += len;
+	if (len != want) {
+		if (len < 0 && len != -EAGAIN)
+			goto err_other;
+		/* short read: keep the pages for the next attempt */
+		svc_tcp_save_pages(svsk, rqstp);
+		dprintk("svc: incomplete TCP record (%d of %d)\n",
+			svsk->sk_tcplen, svsk->sk_reclen);
+		goto err_noclose;
+	}
+
+	rqstp->rq_arg.len = svsk->sk_reclen;
+	rqstp->rq_arg.page_base = 0;
+	if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
+		rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
+		rqstp->rq_arg.page_len = 0;
+	} else
+		rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+
+	rqstp->rq_xprt_ctxt   = NULL;
+	rqstp->rq_prot	      = IPPROTO_TCP;
+
+	/* second word of the message; non-zero is treated as a reply */
+	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+	calldir = p[1];
+	if (calldir)
+		len = receive_cb_reply(svsk, rqstp);
+
+	/* Reset TCP read info */
+	svsk->sk_reclen = 0;
+	svsk->sk_tcplen = 0;
+	/* If we have more data, signal svc_xprt_enqueue() to try again */
+	/* NOTE(review): int vs size_t comparison, so a negative error from
+	 * svc_recv_available() also sets XPT_DATA - confirm intended. */
+	if (svc_recv_available(svsk) > sizeof(rpc_fraghdr))
+		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+
+	if (len < 0)
+		goto error;
+
+	svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
+	if (serv->sv_stats)
+		serv->sv_stats->nettcpcnt++;
+
+	dprintk("svc: TCP complete record (%d bytes)\n", rqstp->rq_arg.len);
+	return rqstp->rq_arg.len;
+
+error:
+	if (len != -EAGAIN)
+		goto err_other;
+	dprintk("RPC: TCP recvfrom got EAGAIN\n");
+	return -EAGAIN;
+err_other:
+	printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
+	       svsk->sk_xprt.xpt_server->sv_name, -len);
+	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+err_noclose:
+	return -EAGAIN;	/* record not complete */
+}
+
+/*
+ * Send out data on TCP socket.
+ *
+ * Fills in the 4-byte record marker at the start of the reply head
+ * (last-fragment bit plus the length of everything after the marker)
+ * and sends rq_res.  If anything other than the full reply is sent,
+ * the transport is marked for close and queued, and -EAGAIN is
+ * returned; otherwise the number of bytes sent is returned.
+ */
+static int svc_tcp_sendto(struct svc_rqst *rqstp)
+{
+	struct xdr_buf	*xbufp = &rqstp->rq_res;
+	__be32 marker;
+	int sent;
+
+	/* Set up the first element of the reply kvec.
+	 * Any other kvecs that may be in use have been taken
+	 * care of by the server implementation itself.
+	 */
+	marker = htonl(0x80000000|((xbufp->len ) - 4));
+	memcpy(xbufp->head[0].iov_base, &marker, 4);
+
+	sent = svc_sendto(rqstp, &rqstp->rq_res);
+	if (sent == xbufp->len)
+		return sent;
+
+	printk(KERN_NOTICE
+	       "rpc-srv/tcp: %s: %s %d when sending %d bytes "
+	       "- shutting down socket\n",
+	       rqstp->rq_xprt->xpt_server->sv_name,
+	       (sent<0)?"got error":"sent only",
+	       sent, xbufp->len);
+	set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
+	svc_xprt_enqueue(rqstp->rq_xprt);
+	return -EAGAIN;
+}
+
+/*
+ * Setup response header. TCP has a 4B record length field.
+ *
+ * A zero placeholder word is appended to the reply head here;
+ * svc_tcp_sendto() overwrites it with the real record marker.
+ */
+static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+	/* tcp needs a space for the record length... */
+	svc_putnl(&rqstp->rq_res.head[0], 0);
+}
+
+/*
+ * Report whether the TCP socket has room for another reply.  Listeners
+ * always do.  Otherwise the stream's write space must cover the space
+ * already reserved on the transport plus one maximum sized message;
+ * if it does not, SOCK_NOSPACE is set on the socket and 0 returned.
+ */
+static int svc_tcp_has_wspace(struct svc_xprt *xprt)
+{
+	struct svc_sock *svsk =	container_of(xprt, struct svc_sock, sk_xprt);
+	struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+	int needed;
+
+	if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
+		return 1;
+
+	needed = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
+	if (sk_stream_wspace(svsk->sk_sk) < needed) {
+		set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * xpo_create hook for the "tcp" transport class: forwards all
+ * arguments to svc_create_socket() with IPPROTO_TCP and returns its
+ * result.
+ */
+static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
+				       struct net *net,
+				       struct sockaddr *sa, int salen,
+				       int flags)
+{
+	return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
+}
+
+/*
+ * Backchannel ("tcp-bc") transport class.  Only built when
+ * CONFIG_NFS_V4_1 is set; otherwise the init/cleanup hooks below are
+ * empty stubs so svc_init_xprt_sock()/svc_cleanup_xprt_sock() can call
+ * them unconditionally.
+ */
+#if defined(CONFIG_NFS_V4_1)
+static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
+					     struct net *, struct sockaddr *,
+					     int, int);
+static void svc_bc_sock_free(struct svc_xprt *xprt);
+
+/* xpo_create hook: forwards to svc_bc_create_socket() with IPPROTO_TCP. */
+static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv,
+				       struct net *net,
+				       struct sockaddr *sa, int salen,
+				       int flags)
+{
+	return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
+}
+
+/* xpo_detach hook: nothing to undo for a backchannel transport. */
+static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt)
+{
+}
+
+/* Only create/detach/free/prep_reply_hdr are provided for "tcp-bc". */
+static struct svc_xprt_ops svc_tcp_bc_ops = {
+	.xpo_create = svc_bc_tcp_create,
+	.xpo_detach = svc_bc_tcp_sock_detach,
+	.xpo_free = svc_bc_sock_free,
+	.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+};
+
+static struct svc_xprt_class svc_tcp_bc_class = {
+	.xcl_name = "tcp-bc",
+	.xcl_owner = THIS_MODULE,
+	.xcl_ops = &svc_tcp_bc_ops,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+
+/* Register the "tcp-bc" class. */
+static void svc_init_bc_xprt_sock(void)
+{
+	svc_reg_xprt_class(&svc_tcp_bc_class);
+}
+
+/* Unregister the "tcp-bc" class. */
+static void svc_cleanup_bc_xprt_sock(void)
+{
+	svc_unreg_xprt_class(&svc_tcp_bc_class);
+}
+#else /* CONFIG_NFS_V4_1 */
+/* No backchannel support configured: nothing to register. */
+static void svc_init_bc_xprt_sock(void)
+{
+}
+
+/* No backchannel support configured: nothing to unregister. */
+static void svc_cleanup_bc_xprt_sock(void)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/* Transport operations for TCP sockets; installed via svc_tcp_class. */
+static struct svc_xprt_ops svc_tcp_ops = {
+	.xpo_create = svc_tcp_create,
+	.xpo_recvfrom = svc_tcp_recvfrom,
+	.xpo_sendto = svc_tcp_sendto,
+	.xpo_release_rqst = svc_release_skb,
+	.xpo_detach = svc_tcp_sock_detach,
+	.xpo_free = svc_sock_free,
+	.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+	.xpo_has_wspace = svc_tcp_has_wspace,
+	.xpo_accept = svc_tcp_accept,
+};
+
+/*
+ * The "tcp" transport class; registered by svc_init_xprt_sock() and
+ * unregistered by svc_cleanup_xprt_sock().
+ */
+static struct svc_xprt_class svc_tcp_class = {
+	.xcl_name = "tcp",
+	.xcl_owner = THIS_MODULE,
+	.xcl_ops = &svc_tcp_ops,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+
+/*
+ * Register the socket based transport classes: "tcp", "udp" and, via
+ * svc_init_bc_xprt_sock(), the backchannel class when configured.
+ */
+void svc_init_xprt_sock(void)
+{
+	svc_reg_xprt_class(&svc_tcp_class);
+	svc_reg_xprt_class(&svc_udp_class);
+	svc_init_bc_xprt_sock();
+}
+
+/*
+ * Unregister everything svc_init_xprt_sock() registered.
+ */
+void svc_cleanup_xprt_sock(void)
+{
+	svc_unreg_xprt_class(&svc_tcp_class);
+	svc_unreg_xprt_class(&svc_udp_class);
+	svc_cleanup_bc_xprt_sock();
+}
+
+/*
+ * Set up @svsk as a TCP transport of @serv.  A socket in TCP_LISTEN
+ * state becomes a listener (only data_ready is hooked); any other
+ * socket gets the state_change/data_ready/write_space callbacks, has
+ * its record state reset and Nagle turned off, and is marked for
+ * close straight away if it is not in TCP_ESTABLISHED state.
+ */
+static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
+{
+	struct sock	*sk = svsk->sk_sk;
+	unsigned long	*flags = &svsk->sk_xprt.xpt_flags;
+
+	svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
+	set_bit(XPT_CACHE_AUTH, flags);
+
+	if (sk->sk_state == TCP_LISTEN) {
+		dprintk("setting up TCP socket for listening\n");
+		set_bit(XPT_LISTENER, flags);
+		sk->sk_data_ready = svc_tcp_listen_data_ready;
+		set_bit(XPT_CONN, flags);
+		return;
+	}
+
+	dprintk("setting up TCP socket for reading\n");
+	sk->sk_state_change = svc_tcp_state_change;
+	sk->sk_data_ready = svc_tcp_data_ready;
+	sk->sk_write_space = svc_tcp_write_space;
+
+	/* no record in progress yet */
+	svsk->sk_reclen = 0;
+	svsk->sk_tcplen = 0;
+	memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages));
+
+	tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+
+	set_bit(XPT_DATA, flags);
+	if (sk->sk_state != TCP_ESTABLISHED)
+		set_bit(XPT_CLOSE, flags);
+}
+
+/*
+ * The number of server threads has changed.  Flag every permanent and
+ * temporary socket of @serv with XPT_CHNGBUF so that rcvbuf and
+ * sndbuf get updated accordingly.
+ */
+void svc_sock_update_bufs(struct svc_serv *serv)
+{
+	struct svc_xprt *xprt;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
+		set_bit(XPT_CHNGBUF, &xprt->xpt_flags);
+	list_for_each_entry(xprt, &serv->sv_tempsocks, xpt_list)
+		set_bit(XPT_CHNGBUF, &xprt->xpt_flags);
+	spin_unlock_bh(&serv->sv_lock);
+}
+EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
+
+/*
+ * Initialize socket for RPC use and create svc_sock struct
+ * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
+ *
+ * Unless SVC_SOCK_ANONYMOUS is set in @flags (or *@errp is already
+ * negative), the socket's port is first registered via
+ * svc_register().  On success the socket's original callbacks are
+ * saved in the new svc_sock and the UDP or TCP specific init routine
+ * is run.  Returns the new svc_sock, or NULL with *@errp holding a
+ * negative errno.
+ */
+static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
+						struct socket *sock,
+						int *errp, int flags)
+{
+	struct svc_sock	*svsk;
+	struct sock	*inet;
+	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+
+	dprintk("svc: svc_setup_socket %p\n", sock);
+	svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+	if (svsk == NULL) {
+		*errp = -ENOMEM;
+		return NULL;
+	}
+
+	inet = sock->sk;
+
+	/* Register socket with portmapper */
+	if (*errp >= 0 && pmap_register)
+		*errp = svc_register(serv, inet->sk_family, inet->sk_protocol,
+				     ntohs(inet_sk(inet)->inet_sport));
+
+	if (*errp < 0) {
+		kfree(svsk);
+		return NULL;
+	}
+
+	/* tie socket and svc_sock together, remembering the old callbacks */
+	inet->sk_user_data = svsk;
+	svsk->sk_sock = sock;
+	svsk->sk_sk = inet;
+	svsk->sk_ostate = inet->sk_state_change;
+	svsk->sk_odata = inet->sk_data_ready;
+	svsk->sk_owspace = inet->sk_write_space;
+
+	/* Initialize the socket */
+	if (sock->type != SOCK_DGRAM) {
+		/* initialise setting must have enough space to
+		 * receive and respond to one request.
+		 */
+		svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
+					4 * serv->sv_max_mesg);
+		svc_tcp_init(svsk, serv);
+	} else {
+		svc_udp_init(svsk, serv);
+	}
+
+	dprintk("svc: svc_setup_socket created %p (inet %p)\n",
+				svsk, svsk->sk_sk);
+
+	return svsk;
+}
+
+/**
+ * svc_addsock - add a listener socket to an RPC service
+ * @serv: pointer to RPC service to which to add a new listener
+ * @fd: file descriptor of the new listener
+ * @name_return: pointer to buffer to fill in with name of listener
+ * @len: size of the buffer
+ *
+ * Only unconnected PF_INET/PF_INET6 TCP or UDP sockets are accepted.
+ * On success the socket is added to @serv's permanent sockets and a
+ * reference on this module is held for it; on failure the reference
+ * taken on the socket by the fd lookup is dropped again.
+ *
+ * Fills in socket name and returns positive length of name if successful.
+ * Name is terminated with '\n'.  On error, returns a negative errno
+ * value.
+ */
+int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
+		const size_t len)
+{
+	int err = 0;
+	struct socket *so = sockfd_lookup(fd, &err);
+	struct svc_sock *svsk = NULL;
+	struct sockaddr_storage addr;
+	struct sockaddr *sin = (struct sockaddr *)&addr;
+	int salen;
+
+	if (!so)
+		return err;
+
+	err = -EAFNOSUPPORT;
+	if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6))
+		goto out;
+	err = -EPROTONOSUPPORT;
+	if (so->sk->sk_protocol != IPPROTO_TCP &&
+	    so->sk->sk_protocol != IPPROTO_UDP)
+		goto out;
+	err = -EISCONN;
+	if (so->state > SS_UNCONNECTED)
+		goto out;
+	err = -ENOENT;
+	if (!try_module_get(THIS_MODULE))
+		goto out;	/* no reference taken, so none to drop */
+
+	/* svc_setup_socket() skips registration if *errp is negative */
+	err = 0;
+	svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
+	if (!svsk) {
+		module_put(THIS_MODULE);
+		goto out;
+	}
+
+	if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
+		svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
+	clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
+	spin_lock_bh(&serv->sv_lock);
+	list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
+	spin_unlock_bh(&serv->sv_lock);
+	svc_xprt_received(&svsk->sk_xprt);
+	return svc_one_sock_name(svsk, name_return, len);
+
+out:
+	sockfd_put(so);
+	return err;
+}
+EXPORT_SYMBOL_GPL(svc_addsock);
+
+/*
+ * Create and bind a socket for an RPC service; TCP sockets also listen.
+ */
+static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+					  int protocol,
+					  struct net *net,
+					  struct sockaddr *sin, int len,
+					  int flags)
+{
+	struct svc_sock	*svsk;
+	struct socket	*sock;
+	int		error;
+	int		type;
+	struct sockaddr_storage addr;
+	struct sockaddr *newsin = (struct sockaddr *)&addr;
+	int		newlen;
+	int		family;
+	int		val;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+
+	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
+			serv->sv_program->pg_name, protocol,
+			__svc_print_addr(sin, buf, sizeof(buf)));
+
+	if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
+		printk(KERN_WARNING "svc: only UDP and TCP "
+				"sockets supported\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+	switch (sin->sa_family) {
+	case AF_INET6:
+		family = PF_INET6;
+		break;
+	case AF_INET:
+		family = PF_INET;
+		break;
+	default:	/* only IPv4 and IPv6 addresses are accepted */
+		return ERR_PTR(-EINVAL);
+	}
+
+	error = __sock_create(net, family, type, protocol, &sock, 1);
+	if (error < 0)
+		return ERR_PTR(error);
+
+	svc_reclassify_socket(sock);
+
+	/*
+	 * If this is a PF_INET6 listener, we want to avoid
+	 * getting requests from IPv4 remotes.  Those should
+	 * be shunted to a PF_INET listener via rpcbind.
+	 */
+	val = 1;
+	if (family == PF_INET6)
+		kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+					(char *)&val, sizeof(val));
+
+	if (type == SOCK_STREAM)
+		sock->sk->sk_reuse = 1;		/* allow address reuse */
+	error = kernel_bind(sock, sin, len);
+	if (error < 0)
+		goto bummer;
+
+	newlen = len;
+	error = kernel_getsockname(sock, newsin, &newlen);
+	if (error < 0)
+		goto bummer;
+
+	if (protocol == IPPROTO_TCP) {
+		if ((error = kernel_listen(sock, 64)) < 0)
+			goto bummer;
+	}
+
+	if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
+		svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
+		return (struct svc_xprt *)svsk;
+	}
+
+bummer:	/* common error exit: release the socket, return error as ERR_PTR */
+	dprintk("svc: svc_create_socket error = %d\n", -error);
+	sock_release(sock);
+	return ERR_PTR(error);
+}
+
+/*
+ * Restore the socket's saved callbacks so that the svc_sock is
+ * no longer called back, then wake anyone sleeping on the socket.
+ */
+static void svc_sock_detach(struct svc_xprt *xprt)
+{
+	struct svc_sock *ssk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct sock *inet = ssk->sk_sk;
+	wait_queue_head_t *waitq;
+
+	dprintk("svc: svc_sock_detach(%p)\n", ssk);
+
+	/* reinstate the handlers saved in the svc_sock */
+	inet->sk_state_change = ssk->sk_ostate;
+	inet->sk_data_ready = ssk->sk_odata;
+	inet->sk_write_space = ssk->sk_owspace;
+
+	waitq = sk_sleep(inet);
+	if (waitq != NULL && waitqueue_active(waitq))
+		wake_up_interruptible(waitq);
+}
+
+/*
+ * Reset the callbacks and, unless this is a listener, shut the socket down
+ */
+static void svc_tcp_sock_detach(struct svc_xprt *xprt)
+{
+	struct svc_sock *ssk = container_of(xprt, struct svc_sock, sk_xprt);
+
+	dprintk("svc: svc_tcp_sock_detach(%p)\n", ssk);
+
+	svc_sock_detach(xprt);
+
+	if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
+		return;
+	svc_tcp_clear_pages(ssk);
+	kernel_sock_shutdown(ssk->sk_sock, SHUT_RDWR);
+}
+
+/*
+ * Release the socket (or its file reference), then free the svc_sock.
+ */
+static void svc_sock_free(struct svc_xprt *xprt)
+{
+	struct svc_sock *ssk = container_of(xprt, struct svc_sock, sk_xprt);
+	struct socket *sock = ssk->sk_sock;
+
+	dprintk("svc: svc_sock_free(%p)\n", ssk);
+	if (sock->file == NULL)
+		sock_release(sock);
+	else
+		sockfd_put(sock);
+	kfree(ssk);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Allocate a back channel svc_xprt that shares the fore channel socket.
+ */
+static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv,
+					     int protocol,
+					     struct net *net,
+					     struct sockaddr *sin, int len,
+					     int flags)
+{
+	struct svc_sock *bc_sock;
+	struct svc_xprt *bc_xprt;
+
+	if (protocol != IPPROTO_TCP) {
+		printk(KERN_WARNING "svc: only TCP sockets"
+			" supported on shared back channel\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	bc_sock = kzalloc(sizeof(*bc_sock), GFP_KERNEL);
+	if (bc_sock == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* the transport is embedded in the freshly zeroed svc_sock */
+	bc_xprt = &bc_sock->sk_xprt;
+	svc_xprt_init(&svc_tcp_bc_class, bc_xprt, serv);
+	serv->sv_bc_xprt = bc_xprt;
+
+	return bc_xprt;
+}
+
+/* Release the svc_sock that embeds a back channel transport. */
+static void svc_bc_sock_free(struct svc_xprt *xprt)
+{
+	if (xprt == NULL)
+		return;
+
+	kfree(container_of(xprt, struct svc_sock, sk_xprt));
+}
+#endif /* CONFIG_NFS_V4_1 */
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
new file mode 100644
index 00000000..e65dcc61
--- /dev/null
+++ b/net/sunrpc/sysctl.c
@@ -0,0 +1,183 @@
+/*
+ * linux/net/sunrpc/sysctl.c
+ *
+ * Sysctl interface to sunrpc module.
+ *
+ * I would prefer to register the sunrpc table below sys/net, but that's
+ * impossible at the moment.
+ */
+
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+/*
+ * Define the debug flags here; each one is exported (GPL-only) below.
+ */
+unsigned int	rpc_debug;
+EXPORT_SYMBOL_GPL(rpc_debug);
+
+unsigned int	nfs_debug;
+EXPORT_SYMBOL_GPL(nfs_debug);
+
+unsigned int	nfsd_debug;
+EXPORT_SYMBOL_GPL(nfsd_debug);
+
+unsigned int	nlm_debug;
+EXPORT_SYMBOL_GPL(nlm_debug);
+
+#ifdef RPC_DEBUG
+
+static struct ctl_table_header *sunrpc_table_header;
+static ctl_table		sunrpc_table[];
+
+void rpc_register_sysctl(void)
+{
+	if (sunrpc_table_header != NULL)	/* already registered */
+		return;
+	sunrpc_table_header = register_sysctl_table(sunrpc_table);
+}
+
+void rpc_unregister_sysctl(void)
+{
+	if (sunrpc_table_header == NULL)	/* nothing registered */
+		return;
+
+	unregister_sysctl_table(sunrpc_table_header);
+	sunrpc_table_header = NULL;
+}
+
+static int proc_do_xprt(ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char xprt_names[256];
+	size_t used;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;	/* empty request, or a read past offset 0 */
+	}
+	used = svc_print_xprts(xprt_names, sizeof(xprt_names));
+	return simple_read_from_buffer(buffer, *lenp, ppos, xprt_names, used);
+}
+
+/* sysctl handler: read or write one unsigned debug flag word in decimal. */
+static int proc_dodebug(ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char		tmpbuf[20], c, *s;
+	char __user *p;
+	unsigned int	value;
+	size_t		left, len;
+
+	if ((*ppos && !write) || !*lenp) {
+		*lenp = 0;
+		return 0;
+	}
+
+	left = *lenp;	/* bytes of the user buffer not yet consumed */
+
+	if (write) {
+		if (!access_ok(VERIFY_READ, buffer, left))
+			return -EFAULT;
+		p = buffer;
+		while (left && __get_user(c, p) >= 0 && isspace(c))
+			left--, p++;
+		if (!left)	/* only whitespace was written */
+			goto done;
+
+		if (left > sizeof(tmpbuf) - 1)
+			return -EINVAL;
+		if (copy_from_user(tmpbuf, p, left))
+			return -EFAULT;
+		tmpbuf[left] = '\0';
+
+		for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--)
+			value = 10 * value + (*s - '0');
+		if (*s && !isspace(*s))
+			return -EINVAL;
+		while (left && isspace(*s))
+			left--, s++;
+		*(unsigned int *) table->data = value;
+		/* Display the RPC tasks on writing to rpc_debug */
+		if (strcmp(table->procname, "rpc_debug") == 0)
+			rpc_show_tasks();
+	} else {
+		if (!access_ok(VERIFY_WRITE, buffer, left))
+			return -EFAULT;
+		len = sprintf(tmpbuf, "%u", *(unsigned int *) table->data);
+		if (len > left)
+			len = left;
+		if (__copy_to_user(buffer, tmpbuf, len))
+			return -EFAULT;
+		if ((left -= len) > 0) {	/* room left: append a newline */
+			if (put_user('\n', (char __user *)buffer + len))
+				return -EFAULT;
+			left--;
+		}
+	}
+
+done:
+	*lenp -= left;
+	*ppos += *lenp;
+	return 0;
+}
+
+
+static ctl_table debug_table[] = {	/* children of the "sunrpc" entry */
+	{
+		.procname	= "rpc_debug",
+		.data		= &rpc_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dodebug
+	},
+	{
+		.procname	= "nfs_debug",
+		.data		= &nfs_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dodebug
+	},
+	{
+		.procname	= "nfsd_debug",
+		.data		= &nfsd_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dodebug
+	},
+	{
+		.procname	= "nlm_debug",
+		.data		= &nlm_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dodebug
+	},
+	{
+		.procname	= "transports",
+		.maxlen		= 256,	/* matches proc_do_xprt's buffer size */
+		.mode		= 0444,	/* read-only */
+		.proc_handler	= proc_do_xprt,
+	},
+	{ }	/* empty terminating entry */
+};
+
+static ctl_table sunrpc_table[] = {	/* table passed to register_sysctl_table() */
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= debug_table	/* the entries defined above */
+	},
+	{ }	/* empty terminating entry */
+};
+
+#endif
diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c
new file mode 100644
index 00000000..dd824341
--- /dev/null
+++ b/net/sunrpc/timer.c
@@ -0,0 +1,122 @@
+/*
+ * linux/net/sunrpc/timer.c
+ *
+ * Estimate RPC request round trip time.
+ *
+ * Based on packet round-trip and variance estimator algorithms described
+ * in appendix A of "Congestion Avoidance and Control" by Van Jacobson
+ * and Michael J. Karels (ACM Computer Communication Review; Proceedings
+ * of the Sigcomm '88 Symposium in Stanford, CA, August, 1988).
+ *
+ * This RTT estimator is used only for RPC over datagram protocols.
+ *
+ * Copyright (C) 2002 Trond Myklebust <trond.myklebust@fys.uio.no>
+ */
+
+#include <asm/param.h>
+
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+
+#include <linux/sunrpc/clnt.h>
+
+#define RPC_RTO_MAX (60*HZ)
+#define RPC_RTO_INIT (HZ/5)
+#define RPC_RTO_MIN (HZ/10)
+
+/**
+ * rpc_init_rtt - Initialize an RPC RTT estimator context
+ * @rt: context to initialize
+ * @timeo: initial timeout value, in jiffies
+ *
+ */
+void rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo)
+{
+	unsigned long srtt0;
+	unsigned slot;
+
+	rt->timeo = timeo;
+
+	/* the initial smoothed RTT is stored left-shifted by 3 */
+	srtt0 = (timeo > RPC_RTO_INIT) ? (timeo - RPC_RTO_INIT) << 3 : 0;
+	for (slot = 0; slot < 5; slot++) {
+		rt->srtt[slot] = srtt0;
+		rt->sdrtt[slot] = RPC_RTO_INIT;
+		rt->ntimeouts[slot] = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(rpc_init_rtt);
+
+/**
+ * rpc_update_rtt - Update an RPC RTT estimator context
+ * @rt: context to update
+ * @timer: timer array index (request type)
+ * @m: recent actual RTT, in jiffies
+ *
+ * NB: When computing the smoothed RTT and standard deviation,
+ *     be careful not to produce negative intermediate results.
+ */
+void rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m)
+{
+	long *mean, *dev;
+	long delta;
+
+	/* timer 0 keeps no estimate */
+	if (timer == 0)
+		return;
+	timer--;
+
+	/* jiffies wrapped; ignore this one */
+	if (m < 0)
+		return;
+
+	/* treat a zero sample as one jiffy */
+	delta = (m == 0) ? 1L : m;
+
+	mean = (long *)&rt->srtt[timer];
+	delta -= *mean >> 3;
+	*mean += delta;
+
+	dev = (long *)&rt->sdrtt[timer];
+	delta = ((delta < 0) ? -delta : delta) - (*dev >> 2);
+	*dev += delta;
+
+	/* Set lower bound on the variance */
+	if (*dev < RPC_RTO_MIN)
+		*dev = RPC_RTO_MIN;
+}
+EXPORT_SYMBOL_GPL(rpc_update_rtt);
+
+/**
+ * rpc_calc_rto - Provide an estimated timeout value
+ * @rt: context to use for calculation
+ * @timer: timer array index (request type)
+ *
+ * Estimate RTO for an NFS RPC sent via an unreliable datagram.  Use
+ * the mean and mean deviation of RTT for the appropriate type of RPC
+ * for frequently issued RPCs, and a fixed default for the others.
+ *
+ * The justification for doing "other" this way is that these RPCs
+ * happen so infrequently that timer estimation would probably be
+ * stale.  Also, since many of these RPCs are non-idempotent, a
+ * conservative timeout is desired.
+ *
+ * getattr, lookup,
+ * read, write, commit     - A+4D
+ * other                   - timeo
+ */
+unsigned long rpc_calc_rto(struct rpc_rtt *rt, unsigned timer)
+{
+	unsigned long rto;
+	unsigned slot;
+
+	/* timer 0 ("other") always uses the fixed timeout */
+	if (timer == 0)
+		return rt->timeo;
+
+	slot = timer - 1;
+	rto = ((rt->srtt[slot] + 7) >> 3) + rt->sdrtt[slot];
+	return (rto > RPC_RTO_MAX) ? RPC_RTO_MAX : rto;
+}
+EXPORT_SYMBOL_GPL(rpc_calc_rto);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
new file mode 100644
index 00000000..f008c14a
--- /dev/null
+++ b/net/sunrpc/xdr.c
@@ -0,0 +1,1267 @@
+/*
+ * linux/net/sunrpc/xdr.c
+ *
+ * Generic XDR support.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+#include <linux/errno.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/msg_prot.h>
+
+/*
+ * XDR functions for basic NFS types
+ */
+__be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj)
+{
+	unsigned int	nquads = XDR_QUADLEN(obj->len);
+	__be32		*payload = p + 1;
+
+	p[nquads] = 0;	/* pre-zero the last word so pad bytes are zero */
+	*p = cpu_to_be32(obj->len);
+	memcpy(payload, obj->data, obj->len);
+	return payload + nquads;
+}
+EXPORT_SYMBOL_GPL(xdr_encode_netobj);
+
+__be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
+{
+	unsigned int	nbytes = be32_to_cpu(p[0]);
+
+	/* refuse objects larger than XDR_MAX_NETOBJ */
+	if (nbytes > XDR_MAX_NETOBJ)
+		return NULL;
+	obj->len  = nbytes;
+	obj->data = (u8 *) (p + 1);
+	return p + 1 + XDR_QUADLEN(nbytes);
+}
+EXPORT_SYMBOL_GPL(xdr_decode_netobj);
+
+/**
+ * xdr_encode_opaque_fixed - Encode fixed length opaque data
+ * @p: pointer to current position in XDR buffer.
+ * @ptr: pointer to data to encode (or NULL)
+ * @nbytes: size of data.
+ *
+ * Copy the array of data of length nbytes at ptr to the XDR buffer
+ * at position p, then align to the next 32-bit boundary by padding
+ * with zero bytes (see RFC1832).
+ * Note: if ptr is NULL, only the padding is performed.
+ *
+ * Returns the updated current XDR buffer position
+ *
+ */
+__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int nbytes)
+{
+	unsigned int nquads, npad;
+
+	if (unlikely(nbytes == 0))
+		return p;
+	nquads = XDR_QUADLEN(nbytes);
+	npad = (nquads << 2) - nbytes;
+	if (ptr != NULL)
+		memcpy(p, ptr, nbytes);
+	if (npad != 0)	/* zero-fill up to the next 32-bit boundary */
+		memset((char *)p + nbytes, 0, npad);
+	return p + nquads;
+}
+EXPORT_SYMBOL_GPL(xdr_encode_opaque_fixed);
+
+/**
+ * xdr_encode_opaque - Encode variable length opaque data
+ * @p: pointer to current position in XDR buffer.
+ * @ptr: pointer to data to encode (or NULL)
+ * @nbytes: size of data.
+ *
+ * Returns the updated current XDR buffer position
+ */
+__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int nbytes)
+{
+	p[0] = cpu_to_be32(nbytes);	/* length word precedes the data */
+	return xdr_encode_opaque_fixed(&p[1], ptr, nbytes);
+}
+EXPORT_SYMBOL_GPL(xdr_encode_opaque);
+
+/* Encode @string via xdr_encode_array(), passing strlen() as the length. */
+__be32 *xdr_encode_string(__be32 *p, const char *string)
+{
+	return xdr_encode_array(p, string, strlen(string));
+}
+EXPORT_SYMBOL_GPL(xdr_encode_string);
+
+__be32 *
+xdr_decode_string_inplace(__be32 *p, char **sp,
+			  unsigned int *lenp, unsigned int maxlen)
+{
+	__be32 *data = p + 1;		/* string bytes follow the length */
+	u32 nbytes = be32_to_cpu(*p);
+
+	if (nbytes > maxlen)
+		return NULL;
+	*lenp = nbytes;
+	*sp = (char *) data;
+	return data + XDR_QUADLEN(nbytes);
+}
+EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
+
+/**
+ * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
+ * @buf: XDR buffer where string resides
+ * @len: length of string, in bytes
+ *
+ */
+void
+xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+{
+	char *page0 = kmap_atomic(buf->pages[0], KM_USER0);
+
+	/* the terminator goes at offset page_base + len within page 0 */
+	page0[buf->page_base + len] = '\0';
+	kunmap_atomic(page0, KM_USER0);
+}
+EXPORT_SYMBOL(xdr_terminate_string);
+
+void
+xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
+		 unsigned int len)
+{
+	struct kvec *tail = xdr->tail;
+	unsigned int odd = len & 3;	/* bytes past the last full word */
+	u32 *pad_word;
+
+	xdr->pages = pages;
+	xdr->page_base = base;
+	xdr->page_len = len;
+
+	pad_word = (u32 *)xdr->head[0].iov_base +
+			XDR_QUADLEN(xdr->head[0].iov_len);
+	if (odd == 0) {
+		tail->iov_base = pad_word;
+		tail->iov_len = 0;
+	} else {
+		*pad_word = 0;
+		tail->iov_base = (char *)pad_word + odd;
+		tail->iov_len  = 4 - odd;
+		len += 4 - odd;
+	}
+	xdr->buflen += len;
+	xdr->len += len;
+}
+EXPORT_SYMBOL_GPL(xdr_encode_pages);
+
+void
+xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
+		 struct page **pages, unsigned int base, unsigned int len)
+{
+	struct kvec *head = &xdr->head[0];
+	struct kvec *tail = &xdr->tail[0];
+	unsigned int headlen = head->iov_len;	/* length before the split */
+
+	/* whatever lay beyond @offset in the head becomes the tail */
+	tail->iov_base = (char *)head->iov_base + offset;
+	tail->iov_len = headlen - offset;
+	head->iov_len  = offset;
+
+	/* record the page vector that carries the inlined data */
+	xdr->pages = pages;
+	xdr->page_base = base;
+	xdr->page_len = len;
+
+	xdr->buflen += len;
+}
+EXPORT_SYMBOL_GPL(xdr_inline_pages);
+
+/*
+ * Helper routines for doing 'memmove' like operations on a struct xdr_buf
+ *
+ * _shift_data_right_pages
+ * @pages: vector of pages containing both the source and dest memory area.
+ * @pgto_base: page vector address of destination
+ * @pgfrom_base: page vector address of source
+ * @len: number of bytes to copy
+ *
+ * Note: the addresses pgto_base and pgfrom_base are both calculated in
+ *       the same way:
+ *            if a memory area starts at byte 'base' in page 'pages[i]',
+ *            then its address is given as (i << PAGE_CACHE_SHIFT) + base
+ * Also note: pgfrom_base must be < pgto_base (enforced by BUG_ON), but
+ *       the memory areas they point to may overlap.
+ */
+static void
+_shift_data_right_pages(struct page **pages, size_t pgto_base,
+		size_t pgfrom_base, size_t len)
+{
+	struct page **pgfrom, **pgto;
+	char *vfrom, *vto;
+	size_t copy;
+
+	BUG_ON(pgto_base <= pgfrom_base);
+
+	pgto_base += len;	/* start at the end: the copy runs backwards */
+	pgfrom_base += len;
+
+	pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT);
+	pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT);
+
+	pgto_base &= ~PAGE_CACHE_MASK;
+	pgfrom_base &= ~PAGE_CACHE_MASK;
+
+	do {
+		/* Step back to the previous page when an offset hits zero */
+		if (pgto_base == 0) {
+			pgto_base = PAGE_CACHE_SIZE;
+			pgto--;
+		}
+		if (pgfrom_base == 0) {
+			pgfrom_base = PAGE_CACHE_SIZE;
+			pgfrom--;
+		}
+
+		copy = len;	/* capped by what precedes either offset */
+		if (copy > pgto_base)
+			copy = pgto_base;
+		if (copy > pgfrom_base)
+			copy = pgfrom_base;
+		pgto_base -= copy;
+		pgfrom_base -= copy;
+
+		vto = kmap_atomic(*pgto, KM_USER0);
+		vfrom = kmap_atomic(*pgfrom, KM_USER1);
+		memmove(vto + pgto_base, vfrom + pgfrom_base, copy);
+		flush_dcache_page(*pgto);
+		kunmap_atomic(vfrom, KM_USER1);
+		kunmap_atomic(vto, KM_USER0);
+
+	} while ((len -= copy) != 0);
+}
+
+/*
+ * _copy_to_pages
+ * @pages: array of pages
+ * @pgbase: page vector address of destination
+ * @p: pointer to source data
+ * @len: number of bytes to copy
+ *
+ * Copies data from an arbitrary memory location into an array of pages,
+ * flushing each page written. The copy is assumed to be non-overlapping.
+ */
+static void
+_copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
+{
+	struct page **pgto;
+	char *vto;
+	size_t copy;
+
+	pgto = pages + (pgbase >> PAGE_CACHE_SHIFT);
+	pgbase &= ~PAGE_CACHE_MASK;	/* offset within the first page */
+
+	for (;;) {
+		copy = PAGE_CACHE_SIZE - pgbase;	/* room left in this page */
+		if (copy > len)
+			copy = len;
+
+		vto = kmap_atomic(*pgto, KM_USER0);
+		memcpy(vto + pgbase, p, copy);
+		kunmap_atomic(vto, KM_USER0);
+
+		len -= copy;
+		if (len == 0)
+			break;
+
+		pgbase += copy;
+		if (pgbase == PAGE_CACHE_SIZE) {
+			flush_dcache_page(*pgto);
+			pgbase = 0;
+			pgto++;
+		}
+		p += copy;
+	}
+	flush_dcache_page(*pgto);	/* flush the last page touched */
+}
+
+/*
+ * _copy_from_pages
+ * @p: pointer to destination
+ * @pages: array of pages
+ * @pgbase: page vector address of the source data
+ * @len: number of bytes to copy
+ *
+ * Copies data into an arbitrary memory location from an array of pages
+ * The copy is assumed to be non-overlapping.
+ */
+static void
+_copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
+{
+	struct page **pgfrom;
+	char *vfrom;
+	size_t copy;
+
+	pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT);
+	pgbase &= ~PAGE_CACHE_MASK;	/* offset within the first page */
+
+	do {
+		copy = PAGE_CACHE_SIZE - pgbase;	/* bytes left in this page */
+		if (copy > len)
+			copy = len;
+
+		vfrom = kmap_atomic(*pgfrom, KM_USER0);
+		memcpy(p, vfrom + pgbase, copy);
+		kunmap_atomic(vfrom, KM_USER0);
+
+		pgbase += copy;
+		if (pgbase == PAGE_CACHE_SIZE) {
+			pgbase = 0;
+			pgfrom++;
+		}
+		p += copy;
+
+	} while ((len -= copy) != 0);
+}
+
+/*
+ * xdr_shrink_bufhead
+ * @buf: xdr_buf
+ * @len: bytes to remove from buf->head[0]; must not exceed its iov_len
+ *
+ * Shrinks XDR buffer's header kvec buf->head[0] by
+ * 'len' bytes. The extra data is not lost, but is instead
+ * moved into the inlined pages and/or the tail.
+ */
+static void
+xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
+{
+	struct kvec *head, *tail;
+	size_t copy, offs;
+	unsigned int pglen = buf->page_len;
+
+	tail = buf->tail;
+	head = buf->head;
+	BUG_ON (len > head->iov_len);
+
+	/* Shift the tail contents right by len bytes first */
+	if (tail->iov_len != 0) {
+		if (tail->iov_len > len) {
+			copy = tail->iov_len - len;
+			memmove((char *)tail->iov_base + len,
+					tail->iov_base, copy);
+		}
+		/* Copy from the inlined pages into the tail */
+		copy = len;
+		if (copy > pglen)
+			copy = pglen;
+		offs = len - copy;
+		if (offs >= tail->iov_len)
+			copy = 0;
+		else if (copy > tail->iov_len - offs)
+			copy = tail->iov_len - offs;
+		if (copy != 0)
+			_copy_from_pages((char *)tail->iov_base + offs,
+					buf->pages,
+					buf->page_base + pglen + offs - len,
+					copy);
+		/* Do we also need to copy data from the head into the tail ? */
+		if (len > pglen) {
+			offs = copy = len - pglen;
+			if (copy > tail->iov_len)
+				copy = tail->iov_len;
+			memcpy(tail->iov_base,
+					(char *)head->iov_base +
+					head->iov_len - offs,
+					copy);
+		}
+	}
+	/* Now shift the page data right and refill its start from the head */
+	if (pglen != 0) {
+		if (pglen > len)
+			_shift_data_right_pages(buf->pages,
+					buf->page_base + len,
+					buf->page_base,
+					pglen - len);
+		copy = len;
+		if (len > pglen)
+			copy = pglen;
+		_copy_to_pages(buf->pages, buf->page_base,
+				(char *)head->iov_base + head->iov_len - len,
+				copy);
+	}
+	head->iov_len -= len;
+	buf->buflen -= len;
+	/* buflen shrank: clamp the message length if it no longer fits */
+	if (buf->len > buf->buflen)
+		buf->len = buf->buflen;
+}
+
+/*
+ * xdr_shrink_pagelen
+ * @buf: xdr_buf
+ * @len: bytes to remove from buf->pages; must not exceed buf->page_len
+ *
+ * Shrinks XDR buffer's page array buf->pages by
+ * 'len' bytes. The extra data is not lost, but is instead
+ * moved into the tail.
+ */
+static void
+xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
+{
+	struct kvec *tail;
+	size_t copy;
+	unsigned int pglen = buf->page_len;
+	unsigned int tailbuf_len;
+
+	tail = buf->tail;
+	BUG_ON (len > pglen);
+
+	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
+
+	/* Grow the tail into its free space (if any), then shift it right */
+	if (tailbuf_len != 0) {
+		unsigned int free_space = tailbuf_len - tail->iov_len;
+
+		if (len < free_space)
+			free_space = len;
+		tail->iov_len += free_space;
+
+		copy = len;
+		if (tail->iov_len > len) {
+			char *p = (char *)tail->iov_base + len;
+			memmove(p, tail->iov_base, tail->iov_len - len);
+		} else
+			copy = tail->iov_len;
+		/* Copy from the inlined pages into the tail */
+		_copy_from_pages((char *)tail->iov_base,
+				buf->pages, buf->page_base + pglen - len,
+				copy);
+	}
+	buf->page_len -= len;
+	buf->buflen -= len;
+	/* buflen shrank: clamp the message length if it no longer fits */
+	if (buf->len > buf->buflen)
+		buf->len = buf->buflen;
+}
+
+/* Exported entry point: shrink buf->head[0] by @len bytes. */
+void xdr_shift_buf(struct xdr_buf *buf, size_t len)
+{
+	xdr_shrink_bufhead(buf, len);
+}
+EXPORT_SYMBOL_GPL(xdr_shift_buf);
+
+/**
+ * xdr_init_encode - Initialize a struct xdr_stream for sending data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer in which to encode data
+ * @p: current pointer inside XDR buffer
+ *
+ * Note: at the moment the RPC client only passes the length of our
+ *	 scratch buffer in the xdr_buf's header kvec. Previously this
+ *	 meant we needed to call xdr_adjust_iovec() after encoding the
+ *	 data. With the new scheme, the xdr_stream manages the details
+ *	 of the buffer length, and takes care of adjusting the kvec
+ *	 length for us.
+ */
+void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+{
+	struct kvec *head = buf->head;
+	int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
+	size_t skipped;
+
+	BUG_ON(scratch_len < 0);
+	xdr->buf = buf;
+	xdr->iov = head;
+	xdr->p = (__be32 *)((char *)head->iov_base + head->iov_len);
+	xdr->end = (__be32 *)((char *)head->iov_base + scratch_len);
+	BUG_ON(head->iov_len > scratch_len);
+
+	if (p == NULL || p == xdr->p)
+		return;
+	/* @p lies beyond the recorded head length: account for the gap */
+	BUG_ON(p < xdr->p || p > xdr->end);
+	skipped = (char *)p - (char *)xdr->p;
+	xdr->p = p;
+	buf->len += skipped;
+	head->iov_len += skipped;
+}
+EXPORT_SYMBOL_GPL(xdr_init_encode);
+
+/**
+ * xdr_reserve_space - Reserve buffer space for sending
+ * @xdr: pointer to xdr_stream
+ * @nbytes: number of bytes to reserve
+ *
+ * Checks that we have enough buffer space to encode 'nbytes' more
+ * bytes of data. If so, update the total xdr_buf length, and
+ * adjust the length of the current kvec.
+ */
+__be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *start = xdr->p;
+	__be32 *new_end;
+	size_t rounded;
+
+	/* round the request up to a whole number of 32-bit words */
+	rounded = (nbytes + 3) & ~3;
+	new_end = start + (rounded >> 2);
+	if (unlikely(new_end < start || new_end > xdr->end))
+		return NULL;
+	xdr->p = new_end;
+	xdr->iov->iov_len += rounded;
+	xdr->buf->len += rounded;
+	return start;
+}
+EXPORT_SYMBOL_GPL(xdr_reserve_space);
+
+/**
+ * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
+ * @xdr: pointer to xdr_stream
+ * @pages: list of pages
+ * @base: offset of first byte
+ * @len: length of data in bytes
+ *
+ */
+void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base,
+		 unsigned int len)
+{
+	struct xdr_buf *buf = xdr->buf;
+	struct kvec *tail = buf->tail;
+	unsigned int odd = len & 3;	/* bytes past the last full word */
+
+	buf->pages = pages;
+	buf->page_base = base;
+	buf->page_len = len;
+
+	tail->iov_base = (char *)xdr->p;
+	tail->iov_len  = 0;
+	xdr->iov = tail;
+
+	if (odd != 0) {
+		BUG_ON(xdr->p >= xdr->end);
+		tail->iov_base = (char *)xdr->p + odd;
+		tail->iov_len  += 4 - odd;
+		len += 4 - odd;
+		*xdr->p++ = 0;
+	}
+	buf->buflen += len;
+	buf->len += len;
+}
+EXPORT_SYMBOL_GPL(xdr_write_pages);
+
+static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
+		__be32 *p, unsigned int len)
+{
+	unsigned int limit = len;
+
+	/* never decode past the end of the kvec */
+	if (limit > iov->iov_len)
+		limit = iov->iov_len;
+	xdr->p = (p != NULL) ? p : (__be32*)iov->iov_base;
+	xdr->end = (__be32*)(iov->iov_base + limit);
+	xdr->iov = iov;
+	xdr->page_ptr = NULL;
+}
+
+static int xdr_set_page_base(struct xdr_stream *xdr,
+		unsigned int base, unsigned int len)
+{
+	struct xdr_buf *buf = xdr->buf;
+	unsigned int avail = buf->page_len;
+	unsigned int pos, in_page, stop;
+	void *vaddr;
+
+	/* @base must fall inside the page data */
+	if (base >= avail)
+		return -EINVAL;
+	avail -= base;
+	if (len > avail)
+		len = avail;
+
+	/* translate to an offset from the start of the page vector */
+	pos = base + buf->page_base;
+	xdr->page_ptr = &buf->pages[pos >> PAGE_SHIFT];
+	vaddr = page_address(*xdr->page_ptr);
+
+	in_page = pos & ~PAGE_MASK;
+	xdr->p = (__be32*)(vaddr + in_page);
+
+	/* the window never extends beyond the current page */
+	stop = in_page + len;
+	if (stop > PAGE_SIZE)
+		stop = PAGE_SIZE;
+	xdr->end = (__be32*)(vaddr + stop);
+	xdr->iov = NULL;
+
+	return 0;
+}
+
+static void xdr_set_next_page(struct xdr_stream *xdr)
+{
+	unsigned int next;
+
+	/* byte offset of the following page, relative to page_base */
+	next = (xdr->page_ptr - xdr->buf->pages + 1) << PAGE_SHIFT;
+	next -= xdr->buf->page_base;
+	if (xdr_set_page_base(xdr, next, PAGE_SIZE) < 0)
+		xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
+}
+
+static bool xdr_set_next_buffer(struct xdr_stream *xdr)
+{
+	/* advance head -> pages -> tail; report whether data remains */
+	if (xdr->page_ptr != NULL) {
+		xdr_set_next_page(xdr);
+	} else if (xdr->iov == xdr->buf->head &&
+		   xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0) {
+		xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
+	}
+	return xdr->p != xdr->end;
+}
+
+/**
+ * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer from which to decode data
+ * @p: current pointer inside XDR buffer
+ */
+void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+{
+	xdr->buf = buf;
+	xdr->scratch = (struct kvec) { .iov_base = NULL, .iov_len = 0 };
+	/* begin in the head if it holds data, else in the page data */
+	if (buf->head[0].iov_len != 0)
+		xdr_set_iov(xdr, buf->head, p, buf->len);
+	else if (buf->page_len != 0)
+		xdr_set_page_base(xdr, 0, buf->len);
+}
+EXPORT_SYMBOL_GPL(xdr_init_decode);
+
+/**
+ * xdr_init_decode_pages - Initialize an xdr_stream for decoding page data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer to set up; it is zeroed first
+ * @pages: list of pages holding the data to decode
+ * @len: length in bytes of buffer in pages
+ */
+void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+			   struct page **pages, unsigned int len)
+{
+	memset(buf, 0, sizeof(*buf));
+	buf->pages = pages;
+	buf->len = len;
+	buf->buflen = len;
+	buf->page_len = len;
+	xdr_init_decode(xdr, buf, NULL);
+}
+EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
+
+static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *cur = xdr->p;
+	__be32 *next = cur + XDR_QUADLEN(nbytes);
+
+	if (unlikely(next < cur || next > xdr->end))	/* wrap or overrun */
+		return NULL;
+	xdr->p = next;
+	return cur;
+}
+
+/**
+ * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to an empty buffer
+ * @buflen: size of 'buf'
+ *
+ * The scratch buffer is used when decoding from an array of pages.
+ * If an xdr_inline_decode() call spans across page boundaries, then
+ * we copy the data into the scratch buffer in order to allow linear
+ * access.
+ */
+void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
+{
+	/* only the pointer and size are recorded; nothing is copied here */
+	xdr->scratch = (struct kvec) { .iov_base = buf, .iov_len = buflen };
+}
+EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
+
+static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
+{
+	char *scratch = xdr->scratch.iov_base;
+	size_t first = (char *)xdr->end - (char *)xdr->p;
+	__be32 *rest;
+
+	if (nbytes > xdr->scratch.iov_len)
+		return NULL;
+	/* first part: whatever is left in the current buffer */
+	memcpy(scratch, xdr->p, first);
+	if (!xdr_set_next_buffer(xdr))
+		return NULL;
+	/* second part: the remainder, taken from the next buffer */
+	rest = __xdr_inline_decode(xdr, nbytes - first);
+	if (rest == NULL)
+		return NULL;
+	memcpy(scratch + first, rest, nbytes - first);
+	return xdr->scratch.iov_base;
+}
+
+/**
+ * xdr_inline_decode - Retrieve XDR data to decode
+ * @xdr: pointer to xdr_stream struct
+ * @nbytes: number of bytes of data to decode
+ *
+ * Check if the input buffer is long enough to enable us to decode
+ * 'nbytes' more bytes of data starting at the current position.
+ * If so return the current pointer, then update the current
+ * pointer position.
+ */
+__be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *data;
+
+	if (nbytes == 0)
+		return xdr->p;
+	if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+		return NULL;
+	data = __xdr_inline_decode(xdr, nbytes);
+	if (data == NULL)	/* does not fit in the current buffer */
+		data = xdr_copy_to_scratch(xdr, nbytes);
+	return data;
+}
+EXPORT_SYMBOL_GPL(xdr_inline_decode);
+
+/**
+ * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
+ * @xdr: pointer to xdr_stream struct
+ * @len: number of bytes of page data
+ *
+ * Moves data beyond the current pointer position from the XDR head[] buffer
+ * into the page list. Any data that lies beyond current position + "len"
+ * bytes is moved into the XDR tail[].
+ */
+void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
+{
+	struct xdr_buf *buf = xdr->buf;
+	struct kvec *iov;
+	ssize_t shift;
+	unsigned int end;
+	int padding;
+
+	/* Realign pages to current pointer position */
+	iov  = buf->head;
+	shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p;
+	if (shift > 0)
+		xdr_shrink_bufhead(buf, shift);
+
+	/* Truncate page data and move it into the tail */
+	if (buf->page_len > len)
+		xdr_shrink_pagelen(buf, buf->page_len - len);
+	padding = (XDR_QUADLEN(len) << 2) - len;	/* pad bytes after len */
+	xdr->iov = iov = buf->tail;
+	/* Clip the tail length by however much buflen exceeds buf->len */
+	end = iov->iov_len;
+	shift = buf->buflen - buf->len;
+	if (shift < end)
+		end -= shift;
+	else if (shift > 0)
+		end = 0;
+	/*
+	 * Position current pointer just past the padding at the start
+	 * of the tail, and set remaining message length.
+	 */
+	xdr->p = (__be32 *)((char *)iov->iov_base + padding);
+	xdr->end = (__be32 *)((char *)iov->iov_base + end);
+}
+EXPORT_SYMBOL_GPL(xdr_read_pages);
+
+/**
+ * xdr_enter_page - decode data from the XDR page
+ * @xdr: pointer to xdr_stream struct
+ * @len: number of bytes of page data
+ *
+ * Moves data beyond the current pointer position from the XDR head[] buffer
+ * into the page list. Any data that lies beyond current position + "len"
+ * bytes is moved into the XDR tail[]. The current pointer is then
+ * repositioned at the beginning of the first XDR page.
+ */
+void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
+{
+	/* Realign the buffer; this leaves the stream pointing at the tail. */
+	xdr_read_pages(xdr, len);
+	/*
+	 * Reposition the current pointer at the start of the page data
+	 * (base 0), passing "len" as the length to xdr_set_page_base().
+	 */
+	xdr_set_page_base(xdr, 0, len);
+}
+EXPORT_SYMBOL_GPL(xdr_enter_page);
+
+/* A zero-length kvec used to clear an unused xdr_buf segment. */
+static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
+
+/*
+ * Make @buf describe exactly the memory of @iov: the kvec becomes the
+ * head, the tail is emptied, there is no page data, and both the message
+ * length and the buffer length equal the kvec length.
+ */
+void
+xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
+{
+	buf->head[0] = *iov;
+	buf->tail[0] = empty_iov;
+	buf->page_len = 0;
+	buf->len = iov->iov_len;
+	buf->buflen = buf->len;
+}
+EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
+
+/* Sets subbuf to the portion of buf of length len beginning base bytes
+ * from the start of buf. Returns -1 if base or length are out of bounds,
+ * 0 otherwise.
+ *
+ * The head, page and tail segments are visited in turn.  While walking,
+ * "base" is the offset still to be skipped and "len" the number of bytes
+ * still to be assigned; both must reach zero for the call to succeed.
+ * subbuf shares buf's memory; nothing is copied. */
+int
+xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
+			unsigned int base, unsigned int len)
+{
+	subbuf->buflen = subbuf->len = len;
+	if (base < buf->head[0].iov_len) {
+		/* Range starts inside the head */
+		subbuf->head[0].iov_base = buf->head[0].iov_base + base;
+		subbuf->head[0].iov_len = min_t(unsigned int, len,
+						buf->head[0].iov_len - base);
+		len -= subbuf->head[0].iov_len;
+		base = 0;
+	} else {
+		/* Range starts beyond the head: skip it entirely */
+		subbuf->head[0].iov_base = NULL;
+		subbuf->head[0].iov_len = 0;
+		base -= buf->head[0].iov_len;
+	}
+
+	if (base < buf->page_len) {
+		subbuf->page_len = min(buf->page_len - base, len);
+		/* Convert to an offset relative to the first page */
+		base += buf->page_base;
+		subbuf->page_base = base & ~PAGE_CACHE_MASK;
+		subbuf->pages = &buf->pages[base >> PAGE_CACHE_SHIFT];
+		len -= subbuf->page_len;
+		base = 0;
+	} else {
+		/* No page data; subbuf->pages and page_base are left as is */
+		base -= buf->page_len;
+		subbuf->page_len = 0;
+	}
+
+	if (base < buf->tail[0].iov_len) {
+		subbuf->tail[0].iov_base = buf->tail[0].iov_base + base;
+		subbuf->tail[0].iov_len = min_t(unsigned int, len,
+						buf->tail[0].iov_len - base);
+		len -= subbuf->tail[0].iov_len;
+		base = 0;
+	} else {
+		subbuf->tail[0].iov_base = NULL;
+		subbuf->tail[0].iov_len = 0;
+		base -= buf->tail[0].iov_len;
+	}
+
+	/* Anything left over means the range ran past the end of buf */
+	if (base || len)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
+
+/*
+ * Copy the first @len bytes described by @subbuf into @obj, taking them
+ * from the head, then the pages, then the tail.
+ */
+static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+{
+	unsigned int chunk;
+
+	/* head */
+	chunk = min_t(unsigned int, len, subbuf->head[0].iov_len);
+	memcpy(obj, subbuf->head[0].iov_base, chunk);
+	obj += chunk;
+	len -= chunk;
+
+	/* pages */
+	chunk = min_t(unsigned int, len, subbuf->page_len);
+	if (chunk != 0)
+		_copy_from_pages(obj, subbuf->pages, subbuf->page_base, chunk);
+	obj += chunk;
+	len -= chunk;
+
+	/* tail */
+	chunk = min_t(unsigned int, len, subbuf->tail[0].iov_len);
+	memcpy(obj, subbuf->tail[0].iov_base, chunk);
+}
+
+/*
+ * Copy @len bytes starting @base bytes into @buf out to @obj.
+ * obj is assumed to point to allocated memory of size at least len.
+ * Returns 0 on success, or the non-zero result of xdr_buf_subsegment()
+ * if the requested range does not lie within @buf.
+ */
+int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+{
+	struct xdr_buf subbuf;
+	int status = xdr_buf_subsegment(buf, &subbuf, base, len);
+
+	if (status == 0)
+		__read_bytes_from_xdr_buf(&subbuf, obj, len);
+	return status;
+}
+EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf);
+
+/*
+ * Copy @len bytes from @obj into the memory described by @subbuf,
+ * filling the head first, then the pages, then the tail.
+ */
+static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+{
+	unsigned int chunk;
+
+	/* head */
+	chunk = min_t(unsigned int, len, subbuf->head[0].iov_len);
+	memcpy(subbuf->head[0].iov_base, obj, chunk);
+	obj += chunk;
+	len -= chunk;
+
+	/* pages */
+	chunk = min_t(unsigned int, len, subbuf->page_len);
+	if (chunk != 0)
+		_copy_to_pages(subbuf->pages, subbuf->page_base, obj, chunk);
+	obj += chunk;
+	len -= chunk;
+
+	/* tail */
+	chunk = min_t(unsigned int, len, subbuf->tail[0].iov_len);
+	memcpy(subbuf->tail[0].iov_base, obj, chunk);
+}
+
+/*
+ * Copy @len bytes from @obj into @buf, starting @base bytes from its
+ * beginning.  obj is assumed to point to allocated memory of size at
+ * least len.  Returns 0 on success, or the non-zero result of
+ * xdr_buf_subsegment() if the requested range does not lie within @buf.
+ */
+int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+{
+	struct xdr_buf subbuf;
+	int status = xdr_buf_subsegment(buf, &subbuf, base, len);
+
+	if (status == 0)
+		__write_bytes_to_xdr_buf(&subbuf, obj, len);
+	return status;
+}
+EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf);
+
+/*
+ * Read the big-endian 32-bit word found @base bytes into @buf and store
+ * it, in CPU byte order, in *@obj.  Returns 0 on success; on failure the
+ * error from read_bytes_from_xdr_buf() is returned and *@obj is left
+ * untouched.
+ */
+int
+xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
+{
+	__be32	wire_word;
+	int	status;
+
+	status = read_bytes_from_xdr_buf(buf, base, &wire_word, sizeof(*obj));
+	if (status == 0)
+		*obj = be32_to_cpu(wire_word);
+	return status;
+}
+EXPORT_SYMBOL_GPL(xdr_decode_word);
+
+/*
+ * Store the 32-bit value @obj, converted to big-endian, @base bytes into
+ * @buf.  Returns the result of write_bytes_to_xdr_buf().
+ */
+int
+xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
+{
+	__be32	wire_word;
+
+	wire_word = cpu_to_be32(obj);
+	return write_bytes_to_xdr_buf(buf, base, &wire_word, sizeof(obj));
+}
+EXPORT_SYMBOL_GPL(xdr_encode_word);
+
+/* If the netobj starting offset bytes from the start of xdr_buf is contained
+ * entirely in the head or the tail, set object to point to it; otherwise
+ * try to find space for it at the end of the tail, copy it there, and
+ * set obj to point to it.
+ *
+ * The netobj is a 4-byte length word followed by that many bytes of data.
+ * Returns 0 on success, -EFAULT if the length word or the data range
+ * cannot be located within buf, and -ENOMEM if the object is larger than
+ * the spare room (buflen - len) in the buffer. */
+int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset)
+{
+	struct xdr_buf subbuf;
+
+	if (xdr_decode_word(buf, offset, &obj->len))
+		return -EFAULT;
+	/* The data starts right after the 4-byte length word */
+	if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len))
+		return -EFAULT;
+
+	/* Is the obj contained entirely in the head? */
+	obj->data = subbuf.head[0].iov_base;
+	if (subbuf.head[0].iov_len == obj->len)
+		return 0;
+	/* ..or is the obj contained entirely in the tail? */
+	obj->data = subbuf.tail[0].iov_base;
+	if (subbuf.tail[0].iov_len == obj->len)
+		return 0;
+
+	/* use end of tail as storage for obj:
+	 * (We don't copy to the beginning because then we'd have
+	 * to worry about doing a potentially overlapping copy.
+	 * This assumes the object is at most half the length of the
+	 * tail.) */
+	if (obj->len > buf->buflen - buf->len)
+		return -ENOMEM;
+	/* With an empty tail, the space just past the head is used instead */
+	if (buf->tail[0].iov_len != 0)
+		obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+	else
+		obj->data = buf->head[0].iov_base + buf->head[0].iov_len;
+	__read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdr_buf_read_netobj);
+
+/*
+ * Encode or decode an XDR array stored in @buf starting at byte @base.
+ *
+ * The 4-byte element count at @base is written from desc->array_len when
+ * @encode is non-zero; otherwise it is read into desc->array_len and
+ * checked against desc->array_maxlen and the message length.  If
+ * desc->xcode is set, it is then called once per element while walking
+ * the head, the page array and the tail in turn.  An element that
+ * straddles a segment or page boundary is staged through "elem", a
+ * temporary buffer of desc->elem_size bytes; "copied" counts how many
+ * bytes of that element have been transferred so far.
+ *
+ * Returns 0 on success, or else a negative error code.
+ */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+		 struct xdr_array2_desc *desc, int encode)
+{
+	char *elem = NULL, *c;
+	unsigned int copied = 0, todo, avail_here;
+	struct page **ppages = NULL;
+	int err;
+
+	if (encode) {
+		if (xdr_encode_word(buf, base, desc->array_len) != 0)
+			return -EINVAL;
+	} else {
+		/*
+		 * NOTE(review): the array_len * elem_size product appears to
+		 * be computed before the widening to unsigned long -- confirm
+		 * that array_maxlen keeps it from wrapping.
+		 */
+		if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+		    desc->array_len > desc->array_maxlen ||
+		    (unsigned long) base + 4 + desc->array_len *
+				    desc->elem_size > buf->len)
+			return -EINVAL;
+	}
+	/* Step over the element count */
+	base += 4;
+
+	/* Without a per-element callback only the count is processed */
+	if (!desc->xcode)
+		return 0;
+
+	/* Total number of element bytes still to be processed */
+	todo = desc->array_len * desc->elem_size;
+
+	/* process head */
+	if (todo && base < buf->head->iov_len) {
+		c = buf->head->iov_base + base;
+		avail_here = min_t(unsigned int, todo,
+				   buf->head->iov_len - base);
+		todo -= avail_here;
+
+		/* Whole elements can be handled in place */
+		while (avail_here >= desc->elem_size) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			avail_here -= desc->elem_size;
+		}
+		/* A partial element at the end of the head is staged in elem */
+		if (avail_here) {
+			if (!elem) {
+				elem = kmalloc(desc->elem_size, GFP_KERNEL);
+				err = -ENOMEM;
+				if (!elem)
+					goto out;
+			}
+			if (encode) {
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+				memcpy(c, elem, avail_here);
+			} else
+				memcpy(elem, c, avail_here);
+			copied = avail_here;
+		}
+		base = buf->head->iov_len;  /* align to start of pages */
+	}
+
+	/* process pages array */
+	base -= buf->head->iov_len;
+	if (todo && base < buf->page_len) {
+		unsigned int avail_page;
+
+		avail_here = min(todo, buf->page_len - base);
+		todo -= avail_here;
+
+		/* Locate the first page and the offset within it */
+		base += buf->page_base;
+		ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+		base &= ~PAGE_CACHE_MASK;
+		avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+					avail_here);
+		c = kmap(*ppages) + base;
+
+		/* One iteration per page; avail_page is this page's share */
+		while (avail_here) {
+			avail_here -= avail_page;
+			/* Finish a straddling element, or start a short one */
+			if (copied || avail_page < desc->elem_size) {
+				unsigned int l = min(avail_page,
+					desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+				avail_page -= l;
+				c += l;
+			}
+			/* Whole elements inside this page are handled in place */
+			while (avail_page >= desc->elem_size) {
+				err = desc->xcode(desc, c);
+				if (err)
+					goto out;
+				c += desc->elem_size;
+				avail_page -= desc->elem_size;
+			}
+			/* Leftover bytes begin an element that continues later */
+			if (avail_page) {
+				unsigned int l = min(avail_page,
+					    desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+			}
+			/* More page data to go: move the mapping to the next page */
+			if (avail_here) {
+				kunmap(*ppages);
+				ppages++;
+				c = kmap(*ppages);
+			}
+
+			avail_page = min(avail_here,
+				 (unsigned int) PAGE_CACHE_SIZE);
+		}
+		base = buf->page_len;  /* align to start of tail */
+	}
+
+	/* process tail */
+	base -= buf->page_len;
+	if (todo) {
+		c = buf->tail->iov_base + base;
+		/* Complete an element that started before the tail */
+		if (copied) {
+			unsigned int l = desc->elem_size - copied;
+
+			if (encode)
+				memcpy(c, elem + copied, l);
+			else {
+				memcpy(elem + copied, c, l);
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+			}
+			todo -= l;
+			c += l;
+		}
+		while (todo) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			todo -= desc->elem_size;
+		}
+	}
+	err = 0;
+
+out:
+	/* elem may still be NULL; the last page touched is still mapped */
+	kfree(elem);
+	if (ppages)
+		kunmap(*ppages);
+	return err;
+}
+
+/*
+ * Decode the XDR array found @base bytes into @buf as described by @desc.
+ * Returns -EINVAL if @base lies at or beyond the end of the message,
+ * otherwise the result of xdr_xcode_array2() in decode mode.
+ */
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if (base < buf->len)
+		return xdr_xcode_array2(buf, base, desc, 0);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(xdr_decode_array2);
+
+/*
+ * Encode the array described by @desc into @buf, @base bytes from its
+ * start.  Returns -EINVAL if the 4-byte count plus the element data would
+ * not fit in the head, pages and tail of @buf combined, otherwise the
+ * result of xdr_xcode_array2() in encode mode.
+ */
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if ((unsigned long) base + 4 + desc->array_len * desc->elem_size <=
+	    buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+		return xdr_xcode_array2(buf, base, desc, 1);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(xdr_encode_array2);
+
+/*
+ * Apply @actor to the @len bytes of @buf that start @offset bytes from
+ * its beginning.
+ *
+ * The range is fed to @actor piecewise through a one-entry scatterlist:
+ * first the part in the head, then one call per page touched, then the
+ * part in the tail.  @data is passed through to @actor unchanged.
+ *
+ * Returns 0 on success.  A non-zero return from @actor in the head or
+ * page phase stops the walk and is returned as is.  -EINVAL is returned
+ * if bytes remain unprocessed once the tail has been handled, i.e. the
+ * range extends beyond the end of @buf.
+ */
+int
+xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
+		int (*actor)(struct scatterlist *, void *), void *data)
+{
+	int i, ret = 0;
+	unsigned page_len, thislen, page_offset;
+	struct scatterlist      sg[1];
+
+	sg_init_table(sg, 1);
+
+	/* head: either skip it entirely or process the part in range */
+	if (offset >= buf->head[0].iov_len) {
+		offset -= buf->head[0].iov_len;
+	} else {
+		thislen = buf->head[0].iov_len - offset;
+		if (thislen > len)
+			thislen = len;
+		sg_set_buf(sg, buf->head[0].iov_base + offset, thislen);
+		ret = actor(sg, data);
+		if (ret)
+			goto out;
+		offset = 0;
+		len -= thislen;
+	}
+	if (len == 0)
+		goto out;
+
+	/* pages: one actor call per page touched by the range */
+	if (offset >= buf->page_len) {
+		offset -= buf->page_len;
+	} else {
+		page_len = buf->page_len - offset;
+		if (page_len > len)
+			page_len = len;
+		len -= page_len;
+		page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
+		i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
+		/* The first page may be entered part-way through */
+		thislen = PAGE_CACHE_SIZE - page_offset;
+		do {
+			if (thislen > page_len)
+				thislen = page_len;
+			sg_set_page(sg, buf->pages[i], thislen, page_offset);
+			ret = actor(sg, data);
+			if (ret)
+				goto out;
+			page_len -= thislen;
+			i++;
+			/* Later pages are used from their beginning */
+			page_offset = 0;
+			thislen = PAGE_CACHE_SIZE;
+		} while (page_len != 0);
+		offset = 0;
+	}
+	if (len == 0)
+		goto out;
+	/* tail */
+	if (offset < buf->tail[0].iov_len) {
+		thislen = buf->tail[0].iov_len - offset;
+		if (thislen > len)
+			thislen = len;
+		sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen);
+		ret = actor(sg, data);
+		len -= thislen;
+	}
+	/* Bytes left over: the range ran past the end of the buffer */
+	if (len != 0)
+		ret = -EINVAL;
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_process_buf);
+
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
new file mode 100644
index 00000000..ce5eb68a
--- /dev/null
+++ b/net/sunrpc/xprt.c
@@ -0,0 +1,1190 @@
+/*
+ *  linux/net/sunrpc/xprt.c
+ *
+ *  This is a generic RPC call interface supporting congestion avoidance,
+ *  and asynchronous calls.
+ *
+ *  The interface works like this:
+ *
+ *  -	When a process places a call, it allocates a request slot if
+ *	one is available. Otherwise, it sleeps on the backlog queue
+ *	(xprt_reserve).
+ *  -	Next, the caller puts together the RPC message, stuffs it into
+ *	the request struct, and calls xprt_transmit().
+ *  -	xprt_transmit sends the message and installs the caller on the
+ *	transport's wait list. At the same time, if a reply is expected,
+ *	it installs a timer that is run after the packet's timeout has
+ *	expired.
+ *  -	When a packet arrives, the data_ready handler walks the list of
+ *	pending requests for that transport. If a matching XID is found, the
+ *	caller is woken up, and the timer removed.
+ *  -	When no reply arrives within the timeout interval, the timer is
+ *	fired by the kernel and runs xprt_timer(). It either adjusts the
+ *	timeout values (minor timeout) or wakes up the caller with a status
+ *	of -ETIMEDOUT.
+ *  -	When the caller receives a notification from RPC that a reply arrived,
+ *	it should release the RPC slot, and process the reply.
+ *	If the call timed out, it may choose to retry the operation by
+ *	adjusting the initial timeout value, and simply calling rpc_call
+ *	again.
+ *
+ *  Support for async RPC is done through a set of RPC-specific scheduling
+ *  primitives that `transparently' work for processes as well as async
+ *  tasks that rely on callbacks.
+ *
+ *  Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de>
+ *
+ *  Transport switch API copyright (C) 2005, Chuck Lever <cel@netapp.com>
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/ktime.h>
+
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>
+
+#include "sunrpc.h"
+
+/*
+ * Local variables
+ */
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_XPRT
+#endif
+
+/*
+ * Local functions
+ */
+static void	xprt_request_init(struct rpc_task *, struct rpc_xprt *);
+static void	xprt_connect_status(struct rpc_task *task);
+static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
+
+/* xprt_list holds the registered xprt_class entries; xprt_list_lock guards it. */
+static DEFINE_SPINLOCK(xprt_list_lock);
+static LIST_HEAD(xprt_list);
+
+/*
+ * The transport code maintains an estimate on the maximum number of out-
+ * standing RPC requests, using a smoothed version of the congestion
+ * avoidance implemented in 44BSD. This is basically the Van Jacobson
+ * congestion algorithm: If a retransmit occurs, the congestion window is
+ * halved; otherwise, it is incremented by 1/cwnd when
+ *
+ *	-	a reply is received and
+ *	-	a full number of requests are outstanding and
+ *	-	the congestion window hasn't been updated recently.
+ *
+ * Window values are kept in fixed point: one request corresponds to
+ * RPC_CWNDSCALE (1 << RPC_CWNDSHIFT) units of xprt->cong and xprt->cwnd.
+ */
+#define RPC_CWNDSHIFT		(8U)
+#define RPC_CWNDSCALE		(1U << RPC_CWNDSHIFT)
+/* Initial window: a single request */
+#define RPC_INITCWND		RPC_CWNDSCALE
+/* Largest window: max_reqs requests */
+#define RPC_MAXCWND(xprt)	((xprt)->max_reqs << RPC_CWNDSHIFT)
+
+/* True when the congestion count has reached the congestion window */
+#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
+
+/**
+ * xprt_register_transport - register a transport implementation
+ * @transport: transport to register
+ *
+ * If a transport implementation is loaded as a kernel module, it can
+ * call this interface to make itself known to the RPC client.  A class
+ * whose ident matches an already registered class is rejected.
+ *
+ * Returns:
+ * 0:		transport successfully registered
+ * -EEXIST:	transport already registered
+ */
+int xprt_register_transport(struct xprt_class *transport)
+{
+	struct xprt_class *cur;
+	int result = 0;
+
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(cur, &xprt_list, list) {
+		/* don't register the same transport class twice */
+		if (cur->ident == transport->ident) {
+			result = -EEXIST;
+			break;
+		}
+	}
+	if (result == 0) {
+		list_add_tail(&transport->list, &xprt_list);
+		printk(KERN_INFO "RPC: Registered %s transport module.\n",
+		       transport->name);
+	}
+	spin_unlock(&xprt_list_lock);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_register_transport);
+
+/**
+ * xprt_unregister_transport - unregister a transport implementation
+ * @transport: transport to unregister
+ *
+ * Removes @transport from the list of registered classes if it is found
+ * there.
+ *
+ * Returns:
+ * 0:		transport successfully unregistered
+ * -ENOENT:	transport never registered
+ */
+int xprt_unregister_transport(struct xprt_class *transport)
+{
+	struct xprt_class *cur;
+	int result = -ENOENT;
+
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(cur, &xprt_list, list) {
+		if (cur != transport)
+			continue;
+		printk(KERN_INFO
+			"RPC: Unregistered %s transport module.\n",
+			transport->name);
+		list_del_init(&transport->list);
+		result = 0;
+		break;
+	}
+	spin_unlock(&xprt_list_lock);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(xprt_unregister_transport);
+
+/**
+ * xprt_load_transport - load a transport implementation
+ * @transport_name: transport to load
+ *
+ * If a class with this name is already registered nothing is loaded.
+ * Otherwise the module "xprt<transport_name>" is requested.
+ *
+ * Returns:
+ * 0:		transport already registered
+ * otherwise:	the result of request_module()
+ */
+int xprt_load_transport(const char *transport_name)
+{
+	struct xprt_class *cur;
+	int found = 0;
+
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(cur, &xprt_list, list) {
+		if (strcmp(cur->name, transport_name) == 0) {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&xprt_list_lock);
+
+	if (found)
+		return 0;
+	return request_module("xprt%s", transport_name);
+}
+EXPORT_SYMBOL_GPL(xprt_load_transport);
+
+/**
+ * xprt_reserve_xprt - serialize write access to transports
+ * @task: task that is requesting access to the transport
+ *
+ * This prevents mixing the payload of separate requests, and prevents
+ * transport connects from colliding with writes.  No congestion control
+ * is provided.
+ *
+ * Returns 1 if @task owns the transport write lock on return (whether it
+ * was just acquired or already held).  Otherwise @task is put to sleep
+ * with status -EAGAIN, on the resend queue if its request has been
+ * transmitted before and on the sending queue if not, and 0 is returned.
+ */
+int xprt_reserve_xprt(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt	*xprt = req->rq_xprt;
+
+	if (!test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
+		/* The lock was free: this task becomes the sender. */
+		xprt->snd_task = task;
+		req->rq_bytes_sent = 0;
+		req->rq_ntrans++;
+		return 1;
+	}
+	if (xprt->snd_task == task)
+		return 1;
+
+	dprintk("RPC: %5u failed to lock transport %p\n",
+			task->tk_pid, xprt);
+	task->tk_status = -EAGAIN;
+	task->tk_timeout = 0;
+	rpc_sleep_on(req->rq_ntrans ? &xprt->resend : &xprt->sending,
+		     task, NULL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
+
+/*
+ * Give up ownership of the transport write lock.
+ *
+ * snd_task is always cleared.  The XPRT_LOCKED bit itself is only cleared
+ * when no close is pending (XPRT_CLOSE_WAIT is not set) or the transport
+ * is shutting down.  Otherwise the bit stays set and the task_cleanup
+ * work item is queued on rpciod_workqueue (presumably xprt_autoclose(),
+ * which releases the lock once the close is done -- confirm where
+ * task_cleanup is initialised).
+ */
+static void xprt_clear_locked(struct rpc_xprt *xprt)
+{
+	xprt->snd_task = NULL;
+	if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) {
+		/* Barriers on both sides of the bit clear */
+		smp_mb__before_clear_bit();
+		clear_bit(XPRT_LOCKED, &xprt->state);
+		smp_mb__after_clear_bit();
+	} else
+		queue_work(rpciod_workqueue, &xprt->task_cleanup);
+}
+
+/**
+ * xprt_reserve_xprt_cong - serialize write access to transports
+ * @task: task that is requesting access to the transport
+ *
+ * Same as xprt_reserve_xprt, but Van Jacobson congestion control is
+ * integrated into the decision of whether a request is allowed to be
+ * woken up and given access to the transport.
+ *
+ * Returns 1 if @task owns the transport write lock on return, or 0 after
+ * putting @task to sleep with status -EAGAIN because the lock is held by
+ * another task or the congestion window is full.
+ */
+int xprt_reserve_xprt_cong(struct rpc_task *task)
+{
+	struct rpc_xprt	*xprt = task->tk_xprt;
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
+		/* Already locked: fine if this task is the current owner */
+		if (task == xprt->snd_task)
+			return 1;
+		goto out_sleep;
+	}
+	/*
+	 * NOTE(review): __xprt_get_cong() dereferences task->tk_rqstp
+	 * without a NULL check, while this function tests req for NULL
+	 * afterwards -- confirm req cannot be NULL at this point.
+	 */
+	if (__xprt_get_cong(xprt, task)) {
+		xprt->snd_task = task;
+		if (req) {
+			req->rq_bytes_sent = 0;
+			req->rq_ntrans++;
+		}
+		return 1;
+	}
+	/* Congested: give the lock we just took back before sleeping */
+	xprt_clear_locked(xprt);
+out_sleep:
+	dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
+	task->tk_timeout = 0;
+	task->tk_status = -EAGAIN;
+	/* Requests transmitted before wait on resend, new ones on sending */
+	if (req && req->rq_ntrans)
+		rpc_sleep_on(&xprt->resend, task, NULL);
+	else
+		rpc_sleep_on(&xprt->sending, task, NULL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
+
+/*
+ * Call the transport's reserve_xprt method for @task while holding
+ * transport_lock, and hand back its result.
+ */
+static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	int locked;
+
+	spin_lock_bh(&xprt->transport_lock);
+	locked = xprt->ops->reserve_xprt(task);
+	spin_unlock_bh(&xprt->transport_lock);
+
+	return locked;
+}
+
+/*
+ * Hand the transport write lock to the next waiting task, if any.
+ *
+ * Does nothing if the lock is already held.  Otherwise the lock is taken
+ * and a task is woken from the resend queue or, if that queue yields
+ * none, from the sending queue; the woken task becomes snd_task.  If no
+ * task is waiting the lock is dropped again.
+ */
+static void __xprt_lock_write_next(struct rpc_xprt *xprt)
+{
+	struct rpc_task *task;
+	struct rpc_rqst *req;
+
+	if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
+		return;
+
+	/* The resend queue is tried before the sending queue */
+	task = rpc_wake_up_next(&xprt->resend);
+	if (!task) {
+		task = rpc_wake_up_next(&xprt->sending);
+		if (!task)
+			goto out_unlock;
+	}
+
+	req = task->tk_rqstp;
+	xprt->snd_task = task;
+	if (req) {
+		/* Start of a new transmission attempt for this request */
+		req->rq_bytes_sent = 0;
+		req->rq_ntrans++;
+	}
+	return;
+
+out_unlock:
+	xprt_clear_locked(xprt);
+}
+
+/*
+ * Congestion-aware variant of __xprt_lock_write_next().
+ *
+ * Does nothing if the lock is already held.  No task is woken while the
+ * transport is congested.  A woken task only becomes snd_task if
+ * __xprt_get_cong() admits its request into the congestion window;
+ * otherwise the lock is dropped again.
+ */
+static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
+{
+	struct rpc_task *task;
+
+	if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
+		return;
+	if (RPCXPRT_CONGESTED(xprt))
+		goto out_unlock;
+	/* The resend queue is tried before the sending queue */
+	task = rpc_wake_up_next(&xprt->resend);
+	if (!task) {
+		task = rpc_wake_up_next(&xprt->sending);
+		if (!task)
+			goto out_unlock;
+	}
+	if (__xprt_get_cong(xprt, task)) {
+		struct rpc_rqst *req = task->tk_rqstp;
+		xprt->snd_task = task;
+		if (req) {
+			/* Start of a new transmission attempt */
+			req->rq_bytes_sent = 0;
+			req->rq_ntrans++;
+		}
+		return;
+	}
+out_unlock:
+	xprt_clear_locked(xprt);
+}
+
+/**
+ * xprt_release_xprt - allow other requests to use a transport
+ * @xprt: transport with other tasks potentially waiting
+ * @task: task that is releasing access to the transport
+ *
+ * Note that "task" can be NULL.  No congestion control is provided.
+ * Nothing happens unless @task is the transport's current snd_task; in
+ * that case the write lock is dropped and offered to the next waiter.
+ */
+void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	if (xprt->snd_task != task)
+		return;
+
+	xprt_clear_locked(xprt);
+	__xprt_lock_write_next(xprt);
+}
+EXPORT_SYMBOL_GPL(xprt_release_xprt);
+
+/**
+ * xprt_release_xprt_cong - allow other requests to use a transport
+ * @xprt: transport with other tasks potentially waiting
+ * @task: task that is releasing access to the transport
+ *
+ * Note that "task" can be NULL.  Another task is awoken to use the
+ * transport if the transport's congestion window allows it.  Nothing
+ * happens unless @task is the transport's current snd_task.
+ */
+void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	if (xprt->snd_task != task)
+		return;
+
+	xprt_clear_locked(xprt);
+	__xprt_lock_write_next_cong(xprt);
+}
+EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
+
+/*
+ * Call the transport's release_xprt method for @task while holding
+ * transport_lock.  xprt_autoclose() passes a NULL @task.
+ */
+static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	spin_lock_bh(&xprt->transport_lock);
+	xprt->ops->release_xprt(xprt, task);
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * Van Jacobson congestion avoidance: try to admit the task's request
+ * into the congestion window.
+ *
+ * Returns 1 if the request already holds a congestion slot or was just
+ * given one (adding RPC_CWNDSCALE to xprt->cong), and 0 if the window
+ * is full.
+ */
+static int
+__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	if (!req->rq_cong) {
+		dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
+				task->tk_pid, xprt->cong, xprt->cwnd);
+		if (RPCXPRT_CONGESTED(xprt))
+			return 0;
+		req->rq_cong = 1;
+		xprt->cong += RPC_CWNDSCALE;
+	}
+	return 1;
+}
+
+/*
+ * Return the request's congestion slot, if it holds one, and then give
+ * the next task that has been sleeping due to congestion a chance to
+ * take the transport.
+ */
+static void
+__xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+	if (req->rq_cong) {
+		req->rq_cong = 0;
+		xprt->cong -= RPC_CWNDSCALE;
+		__xprt_lock_write_next_cong(xprt);
+	}
+}
+
+/**
+ * xprt_release_rqst_cong - housekeeping when request is complete
+ * @task: RPC request that recently completed
+ *
+ * Useful for transports that require congestion control: gives back the
+ * congestion slot held by the task's request, if any.
+ */
+void xprt_release_rqst_cong(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpc_rqst *req = task->tk_rqstp;
+
+	__xprt_put_cong(xprt, req);
+}
+EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
+
+/**
+ * xprt_adjust_cwnd - adjust transport congestion window
+ * @task: recently completed RPC request used to adjust window
+ * @result: result code of completed RPC request
+ *
+ * We use a time-smoothed congestion estimator to avoid heavy oscillation.
+ *
+ * On success with a full window (cwnd <= cong) the window grows by about
+ * RPC_CWNDSCALE^2 / cwnd, capped at RPC_MAXCWND().  On -ETIMEDOUT it is
+ * halved, but never drops below RPC_CWNDSCALE (one request).  In every
+ * case the request's congestion slot is released afterwards.
+ */
+void xprt_adjust_cwnd(struct rpc_task *task, int result)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = task->tk_xprt;
+	unsigned long cwnd = xprt->cwnd;
+
+	if (result >= 0 && cwnd <= xprt->cong) {
+		/* The (cwnd >> 1) term makes sure
+		 * the result gets rounded properly. */
+		cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd;
+		if (cwnd > RPC_MAXCWND(xprt))
+			cwnd = RPC_MAXCWND(xprt);
+		__xprt_lock_write_next_cong(xprt);
+	} else if (result == -ETIMEDOUT) {
+		cwnd >>= 1;
+		if (cwnd < RPC_CWNDSCALE)
+			cwnd = RPC_CWNDSCALE;
+	}
+	/* The values are unsigned long, so print them with %lu */
+	dprintk("RPC:       cong %lu, cwnd was %lu, now %lu\n",
+			xprt->cong, xprt->cwnd, cwnd);
+	xprt->cwnd = cwnd;
+	__xprt_put_cong(xprt, req);
+}
+EXPORT_SYMBOL_GPL(xprt_adjust_cwnd);
+
+/**
+ * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue
+ * @xprt: transport with waiting tasks
+ * @status: result code to plant in each task before waking it
+ *
+ * A negative @status is passed on to the woken tasks; for any other
+ * value the tasks are woken without a status being set.
+ */
+void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status)
+{
+	if (status >= 0)
+		rpc_wake_up(&xprt->pending);
+	else
+		rpc_wake_up_status(&xprt->pending, status);
+}
+EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks);
+
+/**
+ * xprt_wait_for_buffer_space - wait for transport output buffer to clear
+ * @task: task to be put to sleep
+ * @action: function pointer to be executed after wait
+ *
+ * The task sleeps on the transport's pending queue, with its timeout set
+ * to the request's current rq_timeout.
+ */
+void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+
+	task->tk_timeout = rqst->rq_timeout;
+	rpc_sleep_on(&rqst->rq_xprt->pending, task, action);
+}
+EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
+
+/**
+ * xprt_write_space - wake the task waiting for transport output buffer space
+ * @xprt: transport with waiting tasks
+ *
+ * Can be called in a soft IRQ context, so xprt_write_space never sleeps.
+ * Only the task that currently owns the transport (snd_task) is woken;
+ * nothing is done once the transport is shutting down.
+ */
+void xprt_write_space(struct rpc_xprt *xprt)
+{
+	struct rpc_task *sender;
+
+	if (unlikely(xprt->shutdown))
+		return;
+
+	spin_lock_bh(&xprt->transport_lock);
+	sender = xprt->snd_task;
+	if (sender != NULL) {
+		dprintk("RPC:       write space: waking waiting task on "
+				"xprt %p\n", xprt);
+		rpc_wake_up_queued_task(&xprt->pending, sender);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_write_space);
+
+/**
+ * xprt_set_retrans_timeout_def - set a request's retransmit timeout
+ * @task: task whose timeout is to be set
+ *
+ * Copies the request's rq_timeout into the task's timeout.  Used by
+ * transports that don't adjust the retransmit timeout based on
+ * round-trip time estimation.
+ */
+void xprt_set_retrans_timeout_def(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+
+	task->tk_timeout = rqst->rq_timeout;
+}
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def);
+
+/**
+ * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout
+ * @task: task whose timeout is to be set
+ *
+ * Set a request's retransmit timeout using the RTT estimator.
+ *
+ * The estimate from rpc_calc_rto() is doubled once for every timeout
+ * counted by rpc_ntimeo() and once for every retry of this request.  The
+ * result is capped at the client's to_maxval.
+ */
+void xprt_set_retrans_timeout_rtt(struct rpc_task *task)
+{
+	int timer = task->tk_msg.rpc_proc->p_timer;
+	struct rpc_clnt *clnt = task->tk_client;
+	struct rpc_rtt *rtt = clnt->cl_rtt;
+	struct rpc_rqst *req = task->tk_rqstp;
+	unsigned long max_timeout = clnt->cl_timeout->to_maxval;
+
+	task->tk_timeout = rpc_calc_rto(rtt, timer);
+	/* Exponential backoff */
+	task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries;
+	/* A zero result (e.g. after the shift overflowed) also gets the cap */
+	if (task->tk_timeout > max_timeout || task->tk_timeout == 0)
+		task->tk_timeout = max_timeout;
+}
+EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt);
+
+/*
+ * Recompute the request's major timeout deadline.
+ *
+ * Starting from rq_timeout, the interval is either shifted left by
+ * to_retries (exponential policy) or extended by to_increment for each
+ * of the to_retries retries.  It is capped at to_maxval and then turned
+ * into an absolute deadline by adding the current jiffies.
+ */
+static void xprt_reset_majortimeo(struct rpc_rqst *req)
+{
+	const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
+
+	req->rq_majortimeo = req->rq_timeout;
+	if (to->to_exponential)
+		req->rq_majortimeo <<= to->to_retries;
+	else
+		req->rq_majortimeo += to->to_increment * to->to_retries;
+	/* A zero interval is also replaced by the maximum */
+	if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0)
+		req->rq_majortimeo = to->to_maxval;
+	req->rq_majortimeo += jiffies;
+}
+
+/**
+ * xprt_adjust_timeout - adjust timeout values for next retransmit
+ * @req: RPC request containing parameters to use for the adjustment
+ *
+ * While the major timeout deadline has not passed, rq_timeout is backed
+ * off (doubled or increased by to_increment, capped at to_maxval when
+ * that is non-zero), rq_retries is incremented and 0 is returned.
+ *
+ * Once the deadline has passed, the timeout and retry count are reset to
+ * their initial values, a new deadline is computed, the client's RTT
+ * estimator is re-initialised and -ETIMEDOUT is returned.
+ */
+int xprt_adjust_timeout(struct rpc_rqst *req)
+{
+	struct rpc_xprt *xprt = req->rq_xprt;
+	const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout;
+	int status = 0;
+
+	if (time_before(jiffies, req->rq_majortimeo)) {
+		/* Minor timeout: back off and retry */
+		if (to->to_exponential)
+			req->rq_timeout <<= 1;
+		else
+			req->rq_timeout += to->to_increment;
+		if (to->to_maxval && req->rq_timeout >= to->to_maxval)
+			req->rq_timeout = to->to_maxval;
+		req->rq_retries++;
+	} else {
+		/* Major timeout: start over from the initial values */
+		req->rq_timeout = to->to_initval;
+		req->rq_retries = 0;
+		xprt_reset_majortimeo(req);
+		/* Reset the RTT counters == "slow start" */
+		spin_lock_bh(&xprt->transport_lock);
+		rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval);
+		spin_unlock_bh(&xprt->transport_lock);
+		status = -ETIMEDOUT;
+	}
+
+	/* Never leave a zero timeout behind; fall back to 5 seconds */
+	if (req->rq_timeout == 0) {
+		printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n");
+		req->rq_timeout = 5 * HZ;
+	}
+	return status;
+}
+
+/*
+ * Work function for the transport's task_cleanup work item.
+ *
+ * Closes the transport through its close method, clears XPRT_CLOSE_WAIT
+ * and then releases the transport write lock on behalf of no task
+ * (NULL).  The code that queues this work takes XPRT_LOCKED first or
+ * leaves it set.
+ */
+static void xprt_autoclose(struct work_struct *work)
+{
+	struct rpc_xprt *xprt =
+		container_of(work, struct rpc_xprt, task_cleanup);
+
+	xprt->ops->close(xprt);
+	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	xprt_release_write(xprt, NULL);
+}
+
+/**
+ * xprt_disconnect_done - mark a transport as disconnected
+ * @xprt: transport to flag for disconnect
+ *
+ * Under transport_lock, clears the connected state and wakes every task
+ * on the pending queue with status -EAGAIN.
+ */
+void xprt_disconnect_done(struct rpc_xprt *xprt)
+{
+	dprintk("RPC:       disconnected transport %p\n", xprt);
+	spin_lock_bh(&xprt->transport_lock);
+	xprt_clear_connected(xprt);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
+	spin_unlock_bh(&xprt->transport_lock);
+}
+EXPORT_SYMBOL_GPL(xprt_disconnect_done);
+
+/**
+ * xprt_force_disconnect - force a transport to disconnect
+ * @xprt: transport to disconnect
+ *
+ * Sets XPRT_CLOSE_WAIT and, if the transport write lock can be taken
+ * right away, queues the task_cleanup work.  If the lock is held, no
+ * work is queued here; xprt_clear_locked() queues it when the holder
+ * releases the lock with XPRT_CLOSE_WAIT set.  All pending tasks are
+ * woken with -EAGAIN.
+ */
+void xprt_force_disconnect(struct rpc_xprt *xprt)
+{
+	/* Don't race with the test_bit() in xprt_clear_locked() */
+	spin_lock_bh(&xprt->transport_lock);
+	set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	/* Try to schedule an autoclose RPC call */
+	if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
+		queue_work(rpciod_workqueue, &xprt->task_cleanup);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/**
+ * xprt_conditional_disconnect - force a transport to disconnect
+ * @xprt: transport to disconnect
+ * @cookie: 'connection cookie'
+ *
+ * This attempts to break the connection if and only if 'cookie' matches
+ * the current transport 'connection cookie'. It ensures that we don't
+ * try to break the connection more than once when we need to retransmit
+ * a batch of RPC requests.
+ *
+ * Nothing is done either if the transport is already closing or is not
+ * connected.
+ */
+void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
+{
+	/* Don't race with the test_bit() in xprt_clear_locked() */
+	spin_lock_bh(&xprt->transport_lock);
+	if (cookie == xprt->connect_cookie &&
+	    !test_bit(XPRT_CLOSING, &xprt->state) &&
+	    xprt_connected(xprt)) {
+		set_bit(XPRT_CLOSE_WAIT, &xprt->state);
+		/* Try to schedule an autoclose RPC call */
+		if (!test_and_set_bit(XPRT_LOCKED, &xprt->state))
+			queue_work(rpciod_workqueue, &xprt->task_cleanup);
+		xprt_wake_pending_tasks(xprt, -EAGAIN);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/* Idle timer callback: request a connection close if the transport is idle. */
+static void xprt_init_autodisconnect(unsigned long data)
+{
+	struct rpc_xprt *xprt = (struct rpc_xprt *)data;
+	int idle;
+
+	spin_lock(&xprt->transport_lock);
+	/* Idle: nothing on the receive list, no shutdown, XPRT_LOCKED won */
+	idle = list_empty(&xprt->recv) && !xprt->shutdown &&
+	       !test_and_set_bit(XPRT_LOCKED, &xprt->state);
+	spin_unlock(&xprt->transport_lock);
+	if (!idle)
+		return;
+
+	set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
+	queue_work(rpciod_workqueue, &xprt->task_cleanup);
+}
+
+/**
+ * xprt_connect - schedule a transport connect operation
+ * @task: RPC task that is requesting the connect; gets -EAGAIN if unbound
+ *
+ */
+void xprt_connect(struct rpc_task *task)
+{
+	struct rpc_xprt	*xprt = task->tk_xprt;
+
+	dprintk("RPC: %5u xprt_connect xprt %p %s connected\n", task->tk_pid,
+			xprt, (xprt_connected(xprt) ? "is" : "is not"));
+
+	if (!xprt_bound(xprt)) {
+		task->tk_status = -EAGAIN;
+		return;
+	}
+	if (!xprt_lock_write(xprt, task))
+		return;
+
+	if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state))
+		xprt->ops->close(xprt);
+
+	if (xprt_connected(xprt))
+		xprt_release_write(xprt, task);
+	else {
+		if (task->tk_rqstp) {	/* guard both dereferences */
+			task->tk_rqstp->rq_bytes_sent = 0;
+			task->tk_timeout = task->tk_rqstp->rq_timeout;
+		}
+		rpc_sleep_on(&xprt->pending, task, xprt_connect_status);
+
+		if (test_bit(XPRT_CLOSING, &xprt->state))
+			return;
+		if (xprt_test_and_set_connecting(xprt))
+			return;		/* a connect is already in progress */
+		xprt->stat.connect_start = jiffies;
+		xprt->ops->connect(task);
+	}
+}
+
+/* Callback run when a task put to sleep by xprt_connect() is woken up. */
+static void xprt_connect_status(struct rpc_task *task)
+{
+	struct rpc_xprt	*xprt = task->tk_xprt;
+
+	switch (task->tk_status) {
+	case 0:
+		/* Success: count the connect and the jiffies it took */
+		xprt->stat.connect_count++;
+		xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start;
+		dprintk("RPC: %5u xprt_connect_status: connection established\n",
+				task->tk_pid);
+		break;
+	case -EAGAIN:
+		dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
+		break;
+	case -ETIMEDOUT:
+		dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
+				"out\n", task->tk_pid);
+		break;
+	default:
+		dprintk("RPC: %5u xprt_connect_status: error %d connecting to "
+				"server %s\n", task->tk_pid, -task->tk_status,
+				task->tk_client->cl_server);
+		xprt_release_write(xprt, task);
+		task->tk_status = -EIO;
+	}
+}
+
+/**
+ * xprt_lookup_rqst - find the RPC request on the receive list matching an XID
+ * @xprt: transport on which the original request was transmitted
+ * @xid: RPC XID of incoming reply; a miss is counted in stat.bad_xids
+ *
+ */
+struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
+{
+	struct rpc_rqst *req;
+
+	list_for_each_entry(req, &xprt->recv, rq_list) {
+		if (req->rq_xid == xid)
+			return req;
+	}
+	dprintk("RPC:       xprt_lookup_rqst did not find xid %08x\n",
+			ntohl(xid));
+	xprt->stat.bad_xids++;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
+
+static void xprt_update_rtt(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_rtt *rtt = task->tk_client->cl_rtt;
+	unsigned timer = task->tk_msg.rpc_proc->p_timer;
+	long elapsed = usecs_to_jiffies(ktime_to_us(req->rq_rtt));
+
+	if (!timer)
+		return;		/* procedure has no RTT timer class */
+	if (req->rq_ntrans == 1)
+		rpc_update_rtt(rtt, timer, elapsed);
+	rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
+}
+
+/**
+ * xprt_complete_rqst - called when reply processing is complete
+ * @task: RPC request that recently completed
+ * @copied: actual number of bytes received from the transport
+ *
+ * Caller holds transport lock.
+ */
+void xprt_complete_rqst(struct rpc_task *task, int copied)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+
+	dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
+			task->tk_pid, ntohl(req->rq_xid), copied);
+
+	xprt->stat.recvs++;
+	req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
+	if (xprt->ops->timer != NULL)
+		xprt_update_rtt(task);
+
+	list_del_init(&req->rq_list);
+	req->rq_private_buf.len = copied;
+	/* Write barrier: the stores above must be visible before */
+	/* req->rq_reply_bytes_recvd is updated */
+	smp_wmb();
+	req->rq_reply_bytes_recvd = copied;
+	rpc_wake_up_queued_task(&xprt->pending, task);	/* wake the sleeper */
+}
+EXPORT_SYMBOL_GPL(xprt_complete_rqst);
+
+/* Wake-up callback for tasks that xprt_transmit() left sleeping for a reply. */
+static void xprt_timer(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+
+	if (task->tk_status != -ETIMEDOUT)
+		return;
+	dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (req->rq_reply_bytes_recvd)
+		task->tk_status = 0;	/* reply bytes are in: clear the timeout */
+	else if (xprt->ops->timer)
+		xprt->ops->timer(task);
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+static inline int xprt_has_timer(struct rpc_xprt *xprt)
+{
+	return xprt->idle_timeout ? 1 : 0;	/* 1 if an idle timeout is set */
+}
+
+/**
+ * xprt_prepare_transmit - reserve the transport before sending a request
+ * @task: RPC task about to send a request
+ *
+ * Returns 0 once the transport is reserved, -EAGAIN if it could not be
+ * reserved, or the received byte count if a reply is already in hand.
+ */
+int xprt_prepare_transmit(struct rpc_task *task)
+{
+	struct rpc_rqst	*req = task->tk_rqstp;
+	struct rpc_xprt	*xprt = req->rq_xprt;
+	int ret = 0;
+
+	dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid);
+
+	spin_lock_bh(&xprt->transport_lock);
+	/* Reply bytes already received and no partial send outstanding */
+	if (req->rq_reply_bytes_recvd && !req->rq_bytes_sent)
+		ret = req->rq_reply_bytes_recvd;
+	else if (!xprt->ops->reserve_xprt(task))
+		ret = -EAGAIN;
+	spin_unlock_bh(&xprt->transport_lock);
+	return ret;
+}
+
+void xprt_end_transmit(struct rpc_task *task)
+{
+	xprt_release_write(task->tk_rqstp->rq_xprt, task);	/* on the request's xprt */
+}
+
+/**
+ * xprt_transmit - send an RPC request on a transport
+ * @task: controlling RPC task
+ *
+ * We have to copy the iovec because sendmsg fiddles with its contents.
+ */
+void xprt_transmit(struct rpc_task *task)
+{
+	struct rpc_rqst	*req = task->tk_rqstp;
+	struct rpc_xprt	*xprt = req->rq_xprt;
+	int status;
+
+	dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen);
+
+	if (!req->rq_reply_bytes_recvd) {
+		if (list_empty(&req->rq_list) && rpc_reply_expected(task)) {
+			/*
+			 * Add to the list only if we're expecting a reply
+			 */
+			spin_lock_bh(&xprt->transport_lock);
+			/* Update the softirq receive buffer */
+			memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
+					sizeof(req->rq_private_buf));
+			/* Add request to the receive list */
+			list_add_tail(&req->rq_list, &xprt->recv);
+			spin_unlock_bh(&xprt->transport_lock);
+			xprt_reset_majortimeo(req);
+			/* Turn off autodisconnect */
+			del_singleshot_timer_sync(&xprt->timer);
+		}
+	} else if (!req->rq_bytes_sent)
+		return;		/* reply bytes in and nothing partially sent */
+
+	req->rq_connect_cookie = xprt->connect_cookie;
+	req->rq_xtime = ktime_get();	/* start of the RTT measurement */
+	status = xprt->ops->send_request(task);
+	if (status != 0) {
+		task->tk_status = status;	/* hand the send error to the task */
+		return;
+	}
+
+	dprintk("RPC: %5u xmit complete\n", task->tk_pid);
+	task->tk_flags |= RPC_TASK_SENT;
+	spin_lock_bh(&xprt->transport_lock);
+
+	xprt->ops->set_retrans_timeout(task);
+
+	xprt->stat.sends++;
+	xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
+	xprt->stat.bklog_u += xprt->backlog.qlen;
+
+	/* Don't race with disconnect */
+	if (!xprt_connected(xprt))
+		task->tk_status = -ENOTCONN;
+	else if (!req->rq_reply_bytes_recvd && rpc_reply_expected(task)) {
+		/*
+		 * Sleep on the pending queue since
+		 * we're expecting a reply.
+		 */
+		rpc_sleep_on(&xprt->pending, task, xprt_timer);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void xprt_alloc_slot(struct rpc_task *task)
+{
+	struct rpc_xprt	*xprt = task->tk_xprt;
+
+	task->tk_status = 0;
+	if (task->tk_rqstp)
+		return;		/* a slot is already assigned */
+	if (list_empty(&xprt->free)) {
+		dprintk("RPC:       waiting for request slot\n");
+		task->tk_status = -EAGAIN;
+		task->tk_timeout = 0;
+		rpc_sleep_on(&xprt->backlog, task, NULL);
+	} else {
+		task->tk_rqstp = list_entry(xprt->free.next,
+					    struct rpc_rqst, rq_list);
+		list_del_init(&task->tk_rqstp->rq_list);
+		xprt_request_init(task, xprt);
+	}
+}
+
+static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
+{
+	memset(req, 0, sizeof(*req));	/* mark unused */
+
+	spin_lock(&xprt->reserve_lock);
+	list_add(&req->rq_list, &xprt->free);	/* back on the free list */
+	rpc_wake_up_next(&xprt->backlog);	/* next task waiting for a slot */
+	spin_unlock(&xprt->reserve_lock);
+}
+
+/*
+ * Allocate a zeroed transport of @size bytes plus an array of @max_req
+ * request slots. The new transport starts with a count of one and takes
+ * a reference on @net. Returns NULL if either allocation fails.
+ */
+struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
+{
+	struct rpc_xprt *xprt = kzalloc(size, GFP_KERNEL);
+
+	if (!xprt)
+		return NULL;
+	atomic_set(&xprt->count, 1);
+	xprt->max_reqs = max_req;
+
+	xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
+	if (!xprt->slot) {
+		kfree(xprt);
+		return NULL;
+	}
+	xprt->xprt_net = get_net(net);
+	return xprt;
+}
+EXPORT_SYMBOL_GPL(xprt_alloc);
+
+void xprt_free(struct rpc_xprt *xprt)
+{
+	put_net(xprt->xprt_net);	/* pairs with get_net() in xprt_alloc() */
+	kfree(xprt->slot);		/* slot array from xprt_alloc() */
+	kfree(xprt);
+}
+EXPORT_SYMBOL_GPL(xprt_free);
+
+/**
+ * xprt_reserve - allocate an RPC request slot
+ * @task: RPC task requesting a slot allocation
+ *
+ * If no more slots are available, place the task on the transport's
+ * backlog queue.
+ */
+void xprt_reserve(struct rpc_task *task)
+{
+	struct rpc_xprt	*xprt = task->tk_xprt;
+
+	task->tk_status = -EIO;		/* xprt_alloc_slot() overwrites this */
+	spin_lock(&xprt->reserve_lock);
+	xprt_alloc_slot(task);		/* leaves tk_status 0 or -EAGAIN */
+	spin_unlock(&xprt->reserve_lock);
+}
+
+static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
+{
+	return (__force __be32)xprt->xid++;	/* hands out the old value */
+}
+
+static inline void xprt_init_xid(struct rpc_xprt *xprt)
+{
+	xprt->xid = net_random();	/* XIDs start at a random value */
+}
+
+static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
+{
+	struct rpc_rqst	*req = task->tk_rqstp;
+
+	req->rq_timeout = task->tk_client->cl_timeout->to_initval;
+	req->rq_task	= task;
+	req->rq_xprt    = xprt;
+	req->rq_buffer  = NULL;
+	req->rq_xid     = xprt_alloc_xid(xprt);	/* next XID of this xprt */
+	req->rq_release_snd_buf = NULL;
+	xprt_reset_majortimeo(req);
+	dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
+			req, ntohl(req->rq_xid));
+}
+
+/**
+ * xprt_release - release an RPC request slot
+ * @task: task which is finished with the slot; no-op if it holds none
+ *
+ */
+void xprt_release(struct rpc_task *task)
+{
+	struct rpc_rqst	*req = task->tk_rqstp;
+	struct rpc_xprt	*xprt;
+
+	if (req == NULL)
+		return;
+
+	xprt = req->rq_xprt;
+	rpc_count_iostats(task);
+	spin_lock_bh(&xprt->transport_lock);
+	xprt->ops->release_xprt(xprt, task);
+	if (xprt->ops->release_request)
+		xprt->ops->release_request(task);
+	if (!list_empty(&req->rq_list))
+		list_del(&req->rq_list);
+	xprt->last_used = jiffies;
+	/* Re-arm the idle timer once the receive list has drained */
+	if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+		mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
+	spin_unlock_bh(&xprt->transport_lock);
+	if (req->rq_buffer)
+		xprt->ops->buf_free(req->rq_buffer);
+	if (req->rq_cred)
+		put_rpccred(req->rq_cred);
+	task->tk_rqstp = NULL;
+	if (req->rq_release_snd_buf)
+		req->rq_release_snd_buf(req);
+
+	dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
+	if (likely(!bc_prealloc(req)))
+		xprt_free_slot(xprt, req);
+	else
+		xprt_free_bc_request(req);
+}
+
+/**
+ * xprt_create_transport - create an RPC transport
+ * @args: rpc transport creation arguments; args->ident selects the class
+ *
+ */
+struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
+{
+	struct rpc_xprt	*xprt;
+	struct xprt_class *t;
+	unsigned int i;
+
+	spin_lock(&xprt_list_lock);
+	list_for_each_entry(t, &xprt_list, list) {
+		if (t->ident == args->ident) {
+			spin_unlock(&xprt_list_lock);
+			goto found;
+		}
+	}
+	spin_unlock(&xprt_list_lock);
+	printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident);
+	return ERR_PTR(-EIO);	/* no registered class matches args->ident */
+
+found:
+	xprt = t->setup(args);
+	if (IS_ERR(xprt)) {
+		dprintk("RPC:       xprt_create_transport: failed, %ld\n",
+				-PTR_ERR(xprt));
+		return xprt;
+	}
+	if (test_and_set_bit(XPRT_INITIALIZED, &xprt->state))
+		/* ->setup returned a pre-initialized xprt: */
+		return xprt;
+
+	spin_lock_init(&xprt->transport_lock);
+	spin_lock_init(&xprt->reserve_lock);
+
+	INIT_LIST_HEAD(&xprt->free);
+	INIT_LIST_HEAD(&xprt->recv);
+#if defined(CONFIG_NFS_V4_1)
+	spin_lock_init(&xprt->bc_pa_lock);
+	INIT_LIST_HEAD(&xprt->bc_pa_list);
+#endif /* CONFIG_NFS_V4_1 */
+
+	INIT_WORK(&xprt->task_cleanup, xprt_autoclose);
+	if (xprt_has_timer(xprt))
+		setup_timer(&xprt->timer, xprt_init_autodisconnect,
+			    (unsigned long)xprt);
+	else
+		init_timer(&xprt->timer);
+	xprt->last_used = jiffies;
+	xprt->cwnd = RPC_INITCWND;
+	xprt->bind_index = 0;
+
+	rpc_init_wait_queue(&xprt->binding, "xprt_binding");
+	rpc_init_wait_queue(&xprt->pending, "xprt_pending");
+	rpc_init_wait_queue(&xprt->sending, "xprt_sending");
+	rpc_init_wait_queue(&xprt->resend, "xprt_resend");
+	rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
+
+	/* initialize free list; the index loop never forms &slot[-1] */
+	for (i = xprt->max_reqs; i-- > 0; )
+		list_add(&xprt->slot[i].rq_list, &xprt->free);
+
+	xprt_init_xid(xprt);
+
+	dprintk("RPC:       created transport %p with %u slots\n", xprt,
+			xprt->max_reqs);
+	return xprt;
+}
+
+/**
+ * xprt_destroy - destroy an RPC transport, killing off all requests.
+ * @xprt: transport to destroy
+ *
+ */
+static void xprt_destroy(struct rpc_xprt *xprt)
+{
+	dprintk("RPC:       destroying transport %p\n", xprt);
+	xprt->shutdown = 1;	/* tested by xprt_init_autodisconnect() */
+	del_timer_sync(&xprt->timer);
+
+	rpc_destroy_wait_queue(&xprt->binding);
+	rpc_destroy_wait_queue(&xprt->pending);
+	rpc_destroy_wait_queue(&xprt->sending);
+	rpc_destroy_wait_queue(&xprt->resend);
+	rpc_destroy_wait_queue(&xprt->backlog);
+	cancel_work_sync(&xprt->task_cleanup);	/* the xprt_autoclose work */
+	/*
+	 * Tear down transport state and free the rpc_xprt
+	 */
+	xprt->ops->destroy(xprt);
+}
+
+/**
+ * xprt_put - release a reference to an RPC transport.
+ * @xprt: pointer to the transport
+ *
+ */
+void xprt_put(struct rpc_xprt *xprt)
+{
+	if (atomic_dec_and_test(&xprt->count))
+		xprt_destroy(xprt);	/* count reached zero */
+}
+
+/**
+ * xprt_get - return a reference to an RPC transport.
+ * @xprt: pointer to the transport
+ *
+ * Returns @xprt with its count raised, or NULL if the count was
+ * already zero.
+ */
+struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
+{
+	return atomic_inc_not_zero(&xprt->count) ? xprt : NULL;
+}
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 00000000..5a8f268b
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
+
+xprtrdma-y := transport.o rpc_rdma.o verbs.o
+
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
+
+svcrdma-y := svc_rdma.o svc_rdma_transport.o \
+	svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 00000000..554d0814
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,882 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
+ */
+
+#include "xprt_rdma.h"
+
+#include <linux/highmem.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,	/* no chunks: pure inline */
+	rpcrdma_readch,		/* some argument via rdma read */
+	rpcrdma_areadch,	/* entire request via rdma read */
+	rpcrdma_writech,	/* some result via rdma write */
+	rpcrdma_replych		/* entire reply via rdma write */
+};
+
+#ifdef RPC_DEBUG
+static const char transfertypes[][12] = { /* by enum rpcrdma_chunktype */
+	"pure inline",	/* no chunks */
+	" read chunk",	/* some argument via rdma read */
+	"*read chunk",	/* entire request via rdma read */
+	"write chunk",	/* some result via rdma write */
+	"reply chunk"	/* entire reply via rdma write */
+};
+#endif
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Note, this routine is never called if the connection's memory
+ * registration strategy is 0 (bounce buffers).
+ *
+ * Returns the number of seg[] entries used; 0 if they did not suffice.
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+	struct page **ppages;
+	int page_base;
+	int left, used = 0;
+
+	/* The head is mapped only when the chunk starts at position zero */
+	if (pos == 0 && xdrbuf->head[0].iov_len) {
+		seg[used].mr_page = NULL;
+		seg[used].mr_offset = xdrbuf->head[0].iov_base;
+		seg[used].mr_len = xdrbuf->head[0].iov_len;
+		used++;
+	}
+
+	/* One segment per page; only the first may start mid-page */
+	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
+	page_base = xdrbuf->page_base & ~PAGE_MASK;
+	for (left = xdrbuf->page_len; left && used < nsegs; used++) {
+		seg[used].mr_page = *ppages++;
+		seg[used].mr_offset = (void *)(unsigned long) page_base;
+		seg[used].mr_len = min_t(u32, PAGE_SIZE - page_base, left);
+		BUG_ON(seg[used].mr_len > PAGE_SIZE);
+		left -= seg[used].mr_len;
+		page_base = 0;
+	}
+
+	/* Page data is left over, but the seg array is full */
+	if (left && used == nsegs)
+		return 0;
+
+	if (xdrbuf->tail[0].iov_len == 0)
+		return used;
+	/*
+	 * The rpcrdma protocol allows us to omit any trailing xdr pad
+	 * bytes, saving the server an RDMA operation.
+	 */
+	if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+		return used;
+	/* Tail remains, but we're out of segments */
+	if (used == nsegs)
+		return 0;
+	seg[used].mr_page = NULL;
+	seg[used].mr_offset = xdrbuf->tail[0].iov_base;
+	seg[used].mr_len = xdrbuf->tail[0].iov_len;
+	return used + 1;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ *   Assume check against THRESHOLD has been done, and chunks are required.
+ *   Assume only encoding one list entry for read|write chunks. The NFSv3
+ *     protocol is simple enough to allow this as it only has a single "bulk
+ *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ */
+
+static unsigned int
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt);
+	int nsegs, nchunks = 0;
+	unsigned int pos;
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_read_chunk *cur_rchunk = NULL;
+	struct rpcrdma_write_array *warray = NULL;
+	struct rpcrdma_write_chunk *cur_wchunk = NULL;
+	__be32 *iptr = headerp->rm_body.rm_chunks;
+
+	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+		/* a read chunk - server will RDMA Read our memory */
+		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+	} else {
+		/* a write or reply chunk - server will RDMA Write our memory */
+		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
+		if (type == rpcrdma_replych)
+			*iptr++ = xdr_zero;	/* a NULL write chunk list */
+		warray = (struct rpcrdma_write_array *) iptr;
+		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+	}
+
+	if (type == rpcrdma_replych || type == rpcrdma_areadch)
+		pos = 0;	/* whole message: the head is mapped as well */
+	else
+		pos = target->head[0].iov_len;
+
+	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+	if (nsegs == 0)
+		return 0;	/* nothing mapped, or too many segments */
+
+	do {
+		/* bind/register the memory, then build chunk from result. */
+		int n = rpcrdma_register_external(seg, nsegs,
+						cur_wchunk != NULL, r_xprt);
+		if (n <= 0)
+			goto out;	/* registration failed: unwind below */
+		if (cur_rchunk) {	/* read */
+			cur_rchunk->rc_discrim = xdr_one;
+			/* all read chunks have the same "position" */
+			cur_rchunk->rc_position = htonl(pos);
+			cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(__be32 *)&cur_rchunk->rc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: read chunk "
+				"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
+				seg->mr_len, (unsigned long long)seg->mr_base,
+				seg->mr_rkey, pos, n < nsegs ? "more" : "last");
+			cur_rchunk++;
+			r_xprt->rx_stats.read_chunk_count++;
+		} else {		/* write/reply */
+			cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(__be32 *)&cur_wchunk->wc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: %s chunk "
+				"elem %d@0x%llx:0x%x (%s)\n", __func__,
+				(type == rpcrdma_replych) ? "reply" : "write",
+				seg->mr_len, (unsigned long long)seg->mr_base,
+				seg->mr_rkey, n < nsegs ? "more" : "last");
+			cur_wchunk++;
+			if (type == rpcrdma_replych)
+				r_xprt->rx_stats.reply_chunk_count++;
+			else
+				r_xprt->rx_stats.write_chunk_count++;
+			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		}
+		nchunks++;
+		seg   += n;	/* n: presumably segments covered by this chunk */
+		nsegs -= n;
+	} while (nsegs);
+
+	/* success. all failures return above */
+	req->rl_nchunks = nchunks;
+
+	BUG_ON(nchunks == 0);
+	BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+	       && (nchunks > 3));
+
+	/*
+	 * finish off header. If write, marshal discrim and nchunks.
+	 */
+	if (cur_rchunk) {
+		iptr = (__be32 *) cur_rchunk;
+		*iptr++ = xdr_zero;	/* finish the read chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
+	} else {
+		warray->wc_discrim = xdr_one;
+		warray->wc_nchunks = htonl(nchunks);
+		iptr = (__be32 *) cur_wchunk;
+		if (type == rpcrdma_writech) {
+			*iptr++ = xdr_zero; /* finish the write chunk list */
+			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
+		}
+	}
+
+	/*
+	 * Return header size.
+	 */
+	return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+	for (pos = 0; nchunks--;)	/* one deregistration per chunk built */
+		pos += rpcrdma_deregister_external(
+				&req->rl_segments[pos], r_xprt, NULL);
+	return 0;	/* rpcrdma_marshal_req() treats 0 as failure */
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+	int i, npages, curlen;
+	int copy_len;
+	unsigned char *srcp, *destp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	int page_base;
+	struct page **ppages;
+
+	destp = rqst->rq_svec[0].iov_base;
+	curlen = rqst->rq_svec[0].iov_len;
+	destp += curlen;	/* first byte after the data already in svec[0] */
+	/*
+	 * Do optional padding where it makes sense. Alignment of write
+	 * payload can help the server, if our setting is accurate.
+	 */
+	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+		pad = 0;	/* don't pad this request */
+
+	dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
+		__func__, pad, destp, rqst->rq_slen, curlen);
+
+	copy_len = rqst->rq_snd_buf.page_len;	/* bytes held in the page list */
+
+	if (rqst->rq_snd_buf.tail[0].iov_len) {
+		curlen = rqst->rq_snd_buf.tail[0].iov_len;
+		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
+			memmove(destp + copy_len,
+				rqst->rq_snd_buf.tail[0].iov_base, curlen);
+			r_xprt->rx_stats.pullup_copy_count += curlen;
+		}
+		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
+			__func__, destp + copy_len, curlen);
+		rqst->rq_svec[0].iov_len += curlen;
+	}
+	r_xprt->rx_stats.pullup_copy_count += copy_len;
+
+	page_base = rqst->rq_snd_buf.page_base;
+	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
+	page_base &= ~PAGE_MASK;	/* offset within the first page */
+	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
+	for (i = 0; copy_len && i < npages; i++) {
+		curlen = PAGE_SIZE - page_base;
+		if (curlen > copy_len)
+			curlen = copy_len;
+		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
+			__func__, i, destp, copy_len, curlen);
+		srcp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
+		memcpy(destp, srcp+page_base, curlen);
+		kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA);
+		rqst->rq_svec[0].iov_len += curlen;
+		destp += curlen;
+		copy_len -= curlen;
+		page_base = 0;	/* only the first page starts at an offset */
+	}
+	/* header now contains entire send message */
+	return pad;	/* 0 when this request is not padded */
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ *  [0] -- RPC RDMA header, which uses memory from the *start* of the
+ *         preregistered buffer that already holds the RPC data in
+ *         its middle.
+ *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ *  [2] -- optional padding.
+ *  [3] -- if padded, header only in [1] and data here.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_task->tk_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	char *base;
+	size_t hdrlen, rpclen, padlen;
+	enum rpcrdma_chunktype rtype, wtype;
+	struct rpcrdma_msg *headerp;
+
+	/*
+	 * rpclen gets amount of data in first buffer, which is the
+	 * pre-registered buffer.
+	 */
+	base = rqst->rq_svec[0].iov_base;
+	rpclen = rqst->rq_svec[0].iov_len;
+
+	/* build RDMA header in private area at front */
+	headerp = (struct rpcrdma_msg *) req->rl_base;
+	/* don't htonl XID, it's already done in request */
+	headerp->rm_xid = rqst->rq_xid;
+	headerp->rm_vers = xdr_one;
+	headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
+	headerp->rm_type = htonl(RDMA_MSG);
+
+	/*
+	 * Chunks needed for results?
+	 *
+	 * o If the expected result is under the inline threshold, all ops
+	 *   return as inline (but see later).
+	 * o Large non-read ops return as a single reply chunk.
+	 * o Large read ops return data as write chunk(s), header as inline.
+	 *
+	 * Note: the NFS code sending down multiple result segments implies
+	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+	 */
+
+	/*
+	 * This code can handle read chunks, write chunks OR reply
+	 * chunks -- only one type. If the request is too big to fit
+	 * inline, then we will choose read chunks. If the request is
+	 * a READ, then use write chunks to separate the file data
+	 * into pages; otherwise use reply chunks.
+	 */
+	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+		wtype = rpcrdma_noch;
+	else if (rqst->rq_rcv_buf.page_len == 0)
+		wtype = rpcrdma_replych;
+	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+		wtype = rpcrdma_writech;
+	else
+		wtype = rpcrdma_replych;
+
+	/*
+	 * Chunks needed for arguments?
+	 *
+	 * o If the total request is under the inline threshold, all ops
+	 *   are sent as inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
+	 * o Large write ops transmit data as read chunk(s), header as
+	 *   inline.
+	 *
+	 * Note: the NFS code sending down multiple argument segments
+	 * implies the op is a write.
+	 * TBD check NFSv4 setacl
+	 */
+	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+		rtype = rpcrdma_noch;
+	else if (rqst->rq_snd_buf.page_len == 0)
+		rtype = rpcrdma_areadch;
+	else
+		rtype = rpcrdma_readch;
+
+	/* The following simplification is not true forever */
+	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+		wtype = rpcrdma_noch;
+	BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+
+	if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+	    (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+		/* forced to "pure inline"? */
+		dprintk("RPC:       %s: too much data (%d/%d) for inline\n",
+			__func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+		return -1;
+	}
+
+	hdrlen = 28; /*sizeof *headerp;*/
+	padlen = 0;
+
+	/*
+	 * Pull up any extra send data into the preregistered buffer.
+	 * When padding is in use and applies to the transfer, insert
+	 * it and change the message type.
+	 */
+	if (rtype == rpcrdma_noch) {
+
+		padlen = rpcrdma_inline_pullup(rqst,
+						RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+		if (padlen) {
+			headerp->rm_type = htonl(RDMA_MSGP);
+			headerp->rm_body.rm_padded.rm_align =
+				htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+			headerp->rm_body.rm_padded.rm_thresh =
+				htonl(RPCRDMA_INLINE_PAD_THRESH);
+			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+			BUG_ON(wtype != rpcrdma_noch);
+
+		} else {
+			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+			/* new length after pullup */
+			rpclen = rqst->rq_svec[0].iov_len;
+			/*
+			 * Currently we try to not actually use read inline.
+			 * Reply chunks have the desirable property that
+			 * they land, packed, directly in the target buffers
+			 * without headers, so they require no fixup. The
+			 * additional RDMA Write op sends the same amount
+			 * of data, streams on-the-wire and adds no overhead
+			 * on receive. Therefore, we request a reply chunk
+			 * for non-writes wherever feasible and efficient.
+			 */
+			if (wtype == rpcrdma_noch &&
+			    r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+				wtype = rpcrdma_replych;
+		}
+	}
+
+	/*
+	 * Marshal chunks. This routine will return the header length
+	 * consumed by marshaling.
+	 */
+	if (rtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_snd_buf, headerp, rtype);
+		wtype = rtype;	/* simplify dprintk */
+
+	} else if (wtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst,
+					&rqst->rq_rcv_buf, headerp, wtype);
+	}
+
+	if (hdrlen == 0)
+		return -1;
+
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+		" headerp 0x%p base 0x%p lkey 0x%x\n",
+		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+		headerp, base, req->rl_iov.lkey);
+
+	/*
+	 * initialize send_iov's - normally only two: rdma chunk header and
+	 * single preregistered RPC header buffer, but if padding is present,
+	 * then use a preregistered (and zeroed) pad buffer between the RPC
+	 * header and any write data. In all non-rdma cases, any following
+	 * data has been copied into the RPC header buffer.
+	 */
+	req->rl_send_iov[0].addr = req->rl_iov.addr;
+	req->rl_send_iov[0].length = hdrlen;
+	req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+
+	req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+	req->rl_send_iov[1].length = rpclen;
+	req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+
+	req->rl_niovs = 2;
+
+	if (padlen) {
+		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+		req->rl_send_iov[2].addr = ep->rep_pad.addr;
+		req->rl_send_iov[2].length = padlen;
+		req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+
+		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+		req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+
+		req->rl_niovs = 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk a write or reply chunk array received from the server and add
+ * up the segment lengths it advertises.
+ * See map at rpcrdma_create_chunks()! :-)
+ *
+ * On entry *iptrp points at the array's segment count, which must not
+ * exceed max. When wrchunk is non-zero the array has to be followed by
+ * an xdr_zero word, which is consumed as well. On success *iptrp is
+ * moved past everything that was parsed and the summed length is
+ * returned. -1 is returned, with *iptrp untouched, when the count is
+ * too large, the terminating word is not zero, or the parsed region
+ * ends beyond rr_base + rr_len.
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+{
+	unsigned int nsegs, bytes = 0;
+	struct rpcrdma_write_chunk *chunk;
+	__be32 *next;
+
+	nsegs = ntohl(**iptrp);	/* get array count */
+	if (nsegs > max)
+		return -1;
+
+	chunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+	for (; nsegs != 0; nsegs--, chunk++) {
+		struct rpcrdma_segment *seg = &chunk->wc_target;
+		ifdebug(FACILITY) {
+			u64 off;
+			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
+			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+				__func__,
+				ntohl(seg->rs_length),
+				(unsigned long long)off,
+				ntohl(seg->rs_handle));
+		}
+		bytes += ntohl(seg->rs_length);
+	}
+
+	next = (__be32 *) chunk;
+	if (wrchunk) {
+		/* a write chunk array must be closed by a zero word */
+		if (*next != xdr_zero)
+			return -1;
+		next++;
+	}
+
+	/* everything parsed has to lie inside the received data */
+	if ((char *) next > rep->rr_base + rep->rr_len)
+		return -1;
+
+	*iptrp = next;
+	return bytes;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ *
+ * srcp/copy_len describe the inline payload. The head iovec of
+ * rq_rcv_buf is pointed straight at srcp (nothing is copied for it)
+ * and its length is clamped to copy_len. The remainder is copied
+ * into the receive pages and then into the tail iovec, and the page
+ * and tail lengths are set to the amounts copied. Finally pad zero
+ * bytes are appended to the tail. Bytes that fit nowhere are only
+ * reported through dprintk.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+{
+	int i, npages, curlen, olen;
+	char *destp;
+	struct page **ppages;
+	int page_base;
+
+	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	if (curlen > copy_len) {	/* write chunk header fixup */
+		curlen = copy_len;
+		rqst->rq_rcv_buf.head[0].iov_len = curlen;
+	}
+
+	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
+		__func__, srcp, copy_len, curlen);
+
+	/* Shift pointer for first receive segment only */
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
+	srcp += curlen;
+	copy_len -= curlen;
+
+	olen = copy_len;	/* bytes still to place after the head */
+	i = 0;
+	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+	page_base = rqst->rq_rcv_buf.page_base;
+	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
+	page_base &= ~PAGE_MASK;	/* offset within the first page */
+
+	if (copy_len && rqst->rq_rcv_buf.page_len) {
+		npages = PAGE_ALIGN(page_base +
+			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+		for (; i < npages; i++) {
+			curlen = PAGE_SIZE - page_base;
+			if (curlen > copy_len)
+				curlen = copy_len;
+			dprintk("RPC:       %s: page %d"
+				" srcp 0x%p len %d curlen %d\n",
+				__func__, i, srcp, copy_len, curlen);
+			destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA);
+			memcpy(destp + page_base, srcp, curlen);
+			flush_dcache_page(ppages[i]);
+			kunmap_atomic(destp, KM_SKB_SUNRPC_DATA);
+			srcp += curlen;
+			copy_len -= curlen;
+			if (copy_len == 0)
+				break;
+			page_base = 0;	/* later pages are filled from their start */
+		}
+		rqst->rq_rcv_buf.page_len = olen - copy_len;	/* bytes copied into pages */
+	} else
+		rqst->rq_rcv_buf.page_len = 0;
+
+	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+		curlen = copy_len;
+		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
+			__func__, srcp, copy_len, curlen);
+		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+		copy_len -= curlen; ++i;
+	} else
+		rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+	if (pad) {
+		/* implicit padding on terminal chunk: zero bytes are
+		 * written directly after the current end of the tail and
+		 * the tail length grows by one for each of them */
+		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+		while (pad--)
+			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+	}
+
+	if (copy_len)
+		dprintk("RPC:       %s: %d bytes in"
+			" %d extra segments (%d lost)\n",
+			__func__, olen, i, copy_len);
+
+	/* TBD avoid a warning from call_decode() */
+	rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+/*
+ * Connection state change callback: invoked when an async event
+ * posted to the connection changes its state. Under transport_lock
+ * it advances the connect cookie and brings the xprt's connected
+ * flag in line with ep->rep_connected, waking pending tasks when the
+ * flag actually flips. The rpc timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+	struct rpc_xprt *xprt = ep->rep_xprt;
+
+	spin_lock_bh(&xprt->transport_lock);
+
+	/* zero is a reserved cookie value, step over it on wrap-around */
+	xprt->connect_cookie++;
+	if (xprt->connect_cookie == 0)
+		xprt->connect_cookie++;
+
+	if (ep->rep_connected <= 0) {
+		if (xprt_test_and_clear_connected(xprt))
+			xprt_wake_pending_tasks(xprt, -ENOTCONN);
+	} else if (!xprt_test_and_set_connected(xprt)) {
+		xprt_wake_pending_tasks(xprt, 0);
+	}
+
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when memory window unbind which we are waiting
+ * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ *
+ * The body itself only wakes up whoever sleeps on rep->rr_unbind.
+ */
+static void
+rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+{
+	wake_up(&rep->rr_unbind);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ *
+ * Outline of the body:
+ *  - rr_len == ~0U: the rep goes back to the pool and, if the endpoint
+ *    was connected (rep_connected == 1), it is set to -EIO and
+ *    rpcrdma_conn_func() is called.
+ *  - a reply shorter than 28 bytes, with a version other than xdr_one,
+ *    or whose XID matches no request is counted as a bad reply and
+ *    the buffer is posted for receive again.
+ *  - otherwise the header is checked according to rm_type; RDMA_MSG
+ *    replies are passed to rpcrdma_inline_fixup(), RDMA_NOMSG replies
+ *    need no fixup, anything else yields status -EIO.
+ *  - for the two memory window strategies the request's segments are
+ *    deregistered, then xprt_complete_rqst() is called with status.
+ * xprt->transport_lock is taken before the XID lookup and released
+ * only after xprt_complete_rqst().
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_msg *headerp;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	struct rpc_xprt *xprt = rep->rr_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	__be32 *iptr;
+	int i, rdmalen, status;
+
+	/* Check status. If bad, signal disconnect and return rep to pool */
+	if (rep->rr_len == ~0U) {
+		rpcrdma_recv_buffer_put(rep);
+		if (r_xprt->rx_ep.rep_connected == 1) {
+			r_xprt->rx_ep.rep_connected = -EIO;
+			rpcrdma_conn_func(&r_xprt->rx_ep);
+		}
+		return;
+	}
+	if (rep->rr_len < 28) {	/* 28: see the "sizeof *headerp" notes further down */
+		dprintk("RPC:       %s: short/invalid reply\n", __func__);
+		goto repost;
+	}
+	headerp = (struct rpcrdma_msg *) rep->rr_base;
+	if (headerp->rm_vers != xdr_one) {
+		dprintk("RPC:       %s: invalid version %d\n",
+			__func__, ntohl(headerp->rm_vers));
+		goto repost;
+	}
+
+	/* Get XID and try for a match. */
+	spin_lock(&xprt->transport_lock);
+	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+	if (rqst == NULL) {
+		spin_unlock(&xprt->transport_lock);
+		dprintk("RPC:       %s: reply 0x%p failed "
+			"to match any request xid 0x%08x len %d\n",
+			__func__, rep, headerp->rm_xid, rep->rr_len);
+repost:	/* also reached by goto from the checks above, before the lock is taken */
+		r_xprt->rx_stats.bad_reply_count++;
+		rep->rr_func = rpcrdma_reply_handler;
+		if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+			rpcrdma_recv_buffer_put(rep);
+
+		return;
+	}
+
+	/* get request object */
+	req = rpcr_to_rdmar(rqst);
+
+	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
+		"                   RPC request 0x%p xid 0x%08x\n",
+			__func__, rep, req, rqst, headerp->rm_xid);
+
+	BUG_ON(!req || req->rl_reply);
+
+	/* from here on, the reply is no longer an orphan */
+	req->rl_reply = rep;
+
+	/* check for expected message types */
+	/* The order of some of these tests is important. */
+	switch (headerp->rm_type) {
+	case htonl(RDMA_MSG):
+		/* never expect read chunks */
+		/* never expect reply chunks (two ways to check) */
+		/* never expect write chunks without having offered RDMA */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+		     req->rl_nchunks == 0))
+			goto badheader;
+		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+			/* count any expected write chunks in read reply */
+			/* start at write chunk array count */
+			iptr = &headerp->rm_body.rm_chunks[2];
+			rdmalen = rpcrdma_count_chunks(rep,
+						req->rl_nchunks, 1, &iptr);
+			/* check for validity, and no reply chunk after */
+			if (rdmalen < 0 || *iptr++ != xdr_zero)
+				goto badheader;
+			rep->rr_len -=
+			    ((unsigned char *)iptr - (unsigned char *)headerp);
+			status = rep->rr_len + rdmalen;
+			r_xprt->rx_stats.total_rdma_reply += rdmalen;
+			/* special case - last chunk may omit padding:
+			 * rdmalen is replaced by the number of bytes that
+			 * round the RDMA'd length up to a multiple of 4
+			 * (left at 0 when already aligned); that value is
+			 * added to status and passed on as the pad count */
+			if (rdmalen &= 3) {
+				rdmalen = 4 - rdmalen;
+				status += rdmalen;
+			}
+		} else {
+			/* else ordinary inline */
+			rdmalen = 0;
+			iptr = (__be32 *)((unsigned char *)headerp + 28);
+			rep->rr_len -= 28; /*sizeof *headerp;*/
+			status = rep->rr_len;
+		}
+		/* Fix up the rpc results for upper layer */
+		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+		break;
+
+	case htonl(RDMA_NOMSG):
+		/* never expect read or write chunks, always reply chunks */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[2] != xdr_one ||
+		    req->rl_nchunks == 0)
+			goto badheader;
+		iptr = (__be32 *)((unsigned char *)headerp + 28);
+		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+		if (rdmalen < 0)
+			goto badheader;
+		r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		/* Reply chunk buffer already is the reply vector - no fixup. */
+		status = rdmalen;
+		break;
+
+badheader:
+	default:
+		dprintk("%s: invalid rpcrdma reply header (type %d):"
+				" chunks[012] == %d %d %d"
+				" expected chunks <= %d\n",
+				__func__, ntohl(headerp->rm_type),
+				headerp->rm_body.rm_chunks[0],
+				headerp->rm_body.rm_chunks[1],
+				headerp->rm_body.rm_chunks[2],
+				req->rl_nchunks);
+		status = -EIO;
+		r_xprt->rx_stats.bad_reply_count++;
+		break;
+	}
+
+	/* If using mw bind, start the deregister process now. */
+	/* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+	if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS:
+		for (i = 0; req->rl_nchunks-- > 1;)
+			i += rpcrdma_deregister_external(
+				&req->rl_segments[i], r_xprt, NULL);
+		/* Optionally wait (not here) for unbinds to complete:
+		 * only the last chunk is deregistered with rep attached,
+		 * after rr_func has been switched to the unbind callback */
+		rep->rr_func = rpcrdma_unbind_func;
+		(void) rpcrdma_deregister_external(&req->rl_segments[i],
+						   r_xprt, rep);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		for (i = 0; req->rl_nchunks--;)
+			i += rpcrdma_deregister_external(&req->rl_segments[i],
+							 r_xprt, NULL);
+		break;
+	default:
+		break;
+	}
+
+	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+			__func__, xprt, rqst, status);
+	xprt_complete_rqst(rqst->rq_task, status);
+	spin_unlock(&xprt->transport_lock);
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 00000000..09af4fab
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/*
+ * RPC/RDMA parameters
+ *
+ * Each tunable is followed by the lower and upper bound that the
+ * sysctl table in this file hands to proc_dointvec_minmax through
+ * extra1/extra2.
+ */
+unsigned int svcrdma_ord = RPCRDMA_ORD;
+static unsigned int min_ord = 1;
+static unsigned int max_ord = 4096;
+unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+static unsigned int min_max_requests = 4;
+static unsigned int max_max_requests = 16384;
+unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
+static unsigned int min_max_inline = 4096;
+static unsigned int max_max_inline = 65536;
+
+atomic_t rdma_stat_recv;	/* rdma_stat_*: counters that the sysctl
+				 * entries using read_reset_stat() report
+				 * on read and zero on write */
+atomic_t rdma_stat_read;
+atomic_t rdma_stat_write;
+atomic_t rdma_stat_sq_starve;
+atomic_t rdma_stat_rq_starve;
+atomic_t rdma_stat_rq_poll;
+atomic_t rdma_stat_rq_prod;
+atomic_t rdma_stat_sq_poll;
+atomic_t rdma_stat_sq_prod;
+
+/* Temporary NFS request map and context caches
+ * (created in svc_rdma_init(), destroyed in svc_rdma_cleanup()) */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct workqueue_struct *svc_rdma_wq;	/* "svc_rdma" workqueue allocated in svc_rdma_init() */
+
+/*
+ * This function implements reading and resetting an atomic_t stat
+ * variable through read/write to a proc file. Any write to the file
+ * resets the associated statistic to zero. Any read returns its
+ * current value.
+ *
+ * The value is presented as a decimal number followed by a newline.
+ * Reads honour *ppos: a reader that fetches the text in several
+ * pieces gets consecutive pieces, and once *ppos lies beyond the end
+ * of the text *lenp is set to 0. On a successful read *lenp holds the
+ * number of bytes copied and *ppos is advanced by the same amount.
+ *
+ * Returns 0 on success, -EINVAL when the table entry has no data
+ * pointer, and -EFAULT when the text does not fit the local buffer or
+ * cannot be copied to user space.
+ */
+static int read_reset_stat(ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp,
+			   loff_t *ppos)
+{
+	atomic_t *stat = (atomic_t *)table->data;
+
+	if (!stat)
+		return -EINVAL;
+
+	if (write)
+		atomic_set(stat, 0);
+	else {
+		char str_buf[32];
+		char *data;
+		int len = snprintf(str_buf, sizeof(str_buf), "%d\n",
+				   atomic_read(stat));
+		if (len >= (int)sizeof(str_buf))
+			return -EFAULT;
+		len = strlen(str_buf);
+		if (*ppos > len) {
+			*lenp = 0;
+			return 0;
+		}
+		/*
+		 * Continue from the caller's file position. Copying from
+		 * the start of str_buf here would hand the beginning of
+		 * the text to a reader that resumes at a non-zero offset.
+		 */
+		data = &str_buf[*ppos];
+		len -= *ppos;
+		if (len > *lenp)
+			len = *lenp;
+		if (len && copy_to_user(buffer, data, len))
+			return -EFAULT;
+		*lenp = len;
+		*ppos += len;
+	}
+	return 0;
+}
+
+static struct ctl_table_header *svcrdma_table_header;	/* set in svc_rdma_init(), cleared in svc_rdma_cleanup() */
+static ctl_table svcrdma_parm_table[] = {	/*
+	 * Leaf entries of the "svc_rdma" sysctl directory: three integer
+	 * tunables range-checked by proc_dointvec_minmax against their
+	 * extra1/extra2 bounds, followed by one entry per rdma_stat_*
+	 * counter served by read_reset_stat(). The table ends with an
+	 * empty entry.
+	 */
+	{
+		.procname	= "max_requests",
+		.data		= &svcrdma_max_requests,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_requests,
+		.extra2		= &max_max_requests
+	},
+	{
+		.procname	= "max_req_size",
+		.data		= &svcrdma_max_req_size,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_inline,
+		.extra2		= &max_max_inline
+	},
+	{
+		.procname	= "max_outbound_read_requests",
+		.data		= &svcrdma_ord,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ord,
+		.extra2		= &max_ord,
+	},
+
+	{
+		.procname	= "rdma_stat_read",
+		.data		= &rdma_stat_read,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_recv",
+		.data		= &rdma_stat_recv,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_write",
+		.data		= &rdma_stat_write,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_starve",
+		.data		= &rdma_stat_sq_starve,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_starve",
+		.data		= &rdma_stat_rq_starve,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_poll",
+		.data		= &rdma_stat_rq_poll,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_prod",
+		.data		= &rdma_stat_rq_prod,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_poll",
+		.data		= &rdma_stat_sq_poll,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_prod",
+		.data		= &rdma_stat_sq_prod,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{ },
+};
+
+static ctl_table svcrdma_table[] = {	/* "svc_rdma" directory whose children are svcrdma_parm_table */
+	{
+		.procname	= "svc_rdma",
+		.mode		= 0555,
+		.child		= svcrdma_parm_table
+	},
+	{ },
+};
+
+static ctl_table svcrdma_root_table[] = {	/* top level "sunrpc" directory; this is the table svc_rdma_init() registers */
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= svcrdma_table
+	},
+	{ },
+};
+
+/*
+ * Module exit: release what svc_rdma_init() set up - the workqueue,
+ * the sysctl table (if one is registered), the transport class
+ * registration and the two slab caches.
+ */
+void svc_rdma_cleanup(void)
+{
+	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+
+	destroy_workqueue(svc_rdma_wq);
+
+	/* forget the header so a stale handle is never unregistered twice */
+	if (svcrdma_table_header != NULL) {
+		unregister_sysctl_table(svcrdma_table_header);
+		svcrdma_table_header = NULL;
+	}
+
+	svc_unreg_xprt_class(&svc_rdma_class);
+
+	kmem_cache_destroy(svc_rdma_map_cachep);
+	kmem_cache_destroy(svc_rdma_ctxt_cachep);
+}
+
+/*
+ * Module init: allocate the "svc_rdma" workqueue, register the sysctl
+ * table (unless a header is already held), create the request map and
+ * WR context slab caches and register the RDMA transport class.
+ *
+ * Returns 0 on success or -ENOMEM when the workqueue or one of the
+ * caches cannot be created; in the cache failure cases everything set
+ * up before the failure is released again.
+ */
+int svc_rdma_init(void)
+{
+	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+	dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
+	dprintk("\tmax_requests     : %d\n", svcrdma_max_requests);
+	dprintk("\tsq_depth         : %d\n",
+		svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
+
+	svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
+	if (svc_rdma_wq == NULL)
+		return -ENOMEM;
+
+	if (svcrdma_table_header == NULL)
+		svcrdma_table_header =
+			register_sysctl_table(svcrdma_root_table);
+
+	/* Create the temporary map cache */
+	svc_rdma_map_cachep =
+		kmem_cache_create("svc_rdma_map_cache",
+				  sizeof(struct svc_rdma_req_map),
+				  0, SLAB_HWCACHE_ALIGN, NULL);
+	if (svc_rdma_map_cachep == NULL) {
+		printk(KERN_INFO "Could not allocate map cache.\n");
+		goto out_unregister;
+	}
+
+	/* Create the temporary context cache */
+	svc_rdma_ctxt_cachep =
+		kmem_cache_create("svc_rdma_ctxt_cache",
+				  sizeof(struct svc_rdma_op_ctxt),
+				  0, SLAB_HWCACHE_ALIGN, NULL);
+	if (svc_rdma_ctxt_cachep == NULL) {
+		printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+		goto out_destroy_map_cache;
+	}
+
+	/* Register RDMA with the SVC transport switch */
+	svc_reg_xprt_class(&svc_rdma_class);
+	return 0;
+
+out_destroy_map_cache:
+	kmem_cache_destroy(svc_rdma_map_cachep);
+out_unregister:
+	unregister_sysctl_table(svcrdma_table_header);
+	destroy_workqueue(svc_rdma_wq);
+	return -ENOMEM;
+}
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("SVC RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+module_init(svc_rdma_init);
+module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 00000000..9530ef2d
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <asm/unaligned.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/*
+ * Convert a read chunk list to host byte order in place and find
+ * where it ends. The expected format is as follows:
+ *    discrim  : xdr_one
+ *    position : u32 offset into XDR stream
+ *    handle   : u32 RKEY
+ *    . . .
+ *  end-of-list: xdr_zero
+ *
+ * Returns the address of the word after the xdr_zero end-of-list
+ * marker, or NULL when a chunk would extend beyond vaend.
+ */
+static u32 *decode_read_list(u32 *va, u32 *vaend)
+{
+	struct rpcrdma_read_chunk *ch;
+
+	for (ch = (struct rpcrdma_read_chunk *)va;
+	     ch->rc_discrim != xdr_zero; ch++) {
+		u32 *offp;
+		u64 ch_offset;
+
+		/* the whole chunk has to fit in front of vaend */
+		if ((unsigned long)(ch + 1) > (unsigned long)vaend) {
+			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+			return NULL;
+		}
+
+		ch->rc_discrim = ntohl(ch->rc_discrim);
+		ch->rc_position = ntohl(ch->rc_position);
+		ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
+		ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
+
+		/* the 64-bit offset is rewritten in place, unaligned-safe */
+		offp = (u32 *)&ch->rc_target.rs_offset;
+		xdr_decode_hyper(offp, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)offp);
+	}
+
+	/* rc_position is the word right after the terminating discrim */
+	return (u32 *)&ch->rc_position;
+}
+
+/*
+ * Count the chunks in a read chunk list and add up their segment
+ * lengths. The list is walked until a zero discriminator is found;
+ * it has already been verified to fit within the RPCRDMA header.
+ * The results are stored through ch_count and byte_count.
+ */
+void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
+			       int *ch_count, int *byte_count)
+{
+	*byte_count = 0;
+	*ch_count = 0;
+
+	while (ch->rc_discrim != 0) {
+		*byte_count += ch->rc_target.rs_length;
+		(*ch_count)++;
+		ch++;
+	}
+}
+
+/*
+ * Decodes a write chunk list. The expected format is as follows:
+ *    discrim  : xdr_one
+ *    nchunks  : <count>
+ *       handle   : u32 RKEY              ---+
+ *       length   : u32 <len of segment>     |
+ *       offset   : remote va                + <count>
+ *       . . .                               |
+ *                                        ---+
+ *
+ * The array header and every chunk are converted to host byte order
+ * in place. Returns the address of the word following the write list
+ * (after the discriminator when no list is present, otherwise after
+ * the list terminator), or NULL when the array header or the
+ * advertised number of chunks does not fit in front of vaend.
+ */
+static u32 *decode_write_list(u32 *va, u32 *vaend)
+{
+	unsigned long start, end;
+	u32 nchunks, ch_no;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for not write-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	ary->wc_discrim = ntohl(ary->wc_discrim);
+	nchunks = ntohl(ary->wc_nchunks);
+	ary->wc_nchunks = nchunks;
+
+	/*
+	 * nchunks comes straight off the wire. Compare it with the
+	 * number of chunks that fit between the array and vaend rather
+	 * than computing start + nchunks * sizeof(chunk): that sum can
+	 * wrap around for a huge count and slip past the check. The
+	 * test above guarantees start <= end, so the subtraction cannot
+	 * underflow.
+	 */
+	start = (unsigned long)&ary->wc_array[0];
+	end = (unsigned long)vaend;
+	if (nchunks > (end - start) / sizeof(struct rpcrdma_write_chunk)) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, nchunks, vaend);
+		return NULL;
+	}
+	for (ch_no = 0; ch_no < nchunks; ch_no++) {
+		u64 ch_offset;
+
+		ary->wc_array[ch_no].wc_target.rs_handle =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+		ary->wc_array[ch_no].wc_target.rs_length =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+		va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+	}
+
+	/*
+	 * rs_length is the 2nd 4B field in wc_target and taking its
+	 * address skips the list terminator
+	 */
+	return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
+}
+
+/*
+ * Decodes a reply chunk array, which uses the same layout as a write
+ * array (discriminator, chunk count, then that many segments) but has
+ * no terminating word.
+ *
+ * The array header and every chunk are converted to host byte order
+ * in place. Returns the address of the word following the array
+ * (after the discriminator when no array is present), or NULL when
+ * the array header or the advertised number of chunks does not fit in
+ * front of vaend.
+ */
+static u32 *decode_reply_array(u32 *va, u32 *vaend)
+{
+	unsigned long start, end;
+	u32 nchunks, ch_no;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for no reply-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	ary->wc_discrim = ntohl(ary->wc_discrim);
+	nchunks = ntohl(ary->wc_nchunks);
+	ary->wc_nchunks = nchunks;
+
+	/*
+	 * nchunks comes straight off the wire. Compare it with the
+	 * number of chunks that fit between the array and vaend rather
+	 * than computing start + nchunks * sizeof(chunk): that sum can
+	 * wrap around for a huge count and slip past the check. The
+	 * test above guarantees start <= end, so the subtraction cannot
+	 * underflow.
+	 */
+	start = (unsigned long)&ary->wc_array[0];
+	end = (unsigned long)vaend;
+	if (nchunks > (end - start) / sizeof(struct rpcrdma_write_chunk)) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, nchunks, vaend);
+		return NULL;
+	}
+	for (ch_no = 0; ch_no < nchunks; ch_no++) {
+		u64 ch_offset;
+
+		ary->wc_array[ch_no].wc_target.rs_handle =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+		ary->wc_array[ch_no].wc_target.rs_length =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+		va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+	}
+
+	return (u32 *)&ary->wc_array[ch_no];
+}
+
+/*
+ * Decode the RPC/RDMA header at the start of rqstp->rq_arg in place
+ * and advance the head iovec past it so that it addresses the RPC
+ * message.
+ *
+ * *rdma_req is set to the header as soon as it has been located, so
+ * it is valid on every return path, including errors and the padded
+ * (RDMA_MSGP) case.
+ *
+ * Returns the header length in bytes on success, -EINVAL when the
+ * message is too short or a chunk list does not fit inside it, and
+ * -ENOSYS when the header carries a version other than
+ * RPCRDMA_VERSION.
+ */
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
+			    struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp = NULL;
+	u32 *va;
+	u32 *vaend;
+	u32 hdr_len;
+
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+	*rdma_req = rmsgp;
+
+	/* Verify that there's enough bytes for header + something */
+	if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+		dprintk("svcrdma: header too short = %d\n",
+			rqstp->rq_arg.len);
+		return -EINVAL;
+	}
+
+	/* Decode the header */
+	rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
+	rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
+	rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
+	rmsgp->rm_type = ntohl(rmsgp->rm_type);
+
+	if (rmsgp->rm_vers != RPCRDMA_VERSION)
+		return -ENOSYS;
+
+	/* Pull in the extra for the padded case and bump our pointer */
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		int hdrlen;
+		rmsgp->rm_body.rm_padded.rm_align =
+			ntohl(rmsgp->rm_body.rm_padded.rm_align);
+		rmsgp->rm_body.rm_padded.rm_thresh =
+			ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+		hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+		/* validate before touching the iovec so a rejected
+		 * message leaves rq_arg as it was */
+		if (hdrlen > rqstp->rq_arg.len)
+			return -EINVAL;
+		rqstp->rq_arg.head[0].iov_base = va;
+		rqstp->rq_arg.head[0].iov_len -= hdrlen;
+		return hdrlen;
+	}
+
+	/* The chunk list may contain either a read chunk list or a write
+	 * chunk list and a reply chunk list.
+	 */
+	va = &rmsgp->rm_body.rm_chunks[0];
+	vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	va = decode_read_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_write_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_reply_array(va, vaend);
+	if (!va)
+		return -EINVAL;
+
+	rqstp->rq_arg.head[0].iov_base = va;
+	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdr_len;
+
+	return hdr_len;
+}
+
+/*
+ * Find the RPC message behind an RPC/RDMA header that was decoded
+ * earlier: nothing is byte-swapped here, the chunk lists are merely
+ * stepped over. The head iovec of rq_arg is advanced past the header
+ * and the header length in bytes is returned.
+ */
+int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp;
+	struct rpcrdma_read_chunk *ch;
+	struct rpcrdma_write_array *ary;
+	u32 *va;
+	u32 hdrlen;
+
+	dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
+		rqstp);
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		/* Pull in the extra for the padded case */
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+	} else {
+		/*
+		 * Skip all chunks to find RPC msg. These were previously
+		 * processed
+		 */
+
+		/* Skip read-list */
+		ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+		while (ch->rc_discrim != xdr_zero)
+			ch++;
+		va = (u32 *)&ch->rc_position;
+
+		/*
+		 * Skip write-list. rs_length is the 2nd 4B field in
+		 * wc_target and taking its address skips the list
+		 * terminator
+		 */
+		ary = (struct rpcrdma_write_array *)va;
+		if (ary->wc_discrim != xdr_zero)
+			va = (u32 *)&ary->wc_array[ary->wc_nchunks].
+				wc_target.rs_length;
+		else
+			va = (u32 *)&ary->wc_nchunks;
+
+		/* Skip reply-array */
+		ary = (struct rpcrdma_write_array *)va;
+		if (ary->wc_discrim != xdr_zero)
+			va = (u32 *)&ary->wc_array[ary->wc_nchunks];
+		else
+			va = (u32 *)&ary->wc_nchunks;
+	}
+
+	/* bump the head iovec so that it starts at the RPC message */
+	rqstp->rq_arg.head[0].iov_base = va;
+	hdrlen = (unsigned long)va - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdrlen;
+
+	return hdrlen;
+}
+
+/*
+ * Write an RDMA_ERROR message at va: xid and version taken from
+ * rmsgp, the transport's sc_max_requests as credit value, the
+ * RDMA_ERROR type and the error code. For ERR_VERS two further words,
+ * both RPCRDMA_VERSION, are appended. All words are stored in network
+ * byte order. Returns the number of bytes written.
+ */
+int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
+			      struct rpcrdma_msg *rmsgp,
+			      enum rpcrdma_errcode err, u32 *va)
+{
+	u32 *p = va;
+
+	*p++ = htonl(rmsgp->rm_xid);
+	*p++ = htonl(rmsgp->rm_vers);
+	*p++ = htonl(xprt->sc_max_requests);
+	*p++ = htonl(RDMA_ERROR);
+	*p++ = htonl(err);
+
+	if (err == ERR_VERS) {
+		*p++ = htonl(RPCRDMA_VERSION);
+		*p++ = htonl(RPCRDMA_VERSION);
+	}
+
+	return (int)((unsigned long)p - (unsigned long)va);
+}
+
+/*
+ * Compute the length in bytes of a reply's RPC/RDMA header by
+ * stepping over its write list and reply array. The chunk counts are
+ * read in network byte order. There is no read-list in a reply, so
+ * the walk starts at rm_chunks[1].
+ */
+int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+{
+	struct rpcrdma_write_array *ary;
+	void *pos;
+
+	/* skip write list; &rs_length of the slot after the last chunk
+	 * is the word behind the list terminator */
+	ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+	if (ary->wc_discrim)
+		pos = &ary->wc_array[ntohl(ary->wc_nchunks)].
+			wc_target.rs_length;
+	else
+		pos = &ary->wc_nchunks;
+
+	/* skip reply array */
+	ary = (struct rpcrdma_write_array *)pos;
+	if (ary->wc_discrim)
+		pos = &ary->wc_array[ntohl(ary->wc_nchunks)];
+	else
+		pos = &ary->wc_nchunks;
+
+	return (unsigned long) pos - (unsigned long) rmsgp;
+}
+
+/*
+ * Lay out the framing of a write list holding the given number of
+ * chunks in a reply header: empty read list, write array
+ * discriminator and count, and - in the slot behind the last chunk -
+ * the write-list terminator followed by an empty reply-array
+ * discriminator. The chunk entries themselves are not written here.
+ */
+void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
+{
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+	struct rpcrdma_segment *after = &ary->wc_array[chunks].wc_target;
+
+	rmsgp->rm_body.rm_chunks[0] = xdr_zero;	/* no read-list */
+
+	ary->wc_discrim = xdr_one;		/* write-array discrim */
+	ary->wc_nchunks = htonl(chunks);
+
+	after->rs_handle = xdr_zero;		/* write-list terminator */
+	after->rs_length = xdr_zero;		/* reply-array discriminator */
+}
+
+/*
+ * Fill in the header of a reply array: the chunk count in network
+ * byte order and a discriminator of xdr_one.
+ */
+void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
+				 int chunks)
+{
+	ary->wc_nchunks = htonl(chunks);
+	ary->wc_discrim = xdr_one;
+}
+
+/*
+ * Store handle, length and 64-bit offset, all in network byte order,
+ * in entry chunk_no of a write/reply array.
+ */
+void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
+				     int chunk_no,
+				     u32 rs_handle, u64 rs_offset,
+				     u32 write_len)
+{
+	struct rpcrdma_write_chunk *chunk = &ary->wc_array[chunk_no];
+
+	chunk->wc_target.rs_handle = htonl(rs_handle);
+	chunk->wc_target.rs_length = htonl(write_len);
+	xdr_encode_hyper((u32 *) &chunk->wc_target.rs_offset, rs_offset);
+}
+
+/*
+ * Build the fixed part of a reply header in rdma_resp: xid and
+ * version are taken from the request rdma_argp, the credit value is
+ * the transport's sc_max_requests and the type is rdma_type, all in
+ * network byte order. The three chunk list words are set to xdr_zero.
+ */
+void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
+				  struct rpcrdma_msg *rdma_argp,
+				  struct rpcrdma_msg *rdma_resp,
+				  enum rpcrdma_proc rdma_type)
+{
+	int i;
+
+	rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
+	rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
+	rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
+	rdma_resp->rm_type = htonl(rdma_type);
+
+	/* Encode <nul> chunks lists */
+	for (i = 0; i < 3; i++)
+		rdma_resp->rm_body.rm_chunks[i] = xdr_zero;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 00000000..df67211c
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/*
+ * Replace the leading pages of rqstp->rq_pages with the pages from the SGE
+ * in the RDMA_RECV completion and describe the received bytes in rq_arg.
+ * The SGL should contain full pages up until the last one.
+ */
+static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+			       struct svc_rdma_op_ctxt *ctxt,
+			       u32 byte_count)
+{
+	struct page *page;
+	u32 bc;
+	int sge_no;
+
+	/* Swap the page in SGE 0 with the first rq_pages entry */
+	page = ctxt->pages[0];
+	put_page(rqstp->rq_pages[0]);
+	rqstp->rq_pages[0] = page;
+
+	/* Set up the XDR head: it covers at most the first SGE */
+	rqstp->rq_arg.head[0].iov_base = page_address(page);
+	rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
+	rqstp->rq_arg.len = byte_count;
+	rqstp->rq_arg.buflen = byte_count;
+
+	/* Compute bytes past head in the SGL */
+	bc = byte_count - rqstp->rq_arg.head[0].iov_len;
+
+	/* Whatever did not fit in the head is stored in the pagelist */
+	rqstp->rq_arg.page_len = bc;
+	rqstp->rq_arg.page_base = 0;
+	rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+	sge_no = 1;
+	while (bc && sge_no < ctxt->count) {
+		page = ctxt->pages[sge_no];
+		put_page(rqstp->rq_pages[sge_no]);
+		rqstp->rq_pages[sge_no] = page;
+		bc -= min(bc, ctxt->sge[sge_no].length);
+		rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
+		sge_no++;
+	}
+	rqstp->rq_respages = &rqstp->rq_pages[sge_no];	/* first page not swapped */
+
+	/* We should never run out of SGE because the limit is defined to
+	 * support the max allowed RPC data length
+	 */
+	BUG_ON(bc && (sge_no == ctxt->count));
+	BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
+	       != byte_count);
+	BUG_ON(rqstp->rq_arg.len != byte_count);
+
+	/* Put the SGL pages that were not used; bc just saves sge_no here */
+	bc = sge_no;
+	while (sge_no < ctxt->count) {
+		page = ctxt->pages[sge_no++];
+		put_page(page);
+	}
+	ctxt->count = bc;	/* number of pages actually used */
+
+	/* Set up an empty tail */
+	rqstp->rq_arg.tail[0].iov_base = NULL;
+	rqstp->rq_arg.tail[0].iov_len = 0;
+}
+
+/* Encode a read-chunk-list as an array of IB SGE
+ *
+ * Assumptions:
+ * - chunk[0]->position points to pages[0] at an offset of 0
+ * - pages[] is not physically or virtually contiguous and consists of
+ *   PAGE_SIZE elements.
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ *   chunk in the read list
+ *
+ */
+static int map_read_chunks(struct svcxprt_rdma *xprt,
+			   struct svc_rqst *rqstp,
+			   struct svc_rdma_op_ctxt *head,
+			   struct rpcrdma_msg *rmsgp,
+			   struct svc_rdma_req_map *rpl_map,
+			   struct svc_rdma_req_map *chl_map,
+			   int ch_count,
+			   int byte_count)
+{
+	int sge_no;		/* next entry to fill in rpl_map->sge[] */
+	int sge_bytes;		/* bytes described by the current SGE */
+	int page_off;		/* offset of the next byte in the current page */
+	int page_no;		/* index into rqstp->rq_arg.pages[] */
+	int ch_bytes;		/* bytes of the current chunk still unmapped */
+	int ch_no;		/* index of the current chunk in chl_map->ch[] */
+	struct rpcrdma_read_chunk *ch;
+
+	sge_no = 0;
+	page_no = 0;
+	page_off = 0;
+	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	ch_no = 0;
+	ch_bytes = ch->rc_target.rs_length;
+	head->arg.head[0] = rqstp->rq_arg.head[0];
+	head->arg.tail[0] = rqstp->rq_arg.tail[0];
+	head->arg.pages = &head->pages[head->count];
+	head->hdr_count = head->count; /* save count of hdr pages */
+	head->arg.page_base = 0;
+	head->arg.page_len = ch_bytes;
+	head->arg.len = rqstp->rq_arg.len + ch_bytes;
+	head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
+	head->count++;		/* account for the first data page */
+	chl_map->ch[0].start = 0;
+	while (byte_count) {
+		rpl_map->sge[sge_no].iov_base =
+			page_address(rqstp->rq_arg.pages[page_no]) + page_off;
+		sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
+		rpl_map->sge[sge_no].iov_len = sge_bytes;
+		/*
+		 * Don't bump head->count here because the same page
+		 * may be used by multiple SGE.
+		 */
+		head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+		rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+
+		byte_count -= sge_bytes;
+		ch_bytes -= sge_bytes;
+		sge_no++;
+		/*
+		 * If all bytes for this chunk have been mapped to an
+		 * SGE, record its SGE count and move to the next chunk
+		 */
+		if (ch_bytes == 0) {
+			chl_map->ch[ch_no].count =
+				sge_no - chl_map->ch[ch_no].start;
+			ch_no++;
+			ch++;
+			chl_map->ch[ch_no].start = sge_no;
+			ch_bytes = ch->rc_target.rs_length;
+			/* Only count the next chunk if bytes remain to be mapped */
+			if (byte_count) {
+				head->arg.page_len += ch_bytes;
+				head->arg.len += ch_bytes;
+				head->arg.buflen += ch_bytes;
+			}
+		}
+		/*
+		 * If this SGE consumed all of the page, move to the
+		 * next page
+		 */
+		if ((sge_bytes + page_off) == PAGE_SIZE) {
+			page_no++;
+			page_off = 0;
+			/*
+			 * If there are still bytes left to map, bump
+			 * the page count
+			 */
+			if (byte_count)
+				head->count++;
+		} else
+			page_off += sge_bytes;
+	}
+	BUG_ON(byte_count != 0);
+	return sge_no;		/* number of SGE filled in rpl_map */
+}
+
+/* Map a read-chunk-list to an XDR and fast register the page-list.
+ *
+ * Assumptions:
+ * - chunk[0]	position points to pages[0] at an offset of 0
+ * - pages[]	will be made physically contiguous by creating a one-off memory
+ *		region using the fastreg verb.
+ * - byte_count is # of bytes in read-chunk-list
+ * - ch_count	is # of chunks in read-chunk-list
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ *   chunk in the read list
+ */
+static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+				struct svc_rqst *rqstp,
+				struct svc_rdma_op_ctxt *head,
+				struct rpcrdma_msg *rmsgp,
+				struct svc_rdma_req_map *rpl_map,
+				struct svc_rdma_req_map *chl_map,
+				int ch_count,
+				int byte_count)
+{
+	int page_no;
+	int ch_no;
+	u32 offset;		/* running offset of each chunk from frmr->kva */
+	struct rpcrdma_read_chunk *ch;
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = svc_rdma_get_frmr(xprt);
+	if (IS_ERR(frmr))
+		return -ENOMEM;
+
+	head->frmr = frmr;
+	head->arg.head[0] = rqstp->rq_arg.head[0];
+	head->arg.tail[0] = rqstp->rq_arg.tail[0];
+	head->arg.pages = &head->pages[head->count];
+	head->hdr_count = head->count; /* save count of hdr pages */
+	head->arg.page_base = 0;
+	head->arg.page_len = byte_count;
+	head->arg.len = rqstp->rq_arg.len + byte_count;
+	head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
+
+	/* Fast register the page list: one whole page per entry */
+	frmr->kva = page_address(rqstp->rq_arg.pages[0]);
+	frmr->direction = DMA_FROM_DEVICE;
+	frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+	frmr->map_len = byte_count;
+	frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+	for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+		frmr->page_list->page_list[page_no] =
+			ib_dma_map_page(xprt->sc_cm_id->device,
+					rqstp->rq_arg.pages[page_no], 0,
+					PAGE_SIZE, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+		atomic_inc(&xprt->sc_dma_used);
+		head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+	}
+	head->count += page_no;
+
+	/* rq_respages points one past arg pages */
+	rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+
+	/* Create the reply and chunk maps: exactly one SGE per chunk */
+	offset = 0;
+	ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	for (ch_no = 0; ch_no < ch_count; ch_no++) {
+		rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+		rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
+		chl_map->ch[ch_no].count = 1;
+		chl_map->ch[ch_no].start = ch_no;
+		offset += ch->rc_target.rs_length;
+		ch++;
+	}
+
+	if (svc_rdma_fastreg(xprt, frmr))
+		goto fatal_err;
+
+	return ch_no;		/* number of SGE filled in rpl_map */
+
+ fatal_err:
+	printk("svcrdma: error fast registering xdr for xprt %p\n", xprt);
+	/* Don't leave the context pointing at an frmr we are giving back */
+	head->frmr = NULL;
+	svc_rdma_put_frmr(xprt, frmr);
+	return -EIO;
+}
+
+/* Build ctxt->sge[] from vec[]; *sgl_offset grows by the bytes described */
+static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+			     struct svc_rdma_op_ctxt *ctxt,
+			     struct svc_rdma_fastreg_mr *frmr,
+			     struct kvec *vec,
+			     u64 *sgl_offset,
+			     int count)
+{
+	unsigned long page_off;
+	struct ib_sge *sge;
+	int n;
+
+	ctxt->count = count;
+	ctxt->direction = DMA_FROM_DEVICE;
+	for (n = 0, sge = ctxt->sge; n < count; n++, sge++, vec++) {
+		sge->length = 0; /* in case map fails */
+		if (frmr) {	/* use the FRMR's lkey, no mapping here */
+			sge->addr = (unsigned long)vec->iov_base;
+			sge->lkey = frmr->mr->lkey;
+		} else {
+			BUG_ON(0 == virt_to_page(vec->iov_base));
+			page_off = (unsigned long)vec->iov_base & ~PAGE_MASK;
+			sge->addr = ib_dma_map_page(xprt->sc_cm_id->device,
+						    virt_to_page(vec->iov_base),
+						    page_off, vec->iov_len,
+						    DMA_FROM_DEVICE);
+			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+						 sge->addr))
+				return -EINVAL;
+			sge->lkey = xprt->sc_dma_lkey;
+			atomic_inc(&xprt->sc_dma_used);
+		}
+		sge->length = vec->iov_len;
+		*sgl_offset += vec->iov_len;
+	}
+	return 0;
+}
+
+static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+{
+	/* An iWARP device gets a single SGE; others are capped at sc_max_sge */
+	if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) !=
+	    RDMA_TRANSPORT_IWARP || sge_count <= 1)
+		return min_t(int, sge_count, xprt->sc_max_sge);
+
+	return 1;
+}
+
+/*
+ * Use RDMA_READ to read data from the advertised client buffer into the
+ * XDR stream starting at rq_arg.head[0].iov_base.
+ * Each chunk in the array
+ * contains the following fields:
+ * discrim      - '1', This isn't used for data placement
+ * position     - The xdr stream offset (the same for every chunk)
+ * handle       - RMR for client memory region
+ * length       - data transfer length
+ * offset       - 64 bit tagged offset in remote memory region
+ *
+ * On our side, we need to read into a pagelist. The first page immediately
+ * follows the RPC header.
+ *
+ * This function returns:
+ * 0 - No error and no read-list found.
+ *
+ * 1 - Successful read-list processing. The data is not yet in
+ * the pagelist and therefore the RPC request must be deferred. The
+ * I/O completion will enqueue the transport again and
+ * svc_rdma_recvfrom will complete the request.
+ *
+ * <0 - Error processing/posting read-list.
+ *
+ * NOTE: The ctxt must not be touched after the last WR has been posted
+ * because the I/O completion processing may occur on another
+ * processor and free / modify the context. Ne touche pas!
+ */
+static int rdma_read_xdr(struct svcxprt_rdma *xprt,
+			 struct rpcrdma_msg *rmsgp,
+			 struct svc_rqst *rqstp,
+			 struct svc_rdma_op_ctxt *hdr_ctxt)
+{
+	struct ib_send_wr read_wr;
+	struct ib_send_wr inv_wr;	/* chained behind the last read only */
+	int err = 0;
+	int ch_no;
+	int ch_count;
+	int byte_count;
+	int sge_count;
+	u64 sgl_offset;		/* bytes of the current chunk already described */
+	struct rpcrdma_read_chunk *ch;
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+	struct svc_rdma_req_map *rpl_map;	/* kvecs the reads land in */
+	struct svc_rdma_req_map *chl_map;	/* per-chunk start/count in rpl_map */
+
+	/* If no read list is present, return 0 */
+	ch = svc_rdma_get_read_chunk(rmsgp);
+	if (!ch)
+		return 0;
+
+	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
+	if (ch_count > RPCSVC_MAXPAGES)
+		return -EINVAL;
+
+	/* Allocate temporary reply and chunk maps */
+	rpl_map = svc_rdma_get_req_map();
+	chl_map = svc_rdma_get_req_map();
+
+	if (!xprt->sc_frmr_pg_list_len)
+		sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+					    rpl_map, chl_map, ch_count,
+					    byte_count);
+	else
+		sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+						 rpl_map, chl_map, ch_count,
+						 byte_count);
+	if (sge_count < 0) {
+		err = -EIO;	/* the helper's own error code is not passed on */
+		goto out;
+	}
+
+	sgl_offset = 0;
+	ch_no = 0;
+
+	for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	     ch->rc_discrim != 0; ch++, ch_no++) {
+next_sge:	/* re-entered when one READ could not cover the whole chunk */
+		ctxt = svc_rdma_get_context(xprt);
+		ctxt->direction = DMA_FROM_DEVICE;
+		ctxt->frmr = hdr_ctxt->frmr;
+		ctxt->read_hdr = NULL;
+		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+		clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+
+		/* Prepare READ WR; wr_id carries the context pointer */
+		memset(&read_wr, 0, sizeof read_wr);
+		read_wr.wr_id = (unsigned long)ctxt;
+		read_wr.opcode = IB_WR_RDMA_READ;
+		ctxt->wr_op = read_wr.opcode;
+		read_wr.send_flags = IB_SEND_SIGNALED;
+		read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
+		read_wr.wr.rdma.remote_addr =
+			get_unaligned(&(ch->rc_target.rs_offset)) +
+			sgl_offset;
+		read_wr.sg_list = ctxt->sge;
+		read_wr.num_sge =
+			rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+		err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+					&rpl_map->sge[chl_map->ch[ch_no].start],
+					&sgl_offset,
+					read_wr.num_sge);
+		if (err) {
+			svc_rdma_unmap_dma(ctxt);
+			svc_rdma_put_context(ctxt, 0);
+			goto out;
+		}
+		if (((ch+1)->rc_discrim == 0) &&
+		    (read_wr.num_sge == chl_map->ch[ch_no].count)) {
+			/*
+			 * Mark the last RDMA_READ with a bit to
+			 * indicate all RPC data has been fetched from
+			 * the client and the RPC needs to be enqueued.
+			 */
+			set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+			if (hdr_ctxt->frmr) {
+				set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+				/*
+				 * Invalidate the local MR used to map the data
+				 * sink.
+				 */
+				if (xprt->sc_dev_caps &
+				    SVCRDMA_DEVCAP_READ_W_INV) {
+					read_wr.opcode =
+						IB_WR_RDMA_READ_WITH_INV;
+					ctxt->wr_op = read_wr.opcode;
+					read_wr.ex.invalidate_rkey =
+						ctxt->frmr->mr->lkey;
+				} else {
+					/* Prepare INVALIDATE WR, chained after the read */
+					memset(&inv_wr, 0, sizeof inv_wr);
+					inv_wr.opcode = IB_WR_LOCAL_INV;
+					inv_wr.send_flags = IB_SEND_SIGNALED;
+					inv_wr.ex.invalidate_rkey =
+						hdr_ctxt->frmr->mr->lkey;
+					read_wr.next = &inv_wr;
+				}
+			}
+			ctxt->read_hdr = hdr_ctxt;
+		}
+		/* Post the read */
+		err = svc_rdma_send(xprt, &read_wr);
+		if (err) {
+			printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
+			       err);
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			svc_rdma_unmap_dma(ctxt);
+			svc_rdma_put_context(ctxt, 0);
+			goto out;
+		}
+		atomic_inc(&rdma_stat_read);
+
+		if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+			chl_map->ch[ch_no].count -= read_wr.num_sge;
+			chl_map->ch[ch_no].start += read_wr.num_sge;
+			goto next_sge;
+		}
+		sgl_offset = 0;	/* the next chunk starts at its own rs_offset */
+		err = 1;	/* reads were posted: the caller must defer */
+	}
+
+ out:
+	svc_rdma_put_req_map(rpl_map);
+	svc_rdma_put_req_map(chl_map);
+
+	/* Detach arg pages. svc_recv will replenish them */
+	for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
+		rqstp->rq_pages[ch_no] = NULL;
+
+	/*
+	 * Detach res pages. svc_release must see a resused count of
+	 * zero or it will attempt to put them.
+	 */
+	while (rqstp->rq_resused)
+		rqstp->rq_respages[--rqstp->rq_resused] = NULL;
+
+	return err;
+}
+
+/* Finish a deferred request: move the pages and XDR saved in head to rqstp */
+static int rdma_read_complete(struct svc_rqst *rqstp,
+			      struct svc_rdma_op_ctxt *head)
+{
+	int nr_pages;
+	int total;
+
+	BUG_ON(!head);
+
+	/* Take over every page recorded in the context */
+	for (nr_pages = 0; nr_pages < head->count; nr_pages++) {
+		put_page(rqstp->rq_pages[nr_pages]);
+		rqstp->rq_pages[nr_pages] = head->pages[nr_pages];
+	}
+
+	/* Restore the XDR description that was saved in the context */
+	rqstp->rq_arg.head[0] = head->arg.head[0];
+	rqstp->rq_arg.tail[0] = head->arg.tail[0];
+	rqstp->rq_arg.len = head->arg.len;
+	rqstp->rq_arg.buflen = head->arg.buflen;
+	rqstp->rq_arg.page_len = head->arg.page_len;
+	rqstp->rq_arg.page_base = head->arg.page_base;
+	/* The page list begins after the hdr_count header pages */
+	rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
+
+	/* rq_respages is placed past the arg pages, with none used yet */
+	rqstp->rq_respages = &rqstp->rq_arg.pages[nr_pages];
+	rqstp->rq_resused = 0;
+
+	/* Everything needed has been copied out; release the context */
+	svc_rdma_put_context(head, 0);
+
+	/* XXX: What should this be? */
+	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
+
+	total = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len +
+		rqstp->rq_arg.tail[0].iov_len;
+	dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+		total, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
+		rqstp->rq_arg.head[0].iov_len);
+
+	return total;
+}
+
+/*
+ * Set up the rqstp thread context to point to the RQ buffer. If
+ * necessary, pull additional data from the client with an RDMA_READ
+ * request.
+ */
+int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct svcxprt_rdma *rdma_xprt =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+	struct rpcrdma_msg *rmsgp;
+	int ret = 0;
+	int len;
+
+	dprintk("svcrdma: rqstp=%p\n", rqstp);
+
+	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
+		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+	}
+	if (ctxt) {	/* completed read-lists are served before new receives */
+		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+		return rdma_read_complete(rqstp, ctxt);
+	}
+
+	if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+		ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+	} else {
+		atomic_inc(&rdma_stat_rq_starve);
+		clear_bit(XPT_DATA, &xprt->xpt_flags);	/* both queues are empty */
+		ctxt = NULL;
+	}
+	spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+	if (!ctxt) {
+		/* This is the EAGAIN path. The svc_recv routine will
+		 * return -EAGAIN, the nfsd thread will go to call into
+		 * svc_recv again and we shouldn't be on the active
+		 * transport list
+		 */
+		if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+			goto close_out;
+
+		BUG_ON(ret);
+		goto out;	/* returns 0: nothing was received */
+	}
+	dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+		ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+	BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
+	atomic_inc(&rdma_stat_recv);
+
+	/* Build up the XDR from the receive buffers. */
+	rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+
+	/* Decode the RDMA header; a negative len is an error code. */
+	len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
+	rqstp->rq_xprt_hlen = len;
+
+	/* If the request is invalid, reply with an error (-ENOSYS only) */
+	if (len < 0) {
+		if (len == -ENOSYS)
+			svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+		goto close_out;
+	}
+
+	/* Read read-list data. */
+	ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
+	if (ret > 0) {
+		/* read-list posted, defer until data received from client. */
+		goto defer;
+	}
+	if (ret < 0) {
+		/* Post of read-list failed, free context and its pages. */
+		svc_rdma_put_context(ctxt, 1);
+		return 0;
+	}
+
+	ret = rqstp->rq_arg.head[0].iov_len
+		+ rqstp->rq_arg.page_len
+		+ rqstp->rq_arg.tail[0].iov_len;
+	svc_rdma_put_context(ctxt, 0);	/* pages now belong to rqstp */
+ out:
+	dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+		ret, rqstp->rq_arg.len,
+		rqstp->rq_arg.head[0].iov_base,
+		rqstp->rq_arg.head[0].iov_len);
+	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, xprt);
+	return ret;
+
+ close_out:
+	if (ctxt)
+		svc_rdma_put_context(ctxt, 1);
+	dprintk("svcrdma: transport %p is closing\n", xprt);
+	/*
+	 * Set the close bit. svc_recv will see the close bit and call
+	 * svc_xprt_delete. NOTE(review): nothing is enqueued here.
+	 */
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+defer:
+	return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 00000000..249a835b
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/* Encode an XDR as an array of IB SGE
+ *
+ * Assumptions:
+ * - head[0] is physically contiguous.
+ * - tail[0] is physically contiguous.
+ * - pages[] is not physically or virtually contiguous and consists of
+ *   PAGE_SIZE elements.
+ *
+ * Output:
+ * SGE[0]              reserved for RPCRDMA header
+ * SGE[1]              data from xdr->head[]
+ * SGE[2..sge_count-2] data from xdr->pages[]
+ * SGE[sge_count-1]    data from xdr->tail.
+ *
+ * The max SGE we need is the length of the XDR / pagesize + one for
+ * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+ * reserves a page for both the request and the reply header, and this
+ * array is only concerned with the reply, we are assured that we have
+ * one extra page for the RPCRDMA header.
+ */
+static int fast_reg_xdr(struct svcxprt_rdma *xprt,
+			struct xdr_buf *xdr,
+			struct svc_rdma_req_map *vec)
+{
+	int sge_no;
+	u32 sge_bytes;
+	u32 page_bytes;
+	u32 page_off;
+	int page_no = 0;	/* index into frmr->page_list->page_list[] */
+	u8 *frva;		/* page-aligned base address of the region */
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = svc_rdma_get_frmr(xprt);
+	if (IS_ERR(frmr))
+		return -ENOMEM;
+	vec->frmr = frmr;
+
+	/* Skip the RPCRDMA header */
+	sge_no = 1;
+
+	/* Describe the head in the SGE vector */
+	frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
+	vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+	vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+	vec->count = 2;
+	sge_no++;
+
+	/* DMA map the page holding the XDR head as page_list[0] */
+	frmr->kva = frva;
+	frmr->direction = DMA_TO_DEVICE;
+	frmr->access_flags = 0;
+	frmr->map_len = PAGE_SIZE;
+	frmr->page_list_len = 1;
+	page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+	frmr->page_list->page_list[page_no] =
+		ib_dma_map_page(xprt->sc_cm_id->device,
+				virt_to_page(xdr->head[0].iov_base),
+				page_off,
+				PAGE_SIZE - page_off,
+				DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+				 frmr->page_list->page_list[page_no]))
+		goto fatal_err;
+	atomic_inc(&xprt->sc_dma_used);
+
+	/* Map the XDR page list */
+	page_off = xdr->page_base;
+	page_bytes = xdr->page_len + page_off;
+	if (!page_bytes)
+		goto encode_tail;
+
+	/* NOTE(review): iov_len below includes page_off - confirm intended */
+	vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+	vec->sge[sge_no].iov_len = page_bytes;
+	sge_no++;
+	while (page_bytes) {
+		struct page *page;
+
+		page = xdr->pages[page_no++];	/* page i goes in slot i + 1 */
+		sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+		page_bytes -= sge_bytes;
+
+		frmr->page_list->page_list[page_no] =
+			ib_dma_map_page(xprt->sc_cm_id->device,
+					page, page_off,
+					sge_bytes, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+
+		atomic_inc(&xprt->sc_dma_used);
+		page_off = 0; /* reset for next time through loop */
+		frmr->map_len += PAGE_SIZE;
+		frmr->page_list_len++;
+	}
+	vec->count++;
+
+ encode_tail:
+	/* Map tail */
+	if (0 == xdr->tail[0].iov_len)
+		goto done;
+
+	vec->count++;
+	vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+
+	if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
+	    ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
+		/*
+		 * If head and tail use the same page, we don't need
+		 * to map it again.
+		 */
+		vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+	} else {
+		void *va;
+
+		/* Map another page for the tail, in the first unused slot */
+		page_no = frmr->page_list_len;
+		page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+		va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
+		vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+
+		frmr->page_list->page_list[page_no] =
+		    ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va),
+				    page_off, PAGE_SIZE - page_off,
+				    DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 frmr->page_list->page_list[page_no]))
+			goto fatal_err;
+		atomic_inc(&xprt->sc_dma_used);
+		frmr->map_len += PAGE_SIZE;
+		frmr->page_list_len++;
+	}
+
+ done:
+	if (svc_rdma_fastreg(xprt, frmr))
+		goto fatal_err;
+
+	return 0;
+
+ fatal_err:
+	printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
+	vec->frmr = NULL;	/* the map must not keep a reference to it */
+	svc_rdma_put_frmr(xprt, frmr);
+	return -EIO;
+}
+
+/*
+ * Describe xdr as a kvec array in vec->sge[], starting at slot 1: the head,
+ * one entry per page of the page list, then the tail if it is not empty.
+ * vec->sge[0] is left untouched for the RPCRDMA header.
+ */
+static int map_xdr(struct svcxprt_rdma *xprt,
+		   struct xdr_buf *xdr,
+		   struct svc_rdma_req_map *vec)
+{
+	int sge_no = 1;		/* slot 0 is for the RPCRDMA header */
+	int page_no = 0;
+	u32 bytes_left;
+	u32 offset;
+	u32 chunk;
+
+	BUG_ON(xdr->len !=
+	       (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
+	/* Transports with an FRMR page list take the fast-register path */
+	if (xprt->sc_frmr_pg_list_len)
+		return fast_reg_xdr(xprt, xdr, vec);
+
+	/* The head always takes the first data slot */
+	vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+	vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+	sge_no++;
+
+	/* One slot per page; only the first page starts at page_base */
+	offset = xdr->page_base;
+	for (bytes_left = xdr->page_len; bytes_left; bytes_left -= chunk) {
+		chunk = min_t(u32, bytes_left, (PAGE_SIZE - offset));
+		vec->sge[sge_no].iov_base =
+			page_address(xdr->pages[page_no]) + offset;
+		vec->sge[sge_no].iov_len = chunk;
+
+		sge_no++;
+		page_no++;
+		offset = 0;
+	}
+
+	/* The tail gets a slot only when it holds data */
+	if (xdr->tail[0].iov_len) {
+		vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+		vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+		sge_no++;
+	}
+
+	dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+		"page_base %u page_len %u head_len %zu tail_len %zu\n",
+		sge_no, page_no, xdr->page_base, xdr->page_len,
+		xdr->head[0].iov_len, xdr->tail[0].iov_len);
+
+	vec->count = sge_no;
+	return 0;
+}
+
+/* DMA map up to one page of xdr, starting at byte offset xdr_off */
+static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+			      struct xdr_buf *xdr,
+			      u32 xdr_off, size_t len, int dir)
+{
+	struct page *page;
+	if (xdr_off < xdr->head[0].iov_len) {
+		/* This offset is in the head */
+		xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+		page = virt_to_page(xdr->head[0].iov_base);
+	} else {
+		xdr_off -= xdr->head[0].iov_len;
+		if (xdr_off < xdr->page_len) {
+			/* In the page list, whose data starts at page_base */
+			xdr_off += xdr->page_base;
+			page = xdr->pages[xdr_off >> PAGE_SHIFT];
+			xdr_off &= ~PAGE_MASK;
+		} else {
+			/* This offset is in the tail */
+			xdr_off -= xdr->page_len;
+			xdr_off += (unsigned long)
+				xdr->tail[0].iov_base & ~PAGE_MASK;
+			page = virt_to_page(xdr->tail[0].iov_base);
+		}
+	}
+	return ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
+			       min_t(size_t, PAGE_SIZE, len), dir);
+}
+
+/* Post one RDMA_WRITE of write_len reply bytes, starting at xdr_off, to the
+ * client buffer (rmr, to).  Returns 0 on success, -EIO on failure.
+ * Assumptions: either we are using FRMR, or the specified write_len can be
+ * represented in sc_max_sge * PAGE_SIZE.
+ */
+static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+		      u32 rmr, u64 to,
+		      u32 xdr_off, int write_len,
+		      struct svc_rdma_req_map *vec)
+{
+	struct ib_send_wr write_wr;
+	struct ib_sge *sge;
+	int xdr_sge_no;		/* index into vec->sge[] */
+	int sge_no;		/* index into the WR's sge[] */
+	int sge_bytes;
+	int sge_off;		/* offset of xdr_off inside its vec->sge entry */
+	int bc;
+	struct svc_rdma_op_ctxt *ctxt;
+
+	BUG_ON(vec->count > RPCSVC_MAXPAGES);
+	dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
+		"write_len=%d, vec->sge=%p, vec->count=%lu\n",
+		rmr, (unsigned long long)to, xdr_off,
+		write_len, vec->sge, vec->count);
+
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->direction = DMA_TO_DEVICE;
+	sge = ctxt->sge;
+
+	/* Find the SGE associated with xdr_off (vec->sge[0] is skipped) */
+	for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
+	     xdr_sge_no++) {
+		if (vec->sge[xdr_sge_no].iov_len > bc)
+			break;
+		bc -= vec->sge[xdr_sge_no].iov_len;
+	}
+
+	sge_off = bc;
+	bc = write_len;		/* from here on bc counts bytes left to write */
+	sge_no = 0;
+
+	/* Copy the remaining SGE */
+	while (bc != 0) {
+		sge_bytes = min_t(size_t,
+			  bc, vec->sge[xdr_sge_no].iov_len-sge_off);
+		sge[sge_no].length = sge_bytes;
+		if (!vec->frmr) {
+			sge[sge_no].addr =
+				dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+					    sge_bytes, DMA_TO_DEVICE);
+			xdr_off += sge_bytes;
+			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+						 sge[sge_no].addr))
+				goto err;
+			atomic_inc(&xprt->sc_dma_used);
+			sge[sge_no].lkey = xprt->sc_dma_lkey;
+		} else {
+			sge[sge_no].addr = (unsigned long)
+				vec->sge[xdr_sge_no].iov_base + sge_off;
+			sge[sge_no].lkey = vec->frmr->mr->lkey;
+		}
+		ctxt->count++;
+		ctxt->frmr = vec->frmr;
+		sge_off = 0;	/* only the first SGE starts mid-entry */
+		sge_no++;
+		xdr_sge_no++;
+		BUG_ON(xdr_sge_no > vec->count);
+		bc -= sge_bytes;
+	}
+
+	/* Prepare WRITE WR; wr_id carries the context pointer */
+	memset(&write_wr, 0, sizeof write_wr);
+	ctxt->wr_op = IB_WR_RDMA_WRITE;
+	write_wr.wr_id = (unsigned long)ctxt;
+	write_wr.sg_list = &sge[0];
+	write_wr.num_sge = sge_no;
+	write_wr.opcode = IB_WR_RDMA_WRITE;
+	write_wr.send_flags = IB_SEND_SIGNALED;
+	write_wr.wr.rdma.rkey = rmr;
+	write_wr.wr.rdma.remote_addr = to;
+
+	/* Post It */
+	atomic_inc(&rdma_stat_write);	/* counted even if the post fails */
+	if (svc_rdma_send(xprt, &write_wr))
+		goto err;
+	return 0;
+ err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_frmr(xprt, vec->frmr);
+	svc_rdma_put_context(ctxt, 0);
+	/* Fatal error, close transport */
+	return -EIO;
+}
+
+/*
+ * Push the page list and tail of the reply through the write chunks the
+ * client supplied in its request, and record in the response write list
+ * how many bytes were placed in each chunk.
+ *
+ * Returns 0 if the request carries no write array, -EIO if an
+ * RDMA_WRITE could not be sent, otherwise the page_len plus tail length
+ * of the reply.
+ */
+static int send_write_chunks(struct svcxprt_rdma *xprt,
+			     struct rpcrdma_msg *rdma_argp,
+			     struct rpcrdma_msg *rdma_resp,
+			     struct svc_rqst *rqstp,
+			     struct svc_rdma_req_map *vec)
+{
+	struct rpcrdma_write_array *req_ary;
+	struct rpcrdma_write_array *rsp_ary;
+	u32 resid = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+	u32 xdr_pos;
+	int max_write;
+	int idx;
+	int rc;
+
+	req_ary = svc_rdma_get_write_array(rdma_argp);
+	if (!req_ary)
+		return 0;
+	rsp_ary = (struct rpcrdma_write_array *)
+		&rdma_resp->rm_body.rm_chunks[1];
+
+	/* Largest amount a single send_write() call may be asked to move */
+	max_write = vec->frmr ? vec->frmr->map_len
+			      : xprt->sc_max_sge * PAGE_SIZE;
+
+	/* Write chunks start at the pagelist */
+	xdr_pos = rqstp->rq_res.head[0].iov_len;
+	idx = 0;
+	while (resid && idx < req_ary->wc_nchunks) {
+		struct rpcrdma_segment *seg = &req_ary->wc_array[idx].wc_target;
+		u64 base = get_unaligned(&(seg->rs_offset));
+		int seg_len = min(resid, seg->rs_length);
+		int seg_off = 0;
+
+		/* Describe in the response what is about to be written */
+		svc_rdma_xdr_encode_array_chunk(rsp_ary, idx,
+						seg->rs_handle,
+						base,
+						seg_len);
+
+		/* Split the chunk into pieces of at most max_write bytes */
+		while (seg_len) {
+			int piece = min(seg_len, max_write);
+
+			rc = send_write(xprt, rqstp,
+					seg->rs_handle,
+					base + seg_off,
+					xdr_pos,
+					piece,
+					vec);
+			if (rc) {
+				dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+					rc);
+				return -EIO;
+			}
+			seg_off += piece;
+			xdr_pos += piece;
+			resid -= piece;
+			seg_len -= piece;
+		}
+		idx++;
+	}
+	/* Update the req with the number of chunks actually used */
+	svc_rdma_xdr_encode_write_list(rdma_resp, idx);
+
+	return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+}
+
+/*
+ * Push the whole RPC reply through the reply chunks the client supplied
+ * in its request, and record in the response reply array how many bytes
+ * were placed in each chunk.
+ *
+ * Returns 0 if the request carries no reply array, -EIO if an
+ * RDMA_WRITE could not be sent, otherwise the total reply length.
+ */
+static int send_reply_chunks(struct svcxprt_rdma *xprt,
+			     struct rpcrdma_msg *rdma_argp,
+			     struct rpcrdma_msg *rdma_resp,
+			     struct svc_rqst *rqstp,
+			     struct svc_rdma_req_map *vec)
+{
+	struct rpcrdma_write_array *req_ary;
+	struct rpcrdma_write_array *rsp_ary;
+	u32 resid = rqstp->rq_res.len;
+	u32 xdr_pos;
+	int max_write;
+	int idx;
+	int rc;
+
+	req_ary = svc_rdma_get_reply_array(rdma_argp);
+	if (!req_ary)
+		return 0;
+	/* XXX: need to fix when reply lists occur with read-list and or
+	 * write-list */
+	rsp_ary = (struct rpcrdma_write_array *)
+		&rdma_resp->rm_body.rm_chunks[2];
+
+	/* Largest amount a single send_write() call may be asked to move */
+	max_write = vec->frmr ? vec->frmr->map_len
+			      : xprt->sc_max_sge * PAGE_SIZE;
+
+	/* xdr offset starts at RPC message */
+	xdr_pos = 0;
+	idx = 0;
+	while (resid && idx < req_ary->wc_nchunks) {
+		struct rpcrdma_segment *seg = &req_ary->wc_array[idx].wc_target;
+		u64 base = get_unaligned(&(seg->rs_offset));
+		int seg_len = min(resid, seg->rs_length);
+		int seg_off = 0;
+
+		/* Describe in the response what is about to be written */
+		svc_rdma_xdr_encode_array_chunk(rsp_ary, idx,
+						seg->rs_handle, base,
+						seg_len);
+
+		/* Split the chunk into pieces of at most max_write bytes */
+		while (seg_len) {
+			int piece = min(seg_len, max_write);
+
+			rc = send_write(xprt, rqstp,
+					seg->rs_handle,
+					base + seg_off,
+					xdr_pos,
+					piece,
+					vec);
+			if (rc) {
+				dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+					rc);
+				return -EIO;
+			}
+			seg_off += piece;
+			xdr_pos += piece;
+			resid -= piece;
+			seg_len -= piece;
+		}
+		idx++;
+	}
+	/* Update the req with the number of chunks actually used */
+	svc_rdma_xdr_encode_reply_array(rsp_ary, idx);
+
+	return rqstp->rq_res.len;
+}
+
+/* This function prepares the portion of the RPCRDMA message to be
+ * sent in the RDMA_SEND. This function is called after data sent via
+ * RDMA has already been transmitted. There are three cases:
+ * - The RPCRDMA header, RPC header, and payload are all sent in a
+ *   single RDMA_SEND. This is the "inline" case.
+ * - The RPCRDMA header and some portion of the RPC header and data
+ *   are sent via this RDMA_SEND and another portion of the data is
+ *   sent via RDMA.
+ * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
+ *   header and data are all transmitted via RDMA.
+ * In all three cases, this function places the RPCRDMA header (already
+ * encoded by the caller into 'page', addressed through 'rdma_resp') in
+ * sge[0], and 'byte_count' indicates how much of the XDR to include in
+ * this RDMA_SEND. NB: The offset of the payload to send is zero in the
+ * XDR.
+ *
+ * Returns 0 once the send has been posted, -ENOTCONN if a replacement
+ * receive buffer could not be posted (the transport is marked for
+ * close), or -EIO on a DMA mapping or post failure. On every failure
+ * path 'ctxt' is released here and the FRMR in 'vec' is put.
+ */
+static int send_reply(struct svcxprt_rdma *rdma,
+		      struct svc_rqst *rqstp,
+		      struct page *page,
+		      struct rpcrdma_msg *rdma_resp,
+		      struct svc_rdma_op_ctxt *ctxt,
+		      struct svc_rdma_req_map *vec,
+		      int byte_count)
+{
+	struct ib_send_wr send_wr;
+	struct ib_send_wr inv_wr;
+	int sge_no;
+	int sge_bytes;
+	int page_no;
+	int xdr_off;
+	int ret;
+
+	/* Post a recv buffer to handle another request. */
+	ret = svc_rdma_post_recv(rdma);
+	if (ret) {
+		printk(KERN_INFO
+		       "svcrdma: could not post a receive buffer, err=%d."
+		       "Closing transport %p.\n", ret, rdma);
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		svc_rdma_put_frmr(rdma, vec->frmr);
+		svc_rdma_put_context(ctxt, 0);
+		return -ENOTCONN;
+	}
+
+	/* Prepare the context */
+	ctxt->pages[0] = page;
+	ctxt->count = 1;
+	ctxt->frmr = vec->frmr;
+	if (vec->frmr)
+		set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+	else
+		clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+
+	/* Prepare the SGE for the RPCRDMA Header */
+	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+	ctxt->sge[0].addr =
+	    ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
+			    ctxt->sge[0].length, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+		goto err;
+	atomic_inc(&rdma->sc_dma_used);
+
+	ctxt->direction = DMA_TO_DEVICE;
+
+	/* Map the payload indicated by 'byte_count'. xdr_off must persist
+	 * across iterations so that each SGE maps the bytes following the
+	 * previous one rather than starting over at offset zero.
+	 */
+	xdr_off = 0;
+	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
+		byte_count -= sge_bytes;
+		if (!vec->frmr) {
+			ctxt->sge[sge_no].addr =
+				dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+					    sge_bytes, DMA_TO_DEVICE);
+			xdr_off += sge_bytes;
+			if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+						 ctxt->sge[sge_no].addr))
+				goto err;
+			atomic_inc(&rdma->sc_dma_used);
+			ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+		} else {
+			ctxt->sge[sge_no].addr = (unsigned long)
+				vec->sge[sge_no].iov_base;
+			ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
+		}
+		ctxt->sge[sge_no].length = sge_bytes;
+	}
+	BUG_ON(byte_count != 0);
+
+	/* Save all respages in the ctxt and remove them from the
+	 * respages array. They are our pages until the I/O
+	 * completes.
+	 */
+	for (page_no = 0; page_no < rqstp->rq_resused; page_no++) {
+		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
+		ctxt->count++;
+		rqstp->rq_respages[page_no] = NULL;
+		/*
+		 * If there are more pages than SGE, terminate SGE
+		 * list so that svc_rdma_unmap_dma doesn't attempt to
+		 * unmap garbage.
+		 */
+		if (page_no+1 >= sge_no)
+			ctxt->sge[page_no+1].length = 0;
+	}
+	BUG_ON(sge_no > rdma->sc_max_sge);
+	memset(&send_wr, 0, sizeof send_wr);
+	ctxt->wr_op = IB_WR_SEND;
+	send_wr.wr_id = (unsigned long)ctxt;
+	send_wr.sg_list = ctxt->sge;
+	send_wr.num_sge = sge_no;
+	send_wr.opcode = IB_WR_SEND;
+	send_wr.send_flags =  IB_SEND_SIGNALED;
+	if (vec->frmr) {
+		/* Prepare INVALIDATE WR, chained behind the send */
+		memset(&inv_wr, 0, sizeof inv_wr);
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.send_flags = IB_SEND_SIGNALED;
+		inv_wr.ex.invalidate_rkey =
+			vec->frmr->mr->lkey;
+		send_wr.next = &inv_wr;
+	}
+
+	ret = svc_rdma_send(rdma, &send_wr);
+	if (ret)
+		goto err;
+
+	return 0;
+
+ err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_frmr(rdma, vec->frmr);
+	svc_rdma_put_context(ctxt, 1);
+	return -EIO;
+}
+
+/*
+ * Transport hook installed as .xpo_prep_reply_hdr in svc_rdma_ops.
+ * Deliberately does nothing to the reply in rqstp.
+ */
+void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+/*
+ * Return the start of an xdr buffer: head[0].iov_base moved back by
+ * whatever part of xdr->len is not accounted for by the current head,
+ * page and tail lengths.
+ */
+static void *xdr_start(struct xdr_buf *xdr)
+{
+	size_t unaccounted = xdr->len -
+			     xdr->page_len -
+			     xdr->tail[0].iov_len -
+			     xdr->head[0].iov_len;
+
+	return xdr->head[0].iov_base - unaccounted;
+}
+
+/*
+ * Send the reply in rqstp->rq_res over the RDMA transport.
+ *
+ * The reply is first mapped into a request vector. Any write-chunk and
+ * reply-chunk data is then pushed with RDMA_WRITE, and whatever is left
+ * is sent inline together with the RPCRDMA header by send_reply().
+ *
+ * Returns the result of send_reply(), or a negative value if mapping
+ * the reply or sending chunk data failed.
+ */
+int svc_rdma_sendto(struct svc_rqst *rqstp)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
+	struct rpcrdma_msg *req_hdr;
+	struct rpcrdma_msg *rsp_hdr;
+	struct svc_rdma_op_ctxt *ctxt;
+	struct svc_rdma_req_map *vec;
+	struct page *hdr_page;
+	enum rpcrdma_proc proc;
+	int inline_len;
+	int rc;
+
+	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+
+	/* Get the RDMA request header. */
+	req_hdr = xdr_start(&rqstp->rq_arg);
+
+	/* Build an req vec for the XDR */
+	ctxt = svc_rdma_get_context(rdma);
+	ctxt->direction = DMA_TO_DEVICE;
+	vec = svc_rdma_get_req_map();
+	rc = map_xdr(rdma, &rqstp->rq_res, vec);
+	if (rc)
+		goto out_release;
+	inline_len = rqstp->rq_res.len;
+
+	/* Create the RDMA response header; a reply array in the request
+	 * selects RDMA_NOMSG, otherwise RDMA_MSG. */
+	hdr_page = svc_rdma_get_page();
+	rsp_hdr = page_address(hdr_page);
+	proc = svc_rdma_get_reply_array(req_hdr) ? RDMA_NOMSG : RDMA_MSG;
+	svc_rdma_xdr_encode_reply_header(rdma, req_hdr, rsp_hdr, proc);
+
+	/* Send any write-chunk data and build resp write-list */
+	rc = send_write_chunks(rdma, req_hdr, rsp_hdr, rqstp, vec);
+	if (rc < 0) {
+		printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
+		       rc);
+		goto out_page;
+	}
+	inline_len -= rc;
+
+	/* Send any reply-list data and update resp reply-list */
+	rc = send_reply_chunks(rdma, req_hdr, rsp_hdr, rqstp, vec);
+	if (rc < 0) {
+		printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
+		       rc);
+		goto out_page;
+	}
+	inline_len -= rc;
+
+	/* send_reply() releases ctxt itself on its failure paths */
+	rc = send_reply(rdma, rqstp, hdr_page, rsp_hdr, ctxt, vec,
+			inline_len);
+	svc_rdma_put_req_map(vec);
+	dprintk("svcrdma: send_reply returns %d\n", rc);
+	return rc;
+
+ out_page:
+	put_page(hdr_page);
+ out_release:
+	svc_rdma_put_req_map(vec);
+	svc_rdma_put_context(ctxt, 0);
+	return rc;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 00000000..c3c232a8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1363 @@
+/*
+ * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
+					struct sockaddr *sa, int salen,
+					int flags);
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
+static void svc_rdma_release_rqst(struct svc_rqst *);
+static void dto_tasklet_func(unsigned long data);
+static void svc_rdma_detach(struct svc_xprt *xprt);
+static void svc_rdma_free(struct svc_xprt *xprt);
+static int svc_rdma_has_wspace(struct svc_xprt *xprt);
+static void rq_cq_reap(struct svcxprt_rdma *xprt);
+static void sq_cq_reap(struct svcxprt_rdma *xprt);
+
+/*
+ * Transports with completions pending are linked on dto_xprt_q through
+ * their sc_dto_q member. dto_lock protects that list, and dto_tasklet
+ * runs dto_tasklet_func() to drain it.
+ */
+static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
+static DEFINE_SPINLOCK(dto_lock);
+static LIST_HEAD(dto_xprt_q);
+
+/* Transport method table referenced by svc_rdma_class below */
+static struct svc_xprt_ops svc_rdma_ops = {
+	.xpo_create = svc_rdma_create,
+	.xpo_recvfrom = svc_rdma_recvfrom,
+	.xpo_sendto = svc_rdma_sendto,
+	.xpo_release_rqst = svc_rdma_release_rqst,
+	.xpo_detach = svc_rdma_detach,
+	.xpo_free = svc_rdma_free,
+	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+	.xpo_has_wspace = svc_rdma_has_wspace,
+	.xpo_accept = svc_rdma_accept,
+};
+
+/*
+ * Transport class for the "rdma" transport; passed to svc_xprt_init()
+ * for every transport created by rdma_create_xprt(). Note that the
+ * maximum payload is taken from the TCP constant.
+ */
+struct svc_xprt_class svc_rdma_class = {
+	.xcl_name = "rdma",
+	.xcl_owner = THIS_MODULE,
+	.xcl_ops = &svc_rdma_ops,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+
+/* WR context cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+
+/* Workqueue created in svc_rdma.c */
+extern struct workqueue_struct *svc_rdma_wq;
+
+/*
+ * Allocate a WR context for @xprt. Never returns NULL: if the cache
+ * cannot satisfy the request, sleep 500ms and try again. The context
+ * starts with no pages/SGEs and no FRMR, and is counted in
+ * xprt->sc_ctxt_used.
+ */
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+	struct svc_rdma_op_ctxt *new_ctxt;
+
+	while (!(new_ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
+					     GFP_KERNEL)))
+		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+
+	new_ctxt->count = 0;
+	new_ctxt->frmr = NULL;
+	new_ctxt->xprt = xprt;
+	INIT_LIST_HEAD(&new_ctxt->dto_q);
+	atomic_inc(&xprt->sc_ctxt_used);
+	return new_ctxt;
+}
+
+/*
+ * Undo the DMA mappings recorded in @ctxt. Walks the first ctxt->count
+ * SGEs and stops early at the first one whose length is zero.
+ */
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
+{
+	struct svcxprt_rdma *xprt = ctxt->xprt;
+	int idx;
+
+	for (idx = 0; idx < ctxt->count; idx++) {
+		struct ib_sge *sge = &ctxt->sge[idx];
+
+		/* A zero length terminates the list */
+		if (!sge->length)
+			break;
+
+		/*
+		 * Only SGEs carrying sc_dma_lkey were mapped by us.
+		 * Anything else is an FRMR lkey and will be unmapped
+		 * later when the last WR that uses it completes.
+		 */
+		if (sge->lkey != xprt->sc_dma_lkey)
+			continue;
+
+		atomic_dec(&xprt->sc_dma_used);
+		ib_dma_unmap_page(xprt->sc_cm_id->device,
+				  sge->addr,
+				  sge->length,
+				  ctxt->direction);
+	}
+}
+
+/*
+ * Return @ctxt to the context cache and drop it from the owning
+ * transport's sc_ctxt_used count. When @free_pages is non-zero, a
+ * reference is first dropped on each of the ctxt->count pages.
+ */
+void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
+{
+	struct svcxprt_rdma *owner;
+	int pg;
+
+	BUG_ON(!ctxt);
+
+	/* Remember the transport; ctxt is gone after the free below */
+	owner = ctxt->xprt;
+
+	if (free_pages) {
+		pg = 0;
+		while (pg < ctxt->count) {
+			put_page(ctxt->pages[pg]);
+			pg++;
+		}
+	}
+
+	kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
+	atomic_dec(&owner->sc_ctxt_used);
+}
+
+/* Temporary NFS request map cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_map_cachep;
+
+/*
+ * Temporary NFS req mappings are shared across all transport
+ * instances. These are short lived and should be bounded by the number
+ * of concurrent server threads * depth of the SQ.
+ *
+ * Never returns NULL: on allocation failure, sleep 500ms and retry.
+ * The map is returned empty, with no FRMR attached.
+ */
+struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+{
+	struct svc_rdma_req_map *new_map;
+
+	while (!(new_map = kmem_cache_alloc(svc_rdma_map_cachep,
+					    GFP_KERNEL)))
+		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+
+	new_map->frmr = NULL;
+	new_map->count = 0;
+	return new_map;
+}
+
+/* Return a request map obtained from svc_rdma_get_req_map() to its cache */
+void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+{
+	kmem_cache_free(svc_rdma_map_cachep, map);
+}
+
+/*
+ * ib_cq event handler: any CQ event marks the transport passed as
+ * @context for close.
+ */
+static void cq_event_handler(struct ib_event *event, void *context)
+{
+	struct svc_xprt *svc = context;
+
+	dprintk("svcrdma: received CQ event id=%d, context=%p\n",
+		event->event, context);
+	set_bit(XPT_CLOSE, &svc->xpt_flags);
+}
+
+/*
+ * QP event handler: benign events are only logged, every other event
+ * marks the transport passed as @context for close.
+ */
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+	struct svc_xprt *svc = context;
+
+	switch (event->event) {
+	/* These are considered benign events */
+	case IB_EVENT_PATH_MIG:
+	case IB_EVENT_COMM_EST:
+	case IB_EVENT_SQ_DRAINED:
+	case IB_EVENT_QP_LAST_WQE_REACHED:
+		dprintk("svcrdma: QP event %d received for QP=%p\n",
+			event->event, event->element.qp);
+		return;
+	default:
+		break;
+	}
+
+	/*
+	 * Everything else is considered fatal, including
+	 * IB_EVENT_PATH_MIG_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR,
+	 * IB_EVENT_QP_ACCESS_ERR and IB_EVENT_DEVICE_FATAL.
+	 */
+	dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
+		"closing transport\n",
+		event->event, event->element.qp);
+	set_bit(XPT_CLOSE, &svc->xpt_flags);
+}
+
+/*
+ * Data Transfer Operation Tasklet
+ *
+ * Drains the list of transports with I/O pending. Each transport is
+ * unlinked under dto_lock, then its RQ and SQ completion queues are
+ * reaped with the lock dropped, and finally the reference taken by the
+ * completion handler that queued it is released. The dto_lock is an
+ * irqsave spinlock that serializes access to the transport list with
+ * the RQ and SQ interrupt handlers.
+ */
+static void dto_tasklet_func(unsigned long data)
+{
+	struct svcxprt_rdma *rdma;
+	unsigned long irqflags;
+
+	for (;;) {
+		spin_lock_irqsave(&dto_lock, irqflags);
+		if (list_empty(&dto_xprt_q))
+			break;
+
+		rdma = list_entry(dto_xprt_q.next,
+				  struct svcxprt_rdma, sc_dto_q);
+		list_del_init(&rdma->sc_dto_q);
+		spin_unlock_irqrestore(&dto_lock, irqflags);
+
+		rq_cq_reap(rdma);
+		sq_cq_reap(rdma);
+
+		svc_xprt_put(&rdma->sc_xprt);
+	}
+	/* The loop is only left with dto_lock held */
+	spin_unlock_irqrestore(&dto_lock, irqflags);
+}
+
+/*
+ * Receive Queue Completion Handler
+ *
+ * Since an RQ completion handler is called on interrupt context, the
+ * actual handling of the I/O is deferred to the DTO tasklet.
+ */
+static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct svcxprt_rdma *rdma = cq_context;
+	unsigned long irqflags;
+
+	/* Guard against unconditional flush call for destroyed QP */
+	if (!atomic_read(&rdma->sc_xprt.xpt_ref.refcount))
+		return;
+
+	/*
+	 * Always flag the RQ as pending: the transport may already be
+	 * queued because of an SQ completion.
+	 */
+	set_bit(RDMAXPRT_RQ_PENDING, &rdma->sc_flags);
+
+	/*
+	 * Queue the transport for the tasklet unless it is already
+	 * queued; the reference taken here is dropped by the tasklet.
+	 */
+	spin_lock_irqsave(&dto_lock, irqflags);
+	if (list_empty(&rdma->sc_dto_q)) {
+		svc_xprt_get(&rdma->sc_xprt);
+		list_add_tail(&rdma->sc_dto_q, &dto_xprt_q);
+	}
+	spin_unlock_irqrestore(&dto_lock, irqflags);
+
+	/* Tasklet does all the work to avoid irqsave locks. */
+	tasklet_schedule(&dto_tasklet);
+}
+
+/*
+ * rq_cq_reap - Process the RQ CQ.
+ *
+ * Take all completing WC off the CQE and enqueue the associated DTO
+ * context on the dto_q for the transport. A failed completion instead
+ * marks the transport for close and releases the context with its
+ * pages. One transport reference is dropped per completion reaped.
+ *
+ * Note that caller must hold a transport reference.
+ */
+static void rq_cq_reap(struct svcxprt_rdma *xprt)
+{
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+	struct ib_wc wc;
+
+	if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
+		return;
+
+	ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+	atomic_inc(&rdma_stat_rq_poll);
+
+	while (ib_poll_cq(xprt->sc_rq_cq, 1, &wc) > 0) {
+		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+		ctxt->wc_status = wc.status;
+		ctxt->byte_len = wc.byte_len;
+		svc_rdma_unmap_dma(ctxt);
+
+		if (wc.status == IB_WC_SUCCESS) {
+			spin_lock_bh(&xprt->sc_rq_dto_lock);
+			list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+			spin_unlock_bh(&xprt->sc_rq_dto_lock);
+		} else {
+			/* Close the transport */
+			dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			svc_rdma_put_context(ctxt, 1);
+		}
+		svc_xprt_put(&xprt->sc_xprt);
+	}
+
+	/* ctxt is only tested here, never dereferenced */
+	if (ctxt)
+		atomic_inc(&rdma_stat_rq_prod);
+
+	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+	/*
+	 * If data arrived before established event,
+	 * don't enqueue. This defers RPC I/O until the
+	 * RDMA connection is complete.
+	 */
+	if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+		svc_xprt_enqueue(&xprt->sc_xprt);
+}
+
+/*
+ * Process a completion context
+ *
+ * Unmaps the DMA mappings held by @ctxt, then releases it according to
+ * the work request type recorded in ctxt->wr_op.
+ */
+static void process_context(struct svcxprt_rdma *xprt,
+			    struct svc_rdma_op_ctxt *ctxt)
+{
+	svc_rdma_unmap_dma(ctxt);
+
+	switch (ctxt->wr_op) {
+	case IB_WR_SEND:
+		/* Reply sent: release the FRMR if one was used, then the
+		 * context together with its pages */
+		if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+			svc_rdma_put_frmr(xprt, ctxt->frmr);
+		svc_rdma_put_context(ctxt, 1);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+		/* Pages are not released with a write context */
+		svc_rdma_put_context(ctxt, 0);
+		break;
+
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_READ_WITH_INV:
+		/* The last read of a request queues its read header on
+		 * sc_read_complete_q and enqueues the transport */
+		if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+			struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+			BUG_ON(!read_hdr);
+			if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+				svc_rdma_put_frmr(xprt, ctxt->frmr);
+			spin_lock_bh(&xprt->sc_rq_dto_lock);
+			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+			list_add_tail(&read_hdr->dto_q,
+				      &xprt->sc_read_complete_q);
+			spin_unlock_bh(&xprt->sc_rq_dto_lock);
+			svc_xprt_enqueue(&xprt->sc_xprt);
+		}
+		svc_rdma_put_context(ctxt, 0);
+		break;
+
+	default:
+		/* NOTE(review): the context is not released on this path;
+		 * confirm whether that is intended */
+		printk(KERN_ERR "svcrdma: unexpected completion type, "
+		       "opcode=%d\n",
+		       ctxt->wr_op);
+		break;
+	}
+}
+
+/*
+ * Send Queue Completion Handler - potentially called on interrupt context.
+ *
+ * Reaps every available SQ completion. Each one frees an SQ slot, wakes
+ * anyone waiting on sc_send_wait, hands its context (if any) to
+ * process_context() and drops one transport reference. A failed
+ * completion also marks the transport for close.
+ *
+ * Note that caller must hold a transport reference.
+ */
+static void sq_cq_reap(struct svcxprt_rdma *xprt)
+{
+	struct ib_cq *sq_cq = xprt->sc_sq_cq;
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+	struct ib_wc wc;
+
+	if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+		return;
+
+	ib_req_notify_cq(sq_cq, IB_CQ_NEXT_COMP);
+	atomic_inc(&rdma_stat_sq_poll);
+
+	while (ib_poll_cq(sq_cq, 1, &wc) > 0) {
+		/* Close the transport on any failed completion */
+		if (wc.status != IB_WC_SUCCESS)
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+
+		/* Decrement used SQ WR count */
+		atomic_dec(&xprt->sc_sq_count);
+		wake_up(&xprt->sc_send_wait);
+
+		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+		if (ctxt != NULL)
+			process_context(xprt, ctxt);
+
+		svc_xprt_put(&xprt->sc_xprt);
+	}
+
+	/* ctxt is only tested here, never dereferenced */
+	if (ctxt != NULL)
+		atomic_inc(&rdma_stat_sq_prod);
+}
+
+/*
+ * Send Queue completion callback: flag the SQ as pending and defer the
+ * actual reaping to the DTO tasklet.
+ */
+static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct svcxprt_rdma *rdma = cq_context;
+	unsigned long irqflags;
+
+	/* Guard against unconditional flush call for destroyed QP */
+	if (!atomic_read(&rdma->sc_xprt.xpt_ref.refcount))
+		return;
+
+	/*
+	 * Always flag the SQ as pending: the transport may already be
+	 * queued because of an RQ completion.
+	 */
+	set_bit(RDMAXPRT_SQ_PENDING, &rdma->sc_flags);
+
+	/*
+	 * Queue the transport for the tasklet unless it is already
+	 * queued; the reference taken here is dropped by the tasklet.
+	 */
+	spin_lock_irqsave(&dto_lock, irqflags);
+	if (list_empty(&rdma->sc_dto_q)) {
+		svc_xprt_get(&rdma->sc_xprt);
+		list_add_tail(&rdma->sc_dto_q, &dto_xprt_q);
+	}
+	spin_unlock_irqrestore(&dto_lock, irqflags);
+
+	/* Tasklet does all the work to avoid irqsave locks. */
+	tasklet_schedule(&dto_tasklet);
+}
+
+/*
+ * Allocate and initialise an RDMA transport for @serv. Limits are
+ * taken from the svcrdma_* tunables. A non-zero @listener marks the
+ * transport as a listening endpoint. Returns NULL if the allocation
+ * fails.
+ */
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
+					     int listener)
+{
+	struct svcxprt_rdma *rdma;
+
+	rdma = kzalloc(sizeof *rdma, GFP_KERNEL);
+	if (!rdma)
+		return NULL;
+
+	svc_xprt_init(&svc_rdma_class, &rdma->sc_xprt, serv);
+
+	/* Queues and the wait queue for senders */
+	INIT_LIST_HEAD(&rdma->sc_accept_q);
+	INIT_LIST_HEAD(&rdma->sc_dto_q);
+	INIT_LIST_HEAD(&rdma->sc_rq_dto_q);
+	INIT_LIST_HEAD(&rdma->sc_read_complete_q);
+	INIT_LIST_HEAD(&rdma->sc_frmr_q);
+	init_waitqueue_head(&rdma->sc_send_wait);
+
+	/* Locks */
+	spin_lock_init(&rdma->sc_lock);
+	spin_lock_init(&rdma->sc_rq_dto_lock);
+	spin_lock_init(&rdma->sc_frmr_q_lock);
+
+	/* Limits and counters */
+	rdma->sc_ord = svcrdma_ord;
+	rdma->sc_max_req_size = svcrdma_max_req_size;
+	rdma->sc_max_requests = svcrdma_max_requests;
+	rdma->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
+	atomic_set(&rdma->sc_sq_count, 0);
+	atomic_set(&rdma->sc_ctxt_used, 0);
+
+	if (listener)
+		set_bit(XPT_LISTENER, &rdma->sc_xprt.xpt_flags);
+
+	return rdma;
+}
+
+/*
+ * Allocate one page. Never returns NULL: on failure, log a message,
+ * sleep and try again. The message says "1000 jiffies", while the
+ * delay is actually computed from 1000 milliseconds.
+ */
+struct page *svc_rdma_get_page(void)
+{
+	struct page *pg;
+
+	for (;;) {
+		pg = alloc_page(GFP_KERNEL);
+		if (pg != NULL)
+			break;
+		/* If we can't get memory, wait a bit and try again */
+		printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
+		       "jiffies.\n");
+		schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+	}
+	return pg;
+}
+
+/*
+ * Post one receive work request on @xprt, backed by enough freshly
+ * allocated, DMA-mapped pages to hold sc_max_req_size bytes.
+ *
+ * A transport reference is taken for the posted WR; rq_cq_reap() drops
+ * it when the completion is reaped, and it is dropped here if the post
+ * itself fails.
+ *
+ * Returns 0 on success, -ENOMEM if a page could not be DMA-mapped, or
+ * the error returned by ib_post_recv(). On failure the context and all
+ * pages allocated here are released.
+ */
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+{
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct svc_rdma_op_ctxt *ctxt;
+	struct page *page;
+	dma_addr_t pa;
+	int sge_no;
+	int buflen;
+	int ret;
+
+	ctxt = svc_rdma_get_context(xprt);
+	buflen = 0;
+	ctxt->direction = DMA_FROM_DEVICE;
+	for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
+		BUG_ON(sge_no >= xprt->sc_max_sge);
+		page = svc_rdma_get_page();
+		ctxt->pages[sge_no] = page;
+		pa = ib_dma_map_page(xprt->sc_cm_id->device,
+				     page, 0, PAGE_SIZE,
+				     DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) {
+			/*
+			 * ctxt->count does not cover this slot yet, so
+			 * the common error path would never release
+			 * this page. Drop it here.
+			 */
+			put_page(page);
+			goto err_put_ctxt;
+		}
+		atomic_inc(&xprt->sc_dma_used);
+		ctxt->sge[sge_no].addr = pa;
+		ctxt->sge[sge_no].length = PAGE_SIZE;
+		ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+		/* Slots [0, count) are now fully mapped and owned by ctxt */
+		ctxt->count = sge_no + 1;
+		buflen += PAGE_SIZE;
+	}
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &ctxt->sge[0];
+	recv_wr.num_sge = ctxt->count;
+	recv_wr.wr_id = (u64)(unsigned long)ctxt;
+
+	svc_xprt_get(&xprt->sc_xprt);
+	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
+	if (ret) {
+		svc_rdma_unmap_dma(ctxt);
+		svc_rdma_put_context(ctxt, 1);
+		svc_xprt_put(&xprt->sc_xprt);
+	}
+	return ret;
+
+ err_put_ctxt:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_context(ctxt, 1);
+	return -ENOMEM;
+}
+
+/*
+ * This function handles the CONNECT_REQUEST event on a listening
+ * endpoint. It is passed the cma_id for the _new_ connection. The context in
+ * this cma_id is inherited from the listening cma_id and is the transport
+ * structure for the listening endpoint.
+ *
+ * This function creates a new xprt for the new connection and enqueues it on
+ * the accept queue for the listen xprt. When the listen thread is kicked, it
+ * will call the recvfrom method on the listen xprt which will accept the new
+ * connection.
+ */
+static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+{
+	struct svcxprt_rdma *listener = new_cma_id->context;
+	struct svcxprt_rdma *conn;
+	struct sockaddr *addr;
+
+	/* Create a new transport */
+	conn = rdma_create_xprt(listener->sc_xprt.xpt_server, 0);
+	if (conn == NULL) {
+		dprintk("svcrdma: failed to create new transport\n");
+		return;
+	}
+
+	/* Tie the new transport and the new cm_id to each other */
+	conn->sc_cm_id = new_cma_id;
+	new_cma_id->context = conn;
+	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
+		conn, new_cma_id, listener);
+
+	/* Save client advertised inbound read limit for use later in accept. */
+	conn->sc_ord = client_ird;
+
+	/* Set the local and remote addresses in the transport */
+	addr = (struct sockaddr *)&new_cma_id->route.addr.dst_addr;
+	svc_xprt_set_remote(&conn->sc_xprt, addr, svc_addr_len(addr));
+	addr = (struct sockaddr *)&new_cma_id->route.addr.src_addr;
+	svc_xprt_set_local(&conn->sc_xprt, addr, svc_addr_len(addr));
+
+	/*
+	 * Enqueue the new transport on the accept queue of the listening
+	 * transport
+	 */
+	spin_lock_bh(&listener->sc_lock);
+	list_add_tail(&conn->sc_accept_q, &listener->sc_accept_q);
+	spin_unlock_bh(&listener->sc_lock);
+
+	/*
+	 * Can't use svc_xprt_received here because we are not on a
+	 * rqstp thread
+	 */
+	set_bit(XPT_CONN, &listener->sc_xprt.xpt_flags);
+	svc_xprt_enqueue(&listener->sc_xprt);
+}
+
+/*
+ * Handles events generated on the listening endpoint. These events will
+ * be either incoming connect requests or adapter removal events; any
+ * other event is only logged. Always returns 0.
+ */
+static int rdma_listen_handler(struct rdma_cm_id *cma_id,
+			       struct rdma_cm_event *event)
+{
+	struct svcxprt_rdma *listener = cma_id->context;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
+			"event=%d\n", cma_id, cma_id->context, event->event);
+		handle_connect_req(cma_id,
+				   event->param.conn.initiator_depth);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		/* Accept complete */
+		dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
+			"cm_id=%p\n", listener, cma_id);
+		break;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
+			listener, cma_id);
+		if (listener != NULL)
+			set_bit(XPT_CLOSE, &listener->sc_xprt.xpt_flags);
+		break;
+
+	default:
+		dprintk("svcrdma: Unexpected event on listening endpoint %p, "
+			"event=%d\n", cma_id, event->event);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Handles connection manager events on a data transfer (non-listening)
+ * endpoint. Always returns 0.
+ *
+ * NOTE(review): handle_connect_req() stores a struct svcxprt_rdma
+ * pointer in cma_id->context, while it is read back here as a struct
+ * svc_xprt pointer; this presumably relies on sc_xprt being the first
+ * member of struct svcxprt_rdma - confirm.
+ */
+static int rdma_cma_handler(struct rdma_cm_id *cma_id,
+			    struct rdma_cm_event *event)
+{
+	struct svc_xprt *svc = cma_id->context;
+	struct svcxprt_rdma *rdma =
+		container_of(svc, struct svcxprt_rdma, sc_xprt);
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ESTABLISHED:
+		/* Accept complete; this reference is dropped on disconnect */
+		svc_xprt_get(svc);
+		dprintk("svcrdma: Connection completed on DTO xprt=%p, "
+			"cm_id=%p\n", svc, cma_id);
+		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
+		svc_xprt_enqueue(svc);
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
+			svc, cma_id);
+		if (!svc)
+			break;
+		set_bit(XPT_CLOSE, &svc->xpt_flags);
+		svc_xprt_enqueue(svc);
+		svc_xprt_put(svc);
+		break;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
+			"event=%d\n", cma_id, svc, event->event);
+		if (!svc)
+			break;
+		set_bit(XPT_CLOSE, &svc->xpt_flags);
+		svc_xprt_enqueue(svc);
+		break;
+
+	default:
+		dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
+			"event=%d\n", cma_id, event->event);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Create a listening RDMA service endpoint bound to @sa.
+ *
+ * Only AF_INET addresses are accepted. Returns the new transport, or an
+ * ERR_PTR: -EAFNOSUPPORT for other address families, -ENOMEM if the
+ * transport cannot be allocated, or the error from the RDMA CM call
+ * that failed.
+ */
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
+					struct sockaddr *sa, int salen,
+					int flags)
+{
+	struct svcxprt_rdma *listener;
+	struct rdma_cm_id *cm_id;
+	struct svc_xprt *svc;
+	int rc;
+
+	dprintk("svcrdma: Creating RDMA socket\n");
+	if (sa->sa_family != AF_INET) {
+		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+
+	listener = rdma_create_xprt(serv, 1);
+	if (listener == NULL)
+		return ERR_PTR(-ENOMEM);
+	svc = &listener->sc_xprt;
+
+	cm_id = rdma_create_id(rdma_listen_handler, listener, RDMA_PS_TCP,
+			       IB_QPT_RC);
+	if (IS_ERR(cm_id)) {
+		rc = PTR_ERR(cm_id);
+		dprintk("svcrdma: rdma_create_id failed = %d\n", rc);
+		goto out_free_xprt;
+	}
+
+	rc = rdma_bind_addr(cm_id, sa);
+	if (rc) {
+		dprintk("svcrdma: rdma_bind_addr failed = %d\n", rc);
+		goto out_destroy_id;
+	}
+	listener->sc_cm_id = cm_id;
+
+	rc = rdma_listen(cm_id, RPCRDMA_LISTEN_BACKLOG);
+	if (rc) {
+		dprintk("svcrdma: rdma_listen failed = %d\n", rc);
+		goto out_destroy_id;
+	}
+
+	/*
+	 * We need to use the address from the cm_id in case the
+	 * caller specified 0 for the port number.
+	 */
+	sa = (struct sockaddr *)&listener->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(svc, sa, salen);
+
+	return svc;
+
+ out_destroy_id:
+	rdma_destroy_id(cm_id);
+ out_free_xprt:
+	kfree(listener);
+	return ERR_PTR(rc);
+}
+
+/* Allocate one fastreg MR and its page list; ERR_PTR(-ENOMEM) on failure */
+static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
+{
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *pl;
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = kzalloc(sizeof(*frmr), GFP_KERNEL);	/* lengths start at 0 */
+	if (!frmr)
+		goto err;
+
+	mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+	if (IS_ERR(mr))
+		goto err_free_frmr;
+	pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
+					 RPCSVC_MAXPAGES);
+	if (IS_ERR(pl))
+		goto err_free_mr;
+
+	frmr->mr = mr;
+	frmr->page_list = pl;
+	INIT_LIST_HEAD(&frmr->frmr_list);
+	return frmr;
+
+ err_free_mr:
+	ib_dereg_mr(mr);
+ err_free_frmr:
+	kfree(frmr);
+ err:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
+{
+	struct list_head *q = &xprt->sc_frmr_q;
+	struct svc_rdma_fastreg_mr *fr;
+
+	while (!list_empty(q)) {	/* drain the cached fastreg MRs */
+		fr = list_entry(q->next, struct svc_rdma_fastreg_mr, frmr_list);
+		list_del_init(&fr->frmr_list);
+		ib_dereg_mr(fr->mr);
+		ib_free_fast_reg_page_list(fr->page_list);
+		kfree(fr);
+	}
+}
+
+struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_fastreg_mr *cached = NULL;
+	struct list_head *q = &rdma->sc_frmr_q;
+
+	/* Prefer a recycled MR; its mapping lengths are reset on the way out */
+	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	if (!list_empty(q)) {
+		cached = list_entry(q->next, struct svc_rdma_fastreg_mr,
+				    frmr_list);
+		list_del_init(&cached->frmr_list);
+		cached->map_len = 0;
+		cached->page_list_len = 0;
+	}
+	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	/* Queue was empty: fall back to allocating a new MR */
+	return cached ? cached : rdma_alloc_frmr(rdma);
+}
+
+static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
+			   struct svc_rdma_fastreg_mr *frmr)
+{
+	int idx;	/* walks the MR's page list; unmapped slots are skipped */
+	for (idx = 0; idx < frmr->page_list_len; idx++) {
+		dma_addr_t dma = frmr->page_list->page_list[idx];
+		if (!ib_dma_mapping_error(frmr->mr->device, dma)) {
+			atomic_dec(&xprt->sc_dma_used);
+			ib_dma_unmap_page(frmr->mr->device, dma, PAGE_SIZE,
+					  frmr->direction);
+		}
+	}
+}
+
+void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
+		       struct svc_rdma_fastreg_mr *frmr)
+{
+	if (!frmr)	/* nothing to give back */
+		return;
+	frmr_unmap_dma(rdma, frmr);
+	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	BUG_ON(!list_empty(&frmr->frmr_list));	/* must not be queued yet */
+	list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
+	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+}
+
+/*
+ * This is the xpo_recvfrom function for listening endpoints. Its
+ * purpose is to accept incoming connections. The CMA callback handler
+ * has already created a new transport and attached it to the new CMA
+ * ID.
+ *
+ * There is a queue of pending connections hung on the listening
+ * transport. This queue contains the new svc_xprt structure. This
+ * function takes svc_xprt structures off the accept_q and completes
+ * the connection.
+ */
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *listen_rdma;
+	struct svcxprt_rdma *newxprt = NULL;
+	struct rdma_conn_param conn_param;
+	struct ib_qp_init_attr qp_attr;
+	struct ib_device_attr devattr;
+	int uninitialized_var(dma_mr_acc);
+	int need_dma_mr;
+	int ret;
+	int i;
+
+	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	clear_bit(XPT_CONN, &xprt->xpt_flags);
+	/* Get the next entry off the accept list */
+	spin_lock_bh(&listen_rdma->sc_lock);
+	if (!list_empty(&listen_rdma->sc_accept_q)) {
+		newxprt = list_entry(listen_rdma->sc_accept_q.next,
+				     struct svcxprt_rdma, sc_accept_q);
+		list_del_init(&newxprt->sc_accept_q);
+	}
+	if (!list_empty(&listen_rdma->sc_accept_q))
+		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
+	spin_unlock_bh(&listen_rdma->sc_lock);
+	if (!newxprt)
+		return NULL;
+
+	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
+		newxprt, newxprt->sc_cm_id);
+
+	ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
+	if (ret) {
+		dprintk("svcrdma: could not query device attributes on "
+			"device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
+		goto errout;
+	}
+
+	/* Qualify the transport resource defaults with the
+	 * capabilities of this particular device */
+	newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+				  (size_t)RPCSVC_MAXPAGES);
+	newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
+				   (size_t)svcrdma_max_requests);
+	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+
+	/*
+	 * Limit ORD based on client limit, local device limit, and
+	 * configured svcrdma limit.
+	 */
+	newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+	newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
+
+	newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+	if (IS_ERR(newxprt->sc_pd)) {
+		dprintk("svcrdma: error creating PD for connect request\n");
+		goto errout;
+	}
+	newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+					 sq_comp_handler,
+					 cq_event_handler,
+					 newxprt,
+					 newxprt->sc_sq_depth,
+					 0);
+	if (IS_ERR(newxprt->sc_sq_cq)) {
+		dprintk("svcrdma: error creating SQ CQ for connect request\n");
+		goto errout;
+	}
+	newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+					 rq_comp_handler,
+					 cq_event_handler,
+					 newxprt,
+					 newxprt->sc_max_requests,
+					 0);
+	if (IS_ERR(newxprt->sc_rq_cq)) {
+		dprintk("svcrdma: error creating RQ CQ for connect request\n");
+		goto errout;
+	}
+
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = qp_event_handler;
+	qp_attr.qp_context = &newxprt->sc_xprt;
+	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
+	qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
+	qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = newxprt->sc_sq_cq;
+	qp_attr.recv_cq = newxprt->sc_rq_cq;
+	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
+		"    cm_id->device=%p, sc_pd->device=%p\n"
+		"    cap.max_send_wr = %d\n"
+		"    cap.max_recv_wr = %d\n"
+		"    cap.max_send_sge = %d\n"
+		"    cap.max_recv_sge = %d\n",
+		newxprt->sc_cm_id, newxprt->sc_pd,
+		newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+		qp_attr.cap.max_send_wr,
+		qp_attr.cap.max_recv_wr,
+		qp_attr.cap.max_send_sge,
+		qp_attr.cap.max_recv_sge);
+
+	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
+	if (ret) {
+		/*
+		 * XXX: This is a hack. We need a xx_request_qp interface
+		 * that will adjust the qp_attr's with a best-effort
+		 * number
+		 */
+		qp_attr.cap.max_send_sge -= 2;
+		qp_attr.cap.max_recv_sge -= 2;
+		ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
+				     &qp_attr);
+		if (ret) {
+			dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+			goto errout;
+		}
+		/* send and recv SGE limits were lowered together */
+		newxprt->sc_max_sge = qp_attr.cap.max_recv_sge;
+		newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
+		newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
+	}
+	newxprt->sc_qp = newxprt->sc_cm_id->qp;
+
+	/*
+	 * Use the most secure set of MR resources based on the
+	 * transport type and available memory management features in
+	 * the device. Here's the table implemented below:
+	 *
+	 *		Fast	Global	DMA	Remote WR
+	 *		Reg	LKEY	MR	Access
+	 *		Sup'd	Sup'd	Needed	Needed
+	 *
+	 * IWARP	N	N	Y	Y
+	 *		N	Y	Y	Y
+	 *		Y	N	Y	N
+	 *		Y	Y	N	-
+	 *
+	 * IB		N	N	Y	N
+	 *		N	Y	N	-
+	 *		Y	N	Y	N
+	 *		Y	Y	N	-
+	 *
+	 * NB:	iWARP requires remote write access for the data sink
+	 *	of an RDMA_READ. IB does not.
+	 */
+	if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		newxprt->sc_frmr_pg_list_len =
+			devattr.max_fast_reg_page_list_len;
+		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+	}
+
+	/*
+	 * Determine if a DMA MR is required and if so, what privs are required
+	 */
+	switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
+	case RDMA_TRANSPORT_IWARP:
+		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+			need_dma_mr = 1;
+			dma_mr_acc =
+				(IB_ACCESS_LOCAL_WRITE |
+				 IB_ACCESS_REMOTE_WRITE);
+		} else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else
+			need_dma_mr = 0;
+		break;
+	case RDMA_TRANSPORT_IB:
+		if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else
+			need_dma_mr = 0;
+		break;
+	default:
+		goto errout;
+	}
+
+	/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
+	if (need_dma_mr) {
+		/* Register all of physical memory */
+		newxprt->sc_phys_mr =
+			ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
+		if (IS_ERR(newxprt->sc_phys_mr)) {
+			ret = PTR_ERR(newxprt->sc_phys_mr);
+			dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
+			goto errout;
+		}
+		newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
+	} else
+		newxprt->sc_dma_lkey =
+			newxprt->sc_cm_id->device->local_dma_lkey;
+
+	/* Post receive buffers */
+	for (i = 0; i < newxprt->sc_max_requests; i++) {
+		ret = svc_rdma_post_recv(newxprt);
+		if (ret) {
+			dprintk("svcrdma: failure posting receive buffers\n");
+			goto errout;
+		}
+	}
+
+	/* Swap out the handler */
+	newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+
+	/*
+	 * Arm the CQs for the SQ and RQ before accepting so we can't
+	 * miss the first message
+	 */
+	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+
+	/* Accept Connection */
+	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 0;
+	conn_param.initiator_depth = newxprt->sc_ord;
+	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
+	if (ret) {
+		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
+		       ret);
+		goto errout;
+	}
+
+	dprintk("svcrdma: new connection %p accepted with the following "
+		"attributes:\n"
+		"    local_ip        : %pI4\n"
+		"    local_port	     : %d\n"
+		"    remote_ip       : %pI4\n"
+		"    remote_port     : %d\n"
+		"    max_sge         : %d\n"
+		"    sq_depth        : %d\n"
+		"    max_requests    : %d\n"
+		"    ord             : %d\n",
+		newxprt,
+		&((struct sockaddr_in *)&newxprt->sc_cm_id->
+			 route.addr.src_addr)->sin_addr.s_addr,
+		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+		       route.addr.src_addr)->sin_port),
+		&((struct sockaddr_in *)&newxprt->sc_cm_id->
+			 route.addr.dst_addr)->sin_addr.s_addr,
+		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+		       route.addr.dst_addr)->sin_port),
+		newxprt->sc_max_sge,
+		newxprt->sc_sq_depth,
+		newxprt->sc_max_requests,
+		newxprt->sc_ord);
+
+	return &newxprt->sc_xprt;
+
+ errout:
+	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
+	/* Take a reference in case the DTO handler runs */
+	svc_xprt_get(&newxprt->sc_xprt);
+	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
+		ib_destroy_qp(newxprt->sc_qp);
+	rdma_destroy_id(newxprt->sc_cm_id);
+	/* This call to put will destroy the transport */
+	svc_xprt_put(&newxprt->sc_xprt);
+	return NULL;
+}
+
+static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
+{	/* Empty: nothing is released per request */
+}
+
+/*
+ * When connected, an svc_xprt has at least two references:
+ *
+ * - A reference held by the cm_id between the ESTABLISHED and
+ *   DISCONNECTED events. If the remote peer disconnected first, this
+ *   reference could be gone.
+ *
+ * - A reference held by the svc_recv code that called this function
+ *   as part of close processing.
+ *
+ * At a minimum, one reference should still be held.
+ */
+static void svc_rdma_detach(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	dprintk("svc: svc_rdma_detach(%p)\n", xprt);
+
+	/* Disconnect and flush posted WQE */
+	rdma_disconnect(rdma->sc_cm_id);
+}
+
+static void __svc_rdma_free(struct work_struct *work)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(work, struct svcxprt_rdma, sc_work);
+	dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+
+	/* We should only be called from kref_put: no reference may remain */
+	BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
+
+	/*
+	 * Destroy queued, but not processed read completions. Note
+	 * that this cleanup has to be done before destroying the
+	 * cm_id because the device ptr is needed to unmap the dma in
+	 * svc_rdma_put_context.
+	 */
+	while (!list_empty(&rdma->sc_read_complete_q)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(rdma->sc_read_complete_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		svc_rdma_put_context(ctxt, 1);
+	}
+
+	/* Destroy queued, but not processed recv completions */
+	while (!list_empty(&rdma->sc_rq_dto_q)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(rdma->sc_rq_dto_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		svc_rdma_put_context(ctxt, 1);
+	}
+
+	/* Warn if we leaked a resource or under-referenced */
+	WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
+	WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+
+	/* De-allocate the fastreg MRs still cached on sc_frmr_q */
+	rdma_dealloc_frmr_q(rdma);
+
+	/* Destroy the QP if present (not a listener) */
+	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+		ib_destroy_qp(rdma->sc_qp);
+
+	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
+		ib_destroy_cq(rdma->sc_sq_cq);
+
+	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
+		ib_destroy_cq(rdma->sc_rq_cq);
+
+	if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
+		ib_dereg_mr(rdma->sc_phys_mr);
+
+	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
+		ib_dealloc_pd(rdma->sc_pd);
+
+	/* Destroy the CM ID (after the queues above have been drained) */
+	rdma_destroy_id(rdma->sc_cm_id);
+
+	kfree(rdma);
+}
+
+static void svc_rdma_free(struct svc_xprt *xprt)
+{	/* Hand the real teardown (__svc_rdma_free) to svc_rdma_wq */
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
+	queue_work(svc_rdma_wq, &rdma->sc_work);
+}
+
+static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	/*
+	 * Write space is reported only when both of these hold:
+	 *
+	 *  - at least three SQ work requests are still free, which is
+	 *    what sending a simple response requires; and
+	 *
+	 *  - nobody is already waiting on sc_send_wait for SQ space.
+	 *
+	 * The waiter check is skipped when the SQ check already fails.
+	 */
+	if (rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) >= 3 &&
+	    !waitqueue_active(&rdma->sc_send_wait))
+		return 1;
+
+	/* Otherwise there is no write space. */
+	return 0;
+}
+
+/*
+ * Post a FAST_REG_MR work request that registers the page list held
+ * in @frmr with the device.
+ *
+ * The low byte of the MR's lkey is incremented before every
+ * registration.
+ *
+ * Returns whatever svc_rdma_send() returns: 0 once the request has been
+ * posted, -ENOTCONN if the transport is closing, else the post error.
+ */
+int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
+		     struct svc_rdma_fastreg_mr *frmr)
+{
+	struct ib_send_wr fastreg_wr;
+	u8 key;
+
+	/* Bump the key */
+	key = (u8)(frmr->mr->lkey & 0x000000FF);
+	ib_update_fast_reg_key(frmr->mr, ++key);
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof fastreg_wr);
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = frmr->map_len;
+	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+	return svc_rdma_send(xprt, &fastreg_wr);
+}
+
+int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
+{	/* Post the WR chain at @wr, sleeping while the SQ lacks room */
+	struct ib_send_wr *bad_wr, *n_wr;
+	int wr_count;
+	int i;
+	int ret;
+
+	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+		return -ENOTCONN;
+
+	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);	/* signaled WRs only */
+	wr_count = 1;
+	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
+		wr_count++;
+
+	/* If the SQ is full, wait until an SQ entry is available */
+	while (1) {
+		spin_lock_bh(&xprt->sc_lock);
+		if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
+			spin_unlock_bh(&xprt->sc_lock);
+			atomic_inc(&rdma_stat_sq_starve);
+
+			/* See if we can opportunistically reap SQ WR to make room */
+			sq_cq_reap(xprt);
+
+			/* Wait until SQ WR available if SQ still full */
+			wait_event(xprt->sc_send_wait,
+				   atomic_read(&xprt->sc_sq_count) <
+				   xprt->sc_sq_depth);
+			if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+				return -ENOTCONN;
+			continue;
+		}
+		/* Take a transport ref for each WR posted */
+		for (i = 0; i < wr_count; i++)
+			svc_xprt_get(&xprt->sc_xprt);
+
+		/* Bump used SQ WR count and post */
+		atomic_add(wr_count, &xprt->sc_sq_count);
+		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
+		if (ret) {
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			atomic_sub(wr_count, &xprt->sc_sq_count);
+			for (i = 0; i < wr_count; i ++)
+				svc_xprt_put(&xprt->sc_xprt);
+			dprintk("svcrdma: failed to post SQ WR rc=%d, "
+			       "sc_sq_count=%d, sc_sq_depth=%d\n",
+			       ret, atomic_read(&xprt->sc_sq_count),
+			       xprt->sc_sq_depth);
+		}
+		spin_unlock_bh(&xprt->sc_lock);
+		if (ret)
+			wake_up(&xprt->sc_send_wait);	/* waiters see XPT_CLOSE */
+		break;
+	}
+	return ret;
+}
+
+void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+			 enum rpcrdma_errcode err)
+{
+	struct ib_send_wr err_wr;
+	struct page *p;
+	struct svc_rdma_op_ctxt *ctxt;
+	u32 *va;
+	int length;
+	int ret;
+
+	p = svc_rdma_get_page();
+	va = page_address(p);
+
+	/* XDR encode error */
+	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
+
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->direction = DMA_FROM_DEVICE;
+	ctxt->count = 1;
+	ctxt->pages[0] = p;
+
+	/* Prepare SGE for local address */
+	ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
+					    p, 0, length, DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
+		put_page(p);
+		svc_rdma_put_context(ctxt, 1);
+		return;
+	}
+	atomic_inc(&xprt->sc_dma_used);
+	ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+	ctxt->sge[0].length = length;
+
+	/* Prepare SEND WR */
+	memset(&err_wr, 0, sizeof err_wr);
+	ctxt->wr_op = IB_WR_SEND;
+	err_wr.wr_id = (unsigned long)ctxt;
+	err_wr.sg_list = ctxt->sge;
+	err_wr.num_sge = 1;
+	err_wr.opcode = IB_WR_SEND;
+	err_wr.send_flags = IB_SEND_SIGNALED;
+
+	/* Post It */
+	ret = svc_rdma_send(xprt, &err_wr);
+	if (ret) {
+		dprintk("svcrdma: Error %d posting send for protocol error\n",
+			ret);
+		svc_rdma_unmap_dma(ctxt);
+		svc_rdma_put_context(ctxt, 1);
+	}
+}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 00000000..0867070b
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,780 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include "xprt_rdma.h"
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
+MODULE_AUTHOR("Network Appliance, Inc.");
+
+/*
+ * Tunables; under RPC_DEBUG each is also exposed via xr_tunables_table.
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_inline_write_padding;
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+                int xprt_rdma_pad_optimize = 0;	/* the only non-static one */
+
+#ifdef RPC_DEBUG
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int zero;	/* lower bound: rdma_inline_write_padding */
+static unsigned int max_padding = PAGE_SIZE;	/* and its upper bound */
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
+static unsigned int max_memreg = RPCRDMA_LAST - 1;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+static ctl_table xr_tunables_table[] = {
+	{
+		.procname	= "rdma_slot_table_entries",
+		.data		= &xprt_rdma_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.procname	= "rdma_max_inline_read",
+		.data		= &xprt_rdma_max_inline_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,	/* no range limits */
+	},
+	{
+		.procname	= "rdma_max_inline_write",
+		.data		= &xprt_rdma_max_inline_write,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,	/* no range limits */
+	},
+	{
+		.procname	= "rdma_inline_write_padding",
+		.data		= &xprt_rdma_inline_write_padding,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &max_padding,
+	},
+	{
+		.procname	= "rdma_memreg_strategy",
+		.data		= &xprt_rdma_memreg_strategy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_memreg,
+		.extra2		= &max_memreg,
+	},
+	{
+		.procname	= "rdma_pad_optimize",
+		.data		= &xprt_rdma_pad_optimize,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,	/* no range limits */
+	},
+	{ },	/* empty terminating entry */
+};
+
+static ctl_table sunrpc_table[] = {
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= xr_tunables_table	/* the rdma_* entries */
+	},
+	{ },	/* empty terminating entry */
+};
+
+#endif
+
+static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
+
+static void
+xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr *addr = (struct sockaddr *)&rpcx_to_rdmad(xprt).addr;
+	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+	char text[64];
+
+	/* Fixed strings: these two slots are never heap-allocated */
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
+	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+
+	/* Presentation-format address and decimal port */
+	(void)rpc_ntop(addr, text, sizeof(text));
+	xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(text, GFP_KERNEL);
+
+	snprintf(text, sizeof(text), "%u", rpc_get_port(addr));
+	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(text, GFP_KERNEL);
+
+	/* Hexadecimal address and port */
+	snprintf(text, sizeof(text), "%08x", ntohl(addr4->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(text, GFP_KERNEL);
+
+	snprintf(text, sizeof(text), "%4hx", rpc_get_port(addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(text, GFP_KERNEL);
+}
+
+static void
+xprt_rdma_free_addresses(struct rpc_xprt *xprt)
+{
+	unsigned int idx;
+
+	/*
+	 * The PROTO and NETID slots are set to string literals, so they
+	 * are skipped; every other slot is handed to kfree().
+	 */
+	for (idx = 0; idx < RPC_DISPLAY_MAX; idx++) {
+		if (idx != RPC_DISPLAY_PROTO && idx != RPC_DISPLAY_NETID)
+			kfree(xprt->address_strings[idx]);
+	}
+}
+
+static void
+xprt_rdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_xprt *rdma =
+		container_of(work, struct rpcrdma_xprt, rdma_connect.work);
+	struct rpc_xprt *xprt = &rdma->xprt;
+	int err;
+
+	/*
+	 * No connect attempt is made for a transport that is shutting
+	 * down. Pending tasks are woken from here only when an attempt
+	 * fails; the "connecting" state is dropped in every case.
+	 */
+	if (!xprt->shutdown) {
+		xprt_clear_connected(xprt);
+
+		dprintk("RPC:       %s: %sconnect\n", __func__,
+				rdma->rx_ep.rep_connected != 0 ? "re" : "");
+		err = rpcrdma_ep_connect(&rdma->rx_ep, &rdma->rx_ia);
+		if (err)
+			xprt_wake_pending_tasks(xprt, err);
+	}
+
+	dprintk("RPC:       %s: exit\n", __func__);
+	xprt_clear_connecting(xprt);
+}
+
+/*
+ * xprt_rdma_destroy
+ *
+ * Destroy the xprt.
+ * Free all memory associated with the object, including its own.
+ * NOTE: none of the *destroy methods free memory for their top-level
+ * objects, even though they may have allocated it (they do free
+ * private memory). It's up to the caller to handle it. In this
+ * case (RDMA transport), all structure memory is inlined with the
+ * struct rpcrdma_xprt.
+ */
+static void
+xprt_rdma_destroy(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int rc;
+
+	dprintk("RPC:       %s: called\n", __func__);
+
+	cancel_delayed_work_sync(&r_xprt->rdma_connect);
+
+	xprt_clear_connected(xprt);
+
+	rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+	rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	if (rc)
+		dprintk("RPC:       %s: rpcrdma_ep_destroy returned %i\n",
+			__func__, rc);
+	rpcrdma_ia_close(&r_xprt->rx_ia);
+
+	xprt_rdma_free_addresses(xprt);
+
+	xprt_free(xprt);	/* frees what xprt_alloc() returned in setup */
+
+	dprintk("RPC:       %s: returning\n", __func__);
+
+	module_put(THIS_MODULE);	/* pairs with try_module_get() in setup */
+}
+
+static const struct rpc_timeout xprt_rdma_default_timeout = {
+	.to_initval = 60 * HZ,	/* 60 seconds */
+	.to_maxval = 60 * HZ,	/* same as the initial value */
+};
+
+/**
+ * xprt_setup_rdma - Set up transport to use RDMA
+ *
+ * @args: rpc transport arguments
+ *
+ * Returns the new transport, or an ERR_PTR() if any setup step fails.
+ */
+static struct rpc_xprt *
+xprt_setup_rdma(struct xprt_create *args)
+{
+	struct rpcrdma_create_data_internal cdata;
+	struct rpc_xprt *xprt;
+	struct rpcrdma_xprt *new_xprt;
+	struct rpcrdma_ep *new_ep;
+	struct sockaddr_in *sin;
+	int rc;
+
+	if (args->addrlen > sizeof(xprt->addr)) {
+		dprintk("RPC:       %s: address too large\n", __func__);
+		return ERR_PTR(-EBADF);
+	}
+
+	xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
+			xprt_rdma_slot_table_entries);
+	if (xprt == NULL) {
+		dprintk("RPC:       %s: couldn't allocate rpcrdma_xprt\n",
+			__func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* 60 second timeout, no retries */
+	xprt->timeout = &xprt_rdma_default_timeout;
+	xprt->bind_timeout = (60U * HZ);
+	xprt->reestablish_timeout = (5U * HZ);
+	xprt->idle_timeout = (5U * 60 * HZ);
+
+	xprt->resvport = 0;		/* privileged port not needed */
+	xprt->tsh_size = 0;		/* RPC-RDMA handles framing */
+	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
+	xprt->ops = &xprt_rdma_procs;
+
+	/* Set up RDMA-specific connect data. */
+
+	/* Put server RDMA address in local cdata */
+	memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+
+	/* Ensure xprt->addr holds valid server TCP (not RDMA)
+	 * address, for any side protocols which peek at it */
+	xprt->prot = IPPROTO_TCP;
+	xprt->addrlen = args->addrlen;
+	memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+
+	sin = (struct sockaddr_in *)&cdata.addr;
+	if (ntohs(sin->sin_port) != 0)
+		xprt_set_bound(xprt);
+
+	dprintk("RPC:       %s: %pI4:%u\n",
+		__func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
+
+	/* Set max requests */
+	cdata.max_requests = xprt->max_reqs;
+
+	/* Set some length limits */
+	cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
+	cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
+
+	cdata.inline_wsize = xprt_rdma_max_inline_write;
+	if (cdata.inline_wsize > cdata.wsize)
+		cdata.inline_wsize = cdata.wsize;
+
+	cdata.inline_rsize = xprt_rdma_max_inline_read;
+	if (cdata.inline_rsize > cdata.rsize)
+		cdata.inline_rsize = cdata.rsize;
+
+	cdata.padding = xprt_rdma_inline_write_padding;
+
+	/*
+	 * Create new transport instance, which includes initialized
+	 *  o ia
+	 *  o endpoint
+	 *  o buffers
+	 */
+
+	new_xprt = rpcx_to_rdmax(xprt);
+
+	rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
+				xprt_rdma_memreg_strategy);
+	if (rc)
+		goto out1;
+
+	/* initialize and create ep */
+	new_xprt->rx_data = cdata;
+	new_ep = &new_xprt->rx_ep;
+	new_ep->rep_remote_addr = cdata.addr;
+
+	rc = rpcrdma_ep_create(&new_xprt->rx_ep,
+				&new_xprt->rx_ia, &new_xprt->rx_data);
+	if (rc)
+		goto out2;
+
+	/*
+	 * Allocate pre-registered send and receive buffers for headers and
+	 * any inline data. Also specify any padding which will be provided
+	 * from a preregistered zero buffer.
+	 */
+	rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
+				&new_xprt->rx_data);
+	if (rc)
+		goto out3;
+
+	/*
+	 * Register a callback for connection events. This is necessary because
+	 * connection loss notification is async. We also catch connection loss
+	 * when reaping receives.
+	 */
+	INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
+	new_ep->rep_func = rpcrdma_conn_func;
+	new_ep->rep_xprt = xprt;
+
+	xprt_rdma_format_addresses(xprt);
+
+	if (!try_module_get(THIS_MODULE))
+		goto out4;
+
+	return xprt;
+
+out4:
+	xprt_rdma_free_addresses(xprt);
+	/* The buffers were created above; release them before the ep */
+	rpcrdma_buffer_destroy(&new_xprt->rx_buf);
+	rc = -EINVAL;
+out3:
+	(void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+out2:
+	rpcrdma_ia_close(&new_xprt->rx_ia);
+out1:
+	xprt_free(xprt);
+	return ERR_PTR(rc);
+}
+
+/* Close a connection, during shutdown or timeout/reconnect */
+static void
+xprt_rdma_close(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *rdma = rpcx_to_rdmax(xprt);
+	struct rpcrdma_ep *ep = &rdma->rx_ep;
+
+	dprintk("RPC:       %s: closing\n", __func__);
+	/* Was connected: clear the delay used for the next reconnect */
+	if (ep->rep_connected > 0)
+		xprt->reestablish_timeout = 0;
+	xprt_disconnect_done(xprt);
+	(void) rpcrdma_ep_disconnect(ep, &rdma->rx_ia);
+}
+
+static void
+xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
+{
+	__be16 nport = htons(port);
+
+	/* Update both the generic and the RDMA-private copy of the address */
+	((struct sockaddr_in *)&xprt->addr)->sin_port = nport;
+	((struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr)->sin_port = nport;
+
+	dprintk("RPC:       %s: %u\n", __func__, port);
+}
+
+/*
+ * Kick the connect worker. With no connection state recorded yet
+ * (rep_connected == 0) the worker runs immediately, and synchronous
+ * tasks wait for it. Otherwise this is a reconnect: the worker is
+ * delayed by reestablish_timeout, which is then doubled and kept
+ * within [5 * HZ, 30 * HZ].
+ */
+static void
+xprt_rdma_connect(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt;
+	struct rpcrdma_xprt *rdma = rpcx_to_rdmax(xprt);
+
+	if (rdma->rx_ep.rep_connected == 0) {
+		schedule_delayed_work(&rdma->rdma_connect, 0);
+		if (!RPC_IS_ASYNC(task))
+			flush_delayed_work(&rdma->rdma_connect);
+		return;
+	}
+
+	/* Reconnect */
+	schedule_delayed_work(&rdma->rdma_connect, xprt->reestablish_timeout);
+
+	/* Exponential back-off for the next attempt */
+	xprt->reestablish_timeout <<= 1;
+	if (xprt->reestablish_timeout > (30 * HZ))
+		xprt->reestablish_timeout = (30 * HZ);
+	else if (xprt->reestablish_timeout < (5 * HZ))
+		xprt->reestablish_timeout = (5 * HZ);
+}
+
+/*
+ * Reserve the transport for a task. The congestion window is derived
+ * from the current credit count before the generic congestion-aware
+ * reserve routine is called.
+ */
+static int
+xprt_rdma_reserve_xprt(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_xprt *rdma = rpcx_to_rdmax(xprt);
+	int granted = atomic_read(&rdma->rx_buf.rb_credits);
+
+	/* == RPC_CWNDSCALE @ init, but *after* setup */
+	if (rdma->rx_buf.rb_cwndscale == 0UL) {
+		/* first use: remember the initial cwnd as the per-credit scale */
+		rdma->rx_buf.rb_cwndscale = xprt->cwnd;
+		dprintk("RPC:       %s: cwndscale %lu\n", __func__,
+			rdma->rx_buf.rb_cwndscale);
+		BUG_ON(rdma->rx_buf.rb_cwndscale <= 0);
+	}
+
+	xprt->cwnd = granted * rdma->rx_buf.rb_cwndscale;
+
+	return xprt_reserve_xprt_cong(task);
+}
+
+/*
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+ * sequence. For this reason, the recv buffers are attached to send
+ * buffers for portions of the RPC. Note that the RPC layer allocates
+ * both send and receive buffers in the same call. We may register
+ * the receive buffer portion when using reply chunks.
+ *
+ * Returns a pointer to the request's rl_xdr_buf, or NULL when the
+ * oversized-request reallocation or its registration fails (the pooled
+ * request is put back in that case).
+ */
+static void *
+xprt_rdma_allocate(struct rpc_task *task, size_t size)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_req *req, *nreq;
+
+	/* Take a request from this transport's buffer pool */
+	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+	BUG_ON(NULL == req);
+
+	if (size > req->rl_size) {
+		dprintk("RPC:       %s: size %zd too large for buffer[%zd]: "
+			"prog %d vers %d proc %d\n",
+			__func__, size, req->rl_size,
+			task->tk_client->cl_prog, task->tk_client->cl_vers,
+			task->tk_msg.rpc_proc->p_proc);
+		/*
+		 * Outgoing length shortage. Our inline write max must have
+		 * been configured to perform direct i/o.
+		 *
+		 * This is therefore a large metadata operation, and the
+		 * allocate call was made on the maximum possible message,
+		 * e.g. containing long filename(s) or symlink data. In
+		 * fact, while these metadata operations *might* carry
+		 * large outgoing payloads, they rarely *do*. However, we
+		 * have to commit to the request here, so reallocate and
+		 * register it now. The data path will never require this
+		 * reallocation.
+		 *
+		 * If the allocation or registration fails, the RPC framework
+		 * will (doggedly) retry.
+		 */
+		if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
+				RPCRDMA_BOUNCEBUFFERS) {
+			/* forced to "pure inline" */
+			dprintk("RPC:       %s: too much data (%zd) for inline "
+					"(r/w max %d/%d)\n", __func__, size,
+					rpcx_to_rdmad(xprt).inline_rsize,
+					rpcx_to_rdmad(xprt).inline_wsize);
+			size = req->rl_size;
+			rpc_exit(task, -EIO);		/* fail the operation */
+			rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+			/* the task has been failed, but the pooled buffer is still returned */
+			goto out;
+		}
+		/* GFP_ATOMIC for RPC_TASK_SWAPPER tasks, GFP_NOFS for all others */
+		if (task->tk_flags & RPC_TASK_SWAPPER)
+			nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
+		else
+			nreq = kmalloc(sizeof *req + size, GFP_NOFS);
+		if (nreq == NULL)
+			goto outfail;
+
+		/* Register everything from rl_base to the end of the allocation */
+		if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
+				nreq->rl_base, size + sizeof(struct rpcrdma_req)
+				- offsetof(struct rpcrdma_req, rl_base),
+				&nreq->rl_handle, &nreq->rl_iov)) {
+			kfree(nreq);
+			goto outfail;
+		}
+		/* note: this statistic accumulates bytes, not a number of events */
+		rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
+		nreq->rl_size = size;
+		nreq->rl_niovs = 0;
+		nreq->rl_nchunks = 0;
+		/*
+		 * rl_buffer of the replacement does NOT point at a
+		 * struct rpcrdma_buffer: it stashes the pooled request so
+		 * that xprt_rdma_free() can find it and put it back.
+		 */
+		nreq->rl_buffer = (struct rpcrdma_buffer *)req;
+		/* the reply buffer moves from the pooled request to the replacement */
+		nreq->rl_reply = req->rl_reply;
+		memcpy(nreq->rl_segments,
+			req->rl_segments, sizeof nreq->rl_segments);
+		/* flag the swap with an unused field */
+		nreq->rl_iov.length = 0;
+		req->rl_reply = NULL;
+		req = nreq;
+	}
+	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
+out:
+	req->rl_connect_cookie = 0;	/* our reserved value */
+	return req->rl_xdr_buf;
+
+outfail:
+	/* req is still the pooled request here; nreq was never installed */
+	rpcrdma_buffer_put(req);
+	rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+	return NULL;
+}
+
+/*
+ * This function returns all RDMA resources to the pool.
+ *
+ * @buffer is the rl_xdr_buf pointer handed out by xprt_rdma_allocate();
+ * NULL is accepted and ignored. If the request is an oversized
+ * replacement (marked by rl_iov.length == 0), it is deregistered and
+ * kfree()d, and the pooled request it replaced is the one put back.
+ */
+static void
+xprt_rdma_free(void *buffer)
+{
+	struct rpcrdma_req *req;
+	struct rpcrdma_xprt *r_xprt;
+	struct rpcrdma_rep *rep;
+	int i;
+
+	if (buffer == NULL)
+		return;
+
+	/* Recover the request that embeds this xdr buffer */
+	req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
+	if (req->rl_iov.length == 0) {	/* see allocate above */
+		/*
+		 * Replacement request: rl_buffer holds the pooled request,
+		 * whose own rl_buffer leads to the transport.
+		 */
+		r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer,
+				      struct rpcrdma_xprt, rx_buf);
+	} else
+		r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
+	rep = req->rl_reply;
+
+	dprintk("RPC:       %s: called on 0x%p%s\n",
+		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+
+	/*
+	 * Finish the deregistration. When using mw bind, this was
+	 * begun in rpcrdma_reply_handler(). In all other modes, we
+	 * do it here, in thread context. The process is considered
+	 * complete when the rr_func vector becomes NULL - this
+	 * was put in place during rpcrdma_reply_handler() - the wait
+	 * call below will not block if the dereg is "done". If
+	 * interrupted, our framework will clean up.
+	 */
+	/*
+	 * One pass per remaining chunk; the return value of
+	 * rpcrdma_deregister_external() advances the segment index.
+	 */
+	for (i = 0; req->rl_nchunks;) {
+		--req->rl_nchunks;
+		i += rpcrdma_deregister_external(
+			&req->rl_segments[i], r_xprt, NULL);
+	}
+
+	/* A non-zero return means the wait was interrupted: detach the reply */
+	if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
+		rep->rr_func = NULL;	/* abandon the callback */
+		req->rl_reply = NULL;
+	}
+
+	if (req->rl_iov.length == 0) {	/* see allocate above */
+		/* Undo the swap: give the reply back to the pooled request */
+		struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
+		oreq->rl_reply = req->rl_reply;
+		(void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
+						   req->rl_handle,
+						   &req->rl_iov);
+		kfree(req);
+		req = oreq;
+	}
+
+	/* Put back request+reply buffers */
+	rpcrdma_buffer_put(req);
+}
+
+/*
+ * send_request invokes the meat of RPC RDMA. It must do the following:
+ *  1.  Marshal the RPC request into an RPC RDMA request, which means
+ *	putting a header in front of data, and creating IOVs for RDMA
+ *	from those in the request.
+ *  2.  In marshaling, detect opportunities for RDMA, and use them.
+ *  3.  Post a recv message to set up asynch completion, then send
+ *	the request (rpcrdma_ep_post).
+ *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
+ */
+
+/*
+ * Returns 0 once the request has been posted, -EIO if marshaling
+ * fails, or -ENOTCONN after signalling a disconnect (retransmit on the
+ * same connection, or a failed post).
+ */
+static int
+xprt_rdma_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	/* marshal the send itself */
+	/* only while rl_niovs is still 0 (presumably: not marshaled yet) */
+	if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
+		r_xprt->rx_stats.failed_marshal_count++;
+		dprintk("RPC:       %s: rpcrdma_marshal_req failed\n",
+			__func__);
+		return -EIO;
+	}
+
+	if (req->rl_reply == NULL) 		/* e.g. reconnection */
+		rpcrdma_recv_buffer_get(req);
+
+	/* rl_reply may still be NULL here; the send proceeds without one */
+	if (req->rl_reply) {
+		req->rl_reply->rr_func = rpcrdma_reply_handler;
+		/* this need only be done once, but... */
+		req->rl_reply->rr_xprt = xprt;
+	}
+
+	/* Must suppress retransmit to maintain credits */
+	/*
+	 * rl_connect_cookie is zeroed in xprt_rdma_allocate() and set to
+	 * the connection's cookie just below, so a match means this
+	 * request was already sent on the current connection.
+	 */
+	if (req->rl_connect_cookie == xprt->connect_cookie)
+		goto drop_connection;
+	req->rl_connect_cookie = xprt->connect_cookie;
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+		goto drop_connection;
+
+	rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
+	rqst->rq_bytes_sent = 0;
+	return 0;
+
+drop_connection:
+	xprt_disconnect_done(xprt);
+	return -ENOTCONN;	/* implies disconnect */
+}
+
+/*
+ * Emit one "xprt: rdma ..." statistics line: first the generic
+ * transport counters, then the RDMA-specific ones.
+ */
+static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct rpcrdma_xprt *rdma = rpcx_to_rdmax(xprt);
+	/* seconds since last use; reported as 0 while not connected */
+	long idle_secs = xprt_connected(xprt) ?
+			(long)(jiffies - xprt->last_used) / HZ : 0;
+
+	seq_printf(seq,
+	  "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
+	  "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
+
+	   /* generic transport statistics */
+	   0,	/* need a local port? */
+	   xprt->stat.bind_count,
+	   xprt->stat.connect_count,
+	   xprt->stat.connect_time,
+	   idle_secs,
+	   xprt->stat.sends,
+	   xprt->stat.recvs,
+	   xprt->stat.bad_xids,
+	   xprt->stat.req_u,
+	   xprt->stat.bklog_u,
+
+	   /* RDMA transport statistics */
+	   rdma->rx_stats.read_chunk_count,
+	   rdma->rx_stats.write_chunk_count,
+	   rdma->rx_stats.reply_chunk_count,
+	   rdma->rx_stats.total_rdma_request,
+	   rdma->rx_stats.total_rdma_reply,
+	   rdma->rx_stats.pullup_copy_count,
+	   rdma->rx_stats.fixup_copy_count,
+	   rdma->rx_stats.hardway_register_count,
+	   rdma->rx_stats.failed_marshal_count,
+	   rdma->rx_stats.bad_reply_count);
+}
+
+/*
+ * Plumbing for rpc transport switch and kernel module
+ */
+
+/*
+ * Method table for the RDMA transport. Entries named xprt_rdma_* are
+ * implemented in this file; the others are generic helpers from the
+ * files noted on each line.
+ */
+static struct rpc_xprt_ops xprt_rdma_procs = {
+	.reserve_xprt		= xprt_rdma_reserve_xprt,
+	.release_xprt		= xprt_release_xprt_cong, /* sunrpc/xprt.c */
+	.release_request	= xprt_release_rqst_cong,       /* ditto */
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def, /* ditto */
+	.rpcbind		= rpcb_getport_async,	/* sunrpc/rpcb_clnt.c */
+	.set_port		= xprt_rdma_set_port,
+	.connect		= xprt_rdma_connect,
+	.buf_alloc		= xprt_rdma_allocate,
+	.buf_free		= xprt_rdma_free,
+	.send_request		= xprt_rdma_send_request,
+	.close			= xprt_rdma_close,
+	.destroy		= xprt_rdma_destroy,
+	.print_stats		= xprt_rdma_print_stats
+};
+
+/*
+ * Transport class descriptor for "rdma". It is registered in
+ * xprt_rdma_init() and unregistered in xprt_rdma_cleanup(); .setup
+ * names the constructor, xprt_setup_rdma().
+ */
+static struct xprt_class xprt_rdma = {
+	.list			= LIST_HEAD_INIT(xprt_rdma.list),
+	.name			= "rdma",
+	.owner			= THIS_MODULE,
+	.ident			= XPRT_TRANSPORT_RDMA,
+	.setup			= xprt_setup_rdma,
+};
+
+/*
+ * Module exit: drop the debug sysctl table (RPC_DEBUG builds only) and
+ * unregister the "rdma" transport class. A failure to unregister is
+ * only logged.
+ */
+static void __exit xprt_rdma_cleanup(void)
+{
+	int err;
+
+	dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+#ifdef RPC_DEBUG
+	if (sunrpc_table_header != NULL) {
+		unregister_sysctl_table(sunrpc_table_header);
+		sunrpc_table_header = NULL;
+	}
+#endif
+	err = xprt_unregister_transport(&xprt_rdma);
+	if (err == 0)
+		return;
+
+	dprintk("RPC:       %s: xprt_unregister returned %i\n",
+		__func__, err);
+}
+
+/*
+ * Module init: register the "rdma" transport class, log the default
+ * tunables, and (RPC_DEBUG builds only) register the sysctl table if
+ * it is not registered yet. Returns the registration error, else 0.
+ */
+static int __init xprt_rdma_init(void)
+{
+	int err = xprt_register_transport(&xprt_rdma);
+
+	if (err != 0)
+		return err;
+
+	dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
+
+	/* dump the tunable defaults */
+	dprintk(KERN_INFO "Defaults:\n");
+	dprintk(KERN_INFO "\tSlots %d\n"
+		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+		xprt_rdma_slot_table_entries,
+		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+	dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
+		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+
+#ifdef RPC_DEBUG
+	if (sunrpc_table_header == NULL)
+		sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+	return 0;
+}
+
+module_init(xprt_rdma_init);
+module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 00000000..80f8da34
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,1952 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * verbs.c
+ *
+ * Encapsulates the major functions managing:
+ *  o adapters
+ *  o endpoints
+ *  o connections
+ *  o buffer memory
+ */
+
+#include <linux/pci.h>	/* for Tavor hack below */
+#include <linux/slab.h>
+
+#include "xprt_rdma.h"
+
+/*
+ * Globals/Macros
+ */
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/*
+ * internal functions
+ */
+
+/*
+ * handle replies in tasklet context, using a single, global list
+ * rdma tasklet function -- just turn around and call the func
+ * for all replies on the list
+ */
+
+/* Global list of replies awaiting tasklet processing, and its lock */
+static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
+static LIST_HEAD(rpcrdma_tasklets_g);
+
+/*
+ * Tasklet body: drain the global reply list. Each reply is unlinked
+ * and its rr_func cleared under the lock; the lock is dropped while
+ * the handler runs. A reply without a handler has its receive buffer
+ * put back instead.
+ */
+static void
+rpcrdma_run_tasklet(unsigned long data)
+{
+	unsigned long irqflags;
+
+	(void)data;	/* unused */
+
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, irqflags);
+	for (;;) {
+		struct rpcrdma_rep *rep;
+		void (*handler)(struct rpcrdma_rep *);
+
+		if (list_empty(&rpcrdma_tasklets_g))
+			break;
+
+		rep = list_entry(rpcrdma_tasklets_g.next,
+				 struct rpcrdma_rep, rr_list);
+		list_del(&rep->rr_list);
+		handler = rep->rr_func;
+		rep->rr_func = NULL;
+		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, irqflags);
+
+		if (handler != NULL)
+			handler(rep);
+		else
+			rpcrdma_recv_buffer_put(rep);
+
+		spin_lock_irqsave(&rpcrdma_tk_lock_g, irqflags);
+	}
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, irqflags);
+}
+
+static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
+
+/*
+ * Append a reply to the tail of the global tasklet list and schedule
+ * the tasklet that drains it.
+ */
+static inline void
+rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, irqflags);
+	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, irqflags);
+
+	tasklet_schedule(&rpcrdma_tasklet_g);
+}
+
+/*
+ * Common reaction to an asynchronous QP or CQ error: if the endpoint
+ * is currently connected, mark it failed with -EIO, run its connection
+ * callback and wake everybody waiting on the connect state.
+ */
+static void
+rpcrdma_ep_async_error(struct rpcrdma_ep *ep)
+{
+	if (ep->rep_connected != 1)
+		return;
+
+	ep->rep_connected = -EIO;
+	ep->rep_func(ep);
+	wake_up_all(&ep->rep_connect_wait);
+}
+
+/* Asynchronous QP event handler; context is the owning endpoint */
+static void
+rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	rpcrdma_ep_async_error(context);
+}
+
+/* Asynchronous CQ event handler; context is the owning endpoint */
+static void
+rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+{
+	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	rpcrdma_ep_async_error(context);
+}
+
+/*
+ * Process a single work completion.
+ *
+ * wc->wr_id carries a pointer chosen when the work request was posted:
+ * NULL for completions nobody cares about, a struct rpcrdma_mw for
+ * FAST_REG_MR / LOCAL_INV work requests, and a struct rpcrdma_rep
+ * otherwise.
+ *
+ * Failed completions mark the reply with rr_len = ~0U and hand it to
+ * the tasklet. Successful receives record the length, sync the buffer
+ * for the CPU, update the credit count from the message header and
+ * hand the reply to the tasklet.
+ */
+static inline
+void rpcrdma_event_process(struct ib_wc *wc)
+{
+	struct rpcrdma_mw *frmr;
+	struct rpcrdma_rep *rep =
+			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;
+
+	dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
+		__func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+	if (!rep) /* send or bind completion that we don't care about */
+		return;
+
+	if (IB_WC_SUCCESS != wc->status) {
+		dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
+			__func__, wc->opcode, wc->status);
+		/*
+		 * For FAST_REG_MR and LOCAL_INV the wr_id is a
+		 * struct rpcrdma_mw, not a reply: it must neither be
+		 * written through as a rep nor queued for the tasklet.
+		 */
+		if (wc->opcode != IB_WC_FAST_REG_MR &&
+		    wc->opcode != IB_WC_LOCAL_INV) {
+			rep->rr_len = ~0U;
+			rpcrdma_schedule_tasklet(rep);
+		}
+		return;
+	}
+
+	switch (wc->opcode) {
+	case IB_WC_FAST_REG_MR:
+		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+		frmr->r.frmr.state = FRMR_IS_VALID;
+		break;
+	case IB_WC_LOCAL_INV:
+		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+		frmr->r.frmr.state = FRMR_IS_INVALID;
+		break;
+	case IB_WC_RECV:
+		rep->rr_len = wc->byte_len;
+		ib_dma_sync_single_for_cpu(
+			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+		/* Keep (only) the most recent credits, after checking validity */
+		if (rep->rr_len >= 16) {
+			struct rpcrdma_msg *p =
+					(struct rpcrdma_msg *) rep->rr_base;
+			unsigned int credits = ntohl(p->rm_credit);
+			if (credits == 0) {
+				dprintk("RPC:       %s: server"
+					" dropped credits to 0!\n", __func__);
+				/* don't deadlock */
+				credits = 1;
+			} else if (credits > rep->rr_buffer->rb_max_requests) {
+				/* never accept more credits than we have requests */
+				dprintk("RPC:       %s: server"
+					" over-crediting: %d (%d)\n",
+					__func__, credits,
+					rep->rr_buffer->rb_max_requests);
+				credits = rep->rr_buffer->rb_max_requests;
+			}
+			atomic_set(&rep->rr_buffer->rb_credits, credits);
+		}
+		/* fall through */
+	case IB_WC_BIND_MW:
+		rpcrdma_schedule_tasklet(rep);
+		break;
+	default:
+		dprintk("RPC:       %s: unexpected WC event %X\n",
+			__func__, wc->opcode);
+		break;
+	}
+}
+
+/*
+ * Reap completions from @cq one at a time and process each of them.
+ * Returns 0 once the queue is empty, or the negative value returned by
+ * ib_poll_cq() on failure.
+ */
+static inline int
+rpcrdma_cq_poll(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int rc;
+
+	while ((rc = ib_poll_cq(cq, 1, &wc)) > 0)
+		rpcrdma_event_process(&wc);
+
+	if (rc < 0) {
+		dprintk("RPC:       %s: ib_poll_cq failed %i\n",
+			__func__, rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * rpcrdma_cq_event_upcall
+ *
+ * This upcall handles recv, send, bind and unbind events.
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+ * It is the responsibility of the scheduled tasklet to return
+ * recv buffers to the pool. NOTE: this affects synchronization of
+ * connection shutdown. That is, the structures required for
+ * the completion of the reply handler must remain intact until
+ * all memory has been reclaimed.
+ *
+ * Note that send events are suppressed and do not result in an upcall.
+ */
+static void
+rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
+{
+	int rc;
+
+	/* Drain what is already queued; give up if polling fails */
+	if (rpcrdma_cq_poll(cq) != 0)
+		return;
+
+	/* Re-arm the completion notification */
+	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (rc != 0) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
+			__func__, rc);
+		return;
+	}
+
+	/*
+	 * Poll once more after re-arming (presumably to pick up
+	 * completions that arrived in between).
+	 */
+	rpcrdma_cq_poll(cq);
+}
+
+#ifdef RPC_DEBUG
+/*
+ * Debug-only names for connection manager events, indexed by
+ * event->event in rpcrdma_conn_upcall(). The table has 12 entries
+ * (indices 0..11), matching the "<= 11" bound checked there.
+ */
+static const char * const conn[] = {
+	"address resolved",
+	"address error",
+	"route resolved",
+	"route error",
+	"connect request",
+	"connect response",
+	"connect error",
+	"unreachable",
+	"rejected",
+	"established",
+	"disconnected",
+	"device removal"
+};
+#endif
+
+/*
+ * Connection manager event handler, installed by rpcrdma_create_id();
+ * id->context is the owning struct rpcrdma_xprt.
+ *
+ * Address/route resolution events store their outcome in
+ * ia->ri_async_rc and complete ia->ri_done. Connection events store the
+ * new state in ep->rep_connected (1 when established, a negative errno
+ * otherwise), reset the credit count to 1, run the endpoint callback
+ * and wake waiters. Always returns 0.
+ */
+static int
+rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct rpcrdma_xprt *xprt = id->context;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+	struct rpcrdma_ep *ep = &xprt->rx_ep;
+#ifdef RPC_DEBUG
+	/* only referenced from debug output; treated as an IPv4 address */
+	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+#endif
+	struct ib_qp_attr attr;
+	struct ib_qp_init_attr iattr;
+	int connstate = 0;	/* stays 0 for events that do not change the connection */
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		ia->ri_async_rc = 0;
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		ia->ri_async_rc = -EHOSTUNREACH;
+		dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		ia->ri_async_rc = -ENETUNREACH;
+		dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		connstate = 1;
+		/*
+		 * NOTE(review): the return value of ib_query_qp() is not
+		 * checked; attr is only used for the messages below.
+		 */
+		ib_query_qp(ia->ri_id->qp, &attr,
+			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
+			&iattr);
+		dprintk("RPC:       %s: %d responder resources"
+			" (%d initiator)\n",
+			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
+		goto connected;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		connstate = -ENOTCONN;
+		goto connected;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		connstate = -ENETDOWN;
+		goto connected;
+	case RDMA_CM_EVENT_REJECTED:
+		connstate = -ECONNREFUSED;
+		goto connected;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		connstate = -ECONNABORTED;
+		goto connected;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		connstate = -ENODEV;
+		/* shared tail for every event that changes the connection state */
+connected:
+		dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
+			__func__,
+			(event->event <= 11) ? conn[event->event] :
+						"unknown connection error",
+			&addr->sin_addr.s_addr,
+			ntohs(addr->sin_port),
+			ep, event->event);
+		/* any connection state change resets the credit count to 1 */
+		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
+		dprintk("RPC:       %s: %sconnected\n",
+					__func__, connstate > 0 ? "" : "dis");
+		ep->rep_connected = connstate;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+		break;
+	default:
+		dprintk("RPC:       %s: unexpected CM event %d\n",
+			__func__, event->event);
+		break;
+	}
+
+#ifdef RPC_DEBUG
+	if (connstate == 1) {
+		/* flag a responder depth that is below 4 and below half of what was requested */
+		int ird = attr.max_dest_rd_atomic;
+		int tird = ep->rep_remote_cma.responder_resources;
+		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
+			"on %s, memreg %d slots %d ird %d%s\n",
+			&addr->sin_addr.s_addr,
+			ntohs(addr->sin_port),
+			ia->ri_id->device->name,
+			ia->ri_memreg_strategy,
+			xprt->rx_buf.rb_max_requests,
+			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+	} else if (connstate < 0) {
+		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
+			&addr->sin_addr.s_addr,
+			ntohs(addr->sin_port),
+			connstate);
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Create a connection manager id for @xprt and resolve first the
+ * address @addr, then the route to it.
+ *
+ * Both resolution steps complete asynchronously: rpcrdma_conn_upcall()
+ * stores the outcome in ia->ri_async_rc and completes ia->ri_done.
+ *
+ * Returns the new id, or an ERR_PTR() on failure (the id is destroyed
+ * before returning in that case).
+ */
+static struct rdma_cm_id *
+rpcrdma_create_id(struct rpcrdma_xprt *xprt,
+			struct rpcrdma_ia *ia, struct sockaddr *addr)
+{
+	struct rdma_cm_id *id;
+	int rc;
+
+	init_completion(&ia->ri_done);
+
+	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(id)) {
+		rc = PTR_ERR(id);
+		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
+			__func__, rc);
+		return id;
+	}
+
+	/*
+	 * Pre-load the result with -ETIMEDOUT: the return value of the
+	 * wait below is ignored, so this is what gets reported if the
+	 * wait ends before the upcall has stored an outcome.
+	 */
+	ia->ri_async_rc = -ETIMEDOUT;
+	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion_interruptible_timeout(&ia->ri_done,
+				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	/* Same protocol for route resolution */
+	ia->ri_async_rc = -ETIMEDOUT;
+	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion_interruptible_timeout(&ia->ri_done,
+				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	return id;
+
+out:
+	rdma_destroy_id(id);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Drain any cq, prior to teardown.
+ */
+static void
+rpcrdma_clean_cq(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int flushed;
+
+	/* Discard completions until the poll stops returning exactly one */
+	for (flushed = 0; ib_poll_cq(cq, 1, &wc) == 1; flushed++)
+		;
+
+	/* wc is only valid here if at least one completion was reaped */
+	if (flushed != 0)
+		dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
+			__func__, flushed, wc.opcode);
+}
+
+/*
+ * Exported functions.
+ */
+
+/*
+ * Open and initialize an Interface Adapter.
+ *  o initializes fields of struct rpcrdma_ia, including
+ *    interface and provider attributes and protection zone.
+ *
+ * xprt is the transport whose rx_ia is set up, addr the remote address
+ * to resolve, and memreg the requested memory registration strategy,
+ * which is downgraded if the device cannot support it.
+ *
+ * Returns 0 on success or a negative errno. On failure everything
+ * acquired here (CM id and protection domain) is released again.
+ */
+int
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+{
+	int rc, mem_priv;
+	struct ib_device_attr devattr;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+
+	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
+	if (IS_ERR(ia->ri_id)) {
+		rc = PTR_ERR(ia->ri_id);
+		goto out1;
+	}
+
+	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+	if (IS_ERR(ia->ri_pd)) {
+		rc = PTR_ERR(ia->ri_pd);
+		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	/*
+	 * Query the device to determine if the requested memory
+	 * registration strategy is supported. If it isn't, set the
+	 * strategy to a globally supported model.
+	 */
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		goto out3;
+	}
+
+	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+		ia->ri_have_dma_lkey = 1;
+		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+	}
+
+	/* Step 1: downgrade the strategy if the device lacks the capability */
+	switch (memreg) {
+	case RPCRDMA_MEMWINDOWS:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+			dprintk("RPC:       %s: MEMWINDOWS registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
+		}
+		break;
+	case RPCRDMA_MTHCAFMR:
+		if (!ia->ri_id->device->alloc_fmr) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+			dprintk("RPC:       %s: MTHCAFMR registration "
+				"specified but not supported by adapter, "
+				"using riskier RPCRDMA_ALLPHYSICAL\n",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+#else
+			dprintk("RPC:       %s: MTHCAFMR registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
+#endif
+		}
+		break;
+	case RPCRDMA_FRMR:
+		/* Requires both frmr reg and local dma lkey */
+		if ((devattr.device_cap_flags &
+		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+			dprintk("RPC:       %s: FRMR registration "
+				"specified but not supported by adapter, "
+				"using riskier RPCRDMA_ALLPHYSICAL\n",
+				__func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+#else
+			dprintk("RPC:       %s: FRMR registration "
+				"specified but not supported by adapter, "
+				"using slower RPCRDMA_REGISTER\n",
+				__func__);
+			memreg = RPCRDMA_REGISTER;
+#endif
+		}
+		break;
+	}
+
+	/*
+	 * Optionally obtain an underlying physical identity mapping in
+	 * order to do a memory window-based bind. This base registration
+	 * is protected from remote access - that is enabled only by binding
+	 * for the specific bytes targeted during each RPC operation, and
+	 * revoked after the corresponding completion similar to a storage
+	 * adapter.
+	 */
+	switch (memreg) {
+	case RPCRDMA_BOUNCEBUFFERS:
+	case RPCRDMA_REGISTER:
+	case RPCRDMA_FRMR:
+		break;
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+		mem_priv = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_WRITE |
+				IB_ACCESS_REMOTE_READ;
+		goto register_setup;
+#endif
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		mem_priv = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_MW_BIND;
+		goto register_setup;
+	case RPCRDMA_MTHCAFMR:
+		/* the device's own DMA lkey makes a DMA MR unnecessary */
+		if (ia->ri_have_dma_lkey)
+			break;
+		mem_priv = IB_ACCESS_LOCAL_WRITE;
+	register_setup:
+		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+		if (IS_ERR(ia->ri_bind_mem)) {
+			/* not fatal: fall back to per-chunk registration */
+			printk(KERN_ALERT "%s: ib_get_dma_mr for "
+				"phys register failed with %lX\n\t"
+				"Will continue with degraded performance\n",
+				__func__, PTR_ERR(ia->ri_bind_mem));
+			memreg = RPCRDMA_REGISTER;
+			ia->ri_bind_mem = NULL;
+		}
+		break;
+	default:
+		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
+				__func__, memreg);
+		rc = -EINVAL;
+		goto out3;
+	}
+	dprintk("RPC:       %s: memory registration strategy is %d\n",
+		__func__, memreg);
+
+	/* Else will do memory reg/dereg for each chunk */
+	ia->ri_memreg_strategy = memreg;
+
+	return 0;
+out3:
+	/* the PD was allocated successfully: release it, don't leak it */
+	(void) ib_dealloc_pd(ia->ri_pd);
+	ia->ri_pd = NULL;
+out2:
+	rdma_destroy_id(ia->ri_id);
+	ia->ri_id = NULL;
+out1:
+	return rc;
+}
+
+/*
+ * Clean up/close an IA.
+ *   o if event handles and PD have been initialized, free them.
+ *   o close the IA
+ */
+void
+rpcrdma_ia_close(struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering\n", __func__);
+
+	/* DMA memory region, if one was obtained */
+	if (ia->ri_bind_mem) {
+		rc = ib_dereg_mr(ia->ri_bind_mem);
+		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
+			__func__, rc);
+	}
+
+	/* CM id, together with any QP still attached to it */
+	if (ia->ri_id && !IS_ERR(ia->ri_id)) {
+		struct rdma_cm_id *id = ia->ri_id;
+
+		if (id->qp != NULL)
+			rdma_destroy_qp(id);
+		rdma_destroy_id(id);
+		ia->ri_id = NULL;
+	}
+
+	/* protection domain goes last */
+	if (ia->ri_pd && !IS_ERR(ia->ri_pd)) {
+		rc = ib_dealloc_pd(ia->ri_pd);
+		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
+			__func__, rc);
+	}
+}
+
+/*
+ * Create unconnected endpoint.
+ *
+ * Fills in the QP attributes in ep->rep_attr, creates the completion
+ * queue shared by sends and receives, and prepares the connection
+ * parameters in ep->rep_remote_cma. No QP is created here.
+ *
+ * cdata->max_requests may be reduced to fit the device's limits.
+ * Returns 0 on success or a negative errno.
+ */
+int
+rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+				struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr devattr;
+	int rc, err;
+
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		return rc;
+	}
+
+	/* check provider's send/recv wr limits */
+	if (cdata->max_requests > devattr.max_qp_wr)
+		cdata->max_requests = devattr.max_qp_wr;
+
+	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+	ep->rep_attr.qp_context = ep;
+	/* send_cq and recv_cq initialized below */
+	ep->rep_attr.srq = NULL;
+	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		/* Add room for frmr register and invalidate WRs.
+		 * 1. FRMR reg WR for head
+		 * 2. FRMR invalidate WR for head
+		 * 3. FRMR reg WR for pagelist
+		 * 4. FRMR invalidate WR for pagelist
+		 * 5. FRMR reg WR for tail
+		 * 6. FRMR invalidate WR for tail
+		 * 7. The RDMA_SEND WR
+		 */
+		ep->rep_attr.cap.max_send_wr *= 7;
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+			/* shrink the request count until 7 WRs per request fit */
+			cdata->max_requests = devattr.max_qp_wr / 7;
+			if (!cdata->max_requests)
+				return -EINVAL;
+			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+		}
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/* Add room for mw_binds+unbinds - overkill! */
+		ep->rep_attr.cap.max_send_wr++;
+		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
+		/* unlike FRMR, no attempt is made to scale down here */
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_recv_sge = 1;
+	ep->rep_attr.cap.max_inline_data = 0;
+	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	ep->rep_attr.qp_type = IB_QPT_RC;
+	ep->rep_attr.port_num = ~0;
+
+	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
+		"iovs: send %d recv %d\n",
+		__func__,
+		ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr,
+		ep->rep_attr.cap.max_send_sge,
+		ep->rep_attr.cap.max_recv_sge);
+
+	/* set trigger for requesting send completion */
+	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
+		break;
+	default:
+		break;
+	}
+	/* a trigger of 2 or less is replaced by 0 */
+	if (ep->rep_cqinit <= 2)
+		ep->rep_cqinit = 0;
+	INIT_CQCOUNT(ep);
+	ep->rep_ia = ia;
+	init_waitqueue_head(&ep->rep_connect_wait);
+
+	/*
+	 * Create a single cq for receive dto and mw_bind (only ever
+	 * care about unbind, really). Send completions are suppressed.
+	 * Use single threaded tasklet upcalls to maintain ordering.
+	 */
+	/* sized for every send and receive WR, plus one */
+	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
+				  rpcrdma_cq_async_error_upcall, NULL,
+				  ep->rep_attr.cap.max_recv_wr +
+				  ep->rep_attr.cap.max_send_wr + 1, 0);
+	if (IS_ERR(ep->rep_cq)) {
+		rc = PTR_ERR(ep->rep_cq);
+		dprintk("RPC:       %s: ib_create_cq failed: %i\n",
+			__func__, rc);
+		goto out1;
+	}
+
+	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	/* the same CQ serves both directions */
+	ep->rep_attr.send_cq = ep->rep_cq;
+	ep->rep_attr.recv_cq = ep->rep_cq;
+
+	/* Initialize cma parameters */
+
+	/* RPC/RDMA does not use private data */
+	ep->rep_remote_cma.private_data = NULL;
+	ep->rep_remote_cma.private_data_len = 0;
+
+	/* Client offers RDMA Read but does not initiate */
+	ep->rep_remote_cma.initiator_depth = 0;
+	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
+		ep->rep_remote_cma.responder_resources = 0;
+	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
+		ep->rep_remote_cma.responder_resources = 32;
+	else
+		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
+
+	ep->rep_remote_cma.retry_count = 7;
+	ep->rep_remote_cma.flow_control = 0;
+	ep->rep_remote_cma.rnr_retry_count = 0;
+
+	return 0;
+
+out2:
+	/* keep rc from the failed call; a destroy failure is only logged */
+	err = ib_destroy_cq(ep->rep_cq);
+	if (err)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, err);
+out1:
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_destroy
+ *
+ * Disconnect and destroy endpoint. After this, the only
+ * valid operations on the ep are to free it (if dynamically
+ * allocated) or re-create it.
+ *
+ * Returns the ib_destroy_cq() status; a disconnect failure is only logged.
+ * The caller's error handling must not leak the endpoint if this fails.
+ */
+int
+rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering, connected is %d\n",
+		__func__, ep->rep_connected);
+
+	if (ia->ri_id->qp) {	/* a QP exists: disconnect, then destroy it */
+		rc = rpcrdma_ep_disconnect(ep, ia);
+		if (rc)
+			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+				" returned %i\n", __func__, rc);
+		rdma_destroy_qp(ia->ri_id);
+		ia->ri_id->qp = NULL;
+	}
+
+	/* padding - could be done in rpcrdma_buffer_destroy... */
+	if (ep->rep_pad_mr) {	/* NOTE(review): a pad mapped without an MR is not unmapped here - confirm */
+		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
+		ep->rep_pad_mr = NULL;
+	}
+
+	rpcrdma_clean_cq(ep->rep_cq);	/* clean the CQ before destroying it */
+	rc = ib_destroy_cq(ep->rep_cq);
+	if (rc)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, rc);
+
+	return rc;
+}
+
+/*
+ * Connect endpoint; when rep_connected != 0 the old cm_id and QP are replaced first.
+ */
+int
+rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	struct rdma_cm_id *id;
+	int rc = 0;
+	int retry_count = 0;	/* shared by both retry checks below */
+
+	if (ep->rep_connected != 0) {
+		struct rpcrdma_xprt *xprt;
+retry:		/* also entered by goto from the retry checks after the wait */
+		rc = rpcrdma_ep_disconnect(ep, ia);
+		if (rc && rc != -ENOTCONN)
+			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+				" status %i\n", __func__, rc);
+		rpcrdma_clean_cq(ep->rep_cq);
+
+		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+		id = rpcrdma_create_id(xprt, ia,
+				(struct sockaddr *)&xprt->rx_data.addr);
+		if (IS_ERR(id)) {
+			rc = PTR_ERR(id);
+			goto out;
+		}
+		/* TEMP TEMP TEMP - fail if new device:
+		 * Deregister/remarshal *all* requests!
+		 * Close and recreate adapter, pd, etc!
+		 * Re-determine all attributes still sane!
+		 * More stuff I haven't thought of!
+		 * Rrrgh!
+		 */
+		if (ia->ri_id->device != id->device) {
+			printk("RPC:       %s: can't reconnect on "
+				"different device!\n", __func__);
+			rdma_destroy_id(id);
+			rc = -ENETDOWN;
+			goto out;
+		}
+		/* END TEMP */
+		rdma_destroy_qp(ia->ri_id);	/* drop the old QP and id ... */
+		rdma_destroy_id(ia->ri_id);
+		ia->ri_id = id;			/* ... and adopt the new id */
+	}
+
+	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_create_qp failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+
+/* XXX Tavor device performs badly with 2K MTU! */
+if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
+	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
+	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
+	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
+	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
+		struct ib_qp_attr attr = {
+			.path_mtu = IB_MTU_1024
+		};
+		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);	/* rc is overwritten by rdma_connect() below */
+	}
+}
+
+	ep->rep_connected = 0;	/* the wait below ends once this becomes non-zero */
+
+	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_connect() failed with %i\n",
+				__func__, rc);
+		goto out;
+	}
+
+	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);	/* return value is not checked */
+
+	/*
+	 * Check state. A non-peer reject indicates no listener
+	 * (ECONNREFUSED), which may be a transient state. All
+	 * others indicate a transport condition which has already
+	 * undergone a best-effort.
+	 */
+	if (ep->rep_connected == -ECONNREFUSED &&
+	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
+		dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
+		goto retry;
+	}
+	if (ep->rep_connected <= 0) {
+		/* Sometimes, the only way to reliably connect to remote
+		 * CMs is to use same nonzero values for ORD and IRD. */
+		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+		    (ep->rep_remote_cma.responder_resources == 0 ||
+		     ep->rep_remote_cma.initiator_depth !=
+				ep->rep_remote_cma.responder_resources)) {
+			if (ep->rep_remote_cma.responder_resources == 0)
+				ep->rep_remote_cma.responder_resources = 1;
+			ep->rep_remote_cma.initiator_depth =
+				ep->rep_remote_cma.responder_resources;
+			goto retry;
+		}
+		rc = ep->rep_connected;	/* report the recorded connection status */
+	} else {
+		dprintk("RPC:       %s: connected\n", __func__);
+	}
+
+out:
+	if (rc)
+		ep->rep_connected = rc;	/* any non-zero result is also stored in the endpoint */
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_disconnect
+ *
+ * Kept separate from destroy so that an endpoint can be
+ * reconnected without being recreated.
+ *
+ * This call is not reentrant, and must not be made in parallel
+ * on the same endpoint.
+ */
+int
+rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int err;
+
+	rpcrdma_clean_cq(ep->rep_cq);
+	err = rdma_disconnect(ia->ri_id);
+	if (err) {
+		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, err);
+		ep->rep_connected = err;
+		return err;
+	}
+	/* returns without wait if not connected */
+	wait_event_interruptible(ep->rep_connect_wait,
+						ep->rep_connected != 1);
+	dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
+		(ep->rep_connected == 1) ? "still " : "dis");
+	return 0;
+}
+
+/*
+ * Initialize buffer memory. On failure, tears down partial setup and returns rc.
+ */
+int
+rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
+{
+	char *p;
+	size_t len;
+	int i, rc;
+	struct rpcrdma_mw *r;
+
+	buf->rb_max_requests = cdata->max_requests;
+	spin_lock_init(&buf->rb_lock);
+	atomic_set(&buf->rb_credits, 1);
+	INIT_LIST_HEAD(&buf->rb_mws);	/* valid list head on every error path */
+
+	/* Need to allocate:
+	 *   1.  arrays for send and recv pointers
+	 *   2.  arrays of struct rpcrdma_req to fill in pointers
+	 *   3.  array of struct rpcrdma_rep for replies
+	 *   4.  padding, if any
+	 *   5.  mw's, fmr's or frmr's, if any
+	 * Send/recv buffers in req/rep need to be registered
+	 */
+
+	len = buf->rb_max_requests *
+		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
+	len += cdata->padding;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	case RPCRDMA_MTHCAFMR:
+		/* TBD we are perhaps overallocating here */
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+				sizeof(struct rpcrdma_mw);
+		break;
+	default:
+		break;
+	}
+
+	/* allocate 1, 4 and 5 in one shot */
+	p = kzalloc(len, GFP_KERNEL);
+	if (p == NULL) {
+		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
+			__func__, len);
+		rc = -ENOMEM;
+		goto out;
+	}
+	buf->rb_pool = p;	/* for freeing it later */
+
+	/* carve the pool: send pointer array, recv pointer array, pad, mw's */
+	buf->rb_send_bufs = (struct rpcrdma_req **) p;
+	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
+	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
+	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
+
+	/* Register the zeroed pad buffer, if any. */
+	if (cdata->padding) {
+		rc = rpcrdma_register_internal(ia, p, cdata->padding,
+					    &ep->rep_pad_mr, &ep->rep_pad);
+		if (rc)
+			goto out;
+	}
+	p += cdata->padding;
+
+	/*
+	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
+	 * We "cycle" the mw's in order to minimize rkey reuse,
+	 * and also reduce unbind-to-bind collision.
+	 */
+	r = (struct rpcrdma_mw *)p;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+							 RPCRDMA_MAX_SEGS);
+			if (IS_ERR(r->r.frmr.fr_mr)) {
+				rc = PTR_ERR(r->r.frmr.fr_mr);
+				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			r->r.frmr.fr_pgl =
+				ib_alloc_fast_reg_page_list(ia->ri_id->device,
+							    RPCRDMA_MAX_SEGS);
+			if (IS_ERR(r->r.frmr.fr_pgl)) {
+				rc = PTR_ERR(r->r.frmr.fr_pgl);
+				dprintk("RPC:       %s: "
+					"ib_alloc_fast_reg_page_list "
+					"failed %i\n", __func__, rc);
+				/* r is not on rb_mws yet: free its mr here */
+				ib_dereg_mr(r->r.frmr.fr_mr);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		break;
+	case RPCRDMA_MTHCAFMR:
+		/* TBD we are perhaps overallocating here */
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			static struct ib_fmr_attr fa =
+				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
+			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
+				&fa);
+			if (IS_ERR(r->r.fmr)) {
+				rc = PTR_ERR(r->r.fmr);
+				dprintk("RPC:       %s: ib_alloc_fmr"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		break;
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/* Allocate one extra request's worth, for full cycling */
+		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+			r->r.mw = ib_alloc_mw(ia->ri_pd);
+			if (IS_ERR(r->r.mw)) {
+				rc = PTR_ERR(r->r.mw);
+				dprintk("RPC:       %s: ib_alloc_mw"
+					" failed %i\n", __func__, rc);
+				goto out;
+			}
+			list_add(&r->mw_list, &buf->rb_mws);
+			++r;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Allocate/init the request/reply buffers. Doing this
+	 * using kmalloc for now -- one for each buf.
+	 */
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		struct rpcrdma_req *req;
+		struct rpcrdma_rep *rep;
+
+		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
+		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
+		/* Typical ~2400b, so rounding up saves work later */
+		if (len < 4096)
+			len = 4096;
+		req = kmalloc(len, GFP_KERNEL);
+		if (req == NULL) {
+			dprintk("RPC:       %s: request buffer %d alloc"
+				" failed\n", __func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(req, 0, sizeof(struct rpcrdma_req));	/* header only */
+		buf->rb_send_bufs[i] = req;
+		buf->rb_send_bufs[i]->rl_buffer = buf;
+
+		/* register everything from rl_base to the end of the buffer */
+		rc = rpcrdma_register_internal(ia, req->rl_base,
+				len - offsetof(struct rpcrdma_req, rl_base),
+				&buf->rb_send_bufs[i]->rl_handle,
+				&buf->rb_send_bufs[i]->rl_iov);
+		if (rc)
+			goto out;
+
+		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
+
+		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
+		rep = kmalloc(len, GFP_KERNEL);
+		if (rep == NULL) {
+			dprintk("RPC:       %s: reply buffer %d alloc failed\n",
+				__func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(rep, 0, sizeof(struct rpcrdma_rep));	/* header only */
+		buf->rb_recv_bufs[i] = rep;
+		buf->rb_recv_bufs[i]->rr_buffer = buf;
+		init_waitqueue_head(&rep->rr_unbind);
+
+		rc = rpcrdma_register_internal(ia, rep->rr_base,
+				len - offsetof(struct rpcrdma_rep, rr_base),
+				&buf->rb_recv_bufs[i]->rr_handle,
+				&buf->rb_recv_bufs[i]->rr_iov);
+		if (rc)
+			goto out;
+
+	}
+	dprintk("RPC:       %s: max_requests %d\n",
+		__func__, buf->rb_max_requests);
+	/* done */
+	return 0;
+out:
+	rpcrdma_buffer_destroy(buf);	/* copes with partial initialization */
+	return rc;
+}
+
+/*
+ * Unregister and destroy buffer memory. Need to deal with
+ * partial initialization, so it's callable from failed create.
+ * Must be called before destroying endpoint, as registrations
+ * reference it.
+ */
+void
+rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+{
+	int rc, i;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	struct rpcrdma_mw *r;
+
+	/* clean up in reverse order from create
+	 *   1.  recv mr memory (mr free, then kfree)
+	 *   2.  send mr memory (mr free, then kfree)
+	 *   3.  mw's, fmr's or frmr's still on the rb_mws list
+	 *   4.  padding (if any) [moved to rpcrdma_ep_destroy]
+	 *   5.  arrays
+	 */
+	dprintk("RPC:       %s: entering\n", __func__);
+
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
+			rpcrdma_deregister_internal(ia,
+					buf->rb_recv_bufs[i]->rr_handle,
+					&buf->rb_recv_bufs[i]->rr_iov);
+			kfree(buf->rb_recv_bufs[i]);
+		}
+		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
+			rpcrdma_deregister_internal(ia,
+					buf->rb_send_bufs[i]->rl_handle,
+					&buf->rb_send_bufs[i]->rl_iov);
+			kfree(buf->rb_send_bufs[i]);
+		}
+	}
+
+	/*
+	 * Free whatever is still on rb_mws, independent of the send buffers:
+	 * a create that failed while allocating mw's has no requests yet.
+	 * rb_mws.next is NULL when create bailed out before initializing
+	 * the list (assumes *buf was zeroed, like the NULL checks above).
+	 */
+	while (buf->rb_mws.next && !list_empty(&buf->rb_mws)) {
+		r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list);
+		list_del(&r->mw_list);
+		switch (ia->ri_memreg_strategy) {
+		case RPCRDMA_FRMR:
+			rc = ib_dereg_mr(r->r.frmr.fr_mr);
+			if (rc)
+				dprintk("RPC:       %s: ib_dereg_mr"
+					" failed %i\n", __func__, rc);
+			ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+			break;
+		case RPCRDMA_MTHCAFMR:
+			rc = ib_dealloc_fmr(r->r.fmr);
+			if (rc)
+				dprintk("RPC:       %s: ib_dealloc_fmr"
+					" failed %i\n", __func__, rc);
+			break;
+		case RPCRDMA_MEMWINDOWS_ASYNC:
+		case RPCRDMA_MEMWINDOWS:
+			rc = ib_dealloc_mw(r->r.mw);
+			if (rc)
+				dprintk("RPC:       %s: ib_dealloc_mw"
+					" failed %i\n", __func__, rc);
+			break;
+		default:
+			break;
+		}
+	}
+
+	kfree(buf->rb_pool);
+}
+
+/*
+ * Get a set of request/reply buffers; returns NULL when no request is free.
+ *
+ * Reply buffer (if needed) is attached to send buffer upon return.
+ * Rule:
+ *    rb_send_index and rb_recv_index MUST always be pointing to the
+ *    *next* available buffer (non-NULL). They are incremented after
+ *    removing buffers, and decremented *before* returning them.
+ */
+struct rpcrdma_req *
+rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
+{
+	struct rpcrdma_req *req;
+	unsigned long flags;
+	int i;
+	struct rpcrdma_mw *r;
+
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_send_index == buffers->rb_max_requests) {	/* every request buffer is taken */
+		spin_unlock_irqrestore(&buffers->rb_lock, flags);
+		dprintk("RPC:       %s: out of request buffers\n", __func__);
+		return ((struct rpcrdma_req *)NULL);
+	}
+
+	req = buffers->rb_send_bufs[buffers->rb_send_index];
+	if (buffers->rb_send_index < buffers->rb_recv_index) {	/* more replies than requests are out */
+		dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
+			__func__,
+			buffers->rb_recv_index - buffers->rb_send_index);
+		req->rl_reply = NULL;	/* no reply buffer attached in this case */
+	} else {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+	if (!list_empty(&buffers->rb_mws)) {	/* give the req one mw per segment slot */
+		i = RPCRDMA_MAX_SEGS - 1;
+		do {	/* NOTE(review): list is not re-checked per entry; assumes >= RPCRDMA_MAX_SEGS mw's remain */
+			r = list_entry(buffers->rb_mws.next,
+					struct rpcrdma_mw, mw_list);
+			list_del(&r->mw_list);
+			req->rl_segments[i].mr_chunk.rl_mw = r;
+		} while (--i >= 0);	/* slots are filled from the last down to 0 */
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+	return req;
+}
+
+/*
+ * Put request/reply buffers (and the request's mw's) back into pool.
+ * Pre-decrement counter/array index.
+ */
+void
+rpcrdma_buffer_put(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+	int i;
+	unsigned long flags;
+
+	BUG_ON(req->rl_nchunks != 0);	/* a request must have no chunks left when returned */
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
+	req->rl_niovs = 0;
+	if (req->rl_reply) {	/* an attached reply buffer goes back as well */
+		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
+		init_waitqueue_head(&req->rl_reply->rr_unbind);
+		req->rl_reply->rr_func = NULL;
+		req->rl_reply = NULL;
+	}
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+	case RPCRDMA_MTHCAFMR:
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		/*
+		 * Cycle mw's back in reverse order, and "spin" them.
+		 * This delays and scrambles reuse as much as possible.
+		 */
+		i = 1;	/* slots 1..RPCRDMA_MAX_SEGS-1 first, slot 0 last */
+		do {
+			struct rpcrdma_mw **mw;
+			mw = &req->rl_segments[i].mr_chunk.rl_mw;
+			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
+			*mw = NULL;
+		} while (++i < RPCRDMA_MAX_SEGS);
+		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
+					&buffers->rb_mws);
+		req->rl_segments[0].mr_chunk.rl_mw = NULL;
+		break;
+	default:	/* other strategies keep no per-request mw's */
+		break;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Recover reply buffers from pool.
+ * This happens when recovering from error conditions.
+ * Post-increment counter/array index. rl_reply is untouched if none is free.
+ */
+void
+rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	unsigned long flags;
+
+	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
+		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;	/* rl_buffer is treated as another req here; use that req's pool */
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_recv_index < buffers->rb_max_requests) {	/* a reply buffer is available */
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Return a reply buffer that is not attached to a request to the pool
+ * (error conditions, aborted unbinds). The array index is pre-decremented.
+ */
+void
+rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_buffer *pool = rep->rr_buffer;
+	unsigned long irqflags;
+
+	rep->rr_func = NULL;
+	spin_lock_irqsave(&pool->rb_lock, irqflags);
+	pool->rb_recv_index--;
+	pool->rb_recv_bufs[pool->rb_recv_index] = rep;
+	spin_unlock_irqrestore(&pool->rb_lock, irqflags);
+}
+
+/*
+ * Wrappers for internal-use kmalloc memory registration, used by buffer code.
+ */
+
+int
+rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
+				struct ib_mr **mrp, struct ib_sge *iov)
+{	/* DMA-map va/len into *iov and pick an lkey; *mrp is set only if an MR is created */
+	struct ib_phys_buf ipb;
+	struct ib_mr *mr;
+	int rc;
+
+	/*
+	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
+	 */
+	iov->addr = ib_dma_map_single(ia->ri_id->device,
+			va, len, DMA_BIDIRECTIONAL);	/* NOTE(review): mapping result is not checked for error */
+	iov->length = len;
+
+	if (ia->ri_have_dma_lkey) {	/* use the stored ri_dma_lkey: no MR needed */
+		*mrp = NULL;
+		iov->lkey = ia->ri_dma_lkey;
+		return 0;
+	} else if (ia->ri_bind_mem != NULL) {	/* reuse the lkey of ri_bind_mem: no MR needed */
+		*mrp = NULL;
+		iov->lkey = ia->ri_bind_mem->lkey;
+		return 0;
+	}
+
+	/* otherwise register a dedicated physical MR for this one buffer */
+	ipb.addr = iov->addr;
+	ipb.size = iov->length;
+	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
+			IB_ACCESS_LOCAL_WRITE, &iov->addr);
+
+	dprintk("RPC:       %s: phys convert: 0x%llx "
+			"registered 0x%llx length %d\n",
+			__func__, (unsigned long long)ipb.addr,
+			(unsigned long long)iov->addr, len);
+
+	if (IS_ERR(mr)) {	/* NOTE(review): the DMA mapping made above is not undone on this path */
+		*mrp = NULL;
+		rc = PTR_ERR(mr);
+		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
+	} else {
+		*mrp = mr;
+		iov->lkey = mr->lkey;
+		rc = 0;
+	}
+
+	return rc;
+}
+
+int
+rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
+				struct ib_mr *mr, struct ib_sge *iov)
+{
+	int status = 0;
+
+	ib_dma_unmap_single(ia->ri_id->device,
+			iov->addr, iov->length, DMA_BIDIRECTIONAL);
+	/* with no MR, dropping the DMA mapping is all there is to do */
+	if (mr != NULL) {
+		status = ib_dereg_mr(mr);
+		if (status)
+			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+				__func__, status);
+	}
+	return status;
+}
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+static void
+rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
+{	/* DMA-map one segment; direction, length and address are recorded in seg */
+	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	seg->mr_dmalen = seg->mr_len;
+	if (seg->mr_page)	/* page-based segment: only the in-page part of mr_offset is used */
+		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else			/* otherwise mr_offset is passed as the address to map */
+		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {	/* only logged: nothing is returned to the caller */
+		dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
+			__func__,
+			(unsigned long long)seg->mr_dma,
+			seg->mr_offset, seg->mr_dmalen);
+	}
+}
+
+static void
+rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
+{
+	struct ib_device *dev = ia->ri_id->device;
+
+	if (!seg->mr_page)	/* not page-based: undo the single mapping */
+		ib_dma_unmap_single(dev, seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_page(dev, seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
+static int
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{	/* Map a hole-free run of segments and post a FAST_REG_MR WR using the first segment's frmr */
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
+
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;	/* so the summed length excludes the extension made above */
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	for (i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
+		len += seg->mr_len;
+		BUG_ON(seg->mr_len > PAGE_SIZE);
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
+		__func__, seg1->mr_chunk.rl_mw, i);
+
+	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
+		dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
+			__func__,
+			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
+		/* Invalidate before using. */
+		memset(&invalidate_wr, 0, sizeof invalidate_wr);
+		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+		invalidate_wr.next = &frmr_wr;	/* chained: the fast-reg WR follows the invalidate */
+		invalidate_wr.opcode = IB_WR_LOCAL_INV;
+		invalidate_wr.send_flags = IB_SEND_SIGNALED;
+		invalidate_wr.ex.invalidate_rkey =
+			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+		post_wr = &invalidate_wr;
+	} else
+		post_wr = &frmr_wr;
+
+	/* Bump the key */
+	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);	/* low byte of the rkey */
+	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+
+	/* Prepare FRMR WR */
+	memset(&frmr_wr, 0, sizeof frmr_wr);
+	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+	frmr_wr.opcode = IB_WR_FAST_REG_MR;
+	frmr_wr.send_flags = IB_SEND_SIGNALED;
+	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+	frmr_wr.wr.fast_reg.page_list_len = i;
+	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;	/* i whole pages */
+	BUG_ON(frmr_wr.wr.fast_reg.length < len);
+	frmr_wr.wr.fast_reg.access_flags = (writing ?
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ);
+	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
+
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_post_send for register,"
+			" status %i\n", __func__, rc);
+		while (i--)	/* unmap what was mapped, last first; leaves i == -1 */
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;	/* segments registered on success; -1 after the failure path above */
+	return rc;
+}
+
+static int
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{	/* Unmap the chunk's segments, then post a LOCAL_INV for its frmr rkey */
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	while (seg1->mr_nsegs--)	/* mr_nsegs ends up at -1 after this loop */
+		rpcrdma_unmap_one(ia, seg++);
+
+	memset(&invalidate_wr, 0, sizeof invalidate_wr);
+	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.send_flags = IB_SEND_SIGNALED;
+	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
+			" status %i\n", __func__, rc);
+	return rc;	/* status of ib_post_send() */
+}
+
+static int
+rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia)
+{	/* Map a hole-free run of segments and map them through the first segment's fmr */
+	struct rpcrdma_mr_seg *seg1 = seg;
+	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+	int len, pageoff, i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;	/* so the summed length excludes the extension made above */
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;	/* physaddrs[] holds at most this many */
+	for (i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		physaddrs[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+				physaddrs, i, seg1->mr_dma);
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_map_phys_fmr "
+			"%u@0x%llx+%i (%d)... status %i\n", __func__,
+			len, (unsigned long long)seg1->mr_dma,
+			pageoff, i, rc);
+		while (i--)	/* unmap what was mapped, last first; leaves i == -1 */
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;	/* segments registered on success; -1 after the failure path above */
+	return rc;
+}
+
+static int
+rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *first = seg, *cur;	/* first holds fmr and count */
+	LIST_HEAD(fmr_list);
+	int status;
+
+	list_add(&first->mr_chunk.rl_mw->r.fmr->list, &fmr_list);
+	status = ib_unmap_fmr(&fmr_list);
+	for (cur = seg; first->mr_nsegs--; cur++)
+		rpcrdma_unmap_one(ia, cur);
+	if (status)
+		dprintk("RPC:       %s: failed ib_unmap_fmr,"
+			" status %i\n", __func__, status);
+	return status;
+}
+
+static int
+rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{	/* Bind the segment's memory window over exactly one mapped segment */
+	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+				  IB_ACCESS_REMOTE_READ);
+	struct ib_mw_bind param;
+	int rc;
+
+	*nsegs = 1;	/* always a single segment, whatever the caller asked for */
+	rpcrdma_map_one(ia, seg, writing);
+	param.mr = ia->ri_bind_mem;
+	param.wr_id = 0ULL;	/* no send cookie */
+	param.addr = seg->mr_dma;
+	param.length = seg->mr_len;
+	param.send_flags = 0;
+	param.mw_access_flags = mem_priv;
+
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_bind_mw "
+			"%u@0x%llx status %i\n",
+			__func__, seg->mr_len,
+			(unsigned long long)seg->mr_dma, rc);
+		rpcrdma_unmap_one(ia, seg);	/* undo the mapping made above */
+	} else {
+		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+		seg->mr_base = param.addr;
+		seg->mr_nsegs = 1;
+	}
+	return rc;
+}
+
+static int
+rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt, void **r)
+{	/* Unbind the segment's memory window (zero-length bind) and unmap the segment */
+	struct ib_mw_bind param;
+	LIST_HEAD(l);	/* not referenced again in this function */
+	int rc;
+
+	BUG_ON(seg->mr_nsegs != 1);
+	param.mr = ia->ri_bind_mem;
+	param.addr = 0ULL;	/* unbind */
+	param.length = 0;
+	param.mw_access_flags = 0;
+	if (*r) {	/* caller passed a cookie: request a signaled completion carrying it */
+		param.wr_id = (u64) (unsigned long) *r;
+		param.send_flags = IB_SEND_SIGNALED;
+		INIT_CQCOUNT(&r_xprt->rx_ep);
+	} else {
+		param.wr_id = 0ULL;
+		param.send_flags = 0;
+		DECR_CQCOUNT(&r_xprt->rx_ep);
+	}
+	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+	rpcrdma_unmap_one(ia, seg);	/* unmapped whether or not the unbind was posted */
+	if (rc)
+		dprintk("RPC:       %s: failed ib_(un)bind_mw,"
+			" status %i\n", __func__, rc);
+	else
+		*r = NULL;	/* will upcall on completion */
+	return rc;
+}
+
+static int
+rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia)
+{	/* Map a hole-free run of segments and register them as one new physical MR */
+	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+				  IB_ACCESS_REMOTE_READ);
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+	int len, i, rc = 0;
+
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;	/* ipb[] holds at most this many */
+	for (len = 0, i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		ipb[i].addr = seg->mr_dma;
+		ipb[i].size = seg->mr_len;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+			break;
+	}
+	seg1->mr_base = seg1->mr_dma;	/* passed by address to ib_reg_phys_mr() below */
+	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+				ipb, i, mem_priv, &seg1->mr_base);
+	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+		dprintk("RPC:       %s: failed ib_reg_phys_mr "
+			"%u@0x%llx (%d)... status %i\n",
+			__func__, len,
+			(unsigned long long)seg1->mr_dma, i, rc);
+		while (i--)	/* unmap what was mapped, last first; leaves i == -1 */
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;	/* segments registered on success; -1 after the failure path above */
+	return rc;
+}
+
+static int
+rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *head = seg;	/* carries rl_mr and mr_nsegs */
+	int status = ib_dereg_mr(head->mr_chunk.rl_mr);
+
+	head->mr_chunk.rl_mr = NULL;
+	for (; head->mr_nsegs--; seg++)
+		rpcrdma_unmap_one(ia, seg);
+	if (status)
+		dprintk("RPC:       %s: failed ib_dereg_mr,"
+			" status %i\n", __func__, status);
+	return status;
+}
+
+int
+rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
+{	/* Register a chunk by the transport's strategy; returns segments covered, or -1 */
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	int rc = 0;
+
+	switch (ia->ri_memreg_strategy) {
+
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:	/* no per-chunk registration: map and reuse ri_bind_mem's rkey */
+		rpcrdma_map_one(ia, seg, writing);
+		seg->mr_rkey = ia->ri_bind_mem->rkey;
+		seg->mr_base = seg->mr_dma;
+		seg->mr_nsegs = 1;
+		nsegs = 1;
+		break;
+#endif
+
+	/* Registration using frmr registration */
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
+		break;
+
+	/* Registration using fmr memory registration */
+	case RPCRDMA_MTHCAFMR:
+		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
+		break;
+
+	/* Registration using memory windows */
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
+		break;
+
+	/* Default registration each time */
+	default:
+		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
+		break;
+	}
+	if (rc)
+		return -1;	/* the helper's own status code is not passed on */
+
+	return nsegs;	/* as updated by the helper above */
+}
+
+int
+rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+		struct rpcrdma_xprt *r_xprt, void *r)
+{	/* Undo rpcrdma_register_external() for one chunk; returns its segment count */
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	int nsegs = seg->mr_nsegs, rc;	/* sampled first: some helpers count mr_nsegs down */
+
+	switch (ia->ri_memreg_strategy) {
+
+#if RPCRDMA_PERSISTENT_REGISTRATION
+	case RPCRDMA_ALLPHYSICAL:
+		BUG_ON(nsegs != 1);
+		rpcrdma_unmap_one(ia, seg);
+		rc = 0;
+		break;
+#endif
+
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
+		break;
+
+	case RPCRDMA_MTHCAFMR:
+		rc = rpcrdma_deregister_fmr_external(seg, ia);
+		break;
+
+	case RPCRDMA_MEMWINDOWS_ASYNC:
+	case RPCRDMA_MEMWINDOWS:
+		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);	/* sets r to NULL when the unbind was posted */
+		break;
+
+	default:
+		rc = rpcrdma_deregister_default_external(seg, ia);
+		break;
+	}
+	if (r) {	/* r was not cleared above: run the reply callback right here */
+		struct rpcrdma_rep *rep = r;
+		void (*func)(struct rpcrdma_rep *) = rep->rr_func;	/* NOTE(review): assumes rr_func is non-NULL - confirm */
+		rep->rr_func = NULL;
+		func(rep);	/* dereg done, callback now */
+	}
+	return nsegs;	/* rc from the helpers is not returned */
+}
+
+/*
+ * Prepost any receive buffer, then post send. Returns 0 or the failing post's status.
+ *
+ * Receive buffer is donated to hardware, reclaimed upon recv completion.
+ */
+int
+rpcrdma_ep_post(struct rpcrdma_ia *ia,
+		struct rpcrdma_ep *ep,
+		struct rpcrdma_req *req)
+{
+	struct ib_send_wr send_wr, *send_wr_fail;
+	struct rpcrdma_rep *rep = req->rl_reply;
+	int rc;
+
+	if (rep) {
+		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+		if (rc)
+			goto out;	/* nothing is sent if the receive could not be posted */
+		req->rl_reply = NULL;	/* detach the reply buffer once its receive is posted */
+	}
+
+	send_wr.next = NULL;
+	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.sg_list = req->rl_send_iov;
+	send_wr.num_sge = req->rl_niovs;
+	send_wr.opcode = IB_WR_SEND;
+	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
+		ib_dma_sync_single_for_device(ia->ri_id->device,
+			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
+			DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
+		DMA_TO_DEVICE);	/* iov[1] and iov[0] are synced regardless of num_sge; iov[2] never */
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
+		DMA_TO_DEVICE);
+
+	if (DECR_CQCOUNT(ep) > 0)
+		send_wr.send_flags = 0;	/* unsignaled while the countdown is still positive */
+	else { /* Provider must take a send completion every now and then */
+		INIT_CQCOUNT(ep);
+		send_wr.send_flags = IB_SEND_SIGNALED;
+	}
+
+	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+	if (rc)
+		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
+			rc);
+out:
+	return rc;
+}
+
+/*
+ * (Re)post a single receive buffer; the rep pointer is the completion cookie.
+ */
+int
+rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+		     struct rpcrdma_ep *ep,
+		     struct rpcrdma_rep *rep)
+{
+	struct ib_recv_wr *bad_wr;
+	struct ib_recv_wr wr = {
+		.next = NULL,
+		.wr_id = (u64) (unsigned long) rep,
+		.sg_list = &rep->rr_iov,
+		.num_sge = 1,
+	};
+	int status;
+
+	ib_dma_sync_single_for_cpu(ia->ri_id->device,
+		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
+
+	DECR_CQCOUNT(ep);
+	status = ib_post_recv(ia->ri_id->qp, &wr, &bad_wr);
+	if (status)
+		dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
+			status);
+	return status;
+}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 00000000..cae761a8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h> 		/* wait_queue_head_t, etc */
+#include <linux/spinlock.h> 		/* spinlock_t, etc */
+#include <asm/atomic.h>			/* atomic_t, etc */
+
+#include <rdma/rdma_cm.h>		/* RDMA connection api */
+#include <rdma/ib_verbs.h>		/* RDMA verbs api */
+
+#include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
+
+#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
+/*
+ * Interface Adapter -- one per transport instance
+ */
+struct rpcrdma_ia {
+	struct rdma_cm_id 	*ri_id;
+	struct ib_pd		*ri_pd;
+	struct ib_mr		*ri_bind_mem;
+	u32			ri_dma_lkey;
+	int			ri_have_dma_lkey;
+	struct completion	ri_done;
+	int			ri_async_rc;
+	enum rpcrdma_memreg	ri_memreg_strategy;
+};
+
+/*
+ * RDMA Endpoint -- one per transport instance
+ */
+
+struct rpcrdma_ep {
+	atomic_t		rep_cqcount;	/* countdown, see DECR_CQCOUNT() */
+	int			rep_cqinit;	/* reload value, see INIT_CQCOUNT() */
+	int			rep_connected;
+	struct rpcrdma_ia	*rep_ia;
+	struct ib_cq		*rep_cq;
+	struct ib_qp_init_attr	rep_attr;
+	wait_queue_head_t 	rep_connect_wait;
+	struct ib_sge		rep_pad;	/* holds zeroed pad */
+	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
+	void			(*rep_func)(struct rpcrdma_ep *);
+	struct rpc_xprt		*rep_xprt;	/* for rep_func */
+	struct rdma_conn_param	rep_remote_cma;
+	struct sockaddr_storage	rep_remote_addr;
+};
+
+#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)	/* reload the countdown */
+#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)	/* decrement; yields the new value */
+
+/*
+ * struct rpcrdma_rep -- this structure encapsulates state required to recv
+ * and complete a reply, asynchronously. It needs several pieces of
+ * state:
+ *   o recv buffer (posted to provider)
+ *   o ib_sge (also donated to provider)
+ *   o status of reply (length, success or not)
+ *   o bookkeeping state to get run by tasklet (list, etc)
+ *
+ * These are allocated during initialization, per-transport instance;
+ * however, the tasklet execution list itself is global, as it should
+ * always be pretty short.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ */
+
+/* temporary static scatter/gather max */
+#define RPCRDMA_MAX_DATA_SEGS	(8)	/* max scatter/gather */
+#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+#define MAX_RPCRDMAHDR	(\
+	/* max supported RPC/RDMA header */ \
+	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
+	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
+
+struct rpcrdma_buffer;
+
+struct rpcrdma_rep {
+	unsigned int	rr_len;		/* actual received reply length */
+	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
+	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
+	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
+	struct list_head rr_list;	/* tasklet list */
+	wait_queue_head_t rr_unbind;	/* optional unbind wait */
+	struct ib_sge	rr_iov;		/* for posting */
+	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
+	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
+};
+
+/*
+ * struct rpcrdma_req -- structure central to the request/reply sequence.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ *
+ * It includes pre-registered buffer memory for send AND recv.
+ * The recv buffer, however, is not owned by this structure, and
+ * is "donated" to the hardware when a recv is posted. When a
+ * reply is handled, the recv buffer used is given back to the
+ * struct rpcrdma_req associated with the request.
+ *
+ * In addition to the basic memory, this structure includes an array
+ * of iovs for send operations. The reason is that the iovs passed to
+ * ib_post_{send,recv} must not be modified until the work request
+ * completes.
+ *
+ * NOTES:
+ *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
+ *     marshal. The number needed varies depending on the iov lists that
+ *     are passed to us, the memory registration mode we are in, and if
+ *     physical addressing is used, the layout.
+ */
+
+struct rpcrdma_mr_seg {		/* chunk descriptors */
+	union {				/* chunk memory handles */
+		struct ib_mr	*rl_mr;		/* if registered directly */
+		struct rpcrdma_mw {		/* if registered from region */
+			union {
+				struct ib_mw	*mw;
+				struct ib_fmr	*fmr;
+				struct {
+					struct ib_fast_reg_page_list *fr_pgl;
+					struct ib_mr *fr_mr;
+					enum { FRMR_IS_INVALID, FRMR_IS_VALID  } state;
+				} frmr;
+			} r;
+			struct list_head mw_list;
+		} *rl_mw;
+	} mr_chunk;
+	u64		mr_base;	/* registration result */
+	u32		mr_rkey;	/* registration result */
+	u32		mr_len;		/* length of chunk or segment */
+	int		mr_nsegs;	/* number of segments in chunk or 0 */
+	enum dma_data_direction	mr_dir;	/* segment mapping direction */
+	dma_addr_t	mr_dma;		/* segment mapping address */
+	size_t		mr_dmalen;	/* segment mapping length */
+	struct page	*mr_page;	/* owning page, if any */
+	char		*mr_offset;	/* kva if no page, else offset */
+};
+
+struct rpcrdma_req {
+	size_t 		rl_size;	/* actual length of buffer */
+	unsigned int	rl_niovs;	/* 0, 2 or 4 */
+	unsigned int	rl_nchunks;	/* non-zero if chunks */
+	unsigned int	rl_connect_cookie;	/* retry detection */
+	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
+	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
+	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge	rl_iov;		/* for posting */
+	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
+	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
+	__u32 		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
+};
+#define rpcr_to_rdmar(r) \
+	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
+
+/*
+ * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
+ * inline requests/replies, and client/server credits.
+ *
+ * One of these is associated with a transport instance
+ */
+struct rpcrdma_buffer {
+	spinlock_t	rb_lock;	/* protects indexes */
+	atomic_t	rb_credits;	/* most recent server credits */
+	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
+	int		rb_max_requests;/* client max requests */
+	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
+	int		rb_send_index;
+	struct rpcrdma_req	**rb_send_bufs;
+	int		rb_recv_index;
+	struct rpcrdma_rep	**rb_recv_bufs;
+	char		*rb_pool;
+};
+#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
+
+/*
+ * Internal structure for transport instance creation. This
+ * exists primarily for modularity.
+ *
+ * This data should be set with mount options
+ */
+struct rpcrdma_create_data_internal {
+	struct sockaddr_storage	addr;	/* RDMA server address */
+	unsigned int	max_requests;	/* max requests (slots) in flight */
+	unsigned int	rsize;		/* mount rsize - max read hdr+data */
+	unsigned int	wsize;		/* mount wsize - max write hdr+data */
+	unsigned int	inline_rsize;	/* max non-rdma read data payload */
+	unsigned int	inline_wsize;	/* max non-rdma write data payload */
+	unsigned int	padding;	/* non-rdma write header padding */
+};
+
+#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
+	(rpcx_to_rdmad((rq)->rq_task->tk_xprt).inline_rsize)	/* max non-rdma read data payload */
+
+#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
+	(rpcx_to_rdmad((rq)->rq_task->tk_xprt).inline_wsize)	/* max non-rdma write data payload */
+
+#define RPCRDMA_INLINE_PAD_VALUE(rq)\
+	(rpcx_to_rdmad((rq)->rq_task->tk_xprt).padding)	/* non-rdma write header padding */
+
+/*
+ * Statistics for RPCRDMA
+ */
+struct rpcrdma_stats {
+	unsigned long		read_chunk_count;
+	unsigned long		write_chunk_count;
+	unsigned long		reply_chunk_count;
+
+	unsigned long long	total_rdma_request;
+	unsigned long long	total_rdma_reply;
+
+	unsigned long long	pullup_copy_count;
+	unsigned long long	fixup_copy_count;
+	unsigned long		hardway_register_count;
+	unsigned long		failed_marshal_count;
+	unsigned long		bad_reply_count;
+};
+
+/*
+ * RPCRDMA transport -- encapsulates the structures above for
+ * integration with RPC.
+ *
+ * The contained structures are embedded, not pointers,
+ * for convenience. This structure need not be visible externally.
+ *
+ * It is allocated and initialized during mount, and released
+ * during unmount.
+ */
+struct rpcrdma_xprt {
+	struct rpc_xprt		xprt;	/* generic RPC transport; see rpcx_to_rdmax() */
+	struct rpcrdma_ia	rx_ia;
+	struct rpcrdma_ep	rx_ep;
+	struct rpcrdma_buffer	rx_buf;	/* see rdmab_to_ia() */
+	struct rpcrdma_create_data_internal rx_data;	/* see rpcx_to_rdmad() */
+	struct delayed_work	rdma_connect;
+	struct rpcrdma_stats	rx_stats;
+};
+
+#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)	/* rpc_xprt pointer -> enclosing rpcrdma_xprt */
+#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)	/* rpc_xprt pointer -> its rx_data */
+
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
+/*
+ * Interface Adapter calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+void rpcrdma_ia_close(struct rpcrdma_ia *);
+
+/*
+ * Endpoint calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+
+int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_req *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_rep *);
+
+/*
+ * Buffer calls - xprtrdma/verbs.c
+ */
+int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
+				struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
+void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+
+int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
+				struct ib_mr **, struct ib_sge *);
+int rpcrdma_deregister_internal(struct rpcrdma_ia *,
+				struct ib_mr *, struct ib_sge *);
+
+int rpcrdma_register_external(struct rpcrdma_mr_seg *,
+				int, int, struct rpcrdma_xprt *);
+int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+				struct rpcrdma_xprt *, void *);
+
+/*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+ */
+void rpcrdma_conn_func(struct rpcrdma_ep *);
+void rpcrdma_reply_handler(struct rpcrdma_rep *);
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+int rpcrdma_marshal_req(struct rpc_rqst *);
+
+#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
new file mode 100644
index 00000000..ea750797
--- /dev/null
+++ b/net/sunrpc/xprtsock.c
@@ -0,0 +1,2953 @@
+/*
+ * linux/net/sunrpc/xprtsock.c
+ *
+ * Client-side transport implementation for sockets.
+ *
+ * TCP callback races fixes (C) 1998 Red Hat
+ * TCP send fixes (C) 1998 Red Hat
+ * TCP NFS related read + write fixes
+ *  (C) 1999 Dave Airlie, University of Limerick, Ireland <airlied@linux.ie>
+ *
+ * Rewrite of large parts of the code in order to stabilize TCP stuff.
+ * Fix behaviour when socket buffer is full.
+ *  (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no>
+ *
+ * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com>
+ *
+ * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005.
+ *   <gilles.quillard@bull.net>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/pagemap.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/mm.h>
+#include <linux/un.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/xprtsock.h>
+#include <linux/file.h>
+#ifdef CONFIG_NFS_V4_1
+#include <linux/sunrpc/bc_xprt.h>
+#endif
+
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+
+#include "sunrpc.h"
+
+static void xs_close(struct rpc_xprt *xprt);
+
+/*
+ * xprtsock tunables
+ */
+unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
+
+unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
+unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
+
+#define XS_TCP_LINGER_TO	(15U * HZ)
+static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
+
+/*
+ * We can register our own files under /proc/sys/sunrpc by
+ * calling register_sysctl_table() again.  The files in that
+ * directory become the union of all files registered there.
+ *
+ * We simply need to make sure that we don't collide with
+ * someone else's file names!
+ */
+
+#ifdef RPC_DEBUG
+
+static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
+static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT;
+static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+/*
+ * FIXME: changing the UDP slot table size should also resize the UDP
+ *        socket buffers for existing UDP transports
+ */
+static ctl_table xs_tunables_table[] = {
+	{
+		.procname	= "udp_slot_table_entries",
+		.data		= &xprt_udp_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.procname	= "tcp_slot_table_entries",
+		.data		= &xprt_tcp_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.procname	= "min_resvport",
+		.data		= &xprt_min_resvport,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xprt_min_resvport_limit,
+		.extra2		= &xprt_max_resvport_limit
+	},
+	{
+		.procname	= "max_resvport",
+		.data		= &xprt_max_resvport,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xprt_min_resvport_limit,
+		.extra2		= &xprt_max_resvport_limit
+	},
+	{
+		.procname	= "tcp_fin_timeout",
+		.data		= &xs_tcp_fin_timeout,
+		.maxlen		= sizeof(xs_tcp_fin_timeout),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{ },
+};
+
+static ctl_table sunrpc_table[] = {
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= xs_tunables_table
+	},
+	{ },
+};
+
+#endif
+
+/*
+ * Wait duration for a reply from the RPC portmapper.
+ */
+#define XS_BIND_TO		(60U * HZ)
+
+/*
+ * Delay if a UDP socket connect error occurs.  This is most likely some
+ * kind of resource problem on the local host.
+ */
+#define XS_UDP_REEST_TO		(2U * HZ)
+
+/*
+ * The reestablish timeout allows clients to delay for a bit before attempting
+ * to reconnect to a server that just dropped our connection.
+ *
+ * We implement an exponential backoff when trying to reestablish a TCP
+ * transport connection with the server.  Some servers like to drop a TCP
+ * connection when they are overworked, so we start with a short timeout and
+ * increase over time if the server is down or not responding.
+ */
+#define XS_TCP_INIT_REEST_TO	(3U * HZ)
+#define XS_TCP_MAX_REEST_TO	(5U * 60 * HZ)
+
+/*
+ * TCP idle timeout; client drops the transport socket if it is idle
+ * for this long.  Note that we also timeout UDP sockets to prevent
+ * holding port numbers when there is no RPC traffic.
+ */
+#define XS_IDLE_DISC_TO		(5U * 60 * HZ)
+
+#ifdef RPC_DEBUG
+# undef  RPC_DEBUG_DATA
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+#ifdef RPC_DEBUG_DATA
+static void xs_pktdump(char *msg, u32 *packet, unsigned int count)
+{
+	u8 *buf = (u8 *) packet;	/* dump is done byte by byte */
+	int j;
+
+	dprintk("RPC:       %s\n", msg);
+	for (j = 0; j < count && j < 128; j += 4) {	/* at most the first 128 bytes, 4 per step */
+		if (!(j & 31)) {	/* every 32 bytes: start a new row with its offset */
+			if (j)
+				dprintk("\n");
+			dprintk("0x%04x ", j);
+		}
+		dprintk("%02x%02x%02x%02x ",	/* NOTE(review): reads through buf[j+3]; assumes count is a multiple of 4 -- confirm */
+			buf[j], buf[j+1], buf[j+2], buf[j+3]);
+	}
+	dprintk("\n");
+}
+#else
+static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
+{
+	/* NOP: packet dumping is compiled out without RPC_DEBUG_DATA */
+}
+#endif
+
+struct sock_xprt {
+	struct rpc_xprt		xprt;
+
+	/*
+	 * Network layer
+	 */
+	struct socket *		sock;
+	struct sock *		inet;
+
+	/*
+	 * State of TCP reply receive
+	 */
+	__be32			tcp_fraghdr,
+				tcp_xid,
+				tcp_calldir;
+
+	u32			tcp_offset,
+				tcp_reclen;
+
+	unsigned long		tcp_copied,
+				tcp_flags;
+
+	/*
+	 * Connection of transports
+	 */
+	struct delayed_work	connect_worker;
+	struct sockaddr_storage	srcaddr;
+	unsigned short		srcport;
+
+	/*
+	 * UDP socket buffer size parameters
+	 */
+	size_t			rcvsize,
+				sndsize;
+
+	/*
+	 * Saved socket callback addresses
+	 */
+	void			(*old_data_ready)(struct sock *, int);
+	void			(*old_state_change)(struct sock *);
+	void			(*old_write_space)(struct sock *);
+	void			(*old_error_report)(struct sock *);
+};
+
+/*
+ * TCP receive state flags
+ */
+#define TCP_RCV_LAST_FRAG	(1UL << 0)
+#define TCP_RCV_COPY_FRAGHDR	(1UL << 1)
+#define TCP_RCV_COPY_XID	(1UL << 2)
+#define TCP_RCV_COPY_DATA	(1UL << 3)
+#define TCP_RCV_READ_CALLDIR	(1UL << 4)
+#define TCP_RCV_COPY_CALLDIR	(1UL << 5)
+
+/*
+ * TCP RPC flags
+ */
+#define TCP_RPC_REPLY		(1UL << 6)
+
+static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr *) &xprt->addr;	/* family-agnostic view of xprt->addr */
+}
+
+static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_un *) &xprt->addr;	/* unchecked cast: AF_LOCAL view */
+}
+
+static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_in *) &xprt->addr;	/* unchecked cast: AF_INET view */
+}
+
+static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt)
+{
+	return (struct sockaddr_in6 *) &xprt->addr;	/* unchecked cast: AF_INET6 view */
+}
+
+static void xs_format_common_peer_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr *sap = xs_addr(xprt);
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_in *sin;
+	struct sockaddr_un *sun;
+	char buf[128];	/* scratch: holds the display form first, then the hex form */
+
+	switch (sap->sa_family) {
+	case AF_LOCAL:	/* buf is not rewritten, so the path is also used for HEX_ADDR */
+		sun = xs_addr_un(xprt);
+		strlcpy(buf, sun->sun_path, sizeof(buf));
+		xprt->address_strings[RPC_DISPLAY_ADDR] =
+						kstrdup(buf, GFP_KERNEL);
+		break;
+	case AF_INET:
+		(void)rpc_ntop(sap, buf, sizeof(buf));	/* return value deliberately ignored */
+		xprt->address_strings[RPC_DISPLAY_ADDR] =
+						kstrdup(buf, GFP_KERNEL);
+		sin = xs_addr_in(xprt);
+		snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));	/* host-order address as 8 hex digits */
+		break;
+	case AF_INET6:
+		(void)rpc_ntop(sap, buf, sizeof(buf));	/* return value deliberately ignored */
+		xprt->address_strings[RPC_DISPLAY_ADDR] =
+						kstrdup(buf, GFP_KERNEL);
+		sin6 = xs_addr_in6(xprt);
+		snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+		break;
+	default:
+		BUG();	/* no other address family is handled */
+	}
+
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);	/* NOTE(review): kstrdup() results here are stored without a NULL check */
+}
+
+static void xs_format_common_peer_ports(struct rpc_xprt *xprt)
+{
+	struct sockaddr *sap = xs_addr(xprt);
+	char buf[128];	/* scratch, reused for both string forms */
+
+	snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));	/* decimal port */
+	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));	/* hex port, minimum field width 4 */
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+}
+
+static void xs_format_peer_addresses(struct rpc_xprt *xprt,
+				     const char *protocol,
+				     const char *netid)
+{
+	xprt->address_strings[RPC_DISPLAY_PROTO] = protocol;	/* stored by reference, not copied */
+	xprt->address_strings[RPC_DISPLAY_NETID] = netid;	/* stored by reference, not copied */
+	xs_format_common_peer_addresses(xprt);	/* fills the ADDR and HEX_ADDR slots */
+	xs_format_common_peer_ports(xprt);	/* fills the PORT and HEX_PORT slots */
+}
+
+static void xs_update_peer_port(struct rpc_xprt *xprt)
+{
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);	/* drop the old port strings ... */
+	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+
+	xs_format_common_peer_ports(xprt);	/* ... and rebuild both from the current address */
+}
+
+static void xs_free_peer_addresses(struct rpc_xprt *xprt)
+{
+	unsigned int idx;
+
+	/* The PROTO and NETID slots are skipped; every other slot is kfree()d. */
+	for (idx = 0; idx < RPC_DISPLAY_MAX; idx++) {
+		if (idx == RPC_DISPLAY_PROTO)
+			continue;
+		if (idx == RPC_DISPLAY_NETID)
+			continue;
+		kfree(xprt->address_strings[idx]);
+	}
+}
+
+#define XS_SENDMSG_FLAGS	(MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more)
+{
+	struct msghdr msg = {
+		.msg_name	= addr,
+		.msg_namelen	= addrlen,
+		.msg_flags	= XS_SENDMSG_FLAGS | (more ? MSG_MORE : 0),	/* MSG_MORE only when the caller passes a non-zero @more */
+	};
+	struct kvec iov = {	/* the part of *vec that starts @base bytes in */
+		.iov_base	= vec->iov_base + base,
+		.iov_len	= vec->iov_len - base,
+	};
+
+	if (iov.iov_len != 0)
+		return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
+	return kernel_sendmsg(sock, &msg, NULL, 0, 0);	/* nothing left in vec: zero-length send */
+}
+
+static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
+{
+	struct page **ppage;
+	unsigned int remainder;
+	int err, sent = 0;
+
+	remainder = xdr->page_len - base;	/* page bytes still to send */
+	base += xdr->page_base;
+	ppage = xdr->pages + (base >> PAGE_SHIFT);	/* page holding the first byte to send */
+	base &= ~PAGE_MASK;	/* offset inside that page */
+	for(;;) {
+		unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
+		int flags = XS_SENDMSG_FLAGS;
+
+		remainder -= len;
+		if (remainder != 0 || more)	/* more pages, or the caller has more data after them */
+			flags |= MSG_MORE;
+		err = sock->ops->sendpage(sock, *ppage, base, len, flags);
+		if (remainder == 0 || err != len)	/* last page done, or a short/failed send */
+			break;
+		sent += err;
+		ppage++;
+		base = 0;	/* later pages are sent from offset 0 */
+	}
+	if (sent == 0)	/* nothing accumulated: hand back the last sendpage result as is */
+		return err;
+	if (err > 0)	/* count a final successful or partial send; an error here is dropped */
+		sent += err;
+	return sent;
+}
+
+/**
+ * xs_sendpages - write pages directly to a socket
+ * @sock: socket to send on
+ * @addr: UDP only -- address of destination
+ * @addrlen: UDP only -- length of destination address
+ * @xdr: buffer containing this request
+ * @base: starting position in the buffer
+ * Returns bytes sent so far, else the last send call's result; -ENOTSOCK if @sock is NULL.
+ */
+static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
+{
+	unsigned int remainder = xdr->len - base;	/* bytes of the request still to send */
+	int err, sent = 0;
+
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+	if (base != 0) {	/* not starting at offset 0: send without a destination address */
+		addr = NULL;
+		addrlen = 0;
+	}
+
+	if (base < xdr->head[0].iov_len || addr != NULL) {	/* head section */
+		unsigned int len = xdr->head[0].iov_len - base;
+		remainder -= len;
+		err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0);
+		if (remainder == 0 || err != len)	/* request complete, or short/failed send */
+			goto out;
+		sent += err;
+		base = 0;
+	} else
+		base -= xdr->head[0].iov_len;	/* make base relative to the page section */
+
+	if (base < xdr->page_len) {	/* page section */
+		unsigned int len = xdr->page_len - base;
+		remainder -= len;
+		err = xs_send_pagedata(sock, xdr, base, remainder != 0);
+		if (remainder == 0 || err != len)	/* request complete, or short/failed send */
+			goto out;
+		sent += err;
+		base = 0;
+	} else
+		base -= xdr->page_len;	/* make base relative to the tail section */
+
+	if (base >= xdr->tail[0].iov_len)	/* nothing left to send from the tail */
+		return sent;
+	err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0);
+out:
+	if (sent == 0)	/* no earlier section was counted: return the last result unchanged */
+		return err;
+	if (err > 0)	/* add the last section's bytes; an error after progress is dropped */
+		sent += err;
+	return sent;
+}
+
+static void xs_nospace_callback(struct rpc_task *task)
+{
+	struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
+
+	transport->inet->sk_write_pending--;	/* undoes the sk_write_pending++ done in xs_nospace() */
+	clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+}
+
+/**
+ * xs_nospace - place task on wait queue if transmit was incomplete
+ * @task: task to put to sleep
+ * Returns -EAGAIN while the transport is connected, -ENOTCONN otherwise.
+ */
+static int xs_nospace(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	int ret = -EAGAIN;	/* default result, kept on the connected path */
+
+	dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
+			task->tk_pid, req->rq_slen - req->rq_bytes_sent,
+			req->rq_slen);
+
+	/* Protect against races with write_space */
+	spin_lock_bh(&xprt->transport_lock);
+
+	/* Don't race with disconnect */
+	if (xprt_connected(xprt)) {
+		if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {	/* otherwise just return -EAGAIN without sleeping */
+			/*
+			 * Notify TCP that we're limited by the application
+			 * window size
+			 */
+			set_bit(SOCK_NOSPACE, &transport->sock->flags);
+			transport->inet->sk_write_pending++;	/* decremented again in xs_nospace_callback() */
+			/* ...and wait for more buffer space */
+			xprt_wait_for_buffer_space(task, xs_nospace_callback);
+		}
+	} else {
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+		ret = -ENOTCONN;
+	}
+
+	spin_unlock_bh(&xprt->transport_lock);
+	return ret;
+}
+
+/*
+ * Construct a stream transport record marker in @buf.
+ */
+static inline void xs_encode_stream_record_marker(struct xdr_buf *buf)
+{
+	u32 reclen = buf->len - sizeof(rpc_fraghdr);	/* record length excludes the marker itself */
+	rpc_fraghdr *base = buf->head[0].iov_base;	/* marker is written at the start of the head iovec */
+	*base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen);	/* flag OR'ed with the length, stored big-endian */
+}
+
+/**
+ * xs_local_send_request - write an RPC request to an AF_LOCAL socket
+ * @task: RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ *        0:	The request has been sent
+ *   EAGAIN:	The socket was blocked, please call again later to
+ *		complete the request
+ * ENOTCONN:	Caller needs to invoke connect logic then call again
+ *    other:	Some other error occurred, the request was not sent
+ */
+static int xs_local_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct xdr_buf *xdr = &req->rq_snd_buf;
+	int status;
+
+	xs_encode_stream_record_marker(&req->rq_snd_buf);
+
+	xs_pktdump("packet data:",
+			req->rq_svec->iov_base, req->rq_svec->iov_len);
+
+	status = xs_sendpages(transport->sock, NULL, 0,
+						xdr, req->rq_bytes_sent);
+	dprintk("RPC:       %s(%u) = %d\n",
+			__func__, xdr->len - req->rq_bytes_sent, status);
+	if (likely(status >= 0)) {
+		req->rq_bytes_sent += status;
+		req->rq_xmit_bytes_sent += status;
+		if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+			req->rq_bytes_sent = 0;	/* whole request sent: reset the resume offset */
+			return 0;
+		}
+		status = -EAGAIN;	/* short send: handled like a blocked socket below */
+	}
+
+	switch (status) {
+	case -EAGAIN:
+		status = xs_nospace(task);
+		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);	/* fall through */
+	case -EPIPE:
+		xs_close(xprt);
+		status = -ENOTCONN;	/* every error other than -EAGAIN is reported as -ENOTCONN */
+	}
+
+	return status;
+}
+
+/**
+ * xs_udp_send_request - write an RPC request to a UDP socket
+ * @task: address of RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ *        0:	The request has been sent
+ *   EAGAIN:	The socket was blocked, please call again later to
+ *		complete the request
+ * ENOTCONN:	Caller needs to invoke connect logic then call again
+ *    other:	Some other error occurred, the request was not sent
+ */
+static int xs_udp_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct xdr_buf *xdr = &req->rq_snd_buf;
+	int status;
+
+	xs_pktdump("packet data:",
+				req->rq_svec->iov_base,
+				req->rq_svec->iov_len);
+
+	if (!xprt_bound(xprt))	/* no send is attempted on an unbound transport */
+		return -ENOTCONN;
+	status = xs_sendpages(transport->sock,
+			      xs_addr(xprt),
+			      xprt->addrlen, xdr,
+			      req->rq_bytes_sent);
+
+	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
+			xdr->len - req->rq_bytes_sent, status);
+
+	if (status >= 0) {
+		req->rq_xmit_bytes_sent += status;
+		if (status >= req->rq_slen)	/* the whole request went out in this call */
+			return 0;
+		/* Still some bytes left; set up for a retry later. */
+		status = -EAGAIN;
+	}
+
+	switch (status) {
+	case -ENOTSOCK:
+		status = -ENOTCONN;
+		/* Should we call xs_close() here? */
+		break;
+	case -EAGAIN:
+		status = xs_nospace(task);
+		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);	/* fall through */
+	case -ENETUNREACH:
+	case -EPIPE:
+	case -ECONNREFUSED:
+		/* When the server has died, an ICMP port unreachable message
+		 * prompts ECONNREFUSED. */
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);	/* status itself is returned unchanged */
+	}
+
+	return status;
+}
+
+/**
+ * xs_tcp_shutdown - gracefully shut down a TCP socket
+ * @xprt: transport
+ *
+ * Initiates a graceful shutdown of the TCP socket by calling the
+ * equivalent of shutdown(SHUT_WR);
+ */
+static void xs_tcp_shutdown(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct socket *sock = transport->sock;
+
+	if (sock != NULL)	/* does nothing when the transport has no socket */
+		kernel_sock_shutdown(sock, SHUT_WR);
+}
+
+/**
+ * xs_tcp_send_request - write an RPC request to a TCP socket
+ * @task: address of RPC task that manages the state of an RPC request
+ *
+ * Return values:
+ *        0:	The request has been sent
+ *   EAGAIN:	The socket was blocked, please call again later to
+ *		complete the request
+ * ENOTCONN:	Caller needs to invoke connect logic then call again
+ *    other:	Some other error occurred, the request was not sent
+ *
+ * XXX: In the case of soft timeouts, should we eventually give up
+ *	if sendmsg is not able to make progress?
+ */
+static int xs_tcp_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct xdr_buf *xdr = &req->rq_snd_buf;
+	int status;
+
+	xs_encode_stream_record_marker(&req->rq_snd_buf);
+
+	xs_pktdump("packet data:",
+				req->rq_svec->iov_base,
+				req->rq_svec->iov_len);
+
+	/* Continue transmitting the packet/record. We must be careful
+	 * to cope with writespace callbacks arriving _after_ we have
+	 * called sendmsg(). */
+	while (1) {
+		status = xs_sendpages(transport->sock,
+					NULL, 0, xdr, req->rq_bytes_sent);
+
+		dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
+				xdr->len - req->rq_bytes_sent, status);
+
+		if (unlikely(status < 0))	/* error: handled by the switch below */
+			break;
+
+		/* If we've sent the entire packet, immediately
+		 * reset the count of bytes sent. */
+		req->rq_bytes_sent += status;
+		req->rq_xmit_bytes_sent += status;
+		if (likely(req->rq_bytes_sent >= req->rq_slen)) {
+			req->rq_bytes_sent = 0;
+			return 0;
+		}
+
+		if (status != 0)	/* made progress: try to send the rest right away */
+			continue;
+		status = -EAGAIN;	/* zero bytes sent: treat as a blocked socket */
+		break;
+	}
+
+	switch (status) {
+	case -ENOTSOCK:
+		status = -ENOTCONN;
+		/* Should we call xs_close() here? */
+		break;
+	case -EAGAIN:
+		status = xs_nospace(task);
+		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);	/* fall through */
+	case -ECONNRESET:
+	case -EPIPE:
+		xs_tcp_shutdown(xprt);	/* fall through */
+	case -ECONNREFUSED:
+	case -ENOTCONN:
+		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);	/* status itself is returned unchanged */
+	}
+
+	return status;
+}
+
+/**
+ * xs_tcp_release_xprt - release the transport after a TCP transmission
+ * @xprt: transport
+ * @task: rpc task giving up the transport
+ *
+ * Only acts when @task is the current xprt->snd_task.  If the task's
+ * request was partially transmitted (some, but not all, of the send
+ * buffer went out), XPRT_CLOSE_WAIT is set on the task's transport so
+ * that the socket can be reset rather than confusing the server with a
+ * truncated record.  The transport is then released.
+ */
+static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	if (task != xprt->snd_task)
+		return;
+
+	if (task != NULL) {
+		struct rpc_rqst *rqst = task->tk_rqstp;
+
+		if (rqst->rq_bytes_sent != 0 &&
+		    rqst->rq_bytes_sent != rqst->rq_snd_buf.len)
+			set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state);
+	}
+
+	xprt_release_xprt(xprt, task);
+}
+
+/*
+ * Remember the socket's current callbacks in the transport so that
+ * xs_restore_old_callbacks() can put them back later.
+ */
+static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
+{
+	transport->old_error_report = sk->sk_error_report;
+	transport->old_write_space = sk->sk_write_space;
+	transport->old_state_change = sk->sk_state_change;
+	transport->old_data_ready = sk->sk_data_ready;
+}
+
+/*
+ * Reinstall on the socket the callbacks previously stashed in the
+ * transport by xs_save_old_callbacks().
+ */
+static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk)
+{
+	sk->sk_error_report = transport->old_error_report;
+	sk->sk_write_space = transport->old_write_space;
+	sk->sk_state_change = transport->old_state_change;
+	sk->sk_data_ready = transport->old_data_ready;
+}
+
+/*
+ * Detach the transport from its socket and release the socket.
+ *
+ * Does nothing if the transport has no struct sock attached.  Otherwise
+ * the transport's sock/inet pointers and the socket's sk_user_data are
+ * cleared and the original socket callbacks are restored, all under the
+ * write side of sk_callback_lock.
+ */
+static void xs_reset_transport(struct sock_xprt *transport)
+{
+	struct socket *sock = transport->sock;
+	struct sock *sk = transport->inet;
+
+	if (sk == NULL)
+		return;
+
+	/* Forget the bound source port */
+	transport->srcport = 0;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	transport->inet = NULL;
+	transport->sock = NULL;
+
+	/* The callbacks in this file bail out when sk_user_data is NULL */
+	sk->sk_user_data = NULL;
+
+	xs_restore_old_callbacks(transport, sk);
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	sk->sk_no_check = 0;
+
+	/* Uses the pointer captured before transport->sock was cleared */
+	sock_release(sock);
+}
+
+/**
+ * xs_close - close a socket
+ * @xprt: transport
+ *
+ * This is used when all requests are complete; ie, no DRC state remains
+ * on the server we want to save.
+ *
+ * The caller _must_ be holding XPRT_LOCKED in order to avoid issues with
+ * xs_reset_transport() zeroing the socket from underneath a writer.
+ */
+static void xs_close(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	dprintk("RPC:       xs_close xprt %p\n", xprt);
+
+	/* Drop the socket and reset the reconnect back-off */
+	xs_reset_transport(transport);
+	xprt->reestablish_timeout = 0;
+
+	/* Clear the close-related state bits, bracketed by memory barriers */
+	smp_mb__before_clear_bit();
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	clear_bit(XPRT_CLOSING, &xprt->state);
+	smp_mb__after_clear_bit();
+	xprt_disconnect_done(xprt);
+}
+
+/*
+ * Close a TCP transport: do a full xs_close() if XPRT_CONNECTION_CLOSE
+ * was set (the bit is cleared as a side effect of the test), otherwise
+ * only start a shutdown of the socket via xs_tcp_shutdown().
+ */
+static void xs_tcp_close(struct rpc_xprt *xprt)
+{
+	if (!test_and_clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state)) {
+		xs_tcp_shutdown(xprt);
+		return;
+	}
+
+	xs_close(xprt);
+}
+
+/**
+ * xs_destroy - prepare to shutdown a transport
+ * @xprt: doomed transport
+ *
+ * Cancels the connect worker (waiting for it if running), closes the
+ * socket, frees the peer address strings and the transport itself, and
+ * drops a reference on this module.
+ */
+static void xs_destroy(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	dprintk("RPC:       xs_destroy xprt %p\n", xprt);
+
+	/* Make sure no connect work is pending or running before teardown */
+	cancel_delayed_work_sync(&transport->connect_worker);
+
+	xs_close(xprt);
+	xs_free_peer_addresses(xprt);
+	xprt_free(xprt);
+	module_put(THIS_MODULE);
+}
+
+/* Return the rpc_xprt stored in the socket's sk_user_data (may be NULL). */
+static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
+{
+	struct rpc_xprt *owner = (struct rpc_xprt *) sk->sk_user_data;
+
+	return owner;
+}
+
+/*
+ * Copy the payload of @skb that follows the record marker into @xdr.
+ *
+ * Returns 0 when the whole payload was copied, -1 if the copy reported
+ * an error or if bytes were left over in the skb.
+ */
+static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
+{
+	struct xdr_skb_reader reader = {
+		.skb		= skb,
+		.offset		= sizeof(rpc_fraghdr),
+		.count		= skb->len - sizeof(rpc_fraghdr),
+	};
+
+	if (xdr_partial_copy_from_skb(xdr, 0, &reader, xdr_skb_read_bits) < 0)
+		return -1;
+
+	/* Anything left unread means the reply did not fit */
+	return reader.count ? -1 : 0;
+}
+
+/**
+ * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
+ * @sk: socket with data to read
+ * @len: how much data to read
+ *
+ * Currently this assumes we can read the whole reply in a single gulp.
+ *
+ * Runs with sk_callback_lock read-held; the request lookup and
+ * completion are done under xprt->transport_lock.
+ */
+static void xs_local_data_ready(struct sock *sk, int len)
+{
+	struct rpc_task *task;
+	struct rpc_xprt *xprt;
+	struct rpc_rqst *rovr;
+	struct sk_buff *skb;
+	int err, repsize, copied;
+	u32 _xid;
+	__be32 *xp;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	dprintk("RPC:       %s...\n", __func__);
+	/* No transport attached to this socket (any more): ignore */
+	xprt = xprt_from_sock(sk);
+	if (xprt == NULL)
+		goto out;
+
+	skb = skb_recv_datagram(sk, 0, 1, &err);
+	if (skb == NULL)
+		goto out;
+
+	if (xprt->shutdown)
+		goto dropit;
+
+	/* Payload size excluding the record marker; must hold an XID */
+	repsize = skb->len - sizeof(rpc_fraghdr);
+	if (repsize < 4) {
+		dprintk("RPC:       impossible RPC reply size %d\n", repsize);
+		goto dropit;
+	}
+
+	/* Copy the XID from the skb... */
+	xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
+	if (xp == NULL)
+		goto dropit;
+
+	/* Look up and lock the request corresponding to the given XID */
+	spin_lock(&xprt->transport_lock);
+	rovr = xprt_lookup_rqst(xprt, *xp);
+	if (!rovr)
+		goto out_unlock;
+	task = rovr->rq_task;
+
+	/* Report at most the receive buffer's capacity as copied */
+	copied = rovr->rq_private_buf.buflen;
+	if (copied > repsize)
+		copied = repsize;
+
+	if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
+		dprintk("RPC:       sk_buff copy failed\n");
+		goto out_unlock;
+	}
+
+	xprt_complete_rqst(task, copied);
+
+ out_unlock:
+	spin_unlock(&xprt->transport_lock);
+ dropit:
+	skb_free_datagram(sk, skb);
+ out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_udp_data_ready - "data ready" callback for UDP sockets
+ * @sk: socket with data to read
+ * @len: how much data to read
+ *
+ * Receives one datagram, matches it to a pending request by XID, copies
+ * (and checksums) it into that request's receive buffer and completes
+ * the request.  Runs with sk_callback_lock read-held.
+ */
+static void xs_udp_data_ready(struct sock *sk, int len)
+{
+	struct rpc_task *task;
+	struct rpc_xprt *xprt;
+	struct rpc_rqst *rovr;
+	struct sk_buff *skb;
+	int err, repsize, copied;
+	u32 _xid;
+	__be32 *xp;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	dprintk("RPC:       xs_udp_data_ready...\n");
+	/* No transport attached to this socket (any more): ignore */
+	if (!(xprt = xprt_from_sock(sk)))
+		goto out;
+
+	if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
+		goto out;
+
+	if (xprt->shutdown)
+		goto dropit;
+
+	/* Payload size excluding the UDP header; must hold an XID */
+	repsize = skb->len - sizeof(struct udphdr);
+	if (repsize < 4) {
+		dprintk("RPC:       impossible RPC reply size %d!\n", repsize);
+		goto dropit;
+	}
+
+	/* Copy the XID from the skb... */
+	xp = skb_header_pointer(skb, sizeof(struct udphdr),
+				sizeof(_xid), &_xid);
+	if (xp == NULL)
+		goto dropit;
+
+	/* Look up and lock the request corresponding to the given XID */
+	spin_lock(&xprt->transport_lock);
+	rovr = xprt_lookup_rqst(xprt, *xp);
+	if (!rovr)
+		goto out_unlock;
+	task = rovr->rq_task;
+
+	/* Report at most the receive buffer's capacity as copied */
+	if ((copied = rovr->rq_private_buf.buflen) > repsize)
+		copied = repsize;
+
+	/* Suck it into the iovec, verify checksum if not done by hw. */
+	if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
+		UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS);
+		goto out_unlock;
+	}
+
+	UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS);
+
+	/* Something worked... */
+	dst_confirm(skb_dst(skb));
+
+	xprt_adjust_cwnd(task, copied);
+	xprt_complete_rqst(task, copied);
+
+ out_unlock:
+	spin_unlock(&xprt->transport_lock);
+ dropit:
+	skb_free_datagram(sk, skb);
+ out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Read (possibly the remainder of) the record fragment header from the
+ * skb into transport->tcp_fraghdr.  The header may arrive split across
+ * several calls; tcp_offset tracks how much of it has been read so far.
+ * Once complete, decode the "last fragment" flag and the fragment length
+ * and leave the fragment-header state.
+ */
+static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	size_t len, used;
+	char *p;
+
+	p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset;
+	len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset;
+	used = xdr_skb_read_bits(desc, p, len);
+	transport->tcp_offset += used;
+	/* Header still incomplete: wait for more data */
+	if (used != len)
+		return;
+
+	transport->tcp_reclen = ntohl(transport->tcp_fraghdr);
+	if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT)
+		transport->tcp_flags |= TCP_RCV_LAST_FRAG;
+	else
+		transport->tcp_flags &= ~TCP_RCV_LAST_FRAG;
+	transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK;
+
+	/* From here on tcp_offset counts bytes within the fragment body */
+	transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR;
+	transport->tcp_offset = 0;
+
+	/* Sanity check of the record length */
+	/* NOTE(review): 8 presumably = XID (4) + call/reply word (4) - confirm */
+	if (unlikely(transport->tcp_reclen < 8)) {
+		dprintk("RPC:       invalid TCP record fragment length\n");
+		xprt_force_disconnect(xprt);
+		return;
+	}
+	dprintk("RPC:       reading TCP record fragment of length %d\n",
+			transport->tcp_reclen);
+}
+
+/*
+ * If the current fragment has been fully consumed, arrange for the next
+ * fragment header to be read.  When that fragment was also the last one
+ * of the record, reset the state so the next record starts with its XID.
+ */
+static void xs_tcp_check_fraghdr(struct sock_xprt *transport)
+{
+	/* Still inside the current fragment: nothing to do */
+	if (transport->tcp_offset != transport->tcp_reclen)
+		return;
+
+	transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR;
+	transport->tcp_offset = 0;
+
+	if (!(transport->tcp_flags & TCP_RCV_LAST_FRAG))
+		return;
+
+	/* End of record */
+	transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+	transport->tcp_flags |= TCP_RCV_COPY_XID;
+	transport->tcp_copied = 0;
+}
+
+/*
+ * Read (possibly the remainder of) the XID into transport->tcp_xid.
+ * Once the XID is complete, move on to reading the call/reply word and
+ * account for the 4 XID bytes in tcp_copied.
+ */
+static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc)
+{
+	size_t len, used;
+	char *p;
+
+	len = sizeof(transport->tcp_xid) - transport->tcp_offset;
+	dprintk("RPC:       reading XID (%Zu bytes)\n", len);
+	p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
+	used = xdr_skb_read_bits(desc, p, len);
+	transport->tcp_offset += used;
+	/* XID still incomplete: wait for more data */
+	if (used != len)
+		return;
+	transport->tcp_flags &= ~TCP_RCV_COPY_XID;
+	transport->tcp_flags |= TCP_RCV_READ_CALLDIR;
+	transport->tcp_copied = 4;
+	dprintk("RPC:       reading %s XID %08x\n",
+			(transport->tcp_flags & TCP_RPC_REPLY) ? "reply for"
+							      : "request with",
+			ntohl(transport->tcp_xid));
+	xs_tcp_check_fraghdr(transport);
+}
+
+/*
+ * Read (possibly the remainder of) the call/reply direction word into
+ * transport->tcp_calldir and set the receive flags according to whether
+ * the message is an RPC_REPLY or an RPC_CALL.  Any other value forces a
+ * disconnect of the transport.
+ */
+static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
+				       struct xdr_skb_reader *desc)
+{
+	size_t len, used;
+	u32 offset;
+	char *p;
+
+	/*
+	 * We want transport->tcp_offset to be 8 at the end of this routine
+	 * (4 bytes for the xid and 4 bytes for the call/reply flag).
+	 * When this function is called for the first time,
+	 * transport->tcp_offset is 4 (after having already read the xid).
+	 */
+	offset = transport->tcp_offset - sizeof(transport->tcp_xid);
+	len = sizeof(transport->tcp_calldir) - offset;
+	dprintk("RPC:       reading CALL/REPLY flag (%Zu bytes)\n", len);
+	p = ((char *) &transport->tcp_calldir) + offset;
+	used = xdr_skb_read_bits(desc, p, len);
+	transport->tcp_offset += used;
+	/* Direction word still incomplete: wait for more data */
+	if (used != len)
+		return;
+	transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR;
+	/*
+	 * We don't yet have the XDR buffer, so we will write the calldir
+	 * out after we get the buffer from the 'struct rpc_rqst'
+	 */
+	switch (ntohl(transport->tcp_calldir)) {
+	case RPC_REPLY:
+		transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
+		transport->tcp_flags |= TCP_RCV_COPY_DATA;
+		transport->tcp_flags |= TCP_RPC_REPLY;
+		break;
+	case RPC_CALL:
+		transport->tcp_flags |= TCP_RCV_COPY_CALLDIR;
+		transport->tcp_flags |= TCP_RCV_COPY_DATA;
+		transport->tcp_flags &= ~TCP_RPC_REPLY;
+		break;
+	default:
+		dprintk("RPC:       invalid request message type\n");
+		xprt_force_disconnect(&transport->xprt);
+	}
+	xs_tcp_check_fraghdr(transport);
+}
+
+/*
+ * Copy record data from the skb described by @desc into @req's private
+ * receive buffer, never reading past the end of the current fragment.
+ *
+ * On the first call for a message the saved call/reply word is written
+ * into the buffer first.  tcp_copied and tcp_offset are advanced by the
+ * number of bytes copied.  TCP_RCV_COPY_DATA is cleared when the copy
+ * comes up short, when the receive buffer is full, or when the last
+ * fragment has been completely read.
+ */
+static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
+				     struct xdr_skb_reader *desc,
+				     struct rpc_rqst *req)
+{
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct xdr_buf *rcvbuf;
+	size_t len;
+	ssize_t r;
+
+	rcvbuf = &req->rq_private_buf;
+
+	if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) {
+		/*
+		 * Save the RPC direction in the XDR buffer
+		 */
+		memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied,
+			&transport->tcp_calldir,
+			sizeof(transport->tcp_calldir));
+		transport->tcp_copied += sizeof(transport->tcp_calldir);
+		transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR;
+	}
+
+	len = desc->count;
+	if (len > transport->tcp_reclen - transport->tcp_offset) {
+		struct xdr_skb_reader my_desc;
+
+		/*
+		 * The skb holds more than the rest of this fragment: copy
+		 * through a private descriptor limited to the fragment,
+		 * then account for what was consumed in the caller's one.
+		 */
+		len = transport->tcp_reclen - transport->tcp_offset;
+		memcpy(&my_desc, desc, sizeof(my_desc));
+		my_desc.count = len;
+		r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
+					  &my_desc, xdr_skb_read_bits);
+		/*
+		 * r is signed; a non-positive result means nothing was
+		 * consumed from the skb, so the caller's descriptor must
+		 * be left untouched rather than adjusted by an errno.
+		 */
+		if (r > 0) {
+			desc->count -= r;
+			desc->offset += r;
+		}
+	} else
+		r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
+					  desc, xdr_skb_read_bits);
+
+	if (r > 0) {
+		transport->tcp_copied += r;
+		transport->tcp_offset += r;
+	}
+	if (r != len) {
+		/* Error when copying to the receive buffer,
+		 * usually because we weren't able to allocate
+		 * additional buffer pages. All we can do now
+		 * is turn off TCP_RCV_COPY_DATA, so the request
+		 * will not receive any additional updates,
+		 * and time out.
+		 * Any remaining data from this record will
+		 * be discarded.
+		 */
+		transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+		dprintk("RPC:       XID %08x truncated request\n",
+				ntohl(transport->tcp_xid));
+		dprintk("RPC:       xprt = %p, tcp_copied = %lu, "
+				"tcp_offset = %u, tcp_reclen = %u\n",
+				xprt, transport->tcp_copied,
+				transport->tcp_offset, transport->tcp_reclen);
+		return;
+	}
+
+	dprintk("RPC:       XID %08x read %Zd bytes\n",
+			ntohl(transport->tcp_xid), r);
+	dprintk("RPC:       xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
+			"tcp_reclen = %u\n", xprt, transport->tcp_copied,
+			transport->tcp_offset, transport->tcp_reclen);
+
+	/* Stop copying once the buffer is full or the record is complete */
+	if (transport->tcp_copied == req->rq_private_buf.buflen)
+		transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+	else if (transport->tcp_offset == transport->tcp_reclen) {
+		if (transport->tcp_flags & TCP_RCV_LAST_FRAG)
+			transport->tcp_flags &= ~TCP_RCV_COPY_DATA;
+	}
+}
+
+/*
+ * Finds the request corresponding to the RPC xid and invokes the common
+ * tcp read code to read the data.
+ *
+ * Returns 0 on success, or -1 if no pending request matches the XID.
+ * The lookup, the copy and the completion all happen under
+ * xprt->transport_lock.
+ */
+static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
+				    struct xdr_skb_reader *desc)
+{
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct rpc_rqst *req;
+
+	dprintk("RPC:       read reply XID %08x\n", ntohl(transport->tcp_xid));
+
+	/* Find and lock the request corresponding to this xid */
+	spin_lock(&xprt->transport_lock);
+	req = xprt_lookup_rqst(xprt, transport->tcp_xid);
+	if (!req) {
+		dprintk("RPC:       XID %08x request not found!\n",
+				ntohl(transport->tcp_xid));
+		spin_unlock(&xprt->transport_lock);
+		return -1;
+	}
+
+	xs_tcp_read_common(xprt, desc, req);
+
+	/* COPY_DATA cleared means no more data will be copied: complete */
+	if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
+		xprt_complete_rqst(req->rq_task, transport->tcp_copied);
+
+	spin_unlock(&xprt->transport_lock);
+	return 0;
+}
+
+#if defined(CONFIG_NFS_V4_1)
+/*
+ * Obtains an rpc_rqst previously allocated and invokes the common
+ * tcp read code to read the data.  The result is placed in the callback
+ * queue.
+ * If we're unable to obtain the rpc_rqst we schedule the closing of the
+ * connection and return -1.
+ *
+ * Returns 0 otherwise.
+ */
+static inline int xs_tcp_read_callback(struct rpc_xprt *xprt,
+				       struct xdr_skb_reader *desc)
+{
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct rpc_rqst *req;
+
+	req = xprt_alloc_bc_request(xprt);
+	if (req == NULL) {
+		printk(KERN_WARNING "Callback slot table overflowed\n");
+		xprt_force_disconnect(xprt);
+		return -1;
+	}
+
+	req->rq_xid = transport->tcp_xid;
+	dprintk("RPC:       read callback  XID %08x\n", ntohl(req->rq_xid));
+	xs_tcp_read_common(xprt, desc, req);
+
+	/*
+	 * Record the received length before the request can become
+	 * visible to the callback service: once it is on sv_cb_list and
+	 * the waiter has been woken, the request may be consumed at any
+	 * time.
+	 */
+	req->rq_private_buf.len = transport->tcp_copied;
+
+	if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
+		struct svc_serv *bc_serv = xprt->bc_serv;
+
+		/*
+		 * Add callback request to callback list.  The callback
+		 * service sleeps on the sv_cb_waitq waiting for new
+		 * requests.  Wake it up after enqueuing the request.
+		 */
+		dprintk("RPC:       add callback request to list\n");
+		spin_lock(&bc_serv->sv_cb_lock);
+		list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
+		spin_unlock(&bc_serv->sv_cb_lock);
+		wake_up(&bc_serv->sv_cb_waitq);
+	}
+
+	return 0;
+}
+
+/*
+ * Dispatch incoming record data: replies go to xs_tcp_read_reply(),
+ * everything else is handled as a backchannel call.
+ */
+static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
+					struct xdr_skb_reader *desc)
+{
+	struct sock_xprt *sx = container_of(xprt, struct sock_xprt, xprt);
+
+	if (sx->tcp_flags & TCP_RPC_REPLY)
+		return xs_tcp_read_reply(xprt, desc);
+
+	return xs_tcp_read_callback(xprt, desc);
+}
+#else
+/* Without CONFIG_NFS_V4_1 all incoming record data is handled as a reply. */
+static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
+					struct xdr_skb_reader *desc)
+{
+	int ret = xs_tcp_read_reply(xprt, desc);
+
+	return ret;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+/*
+ * Read record data off the transport; the data may belong to either an
+ * RPC_CALL or an RPC_REPLY, and the actual work is delegated to
+ * _xs_tcp_read_data().  On success, check whether the fragment has been
+ * fully consumed; on failure, stop copying data for this record.
+ */
+static void xs_tcp_read_data(struct rpc_xprt *xprt,
+				    struct xdr_skb_reader *desc)
+{
+	struct sock_xprt *sx = container_of(xprt, struct sock_xprt, xprt);
+
+	if (_xs_tcp_read_data(xprt, desc) != 0) {
+		/*
+		 * Request handling is what the transport_lock protects;
+		 * updating tcp_flags does not require holding it.
+		 */
+		sx->tcp_flags &= ~TCP_RCV_COPY_DATA;
+		return;
+	}
+
+	xs_tcp_check_fraghdr(sx);
+}
+
+/*
+ * Skip over data in the current fragment without copying it: consume
+ * whichever is smaller, the rest of the fragment or the rest of the
+ * skb data described by @desc.
+ */
+static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc)
+{
+	size_t len = transport->tcp_reclen - transport->tcp_offset;
+
+	len = (len > desc->count) ? desc->count : len;
+
+	transport->tcp_offset += len;
+	desc->offset += len;
+	desc->count -= len;
+
+	dprintk("RPC:       discarded %Zu bytes\n", len);
+	xs_tcp_check_fraghdr(transport);
+}
+
+/*
+ * Receive actor passed to tcp_read_sock(): feeds @len bytes of @skb,
+ * starting at @offset, through the TCP record state machine.  On each
+ * iteration the first pending state in tcp_flags decides which helper
+ * consumes data; the loop runs until the descriptor is empty.
+ *
+ * Returns the number of bytes consumed.
+ */
+static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len)
+{
+	struct rpc_xprt *xprt = rd_desc->arg.data;
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct xdr_skb_reader desc = {
+		.skb	= skb,
+		.offset	= offset,
+		.count	= len,
+	};
+
+	dprintk("RPC:       xs_tcp_data_recv started\n");
+	do {
+		/* Read in a new fragment marker if necessary */
+		/* Can we ever really expect to get completely empty fragments? */
+		if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) {
+			xs_tcp_read_fraghdr(xprt, &desc);
+			continue;
+		}
+		/* Read in the xid if necessary */
+		if (transport->tcp_flags & TCP_RCV_COPY_XID) {
+			xs_tcp_read_xid(transport, &desc);
+			continue;
+		}
+		/* Read in the call/reply flag */
+		if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) {
+			xs_tcp_read_calldir(transport, &desc);
+			continue;
+		}
+		/* Read in the request data */
+		if (transport->tcp_flags & TCP_RCV_COPY_DATA) {
+			xs_tcp_read_data(xprt, &desc);
+			continue;
+		}
+		/* Skip over any trailing bytes on short reads */
+		xs_tcp_read_discard(transport, &desc);
+	} while (desc.count);
+	dprintk("RPC:       xs_tcp_data_recv done\n");
+	return len - desc.count;
+}
+
+/**
+ * xs_tcp_data_ready - "data ready" callback for TCP sockets
+ * @sk: socket with data to read
+ * @bytes: how much data to read
+ *
+ * Drains the socket through tcp_read_sock() with xs_tcp_data_recv() as
+ * the receive actor.  Runs with sk_callback_lock read-held.
+ */
+static void xs_tcp_data_ready(struct sock *sk, int bytes)
+{
+	struct rpc_xprt *xprt;
+	read_descriptor_t rd_desc;
+	int read;
+
+	dprintk("RPC:       xs_tcp_data_ready...\n");
+
+	read_lock_bh(&sk->sk_callback_lock);
+	/* No transport attached to this socket (any more): ignore */
+	if (!(xprt = xprt_from_sock(sk)))
+		goto out;
+	if (xprt->shutdown)
+		goto out;
+
+	/* Any data means we had a useful conversation, so
+	 * we don't need to delay the next reconnect
+	 */
+	if (xprt->reestablish_timeout)
+		xprt->reestablish_timeout = 0;
+
+	/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
+	rd_desc.arg.data = xprt;
+	do {
+		/* Re-arm the per-call count before every read */
+		rd_desc.count = 65536;
+		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+	} while (read > 0);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Do the equivalent of linger/linger2 handling for dealing with
+ * broken servers that don't close the socket in a timely
+ * fashion
+ *
+ * If the transport is not already marked as connecting, flag
+ * XPRT_CONNECTION_ABORT and queue the connect worker to run after
+ * @timeout on rpciod_workqueue.
+ */
+static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt,
+		unsigned long timeout)
+{
+	struct sock_xprt *transport;
+
+	/* Already connecting: the worker is (or will be) in use */
+	if (xprt_test_and_set_connecting(xprt))
+		return;
+	set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	transport = container_of(xprt, struct sock_xprt, xprt);
+	queue_delayed_work(rpciod_workqueue, &transport->connect_worker,
+			   timeout);
+}
+
+/*
+ * Undo xs_tcp_schedule_linger_timeout(): if a linger timeout is pending
+ * (XPRT_CONNECTION_ABORT set) and the delayed work could be cancelled,
+ * clear the abort flag and the "connecting" state.
+ */
+static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *sx = container_of(xprt, struct sock_xprt, xprt);
+
+	if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state))
+		return;
+	if (!cancel_delayed_work(&sx->connect_worker))
+		return;
+
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	xprt_clear_connecting(xprt);
+}
+
+/*
+ * Clear the XPRT_CLOSE_WAIT and XPRT_CLOSING state bits (bracketed by
+ * memory barriers) and report the disconnect as done.
+ */
+static void xs_sock_mark_closed(struct rpc_xprt *xprt)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+	clear_bit(XPRT_CLOSING, &xprt->state);
+	smp_mb__after_clear_bit();
+	/* Mark transport as closed and wake up all pending tasks */
+	xprt_disconnect_done(xprt);
+}
+
+/**
+ * xs_tcp_state_change - callback to handle TCP socket state changes
+ * @sk: socket whose state has changed
+ *
+ * Translates the socket's new TCP state into transport state bits and
+ * reconnect back-off settings.  Runs with sk_callback_lock read-held.
+ */
+static void xs_tcp_state_change(struct sock *sk)
+{
+	struct rpc_xprt *xprt;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	/* No transport attached to this socket (any more): ignore */
+	if (!(xprt = xprt_from_sock(sk)))
+		goto out;
+	dprintk("RPC:       xs_tcp_state_change client %p...\n", xprt);
+	dprintk("RPC:       state %x conn %d dead %d zapped %d sk_shutdown %d\n",
+			sk->sk_state, xprt_connected(xprt),
+			sock_flag(sk, SOCK_DEAD),
+			sock_flag(sk, SOCK_ZAPPED),
+			sk->sk_shutdown);
+
+	switch (sk->sk_state) {
+	case TCP_ESTABLISHED:
+		spin_lock(&xprt->transport_lock);
+		/* Only act on the transition to "connected" */
+		if (!xprt_test_and_set_connected(xprt)) {
+			struct sock_xprt *transport = container_of(xprt,
+					struct sock_xprt, xprt);
+
+			/* Reset TCP record info */
+			transport->tcp_offset = 0;
+			transport->tcp_reclen = 0;
+			transport->tcp_copied = 0;
+			transport->tcp_flags =
+				TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
+
+			xprt_wake_pending_tasks(xprt, -EAGAIN);
+		}
+		spin_unlock(&xprt->transport_lock);
+		break;
+	case TCP_FIN_WAIT1:
+		/* The client initiated a shutdown of the socket */
+		xprt->connect_cookie++;
+		xprt->reestablish_timeout = 0;
+		set_bit(XPRT_CLOSING, &xprt->state);
+		smp_mb__before_clear_bit();
+		clear_bit(XPRT_CONNECTED, &xprt->state);
+		clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+		smp_mb__after_clear_bit();
+		xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
+		break;
+	case TCP_CLOSE_WAIT:
+		/* The server initiated a shutdown of the socket */
+		xprt_force_disconnect(xprt);
+		xprt->connect_cookie++;
+		/* fall through: apply the reconnect back-off as well */
+	case TCP_CLOSING:
+		/*
+		 * If the server closed down the connection, make sure that
+		 * we back off before reconnecting
+		 */
+		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+		break;
+	case TCP_LAST_ACK:
+		set_bit(XPRT_CLOSING, &xprt->state);
+		xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
+		smp_mb__before_clear_bit();
+		clear_bit(XPRT_CONNECTED, &xprt->state);
+		smp_mb__after_clear_bit();
+		break;
+	case TCP_CLOSE:
+		/* Fully closed: no linger timeout needed any more */
+		xs_tcp_cancel_linger_timeout(xprt);
+		xs_sock_mark_closed(xprt);
+	}
+ out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_error_report - callback mainly for catching socket errors
+ * @sk: socket
+ *
+ * Logs sk->sk_err and wakes all tasks pending on the transport with
+ * -EAGAIN.  Runs with sk_callback_lock read-held.
+ */
+static void xs_error_report(struct sock *sk)
+{
+	struct rpc_xprt *xprt;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	/* No transport attached to this socket (any more): ignore */
+	if (!(xprt = xprt_from_sock(sk)))
+		goto out;
+	dprintk("RPC:       %s client %p...\n"
+			"RPC:       error %d\n",
+			__func__, xprt, sk->sk_err);
+	xprt_wake_pending_tasks(xprt, -EAGAIN);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Common write-space handling.  Clears SOCK_NOSPACE on the socket and,
+ * if SOCK_ASYNC_NOSPACE was set (it is cleared by the test), notifies
+ * the transport that write space is available.
+ */
+static void xs_write_space(struct sock *sk)
+{
+	struct socket *sock = sk->sk_socket;
+	struct rpc_xprt *xprt;
+
+	if (unlikely(sock == NULL))
+		return;
+	clear_bit(SOCK_NOSPACE, &sock->flags);
+
+	xprt = xprt_from_sock(sk);
+	if (unlikely(xprt == NULL))
+		return;
+
+	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
+		return;
+
+	xprt_write_space(xprt);
+}
+
+/**
+ * xs_udp_write_space - write-space callback for datagram sockets
+ * @sk: socket whose state has changed
+ *
+ * Invoked when more output buffer space becomes available on @sk.
+ * Writers are only woken once the socket is writeable, so that they can
+ * make "significant" progress instead of thrashing kernel_sendmsg with
+ * lots of small requests.
+ */
+static void xs_udp_write_space(struct sock *sk)
+{
+	read_lock_bh(&sk->sk_callback_lock);
+
+	/* same test as net/core/sock.c:sock_def_write_space */
+	if (sock_writeable(sk)) {
+		xs_write_space(sk);
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/**
+ * xs_tcp_write_space - write-space callback for stream sockets
+ * @sk: socket whose state has changed
+ *
+ * Invoked when more output buffer space becomes available on @sk.
+ * Writers are only woken once the free stream space reaches the
+ * socket's minimum, so that they can make "significant" progress
+ * instead of thrashing kernel_sendmsg with lots of small requests.
+ */
+static void xs_tcp_write_space(struct sock *sk)
+{
+	read_lock_bh(&sk->sk_callback_lock);
+
+	/* same test as net/core/stream.c:sk_stream_write_space */
+	if (!(sk_stream_wspace(sk) < sk_stream_min_wspace(sk))) {
+		xs_write_space(sk);
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Apply the transport's configured send/receive sizes to its socket.
+ * Each non-zero size is scaled by twice the number of request slots and
+ * locked in via sk_userlocks.  After changing the send buffer size the
+ * socket's write-space callback is invoked.
+ */
+static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct sock *sk = transport->inet;
+
+	if (transport->rcvsize) {
+		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+		sk->sk_rcvbuf = transport->rcvsize * xprt->max_reqs * 2;
+	}
+	if (transport->sndsize) {
+		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+		sk->sk_sndbuf = transport->sndsize * xprt->max_reqs * 2;
+		/* More send space may now be available: notify writers */
+		sk->sk_write_space(sk);
+	}
+}
+
+/**
+ * xs_udp_set_buffer_size - set send and receive limits
+ * @xprt: generic transport
+ * @sndsize: requested size of send buffer, in bytes
+ * @rcvsize: requested size of receive buffer, in bytes
+ *
+ * Records the requested sizes in the transport (each non-zero request
+ * is padded by 1024 bytes; zero leaves that direction unset) and then
+ * applies them to the socket.
+ */
+static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize)
+{
+	struct sock_xprt *sx = container_of(xprt, struct sock_xprt, xprt);
+
+	sx->sndsize = sndsize ? sndsize + 1024 : 0;
+	sx->rcvsize = rcvsize ? rcvsize + 1024 : 0;
+
+	xs_udp_do_set_buffer_size(xprt);
+}
+
+/**
+ * xs_udp_timer - retransmit-timeout hook for UDP transports
+ * @task: task that timed out
+ *
+ * Feeds -ETIMEDOUT into the congestion window adjustment for @task.
+ */
+static void xs_udp_timer(struct rpc_task *task)
+{
+	const int result = -ETIMEDOUT;
+
+	xprt_adjust_cwnd(task, result);
+}
+
+/*
+ * Pick a random source port in the inclusive range
+ * [xprt_min_resvport, xprt_max_resvport].
+ *
+ * If the range is empty or holds a single port (max <= min), the
+ * minimum port is returned; this also avoids a modulo by zero.
+ */
+static unsigned short xs_get_random_port(void)
+{
+	unsigned short min = xprt_min_resvport, max = xprt_max_resvport;
+	unsigned int range;
+	unsigned short rand;
+
+	if (max <= min)
+		return min;
+
+	/* +1 so that the top port of the range can be chosen too */
+	range = (unsigned int) max - min + 1;
+	rand = (unsigned short) net_random() % range;
+	return rand + min;
+}
+
+/**
+ * xs_set_port - change the port number of the remote endpoint address
+ * @xprt: generic transport
+ * @port: new port number
+ *
+ * Stores @port in the transport's peer address and refreshes the
+ * cached peer port information.
+ */
+static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+	struct sockaddr *peer;
+
+	dprintk("RPC:       setting port for xprt %p to %u\n", xprt, port);
+
+	peer = xs_addr(xprt);
+	rpc_set_port(peer, port);
+	xs_update_peer_port(xprt);
+}
+
+/*
+ * Choose the source port to bind to: the transport's remembered port if
+ * it has one, otherwise a random reserved port when the transport wants
+ * a reserved port, otherwise 0.
+ */
+static unsigned short xs_get_srcport(struct sock_xprt *transport)
+{
+	unsigned short saved = transport->srcport;
+
+	if (saved != 0)
+		return saved;
+	if (!transport->xprt.resvport)
+		return 0;
+	return xs_get_random_port();
+}
+
+/*
+ * Pick the next source port to try after @port.  Any remembered source
+ * port is forgotten.  Without resvport the answer is 0; otherwise ports
+ * are walked downwards, wrapping to xprt_max_resvport once @port is at
+ * or below xprt_min_resvport (or above the maximum).
+ */
+static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
+{
+	if (transport->srcport != 0)
+		transport->srcport = 0;
+
+	if (!transport->xprt.resvport)
+		return 0;
+
+	if (port > xprt_min_resvport && port <= xprt_max_resvport)
+		return port - 1;
+
+	return xprt_max_resvport;
+}
+/*
+ * Bind @sock to the transport's source address, trying a sequence of
+ * source ports.
+ *
+ * Starting from xs_get_srcport(), each -EADDRINUSE failure moves on to
+ * xs_next_srcport(); the search gives up once the port sequence has
+ * wrapped around twice.  With port 0 a single bind attempt is made.
+ * A successful non-zero port is remembered in transport->srcport.
+ *
+ * Returns the result of the last kernel_bind() call.
+ */
+static int xs_bind(struct sock_xprt *transport, struct socket *sock)
+{
+	struct sockaddr_storage myaddr;
+	int err, nloop = 0;
+	unsigned short port = xs_get_srcport(transport);
+	unsigned short last;
+
+	memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
+	do {
+		rpc_set_port((struct sockaddr *)&myaddr, port);
+		err = kernel_bind(sock, (struct sockaddr *)&myaddr,
+				transport->xprt.addrlen);
+		/* Port 0: no port search, whatever the outcome */
+		if (port == 0)
+			break;
+		if (err == 0) {
+			transport->srcport = port;
+			break;
+		}
+		last = port;
+		port = xs_next_srcport(transport, port);
+		/* The next port being higher means the sequence wrapped */
+		if (port > last)
+			nloop++;
+	} while (err == -EADDRINUSE && nloop != 2);
+
+	if (myaddr.ss_family == AF_INET)
+		dprintk("RPC:       %s %pI4:%u: %s (%d)\n", __func__,
+				&((struct sockaddr_in *)&myaddr)->sin_addr,
+				port, err ? "failed" : "ok", err);
+	else
+		dprintk("RPC:       %s %pI6:%u: %s (%d)\n", __func__,
+				&((struct sockaddr_in6 *)&myaddr)->sin6_addr,
+				port, err ? "failed" : "ok", err);
+	return err;
+}
+
+/*
+ * Autobind is not supported on AF_LOCAL sockets, so just mark the
+ * task's transport as bound.
+ */
+static void xs_local_rpcbind(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+
+	xprt_set_bound(xprt);
+}
+
+/* Intentionally empty: setting a port does nothing for this transport. */
+static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port)
+{
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Lock class keys for RPC sockets, one slot per address family so that
+ * each named class has its own key:
+ *   [0] AF_INET, [1] AF_INET6, [2] AF_LOCAL
+ */
+static struct lock_class_key xs_key[3];
+static struct lock_class_key xs_slock_key[3];
+
+/*
+ * Give an AF_LOCAL RPC socket its own lock classes.  The socket must
+ * not be owned by a user context at this point.
+ */
+static inline void xs_reclassify_socketu(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC",
+		&xs_slock_key[2], "sk_lock-AF_LOCAL-RPC", &xs_key[2]);
+}
+
+/*
+ * Assign the AF_INET RPC lock classes (key slot 0) to the socket.  The
+ * socket must not be owned by a user context at this point.
+ */
+static inline void xs_reclassify_socket4(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC",
+		&xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]);
+}
+
+/*
+ * Assign the AF_INET6 RPC lock classes (key slot 1) to the socket.  The
+ * socket must not be owned by a user context at this point.
+ */
+static inline void xs_reclassify_socket6(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
+		&xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
+}
+
+/*
+ * Reclassify the socket's locks according to its address family.
+ * Families other than AF_LOCAL, AF_INET and AF_INET6 are left alone.
+ */
+static inline void xs_reclassify_socket(int family, struct socket *sock)
+{
+	if (family == AF_LOCAL)
+		xs_reclassify_socketu(sock);
+	else if (family == AF_INET)
+		xs_reclassify_socket4(sock);
+	else if (family == AF_INET6)
+		xs_reclassify_socket6(sock);
+}
+#else
+/*
+ * Without CONFIG_DEBUG_LOCK_ALLOC the socket lock reclassification
+ * helpers are empty stubs.
+ */
+static inline void xs_reclassify_socketu(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket4(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket6(struct socket *sock)
+{
+}
+
+static inline void xs_reclassify_socket(int family, struct socket *sock)
+{
+}
+#endif
+
+/*
+ * Create a kernel socket of the given family/type/protocol in the
+ * transport's network namespace, reclassify its locks and bind it to
+ * the transport's source address.
+ *
+ * Returns the new socket, or an ERR_PTR() carrying the error from
+ * socket creation or from xs_bind().  A socket that fails to bind is
+ * released before returning.
+ */
+static struct socket *xs_create_sock(struct rpc_xprt *xprt,
+		struct sock_xprt *transport, int family, int type, int protocol)
+{
+	struct socket *new_sock;
+	int rc;
+
+	rc = __sock_create(xprt->xprt_net, family, type, protocol, &new_sock, 1);
+	if (rc < 0) {
+		dprintk("RPC:       can't create %d transport socket (%d).\n",
+				protocol, -rc);
+		return ERR_PTR(rc);
+	}
+	xs_reclassify_socket(family, new_sock);
+
+	rc = xs_bind(transport, new_sock);
+	if (rc) {
+		sock_release(new_sock);
+		return ERR_PTR(rc);
+	}
+
+	return new_sock;
+}
+
+/*
+ * Attach @sock to the transport (only if the transport has no struct
+ * sock attached yet) and start connecting it to the transport's peer
+ * address.
+ *
+ * Attaching saves the socket's original callbacks, installs the RPC
+ * ones and records the socket in the transport, all under the write
+ * side of sk_callback_lock.
+ *
+ * Returns the result of kernel_connect().
+ */
+static int xs_local_finish_connecting(struct rpc_xprt *xprt,
+				      struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+									xprt);
+
+	if (!transport->inet) {
+		struct sock *sk = sock->sk;
+
+		write_lock_bh(&sk->sk_callback_lock);
+
+		xs_save_old_callbacks(transport, sk);
+
+		/* The RPC callbacks find the transport via sk_user_data */
+		sk->sk_user_data = xprt;
+		sk->sk_data_ready = xs_local_data_ready;
+		sk->sk_write_space = xs_udp_write_space;
+		sk->sk_error_report = xs_error_report;
+		sk->sk_allocation = GFP_ATOMIC;
+
+		xprt_clear_connected(xprt);
+
+		/* Reset to new socket */
+		transport->sock = sock;
+		transport->inet = sk;
+
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+
+	/* Tell the socket layer to start connecting... */
+	xprt->stat.connect_count++;
+	xprt->stat.connect_start = jiffies;
+	return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0);
+}
+
+/**
+ * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
+ * @work: connect_worker work item embedded in the sock_xprt to connect
+ *
+ * Runs as the transport's delayed connect work.  Creates a new AF_LOCAL
+ * stream socket, hands it to xs_local_finish_connecting() and marks the
+ * transport connected if that returns 0.  In every case the "connecting"
+ * state is cleared and pending tasks are woken with the final status
+ * (-EIO if the transport is shutting down).
+ */
+static void xs_local_setup_socket(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *sock;
+	int status = -EIO;
+
+	if (xprt->shutdown)
+		goto out;
+
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	status = __sock_create(xprt->xprt_net, AF_LOCAL,
+					SOCK_STREAM, 0, &sock, 1);
+	if (status < 0) {
+		dprintk("RPC:       can't create AF_LOCAL "
+			"transport socket (%d).\n", -status);
+		goto out;
+	}
+	xs_reclassify_socketu(sock);
+
+	dprintk("RPC:       worker connecting xprt %p via AF_LOCAL to %s\n",
+			xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	status = xs_local_finish_connecting(xprt, sock);
+	switch (status) {
+	case 0:
+		dprintk("RPC:       xprt %p connected to %s\n",
+				xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+		xprt_set_connected(xprt);
+		break;
+	case -ENOENT:
+		dprintk("RPC:       xprt %p: socket %s does not exist\n",
+				xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
+		break;
+	default:
+		/* Any other error is only logged; status is passed on as is */
+		printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n",
+				__func__, -status,
+				xprt->address_strings[RPC_DISPLAY_ADDR]);
+	}
+
+out:
+	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
+}
+
+/*
+ * Attach @sock to the UDP transport behind @xprt.
+ *
+ * When the transport has no struct sock attached yet, the RPC callbacks
+ * are installed on sock->sk, the transport is marked connected and the
+ * socket is recorded in the transport, all under sk_callback_lock.
+ * xs_udp_do_set_buffer_size() is called in every case.
+ */
+static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	if (!transport->inet) {
+		struct sock *sk = sock->sk;
+
+		write_lock_bh(&sk->sk_callback_lock);
+
+		xs_save_old_callbacks(transport, sk);
+
+		/* Route socket events to this transport's handlers */
+		sk->sk_user_data = xprt;
+		sk->sk_data_ready = xs_udp_data_ready;
+		sk->sk_write_space = xs_udp_write_space;
+		sk->sk_error_report = xs_error_report;
+		sk->sk_no_check = UDP_CSUM_NORCV;
+		sk->sk_allocation = GFP_ATOMIC;
+
+		/* No connect step follows here: mark connected right away */
+		xprt_set_connected(xprt);
+
+		/* Reset to new socket */
+		transport->sock = sock;
+		transport->inet = sk;
+
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+	xs_udp_do_set_buffer_size(xprt);
+}
+
+/*
+ * Delayed-work handler that (re)creates the UDP socket of a transport.
+ *
+ * Unless the transport is shutting down, any existing transport state is
+ * reset, a new datagram socket is created and attached.  Pending tasks are
+ * woken with 0 on success and with -EIO otherwise.
+ */
+static void xs_udp_setup_socket(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct rpc_xprt *xprt = &transport->xprt;
+	struct socket *new_sock;
+	int status = -EIO;
+
+	if (!xprt->shutdown) {
+		/* Start by resetting any existing state */
+		xs_reset_transport(transport);
+
+		new_sock = xs_create_sock(xprt, transport,
+					  xs_addr(xprt)->sa_family,
+					  SOCK_DGRAM, IPPROTO_UDP);
+		if (!IS_ERR(new_sock)) {
+			dprintk("RPC:       worker connecting xprt %p via %s to "
+						"%s (port %s)\n", xprt,
+					xprt->address_strings[RPC_DISPLAY_PROTO],
+					xprt->address_strings[RPC_DISPLAY_ADDR],
+					xprt->address_strings[RPC_DISPLAY_PORT]);
+
+			xs_udp_finish_connecting(xprt, new_sock);
+			status = 0;
+		}
+	}
+
+	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
+}
+
+/*
+ * Break the transport's connection without releasing the socket, so that
+ * the port number is preserved: the reply cache on the server needs it to
+ * find our cached RPC replies when we get around to reconnecting.
+ */
+static void xs_abort_connection(struct sock_xprt *transport)
+{
+	struct sockaddr unspec;
+	int rc;
+
+	dprintk("RPC:       disconnecting xprt %p to reuse port\n", transport);
+
+	/*
+	 * A connect operation to an all-zero AF_UNSPEC address
+	 * disconnects the transport socket.  This should return
+	 * immediately...
+	 */
+	memset(&unspec, 0, sizeof(unspec));
+	unspec.sa_family = AF_UNSPEC;
+
+	rc = kernel_connect(transport->sock, &unspec, sizeof(unspec), 0);
+	if (rc != 0) {
+		dprintk("RPC:       AF_UNSPEC connect return code %d\n",
+				rc);
+		return;
+	}
+	xs_sock_mark_closed(&transport->xprt);
+}
+
+/*
+ * Decide whether the existing TCP socket can be reused as it is or has to
+ * be disconnected first via xs_abort_connection().
+ *
+ * The socket is left alone when sk_shutdown is 0 and it is either
+ * closed and unconnected (TCP_CLOSE / SS_UNCONNECTED) or in the
+ * ESTABLISHED or SYN_SENT state.  In every other case the connection is
+ * aborted.
+ */
+static void xs_tcp_reuse_connection(struct sock_xprt *transport)
+{
+	/* Snapshot of the TCP state; sk_shutdown is re-read below */
+	unsigned int state = transport->inet->sk_state;
+
+	if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) {
+		/* we don't need to abort the connection if the socket
+		 * hasn't undergone a shutdown
+		 */
+		if (transport->inet->sk_shutdown == 0)
+			return;
+		dprintk("RPC:       %s: TCP_CLOSEd and sk_shutdown set to %d\n",
+				__func__, transport->inet->sk_shutdown);
+	}
+	if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) {
+		/* we don't need to abort the connection if the socket
+		 * hasn't undergone a shutdown
+		 */
+		if (transport->inet->sk_shutdown == 0)
+			return;
+		dprintk("RPC:       %s: ESTABLISHED/SYN_SENT "
+				"sk_shutdown set to %d\n",
+				__func__, transport->inet->sk_shutdown);
+	}
+	xs_abort_connection(transport);
+}
+
+/*
+ * Attach @sock to the TCP transport behind @xprt and start connecting.
+ *
+ * When the transport has no struct sock attached yet, the RPC callbacks
+ * and socket options are installed on sock->sk and the socket is recorded
+ * in the transport, all under sk_callback_lock.  A non-blocking
+ * kernel_connect() is then issued.
+ *
+ * Returns -ENOTCONN without connecting if the transport is not bound,
+ * otherwise the result of kernel_connect().  For 0 and -EINPROGRESS the
+ * connect cookie is bumped and the re-establish timeout is raised to at
+ * least XS_TCP_INIT_REEST_TO.
+ */
+static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	int ret = -ENOTCONN;
+
+	if (!transport->inet) {
+		struct sock *sk = sock->sk;
+
+		write_lock_bh(&sk->sk_callback_lock);
+
+		xs_save_old_callbacks(transport, sk);
+
+		/* Route socket events to this transport's handlers */
+		sk->sk_user_data = xprt;
+		sk->sk_data_ready = xs_tcp_data_ready;
+		sk->sk_state_change = xs_tcp_state_change;
+		sk->sk_write_space = xs_tcp_write_space;
+		sk->sk_error_report = xs_error_report;
+		sk->sk_allocation = GFP_ATOMIC;
+
+		/* socket options */
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+		sock_reset_flag(sk, SOCK_LINGER);
+		tcp_sk(sk)->linger2 = 0;
+		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
+
+		xprt_clear_connected(xprt);
+
+		/* Reset to new socket */
+		transport->sock = sock;
+		transport->inet = sk;
+
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+
+	/* Nothing to connect to yet: report the initial -ENOTCONN */
+	if (!xprt_bound(xprt))
+		goto out;
+
+	/* Tell the socket layer to start connecting... */
+	xprt->stat.connect_count++;
+	xprt->stat.connect_start = jiffies;
+	ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);
+	switch (ret) {
+	case 0:
+	case -EINPROGRESS:
+		/* SYN_SENT! */
+		xprt->connect_cookie++;
+		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	}
+out:
+	return ret;
+}
+
+/**
+ * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
+ * @work: connect_worker work item embedded in the sock_xprt to connect
+ *
+ * Runs as the transport's delayed connect work.  A socket is created if
+ * the transport has none; otherwise the existing one is prepared for reuse
+ * by xs_tcp_reuse_connection().  The connect is then started through
+ * xs_tcp_finish_connecting().
+ *
+ * For connect results that are in progress or will be retried on the same
+ * socket, only the "connecting" state is cleared and pending tasks are not
+ * woken here.  In all other cases pending tasks are woken with an error
+ * status.
+ */
+static void xs_tcp_setup_socket(struct work_struct *work)
+{
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
+	struct socket *sock = transport->sock;
+	struct rpc_xprt *xprt = &transport->xprt;
+	int status = -EIO;
+
+	if (xprt->shutdown)
+		goto out;
+
+	if (!sock) {
+		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+		sock = xs_create_sock(xprt, transport,
+				xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP);
+		if (IS_ERR(sock)) {
+			status = PTR_ERR(sock);
+			goto out;
+		}
+	} else {
+		int abort_and_exit;
+
+		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
+				&xprt->state);
+		/* "close" the socket, preserving the local port */
+		xs_tcp_reuse_connection(transport);
+
+		/* An abort was requested: do not reconnect, report -EAGAIN */
+		if (abort_and_exit)
+			goto out_eagain;
+	}
+
+	dprintk("RPC:       worker connecting xprt %p via %s to "
+				"%s (port %s)\n", xprt,
+			xprt->address_strings[RPC_DISPLAY_PROTO],
+			xprt->address_strings[RPC_DISPLAY_ADDR],
+			xprt->address_strings[RPC_DISPLAY_PORT]);
+
+	status = xs_tcp_finish_connecting(xprt, sock);
+	dprintk("RPC:       %p connect status %d connected %d sock state %d\n",
+			xprt, -status, xprt_connected(xprt),
+			sock->sk->sk_state);
+	switch (status) {
+	default:
+		printk("%s: connect returned unhandled error %d\n",
+			__func__, status);
+		/* fall through: unknown errors also force a disconnect */
+	case -EADDRNOTAVAIL:
+		/* We're probably in TIME_WAIT. Get rid of existing socket,
+		 * and retry
+		 */
+		set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
+		xprt_force_disconnect(xprt);
+		break;
+	case -ECONNREFUSED:
+	case -ECONNRESET:
+	case -ENETUNREACH:
+		/* retry with existing socket, after a delay */
+		/* fall through */
+	case 0:
+	case -EINPROGRESS:
+	case -EALREADY:
+		/* Pending tasks are deliberately not woken on this path */
+		xprt_clear_connecting(xprt);
+		return;
+	case -EINVAL:
+		/* Happens, for instance, if the user specified a link
+		 * local IPv6 address without a scope-id.
+		 */
+		goto out;
+	}
+	/* Reached via "break" above: pending tasks are woken with -EAGAIN */
+out_eagain:
+	status = -EAGAIN;
+out:
+	xprt_clear_connecting(xprt);
+	xprt_wake_pending_tasks(xprt, status);
+}
+
+/**
+ * xs_connect - connect a socket to a remote endpoint
+ * @task: address of RPC task that manages state of connect request
+ *
+ * The connect itself always runs from the transport's delayed work on
+ * rpciod_workqueue.
+ *
+ * TCP: If the remote end dropped the connection, delay reconnecting.
+ *
+ * UDP socket connects are synchronous, but we use a work queue anyway
+ * to guarantee that even unprivileged user processes can set up a
+ * socket on a privileged port.
+ *
+ * If a UDP socket connect fails, the delay behavior here prevents
+ * retry floods (hard mounts).
+ */
+static void xs_connect(struct rpc_task *task)
+{
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+	/* No socket yet, or a soft-connect task: schedule without delay */
+	if (transport->sock == NULL || RPC_IS_SOFTCONN(task)) {
+		dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
+		queue_delayed_work(rpciod_workqueue,
+				   &transport->connect_worker, 0);
+		return;
+	}
+
+	dprintk("RPC:       xs_connect delayed xprt %p for %lu "
+			"seconds\n",
+			xprt, xprt->reestablish_timeout / HZ);
+	queue_delayed_work(rpciod_workqueue, &transport->connect_worker,
+			   xprt->reestablish_timeout);
+
+	/* Double the delay for the next attempt and keep it within bounds */
+	xprt->reestablish_timeout <<= 1;
+	if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+		xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
+		xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
+}
+
+/**
+ * xs_local_print_stats - display AF_LOCAL socket-specific stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ * The idle time is printed in seconds and is 0 while the transport is
+ * not connected.
+ */
+static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	long idle_secs;
+
+	if (!xprt_connected(xprt))
+		idle_secs = 0;
+	else
+		idle_secs = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu "
+			"%llu %llu\n",
+			xprt->stat.bind_count,
+			xprt->stat.connect_count,
+			xprt->stat.connect_time,
+			idle_secs,
+			xprt->stat.sends,
+			xprt->stat.recvs,
+			xprt->stat.bad_xids,
+			xprt->stat.req_u,
+			xprt->stat.bklog_u);
+}
+
+/**
+ * xs_udp_print_stats - display UDP socket-specific stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ * Prints the transport's source port followed by its counters.
+ */
+static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct sock_xprt *sx = container_of(xprt, struct sock_xprt, xprt);
+
+	seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %Lu %Lu\n",
+		   sx->srcport,
+		   xprt->stat.bind_count,
+		   xprt->stat.sends,
+		   xprt->stat.recvs,
+		   xprt->stat.bad_xids,
+		   xprt->stat.req_u,
+		   xprt->stat.bklog_u);
+}
+
+/**
+ * xs_tcp_print_stats - display TCP socket-specific stats
+ * @xprt: rpc_xprt struct containing statistics
+ * @seq: output file
+ *
+ * The idle time is printed in seconds and is 0 while the transport is
+ * not connected.
+ */
+static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct sock_xprt *sx = container_of(xprt, struct sock_xprt, xprt);
+	long idle_secs;
+
+	if (!xprt_connected(xprt))
+		idle_secs = 0;
+	else
+		idle_secs = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu\n",
+		   sx->srcport,
+		   xprt->stat.bind_count,
+		   xprt->stat.connect_count,
+		   xprt->stat.connect_time,
+		   idle_secs,
+		   xprt->stat.sends,
+		   xprt->stat.recvs,
+		   xprt->stat.bad_xids,
+		   xprt->stat.req_u,
+		   xprt->stat.bklog_u);
+}
+
+/*
+ * Allocate a scratch buffer for the rpc code as a whole page.  The reason
+ * we allocate a page instead of doing a kmalloc like rpc_malloc is that we
+ * want to use the server side send routines.
+ *
+ * An rpc_buffer header sits at the start of the page; the returned pointer
+ * is its data area.  @size must fit in the page after that header.
+ * Returns NULL if no page could be allocated.
+ */
+static void *bc_malloc(struct rpc_task *task, size_t size)
+{
+	struct rpc_buffer *hdr;
+	struct page *pg;
+
+	BUG_ON(size > PAGE_SIZE - sizeof(struct rpc_buffer));
+
+	pg = alloc_page(GFP_KERNEL);
+	if (pg == NULL)
+		return NULL;
+
+	hdr = page_address(pg);
+	hdr->len = PAGE_SIZE;
+	return hdr->data;
+}
+
+/*
+ * Free the space allocated in the bc_malloc routine.  @buffer is the data
+ * pointer that bc_malloc() returned; a NULL pointer is ignored.
+ */
+static void bc_free(void *buffer)
+{
+	struct rpc_buffer *hdr;
+
+	if (buffer == NULL)
+		return;
+
+	/* Step back from the data area to the header at the page start */
+	hdr = container_of(buffer, struct rpc_buffer, data);
+	free_page((unsigned long)hdr);
+}
+
+/*
+ * Use the svc_sock to send the callback. The caller in this file,
+ * bc_send_request(), holds the server transport's xpt_mutex around the
+ * call. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
+ *
+ * Returns the number of bytes sent when the whole send buffer went out,
+ * or -EAGAIN when svc_send_common() returned anything else.
+ */
+static int bc_sendto(struct rpc_rqst *req)
+{
+	int len;
+	struct xdr_buf *xbufp = &req->rq_snd_buf;
+	struct rpc_xprt *xprt = req->rq_xprt;
+	struct sock_xprt *transport =
+				container_of(xprt, struct sock_xprt, xprt);
+	struct socket *sock = transport->sock;
+	unsigned long headoff;
+	unsigned long tailoff;
+
+	xs_encode_stream_record_marker(xbufp);
+
+	/* Offsets of the tail and head buffers within their pages */
+	tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
+	headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
+	/*
+	 * NOTE(review): the head is passed as a struct page (virt_to_page)
+	 * while the tail is passed as the raw iov_base address - confirm
+	 * this matches what svc_send_common() expects for its tail argument.
+	 */
+	len = svc_send_common(sock, xbufp,
+			      virt_to_page(xbufp->head[0].iov_base), headoff,
+			      xbufp->tail[0].iov_base, tailoff);
+
+	/* A short send and a send error are both reported as -EAGAIN */
+	if (len != xbufp->len) {
+		printk(KERN_NOTICE "Error sending entire callback!\n");
+		len = -EAGAIN;
+	}
+
+	return len;
+}
+
+/*
+ * The send routine. Borrows from svc_send.
+ *
+ * Sends the callback request of @task over the server transport backing
+ * this backchannel (req->rq_xprt->bc_xprt), serialised against the fore
+ * channel by that transport's xpt_mutex.
+ *
+ * Returns 0 if the request was sent, -ENOTCONN if the server transport is
+ * marked XPT_DEAD, -EAGAIN if the mutex could not be taken, or the
+ * negative error returned by bc_sendto().
+ */
+static int bc_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct svc_xprt	*xprt;
+	/*
+	 * Must be signed: it carries negative errno values.  Held in a u32,
+	 * every error compared greater than zero below, was reset to 0 and
+	 * reached the caller as success.
+	 */
+	int			len;
+
+	dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
+	/*
+	 * Get the server transport associated with this callback xprt
+	 */
+	xprt = req->rq_xprt->bc_xprt;
+
+	/*
+	 * Grab the mutex to serialize data as the connection is shared
+	 * with the fore channel
+	 */
+	if (!mutex_trylock(&xprt->xpt_mutex)) {
+		/* Queue the task first, then retry once before giving up */
+		rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL);
+		if (!mutex_trylock(&xprt->xpt_mutex))
+			return -EAGAIN;
+		rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task);
+	}
+	if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+		len = -ENOTCONN;
+	else
+		len = bc_sendto(req);
+	mutex_unlock(&xprt->xpt_mutex);
+
+	/* A positive byte count means success; errors pass through */
+	if (len > 0)
+		len = 0;
+
+	return len;
+}
+
+/*
+ * The close routine. Since this is client initiated, we do nothing.
+ * Intentionally empty; installed as the .close method of bc_tcp_ops.
+ */
+
+static void bc_close(struct rpc_xprt *xprt)
+{
+}
+
+/*
+ * The xprt destroy routine. Again, because this connection is client
+ * initiated, we do nothing.  Intentionally empty; installed as the
+ * .destroy method of bc_tcp_ops.
+ */
+
+static void bc_destroy(struct rpc_xprt *xprt)
+{
+}
+
+/*
+ * Transport methods for AF_LOCAL sockets; installed by xs_setup_local().
+ * Shares xs_tcp_release_xprt and the default retransmit timeout handling
+ * with the TCP method table.
+ */
+static struct rpc_xprt_ops xs_local_ops = {
+	.reserve_xprt		= xprt_reserve_xprt,
+	.release_xprt		= xs_tcp_release_xprt,
+	.rpcbind		= xs_local_rpcbind,
+	.set_port		= xs_local_set_port,
+	.connect		= xs_connect,
+	.buf_alloc		= rpc_malloc,
+	.buf_free		= rpc_free,
+	.send_request		= xs_local_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+	.close			= xs_close,
+	.destroy		= xs_destroy,
+	.print_stats		= xs_local_print_stats,
+};
+
+/*
+ * Transport methods for UDP sockets; installed by xs_setup_udp().
+ * Unlike the other tables here it uses the *_cong reserve/release
+ * variants and provides .set_buffer_size, .timer and .release_request.
+ */
+static struct rpc_xprt_ops xs_udp_ops = {
+	.set_buffer_size	= xs_udp_set_buffer_size,
+	.reserve_xprt		= xprt_reserve_xprt_cong,
+	.release_xprt		= xprt_release_xprt_cong,
+	.rpcbind		= rpcb_getport_async,
+	.set_port		= xs_set_port,
+	.connect		= xs_connect,
+	.buf_alloc		= rpc_malloc,
+	.buf_free		= rpc_free,
+	.send_request		= xs_udp_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_rtt,
+	.timer			= xs_udp_timer,
+	.release_request	= xprt_release_rqst_cong,
+	.close			= xs_close,
+	.destroy		= xs_destroy,
+	.print_stats		= xs_udp_print_stats,
+};
+
+/* Transport methods for TCP sockets; installed by xs_setup_tcp(). */
+static struct rpc_xprt_ops xs_tcp_ops = {
+	.reserve_xprt		= xprt_reserve_xprt,
+	.release_xprt		= xs_tcp_release_xprt,
+	.rpcbind		= rpcb_getport_async,
+	.set_port		= xs_set_port,
+	.connect		= xs_connect,
+	.buf_alloc		= rpc_malloc,
+	.buf_free		= rpc_free,
+	.send_request		= xs_tcp_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+	.close			= xs_tcp_close,
+	.destroy		= xs_destroy,
+	.print_stats		= xs_tcp_print_stats,
+};
+
+/*
+ * The rpc_xprt_ops for the server backchannel; installed by
+ * xs_setup_bc_tcp().  No .rpcbind, .set_port or .connect method is
+ * provided, buffers come from bc_malloc()/bc_free(), and close/destroy
+ * are the empty bc_close()/bc_destroy().
+ */
+
+static struct rpc_xprt_ops bc_tcp_ops = {
+	.reserve_xprt		= xprt_reserve_xprt,
+	.release_xprt		= xprt_release_xprt,
+	.buf_alloc		= bc_malloc,
+	.buf_free		= bc_free,
+	.send_request		= bc_send_request,
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
+	.close			= bc_close,
+	.destroy		= bc_destroy,
+	.print_stats		= xs_tcp_print_stats,
+};
+
+/*
+ * Fill @sap with the wildcard ("any") address of @family.
+ *
+ * AF_INET and AF_INET6 get a sockaddr with the family and the any-address
+ * set and every other field zero; AF_LOCAL leaves @sap untouched.
+ *
+ * Returns 0 on success or -EAFNOSUPPORT for any other family.
+ */
+static int xs_init_anyaddr(const int family, struct sockaddr *sap)
+{
+	static const struct sockaddr_in any4 = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_ANY),
+	};
+	static const struct sockaddr_in6 any6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+	};
+
+	if (family == AF_LOCAL)
+		return 0;
+
+	if (family == AF_INET) {
+		memcpy(sap, &any4, sizeof(any4));
+		return 0;
+	}
+
+	if (family == AF_INET6) {
+		memcpy(sap, &any6, sizeof(any6));
+		return 0;
+	}
+
+	dprintk("RPC:       %s: Bad address family\n", __func__);
+	return -EAFNOSUPPORT;
+}
+
+/*
+ * Allocate a socket transport and record its addresses.
+ *
+ * The rpc_xprt is allocated as part of a struct sock_xprt.  The
+ * destination address from @args is copied into it; the source address
+ * is either copied from @args or, if none was given, set to the wildcard
+ * address of the destination's family.
+ *
+ * Returns the new transport, or ERR_PTR(-EBADF) if the address is too
+ * large, ERR_PTR(-ENOMEM) if allocation fails, or the error from
+ * xs_init_anyaddr().  Nothing stays allocated on failure.
+ */
+static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
+				      unsigned int slot_table_size)
+{
+	struct rpc_xprt *xprt;
+	struct sock_xprt *new;
+
+	if (args->addrlen > sizeof(xprt->addr)) {
+		dprintk("RPC:       xs_setup_xprt: address too large\n");
+		return ERR_PTR(-EBADF);
+	}
+
+	xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size);
+	if (xprt == NULL) {
+		dprintk("RPC:       xs_setup_xprt: couldn't allocate "
+				"rpc_xprt\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	new = container_of(xprt, struct sock_xprt, xprt);
+	memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+	xprt->addrlen = args->addrlen;
+	if (args->srcaddr)
+		memcpy(&new->srcaddr, args->srcaddr, args->addrlen);
+	else {
+		int err;
+		err = xs_init_anyaddr(args->dstaddr->sa_family,
+					(struct sockaddr *)&new->srcaddr);
+		if (err != 0) {
+			/* Don't leak the transport allocated above */
+			xprt_free(xprt);
+			return ERR_PTR(err);
+		}
+	}
+
+	return xprt;
+}
+
+/*
+ * Default RPC timeout for AF_LOCAL transports (installed by
+ * xs_setup_local()): initial and maximum value of 10 * HZ, 2 retries.
+ */
+static const struct rpc_timeout xs_local_default_timeout = {
+	.to_initval = 10 * HZ,
+	.to_maxval = 10 * HZ,
+	.to_retries = 2,
+};
+
+/**
+ * xs_setup_local - Set up transport to use an AF_LOCAL socket
+ * @args: rpc transport creation arguments
+ *
+ * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP
+ *
+ * Returns the new transport, or an ERR_PTR() value: the error from
+ * xs_setup_xprt(), -EAFNOSUPPORT if the destination is not an AF_LOCAL
+ * address, or -EINVAL if its path does not start with '/' or no module
+ * reference could be taken.
+ */
+static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
+{
+	struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr;
+	struct sock_xprt *transport;
+	struct rpc_xprt *xprt;
+	struct rpc_xprt *ret;
+
+	xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries);
+	if (IS_ERR(xprt))
+		return xprt;
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	xprt->ops = &xs_local_ops;
+	xprt->timeout = &xs_local_default_timeout;
+
+	xprt->prot = 0;
+	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+
+	xprt->bind_timeout = XS_BIND_TO;
+	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+	if (sun->sun_family != AF_LOCAL) {
+		ret = ERR_PTR(-EAFNOSUPPORT);
+		goto out_err;
+	}
+
+	/* Only paths that start with '/' are accepted */
+	if (sun->sun_path[0] != '/') {
+		dprintk("RPC:       bad AF_LOCAL address: %s\n",
+				sun->sun_path);
+		ret = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+
+	xprt_set_bound(xprt);
+	INIT_DELAYED_WORK(&transport->connect_worker,
+				xs_local_setup_socket);
+	xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL);
+
+	dprintk("RPC:       set up xprt to %s via AF_LOCAL\n",
+			xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	if (!try_module_get(THIS_MODULE)) {
+		ret = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+	return xprt;
+
+out_err:
+	xprt_free(xprt);
+	return ret;
+}
+
+/*
+ * Default RPC timeout for UDP transports (installed by xs_setup_udp()):
+ * initial value 5 * HZ, increment 5 * HZ, maximum 30 * HZ, 5 retries.
+ */
+static const struct rpc_timeout xs_udp_default_timeout = {
+	.to_initval = 5 * HZ,
+	.to_maxval = 30 * HZ,
+	.to_increment = 5 * HZ,
+	.to_retries = 5,
+};
+
+/**
+ * xs_setup_udp - Set up transport to use a UDP socket
+ * @args: rpc transport creation arguments
+ *
+ * Returns the new transport, or an ERR_PTR() value: the error from
+ * xs_setup_xprt(), -EAFNOSUPPORT if the destination is neither AF_INET
+ * nor AF_INET6, or -EINVAL if no module reference could be taken.
+ */
+static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
+{
+	struct sockaddr *addr = args->dstaddr;
+	struct sock_xprt *transport;
+	struct rpc_xprt *xprt;
+	struct rpc_xprt *ret;
+
+	xprt = xs_setup_xprt(args, xprt_udp_slot_table_entries);
+	if (IS_ERR(xprt))
+		return xprt;
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	xprt->ops = &xs_udp_ops;
+	xprt->timeout = &xs_udp_default_timeout;
+
+	xprt->prot = IPPROTO_UDP;
+	xprt->tsh_size = 0;
+	/* XXX: header size can vary due to auth type, IPv6, etc. */
+	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
+
+	xprt->bind_timeout = XS_BIND_TO;
+	xprt->reestablish_timeout = XS_UDP_REEST_TO;
+	xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+	/* A non-zero destination port means the transport is already bound */
+	if (addr->sa_family == AF_INET) {
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_udp_setup_socket);
+		xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
+	} else if (addr->sa_family == AF_INET6) {
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_udp_setup_socket);
+		xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
+	} else {
+		ret = ERR_PTR(-EAFNOSUPPORT);
+		goto out_err;
+	}
+
+	if (!xprt_bound(xprt))
+		dprintk("RPC:       set up xprt to %s (autobind) via %s\n",
+				xprt->address_strings[RPC_DISPLAY_ADDR],
+				xprt->address_strings[RPC_DISPLAY_PROTO]);
+	else
+		dprintk("RPC:       set up xprt to %s (port %s) via %s\n",
+				xprt->address_strings[RPC_DISPLAY_ADDR],
+				xprt->address_strings[RPC_DISPLAY_PORT],
+				xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+	if (!try_module_get(THIS_MODULE)) {
+		ret = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+	return xprt;
+
+out_err:
+	xprt_free(xprt);
+	return ret;
+}
+
+/*
+ * Default RPC timeout for TCP transports (installed by xs_setup_tcp() and
+ * xs_setup_bc_tcp()): initial and maximum value of 60 * HZ, 2 retries.
+ */
+static const struct rpc_timeout xs_tcp_default_timeout = {
+	.to_initval = 60 * HZ,
+	.to_maxval = 60 * HZ,
+	.to_retries = 2,
+};
+
+/**
+ * xs_setup_tcp - Set up transport to use a TCP socket
+ * @args: rpc transport creation arguments
+ *
+ * Returns the new transport, or an ERR_PTR() value: the error from
+ * xs_setup_xprt(), -EAFNOSUPPORT if the destination is neither AF_INET
+ * nor AF_INET6, or -EINVAL if no module reference could be taken.
+ */
+static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
+{
+	struct sockaddr *addr = args->dstaddr;
+	struct sock_xprt *transport;
+	struct rpc_xprt *xprt;
+	struct rpc_xprt *ret;
+
+	xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries);
+	if (IS_ERR(xprt))
+		return xprt;
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	xprt->ops = &xs_tcp_ops;
+	xprt->timeout = &xs_tcp_default_timeout;
+
+	xprt->prot = IPPROTO_TCP;
+	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+
+	xprt->bind_timeout = XS_BIND_TO;
+	xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+	xprt->idle_timeout = XS_IDLE_DISC_TO;
+
+	/* A non-zero destination port means the transport is already bound */
+	if (addr->sa_family == AF_INET) {
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_tcp_setup_socket);
+		xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
+	} else if (addr->sa_family == AF_INET6) {
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			xprt_set_bound(xprt);
+
+		INIT_DELAYED_WORK(&transport->connect_worker,
+					xs_tcp_setup_socket);
+		xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
+	} else {
+		ret = ERR_PTR(-EAFNOSUPPORT);
+		goto out_err;
+	}
+
+	if (!xprt_bound(xprt))
+		dprintk("RPC:       set up xprt to %s (autobind) via %s\n",
+				xprt->address_strings[RPC_DISPLAY_ADDR],
+				xprt->address_strings[RPC_DISPLAY_PROTO]);
+	else
+		dprintk("RPC:       set up xprt to %s (port %s) via %s\n",
+				xprt->address_strings[RPC_DISPLAY_ADDR],
+				xprt->address_strings[RPC_DISPLAY_PORT],
+				xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+	if (!try_module_get(THIS_MODULE)) {
+		ret = ERR_PTR(-EINVAL);
+		goto out_err;
+	}
+	return xprt;
+
+out_err:
+	xprt_free(xprt);
+	return ret;
+}
+
+/**
+ * xs_setup_bc_tcp - Set up transport to use a TCP backchannel socket
+ * @args: rpc transport creation arguments
+ *
+ * The backchannel transport borrows the socket of the server connection
+ * args->bc_xprt and is marked bound and connected from the start.  If
+ * that connection already has a backchannel transport, the existing one
+ * is returned.
+ *
+ * Returns the transport, or an ERR_PTR() value: the error from
+ * xs_setup_xprt(), -EAFNOSUPPORT if the destination is neither AF_INET
+ * nor AF_INET6, or -EINVAL if no module reference could be taken.
+ */
+static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
+{
+	struct sockaddr *addr = args->dstaddr;
+	struct rpc_xprt *xprt;
+	struct sock_xprt *transport;
+	struct svc_sock *bc_sock;
+	struct rpc_xprt *ret;
+
+	if (args->bc_xprt->xpt_bc_xprt) {
+		/*
+		 * This server connection already has a backchannel
+		 * export; we can't create a new one, as we wouldn't be
+		 * able to match replies based on xid any more.  So,
+		 * reuse the already-existing one:
+		 */
+		return args->bc_xprt->xpt_bc_xprt;
+	}
+	xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries);
+	if (IS_ERR(xprt))
+		return xprt;
+	transport = container_of(xprt, struct sock_xprt, xprt);
+
+	xprt->prot = IPPROTO_TCP;
+	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
+	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
+	xprt->timeout = &xs_tcp_default_timeout;
+
+	/* backchannel */
+	xprt_set_bound(xprt);
+	xprt->bind_timeout = 0;
+	xprt->reestablish_timeout = 0;
+	xprt->idle_timeout = 0;
+
+	xprt->ops = &bc_tcp_ops;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		xs_format_peer_addresses(xprt, "tcp",
+					 RPCBIND_NETID_TCP);
+		break;
+	case AF_INET6:
+		xs_format_peer_addresses(xprt, "tcp",
+				   RPCBIND_NETID_TCP6);
+		break;
+	default:
+		ret = ERR_PTR(-EAFNOSUPPORT);
+		goto out_err;
+	}
+
+	dprintk("RPC:       set up xprt to %s (port %s) via %s\n",
+			xprt->address_strings[RPC_DISPLAY_ADDR],
+			xprt->address_strings[RPC_DISPLAY_PORT],
+			xprt->address_strings[RPC_DISPLAY_PROTO]);
+
+	/*
+	 * Once we've associated a backchannel xprt with a connection,
+	 * we want to keep it around as long as the connection
+	 * lasts, in case we need to start using it for a backchannel
+	 * again; this reference won't be dropped until bc_xprt is
+	 * destroyed.
+	 */
+	xprt_get(xprt);
+	args->bc_xprt->xpt_bc_xprt = xprt;
+	xprt->bc_xprt = args->bc_xprt;
+	bc_sock = container_of(args->bc_xprt, struct svc_sock, sk_xprt);
+	transport->sock = bc_sock->sk_sock;
+	transport->inet = bc_sock->sk_sk;
+
+	/*
+	 * Since we don't want connections for the backchannel, we set
+	 * the xprt status to connected
+	 */
+	xprt_set_connected(xprt);
+
+
+	if (try_module_get(THIS_MODULE))
+		return xprt;
+
+	/*
+	 * Undo the association made above before the xprt is freed, so the
+	 * server connection is not left pointing at freed memory.
+	 */
+	args->bc_xprt->xpt_bc_xprt = NULL;
+	xprt_put(xprt);
+	ret = ERR_PTR(-EINVAL);
+out_err:
+	xprt_free(xprt);
+	return ret;
+}
+
+/* AF_LOCAL transport class; transports are created by xs_setup_local(). */
+static struct xprt_class	xs_local_transport = {
+	.list		= LIST_HEAD_INIT(xs_local_transport.list),
+	.name		= "named UNIX socket",
+	.owner		= THIS_MODULE,
+	.ident		= XPRT_TRANSPORT_LOCAL,
+	.setup		= xs_setup_local,
+};
+
+/* UDP transport class; transports are created by xs_setup_udp(). */
+static struct xprt_class	xs_udp_transport = {
+	.list		= LIST_HEAD_INIT(xs_udp_transport.list),
+	.name		= "udp",
+	.owner		= THIS_MODULE,
+	.ident		= XPRT_TRANSPORT_UDP,
+	.setup		= xs_setup_udp,
+};
+
+/* TCP transport class; transports are created by xs_setup_tcp(). */
+static struct xprt_class	xs_tcp_transport = {
+	.list		= LIST_HEAD_INIT(xs_tcp_transport.list),
+	.name		= "tcp",
+	.owner		= THIS_MODULE,
+	.ident		= XPRT_TRANSPORT_TCP,
+	.setup		= xs_setup_tcp,
+};
+
+/*
+ * TCP backchannel transport class; transports are created by
+ * xs_setup_bc_tcp().
+ */
+static struct xprt_class	xs_bc_tcp_transport = {
+	.list		= LIST_HEAD_INIT(xs_bc_tcp_transport.list),
+	.name		= "tcp NFSv4.1 backchannel",
+	.owner		= THIS_MODULE,
+	.ident		= XPRT_TRANSPORT_BC_TCP,
+	.setup		= xs_setup_bc_tcp,
+};
+
+/**
+ * init_socket_xprt - set up xprtsock's sysctls, register with RPC client
+ *
+ * With RPC_DEBUG defined, registers the sunrpc sysctl table unless it is
+ * already registered.  Then registers the AF_LOCAL, UDP, TCP and TCP
+ * backchannel transport classes.  The results of the individual
+ * registrations are not checked.
+ *
+ * Returns 0 in all cases.
+ */
+int init_socket_xprt(void)
+{
+#ifdef RPC_DEBUG
+	if (!sunrpc_table_header)
+		sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+
+	xprt_register_transport(&xs_local_transport);
+	xprt_register_transport(&xs_udp_transport);
+	xprt_register_transport(&xs_tcp_transport);
+	xprt_register_transport(&xs_bc_tcp_transport);
+
+	return 0;
+}
+
+/**
+ * cleanup_socket_xprt - remove xprtsock's sysctls, unregister
+ *
+ * Counterpart of init_socket_xprt(): with RPC_DEBUG defined, unregisters
+ * the sunrpc sysctl table if it is registered and clears the header
+ * pointer, then unregisters the four transport classes.
+ */
+void cleanup_socket_xprt(void)
+{
+#ifdef RPC_DEBUG
+	if (sunrpc_table_header) {
+		unregister_sysctl_table(sunrpc_table_header);
+		sunrpc_table_header = NULL;
+	}
+#endif
+
+	xprt_unregister_transport(&xs_local_transport);
+	xprt_unregister_transport(&xs_udp_transport);
+	xprt_unregister_transport(&xs_tcp_transport);
+	xprt_unregister_transport(&xs_bc_tcp_transport);
+}
+
+/*
+ * Parse @val as an unsigned number and store it in the unsigned int that
+ * @kp->arg points to, provided it lies within [@min, @max].
+ *
+ * Returns 0 on success, or -EINVAL if @val is NULL, cannot be parsed or
+ * is out of range; the parameter is left unchanged in that case.
+ */
+static int param_set_uint_minmax(const char *val,
+		const struct kernel_param *kp,
+		unsigned int min, unsigned int max)
+{
+	unsigned long num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+	ret = strict_strtoul(val, 0, &num);
+	/*
+	 * Reject every parse failure, not just -EINVAL, so that num is
+	 * only looked at after a successful conversion.
+	 */
+	if (ret != 0 || num < min || num > max)
+		return -EINVAL;
+	*((unsigned int *)kp->arg) = num;
+	return 0;
+}
+
+/*
+ * Setter for "portnr" module parameters: accepts values in the range
+ * RPC_MIN_RESVPORT..RPC_MAX_RESVPORT.  Returns 0 or -EINVAL.
+ */
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp,
+			RPC_MIN_RESVPORT,
+			RPC_MAX_RESVPORT);
+}
+
+/*
+ * "portnr" module parameter type: range-checked on write by
+ * param_set_portnr(), read back through param_get_uint.
+ */
+static struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
+
+/* The variable behind a portnr parameter is checked as unsigned int */
+#define param_check_portnr(name, p) \
+	__param_check(name, p, unsigned int);
+
+/* Both parameters are declared with mode 0644 */
+module_param_named(min_resvport, xprt_min_resvport, portnr, 0644);
+module_param_named(max_resvport, xprt_max_resvport, portnr, 0644);
+
+/*
+ * Setter for "slot_table_size" module parameters: accepts values in the
+ * range RPC_MIN_SLOT_TABLE..RPC_MAX_SLOT_TABLE.  Returns 0 or -EINVAL.
+ */
+static int param_set_slot_table_size(const char *val,
+				     const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp,
+			RPC_MIN_SLOT_TABLE,
+			RPC_MAX_SLOT_TABLE);
+}
+
+/*
+ * "slot_table_size" module parameter type: range-checked on write by
+ * param_set_slot_table_size(), read back through param_get_uint.
+ */
+static struct kernel_param_ops param_ops_slot_table_size = {
+	.set = param_set_slot_table_size,
+	.get = param_get_uint,
+};
+
+/* The variable behind such a parameter is checked as unsigned int */
+#define param_check_slot_table_size(name, p) \
+	__param_check(name, p, unsigned int);
+
+/* Both parameters are declared with mode 0644 */
+module_param_named(tcp_slot_table_entries, xprt_tcp_slot_table_entries,
+		   slot_table_size, 0644);
+module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries,
+		   slot_table_size, 0644);
+
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
new file mode 100644
index 00000000..ca84212c
--- /dev/null
+++ b/net/sysctl_net.c
@@ -0,0 +1,129 @@
+/* -*- linux-c -*-
+ * sysctl_net.c: sysctl interface to net subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net directories for each protocol family. [MS]
+ *
+ * Revision 1.2  1996/05/08  20:24:40  shaver
+ * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and
+ * NET_IPV4_IP_FORWARD.
+ *
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/nsproxy.h>
+
+#include <net/sock.h>
+
+#ifdef CONFIG_INET
+#include <net/ip.h>
+#endif
+
+#ifdef CONFIG_NET
+#include <linux/if_ether.h>
+#endif
+
+#ifdef CONFIG_TR
+#include <linux/if_tr.h>
+#endif
+
+/*
+ * .lookup method of net_sysctl_root: return the sysctl table set that
+ * belongs to the network namespace referenced by @namespaces.
+ */
+static struct ctl_table_set *
+net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+	return &namespaces->net_ns->sysctls;
+}
+
+/*
+ * Return non-zero if @set is the sysctl table set of the current task's
+ * network namespace.
+ */
+static int is_seen(struct ctl_table_set *set)
+{
+	return &current->nsproxy->net_ns->sysctls == set;
+}
+
+/*
+ * Return the mode bits to apply to a table entry.
+ *
+ * A caller with CAP_NET_ADMIN gets the owner's permission bits replicated
+ * into the owner, group and other positions, i.e. the same access as root.
+ * Everybody else gets the table's mode unchanged.
+ */
+static int net_ctl_permissions(struct ctl_table_root *root,
+			       struct nsproxy *nsproxy,
+			       struct ctl_table *table)
+{
+	int owner_bits;
+
+	if (!capable(CAP_NET_ADMIN))
+		return table->mode;
+
+	/* Allow network administrator to have same access as root. */
+	owner_bits = (table->mode >> 6) & 7;
+	return (owner_bits << 6) | (owner_bits << 3) | owner_bits;
+}
+
+/*
+ * Sysctl root for per-network-namespace tables: table sets are looked up
+ * by namespace and permissions are widened for CAP_NET_ADMIN.
+ */
+static struct ctl_table_root net_sysctl_root = {
+	.lookup = net_ctl_header_lookup,
+	.permissions = net_ctl_permissions,
+};
+
+/*
+ * Permissions for tables under the read-only root: the table's mode as it
+ * is for the initial network namespace, and with all write bits (0222)
+ * removed for every other namespace.
+ */
+static int net_ctl_ro_header_perms(struct ctl_table_root *root,
+		struct nsproxy *namespaces, struct ctl_table *table)
+{
+	int mode = table->mode;
+
+	if (!net_eq(namespaces->net_ns, &init_net))
+		mode &= ~0222;
+
+	return mode;
+}
+
+/*
+ * Sysctl root for tables that are writable only from the initial network
+ * namespace; it has no .lookup method.
+ */
+static struct ctl_table_root net_sysctl_ro_root = {
+	.permissions = net_ctl_ro_header_perms,
+};
+
+/*
+ * Per-namespace init: set up @net's sysctl table set, passing the
+ * read-only root's default set and is_seen() to setup_sysctl_set().
+ * Always returns 0.
+ */
+static int __net_init sysctl_net_init(struct net *net)
+{
+	setup_sysctl_set(&net->sysctls,
+			 &net_sysctl_ro_root.default_set,
+			 is_seen);
+	return 0;
+}
+
+/*
+ * Per-namespace exit: warn if @net's sysctl table set still has entries
+ * on its list.  Nothing is freed here.
+ */
+static void __net_exit sysctl_net_exit(struct net *net)
+{
+	WARN_ON(!list_empty(&net->sysctls.list));
+}
+
+/* Per-network-namespace hooks, registered by sysctl_init() */
+static struct pernet_operations sysctl_pernet_ops = {
+	.init = sysctl_net_init,
+	.exit = sysctl_net_exit,
+};
+
+/*
+ * Subsystem initcall: register the per-namespace hooks, then the two
+ * sysctl roots.  The read-only root's default set is set up before that
+ * root is registered.
+ *
+ * Returns 0, or the error from register_pernet_subsys(), in which case
+ * neither root is registered.
+ */
+static __init int sysctl_init(void)
+{
+	int err = register_pernet_subsys(&sysctl_pernet_ops);
+
+	if (err)
+		return err;
+
+	register_sysctl_root(&net_sysctl_root);
+	setup_sysctl_set(&net_sysctl_ro_root.default_set, NULL, NULL);
+	register_sysctl_root(&net_sysctl_ro_root);
+	return 0;
+}
+subsys_initcall(sysctl_init);
+
+/*
+ * Register @table at @path under the per-namespace sysctl root for the
+ * network namespace @net.  A copy of the current task's nsproxy with its
+ * net_ns replaced by @net is used to select the namespace.
+ *
+ * Returns whatever __register_sysctl_paths() returns.
+ */
+struct ctl_table_header *register_net_sysctl_table(struct net *net,
+	const struct ctl_path *path, struct ctl_table *table)
+{
+	struct nsproxy ns_copy = *current->nsproxy;
+
+	ns_copy.net_ns = net;
+	return __register_sysctl_paths(&net_sysctl_root, &ns_copy,
+				       path, table);
+}
+EXPORT_SYMBOL_GPL(register_net_sysctl_table);
+
+/*
+ * Register @table at @path under the read-only sysctl root, using the
+ * initial nsproxy.  Returns whatever __register_sysctl_paths() returns.
+ */
+struct ctl_table_header *register_net_sysctl_rotable(const
+		struct ctl_path *path, struct ctl_table *table)
+{
+	return __register_sysctl_paths(&net_sysctl_ro_root,
+			&init_nsproxy, path, table);
+}
+EXPORT_SYMBOL_GPL(register_net_sysctl_rotable);
+
+/*
+ * Unregister a table previously registered through one of the
+ * register_net_sysctl_*() helpers; a thin wrapper around
+ * unregister_sysctl_table().
+ */
+void unregister_net_sysctl_table(struct ctl_table_header *header)
+{
+	unregister_sysctl_table(header);
+}
+EXPORT_SYMBOL_GPL(unregister_net_sysctl_table);
diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
new file mode 100644
index 00000000..2c5954b8
--- /dev/null
+++ b/net/tipc/Kconfig
@@ -0,0 +1,69 @@
+#
+# TIPC configuration
+#
+
+menuconfig TIPC
+	tristate "The TIPC Protocol (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	---help---
+	  The Transparent Inter Process Communication (TIPC) protocol is
+	  specially designed for intra cluster communication. This protocol
+	  originates from Ericsson where it has been used in carrier grade
+	  cluster applications for many years.
+
+	  For more information about TIPC, see http://tipc.sourceforge.net.
+
+	  This protocol support is also available as a module ( = code which
+	  can be inserted in and removed from the running kernel whenever you
+	  want). The module will be called tipc. If you want to compile it
+	  as a module, say M here and read <file:Documentation/kbuild/modules.txt>.
+
+	  If in doubt, say N.
+
+if TIPC
+
+config TIPC_ADVANCED
+	bool "Advanced TIPC configuration"
+	default n
+	help
+	  Saying Y here will open some advanced configuration for TIPC.
+	  Most users do not need to bother; if unsure, just say N.
+
+config TIPC_PORTS
+	int "Maximum number of ports in a node"
+	depends on TIPC_ADVANCED
+	range 127 65535
+	default "8191"
+	help
+	  Specifies how many ports can be supported by a node.
+	  Can range from 127 to 65535 ports; default is 8191.
+
+	  Setting this to a smaller value saves some memory,
+	  setting it to higher allows for more ports.
+
+config TIPC_LOG
+	int "Size of log buffer"
+	depends on TIPC_ADVANCED
+	range 0 32768
+	default "0"
+	help
+	  Size (in bytes) of TIPC's internal log buffer, which records the
+	  occurrence of significant events.  Can range from 0 to 32768 bytes;
+	  default is 0.
+
+	  There is no need to enable the log buffer unless the node will be
+	  managed remotely via TIPC.
+
+config TIPC_DEBUG
+	bool "Enable debugging support"
+	default n
+	help
+	  Saying Y here enables TIPC debugging capabilities used by developers.
+	  Most users do not need to bother; if unsure, just say N.
+
+	  Enabling debugging support causes TIPC to display data about its
+	  internal state when certain abnormal conditions occur. It also
+	  makes it easy for developers to capture additional information of
+	  interest using the dbg() or msg_dbg() macros.
+
+endif # TIPC
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
new file mode 100644
index 00000000..521d24d0
--- /dev/null
+++ b/net/tipc/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the Linux TIPC layer
+#
+
+obj-$(CONFIG_TIPC) := tipc.o
+
+tipc-y	+= addr.o bcast.o bearer.o config.o \
+	   core.o handler.o link.o discover.o msg.o  \
+	   name_distr.o  subscr.o name_table.o net.o  \
+	   netlink.o node.o node_subscr.o port.o ref.o  \
+	   socket.o log.o eth_media.o
+
+# End of file
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
new file mode 100644
index 00000000..a6fdab33
--- /dev/null
+++ b/net/tipc/addr.c
@@ -0,0 +1,106 @@
+/*
+ * net/tipc/addr.c: TIPC address utility routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "addr.h"
+
+/**
+ * tipc_addr_domain_valid - validates a network domain address
+ * @addr: domain address to check
+ *
+ * A domain is well formed when no field is set below a zero field, i.e.
+ * only <Z.C.N>, <Z.C.0>, <Z.0.0> and <0.0.0> (Z, C, N non-zero) pass.
+ *
+ * Returns 1 if domain address is valid, otherwise 0
+ */
+
+int tipc_addr_domain_valid(u32 addr)
+{
+	u32 zone = tipc_zone(addr);
+	u32 cluster = tipc_cluster(addr);
+	u32 node = tipc_node(addr);
+
+	/* Each non-zero field needs every higher-order field non-zero too */
+	if (node)
+		return zone && cluster;
+	return !cluster || zone;
+}
+
+/**
+ * tipc_addr_node_valid - validates a proposed network address for this node
+ * @addr: candidate node address
+ *
+ * Only a fully specified <Z.C.N> (Z, C and N all non-zero) is usable.
+ * Returns 1 if address can be used, otherwise 0
+ */
+
+int tipc_addr_node_valid(u32 addr)
+{
+	return tipc_node(addr) && tipc_addr_domain_valid(addr);
+}
+
+int tipc_in_scope(u32 domain, u32 addr)
+{
+	/* Wildcard <0.0.0> or an exact match is always in scope */
+	if (domain == 0 || addr == domain)
+		return 1;
+
+	/* Otherwise addr must fall inside domain <Z.C.0> or <Z.0.0> */
+	return tipc_cluster_mask(addr) == domain ||
+	       tipc_zone_mask(addr) == domain;
+}
+
+/**
+ * tipc_addr_scope - convert message lookup domain to a 2-bit scope value
+ * @domain: lookup domain; <0.0.0> maps to zone scope
+ *
+ * The lowest-order non-zero field of @domain selects the scope.
+ */
+
+int tipc_addr_scope(u32 domain)
+{
+	/* No node or cluster part: <0.0.0> or <Z.0.0> */
+	if (likely(!domain) || !(tipc_node(domain) || tipc_cluster(domain)))
+		return TIPC_ZONE_SCOPE;
+	return tipc_node(domain) ? TIPC_NODE_SCOPE : TIPC_CLUSTER_SCOPE;
+}
+
+char *tipc_addr_string_fill(char *string, u32 addr)
+{
+	snprintf(string, 16, "<%u.%u.%u>",	/* @string must hold >= 16 bytes */
+		 tipc_zone(addr), tipc_cluster(addr), tipc_node(addr));
+	return string;	/* hands back the caller's buffer */
+}
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
new file mode 100644
index 00000000..e4f35afe
--- /dev/null
+++ b/net/tipc/addr.h
@@ -0,0 +1,79 @@
+/*
+ * net/tipc/addr.h: Include file for TIPC address utility routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2004-2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_ADDR_H
+#define _TIPC_ADDR_H
+
+#define TIPC_ZONE_MASK		0xff000000u	/* upper 8 bits */
+#define TIPC_CLUSTER_MASK	0xfffff000u	/* upper 20 bits */
+
+static inline u32 tipc_zone_mask(u32 addr)
+{
+	return addr & TIPC_ZONE_MASK;	/* keeps only the bits under TIPC_ZONE_MASK */
+}
+
+static inline u32 tipc_cluster_mask(u32 addr)
+{
+	return addr & TIPC_CLUSTER_MASK;	/* clears the low 12 bits */
+}
+
+static inline int in_own_cluster(u32 addr)
+{
+	return !((addr ^ tipc_own_addr) >> 12);	/* 1 if equal above the low 12 bits */
+}
+
+/**
+ * addr_domain - convert 2-bit scope value to equivalent message lookup domain
+ * @sc: scope value; anything but node or cluster scope is treated as zone
+ * Used when a named message's address is looked up again after a network hop.
+ */
+
+static inline u32 addr_domain(u32 sc)
+{
+	u32 domain = tipc_own_addr;
+
+	if (unlikely(sc != TIPC_NODE_SCOPE))
+		domain &= (sc == TIPC_CLUSTER_SCOPE) ?
+			  TIPC_CLUSTER_MASK : TIPC_ZONE_MASK;
+	return domain;
+}
+
+int tipc_addr_domain_valid(u32);
+int tipc_addr_node_valid(u32 addr);
+int tipc_in_scope(u32 domain, u32 addr);
+int tipc_addr_scope(u32 domain);
+char *tipc_addr_string_fill(char *string, u32 addr);
+#endif
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
new file mode 100644
index 00000000..fa68d1e9
--- /dev/null
+++ b/net/tipc/bcast.c
@@ -0,0 +1,914 @@
+/*
+ * net/tipc/bcast.c: TIPC broadcast code
+ *
+ * Copyright (c) 2004-2006, Ericsson AB
+ * Copyright (c) 2004, Intel Corporation.
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "port.h"
+#include "bcast.h"
+
+#define MAX_PKT_DEFAULT_MCAST 1500	/* bcast link max packet size (fixed) */
+
+#define BCLINK_WIN_DEFAULT 20		/* bcast link window size (default) */
+
+/**
+ * struct bcbearer_pair - a pair of bearers used by broadcast link
+ * @primary: pointer to primary bearer
+ * @secondary: pointer to secondary bearer (NULL if bearer is unpaired)
+ *
+ * Bearers must have same priority and same set of reachable destinations
+ * to be paired.
+ */
+
+struct bcbearer_pair {
+	struct tipc_bearer *primary;
+	struct tipc_bearer *secondary;
+};
+
+/**
+ * struct bcbearer - bearer used by broadcast link
+ * @bearer: (non-standard) broadcast bearer structure
+ * @media: (non-standard) broadcast media structure
+ * @bpairs: array of bearer pairs
+ * @bpairs_temp: temporary array of bearer pairs used by tipc_bcbearer_sort()
+ * @remains: temporary node map used by tipc_bcbearer_send()
+ * @remains_new: temporary node map used by tipc_bcbearer_send()
+ *
+ * Note: The fields labelled "temporary" are incorporated into the bearer
+ * to avoid consuming potentially limited stack space through the use of
+ * large local variables within multicast routines.  Concurrent access is
+ * prevented through use of the spinlock "bc_lock".
+ */
+
+struct bcbearer {
+	struct tipc_bearer bearer;
+	struct media media;
+	struct bcbearer_pair bpairs[MAX_BEARERS];
+	struct bcbearer_pair bpairs_temp[TIPC_MAX_LINK_PRI + 1];	/* indexed by priority */
+	struct tipc_node_map remains;
+	struct tipc_node_map remains_new;
+};
+
+/**
+ * struct bclink - link used for broadcast messages
+ * @link: (non-standard) broadcast link structure; file-scope 'bcl' points here
+ * @node: (non-standard) node structure representing b'cast link's peer node
+ * @retransmit_to: node that most recently requested a retransmit
+ *
+ * Handles sequence numbering, fragmentation, bundling, etc.
+ */
+
+struct bclink {
+	struct link link;
+	struct tipc_node node;		/* installed as link.owner at init */
+	struct tipc_node *retransmit_to;
+};
+
+
+static struct bcbearer *bcbearer;	/* allocated in tipc_bclink_init() */
+static struct bclink *bclink;		/* allocated in tipc_bclink_init() */
+static struct link *bcl;		/* &bclink->link while bclink exists */
+static DEFINE_SPINLOCK(bc_lock);	/* guards broadcast link state */
+
+/* broadcast-capable node map */
+struct tipc_node_map tipc_bcast_nmap;
+
+const char tipc_bclink_name[] = "broadcast-link";
+
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+			   struct tipc_node_map *nm_b,
+			   struct tipc_node_map *nm_diff);
+
+static u32 buf_seqno(struct sk_buff *buf)
+{
+	return msg_seqno(buf_msg(buf));	/* sequence number of the message in @buf */
+}
+
+static u32 bcbuf_acks(struct sk_buff *buf)
+{
+	return (u32)(unsigned long)TIPC_SKB_CB(buf)->handle;	/* counter kept in the pointer field */
+}
+
+static void bcbuf_set_acks(struct sk_buff *buf, u32 acks)
+{
+	TIPC_SKB_CB(buf)->handle = (void *)(unsigned long)acks;	/* integer stored as a pointer value */
+}
+
+static void bcbuf_decr_acks(struct sk_buff *buf)
+{
+	bcbuf_set_acks(buf, bcbuf_acks(buf) - 1);	/* no check for an already-zero count */
+}
+
+
+static void bclink_set_last_sent(void)
+{
+	/* Last sent = one before the first unsent buffer, else next seqno - 1 */
+	u32 next = bcl->next_out ? buf_seqno(bcl->next_out) : bcl->next_out_no;
+
+	bcl->fsm_msg_cnt = mod(next - 1);
+}
+
+u32 tipc_bclink_get_last_sent(void)
+{
+	return bcl->fsm_msg_cnt;	/* field written by bclink_set_last_sent() */
+}
+
+/**
+ * bclink_set_gap - recompute gap bounds from last_in and the deferred queue
+ * @n_ptr: node whose broadcast gap is recomputed
+ * Called with 'node' locked, bc_lock unlocked
+ */
+
+static void bclink_set_gap(struct tipc_node *n_ptr)
+{
+	struct sk_buff *first_deferred = n_ptr->bclink.deferred_head;
+	u32 last_in = mod(n_ptr->bclink.last_in);
+
+	n_ptr->bclink.gap_after = last_in;
+	n_ptr->bclink.gap_to = unlikely(first_deferred != NULL) ?
+		mod(buf_seqno(first_deferred) - 1) : last_in;
+}
+
+/**
+ * bclink_ack_allowed - test if ACK or NACK message can be sent at this moment
+ * @n: counter value tested against this node's tag
+ * This mechanism endeavours to prevent all nodes in network from trying
+ * to ACK or NACK at the same time.
+ *
+ * Note: TIPC uses a different trigger to distribute ACKs than it does to
+ *       distribute NACKs, but tries to use the same spacing (divide by 16).
+ */
+
+static int bclink_ack_allowed(u32 n)
+{
+	return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag;	/* non-zero when allowed */
+}
+
+
+/**
+ * tipc_bclink_retransmit_to - get most recent node to request retransmission
+ *
+ * Called with bc_lock locked
+ */
+
+struct tipc_node *tipc_bclink_retransmit_to(void)
+{
+	return bclink->retransmit_to;	/* recorded by tipc_bclink_recv_pkt() */
+}
+
+/**
+ * bclink_retransmit_pkt - retransmit broadcast packets
+ * @after: sequence number of last packet to *not* retransmit
+ * @to: sequence number of last packet to retransmit
+ *
+ * Called with bc_lock locked
+ */
+
+static void bclink_retransmit_pkt(u32 after, u32 to)
+{
+	struct sk_buff *first = bcl->first_out;
+
+	/* Step past every queued buffer numbered @after or lower */
+	while (first && less_eq(buf_seqno(first), after))
+		first = first->next;
+	tipc_link_retransmit(bcl, first, mod(to - after));
+}
+
+/**
+ * tipc_bclink_acknowledge - handle acknowledgement of broadcast packets
+ * @n_ptr: node that sent acknowledgement info
+ * @acked: broadcast sequence # that has been acknowledged
+ *
+ * Node is locked, bc_lock unlocked.
+ */
+
+void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
+{
+	struct sk_buff *crs;
+	struct sk_buff *next;
+	unsigned int released = 0;	/* becomes 1 once a buffer is discarded */
+
+	if (less_eq(acked, n_ptr->bclink.acked))	/* no new ack info */
+		return;
+
+	spin_lock_bh(&bc_lock);
+
+	/* Skip over packets that node has previously acknowledged */
+
+	crs = bcl->first_out;
+	while (crs && less_eq(buf_seqno(crs), n_ptr->bclink.acked))
+		crs = crs->next;
+
+	/* Update packets that node is now acknowledging */
+
+	while (crs && less_eq(buf_seqno(crs), acked)) {
+		next = crs->next;	/* saved: crs may be discarded below */
+		bcbuf_decr_acks(crs);
+		if (bcbuf_acks(crs) == 0) {	/* last outstanding ack received */
+			bcl->first_out = next;	/* NOTE(review): assumes crs heads the queue */
+			bcl->out_queue_size--;
+			buf_discard(crs);
+			released = 1;
+		}
+		crs = next;
+	}
+	n_ptr->bclink.acked = acked;
+
+	/* Try resolving broadcast link congestion, if necessary */
+
+	if (unlikely(bcl->next_out)) {
+		tipc_link_push_queue(bcl);
+		bclink_set_last_sent();
+	}
+	if (unlikely(released && !list_empty(&bcl->waiting_ports)))
+		tipc_link_wakeup_ports(bcl, 0);
+	spin_unlock_bh(&bc_lock);
+}
+
+/**
+ * bclink_send_ack - unicast an ACK msg
+ *
+ * tipc_net_lock and node lock set
+ */
+
+static void bclink_send_ack(struct tipc_node *n_ptr)
+{
+	struct link *l_ptr = n_ptr->active_links[n_ptr->addr & 1];	/* slot picked by low address bit */
+
+	if (l_ptr != NULL)	/* silently skipped when that slot is empty */
+		tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
+}
+
+/**
+ * bclink_send_nack - broadcast a NACK msg
+ *
+ * tipc_net_lock and node lock set
+ */
+
+static void bclink_send_nack(struct tipc_node *n_ptr)
+{
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+
+	if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to))	/* empty gap */
+		return;
+
+	buf = tipc_buf_acquire(INT_H_SIZE);
+	if (buf) {	/* allocation failure: NACK is silently skipped */
+		msg = buf_msg(buf);
+		tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
+			 INT_H_SIZE, n_ptr->addr);
+		msg_set_non_seq(msg, 1);
+		msg_set_mc_netid(msg, tipc_net_id);
+		msg_set_bcast_ack(msg, mod(n_ptr->bclink.last_in));
+		msg_set_bcgap_after(msg, n_ptr->bclink.gap_after);
+		msg_set_bcgap_to(msg, n_ptr->bclink.gap_to);
+		msg_set_bcast_tag(msg, tipc_own_tag);
+
+		if (tipc_bearer_send(&bcbearer->bearer, buf, NULL)) {	/* non-zero: counted as sent */
+			bcl->stats.sent_nacks++;
+			buf_discard(buf);
+		} else {	/* zero: counted as bearer congestion */
+			tipc_bearer_schedule(bcl->b_ptr, bcl);
+			bcl->proto_msg_queue = buf;	/* buffer kept for a later attempt */
+			bcl->stats.bearer_congs++;
+		}
+
+		/*
+		 * Ensure we don't send another NACK msg to the node
+		 * until 16 more deferred messages arrive from it
+		 * (i.e. helps prevent all nodes from NACK'ing at same time)
+		 */
+
+		n_ptr->bclink.nack_sync = tipc_own_tag;
+	}
+}
+
+/**
+ * tipc_bclink_check_gap - send a NACK if a sequence gap exists
+ * @n_ptr: node to check; @last_sent: seqno compared against the node's last_in
+ * tipc_net_lock and node lock set
+ */
+
+void tipc_bclink_check_gap(struct tipc_node *n_ptr, u32 last_sent)
+{
+	if (!n_ptr->bclink.supported ||
+	    less_eq(last_sent, mod(n_ptr->bclink.last_in)))	/* nothing missing */
+		return;
+
+	bclink_set_gap(n_ptr);
+	if (n_ptr->bclink.gap_after == n_ptr->bclink.gap_to)	/* empty: extend to last_sent */
+		n_ptr->bclink.gap_to = last_sent;
+	bclink_send_nack(n_ptr);
+}
+
+/**
+ * tipc_bclink_peek_nack - process a NACK msg meant for another node
+ * @dest: NACK's destination node; @gap_after/@gap_to: gap the NACK reports
+ * Only tipc_net_lock set.
+ */
+
+static void tipc_bclink_peek_nack(u32 dest, u32 sender_tag, u32 gap_after, u32 gap_to)
+{
+	struct tipc_node *n_ptr = tipc_node_find(dest);
+	u32 my_after, my_to;
+
+	if (unlikely(!n_ptr || !tipc_node_is_up(n_ptr)))
+		return;
+	tipc_node_lock(n_ptr);
+	/*
+	 * Modify gap to suppress unnecessary NACKs from this node
+	 */
+	my_after = n_ptr->bclink.gap_after;
+	my_to = n_ptr->bclink.gap_to;
+
+	if (less_eq(gap_after, my_after)) {	/* reported gap starts at or before ours */
+		if (less(my_after, gap_to) && less(gap_to, my_to))
+			n_ptr->bclink.gap_after = gap_to;	/* front of our gap is covered */
+		else if (less_eq(my_to, gap_to))
+			n_ptr->bclink.gap_to = n_ptr->bclink.gap_after;	/* fully covered: empty gap */
+	} else if (less_eq(gap_after, my_to)) {	/* reported gap starts inside ours */
+		if (less_eq(my_to, gap_to))
+			n_ptr->bclink.gap_to = gap_after;	/* tail of our gap is covered */
+	} else {
+		/*
+		 * Expand gap if missing bufs not in deferred queue:
+		 */
+		struct sk_buff *buf = n_ptr->bclink.deferred_head;
+		u32 prev = n_ptr->bclink.gap_to;
+
+		for (; buf; buf = buf->next) {
+			u32 seqno = buf_seqno(buf);
+
+			if (mod(seqno - prev) != 1) {	/* hole in the deferred sequence */
+				buf = NULL;
+				break;
+			}
+			if (seqno == gap_after)	/* contiguous up to gap_after */
+				break;
+			prev = seqno;
+		}
+		if (buf == NULL)	/* hole found, or queue ended before gap_after */
+			n_ptr->bclink.gap_to = gap_after;
+	}
+	/*
+	 * Some nodes may send a complementary NACK now:
+	 */
+	if (bclink_ack_allowed(sender_tag + 1)) {
+		if (n_ptr->bclink.gap_to != n_ptr->bclink.gap_after) {
+			bclink_send_nack(n_ptr);
+			bclink_set_gap(n_ptr);	/* gap reset after the NACK is sent */
+		}
+	}
+	tipc_node_unlock(n_ptr);
+}
+
+/**
+ * tipc_bclink_send_msg - broadcast a packet to all nodes in cluster
+ */
+
+int tipc_bclink_send_msg(struct sk_buff *buf)
+{
+	int res;
+
+	spin_lock_bh(&bc_lock);
+
+	res = tipc_link_send_buf(bcl, buf);
+	if (likely(res > 0))	/* positive result: refresh last-sent marker */
+		bclink_set_last_sent();
+
+	bcl->stats.queue_sz_counts++;	/* sampled on every call, sent or not */
+	bcl->stats.accu_queue_sz += bcl->out_queue_size;
+
+	spin_unlock_bh(&bc_lock);
+	return res;	/* tipc_link_send_buf() result, passed through */
+}
+
+/**
+ * tipc_bclink_recv_pkt - receive a broadcast packet, and deliver upwards
+ * @buf: received buffer; discarded, deferred or passed on by this function
+ * tipc_net_lock is read_locked, no other locks set
+ */
+
+void tipc_bclink_recv_pkt(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct tipc_node *node = tipc_node_find(msg_prevnode(msg));
+	u32 next_in;
+	u32 seqno;
+	struct sk_buff *deferred;
+
+	if (unlikely(!node || !tipc_node_is_up(node) || !node->bclink.supported ||
+		     (msg_mc_netid(msg) != tipc_net_id))) {
+		buf_discard(buf);
+		return;
+	}
+
+	if (unlikely(msg_user(msg) == BCAST_PROTOCOL)) {	/* NACK handling */
+		if (msg_destnode(msg) == tipc_own_addr) {	/* addressed to this node */
+			tipc_node_lock(node);
+			tipc_bclink_acknowledge(node, msg_bcast_ack(msg));
+			tipc_node_unlock(node);
+			spin_lock_bh(&bc_lock);
+			bcl->stats.recv_nacks++;
+			bclink->retransmit_to = node;
+			bclink_retransmit_pkt(msg_bcgap_after(msg),
+					      msg_bcgap_to(msg));
+			spin_unlock_bh(&bc_lock);
+		} else {	/* addressed to another node: only peek at it */
+			tipc_bclink_peek_nack(msg_destnode(msg),
+					      msg_bcast_tag(msg),
+					      msg_bcgap_after(msg),
+					      msg_bcgap_to(msg));
+		}
+		buf_discard(buf);
+		return;
+	}
+
+	tipc_node_lock(node);
+receive:
+	deferred = node->bclink.deferred_head;	/* sampled while node is locked */
+	next_in = mod(node->bclink.last_in + 1);
+	seqno = msg_seqno(msg);
+
+	if (likely(seqno == next_in)) {	/* in-sequence packet */
+		bcl->stats.recv_info++;
+		node->bclink.last_in++;
+		bclink_set_gap(node);
+		if (unlikely(bclink_ack_allowed(seqno))) {
+			bclink_send_ack(node);
+			bcl->stats.sent_acks++;
+		}
+		if (likely(msg_isdata(msg))) {	/* every branch unlocks before delivery */
+			tipc_node_unlock(node);
+			tipc_port_recv_mcast(buf, NULL);
+		} else if (msg_user(msg) == MSG_BUNDLER) {
+			bcl->stats.recv_bundles++;
+			bcl->stats.recv_bundled += msg_msgcnt(msg);
+			tipc_node_unlock(node);
+			tipc_link_recv_bundle(buf);
+		} else if (msg_user(msg) == MSG_FRAGMENTER) {
+			bcl->stats.recv_fragments++;
+			if (tipc_link_recv_fragment(&node->bclink.defragm,
+						    &buf, &msg))
+				bcl->stats.recv_fragmented++;
+			tipc_node_unlock(node);
+			tipc_net_route_msg(buf);
+		} else {
+			tipc_node_unlock(node);
+			tipc_net_route_msg(buf);
+		}
+		if (deferred && (buf_seqno(deferred) == mod(next_in + 1))) {	/* NOTE(review): 'deferred' is read after unlock */
+			tipc_node_lock(node);
+			buf = deferred;
+			msg = buf_msg(buf);
+			node->bclink.deferred_head = deferred->next;
+			goto receive;	/* deliver the next deferred packet too */
+		}
+		return;
+	} else if (less(next_in, seqno)) {	/* ahead of sequence: try to defer */
+		u32 gap_after = node->bclink.gap_after;	/* values from before the update below */
+		u32 gap_to = node->bclink.gap_to;
+
+		if (tipc_link_defer_pkt(&node->bclink.deferred_head,
+					&node->bclink.deferred_tail,
+					buf)) {
+			node->bclink.nack_sync++;
+			bcl->stats.deferred_recv++;
+			if (seqno == mod(gap_after + 1))
+				node->bclink.gap_after = seqno;
+			else if (less(gap_after, seqno) && less(seqno, gap_to))
+				node->bclink.gap_to = seqno;
+		}
+		if (bclink_ack_allowed(node->bclink.nack_sync)) {
+			if (gap_to != gap_after)	/* compares the saved, pre-update bounds */
+				bclink_send_nack(node);
+			bclink_set_gap(node);
+		}
+	} else {	/* seqno behind next_in: counted as duplicate */
+		bcl->stats.duplicates++;
+		buf_discard(buf);
+	}
+	tipc_node_unlock(node);
+}
+
+u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
+{
+	return (n_ptr->bclink.supported &&	/* 1 if node's acked seqno lags last sent */
+		(tipc_bclink_get_last_sent() != n_ptr->bclink.acked));
+}
+
+
+/**
+ * tipc_bcbearer_send - send a packet through the broadcast pseudo-bearer
+ *
+ * Send through as many bearers as necessary to reach all nodes
+ * that support TIPC multicasting.
+ *
+ * Returns 0 once all targets are reached; otherwise TIPC_OK (see end of body)
+ */
+
+static int tipc_bcbearer_send(struct sk_buff *buf,
+			      struct tipc_bearer *unused1,
+			      struct tipc_media_addr *unused2)
+{
+	int bp_index;
+
+	/* Prepare buffer for broadcasting (if first time trying to send it) */
+
+	if (likely(!msg_non_seq(buf_msg(buf)))) {
+		struct tipc_msg *msg;
+
+		assert(tipc_bcast_nmap.count != 0);
+		bcbuf_set_acks(buf, tipc_bcast_nmap.count);	/* one ack expected per mapped node */
+		msg = buf_msg(buf);
+		msg_set_non_seq(msg, 1);	/* also makes the test above fail next time */
+		msg_set_mc_netid(msg, tipc_net_id);
+		bcl->stats.sent_info++;
+	}
+
+	/* Send buffer over bearers until all targets reached */
+
+	bcbearer->remains = tipc_bcast_nmap;
+
+	for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
+		struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary;
+		struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary;
+
+		if (!p)
+			break;	/* no more bearers to try */
+
+		tipc_nmap_diff(&bcbearer->remains, &p->nodes, &bcbearer->remains_new);
+		if (bcbearer->remains_new.count == bcbearer->remains.count)
+			continue;	/* bearer pair doesn't add anything */
+
+		if (p->blocked ||
+		    p->media->send_msg(buf, p, &p->media->bcast_addr)) {
+			/* unable to send on primary bearer */
+			if (!s || s->blocked ||
+			    s->media->send_msg(buf, s,
+					       &s->media->bcast_addr)) {
+				/* unable to send on either bearer */
+				continue;
+			}
+		}
+
+		if (s) {	/* after a successful send, swap the pair's roles */
+			bcbearer->bpairs[bp_index].primary = s;
+			bcbearer->bpairs[bp_index].secondary = p;
+		}
+
+		if (bcbearer->remains_new.count == 0)
+			return 0;	/* every target node covered */
+
+		bcbearer->remains = bcbearer->remains_new;
+	}
+
+	/*
+	 * Unable to reach all targets (indicate success, since currently
+	 * there isn't code in place to properly block & unblock the
+	 * pseudo-bearer used by the broadcast link)
+	 */
+
+	return TIPC_OK;
+}
+
+/**
+ * tipc_bcbearer_sort - create sets of bearer pairs used by broadcast bearer
+ */
+
+void tipc_bcbearer_sort(void)
+{
+	struct bcbearer_pair *bp_temp = bcbearer->bpairs_temp;
+	struct bcbearer_pair *bp_curr;
+	int b_index;
+	int pri;
+
+	spin_lock_bh(&bc_lock);
+
+	/* Group bearers by priority (can assume max of two per priority) */
+
+	memset(bp_temp, 0, sizeof(bcbearer->bpairs_temp));
+
+	for (b_index = 0; b_index < MAX_BEARERS; b_index++) {
+		struct tipc_bearer *b = &tipc_bearers[b_index];
+
+		if (!b->active || !b->nodes.count)	/* inactive, or reaches no nodes */
+			continue;
+
+		if (!bp_temp[b->priority].primary)
+			bp_temp[b->priority].primary = b;
+		else
+			bp_temp[b->priority].secondary = b;	/* a third bearer would overwrite this */
+	}
+
+	/* Create array of bearer pairs for broadcasting */
+
+	bp_curr = bcbearer->bpairs;
+	memset(bcbearer->bpairs, 0, sizeof(bcbearer->bpairs));
+
+	for (pri = TIPC_MAX_LINK_PRI; pri >= 0; pri--) {	/* highest priority first */
+
+		if (!bp_temp[pri].primary)
+			continue;
+
+		bp_curr->primary = bp_temp[pri].primary;
+
+		if (bp_temp[pri].secondary) {
+			if (tipc_nmap_equal(&bp_temp[pri].primary->nodes,
+					    &bp_temp[pri].secondary->nodes)) {
+				bp_curr->secondary = bp_temp[pri].secondary;
+			} else {	/* different node sets: give it its own entry */
+				bp_curr++;
+				bp_curr->primary = bp_temp[pri].secondary;
+			}
+		}
+
+		bp_curr++;
+	}
+
+	spin_unlock_bh(&bc_lock);
+}
+
+/**
+ * tipc_bcbearer_push - resolve bearer congestion
+ *
+ * Forces bclink to push out any unsent packets, until all packets are gone
+ * or congestion reoccurs.
+ * No locks set when function called
+ */
+
+void tipc_bcbearer_push(void)
+{
+	struct tipc_bearer *b_ptr;
+
+	spin_lock_bh(&bc_lock);
+	b_ptr = &bcbearer->bearer;
+	if (b_ptr->blocked) {	/* nothing to do unless the pseudo-bearer is blocked */
+		b_ptr->blocked = 0;
+		tipc_bearer_lock_push(b_ptr);
+	}
+	spin_unlock_bh(&bc_lock);
+}
+
+
+int tipc_bclink_stats(char *buf, const u32 buf_size)
+{
+	struct print_buf pb;
+
+	if (!bcl)	/* NOTE(review): tested before bc_lock is taken */
+		return 0;
+
+	tipc_printbuf_init(&pb, buf, buf_size);
+
+	spin_lock_bh(&bc_lock);
+
+	tipc_printf(&pb, "Link <%s>\n"
+			 "  Window:%u packets\n",
+		    bcl->name, bcl->queue_limit[0]);
+	tipc_printf(&pb, "  RX packets:%u fragments:%u/%u bundles:%u/%u\n",
+		    bcl->stats.recv_info,
+		    bcl->stats.recv_fragments,
+		    bcl->stats.recv_fragmented,
+		    bcl->stats.recv_bundles,
+		    bcl->stats.recv_bundled);
+	tipc_printf(&pb, "  TX packets:%u fragments:%u/%u bundles:%u/%u\n",
+		    bcl->stats.sent_info,
+		    bcl->stats.sent_fragments,
+		    bcl->stats.sent_fragmented,
+		    bcl->stats.sent_bundles,
+		    bcl->stats.sent_bundled);
+	tipc_printf(&pb, "  RX naks:%u defs:%u dups:%u\n",
+		    bcl->stats.recv_nacks,
+		    bcl->stats.deferred_recv,
+		    bcl->stats.duplicates);
+	tipc_printf(&pb, "  TX naks:%u acks:%u dups:%u\n",
+		    bcl->stats.sent_nacks,
+		    bcl->stats.sent_acks,
+		    bcl->stats.retransmitted);	/* printed under the "dups" label */
+	tipc_printf(&pb, "  Congestion bearer:%u link:%u  Send queue max:%u avg:%u\n",
+		    bcl->stats.bearer_congs,
+		    bcl->stats.link_congs,
+		    bcl->stats.max_queue_sz,
+		    bcl->stats.queue_sz_counts	/* guard against division by zero */
+		    ? (bcl->stats.accu_queue_sz / bcl->stats.queue_sz_counts)
+		    : 0);
+
+	spin_unlock_bh(&bc_lock);
+	return tipc_printbuf_validate(&pb);
+}
+
+int tipc_bclink_reset_stats(void)
+{
+	if (!bcl)	/* broadcast link not set up */
+		return -ENOPROTOOPT;
+	/* Zero every counter in bcl->stats under bc_lock; returns 0 */
+	spin_lock_bh(&bc_lock);
+	memset(&bcl->stats, 0, sizeof(bcl->stats));
+	spin_unlock_bh(&bc_lock);
+	return 0;
+}
+
+int tipc_bclink_set_queue_limits(u32 limit)
+{
+	if (!bcl)	/* broadcast link not set up */
+		return -ENOPROTOOPT;
+	if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN))
+		return -EINVAL;	/* outside [TIPC_MIN_LINK_WIN, TIPC_MAX_LINK_WIN] */
+	/* Apply the new window to the broadcast link under bc_lock */
+	spin_lock_bh(&bc_lock);
+	tipc_link_set_queue_limits(bcl, limit);
+	spin_unlock_bh(&bc_lock);
+	return 0;
+}
+
+int tipc_bclink_init(void)
+{
+	bcbearer = kzalloc(sizeof(*bcbearer), GFP_ATOMIC);
+	bclink = kzalloc(sizeof(*bclink), GFP_ATOMIC);
+	if (!bcbearer || !bclink) {	/* either failed: free both, return -ENOMEM */
+		warn("Multicast link creation failed, no memory\n");
+		kfree(bcbearer);
+		bcbearer = NULL;
+		kfree(bclink);
+		bclink = NULL;
+		return -ENOMEM;
+	}
+	/* Pseudo-bearer: its media sends via tipc_bcbearer_send() */
+	INIT_LIST_HEAD(&bcbearer->bearer.cong_links);
+	bcbearer->bearer.media = &bcbearer->media;
+	bcbearer->media.send_msg = tipc_bcbearer_send;
+	sprintf(bcbearer->media.name, "tipc-multicast");	/* NOTE(review): unbounded copy; bcl->name below uses strlcpy */
+	/* Broadcast link: owned by the embedded pseudo-node */
+	bcl = &bclink->link;
+	INIT_LIST_HEAD(&bcl->waiting_ports);
+	bcl->next_out_no = 1;
+	spin_lock_init(&bclink->node.lock);
+	bcl->owner = &bclink->node;
+	bcl->max_pkt = MAX_PKT_DEFAULT_MCAST;
+	tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
+	bcl->b_ptr = &bcbearer->bearer;
+	bcl->state = WORKING_WORKING;
+	strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
+
+	return 0;
+}
+
+void tipc_bclink_stop(void)
+{
+	spin_lock_bh(&bc_lock);
+	if (bcbearer) {	/* no-op if init never succeeded or stop already ran */
+		tipc_link_stop(bcl);
+		bcl = NULL;	/* pointers cleared so later !bcl checks see the teardown */
+		kfree(bclink);
+		bclink = NULL;
+		kfree(bcbearer);
+		bcbearer = NULL;
+	}
+	spin_unlock_bh(&bc_lock);
+}
+
+
+/**
+ * tipc_nmap_add - add a node to a node map (no-op if already present)
+ * @nm_ptr: map to update; @node: address whose node number selects the bit
+ */
+void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node)
+{
+	int n = tipc_node(node);
+	int w = n / WSIZE;		/* word holding the node's bit */
+	u32 mask = (1u << (n % WSIZE));	/* 1u: bit 31 would overflow a signed int */
+
+	if ((nm_ptr->map[w] & mask) == 0) {
+		nm_ptr->count++;
+		nm_ptr->map[w] |= mask;
+	}
+}
+
+/**
+ * tipc_nmap_remove - remove a node from a node map (no-op if absent)
+ * @nm_ptr: map to update; @node: address whose node number selects the bit
+ */
+void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
+{
+	int n = tipc_node(node);
+	int w = n / WSIZE;		/* word holding the node's bit */
+	u32 mask = (1u << (n % WSIZE));	/* 1u: bit 31 would overflow a signed int */
+
+	if ((nm_ptr->map[w] & mask) != 0) {
+		nm_ptr->map[w] &= ~mask;
+		nm_ptr->count--;
+	}
+}
+
+/**
+ * tipc_nmap_diff - find differences between node maps
+ * @nm_a: input node map A
+ * @nm_b: input node map B
+ * @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
+ * @nm_diff is fully overwritten, including its count of set bits.
+ */
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+			   struct tipc_node_map *nm_b,
+			   struct tipc_node_map *nm_diff)
+{
+	int stop = ARRAY_SIZE(nm_a->map);
+	int w;
+	int b;
+	u32 map;
+
+	memset(nm_diff, 0, sizeof(*nm_diff));
+	for (w = 0; w < stop; w++) {
+		map = nm_a->map[w] ^ (nm_a->map[w] & nm_b->map[w]);
+		nm_diff->map[w] = map;
+		if (map != 0) {
+			for (b = 0 ; b < WSIZE; b++) {
+				if (map & (1u << b))	/* 1u: b reaches 31 */
+					nm_diff->count++;
+			}
+		}
+	}
+}
+
+/**
+ * tipc_port_list_add - add a port to a port list, ensuring no duplicates
+ */
+
+void tipc_port_list_add(struct port_list *pl_ptr, u32 port)
+{
+	struct port_list *item = pl_ptr;
+	int i;
+	int item_sz = PLSIZE;
+	int cnt = pl_ptr->count;	/* entries not yet scanned */
+
+	for (; ; cnt -= item_sz, item = item->next) {
+		if (cnt < PLSIZE)	/* last, partially filled item */
+			item_sz = cnt;
+		for (i = 0; i < item_sz; i++)
+			if (item->ports[i] == port)
+				return;	/* already listed */
+		if (i < PLSIZE) {	/* free slot right after the used ones */
+			item->ports[i] = port;
+			pl_ptr->count++;
+			return;
+		}
+		if (!item->next) {	/* chain is full: extend it */
+			item->next = kmalloc(sizeof(*item), GFP_ATOMIC);
+			if (!item->next) {
+				warn("Incomplete multicast delivery, no memory\n");
+				return;	/* port is dropped; caller is not told */
+			}
+			item->next->next = NULL;
+		}
+	}
+}
+
+/**
+ * tipc_port_list_free - free dynamically created entries in port_list chain
+ * @pl_ptr: head of chain; the head itself is not freed, nor is its 'next' reset
+ */
+
+void tipc_port_list_free(struct port_list *pl_ptr)
+{
+	struct port_list *item;
+	struct port_list *next;
+
+	for (item = pl_ptr->next; item; item = next) {
+		next = item->next;	/* read before item is freed */
+		kfree(item);
+	}
+}
+
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
new file mode 100644
index 00000000..500c97f1
--- /dev/null
+++ b/net/tipc/bcast.h
@@ -0,0 +1,106 @@
+/*
+ * net/tipc/bcast.h: Include file for TIPC broadcast code
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_BCAST_H
+#define _TIPC_BCAST_H
+
+#define MAX_NODES 4096	/* # of bits in a tipc_node_map bitmap */
+#define WSIZE 32	/* # of bits in each element of the bitmap array */
+
+/**
+ * struct tipc_node_map - set of node identifiers
+ * @count: # of nodes in set
+ * @map: bitmap of node identifiers that are in the set, stored as
+ *       MAX_NODES / WSIZE words of WSIZE bits each
+ */
+struct tipc_node_map {
+	u32 count;
+	u32 map[MAX_NODES / WSIZE];
+};
+
+extern struct tipc_node_map tipc_bcast_nmap;
+
+#define PLSIZE 32	/* # of port references held by one port_list entry */
+
+/**
+ * struct port_list - set of node local destination ports
+ * @count: # of ports in set (only valid for first entry in list)
+ * @next: pointer to next entry in list (NULL if this is the last entry)
+ * @ports: array of port references
+ */
+
+struct port_list {
+	int count;
+	struct port_list *next;
+	u32 ports[PLSIZE];
+};
+
+
+struct tipc_node;	/* forward declaration: used below via pointers only */
+
+extern const char tipc_bclink_name[];
+
+void tipc_nmap_add(struct tipc_node_map *nm_ptr, u32 node);
+void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node);
+
+/**
+ * tipc_nmap_equal - test for equality of node maps
+ * Returns 1 if count and bitmap of both maps are identical, otherwise 0.
+ */
+static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b)
+{
+	return memcmp(nm_a, nm_b, sizeof(struct tipc_node_map)) == 0;
+}
+
+void tipc_port_list_add(struct port_list *pl_ptr, u32 port);
+void tipc_port_list_free(struct port_list *pl_ptr);
+
+int  tipc_bclink_init(void);
+void tipc_bclink_stop(void);
+struct tipc_node *tipc_bclink_retransmit_to(void);
+void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked);
+int  tipc_bclink_send_msg(struct sk_buff *buf);
+void tipc_bclink_recv_pkt(struct sk_buff *buf);
+u32  tipc_bclink_get_last_sent(void);
+u32  tipc_bclink_acks_missing(struct tipc_node *n_ptr);
+void tipc_bclink_check_gap(struct tipc_node *n_ptr, u32 seqno);
+int  tipc_bclink_stats(char *stats_buf, const u32 buf_size);
+int  tipc_bclink_reset_stats(void);
+int  tipc_bclink_set_queue_limits(u32 limit);
+void tipc_bcbearer_sort(void);
+void tipc_bcbearer_push(void);
+
+#endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
new file mode 100644
index 00000000..85209ead
--- /dev/null
+++ b/net/tipc/bearer.c
@@ -0,0 +1,677 @@
+/*
+ * net/tipc/bearer.c: TIPC bearer code
+ *
+ * Copyright (c) 1996-2006, Ericsson AB
+ * Copyright (c) 2004-2006, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "bearer.h"
+#include "discover.h"
+
+#define MAX_ADDR_STR 32	/* size of buffer handed to a media's addr2str() */
+
+static struct media media_list[MAX_MEDIA];	/* registered media types */
+static u32 media_count;		/* # of media_list entries in use */
+
+struct tipc_bearer tipc_bearers[MAX_BEARERS];	/* slot in use if 'active' set */
+
+static void bearer_disable(struct tipc_bearer *b_ptr);
+
+/**
+ * media_name_valid - validate media name
+ *
+ * A name is valid if it fits (with terminator) in TIPC_MAX_MEDIA_NAME bytes
+ * and consists solely of characters from tipc_alphabet.
+ * Returns 1 if media name is valid, otherwise 0.
+ */
+static int media_name_valid(const char *name)
+{
+	u32 name_len = strlen(name);
+
+	if (name_len + 1 <= TIPC_MAX_MEDIA_NAME)
+		return strspn(name, tipc_alphabet) == name_len;
+	return 0;
+}
+
+/**
+ * media_find - locates specified media object by name
+ *
+ * Returns ptr to the matching media_list entry, or NULL if none matches.
+ */
+static struct media *media_find(const char *name)
+{
+	u32 idx;
+
+	for (idx = 0; idx < media_count; idx++) {
+		if (strcmp(media_list[idx].name, name) == 0)
+			return &media_list[idx];
+	}
+	return NULL;
+}
+
+/**
+ * tipc_register_media - register a media type
+ *
+ * Bearers for this media type must be activated separately at a later stage.
+ * Returns 0 if the media was added to media_list, otherwise -EINVAL.
+ */
+int  tipc_register_media(u32 media_type,
+			 char *name,
+			 int (*enable)(struct tipc_bearer *),
+			 void (*disable)(struct tipc_bearer *),
+			 int (*send_msg)(struct sk_buff *,
+					 struct tipc_bearer *,
+					 struct tipc_media_addr *),
+			 char *(*addr2str)(struct tipc_media_addr *a,
+					   char *str_buf, int str_size),
+			 struct tipc_media_addr *bcast_addr,
+			 const u32 bearer_priority,
+			 const u32 link_tolerance,  /* [ms] */
+			 const u32 send_window_limit)
+{
+	struct media *slot;
+	u32 slot_id, n;
+	int res = -EINVAL;
+
+	write_lock_bh(&tipc_net_lock);
+
+	if (tipc_mode != TIPC_NET_MODE) {
+		warn("Media <%s> rejected, not in networked mode yet\n", name);
+		goto exit;
+	}
+	if (!media_name_valid(name)) {
+		warn("Media <%s> rejected, illegal name\n", name);
+		goto exit;
+	}
+	if (bcast_addr == NULL) {
+		warn("Media <%s> rejected, no broadcast address\n", name);
+		goto exit;
+	}
+	if (bearer_priority < TIPC_MIN_LINK_PRI ||
+	    bearer_priority > TIPC_MAX_LINK_PRI) {
+		warn("Media <%s> rejected, illegal priority (%u)\n", name,
+		     bearer_priority);
+		goto exit;
+	}
+	if (link_tolerance < TIPC_MIN_LINK_TOL ||
+	    link_tolerance > TIPC_MAX_LINK_TOL) {
+		warn("Media <%s> rejected, illegal tolerance (%u)\n", name,
+		     link_tolerance);
+		goto exit;
+	}
+
+	/* claim the next slot; given back at 'undo' if the media is rejected */
+	slot_id = media_count++;
+	if (slot_id >= MAX_MEDIA) {
+		warn("Media <%s> rejected, media limit reached (%u)\n", name,
+		     MAX_MEDIA);
+		goto undo;
+	}
+	for (n = 0; n < slot_id; n++) {
+		if (media_list[n].type_id == media_type) {
+			warn("Media <%s> rejected, duplicate type (%u)\n", name,
+			     media_type);
+			goto undo;
+		}
+		if (strcmp(name, media_list[n].name) == 0) {
+			warn("Media <%s> rejected, duplicate name\n", name);
+			goto undo;
+		}
+	}
+
+	slot = &media_list[slot_id];
+	strcpy(slot->name, name);	/* length was checked by media_name_valid() */
+	memcpy(&slot->bcast_addr, bcast_addr, sizeof(*bcast_addr));
+	slot->type_id = media_type;
+	slot->priority = bearer_priority;
+	slot->tolerance = link_tolerance;
+	slot->window = send_window_limit;
+	slot->enable_bearer = enable;
+	slot->disable_bearer = disable;
+	slot->send_msg = send_msg;
+	slot->addr2str = addr2str;
+	res = 0;
+	goto exit;
+undo:
+	media_count--;
+exit:
+	write_unlock_bh(&tipc_net_lock);
+	return res;
+}
+
+/**
+ * tipc_media_addr_printf - record media address in print buffer
+ *
+ * Prints "name(address)" if the address type belongs to a registered media
+ * with an addr2str() routine; otherwise "UNKNOWN(type)" and a raw byte dump.
+ */
+void tipc_media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a)
+{
+	u32 media_type = ntohl(a->type);
+	struct media *found = NULL;
+	char addr_str[MAX_ADDR_STR];
+	unchar *raw;
+	u32 i;
+
+	for (i = 0; (i < media_count) && !found; i++) {
+		if (media_list[i].type_id == media_type)
+			found = &media_list[i];
+	}
+
+	if (found && found->addr2str) {
+		tipc_printf(pb, "%s(%s)", found->name,
+			    found->addr2str(a, addr_str, sizeof(addr_str)));
+		return;
+	}
+	raw = (unchar *)&a->dev_addr;
+	tipc_printf(pb, "UNKNOWN(%u)", media_type);
+	for (i = 0; i < (sizeof(*a) - sizeof(a->type)); i++)
+		tipc_printf(pb, "-%02x", raw[i]);
+}
+
+/**
+ * tipc_media_get_names - record names of registered media in buffer
+ *
+ * Returns NULL if tipc_cfg_reply_alloc() fails, else buffer with a TLV per media.
+ */
+struct sk_buff *tipc_media_get_names(void)
+{
+	struct sk_buff *reply;
+	int idx;
+
+	reply = tipc_cfg_reply_alloc(MAX_MEDIA * TLV_SPACE(TIPC_MAX_MEDIA_NAME));
+	if (reply == NULL)
+		return NULL;
+
+	read_lock_bh(&tipc_net_lock);
+	for (idx = 0; idx < media_count; idx++) {
+		tipc_cfg_append_tlv(reply, TIPC_TLV_MEDIA_NAME, media_list[idx].name,
+				    strlen(media_list[idx].name) + 1);
+	}
+	read_unlock_bh(&tipc_net_lock);
+	return reply;
+}
+
+/**
+ * bearer_name_validate - validate & (optionally) deconstruct bearer name
+ * @name: ptr to bearer name string, expected in the form "media:interface"
+ * @name_parts: ptr to area for bearer name components (or NULL if not needed)
+ *
+ * Each component must be non-empty, fit its buffer, and use only characters
+ * from tipc_alphabet. Returns 1 if bearer name is valid, otherwise 0.
+ */
+static int bearer_name_validate(const char *name,
+				struct bearer_name *name_parts)
+{
+	char name_copy[TIPC_MAX_BEARER_NAME];
+	char *media_name;
+	char *if_name;
+	u32 media_len;
+	u32 if_len;
+
+	/* copy bearer name & ensure length is OK */
+
+	name_copy[TIPC_MAX_BEARER_NAME - 1] = 0;
+	/* need above in case non-Posix strncpy() doesn't pad with nulls */
+	strncpy(name_copy, name, TIPC_MAX_BEARER_NAME);
+	if (name_copy[TIPC_MAX_BEARER_NAME - 1] != 0)	/* sentinel hit: too long */
+		return 0;
+
+	/* ensure all component parts of bearer name are present */
+
+	media_name = name_copy;
+	if_name = strchr(media_name, ':');
+	if (if_name == NULL)
+		return 0;
+	*(if_name++) = 0;	/* split at ':'; if_name now starts after it */
+	media_len = if_name - media_name;	/* includes terminator */
+	if_len = strlen(if_name) + 1;		/* includes terminator */
+
+	/* validate component parts of bearer name */
+
+	if ((media_len <= 1) || (media_len > TIPC_MAX_MEDIA_NAME) ||
+	    (if_len <= 1) || (if_len > TIPC_MAX_IF_NAME) ||
+	    (strspn(media_name, tipc_alphabet) != (media_len - 1)) ||
+	    (strspn(if_name, tipc_alphabet) != (if_len - 1)))
+		return 0;
+
+	/* return bearer name components, if necessary */
+
+	if (name_parts) {
+		strcpy(name_parts->media_name, media_name);
+		strcpy(name_parts->if_name, if_name);
+	}
+	return 1;
+}
+
+/**
+ * bearer_find - locates active bearer object with matching bearer name
+ */
+static struct tipc_bearer *bearer_find(const char *name)
+{
+	struct tipc_bearer *cand;
+	u32 idx;
+
+	for (idx = 0; idx < MAX_BEARERS; idx++) {
+		cand = &tipc_bearers[idx];
+		if (cand->active && strcmp(cand->name, name) == 0)
+			return cand;
+	}
+	return NULL;
+}
+
+/**
+ * tipc_bearer_find_interface - locates bearer object with matching interface name
+ * (the interface name is the part of an active bearer's name after the ':')
+ */
+struct tipc_bearer *tipc_bearer_find_interface(const char *if_name)
+{
+	struct tipc_bearer *b_ptr = tipc_bearers;
+	struct tipc_bearer *end = tipc_bearers + MAX_BEARERS;
+	char *after_colon;
+
+	for (; b_ptr != end; b_ptr++) {
+		if (b_ptr->active) {
+			after_colon = strchr(b_ptr->name, ':') + 1;
+			if (strcmp(after_colon, if_name) == 0)
+				return b_ptr;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * tipc_bearer_get_names - record names of bearers in buffer
+ *
+ * Active bearers are listed grouped by media, following media_list order.
+ */
+struct sk_buff *tipc_bearer_get_names(void)
+{
+	struct sk_buff *reply;
+	struct tipc_bearer *b_ptr;
+	int m;
+	int b;
+
+	reply = tipc_cfg_reply_alloc(MAX_BEARERS * TLV_SPACE(TIPC_MAX_BEARER_NAME));
+	if (reply == NULL)
+		return NULL;
+
+	read_lock_bh(&tipc_net_lock);
+	for (m = 0; m < media_count; m++) {
+		for (b = 0; b < MAX_BEARERS; b++) {
+			b_ptr = &tipc_bearers[b];
+			if (!b_ptr->active || (b_ptr->media != &media_list[m]))
+				continue;
+			tipc_cfg_append_tlv(reply, TIPC_TLV_BEARER_NAME,
+					    b_ptr->name, strlen(b_ptr->name) + 1);
+		}
+	}
+	read_unlock_bh(&tipc_net_lock);
+	return reply;
+}
+
+void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest)
+{
+	tipc_nmap_add(&b_ptr->nodes, dest);	/* add 'dest' to bearer's node map */
+	tipc_bcbearer_sort();
+	tipc_disc_add_dest(b_ptr->link_req);
+}
+
+void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest)
+{
+	tipc_nmap_remove(&b_ptr->nodes, dest);	/* drop 'dest' from node map */
+	tipc_bcbearer_sort();
+	tipc_disc_remove_dest(b_ptr->link_req);
+}
+
+/*
+ * bearer_push(): Resolve bearer congestion. Force the waiting
+ * links to push out their unsent packets, one packet per link
+ * per iteration, until all packets are gone or congestion reoccurs.
+ * Caller holds 'tipc_net_lock' (read) and bearer.lock.
+ * Returns true (1) if no congested links remain, or false (0) otherwise.
+ */
+static int bearer_push(struct tipc_bearer *b_ptr)
+{
+	struct link *cur, *nxt;
+	u32 rc;
+
+	if (b_ptr->blocked)
+		return 0;
+
+	while (!list_empty(&b_ptr->cong_links)) {
+		list_for_each_entry_safe(cur, nxt, &b_ptr->cong_links, link_list) {
+			rc = tipc_link_push_packet(cur);
+			if (rc == PUSH_FAILED)
+				goto out;
+			if (rc == PUSH_FINISHED)
+				list_move_tail(&cur->link_list, &b_ptr->links);
+		}
+	}
+out:
+	return list_empty(&b_ptr->cong_links);
+}
+
+/* tipc_bearer_lock_push - bearer_push() wrapper that takes bearer.lock itself */
+void tipc_bearer_lock_push(struct tipc_bearer *b_ptr)
+{
+	int all_sent;	/* non-zero if no congested links remain */
+
+	spin_lock_bh(&b_ptr->lock);
+	all_sent = bearer_push(b_ptr);
+	spin_unlock_bh(&b_ptr->lock);
+	if (all_sent != 0)
+		tipc_bcbearer_push();
+}
+
+/*
+ * Intended to be called by a media to resume sending after bearer congestion
+ * or blocking (see tipc_bearer_send()); signals a push if links are waiting.
+ */
+void tipc_continue(struct tipc_bearer *b_ptr)
+{
+	spin_lock_bh(&b_ptr->lock);
+	b_ptr->continue_count++;
+	if (!list_empty(&b_ptr->cong_links))
+		tipc_k_signal((Handler)tipc_bearer_lock_push, (unsigned long)b_ptr);
+	b_ptr->blocked = 0;
+	spin_unlock_bh(&b_ptr->lock);
+}
+
+/*
+ * Schedule link for sending of messages after the bearer
+ * has been deblocked by 'continue()'. This method is called
+ * when somebody tries to send a message via this link while
+ * the bearer is congested. 'tipc_net_lock' is in read_lock here
+ * bearer.lock is busy, i.e. the caller already holds it; "unlocked"
+ * in the name means that this routine does no locking of its own.
+ */
+static void tipc_bearer_schedule_unlocked(struct tipc_bearer *b_ptr, struct link *l_ptr)
+{
+	list_move_tail(&l_ptr->link_list, &b_ptr->cong_links);
+}
+
+/**
+ * tipc_bearer_schedule - move link to the bearer's list of congested links
+ *
+ * Used when somebody tries to send a message via this link while the
+ * bearer is congested; the link is then pushed once the bearer has been
+ * deblocked by tipc_continue(). 'tipc_net_lock' is in read_lock here,
+ * bearer.lock is free (it is taken and released by this routine).
+ */
+void tipc_bearer_schedule(struct tipc_bearer *b_ptr, struct link *l_ptr)
+{
+	spin_lock_bh(&b_ptr->lock);
+	list_move_tail(&l_ptr->link_list, &b_ptr->cong_links);
+	spin_unlock_bh(&b_ptr->lock);
+}
+
+
+/*
+ * tipc_bearer_resolve_congestion(): Check if there is bearer congestion,
+ * and if there is, try to resolve it before returning.
+ * Returns 1 if no congested links remain; otherwise queues 'l_ptr' as
+ * congested and returns 0. 'tipc_net_lock' is read_locked when called.
+ */
+int tipc_bearer_resolve_congestion(struct tipc_bearer *b_ptr, struct link *l_ptr)
+{
+	int pushed;
+
+	if (list_empty(&b_ptr->cong_links))
+		return 1;
+	spin_lock_bh(&b_ptr->lock);
+	pushed = bearer_push(b_ptr);
+	if (!pushed)
+		tipc_bearer_schedule_unlocked(b_ptr, l_ptr);
+	spin_unlock_bh(&b_ptr->lock);
+	return pushed ? 1 : 0;
+}
+
+/**
+ * tipc_bearer_congested - determines if bearer is currently congested
+ * Returns 1 if blocked or still congested after a push attempt, else 0.
+ */
+int tipc_bearer_congested(struct tipc_bearer *b_ptr, struct link *l_ptr)
+{
+	if (unlikely(b_ptr->blocked))
+		return 1;
+	if (unlikely(!list_empty(&b_ptr->cong_links)))
+		return !tipc_bearer_resolve_congestion(b_ptr, l_ptr);
+	return 0;
+}
+
+/**
+ * tipc_enable_bearer - enable bearer with the given name
+ *
+ * Returns 0 on success, otherwise an error code (e.g. -EINVAL, -ENOPROTOOPT).
+ */
+int tipc_enable_bearer(const char *name, u32 disc_domain, u32 priority)
+{
+	struct tipc_bearer *b_ptr;
+	struct media *m_ptr;
+	struct bearer_name b_name;
+	char addr_string[16];
+	u32 bearer_id, with_this_prio, i;
+	int res = -EINVAL;
+
+	if (tipc_mode != TIPC_NET_MODE) {
+		warn("Bearer <%s> rejected, not supported in standalone mode\n",
+		     name);
+		return -ENOPROTOOPT;
+	}
+	if (!bearer_name_validate(name, &b_name)) {
+		warn("Bearer <%s> rejected, illegal name\n", name);
+		return -EINVAL;
+	}
+	if (tipc_addr_domain_valid(disc_domain) &&
+	    (disc_domain != tipc_own_addr)) {
+		if (tipc_in_scope(disc_domain, tipc_own_addr)) {
+			disc_domain = tipc_own_addr & TIPC_CLUSTER_MASK;
+			res = 0;   /* accept any node in own cluster */
+		} else if (in_own_cluster(disc_domain))
+			res = 0;   /* accept specified node in own cluster */
+	}
+	if (res) {
+		warn("Bearer <%s> rejected, illegal discovery domain\n", name);
+		return -EINVAL;
+	}
+	if ((priority < TIPC_MIN_LINK_PRI || priority > TIPC_MAX_LINK_PRI) &&
+	    (priority != TIPC_MEDIA_LINK_PRI)) {
+		warn("Bearer <%s> rejected, illegal priority\n", name);
+		return -EINVAL;
+	}
+
+	/* 'res' is 0 here: re-arm it so the "goto exit" rejections below fail */
+	res = -EINVAL;
+	write_lock_bh(&tipc_net_lock);
+
+	m_ptr = media_find(b_name.media_name);
+	if (!m_ptr) {
+		warn("Bearer <%s> rejected, media <%s> not registered\n", name,
+		     b_name.media_name);
+		goto exit;
+	}
+
+	if (priority == TIPC_MEDIA_LINK_PRI)
+		priority = m_ptr->priority;
+
+restart:
+	bearer_id = MAX_BEARERS;
+	with_this_prio = 1;
+	for (i = MAX_BEARERS; i-- != 0; ) {	/* downward: lowest free slot wins */
+		if (!tipc_bearers[i].active) {
+			bearer_id = i;
+			continue;
+		}
+		if (!strcmp(name, tipc_bearers[i].name)) {
+			warn("Bearer <%s> rejected, already enabled\n", name);
+			goto exit;
+		}
+		if ((tipc_bearers[i].priority == priority) &&
+		    (++with_this_prio > 2)) {	/* max. 2 bearers per priority */
+			if (priority-- == 0) {
+				warn("Bearer <%s> rejected, duplicate priority\n",
+				     name);
+				goto exit;
+			}
+			warn("Bearer <%s> priority adjustment required %u->%u\n",
+			     name, priority + 1, priority);
+			goto restart;
+		}
+	}
+	if (bearer_id >= MAX_BEARERS) {
+		warn("Bearer <%s> rejected, bearer limit reached (%u)\n",
+		     name, MAX_BEARERS);
+		goto exit;
+	}
+
+	b_ptr = &tipc_bearers[bearer_id];
+	strcpy(b_ptr->name, name);
+	res = m_ptr->enable_bearer(b_ptr);
+	if (res) {
+		warn("Bearer <%s> rejected, enable failure (%d)\n", name, -res);
+		goto exit;
+	}
+
+	b_ptr->identity = bearer_id;
+	b_ptr->media = m_ptr;
+	b_ptr->net_plane = bearer_id + 'A';
+	b_ptr->active = 1;
+	b_ptr->priority = priority;
+	INIT_LIST_HEAD(&b_ptr->cong_links);
+	INIT_LIST_HEAD(&b_ptr->links);
+	spin_lock_init(&b_ptr->lock);
+
+	res = tipc_disc_create(b_ptr, &m_ptr->bcast_addr, disc_domain);
+	if (res) {
+		bearer_disable(b_ptr);
+		warn("Bearer <%s> rejected, discovery object creation failed\n",
+		     name);
+		goto exit;
+	}
+	info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
+	     name, tipc_addr_string_fill(addr_string, disc_domain), priority);
+exit:
+	write_unlock_bh(&tipc_net_lock);
+	return res;
+}
+
+/**
+ * tipc_block_bearer - block the bearer with the given name and reset its links
+ * Returns 0 on success, or -EINVAL if no active bearer has that name.
+ */
+int tipc_block_bearer(const char *name)
+{
+	struct tipc_bearer *b_ptr;
+	struct link *l_ptr, *next_l_ptr;
+	struct tipc_node *owner;
+	int res = -EINVAL;
+
+	read_lock_bh(&tipc_net_lock);
+	b_ptr = bearer_find(name);
+	if (b_ptr == NULL) {
+		warn("Attempt to block unknown bearer <%s>\n", name);
+		goto exit;
+	}
+
+	info("Blocking bearer <%s>\n", name);
+	spin_lock_bh(&b_ptr->lock);
+	b_ptr->blocked = 1;
+	list_for_each_entry_safe(l_ptr, next_l_ptr, &b_ptr->links, link_list) {
+		owner = l_ptr->owner;
+		spin_lock_bh(&owner->lock);
+		tipc_link_reset(l_ptr);
+		spin_unlock_bh(&owner->lock);
+	}
+	spin_unlock_bh(&b_ptr->lock);
+	res = 0;
+exit:
+	read_unlock_bh(&tipc_net_lock);
+	return res;
+}
+
+/**
+ * bearer_disable - shut down a bearer and mark its structure inactive
+ *
+ * Blocks the bearer, calls the media's disable routine, deletes the links on
+ * its 'links' list and its discovery object (if any), then zeroes the struct.
+ *
+ * Note: This routine assumes caller holds tipc_net_lock.
+ */
+static void bearer_disable(struct tipc_bearer *b_ptr)
+{
+	struct link *l_ptr, *next_l_ptr;
+
+	info("Disabling bearer <%s>\n", b_ptr->name);
+	spin_lock_bh(&b_ptr->lock);
+	b_ptr->blocked = 1;
+	b_ptr->media->disable_bearer(b_ptr);
+	list_for_each_entry_safe(l_ptr, next_l_ptr, &b_ptr->links, link_list)
+		tipc_link_delete(l_ptr);
+	if (b_ptr->link_req != NULL)
+		tipc_disc_delete(b_ptr->link_req);
+	spin_unlock_bh(&b_ptr->lock);
+	memset(b_ptr, 0, sizeof(*b_ptr));
+}
+
+/* tipc_disable_bearer - disable the active bearer with the given name */
+int tipc_disable_bearer(const char *name)
+{
+	struct tipc_bearer *b_ptr;
+	int res = 0;
+
+	write_lock_bh(&tipc_net_lock);
+	b_ptr = bearer_find(name);
+	if (b_ptr != NULL) {
+		bearer_disable(b_ptr);
+	} else {
+		warn("Attempt to disable unknown bearer <%s>\n", name);
+		res = -EINVAL;
+	}
+	write_unlock_bh(&tipc_net_lock);
+	return res;
+}
+
+
+
+void tipc_bearer_stop(void)
+{
+	struct tipc_bearer *b_ptr;
+
+	for (b_ptr = tipc_bearers; b_ptr < &tipc_bearers[MAX_BEARERS]; b_ptr++) {
+		if (b_ptr->active)
+			bearer_disable(b_ptr);
+	}
+	media_count = 0;	/* media registrations are dropped as well */
+}
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
new file mode 100644
index 00000000..31d6172b
--- /dev/null
+++ b/net/tipc/bearer.h
@@ -0,0 +1,214 @@
+/*
+ * net/tipc/bearer.h: Include file for TIPC bearer code
+ *
+ * Copyright (c) 1996-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_BEARER_H
+#define _TIPC_BEARER_H
+
+#include "bcast.h"
+
+#define MAX_BEARERS 8	/* # of entries in tipc_bearers[] */
+#define MAX_MEDIA 4	/* max # of media types that can be registered */
+
+/*
+ * Identifiers of supported TIPC media types
+ */
+#define TIPC_MEDIA_TYPE_ETH	1
+
+/*
+ * Destination address structure used by TIPC bearers when sending messages
+ *
+ * IMPORTANT: The fields of this structure MUST be stored using the specified
+ * byte order indicated below, as the structure is exchanged between nodes
+ * as part of a link setup process.
+ */
+struct tipc_media_addr {
+	__be32  type;			/* media type_id (network byte order) */
+	union {
+		__u8   eth_addr[6];	/* 48 bit Ethernet addr (byte array) */
+	} dev_addr;			/* media-specific device address */
+};
+
+struct tipc_bearer;
+
+/**
+ * struct media - TIPC media information available to internal users
+ * @send_msg: routine which handles buffer transmission (returns 0 on success)
+ * @enable_bearer: routine which enables a bearer (returns 0 on success)
+ * @disable_bearer: routine which disables a bearer
+ * @addr2str: routine which converts bearer's address to string form
+ * @bcast_addr: media address used in broadcasting
+ * @priority: default link (and bearer) priority
+ * @tolerance: default time (in ms) before declaring link failure
+ * @window: default window (in packets) before declaring link congestion
+ * @type_id: TIPC media identifier (unique among registered media)
+ * @name: media name (unique among registered media)
+ */
+
+struct media {
+	int (*send_msg)(struct sk_buff *buf,
+			struct tipc_bearer *b_ptr,
+			struct tipc_media_addr *dest);
+	int (*enable_bearer)(struct tipc_bearer *b_ptr);
+	void (*disable_bearer)(struct tipc_bearer *b_ptr);
+	char *(*addr2str)(struct tipc_media_addr *a,
+			  char *str_buf, int str_size);
+	struct tipc_media_addr bcast_addr;
+	u32 priority;
+	u32 tolerance;
+	u32 window;
+	u32 type_id;
+	char name[TIPC_MAX_MEDIA_NAME];
+};
+
+/**
+ * struct tipc_bearer - TIPC bearer structure
+ * @usr_handle: pointer to additional media-specific information about bearer
+ * @mtu: max packet size bearer can support
+ * @blocked: non-zero if bearer is blocked
+ * @lock: spinlock for controlling access to bearer
+ * @addr: media-specific address associated with bearer
+ * @name: bearer name (format = media:interface)
+ * @media: ptr to media structure associated with bearer
+ * @priority: default link priority for bearer
+ * @identity: array index of this bearer within TIPC bearer array
+ * @link_req: ptr to (optional) structure making periodic link setup requests
+ * @links: list of non-congested links associated with bearer
+ * @cong_links: list of congested links associated with bearer
+ * @continue_count: # of times bearer has resumed after congestion or blocking
+ * @active: non-zero if bearer structure represents an enabled bearer
+ * @net_plane: network plane ('A' through 'H') currently associated with bearer
+ * @nodes: indicates which nodes in cluster can be reached through bearer
+ *
+ * Note: media-specific code is responsible for initialization of the fields
+ * indicated below when a bearer is enabled; TIPC's generic bearer code takes
+ * care of initializing all other fields.
+ */
+struct tipc_bearer {
+	void *usr_handle;			/* initialized by media */
+	u32 mtu;				/* initialized by media */
+	int blocked;				/* initialized by media */
+	struct tipc_media_addr addr;		/* initialized by media */
+	char name[TIPC_MAX_BEARER_NAME];
+	spinlock_t lock;
+	struct media *media;
+	u32 priority;
+	u32 identity;
+	struct link_req *link_req;
+	struct list_head links;
+	struct list_head cong_links;
+	u32 continue_count;
+	int active;
+	char net_plane;
+	struct tipc_node_map nodes;
+};
+
+struct bearer_name {
+	char media_name[TIPC_MAX_MEDIA_NAME];	/* bearer name part before ':' */
+	char if_name[TIPC_MAX_IF_NAME];		/* bearer name part after ':' */
+};
+
+struct link;
+
+extern struct tipc_bearer tipc_bearers[];
+
+/*
+ * TIPC routines available to supported media types
+ */
+int tipc_register_media(u32 media_type,
+		 char *media_name, int (*enable)(struct tipc_bearer *),
+		 void (*disable)(struct tipc_bearer *),
+		 int (*send_msg)(struct sk_buff *,
+			struct tipc_bearer *, struct tipc_media_addr *),
+		 char *(*addr2str)(struct tipc_media_addr *a,
+			char *str_buf, int str_size),
+		 struct tipc_media_addr *bcast_addr, const u32 bearer_priority,
+		 const u32 link_tolerance,  /* [ms] */
+		 const u32 send_window_limit);
+
+void tipc_recv_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr);
+
+int  tipc_block_bearer(const char *name);
+void tipc_continue(struct tipc_bearer *tb_ptr);
+
+int tipc_enable_bearer(const char *bearer_name, u32 disc_domain, u32 priority);
+int tipc_disable_bearer(const char *name);
+
+/*
+ * Routines made available to TIPC by supported media types
+ */
+int  tipc_eth_media_start(void);
+void tipc_eth_media_stop(void);
+
+void tipc_media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a);
+struct sk_buff *tipc_media_get_names(void);
+
+struct sk_buff *tipc_bearer_get_names(void);
+void tipc_bearer_add_dest(struct tipc_bearer *b_ptr, u32 dest);
+void tipc_bearer_remove_dest(struct tipc_bearer *b_ptr, u32 dest);
+void tipc_bearer_schedule(struct tipc_bearer *b_ptr, struct link *l_ptr);
+struct tipc_bearer *tipc_bearer_find_interface(const char *if_name);
+int tipc_bearer_resolve_congestion(struct tipc_bearer *b_ptr, struct link *l_ptr);
+int tipc_bearer_congested(struct tipc_bearer *b_ptr, struct link *l_ptr);
+void tipc_bearer_stop(void);
+void tipc_bearer_lock_push(struct tipc_bearer *b_ptr);
+
+
+/**
+ * tipc_bearer_send - sends buffer to destination over bearer
+ *
+ * Returns true (1) if successful, or false (0) if unable to send
+ * (i.e. the inverse of the media send routine's zero-on-success result).
+ *
+ * IMPORTANT:
+ * The media send routine must not alter the buffer being passed in
+ * as it may be needed for later retransmission!
+ *
+ * If the media send routine returns a non-zero value (indicating that
+ * it was unable to send the buffer), it must:
+ *   1) mark the bearer as blocked,
+ *   2) call tipc_continue() once the bearer is able to send again.
+ * Media types that are unable to meet these two criteria must ensure their
+ * send routine always returns success -- even if the buffer was not sent --
+ * and let TIPC's link code deal with the undelivered message.
+ */
+static inline int tipc_bearer_send(struct tipc_bearer *b_ptr,
+				   struct sk_buff *buf,
+				   struct tipc_media_addr *dest)
+{
+	return b_ptr->media->send_msg(buf, b_ptr, dest) == 0;
+}
+
+#endif	/* _TIPC_BEARER_H */
diff --git a/net/tipc/config.c b/net/tipc/config.c
new file mode 100644
index 00000000..b25a396b
--- /dev/null
+++ b/net/tipc/config.c
@@ -0,0 +1,502 @@
+/*
+ * net/tipc/config.c: TIPC configuration management code
+ *
+ * Copyright (c) 2002-2006, Ericsson AB
+ * Copyright (c) 2004-2007, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "port.h"
+#include "name_table.h"
+#include "config.h"
+
+static u32 config_port_ref;		/* port created by tipc_cfg_init() */
+
+static DEFINE_SPINLOCK(config_lock);	/* taken by tipc_cfg_do_cmd() */
+
+static const void *req_tlv_area;	/* request message TLV area */
+static int req_tlv_space;		/* request message TLV area size */
+static int rep_headroom;		/* reply message headroom to use */
+
+
+/*
+ * Allocate a reply buffer with room for payload_size bytes of TLVs,
+ * placed after rep_headroom bytes of reserved headroom.
+ * Returns NULL if the allocation fails.
+ */
+struct sk_buff *tipc_cfg_reply_alloc(int payload_size)
+{
+	struct sk_buff *reply;
+
+	reply = alloc_skb(rep_headroom + payload_size, GFP_ATOMIC);
+	if (!reply)
+		return NULL;
+
+	skb_reserve(reply, rep_headroom);
+	return reply;
+}
+
+/*
+ * Append one TLV of type tlv_type to the end of buf.  The payload is
+ * copied only when both tlv_data and tlv_data_size are non-zero.
+ * Returns 1 if the TLV was added, 0 if buf has too little tailroom.
+ */
+int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
+			void *tlv_data, int tlv_data_size)
+{
+	int needed = TLV_SPACE(tlv_data_size);
+	struct tlv_desc *desc;
+
+	if (needed > skb_tailroom(buf))
+		return 0;
+
+	/* skb_put() hands back the old tail, i.e. where the new TLV starts */
+	desc = (struct tlv_desc *)skb_put(buf, needed);
+	desc->tlv_type = htons(tlv_type);
+	desc->tlv_len = htons(TLV_LENGTH(tlv_data_size));
+	if (tlv_data && tlv_data_size)
+		memcpy(TLV_DATA(desc), tlv_data, tlv_data_size);
+	return 1;
+}
+
+/*
+ * Build a reply whose single TLV (of type tlv_type) carries value
+ * converted to network byte order.  Returns NULL if no reply buffer
+ * could be allocated.
+ */
+static struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
+{
+	__be32 value_net = htonl(value);
+	struct sk_buff *reply;
+
+	reply = tipc_cfg_reply_alloc(TLV_SPACE(sizeof(value)));
+	if (!reply)
+		return NULL;
+
+	tipc_cfg_append_tlv(reply, tlv_type, &value_net, sizeof(value_net));
+	return reply;
+}
+
+/* Shorthand for a reply carrying value in a TIPC_TLV_UNSIGNED TLV */
+static struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
+{
+	return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
+}
+
+/*
+ * Build a reply whose single TLV (of type tlv_type) holds string,
+ * including its terminating NUL.  Returns NULL if no reply buffer
+ * could be allocated.
+ */
+struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
+{
+	int tlv_bytes = strlen(string) + 1;
+	struct sk_buff *reply;
+
+	reply = tipc_cfg_reply_alloc(TLV_SPACE(tlv_bytes));
+	if (!reply)
+		return NULL;
+
+	tipc_cfg_append_tlv(reply, tlv_type, string, tlv_bytes);
+	return reply;
+}
+
+#define MAX_STATS_INFO 2000
+
+/**
+ * tipc_show_stats - build the reply for TIPC_CMD_SHOW_STATS
+ *
+ * The request must carry a TIPC_TLV_UNSIGNED argument whose value is 0.
+ *
+ * Returns a reply buffer holding either an error string or a
+ * TIPC_TLV_ULTRA_STRING with the statistics text, or NULL if no reply
+ * buffer could be allocated.
+ */
+
+static struct sk_buff *tipc_show_stats(void)
+{
+	struct sk_buff *buf;
+	struct tlv_desc *rep_tlv;
+	struct print_buf pb;
+	int str_len;
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	/* TLV payload is big-endian on the wire, as in the cfg_set_*() handlers */
+	value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (value != 0)
+		return tipc_cfg_reply_error_string("unsupported argument");
+
+	buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_STATS_INFO));
+	if (buf == NULL)
+		return NULL;
+
+	/* Print straight into the data area of the reply's (only) TLV */
+	rep_tlv = (struct tlv_desc *)buf->data;
+	tipc_printbuf_init(&pb, (char *)TLV_DATA(rep_tlv), MAX_STATS_INFO);
+
+	tipc_printf(&pb, "TIPC version " TIPC_MOD_VER "\n");
+
+	/* Use additional tipc_printf()'s to return more info ... */
+
+	/* Size the TLV to the text actually produced; data is already in place */
+	str_len = tipc_printbuf_validate(&pb);
+	skb_put(buf, TLV_SPACE(str_len));
+	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+
+	return buf;
+}
+
+/*
+ * TIPC_CMD_ENABLE_BEARER: enable the bearer described by the request's
+ * TIPC_TLV_BEARER_CONFIG argument.  Replies with an empty reply on
+ * success and an error string otherwise.
+ */
+static struct sk_buff *cfg_enable_bearer(void)
+{
+	struct tipc_bearer_config *cfg;
+	int res;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_CONFIG))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	cfg = (struct tipc_bearer_config *)TLV_DATA(req_tlv_area);
+	res = tipc_enable_bearer(cfg->name, ntohl(cfg->disc_domain),
+				 ntohl(cfg->priority));
+	if (!res)
+		return tipc_cfg_reply_none();
+
+	return tipc_cfg_reply_error_string("unable to enable bearer");
+}
+
+/*
+ * TIPC_CMD_DISABLE_BEARER: disable the bearer named by the request's
+ * TIPC_TLV_BEARER_NAME argument.  Replies with an empty reply on
+ * success and an error string otherwise.
+ */
+static struct sk_buff *cfg_disable_bearer(void)
+{
+	const char *bearer_name;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_BEARER_NAME))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	bearer_name = (char *)TLV_DATA(req_tlv_area);
+	if (tipc_disable_bearer(bearer_name) == 0)
+		return tipc_cfg_reply_none();
+
+	return tipc_cfg_reply_error_string("unable to disable bearer");
+}
+
+/**
+ * cfg_set_own_addr - handle TIPC_CMD_SET_NODE_ADDR
+ *
+ * Expects a TIPC_TLV_NET_ADDR request argument.  Setting the address the
+ * node already has is a no-op; otherwise the address must be a valid node
+ * address and TIPC must not yet be in TIPC_NET_MODE.
+ *
+ * Returns an empty reply on success or an error-string reply otherwise
+ * (NULL if the reply itself could not be allocated).
+ */
+
+static struct sk_buff *cfg_set_own_addr(void)
+{
+	u32 addr;
+	int res;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	addr = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (addr == tipc_own_addr)
+		return tipc_cfg_reply_none();
+	if (!tipc_addr_node_valid(addr))
+		return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+						   " (node address)");
+	if (tipc_mode == TIPC_NET_MODE)
+		return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+						   " (cannot change node address once assigned)");
+
+	/*
+	 * Must release all spinlocks before calling start_net() because
+	 * Linux version of TIPC calls eth_media_start() which calls
+	 * register_netdevice_notifier() which may block!
+	 *
+	 * Temporarily releasing the lock should be harmless for non-Linux TIPC,
+	 * but Linux version of eth_media_start() should really be reworked
+	 * so that it can be called with spinlocks held.
+	 *
+	 * NOTE(review): req_tlv_area/req_tlv_space/rep_headroom are written by
+	 * tipc_cfg_do_cmd() under config_lock, so they may presumably be
+	 * overwritten by another request while the lock is dropped -- confirm.
+	 */
+
+	spin_unlock_bh(&config_lock);
+	res = tipc_core_start_net(addr);
+	spin_lock_bh(&config_lock);
+
+	/* Do not claim success when the networking sub-systems failed to start */
+	if (res)
+		return tipc_cfg_reply_error_string("unable to set node address");
+	return tipc_cfg_reply_none();
+}
+
+/*
+ * TIPC_CMD_SET_REMOTE_MNG: store 1 in tipc_remote_management when the
+ * TIPC_TLV_UNSIGNED argument is non-zero, 0 otherwise.
+ */
+static struct sk_buff *cfg_set_remote_mng(void)
+{
+	__be32 *arg;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	arg = (__be32 *)TLV_DATA(req_tlv_area);
+	tipc_remote_management = ntohl(*arg) ? 1 : 0;
+	return tipc_cfg_reply_none();
+}
+
+/*
+ * TIPC_CMD_SET_MAX_PUBL: store the TIPC_TLV_UNSIGNED argument in
+ * tipc_max_publications, provided it lies in the range 1-65535.
+ */
+static struct sk_buff *cfg_set_max_publications(void)
+{
+	u32 limit;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	limit = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (delimit(limit, 1, 65535) == limit) {
+		/* clamping left the value untouched, so it is in range */
+		tipc_max_publications = limit;
+		return tipc_cfg_reply_none();
+	}
+
+	return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+					   " (max publications must be 1-65535)");
+}
+
+/**
+ * cfg_set_max_subscriptions - handle TIPC_CMD_SET_MAX_SUBSCR
+ *
+ * Expects a TIPC_TLV_UNSIGNED request argument in the range 1-65535 and
+ * stores it in tipc_max_subscriptions.
+ *
+ * Returns an empty reply on success or an error-string reply otherwise
+ * (NULL if the reply itself could not be allocated).
+ */
+
+static struct sk_buff *cfg_set_max_subscriptions(void)
+{
+	u32 value;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	value = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	/* delimit() clamps; a changed result means value was out of range */
+	if (value != delimit(value, 1, 65535))
+		return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+						   " (max subscriptions must be 1-65535)");
+	tipc_max_subscriptions = value;
+	return tipc_cfg_reply_none();
+}
+
+/*
+ * TIPC_CMD_SET_MAX_PORTS: a value equal to the current tipc_max_ports is
+ * accepted as a no-op; a different value must lie in 127-65535 and is
+ * only accepted while tipc_mode is TIPC_NOT_RUNNING.
+ */
+static struct sk_buff *cfg_set_max_ports(void)
+{
+	u32 new_max;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	new_max = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (new_max != tipc_max_ports) {
+		if (delimit(new_max, 127, 65535) != new_max)
+			return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+							   " (max ports must be 127-65535)");
+		if (tipc_mode != TIPC_NOT_RUNNING)
+			return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+				" (cannot change max ports while TIPC is active)");
+		tipc_max_ports = new_max;
+	}
+	return tipc_cfg_reply_none();
+}
+
+/*
+ * TIPC_CMD_SET_NETID: a value equal to the current tipc_net_id is
+ * accepted as a no-op; a different value must lie in 1-9999 and is
+ * rejected once tipc_mode is TIPC_NET_MODE.
+ */
+static struct sk_buff *cfg_set_netid(void)
+{
+	u32 new_id;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	new_id = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (new_id != tipc_net_id) {
+		if (delimit(new_id, 1, 9999) != new_id)
+			return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+							   " (network id must be 1-9999)");
+		if (tipc_mode == TIPC_NET_MODE)
+			return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+				" (cannot change network id once TIPC has joined a network)");
+		tipc_net_id = new_id;
+	}
+	return tipc_cfg_reply_none();
+}
+
+/**
+ * tipc_cfg_do_cmd - execute one configuration command and build its reply
+ * @orig_node: address of the node the command came from
+ * @cmd: command code (TIPC_CMD_xxx)
+ * @request_area: start of the request's TLV area
+ * @request_space: size of the request's TLV area
+ * @reply_headroom: headroom to reserve in front of the reply's TLVs
+ *
+ * Takes config_lock, parks the request/reply details in file-scope
+ * variables for the command handlers, checks that the originator is
+ * allowed to issue @cmd, and dispatches to the matching handler.
+ *
+ * Returns the reply buffer produced for the command.  This may be NULL
+ * (the reply helpers in this file return NULL when allocation fails).
+ */
+
+struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area,
+				int request_space, int reply_headroom)
+{
+	struct sk_buff *rep_tlv_buf;
+
+	spin_lock_bh(&config_lock);
+
+	/* Save request and reply details in a well-known location */
+
+	req_tlv_area = request_area;
+	req_tlv_space = request_space;
+	rep_headroom = reply_headroom;
+
+	/*
+	 * Check command authorization:
+	 *  - anything from our own node is allowed;
+	 *  - commands >= 0x8000 are refused from any other node;
+	 *  - other remote commands need tipc_remote_management to be set;
+	 *  - commands >= 0x4000 additionally need the originator to be the
+	 *    node that the TIPC_ZM_SRV name lookup yields.
+	 */
+
+	if (likely(orig_node == tipc_own_addr)) {
+		/* command is permitted */
+	} else if (cmd >= 0x8000) {
+		rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+							  " (cannot be done remotely)");
+		goto exit;
+	} else if (!tipc_remote_management) {
+		rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NO_REMOTE);
+		goto exit;
+	} else if (cmd >= 0x4000) {
+		u32 domain = 0;
+
+		if ((tipc_nametbl_translate(TIPC_ZM_SRV, 0, &domain) == 0) ||
+		    (domain != orig_node)) {
+			rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_ZONE_MSTR);
+			goto exit;
+		}
+	}
+
+	/* Call appropriate processing routine */
+
+	switch (cmd) {
+	case TIPC_CMD_NOOP:
+		rep_tlv_buf = tipc_cfg_reply_none();
+		break;
+	case TIPC_CMD_GET_NODES:
+		rep_tlv_buf = tipc_node_get_nodes(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_GET_LINKS:
+		rep_tlv_buf = tipc_node_get_links(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_SHOW_LINK_STATS:
+		rep_tlv_buf = tipc_link_cmd_show_stats(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_RESET_LINK_STATS:
+		rep_tlv_buf = tipc_link_cmd_reset_stats(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_SHOW_NAME_TABLE:
+		rep_tlv_buf = tipc_nametbl_get(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_GET_BEARER_NAMES:
+		rep_tlv_buf = tipc_bearer_get_names();
+		break;
+	case TIPC_CMD_GET_MEDIA_NAMES:
+		rep_tlv_buf = tipc_media_get_names();
+		break;
+	case TIPC_CMD_SHOW_PORTS:
+		rep_tlv_buf = tipc_port_get_ports();
+		break;
+	case TIPC_CMD_SET_LOG_SIZE:
+		rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space);
+		break;
+	case TIPC_CMD_DUMP_LOG:
+		rep_tlv_buf = tipc_log_dump();
+		break;
+	case TIPC_CMD_SHOW_STATS:
+		rep_tlv_buf = tipc_show_stats();
+		break;
+	case TIPC_CMD_SET_LINK_TOL:
+	case TIPC_CMD_SET_LINK_PRI:
+	case TIPC_CMD_SET_LINK_WINDOW:
+		/* one handler for all three; cmd tells it which setting to change */
+		rep_tlv_buf = tipc_link_cmd_config(req_tlv_area, req_tlv_space, cmd);
+		break;
+	case TIPC_CMD_ENABLE_BEARER:
+		rep_tlv_buf = cfg_enable_bearer();
+		break;
+	case TIPC_CMD_DISABLE_BEARER:
+		rep_tlv_buf = cfg_disable_bearer();
+		break;
+	case TIPC_CMD_SET_NODE_ADDR:
+		/* NB: this handler drops and re-takes config_lock internally */
+		rep_tlv_buf = cfg_set_own_addr();
+		break;
+	case TIPC_CMD_SET_REMOTE_MNG:
+		rep_tlv_buf = cfg_set_remote_mng();
+		break;
+	case TIPC_CMD_SET_MAX_PORTS:
+		rep_tlv_buf = cfg_set_max_ports();
+		break;
+	case TIPC_CMD_SET_MAX_PUBL:
+		rep_tlv_buf = cfg_set_max_publications();
+		break;
+	case TIPC_CMD_SET_MAX_SUBSCR:
+		rep_tlv_buf = cfg_set_max_subscriptions();
+		break;
+	case TIPC_CMD_SET_NETID:
+		rep_tlv_buf = cfg_set_netid();
+		break;
+	case TIPC_CMD_GET_REMOTE_MNG:
+		rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_remote_management);
+		break;
+	case TIPC_CMD_GET_MAX_PORTS:
+		rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_ports);
+		break;
+	case TIPC_CMD_GET_MAX_PUBL:
+		rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_publications);
+		break;
+	case TIPC_CMD_GET_MAX_SUBSCR:
+		rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_max_subscriptions);
+		break;
+	case TIPC_CMD_GET_NETID:
+		rep_tlv_buf = tipc_cfg_reply_unsigned(tipc_net_id);
+		break;
+	case TIPC_CMD_NOT_NET_ADMIN:
+		rep_tlv_buf =
+			tipc_cfg_reply_error_string(TIPC_CFG_NOT_NET_ADMIN);
+		break;
+	case TIPC_CMD_SET_MAX_ZONES:
+	case TIPC_CMD_GET_MAX_ZONES:
+	case TIPC_CMD_SET_MAX_SLAVES:
+	case TIPC_CMD_GET_MAX_SLAVES:
+	case TIPC_CMD_SET_MAX_CLUSTERS:
+	case TIPC_CMD_GET_MAX_CLUSTERS:
+	case TIPC_CMD_SET_MAX_NODES:
+	case TIPC_CMD_GET_MAX_NODES:
+		rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+							  " (obsolete command)");
+		break;
+	default:
+		rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+							  " (unknown command)");
+		break;
+	}
+
+	/* Return reply buffer */
+exit:
+	spin_unlock_bh(&config_lock);
+	return rep_tlv_buf;
+}
+
+/*
+ * Named-message callback of the configuration port: validate the request
+ * header, run the command and send the reply back to the originator.
+ * If no reply buffer could be produced, the request buffer itself is
+ * sent back instead.
+ */
+static void cfg_named_msg_event(void *userdata,
+				u32 port_ref,
+				struct sk_buff **buf,
+				const unchar *msg,
+				u32 size,
+				u32 importance,
+				struct tipc_portid const *orig,
+				struct tipc_name_seq const *dest)
+{
+	struct tipc_cfg_msg_hdr *request = (struct tipc_cfg_msg_hdr *)msg;
+	struct tipc_cfg_msg_hdr *reply_hdr;
+	struct sk_buff *reply;
+	int valid;
+
+	/*
+	 * Validate configuration message header (ignore invalid message);
+	 * header fields are only read once size is known to cover them.
+	 */
+	valid = (size >= sizeof(*request)) &&
+		(size == TCM_ALIGN(ntohl(request->tcm_len))) &&
+		(ntohs(request->tcm_flags) == TCM_F_REQUEST);
+	if (!valid) {
+		warn("Invalid configuration message discarded\n");
+		return;
+	}
+
+	/* Headroom leaves space for the reply header pushed on below */
+	reply = tipc_cfg_do_cmd(orig->node,
+				ntohs(request->tcm_type),
+				msg + sizeof(*request),
+				size - sizeof(*request),
+				BUF_HEADROOM + MAX_H_SIZE + sizeof(*reply_hdr));
+	if (!reply) {
+		/* can't generate a reply: take over and return the request */
+		reply = *buf;
+		*buf = NULL;
+	} else {
+		/* reply header = request header, new length, request flag cleared */
+		reply_hdr = (struct tipc_cfg_msg_hdr *)
+			skb_push(reply, sizeof(*reply_hdr));
+		memcpy(reply_hdr, request, sizeof(*reply_hdr));
+		reply_hdr->tcm_len = htonl(reply->len);
+		reply_hdr->tcm_flags &= htons(~TCM_F_REQUEST);
+	}
+
+	/* NEED TO ADD CODE TO HANDLE FAILED SEND (SUCH AS CONGESTION) */
+	tipc_send_buf2port(port_ref, orig, reply, reply->len);
+}
+
+/*
+ * Create the configuration service: a port whose named-message handler
+ * is cfg_named_msg_event(), published as TIPC_CFG_SRV for this node's
+ * own address.  Returns 0 on success or the failing call's result.
+ */
+int tipc_cfg_init(void)
+{
+	struct tipc_name_seq seq;
+	int res;
+
+	res = tipc_createport(NULL, TIPC_CRITICAL_IMPORTANCE,
+			      NULL, NULL, NULL,
+			      NULL, cfg_named_msg_event, NULL,
+			      NULL, &config_port_ref);
+	if (!res) {
+		seq.type = TIPC_CFG_SRV;
+		seq.lower = seq.upper = tipc_own_addr;
+		res = tipc_nametbl_publish_rsv(config_port_ref,
+					       TIPC_ZONE_SCOPE, &seq);
+	}
+
+	if (res)
+		err("Unable to create configuration service\n");
+	return res;
+}
+
+/* Delete the configuration port, if one exists, and forget its reference */
+void tipc_cfg_stop(void)
+{
+	if (!config_port_ref)
+		return;
+
+	tipc_deleteport(config_port_ref);
+	config_port_ref = 0;
+}
diff --git a/net/tipc/config.h b/net/tipc/config.h
new file mode 100644
index 00000000..443159a1
--- /dev/null
+++ b/net/tipc/config.h
@@ -0,0 +1,72 @@
+/*
+ * net/tipc/config.h: Include file for TIPC configuration service code
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_CONFIG_H
+#define _TIPC_CONFIG_H
+
+/* ---------------------------------------------------------------------- */
+
+#include "link.h"
+
+struct sk_buff *tipc_cfg_reply_alloc(int payload_size);
+int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
+			void *tlv_data, int tlv_data_size);
+struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string);
+
+/* Reply buffer with zero bytes of TLV payload */
+static inline struct sk_buff *tipc_cfg_reply_none(void)
+{
+	return tipc_cfg_reply_alloc(0);
+}
+
+/* Reply buffer carrying string in a TIPC_TLV_ERROR_STRING TLV */
+static inline struct sk_buff *tipc_cfg_reply_error_string(char *string)
+{
+	return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string);
+}
+
+/* Reply buffer carrying string in a TIPC_TLV_ULTRA_STRING TLV */
+static inline struct sk_buff *tipc_cfg_reply_ultra_string(char *string)
+{
+	return tipc_cfg_reply_string_type(TIPC_TLV_ULTRA_STRING, string);
+}
+
+struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd,
+				const void *req_tlv_area, int req_tlv_space,
+				int headroom);
+
+void tipc_cfg_link_event(u32 addr, char *name, int up);
+int  tipc_cfg_init(void);
+void tipc_cfg_stop(void);
+
+#endif
diff --git a/net/tipc/core.c b/net/tipc/core.c
new file mode 100644
index 00000000..943b6af8
--- /dev/null
+++ b/net/tipc/core.c
@@ -0,0 +1,211 @@
+/*
+ * net/tipc/core.c: TIPC module code
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "ref.h"
+#include "name_table.h"
+#include "subscr.h"
+#include "config.h"
+
+
+/* Built-in fallbacks, used only when the build does not define these */
+#ifndef CONFIG_TIPC_PORTS
+#define CONFIG_TIPC_PORTS 8191	/* initial tipc_max_ports, see tipc_init() */
+#endif
+
+#ifndef CONFIG_TIPC_LOG
+#define CONFIG_TIPC_LOG 0	/* passed to tipc_log_resize() by tipc_init() */
+#endif
+
+/* global variables used by multiple sub-systems within TIPC */
+
+int tipc_mode = TIPC_NOT_RUNNING;
+int tipc_random;
+
+const char tipc_alphabet[] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_.";
+
+/* configurable TIPC parameters */
+
+u32 tipc_own_addr;
+int tipc_max_ports;
+int tipc_max_subscriptions;
+int tipc_max_publications;
+int tipc_net_id;
+int tipc_remote_management;
+
+
+/**
+ * tipc_buf_acquire - creates a TIPC message buffer
+ * @size: message size (including TIPC header)
+ *
+ * Returns a new buffer with data pointers set to the specified size.
+ *
+ * NOTE: Headroom is reserved to allow prepending of a data link header.
+ *       There may also be unrequested tailroom present at the buffer's end.
+ */
+
+struct sk_buff *tipc_buf_acquire(u32 size)
+{
+	/* headroom + message, rounded up to a multiple of 4 bytes */
+	unsigned int alloc_len = (BUF_HEADROOM + size + 3) & ~3u;
+	struct sk_buff *buf;
+
+	buf = alloc_skb_fclone(alloc_len, GFP_ATOMIC);
+	if (!buf)
+		return NULL;
+
+	skb_reserve(buf, BUF_HEADROOM);
+	skb_put(buf, size);
+	buf->next = NULL;
+	return buf;
+}
+
+/**
+ * tipc_core_stop_net - shut down TIPC networking sub-systems
+ */
+
+static void tipc_core_stop_net(void)
+{
+	/* reverse of tipc_core_start_net(): media first, then the network */
+	tipc_eth_media_stop();
+	tipc_net_stop();
+}
+
+/**
+ * tipc_core_start_net - start TIPC networking sub-systems
+ */
+
+int tipc_core_start_net(unsigned long addr)
+{
+	int res = tipc_net_start(addr);
+
+	/* the Ethernet media is only started once the network itself is up */
+	if (res == 0)
+		res = tipc_eth_media_start();
+
+	/* either step failing shuts the networking sub-systems down again */
+	if (res != 0)
+		tipc_core_stop_net();
+
+	return res;
+}
+
+/**
+ * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode
+ */
+
+static void tipc_core_stop(void)
+{
+	/* only meaningful from single node mode; otherwise nothing to do */
+	if (tipc_mode != TIPC_NODE_MODE)
+		return;
+
+	/* mark TIPC as stopped before tearing the sub-systems down */
+	tipc_mode = TIPC_NOT_RUNNING;
+
+	tipc_netlink_stop();
+	tipc_handler_stop();
+	tipc_cfg_stop();
+	tipc_subscr_stop();
+	tipc_nametbl_stop();
+	tipc_ref_table_stop();
+	tipc_socket_stop();
+	tipc_log_resize(0);
+}
+
+/**
+ * tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode
+ */
+
+static int tipc_core_start(void)
+{
+	int res;
+
+	if (tipc_mode != TIPC_NOT_RUNNING)
+		return -ENOPROTOOPT;
+
+	get_random_bytes(&tipc_random, sizeof(tipc_random));
+	tipc_mode = TIPC_NODE_MODE;
+
+	/* bring the sub-systems up in order; the first failure aborts */
+	res = tipc_handler_start();
+	if (res)
+		goto fail;
+	res = tipc_ref_table_init(tipc_max_ports, tipc_random);
+	if (res)
+		goto fail;
+	res = tipc_nametbl_init();
+	if (res)
+		goto fail;
+	res = tipc_k_signal((Handler)tipc_subscr_start, 0);
+	if (res)
+		goto fail;
+	res = tipc_k_signal((Handler)tipc_cfg_init, 0);
+	if (res)
+		goto fail;
+	res = tipc_netlink_start();
+	if (res)
+		goto fail;
+	res = tipc_socket_init();
+	if (res)
+		goto fail;
+
+	return res;
+
+fail:
+	/* undo whatever was started */
+	tipc_core_stop();
+	return res;
+}
+
+
+/*
+ * Module entry point: size the log buffer, load the default values of
+ * the configurable parameters and start TIPC in single node mode.
+ */
+static int __init tipc_init(void)
+{
+	int res;
+
+	if (tipc_log_resize(CONFIG_TIPC_LOG))
+		warn("Unable to create log buffer\n");
+
+	info("Activated (version " TIPC_MOD_VER ")\n");
+
+	/* defaults for the configurable parameters */
+	tipc_own_addr = 0;
+	tipc_net_id = 4711;
+	tipc_remote_management = 1;
+	tipc_max_ports = CONFIG_TIPC_PORTS;
+	tipc_max_publications = 10000;
+	tipc_max_subscriptions = 2000;
+
+	res = tipc_core_start();
+	if (!res)
+		info("Started in single node mode\n");
+	else
+		err("Unable to start in single node mode\n");
+
+	return res;
+}
+
+/* Module exit point: stop networking first, then the core sub-systems */
+static void __exit tipc_exit(void)
+{
+	tipc_core_stop_net();
+	tipc_core_stop();	/* returns early unless in TIPC_NODE_MODE */
+	info("Deactivated\n");
+}
+
+module_init(tipc_init);
+module_exit(tipc_exit);
+
+MODULE_DESCRIPTION("TIPC: Transparent Inter Process Communication");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(TIPC_MOD_VER);
diff --git a/net/tipc/core.h b/net/tipc/core.h
new file mode 100644
index 00000000..436dda11
--- /dev/null
+++ b/net/tipc/core.h
@@ -0,0 +1,312 @@
+/*
+ * net/tipc/core.h: Include file for TIPC global declarations
+ *
+ * Copyright (c) 2005-2006, Ericsson AB
+ * Copyright (c) 2005-2007, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_CORE_H
+#define _TIPC_CORE_H
+
+#include <linux/tipc.h>
+#include <linux/tipc_config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <asm/uaccess.h>
+#include <linux/interrupt.h>
+#include <asm/atomic.h>
+#include <asm/hardirq.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+
+#define TIPC_MOD_VER "2.0.0"
+
+struct tipc_msg;	/* msg.h */
+struct print_buf;	/* log.h */
+
+/*
+ * TIPC sanity test macros
+ */
+
+#define assert(i)  BUG_ON(!(i))
+
+/*
+ * TIPC system monitoring code
+ */
+
+/*
+ * TIPC's print buffer subsystem supports the following print buffers:
+ *
+ * TIPC_NULL : null buffer (i.e. print nowhere)
+ * TIPC_CONS : system console
+ * TIPC_LOG  : TIPC log buffer
+ * &buf	     : user-defined buffer (struct print_buf *)
+ *
+ * Note: TIPC_LOG is configured to echo its output to the system console;
+ *       user-defined buffers can be configured to do the same thing.
+ */
+extern struct print_buf *const TIPC_NULL;
+extern struct print_buf *const TIPC_CONS;
+extern struct print_buf *const TIPC_LOG;
+
+void tipc_printf(struct print_buf *, const char *fmt, ...);
+
+/*
+ * TIPC_OUTPUT is the destination print buffer for system messages.
+ */
+
+#ifndef TIPC_OUTPUT
+#define TIPC_OUTPUT TIPC_LOG
+#endif
+
+#define err(fmt, arg...)  tipc_printf(TIPC_OUTPUT, \
+				      KERN_ERR "TIPC: " fmt, ## arg)
+#define warn(fmt, arg...) tipc_printf(TIPC_OUTPUT, \
+				      KERN_WARNING "TIPC: " fmt, ## arg)
+#define info(fmt, arg...) tipc_printf(TIPC_OUTPUT, \
+				      KERN_NOTICE "TIPC: " fmt, ## arg)
+
+#ifdef CONFIG_TIPC_DEBUG
+
+/*
+ * DBG_OUTPUT is the destination print buffer for debug messages.
+ */
+
+#ifndef DBG_OUTPUT
+#define DBG_OUTPUT TIPC_LOG
+#endif
+
+/*
+ * The debug macros expand to a single call with no trailing ';' so that
+ * "if (x) dbg(...); else ..." parses the same way as it does with the
+ * do {} while (0) stubs used when CONFIG_TIPC_DEBUG is not set.
+ */
+
+#define dbg(fmt, arg...)  tipc_printf(DBG_OUTPUT, KERN_DEBUG fmt, ## arg)
+
+#define msg_dbg(msg, txt) tipc_msg_dbg(DBG_OUTPUT, msg, txt)
+
+void tipc_msg_dbg(struct print_buf *, struct tipc_msg *, const char *);
+
+#else
+
+/* Debugging disabled: the debug macros compile to nothing */
+
+#define dbg(fmt, arg...)	do {} while (0)
+#define msg_dbg(msg, txt)	do {} while (0)
+
+#define tipc_msg_dbg(buf, msg, txt) do {} while (0)
+
+#endif
+
+
+/*
+ * TIPC-specific error codes
+ */
+
+#define ELINKCONG EAGAIN	/* link congestion <=> resource unavailable */
+
+/*
+ * TIPC operating mode routines
+ */
+#define TIPC_NOT_RUNNING  0
+#define TIPC_NODE_MODE    1
+#define TIPC_NET_MODE     2
+
+/*
+ * Global configuration variables
+ */
+
+extern u32 tipc_own_addr;
+extern int tipc_max_ports;
+extern int tipc_max_subscriptions;
+extern int tipc_max_publications;
+extern int tipc_net_id;
+extern int tipc_remote_management;
+
+/*
+ * Other global variables
+ */
+
+extern int tipc_mode;
+extern int tipc_random;
+extern const char tipc_alphabet[];
+
+
+/*
+ * Routines available to privileged subsystems
+ */
+
+extern int tipc_core_start_net(unsigned long);
+extern int  tipc_handler_start(void);
+extern void tipc_handler_stop(void);
+extern int  tipc_netlink_start(void);
+extern void tipc_netlink_stop(void);
+extern int  tipc_socket_init(void);
+extern void tipc_socket_stop(void);
+
+/* Clamp val into [min, max]; the upper bound is tested first */
+static inline int delimit(int val, int min, int max)
+{
+	return (val > max) ? max : ((val < min) ? min : val);
+}
+
+
+/*
+ * TIPC timer and signal code
+ */
+
+typedef void (*Handler) (unsigned long);
+
+u32 tipc_k_signal(Handler routine, unsigned long argument);
+
+/**
+ * k_init_timer - initialize a timer
+ * @timer: pointer to timer structure
+ * @routine: pointer to routine to invoke when timer expires
+ * @argument: value to pass to routine when timer expires
+ *
+ * Timer must be initialized before use (and terminated when no longer needed).
+ */
+
+static inline void k_init_timer(struct timer_list *timer, Handler routine,
+				unsigned long argument)
+{
+	/* thin wrapper: forwards all three arguments to setup_timer() */
+	setup_timer(timer, routine, argument);
+}
+
+/**
+ * k_start_timer - start a timer
+ * @timer: pointer to timer structure
+ * @msec: time to delay (in ms)
+ *
+ * Schedules a previously initialized timer for later execution.
+ * If timer is already running, the new timeout overrides the previous request.
+ *
+ * To ensure the timer doesn't expire before the specified delay elapses,
+ * the amount of delay is rounded up when converting to jiffies,
+ * then an additional jiffy is added to account for the fact that
+ * the starting time may be in the middle of the current jiffy.
+ */
+
+static inline void k_start_timer(struct timer_list *timer, unsigned long msec)
+{
+	/* converted delay plus the one extra jiffy described above */
+	unsigned long delay = msecs_to_jiffies(msec) + 1;
+
+	mod_timer(timer, jiffies + delay);
+}
+
+/**
+ * k_cancel_timer - cancel a timer
+ * @timer: pointer to timer structure
+ *
+ * Cancels a previously initialized timer.
+ * Can be called safely even if the timer is already inactive.
+ *
+ * WARNING: Must not be called when holding locks required by the timer's
+ *          timeout routine, otherwise deadlock can occur on SMP systems!
+ */
+
+static inline void k_cancel_timer(struct timer_list *timer)
+{
+	/* synchronous cancel -- hence the locking WARNING above */
+	del_timer_sync(timer);
+}
+
+/**
+ * k_term_timer - terminate a timer
+ * @timer: pointer to timer structure
+ *
+ * Prevents further use of a previously initialized timer.
+ *
+ * WARNING: Caller must ensure timer isn't currently running.
+ *
+ * (Do not "enhance" this routine to automatically cancel an active timer,
+ * otherwise deadlock can arise when a timeout routine calls k_term_timer.)
+ */
+
+static inline void k_term_timer(struct timer_list *timer)
+{
+	/* intentionally empty: nothing is done to the timer here */
+}
+
+
+/*
+ * TIPC message buffer code
+ *
+ * TIPC message buffer headroom reserves space for the worst-case
+ * link-level device header (in case the message is sent off-node).
+ *
+ * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields
+ *       are word aligned for quicker access
+ */
+
+#define BUF_HEADROOM LL_MAX_HEADER
+
+struct tipc_skb_cb {
+	void *handle;
+};
+
+#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
+
+
+/* View the start of the buffer's data area as a struct tipc_msg */
+static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
+{
+	return (struct tipc_msg *)skb->data;
+}
+
+extern struct sk_buff *tipc_buf_acquire(u32 size);
+
+/**
+ * buf_discard - frees a TIPC message buffer
+ * @skb: message buffer
+ *
+ * Frees a message buffer.  If passed NULL, just returns.
+ */
+
+static inline void buf_discard(struct sk_buff *skb)
+{
+	/* thin wrapper around kfree_skb() */
+	kfree_skb(skb);
+}
+
+/**
+ * buf_linearize - convert a TIPC message buffer into a single contiguous piece
+ * @skb: message buffer
+ *
+ * Returns 0 on success.
+ */
+
+static inline int buf_linearize(struct sk_buff *skb)
+{
+	/* forwards skb_linearize()'s result unchanged */
+	return skb_linearize(skb);
+}
+
+#endif
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
new file mode 100644
index 00000000..09879331
--- /dev/null
+++ b/net/tipc/discover.c
@@ -0,0 +1,358 @@
+/*
+ * net/tipc/discover.c
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "discover.h"
+
+#define TIPC_LINK_REQ_INIT	125	/* min delay (ms) during bearer start up */
+#define TIPC_LINK_REQ_FAST	1000	/* max delay (ms) if bearer has no links */
+#define TIPC_LINK_REQ_SLOW	60000	/* max delay (ms) if bearer has links */
+#define TIPC_LINK_REQ_INACTIVE	0xffffffff /* timer_intv value: no timer in use */
+
+
+/**
+ * struct link_req - information about an ongoing link setup request
+ * @bearer: bearer issuing requests
+ * @dest: destination address for request messages
+ * @domain: network domain to which links can be established
+ * @num_nodes: number of nodes currently discovered (i.e. with an active link)
+ * @buf: request message to be (repeatedly) sent
+ * @timer: timer governing period between requests
+ * @timer_intv: interval between requests (ms), or TIPC_LINK_REQ_INACTIVE
+ */
+struct link_req {
+	struct tipc_bearer *bearer;
+	struct tipc_media_addr dest;
+	u32 domain;
+	int num_nodes;
+	struct sk_buff *buf;
+	struct timer_list timer;
+	unsigned int timer_intv;
+};
+
+/**
+ * tipc_disc_init_msg - allocate and fill in a link setup message
+ * @type: message type (request or response)
+ * @dest_domain: network domain of node(s) which should respond to message
+ * @b_ptr: ptr to bearer issuing message
+ *
+ * Returns the new buffer, or NULL if tipc_buf_acquire() provided none.
+ */
+static struct sk_buff *tipc_disc_init_msg(u32 type, u32 dest_domain,
+					  struct tipc_bearer *b_ptr)
+{
+	struct sk_buff *skb = tipc_buf_acquire(INT_H_SIZE);
+	struct tipc_msg *hdr;
+
+	if (!skb)
+		return NULL;
+
+	hdr = buf_msg(skb);
+	tipc_msg_init(hdr, LINK_CONFIG, type, INT_H_SIZE, dest_domain);
+	msg_set_non_seq(hdr, 1);
+	msg_set_dest_domain(hdr, dest_domain);
+	msg_set_bc_netid(hdr, tipc_net_id);
+	msg_set_media_addr(hdr, &b_ptr->addr);
+	return skb;
+}
+
+/**
+ * disc_dupl_alert - warn that a node address is being used by another node
+ * @b_ptr: pointer to bearer detecting duplication
+ * @node_addr: duplicated node address
+ * @media_addr: media address advertised by duplicated node
+ */
+
+static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
+			    struct tipc_media_addr *media_addr)
+{
+	struct print_buf media_pb;
+	char media_text[64];
+	char node_text[16];
+
+	tipc_printbuf_init(&media_pb, media_text, sizeof(media_text));
+	tipc_media_addr_printf(&media_pb, media_addr);
+	tipc_printbuf_validate(&media_pb);
+	tipc_addr_string_fill(node_text, node_addr);
+	warn("Duplicate %s using %s seen on <%s>\n",
+	     node_text, media_text, b_ptr->name);
+}
+
+/**
+ * tipc_disc_recv_msg - handle incoming link setup message (request or response)
+ * @buf: buffer containing message (always released by this routine)
+ * @b_ptr: bearer that message arrived on
+ */
+
+void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr)
+{
+	struct tipc_node *n_ptr;
+	struct link *link;
+	struct tipc_media_addr media_addr, *addr;
+	struct sk_buff *rbuf;
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 dest = msg_dest_domain(msg);
+	u32 orig = msg_prevnode(msg);
+	u32 net_id = msg_bc_netid(msg);
+	u32 type = msg_type(msg);
+	int link_fully_up;
+
+	msg_get_media_addr(msg, &media_addr);
+	buf_discard(buf);	/* all needed header fields were copied above */
+
+	/* Validate discovery message from requesting node */
+	if (net_id != tipc_net_id)
+		return;
+	if (!tipc_addr_domain_valid(dest))
+		return;
+	if (!tipc_addr_node_valid(orig))
+		return;
+	if (orig == tipc_own_addr) {
+		if (memcmp(&media_addr, &b_ptr->addr, sizeof(media_addr)))
+			disc_dupl_alert(b_ptr, tipc_own_addr, &media_addr);
+		return;
+	}
+	if (!tipc_in_scope(dest, tipc_own_addr))
+		return;
+	if (!tipc_in_scope(b_ptr->link_req->domain, orig))
+		return;
+
+	/* Locate structure corresponding to requesting node */
+	n_ptr = tipc_node_find(orig);
+	if (!n_ptr) {
+		n_ptr = tipc_node_create(orig);
+		if (!n_ptr)
+			return;
+	}
+	tipc_node_lock(n_ptr);
+
+	/* Don't talk to neighbor during cleanup after last session */
+	if (n_ptr->cleanup_required) {
+		tipc_node_unlock(n_ptr);
+		return;
+	}
+
+	link = n_ptr->links[b_ptr->identity];
+
+	/* Create a link endpoint for this bearer, if necessary */
+	if (!link) {
+		link = tipc_link_create(n_ptr, b_ptr, &media_addr);
+		if (!link) {
+			tipc_node_unlock(n_ptr);
+			return;
+		}
+	}
+
+	/*
+	 * Ensure requesting node's media address is correct
+	 *
+	 * If media address doesn't match and the link is up or has not been
+	 * started, reject the request (must be from a duplicate node).
+	 *
+	 * If media address doesn't match and the link is started but not up,
+	 * accept the new media address and reset the link to ensure it starts
+	 * up cleanly.
+	 */
+	addr = &link->media_addr;
+	if (memcmp(addr, &media_addr, sizeof(*addr))) {
+		if (tipc_link_is_up(link) || (!link->started)) {
+			disc_dupl_alert(b_ptr, orig, &media_addr);
+			tipc_node_unlock(n_ptr);
+			return;
+		}
+		warn("Resetting link <%s>, peer interface address changed\n",
+		     link->name);
+		memcpy(addr, &media_addr, sizeof(*addr));
+		tipc_link_reset(link);
+	}
+
+	/* Answer requests only while link isn't fully up and bearer isn't blocked */
+	link_fully_up = link_working_working(link);
+
+	if ((type == DSC_REQ_MSG) && !link_fully_up && !b_ptr->blocked) {
+		rbuf = tipc_disc_init_msg(DSC_RESP_MSG, orig, b_ptr);
+		if (rbuf) {
+			b_ptr->media->send_msg(rbuf, b_ptr, &media_addr);
+			buf_discard(rbuf);
+		}
+	}
+
+	tipc_node_unlock(n_ptr);
+}
+
+/**
+ * disc_update - update frequency of periodic link setup requests
+ * @req: ptr to link request structure
+ *
+ * Restarts discovery at the initial rate when the request has no nodes and
+ * its timer is either inactive or running slower than the fast rate.
+ */
+
+static void disc_update(struct link_req *req)
+{
+	if (req->num_nodes)
+		return;
+	if ((req->timer_intv != TIPC_LINK_REQ_INACTIVE) &&
+	    (req->timer_intv <= TIPC_LINK_REQ_FAST))
+		return;
+	req->timer_intv = TIPC_LINK_REQ_INIT;
+	k_start_timer(&req->timer, req->timer_intv);
+}
+
+/**
+ * tipc_disc_add_dest - increment count of discovered nodes
+ * @req: ptr to link request structure
+ */
+
+void tipc_disc_add_dest(struct link_req *req)
+{
+	req->num_nodes++;
+}
+
+/**
+ * tipc_disc_remove_dest - decrement count of discovered nodes
+ * @req: ptr to link request structure
+ */
+
+void tipc_disc_remove_dest(struct link_req *req)
+{
+	req->num_nodes--;
+	disc_update(req);	/* may restart discovery if no nodes remain */
+}
+
+/**
+ * disc_send_msg - send link setup request message, unless bearer is blocked
+ * @req: ptr to link request structure
+ */
+
+static void disc_send_msg(struct link_req *req)
+{
+	if (!req->bearer->blocked)
+		tipc_bearer_send(req->bearer, req->buf, &req->dest);
+}
+
+/**
+ * disc_timeout - send a periodic link setup request
+ * @req: ptr to link request structure
+ *
+ * Called whenever a link setup request timer associated with a bearer expires.
+ *
+ * The bearer lock is held while the request state is examined and updated;
+ * the timer is restarted unless discovery is being made inactive.
+ */
+
+static void disc_timeout(struct link_req *req)
+{
+	struct tipc_bearer *bearer = req->bearer;
+	int ceiling;
+
+	spin_lock_bh(&bearer->lock);
+
+	if (tipc_node(req->domain) && req->num_nodes) {
+		/* Stop searching if only desired node has been found */
+		req->timer_intv = TIPC_LINK_REQ_INACTIVE;
+	} else {
+		/*
+		 * Send discovery message, then update discovery timer
+		 *
+		 * Keep doubling time between requests until limit is reached;
+		 * hold at fast polling rate if don't have any associated
+		 * nodes, otherwise hold at slow polling rate
+		 */
+		disc_send_msg(req);
+
+		/* Double the interval, capped at the applicable polling rate */
+		req->timer_intv *= 2;
+		ceiling = req->num_nodes ? TIPC_LINK_REQ_SLOW
+					 : TIPC_LINK_REQ_FAST;
+		if (req->timer_intv > ceiling)
+			req->timer_intv = ceiling;
+
+		k_start_timer(&req->timer, req->timer_intv);
+	}
+
+	spin_unlock_bh(&bearer->lock);
+}
+
+/**
+ * tipc_disc_create - create object to send periodic link setup requests
+ * @b_ptr: ptr to bearer issuing requests
+ * @dest: destination address for request messages
+ * @dest_domain: network domain to which links can be established
+ *
+ * Returns 0 if successful, otherwise -errno (-ENOMEM or -ENOMSG).
+ */
+
+int tipc_disc_create(struct tipc_bearer *b_ptr,
+		     struct tipc_media_addr *dest, u32 dest_domain)
+{
+	struct link_req *lreq = kmalloc(sizeof(*lreq), GFP_ATOMIC);
+
+	if (!lreq)
+		return -ENOMEM;
+
+	lreq->buf = tipc_disc_init_msg(DSC_REQ_MSG, dest_domain, b_ptr);
+	if (!lreq->buf) {
+		kfree(lreq);
+		return -ENOMSG;
+	}
+
+	/* Fill in the request state, then arm the timer and send a request */
+	lreq->bearer = b_ptr;
+	lreq->domain = dest_domain;
+	lreq->num_nodes = 0;
+	lreq->timer_intv = TIPC_LINK_REQ_INIT;
+	memcpy(&lreq->dest, dest, sizeof(*dest));
+	k_init_timer(&lreq->timer, (Handler)disc_timeout, (unsigned long)lreq);
+	k_start_timer(&lreq->timer, lreq->timer_intv);
+	b_ptr->link_req = lreq;
+	disc_send_msg(lreq);
+	return 0;
+}
+
+/**
+ * tipc_disc_delete - destroy object sending periodic link setup requests
+ * @req: ptr to link request structure (freed here; invalid afterwards)
+ */
+
+void tipc_disc_delete(struct link_req *req)
+{
+	k_cancel_timer(&req->timer);
+	k_term_timer(&req->timer);
+	buf_discard(req->buf);
+	kfree(req);
+}
+
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
new file mode 100644
index 00000000..a3af595b
--- /dev/null
+++ b/net/tipc/discover.h
@@ -0,0 +1,49 @@
+/*
+ * net/tipc/discover.h
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_DISCOVER_H
+#define _TIPC_DISCOVER_H
+
+struct link_req;
+
+int tipc_disc_create(struct tipc_bearer *b_ptr, struct tipc_media_addr *dest,
+		     u32 dest_domain);
+void tipc_disc_delete(struct link_req *req);
+void tipc_disc_add_dest(struct link_req *req);
+void tipc_disc_remove_dest(struct link_req *req);
+void tipc_disc_recv_msg(struct sk_buff *buf, struct tipc_bearer *b_ptr);
+
+#endif
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
new file mode 100644
index 00000000..b69092eb
--- /dev/null
+++ b/net/tipc/eth_media.c
@@ -0,0 +1,323 @@
+/*
+ * net/tipc/eth_media.c: Ethernet bearer support for TIPC
+ *
+ * Copyright (c) 2001-2007, Ericsson AB
+ * Copyright (c) 2005-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "bearer.h"
+
+#define MAX_ETH_BEARERS		2 /* number of slots in eth_bearers[] */
+#define ETH_LINK_PRIORITY	TIPC_DEF_LINK_PRI /* given to tipc_register_media() */
+#define ETH_LINK_TOLERANCE	TIPC_DEF_LINK_TOL /* given to tipc_register_media() */
+#define ETH_LINK_WINDOW		TIPC_DEF_LINK_WIN /* given to tipc_register_media() */
+
+/**
+ * struct eth_bearer - Ethernet bearer data structure
+ * @bearer: ptr to associated "generic" bearer structure (NULL while disabled)
+ * @dev: ptr to associated Ethernet network device (NULL if slot is unused)
+ * @tipc_packet_type: used in binding TIPC to Ethernet driver
+ */
+
+struct eth_bearer {
+	struct tipc_bearer *bearer;
+	struct net_device *dev;
+	struct packet_type tipc_packet_type;
+};
+
+static struct eth_bearer eth_bearers[MAX_ETH_BEARERS];
+static int eth_started;	/* set once tipc_eth_media_start() succeeds */
+static struct notifier_block notifier;	/* delivers events to recv_notification() */
+
+/**
+ * send_msg - transmit a clone of a TIPC message over an Ethernet interface
+ */
+
+static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
+		    struct tipc_media_addr *dest)
+{
+	struct net_device *ndev;
+	struct sk_buff *skb;
+	int shortfall;
+
+	skb = skb_clone(buf, GFP_ATOMIC);
+	if (!skb)
+		return 0;
+
+	ndev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
+	shortfall = ndev->hard_header_len - skb_headroom(buf);
+	/* Grow the clone's headroom when the link-level header would not fit */
+	if (shortfall > 0 &&
+	    pskb_expand_head(skb, SKB_DATA_ALIGN(shortfall), 0, GFP_ATOMIC)) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	skb_reset_network_header(skb);
+	skb->dev = ndev;
+	dev_hard_header(skb, ndev, ETH_P_TIPC, &dest->dev_addr.eth_addr,
+			ndev->dev_addr, skb->len);
+	dev_queue_xmit(skb);
+	return 0;
+}
+
+/**
+ * recv_msg - handle incoming TIPC message from an Ethernet interface
+ *
+ * Passes on packets addressed to this node or broadcast (pkt_type up to
+ * PACKET_BROADCAST) while a bearer is attached; anything else, including
+ * traffic from outside init_net, is freed.  Always returns 0.
+ */
+
+static int recv_msg(struct sk_buff *buf, struct net_device *dev,
+		    struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+	if (unlikely(!eb_ptr->bearer))
+		goto drop;
+	if (unlikely(buf->pkt_type > PACKET_BROADCAST))
+		goto drop;
+
+	buf->next = NULL;
+	tipc_recv_msg(buf, eb_ptr->bearer);
+	return 0;
+
+drop:
+	kfree_skb(buf);
+	return 0;
+}
+
+/**
+ * enable_bearer - attach TIPC bearer to the Ethernet device named in its name
+ */
+
+static int enable_bearer(struct tipc_bearer *tb_ptr)
+{
+	struct net_device *dev = NULL;
+	struct net_device *pdev = NULL;
+	struct eth_bearer *eb_ptr = &eth_bearers[0];
+	struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
+	char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1;
+	int pending_dev = 0;
+
+	/* Find unused Ethernet bearer structure; if every slot has a device, */
+	/* fail with -EAGAIN when some slot has no bearer attached, else -EDQUOT */
+	while (eb_ptr->dev) {
+		if (!eb_ptr->bearer)
+			pending_dev++;
+		if (++eb_ptr == stop)
+			return pending_dev ? -EAGAIN : -EDQUOT;
+	}
+
+	/* Find device with specified name (the text after ':' in the bearer */
+	/* name), looking in init_net only; -ENODEV if there is none */
+	for_each_netdev(&init_net, pdev) {
+		if (!strncmp(pdev->name, driver_name, IFNAMSIZ)) {
+			dev = pdev;
+			break;
+		}
+	}
+	if (!dev)
+		return -ENODEV;
+
+	/* Find Ethernet bearer for device (or create one) */
+	/* NOTE(review): eb_ptr->dev is NULL here, so this scan ends at once */
+	while ((eb_ptr != stop) && eb_ptr->dev && (eb_ptr->dev != dev))
+		eb_ptr++;
+	if (eb_ptr == stop)
+		return -EDQUOT;
+	if (!eb_ptr->dev) {
+		eb_ptr->dev = dev;
+		eb_ptr->tipc_packet_type.type = htons(ETH_P_TIPC);
+		eb_ptr->tipc_packet_type.dev = dev;
+		eb_ptr->tipc_packet_type.func = recv_msg;
+		eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr;
+		INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list));
+		dev_hold(dev);
+		dev_add_pack(&eb_ptr->tipc_packet_type);
+	}
+
+	/* Associate TIPC bearer with Ethernet bearer */
+	/* and copy the device's MTU and hardware address into the bearer */
+	eb_ptr->bearer = tb_ptr;
+	tb_ptr->usr_handle = (void *)eb_ptr;
+	tb_ptr->mtu = dev->mtu;
+	tb_ptr->blocked = 0;
+	tb_ptr->addr.type = htonl(TIPC_MEDIA_TYPE_ETH);
+	memcpy(&tb_ptr->addr.dev_addr, dev->dev_addr, ETH_ALEN);
+	return 0;
+}
+
+/**
+ * disable_bearer - detach TIPC bearer from an Ethernet interface
+ *
+ * We really should do dev_remove_pack() here, but this function cannot be
+ * called at tasklet level. => Use eth_bearer->bearer as a flag to throw away
+ * incoming buffers, & postpone dev_remove_pack() to eth_media_stop() on exit.
+ */
+
+static void disable_bearer(struct tipc_bearer *tb_ptr)
+{
+	((struct eth_bearer *)tb_ptr->usr_handle)->bearer = NULL;
+}
+
+/**
+ * recv_notification - handle device updates from OS
+ *
+ * Finds the Ethernet bearer slot bound to the notifying device and, when a
+ * TIPC bearer is attached to it, applies the state change to that bearer.
+ */
+
+static int recv_notification(struct notifier_block *nb, unsigned long evt,
+			     void *dv)
+{
+	struct net_device *dev = (struct net_device *)dv;
+	struct eth_bearer *const eb_end = &eth_bearers[MAX_ETH_BEARERS];
+	struct eth_bearer *eb_ptr;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	for (eb_ptr = &eth_bearers[0]; eb_ptr->dev != dev; )
+		if (++eb_ptr == eb_end)
+			return NOTIFY_DONE;	/* couldn't find device */
+	if (!eb_ptr->bearer)
+		return NOTIFY_DONE;		/* bearer had been disabled */
+
+	/* The MTU is refreshed on every event, whatever its type */
+	eb_ptr->bearer->mtu = dev->mtu;
+
+	switch (evt) {
+	case NETDEV_UP:
+		tipc_continue(eb_ptr->bearer);
+		break;
+	case NETDEV_DOWN:
+		tipc_block_bearer(eb_ptr->bearer->name);
+		break;
+	case NETDEV_CHANGE:
+		if (!netif_carrier_ok(dev))
+			tipc_block_bearer(eb_ptr->bearer->name);
+		else
+			tipc_continue(eb_ptr->bearer);
+		break;
+	case NETDEV_CHANGEADDR:
+	case NETDEV_CHANGEMTU:
+		tipc_block_bearer(eb_ptr->bearer->name);
+		tipc_continue(eb_ptr->bearer);
+		break;
+	case NETDEV_CHANGENAME:
+	case NETDEV_UNREGISTER:
+		tipc_disable_bearer(eb_ptr->bearer->name);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/**
+ * eth_addr2str - format an Ethernet media address as text using "%pM"
+ */
+
+static char *eth_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size)
+{
+	if (str_size >= 18) {
+		sprintf(str_buf, "%pM", (unchar *)&a->dev_addr);
+		return str_buf;
+	}
+
+	*str_buf = '\0';	/* too small: hand back an empty string */
+	return str_buf;
+}
+
+/**
+ * tipc_eth_media_start - activate Ethernet bearer support
+ *
+ * Register Ethernet media type with TIPC bearer code.  Also register
+ * with OS for notifications about device state changes.
+ * Returns 0, -EINVAL if already started, or the failing registration's error.
+ */
+int tipc_eth_media_start(void)
+{
+	struct tipc_media_addr bcast_addr;
+	int res;
+
+	if (eth_started)
+		return -EINVAL;
+
+	memset(&bcast_addr, 0, sizeof(bcast_addr));	/* no stale stack bytes */
+	bcast_addr.type = htonl(TIPC_MEDIA_TYPE_ETH);
+	memset(&bcast_addr.dev_addr, 0xff, ETH_ALEN);
+	memset(eth_bearers, 0, sizeof(eth_bearers));
+
+	res = tipc_register_media(TIPC_MEDIA_TYPE_ETH, "eth",
+				  enable_bearer, disable_bearer, send_msg,
+				  eth_addr2str, &bcast_addr, ETH_LINK_PRIORITY,
+				  ETH_LINK_TOLERANCE, ETH_LINK_WINDOW);
+	if (res)
+		return res;
+
+	notifier.notifier_call = &recv_notification;
+	notifier.priority = 0;
+	res = register_netdevice_notifier(&notifier);
+	if (!res)
+		eth_started = 1;
+	return res;
+}
+
+/**
+ * tipc_eth_media_stop - detach all Ethernet bearers and release their devices
+ */
+void tipc_eth_media_stop(void)
+{
+	struct eth_bearer *eb_ptr = &eth_bearers[0];
+	struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
+
+	if (!eth_started)
+		return;
+
+	unregister_netdevice_notifier(&notifier);
+	for (; eb_ptr != stop; eb_ptr++) {
+		if (eb_ptr->bearer) {
+			eb_ptr->bearer->blocked = 1;
+			eb_ptr->bearer = NULL;
+		}
+		if (eb_ptr->dev) {
+			dev_remove_pack(&eb_ptr->tipc_packet_type);
+			dev_put(eb_ptr->dev);
+		}
+	}
+	memset(&eth_bearers, 0, sizeof(eth_bearers));
+	eth_started = 0;
+}
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
new file mode 100644
index 00000000..274c98e1
--- /dev/null
+++ b/net/tipc/handler.c
@@ -0,0 +1,132 @@
+/*
+ * net/tipc/handler.c: TIPC signal handling
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+
+struct queue_item {
+	struct list_head next_signal;	/* link in signal_queue_head */
+	void (*handler) (unsigned long);	/* routine run from the tasklet */
+	unsigned long data;		/* argument handed to handler */
+};
+
+static struct kmem_cache *tipc_queue_item_cache;	/* source of queue_items */
+static struct list_head signal_queue_head;	/* pending signals, oldest first */
+static DEFINE_SPINLOCK(qitem_lock);		/* held around queue changes */
+static int handler_enabled;	/* nonzero between handler start and stop */
+
+static void process_signal_queue(unsigned long dummy);
+
+static DECLARE_TASKLET_DISABLED(tipc_tasklet, process_signal_queue, 0);
+
+/* Queue @routine(@argument) for the TIPC tasklet; returns 0 or a -errno value */
+unsigned int tipc_k_signal(Handler routine, unsigned long argument)
+{
+	struct queue_item *entry;
+
+	if (!handler_enabled) {
+		err("Signal request ignored by handler\n");
+		return -ENOPROTOOPT;
+	}
+	spin_lock_bh(&qitem_lock);
+	entry = kmem_cache_alloc(tipc_queue_item_cache, GFP_ATOMIC);
+	if (!entry)
+		goto no_mem;
+	entry->data = argument;
+	entry->handler = routine;
+	list_add_tail(&entry->next_signal, &signal_queue_head);
+	spin_unlock_bh(&qitem_lock);
+	tasklet_schedule(&tipc_tasklet);
+	return 0;
+no_mem:
+	err("Signal queue out of memory\n");
+	spin_unlock_bh(&qitem_lock);
+	return -ENOMEM;
+}
+/* Tasklet routine: run each queued signal's handler, then free its item */
+static void process_signal_queue(unsigned long dummy)
+{
+	struct queue_item *__volatile__ item;
+	struct list_head *l, *n;
+
+	spin_lock_bh(&qitem_lock);
+	list_for_each_safe(l, n, &signal_queue_head) {
+		item = list_entry(l, struct queue_item, next_signal);
+		list_del(&item->next_signal);
+		spin_unlock_bh(&qitem_lock);	/* handler runs without the lock */
+		item->handler(item->data);
+		spin_lock_bh(&qitem_lock);
+		kmem_cache_free(tipc_queue_item_cache, item);
+	}
+	spin_unlock_bh(&qitem_lock);
+}
+/* Create the queue item cache and enable signal handling; 0 or -ENOMEM */
+int tipc_handler_start(void)
+{
+	tipc_queue_item_cache =
+		kmem_cache_create("tipc_queue_items", sizeof(struct queue_item),
+				  0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!tipc_queue_item_cache)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&signal_queue_head);
+	tasklet_enable(&tipc_tasklet);
+	handler_enabled = 1;
+	return 0;
+}
+
+/* tipc_handler_stop - shut down signal handling and release its resources */
+void tipc_handler_stop(void)
+{
+	struct queue_item *cur, *tmp;
+
+	if (!handler_enabled)
+		return;
+
+	handler_enabled = 0;
+	tasklet_disable(&tipc_tasklet);
+	tasklet_kill(&tipc_tasklet);
+
+	/* Discard any signals that were queued but never delivered */
+	spin_lock_bh(&qitem_lock);
+	list_for_each_entry_safe(cur, tmp, &signal_queue_head, next_signal) {
+		list_del(&cur->next_signal);
+		kmem_cache_free(tipc_queue_item_cache, cur);
+	}
+	spin_unlock_bh(&qitem_lock);
+
+	kmem_cache_destroy(tipc_queue_item_cache);
+}
+
diff --git a/net/tipc/link.c b/net/tipc/link.c
new file mode 100644
index 00000000..5ed4b4f7
--- /dev/null
+++ b/net/tipc/link.c
@@ -0,0 +1,3042 @@
+/*
+ * net/tipc/link.c: TIPC link code
+ *
+ * Copyright (c) 1996-2007, Ericsson AB
+ * Copyright (c) 2004-2007, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "port.h"
+#include "name_distr.h"
+#include "discover.h"
+#include "config.h"
+
+
+/*
+ * Out-of-range value for link session numbers
+ */
+
+#define INVALID_SESSION 0x10000
+
+/*
+ * Link state events:
+ */
+
+#define  STARTING_EVT    856384768	/* link processing trigger */
+#define  TRAFFIC_MSG_EVT 560815u	/* rx'd ??? */
+#define  TIMEOUT_EVT     560817u	/* link timer expired */
+
+/*
+ * The following two 'message types' are really just implementation
+ * data conveniently stored in the message header.
+ * They must not be considered part of the protocol.
+ */
+#define OPEN_MSG   0
+#define CLOSED_MSG 1
+
+/*
+ * State value stored in 'exp_msg_count'
+ */
+
+#define START_CHANGEOVER 100000u
+
+/**
+ * struct link_name - deconstructed link name
+ * @addr_local: network address of node at this end
+ * @if_local: name of interface at this end
+ * @addr_peer: network address of node at far end
+ * @if_peer: name of interface at far end
+ *
+ * Filled in by link_name_validate() from a link name string of the form
+ * "<Z>.<C>.<N>:<if_local>-<Z>.<C>.<N>:<if_peer>".
+ */
+
+struct link_name {
+	u32 addr_local;
+	char if_local[TIPC_MAX_IF_NAME];
+	u32 addr_peer;
+	char if_peer[TIPC_MAX_IF_NAME];
+};
+
+/*
+ * Forward declarations of file-local (static) routines
+ */
+
+static void link_handle_out_of_seq_msg(struct link *l_ptr,
+				       struct sk_buff *buf);
+static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf);
+static int  link_recv_changeover_msg(struct link **l_ptr, struct sk_buff **buf);
+static void link_set_supervision_props(struct link *l_ptr, u32 tolerance);
+static int  link_send_sections_long(struct tipc_port *sender,
+				    struct iovec const *msg_sect,
+				    u32 num_sect, unsigned int total_len,
+				    u32 destnode);
+static void link_check_defragm_bufs(struct link *l_ptr);
+static void link_state_event(struct link *l_ptr, u32 event);
+static void link_reset_statistics(struct link *l_ptr);
+static void link_print(struct link *l_ptr, const char *str);
+static void link_start(struct link *l_ptr);
+static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
+
+/*
+ *  Simple link routines
+ */
+
+/* round 'i' up to a multiple of 4; values already aligned are unchanged */
+static unsigned int align(unsigned int i)
+{
+	const unsigned int mask = 3u;
+
+	return (i + mask) & ~mask;
+}
+
+/*
+ * link_init_max_pkt - reset the link's packet size fields
+ *
+ * The target is the bearer MTU with its two low bits cleared, capped at
+ * MAX_MSG_SIZE.  The working size starts at the smaller of that target
+ * and MAX_PKT_DEFAULT, and the probe counter is cleared.
+ */
+static void link_init_max_pkt(struct link *l_ptr)
+{
+	u32 target = (l_ptr->b_ptr->mtu & ~3);
+
+	if (target > MAX_MSG_SIZE)
+		target = MAX_MSG_SIZE;
+	l_ptr->max_pkt_target = target;
+
+	l_ptr->max_pkt = (l_ptr->max_pkt_target < MAX_PKT_DEFAULT) ?
+			 l_ptr->max_pkt_target : MAX_PKT_DEFAULT;
+
+	l_ptr->max_pkt_probes = 0;
+}
+
+/*
+ * Sequence number of the buffer at 'next_out' if there is one,
+ * otherwise 'next_out_no' passed through mod().
+ */
+static u32 link_next_sent(struct link *l_ptr)
+{
+	struct sk_buff *pending = l_ptr->next_out;
+
+	if (!pending)
+		return mod(l_ptr->next_out_no);
+	return msg_seqno(buf_msg(pending));
+}
+
+/* the value of link_next_sent() minus one, passed through mod() */
+static u32 link_last_sent(struct link *l_ptr)
+{
+	u32 next = link_next_sent(l_ptr);
+
+	return mod(next - 1);
+}
+
+/*
+ *  Simple non-static link routines (i.e. referenced outside this file)
+ */
+
+/*
+ * Returns 1 if the link is in the working-working or working-unknown
+ * state, otherwise 0 (a NULL link also yields 0).
+ */
+int tipc_link_is_up(struct link *l_ptr)
+{
+	if (l_ptr == NULL)
+		return 0;
+	if (link_working_working(l_ptr))
+		return 1;
+	return link_working_unknown(l_ptr) ? 1 : 0;
+}
+
+/* returns 1 if the link occupies either active_links slot of its owner */
+int tipc_link_is_active(struct link *l_ptr)
+{
+	struct tipc_node *node = l_ptr->owner;
+
+	if (node->active_links[0] == l_ptr)
+		return 1;
+	return node->active_links[1] == l_ptr;
+}
+
+/**
+ * link_name_validate - validate & (optionally) deconstruct link name
+ * @name - ptr to link name string
+ * @name_parts - ptr to area for link name components (or NULL if not needed)
+ *
+ * Returns 1 if link name is valid, otherwise 0.
+ */
+
+static int link_name_validate(const char *name, struct link_name *name_parts)
+{
+	char name_copy[TIPC_MAX_LINK_NAME];
+	char *addr_local;
+	char *if_local;
+	char *addr_peer;
+	char *if_peer;
+	char dummy;
+	u32 z_local, c_local, n_local;
+	u32 z_peer, c_peer, n_peer;
+	u32 if_local_len;
+	u32 if_peer_len;
+
+	/* copy link name & ensure length is OK */
+
+	name_copy[TIPC_MAX_LINK_NAME - 1] = 0;
+	/* need above in case non-Posix strncpy() doesn't pad with nulls */
+	strncpy(name_copy, name, TIPC_MAX_LINK_NAME);
+	/* a non-zero last byte means 'name' did not fit with its terminator */
+	if (name_copy[TIPC_MAX_LINK_NAME - 1] != 0)
+		return 0;
+
+	/* ensure all component parts of link name are present */
+
+	/*
+	 * The copy is split in place: each ':' / '-' separator found is
+	 * overwritten with a NUL and the pointer advanced past it.
+	 */
+	addr_local = name_copy;
+	if_local = strchr(addr_local, ':');
+	if (if_local == NULL)
+		return 0;
+	*(if_local++) = 0;
+	addr_peer = strchr(if_local, '-');
+	if (addr_peer == NULL)
+		return 0;
+	*(addr_peer++) = 0;
+	/* addr_peer is already past the NUL, so this length includes it */
+	if_local_len = addr_peer - if_local;
+	if_peer = strchr(addr_peer, ':');
+	if (if_peer == NULL)
+		return 0;
+	*(if_peer++) = 0;
+	/* likewise counts the terminating NUL */
+	if_peer_len = strlen(if_peer) + 1;
+
+	/* validate component parts of link name */
+
+	/*
+	 * The trailing %c catches extra characters after the third number:
+	 * sscanf() must convert exactly 3 items for an address to be accepted.
+	 */
+	if ((sscanf(addr_local, "%u.%u.%u%c",
+		    &z_local, &c_local, &n_local, &dummy) != 3) ||
+	    (sscanf(addr_peer, "%u.%u.%u%c",
+		    &z_peer, &c_peer, &n_peer, &dummy) != 3) ||
+	    (z_local > 255) || (c_local > 4095) || (n_local > 4095) ||
+	    (z_peer  > 255) || (c_peer  > 4095) || (n_peer  > 4095) ||
+	    (if_local_len <= 1) || (if_local_len > TIPC_MAX_IF_NAME) ||
+	    (if_peer_len  <= 1) || (if_peer_len  > TIPC_MAX_IF_NAME) ||
+	    (strspn(if_local, tipc_alphabet) != (if_local_len - 1)) ||
+	    (strspn(if_peer, tipc_alphabet) != (if_peer_len - 1)))
+		return 0;
+
+	/* return link name components, if necessary */
+
+	if (name_parts) {
+		name_parts->addr_local = tipc_addr(z_local, c_local, n_local);
+		/* lengths were checked against TIPC_MAX_IF_NAME above */
+		strcpy(name_parts->if_local, if_local);
+		name_parts->addr_peer = tipc_addr(z_peer, c_peer, n_peer);
+		strcpy(name_parts->if_peer, if_peer);
+	}
+	return 1;
+}
+
+/**
+ * link_timeout - handle expiration of link timer
+ * @l_ptr: pointer to link
+ *
+ * This routine must not grab "tipc_net_lock" to avoid a potential deadlock conflict
+ * with tipc_link_delete().  (There is no risk that the node will be deleted by
+ * another thread because tipc_link_delete() always cancels the link timer before
+ * tipc_node_delete() is called.)
+ */
+
+static void link_timeout(struct link *l_ptr)
+{
+	/* inclusive upper bounds of message length profile slots 0..5 */
+	static const u32 slot_limit[] = { 64, 256, 1024, 4096, 16384, 32768 };
+	const unsigned int num_limits = sizeof(slot_limit) / sizeof(slot_limit[0]);
+	struct sk_buff *head;
+
+	tipc_node_lock(l_ptr->owner);
+
+	/* update counters used in statistical profiling of send traffic */
+
+	l_ptr->stats.accu_queue_sz += l_ptr->out_queue_size;
+	l_ptr->stats.queue_sz_counts++;
+
+	head = l_ptr->first_out;
+	if (head) {
+		struct tipc_msg *hdr = buf_msg(head);
+		u32 len = msg_size(hdr);
+		unsigned int slot;
+
+		/* for a first fragment, profile the size of the wrapped message */
+		if ((msg_user(hdr) == MSG_FRAGMENTER) &&
+		    (msg_type(hdr) == FIRST_FRAGMENT))
+			len = msg_size(msg_get_wrapped(hdr));
+
+		if (len) {
+			l_ptr->stats.msg_lengths_total += len;
+			l_ptr->stats.msg_length_counts++;
+
+			/* slot 6 collects everything above the last limit */
+			for (slot = 0; slot < num_limits; slot++) {
+				if (len <= slot_limit[slot])
+					break;
+			}
+			l_ptr->stats.msg_length_profile[slot]++;
+		}
+	}
+
+	/* do all other link processing performed on a periodic basis */
+
+	link_check_defragm_bufs(l_ptr);
+
+	link_state_event(l_ptr, TIMEOUT_EVT);
+
+	if (l_ptr->next_out)
+		tipc_link_push_queue(l_ptr);
+
+	tipc_node_unlock(l_ptr->owner);
+}
+
+/* start the link's timer with the given 'time' value via k_start_timer() */
+static void link_set_timer(struct link *l_ptr, u32 time)
+{
+	k_start_timer(&l_ptr->timer, time);
+}
+
+/**
+ * tipc_link_create - create a new link
+ * @n_ptr: pointer to associated node
+ * @b_ptr: pointer to associated bearer
+ * @media_addr: media address to use when sending messages over link
+ *
+ * Returns pointer to link.
+ */
+
+struct link *tipc_link_create(struct tipc_node *n_ptr,
+			      struct tipc_bearer *b_ptr,
+			      const struct tipc_media_addr *media_addr)
+{
+	struct link *l_ptr;
+	struct tipc_msg *msg;
+	char *if_name;
+	char addr_string[16];
+	u32 peer = n_ptr->addr;
+
+	/* refuse to go beyond two links to the same node */
+	if (n_ptr->link_cnt >= 2) {
+		tipc_addr_string_fill(addr_string, n_ptr->addr);
+		err("Attempt to establish third link to %s\n", addr_string);
+		return NULL;
+	}
+
+	/* refuse a second link to the node over the same bearer */
+	if (n_ptr->links[b_ptr->identity]) {
+		tipc_addr_string_fill(addr_string, n_ptr->addr);
+		err("Attempt to establish second link on <%s> to %s\n",
+		    b_ptr->name, addr_string);
+		return NULL;
+	}
+
+	l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC);
+	if (!l_ptr) {
+		warn("Link creation failed, no memory\n");
+		return NULL;
+	}
+
+	l_ptr->addr = peer;
+	/*
+	 * NOTE(review): the strchr() result is not checked, so this assumes
+	 * b_ptr->name always contains a ':' - confirm against bearer setup.
+	 */
+	if_name = strchr(b_ptr->name, ':') + 1;
+	sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:",
+		tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr),
+		tipc_node(tipc_own_addr),
+		if_name,
+		tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
+		/* note: peer i/f is appended to link name by reset/activate */
+	memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr));
+	l_ptr->owner = n_ptr;
+	l_ptr->checkpoint = 1;
+	l_ptr->b_ptr = b_ptr;
+	link_set_supervision_props(l_ptr, b_ptr->media->tolerance);
+	l_ptr->state = RESET_UNKNOWN;
+
+	/* pre-build the link's protocol message in the embedded proto_msg area */
+	l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg;
+	msg = l_ptr->pmsg;
+	tipc_msg_init(msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, l_ptr->addr);
+	msg_set_size(msg, sizeof(l_ptr->proto_msg));
+	msg_set_session(msg, (tipc_random & 0xffff));
+	msg_set_bearer_id(msg, b_ptr->identity);
+	strcpy((char *)msg_data(msg), if_name);
+
+	l_ptr->priority = b_ptr->priority;
+	tipc_link_set_queue_limits(l_ptr, b_ptr->media->window);
+
+	link_init_max_pkt(l_ptr);
+
+	l_ptr->next_out_no = 1;
+	INIT_LIST_HEAD(&l_ptr->waiting_ports);
+
+	link_reset_statistics(l_ptr);
+
+	tipc_node_attach_link(n_ptr, l_ptr);
+
+	k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr);
+	list_add_tail(&l_ptr->link_list, &b_ptr->links);
+	/*
+	 * link_start() is requested through the signal queue rather than
+	 * called directly.
+	 * NOTE(review): the return value of tipc_k_signal() is ignored; if the
+	 * request cannot be queued, link_start() is not invoked from here.
+	 */
+	tipc_k_signal((Handler)link_start, (unsigned long)l_ptr);
+
+	return l_ptr;
+}
+
+/**
+ * tipc_link_delete - delete a link
+ * @l_ptr: pointer to link
+ *
+ * Note: 'tipc_net_lock' is write_locked, bearer is locked.
+ * This routine must not grab the node lock until after link timer cancellation
+ * to avoid a potential deadlock situation.
+ */
+
+void tipc_link_delete(struct link *l_ptr)
+{
+	if (!l_ptr) {
+		err("Attempt to delete non-existent link\n");
+		return;
+	}
+
+	/* cancel the timer before taking the node lock */
+	k_cancel_timer(&l_ptr->timer);
+
+	tipc_node_lock(l_ptr->owner);
+	tipc_link_reset(l_ptr);
+	tipc_node_detach_link(l_ptr->owner, l_ptr);
+	tipc_link_stop(l_ptr);
+	list_del_init(&l_ptr->link_list);
+	tipc_node_unlock(l_ptr->owner);
+	/* the node lock is released before the timer is terminated and the link freed */
+	k_term_timer(&l_ptr->timer);
+	kfree(l_ptr);
+}
+
+/* feed STARTING_EVT to the link state machine with the owner node locked */
+static void link_start(struct link *l_ptr)
+{
+	struct tipc_node *node = l_ptr->owner;
+
+	tipc_node_lock(node);
+	link_state_event(l_ptr, STARTING_EVT);
+	tipc_node_unlock(node);
+}
+
+/**
+ * link_schedule_port - schedule port for deferred sending
+ * @l_ptr: pointer to link
+ * @origport: reference to sending port
+ * @sz: amount of data to be sent
+ *
+ * Schedules port for renewed sending of messages after link congestion
+ * has abated.
+ */
+
+static int link_schedule_port(struct link *l_ptr, u32 origport, u32 sz)
+{
+	struct tipc_port *port;
+
+	spin_lock_bh(&tipc_port_list_lock);
+	port = tipc_port_lock(origport);
+	if (port) {
+		/* only ports with a wakeup routine that are not already waiting */
+		if (port->wakeup && list_empty(&port->wait_list)) {
+			port->congested = 1;
+			port->waiting_pkts = 1 + ((sz - 1) / l_ptr->max_pkt);
+			list_add_tail(&port->wait_list, &l_ptr->waiting_ports);
+			l_ptr->stats.link_congs++;
+		}
+		tipc_port_unlock(port);
+	}
+	spin_unlock_bh(&tipc_port_list_lock);
+
+	/* always reports congestion to the caller */
+	return -ELINKCONG;
+}
+
+/*
+ * tipc_link_wakeup_ports - wake ports waiting on this link
+ * @l_ptr: pointer to link
+ * @all: if non-zero, use a window of 100000 instead of the free space
+ *       below queue_limit[0]
+ *
+ * Returns without doing anything if the window is not positive, if the
+ * port list lock cannot be taken immediately, or if the link is congested.
+ */
+void tipc_link_wakeup_ports(struct link *l_ptr, int all)
+{
+	struct tipc_port *port;
+	struct tipc_port *next_port;
+	int win;
+
+	if (all)
+		win = 100000;
+	else
+		win = l_ptr->queue_limit[0] - l_ptr->out_queue_size;
+
+	if (win <= 0)
+		return;
+	if (!spin_trylock_bh(&tipc_port_list_lock))
+		return;
+
+	if (!link_congested(l_ptr)) {
+		list_for_each_entry_safe(port, next_port,
+					 &l_ptr->waiting_ports, wait_list) {
+			if (win <= 0)
+				break;
+			list_del_init(&port->wait_list);
+			spin_lock_bh(port->lock);
+			port->congested = 0;
+			port->wakeup(port);
+			win -= port->waiting_pkts;
+			spin_unlock_bh(port->lock);
+		}
+	}
+
+	spin_unlock_bh(&tipc_port_list_lock);
+}
+
+/**
+ * link_release_outqueue - purge link's outbound message queue
+ * @l_ptr: pointer to link
+ */
+
+static void link_release_outqueue(struct link *l_ptr)
+{
+	struct sk_buff *cur;
+	struct sk_buff *following;
+
+	/* fetch the successor before each buffer is discarded */
+	for (cur = l_ptr->first_out; cur; cur = following) {
+		following = cur->next;
+		buf_discard(cur);
+	}
+
+	l_ptr->first_out = NULL;
+	l_ptr->out_queue_size = 0;
+}
+
+/**
+ * tipc_link_reset_fragments - purge link's inbound message fragments queue
+ * @l_ptr: pointer to link
+ */
+
+void tipc_link_reset_fragments(struct link *l_ptr)
+{
+	struct sk_buff *frag = l_ptr->defragm_buf;
+
+	while (frag != NULL) {
+		struct sk_buff *rest = frag->next;
+
+		buf_discard(frag);
+		frag = rest;
+	}
+	l_ptr->defragm_buf = NULL;
+}
+
+/**
+ * tipc_link_stop - purge all inbound and outbound messages associated with link
+ * @l_ptr: pointer to link
+ */
+
+void tipc_link_stop(struct link *l_ptr)
+{
+	struct sk_buff *cur;
+	struct sk_buff *following;
+
+	/* deferred inbound queue */
+	for (cur = l_ptr->oldest_deferred_in; cur; cur = following) {
+		following = cur->next;
+		buf_discard(cur);
+	}
+
+	/* outbound queue */
+	for (cur = l_ptr->first_out; cur; cur = following) {
+		following = cur->next;
+		buf_discard(cur);
+	}
+
+	tipc_link_reset_fragments(l_ptr);
+
+	buf_discard(l_ptr->proto_msg_queue);
+	l_ptr->proto_msg_queue = NULL;
+}
+
+/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
+#define link_send_event(fcn, l_ptr, up) do { } while (0)
+
+/*
+ * tipc_link_reset - put the link back into the RESET_UNKNOWN state
+ *
+ * The session number in the link's protocol message is always advanced
+ * and the packet size fields re-initialised.  Queue cleanup and counter
+ * reset are done only if the link was not already in one of the two
+ * reset states.
+ */
+void tipc_link_reset(struct link *l_ptr)
+{
+	struct sk_buff *buf;
+	u32 prev_state = l_ptr->state;
+	u32 checkpoint = l_ptr->next_in_no;
+	int was_active_link = tipc_link_is_active(l_ptr);
+
+	/* advance the 16-bit session number stored in the protocol message */
+	msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff));
+
+	/* Link is down, accept any session */
+	l_ptr->peer_session = INVALID_SESSION;
+
+	/* Prepare for max packet size negotiation */
+	link_init_max_pkt(l_ptr);
+
+	l_ptr->state = RESET_UNKNOWN;
+
+	/* nothing more to do if the link was already in a reset state */
+	if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET))
+		return;
+
+	tipc_node_link_down(l_ptr->owner, l_ptr);
+	tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr);
+
+	/*
+	 * was_active_link and checkpoint were sampled on entry, before
+	 * tipc_node_link_down() was called
+	 */
+	if (was_active_link && tipc_node_active_links(l_ptr->owner) &&
+	    l_ptr->owner->permit_changeover) {
+		l_ptr->reset_checkpoint = checkpoint;
+		l_ptr->exp_msg_count = START_CHANGEOVER;
+	}
+
+	/* Clean up all queues: */
+
+	link_release_outqueue(l_ptr);
+	buf_discard(l_ptr->proto_msg_queue);
+	l_ptr->proto_msg_queue = NULL;
+	buf = l_ptr->oldest_deferred_in;
+	while (buf) {
+		struct sk_buff *next = buf->next;
+		buf_discard(buf);
+		buf = next;
+	}
+	if (!list_empty(&l_ptr->waiting_ports))
+		tipc_link_wakeup_ports(l_ptr, 1);
+
+	/* the queue pointers below refer to buffers discarded above */
+	l_ptr->retransm_queue_head = 0;
+	l_ptr->retransm_queue_size = 0;
+	l_ptr->last_out = NULL;
+	l_ptr->first_out = NULL;
+	l_ptr->next_out = NULL;
+	l_ptr->unacked_window = 0;
+	l_ptr->checkpoint = 1;
+	l_ptr->next_out_no = 1;
+	l_ptr->deferred_inqueue_sz = 0;
+	l_ptr->oldest_deferred_in = NULL;
+	l_ptr->newest_deferred_in = NULL;
+	l_ptr->fsm_msg_cnt = 0;
+	l_ptr->stale_count = 0;
+	link_reset_statistics(l_ptr);
+
+	/* link_send_event() is currently defined as an empty statement */
+	link_send_event(tipc_cfg_link_event, l_ptr, 0);
+	if (!in_own_cluster(l_ptr->addr))
+		link_send_event(tipc_disc_link_event, l_ptr, 0);
+}
+
+
+/*
+ * link_activate - mark the link as up
+ *
+ * Restarts inbound sequence numbering at 1 and registers the link with
+ * its owner node and bearer.
+ */
+static void link_activate(struct link *l_ptr)
+{
+	l_ptr->stats.recv_info = 1;
+	l_ptr->next_in_no = 1;
+	tipc_node_link_up(l_ptr->owner, l_ptr);
+	tipc_bearer_add_dest(l_ptr->b_ptr, l_ptr->addr);
+	link_send_event(tipc_cfg_link_event, l_ptr, 1);
+	if (!in_own_cluster(l_ptr->addr))
+		link_send_event(tipc_disc_link_event, l_ptr, 1);
+}
+
+/**
+ * link_state_event - link finite state machine
+ * @l_ptr: pointer to link
+ * @event: state machine event to process
+ */
+
+static void link_state_event(struct link *l_ptr, unsigned event)
+{
+	struct link *other;
+	u32 cont_intv = l_ptr->continuity_interval;
+
+	/* until STARTING_EVT has been processed every other event is dropped */
+	if (!l_ptr->started && (event != STARTING_EVT))
+		return;		/* Not yet. */
+
+	/* while blocked, a timeout only re-arms the timer */
+	if (link_blocked(l_ptr)) {
+		if (event == TIMEOUT_EVT)
+			link_set_timer(l_ptr, cont_intv);
+		return;	  /* Changeover going on */
+	}
+
+	switch (l_ptr->state) {
+	case WORKING_WORKING:
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+		case ACTIVATE_MSG:
+			break;
+		case TIMEOUT_EVT:
+			/* next_in_no moved since the last checkpoint: stay here */
+			if (l_ptr->next_in_no != l_ptr->checkpoint) {
+				l_ptr->checkpoint = l_ptr->next_in_no;
+				if (tipc_bclink_acks_missing(l_ptr->owner)) {
+					tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+								 0, 0, 0, 0, 0);
+					l_ptr->fsm_msg_cnt++;
+				} else if (l_ptr->max_pkt < l_ptr->max_pkt_target) {
+					tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+								 1, 0, 0, 0, 0);
+					l_ptr->fsm_msg_cnt++;
+				}
+				link_set_timer(l_ptr, cont_intv);
+				break;
+			}
+			/*
+			 * nothing new since the last checkpoint: go to
+			 * WORKING_UNKNOWN and use a quarter of the interval
+			 */
+			l_ptr->state = WORKING_UNKNOWN;
+			l_ptr->fsm_msg_cnt = 0;
+			tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv / 4);
+			break;
+		case RESET_MSG:
+			info("Resetting link <%s>, requested by peer\n",
+			     l_ptr->name);
+			tipc_link_reset(l_ptr);
+			/* tipc_link_reset() left RESET_UNKNOWN; override it */
+			l_ptr->state = RESET_RESET;
+			l_ptr->fsm_msg_cnt = 0;
+			tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		default:
+			err("Unknown link event %u in WW state\n", event);
+		}
+		break;
+	case WORKING_UNKNOWN:
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+		case ACTIVATE_MSG:
+			/* either event returns the link to WORKING_WORKING */
+			l_ptr->state = WORKING_WORKING;
+			l_ptr->fsm_msg_cnt = 0;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case RESET_MSG:
+			info("Resetting link <%s>, requested by peer "
+			     "while probing\n", l_ptr->name);
+			tipc_link_reset(l_ptr);
+			l_ptr->state = RESET_RESET;
+			l_ptr->fsm_msg_cnt = 0;
+			tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case TIMEOUT_EVT:
+			if (l_ptr->next_in_no != l_ptr->checkpoint) {
+				/* next_in_no moved: back to WORKING_WORKING */
+				l_ptr->state = WORKING_WORKING;
+				l_ptr->fsm_msg_cnt = 0;
+				l_ptr->checkpoint = l_ptr->next_in_no;
+				if (tipc_bclink_acks_missing(l_ptr->owner)) {
+					tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+								 0, 0, 0, 0, 0);
+					l_ptr->fsm_msg_cnt++;
+				}
+				link_set_timer(l_ptr, cont_intv);
+			} else if (l_ptr->fsm_msg_cnt < l_ptr->abort_limit) {
+				/* still below abort_limit: send another STATE_MSG */
+				tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+							 1, 0, 0, 0, 0);
+				l_ptr->fsm_msg_cnt++;
+				link_set_timer(l_ptr, cont_intv / 4);
+			} else {	/* Link has failed */
+				warn("Resetting link <%s>, peer not responding\n",
+				     l_ptr->name);
+				tipc_link_reset(l_ptr);
+				l_ptr->state = RESET_UNKNOWN;
+				l_ptr->fsm_msg_cnt = 0;
+				tipc_link_send_proto_msg(l_ptr, RESET_MSG,
+							 0, 0, 0, 0, 0);
+				l_ptr->fsm_msg_cnt++;
+				link_set_timer(l_ptr, cont_intv);
+			}
+			break;
+		default:
+			err("Unknown link event %u in WU state\n", event);
+		}
+		break;
+	case RESET_UNKNOWN:
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+			break;
+		case ACTIVATE_MSG:
+			/*
+			 * do not activate while the owner's first active link
+			 * is in the working-unknown state
+			 */
+			other = l_ptr->owner->active_links[0];
+			if (other && link_working_unknown(other))
+				break;
+			l_ptr->state = WORKING_WORKING;
+			l_ptr->fsm_msg_cnt = 0;
+			link_activate(l_ptr);
+			tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case RESET_MSG:
+			l_ptr->state = RESET_RESET;
+			l_ptr->fsm_msg_cnt = 0;
+			tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case STARTING_EVT:
+			/* from now on other events are accepted (see check at top) */
+			l_ptr->started = 1;
+			/* fall through */
+		case TIMEOUT_EVT:
+			tipc_link_send_proto_msg(l_ptr, RESET_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		default:
+			err("Unknown link event %u in RU state\n", event);
+		}
+		break;
+	case RESET_RESET:
+		switch (event) {
+		case TRAFFIC_MSG_EVT:
+		case ACTIVATE_MSG:
+			/* same guard as for ACTIVATE_MSG in RESET_UNKNOWN */
+			other = l_ptr->owner->active_links[0];
+			if (other && link_working_unknown(other))
+				break;
+			l_ptr->state = WORKING_WORKING;
+			l_ptr->fsm_msg_cnt = 0;
+			link_activate(l_ptr);
+			tipc_link_send_proto_msg(l_ptr, STATE_MSG, 1, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		case RESET_MSG:
+			break;
+		case TIMEOUT_EVT:
+			tipc_link_send_proto_msg(l_ptr, ACTIVATE_MSG, 0, 0, 0, 0, 0);
+			l_ptr->fsm_msg_cnt++;
+			link_set_timer(l_ptr, cont_intv);
+			break;
+		default:
+			err("Unknown link event %u in RR state\n", event);
+		}
+		break;
+	default:
+		err("Unknown link state %u/%u\n", l_ptr->state, event);
+	}
+}
+
+/*
+ * link_bundle_buf(): Append contents of a buffer to
+ * the tail of an existing one.
+ */
+
+static int link_bundle_buf(struct link *l_ptr,
+			   struct sk_buff *bundler,
+			   struct sk_buff *buf)
+{
+	struct tipc_msg *bundle_hdr = buf_msg(bundler);
+	struct tipc_msg *hdr = buf_msg(buf);
+	u32 msg_len = msg_size(hdr);
+	u32 old_len = msg_size(bundle_hdr);
+	u32 offset = align(old_len);
+	u32 grow = (offset - old_len) + msg_len;
+
+	/*
+	 * The target must be an open bundle with enough tailroom for the
+	 * padding plus the message, and the result must fit in max_pkt.
+	 */
+	if ((msg_user(bundle_hdr) != MSG_BUNDLER) ||
+	    (msg_type(bundle_hdr) != OPEN_MSG) ||
+	    (skb_tailroom(bundler) < grow) ||
+	    (l_ptr->max_pkt < (offset + msg_len)))
+		return 0;
+
+	skb_put(bundler, grow);
+	skb_copy_to_linear_data_offset(bundler, offset, buf->data, msg_len);
+	msg_set_size(bundle_hdr, offset + msg_len);
+	msg_set_msgcnt(bundle_hdr, msg_msgcnt(bundle_hdr) + 1);
+
+	/* the contents now live in the bundle, so the original is released */
+	buf_discard(buf);
+	l_ptr->stats.sent_bundled++;
+	return 1;
+}
+
+/*
+ * link_add_to_outqueue - stamp a message and append it to the send queue
+ *
+ * Consumes one outbound sequence number and updates the queue size
+ * statistics.
+ */
+static void link_add_to_outqueue(struct link *l_ptr,
+				 struct sk_buff *buf,
+				 struct tipc_msg *msg)
+{
+	u32 ack = mod(l_ptr->next_in_no - 1);
+	u32 seqno = mod(l_ptr->next_out_no++);
+
+	/* word 2: ack shifted up by 16 bits, OR'ed with the sequence number */
+	msg_set_word(msg, 2, ((ack << 16) | seqno));
+	msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+
+	/* append at the tail of the singly linked queue */
+	buf->next = NULL;
+	if (!l_ptr->first_out)
+		l_ptr->first_out = buf;
+	else
+		l_ptr->last_out->next = buf;
+	l_ptr->last_out = buf;
+
+	l_ptr->out_queue_size++;
+	if (l_ptr->stats.max_queue_sz < l_ptr->out_queue_size)
+		l_ptr->stats.max_queue_sz = l_ptr->out_queue_size;
+}
+
+/*
+ * link_add_chain_to_outqueue - queue every buffer of a chain
+ *
+ * Each buffer is tagged with 'long_msgno' before being appended.
+ */
+static void link_add_chain_to_outqueue(struct link *l_ptr,
+				       struct sk_buff *buf_chain,
+				       u32 long_msgno)
+{
+	struct sk_buff *cur;
+	struct sk_buff *following;
+
+	if (!l_ptr->next_out)
+		l_ptr->next_out = buf_chain;
+
+	for (cur = buf_chain; cur; cur = following) {
+		struct tipc_msg *hdr = buf_msg(cur);
+
+		/* link_add_to_outqueue() clears cur->next, so save it first */
+		following = cur->next;
+		msg_set_long_msgno(hdr, long_msgno);
+		link_add_to_outqueue(l_ptr, cur, hdr);
+	}
+}
+
+/*
+ * tipc_link_send_buf() is the 'full path' for messages, called from
+ * inside TIPC when the 'fast path' in tipc_send_buf
+ * has failed, and from link_send()
+ *
+ * Returns the message's user data size, -ELINKCONG when the send queue
+ * limit for the message's importance has been reached and that importance
+ * is at most TIPC_CRITICAL_IMPORTANCE, or the result of
+ * link_send_long_buf() when the message is larger than the link's max_pkt.
+ */
+
+int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 size = msg_size(msg);
+	u32 dsz = msg_data_sz(msg);
+	u32 queue_size = l_ptr->out_queue_size;
+	u32 imp = tipc_msg_tot_importance(msg);
+	u32 queue_limit = l_ptr->queue_limit[imp];
+	u32 max_packet = l_ptr->max_pkt;
+
+	msg_set_prevnode(msg, tipc_own_addr);	/* If routed message */
+
+	/* Match msg importance against queue limits: */
+
+	if (unlikely(queue_size >= queue_limit)) {
+		if (imp <= TIPC_CRITICAL_IMPORTANCE) {
+			link_schedule_port(l_ptr, msg_origport(msg), size);
+			buf_discard(buf);
+			return -ELINKCONG;
+		}
+		/*
+		 * Above critical importance the buffer is dropped, but the
+		 * caller still gets 'dsz' back as if it had been accepted.
+		 */
+		buf_discard(buf);
+		if (imp > CONN_MANAGER) {
+			/* terminate the message with a newline like all other log calls */
+			warn("Resetting link <%s>, send queue full\n", l_ptr->name);
+			tipc_link_reset(l_ptr);
+		}
+		return dsz;
+	}
+
+	/* Fragmentation needed ? */
+
+	if (size > max_packet)
+		return link_send_long_buf(l_ptr, buf);
+
+	/* Packet can be queued or sent: */
+
+	if (likely(!tipc_bearer_congested(l_ptr->b_ptr, l_ptr) &&
+		   !link_congested(l_ptr))) {
+		link_add_to_outqueue(l_ptr, buf, msg);
+
+		if (likely(tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr))) {
+			l_ptr->unacked_window = 0;
+		} else {
+			/* bearer refused the packet: leave it as next to send */
+			tipc_bearer_schedule(l_ptr->b_ptr, l_ptr);
+			l_ptr->stats.bearer_congs++;
+			l_ptr->next_out = buf;
+		}
+		return dsz;
+	}
+	/* Congestion: can message be bundled ?: */
+
+	if ((msg_user(msg) != CHANGEOVER_PROTOCOL) &&
+	    (msg_user(msg) != MSG_FRAGMENTER)) {
+
+		/* Try adding message to an existing bundle */
+
+		if (l_ptr->next_out &&
+		    link_bundle_buf(l_ptr, l_ptr->last_out, buf)) {
+			tipc_bearer_resolve_congestion(l_ptr->b_ptr, l_ptr);
+			return dsz;
+		}
+
+		/* Try creating a new bundle */
+
+		if (size <= max_packet * 2 / 3) {
+			struct sk_buff *bundler = tipc_buf_acquire(max_packet);
+			struct tipc_msg bundler_hdr;
+
+			if (bundler) {
+				tipc_msg_init(&bundler_hdr, MSG_BUNDLER, OPEN_MSG,
+					 INT_H_SIZE, l_ptr->addr);
+				skb_copy_to_linear_data(bundler, &bundler_hdr,
+							INT_H_SIZE);
+				skb_trim(bundler, INT_H_SIZE);
+				/*
+				 * NOTE(review): the result of link_bundle_buf()
+				 * is not checked; if it could fail here, 'buf'
+				 * would be lost when replaced by 'bundler'.
+				 */
+				link_bundle_buf(l_ptr, bundler, buf);
+				buf = bundler;
+				msg = buf_msg(buf);
+				l_ptr->stats.sent_bundles++;
+			}
+		}
+	}
+	/* queue the message (or the new bundle) for later transmission */
+	if (!l_ptr->next_out)
+		l_ptr->next_out = buf;
+	link_add_to_outqueue(l_ptr, buf, msg);
+	tipc_bearer_resolve_congestion(l_ptr->b_ptr, l_ptr);
+	return dsz;
+}
+
+/*
+ * tipc_link_send(): same as tipc_link_send_buf(), but the link to use has
+ * not been selected yet, and the owner node is not locked.
+ * Called by TIPC internal users, e.g. the name distributor
+ */
+
+int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector)
+{
+	struct tipc_node *node;
+	struct link *lnk;
+	int res = -ELINKCONG;
+
+	read_lock_bh(&tipc_net_lock);
+	node = tipc_node_find(dest);
+	if (!node) {
+		buf_discard(buf);
+		goto out;
+	}
+
+	tipc_node_lock(node);
+	/* low bit of the selector picks one of the two active link slots */
+	lnk = node->active_links[selector & 1];
+	if (!lnk)
+		buf_discard(buf);
+	else
+		res = tipc_link_send_buf(lnk, buf);
+	tipc_node_unlock(node);
+out:
+	read_unlock_bh(&tipc_net_lock);
+	return res;
+}
+
+/*
+ * link_send_buf_fast: Entry for data messages where the
+ * destination link is known and the header is complete,
+ * inclusive total message length. Very time critical.
+ * Link is locked. Returns user data length.
+ */
+
+static int link_send_buf_fast(struct link *l_ptr, struct sk_buff *buf,
+			      u32 *used_max_pkt)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	int res = msg_data_sz(msg);
+
+	if (unlikely(link_congested(l_ptr)))
+		goto full_path;
+
+	if (unlikely(msg_size(msg) > l_ptr->max_pkt)) {
+		/* hand the link's current packet limit back to the caller */
+		*used_max_pkt = l_ptr->max_pkt;
+		goto full_path;
+	}
+
+	if (unlikely(!list_empty(&l_ptr->b_ptr->cong_links)))
+		goto full_path;
+
+	link_add_to_outqueue(l_ptr, buf, msg);
+	if (likely(tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr))) {
+		l_ptr->unacked_window = 0;
+	} else {
+		tipc_bearer_schedule(l_ptr->b_ptr, l_ptr);
+		l_ptr->stats.bearer_congs++;
+		l_ptr->next_out = buf;
+	}
+	return res;
+
+full_path:
+	return tipc_link_send_buf(l_ptr, buf);  /* All other cases */
+}
+
+/*
+ * tipc_send_buf_fast: Entry for data messages where the
+ * destination node is known and the header is complete,
+ * inclusive total message length.
+ * Returns user data length.
+ */
+int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode)
+{
+	struct tipc_node *node;
+	struct link *lnk;
+	u32 selector = msg_origport(buf_msg(buf)) & 1;
+	u32 unused_max_pkt;
+	int res;
+
+	/* messages for this node are passed straight to the port layer */
+	if (destnode == tipc_own_addr)
+		return tipc_port_recv_msg(buf);
+
+	read_lock_bh(&tipc_net_lock);
+	node = tipc_node_find(destnode);
+	if (unlikely(!node))
+		goto reject;
+
+	tipc_node_lock(node);
+	lnk = node->active_links[selector];
+	if (unlikely(!lnk)) {
+		tipc_node_unlock(node);
+		goto reject;
+	}
+	res = link_send_buf_fast(lnk, buf, &unused_max_pkt);
+	tipc_node_unlock(node);
+	read_unlock_bh(&tipc_net_lock);
+	return res;
+
+reject:
+	/* no node or no active link: reject, but still report the data size */
+	read_unlock_bh(&tipc_net_lock);
+	res = msg_data_sz(buf_msg(buf));
+	tipc_reject_msg(buf, TIPC_ERR_NO_NODE);
+	return res;
+}
+
+
+/*
+ * tipc_link_send_sections_fast: Entry for messages where the
+ * destination processor is known and the header is complete,
+ * except for total message length.
+ * Returns user data length or errno.
+ */
+int tipc_link_send_sections_fast(struct tipc_port *sender,
+				 struct iovec const *msg_sect,
+				 const u32 num_sect,
+				 unsigned int total_len,
+				 u32 destaddr)
+{
+	struct tipc_msg *hdr = &sender->phdr;
+	struct link *l_ptr;
+	struct sk_buff *buf;
+	struct tipc_node *node;
+	int res;
+	u32 selector = msg_origport(hdr) & 1;
+
+again:
+	/*
+	 * Try building message using port's max_pkt hint.
+	 * (Must not hold any locks while building message.)
+	 */
+
+	res = tipc_msg_build(hdr, msg_sect, num_sect, total_len,
+			     sender->max_pkt, !sender->user_port, &buf);
+
+	read_lock_bh(&tipc_net_lock);
+	node = tipc_node_find(destaddr);
+	if (likely(node)) {
+		tipc_node_lock(node);
+		l_ptr = node->active_links[selector];
+		if (likely(l_ptr)) {
+			if (likely(buf)) {
+				res = link_send_buf_fast(l_ptr, buf,
+							 &sender->max_pkt);
+/*
+ * 'exit' is also jumped to from the checks further down; in every case
+ * the node lock and tipc_net_lock are held and are released here.
+ */
+exit:
+				tipc_node_unlock(node);
+				read_unlock_bh(&tipc_net_lock);
+				return res;
+			}
+
+			/* Exit if build request was invalid */
+
+			if (unlikely(res < 0))
+				goto exit;
+
+			/* Exit if link (or bearer) is congested */
+
+			if (link_congested(l_ptr) ||
+			    !list_empty(&l_ptr->b_ptr->cong_links)) {
+				res = link_schedule_port(l_ptr,
+							 sender->ref, res);
+				goto exit;
+			}
+
+			/*
+			 * Message size exceeds max_pkt hint; update hint,
+			 * then re-try fast path or fragment the message
+			 */
+
+			sender->max_pkt = l_ptr->max_pkt;
+			tipc_node_unlock(node);
+			read_unlock_bh(&tipc_net_lock);
+
+
+			/* both locks are dropped before retrying or fragmenting */
+			if ((msg_hdr_sz(hdr) + res) <= sender->max_pkt)
+				goto again;
+
+			return link_send_sections_long(sender, msg_sect,
+						       num_sect, total_len,
+						       destaddr);
+		}
+		tipc_node_unlock(node);
+	}
+	read_unlock_bh(&tipc_net_lock);
+
+	/* Couldn't find a link to the destination node */
+
+	if (buf)
+		return tipc_reject_msg(buf, TIPC_ERR_NO_NODE);
+	if (res >= 0)
+		return tipc_port_reject_sections(sender, hdr, msg_sect, num_sect,
+						 total_len, TIPC_ERR_NO_NODE);
+	/* negative result from tipc_msg_build() is passed back unchanged */
+	return res;
+}
+
+/*
+ * link_discard_buf_chain(): Release every buffer of a chain linked
+ * through ->next.  A NULL chain is accepted and is a no-op.
+ */
+static void link_discard_buf_chain(struct sk_buff *buf_chain)
+{
+	struct sk_buff *next;
+
+	while (buf_chain) {
+		next = buf_chain->next;
+		buf_discard(buf_chain);
+		buf_chain = next;
+	}
+}
+
+/*
+ * link_send_sections_long(): Entry for long messages where the
+ * destination node is known and the header is complete,
+ * inclusive total message length.
+ * Link and bearer congestion status have been checked to be ok,
+ * and are ignored if they change.
+ *
+ * Note that fragments do not use the full link MTU so that they won't have
+ * to undergo refragmentation if link changeover causes them to be sent
+ * over another link with an additional tunnel header added as prefix.
+ * (Refragmentation will still occur if the other link has a smaller MTU.)
+ *
+ * The complete fragment chain is built first, using the sender's max_pkt
+ * hint; only then is a link selected.  If that link's max_pkt is smaller
+ * than the value used for fragmenting, the hint is updated, the chain is
+ * released and the message is fragmented again from scratch.
+ *
+ * Returns user data length or errno:
+ *   -ENOMEM  a fragment buffer could not be allocated
+ *   -EFAULT  copying a section from user space failed
+ * If no node or active link is found for destaddr, the result of
+ * tipc_port_reject_sections() is returned instead.
+ */
+static int link_send_sections_long(struct tipc_port *sender,
+				   struct iovec const *msg_sect,
+				   u32 num_sect,
+				   unsigned int total_len,
+				   u32 destaddr)
+{
+	struct link *l_ptr;
+	struct tipc_node *node;
+	struct tipc_msg *hdr = &sender->phdr;
+	u32 dsz = total_len;
+	u32 max_pkt, fragm_sz, rest;
+	struct tipc_msg fragm_hdr;
+	struct sk_buff *buf, *buf_chain, *prev;
+	u32 fragm_crs, fragm_rest, hsz, sect_rest;
+	const unchar *sect_crs;
+	int curr_sect;
+	u32 fragm_no;
+
+again:
+	fragm_no = 1;
+	max_pkt = sender->max_pkt - INT_H_SIZE;
+		/* leave room for tunnel header in case of link changeover */
+	fragm_sz = max_pkt - INT_H_SIZE;
+		/* leave room for fragmentation header in each fragment */
+	rest = dsz;
+	fragm_crs = 0;
+	fragm_rest = 0;
+	sect_rest = 0;
+	sect_crs = NULL;
+	curr_sect = -1;
+
+	/* Prepare reusable fragment header: */
+
+	tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT,
+		 INT_H_SIZE, msg_destnode(hdr));
+	msg_set_size(&fragm_hdr, max_pkt);
+	msg_set_fragm_no(&fragm_hdr, 1);
+
+	/* Prepare header of first fragment: */
+
+	buf_chain = buf = tipc_buf_acquire(max_pkt);
+	if (!buf)
+		return -ENOMEM;
+	buf->next = NULL;
+	skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE);
+	hsz = msg_hdr_sz(hdr);
+	skb_copy_to_linear_data_offset(buf, INT_H_SIZE, hdr, hsz);
+
+	/* Chop up message: */
+
+	fragm_crs = INT_H_SIZE + hsz;
+	fragm_rest = fragm_sz - hsz;
+
+	do {		/* For all sections */
+		u32 sz;
+
+		/* Current section exhausted: step to the next one */
+		if (!sect_rest) {
+			sect_rest = msg_sect[++curr_sect].iov_len;
+			sect_crs = (const unchar *)msg_sect[curr_sect].iov_base;
+		}
+
+		/* Copy as much as fits in both the section and the fragment */
+		if (sect_rest < fragm_rest)
+			sz = sect_rest;
+		else
+			sz = fragm_rest;
+
+		if (likely(!sender->user_port)) {
+			if (copy_from_user(buf->data + fragm_crs, sect_crs, sz)) {
+				link_discard_buf_chain(buf_chain);
+				return -EFAULT;
+			}
+		} else
+			skb_copy_to_linear_data_offset(buf, fragm_crs,
+						       sect_crs, sz);
+		sect_crs += sz;
+		sect_rest -= sz;
+		fragm_crs += sz;
+		fragm_rest -= sz;
+		rest -= sz;
+
+		if (!fragm_rest && rest) {
+
+			/* Initiate new fragment: */
+			if (rest <= fragm_sz) {
+				fragm_sz = rest;
+				msg_set_type(&fragm_hdr, LAST_FRAGMENT);
+			} else {
+				msg_set_type(&fragm_hdr, FRAGMENT);
+			}
+			msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
+			msg_set_fragm_no(&fragm_hdr, ++fragm_no);
+			prev = buf;
+			buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
+			if (!buf) {
+				/*
+				 * Allocation failure, not a bad user
+				 * pointer: report it the same way as for
+				 * the first fragment.
+				 */
+				link_discard_buf_chain(buf_chain);
+				return -ENOMEM;
+			}
+
+			buf->next = NULL;
+			prev->next = buf;
+			skb_copy_to_linear_data(buf, &fragm_hdr, INT_H_SIZE);
+			fragm_crs = INT_H_SIZE;
+			fragm_rest = fragm_sz;
+		}
+	} while (rest > 0);
+
+	/*
+	 * Now we have a buffer chain. Select a link and check
+	 * that packet size is still OK
+	 */
+	node = tipc_node_find(destaddr);
+	if (likely(node)) {
+		tipc_node_lock(node);
+		l_ptr = node->active_links[sender->ref & 1];
+		if (!l_ptr) {
+			tipc_node_unlock(node);
+			goto reject;
+		}
+		if (l_ptr->max_pkt < max_pkt) {
+			/* Fragments are too big for this link: start over */
+			sender->max_pkt = l_ptr->max_pkt;
+			tipc_node_unlock(node);
+			link_discard_buf_chain(buf_chain);
+			goto again;
+		}
+	} else {
+reject:
+		link_discard_buf_chain(buf_chain);
+		return tipc_port_reject_sections(sender, hdr, msg_sect, num_sect,
+						 total_len, TIPC_ERR_NO_NODE);
+	}
+
+	/* Append chain of fragments to send queue & send them */
+
+	l_ptr->long_msg_seq_no++;
+	link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no);
+	l_ptr->stats.sent_fragments += fragm_no;
+	l_ptr->stats.sent_fragmented++;
+	tipc_link_push_queue(l_ptr);
+	tipc_node_unlock(node);
+	return dsz;
+}
+
+/*
+ * tipc_link_push_packet: Push one unsent packet to the media
+ *
+ * Candidates are tried in this order: a pending retransmission, the
+ * deferred protocol message, then the next unsent data message (only if
+ * the send window has room).
+ *
+ * Returns 0 if a packet was handed to the bearer, PUSH_FAILED if the
+ * bearer refused it, PUSH_FINISHED if there was nothing eligible to send.
+ */
+u32 tipc_link_push_packet(struct link *l_ptr)
+{
+	struct sk_buff *skb = l_ptr->first_out;
+	u32 rq_len = l_ptr->retransm_queue_size;
+	u32 rq_head = l_ptr->retransm_queue_head;
+	struct tipc_msg *msg;
+	u32 next_seq;
+	u32 first_seq;
+
+	/*
+	 * Resynchronise the retransmit window with the send queue; buffers
+	 * may have been released since the retransmission was interrupted.
+	 */
+	if (rq_len && skb) {
+		u32 last = lesser(mod(rq_head + rq_len),
+				  link_last_sent(l_ptr));
+		u32 first = msg_seqno(buf_msg(skb));
+
+		for (; skb && less(first, rq_head); skb = skb->next)
+			first = mod(first + 1);
+
+		rq_head = first;
+		rq_len = mod(last - first);
+		l_ptr->retransm_queue_head = rq_head;
+		l_ptr->retransm_queue_size = rq_len;
+	}
+
+	/* 1) Resume an interrupted retransmission */
+	if (rq_len && skb) {
+		msg = buf_msg(skb);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+		if (!tipc_bearer_send(l_ptr->b_ptr, skb, &l_ptr->media_addr)) {
+			l_ptr->stats.bearer_congs++;
+			return PUSH_FAILED;
+		}
+		l_ptr->retransm_queue_head = mod(++rq_head);
+		l_ptr->retransm_queue_size = --rq_len;
+		l_ptr->stats.retransmitted++;
+		return 0;
+	}
+
+	/* 2) Deferred protocol message */
+	skb = l_ptr->proto_msg_queue;
+	if (skb) {
+		msg = buf_msg(skb);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+		if (!tipc_bearer_send(l_ptr->b_ptr, skb, &l_ptr->media_addr)) {
+			l_ptr->stats.bearer_congs++;
+			return PUSH_FAILED;
+		}
+		l_ptr->unacked_window = 0;
+		buf_discard(skb);
+		l_ptr->proto_msg_queue = NULL;
+		return 0;
+	}
+
+	/* 3) One deferred data message, provided the send window is open */
+	skb = l_ptr->next_out;
+	if (!skb)
+		return PUSH_FINISHED;
+
+	msg = buf_msg(skb);
+	next_seq = msg_seqno(msg);
+	first_seq = msg_seqno(buf_msg(l_ptr->first_out));
+	if (mod(next_seq - first_seq) >= l_ptr->queue_limit[0])
+		return PUSH_FINISHED;
+
+	msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+	msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+	if (!tipc_bearer_send(l_ptr->b_ptr, skb, &l_ptr->media_addr)) {
+		l_ptr->stats.bearer_congs++;
+		return PUSH_FAILED;
+	}
+
+	/* A bundle that has been sent must not receive further messages */
+	if (msg_user(msg) == MSG_BUNDLER)
+		msg_set_type(msg, CLOSED_MSG);
+	l_ptr->next_out = skb->next;
+	return 0;
+}
+
+/*
+ * tipc_link_push_queue(): push out the unsent messages of a link where
+ *                         congestion has abated. Node is locked.
+ *
+ * Keeps calling tipc_link_push_packet() for as long as it returns 0.
+ * If pushing stops with PUSH_FAILED the link is scheduled on its bearer.
+ */
+void tipc_link_push_queue(struct link *l_ptr)
+{
+	u32 status;
+
+	if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr))
+		return;
+
+	for (;;) {
+		status = tipc_link_push_packet(l_ptr);
+		if (status)
+			break;
+	}
+
+	if (status == PUSH_FAILED)
+		tipc_bearer_schedule(l_ptr->b_ptr, l_ptr);
+}
+
+/*
+ * link_reset_all(): Reset every existing link to the node with the given
+ * network address.  Does nothing if the node can no longer be found.
+ * Takes tipc_net_lock (read) and the node lock itself.
+ */
+static void link_reset_all(unsigned long addr)
+{
+	struct tipc_node *n_ptr;
+	char addr_string[16];
+	u32 bearer_id;
+
+	read_lock_bh(&tipc_net_lock);
+
+	n_ptr = tipc_node_find((u32)addr);
+	if (n_ptr) {
+		tipc_node_lock(n_ptr);
+
+		warn("Resetting all links to %s\n",
+		     tipc_addr_string_fill(addr_string, n_ptr->addr));
+
+		for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+			struct link *l_ptr = n_ptr->links[bearer_id];
+
+			if (!l_ptr)
+				continue;
+			link_print(l_ptr, "Resetting link\n");
+			tipc_link_reset(l_ptr);
+		}
+
+		tipc_node_unlock(n_ptr);
+	}
+
+	read_unlock_bh(&tipc_net_lock);
+}
+
+/*
+ * link_retransmit_failure(): React to repeated retransmission of the same
+ * packet.
+ *
+ * A link with a non-zero address is treated as a standard link and is
+ * simply reset.  Otherwise (broadcast link) diagnostic info about the node
+ * returned by tipc_bclink_retransmit_to() is logged, a deferred
+ * link_reset_all() for that node is requested via tipc_k_signal(), and
+ * the stale counter is cleared.
+ */
+static void link_retransmit_failure(struct link *l_ptr, struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct tipc_node *n_ptr;
+	char addr_string[16];
+
+	warn("Retransmission failure on link <%s>\n", l_ptr->name);
+
+	/* Failure on standard link */
+	if (l_ptr->addr) {
+		link_print(l_ptr, "Resetting link\n");
+		tipc_link_reset(l_ptr);
+		return;
+	}
+
+	/* Failure on broadcast link */
+	info("Msg seq number: %u,  ", msg_seqno(msg));
+	info("Outstanding acks: %lu\n",
+	     (unsigned long) TIPC_SKB_CB(buf)->handle);
+
+	n_ptr = tipc_bclink_retransmit_to();
+	tipc_node_lock(n_ptr);
+
+	tipc_addr_string_fill(addr_string, n_ptr->addr);
+	info("Multicast link info for %s\n", addr_string);
+	info("Supported: %d,  ", n_ptr->bclink.supported);
+	info("Acked: %u\n", n_ptr->bclink.acked);
+	info("Last in: %u,  ", n_ptr->bclink.last_in);
+	info("Gap after: %u,  ", n_ptr->bclink.gap_after);
+	info("Gap to: %u\n", n_ptr->bclink.gap_to);
+	info("Nack sync: %u\n\n", n_ptr->bclink.nack_sync);
+
+	/* The reset itself is run later by link_reset_all() */
+	tipc_k_signal((Handler)link_reset_all, (unsigned long)n_ptr->addr);
+
+	tipc_node_unlock(n_ptr);
+
+	l_ptr->stale_count = 0;
+}
+
+/*
+ * tipc_link_retransmit(): Retransmit up to 'retransmits' packets from the
+ * send queue, starting at 'buf'.  A NULL 'buf' is ignored.
+ *
+ * If the bearer is congested the request is only recorded in the link's
+ * retransmit queue fields.  On an uncongested bearer, more than 100
+ * consecutive requests for the same first sequence number are treated as
+ * a retransmission failure.
+ */
+void tipc_link_retransmit(struct link *l_ptr, struct sk_buff *buf,
+			  u32 retransmits)
+{
+	struct tipc_msg *msg;
+	u32 seqno;
+
+	if (!buf)
+		return;
+
+	msg = buf_msg(buf);
+	seqno = msg_seqno(msg);
+
+	if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) {
+		/* Remember the request; it cannot be carried out now */
+		if (l_ptr->retransm_queue_size != 0) {
+			err("Unexpected retransmit on link %s (qsize=%d)\n",
+			    l_ptr->name, l_ptr->retransm_queue_size);
+			return;
+		}
+		l_ptr->retransm_queue_head = seqno;
+		l_ptr->retransm_queue_size = retransmits;
+		return;
+	}
+
+	/* Detect repeated retransmit failures on uncongested bearer */
+	if (l_ptr->last_retransmitted != seqno) {
+		l_ptr->last_retransmitted = seqno;
+		l_ptr->stale_count = 1;
+	} else if (++l_ptr->stale_count > 100) {
+		link_retransmit_failure(l_ptr, buf);
+		return;
+	}
+
+	while (retransmits && (buf != l_ptr->next_out) && buf) {
+		msg = buf_msg(buf);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+
+		if (!tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+			/* Bearer refused: record where to resume later */
+			tipc_bearer_schedule(l_ptr->b_ptr, l_ptr);
+			l_ptr->stats.bearer_congs++;
+			l_ptr->retransm_queue_head = msg_seqno(buf_msg(buf));
+			l_ptr->retransm_queue_size = retransmits;
+			return;
+		}
+
+		buf = buf->next;
+		retransmits--;
+		l_ptr->stats.retransmitted++;
+	}
+
+	/* Nothing left pending */
+	l_ptr->retransm_queue_size = 0;
+	l_ptr->retransm_queue_head = 0;
+}
+
+/**
+ * link_insert_deferred_queue - insert deferred messages back into receive chain
+ * @l_ptr: link whose deferred queue is examined
+ * @buf: current head of the receive chain
+ *
+ * If the oldest deferred message carries the next expected sequence number,
+ * the whole deferred queue is placed in front of @buf and the queue is
+ * marked empty.  Returns the (possibly new) head of the receive chain.
+ */
+
+static struct sk_buff *link_insert_deferred_queue(struct link *l_ptr,
+						  struct sk_buff *buf)
+{
+	struct sk_buff *oldest = l_ptr->oldest_deferred_in;
+
+	if (oldest == NULL)
+		return buf;
+
+	if (msg_seqno(buf_msg(oldest)) != mod(l_ptr->next_in_no))
+		return buf;
+
+	/* Splice: deferred queue first, then the existing chain */
+	l_ptr->newest_deferred_in->next = buf;
+	l_ptr->oldest_deferred_in = NULL;
+	l_ptr->deferred_inqueue_sz = 0;
+	return oldest;
+}
+
+/**
+ * link_recv_buf_validate - validate basic format of received message
+ * @buf: received message buffer
+ *
+ * Checks that the buffer holds a TIPC message of the supported version,
+ * that its header is at least as large as the minimum for its kind, and
+ * that the buffer contains at least as many bytes as the header claims.
+ * Finally makes sure the complete header is available in the linear part
+ * of the buffer, to simplify later access to header fields.
+ *
+ * Note: Having extra info present in the message header or data areas is OK.
+ * TIPC will ignore the excess, under the assumption that it is optional info
+ * introduced by a later release of the protocol.
+ *
+ * Returns 0 if the message is rejected, otherwise the result of
+ * pskb_may_pull().
+ */
+
+static int link_recv_buf_validate(struct sk_buff *buf)
+{
+	/* Minimum header size of a data message, indexed by message type */
+	static u32 min_data_hdr_size[8] = {
+		SHORT_H_SIZE, MCAST_H_SIZE, LONG_H_SIZE, DIR_MSG_H_SIZE,
+		MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE, MAX_H_SIZE
+		};
+
+	struct tipc_msg *msg;
+	u32 tipc_hdr[2];
+	u32 size, hdr_size, min_hdr_size;
+
+	if (unlikely(buf->len < MIN_H_SIZE))
+		return 0;
+
+	/* Only the first two header words are needed for the checks below */
+	msg = skb_header_pointer(buf, 0, sizeof(tipc_hdr), tipc_hdr);
+	if (msg == NULL)
+		return 0;
+
+	if (unlikely(msg_version(msg) != TIPC_VERSION))
+		return 0;
+
+	size = msg_size(msg);
+	hdr_size = msg_hdr_sz(msg);
+	if (msg_isdata(msg))
+		min_hdr_size = min_data_hdr_size[msg_type(msg)];
+	else
+		min_hdr_size = INT_H_SIZE;
+
+	if (unlikely(hdr_size < min_hdr_size))
+		return 0;
+	if (unlikely(size < hdr_size))
+		return 0;
+	if (unlikely(buf->len < size))
+		return 0;
+	if (unlikely(size - hdr_size > TIPC_MAX_USER_MSG_SIZE))
+		return 0;
+
+	return pskb_may_pull(buf, hdr_size);
+}
+
+/**
+ * tipc_recv_msg - process TIPC messages arriving from off-node
+ * @head: pointer to message buffer chain
+ * @b_ptr: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held.  Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ *
+ * The chain is linked through ->next and is processed one buffer at a
+ * time while tipc_net_lock is held for reading.  A buffer either leaves
+ * this routine by being passed to another routine (followed by
+ * "continue"), is put back at the front of the chain, or ends up at the
+ * "cont" label where it is released with buf_discard().
+ */
+
+void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *b_ptr)
+{
+	read_lock_bh(&tipc_net_lock);
+	while (head) {
+		struct tipc_node *n_ptr;
+		struct link *l_ptr;
+		struct sk_buff *crs;
+		struct sk_buff *buf = head;
+		struct tipc_msg *msg;
+		u32 seq_no;
+		u32 ackd;
+		u32 released = 0;	/* buffers freed from the send queue */
+		int type;
+
+		head = head->next;	/* step past buf */
+
+		/* Ensure bearer is still enabled */
+
+		if (unlikely(!b_ptr->active))
+			goto cont;
+
+		/* Ensure message is well-formed */
+
+		if (unlikely(!link_recv_buf_validate(buf)))
+			goto cont;
+
+		/* Ensure message data is a single contiguous unit */
+
+		if (unlikely(buf_linearize(buf)))
+			goto cont;
+
+		/*
+		 * Handle arrival of a non-unicast link message
+		 * (buf is handed over; it is not released here)
+		 */
+
+		msg = buf_msg(buf);
+
+		if (unlikely(msg_non_seq(msg))) {
+			if (msg_user(msg) ==  LINK_CONFIG)
+				tipc_disc_recv_msg(buf, b_ptr);
+			else
+				tipc_bclink_recv_pkt(buf);
+			continue;
+		}
+
+		if (unlikely(!msg_short(msg) &&
+			     (msg_destnode(msg) != tipc_own_addr)))
+			goto cont;	/* not short and not addressed to us */
+
+		/* Discard non-routeable messages destined for another node */
+
+		if (unlikely(!msg_isdata(msg) &&
+			     (msg_destnode(msg) != tipc_own_addr))) {
+			if ((msg_user(msg) != CONN_MANAGER) &&
+			    (msg_user(msg) != MSG_FRAGMENTER))
+				goto cont;
+		}
+
+		/*
+		 * Locate neighboring node that sent message; the node lock
+		 * taken here is held until just before the message leaves
+		 * this routine
+		 */
+
+		n_ptr = tipc_node_find(msg_prevnode(msg));
+		if (unlikely(!n_ptr))
+			goto cont;
+		tipc_node_lock(n_ptr);
+
+		/* Don't talk to neighbor during cleanup after last session */
+
+		if (n_ptr->cleanup_required) {
+			tipc_node_unlock(n_ptr);
+			goto cont;
+		}
+
+		/* Locate unicast link endpoint that should handle message */
+
+		l_ptr = n_ptr->links[b_ptr->identity];
+		if (unlikely(!l_ptr)) {
+			tipc_node_unlock(n_ptr);
+			goto cont;
+		}
+
+		/* Validate message sequence number info */
+
+		seq_no = msg_seqno(msg);
+		ackd = msg_ack(msg);
+
+		/*
+		 * Release acked messages: first on the broadcast link, then
+		 * every already-sent buffer (those before next_out) on this
+		 * link whose sequence number is covered by the ack
+		 */
+
+		if (less(n_ptr->bclink.acked, msg_bcast_ack(msg))) {
+			if (tipc_node_is_up(n_ptr) && n_ptr->bclink.supported)
+				tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg));
+		}
+
+		crs = l_ptr->first_out;
+		while ((crs != l_ptr->next_out) &&
+		       less_eq(msg_seqno(buf_msg(crs)), ackd)) {
+			struct sk_buff *next = crs->next;
+
+			buf_discard(crs);
+			crs = next;
+			released++;
+		}
+		if (released) {
+			l_ptr->first_out = crs;
+			l_ptr->out_queue_size -= released;
+		}
+
+		/*
+		 * Try sending any messages link endpoint has pending, wake
+		 * up waiting ports, and send an explicit acknowledgement
+		 * once TIPC_MIN_LINK_WIN messages have arrived unacked
+		 */
+
+		if (unlikely(l_ptr->next_out))
+			tipc_link_push_queue(l_ptr);
+		if (unlikely(!list_empty(&l_ptr->waiting_ports)))
+			tipc_link_wakeup_ports(l_ptr, 0);
+		if (unlikely(++l_ptr->unacked_window >= TIPC_MIN_LINK_WIN)) {
+			l_ptr->stats.sent_acks++;
+			tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
+		}
+
+		/*
+		 * Now (finally!) process the incoming message.
+		 * protocol_check is also re-entered from the
+		 * CHANGEOVER_PROTOCOL case below, after l_ptr, buf, msg and
+		 * seq_no have been replaced by those of the tunneled message.
+		 */
+
+protocol_check:
+		if (likely(link_working_working(l_ptr))) {
+			if (likely(seq_no == mod(l_ptr->next_in_no))) {
+				l_ptr->next_in_no++;
+				if (unlikely(l_ptr->oldest_deferred_in))
+					head = link_insert_deferred_queue(l_ptr,
+									  head);
+				if (likely(msg_is_dest(msg, tipc_own_addr))) {
+deliver:	/* re-entered once buf/msg have been replaced, see below */
+					if (likely(msg_isdata(msg))) {
+						tipc_node_unlock(n_ptr);
+						tipc_port_recv_msg(buf);
+						continue;
+					}
+					switch (msg_user(msg)) {
+					case MSG_BUNDLER:
+						l_ptr->stats.recv_bundles++;
+						l_ptr->stats.recv_bundled +=
+							msg_msgcnt(msg);
+						tipc_node_unlock(n_ptr);
+						tipc_link_recv_bundle(buf);
+						continue;
+					case NAME_DISTRIBUTOR:
+						tipc_node_unlock(n_ptr);
+						tipc_named_recv(buf);
+						continue;
+					case CONN_MANAGER:
+						tipc_node_unlock(n_ptr);
+						tipc_port_recv_proto_msg(buf);
+						continue;
+					case MSG_FRAGMENTER:
+						l_ptr->stats.recv_fragments++;
+						if (tipc_link_recv_fragment(&l_ptr->defragm_buf,
+									    &buf, &msg)) {
+							/* non-zero: deliver what buf/msg now refer to */
+							l_ptr->stats.recv_fragmented++;
+							goto deliver;
+						}
+						break;
+					case CHANGEOVER_PROTOCOL:
+						type = msg_type(msg);
+						if (link_recv_changeover_msg(&l_ptr, &buf)) {
+							/* buf now holds the extracted inner message */
+							msg = buf_msg(buf);
+							seq_no = msg_seqno(msg);
+							if (type == ORIGINAL_MSG)
+								goto deliver;
+							goto protocol_check;
+						}
+						break;
+					default:
+						buf_discard(buf);
+						buf = NULL;
+						break;
+					}
+				}
+				tipc_node_unlock(n_ptr);
+				tipc_net_route_msg(buf);	/* buf is NULL after the default case */
+				continue;
+			}
+			link_handle_out_of_seq_msg(l_ptr, buf);
+			head = link_insert_deferred_queue(l_ptr, head);
+			tipc_node_unlock(n_ptr);
+			continue;
+		}
+
+		/* Link is not in the working-working state */
+		if (msg_user(msg) == LINK_PROTOCOL) {
+			link_recv_proto_msg(l_ptr, buf);
+			head = link_insert_deferred_queue(l_ptr, head);
+			tipc_node_unlock(n_ptr);
+			continue;
+		}
+		link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+
+		if (link_working_working(l_ptr)) {
+			/* Re-insert in front of queue */
+			buf->next = head;
+			head = buf;
+			tipc_node_unlock(n_ptr);
+			continue;
+		}
+		tipc_node_unlock(n_ptr);
+cont:
+		buf_discard(buf);
+	}
+	read_unlock_bh(&tipc_net_lock);
+}
+
+/*
+ * tipc_link_defer_pkt(): Sort a received out-of-sequence packet
+ *                        into the deferred reception queue.
+ * The queue is kept ordered by sequence number.  A packet whose sequence
+ * number is already present in the queue is discarded.
+ * Returns the increase of the queue length, i.e. 0 or 1
+ */
+
+u32 tipc_link_defer_pkt(struct sk_buff **head,
+			struct sk_buff **tail,
+			struct sk_buff *buf)
+{
+	struct sk_buff **pos;
+	u32 seq_no = msg_seqno(buf_msg(buf));
+
+	buf->next = NULL;
+
+	/* Queue is empty: buf becomes its only member */
+	if (*head == NULL) {
+		*head = *tail = buf;
+		return 1;
+	}
+
+	/* Newer than everything queued: append */
+	if (less(msg_seqno(buf_msg(*tail)), seq_no)) {
+		(*tail)->next = buf;
+		*tail = buf;
+		return 1;
+	}
+
+	/*
+	 * Walk the links of the queue; 'pos' always addresses the pointer
+	 * that refers to the buffer being examined, so inserting in front
+	 * of that buffer is a plain pointer update.
+	 */
+	for (pos = head; *pos; pos = &(*pos)->next) {
+		u32 queued = msg_seqno(buf_msg(*pos));
+
+		if (less(seq_no, queued)) {
+			buf->next = *pos;
+			*pos = buf;
+			return 1;
+		}
+		if (seq_no == queued)
+			break;
+	}
+
+	/* Message is a duplicate of an existing message */
+
+	buf_discard(buf);
+	return 0;
+}
+
+/**
+ * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet
+ * @l_ptr: link the packet arrived on
+ * @buf: the packet
+ *
+ * Link protocol messages are passed straight to link_recv_proto_msg().
+ * Any other packet is either dropped as a duplicate or sorted into the
+ * link's deferred queue.
+ */
+
+static void link_handle_out_of_seq_msg(struct link *l_ptr,
+				       struct sk_buff *buf)
+{
+	u32 seq_no = msg_seqno(buf_msg(buf));
+
+	if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) {
+		link_recv_proto_msg(l_ptr, buf);
+		return;
+	}
+
+	/* Record OOS packet arrival (force mismatch on next timeout) */
+
+	l_ptr->checkpoint--;
+
+	/* Older than what is expected next: already received */
+
+	if (less(seq_no, mod(l_ptr->next_in_no))) {
+		l_ptr->stats.duplicates++;
+		buf_discard(buf);
+		return;
+	}
+
+	/* Already present in the deferred queue */
+
+	if (!tipc_link_defer_pkt(&l_ptr->oldest_deferred_in,
+				 &l_ptr->newest_deferred_in, buf)) {
+		l_ptr->stats.duplicates++;
+		return;
+	}
+
+	/* Newly deferred: notify peer of the gap on every 16th addition */
+
+	l_ptr->deferred_inqueue_sz++;
+	l_ptr->stats.deferred_recv++;
+	if ((l_ptr->deferred_inqueue_sz % 16) == 1)
+		tipc_link_send_proto_msg(l_ptr, STATE_MSG, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Send protocol message to the other endpoint.
+ *
+ * The message is composed in the link's own l_ptr->pmsg area and then
+ * copied into a freshly acquired buffer.  Nothing is done if the link is
+ * blocked; a STATE_MSG is additionally suppressed while the link is not up
+ * (some pmsg fields have already been updated by then).
+ *
+ * msg_typ:   STATE_MSG, RESET_MSG or ACTIVATE_MSG
+ * probe_msg: non-zero marks a STATE_MSG as a probe; a probe may be
+ *            enlarged beyond the protocol header while max_pkt is still
+ *            below max_pkt_target
+ * gap:       sequence gap to report; replaced by a value computed from
+ *            the deferred queue when that queue is not empty
+ * tolerance, priority, ack_mtu: values placed in a STATE_MSG.  For the
+ *            other types the link's own tolerance, priority and
+ *            max_pkt_target are used.  Note that the priority field is
+ *            set to l_ptr->priority for every type further down.
+ *
+ * If the bearer is congested, or refuses the message, the message is kept
+ * in l_ptr->proto_msg_queue.
+ */
+void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
+			      u32 gap, u32 tolerance, u32 priority, u32 ack_mtu)
+{
+	struct sk_buff *buf = NULL;
+	struct tipc_msg *msg = l_ptr->pmsg;
+	u32 msg_size = sizeof(l_ptr->proto_msg);	/* may grow for probes */
+	int r_flag;
+
+	if (link_blocked(l_ptr))
+		return;
+	msg_set_type(msg, msg_typ);
+	msg_set_net_plane(msg, l_ptr->b_ptr->net_plane);
+	msg_set_bcast_ack(msg, mod(l_ptr->owner->bclink.last_in));
+	msg_set_last_bcast(msg, tipc_bclink_get_last_sent());
+
+	if (msg_typ == STATE_MSG) {
+		u32 next_sent = mod(l_ptr->next_out_no);
+
+		if (!tipc_link_is_up(l_ptr))
+			return;
+		if (l_ptr->next_out)	/* unsent data queued: report its seqno */
+			next_sent = msg_seqno(buf_msg(l_ptr->next_out));
+		msg_set_next_sent(msg, next_sent);
+		if (l_ptr->oldest_deferred_in) {
+			u32 rec = msg_seqno(buf_msg(l_ptr->oldest_deferred_in));
+			gap = mod(rec - mod(l_ptr->next_in_no));
+		}
+		msg_set_seq_gap(msg, gap);
+		if (gap)
+			l_ptr->stats.sent_nacks++;
+		msg_set_link_tolerance(msg, tolerance);
+		msg_set_linkprio(msg, priority);
+		msg_set_max_pkt(msg, ack_mtu);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));
+		msg_set_probe(msg, probe_msg != 0);
+		if (probe_msg) {
+			u32 mtu = l_ptr->max_pkt;
+
+			if ((mtu < l_ptr->max_pkt_target) &&
+			    link_working_working(l_ptr) &&
+			    l_ptr->fsm_msg_cnt) {
+				/* size halfway between mtu and target, multiple of 4 */
+				msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3;
+				if (l_ptr->max_pkt_probes == 10) {
+					/* 10 probes counted: lower the target, recompute */
+					l_ptr->max_pkt_target = (msg_size - 4);
+					l_ptr->max_pkt_probes = 0;
+					msg_size = (mtu + (l_ptr->max_pkt_target - mtu)/2 + 2) & ~3;
+				}
+				l_ptr->max_pkt_probes++;
+			}
+
+			l_ptr->stats.sent_probes++;
+		}
+		l_ptr->stats.sent_states++;
+	} else {		/* RESET_MSG or ACTIVATE_MSG */
+		msg_set_ack(msg, mod(l_ptr->reset_checkpoint - 1));
+		msg_set_seq_gap(msg, 0);
+		msg_set_next_sent(msg, 1);
+		msg_set_probe(msg, 0);
+		msg_set_link_tolerance(msg, l_ptr->tolerance);
+		msg_set_linkprio(msg, l_ptr->priority);
+		msg_set_max_pkt(msg, l_ptr->max_pkt_target);
+	}
+
+	/* redundant-link flag: set when the owner has more working links than this one */
+	r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr));
+	msg_set_redundant_link(msg, r_flag);
+	msg_set_linkprio(msg, l_ptr->priority);	/* overrides the value set above */
+
+	/* Ensure sequence number will not fit : */
+
+	msg_set_seqno(msg, mod(l_ptr->next_out_no + (0xffff/2)));
+
+	/*
+	 * Congestion?  Keep a single deferred protocol message; a buffer
+	 * already queued is reused and its contents are overwritten.
+	 */
+
+	if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) {
+		if (!l_ptr->proto_msg_queue) {
+			l_ptr->proto_msg_queue =
+				tipc_buf_acquire(sizeof(l_ptr->proto_msg));
+		}
+		buf = l_ptr->proto_msg_queue;
+		if (!buf)
+			return;
+		skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
+		return;
+	}
+
+	/*
+	 * Message can be sent.  Only sizeof(l_ptr->proto_msg) bytes are
+	 * copied, even when msg_size is larger.
+	 */
+
+	buf = tipc_buf_acquire(msg_size);
+	if (!buf)
+		return;
+
+	skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg));
+	msg_set_size(buf_msg(buf), msg_size);
+
+	if (tipc_bearer_send(l_ptr->b_ptr, buf, &l_ptr->media_addr)) {
+		l_ptr->unacked_window = 0;
+		buf_discard(buf);
+		return;
+	}
+
+	/* New congestion */
+	tipc_bearer_schedule(l_ptr->b_ptr, l_ptr);
+	l_ptr->proto_msg_queue = buf;
+	l_ptr->stats.bearer_congs++;
+}
+
+/*
+ * Receive protocol message :
+ * Note that network plane id propagates through the network, and may
+ * change at any time. The node with lowest address rules
+ *
+ * Handles RESET_MSG, ACTIVATE_MSG and STATE_MSG; any other type is
+ * ignored.  The buffer is always released before returning.
+ *
+ * RESET_MSG/ACTIVATE_MSG carry the peer's interface name as message data.
+ * That data comes from the network, so it is copied into the tail of
+ * l_ptr->name with an explicit bound: never more than the room left in
+ * the name, and never more than the message's data size.
+ */
+
+static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf)
+{
+	u32 rec_gap = 0;
+	u32 max_pkt_info;
+	u32 max_pkt_ack;
+	u32 msg_tol;
+	struct tipc_msg *msg = buf_msg(buf);
+	char *if_name;
+	u32 room;
+	u32 len;
+
+	if (link_blocked(l_ptr))
+		goto exit;
+
+	/* record unnumbered packet arrival (force mismatch on next timeout) */
+
+	l_ptr->checkpoint--;
+
+	if (l_ptr->b_ptr->net_plane != msg_net_plane(msg))
+		if (tipc_own_addr > msg_prevnode(msg))
+			l_ptr->b_ptr->net_plane = msg_net_plane(msg);
+
+	l_ptr->owner->permit_changeover = msg_redundant_link(msg);
+
+	switch (msg_type(msg)) {
+
+	case RESET_MSG:
+		if (!link_working_unknown(l_ptr) &&
+		    (l_ptr->peer_session != INVALID_SESSION)) {
+			if (msg_session(msg) == l_ptr->peer_session)
+				break; /* duplicate: ignore */
+		}
+		/* fall thru' */
+	case ACTIVATE_MSG:
+		/* Update link settings according other endpoint's values */
+
+		/*
+		 * Replace whatever follows the last ':' of the link name
+		 * with the peer's interface name.  sizeof() relies on
+		 * l_ptr->name being an array member of struct link.
+		 */
+		if_name = strrchr(l_ptr->name, ':');
+		if (if_name) {
+			if_name++;
+			/* room excludes the terminating NUL */
+			room = sizeof(l_ptr->name) - (if_name - l_ptr->name) - 1;
+			len = msg_data_sz(msg);
+			if (len > room)
+				len = room;
+			len = strnlen((char *)msg_data(msg), len);
+			memcpy(if_name, msg_data(msg), len);
+			if_name[len] = '\0';
+		}
+
+		/* Tolerance and priority are only ever raised here */
+		msg_tol = msg_link_tolerance(msg);
+		if (msg_tol > l_ptr->tolerance)
+			link_set_supervision_props(l_ptr, msg_tol);
+
+		if (msg_linkprio(msg) > l_ptr->priority)
+			l_ptr->priority = msg_linkprio(msg);
+
+		/* Never exceed the packet size the peer advertises */
+		max_pkt_info = msg_max_pkt(msg);
+		if (max_pkt_info) {
+			if (max_pkt_info < l_ptr->max_pkt_target)
+				l_ptr->max_pkt_target = max_pkt_info;
+			if (l_ptr->max_pkt > l_ptr->max_pkt_target)
+				l_ptr->max_pkt = l_ptr->max_pkt_target;
+		} else {
+			l_ptr->max_pkt = l_ptr->max_pkt_target;
+		}
+		l_ptr->owner->bclink.supported = (max_pkt_info != 0);
+
+		link_state_event(l_ptr, msg_type(msg));
+
+		l_ptr->peer_session = msg_session(msg);
+		l_ptr->peer_bearer_id = msg_bearer_id(msg);
+
+		/* Synchronize broadcast sequence numbers */
+		if (!tipc_node_redundant_links(l_ptr->owner))
+			l_ptr->owner->bclink.last_in = mod(msg_last_bcast(msg));
+		break;
+	case STATE_MSG:
+
+		msg_tol = msg_link_tolerance(msg);
+		if (msg_tol)
+			link_set_supervision_props(l_ptr, msg_tol);
+
+		if (msg_linkprio(msg) &&
+		    (msg_linkprio(msg) != l_ptr->priority)) {
+			warn("Resetting link <%s>, priority change %u->%u\n",
+			     l_ptr->name, l_ptr->priority, msg_linkprio(msg));
+			l_ptr->priority = msg_linkprio(msg);
+			tipc_link_reset(l_ptr); /* Enforce change to take effect */
+			break;
+		}
+		link_state_event(l_ptr, TRAFFIC_MSG_EVT);
+		l_ptr->stats.recv_states++;
+		if (link_reset_unknown(l_ptr))
+			break;
+
+		/* Peer has sent packets we have not received yet */
+		if (less_eq(mod(l_ptr->next_in_no), msg_next_sent(msg))) {
+			rec_gap = mod(msg_next_sent(msg) -
+				      mod(l_ptr->next_in_no));
+		}
+
+		/* Peer acknowledged a larger packet size: adopt it */
+		max_pkt_ack = msg_max_pkt(msg);
+		if (max_pkt_ack > l_ptr->max_pkt) {
+			l_ptr->max_pkt = max_pkt_ack;
+			l_ptr->max_pkt_probes = 0;
+		}
+
+		/* An oversized probe arrived intact: echo its size back */
+		max_pkt_ack = 0;
+		if (msg_probe(msg)) {
+			l_ptr->stats.recv_probes++;
+			if (msg_size(msg) > sizeof(l_ptr->proto_msg))
+				max_pkt_ack = msg_size(msg);
+		}
+
+		/* Protocol message before retransmits, reduce loss risk */
+
+		tipc_bclink_check_gap(l_ptr->owner, msg_last_bcast(msg));
+
+		if (rec_gap || (msg_probe(msg))) {
+			tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+						 0, rec_gap, 0, 0, max_pkt_ack);
+		}
+		if (msg_seq_gap(msg)) {
+			l_ptr->stats.recv_nacks++;
+			tipc_link_retransmit(l_ptr, l_ptr->first_out,
+					     msg_seq_gap(msg));
+		}
+		break;
+	}
+exit:
+	buf_discard(buf);
+}
+
+
+/*
+ * tipc_link_tunnel(): Send one message via a link belonging to
+ * another bearer. Owner node is locked.
+ *
+ * The message is prefixed with a copy of tunnel_hdr (whose size field is
+ * updated first) and sent over the owner's active link chosen by the
+ * lowest bit of selector.  Failures are logged and otherwise ignored.
+ */
+static void tipc_link_tunnel(struct link *l_ptr,
+			     struct tipc_msg *tunnel_hdr,
+			     struct tipc_msg  *msg,
+			     u32 selector)
+{
+	u32 length = msg_size(msg);
+	u32 tunnel_sz = length + INT_H_SIZE;
+	struct link *tunnel = l_ptr->owner->active_links[selector & 1];
+	struct sk_buff *buf;
+
+	if (!tipc_link_is_up(tunnel)) {
+		warn("Link changeover error, "
+		     "tunnel link no longer available\n");
+		return;
+	}
+
+	msg_set_size(tunnel_hdr, tunnel_sz);
+
+	buf = tipc_buf_acquire(tunnel_sz);
+	if (!buf) {
+		warn("Link changeover error, "
+		     "unable to send tunnel msg\n");
+		return;
+	}
+
+	/* Tunnel header first, wrapped message right behind it */
+	skb_copy_to_linear_data(buf, tunnel_hdr, INT_H_SIZE);
+	skb_copy_to_linear_data_offset(buf, INT_H_SIZE, msg, length);
+	tipc_link_send_buf(tunnel, buf);
+}
+
+
+
+/*
+ * tipc_link_changeover(): Send whole message queue via the remaining link
+ *                         Owner node is locked.
+ *
+ * Every buffer in the link's send queue is tunneled as an ORIGINAL_MSG.
+ * When the owner's two active link slots differ, bundles are opened up and
+ * their messages are tunneled one by one, each with its own link selector.
+ * An empty send queue still results in one header-only changeover message.
+ */
+
+void tipc_link_changeover(struct link *l_ptr)
+{
+	struct link *tunnel = l_ptr->owner->active_links[0];
+	struct tipc_msg tunnel_hdr;
+	struct sk_buff *crs;
+	int split_bundles;
+
+	if (!tunnel)
+		return;
+
+	if (!l_ptr->owner->permit_changeover) {
+		warn("Link changeover error, "
+		     "peer did not permit changeover\n");
+		return;
+	}
+
+	tipc_msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL,
+		 ORIGINAL_MSG, INT_H_SIZE, l_ptr->addr);
+	msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
+	msg_set_msgcnt(&tunnel_hdr, l_ptr->out_queue_size);
+
+	/* Nothing queued: tell the peer so with a bare header */
+	if (!l_ptr->first_out) {
+		struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE);
+
+		if (!buf) {
+			warn("Link changeover error, "
+			     "unable to send changeover msg\n");
+			return;
+		}
+		skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE);
+		msg_set_size(&tunnel_hdr, INT_H_SIZE);
+		tipc_link_send_buf(tunnel, buf);
+		return;
+	}
+
+	split_bundles = (l_ptr->owner->active_links[0] !=
+			 l_ptr->owner->active_links[1]);
+
+	for (crs = l_ptr->first_out; crs; crs = crs->next) {
+		struct tipc_msg *msg = buf_msg(crs);
+		unchar *pos;
+		u32 left;
+
+		if ((msg_user(msg) != MSG_BUNDLER) || !split_bundles) {
+			tipc_link_tunnel(l_ptr, &tunnel_hdr, msg,
+					 msg_link_selector(msg));
+			continue;
+		}
+
+		/* Tunnel each bundled message under the bundle's seqno */
+		pos = (unchar *)msg_get_wrapped(msg);
+		for (left = msg_msgcnt(msg); left; left--) {
+			struct tipc_msg *inner = (struct tipc_msg *)pos;
+
+			msg_set_seqno(inner, msg_seqno(msg));
+			tipc_link_tunnel(l_ptr, &tunnel_hdr, inner,
+					 msg_link_selector(inner));
+			pos += align(msg_size(inner));
+		}
+	}
+}
+
+/*
+ * tipc_link_send_duplicate(): Send a DUPLICATE_MSG copy of every buffer in
+ * l_ptr's send queue through the link 'tunnel'.
+ *
+ * Stops early if a tunnel buffer cannot be acquired, or as soon as l_ptr
+ * is found to be no longer up after a copy has been sent.
+ */
+void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel)
+{
+	struct tipc_msg tunnel_hdr;
+	struct sk_buff *iter;
+
+	tipc_msg_init(&tunnel_hdr, CHANGEOVER_PROTOCOL,
+		 DUPLICATE_MSG, INT_H_SIZE, l_ptr->addr);
+	msg_set_msgcnt(&tunnel_hdr, l_ptr->out_queue_size);
+	msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id);
+
+	for (iter = l_ptr->first_out; iter; iter = iter->next) {
+		struct tipc_msg *msg = buf_msg(iter);
+		u32 length = msg_size(msg);
+		u32 tunnel_sz = length + INT_H_SIZE;
+		struct sk_buff *outbuf;
+
+		/* Bring the original up to date before copying it */
+		if (msg_user(msg) == MSG_BUNDLER)
+			msg_set_type(msg, CLOSED_MSG);
+		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));	/* Update */
+		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
+
+		msg_set_size(&tunnel_hdr, tunnel_sz);
+		outbuf = tipc_buf_acquire(tunnel_sz);
+		if (outbuf == NULL) {
+			warn("Link changeover error, "
+			     "unable to send duplicate msg\n");
+			return;
+		}
+
+		skb_copy_to_linear_data(outbuf, &tunnel_hdr, INT_H_SIZE);
+		skb_copy_to_linear_data_offset(outbuf, INT_H_SIZE, iter->data,
+					       length);
+		tipc_link_send_buf(tunnel, outbuf);
+
+		if (!tipc_link_is_up(l_ptr))
+			return;
+	}
+}
+
+
+
+/**
+ * buf_extract - extracts embedded TIPC message from another message
+ * @skb: encapsulating message buffer
+ * @from_pos: offset to extract from
+ *
+ * The number of bytes copied is the size recorded in the embedded
+ * message's own header.
+ *
+ * Returns a new message buffer containing an embedded message, or NULL if
+ * no buffer could be acquired.  The encapsulating message itself is left
+ * unchanged.
+ */
+
+static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos)
+{
+	struct tipc_msg *inner = (struct tipc_msg *)(skb->data + from_pos);
+	u32 inner_sz = msg_size(inner);
+	struct sk_buff *copy = tipc_buf_acquire(inner_sz);
+
+	if (!copy)
+		return NULL;
+
+	skb_copy_to_linear_data(copy, inner, inner_sz);
+	return copy;
+}
+
+/*
+ *  link_recv_changeover_msg(): Receive tunneled packet sent
+ *  via other link. Node is locked. Return extracted buffer.
+ *
+ *  On entry *l_ptr is the link the tunnel message arrived on and *buf is
+ *  the tunnel message.  The bearer id in the tunnel header selects the
+ *  link the wrapped message belongs to; since that id comes from the
+ *  network it is range-checked before being used as an index.
+ *
+ *  Returns 1 with *buf replaced by a copy of the wrapped message and
+ *  *l_ptr set to the destination link.  Returns 0 with *buf set to NULL
+ *  if the message is dropped (*l_ptr may already have been changed by
+ *  then).  The tunnel buffer is released in both cases.
+ */
+
+static int link_recv_changeover_msg(struct link **l_ptr,
+				    struct sk_buff **buf)
+{
+	struct sk_buff *tunnel_buf = *buf;
+	struct link *dest_link;
+	struct tipc_msg *msg;
+	struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf);
+	u32 msg_typ = msg_type(tunnel_msg);
+	u32 msg_count = msg_msgcnt(tunnel_msg);
+	u32 bearer_id = msg_bearer_id(tunnel_msg);
+
+	/* Never index the owner's link table with an unchecked peer value */
+	if (bearer_id >= MAX_BEARERS)
+		goto exit;
+
+	dest_link = (*l_ptr)->owner->links[bearer_id];
+	if (!dest_link)
+		goto exit;
+	if (dest_link == *l_ptr) {
+		err("Unexpected changeover message on link <%s>\n",
+		    (*l_ptr)->name);
+		goto exit;
+	}
+	*l_ptr = dest_link;
+	msg = msg_get_wrapped(tunnel_msg);
+
+	if (msg_typ == DUPLICATE_MSG) {
+		/* Already received over the destination link itself */
+		if (less(msg_seqno(msg), mod(dest_link->next_in_no)))
+			goto exit;
+		*buf = buf_extract(tunnel_buf, INT_H_SIZE);
+		if (*buf == NULL) {
+			warn("Link changeover error, duplicate msg dropped\n");
+			goto exit;
+		}
+		buf_discard(tunnel_buf);
+		return 1;
+	}
+
+	/* First original message ?: */
+
+	if (tipc_link_is_up(dest_link)) {
+		info("Resetting link <%s>, changeover initiated by peer\n",
+		     dest_link->name);
+		tipc_link_reset(dest_link);
+		dest_link->exp_msg_count = msg_count;
+		if (!msg_count)
+			goto exit;
+	} else if (dest_link->exp_msg_count == START_CHANGEOVER) {
+		dest_link->exp_msg_count = msg_count;
+		if (!msg_count)
+			goto exit;
+	}
+
+	/* Receive original message */
+
+	if (dest_link->exp_msg_count == 0) {
+		warn("Link switchover error, "
+		     "got too many tunnelled messages\n");
+		goto exit;
+	}
+	dest_link->exp_msg_count--;
+	if (less(msg_seqno(msg), dest_link->reset_checkpoint)) {
+		goto exit;
+	} else {
+		*buf = buf_extract(tunnel_buf, INT_H_SIZE);
+		if (*buf != NULL) {
+			buf_discard(tunnel_buf);
+			return 1;
+		} else {
+			warn("Link changeover error, original msg dropped\n");
+		}
+	}
+exit:
+	*buf = NULL;
+	buf_discard(tunnel_buf);
+	return 0;
+}
+
+/*
+ *  Bundler functionality:
+ */
+void tipc_link_recv_bundle(struct sk_buff *buf)
+{
+	u32 remaining = msg_msgcnt(buf_msg(buf));
+	u32 offset = INT_H_SIZE;	/* offset of next bundled message */
+
+	for (; remaining; remaining--) {
+		struct sk_buff *inner = buf_extract(buf, offset);
+
+		if (!inner) {
+			warn("Link unable to unbundle message(s)\n");
+			break;
+		}
+		offset += align(msg_size(buf_msg(inner)));
+		tipc_net_route_msg(inner);
+	}
+	buf_discard(buf);
+}
+
+/*
+ *  Fragmentation/defragmentation:
+ */
+
+
+/*
+ * link_send_long_buf: Entry for buffers needing fragmentation. 'buf' holds
+ * the complete message and is always consumed. Returns the user data length,
+ * or -ENOMEM (after freeing the fragments built so far) if allocation fails.
+ */
+static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
+{
+	struct sk_buff *buf_chain = NULL;
+	struct sk_buff *buf_chain_tail = (struct sk_buff *)&buf_chain;
+	struct tipc_msg *inmsg = buf_msg(buf);
+	struct tipc_msg fragm_hdr;
+	u32 insize = msg_size(inmsg);
+	u32 dsz = msg_data_sz(inmsg);
+	unchar *crs = buf->data;
+	u32 rest = insize;
+	u32 pack_sz = l_ptr->max_pkt;
+	u32 fragm_sz = pack_sz - INT_H_SIZE;
+	u32 fragm_no = 0;
+	u32 destaddr;
+
+	if (msg_short(inmsg))
+		destaddr = l_ptr->addr;
+	else
+		destaddr = msg_destnode(inmsg);
+
+	/* Prepare reusable fragment header: */
+
+	tipc_msg_init(&fragm_hdr, MSG_FRAGMENTER, FIRST_FRAGMENT,
+		 INT_H_SIZE, destaddr);
+
+	/* Chop up message. NOTE(review): buf_chain_tail starts out aliasing
+	 * &buf_chain; presumably 'next' is sk_buff's first member - confirm */
+	while (rest > 0) {
+		struct sk_buff *fragm;
+
+		if (rest <= fragm_sz) {
+			fragm_sz = rest;
+			msg_set_type(&fragm_hdr, LAST_FRAGMENT);
+		}
+		fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
+		if (fragm == NULL) {
+			buf_discard(buf);
+			while (buf_chain) {
+				buf = buf_chain;
+				buf_chain = buf_chain->next;
+				buf_discard(buf);
+			}
+			return -ENOMEM;
+		}
+		msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
+		fragm_no++;
+		msg_set_fragm_no(&fragm_hdr, fragm_no);
+		skb_copy_to_linear_data(fragm, &fragm_hdr, INT_H_SIZE);
+		skb_copy_to_linear_data_offset(fragm, INT_H_SIZE, crs,
+					       fragm_sz);
+		buf_chain_tail->next = fragm;
+		buf_chain_tail = fragm;
+
+		rest -= fragm_sz;
+		crs += fragm_sz;
+		msg_set_type(&fragm_hdr, FRAGMENT);	/* type for next pass */
+	}
+	buf_discard(buf);
+
+	/* Append chain of fragments to send queue & send them */
+
+	l_ptr->long_msg_seq_no++;
+	link_add_chain_to_outqueue(l_ptr, buf_chain, l_ptr->long_msg_seq_no);
+	l_ptr->stats.sent_fragments += fragm_no;
+	l_ptr->stats.sent_fragmented++;
+	tipc_link_push_queue(l_ptr);
+
+	return dsz;
+}
+
+/*
+ * A pending message being re-assembled must store certain values
+ * to handle subsequent fragments correctly. The following functions
+ * keep these values in header fields of the pending message that are
+ * otherwise unused while it is being re-assembled. This makes dynamic
+ * memory allocation unnecessary.
+ */
+static void set_long_msg_seqno(struct sk_buff *buf, u32 seqno)
+{
+	msg_set_seqno(buf_msg(buf), seqno);	/* long msg id kept in 'seqno' */
+}
+
+static u32 get_fragm_size(struct sk_buff *buf)
+{
+	return msg_ack(buf_msg(buf));		/* kept in 'ack' field */
+}
+
+static void set_fragm_size(struct sk_buff *buf, u32 sz)
+{
+	msg_set_ack(buf_msg(buf), sz);		/* kept in 'ack' field */
+}
+
+static u32 get_expected_frags(struct sk_buff *buf)
+{
+	return msg_bcast_ack(buf_msg(buf));	/* kept in 'bcast_ack' field */
+}
+
+static void set_expected_frags(struct sk_buff *buf, u32 exp)
+{
+	msg_set_bcast_ack(buf_msg(buf), exp);	/* kept in 'bcast_ack' field */
+}
+
+static u32 get_timer_cnt(struct sk_buff *buf)
+{
+	return msg_reroute_cnt(buf_msg(buf));	/* kept in reroute count */
+}
+
+static void incr_timer_cnt(struct sk_buff *buf)
+{
+	msg_incr_reroute_cnt(buf_msg(buf));	/* kept in reroute count */
+}
+
+/*
+ * tipc_link_recv_fragment(): Called with node lock on. Consumes *fb; returns 1
+ * with *fb and *m set to the completed message, else 0 with *fb set to NULL.
+ */
+int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb,
+			    struct tipc_msg **m)
+{
+	struct sk_buff *prev = NULL;
+	struct sk_buff *fbuf = *fb;
+	struct tipc_msg *fragm = buf_msg(fbuf);
+	struct sk_buff *pbuf = *pending;
+	u32 long_msg_seq_no = msg_long_msgno(fragm);
+
+	*fb = NULL;
+
+	/* Is there an incomplete message waiting for this fragment? */
+	while (pbuf && ((msg_seqno(buf_msg(pbuf)) != long_msg_seq_no) ||
+			(msg_orignode(fragm) != msg_orignode(buf_msg(pbuf))))) {
+		prev = pbuf;
+		pbuf = pbuf->next;
+	}
+
+	if (!pbuf && (msg_type(fragm) == FIRST_FRAGMENT)) {
+		struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm);
+		u32 msg_sz = msg_size(imsg);
+		u32 fragm_sz = msg_data_sz(fragm);
+		u32 exp_fragm_cnt;
+		u32 max =  TIPC_MAX_USER_MSG_SIZE + LONG_H_SIZE;
+		if (msg_type(imsg) == TIPC_MCAST_MSG)
+			max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE;
+		/* Avoid dividing by zero and overrunning the new buffer */
+		if (!fragm_sz || (fragm_sz > msg_sz) || (msg_sz > max))
+			goto drop;
+		exp_fragm_cnt = msg_sz / fragm_sz + !!(msg_sz % fragm_sz);
+		pbuf = tipc_buf_acquire(msg_sz);
+		if (pbuf != NULL) {
+			pbuf->next = *pending;
+			*pending = pbuf;
+			skb_copy_to_linear_data(pbuf, imsg, fragm_sz);
+			/*  Prepare buffer for subsequent fragments. */
+			set_long_msg_seqno(pbuf, long_msg_seq_no);
+			set_fragm_size(pbuf, fragm_sz);
+			set_expected_frags(pbuf, exp_fragm_cnt - 1);
+		} else {
+			warn("Link unable to reassemble fragmented message\n");
+		}
+	} else if (pbuf && (msg_type(fragm) != FIRST_FRAGMENT)) {
+		u32 dsz = msg_data_sz(fragm);
+		u32 fsz = get_fragm_size(pbuf);
+		u32 crs = ((msg_fragm_no(fragm) - 1) * fsz);
+		u32 exp_frags = get_expected_frags(pbuf) - 1;
+		u32 psz = msg_size(buf_msg(pbuf));
+
+		/* Peer-supplied position and length must stay inside pbuf */
+		if (!msg_fragm_no(fragm) || (crs > psz) || (dsz > psz - crs))
+			goto drop;
+		skb_copy_to_linear_data_offset(pbuf, crs,
+					       msg_data(fragm), dsz);
+		buf_discard(fbuf);
+
+		/* Is message complete? */
+		if (exp_frags == 0) {
+			if (prev)
+				prev->next = pbuf->next;
+			else
+				*pending = pbuf->next;
+			msg_reset_reroute_cnt(buf_msg(pbuf));
+			*fb = pbuf;
+			*m = buf_msg(pbuf);
+			return 1;
+		}
+		set_expected_frags(pbuf, exp_frags);
+		return 0;
+	}
+drop:
+	buf_discard(fbuf);
+	return 0;
+}
+
+/**
+ * link_check_defragm_bufs - flush stale incoming message fragments
+ * @l_ptr: pointer to link
+ *
+ * Does nothing unless the link is in WORKING_WORKING state. Each call ages
+ * every partially reassembled message once (the count is kept in the
+ * buffer's reroute counter); a message that has already been aged 4 times
+ * is unlinked from the list and discarded instead.
+ */
+static void link_check_defragm_bufs(struct link *l_ptr)
+{
+	/* 'slot' is the pointer through which the current buffer is reached */
+	struct sk_buff **slot = &l_ptr->defragm_buf;
+	struct sk_buff *cur;
+	u32 age;
+
+	if (!*slot || !link_working_working(l_ptr))
+		return;
+
+	while ((cur = *slot) != NULL) {
+		age = get_timer_cnt(cur);
+		if (age < 4) {
+			/* still fresh: age it and step past it */
+			incr_timer_cnt(cur);
+			slot = &cur->next;
+		} else {
+			/* stale: splice it out; 'slot' stays where it is */
+			*slot = cur->next;
+			buf_discard(cur);
+		}
+	}
+}
+
+/* Apply a new link tolerance (ignored if out of range) and derive the
+ * continuity interval (a quarter of it, at most 500) and abort limit. */
+static void link_set_supervision_props(struct link *l_ptr, u32 tolerance)
+{
+	u32 interval = tolerance / 4;
+
+	if ((tolerance < TIPC_MIN_LINK_TOL) || (tolerance > TIPC_MAX_LINK_TOL))
+		return;
+	l_ptr->tolerance = tolerance;
+	l_ptr->continuity_interval = (interval > 500) ? 500 : interval;
+	l_ptr->abort_limit = tolerance / (l_ptr->continuity_interval / 4);
+}
+
+/* Set send queue limits per message user; 'window' scales own data only */
+void tipc_link_set_queue_limits(struct link *l_ptr, u32 window)
+{
+	u32 *limit = l_ptr->queue_limit;
+	/* Data messages from this node, inclusive FIRST_FRAGM */
+	limit[TIPC_LOW_IMPORTANCE] = window;
+	limit[TIPC_MEDIUM_IMPORTANCE] = (window / 3) * 4;
+	limit[TIPC_HIGH_IMPORTANCE] = (window / 3) * 5;
+	limit[TIPC_CRITICAL_IMPORTANCE] = (window / 3) * 6;
+	/* Transiting data messages, inclusive FIRST_FRAGM */
+	limit[TIPC_LOW_IMPORTANCE + 4] = 300;
+	limit[TIPC_MEDIUM_IMPORTANCE + 4] = 600;
+	limit[TIPC_HIGH_IMPORTANCE + 4] = 900;
+	limit[TIPC_CRITICAL_IMPORTANCE + 4] = 1200;
+	limit[CONN_MANAGER] = 1200;
+	limit[CHANGEOVER_PROTOCOL] = 2500;
+	limit[NAME_DISTRIBUTOR] = 3000;
+	limit[MSG_FRAGMENTER] = 4000;	/* FRAGMENT and LAST_FRAGMENT packets */
+}
+
+/**
+ * link_find_link - locate link by name
+ * @name: ptr to link name string
+ * @node: ptr to area to be filled with ptr to associated node
+ *
+ * Caller must hold 'tipc_net_lock' to ensure node and bearer are not deleted;
+ * this also prevents link deletion.
+ *
+ * Returns pointer to link (or NULL if invalid link name). '*node' is written
+ * only once the name has parsed and its local bearer has been found.
+ */
+static struct link *link_find_link(const char *name, struct tipc_node **node)
+{
+	struct link_name parts;
+	struct tipc_bearer *bearer;
+	struct link *candidate;
+
+	if (!link_name_validate(name, &parts))
+		return NULL;
+
+	bearer = tipc_bearer_find_interface(parts.if_local);
+	if (!bearer)
+		return NULL;
+
+	*node = tipc_node_find(parts.addr_peer);
+	if (!*node)
+		return NULL;
+
+	/* The node's link on that bearer must carry exactly this name */
+	candidate = (*node)->links[bearer->identity];
+	if (candidate && !strcmp(candidate->name, name))
+		return candidate;
+	return NULL;
+}
+
+/*
+ * tipc_link_cmd_config - apply a link tolerance, priority or window change
+ * requested via a TIPC_TLV_LINK_CONFIG TLV; returns the config reply buffer.
+ */
+struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space,
+				     u16 cmd)
+{
+	struct tipc_link_config *args;
+	u32 new_value;
+	struct link *l_ptr;
+	struct tipc_node *node;
+	int res = -EINVAL;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_CONFIG))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	args = (struct tipc_link_config *)TLV_DATA(req_tlv_area);
+	new_value = ntohl(args->value);
+	if (!strcmp(args->name, tipc_bclink_name)) {
+		if ((cmd == TIPC_CMD_SET_LINK_WINDOW) &&
+		    (tipc_bclink_set_queue_limits(new_value) == 0))
+			return tipc_cfg_reply_none();
+		return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+						   " (cannot change setting on broadcast link)");
+	}
+
+	read_lock_bh(&tipc_net_lock);
+	l_ptr = link_find_link(args->name, &node);
+	if (!l_ptr) {
+		read_unlock_bh(&tipc_net_lock);
+		return tipc_cfg_reply_error_string("link not found");
+	}
+
+	tipc_node_lock(node);
+	switch (cmd) {
+	case TIPC_CMD_SET_LINK_TOL:
+		if ((new_value < TIPC_MIN_LINK_TOL) ||
+		    (new_value > TIPC_MAX_LINK_TOL))
+			break;
+		link_set_supervision_props(l_ptr, new_value);
+		tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+					 0, 0, new_value, 0, 0);
+		res = 0;
+		break;
+	case TIPC_CMD_SET_LINK_PRI:
+		if ((new_value < TIPC_MIN_LINK_PRI) ||
+		    (new_value > TIPC_MAX_LINK_PRI))
+			break;
+		l_ptr->priority = new_value;
+		tipc_link_send_proto_msg(l_ptr, STATE_MSG,
+					 0, 0, 0, new_value, 0);
+		res = 0;
+		break;
+	case TIPC_CMD_SET_LINK_WINDOW:
+		if ((new_value < TIPC_MIN_LINK_WIN) ||
+		    (new_value > TIPC_MAX_LINK_WIN))
+			break;
+		tipc_link_set_queue_limits(l_ptr, new_value);
+		res = 0;
+		break;
+	}
+	tipc_node_unlock(node);
+	read_unlock_bh(&tipc_net_lock);
+	if (res)
+		return tipc_cfg_reply_error_string("cannot change link setting");
+	return tipc_cfg_reply_none();
+}
+
+/**
+ * link_reset_statistics - reset link statistics
+ * @l_ptr: pointer to link
+ *
+ * Counters are zeroed; sent_info/recv_info become the new counting baseline.
+ */
+static void link_reset_statistics(struct link *l_ptr)
+{
+	memset(&l_ptr->stats, 0, sizeof(l_ptr->stats));
+	l_ptr->stats.sent_info = l_ptr->next_out_no;	/* TX baseline */
+	l_ptr->stats.recv_info = l_ptr->next_in_no;	/* RX baseline */
+}
+
+/* Reset statistics of the link (or broadcast link) named in the request's
+ * TIPC_TLV_LINK_NAME TLV; returns the configuration reply buffer. */
+struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space)
+{
+	char *link_name;
+	struct link *l_ptr;
+	struct tipc_node *node;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+	link_name = (char *)TLV_DATA(req_tlv_area);
+	if (!strcmp(link_name, tipc_bclink_name)) {
+		if (tipc_bclink_reset_stats())
+			return tipc_cfg_reply_error_string("link not found");
+		return tipc_cfg_reply_none();
+	}
+
+	read_lock_bh(&tipc_net_lock);
+	l_ptr = link_find_link(link_name, &node);
+	if (l_ptr) {
+		tipc_node_lock(node);
+		link_reset_statistics(l_ptr);
+		tipc_node_unlock(node);
+	}
+	read_unlock_bh(&tipc_net_lock);
+	if (!l_ptr)
+		return tipc_cfg_reply_error_string("link not found");
+	return tipc_cfg_reply_none();
+}
+
+/**
+ * percent - convert count to a percentage of total, rounded to nearest
+ */
+static u32 percent(u32 count, u32 total)
+{
+	u32 scaled = count * 100 + (total / 2);	/* 32-bit; + total/2 rounds */
+	return scaled / total;			/* total must be non-zero */
+}
+
+/**
+ * tipc_link_stats - print link statistics
+ * @name: link name
+ * @buf: print buffer area
+ * @buf_size: size of print buffer area
+ *
+ * Returns length of print buffer data string (or 0 if error)
+ * NOTE(review): bucket label "-16354" below is presumably meant to be 16384.
+ */
+static int tipc_link_stats(const char *name, char *buf, const u32 buf_size)
+{
+	struct print_buf pb;
+	struct link *l_ptr;
+	struct tipc_node *node;
+	char *status;
+	u32 profile_total = 0;
+
+	if (!strcmp(name, tipc_bclink_name))
+		return tipc_bclink_stats(buf, buf_size);
+
+	tipc_printbuf_init(&pb, buf, buf_size);
+
+	read_lock_bh(&tipc_net_lock);
+	l_ptr = link_find_link(name, &node);
+	if (!l_ptr) {
+		read_unlock_bh(&tipc_net_lock);
+		return 0;
+	}
+	tipc_node_lock(node);
+
+	if (tipc_link_is_active(l_ptr))
+		status = "ACTIVE";
+	else if (tipc_link_is_up(l_ptr))
+		status = "STANDBY";
+	else
+		status = "DEFUNCT";
+	tipc_printf(&pb, "Link <%s>\n"
+			 "  %s  MTU:%u  Priority:%u  Tolerance:%u ms"
+			 "  Window:%u packets\n",
+		    l_ptr->name, status, l_ptr->max_pkt,
+		    l_ptr->priority, l_ptr->tolerance, l_ptr->queue_limit[0]);
+	tipc_printf(&pb, "  RX packets:%u fragments:%u/%u bundles:%u/%u\n",
+		    l_ptr->next_in_no - l_ptr->stats.recv_info,
+		    l_ptr->stats.recv_fragments,
+		    l_ptr->stats.recv_fragmented,
+		    l_ptr->stats.recv_bundles,
+		    l_ptr->stats.recv_bundled);
+	tipc_printf(&pb, "  TX packets:%u fragments:%u/%u bundles:%u/%u\n",
+		    l_ptr->next_out_no - l_ptr->stats.sent_info,
+		    l_ptr->stats.sent_fragments,
+		    l_ptr->stats.sent_fragmented,
+		    l_ptr->stats.sent_bundles,
+		    l_ptr->stats.sent_bundled);
+	profile_total = l_ptr->stats.msg_length_counts;
+	if (!profile_total)
+		profile_total = 1;	/* avoid dividing by zero below */
+	tipc_printf(&pb, "  TX profile sample:%u packets  average:%u octets\n"
+			 "  0-64:%u%% -256:%u%% -1024:%u%% -4096:%u%% "
+			 "-16354:%u%% -32768:%u%% -66000:%u%%\n",
+		    l_ptr->stats.msg_length_counts,
+		    l_ptr->stats.msg_lengths_total / profile_total,
+		    percent(l_ptr->stats.msg_length_profile[0], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[1], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[2], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[3], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[4], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[5], profile_total),
+		    percent(l_ptr->stats.msg_length_profile[6], profile_total));
+	tipc_printf(&pb, "  RX states:%u probes:%u naks:%u defs:%u dups:%u\n",
+		    l_ptr->stats.recv_states,
+		    l_ptr->stats.recv_probes,
+		    l_ptr->stats.recv_nacks,
+		    l_ptr->stats.deferred_recv,
+		    l_ptr->stats.duplicates);
+	tipc_printf(&pb, "  TX states:%u probes:%u naks:%u acks:%u dups:%u\n",
+		    l_ptr->stats.sent_states,
+		    l_ptr->stats.sent_probes,
+		    l_ptr->stats.sent_nacks,
+		    l_ptr->stats.sent_acks,
+		    l_ptr->stats.retransmitted);
+	tipc_printf(&pb, "  Congestion bearer:%u link:%u  Send queue max:%u avg:%u\n",
+		    l_ptr->stats.bearer_congs,
+		    l_ptr->stats.link_congs,
+		    l_ptr->stats.max_queue_sz,
+		    l_ptr->stats.queue_sz_counts
+		    ? (l_ptr->stats.accu_queue_sz / l_ptr->stats.queue_sz_counts)
+		    : 0);
+
+	tipc_node_unlock(node);
+	read_unlock_bh(&tipc_net_lock);
+	return tipc_printbuf_validate(&pb);
+}
+
+#define MAX_LINK_STATS_INFO 2000
+
+/* Build the reply to a "show link statistics" request: an ULTRA_STRING TLV
+ * holding the text produced by tipc_link_stats() for the named link. */
+struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space)
+{
+	struct sk_buff *reply;
+	struct tlv_desc *tlv;
+	int len;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_LINK_NAME))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	reply = tipc_cfg_reply_alloc(TLV_SPACE(MAX_LINK_STATS_INFO));
+	if (!reply)
+		return NULL;
+	tlv = (struct tlv_desc *)reply->data;
+
+	len = tipc_link_stats((char *)TLV_DATA(req_tlv_area),
+			      (char *)TLV_DATA(tlv), MAX_LINK_STATS_INFO);
+	if (!len) {
+		buf_discard(reply);
+		return tipc_cfg_reply_error_string("link not found");
+	}
+
+	skb_put(reply, TLV_SPACE(len));
+	TLV_SET(tlv, TIPC_TLV_ULTRA_STRING, NULL, len);
+	return reply;
+}
+
+/**
+ * tipc_link_get_max_pkt - get maximum packet size to use when sending to destination
+ * @dest: network address of destination node
+ * @selector: used to select from set of active links
+ *
+ * If no active link can be found, uses default maximum packet size.
+ * A destination equal to this node's own address yields MAX_MSG_SIZE.
+ */
+u32 tipc_link_get_max_pkt(u32 dest, u32 selector)
+{
+	struct tipc_node *peer;
+	struct link *active;
+	u32 max_pkt = MAX_PKT_DEFAULT;
+
+	if (dest == tipc_own_addr)
+		return MAX_MSG_SIZE;
+
+	read_lock_bh(&tipc_net_lock);
+	peer = tipc_node_find(dest);
+	if (!peer)
+		goto unlock;
+	tipc_node_lock(peer);
+	active = peer->active_links[selector & 1];
+	max_pkt = active ? active->max_pkt : MAX_PKT_DEFAULT;
+	tipc_node_unlock(peer);
+unlock:
+	read_unlock_bh(&tipc_net_lock);
+	return max_pkt;
+}
+
+/* Log a one-line description of the link and its state, prefixed by 'str' */
+static void link_print(struct link *l_ptr, const char *str)
+{
+	char print_area[256];
+	struct print_buf pb;
+	struct print_buf *buf = &pb;
+
+	tipc_printbuf_init(buf, print_area, sizeof(print_area));
+	/* 'str' is plain text, not a format: keep any '%' in it inert */
+	tipc_printf(buf, "%s", str);
+	tipc_printf(buf, "Link %x<%s>:", l_ptr->addr, l_ptr->b_ptr->name);
+
+#ifdef CONFIG_TIPC_DEBUG
+	if (link_reset_reset(l_ptr) || link_reset_unknown(l_ptr))
+		goto print_state;
+
+	tipc_printf(buf, ": NXO(%u):", mod(l_ptr->next_out_no));
+	tipc_printf(buf, "NXI(%u):", mod(l_ptr->next_in_no));
+	tipc_printf(buf, "SQUE");
+	if (l_ptr->first_out) {
+		tipc_printf(buf, "[%u..", msg_seqno(buf_msg(l_ptr->first_out)));
+		if (l_ptr->next_out)
+			tipc_printf(buf, "%u..",
+				    msg_seqno(buf_msg(l_ptr->next_out)));
+		tipc_printf(buf, "%u]", msg_seqno(buf_msg(l_ptr->last_out)));
+		if ((mod(msg_seqno(buf_msg(l_ptr->last_out)) -
+			 msg_seqno(buf_msg(l_ptr->first_out)))
+		     != (l_ptr->out_queue_size - 1)) ||
+		    (l_ptr->last_out->next != NULL)) {
+			tipc_printf(buf, "\nSend queue inconsistency\n");
+			tipc_printf(buf, "first_out= %p ", l_ptr->first_out);
+			tipc_printf(buf, "next_out= %p ", l_ptr->next_out);
+			tipc_printf(buf, "last_out= %p ", l_ptr->last_out);
+		}
+	} else
+		tipc_printf(buf, "[]");
+	tipc_printf(buf, "SQSIZ(%u)", l_ptr->out_queue_size);
+	if (l_ptr->oldest_deferred_in) {
+		u32 o = msg_seqno(buf_msg(l_ptr->oldest_deferred_in));
+		u32 n = msg_seqno(buf_msg(l_ptr->newest_deferred_in));
+		tipc_printf(buf, ":RQUE[%u..%u]", o, n);
+		if (l_ptr->deferred_inqueue_sz != mod((n + 1) - o)) {
+			tipc_printf(buf, ":RQSIZ(%u)",
+				    l_ptr->deferred_inqueue_sz);
+		}
+	}
+print_state:
+#endif
+
+	if (link_working_unknown(l_ptr))
+		tipc_printf(buf, ":WU");
+	else if (link_reset_reset(l_ptr))
+		tipc_printf(buf, ":RR");
+	else if (link_reset_unknown(l_ptr))
+		tipc_printf(buf, ":RU");
+	else if (link_working_working(l_ptr))
+		tipc_printf(buf, ":WW");
+	tipc_printf(buf, "\n");
+
+	tipc_printbuf_validate(buf);
+	info("%s", print_area);
+}
+
diff --git a/net/tipc/link.h b/net/tipc/link.h
new file mode 100644
index 00000000..74fbecab
--- /dev/null
+++ b/net/tipc/link.h
@@ -0,0 +1,314 @@
+/*
+ * net/tipc/link.h: Include file for TIPC link code
+ *
+ * Copyright (c) 1995-2006, Ericsson AB
+ * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_LINK_H
+#define _TIPC_LINK_H
+
+#include "log.h"
+#include "msg.h"
+#include "node.h"
+
+#define PUSH_FAILED   1
+#define PUSH_FINISHED 2
+
+/*
+ * Link states
+ */
+
+#define WORKING_WORKING 560810u
+#define WORKING_UNKNOWN 560811u
+#define RESET_UNKNOWN   560812u
+#define RESET_RESET     560813u
+
+/*
+ * Starting value for maximum packet size negotiation on unicast links
+ * (unless bearer MTU is less)
+ */
+
+#define MAX_PKT_DEFAULT 1500
+
+/**
+ * struct link - TIPC link data structure
+ * @addr: network address of link's peer node
+ * @name: link name character string
+ * @media_addr: media address to use when sending messages over link
+ * @timer: link timer
+ * @owner: pointer to peer node
+ * @link_list: adjacent links in bearer's list of links
+ * @started: indicates if link has been started
+ * @checkpoint: reference point for triggering link continuity checking
+ * @peer_session: link session # being used by peer end of link
+ * @peer_bearer_id: bearer id used by link's peer endpoint
+ * @b_ptr: pointer to bearer used by link
+ * @tolerance: minimum link continuity loss needed to reset link [in ms]
+ * @continuity_interval: link continuity testing interval [in ms]
+ * @abort_limit: # of unacknowledged continuity probes needed to reset link
+ * @state: current state of link FSM
+ * @blocked: indicates if link has been administratively blocked
+ * @fsm_msg_cnt: # of protocol messages link FSM has sent in current state
+ * @proto_msg: template for control messages generated by link
+ * @pmsg: convenience pointer to "proto_msg" field
+ * @priority: current link priority
+ * @queue_limit: outbound message queue congestion thresholds (indexed by user)
+ * @exp_msg_count: # of tunnelled messages expected during link changeover
+ * @reset_checkpoint: seq # of last acknowledged message at time of link reset
+ * @max_pkt: current maximum packet size for this link
+ * @max_pkt_target: desired maximum packet size for this link
+ * @max_pkt_probes: # of probes based on current (max_pkt, max_pkt_target)
+ * @out_queue_size: # of messages in outbound message queue
+ * @first_out: ptr to first outbound message in queue
+ * @last_out: ptr to last outbound message in queue
+ * @next_out_no: next sequence number to use for outbound messages
+ * @last_retransmitted: sequence number of most recently retransmitted message
+ * @stale_count: # of identical retransmit requests made by peer
+ * @next_in_no: next sequence number to expect for inbound messages
+ * @deferred_inqueue_sz: # of messages in inbound message queue
+ * @oldest_deferred_in: ptr to first inbound message in queue
+ * @newest_deferred_in: ptr to last inbound message in queue
+ * @unacked_window: # of inbound messages rx'd without ack'ing back to peer
+ * @proto_msg_queue: ptr to (single) outbound control message
+ * @retransm_queue_size: number of messages to retransmit
+ * @retransm_queue_head: sequence number of first message to retransmit
+ * @next_out: ptr to first unsent outbound message in queue
+ * @waiting_ports: linked list of ports waiting for link congestion to abate
+ * @long_msg_seq_no: next identifier to use for outbound fragmented messages
+ * @defragm_buf: list of partially reassembled inbound message fragments
+ * @stats: collects statistics regarding link activity
+ */
+
+struct link {
+	u32 addr;
+	char name[TIPC_MAX_LINK_NAME];
+	struct tipc_media_addr media_addr;
+	struct timer_list timer;
+	struct tipc_node *owner;
+	struct list_head link_list;
+
+	/* Management and link supervision data */
+	int started;
+	u32 checkpoint;
+	u32 peer_session;
+	u32 peer_bearer_id;
+	struct tipc_bearer *b_ptr;
+	u32 tolerance;
+	u32 continuity_interval;
+	u32 abort_limit;
+	int state;		/* one of the "Link states" values above */
+	int blocked;
+	u32 fsm_msg_cnt;
+	struct {
+		unchar hdr[INT_H_SIZE];
+		unchar body[TIPC_MAX_IF_NAME];
+	} proto_msg;
+	struct tipc_msg *pmsg;
+	u32 priority;
+	u32 queue_limit[15];	/* queue_limit[0]==window limit */
+
+	/* Changeover */
+	u32 exp_msg_count;
+	u32 reset_checkpoint;
+
+	/* Max packet negotiation */
+	u32 max_pkt;
+	u32 max_pkt_target;
+	u32 max_pkt_probes;
+
+	/* Sending */
+	u32 out_queue_size;
+	struct sk_buff *first_out;
+	struct sk_buff *last_out;
+	u32 next_out_no;
+	u32 last_retransmitted;
+	u32 stale_count;
+
+	/* Reception */
+	u32 next_in_no;
+	u32 deferred_inqueue_sz;
+	struct sk_buff *oldest_deferred_in;
+	struct sk_buff *newest_deferred_in;
+	u32 unacked_window;
+
+	/* Congestion handling */
+	struct sk_buff *proto_msg_queue;
+	u32 retransm_queue_size;
+	u32 retransm_queue_head;
+	struct sk_buff *next_out;
+	struct list_head waiting_ports;
+
+	/* Fragmentation/defragmentation */
+	u32 long_msg_seq_no;
+	struct sk_buff *defragm_buf;
+
+	/* Statistics */
+	struct {
+		u32 sent_info;		/* used in counting # sent packets */
+		u32 recv_info;		/* used in counting # recv'd packets */
+		u32 sent_states;
+		u32 recv_states;
+		u32 sent_probes;
+		u32 recv_probes;
+		u32 sent_nacks;
+		u32 recv_nacks;
+		u32 sent_acks;
+		u32 sent_bundled;
+		u32 sent_bundles;
+		u32 recv_bundled;
+		u32 recv_bundles;
+		u32 retransmitted;	/* reported as TX "dups" in link stats */
+		u32 sent_fragmented;
+		u32 sent_fragments;
+		u32 recv_fragmented;
+		u32 recv_fragments;
+		u32 link_congs;		/* # port sends blocked by congestion */
+		u32 bearer_congs;
+		u32 deferred_recv;	/* reported as RX "defs" in link stats */
+		u32 duplicates;		/* reported as RX "dups" in link stats */
+		u32 max_queue_sz;	/* send queue size high water mark */
+		u32 accu_queue_sz;	/* used for send queue size profiling */
+		u32 queue_sz_counts;	/* used for send queue size profiling */
+		u32 msg_length_counts;	/* used for message length profiling */
+		u32 msg_lengths_total;	/* used for message length profiling */
+		u32 msg_length_profile[7]; /* used for msg. length profiling */
+	} stats;
+};
+
+struct tipc_port;
+
+struct link *tipc_link_create(struct tipc_node *n_ptr,
+			      struct tipc_bearer *b_ptr,
+			      const struct tipc_media_addr *media_addr);
+void tipc_link_delete(struct link *l_ptr);
+void tipc_link_changeover(struct link *l_ptr);
+void tipc_link_send_duplicate(struct link *l_ptr, struct link *dest);
+void tipc_link_reset_fragments(struct link *l_ptr);
+int tipc_link_is_up(struct link *l_ptr);
+int tipc_link_is_active(struct link *l_ptr);
+u32 tipc_link_push_packet(struct link *l_ptr);
+void tipc_link_stop(struct link *l_ptr);
+struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd);
+struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_space);
+struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area, int req_tlv_space);
+void tipc_link_reset(struct link *l_ptr);
+int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector);
+int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf);
+u32 tipc_link_get_max_pkt(u32 dest, u32 selector);
+int tipc_link_send_sections_fast(struct tipc_port *sender,
+				 struct iovec const *msg_sect,
+				 const u32 num_sect,
+				 unsigned int total_len,
+				 u32 destnode);
+void tipc_link_recv_bundle(struct sk_buff *buf);
+int  tipc_link_recv_fragment(struct sk_buff **pending,
+			     struct sk_buff **fb,
+			     struct tipc_msg **msg);
+void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int prob, u32 gap,
+			      u32 tolerance, u32 priority, u32 acked_mtu);
+void tipc_link_push_queue(struct link *l_ptr);
+u32 tipc_link_defer_pkt(struct sk_buff **head, struct sk_buff **tail,
+		   struct sk_buff *buf);
+void tipc_link_wakeup_ports(struct link *l_ptr, int all);
+void tipc_link_set_queue_limits(struct link *l_ptr, u32 window);
+void tipc_link_retransmit(struct link *l_ptr, struct sk_buff *start, u32 retransmits);
+
+/*
+ * Link sequence number manipulation routines (uses modulo 2**16 arithmetic)
+ */
+
+static inline u32 mod(u32 x)
+{
+	return (u16)x;		/* same as masking with 0xffff */
+}
+
+static inline int between(u32 lower, u32 upper, u32 n)
+{
+	int above = lower < n;
+	int below = n < upper;
+
+	/* ordinary range: both must hold; wrapped range: either suffices */
+	return (above && below) || ((upper < lower) && (above || below));
+}
+
+static inline int less_eq(u32 left, u32 right)
+{
+	return !(mod(right - left) & 0x8000u);	/* forward distance < 2**15 */
+}
+
+static inline int less(u32 left, u32 right)
+{
+	return (mod(left) != mod(right)) && less_eq(left, right);
+}
+
+static inline u32 lesser(u32 left, u32 right)
+{
+	/* earlier of the two in modulo 2**16 order; 'left' wins a tie */
+	return less_eq(left, right) ? left : right;
+}
+
+/*
+ * Link status checking routines: four tests of the link FSM state, plus
+ * link_blocked() (changeover/administrative block) and link_congested().
+ */
+static inline int link_working_working(struct link *l_ptr)
+{
+	return l_ptr->state == WORKING_WORKING;
+}
+
+static inline int link_working_unknown(struct link *l_ptr)
+{
+	return l_ptr->state == WORKING_UNKNOWN;
+}
+
+static inline int link_reset_unknown(struct link *l_ptr)
+{
+	return l_ptr->state == RESET_UNKNOWN;
+}
+
+static inline int link_reset_reset(struct link *l_ptr)
+{
+	return l_ptr->state == RESET_RESET;
+}
+
+static inline int link_blocked(struct link *l_ptr)
+{
+	return l_ptr->exp_msg_count || l_ptr->blocked;	/* either blocks */
+}
+
+static inline int link_congested(struct link *l_ptr)
+{
+	return l_ptr->out_queue_size >= l_ptr->queue_limit[0];	/* window */
+}
+
+#endif
diff --git a/net/tipc/log.c b/net/tipc/log.c
new file mode 100644
index 00000000..952c39f6
--- /dev/null
+++ b/net/tipc/log.c
@@ -0,0 +1,351 @@
+/*
+ * net/tipc/log.c: TIPC print buffer routines for debugging
+ *
+ * Copyright (c) 1996-2006, Ericsson AB
+ * Copyright (c) 2005-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "log.h"
+
+/*
+ * TIPC pre-defines the following print buffers:
+ *
+ * TIPC_NULL : null buffer (i.e. print nowhere)
+ * TIPC_CONS : system console
+ * TIPC_LOG  : TIPC log buffer
+ *
+ * Additional user-defined print buffers are also permitted.
+ */
+
+static struct print_buf null_buf = { .buf = NULL, .crs = NULL, .echo = 0 };
+struct print_buf *const TIPC_NULL = &null_buf;
+
+static struct print_buf cons_buf = { .buf = NULL, .crs = NULL, .echo = 1 };
+struct print_buf *const TIPC_CONS = &cons_buf;
+
+static struct print_buf log_buf = { .buf = NULL, .crs = NULL, .echo = 1 };
+struct print_buf *const TIPC_LOG = &log_buf;
+
+/*
+ * Locking policy when using print buffers.
+ *
+ * 1) tipc_printf() uses 'print_lock' to protect against concurrent access to
+ * 'print_string' when writing to a print buffer. This also protects against
+ * concurrent writes to the print buffer being written to.
+ *
+ * 2) tipc_log_XXX() leverages the aforementioned use of 'print_lock' to
+ * protect against all types of concurrent operations on their associated
+ * print buffer (not just write operations).
+ *
+ * Note: All routines of the form tipc_printbuf_XXX() are lock-free, and rely
+ * on the caller to prevent simultaneous use of the print buffer(s) being
+ * manipulated.
+ */
+
+static char print_string[TIPC_PB_MAX_STR];	/* tipc_printf() formatting area */
+static DEFINE_SPINLOCK(print_lock);		/* taken by tipc_printf() and tipc_log_XXX() */
+
+static void tipc_printbuf_move(struct print_buf *pb_to,
+			       struct print_buf *pb_from);
+
+#define FORMAT(PTR, LEN, FMT) \
+{\
+	va_list args;\
+	va_start(args, FMT);\
+	LEN = vsnprintf(PTR, TIPC_PB_MAX_STR, FMT, args);\
+	va_end(args);\
+	/* PTR must hold TIPC_PB_MAX_STR bytes; LEN >= that means truncated */\
+}
+
+/**
+ * tipc_printbuf_init - set up a print buffer in the empty state
+ * @pb: print buffer structure to initialize
+ * @raw: character array that will back the print buffer (may be NULL)
+ * @size: number of bytes in @raw
+ *
+ * Echoing is switched off.  A missing array, or one shorter than
+ * TIPC_PB_MIN_SIZE, leaves @pb without storage, so writes are discarded.
+ */
+
+void tipc_printbuf_init(struct print_buf *pb, char *raw, u32 size)
+{
+	/* an undersized array is treated exactly like a missing one */
+	pb->buf = (size < TIPC_PB_MIN_SIZE) ? NULL : raw;
+	pb->crs = raw;
+	pb->size = size;
+	pb->echo = 0;
+
+	if (pb->buf) {
+		/* empty string up front, non-zero marker in the last byte */
+		raw[0] = 0;
+		raw[size - 1] = ~0;
+	}
+}
+
+/**
+ * tipc_printbuf_reset - return a print buffer to the empty state
+ * @pb: print buffer structure to reset (left alone if it has no storage)
+ */
+
+static void tipc_printbuf_reset(struct print_buf *pb)
+{
+	if (!pb->buf)
+		return;
+	pb->crs = pb->buf;
+	*pb->crs = 0;
+	pb->buf[pb->size - 1] = ~0;
+}
+
+/**
+ * tipc_printbuf_empty - report whether a print buffer holds no text
+ * @pb: print buffer structure to inspect
+ *
+ * Returns non-zero if @pb has no storage or its cursor is at the start.
+ */
+
+static int tipc_printbuf_empty(struct print_buf *pb)
+{
+	return (pb->buf == NULL) || (pb->buf == pb->crs);
+}
+
+/**
+ * tipc_printbuf_validate - check for print buffer overflow
+ * @pb: pointer to print buffer structure
+ *
+ * Verifies that a print buffer has captured all data written to it.
+ * If data has been lost, the buffer is linearized and its first bytes are
+ * overwritten with an error message (or it is reset to just that message).
+ * Returns length of print buffer data string (including trailing NUL)
+ */
+
+int tipc_printbuf_validate(struct print_buf *pb)
+{
+	const char *err = "\n\n*** PRINT BUFFER OVERFLOW ***\n\n";
+	char *cp_buf;
+	struct print_buf cb;
+
+	if (!pb->buf)
+		return 0;
+
+	if (pb->buf[pb->size - 1] == 0) {	/* end marker was overwritten */
+		cp_buf = kmalloc(pb->size, GFP_ATOMIC);
+		if (cp_buf) {
+			tipc_printbuf_init(&cb, cp_buf, pb->size);
+			tipc_printbuf_move(&cb, pb);
+			tipc_printbuf_move(pb, &cb);
+			kfree(cp_buf);
+			memcpy(pb->buf, err, strlen(err));
+		} else {
+			tipc_printbuf_reset(pb);
+			tipc_printf(pb, "%s", err);	/* never use data as format */
+		}
+	}
+	return pb->crs - pb->buf + 1;
+}
+
+/**
+ * tipc_printbuf_move - move print buffer contents to another print buffer
+ * @pb_to: pointer to destination print buffer structure
+ * @pb_from: pointer to source print buffer structure
+ *
+ * Destination contents are discarded; a destination smaller than the source
+ * receives an error string instead. Source is emptied after a real move.
+ */
+
+static void tipc_printbuf_move(struct print_buf *pb_to,
+			       struct print_buf *pb_from)
+{
+	int len;
+
+	/* Handle the cases where contents can't be moved */
+
+	if (!pb_to->buf)
+		return;
+
+	if (!pb_from->buf) {
+		tipc_printbuf_reset(pb_to);
+		return;
+	}
+
+	if (pb_to->size < pb_from->size) {
+		strcpy(pb_to->buf, "*** PRINT BUFFER MOVE ERROR ***");
+		pb_to->buf[pb_to->size - 1] = ~0;
+		pb_to->crs = strchr(pb_to->buf, 0);
+		return;
+	}
+
+	/* Source end marker is NUL: copy the text following the cursor first */
+
+	len = pb_from->buf + pb_from->size - pb_from->crs - 2;
+	if ((pb_from->buf[pb_from->size - 1] == 0) && (len > 0)) {
+		strcpy(pb_to->buf, pb_from->crs + 1);
+		pb_to->crs = pb_to->buf + len;
+	} else
+		pb_to->crs = pb_to->buf;
+
+	/* Copy data from start to cursor (always), appended at pb_to->crs */
+
+	len = pb_from->crs - pb_from->buf;
+	strcpy(pb_to->crs, pb_from->buf);
+	pb_to->crs += len;
+
+	tipc_printbuf_reset(pb_from);
+}
+
+/**
+ * tipc_printf - append formatted output to print buffer
+ * @pb: pointer to print buffer
+ * @fmt: printf-style format; over-long output is replaced by a notice
+ */
+
+void tipc_printf(struct print_buf *pb, const char *fmt, ...)
+{
+	int chars_to_add, chars_left;
+	char save_char;
+
+	spin_lock_bh(&print_lock);
+
+	FORMAT(print_string, chars_to_add, fmt);
+	if (chars_to_add >= TIPC_PB_MAX_STR)	/* swap in notice, re-measure */
+		chars_to_add = sprintf(print_string, "%s",
+				       "*** PRINT BUFFER STRING TOO LONG ***");
+
+	if (pb->buf) {
+		chars_left = pb->buf + pb->size - pb->crs - 1;
+		if (chars_to_add <= chars_left) {
+			strcpy(pb->crs, print_string);	/* fits after cursor */
+			pb->crs += chars_to_add;
+		} else if (chars_to_add >= (pb->size - 1)) {
+			strcpy(pb->buf, print_string + chars_to_add + 1
+			       - pb->size);	/* keep only the tail that fits */
+			pb->crs = pb->buf + pb->size - 1;
+		} else {
+			strcpy(pb->buf, print_string + chars_left); /* wrapped part */
+			save_char = print_string[chars_left];
+			print_string[chars_left] = 0;
+			strcpy(pb->crs, print_string);	/* part before the end */
+			print_string[chars_left] = save_char;
+			pb->crs = pb->buf + chars_to_add - chars_left;
+		}
+	}
+
+	if (pb->echo)
+		printk("%s", print_string);
+
+	spin_unlock_bh(&print_lock);
+}
+
+/**
+ * tipc_log_resize - replace the TIPC log buffer with one of a new size
+ * @log_size: requested size; 0 leaves the log without a buffer
+ */
+
+int tipc_log_resize(int log_size)
+{
+	int failed = 0, saved_echo;
+
+	spin_lock_bh(&print_lock);
+	kfree(TIPC_LOG->buf);
+	TIPC_LOG->buf = NULL;
+	if (log_size != 0) {
+		if (log_size < TIPC_PB_MIN_SIZE)
+			log_size = TIPC_PB_MIN_SIZE;
+		/* tipc_printbuf_init() clears 'echo', so carry it across */
+		saved_echo = TIPC_LOG->echo;
+		tipc_printbuf_init(TIPC_LOG, kmalloc(log_size, GFP_ATOMIC),
+				   log_size);
+		TIPC_LOG->echo = saved_echo;
+		failed = !TIPC_LOG->buf;
+	}
+	spin_unlock_bh(&print_lock);
+	return failed;
+}
+
+/**
+ * tipc_log_resize_cmd - handle a request to change the TIPC log buffer size
+ */
+
+struct sk_buff *tipc_log_resize_cmd(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 new_size;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_UNSIGNED))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	new_size = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (delimit(new_size, 0, 32768) != new_size)
+		return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+						   " (log size must be 0-32768)");
+	if (tipc_log_resize(new_size) != 0)
+		return tipc_cfg_reply_error_string(
+			"unable to create specified log (log size is now 0)");
+	return tipc_cfg_reply_none();
+}
+
+/**
+ * tipc_log_dump - move TIPC log buffer contents into a configuration reply
+ */
+
+struct sk_buff *tipc_log_dump(void)
+{
+	struct sk_buff *reply;
+
+	spin_lock_bh(&print_lock);
+	if (!TIPC_LOG->buf) {
+		spin_unlock_bh(&print_lock);
+		reply = tipc_cfg_reply_ultra_string("log not activated\n");
+	} else if (tipc_printbuf_empty(TIPC_LOG)) {
+		spin_unlock_bh(&print_lock);
+		reply = tipc_cfg_reply_ultra_string("log is empty\n");
+	} else {
+		struct tlv_desc *rep_tlv;
+		struct print_buf pb;
+		int str_len;
+
+		str_len = min(TIPC_LOG->size, 32768u);
+		spin_unlock_bh(&print_lock);	/* lock is not held while allocating */
+		reply = tipc_cfg_reply_alloc(TLV_SPACE(str_len));
+		if (reply) {
+			rep_tlv = (struct tlv_desc *)reply->data;
+			tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), str_len);
+			spin_lock_bh(&print_lock);
+			tipc_printbuf_move(&pb, TIPC_LOG);	/* log is emptied by a real move */
+			spin_unlock_bh(&print_lock);
+			str_len = strlen(TLV_DATA(rep_tlv)) + 1;	/* text actually moved + NUL */
+			skb_put(reply, TLV_SPACE(str_len));
+			TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+		}
+	}
+	return reply;
+}
diff --git a/net/tipc/log.h b/net/tipc/log.h
new file mode 100644
index 00000000..2248d962
--- /dev/null
+++ b/net/tipc/log.h
@@ -0,0 +1,67 @@
+/*
+ * net/tipc/log.h: Include file for TIPC print buffer routines
+ *
+ * Copyright (c) 1997-2006, Ericsson AB
+ * Copyright (c) 2005-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_LOG_H
+#define _TIPC_LOG_H
+
+/**
+ * struct print_buf - TIPC print buffer structure
+ * @buf: pointer to character array holding the contents (NULL: no storage)
+ * @size: size of character array
+ * @crs: pointer to first unused space in character array (i.e. final NUL)
+ * @echo: if non-zero, tipc_printf() also passes the output to printk()
+ */
+
+struct print_buf {
+	char *buf;
+	u32 size;
+	char *crs;
+	int echo;
+};
+
+#define TIPC_PB_MIN_SIZE 64	/* minimum size for a print buffer's array */
+#define TIPC_PB_MAX_STR 512	/* max printable string (with trailing NUL) */
+
+void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size);
+int  tipc_printbuf_validate(struct print_buf *pb);
+
+int tipc_log_resize(int log_size);
+
+struct sk_buff *tipc_log_resize_cmd(const void *req_tlv_area,
+				    int req_tlv_space);
+struct sk_buff *tipc_log_dump(void);
+
+#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
new file mode 100644
index 00000000..03e57bf9
--- /dev/null
+++ b/net/tipc/msg.c
@@ -0,0 +1,355 @@
+/*
+ * net/tipc/msg.c: TIPC message header routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "msg.h"
+
+u32 tipc_msg_tot_importance(struct tipc_msg *m)
+{
+	if (unlikely(!msg_isdata(m))) {
+		/* a first fragment is ranked by the message it wraps */
+		if (msg_user(m) == MSG_FRAGMENTER && msg_type(m) == FIRST_FRAGMENT)
+			m = msg_get_wrapped(m);
+		return msg_importance(m);
+	}
+	if (unlikely(msg_orignode(m) != tipc_own_addr))
+		return msg_importance(m) + 4;	/* data from another node */
+	return msg_importance(m);
+}
+
+
+void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type,
+		   u32 hsize, u32 destnode)
+{
+	memset(m, 0, hsize);	/* start from an all-zero header */
+	msg_set_version(m);
+	msg_set_user(m, user);
+	msg_set_type(m, type);
+	msg_set_hdr_sz(m, hsize);
+	msg_set_size(m, hsize);
+	msg_set_prevnode(m, tipc_own_addr);
+	if (msg_short(m))
+		return;		/* 24-byte header ends before words 6 and 7 */
+	msg_set_orignode(m, tipc_own_addr);
+	msg_set_destnode(m, destnode);
+}
+
+/**
+ * tipc_msg_build - create message using specified header and data
+ *
+ * Note: Caller must not hold any locks in case copy_from_user() is interrupted!
+ * If header + data exceeds max_size, *buf is set to NULL and nothing is built.
+ * Returns message data size, or a negative errno (-ENOMEM, -EFAULT)
+ */
+
+int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect,
+		   u32 num_sect, unsigned int total_len,
+			    int max_size, int usrmem, struct sk_buff **buf)
+{
+	int dsz, sz, hsz, pos, res, cnt;
+
+	dsz = total_len;
+	pos = hsz = msg_hdr_sz(hdr);
+	sz = hsz + dsz;
+	msg_set_size(hdr, sz);
+	if (unlikely(sz > max_size)) {	/* too big: report size, build nothing */
+		*buf = NULL;
+		return dsz;
+	}
+
+	*buf = tipc_buf_acquire(sz);
+	if (!(*buf))
+		return -ENOMEM;
+	skb_copy_to_linear_data(*buf, hdr, hsz);
+	for (res = 1, cnt = 0; res && (cnt < num_sect); cnt++) {	/* stop at first failed copy */
+		if (likely(usrmem))
+			res = !copy_from_user((*buf)->data + pos,
+					      msg_sect[cnt].iov_base,
+					      msg_sect[cnt].iov_len);
+		else
+			skb_copy_to_linear_data_offset(*buf, pos,
+						       msg_sect[cnt].iov_base,
+						       msg_sect[cnt].iov_len);
+		pos += msg_sect[cnt].iov_len;	/* sections are packed back to back */
+	}
+	if (likely(res))
+		return dsz;
+
+	buf_discard(*buf);	/* a user copy failed: drop the partial message */
+	*buf = NULL;
+	return -EFAULT;
+}
+
+#ifdef CONFIG_TIPC_DEBUG
+
+void tipc_msg_dbg(struct print_buf *buf, struct tipc_msg *msg, const char *str)
+{
+	u32 usr = msg_user(msg);	/* selects which header fields get decoded */
+	tipc_printf(buf, KERN_DEBUG);
+	tipc_printf(buf, "%s", str);	/* prefix is data, never a format string */
+
+	switch (usr) {
+	case MSG_BUNDLER:
+		tipc_printf(buf, "BNDL::");
+		tipc_printf(buf, "MSGS(%u):", msg_msgcnt(msg));
+		break;
+	case BCAST_PROTOCOL:
+		tipc_printf(buf, "BCASTP::");
+		break;
+	case MSG_FRAGMENTER:
+		tipc_printf(buf, "FRAGM::");
+		switch (msg_type(msg)) {
+		case FIRST_FRAGMENT:
+			tipc_printf(buf, "FIRST:");
+			break;
+		case FRAGMENT:
+			tipc_printf(buf, "BODY:");
+			break;
+		case LAST_FRAGMENT:
+			tipc_printf(buf, "LAST:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN:%x", msg_type(msg));
+
+		}
+		tipc_printf(buf, "NO(%u/%u):", msg_long_msgno(msg),
+			    msg_fragm_no(msg));
+		break;
+	case TIPC_LOW_IMPORTANCE:
+	case TIPC_MEDIUM_IMPORTANCE:
+	case TIPC_HIGH_IMPORTANCE:
+	case TIPC_CRITICAL_IMPORTANCE:
+		tipc_printf(buf, "DAT%u:", msg_user(msg));
+		if (msg_short(msg)) {
+			tipc_printf(buf, "CON:");
+			break;
+		}
+		switch (msg_type(msg)) {
+		case TIPC_CONN_MSG:
+			tipc_printf(buf, "CON:");
+			break;
+		case TIPC_MCAST_MSG:
+			tipc_printf(buf, "MCST:");
+			break;
+		case TIPC_NAMED_MSG:
+			tipc_printf(buf, "NAM:");
+			break;
+		case TIPC_DIRECT_MSG:
+			tipc_printf(buf, "DIR:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE %u", msg_type(msg));
+		}
+		if (msg_reroute_cnt(msg))
+			tipc_printf(buf, "REROUTED(%u):",
+				    msg_reroute_cnt(msg));
+		break;
+	case NAME_DISTRIBUTOR:
+		tipc_printf(buf, "NMD::");
+		switch (msg_type(msg)) {
+		case PUBLICATION:
+			tipc_printf(buf, "PUBL(%u):", (msg_size(msg) - msg_hdr_sz(msg)) / 20);	/* Items */
+			break;
+		case WITHDRAWAL:
+			tipc_printf(buf, "WDRW:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN:%x", msg_type(msg));
+		}
+		if (msg_reroute_cnt(msg))
+			tipc_printf(buf, "REROUTED(%u):",
+				    msg_reroute_cnt(msg));
+		break;
+	case CONN_MANAGER:
+		tipc_printf(buf, "CONN_MNG:");
+		switch (msg_type(msg)) {
+		case CONN_PROBE:
+			tipc_printf(buf, "PROBE:");
+			break;
+		case CONN_PROBE_REPLY:
+			tipc_printf(buf, "PROBE_REPLY:");
+			break;
+		case CONN_ACK:
+			tipc_printf(buf, "CONN_ACK:");
+			tipc_printf(buf, "ACK(%u):", msg_msgcnt(msg));
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg));
+		}
+		if (msg_reroute_cnt(msg))
+			tipc_printf(buf, "REROUTED(%u):", msg_reroute_cnt(msg));
+		break;
+	case LINK_PROTOCOL:
+		switch (msg_type(msg)) {
+		case STATE_MSG:
+			tipc_printf(buf, "STATE:");
+			tipc_printf(buf, "%s:", msg_probe(msg) ? "PRB" : "");
+			tipc_printf(buf, "NXS(%u):", msg_next_sent(msg));
+			tipc_printf(buf, "GAP(%u):", msg_seq_gap(msg));
+			tipc_printf(buf, "LSTBC(%u):", msg_last_bcast(msg));
+			break;
+		case RESET_MSG:
+			tipc_printf(buf, "RESET:");
+			if (msg_size(msg) != msg_hdr_sz(msg))
+				tipc_printf(buf, "BEAR:%s:", msg_data(msg));
+			break;
+		case ACTIVATE_MSG:
+			tipc_printf(buf, "ACTIVATE:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg));
+		}
+		tipc_printf(buf, "PLANE(%c):", msg_net_plane(msg));
+		tipc_printf(buf, "SESS(%u):", msg_session(msg));
+		break;
+	case CHANGEOVER_PROTOCOL:
+		tipc_printf(buf, "TUNL:");
+		switch (msg_type(msg)) {
+		case DUPLICATE_MSG:
+			tipc_printf(buf, "DUPL:");
+			break;
+		case ORIGINAL_MSG:
+			tipc_printf(buf, "ORIG:");
+			tipc_printf(buf, "EXP(%u)", msg_msgcnt(msg));
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x", msg_type(msg));
+		}
+		break;
+	case LINK_CONFIG:
+		tipc_printf(buf, "CFG:");
+		switch (msg_type(msg)) {
+		case DSC_REQ_MSG:
+			tipc_printf(buf, "DSC_REQ:");
+			break;
+		case DSC_RESP_MSG:
+			tipc_printf(buf, "DSC_RESP:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN TYPE:%x:", msg_type(msg));
+			break;
+		}
+		break;
+	default:
+		tipc_printf(buf, "UNKNOWN USER:");
+	}
+
+	switch (usr) {
+	case CONN_MANAGER:
+	case TIPC_LOW_IMPORTANCE:
+	case TIPC_MEDIUM_IMPORTANCE:
+	case TIPC_HIGH_IMPORTANCE:
+	case TIPC_CRITICAL_IMPORTANCE:
+		switch (msg_errcode(msg)) {
+		case TIPC_OK:
+			break;
+		case TIPC_ERR_NO_NAME:
+			tipc_printf(buf, "NO_NAME:");
+			break;
+		case TIPC_ERR_NO_PORT:
+			tipc_printf(buf, "NO_PORT:");
+			break;
+		case TIPC_ERR_NO_NODE:
+			tipc_printf(buf, "NO_PROC:");
+			break;
+		case TIPC_ERR_OVERLOAD:
+			tipc_printf(buf, "OVERLOAD:");
+			break;
+		case TIPC_CONN_SHUTDOWN:
+			tipc_printf(buf, "SHUTDOWN:");
+			break;
+		default:
+			tipc_printf(buf, "UNKNOWN ERROR(%x):",
+				    msg_errcode(msg));
+		}
+	default:	/* the cases above fall through to here; nothing more to print */
+		break;
+	}
+
+	tipc_printf(buf, "HZ(%u):", msg_hdr_sz(msg));
+	tipc_printf(buf, "SZ(%u):", msg_size(msg));
+	tipc_printf(buf, "SQNO(%u):", msg_seqno(msg));
+
+	if (msg_non_seq(msg))
+		tipc_printf(buf, "NOSEQ:");
+	else
+		tipc_printf(buf, "ACK(%u):", msg_ack(msg));
+	tipc_printf(buf, "BACK(%u):", msg_bcast_ack(msg));
+	tipc_printf(buf, "PRND(%x)", msg_prevnode(msg));
+
+	if (msg_isdata(msg)) {
+		if (msg_named(msg)) {
+			tipc_printf(buf, "NTYP(%u):", msg_nametype(msg));
+			tipc_printf(buf, "NINST(%u)", msg_nameinst(msg));
+		}
+	}
+
+	if ((usr != LINK_PROTOCOL) && (usr != LINK_CONFIG) &&
+	    (usr != MSG_BUNDLER)) {
+		if (!msg_short(msg)) {
+			tipc_printf(buf, ":ORIG(%x:%u):",
+				    msg_orignode(msg), msg_origport(msg));
+			tipc_printf(buf, ":DEST(%x:%u):",
+				    msg_destnode(msg), msg_destport(msg));
+		} else {
+			tipc_printf(buf, ":OPRT(%u):", msg_origport(msg));
+			tipc_printf(buf, ":DPRT(%u):", msg_destport(msg));
+		}
+	}
+	if (msg_user(msg) == NAME_DISTRIBUTOR) {
+		tipc_printf(buf, ":ONOD(%x):", msg_orignode(msg));
+		tipc_printf(buf, ":DNOD(%x):", msg_destnode(msg));
+	}
+
+	if (msg_user(msg) ==  LINK_CONFIG) {
+		u32 *raw = (u32 *)msg;
+		struct tipc_media_addr *orig = (struct tipc_media_addr *)&raw[5];
+		tipc_printf(buf, ":DDOM(%x):", msg_dest_domain(msg));
+		tipc_printf(buf, ":NETID(%u):", msg_bc_netid(msg));
+		tipc_media_addr_printf(buf, orig);
+	}
+	if (msg_user(msg) == BCAST_PROTOCOL) {
+		tipc_printf(buf, "BCNACK:AFTER(%u):", msg_bcgap_after(msg));
+		tipc_printf(buf, "TO(%u):", msg_bcgap_to(msg));
+	}
+	tipc_printf(buf, "\n");
+	if ((usr == CHANGEOVER_PROTOCOL) && (msg_msgcnt(msg)))
+		tipc_msg_dbg(buf, msg_get_wrapped(msg), "      /");
+	if ((usr == MSG_FRAGMENTER) && (msg_type(msg) == FIRST_FRAGMENT))
+		tipc_msg_dbg(buf, msg_get_wrapped(msg), "      /");
+}
+
+#endif
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
new file mode 100644
index 00000000..84524547
--- /dev/null
+++ b/net/tipc/msg.h
@@ -0,0 +1,767 @@
+/*
+ * net/tipc/msg.h: Include file for TIPC message header routines
+ *
+ * Copyright (c) 2000-2007, Ericsson AB
+ * Copyright (c) 2005-2008, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_MSG_H
+#define _TIPC_MSG_H
+
+#include "bearer.h"
+
+/*
+ * Constants and routines used to read and write TIPC payload message headers
+ *
+ * Note: Some items are also used with TIPC internal message headers
+ */
+
+#define TIPC_VERSION              2
+
+/*
+ * Payload message users are defined in TIPC's public API:
+ * - TIPC_LOW_IMPORTANCE
+ * - TIPC_MEDIUM_IMPORTANCE
+ * - TIPC_HIGH_IMPORTANCE
+ * - TIPC_CRITICAL_IMPORTANCE
+ */
+
+/*
+ * Payload message types
+ */
+
+#define TIPC_CONN_MSG		0
+#define TIPC_MCAST_MSG		1
+#define TIPC_NAMED_MSG		2
+#define TIPC_DIRECT_MSG		3
+
+/*
+ * Message header sizes
+ */
+
+#define SHORT_H_SIZE              24	/* Connected, in-cluster messages */
+#define DIR_MSG_H_SIZE            32	/* Directly addressed messages */
+#define LONG_H_SIZE               40	/* Named messages */
+#define MCAST_H_SIZE              44	/* Multicast messages */
+#define INT_H_SIZE                40	/* Internal messages */
+#define MIN_H_SIZE                24	/* Smallest legal TIPC header size */
+#define MAX_H_SIZE                60	/* Largest possible TIPC header size */
+
+#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
+
+
+struct tipc_msg {
+	__be32 hdr[15];	/* big-endian header words; 15 * 4 bytes == MAX_H_SIZE */
+};
+
+
+static inline u32 msg_word(struct tipc_msg *m, u32 pos)
+{
+	return ntohl(*(m->hdr + pos));	/* header word in host byte order */
+}
+
+static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val)
+{
+	*(m->hdr + w) = htonl(val);	/* stored in network byte order */
+}
+
+static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask)
+{
+	return mask & (ntohl(m->hdr[w]) >> pos);
+}
+
+static inline void msg_set_bits(struct tipc_msg *m, u32 w,
+				u32 pos, u32 mask, u32 val)
+{
+	__be32 field_mask = htonl(mask << pos);
+	__be32 field_val = htonl((val & mask) << pos);
+
+	m->hdr[w] = (m->hdr[w] & ~field_mask) | field_val;
+}
+
+static inline void msg_swap_words(struct tipc_msg *msg, u32 a, u32 b)
+{
+	__be32 held = msg->hdr[b];
+
+	msg->hdr[b] = msg->hdr[a];
+	msg->hdr[a] = held;
+}
+
+/*
+ * Word 0
+ */
+
+static inline u32 msg_version(struct tipc_msg *m)
+{
+	return (msg_word(m, 0) >> 29) & 7;	/* word 0, bits 29-31 */
+}
+
+static inline void msg_set_version(struct tipc_msg *m)
+{
+	msg_set_bits(m, 0, 29, 7, TIPC_VERSION);	/* word 0, bits 29-31 */
+}
+
+static inline u32 msg_user(struct tipc_msg *m)
+{
+	return (msg_word(m, 0) >> 25) & 0xf;	/* word 0, bits 25-28 */
+}
+
+static inline u32 msg_isdata(struct tipc_msg *m)
+{
+	return !(msg_user(m) > TIPC_CRITICAL_IMPORTANCE);
+}
+
+static inline void msg_set_user(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 0, 25, 0xf, n);	/* word 0, bits 25-28 */
+}
+
+static inline u32 msg_importance(struct tipc_msg *m)
+{
+	return msg_user(m);	/* importance is read from the user field */
+}
+
+static inline void msg_set_importance(struct tipc_msg *m, u32 i)
+{
+	msg_set_bits(m, 0, 25, 0xf, i);	/* same field as msg_set_user() */
+}
+
+static inline u32 msg_hdr_sz(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 21, 0xf) * 4;	/* field counts 4-byte words */
+}
+
+static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 0, 21, 0xf, n / 4);	/* bytes -> 4-byte words */
+}
+
+static inline u32 msg_size(struct tipc_msg *m)
+{
+	return msg_word(m, 0) & 0x1ffff;	/* word 0, bits 0-16 */
+}
+
+static inline u32 msg_data_sz(struct tipc_msg *m)
+{
+	return msg_size(m) - (msg_bits(m, 0, 21, 0xf) << 2);
+}
+
+static inline int msg_non_seq(struct tipc_msg *m)
+{
+	return (msg_word(m, 0) >> 20) & 1;	/* word 0, bit 20 */
+}
+
+static inline void msg_set_non_seq(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 0, 20, 1, n);	/* word 0, bit 20 */
+}
+
+static inline int msg_dest_droppable(struct tipc_msg *m)
+{
+	return (msg_word(m, 0) >> 19) & 1;	/* word 0, bit 19 */
+}
+
+static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d)
+{
+	msg_set_bits(m, 0, 19, 1, d);	/* word 0, bit 19 */
+}
+
+static inline int msg_src_droppable(struct tipc_msg *m)
+{
+	return (msg_word(m, 0) >> 18) & 1;	/* word 0, bit 18 */
+}
+
+static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d)
+{
+	msg_set_bits(m, 0, 18, 1, d);	/* word 0, bit 18 */
+}
+
+static inline void msg_set_size(struct tipc_msg *m, u32 sz)
+{
+	msg_set_bits(m, 0, 0, 0x1ffff, sz);	/* masked: cannot spill past bit 16 */
+}
+
+
+/*
+ * Word 1
+ */
+
+static inline u32 msg_type(struct tipc_msg *m)
+{
+	return (msg_word(m, 1) >> 29) & 0x7;	/* word 1, bits 29-31 */
+}
+
+static inline void msg_set_type(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 1, 29, 0x7, n);	/* word 1, bits 29-31 */
+}
+
+static inline u32 msg_named(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_NAMED_MSG) ? 1 : 0;
+}
+
+static inline u32 msg_mcast(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_MCAST_MSG) ? 1 : 0;
+}
+
+static inline u32 msg_connected(struct tipc_msg *m)
+{
+	return (msg_type(m) == TIPC_CONN_MSG) ? 1 : 0;
+}
+
+static inline u32 msg_errcode(struct tipc_msg *m)
+{
+	return (msg_word(m, 1) >> 25) & 0xf;	/* word 1, bits 25-28 */
+}
+
+static inline void msg_set_errcode(struct tipc_msg *m, u32 err)
+{
+	msg_set_bits(m, 1, 25, 0xf, err);	/* word 1, bits 25-28 */
+}
+
+static inline u32 msg_reroute_cnt(struct tipc_msg *m)
+{
+	return (msg_word(m, 1) >> 21) & 0xf;	/* word 1, bits 21-24 */
+}
+
+static inline void msg_incr_reroute_cnt(struct tipc_msg *m)
+{
+	msg_set_bits(m, 1, 21, 0xf, 1 + msg_bits(m, 1, 21, 0xf));	/* 4-bit field */
+}
+
+static inline void msg_reset_reroute_cnt(struct tipc_msg *m)
+{
+	msg_set_bits(m, 1, 21, 0xf, 0);	/* word 1, bits 21-24 */
+}
+
+static inline u32 msg_lookup_scope(struct tipc_msg *m)
+{
+	return (msg_word(m, 1) >> 19) & 0x3;	/* word 1, bits 19-20 */
+}
+
+static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 1, 19, 0x3, n);	/* word 1, bits 19-20 */
+}
+
+static inline u32 msg_bcast_ack(struct tipc_msg *m)
+{
+	return msg_word(m, 1) & 0xffff;	/* word 1, bits 0-15 */
+}
+
+static inline void msg_set_bcast_ack(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 1, 0, 0xffff, n);	/* word 1, bits 0-15 */
+}
+
+
+/*
+ * Word 2
+ */
+
+static inline u32 msg_ack(struct tipc_msg *m)
+{
+	return (msg_word(m, 2) >> 16) & 0xffff;	/* word 2, bits 16-31 */
+}
+
+static inline void msg_set_ack(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 2, 16, 0xffff, n);	/* word 2, bits 16-31 */
+}
+
+static inline u32 msg_seqno(struct tipc_msg *m)
+{
+	return msg_word(m, 2) & 0xffff;	/* word 2, bits 0-15 */
+}
+
+static inline void msg_set_seqno(struct tipc_msg *m, u32 n)
+{
+	msg_set_bits(m, 2, 0, 0xffff, n);	/* word 2, bits 0-15 */
+}
+
+/*
+ * TIPC may utilize the "link ack #" and "link seq #" fields of a short
+ * message header to hold the destination node for the message, since the
+ * normal "dest node" field isn't present.  This cache is only referenced
+ * when required, so populating the cache of a longer message header is
+ * harmless (as long as the header has the two link sequence fields present).
+ *
+ * Note: Host byte order is OK here, since the info never goes off-card.
+ */
+
+static inline u32 msg_destnode_cache(struct tipc_msg *m)
+{
+	return m->hdr[2];	/* raw word 2, deliberately no byte swap */
+}
+
+static inline void msg_set_destnode_cache(struct tipc_msg *m, u32 dnode)
+{
+	m->hdr[2] = dnode;	/* overwrites the ack/seqno word, no byte swap */
+}
+
+/*
+ * Words 3-10
+ */
+
+
+/* Previous node: header word 3 */
+static inline u32 msg_prevnode(struct tipc_msg *m)
+{
+	const u32 word = 3;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_prevnode(struct tipc_msg *m, u32 a)
+{
+	const u32 word = 3;
+
+	msg_set_word(m, word, a);
+}
+
+/* Originating port: header word 4 */
+static inline u32 msg_origport(struct tipc_msg *m)
+{
+	const u32 word = 4;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_origport(struct tipc_msg *m, u32 p)
+{
+	const u32 word = 4;
+
+	msg_set_word(m, word, p);
+}
+
+/* Destination port: header word 5 */
+static inline u32 msg_destport(struct tipc_msg *m)
+{
+	const u32 word = 5;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_destport(struct tipc_msg *m, u32 p)
+{
+	const u32 word = 5;
+
+	msg_set_word(m, word, p);
+}
+
+/* Multicast network id: header word 5 (same word as the destination port) */
+static inline u32 msg_mc_netid(struct tipc_msg *m)
+{
+	const u32 word = 5;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p)
+{
+	const u32 word = 5;
+
+	msg_set_word(m, word, p);
+}
+
+/* Returns 1 when the header size is exactly 24 bytes, otherwise 0 */
+static inline int msg_short(struct tipc_msg *m)
+{
+	if (msg_hdr_sz(m) == 24)
+		return 1;
+	return 0;
+}
+
+/*
+ * Originating node: header word 6.  A short header has no such word, so the
+ * previous node (word 3) is reported instead.
+ */
+static inline u32 msg_orignode(struct tipc_msg *m)
+{
+	return likely(msg_short(m)) ? msg_prevnode(m) : msg_word(m, 6);
+}
+
+static inline void msg_set_orignode(struct tipc_msg *m, u32 a)
+{
+	const u32 word = 6;
+
+	msg_set_word(m, word, a);
+}
+
+/* Destination node: header word 7 */
+static inline u32 msg_destnode(struct tipc_msg *m)
+{
+	const u32 word = 7;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
+{
+	const u32 word = 7;
+
+	msg_set_word(m, word, a);
+}
+
+/*
+ * Returns non-zero when the message has a short header (word 7 is not
+ * consulted in that case) or when its destination node equals @d.
+ */
+static inline int msg_is_dest(struct tipc_msg *m, u32 d)
+{
+	if (msg_short(m))
+		return 1;
+	return msg_destnode(m) == d;
+}
+
+/* Name type: header word 8 */
+static inline u32 msg_nametype(struct tipc_msg *m)
+{
+	const u32 word = 8;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_nametype(struct tipc_msg *m, u32 n)
+{
+	const u32 word = 8;
+
+	msg_set_word(m, word, n);
+}
+
+/*
+ * Name instance / lower bound: header word 9.  "nameinst" and "namelower"
+ * are two names for the same word.
+ */
+static inline u32 msg_nameinst(struct tipc_msg *m)
+{
+	const u32 word = 9;
+
+	return msg_word(m, word);
+}
+
+static inline u32 msg_namelower(struct tipc_msg *m)
+{
+	u32 lower = msg_nameinst(m);
+
+	return lower;
+}
+
+static inline void msg_set_namelower(struct tipc_msg *m, u32 n)
+{
+	const u32 word = 9;
+
+	msg_set_word(m, word, n);
+}
+
+static inline void msg_set_nameinst(struct tipc_msg *m, u32 n)
+{
+	/* Same storage as the lower bound */
+	msg_set_namelower(m, n);
+}
+
+/* Name upper bound: header word 10 */
+static inline u32 msg_nameupper(struct tipc_msg *m)
+{
+	const u32 word = 10;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
+{
+	const u32 word = 10;
+
+	msg_set_word(m, word, n);
+}
+
+/* Address of the first byte that follows the header (msg_hdr_sz() bytes in) */
+static inline unchar *msg_data(struct tipc_msg *m)
+{
+	unchar *start = (unchar *)m;
+
+	return start + msg_hdr_sz(m);
+}
+
+/* Treat the data area of @m as another TIPC message header */
+static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m)
+{
+	unchar *payload = msg_data(m);
+
+	return (struct tipc_msg *)payload;
+}
+
+
+/*
+ * Constants and routines used to read and write TIPC internal message headers
+ */
+
+/*
+ * Internal message users
+ */
+
+#define  BCAST_PROTOCOL       5
+#define  MSG_BUNDLER          6
+#define  LINK_PROTOCOL        7
+#define  CONN_MANAGER         8
+#define  ROUTE_DISTRIBUTOR    9		/* obsoleted */
+#define  CHANGEOVER_PROTOCOL  10
+#define  NAME_DISTRIBUTOR     11
+#define  MSG_FRAGMENTER       12
+#define  LINK_CONFIG          13
+
+/*
+ *  Connection management protocol message types
+ */
+
+#define CONN_PROBE        0
+#define CONN_PROBE_REPLY  1
+#define CONN_ACK          2
+
+/*
+ * Name distributor message types
+ */
+
+#define PUBLICATION       0
+#define WITHDRAWAL        1
+
+/*
+ * Segmentation message types
+ */
+
+#define FIRST_FRAGMENT		0
+#define FRAGMENT		1
+#define LAST_FRAGMENT		2
+
+/*
+ * Link management protocol message types
+ */
+
+#define STATE_MSG		0
+#define RESET_MSG		1
+#define ACTIVATE_MSG		2
+
+/*
+ * Changeover tunnel message types
+ */
+#define DUPLICATE_MSG		0
+#define ORIGINAL_MSG		1
+
+/*
+ * Config protocol message types
+ */
+
+#define DSC_REQ_MSG		0
+#define DSC_RESP_MSG		1
+
+
+/*
+ * Word 1
+ */
+
+/* Sequence gap: header word 1, position 16, mask 0x1fff */
+static inline u32 msg_seq_gap(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0x1fff;
+
+	return msg_bits(m, 1, pos, mask);
+}
+
+static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0x1fff;
+
+	msg_set_bits(m, 1, pos, mask, n);
+}
+
+
+/*
+ * Word 2
+ */
+
+/* Destination domain: the whole of header word 2 */
+static inline u32 msg_dest_domain(struct tipc_msg *m)
+{
+	const u32 word = 2;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n)
+{
+	const u32 word = 2;
+
+	msg_set_word(m, word, n);
+}
+
+/* Broadcast gap "after": header word 2, position 16, mask 0xffff */
+static inline u32 msg_bcgap_after(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	return msg_bits(m, 2, pos, mask);
+}
+
+static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	msg_set_bits(m, 2, pos, mask, n);
+}
+
+/* Broadcast gap "to": header word 2, position 0, mask 0xffff */
+static inline u32 msg_bcgap_to(struct tipc_msg *m)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	return msg_bits(m, 2, pos, mask);
+}
+
+static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	msg_set_bits(m, 2, pos, mask, n);
+}
+
+
+/*
+ * Word 4
+ */
+
+/* Last broadcast number: header word 4, position 16, mask 0xffff */
+static inline u32 msg_last_bcast(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	return msg_bits(m, 4, pos, mask);
+}
+
+static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	msg_set_bits(m, 4, pos, mask, n);
+}
+
+
+/* Fragment number: header word 4, position 16, mask 0xffff */
+static inline u32 msg_fragm_no(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	return msg_bits(m, 4, pos, mask);
+}
+
+static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	msg_set_bits(m, 4, pos, mask, n);
+}
+
+
+/* Next-sent number: header word 4, position 0, mask 0xffff */
+static inline u32 msg_next_sent(struct tipc_msg *m)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	return msg_bits(m, 4, pos, mask);
+}
+
+static inline void msg_set_next_sent(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	msg_set_bits(m, 4, pos, mask, n);
+}
+
+
+/* Long message number: header word 4, position 0, mask 0xffff */
+static inline u32 msg_long_msgno(struct tipc_msg *m)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	return msg_bits(m, 4, pos, mask);
+}
+
+static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	msg_set_bits(m, 4, pos, mask, n);
+}
+
+/* Broadcast network id: the whole of header word 4 */
+static inline u32 msg_bc_netid(struct tipc_msg *m)
+{
+	const u32 word = 4;
+
+	return msg_word(m, word);
+}
+
+static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
+{
+	const u32 word = 4;
+
+	msg_set_word(m, word, id);
+}
+
+/* Link selector: header word 4, position 0, mask 1 (a single bit) */
+static inline u32 msg_link_selector(struct tipc_msg *m)
+{
+	const u32 pos = 0, mask = 1;
+
+	return msg_bits(m, 4, pos, mask);
+}
+
+/* Only the least significant bit of @n is stored */
+static inline void msg_set_link_selector(struct tipc_msg *m, u32 n)
+{
+	u32 selector = n & 1;
+
+	msg_set_bits(m, 4, 0, 1, selector);
+}
+
+/*
+ * Word 5
+ */
+
+/* Session number: header word 5, position 16, mask 0xffff */
+static inline u32 msg_session(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	return msg_bits(m, 5, pos, mask);
+}
+
+static inline void msg_set_session(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	msg_set_bits(m, 5, pos, mask, n);
+}
+
+/* Probe flag: header word 5, position 0, mask 1 (a single bit) */
+static inline u32 msg_probe(struct tipc_msg *m)
+{
+	const u32 pos = 0, mask = 1;
+
+	return msg_bits(m, 5, pos, mask);
+}
+
+/* Only the least significant bit of @val is stored */
+static inline void msg_set_probe(struct tipc_msg *m, u32 val)
+{
+	u32 flag = val & 1;
+
+	msg_set_bits(m, 5, 0, 1, flag);
+}
+
+/*
+ * Network plane: header word 5, position 1, mask 7.  The field holds the
+ * plane letter as an offset from 'A'.
+ */
+static inline char msg_net_plane(struct tipc_msg *m)
+{
+	const u32 pos = 1, mask = 7;
+
+	return msg_bits(m, 5, pos, mask) + 'A';
+}
+
+static inline void msg_set_net_plane(struct tipc_msg *m, char n)
+{
+	const u32 pos = 1, mask = 7;
+
+	msg_set_bits(m, 5, pos, mask, (n - 'A'));
+}
+
+/* Link priority: header word 5, position 4, mask 0x1f */
+static inline u32 msg_linkprio(struct tipc_msg *m)
+{
+	const u32 pos = 4, mask = 0x1f;
+
+	return msg_bits(m, 5, pos, mask);
+}
+
+static inline void msg_set_linkprio(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 4, mask = 0x1f;
+
+	msg_set_bits(m, 5, pos, mask, n);
+}
+
+/* Bearer id: header word 5, position 9, mask 0x7 */
+static inline u32 msg_bearer_id(struct tipc_msg *m)
+{
+	const u32 pos = 9, mask = 0x7;
+
+	return msg_bits(m, 5, pos, mask);
+}
+
+static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 9, mask = 0x7;
+
+	msg_set_bits(m, 5, pos, mask, n);
+}
+
+/* Redundant link flag: header word 5, position 12, mask 0x1 */
+static inline u32 msg_redundant_link(struct tipc_msg *m)
+{
+	const u32 pos = 12, mask = 0x1;
+
+	return msg_bits(m, 5, pos, mask);
+}
+
+static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r)
+{
+	const u32 pos = 12, mask = 0x1;
+
+	msg_set_bits(m, 5, pos, mask, r);
+}
+
+
+/*
+ * Word 9
+ */
+
+/* Message count: header word 9, position 16, mask 0xffff */
+static inline u32 msg_msgcnt(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	return msg_bits(m, 9, pos, mask);
+}
+
+static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	msg_set_bits(m, 9, pos, mask, n);
+}
+
+/* Broadcast tag: header word 9, position 16, mask 0xffff */
+static inline u32 msg_bcast_tag(struct tipc_msg *m)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	return msg_bits(m, 9, pos, mask);
+}
+
+static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 16, mask = 0xffff;
+
+	msg_set_bits(m, 9, pos, mask, n);
+}
+
+/*
+ * Maximum packet size: header word 9, position 16, mask 0xffff.  The field
+ * stores the size divided by 4; the getter multiplies it back.
+ */
+static inline u32 msg_max_pkt(struct tipc_msg *m)
+{
+	u32 quads = msg_bits(m, 9, 16, 0xffff);
+
+	return quads * 4;
+}
+
+static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
+{
+	u32 quads = n / 4;
+
+	msg_set_bits(m, 9, 16, 0xffff, quads);
+}
+
+/* Link tolerance: header word 9, position 0, mask 0xffff */
+static inline u32 msg_link_tolerance(struct tipc_msg *m)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	return msg_bits(m, 9, pos, mask);
+}
+
+static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
+{
+	const u32 pos = 0, mask = 0xffff;
+
+	msg_set_bits(m, 9, pos, mask, n);
+}
+
+u32 tipc_msg_tot_importance(struct tipc_msg *m);
+void tipc_msg_init(struct tipc_msg *m, u32 user, u32 type,
+			    u32 hsize, u32 destnode);
+int tipc_msg_build(struct tipc_msg *hdr, struct iovec const *msg_sect,
+		   u32 num_sect, unsigned int total_len,
+			    int max_size, int usrmem, struct sk_buff **buf);
+
+/*
+ * Copy a media address into / out of the message, starting at the sixth
+ * int-sized slot of the message (index 5 when viewed as an int array).
+ */
+static inline void msg_set_media_addr(struct tipc_msg *m, struct tipc_media_addr *a)
+{
+	int *words = (int *)m;
+
+	memcpy(&words[5], a, sizeof(*a));
+}
+
+static inline void msg_get_media_addr(struct tipc_msg *m, struct tipc_media_addr *a)
+{
+	int *words = (int *)m;
+
+	memcpy(a, &words[5], sizeof(*a));
+}
+
+#endif
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
new file mode 100644
index 00000000..80025a1b
--- /dev/null
+++ b/net/tipc/name_distr.c
@@ -0,0 +1,319 @@
+/*
+ * net/tipc/name_distr.c: TIPC name distribution code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "link.h"
+#include "name_distr.h"
+
+#define ITEM_SIZE sizeof(struct distr_item)
+
+/**
+ * struct distr_item - publication info distributed to other nodes
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @ref: publishing port reference
+ * @key: publication key
+ *
+ * ===> All fields are stored in network byte order. <===
+ *
+ * First 3 fields identify (name or) name sequence being published.
+ * Reference field uniquely identifies port that published name sequence.
+ * Key field uniquely identifies publication, in the event a port has
+ * multiple publications of the same name sequence.
+ *
+ * Note: There is no field that identifies the publishing node because it is
+ * the same for all items contained within a publication message.
+ *
+ * Items are filled in by publ_to_item() using htonl() and decoded field by
+ * field with ntohl() in tipc_named_recv().  ITEM_SIZE is the size of one item;
+ * the data area of a name distribution message is handled as an array of
+ * these items.
+ */
+
+struct distr_item {
+	__be32 type;
+	__be32 lower;
+	__be32 upper;
+	__be32 ref;
+	__be32 key;
+};
+
+/**
+ * List of externally visible publications by this node --
+ * that is, all publications having scope > TIPC_NODE_SCOPE.
+ */
+
+static LIST_HEAD(publ_root);
+static u32 publ_cnt;
+
+/**
+ * publ_to_item - encode a publication as a distribution item
+ * @i: item to fill in (every field is overwritten)
+ * @p: publication supplying the values
+ *
+ * Each field is converted to network byte order with htonl().
+ */
+
+static void publ_to_item(struct distr_item *i, struct publication *p)
+{
+	*i = (struct distr_item) {
+		.type	= htonl(p->type),
+		.lower	= htonl(p->lower),
+		.upper	= htonl(p->upper),
+		.ref	= htonl(p->ref),
+		.key	= htonl(p->key),
+	};
+}
+
+/**
+ * named_prepare_buf - allocate & initialize a name distribution message
+ * @type: message type passed to tipc_msg_init() (PUBLICATION or WITHDRAWAL)
+ * @size: number of bytes of item data that will follow the header
+ * @dest: destination node passed to tipc_msg_init()
+ *
+ * Returns the new buffer, with the message size already set to
+ * LONG_H_SIZE + @size, or NULL if no buffer could be acquired.
+ */
+
+static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest)
+{
+	struct sk_buff *buf = tipc_buf_acquire(LONG_H_SIZE + size);
+	struct tipc_msg *hdr;
+
+	if (!buf)
+		return NULL;
+
+	hdr = buf_msg(buf);
+	tipc_msg_init(hdr, NAME_DISTRIBUTOR, type, LONG_H_SIZE, dest);
+	msg_set_size(hdr, LONG_H_SIZE + size);
+	return buf;
+}
+
+/**
+ * named_cluster_distribute - send a copy of a message to each reachable node
+ * @buf: message to distribute; it is always discarded before returning
+ *
+ * Every node on tipc_node_list that has active links is sent its own copy,
+ * with the destination node of the copy set to that node's address.
+ * Distribution stops at the first node for which the copy cannot be allocated.
+ */
+
+static void named_cluster_distribute(struct sk_buff *buf)
+{
+	struct tipc_node *node;
+	struct sk_buff *copy;
+
+	list_for_each_entry(node, &tipc_node_list, list) {
+		if (!tipc_node_active_links(node))
+			continue;
+
+		copy = skb_copy(buf, GFP_ATOMIC);
+		if (copy == NULL)
+			break;
+
+		msg_set_destnode(buf_msg(copy), node->addr);
+		tipc_link_send(copy, node->addr, node->addr);
+	}
+
+	buf_discard(buf);
+}
+
+/**
+ * tipc_named_publish - tell other nodes about a new publication by this node
+ * @publ: publication to announce
+ *
+ * The publication is appended to the local list (and counted) first, so it
+ * stays recorded even if the announcement message cannot be allocated.
+ */
+
+void tipc_named_publish(struct publication *publ)
+{
+	struct sk_buff *buf;
+
+	list_add_tail(&publ->local_list, &publ_root);
+	publ_cnt++;
+
+	buf = named_prepare_buf(PUBLICATION, ITEM_SIZE, 0);
+	if (buf == NULL) {
+		warn("Publication distribution failure\n");
+		return;
+	}
+
+	/* The message carries exactly one item, at the start of its data */
+	publ_to_item((struct distr_item *)msg_data(buf_msg(buf)), publ);
+	named_cluster_distribute(buf);
+}
+
+/**
+ * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node
+ * @publ: publication being withdrawn
+ *
+ * The publication is unlinked from the local list (and uncounted) first, so
+ * that happens even if the withdrawal message cannot be allocated.
+ */
+
+void tipc_named_withdraw(struct publication *publ)
+{
+	struct sk_buff *buf;
+
+	list_del(&publ->local_list);
+	publ_cnt--;
+
+	buf = named_prepare_buf(WITHDRAWAL, ITEM_SIZE, 0);
+	if (buf == NULL) {
+		warn("Withdrawal distribution failure\n");
+		return;
+	}
+
+	/* The message carries exactly one item, at the start of its data */
+	publ_to_item((struct distr_item *)msg_data(buf_msg(buf)), publ);
+	named_cluster_distribute(buf);
+}
+
+/**
+ * tipc_named_node_up - tell specified node about all publications by this node
+ * @node: address of the node to update; used as the message destination, as
+ *        both address arguments of tipc_link_send(), and as the link selector
+ *
+ * Walks publ_root with tipc_nametbl_lock held for reading and packs one
+ * distr_item per publication into PUBLICATION messages.  A message is sent
+ * as soon as it has been filled; a new one is started for the next item.
+ * If a message cannot be allocated the walk stops; messages already sent
+ * are not recalled.
+ */
+
+void tipc_named_node_up(unsigned long node)
+{
+	struct publication *publ;
+	struct distr_item *item = NULL;
+	struct sk_buff *buf = NULL;
+	u32 left = 0;
+	u32 rest;
+	u32 max_item_buf;
+
+	read_lock_bh(&tipc_nametbl_lock);
+	/* Largest multiple of ITEM_SIZE not exceeding TIPC_MAX_USER_MSG_SIZE */
+	max_item_buf = TIPC_MAX_USER_MSG_SIZE / ITEM_SIZE;
+	max_item_buf *= ITEM_SIZE;
+	/* Bytes of item data not yet assigned to a message */
+	rest = publ_cnt * ITEM_SIZE;
+
+	list_for_each_entry(publ, &publ_root, local_list) {
+		if (!buf) {
+			/* Start a message sized for as much of "rest" as fits */
+			left = (rest <= max_item_buf) ? rest : max_item_buf;
+			rest -= left;
+			buf = named_prepare_buf(PUBLICATION, left, node);
+			if (!buf) {
+				warn("Bulk publication distribution failure\n");
+				goto exit;
+			}
+			item = (struct distr_item *)msg_data(buf_msg(buf));
+		}
+		publ_to_item(item, publ);
+		item++;
+		left -= ITEM_SIZE;
+		if (!left) {
+			/* Current message is full: send it and start afresh */
+			msg_set_link_selector(buf_msg(buf), node);
+			tipc_link_send(buf, node, node);
+			buf = NULL;
+		}
+	}
+exit:
+	read_unlock_bh(&tipc_nametbl_lock);
+}
+
+/**
+ * named_purge_publ - remove publication associated with a failed node
+ * @publ: publication issued by the failed node
+ *
+ * Registered as the node subscription handler by tipc_named_recv(), so it is
+ * invoked for each publication issued by a newly failed node.  It removes the
+ * publication from the name table and frees whatever the table returned.
+ *
+ * In rare cases the link may have come back up again by the time this runs,
+ * leaving two items that represent the same publication.  The key of this
+ * item is therefore nudged before the removal so that it can be told apart
+ * from the other one.
+ */
+
+static void named_purge_publ(struct publication *publ)
+{
+	struct publication *removed;
+
+	write_lock_bh(&tipc_nametbl_lock);
+	publ->key += 1222345;
+	removed = tipc_nametbl_remove_publ(publ->type, publ->lower,
+					   publ->node, publ->ref, publ->key);
+	if (removed != NULL)
+		tipc_nodesub_unsubscribe(&removed->subscr);
+	write_unlock_bh(&tipc_nametbl_lock);
+
+	/* Anything other than @publ coming back means the purge went wrong */
+	if (removed != publ) {
+		err("Unable to remove publication from failed node\n"
+		    "(type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n",
+		    publ->type, publ->lower, publ->node, publ->ref, publ->key);
+	}
+
+	kfree(removed);
+}
+
+/**
+ * tipc_named_recv - process name table update message sent by another node
+ * @buf: received message; always discarded before returning
+ *
+ * The data area is treated as an array of distr_item entries (any trailing
+ * partial item is ignored).  Depending on the message type each item is
+ * either inserted into or removed from the name table, attributed to the
+ * message's originating node.  The whole message is processed with
+ * tipc_nametbl_lock held for writing.
+ */
+
+void tipc_named_recv(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct distr_item *item = (struct distr_item *)msg_data(msg);
+	u32 count = msg_data_sz(msg) / ITEM_SIZE;
+	struct publication *publ;
+
+	write_lock_bh(&tipc_nametbl_lock);
+	for (; count; count--, item++) {
+		switch (msg_type(msg)) {
+		case PUBLICATION:
+			publ = tipc_nametbl_insert_publ(ntohl(item->type),
+							ntohl(item->lower),
+							ntohl(item->upper),
+							TIPC_CLUSTER_SCOPE,
+							msg_orignode(msg),
+							ntohl(item->ref),
+							ntohl(item->key));
+			if (!publ)
+				break;
+
+			/* Have the entry purged if the publishing node fails */
+			tipc_nodesub_subscribe(&publ->subscr,
+					       msg_orignode(msg),
+					       publ,
+					       (net_ev_handler)
+					       named_purge_publ);
+			break;
+
+		case WITHDRAWAL:
+			publ = tipc_nametbl_remove_publ(ntohl(item->type),
+							ntohl(item->lower),
+							msg_orignode(msg),
+							ntohl(item->ref),
+							ntohl(item->key));
+			if (!publ) {
+				err("Unable to remove publication by node 0x%x\n"
+				    "(type=%u, lower=%u, ref=%u, key=%u)\n",
+				    msg_orignode(msg),
+				    ntohl(item->type), ntohl(item->lower),
+				    ntohl(item->ref), ntohl(item->key));
+				break;
+			}
+
+			tipc_nodesub_unsubscribe(&publ->subscr);
+			kfree(publ);
+			break;
+
+		default:
+			warn("Unrecognized name table message received\n");
+			break;
+		}
+	}
+	write_unlock_bh(&tipc_nametbl_lock);
+	buf_discard(buf);
+}
+
+/**
+ * tipc_named_reinit - re-initialize local publication list
+ *
+ * This routine is called whenever TIPC networking is (re)enabled.
+ * Every publication on the local list (those with "cluster" or "zone" scope)
+ * has its node field rewritten to the node's current network address.
+ * The walk stops at the first entry that already carries the current address,
+ * so nothing is touched when the address has not changed.
+ */
+
+void tipc_named_reinit(void)
+{
+	struct publication *entry;
+
+	write_lock_bh(&tipc_nametbl_lock);
+	list_for_each_entry(entry, &publ_root, local_list) {
+		if (entry->node != tipc_own_addr)
+			entry->node = tipc_own_addr;
+		else
+			break;
+	}
+	write_unlock_bh(&tipc_nametbl_lock);
+}
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
new file mode 100644
index 00000000..1e41bdd4
--- /dev/null
+++ b/net/tipc/name_distr.h
@@ -0,0 +1,48 @@
+/*
+ * net/tipc/name_distr.h: Include file for TIPC name distribution code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NAME_DISTR_H
+#define _TIPC_NAME_DISTR_H
+
+#include "name_table.h"
+
+/* Record a new local publication and announce it to nodes with active links */
+void tipc_named_publish(struct publication *publ);
+/* Drop a local publication and announce its withdrawal to those nodes */
+void tipc_named_withdraw(struct publication *publ);
+/* Send all recorded local publications to the given node */
+void tipc_named_node_up(unsigned long node);
+/* Apply a received PUBLICATION/WITHDRAWAL message; consumes the buffer */
+void tipc_named_recv(struct sk_buff *buf);
+/* Rewrite the node address stored in the recorded local publications */
+void tipc_named_reinit(void);
+
+#endif
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
new file mode 100644
index 00000000..205ed4a4
--- /dev/null
+++ b/net/tipc/name_table.c
@@ -0,0 +1,1037 @@
+/*
+ * net/tipc/name_table.c: TIPC name table code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2004-2008, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "name_table.h"
+#include "name_distr.h"
+#include "subscr.h"
+#include "port.h"
+
+static int tipc_nametbl_size = 1024;		/* must be a power of 2 */
+
+/**
+ * struct sub_seq - container for all published instances of a name sequence
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @node_list: circular list of publications made by own node
+ * @cluster_list: circular list of publications made by own cluster
+ * @zone_list: circular list of publications made by own zone
+ * @node_list_size: number of entries in "node_list"
+ * @cluster_list_size: number of entries in "cluster_list"
+ * @zone_list_size: number of entries in "zone_list"
+ *
+ * Note: The zone list always contains at least one entry, since all
+ *       publications of the associated name sequence belong to it.
+ *       (The cluster and node lists may be empty.)
+ *
+ * Each list is a singly linked ring threaded through the publication's
+ * node_list_next / cluster_list_next / zone_list_next pointer; the pointer
+ * stored here refers to one element of the ring and is NULL when the ring
+ * is empty.  A single publication in a ring points to itself.
+ */
+
+struct sub_seq {
+	u32 lower;
+	u32 upper;
+	struct publication *node_list;
+	struct publication *cluster_list;
+	struct publication *zone_list;
+	u32 node_list_size;
+	u32 cluster_list_size;
+	u32 zone_list_size;
+};
+
+/**
+ * struct name_seq - container for all published instances of a name type
+ * @type: 32 bit 'type' value for name sequence
+ * @sseqs: pointer to dynamically-sized array of sub-sequences of this 'type';
+ *         sub-sequences are sorted in ascending order
+ * @alloc: number of sub-sequence entries allocated in the array (the array
+ *         is doubled when an insertion finds it full)
+ * @first_free: array index of first unused sub-sequence entry, i.e. the
+ *              number of entries currently in use
+ * @ns_list: links to adjacent name sequences in hash chain
+ * @subscriptions: list of subscriptions for this 'type'
+ * @lock: spinlock controlling access to publication lists of all sub-sequences
+ */
+
+struct name_seq {
+	u32 type;
+	struct sub_seq *sseqs;
+	u32 alloc;
+	u32 first_free;
+	struct hlist_node ns_list;
+	struct list_head subscriptions;
+	spinlock_t lock;
+};
+
+/**
+ * struct name_table - table containing all existing port name publications
+ * @types: pointer to fixed-sized array of name sequence lists,
+ *         accessed via hashing on 'type'; name sequence lists are *not* sorted
+ * @local_publ_count: number of publications issued by this node
+ *
+ * NOTE(review): hash() reduces a value to the range 0..tipc_nametbl_size-1;
+ * presumably @types has tipc_nametbl_size entries indexed by that result --
+ * confirm against the table initialisation and lookup code.
+ */
+
+struct name_table {
+	struct hlist_head *types;
+	u32 local_publ_count;
+};
+
+static struct name_table table;
+static atomic_t rsv_publ_ok = ATOMIC_INIT(0);
+DEFINE_RWLOCK(tipc_nametbl_lock);
+
+
+/*
+ * hash - keep the low-order bits of @x selected by tipc_nametbl_size - 1
+ * (tipc_nametbl_size must be a power of 2 for this to be a proper mask)
+ */
+static int hash(int x)
+{
+	const int mask = tipc_nametbl_size - 1;
+
+	return x & mask;
+}
+
+/**
+ * publ_create - create a publication structure
+ *
+ * Allocates a zeroed publication with GFP_ATOMIC, copies the supplied values
+ * into it and initialises its list heads.  Returns NULL (after logging a
+ * warning) if the allocation fails.
+ */
+
+static struct publication *publ_create(u32 type, u32 lower, u32 upper,
+				       u32 scope, u32 node, u32 port_ref,
+				       u32 key)
+{
+	struct publication *publ;
+
+	publ = kzalloc(sizeof(*publ), GFP_ATOMIC);
+	if (!publ) {
+		warn("Publication creation failure, no memory\n");
+		return NULL;
+	}
+
+	/* Start with every list membership empty */
+	INIT_LIST_HEAD(&publ->local_list);
+	INIT_LIST_HEAD(&publ->pport_list);
+	INIT_LIST_HEAD(&publ->subscr.nodesub_list);
+
+	/* Published name sequence */
+	publ->type = type;
+	publ->lower = lower;
+	publ->upper = upper;
+
+	/* Publisher and visibility */
+	publ->scope = scope;
+	publ->node = node;
+	publ->ref = port_ref;
+	publ->key = key;
+
+	return publ;
+}
+
+/**
+ * tipc_subseq_alloc - allocate a specified number of sub-sequence structures
+ *
+ * Returns a zeroed array of @cnt entries allocated with GFP_ATOMIC, or NULL
+ * if the allocation fails.
+ */
+
+static struct sub_seq *tipc_subseq_alloc(u32 cnt)
+{
+	return kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC);
+}
+
+/**
+ * tipc_nameseq_create - create a name sequence structure for the specified 'type'
+ *
+ * Allocates the name sequence together with a one-entry, zeroed sub-sequence
+ * array and adds the new name sequence to the head of @seq_head.
+ * Returns NULL (after logging a warning) if either allocation fails.
+ */
+
+static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_head)
+{
+	struct name_seq *nseq = kzalloc(sizeof(*nseq), GFP_ATOMIC);
+	struct sub_seq *sseq = tipc_subseq_alloc(1);
+
+	if (nseq && sseq) {
+		spin_lock_init(&nseq->lock);
+		nseq->type = type;
+		nseq->alloc = 1;
+		nseq->sseqs = sseq;
+		INIT_LIST_HEAD(&nseq->subscriptions);
+		INIT_HLIST_NODE(&nseq->ns_list);
+		hlist_add_head(&nseq->ns_list, seq_head);
+		return nseq;
+	}
+
+	/* Either allocation may have succeeded; kfree(NULL) is harmless */
+	warn("Name sequence creation failed, no memory\n");
+	kfree(nseq);
+	kfree(sseq);
+	return NULL;
+}
+
+/**
+ * nameseq_find_subseq - find sub-sequence (if any) matching a name instance
+ *
+ * Binary search over the in-use part of the sub-sequence array (this path is
+ * very time-critical).  Returns the entry whose [lower, upper] range contains
+ * @instance, or NULL if there is none.
+ */
+
+static struct sub_seq *nameseq_find_subseq(struct name_seq *nseq,
+					   u32 instance)
+{
+	struct sub_seq *sseqs = nseq->sseqs;
+	int first = 0;
+	int last = nseq->first_free - 1;
+
+	while (first <= last) {
+		int probe = (first + last) / 2;
+		struct sub_seq *cand = &sseqs[probe];
+
+		if (instance < cand->lower)
+			last = probe - 1;
+		else if (instance > cand->upper)
+			first = probe + 1;
+		else
+			return cand;
+	}
+	return NULL;
+}
+
+/**
+ * nameseq_locate_subseq - determine position of name instance in sub-sequence
+ *
+ * Returns the array index of the sub-sequence whose range contains @instance.
+ * If no entry contains it, returns the index at which a new entry for it
+ * would have to be inserted.
+ *
+ * Note: Same binary search as nameseq_find_subseq(), differing only in what
+ * is returned.
+ */
+
+static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance)
+{
+	struct sub_seq *sseqs = nseq->sseqs;
+	int first = 0;
+	int last = nseq->first_free - 1;
+
+	while (first <= last) {
+		int probe = (first + last) / 2;
+		struct sub_seq *cand = &sseqs[probe];
+
+		if (instance < cand->lower)
+			last = probe - 1;
+		else if (instance > cand->upper)
+			first = probe + 1;
+		else
+			return probe;
+	}
+	return first;
+}
+
+/**
+ * tipc_nameseq_insert_publ - add a publication to a name sequence
+ * @nseq: name sequence to add the publication to
+ * @type: name type (used for the publication and in warnings)
+ * @lower: lower bound of the published range
+ * @upper: upper bound of the published range
+ * @scope: publication scope
+ * @node: publishing node
+ * @port: publishing port reference
+ * @key: publication key
+ *
+ * The range must either match an existing sub-sequence exactly or not
+ * overlap any existing sub-sequence; in the latter case a new sub-sequence
+ * is inserted at its sorted position, growing the array if it is full.
+ * The new publication is linked into the zone list and, when applicable,
+ * the cluster and node lists, and all subscriptions of the name sequence
+ * are given the chance to report the overlap.
+ *
+ * Returns the new publication, or NULL on overlap or allocation failure.
+ * On failure the name sequence is left as it was found: a sub-sequence that
+ * was inserted for this publication is removed again, so that no
+ * sub-sequence with an empty zone list is ever left in the array.
+ */
+
+static struct publication *tipc_nameseq_insert_publ(struct name_seq *nseq,
+						    u32 type, u32 lower, u32 upper,
+						    u32 scope, u32 node, u32 port, u32 key)
+{
+	struct subscription *s;
+	struct subscription *st;
+	struct publication *publ;
+	struct sub_seq *sseq;
+	int created_subseq = 0;
+
+	sseq = nameseq_find_subseq(nseq, lower);
+	if (sseq) {
+
+		/* Lower end overlaps existing entry => need an exact match */
+
+		if ((sseq->lower != lower) || (sseq->upper != upper)) {
+			warn("Cannot publish {%u,%u,%u}, overlap error\n",
+			     type, lower, upper);
+			return NULL;
+		}
+	} else {
+		u32 inspos;
+		struct sub_seq *freesseq;
+
+		/* Find where lower end should be inserted */
+
+		inspos = nameseq_locate_subseq(nseq, lower);
+
+		/* Fail if upper end overlaps into an existing entry */
+
+		if ((inspos < nseq->first_free) &&
+		    (upper >= nseq->sseqs[inspos].lower)) {
+			warn("Cannot publish {%u,%u,%u}, overlap error\n",
+			     type, lower, upper);
+			return NULL;
+		}
+
+		/* Ensure there is space for new sub-sequence */
+
+		if (nseq->first_free == nseq->alloc) {
+			struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2);
+
+			if (!sseqs) {
+				warn("Cannot publish {%u,%u,%u}, no memory\n",
+				     type, lower, upper);
+				return NULL;
+			}
+			memcpy(sseqs, nseq->sseqs,
+			       nseq->alloc * sizeof(struct sub_seq));
+			kfree(nseq->sseqs);
+			nseq->sseqs = sseqs;
+			nseq->alloc *= 2;
+		}
+
+		/* Insert new sub-sequence */
+
+		sseq = &nseq->sseqs[inspos];
+		freesseq = &nseq->sseqs[nseq->first_free];
+		memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof(*sseq));
+		memset(sseq, 0, sizeof(*sseq));
+		nseq->first_free++;
+		sseq->lower = lower;
+		sseq->upper = upper;
+		created_subseq = 1;
+	}
+
+	/* Insert a publication: */
+
+	publ = publ_create(type, lower, upper, scope, node, port, key);
+	if (!publ) {
+		if (created_subseq) {
+			struct sub_seq *endsseq;
+
+			/*
+			 * Undo the insertion above: an entry without any
+			 * publication must not stay in the array, because
+			 * removal code dereferences zone_list unconditionally.
+			 * Close the gap by shifting the following entries
+			 * down by one.
+			 */
+
+			endsseq = &nseq->sseqs[nseq->first_free--];
+			memmove(sseq, sseq + 1,
+				(endsseq - (sseq + 1)) * sizeof(*sseq));
+		}
+		return NULL;
+	}
+
+	sseq->zone_list_size++;
+	if (!sseq->zone_list)
+		sseq->zone_list = publ->zone_list_next = publ;
+	else {
+		publ->zone_list_next = sseq->zone_list->zone_list_next;
+		sseq->zone_list->zone_list_next = publ;
+	}
+
+	if (in_own_cluster(node)) {
+		sseq->cluster_list_size++;
+		if (!sseq->cluster_list)
+			sseq->cluster_list = publ->cluster_list_next = publ;
+		else {
+			publ->cluster_list_next =
+			sseq->cluster_list->cluster_list_next;
+			sseq->cluster_list->cluster_list_next = publ;
+		}
+	}
+
+	if (node == tipc_own_addr) {
+		sseq->node_list_size++;
+		if (!sseq->node_list)
+			sseq->node_list = publ->node_list_next = publ;
+		else {
+			publ->node_list_next = sseq->node_list->node_list_next;
+			sseq->node_list->node_list_next = publ;
+		}
+	}
+
+	/*
+	 * Any subscriptions waiting for notification?
+	 */
+	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
+		tipc_subscr_report_overlap(s,
+					   publ->lower,
+					   publ->upper,
+					   TIPC_PUBLISHED,
+					   publ->ref,
+					   publ->node,
+					   created_subseq);
+	}
+	return publ;
+}
+
+/**
+ * tipc_nameseq_remove_publ - unlink a publication from a name sequence
+ *
+ * Looks up the sub-sequence for 'inst', unlinks the publication matching
+ * 'key', 'ref' and 'node' from its zone/cluster/node lists, offers a
+ * TIPC_WITHDRAWN report to every subscription, and returns it (not freed).
+ * NOTE: TIPC may be asked to remove a publication that is not in the name
+ * table, e.g. when another node's publication overlapped an existing name
+ * sequence and so was never recorded.  Such a failed withdraw just returns
+ * NULL; the caller issues any error or warning messages for the problem.
+ */
+
+static struct publication *tipc_nameseq_remove_publ(struct name_seq *nseq, u32 inst,
+						    u32 node, u32 ref, u32 key)
+{
+	struct publication *publ;
+	struct publication *curr;
+	struct publication *prev;
+	struct sub_seq *sseq = nameseq_find_subseq(nseq, inst);
+	struct sub_seq *free;
+	struct subscription *s, *st;
+	int removed_subseq = 0;
+
+	if (!sseq)
+		return NULL;
+
+	/* Remove publication from zone scope list */
+	/* (a listed entry whose node is 0 is not compared against 'node') */
+	prev = sseq->zone_list;
+	publ = sseq->zone_list->zone_list_next;
+	while ((publ->key != key) || (publ->ref != ref) ||
+	       (publ->node && (publ->node != node))) {
+		prev = publ;
+		publ = publ->zone_list_next;
+		if (prev == sseq->zone_list) {
+
+			/* Prevent endless loop if publication not found */
+
+			return NULL;
+		}
+	}
+	if (publ != sseq->zone_list)
+		prev->zone_list_next = publ->zone_list_next;
+	else if (publ->zone_list_next != publ) {
+		prev->zone_list_next = publ->zone_list_next;
+		sseq->zone_list = publ->zone_list_next;
+	} else {
+		sseq->zone_list = NULL;
+	}
+	sseq->zone_list_size--;
+
+	/* Remove publication from cluster scope list, if present */
+
+	if (in_own_cluster(node)) {
+		prev = sseq->cluster_list;
+		curr = sseq->cluster_list->cluster_list_next;
+		while (curr != publ) {
+			prev = curr;
+			curr = curr->cluster_list_next;
+			if (prev == sseq->cluster_list) {
+
+				/* Prevent endless loop for malformed list */
+
+				err("Unable to de-list cluster publication\n"
+				    "({%u,%u}, node=0x%x, ref=%u, key=%u)\n",
+				    publ->type, publ->lower, publ->node,
+				    publ->ref, publ->key);
+				goto end_cluster;
+			}
+		}
+		if (publ != sseq->cluster_list)
+			prev->cluster_list_next = publ->cluster_list_next;
+		else if (publ->cluster_list_next != publ) {
+			prev->cluster_list_next = publ->cluster_list_next;
+			sseq->cluster_list = publ->cluster_list_next;
+		} else {
+			sseq->cluster_list = NULL;
+		}
+		sseq->cluster_list_size--;
+	}
+end_cluster:
+
+	/* Remove publication from node scope list, if present */
+
+	if (node == tipc_own_addr) {
+		prev = sseq->node_list;
+		curr = sseq->node_list->node_list_next;
+		while (curr != publ) {
+			prev = curr;
+			curr = curr->node_list_next;
+			if (prev == sseq->node_list) {
+
+				/* Prevent endless loop for malformed list */
+
+				err("Unable to de-list node publication\n"
+				    "({%u,%u}, node=0x%x, ref=%u, key=%u)\n",
+				    publ->type, publ->lower, publ->node,
+				    publ->ref, publ->key);
+				goto end_node;
+			}
+		}
+		if (publ != sseq->node_list)
+			prev->node_list_next = publ->node_list_next;
+		else if (publ->node_list_next != publ) {
+			prev->node_list_next = publ->node_list_next;
+			sseq->node_list = publ->node_list_next;
+		} else {
+			sseq->node_list = NULL;
+		}
+		sseq->node_list_size--;
+	}
+end_node:
+
+	/* Contract subseq list if no more publications for that subseq */
+	/* ('free' is the old end of the used array; the tail moves down one) */
+	if (!sseq->zone_list) {
+		free = &nseq->sseqs[nseq->first_free--];
+		memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq));
+		removed_subseq = 1;
+	}
+
+	/* Notify any waiting subscriptions */
+
+	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
+		tipc_subscr_report_overlap(s,
+					   publ->lower,
+					   publ->upper,
+					   TIPC_WITHDRAWN,
+					   publ->ref,
+					   publ->node,
+					   removed_subseq);
+	}
+
+	return publ;
+}
+
+/**
+ * tipc_nameseq_subscribe: attach a subscription, then offer each zone-list
+ * publication of every overlapping sub-sequence to the overlap reporter
+ * (must_report is set only for the first publication of a sub-sequence)
+ */
+
+static void tipc_nameseq_subscribe(struct name_seq *nseq, struct subscription *s)
+{
+	struct sub_seq *sseq = nseq->sseqs;
+
+	list_add(&s->nameseq_list, &nseq->subscriptions);
+	/* Nothing to replay if the sub-sequence array is absent */
+	if (!sseq)
+		return;
+	/* Walk the used entries, i.e. those below index nseq->first_free */
+	while (sseq != &nseq->sseqs[nseq->first_free]) {
+		struct publication *zl = sseq->zone_list;
+		if (zl && tipc_subscr_overlap(s, sseq->lower, sseq->upper)) {
+			struct publication *crs = zl;
+			int must_report = 1;
+
+			do {
+				tipc_subscr_report_overlap(s,
+							   sseq->lower,
+							   sseq->upper,
+							   TIPC_PUBLISHED,
+							   crs->ref,
+							   crs->node,
+							   must_report);
+				must_report = 0;
+				crs = crs->zone_list_next;
+			} while (crs != zl);	/* list is circular: stop at start */
+		}
+		sseq++;
+	}
+}
+/* nametbl_find_seq - return the name sequence for 'type', or NULL if none */
+static struct name_seq *nametbl_find_seq(u32 type)
+{
+	struct hlist_head *seq_head;
+	struct hlist_node *seq_node;
+	struct name_seq *ns;
+
+	seq_head = &table.types[hash(type)];
+	hlist_for_each_entry(ns, seq_node, seq_head, ns_list) {
+		if (ns->type == type)
+			return ns;
+	}
+
+	return NULL;
+}
+/* tipc_nametbl_insert_publ - record a publication; NULL on bad range/failure */
+struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper,
+					     u32 scope, u32 node, u32 port, u32 key)
+{
+	struct name_seq *seq = nametbl_find_seq(type);
+
+	if (lower > upper) {
+		warn("Failed to publish illegal {%u,%u,%u}\n",
+		     type, lower, upper);
+		return NULL;
+	}
+	/* No name sequence for this type yet: create one in its hash chain */
+	if (!seq)
+		seq = tipc_nameseq_create(type, &table.types[hash(type)]);
+	if (!seq)
+		return NULL;
+	/* The result of the per-sequence insert is handed back unchanged */
+	return tipc_nameseq_insert_publ(seq, type, lower, upper,
+					scope, node, port, key);
+}
+/* tipc_nametbl_remove_publ - unlink a publication; NULL if it is not found */
+struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
+					     u32 node, u32 ref, u32 key)
+{
+	struct publication *publ;
+	struct name_seq *seq = nametbl_find_seq(type);
+
+	if (!seq)
+		return NULL;
+
+	publ = tipc_nameseq_remove_publ(seq, lower, node, ref, key);
+	/* Free the name sequence once it has no sub-sequences or subscribers */
+	if (!seq->first_free && list_empty(&seq->subscriptions)) {
+		hlist_del_init(&seq->ns_list);
+		kfree(seq->sseqs);
+		kfree(seq);
+	}
+	return publ;
+}
+
+/*
+ * tipc_nametbl_translate - translate name to port id (0 if nothing matches)
+ *
+ * Note: on entry 'destnode' is the search domain used during translation;
+ *       on exit it passes back the node address of the matching port (if any)
+ */
+
+u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *destnode)
+{
+	struct sub_seq *sseq;
+	struct publication *publ = NULL;
+	struct name_seq *seq;
+	u32 ref;
+
+	if (!tipc_in_scope(*destnode, tipc_own_addr))
+		return 0;
+
+	read_lock_bh(&tipc_nametbl_lock);
+	seq = nametbl_find_seq(type);
+	if (unlikely(!seq))
+		goto not_found;
+	sseq = nameseq_find_subseq(seq, instance);
+	if (unlikely(!sseq))
+		goto not_found;
+	spin_lock_bh(&seq->lock);
+	/* Held while a list head is read and advanced to the next entry */
+	/* Closest-First Algorithm: node list, then cluster list, then zone list */
+	if (likely(!*destnode)) {
+		publ = sseq->node_list;
+		if (publ) {
+			sseq->node_list = publ->node_list_next;
+found:			/* common exit for every successful lookup */
+			ref = publ->ref;
+			*destnode = publ->node;
+			spin_unlock_bh(&seq->lock);
+			read_unlock_bh(&tipc_nametbl_lock);
+			return ref;
+		}
+		publ = sseq->cluster_list;
+		if (publ) {
+			sseq->cluster_list = publ->cluster_list_next;
+			goto found;
+		}
+		publ = sseq->zone_list;
+		if (publ) {
+			sseq->zone_list = publ->zone_list_next;
+			goto found;
+		}
+	}
+
+	/* Round-Robin Algorithm: only the list matching 'destnode' is used */
+	else if (*destnode == tipc_own_addr) {
+		publ = sseq->node_list;
+		if (publ) {
+			sseq->node_list = publ->node_list_next;
+			goto found;
+		}
+	} else if (in_own_cluster(*destnode)) {
+		publ = sseq->cluster_list;
+		if (publ) {
+			sseq->cluster_list = publ->cluster_list_next;
+			goto found;
+		}
+	} else {
+		publ = sseq->zone_list;
+		if (publ) {
+			sseq->zone_list = publ->zone_list_next;
+			goto found;
+		}
+	}
+	spin_unlock_bh(&seq->lock);
+not_found:
+	read_unlock_bh(&tipc_nametbl_lock);
+	return 0;
+}
+
+/**
+ * tipc_nametbl_mc_translate - find multicast destinations
+ *
+ * Creates list of all local ports that overlap the given multicast address;
+ * also determines if any off-node ports overlap.
+ *
+ * Note: Publications with a scope narrower than 'limit' are ignored.
+ * (i.e. local node-scope publications mustn't receive messages arriving
+ * from another node, even if the multicast link brought it here)
+ *
+ * Returns non-zero if any off-node ports overlap
+ */
+
+int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit,
+			      struct port_list *dports)
+{
+	struct name_seq *seq;
+	struct sub_seq *sseq;
+	struct sub_seq *sseq_stop;
+	int res = 0;
+
+	read_lock_bh(&tipc_nametbl_lock);
+	seq = nametbl_find_seq(type);
+	if (!seq)
+		goto exit;
+
+	spin_lock_bh(&seq->lock);
+	/* Start at the entry located for 'lower'; stop once past 'upper' */
+	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
+	sseq_stop = seq->sseqs + seq->first_free;
+	for (; sseq != sseq_stop; sseq++) {
+		struct publication *publ;
+
+		if (sseq->lower > upper)
+			break;
+
+		publ = sseq->node_list;
+		if (publ) {
+			do {
+				if (publ->scope <= limit)
+					tipc_port_list_add(dports, publ->ref);
+				publ = publ->node_list_next;
+			} while (publ != sseq->node_list);
+		}
+		/* Differing sizes are taken to mean some matching port is off-node */
+		if (sseq->cluster_list_size != sseq->node_list_size)
+			res = 1;
+	}
+
+	spin_unlock_bh(&seq->lock);
+exit:
+	read_unlock_bh(&tipc_nametbl_lock);
+	return res;
+}
+
+/**
+ * tipc_nametbl_publish_rsv - publish port name using a reserved name type
+ * (rsv_publ_ok is held raised around tipc_publish(); returns its result)
+ */
+int tipc_nametbl_publish_rsv(u32 ref, unsigned int scope,
+			struct tipc_name_seq const *seq)
+{
+	int res;
+
+	atomic_inc(&rsv_publ_ok);
+	res = tipc_publish(ref, scope, seq);
+	atomic_dec(&rsv_publ_ok);
+	return res;
+}
+
+/**
+ * tipc_nametbl_publish - add local publication to name table; NULL on failure
+ */
+struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper,
+				    u32 scope, u32 port_ref, u32 key)
+{
+	struct publication *publ;
+
+	if (table.local_publ_count >= tipc_max_publications) {
+		warn("Publication failed, local publication limit reached (%u)\n",
+		     tipc_max_publications);
+		return NULL;
+	}
+	if ((type < TIPC_RESERVED_TYPES) && !atomic_read(&rsv_publ_ok)) {
+		warn("Publication failed, reserved name {%u,%u,%u}\n",
+		     type, lower, upper);
+		return NULL;
+	}
+
+	write_lock_bh(&tipc_nametbl_lock);
+	publ = tipc_nametbl_insert_publ(type, lower, upper, scope,
+				   tipc_own_addr, port_ref, key);
+	if (publ)
+		table.local_publ_count++;	/* failed inserts must not count */
+	if (publ && (scope != TIPC_NODE_SCOPE))
+		tipc_named_publish(publ);
+	write_unlock_bh(&tipc_nametbl_lock);
+	return publ;
+}
+
+/**
+ * tipc_nametbl_withdraw - withdraw name publication from network name tables
+ * Returns 1 and frees the publication if it was found, otherwise logs and 0.
+ */
+int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key)
+{
+	struct publication *publ;
+
+	write_lock_bh(&tipc_nametbl_lock);
+	publ = tipc_nametbl_remove_publ(type, lower, tipc_own_addr, ref, key);
+	if (likely(publ)) {
+		table.local_publ_count--;
+		if (publ->scope != TIPC_NODE_SCOPE)	/* same test as publish */
+			tipc_named_withdraw(publ);
+		write_unlock_bh(&tipc_nametbl_lock);
+		list_del_init(&publ->pport_list);
+		kfree(publ);
+		return 1;
+	}
+	write_unlock_bh(&tipc_nametbl_lock);
+	err("Unable to remove local publication\n"
+	    "(type=%u, lower=%u, ref=%u, key=%u)\n",
+	    type, lower, ref, key);
+	return 0;
+}
+
+/**
+ * tipc_nametbl_subscribe - add a subscription object to the name table
+ * (creates the name sequence for the type if needed; warns if that fails)
+ */
+void tipc_nametbl_subscribe(struct subscription *s)
+{
+	u32 type = s->seq.type;
+	struct name_seq *seq;
+
+	write_lock_bh(&tipc_nametbl_lock);
+	seq = nametbl_find_seq(type);
+	if (!seq)
+		seq = tipc_nameseq_create(type, &table.types[hash(type)]);
+	if (seq) {
+		spin_lock_bh(&seq->lock);
+		tipc_nameseq_subscribe(seq, s);
+		spin_unlock_bh(&seq->lock);
+	} else {
+		warn("Failed to create subscription for {%u,%u,%u}\n",
+		     s->seq.type, s->seq.lower, s->seq.upper);
+	}
+	write_unlock_bh(&tipc_nametbl_lock);
+}
+
+/**
+ * tipc_nametbl_unsubscribe - remove a subscription object from name table
+ * (a name sequence left with no sub-sequences or subscribers is freed too)
+ */
+void tipc_nametbl_unsubscribe(struct subscription *s)
+{
+	struct name_seq *seq;
+
+	write_lock_bh(&tipc_nametbl_lock);
+	seq = nametbl_find_seq(s->seq.type);
+	if (seq != NULL) {
+		spin_lock_bh(&seq->lock);
+		list_del_init(&s->nameseq_list);
+		spin_unlock_bh(&seq->lock);
+		if ((seq->first_free == 0) && list_empty(&seq->subscriptions)) {
+			hlist_del_init(&seq->ns_list);
+			kfree(seq->sseqs);
+			kfree(seq);
+		}
+	}
+	write_unlock_bh(&tipc_nametbl_lock);
+}
+
+
+/**
+ * subseq_list: print specified sub-sequence contents into the given buffer
+ * (depth 2 prints bounds only; depth > 3 adds key and scope; 'index' unused)
+ */
+static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
+			u32 index)
+{
+	char portIdStr[27];
+	const char *scope_str[] = {"", " zone", " cluster", " node"};
+	struct publication *publ = sseq->zone_list;
+
+	tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper);
+
+	if (depth == 2 || !publ) {
+		tipc_printf(buf, "\n");
+		return;
+	}
+	/* One row per publication on the circular zone list */
+	do {
+		sprintf(portIdStr, "<%u.%u.%u:%u>",
+			 tipc_zone(publ->node), tipc_cluster(publ->node),
+			 tipc_node(publ->node), publ->ref);
+		tipc_printf(buf, "%-26s ", portIdStr);
+		if (depth > 3) {
+			tipc_printf(buf, "%-10u %s", publ->key,
+				    scope_str[publ->scope]);
+		}
+
+		publ = publ->zone_list_next;
+		if (publ == sseq->zone_list)
+			break;
+		/* Further rows are indented past the type and bounds columns */
+		tipc_printf(buf, "\n%33s", " ");
+	} while (1);
+
+	tipc_printf(buf, "\n");
+}
+
+/**
+ * nameseq_list: print specified name sequence contents into the given buffer
+ * (depth 1 prints only the type; out-of-bounds sub-sequences are skipped)
+ */
+static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth,
+			 u32 type, u32 lowbound, u32 upbound, u32 index)
+{
+	struct sub_seq *sseq;
+	char typearea[11];
+
+	if (seq->first_free == 0)
+		return;
+
+	sprintf(typearea, "%-10u", seq->type);
+
+	if (depth == 1) {
+		tipc_printf(buf, "%s\n", typearea);
+		return;
+	}
+
+	for (sseq = seq->sseqs; sseq != &seq->sseqs[seq->first_free]; sseq++) {
+		if ((lowbound <= sseq->upper) && (upbound >= sseq->lower)) {
+			tipc_printf(buf, "%s ", typearea);
+			spin_lock_bh(&seq->lock);
+			subseq_list(sseq, buf, depth, index);
+			spin_unlock_bh(&seq->lock);
+			sprintf(typearea, "%10s", " ");	/* type shown once only */
+		}
+	}
+}
+
+/**
+ * nametbl_header - print name table header into the given buffer
+ * (one column title per depth level; depth is capped at 4)
+ */
+static void nametbl_header(struct print_buf *buf, u32 depth)
+{
+	static const char * const header[] = {
+		"Type       ",
+		"Lower      Upper      ",
+		"Port Identity              ",
+		"Publication Scope"
+	};
+
+	u32 i;
+
+	if (depth > 4)
+		depth = 4;
+	for (i = 0; i < depth; i++)
+		tipc_printf(buf, "%s", header[i]);	/* data is never the format */
+	tipc_printf(buf, "\n");
+}
+
+/**
+ * nametbl_list - print specified name table contents into the given buffer
+ * ('depth_info' carries the depth plus the TIPC_NTQ_ALLTYPES flag bit(s))
+ */
+static void nametbl_list(struct print_buf *buf, u32 depth_info,
+			 u32 type, u32 lowbound, u32 upbound)
+{
+	struct hlist_head *seq_head;
+	struct hlist_node *seq_node;
+	struct name_seq *seq;
+	int all_types;
+	u32 depth;
+	u32 i;
+
+	all_types = (depth_info & TIPC_NTQ_ALLTYPES);
+	depth = (depth_info & ~TIPC_NTQ_ALLTYPES);
+
+	if (depth == 0)
+		return;
+
+	if (all_types) {
+		/* display all entries in name table to specified depth */
+		nametbl_header(buf, depth);
+		lowbound = 0;
+		upbound = ~0;
+		for (i = 0; i < tipc_nametbl_size; i++) {
+			seq_head = &table.types[i];
+			hlist_for_each_entry(seq, seq_node, seq_head, ns_list) {
+				nameseq_list(seq, buf, depth, seq->type,
+					     lowbound, upbound, i);
+			}
+		}
+	} else {
+		/* display only the sequence that matches the specified type */
+		if (upbound < lowbound) {
+			tipc_printf(buf, "invalid name sequence specified\n");
+			return;
+		}
+		nametbl_header(buf, depth);
+		i = hash(type);
+		seq_head = &table.types[i];
+		hlist_for_each_entry(seq, seq_node, seq_head, ns_list) {
+			if (seq->type == type) {
+				nameseq_list(seq, buf, depth, type,
+					     lowbound, upbound, i);
+				break;
+			}
+		}
+	}
+}
+
+#define MAX_NAME_TBL_QUERY 32768
+/* tipc_nametbl_get - build a config reply holding the printed name table */
+struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
+{
+	struct sk_buff *buf;
+	struct tipc_name_table_query *argv;
+	struct tlv_desc *rep_tlv;
+	struct print_buf b;
+	int str_len;
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NAME_TBL_QUERY))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_NAME_TBL_QUERY));
+	if (!buf)
+		return NULL;
+	/* The print buffer is set up directly over the reply TLV's data area */
+	rep_tlv = (struct tlv_desc *)buf->data;
+	tipc_printbuf_init(&b, TLV_DATA(rep_tlv), MAX_NAME_TBL_QUERY);
+	argv = (struct tipc_name_table_query *)TLV_DATA(req_tlv_area);
+	read_lock_bh(&tipc_nametbl_lock);
+	nametbl_list(&b, ntohl(argv->depth), ntohl(argv->type),
+		     ntohl(argv->lowbound), ntohl(argv->upbound));
+	read_unlock_bh(&tipc_nametbl_lock);
+	str_len = tipc_printbuf_validate(&b);
+	/* Size the reply from the validated length and tag it as a string TLV */
+	skb_put(buf, TLV_SPACE(str_len));
+	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
+
+	return buf;
+}
+/* tipc_nametbl_init - allocate the zeroed hash table; returns 0 or -ENOMEM */
+int tipc_nametbl_init(void)
+{
+	table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head),
+			      GFP_ATOMIC);
+	if (!table.types)
+		return -ENOMEM;
+
+	table.local_publ_count = 0;
+	return 0;
+}
+/* tipc_nametbl_stop - release the hash table (no-op if never allocated) */
+void tipc_nametbl_stop(void)
+{
+	u32 i;
+
+	if (!table.types)
+		return;
+
+	/* Verify name table is empty, then release it */
+	/* (a non-empty chain is only logged; its entries are not freed here) */
+	write_lock_bh(&tipc_nametbl_lock);
+	for (i = 0; i < tipc_nametbl_size; i++) {
+		if (!hlist_empty(&table.types[i]))
+			err("tipc_nametbl_stop(): hash chain %u is non-null\n", i);
+	}
+	kfree(table.types);
+	table.types = NULL;
+	write_unlock_bh(&tipc_nametbl_lock);
+}
+
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
new file mode 100644
index 00000000..d228bd68
--- /dev/null
+++ b/net/tipc/name_table.h
@@ -0,0 +1,108 @@
+/*
+ * net/tipc/name_table.h: Include file for TIPC name table code
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2004-2005, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NAME_TABLE_H
+#define _TIPC_NAME_TABLE_H
+
+#include "node_subscr.h"
+
+struct subscription;
+struct port_list;
+
+/*
+ * TIPC name types reserved for internal TIPC use (both current and planned)
+ */
+
+#define TIPC_ZM_SRV 3		/* zone master service name type */
+
+
+/**
+ * struct publication - info about a published (name or) name sequence
+ * @type: name sequence type
+ * @lower: name sequence lower bound
+ * @upper: name sequence upper bound
+ * @scope: scope of publication
+ * @node: network address of publishing port's node
+ * @ref: publishing port
+ * @key: publication key
+ * @subscr: subscription to "node down" event (for off-node publications only)
+ * @local_list: adjacent entries in list of publications made by this node
+ * @pport_list: adjacent entries in list of publications made by this port
+ * @node_list_next: next matching name seq publication with >= node scope
+ * @cluster_list_next: next matching name seq publication with >= cluster scope
+ * @zone_list_next: next matching name seq publication with >= zone scope
+ *
+ * Note that the node list, cluster list, and zone list are circular lists.
+ */
+
+struct publication {
+	u32 type;
+	u32 lower;
+	u32 upper;
+	u32 scope;
+	u32 node;
+	u32 ref;
+	u32 key;
+	struct tipc_node_subscr subscr;
+	struct list_head local_list;
+	struct list_head pport_list;
+	struct publication *node_list_next;
+	struct publication *cluster_list_next;
+	struct publication *zone_list_next;
+};
+
+
+extern rwlock_t tipc_nametbl_lock;
+
+struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space);
+u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *node);
+int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit,
+			 struct port_list *dports);
+int tipc_nametbl_publish_rsv(u32 ref, unsigned int scope,
+			struct tipc_name_seq const *seq);
+struct publication *tipc_nametbl_publish(u32 type, u32 lower, u32 upper,
+				    u32 scope, u32 port_ref, u32 key);
+int tipc_nametbl_withdraw(u32 type, u32 lower, u32 ref, u32 key);
+struct publication *tipc_nametbl_insert_publ(u32 type, u32 lower, u32 upper,
+					u32 scope, u32 node, u32 ref, u32 key);
+struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
+					u32 node, u32 ref, u32 key);
+void tipc_nametbl_subscribe(struct subscription *s);
+void tipc_nametbl_unsubscribe(struct subscription *s);
+int tipc_nametbl_init(void);
+void tipc_nametbl_stop(void);
+
+#endif
diff --git a/net/tipc/net.c b/net/tipc/net.c
new file mode 100644
index 00000000..68b3dd63
--- /dev/null
+++ b/net/tipc/net.c
@@ -0,0 +1,228 @@
+/*
+ * net/tipc/net.c: TIPC network routing code
+ *
+ * Copyright (c) 1995-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "net.h"
+#include "name_distr.h"
+#include "subscr.h"
+#include "port.h"
+#include "node.h"
+#include "config.h"
+
+/*
+ * The TIPC locking policy is designed to ensure a very fine locking
+ * granularity, permitting complete parallel access to individual
+ * port and node/link instances. The code consists of three major
+ * locking domains, each protected with their own disjunct set of locks.
+ *
+ * 1: The routing hierarchy.
+ *    Comprises the structures 'zone', 'cluster', 'node', 'link'
+ *    and 'bearer'. The whole hierarchy is protected by a big
+ *    read/write lock, tipc_net_lock, to ensure that nothing is added
+ *    or removed while code is accessing any of these structures.
+ *    This layer must not be called from the two others while they
+ *    hold any of their own locks.
+ *    Neither must it itself do any upcalls to the other two before
+ *    it has released tipc_net_lock and other protective locks.
+ *
+ *   Within the tipc_net_lock domain there are two sub-domains;'node' and
+ *   'bearer', where local write operations are permitted,
+ *   provided that those are protected by individual spin_locks
+ *   per instance. Code holding tipc_net_lock(read) and a node spin_lock
+ *   is permitted to poke around in both the node itself and its
+ *   subordinate links. I.e, it can update link counters and queues,
+ *   change link state, send protocol messages, and alter the
+ *   "active_links" array in the node; but it can _not_ remove a link
+ *   or a node from the overall structure.
+ *   Correspondingly, individual bearers may change status within a
+ *   tipc_net_lock(read), protected by an individual spin_lock per bearer
+ *   instance, but it needs tipc_net_lock(write) to remove/add any bearers.
+ *
+ *
+ *  2: The transport level of the protocol.
+ *     This consists of the structures port, (and its user level
+ *     representations, such as user_port and tipc_sock), reference and
+ *     tipc_user (port.c, reg.c, socket.c).
+ *
+ *     This layer has four different locks:
+ *     - The tipc_port spin_lock. This is protecting each port instance
+ *       from parallel data access and removal. Since we can not place
+ *       this lock in the port itself, it has been placed in the
+ *       corresponding reference table entry, which has the same life
+ *       cycle as the module. This entry is difficult to access from
+ *       outside the TIPC core, however, so a pointer to the lock has
+ *       been added in the port instance, -to be used for unlocking
+ *       only.
+ *     - A read/write lock to protect the reference table itself (ref.c).
+ *       (Nobody is using read-only access to this, so it can just as
+ *       well be changed to a spin_lock)
+ *     - A spin lock to protect the registry of kernel/driver users (reg.c)
+ *     - A global spin_lock (tipc_port_lock), whose only task is to ensure
+ *       consistency where more than one port is involved in an operation,
+ *       i.e., when a port is part of a linked list of ports.
+ *       There are two such lists; 'port_list', which is used for management,
+ *       and 'wait_list', which is used to queue ports during congestion.
+ *
+ *  3: The name table (name_table.c, name_distr.c, subscription.c)
+ *     - There is one big read/write-lock (tipc_nametbl_lock) protecting the
+ *       overall name table structure. Nothing must be added/removed to
+ *       this structure without holding write access to it.
+ *     - There is one local spin_lock per name sequence, which can be seen
+ *       as a sub-domain to the tipc_nametbl_lock domain. It is used only
+ *       for translation operations, and is needed because a translation
+ *       steps the root of the 'publication' linked list between each lookup.
+ *       This is always used within the scope of a tipc_nametbl_lock(read).
+ *     - A local spin_lock protecting the queue of subscriber events.
+*/
+
+DEFINE_RWLOCK(tipc_net_lock);
+/* Resolve a named data message to a port and route it; reject if no match */
+static void net_route_named_msg(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 dnode;
+	u32 dport;
+
+	if (!msg_named(msg)) {
+		buf_discard(buf);
+		return;
+	}
+	/* 'dnode' goes in as the lookup domain and returns the port's node */
+	dnode = addr_domain(msg_lookup_scope(msg));
+	dport = tipc_nametbl_translate(msg_nametype(msg), msg_nameinst(msg), &dnode);
+	if (dport) {
+		msg_set_destnode(msg, dnode);
+		msg_set_destport(msg, dport);
+		tipc_net_route_msg(buf);
+		return;
+	}
+	tipc_reject_msg(buf, TIPC_ERR_NO_NAME);
+}
+/* Deliver 'buf' on this node or send it on to its destination node */
+void tipc_net_route_msg(struct sk_buff *buf)
+{
+	struct tipc_msg *msg;
+	u32 dnode;
+
+	if (!buf)
+		return;
+	msg = buf_msg(buf);
+	/* Bound re-routing: past 6 passes, drop an error message, else reject */
+	msg_incr_reroute_cnt(msg);
+	if (msg_reroute_cnt(msg) > 6) {
+		if (msg_errcode(msg)) {
+			buf_discard(buf);
+		} else {
+			tipc_reject_msg(buf, msg_destport(msg) ?
+					TIPC_ERR_NO_PORT : TIPC_ERR_NO_NAME);
+		}
+		return;
+	}
+
+	/* Handle message for this node */
+	dnode = msg_short(msg) ? tipc_own_addr : msg_destnode(msg);
+	if (tipc_in_scope(dnode, tipc_own_addr)) {
+		if (msg_isdata(msg)) {
+			if (msg_mcast(msg))
+				tipc_port_recv_mcast(buf, NULL);
+			else if (msg_destport(msg))
+				tipc_port_recv_msg(buf);
+			else	/* no destination port: resolve by name */
+				net_route_named_msg(buf);
+			return;
+		}
+		switch (msg_user(msg)) {
+		case NAME_DISTRIBUTOR:
+			tipc_named_recv(buf);
+			break;
+		case CONN_MANAGER:
+			tipc_port_recv_proto_msg(buf);
+			break;
+		default:
+			buf_discard(buf);
+		}
+		return;
+	}
+
+	/* Handle message for another node */
+	skb_trim(buf, msg_size(msg));
+	tipc_link_send(buf, dnode, msg_link_selector(msg));
+}
+/* Enter network mode as 'addr'; 0, -ENOPROTOOPT, or tipc_bclink_init() error */
+int tipc_net_start(u32 addr)
+{
+	char addr_string[16];
+	int res;
+
+	if (tipc_mode != TIPC_NODE_MODE)
+		return -ENOPROTOOPT;
+
+	tipc_subscr_stop();
+	tipc_cfg_stop();
+
+	tipc_own_addr = addr;
+	tipc_mode = TIPC_NET_MODE;
+	tipc_named_reinit();
+	tipc_port_reinit();
+	/* NOTE(review): a failure here returns with the mode already switched */
+	res = tipc_bclink_init();
+	if (res)
+		return res;
+	/* Start subscr and cfg again, through tipc_k_signal() */
+	tipc_k_signal((Handler)tipc_subscr_start, 0);
+	tipc_k_signal((Handler)tipc_cfg_init, 0);
+
+	info("Started in network mode\n");
+	info("Own node address %s, network identity %u\n",
+	     tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id);
+	return 0;
+}
+/* Leave network mode (no-op unless in it): stop bearers, delete all nodes */
+void tipc_net_stop(void)
+{
+	struct tipc_node *node, *t_node;
+
+	if (tipc_mode != TIPC_NET_MODE)
+		return;
+	write_lock_bh(&tipc_net_lock);
+	tipc_bearer_stop();
+	tipc_mode = TIPC_NODE_MODE;
+	tipc_bclink_stop();
+	list_for_each_entry_safe(node, t_node, &tipc_node_list, list)
+		tipc_node_delete(node);	/* hence the _safe iterator */
+	write_unlock_bh(&tipc_net_lock);
+	info("Left network mode\n");
+}
diff --git a/net/tipc/net.h b/net/tipc/net.h
new file mode 100644
index 00000000..9eb4b9e2
--- /dev/null
+++ b/net/tipc/net.h
@@ -0,0 +1,47 @@
+/*
+ * net/tipc/net.h: Include file for TIPC network routing code
+ *
+ * Copyright (c) 1995-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NET_H
+#define _TIPC_NET_H
+
+extern rwlock_t tipc_net_lock;
+
+void tipc_net_route_msg(struct sk_buff *buf);
+
+int tipc_net_start(u32 addr);
+void tipc_net_stop(void);
+
+#endif
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
new file mode 100644
index 00000000..7bda8e3d
--- /dev/null
+++ b/net/tipc/netlink.c
@@ -0,0 +1,108 @@
+/*
+ * net/tipc/netlink.c: TIPC configuration handling
+ *
+ * Copyright (c) 2005-2006, Ericsson AB
+ * Copyright (c) 2005-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include <net/genetlink.h>
+
+/* handle_cmd - run one TIPC configuration command received over genetlink */
+static int handle_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	const int hdr_space = NLMSG_SPACE(GENL_HDRLEN + TIPC_GENL_HDRLEN);
+	struct tipc_genlmsghdr *tipc_hdr = info->userhdr;
+	struct nlmsghdr *request = info->nlhdr;
+	struct nlmsghdr *reply_hdr;
+	struct sk_buff *reply;
+	u16 cmd = tipc_hdr->cmd;
+
+	/* Commands with either of the top two bits set need CAP_NET_ADMIN */
+	if ((cmd & 0xC000) && !capable(CAP_NET_ADMIN))
+		cmd = TIPC_CMD_NOT_NET_ADMIN;
+
+	reply = tipc_cfg_do_cmd(tipc_hdr->dest, cmd,
+			NLMSG_DATA(request) + GENL_HDRLEN + TIPC_GENL_HDRLEN,
+			NLMSG_PAYLOAD(request, GENL_HDRLEN + TIPC_GENL_HDRLEN),
+			hdr_space);
+	if (!reply)
+		return 0;
+
+	/* The reply starts with a copy of the request's headers */
+	skb_push(reply, hdr_space);
+	reply_hdr = nlmsg_hdr(reply);
+	memcpy(reply_hdr, request, hdr_space);
+	reply_hdr->nlmsg_len = reply->len;
+	genlmsg_unicast(&init_net, reply, NETLINK_CB(skb).pid);
+	return 0;
+}
+
+static struct genl_family tipc_genl_family = {	/* registered by tipc_netlink_start() */
+	.id		= GENL_ID_GENERATE,
+	.name		= TIPC_GENL_NAME,
+	.version	= TIPC_GENL_VERSION,
+	.hdrsize	= TIPC_GENL_HDRLEN,	/* user header that handle_cmd() reads via info->userhdr */
+	.maxattr	= 0,
+};
+
+static struct genl_ops tipc_genl_ops = {
+	.cmd		= TIPC_GENL_CMD,
+	.doit		= handle_cmd,	/* the only operation registered for the family */
+};
+
+static int tipc_genl_family_registered;	/* set by tipc_netlink_start() on success */
+
+/* tipc_netlink_start - register the TIPC genetlink family and its one command */
+int tipc_netlink_start(void)
+{
+	int res = genl_register_family_with_ops(&tipc_genl_family,
+						&tipc_genl_ops, 1);
+
+	if (!res) {
+		/* Remembered so that tipc_netlink_stop() knows to unregister */
+		tipc_genl_family_registered = 1;
+		return 0;
+	}
+	err("Failed to register netlink interface\n");
+	return res;
+}
+
+/* tipc_netlink_stop - unregister the genetlink family, if it was registered */
+void tipc_netlink_stop(void)
+{
+	if (tipc_genl_family_registered) {
+		genl_unregister_family(&tipc_genl_family);
+		tipc_genl_family_registered = 0;
+	}
+}
diff --git a/net/tipc/node.c b/net/tipc/node.c
new file mode 100644
index 00000000..2d106ef4
--- /dev/null
+++ b/net/tipc/node.c
@@ -0,0 +1,492 @@
+/*
+ * net/tipc/node.c: TIPC node management routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "node.h"
+#include "name_distr.h"
+
+static void node_lost_contact(struct tipc_node *n_ptr);
+static void node_established_contact(struct tipc_node *n_ptr);
+
+static DEFINE_SPINLOCK(node_create_lock);	/* serializes tipc_node_create() */
+
+static struct hlist_head node_htable[NODE_HTABLE_SIZE];	/* bucket = tipc_hashfn(addr) */
+LIST_HEAD(tipc_node_list);	/* all nodes, inserted in ascending address order */
+static u32 tipc_num_nodes;	/* nodes created minus nodes deleted */
+
+static atomic_t tipc_num_links = ATOMIC_INIT(0);	/* links currently attached to nodes */
+u32 tipc_own_tag;	/* contacted b'cast-capable nodes with address below tipc_own_addr */
+
+/**
+ * tipc_node_find - look up the node object for a network address
+ *
+ * Returns NULL if in_own_cluster() rejects @addr or no matching node exists.
+ */
+struct tipc_node *tipc_node_find(u32 addr)
+{
+	struct hlist_head *bucket = &node_htable[tipc_hashfn(addr)];
+	struct tipc_node *n_ptr;
+	struct hlist_node *cursor;
+
+	if (unlikely(!in_own_cluster(addr)))
+		return NULL;
+	hlist_for_each_entry(n_ptr, cursor, bucket, hash)
+		if (n_ptr->addr == addr)
+			return n_ptr;
+	return NULL;
+}
+
+/**
+ * tipc_node_create - create neighboring node
+ *
+ * Currently, this routine is called by neighbor discovery code, which holds
+ * net_lock for reading only.  We must take node_create_lock to ensure a node
+ * isn't created twice if two different bearers discover the node at the same
+ * time.  (It would be preferable to switch to holding net_lock in write mode,
+ * but this is a non-trivial change.)
+ */
+
+struct tipc_node *tipc_node_create(u32 addr)
+{
+	struct tipc_node *n_ptr;
+	struct tipc_node *successor;
+
+	spin_lock_bh(&node_create_lock);
+
+	/* Another bearer may already have created this node */
+	n_ptr = tipc_node_find(addr);
+	if (n_ptr)
+		goto unlock;
+
+	n_ptr = kzalloc(sizeof(*n_ptr), GFP_ATOMIC);
+	if (!n_ptr) {
+		spin_unlock_bh(&node_create_lock);
+		warn("Node creation failed, no memory\n");
+		return NULL;
+	}
+
+	spin_lock_init(&n_ptr->lock);
+	INIT_HLIST_NODE(&n_ptr->hash);
+	INIT_LIST_HEAD(&n_ptr->list);
+	INIT_LIST_HEAD(&n_ptr->nsub);
+	n_ptr->addr = addr;
+
+	hlist_add_head(&n_ptr->hash, &node_htable[tipc_hashfn(addr)]);
+
+	/* Insert before the first node with a higher address (or at the end) */
+	list_for_each_entry(successor, &tipc_node_list, list)
+		if (addr < successor->addr)
+			break;
+	list_add_tail(&n_ptr->list, &successor->list);
+	tipc_num_nodes++;
+
+unlock:
+	spin_unlock_bh(&node_create_lock);
+	return n_ptr;
+}
+
+/* tipc_node_delete - unlink a node from the hash table and node list, then free it */
+void tipc_node_delete(struct tipc_node *n_ptr)
+{
+	hlist_del(&n_ptr->hash);
+	list_del(&n_ptr->list);
+	tipc_num_nodes--;
+	kfree(n_ptr);
+}
+
+
+/**
+ * tipc_node_link_up - handle addition of link
+ *
+ * Link becomes active (alone or shared) or standby, depending on its priority.
+ */
+
+void tipc_node_link_up(struct tipc_node *n_ptr, struct link *l_ptr)
+{
+	struct link **active = &n_ptr->active_links[0];
+
+	n_ptr->working_links++;
+
+	info("Established link <%s> on network plane %c\n",
+	     l_ptr->name, l_ptr->b_ptr->net_plane);
+
+	if (!active[0]) {	/* no active link yet: new link fills both slots */
+		active[0] = active[1] = l_ptr;
+		node_established_contact(n_ptr);
+		return;
+	}
+	if (l_ptr->priority < active[0]->priority) {	/* lower priority: active slots untouched */
+		info("New link <%s> becomes standby\n", l_ptr->name);
+		return;
+	}
+	tipc_link_send_duplicate(active[0], l_ptr);
+	if (l_ptr->priority == active[0]->priority) {	/* equal priority: takes slot 0 only */
+		active[0] = l_ptr;
+		return;
+	}
+	info("Old link <%s> becomes standby\n", active[0]->name);
+	if (active[1] != active[0])	/* two distinct active links: report the second too */
+		info("Old link <%s> becomes standby\n", active[1]->name);
+	active[0] = active[1] = l_ptr;	/* higher priority: new link takes both slots */
+}
+
+/**
+ * node_select_active_links - select active link
+ */
+
+static void node_select_active_links(struct tipc_node *n_ptr)
+{
+	u32 best_prio = 0;
+	u32 bearer;
+
+	n_ptr->active_links[0] = NULL;
+	n_ptr->active_links[1] = NULL;
+
+	for (bearer = 0; bearer < MAX_BEARERS; bearer++) {
+		struct link *candidate = n_ptr->links[bearer];
+
+		if (!candidate || !tipc_link_is_up(candidate))
+			continue;
+		if (candidate->priority < best_prio)
+			continue;
+		/* A link of equal priority only takes over the second slot */
+		if (candidate->priority > best_prio) {
+			best_prio = candidate->priority;
+			n_ptr->active_links[0] = candidate;
+		}
+		n_ptr->active_links[1] = candidate;
+	}
+}
+
+/**
+ * tipc_node_link_down - handle loss of link
+ */
+
+void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr)
+{
+	struct link **active;
+
+	n_ptr->working_links--;
+
+	if (!tipc_link_is_active(l_ptr)) {	/* standby link: active slots are left as they are */
+		info("Lost standby link <%s> on network plane %c\n",
+		     l_ptr->name, l_ptr->b_ptr->net_plane);
+		return;
+	}
+	info("Lost link <%s> on network plane %c\n",
+		l_ptr->name, l_ptr->b_ptr->net_plane);
+
+	active = &n_ptr->active_links[0];
+	if (active[0] == l_ptr)		/* replace l_ptr in slot 0 with slot 1's link */
+		active[0] = active[1];
+	if (active[1] == l_ptr)		/* replace l_ptr in slot 1 with slot 0's link */
+		active[1] = active[0];
+	if (active[0] == l_ptr)		/* l_ptr held both slots: rescan all links */
+		node_select_active_links(n_ptr);
+	if (tipc_node_is_up(n_ptr))
+		tipc_link_changeover(l_ptr);
+	else
+		node_lost_contact(n_ptr);	/* no active link remains */
+}
+
+int tipc_node_active_links(struct tipc_node *n_ptr)
+{
+	return n_ptr->active_links[0] != NULL;	/* 1 if slot 0 holds a link, else 0 */
+}
+
+int tipc_node_redundant_links(struct tipc_node *n_ptr)
+{
+	return n_ptr->working_links > 1;	/* 1 if more than one link is working */
+}
+
+int tipc_node_is_up(struct tipc_node *n_ptr)
+{
+	return tipc_node_active_links(n_ptr);	/* node is up iff it has an active link */
+}
+
+void tipc_node_attach_link(struct tipc_node *n_ptr, struct link *l_ptr)
+{
+	n_ptr->links[l_ptr->b_ptr->identity] = l_ptr;	/* slot is the bearer's identity */
+	atomic_inc(&tipc_num_links);	/* global total, read by tipc_node_get_links() */
+	n_ptr->link_cnt++;	/* per-node total */
+}
+
+void tipc_node_detach_link(struct tipc_node *n_ptr, struct link *l_ptr)
+{
+	n_ptr->links[l_ptr->b_ptr->identity] = NULL;	/* clear the bearer's slot */
+	atomic_dec(&tipc_num_links);	/* global total, read by tipc_node_get_links() */
+	n_ptr->link_cnt--;	/* per-node total */
+}
+
+/*
+ * Routing table management - five cases to handle:
+ *
+ * 1: A link towards a zone/cluster external node comes up.
+ *    => Send a multicast message updating routing tables of all
+ *    system nodes within own cluster that the new destination
+ *    can be reached via this node.
+ *    (node.establishedContact()=>cluster.multicastNewRoute())
+ *
+ * 2: A link towards a slave node comes up.
+ *    => Send a multicast message updating routing tables of all
+ *    system nodes within own cluster that the new destination
+ *    can be reached via this node.
+ *    (node.establishedContact()=>cluster.multicastNewRoute())
+ *    => Send a message to the slave node about existence
+ *    of all system nodes within cluster:
+ *    (node.establishedContact()=>cluster.sendLocalRoutes())
+ *
+ * 3: A new cluster local system node becomes available.
+ *    => Send message(s) to this particular node containing
+ *    information about all cluster external and slave
+ *     nodes which can be reached via this node.
+ *    (node.establishedContact()==>network.sendExternalRoutes())
+ *    (node.establishedContact()==>network.sendSlaveRoutes())
+ *    => Send messages to all directly connected slave nodes
+ *    containing information about the existence of the new node
+ *    (node.establishedContact()=>cluster.multicastNewRoute())
+ *
+ * 4: The link towards a zone/cluster external node or slave
+ *    node goes down.
+ *    => Send a multicast message updating routing tables of all
+ *    nodes within cluster that the new destination can not any
+ *    longer be reached via this node.
+ *    (node.lostAllLinks()=>cluster.bcastLostRoute())
+ *
+ * 5: A cluster local system node becomes unavailable.
+ *    => Remove all references to this node from the local
+ *    routing tables. Note: This is a completely node
+ *    local operation.
+ *    (node.lostAllLinks()=>network.removeAsRouter())
+ *    => Send messages to all directly connected slave nodes
+ *    containing information about loss of the node
+ *    (node.establishedContact()=>cluster.multicastLostRoute())
+ *
+ */
+
+static void node_established_contact(struct tipc_node *n_ptr)
+{
+	tipc_k_signal((Handler)tipc_named_node_up, n_ptr->addr);
+
+	/* Synchronize broadcast acks */
+	n_ptr->bclink.acked = tipc_bclink_get_last_sent();
+
+	if (!n_ptr->bclink.supported)
+		return;
+	tipc_nmap_add(&tipc_bcast_nmap, n_ptr->addr);
+	if (n_ptr->addr < tipc_own_addr)
+		tipc_own_tag++;
+}
+
+static void node_cleanup_finished(unsigned long node_addr)
+{
+	struct tipc_node *n_ptr;
+
+	read_lock_bh(&tipc_net_lock);
+	n_ptr = tipc_node_find(node_addr);	/* may be NULL, e.g. if the node is gone */
+	if (n_ptr) {
+		tipc_node_lock(n_ptr);
+		n_ptr->cleanup_required = 0;	/* clears the flag set by node_lost_contact() */
+		tipc_node_unlock(n_ptr);
+	}
+	read_unlock_bh(&tipc_net_lock);
+}
+
+/* node_lost_contact - called by tipc_node_link_down() once no active link remains */
+static void node_lost_contact(struct tipc_node *n_ptr)
+{
+	char addr_string[16];
+	u32 i;
+
+	/* Clean up broadcast reception remains */
+	n_ptr->bclink.gap_after = n_ptr->bclink.gap_to = 0;
+	while (n_ptr->bclink.deferred_head) {	/* free the deferred buffer chain */
+		struct sk_buff *buf = n_ptr->bclink.deferred_head;
+		n_ptr->bclink.deferred_head = buf->next;
+		buf_discard(buf);
+	}
+	if (n_ptr->bclink.defragm) {
+		buf_discard(n_ptr->bclink.defragm);
+		n_ptr->bclink.defragm = NULL;
+	}
+
+	if (n_ptr->bclink.supported) {	/* reverse the nmap/tag updates of node_established_contact() */
+		tipc_bclink_acknowledge(n_ptr,
+					mod(n_ptr->bclink.acked + 10000));
+		tipc_nmap_remove(&tipc_bcast_nmap, n_ptr->addr);
+		if (n_ptr->addr < tipc_own_addr)
+			tipc_own_tag--;
+	}
+
+	info("Lost contact with %s\n",
+	     tipc_addr_string_fill(addr_string, n_ptr->addr));
+
+	/* Abort link changeover */
+	for (i = 0; i < MAX_BEARERS; i++) {
+		struct link *l_ptr = n_ptr->links[i];
+		if (!l_ptr)
+			continue;
+		l_ptr->reset_checkpoint = l_ptr->next_in_no;
+		l_ptr->exp_msg_count = 0;
+		tipc_link_reset_fragments(l_ptr);
+	}
+
+	/* Notify subscribers */
+	tipc_nodesub_notify(n_ptr);
+
+	/* Prevent re-contact with node until all cleanup is done */
+	n_ptr->cleanup_required = 1;	/* node_cleanup_finished() resets this to 0 */
+	tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr);
+}
+
+/* tipc_node_get_nodes - build a config reply with one NODE_INFO TLV per in-scope node */
+struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 domain;
+	struct sk_buff *buf;
+	struct tipc_node *n_ptr;
+	struct tipc_node_info node_info;
+	u32 payload_size;
+
+	/* The request must carry a TIPC_TLV_NET_ADDR TLV holding the domain */
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	domain = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (!tipc_addr_domain_valid(domain))
+		return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+						   " (network address)");
+
+	read_lock_bh(&tipc_net_lock);
+	if (!tipc_num_nodes) {
+		read_unlock_bh(&tipc_net_lock);
+		return tipc_cfg_reply_none();
+	}
+
+	/* For now, get space for all other nodes */
+	payload_size = TLV_SPACE(sizeof(node_info)) * tipc_num_nodes;
+	if (payload_size > 32768u) {
+		read_unlock_bh(&tipc_net_lock);
+		return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+						   " (too many nodes)");
+	}
+	buf = tipc_cfg_reply_alloc(payload_size);
+	if (!buf) {
+		read_unlock_bh(&tipc_net_lock);
+		return NULL;
+	}
+
+	/* Add TLVs for all nodes in scope */
+	list_for_each_entry(n_ptr, &tipc_node_list, list) {
+		if (!tipc_in_scope(domain, n_ptr->addr))
+			continue;
+		node_info.addr = htonl(n_ptr->addr);
+		node_info.up = htonl(tipc_node_is_up(n_ptr));
+		tipc_cfg_append_tlv(buf, TIPC_TLV_NODE_INFO,
+				    &node_info, sizeof(node_info));
+	}
+
+	read_unlock_bh(&tipc_net_lock);
+	return buf;
+}
+
+/* tipc_node_get_links - build a config reply listing links to in-scope nodes */
+struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
+{
+	u32 domain;
+	struct sk_buff *buf;
+	struct tipc_node *n_ptr;
+	u32 payload_size;
+	/* Zero-filled: the whole struct is handed to tipc_cfg_append_tlv() */
+	struct tipc_link_info link_info = { 0 };
+
+	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_NET_ADDR))
+		return tipc_cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
+
+	domain = ntohl(*(__be32 *)TLV_DATA(req_tlv_area));
+	if (!tipc_addr_domain_valid(domain))
+		return tipc_cfg_reply_error_string(TIPC_CFG_INVALID_VALUE
+						   " (network address)");
+
+	if (tipc_mode != TIPC_NET_MODE)
+		return tipc_cfg_reply_none();
+
+	read_lock_bh(&tipc_net_lock);
+
+	/* Get space for all unicast links + multicast link */
+	payload_size = TLV_SPACE(sizeof(link_info)) *
+		(atomic_read(&tipc_num_links) + 1);
+	if (payload_size > 32768u) {
+		read_unlock_bh(&tipc_net_lock);
+		return tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED
+						   " (too many links)");
+	}
+	buf = tipc_cfg_reply_alloc(payload_size);
+	if (!buf) {
+		read_unlock_bh(&tipc_net_lock);
+		return NULL;
+	}
+
+	/* Add TLV for broadcast link */
+	link_info.dest = htonl(tipc_cluster_mask(tipc_own_addr));
+	link_info.up = htonl(1);
+	strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME);
+	tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info));
+
+	/* Add TLVs for any other links in scope */
+	list_for_each_entry(n_ptr, &tipc_node_list, list) {
+		u32 i;
+
+		if (!tipc_in_scope(domain, n_ptr->addr))
+			continue;
+		tipc_node_lock(n_ptr);
+		for (i = 0; i < MAX_BEARERS; i++) {
+			if (!n_ptr->links[i])
+				continue;
+			link_info.dest = htonl(n_ptr->addr);
+			link_info.up = htonl(tipc_link_is_up(n_ptr->links[i]));
+			strlcpy(link_info.str, n_ptr->links[i]->name,
+				TIPC_MAX_LINK_NAME);
+			tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO,
+					    &link_info, sizeof(link_info));
+		}
+		tipc_node_unlock(n_ptr);
+	}
+
+	read_unlock_bh(&tipc_net_lock);
+	return buf;
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
new file mode 100644
index 00000000..5c61afc7
--- /dev/null
+++ b/net/tipc/node.h
@@ -0,0 +1,134 @@
+/*
+ * net/tipc/node.h: Include file for TIPC node management routines
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NODE_H
+#define _TIPC_NODE_H
+
+#include "node_subscr.h"
+#include "addr.h"
+#include "net.h"
+#include "bearer.h"
+
+/**
+ * struct tipc_node - TIPC node structure
+ * @addr: network address of node
+ * @lock: spinlock governing access to structure
+ * @hash: links to adjacent nodes in unsorted hash chain
+ * @list: links to adjacent nodes in sorted list of cluster's nodes
+ * @nsub: list of "node down" subscriptions monitoring node
+ * @active_links: pointers to active links to node
+ * @links: pointers to all links to node
+ * @working_links: number of working links to node (both active and standby)
+ * @cleanup_required: non-zero if cleaning up after a prior loss of contact
+ * @link_cnt: number of links to node
+ * @permit_changeover: non-zero if node has redundant links to this system
+ * @bclink: broadcast-related info
+ *    @supported: non-zero if node supports TIPC b'cast capability
+ *    @acked: sequence # of last outbound b'cast message acknowledged by node
+ *    @last_in: sequence # of last in-sequence b'cast message received from node
+ *    @gap_after: sequence # of last message not requiring a NAK request
+ *    @gap_to: sequence # of last message requiring a NAK request
+ *    @nack_sync: counter that determines when NAK requests should be sent
+ *    @deferred_head: oldest OOS b'cast message received from node
+ *    @deferred_tail: newest OOS b'cast message received from node
+ *    @defragm: list of partially reassembled b'cast message fragments from node
+ */
+
+struct tipc_node {
+	u32 addr;
+	spinlock_t lock;	/* taken via tipc_node_lock()/tipc_node_unlock() */
+	struct hlist_node hash;
+	struct list_head list;
+	struct list_head nsub;
+	struct link *active_links[2];	/* slot 0 is NULL when the node has no active link */
+	struct link *links[MAX_BEARERS];	/* indexed by bearer identity */
+	int link_cnt;
+	int working_links;
+	int cleanup_required;
+	int permit_changeover;
+	struct {
+		int supported;
+		u32 acked;
+		u32 last_in;
+		u32 gap_after;
+		u32 gap_to;
+		u32 nack_sync;
+		struct sk_buff *deferred_head;
+		struct sk_buff *deferred_tail;
+		struct sk_buff *defragm;
+	} bclink;
+};
+
+#define NODE_HTABLE_SIZE 512
+extern struct list_head tipc_node_list;
+
+/*
+ * A trivial power-of-two bitmask technique is used for speed, since this
+ * operation is done for every incoming TIPC packet. The number of hash table
+ * entries has been chosen so that no hash chain exceeds 8 nodes and will
+ * usually be much smaller (typically only a single node).
+ */
+static inline unsigned int tipc_hashfn(u32 addr)
+{
+	return addr & (NODE_HTABLE_SIZE - 1);	/* mask requires a power-of-two table size */
+}
+
+extern u32 tipc_own_tag;
+
+struct tipc_node *tipc_node_find(u32 addr);
+struct tipc_node *tipc_node_create(u32 addr);
+void tipc_node_delete(struct tipc_node *n_ptr);
+void tipc_node_attach_link(struct tipc_node *n_ptr, struct link *l_ptr);
+void tipc_node_detach_link(struct tipc_node *n_ptr, struct link *l_ptr);
+void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr);
+void tipc_node_link_up(struct tipc_node *n_ptr, struct link *l_ptr);
+int tipc_node_active_links(struct tipc_node *n_ptr);
+int tipc_node_redundant_links(struct tipc_node *n_ptr);
+int tipc_node_is_up(struct tipc_node *n_ptr);
+struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space);
+struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space);
+
+static inline void tipc_node_lock(struct tipc_node *n_ptr)
+{
+	spin_lock_bh(&n_ptr->lock);	/* released by tipc_node_unlock() */
+}
+
+static inline void tipc_node_unlock(struct tipc_node *n_ptr)
+{
+	spin_unlock_bh(&n_ptr->lock);	/* counterpart of tipc_node_lock() */
+}
+
+#endif
diff --git a/net/tipc/node_subscr.c b/net/tipc/node_subscr.c
new file mode 100644
index 00000000..c3c2815a
--- /dev/null
+++ b/net/tipc/node_subscr.c
@@ -0,0 +1,97 @@
+/*
+ * net/tipc/node_subscr.c: TIPC "node down" subscription handling
+ *
+ * Copyright (c) 1995-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "node_subscr.h"
+#include "node.h"
+
+/**
+ * tipc_nodesub_subscribe - create "node down" subscription for specified node
+ */
+
+void tipc_nodesub_subscribe(struct tipc_node_subscr *node_sub, u32 addr,
+		       void *usr_handle, net_ev_handler handle_down)
+{
+	if (addr == tipc_own_addr) {	/* own node: leave the entry unsubscribed */
+		node_sub->node = NULL;
+		return;
+	}
+
+	node_sub->node = tipc_node_find(addr);
+	if (!node_sub->node) {	/* unknown node: entry stays unsubscribed (node == NULL) */
+		warn("Node subscription rejected, unknown node 0x%x\n", addr);
+		return;
+	}
+	node_sub->handle_node_down = handle_down;
+	node_sub->usr_handle = usr_handle;
+
+	tipc_node_lock(node_sub->node);	/* node lock held while its nsub list is modified */
+	list_add_tail(&node_sub->nodesub_list, &node_sub->node->nsub);
+	tipc_node_unlock(node_sub->node);
+}
+
+/**
+ * tipc_nodesub_unsubscribe - cancel "node down" subscription (if any)
+ */
+
+void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub)
+{
+	/* A NULL node means there is no subscription to cancel */
+	if (node_sub->node) {
+		tipc_node_lock(node_sub->node);
+		list_del_init(&node_sub->nodesub_list);
+		tipc_node_unlock(node_sub->node);
+	}
+}
+
+/**
+ * tipc_nodesub_notify - notify subscribers that a node is unreachable
+ *
+ * Note: node is locked by caller
+ */
+
+void tipc_nodesub_notify(struct tipc_node *node)
+{
+	struct tipc_node_subscr *sub;
+
+	list_for_each_entry(sub, &node->nsub, nodesub_list) {
+		if (!sub->handle_node_down)
+			continue;
+		tipc_k_signal((Handler)sub->handle_node_down,
+			      (unsigned long)sub->usr_handle);
+		sub->handle_node_down = NULL;	/* each subscriber is signalled once */
+	}
+}
diff --git a/net/tipc/node_subscr.h b/net/tipc/node_subscr.h
new file mode 100644
index 00000000..4bc2ca08
--- /dev/null
+++ b/net/tipc/node_subscr.h
@@ -0,0 +1,64 @@
+/*
+ * net/tipc/node_subscr.h: Include file for TIPC "node down" subscription handling
+ *
+ * Copyright (c) 1995-2006, Ericsson AB
+ * Copyright (c) 2005, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_NODE_SUBSCR_H
+#define _TIPC_NODE_SUBSCR_H
+
+#include "addr.h"
+
+typedef void (*net_ev_handler) (void *usr_handle);
+
+/**
+ * struct tipc_node_subscr - "node down" subscription entry
+ * @node: ptr to node structure of interest (or NULL, if none)
+ * @handle_node_down: routine to invoke when node fails
+ * @usr_handle: argument to pass to routine when node fails
+ * @nodesub_list: adjacent entries in list of subscriptions for the node
+ */
+
+struct tipc_node_subscr {
+	struct tipc_node *node;
+	net_ev_handler handle_node_down;
+	void *usr_handle;
+	struct list_head nodesub_list;
+};
+
+void tipc_nodesub_subscribe(struct tipc_node_subscr *node_sub, u32 addr,
+			    void *usr_handle, net_ev_handler handle_down);
+void tipc_nodesub_unsubscribe(struct tipc_node_subscr *node_sub);
+void tipc_nodesub_notify(struct tipc_node *node);
+
+#endif
diff --git a/net/tipc/port.c b/net/tipc/port.c
new file mode 100644
index 00000000..c68dc956
--- /dev/null
+++ b/net/tipc/port.c
@@ -0,0 +1,1355 @@
+/*
+ * net/tipc/port.c: TIPC port code
+ *
+ * Copyright (c) 1992-2007, Ericsson AB
+ * Copyright (c) 2004-2008, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "config.h"
+#include "port.h"
+#include "name_table.h"
+
+/* Connection management: */
+/* Probe period stored in a port's probing_interval by tipc_connect2port() */
+#define PROBING_INTERVAL 3600000	/* [ms] => 1 h */
+/* Values of a port's probing_state */
+#define CONFIRMED 0
+#define PROBING 1
+
+/* Upper bound on the number of data bytes copied into a rejected message */
+#define MAX_REJECT_SIZE 1024
+
+/*
+ * Queue of buffers (chained through ->next) filled by port_dispatcher()
+ * and drained by port_dispatcher_sigh(); both take queue_lock.
+ */
+static struct sk_buff *msg_queue_head;
+static struct sk_buff *msg_queue_tail;
+
+/* Taken around every access to the 'ports' list in this file */
+DEFINE_SPINLOCK(tipc_port_list_lock);
+static DEFINE_SPINLOCK(queue_lock);
+
+/* List of all ports created by tipc_createport_raw() */
+static LIST_HEAD(ports);
+/* Forward declarations for helpers defined further down */
+static void port_handle_node_down(unsigned long ref);
+static struct sk_buff *port_build_self_abort_msg(struct tipc_port *, u32 err);
+static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *, u32 err);
+static void port_timeout(unsigned long ref);
+
+
+/* Peer node address, read from the destination node of the port's header template */
+static u32 port_peernode(struct tipc_port *p_ptr)
+{
+	struct tipc_msg *hdr = &p_ptr->phdr;
+
+	return msg_destnode(hdr);
+}
+
+/* Peer port reference, read from the destination port of the port's header template */
+static u32 port_peerport(struct tipc_port *p_ptr)
+{
+	struct tipc_msg *hdr = &p_ptr->phdr;
+
+	return msg_destport(hdr);
+}
+
+/**
+ * tipc_multicast - send a multicast message to local and remote destinations
+ * @ref: reference of the sending port
+ * @seq: name sequence (type, lower, upper) the message is addressed to
+ * @num_sect: number of entries in @msg_sect
+ * @msg_sect: message sections handed to tipc_msg_build()
+ * @total_len: length value handed to tipc_msg_build()
+ *
+ * Returns -EINVAL if @ref does not resolve to a port, the tipc_msg_build()
+ * result if no buffer was produced, -ENOMEM if the buffer could not be
+ * duplicated for local delivery, and otherwise the tipc_bclink_send_msg()
+ * result (when there are external targets) or the tipc_msg_build() result
+ * (local delivery only).
+ */
+
+int tipc_multicast(u32 ref, struct tipc_name_seq const *seq,
+		   u32 num_sect, struct iovec const *msg_sect,
+		   unsigned int total_len)
+{
+	struct tipc_msg *hdr;
+	struct sk_buff *buf;
+	struct sk_buff *ibuf = NULL;
+	struct port_list dports = {0, NULL, };
+	struct tipc_port *oport = tipc_port_deref(ref);
+	int ext_targets;
+	int res;
+
+	if (unlikely(!oport))
+		return -EINVAL;
+
+	/* Create multicast message */
+
+	/* the port's own header template is rewritten in place */
+	hdr = &oport->phdr;
+	msg_set_type(hdr, TIPC_MCAST_MSG);
+	msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
+	msg_set_destport(hdr, 0);
+	msg_set_destnode(hdr, 0);
+	msg_set_nametype(hdr, seq->type);
+	msg_set_namelower(hdr, seq->lower);
+	msg_set_nameupper(hdr, seq->upper);
+	msg_set_hdr_sz(hdr, MCAST_H_SIZE);
+	res = tipc_msg_build(hdr, msg_sect, num_sect, total_len, MAX_MSG_SIZE,
+			!oport->user_port, &buf);
+	if (unlikely(!buf))
+		return res;
+
+	/* Figure out where to send multicast message */
+
+	/* dports receives the list of destination ports found by the lookup */
+	ext_targets = tipc_nametbl_mc_translate(seq->type, seq->lower, seq->upper,
+						TIPC_NODE_SCOPE, &dports);
+
+	/* Send message to destinations (duplicate it only if necessary) */
+
+	if (ext_targets) {
+		if (dports.count != 0) {
+			/* both broadcast and local delivery needed: copy for the latter */
+			ibuf = skb_copy(buf, GFP_ATOMIC);
+			if (ibuf == NULL) {
+				tipc_port_list_free(&dports);
+				buf_discard(buf);
+				return -ENOMEM;
+			}
+		}
+		res = tipc_bclink_send_msg(buf);
+		/* broadcast failed: drop the local copy made above */
+		if ((res < 0) && (dports.count != 0))
+			buf_discard(ibuf);
+	} else {
+		/* no external targets: the original buffer is delivered locally */
+		ibuf = buf;
+	}
+
+	if (res >= 0) {
+		/* tipc_port_recv_mcast() also releases the dports list */
+		if (ibuf)
+			tipc_port_recv_mcast(ibuf, &dports);
+	} else {
+		tipc_port_list_free(&dports);
+	}
+	return res;
+}
+
+/**
+ * tipc_port_recv_mcast - deliver multicast message to all destination ports
+ * @buf: message buffer; handed to tipc_port_recv_msg() or discarded here
+ * @dp: list of destination ports, or NULL
+ *
+ * If there is no port list, perform a lookup to create one.
+ * The port list is released with tipc_port_list_free() before returning.
+ */
+
+void tipc_port_recv_mcast(struct sk_buff *buf, struct port_list *dp)
+{
+	struct tipc_msg *msg;
+	struct port_list dports = {0, NULL, };
+	struct port_list *item = dp;
+	int cnt = 0;
+
+	msg = buf_msg(buf);
+
+	/* Create destination port list, if one wasn't supplied */
+
+	if (dp == NULL) {
+		tipc_nametbl_mc_translate(msg_nametype(msg),
+				     msg_namelower(msg),
+				     msg_nameupper(msg),
+				     TIPC_CLUSTER_SCOPE,
+				     &dports);
+		item = dp = &dports;
+	}
+
+	/* Deliver a copy of message to each destination port */
+
+	if (dp->count != 0) {
+		msg_set_destnode(msg, tipc_own_addr);
+		if (dp->count == 1) {
+			/* single destination: pass the original buffer on, no clone */
+			msg_set_destport(msg, dp->ports[0]);
+			tipc_port_recv_msg(buf);
+			tipc_port_list_free(dp);
+			return;
+		}
+		for (; cnt < dp->count; cnt++) {
+			/* each list item holds PLSIZE port entries */
+			int index = cnt % PLSIZE;
+			struct sk_buff *b = skb_clone(buf, GFP_ATOMIC);
+
+			if (b == NULL) {
+				warn("Unable to deliver multicast message(s)\n");
+				goto exit;
+			}
+			/* current item exhausted: continue with the next one */
+			if ((index == 0) && (cnt != 0))
+				item = item->next;
+			msg_set_destport(buf_msg(b), item->ports[index]);
+			tipc_port_recv_msg(b);
+		}
+	}
+exit:
+	/* only clones were delivered (if anything), so drop the original */
+	buf_discard(buf);
+	tipc_port_list_free(dp);
+}
+
+/**
+ * tipc_createport_raw - allocate and register a generic TIPC port
+ * @usr_handle: opaque value stored in the new port
+ * @dispatcher: routine stored as the port's message dispatcher
+ * @wakeup: routine stored as the port's wakeup handler
+ * @importance: importance used to initialise the port's header template
+ *
+ * Returns pointer to (locked) TIPC port, or NULL if unable to create it
+ */
+
+struct tipc_port *tipc_createport_raw(void *usr_handle,
+			u32 (*dispatcher)(struct tipc_port *, struct sk_buff *),
+			void (*wakeup)(struct tipc_port *),
+			const u32 importance)
+{
+	struct tipc_port *port;
+	struct tipc_msg *hdr;
+	u32 port_ref;
+
+	port = kzalloc(sizeof(*port), GFP_ATOMIC);
+	if (port == NULL) {
+		warn("Port creation failed, no memory\n");
+		return NULL;
+	}
+
+	port_ref = tipc_ref_acquire(port, &port->lock);
+	if (port_ref == 0) {
+		warn("Port creation failed, reference table exhausted\n");
+		kfree(port);
+		return NULL;
+	}
+
+	/* Plain port state */
+	port->ref = port_ref;
+	port->usr_handle = usr_handle;
+	port->max_pkt = MAX_PKT_DEFAULT;
+	port->user_port = NULL;
+	port->dispatcher = dispatcher;
+	port->wakeup = wakeup;
+	INIT_LIST_HEAD(&port->wait_list);
+	INIT_LIST_HEAD(&port->subscription.nodesub_list);
+
+	/* Header template used for messages sent from this port */
+	hdr = &port->phdr;
+	tipc_msg_init(hdr, importance, TIPC_NAMED_MSG, LONG_H_SIZE, 0);
+	msg_set_origport(hdr, port_ref);
+
+	k_init_timer(&port->timer, (Handler)port_timeout, port_ref);
+
+	/* Make the port visible on the global port list */
+	spin_lock_bh(&tipc_port_list_lock);
+	INIT_LIST_HEAD(&port->publications);
+	INIT_LIST_HEAD(&port->port_list);
+	list_add_tail(&port->port_list, &ports);
+	spin_unlock_bh(&tipc_port_list_lock);
+
+	return port;
+}
+
+/*
+ * tipc_deleteport(): withdraw the port's publications, discard its
+ * reference and free it. If the port was connected, an abort message
+ * for the peer is built and routed after the port has been freed.
+ *
+ * Returns 0 on success, or -EINVAL if @ref does not resolve to a port.
+ */
+int tipc_deleteport(u32 ref)
+{
+	struct tipc_port *p_ptr;
+	struct sk_buff *buf = NULL;
+
+	/* a NULL sequence makes tipc_withdraw() drop every publication */
+	tipc_withdraw(ref, 0, NULL);
+	p_ptr = tipc_port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+
+	tipc_ref_discard(ref);
+	tipc_port_unlock(p_ptr);
+
+	k_cancel_timer(&p_ptr->timer);
+	if (p_ptr->connected) {
+		buf = port_build_peer_abort_msg(p_ptr, TIPC_ERR_NO_PORT);
+		tipc_nodesub_unsubscribe(&p_ptr->subscription);
+	}
+	kfree(p_ptr->user_port);
+
+	spin_lock_bh(&tipc_port_list_lock);
+	list_del(&p_ptr->port_list);
+	list_del(&p_ptr->wait_list);
+	spin_unlock_bh(&tipc_port_list_lock);
+	k_term_timer(&p_ptr->timer);
+	kfree(p_ptr);
+	/* buf is still NULL here if the port was not connected */
+	tipc_net_route_msg(buf);
+	return 0;
+}
+
+/* Report the "source droppable" setting held in the port's header template */
+static int port_unreliable(struct tipc_port *p_ptr)
+{
+	struct tipc_msg *hdr = &p_ptr->phdr;
+
+	return msg_src_droppable(hdr);
+}
+
+/*
+ * tipc_portunreliable(): store the port's "source droppable" setting
+ * in *isunreliable. Returns 0, or -EINVAL if the port cannot be locked.
+ */
+int tipc_portunreliable(u32 ref, unsigned int *isunreliable)
+{
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	*isunreliable = port_unreliable(port);
+	tipc_port_unlock(port);
+	return 0;
+}
+
+/*
+ * tipc_set_portunreliable(): set the port's "source droppable" setting;
+ * any non-zero @isunreliable is stored as 1. Returns 0, or -EINVAL if the
+ * port cannot be locked.
+ */
+int tipc_set_portunreliable(u32 ref, unsigned int isunreliable)
+{
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	msg_set_src_droppable(&port->phdr, (isunreliable != 0));
+	tipc_port_unlock(port);
+	return 0;
+}
+
+/* Report the "destination droppable" setting held in the port's header template */
+static int port_unreturnable(struct tipc_port *p_ptr)
+{
+	struct tipc_msg *hdr = &p_ptr->phdr;
+
+	return msg_dest_droppable(hdr);
+}
+
+/*
+ * tipc_portunreturnable(): store the port's "destination droppable"
+ * setting in *isunrejectable. Returns 0, or -EINVAL if the port cannot
+ * be locked.
+ */
+int tipc_portunreturnable(u32 ref, unsigned int *isunrejectable)
+{
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	*isunrejectable = port_unreturnable(port);
+	tipc_port_unlock(port);
+	return 0;
+}
+
+/*
+ * tipc_set_portunreturnable(): set the port's "destination droppable"
+ * setting; any non-zero @isunrejectable is stored as 1. Returns 0, or
+ * -EINVAL if the port cannot be locked.
+ */
+int tipc_set_portunreturnable(u32 ref, unsigned int isunrejectable)
+{
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	msg_set_dest_droppable(&port->phdr, (isunrejectable != 0));
+	tipc_port_unlock(port);
+	return 0;
+}
+
+/*
+ * port_build_proto_msg(): allocate a header-only (LONG_H_SIZE) buffer
+ * and fill it in as a port level protocol message or a connection
+ * abortion message. Called with tipc_port lock on.
+ *
+ * Returns the new buffer, or NULL if none could be acquired.
+ */
+static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
+					    u32 origport, u32 orignode,
+					    u32 usr, u32 type, u32 err,
+					    u32 ack)
+{
+	struct sk_buff *skb = tipc_buf_acquire(LONG_H_SIZE);
+	struct tipc_msg *hdr;
+
+	if (!skb)
+		return NULL;
+
+	hdr = buf_msg(skb);
+	tipc_msg_init(hdr, usr, type, LONG_H_SIZE, destnode);
+	msg_set_errcode(hdr, err);
+	msg_set_destport(hdr, destport);
+	msg_set_origport(hdr, origport);
+	msg_set_orignode(hdr, orignode);
+	msg_set_msgcnt(hdr, ack);
+	return skb;
+}
+
+/*
+ * tipc_reject_msg(): return a message to its sender with error code @err.
+ *
+ * @buf is always consumed (discarded). At most MAX_REJECT_SIZE bytes of
+ * the original data are copied into the returned message. Nothing is
+ * sent back if the message already carries an error code, is marked
+ * "destination droppable", or no buffer can be acquired.
+ *
+ * Returns the (possibly truncated) data size in every case.
+ */
+int tipc_reject_msg(struct sk_buff *buf, u32 err)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct sk_buff *rbuf;
+	struct tipc_msg *rmsg;
+	int hdr_sz;
+	u32 imp = msg_importance(msg);
+	u32 data_sz = msg_data_sz(msg);
+
+	if (data_sz > MAX_REJECT_SIZE)
+		data_sz = MAX_REJECT_SIZE;
+	/* connection messages are returned one importance level higher */
+	if (msg_connected(msg) && (imp < TIPC_CRITICAL_IMPORTANCE))
+		imp++;
+
+	/* discard rejected message if it shouldn't be returned to sender */
+	if (msg_errcode(msg) || msg_dest_droppable(msg)) {
+		buf_discard(buf);
+		return data_sz;
+	}
+
+	/* construct rejected message */
+	if (msg_mcast(msg))
+		hdr_sz = MCAST_H_SIZE;
+	else
+		hdr_sz = LONG_H_SIZE;
+	rbuf = tipc_buf_acquire(data_sz + hdr_sz);
+	if (rbuf == NULL) {
+		buf_discard(buf);
+		return data_sz;
+	}
+	rmsg = buf_msg(rbuf);
+	tipc_msg_init(rmsg, imp, msg_type(msg), hdr_sz, msg_orignode(msg));
+	msg_set_errcode(rmsg, err);
+	/* origin and destination ports are swapped relative to the original */
+	msg_set_destport(rmsg, msg_origport(msg));
+	msg_set_origport(rmsg, msg_destport(msg));
+	if (msg_short(msg)) {
+		msg_set_orignode(rmsg, tipc_own_addr);
+		/* leave name type & instance as zeroes */
+	} else {
+		msg_set_orignode(rmsg, msg_destnode(msg));
+		msg_set_nametype(rmsg, msg_nametype(msg));
+		msg_set_nameinst(rmsg, msg_nameinst(msg));
+	}
+	msg_set_size(rmsg, data_sz + hdr_sz);
+	skb_copy_to_linear_data_offset(rbuf, hdr_sz, msg_data(msg), data_sz);
+
+	/* send self-abort message when rejecting on a connected port */
+	if (msg_connected(msg)) {
+		struct sk_buff *abuf = NULL;
+		struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg));
+
+		if (p_ptr) {
+			if (p_ptr->connected)
+				abuf = port_build_self_abort_msg(p_ptr, err);
+			tipc_port_unlock(p_ptr);
+		}
+		/* abuf may still be NULL here */
+		tipc_net_route_msg(abuf);
+	}
+
+	/* send rejected message */
+	buf_discard(buf);
+	tipc_net_route_msg(rbuf);
+	return data_sz;
+}
+
+/*
+ * tipc_port_reject_sections(): build a message from @hdr and the given
+ * sections, then reject it with error code @err.
+ *
+ * Returns the tipc_msg_build() result if no buffer was produced,
+ * otherwise the tipc_reject_msg() result.
+ */
+int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr,
+			      struct iovec const *msg_sect, u32 num_sect,
+			      unsigned int total_len, int err)
+{
+	struct sk_buff *rej_buf;
+	int build_res;
+
+	build_res = tipc_msg_build(hdr, msg_sect, num_sect, total_len,
+				   MAX_MSG_SIZE, !p_ptr->user_port, &rej_buf);
+	if (rej_buf == NULL)
+		return build_res;
+
+	return tipc_reject_msg(rej_buf, err);
+}
+
+/*
+ * port_timeout(): probe timer handler for the port identified by @ref.
+ *
+ * Does nothing unless the port exists and is connected. If the previous
+ * probe is still outstanding (state PROBING) a self-abort message is
+ * built; otherwise a new probe is built, the state becomes PROBING and
+ * the timer is restarted. The message is routed after the port lock has
+ * been released.
+ */
+static void port_timeout(unsigned long ref)
+{
+	struct sk_buff *out = NULL;
+	struct tipc_port *port;
+
+	port = tipc_port_lock(ref);
+	if (port == NULL)
+		return;
+
+	if (port->connected) {
+		if (port->probing_state != PROBING) {
+			/* last probe was answered: send the next one */
+			out = port_build_proto_msg(port_peerport(port),
+						   port_peernode(port),
+						   port->ref,
+						   tipc_own_addr,
+						   CONN_MANAGER,
+						   CONN_PROBE,
+						   TIPC_OK,
+						   0);
+			port->probing_state = PROBING;
+			k_start_timer(&port->timer, port->probing_interval);
+		} else {
+			/* last probe went unanswered: abort the connection */
+			out = port_build_self_abort_msg(port, TIPC_ERR_NO_PORT);
+		}
+		tipc_port_unlock(port);
+		tipc_net_route_msg(out);
+		return;
+	}
+
+	tipc_port_unlock(port);
+}
+
+
+/*
+ * port_handle_node_down(): "node down" handler registered by
+ * tipc_connect2port(). Builds a TIPC_ERR_NO_NODE self-abort message
+ * for the port and routes it once the port lock has been dropped.
+ */
+static void port_handle_node_down(unsigned long ref)
+{
+	struct sk_buff *abort_msg;
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return;
+
+	abort_msg = port_build_self_abort_msg(port, TIPC_ERR_NO_NODE);
+	tipc_port_unlock(port);
+	tipc_net_route_msg(abort_msg);
+}
+
+
+/*
+ * port_build_self_abort_msg(): build a connection abort message that is
+ * addressed to this port itself and carries the peer as its origin.
+ *
+ * The message importance is one level above the port's own, capped at
+ * TIPC_CRITICAL_IMPORTANCE. Returns NULL if the port is not connected
+ * or no buffer could be acquired.
+ */
+static struct sk_buff *port_build_self_abort_msg(struct tipc_port *p_ptr, u32 err)
+{
+	u32 level;
+
+	if (!p_ptr->connected)
+		return NULL;
+
+	level = msg_importance(&p_ptr->phdr);
+	if (level < TIPC_CRITICAL_IMPORTANCE)
+		level++;
+
+	return port_build_proto_msg(p_ptr->ref, tipc_own_addr,
+				    port_peerport(p_ptr), port_peernode(p_ptr),
+				    level, TIPC_CONN_MSG, err, 0);
+}
+
+
+/*
+ * port_build_peer_abort_msg(): build a connection abort message that is
+ * addressed to the peer and carries this port as its origin.
+ *
+ * The message importance is one level above the port's own, capped at
+ * TIPC_CRITICAL_IMPORTANCE. Returns NULL if the port is not connected
+ * or no buffer could be acquired.
+ */
+static struct sk_buff *port_build_peer_abort_msg(struct tipc_port *p_ptr, u32 err)
+{
+	u32 level;
+
+	if (!p_ptr->connected)
+		return NULL;
+
+	level = msg_importance(&p_ptr->phdr);
+	if (level < TIPC_CRITICAL_IMPORTANCE)
+		level++;
+
+	return port_build_proto_msg(port_peerport(p_ptr), port_peernode(p_ptr),
+				    p_ptr->ref, tipc_own_addr,
+				    level, TIPC_CONN_MSG, err, 0);
+}
+
+/*
+ * tipc_port_recv_proto_msg(): handle a port level protocol message.
+ *
+ * @buf is always discarded before returning. Depending on the state of
+ * the destination port:
+ *  - no such port, a connected port with a different peer, or a
+ *    published but unconnected port: an error message carrying
+ *    TIPC_ERR_NO_PORT is returned to the originator;
+ *  - CONN_ACK from the peer of a connected port: the acknowledged count
+ *    is added to the port and, if that ends the congestion, the port's
+ *    wakeup routine is called;
+ *  - otherwise the probing state is set to CONFIRMED, and a CONN_PROBE
+ *    is answered with a CONN_PROBE_REPLY.
+ * The reply, if any, is routed after the port lock has been released.
+ */
+void tipc_port_recv_proto_msg(struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	struct tipc_port *p_ptr = tipc_port_lock(msg_destport(msg));
+	u32 err = TIPC_OK;
+	struct sk_buff *r_buf = NULL;
+
+	if (!p_ptr) {
+		err = TIPC_ERR_NO_PORT;
+	} else if (p_ptr->connected) {
+		if ((port_peernode(p_ptr) != msg_orignode(msg)) ||
+		    (port_peerport(p_ptr) != msg_origport(msg))) {
+			err = TIPC_ERR_NO_PORT;
+		} else if (msg_type(msg) == CONN_ACK) {
+			/* evaluated before the ack is counted in */
+			int wakeup = tipc_port_congested(p_ptr) &&
+				     p_ptr->congested &&
+				     p_ptr->wakeup;
+			p_ptr->acked += msg_msgcnt(msg);
+			if (tipc_port_congested(p_ptr))
+				goto exit;
+			p_ptr->congested = 0;
+			if (!wakeup)
+				goto exit;
+			p_ptr->wakeup(p_ptr);
+			goto exit;
+		}
+	} else if (p_ptr->published) {
+		err = TIPC_ERR_NO_PORT;
+	}
+	if (err) {
+		r_buf = port_build_proto_msg(msg_origport(msg),
+					     msg_orignode(msg),
+					     msg_destport(msg),
+					     tipc_own_addr,
+					     TIPC_HIGH_IMPORTANCE,
+					     TIPC_CONN_MSG,
+					     err,
+					     0);
+		goto exit;
+	}
+
+	/* All is fine (err is clear, so p_ptr is known to be non-NULL) */
+	if (msg_type(msg) == CONN_PROBE) {
+		r_buf = port_build_proto_msg(msg_origport(msg),
+					     msg_orignode(msg),
+					     msg_destport(msg),
+					     tipc_own_addr,
+					     CONN_MANAGER,
+					     CONN_PROBE_REPLY,
+					     TIPC_OK,
+					     0);
+	}
+	p_ptr->probing_state = CONFIRMED;
+exit:
+	if (p_ptr)
+		tipc_port_unlock(p_ptr);
+	/* r_buf may still be NULL here */
+	tipc_net_route_msg(r_buf);
+	buf_discard(buf);
+}
+
+/*
+ * port_print(): append a one-line description of a port to a print buffer.
+ *
+ * The line starts with the port reference (prefixed by the own
+ * <zone.cluster.node> address when @full_id is set), followed by the
+ * peer identity for a connected port or the list of bound names for a
+ * published one.
+ */
+static void port_print(struct tipc_port *p_ptr, struct print_buf *buf, int full_id)
+{
+	struct publication *publ;
+
+	if (!full_id)
+		tipc_printf(buf, "%-10u:", p_ptr->ref);
+	else
+		tipc_printf(buf, "<%u.%u.%u:%u>:",
+			    tipc_zone(tipc_own_addr), tipc_cluster(tipc_own_addr),
+			    tipc_node(tipc_own_addr), p_ptr->ref);
+
+	if (p_ptr->connected) {
+		u32 peer_port = port_peerport(p_ptr);
+		u32 peer_node = port_peernode(p_ptr);
+
+		tipc_printf(buf, " connected to <%u.%u.%u:%u>",
+			    tipc_zone(peer_node), tipc_cluster(peer_node),
+			    tipc_node(peer_node), peer_port);
+		if (p_ptr->conn_type)
+			tipc_printf(buf, " via {%u,%u}",
+				    p_ptr->conn_type,
+				    p_ptr->conn_instance);
+	} else if (p_ptr->published) {
+		tipc_printf(buf, " bound to");
+		list_for_each_entry(publ, &p_ptr->publications, pport_list) {
+			/* a range is printed with both bounds, a single instance once */
+			if (publ->lower != publ->upper)
+				tipc_printf(buf, " {%u,%u,%u}", publ->type,
+					    publ->lower, publ->upper);
+			else
+				tipc_printf(buf, " {%u,%u}", publ->type,
+					    publ->lower);
+		}
+	}
+	tipc_printf(buf, "\n");
+}
+
+/* Size of the text area reserved for the port listing */
+#define MAX_PORT_QUERY 32768
+
+/*
+ * tipc_port_get_ports(): build a configuration reply holding a textual
+ * listing of all ports, one line per port.
+ *
+ * Returns the reply buffer, or NULL if it could not be allocated.
+ */
+struct sk_buff *tipc_port_get_ports(void)
+{
+	struct sk_buff *reply;
+	struct tlv_desc *tlv;
+	struct print_buf text;
+	struct tipc_port *port;
+	int text_len;
+
+	reply = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_QUERY));
+	if (reply == NULL)
+		return NULL;
+
+	tlv = (struct tlv_desc *)reply->data;
+	tipc_printbuf_init(&text, TLV_DATA(tlv), MAX_PORT_QUERY);
+
+	/* print every port while holding both the list lock and its own lock */
+	spin_lock_bh(&tipc_port_list_lock);
+	list_for_each_entry(port, &ports, port_list) {
+		spin_lock_bh(port->lock);
+		port_print(port, &text, 0);
+		spin_unlock_bh(port->lock);
+	}
+	spin_unlock_bh(&tipc_port_list_lock);
+
+	text_len = tipc_printbuf_validate(&text);
+	skb_put(reply, TLV_SPACE(text_len));
+	TLV_SET(tlv, TIPC_TLV_ULTRA_STRING, NULL, text_len);
+
+	return reply;
+}
+
+/*
+ * tipc_port_reinit(): write the node's own address into the header
+ * templates of the ports on the port list.
+ *
+ * The walk stops at the first port whose template already carries the
+ * own address as originating node.
+ */
+void tipc_port_reinit(void)
+{
+	struct tipc_port *port;
+
+	spin_lock_bh(&tipc_port_list_lock);
+	list_for_each_entry(port, &ports, port_list) {
+		struct tipc_msg *hdr = &port->phdr;
+
+		if (msg_orignode(hdr) == tipc_own_addr)
+			break;
+		msg_set_prevnode(hdr, tipc_own_addr);
+		msg_set_orignode(hdr, tipc_own_addr);
+	}
+	spin_unlock_bh(&tipc_port_list_lock);
+}
+
+
+/*
+ *  port_dispatcher_sigh(): Signal handler for messages destined
+ *                          to the tipc_port interface.
+ *
+ *  Detaches the whole queue built by port_dispatcher() and hands each
+ *  buffer to the matching user port callback. Messages carrying an
+ *  error code go to the error callbacks instead. Messages that cannot
+ *  be delivered are rejected with TIPC_ERR_NO_PORT, except on the
+ *  error path, where they are simply discarded.
+ */
+
+static void port_dispatcher_sigh(void *dummy)
+{
+	struct sk_buff *buf;
+
+	/* take over the complete queue; tail is reset by the next enqueue */
+	spin_lock_bh(&queue_lock);
+	buf = msg_queue_head;
+	msg_queue_head = NULL;
+	spin_unlock_bh(&queue_lock);
+
+	while (buf) {
+		struct tipc_port *p_ptr;
+		struct user_port *up_ptr;
+		struct tipc_portid orig;
+		struct tipc_name_seq dseq;
+		void *usr_handle;
+		int connected;
+		int published;
+		u32 message_type;
+
+		/* remember the successor first: buf is consumed below */
+		struct sk_buff *next = buf->next;
+		struct tipc_msg *msg = buf_msg(buf);
+		u32 dref = msg_destport(msg);
+
+		message_type = msg_type(msg);
+		if (message_type > TIPC_DIRECT_MSG)
+			goto reject;	/* Unsupported message type */
+
+		p_ptr = tipc_port_lock(dref);
+		if (!p_ptr)
+			goto reject;	/* Port deleted while msg in queue */
+
+		/* snapshot the port state needed after the lock is dropped */
+		orig.ref = msg_origport(msg);
+		orig.node = msg_orignode(msg);
+		up_ptr = p_ptr->user_port;
+		usr_handle = up_ptr->usr_handle;
+		connected = p_ptr->connected;
+		published = p_ptr->published;
+
+		if (unlikely(msg_errcode(msg)))
+			goto err;
+
+		/* every case below releases the port lock before calling out */
+		switch (message_type) {
+
+		case TIPC_CONN_MSG:{
+				tipc_conn_msg_event cb = up_ptr->conn_msg_cb;
+				u32 peer_port = port_peerport(p_ptr);
+				u32 peer_node = port_peernode(p_ptr);
+				u32 dsz;
+
+				tipc_port_unlock(p_ptr);
+				if (unlikely(!cb))
+					goto reject;
+				if (unlikely(!connected)) {
+					/* first message on an unconnected port: connect to its sender */
+					if (tipc_connect2port(dref, &orig))
+						goto reject;
+				} else if ((msg_origport(msg) != peer_port) ||
+					   (msg_orignode(msg) != peer_node))
+					goto reject;
+				dsz = msg_data_sz(msg);
+				/* NOTE(review): conn_unacked is updated after the port lock was released */
+				if (unlikely(dsz &&
+					     (++p_ptr->conn_unacked >=
+					      TIPC_FLOW_CONTROL_WIN)))
+					tipc_acknowledge(dref,
+							 p_ptr->conn_unacked);
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg), dsz);
+				break;
+			}
+		case TIPC_DIRECT_MSG:{
+				tipc_msg_event cb = up_ptr->msg_cb;
+
+				tipc_port_unlock(p_ptr);
+				if (unlikely(!cb || connected))
+					goto reject;
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_importance(msg),
+				   &orig);
+				break;
+			}
+		case TIPC_MCAST_MSG:
+		case TIPC_NAMED_MSG:{
+				tipc_named_msg_event cb = up_ptr->named_msg_cb;
+
+				tipc_port_unlock(p_ptr);
+				if (unlikely(!cb || connected || !published))
+					goto reject;
+				/* a named message addresses a single instance, a multicast a range */
+				dseq.type =  msg_nametype(msg);
+				dseq.lower = msg_nameinst(msg);
+				dseq.upper = (message_type == TIPC_NAMED_MSG)
+					? dseq.lower : msg_nameupper(msg);
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_importance(msg),
+				   &orig, &dseq);
+				break;
+			}
+		}
+		/* the callback receives &buf; discard only if buf is still set */
+		if (buf)
+			buf_discard(buf);
+		buf = next;
+		continue;
+err:
+		/* message carries an error code: use the error callbacks */
+		switch (message_type) {
+
+		case TIPC_CONN_MSG:{
+				tipc_conn_shutdown_event cb =
+					up_ptr->conn_err_cb;
+				u32 peer_port = port_peerport(p_ptr);
+				u32 peer_node = port_peernode(p_ptr);
+
+				tipc_port_unlock(p_ptr);
+				if (!cb || !connected)
+					break;
+				if ((msg_origport(msg) != peer_port) ||
+				    (msg_orignode(msg) != peer_node))
+					break;
+				tipc_disconnect(dref);
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_errcode(msg));
+				break;
+			}
+		case TIPC_DIRECT_MSG:{
+				tipc_msg_err_event cb = up_ptr->err_cb;
+
+				tipc_port_unlock(p_ptr);
+				if (!cb || connected)
+					break;
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_errcode(msg), &orig);
+				break;
+			}
+		case TIPC_MCAST_MSG:
+		case TIPC_NAMED_MSG:{
+				tipc_named_msg_err_event cb =
+					up_ptr->named_err_cb;
+
+				tipc_port_unlock(p_ptr);
+				if (!cb || connected)
+					break;
+				dseq.type =  msg_nametype(msg);
+				dseq.lower = msg_nameinst(msg);
+				dseq.upper = (message_type == TIPC_NAMED_MSG)
+					? dseq.lower : msg_nameupper(msg);
+				skb_pull(buf, msg_hdr_sz(msg));
+				cb(usr_handle, dref, &buf, msg_data(msg),
+				   msg_data_sz(msg), msg_errcode(msg), &dseq);
+				break;
+			}
+		}
+		if (buf)
+			buf_discard(buf);
+		buf = next;
+		continue;
+reject:
+		/* tipc_reject_msg() consumes buf */
+		tipc_reject_msg(buf, TIPC_ERR_NO_PORT);
+		buf = next;
+	}
+}
+
+/*
+ *  port_dispatcher(): Dispatcher for messages destined
+ *  to the tipc_port interface. Called with port locked.
+ *
+ *  Appends the buffer to the dispatch queue. When the queue was empty,
+ *  port_dispatcher_sigh() is signalled to drain it. Always returns 0.
+ */
+
+static u32 port_dispatcher(struct tipc_port *dummy, struct sk_buff *buf)
+{
+	int was_empty;
+
+	buf->next = NULL;
+
+	spin_lock_bh(&queue_lock);
+	was_empty = (msg_queue_head == NULL);
+	if (was_empty)
+		msg_queue_head = buf;
+	else
+		msg_queue_tail->next = buf;
+	msg_queue_tail = buf;
+	if (was_empty)
+		tipc_k_signal((Handler)port_dispatcher_sigh, 0);
+	spin_unlock_bh(&queue_lock);
+
+	return 0;
+}
+
+/*
+ * Wake up port after congestion: signal handler scheduled by
+ * port_wakeup(). Locks the port itself, picks up the user's
+ * continue-event callback and handle, and invokes the callback
+ * (if there is one) after the port lock has been released.
+ */
+
+static void port_wakeup_sh(unsigned long ref)
+{
+	tipc_continue_event notify = NULL;
+	void *handle = NULL;
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port != NULL) {
+		struct user_port *uport = port->user_port;
+
+		if (uport != NULL) {
+			notify = uport->continue_event_cb;
+			handle = uport->usr_handle;
+		}
+		tipc_port_unlock(port);
+	}
+
+	if (notify != NULL)
+		notify(handle, ref);
+}
+
+
+/* Wakeup routine of user ports: defer the work to port_wakeup_sh() */
+static void port_wakeup(struct tipc_port *p_ptr)
+{
+	u32 port_ref = p_ptr->ref;
+
+	tipc_k_signal((Handler)port_wakeup_sh, port_ref);
+}
+
+/*
+ * tipc_acknowledge(): acknowledge @ack messages on a connected port.
+ *
+ * Subtracts @ack from the port's unacknowledged count and sends a
+ * CONN_ACK carrying @ack to the peer. Nothing happens if the port
+ * does not exist; nothing is sent if it is not connected.
+ */
+void tipc_acknowledge(u32 ref, u32 ack)
+{
+	struct sk_buff *ack_msg = NULL;
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return;
+
+	if (port->connected) {
+		port->conn_unacked -= ack;
+		ack_msg = port_build_proto_msg(port_peerport(port),
+					       port_peernode(port),
+					       ref, tipc_own_addr,
+					       CONN_MANAGER, CONN_ACK,
+					       TIPC_OK, ack);
+	}
+	tipc_port_unlock(port);
+	tipc_net_route_msg(ack_msg);
+}
+
+/*
+ * tipc_createport(): user level call.
+ *
+ * Creates a port whose messages are handed to the supplied callbacks
+ * through port_dispatcher(), and stores its reference in *portref.
+ *
+ * Returns 0 on success, or -ENOMEM if the user port or the port itself
+ * could not be created.
+ */
+
+int tipc_createport(void *usr_handle,
+		    unsigned int importance,
+		    tipc_msg_err_event error_cb,
+		    tipc_named_msg_err_event named_error_cb,
+		    tipc_conn_shutdown_event conn_error_cb,
+		    tipc_msg_event msg_cb,
+		    tipc_named_msg_event named_msg_cb,
+		    tipc_conn_msg_event conn_msg_cb,
+		    tipc_continue_event continue_event_cb,/* May be zero */
+		    u32 *portref)
+{
+	struct tipc_port *port;
+	struct user_port *uport = kmalloc(sizeof(*uport), GFP_ATOMIC);
+
+	if (uport == NULL) {
+		warn("Port creation failed, no memory\n");
+		return -ENOMEM;
+	}
+
+	/* the raw port comes back locked */
+	port = (struct tipc_port *)tipc_createport_raw(NULL, port_dispatcher,
+						  port_wakeup, importance);
+	if (port == NULL) {
+		kfree(uport);
+		return -ENOMEM;
+	}
+
+	port->user_port = uport;
+
+	/* identity */
+	uport->usr_handle = usr_handle;
+	uport->ref = port->ref;
+
+	/* error callbacks */
+	uport->err_cb = error_cb;
+	uport->named_err_cb = named_error_cb;
+	uport->conn_err_cb = conn_error_cb;
+
+	/* message callbacks */
+	uport->msg_cb = msg_cb;
+	uport->named_msg_cb = named_msg_cb;
+	uport->conn_msg_cb = conn_msg_cb;
+	uport->continue_event_cb = continue_event_cb;
+
+	*portref = port->ref;
+	tipc_port_unlock(port);
+	return 0;
+}
+
+/*
+ * tipc_portimportance(): store the importance held in the port's header
+ * template in *importance. Returns 0, or -EINVAL if the port cannot be
+ * locked.
+ */
+int tipc_portimportance(u32 ref, unsigned int *importance)
+{
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	*importance = (unsigned int)msg_importance(&port->phdr);
+	tipc_port_unlock(port);
+	return 0;
+}
+
+/*
+ * tipc_set_portimportance(): set the importance in the port's header
+ * template. Returns 0, or -EINVAL if @imp exceeds
+ * TIPC_CRITICAL_IMPORTANCE or the port cannot be locked.
+ */
+int tipc_set_portimportance(u32 ref, unsigned int imp)
+{
+	struct tipc_port *port;
+
+	if (imp > TIPC_CRITICAL_IMPORTANCE)
+		return -EINVAL;
+
+	port = tipc_port_lock(ref);
+	if (port == NULL)
+		return -EINVAL;
+
+	msg_set_importance(&port->phdr, (u32)imp);
+	tipc_port_unlock(port);
+	return 0;
+}
+
+
+/*
+ * tipc_publish(): bind a name sequence to a port.
+ *
+ * Returns 0 on success; -EINVAL if the port does not exist, is
+ * connected, the sequence bounds are reversed, the scope is out of
+ * range, or the name table refuses the publication; -EADDRINUSE if the
+ * publication key would wrap around to the port reference.
+ */
+int tipc_publish(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
+{
+	struct publication *new_publ;
+	struct tipc_port *port;
+	int res = -EINVAL;
+	u32 key;
+
+	port = tipc_port_lock(ref);
+	if (port == NULL)
+		return -EINVAL;
+
+	if (port->connected ||
+	    (seq->lower > seq->upper) ||
+	    (scope < TIPC_ZONE_SCOPE) ||
+	    (scope > TIPC_NODE_SCOPE))
+		goto exit;
+
+	key = ref + port->pub_count + 1;
+	if (key == ref) {
+		res = -EADDRINUSE;
+		goto exit;
+	}
+
+	new_publ = tipc_nametbl_publish(seq->type, seq->lower, seq->upper,
+					scope, port->ref, key);
+	if (new_publ != NULL) {
+		list_add(&new_publ->pport_list, &port->publications);
+		port->pub_count++;
+		port->published = 1;
+		res = 0;
+	}
+exit:
+	tipc_port_unlock(port);
+	return res;
+}
+
+/*
+ * tipc_withdraw(): withdraw publications from a port.
+ *
+ * With a NULL @seq every publication on the port is withdrawn and 0 is
+ * returned. Otherwise the search stops at the first publication that
+ * matches @scope, type and lower bound: it is withdrawn (result 0) only
+ * if its upper bound matches as well.
+ *
+ * Returns 0 on success, or -EINVAL if the port does not exist or
+ * nothing was withdrawn for the given sequence.
+ */
+int tipc_withdraw(u32 ref, unsigned int scope, struct tipc_name_seq const *seq)
+{
+	struct publication *cur;
+	struct publication *nxt;
+	struct tipc_port *port;
+	int res = -EINVAL;
+
+	port = tipc_port_lock(ref);
+	if (port == NULL)
+		return -EINVAL;
+
+	if (seq != NULL) {
+		list_for_each_entry_safe(cur, nxt,
+					 &port->publications, pport_list) {
+			if ((cur->scope != scope) ||
+			    (cur->type != seq->type) ||
+			    (cur->lower != seq->lower))
+				continue;
+			if (cur->upper == seq->upper) {
+				tipc_nametbl_withdraw(cur->type, cur->lower,
+						      cur->ref, cur->key);
+				res = 0;
+			}
+			break;
+		}
+	} else {
+		list_for_each_entry_safe(cur, nxt,
+					 &port->publications, pport_list) {
+			tipc_nametbl_withdraw(cur->type, cur->lower,
+					      cur->ref, cur->key);
+		}
+		res = 0;
+	}
+
+	if (list_empty(&port->publications))
+		port->published = 0;
+	tipc_port_unlock(port);
+	return res;
+}
+
+/*
+ * tipc_connect2port(): connect a port to the given peer port identity.
+ *
+ * Rewrites the port's header template for connection messages, starts
+ * the probe timer and subscribes to "node down" events for the peer's
+ * node.
+ *
+ * Returns 0 on success, or -EINVAL if the port does not exist, is
+ * already published or connected, or the peer reference is zero.
+ */
+int tipc_connect2port(u32 ref, struct tipc_portid const *peer)
+{
+	struct tipc_port *p_ptr;
+	struct tipc_msg *msg;
+	u32 max_pkt;
+	int res = -EINVAL;
+
+	/*
+	 * Fetch the value before taking the port lock, so that the port is
+	 * never touched after it has been unlocked and the call does not
+	 * run with the port lock held.
+	 */
+	max_pkt = tipc_link_get_max_pkt(peer->node, ref);
+
+	p_ptr = tipc_port_lock(ref);
+	if (!p_ptr)
+		return -EINVAL;
+	if (p_ptr->published || p_ptr->connected)
+		goto exit;
+	if (!peer->ref)
+		goto exit;
+
+	msg = &p_ptr->phdr;
+	msg_set_destnode(msg, peer->node);
+	msg_set_destport(msg, peer->ref);
+	msg_set_orignode(msg, tipc_own_addr);
+	msg_set_origport(msg, p_ptr->ref);
+	msg_set_type(msg, TIPC_CONN_MSG);
+	msg_set_lookup_scope(msg, 0);
+	msg_set_hdr_sz(msg, SHORT_H_SIZE);
+
+	p_ptr->probing_interval = PROBING_INTERVAL;
+	p_ptr->probing_state = CONFIRMED;
+	p_ptr->connected = 1;
+	k_start_timer(&p_ptr->timer, p_ptr->probing_interval);
+
+	tipc_nodesub_subscribe(&p_ptr->subscription, peer->node,
+			  (void *)(unsigned long)ref,
+			  (net_ev_handler)port_handle_node_down);
+	res = 0;
+exit:
+	/* stored on the failure paths too, as before, but now under the lock */
+	p_ptr->max_pkt = max_pkt;
+	tipc_port_unlock(p_ptr);
+	return res;
+}
+
+/**
+ * tipc_disconnect_port - disconnect port from peer
+ * @tp_ptr: port to disconnect
+ *
+ * Port must be locked.
+ *
+ * Returns 0 if the port was connected, -ENOTCONN otherwise.
+ */
+
+int tipc_disconnect_port(struct tipc_port *tp_ptr)
+{
+	if (!tp_ptr->connected)
+		return -ENOTCONN;
+
+	tp_ptr->connected = 0;
+	/* let timer expire on its own to avoid deadlock! */
+	tipc_nodesub_unsubscribe(&tp_ptr->subscription);
+	return 0;
+}
+
+/*
+ * tipc_disconnect(): Disconnect port from peer.
+ *                    This is a node local operation.
+ *
+ * Returns -EINVAL if the port cannot be locked, otherwise the
+ * tipc_disconnect_port() result.
+ */
+
+int tipc_disconnect(u32 ref)
+{
+	int res;
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	res = tipc_disconnect_port(port);
+	tipc_port_unlock(port);
+	return res;
+}
+
+/*
+ * tipc_shutdown(): Send a SHUTDOWN msg to peer and disconnect
+ *
+ * The message is only built for a connected port and is sent one
+ * importance level above the port's own (capped at
+ * TIPC_CRITICAL_IMPORTANCE). Returns -EINVAL if the port cannot be
+ * locked, otherwise the tipc_disconnect() result.
+ */
+int tipc_shutdown(u32 ref)
+{
+	struct sk_buff *shutdown_msg = NULL;
+	struct tipc_port *port = tipc_port_lock(ref);
+
+	if (port == NULL)
+		return -EINVAL;
+
+	if (port->connected) {
+		u32 level = msg_importance(&port->phdr);
+
+		if (level < TIPC_CRITICAL_IMPORTANCE)
+			level++;
+		shutdown_msg = port_build_proto_msg(port_peerport(port),
+						    port_peernode(port),
+						    ref, tipc_own_addr,
+						    level, TIPC_CONN_MSG,
+						    TIPC_CONN_SHUTDOWN, 0);
+	}
+	tipc_port_unlock(port);
+
+	tipc_net_route_msg(shutdown_msg);
+	return tipc_disconnect(ref);
+}
+
+/*
+ *  tipc_port_recv_sections(): Concatenate and deliver sectioned
+ *                        message for this node.
+ *
+ *  Returns the tipc_msg_build() result, whether or not a buffer was
+ *  produced and delivered.
+ */
+
+static int tipc_port_recv_sections(struct tipc_port *sender, unsigned int num_sect,
+				   struct iovec const *msg_sect,
+				   unsigned int total_len)
+{
+	struct sk_buff *msg_buf;
+	int build_res;
+
+	build_res = tipc_msg_build(&sender->phdr, msg_sect, num_sect,
+				   total_len, MAX_MSG_SIZE,
+				   !sender->user_port, &msg_buf);
+	if (unlikely(!msg_buf))
+		return build_res;
+
+	tipc_port_recv_msg(msg_buf);
+	return build_res;
+}
+
+/**
+ * tipc_send - send message sections on connection
+ * @ref: reference of the sending port
+ * @num_sect: number of entries in @msg_sect
+ * @msg_sect: message sections
+ * @total_len: total length of the sections
+ *
+ * Returns -EINVAL if the port does not exist or is not connected. On
+ * congestion returns @total_len for an unreliable port and -ELINKCONG
+ * otherwise. In all other cases returns the result of the send or
+ * local delivery.
+ */
+
+int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect,
+	      unsigned int total_len)
+{
+	struct tipc_port *port = tipc_port_deref(ref);
+	u32 peer_node;
+	int res;
+
+	if (!port || !port->connected)
+		return -EINVAL;
+
+	/* assume congestion until the send has gone through */
+	port->congested = 1;
+	if (tipc_port_congested(port))
+		goto congested;
+
+	peer_node = port_peernode(port);
+	if (unlikely(peer_node == tipc_own_addr))
+		res = tipc_port_recv_sections(port, num_sect, msg_sect,
+					      total_len);
+	else
+		res = tipc_link_send_sections_fast(port, msg_sect, num_sect,
+						   total_len, peer_node);
+
+	if (unlikely(res == -ELINKCONG))
+		goto congested;
+
+	port->congested = 0;
+	if (res > 0)
+		port->sent++;
+	return res;
+
+congested:
+	if (!port_unreliable(port))
+		return -ELINKCONG;
+
+	/* unreliable port: report the message as sent */
+	port->congested = 0;
+	return total_len;
+}
+
+/**
+ * tipc_send2name - send message sections to port name
+ * @ref: reference of the sending port
+ * @name: port name (type, instance) to send to
+ * @domain: lookup domain; also the initial destination node for the lookup
+ * @num_sect: number of entries in @msg_sect
+ * @msg_sect: message sections
+ * @total_len: total length of the sections
+ *
+ * Returns -EINVAL if the port does not exist or is connected. If the
+ * name cannot be translated to a port the message is rejected with
+ * TIPC_ERR_NO_NAME and the result of that rejection is returned. On
+ * link congestion returns @total_len for an unreliable port and
+ * -ELINKCONG otherwise. In all other cases returns the result of the
+ * send or local delivery.
+ */
+
+int tipc_send2name(u32 ref, struct tipc_name const *name, unsigned int domain,
+		   unsigned int num_sect, struct iovec const *msg_sect,
+		   unsigned int total_len)
+{
+	struct tipc_port *port = tipc_port_deref(ref);
+	struct tipc_msg *hdr;
+	u32 destnode = domain;
+	u32 destport;
+	int res;
+
+	if (!port || port->connected)
+		return -EINVAL;
+
+	/* Prepare the port's header template for a named message */
+	hdr = &port->phdr;
+	msg_set_type(hdr, TIPC_NAMED_MSG);
+	msg_set_orignode(hdr, tipc_own_addr);
+	msg_set_origport(hdr, ref);
+	msg_set_hdr_sz(hdr, LONG_H_SIZE);
+	msg_set_nametype(hdr, name->type);
+	msg_set_nameinst(hdr, name->instance);
+	msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
+
+	/* Resolve the name; destnode is updated by the lookup */
+	destport = tipc_nametbl_translate(name->type, name->instance, &destnode);
+	msg_set_destnode(hdr, destnode);
+	msg_set_destport(hdr, destport);
+
+	if (unlikely(!destport))
+		return tipc_port_reject_sections(port, hdr, msg_sect, num_sect,
+						 total_len, TIPC_ERR_NO_NAME);
+
+	if (likely(destnode == tipc_own_addr))
+		res = tipc_port_recv_sections(port, num_sect,
+					      msg_sect, total_len);
+	else
+		res = tipc_link_send_sections_fast(port, msg_sect,
+						   num_sect, total_len,
+						   destnode);
+
+	if (unlikely(res == -ELINKCONG)) {
+		if (port_unreliable(port))
+			return total_len;
+		return -ELINKCONG;
+	}
+
+	if (res > 0)
+		port->sent++;
+	return res;
+}
+
+/**
+ * tipc_send2port - send message sections to port identity
+ * @ref: reference of the sending port
+ * @dest: destination port identity (node, ref)
+ * @num_sect: number of entries in @msg_sect
+ * @msg_sect: message sections
+ * @total_len: total length of the sections
+ *
+ * Returns -EINVAL if the port does not exist or is connected. On link
+ * congestion returns @total_len for an unreliable port and -ELINKCONG
+ * otherwise. In all other cases returns the result of the send or
+ * local delivery.
+ */
+
+int tipc_send2port(u32 ref, struct tipc_portid const *dest,
+		   unsigned int num_sect, struct iovec const *msg_sect,
+		   unsigned int total_len)
+{
+	struct tipc_port *port = tipc_port_deref(ref);
+	struct tipc_msg *hdr;
+	int res;
+
+	if (!port || port->connected)
+		return -EINVAL;
+
+	/* Prepare the port's header template for a direct message */
+	hdr = &port->phdr;
+	msg_set_type(hdr, TIPC_DIRECT_MSG);
+	msg_set_lookup_scope(hdr, 0);
+	msg_set_orignode(hdr, tipc_own_addr);
+	msg_set_origport(hdr, ref);
+	msg_set_destnode(hdr, dest->node);
+	msg_set_destport(hdr, dest->ref);
+	msg_set_hdr_sz(hdr, DIR_MSG_H_SIZE);
+
+	if (dest->node != tipc_own_addr)
+		res = tipc_link_send_sections_fast(port, msg_sect, num_sect,
+						   total_len, dest->node);
+	else
+		res = tipc_port_recv_sections(port, num_sect, msg_sect,
+					      total_len);
+
+	if (unlikely(res == -ELINKCONG)) {
+		if (port_unreliable(port))
+			return total_len;
+		return -ELINKCONG;
+	}
+
+	if (res > 0)
+		port->sent++;
+	return res;
+}
+
+/**
+ * tipc_send_buf2port - send message buffer to port identity
+ * @ref: reference of the sending port
+ * @dest: destination port identity (node, ref)
+ * @buf: buffer holding the message data; a header is pushed in front of it
+ * @dsz: size of the message data
+ *
+ * Returns -EINVAL if the port does not exist or is connected, -ENOMEM
+ * if no room for the header could be made. On link congestion returns
+ * @dsz for an unreliable port and -ELINKCONG otherwise. In all other
+ * cases returns the result of the send or local delivery.
+ */
+
+int tipc_send_buf2port(u32 ref, struct tipc_portid const *dest,
+	       struct sk_buff *buf, unsigned int dsz)
+{
+	struct tipc_port *port = (struct tipc_port *)tipc_ref_deref(ref);
+	struct tipc_msg *hdr;
+	int res;
+
+	if (!port || port->connected)
+		return -EINVAL;
+
+	/* Prepare the port's header template for a direct message */
+	hdr = &port->phdr;
+	msg_set_type(hdr, TIPC_DIRECT_MSG);
+	msg_set_orignode(hdr, tipc_own_addr);
+	msg_set_origport(hdr, ref);
+	msg_set_destnode(hdr, dest->node);
+	msg_set_destport(hdr, dest->ref);
+	msg_set_hdr_sz(hdr, DIR_MSG_H_SIZE);
+	msg_set_size(hdr, DIR_MSG_H_SIZE + dsz);
+
+	/* Make room in front of the data and copy the header there */
+	if (skb_cow(buf, DIR_MSG_H_SIZE))
+		return -ENOMEM;
+	skb_push(buf, DIR_MSG_H_SIZE);
+	skb_copy_to_linear_data(buf, hdr, DIR_MSG_H_SIZE);
+
+	if (dest->node != tipc_own_addr)
+		res = tipc_send_buf_fast(buf, dest->node);
+	else
+		res = tipc_port_recv_msg(buf);
+
+	if (unlikely(res == -ELINKCONG)) {
+		if (port_unreliable(port))
+			return dsz;
+		return -ELINKCONG;
+	}
+
+	if (res > 0)
+		port->sent++;
+	return res;
+}
+
diff --git a/net/tipc/port.h b/net/tipc/port.h
new file mode 100644
index 00000000..b9aa3419
--- /dev/null
+++ b/net/tipc/port.h
@@ -0,0 +1,315 @@
+/*
+ * net/tipc/port.h: Include file for TIPC port code
+ *
+ * Copyright (c) 1994-2007, Ericsson AB
+ * Copyright (c) 2004-2007, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_PORT_H
+#define _TIPC_PORT_H
+
+#include "ref.h"
+#include "net.h"
+#include "msg.h"
+#include "node_subscr.h"
+
+#define TIPC_FLOW_CONTROL_WIN 512
+
+typedef void (*tipc_msg_err_event) (void *usr_handle, u32 portref,
+		struct sk_buff **buf, unsigned char const *data,
+		unsigned int size, int reason,
+		struct tipc_portid const *attmpt_destid);
+
+typedef void (*tipc_named_msg_err_event) (void *usr_handle, u32 portref,
+		struct sk_buff **buf, unsigned char const *data,
+		unsigned int size, int reason,
+		struct tipc_name_seq const *attmpt_dest);
+
+typedef void (*tipc_conn_shutdown_event) (void *usr_handle, u32 portref,
+		struct sk_buff **buf, unsigned char const *data,
+		unsigned int size, int reason);
+
+typedef void (*tipc_msg_event) (void *usr_handle, u32 portref,
+		struct sk_buff **buf, unsigned char const *data,
+		unsigned int size, unsigned int importance,
+		struct tipc_portid const *origin);
+
+typedef void (*tipc_named_msg_event) (void *usr_handle, u32 portref,
+		struct sk_buff **buf, unsigned char const *data,
+		unsigned int size, unsigned int importance,
+		struct tipc_portid const *orig,
+		struct tipc_name_seq const *dest);
+
+typedef void (*tipc_conn_msg_event) (void *usr_handle, u32 portref,
+		struct sk_buff **buf, unsigned char const *data,
+		unsigned int size);
+
+typedef void (*tipc_continue_event) (void *usr_handle, u32 portref);
+
+/**
+ * struct user_port - TIPC user port (used with native API)
+ * @usr_handle: user-specified field
+ * @ref: object reference to associated TIPC port
+ * @err_cb .. @continue_event_cb: user callbacks, typed by the typedefs above
+ */
+
+struct user_port {
+	void *usr_handle;
+	u32 ref;
+	tipc_msg_err_event err_cb;
+	tipc_named_msg_err_event named_err_cb;
+	tipc_conn_shutdown_event conn_err_cb;
+	tipc_msg_event msg_cb;
+	tipc_named_msg_event named_msg_cb;
+	tipc_conn_msg_event conn_msg_cb;
+	tipc_continue_event continue_event_cb;
+};
+
+/**
+ * struct tipc_port - TIPC port structure
+ * @usr_handle: pointer to additional user-defined information about port
+ * @lock: pointer to spinlock for controlling access to port
+ * @connected: non-zero if port is currently connected to a peer port
+ * @conn_type: TIPC type used when connection was established
+ * @conn_instance: TIPC instance used when connection was established
+ * @conn_unacked: number of unacknowledged messages received from peer port
+ * @published: non-zero if port has one or more associated names
+ * @congested: non-zero if cannot send because of link or port congestion
+ * @max_pkt: maximum packet size "hint" used when building messages sent by port
+ * @ref: unique reference to port in TIPC object registry
+ * @phdr: preformatted message header used when sending messages
+ * @port_list: adjacent ports in TIPC's global list of ports
+ * @dispatcher: ptr to routine which handles received messages (0 = accepted)
+ * @wakeup: ptr to routine to call when port is no longer congested
+ * @user_port: ptr to user port associated with port (if any)
+ * @wait_list: adjacent ports in list of ports waiting on link congestion
+ * @waiting_pkts: NOTE(review): undocumented; presumably packets awaiting send
+ * @sent: # of non-empty messages sent by port
+ * @acked: # of non-empty message acknowledgements from connected port's peer
+ * @publications: list of publications for port
+ * @pub_count: total # of publications port has made during its lifetime
+ * @probing_state: NOTE(review): undocumented; presumably peer-probing state
+ * @probing_interval: NOTE(review): undocumented; presumably time between probes
+ * @timer: timer embedded in port; NOTE(review): presumably drives probing
+ * @subscription: "node down" subscription used to terminate failed connections
+ */
+struct tipc_port {
+	void *usr_handle;
+	spinlock_t *lock;
+	int connected;
+	u32 conn_type;
+	u32 conn_instance;
+	u32 conn_unacked;
+	int published;
+	u32 congested;
+	u32 max_pkt;
+	u32 ref;
+	struct tipc_msg phdr;
+	struct list_head port_list;
+	u32 (*dispatcher)(struct tipc_port *, struct sk_buff *);
+	void (*wakeup)(struct tipc_port *);
+	struct user_port *user_port;
+	struct list_head wait_list;
+	u32 waiting_pkts;
+	u32 sent;
+	u32 acked;
+	struct list_head publications;
+	u32 pub_count;
+	u32 probing_state;
+	u32 probing_interval;
+	struct timer_list timer;
+	struct tipc_node_subscr subscription;
+};
+
+extern spinlock_t tipc_port_list_lock;
+struct port_list;	/* opaque here; see tipc_port_recv_mcast() */
+
+/*
+ * TIPC port manipulation routines
+ */
+struct tipc_port *tipc_createport_raw(void *usr_handle,
+		u32 (*dispatcher)(struct tipc_port *, struct sk_buff *),
+		void (*wakeup)(struct tipc_port *), const u32 importance);
+
+int tipc_reject_msg(struct sk_buff *buf, u32 err);
+
+int tipc_send_buf_fast(struct sk_buff *buf, u32 destnode);
+
+void tipc_acknowledge(u32 port_ref, u32 ack);
+
+int tipc_createport(void *usr_handle,
+		unsigned int importance, tipc_msg_err_event error_cb,
+		tipc_named_msg_err_event named_error_cb,
+		tipc_conn_shutdown_event conn_error_cb, tipc_msg_event msg_cb,
+		tipc_named_msg_event named_msg_cb,
+		tipc_conn_msg_event conn_msg_cb,
+		tipc_continue_event continue_event_cb, u32 *portref);
+
+int tipc_deleteport(u32 portref);
+
+int tipc_portimportance(u32 portref, unsigned int *importance);
+int tipc_set_portimportance(u32 portref, unsigned int importance);
+
+int tipc_portunreliable(u32 portref, unsigned int *isunreliable);
+int tipc_set_portunreliable(u32 portref, unsigned int isunreliable);
+
+int tipc_portunreturnable(u32 portref, unsigned int *isunreturnable);
+int tipc_set_portunreturnable(u32 portref, unsigned int isunreturnable);
+
+int tipc_publish(u32 portref, unsigned int scope,
+		struct tipc_name_seq const *name_seq);
+int tipc_withdraw(u32 portref, unsigned int scope,
+		struct tipc_name_seq const *name_seq);
+
+int tipc_connect2port(u32 portref, struct tipc_portid const *port);
+
+int tipc_disconnect(u32 portref);
+
+int tipc_shutdown(u32 ref);
+
+
+/*
+ * The following routines require that the port be locked on entry
+ */
+int tipc_disconnect_port(struct tipc_port *tp_ptr);
+
+/*
+ * TIPC messaging routines (each takes the sending port's reference first)
+ */
+int tipc_send(u32 portref, unsigned int num_sect, struct iovec const *msg_sect,
+	      unsigned int total_len);
+
+int tipc_send2name(u32 portref, struct tipc_name const *name, u32 domain,
+		   unsigned int num_sect, struct iovec const *msg_sect,
+		   unsigned int total_len);
+
+int tipc_send2port(u32 portref, struct tipc_portid const *dest,
+		   unsigned int num_sect, struct iovec const *msg_sect,
+		   unsigned int total_len);
+
+int tipc_send_buf2port(u32 portref, struct tipc_portid const *dest,
+		struct sk_buff *buf, unsigned int dsz);
+
+int tipc_multicast(u32 portref, struct tipc_name_seq const *seq,
+		   unsigned int section_count, struct iovec const *msg,
+		   unsigned int total_len);
+
+int tipc_port_reject_sections(struct tipc_port *p_ptr, struct tipc_msg *hdr,
+			      struct iovec const *msg_sect, u32 num_sect,
+			      unsigned int total_len, int err);
+struct sk_buff *tipc_port_get_ports(void);
+void tipc_port_recv_proto_msg(struct sk_buff *buf);
+void tipc_port_recv_mcast(struct sk_buff *buf, struct port_list *dp);
+void tipc_port_reinit(void);
+
+/**
+ * tipc_port_lock - look up port by reference; return it locked, or NULL
+ */
+
+static inline struct tipc_port *tipc_port_lock(u32 ref)
+{
+	return (struct tipc_port *)tipc_ref_lock(ref); /* NULL if ref invalid */
+}
+
+/**
+ * tipc_port_unlock - unlock a port instance
+ *
+ * Unlocks via p_ptr->lock directly; the locked port needs no registry lookup.
+ */
+
+static inline void tipc_port_unlock(struct tipc_port *p_ptr)
+{
+	spin_unlock_bh(p_ptr->lock);
+}
+
+static inline struct tipc_port *tipc_port_deref(u32 ref)
+{
+	return (struct tipc_port *)tipc_ref_deref(ref); /* takes no lock */
+}
+
+static inline u32 tipc_peer_port(struct tipc_port *p_ptr)
+{
+	return msg_destport(&p_ptr->phdr);	/* from preformatted header */
+}
+
+static inline u32 tipc_peer_node(struct tipc_port *p_ptr)
+{
+	return msg_destnode(&p_ptr->phdr);	/* from preformatted header */
+}
+
+static inline int tipc_port_congested(struct tipc_port *p_ptr)
+{	/* congested: sent minus acked has reached twice the flow control window */
+	return (TIPC_FLOW_CONTROL_WIN * 2) <= (p_ptr->sent - p_ptr->acked);
+}
+
+/**
+ * tipc_port_recv_msg - receive message from lower layer and deliver to port user
+ */
+
+static inline int tipc_port_recv_msg(struct sk_buff *buf)
+{
+	struct tipc_port *p_ptr;
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 destport = msg_destport(msg);
+	u32 dsz = msg_data_sz(msg);	/* returned unless message is rejected */
+	u32 err;
+
+	/* forward unresolved named message */
+	if (unlikely(!destport)) {
+		tipc_net_route_msg(buf);
+		return dsz;
+	}
+
+	/* validate destination & pass to port, otherwise reject message */
+	p_ptr = tipc_port_lock(destport);
+	if (likely(p_ptr)) {
+		if (likely(p_ptr->connected)) {	/* accept only from peer */
+			if ((unlikely(msg_origport(msg) != tipc_peer_port(p_ptr))) ||
+			    (unlikely(msg_orignode(msg) != tipc_peer_node(p_ptr))) ||
+			    (unlikely(!msg_connected(msg)))) {
+				err = TIPC_ERR_NO_PORT;
+				tipc_port_unlock(p_ptr);
+				goto reject;
+			}
+		}
+		err = p_ptr->dispatcher(p_ptr, buf);	/* 0 = accepted */
+		tipc_port_unlock(p_ptr);
+		if (likely(!err))
+			return dsz;
+	} else {
+		err = TIPC_ERR_NO_PORT;
+	}
+reject:
+	return tipc_reject_msg(buf, err);	/* its result is passed up */
+}
+
+#endif
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
new file mode 100644
index 00000000..83116892
--- /dev/null
+++ b/net/tipc/ref.c
@@ -0,0 +1,300 @@
+/*
+ * net/tipc/ref.c: TIPC object registry code
+ *
+ * Copyright (c) 1991-2006, Ericsson AB
+ * Copyright (c) 2004-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "ref.h"
+
+/**
+ * struct reference - TIPC object reference entry
+ * @object: pointer to object associated with reference entry (NULL if unused)
+ * @lock: spinlock controlling access to object
+ * @ref: reference value for object (instance in upper bits, index in lower)
+ */
+
+struct reference {
+	void *object;
+	spinlock_t lock;
+	u32 ref;
+};
+
+/**
+ * struct ref_table - table of TIPC object reference entries
+ * @entries: pointer to array of reference entries
+ * @capacity: array index of first unusable entry
+ * @init_point: array index of first uninitialized entry
+ * @first_free: array index of first unused object reference entry
+ * @last_free: array index of last unused object reference entry
+ * @index_mask: bitmask for array index portion of reference values
+ * @start_mask: initial value for instance value portion of reference values
+ */
+
+struct ref_table {
+	struct reference *entries;
+	u32 capacity;
+	u32 init_point;
+	u32 first_free;
+	u32 last_free;
+	u32 index_mask;
+	u32 start_mask;
+};
+
+/*
+ * Object reference table consists of 2**N entries.
+ *
+ * State	Object ptr	Reference
+ * -----        ----------      ---------
+ * In use        non-NULL       XXXX|own index
+ *				(XXXX changes each time entry is acquired)
+ * Free            NULL         YYYY|next free index
+ *				(YYYY is one more than last used XXXX)
+ * Uninitialized   NULL         0
+ *
+ * Entry 0 is not used; this allows index 0 to denote the end of the free list.
+ *
+ * Note that a reference value of 0 does not necessarily indicate that an
+ * entry is uninitialized, since the last entry in the free list could also
+ * have a reference value of 0 (although this is unlikely).
+ */
+
+static struct ref_table tipc_ref_table;	/* the one table used by this file */
+
+static DEFINE_RWLOCK(ref_table_lock);	/* guards free list and init_point */
+
+/**
+ * tipc_ref_table_init - create reference table for objects (0 or -ENOMEM)
+ */
+
+int tipc_ref_table_init(u32 requested_size, u32 start)
+{
+	struct reference *table;
+	u32 actual_size;
+
+	/* add unused entry 0, then round up to a power of 2 (at least 16) */
+
+	requested_size++;
+	for (actual_size = 16; actual_size < requested_size; actual_size <<= 1)
+		/* do nothing */ ;
+
+	/* allocate zeroed table, which marks all entries as uninitialized */
+
+	table = __vmalloc(actual_size * sizeof(struct reference),
+			  GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+	if (table == NULL)
+		return -ENOMEM;
+
+	tipc_ref_table.entries = table;
+	tipc_ref_table.capacity = requested_size;	/* counts entry 0 too */
+	tipc_ref_table.init_point = 1;		/* entry 0 is never handed out */
+	tipc_ref_table.first_free = 0;		/* 0 = free list is empty */
+	tipc_ref_table.last_free = 0;
+	tipc_ref_table.index_mask = actual_size - 1;
+	tipc_ref_table.start_mask = start & ~tipc_ref_table.index_mask;
+
+	return 0;
+}
+
+/**
+ * tipc_ref_table_stop - free the object reference table, if one exists
+ */
+
+void tipc_ref_table_stop(void)
+{
+	/* nothing to tear down when no table is allocated */
+	if (tipc_ref_table.entries) {
+		vfree(tipc_ref_table.entries);
+		tipc_ref_table.entries = NULL;
+	}
+}
+
+/**
+ * tipc_ref_acquire - create reference to an object
+ *
+ * Register an object pointer in reference table and lock the object.
+ * Returns a unique reference value that is used from then on to retrieve the
+ * object pointer, or to determine that the object has been deregistered;
+ * returns 0 (with @lock untouched) if no reference could be created.
+ * Note: The object is returned in the locked state so that the caller can
+ * register a partially initialized object, without running the risk that
+ * the object will be accessed before initialization is complete.
+ */
+
+u32 tipc_ref_acquire(void *object, spinlock_t **lock)
+{
+	u32 index;
+	u32 index_mask;
+	u32 next_plus_upper;
+	u32 ref;
+	struct reference *entry = NULL;
+
+	if (!object) {
+		err("Attempt to acquire reference to non-existent object\n");
+		return 0;
+	}
+	if (!tipc_ref_table.entries) {
+		err("Reference table not found during acquisition attempt\n");
+		return 0;
+	}
+
+	/* take a free entry, if available; otherwise initialize a new entry */
+
+	write_lock_bh(&ref_table_lock);
+	if (tipc_ref_table.first_free) {
+		index = tipc_ref_table.first_free;
+		entry = &(tipc_ref_table.entries[index]);
+		index_mask = tipc_ref_table.index_mask;
+		next_plus_upper = entry->ref;	/* YYYY|next free index */
+		tipc_ref_table.first_free = next_plus_upper & index_mask;
+		ref = (next_plus_upper & ~index_mask) + index;
+	} else if (tipc_ref_table.init_point < tipc_ref_table.capacity) {
+		index = tipc_ref_table.init_point++;
+		entry = &(tipc_ref_table.entries[index]);
+		spin_lock_init(&entry->lock);
+		ref = tipc_ref_table.start_mask + index;
+	} else {
+		ref = 0;	/* table full; entry stays NULL */
+	}
+	write_unlock_bh(&ref_table_lock);
+
+	/*
+	 * Grab the lock so no one else can modify this entry
+	 * while we assign its ref value & object pointer
+	 */
+	if (entry) {
+		spin_lock_bh(&entry->lock);
+		entry->ref = ref;
+		entry->object = object;
+		*lock = &entry->lock;
+		/*
+		 * keep it locked, the caller is responsible
+		 * for unlocking this when they're done with it
+		 */
+	}
+
+	return ref;
+}
+
+/**
+ * tipc_ref_discard - invalidate references to an object
+ *
+ * Disallow future references to an object and free up the entry for re-use.
+ * Note: The entry's spin_lock may still be busy after discard
+ */
+
+void tipc_ref_discard(u32 ref)
+{
+	struct reference *entry;
+	u32 index;
+	u32 index_mask;
+
+	if (!tipc_ref_table.entries) {
+		err("Reference table not found during discard attempt\n");
+		return;
+	}
+
+	index_mask = tipc_ref_table.index_mask;
+	index = ref & index_mask;
+	entry = &(tipc_ref_table.entries[index]);
+
+	write_lock_bh(&ref_table_lock);
+
+	if (!entry->object) {
+		err("Attempt to discard reference to non-existent object\n");
+		goto exit;
+	}
+	if (entry->ref != ref) {
+		err("Attempt to discard non-existent reference\n");
+		goto exit;
+	}
+
+	/*
+	 * mark entry as unused; increment instance part of entry's reference
+	 * to invalidate any subsequent references (index part 0 = end of list)
+	 */
+
+	entry->object = NULL;
+	entry->ref = (ref & ~index_mask) + (index_mask + 1);
+
+	/* append entry to tail of free entry list */
+
+	if (tipc_ref_table.first_free == 0)
+		tipc_ref_table.first_free = index;
+	else
+		tipc_ref_table.entries[tipc_ref_table.last_free].ref |= index;
+	tipc_ref_table.last_free = index;
+
+exit:
+	write_unlock_bh(&ref_table_lock);
+}
+
+/**
+ * tipc_ref_lock - lock referenced object and return pointer to it
+ */
+void *tipc_ref_lock(u32 ref)
+{
+	struct reference *slot;
+	void *obj = NULL;
+
+	if (likely(tipc_ref_table.entries)) {
+		slot = &tipc_ref_table.entries[ref & tipc_ref_table.index_mask];
+		if (likely(slot->ref != 0)) {
+			spin_lock_bh(&slot->lock);
+			if (likely((slot->ref == ref) && (slot->object)))
+				obj = slot->object;
+			else
+				spin_unlock_bh(&slot->lock);
+		}
+	}
+	return obj;	/* NULL means nothing was left locked */
+}
+
+
+/**
+ * tipc_ref_deref - return pointer to referenced object (without locking it)
+ */
+
+void *tipc_ref_deref(u32 ref)
+{
+	struct reference *slot;
+	void *obj = NULL;
+
+	if (likely(tipc_ref_table.entries)) {
+		slot = &tipc_ref_table.entries[ref & tipc_ref_table.index_mask];
+		if (likely(slot->ref == ref))
+			obj = slot->object;
+	}
+	return obj;	/* NULL if no table or reference mismatch */
+}
+
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
new file mode 100644
index 00000000..5bc8e7ab
--- /dev/null
+++ b/net/tipc/ref.h
@@ -0,0 +1,49 @@
+/*
+ * net/tipc/ref.h: Include file for TIPC object registry code
+ *
+ * Copyright (c) 1991-2006, Ericsson AB
+ * Copyright (c) 2005-2006, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_REF_H
+#define _TIPC_REF_H
+
+int tipc_ref_table_init(u32 requested_size, u32 start);	/* 0 or -ENOMEM */
+void tipc_ref_table_stop(void);
+
+u32 tipc_ref_acquire(void *object, spinlock_t **lock);	/* 0 on failure */
+void tipc_ref_discard(u32 ref);
+
+void *tipc_ref_lock(u32 ref);	/* object returned locked, or NULL */
+void *tipc_ref_deref(u32 ref);	/* lookup only; takes no lock */
+
+#endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
new file mode 100644
index 00000000..33883739
--- /dev/null
+++ b/net/tipc/socket.c
@@ -0,0 +1,1904 @@
+/*
+ * net/tipc/socket.c: TIPC socket API
+ *
+ * Copyright (c) 2001-2007, Ericsson AB
+ * Copyright (c) 2004-2008, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/sock.h>
+
+#include <linux/tipc.h>
+#include <linux/tipc_config.h>
+
+#include "core.h"
+#include "port.h"
+
+#define SS_LISTENING	-1	/* socket is listening */
+#define SS_READY	-2	/* socket is connectionless */
+
+#define OVERLOAD_LIMIT_BASE	5000
+#define CONN_TIMEOUT_DEFAULT	8000	/* default connect timeout = 8s (in ms) */
+
+struct tipc_sock {
+	struct sock sk;		/* must stay first: tipc_sk() casts sock ptr */
+	struct tipc_port *p;	/* port obtained in tipc_create() */
+	struct tipc_portid peer_name;
+	long conn_timeout;	/* in jiffies, see tipc_create() */
+};
+
+#define tipc_sk(sk) ((struct tipc_sock *)(sk))
+#define tipc_sk_port(sk) ((struct tipc_port *)(tipc_sk(sk)->p))
+
+#define tipc_rx_ready(sock) (!skb_queue_empty(&sock->sk->sk_receive_queue) || \
+			(sock->state == SS_DISCONNECTING))
+
+static int backlog_rcv(struct sock *sk, struct sk_buff *skb);
+static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf);
+static void wakeupdispatch(struct tipc_port *tport);
+
+static const struct proto_ops packet_ops;	/* SOCK_SEQPACKET */
+static const struct proto_ops stream_ops;	/* SOCK_STREAM */
+static const struct proto_ops msg_ops;		/* SOCK_DGRAM, SOCK_RDM */
+
+static struct proto tipc_proto;
+
+static int sockets_enabled;
+
+static atomic_t tipc_queue_size = ATOMIC_INIT(0); /* dec'd per dequeued buf */
+
+/*
+ * Revised TIPC socket locking policy:
+ *
+ * Most socket operations take the standard socket lock when they start
+ * and hold it until they finish (or until they need to sleep).  Acquiring
+ * this lock grants the owner exclusive access to the fields of the socket
+ * data structures, with the exception of the backlog queue.  A few socket
+ * operations can be done without taking the socket lock because they only
+ * read socket information that never changes during the life of the socket.
+ *
+ * Socket operations may acquire the lock for the associated TIPC port if they
+ * need to perform an operation on the port.  If any routine needs to acquire
+ * both the socket lock and the port lock it must take the socket lock first
+ * to avoid the risk of deadlock.
+ *
+ * The dispatcher handling incoming messages cannot grab the socket lock in
+ * the standard fashion, since it is invoked at the BH level and cannot block.
+ * Instead, it checks to see if the socket lock is currently owned by someone,
+ * and either handles the message itself or adds it to the socket's backlog
+ * queue; in the latter case the queued message is processed once the process
+ * owning the socket lock releases it.
+ *
+ * NOTE: Releasing the socket lock while an operation is sleeping overcomes
+ * the problem of a blocked socket operation preventing any other operations
+ * from occurring.  However, applications must be careful if they have
+ * multiple threads trying to send (or receive) on the same socket, as these
+ * operations might interfere with each other.  For example, doing a connect
+ * and a receive at the same time might allow the receive to consume the
+ * ACK message meant for the connect.  While additional work could be done
+ * to try and overcome this, it doesn't seem to be worthwhile at the present.
+ *
+ * NOTE: Releasing the socket lock while an operation is sleeping also ensures
+ * that another operation that must be performed in a non-blocking manner is
+ * not delayed for very long because the lock has already been taken.
+ *
+ * NOTE: This code assumes that certain fields of a port/socket pair are
+ * constant over its lifetime; such fields can be examined without taking
+ * the socket lock and/or port lock, and do not need to be re-read even
+ * after resuming processing after waiting.  These fields include:
+ *   - socket type
+ *   - pointer to socket sk structure (aka tipc_sock structure)
+ *   - pointer to port structure
+ *   - port reference
+ */
+
+/**
+ * advance_rx_queue - discard first buffer in socket receive queue
+ * (caller must hold socket lock)
+ */
+static void advance_rx_queue(struct sock *sk)
+{
+	struct sk_buff *head = __skb_dequeue(&sk->sk_receive_queue);
+
+	buf_discard(head);
+	atomic_dec(&tipc_queue_size);
+}
+
+/**
+ * discard_rx_queue - empty the socket receive queue, discarding each buffer
+ *
+ * Caller must hold socket lock
+ */
+static void discard_rx_queue(struct sock *sk)
+{
+	struct sk_buff *skb = __skb_dequeue(&sk->sk_receive_queue);
+
+	while (skb) {
+		atomic_dec(&tipc_queue_size);
+		buf_discard(skb);
+		skb = __skb_dequeue(&sk->sk_receive_queue);
+	}
+}
+
+/**
+ * reject_rx_queue - empty the socket receive queue, rejecting each buffer
+ *
+ * Caller must hold socket lock
+ */
+static void reject_rx_queue(struct sock *sk)
+{
+	struct sk_buff *skb = __skb_dequeue(&sk->sk_receive_queue);
+
+	while (skb) {
+		tipc_reject_msg(skb, TIPC_ERR_NO_PORT);
+		atomic_dec(&tipc_queue_size);
+		skb = __skb_dequeue(&sk->sk_receive_queue);
+	}
+}
+
+/**
+ * tipc_create - create a TIPC socket
+ * @net: network namespace (must be default network)
+ * @sock: pre-allocated socket structure
+ * @protocol: protocol indicator (must be 0)
+ * @kern: caused by kernel or by userspace? (not examined by this routine)
+ *
+ * This routine creates additional data structures used by the TIPC socket,
+ * initializes them, and links them together.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int tipc_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	const struct proto_ops *ops;
+	socket_state state;
+	struct sock *sk;
+	struct tipc_port *tp_ptr;
+
+	/* Validate arguments */
+
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
+
+	if (unlikely(protocol != 0))
+		return -EPROTONOSUPPORT;
+
+	switch (sock->type) {
+	case SOCK_STREAM:
+		ops = &stream_ops;
+		state = SS_UNCONNECTED;
+		break;
+	case SOCK_SEQPACKET:
+		ops = &packet_ops;
+		state = SS_UNCONNECTED;
+		break;
+	case SOCK_DGRAM:
+	case SOCK_RDM:
+		ops = &msg_ops;
+		state = SS_READY;
+		break;
+	default:
+		return -EPROTOTYPE;
+	}
+
+	/* Allocate socket's protocol area */
+
+	sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	/* Allocate TIPC port for socket to use; sk becomes its usr_handle */
+
+	tp_ptr = tipc_createport_raw(sk, &dispatch, &wakeupdispatch,
+				     TIPC_LOW_IMPORTANCE);
+	if (unlikely(!tp_ptr)) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	/* Finish initializing socket data structures */
+
+	sock->ops = ops;
+	sock->state = state;
+
+	sock_init_data(sock, sk);
+	sk->sk_backlog_rcv = backlog_rcv;
+	tipc_sk(sk)->p = tp_ptr;
+	tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
+
+	spin_unlock_bh(tp_ptr->lock);	/* TODO confirm: port created locked */
+
+	if (sock->state == SS_READY) {	/* SOCK_DGRAM or SOCK_RDM */
+		tipc_set_portunreturnable(tp_ptr->ref, 1);
+		if (sock->type == SOCK_DGRAM)
+			tipc_set_portunreliable(tp_ptr->ref, 1);
+	}
+
+	return 0;
+}
+
+/**
+ * release - destroy a TIPC socket
+ * @sock: socket to destroy
+ *
+ * This routine cleans up any messages that are still queued on the socket.
+ * For DGRAM and RDM socket types, all queued messages are rejected.
+ * For SEQPACKET and STREAM socket types, the first message is rejected
+ * and any others are discarded.  (If the first message on a STREAM socket
+ * is partially-read, it is discarded and the next one is rejected instead.)
+ *
+ * NOTE: Rejected messages are not necessarily returned to the sender!  They
+ * are returned or discarded according to the "destination droppable" setting
+ * specified for the message by the sender.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport;
+	struct sk_buff *buf;
+	int res;
+
+	/*
+	 * Exit if socket isn't fully initialized (occurs when a failed accept()
+	 * releases a pre-allocated child socket that was never used)
+	 */
+
+	if (sk == NULL)
+		return 0;
+
+	tport = tipc_sk_port(sk);
+	lock_sock(sk);
+
+	/*
+	 * Reject all unreceived messages, except on an active connection
+	 * (which disconnects locally & sends a 'FIN+' to peer)
+	 */
+
+	while (sock->state != SS_DISCONNECTING) { /* ends once disconnected */
+		buf = __skb_dequeue(&sk->sk_receive_queue);
+		if (buf == NULL)
+			break;
+		atomic_dec(&tipc_queue_size);
+		if (TIPC_SKB_CB(buf)->handle != 0)
+			buf_discard(buf);
+		else {
+			if ((sock->state == SS_CONNECTING) ||
+			    (sock->state == SS_CONNECTED)) {
+				sock->state = SS_DISCONNECTING;
+				tipc_disconnect(tport->ref);
+			}
+			tipc_reject_msg(buf, TIPC_ERR_NO_PORT);
+		}
+	}
+
+	/*
+	 * Delete TIPC port; this ensures no more messages are queued
+	 * (also disconnects an active connection & sends a 'FIN-' to peer)
+	 */
+
+	res = tipc_deleteport(tport->ref);	/* becomes the return value */
+
+	/* Discard any remaining (connection-based) messages in receive queue */
+
+	discard_rx_queue(sk);
+
+	/* Reject any messages that accumulated in backlog queue */
+
+	sock->state = SS_DISCONNECTING;
+	release_sock(sk);
+
+	sock_put(sk);
+	sock->sk = NULL;
+
+	return res;
+}
+
+/**
+ * bind - publish or withdraw TIPC name(s) for a socket
+ * @sock: socket structure
+ * @uaddr: socket address carrying the name or name sequence and the scope
+ * @uaddr_len: size of socket address data structure (0 = withdraw all names)
+ *
+ * A scope above zero publishes the name (sequence); any other scope withdraws
+ * it, using the negated scope value.  A single name is treated as a sequence
+ * whose upper bound equals its lower bound.  The socket lock is not taken.
+ *
+ * Returns result of the publish/withdraw call, or errno for a bad address
+ */
+
+static int bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len)
+{
+	struct sockaddr_tipc *sa = (struct sockaddr_tipc *)uaddr;
+	u32 ref = tipc_sk_port(sock->sk)->ref;
+
+	if (unlikely(!uaddr_len))
+		return tipc_withdraw(ref, 0, NULL);
+
+	if (uaddr_len < sizeof(*sa))
+		return -EINVAL;
+	if (sa->family != AF_TIPC)
+		return -EAFNOSUPPORT;
+	switch (sa->addrtype) {
+	case TIPC_ADDR_NAME:
+		sa->addr.nameseq.upper = sa->addr.nameseq.lower;
+		break;
+	case TIPC_ADDR_NAMESEQ:
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+	if (sa->scope > 0)
+		return tipc_publish(ref, sa->scope, &sa->addr.nameseq);
+	return tipc_withdraw(ref, -sa->scope, &sa->addr.nameseq);
+}
+
+/**
+ * get_name - report port ID of socket or of its peer
+ * @sock: socket structure
+ * @uaddr: area for returned socket address
+ * @uaddr_len: area for returned length of socket address
+ * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID
+ *
+ * Runs without the socket lock.  Returns 0 on success, or -ENOTCONN when a
+ * peer ID is requested in a socket state that does not provide one
+ */
+
+static int get_name(struct socket *sock, struct sockaddr *uaddr,
+		    int *uaddr_len, int peer)
+{
+	struct tipc_sock *tsk = tipc_sk(sock->sk);
+	struct sockaddr_tipc *sa = (struct sockaddr_tipc *)uaddr;
+	int peer_known;
+
+	memset(sa, 0, sizeof(*sa));
+
+	if (!peer) {
+		sa->addr.id.ref = tsk->p->ref;
+		sa->addr.id.node = tipc_own_addr;
+	} else {
+		peer_known = (sock->state == SS_CONNECTED) ||
+			((peer == 2) && (sock->state == SS_DISCONNECTING));
+		if (!peer_known)
+			return -ENOTCONN;
+		sa->addr.id.ref = tsk->peer_name.ref;
+		sa->addr.id.node = tsk->peer_name.node;
+	}
+
+	sa->family = AF_TIPC;
+	sa->addrtype = TIPC_ADDR_ID;
+	sa->scope = 0;
+	sa->addr.name.domain = 0;
+	*uaddr_len = sizeof(*sa);
+
+	return 0;
+}
+
+/**
+ * poll - compute the poll event mask for a socket
+ * @file: file structure associated with the socket
+ * @sock: socket for which to calculate the poll bits
+ * @wait: poll table handed on to poll_wait()
+ *
+ * Returns pollmask value
+ *
+ * COMMENTARY:
+ * The socket lock is deliberately not taken, since the pollmask info is
+ * potentially out-of-date the moment this routine exits.  TCP and other
+ * protocols seem to rely on higher level poll routines to handle any
+ * preventable race conditions, so TIPC does the same.
+ *
+ * The returned events depend on the socket state as follows:
+ *
+ * socket state		flags set
+ * ------------		---------
+ * unconnected		none
+ *
+ * connecting		POLLIN/POLLRDNORM if rx queue is not empty
+ *			no write flags
+ *
+ * connected		POLLIN/POLLRDNORM if rx queue is not empty
+ *			POLLOUT if port is not congested
+ *
+ * disconnecting	POLLIN/POLLRDNORM/POLLHUP
+ *			no write flags
+ *
+ * listening		POLLIN/POLLRDNORM if rx queue is not empty
+ *			no write flags
+ *
+ * ready		POLLIN/POLLRDNORM if rx queue is not empty
+ * [connectionless]	POLLOUT if port is not congested
+ *
+ * IMPORTANT: A reported read or write event does NOT imply that the
+ * operation will succeed, merely that it can be attempted without blocking.
+ */
+
+static unsigned int poll(struct file *file, struct socket *sock,
+			 poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	int state;
+	u32 events = 0;
+
+	poll_wait(file, sk_sleep(sk), wait);
+	state = (int)sock->state;
+
+	if (state == SS_DISCONNECTING)
+		return POLLIN | POLLRDNORM | POLLHUP;
+
+	/* Writable only while ready/connected and the port is uncongested */
+	if ((state == SS_READY) || (state == SS_CONNECTED)) {
+		if (!tipc_sk_port(sk)->congested)
+			events |= POLLOUT;
+	}
+
+	/* Readable whenever one of the receiving states has queued data */
+	if ((state == SS_READY) || (state == SS_CONNECTED) ||
+	    (state == SS_CONNECTING) || (state == SS_LISTENING)) {
+		if (!skb_queue_empty(&sk->sk_receive_queue))
+			events |= (POLLIN | POLLRDNORM);
+	}
+
+	return events;
+}
+
+/**
+ * dest_name_check - verify user is permitted to send to specified port name
+ * @dest: destination address
+ * @m: descriptor for message to be sent
+ *
+ * Keeps unprivileged users from issuing restricted configuration commands.
+ * Returns 0 if permission is granted, otherwise errno
+ */
+
+static int dest_name_check(struct sockaddr_tipc *dest, struct msghdr *m)
+{
+	struct tipc_cfg_msg_hdr cfg;
+
+	if (likely(dest->addr.name.name.type >= TIPC_RESERVED_TYPES))
+		return 0;
+	switch (dest->addr.name.name.type) {
+	case TIPC_TOP_SRV:
+		return 0;
+	case TIPC_CFG_SRV:
+		break;		/* inspect the command header below */
+	default:
+		return -EACCES;
+	}
+	if (!m->msg_iovlen || (m->msg_iov[0].iov_len < sizeof(cfg)))
+		return -EMSGSIZE;
+	if (copy_from_user(&cfg, m->msg_iov[0].iov_base, sizeof(cfg)))
+		return -EFAULT;
+	if (!(ntohs(cfg.tcm_type) & 0xC000))
+		return 0;
+	return capable(CAP_NET_ADMIN) ? 0 : -EACCES;
+}
+
+/**
+ * send_msg - send message in connectionless manner
+ * @iocb: if NULL, indicates that socket lock is already held
+ * @sock: socket structure
+ * @m: message to send
+ * @total_len: length of message
+ *
+ * Message must have a destination specified explicitly.
+ * Used for SOCK_RDM and SOCK_DGRAM messages,
+ * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections.
+ * (Note: 'SYN+' is prohibited on SOCK_STREAM.)
+ *
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+
+static int send_msg(struct kiocb *iocb, struct socket *sock,
+		    struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport = tipc_sk_port(sk);
+	struct sockaddr_tipc *dest = (struct sockaddr_tipc *)m->msg_name;
+	int needs_conn;
+	int res = -EINVAL;	/* result if address type is not handled below */
+
+	if (unlikely(!dest))
+		return -EDESTADDRREQ;
+	if (unlikely((m->msg_namelen < sizeof(*dest)) ||
+		     (dest->family != AF_TIPC)))
+		return -EINVAL;
+	if ((total_len > TIPC_MAX_USER_MSG_SIZE) ||
+	    (m->msg_iovlen > (unsigned)INT_MAX))
+		return -EMSGSIZE;
+
+	if (iocb)
+		lock_sock(sk);
+
+	needs_conn = (sock->state != SS_READY);	/* connection-type socket */
+	if (unlikely(needs_conn)) {
+		if (sock->state == SS_LISTENING) {
+			res = -EPIPE;
+			goto exit;
+		}
+		if (sock->state != SS_UNCONNECTED) {
+			res = -EISCONN;
+			goto exit;
+		}
+		if ((tport->published) ||
+		    ((sock->type == SOCK_STREAM) && (total_len != 0))) {
+			res = -EOPNOTSUPP;
+			goto exit;
+		}
+		if (dest->addrtype == TIPC_ADDR_NAME) {
+			tport->conn_type = dest->addr.name.name.type;
+			tport->conn_instance = dest->addr.name.name.instance;
+		}
+
+		/* Abort any pending connection attempts (very unlikely) */
+
+		reject_rx_queue(sk);
+	}
+
+	do {
+		if (dest->addrtype == TIPC_ADDR_NAME) {
+			res = dest_name_check(dest, m);
+			if (res)
+				break;
+			res = tipc_send2name(tport->ref,
+					     &dest->addr.name.name,
+					     dest->addr.name.domain,
+					     m->msg_iovlen,
+					     m->msg_iov,
+					     total_len);
+		} else if (dest->addrtype == TIPC_ADDR_ID) {
+			res = tipc_send2port(tport->ref,
+					     &dest->addr.id,
+					     m->msg_iovlen,
+					     m->msg_iov,
+					     total_len);
+		} else if (dest->addrtype == TIPC_ADDR_MCAST) {
+			if (needs_conn) {
+				res = -EOPNOTSUPP;
+				break;
+			}
+			res = dest_name_check(dest, m);
+			if (res)
+				break;
+			res = tipc_multicast(tport->ref,
+					     &dest->addr.nameseq,
+					     m->msg_iovlen,
+					     m->msg_iov,
+					     total_len);
+		}
+		if (likely(res != -ELINKCONG)) {
+			if (needs_conn && (res >= 0))
+				sock->state = SS_CONNECTING;	/* 'SYN' sent */
+			break;
+		}
+		if (m->msg_flags & MSG_DONTWAIT) {
+			res = -EWOULDBLOCK;
+			break;
+		}
+		release_sock(sk);	/* sleep unlocked until congestion ends */
+		res = wait_event_interruptible(*sk_sleep(sk),
+					       !tport->congested);
+		lock_sock(sk);
+		if (res)
+			break;
+	} while (1);
+
+exit:
+	if (iocb)
+		release_sock(sk);
+	return res;
+}
+
+/**
+ * send_packet - send a message on an established connection
+ * @iocb: if NULL, indicates that socket lock is already held
+ * @sock: socket structure
+ * @m: message to send
+ * @total_len: length of message
+ *
+ * Used for SOCK_SEQPACKET messages and SOCK_STREAM data.  A message that
+ * names a destination is passed to send_msg() instead.
+ * Returns the number of bytes sent on success, or errno otherwise
+ */
+
+static int send_packet(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *port = tipc_sk_port(sk);
+	int rc;
+
+	/* An explicit destination means an implied connect: use send_msg() */
+
+	if (unlikely(m->msg_name != NULL))
+		return send_msg(iocb, sock, m, total_len);
+
+	if (total_len > TIPC_MAX_USER_MSG_SIZE)
+		return -EMSGSIZE;
+	if (m->msg_iovlen > (unsigned)INT_MAX)
+		return -EMSGSIZE;
+
+	if (iocb)
+		lock_sock(sk);
+
+	for (;;) {
+		if (unlikely(sock->state != SS_CONNECTED)) {
+			rc = (sock->state == SS_DISCONNECTING) ?
+				-EPIPE : -ENOTCONN;
+			break;
+		}
+
+		rc = tipc_send(port->ref, m->msg_iovlen, m->msg_iov,
+			       total_len);
+		if (likely(rc != -ELINKCONG))
+			break;
+
+		/* Link is congested: give up now, or sleep until it eases */
+		if (m->msg_flags & MSG_DONTWAIT) {
+			rc = -EWOULDBLOCK;
+			break;
+		}
+		release_sock(sk);
+		rc = wait_event_interruptible(*sk_sleep(sk),
+			(!port->congested || !port->connected));
+		lock_sock(sk);
+		if (rc)
+			break;
+	}
+
+	if (iocb)
+		release_sock(sk);
+	return rc;
+}
+
+/**
+ * send_stream - send stream-oriented data
+ * @iocb: (unused)
+ * @sock: socket structure
+ * @m: data to send
+ * @total_len: total length of data to be sent
+ *
+ * Used for SOCK_STREAM data.  Each iovec entry is cut into pieces that are
+ * handed to send_packet() one at a time, with the socket lock held throughout.
+ * Returns the number of bytes sent on success (or partial success),
+ * or errno if no data sent
+ */
+
+static int send_stream(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport = tipc_sk_port(sk);
+	struct msghdr my_msg;
+	struct iovec my_iov;
+	struct iovec *curr_iov;
+	int curr_iovlen;
+	char __user *curr_start;
+	u32 hdr_size;
+	int curr_left;
+	int bytes_to_send;
+	int bytes_sent;
+	int res;
+
+	lock_sock(sk);
+
+	/* Handle special cases where there is no connection */
+
+	if (unlikely(sock->state != SS_CONNECTED)) {
+		if (sock->state == SS_UNCONNECTED) {
+			/* NULL iocb: socket lock is already held */
+			res = send_packet(NULL, sock, m, total_len);
+			goto exit;
+		} else if (sock->state == SS_DISCONNECTING) {
+			res = -EPIPE;
+			goto exit;
+		} else {
+			res = -ENOTCONN;
+			goto exit;
+		}
+	}
+
+	if (unlikely(m->msg_name)) {
+		res = -EISCONN;
+		goto exit;
+	}
+
+	if ((total_len > (unsigned)INT_MAX) ||
+	    (m->msg_iovlen > (unsigned)INT_MAX)) {
+		res = -EMSGSIZE;
+		goto exit;
+	}
+
+	/*
+	 * Send each iovec entry using one or more messages
+	 *
+	 * Note: This algorithm is good for the most likely case
+	 * (i.e. one large iovec entry), but could be improved to pass sets
+	 * of small iovec entries into send_packet().
+	 */
+
+	curr_iov = m->msg_iov;
+	curr_iovlen = m->msg_iovlen;
+	my_msg.msg_iov = &my_iov;	/* one-entry iovec reused per piece */
+	my_msg.msg_iovlen = 1;
+	my_msg.msg_flags = m->msg_flags;
+	my_msg.msg_name = NULL;
+	bytes_sent = 0;
+
+	hdr_size = msg_hdr_sz(&tport->phdr);
+
+	while (curr_iovlen--) {
+		curr_start = curr_iov->iov_base;
+		curr_left = curr_iov->iov_len;
+
+		while (curr_left) {
+			/* Piece size: capped by max_pkt, user limit, data left */
+			bytes_to_send = tport->max_pkt - hdr_size;
+			if (bytes_to_send > TIPC_MAX_USER_MSG_SIZE)
+				bytes_to_send = TIPC_MAX_USER_MSG_SIZE;
+			if (curr_left < bytes_to_send)
+				bytes_to_send = curr_left;
+			my_iov.iov_base = curr_start;
+			my_iov.iov_len = bytes_to_send;
+			res = send_packet(NULL, sock, &my_msg, bytes_to_send);
+			if (res < 0) {
+				if (bytes_sent)	/* report partial success */
+					res = bytes_sent;
+				goto exit;
+			}
+			curr_left -= bytes_to_send;
+			curr_start += bytes_to_send;
+			bytes_sent += bytes_to_send;
+		}
+
+		curr_iov++;
+	}
+	res = bytes_sent;
+exit:
+	release_sock(sk);
+	return res;
+}
+
+/**
+ * auto_connect - finish setting up a connection to a remote port
+ * @sock: socket structure
+ * @msg: peer's response message
+ *
+ * Returns 0 on success, -ECONNREFUSED if @msg carries an error code
+ */
+
+static int auto_connect(struct socket *sock, struct tipc_msg *msg)
+{
+	struct tipc_sock *tsk = tipc_sk(sock->sk);
+
+	if (msg_errcode(msg)) {
+		sock->state = SS_DISCONNECTING;
+		return -ECONNREFUSED;
+	}
+
+	tsk->peer_name.node = msg_orignode(msg);
+	tsk->peer_name.ref = msg_origport(msg);
+	tipc_connect2port(tsk->p->ref, &tsk->peer_name);
+	tipc_set_portimportance(tsk->p->ref, msg_importance(msg));
+	sock->state = SS_CONNECTED;
+	return 0;
+}
+
+/**
+ * set_orig_addr - report sender's port ID for a received message
+ * @m: descriptor for message info
+ * @msg: received message header
+ *
+ * Note: Nothing is written if the receiver supplied no address area.
+ */
+
+static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg)
+{
+	struct sockaddr_tipc *sa = (struct sockaddr_tipc *)m->msg_name;
+
+	if (!sa)
+		return;
+	sa->family = AF_TIPC;
+	sa->addrtype = TIPC_ADDR_ID;
+	sa->scope = 0;			/* could leave uninitialized */
+	sa->addr.id.ref = msg_origport(msg);
+	sa->addr.id.node = msg_orignode(msg);
+	sa->addr.name.domain = 0;	/* could leave uninitialized */
+	m->msg_namelen = sizeof(*sa);
+}
+
+/**
+ * anc_data_recv - optionally capture ancillary data for received message
+ * @m: descriptor for message info
+ * @msg: received message header (may be NULL)
+ * @tport: TIPC port associated with message
+ *
+ * Nothing is captured when the receiver supplied no control buffer
+ * (msg_controllen of zero).  Otherwise up to three SOL_TIPC objects are
+ * added, in this order:
+ *   TIPC_ERRINFO  - error code and data size, if the message is errored
+ *   TIPC_RETDATA  - data of an errored message, if it has any
+ *   TIPC_DESTNAME - name type, lower and upper bound the message was sent
+ *                   to; for a connection message, the name recorded in
+ *                   @tport (omitted if its conn_type is zero)
+ *
+ * Returns 0 if successful, otherwise errno
+ */
+
+static int anc_data_recv(struct msghdr *m, struct tipc_msg *msg,
+				struct tipc_port *tport)
+{
+	u32 anc[3];
+	u32 errcode;
+	u32 mtype;
+	int named;
+	int rc;
+
+	if (likely(m->msg_controllen == 0))
+		return 0;
+
+	/* Errored message: report error code and size, plus returned data */
+
+	errcode = msg ? msg_errcode(msg) : 0;
+	if (unlikely(errcode)) {
+		anc[0] = errcode;
+		anc[1] = msg_data_sz(msg);
+		rc = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc);
+		if (rc)
+			return rc;
+		if (anc[1]) {
+			rc = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc[1],
+				      msg_data(msg));
+			if (rc)
+				return rc;
+		}
+	}
+
+	/* Work out which name (if any) the message was addressed to */
+
+	mtype = msg ? msg_type(msg) : TIPC_DIRECT_MSG;
+	if (mtype == TIPC_NAMED_MSG) {
+		named = 1;
+		anc[0] = msg_nametype(msg);
+		anc[1] = msg_namelower(msg);
+		anc[2] = msg_namelower(msg);
+	} else if (mtype == TIPC_MCAST_MSG) {
+		named = 1;
+		anc[0] = msg_nametype(msg);
+		anc[1] = msg_namelower(msg);
+		anc[2] = msg_nameupper(msg);
+	} else if (mtype == TIPC_CONN_MSG) {
+		named = (tport->conn_type != 0);
+		anc[0] = tport->conn_type;
+		anc[1] = tport->conn_instance;
+		anc[2] = tport->conn_instance;
+	} else {
+		named = 0;
+	}
+
+	if (!named)
+		return 0;
+	return put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc);
+}
+
+/**
+ * recv_msg - receive packet-oriented message
+ * @iocb: (unused)
+ * @sock: socket structure
+ * @m: descriptor for message info
+ * @buf_len: total size of user buffer area
+ * @flags: receive flags
+ *
+ * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages.
+ * If the complete message doesn't fit in user area, truncate it.
+ * Returns size of returned message data, errno otherwise
+ */
+
+static int recv_msg(struct kiocb *iocb, struct socket *sock,
+		    struct msghdr *m, size_t buf_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport = tipc_sk_port(sk);
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	long timeout;
+	unsigned int sz;
+	u32 err;
+	int res;
+
+	/* Catch invalid receive requests */
+
+	if (unlikely(!buf_len))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (unlikely(sock->state == SS_UNCONNECTED)) {
+		res = -ENOTCONN;
+		goto exit;
+	}
+
+	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+restart:
+
+	/* Look for a message in receive queue; wait if necessary */
+
+	while (skb_queue_empty(&sk->sk_receive_queue)) {
+		if (sock->state == SS_DISCONNECTING) {
+			res = -ENOTCONN;
+			goto exit;
+		}
+		if (timeout <= 0L) {
+			/* a negative "timeout" is the error from the wait */
+			res = timeout ? timeout : -EWOULDBLOCK;
+			goto exit;
+		}
+		release_sock(sk);
+		timeout = wait_event_interruptible_timeout(*sk_sleep(sk),
+							   tipc_rx_ready(sock),
+							   timeout);
+		lock_sock(sk);
+	}
+
+	/* Look at first message in receive queue */
+
+	buf = skb_peek(&sk->sk_receive_queue);
+	msg = buf_msg(buf);
+	sz = msg_data_sz(msg);
+	err = msg_errcode(msg);
+
+	/* Complete connection setup for an implied connect */
+
+	if (unlikely(sock->state == SS_CONNECTING)) {
+		res = auto_connect(sock, msg);
+		if (res)
+			goto exit;
+	}
+
+	/* Discard an empty non-errored message & try again */
+
+	if ((!sz) && (!err)) {
+		advance_rx_queue(sk);
+		goto restart;
+	}
+
+	/* Capture sender's address (optional) */
+
+	set_orig_addr(m, msg);
+
+	/* Capture ancillary data (optional) */
+
+	res = anc_data_recv(m, msg, tport);
+	if (res)
+		goto exit;
+
+	/* Capture message data (if valid) & compute return value (always) */
+
+	if (!err) {
+		if (unlikely(buf_len < sz)) {
+			sz = buf_len;
+			m->msg_flags |= MSG_TRUNC;
+		}
+		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg),
+					      m->msg_iov, sz);
+		if (res)
+			goto exit;
+		res = sz;
+	} else {
+		/* Errored message: 0 unless it is an unreported reset */
+		if ((sock->state == SS_READY) ||
+		    ((err == TIPC_CONN_SHUTDOWN) || m->msg_control))
+			res = 0;
+		else
+			res = -ECONNRESET;
+	}
+
+	/* Consume received message (optional) */
+
+	if (likely(!(flags & MSG_PEEK))) {
+		/* Acknowledge once a full window of messages was consumed */
+		if ((sock->state != SS_READY) &&
+		    (++tport->conn_unacked >= TIPC_FLOW_CONTROL_WIN))
+			tipc_acknowledge(tport->ref, tport->conn_unacked);
+		advance_rx_queue(sk);
+	}
+exit:
+	release_sock(sk);
+	return res;
+}
+
+/**
+ * recv_stream - receive stream-oriented data
+ * @iocb: (unused)
+ * @sock: socket structure
+ * @m: descriptor for message info
+ * @buf_len: total size of user buffer area
+ * @flags: receive flags
+ *
+ * Used for SOCK_STREAM messages only.  If not enough data is available
+ * will optionally wait for more; never truncates data.
+ * Returns size of returned message data, errno otherwise
+ */
+
+static int recv_stream(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t buf_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport = tipc_sk_port(sk);
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	long timeout;
+	unsigned int sz;
+	int sz_to_copy, target, needed;
+	int sz_copied = 0;
+	u32 err;
+	int res = 0;
+
+	/* Catch invalid receive attempts */
+
+	if (unlikely(!buf_len))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (unlikely((sock->state == SS_UNCONNECTED) ||
+		     (sock->state == SS_CONNECTING))) {
+		res = -ENOTCONN;
+		goto exit;
+	}
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
+	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+restart:
+
+	/* Look for a message in receive queue; wait if necessary */
+
+	while (skb_queue_empty(&sk->sk_receive_queue)) {
+		if (sock->state == SS_DISCONNECTING) {
+			res = -ENOTCONN;
+			goto exit;
+		}
+		if (timeout <= 0L) {
+			/* a negative "timeout" is the error from the wait */
+			res = timeout ? timeout : -EWOULDBLOCK;
+			goto exit;
+		}
+		release_sock(sk);
+		timeout = wait_event_interruptible_timeout(*sk_sleep(sk),
+							   tipc_rx_ready(sock),
+							   timeout);
+		lock_sock(sk);
+	}
+
+	/* Look at first message in receive queue */
+
+	buf = skb_peek(&sk->sk_receive_queue);
+	msg = buf_msg(buf);
+	sz = msg_data_sz(msg);
+	err = msg_errcode(msg);
+
+	/* Discard an empty non-errored message & try again */
+
+	if ((!sz) && (!err)) {
+		advance_rx_queue(sk);
+		goto restart;
+	}
+
+	/* Optionally capture sender's address & ancillary data of first msg */
+
+	if (sz_copied == 0) {
+		set_orig_addr(m, msg);
+		res = anc_data_recv(m, msg, tport);
+		if (res)
+			goto exit;
+	}
+
+	/* Capture message data (if valid) & compute return value (always) */
+
+	if (!err) {
+		/* "handle" holds the count of bytes already read from buf */
+		u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle);
+
+		sz -= offset;
+		needed = (buf_len - sz_copied);
+		sz_to_copy = (sz <= needed) ? sz : needed;
+
+		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset,
+					      m->msg_iov, sz_to_copy);
+		if (res)
+			goto exit;
+
+		sz_copied += sz_to_copy;
+
+		if (sz_to_copy < sz) {
+			/* User area is full: remember how far we have read */
+			if (!(flags & MSG_PEEK))
+				TIPC_SKB_CB(buf)->handle =
+				(void *)(unsigned long)(offset + sz_to_copy);
+			goto exit;
+		}
+	} else {
+		if (sz_copied != 0)
+			goto exit; /* can't add error msg to valid data */
+
+		if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control)
+			res = 0;
+		else
+			res = -ECONNRESET;
+	}
+
+	/* Consume received message (optional) */
+
+	if (likely(!(flags & MSG_PEEK))) {
+		if (unlikely(++tport->conn_unacked >= TIPC_FLOW_CONTROL_WIN))
+			tipc_acknowledge(tport->ref, tport->conn_unacked);
+		advance_rx_queue(sk);
+	}
+
+	/* Loop around if more data is required */
+
+	if ((sz_copied < buf_len) &&	/* didn't get all requested data */
+	    (!skb_queue_empty(&sk->sk_receive_queue) ||
+	    (sz_copied < target)) &&	/* and more is ready or required */
+	    (!(flags & MSG_PEEK)) &&	/* and aren't just peeking at data */
+	    (!err))			/* and haven't reached a FIN */
+		goto restart;
+
+exit:
+	release_sock(sk);
+	return sz_copied ? sz_copied : res;	/* copied data wins over errno */
+}
+
+/**
+ * rx_queue_full - determine if receive queue can accept another message
+ * @msg: message to be added to queue
+ * @queue_size: current size of queue
+ * @base: nominal maximum size of queue (scaled by importance/connectivity)
+ * Returns 1 if queue is unable to accept message, 0 otherwise
+ */
+
+static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
+{
+	u32 limit = base;
+
+	switch (msg_importance(msg)) {
+	case TIPC_LOW_IMPORTANCE:
+		break;
+	case TIPC_MEDIUM_IMPORTANCE:
+		limit *= 2;
+		break;
+	case TIPC_HIGH_IMPORTANCE:
+		limit *= 100;
+		break;
+	default:
+		return 0;	/* any other importance is never refused */
+	}
+	if (msg_connected(msg))
+		limit *= 4;
+	return queue_size >= limit;
+}
+
+/**
+ * filter_rcv - validate incoming message
+ * @sk: socket
+ * @buf: message
+ *
+ * Enqueues message on receive queue if acceptable; optionally handles
+ * disconnect indication for a connected socket.
+ *
+ * Called with socket lock already taken; port lock may also be taken.
+ *
+ * Returns TIPC error status code (TIPC_OK if message is not to be rejected)
+ */
+
+static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
+{
+	struct socket *sock = sk->sk_socket;
+	struct tipc_msg *msg = buf_msg(buf);
+	u32 recv_q_len;
+
+	/* Reject message if it is wrong sort of message for socket */
+
+	/*
+	 * WOULD IT BE BETTER TO JUST DISCARD THESE MESSAGES INSTEAD?
+	 * "NO PORT" ISN'T REALLY THE RIGHT ERROR CODE, AND THERE MAY
+	 * BE SECURITY IMPLICATIONS INHERENT IN REJECTING INVALID TRAFFIC
+	 */
+
+	if (sock->state == SS_READY) {
+		if (msg_connected(msg))
+			return TIPC_ERR_NO_PORT;
+	} else {
+		if (msg_mcast(msg))
+			return TIPC_ERR_NO_PORT;
+		if (sock->state == SS_CONNECTED) {
+			if (!msg_connected(msg))
+				return TIPC_ERR_NO_PORT;
+		} else if (sock->state == SS_CONNECTING) {
+			if (!msg_connected(msg) && (msg_errcode(msg) == 0))
+				return TIPC_ERR_NO_PORT;
+		} else if (sock->state == SS_LISTENING) {
+			if (msg_connected(msg) || msg_errcode(msg))
+				return TIPC_ERR_NO_PORT;
+		} else if (sock->state == SS_DISCONNECTING) {
+			return TIPC_ERR_NO_PORT;
+		} else /* (sock->state == SS_UNCONNECTED) */ {
+			if (msg_connected(msg) || msg_errcode(msg))
+				return TIPC_ERR_NO_PORT;
+		}
+	}
+
+	/* Reject message if there isn't room to queue it */
+
+	recv_q_len = (u32)atomic_read(&tipc_queue_size);	/* global count */
+	if (unlikely(recv_q_len >= OVERLOAD_LIMIT_BASE)) {
+		if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE))
+			return TIPC_ERR_OVERLOAD;
+	}
+	recv_q_len = skb_queue_len(&sk->sk_receive_queue);	/* this socket */
+	if (unlikely(recv_q_len >= (OVERLOAD_LIMIT_BASE / 2))) {
+		if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE / 2))
+			return TIPC_ERR_OVERLOAD;
+	}
+
+	/* Enqueue message (finally!) */
+
+	TIPC_SKB_CB(buf)->handle = 0;	/* nothing read from this message yet */
+	atomic_inc(&tipc_queue_size);
+	__skb_queue_tail(&sk->sk_receive_queue, buf);
+
+	/* Initiate connection termination for an incoming 'FIN' */
+
+	if (unlikely(msg_errcode(msg) && (sock->state == SS_CONNECTED))) {
+		sock->state = SS_DISCONNECTING;
+		tipc_disconnect_port(tipc_sk_port(sk));
+	}
+
+	if (waitqueue_active(sk_sleep(sk)))
+		wake_up_interruptible(sk_sleep(sk));
+	return TIPC_OK;
+}
+
+/**
+ * backlog_rcv - handle incoming message from backlog queue
+ * @sk: socket
+ * @buf: message
+ *
+ * Passes the message through filter_rcv() and rejects it if refused.
+ * Caller must hold socket lock, but not port lock.
+ *
+ * Returns 0
+ */
+
+static int backlog_rcv(struct sock *sk, struct sk_buff *buf)
+{
+	u32 err = filter_rcv(sk, buf);
+
+	if (err)
+		tipc_reject_msg(buf, err);
+	return 0;
+}
+
+/**
+ * dispatch - handle incoming message
+ * @tport: TIPC port that received message
+ * @buf: message
+ *
+ * Called with port lock already taken; takes the socket's bh lock itself.
+ *
+ * Returns TIPC error status code (TIPC_OK if message is not to be rejected):
+ * the filter_rcv() verdict when the message is processed immediately, else
+ * TIPC_OK if it was added to the backlog queue or TIPC_ERR_OVERLOAD if it
+ * could not be added
+ */
+
+static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf)
+{
+	struct sock *sk = (struct sock *)tport->usr_handle;
+	u32 status;
+
+	/*
+	 * Deliver the message right away unless a user context owns the
+	 * socket, in which case it is parked on the backlog queue.
+	 *
+	 * Modelled on sk_receive_skb(), but kept separate from it because
+	 * TIPC applies its own filter/reject mechanism.
+	 */
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))
+		status = sk_add_backlog(sk, buf) ? TIPC_ERR_OVERLOAD : TIPC_OK;
+	else
+		status = filter_rcv(sk, buf);
+	bh_unlock_sock(sk);
+
+	return status;
+}
+
+/**
+ * wakeupdispatch - wake tasks sleeping on a port's socket after congestion
+ * @tport: port to wakeup
+ * Called with port lock already taken.
+ */
+
+static void wakeupdispatch(struct tipc_port *tport)
+{
+	struct sock *sk = (struct sock *)tport->usr_handle;
+
+	if (!waitqueue_active(sk_sleep(sk)))
+		return;
+	wake_up_interruptible(sk_sleep(sk));
+}
+
+/**
+ * connect - establish a connection to another TIPC port
+ * @sock: socket structure
+ * @dest: socket address for destination port
+ * @destlen: size of socket address data structure
+ * @flags: file-related flags associated with socket
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
+		   int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
+	struct msghdr m = {NULL,};
+	struct sk_buff *buf;
+	struct tipc_msg *msg;
+	long timeout;
+	int res;
+
+	lock_sock(sk);
+
+	/* For now, TIPC does not allow use of connect() with DGRAM/RDM types */
+
+	if (sock->state == SS_READY) {
+		res = -EOPNOTSUPP;
+		goto exit;
+	}
+
+	/* For now, TIPC does not support the non-blocking form of connect() */
+
+	if (flags & O_NONBLOCK) {
+		res = -EOPNOTSUPP;
+		goto exit;
+	}
+
+	/* Issue Posix-compliant error code if socket is in the wrong state */
+
+	if (sock->state == SS_LISTENING) {
+		res = -EOPNOTSUPP;
+		goto exit;
+	}
+	if (sock->state == SS_CONNECTING) {
+		res = -EALREADY;
+		goto exit;
+	}
+	if (sock->state != SS_UNCONNECTED) {
+		res = -EISCONN;
+		goto exit;
+	}
+
+	/*
+	 * Reject connection attempt using multicast address
+	 *
+	 * Note: send_msg() validates the rest of the address fields,
+	 *       so there's no need to do it here
+	 */
+
+	if (dst->addrtype == TIPC_ADDR_MCAST) {
+		res = -EINVAL;
+		goto exit;
+	}
+
+	/* Reject any messages already in receive queue (very unlikely) */
+
+	reject_rx_queue(sk);
+
+	/* Send a 'SYN-' to destination */
+
+	m.msg_name = dest;
+	m.msg_namelen = destlen;
+	res = send_msg(NULL, sock, &m, 0);	/* NULL: lock already held */
+	if (res < 0)
+		goto exit;
+
+	/* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
+
+	timeout = tipc_sk(sk)->conn_timeout;	/* 0 means wait indefinitely */
+	release_sock(sk);
+	res = wait_event_interruptible_timeout(*sk_sleep(sk),
+			(!skb_queue_empty(&sk->sk_receive_queue) ||
+			(sock->state != SS_CONNECTING)),
+			timeout ? timeout : MAX_SCHEDULE_TIMEOUT);
+	lock_sock(sk);
+
+	if (res > 0) {
+		buf = skb_peek(&sk->sk_receive_queue);
+		if (buf != NULL) {
+			msg = buf_msg(buf);
+			res = auto_connect(sock, msg);
+			if (!res) {
+				if (!msg_data_sz(msg))	/* empty 'ACK' */
+					advance_rx_queue(sk);
+			}
+		} else {
+			/* Woken by a state change with nothing queued */
+			if (sock->state == SS_CONNECTED)
+				res = -EISCONN;
+			else
+				res = -ECONNREFUSED;
+		}
+	} else {
+		if (res == 0)
+			res = -ETIMEDOUT;
+		else
+			; /* leave "res" unchanged */
+		sock->state = SS_DISCONNECTING;
+	}
+
+exit:
+	release_sock(sk);
+	return res;
+}
+
+/**
+ * listen - allow socket to listen for incoming connections
+ * @sock: socket structure
+ * @len: (unused)
+ *
+ * Returns 0 once an unconnected socket has been switched to listening state,
+ * -EOPNOTSUPP for a connectionless (SS_READY) socket, and -EINVAL for a
+ * socket in any other state
+ */
+
+static int listen(struct socket *sock, int len)
+{
+	struct sock *sk = sock->sk;
+	int rc = 0;
+
+	lock_sock(sk);
+
+	if (sock->state == SS_UNCONNECTED)
+		sock->state = SS_LISTENING;
+	else if (sock->state == SS_READY)
+		rc = -EOPNOTSUPP;
+	else
+		rc = -EINVAL;
+
+	release_sock(sk);
+	return rc;
+}
+
+/**
+ * accept - wait for connection request
+ * @sock: listening socket
+ * @new_sock: new socket that is to be connected
+ * @flags: file-related flags associated with socket
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int accept(struct socket *sock, struct socket *new_sock, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *buf;
+	int res;
+
+	lock_sock(sk);
+
+	if (sock->state == SS_READY) {
+		res = -EOPNOTSUPP;
+		goto exit;
+	}
+	if (sock->state != SS_LISTENING) {
+		res = -EINVAL;
+		goto exit;
+	}
+
+	while (skb_queue_empty(&sk->sk_receive_queue)) {
+		if (flags & O_NONBLOCK) {
+			res = -EWOULDBLOCK;
+			goto exit;
+		}
+		release_sock(sk);	/* sleep unlocked until a message arrives */
+		res = wait_event_interruptible(*sk_sleep(sk),
+				(!skb_queue_empty(&sk->sk_receive_queue)));
+		lock_sock(sk);
+		if (res)
+			goto exit;
+	}
+
+	buf = skb_peek(&sk->sk_receive_queue);	/* the connection request */
+
+	res = tipc_create(sock_net(sock->sk), new_sock, 0, 0);
+	if (!res) {
+		struct sock *new_sk = new_sock->sk;
+		struct tipc_sock *new_tsock = tipc_sk(new_sk);
+		struct tipc_port *new_tport = new_tsock->p;
+		u32 new_ref = new_tport->ref;
+		struct tipc_msg *msg = buf_msg(buf);
+
+		lock_sock(new_sk);
+
+		/*
+		 * Reject any stray messages received by new socket
+		 * before the socket lock was taken (very, very unlikely)
+		 */
+
+		reject_rx_queue(new_sk);
+
+		/* Connect new socket to its peer */
+
+		new_tsock->peer_name.ref = msg_origport(msg);
+		new_tsock->peer_name.node = msg_orignode(msg);
+		tipc_connect2port(new_ref, &new_tsock->peer_name);
+		new_sock->state = SS_CONNECTED;
+
+		tipc_set_portimportance(new_ref, msg_importance(msg));
+		if (msg_named(msg)) {
+			new_tport->conn_type = msg_nametype(msg);
+			new_tport->conn_instance = msg_nameinst(msg);
+		}
+
+		/*
+		 * Respond to 'SYN-' by discarding it & returning 'ACK'-.
+		 * Respond to 'SYN+' by queuing it on new socket.
+		 */
+
+		if (!msg_data_sz(msg)) {
+			struct msghdr m = {NULL,};
+
+			advance_rx_queue(sk);
+			send_packet(NULL, new_sock, &m, 0);	/* result ignored */
+		} else {
+			__skb_dequeue(&sk->sk_receive_queue);
+			__skb_queue_head(&new_sk->sk_receive_queue, buf);
+		}
+		release_sock(new_sk);
+	}
+exit:
+	release_sock(sk);
+	return res;
+}
+
+/**
+ * shutdown - shutdown socket connection
+ * @sock: socket structure
+ * @how: direction to close (must be SHUT_RDWR)
+ *
+ * Terminates connection (if necessary), then purges socket's receive queue.
+ *
+ * Returns 0 on success, -EINVAL if @how is not SHUT_RDWR, or -ENOTCONN if
+ * the socket is not connecting, connected or disconnecting.
+ */
+
+static int shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport = tipc_sk_port(sk);
+	struct sk_buff *buf;
+	int res;
+
+	if (how != SHUT_RDWR)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	switch (sock->state) {
+	case SS_CONNECTING:
+	case SS_CONNECTED:
+
+		/* Disconnect and send a 'FIN+' or 'FIN-' message to peer */
+restart:
+		buf = __skb_dequeue(&sk->sk_receive_queue);
+		if (buf) {
+			/* Keep the global queued-message counter in step */
+			atomic_dec(&tipc_queue_size);
+
+			/*
+			 * Messages with a non-zero handle are dropped and the
+			 * next one is tried (presumably these are partially
+			 * read messages -- TODO confirm)
+			 */
+			if (TIPC_SKB_CB(buf)->handle != 0) {
+				buf_discard(buf);
+				goto restart;
+			}
+
+			/* Bounce the first usable message back to the peer */
+			tipc_disconnect(tport->ref);
+			tipc_reject_msg(buf, TIPC_CONN_SHUTDOWN);
+		} else {
+			/* Nothing queued that could be returned to the peer */
+			tipc_shutdown(tport->ref);
+		}
+
+		sock->state = SS_DISCONNECTING;
+
+		/* fall through */
+
+	case SS_DISCONNECTING:
+
+		/* Discard any unreceived messages; wake up sleeping tasks */
+
+		discard_rx_queue(sk);
+		if (waitqueue_active(sk_sleep(sk)))
+			wake_up_interruptible(sk_sleep(sk));
+		res = 0;
+		break;
+
+	default:
+		res = -ENOTCONN;
+	}
+
+	release_sock(sk);
+	return res;
+}
+
+/**
+ * setsockopt - set socket option
+ * @sock: socket structure
+ * @lvl: option level
+ * @opt: option identifier
+ * @ov: pointer to new option value
+ * @ol: length of option value
+ *
+ * Options at level IPPROTO_TCP are accepted and ignored on stream sockets
+ * (to ease compatibility); any other level except SOL_TIPC is refused.
+ * The new value is read as a u32 from the start of @ov, so @ol must be at
+ * least sizeof(u32).
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int setsockopt(struct socket *sock,
+		      int lvl, int opt, char __user *ov, unsigned int ol)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *port = tipc_sk_port(sk);
+	u32 optval;
+	int rc;
+
+	if (lvl == IPPROTO_TCP && sock->type == SOCK_STREAM)
+		return 0;
+	if (lvl != SOL_TIPC)
+		return -ENOPROTOOPT;
+	if (ol < sizeof(optval))
+		return -EINVAL;
+
+	rc = get_user(optval, (u32 __user *)ov);
+	if (rc)
+		return rc;
+
+	lock_sock(sk);
+
+	switch (opt) {
+	case TIPC_IMPORTANCE:
+		rc = tipc_set_portimportance(port->ref, optval);
+		break;
+	case TIPC_SRC_DROPPABLE:
+		/* Not settable on stream sockets */
+		if (sock->type == SOCK_STREAM)
+			rc = -ENOPROTOOPT;
+		else
+			rc = tipc_set_portunreliable(port->ref, optval);
+		break;
+	case TIPC_DEST_DROPPABLE:
+		rc = tipc_set_portunreturnable(port->ref, optval);
+		break;
+	case TIPC_CONN_TIMEOUT:
+		/* "rc" is still 0 here from the successful get_user() */
+		tipc_sk(sk)->conn_timeout = msecs_to_jiffies(optval);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	release_sock(sk);
+
+	return rc;
+}
+
+/**
+ * getsockopt - get socket option
+ * @sock: socket structure
+ * @lvl: option level
+ * @opt: option identifier
+ * @ov: receptacle for option value
+ * @ol: receptacle for length of option value
+ *
+ * For stream sockets only, returns 0 length result for all IPPROTO_TCP options
+ * (to ease compatibility).
+ *
+ * For SOL_TIPC options, *@ol must on entry hold the size of the buffer at
+ * @ov; it has to be non-negative and at least sizeof(u32).  On success a u32
+ * is stored at @ov and *@ol is set to sizeof(u32).
+ *
+ * Returns 0 on success, errno otherwise
+ */
+
+static int getsockopt(struct socket *sock,
+		      int lvl, int opt, char __user *ov, int __user *ol)
+{
+	struct sock *sk = sock->sk;
+	struct tipc_port *tport = tipc_sk_port(sk);
+	int len;
+	u32 value;
+	int res;
+
+	if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM))
+		return put_user(0, ol);
+	if (lvl != SOL_TIPC)
+		return -ENOPROTOOPT;
+	res = get_user(len, ol);
+	if (res)
+		return res;
+
+	lock_sock(sk);
+
+	switch (opt) {
+	case TIPC_IMPORTANCE:
+		res = tipc_portimportance(tport->ref, &value);
+		break;
+	case TIPC_SRC_DROPPABLE:
+		res = tipc_portunreliable(tport->ref, &value);
+		break;
+	case TIPC_DEST_DROPPABLE:
+		res = tipc_portunreturnable(tport->ref, &value);
+		break;
+	case TIPC_CONN_TIMEOUT:
+		value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout);
+		/* no need to set "res", since already 0 at this point */
+		break;
+	case TIPC_NODE_RECVQ_DEPTH:
+		/* "res" is still 0 here as well */
+		value = (u32)atomic_read(&tipc_queue_size);
+		break;
+	case TIPC_SOCK_RECVQ_DEPTH:
+		/* "res" is still 0 here as well */
+		value = skb_queue_len(&sk->sk_receive_queue);
+		break;
+	default:
+		res = -EINVAL;
+	}
+
+	release_sock(sk);
+
+	if (res)
+		return res;	/* "get" failed */
+
+	/*
+	 * "len" is signed while sizeof() is unsigned: a negative length would
+	 * be converted to a huge unsigned value and slip past the size
+	 * comparison, so its sign has to be checked explicitly.
+	 */
+	if (len < 0 || (unsigned int)len < sizeof(value))
+		return -EINVAL;
+
+	if (copy_to_user(ov, &value, sizeof(value)))
+		return -EFAULT;
+
+	return put_user(sizeof(value), ol);
+}
+
+/**
+ * Protocol switches for the various types of TIPC sockets
+ *
+ * The three tables (msg_ops, packet_ops, stream_ops) share every handler
+ * except .sendmsg and .recvmsg.
+ */
+
+/* Message-oriented variant: send_msg() / recv_msg() */
+static const struct proto_ops msg_ops = {
+	.owner		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.release	= release,
+	.bind		= bind,
+	.connect	= connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= accept,
+	.getname	= get_name,
+	.poll		= poll,
+	.ioctl		= sock_no_ioctl,
+	.listen		= listen,
+	.shutdown	= shutdown,
+	.setsockopt	= setsockopt,
+	.getsockopt	= getsockopt,
+	.sendmsg	= send_msg,
+	.recvmsg	= recv_msg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage
+};
+
+/* Packet variant: same as msg_ops except that .sendmsg is send_packet() */
+static const struct proto_ops packet_ops = {
+	.owner		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.release	= release,
+	.bind		= bind,
+	.connect	= connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= accept,
+	.getname	= get_name,
+	.poll		= poll,
+	.ioctl		= sock_no_ioctl,
+	.listen		= listen,
+	.shutdown	= shutdown,
+	.setsockopt	= setsockopt,
+	.getsockopt	= getsockopt,
+	.sendmsg	= send_packet,
+	.recvmsg	= recv_msg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage
+};
+
+/* Stream variant: uses send_stream() / recv_stream() for data transfer */
+static const struct proto_ops stream_ops = {
+	.owner		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.release	= release,
+	.bind		= bind,
+	.connect	= connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= accept,
+	.getname	= get_name,
+	.poll		= poll,
+	.ioctl		= sock_no_ioctl,
+	.listen		= listen,
+	.shutdown	= shutdown,
+	.setsockopt	= setsockopt,
+	.getsockopt	= getsockopt,
+	.sendmsg	= send_stream,
+	.recvmsg	= recv_stream,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage
+};
+
+/* AF_TIPC family descriptor; passed to sock_register() by tipc_socket_init() */
+static const struct net_proto_family tipc_family_ops = {
+	.owner		= THIS_MODULE,
+	.family		= AF_TIPC,
+	.create		= tipc_create
+};
+
+/*
+ * Protocol descriptor passed to proto_register() by tipc_socket_init();
+ * the per-socket object size is that of struct tipc_sock
+ */
+static struct proto tipc_proto = {
+	.name		= "TIPC",
+	.owner		= THIS_MODULE,
+	.obj_size	= sizeof(struct tipc_sock)
+};
+
+/**
+ * tipc_socket_init - initialize TIPC socket interface
+ *
+ * Registers the TIPC protocol type and then the AF_TIPC socket family; the
+ * protocol registration is undone again if the family cannot be registered.
+ * sockets_enabled is set only when both registrations succeed.
+ *
+ * Returns 0 on success, errno otherwise
+ */
+int tipc_socket_init(void)
+{
+	int rc = proto_register(&tipc_proto, 1);
+
+	if (rc) {
+		err("Failed to register TIPC protocol type\n");
+		return rc;
+	}
+
+	rc = sock_register(&tipc_family_ops);
+	if (rc) {
+		err("Failed to register TIPC socket type\n");
+		proto_unregister(&tipc_proto);
+		return rc;
+	}
+
+	sockets_enabled = 1;
+	return 0;
+}
+
+/**
+ * tipc_socket_stop - stop TIPC socket interface
+ *
+ * Does nothing unless sockets_enabled is set; otherwise clears the flag and
+ * unregisters the socket family followed by the protocol type.
+ */
+
+void tipc_socket_stop(void)
+{
+	if (sockets_enabled) {
+		sockets_enabled = 0;
+		sock_unregister(tipc_family_ops.family);
+		proto_unregister(&tipc_proto);
+	}
+}
+
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
new file mode 100644
index 00000000..6cf72686
--- /dev/null
+++ b/net/tipc/subscr.c
@@ -0,0 +1,589 @@
+/*
+ * net/tipc/subscr.c: TIPC network topology service
+ *
+ * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2005-2007, 2010-2011, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "name_table.h"
+#include "port.h"
+#include "subscr.h"
+
+/**
+ * struct subscriber - TIPC network topology subscriber
+ * @port_ref: object reference to server port connecting to subscriber
+ *            (set to 0 by subscr_terminate() to invalidate the subscriber)
+ * @lock: pointer to spinlock controlling access to subscriber's server port
+ *        (taken from the port returned by tipc_port_lock())
+ * @subscriber_list: adjacent subscribers in topology server's list of
+ *                   subscribers
+ * @subscription_list: list of subscription objects for this subscriber
+ */
+
+struct subscriber {
+	u32 port_ref;
+	spinlock_t *lock;
+	struct list_head subscriber_list;
+	struct list_head subscription_list;
+};
+
+/**
+ * struct top_srv - TIPC network topology subscription service
+ * @setup_port: reference to TIPC port that handles subscription requests
+ * @subscription_count: number of active subscriptions (not subscribers!)
+ * @subscriber_list: list of ports subscribing to service
+ * @lock: spinlock governing access to subscriber list
+ */
+
+struct top_srv {
+	u32 setup_port;
+	atomic_t subscription_count;
+	struct list_head subscriber_list;
+	spinlock_t lock;
+};
+
+/* The topology service instance; (re)initialized by tipc_subscr_start() */
+static struct top_srv topsrv;
+
+/**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ *
+ * Returns @in byte-swapped when @swap is non-zero, otherwise @in unchanged
+ */
+
+static u32 htohl(u32 in, int swap)
+{
+	if (swap)
+		return swab32(in);
+
+	return in;
+}
+
+/**
+ * subscr_send_event - send a message containing a tipc_event to the subscriber
+ *
+ * The event is built in the subscription's own event template (sub->evt),
+ * with every field converted to the subscriber's endianness, and is sent on
+ * the subscription's server port.
+ *
+ * Note: Must not hold subscriber's server port lock, since tipc_send() will
+ *       try to take the lock if the message is rejected and returned!
+ */
+
+static void subscr_send_event(struct subscription *sub,
+			      u32 found_lower,
+			      u32 found_upper,
+			      u32 event,
+			      u32 port_ref,
+			      u32 node)
+{
+	struct tipc_event *evt = &sub->evt;
+	struct iovec msg_sect;
+
+	/* Fill in the event template */
+
+	evt->event = htohl(event, sub->swap);
+	evt->found_lower = htohl(found_lower, sub->swap);
+	evt->found_upper = htohl(found_upper, sub->swap);
+	evt->port.ref = htohl(port_ref, sub->swap);
+	evt->port.node = htohl(node, sub->swap);
+
+	/* Send the complete event as a single message section */
+
+	msg_sect.iov_base = (void *)evt;
+	msg_sect.iov_len = sizeof(struct tipc_event);
+	tipc_send(sub->server_ref, 1, &msg_sect, msg_sect.iov_len);
+}
+
+/**
+ * tipc_subscr_overlap - test for subscription overlap with the given values
+ *
+ * The range [found_lower, found_upper] is clipped to the subscription's
+ * [seq.lower, seq.upper]; there is overlap if the clipped range is non-empty.
+ *
+ * Returns 1 if there is overlap, otherwise 0.
+ */
+
+int tipc_subscr_overlap(struct subscription *sub,
+			u32 found_lower,
+			u32 found_upper)
+
+{
+	u32 lo = (found_lower < sub->seq.lower) ? sub->seq.lower : found_lower;
+	u32 hi = (found_upper > sub->seq.upper) ? sub->seq.upper : found_upper;
+
+	return lo <= hi;
+}
+
+/**
+ * tipc_subscr_report_overlap - issue event if there is subscription overlap
+ *
+ * The subscription's event callback is invoked only if the given range
+ * overlaps the subscription and either @must is non-zero or the subscription
+ * filter has TIPC_SUB_PORTS set.
+ *
+ * Protected by nameseq.lock in name_table.c
+ */
+
+void tipc_subscr_report_overlap(struct subscription *sub,
+				u32 found_lower,
+				u32 found_upper,
+				u32 event,
+				u32 port_ref,
+				u32 node,
+				int must)
+{
+	if (tipc_subscr_overlap(sub, found_lower, found_upper) &&
+	    (must || (sub->filter & TIPC_SUB_PORTS)))
+		sub->event_cb(sub, found_lower, found_upper, event, port_ref,
+			      node);
+}
+
+/**
+ * subscr_timeout - subscription timeout has occurred
+ * @sub: subscription whose timer expired (registered as the timer argument
+ *       by subscr_subscribe())
+ *
+ * Unlinks the subscription while holding the server port lock, then sends
+ * a TIPC_SUBSCR_TIMEOUT event to the subscriber and frees the subscription.
+ */
+
+static void subscr_timeout(struct subscription *sub)
+{
+	struct tipc_port *server_port;
+
+	/* Validate server port reference (in case subscriber is terminating) */
+
+	server_port = tipc_port_lock(sub->server_ref);
+	if (server_port == NULL)
+		return;
+
+	/*
+	 * Validate timeout (in case subscription is being cancelled);
+	 * subscr_cancel() stores TIPC_WAIT_FOREVER here before it drops the
+	 * port lock to cancel the timer
+	 */
+
+	if (sub->timeout == TIPC_WAIT_FOREVER) {
+		tipc_port_unlock(server_port);
+		return;
+	}
+
+	/* Unlink subscription from name table */
+
+	tipc_nametbl_unsubscribe(sub);
+
+	/* Unlink subscription from subscriber */
+
+	list_del(&sub->subscription_list);
+
+	/* Release subscriber's server port */
+
+	tipc_port_unlock(server_port);
+
+	/*
+	 * Notify subscriber of timeout (done only after the port lock has
+	 * been released, as subscr_send_event() requires)
+	 */
+
+	subscr_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
+			  TIPC_SUBSCR_TIMEOUT, 0, 0);
+
+	/* Now destroy subscription */
+
+	k_term_timer(&sub->timer);
+	kfree(sub);
+	atomic_dec(&topsrv.subscription_count);
+}
+
+/**
+ * subscr_del - delete a subscription within a subscription list
+ *
+ * Removes the subscription from the name table and from its subscriber's
+ * list, frees it and decrements the global subscription count.
+ *
+ * Called with subscriber port locked.
+ */
+
+static void subscr_del(struct subscription *sub)
+{
+	/* Detach from the name table and from the owning subscriber */
+	tipc_nametbl_unsubscribe(sub);
+	list_del(&sub->subscription_list);
+
+	/* Account for the removal, then release the object */
+	atomic_dec(&topsrv.subscription_count);
+	kfree(sub);
+}
+
+/**
+ * subscr_terminate - terminate communication with a subscriber
+ * @subscriber: subscriber to tear down; freed before this routine returns
+ *
+ * Called with subscriber port locked.  Routine must temporarily release lock
+ * to enable subscription timeout routine(s) to finish without deadlocking;
+ * the lock is then reclaimed to allow caller to release it upon return.
+ * (This should work even in the unlikely event some other thread creates
+ * a new object reference in the interim that uses this lock; this routine will
+ * simply wait for it to be released, then claim it.)
+ *
+ * Since @subscriber is freed here, callers must save subscriber->lock
+ * beforehand in order to be able to release the lock afterwards.
+ */
+
+static void subscr_terminate(struct subscriber *subscriber)
+{
+	u32 port_ref;
+	struct subscription *sub;
+	struct subscription *sub_temp;
+
+	/* Invalidate subscriber reference */
+
+	port_ref = subscriber->port_ref;
+	subscriber->port_ref = 0;
+	spin_unlock_bh(subscriber->lock);
+
+	/* Sever connection to subscriber */
+
+	tipc_shutdown(port_ref);
+	tipc_deleteport(port_ref);
+
+	/*
+	 * Destroy any existing subscriptions for subscriber; a timer exists
+	 * only for subscriptions whose timeout is not TIPC_WAIT_FOREVER
+	 */
+
+	list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
+				 subscription_list) {
+		if (sub->timeout != TIPC_WAIT_FOREVER) {
+			k_cancel_timer(&sub->timer);
+			k_term_timer(&sub->timer);
+		}
+		subscr_del(sub);
+	}
+
+	/* Remove subscriber from topology server's subscriber list */
+
+	spin_lock_bh(&topsrv.lock);
+	list_del(&subscriber->subscriber_list);
+	spin_unlock_bh(&topsrv.lock);
+
+	/* Reclaim subscriber lock */
+
+	spin_lock_bh(subscriber->lock);
+
+	/* Now destroy subscriber */
+
+	kfree(subscriber);
+}
+
+/**
+ * subscr_cancel - handle subscription cancellation request
+ * @s: cancellation request; compared bytewise against the request stored
+ *     in each subscription's event template (evt.s)
+ * @subscriber: subscriber whose subscription list is searched
+ *
+ * Called with subscriber port locked.  Routine must temporarily release lock
+ * to enable the subscription timeout routine to finish without deadlocking;
+ * the lock is then reclaimed to allow caller to release it upon return.
+ *
+ * Note that fields of 's' use subscriber's endianness!
+ */
+
+static void subscr_cancel(struct tipc_subscr *s,
+			  struct subscriber *subscriber)
+{
+	struct subscription *sub;
+	struct subscription *sub_temp;
+	int found = 0;
+
+	/* Find first matching subscription, exit if not found */
+
+	list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
+				 subscription_list) {
+		if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found)
+		return;
+
+	/*
+	 * Cancel subscription timer (if used), then delete subscription.
+	 * The timeout is set to TIPC_WAIT_FOREVER before the lock is dropped
+	 * so that subscr_timeout(), which checks it under the port lock,
+	 * backs off instead of deleting the subscription itself.
+	 */
+
+	if (sub->timeout != TIPC_WAIT_FOREVER) {
+		sub->timeout = TIPC_WAIT_FOREVER;
+		spin_unlock_bh(subscriber->lock);
+		k_cancel_timer(&sub->timer);
+		k_term_timer(&sub->timer);
+		spin_lock_bh(subscriber->lock);
+	}
+	subscr_del(sub);
+}
+
+/**
+ * subscr_subscribe - create subscription for subscriber
+ * @s: subscription request, in subscriber's endianness
+ * @subscriber: subscriber issuing the request
+ *
+ * Called with subscriber port locked.
+ *
+ * Returns the new subscription, which the caller still has to add to the
+ * name table, or NULL.  NULL is returned both for a cancellation request
+ * (subscriber stays alive) and for a rejected request, in which case the
+ * subscriber has been terminated and freed by subscr_terminate().
+ */
+
+static struct subscription *subscr_subscribe(struct tipc_subscr *s,
+					     struct subscriber *subscriber)
+{
+	struct subscription *sub;
+	int swap;
+
+	/*
+	 * Determine subscriber's endianness: if neither filter bit is seen
+	 * in native byte order, the request is taken to be byte-swapped
+	 */
+
+	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
+
+	/*
+	 * Detect & process a subscription cancellation request; the cancel
+	 * bit is cleared first so that the request can be matched bytewise
+	 * against the stored copy of the original subscription request
+	 */
+
+	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
+		subscr_cancel(s, subscriber);
+		return NULL;
+	}
+
+	/* Refuse subscription if global limit exceeded */
+
+	if (atomic_read(&topsrv.subscription_count) >= tipc_max_subscriptions) {
+		warn("Subscription rejected, subscription limit reached (%u)\n",
+		     tipc_max_subscriptions);
+		subscr_terminate(subscriber);
+		return NULL;
+	}
+
+	/* Allocate subscription object */
+
+	sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
+	if (!sub) {
+		warn("Subscription rejected, no memory\n");
+		subscr_terminate(subscriber);
+		return NULL;
+	}
+
+	/*
+	 * Initialize subscription object; the request is illegal unless
+	 * exactly one of TIPC_SUB_PORTS / TIPC_SUB_SERVICE is set and
+	 * lower <= upper
+	 */
+
+	sub->seq.type = htohl(s->seq.type, swap);
+	sub->seq.lower = htohl(s->seq.lower, swap);
+	sub->seq.upper = htohl(s->seq.upper, swap);
+	sub->timeout = htohl(s->timeout, swap);
+	sub->filter = htohl(s->filter, swap);
+	if ((!(sub->filter & TIPC_SUB_PORTS) ==
+	     !(sub->filter & TIPC_SUB_SERVICE)) ||
+	    (sub->seq.lower > sub->seq.upper)) {
+		warn("Subscription rejected, illegal request\n");
+		kfree(sub);
+		subscr_terminate(subscriber);
+		return NULL;
+	}
+	sub->event_cb = subscr_send_event;
+	INIT_LIST_HEAD(&sub->nameseq_list);
+	list_add(&sub->subscription_list, &subscriber->subscription_list);
+	sub->server_ref = subscriber->port_ref;
+	sub->swap = swap;
+	/* Keep the request verbatim; subscr_cancel() matches against it */
+	memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
+	atomic_inc(&topsrv.subscription_count);
+	/* A timer is set up only for subscriptions with a finite timeout */
+	if (sub->timeout != TIPC_WAIT_FOREVER) {
+		k_init_timer(&sub->timer,
+			     (Handler)subscr_timeout, (unsigned long)sub);
+		k_start_timer(&sub->timer, sub->timeout);
+	}
+
+	return sub;
+}
+
+/**
+ * subscr_conn_shutdown_event - handle termination request from subscriber
+ *
+ * Locks the subscriber's server port and terminates the subscriber.  Only
+ * @usr_handle (the subscriber) and @port_ref are used; the remaining
+ * arguments are ignored.
+ *
+ * Called with subscriber's server port unlocked.
+ */
+
+static void subscr_conn_shutdown_event(void *usr_handle,
+				       u32 port_ref,
+				       struct sk_buff **buf,
+				       unsigned char const *data,
+				       unsigned int size,
+				       int reason)
+{
+	struct subscriber *subscr = usr_handle;
+	spinlock_t *lock;
+
+	/* Nothing to do if the port reference is no longer valid */
+	if (!tipc_port_lock(port_ref))
+		return;
+
+	/* subscr_terminate() frees the subscriber: save the lock pointer */
+	lock = subscr->lock;
+	subscr_terminate(subscr);
+	spin_unlock_bh(lock);
+}
+
+/**
+ * subscr_conn_msg_event - handle new subscription request from subscriber
+ * @usr_handle: the subscriber the request belongs to
+ * @port_ref: reference of subscriber's server port
+ * @buf: not used by this routine
+ * @data: request payload, interpreted as a struct tipc_subscr
+ * @size: payload size; the subscriber is terminated unless this equals
+ *        sizeof(struct tipc_subscr)
+ *
+ * Called with subscriber's server port unlocked.
+ */
+
+static void subscr_conn_msg_event(void *usr_handle,
+				  u32 port_ref,
+				  struct sk_buff **buf,
+				  const unchar *data,
+				  u32 size)
+{
+	struct subscriber *subscriber = usr_handle;
+	spinlock_t *subscriber_lock;
+	struct subscription *sub;
+
+	/*
+	 * Lock subscriber's server port (& make a local copy of lock pointer,
+	 * in case subscriber is deleted while processing subscription request)
+	 */
+
+	if (tipc_port_lock(port_ref) == NULL)
+		return;
+
+	subscriber_lock = subscriber->lock;
+
+	if (size != sizeof(struct tipc_subscr)) {
+		/* Malformed request: drop the subscriber altogether */
+		subscr_terminate(subscriber);
+		spin_unlock_bh(subscriber_lock);
+	} else {
+		sub = subscr_subscribe((struct tipc_subscr *)data, subscriber);
+		spin_unlock_bh(subscriber_lock);
+		if (sub != NULL) {
+
+			/*
+			 * We must release the server port lock before adding a
+			 * subscription to the name table since TIPC needs to be
+			 * able to (re)acquire the port lock if an event message
+			 * issued by the subscription process is rejected and
+			 * returned.  The subscription cannot be deleted while
+			 * it is being added to the name table because:
+			 * a) the single-threading of the native API port code
+			 *    ensures the subscription cannot be cancelled and
+			 *    the subscriber connection cannot be broken, and
+			 * b) the name table lock ensures the subscription
+			 *    timeout code cannot delete the subscription,
+			 * so the subscription object is still protected.
+			 */
+
+			tipc_nametbl_subscribe(sub);
+		}
+	}
+}
+
+/**
+ * subscr_named_msg_event - handle request to establish a new subscriber
+ * @usr_handle: not used by this routine
+ * @port_ref: not used by this routine
+ * @buf: passed on to subscr_conn_msg_event() when a request is included
+ * @data: optional initial subscription request
+ * @size: size of @data; 0 if no subscription request is included
+ * @importance: importance given to the new server port
+ * @orig: port id of the subscriber; the new server port is connected to it
+ * @dest: not used by this routine
+ *
+ * Allocates a subscriber, creates a server port connected to @orig, adds
+ * the subscriber to the topology server's list and completes the connection
+ * handshake.  Any subscription request carried in the message is then
+ * processed as if it had arrived on the new connection.
+ */
+
+static void subscr_named_msg_event(void *usr_handle,
+				   u32 port_ref,
+				   struct sk_buff **buf,
+				   const unchar *data,
+				   u32 size,
+				   u32 importance,
+				   struct tipc_portid const *orig,
+				   struct tipc_name_seq const *dest)
+{
+	struct subscriber *subscriber;
+	u32 server_port_ref;
+
+	/* Create subscriber object */
+
+	subscriber = kzalloc(sizeof(struct subscriber), GFP_ATOMIC);
+	if (subscriber == NULL) {
+		warn("Subscriber rejected, no memory\n");
+		return;
+	}
+	INIT_LIST_HEAD(&subscriber->subscription_list);
+	INIT_LIST_HEAD(&subscriber->subscriber_list);
+
+	/*
+	 * Create server port & establish connection to subscriber; failure
+	 * is detected by port_ref still being 0 (from kzalloc) afterwards
+	 */
+
+	tipc_createport(subscriber,
+			importance,
+			NULL,
+			NULL,
+			subscr_conn_shutdown_event,
+			NULL,
+			NULL,
+			subscr_conn_msg_event,
+			NULL,
+			&subscriber->port_ref);
+	if (subscriber->port_ref == 0) {
+		warn("Subscriber rejected, unable to create port\n");
+		kfree(subscriber);
+		return;
+	}
+	tipc_connect2port(subscriber->port_ref, orig);
+
+	/*
+	 * Lock server port (& save lock address for future use)
+	 *
+	 * NOTE(review): the result of tipc_port_lock() is dereferenced
+	 * without a NULL check here, unlike at the other call sites in this
+	 * file -- confirm the port cannot disappear at this point.
+	 */
+
+	subscriber->lock = tipc_port_lock(subscriber->port_ref)->lock;
+
+	/* Add subscriber to topology server's subscriber list */
+
+	spin_lock_bh(&topsrv.lock);
+	list_add(&subscriber->subscriber_list, &topsrv.subscriber_list);
+	spin_unlock_bh(&topsrv.lock);
+
+	/*
+	 * Unlock server port (the port reference is copied first, so that
+	 * the subscriber object need not be read again once unlocked)
+	 */
+
+	server_port_ref = subscriber->port_ref;
+	spin_unlock_bh(subscriber->lock);
+
+	/* Send an ACK- to complete connection handshaking */
+
+	tipc_send(server_port_ref, 0, NULL, 0);
+
+	/* Handle optional subscription request */
+
+	if (size != 0) {
+		subscr_conn_msg_event(subscriber, server_port_ref,
+				      buf, data, size);
+	}
+}
+
+/**
+ * tipc_subscr_start - start the topology subscription service
+ *
+ * Resets the service state, creates the setup port (whose named-message
+ * handler is subscr_named_msg_event()) and publishes the name sequence
+ * {TIPC_TOP_SRV, TIPC_TOP_SRV, TIPC_TOP_SRV} for it with node scope.
+ *
+ * Returns 0 on success, otherwise the error returned by the failing step
+ */
+int tipc_subscr_start(void)
+{
+	struct tipc_name_seq seq = {TIPC_TOP_SRV, TIPC_TOP_SRV, TIPC_TOP_SRV};
+	int rc;
+
+	memset(&topsrv, 0, sizeof(topsrv));
+	spin_lock_init(&topsrv.lock);
+	INIT_LIST_HEAD(&topsrv.subscriber_list);
+
+	rc = tipc_createport(NULL, TIPC_CRITICAL_IMPORTANCE,
+			     NULL, NULL, NULL, NULL,
+			     subscr_named_msg_event,
+			     NULL, NULL,
+			     &topsrv.setup_port);
+	if (!rc) {
+		rc = tipc_nametbl_publish_rsv(topsrv.setup_port,
+					      TIPC_NODE_SCOPE, &seq);
+		if (!rc)
+			return 0;
+
+		/* Publication failed: undo the port creation */
+		tipc_deleteport(topsrv.setup_port);
+		topsrv.setup_port = 0;
+	}
+
+	err("Failed to create subscription service\n");
+	return rc;
+}
+
+/**
+ * tipc_subscr_stop - stop the topology subscription service
+ *
+ * Does nothing if the setup port does not exist.  Otherwise deletes the
+ * setup port and terminates every subscriber on the service's list.
+ */
+void tipc_subscr_stop(void)
+{
+	struct subscriber *cur;
+	struct subscriber *next;
+	spinlock_t *lock;
+
+	if (!topsrv.setup_port)
+		return;
+
+	tipc_deleteport(topsrv.setup_port);
+	topsrv.setup_port = 0;
+
+	list_for_each_entry_safe(cur, next, &topsrv.subscriber_list,
+				 subscriber_list) {
+		/* subscr_terminate() frees "cur": save its lock pointer */
+		lock = cur->lock;
+		spin_lock_bh(lock);
+		subscr_terminate(cur);
+		spin_unlock_bh(lock);
+	}
+}
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
new file mode 100644
index 00000000..45d89bf4
--- /dev/null
+++ b/net/tipc/subscr.h
@@ -0,0 +1,90 @@
+/*
+ * net/tipc/subscr.h: Include file for TIPC network topology service
+ *
+ * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2005-2007, Wind River Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SUBSCR_H
+#define _TIPC_SUBSCR_H
+
+struct subscription;
+
+/* Callback type stored in struct subscription's event_cb field */
+typedef void (*tipc_subscr_event) (struct subscription *sub,
+				   u32 found_lower, u32 found_upper,
+				   u32 event, u32 port_ref, u32 node);
+
+/**
+ * struct subscription - TIPC network topology subscription object
+ * @seq: name sequence associated with subscription
+ * @timeout: duration of subscription (in ms)
+ * @filter: event filtering to be done for subscription
+ * @event_cb: routine invoked when a subscription event is detected
+ * @timer: timer governing subscription duration (optional)
+ * @nameseq_list: adjacent subscriptions in name sequence's subscription list
+ * @subscription_list: adjacent subscriptions in subscriber's subscription list
+ * @server_ref: object reference of server port associated with subscription
+ * @swap: indicates if subscriber uses opposite endianness in its messages
+ * @evt: template for events generated by subscription
+ */
+
+struct subscription {
+	struct tipc_name_seq seq;
+	u32 timeout;
+	u32 filter;
+	tipc_subscr_event event_cb;
+	struct timer_list timer;
+	struct list_head nameseq_list;
+	struct list_head subscription_list;
+	u32 server_ref;
+	int swap;
+	struct tipc_event evt;
+};
+
+/* Test whether [found_lower, found_upper] overlaps the subscription's range */
+int tipc_subscr_overlap(struct subscription *sub,
+			u32 found_lower,
+			u32 found_upper);
+
+/* Invoke the subscription's event callback if the given range overlaps it */
+void tipc_subscr_report_overlap(struct subscription *sub,
+				u32 found_lower,
+				u32 found_upper,
+				u32 event,
+				u32 port_ref,
+				u32 node,
+				int must_report);
+
+/* Start the topology subscription service */
+int tipc_subscr_start(void);
+
+/* Stop the topology subscription service */
+void tipc_subscr_stop(void);
+
+
+#endif
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 00000000..5a69733b
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
+#
+# Unix Domain Sockets
+#
+
+config UNIX
+	tristate "Unix domain sockets"
+	---help---
+	  If you say Y here, you will include support for Unix domain sockets;
+	  sockets are the standard Unix mechanism for establishing and
+	  accessing network connections.  Many commonly used programs such as
+	  the X Window system and syslog use these sockets even if your
+	  machine is not connected to any network.  Unless you are working on
+	  an embedded system or something similar, you therefore definitely
+	  want to say Y here.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called unix.  Note that several important services won't work
+	  correctly if you say M here and then neglect to load the module.
+
+	  Say Y unless you know what you are doing.
+
diff --git a/net/unix/Makefile b/net/unix/Makefile
new file mode 100644
index 00000000..b852a2bd
--- /dev/null
+++ b/net/unix/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Linux unix domain socket layer.
+#
+
+obj-$(CONFIG_UNIX)	+= unix.o
+
+unix-y			:= af_unix.o garbage.o
+unix-$(CONFIG_SYSCTL)	+= sysctl_net_unix.o
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
new file mode 100644
index 00000000..0722a25a
--- /dev/null
+++ b/net/unix/af_unix.c
@@ -0,0 +1,2385 @@
+/*
+ * NET4:	Implementation of BSD Unix domain sockets.
+ *
+ * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *		Linus Torvalds	:	Assorted bug cures.
+ *		Niibe Yutaka	:	async I/O support.
+ *		Carsten Paeth	:	PF_UNIX check, address fixes.
+ *		Alan Cox	:	Limit size of allocated blocks.
+ *		Alan Cox	:	Fixed the stupid socketpair bug.
+ *		Alan Cox	:	BSD compatibility fine tuning.
+ *		Alan Cox	:	Fixed a bug in connect when interrupted.
+ *		Alan Cox	:	Sorted out a proper draft version of
+ *					file descriptor passing hacked up from
+ *					Mike Shaver's work.
+ *		Marty Leisner	:	Fixes to fd passing
+ *		Nick Nevin	:	recvmsg bugfix.
+ *		Alan Cox	:	Started proper garbage collector
+ *		Heiko EiBfeldt	:	Missing verify_area check
+ *		Alan Cox	:	Started POSIXisms
+ *		Andreas Schwab	:	Replace inode by dentry for proper
+ *					reference counting
+ *		Kirk Petersen	:	Made this a module
+ *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
+ *					Lots of bug fixes.
+ *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
+ *					by above two patches.
+ *	     Andrea Arcangeli	:	If possible we block in connect(2)
+ *					if the max backlog of the listen socket
+ *					has been reached. This won't break
+ *					old apps and it will avoid huge amount
+ *					of socks hashed (this for unix_gc()
+ *					performance reasons).
+ *					Security fix that limits the max
+ *					number of socks to 2*max_files and
+ *					the number of skb queueable in the
+ *					dgram receiver.
+ *		Artur Skawina   :	Hash function optimizations
+ *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
+ *	      Malcolm Beattie   :	Set peercred for socketpair
+ *	     Michal Ostrowski   :       Module initialization cleanup.
+ *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
+ *	     				the core infrastructure is doing that
+ *	     				for all net proto families now (2.5.69+)
+ *
+ *
+ * Known differences from reference BSD that was tested:
+ *
+ *	[TO FIX]
+ *	ECONNREFUSED is not returned from one end of a connected() socket to the
+ *		other the moment one end closes.
+ *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
+ *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
+ *	[NOT TO FIX]
+ *	accept() returns a path name even if the connecting socket has closed
+ *		in the meantime (BSD loses the path and gives up).
+ *	accept() returns 0 length path for an unbound connector. BSD returns 16
+ *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
+ *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
+ *	BSD af_unix apparently has connect forgetting to block properly.
+ *		(need to check this with the POSIX spec in detail)
+ *
+ * Differences from 2.0.0-11-... (ANK)
+ *	Bug fixes and improvements.
+ *		- client shutdown killed server socket.
+ *		- removed all useless cli/sti pairs.
+ *
+ *	Semantic changes/extensions.
+ *		- generic control message passing.
+ *		- SCM_CREDENTIALS control message.
+ *		- "Abstract" (not FS based) socket bindings.
+ *		  Abstract names are sequences of bytes (not zero terminated)
+ *		  started by 0, so that this name space does not intersect
+ *		  with BSD names.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/af_unix.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/scm.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/rtnetlink.h>
+#include <linux/mount.h>
+#include <net/checksum.h>
+#include <linux/security.h>
+
+/*
+ * Hash table of AF_UNIX sockets.  Bound sockets live in slots
+ * 0..UNIX_HASH_SIZE-1; the one extra slot at index UNIX_HASH_SIZE is the
+ * chain for sockets that have no address yet (unix_sockets_unbound).
+ */
+static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+/* Protects unix_socket_table. */
+static DEFINE_SPINLOCK(unix_table_lock);
+/* Number of unix socks accounted by unix_create1() and not yet destroyed. */
+static atomic_long_t unix_nr_socks;
+
+/* Chain head used for sockets that are not bound to any name. */
+#define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
+
+/*
+ * Non-zero for an abstract name.  Filesystem binds store UNIX_HASH_SIZE in
+ * addr->hash, abstract ones store a real table index.  Dereferences
+ * unix_sk(sk)->addr, so the socket must have an address.
+ */
+#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
+
+/*
+ * Helpers moving the security id between a scm_cookie and an skb.
+ * Note the direction implied by the names: "get" stores scm->secid into the
+ * skb (UNIXSID(skb)), "set" loads it from the skb back into scm->secid.
+ * Both compile to nothing when CONFIG_SECURITY_NETWORK is off.
+ */
+#ifdef CONFIG_SECURITY_NETWORK
+/* Copy scm->secid into the skb's UNIXSID slot. */
+static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
+}
+
+/* Load scm->secid from the skb's UNIXSID slot. */
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	scm->secid = *UNIXSID(skb);
+}
+#else
+static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+#endif /* CONFIG_SECURITY_NETWORK */
+
+/*
+ *  SMP locking strategy:
+ *    hash table is protected with spinlock unix_table_lock
+ *    each socket state is protected by separate spin lock.
+ */
+
+/*
+ * Reduce a checksum to a hash-table index: fold the upper bits down with two
+ * xor-shifts (16, then 8) and mask the result to UNIX_HASH_SIZE-1.
+ */
+static inline unsigned unix_hash_fold(__wsum n)
+{
+	unsigned folded = (__force unsigned)n;
+
+	folded = folded ^ (folded >> 16);
+	folded = folded ^ (folded >> 8);
+	return folded & (UNIX_HASH_SIZE - 1);
+}
+
+/* The peer pointer of a unix sock; expands to an lvalue, so it can be assigned. */
+#define unix_peer(sk) (unix_sk(sk)->peer)
+
+/* Non-zero when @osk's peer pointer refers to @sk. */
+static inline int unix_our_peer(struct sock *sk, struct sock *osk)
+{
+	return sk == unix_peer(osk);
+}
+
+/*
+ * @sk may send to @osk when @osk has no peer at all, or when its peer
+ * is @sk itself.
+ */
+static inline int unix_may_send(struct sock *sk, struct sock *osk)
+{
+	if (unix_peer(osk) == NULL)
+		return 1;
+	return unix_our_peer(sk, osk);
+}
+
+/* Non-zero once the receive queue holds more skbs than sk_max_ack_backlog. */
+static inline int unix_recvq_full(struct sock const *sk)
+{
+	const struct sk_buff_head *queue = &sk->sk_receive_queue;
+
+	return skb_queue_len(queue) > sk->sk_max_ack_backlog;
+}
+
+/*
+ * Return the peer of @s with an extra reference taken, or NULL when @s has
+ * no peer.  The peer pointer is sampled under @s's state lock; the caller
+ * owns the returned reference and must sock_put() it.
+ */
+static struct sock *unix_peer_get(struct sock *s)
+{
+	struct sock *p;
+
+	unix_state_lock(s);
+	p = unix_peer(s);
+	if (p != NULL)
+		sock_hold(p);
+	unix_state_unlock(s);
+
+	return p;
+}
+
+/* Drop one reference on @addr and free it when that was the last one. */
+static inline void unix_release_addr(struct unix_address *addr)
+{
+	if (!atomic_dec_and_test(&addr->refcnt))
+		return;
+
+	kfree(addr);
+}
+
+/*
+ *	Validate a unix socket name and normalise its length:
+ *		- it must carry more than just the family field and must fit
+ *		  in a struct sockaddr_un;
+ *		- the family must be AF_UNIX;
+ *		- a name whose first path byte is non-zero is a filesystem
+ *		  object: it gets NUL terminated and its length recomputed
+ *		  from the string (*hashp is left untouched);
+ *		- a name whose first path byte is zero is an abstract name:
+ *		  its hash is stored in *hashp and the length is kept.
+ *
+ *	Returns the (possibly adjusted) address length, or -EINVAL.
+ */
+
+static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned *hashp)
+{
+	if (len <= sizeof(short) || len > sizeof(*sunaddr) ||
+	    !sunaddr || sunaddr->sun_family != AF_UNIX)
+		return -EINVAL;
+
+	if (!sunaddr->sun_path[0]) {
+		/* Abstract name: hash all @len bytes, family included. */
+		*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
+		return len;
+	}
+
+	/*
+	 * Writing at offset len is not an off by one error: 108 is the
+	 * longest valid AF_UNIX path for a binding, and although
+	 * sun_path[108] doesn't as such exist, in kernel space that byte is
+	 * guaranteed to be a valid location in our kernel address buffer.
+	 */
+	((char *)sunaddr)[len] = 0;
+	return strlen(sunaddr->sun_path) + 1 + sizeof(short);
+}
+
+/*
+ * Unhash @sk from whatever chain it is on.  No locking here: the in-file
+ * callers hold unix_table_lock.
+ */
+static void __unix_remove_socket(struct sock *sk)
+{
+	sk_del_node_init(sk);
+}
+
+/*
+ * Add @sk to the chain @list.  Warns if @sk is still hashed somewhere.
+ * No locking here: the in-file callers hold unix_table_lock.
+ */
+static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
+{
+	WARN_ON(!sk_unhashed(sk));
+	sk_add_node(sk, list);
+}
+
+/* Locked variant of __unix_remove_socket(): takes unix_table_lock itself. */
+static inline void unix_remove_socket(struct sock *sk)
+{
+	spin_lock(&unix_table_lock);
+	__unix_remove_socket(sk);
+	spin_unlock(&unix_table_lock);
+}
+
+/* Locked variant of __unix_insert_socket(): takes unix_table_lock itself. */
+static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
+{
+	spin_lock(&unix_table_lock);
+	__unix_insert_socket(list, sk);
+	spin_unlock(&unix_table_lock);
+}
+
+/*
+ * Walk the chain selected by (@hash ^ @type) and return the first socket in
+ * namespace @net whose address matches @sunname byte for byte over @len
+ * bytes, or NULL when there is none.  No reference is taken and no lock is
+ * acquired here; the in-file callers hold unix_table_lock.
+ */
+static struct sock *__unix_find_socket_byname(struct net *net,
+					      struct sockaddr_un *sunname,
+					      int len, int type, unsigned hash)
+{
+	struct hlist_node *node;
+	struct sock *s;
+
+	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
+		struct unix_address *addr = unix_sk(s)->addr;
+
+		if (!net_eq(sock_net(s), net))
+			continue;
+		if (addr->len != len)
+			continue;
+		if (memcmp(addr->name, sunname, len) == 0)
+			return s;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked lookup by name: like __unix_find_socket_byname(), but takes
+ * unix_table_lock around the search and returns the match with a reference
+ * held (NULL when nothing matches).
+ */
+static inline struct sock *unix_find_socket_byname(struct net *net,
+						   struct sockaddr_un *sunname,
+						   int len, int type,
+						   unsigned hash)
+{
+	struct sock *match;
+
+	spin_lock(&unix_table_lock);
+	match = __unix_find_socket_byname(net, sunname, len, type, hash);
+	if (match != NULL)
+		sock_hold(match);
+	spin_unlock(&unix_table_lock);
+
+	return match;
+}
+
+/*
+ * Find the socket bound to inode @i.  The chain is chosen from the low bits
+ * of the inode number; a socket matches when its dentry points at @i.
+ * Returns the socket with a reference held, or NULL.
+ */
+static struct sock *unix_find_socket_byinode(struct inode *i)
+{
+	struct sock *s;
+	struct sock *found = NULL;
+	struct hlist_node *node;
+
+	spin_lock(&unix_table_lock);
+	sk_for_each(s, node,
+		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
+		struct dentry *dentry = unix_sk(s)->dentry;
+
+		if (dentry && dentry->d_inode == i) {
+			sock_hold(s);
+			found = s;
+			break;
+		}
+	}
+	spin_unlock(&unix_table_lock);
+
+	return found;
+}
+
+/* Writable while four times the allocated write memory fits in sk_sndbuf. */
+static inline int unix_writable(struct sock *sk)
+{
+	int scaled = atomic_read(&sk->sk_wmem_alloc) << 2;
+
+	return scaled <= sk->sk_sndbuf;
+}
+
+/*
+ * sk_write_space callback: once the socket is writable again, wake any
+ * sleeper on its wait queue for POLLOUT-type events and send the async
+ * SOCK_WAKE_SPACE notification.  The wait queue is accessed under RCU.
+ */
+static void unix_write_space(struct sock *sk)
+{
+	struct socket_wq *wq;
+
+	rcu_read_lock();
+	if (!unix_writable(sk))
+		goto done;
+
+	wq = rcu_dereference(sk->sk_wq);
+	if (wq_has_sleeper(wq))
+		wake_up_interruptible_sync_poll(&wq->wait,
+			POLLOUT | POLLWRNORM | POLLWRBAND);
+	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+done:
+	rcu_read_unlock();
+}
+
+/* When a dgram socket disconnects (or changes its peer), its receive queue is
+ * cleared of packets that arrived from the previous peer. First, this allows
+ * flow control to be based on wmem_alloc alone; second, a socket connected to
+ * a peer may receive messages only from that peer. */
+static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
+{
+	/* Nothing queued: nothing to drop and nobody to notify. */
+	if (skb_queue_empty(&sk->sk_receive_queue))
+		return;
+
+	skb_queue_purge(&sk->sk_receive_queue);
+	wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
+
+	/* Messages were lost.  If this was one link of a bidirectional dgram
+	 * pipe, signal the error to the old peer - but only when that peer
+	 * is still alive and was actually connected to us.
+	 */
+	if (sock_flag(other, SOCK_DEAD) || unix_peer(other) != sk)
+		return;
+
+	other->sk_err = ECONNRESET;
+	other->sk_error_report(other);
+}
+
+/*
+ * sk_destruct callback.  Purges the receive queue, sanity-checks that the
+ * sock has no write memory, is unhashed and detached from its socket, then -
+ * provided the sock is marked SOCK_DEAD - drops the address reference and
+ * undoes the accounting done in unix_create1().  A sock that is not dead is
+ * only reported and otherwise left alone.
+ */
+static void unix_sock_destructor(struct sock *sk)
+{
+	struct unix_sock *usk = unix_sk(sk);
+
+	skb_queue_purge(&sk->sk_receive_queue);
+
+	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+	WARN_ON(!sk_unhashed(sk));
+	WARN_ON(sk->sk_socket);
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		if (usk->addr != NULL)
+			unix_release_addr(usk->addr);
+
+		atomic_long_dec(&unix_nr_socks);
+		local_bh_disable();
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		local_bh_enable();
+#ifdef UNIX_REFCNT_DEBUG
+		printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
+			atomic_long_read(&unix_nr_socks));
+#endif
+		return;
+	}
+
+	printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
+}
+
+/*
+ * Tear down the unix sock @sk.
+ *
+ * @embrion is non-zero when @sk is a not-yet-accepted connection being
+ * discarded together with its listening socket (see the recursive call
+ * below); in that case the peer always gets ECONNRESET.
+ *
+ * Steps: unhash, orphan and mark the sock closed under its state lock,
+ * wake waiters on peer_wait, notify and release the peer, flush the receive
+ * queue, drop the filesystem references, drop our sock reference and finally
+ * run the fd garbage collector if any fds are in flight.  Always returns 0.
+ */
+static int unix_release_sock(struct sock *sk, int embrion)
+{
+	struct unix_sock *u = unix_sk(sk);
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	struct sock *skpair;
+	struct sk_buff *skb;
+	int state;
+
+	unix_remove_socket(sk);
+
+	/* Clear state, remembering what must be released after unlocking */
+	unix_state_lock(sk);
+	sock_orphan(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+	dentry	     = u->dentry;
+	u->dentry    = NULL;
+	mnt	     = u->mnt;
+	u->mnt	     = NULL;
+	state = sk->sk_state;
+	sk->sk_state = TCP_CLOSE;
+	unix_state_unlock(sk);
+
+	wake_up_interruptible_all(&u->peer_wait);
+
+	skpair = unix_peer(sk);
+
+	if (skpair != NULL) {
+		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
+			unix_state_lock(skpair);
+			/* No more writes */
+			skpair->sk_shutdown = SHUTDOWN_MASK;
+			/* Unread data or a dropped embryo is a reset for the peer */
+			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
+				skpair->sk_err = ECONNRESET;
+			unix_state_unlock(skpair);
+			skpair->sk_state_change(skpair);
+			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
+		}
+		sock_put(skpair); /* It may now die */
+		unix_peer(sk) = NULL;
+	}
+
+	/* Try to flush out this socket. Throw out buffers at least */
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		/* On a listener each queued skb stands for a pending connection */
+		if (state == TCP_LISTEN)
+			unix_release_sock(skb->sk, 1);
+		/* passed fds are erased in the kfree_skb hook	      */
+		kfree_skb(skb);
+	}
+
+	if (dentry) {
+		dput(dentry);
+		mntput(mnt);
+	}
+
+	sock_put(sk);
+
+	/* ---- Socket is dead now and most probably destroyed ---- */
+
+	/*
+	 * Fixme: BSD difference: In BSD all sockets connected to use get
+	 *	  ECONNRESET and we die on the spot. In Linux we behave
+	 *	  like files and pipes do and wait for the last
+	 *	  dereference.
+	 *
+	 * Can't we simply set sock->err?
+	 *
+	 *	  What the above comment does talk about? --ANK(980817)
+	 */
+
+	if (unix_tot_inflight)
+		unix_gc();		/* Garbage collect fds */
+
+	return 0;
+}
+
+/*
+ * Replace @sk's peer credentials with those of the current task: the old pid
+ * and cred references are dropped, then the current thread-group pid and the
+ * current credentials are stored with fresh references.
+ */
+static void init_peercred(struct sock *sk)
+{
+	const struct cred *stale_cred = sk->sk_peer_cred;
+
+	put_pid(sk->sk_peer_pid);
+	if (stale_cred)
+		put_cred(stale_cred);
+
+	sk->sk_peer_pid  = get_pid(task_tgid(current));
+	sk->sk_peer_cred = get_current_cred();
+}
+
+/*
+ * Replace @sk's peer credentials with a copy of @peersk's: the old pid and
+ * cred references are dropped, then new references on @peersk's peer pid and
+ * peer cred are stored.
+ */
+static void copy_peercred(struct sock *sk, struct sock *peersk)
+{
+	const struct cred *stale_cred = sk->sk_peer_cred;
+
+	put_pid(sk->sk_peer_pid);
+	if (stale_cred)
+		put_cred(stale_cred);
+
+	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
+	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
+}
+
+/*
+ * listen(2) for stream and seqpacket sockets.
+ *
+ * Returns -EOPNOTSUPP for any other socket type, -EINVAL when the socket is
+ * unbound or is neither closed nor already listening, 0 on success.
+ *
+ * On success the backlog is stored in sk_max_ack_backlog, the state becomes
+ * TCP_LISTEN and the listener's peer credentials are set to the caller's so
+ * that connect can copy them.  When the backlog grows, waiters on peer_wait
+ * are woken so blocked connectors can retry.
+ */
+static int unix_listen(struct socket *sock, int backlog)
+{
+	int err;
+	struct sock *sk = sock->sk;
+	struct unix_sock *u = unix_sk(sk);
+
+	err = -EOPNOTSUPP;
+	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
+		goto out;	/* Only stream/seqpacket sockets accept */
+	err = -EINVAL;
+	if (!u->addr)
+		goto out;	/* No listens on an unbound socket */
+	unix_state_lock(sk);
+	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
+		goto out_unlock;
+	if (backlog > sk->sk_max_ack_backlog)
+		wake_up_interruptible_all(&u->peer_wait);
+	sk->sk_max_ack_backlog	= backlog;
+	sk->sk_state		= TCP_LISTEN;
+	/* set credentials so connect can copy them */
+	init_peercred(sk);
+	err = 0;
+
+out_unlock:
+	unix_state_unlock(sk);
+out:
+	return err;
+}
+
+static int unix_release(struct socket *);
+static int unix_bind(struct socket *, struct sockaddr *, int);
+static int unix_stream_connect(struct socket *, struct sockaddr *,
+			       int addr_len, int flags);
+static int unix_socketpair(struct socket *, struct socket *);
+static int unix_accept(struct socket *, struct socket *, int);
+static int unix_getname(struct socket *, struct sockaddr *, int *, int);
+static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
+static unsigned int unix_dgram_poll(struct file *, struct socket *,
+				    poll_table *);
+static int unix_ioctl(struct socket *, unsigned int, unsigned long);
+static int unix_shutdown(struct socket *, int);
+static int unix_stream_sendmsg(struct kiocb *, struct socket *,
+			       struct msghdr *, size_t);
+static int unix_stream_recvmsg(struct kiocb *, struct socket *,
+			       struct msghdr *, size_t, int);
+static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
+			      struct msghdr *, size_t);
+static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
+			      struct msghdr *, size_t, int);
+static int unix_dgram_connect(struct socket *, struct sockaddr *,
+			      int, int);
+static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
+				  struct msghdr *, size_t);
+static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
+				  struct msghdr *, size_t, int);
+
+/*
+ * Socket operations for SOCK_STREAM unix sockets.  setsockopt, getsockopt,
+ * mmap and sendpage are wired to the generic sock_no_*() stubs.
+ */
+static const struct proto_ops unix_stream_ops = {
+	.family =	PF_UNIX,
+	.owner =	THIS_MODULE,
+	.release =	unix_release,
+	.bind =		unix_bind,
+	.connect =	unix_stream_connect,
+	.socketpair =	unix_socketpair,
+	.accept =	unix_accept,
+	.getname =	unix_getname,
+	.poll =		unix_poll,
+	.ioctl =	unix_ioctl,
+	.listen =	unix_listen,
+	.shutdown =	unix_shutdown,
+	.setsockopt =	sock_no_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	unix_stream_sendmsg,
+	.recvmsg =	unix_stream_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/*
+ * Socket operations for SOCK_DGRAM unix sockets (also used for SOCK_RAW,
+ * which unix_create() maps to SOCK_DGRAM).  accept and listen are wired to
+ * the generic sock_no_*() stubs, as are setsockopt, getsockopt, mmap and
+ * sendpage.
+ */
+static const struct proto_ops unix_dgram_ops = {
+	.family =	PF_UNIX,
+	.owner =	THIS_MODULE,
+	.release =	unix_release,
+	.bind =		unix_bind,
+	.connect =	unix_dgram_connect,
+	.socketpair =	unix_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	unix_getname,
+	.poll =		unix_dgram_poll,
+	.ioctl =	unix_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	unix_shutdown,
+	.setsockopt =	sock_no_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	unix_dgram_sendmsg,
+	.recvmsg =	unix_dgram_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/*
+ * Socket operations for SOCK_SEQPACKET unix sockets.  Connection handling
+ * (connect, accept, listen) is shared with the stream ops, polling is shared
+ * with the dgram ops, and sendmsg/recvmsg have seqpacket-specific entry
+ * points.
+ */
+static const struct proto_ops unix_seqpacket_ops = {
+	.family =	PF_UNIX,
+	.owner =	THIS_MODULE,
+	.release =	unix_release,
+	.bind =		unix_bind,
+	.connect =	unix_stream_connect,
+	.socketpair =	unix_socketpair,
+	.accept =	unix_accept,
+	.getname =	unix_getname,
+	.poll =		unix_dgram_poll,
+	.ioctl =	unix_ioctl,
+	.listen =	unix_listen,
+	.shutdown =	unix_shutdown,
+	.setsockopt =	sock_no_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	unix_seqpacket_sendmsg,
+	.recvmsg =	unix_seqpacket_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/* Protocol descriptor handed to sk_alloc(); each sock is a struct unix_sock. */
+static struct proto unix_proto = {
+	.name			= "UNIX",
+	.owner			= THIS_MODULE,
+	.obj_size		= sizeof(struct unix_sock),
+};
+
+/*
+ * AF_UNIX sockets do not interact with hardware, hence they
+ * don't trigger interrupts - so it's safe for them to have
+ * bh-unsafe locking for their sk_receive_queue.lock. Split off
+ * this special lock-class by reinitializing the spinlock key:
+ */
+static struct lock_class_key af_unix_sk_receive_queue_lock_key;
+
+/*
+ * Allocate and initialise a unix sock in namespace @net and attach it to
+ * @sock (which may be NULL).  The new sock is placed on the unbound chain
+ * and counted in unix_nr_socks and in the per-protocol in-use counter.
+ *
+ * Returns the sock, or NULL when the global limit of 2 * get_max_files()
+ * socks is exceeded or the allocation fails; on failure the unix_nr_socks
+ * increment is rolled back.
+ */
+static struct sock *unix_create1(struct net *net, struct socket *sock)
+{
+	struct sock *sk;
+	struct unix_sock *u;
+
+	/* Count first, then check, so concurrent creators see each other. */
+	atomic_long_inc(&unix_nr_socks);
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
+		goto fail;
+
+	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+	if (sk == NULL)
+		goto fail;
+
+	sock_init_data(sock, sk);
+	lockdep_set_class(&sk->sk_receive_queue.lock,
+				&af_unix_sk_receive_queue_lock_key);
+
+	sk->sk_write_space	= unix_write_space;
+	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
+	sk->sk_destruct		= unix_sock_destructor;
+
+	u = unix_sk(sk);
+	u->dentry = NULL;
+	u->mnt	  = NULL;
+	spin_lock_init(&u->lock);
+	atomic_long_set(&u->inflight, 0);
+	INIT_LIST_HEAD(&u->link);
+	mutex_init(&u->readlock); /* single task reading lock */
+	init_waitqueue_head(&u->peer_wait);
+	unix_insert_socket(unix_sockets_unbound, sk);
+
+	local_bh_disable();
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	local_bh_enable();
+	return sk;
+
+fail:
+	atomic_long_dec(&unix_nr_socks);
+	return NULL;
+}
+
+/*
+ * Create callback for the PF_UNIX family: pick the proto_ops matching the
+ * socket type and allocate the sock.
+ *
+ * Returns -EPROTONOSUPPORT for a protocol other than 0 or PF_UNIX,
+ * -ESOCKTNOSUPPORT for an unsupported socket type, -ENOMEM when
+ * unix_create1() fails, 0 on success.
+ */
+static int unix_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
+{
+	const struct proto_ops *ops;
+
+	if (protocol && protocol != PF_UNIX)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	switch (sock->type) {
+	case SOCK_STREAM:
+		ops = &unix_stream_ops;
+		break;
+	case SOCK_RAW:
+		/*
+		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
+		 *	nothing uses it.  Treat it as a datagram socket.
+		 */
+		sock->type = SOCK_DGRAM;
+		/* fall through */
+	case SOCK_DGRAM:
+		ops = &unix_dgram_ops;
+		break;
+	case SOCK_SEQPACKET:
+		ops = &unix_seqpacket_ops;
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+	sock->ops = ops;
+
+	if (unix_create1(net, sock) == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * release callback: detach the sock from @sock and tear it down.  A socket
+ * without a sock is a no-op.  Returns 0 or unix_release_sock()'s result.
+ */
+static int unix_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk != NULL) {
+		sock->sk = NULL;
+		return unix_release_sock(sk, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * Give @sock an automatically chosen abstract name.
+ *
+ * The name is a zero byte followed by five hex digits taken from the
+ * file-wide counter ordernum.  Names are tried in sequence until one is
+ * unused.
+ *
+ * Returns 0 on success or when the socket already has an address, -ENOMEM
+ * on allocation failure, -ENOSPC once more than 0xFFFFF candidates were
+ * found in use.  Runs under u->readlock.
+ */
+static int unix_autobind(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct unix_sock *u = unix_sk(sk);
+	static u32 ordernum = 1;
+	struct unix_address *addr;
+	int err;
+	unsigned int retries = 0;
+
+	mutex_lock(&u->readlock);
+
+	err = 0;
+	if (u->addr)
+		goto out;
+
+	err = -ENOMEM;
+	/* Zeroed, so sun_path[0] is the leading NUL of an abstract name */
+	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
+	if (!addr)
+		goto out;
+
+	addr->name->sun_family = AF_UNIX;
+	atomic_set(&addr->refcnt, 1);
+
+retry:
+	/* len = hex digits + leading NUL + family field */
+	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
+	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
+
+	spin_lock(&unix_table_lock);
+	/* The counter is advanced under the table lock and wraps at 20 bits */
+	ordernum = (ordernum+1)&0xFFFFF;
+
+	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
+				      addr->hash)) {
+		spin_unlock(&unix_table_lock);
+		/*
+		 * __unix_find_socket_byname() may take long time if many names
+		 * are already in use.
+		 */
+		cond_resched();
+		/* Give up if all names seems to be in use. */
+		if (retries++ == 0xFFFFF) {
+			err = -ENOSPC;
+			kfree(addr);
+			goto out;
+		}
+		goto retry;
+	}
+	/* Stored hash includes the socket type, matching the lookup's hash ^ type */
+	addr->hash ^= sk->sk_type;
+
+	/* Move the sock from its current chain to the chain of its new name */
+	__unix_remove_socket(sk);
+	u->addr = addr;
+	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
+	spin_unlock(&unix_table_lock);
+	err = 0;
+
+out:	mutex_unlock(&u->readlock);
+	return err;
+}
+
+/*
+ * Look up the socket named by @sunname.
+ *
+ * A name whose first path byte is non-zero is resolved through the
+ * filesystem (following symlinks).  It requires write permission on the
+ * inode, the inode must be a socket, and the socket bound to it must have
+ * type @type.  Otherwise the name is abstract and is looked up in the hash
+ * table with @len, @type and @hash.
+ *
+ * Returns the socket with a reference held.  On failure returns NULL and
+ * stores the reason in *@error: the path lookup or permission error,
+ * -ECONNREFUSED when nothing is bound there, or -EPROTOTYPE on a type
+ * mismatch.  *@error is not written on success.
+ */
+static struct sock *unix_find_other(struct net *net,
+				    struct sockaddr_un *sunname, int len,
+				    int type, unsigned hash, int *error)
+{
+	struct sock *u;
+	struct path path;
+	int err = 0;
+
+	if (sunname->sun_path[0]) {
+		struct inode *inode;
+		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
+		if (err)
+			goto fail;
+		inode = path.dentry->d_inode;
+		err = inode_permission(inode, MAY_WRITE);
+		if (err)
+			goto put_fail;
+
+		err = -ECONNREFUSED;
+		if (!S_ISSOCK(inode->i_mode))
+			goto put_fail;
+		u = unix_find_socket_byinode(inode);
+		if (!u)
+			goto put_fail;
+
+		/* Only a successful match counts as an access to the file */
+		if (u->sk_type == type)
+			touch_atime(path.mnt, path.dentry);
+
+		path_put(&path);
+
+		err = -EPROTOTYPE;
+		if (u->sk_type != type) {
+			sock_put(u);
+			goto fail;
+		}
+	} else {
+		err = -ECONNREFUSED;
+		u = unix_find_socket_byname(net, sunname, len, type, hash);
+		if (u) {
+			struct dentry *dentry;
+			dentry = unix_sk(u)->dentry;
+			if (dentry)
+				touch_atime(unix_sk(u)->mnt, dentry);
+		} else
+			goto fail;
+	}
+	return u;
+
+put_fail:
+	path_put(&path);
+fail:
+	*error = err;
+	return NULL;
+}
+
+
+/*
+ * bind(2) for unix sockets.
+ *
+ * @uaddr/@addr_len describe a struct sockaddr_un:
+ *   - exactly the family field (addr_len == sizeof(short)): autobind to a
+ *     generated abstract name;
+ *   - first path byte non-zero: filesystem name; a socket inode is created
+ *     with mknod in the parent directory and the sock is hashed by inode
+ *     number;
+ *   - first path byte zero: abstract name; hashed by name, must be unique
+ *     for this socket type in the network namespace.
+ *
+ * Returns 0 on success; -EINVAL for a short or non-AF_UNIX address, a bad
+ * name, or a socket that is already bound; -ENOMEM on allocation failure;
+ * -EADDRINUSE when the name is taken (including -EEXIST from the filesystem);
+ * or another error from the path lookup / mknod steps.
+ *
+ * The bind itself runs under u->readlock; the hash table is updated under
+ * unix_table_lock.
+ */
+static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct unix_sock *u = unix_sk(sk);
+	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
+	struct dentry *dentry = NULL;
+	struct nameidata nd;
+	int err;
+	/* unix_mkname() only fills this in for abstract names */
+	unsigned hash = 0;
+	struct unix_address *addr;
+	struct hlist_head *list;
+
+	err = -EINVAL;
+	/* Do not read sun_family unless the caller really supplied it */
+	if (addr_len < (int)sizeof(sunaddr->sun_family) ||
+	    sunaddr->sun_family != AF_UNIX)
+		goto out;
+
+	if (addr_len == sizeof(short)) {
+		err = unix_autobind(sock);
+		goto out;
+	}
+
+	err = unix_mkname(sunaddr, addr_len, &hash);
+	if (err < 0)
+		goto out;
+	addr_len = err;
+
+	mutex_lock(&u->readlock);
+
+	/* A socket can be bound only once */
+	err = -EINVAL;
+	if (u->addr)
+		goto out_up;
+
+	err = -ENOMEM;
+	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
+	if (!addr)
+		goto out_up;
+
+	memcpy(addr->name, sunaddr, addr_len);
+	addr->len = addr_len;
+	addr->hash = hash ^ sk->sk_type;
+	atomic_set(&addr->refcnt, 1);
+
+	if (sunaddr->sun_path[0]) {
+		unsigned int mode;
+		err = 0;
+		/*
+		 * Get the parent directory, calculate the hash for last
+		 * component.
+		 */
+		err = kern_path_parent(sunaddr->sun_path, &nd);
+		if (err)
+			goto out_mknod_parent;
+
+		dentry = lookup_create(&nd, 0);
+		err = PTR_ERR(dentry);
+		if (IS_ERR(dentry))
+			goto out_mknod_unlock;
+
+		/*
+		 * All right, let's create it.
+		 */
+		mode = S_IFSOCK |
+		       (SOCK_INODE(sock)->i_mode & ~current_umask());
+		err = mnt_want_write(nd.path.mnt);
+		if (err)
+			goto out_mknod_dput;
+		err = security_path_mknod(&nd.path, dentry, mode, 0);
+		if (err)
+			goto out_mknod_drop_write;
+		err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0);
+out_mknod_drop_write:
+		mnt_drop_write(nd.path.mnt);
+		if (err)
+			goto out_mknod_dput;
+		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+		/* Swap the parent dentry in nd.path for the new socket's */
+		dput(nd.path.dentry);
+		nd.path.dentry = dentry;
+
+		/* Marks the address as a filesystem (non-abstract) name */
+		addr->hash = UNIX_HASH_SIZE;
+	}
+
+	spin_lock(&unix_table_lock);
+
+	if (!sunaddr->sun_path[0]) {
+		err = -EADDRINUSE;
+		if (__unix_find_socket_byname(net, sunaddr, addr_len,
+					      sk->sk_type, hash)) {
+			unix_release_addr(addr);
+			goto out_unlock;
+		}
+
+		list = &unix_socket_table[addr->hash];
+	} else {
+		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
+		u->dentry = nd.path.dentry;
+		u->mnt    = nd.path.mnt;
+	}
+
+	/* Move the sock from the unbound chain to the chain of its name */
+	err = 0;
+	__unix_remove_socket(sk);
+	u->addr = addr;
+	__unix_insert_socket(list, sk);
+
+out_unlock:
+	spin_unlock(&unix_table_lock);
+out_up:
+	mutex_unlock(&u->readlock);
+out:
+	return err;
+
+out_mknod_dput:
+	dput(dentry);
+out_mknod_unlock:
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+	path_put(&nd.path);
+out_mknod_parent:
+	if (err == -EEXIST)
+		err = -EADDRINUSE;
+	unix_release_addr(addr);
+	goto out_up;
+}
+
+/*
+ * Take the state locks of @sk1 and @sk2.  When @sk2 is NULL or the same
+ * sock as @sk1 only @sk1 is locked.  Otherwise the sock at the lower address
+ * is locked first and the other one with the nested lock variant, so every
+ * caller uses the same order for a given pair.
+ */
+static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
+{
+	struct sock *first, *second;
+
+	if (unlikely(sk1 == sk2) || !sk2) {
+		unix_state_lock(sk1);
+		return;
+	}
+
+	if (sk1 < sk2) {
+		first = sk1;
+		second = sk2;
+	} else {
+		first = sk2;
+		second = sk1;
+	}
+
+	unix_state_lock(first);
+	unix_state_lock_nested(second);
+}
+
+/*
+ * Counterpart of unix_state_double_lock(): release @sk1's state lock and,
+ * when @sk2 is a distinct non-NULL sock, @sk2's as well.
+ */
+static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
+{
+	unix_state_unlock(sk1);
+
+	if (sk2 != NULL && sk1 != sk2)
+		unix_state_unlock(sk2);
+}
+
+/*
+ * connect(2) for datagram sockets.
+ *
+ * With an AF_UNSPEC address the current association is dissolved.  Otherwise
+ * the named socket is looked up and becomes the new peer, provided it either
+ * has no peer or is already connected to us and the security hook agrees.
+ * When the peer actually changes, packets queued from the old peer are
+ * discarded (see unix_dgram_disconnected()).
+ *
+ * Returns 0 on success; -EINVAL when @alen does not even cover the family
+ * field or the name is malformed; -EPERM when the target is connected to
+ * someone else; or an error from autobind, lookup or the security hook.
+ */
+static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
+			      int alen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
+	struct sock *other;
+	unsigned hash;
+	int err;
+
+	/*
+	 * Do not read sa_family unless the caller really supplied it:
+	 * otherwise leftover bytes that happen to equal AF_UNSPEC would
+	 * silently disconnect the socket.
+	 */
+	err = -EINVAL;
+	if (alen < (int)sizeof(addr->sa_family))
+		goto out;
+
+	if (addr->sa_family != AF_UNSPEC) {
+		err = unix_mkname(sunaddr, alen, &hash);
+		if (err < 0)
+			goto out;
+		alen = err;
+
+		/* Credential passing needs a name on our side */
+		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
+		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
+			goto out;
+
+restart:
+		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
+		if (!other)
+			goto out;
+
+		unix_state_double_lock(sk, other);
+
+		/* Apparently VFS overslept socket death. Retry. */
+		if (sock_flag(other, SOCK_DEAD)) {
+			unix_state_double_unlock(sk, other);
+			sock_put(other);
+			goto restart;
+		}
+
+		err = -EPERM;
+		if (!unix_may_send(sk, other))
+			goto out_unlock;
+
+		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
+		if (err)
+			goto out_unlock;
+
+	} else {
+		/*
+		 *	1003.1g breaking connected state with AF_UNSPEC
+		 */
+		other = NULL;
+		unix_state_double_lock(sk, other);
+	}
+
+	/*
+	 * If it was connected, reconnect.  The reference obtained from
+	 * unix_find_other() is kept as the peer reference.
+	 */
+	if (unix_peer(sk)) {
+		struct sock *old_peer = unix_peer(sk);
+		unix_peer(sk) = other;
+		unix_state_double_unlock(sk, other);
+
+		if (other != old_peer)
+			unix_dgram_disconnected(sk, old_peer);
+		sock_put(old_peer);
+	} else {
+		unix_peer(sk) = other;
+		unix_state_double_unlock(sk, other);
+	}
+	return 0;
+
+out_unlock:
+	unix_state_double_unlock(sk, other);
+	sock_put(other);
+out:
+	return err;
+}
+
+/*
+ * Sleep until @other may have room in its receive queue, or @timeo expires.
+ *
+ * Expects @other's state lock to be held on entry (as in
+ * unix_stream_connect()) and releases it before sleeping; it is NOT
+ * re-acquired.  The sleep is skipped when @other is dead, has receive
+ * shut down, or its queue is no longer full.  Returns the remaining timeout.
+ */
+static long unix_wait_for_peer(struct sock *other, long timeo)
+{
+	struct unix_sock *u = unix_sk(other);
+	int sched;
+	DEFINE_WAIT(wait);
+
+	/* Get on the wait queue before testing, so a wake-up is not missed */
+	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
+
+	sched = !sock_flag(other, SOCK_DEAD) &&
+		!(other->sk_shutdown & RCV_SHUTDOWN) &&
+		unix_recvq_full(other);
+
+	unix_state_unlock(other);
+
+	if (sched)
+		timeo = schedule_timeout(timeo);
+
+	finish_wait(&u->peer_wait, &wait);
+	return timeo;
+}
+
+/*
+ * connect(2) for stream and seqpacket sockets.
+ *
+ * A new sock (newsk) is created to become the server-side end of the
+ * connection.  It is wired to @sock's sock as its peer and handed to the
+ * listener by queueing a one-byte skb, owned by newsk, on the listener's
+ * receive queue; unix_accept() later picks it up from there.
+ *
+ * Returns 0 on success.  Errors set in this function: -ENOMEM (allocation),
+ * -ECONNREFUSED (target not listening or receive-shutdown), -EAGAIN
+ * (backlog full and no timeout left), the sock_intr_errno() value when
+ * interrupted by a signal, -EISCONN / -EINVAL (our own state), plus whatever
+ * unix_mkname(), unix_autobind(), unix_find_other() or the security hook
+ * report.
+ */
+static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			       int addr_len, int flags)
+{
+	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
+	struct sock *newsk = NULL;
+	struct sock *other = NULL;
+	struct sk_buff *skb = NULL;
+	unsigned hash;
+	int st;
+	int err;
+	long timeo;
+
+	err = unix_mkname(sunaddr, addr_len, &hash);
+	if (err < 0)
+		goto out;
+	addr_len = err;
+
+	/* Credential passing needs a name on our side */
+	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
+	    (err = unix_autobind(sock)) != 0)
+		goto out;
+
+	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+	/* First of all allocate resources.
+	   If we will make it after state is locked,
+	   we will have to recheck all again in any case.
+	 */
+
+	err = -ENOMEM;
+
+	/* create new sock for complete connection */
+	newsk = unix_create1(sock_net(sk), NULL);
+	if (newsk == NULL)
+		goto out;
+
+	/* Allocate skb for sending to listening sock */
+	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+	if (skb == NULL)
+		goto out;
+
+restart:
+	/*  Find listening sock. */
+	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
+	if (!other)
+		goto out;
+
+	/* Latch state of peer */
+	unix_state_lock(other);
+
+	/* Apparently VFS overslept socket death. Retry. */
+	if (sock_flag(other, SOCK_DEAD)) {
+		unix_state_unlock(other);
+		sock_put(other);
+		goto restart;
+	}
+
+	err = -ECONNREFUSED;
+	if (other->sk_state != TCP_LISTEN)
+		goto out_unlock;
+	if (other->sk_shutdown & RCV_SHUTDOWN)
+		goto out_unlock;
+
+	if (unix_recvq_full(other)) {
+		err = -EAGAIN;
+		if (!timeo)
+			goto out_unlock;
+
+		/* Drops other's state lock; the reference on other is kept */
+		timeo = unix_wait_for_peer(other, timeo);
+
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			goto out;
+		sock_put(other);
+		goto restart;
+	}
+
+	/* Latch our state.
+
+	   It is tricky place. We need to grab our state lock and cannot
+	   drop lock on peer. It is dangerous because deadlock is
+	   possible. Connect to self case and simultaneous
+	   attempt to connect are eliminated by checking socket
+	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
+	   check this before attempt to grab lock.
+
+	   Well, and we have to recheck the state after socket locked.
+	 */
+	st = sk->sk_state;
+
+	switch (st) {
+	case TCP_CLOSE:
+		/* This is ok... continue with connect */
+		break;
+	case TCP_ESTABLISHED:
+		/* Socket is already connected */
+		err = -EISCONN;
+		goto out_unlock;
+	default:
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	unix_state_lock_nested(sk);
+
+	/* Our state changed while we were not holding our lock: start over */
+	if (sk->sk_state != st) {
+		unix_state_unlock(sk);
+		unix_state_unlock(other);
+		sock_put(other);
+		goto restart;
+	}
+
+	err = security_unix_stream_connect(sk, other, newsk);
+	if (err) {
+		unix_state_unlock(sk);
+		goto out_unlock;
+	}
+
+	/* The way is open! Fastly set all the necessary fields... */
+
+	/* newsk's peer pointer owns this reference on sk */
+	sock_hold(sk);
+	unix_peer(newsk)	= sk;
+	newsk->sk_state		= TCP_ESTABLISHED;
+	newsk->sk_type		= sk->sk_type;
+	init_peercred(newsk);
+	newu = unix_sk(newsk);
+	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
+	otheru = unix_sk(other);
+
+	/* copy address information from listening to new sock*/
+	if (otheru->addr) {
+		atomic_inc(&otheru->addr->refcnt);
+		newu->addr = otheru->addr;
+	}
+	if (otheru->dentry) {
+		newu->dentry	= dget(otheru->dentry);
+		newu->mnt	= mntget(otheru->mnt);
+	}
+
+	/* Set credentials */
+	copy_peercred(sk, other);
+
+	sock->state	= SS_CONNECTED;
+	sk->sk_state	= TCP_ESTABLISHED;
+	/* sk's peer pointer owns this reference on newsk */
+	sock_hold(newsk);
+
+	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
+	unix_peer(sk)	= newsk;
+
+	unix_state_unlock(sk);
+
+	/* take ten and send info to listening sock */
+	spin_lock(&other->sk_receive_queue.lock);
+	__skb_queue_tail(&other->sk_receive_queue, skb);
+	spin_unlock(&other->sk_receive_queue.lock);
+	unix_state_unlock(other);
+	other->sk_data_ready(other, 0);
+	sock_put(other);
+	return 0;
+
+out_unlock:
+	if (other)
+		unix_state_unlock(other);
+
+out:
+	/* Undo the up-front allocations and drop the lookup reference */
+	kfree_skb(skb);
+	if (newsk)
+		unix_release_sock(newsk, 0);
+	if (other)
+		sock_put(other);
+	return err;
+}
+
+/*
+ * socketpair(2): make the socks of @socka and @sockb each other's peer.
+ * Each sock takes a reference on the other and gets the current task's
+ * credentials as peer credentials.  Every type except SOCK_DGRAM is also
+ * moved to the established/connected state.  Always returns 0.
+ */
+static int unix_socketpair(struct socket *socka, struct socket *sockb)
+{
+	struct sock *first = socka->sk;
+	struct sock *second = sockb->sk;
+
+	/* Join our sockets back to back */
+	sock_hold(first);
+	sock_hold(second);
+	unix_peer(first) = second;
+	unix_peer(second) = first;
+	init_peercred(first);
+	init_peercred(second);
+
+	if (first->sk_type == SOCK_DGRAM)
+		return 0;
+
+	first->sk_state = TCP_ESTABLISHED;
+	second->sk_state = TCP_ESTABLISHED;
+	socka->state  = SS_CONNECTED;
+	sockb->state  = SS_CONNECTED;
+	return 0;
+}
+
+/*
+ * accept() for SOCK_STREAM / SOCK_SEQPACKET listeners.
+ *
+ * Takes one queued skb off the listener's receive queue; the sock attached
+ * to that skb is grafted onto @newsock.  Returns 0 on success,
+ * -EOPNOTSUPP for other socket types, -EINVAL if the socket is not
+ * listening, or the error reported by skb_recv_datagram().
+ */
+static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *listener = sock->sk;
+	struct sock *child;
+	struct sk_buff *req;
+	int err;
+
+	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
+		return -EOPNOTSUPP;
+
+	if (listener->sk_state != TCP_LISTEN)
+		return -EINVAL;
+
+	/*
+	 * Once in TCP_LISTEN the state does not change (for now...), so the
+	 * check above needs no lock.
+	 */
+	err = -EINVAL;
+	req = skb_recv_datagram(listener, 0, flags&O_NONBLOCK, &err);
+	if (!req) {
+		/* NULL with no error code means receive shutdown. */
+		return err == 0 ? -EINVAL : err;
+	}
+
+	child = req->sk;
+	skb_free_datagram(listener, req);
+	wake_up_interruptible(&unix_sk(listener)->peer_wait);
+
+	/* Attach the accepted sock to the new socket. */
+	unix_state_lock(child);
+	newsock->state = SS_CONNECTED;
+	sock_graft(child, newsock);
+	unix_state_unlock(child);
+	return 0;
+}
+
+
+/*
+ * getsockname()/getpeername() backend.
+ *
+ * With @peer set the address of the connected peer is reported (or
+ * -ENOTCONN if there is none); otherwise the socket's own address.  A sock
+ * without a bound address yields just the AF_UNIX family and an empty
+ * path.  Returns 0 on success.
+ */
+static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
+{
+	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
+	struct sock *sk = sock->sk;
+	struct unix_address *bound;
+
+	/* Either way we end up holding one reference on the sock we report. */
+	if (!peer) {
+		sock_hold(sk);
+	} else {
+		sk = unix_peer_get(sk);
+		if (!sk)
+			return -ENOTCONN;
+	}
+
+	unix_state_lock(sk);
+	bound = unix_sk(sk)->addr;
+	if (bound) {
+		*uaddr_len = bound->len;
+		memcpy(sunaddr, bound->name, *uaddr_len);
+	} else {
+		sunaddr->sun_family = AF_UNIX;
+		sunaddr->sun_path[0] = 0;
+		*uaddr_len = sizeof(short);
+	}
+	unix_state_unlock(sk);
+
+	sock_put(sk);
+	return 0;
+}
+
+/*
+ * Move the passed-file list from @skb into @scm and call
+ * unix_notinflight() on each file, walking from the last entry to the
+ * first.  The caller must ensure UNIXCB(skb).fp is non-NULL.
+ */
+static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	int idx;
+
+	scm->fp = UNIXCB(skb).fp;
+	UNIXCB(skb).fp = NULL;
+
+	idx = scm->fp->count;
+	while (--idx >= 0)
+		unix_notinflight(scm->fp->fp[idx]);
+}
+
+/*
+ * skb destructor (installed by unix_scm_to_skb()): gather the pid, cred
+ * and any attached files into a temporary cookie, destroy it, then run
+ * sock_wfree() on the skb.
+ */
+static void unix_destruct_scm(struct sk_buff *skb)
+{
+	struct scm_cookie cookie;
+
+	memset(&cookie, 0, sizeof(cookie));
+	cookie.pid = UNIXCB(skb).pid;
+	cookie.cred = UNIXCB(skb).cred;
+	if (UNIXCB(skb).fp != NULL)
+		unix_detach_fds(&cookie, skb);
+
+	/* Alas, it calls VFS */
+	/* So fscking what? fput() had been SMP-safe since the last Summer */
+	scm_destroy(&cookie);
+	sock_wfree(skb);
+}
+
+/* Deepest recursion_level accepted among the AF_UNIX sockets being passed. */
+#define MAX_RECURSION_LEVEL 4
+
+/*
+ * Attach a duplicate of the files in @scm to @skb.
+ *
+ * Returns the highest recursion_level found among the passed AF_UNIX
+ * sockets (0 if there are none), -ETOOMANYREFS if that level exceeds
+ * MAX_RECURSION_LEVEL, or -ENOMEM if the file list cannot be duplicated.
+ */
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	unsigned char depth = 0;
+	int nr_unix = 0;
+	int i;
+
+	for (i = scm->fp->count - 1; i >= 0; i--) {
+		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
+
+		if (!sk)
+			continue;
+
+		nr_unix++;
+		depth = max(depth, unix_sk(sk)->recursion_level);
+	}
+	if (unlikely(depth > MAX_RECURSION_LEVEL))
+		return -ETOOMANYREFS;
+
+	/*
+	 * Need to duplicate file references for the sake of garbage
+	 * collection.  Otherwise a socket in the fps might become a
+	 * candidate for GC while the skb is not yet queued.
+	 */
+	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+	if (UNIXCB(skb).fp == NULL)
+		return -ENOMEM;
+
+	/* In-flight accounting is only done when a unix socket is passed. */
+	if (nr_unix == 0)
+		return depth;
+
+	for (i = scm->fp->count - 1; i >= 0; i--)
+		unix_inflight(scm->fp->fp[i]);
+	return depth;
+}
+
+/*
+ * Stamp @skb with the sender's pid and cred (taking a reference on each)
+ * and, when @send_fds is set and files are present, attach them.
+ *
+ * Returns the value of unix_attach_fds() when files were attached,
+ * otherwise 0.  The destructor is installed even when attaching fails.
+ */
+static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+{
+	int ret;
+
+	UNIXCB(skb).pid = get_pid(scm->pid);
+	UNIXCB(skb).cred = get_cred(scm->cred);
+	UNIXCB(skb).fp = NULL;
+
+	if (send_fds && scm->fp)
+		ret = unix_attach_fds(scm, skb);
+	else
+		ret = 0;
+
+	skb->destructor = unix_destruct_scm;
+	return ret;
+}
+
+/*
+ *	Send AF_UNIX data.
+ */
+
+/*
+ * Send one datagram on an AF_UNIX socket (also used for SOCK_SEQPACKET via
+ * unix_seqpacket_sendmsg()).
+ *
+ * The destination is either the address in msg->msg_name or, when no name
+ * is given, the connected peer.  The whole message is copied into a single
+ * skb and queued on the receiver.
+ *
+ * Returns @len on success (also when the receiver's socket filter drops
+ * the packet), or a negative errno: -EOPNOTSUPP for MSG_OOB, -ENOTCONN
+ * without a name or peer, -EMSGSIZE when @len exceeds sk_sndbuf - 32,
+ * -EPERM, -EPIPE, -ECONNREFUSED, -ECONNRESET, -EAGAIN, and others
+ * propagated from helpers.
+ */
+static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
+			      struct msghdr *msg, size_t len)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
+	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sk);
+	struct unix_sock *u = unix_sk(sk);
+	struct sockaddr_un *sunaddr = msg->msg_name;
+	struct sock *other = NULL;
+	int namelen = 0; /* fake GCC */
+	int err;
+	unsigned hash;
+	struct sk_buff *skb;
+	long timeo;
+	struct scm_cookie tmp_scm;
+	int max_level;
+
+	if (NULL == siocb->scm)
+		siocb->scm = &tmp_scm;
+	wait_for_unix_gc();
+	err = scm_send(sock, msg, siocb->scm);
+	if (err < 0)
+		return err;
+
+	err = -EOPNOTSUPP;
+	if (msg->msg_flags&MSG_OOB)
+		goto out;
+
+	if (msg->msg_namelen) {
+		/* Explicit destination: validate it; the lookup happens later. */
+		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
+		if (err < 0)
+			goto out;
+		namelen = err;
+	} else {
+		/* No destination given: use the connected peer (reference taken). */
+		sunaddr = NULL;
+		err = -ENOTCONN;
+		other = unix_peer_get(sk);
+		if (!other)
+			goto out;
+	}
+
+	/* With SOCK_PASSCRED set, an unbound sender is autobound first. */
+	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
+	    && (err = unix_autobind(sock)) != 0)
+		goto out;
+
+	err = -EMSGSIZE;
+	if (len > sk->sk_sndbuf - 32)
+		goto out;
+
+	skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto out;
+
+	err = unix_scm_to_skb(siocb->scm, skb, true);
+	if (err < 0)
+		goto out_free;
+	/* One more than the deepest level among the passed unix sockets. */
+	max_level = err + 1;
+	unix_get_secdata(siocb->scm, skb);
+
+	skb_reset_transport_header(skb);
+	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (err)
+		goto out_free;
+
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+	/*
+	 * Re-entered with other == NULL after a dead receiver was dropped,
+	 * or with other still held after waiting for queue space.
+	 */
+restart:
+	if (!other) {
+		err = -ECONNRESET;
+		if (sunaddr == NULL)
+			goto out_free;
+
+		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
+					hash, &err);
+		if (other == NULL)
+			goto out_free;
+	}
+
+	if (sk_filter(other, skb) < 0) {
+		/* Toss the packet but do not return any error to the sender */
+		err = len;
+		goto out_free;
+	}
+
+	unix_state_lock(other);
+	err = -EPERM;
+	if (!unix_may_send(sk, other))
+		goto out_unlock;
+
+	if (sock_flag(other, SOCK_DEAD)) {
+		/*
+		 *	Check with 1003.1g - what should
+		 *	datagram error
+		 */
+		unix_state_unlock(other);
+		sock_put(other);
+
+		err = 0;
+		unix_state_lock(sk);
+		if (unix_peer(sk) == other) {
+			/* The dead sock was our connected peer: disconnect from it. */
+			unix_peer(sk) = NULL;
+			unix_state_unlock(sk);
+
+			unix_dgram_disconnected(sk, other);
+			sock_put(other);
+			err = -ECONNREFUSED;
+		} else {
+			unix_state_unlock(sk);
+		}
+
+		other = NULL;
+		if (err)
+			goto out_free;
+		/* Not our peer: retry the lookup by address. */
+		goto restart;
+	}
+
+	err = -EPIPE;
+	if (other->sk_shutdown & RCV_SHUTDOWN)
+		goto out_unlock;
+
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
+		if (err)
+			goto out_unlock;
+	}
+
+	/* Receiver's queue is full and it is not connected back to us. */
+	if (unix_peer(other) != sk && unix_recvq_full(other)) {
+		if (!timeo) {
+			err = -EAGAIN;
+			goto out_unlock;
+		}
+
+		/*
+		 * NOTE(review): no unix_state_unlock(other) follows on this
+		 * path, so unix_wait_for_peer() presumably drops the state
+		 * lock taken above - confirm against its definition.
+		 */
+		timeo = unix_wait_for_peer(other, timeo);
+
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			goto out_free;
+
+		goto restart;
+	}
+
+	if (sock_flag(other, SOCK_RCVTSTAMP))
+		__net_timestamp(skb);
+	skb_queue_tail(&other->sk_receive_queue, skb);
+	if (max_level > unix_sk(other)->recursion_level)
+		unix_sk(other)->recursion_level = max_level;
+	unix_state_unlock(other);
+	other->sk_data_ready(other, len);
+	sock_put(other);
+	scm_destroy(siocb->scm);
+	return len;
+
+out_unlock:
+	unix_state_unlock(other);
+out_free:
+	kfree_skb(skb);
+out:
+	if (other)
+		sock_put(other);
+	scm_destroy(siocb->scm);
+	return err;
+}
+
+
+/*
+ * Send data on a connected SOCK_STREAM AF_UNIX socket.
+ *
+ * The payload is split into skbs of at most half the send buffer minus 64
+ * bytes (and at most SKB_MAX_ALLOC) and each is queued directly on the
+ * peer's receive queue.  Passed file descriptors travel with the first
+ * skb only.
+ *
+ * Returns the number of bytes queued if any were, otherwise a negative
+ * errno: -EOPNOTSUPP for MSG_OOB, -EISCONN or -EOPNOTSUPP when a
+ * destination address is supplied, -ENOTCONN without a peer, -EPIPE when
+ * either side is shut down (raising SIGPIPE unless MSG_NOSIGNAL is set and
+ * nothing was sent yet).
+ */
+static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
+			       struct msghdr *msg, size_t len)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
+	struct sock *sk = sock->sk;
+	struct sock *other = NULL;
+	int err, size;
+	struct sk_buff *skb;
+	int sent = 0;
+	struct scm_cookie tmp_scm;
+	bool fds_sent = false;
+	int max_level;
+
+	if (NULL == siocb->scm)
+		siocb->scm = &tmp_scm;
+	wait_for_unix_gc();
+	err = scm_send(sock, msg, siocb->scm);
+	if (err < 0)
+		return err;
+
+	err = -EOPNOTSUPP;
+	if (msg->msg_flags&MSG_OOB)
+		goto out_err;
+
+	if (msg->msg_namelen) {
+		/* A destination address is never accepted on a stream socket. */
+		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
+		goto out_err;
+	} else {
+		err = -ENOTCONN;
+		/* Peer pointer is used without taking an extra reference here. */
+		other = unix_peer(sk);
+		if (!other)
+			goto out_err;
+	}
+
+	if (sk->sk_shutdown & SEND_SHUTDOWN)
+		goto pipe_err;
+
+	while (sent < len) {
+		/*
+		 *	Optimisation for the fact that under 0.01% of X
+		 *	messages typically need breaking up.
+		 */
+
+		size = len-sent;
+
+		/* Keep two messages in the pipe so it schedules better */
+		if (size > ((sk->sk_sndbuf >> 1) - 64))
+			size = (sk->sk_sndbuf >> 1) - 64;
+
+		if (size > SKB_MAX_ALLOC)
+			size = SKB_MAX_ALLOC;
+
+		/*
+		 *	Grab a buffer
+		 */
+
+		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
+					  &err);
+
+		if (skb == NULL)
+			goto out_err;
+
+		/*
+		 *	If you pass two values to the sock_alloc_send_skb
+		 *	it tries to grab the large buffer with GFP_NOFS
+		 *	(which can fail easily), and if it fails grab the
+		 *	fallback size buffer which is under a page and will
+		 *	succeed. [Alan]
+		 */
+		size = min_t(int, size, skb_tailroom(skb));
+
+
+		/* Only send the fds in the first buffer */
+		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
+		if (err < 0) {
+			kfree_skb(skb);
+			goto out_err;
+		}
+		/* One more than the deepest level among the passed unix sockets. */
+		max_level = err + 1;
+		fds_sent = true;
+
+		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
+		if (err) {
+			kfree_skb(skb);
+			goto out_err;
+		}
+
+		unix_state_lock(other);
+
+		/* Peer gone or no longer receiving: drop this skb and report EPIPE. */
+		if (sock_flag(other, SOCK_DEAD) ||
+		    (other->sk_shutdown & RCV_SHUTDOWN))
+			goto pipe_err_free;
+
+		skb_queue_tail(&other->sk_receive_queue, skb);
+		if (max_level > unix_sk(other)->recursion_level)
+			unix_sk(other)->recursion_level = max_level;
+		unix_state_unlock(other);
+		other->sk_data_ready(other, size);
+		sent += size;
+	}
+
+	scm_destroy(siocb->scm);
+	siocb->scm = NULL;
+
+	return sent;
+
+pipe_err_free:
+	unix_state_unlock(other);
+	kfree_skb(skb);
+pipe_err:
+	/* SIGPIPE is raised only if nothing at all was sent. */
+	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+		send_sig(SIGPIPE, current, 0);
+	err = -EPIPE;
+out_err:
+	scm_destroy(siocb->scm);
+	siocb->scm = NULL;
+	/* A partial send reports the byte count rather than the error. */
+	return sent ? : err;
+}
+
+/*
+ * SOCK_SEQPACKET send: report any pending socket error, require an
+ * established connection, discard any caller-supplied destination address
+ * and hand the message to the datagram send path.
+ */
+static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
+				  struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	int pending = sock_error(sk);
+
+	if (pending)
+		return pending;
+
+	if (sk->sk_state != TCP_ESTABLISHED)
+		return -ENOTCONN;
+
+	/* Clearing the length makes the datagram path use the peer. */
+	if (msg->msg_namelen != 0)
+		msg->msg_namelen = 0;
+
+	return unix_dgram_sendmsg(kiocb, sock, msg, len);
+}
+
+/*
+ * SOCK_SEQPACKET receive: only valid on an established connection
+ * (-ENOTCONN otherwise); the actual work is done by the datagram path.
+ */
+static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
+			      struct msghdr *msg, size_t size,
+			      int flags)
+{
+	if (sock->sk->sk_state == TCP_ESTABLISHED)
+		return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
+
+	return -ENOTCONN;
+}
+
+/*
+ * Copy the bound address of @sk into msg->msg_name and set
+ * msg->msg_namelen accordingly; a sock without an address yields a zero
+ * length and leaves msg_name untouched.
+ */
+static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
+{
+	struct unix_address *bound = unix_sk(sk)->addr;
+
+	if (!bound) {
+		msg->msg_namelen = 0;
+		return;
+	}
+
+	msg->msg_namelen = bound->len;
+	memcpy(msg->msg_name, bound->name, bound->len);
+}
+
+/*
+ * Receive one datagram (also used for SOCK_SEQPACKET via
+ * unix_seqpacket_recvmsg()).
+ *
+ * Copies at most @size bytes of the next queued skb to the caller, sets
+ * MSG_TRUNC if the datagram was longer, and delivers the sender's
+ * address, credentials, security data and passed files through @msg.
+ *
+ * Returns the number of bytes copied, 0 for EOF on a shut-down
+ * non-blocking SEQPACKET socket, or a negative errno (-EOPNOTSUPP for
+ * MSG_OOB, others from the helpers).
+ */
+static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
+			      struct msghdr *msg, size_t size,
+			      int flags)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
+	struct scm_cookie tmp_scm;
+	struct sock *sk = sock->sk;
+	struct unix_sock *u = unix_sk(sk);
+	int noblock = flags & MSG_DONTWAIT;
+	struct sk_buff *skb;
+	int err;
+
+	err = -EOPNOTSUPP;
+	if (flags&MSG_OOB)
+		goto out;
+
+	msg->msg_namelen = 0;
+
+	/* readlock is held across the dequeue and copy below. */
+	err = mutex_lock_interruptible(&u->readlock);
+	if (err) {
+		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
+		goto out;
+	}
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb) {
+		unix_state_lock(sk);
+		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
+		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
+		    (sk->sk_shutdown & RCV_SHUTDOWN))
+			err = 0;
+		unix_state_unlock(sk);
+		goto out_unlock;
+	}
+
+	/* A datagram left the queue: wake senders waiting on peer_wait. */
+	wake_up_interruptible_sync_poll(&u->peer_wait,
+					POLLOUT | POLLWRNORM | POLLWRBAND);
+
+	if (msg->msg_name)
+		unix_copy_addr(msg, skb->sk);
+
+	/* Clamp to the datagram length; flag truncation if the buffer is short. */
+	if (size > skb->len)
+		size = skb->len;
+	else if (size < skb->len)
+		msg->msg_flags |= MSG_TRUNC;
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
+	if (err)
+		goto out_free;
+
+	if (sock_flag(sk, SOCK_RCVTSTAMP))
+		__sock_recv_timestamp(msg, sk, skb);
+
+	if (!siocb->scm) {
+		siocb->scm = &tmp_scm;
+		memset(&tmp_scm, 0, sizeof(tmp_scm));
+	}
+	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
+	unix_set_secdata(siocb->scm, skb);
+
+	if (!(flags & MSG_PEEK)) {
+		/* Real read: the file list moves from the skb to the cookie. */
+		if (UNIXCB(skb).fp)
+			unix_detach_fds(siocb->scm, skb);
+	} else {
+		/* It is questionable: on PEEK we could:
+		   - do not return fds - good, but too simple 8)
+		   - return fds, and do not return them on read (old strategy,
+		     apparently wrong)
+		   - clone fds (I chose it for now, it is the most universal
+		     solution)
+
+		   POSIX 1003.1g does not actually define this clearly
+		   at all. POSIX 1003.1g doesn't define a lot of things
+		   clearly however!
+
+		*/
+		if (UNIXCB(skb).fp)
+			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+	}
+	err = size;
+
+	scm_recv(sock, msg, siocb->scm, flags);
+
+out_free:
+	skb_free_datagram(sk, skb);
+out_unlock:
+	mutex_unlock(&u->readlock);
+out:
+	return err;
+}
+
+/*
+ *	Sleep until data has arrived, but check for races.
+ */
+
+/*
+ * Block until the receive queue of @sk is non-empty, an error is pending,
+ * receiving has been shut down, a signal arrives, or @timeo runs out.
+ *
+ * Returns the remaining timeout as reported by schedule_timeout().
+ */
+static long unix_stream_data_wait(struct sock *sk, long timeo)
+{
+	DEFINE_WAIT(wait);
+
+	unix_state_lock(sk);
+
+	for (;;) {
+		/* Join the wait queue before testing, so a wakeup is not missed. */
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    sk->sk_err ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current) ||
+		    !timeo)
+			break;
+
+		/* The state lock is released only while actually sleeping. */
+		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		unix_state_unlock(sk);
+		timeo = schedule_timeout(timeo);
+		unix_state_lock(sk);
+		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	}
+
+	finish_wait(sk_sleep(sk), &wait);
+	unix_state_unlock(sk);
+	return timeo;
+}
+
+
+
+/*
+ * Receive data from a connected SOCK_STREAM AF_UNIX socket.
+ *
+ * Dequeues skbs one at a time and copies them to the caller until @size
+ * bytes were copied, the low-water target is met on an empty queue, or a
+ * boundary is hit: data from a different writer (pid/cred mismatch), an
+ * skb that carried passed files, or MSG_PEEK (which stops after one skb).
+ *
+ * Returns the number of bytes copied if non-zero, otherwise the error
+ * code (-EINVAL if not established, -EOPNOTSUPP for MSG_OOB, -EAGAIN when
+ * nothing is available and the timeout is zero, -EFAULT on a failed copy)
+ * or 0 at end of stream.
+ */
+static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t size,
+			       int flags)
+{
+	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
+	struct scm_cookie tmp_scm;
+	struct sock *sk = sock->sk;
+	struct unix_sock *u = unix_sk(sk);
+	struct sockaddr_un *sunaddr = msg->msg_name;
+	int copied = 0;
+	int check_creds = 0;
+	int target;
+	int err = 0;
+	long timeo;
+
+	err = -EINVAL;
+	if (sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	err = -EOPNOTSUPP;
+	if (flags&MSG_OOB)
+		goto out;
+
+	/* target: bytes that must be copied before an empty queue ends the read. */
+	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
+	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
+
+	msg->msg_namelen = 0;
+
+	/* Lock the socket to prevent queue disordering
+	 * while sleeps in memcpy_tomsg
+	 */
+
+	if (!siocb->scm) {
+		siocb->scm = &tmp_scm;
+		memset(&tmp_scm, 0, sizeof(tmp_scm));
+	}
+
+	err = mutex_lock_interruptible(&u->readlock);
+	if (err) {
+		err = sock_intr_errno(timeo);
+		goto out;
+	}
+
+	do {
+		int chunk;
+		struct sk_buff *skb;
+
+		unix_state_lock(sk);
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (skb == NULL) {
+			/* Queue drained: reset the fd-passing recursion depth. */
+			unix_sk(sk)->recursion_level = 0;
+			if (copied >= target)
+				goto unlock;
+
+			/*
+			 *	POSIX 1003.1g mandates this order.
+			 */
+
+			err = sock_error(sk);
+			if (err)
+				goto unlock;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				goto unlock;
+
+			unix_state_unlock(sk);
+			err = -EAGAIN;
+			if (!timeo)
+				break;
+			/* Drop readlock while sleeping; it is re-taken below. */
+			mutex_unlock(&u->readlock);
+
+			timeo = unix_stream_data_wait(sk, timeo);
+
+			/* On either failure readlock is not held, hence goto out. */
+			if (signal_pending(current)
+			    ||  mutex_lock_interruptible(&u->readlock)) {
+				err = sock_intr_errno(timeo);
+				goto out;
+			}
+
+			continue;
+ unlock:
+			unix_state_unlock(sk);
+			break;
+		}
+		unix_state_unlock(sk);
+
+		if (check_creds) {
+			/* Never glue messages from different writers */
+			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
+			    (UNIXCB(skb).cred != siocb->scm->cred)) {
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				break;
+			}
+		} else {
+			/* Copy credentials */
+			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
+			check_creds = 1;
+		}
+
+		/* Copy address just once */
+		if (sunaddr) {
+			unix_copy_addr(msg, skb->sk);
+			sunaddr = NULL;
+		}
+
+		chunk = min_t(unsigned int, skb->len, size);
+		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+			/* Copy failed: requeue the skb; report EFAULT only if nothing was read. */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			if (copied == 0)
+				copied = -EFAULT;
+			break;
+		}
+		copied += chunk;
+		size -= chunk;
+
+		/* Mark read part of skb as used */
+		if (!(flags & MSG_PEEK)) {
+			skb_pull(skb, chunk);
+
+			if (UNIXCB(skb).fp)
+				unix_detach_fds(siocb->scm, skb);
+
+			/* put the skb back if we didn't use it up.. */
+			if (skb->len) {
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				break;
+			}
+
+			consume_skb(skb);
+
+			/* Stop after an skb that carried passed files. */
+			if (siocb->scm->fp)
+				break;
+		} else {
+			/* It is questionable, see note in unix_dgram_recvmsg.
+			 */
+			if (UNIXCB(skb).fp)
+				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+
+			/* put message back and return */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			break;
+		}
+	} while (size);
+
+	mutex_unlock(&u->readlock);
+	scm_recv(sock, msg, siocb->scm, flags);
+out:
+	/* A non-zero count (or -EFAULT stored in copied) wins over err. */
+	return copied ? : err;
+}
+
+/*
+ * shutdown() for AF_UNIX sockets.
+ *
+ * @mode is the userspace "how" argument (SHUT_RD, SHUT_WR or SHUT_RDWR).
+ * The requested directions are recorded on this socket and, for
+ * SOCK_STREAM and SOCK_SEQPACKET, the mirrored directions are recorded on
+ * the peer, which is then woken up.
+ *
+ * Returns 0 on success or -EINVAL when @mode is not one of the three
+ * valid values.  Previously the value was only masked, so out-of-range
+ * arguments were either silently ignored (e.g. -1, 3) or aliased to a
+ * valid request (e.g. 4 acted as SHUT_RD).
+ */
+static int unix_shutdown(struct socket *sock, int mode)
+{
+	struct sock *sk = sock->sk;
+	struct sock *other;
+
+	if (mode < SHUT_RD || mode > SHUT_RDWR)
+		return -EINVAL;
+	/*
+	 * This maps:
+	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
+	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
+	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
+	 */
+	++mode;
+
+	unix_state_lock(sk);
+	sk->sk_shutdown |= mode;
+	other = unix_peer(sk);
+	/* Pin the peer so it can be used after our state lock is dropped. */
+	if (other)
+		sock_hold(other);
+	unix_state_unlock(sk);
+	sk->sk_state_change(sk);
+
+	if (other &&
+		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
+
+		int peer_mode = 0;
+
+		/* Our receive side is the peer's send side, and vice versa. */
+		if (mode&RCV_SHUTDOWN)
+			peer_mode |= SEND_SHUTDOWN;
+		if (mode&SEND_SHUTDOWN)
+			peer_mode |= RCV_SHUTDOWN;
+		unix_state_lock(other);
+		other->sk_shutdown |= peer_mode;
+		unix_state_unlock(other);
+		other->sk_state_change(other);
+		if (peer_mode == SHUTDOWN_MASK)
+			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
+		else if (peer_mode & RCV_SHUTDOWN)
+			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
+	}
+	if (other)
+		sock_put(other);
+
+	return 0;
+}
+
+/*
+ * ioctl() handler.
+ *
+ * SIOCOUTQ reports the send-side memory currently allocated; SIOCINQ
+ * reports queued receive bytes (the whole queue for stream/seqpacket
+ * sockets, only the first skb otherwise) and fails with -EINVAL on a
+ * listening socket.  Any other command returns -ENOIOCTLCMD.
+ */
+static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	long amount = 0;
+
+	if (cmd == SIOCOUTQ) {
+		amount = sk_wmem_alloc_get(sk);
+		return put_user(amount, (int __user *)arg);
+	}
+
+	if (cmd != SIOCINQ)
+		return -ENOIOCTLCMD;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -EINVAL;
+
+	spin_lock(&sk->sk_receive_queue.lock);
+	if (sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_SEQPACKET) {
+		/* Only the first queued skb counts here. */
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb)
+			amount = skb->len;
+	} else {
+		/* Sum every queued skb. */
+		skb_queue_walk(&sk->sk_receive_queue, skb)
+			amount += skb->len;
+	}
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+	return put_user(amount, (int __user *)arg);
+}
+
+/*
+ * poll() for AF_UNIX sockets: registers on the socket's wait queue and
+ * returns the mask of events that are currently pending.
+ */
+static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	unsigned int events = 0;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+
+	/* Exceptional conditions first. */
+	if (sk->sk_err)
+		events |= POLLERR;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		events |= POLLHUP;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		events |= POLLRDHUP | POLLIN | POLLRDNORM;
+
+	/* Anything queued makes the socket readable. */
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		events |= POLLIN | POLLRDNORM;
+
+	/* Connection-based types report hangup once closed. */
+	switch (sk->sk_type) {
+	case SOCK_STREAM:
+	case SOCK_SEQPACKET:
+		if (sk->sk_state == TCP_CLOSE)
+			events |= POLLHUP;
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * we set writable also when the other side has shut down the
+	 * connection. This prevents stuck sockets.
+	 */
+	if (unix_writable(sk))
+		events |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return events;
+}
+
+/*
+ * poll() variant that additionally takes the connected peer's receive
+ * queue into account when deciding writability: if the peer is not
+ * connected back to us and its queue is full, the socket is reported as
+ * not writable and the caller is also registered on the peer's peer_wait
+ * queue.
+ *
+ * Returns the mask of currently pending poll events.
+ */
+static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
+				    poll_table *wait)
+{
+	struct sock *sk = sock->sk, *other;
+	unsigned int mask, writable;
+
+	sock_poll_wait(file, sk_sleep(sk), wait);
+	mask = 0;
+
+	/* exceptional events? */
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+	if (sk->sk_shutdown == SHUTDOWN_MASK)
+		mask |= POLLHUP;
+
+	/* readable? */
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	/* Connection-based need to check for termination and startup */
+	if (sk->sk_type == SOCK_SEQPACKET) {
+		if (sk->sk_state == TCP_CLOSE)
+			mask |= POLLHUP;
+		/* connection hasn't started yet? */
+		if (sk->sk_state == TCP_SYN_SENT)
+			return mask;
+	}
+
+	/* No write status requested, avoid expensive OUT tests. */
+	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+		return mask;
+
+	writable = unix_writable(sk);
+	/* unix_peer_get() returns the peer with a reference held, or NULL. */
+	other = unix_peer_get(sk);
+	if (other) {
+		if (unix_peer(other) != sk) {
+			/* One-way connection: writability depends on the peer's queue. */
+			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
+			if (unix_recvq_full(other))
+				writable = 0;
+		}
+		sock_put(other);
+	}
+
+	if (writable)
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+	else
+		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	return mask;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Return the first sock in the lowest-numbered non-empty bucket of
+ * unix_socket_table, leaving that bucket's index in *i.  Returns NULL
+ * when every bucket 0..UNIX_HASH_SIZE (inclusive) is empty.
+ */
+static struct sock *first_unix_socket(int *i)
+{
+	*i = 0;
+	while (*i <= UNIX_HASH_SIZE) {
+		if (!hlist_empty(&unix_socket_table[*i]))
+			return __sk_head(&unix_socket_table[*i]);
+		(*i)++;
+	}
+	return NULL;
+}
+
+/*
+ * Return the sock following @s: the next entry on the same hash chain if
+ * there is one, otherwise the head of the next non-empty bucket after *i
+ * (updating *i).  Returns NULL once the table is exhausted.
+ */
+static struct sock *next_unix_socket(int *i, struct sock *s)
+{
+	struct sock *sibling = sk_next(s);
+
+	/* Still inside the current chain. */
+	if (sibling)
+		return sibling;
+
+	/* Advance bucket by bucket until one has entries. */
+	while (++(*i) <= UNIX_HASH_SIZE) {
+		if (!hlist_empty(&unix_socket_table[*i]))
+			return __sk_head(&unix_socket_table[*i]);
+	}
+	return NULL;
+}
+
+/*
+ * Per-open iterator state for the "unix" proc file (allocated by
+ * seq_open_net() in unix_seq_open()).
+ */
+struct unix_iter_state {
+	/* NOTE(review): presumably must stay first for seq_open_net() - confirm */
+	struct seq_net_private p;
+	/* unix_socket_table bucket index maintained by first/next_unix_socket() */
+	int i;
+};
+
+/*
+ * Return the @pos-th (zero-based) sock that belongs to the network
+ * namespace of @seq, or NULL if there are fewer such socks.
+ */
+static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos)
+{
+	struct unix_iter_state *iter = seq->private;
+	struct sock *s = first_unix_socket(&iter->i);
+	loff_t seen = 0;
+
+	while (s) {
+		/* Socks from other namespaces are not counted. */
+		if (sock_net(s) == seq_file_net(seq)) {
+			if (seen == pos)
+				return s;
+			++seen;
+		}
+		s = next_unix_socket(&iter->i, s);
+	}
+	return NULL;
+}
+
+/*
+ * seq_file ->start: take unix_table_lock (released in unix_seq_stop())
+ * and position the iterator.  Position 0 is the header token; position N
+ * is the (N-1)-th sock of this namespace.
+ */
+static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(unix_table_lock)
+{
+	spin_lock(&unix_table_lock);
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	return unix_seq_idx(seq, *pos - 1);
+}
+
+/*
+ * seq_file ->next: bump the position and return the next sock that
+ * belongs to this seq_file's network namespace, or NULL at the end.
+ */
+static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct unix_iter_state *iter = seq->private;
+	struct sock *sk;
+
+	++*pos;
+
+	/* After the header token the walk starts at the first sock. */
+	sk = (v == SEQ_START_TOKEN) ? first_unix_socket(&iter->i)
+				    : next_unix_socket(&iter->i, v);
+
+	/* Skip socks that live in other namespaces. */
+	while (sk && sock_net(sk) != seq_file_net(seq))
+		sk = next_unix_socket(&iter->i, sk);
+
+	return sk;
+}
+
+/* seq_file ->stop: drop the unix_table_lock taken in unix_seq_start(). */
+static void unix_seq_stop(struct seq_file *seq, void *v)
+	__releases(unix_table_lock)
+{
+	spin_unlock(&unix_table_lock);
+}
+
+/*
+ * seq_file ->show: emit the column header for the start token, otherwise
+ * one line describing the sock @v (pointer, refcount, flags, type, state,
+ * inode and, if bound, its path).  Always returns 0.
+ */
+static int unix_seq_show(struct seq_file *seq, void *v)
+{
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
+			 "Inode Path\n");
+	else {
+		struct sock *s = v;
+		struct unix_sock *u = unix_sk(s);
+		unix_state_lock(s);
+
+		/* The Protocol column is always 0; St is derived from sk_state. */
+		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
+			s,
+			atomic_read(&s->sk_refcnt),
+			0,
+			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
+			s->sk_type,
+			s->sk_socket ?
+			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
+			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
+			sock_i_ino(s));
+
+		if (u->addr) {
+			int i, len;
+			seq_putc(seq, ' ');
+
+			i = 0;
+			/* Path length: address length minus the family field. */
+			len = u->addr->len - sizeof(short);
+			/* NOTE(review): len-- presumably drops the trailing NUL - confirm */
+			if (!UNIX_ABSTRACT(s))
+				len--;
+			else {
+				/* Abstract name: print '@' in place of sun_path[0]. */
+				seq_putc(seq, '@');
+				i++;
+			}
+			for ( ; i < len; i++)
+				seq_putc(seq, u->addr->name->sun_path[i]);
+		}
+		unix_state_unlock(s);
+		seq_putc(seq, '\n');
+	}
+
+	return 0;
+}
+
+/* seq_file iterator callbacks for the "unix" proc file. */
+static const struct seq_operations unix_seq_ops = {
+	.start  = unix_seq_start,
+	.next   = unix_seq_next,
+	.stop   = unix_seq_stop,
+	.show   = unix_seq_show,
+};
+
+/*
+ * open() for the "unix" proc file: set up a seq_file using unix_seq_ops
+ * with private state sized for struct unix_iter_state.
+ */
+static int unix_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &unix_seq_ops,
+			    sizeof(struct unix_iter_state));
+}
+
+/* File operations of the "unix" proc file registered in unix_net_init(). */
+static const struct file_operations unix_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= unix_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+#endif
+
+/* PF_UNIX protocol family descriptor, registered in af_unix_init(). */
+static const struct net_proto_family unix_family_ops = {
+	.family = PF_UNIX,
+	.create = unix_create,
+	.owner	= THIS_MODULE,
+};
+
+
+/*
+ * Per-namespace setup: set the default datagram queue length, register
+ * the sysctl table and, with CONFIG_PROC_FS, create the "unix" proc
+ * entry.  Returns 0 on success or -ENOMEM if either registration fails.
+ */
+static int __net_init unix_net_init(struct net *net)
+{
+	net->unx.sysctl_max_dgram_qlen = 10;
+
+	if (unix_sysctl_register(net))
+		return -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) {
+		/* Undo the sysctl registration before failing. */
+		unix_sysctl_unregister(net);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+/* Per-namespace teardown: undo what unix_net_init() registered. */
+static void __net_exit unix_net_exit(struct net *net)
+{
+	unix_sysctl_unregister(net);
+	proc_net_remove(net, "unix");
+}
+
+/* Per-network-namespace init/exit hooks, registered in af_unix_init(). */
+static struct pernet_operations unix_net_ops = {
+	.init = unix_net_init,
+	.exit = unix_net_exit,
+};
+
+/*
+ * Module/boot-time initialisation: register the unix proto, the PF_UNIX
+ * socket family and the per-namespace operations.  Returns 0 on success
+ * or the error from proto_register().
+ */
+static int __init af_unix_init(void)
+{
+	struct sk_buff *dummy_skb;
+	int rc;
+
+	/* The per-skb control data must fit into skb->cb. */
+	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb));
+
+	rc = proto_register(&unix_proto, 1);
+	if (rc) {
+		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
+		       __func__);
+		return rc;
+	}
+
+	/* Return values of the next two registrations are not checked. */
+	sock_register(&unix_family_ops);
+	register_pernet_subsys(&unix_net_ops);
+	return 0;
+}
+
+/*
+ * Module unload: unregister the socket family, the proto and the
+ * per-namespace operations set up by af_unix_init().
+ */
+static void __exit af_unix_exit(void)
+{
+	sock_unregister(PF_UNIX);
+	proto_unregister(&unix_proto);
+	unregister_pernet_subsys(&unix_net_ops);
+}
+
+/* Earlier than device_initcall() so that other drivers invoking
+   request_module() don't end up in a loop when modprobe tries
+   to use a UNIX socket. But later than subsys_initcall() because
+   we depend on stuff initialised there */
+fs_initcall(af_unix_init);
+module_exit(af_unix_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_UNIX);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
new file mode 100644
index 00000000..b6f4b994
--- /dev/null
+++ b/net/unix/garbage.c
@@ -0,0 +1,386 @@
+/*
+ * NET3:	Garbage Collector For AF_UNIX sockets
+ *
+ * Garbage Collector:
+ *	Copyright (C) Barak A. Pearlmutter.
+ *	Released under the GPL version 2 or later.
+ *
+ * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
+ * If it doesn't work blame me, it worked when Barak sent it.
+ *
+ * Assumptions:
+ *
+ *  - object w/ a bit
+ *  - free list
+ *
+ * Current optimizations:
+ *
+ *  - explicit stack instead of recursion
+ *  - tail recurse on first born instead of immediate push/pop
+ *  - we gather the stuff that should not be killed into tree
+ *    and stack is just a path from root to the current pointer.
+ *
+ *  Future optimizations:
+ *
+ *  - don't just push entire root set; process in place
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *  Fixes:
+ *	Alan Cox	07 Sept	1997	Vmalloc internal stack as needed.
+ *					Cope with changing max_files.
+ *	Al Viro		11 Oct 1998
+ *		Graph may have cycles. That is, we can send the descriptor
+ *		of foo to bar and vice versa. Current code chokes on that.
+ *		Fix: move SCM_RIGHTS ones into the separate list and then
+ *		skb_free() them all instead of doing explicit fput's.
+ *		Another problem: since fput() may block somebody may
+ *		create a new unix_socket when we are in the middle of sweep
+ *		phase. Fix: revert the logic wrt MARKED. Mark everything
+ *		upon the beginning and unmark non-junk ones.
+ *
+ *		[12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
+ *		sent to connect()'ed but still not accept()'ed sockets.
+ *		Fixed. Old code had slightly different problem here:
+ *		extra fput() in situation when we passed the descriptor via
+ *		such socket and closed it (descriptor). That would happen on
+ *		each unix_gc() until the accept(). Since the struct file in
+ *		question would go to the free list and might be reused...
+ *		That might be the reason of random oopses on filp_close()
+ *		in unrelated processes.
+ *
+ *	AV		28 Feb 1999
+ *		Kill the explicit allocation of stack. Now we keep the tree
+ *		with root in dummy + pointer (gc_current) to one of the nodes.
+ *		Stack is represented as path from gc_current to dummy. Unmark
+ *		now means "add to tree". Push == "make it a son of gc_current".
+ *		Pop == "move gc_current to parent". We keep only pointers to
+ *		parents (->gc_tree).
+ *	AV		1 Mar 1999
+ *		Damn. Added missing check for ->dead in listen queues scanning.
+ *
+ *	Miklos Szeredi 25 Jun 2007
+ *		Reimplement with a cycle collecting algorithm. This should
+ *		solve several problems with the previous code, like being racy
+ *		wrt receive and holding up unrelated socket operations.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/net.h>
+#include <linux/fs.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+
+#include <net/sock.h>
+#include <net/af_unix.h>
+#include <net/scm.h>
+#include <net/tcp_states.h>
+
+/* Internal data structures and random procedures: */
+
+/* Sockets whose inflight count is non-zero; filled by unix_inflight(). */
+static LIST_HEAD(gc_inflight_list);
+/* Sockets picked by unix_gc() as possible garbage during a collection. */
+static LIST_HEAD(gc_candidates);
+/* Taken around every update of the two lists and of unix_tot_inflight. */
+static DEFINE_SPINLOCK(unix_gc_lock);
+/* wait_for_unix_gc() sleeps here; unix_gc() wakes it when a pass ends. */
+static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+
+/* Incremented by unix_inflight(), decremented by unix_notinflight(). */
+unsigned int unix_tot_inflight;
+
+
+/*
+ * Map an open file to its AF_UNIX socket.
+ *
+ * Returns the struct sock behind @filp when the file is a socket (and was
+ * not opened with FMODE_PATH) whose ops belong to PF_UNIX; NULL otherwise.
+ */
+struct sock *unix_get_socket(struct file *filp)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct socket *sock;
+	struct sock *sk;
+
+	/* Not a socket inode, or only a path reference to one. */
+	if (!S_ISSOCK(inode->i_mode) || (filp->f_mode & FMODE_PATH))
+		return NULL;
+
+	sock = SOCKET_I(inode);
+	sk = sock->sk;
+
+	/* Only PF_UNIX sockets with an attached sock are of interest. */
+	if (!sk || !sock->ops || sock->ops->family != PF_UNIX)
+		return NULL;
+
+	return sk;
+}
+
+/*
+ *	Account for one more in-flight reference to the file if it
+ *	is an AF_UNIX socket; any other file is ignored.
+ */
+
+void unix_inflight(struct file *fp)
+{
+	struct sock *sk = unix_get_socket(fp);
+	struct unix_sock *usk;
+
+	if (!sk)
+		return;
+
+	usk = unix_sk(sk);
+
+	spin_lock(&unix_gc_lock);
+	if (atomic_long_inc_return(&usk->inflight) != 1) {
+		/* Already in flight, so it has to be linked on a list. */
+		BUG_ON(list_empty(&usk->link));
+	} else {
+		/* First in-flight reference: start tracking the socket. */
+		BUG_ON(!list_empty(&usk->link));
+		list_add_tail(&usk->link, &gc_inflight_list);
+	}
+	unix_tot_inflight++;
+	spin_unlock(&unix_gc_lock);
+}
+
+/*
+ *	Drop one in-flight reference to the file if it is an AF_UNIX
+ *	socket, unlinking the socket once its count reaches zero.
+ */
+void unix_notinflight(struct file *fp)
+{
+	struct sock *sk = unix_get_socket(fp);
+	struct unix_sock *usk;
+
+	if (!sk)
+		return;
+
+	usk = unix_sk(sk);
+
+	spin_lock(&unix_gc_lock);
+	/* A socket that is in flight must be linked on a list. */
+	BUG_ON(list_empty(&usk->link));
+	if (atomic_long_dec_and_test(&usk->inflight))
+		list_del_init(&usk->link);
+	unix_tot_inflight--;
+	spin_unlock(&unix_gc_lock);
+}
+
+/*
+ * Walk the receive queue of @x and call @func on every AF_UNIX socket
+ * that is passed as a file descriptor in a queued skb and is currently
+ * marked as a GC candidate.
+ *
+ * If @hitlist is not NULL, every skb that carried at least one such
+ * candidate is moved from the receive queue to @hitlist.
+ *
+ * The receive queue lock of @x is held for the whole walk.
+ */
+static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+			  struct sk_buff_head *hitlist)
+{
+	struct sk_buff *skb;
+	struct sk_buff *next;
+
+	spin_lock(&x->sk_receive_queue.lock);
+	/* The _safe variant is needed: skbs may be unlinked below. */
+	skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+		/*
+		 *	Do we have file descriptors ?
+		 */
+		if (UNIXCB(skb).fp) {
+			bool hit = false;
+			/*
+			 *	Process the descriptors of this socket
+			 */
+			int nfd = UNIXCB(skb).fp->count;
+			struct file **fp = UNIXCB(skb).fp->fp;
+			while (nfd--) {
+				/*
+				 *	Get the socket the fd matches
+				 *	if it indeed does so
+				 */
+				struct sock *sk = unix_get_socket(*fp++);
+				if (sk) {
+					struct unix_sock *u = unix_sk(sk);
+
+					/*
+					 * Ignore non-candidates, they could
+					 * have been added to the queues after
+					 * starting the garbage collection
+					 */
+					if (u->gc_candidate) {
+						hit = true;
+						func(u);
+					}
+				}
+			}
+			/* Collect skbs that reference a candidate. */
+			if (hit && hitlist != NULL) {
+				__skb_unlink(skb, &x->sk_receive_queue);
+				__skb_queue_tail(hitlist, skb);
+			}
+		}
+	}
+	spin_unlock(&x->sk_receive_queue.lock);
+}
+
+/*
+ * Apply scan_inflight() to @x, or, when @x is a listening socket, to
+ * every embryo socket queued on it instead.
+ *
+ * @func and @hitlist are passed through to scan_inflight() unchanged.
+ */
+static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
+			  struct sk_buff_head *hitlist)
+{
+	if (x->sk_state != TCP_LISTEN)
+		scan_inflight(x, func, hitlist);
+	else {
+		struct sk_buff *skb;
+		struct sk_buff *next;
+		struct unix_sock *u;
+		LIST_HEAD(embryos);
+
+		/*
+		 * For a listening socket collect the queued embryos
+		 * and perform a scan on them as well.
+		 */
+		spin_lock(&x->sk_receive_queue.lock);
+		skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+			u = unix_sk(skb->sk);
+
+			/*
+			 * An embryo cannot be in-flight, so it's safe
+			 * to use the list link.
+			 */
+			BUG_ON(!list_empty(&u->link));
+			list_add_tail(&u->link, &embryos);
+		}
+		spin_unlock(&x->sk_receive_queue.lock);
+
+		/*
+		 * Scan the embryos after the listener's queue lock has been
+		 * dropped; scan_inflight() takes each embryo's own queue lock.
+		 */
+		while (!list_empty(&embryos)) {
+			u = list_entry(embryos.next, struct unix_sock, link);
+			scan_inflight(&u->sk, func, hitlist);
+			list_del_init(&u->link);
+		}
+	}
+}
+
+/* scan_children() callback: drop one in-flight reference from @usk. */
+static void dec_inflight(struct unix_sock *usk)
+{
+	atomic_long_dec(&usk->inflight);
+}
+
+/* scan_children() callback: give one in-flight reference back to @usk. */
+static void inc_inflight(struct unix_sock *usk)
+{
+	atomic_long_inc(&usk->inflight);
+}
+
+/*
+ * scan_children() callback: give one in-flight reference back to @u and,
+ * if @u is still flagged gc_maybe_cycle, requeue it at the tail of
+ * gc_candidates.
+ */
+static void inc_inflight_move_tail(struct unix_sock *u)
+{
+	atomic_long_inc(&u->inflight);
+	/*
+	 * If this still might be part of a cycle, move it to the end
+	 * of the list, so that it's checked even if it was already
+	 * passed over
+	 */
+	if (u->gc_maybe_cycle)
+		list_move_tail(&u->link, &gc_candidates);
+}
+
+/* True while unix_gc() is running; set and cleared under unix_gc_lock. */
+static bool gc_in_progress = false;
+/* wait_for_unix_gc() forces a collection above this many in-flight refs. */
+#define UNIX_INFLIGHT_TRIGGER_GC 16000
+
+/*
+ * Block the caller until no garbage collection is in progress.
+ *
+ * If the number of in-flight sockets is insane and nobody is collecting
+ * yet, a collection is run from here before waiting.
+ */
+void wait_for_unix_gc(void)
+{
+	bool too_many = unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC;
+
+	if (too_many && !gc_in_progress)
+		unix_gc();
+
+	wait_event(unix_gc_wait, !gc_in_progress);
+}
+
+/*
+ * The external entry point: unix_gc()
+ *
+ * Finds AF_UNIX sockets that are only kept alive by file descriptors
+ * sitting in each other's receive queues and frees the skbs that form
+ * those cycles.
+ *
+ * Only one collection runs at a time: if gc_in_progress is already set,
+ * the call returns without doing anything.  unix_gc_lock is held for the
+ * whole analysis and is dropped only while the collected skbs are purged.
+ * Waiters on unix_gc_wait are woken when the pass is finished.
+ */
+void unix_gc(void)
+{
+	struct unix_sock *u;
+	struct unix_sock *next;
+	struct sk_buff_head hitlist;
+	struct list_head cursor;
+	LIST_HEAD(not_cycle_list);
+
+	spin_lock(&unix_gc_lock);
+
+	/* Avoid a recursive GC. */
+	if (gc_in_progress)
+		goto out;
+
+	gc_in_progress = true;
+	/*
+	 * First, select candidates for garbage collection.  Only
+	 * in-flight sockets are considered, and from those only ones
+	 * which don't have any external reference.
+	 *
+	 * Holding unix_gc_lock will protect these candidates from
+	 * being detached, and hence from gaining an external
+	 * reference.  Since there are no possible receivers, all
+	 * buffers currently on the candidates' queues stay there
+	 * during the garbage collection.
+	 *
+	 * We also know that no new candidate can be added onto the
+	 * receive queues.  Other, non candidate sockets _can_ be
+	 * added to queue, so we must make sure only to touch
+	 * candidates.
+	 */
+	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
+		long total_refs;
+		long inflight_refs;
+
+		total_refs = file_count(u->sk.sk_socket->file);
+		inflight_refs = atomic_long_read(&u->inflight);
+
+		BUG_ON(inflight_refs < 1);
+		BUG_ON(total_refs < inflight_refs);
+		if (total_refs == inflight_refs) {
+			list_move_tail(&u->link, &gc_candidates);
+			u->gc_candidate = 1;
+			u->gc_maybe_cycle = 1;
+		}
+	}
+
+	/*
+	 * Now remove all internal in-flight reference to children of
+	 * the candidates.
+	 */
+	list_for_each_entry(u, &gc_candidates, link)
+		scan_children(&u->sk, dec_inflight, NULL);
+
+	/*
+	 * Restore the references for children of all candidates,
+	 * which have remaining references.  Do this recursively, so
+	 * only those remain, which form cyclic references.
+	 *
+	 * Use a "cursor" link, to make the list traversal safe, even
+	 * though elements might be moved about.
+	 */
+	list_add(&cursor, &gc_candidates);
+	while (cursor.next != &gc_candidates) {
+		u = list_entry(cursor.next, struct unix_sock, link);
+
+		/* Move cursor to after the current position. */
+		list_move(&cursor, &u->link);
+
+		if (atomic_long_read(&u->inflight) > 0) {
+			list_move_tail(&u->link, &not_cycle_list);
+			u->gc_maybe_cycle = 0;
+			scan_children(&u->sk, inc_inflight_move_tail, NULL);
+		}
+	}
+	list_del(&cursor);
+
+	/*
+	 * not_cycle_list contains those sockets which do not make up a
+	 * cycle.  Restore these to the inflight list.
+	 */
+	while (!list_empty(&not_cycle_list)) {
+		u = list_entry(not_cycle_list.next, struct unix_sock, link);
+		u->gc_candidate = 0;
+		list_move_tail(&u->link, &gc_inflight_list);
+	}
+
+	/*
+	 * Now gc_candidates contains only garbage.  Restore original
+	 * inflight counters for these as well, and remove the skbuffs
+	 * which are creating the cycle(s).
+	 */
+	skb_queue_head_init(&hitlist);
+	list_for_each_entry(u, &gc_candidates, link)
+		scan_children(&u->sk, inc_inflight, &hitlist);
+
+	/* Drop the lock while the collected skbs are purged. */
+	spin_unlock(&unix_gc_lock);
+
+	/* Here we are. Hitlist is filled. Die. */
+	__skb_queue_purge(&hitlist);
+
+	spin_lock(&unix_gc_lock);
+
+	/* All candidates should have been detached by now. */
+	BUG_ON(!list_empty(&gc_candidates));
+	gc_in_progress = false;
+	wake_up(&unix_gc_wait);
+
+ out:
+	spin_unlock(&unix_gc_lock);
+}
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
new file mode 100644
index 00000000..397cffeb
--- /dev/null
+++ b/net/unix/sysctl_net_unix.c
@@ -0,0 +1,63 @@
+/*
+ * NET4:	Sysctl interface to net af_unix subsystem.
+ *
+ * Authors:	Mike Shaver.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+
+#include <net/af_unix.h>
+
+/*
+ * Template sysctl table.  unix_sysctl_register() duplicates it for each
+ * namespace and repoints entry 0's .data at that namespace's value.
+ */
+static ctl_table unix_table[] = {
+	{
+		.procname	= "max_dgram_qlen",
+		.data		= &init_net.unx.sysctl_max_dgram_qlen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{ }
+};
+
+/* Path components ("net", then "unix") under which the table is registered. */
+static struct ctl_path unix_path[] = {
+	{ .procname = "net", },
+	{ .procname = "unix", },
+	{ },
+};
+
+/*
+ * Register a private copy of unix_table for @net.
+ *
+ * The copy's first entry is pointed at the namespace's own
+ * sysctl_max_dgram_qlen and the resulting header is stored in
+ * net->unx.ctl.  Returns 0 on success or -ENOMEM if the copy cannot be
+ * allocated or the registration fails.
+ */
+int __net_init unix_sysctl_register(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = kmemdup(unix_table, sizeof(unix_table), GFP_KERNEL);
+	if (!tbl)
+		return -ENOMEM;
+
+	tbl[0].data = &net->unx.sysctl_max_dgram_qlen;
+
+	net->unx.ctl = register_net_sysctl_table(net, unix_path, tbl);
+	if (net->unx.ctl)
+		return 0;
+
+	/* Registration failed: the duplicated table is ours to free. */
+	kfree(tbl);
+	return -ENOMEM;
+}
+
+/*
+ * Unregister the sysctl table of @net and free the table copy that
+ * unix_sysctl_register() allocated.
+ */
+void unix_sysctl_unregister(struct net *net)
+{
+	struct ctl_table *table;
+
+	/* Fetch the table pointer first: it is read through the header. */
+	table = net->unx.ctl->ctl_table_arg;
+	unregister_sysctl_table(net->unx.ctl);
+	kfree(table);
+}
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 00000000..61ceae0b
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,27 @@
+#
+# Configuration for WAN router
+#
+
+config WAN_ROUTER
+	tristate "WAN router"
+	depends on EXPERIMENTAL
+	---help---
+	  Wide Area Networks (WANs), such as X.25, frame relay and leased
+	  lines, are used to interconnect Local Area Networks (LANs) over vast
+	  distances with data transfer rates significantly higher than those
+	  achievable with commonly used asynchronous modem connections.
+	  Usually, a quite expensive external device called a `WAN router' is
+	  needed to connect to a WAN.
+
+	  As an alternative, WAN routing can be built into the Linux kernel.
+	  With relatively inexpensive WAN interface cards available on the
+	  market, a perfectly usable router can be built for less than half
+	  the price of an external router.  If you have one of those cards and
+	  wish to use your Linux box as a WAN router, say Y here and also to
+	  the WAN driver for your card, below.  You will then need the
+	  wan-tools package which is available from <ftp://ftp.sangoma.com/>.
+
+	  To compile WAN routing support as a module, choose M here: the
+	  module will be called wanrouter.
+
+	  If unsure, say N.
diff --git a/net/wanrouter/Makefile b/net/wanrouter/Makefile
new file mode 100644
index 00000000..4da14bc4
--- /dev/null
+++ b/net/wanrouter/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux WAN router layer.
+#
+
+obj-$(CONFIG_WAN_ROUTER) += wanrouter.o
+
+wanrouter-y :=  wanproc.o wanmain.o
diff --git a/net/wanrouter/patchlevel b/net/wanrouter/patchlevel
new file mode 100644
index 00000000..c043eea7
--- /dev/null
+++ b/net/wanrouter/patchlevel
@@ -0,0 +1 @@
+2.2.1
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
new file mode 100644
index 00000000..788a12c1
--- /dev/null
+++ b/net/wanrouter/wanmain.c
@@ -0,0 +1,787 @@
+/*****************************************************************************
+* wanmain.c	WAN Multiprotocol Router Module. Main code.
+*
+*		This module is completely hardware-independent and provides
+*		the following common services for the WAN Link Drivers:
+*		 o WAN device management (registering, unregistering)
+*		 o Network interface management
+*		 o Physical connection management (dial-up, incoming calls)
+*		 o Logical connection management (switched virtual circuits)
+*		 o Protocol encapsulation/decapsulation
+*
+* Author:	Gideon Hack
+*
+* Copyright:	(c) 1995-1999 Sangoma Technologies Inc.
+*
+*		This program is free software; you can redistribute it and/or
+*		modify it under the terms of the GNU General Public License
+*		as published by the Free Software Foundation; either version
+*		2 of the License, or (at your option) any later version.
+* ============================================================================
+* Nov 24, 2000  Nenad Corbic	Updated for 2.4.X kernels
+* Nov 07, 2000  Nenad Corbic	Fixed the Multi-Port PPP for kernels 2.2.16 and
+*  				greater.
+* Aug 2,  2000  Nenad Corbic	Block the Multi-Port PPP from running on
+*  			        kernels 2.2.16 or greater.  The SyncPPP
+*  			        has changed.
+* Jul 13, 2000  Nenad Corbic	Added SyncPPP support
+* 				Added extra debugging in device_setup().
+* Oct 01, 1999  Gideon Hack     Update for s514 PCI card
+* Dec 27, 1996	Gene Kozin	Initial version (based on Sangoma's WANPIPE)
+* Jan 16, 1997	Gene Kozin	router_devlist made public
+* Jan 31, 1997  Alan Cox	Hacked it about a bit for 2.1
+* Jun 27, 1997  Alan Cox	realigned with vendor code
+* Oct 15, 1997  Farhan Thawar   changed wan_encapsulate to add a pad byte of 0
+* Apr 20, 1998	Alan Cox	Fixed 2.1 symbols
+* May 17, 1998  K. Baranowski	Fixed SNAP encapsulation in wan_encapsulate
+* Dec 15, 1998  Arnaldo Melo    support for firmwares of up to 128000 bytes
+*                               check wandev->setup return value
+* Dec 22, 1998  Arnaldo Melo    vmalloc/vfree used in device_setup to allocate
+*                               kernel memory and copy configuration data to
+*                               kernel space (for big firmwares)
+* Jun 02, 1999  Gideon Hack	Updates for Linux 2.0.X and 2.2.X kernels.
+*****************************************************************************/
+
+#include <linux/stddef.h>	/* offsetof(), etc. */
+#include <linux/capability.h>
+#include <linux/errno.h>	/* return codes */
+#include <linux/kernel.h>
+#include <linux/module.h>	/* support for loadable modules */
+#include <linux/slab.h>		/* kmalloc(), kfree() */
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/string.h>	/* inline mem*, str* functions */
+
+#include <asm/byteorder.h>	/* htons(), etc. */
+#include <linux/wanrouter.h>	/* WAN router API definitions */
+
+#include <linux/vmalloc.h>	/* vmalloc, vfree */
+#include <asm/uaccess.h>        /* copy_to/from_user */
+#include <linux/init.h>         /* __initfunc et al. */
+
+/*
+ * The private area of each interface starts with a pointer to the next
+ * interface of the same WAN device; this macro yields that pointer as an
+ * lvalue.
+ */
+#define DEV_TO_SLAVE(dev)	(*((struct net_device **)netdev_priv(dev)))
+
+/*
+ * 	Function Prototypes
+ */
+
+/*
+ *	WAN device IOCTL handlers
+ */
+
+/* Serializes the command dispatch in wanrouter_ioctl(). */
+static DEFINE_MUTEX(wanrouter_mutex);
+static int wanrouter_device_setup(struct wan_device *wandev,
+				  wandev_conf_t __user *u_conf);
+static int wanrouter_device_stat(struct wan_device *wandev,
+				 wandev_stat_t __user *u_stat);
+static int wanrouter_device_shutdown(struct wan_device *wandev);
+static int wanrouter_device_new_if(struct wan_device *wandev,
+				   wanif_conf_t __user *u_conf);
+static int wanrouter_device_del_if(struct wan_device *wandev,
+				   char __user *u_name);
+
+/*
+ *	Miscellaneous
+ */
+
+static struct wan_device *wanrouter_find_device(char *name);
+static int wanrouter_delete_interface(struct wan_device *wandev, char *name);
+/* Thin wrappers around spin_lock_irqsave() / spin_unlock_irqrestore(). */
+static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+	__acquires(lock);
+static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+	__releases(lock);
+
+
+
+/*
+ *	Global Data
+ */
+
+static char wanrouter_fullname[]  = "Sangoma WANPIPE Router";	/* banner */
+static char wanrouter_copyright[] = "(c) 1995-2000 Sangoma Technologies Inc.";
+static char wanrouter_modname[] = ROUTER_NAME; /* short module name */
+/* Singly linked through ->next; new devices are pushed onto the head. */
+struct wan_device* wanrouter_router_devlist; /* list of registered devices */
+
+/*
+ *	Organizationally Unique Identifiers for encapsulation/decapsulation
+ *	(compiled out, like the routines further down that reference them)
+ */
+
+#if 0
+static unsigned char wanrouter_oui_ether[] = { 0x00, 0x00, 0x00 };
+static unsigned char wanrouter_oui_802_2[] = { 0x00, 0x80, 0xC2 };
+#endif
+
+/*
+ * Module initialisation: print the banner and create the /proc entries.
+ * Returns the result of wanrouter_proc_init().
+ */
+static int __init wanrouter_init(void)
+{
+	int rc;
+
+	printk(KERN_INFO "%s v%u.%u %s\n",
+	       wanrouter_fullname, ROUTER_VERSION, ROUTER_RELEASE,
+	       wanrouter_copyright);
+
+	rc = wanrouter_proc_init();
+	if (rc != 0) {
+		printk(KERN_INFO "%s: can't create entry in proc filesystem!\n",
+		       wanrouter_modname);
+	}
+
+	return rc;
+}
+
+/* Module unload: remove what wanrouter_proc_init() created. */
+static void __exit wanrouter_cleanup (void)
+{
+	wanrouter_proc_cleanup();
+}
+
+/*
+ * This is just plain dumb.  We should move the bugger to drivers/net/wan,
+ * slap it first in directory and make it module_init().  The only reason
+ * for subsys_initcall() here is that net goes after drivers (why, BTW?)
+ */
+subsys_initcall(wanrouter_init);
+module_exit(wanrouter_cleanup);
+
+/*
+ * 	Kernel APIs
+ */
+
+/*
+ * 	Register WAN device.
+ * 	o verify device credentials
+ * 	o create an entry for the device in the /proc/net/router directory
+ * 	o initialize internally maintained fields of the wan_device structure
+ * 	o link device data space to a singly-linked list
+ * 	o if it's the first device, then start kernel 'thread'
+ * 	o increment module use count
+ *
+ * 	Return:
+ *	0	Ok
+ *	< 0	error.
+ *
+ * 	Context:	process
+ */
+
+
+int register_wan_device(struct wan_device *wandev)
+{
+	int namelen;
+	int err;
+
+	/* Reject anything that does not look like a valid descriptor. */
+	if (wandev == NULL)
+		return -EINVAL;
+	if (wandev->magic != ROUTER_MAGIC || wandev->name == NULL)
+		return -EINVAL;
+
+	namelen = strlen(wandev->name);
+	if (namelen == 0 || namelen > WAN_DRVNAME_SZ)
+		return -EINVAL;
+
+	/* Device names must be unique among registered devices. */
+	if (wanrouter_find_device(wandev->name) != NULL)
+		return -EEXIST;
+
+#ifdef WANDEBUG
+	printk(KERN_INFO "%s: registering WAN device %s\n",
+	       wanrouter_modname, wandev->name);
+#endif
+
+	/* Create the /proc entry for the device. */
+	err = wanrouter_proc_add(wandev);
+	if (err != 0) {
+		printk(KERN_INFO
+			"%s: can't create /proc/net/router/%s entry!\n",
+			wanrouter_modname, wandev->name);
+		return err;
+	}
+
+	/*
+	 * Reset the fields owned by the router and push the device
+	 * onto the head of the global list.
+	 */
+	wandev->ndev = 0;
+	wandev->dev  = NULL;
+	wandev->next = wanrouter_router_devlist;
+	wanrouter_router_devlist = wandev;
+
+	return 0;
+}
+
+/*
+ *	Unregister WAN device.
+ *	o shut down device
+ *	o unlink device data space from the linked list
+ *	o delete device entry in the /proc/net/router directory
+ *	o decrement module use count
+ *
+ *	Return:		0	Ok
+ *			<0	error.
+ *	Context:	process
+ */
+
+
+int unregister_wan_device(char *name)
+{
+	struct wan_device *wandev;
+	struct wan_device *prev = NULL;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	/* Locate the device, remembering its predecessor for unlinking. */
+	wandev = wanrouter_router_devlist;
+	while (wandev && strcmp(wandev->name, name)) {
+		prev = wandev;
+		wandev = wandev->next;
+	}
+	if (wandev == NULL)
+		return -ENODEV;
+
+#ifdef WANDEBUG
+	printk(KERN_INFO "%s: unregistering WAN device %s\n",
+	       wanrouter_modname, name);
+#endif
+
+	/* The result of the shutdown is deliberately not looked at. */
+	if (wandev->state != WAN_UNCONFIGURED)
+		wanrouter_device_shutdown(wandev);
+
+	if (prev == NULL)
+		wanrouter_router_devlist = wandev->next;
+	else
+		prev->next = wandev->next;
+
+	wanrouter_proc_delete(wandev);
+	return 0;
+}
+
+#if 0
+
+/*
+ *	Encapsulate packet.
+ *
+ *	Return:	encapsulation header size
+ *		< 0	- unsupported Ethertype
+ *
+ *	Notes:
+ *	1. This function may be called on interrupt context.
+ */
+
+
+int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
+			  unsigned short type)
+{
+	int hdr_len = 0;
+
+	switch (type) {
+	case ETH_P_IP:		/* IP datagram encapsulation */
+		/* One-byte header: just the NLPID. */
+		hdr_len += 1;
+		skb_push(skb, 1);
+		skb->data[0] = NLPID_IP;
+		break;
+
+	case ETH_P_IPX:		/* SNAP encapsulation */
+	case ETH_P_ARP:
+		/* 7 bytes: pad, NLPID, 3-byte OUI, 2-byte Ethertype. */
+		hdr_len += 7;
+		skb_push(skb, 7);
+		skb->data[0] = 0;
+		skb->data[1] = NLPID_SNAP;
+		skb_copy_to_linear_data_offset(skb, 2, wanrouter_oui_ether,
+					       sizeof(wanrouter_oui_ether));
+		*((unsigned short*)&skb->data[5]) = htons(type);
+		break;
+
+	default:		/* Unknown packet type */
+		printk(KERN_INFO
+			"%s: unsupported Ethertype 0x%04X on interface %s!\n",
+			wanrouter_modname, type, dev->name);
+		hdr_len = -EINVAL;
+	}
+	return hdr_len;
+}
+
+
+/*
+ *	Decapsulate packet.
+ *
+ *	Return:	Ethertype (in network order)
+ *			0	unknown encapsulation
+ *
+ *	Notes:
+ *	1. This function may be called on interrupt context.
+ */
+
+
+__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+	int cnt = skb->data[0] ? 0 : 1;	/* there may be a pad present */
+	__be16 ethertype;
+
+	/* cnt indexes the NLPID byte and ends up as the header length. */
+	switch (skb->data[cnt]) {
+	case NLPID_IP:		/* IP datagram */
+		ethertype = htons(ETH_P_IP);
+		cnt += 1;
+		break;
+
+	case NLPID_SNAP:	/* SNAP encapsulation */
+		if (memcmp(&skb->data[cnt + 1], wanrouter_oui_ether,
+			   sizeof(wanrouter_oui_ether))){
+			printk(KERN_INFO
+				"%s: unsupported SNAP OUI %02X-%02X-%02X "
+				"on interface %s!\n", wanrouter_modname,
+				skb->data[cnt+1], skb->data[cnt+2],
+				skb->data[cnt+3], dev->name);
+			return 0;
+		}
+		/* The Ethertype follows the NLPID and the 3-byte OUI. */
+		ethertype = *((__be16*)&skb->data[cnt+4]);
+		cnt += 6;
+		break;
+
+	/* add other protocols, e.g. CLNP, ESIS, ISIS, if needed */
+
+	default:
+		printk(KERN_INFO
+			"%s: unsupported NLPID 0x%02X on interface %s!\n",
+			wanrouter_modname, skb->data[cnt], dev->name);
+		return 0;
+	}
+	skb->protocol = ethertype;
+	skb->pkt_type = PACKET_HOST;	/*	Physically point to point */
+	/* Strip the encapsulation header found above. */
+	skb_pull(skb, cnt);
+	skb_reset_mac_header(skb);
+	return ethertype;
+}
+
+#endif  /*  0  */
+
+/*
+ *	WAN device IOCTL.
+ *	o find WAN device associated with this node
+ *	o execute requested action or pass command to the device driver
+ */
+
+long wanrouter_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	void __user *uarg = (void __user *)arg;
+	struct proc_dir_entry *pde;
+	struct wan_device *wandev;
+	int rc = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Only commands from the router ioctl group are accepted. */
+	if ((cmd >> 8) != ROUTER_IOCTL)
+		return -EINVAL;
+
+	/* The WAN device hangs off the proc entry's private data. */
+	pde = PDE(inode);
+	if (!pde || !pde->data)
+		return -EINVAL;
+
+	wandev = pde->data;
+	if (wandev->magic != ROUTER_MAGIC)
+		return -EINVAL;
+
+	mutex_lock(&wanrouter_mutex);
+	switch (cmd) {
+	case ROUTER_SETUP:
+		rc = wanrouter_device_setup(wandev, uarg);
+		break;
+
+	case ROUTER_DOWN:
+		rc = wanrouter_device_shutdown(wandev);
+		break;
+
+	case ROUTER_STAT:
+		rc = wanrouter_device_stat(wandev, uarg);
+		break;
+
+	case ROUTER_IFNEW:
+		rc = wanrouter_device_new_if(wandev, uarg);
+		break;
+
+	case ROUTER_IFDEL:
+		rc = wanrouter_device_del_if(wandev, uarg);
+		break;
+
+	case ROUTER_IFSTAT:
+		/* Accepted, but nothing is done: success is reported. */
+		break;
+
+	default:
+		/* Driver-private commands go to the driver, if it has a hook. */
+		if (cmd < ROUTER_USER || cmd > ROUTER_USER_MAX ||
+		    !wandev->ioctl)
+			rc = -EINVAL;
+		else
+			rc = wandev->ioctl(wandev, cmd, arg);
+		break;
+	}
+	mutex_unlock(&wanrouter_mutex);
+
+	return rc;
+}
+
+/*
+ *	WAN Driver IOCTL Handlers
+ */
+
+/*
+ *	Setup WAN link device.
+ *	o verify user address space
+ *	o allocate kernel memory and copy configuration data to kernel space
+ *	o if configuration data includes extension, copy it to kernel space too
+ *	o call driver's setup() entry point
+ */
+
+static int wanrouter_device_setup(struct wan_device *wandev,
+				  wandev_conf_t __user *u_conf)
+{
+	wandev_conf_t *conf;
+	void *fw;
+	int err = -EINVAL;	/* default for every validation failure */
+
+	if (wandev->setup == NULL) {	/* Nothing to do ? */
+		printk(KERN_INFO "%s: ERROR, No setup script: wandev->setup()\n",
+				wandev->name);
+		return 0;
+	}
+
+	conf = kmalloc(sizeof(wandev_conf_t), GFP_KERNEL);
+	if (conf == NULL) {
+		printk(KERN_INFO "%s: ERROR, Failed to allocate kernel memory !\n",
+				wandev->name);
+		return -ENOBUFS;
+	}
+
+	if (copy_from_user(conf, u_conf, sizeof(wandev_conf_t))) {
+		printk(KERN_INFO "%s: Failed to copy user config data to kernel space!\n",
+				wandev->name);
+		err = -EFAULT;
+		goto out;
+	}
+
+	if (conf->magic != ROUTER_MAGIC) {
+		printk(KERN_INFO "%s: ERROR, Invalid MAGIC Number\n",
+				wandev->name);
+		goto out;
+	}
+
+	/* Without a firmware image there is nothing to hand to the driver. */
+	if (!conf->data_size || !conf->data) {
+		printk(KERN_INFO
+		    "%s: ERROR, No firmware found ! Firmware size = %i !\n",
+				wandev->name, conf->data_size);
+		goto out;
+	}
+
+	if (conf->data_size > 128000) {
+		printk(KERN_INFO
+		    "%s: ERROR, Invalid firmware data size %i !\n",
+				wandev->name, conf->data_size);
+		goto out;
+	}
+
+	fw = vmalloc(conf->data_size);
+	if (!fw) {
+		printk(KERN_INFO
+			"%s: ERROR, Failed allocate kernel memory !\n",
+			wandev->name);
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	if (copy_from_user(fw, conf->data, conf->data_size)) {
+		printk(KERN_INFO
+		     "%s: ERROR, Failed to copy from user data !\n",
+		       wandev->name);
+		err = -EFAULT;
+	} else {
+		/* Let the driver see the kernel copy of the firmware. */
+		conf->data = fw;
+		err = wandev->setup(wandev, conf);
+	}
+	vfree(fw);
+
+out:
+	kfree(conf);
+	return err;
+}
+
+/*
+ *	Shutdown WAN device.
+ *	o delete all not opened logical channels for this device
+ *	o call driver's shutdown() entry point
+ */
+
+static int wanrouter_device_shutdown(struct wan_device *wandev)
+{
+	struct net_device *dev;
+	int err = 0;
+
+	if (wandev->state == WAN_UNCONFIGURED)
+		return 0;
+
+	printk(KERN_INFO "\n%s: Shutting Down!\n",wandev->name);
+
+	/*
+	 * Deleting an interface frees its net_device, so the next one is
+	 * always re-read from the head of the list rather than followed
+	 * from the element just removed.
+	 */
+	dev = wandev->dev;
+	while (dev != NULL) {
+		err = wanrouter_delete_interface(wandev, dev->name);
+		if (err)
+			return err;
+		dev = wandev->dev;
+	}
+
+	/* There are opened interfaces left. */
+	if (wandev->ndev)
+		return -EBUSY;
+
+	if (wandev->shutdown)
+		err = wandev->shutdown(wandev);
+
+	return err;
+}
+
+/*
+ *	Get WAN device status & statistics.
+ */
+
+static int wanrouter_device_stat(struct wan_device *wandev,
+				 wandev_stat_t __user *u_stat)
+{
+	wandev_stat_t stat;
+
+	/* Zero the whole structure before it is copied to user space. */
+	memset(&stat, 0, sizeof(stat));
+
+	/* Ask device driver to update device statistics */
+	if ((wandev->state != WAN_UNCONFIGURED) && wandev->update)
+		wandev->update(wandev);
+
+	/* Fill out structure */
+	stat.ndev  = wandev->ndev;
+	stat.state = wandev->state;
+
+	/* Returns -EFAULT if the user buffer cannot be written. */
+	if (copy_to_user(u_stat, &stat, sizeof(stat)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *	Create new WAN interface.
+ *	o copy configuration data to kernel address space and validate it
+ *	o call driver's new_if() entry point
+ *	o register network interface
+ *	o append the interface to the device's interface list
+ *
+ *	Return:	0		success
+ *		-ENODEV		device unconfigured or driver has no new_if()
+ *		-ENOBUFS	out of memory
+ *		-EFAULT		configuration could not be copied from user
+ *		-EINVAL		bad magic number, or no net_device available
+ *		-EPROTONOSUPPORT Multi-Port PPP was requested
+ *		other		error from new_if() or register_netdev()
+ */
+
+static int wanrouter_device_new_if(struct wan_device *wandev,
+				   wanif_conf_t __user *u_conf)
+{
+	wanif_conf_t *cnf;
+	struct net_device *dev = NULL;
+	int err;
+
+	if ((wandev->state == WAN_UNCONFIGURED) || (wandev->new_if == NULL))
+		return -ENODEV;
+
+	cnf = kmalloc(sizeof(wanif_conf_t), GFP_KERNEL);
+	if (!cnf)
+		return -ENOBUFS;
+
+	err = -EFAULT;
+	if (copy_from_user(cnf, u_conf, sizeof(wanif_conf_t)))
+		goto out;
+
+	err = -EINVAL;
+	if (cnf->magic != ROUTER_MAGIC)
+		goto out;
+
+	if (cnf->config_id == WANCONFIG_MPPP) {
+		printk(KERN_INFO "%s: Wanpipe Mulit-Port PPP support has not been compiled in!\n",
+				wandev->name);
+		err = -EPROTONOSUPPORT;
+		goto out;
+	}
+
+	err = wandev->new_if(wandev, dev, cnf);
+	if (err)
+		goto out;
+
+	/*
+	 * new_if() receives 'dev' by value, so it cannot hand a freshly
+	 * allocated net_device back through it.  Refuse to go on without
+	 * one instead of dereferencing a NULL pointer below.
+	 */
+	if (dev == NULL) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Register network interface. This will invoke init()
+	 * function supplied by the driver.  If device registered
+	 * successfully, add it to the interface list.
+	 */
+	if (dev->name == NULL) {
+		err = -EINVAL;
+	} else {
+
+		#ifdef WANDEBUG
+		printk(KERN_INFO "%s: registering interface %s...\n",
+			wanrouter_modname, dev->name);
+		#endif
+
+		err = register_netdev(dev);
+	}
+
+	if (err) {
+		/* Let the driver undo new_if(), then drop the device. */
+		if (wandev->del_if)
+			wandev->del_if(wandev, dev);
+		free_netdev(dev);
+		goto out;
+	}
+
+	{
+		unsigned long smp_flags = 0;
+
+		lock_adapter_irq(&wandev->lock, &smp_flags);
+
+		if (wandev->dev == NULL) {
+			wandev->dev = dev;
+		} else {
+			struct net_device *tail = wandev->dev;
+
+			/* Walk to the last interface, then link after it. */
+			while (DEV_TO_SLAVE(tail))
+				tail = DEV_TO_SLAVE(tail);
+			DEV_TO_SLAVE(tail) = dev;
+		}
+		++wandev->ndev;
+
+		unlock_adapter_irq(&wandev->lock, &smp_flags);
+	}
+
+out:
+	kfree(cnf);
+	return err;
+}
+
+
+/*
+ *	Delete WAN logical channel.
+ *	 o verify user address space
+ *	 o copy configuration data to kernel address space
+ */
+
+static int wanrouter_device_del_if(struct wan_device *wandev, char __user *u_name)
+{
+	char ifname[WAN_IFNAME_SZ + 1];
+	int rc;
+
+	if (wandev->state == WAN_UNCONFIGURED)
+		return -ENODEV;
+
+	/*
+	 * The buffer is one byte longer than what is copied and is zeroed
+	 * first, so the name is always NUL-terminated.
+	 */
+	memset(ifname, 0, sizeof(ifname));
+	if (copy_from_user(ifname, u_name, WAN_IFNAME_SZ))
+		return -EFAULT;
+
+	rc = wanrouter_delete_interface(wandev, ifname);
+	if (rc)
+		return rc;
+
+	/* If last interface being deleted, shutdown card
+	 * This helps with administration at leaf nodes
+	 * (You can tell if the person at the other end of the phone
+	 * has an interface configured) and avoids DoS vulnerabilities
+	 * in binary driver files - this fixes a problem with the current
+	 * Sangoma driver going into strange states when all the network
+	 * interfaces are deleted and the link irrecoverably disconnected.
+	 */
+	if (wandev->ndev || !wandev->shutdown)
+		return 0;
+
+	return wandev->shutdown(wandev);
+}
+
+/*
+ *	Miscellaneous Functions
+ */
+
+/*
+ *	Find WAN device by name.
+ *	Return pointer to the WAN device data space or NULL if device not found.
+ */
+
+static struct wan_device *wanrouter_find_device(char *name)
+{
+	struct wan_device *cur = wanrouter_router_devlist;
+
+	/* Linear search; stops at the first match or at the list end. */
+	while (cur && strcmp(cur->name, name))
+		cur = cur->next;
+
+	return cur;
+}
+
+/*
+ *	Delete WAN logical channel identified by its name.
+ *	o find logical channel by its name
+ *	o call driver's del_if() entry point
+ *	o unregister network interface
+ *	o unlink channel data space from linked list of channels
+ *	o release channel data space
+ *
+ *	Return:	0		success
+ *		-ENODEV		channel not found.
+ *		-EBUSY		interface is open
+ *
+ *	Note: there is no 'force' option. An interface that is still running
+ *	is never destroyed; -EBUSY is returned instead, so the interface
+ *	has to be brought down before it can be deleted.
+ */
+
+static int wanrouter_delete_interface(struct wan_device *wandev, char *name)
+{
+	struct net_device *dev = NULL, *prev = NULL;
+	unsigned long smp_flags=0;
+
+	/* Find the interface by name, remembering its predecessor. */
+	lock_adapter_irq(&wandev->lock, &smp_flags);
+	dev = wandev->dev;
+	prev = NULL;
+	while (dev && strcmp(name, dev->name)) {
+		/* The private area starts with the link to the next one. */
+		struct net_device **slave = netdev_priv(dev);
+		prev = dev;
+		dev = *slave;
+	}
+	unlock_adapter_irq(&wandev->lock, &smp_flags);
+
+	if (dev == NULL)
+		return -ENODEV;	/* interface not found */
+
+	if (netif_running(dev))
+		return -EBUSY;	/* interface in use */
+
+	/* The driver hook is called with the adapter lock released. */
+	if (wandev->del_if)
+		wandev->del_if(wandev, dev);
+
+	/*
+	 * Unlink the interface.  NOTE(review): 'prev' was looked up before
+	 * the lock was dropped above - confirm the list cannot change in
+	 * between.
+	 */
+	lock_adapter_irq(&wandev->lock, &smp_flags);
+	if (prev) {
+		struct net_device **prev_slave = netdev_priv(prev);
+		struct net_device **slave = netdev_priv(dev);
+
+		*prev_slave = *slave;
+	} else {
+		struct net_device **slave = netdev_priv(dev);
+		wandev->dev = *slave;
+	}
+	--wandev->ndev;
+	unlock_adapter_irq(&wandev->lock, &smp_flags);
+
+	printk(KERN_INFO "%s: unregistering '%s'\n", wandev->name, dev->name);
+
+	unregister_netdev(dev);
+
+	free_netdev(dev);
+
+	return 0;
+}
+
+/* Take @lock with spin_lock_irqsave(), storing the flags in *@smp_flags. */
+static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+	__acquires(lock)
+{
+	spin_lock_irqsave(lock, *smp_flags);
+}
+
+
+/* Release @lock, restoring the flags saved by lock_adapter_irq(). */
+static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
+	__releases(lock)
+{
+	spin_unlock_irqrestore(lock, *smp_flags);
+}
+
+EXPORT_SYMBOL(register_wan_device);
+EXPORT_SYMBOL(unregister_wan_device);
+
+MODULE_LICENSE("GPL");
+
+/*
+ *	End
+ */
diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c
new file mode 100644
index 00000000..f3463953
--- /dev/null
+++ b/net/wanrouter/wanproc.c
@@ -0,0 +1,382 @@
+/*****************************************************************************
+* wanproc.c	WAN Router Module. /proc filesystem interface.
+*
+*		This module is completely hardware-independent and provides
+*		access to the router using Linux /proc filesystem.
+*
+* Author: 	Gideon Hack
+*
+* Copyright:	(c) 1995-1999 Sangoma Technologies Inc.
+*
+*		This program is free software; you can redistribute it and/or
+*		modify it under the terms of the GNU General Public License
+*		as published by the Free Software Foundation; either version
+*		2 of the License, or (at your option) any later version.
+* ============================================================================
+* Jun 02, 1999  Gideon Hack	Updates for Linux 2.2.X kernels.
+* Jun 29, 1997	Alan Cox	Merged with 1.0.3 vendor code
+* Jan 29, 1997	Gene Kozin	v1.0.1. Implemented /proc read routines
+* Jan 30, 1997	Alan Cox	Hacked around for 2.1
+* Dec 13, 1996	Gene Kozin	Initial version (based on Sangoma's WANPIPE)
+*****************************************************************************/
+
+#include <linux/init.h>		/* __initfunc et al. */
+#include <linux/stddef.h>	/* offsetof(), etc. */
+#include <linux/errno.h>	/* return codes */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/wanrouter.h>	/* WAN router API definitions */
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <asm/io.h>
+
+#define PROC_STATS_FORMAT "%30s: %12lu\n"
+
+/****** Defines and Macros **************************************************/
+
+/* WANCONFIG_* id -> display string; evaluates @prot more than once. */
+#define PROT_DECODE(prot) (((prot) == WANCONFIG_FR) ? " FR" : \
+			   ((prot) == WANCONFIG_X25) ? " X25" : \
+			   ((prot) == WANCONFIG_PPP) ? " PPP" : \
+			   ((prot) == WANCONFIG_CHDLC) ? " CHDLC" : \
+			   ((prot) == WANCONFIG_MPPP) ? " MPPP" : " Unknown")
+
+/****** Function Prototypes *************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+/* Miscellaneous */
+
+/*
+ *	Structures for interfacing with the /proc filesystem.
+ *	Router creates its own directory /proc/net/router with the following
+ *	entries:
+ *	config		device configuration
+ *	status		global device statistics
+ *	<device>	entry for each WAN device
+ */
+
+/*
+ *	Generic /proc/net/router/<file> file and inode operations
+ */
+
+/*
+ *	/proc/net/router
+ */
+
+static DEFINE_MUTEX(config_mutex);
+static struct proc_dir_entry *proc_router;
+
+/* Strings */
+
+/*
+ *	Interface functions
+ */
+
+/****** Proc filesystem entry points ****************************************/
+
+/*
+ *	Iterator
+ */
+static void *r_start(struct seq_file *m, loff_t *pos)
+	__acquires(kernel_lock)
+{
+	struct wan_device *cur;
+	loff_t skip = *pos;
+
+	mutex_lock(&config_mutex);
+	if (skip == 0)
+		return SEQ_START_TOKEN;	/* position 0 is the header row */
+	/* Position N (N >= 1) is the N-th device on the router list. */
+	for (cur = wanrouter_router_devlist; --skip && cur; cur = cur->next)
+		;
+	return cur;
+}
+
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct wan_device *cur = v;	/* meaningless if v is the start token */
+	++*pos;
+	return (v != SEQ_START_TOKEN) ? cur->next : wanrouter_router_devlist;
+}
+
+static void r_stop(struct seq_file *m, void *v)
+	__releases(kernel_lock)
+{
+	mutex_unlock(&config_mutex);	/* pairs with mutex_lock() in r_start() */
+}
+
+static int config_show(struct seq_file *m, void *v)
+{
+	struct wan_device *wd = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "Device name    | port |IRQ|DMA|  mem.addr  |"
+			    "mem.size|option1|option2|option3|option4\n");
+	} else if (wd->state) {
+		/* Only devices with a non-zero state get a row. */
+		seq_printf(m, "%-15s|0x%-4X|%3u|%3u| 0x%-8lX |0x%-6X|%7u|%7u|%7u|%7u\n",
+			   wd->name, wd->ioport, wd->irq, wd->dma, wd->maddr, wd->msize,
+			   wd->hw_opt[0], wd->hw_opt[1], wd->hw_opt[2], wd->hw_opt[3]);
+	}
+	return 0;
+}
+
+static int status_show(struct seq_file *m, void *v)
+{
+	struct wan_device *wd = v;
+	const char *station = "N/A";
+	const char *link;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "Device name    |protocol|station|interface|"
+			    "clocking|baud rate| MTU |ndev|link state\n");
+		return 0;
+	}
+	if (!wd->state)
+		return 0;
+	/* Only the FR and X25 configurations report a station role. */
+	if (wd->config_id == WANCONFIG_FR)
+		station = wd->station ? "Node" : "CPE";
+	else if (wd->config_id == WANCONFIG_X25)
+		station = wd->station ? "DCE" : "DTE";
+	seq_printf(m, "%-15s|%-8s| %-7s| %-9s|%-8s|%9u|%5u|%3u |",
+		   wd->name, PROT_DECODE(wd->config_id), station,
+		   wd->interface ? "V.35" : "RS-232",
+		   wd->clocking ? "internal" : "external",
+		   wd->bps, wd->mtu, wd->ndev);
+	switch (wd->state) {
+	case WAN_UNCONFIGURED:
+		link = "unconfigured";
+		break;
+	case WAN_DISCONNECTED:
+		link = "disconnected";
+		break;
+	case WAN_CONNECTING:
+		link = "connecting";
+		break;
+	case WAN_CONNECTED:
+		link = "connected";
+		break;
+	default:
+		link = "invalid";
+		break;
+	}
+	seq_printf(m, "%-12s\n", link);
+	return 0;
+}
+
+static const struct seq_operations config_op = {
+	.start	= r_start,
+	.next	= r_next,
+	.stop	= r_stop,
+	.show	= config_show,	/* only .show differs from status_op */
+};
+
+static const struct seq_operations status_op = {
+	.start	= r_start,
+	.next	= r_next,
+	.stop	= r_stop,
+	.show	= status_show,	/* only .show differs from config_op */
+};
+
+static int config_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &config_op);	/* iterate devices via config_op */
+}
+
+static int status_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &status_op);	/* iterate devices via status_op */
+}
+
+static const struct file_operations config_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = config_open,	/* seq_open() with config_op */
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+static const struct file_operations status_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = status_open,	/* seq_open() with status_op */
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+static int wandev_show(struct seq_file *m, void *v)
+{
+	struct wan_device *wandev = m->private;	/* passed to single_open() */
+
+	if (wandev->magic != ROUTER_MAGIC)
+		return 0;	/* bad magic: emit nothing */
+
+	if (!wandev->state) {
+		seq_puts(m, "device is not configured!\n");
+		return 0;
+	}
+
+	/* Update device statistics */
+	if (wandev->update) {
+		int err = wandev->update(wandev);
+		if (err == -EAGAIN) {
+			seq_puts(m, "Device is busy!\n");
+			return 0;
+		}
+		if (err) {	/* any other failure is reported the same way */
+			seq_puts(m, "Device is not configured!\n");
+			return 0;
+		}
+	}
+	/* One "label: value" line per counter, laid out by PROC_STATS_FORMAT */
+	seq_printf(m, PROC_STATS_FORMAT,
+		"total packets received", wandev->stats.rx_packets);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"total packets transmitted", wandev->stats.tx_packets);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"total bytes received", wandev->stats.rx_bytes);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"total bytes transmitted", wandev->stats.tx_bytes);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"bad packets received", wandev->stats.rx_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"packet transmit problems", wandev->stats.tx_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"received frames dropped", wandev->stats.rx_dropped);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"transmit frames dropped", wandev->stats.tx_dropped);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"multicast packets received", wandev->stats.multicast);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"transmit collisions", wandev->stats.collisions);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"receive length errors", wandev->stats.rx_length_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"receiver overrun errors", wandev->stats.rx_over_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"CRC errors", wandev->stats.rx_crc_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"frame format errors (aborts)", wandev->stats.rx_frame_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"receiver fifo overrun", wandev->stats.rx_fifo_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"receiver missed packet", wandev->stats.rx_missed_errors);
+	seq_printf(m, PROC_STATS_FORMAT,
+		"aborted frames transmitted", wandev->stats.tx_aborted_errors);
+	return 0;
+}
+
+static int wandev_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, wandev_show, PDE(inode)->data); /* data: the wan_device */
+}
+
+static const struct file_operations wandev_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = wandev_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,	/* pairs with single_open() in wandev_open() */
+	.unlocked_ioctl  = wanrouter_ioctl,
+};
+
+/*
+ *	Initialize router proc interface.
+ */
+
+int __init wanrouter_proc_init(void)
+{
+	/* The directory comes first; both files are created inside it. */
+	proc_router = proc_mkdir(ROUTER_NAME, init_net.proc_net);
+	if (proc_router == NULL)
+		return -ENOMEM;
+
+	if (proc_create("config", S_IRUGO, proc_router, &config_fops) == NULL)
+		goto undo_dir;
+	if (proc_create("status", S_IRUGO, proc_router, &status_fops) == NULL)
+		goto undo_config;
+
+	return 0;
+
+	/* Error unwind, in reverse order of creation. */
+undo_config:
+	remove_proc_entry("config", proc_router);
+undo_dir:
+	remove_proc_entry(ROUTER_NAME, init_net.proc_net);
+	return -ENOMEM;
+}
+
+/*
+ *	Clean up router proc interface.
+ */
+
+void wanrouter_proc_cleanup(void)
+{
+	remove_proc_entry("config", proc_router);
+	remove_proc_entry("status", proc_router);
+	remove_proc_entry(ROUTER_NAME, init_net.proc_net); /* directory last */
+}
+
+/*
+ *	Add directory entry for WAN device (0, -EINVAL or -ENOMEM).
+ */
+
+int wanrouter_proc_add(struct wan_device* wandev)
+{
+	if (wandev->magic != ROUTER_MAGIC)
+		return -EINVAL;
+
+	/* Attach wandev at creation so an early open() never sees NULL data. */
+	wandev->dent = proc_create_data(wandev->name, S_IRUGO, proc_router,
+					&wandev_fops, wandev);
+	if (!wandev->dent)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ *	Delete directory entry for WAN device.
+ */
+int wanrouter_proc_delete(struct wan_device* wandev)
+{
+	if (wandev->magic != ROUTER_MAGIC)
+		return -EINVAL;
+	remove_proc_entry(wandev->name, proc_router);	/* entry named after the device */
+	return 0;	/* NOTE(review): wandev->dent is not cleared here */
+}
+
+#else
+
+/*
+ *	No /proc - output stubs
+ */
+
+int __init wanrouter_proc_init(void)
+{
+	return 0;	/* nothing to set up without procfs */
+}
+
+void wanrouter_proc_cleanup(void)
+{
+}
+
+int wanrouter_proc_add(struct wan_device *wandev)
+{
+	return 0;	/* always succeeds: no entry is created */
+}
+
+int wanrouter_proc_delete(struct wan_device *wandev)
+{
+	return 0;	/* always succeeds: no entry to remove */
+}
+
+#endif
+
+/*
+ *	End
+ */
+
diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig
new file mode 100644
index 00000000..e4d97ab4
--- /dev/null
+++ b/net/wimax/Kconfig
@@ -0,0 +1,39 @@
+#
+# WiMAX LAN device configuration
+#
+
+menuconfig WIMAX
+	tristate "WiMAX Wireless Broadband support"
+	depends on RFKILL || !RFKILL
+	help
+
+	  Select to configure support for devices that provide
+	  wireless broadband connectivity using the WiMAX protocol
+	  (IEEE 802.16).
+
+	  Please note that most of these devices require signing up
+	  for a service plan with a provider.
+
+	  The different WiMAX drivers can be enabled in the menu entry
+
+	  Device Drivers > Network device support > WiMAX Wireless
+	  Broadband devices
+
+	  If unsure, it is safe to select M (module).
+
+config WIMAX_DEBUG_LEVEL
+	int "WiMAX debug level"
+	depends on WIMAX
+	default 8
+	help
+
+	  Select the maximum debug verbosity level to be compiled into
+	  the WiMAX stack code.
+
+	  By default, debug messages are disabled at runtime and can
+	  be selectively enabled for different parts of the code using
+	  the sysfs debug-levels file.
+
+	  If set at zero, this will compile out all the debug code.
+
+	  It is recommended that it is left at 8.
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
new file mode 100644
index 00000000..8f1510d0
--- /dev/null
+++ b/net/wimax/Makefile
@@ -0,0 +1,14 @@
+
+obj-$(CONFIG_WIMAX)		+= wimax.o
+
+wimax-y :=		\
+	id-table.o	\
+	op-msg.o	\
+	op-reset.o	\
+	op-rfkill.o	\
+	op-state-get.o	\
+	stack.o
+
+wimax-$(CONFIG_DEBUG_FS) += debugfs.o
+
+
diff --git a/net/wimax/debug-levels.h b/net/wimax/debug-levels.h
new file mode 100644
index 00000000..0975adba
--- /dev/null
+++ b/net/wimax/debug-levels.h
@@ -0,0 +1,43 @@
+/*
+ * Linux WiMAX Stack
+ * Debug levels control file for the wimax module
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+#ifndef __debug_levels__h__
+#define __debug_levels__h__
+
+/* Maximum compile and run time debug level for all submodules */
+#define D_MODULENAME wimax
+#define D_MASTER CONFIG_WIMAX_DEBUG_LEVEL
+
+#include <linux/wimax/debug.h>
+
+/* List of all the enabled modules */
+enum d_module {	/* same names wimax_debugfs_add() registers */
+	D_SUBMODULE_DECLARE(debugfs),
+	D_SUBMODULE_DECLARE(id_table),
+	D_SUBMODULE_DECLARE(op_msg),
+	D_SUBMODULE_DECLARE(op_reset),
+	D_SUBMODULE_DECLARE(op_rfkill),
+	D_SUBMODULE_DECLARE(op_state_get),
+	D_SUBMODULE_DECLARE(stack),
+};
+
+#endif /* #ifndef __debug_levels__h__ */
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
new file mode 100644
index 00000000..6c9bedb7
--- /dev/null
+++ b/net/wimax/debugfs.c
@@ -0,0 +1,80 @@
+/*
+ * Linux WiMAX
+ * Debugfs support
+ *
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+#include <linux/debugfs.h>
+#include <linux/wimax.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE debugfs
+#include "debug-levels.h"
+
+/* Needs a local 'result' and an 'error:' label in the calling function. */
+#define __debugfs_register(prefix, name, parent)			\
+do {									\
+	result = d_level_register_debugfs(prefix, name, parent);	\
+	if (result < 0)							\
+		goto error;						\
+} while (0)
+
+
+int wimax_debugfs_add(struct wimax_dev *wimax_dev)
+{
+	int result;
+	struct net_device *net_dev = wimax_dev->net_dev;
+	struct device *dev = net_dev->dev.parent;
+	struct dentry *dentry;
+	char buf[128];
+
+	/* One top-level debugfs directory per device: "wimax:<ifname>". */
+	snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name);
+	dentry = debugfs_create_dir(buf, NULL);
+	if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENODEV)
+		return 0;	/* No debugfs support */
+	if (IS_ERR_OR_NULL(dentry)) {
+		/* debugfs_create_dir() may signal failure with plain NULL. */
+		result = dentry ? PTR_ERR(dentry) : -ENOMEM;
+		dev_err(dev, "Can't create debugfs dentry: %d\n", result);
+		return result;
+	}
+	wimax_dev->debugfs_dentry = dentry;
+	__debugfs_register("wimax_dl_", debugfs, dentry);
+	__debugfs_register("wimax_dl_", id_table, dentry);
+	__debugfs_register("wimax_dl_", op_msg, dentry);
+	__debugfs_register("wimax_dl_", op_reset, dentry);
+	__debugfs_register("wimax_dl_", op_rfkill, dentry);
+	__debugfs_register("wimax_dl_", op_state_get, dentry);
+	__debugfs_register("wimax_dl_", stack, dentry);
+	return 0;
+
+error:
+	/* Drop the partial tree and forget it so it is never removed twice. */
+	debugfs_remove_recursive(wimax_dev->debugfs_dentry);
+	wimax_dev->debugfs_dentry = NULL;
+	return result;
+}
+
+void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
+{
+	debugfs_remove_recursive(wimax_dev->debugfs_dentry); /* dir set by wimax_debugfs_add() */
+}
+
+
diff --git a/net/wimax/id-table.c b/net/wimax/id-table.c
new file mode 100644
index 00000000..72273abf
--- /dev/null
+++ b/net/wimax/id-table.c
@@ -0,0 +1,145 @@
+/*
+ * Linux WiMAX
+ * Mapping of generic netlink family IDs to net devices
+ *
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * We assign a single generic netlink family ID to each device (to
+ * simplify lookup).
+ *
+ * We need a way to map family ID to a wimax_dev pointer.
+ *
+ * The idea is to use a very simple lookup. Using a netlink attribute
+ * with (for example) the interface name implies a heavier search over
+ * all the network devices; seemed kind of a waste given that we know
+ * we are looking for a WiMAX device and that most systems will have
+ * just a single WiMAX adapter.
+ *
+ * We put all the WiMAX devices in the system in a linked list and
+ * match the generic link family ID against the list.
+ *
+ * By using a linked list, the case of a single adapter in the system
+ * becomes (almost) no overhead, while still working for many more. If
+ * it ever goes beyond two, I'll be surprised.
+ */
+#include <linux/device.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/list.h>
+#include <linux/wimax.h>
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE id_table
+#include "debug-levels.h"
+
+
+static DEFINE_SPINLOCK(wimax_id_table_lock);
+static struct list_head wimax_id_table = LIST_HEAD_INIT(wimax_id_table);
+
+
+/*
+ * wimax_id_table_add - add a wimax_dev to the device lookup list
+ *
+ * @wimax_dev: WiMAX device descriptor to make findable through
+ *     wimax_dev_get_by_genl_info().
+ *
+ * The device is linked at the head of wimax_id_table under
+ * wimax_id_table_lock; the list never needs resizing.
+ */
+void wimax_id_table_add(struct wimax_dev *wimax_dev)
+{
+	d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+	spin_lock(&wimax_id_table_lock);
+	list_add(&wimax_dev->id_table_node, &wimax_id_table);	/* at head */
+	spin_unlock(&wimax_id_table_lock);
+	d_fnend(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+}
+
+
+/*
+ * wimax_dev_get_by_genl_info - look up a wimax_dev by interface index
+ *
+ * Walks the list of registered WiMAX devices and returns the one
+ * whose net_dev->ifindex equals @ifindex, holding a reference on its
+ * net_dev; returns NULL if there is no match.
+ *
+ * When done, the reference should be dropped with
+ * 'dev_put(wimax_dev->net_dev)'.
+ */
+struct wimax_dev *wimax_dev_get_by_genl_info(
+	struct genl_info *info, int ifindex)
+{
+	struct wimax_dev *itr, *match = NULL;
+
+	d_fnstart(3, NULL, "(info %p ifindex %d)\n", info, ifindex);
+	spin_lock(&wimax_id_table_lock);
+	list_for_each_entry(itr, &wimax_id_table, id_table_node) {
+		if (itr->net_dev->ifindex != ifindex)
+			continue;
+		dev_hold(itr->net_dev);	/* reference taken under the lock */
+		match = itr;
+		break;
+	}
+	if (match == NULL)
+		d_printf(1, NULL, "wimax: no devices found with ifindex %d\n",
+			 ifindex);
+	spin_unlock(&wimax_id_table_lock);
+	d_fnend(3, NULL, "(info %p ifindex %d) = %p\n",
+		info, ifindex, match);
+	return match;
+}
+
+
+/*
+ * wimax_id_table_rm - Remove a wimax_dev from the device lookup list
+ *
+ * @wimax_dev: WiMAX device descriptor to unlink from the table
+ */
+void wimax_id_table_rm(struct wimax_dev *wimax_dev)
+{
+	spin_lock(&wimax_id_table_lock);
+	list_del_init(&wimax_dev->id_table_node); /* unlink and re-init the node */
+	spin_unlock(&wimax_id_table_lock);
+}
+
+
+/*
+ * Release the gennetlink family id / mapping table
+ *
+ * On debug, verify that the table is empty upon removal. We want the
+ * code always compiled, to ensure it doesn't bit rot. It will be
+ * compiled out if CONFIG_BUG is disabled.
+ */
+void wimax_id_table_release(void)
+{
+	struct wimax_dev *wimax_dev;
+
+#ifndef CONFIG_BUG
+	return;		/* without CONFIG_BUG the check below is skipped */
+#endif
+	spin_lock(&wimax_id_table_lock);
+	list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
+		printk(KERN_ERR "BUG: %s wimax_dev %p ifindex %d not cleared\n",
+		       __func__, wimax_dev, wimax_dev->net_dev->ifindex);
+		WARN_ON(1);	/* one warning per entry still on the list */
+	}
+	spin_unlock(&wimax_id_table_lock);
+}
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
new file mode 100644
index 00000000..d5b7c377
--- /dev/null
+++ b/net/wimax/op-msg.c
@@ -0,0 +1,432 @@
+/*
+ * Linux WiMAX
+ * Generic messaging interface between userspace and driver/device
+ *
+ *
+ * Copyright (C) 2007-2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This implements a direct communication channel between user space and
+ * the driver/device, by which free form messages can be sent back and
+ * forth.
+ *
+ * This is intended for device-specific features, vendor quirks, etc.
+ *
+ * See include/net/wimax.h
+ *
+ * GENERIC NETLINK ENCODING AND CAPACITY
+ *
+ * A destination "pipe name" is added to each message; it is up to the
+ * drivers to assign or use those names (if using them at all).
+ *
+ * Messages are encoded as a binary netlink attribute using nla_put()
+ * using type NLA_UNSPEC (as some versions of libnl still in
+ * deployment don't yet understand NLA_BINARY).
+ *
+ * The maximum capacity of this transport is PAGESIZE per message (so
+ * the actual payload will be bit smaller depending on the
+ * netlink/generic netlink attributes and headers).
+ *
+ * RECEPTION OF MESSAGES
+ *
+ * When a message is received from user space, it is passed verbatim
+ * to the driver calling wimax_dev->op_msg_from_user(). The return
+ * value from this function is passed back to user space as an ack
+ * over the generic netlink protocol.
+ *
+ * The stack doesn't do any processing or interpretation of these
+ * messages.
+ *
+ * SENDING MESSAGES
+ *
+ * Messages can be sent with wimax_msg().
+ *
+ * If the message delivery needs to happen on a different context to
+ * that of its creation, wimax_msg_alloc() can be used to get a
+ * pointer to the message that can be delivered later on with
+ * wimax_msg_send().
+ *
+ * ROADMAP
+ *
+ * wimax_gnl_doit_msg_from_user()    Process a message from user space
+ *   wimax_dev_get_by_genl_info()
+ *   wimax_dev->op_msg_from_user()   Delivery of message to the driver
+ *
+ * wimax_msg()                       Send a message to user space
+ *   wimax_msg_alloc()
+ *   wimax_msg_send()
+ */
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE op_msg
+#include "debug-levels.h"
+
+
+/**
+ * wimax_msg_alloc - Create a new skb for sending a message to userspace
+ *
+ * @wimax_dev: WiMAX device descriptor
+ * @pipe_name: "named pipe" the message will be sent to
+ * @msg: pointer to the message data to send
+ * @size: size of the message to send (in bytes), including the header.
+ * @gfp_flags: flags for memory allocation.
+ *
+ * Returns: the new skb if ok, ERR_PTR()-encoded negative errno on error
+ *
+ * Description:
+ *
+ * Allocates an skb that will contain the message to send to user
+ * space over the messaging pipe and initializes it, copying the
+ * payload.
+ *
+ * Once this call is done, you can deliver it with
+ * wimax_msg_send().
+ *
+ * IMPORTANT:
+ *
+ * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as
+ * wimax_msg_send() depends on skb->data being placed at the
+ * beginning of the user message.
+ *
+ * Unlike other WiMAX stack calls, this call can be used way early,
+ * even before wimax_dev_add() is called, as long as the
+ * wimax_dev->net_dev pointer is set to point to a proper
+ * net_dev. This is so that drivers can use it early in case they need
+ * to send stuff around or communicate with user space.
+ */
+struct sk_buff *wimax_msg_alloc(struct wimax_dev *wimax_dev,
+				const char *pipe_name,
+				const void *msg, size_t size,
+				gfp_t gfp_flags)
+{
+	int result;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	size_t msg_size;
+	void *genl_msg;
+	struct sk_buff *skb;
+
+	/* nla_put_string() stores the trailing NUL too, so budget for it. */
+	msg_size = nla_total_size(size)
+		+ nla_total_size(sizeof(u32))
+		+ (pipe_name ? nla_total_size(strlen(pipe_name) + 1) : 0);
+	result = -ENOMEM;
+	skb = genlmsg_new(msg_size, gfp_flags);
+	if (skb == NULL)
+		goto error;
+	genl_msg = genlmsg_put(skb, 0, 0, &wimax_gnl_family,
+			       0, WIMAX_GNL_OP_MSG_TO_USER);
+	if (genl_msg == NULL) {
+		dev_err(dev, "no memory to create generic netlink message\n");
+		goto error;
+	}
+	/* Attributes in order: ifindex, optional pipe name, payload. */
+	result = nla_put_u32(skb, WIMAX_GNL_MSG_IFIDX,
+			     wimax_dev->net_dev->ifindex);
+	if (result < 0) {
+		dev_err(dev, "no memory to add ifindex attribute\n");
+		goto error;
+	}
+	if (pipe_name) {
+		result = nla_put_string(skb, WIMAX_GNL_MSG_PIPE_NAME,
+					pipe_name);
+		if (result < 0) {
+			dev_err(dev, "no memory to add pipe_name attribute\n");
+			goto error;
+		}
+	}
+	result = nla_put(skb, WIMAX_GNL_MSG_DATA, size, msg);
+	if (result < 0) {
+		dev_err(dev, "no memory to add payload (msg %p size %zu) in "
+			"attribute: %d\n", msg, size, result);
+		goto error;
+	}
+	genlmsg_end(skb, genl_msg);
+	return skb;
+
+error:
+	nlmsg_free(skb);	/* skb is NULL if genlmsg_new() failed */
+	return ERR_PTR(result);
+}
+EXPORT_SYMBOL_GPL(wimax_msg_alloc);
+
+
+/**
+ * wimax_msg_data_len - Return a pointer and size of a message's payload
+ *
+ * @msg: Pointer to a message created with wimax_msg_alloc()
+ * @size: Pointer to where to store the message's size
+ *
+ * Returns the pointer to the message data.
+ */
+const void *wimax_msg_data_len(struct sk_buff *msg, size_t *size)
+{
+	struct nlmsghdr *hdr = (void *) msg->head;
+	struct nlattr *payload = nlmsg_find_attr(hdr,
+						 sizeof(struct genlmsghdr),
+						 WIMAX_GNL_MSG_DATA);
+
+	if (payload != NULL) {
+		*size = nla_len(payload);
+		return nla_data(payload);
+	}
+	printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n");
+	return NULL;	/* *size is left untouched on this path */
+}
+EXPORT_SYMBOL_GPL(wimax_msg_data_len);
+
+
+/**
+ * wimax_msg_data - Return a pointer to a message's payload
+ *
+ * @msg: Pointer to a message created with wimax_msg_alloc()
+ */
+const void *wimax_msg_data(struct sk_buff *msg)
+{
+	struct nlmsghdr *hdr = (void *) msg->head;
+	struct nlattr *payload = nlmsg_find_attr(hdr,
+						 sizeof(struct genlmsghdr),
+						 WIMAX_GNL_MSG_DATA);
+
+	if (payload != NULL)
+		return nla_data(payload);
+	/* No payload attribute: log it and hand back NULL. */
+	printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n");
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(wimax_msg_data);
+
+
+/**
+ * wimax_msg_len - Return a message's payload length
+ *
+ * @msg: Pointer to a message created with wimax_msg_alloc()
+ */
+ssize_t wimax_msg_len(struct sk_buff *msg)
+{
+	struct nlmsghdr *hdr = (void *) msg->head;
+	struct nlattr *payload = nlmsg_find_attr(hdr,
+						 sizeof(struct genlmsghdr),
+						 WIMAX_GNL_MSG_DATA);
+
+	if (payload != NULL)
+		return nla_len(payload);
+	/* No payload attribute: log it and report -EINVAL. */
+	printk(KERN_ERR "Cannot find attribute WIMAX_GNL_MSG_DATA\n");
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(wimax_msg_len);
+
+
+/**
+ * wimax_msg_send - Send a pre-allocated message to user space
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @skb: &struct sk_buff returned by wimax_msg_alloc(). Note the
+ *     ownership of @skb is transferred to this function.
+ *
+ * Returns: 0 if ok, < 0 errno code on error
+ *
+ * Description:
+ *
+ * Sends a free-form message that was preallocated with
+ * wimax_msg_alloc() and filled up.
+ *
+ * Assumes that once you pass an skb to this function for sending, it
+ * owns it and will release it when done (on success).
+ *
+ * IMPORTANT:
+ *
+ * Don't use skb_push()/skb_pull()/skb_reserve() on the skb, as
+ * wimax_msg_send() depends on skb->data being placed at the
+ * beginning of the user message.
+ *
+ * Unlike other WiMAX stack calls, this call can be used way early,
+ * even before wimax_dev_add() is called, as long as the
+ * wimax_dev->net_dev pointer is set to point to a proper
+ * net_dev. This is so that drivers can use it early in case they need
+ * to send stuff around or communicate with user space.
+ */
+int wimax_msg_send(struct wimax_dev *wimax_dev, struct sk_buff *skb)
+{
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	void *msg = skb->data;	/* captured up front for the debug dump */
+	size_t size = skb->len;
+	might_sleep();
+
+	d_printf(1, dev, "CTX: wimax msg, %zu bytes\n", size);
+	d_dump(2, dev, msg, size);
+	genlmsg_multicast(skb, 0, wimax_gnl_mcg.id, GFP_KERNEL); /* result unused */
+	d_printf(1, dev, "CTX: genl multicast done\n");
+	return 0;	/* NOTE(review): always 0, even if the multicast failed */
+}
+EXPORT_SYMBOL_GPL(wimax_msg_send);
+
+
+/**
+ * wimax_msg - Send a message to user space
+ *
+ * @wimax_dev: WiMAX device descriptor (properly referenced)
+ * @pipe_name: "named pipe" the message will be sent to
+ * @buf: pointer to the message to send.
+ * @size: size of the buffer pointed to by @buf (in bytes).
+ * @gfp_flags: flags for memory allocation.
+ *
+ * Returns: %0 if ok, negative errno code on error.
+ *
+ * Description:
+ *
+ * Sends a free-form message to user space on the device @wimax_dev.
+ *
+ * NOTES:
+ *
+ * The contents of @buf are copied into a newly allocated skb, so the
+ * caller keeps ownership of @buf whether or not the call succeeds.
+ */
+int wimax_msg(struct wimax_dev *wimax_dev, const char *pipe_name,
+	      const void *buf, size_t size, gfp_t gfp_flags)
+{
+	struct sk_buff *skb;
+
+	/* Build the netlink message first; it carries a copy of @buf. */
+	skb = wimax_msg_alloc(wimax_dev, pipe_name, buf, size, gfp_flags);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	/* The skb is handed over to wimax_msg_send() from here on. */
+	return wimax_msg_send(wimax_dev, skb);
+}
+EXPORT_SYMBOL_GPL(wimax_msg);
+
+
+static const struct nla_policy wimax_gnl_msg_policy[WIMAX_GNL_ATTR_MAX + 1] = {
+	[WIMAX_GNL_MSG_IFIDX] = {
+		.type = NLA_U32,
+	},
+	[WIMAX_GNL_MSG_DATA] = {
+		.type = NLA_UNSPEC,	/* libnl doesn't grok BINARY yet */
+	},
+};	/* NOTE(review): WIMAX_GNL_MSG_PIPE_NAME has no entry here */
+
+
+/*
+ * Relays a message from user space to the driver
+ *
+ * The skb is passed to the driver-specific function with the netlink
+ * and generic netlink headers already stripped.
+ *
+ * This call will block while handling/relaying the message.
+ */
+static
+int wimax_gnl_doit_msg_from_user(struct sk_buff *skb, struct genl_info *info)
+{
+	int result, ifindex;
+	struct wimax_dev *wimax_dev;
+	struct device *dev;
+	struct nlmsghdr *nlh = info->nlhdr;
+	char *pipe_name;
+	void *msg_buf;
+	size_t msg_len;
+
+	might_sleep();
+	d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+	result = -ENODEV;
+	if (info->attrs[WIMAX_GNL_MSG_IFIDX] == NULL) {
+		printk(KERN_ERR "WIMAX_GNL_MSG_FROM_USER: can't find IFIDX "
+		       "attribute\n");
+		goto error_no_wimax_dev;
+	}
+	ifindex = nla_get_u32(info->attrs[WIMAX_GNL_MSG_IFIDX]);
+	wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+	if (wimax_dev == NULL)
+		goto error_no_wimax_dev;
+	dev = wimax_dev_to_dev(wimax_dev);
+
+	/* Unpack arguments */
+	result = -EINVAL;
+	if (info->attrs[WIMAX_GNL_MSG_DATA] == NULL) {
+		dev_err(dev, "WIMAX_GNL_MSG_FROM_USER: can't find MSG_DATA "
+			"attribute\n");
+		goto error_no_data;
+	}
+	msg_buf = nla_data(info->attrs[WIMAX_GNL_MSG_DATA]);
+	msg_len = nla_len(info->attrs[WIMAX_GNL_MSG_DATA]);
+
+	if (info->attrs[WIMAX_GNL_MSG_PIPE_NAME] == NULL)
+		pipe_name = NULL;
+	else {
+		struct nlattr *attr = info->attrs[WIMAX_GNL_MSG_PIPE_NAME];
+		/* libnl-1.1 does not yet support NLA_NUL_STRING, so bound
+		 * the copy by nla_len(); kstrndup() NUL-terminates it and
+		 * may allocate fewer than nla_len() + 1 bytes. */
+		result = -ENOMEM;
+		pipe_name = kstrndup(nla_data(attr), nla_len(attr), GFP_KERNEL);
+		if (pipe_name == NULL)
+			goto error_alloc;
+	}
+	mutex_lock(&wimax_dev->mutex);
+	result = wimax_dev_is_ready(wimax_dev);
+	if (result == -ENOMEDIUM)
+		result = 0;
+	if (result < 0)
+		goto error_not_ready;
+	result = -ENOSYS;
+	if (wimax_dev->op_msg_from_user == NULL)
+		goto error_noop;
+
+	d_printf(1, dev,
+		 "CRX: nlmsghdr len %u type %u flags 0x%04x seq 0x%x pid %u\n",
+		 nlh->nlmsg_len, nlh->nlmsg_type, nlh->nlmsg_flags,
+		 nlh->nlmsg_seq, nlh->nlmsg_pid);
+	d_printf(1, dev, "CRX: wimax message %zu bytes\n", msg_len);
+	d_dump(2, dev, msg_buf, msg_len);
+
+	result = wimax_dev->op_msg_from_user(wimax_dev, pipe_name,
+					     msg_buf, msg_len, info);
+error_noop:
+error_not_ready:
+	mutex_unlock(&wimax_dev->mutex);
+error_alloc:
+	kfree(pipe_name);
+error_no_data:
+	dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+	d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+	return result;
+}
+
+
+/*
+ * Generic Netlink glue
+ */
+
+struct genl_ops wimax_gnl_msg_from_user = {
+	.cmd = WIMAX_GNL_OP_MSG_FROM_USER,
+	.flags = GENL_ADMIN_PERM,
+	.policy = wimax_gnl_msg_policy,	/* covers the IFIDX and DATA attributes */
+	.doit = wimax_gnl_doit_msg_from_user,
+	.dumpit = NULL,
+};
+
diff --git a/net/wimax/op-reset.c b/net/wimax/op-reset.c
new file mode 100644
index 00000000..68bedf3e
--- /dev/null
+++ b/net/wimax/op-reset.c
@@ -0,0 +1,140 @@
+/*
+ * Linux WiMAX
+ * Implement and export a method for resetting a WiMAX device
+ *
+ *
+ * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This implements a simple synchronous call to reset a WiMAX device.
+ *
+ * Resets aim at being warm, keeping the device handles active;
+ * however, when that fails, it falls back to a cold reset (that will
+ * disconnect and reconnect the device).
+ */
+
+#include <net/wimax.h>
+#include <net/genetlink.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_reset
+#include "debug-levels.h"
+
+
+/**
+ * wimax_reset - Reset a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Returns:
+ *
+ * %0 if ok and a warm reset was done (the device still exists in
+ * the system).
+ *
+ * -%ENODEV if a cold/bus reset had to be done (device has
+ * disconnected and reconnected, so current handle is not valid
+ * any more).
+ *
+ * -%EINVAL if the device is not even registered.
+ *
+ * Any other negative error code shall be considered as
+ * non-recoverable.
+ *
+ * Description:
+ *
+ * Called when wanting to reset the device for any reason. Device is
+ * taken back to power on status.
+ *
+ * This call blocks; on successful return, the device has completed the
+ * reset process and is ready to operate.
+ */
+int wimax_reset(struct wimax_dev *wimax_dev)
+{
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st cur_state;
+	int rc = -EINVAL;
+
+	might_sleep();
+	d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+	/* Hold the net device and sample the state under the main mutex */
+	mutex_lock(&wimax_dev->mutex);
+	dev_hold(wimax_dev->net_dev);
+	cur_state = wimax_dev->state;
+	mutex_unlock(&wimax_dev->mutex);
+	/* Below WIMAX_ST_DOWN the driver is not called; rc stays -EINVAL */
+	if (cur_state >= WIMAX_ST_DOWN) {
+		mutex_lock(&wimax_dev->mutex_reset);
+		rc = wimax_dev->op_reset(wimax_dev);
+		mutex_unlock(&wimax_dev->mutex_reset);
+	}
+	dev_put(wimax_dev->net_dev);
+	d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, rc);
+	return rc;
+}
+EXPORT_SYMBOL(wimax_reset);
+
+
+static const struct nla_policy wimax_gnl_reset_policy[WIMAX_GNL_ATTR_MAX + 1] = {
+	[WIMAX_GNL_RESET_IFIDX] = {
+		.type = NLA_U32,	/* read with nla_get_u32() as the ifindex */
+	},
+};
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the reset command from user space, return error code.
+ *
+ * One attribute: WIMAX_GNL_RESET_IFIDX, index of the interface to reset.
+ */
+static
+int wimax_gnl_doit_reset(struct sk_buff *skb, struct genl_info *info)
+{
+	int result, ifindex;
+	struct wimax_dev *wimax_dev;
+
+	d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+	result = -ENODEV;	/* for a missing attribute or unknown device */
+	if (info->attrs[WIMAX_GNL_RESET_IFIDX] == NULL) {
+		printk(KERN_ERR "WIMAX_GNL_OP_RESET: can't find IFIDX "
+			"attribute\n");
+		goto error_no_wimax_dev;
+	}
+	ifindex = nla_get_u32(info->attrs[WIMAX_GNL_RESET_IFIDX]);
+	wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+	if (wimax_dev == NULL)
+		goto error_no_wimax_dev;
+	/* Execute the operation and send the result back to user space */
+	result = wimax_reset(wimax_dev);
+	dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+	d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+	return result;
+}
+
+
+struct genl_ops wimax_gnl_reset = {
+	.cmd = WIMAX_GNL_OP_RESET,
+	.flags = GENL_ADMIN_PERM,
+	.policy = wimax_gnl_reset_policy,	/* one u32 attribute: IFIDX */
+	.doit = wimax_gnl_doit_reset,		/* handles the request */
+	.dumpit = NULL,				/* no dump callback */
+};
diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c
new file mode 100644
index 00000000..2609e445
--- /dev/null
+++ b/net/wimax/op-rfkill.c
@@ -0,0 +1,468 @@
+/*
+ * Linux WiMAX
+ * RF-kill framework integration
+ *
+ *
+ * Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This integrates into the Linux Kernel rfkill subsystem so that the
+ * drivers just have to do the bare minimal work, which is providing a
+ * method to set the software RF-Kill switch and to report changes in
+ * the software and hardware switch status.
+ *
+ * A non-polled generic rfkill device is embedded into the WiMAX
+ * subsystem's representation of a device.
+ *
+ * FIXME: Need polled support? Let drivers provide a poll routine
+ *	  and hand it to rfkill ops then?
+ *
+ * All device drivers have to do is after wimax_dev_init(), call
+ * wimax_report_rfkill_hw() and wimax_report_rfkill_sw() to update
+ * initial state and then every time it changes. See wimax.h:struct
+ * wimax_dev for more information.
+ *
+ * ROADMAP
+ *
+ * wimax_gnl_doit_rfkill()      User space calling wimax_rfkill()
+ *   wimax_rfkill()             Kernel calling wimax_rfkill()
+ *     __wimax_rf_toggle_radio()
+ *
+ * wimax_rfkill_set_radio_block()  RF-Kill subsystem calling
+ *   __wimax_rf_toggle_radio()
+ *
+ * __wimax_rf_toggle_radio()
+ *   wimax_dev->op_rfkill_sw_toggle() Driver backend
+ *   __wimax_state_change()
+ *
+ * wimax_report_rfkill_sw()     Driver reports state change
+ *   __wimax_state_change()
+ *
+ * wimax_report_rfkill_hw()     Driver reports state change
+ *   __wimax_state_change()
+ *
+ * wimax_rfkill_add()           Initialize/shutdown rfkill support
+ * wimax_rfkill_rm()            [called by wimax_dev_add/rm()]
+ */
+
+#include <net/wimax.h>
+#include <net/genetlink.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include <linux/rfkill.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_rfkill
+#include "debug-levels.h"
+
+/**
+ * wimax_report_rfkill_hw - Reports changes in the hardware RF switch
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New state of the RF Kill switch. %WIMAX_RF_ON radio on,
+ *     %WIMAX_RF_OFF radio off.
+ *
+ * When the device detects a change in the state of the hardware RF
+ * switch, it must call this function to let the WiMAX kernel stack
+ * know that the state has changed so it can be properly propagated.
+ *
+ * The WiMAX stack caches the state (the driver doesn't need to). As
+ * well, as the change is propagated it will come back as a request to
+ * change the software state to mirror the hardware state.
+ *
+ * If the device doesn't have a hardware kill switch, just report
+ * it on initialization as always on (%WIMAX_RF_ON, radio on).
+ */
+void wimax_report_rfkill_hw(struct wimax_dev *wimax_dev,
+			    enum wimax_rf_state state)
+{
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st next_st;
+	int rc;
+
+	d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+	/* Only the two concrete switch positions may be reported */
+	BUG_ON(state == WIMAX_RF_QUERY);
+	BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
+
+	mutex_lock(&wimax_dev->mutex);
+	rc = wimax_dev_is_ready(wimax_dev);
+	/* Nothing to do if the device is not ready or nothing changed */
+	if (rc < 0 || state == wimax_dev->rf_hw)
+		goto out_unlock;
+
+	/* Cache the new position, then derive the device state from both */
+	wimax_dev->rf_hw = state;
+	next_st = (wimax_dev->rf_hw == WIMAX_RF_ON &&
+		   wimax_dev->rf_sw == WIMAX_RF_ON) ?
+		WIMAX_ST_READY : WIMAX_ST_RADIO_OFF;
+	/*
+	 * Tell rfkill whether the hardware switch now has the radio off;
+	 * the value it returns only shows up in the debug trace below.
+	 */
+	rc = rfkill_set_hw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
+	__wimax_state_change(wimax_dev, next_st);
+out_unlock:
+	mutex_unlock(&wimax_dev->mutex);
+	d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
+		wimax_dev, state, rc);
+}
+EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw);
+
+
+/**
+ * wimax_report_rfkill_sw - Reports changes in the software RF switch
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on,
+ *     %WIMAX_RF_OFF radio off.
+ *
+ * Reports changes in the software RF switch state to the WiMAX
+ * stack.
+ *
+ * The main use is during initialization, so the driver can query the
+ * device for its current software radio kill switch state and feed it
+ * to the system.
+ *
+ * In principle, the device does not change the software state by
+ * itself. In practice, this can happen, as the device might decide to
+ * switch (in software) the radio off for different reasons.
+ */
+void wimax_report_rfkill_sw(struct wimax_dev *wimax_dev,
+			    enum wimax_rf_state state)
+{
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st next_st;
+	int rc;
+
+	d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+	/* Only the two concrete switch positions may be reported */
+	BUG_ON(state == WIMAX_RF_QUERY);
+	BUG_ON(state != WIMAX_RF_ON && state != WIMAX_RF_OFF);
+
+	mutex_lock(&wimax_dev->mutex);
+	rc = wimax_dev_is_ready(wimax_dev);
+	/* Nothing to do if the device is not ready or nothing changed */
+	if (rc < 0 || state == wimax_dev->rf_sw)
+		goto out_unlock;
+
+	/* Cache the new position, then derive the device state from both */
+	wimax_dev->rf_sw = state;
+	next_st = (wimax_dev->rf_hw == WIMAX_RF_ON &&
+		   wimax_dev->rf_sw == WIMAX_RF_ON) ?
+		WIMAX_ST_READY : WIMAX_ST_RADIO_OFF;
+	__wimax_state_change(wimax_dev, next_st);
+	/* Tell rfkill whether the software switch now has the radio off */
+	rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
+out_unlock:
+	mutex_unlock(&wimax_dev->mutex);
+	d_fnend(3, dev, "(wimax_dev %p state %u) = void [%d]\n",
+		wimax_dev, state, rc);
+}
+EXPORT_SYMBOL_GPL(wimax_report_rfkill_sw);
+
+
+/*
+ * Callback for the RF Kill toggle operation
+ *
+ * This function is called by:
+ *
+ * - The rfkill subsystem when the RF-Kill key is pressed in the
+ *   hardware and the driver notifies through
+ *   wimax_report_rfkill_hw(). The rfkill subsystem ends up calling back
+ *   here so the software RF Kill switch state is changed to reflect
+ *   the hardware switch state.
+ *
+ * - When the user sets the state through sysfs' rfkill/state file
+ *
+ * - When the user calls wimax_rfkill().
+ *
+ * This call blocks!
+ *
+ * WARNING! When we call rfkill_unregister(), this will be called with
+ * state 0!
+ *
+ * WARNING: wimax_dev must be locked
+ */
+static
+int __wimax_rf_toggle_radio(struct wimax_dev *wimax_dev,
+			    enum wimax_rf_state state)
+{
+	int result = 0;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st wimax_state;
+
+	might_sleep();
+	d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+	if (wimax_dev->rf_sw == state)	/* already there: return 0 */
+		goto out_no_change;
+	if (wimax_dev->op_rfkill_sw_toggle != NULL)
+		result = wimax_dev->op_rfkill_sw_toggle(wimax_dev, state);
+	else if (state == WIMAX_RF_OFF)	/* No op? can't turn off */
+		result = -ENXIO;
+	else				/* No op? can turn on */
+		result = 0;		/* should never happen tho */
+	if (result >= 0) {		/* any non-negative value is success */
+		result = 0;		/* ...and is reported as plain 0 */
+		wimax_dev->rf_sw = state;
+		wimax_state = state == WIMAX_RF_ON ?
+			WIMAX_ST_READY : WIMAX_ST_RADIO_OFF;
+		__wimax_state_change(wimax_dev, wimax_state);
+	}
+out_no_change:
+	d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n",
+		wimax_dev, state, result);
+	return result;
+}
+
+
+/*
+ * Translate from rfkill state to wimax state
+ *
+ * NOTE: Special state handling rules here
+ *
+ *     Just pretend the call didn't happen if we are in a state where
+ *     we know for sure it cannot be handled (WIMAX_ST_DOWN or
+ *     __WIMAX_ST_QUIESCING). rfkill() needs it to register and
+ *     unregister, as it will run this path.
+ *
+ * NOTE: This call will block until the operation is completed.
+ */
+static int wimax_rfkill_set_radio_block(void *data, bool blocked)
+{
+	struct wimax_dev *wimax_dev = data;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_rf_state wanted = blocked ? WIMAX_RF_OFF : WIMAX_RF_ON;
+	int rc = 0;
+
+	d_fnstart(3, dev, "(wimax_dev %p blocked %u)\n", wimax_dev, blocked);
+	mutex_lock(&wimax_dev->mutex);
+	/*
+	 * At or below __WIMAX_ST_QUIESCING the request is not handed to
+	 * the radio toggle; it is simply reported as successful (rc
+	 * stays 0).
+	 */
+	if (wimax_dev->state > __WIMAX_ST_QUIESCING)
+		rc = __wimax_rf_toggle_radio(wimax_dev, wanted);
+	mutex_unlock(&wimax_dev->mutex);
+	d_fnend(3, dev, "(wimax_dev %p blocked %u) = %d\n",
+		wimax_dev, blocked, rc);
+	return rc;
+}
+
+static const struct rfkill_ops wimax_rfkill_ops = {
+	.set_block = wimax_rfkill_set_radio_block, /* only callback set */
+};
+
+/**
+ * wimax_rfkill - Set the software RF switch state for a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * @state: New RF state.
+ *
+ * Returns:
+ *
+ * >= 0 toggle state if ok, < 0 errno code on error. The toggle state
+ * is returned as a bitmap, bit 0 being the hardware RF state, bit 1
+ * the software RF state.
+ *
+ * A bit value of 0 means the switch is disabled (%WIMAX_RF_ON, radio
+ * on); 1 means it is enabled (%WIMAX_RF_OFF, radio off).
+ *
+ * Description:
+ *
+ * Called by the user when he wants to request the WiMAX radio to be
+ * switched on (%WIMAX_RF_ON) or off (%WIMAX_RF_OFF). With
+ * %WIMAX_RF_QUERY, just the current state is returned.
+ *
+ * NOTE:
+ *
+ * This call will block until the operation is complete.
+ */
+int wimax_rfkill(struct wimax_dev *wimax_dev, enum wimax_rf_state state)
+{
+	int result;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+
+	d_fnstart(3, dev, "(wimax_dev %p state %u)\n", wimax_dev, state);
+	mutex_lock(&wimax_dev->mutex);
+	result = wimax_dev_is_ready(wimax_dev);
+	if (result < 0) {
+		/* While initializing, < 1.4.3 wimax-tools versions use
+		 * this call to check if the device is a valid WiMAX
+		 * device; so we allow it to proceed always,
+		 * considering the radios are all off. */
+		if (result == -ENOMEDIUM && state == WIMAX_RF_QUERY)
+			result = WIMAX_RF_OFF << 1 | WIMAX_RF_OFF;
+		goto error_not_ready;
+	}
+	switch (state) {
+	case WIMAX_RF_ON:
+	case WIMAX_RF_OFF:
+		result = __wimax_rf_toggle_radio(wimax_dev, state);
+		if (result < 0)
+			goto error;
+		rfkill_set_sw_state(wimax_dev->rfkill, state == WIMAX_RF_OFF);
+		break;
+	case WIMAX_RF_QUERY:	/* nothing to change, just report below */
+		break;
+	default:		/* any other value is rejected */
+		result = -EINVAL;
+		goto error;
+	}
+	result = wimax_dev->rf_sw << 1 | wimax_dev->rf_hw;
+error:
+error_not_ready:
+	mutex_unlock(&wimax_dev->mutex);
+	d_fnend(3, dev, "(wimax_dev %p state %u) = %d\n",
+		wimax_dev, state, result);
+	return result;
+}
+EXPORT_SYMBOL(wimax_rfkill);
+
+
+/*
+ * Register a new WiMAX device's RF Kill support
+ *
+ * WARNING: wimax_dev->mutex must be unlocked
+ */
+int wimax_rfkill_add(struct wimax_dev *wimax_dev)
+{
+	int result;
+	struct rfkill *rfkill;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+
+	d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+	/* Initialize RF Kill */
+	result = -ENOMEM;	/* returned if rfkill_alloc() fails */
+	rfkill = rfkill_alloc(wimax_dev->name, dev, RFKILL_TYPE_WIMAX,
+			      &wimax_rfkill_ops, wimax_dev);
+	if (rfkill == NULL)
+		goto error_rfkill_allocate;
+
+	d_printf(1, dev, "rfkill %p\n", rfkill);
+
+	wimax_dev->rfkill = rfkill;
+
+	rfkill_init_sw_state(rfkill, 1); /* presumably starts soft-blocked */
+	result = rfkill_register(wimax_dev->rfkill);
+	if (result < 0)
+		goto error_rfkill_register;
+
+	/* If there is no SW toggle op, SW RFKill is always on */
+	if (wimax_dev->op_rfkill_sw_toggle == NULL)
+		wimax_dev->rf_sw = WIMAX_RF_ON;
+
+	d_fnend(3, dev, "(wimax_dev %p) = 0\n", wimax_dev);
+	return 0;
+
+error_rfkill_register:
+	rfkill_destroy(wimax_dev->rfkill);	/* release the allocation */
+error_rfkill_allocate:
+	d_fnend(3, dev, "(wimax_dev %p) = %d\n", wimax_dev, result);
+	return result;
+}
+
+
+/*
+ * Deregister a WiMAX device's RF Kill support
+ *
+ * Ick, we can't call rfkill_free() after rfkill_unregister()...oh
+ * well.
+ *
+ * WARNING: wimax_dev->mutex must be unlocked
+ */
+void wimax_rfkill_rm(struct wimax_dev *wimax_dev)
+{
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	d_fnstart(3, dev, "(wimax_dev %p)\n", wimax_dev);
+	/* Mirror of wimax_rfkill_add(): unregister first, then destroy */
+	rfkill_unregister(wimax_dev->rfkill); rfkill_destroy(wimax_dev->rfkill);
+	d_fnend(3, dev, "(wimax_dev %p)\n", wimax_dev);
+}
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the rfkill command from user space, return a combination
+ * value that describe the states of the different toggles.
+ *
+ * Only one attribute: the new state requested (on, off or no change,
+ * just query).
+ */
+
+static const struct nla_policy wimax_gnl_rfkill_policy[WIMAX_GNL_ATTR_MAX + 1] = {
+	[WIMAX_GNL_RFKILL_IFIDX] = {
+		.type = NLA_U32,	/* read with nla_get_u32() as the ifindex */
+	},
+	[WIMAX_GNL_RFKILL_STATE] = {
+		.type = NLA_U32		/* enum wimax_rf_state */
+	},
+};
+
+
+static
+int wimax_gnl_doit_rfkill(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *ifidx_attr = info->attrs[WIMAX_GNL_RFKILL_IFIDX];
+	struct nlattr *state_attr = info->attrs[WIMAX_GNL_RFKILL_STATE];
+	struct wimax_dev *wimax_dev;
+	enum wimax_rf_state wanted;
+	int rc = -ENODEV;
+
+	d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+	if (ifidx_attr == NULL) {
+		printk(KERN_ERR "WIMAX_GNL_OP_RFKILL: can't find IFIDX "
+			"attribute\n");
+		goto out;
+	}
+	wimax_dev = wimax_dev_get_by_genl_info(info, nla_get_u32(ifidx_attr));
+	if (wimax_dev == NULL)
+		goto out;
+	/* Past this point every exit goes through dev_put() */
+	if (state_attr == NULL) {
+		rc = -EINVAL;
+		dev_err(wimax_dev_to_dev(wimax_dev),
+			"WIMAX_GNL_RFKILL: can't find RFKILL_STATE "
+			"attribute\n");
+		goto out_put;
+	}
+	wanted = nla_get_u32(state_attr);
+
+	/* Execute the operation and hand its result back to user space */
+	rc = wimax_rfkill(wimax_dev, wanted);
+out_put:
+	dev_put(wimax_dev->net_dev);
+out:
+	d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, rc);
+	return rc;
+}
+
+
+struct genl_ops wimax_gnl_rfkill = {
+	.cmd = WIMAX_GNL_OP_RFKILL,
+	.flags = GENL_ADMIN_PERM,
+	.policy = wimax_gnl_rfkill_policy,	/* u32 IFIDX and u32 STATE */
+	.doit = wimax_gnl_doit_rfkill,		/* handles the request */
+	.dumpit = NULL,				/* no dump callback */
+};
+
diff --git a/net/wimax/op-state-get.c b/net/wimax/op-state-get.c
new file mode 100644
index 00000000..aff8776e
--- /dev/null
+++ b/net/wimax/op-state-get.c
@@ -0,0 +1,83 @@
+/*
+ * Linux WiMAX
+ * Implement and export a method for getting a WiMAX device current state
+ *
+ * Copyright (C) 2009 Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * Based on previous WiMAX core work by:
+ *  Copyright (C) 2008 Intel Corporation <linux-wimax@intel.com>
+ *  Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <net/wimax.h>
+#include <net/genetlink.h>
+#include <linux/wimax.h>
+#include <linux/security.h>
+#include "wimax-internal.h"
+
+#define D_SUBMODULE op_state_get
+#include "debug-levels.h"
+
+
+static const struct nla_policy wimax_gnl_state_get_policy[WIMAX_GNL_ATTR_MAX + 1] = {
+	[WIMAX_GNL_STGET_IFIDX] = {
+		.type = NLA_U32,	/* read with nla_get_u32() as the ifindex */
+	},
+};
+
+
+/*
+ * Exporting to user space over generic netlink
+ *
+ * Parse the state get command from user space, return the current
+ * state of the device (or a negative errno code on error).
+ *
+ * One attribute: WIMAX_GNL_STGET_IFIDX, index of the interface to query.
+ */
+static
+int wimax_gnl_doit_state_get(struct sk_buff *skb, struct genl_info *info)
+{
+	int result, ifindex;
+	struct wimax_dev *wimax_dev;
+
+	d_fnstart(3, NULL, "(skb %p info %p)\n", skb, info);
+	result = -ENODEV;	/* for a missing attribute or unknown device */
+	if (info->attrs[WIMAX_GNL_STGET_IFIDX] == NULL) {
+		printk(KERN_ERR "WIMAX_GNL_OP_STATE_GET: can't find IFIDX "
+			"attribute\n");
+		goto error_no_wimax_dev;
+	}
+	ifindex = nla_get_u32(info->attrs[WIMAX_GNL_STGET_IFIDX]);
+	wimax_dev = wimax_dev_get_by_genl_info(info, ifindex);
+	if (wimax_dev == NULL)
+		goto error_no_wimax_dev;
+	/* Execute the operation and send the result back to user space */
+	result = wimax_state_get(wimax_dev);
+	dev_put(wimax_dev->net_dev);
+error_no_wimax_dev:
+	d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
+	return result;
+}
+
+
+struct genl_ops wimax_gnl_state_get = {
+	.cmd = WIMAX_GNL_OP_STATE_GET,
+	.flags = GENL_ADMIN_PERM,
+	.policy = wimax_gnl_state_get_policy,	/* one u32 attribute: IFIDX */
+	.doit = wimax_gnl_doit_state_get,	/* handles the request */
+	.dumpit = NULL,				/* no dump callback */
+};
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
new file mode 100644
index 00000000..ee99e7df
--- /dev/null
+++ b/net/wimax/stack.c
@@ -0,0 +1,633 @@
+/*
+ * Linux WiMAX
+ * Initialization, addition and removal of wimax devices
+ *
+ *
+ * Copyright (C) 2005-2006 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This implements:
+ *
+ *   - basic life cycle of 'struct wimax_dev' [wimax_dev_*()]; on
+ *     addition/registration initialize all subfields and allocate
+ *     generic netlink resources for user space communication. On
+ *     removal/unregistration, undo all that.
+ *
+ *   - device state machine [wimax_state_change()] and support to send
+ *     reports to user space when the state changes
+ *     [wimax_gnl_re_state_change*()].
+ *
+ * See include/net/wimax.h for rationales and design.
+ *
+ * ROADMAP
+ *
+ * [__]wimax_state_change()     Called by drivers to update device's state
+ *   wimax_gnl_re_state_change_alloc()
+ *   wimax_gnl_re_state_change_send()
+ *
+ * wimax_dev_init()	        Init a device
+ * wimax_dev_add()              Register
+ *   wimax_rfkill_add()
+ *   wimax_gnl_add()            Register all the generic netlink resources.
+ *   wimax_id_table_add()
+ * wimax_dev_rm()               Unregister
+ *   wimax_id_table_rm()
+ *   wimax_gnl_rm()
+ *   wimax_rfkill_rm()
+ */
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <net/genetlink.h>
+#include <linux/netdevice.h>
+#include <linux/wimax.h>
+#include "wimax-internal.h"
+
+
+#define D_SUBMODULE stack
+#include "debug-levels.h"
+
+static char wimax_debug_params[128];
+module_param_string(debug, wimax_debug_params, sizeof(wimax_debug_params),
+		    0644);
+MODULE_PARM_DESC(debug,
+		 "String of space-separated NAME:VALUE pairs, where NAMEs "
+		 "are the different debug submodules and VALUE are the "
+		 "initial debug value to set.");
+
+/*
+ * Authoritative source for the RE_STATE_CHANGE attribute policy
+ *
+ * We don't really use it here, but /me likes to keep the definition
+ * close to where the data is generated.
+ */
+/*
+static const struct nla_policy wimax_gnl_re_status_change[WIMAX_GNL_ATTR_MAX + 1] = {
+	[WIMAX_GNL_STCH_STATE_OLD] = { .type = NLA_U8 },
+	[WIMAX_GNL_STCH_STATE_NEW] = { .type = NLA_U8 },
+};
+*/
+
+
+/*
+ * Allocate a Report State Change message
+ *
+ * @header: save it, you need it for _send()
+ *
+ * Creates and fills a basic state change message; different code
+ * paths can then add more attributes to the message as needed.
+ *
+ * Use wimax_gnl_re_state_change_send() to send the returned skb.
+ *
+ * Returns: skb with the genl message if ok, IS_ERR() ptr on error
+ *     with an errno code.
+ */
+static
+struct sk_buff *wimax_gnl_re_state_change_alloc(
+	struct wimax_dev *wimax_dev,
+	enum wimax_st new_state, enum wimax_st old_state,
+	void **header)
+{
+	int result;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	void *data;
+	struct sk_buff *report_skb;
+
+	d_fnstart(3, dev, "(wimax_dev %p new_state %u old_state %u)\n",
+		  wimax_dev, new_state, old_state);
+	result = -ENOMEM;	/* used if nlmsg_new/genlmsg_put fail */
+	report_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (report_skb == NULL) {
+		dev_err(dev, "RE_STCH: can't create message\n");
+		goto error_new;
+	}
+	data = genlmsg_put(report_skb, 0, wimax_gnl_mcg.id, &wimax_gnl_family,
+			   0, WIMAX_GNL_RE_STATE_CHANGE);
+	if (data == NULL) {
+		dev_err(dev, "RE_STCH: can't put data into message\n");
+		goto error_put;
+	}
+	*header = data;		/* _send() passes it to genlmsg_end() */
+
+	result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_OLD, old_state);
+	if (result < 0) {
+		dev_err(dev, "RE_STCH: Error adding OLD attr: %d\n", result);
+		goto error_put;
+	}
+	result = nla_put_u8(report_skb, WIMAX_GNL_STCH_STATE_NEW, new_state);
+	if (result < 0) {
+		dev_err(dev, "RE_STCH: Error adding NEW attr: %d\n", result);
+		goto error_put;
+	}
+	result = nla_put_u32(report_skb, WIMAX_GNL_STCH_IFIDX,
+			     wimax_dev->net_dev->ifindex);
+	if (result < 0) {
+		dev_err(dev, "RE_STCH: Error adding IFINDEX attribute\n");
+		goto error_put;
+	}
+	d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %p\n",
+		wimax_dev, new_state, old_state, report_skb);
+	return report_skb;
+
+error_put:
+	nlmsg_free(report_skb);	/* drop the partially built message */
+error_new:
+	d_fnend(3, dev, "(wimax_dev %p new_state %u old_state %u) = %d\n",
+		wimax_dev, new_state, old_state, result);
+	return ERR_PTR(result);
+}
+
+
+/*
+ * Send a Report State Change message (as created with _alloc).
+ *
+ * @report_skb: as returned by wimax_gnl_re_state_change_alloc()
+ * @header: as returned by wimax_gnl_re_state_change_alloc()
+ *
+ * Returns: 0 once the message was handed to genlmsg_multicast() (whose
+ *     own return value is not checked); -ENOMEM, with nothing sent, if
+ *     @report_skb is NULL.
+ */
+static
+int wimax_gnl_re_state_change_send(
+	struct wimax_dev *wimax_dev, struct sk_buff *report_skb,
+	void *header)
+{
+	int result = 0;
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	d_fnstart(3, dev, "(wimax_dev %p report_skb %p)\n",
+		  wimax_dev, report_skb);
+	if (report_skb == NULL) {
+		result = -ENOMEM;
+		goto out;
+	}
+	genlmsg_end(report_skb, header);
+	genlmsg_multicast(report_skb, 0, wimax_gnl_mcg.id, GFP_KERNEL);
+out:
+	d_fnend(3, dev, "(wimax_dev %p report_skb %p) = %d\n",
+		wimax_dev, report_skb, result);
+	return result;
+}
+
+
+static
+void __check_new_state(enum wimax_st old_state, enum wimax_st new_state,
+		       unsigned allowed_states_bm)
+{
+	/* Complain when new_state's bit is missing from the allowed bitmap */
+	if (WARN_ON(!((1 << new_state) & allowed_states_bm)))
+		printk(KERN_ERR "SW BUG! Forbidden state change %u -> %u\n",
+			old_state, new_state);
+}
+
+
+/*
+ * Set the current state of a WiMAX device [non-locking version of
+ * wimax_state_change(), which wraps this call in wimax_dev->mutex].
+ *
+ * A request for an invalid state is rejected with a warning; one for
+ * the current state is a no-op. Otherwise the transition is checked (a
+ * forbidden one only triggers a warning), exit/entry actions run, the
+ * state is recorded and a RE_STATE_CHANGE report is sent to user space.
+ */
+void __wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
+{
+	struct device *dev = wimax_dev_to_dev(wimax_dev);
+	enum wimax_st old_state = wimax_dev->state;
+	struct sk_buff *stch_skb;
+	void *header = NULL;	/* filled in by _alloc() only on success */
+
+	d_fnstart(3, dev, "(wimax_dev %p new_state %u [old %u])\n",
+		  wimax_dev, new_state, old_state);
+
+	if (WARN_ON(new_state >= __WIMAX_ST_INVALID)) {
+		dev_err(dev, "SW BUG: requesting invalid state %u\n",
+			new_state);
+		goto out;
+	}
+	if (old_state == new_state)
+		goto out;
+	stch_skb = wimax_gnl_re_state_change_alloc(
+		wimax_dev, new_state, old_state, &header);
+
+	/* Verify the state transition and do exit-from-state actions */
+	switch (old_state) {
+	case __WIMAX_ST_NULL:
+		__check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN);
+		break;
+	case WIMAX_ST_DOWN:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_UNINITIALIZED
+				  | 1 << WIMAX_ST_RADIO_OFF);
+		break;
+	case __WIMAX_ST_QUIESCING:
+		__check_new_state(old_state, new_state, 1 << WIMAX_ST_DOWN);
+		break;
+	case WIMAX_ST_UNINITIALIZED:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_RADIO_OFF);
+		break;
+	case WIMAX_ST_RADIO_OFF:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_READY);
+		break;
+	case WIMAX_ST_READY:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_RADIO_OFF
+				  | 1 << WIMAX_ST_SCANNING
+				  | 1 << WIMAX_ST_CONNECTING
+				  | 1 << WIMAX_ST_CONNECTED);
+		break;
+	case WIMAX_ST_SCANNING:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_RADIO_OFF
+				  | 1 << WIMAX_ST_READY
+				  | 1 << WIMAX_ST_CONNECTING
+				  | 1 << WIMAX_ST_CONNECTED);
+		break;
+	case WIMAX_ST_CONNECTING:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_RADIO_OFF
+				  | 1 << WIMAX_ST_READY
+				  | 1 << WIMAX_ST_SCANNING
+				  | 1 << WIMAX_ST_CONNECTED);
+		break;
+	case WIMAX_ST_CONNECTED:
+		__check_new_state(old_state, new_state,
+				  1 << __WIMAX_ST_QUIESCING
+				  | 1 << WIMAX_ST_RADIO_OFF
+				  | 1 << WIMAX_ST_READY);
+		netif_tx_disable(wimax_dev->net_dev);
+		netif_carrier_off(wimax_dev->net_dev);
+		break;
+	case __WIMAX_ST_INVALID:
+	default:
+		dev_err(dev, "SW BUG: wimax_dev %p is in unknown state %u\n",
+			wimax_dev, wimax_dev->state);
+		WARN_ON(1);
+		/* The report will never be sent; release it before bailing */
+		if (!IS_ERR(stch_skb))
+			nlmsg_free(stch_skb);
+		goto out;
+	}
+
+	/* Execute the actions of entry to the new state */
+	switch (new_state) {
+	case __WIMAX_ST_NULL:
+		dev_err(dev, "SW BUG: wimax_dev %p entering NULL state "
+			"from %u\n", wimax_dev, wimax_dev->state);
+		WARN_ON(1);		/* Nobody can enter this state */
+		break;
+	case WIMAX_ST_DOWN:
+	case __WIMAX_ST_QUIESCING:
+	case WIMAX_ST_UNINITIALIZED:
+	case WIMAX_ST_RADIO_OFF:
+	case WIMAX_ST_READY:
+	case WIMAX_ST_SCANNING:
+	case WIMAX_ST_CONNECTING:
+		break;			/* No entry actions for these */
+	case WIMAX_ST_CONNECTED:
+		netif_carrier_on(wimax_dev->net_dev);
+		netif_wake_queue(wimax_dev->net_dev);
+		break;
+	case __WIMAX_ST_INVALID:
+	default:
+		BUG();
+	}
+	__wimax_state_set(wimax_dev, new_state);
+	if (!IS_ERR(stch_skb))
+		wimax_gnl_re_state_change_send(wimax_dev, stch_skb, header);
+out:
+	d_fnend(3, dev, "(wimax_dev %p new_state %u [old %u]) = void\n",
+		wimax_dev, new_state, old_state);
+}
+
+
+/**
+ * wimax_state_change - Set the current state of a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor (properly referenced)
+ * @new_state: New state to switch to
+ *
+ * This implements the state changes for the wimax devices. It will
+ *
+ * - verify that the state transition is legal (for now it'll just
+ *   print a warning if not) according to the table in
+ *   linux/wimax.h's documentation for 'enum wimax_st'.
+ * - perform the actions needed for leaving the current state and
+ *   whichever are needed for entering the new state.
+ * - issue a report to user space indicating the new state (and an
+ *   optional payload with information about the new state).
+ *
+ * NOTE: this function takes @wimax_dev->mutex itself, so the caller
+ * must not hold it; code that already holds the mutex calls
+ * __wimax_state_change() instead.
+ */
+void wimax_state_change(struct wimax_dev *wimax_dev, enum wimax_st new_state)
+{
+	/*
+	 * A driver cannot take the wimax_dev out of the
+	 * __WIMAX_ST_NULL state unless by calling wimax_dev_add(). If
+	 * the wimax_dev's state is still NULL, we ignore any request
+	 * to change its state because it means it hasn't been yet
+	 * registered.
+	 *
+	 * There is no need to complain about it, as routines that
+	 * call this might be shared from different code paths that
+	 * are called before or after wimax_dev_add() has done its
+	 * job.
+	 */
+	mutex_lock(&wimax_dev->mutex);
+	if (wimax_dev->state > __WIMAX_ST_NULL)
+		__wimax_state_change(wimax_dev, new_state);
+	mutex_unlock(&wimax_dev->mutex);
+}
+EXPORT_SYMBOL_GPL(wimax_state_change);
+
+
+/**
+ * wimax_state_get() - Return the current state of a WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Returns: Current state of the device according to its driver.
+ */
+enum wimax_st wimax_state_get(struct wimax_dev *wimax_dev)
+{
+	enum wimax_st snapshot;
+
+	/* Sample the state while holding the device mutex */
+	mutex_lock(&wimax_dev->mutex);
+	snapshot = wimax_dev->state;
+	mutex_unlock(&wimax_dev->mutex);
+
+	return snapshot;
+}
+EXPORT_SYMBOL_GPL(wimax_state_get);
+
+
+/**
+ * wimax_dev_init - initialize a newly allocated instance
+ *
+ * @wimax_dev: WiMAX device descriptor to initialize.
+ *
+ * Initializes fields of a freshly allocated @wimax_dev instance. This
+ * function assumes that after allocation, the memory occupied by
+ * @wimax_dev was zeroed.
+ */
+void wimax_dev_init(struct wimax_dev *wimax_dev)
+{
+	/* Both locks, then the initial state and the id-table linkage */
+	mutex_init(&wimax_dev->mutex_reset);
+	mutex_init(&wimax_dev->mutex);
+	__wimax_state_set(wimax_dev, __WIMAX_ST_NULL);
+	INIT_LIST_HEAD(&wimax_dev->id_table_node);
+}
+EXPORT_SYMBOL_GPL(wimax_dev_init);
+
+/*
+ * The generic netlink operations are defined elsewhere; their extern
+ * declarations are kept right next to the table that lists them
+ * because both are a list of the same names and are easier to keep in
+ * sync this way.
+ */
+extern struct genl_ops
+	wimax_gnl_msg_from_user,
+	wimax_gnl_reset,
+	wimax_gnl_rfkill,
+	wimax_gnl_state_get;
+
+/*
+ * Table walked by wimax_subsys_init() / wimax_subsys_exit() to
+ * register and unregister every operation on wimax_gnl_family.
+ */
+static
+struct genl_ops *wimax_gnl_ops[] = {
+	&wimax_gnl_msg_from_user,
+	&wimax_gnl_reset,
+	&wimax_gnl_rfkill,
+	&wimax_gnl_state_get,
+};
+
+
+/*
+ * Format a hardware address as colon-separated hex bytes ("aa:bb:cc").
+ *
+ * @addr_str: destination buffer
+ * @addr_str_size: size of @addr_str in bytes
+ * @addr: address bytes to print
+ * @addr_len: number of bytes in @addr
+ *
+ * The result is always NUL terminated when @addr_str_size > 0, even
+ * for a zero-length address, and is truncated if the buffer is too
+ * small.
+ *
+ * Returns: number of characters written, not counting the
+ * terminating NUL.
+ */
+static
+size_t wimax_addr_scnprint(char *addr_str, size_t addr_str_size,
+			   unsigned char *addr, size_t addr_len)
+{
+	size_t cnt, total = 0;
+
+	/* Leave a valid empty string behind when there is nothing to print */
+	if (addr_str_size > 0)
+		addr_str[0] = '\0';
+	/*
+	 * The separator goes in front of every byte but the first, so
+	 * no NUL has to be pushed through the format string and the
+	 * returned length only counts visible characters.
+	 */
+	for (cnt = 0; cnt < addr_len; cnt++)
+		total += scnprintf(addr_str + total, addr_str_size - total,
+				   "%s%02x", cnt ? ":" : "", addr[cnt]);
+	return total;
+}
+
+
+/**
+ * wimax_dev_add - Register a new WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor (as embedded in your @net_dev's
+ *     priv data). You must have called wimax_dev_init() on it before.
+ *
+ * @net_dev: net device the @wimax_dev is associated with. The
+ *     function expects SET_NETDEV_DEV() and register_netdev() were
+ *     already called on it.
+ *
+ * Registers the new WiMAX device, sets up the user-kernel control
+ * interface (generic netlink) and common WiMAX infrastructure.
+ *
+ * Note that the parts that will allow interaction with user space are
+ * setup at the very end, when the rest is in place, as once that
+ * happens, the driver might get user space control requests via
+ * netlink or from debugfs that might translate into calls into
+ * wimax_dev->op_*().
+ *
+ * Returns: 0 on success, or the negative error code reported by the
+ * rfkill or debugfs setup; on error everything set up so far is
+ * undone.
+ */
+int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
+{
+	int result;
+	struct device *dev = net_dev->dev.parent;
+	/* start out as an empty string in case the address length is 0 */
+	char addr_str[32] = "";
+
+	d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
+
+	/* Do the RFKILL setup before locking, as RFKILL will call
+	 * into our functions. */
+	wimax_dev->net_dev = net_dev;
+	result = wimax_rfkill_add(wimax_dev);
+	if (result < 0)
+		goto error_rfkill_add;
+
+	/* Set up user-space interaction */
+	mutex_lock(&wimax_dev->mutex);
+	wimax_id_table_add(wimax_dev);
+	result = wimax_debugfs_add(wimax_dev);
+	if (result < 0) {
+		dev_err(dev, "cannot initialize debugfs: %d\n",
+			result);
+		goto error_debugfs_add;
+	}
+
+	/* Leaving __WIMAX_ST_NULL lets wimax_state_change() act on us */
+	__wimax_state_set(wimax_dev, WIMAX_ST_DOWN);
+	mutex_unlock(&wimax_dev->mutex);
+
+	wimax_addr_scnprint(addr_str, sizeof(addr_str),
+			    net_dev->dev_addr, net_dev->addr_len);
+	/* A successful registration is informational, not an error */
+	dev_info(dev, "WiMAX interface %s (%s) ready\n",
+		 net_dev->name, addr_str);
+	d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev);
+	return 0;
+
+error_debugfs_add:
+	/* still holding wimax_dev->mutex here */
+	wimax_id_table_rm(wimax_dev);
+	mutex_unlock(&wimax_dev->mutex);
+	wimax_rfkill_rm(wimax_dev);
+error_rfkill_add:
+	d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n",
+		wimax_dev, net_dev, result);
+	return result;
+}
+EXPORT_SYMBOL_GPL(wimax_dev_add);
+
+
+/**
+ * wimax_dev_rm - Unregister an existing WiMAX device
+ *
+ * @wimax_dev: WiMAX device descriptor
+ *
+ * Unregisters a WiMAX device previously registered for use with
+ * wimax_dev_add().
+ *
+ * IMPORTANT! Must call before calling unregister_netdev().
+ *
+ * After this function returns, you will not get any more user space
+ * control requests (via netlink or debugfs) and thus to wimax_dev->ops.
+ *
+ * Reentrancy control is ensured by setting the state to
+ * %__WIMAX_ST_QUIESCING. rfkill operations coming through
+ * wimax_*rfkill*() will be stopped by the quiescing state; ops coming
+ * from the rfkill subsystem will be stopped by the support being
+ * removed by wimax_rfkill_rm().
+ */
+void wimax_dev_rm(struct wimax_dev *wimax_dev)
+{
+	d_fnstart(3, NULL, "(wimax_dev %p)\n", wimax_dev);
+
+	mutex_lock(&wimax_dev->mutex);
+	/* Quiesce first so wimax_dev_is_ready() turns new requests away */
+	__wimax_state_change(wimax_dev, __WIMAX_ST_QUIESCING);
+	/* Tear down the user space entry points while quiesced */
+	wimax_debugfs_rm(wimax_dev);
+	wimax_id_table_rm(wimax_dev);
+	__wimax_state_change(wimax_dev, WIMAX_ST_DOWN);
+	mutex_unlock(&wimax_dev->mutex);
+	/*
+	 * rfkill is removed with the mutex dropped, mirroring
+	 * wimax_dev_add(), which sets it up before locking because
+	 * rfkill calls back into this stack.
+	 */
+	wimax_rfkill_rm(wimax_dev);
+	d_fnend(3, NULL, "(wimax_dev %p) = void\n", wimax_dev);
+}
+EXPORT_SYMBOL_GPL(wimax_dev_rm);
+
+
+/*
+ * Debug framework control of debug levels
+ *
+ * One entry per debug submodule of the stack. The table and its size
+ * are handed to d_parse_params() by wimax_subsys_init().
+ */
+struct d_level D_LEVEL[] = {
+	D_SUBMODULE_DEFINE(debugfs),
+	D_SUBMODULE_DEFINE(id_table),
+	D_SUBMODULE_DEFINE(op_msg),
+	D_SUBMODULE_DEFINE(op_reset),
+	D_SUBMODULE_DEFINE(op_rfkill),
+	D_SUBMODULE_DEFINE(op_state_get),
+	D_SUBMODULE_DEFINE(stack),
+};
+/* Number of entries in D_LEVEL */
+size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
+
+
+/*
+ * Generic netlink family of the WiMAX stack; registered by
+ * wimax_subsys_init() and unregistered by wimax_subsys_exit().
+ */
+struct genl_family wimax_gnl_family = {
+	.id = GENL_ID_GENERATE,
+	.name = "WiMAX",
+	.version = WIMAX_GNL_VERSION,
+	.hdrsize = 0,
+	.maxattr = WIMAX_GNL_ATTR_MAX,
+};
+
+/* Multicast group "msg", registered on wimax_gnl_family at init time */
+struct genl_multicast_group wimax_gnl_mcg = {
+	.name = "msg",
+};
+
+
+
+/*
+ * Initialize the wimax stack
+ *
+ * Parses the debug parameters, then registers the generic netlink
+ * family, each operation listed in wimax_gnl_ops[] and the multicast
+ * group. Returns 0 on success or the negative error code of the
+ * registration that failed, after undoing the earlier ones.
+ */
+static
+int __init wimax_subsys_init(void)
+{
+	int result, cnt;
+
+	d_fnstart(4, NULL, "()\n");
+	d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
+		       "wimax.debug");
+
+	/* Same string the static initializer of the family already uses */
+	snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
+		 "WiMAX");
+	result = genl_register_family(&wimax_gnl_family);
+	if (unlikely(result < 0)) {
+		printk(KERN_ERR "cannot register generic netlink family: %d\n",
+		       result);
+		goto error_register_family;
+	}
+
+	for (cnt = 0; cnt < ARRAY_SIZE(wimax_gnl_ops); cnt++) {
+		result = genl_register_ops(&wimax_gnl_family,
+					   wimax_gnl_ops[cnt]);
+		d_printf(4, NULL, "registering generic netlink op code "
+			 "%u: %d\n", wimax_gnl_ops[cnt]->cmd, result);
+		if (unlikely(result < 0)) {
+			printk(KERN_ERR "cannot register generic netlink op "
+			       "code %u: %d\n",
+			       wimax_gnl_ops[cnt]->cmd, result);
+			goto error_register_ops;
+		}
+	}
+
+	result = genl_register_mc_group(&wimax_gnl_family, &wimax_gnl_mcg);
+	if (result < 0)
+		goto error_mc_group;
+	d_fnend(4, NULL, "() = 0\n");
+	return 0;
+
+error_mc_group:
+error_register_ops:
+	/*
+	 * cnt is the index of the op that failed to register, or
+	 * ARRAY_SIZE(wimax_gnl_ops) when coming from error_mc_group;
+	 * either way the ops below it are the ones to unregister.
+	 */
+	for (cnt--; cnt >= 0; cnt--)
+		genl_unregister_ops(&wimax_gnl_family,
+				    wimax_gnl_ops[cnt]);
+	genl_unregister_family(&wimax_gnl_family);
+error_register_family:
+	d_fnend(4, NULL, "() = %d\n", result);
+	return result;
+
+}
+module_init(wimax_subsys_init);
+
+
+/*
+ * Shutdown the wimax stack
+ *
+ * Releases the id table and then undoes the generic netlink
+ * registrations: multicast group, operations, family.
+ */
+static
+void __exit wimax_subsys_exit(void)
+{
+	int idx = ARRAY_SIZE(wimax_gnl_ops);
+
+	wimax_id_table_release();
+	genl_unregister_mc_group(&wimax_gnl_family, &wimax_gnl_mcg);
+	/* Walk the table from its last entry down to the first */
+	while (idx-- > 0)
+		genl_unregister_ops(&wimax_gnl_family, wimax_gnl_ops[idx]);
+	genl_unregister_family(&wimax_gnl_family);
+}
+module_exit(wimax_subsys_exit);
+
+MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
+MODULE_DESCRIPTION("Linux WiMAX stack");
+MODULE_LICENSE("GPL");
+
diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h
new file mode 100644
index 00000000..1e743d21
--- /dev/null
+++ b/net/wimax/wimax-internal.h
@@ -0,0 +1,91 @@
+/*
+ * Linux WiMAX
+ * Internal API for kernel space WiMAX stack
+ *
+ *
+ * Copyright (C) 2007 Intel Corporation <linux-wimax@intel.com>
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ *
+ * This header file is for declarations and definitions internal to
+ * the WiMAX stack. For public APIs and documentation, see
+ * include/net/wimax.h and include/linux/wimax.h.
+ */
+
+#ifndef __WIMAX_INTERNAL_H__
+#define __WIMAX_INTERNAL_H__
+#ifdef __KERNEL__
+
+#include <linux/device.h>
+#include <net/wimax.h>
+
+
+/*
+ * Tell whether a (locked) device can take operations right now
+ *
+ * The caller must hold wimax_dev->mutex before looking at the device
+ * structure; most operations then call this to verify the state.
+ *
+ * Returns 0 when the device is usable. Otherwise a negative errno is
+ * returned and the caller should leave the device alone and just
+ * unlock it:
+ *
+ *   -EINVAL     not registered (__WIMAX_ST_NULL)
+ *   -ENOMEDIUM  device is down (WIMAX_ST_DOWN)
+ *   -ESHUTDOWN  device is going away (__WIMAX_ST_QUIESCING)
+ */
+static inline __must_check
+int wimax_dev_is_ready(struct wimax_dev *wimax_dev)
+{
+	switch (wimax_dev->state) {
+	case __WIMAX_ST_NULL:
+		return -EINVAL;	/* Device is not even registered! */
+	case WIMAX_ST_DOWN:
+		return -ENOMEDIUM;
+	case __WIMAX_ST_QUIESCING:
+		return -ESHUTDOWN;
+	default:
+		return 0;
+	}
+}
+
+
+/*
+ * Record @state as the current state of @wimax_dev.
+ *
+ * This is a bare assignment: no transition checks are made and no
+ * state-change actions or reports happen here.
+ */
+static inline
+void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state)
+{
+	wimax_dev->state = state;
+}
+extern void __wimax_state_change(struct wimax_dev *, enum wimax_st);
+
+/*
+ * debugfs support: real implementations are used when CONFIG_DEBUG_FS
+ * is set; otherwise these stubs make adding always succeed (return 0)
+ * and removing a no-op, so callers need no #ifdefs.
+ */
+#ifdef CONFIG_DEBUG_FS
+extern int wimax_debugfs_add(struct wimax_dev *);
+extern void wimax_debugfs_rm(struct wimax_dev *);
+#else
+static inline int wimax_debugfs_add(struct wimax_dev *wimax_dev)
+{
+	return 0;
+}
+static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {}
+#endif
+
+extern void wimax_id_table_add(struct wimax_dev *);
+extern struct wimax_dev *wimax_dev_get_by_genl_info(struct genl_info *, int);
+extern void wimax_id_table_rm(struct wimax_dev *);
+extern void wimax_id_table_release(void);
+
+extern int wimax_rfkill_add(struct wimax_dev *);
+extern void wimax_rfkill_rm(struct wimax_dev *);
+
+extern struct genl_family wimax_gnl_family;
+extern struct genl_multicast_group wimax_gnl_mcg;
+
+#endif /* #ifdef __KERNEL__ */
+#endif /* #ifndef __WIMAX_INTERNAL_H__ */
diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore
new file mode 100644
index 00000000..c33451b8
--- /dev/null
+++ b/net/wireless/.gitignore
@@ -0,0 +1 @@
+regdb.c
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
new file mode 100644
index 00000000..39caddcd
--- /dev/null
+++ b/net/wireless/Kconfig
@@ -0,0 +1,184 @@
+config WIRELESS_EXT
+	bool "WIRELESS_EXT - Wireless extension interface "
+	depends on WIRELESS
+	help
+	 If your wifi module is built outside of this kernel source tree,
+	 please select this option yourself.
+
+config WEXT_CORE
+	def_bool y
+	depends on CFG80211_WEXT || WIRELESS_EXT
+
+config WEXT_PROC
+	def_bool y
+	depends on PROC_FS
+	depends on WEXT_CORE
+
+config WEXT_SPY
+	bool "WEXT_SPY - "
+	depends on WIRELESS_EXT
+	help
+	 If your wifi module is built outside of this kernel source tree,
+	 please select this option yourself.
+
+config WEXT_PRIV
+	bool "WEXT_PRIV - "
+	depends on WIRELESS_EXT
+	help
+	 If your wifi module is built outside of this kernel source tree,
+	 please select this option yourself.
+
+config CFG80211
+	tristate "cfg80211 - wireless configuration API"
+	depends on RFKILL || !RFKILL
+	---help---
+	  cfg80211 is the Linux wireless LAN (802.11) configuration API.
+	  Enable this if you have a wireless device.
+
+	  For more information refer to documentation on the wireless wiki:
+
+	  http://wireless.kernel.org/en/developers/Documentation/cfg80211
+
+	  When built as a module it will be called cfg80211.
+
+config NL80211_TESTMODE
+	bool "nl80211 testmode command"
+	depends on CFG80211
+	help
+	  The nl80211 testmode command helps implementing things like
+	  factory calibration or validation tools for wireless chips.
+
+	  Select this option ONLY for kernels that are specifically
+	  built for such purposes.
+
+	  Debugging tools that are supposed to end up in the hands of
+	  users should better be implemented with debugfs.
+
+	  Say N.
+
+config CFG80211_DEVELOPER_WARNINGS
+	bool "enable developer warnings"
+	depends on CFG80211
+	default n
+	help
+	  This option enables some additional warnings that help
+	  cfg80211 developers and driver developers, but that can
+	  trigger due to races with userspace.
+
+	  For example, when a driver reports that it was disconnected
+	  from the AP, but the user disconnects manually at the same
+	  time, the warning might trigger spuriously due to races.
+
+	  Say Y only if you are developing cfg80211 or a driver based
+	  on it (or mac80211).
+
+
+config CFG80211_REG_DEBUG
+	bool "cfg80211 regulatory debugging"
+	depends on CFG80211
+	default n
+	---help---
+	  You can enable this if you want to debug regulatory changes.
+	  For more information on cfg80211 regulatory refer to the wireless
+	  wiki:
+
+	  http://wireless.kernel.org/en/developers/Regulatory
+
+	  If unsure, say N.
+
+config CFG80211_DEFAULT_PS
+	bool "enable powersave by default"
+	depends on CFG80211
+	default y
+	help
+	  This option enables powersave mode by default.
+
+	  If this causes your applications to misbehave you should fix your
+	  applications instead -- they need to register their network
+	  latency requirement, see Documentation/power/pm_qos_interface.txt.
+
+config CFG80211_DEBUGFS
+	bool "cfg80211 DebugFS entries"
+	depends on CFG80211
+	depends on DEBUG_FS
+	---help---
+	  You can enable this if you want debugfs entries for cfg80211.
+
+	  If unsure, say N.
+
+config CFG80211_INTERNAL_REGDB
+	bool "use statically compiled regulatory rules database" if EXPERT
+	default n
+	depends on CFG80211
+	---help---
+	  This option generates an internal data structure representing
+	  the wireless regulatory rules described in net/wireless/db.txt
+	  and includes code to query that database.  This is an alternative
+	  to using CRDA for defining regulatory rules for the kernel.
+
+	  For details see:
+
+	  http://wireless.kernel.org/en/developers/Regulatory
+
+	  Most distributions have a CRDA package.  So if unsure, say N.
+
+config CFG80211_WEXT
+	bool "cfg80211 wireless extensions compatibility"
+	depends on CFG80211
+	select WEXT_CORE
+	default y
+	help
+	  Enable this option if you need old userspace for wireless
+	  extensions with cfg80211-based drivers.
+
+config WIRELESS_EXT_SYSFS
+	bool "Wireless extensions sysfs files"
+	default y
+	depends on WEXT_CORE && SYSFS
+	help
+	  This option enables the deprecated wireless statistics
+	  files in /sys/class/net/*/wireless/. The same information
+	  is available via the ioctls as well.
+
+	  Say Y if you have programs using it, like old versions of
+	  hal.
+
+config LIB80211
+	tristate "Common routines for IEEE802.11 drivers"
+	default n
+	help
+	  This option enables a library of common routines used
+	  by IEEE802.11 wireless LAN drivers.
+
+	  Drivers should select this themselves if needed.  Say Y if
+	  you want this built into your kernel.
+
+config LIB80211_CRYPT_WEP
+	tristate
+
+config LIB80211_CRYPT_CCMP
+	tristate
+
+config LIB80211_CRYPT_TKIP
+	tristate
+
+config LIB80211_DEBUG
+	bool "lib80211 debugging messages"
+	depends on LIB80211
+	default n
+	---help---
+	  You can enable this if you want verbose debugging messages
+	  from lib80211.
+
+	  If unsure, say N.
+
+config CFG80211_ALLOW_RECONNECT
+	bool "Allow reconnect while already connected"
+	depends on CFG80211
+	default n
+	help
+	  The cfg80211 stack does not allow a new connection while you are
+	  already connected. This option permits connecting in that case.
+
+	  Select this option ONLY for wlan drivers that are specifically
+	  built for such purposes.
diff --git a/net/wireless/Makefile b/net/wireless/Makefile
new file mode 100644
index 00000000..55a28ab2
--- /dev/null
+++ b/net/wireless/Makefile
@@ -0,0 +1,23 @@
+obj-$(CONFIG_CFG80211) += cfg80211.o
+obj-$(CONFIG_LIB80211) += lib80211.o
+obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o
+obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o
+obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o
+
+obj-$(CONFIG_WEXT_CORE) += wext-core.o
+obj-$(CONFIG_WEXT_PROC) += wext-proc.o
+obj-$(CONFIG_WEXT_SPY) += wext-spy.o
+obj-$(CONFIG_WEXT_PRIV) += wext-priv.o
+
+cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o
+cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o
+cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o
+cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o
+cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o
+
+ccflags-y += -D__CHECK_ENDIAN__
+
+$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk
+	@$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@
+
+clean-files := regdb.c
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
new file mode 100644
index 00000000..17cd0c04
--- /dev/null
+++ b/net/wireless/chan.c
@@ -0,0 +1,135 @@
+/*
+ * This file contains helper code to handle channel
+ * settings and keeping track of what is possible at
+ * any point in time.
+ *
+ * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <net/cfg80211.h>
+#include "core.h"
+
+/*
+ * Look up the channel at @freq on @rdev's wiphy and verify that it may
+ * be used with @channel_type.
+ *
+ * Returns the channel, or NULL when it does not exist, is disabled,
+ * forbids the requested HT40 direction, or when the band's HT
+ * capabilities do not cover the requested HT mode.
+ */
+struct ieee80211_channel *
+rdev_freq_to_chan(struct cfg80211_registered_device *rdev,
+		  int freq, enum nl80211_channel_type channel_type)
+{
+	struct ieee80211_channel *primary;
+	struct ieee80211_sta_ht_cap *ht;
+
+	primary = ieee80211_get_channel(&rdev->wiphy, freq);
+
+	/* Primary channel not allowed */
+	if (!primary)
+		return NULL;
+	if (primary->flags & IEEE80211_CHAN_DISABLED)
+		return NULL;
+
+	/* The channel itself may rule out one of the HT40 directions */
+	switch (channel_type) {
+	case NL80211_CHAN_HT40MINUS:
+		if (primary->flags & IEEE80211_CHAN_NO_HT40MINUS)
+			return NULL;
+		break;
+	case NL80211_CHAN_HT40PLUS:
+		if (primary->flags & IEEE80211_CHAN_NO_HT40PLUS)
+			return NULL;
+		break;
+	default:
+		break;
+	}
+
+	/* Without HT there is nothing left to check */
+	if (channel_type == NL80211_CHAN_NO_HT)
+		return primary;
+
+	ht = &rdev->wiphy.bands[primary->band]->ht_cap;
+	if (!ht->ht_supported)
+		return NULL;
+
+	if (channel_type == NL80211_CHAN_HT20)
+		return primary;
+
+	/* 40 MHz operation must be supported and not declared intolerant */
+	if (!(ht->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40))
+		return NULL;
+	if (ht->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT)
+		return NULL;
+
+	return primary;
+}
+
+/*
+ * Check the secondary channel of an HT40 setup: the channel 20 MHz
+ * above (HT40+) or below (HT40-) @chan must exist and must not be
+ * disabled, passive-scan only, no-IBSS or a radar channel.
+ *
+ * Returns false for any other @channel_type.
+ */
+static bool can_beacon_sec_chan(struct wiphy *wiphy,
+				struct ieee80211_channel *chan,
+				enum nl80211_channel_type channel_type)
+{
+	struct ieee80211_channel *secondary;
+	int offset;
+
+	if (channel_type == NL80211_CHAN_HT40PLUS)
+		offset = 20;
+	else if (channel_type == NL80211_CHAN_HT40MINUS)
+		offset = -20;
+	else
+		return false;
+
+	secondary = ieee80211_get_channel(wiphy, chan->center_freq + offset);
+	if (!secondary)
+		return false;
+
+	/* we'll need a DFS capability later */
+	return !(secondary->flags & (IEEE80211_CHAN_DISABLED |
+				     IEEE80211_CHAN_PASSIVE_SCAN |
+				     IEEE80211_CHAN_NO_IBSS |
+				     IEEE80211_CHAN_RADAR));
+}
+
+/*
+ * Set the operating channel through the driver's set_channel op.
+ *
+ * @wdev may be NULL, and a monitor interface is handled as if it were
+ * NULL. For a non-NULL @wdev the wdev lock must be held and its
+ * netdev must be running.
+ *
+ * Returns 0 on success (recording the channel in @wdev if there is
+ * one), -ENETDOWN, -EOPNOTSUPP, -EINVAL, or the driver's error.
+ */
+int cfg80211_set_freq(struct cfg80211_registered_device *rdev,
+		      struct wireless_dev *wdev, int freq,
+		      enum nl80211_channel_type channel_type)
+{
+	struct ieee80211_channel *chan;
+	bool ht40;
+	int result;
+
+	if (wdev && wdev->iftype == NL80211_IFTYPE_MONITOR)
+		wdev = NULL;
+
+	if (wdev) {
+		ASSERT_WDEV_LOCK(wdev);
+
+		if (!netif_running(wdev->netdev))
+			return -ENETDOWN;
+	}
+
+	if (!rdev->ops->set_channel)
+		return -EOPNOTSUPP;
+
+	chan = rdev_freq_to_chan(rdev, freq, channel_type);
+	if (!chan)
+		return -EINVAL;
+
+	ht40 = channel_type == NL80211_CHAN_HT40PLUS ||
+	       channel_type == NL80211_CHAN_HT40MINUS;
+
+	/* Both channels should be able to initiate communication */
+	if (wdev && ht40) {
+		switch (wdev->iftype) {
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_MESH_POINT:
+		case NL80211_IFTYPE_P2P_GO:
+			if (!can_beacon_sec_chan(&rdev->wiphy, chan,
+						 channel_type)) {
+				printk(KERN_DEBUG
+				       "cfg80211: Secondary channel not "
+				       "allowed to initiate communication\n");
+				return -EINVAL;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	result = rdev->ops->set_channel(&rdev->wiphy,
+					wdev ? wdev->netdev : NULL,
+					chan, channel_type);
+	if (result)
+		return result;
+
+	if (wdev)
+		wdev->channel = chan;
+
+	return 0;
+}
diff --git a/net/wireless/core.c b/net/wireless/core.c
new file mode 100644
index 00000000..880dbe2e
--- /dev/null
+++ b/net/wireless/core.c
@@ -0,0 +1,1056 @@
+/*
+ * This is the linux wireless configuration interface.
+ *
+ * Copyright 2006-2010		Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/nl80211.h>
+#include <linux/debugfs.h>
+#include <linux/notifier.h>
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+#include <net/genetlink.h>
+#include <net/cfg80211.h>
+#include "nl80211.h"
+#include "core.h"
+#include "sysfs.h"
+#include "debugfs.h"
+#include "wext-compat.h"
+#include "ethtool.h"
+
+/* name for sysfs, %d is appended */
+#define PHY_NAME "phy"
+
+MODULE_AUTHOR("Johannes Berg");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("wireless configuration support");
+
+/* RCU-protected (and cfg80211_mutex for writers) */
+LIST_HEAD(cfg80211_rdev_list);
+int cfg80211_rdev_list_generation;
+
+DEFINE_MUTEX(cfg80211_mutex);
+
+/* for debugfs */
+static struct dentry *ieee80211_debugfs_dir;
+
+/* for the cleanup, scan and event works */
+struct workqueue_struct *cfg80211_wq;
+
+static bool cfg80211_disable_40mhz_24ghz;
+module_param(cfg80211_disable_40mhz_24ghz, bool, 0644);
+MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz,
+		 "Disable 40MHz support in the 2.4GHz band");
+
+/*
+ * Find the registered device whose index is @wiphy_idx.
+ *
+ * Requires cfg80211_mutex to be held. Returns NULL for an invalid
+ * index or when no device in cfg80211_rdev_list matches.
+ */
+struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx)
+{
+	struct cfg80211_registered_device *rdev;
+
+	if (!wiphy_idx_valid(wiphy_idx))
+		return NULL;
+
+	assert_cfg80211_lock();
+
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list)
+		if (rdev->wiphy_idx == wiphy_idx)
+			return rdev;
+
+	return NULL;
+}
+
+/* Index of @wiphy's registered device, or WIPHY_IDX_STALE for NULL */
+int get_wiphy_idx(struct wiphy *wiphy)
+{
+	if (!wiphy)
+		return WIPHY_IDX_STALE;
+
+	return wiphy_to_dev(wiphy)->wiphy_idx;
+}
+
+/*
+ * Map a wiphy index to its wiphy.
+ *
+ * Requires cfg80211_mutex to be held. Returns NULL when the index is
+ * invalid or no registered device carries it.
+ */
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx)
+{
+	struct cfg80211_registered_device *rdev;
+
+	if (!wiphy_idx_valid(wiphy_idx))
+		return NULL;
+
+	assert_cfg80211_lock();
+
+	rdev = cfg80211_rdev_by_wiphy_idx(wiphy_idx);
+	return rdev ? &rdev->wiphy : NULL;
+}
+
+/*
+ * Resolve the registered device a generic netlink request refers to,
+ * using NL80211_ATTR_WIPHY and/or NL80211_ATTR_IFINDEX.
+ *
+ * Requires cfg80211_mutex to be held.
+ *
+ * Returns the device, or an ERR_PTR(): -EINVAL when neither attribute
+ * is present or when both are present but name different devices,
+ * -ENODEV when an attribute is present but no device was found.
+ */
+struct cfg80211_registered_device *
+__cfg80211_rdev_from_info(struct genl_info *info)
+{
+	struct cfg80211_registered_device *by_wiphy = NULL, *by_netdev = NULL;
+	struct nlattr *wiphy_attr, *ifidx_attr;
+	struct net_device *netdev;
+	int err;
+
+	assert_cfg80211_lock();
+
+	wiphy_attr = info->attrs[NL80211_ATTR_WIPHY];
+	ifidx_attr = info->attrs[NL80211_ATTR_IFINDEX];
+
+	/* With at least one selector a miss means "no such device" */
+	err = (wiphy_attr || ifidx_attr) ? -ENODEV : -EINVAL;
+
+	if (wiphy_attr)
+		by_wiphy = cfg80211_rdev_by_wiphy_idx(nla_get_u32(wiphy_attr));
+
+	if (ifidx_attr) {
+		netdev = dev_get_by_index(genl_info_net(info),
+					  nla_get_u32(ifidx_attr));
+		if (netdev) {
+			if (netdev->ieee80211_ptr)
+				by_netdev = wiphy_to_dev(
+						netdev->ieee80211_ptr->wiphy);
+			dev_put(netdev);
+		}
+	}
+
+	/* Two selectors that disagree make the request invalid */
+	if (by_wiphy && by_netdev && by_wiphy != by_netdev)
+		return ERR_PTR(-EINVAL);
+
+	if (by_wiphy)
+		return by_wiphy;
+
+	if (by_netdev)
+		return by_netdev;
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Like __cfg80211_rdev_from_info(), but takes cfg80211_mutex itself
+ * and, on success, returns the device with its ->mtx held.
+ */
+struct cfg80211_registered_device *
+cfg80211_get_dev_from_info(struct genl_info *info)
+{
+	struct cfg80211_registered_device *found;
+
+	mutex_lock(&cfg80211_mutex);
+	found = __cfg80211_rdev_from_info(info);
+	if (IS_ERR(found))
+		goto out_unlock;
+
+	/*
+	 * Grab the device's own lock before letting go of the global
+	 * one, to assure it won't be going away while we operate on it.
+	 */
+	mutex_lock(&found->mtx);
+
+out_unlock:
+	mutex_unlock(&cfg80211_mutex);
+
+	return found;
+}
+
+/*
+ * Find the registered device behind the netdev with @ifindex in @net.
+ *
+ * On success the device is returned with its ->mtx held; otherwise
+ * (no such netdev, or it has no ieee80211_ptr) ERR_PTR(-ENODEV).
+ */
+struct cfg80211_registered_device *
+cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
+{
+	struct cfg80211_registered_device *rdev = ERR_PTR(-ENODEV);
+	struct net_device *netdev;
+
+	mutex_lock(&cfg80211_mutex);
+
+	netdev = dev_get_by_index(net, ifindex);
+	if (netdev) {
+		if (netdev->ieee80211_ptr) {
+			rdev = wiphy_to_dev(netdev->ieee80211_ptr->wiphy);
+			mutex_lock(&rdev->mtx);
+		}
+		dev_put(netdev);
+	}
+
+	mutex_unlock(&cfg80211_mutex);
+	return rdev;
+}
+
+/*
+ * Rename the wiphy device of @rdev to @newname.
+ *
+ * Requires cfg80211_mutex to be held.
+ *
+ * Returns 0 on success or when @newname is already the current name,
+ * -EINVAL when @newname is the canonical "phy<N>" name of a different
+ * index or is already used by another registered device, or the
+ * error returned by device_rename().
+ */
+int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
+			char *newname)
+{
+	struct cfg80211_registered_device *rdev2;
+	int wiphy_idx, taken = -1, result, digits;
+
+	assert_cfg80211_lock();
+
+	/* prohibit calling the thing phy%d when %d is not its number */
+	/* taken stays -1 unless the whole "phy<number>" pattern matched */
+	sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken);
+	if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) {
+		/* count number of places needed to print wiphy_idx */
+		digits = 1;
+		while (wiphy_idx /= 10)
+			digits++;
+		/*
+		 * deny the name if it is phy<idx> where <idx> is printed
+		 * without leading zeroes. taken == strlen(newname) here
+		 */
+		if (taken == strlen(PHY_NAME) + digits)
+			return -EINVAL;
+	}
+
+
+	/* Ignore nop renames */
+	if (strcmp(newname, dev_name(&rdev->wiphy.dev)) == 0)
+		return 0;
+
+	/* Ensure another device does not already have this name. */
+	list_for_each_entry(rdev2, &cfg80211_rdev_list, list)
+		if (strcmp(newname, dev_name(&rdev2->wiphy.dev)) == 0)
+			return -EINVAL;
+
+	result = device_rename(&rdev->wiphy.dev, newname);
+	if (result)
+		return result;
+
+	/* a failed debugfs rename is only logged; the rename still succeeds */
+	if (rdev->wiphy.debugfsdir &&
+	    !debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
+			    rdev->wiphy.debugfsdir,
+			    rdev->wiphy.debugfsdir->d_parent,
+			    newname))
+		pr_err("failed to rename debugfs dir to %s!\n", newname);
+
+	nl80211_notify_dev_rename(rdev);
+
+	return 0;
+}
+
+/*
+ * Move every netdev of @rdev, and then the wiphy itself, to @net.
+ *
+ * If moving one netdev fails, the netdevs that were already moved are
+ * moved back to the wiphy's current namespace and the error of the
+ * failed move is returned.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the wiphy does not have
+ * WIPHY_FLAG_NETNS_OK set, or the error from
+ * dev_change_net_namespace().
+ */
+int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
+			  struct net *net)
+{
+	struct wireless_dev *wdev;
+	int err = 0;
+
+	if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(wdev, &rdev->netdev_list, list) {
+		/* the flag is only lifted for the duration of the move */
+		wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
+		/* put it back even on failure, the netdev stays where it was */
+		wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
+		if (err)
+			break;
+	}
+
+	if (err) {
+		int rollback_err;
+
+		/* failed -- clean up to old netns */
+		net = wiphy_net(&rdev->wiphy);
+
+		list_for_each_entry_continue_reverse(wdev, &rdev->netdev_list,
+						     list) {
+			wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+			/*
+			 * Keep the rollback result separate so that a
+			 * successful rollback cannot turn the original
+			 * failure into a 0 return value.
+			 */
+			rollback_err = dev_change_net_namespace(wdev->netdev,
+								net, "wlan%d");
+			WARN_ON(rollback_err);
+			wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
+		}
+
+		return err;
+	}
+
+	wiphy_net_set(&rdev->wiphy, net);
+
+	err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
+	WARN_ON(err);
+
+	return 0;
+}
+
+/* rfkill poll callback: forward the poll to the driver's rfkill_poll op */
+static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
+{
+	struct cfg80211_registered_device *registered = data;
+	struct wiphy *wiphy = &registered->wiphy;
+
+	registered->ops->rfkill_poll(wiphy);
+}
+
+/*
+ * rfkill set_block callback: when the radio gets blocked, close every
+ * netdev of the device. Unblocking does nothing. Always returns 0.
+ */
+static int cfg80211_rfkill_set_block(void *data, bool blocked)
+{
+	struct cfg80211_registered_device *rdev = data;
+	struct wireless_dev *wdev;
+
+	if (blocked) {
+		rtnl_lock();
+		mutex_lock(&rdev->devlist_mtx);
+
+		list_for_each_entry(wdev, &rdev->netdev_list, list)
+			dev_close(wdev->netdev);
+
+		mutex_unlock(&rdev->devlist_mtx);
+		rtnl_unlock();
+	}
+
+	return 0;
+}
+
+/* Work item: apply the current rfkill blocked state to the device */
+static void cfg80211_rfkill_sync_work(struct work_struct *work)
+{
+	struct cfg80211_registered_device *rdev =
+		container_of(work, struct cfg80211_registered_device,
+			     rfkill_sync);
+	bool blocked = rfkill_blocked(rdev->rfkill);
+
+	cfg80211_rfkill_set_block(rdev, blocked);
+}
+
+/* Work item: process the device's pending events under RTNL + rdev lock */
+static void cfg80211_event_work(struct work_struct *work)
+{
+	struct cfg80211_registered_device *rdev =
+		container_of(work, struct cfg80211_registered_device,
+			     event_work);
+
+	rtnl_lock();
+	cfg80211_lock_rdev(rdev);
+
+	cfg80211_process_rdev_events(rdev);
+
+	cfg80211_unlock_rdev(rdev);
+	rtnl_unlock();
+}
+
+/* exported functions */
+
+/*
+ * Allocate and initialize a new wiphy.
+ *
+ * @ops: configuration operations of the driver
+ * @sizeof_priv: extra bytes to allocate after the registered device
+ *
+ * Warns about inconsistent sets of @ops (e.g. add_key without
+ * del_key), assigns the next wiphy index, names the device
+ * "phy<index>" and sets up locks, lists, work items, rfkill and the
+ * default retry/threshold parameters.
+ *
+ * Returns the new wiphy, or NULL if allocation fails, the index
+ * counter wrapped, or the rfkill structure cannot be allocated.
+ */
+struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv)
+{
+	static int wiphy_counter;
+
+	struct cfg80211_registered_device *rdev;
+	int alloc_size;
+
+	WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key));
+	WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc));
+	WARN_ON(ops->connect && !ops->disconnect);
+	WARN_ON(ops->join_ibss && !ops->leave_ibss);
+	WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf);
+	WARN_ON(ops->add_station && !ops->del_station);
+	WARN_ON(ops->add_mpath && !ops->del_mpath);
+	WARN_ON(ops->join_mesh && !ops->leave_mesh);
+
+	alloc_size = sizeof(*rdev) + sizeof_priv;
+
+	rdev = kzalloc(alloc_size, GFP_KERNEL);
+	if (!rdev)
+		return NULL;
+
+	rdev->ops = ops;
+
+	mutex_lock(&cfg80211_mutex);
+
+	rdev->wiphy_idx = wiphy_counter++;
+
+	if (unlikely(!wiphy_idx_valid(rdev->wiphy_idx))) {
+		wiphy_counter--;
+		mutex_unlock(&cfg80211_mutex);
+		/* ugh, wrapped! */
+		kfree(rdev);
+		return NULL;
+	}
+
+	mutex_unlock(&cfg80211_mutex);
+
+	/* give it a proper name */
+	dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx);
+
+	mutex_init(&rdev->mtx);
+	mutex_init(&rdev->devlist_mtx);
+	mutex_init(&rdev->sched_scan_mtx);
+	INIT_LIST_HEAD(&rdev->netdev_list);
+	spin_lock_init(&rdev->bss_lock);
+	INIT_LIST_HEAD(&rdev->bss_list);
+	INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done);
+	INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results);
+#ifdef CONFIG_CFG80211_WEXT
+	rdev->wiphy.wext = &cfg80211_wext_handler;
+#endif
+
+	device_initialize(&rdev->wiphy.dev);
+	rdev->wiphy.dev.class = &ieee80211_class;
+	rdev->wiphy.dev.platform_data = rdev;
+
+#ifdef CONFIG_CFG80211_DEFAULT_PS
+	rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
+#endif
+
+	wiphy_net_set(&rdev->wiphy, &init_net);
+
+	rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block;
+	rdev->rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev),
+				   &rdev->wiphy.dev, RFKILL_TYPE_WLAN,
+				   &rdev->rfkill_ops, rdev);
+
+	if (!rdev->rfkill) {
+		/*
+		 * The device was named and initialized above, so a plain
+		 * kfree() would leak the name set by dev_set_name(). Drop
+		 * the reference instead and let the release path free it.
+		 * NOTE(review): relies on ieee80211_class's release
+		 * callback freeing rdev -- confirm in sysfs.c.
+		 */
+		put_device(&rdev->wiphy.dev);
+		return NULL;
+	}
+
+	INIT_WORK(&rdev->rfkill_sync, cfg80211_rfkill_sync_work);
+	INIT_WORK(&rdev->conn_work, cfg80211_conn_work);
+	INIT_WORK(&rdev->event_work, cfg80211_event_work);
+
+	init_waitqueue_head(&rdev->dev_wait);
+
+	/*
+	 * Initialize wiphy parameters to IEEE 802.11 MIB default values.
+	 * Fragmentation and RTS threshold are disabled by default with the
+	 * special -1 value.
+	 */
+	rdev->wiphy.retry_short = 7;
+	rdev->wiphy.retry_long = 4;
+	rdev->wiphy.frag_threshold = (u32) -1;
+	rdev->wiphy.rts_threshold = (u32) -1;
+	rdev->wiphy.coverage_class = 0;
+
+	return &rdev->wiphy;
+}
+EXPORT_SYMBOL(wiphy_new);
+
+/*
+ * Validate the interface combinations advertised in
+ * wiphy->iface_combinations.
+ *
+ * Sets WIPHY_FLAG_ENFORCE_COMBINATIONS when at least one combination is
+ * listed. Returns 0 if every combination is consistent, or -EINVAL
+ * (after a WARN_ON) at the first inconsistency found.
+ */
+static int wiphy_verify_combinations(struct wiphy *wiphy)
+{
+	int ci;
+
+	/* Listing combinations at all means they have to be enforced. */
+	if (wiphy->n_iface_combinations)
+		wiphy->flags |= WIPHY_FLAG_ENFORCE_COMBINATIONS;
+
+	for (ci = 0; ci < wiphy->n_iface_combinations; ci++) {
+		const struct ieee80211_iface_combination *comb =
+			&wiphy->iface_combinations[ci];
+		u16 seen_types = 0;
+		u32 total = 0;
+		int li;
+
+		/* A combination of fewer than two interfaces isn't one. */
+		if (WARN_ON(comb->max_interfaces < 2))
+			return -EINVAL;
+
+		/* At least one channel is required. */
+		if (WARN_ON(!comb->num_different_channels))
+			return -EINVAL;
+
+		if (WARN_ON(!comb->n_limits))
+			return -EINVAL;
+
+		for (li = 0; li < comb->n_limits; li++) {
+			u16 types = comb->limits[li].types;
+
+			/*
+			 * No interface type may appear in two limits of the
+			 * same combination; cfg80211_can_change_interface()
+			 * relies on this.
+			 */
+			if (WARN_ON(types & seen_types))
+				return -EINVAL;
+			seen_types |= types;
+
+			if (WARN_ON(!comb->limits[li].max))
+				return -EINVAL;
+
+			/* Software iftypes must not be listed here. */
+			if (WARN_ON(wiphy->software_iftypes & types))
+				return -EINVAL;
+
+			/* Every listed type must also be a supported mode. */
+			if (WARN_ON((wiphy->interface_modes & types) != types))
+				return -EINVAL;
+
+			total += comb->limits[li].max;
+		}
+
+		/* max_interfaces must be reachable with the given limits. */
+		if (WARN_ON(total < comb->max_interfaces))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Register a wiphy with cfg80211.
+ *
+ * Sanity-checks the addresses, interface modes, interface combinations,
+ * bands and WoWLAN pattern limits supplied by the driver, then adds the
+ * device, links it into cfg80211_rdev_list, creates its debugfs
+ * directory and finally registers its rfkill instance.
+ *
+ * Returns 0 on success or a negative errno. On failure nothing stays
+ * registered: if rfkill_register() fails, the list entry, debugfs
+ * directory, regulatory state and device added earlier are removed
+ * again.
+ */
+int wiphy_register(struct wiphy *wiphy)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	int res;
+	enum ieee80211_band band;
+	struct ieee80211_supported_band *sband;
+	bool have_band = false;
+	int i;
+	u16 ifmodes = wiphy->interface_modes;
+
+	if (WARN_ON(wiphy->addresses && !wiphy->n_addresses))
+		return -EINVAL;
+
+	/* a non-zero perm_addr must match the first listed address */
+	if (WARN_ON(wiphy->addresses &&
+		    !is_zero_ether_addr(wiphy->perm_addr) &&
+		    memcmp(wiphy->perm_addr, wiphy->addresses[0].addr,
+			   ETH_ALEN)))
+		return -EINVAL;
+
+	if (wiphy->addresses)
+		memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
+
+	/* sanity check ifmodes: drop bit 0 and anything out of range */
+	WARN_ON(!ifmodes);
+	ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1;
+	if (WARN_ON(ifmodes != wiphy->interface_modes))
+		wiphy->interface_modes = ifmodes;
+
+	res = wiphy_verify_combinations(wiphy);
+	if (res)
+		return res;
+
+	/* sanity check supported bands/channels */
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		sband = wiphy->bands[band];
+		if (!sband)
+			continue;
+
+		sband->band = band;
+
+		if (WARN_ON(!sband->n_channels || !sband->n_bitrates))
+			return -EINVAL;
+
+		/*
+		 * Since cfg80211_disable_40mhz_24ghz is global, we can
+		 * modify the sband's ht data even if the driver uses a
+		 * global structure for that.
+		 */
+		if (cfg80211_disable_40mhz_24ghz &&
+		    band == IEEE80211_BAND_2GHZ &&
+		    sband->ht_cap.ht_supported) {
+			sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+			sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40;
+		}
+
+		/*
+		 * Since we use a u32 for rate bitmaps in
+		 * ieee80211_get_response_rate, we cannot
+		 * have more than 32 legacy rates.
+		 */
+		if (WARN_ON(sband->n_bitrates > 32))
+			return -EINVAL;
+
+		/* remember the driver-supplied per-channel values */
+		for (i = 0; i < sband->n_channels; i++) {
+			sband->channels[i].orig_flags =
+				sband->channels[i].flags;
+			sband->channels[i].orig_mag =
+				sband->channels[i].max_antenna_gain;
+			sband->channels[i].orig_mpwr =
+				sband->channels[i].max_power;
+			sband->channels[i].band = band;
+		}
+
+		have_band = true;
+	}
+
+	if (!have_band) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if (rdev->wiphy.wowlan.n_patterns) {
+		if (WARN_ON(!rdev->wiphy.wowlan.pattern_min_len ||
+			    rdev->wiphy.wowlan.pattern_min_len >
+			    rdev->wiphy.wowlan.pattern_max_len))
+			return -EINVAL;
+	}
+
+	/* check and set up bitrates */
+	ieee80211_set_bitrate_flags(wiphy);
+
+	mutex_lock(&cfg80211_mutex);
+
+	res = device_add(&rdev->wiphy.dev);
+	if (res) {
+		mutex_unlock(&cfg80211_mutex);
+		return res;
+	}
+
+	/* set up regulatory info */
+	wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
+
+	list_add_rcu(&rdev->list, &cfg80211_rdev_list);
+	cfg80211_rdev_list_generation++;
+
+	/* add to debugfs */
+	rdev->wiphy.debugfsdir =
+		debugfs_create_dir(wiphy_name(&rdev->wiphy),
+				   ieee80211_debugfs_dir);
+	if (IS_ERR(rdev->wiphy.debugfsdir))
+		rdev->wiphy.debugfsdir = NULL;
+
+	if (wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) {
+		struct regulatory_request request;
+
+		request.wiphy_idx = get_wiphy_idx(wiphy);
+		request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
+		request.alpha2[0] = '9';
+		request.alpha2[1] = '9';
+
+		nl80211_send_reg_change_event(&request);
+	}
+
+	cfg80211_debugfs_rdev_add(rdev);
+	mutex_unlock(&cfg80211_mutex);
+
+	/*
+	 * due to a locking dependency this has to be outside of the
+	 * cfg80211_mutex lock
+	 */
+	res = rfkill_register(rdev->rfkill);
+	if (res)
+		goto out_rm_dev;
+
+	return 0;
+
+out_rm_dev:
+	/*
+	 * The device was already made visible above, so undo all of it in
+	 * the same order wiphy_unregister() uses; otherwise the rdev would
+	 * stay on cfg80211_rdev_list (and in debugfs) after we report
+	 * failure to the caller.
+	 */
+	mutex_lock(&cfg80211_mutex);
+	debugfs_remove_recursive(rdev->wiphy.debugfsdir);
+	list_del_rcu(&rdev->list);
+	synchronize_rcu();
+	reg_device_remove(wiphy);
+	cfg80211_rdev_list_generation++;
+	device_del(&rdev->wiphy.dev);
+	mutex_unlock(&cfg80211_mutex);
+	return res;
+}
+EXPORT_SYMBOL(wiphy_register);
+
+/*
+ * Install the cfg80211 rfkill poll callback and resume rfkill polling.
+ * Does nothing when the driver's ops provide no rfkill_poll handler.
+ */
+void wiphy_rfkill_start_polling(struct wiphy *wiphy)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	if (rdev->ops->rfkill_poll) {
+		rdev->rfkill_ops.poll = cfg80211_rfkill_poll;
+		rfkill_resume_polling(rdev->rfkill);
+	}
+}
+EXPORT_SYMBOL(wiphy_rfkill_start_polling);
+
+/* Pause rfkill polling for the rfkill instance belonging to this wiphy. */
+void wiphy_rfkill_stop_polling(struct wiphy *wiphy)
+{
+	rfkill_pause_polling(wiphy_to_dev(wiphy)->rfkill);
+}
+EXPORT_SYMBOL(wiphy_rfkill_stop_polling);
+
+/*
+ * Unregister a wiphy: unregister its rfkill instance, wait until no
+ * interface is open any more, remove it from debugfs and from
+ * cfg80211_rdev_list, release its regulatory state, delete the device
+ * and finally flush/cancel the work items that may still reference it.
+ */
+void wiphy_unregister(struct wiphy *wiphy)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	rfkill_unregister(rdev->rfkill);
+
+	/* protect the device list */
+	mutex_lock(&cfg80211_mutex);
+
+	/*
+	 * Sleep until opencount drops to zero; the count is sampled under
+	 * devlist_mtx each time the condition is re-evaluated.
+	 */
+	wait_event(rdev->dev_wait, ({
+		int __count;
+		mutex_lock(&rdev->devlist_mtx);
+		__count = rdev->opencount;
+		mutex_unlock(&rdev->devlist_mtx);
+		__count == 0;}));
+
+	/* all interfaces must have been removed by now */
+	mutex_lock(&rdev->devlist_mtx);
+	BUG_ON(!list_empty(&rdev->netdev_list));
+	mutex_unlock(&rdev->devlist_mtx);
+
+	/*
+	 * First remove the hardware from everywhere, this makes
+	 * it impossible to find from userspace.
+	 */
+	debugfs_remove_recursive(rdev->wiphy.debugfsdir);
+	list_del_rcu(&rdev->list);
+	synchronize_rcu();
+
+	/*
+	 * Try to grab rdev->mtx. If a command is still in progress,
+	 * hopefully the driver will refuse it since it's tearing
+	 * down the device already. We wait for this command to complete
+	 * here; the rdev has already been unlinked from the list above.
+	 * Note: as codified by the BUG_ON above we cannot get here if
+	 * a virtual interface is still present. Hence, we can only get
+	 * to lock contention here if userspace issues a command that
+	 * identified the hardware by wiphy index.
+	 */
+	cfg80211_lock_rdev(rdev);
+	/* nothing */
+	cfg80211_unlock_rdev(rdev);
+
+	/* If this device got a regulatory hint tell core its
+	 * free to listen now to a new shiny device regulatory hint */
+	reg_device_remove(wiphy);
+
+	cfg80211_rdev_list_generation++;
+	device_del(&rdev->wiphy.dev);
+
+	mutex_unlock(&cfg80211_mutex);
+
+	/* done outside cfg80211_mutex: wait for / cancel outstanding work */
+	flush_work(&rdev->scan_done_wk);
+	cancel_work_sync(&rdev->conn_work);
+	flush_work(&rdev->event_work);
+}
+EXPORT_SYMBOL(wiphy_unregister);
+
+/*
+ * Release everything owned by a registered device: its rfkill instance,
+ * its mutexes, every BSS entry still on bss_list, its WoWLAN
+ * configuration and finally the rdev allocation itself.
+ */
+void cfg80211_dev_free(struct cfg80211_registered_device *rdev)
+{
+	struct cfg80211_internal_bss *bss;
+	struct cfg80211_internal_bss *next;
+
+	rfkill_destroy(rdev->rfkill);
+
+	mutex_destroy(&rdev->mtx);
+	mutex_destroy(&rdev->devlist_mtx);
+	mutex_destroy(&rdev->sched_scan_mtx);
+
+	/* _safe iteration: the current entry is handed to cfg80211_put_bss() */
+	list_for_each_entry_safe(bss, next, &rdev->bss_list, list) {
+		cfg80211_put_bss(&bss->pub);
+	}
+
+	cfg80211_rdev_free_wowlan(rdev);
+	kfree(rdev);
+}
+
+/*
+ * Drop a reference on the wiphy's embedded struct device; the memory is
+ * not freed directly here.
+ */
+void wiphy_free(struct wiphy *wiphy)
+{
+	put_device(&wiphy->dev);
+}
+EXPORT_SYMBOL(wiphy_free);
+
+/*
+ * Report the hardware rfkill state to the rfkill core and, when
+ * rfkill_set_hw_state() returns true, schedule the rfkill_sync work.
+ */
+void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	bool need_sync = rfkill_set_hw_state(rdev->rfkill, blocked);
+
+	if (!need_sync)
+		return;
+
+	schedule_work(&rdev->rfkill_sync);
+}
+EXPORT_SYMBOL(wiphy_rfkill_set_hw_state);
+
+/*
+ * Work item queued from the NETDEV_DOWN notifier for a wireless
+ * interface. Aborts a scan or scheduled scan still attributed to the
+ * interface (warning if there is one), decrements the rdev's opencount,
+ * wakes anyone waiting on dev_wait and drops the netdev reference that
+ * was taken when the work was queued.
+ */
+static void wdev_cleanup_work(struct work_struct *work)
+{
+	struct wireless_dev *wdev;
+	struct cfg80211_registered_device *rdev;
+
+	wdev = container_of(work, struct wireless_dev, cleanup_work);
+	rdev = wiphy_to_dev(wdev->wiphy);
+
+	cfg80211_lock_rdev(rdev);
+
+	/* a scan should not still be pending for an interface that is down */
+	if (WARN_ON(rdev->scan_req && rdev->scan_req->dev == wdev->netdev)) {
+		rdev->scan_req->aborted = true;
+		___cfg80211_scan_done(rdev, true);
+	}
+
+	cfg80211_unlock_rdev(rdev);
+
+	mutex_lock(&rdev->sched_scan_mtx);
+
+	/* same for a scheduled scan, under its own mutex */
+	if (WARN_ON(rdev->sched_scan_req &&
+		    rdev->sched_scan_req->dev == wdev->netdev)) {
+		__cfg80211_stop_sched_scan(rdev, false);
+	}
+
+	mutex_unlock(&rdev->sched_scan_mtx);
+
+	/* opencount is protected by devlist_mtx; wiphy_unregister() waits on it */
+	mutex_lock(&rdev->devlist_mtx);
+	rdev->opencount--;
+	mutex_unlock(&rdev->devlist_mtx);
+	wake_up(&rdev->dev_wait);
+
+	/* pairs with the dev_hold() done when this work was queued */
+	dev_put(wdev->netdev);
+}
+
+/* Device type set on wireless netdevs at NETDEV_POST_INIT; named "wlan". */
+static struct device_type wiphy_type = {
+	.name	= "wlan",
+};
+
+/*
+ * Netdevice notifier callback for cfg80211.
+ *
+ * Ignores netdevs without an ieee80211_ptr. For wireless netdevs it
+ * reacts to the netdev state changes handled in the switch below:
+ * initialising and linking the wireless_dev on registration, leaving
+ * networks when the interface goes down, deferring cleanup on DOWN,
+ * (re)joining on UP, unlinking on unregistration and vetoing PRE_UP
+ * when the interface type is unsupported, rfkill is blocked or the
+ * interface combination is not allowed.
+ *
+ * Returns NOTIFY_DONE, or notifier_from_errno() of the reason when a
+ * NETDEV_PRE_UP is refused.
+ */
+static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
+					 unsigned long state,
+					 void *ndev)
+{
+	struct net_device *dev = ndev;
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev;
+	int ret;
+
+	/* not a wireless netdev */
+	if (!wdev)
+		return NOTIFY_DONE;
+
+	rdev = wiphy_to_dev(wdev->wiphy);
+
+	WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED);
+
+	switch (state) {
+	case NETDEV_POST_INIT:
+		SET_NETDEV_DEVTYPE(dev, &wiphy_type);
+		break;
+	case NETDEV_REGISTER:
+		/*
+		 * NB: cannot take rdev->mtx here because this may be
+		 * called within code protected by it when interfaces
+		 * are added with nl80211.
+		 */
+		mutex_init(&wdev->mtx);
+		INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work);
+		INIT_LIST_HEAD(&wdev->event_list);
+		spin_lock_init(&wdev->event_lock);
+		INIT_LIST_HEAD(&wdev->mgmt_registrations);
+		spin_lock_init(&wdev->mgmt_registrations_lock);
+
+		mutex_lock(&rdev->devlist_mtx);
+		list_add_rcu(&wdev->list, &rdev->netdev_list);
+		rdev->devlist_generation++;
+		/* can only change netns with wiphy */
+		dev->features |= NETIF_F_NETNS_LOCAL;
+
+		/* a missing symlink is only logged, registration continues */
+		if (sysfs_create_link(&dev->dev.kobj, &rdev->wiphy.dev.kobj,
+				      "phy80211")) {
+			pr_err("failed to add phy80211 symlink to netdev!\n");
+		}
+		wdev->netdev = dev;
+		wdev->sme_state = CFG80211_SME_IDLE;
+		mutex_unlock(&rdev->devlist_mtx);
+#ifdef CONFIG_CFG80211_WEXT
+		wdev->wext.default_key = -1;
+		wdev->wext.default_mgmt_key = -1;
+		wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+#endif
+
+		if (wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT)
+			wdev->ps = true;
+		else
+			wdev->ps = false;
+		/* allow mac80211 to determine the timeout */
+		wdev->ps_timeout = -1;
+
+		/* only install the cfg80211 ethtool ops if the driver set none */
+		if (!dev->ethtool_ops)
+			dev->ethtool_ops = &cfg80211_ethtool_ops;
+
+		if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+		     wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
+		     wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
+			dev->priv_flags |= IFF_DONT_BRIDGE;
+		break;
+	case NETDEV_GOING_DOWN:
+		/* leave whatever network the interface is part of */
+		switch (wdev->iftype) {
+		case NL80211_IFTYPE_ADHOC:
+			cfg80211_leave_ibss(rdev, dev, true);
+			break;
+		case NL80211_IFTYPE_P2P_CLIENT:
+		case NL80211_IFTYPE_STATION:
+			mutex_lock(&rdev->sched_scan_mtx);
+			__cfg80211_stop_sched_scan(rdev, false);
+			mutex_unlock(&rdev->sched_scan_mtx);
+
+			wdev_lock(wdev);
+#ifdef CONFIG_CFG80211_WEXT
+			kfree(wdev->wext.ie);
+			wdev->wext.ie = NULL;
+			wdev->wext.ie_len = 0;
+			wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+#endif
+			__cfg80211_disconnect(rdev, dev,
+					      WLAN_REASON_DEAUTH_LEAVING, true);
+			cfg80211_mlme_down(rdev, dev);
+			wdev_unlock(wdev);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			cfg80211_leave_mesh(rdev, dev);
+			break;
+		default:
+			break;
+		}
+		wdev->beacon_interval = 0;
+		break;
+	case NETDEV_DOWN:
+		/* reference is dropped again by wdev_cleanup_work() */
+		dev_hold(dev);
+		queue_work(cfg80211_wq, &wdev->cleanup_work);
+		break;
+	case NETDEV_UP:
+		/*
+		 * If we have a really quick DOWN/UP succession we may
+		 * have this work still pending ... cancel it and see
+		 * if it was pending, in which case we need to account
+		 * for some of the work it would have done.
+		 */
+		if (cancel_work_sync(&wdev->cleanup_work)) {
+			mutex_lock(&rdev->devlist_mtx);
+			rdev->opencount--;
+			mutex_unlock(&rdev->devlist_mtx);
+			dev_put(dev);
+		}
+		/* lock order here: rdev->mtx, then devlist_mtx, then wdev->mtx */
+		cfg80211_lock_rdev(rdev);
+		mutex_lock(&rdev->devlist_mtx);
+		wdev_lock(wdev);
+		switch (wdev->iftype) {
+#ifdef CONFIG_CFG80211_WEXT
+		case NL80211_IFTYPE_ADHOC:
+			cfg80211_ibss_wext_join(rdev, wdev);
+			break;
+		case NL80211_IFTYPE_STATION:
+			cfg80211_mgd_wext_connect(rdev, wdev);
+			break;
+#endif
+#ifdef CONFIG_MAC80211_MESH
+		case NL80211_IFTYPE_MESH_POINT:
+			{
+				/* backward compat code... */
+				struct mesh_setup setup;
+				memcpy(&setup, &default_mesh_setup,
+						sizeof(setup));
+				 /* back compat only needed for mesh_id */
+				setup.mesh_id = wdev->ssid;
+				setup.mesh_id_len = wdev->mesh_id_up_len;
+				if (wdev->mesh_id_up_len)
+					__cfg80211_join_mesh(rdev, dev,
+							&setup,
+							&default_mesh_config);
+				break;
+			}
+#endif
+		default:
+			break;
+		}
+		wdev_unlock(wdev);
+		rdev->opencount++;
+		mutex_unlock(&rdev->devlist_mtx);
+		cfg80211_unlock_rdev(rdev);
+
+		/*
+		 * Configure power management to the driver here so that its
+		 * correctly set also after interface type changes etc.
+		 */
+		if (wdev->iftype == NL80211_IFTYPE_STATION &&
+		    rdev->ops->set_power_mgmt)
+			if (rdev->ops->set_power_mgmt(wdev->wiphy, dev,
+						      wdev->ps,
+						      wdev->ps_timeout)) {
+				/* assume this means it's off */
+				wdev->ps = false;
+			}
+		break;
+	case NETDEV_UNREGISTER:
+		/*
+		 * NB: cannot take rdev->mtx here because this may be
+		 * called within code protected by it when interfaces
+		 * are removed with nl80211.
+		 */
+		mutex_lock(&rdev->devlist_mtx);
+		/*
+		 * It is possible to get NETDEV_UNREGISTER
+		 * multiple times. To detect that, check
+		 * that the interface is still on the list
+		 * of registered interfaces, and only then
+		 * remove and clean it up.
+		 */
+		if (!list_empty(&wdev->list)) {
+			sysfs_remove_link(&dev->dev.kobj, "phy80211");
+			list_del_rcu(&wdev->list);
+			rdev->devlist_generation++;
+			cfg80211_mlme_purge_registrations(wdev);
+#ifdef CONFIG_CFG80211_WEXT
+			kfree(wdev->wext.keys);
+#endif
+		}
+		mutex_unlock(&rdev->devlist_mtx);
+		/*
+		 * synchronise (so that we won't find this netdev
+		 * from other code any more) and then clear the list
+		 * head so that the above code can safely check for
+		 * !list_empty() to avoid double-cleanup.
+		 */
+		synchronize_rcu();
+		INIT_LIST_HEAD(&wdev->list);
+		break;
+	case NETDEV_PRE_UP:
+		/* each failed check below vetoes bringing the interface up */
+		if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype)))
+			return notifier_from_errno(-EOPNOTSUPP);
+		if (rfkill_blocked(rdev->rfkill))
+			return notifier_from_errno(-ERFKILL);
+		ret = cfg80211_can_add_interface(rdev, wdev->iftype);
+		if (ret)
+			return notifier_from_errno(ret);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block registered in cfg80211_init() to receive netdev events. */
+static struct notifier_block cfg80211_netdev_notifier = {
+	.notifier_call = cfg80211_netdev_notifier_call,
+};
+
+/*
+ * Per-namespace exit hook: every registered device whose wiphy lives in
+ * the namespace being torn down is switched back to init_net. A failed
+ * switch only triggers a warning.
+ */
+static void __net_exit cfg80211_pernet_exit(struct net *net)
+{
+	struct cfg80211_registered_device *rdev;
+
+	rtnl_lock();
+	mutex_lock(&cfg80211_mutex);
+
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		if (!net_eq(wiphy_net(&rdev->wiphy), net))
+			continue;
+
+		WARN_ON(cfg80211_switch_netns(rdev, &init_net));
+	}
+
+	mutex_unlock(&cfg80211_mutex);
+	rtnl_unlock();
+}
+
+/* Per-network-namespace operations; only an exit hook is provided. */
+static struct pernet_operations cfg80211_pernet_ops = {
+	.exit = cfg80211_pernet_exit,
+};
+
+/*
+ * Subsystem initialisation for cfg80211.
+ *
+ * Registers, in order: the per-namespace ops, the wiphy sysfs class,
+ * the netdevice notifier, nl80211, the "ieee80211" debugfs directory,
+ * the regulatory core and the cfg80211 workqueue. If any step fails,
+ * everything set up before it is undone in reverse order.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init cfg80211_init(void)
+{
+	int err;
+
+	err = register_pernet_device(&cfg80211_pernet_ops);
+	if (err)
+		goto out_fail_pernet;
+
+	err = wiphy_sysfs_init();
+	if (err)
+		goto out_fail_sysfs;
+
+	err = register_netdevice_notifier(&cfg80211_netdev_notifier);
+	if (err)
+		goto out_fail_notifier;
+
+	err = nl80211_init();
+	if (err)
+		goto out_fail_nl80211;
+
+	ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL);
+
+	err = regulatory_init();
+	if (err)
+		goto out_fail_reg;
+
+	cfg80211_wq = create_singlethread_workqueue("cfg80211");
+	if (!cfg80211_wq) {
+		/*
+		 * err is still 0 from regulatory_init(); without setting it
+		 * we would report success while leaving cfg80211_wq NULL.
+		 */
+		err = -ENOMEM;
+		goto out_fail_wq;
+	}
+
+	return 0;
+
+out_fail_wq:
+	regulatory_exit();
+out_fail_reg:
+	debugfs_remove(ieee80211_debugfs_dir);
+	/* nl80211_init() succeeded on this path, so it must be undone too */
+	nl80211_exit();
+out_fail_nl80211:
+	unregister_netdevice_notifier(&cfg80211_netdev_notifier);
+out_fail_notifier:
+	wiphy_sysfs_exit();
+out_fail_sysfs:
+	unregister_pernet_device(&cfg80211_pernet_ops);
+out_fail_pernet:
+	return err;
+}
+subsys_initcall(cfg80211_init);
+
+/*
+ * Module exit: undo everything cfg80211_init() registered. The
+ * workqueue is destroyed last, after all other components are gone.
+ */
+static void __exit cfg80211_exit(void)
+{
+	debugfs_remove(ieee80211_debugfs_dir);
+	nl80211_exit();
+	unregister_netdevice_notifier(&cfg80211_netdev_notifier);
+	wiphy_sysfs_exit();
+	regulatory_exit();
+	unregister_pernet_device(&cfg80211_pernet_ops);
+	destroy_workqueue(cfg80211_wq);
+}
+module_exit(cfg80211_exit);
diff --git a/net/wireless/core.h b/net/wireless/core.h
new file mode 100644
index 00000000..a570ff92
--- /dev/null
+++ b/net/wireless/core.h
@@ -0,0 +1,464 @@
+/*
+ * Wireless configuration interface internals.
+ *
+ * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
+ */
+#ifndef __NET_WIRELESS_CORE_H
+#define __NET_WIRELESS_CORE_H
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/debugfs.h>
+#include <linux/rfkill.h>
+#include <linux/workqueue.h>
+#include <net/genetlink.h>
+#include <net/cfg80211.h>
+#include "reg.h"
+
+/*
+ * cfg80211's private per-device state. The driver-visible struct wiphy
+ * is embedded as the last member, so wiphy_to_dev() can recover this
+ * structure from a wiphy pointer.
+ */
+struct cfg80211_registered_device {
+	/* driver callbacks */
+	const struct cfg80211_ops *ops;
+	/* entry in cfg80211_rdev_list */
+	struct list_head list;
+	/* We hold this mutex during any call so that
+	 * we cannot do multiple calls at once, and also
+	 * to prevent the deregister call from proceeding
+	 * while any call is in progress. */
+	struct mutex mtx;
+
+	/* rfkill support */
+	struct rfkill_ops rfkill_ops;
+	struct rfkill *rfkill;
+	struct work_struct rfkill_sync;
+
+	/* ISO / IEC 3166 alpha2 for which this device is receiving
+	 * country IEs on, this can help disregard country IEs from APs
+	 * on the same alpha2 quickly. The alpha2 may differ from
+	 * cfg80211_regdomain's alpha2 when an intersection has occurred.
+	 * If the AP is reconfigured this can also be used to tell us if
+	 * the country on the country IE changed. */
+	char country_ie_alpha2[2];
+
+	/* If a Country IE has been received, this tells us the environment
+	 * it says it is in. This defaults to ENVIRON_ANY. */
+	enum environment_cap env;
+
+	/* wiphy index, internal only */
+	int wiphy_idx;
+
+	/* associated netdev list */
+	struct mutex devlist_mtx;
+	/* protected by devlist_mtx or RCU */
+	struct list_head netdev_list;
+	int devlist_generation;
+	int opencount; /* also protected by devlist_mtx */
+	/* woken when opencount is decremented */
+	wait_queue_head_t dev_wait;
+
+	/* BSSes/scanning */
+	spinlock_t bss_lock;
+	struct list_head bss_list;
+	struct rb_root bss_tree;
+	u32 bss_generation;
+	struct cfg80211_scan_request *scan_req; /* protected by RTNL */
+	struct cfg80211_sched_scan_request *sched_scan_req;
+	unsigned long suspend_at;
+	struct work_struct scan_done_wk;
+	struct work_struct sched_scan_results_wk;
+
+	struct mutex sched_scan_mtx;
+
+#ifdef CONFIG_NL80211_TESTMODE
+	struct genl_info *testmode_info;
+#endif
+
+	struct work_struct conn_work;
+	struct work_struct event_work;
+
+	/* freed by cfg80211_rdev_free_wowlan(); may be NULL */
+	struct cfg80211_wowlan *wowlan;
+
+	/* must be last because of the way we do wiphy_priv(),
+	 * and it should at least be aligned to NETDEV_ALIGN */
+	struct wiphy wiphy __attribute__((__aligned__(NETDEV_ALIGN)));
+};
+
+/*
+ * Recover the enclosing cfg80211_registered_device from a pointer to
+ * its embedded struct wiphy. A NULL wiphy is a BUG.
+ */
+static inline
+struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
+{
+	struct cfg80211_registered_device *rdev;
+
+	BUG_ON(!wiphy);
+	rdev = container_of(wiphy, struct cfg80211_registered_device, wiphy);
+
+	return rdev;
+}
+
+/* A wiphy index is valid when it is not negative; 0 is valid, hence phy0. */
+static inline
+bool wiphy_idx_valid(int wiphy_idx)
+{
+	return !(wiphy_idx < 0);
+}
+
+/*
+ * Free the device's WoWLAN configuration, if any: each pattern's mask,
+ * then the pattern array, then the structure itself. The rdev->wowlan
+ * pointer is left untouched.
+ */
+static inline void
+cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
+{
+	struct cfg80211_wowlan *wow = rdev->wowlan;
+	int idx;
+
+	if (wow) {
+		for (idx = 0; idx < wow->n_patterns; idx++)
+			kfree(wow->patterns[idx].mask);
+		kfree(wow->patterns);
+		kfree(wow);
+	}
+}
+
+extern struct workqueue_struct *cfg80211_wq;
+extern struct mutex cfg80211_mutex;
+extern struct list_head cfg80211_rdev_list;
+extern int cfg80211_rdev_list_generation;
+
+/* Lockdep assertion that the global cfg80211_mutex is currently held. */
+static inline void assert_cfg80211_lock(void)
+{
+	lockdep_assert_held(&cfg80211_mutex);
+}
+
+/*
+ * You can use this to mark a wiphy_idx as not having an associated wiphy.
+ * It guarantees cfg80211_rdev_by_wiphy_idx(wiphy_idx) will return NULL
+ */
+#define WIPHY_IDX_STALE -1
+
+/*
+ * cfg80211-internal wrapper around the public struct cfg80211_bss; use
+ * bss_from_pub() to get from the public part back to this structure.
+ */
+struct cfg80211_internal_bss {
+	struct list_head list;	/* list linkage (cf. rdev->bss_list) */
+	struct rb_node rbn;	/* tree linkage (cf. rdev->bss_tree) */
+	unsigned long ts;	/* presumably a jiffies timestamp - TODO confirm */
+	struct kref ref;	/* reference count, see cfg80211_ref_bss() */
+	atomic_t hold;		/* hold count, see cfg80211_hold_bss() */
+	bool beacon_ies_allocated;
+	bool proberesp_ies_allocated;
+
+	/* must be last because of priv member */
+	struct cfg80211_bss pub;
+};
+
+/* Map a public cfg80211_bss pointer back to its internal wrapper. */
+static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub)
+{
+	struct cfg80211_internal_bss *bss;
+
+	bss = container_of(pub, struct cfg80211_internal_bss, pub);
+	return bss;
+}
+
+/* Take an additional kref reference on an internal BSS entry. */
+static inline void cfg80211_ref_bss(struct cfg80211_internal_bss *bss)
+{
+	kref_get(&bss->ref);
+}
+
+/* Increment the BSS entry's hold counter; undone by cfg80211_unhold_bss(). */
+static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss)
+{
+	atomic_inc(&bss->hold);
+}
+
+/*
+ * Decrement the BSS entry's hold counter and warn if it went negative,
+ * i.e. if there were more unholds than holds.
+ */
+static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss)
+{
+	WARN_ON(atomic_dec_return(&bss->hold) < 0);
+}
+
+
+struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx);
+int get_wiphy_idx(struct wiphy *wiphy);
+
+struct cfg80211_registered_device *
+__cfg80211_rdev_from_info(struct genl_info *info);
+
+/*
+ * This function returns a pointer to the driver
+ * that the genl_info item that is passed refers to.
+ * If successful, it returns non-NULL and also locks
+ * the driver's mutex!
+ *
+ * This means that you need to call cfg80211_unlock_rdev()
+ * before being allowed to acquire &cfg80211_mutex!
+ *
+ * This is necessary because we need to lock the global
+ * mutex to get an item off the list safely, and then
+ * we lock the rdev mutex so it doesn't go away under us.
+ *
+ * We don't want to keep cfg80211_mutex locked
+ * for all the time in order to allow requests on
+ * other interfaces to go through at the same time.
+ *
+ * The result of this can be a PTR_ERR and hence must
+ * be checked with IS_ERR() for errors.
+ */
+extern struct cfg80211_registered_device *
+cfg80211_get_dev_from_info(struct genl_info *info);
+
+/* requires cfg80211_mutex to be held! */
+struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
+
+/* identical to cfg80211_get_dev_from_info but only operate on ifindex */
+extern struct cfg80211_registered_device *
+cfg80211_get_dev_from_ifindex(struct net *net, int ifindex);
+
+int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
+			  struct net *net);
+
+/* Acquire the per-device mutex (rdev->mtx); may sleep. */
+static inline void cfg80211_lock_rdev(struct cfg80211_registered_device *rdev)
+{
+	mutex_lock(&rdev->mtx);
+}
+
+/*
+ * Release the per-device mutex (rdev->mtx). Passing NULL or an
+ * ERR_PTR-encoded pointer is a BUG.
+ */
+static inline void cfg80211_unlock_rdev(struct cfg80211_registered_device *rdev)
+{
+	BUG_ON(!rdev || IS_ERR(rdev));
+
+	mutex_unlock(&rdev->mtx);
+}
+
+/*
+ * Acquire the per-interface mutex (wdev->mtx).
+ * NOTE(review): __acquires/__acquire look like static-checker context
+ * annotations with no runtime effect - confirm.
+ */
+static inline void wdev_lock(struct wireless_dev *wdev)
+	__acquires(wdev)
+{
+	mutex_lock(&wdev->mtx);
+	__acquire(wdev->mtx);
+}
+
+/*
+ * Release the per-interface mutex (wdev->mtx); counterpart of wdev_lock().
+ * NOTE(review): __releases/__release look like static-checker context
+ * annotations with no runtime effect - confirm.
+ */
+static inline void wdev_unlock(struct wireless_dev *wdev)
+	__releases(wdev)
+{
+	__release(wdev->mtx);
+	mutex_unlock(&wdev->mtx);
+}
+
+#define ASSERT_RDEV_LOCK(rdev) lockdep_assert_held(&(rdev)->mtx)
+#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx)
+
+/*
+ * Discriminator for struct cfg80211_event.
+ * NOTE(review): the mapping to the union members (cr/rm/dc/ij) is
+ * inferred from the names - confirm against the event producers.
+ */
+enum cfg80211_event_type {
+	EVENT_CONNECT_RESULT,
+	EVENT_ROAMED,
+	EVENT_DISCONNECTED,
+	EVENT_IBSS_JOINED,
+};
+
+/*
+ * One queued event: list linkage, a type tag and a per-type payload in
+ * an anonymous union.
+ * NOTE(review): which union member belongs to which type is inferred
+ * from the member names (cr = connect result, rm = roamed,
+ * dc = disconnected, ij = IBSS joined) - confirm.
+ */
+struct cfg80211_event {
+	struct list_head list;
+	enum cfg80211_event_type type;
+
+	union {
+		/* presumably EVENT_CONNECT_RESULT */
+		struct {
+			u8 bssid[ETH_ALEN];
+			const u8 *req_ie;
+			const u8 *resp_ie;
+			size_t req_ie_len;
+			size_t resp_ie_len;
+			u16 status;
+		} cr;
+		/* presumably EVENT_ROAMED */
+		struct {
+			struct ieee80211_channel *channel;
+			u8 bssid[ETH_ALEN];
+			const u8 *req_ie;
+			const u8 *resp_ie;
+			size_t req_ie_len;
+			size_t resp_ie_len;
+		} rm;
+		/* presumably EVENT_DISCONNECTED */
+		struct {
+			const u8 *ie;
+			size_t ie_len;
+			u16 reason;
+		} dc;
+		/* presumably EVENT_IBSS_JOINED */
+		struct {
+			u8 bssid[ETH_ALEN];
+		} ij;
+	};
+};
+
+/*
+ * Cache for up to six keys: parameters plus WLAN_MAX_KEY_LEN bytes of
+ * storage per slot.
+ * NOTE(review): def/defmgmt presumably hold the default (management)
+ * key indices - confirm against the users of this struct.
+ */
+struct cfg80211_cached_keys {
+	struct key_params params[6];
+	u8 data[6][WLAN_MAX_KEY_LEN];
+	int def, defmgmt;
+};
+
+
+/* free object */
+extern void cfg80211_dev_free(struct cfg80211_registered_device *rdev);
+
+extern int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
+			       char *newname);
+
+void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
+void wiphy_update_regulatory(struct wiphy *wiphy,
+			     enum nl80211_reg_initiator setby);
+
+void cfg80211_bss_expire(struct cfg80211_registered_device *dev);
+void cfg80211_bss_age(struct cfg80211_registered_device *dev,
+                      unsigned long age_secs);
+
+/* IBSS */
+int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 struct cfg80211_ibss_params *params,
+			 struct cfg80211_cached_keys *connkeys);
+int cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev,
+		       struct cfg80211_ibss_params *params,
+		       struct cfg80211_cached_keys *connkeys);
+void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
+int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev, bool nowext);
+int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+			struct net_device *dev, bool nowext);
+void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid);
+int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
+			    struct wireless_dev *wdev);
+
+/* mesh */
+extern const struct mesh_config default_mesh_config;
+extern const struct mesh_setup default_mesh_setup;
+int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 const struct mesh_setup *setup,
+			 const struct mesh_config *conf);
+int cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev,
+		       const struct mesh_setup *setup,
+		       const struct mesh_config *conf);
+int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+			struct net_device *dev);
+
+/* MLME */
+int __cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 struct ieee80211_channel *chan,
+			 enum nl80211_auth_type auth_type,
+			 const u8 *bssid,
+			 const u8 *ssid, int ssid_len,
+			 const u8 *ie, int ie_len,
+			 const u8 *key, int key_len, int key_idx,
+			 bool local_state_change);
+int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev, struct ieee80211_channel *chan,
+		       enum nl80211_auth_type auth_type, const u8 *bssid,
+		       const u8 *ssid, int ssid_len,
+		       const u8 *ie, int ie_len,
+		       const u8 *key, int key_len, int key_idx,
+		       bool local_state_change);
+int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev,
+			  struct ieee80211_channel *chan,
+			  const u8 *bssid, const u8 *prev_bssid,
+			  const u8 *ssid, int ssid_len,
+			  const u8 *ie, int ie_len, bool use_mfp,
+			  struct cfg80211_crypto_settings *crypt);
+int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+			struct net_device *dev, struct ieee80211_channel *chan,
+			const u8 *bssid, const u8 *prev_bssid,
+			const u8 *ssid, int ssid_len,
+			const u8 *ie, int ie_len, bool use_mfp,
+			struct cfg80211_crypto_settings *crypt);
+int __cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
+			   struct net_device *dev, const u8 *bssid,
+			   const u8 *ie, int ie_len, u16 reason,
+			   bool local_state_change);
+int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev, const u8 *bssid,
+			 const u8 *ie, int ie_len, u16 reason,
+			 bool local_state_change);
+int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
+			   struct net_device *dev, const u8 *bssid,
+			   const u8 *ie, int ie_len, u16 reason,
+			   bool local_state_change);
+void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
+			struct net_device *dev);
+void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
+			       const u8 *req_ie, size_t req_ie_len,
+			       const u8 *resp_ie, size_t resp_ie_len,
+			       u16 status, bool wextev,
+			       struct cfg80211_bss *bss);
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+				u16 frame_type, const u8 *match_data,
+				int match_len);
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev,
+			  struct ieee80211_channel *chan, bool offchan,
+			  enum nl80211_channel_type channel_type,
+			  bool channel_type_valid, unsigned int wait,
+			  const u8 *buf, size_t len, u64 *cookie);
+
+/* SME */
+int __cfg80211_connect(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev,
+		       struct cfg80211_connect_params *connect,
+		       struct cfg80211_cached_keys *connkeys,
+		       const u8 *prev_bssid);
+int cfg80211_connect(struct cfg80211_registered_device *rdev,
+		     struct net_device *dev,
+		     struct cfg80211_connect_params *connect,
+		     struct cfg80211_cached_keys *connkeys);
+int __cfg80211_disconnect(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev, u16 reason,
+			  bool wextev);
+int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
+			struct net_device *dev, u16 reason,
+			bool wextev);
+void __cfg80211_roamed(struct wireless_dev *wdev,
+		       struct ieee80211_channel *channel,
+		       const u8 *bssid,
+		       const u8 *req_ie, size_t req_ie_len,
+		       const u8 *resp_ie, size_t resp_ie_len);
+int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
+			      struct wireless_dev *wdev);
+
+void cfg80211_conn_work(struct work_struct *work);
+void cfg80211_sme_failed_assoc(struct wireless_dev *wdev);
+bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev);
+
+/* internal helpers */
+int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
+				   struct key_params *params, int key_idx,
+				   bool pairwise, const u8 *mac_addr);
+void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
+			     size_t ie_len, u16 reason, bool from_ap);
+void cfg80211_sme_scan_done(struct net_device *dev);
+void cfg80211_sme_rx_auth(struct net_device *dev, const u8 *buf, size_t len);
+void cfg80211_sme_disassoc(struct net_device *dev, int idx);
+void __cfg80211_scan_done(struct work_struct *wk);
+void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak);
+void __cfg80211_sched_scan_results(struct work_struct *wk);
+int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
+			       bool driver_initiated);
+void cfg80211_upload_connect_keys(struct wireless_dev *wdev);
+int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev, enum nl80211_iftype ntype,
+			  u32 *flags, struct vif_params *params);
+void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
+
+int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev,
+				  struct wireless_dev *wdev,
+				  enum nl80211_iftype iftype);
+
+/*
+ * Check whether an interface of type @iftype may be added to @rdev.
+ * Thin wrapper around cfg80211_can_change_interface() with a NULL wdev;
+ * returns whatever that function returns.
+ */
+static inline int
+cfg80211_can_add_interface(struct cfg80211_registered_device *rdev,
+			   enum nl80211_iftype iftype)
+{
+	return cfg80211_can_change_interface(rdev, NULL, iftype);
+}
+
+struct ieee80211_channel *
+rdev_freq_to_chan(struct cfg80211_registered_device *rdev,
+		  int freq, enum nl80211_channel_type channel_type);
+int cfg80211_set_freq(struct cfg80211_registered_device *rdev,
+		      struct wireless_dev *wdev, int freq,
+		      enum nl80211_channel_type channel_type);
+
+u16 cfg80211_calculate_bitrate(struct rate_info *rate);
+
+int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
+				 u32 beacon_int);
+
+#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
+#define CFG80211_DEV_WARN_ON(cond)	WARN_ON(cond)
+#else
+/*
+ * Trick to enable using it as a condition,
+ * and also not give a warning when it's
+ * not used that way.
+ */
+#define CFG80211_DEV_WARN_ON(cond)	({bool __r = (cond); __r; })
+#endif
+
+#endif /* __NET_WIRELESS_CORE_H */
diff --git a/net/wireless/db.txt b/net/wireless/db.txt
new file mode 100644
index 00000000..a2fc3a09
--- /dev/null
+++ b/net/wireless/db.txt
@@ -0,0 +1,17 @@
+#
+# This file is a placeholder to prevent accidental build breakage if someone
+# enables CONFIG_CFG80211_INTERNAL_REGDB.  Almost no one actually needs to
+# enable that build option.
+#
+# You should be using CRDA instead.  It is even better if you use the CRDA
+# package provided by your distribution, since they will probably keep it
+# up-to-date on your behalf.
+#
+# If you _really_ intend to use CONFIG_CFG80211_INTERNAL_REGDB then you will
+# need to replace this file with one containing appropriately formatted
+# regulatory rules that cover the regulatory domains you will be using.  Your
+# best option is to extract the db.txt file from the wireless-regdb git
+# repository:
+#
+#   git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git
+#
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
new file mode 100644
index 00000000..39765bcf
--- /dev/null
+++ b/net/wireless/debugfs.c
@@ -0,0 +1,121 @@
+/*
+ * cfg80211 debugfs
+ *
+ * Copyright 2009	Luis R. Rodriguez <lrodriguez@atheros.com>
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include "core.h"
+#include "debugfs.h"
+
+/*
+ * Generic debugfs open handler: copy the inode's private pointer into
+ * file->private_data so the read handlers can retrieve it.  Always
+ * returns 0.
+ */
+static int cfg80211_open_file_generic(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+/*
+ * DEBUGFS_READONLY_FILE - define a read handler and its file_operations
+ * @name:   prefix of the generated name_read() and name_ops symbols
+ * @buflen: size of the on-stack buffer the value is formatted into
+ * @fmt:    printf-style format; a "\n" is appended to it
+ * @value:  arguments for @fmt; they may reference the local variable
+ *          'wiphy', which is loaded from file->private_data
+ *
+ * The expansion ends with the closing "};" of the file_operations
+ * initializer, so no trailing semicolon is needed at the use site.
+ */
+#define DEBUGFS_READONLY_FILE(name, buflen, fmt, value...)		\
+static ssize_t name## _read(struct file *file, char __user *userbuf,	\
+			    size_t count, loff_t *ppos)			\
+{									\
+	struct wiphy *wiphy= file->private_data;		\
+	char buf[buflen];						\
+	int res;							\
+									\
+	res = scnprintf(buf, buflen, fmt "\n", ##value);		\
+	return simple_read_from_buffer(userbuf, count, ppos, buf, res);	\
+}									\
+									\
+static const struct file_operations name## _ops = {			\
+	.read = name## _read,						\
+	.open = cfg80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
+};
+
+/*
+ * Read-only debugfs files exposing per-wiphy settings.  The macro
+ * expansion already ends in "};", so the invocations take no trailing
+ * semicolon (an extra one is an empty file-scope declaration).
+ */
+DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
+		      wiphy->rts_threshold)
+DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d",
+		      wiphy->frag_threshold)
+DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d",
+		      wiphy->retry_short)
+DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d",
+		      wiphy->retry_long)
+
+/*
+ * Format one line describing @chan into @buf, starting at @offset.
+ *
+ * Disabled channels are printed as "<freq> Disabled".  Other channels are
+ * printed as "<freq> HT40 " followed by '-' and '+', each replaced by a
+ * space when the corresponding NO_HT40MINUS / NO_HT40PLUS flag is set.
+ *
+ * Returns the number of characters actually stored, not counting the
+ * terminating NUL.  scnprintf() is used instead of snprintf() because
+ * snprintf() returns the length the output *would* have had; on
+ * truncation that would push the caller's running offset past @buf_size
+ * and make it report more data than the buffer holds.
+ */
+static int ht_print_chan(struct ieee80211_channel *chan,
+			 char *buf, int buf_size, int offset)
+{
+	if (WARN_ON(offset > buf_size))
+		return 0;
+
+	if (chan->flags & IEEE80211_CHAN_DISABLED)
+		return scnprintf(buf + offset,
+				 buf_size - offset,
+				 "%d Disabled\n",
+				 chan->center_freq);
+
+	return scnprintf(buf + offset,
+			 buf_size - offset,
+			 "%d HT40 %c%c\n",
+			 chan->center_freq,
+			 (chan->flags & IEEE80211_CHAN_NO_HT40MINUS) ? ' ' : '-',
+			 (chan->flags & IEEE80211_CHAN_NO_HT40PLUS)  ? ' ' : '+');
+}
+
+/*
+ * debugfs read handler for "ht40allow_map".
+ *
+ * Builds, in a temporary PAGE_SIZE buffer, one line per channel of every
+ * band registered on the wiphy (see ht_print_chan()) while holding
+ * cfg80211_mutex, then copies the requested window to user space.
+ *
+ * Returns the number of bytes copied, -ENOMEM if the buffer cannot be
+ * allocated, or the negative error from simple_read_from_buffer().
+ */
+static ssize_t ht40allow_map_read(struct file *file,
+				  char __user *user_buf,
+				  size_t count, loff_t *ppos)
+{
+	struct wiphy *wiphy = file->private_data;
+	char *buf;
+	unsigned int offset = 0, buf_size = PAGE_SIZE, i;
+	/* signed, so a negative errno is not turned into a huge length */
+	ssize_t r;
+	enum ieee80211_band band;
+	struct ieee80211_supported_band *sband;
+
+	buf = kzalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	mutex_lock(&cfg80211_mutex);
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		sband = wiphy->bands[band];
+		if (!sband)
+			continue;
+		for (i = 0; i < sband->n_channels; i++)
+			offset += ht_print_chan(&sband->channels[i],
+						buf, buf_size, offset);
+	}
+
+	mutex_unlock(&cfg80211_mutex);
+
+	/* never advertise more data than the buffer actually holds */
+	if (offset > buf_size)
+		offset = buf_size;
+
+	r = simple_read_from_buffer(user_buf, count, ppos, buf, offset);
+
+	kfree(buf);
+
+	return r;
+}
+
+/* File operations for the "ht40allow_map" debugfs entry (read-only). */
+static const struct file_operations ht40allow_map_ops = {
+	.read = ht40allow_map_read,
+	.open = cfg80211_open_file_generic,
+	.llseek = default_llseek,
+};
+
+/*
+ * Create the world-readable debugfs file "name" under 'phyd', with
+ * &rdev->wiphy as its private data and name_ops as its operations.
+ * Relies on locals 'phyd' and 'rdev' at the point of use.  The macro
+ * body already ends with ';'.
+ */
+#define DEBUGFS_ADD(name)						\
+	debugfs_create_file(#name, S_IRUGO, phyd, &rdev->wiphy, &name## _ops);
+
+/*
+ * Populate the wiphy's debugfs directory (rdev->wiphy.debugfsdir) with
+ * the cfg80211 files defined in this file.  The return values of
+ * debugfs_create_file() are not checked.
+ */
+void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev)
+{
+	struct dentry *phyd = rdev->wiphy.debugfsdir;
+
+	DEBUGFS_ADD(rts_threshold);
+	DEBUGFS_ADD(fragmentation_threshold);
+	DEBUGFS_ADD(short_retry_limit);
+	DEBUGFS_ADD(long_retry_limit);
+	DEBUGFS_ADD(ht40allow_map);
+}
diff --git a/net/wireless/debugfs.h b/net/wireless/debugfs.h
new file mode 100644
index 00000000..74fdd381
--- /dev/null
+++ b/net/wireless/debugfs.h
@@ -0,0 +1,11 @@
+#ifndef __CFG80211_DEBUGFS_H
+#define __CFG80211_DEBUGFS_H
+
+/*
+ * cfg80211_debugfs_rdev_add() is a real function only when
+ * CONFIG_CFG80211_DEBUGFS is set; otherwise it is an empty inline stub
+ * so callers need no #ifdef.
+ */
+#ifdef CONFIG_CFG80211_DEBUGFS
+void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev);
+#else
+static inline
+void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) {}
+#endif
+
+#endif /* __CFG80211_DEBUGFS_H */
diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c
new file mode 100644
index 00000000..9bde4d1d
--- /dev/null
+++ b/net/wireless/ethtool.c
@@ -0,0 +1,78 @@
+#include <linux/utsname.h>
+#include <net/cfg80211.h>
+#include "core.h"
+#include "ethtool.h"
+
+/*
+ * ethtool .get_drvinfo: fill @info for a cfg80211 net_device.
+ *
+ *  - driver:     name of the driver bound to the wiphy's parent device
+ *  - version:    running kernel release string
+ *  - fw_version: wiphy->fw_version, or "N/A" when that string is empty
+ *  - bus_info:   device name of the wiphy's parent device
+ *
+ * Every field is copied with strlcpy() so the result is always
+ * NUL-terminated; strncpy() would leave fw_version unterminated when
+ * the source fills the whole destination.
+ */
+static void cfg80211_get_drvinfo(struct net_device *dev,
+					struct ethtool_drvinfo *info)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name,
+		sizeof(info->driver));
+
+	strlcpy(info->version, init_utsname()->release, sizeof(info->version));
+
+	if (wdev->wiphy->fw_version[0])
+		strlcpy(info->fw_version, wdev->wiphy->fw_version,
+			sizeof(info->fw_version));
+	else
+		strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+
+	strlcpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)),
+		sizeof(info->bus_info));
+}
+
+/* ethtool .get_regs_len: no register dump is provided, so always 0. */
+static int cfg80211_get_regs_len(struct net_device *dev)
+{
+	/* For now, return 0... */
+	return 0;
+}
+
+/*
+ * ethtool .get_regs: report the wiphy's hw_version as the register
+ * "version" and a zero-length dump; @data is left untouched.
+ */
+static void cfg80211_get_regs(struct net_device *dev, struct ethtool_regs *regs,
+			void *data)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	regs->version = wdev->wiphy->hw_version;
+	regs->len = 0;
+}
+
+/*
+ * ethtool .get_ringparam: zero @rp, then let the driver fill in the
+ * tx/rx pending and max-pending counts if it implements get_ringparam.
+ * Without the driver callback, @rp stays all zero.
+ */
+static void cfg80211_get_ringparam(struct net_device *dev,
+				   struct ethtool_ringparam *rp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+
+	memset(rp, 0, sizeof(*rp));
+
+	if (rdev->ops->get_ringparam)
+		rdev->ops->get_ringparam(wdev->wiphy,
+					 &rp->tx_pending, &rp->tx_max_pending,
+					 &rp->rx_pending, &rp->rx_max_pending);
+}
+
+/*
+ * ethtool .set_ringparam: forward the requested tx/rx ring sizes to the
+ * driver.
+ *
+ * Returns -EINVAL if a mini or jumbo rx ring size is requested, the
+ * driver's result if it implements set_ringparam, and -ENOTSUPP
+ * otherwise.
+ *
+ * NOTE(review): -ENOTSUPP is a kernel-internal errno value; -EOPNOTSUPP
+ * is what user space normally expects -- confirm before changing.
+ */
+static int cfg80211_set_ringparam(struct net_device *dev,
+				  struct ethtool_ringparam *rp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+
+	if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0)
+		return -EINVAL;
+
+	if (rdev->ops->set_ringparam)
+		return rdev->ops->set_ringparam(wdev->wiphy,
+						rp->tx_pending, rp->rx_pending);
+
+	return -ENOTSUPP;
+}
+
+/* ethtool operations table built from the cfg80211 handlers above. */
+const struct ethtool_ops cfg80211_ethtool_ops = {
+	.get_drvinfo = cfg80211_get_drvinfo,
+	.get_regs_len = cfg80211_get_regs_len,
+	.get_regs = cfg80211_get_regs,
+	.get_link = ethtool_op_get_link,
+	.get_ringparam = cfg80211_get_ringparam,
+	.set_ringparam = cfg80211_set_ringparam,
+};
diff --git a/net/wireless/ethtool.h b/net/wireless/ethtool.h
new file mode 100644
index 00000000..695ecad2
--- /dev/null
+++ b/net/wireless/ethtool.h
@@ -0,0 +1,6 @@
+#ifndef __CFG80211_ETHTOOL__
+#define __CFG80211_ETHTOOL__
+
+extern const struct ethtool_ops cfg80211_ethtool_ops;
+
+#endif /* __CFG80211_ETHTOOL__ */
diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk
new file mode 100644
index 00000000..53c143f5
--- /dev/null
+++ b/net/wireless/genregdb.awk
@@ -0,0 +1,119 @@
+#!/usr/bin/awk -f
+#
+# genregdb.awk -- generate regdb.c from db.txt
+#
+# Actually, it reads from stdin (presumed to be db.txt) and writes
+# to stdout (presumed to be regdb.c), but close enough...
+#
+# Copyright 2009 John W. Linville <linville@tuxdriver.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+
+# Emit the fixed header of the generated C file and start accumulating,
+# in 'regdb', the text of the reg_regdb[] pointer array that END prints.
+# 'active' is 1 while inside a country block; 'rules' counts the rules
+# seen in the current block.
+BEGIN {
+	active = 0
+	rules = 0;
+	print "/*"
+	print " * DO NOT EDIT -- file generated from data in db.txt"
+	print " */"
+	print ""
+	print "#include <linux/nl80211.h>"
+	print "#include <net/cfg80211.h>"
+	print "#include \"regdb.h\""
+	print ""
+	regdb = "const struct ieee80211_regdomain *reg_regdb[] = {\n"
+}
+
+/^[ \t]*#/ {
+	# Comment line: skip it entirely.  Without "next" awk would go on to
+	# test the remaining rules, so a comment that merely contains the
+	# word "country" would start a bogus regdomain.
+	next
+}
+
+!active && /^[ \t]*$/ {
+	# Blank line outside a country block: nothing to do.
+	next
+}
+
+# Start of a country block: the second field is the alpha2 code followed
+# by ':'.  Open the regdomain initializer, enter the active state and
+# append a pointer to it to the reg_regdb[] text.
+!active && /country/ {
+	country=$2
+	sub(/:/, "", country)
+	printf "static const struct ieee80211_regdomain regdom_%s = {\n", country
+	printf "\t.alpha2 = \"%s\",\n", country
+	printf "\t.reg_rules = {\n"
+	active = 1
+	regdb = regdb "\t&regdom_" country ",\n"
+}
+
+# One regulatory rule inside a country block.  Fields are picked by
+# position: $1 start frequency, $3 end frequency, $5 bandwidth, $6 antenna
+# gain, $7 power, $8 onward optional power unit and comma-separated flags.
+# Emits one REG_RULE() initializer line and bumps the rule count.
+active && /^[ \t]*\(/ {
+	start = $1
+	sub(/\(/, "", start)
+	end = $3
+	bw = $5
+	sub(/\),/, "", bw)
+	gain = $6
+	sub(/\(/, "", gain)
+	sub(/,/, "", gain)
+	power = $7
+	sub(/\)/, "", power)
+	sub(/,/, "", power)
+	# power might be in mW...
+	units = $8
+	sub(/\)/, "", units)
+	sub(/,/, "", units)
+	# Only these four mW values are converted to dBm; anything else
+	# prints a message into the generated output.
+	if (units == "mW") {
+		if (power == 100) {
+			power = 20
+		} else if (power == 200) {
+			power = 23
+		} else if (power == 500) {
+			power = 27
+		} else if (power == 1000) {
+			power = 30
+		} else {
+			print "Unknown power value in database!"
+		}
+	}
+	# Join fields 8..NF and split on ',' to get the individual flags;
+	# tokens that match no known flag name are silently ignored.
+	flagstr = ""
+	for (i=8; i<=NF; i++)
+		flagstr = flagstr $i
+	split(flagstr, flagarray, ",")
+	flags = ""
+	for (arg in flagarray) {
+		if (flagarray[arg] == "NO-OFDM") {
+			flags = flags "\n\t\t\tNL80211_RRF_NO_OFDM | "
+		} else if (flagarray[arg] == "NO-CCK") {
+			flags = flags "\n\t\t\tNL80211_RRF_NO_CCK | "
+		} else if (flagarray[arg] == "NO-INDOOR") {
+			flags = flags "\n\t\t\tNL80211_RRF_NO_INDOOR | "
+		} else if (flagarray[arg] == "NO-OUTDOOR") {
+			flags = flags "\n\t\t\tNL80211_RRF_NO_OUTDOOR | "
+		} else if (flagarray[arg] == "DFS") {
+			flags = flags "\n\t\t\tNL80211_RRF_DFS | "
+		} else if (flagarray[arg] == "PTP-ONLY") {
+			flags = flags "\n\t\t\tNL80211_RRF_PTP_ONLY | "
+		} else if (flagarray[arg] == "PTMP-ONLY") {
+			flags = flags "\n\t\t\tNL80211_RRF_PTMP_ONLY | "
+		} else if (flagarray[arg] == "PASSIVE-SCAN") {
+			flags = flags "\n\t\t\tNL80211_RRF_PASSIVE_SCAN | "
+		} else if (flagarray[arg] == "NO-IBSS") {
+			flags = flags "\n\t\t\tNL80211_RRF_NO_IBSS | "
+		}
+	}
+	# Terminate the "A | B | " chain so the expression is always valid.
+	flags = flags "0"
+	printf "\t\tREG_RULE(%d, %d, %d, %d, %d, %s),\n", start, end, bw, gain, power, flags
+	rules++
+}
+
+# A blank line ends the current country block: close the reg_rules array,
+# emit the rule count, close the struct and reset the per-country state.
+active && /^[ \t]*$/ {
+	active = 0
+	printf "\t},\n"
+	printf "\t.n_reg_rules = %d\n", rules
+	printf "};\n\n"
+	rules = 0;
+}
+
+END {
+	# If the input did not end with a blank line the last country block
+	# is still open; close it here so the generated C stays well-formed.
+	if (active) {
+		printf "\t},\n"
+		printf "\t.n_reg_rules = %d\n", rules
+		printf "};\n\n"
+	}
+	# Emit the accumulated pointer array and its element count.
+	print regdb "};"
+	print ""
+	print "int reg_regdb_size = ARRAY_SIZE(reg_regdb);"
+}
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
new file mode 100644
index 00000000..f33fbb79
--- /dev/null
+++ b/net/wireless/ibss.c
@@ -0,0 +1,526 @@
+/*
+ * Some IBSS support code for cfg80211.
+ *
+ * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <net/cfg80211.h>
+#include "wext-compat.h"
+#include "nl80211.h"
+
+
+/*
+ * Process an "IBSS joined" notification for @dev with the given @bssid.
+ *
+ * Does nothing (after a warning) if the interface is not in ad-hoc mode,
+ * and nothing if no SSID is set.  Otherwise it looks up the matching IBSS
+ * BSS entry, makes it wdev->current_bss (releasing the previous one, if
+ * any), uploads the cached connect keys and notifies user space via
+ * nl80211 and, with CONFIG_CFG80211_WEXT, via a SIOCGIWAP wext event.
+ */
+void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_bss *bss;
+#ifdef CONFIG_CFG80211_WEXT
+	union iwreq_data wrqu;
+#endif
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return;
+
+	if (!wdev->ssid_len)
+		return;
+
+	bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
+			       wdev->ssid, wdev->ssid_len,
+			       WLAN_CAPABILITY_IBSS, WLAN_CAPABILITY_IBSS);
+
+	if (WARN_ON(!bss))
+		return;
+
+	/* drop the hold and reference on the BSS being replaced */
+	if (wdev->current_bss) {
+		cfg80211_unhold_bss(wdev->current_bss);
+		cfg80211_put_bss(&wdev->current_bss->pub);
+	}
+
+	cfg80211_hold_bss(bss_from_pub(bss));
+	wdev->current_bss = bss_from_pub(bss);
+
+	cfg80211_upload_connect_keys(wdev);
+
+	nl80211_send_ibss_bssid(wiphy_to_dev(wdev->wiphy), dev, bssid,
+				GFP_KERNEL);
+#ifdef CONFIG_CFG80211_WEXT
+	memset(&wrqu, 0, sizeof(wrqu));
+	memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+	wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+#endif
+}
+
+/*
+ * Exported entry point to report that @dev joined an IBSS.
+ *
+ * Allocates an EVENT_IBSS_JOINED event with @gfp, records @bssid in it,
+ * appends it to the wdev's event list under event_lock and queues the
+ * rdev's event_work on cfg80211_wq.  If the allocation fails the
+ * notification is dropped without any error indication.
+ */
+void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_event *ev;
+	unsigned long flags;
+
+	CFG80211_DEV_WARN_ON(!wdev->ssid_len);
+
+	ev = kzalloc(sizeof(*ev), gfp);
+	if (!ev)
+		return;
+
+	ev->type = EVENT_IBSS_JOINED;
+	memcpy(ev->cr.bssid, bssid, ETH_ALEN);
+
+	spin_lock_irqsave(&wdev->event_lock, flags);
+	list_add_tail(&ev->list, &wdev->event_list);
+	spin_unlock_irqrestore(&wdev->event_lock, flags);
+	queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_ibss_joined);
+
+/*
+ * Ask the driver to join/create the IBSS described by @params.
+ * Must be called with the wdev lock held.
+ *
+ * Returns -EALREADY if an SSID is already set on the wdev, the driver's
+ * error if join_ibss fails, and 0 on success.  On success the SSID is
+ * recorded in the wdev and @connkeys stays attached as
+ * wdev->connect_keys.  On driver failure the connect_keys pointer is
+ * reset to NULL but @connkeys is not freed here.
+ */
+int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 struct cfg80211_ibss_params *params,
+			 struct cfg80211_cached_keys *connkeys)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (wdev->ssid_len)
+		return -EALREADY;
+
+	if (!params->basic_rates) {
+		/*
+		* If no rates were explicitly configured,
+		* use the mandatory rate set for 11b or
+		* 11a for maximum compatibility.
+		*/
+		struct ieee80211_supported_band *sband =
+			rdev->wiphy.bands[params->channel->band];
+		int j;
+		u32 flag = params->channel->band == IEEE80211_BAND_5GHZ ?
+			IEEE80211_RATE_MANDATORY_A :
+			IEEE80211_RATE_MANDATORY_B;
+
+		/* basic_rates is a bitmap indexed by position in bitrates[] */
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].flags & flag)
+				params->basic_rates |= BIT(j);
+		}
+	}
+
+	/* leftover keys are unexpected here; warn and free them */
+	if (WARN_ON(wdev->connect_keys))
+		kfree(wdev->connect_keys);
+	wdev->connect_keys = connkeys;
+
+#ifdef CONFIG_CFG80211_WEXT
+	wdev->wext.ibss.channel = params->channel;
+#endif
+	err = rdev->ops->join_ibss(&rdev->wiphy, dev, params);
+	if (err) {
+		wdev->connect_keys = NULL;
+		return err;
+	}
+
+	memcpy(wdev->ssid, params->ssid, params->ssid_len);
+	wdev->ssid_len = params->ssid_len;
+
+	return 0;
+}
+
+/*
+ * Locked wrapper around __cfg80211_join_ibss(): takes rdev->devlist_mtx
+ * and then the wdev lock, calls the worker, and releases both in reverse
+ * order.  Returns the worker's result.
+ */
+int cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev,
+		       struct cfg80211_ibss_params *params,
+		       struct cfg80211_cached_keys *connkeys)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+	err = __cfg80211_join_ibss(rdev, dev, params, connkeys);
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return err;
+}
+
+/*
+ * Reset the IBSS state of @dev.  Must be called with the wdev lock held.
+ *
+ * Frees the cached connect keys, asks the driver to delete key indices
+ * 0..5 (if it implements del_key), releases current_bss and clears the
+ * SSID length.  Unless @nowext is true, the wext-configured SSID length
+ * is cleared as well (CONFIG_CFG80211_WEXT only).
+ */
+static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int i;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	kfree(wdev->connect_keys);
+	wdev->connect_keys = NULL;
+
+	/*
+	 * Delete all the keys ... pairwise keys can't really
+	 * exist any more anyway, but default keys might.
+	 */
+	if (rdev->ops->del_key)
+		for (i = 0; i < 6; i++)
+			rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
+
+	if (wdev->current_bss) {
+		cfg80211_unhold_bss(wdev->current_bss);
+		cfg80211_put_bss(&wdev->current_bss->pub);
+	}
+
+	wdev->current_bss = NULL;
+	wdev->ssid_len = 0;
+#ifdef CONFIG_CFG80211_WEXT
+	if (!nowext)
+		wdev->wext.ibss.ssid_len = 0;
+#endif
+}
+
+/* Locked wrapper: run __cfg80211_clear_ibss() under the wdev lock. */
+void cfg80211_clear_ibss(struct net_device *dev, bool nowext)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	wdev_lock(wdev);
+	__cfg80211_clear_ibss(dev, nowext);
+	wdev_unlock(wdev);
+}
+
+/*
+ * Leave the current IBSS.  Must be called with the wdev lock held.
+ *
+ * Returns -ENOLINK if no SSID is set, the driver's error if leave_ibss
+ * fails (state is then left untouched), and 0 after the local IBSS state
+ * has been cleared.  @nowext is passed through to
+ * __cfg80211_clear_ibss().
+ */
+int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev, bool nowext)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (!wdev->ssid_len)
+		return -ENOLINK;
+
+	err = rdev->ops->leave_ibss(&rdev->wiphy, dev);
+
+	if (err)
+		return err;
+
+	__cfg80211_clear_ibss(dev, nowext);
+
+	return 0;
+}
+
+/*
+ * Locked wrapper: run __cfg80211_leave_ibss() under the wdev lock and
+ * return its result.
+ */
+int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
+			struct net_device *dev, bool nowext)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	wdev_lock(wdev);
+	err = __cfg80211_leave_ibss(rdev, dev, nowext);
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+#ifdef CONFIG_CFG80211_WEXT
+/*
+ * Try to join an IBSS using the parameters configured through wireless
+ * extensions (wdev->wext.ibss).  Must be called with the wdev lock held.
+ *
+ * Defaults the beacon interval to 100 when unset and, if no channel was
+ * requested, picks the first channel that is neither disabled nor marked
+ * NO_IBSS.  Returns 0 without joining when no SSID is configured or the
+ * netdev is not running.  Returns -EINVAL if no usable channel exists,
+ * -ENOMEM if the key copy cannot be allocated, otherwise the result of
+ * __cfg80211_join_ibss().
+ */
+int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
+			    struct wireless_dev *wdev)
+{
+	struct cfg80211_cached_keys *ck = NULL;
+	enum ieee80211_band band;
+	int i, err;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (!wdev->wext.ibss.beacon_interval)
+		wdev->wext.ibss.beacon_interval = 100;
+
+	/* try to find an IBSS channel if none requested ... */
+	if (!wdev->wext.ibss.channel) {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+			struct ieee80211_supported_band *sband;
+			struct ieee80211_channel *chan;
+
+			sband = rdev->wiphy.bands[band];
+			if (!sband)
+				continue;
+
+			for (i = 0; i < sband->n_channels; i++) {
+				chan = &sband->channels[i];
+				if (chan->flags & IEEE80211_CHAN_NO_IBSS)
+					continue;
+				if (chan->flags & IEEE80211_CHAN_DISABLED)
+					continue;
+				wdev->wext.ibss.channel = chan;
+				break;
+			}
+
+			if (wdev->wext.ibss.channel)
+				break;
+		}
+
+		if (!wdev->wext.ibss.channel)
+			return -EINVAL;
+	}
+
+	/* don't join -- SSID is not there */
+	if (!wdev->wext.ibss.ssid_len)
+		return 0;
+
+	if (!netif_running(wdev->netdev))
+		return 0;
+
+	if (wdev->wext.keys) {
+		wdev->wext.keys->def = wdev->wext.default_key;
+		wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
+	}
+
+	wdev->wext.ibss.privacy = wdev->wext.default_key != -1;
+
+	if (wdev->wext.keys) {
+		ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
+		if (!ck)
+			return -ENOMEM;
+		/* re-point the key pointers at the copy's own data[] */
+		for (i = 0; i < 6; i++)
+			ck->params[i].key = ck->data[i];
+	}
+	err = __cfg80211_join_ibss(rdev, wdev->netdev,
+				   &wdev->wext.ibss, ck);
+	/* on failure the key copy was not kept by the callee; free it */
+	if (err)
+		kfree(ck);
+
+	return err;
+}
+
+/*
+ * Wireless-extensions handler to set the IBSS frequency/channel.
+ *
+ * Returns -EINVAL if the interface is not ad-hoc or the requested
+ * channel is unknown, disabled or marked NO_IBSS; -EOPNOTSUPP if the
+ * driver has no join_ibss; a negative value from cfg80211_wext_freq();
+ * and 0 if the channel is unchanged.  Otherwise it leaves any current
+ * IBSS, records the new channel (a frequency of 0 means "not fixed") and
+ * attempts to rejoin via cfg80211_ibss_wext_join().
+ */
+int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_freq *wextfreq, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct ieee80211_channel *chan = NULL;
+	int err, freq;
+
+	/* call only for ibss! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return -EINVAL;
+
+	if (!rdev->ops->join_ibss)
+		return -EOPNOTSUPP;
+
+	freq = cfg80211_wext_freq(wdev->wiphy, wextfreq);
+	if (freq < 0)
+		return freq;
+
+	if (freq) {
+		chan = ieee80211_get_channel(wdev->wiphy, freq);
+		if (!chan)
+			return -EINVAL;
+		if (chan->flags & IEEE80211_CHAN_NO_IBSS ||
+		    chan->flags & IEEE80211_CHAN_DISABLED)
+			return -EINVAL;
+	}
+
+	if (wdev->wext.ibss.channel == chan)
+		return 0;
+
+	/* leave first; nowext=true keeps the wext-configured SSID */
+	wdev_lock(wdev);
+	err = 0;
+	if (wdev->ssid_len)
+		err = __cfg80211_leave_ibss(rdev, dev, true);
+	wdev_unlock(wdev);
+
+	if (err)
+		return err;
+
+	if (chan) {
+		wdev->wext.ibss.channel = chan;
+		wdev->wext.ibss.channel_fixed = true;
+	} else {
+		/* cfg80211_ibss_wext_join will pick one if needed */
+		wdev->wext.ibss.channel_fixed = false;
+	}
+
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+	err = cfg80211_ibss_wext_join(rdev, wdev);
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return err;
+}
+
+/*
+ * Wireless-extensions handler to report the IBSS frequency.
+ *
+ * The channel of the current BSS is used when there is one, otherwise
+ * the channel configured through wext.  On success @freq receives the
+ * centre frequency as mantissa with exponent 6 and 0 is returned.
+ * Returns -EINVAL if the interface is not ad-hoc or no channel is known.
+ */
+int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_freq *freq, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct ieee80211_channel *cur;
+
+	/* call only for ibss! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return -EINVAL;
+
+	/* sample the channel pointer under the wdev lock */
+	wdev_lock(wdev);
+	cur = wdev->current_bss ? wdev->current_bss->pub.channel
+				: wdev->wext.ibss.channel;
+	wdev_unlock(wdev);
+
+	/* no channel if not joining */
+	if (!cur)
+		return -EINVAL;
+
+	freq->m = cur->center_freq;
+	freq->e = 6;
+	return 0;
+}
+
+/*
+ * Wireless-extensions handler to set the IBSS SSID.
+ *
+ * Leaves any current IBSS, stores the new SSID (dropping one trailing
+ * NUL if present) and attempts to join via cfg80211_ibss_wext_join().
+ * Returns -EINVAL if the interface is not ad-hoc, -EOPNOTSUPP if the
+ * driver has no join_ibss, or the error from leaving/joining.
+ *
+ * NOTE(review): @data->length is not range-checked here before the
+ * memcpy into wdev->ssid -- presumably bounded by the wext core; confirm.
+ */
+int cfg80211_ibss_wext_siwessid(struct net_device *dev,
+				struct iw_request_info *info,
+				struct iw_point *data, char *ssid)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	size_t len = data->length;
+	int err;
+
+	/* call only for ibss! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return -EINVAL;
+
+	if (!rdev->ops->join_ibss)
+		return -EOPNOTSUPP;
+
+	wdev_lock(wdev);
+	err = 0;
+	if (wdev->ssid_len)
+		err = __cfg80211_leave_ibss(rdev, dev, true);
+	wdev_unlock(wdev);
+
+	if (err)
+		return err;
+
+	/* iwconfig uses nul termination in SSID.. */
+	if (len > 0 && ssid[len - 1] == '\0')
+		len--;
+
+	/* the wext join parameters use wdev->ssid as their SSID storage */
+	wdev->wext.ibss.ssid = wdev->ssid;
+	memcpy(wdev->wext.ibss.ssid, ssid, len);
+	wdev->wext.ibss.ssid_len = len;
+
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+	err = cfg80211_ibss_wext_join(rdev, wdev);
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return err;
+}
+
+/*
+ * Wireless-extensions handler to report the IBSS SSID.
+ *
+ * Copies the active SSID into @ssid if one is set, otherwise the SSID
+ * configured through wext, and sets data->flags to 1 and data->length
+ * accordingly.  With neither available, data->flags is 0 and @ssid is
+ * untouched.  Returns -EINVAL if the interface is not ad-hoc, else 0.
+ */
+int cfg80211_ibss_wext_giwessid(struct net_device *dev,
+				struct iw_request_info *info,
+				struct iw_point *data, char *ssid)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	/* call only for ibss! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return -EINVAL;
+
+	data->flags = 0;
+
+	wdev_lock(wdev);
+	if (wdev->ssid_len) {
+		data->flags = 1;
+		data->length = wdev->ssid_len;
+		memcpy(ssid, wdev->ssid, data->length);
+	} else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) {
+		data->flags = 1;
+		data->length = wdev->wext.ibss.ssid_len;
+		memcpy(ssid, wdev->wext.ibss.ssid, data->length);
+	}
+	wdev_unlock(wdev);
+
+	return 0;
+}
+
+/*
+ * Wireless-extensions handler to set (or clear) a fixed IBSS BSSID.
+ *
+ * An all-zero or broadcast address selects automatic mode (no fixed
+ * BSSID).  Returns 0 early when the request does not change anything.
+ * Otherwise it leaves any current IBSS, records the new BSSID setting
+ * and attempts to rejoin via cfg80211_ibss_wext_join().  Returns -EINVAL
+ * if the interface is not ad-hoc or the address family is not
+ * ARPHRD_ETHER, and -EOPNOTSUPP if the driver has no join_ibss.
+ */
+int cfg80211_ibss_wext_siwap(struct net_device *dev,
+			     struct iw_request_info *info,
+			     struct sockaddr *ap_addr, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	u8 *bssid = ap_addr->sa_data;
+	int err;
+
+	/* call only for ibss! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return -EINVAL;
+
+	if (!rdev->ops->join_ibss)
+		return -EOPNOTSUPP;
+
+	if (ap_addr->sa_family != ARPHRD_ETHER)
+		return -EINVAL;
+
+	/* automatic mode */
+	if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid))
+		bssid = NULL;
+
+	/* both automatic */
+	if (!bssid && !wdev->wext.ibss.bssid)
+		return 0;
+
+	/* fixed already - and no change */
+	if (wdev->wext.ibss.bssid && bssid &&
+	    compare_ether_addr(bssid, wdev->wext.ibss.bssid) == 0)
+		return 0;
+
+	wdev_lock(wdev);
+	err = 0;
+	if (wdev->ssid_len)
+		err = __cfg80211_leave_ibss(rdev, dev, true);
+	wdev_unlock(wdev);
+
+	if (err)
+		return err;
+
+	if (bssid) {
+		/* keep a private copy; the join params point at it */
+		memcpy(wdev->wext.bssid, bssid, ETH_ALEN);
+		wdev->wext.ibss.bssid = wdev->wext.bssid;
+	} else
+		wdev->wext.ibss.bssid = NULL;
+
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+	err = cfg80211_ibss_wext_join(rdev, wdev);
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return err;
+}
+
+/*
+ * Wireless-extensions handler to report the IBSS BSSID.
+ *
+ * Fills @ap_addr with, in order of preference, the BSSID of the current
+ * BSS, the BSSID fixed through wext, or all zeroes.  Returns -EINVAL if
+ * the interface is not ad-hoc, otherwise 0.
+ */
+int cfg80211_ibss_wext_giwap(struct net_device *dev,
+			     struct iw_request_info *info,
+			     struct sockaddr *ap_addr, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	/* call only for ibss! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC))
+		return -EINVAL;
+
+	ap_addr->sa_family = ARPHRD_ETHER;
+
+	wdev_lock(wdev);
+	if (wdev->current_bss)
+		memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid, ETH_ALEN);
+	else if (wdev->wext.ibss.bssid)
+		memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN);
+	else
+		memset(ap_addr->sa_data, 0, ETH_ALEN);
+
+	wdev_unlock(wdev);
+
+	return 0;
+}
+#endif
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
new file mode 100644
index 00000000..3268fac5
--- /dev/null
+++ b/net/wireless/lib80211.c
@@ -0,0 +1,286 @@
+/*
+ * lib80211 -- common bits for IEEE802.11 drivers
+ *
+ * Copyright(c) 2008 John W. Linville <linville@tuxdriver.com>
+ *
+ * Portions copied from old ieee80211 component, w/ original copyright
+ * notices below:
+ *
+ * Host AP crypto routines
+ *
+ * Copyright (c) 2002-2003, Jouni Malinen <j@w1.fi>
+ * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com>
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/ieee80211.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <net/lib80211.h>
+
+#define DRV_NAME        "lib80211"
+
+#define DRV_DESCRIPTION	"common routines for IEEE802.11 drivers"
+
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
+MODULE_LICENSE("GPL");
+
+/* One registered crypto algorithm; linked into lib80211_crypto_algs. */
+struct lib80211_crypto_alg {
+	struct list_head list;		/* entry in lib80211_crypto_algs */
+	struct lib80211_crypto_ops *ops;	/* ops supplied at registration */
+};
+
+static LIST_HEAD(lib80211_crypto_algs);
+static DEFINE_SPINLOCK(lib80211_crypto_lock);
+
+/*
+ * print_ssid - render an SSID as a printable, NUL-terminated string
+ * @buf:      output buffer
+ * @ssid:     raw SSID bytes (need not be NUL-terminated)
+ * @ssid_len: number of bytes in @ssid; clamped to IEEE80211_MAX_SSID_LEN
+ *
+ * Printable characters are copied as-is.  NUL, newline, carriage return
+ * and tab become "\0", "\n", "\r" and "\t"; every other byte becomes a
+ * backslash followed by exactly three octal digits.  Each input byte
+ * therefore produces at most four output characters, so @buf must hold
+ * at least 4 * IEEE80211_MAX_SSID_LEN + 1 bytes.
+ *
+ * Returns @buf.
+ */
+const char *print_ssid(char *buf, const char *ssid, u8 ssid_len)
+{
+	const char *s = ssid;
+	char *d = buf;
+
+	ssid_len = min_t(u8, ssid_len, IEEE80211_MAX_SSID_LEN);
+	while (ssid_len--) {
+		if (isprint(*s)) {
+			*d++ = *s++;
+			continue;
+		}
+
+		*d++ = '\\';
+		if (*s == '\0')
+			*d++ = '0';
+		else if (*s == '\n')
+			*d++ = 'n';
+		else if (*s == '\r')
+			*d++ = 'r';
+		else if (*s == '\t')
+			*d++ = 't';
+		else if (*s == '\\')
+			*d++ = '\\';
+		else
+			/*
+			 * Size 4 = three octal digits plus the NUL; a size
+			 * of 3 would store only two digits while snprintf()
+			 * still reports three, leaving a NUL in mid-string.
+			 * The cast keeps a negative (signed) char from being
+			 * formatted as an 11-digit value, which would move
+			 * 'd' past the end of the buffer.
+			 */
+			d += snprintf(d, 4, "%03o", (unsigned char)*s);
+		s++;
+	}
+	*d = '\0';
+	return buf;
+}
+EXPORT_SYMBOL(print_ssid);
+
+/*
+ * Initialise @info: zero it, record @name and @lock (both are stored by
+ * pointer, not copied), and set up the empty delayed-deinit list and its
+ * timer, whose handler is lib80211_crypt_deinit_handler().
+ * Always returns 0.
+ */
+int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name,
+				spinlock_t *lock)
+{
+	memset(info, 0, sizeof(*info));
+
+	info->name = name;
+	info->lock = lock;
+
+	INIT_LIST_HEAD(&info->crypt_deinit_list);
+	setup_timer(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler,
+			(unsigned long)info);
+
+	return 0;
+}
+EXPORT_SYMBOL(lib80211_crypt_info_init);
+
+/*
+ * Tear down everything owned by @info: stop accepting delayed deinits,
+ * stop the deinit timer, force-release the entries still on the delayed
+ * list, then deinit and free every installed crypt slot.
+ */
+void lib80211_crypt_info_free(struct lib80211_crypt_info *info)
+{
+	int idx;
+
+	lib80211_crypt_quiescing(info);
+	del_timer_sync(&info->crypt_deinit_timer);
+	lib80211_crypt_deinit_entries(info, 1);
+
+	for (idx = 0; idx < NUM_WEP_KEYS; idx++) {
+		struct lib80211_crypt_data *slot = info->crypt[idx];
+
+		if (!slot)
+			continue;
+
+		/* release the algorithm's private state and module ref */
+		if (slot->ops) {
+			slot->ops->deinit(slot->priv);
+			module_put(slot->ops->owner);
+		}
+		kfree(slot);
+		info->crypt[idx] = NULL;
+	}
+}
+EXPORT_SYMBOL(lib80211_crypt_info_free);
+
+/*
+ * Release entries queued on @info's delayed-deinit list.
+ *
+ * Entries whose refcnt is non-zero are skipped unless @force is set.
+ * Each released entry is unlinked, its ops->deinit() is called and the
+ * module reference dropped (when ops is set), and then it is freed.
+ * The whole walk runs under info->lock with interrupts disabled.
+ */
+void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, int force)
+{
+	struct lib80211_crypt_data *entry, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(info->lock, flags);
+	list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) {
+		if (atomic_read(&entry->refcnt) != 0 && !force)
+			continue;
+
+		list_del(&entry->list);
+
+		if (entry->ops) {
+			entry->ops->deinit(entry->priv);
+			module_put(entry->ops->owner);
+		}
+		kfree(entry);
+	}
+	spin_unlock_irqrestore(info->lock, flags);
+}
+EXPORT_SYMBOL(lib80211_crypt_deinit_entries);
+
+/*
+ * After this, crypt_deinit_list won't accept new members.
+ *
+ * Sets info->crypt_quiesced under info->lock; the flag is tested by
+ * lib80211_crypt_delayed_deinit() and lib80211_crypt_deinit_handler().
+ */
+void lib80211_crypt_quiescing(struct lib80211_crypt_info *info)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(info->lock, flags);
+	info->crypt_quiesced = 1;
+	spin_unlock_irqrestore(info->lock, flags);
+}
+EXPORT_SYMBOL(lib80211_crypt_quiescing);
+
+/*
+ * Timer callback for the delayed-deinit list; @data is the
+ * struct lib80211_crypt_info pointer cast to unsigned long.
+ *
+ * Releases every entry that is no longer referenced.  If entries remain
+ * and the info has not been quiesced, the timer is re-armed to fire
+ * again in one second (HZ jiffies).
+ */
+void lib80211_crypt_deinit_handler(unsigned long data)
+{
+	struct lib80211_crypt_info *info = (struct lib80211_crypt_info *)data;
+	unsigned long flags;
+
+	lib80211_crypt_deinit_entries(info, 0);
+
+	spin_lock_irqsave(info->lock, flags);
+	if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) {
+		printk(KERN_DEBUG "%s: entries remaining in delayed crypt "
+		       "deletion list\n", info->name);
+		info->crypt_deinit_timer.expires = jiffies + HZ;
+		add_timer(&info->crypt_deinit_timer);
+	}
+	spin_unlock_irqrestore(info->lock, flags);
+}
+EXPORT_SYMBOL(lib80211_crypt_deinit_handler);
+
+/*
+ * Detach the crypt data at *@crypt and queue it for deferred release.
+ *
+ * *@crypt is set to NULL immediately.  Unless @info has been quiesced,
+ * the detached entry is added to the delayed-deinit list and the deinit
+ * timer is armed for one second (HZ jiffies) if it is not already
+ * pending.  A NULL *@crypt is a no-op.
+ *
+ * NOTE(review): when info->crypt_quiesced is set the detached entry is
+ * neither queued nor freed in this function -- confirm that this path
+ * cannot leak it.
+ */
+void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info,
+				    struct lib80211_crypt_data **crypt)
+{
+	struct lib80211_crypt_data *tmp;
+	unsigned long flags;
+
+	if (*crypt == NULL)
+		return;
+
+	tmp = *crypt;
+	*crypt = NULL;
+
+	/* must not run ops->deinit() while there may be pending encrypt or
+	 * decrypt operations. Use a list of delayed deinits to avoid needing
+	 * locking. */
+
+	spin_lock_irqsave(info->lock, flags);
+	if (!info->crypt_quiesced) {
+		list_add(&tmp->list, &info->crypt_deinit_list);
+		if (!timer_pending(&info->crypt_deinit_timer)) {
+			info->crypt_deinit_timer.expires = jiffies + HZ;
+			add_timer(&info->crypt_deinit_timer);
+		}
+	}
+	spin_unlock_irqrestore(info->lock, flags);
+}
+EXPORT_SYMBOL(lib80211_crypt_delayed_deinit);
+
+/*
+ * Register a crypto algorithm.
+ *
+ * Allocates a list node pointing at @ops (the ops structure itself is
+ * not copied) and adds it to lib80211_crypto_algs under
+ * lib80211_crypto_lock.  No check for an already-registered @ops or
+ * duplicate name is made.  Returns 0 on success or -ENOMEM.
+ */
+int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops)
+{
+	unsigned long flags;
+	struct lib80211_crypto_alg *alg;
+
+	alg = kzalloc(sizeof(*alg), GFP_KERNEL);
+	if (alg == NULL)
+		return -ENOMEM;
+
+	alg->ops = ops;
+
+	spin_lock_irqsave(&lib80211_crypto_lock, flags);
+	list_add(&alg->list, &lib80211_crypto_algs);
+	spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+
+	printk(KERN_DEBUG "lib80211_crypt: registered algorithm '%s'\n",
+	       ops->name);
+
+	return 0;
+}
+EXPORT_SYMBOL(lib80211_register_crypto_ops);
+
+/*
+ * Unregister a crypto algorithm previously registered with @ops.
+ *
+ * Looks up the list node by pointer identity of @ops.  Returns -EINVAL
+ * if it is not registered; otherwise unlinks the node under
+ * lib80211_crypto_lock, frees it after dropping the lock and returns 0.
+ */
+int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops)
+{
+	struct lib80211_crypto_alg *alg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&lib80211_crypto_lock, flags);
+	list_for_each_entry(alg, &lib80211_crypto_algs, list) {
+		if (alg->ops == ops)
+			goto found;
+	}
+	spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+	return -EINVAL;
+
+      found:
+	/* still holding lib80211_crypto_lock here */
+	printk(KERN_DEBUG "lib80211_crypt: unregistered algorithm '%s'\n",
+	       ops->name);
+	list_del(&alg->list);
+	spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+	kfree(alg);
+	return 0;
+}
+EXPORT_SYMBOL(lib80211_unregister_crypto_ops);
+
+/*
+ * Look up a registered crypto algorithm by exact @name.
+ *
+ * Returns the ops pointer supplied at registration, or NULL if no
+ * algorithm with that name is registered.  No module reference is taken
+ * here.
+ *
+ * The ops pointer is read while lib80211_crypto_lock is still held: the
+ * list node may be freed by lib80211_unregister_crypto_ops() as soon as
+ * the lock is dropped, so it must not be dereferenced afterwards.
+ */
+struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name)
+{
+	struct lib80211_crypto_ops *ops = NULL;
+	struct lib80211_crypto_alg *alg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&lib80211_crypto_lock, flags);
+	list_for_each_entry(alg, &lib80211_crypto_algs, list) {
+		if (strcmp(alg->ops->name, name) == 0) {
+			ops = alg->ops;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&lib80211_crypto_lock, flags);
+
+	return ops;
+}
+EXPORT_SYMBOL(lib80211_get_crypto_ops);
+
+/*
+ * init hook of the "NULL" algorithm: there is no private state, so a
+ * dummy non-NULL pointer is returned and @keyidx is ignored.
+ */
+static void *lib80211_crypt_null_init(int keyidx)
+{
+	return (void *)1;
+}
+
+/* deinit hook of the "NULL" algorithm: nothing to release. */
+static void lib80211_crypt_null_deinit(void *priv)
+{
+}
+
+/* The "NULL" algorithm, registered by lib80211_init(). */
+static struct lib80211_crypto_ops lib80211_crypt_null = {
+	.name = "NULL",
+	.init = lib80211_crypt_null_init,
+	.deinit = lib80211_crypt_null_deinit,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * Module init: print the driver description and register the "NULL"
+ * algorithm; the registration result is the module's init result.
+ */
+static int __init lib80211_init(void)
+{
+	pr_info(DRV_DESCRIPTION "\n");
+	return lib80211_register_crypto_ops(&lib80211_crypt_null);
+}
+
+/*
+ * Module exit: unregister the "NULL" algorithm.  Any algorithm still
+ * registered at this point triggers BUG_ON().
+ */
+static void __exit lib80211_exit(void)
+{
+	lib80211_unregister_crypto_ops(&lib80211_crypt_null);
+	BUG_ON(!list_empty(&lib80211_crypto_algs));
+}
+
+module_init(lib80211_init);
+module_exit(lib80211_exit);
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
new file mode 100644
index 00000000..dacb3b4b
--- /dev/null
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -0,0 +1,493 @@
+/*
+ * lib80211 crypt: host-based CCMP encryption implementation for lib80211
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+#include <linux/wireless.h>
+
+#include <linux/ieee80211.h>
+
+#include <linux/crypto.h>
+
+#include <net/lib80211.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: CCMP");
+MODULE_LICENSE("GPL");
+
+#define AES_BLOCK_LEN 16
+#define CCMP_HDR_LEN 8
+#define CCMP_MIC_LEN 8
+#define CCMP_TK_LEN 16
+#define CCMP_PN_LEN 6
+
+/* Per-key CCMP state, allocated by lib80211_ccmp_init(). */
+struct lib80211_ccmp_data {
+	u8 key[CCMP_TK_LEN];	/* temporal key copied in by set_key */
+	int key_set;		/* non-zero once a CCMP_TK_LEN key is installed */
+
+	/* Packet numbers, most significant octet first (index 5 is the
+	 * lowest octet). tx_pn is incremented for every built header; rx_pn
+	 * holds the PN of the last frame that passed the MIC check. */
+	u8 tx_pn[CCMP_PN_LEN];
+	u8 rx_pn[CCMP_PN_LEN];
+
+	/* Error counters reported by print_stats */
+	u32 dot11RSNAStatsCCMPFormatErrors;
+	u32 dot11RSNAStatsCCMPReplays;
+	u32 dot11RSNAStatsCCMPDecryptErrors;
+
+	int key_idx;		/* key index, placed in bits 6..7 of header octet 3 */
+
+	struct crypto_cipher *tfm;	/* "aes" single-block cipher handle */
+
+	/* scratch buffers for virt_to_page() (crypto API) */
+	u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
+	    tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
+	u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
+};
+
+/*
+ * Encrypt one block @pt into @ct with the key currently set on @tfm.
+ * Note the argument order: the crypto API takes (dst, src). Callers in this
+ * file also pass the same buffer for both.
+ */
+static inline void lib80211_ccmp_aes_encrypt(struct crypto_cipher *tfm,
+					      const u8 pt[16], u8 ct[16])
+{
+	crypto_cipher_encrypt_one(tfm, ct, pt);
+}
+
+/*
+ * Allocate per-key CCMP state for key index @key_idx together with its AES
+ * cipher handle. Returns the new state, or NULL when either allocation fails.
+ */
+static void *lib80211_ccmp_init(int key_idx)
+{
+	struct lib80211_ccmp_data *ccmp;
+
+	ccmp = kzalloc(sizeof(*ccmp), GFP_ATOMIC);
+	if (!ccmp)
+		return NULL;
+	ccmp->key_idx = key_idx;
+
+	ccmp->tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
+	if (!IS_ERR(ccmp->tfm))
+		return ccmp;
+
+	/* No cipher handle was obtained, so only the state itself is freed. */
+	printk(KERN_DEBUG "lib80211_crypt_ccmp: could not allocate "
+	       "crypto API aes\n");
+	kfree(ccmp);
+	return NULL;
+}
+
+/* Release the AES handle (if any) and the CCMP state; NULL @priv is a no-op. */
+static void lib80211_ccmp_deinit(void *priv)
+{
+	struct lib80211_ccmp_data *ccmp = priv;
+
+	if (ccmp == NULL)
+		return;
+
+	if (ccmp->tfm != NULL)
+		crypto_free_cipher(ccmp->tfm);
+	kfree(ccmp);
+}
+
+/* XOR the first @len bytes of @a into @b in place (b[i] ^= a[i]). */
+static inline void xor_block(u8 * b, const u8 * a, size_t len)
+{
+	size_t i;	/* same type as len: no signed/unsigned comparison */
+
+	for (i = 0; i < len; i++)
+		b[i] ^= a[i];
+}
+
+/*
+ * Build the CCM initial block and AAD for one frame and run the first
+ * authentication steps.
+ *
+ * @tfm:  AES cipher handle with the key already set
+ * @hdr:  802.11 header of the frame
+ * @pn:   6-octet packet number, most significant octet first
+ * @dlen: payload length placed in the initial block
+ * @b0:   out: 16-byte block; on return the flags are masked to the low three
+ *        bits and octets 14..15 are zero, so callers fill those two octets
+ *        with a block counter
+ * @auth: out: authentication state after b0 and the two AAD blocks
+ * @s0:   out: encryption of @b0 with the counter octets zero
+ */
+static void ccmp_init_blocks(struct crypto_cipher *tfm,
+			     struct ieee80211_hdr *hdr,
+			     u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0)
+{
+	u8 *pos, qc = 0;
+	size_t aad_len;
+	int a4_included, qc_included;
+	u8 aad[2 * AES_BLOCK_LEN];
+
+	a4_included = ieee80211_has_a4(hdr->frame_control);
+	qc_included = ieee80211_is_data_qos(hdr->frame_control);
+
+	aad_len = 22;
+	if (a4_included)
+		aad_len += 6;
+	if (qc_included) {
+		/* The QoS control field sits where addr4 would start, or right
+		 * after addr4 when that address is present. */
+		pos = (u8 *) & hdr->addr4;
+		if (a4_included)
+			pos += 6;
+		qc = *pos & 0x0f;
+		aad_len += 2;
+	}
+
+	/* CCM Initial Block:
+	 * Flag (Include authentication header, M=3 (8-octet MIC),
+	 *       L=1 (2-octet Dlen))
+	 * Nonce: 0x00 | A2 | PN
+	 * Dlen */
+	b0[0] = 0x59;
+	b0[1] = qc;
+	memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
+	memcpy(b0 + 8, pn, CCMP_PN_LEN);
+	b0[14] = (dlen >> 8) & 0xff;
+	b0[15] = dlen & 0xff;
+
+	/* AAD:
+	 * FC with bits 4..6 and 11..13 masked to zero; 14 is always one
+	 * A1 | A2 | A3
+	 * SC with bits 4..15 (seq#) masked to zero
+	 * A4 (if present)
+	 * QC (if present)
+	 */
+	pos = (u8 *) hdr;
+	aad[0] = 0;		/* aad_len >> 8 */
+	aad[1] = aad_len & 0xff;
+	aad[2] = pos[0] & 0x8f;
+	aad[3] = pos[1] & 0xc7;
+	memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
+	pos = (u8 *) & hdr->seq_ctrl;
+	aad[22] = pos[0] & 0x0f;
+	aad[23] = 0;		/* all bits masked */
+	memset(aad + 24, 0, 8);
+	if (a4_included)
+		memcpy(aad + 24, hdr->addr4, ETH_ALEN);
+	if (qc_included) {
+		aad[a4_included ? 30 : 24] = qc;
+		/* rest of QC masked */
+	}
+
+	/* Start with the first block and AAD */
+	lib80211_ccmp_aes_encrypt(tfm, b0, auth);
+	xor_block(auth, aad, AES_BLOCK_LEN);
+	lib80211_ccmp_aes_encrypt(tfm, auth, auth);
+	xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
+	lib80211_ccmp_aes_encrypt(tfm, auth, auth);
+	/* Turn b0 into the counter block template and derive s0 from it */
+	b0[0] &= 0x07;
+	b0[14] = b0[15] = 0;
+	lib80211_ccmp_aes_encrypt(tfm, b0, s0);
+}
+
+/*
+ * Insert the 8-octet CCMP header after the first @hdr_len bytes of @skb.
+ *
+ * The existing header is moved to the front of the newly pushed headroom,
+ * the TX packet number is incremented and then written, together with the
+ * key index and the Ext IV bit, into the gap. When @aeskey is non-NULL and
+ * @keylen is large enough, the temporal key is copied out as well.
+ *
+ * Returns CCMP_HDR_LEN, or -1 when the skb lacks headroom or is shorter
+ * than @hdr_len.
+ */
+static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len,
+			      u8 *aeskey, int keylen, void *priv)
+{
+	struct lib80211_ccmp_data *ccmp = priv;
+	u8 *p;
+	int idx;
+
+	if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len)
+		return -1;
+
+	if (aeskey != NULL && keylen >= CCMP_TK_LEN)
+		memcpy(aeskey, ccmp->key, CCMP_TK_LEN);
+
+	p = skb_push(skb, CCMP_HDR_LEN);
+	memmove(p, p + CCMP_HDR_LEN, hdr_len);
+	p += hdr_len;
+
+	/* Increment the PN, carrying from the last octet towards the first */
+	for (idx = CCMP_PN_LEN - 1; idx >= 0; idx--) {
+		if (++ccmp->tx_pn[idx] != 0)
+			break;
+	}
+
+	p[0] = ccmp->tx_pn[5];
+	p[1] = ccmp->tx_pn[4];
+	p[2] = 0;
+	p[3] = (ccmp->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+	p[4] = ccmp->tx_pn[3];
+	p[5] = ccmp->tx_pn[2];
+	p[6] = ccmp->tx_pn[1];
+	p[7] = ccmp->tx_pn[0];
+
+	return CCMP_HDR_LEN;
+}
+
+/*
+ * Encrypt the payload of @skb in place and append the 8-octet MIC.
+ *
+ * The CCMP header is inserted after the first @hdr_len bytes, then each
+ * 16-byte payload block is folded into the authentication state and XORed
+ * with the encrypted counter block.
+ *
+ * Returns 0 on success, or -1 when the skb lacks tailroom/headroom or is
+ * shorter than @hdr_len.
+ */
+static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct lib80211_ccmp_data *key = priv;
+	int data_len, i, blocks, last, len;
+	u8 *pos, *mic;
+	struct ieee80211_hdr *hdr;
+	u8 *b0 = key->tx_b0;
+	u8 *b = key->tx_b;
+	u8 *e = key->tx_e;
+	u8 *s0 = key->tx_s0;
+
+	if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len)
+		return -1;
+
+	/* Payload length is taken before the CCMP header grows the skb */
+	data_len = skb->len - hdr_len;
+	len = lib80211_ccmp_hdr(skb, hdr_len, NULL, 0, priv);
+	if (len < 0)
+		return -1;
+
+	pos = skb->data + hdr_len + CCMP_HDR_LEN;
+	hdr = (struct ieee80211_hdr *)skb->data;
+	/* key->tx_pn was already incremented by lib80211_ccmp_hdr() above */
+	ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
+
+	blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
+	last = data_len % AES_BLOCK_LEN;
+
+	for (i = 1; i <= blocks; i++) {
+		/* Only the final block may be shorter than AES_BLOCK_LEN */
+		len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+		/* Authentication */
+		xor_block(b, pos, len);
+		lib80211_ccmp_aes_encrypt(key->tfm, b, b);
+		/* Encryption, with counter */
+		b0[14] = (i >> 8) & 0xff;
+		b0[15] = i & 0xff;
+		lib80211_ccmp_aes_encrypt(key->tfm, b0, e);
+		xor_block(pos, e, len);
+		pos += len;
+	}
+
+	/* MIC = first CCMP_MIC_LEN bytes of the auth state XOR s0 */
+	mic = skb_put(skb, CCMP_MIC_LEN);
+	for (i = 0; i < CCMP_MIC_LEN; i++)
+		mic[i] = b[i] ^ s0[i];
+
+	return 0;
+}
+
+/*
+ * Deal with sequence counter wrapping correctly.
+ * Refer to time_after() for jiffies wrapping handling.
+ *
+ * @pn_n is the packet number of the new frame, @pn_o the previously accepted
+ * one; both are 6 octets, most significant first. Returns 1 when the new PN
+ * is not strictly after the old one (replay), 0 otherwise.
+ */
+static inline int ccmp_replay_check(const u8 *pn_n, const u8 *pn_o)
+{
+	u32 iv32_n, iv16_n;
+	u32 iv32_o, iv16_o;
+
+	/* Cast before shifting: a u8 promotes to int, and shifting a value
+	 * >= 0x80 left by 24 does not fit in a signed int. */
+	iv32_n = ((u32)pn_n[0] << 24) | ((u32)pn_n[1] << 16) |
+		 ((u32)pn_n[2] << 8) | pn_n[3];
+	iv16_n = ((u32)pn_n[4] << 8) | pn_n[5];
+
+	iv32_o = ((u32)pn_o[0] << 24) | ((u32)pn_o[1] << 16) |
+		 ((u32)pn_o[2] << 8) | pn_o[3];
+	iv16_o = ((u32)pn_o[4] << 8) | pn_o[5];
+
+	/* Subtract as unsigned (wraps by definition), then look at the sign */
+	if ((s32)(iv32_n - iv32_o) < 0 ||
+	    (iv32_n == iv32_o && iv16_n <= iv16_o))
+		return 1;
+	return 0;
+}
+
+/*
+ * Verify and decrypt a CCMP-protected frame in place.
+ *
+ * On success the CCMP header and MIC are stripped from @skb, the accepted
+ * PN is stored as the new RX replay floor and the frame's key index is
+ * returned. Negative returns:
+ *   -1 frame too short, -2 Ext IV flag missing, -6 key index mismatch,
+ *   -3 no key configured, -4 replay detected, -5 MIC mismatch.
+ */
+static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct lib80211_ccmp_data *key = priv;
+	u8 keyidx, *pos;
+	struct ieee80211_hdr *hdr;
+	u8 *b0 = key->rx_b0;
+	u8 *b = key->rx_b;
+	u8 *a = key->rx_a;
+	u8 pn[6];
+	int i, blocks, last, len;
+	size_t data_len;
+	u8 *mic;
+
+	if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
+		key->dot11RSNAStatsCCMPFormatErrors++;
+		return -1;
+	}
+
+	/* Derive the payload length and MIC position only after the length
+	 * check, so neither can underflow or point before the buffer. */
+	data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
+	mic = skb->data + skb->len - CCMP_MIC_LEN;
+
+	hdr = (struct ieee80211_hdr *)skb->data;
+	pos = skb->data + hdr_len;
+	keyidx = pos[3];
+	if (!(keyidx & (1 << 5))) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "CCMP: received packet without ExtIV"
+			       " flag from %pM\n", hdr->addr2);
+		}
+		key->dot11RSNAStatsCCMPFormatErrors++;
+		return -2;
+	}
+	keyidx >>= 6;
+	if (key->key_idx != keyidx) {
+		/* Triggered by frame contents, so rate limit like the other
+		 * receive-path messages in this function. */
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
+			       "keyidx=%d priv=%p\n", key->key_idx, keyidx,
+			       priv);
+		}
+		return -6;
+	}
+	if (!key->key_set) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "CCMP: received packet from %pM"
+			       " with keyid=%d that does not have a configured"
+			       " key\n", hdr->addr2, keyidx);
+		}
+		return -3;
+	}
+
+	/* Reassemble the PN, most significant octet first */
+	pn[0] = pos[7];
+	pn[1] = pos[6];
+	pn[2] = pos[5];
+	pn[3] = pos[4];
+	pn[4] = pos[1];
+	pn[5] = pos[0];
+	pos += 8;
+
+	if (ccmp_replay_check(pn, key->rx_pn)) {
+#ifdef CONFIG_LIB80211_DEBUG
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "CCMP: replay detected: STA=%pM "
+				 "previous PN %02x%02x%02x%02x%02x%02x "
+				 "received PN %02x%02x%02x%02x%02x%02x\n",
+				 hdr->addr2,
+				 key->rx_pn[0], key->rx_pn[1], key->rx_pn[2],
+				 key->rx_pn[3], key->rx_pn[4], key->rx_pn[5],
+				 pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]);
+		}
+#endif
+		key->dot11RSNAStatsCCMPReplays++;
+		return -4;
+	}
+
+	/* Here b receives s0; XORing it into the received MIC recovers the
+	 * authentication value to compare against below. */
+	ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
+	xor_block(mic, b, CCMP_MIC_LEN);
+
+	blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
+	last = data_len % AES_BLOCK_LEN;
+
+	for (i = 1; i <= blocks; i++) {
+		len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+		/* Decrypt, with counter */
+		b0[14] = (i >> 8) & 0xff;
+		b0[15] = i & 0xff;
+		lib80211_ccmp_aes_encrypt(key->tfm, b0, b);
+		xor_block(pos, b, len);
+		/* Authentication */
+		xor_block(a, pos, len);
+		lib80211_ccmp_aes_encrypt(key->tfm, a, a);
+		pos += len;
+	}
+
+	if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "CCMP: decrypt failed: STA="
+			       "%pM\n", hdr->addr2);
+		}
+		key->dot11RSNAStatsCCMPDecryptErrors++;
+		return -5;
+	}
+
+	/* Frame is authentic: advance the replay floor */
+	memcpy(key->rx_pn, pn, CCMP_PN_LEN);
+
+	/* Remove hdr and MIC */
+	memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len);
+	skb_pull(skb, CCMP_HDR_LEN);
+	skb_trim(skb, skb->len - CCMP_MIC_LEN);
+
+	return keyidx;
+}
+
+/*
+ * Install or clear the CCMP key.
+ *
+ * All per-key state except the key index and cipher handle is wiped first,
+ * whatever @len is. A @len of CCMP_TK_LEN installs @key (and, when @seq is
+ * given, the RX packet number, supplied least significant octet first); a
+ * @len of 0 leaves the key cleared. Any other length returns -1.
+ */
+static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct lib80211_ccmp_data *ccmp = priv;
+	struct crypto_cipher *saved_tfm = ccmp->tfm;
+	int saved_idx = ccmp->key_idx;
+	int i;
+
+	memset(ccmp, 0, sizeof(*ccmp));
+	ccmp->key_idx = saved_idx;
+	ccmp->tfm = saved_tfm;
+
+	if (len == 0) {
+		ccmp->key_set = 0;
+		return 0;
+	}
+	if (len != CCMP_TK_LEN)
+		return -1;
+
+	memcpy(ccmp->key, key, CCMP_TK_LEN);
+	ccmp->key_set = 1;
+	if (seq) {
+		/* @seq is byte-reversed relative to rx_pn */
+		for (i = 0; i < CCMP_PN_LEN; i++)
+			ccmp->rx_pn[i] = seq[CCMP_PN_LEN - 1 - i];
+	}
+	crypto_cipher_setkey(ccmp->tfm, ccmp->key, CCMP_TK_LEN);
+
+	return 0;
+}
+
+/*
+ * Copy the installed key into @key and, when @seq is non-NULL, the current
+ * TX packet number into @seq (least significant octet first).
+ *
+ * Returns CCMP_TK_LEN, 0 when no key is set, or -1 when @len is too small.
+ */
+static int lib80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct lib80211_ccmp_data *ccmp = priv;
+	int i;
+
+	if (len < CCMP_TK_LEN)
+		return -1;
+	if (!ccmp->key_set)
+		return 0;
+
+	memcpy(key, ccmp->key, CCMP_TK_LEN);
+	if (seq != NULL) {
+		for (i = 0; i < CCMP_PN_LEN; i++)
+			seq[i] = ccmp->tx_pn[CCMP_PN_LEN - 1 - i];
+	}
+
+	return CCMP_TK_LEN;
+}
+
+/*
+ * Append a one-line summary of the CCMP key state and counters at @p.
+ * Returns the position just past the written text.
+ */
+static char *lib80211_ccmp_print_stats(char *p, void *priv)
+{
+	struct lib80211_ccmp_data *ccmp = priv;
+
+	/* The three counters are u32, hence %u rather than %d */
+	p += sprintf(p, "key[%d] alg=CCMP key_set=%d "
+		     "tx_pn=%02x%02x%02x%02x%02x%02x "
+		     "rx_pn=%02x%02x%02x%02x%02x%02x "
+		     "format_errors=%u replays=%u decrypt_errors=%u\n",
+		     ccmp->key_idx, ccmp->key_set,
+		     ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2],
+		     ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5],
+		     ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2],
+		     ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5],
+		     ccmp->dot11RSNAStatsCCMPFormatErrors,
+		     ccmp->dot11RSNAStatsCCMPReplays,
+		     ccmp->dot11RSNAStatsCCMPDecryptErrors);
+
+	return p;
+}
+
+/*
+ * Ops table registered under the name "CCMP". Only per-MPDU hooks are
+ * provided; the MSDU hooks are left NULL. The prefix/postfix lengths match
+ * the header and MIC sizes added by the encrypt hook.
+ */
+static struct lib80211_crypto_ops lib80211_crypt_ccmp = {
+	.name = "CCMP",
+	.init = lib80211_ccmp_init,
+	.deinit = lib80211_ccmp_deinit,
+	.encrypt_mpdu = lib80211_ccmp_encrypt,
+	.decrypt_mpdu = lib80211_ccmp_decrypt,
+	.encrypt_msdu = NULL,
+	.decrypt_msdu = NULL,
+	.set_key = lib80211_ccmp_set_key,
+	.get_key = lib80211_ccmp_get_key,
+	.print_stats = lib80211_ccmp_print_stats,
+	.extra_mpdu_prefix_len = CCMP_HDR_LEN,
+	.extra_mpdu_postfix_len = CCMP_MIC_LEN,
+	.owner = THIS_MODULE,
+};
+
+/* Module load: register the "CCMP" ops with lib80211. */
+static int __init lib80211_crypto_ccmp_init(void)
+{
+	return lib80211_register_crypto_ops(&lib80211_crypt_ccmp);
+}
+
+/* Module unload: remove the "CCMP" ops again. */
+static void __exit lib80211_crypto_ccmp_exit(void)
+{
+	lib80211_unregister_crypto_ops(&lib80211_crypt_ccmp);
+}
+
+module_init(lib80211_crypto_ccmp_init);
+module_exit(lib80211_crypto_ccmp_exit);
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
new file mode 100644
index 00000000..7ea4f2b0
--- /dev/null
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -0,0 +1,784 @@
+/*
+ * lib80211 crypt: host-based TKIP encryption implementation for lib80211
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/mm.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+
+#include <linux/wireless.h>
+#include <linux/ieee80211.h>
+#include <net/iw_handler.h>
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+#include <net/lib80211.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("lib80211 crypt: TKIP");
+MODULE_LICENSE("GPL");
+
+#define TKIP_HDR_LEN 8
+
+/* Per-key TKIP state, allocated by lib80211_tkip_init(). */
+struct lib80211_tkip_data {
+#define TKIP_KEY_LEN 32
+	/* key[0..15] feeds the key mixing, key[16..23] is the Michael key
+	 * used when adding a MIC, key[24..31] the one used when verifying. */
+	u8 key[TKIP_KEY_LEN];
+	int key_set;		/* non-zero once a TKIP_KEY_LEN key is installed */
+
+	/* TX sequence counter (iv32:iv16) and cached phase 1 output */
+	u32 tx_iv32;
+	u16 tx_iv16;
+	u16 tx_ttak[5];
+	int tx_phase1_done;
+
+	/* RX replay floor and cached phase 1 output. The *_new values are
+	 * staged by the MPDU decrypt hook and committed to rx_iv32/rx_iv16
+	 * only after the Michael MIC has been verified. */
+	u32 rx_iv32;
+	u16 rx_iv16;
+	u16 rx_ttak[5];
+	int rx_phase1_done;
+	u32 rx_iv32_new;
+	u16 rx_iv16_new;
+
+	/* Error counters reported by print_stats */
+	u32 dot11RSNAStatsTKIPReplays;
+	u32 dot11RSNAStatsTKIPICVErrors;
+	u32 dot11RSNAStatsTKIPLocalMICFailures;
+
+	int key_idx;		/* key index, placed in bits 6..7 of header octet 3 */
+
+	/* Separate cipher/hash handles for each direction */
+	struct crypto_blkcipher *rx_tfm_arc4;
+	struct crypto_hash *rx_tfm_michael;
+	struct crypto_blkcipher *tx_tfm_arc4;
+	struct crypto_hash *tx_tfm_michael;
+
+	/* scratch buffers for virt_to_page() (crypto API) */
+	u8 rx_hdr[16], tx_hdr[16];
+
+	/* IEEE80211_CRYPTO_* flags, e.g. TKIP countermeasures */
+	unsigned long flags;
+};
+
+/* Replace the flags word of this key and return the previous value. */
+static unsigned long lib80211_tkip_set_flags(unsigned long flags, void *priv)
+{
+	struct lib80211_tkip_data *_priv = priv;
+	unsigned long old_flags = _priv->flags;
+	_priv->flags = flags;
+	return old_flags;
+}
+
+/* Return the current flags word of this key. */
+static unsigned long lib80211_tkip_get_flags(void *priv)
+{
+	struct lib80211_tkip_data *_priv = priv;
+	return _priv->flags;
+}
+
+/*
+ * Allocate per-key TKIP state for key index @key_idx together with the
+ * ARC4 and Michael MIC handles for both directions.
+ *
+ * Returns the new state, or NULL when any allocation fails; everything
+ * obtained up to the failure is released again.
+ */
+static void *lib80211_tkip_init(int key_idx)
+{
+	struct lib80211_tkip_data *tkip;
+
+	tkip = kzalloc(sizeof(*tkip), GFP_ATOMIC);
+	if (!tkip)
+		return NULL;
+	tkip->key_idx = key_idx;
+
+	tkip->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
+						   CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tkip->tx_tfm_arc4)) {
+		printk(KERN_DEBUG pr_fmt("could not allocate crypto API arc4\n"));
+		goto err_free_state;
+	}
+
+	tkip->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
+						 CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tkip->tx_tfm_michael)) {
+		printk(KERN_DEBUG pr_fmt("could not allocate crypto API michael_mic\n"));
+		goto err_free_tx_arc4;
+	}
+
+	tkip->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
+						   CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tkip->rx_tfm_arc4)) {
+		printk(KERN_DEBUG pr_fmt("could not allocate crypto API arc4\n"));
+		goto err_free_tx_michael;
+	}
+
+	tkip->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
+						 CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tkip->rx_tfm_michael)) {
+		printk(KERN_DEBUG pr_fmt("could not allocate crypto API michael_mic\n"));
+		goto err_free_rx_arc4;
+	}
+
+	return tkip;
+
+	/* Unwind: each label frees what was obtained before the failure */
+      err_free_rx_arc4:
+	crypto_free_blkcipher(tkip->rx_tfm_arc4);
+      err_free_tx_michael:
+	crypto_free_hash(tkip->tx_tfm_michael);
+      err_free_tx_arc4:
+	crypto_free_blkcipher(tkip->tx_tfm_arc4);
+      err_free_state:
+	kfree(tkip);
+	return NULL;
+}
+
+/* Release every crypto handle that is set, then the state itself. */
+static void lib80211_tkip_deinit(void *priv)
+{
+	struct lib80211_tkip_data *tkip = priv;
+
+	if (tkip == NULL)
+		return;
+
+	if (tkip->tx_tfm_michael != NULL)
+		crypto_free_hash(tkip->tx_tfm_michael);
+	if (tkip->tx_tfm_arc4 != NULL)
+		crypto_free_blkcipher(tkip->tx_tfm_arc4);
+	if (tkip->rx_tfm_michael != NULL)
+		crypto_free_hash(tkip->rx_tfm_michael);
+	if (tkip->rx_tfm_arc4 != NULL)
+		crypto_free_blkcipher(tkip->rx_tfm_arc4);
+
+	kfree(tkip);
+}
+
+/* Rotate a 16-bit value right by one bit. */
+static inline u16 RotR1(u16 val)
+{
+	return (val >> 1) | (val << 15);
+}
+
+/* Low octet of a 16-bit value. */
+static inline u8 Lo8(u16 val)
+{
+	return val & 0xff;
+}
+
+/* High octet of a 16-bit value. */
+static inline u8 Hi8(u16 val)
+{
+	return val >> 8;
+}
+
+/* Low 16 bits of a 32-bit value. */
+static inline u16 Lo16(u32 val)
+{
+	return val & 0xffff;
+}
+
+/* High 16 bits of a 32-bit value. */
+static inline u16 Hi16(u32 val)
+{
+	return val >> 16;
+}
+
+/* Combine two octets into a 16-bit value, @hi in the upper byte. */
+static inline u16 Mk16(u8 hi, u8 lo)
+{
+	return lo | (((u16) hi) << 8);
+}
+
+/* Load a little-endian 16-bit value from @v in CPU byte order. */
+static inline u16 Mk16_le(__le16 * v)
+{
+	return le16_to_cpu(*v);
+}
+
+/* 256-entry, 16-bit substitution table; consumed only through _S_(). */
+static const u16 Sbox[256] = {
+	0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
+	0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
+	0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
+	0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
+	0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
+	0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
+	0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
+	0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
+	0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
+	0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
+	0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
+	0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
+	0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
+	0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
+	0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
+	0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
+	0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
+	0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
+	0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
+	0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
+	0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
+	0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
+	0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
+	0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
+	0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
+	0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
+	0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
+	0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
+	0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
+	0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
+	0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
+	0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
+};
+
+/*
+ * 16-bit substitution: the table entry for the low octet of @v XORed with
+ * the byte-swapped table entry for the high octet.
+ */
+static inline u16 _S_(u16 v)
+{
+	u16 t = Sbox[Hi8(v)];
+	return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8));
+}
+
+#define PHASE1_LOOP_COUNT 8
+
+/*
+ * Key mixing, phase 1: fill the five-word @TTAK from the upper 32 bits of
+ * the sequence counter (@IV32), the 6-octet address @TA and the first 16
+ * octets of @TK. The result depends only on those inputs, which is why the
+ * callers cache it until IV32 changes.
+ */
+static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA,
+			       u32 IV32)
+{
+	int i, j;
+
+	/* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */
+	TTAK[0] = Lo16(IV32);
+	TTAK[1] = Hi16(IV32);
+	TTAK[2] = Mk16(TA[1], TA[0]);
+	TTAK[3] = Mk16(TA[3], TA[2]);
+	TTAK[4] = Mk16(TA[5], TA[4]);
+
+	for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
+		/* j alternates 0, 2: even rounds use the TK octet pairs at
+		 * offsets 0/4/8/12, odd rounds those at 2/6/10/14 */
+		j = 2 * (i & 1);
+		TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j]));
+		TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j]));
+		TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j]));
+		TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j]));
+		TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i;
+	}
+}
+
+/*
+ * Key mixing, phase 2: derive the 16-octet per-packet key @WEPSeed from the
+ * phase 1 output @TTAK, the first 16 octets of @TK and the low 16 bits of
+ * the sequence counter (@IV16).
+ *
+ * @WEPSeed must provide 16 writable octets: octets 0..3 are set in step 3,
+ * octets 4..15 are written through the overlapping u16 view PPK[0..5].
+ * NOTE(review): that view assumes &WEPSeed[4] is suitably aligned for u16
+ * stores -- confirm for callers on strict-alignment architectures.
+ */
+static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK,
+			       u16 IV16)
+{
+	/* Make temporary area overlap WEP seed so that the final copy can be
+	 * avoided on little endian hosts. */
+	u16 *PPK = (u16 *) & WEPSeed[4];
+
+	/* Step 1 - make copy of TTAK and bring in TSC */
+	PPK[0] = TTAK[0];
+	PPK[1] = TTAK[1];
+	PPK[2] = TTAK[2];
+	PPK[3] = TTAK[3];
+	PPK[4] = TTAK[4];
+	PPK[5] = TTAK[4] + IV16;
+
+	/* Step 2 - 96-bit bijective mixing using S-box */
+	PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0]));
+	PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2]));
+	PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4]));
+	PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6]));
+	PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8]));
+	PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10]));
+
+	PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12]));
+	PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14]));
+	PPK[2] += RotR1(PPK[1]);
+	PPK[3] += RotR1(PPK[2]);
+	PPK[4] += RotR1(PPK[3]);
+	PPK[5] += RotR1(PPK[4]);
+
+	/* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value
+	 * WEPSeed[0..2] is transmitted as WEP IV */
+	WEPSeed[0] = Hi8(IV16);
+	WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F;
+	WEPSeed[2] = Lo8(IV16);
+	WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1);
+
+#ifdef __BIG_ENDIAN
+	{
+		/* Byte-swap each word so the octet layout in WEPSeed matches
+		 * what a little endian host produced without the swap */
+		int i;
+		for (i = 0; i < 6; i++)
+			PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8);
+	}
+#endif
+}
+
+/*
+ * Derive the per-packet RC4 key into @rc4key and insert the 8-octet TKIP
+ * header after the first @hdr_len bytes of @skb, then advance the TX
+ * sequence counter.
+ *
+ * Returns TKIP_HDR_LEN, or -1 when the skb lacks headroom, is shorter than
+ * @hdr_len, or @rc4key/@keylen cannot hold a 16-octet key.
+ */
+static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
+			      u8 * rc4key, int keylen, void *priv)
+{
+	struct lib80211_tkip_data *tkip = priv;
+	struct ieee80211_hdr *hdr11;
+	u8 *p;
+
+	if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len ||
+	    rc4key == NULL || keylen < 16)
+		return -1;
+
+	hdr11 = (struct ieee80211_hdr *)skb->data;
+
+	/* Phase 1 only needs redoing after the upper counter word changed */
+	if (!tkip->tx_phase1_done) {
+		tkip_mixing_phase1(tkip->tx_ttak, tkip->key, hdr11->addr2,
+				   tkip->tx_iv32);
+		tkip->tx_phase1_done = 1;
+	}
+	tkip_mixing_phase2(rc4key, tkip->key, tkip->tx_ttak, tkip->tx_iv16);
+
+	p = skb_push(skb, TKIP_HDR_LEN);
+	memmove(p, p + TKIP_HDR_LEN, hdr_len);
+	p += hdr_len;
+
+	p[0] = rc4key[0];
+	p[1] = rc4key[1];
+	p[2] = rc4key[2];
+	p[3] = (tkip->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+	p[4] = tkip->tx_iv32 & 0xff;
+	p[5] = (tkip->tx_iv32 >> 8) & 0xff;
+	p[6] = (tkip->tx_iv32 >> 16) & 0xff;
+	p[7] = (tkip->tx_iv32 >> 24) & 0xff;
+
+	/* Advance the counter; a wrap of the low word carries into iv32 */
+	if (++tkip->tx_iv16 == 0) {
+		tkip->tx_phase1_done = 0;
+		tkip->tx_iv32++;
+	}
+
+	return TKIP_HDR_LEN;
+}
+
+/*
+ * Encrypt the payload of @skb in place: insert the TKIP header, append the
+ * 4-octet ICV (CRC-32 of the payload) and RC4-encrypt payload plus ICV with
+ * the per-packet key.
+ *
+ * Returns the result of the cipher call on the success path, or -1 when
+ * countermeasures are active, the skb lacks room, or header construction
+ * fails.
+ */
+static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct lib80211_tkip_data *tkey = priv;
+	struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 };
+	int len;
+	u8 rc4key[16], *pos, *icv;
+	u32 crc;
+	struct scatterlist sg;
+
+	if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
+		if (net_ratelimit()) {
+			struct ieee80211_hdr *hdr =
+			    (struct ieee80211_hdr *)skb->data;
+			printk(KERN_DEBUG ": TKIP countermeasures: dropped "
+			       "TX packet to %pM\n", hdr->addr1);
+		}
+		return -1;
+	}
+
+	if (skb_tailroom(skb) < 4 || skb->len < hdr_len)
+		return -1;
+
+	/* Taken before the header is inserted: lib80211_tkip_hdr() moves only
+	 * the 802.11 header forward into the pushed headroom, so the payload
+	 * stays at this address. */
+	len = skb->len - hdr_len;
+	pos = skb->data + hdr_len;
+
+	if ((lib80211_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0)
+		return -1;
+
+	/* ICV, stored least significant octet first */
+	crc = ~crc32_le(~0, pos, len);
+	icv = skb_put(skb, 4);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+
+	crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
+	sg_init_one(&sg, pos, len + 4);
+	return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+}
+
+/*
+ * Deal with sequence counter wrapping correctly.
+ * Refer to time_after() for jiffies wrapping handling.
+ *
+ * Returns 1 when the new counter (@iv32_n:@iv16_n) is not strictly after
+ * the old one (@iv32_o:@iv16_o), i.e. the frame is a replay; 0 otherwise.
+ */
+static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
+				    u32 iv32_o, u16 iv16_o)
+{
+	/* Subtract as unsigned (wraps by definition), then look at the sign */
+	if ((s32)(iv32_n - iv32_o) < 0 ||
+	    (iv32_n == iv32_o && iv16_n <= iv16_o))
+		return 1;
+	return 0;
+}
+
+/*
+ * Verify and decrypt a TKIP-protected MPDU in place.
+ *
+ * On success the TKIP header and ICV are stripped from @skb, the frame's
+ * sequence counter is staged in rx_iv32_new/rx_iv16_new (committed later by
+ * the Michael MIC verification) and the frame's key index is returned.
+ * Negative returns:
+ *   -1 countermeasures active or frame too short, -2 Ext IV flag missing,
+ *   -6 key index mismatch, -3 no key configured, -4 replay detected,
+ *   -7 cipher failure, -5 ICV mismatch.
+ */
+static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct lib80211_tkip_data *tkey = priv;
+	struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 };
+	u8 rc4key[16];
+	u8 keyidx, *pos;
+	u32 iv32;
+	u16 iv16;
+	struct ieee80211_hdr *hdr;
+	u8 icv[4];
+	u32 crc;
+	struct scatterlist sg;
+	int plen;
+
+	hdr = (struct ieee80211_hdr *)skb->data;
+
+	if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG ": TKIP countermeasures: dropped "
+			       "received packet from %pM\n", hdr->addr2);
+		}
+		return -1;
+	}
+
+	if (skb->len < hdr_len + TKIP_HDR_LEN + 4)
+		return -1;
+
+	pos = skb->data + hdr_len;
+	keyidx = pos[3];
+	if (!(keyidx & (1 << 5))) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: received packet without ExtIV"
+			       " flag from %pM\n", hdr->addr2);
+		}
+		return -2;
+	}
+	keyidx >>= 6;
+	if (tkey->key_idx != keyidx) {
+		/* Triggered by frame contents, so rate limit like the other
+		 * receive-path messages in this function. */
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
+			       "keyidx=%d priv=%p\n", tkey->key_idx, keyidx,
+			       priv);
+		}
+		return -6;
+	}
+	if (!tkey->key_set) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: received packet from %pM"
+			       " with keyid=%d that does not have a configured"
+			       " key\n", hdr->addr2, keyidx);
+		}
+		return -3;
+	}
+	iv16 = (pos[0] << 8) | pos[2];
+	/* Cast before shifting by 24: a u8 promotes to signed int */
+	iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | ((u32)pos[7] << 24);
+	pos += TKIP_HDR_LEN;
+
+	if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) {
+#ifdef CONFIG_LIB80211_DEBUG
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: replay detected: STA=%pM"
+			       " previous TSC %08x%04x received TSC "
+			       "%08x%04x\n", hdr->addr2,
+			       tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
+		}
+#endif
+		tkey->dot11RSNAStatsTKIPReplays++;
+		return -4;
+	}
+
+	/* The cached phase 1 result is valid only for the stored rx_iv32 */
+	if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) {
+		tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32);
+		tkey->rx_phase1_done = 1;
+	}
+	tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16);
+
+	/* Payload length without the TKIP header and the 4-octet ICV */
+	plen = skb->len - hdr_len - TKIP_HDR_LEN - 4;
+
+	crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
+	sg_init_one(&sg, pos, plen + 4);
+	if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) {
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG ": TKIP: failed to decrypt "
+			       "received packet from %pM\n",
+			       hdr->addr2);
+		}
+		return -7;
+	}
+
+	crc = ~crc32_le(~0, pos, plen);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+	if (memcmp(icv, pos + plen, 4) != 0) {
+		if (iv32 != tkey->rx_iv32) {
+			/* Previously cached Phase1 result was already lost, so
+			 * it needs to be recalculated for the next packet. */
+			tkey->rx_phase1_done = 0;
+		}
+#ifdef CONFIG_LIB80211_DEBUG
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TKIP: ICV error detected: STA="
+			       "%pM\n", hdr->addr2);
+		}
+#endif
+		tkey->dot11RSNAStatsTKIPICVErrors++;
+		return -5;
+	}
+
+	/* Update real counters only after Michael MIC verification has
+	 * completed */
+	tkey->rx_iv32_new = iv32;
+	tkey->rx_iv16_new = iv16;
+
+	/* Remove IV and ICV */
+	memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len);
+	skb_pull(skb, TKIP_HDR_LEN);
+	skb_trim(skb, skb->len - 4);
+
+	return keyidx;
+}
+
+/*
+ * Compute the Michael MIC over the 16-byte pseudo header @hdr followed by
+ * @data_len bytes at @data, keyed with the 8-byte @key; the digest is
+ * written to @mic.
+ *
+ * Returns the digest call's result, or -1 when @tfm_michael is NULL or the
+ * key cannot be set.
+ */
+static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
+		       u8 * data, size_t data_len, u8 * mic)
+{
+	struct hash_desc hdesc = { .tfm = tfm_michael, .flags = 0 };
+	struct scatterlist sgl[2];
+
+	if (!tfm_michael) {
+		pr_warn("%s(): tfm_michael == NULL\n", __func__);
+		return -1;
+	}
+
+	/* Two segments: pseudo header first, then the payload */
+	sg_init_table(sgl, 2);
+	sg_set_buf(&sgl[0], hdr, 16);
+	sg_set_buf(&sgl[1], data, data_len);
+
+	if (crypto_hash_setkey(tfm_michael, key, 8) != 0)
+		return -1;
+
+	return crypto_hash_digest(&hdesc, sgl, data_len + 16, mic);
+}
+
+/*
+ * Build the 16-byte Michael pseudo header for the frame in @skb into @hdr:
+ * DA (6) | SA (6) | priority (1) | three reserved zero octets. Which of the
+ * 802.11 address fields hold DA and SA depends on the ToDS/FromDS bits.
+ */
+static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
+{
+	struct ieee80211_hdr *hdr11 = (struct ieee80211_hdr *)skb->data;
+	const u8 *da, *sa;
+
+	switch (le16_to_cpu(hdr11->frame_control) &
+		(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
+	case IEEE80211_FCTL_TODS:
+		da = hdr11->addr3;
+		sa = hdr11->addr2;
+		break;
+	case IEEE80211_FCTL_FROMDS:
+		da = hdr11->addr1;
+		sa = hdr11->addr3;
+		break;
+	case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
+		da = hdr11->addr3;
+		sa = hdr11->addr4;
+		break;
+	default:
+		/* Neither bit set: the only value left after the mask */
+		da = hdr11->addr1;
+		sa = hdr11->addr2;
+		break;
+	}
+	memcpy(hdr, da, ETH_ALEN);
+	memcpy(hdr + ETH_ALEN, sa, ETH_ALEN);
+
+	/* Priority octet: TID of QoS data frames, zero for everything else */
+	if (!ieee80211_is_data_qos(hdr11->frame_control))
+		hdr[12] = 0;
+	else
+		hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11)))
+			& IEEE80211_QOS_CTL_TID_MASK;
+
+	hdr[13] = 0;	/* reserved */
+	hdr[14] = 0;
+	hdr[15] = 0;
+}
+
+/*
+ * Append the 8-octet Michael MIC, computed over the pseudo header and the
+ * data following the first @hdr_len bytes, to @skb.
+ *
+ * Returns 0 on success, -1 when the skb lacks tailroom, is shorter than
+ * @hdr_len, or the MIC computation fails.
+ */
+static int lib80211_michael_mic_add(struct sk_buff *skb, int hdr_len,
+				     void *priv)
+{
+	struct lib80211_tkip_data *tkip = priv;
+	u8 *mic_pos;
+	int rc;
+
+	if (skb_tailroom(skb) < 8 || skb->len < hdr_len) {
+		printk(KERN_DEBUG "Invalid packet for Michael MIC add "
+		       "(tailroom=%d hdr_len=%d skb->len=%d)\n",
+		       skb_tailroom(skb), hdr_len, skb->len);
+		return -1;
+	}
+
+	michael_mic_hdr(skb, tkip->tx_hdr);
+	mic_pos = skb_put(skb, 8);
+	/* skb->len already includes the 8 MIC octets just reserved */
+	rc = michael_mic(tkip->tx_tfm_michael, &tkip->key[16], tkip->tx_hdr,
+			 skb->data + hdr_len, skb->len - 8 - hdr_len, mic_pos);
+
+	return rc ? -1 : 0;
+}
+
+/*
+ * Report a Michael MIC failure on @dev as an IWEVMICHAELMICFAILURE wireless
+ * event carrying the key index, group/pairwise flag and sender address.
+ */
+static void lib80211_michael_mic_failure(struct net_device *dev,
+					  struct ieee80211_hdr *hdr,
+					  int keyidx)
+{
+	struct iw_michaelmicfailure mic_ev;
+	union iwreq_data req;
+
+	/* TODO: needed parameters: count, keyid, key type, TSC */
+	memset(&mic_ev, 0, sizeof(mic_ev));
+	mic_ev.flags = keyidx & IW_MICFAILURE_KEY_ID;
+	/* Group bit of the receiver address selects group vs. pairwise */
+	mic_ev.flags |= (hdr->addr1[0] & 0x01) ?
+	    IW_MICFAILURE_GROUP : IW_MICFAILURE_PAIRWISE;
+	mic_ev.src_addr.sa_family = ARPHRD_ETHER;
+	memcpy(mic_ev.src_addr.sa_data, hdr->addr2, ETH_ALEN);
+
+	memset(&req, 0, sizeof(req));
+	req.data.length = sizeof(mic_ev);
+	wireless_send_event(dev, IWEVMICHAELMICFAILURE, &req, (char *)&mic_ev);
+}
+
+/*
+ * Verify the trailing 8-octet Michael MIC of @skb.
+ *
+ * On success the MIC is trimmed off and the RX sequence counter staged by
+ * the MPDU decrypt hook is committed. Returns 0 on success, -1 when no key
+ * is set, the frame is too short to hold a MIC, the MIC computation fails,
+ * or the MIC does not match (in which case a wireless event is also sent
+ * when the skb has a device).
+ */
+static int lib80211_michael_mic_verify(struct sk_buff *skb, int keyidx,
+					int hdr_len, void *priv)
+{
+	struct lib80211_tkip_data *tkey = priv;
+	u8 mic[8];
+
+	if (!tkey->key_set)
+		return -1;
+
+	/* Without this, skb->len - 8 - hdr_len below would wrap to a huge
+	 * length for frames shorter than header plus MIC. */
+	if (skb->len < hdr_len + 8)
+		return -1;
+
+	michael_mic_hdr(skb, tkey->rx_hdr);
+	if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr,
+			skb->data + hdr_len, skb->len - 8 - hdr_len, mic))
+		return -1;
+	if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) {
+		struct ieee80211_hdr *hdr;
+		hdr = (struct ieee80211_hdr *)skb->data;
+		printk(KERN_DEBUG "%s: Michael MIC verification failed for "
+		       "MSDU from %pM keyidx=%d\n",
+		       skb->dev ? skb->dev->name : "N/A", hdr->addr2,
+		       keyidx);
+		if (skb->dev)
+			lib80211_michael_mic_failure(skb->dev, hdr, keyidx);
+		tkey->dot11RSNAStatsTKIPLocalMICFailures++;
+		return -1;
+	}
+
+	/* Update TSC counters for RX now that the packet verification has
+	 * completed. */
+	tkey->rx_iv32 = tkey->rx_iv32_new;
+	tkey->rx_iv16 = tkey->rx_iv16_new;
+
+	skb_trim(skb, skb->len - 8);
+
+	return 0;
+}
+
+/*
+ * Install or clear the TKIP key.
+ *
+ * All per-key state except the key index and the four crypto handles is
+ * wiped first, whatever @len is. A @len of TKIP_KEY_LEN installs @key,
+ * starts the TX counter at 1 and, when @seq is given, loads the RX counter
+ * from it (least significant octet first); a @len of 0 leaves the key
+ * cleared. Any other length returns -1.
+ */
+static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct lib80211_tkip_data *tkip = priv;
+	struct crypto_hash *tx_michael = tkip->tx_tfm_michael;
+	struct crypto_blkcipher *tx_arc4 = tkip->tx_tfm_arc4;
+	struct crypto_hash *rx_michael = tkip->rx_tfm_michael;
+	struct crypto_blkcipher *rx_arc4 = tkip->rx_tfm_arc4;
+	int saved_idx = tkip->key_idx;
+
+	memset(tkip, 0, sizeof(*tkip));
+	tkip->key_idx = saved_idx;
+	tkip->tx_tfm_michael = tx_michael;
+	tkip->tx_tfm_arc4 = tx_arc4;
+	tkip->rx_tfm_michael = rx_michael;
+	tkip->rx_tfm_arc4 = rx_arc4;
+
+	if (len == 0) {
+		tkip->key_set = 0;
+		return 0;
+	}
+	if (len != TKIP_KEY_LEN)
+		return -1;
+
+	memcpy(tkip->key, key, TKIP_KEY_LEN);
+	tkip->key_set = 1;
+	tkip->tx_iv16 = 1;	/* TSC is initialized to 1 */
+	if (seq) {
+		tkip->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) |
+		    (seq[3] << 8) | seq[2];
+		tkip->rx_iv16 = (seq[1] << 8) | seq[0];
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the installed key into @key and, when @seq is non-NULL, the sequence
+ * counter of the last transmitted frame into @seq (least significant octet
+ * first).
+ *
+ * Returns TKIP_KEY_LEN, 0 when no key is set, or -1 when @len is too small.
+ */
+static int lib80211_tkip_get_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct lib80211_tkip_data *tkey = priv;
+
+	if (len < TKIP_KEY_LEN)
+		return -1;
+
+	if (!tkey->key_set)
+		return 0;
+	memcpy(key, tkey->key, TKIP_KEY_LEN);
+
+	if (seq) {
+		/* Return the sequence number of the last transmitted frame:
+		 * tx_iv32:tx_iv16 is the counter for the NEXT frame, so step
+		 * back by one, borrowing from iv32 when iv16 is zero. */
+		u16 iv16 = tkey->tx_iv16;
+		u32 iv32 = tkey->tx_iv32;
+		if (iv16 == 0)
+			iv32--;
+		iv16--;
+		seq[0] = iv16;
+		seq[1] = iv16 >> 8;
+		seq[2] = iv32;
+		seq[3] = iv32 >> 8;
+		seq[4] = iv32 >> 16;
+		seq[5] = iv32 >> 24;
+	}
+
+	return TKIP_KEY_LEN;
+}
+
+/*
+ * Append a one-line summary of the TKIP key state and counters at @p.
+ * Returns the position just past the written text.
+ */
+static char *lib80211_tkip_print_stats(char *p, void *priv)
+{
+	struct lib80211_tkip_data *tkip = priv;
+
+	/* The three counters are u32, hence %u rather than %d */
+	p += sprintf(p, "key[%d] alg=TKIP key_set=%d "
+		     "tx_pn=%02x%02x%02x%02x%02x%02x "
+		     "rx_pn=%02x%02x%02x%02x%02x%02x "
+		     "replays=%u icv_errors=%u local_mic_failures=%u\n",
+		     tkip->key_idx, tkip->key_set,
+		     (tkip->tx_iv32 >> 24) & 0xff,
+		     (tkip->tx_iv32 >> 16) & 0xff,
+		     (tkip->tx_iv32 >> 8) & 0xff,
+		     tkip->tx_iv32 & 0xff,
+		     (tkip->tx_iv16 >> 8) & 0xff,
+		     tkip->tx_iv16 & 0xff,
+		     (tkip->rx_iv32 >> 24) & 0xff,
+		     (tkip->rx_iv32 >> 16) & 0xff,
+		     (tkip->rx_iv32 >> 8) & 0xff,
+		     tkip->rx_iv32 & 0xff,
+		     (tkip->rx_iv16 >> 8) & 0xff,
+		     tkip->rx_iv16 & 0xff,
+		     tkip->dot11RSNAStatsTKIPReplays,
+		     tkip->dot11RSNAStatsTKIPICVErrors,
+		     tkip->dot11RSNAStatsTKIPLocalMICFailures);
+	return p;
+}
+
+/*
+ * Ops table registered under the name "TKIP". The MPDU hooks handle the
+ * per-frame header, ICV and RC4 encryption; the MSDU hooks add and verify
+ * the Michael MIC.
+ */
+static struct lib80211_crypto_ops lib80211_crypt_tkip = {
+	.name = "TKIP",
+	.init = lib80211_tkip_init,
+	.deinit = lib80211_tkip_deinit,
+	.encrypt_mpdu = lib80211_tkip_encrypt,
+	.decrypt_mpdu = lib80211_tkip_decrypt,
+	.encrypt_msdu = lib80211_michael_mic_add,
+	.decrypt_msdu = lib80211_michael_mic_verify,
+	.set_key = lib80211_tkip_set_key,
+	.get_key = lib80211_tkip_get_key,
+	.print_stats = lib80211_tkip_print_stats,
+	.extra_mpdu_prefix_len = 4 + 4,	/* IV + ExtIV */
+	.extra_mpdu_postfix_len = 4,	/* ICV */
+	.extra_msdu_postfix_len = 8,	/* MIC */
+	.get_flags = lib80211_tkip_get_flags,
+	.set_flags = lib80211_tkip_set_flags,
+	.owner = THIS_MODULE,
+};
+
+/* Module load: register the "TKIP" ops with lib80211. */
+static int __init lib80211_crypto_tkip_init(void)
+{
+	return lib80211_register_crypto_ops(&lib80211_crypt_tkip);
+}
+
+/* Module unload: remove the "TKIP" ops again. */
+static void __exit lib80211_crypto_tkip_exit(void)
+{
+	lib80211_unregister_crypto_ops(&lib80211_crypt_tkip);
+}
+
+module_init(lib80211_crypto_tkip_init);
+module_exit(lib80211_crypto_tkip_exit);
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
new file mode 100644
index 00000000..2f265e03
--- /dev/null
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -0,0 +1,294 @@
+/*
+ * lib80211 crypt: host-based WEP encryption implementation for lib80211
+ *
+ * Copyright (c) 2002-2004, Jouni Malinen <j@w1.fi>
+ * Copyright (c) 2008, John W. Linville <linville@tuxdriver.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+#include <asm/string.h>
+
+#include <net/lib80211.h>
+
+#include <linux/crypto.h>
+#include <linux/crc32.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("lib80211 crypt: WEP");
+MODULE_LICENSE("GPL");
+
+struct lib80211_wep_data {
+	u32 iv;			/* TX IV counter; only the low 24 bits are sent */
+#define WEP_KEY_LEN 13		/* largest secret length set_key accepts */
+	u8 key[WEP_KEY_LEN + 1];	/* secret part of the per-frame RC4 key */
+	u8 key_len;		/* number of valid bytes in key[] */
+	u8 key_idx;		/* key ID, sent in the top two bits of IV byte 4 */
+	struct crypto_blkcipher *tx_tfm;	/* ecb(arc4) used by encrypt */
+	struct crypto_blkcipher *rx_tfm;	/* ecb(arc4) used by decrypt */
+};
+
+/* Allocate per-key WEP state for key index keyidx; returns NULL on failure. */
+static void *lib80211_wep_init(int keyidx)
+{
+	struct lib80211_wep_data *wep;
+
+	/* GFP_ATOMIC allocation, zeroed so unset fields start out as 0/NULL */
+	wep = kzalloc(sizeof(*wep), GFP_ATOMIC);
+	if (!wep)
+		return NULL;
+
+	wep->key_idx = keyidx;
+
+	/* one RC4 transform per direction */
+	wep->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(wep->tx_tfm)) {
+		printk(KERN_DEBUG "lib80211_crypt_wep: could not allocate "
+		       "crypto API arc4\n");
+		goto free_state;
+	}
+
+	wep->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(wep->rx_tfm)) {
+		printk(KERN_DEBUG "lib80211_crypt_wep: could not allocate "
+		       "crypto API arc4\n");
+		goto free_tx;
+	}
+
+	/* start WEP IV from a random value */
+	get_random_bytes(&wep->iv, sizeof(wep->iv));
+
+	return wep;
+
+	/* unwind in reverse order of setup */
+free_tx:
+	crypto_free_blkcipher(wep->tx_tfm);
+free_state:
+	kfree(wep);
+	return NULL;
+}
+
+static void lib80211_wep_deinit(void *priv)
+{
+	struct lib80211_wep_data *wep = priv;
+
+	/* Release both RC4 transforms, if any, before the state itself. */
+	if (wep && wep->tx_tfm)
+		crypto_free_blkcipher(wep->tx_tfm);
+	if (wep && wep->rx_tfm)
+		crypto_free_blkcipher(wep->rx_tfm);
+	kfree(priv);
+}
+
+/* Insert 4-byte WEP IV/key ID after the header; needs 4 bytes of headroom */
+static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len,
+			       u8 *key, int keylen, void *priv)
+{
+	struct lib80211_wep_data *wep = priv;
+	u32 klen;
+	u8 *pos;
+
+	if (skb_headroom(skb) < 4 || skb->len < hdr_len)
+		return -1;
+
+	pos = skb_push(skb, 4);
+	memmove(pos, pos + 4, hdr_len);	/* slide header down, opening a gap */
+	pos += hdr_len;			/* pos now points at the 4-byte gap */
+
+	klen = 3 + wep->key_len;	/* RC4 key = 3 IV bytes + secret */
+
+	wep->iv++;
+
+	/* Fluhrer, Mantin, and Shamir have reported weaknesses in the key
+	 * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N)
+	 * can be used to speed up attacks, so avoid using them. */
+	if ((wep->iv & 0xff00) == 0xff00) {
+		u8 B = (wep->iv >> 16) & 0xff;
+		if (B >= 3 && B < klen)
+			wep->iv += 0x0100;
+	}
+
+	/* Write 24-bit IV (MSB first) and key ID into the TX frame */
+	*pos++ = (wep->iv >> 16) & 0xff;
+	*pos++ = (wep->iv >> 8) & 0xff;
+	*pos++ = wep->iv & 0xff;
+	*pos++ = wep->key_idx << 6;
+
+	return 0;
+}
+
+/* WEP-encrypt an skb in place. Needs 4 bytes of headroom for the IV and
+ * 4 bytes of tailroom for the ICV; both are transmitted, so the frame grows
+ * by 8 bytes. Returns -1 if room is missing, else the cipher's return value.
+ *
+ * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
+ */
+static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct lib80211_wep_data *wep = priv;
+	struct blkcipher_desc desc = { .tfm = wep->tx_tfm };
+	u32 crc, klen, len;
+	u8 *pos, *icv;
+	struct scatterlist sg;
+	u8 key[WEP_KEY_LEN + 3];
+
+	/* headroom and length checks are in lib80211_wep_build_iv */
+	if (skb_tailroom(skb) < 4)
+		return -1;
+
+	/* add the IV to the frame */
+	if (lib80211_wep_build_iv(skb, hdr_len, NULL, 0, priv))
+		return -1;
+
+	/* Copy the IV into the first 3 bytes of the key */
+	skb_copy_from_linear_data_offset(skb, hdr_len, key, 3);
+
+	/* Copy rest of the WEP key (the secret part) */
+	memcpy(key + 3, wep->key, wep->key_len);
+
+	len = skb->len - hdr_len - 4;	/* payload bytes, IV excluded */
+	pos = skb->data + hdr_len + 4;	/* first payload byte */
+	klen = 3 + wep->key_len;	/* IV bytes + secret */
+
+	/* Append little-endian CRC32 over only the data and encrypt it to produce ICV */
+	crc = ~crc32_le(~0, pos, len);
+	icv = skb_put(skb, 4);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+
+	crypto_blkcipher_setkey(wep->tx_tfm, key, klen);	/* unchecked */
+	sg_init_one(&sg, pos, len + 4);	/* payload plus ICV */
+	return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+}
+
+/* Perform WEP decryption on given skb. After hdr_len bytes of header it holds
+ * the whole WEP part of the frame: IV (4 bytes), encrypted payload (including
+ * SNAP header), ICV (4 bytes). skb->len includes both IV and ICV.
+ *
+ * Returns 0 if the ICV matched (IV and ICV are then removed), -1 on a short
+ * frame or key index mismatch, -7 on cipher failure, -2 on ICV mismatch.
+ */
+static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+	struct lib80211_wep_data *wep = priv;
+	struct blkcipher_desc desc = { .tfm = wep->rx_tfm };
+	u32 crc, klen, plen;
+	u8 key[WEP_KEY_LEN + 3];
+	u8 keyidx, *pos, icv[4];
+	struct scatterlist sg;
+
+	if (skb->len < hdr_len + 8)
+		return -1;
+
+	pos = skb->data + hdr_len;
+	key[0] = *pos++;	/* first three bytes: the cleartext IV */
+	key[1] = *pos++;
+	key[2] = *pos++;
+	keyidx = *pos++ >> 6;	/* key ID lives in the top two bits */
+	if (keyidx != wep->key_idx)
+		return -1;
+
+	klen = 3 + wep->key_len;
+
+	/* Copy rest of the WEP key (the secret part) */
+	memcpy(key + 3, wep->key, wep->key_len);
+
+	/* Apply RC4 to data and compute CRC32 over decrypted data */
+	plen = skb->len - hdr_len - 8;	/* payload without IV and ICV */
+
+	crypto_blkcipher_setkey(wep->rx_tfm, key, klen);	/* unchecked */
+	sg_init_one(&sg, pos, plen + 4);
+	if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4))
+		return -7;
+
+	crc = ~crc32_le(~0, pos, plen);
+	icv[0] = crc;
+	icv[1] = crc >> 8;
+	icv[2] = crc >> 16;
+	icv[3] = crc >> 24;
+	if (memcmp(icv, pos + plen, 4) != 0) {
+		/* ICV mismatch - drop frame */
+		return -2;
+	}
+
+	/* Strip IV (slide header up over it) and ICV (trim the tail) */
+	memmove(skb->data + 4, skb->data, hdr_len);
+	skb_pull(skb, 4);
+	skb_trim(skb, skb->len - 4);
+
+	return 0;
+}
+
+static int lib80211_wep_set_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct lib80211_wep_data *state = priv;
+
+	/* Accept 0..WEP_KEY_LEN secret bytes; seq is not used for WEP. */
+	if (len >= 0 && len <= WEP_KEY_LEN) {
+		state->key_len = len;
+		memcpy(state->key, key, len);
+		return 0;
+	}
+
+	return -1;
+}
+
+static int lib80211_wep_get_key(void *key, int len, u8 * seq, void *priv)
+{
+	struct lib80211_wep_data *state = priv;
+	int klen = state->key_len;
+
+	/* Caller's buffer must hold the whole key; returns bytes copied. */
+	if (len < klen)
+		return -1;
+	memcpy(key, state->key, klen);
+	return klen;
+}
+
+static char *lib80211_wep_print_stats(char *p, void *priv)
+{
+	struct lib80211_wep_data *wep = priv;
+	p += sprintf(p, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len);
+	return p;
+}
+
+static struct lib80211_crypto_ops lib80211_crypt_wep = {
+	.name = "WEP",
+	.init = lib80211_wep_init,
+	.deinit = lib80211_wep_deinit,
+	.encrypt_mpdu = lib80211_wep_encrypt,
+	.decrypt_mpdu = lib80211_wep_decrypt,
+	.encrypt_msdu = NULL,	/* no per-MSDU handlers are registered */
+	.decrypt_msdu = NULL,
+	.set_key = lib80211_wep_set_key,
+	.get_key = lib80211_wep_get_key,
+	.print_stats = lib80211_wep_print_stats,
+	.extra_mpdu_prefix_len = 4,	/* 3-byte IV + key ID byte */
+	.extra_mpdu_postfix_len = 4,	/* ICV */
+	.owner = THIS_MODULE,
+};
+
+static int __init lib80211_crypto_wep_init(void)
+{
+	return lib80211_register_crypto_ops(&lib80211_crypt_wep);
+}
+
+static void __exit lib80211_crypto_wep_exit(void)
+{
+	lib80211_unregister_crypto_ops(&lib80211_crypt_wep);
+}
+
+module_init(lib80211_crypto_wep_init);
+module_exit(lib80211_crypto_wep_exit);
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
new file mode 100644
index 00000000..5c116083
--- /dev/null
+++ b/net/wireless/mesh.c
@@ -0,0 +1,161 @@
+#include <linux/ieee80211.h>
+#include <net/cfg80211.h>
+#include "nl80211.h"
+#include "core.h"
+
+/* Default values, timeouts in ms */
+#define MESH_TTL 		31
+#define MESH_DEFAULT_ELEMENT_TTL 31
+#define MESH_MAX_RETR	 	3
+#define MESH_RET_T 		100
+#define MESH_CONF_T 		100
+#define MESH_HOLD_T 		100
+
+#define MESH_PATH_TIMEOUT	5000
+
+/*
+ * Minimum interval between two consecutive PREQs originated by the same
+ * interface
+ */
+#define MESH_PREQ_MIN_INT	10
+#define MESH_DIAM_TRAVERSAL_TIME 50
+
+/*
+ * A path will be refreshed if it is used PATH_REFRESH_TIME milliseconds
+ * before timing out.  This way it will remain ACTIVE and no data frames
+ * will be unnecessarily held in the pending queue.
+ */
+#define MESH_PATH_REFRESH_TIME			1000
+#define MESH_MIN_DISCOVERY_TIMEOUT (2 * MESH_DIAM_TRAVERSAL_TIME)
+
+/* Default maximum number of established plinks per interface */
+#define MESH_MAX_ESTAB_PLINKS	32
+
+#define MESH_MAX_PREQ_RETRIES	4
+
+
+const struct mesh_config default_mesh_config = {
+	.dot11MeshRetryTimeout = MESH_RET_T,
+	.dot11MeshConfirmTimeout = MESH_CONF_T,
+	.dot11MeshHoldingTimeout = MESH_HOLD_T,
+	.dot11MeshMaxRetries = MESH_MAX_RETR,
+	.dot11MeshTTL = MESH_TTL,
+	.element_ttl = MESH_DEFAULT_ELEMENT_TTL,
+	.auto_open_plinks = true,
+	.dot11MeshMaxPeerLinks = MESH_MAX_ESTAB_PLINKS,
+	.dot11MeshHWMPactivePathTimeout = MESH_PATH_TIMEOUT,
+	.dot11MeshHWMPpreqMinInterval = MESH_PREQ_MIN_INT,
+	.dot11MeshHWMPnetDiameterTraversalTime = MESH_DIAM_TRAVERSAL_TIME,
+	.dot11MeshHWMPmaxPREQretries = MESH_MAX_PREQ_RETRIES,
+	.path_refresh_time = MESH_PATH_REFRESH_TIME,
+	.min_discovery_timeout = MESH_MIN_DISCOVERY_TIMEOUT,
+};
+
+const struct mesh_setup default_mesh_setup = {
+	.path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP,
+	.path_metric = IEEE80211_PATH_METRIC_AIRTIME,
+	.ie = NULL,
+	.ie_len = 0,
+	.is_secure = false,
+};
+
+int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 const struct mesh_setup *setup,
+			 const struct mesh_config *conf)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int ret;
+
+	/* the mesh ID is kept in wdev->ssid, so both limits must agree */
+	BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN != IEEE80211_MAX_MESH_ID_LEN);
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (wdev->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
+
+	/* a secure mesh needs WIPHY_FLAG_MESH_AUTH set on the wiphy */
+	if (setup->is_secure && !(rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH))
+		return -EOPNOTSUPP;
+
+	if (wdev->mesh_id_len)		/* already joined */
+		return -EALREADY;
+
+	if (!setup->mesh_id_len)
+		return -EINVAL;
+
+	if (!rdev->ops->join_mesh)
+		return -EOPNOTSUPP;
+
+	ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup);
+	if (ret)
+		return ret;
+
+	memcpy(wdev->ssid, setup->mesh_id, setup->mesh_id_len);
+	wdev->mesh_id_len = setup->mesh_id_len;
+	return 0;
+}
+
+int cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev,
+		       const struct mesh_setup *setup,
+		       const struct mesh_config *conf)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	wdev_lock(wdev);
+	err = __cfg80211_join_mesh(rdev, dev, setup, conf);
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+void cfg80211_notify_new_peer_candidate(struct net_device *dev,
+		const u8 *macaddr, const u8* ie, u8 ie_len, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_MESH_POINT))
+		return;
+
+	nl80211_send_new_peer_candidate(wiphy_to_dev(wdev->wiphy), dev,
+			macaddr, ie, ie_len, gfp);
+}
+EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate);
+
+static int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+				 struct net_device *dev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int ret;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	/* both failures report "not supported", so one test covers them */
+	if (wdev->iftype != NL80211_IFTYPE_MESH_POINT ||
+	    !rdev->ops->leave_mesh)
+		return -EOPNOTSUPP;
+
+	if (!wdev->mesh_id_len)		/* not joined to any mesh */
+		return -ENOTCONN;
+
+	ret = rdev->ops->leave_mesh(&rdev->wiphy, dev);
+	if (!ret)
+		wdev->mesh_id_len = 0;
+
+	return ret;
+}
+
+int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
+			struct net_device *dev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	wdev_lock(wdev);
+	err = __cfg80211_leave_mesh(rdev, dev);
+	wdev_unlock(wdev);
+
+	return err;
+}
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
new file mode 100644
index 00000000..493b9399
--- /dev/null
+++ b/net/wireless/mlme.c
@@ -0,0 +1,1084 @@
+/*
+ * cfg80211 MLME SAP interface
+ *
+ * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/nl80211.h>
+#include <linux/slab.h>
+#include <linux/wireless.h>
+#include <net/cfg80211.h>
+#include <net/iw_handler.h>
+#include "core.h"
+#include "nl80211.h"
+
+void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+	u8 *bssid = mgmt->bssid;
+	int i;
+	u16 status = le16_to_cpu(mgmt->u.auth.status_code);
+	bool done = false;
+
+	wdev_lock(wdev);
+
+	for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (wdev->authtry_bsses[i] &&
+		    memcmp(wdev->authtry_bsses[i]->pub.bssid, bssid,
+							ETH_ALEN) == 0) {
+			if (status == WLAN_STATUS_SUCCESS) {
+				wdev->auth_bsses[i] = wdev->authtry_bsses[i];
+			} else {
+				cfg80211_unhold_bss(wdev->authtry_bsses[i]);
+				cfg80211_put_bss(&wdev->authtry_bsses[i]->pub);
+			}
+			wdev->authtry_bsses[i] = NULL;
+			done = true;
+			break;
+		}
+	}
+
+	if (done) {
+		nl80211_send_rx_auth(rdev, dev, buf, len, GFP_KERNEL);
+		cfg80211_sme_rx_auth(dev, buf, len);
+	}
+
+	wdev_unlock(wdev);
+}
+EXPORT_SYMBOL(cfg80211_send_rx_auth);
+
+void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len)
+{
+	u16 status_code;
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+	u8 *ie = mgmt->u.assoc_resp.variable;
+	int i, ieoffs = offsetof(struct ieee80211_mgmt, u.assoc_resp.variable);
+	struct cfg80211_internal_bss *bss = NULL;
+
+	wdev_lock(wdev);
+
+	status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code);
+
+	/*
+	 * This is a bit of a hack, we don't notify userspace of
+	 * a (re-)association reply if we tried to send a reassoc
+	 * and got a reject -- we only try again with an assoc
+	 * frame instead of reassoc.
+	 */
+	if (status_code != WLAN_STATUS_SUCCESS && wdev->conn &&
+	    cfg80211_sme_failed_reassoc(wdev))
+		goto out;
+
+	nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL);
+
+	if (status_code == WLAN_STATUS_SUCCESS) {
+		for (i = 0; i < MAX_AUTH_BSSES; i++) {
+			if (!wdev->auth_bsses[i])
+				continue;
+			if (memcmp(wdev->auth_bsses[i]->pub.bssid, mgmt->bssid,
+				   ETH_ALEN) == 0) {
+				bss = wdev->auth_bsses[i];
+				wdev->auth_bsses[i] = NULL;
+				/* additional reference to drop hold */
+				cfg80211_ref_bss(bss);
+				break;
+			}
+		}
+
+		/*
+		 * We might be coming here because the driver reported
+		 * a successful association at the same time as the
+		 * user requested a deauth. In that case, we will have
+		 * removed the BSS from the auth_bsses list due to the
+		 * deauth request when the assoc response makes it. If
+		 * the two code paths acquire the lock the other way
+		 * around, that's just the standard situation of a
+		 * deauth being requested while connected.
+		 */
+		if (!bss)
+			goto out;
+	} else if (wdev->conn) {
+		cfg80211_sme_failed_assoc(wdev);
+		/*
+		 * do not call connect_result() now because the
+		 * sme will schedule work that does it later.
+		 */
+		goto out;
+	}
+
+	if (!wdev->conn && wdev->sme_state == CFG80211_SME_IDLE) {
+		/*
+		 * This is for the userspace SME, the CONNECTING
+		 * state will be changed to CONNECTED by
+		 * __cfg80211_connect_result() below.
+		 */
+		wdev->sme_state = CFG80211_SME_CONNECTING;
+	}
+
+	/* this consumes one bss reference (unless bss is NULL) */
+	__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs,
+				  status_code,
+				  status_code == WLAN_STATUS_SUCCESS,
+				  bss ? &bss->pub : NULL);
+	/* drop hold now, and also reference acquired above */
+	if (bss) {
+		cfg80211_unhold_bss(bss);
+		cfg80211_put_bss(&bss->pub);
+	}
+
+ out:
+	wdev_unlock(wdev);
+}
+EXPORT_SYMBOL(cfg80211_send_rx_assoc);
+
+void __cfg80211_send_deauth(struct net_device *dev,
+				   const u8 *buf, size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+	const u8 *bssid = mgmt->bssid;
+	int i;
+	bool found = false, was_current = false;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (wdev->current_bss &&
+	    memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) {
+		cfg80211_unhold_bss(wdev->current_bss);
+		cfg80211_put_bss(&wdev->current_bss->pub);
+		wdev->current_bss = NULL;
+		found = true;
+		was_current = true;
+	} else for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (wdev->auth_bsses[i] &&
+		    memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) {
+			cfg80211_unhold_bss(wdev->auth_bsses[i]);
+			cfg80211_put_bss(&wdev->auth_bsses[i]->pub);
+			wdev->auth_bsses[i] = NULL;
+			found = true;
+			break;
+		}
+		if (wdev->authtry_bsses[i] &&
+		    memcmp(wdev->authtry_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) {
+			cfg80211_unhold_bss(wdev->authtry_bsses[i]);
+			cfg80211_put_bss(&wdev->authtry_bsses[i]->pub);
+			wdev->authtry_bsses[i] = NULL;
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return;
+
+	nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL);
+
+	if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) {
+		u16 reason_code;
+		bool from_ap;
+
+		reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
+
+		from_ap = memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0;
+		__cfg80211_disconnected(dev, NULL, 0, reason_code, from_ap);
+	} else if (wdev->sme_state == CFG80211_SME_CONNECTING) {
+		__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, NULL, 0,
+					  WLAN_STATUS_UNSPECIFIED_FAILURE,
+					  false, NULL);
+	}
+}
+EXPORT_SYMBOL(__cfg80211_send_deauth);
+
+void cfg80211_send_deauth(struct net_device *dev, const u8 *buf, size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	wdev_lock(wdev);
+	__cfg80211_send_deauth(dev, buf, len);
+	wdev_unlock(wdev);
+}
+EXPORT_SYMBOL(cfg80211_send_deauth);
+
+void __cfg80211_send_disassoc(struct net_device *dev,
+				     const u8 *buf, size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+	const u8 *bssid = mgmt->bssid;
+	int i;
+	u16 reason_code;
+	bool from_ap;
+	bool done = false;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	nl80211_send_disassoc(rdev, dev, buf, len, GFP_KERNEL);
+
+	if (wdev->sme_state != CFG80211_SME_CONNECTED)
+		return;
+
+	if (wdev->current_bss &&
+	    memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) {
+		for (i = 0; i < MAX_AUTH_BSSES; i++) {
+			if (wdev->authtry_bsses[i] || wdev->auth_bsses[i])
+				continue;
+			wdev->auth_bsses[i] = wdev->current_bss;
+			wdev->current_bss = NULL;
+			done = true;
+			cfg80211_sme_disassoc(dev, i);
+			break;
+		}
+		WARN_ON(!done);
+	} else
+		WARN_ON(1);
+
+
+	reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
+
+	from_ap = memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0;
+	__cfg80211_disconnected(dev, NULL, 0, reason_code, from_ap);
+}
+EXPORT_SYMBOL(__cfg80211_send_disassoc);
+
+void cfg80211_send_disassoc(struct net_device *dev, const u8 *buf, size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	wdev_lock(wdev);
+	__cfg80211_send_disassoc(dev, buf, len);
+	wdev_unlock(wdev);
+}
+EXPORT_SYMBOL(cfg80211_send_disassoc);
+
+void cfg80211_send_unprot_deauth(struct net_device *dev, const u8 *buf,
+				 size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	nl80211_send_unprot_deauth(rdev, dev, buf, len, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(cfg80211_send_unprot_deauth);
+
+void cfg80211_send_unprot_disassoc(struct net_device *dev, const u8 *buf,
+				   size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	nl80211_send_unprot_disassoc(rdev, dev, buf, len, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(cfg80211_send_unprot_disassoc);
+
+static void __cfg80211_auth_remove(struct wireless_dev *wdev, const u8 *addr)
+{
+	struct cfg80211_internal_bss *bss;
+	int i;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	/* drop the pending-auth entry for addr; warn if there is none */
+	for (i = 0; addr && i < MAX_AUTH_BSSES; i++) {
+		bss = wdev->authtry_bsses[i];
+		if (!bss || memcmp(bss->pub.bssid, addr, ETH_ALEN) != 0)
+			continue;
+
+		cfg80211_unhold_bss(bss);
+		cfg80211_put_bss(&bss->pub);
+		wdev->authtry_bsses[i] = NULL;
+		return;
+	}
+
+	WARN_ON(1);
+}
+
+void __cfg80211_auth_canceled(struct net_device *dev, const u8 *addr)
+{
+	__cfg80211_auth_remove(dev->ieee80211_ptr, addr);
+}
+EXPORT_SYMBOL(__cfg80211_auth_canceled);
+
+void cfg80211_send_auth_timeout(struct net_device *dev, const u8 *addr)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	wdev_lock(wdev);
+
+	nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL);
+	if (wdev->sme_state == CFG80211_SME_CONNECTING)
+		__cfg80211_connect_result(dev, addr, NULL, 0, NULL, 0,
+					  WLAN_STATUS_UNSPECIFIED_FAILURE,
+					  false, NULL);
+
+	__cfg80211_auth_remove(wdev, addr);
+
+	wdev_unlock(wdev);
+}
+EXPORT_SYMBOL(cfg80211_send_auth_timeout);
+
+void cfg80211_send_assoc_timeout(struct net_device *dev, const u8 *addr)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	int i;
+	bool done = false;
+
+	wdev_lock(wdev);
+
+	nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL);
+	if (wdev->sme_state == CFG80211_SME_CONNECTING)
+		__cfg80211_connect_result(dev, addr, NULL, 0, NULL, 0,
+					  WLAN_STATUS_UNSPECIFIED_FAILURE,
+					  false, NULL);
+
+	for (i = 0; addr && i < MAX_AUTH_BSSES; i++) {
+		if (wdev->auth_bsses[i] &&
+		    memcmp(wdev->auth_bsses[i]->pub.bssid,
+			   addr, ETH_ALEN) == 0) {
+			cfg80211_unhold_bss(wdev->auth_bsses[i]);
+			cfg80211_put_bss(&wdev->auth_bsses[i]->pub);
+			wdev->auth_bsses[i] = NULL;
+			done = true;
+			break;
+		}
+	}
+
+	WARN_ON(!done);
+
+	wdev_unlock(wdev);
+}
+EXPORT_SYMBOL(cfg80211_send_assoc_timeout);
+
+void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr,
+				  enum nl80211_key_type key_type, int key_id,
+				  const u8 *tsc, gfp_t gfp)
+{
+	struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+#ifdef CONFIG_CFG80211_WEXT
+	union iwreq_data wrqu;
+	char *buf = kmalloc(128, gfp);
+
+	if (buf) {
+		sprintf(buf, "MLME-MICHAELMICFAILURE.indication("
+			"keyid=%d %scast addr=%pM)", key_id,
+			key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni",
+			addr);
+		memset(&wrqu, 0, sizeof(wrqu));
+		wrqu.data.length = strlen(buf);
+		wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+		kfree(buf);
+	}
+#endif
+
+	nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp);
+}
+EXPORT_SYMBOL(cfg80211_michael_mic_failure);
+
+/* some MLME handling for userspace SME */
+int __cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 struct ieee80211_channel *chan,
+			 enum nl80211_auth_type auth_type, const u8 *bssid,
+			 const u8 *ssid, int ssid_len,
+			 const u8 *ie, int ie_len,
+			 const u8 *key, int key_len, int key_idx,
+			 bool local_state_change)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_auth_request req;
+	struct cfg80211_internal_bss *bss;
+	int i, err, slot = -1, nfree = 0;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	/* shared key auth uses a WEP key; its 2-bit key ID only covers 0-3 */
+	if (auth_type == NL80211_AUTHTYPE_SHARED_KEY)
+		if (!key || !key_len || key_idx < 0 || key_idx > 3)
+			return -EINVAL;
+
+	if (wdev->current_bss &&
+	    memcmp(bssid, wdev->current_bss->pub.bssid, ETH_ALEN) == 0)
+		return -EALREADY;	/* this is the current BSS */
+
+	for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (wdev->authtry_bsses[i] &&
+		    memcmp(bssid, wdev->authtry_bsses[i]->pub.bssid,
+						ETH_ALEN) == 0)
+			return -EALREADY;	/* auth already in progress */
+		if (wdev->auth_bsses[i] &&
+		    memcmp(bssid, wdev->auth_bsses[i]->pub.bssid,
+						ETH_ALEN) == 0)
+			return -EALREADY;	/* already authenticated */
+	}
+
+	memset(&req, 0, sizeof(req));
+
+	req.local_state_change = local_state_change;
+	req.ie = ie;
+	req.ie_len = ie_len;
+	req.auth_type = auth_type;
+	req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
+				   WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS);
+	req.key = key;
+	req.key_len = key_len;
+	req.key_idx = key_idx;
+	if (!req.bss)
+		return -ENOENT;
+
+	bss = bss_from_pub(req.bss);
+
+	for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (!wdev->auth_bsses[i] && !wdev->authtry_bsses[i]) {
+			slot = i;
+			nfree++;
+		}
+	}
+
+	/* we need one free slot for disassoc and one for this auth */
+	if (nfree < 2) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	if (local_state_change)
+		wdev->auth_bsses[slot] = bss;
+	else
+		wdev->authtry_bsses[slot] = bss;
+	cfg80211_hold_bss(bss);
+
+	err = rdev->ops->auth(&rdev->wiphy, dev, &req);
+	if (err) {
+		if (local_state_change)
+			wdev->auth_bsses[slot] = NULL;
+		else
+			wdev->authtry_bsses[slot] = NULL;
+		cfg80211_unhold_bss(bss);
+	}
+
+ out:
+	if (err)	/* on success the slot keeps the lookup reference */
+		cfg80211_put_bss(req.bss);
+	return err;
+}
+
+int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev, struct ieee80211_channel *chan,
+		       enum nl80211_auth_type auth_type, const u8 *bssid,
+		       const u8 *ssid, int ssid_len,
+		       const u8 *ie, int ie_len,
+		       const u8 *key, int key_len, int key_idx,
+		       bool local_state_change)
+{
+	int err;
+
+	wdev_lock(dev->ieee80211_ptr);
+	err = __cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
+				   ssid, ssid_len, ie, ie_len,
+				   key, key_len, key_idx, local_state_change);
+	wdev_unlock(dev->ieee80211_ptr);
+
+	return err;
+}
+
+int __cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev,
+			  struct ieee80211_channel *chan,
+			  const u8 *bssid, const u8 *prev_bssid,
+			  const u8 *ssid, int ssid_len,
+			  const u8 *ie, int ie_len, bool use_mfp,
+			  struct cfg80211_crypto_settings *crypt)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_assoc_request req;
+	struct cfg80211_internal_bss *bss;
+	int i, err, slot = -1;
+	bool was_connected = false;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	memset(&req, 0, sizeof(req));
+
+	if (wdev->current_bss && prev_bssid &&
+	    memcmp(wdev->current_bss->pub.bssid, prev_bssid, ETH_ALEN) == 0) {
+		/*
+		 * Trying to reassociate: allow this to proceed and let the old
+		 * association be dropped when the new one is completed.
+		 */
+		if (wdev->sme_state == CFG80211_SME_CONNECTED) {
+			was_connected = true;
+			wdev->sme_state = CFG80211_SME_CONNECTING;
+		}
+	} else if (wdev->current_bss)
+		return -EALREADY;
+
+	req.ie = ie;
+	req.ie_len = ie_len;
+	memcpy(&req.crypto, crypt, sizeof(req.crypto));
+	req.use_mfp = use_mfp;
+	req.prev_bssid = prev_bssid;
+	req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
+				   WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS);
+	if (!req.bss) {
+		if (was_connected)
+			wdev->sme_state = CFG80211_SME_CONNECTED;
+		return -ENOENT;
+	}
+
+	bss = bss_from_pub(req.bss);
+
+	for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (bss == wdev->auth_bsses[i]) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (slot < 0) {		/* BSS is not in auth_bsses[] */
+		err = -ENOTCONN;
+		goto out;
+	}
+
+	err = rdev->ops->assoc(&rdev->wiphy, dev, &req);
+ out:
+	if (err && was_connected)
+		wdev->sme_state = CFG80211_SME_CONNECTED;	/* roll back */
+	/* drop the lookup ref; auth_bsses[slot], if matched, keeps its own */
+	cfg80211_put_bss(req.bss);
+	return err;
+}
+
+int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
+			struct net_device *dev,
+			struct ieee80211_channel *chan,
+			const u8 *bssid, const u8 *prev_bssid,
+			const u8 *ssid, int ssid_len,
+			const u8 *ie, int ie_len, bool use_mfp,
+			struct cfg80211_crypto_settings *crypt)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	wdev_lock(wdev);
+	err = __cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid,
+				    ssid, ssid_len, ie, ie_len, use_mfp, crypt);
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+int __cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
+			   struct net_device *dev, const u8 *bssid,
+			   const u8 *ie, int ie_len, u16 reason,
+			   bool local_state_change)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_deauth_request req;
+	int i;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	memset(&req, 0, sizeof(req));
+	req.reason_code = reason;
+	req.local_state_change = local_state_change;
+	req.ie = ie;
+	req.ie_len = ie_len;
+	if (wdev->current_bss &&
+	    memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) == 0) {
+		req.bss = &wdev->current_bss->pub;
+	} else for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (wdev->auth_bsses[i] &&
+		    memcmp(bssid, wdev->auth_bsses[i]->pub.bssid, ETH_ALEN) == 0) {
+			req.bss = &wdev->auth_bsses[i]->pub;
+			break;
+		}
+		if (wdev->authtry_bsses[i] &&
+		    memcmp(bssid, wdev->authtry_bsses[i]->pub.bssid, ETH_ALEN) == 0) {
+			req.bss = &wdev->authtry_bsses[i]->pub;
+			break;
+		}
+	}
+
+	if (!req.bss)
+		return -ENOTCONN;
+
+	return rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev);
+}
+
+int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev, const u8 *bssid,
+			 const u8 *ie, int ie_len, u16 reason,
+			 bool local_state_change)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	wdev_lock(wdev);
+	err = __cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason,
+				     local_state_change);
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+static int __cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
+				    struct net_device *dev, const u8 *bssid,
+				    const u8 *ie, int ie_len, u16 reason,
+				    bool local_state_change)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_disassoc_request req;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (wdev->sme_state != CFG80211_SME_CONNECTED)
+		return -ENOTCONN;
+	if (WARN_ON(!wdev->current_bss))
+		return -ENOTCONN;
+
+	/* only the BSS recorded in current_bss can be disassociated from */
+	if (memcmp(wdev->current_bss->pub.bssid, bssid, ETH_ALEN) != 0)
+		return -ENOTCONN;
+
+	memset(&req, 0, sizeof(req));
+	req.bss = &wdev->current_bss->pub;
+	req.reason_code = reason;
+	req.local_state_change = local_state_change;
+	req.ie = ie;
+	req.ie_len = ie_len;
+
+	return rdev->ops->disassoc(&rdev->wiphy, dev, &req, wdev);
+}
+
+int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
+			   struct net_device *dev, const u8 *bssid,
+			   const u8 *ie, int ie_len, u16 reason,
+			   bool local_state_change)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	wdev_lock(wdev);
+	err = __cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason,
+				       local_state_change);
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+/* Deauthenticate from every BSS this wdev still references and drop them. */
+void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
+			struct net_device *dev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_deauth_request req;
+	int i;
+
+	ASSERT_WDEV_LOCK(wdev);
+	if (!rdev->ops->deauth)
+		return;
+
+	memset(&req, 0, sizeof(req));
+	req.reason_code = WLAN_REASON_DEAUTH_LEAVING;
+	req.ie = NULL;
+	req.ie_len = 0;
+	/* each pointer is re-tested after the driver call before being put */
+	if (wdev->current_bss) {
+		req.bss = &wdev->current_bss->pub;
+		rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev);
+		if (wdev->current_bss) {
+			cfg80211_unhold_bss(wdev->current_bss);
+			cfg80211_put_bss(&wdev->current_bss->pub);
+			wdev->current_bss = NULL;
+		}
+	}
+
+	for (i = 0; i < MAX_AUTH_BSSES; i++) {
+		if (wdev->auth_bsses[i]) {
+			req.bss = &wdev->auth_bsses[i]->pub;
+			rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev);
+			if (wdev->auth_bsses[i]) {
+				cfg80211_unhold_bss(wdev->auth_bsses[i]);
+				cfg80211_put_bss(&wdev->auth_bsses[i]->pub);
+				wdev->auth_bsses[i] = NULL;
+			}
+		}
+		if (wdev->authtry_bsses[i]) {
+			req.bss = &wdev->authtry_bsses[i]->pub;
+			rdev->ops->deauth(&rdev->wiphy, dev, &req, wdev);
+			if (wdev->authtry_bsses[i]) {
+				cfg80211_unhold_bss(wdev->authtry_bsses[i]);
+				cfg80211_put_bss(&wdev->authtry_bsses[i]->pub);
+				wdev->authtry_bsses[i] = NULL;
+			}
+		}
+	}
+}
+
+/* Relay a remain-on-channel event for @dev to nl80211. */
+void cfg80211_ready_on_channel(struct net_device *dev, u64 cookie,
+			       struct ieee80211_channel *chan,
+			       enum nl80211_channel_type channel_type,
+			       unsigned int duration, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	nl80211_send_remain_on_channel(wiphy_to_dev(wdev->wiphy), dev, cookie,
+				       chan, channel_type, duration, gfp);
+}
+EXPORT_SYMBOL(cfg80211_ready_on_channel);
+
+/* Relay a remain-on-channel cancel event for @dev to nl80211. */
+void cfg80211_remain_on_channel_expired(struct net_device *dev,
+					u64 cookie,
+					struct ieee80211_channel *chan,
+					enum nl80211_channel_type channel_type,
+					gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	nl80211_send_remain_on_channel_cancel(wiphy_to_dev(wdev->wiphy), dev,
+					      cookie, chan, channel_type, gfp);
+}
+EXPORT_SYMBOL(cfg80211_remain_on_channel_expired);
+
+/* Relay a station event for @mac_addr, with its @sinfo, to nl80211. */
+void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
+		      struct station_info *sinfo, gfp_t gfp)
+{
+	struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+
+	nl80211_send_sta_event(wiphy_to_dev(wiphy), dev, mac_addr, sinfo, gfp);
+}
+EXPORT_SYMBOL(cfg80211_new_sta);
+
+/* Relay a station-removal event for @mac_addr to nl80211. */
+void cfg80211_del_sta(struct net_device *dev, const u8 *mac_addr, gfp_t gfp)
+{
+	struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
+
+	nl80211_send_sta_del_event(wiphy_to_dev(wiphy), dev, mac_addr, gfp);
+}
+EXPORT_SYMBOL(cfg80211_del_sta);
+
+struct cfg80211_mgmt_registration {
+	struct list_head list;	/* linked on wdev->mgmt_registrations */
+	/* netlink pid that registered; received frames are sent to it */
+	u32 nlpid;
+	/* number of bytes stored in match[] */
+	int match_len;
+	/* frame type and subtype bits, kept little-endian */
+	__le16 frame_type;
+	/* bytes a frame must carry right after its header to match */
+	u8 match[];
+};
+
+/* Register netlink pid @snd_pid for management frames of @frame_type. */
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+				u16 frame_type, const u8 *match_data,
+				int match_len)
+{
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct cfg80211_mgmt_registration *reg, *nreg;
+	u16 stype_bit;
+	int err = 0;
+
+	if (!wdev->wiphy->mgmt_stypes)
+		return -EOPNOTSUPP;
+
+	/* only management frames, and only type/subtype bits may be set */
+	if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
+		return -EINVAL;
+	if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
+		return -EINVAL;
+
+	/* the subtype must be enabled for RX on this interface type */
+	stype_bit = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
+	if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(stype_bit)))
+		return -EINVAL;
+
+	/* the new entry is allocated before the spinlock is taken */
+	nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
+	if (!nreg)
+		return -ENOMEM;
+
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+	/* -EALREADY if an entry of this type shares a match prefix */
+	list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
+		if (le16_to_cpu(reg->frame_type) != frame_type)
+			continue;
+
+		if (!memcmp(reg->match, match_data,
+			    min(match_len, reg->match_len))) {
+			err = -EALREADY;
+			break;
+		}
+	}
+
+	if (err) {
+		kfree(nreg);
+	} else {
+		nreg->nlpid = snd_pid;
+		nreg->frame_type = cpu_to_le16(frame_type);
+		nreg->match_len = match_len;
+		memcpy(nreg->match, match_data, match_len);
+		list_add(&nreg->list, &wdev->mgmt_registrations);
+
+		if (rdev->ops->mgmt_frame_register)
+			rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
+						       frame_type, true);
+	}
+
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+	return err;
+}
+
+/* Drop every management-frame registration owned by netlink pid @nlpid. */
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid)
+{
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct cfg80211_mgmt_registration *cur, *next;
+
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+	list_for_each_entry_safe(cur, next, &wdev->mgmt_registrations, list) {
+		if (cur->nlpid != nlpid)
+			continue;
+
+		/* let the driver know, if it implements the hook */
+		if (rdev->ops->mgmt_frame_register)
+			rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
+					le16_to_cpu(cur->frame_type),
+					false);
+
+		list_del(&cur->list);
+		kfree(cur);
+	}
+
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+}
+
+/* Free all management-frame registrations of @wdev (no driver callback). */
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
+{
+	struct cfg80211_mgmt_registration *cur, *next;
+
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+	list_for_each_entry_safe(cur, next, &wdev->mgmt_registrations, list) {
+		list_del(&cur->list);
+		kfree(cur);
+	}
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+}
+
+/* Validate a user-supplied management frame and pass it to ops->mgmt_tx. */
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev,
+			  struct ieee80211_channel *chan, bool offchan,
+			  enum nl80211_channel_type channel_type,
+			  bool channel_type_valid, unsigned int wait,
+			  const u8 *buf, size_t len, u64 *cookie)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	const struct ieee80211_mgmt *mgmt;
+	u16 stype;
+
+	if (!wdev->wiphy->mgmt_stypes)
+		return -EOPNOTSUPP;
+	if (!rdev->ops->mgmt_tx)
+		return -EOPNOTSUPP;
+	/* presumably header (24) plus the action category byte - confirm */
+	if (len < 24 + 1)
+		return -EINVAL;
+
+	mgmt = (const struct ieee80211_mgmt *) buf;
+
+	if (!ieee80211_is_mgmt(mgmt->frame_control))
+		return -EINVAL;
+	/* the subtype must be enabled for TX on this interface type */
+	stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+	if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4)))
+		return -EINVAL;
+	/* non-public action frames get per-iftype address checks */
+	if (ieee80211_is_action(mgmt->frame_control) &&
+	    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
+		int err = 0;
+
+		wdev_lock(wdev);
+
+		switch (wdev->iftype) {
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
+			if (!wdev->current_bss) {
+				err = -ENOTCONN;
+				break;
+			}
+
+			if (memcmp(wdev->current_bss->pub.bssid,
+				   mgmt->bssid, ETH_ALEN)) {
+				err = -ENOTCONN;
+				break;
+			}
+
+			/*
+			 * check for IBSS DA must be done by driver as
+			 * cfg80211 doesn't track the stations
+			 */
+			if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+				break;
+
+			/* for station, check that DA is the AP */
+			if (memcmp(wdev->current_bss->pub.bssid,
+				   mgmt->da, ETH_ALEN)) {
+				err = -ENOTCONN;
+				break;
+			}
+			break;
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
+		case NL80211_IFTYPE_AP_VLAN:
+			if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN))
+				err = -EINVAL;
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			if (memcmp(mgmt->sa, mgmt->bssid, ETH_ALEN)) {
+				err = -EINVAL;
+				break;
+			}
+			/*
+			 * check for mesh DA must be done by driver as
+			 * cfg80211 doesn't track the stations
+			 */
+			break;
+		default:
+			err = -EOPNOTSUPP;
+			break;
+		}
+		wdev_unlock(wdev);
+
+		if (err)
+			return err;
+	}
+
+	if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0)
+		return -EINVAL;
+
+	/* Transmit the Action frame as requested by user space */
+	return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, offchan,
+				  channel_type, channel_type_valid,
+				  wait, buf, len, cookie);
+}
+
+/* Deliver a received management frame to the first matching registration. */
+bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf,
+		      size_t len, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	const struct ieee80211_txrx_stypes *stypes =
+		&wiphy->mgmt_stypes[wdev->iftype];
+	const struct ieee80211_mgmt *mgmt = (const void *)buf;
+	const __le16 fc = mgmt->frame_control;
+	const __le16 ftype =
+		fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
+	const u16 stype = (le16_to_cpu(fc) & IEEE80211_FCTL_STYPE) >> 4;
+	struct cfg80211_mgmt_registration *reg;
+	unsigned int hdrlen;
+	const u8 *data;
+	int data_len;
+	bool delivered = false;
+
+	/* this subtype is not enabled for RX on the current interface type */
+	if (!(stypes->rx & BIT(stype)))
+		return false;
+
+	/* registrations match against the bytes following the header */
+	hdrlen = ieee80211_hdrlen(fc);
+	data = buf + hdrlen;
+	data_len = len - hdrlen;
+
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+	list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
+		if (reg->frame_type != ftype)
+			continue;
+
+		/* the frame body must start with the registered match bytes */
+		if (reg->match_len > data_len ||
+		    memcmp(reg->match, data, reg->match_len))
+			continue;
+
+		/* on a send failure keep looking for another registration */
+		if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq,
+				      buf, len, gfp))
+			continue;
+
+		delivered = true;
+		break;
+	}
+
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+	return delivered;
+}
+EXPORT_SYMBOL(cfg80211_rx_mgmt);
+
+/* Report the TX status (@ack) of the frame behind @cookie to user space. */
+void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie,
+			     const u8 *buf, size_t len, bool ack, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev =
+		wiphy_to_dev(wdev->wiphy);
+
+	nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp);
+}
+EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
+
+/* Forward a CQM RSSI threshold event for @dev to user space. */
+void cfg80211_cqm_rssi_notify(struct net_device *dev,
+			      enum nl80211_cqm_rssi_threshold_event rssi_event,
+			      gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev =
+		wiphy_to_dev(wdev->wiphy);
+
+	nl80211_send_cqm_rssi_notify(rdev, dev, rssi_event, gfp);
+}
+EXPORT_SYMBOL(cfg80211_cqm_rssi_notify);
+
+/* Forward a CQM packet-loss event (@peer, @num_packets) to user space. */
+void cfg80211_cqm_pktloss_notify(struct net_device *dev,
+				 const u8 *peer, u32 num_packets, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev =
+		wiphy_to_dev(wdev->wiphy);
+
+	nl80211_send_cqm_pktloss_notify(rdev, dev, peer, num_packets, gfp);
+}
+EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
new file mode 100644
index 00000000..4e84e222
--- /dev/null
+++ b/net/wireless/nl80211.c
@@ -0,0 +1,6994 @@
+/*
+ * This is the new netlink-based wireless configuration interface.
+ *
+ * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/if.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/if_ether.h>
+#include <linux/ieee80211.h>
+#include <linux/nl80211.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <linux/etherdevice.h>
+#include <net/net_namespace.h>
+#include <net/genetlink.h>
+#include <net/cfg80211.h>
+#include <net/sock.h>
+#include "core.h"
+#include "nl80211.h"
+#include "reg.h"
+
+static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
+			    struct genl_info *info);
+static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
+			      struct genl_info *info);
+
+/* the netlink family; its doit hooks are only forward-declared above */
+static struct genl_family nl80211_fam = {
+	.id = GENL_ID_GENERATE,	/* don't bother with a hardcoded ID */
+	.name = "nl80211",	/* have users key off the name instead */
+	.hdrsize = 0,		/* no private header */
+	.version = 1,		/* no particular meaning now */
+	.maxattr = NL80211_ATTR_MAX,
+	.netnsok = true,
+	.pre_doit = nl80211_pre_doit,
+	.post_doit = nl80211_post_doit,
+};
+
+/* internal helper: resolve NL80211_ATTR_IFINDEX to *dev and *rdev */
+static int get_rdev_dev_by_info_ifindex(struct genl_info *info,
+				       struct cfg80211_registered_device **rdev,
+				       struct net_device **dev)
+{
+	struct net *net = genl_info_net(info);
+	struct nlattr *attr = info->attrs[NL80211_ATTR_IFINDEX];
+	int ifindex;
+
+	if (!attr)
+		return -EINVAL;
+
+	ifindex = nla_get_u32(attr);
+	*dev = dev_get_by_index(net, ifindex);
+	if (!*dev)
+		return -ENODEV;
+
+	*rdev = cfg80211_get_dev_from_ifindex(net, ifindex);
+	if (!IS_ERR(*rdev))
+		return 0;
+	/* undo dev_get_by_index() before reporting the failure */
+	dev_put(*dev);
+	return PTR_ERR(*rdev);
+}
+
+/* policy for the top-level attributes, indexed by NL80211_ATTR_* */
+static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
+	[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
+				      .len = 20-1 },
+	[NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED },
+	[NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 },
+	[NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 },
+	[NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 },
+
+	[NL80211_ATTR_IFTYPE] = { .type = NLA_U32 },
+	[NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
+	[NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },
+
+	[NL80211_ATTR_MAC] = { .len = ETH_ALEN },
+	[NL80211_ATTR_PREV_BSSID] = { .len = ETH_ALEN },
+
+	[NL80211_ATTR_KEY] = { .type = NLA_NESTED, },
+	[NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY,
+				    .len = WLAN_MAX_KEY_LEN },
+	[NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 },
+	[NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
+	[NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
+	[NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 },
+
+	[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
+	[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
+	[NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
+				       .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY,
+				       .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_STA_AID] = { .type = NLA_U16 },
+	[NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED },
+	[NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 },
+	[NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY,
+					       .len = NL80211_MAX_SUPP_RATES },
+	[NL80211_ATTR_STA_PLINK_ACTION] = { .type = NLA_U8 },
+	[NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 },
+	[NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ },
+	[NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
+				.len = IEEE80211_MAX_MESH_ID_LEN },
+	[NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 },
+
+	[NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 },
+	[NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED },
+
+	[NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
+	[NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
+	[NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
+	[NL80211_ATTR_BSS_BASIC_RATES] = { .type = NLA_BINARY,
+					   .len = NL80211_MAX_SUPP_RATES },
+	[NL80211_ATTR_BSS_HT_OPMODE] = { .type = NLA_U16 },
+
+	[NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED },
+	[NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG },
+
+	[NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN },
+
+	[NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 },
+	[NL80211_ATTR_IE] = { .type = NLA_BINARY,
+			      .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED },
+	[NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED },
+
+	[NL80211_ATTR_SSID] = { .type = NLA_BINARY,
+				.len = IEEE80211_MAX_SSID_LEN },
+	[NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 },
+	[NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 },
+	[NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG },
+	[NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_USE_MFP] = { .type = NLA_U32 },
+	[NL80211_ATTR_STA_FLAGS2] = {
+		.len = sizeof(struct nl80211_sta_flag_update),
+	},
+	[NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
+	[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
+	[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
+	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
+	[NL80211_ATTR_PID] = { .type = NLA_U32 },
+	[NL80211_ATTR_4ADDR] = { .type = NLA_U8 },
+	[NL80211_ATTR_PMKID] = { .type = NLA_BINARY,
+				 .len = WLAN_PMKID_LEN },
+	[NL80211_ATTR_DURATION] = { .type = NLA_U32 },
+	[NL80211_ATTR_COOKIE] = { .type = NLA_U64 },
+	[NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED },
+	[NL80211_ATTR_FRAME] = { .type = NLA_BINARY,
+				 .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, },
+	[NL80211_ATTR_PS_STATE] = { .type = NLA_U32 },
+	[NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
+	[NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
+	[NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
+	[NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
+	[NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 },
+	[NL80211_ATTR_WIPHY_ANTENNA_TX] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_ANTENNA_RX] = { .type = NLA_U32 },
+	[NL80211_ATTR_MCAST_RATE] = { .type = NLA_U32 },
+	[NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG },
+	[NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
+	[NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
+	[NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 },
+	[NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
+	[NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED },
+	[NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED },
+	[NL80211_ATTR_HIDDEN_SSID] = { .type = NLA_U32 },
+	[NL80211_ATTR_IE_PROBE_RESP] = { .type = NLA_BINARY,
+					 .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_IE_ASSOC_RESP] = { .type = NLA_BINARY,
+					 .len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED },
+};
+
+/* policy for the attributes nested inside a key (NL80211_KEY_*) */
+static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
+	[NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN },
+	[NL80211_KEY_IDX] = { .type = NLA_U8 },
+	[NL80211_KEY_CIPHER] = { .type = NLA_U32 },
+	[NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
+	[NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
+	[NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
+	[NL80211_KEY_TYPE] = { .type = NLA_U32 },
+	[NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
+};
+
+/* policy for the nested key default-type flags (unicast / multicast) */
+static const struct nla_policy
+nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = {
+	[NL80211_KEY_DEFAULT_TYPE_UNICAST] = { .type = NLA_FLAG },
+	[NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG },
+};
+
+/* policy for the WoWLAN trigger attributes (NL80211_WOWLAN_TRIG_*) */
+static const struct nla_policy
+nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
+	[NL80211_WOWLAN_TRIG_ANY] = { .type = NLA_FLAG },
+	[NL80211_WOWLAN_TRIG_DISCONNECT] = { .type = NLA_FLAG },
+	[NL80211_WOWLAN_TRIG_MAGIC_PKT] = { .type = NLA_FLAG },
+	[NL80211_WOWLAN_TRIG_PKT_PATTERN] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy	/* policy for sched-scan match attributes */
+nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = {
+	[NL80211_ATTR_SCHED_SCAN_MATCH_SSID] = { .type = NLA_BINARY,
+						 .len = IEEE80211_MAX_SSID_LEN },
+};
+
+/* ifidx get helper: parse the dump request and return its ifindex */
+static int nl80211_get_ifidx(struct netlink_callback *cb)
+{
+	struct nlattr **tb = nl80211_fam.attrbuf;
+	int ifidx, err;
+
+	err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize,
+			  tb, nl80211_fam.maxattr, nl80211_policy);
+	if (err)
+		return err;
+
+	if (!tb[NL80211_ATTR_IFINDEX])
+		return -EINVAL;
+
+	ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]);
+
+	/* an ifindex of zero is rejected, anything else is returned as-is */
+	return ifidx ? ifidx : -EINVAL;
+}
+
+/* Resolve the dump's netdev and rdev; on success returns with RTNL held. */
+static int nl80211_prepare_netdev_dump(struct sk_buff *skb,
+				       struct netlink_callback *cb,
+				       struct cfg80211_registered_device **rdev,
+				       struct net_device **dev)
+{
+	struct net *net = sock_net(skb->sk);
+	int ifidx = cb->args[0];
+	int err = 0;
+
+	if (!ifidx)
+		ifidx = nl80211_get_ifidx(cb);
+	if (ifidx < 0)
+		return ifidx;
+
+	cb->args[0] = ifidx;
+
+	rtnl_lock();
+
+	*dev = __dev_get_by_index(net, ifidx);
+	if (!*dev) {
+		err = -ENODEV;
+	} else {
+		*rdev = cfg80211_get_dev_from_ifindex(net, ifidx);
+		if (IS_ERR(*rdev))
+			err = PTR_ERR(*rdev);
+	}
+
+	/* RTNL is only dropped here on failure */
+	if (err)
+		rtnl_unlock();
+
+	return err;
+}
+
+static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev)
+{
+	cfg80211_unlock_rdev(rdev);
+	rtnl_unlock();		/* see nl80211_prepare_netdev_dump() */
+}
+
+/* IE validation: payload must split exactly into length-prefixed elements */
+static bool is_valid_ie_attr(const struct nlattr *attr)
+{
+	const u8 *ie;
+	int left;
+
+	/* an absent attribute counts as valid */
+	if (!attr)
+		return true;
+
+	ie = nla_data(attr);
+	left = nla_len(attr);
+
+	while (left) {
+		u8 body;
+
+		/* 2-byte element header; its second byte is the body length */
+		if (left < 2)
+			return false;
+		body = ie[1];
+		left -= 2;
+		if (body > left)
+			return false;
+		ie += 2 + body;
+		left -= body;
+	}
+
+	return true;
+}
+
+/* message building helper: put the genetlink header for @cmd into @skb */
+static inline void *nl80211hdr_put(struct sk_buff *skb, u32 pid, u32 seq,
+				   int flags, u8 cmd)
+{
+	/* since there is no private header just add the generic one */
+	return genlmsg_put(skb, pid, seq, &nl80211_fam, flags, cmd);
+}
+
+/* Append the frequency, flag and max-TX-power attributes of @chan to @msg. */
+static int nl80211_msg_put_channel(struct sk_buff *msg,
+				   struct ieee80211_channel *chan)
+{
+	NLA_PUT_U32(msg, NL80211_FREQUENCY_ATTR_FREQ,
+		    chan->center_freq);
+
+	if (chan->flags & IEEE80211_CHAN_DISABLED)
+		NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_DISABLED);
+	if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN)
+		NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_PASSIVE_SCAN);
+	if (chan->flags & IEEE80211_CHAN_NO_IBSS)
+		NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_NO_IBSS);
+	if (chan->flags & IEEE80211_CHAN_RADAR)
+		NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_RADAR);
+
+	NLA_PUT_U32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
+		    DBM_TO_MBM(chan->max_power));
+	return 0;
+	/* presumably the jump target of the NLA_PUT*() macros - confirm */
+ nla_put_failure:
+	return -ENOBUFS;
+}
+
+/* netlink command implementations */
+
+struct key_parse {
+	struct key_params p;
+	int idx;		/* parsers start from -1 (nothing given) */
+	int type;		/* nl80211_parse_key() starts from -1 */
+	bool def, defmgmt;
+	bool def_uni, def_multi;
+};
+
+/* Fill @k from the attributes nested inside the key attribute @key. */
+static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
+{
+	struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
+	struct nlattr *tb[NL80211_KEY_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, NL80211_KEY_MAX, key, nl80211_key_policy);
+	if (err)
+		return err;
+
+	k->def = tb[NL80211_KEY_DEFAULT] != NULL;
+	k->defmgmt = tb[NL80211_KEY_DEFAULT_MGMT] != NULL;
+
+	/* the plain default flags pre-set the per-type defaults */
+	if (k->def)
+		k->def_uni = true;
+	if (k->def || k->defmgmt)
+		k->def_multi = true;
+
+	if (tb[NL80211_KEY_IDX])
+		k->idx = nla_get_u8(tb[NL80211_KEY_IDX]);
+	if (tb[NL80211_KEY_CIPHER])
+		k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
+
+	/* key and sequence data are referenced in place, not copied */
+	if (tb[NL80211_KEY_DATA]) {
+		k->p.key_len = nla_len(tb[NL80211_KEY_DATA]);
+		k->p.key = nla_data(tb[NL80211_KEY_DATA]);
+	}
+	if (tb[NL80211_KEY_SEQ]) {
+		k->p.seq_len = nla_len(tb[NL80211_KEY_SEQ]);
+		k->p.seq = nla_data(tb[NL80211_KEY_SEQ]);
+	}
+
+	if (tb[NL80211_KEY_TYPE]) {
+		k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
+		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+			return -EINVAL;
+	}
+
+	/* explicit per-type defaults, when given, override the above */
+	if (!tb[NL80211_KEY_DEFAULT_TYPES])
+		return 0;
+
+	err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1,
+			       tb[NL80211_KEY_DEFAULT_TYPES],
+			       nl80211_key_default_policy);
+	if (err)
+		return err;
+
+	k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST];
+	k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST];
+	return 0;
+}
+
+/* Fill @k from the top-level (non-nested) NL80211_ATTR_KEY_* attributes. */
+static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
+{
+	struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
+	struct nlattr **attrs = info->attrs;
+	int err;
+
+	if (attrs[NL80211_ATTR_KEY_DATA]) {
+		k->p.key_len = nla_len(attrs[NL80211_ATTR_KEY_DATA]);
+		k->p.key = nla_data(attrs[NL80211_ATTR_KEY_DATA]);
+	}
+	if (attrs[NL80211_ATTR_KEY_SEQ]) {
+		k->p.seq_len = nla_len(attrs[NL80211_ATTR_KEY_SEQ]);
+		k->p.seq = nla_data(attrs[NL80211_ATTR_KEY_SEQ]);
+	}
+	if (attrs[NL80211_ATTR_KEY_IDX])
+		k->idx = nla_get_u8(attrs[NL80211_ATTR_KEY_IDX]);
+	if (attrs[NL80211_ATTR_KEY_CIPHER])
+		k->p.cipher = nla_get_u32(attrs[NL80211_ATTR_KEY_CIPHER]);
+
+	k->def = attrs[NL80211_ATTR_KEY_DEFAULT] != NULL;
+	k->defmgmt = attrs[NL80211_ATTR_KEY_DEFAULT_MGMT] != NULL;
+
+	/* the plain default flags pre-set the per-type defaults */
+	if (k->def)
+		k->def_uni = true;
+	if (k->def || k->defmgmt)
+		k->def_multi = true;
+
+	if (attrs[NL80211_ATTR_KEY_TYPE]) {
+		k->type = nla_get_u32(attrs[NL80211_ATTR_KEY_TYPE]);
+		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+			return -EINVAL;
+	}
+
+	/* explicit per-type defaults, when given, override the above */
+	if (!attrs[NL80211_ATTR_KEY_DEFAULT_TYPES])
+		return 0;
+
+	err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1,
+			       attrs[NL80211_ATTR_KEY_DEFAULT_TYPES],
+			       nl80211_key_default_policy);
+	if (err)
+		return err;
+
+	k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST];
+	k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST];
+	return 0;
+}
+
+/* Parse key attributes (nested or top-level) into @k and sanity-check it. */
+static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
+{
+	struct nlattr *nested = info->attrs[NL80211_ATTR_KEY];
+	int min_idx, max_idx, err;
+
+	memset(k, 0, sizeof(*k));
+	k->idx = -1;
+	k->type = -1;
+
+	err = nested ? nl80211_parse_key_new(nested, k) :
+		       nl80211_parse_key_old(info, k);
+	if (err)
+		return err;
+
+	/* a key cannot be flagged both default and default-mgmt */
+	if (k->def && k->defmgmt)
+		return -EINVAL;
+
+	/* a default-mgmt key must be multicast and not unicast */
+	if (k->defmgmt && (k->def_uni || !k->def_multi))
+		return -EINVAL;
+
+	if (k->idx == -1)
+		return 0;
+
+	/* index range: 4-5 for default-mgmt, 0-3 for default, else 0-5 */
+	min_idx = 0;
+	max_idx = 5;
+	if (k->defmgmt)
+		min_idx = 4;
+	else if (k->def)
+		max_idx = 3;
+
+	if (k->idx < min_idx || k->idx > max_idx)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Build a cached-keys table from the nested @keys; ERR_PTR() on failure. */
+static struct cfg80211_cached_keys *
+nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
+		       struct nlattr *keys)
+{
+	struct key_parse parse;
+	struct nlattr *key;
+	struct cfg80211_cached_keys *result;
+	int rem, err, def = 0;
+
+	result = kzalloc(sizeof(*result), GFP_KERNEL);
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+	/* -1 is the starting value; def is replaced by a key flagged default */
+	result->def = -1;
+	result->defmgmt = -1;
+
+	nla_for_each_nested(key, keys, rem) {
+		memset(&parse, 0, sizeof(parse));
+		parse.idx = -1;
+		err = nl80211_parse_key_new(key, &parse);
+		if (err)
+			goto error;
+		err = -EINVAL;	/* result of the plain checks below */
+		if (!parse.p.key)
+			goto error;
+		/* NOTE(review): idx 4 is accepted; confirm intended range */
+		if (parse.idx < 0 || parse.idx > 4)
+			goto error;
+		if (parse.def) {
+			if (def)
+				goto error;
+			def = 1;
+			result->def = parse.idx;
+			if (!parse.def_uni || !parse.def_multi)
+				goto error;
+		} else if (parse.defmgmt)
+			goto error;
+		err = cfg80211_validate_key_settings(rdev, &parse.p,
+						     parse.idx, false, NULL);
+		if (err)
+			goto error;
+		result->params[parse.idx].cipher = parse.p.cipher;
+		result->params[parse.idx].key_len = parse.p.key_len;
+		result->params[parse.idx].key = result->data[parse.idx];
+		memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len);
+	}
+	return result;
+ error:
+	kfree(result);
+	return ERR_PTR(err);
+}
+
+/* Return 0 if @wdev's interface type and link state pass, else an errno. */
+static int nl80211_key_allowed(struct wireless_dev *wdev)
+{
+	ASSERT_WDEV_LOCK(wdev);
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
+	case NL80211_IFTYPE_MESH_POINT:
+		/* no link state is checked for these types */
+		return 0;
+	case NL80211_IFTYPE_ADHOC:
+		/* needs a current BSS */
+		return wdev->current_bss ? 0 : -ENOLINK;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		/* needs the SME to be in the connected state */
+		if (wdev->sme_state == CFG80211_SME_CONNECTED)
+			return 0;
+		return -ENOLINK;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Emit nested attribute @attr with one flag per bit set in @ifmodes. */
+static int nl80211_put_iftypes(struct sk_buff *msg, u32 attr, u16 ifmodes)
+{
+	struct nlattr *nest;
+	int bit;
+
+	nest = nla_nest_start(msg, attr);
+	if (!nest)
+		goto nla_put_failure;
+
+	/* the flag's attribute number is the bit position in the mask */
+	for (bit = 0; ifmodes; bit++, ifmodes >>= 1) {
+		if (ifmodes & 1)
+			NLA_PUT_FLAG(msg, bit);
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+
+nla_put_failure:
+	return -ENOBUFS;
+}
+
+/* Append @wiphy's interface combinations to @msg as nested attributes. */
+static int nl80211_put_iface_combinations(struct wiphy *wiphy,
+					  struct sk_buff *msg)
+{
+	struct nlattr *nl_combis;
+	int i, j;
+
+	nl_combis = nla_nest_start(msg,
+				NL80211_ATTR_INTERFACE_COMBINATIONS);
+	if (!nl_combis)
+		goto nla_put_failure;
+
+	for (i = 0; i < wiphy->n_iface_combinations; i++) {
+		const struct ieee80211_iface_combination *c;
+		struct nlattr *nl_combi, *nl_limits;
+
+		c = &wiphy->iface_combinations[i];
+		/* combinations are numbered from 1 in the message */
+		nl_combi = nla_nest_start(msg, i + 1);
+		if (!nl_combi)
+			goto nla_put_failure;
+
+		nl_limits = nla_nest_start(msg, NL80211_IFACE_COMB_LIMITS);
+		if (!nl_limits)
+			goto nla_put_failure;
+
+		for (j = 0; j < c->n_limits; j++) {
+			struct nlattr *nl_limit;
+			/* limits are numbered from 1 as well */
+			nl_limit = nla_nest_start(msg, j + 1);
+			if (!nl_limit)
+				goto nla_put_failure;
+			NLA_PUT_U32(msg, NL80211_IFACE_LIMIT_MAX,
+				    c->limits[j].max);
+			if (nl80211_put_iftypes(msg, NL80211_IFACE_LIMIT_TYPES,
+						c->limits[j].types))
+				goto nla_put_failure;
+			nla_nest_end(msg, nl_limit);
+		}
+
+		nla_nest_end(msg, nl_limits);
+
+		if (c->beacon_int_infra_match)
+			NLA_PUT_FLAG(msg,
+				NL80211_IFACE_COMB_STA_AP_BI_MATCH);
+		NLA_PUT_U32(msg, NL80211_IFACE_COMB_NUM_CHANNELS,
+			    c->num_different_channels);
+		NLA_PUT_U32(msg, NL80211_IFACE_COMB_MAXNUM,
+			    c->max_interfaces);
+
+		nla_nest_end(msg, nl_combi);
+	}
+
+	nla_nest_end(msg, nl_combis);
+	return 0;
+nla_put_failure:
+	return -ENOBUFS;
+}
+
+static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
+			      struct cfg80211_registered_device *dev)
+{
+	void *hdr;
+	struct nlattr *nl_bands, *nl_band;
+	struct nlattr *nl_freqs, *nl_freq;
+	struct nlattr *nl_rates, *nl_rate;
+	struct nlattr *nl_cmds;
+	enum ieee80211_band band;
+	struct ieee80211_channel *chan;
+	struct ieee80211_rate *rate;
+	int i;
+	const struct ieee80211_txrx_stypes *mgmt_stypes =
+				dev->wiphy.mgmt_stypes;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY);
+	if (!hdr)
+		return -1;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx);
+	NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy));
+
+	NLA_PUT_U32(msg, NL80211_ATTR_GENERATION,
+		    cfg80211_rdev_list_generation);
+
+	NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT,
+		   dev->wiphy.retry_short);
+	NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_RETRY_LONG,
+		   dev->wiphy.retry_long);
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD,
+		    dev->wiphy.frag_threshold);
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD,
+		    dev->wiphy.rts_threshold);
+	NLA_PUT_U8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS,
+		    dev->wiphy.coverage_class);
+	NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS,
+		   dev->wiphy.max_scan_ssids);
+	NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS,
+		   dev->wiphy.max_sched_scan_ssids);
+	NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN,
+		    dev->wiphy.max_scan_ie_len);
+	NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN,
+		    dev->wiphy.max_sched_scan_ie_len);
+	NLA_PUT_U8(msg, NL80211_ATTR_MAX_MATCH_SETS,
+		   dev->wiphy.max_match_sets);
+
+	if (dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_IBSS_RSN);
+	if (dev->wiphy.flags & WIPHY_FLAG_MESH_AUTH)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_MESH_AUTH);
+
+	NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES,
+		sizeof(u32) * dev->wiphy.n_cipher_suites,
+		dev->wiphy.cipher_suites);
+
+	NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
+		   dev->wiphy.max_num_pmkids);
+
+	if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE);
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX,
+		    dev->wiphy.available_antennas_tx);
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX,
+		    dev->wiphy.available_antennas_rx);
+
+	if ((dev->wiphy.available_antennas_tx ||
+	     dev->wiphy.available_antennas_rx) && dev->ops->get_antenna) {
+		u32 tx_ant = 0, rx_ant = 0;
+		int res;
+		res = dev->ops->get_antenna(&dev->wiphy, &tx_ant, &rx_ant);
+		if (!res) {
+			NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, tx_ant);
+			NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_ANTENNA_RX, rx_ant);
+		}
+	}
+
+	if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES,
+				dev->wiphy.interface_modes))
+		goto nla_put_failure;
+
+	nl_bands = nla_nest_start(msg, NL80211_ATTR_WIPHY_BANDS);
+	if (!nl_bands)
+		goto nla_put_failure;
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		if (!dev->wiphy.bands[band])
+			continue;
+
+		nl_band = nla_nest_start(msg, band);
+		if (!nl_band)
+			goto nla_put_failure;
+
+		/* add HT info */
+		if (dev->wiphy.bands[band]->ht_cap.ht_supported) {
+			NLA_PUT(msg, NL80211_BAND_ATTR_HT_MCS_SET,
+				sizeof(dev->wiphy.bands[band]->ht_cap.mcs),
+				&dev->wiphy.bands[band]->ht_cap.mcs);
+			NLA_PUT_U16(msg, NL80211_BAND_ATTR_HT_CAPA,
+				dev->wiphy.bands[band]->ht_cap.cap);
+			NLA_PUT_U8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR,
+				dev->wiphy.bands[band]->ht_cap.ampdu_factor);
+			NLA_PUT_U8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY,
+				dev->wiphy.bands[band]->ht_cap.ampdu_density);
+		}
+
+		/* add frequencies */
+		nl_freqs = nla_nest_start(msg, NL80211_BAND_ATTR_FREQS);
+		if (!nl_freqs)
+			goto nla_put_failure;
+
+		for (i = 0; i < dev->wiphy.bands[band]->n_channels; i++) {
+			nl_freq = nla_nest_start(msg, i);
+			if (!nl_freq)
+				goto nla_put_failure;
+
+			chan = &dev->wiphy.bands[band]->channels[i];
+
+			if (nl80211_msg_put_channel(msg, chan))
+				goto nla_put_failure;
+
+			nla_nest_end(msg, nl_freq);
+		}
+
+		nla_nest_end(msg, nl_freqs);
+
+		/* add bitrates */
+		nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
+		if (!nl_rates)
+			goto nla_put_failure;
+
+		for (i = 0; i < dev->wiphy.bands[band]->n_bitrates; i++) {
+			nl_rate = nla_nest_start(msg, i);
+			if (!nl_rate)
+				goto nla_put_failure;
+
+			rate = &dev->wiphy.bands[band]->bitrates[i];
+			NLA_PUT_U32(msg, NL80211_BITRATE_ATTR_RATE,
+				    rate->bitrate);
+			if (rate->flags & IEEE80211_RATE_SHORT_PREAMBLE)
+				NLA_PUT_FLAG(msg,
+					NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE);
+
+			nla_nest_end(msg, nl_rate);
+		}
+
+		nla_nest_end(msg, nl_rates);
+
+		nla_nest_end(msg, nl_band);
+	}
+	nla_nest_end(msg, nl_bands);
+
+	nl_cmds = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_COMMANDS);
+	if (!nl_cmds)
+		goto nla_put_failure;
+
+	i = 0;
+#define CMD(op, n)						\
+	 do {							\
+		if (dev->ops->op) {				\
+			i++;					\
+			NLA_PUT_U32(msg, i, NL80211_CMD_ ## n);	\
+		}						\
+	} while (0)
+
+	CMD(add_virtual_intf, NEW_INTERFACE);
+	CMD(change_virtual_intf, SET_INTERFACE);
+	CMD(add_key, NEW_KEY);
+	CMD(add_beacon, NEW_BEACON);
+	CMD(add_station, NEW_STATION);
+	CMD(add_mpath, NEW_MPATH);
+	CMD(update_mesh_config, SET_MESH_CONFIG);
+	CMD(change_bss, SET_BSS);
+	CMD(auth, AUTHENTICATE);
+	CMD(assoc, ASSOCIATE);
+	CMD(deauth, DEAUTHENTICATE);
+	CMD(disassoc, DISASSOCIATE);
+	CMD(join_ibss, JOIN_IBSS);
+	CMD(join_mesh, JOIN_MESH);
+	CMD(set_pmksa, SET_PMKSA);
+	CMD(del_pmksa, DEL_PMKSA);
+	CMD(flush_pmksa, FLUSH_PMKSA);
+	CMD(remain_on_channel, REMAIN_ON_CHANNEL);
+	CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
+	CMD(mgmt_tx, FRAME);
+	CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
+	if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
+		i++;
+		NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS);
+	}
+	CMD(set_channel, SET_CHANNEL);
+	CMD(set_wds_peer, SET_WDS_PEER);
+	if (dev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN)
+		CMD(sched_scan_start, START_SCHED_SCAN);
+
+#undef CMD
+
+	if (dev->ops->connect || dev->ops->auth) {
+		i++;
+		NLA_PUT_U32(msg, i, NL80211_CMD_CONNECT);
+	}
+
+	if (dev->ops->disconnect || dev->ops->deauth) {
+		i++;
+		NLA_PUT_U32(msg, i, NL80211_CMD_DISCONNECT);
+	}
+
+	nla_nest_end(msg, nl_cmds);
+
+	if (dev->ops->remain_on_channel)
+		NLA_PUT_U32(msg, NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION,
+			    dev->wiphy.max_remain_on_channel_duration);
+
+	/* for now at least assume all drivers have it */
+	if (dev->ops->mgmt_tx)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_OFFCHANNEL_TX_OK);
+
+	if (mgmt_stypes) {
+		u16 stypes;
+		struct nlattr *nl_ftypes, *nl_ifs;
+		enum nl80211_iftype ift;
+
+		nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES);
+		if (!nl_ifs)
+			goto nla_put_failure;
+
+		for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+			nl_ftypes = nla_nest_start(msg, ift);
+			if (!nl_ftypes)
+				goto nla_put_failure;
+			i = 0;
+			stypes = mgmt_stypes[ift].tx;
+			while (stypes) {
+				if (stypes & 1)
+					NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
+						    (i << 4) | IEEE80211_FTYPE_MGMT);
+				stypes >>= 1;
+				i++;
+			}
+			nla_nest_end(msg, nl_ftypes);
+		}
+
+		nla_nest_end(msg, nl_ifs);
+
+		nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES);
+		if (!nl_ifs)
+			goto nla_put_failure;
+
+		for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+			nl_ftypes = nla_nest_start(msg, ift);
+			if (!nl_ftypes)
+				goto nla_put_failure;
+			i = 0;
+			stypes = mgmt_stypes[ift].rx;
+			while (stypes) {
+				if (stypes & 1)
+					NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
+						    (i << 4) | IEEE80211_FTYPE_MGMT);
+				stypes >>= 1;
+				i++;
+			}
+			nla_nest_end(msg, nl_ftypes);
+		}
+		nla_nest_end(msg, nl_ifs);
+	}
+
+	if (dev->wiphy.wowlan.flags || dev->wiphy.wowlan.n_patterns) {
+		struct nlattr *nl_wowlan;
+
+		nl_wowlan = nla_nest_start(msg,
+				NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED);
+		if (!nl_wowlan)
+			goto nla_put_failure;
+
+		if (dev->wiphy.wowlan.flags & WIPHY_WOWLAN_ANY)
+			NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_ANY);
+		if (dev->wiphy.wowlan.flags & WIPHY_WOWLAN_DISCONNECT)
+			NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_DISCONNECT);
+		if (dev->wiphy.wowlan.flags & WIPHY_WOWLAN_MAGIC_PKT)
+			NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT);
+		if (dev->wiphy.wowlan.n_patterns) {
+			struct nl80211_wowlan_pattern_support pat = {
+				.max_patterns = dev->wiphy.wowlan.n_patterns,
+				.min_pattern_len =
+					dev->wiphy.wowlan.pattern_min_len,
+				.max_pattern_len =
+					dev->wiphy.wowlan.pattern_max_len,
+			};
+			NLA_PUT(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN,
+				sizeof(pat), &pat);
+		}
+
+		nla_nest_end(msg, nl_wowlan);
+	}
+
+	if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES,
+				dev->wiphy.software_iftypes))
+		goto nla_put_failure;
+
+	if (nl80211_put_iface_combinations(&dev->wiphy, msg))
+		goto nla_put_failure;
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback: emit one NL80211_CMD_NEW_WIPHY message for each
+ * registered device living in the requesting socket's network namespace.
+ * cb->args[0] carries the number of devices already emitted so that a
+ * later pass can resume where this one ran out of room.
+ */
+static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct cfg80211_registered_device *rdev;
+	int first = cb->args[0];
+	int seen = 0;
+	int ret;
+
+	mutex_lock(&cfg80211_mutex);
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
+			continue;
+
+		/* emitted by an earlier pass of this dump */
+		if (seen < first) {
+			seen++;
+			continue;
+		}
+
+		ret = nl80211_send_wiphy(skb, NETLINK_CB(cb->skb).pid,
+					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					 rdev);
+		if (ret < 0)
+			break;	/* retry this device on the next pass */
+
+		seen++;
+	}
+	mutex_unlock(&cfg80211_mutex);
+
+	cb->args[0] = seen;
+
+	return skb->len;
+}
+
+/*
+ * Handler replying to the requester with a single wiphy description.
+ * The registered device is taken from info->user_ptr[0].
+ * Returns -ENOMEM if no reply skb could be allocated and -ENOBUFS if the
+ * description could not be built.
+ */
+static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct sk_buff *reply;
+	int ret;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	ret = nl80211_send_wiphy(reply, info->snd_pid, info->snd_seq, 0, rdev);
+	if (ret >= 0)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return -ENOBUFS;
+}
+
+/*
+ * Attribute policy for one nested TX queue parameter set; used by
+ * nl80211_set_wiphy() when parsing NL80211_ATTR_WIPHY_TXQ_PARAMS entries.
+ */
+static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
+	[NL80211_TXQ_ATTR_QUEUE]		= { .type = NLA_U8 },
+	[NL80211_TXQ_ATTR_TXOP]			= { .type = NLA_U16 },
+	[NL80211_TXQ_ATTR_CWMIN]		= { .type = NLA_U16 },
+	[NL80211_TXQ_ATTR_CWMAX]		= { .type = NLA_U16 },
+	[NL80211_TXQ_ATTR_AIFS]			= { .type = NLA_U8 },
+};
+
+/*
+ * Fill @txq_params from an already parsed attribute table @tb.
+ * All five attributes (queue, txop, cwmin, cwmax, aifs) are mandatory;
+ * returns -EINVAL if any is absent, 0 otherwise.
+ */
+static int parse_txq_params(struct nlattr *tb[],
+			    struct ieee80211_txq_params *txq_params)
+{
+	struct nlattr *queue = tb[NL80211_TXQ_ATTR_QUEUE];
+	struct nlattr *txop = tb[NL80211_TXQ_ATTR_TXOP];
+	struct nlattr *cwmin = tb[NL80211_TXQ_ATTR_CWMIN];
+	struct nlattr *cwmax = tb[NL80211_TXQ_ATTR_CWMAX];
+	struct nlattr *aifs = tb[NL80211_TXQ_ATTR_AIFS];
+
+	if (queue && txop && cwmin && cwmax && aifs) {
+		txq_params->queue = nla_get_u8(queue);
+		txq_params->txop = nla_get_u16(txop);
+		txq_params->cwmin = nla_get_u16(cwmin);
+		txq_params->cwmax = nla_get_u16(cwmax);
+		txq_params->aifs = nla_get_u8(aifs);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Tell whether the channel may be set explicitly for @wdev.
+ *
+ * Only AP, P2P GO, mesh and WDS interfaces take an explicit channel;
+ * every other type gets its channel from its own "establish a
+ * connection" command (connect, join, ...).  Monitor interfaces are
+ * accepted too: they normally follow whatever else is going on, so they
+ * are treated as if the wiphy channel itself were being set.  A NULL
+ * @wdev (wiphy-level request) is always allowed.
+ */
+static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
+{
+	if (!wdev)
+		return true;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_MONITOR:
+	case NL80211_IFTYPE_P2P_GO:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Apply the frequency (and optional channel type) found in @info to
+ * @wdev, or to the wiphy as a whole when @wdev is NULL.
+ *
+ * Returns -EINVAL when the frequency attribute is missing or the channel
+ * type is not one of the known values, -EOPNOTSUPP when the interface
+ * type does not take an explicit channel, otherwise the result of
+ * cfg80211_set_freq().
+ */
+static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
+				 struct wireless_dev *wdev,
+				 struct genl_info *info)
+{
+	struct nlattr *freq_attr = info->attrs[NL80211_ATTR_WIPHY_FREQ];
+	struct nlattr *type_attr = info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE];
+	enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
+	u32 freq;
+	int result;
+
+	if (!freq_attr)
+		return -EINVAL;
+
+	if (!nl80211_can_set_dev_channel(wdev))
+		return -EOPNOTSUPP;
+
+	if (type_attr) {
+		channel_type = nla_get_u32(type_attr);
+
+		switch (channel_type) {
+		case NL80211_CHAN_NO_HT:
+		case NL80211_CHAN_HT20:
+		case NL80211_CHAN_HT40PLUS:
+		case NL80211_CHAN_HT40MINUS:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	freq = nla_get_u32(freq_attr);
+
+	mutex_lock(&rdev->devlist_mtx);
+	/* the per-interface lock is only taken when there is an interface */
+	if (wdev)
+		wdev_lock(wdev);
+	result = cfg80211_set_freq(rdev, wdev, freq, channel_type);
+	if (wdev)
+		wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return result;
+}
+
+/*
+ * Handler setting the channel of the interface in info->user_ptr[1];
+ * all validation is done by __nl80211_set_channel().
+ */
+static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device *netdev = info->user_ptr[1];
+	struct wireless_dev *wdev = netdev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+
+	return __nl80211_set_channel(rdev, wdev, info);
+}
+
+/*
+ * Handler configuring the peer address of a WDS interface.
+ *
+ * Returns -EINVAL without NL80211_ATTR_MAC, -EBUSY while the interface
+ * is running, -EOPNOTSUPP if the driver lacks set_wds_peer or the
+ * interface is not of WDS type; otherwise the driver's result.
+ */
+static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct nlattr *mac = info->attrs[NL80211_ATTR_MAC];
+
+	if (!mac)
+		return -EINVAL;
+
+	if (netif_running(dev))
+		return -EBUSY;
+
+	if (!rdev->ops->set_wds_peer || wdev->iftype != NL80211_IFTYPE_WDS)
+		return -EOPNOTSUPP;
+
+	return rdev->ops->set_wds_peer(wdev->wiphy, dev, nla_data(mac));
+}
+
+
+/*
+ * Handler for setting wiphy-level parameters: name, TX queue parameters,
+ * channel, TX power, antenna masks and the retry / fragmentation / RTS /
+ * coverage-class settings.
+ *
+ * The wiphy is located either through NL80211_ATTR_IFINDEX (kept for
+ * backward compatibility, see below) or through the wiphy attributes in
+ * @info.  rdev->mtx is held from the lookup until the function returns;
+ * a netdev reference obtained from the ifindex is dropped on exit.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev;
+	struct net_device *netdev = NULL;
+	struct wireless_dev *wdev;
+	int result = 0, rem_txq_params = 0;
+	struct nlattr *nl_txq_params;
+	u32 changed;
+	u8 retry_short = 0, retry_long = 0;
+	u32 frag_threshold = 0, rts_threshold = 0;
+	u8 coverage_class = 0;
+
+	/*
+	 * Try to find the wiphy and netdev. Normally this
+	 * function shouldn't need the netdev, but this is
+	 * done for backward compatibility -- previously
+	 * setting the channel was done per wiphy, but now
+	 * it is per netdev. Previous userland like hostapd
+	 * also passed a netdev to set_wiphy, so that it is
+	 * possible to let that go to the right netdev!
+	 */
+	mutex_lock(&cfg80211_mutex);
+
+	if (info->attrs[NL80211_ATTR_IFINDEX]) {
+		int ifindex = nla_get_u32(info->attrs[NL80211_ATTR_IFINDEX]);
+
+		netdev = dev_get_by_index(genl_info_net(info), ifindex);
+		if (netdev && netdev->ieee80211_ptr) {
+			rdev = wiphy_to_dev(netdev->ieee80211_ptr->wiphy);
+			mutex_lock(&rdev->mtx);
+		} else {
+			/*
+			 * Not a wireless device: fall back to the wiphy
+			 * lookup below, but drop the reference taken by
+			 * dev_get_by_index() first so it is not leaked.
+			 */
+			if (netdev)
+				dev_put(netdev);
+			netdev = NULL;
+		}
+	}
+
+	if (!netdev) {
+		rdev = __cfg80211_rdev_from_info(info);
+		if (IS_ERR(rdev)) {
+			mutex_unlock(&cfg80211_mutex);
+			return PTR_ERR(rdev);
+		}
+		wdev = NULL;
+		netdev = NULL;
+		result = 0;
+
+		mutex_lock(&rdev->mtx);
+	} else if (netif_running(netdev) &&
+		   nl80211_can_set_dev_channel(netdev->ieee80211_ptr))
+		wdev = netdev->ieee80211_ptr;
+	else
+		wdev = NULL;
+
+	/*
+	 * end workaround code, by now the rdev is available
+	 * and locked, and wdev may or may not be NULL.
+	 */
+
+	if (info->attrs[NL80211_ATTR_WIPHY_NAME])
+		result = cfg80211_dev_rename(
+			rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME]));
+
+	mutex_unlock(&cfg80211_mutex);
+
+	if (result)
+		goto bad_res;
+
+	if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
+		struct ieee80211_txq_params txq_params;
+		struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];
+
+		if (!rdev->ops->set_txq_params) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		/*
+		 * netdev is NULL when the request named only a wiphy;
+		 * netif_running() would dereference it.
+		 */
+		if (!netdev) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+
+		if (!netif_running(netdev)) {
+			result = -ENETDOWN;
+			goto bad_res;
+		}
+
+		nla_for_each_nested(nl_txq_params,
+				    info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
+				    rem_txq_params) {
+			nla_parse(tb, NL80211_TXQ_ATTR_MAX,
+				  nla_data(nl_txq_params),
+				  nla_len(nl_txq_params),
+				  txq_params_policy);
+			result = parse_txq_params(tb, &txq_params);
+			if (result)
+				goto bad_res;
+
+			result = rdev->ops->set_txq_params(&rdev->wiphy,
+							   &txq_params);
+			if (result)
+				goto bad_res;
+		}
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+		result = __nl80211_set_channel(rdev, wdev, info);
+		if (result)
+			goto bad_res;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
+		enum nl80211_tx_power_setting type;
+		int idx, mbm = 0;
+
+		if (!rdev->ops->set_tx_power) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
+		type = nla_get_u32(info->attrs[idx]);
+
+		/* a level is mandatory for every setting but "automatic" */
+		if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
+		    (type != NL80211_TX_POWER_AUTOMATIC)) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+
+		if (type != NL80211_TX_POWER_AUTOMATIC) {
+			idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
+			mbm = nla_get_u32(info->attrs[idx]);
+		}
+
+		result = rdev->ops->set_tx_power(&rdev->wiphy, type, mbm);
+		if (result)
+			goto bad_res;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] &&
+	    info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) {
+		u32 tx_ant, rx_ant;
+		if ((!rdev->wiphy.available_antennas_tx &&
+		     !rdev->wiphy.available_antennas_rx) ||
+		    !rdev->ops->set_antenna) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]);
+		rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]);
+
+		/* reject antenna configurations which don't match the
+		 * available antenna masks, except for the "all" mask */
+		if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) ||
+		    (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+
+		tx_ant = tx_ant & rdev->wiphy.available_antennas_tx;
+		rx_ant = rx_ant & rdev->wiphy.available_antennas_rx;
+
+		result = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant);
+		if (result)
+			goto bad_res;
+	}
+
+	/* collect the wiphy parameters to change into one bitmask */
+	changed = 0;
+
+	if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
+		retry_short = nla_get_u8(
+			info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]);
+		if (retry_short == 0) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+		changed |= WIPHY_PARAM_RETRY_SHORT;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) {
+		retry_long = nla_get_u8(
+			info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]);
+		if (retry_long == 0) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+		changed |= WIPHY_PARAM_RETRY_LONG;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) {
+		frag_threshold = nla_get_u32(
+			info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]);
+		if (frag_threshold < 256) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+		if (frag_threshold != (u32) -1) {
+			/*
+			 * Fragments (apart from the last one) are required to
+			 * have even length. Make the fragmentation code
+			 * simpler by stripping LSB should someone try to use
+			 * odd threshold value.
+			 */
+			frag_threshold &= ~0x1;
+		}
+		changed |= WIPHY_PARAM_FRAG_THRESHOLD;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) {
+		rts_threshold = nla_get_u32(
+			info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]);
+		changed |= WIPHY_PARAM_RTS_THRESHOLD;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) {
+		coverage_class = nla_get_u8(
+			info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]);
+		changed |= WIPHY_PARAM_COVERAGE_CLASS;
+	}
+
+	if (changed) {
+		u8 old_retry_short, old_retry_long;
+		u32 old_frag_threshold, old_rts_threshold;
+		u8 old_coverage_class;
+
+		if (!rdev->ops->set_wiphy_params) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		/* remember the current values so a driver failure can be undone */
+		old_retry_short = rdev->wiphy.retry_short;
+		old_retry_long = rdev->wiphy.retry_long;
+		old_frag_threshold = rdev->wiphy.frag_threshold;
+		old_rts_threshold = rdev->wiphy.rts_threshold;
+		old_coverage_class = rdev->wiphy.coverage_class;
+
+		if (changed & WIPHY_PARAM_RETRY_SHORT)
+			rdev->wiphy.retry_short = retry_short;
+		if (changed & WIPHY_PARAM_RETRY_LONG)
+			rdev->wiphy.retry_long = retry_long;
+		if (changed & WIPHY_PARAM_FRAG_THRESHOLD)
+			rdev->wiphy.frag_threshold = frag_threshold;
+		if (changed & WIPHY_PARAM_RTS_THRESHOLD)
+			rdev->wiphy.rts_threshold = rts_threshold;
+		if (changed & WIPHY_PARAM_COVERAGE_CLASS)
+			rdev->wiphy.coverage_class = coverage_class;
+
+		result = rdev->ops->set_wiphy_params(&rdev->wiphy, changed);
+		if (result) {
+			rdev->wiphy.retry_short = old_retry_short;
+			rdev->wiphy.retry_long = old_retry_long;
+			rdev->wiphy.frag_threshold = old_frag_threshold;
+			rdev->wiphy.rts_threshold = old_rts_threshold;
+			rdev->wiphy.coverage_class = old_coverage_class;
+		}
+	}
+
+ bad_res:
+	mutex_unlock(&rdev->mtx);
+	if (netdev)
+		dev_put(netdev);
+	return result;
+}
+
+
+/*
+ * Append one NL80211_CMD_NEW_INTERFACE message describing @dev to @msg:
+ * ifindex, owning wiphy index, interface name, interface type and a
+ * generation counter.
+ *
+ * Returns the value of genlmsg_end() on success, -1 if the header could
+ * not be added and -EMSGSIZE if an attribute did not fit (the partial
+ * message is cancelled).
+ */
+static int nl80211_send_iface(struct sk_buff *msg, u32 pid, u32 seq, int flags,
+			      struct cfg80211_registered_device *rdev,
+			      struct net_device *dev)
+{
+	u32 generation;
+	void *hdr;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_INTERFACE);
+	if (!hdr)
+		return -1;
+
+	/* mixes the per-wiphy list generation with the global one */
+	generation = rdev->devlist_generation ^
+		     (cfg80211_rdev_list_generation << 2);
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_STRING(msg, NL80211_ATTR_IFNAME, dev->name);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFTYPE, dev->ieee80211_ptr->iftype);
+	NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, generation);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback: emit one NL80211_CMD_NEW_INTERFACE message for
+ * every interface of every wiphy in the requesting socket's network
+ * namespace.
+ *
+ * cb->args[0] and cb->args[1] hold the wiphy index and the interface
+ * index within that wiphy at which the next pass has to resume.
+ */
+static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int wp_idx = 0;
+	int if_idx = 0;
+	int wp_start = cb->args[0];
+	int if_start = cb->args[1];
+	struct cfg80211_registered_device *rdev;
+	struct wireless_dev *wdev;
+
+	mutex_lock(&cfg80211_mutex);
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
+			continue;
+		if (wp_idx < wp_start) {
+			wp_idx++;
+			continue;
+		}
+		if_idx = 0;
+
+		mutex_lock(&rdev->devlist_mtx);
+		list_for_each_entry(wdev, &rdev->netdev_list, list) {
+			if (if_idx < if_start) {
+				if_idx++;
+				continue;
+			}
+			if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).pid,
+					       cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					       rdev, wdev->netdev) < 0) {
+				mutex_unlock(&rdev->devlist_mtx);
+				goto out;
+			}
+			if_idx++;
+		}
+		mutex_unlock(&rdev->devlist_mtx);
+
+		/*
+		 * The interface offset belongs to the wiphy the previous
+		 * pass stopped in.  Every following wiphy must be dumped
+		 * from its first interface, otherwise its first if_start
+		 * interfaces would be silently skipped.
+		 */
+		if_start = 0;
+
+		wp_idx++;
+	}
+ out:
+	mutex_unlock(&cfg80211_mutex);
+
+	cb->args[0] = wp_idx;
+	cb->args[1] = if_idx;
+
+	return skb->len;
+}
+
+/*
+ * Handler replying with the description of the single interface found in
+ * info->user_ptr[1].  Returns -ENOMEM if no reply skb could be allocated
+ * and -ENOBUFS if the description could not be built.
+ */
+static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *netdev = info->user_ptr[1];
+	struct sk_buff *reply;
+	int ret;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	ret = nl80211_send_iface(reply, info->snd_pid, info->snd_seq, 0,
+				 rdev, netdev);
+	if (ret >= 0)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return -ENOBUFS;
+}
+
+/*
+ * Attribute policy for the nested monitor flags; every flag is a
+ * presence-only attribute.  Used by parse_monitor_flags().
+ */
+static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
+	[NL80211_MNTR_FLAG_FCSFAIL] = { .type = NLA_FLAG },
+	[NL80211_MNTR_FLAG_PLCPFAIL] = { .type = NLA_FLAG },
+	[NL80211_MNTR_FLAG_CONTROL] = { .type = NLA_FLAG },
+	[NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG },
+	[NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG },
+};
+
+/*
+ * Convert the nested monitor flag attributes in @nla into a bitmask in
+ * *@mntrflags, with bit N set when flag attribute N is present.
+ *
+ * *@mntrflags is always written (0 on failure).  Returns -EINVAL when
+ * @nla is NULL or cannot be parsed, 0 otherwise.
+ */
+static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
+{
+	struct nlattr *tb[NL80211_MNTR_FLAG_MAX + 1];
+	u32 mask = 0;
+	int bit;
+
+	*mntrflags = 0;
+
+	if (!nla)
+		return -EINVAL;
+
+	if (nla_parse_nested(tb, NL80211_MNTR_FLAG_MAX, nla,
+			     mntr_flags_policy) != 0)
+		return -EINVAL;
+
+	for (bit = 1; bit <= NL80211_MNTR_FLAG_MAX; bit++) {
+		if (!tb[bit])
+			continue;
+		mask |= (1<<bit);
+	}
+
+	*mntrflags = mask;
+	return 0;
+}
+
+/*
+ * Check whether the requested 4-address setting is acceptable.
+ *
+ * Turning 4-address mode off is refused with -EBUSY while @netdev is a
+ * bridge port.  Turning it on is only accepted for AP_VLAN and STATION
+ * interfaces when the wiphy advertises the matching capability flag;
+ * everything else yields -EOPNOTSUPP.  @netdev may be NULL.
+ */
+static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev, u8 use_4addr,
+			       enum nl80211_iftype iftype)
+{
+	u32 wiphy_flags = rdev->wiphy.flags;
+
+	if (!use_4addr) {
+		bool bridged = netdev &&
+			       (netdev->priv_flags & IFF_BRIDGE_PORT);
+
+		return bridged ? -EBUSY : 0;
+	}
+
+	if (iftype == NL80211_IFTYPE_AP_VLAN &&
+	    (wiphy_flags & WIPHY_FLAG_4ADDR_AP))
+		return 0;
+
+	if (iftype == NL80211_IFTYPE_STATION &&
+	    (wiphy_flags & WIPHY_FLAG_4ADDR_STATION))
+		return 0;
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Handler changing properties of an existing interface: its type, mesh
+ * ID, 4-address mode and monitor flags.
+ *
+ * cfg80211_change_iface() is only invoked when the type differs from the
+ * current one or when 4-address mode / monitor flags were supplied.  On
+ * success an explicitly requested 4-address setting is recorded in the
+ * wireless_dev.
+ */
+static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct nlattr *mesh_id = info->attrs[NL80211_ATTR_MESH_ID];
+	enum nl80211_iftype otype, ntype;
+	struct vif_params params;
+	u32 mntr_flags, *flags = NULL;
+	bool change = false;
+	int err;
+
+	memset(&params, 0, sizeof(params));
+
+	otype = wdev->iftype;
+	ntype = otype;
+
+	if (info->attrs[NL80211_ATTR_IFTYPE]) {
+		ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
+		if (ntype > NL80211_IFTYPE_MAX)
+			return -EINVAL;
+		change = (ntype != otype);
+	}
+
+	if (mesh_id) {
+		if (ntype != NL80211_IFTYPE_MESH_POINT)
+			return -EINVAL;
+		if (netif_running(dev))
+			return -EBUSY;
+
+		/* the mesh ID is stored in the SSID buffer of the wdev */
+		wdev_lock(wdev);
+		BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
+			     IEEE80211_MAX_MESH_ID_LEN);
+		wdev->mesh_id_up_len = nla_len(mesh_id);
+		memcpy(wdev->ssid, nla_data(mesh_id), wdev->mesh_id_up_len);
+		wdev_unlock(wdev);
+	}
+
+	if (!info->attrs[NL80211_ATTR_4ADDR]) {
+		/* -1 marks "4-address mode not specified" */
+		params.use_4addr = -1;
+	} else {
+		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
+		change = true;
+		err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
+		if (err)
+			return err;
+	}
+
+	if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
+		if (ntype != NL80211_IFTYPE_MONITOR)
+			return -EINVAL;
+
+		err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
+					  &mntr_flags);
+		if (err)
+			return err;
+
+		flags = &mntr_flags;
+		change = true;
+	}
+
+	err = change ?
+		cfg80211_change_iface(rdev, dev, ntype, flags, &params) : 0;
+
+	if (!err && params.use_4addr != -1)
+		dev->ieee80211_ptr->use_4addr = params.use_4addr;
+
+	return err;
+}
+
+/*
+ * Handler creating a new virtual interface on the wiphy in
+ * info->user_ptr[0].
+ *
+ * NL80211_ATTR_IFNAME is mandatory.  The interface type defaults to
+ * NL80211_IFTYPE_UNSPECIFIED and must be one the wiphy lists in
+ * interface_modes.  Monitor flags are only considered for monitor
+ * interfaces; when they are absent or unparsable the driver is handed a
+ * NULL flags pointer.  For mesh points an optional mesh ID is stored in
+ * the new wireless_dev.
+ */
+static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
+	struct nlattr *mntr_attr = NULL;
+	struct nlattr *mesh_id;
+	struct wireless_dev *wdev;
+	struct vif_params params;
+	struct net_device *dev;
+	u32 flags;
+	int err;
+
+	memset(&params, 0, sizeof(params));
+
+	if (!info->attrs[NL80211_ATTR_IFNAME])
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_IFTYPE]) {
+		type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
+		if (type > NL80211_IFTYPE_MAX)
+			return -EINVAL;
+	}
+
+	if (!rdev->ops->add_virtual_intf)
+		return -EOPNOTSUPP;
+	if (!(rdev->wiphy.interface_modes & (1 << type)))
+		return -EOPNOTSUPP;
+
+	if (info->attrs[NL80211_ATTR_4ADDR]) {
+		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
+		err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
+		if (err)
+			return err;
+	}
+
+	if (type == NL80211_IFTYPE_MONITOR)
+		mntr_attr = info->attrs[NL80211_ATTR_MNTR_FLAGS];
+
+	/* a parse failure is not fatal: the driver just gets no flags */
+	err = parse_monitor_flags(mntr_attr, &flags);
+	dev = rdev->ops->add_virtual_intf(&rdev->wiphy,
+		nla_data(info->attrs[NL80211_ATTR_IFNAME]),
+		type, err ? NULL : &flags, &params);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	mesh_id = info->attrs[NL80211_ATTR_MESH_ID];
+	if (type != NL80211_IFTYPE_MESH_POINT || !mesh_id)
+		return 0;
+
+	wdev = dev->ieee80211_ptr;
+
+	wdev_lock(wdev);
+	BUILD_BUG_ON(IEEE80211_MAX_SSID_LEN !=
+		     IEEE80211_MAX_MESH_ID_LEN);
+	wdev->mesh_id_up_len = nla_len(mesh_id);
+	memcpy(wdev->ssid, nla_data(mesh_id), wdev->mesh_id_up_len);
+	wdev_unlock(wdev);
+
+	return 0;
+}
+
+/*
+ * Handler removing a virtual interface through the driver's
+ * del_virtual_intf operation; -EOPNOTSUPP if the driver has none.
+ */
+static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+
+	if (rdev->ops->del_virtual_intf)
+		return rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
+
+	return -EOPNOTSUPP;
+}
+
+/* Context passed from nl80211_get_key() to get_key_callback(). */
+struct get_key_cookie {
+	struct sk_buff *msg;	/* reply message the callback appends to */
+	int error;		/* set to 1 by the callback if an attribute did not fit */
+	int idx;		/* key index that was requested */
+};
+
+/*
+ * Callback handed to the driver's get_key operation: append the key
+ * material in @params to the reply message kept in the cookie @c.
+ *
+ * The data is emitted twice: as top-level NL80211_ATTR_KEY_* attributes
+ * and again inside a nested NL80211_ATTR_KEY container that uses the
+ * NL80211_KEY_* attribute ids.  If anything does not fit, cookie->error
+ * is set to 1 for the caller to inspect.
+ */
+static void get_key_callback(void *c, struct key_params *params)
+{
+	struct nlattr *key;
+	struct get_key_cookie *cookie = c;
+
+	if (params->key)
+		NLA_PUT(cookie->msg, NL80211_ATTR_KEY_DATA,
+			params->key_len, params->key);
+
+	if (params->seq)
+		NLA_PUT(cookie->msg, NL80211_ATTR_KEY_SEQ,
+			params->seq_len, params->seq);
+
+	if (params->cipher)
+		NLA_PUT_U32(cookie->msg, NL80211_ATTR_KEY_CIPHER,
+			    params->cipher);
+
+	key = nla_nest_start(cookie->msg, NL80211_ATTR_KEY);
+	if (!key)
+		goto nla_put_failure;
+
+	if (params->key)
+		NLA_PUT(cookie->msg, NL80211_KEY_DATA,
+			params->key_len, params->key);
+
+	if (params->seq)
+		NLA_PUT(cookie->msg, NL80211_KEY_SEQ,
+			params->seq_len, params->seq);
+
+	if (params->cipher)
+		NLA_PUT_U32(cookie->msg, NL80211_KEY_CIPHER,
+			    params->cipher);
+
+	/*
+	 * Inside the nest the NL80211_KEY_* ids apply, like for the
+	 * attributes above; the top-level NL80211_ATTR_KEY_IDX id would
+	 * be read as a different nested attribute.
+	 */
+	NLA_PUT_U8(cookie->msg, NL80211_KEY_IDX, cookie->idx);
+
+	nla_nest_end(cookie->msg, key);
+
+	return;
+ nla_put_failure:
+	cookie->error = 1;
+}
+
+/*
+ * Handler querying a key from the driver and replying with an
+ * NL80211_CMD_NEW_KEY message filled in by get_key_callback().
+ *
+ * The key index defaults to 0 and must not exceed 5.  Without an
+ * explicit NL80211_ATTR_KEY_TYPE the key is treated as pairwise exactly
+ * when a MAC address was supplied.
+ *
+ * Returns -EINVAL for a bad index or key type, -EOPNOTSUPP if the driver
+ * has no get_key, -ENOMEM / -ENOBUFS on message allocation or space
+ * problems, -ENOENT for a pairwise key with a MAC address on a wiphy
+ * without WIPHY_FLAG_IBSS_RSN, or the driver's error.  The reply skb is
+ * released on every failure after it has been allocated.
+ */
+static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	int err;
+	struct net_device *dev = info->user_ptr[1];
+	u8 key_idx = 0;
+	const u8 *mac_addr = NULL;
+	bool pairwise;
+	struct get_key_cookie cookie = {
+		.error = 0,
+	};
+	void *hdr;
+	struct sk_buff *msg;
+
+	if (info->attrs[NL80211_ATTR_KEY_IDX])
+		key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);
+
+	if (key_idx > 5)
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_MAC])
+		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	pairwise = !!mac_addr;
+	if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+		u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+		if (kt >= NUM_NL80211_KEYTYPES)
+			return -EINVAL;
+		if (kt != NL80211_KEYTYPE_GROUP &&
+		    kt != NL80211_KEYTYPE_PAIRWISE)
+			return -EINVAL;
+		pairwise = kt == NL80211_KEYTYPE_PAIRWISE;
+	}
+
+	if (!rdev->ops->get_key)
+		return -EOPNOTSUPP;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	/*
+	 * nl80211hdr_put() signals failure with NULL (the other callers
+	 * in this file test it that way), so IS_ERR() would never catch
+	 * it; msg must also be freed on this path.
+	 */
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_NEW_KEY);
+	if (!hdr)
+		goto nla_put_failure;
+
+	cookie.msg = msg;
+	cookie.idx = key_idx;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT_U8(msg, NL80211_ATTR_KEY_IDX, key_idx);
+	if (mac_addr)
+		NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr);
+
+	if (pairwise && mac_addr &&
+	    !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) {
+		/* go through the common exit so msg is not leaked */
+		err = -ENOENT;
+		goto free_msg;
+	}
+
+	err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, pairwise,
+				 mac_addr, &cookie, get_key_callback);
+
+	if (err)
+		goto free_msg;
+
+	if (cookie.error)
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+	err = -ENOBUFS;
+ free_msg:
+	nlmsg_free(msg);
+	return err;
+}
+
+/*
+ * Handler selecting the default key or the default management key of an
+ * interface.  Nothing else can be changed here: a request that sets
+ * neither "default" nor "default management" is rejected with -EINVAL,
+ * as is a negative key index.
+ *
+ * The driver call is made with the wireless_dev lock held; with
+ * CONFIG_CFG80211_WEXT the chosen index is mirrored into the wext state.
+ */
+static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct key_parse key;
+	int err;
+
+	err = nl80211_parse_key(info, &key);
+	if (err)
+		return err;
+
+	if (key.idx < 0)
+		return -EINVAL;
+
+	/* only support setting default key */
+	if (!(key.def || key.defmgmt))
+		return -EINVAL;
+
+	wdev_lock(wdev);
+
+	if (!key.def) {
+		/* default management key: must be multicast-only */
+		if (key.def_uni || !key.def_multi) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (!rdev->ops->set_default_mgmt_key) {
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+
+		err = nl80211_key_allowed(wdev);
+		if (err)
+			goto out;
+
+		err = rdev->ops->set_default_mgmt_key(&rdev->wiphy,
+						      dev, key.idx);
+		if (err)
+			goto out;
+
+#ifdef CONFIG_CFG80211_WEXT
+		wdev->wext.default_mgmt_key = key.idx;
+#endif
+		goto out;
+	}
+
+	if (!rdev->ops->set_default_key) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	err = nl80211_key_allowed(wdev);
+	if (err)
+		goto out;
+
+	err = rdev->ops->set_default_key(&rdev->wiphy, dev, key.idx,
+					 key.def_uni, key.def_multi);
+	if (err)
+		goto out;
+
+#ifdef CONFIG_CFG80211_WEXT
+	wdev->wext.default_key = key.idx;
+#endif
+
+ out:
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+/*
+ * Handler installing a new key through the driver's add_key operation.
+ *
+ * Key material is mandatory.  When no key type was given it is derived
+ * from the presence of NL80211_ATTR_MAC (pairwise with a MAC address,
+ * group without).  Only pairwise and group keys are accepted for now.
+ */
+static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	const u8 *mac_addr = NULL;
+	struct key_parse key;
+	bool pairwise;
+	int err;
+
+	err = nl80211_parse_key(info, &key);
+	if (err)
+		return err;
+
+	if (!key.p.key)
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_MAC])
+		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	if (key.type == -1)
+		key.type = mac_addr ? NL80211_KEYTYPE_PAIRWISE :
+				      NL80211_KEYTYPE_GROUP;
+
+	/* for now */
+	if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+	    key.type != NL80211_KEYTYPE_GROUP)
+		return -EINVAL;
+
+	if (!rdev->ops->add_key)
+		return -EOPNOTSUPP;
+
+	pairwise = key.type == NL80211_KEYTYPE_PAIRWISE;
+
+	if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
+					   pairwise, mac_addr))
+		return -EINVAL;
+
+	wdev_lock(wdev);
+	err = nl80211_key_allowed(wdev);
+	if (!err)
+		err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx,
+					 pairwise, mac_addr, &key.p);
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+/*
+ * Handler removing a key through the driver's del_key operation.
+ *
+ * When no key type was given it is derived from the presence of
+ * NL80211_ATTR_MAC.  A pairwise key with a MAC address is reported as
+ * -ENOENT on a wiphy without WIPHY_FLAG_IBSS_RSN.  With
+ * CONFIG_CFG80211_WEXT, a successfully deleted key that was the wext
+ * default (or default management) key resets that index to -1.
+ */
+static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct key_parse key;
+	u8 *mac_addr = NULL;
+	bool pairwise;
+	int err;
+
+	err = nl80211_parse_key(info, &key);
+	if (err)
+		return err;
+
+	if (info->attrs[NL80211_ATTR_MAC])
+		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	if (key.type == -1)
+		key.type = mac_addr ? NL80211_KEYTYPE_PAIRWISE :
+				      NL80211_KEYTYPE_GROUP;
+
+	/* for now */
+	if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+	    key.type != NL80211_KEYTYPE_GROUP)
+		return -EINVAL;
+
+	if (!rdev->ops->del_key)
+		return -EOPNOTSUPP;
+
+	pairwise = key.type == NL80211_KEYTYPE_PAIRWISE;
+
+	wdev_lock(wdev);
+	err = nl80211_key_allowed(wdev);
+
+	/* takes precedence over whatever nl80211_key_allowed() said */
+	if (pairwise && mac_addr &&
+	    !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+		err = -ENOENT;
+
+	if (!err)
+		err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx,
+					 pairwise, mac_addr);
+
+#ifdef CONFIG_CFG80211_WEXT
+	if (!err) {
+		if (key.idx == wdev->wext.default_key)
+			wdev->wext.default_key = -1;
+		else if (key.idx == wdev->wext.default_mgmt_key)
+			wdev->wext.default_mgmt_key = -1;
+	}
+#endif
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+/*
+ * Shared netlink handler for NL80211_CMD_NEW_BEACON and
+ * NL80211_CMD_SET_BEACON; dispatches to the driver's add_beacon or
+ * set_beacon op respectively.
+ *
+ * Only AP and P2P GO interfaces are accepted.  NEW_BEACON requires the
+ * beacon interval, DTIM period and beacon head attributes; both commands
+ * require at least one of beacon head/tail.  On success with a non-zero
+ * interval, the interval is recorded in the wireless_dev.
+ */
+static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
+{
+	int (*call)(struct wiphy *wiphy, struct net_device *dev,
+		    struct beacon_parameters *bcn);
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct nlattr *head_attr = info->attrs[NL80211_ATTR_BEACON_HEAD];
+	struct nlattr *tail_attr = info->attrs[NL80211_ATTR_BEACON_TAIL];
+	struct beacon_parameters params;
+	int err;
+
+	if (!is_valid_ie_attr(tail_attr))
+		return -EINVAL;
+
+	if (wdev->iftype != NL80211_IFTYPE_AP &&
+	    wdev->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
+
+	memset(&params, 0, sizeof(params));
+
+	switch (info->genlhdr->cmd) {
+	case NL80211_CMD_NEW_BEACON:
+		/* these are required for NEW_BEACON */
+		if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] ||
+		    !info->attrs[NL80211_ATTR_DTIM_PERIOD] ||
+		    !head_attr)
+			return -EINVAL;
+
+		params.interval =
+			nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
+		params.dtim_period =
+			nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
+
+		err = cfg80211_validate_beacon_int(rdev, params.interval);
+		if (err)
+			return err;
+
+		call = rdev->ops->add_beacon;
+		break;
+	case NL80211_CMD_SET_BEACON:
+		call = rdev->ops->set_beacon;
+		break;
+	default:
+		WARN_ON(1);
+		return -EOPNOTSUPP;
+	}
+
+	if (!call)
+		return -EOPNOTSUPP;
+
+	/* at least one of head/tail must be supplied */
+	if (!head_attr && !tail_attr)
+		return -EINVAL;
+
+	if (head_attr) {
+		params.head = nla_data(head_attr);
+		params.head_len = nla_len(head_attr);
+	}
+
+	if (tail_attr) {
+		params.tail = nla_data(tail_attr);
+		params.tail_len = nla_len(tail_attr);
+	}
+
+	err = call(&rdev->wiphy, dev, &params);
+	if (err)
+		return err;
+
+	if (params.interval)
+		wdev->beacon_interval = params.interval;
+	return 0;
+}
+
+/*
+ * Netlink handler that removes the beacon through the driver's del_beacon
+ * op.  Only AP and P2P GO interfaces are accepted; on success the cached
+ * beacon interval in the wireless_dev is reset to 0.
+ */
+static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int ret;
+
+	if (!rdev->ops->del_beacon)
+		return -EOPNOTSUPP;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	ret = rdev->ops->del_beacon(&rdev->wiphy, dev);
+	if (ret)
+		return ret;
+
+	wdev->beacon_interval = 0;
+	return 0;
+}
+
+/*
+ * Policy for the nested NL80211_ATTR_STA_FLAGS attribute as parsed by
+ * parse_station_flags(): every listed station flag is a plain NLA_FLAG.
+ */
+static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
+	[NL80211_STA_FLAG_AUTHORIZED] = { .type = NLA_FLAG },
+	[NL80211_STA_FLAG_SHORT_PREAMBLE] = { .type = NLA_FLAG },
+	[NL80211_STA_FLAG_WME] = { .type = NLA_FLAG },
+	[NL80211_STA_FLAG_MFP] = { .type = NLA_FLAG },
+	[NL80211_STA_FLAG_AUTHENTICATED] = { .type = NLA_FLAG },
+};
+
+/*
+ * Fill params->sta_flags_mask / params->sta_flags_set from the request.
+ *
+ * NL80211_ATTR_STA_FLAGS2 (a mask/set pair) takes precedence when present;
+ * otherwise the older nested NL80211_ATTR_STA_FLAGS attribute is used, in
+ * which case the mask covers every flag bit except bit 0 and a flag is set
+ * when its nested attribute is present.  With neither attribute the params
+ * are left untouched.
+ *
+ * Returns 0 on success or -EINVAL for an invalid flag bit or a nested
+ * attribute that fails to parse.
+ */
+static int parse_station_flags(struct genl_info *info,
+			       struct station_parameters *params)
+{
+	struct nlattr *tb[NL80211_STA_FLAG_MAX + 1];
+	struct nlattr *attr;
+	int i;
+
+	/*
+	 * Try parsing the new attribute first so userspace
+	 * can specify both for older kernels.
+	 */
+	attr = info->attrs[NL80211_ATTR_STA_FLAGS2];
+	if (attr) {
+		struct nl80211_sta_flag_update *upd = nla_data(attr);
+
+		params->sta_flags_mask = upd->mask;
+		params->sta_flags_set = upd->set;
+
+		/* the "invalid" bit must appear in neither word */
+		if ((params->sta_flags_mask &
+		     BIT(__NL80211_STA_FLAG_INVALID)) ||
+		    (params->sta_flags_set &
+		     BIT(__NL80211_STA_FLAG_INVALID)))
+			return -EINVAL;
+		return 0;
+	}
+
+	/* if present, parse the old attribute */
+	attr = info->attrs[NL80211_ATTR_STA_FLAGS];
+	if (!attr)
+		return 0;
+
+	if (nla_parse_nested(tb, NL80211_STA_FLAG_MAX, attr,
+			     sta_flags_policy))
+		return -EINVAL;
+
+	/* all flag bits below AFTER_LAST, with bit 0 cleared */
+	params->sta_flags_mask =
+		((1 << __NL80211_STA_FLAG_AFTER_LAST) - 1) & ~1;
+
+	for (i = 1; i <= NL80211_STA_FLAG_MAX; i++) {
+		if (!tb[i])
+			continue;
+		params->sta_flags_set |= (1 << i);
+	}
+
+	return 0;
+}
+
+/*
+ * Append a nested rate attribute of type @attr to @msg describing @info.
+ *
+ * The bitrate is only emitted when cfg80211_calculate_bitrate() yields a
+ * non-zero value; MCS index, 40 MHz width and short-GI are emitted according
+ * to info->flags.
+ *
+ * Returns true on success, false if the nest could not be started or the
+ * nla_put_failure label was reached.
+ * NOTE(review): the NLA_PUT_* macros are defined elsewhere; they presumably
+ * jump to nla_put_failure when the message runs out of room - confirm.
+ */
+static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
+				 int attr)
+{
+	struct nlattr *rate;
+	u16 bitrate;
+
+	rate = nla_nest_start(msg, attr);
+	if (!rate)
+		goto nla_put_failure;
+
+	/* cfg80211_calculate_bitrate will return 0 for mcs >= 32 */
+	bitrate = cfg80211_calculate_bitrate(info);
+	if (bitrate > 0)
+		NLA_PUT_U16(msg, NL80211_RATE_INFO_BITRATE, bitrate);
+
+	if (info->flags & RATE_INFO_FLAGS_MCS)
+		NLA_PUT_U8(msg, NL80211_RATE_INFO_MCS, info->mcs);
+	if (info->flags & RATE_INFO_FLAGS_40_MHZ_WIDTH)
+		NLA_PUT_FLAG(msg, NL80211_RATE_INFO_40_MHZ_WIDTH);
+	if (info->flags & RATE_INFO_FLAGS_SHORT_GI)
+		NLA_PUT_FLAG(msg, NL80211_RATE_INFO_SHORT_GI);
+
+	nla_nest_end(msg, rate);
+	return true;
+
+nla_put_failure:
+	return false;
+}
+
+/*
+ * Build an NL80211_CMD_NEW_STATION message in @msg describing the station
+ * @mac_addr on @dev.
+ *
+ * Interface index, MAC address and generation are always emitted.  Inside
+ * the nested NL80211_ATTR_STA_INFO attribute each field of @sinfo is emitted
+ * only when its bit is set in sinfo->filled.  Association request IEs, when
+ * flagged, are added after the nest as NL80211_ATTR_IE.
+ *
+ * Returns the result of genlmsg_end() on success, -1 if the message header
+ * could not be added, or -EMSGSIZE (after cancelling the message) when the
+ * nla_put_failure label is reached.
+ * NOTE(review): the NLA_PUT* macros are defined elsewhere; they presumably
+ * jump to nla_put_failure when the message runs out of room - confirm.
+ */
+static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
+				int flags, struct net_device *dev,
+				const u8 *mac_addr, struct station_info *sinfo)
+{
+	void *hdr;
+	struct nlattr *sinfoattr, *bss_param;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_STATION);
+	if (!hdr)
+		return -1;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr);
+
+	NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, sinfo->generation);
+
+	/* per-station statistics live in a nested attribute */
+	sinfoattr = nla_nest_start(msg, NL80211_ATTR_STA_INFO);
+	if (!sinfoattr)
+		goto nla_put_failure;
+	if (sinfo->filled & STATION_INFO_CONNECTED_TIME)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_CONNECTED_TIME,
+			    sinfo->connected_time);
+	if (sinfo->filled & STATION_INFO_INACTIVE_TIME)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_INACTIVE_TIME,
+			    sinfo->inactive_time);
+	if (sinfo->filled & STATION_INFO_RX_BYTES)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_RX_BYTES,
+			    sinfo->rx_bytes);
+	if (sinfo->filled & STATION_INFO_TX_BYTES)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_BYTES,
+			    sinfo->tx_bytes);
+	if (sinfo->filled & STATION_INFO_LLID)
+		NLA_PUT_U16(msg, NL80211_STA_INFO_LLID,
+			    sinfo->llid);
+	if (sinfo->filled & STATION_INFO_PLID)
+		NLA_PUT_U16(msg, NL80211_STA_INFO_PLID,
+			    sinfo->plid);
+	if (sinfo->filled & STATION_INFO_PLINK_STATE)
+		NLA_PUT_U8(msg, NL80211_STA_INFO_PLINK_STATE,
+			    sinfo->plink_state);
+	if (sinfo->filled & STATION_INFO_SIGNAL)
+		NLA_PUT_U8(msg, NL80211_STA_INFO_SIGNAL,
+			   sinfo->signal);
+	if (sinfo->filled & STATION_INFO_SIGNAL_AVG)
+		NLA_PUT_U8(msg, NL80211_STA_INFO_SIGNAL_AVG,
+			   sinfo->signal_avg);
+	if (sinfo->filled & STATION_INFO_TX_BITRATE) {
+		if (!nl80211_put_sta_rate(msg, &sinfo->txrate,
+					  NL80211_STA_INFO_TX_BITRATE))
+			goto nla_put_failure;
+	}
+	if (sinfo->filled & STATION_INFO_RX_BITRATE) {
+		if (!nl80211_put_sta_rate(msg, &sinfo->rxrate,
+					  NL80211_STA_INFO_RX_BITRATE))
+			goto nla_put_failure;
+	}
+	if (sinfo->filled & STATION_INFO_RX_PACKETS)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS,
+			    sinfo->rx_packets);
+	if (sinfo->filled & STATION_INFO_TX_PACKETS)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
+			    sinfo->tx_packets);
+	if (sinfo->filled & STATION_INFO_TX_RETRIES)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_RETRIES,
+			    sinfo->tx_retries);
+	if (sinfo->filled & STATION_INFO_TX_FAILED)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_FAILED,
+			    sinfo->tx_failed);
+	if (sinfo->filled & STATION_INFO_BSS_PARAM) {
+		/* BSS parameters form a second-level nest inside STA_INFO */
+		bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
+		if (!bss_param)
+			goto nla_put_failure;
+
+		if (sinfo->bss_param.flags & BSS_PARAM_FLAGS_CTS_PROT)
+			NLA_PUT_FLAG(msg, NL80211_STA_BSS_PARAM_CTS_PROT);
+		if (sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_PREAMBLE)
+			NLA_PUT_FLAG(msg, NL80211_STA_BSS_PARAM_SHORT_PREAMBLE);
+		if (sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_SLOT_TIME)
+			NLA_PUT_FLAG(msg,
+				     NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME);
+		NLA_PUT_U8(msg, NL80211_STA_BSS_PARAM_DTIM_PERIOD,
+			   sinfo->bss_param.dtim_period);
+		NLA_PUT_U16(msg, NL80211_STA_BSS_PARAM_BEACON_INTERVAL,
+			    sinfo->bss_param.beacon_interval);
+
+		nla_nest_end(msg, bss_param);
+	}
+	nla_nest_end(msg, sinfoattr);
+
+	/* association request IEs go at the top level, not inside STA_INFO */
+	if (sinfo->filled & STATION_INFO_ASSOC_REQ_IES)
+		NLA_PUT(msg, NL80211_ATTR_IE, sinfo->assoc_req_ies_len,
+			sinfo->assoc_req_ies);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback that emits one station message per station reported
+ * by the driver's dump_station op.
+ *
+ * The index of the next station to report is kept in cb->args[1] between
+ * invocations.  Iteration stops when the driver returns -ENOENT or when a
+ * station no longer fits into @skb; in both cases skb->len is returned.
+ * Any other driver error, or a missing dump_station op (-EOPNOTSUPP), is
+ * returned as is without updating cb->args[1].
+ */
+static int nl80211_dump_station(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	struct cfg80211_registered_device *dev;
+	struct net_device *netdev;
+	struct station_info sinfo;
+	u8 mac_addr[ETH_ALEN];
+	int sta_idx = cb->args[1];
+	int err;
+
+	err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+	if (err)
+		return err;
+
+	if (!dev->ops->dump_station) {
+		err = -EOPNOTSUPP;
+		goto out_err;
+	}
+
+	for (;; sta_idx++) {
+		memset(&sinfo, 0, sizeof(sinfo));
+		err = dev->ops->dump_station(&dev->wiphy, netdev, sta_idx,
+					     mac_addr, &sinfo);
+		if (err == -ENOENT)
+			break;		/* no more stations */
+		if (err)
+			goto out_err;
+
+		/* message full: resume from this station on the next call */
+		if (nl80211_send_station(skb, NETLINK_CB(cb->skb).pid,
+					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					 netdev, mac_addr, &sinfo) < 0)
+			break;
+	}
+
+	cb->args[1] = sta_idx;
+	err = skb->len;
+ out_err:
+	nl80211_finish_netdev_dump(dev);
+
+	return err;
+}
+
+/*
+ * Netlink handler that queries the driver's get_station op for the station
+ * named by NL80211_ATTR_MAC and replies with a station message.
+ *
+ * Returns -EINVAL without a MAC attribute, -EOPNOTSUPP without a
+ * get_station op, the driver's error, -ENOMEM if the reply cannot be
+ * allocated, -ENOBUFS if it cannot be filled, or the genlmsg_reply() result.
+ */
+static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];
+	struct station_info sinfo;
+	struct sk_buff *reply;
+	u8 *mac_addr;
+	int ret;
+
+	memset(&sinfo, 0, sizeof(sinfo));
+
+	if (!mac_attr)
+		return -EINVAL;
+	mac_addr = nla_data(mac_attr);
+
+	if (!rdev->ops->get_station)
+		return -EOPNOTSUPP;
+
+	ret = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo);
+	if (ret)
+		return ret;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	if (nl80211_send_station(reply, info->snd_pid, info->snd_seq, 0,
+				 dev, mac_addr, &sinfo) >= 0)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return -ENOBUFS;
+}
+
+/*
+ * Look up the VLAN interface named by NL80211_ATTR_STA_VLAN, checking that
+ * it is a wireless device on @rdev's wiphy and that it is running.
+ *
+ * *vlan is set to NULL when the attribute is absent.  When the lookup
+ * succeeded but a later check failed, *vlan is still set on return; the
+ * callers in this file release it with dev_put() on their exit path.
+ */
+static int get_vlan(struct genl_info *info,
+		    struct cfg80211_registered_device *rdev,
+		    struct net_device **vlan)
+{
+	struct nlattr *vlanattr = info->attrs[NL80211_ATTR_STA_VLAN];
+	struct net_device *ndev;
+
+	*vlan = NULL;
+	if (!vlanattr)
+		return 0;
+
+	ndev = dev_get_by_index(genl_info_net(info), nla_get_u32(vlanattr));
+	*vlan = ndev;
+	if (!ndev)
+		return -ENODEV;
+
+	if (!ndev->ieee80211_ptr)
+		return -EINVAL;
+	if (ndev->ieee80211_ptr->wiphy != &rdev->wiphy)
+		return -EINVAL;
+	if (!netif_running(ndev))
+		return -ENETDOWN;
+
+	return 0;
+}
+
+/*
+ * Netlink handler that changes an existing station through the driver's
+ * change_station op.
+ *
+ * The station is identified by NL80211_ATTR_MAC; NL80211_ATTR_STA_AID must
+ * not be present.  Which of the optional parameters are acceptable depends
+ * on the interface type (see the switch below); anything else yields
+ * -EINVAL.  A VLAN reference obtained through get_vlan() is dropped before
+ * returning.
+ */
+static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr **attrs = info->attrs;
+	struct station_parameters params;
+	u8 *mac_addr;
+	int err;
+
+	memset(&params, 0, sizeof(params));
+
+	/* -1 marks "not supplied" */
+	params.listen_interval = -1;
+	params.plink_state = -1;
+
+	if (attrs[NL80211_ATTR_STA_AID])
+		return -EINVAL;
+
+	if (!attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
+
+	mac_addr = nla_data(attrs[NL80211_ATTR_MAC]);
+
+	if (attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) {
+		params.supported_rates =
+			nla_data(attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+		params.supported_rates_len =
+			nla_len(attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+	}
+
+	if (attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
+		params.listen_interval =
+			nla_get_u16(attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
+
+	if (attrs[NL80211_ATTR_HT_CAPABILITY])
+		params.ht_capa = nla_data(attrs[NL80211_ATTR_HT_CAPABILITY]);
+
+	if (parse_station_flags(info, &params))
+		return -EINVAL;
+
+	if (attrs[NL80211_ATTR_STA_PLINK_ACTION])
+		params.plink_action =
+			nla_get_u8(attrs[NL80211_ATTR_STA_PLINK_ACTION]);
+
+	if (attrs[NL80211_ATTR_STA_PLINK_STATE])
+		params.plink_state =
+			nla_get_u8(attrs[NL80211_ATTR_STA_PLINK_STATE]);
+
+	err = get_vlan(info, rdev, &params.vlan);
+	if (err)
+		goto out;
+
+	/* validate settings (err is 0 at this point) */
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
+		/* disallow mesh-specific things */
+		if (params.plink_action)
+			err = -EINVAL;
+		break;
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_STATION:
+		/* disallow everything but AUTHORIZED flag */
+		if (params.plink_action ||
+		    params.vlan ||
+		    params.supported_rates ||
+		    params.ht_capa ||
+		    params.listen_interval >= 0 ||
+		    (params.sta_flags_mask &
+		     ~BIT(NL80211_STA_FLAG_AUTHORIZED)))
+			err = -EINVAL;
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		/* disallow things mesh doesn't support */
+		if (params.vlan ||
+		    params.ht_capa ||
+		    params.listen_interval >= 0 ||
+		    (params.sta_flags_mask &
+		     ~(BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+		       BIT(NL80211_STA_FLAG_MFP) |
+		       BIT(NL80211_STA_FLAG_AUTHORIZED))))
+			err = -EINVAL;
+		break;
+	default:
+		err = -EINVAL;
+	}
+
+	if (err)
+		goto out;
+
+	if (rdev->ops->change_station)
+		err = rdev->ops->change_station(&rdev->wiphy, dev, mac_addr,
+						&params);
+	else
+		err = -EOPNOTSUPP;
+
+ out:
+	if (params.vlan)
+		dev_put(params.vlan);
+
+	return err;
+}
+
+/*
+ * Netlink handler that adds a station through the driver's add_station op.
+ *
+ * MAC address, listen interval, supported rates and AID are mandatory; the
+ * AID must lie in 1..IEEE80211_MAX_AID.  Only AP, AP_VLAN, mesh point and
+ * P2P GO interfaces are accepted.  A VLAN reference obtained through
+ * get_vlan() is dropped before returning.
+ */
+static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr **attrs = info->attrs;
+	struct station_parameters params;
+	u8 *mac_addr;
+	int err;
+
+	memset(&params, 0, sizeof(params));
+
+	/* mandatory attributes */
+	if (!attrs[NL80211_ATTR_MAC] ||
+	    !attrs[NL80211_ATTR_STA_LISTEN_INTERVAL] ||
+	    !attrs[NL80211_ATTR_STA_SUPPORTED_RATES] ||
+	    !attrs[NL80211_ATTR_STA_AID])
+		return -EINVAL;
+
+	mac_addr = nla_data(attrs[NL80211_ATTR_MAC]);
+	params.supported_rates =
+		nla_data(attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+	params.supported_rates_len =
+		nla_len(attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
+	params.listen_interval =
+		nla_get_u16(attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
+
+	params.aid = nla_get_u16(attrs[NL80211_ATTR_STA_AID]);
+	if (!params.aid || params.aid > IEEE80211_MAX_AID)
+		return -EINVAL;
+
+	if (attrs[NL80211_ATTR_HT_CAPABILITY])
+		params.ht_capa = nla_data(attrs[NL80211_ATTR_HT_CAPABILITY]);
+
+	if (attrs[NL80211_ATTR_STA_PLINK_ACTION])
+		params.plink_action =
+			nla_get_u8(attrs[NL80211_ATTR_STA_PLINK_ACTION]);
+
+	if (parse_station_flags(info, &params))
+		return -EINVAL;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_P2P_GO:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	err = get_vlan(info, rdev, &params.vlan);
+	if (err)
+		goto out;
+
+	if (rdev->ops->add_station)
+		err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr,
+					     &params);
+	else
+		err = -EOPNOTSUPP;
+
+ out:
+	if (params.vlan)
+		dev_put(params.vlan);
+	return err;
+}
+
+/*
+ * Netlink handler that removes a station through the driver's del_station
+ * op.  The MAC address is optional; NULL is passed to the driver when
+ * NL80211_ATTR_MAC is absent.  Only AP, AP_VLAN, mesh point and P2P GO
+ * interfaces are accepted.
+ */
+static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];
+	u8 *mac_addr = mac_attr ? nla_data(mac_attr) : NULL;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_P2P_GO:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!rdev->ops->del_station)
+		return -EOPNOTSUPP;
+
+	return rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
+}
+
+/*
+ * Build a mesh path message in @msg describing the path to @dst via
+ * @next_hop on @dev.
+ *
+ * The message is tagged NL80211_CMD_NEW_MPATH: the payload is a mesh path
+ * (destination, next hop and NL80211_ATTR_MPATH_INFO), not a station, so it
+ * must not be sent as NL80211_CMD_NEW_STATION.
+ *
+ * Interface index, destination, next hop and generation are always emitted;
+ * inside the nested NL80211_ATTR_MPATH_INFO attribute each field of @pinfo
+ * is emitted only when its bit is set in pinfo->filled.
+ *
+ * Returns the result of genlmsg_end() on success, -1 if the message header
+ * could not be added, or -EMSGSIZE (after cancelling the message) when the
+ * nla_put_failure label is reached.
+ */
+static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
+				int flags, struct net_device *dev,
+				u8 *dst, u8 *next_hop,
+				struct mpath_info *pinfo)
+{
+	void *hdr;
+	struct nlattr *pinfoattr;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_MPATH);
+	if (!hdr)
+		return -1;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, dst);
+	NLA_PUT(msg, NL80211_ATTR_MPATH_NEXT_HOP, ETH_ALEN, next_hop);
+
+	NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, pinfo->generation);
+
+	/* per-path details live in a nested attribute */
+	pinfoattr = nla_nest_start(msg, NL80211_ATTR_MPATH_INFO);
+	if (!pinfoattr)
+		goto nla_put_failure;
+	if (pinfo->filled & MPATH_INFO_FRAME_QLEN)
+		NLA_PUT_U32(msg, NL80211_MPATH_INFO_FRAME_QLEN,
+			    pinfo->frame_qlen);
+	if (pinfo->filled & MPATH_INFO_SN)
+		NLA_PUT_U32(msg, NL80211_MPATH_INFO_SN,
+			    pinfo->sn);
+	if (pinfo->filled & MPATH_INFO_METRIC)
+		NLA_PUT_U32(msg, NL80211_MPATH_INFO_METRIC,
+			    pinfo->metric);
+	if (pinfo->filled & MPATH_INFO_EXPTIME)
+		NLA_PUT_U32(msg, NL80211_MPATH_INFO_EXPTIME,
+			    pinfo->exptime);
+	if (pinfo->filled & MPATH_INFO_FLAGS)
+		NLA_PUT_U8(msg, NL80211_MPATH_INFO_FLAGS,
+			    pinfo->flags);
+	if (pinfo->filled & MPATH_INFO_DISCOVERY_TIMEOUT)
+		NLA_PUT_U32(msg, NL80211_MPATH_INFO_DISCOVERY_TIMEOUT,
+			    pinfo->discovery_timeout);
+	if (pinfo->filled & MPATH_INFO_DISCOVERY_RETRIES)
+		NLA_PUT_U8(msg, NL80211_MPATH_INFO_DISCOVERY_RETRIES,
+			    pinfo->discovery_retries);
+
+	nla_nest_end(msg, pinfoattr);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink dump callback that emits one mesh path message per path reported
+ * by the driver's dump_mpath op.  Only mesh point interfaces are accepted.
+ *
+ * The index of the next path to report is kept in cb->args[1] between
+ * invocations.  Iteration stops when the driver returns -ENOENT or when a
+ * path no longer fits into @skb; in both cases skb->len is returned.  Any
+ * other driver error, or -EOPNOTSUPP for a missing op / wrong interface
+ * type, is returned as is without updating cb->args[1].
+ */
+static int nl80211_dump_mpath(struct sk_buff *skb,
+			      struct netlink_callback *cb)
+{
+	struct mpath_info pinfo;
+	struct cfg80211_registered_device *dev;
+	struct net_device *netdev;
+	u8 dst[ETH_ALEN];
+	u8 next_hop[ETH_ALEN];
+	int path_idx = cb->args[1];
+	int err;
+
+	err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+	if (err)
+		return err;
+
+	if (!dev->ops->dump_mpath) {
+		err = -EOPNOTSUPP;
+		goto out_err;
+	}
+
+	if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
+		err = -EOPNOTSUPP;
+		goto out_err;
+	}
+
+	while (1) {
+		/*
+		 * Start from a clean slate for every path, as the station
+		 * dump does, so that fields the driver leaves untouched
+		 * (notably ->filled and ->generation) are never read from
+		 * uninitialised stack memory.
+		 */
+		memset(&pinfo, 0, sizeof(pinfo));
+		err = dev->ops->dump_mpath(&dev->wiphy, netdev, path_idx,
+					   dst, next_hop, &pinfo);
+		if (err == -ENOENT)
+			break;
+		if (err)
+			goto out_err;
+
+		/* message full: resume from this path on the next call */
+		if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).pid,
+				       cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				       netdev, dst, next_hop,
+				       &pinfo) < 0)
+			goto out;
+
+		path_idx++;
+	}
+
+
+ out:
+	cb->args[1] = path_idx;
+	err = skb->len;
+ out_err:
+	nl80211_finish_netdev_dump(dev);
+	return err;
+}
+
+/*
+ * Netlink handler that queries the driver's get_mpath op for the mesh path
+ * to the destination in NL80211_ATTR_MAC and replies with a mesh path
+ * message.  Only mesh point interfaces are accepted.
+ *
+ * Returns -EINVAL without a MAC attribute, -EOPNOTSUPP without a get_mpath
+ * op or on a non-mesh interface, the driver's error, -ENOMEM if the reply
+ * cannot be allocated, -ENOBUFS if it cannot be filled, or the
+ * genlmsg_reply() result.
+ */
+static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];
+	struct mpath_info pinfo;
+	struct sk_buff *reply;
+	u8 next_hop[ETH_ALEN];
+	u8 *dst;
+	int ret;
+
+	memset(&pinfo, 0, sizeof(pinfo));
+
+	if (!mac_attr)
+		return -EINVAL;
+	dst = nla_data(mac_attr);
+
+	if (!rdev->ops->get_mpath)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
+
+	ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo);
+	if (ret)
+		return ret;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	if (nl80211_send_mpath(reply, info->snd_pid, info->snd_seq, 0,
+			       dev, dst, next_hop, &pinfo) >= 0)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return -ENOBUFS;
+}
+
+/*
+ * Netlink handler that changes the next hop of an existing mesh path via the
+ * driver's change_mpath op.  Both NL80211_ATTR_MAC (destination) and
+ * NL80211_ATTR_MPATH_NEXT_HOP are required; only mesh point interfaces are
+ * accepted.
+ */
+static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *dst_attr = info->attrs[NL80211_ATTR_MAC];
+	struct nlattr *hop_attr = info->attrs[NL80211_ATTR_MPATH_NEXT_HOP];
+	u8 *dst, *next_hop;
+
+	if (!dst_attr || !hop_attr)
+		return -EINVAL;
+
+	dst = nla_data(dst_attr);
+	next_hop = nla_data(hop_attr);
+
+	if (!rdev->ops->change_mpath)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
+
+	return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
+}
+
+/*
+ * Netlink handler that creates a mesh path via the driver's add_mpath op.
+ * Both NL80211_ATTR_MAC (destination) and NL80211_ATTR_MPATH_NEXT_HOP are
+ * required; only mesh point interfaces are accepted.
+ */
+static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *dst_attr = info->attrs[NL80211_ATTR_MAC];
+	struct nlattr *hop_attr = info->attrs[NL80211_ATTR_MPATH_NEXT_HOP];
+	u8 *dst, *next_hop;
+
+	if (!dst_attr || !hop_attr)
+		return -EINVAL;
+
+	dst = nla_data(dst_attr);
+	next_hop = nla_data(hop_attr);
+
+	if (!rdev->ops->add_mpath)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
+
+	return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
+}
+
+/*
+ * Netlink handler that deletes a mesh path via the driver's del_mpath op.
+ * The destination is optional; NULL is passed to the driver when
+ * NL80211_ATTR_MAC is absent.
+ */
+static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *dst_attr = info->attrs[NL80211_ATTR_MAC];
+	u8 *dst = dst_attr ? nla_data(dst_attr) : NULL;
+
+	if (!rdev->ops->del_mpath)
+		return -EOPNOTSUPP;
+
+	return rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
+}
+
+/*
+ * Netlink handler that updates BSS parameters through the driver's
+ * change_bss op.  Parameters absent from the request are passed as -1
+ * ("do not change"); basic rates are passed as NULL/0 when absent.  Only AP
+ * and P2P GO interfaces are accepted.
+ */
+static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr **attrs = info->attrs;
+	struct nlattr *rates = attrs[NL80211_ATTR_BSS_BASIC_RATES];
+	struct bss_parameters params;
+
+	memset(&params, 0, sizeof(params));
+
+	/* default to not changing parameters */
+	params.use_cts_prot = -1;
+	params.use_short_preamble = -1;
+	params.use_short_slot_time = -1;
+	params.ap_isolate = -1;
+	params.ht_opmode = -1;
+
+	if (attrs[NL80211_ATTR_BSS_CTS_PROT])
+		params.use_cts_prot =
+			nla_get_u8(attrs[NL80211_ATTR_BSS_CTS_PROT]);
+
+	if (attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE])
+		params.use_short_preamble =
+			nla_get_u8(attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]);
+
+	if (attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME])
+		params.use_short_slot_time =
+			nla_get_u8(attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]);
+
+	if (rates) {
+		params.basic_rates = nla_data(rates);
+		params.basic_rates_len = nla_len(rates);
+	}
+
+	/* normalised to 0/1 */
+	if (attrs[NL80211_ATTR_AP_ISOLATE])
+		params.ap_isolate =
+			!!nla_get_u8(attrs[NL80211_ATTR_AP_ISOLATE]);
+
+	if (attrs[NL80211_ATTR_BSS_HT_OPMODE])
+		params.ht_opmode =
+			nla_get_u16(attrs[NL80211_ATTR_BSS_HT_OPMODE]);
+
+	if (!rdev->ops->change_bss)
+		return -EOPNOTSUPP;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return rdev->ops->change_bss(&rdev->wiphy, dev, &params);
+}
+
+/*
+ * Policy for the attributes of a single regulatory rule, which
+ * parse_reg_rule() reads from an already-parsed table: all six attributes
+ * are 32-bit values.
+ */
+static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
+	[NL80211_ATTR_REG_RULE_FLAGS]		= { .type = NLA_U32 },
+	[NL80211_ATTR_FREQ_RANGE_START]		= { .type = NLA_U32 },
+	[NL80211_ATTR_FREQ_RANGE_END]		= { .type = NLA_U32 },
+	[NL80211_ATTR_FREQ_RANGE_MAX_BW]	= { .type = NLA_U32 },
+	[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]	= { .type = NLA_U32 },
+	[NL80211_ATTR_POWER_RULE_MAX_EIRP]	= { .type = NLA_U32 },
+};
+
+/*
+ * Fill @reg_rule from the parsed attribute table @tb.
+ *
+ * Rule flags, frequency range start/end, maximum bandwidth and maximum EIRP
+ * are mandatory (-EINVAL if any is missing).  The maximum antenna gain is
+ * optional and left untouched in @reg_rule when absent.
+ */
+static int parse_reg_rule(struct nlattr *tb[],
+	struct ieee80211_reg_rule *reg_rule)
+{
+	struct ieee80211_freq_range *range = &reg_rule->freq_range;
+	struct ieee80211_power_rule *power = &reg_rule->power_rule;
+
+	if (!tb[NL80211_ATTR_REG_RULE_FLAGS] ||
+	    !tb[NL80211_ATTR_FREQ_RANGE_START] ||
+	    !tb[NL80211_ATTR_FREQ_RANGE_END] ||
+	    !tb[NL80211_ATTR_FREQ_RANGE_MAX_BW] ||
+	    !tb[NL80211_ATTR_POWER_RULE_MAX_EIRP])
+		return -EINVAL;
+
+	reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]);
+
+	range->start_freq_khz = nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]);
+	range->end_freq_khz = nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]);
+	range->max_bandwidth_khz =
+		nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]);
+
+	power->max_eirp = nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]);
+
+	if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN])
+		power->max_antenna_gain =
+			nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]);
+
+	return 0;
+}
+
+/*
+ * Netlink handler that forwards the alpha2 code from
+ * NL80211_ATTR_REG_ALPHA2 to regulatory_hint_user().
+ *
+ * Returns -EINPROGRESS while no regulatory domain is set yet, -EINVAL
+ * without the alpha2 attribute, otherwise the regulatory_hint_user() result.
+ */
+static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *alpha2;
+	bool have_regdom;
+
+	/*
+	 * You should only get this when cfg80211 hasn't yet initialized
+	 * completely when built-in to the kernel right between the time
+	 * window between nl80211_init() and regulatory_init(), if that is
+	 * even possible.
+	 */
+	mutex_lock(&cfg80211_mutex);
+	have_regdom = cfg80211_regdomain != NULL;
+	mutex_unlock(&cfg80211_mutex);
+
+	if (unlikely(!have_regdom))
+		return -EINPROGRESS;
+
+	alpha2 = info->attrs[NL80211_ATTR_REG_ALPHA2];
+	if (!alpha2)
+		return -EINVAL;
+
+	return regulatory_hint_user(nla_data(alpha2));
+}
+
+/*
+ * Netlink handler that replies with the mesh configuration of a mesh point
+ * interface.
+ *
+ * When the interface has no mesh ID set (wdev->mesh_id_len == 0) the values
+ * come from default_mesh_config; otherwise they are fetched through the
+ * driver's get_mesh_config op.  Both happen under the wdev lock.
+ *
+ * Returns -EOPNOTSUPP on a non-mesh interface or without a get_mesh_config
+ * op, the driver's error, -ENOMEM if the reply cannot be allocated,
+ * -ENOBUFS if it cannot be filled, or the genlmsg_reply() result.
+ * NOTE(review): the NLA_PUT_* macros are defined elsewhere; they presumably
+ * jump to nla_put_failure when the message runs out of room - confirm.
+ */
+static int nl80211_get_mesh_config(struct sk_buff *skb,
+				   struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct mesh_config cur_params;
+	int err = 0;
+	void *hdr;
+	struct nlattr *pinfoattr;
+	struct sk_buff *msg;
+
+	if (wdev->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
+
+	if (!rdev->ops->get_mesh_config)
+		return -EOPNOTSUPP;
+
+	wdev_lock(wdev);
+	/* If not connected, get default parameters */
+	if (!wdev->mesh_id_len)
+		memcpy(&cur_params, &default_mesh_config, sizeof(cur_params));
+	else
+		err = rdev->ops->get_mesh_config(&rdev->wiphy, dev,
+						 &cur_params);
+	wdev_unlock(wdev);
+
+	if (err)
+		return err;
+
+	/* Draw up a netlink message to send back */
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_GET_MESH_CONFIG);
+	if (!hdr)
+		goto out;
+	pinfoattr = nla_nest_start(msg, NL80211_ATTR_MESH_CONFIG);
+	if (!pinfoattr)
+		goto nla_put_failure;
+	/* note: the ifindex is emitted inside the MESH_CONFIG nest */
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_RETRY_TIMEOUT,
+			cur_params.dot11MeshRetryTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_CONFIRM_TIMEOUT,
+			cur_params.dot11MeshConfirmTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_HOLDING_TIMEOUT,
+			cur_params.dot11MeshHoldingTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_MAX_PEER_LINKS,
+			cur_params.dot11MeshMaxPeerLinks);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_MAX_RETRIES,
+			cur_params.dot11MeshMaxRetries);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_TTL,
+			cur_params.dot11MeshTTL);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_ELEMENT_TTL,
+			cur_params.element_ttl);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
+			cur_params.auto_open_plinks);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+			cur_params.dot11MeshHWMPmaxPREQretries);
+	NLA_PUT_U32(msg, NL80211_MESHCONF_PATH_REFRESH_TIME,
+			cur_params.path_refresh_time);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+			cur_params.min_discovery_timeout);
+	NLA_PUT_U32(msg, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+			cur_params.dot11MeshHWMPactivePathTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+			cur_params.dot11MeshHWMPpreqMinInterval);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+			cur_params.dot11MeshHWMPnetDiameterTraversalTime);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_HWMP_ROOTMODE,
+			cur_params.dot11MeshHWMPRootMode);
+	nla_nest_end(msg, pinfoattr);
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+ out:
+	/* both failure paths free the reply and report -ENOBUFS */
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+/*
+ * Policy for the attributes nested in NL80211_ATTR_MESH_CONFIG, as used by
+ * nl80211_parse_mesh_config().
+ *
+ * Every attribute that the parser reads with nla_get_u8/u16/u32 must be
+ * listed here with the matching type; otherwise its length is not validated
+ * and a short attribute would be read past its payload.
+ */
+static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
+	[NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_TTL] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_ELEMENT_TTL] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 },
+
+	[NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 },
+	[NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 },
+	[NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HWMP_ROOTMODE] = { .type = NLA_U8 },
+};
+
+/*
+ * Policy for the attributes nested in NL80211_ATTR_MESH_SETUP, as used by
+ * nl80211_parse_mesh_setup().  The vendor IE blob is bounded by
+ * IEEE80211_MAX_DATA_LEN.
+ */
+static const struct nla_policy
+	nl80211_mesh_setup_params_policy[NL80211_MESH_SETUP_ATTR_MAX+1] = {
+	[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 },
+	[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 },
+	[NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG },
+	[NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY,
+		.len = IEEE80211_MAX_DATA_LEN },
+	[NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG },
+};
+
+/*
+ * Parse the nested NL80211_ATTR_MESH_CONFIG attribute into @cfg.
+ *
+ * Only fields whose attribute is present are written to @cfg.  For each of
+ * them, bit (attribute number - 1) is set in the mask, which is stored in
+ * *@mask_out when @mask_out is non-NULL.
+ *
+ * Returns 0 on success, or -EINVAL if the attribute is missing or its
+ * nested contents fail to parse.
+ */
+static int nl80211_parse_mesh_config(struct genl_info *info,
+				     struct mesh_config *cfg,
+				     u32 *mask_out)
+{
+	struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
+	u32 mask = 0;
+
+/*
+ * Copy one attribute into cfg->param and flag it in the mask.  The macro
+ * deliberately has no trailing semicolon so that every use reads as a
+ * single ordinary statement.
+ */
+#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \
+do {\
+	if ((table)[attr_num]) {\
+		(cfg)->param = nla_fn((table)[attr_num]); \
+		(mask) |= (1 << ((attr_num) - 1)); \
+	} \
+} while (0)
+
+
+	if (!info->attrs[NL80211_ATTR_MESH_CONFIG])
+		return -EINVAL;
+	if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX,
+			     info->attrs[NL80211_ATTR_MESH_CONFIG],
+			     nl80211_meshconf_params_policy))
+		return -EINVAL;
+
+	/* This makes sure that there aren't more than 32 mesh config
+	 * parameters (otherwise our bitfield scheme would not work.) */
+	BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
+
+	/* Fill in the params struct */
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout,
+			mask, NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout,
+			mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout,
+			mask, NL80211_MESHCONF_HOLDING_TIMEOUT, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks,
+			mask, NL80211_MESHCONF_MAX_PEER_LINKS, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries,
+			mask, NL80211_MESHCONF_MAX_RETRIES, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL,
+			mask, NL80211_MESHCONF_TTL, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl,
+			mask, NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks,
+			mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries,
+			mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+			nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time,
+			mask, NL80211_MESHCONF_PATH_REFRESH_TIME, nla_get_u32);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout,
+			mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+			nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
+			mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+			nla_get_u32);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval,
+			mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+			nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+			dot11MeshHWMPnetDiameterTraversalTime,
+			mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+			nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+			dot11MeshHWMPRootMode, mask,
+			NL80211_MESHCONF_HWMP_ROOTMODE,
+			nla_get_u8);
+	if (mask_out)
+		*mask_out = mask;
+
+	return 0;
+
+#undef FILL_IN_MESH_PARAM_IF_SET
+}
+
+static int nl80211_parse_mesh_setup(struct genl_info *info,
+				     struct mesh_setup *setup)
+{
+	struct nlattr *tb[NL80211_MESH_SETUP_ATTR_MAX + 1];
+	/* Fill setup from the nested NL80211_ATTR_MESH_SETUP; 0 or -EINVAL. */
+	if (!info->attrs[NL80211_ATTR_MESH_SETUP])
+		return -EINVAL;
+	if (nla_parse_nested(tb, NL80211_MESH_SETUP_ATTR_MAX,
+			     info->attrs[NL80211_ATTR_MESH_SETUP],
+			     nl80211_mesh_setup_params_policy))
+		return -EINVAL;
+	/* A non-zero u8 selects the vendor variant; absent leaves setup as is. */
+	if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL])
+		setup->path_sel_proto =
+		(nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL])) ?
+		 IEEE80211_PATH_PROTOCOL_VENDOR :
+		 IEEE80211_PATH_PROTOCOL_HWMP;
+
+	if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC])
+		setup->path_metric =
+		(nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC])) ?
+		 IEEE80211_PATH_METRIC_VENDOR :
+		 IEEE80211_PATH_METRIC_AIRTIME;
+
+	/* The IE blob is referenced inside the attribute, not copied. */
+	if (tb[NL80211_MESH_SETUP_IE]) {
+		struct nlattr *ieattr =
+			tb[NL80211_MESH_SETUP_IE];
+		if (!is_valid_ie_attr(ieattr))
+			return -EINVAL;
+		setup->ie = nla_data(ieattr);
+		setup->ie_len = nla_len(ieattr);
+	}
+	setup->is_authenticated = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AUTH]);
+	setup->is_secure = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AMPE]);
+
+	return 0;
+}
+
+static int nl80211_update_mesh_config(struct sk_buff *skb,
+				      struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct mesh_config cfg;
+	u32 mask;
+	int err;
+	/* Push the mesh parameters given in the request down to the driver. */
+	if (wdev->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
+
+	if (!rdev->ops->update_mesh_config)
+		return -EOPNOTSUPP;
+
+	err = nl80211_parse_mesh_config(info, &cfg, &mask);
+	if (err)
+		return err;
+	/* A zero mesh_id_len yields -ENOLINK; presumably "mesh not joined". */
+	wdev_lock(wdev);
+	if (!wdev->mesh_id_len)
+		err = -ENOLINK;
+
+	if (!err)
+		err = rdev->ops->update_mesh_config(&rdev->wiphy, dev,
+						    mask, &cfg);
+
+	wdev_unlock(wdev);
+
+	return err;
+}
+
+static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	void *hdr = NULL;
+	struct nlattr *nl_reg_rules;
+	unsigned int i;
+	int err = -EINVAL;
+	/* Reply with cfg80211_regdomain: its alpha2 plus every reg rule. */
+	mutex_lock(&cfg80211_mutex);
+
+	if (!cfg80211_regdomain)
+		goto out;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_GET_REG);
+	if (!hdr)
+		goto put_failure;
+
+	NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2,
+		cfg80211_regdomain->alpha2);
+
+	nl_reg_rules = nla_nest_start(msg, NL80211_ATTR_REG_RULES);
+	if (!nl_reg_rules)
+		goto nla_put_failure;
+
+	for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) {
+		struct nlattr *nl_reg_rule;
+		const struct ieee80211_reg_rule *reg_rule;
+		const struct ieee80211_freq_range *freq_range;
+		const struct ieee80211_power_rule *power_rule;
+
+		reg_rule = &cfg80211_regdomain->reg_rules[i];
+		freq_range = &reg_rule->freq_range;
+		power_rule = &reg_rule->power_rule;
+		/* Each rule is nested under its own index i. */
+		nl_reg_rule = nla_nest_start(msg, i);
+		if (!nl_reg_rule)
+			goto nla_put_failure;
+
+		NLA_PUT_U32(msg, NL80211_ATTR_REG_RULE_FLAGS,
+			reg_rule->flags);
+		NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_START,
+			freq_range->start_freq_khz);
+		NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_END,
+			freq_range->end_freq_khz);
+		NLA_PUT_U32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW,
+			freq_range->max_bandwidth_khz);
+		NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN,
+			power_rule->max_antenna_gain);
+		NLA_PUT_U32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP,
+			power_rule->max_eirp);
+
+		nla_nest_end(msg, nl_reg_rule);
+	}
+
+	nla_nest_end(msg, nl_reg_rules);
+
+	genlmsg_end(msg, hdr);
+	err = genlmsg_reply(msg, info);
+	goto out;
+	/* Both failure labels free msg and make the result -EMSGSIZE. */
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+put_failure:
+	nlmsg_free(msg);
+	err = -EMSGSIZE;
+out:
+	mutex_unlock(&cfg80211_mutex);
+	return err;
+}
+
+static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1];
+	struct nlattr *nl_reg_rule;
+	char *alpha2 = NULL;
+	int rem_reg_rules = 0, r = 0;
+	u32 num_rules = 0, rule_idx = 0, size_of_regd;
+	struct ieee80211_regdomain *rd = NULL;
+	/* Build a regdomain from the request and hand it to set_regdom(). */
+	if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_REG_RULES])
+		return -EINVAL;
+
+	alpha2 = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);
+	/* First pass only counts the rules so rd can be sized below. */
+	nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
+			rem_reg_rules) {
+		num_rules++;
+		if (num_rules > NL80211_MAX_SUPP_REG_RULES)
+			return -EINVAL;
+	}
+
+	mutex_lock(&cfg80211_mutex);
+
+	if (!reg_is_valid_request(alpha2)) {
+		r = -EINVAL;
+		goto bad_reg;
+	}
+
+	size_of_regd = sizeof(struct ieee80211_regdomain) +
+		(num_rules * sizeof(struct ieee80211_reg_rule));
+
+	rd = kzalloc(size_of_regd, GFP_KERNEL);
+	if (!rd) {
+		r = -ENOMEM;
+		goto bad_reg;
+	}
+
+	rd->n_reg_rules = num_rules;
+	rd->alpha2[0] = alpha2[0];
+	rd->alpha2[1] = alpha2[1];
+
+	nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
+			rem_reg_rules) {
+		/* Reject the request if a rule's attributes fail validation. */
+		r = nla_parse(tb, NL80211_REG_RULE_ATTR_MAX,
+			nla_data(nl_reg_rule), nla_len(nl_reg_rule),
+			reg_rule_policy);
+		if (!r)
+			r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
+		if (r)
+			goto bad_reg;
+		rule_idx++;
+		if (rule_idx > NL80211_MAX_SUPP_REG_RULES) {
+			r = -EINVAL;
+			goto bad_reg;
+		}
+	}
+
+	BUG_ON(rule_idx != num_rules);
+
+	r = set_regdom(rd);	/* rd is not freed here after this call */
+
+	mutex_unlock(&cfg80211_mutex);
+
+	return r;
+
+ bad_reg:
+	mutex_unlock(&cfg80211_mutex);
+	kfree(rd);	/* rd may still be NULL here; kfree(NULL) is a no-op */
+	return r;
+}
+
+static int validate_scan_freqs(struct nlattr *freqs)
+{
+	struct nlattr *attr1, *attr2;
+	int n_channels = 0, tmp1, tmp2;
+	/* Returns the number of nested entries, or 0 if any value repeats. */
+	nla_for_each_nested(attr1, freqs, tmp1) {
+		n_channels++;
+		/*
+		 * Some hardware has a limited channel list for
+		 * scanning, and it is pretty much nonsensical
+		 * to scan for a channel twice, so disallow that
+		 * and don't require drivers to check that the
+		 * channel list they get isn't longer than what
+		 * they can scan, as long as they can scan all
+		 * the channels they registered at once.
+		 */
+		nla_for_each_nested(attr2, freqs, tmp2)
+			if (attr1 != attr2 &&
+			    nla_get_u32(attr1) == nla_get_u32(attr2))
+				return 0;	/* duplicate frequency */
+	}
+
+	return n_channels;
+}
+
+static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_scan_request *request;
+	struct nlattr *attr;
+	struct wiphy *wiphy;
+	int err, tmp, n_ssids = 0, n_channels, i;
+	enum ieee80211_band band;
+	size_t ie_len;
+	/* Build a scan request from the message and pass it to ops->scan. */
+	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	wiphy = &rdev->wiphy;
+
+	if (!rdev->ops->scan)
+		return -EOPNOTSUPP;
+
+	if (rdev->scan_req)
+		return -EBUSY;
+	/* Count explicit frequencies, else size for every known channel. */
+	if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+		n_channels = validate_scan_freqs(
+				info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
+		if (!n_channels)
+			return -EINVAL;
+	} else {
+		n_channels = 0;
+
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+			if (wiphy->bands[band])
+				n_channels += wiphy->bands[band]->n_channels;
+	}
+
+	if (info->attrs[NL80211_ATTR_SCAN_SSIDS])
+		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp)
+			n_ssids++;
+
+	if (n_ssids > wiphy->max_scan_ssids)
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_IE])
+		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+	else
+		ie_len = 0;
+
+	if (ie_len > wiphy->max_scan_ie_len)
+		return -EINVAL;
+	/* One allocation holds the request, channel pointers, SSIDs and IEs. */
+	request = kzalloc(sizeof(*request)
+			+ sizeof(*request->ssids) * n_ssids
+			+ sizeof(*request->channels) * n_channels
+			+ ie_len, GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+	/* ssids follow the channel array; ie follows whichever comes last. */
+	if (n_ssids)
+		request->ssids = (void *)&request->channels[n_channels];
+	request->n_ssids = n_ssids;
+	if (ie_len) {
+		if (request->ssids)
+			request->ie = (void *)(request->ssids + n_ssids);
+		else
+			request->ie = (void *)(request->channels + n_channels);
+	}
+
+	i = 0;
+	if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+		/* user specified, bail out if channel not found */
+		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) {
+			struct ieee80211_channel *chan;
+
+			chan = ieee80211_get_channel(wiphy, nla_get_u32(attr));
+
+			if (!chan) {
+				err = -EINVAL;
+				goto out_free;
+			}
+
+			/* ignore disabled channels */
+			if (chan->flags & IEEE80211_CHAN_DISABLED)
+				continue;
+
+			request->channels[i] = chan;
+			i++;
+		}
+	} else {
+		/* all channels */
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+			int j;
+			if (!wiphy->bands[band])
+				continue;
+			for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+				struct ieee80211_channel *chan;
+
+				chan = &wiphy->bands[band]->channels[j];
+
+				if (chan->flags & IEEE80211_CHAN_DISABLED)
+					continue;
+
+				request->channels[i] = chan;
+				i++;
+			}
+		}
+	}
+	/* i counts the enabled channels; having none is an error. */
+	if (!i) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	request->n_channels = i;
+
+	i = 0;
+	if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {
+		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) {
+			if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) {
+				err = -EINVAL;
+				goto out_free;
+			}
+			request->ssids[i].ssid_len = nla_len(attr);
+			memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));
+			i++;
+		}
+	}
+
+	if (info->attrs[NL80211_ATTR_IE]) {
+		request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+		memcpy((void *)request->ie,
+		       nla_data(info->attrs[NL80211_ATTR_IE]),
+		       request->ie_len);
+	}
+
+	request->dev = dev;
+	request->wiphy = &rdev->wiphy;
+
+	rdev->scan_req = request;
+	err = rdev->ops->scan(&rdev->wiphy, dev, request);
+	/* Success takes a reference on dev; failure clears scan_req and frees. */
+	if (!err) {
+		nl80211_send_scan_start(rdev, dev);
+		dev_hold(dev);
+	} else {
+ out_free:
+		rdev->scan_req = NULL;
+		kfree(request);
+	}
+
+	return err;
+}
+
+static int nl80211_start_sched_scan(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	struct cfg80211_sched_scan_request *request;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *attr;
+	struct wiphy *wiphy;
+	int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i;
+	u32 interval;
+	enum ieee80211_band band;
+	size_t ie_len;
+	struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
+	/* Build a scheduled-scan request and pass it to ops->sched_scan_start. */
+	if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) ||
+	    !rdev->ops->sched_scan_start)
+		return -EOPNOTSUPP;
+
+	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
+		return -EINVAL;
+
+	interval = nla_get_u32(info->attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
+	if (interval == 0)
+		return -EINVAL;
+
+	wiphy = &rdev->wiphy;
+
+	if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+		n_channels = validate_scan_freqs(
+				info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
+		if (!n_channels)
+			return -EINVAL;
+	} else {
+		n_channels = 0;
+
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+			if (wiphy->bands[band])
+				n_channels += wiphy->bands[band]->n_channels;
+	}
+
+	if (info->attrs[NL80211_ATTR_SCAN_SSIDS])
+		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS],
+				    tmp)
+			n_ssids++;
+
+	if (n_ssids > wiphy->max_sched_scan_ssids)
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH])
+		nla_for_each_nested(attr,
+				    info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH],
+				    tmp)
+			n_match_sets++;
+
+	if (n_match_sets > wiphy->max_match_sets)
+		return -EINVAL;
+
+	if (info->attrs[NL80211_ATTR_IE])
+		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+	else
+		ie_len = 0;
+
+	if (ie_len > wiphy->max_sched_scan_ie_len)
+		return -EINVAL;
+	/* sched_scan_mtx covers the sched_scan_req check and assignment. */
+	mutex_lock(&rdev->sched_scan_mtx);
+
+	if (rdev->sched_scan_req) {
+		err = -EINPROGRESS;
+		goto out;
+	}
+
+	request = kzalloc(sizeof(*request)
+			+ sizeof(*request->ssids) * n_ssids
+			+ sizeof(*request->match_sets) * n_match_sets
+			+ sizeof(*request->channels) * n_channels
+			+ ie_len, GFP_KERNEL);
+	if (!request) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (n_ssids)
+		request->ssids = (void *)&request->channels[n_channels];
+	request->n_ssids = n_ssids;
+	if (ie_len) {
+		if (request->ssids)
+			request->ie = (void *)(request->ssids + n_ssids);
+		else
+			request->ie = (void *)(request->channels + n_channels);
+	}
+	/* match_sets go last: after ie, else after ssids, else after channels. */
+	if (n_match_sets) {
+		if (request->ie)
+			request->match_sets = (void *)(request->ie + ie_len);
+		else if (request->ssids)
+			request->match_sets =
+				(void *)(request->ssids + n_ssids);
+		else
+			request->match_sets =
+				(void *)(request->channels + n_channels);
+	}
+	request->n_match_sets = n_match_sets;
+
+	i = 0;
+	if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
+		/* user specified, bail out if channel not found */
+		nla_for_each_nested(attr,
+				    info->attrs[NL80211_ATTR_SCAN_FREQUENCIES],
+				    tmp) {
+			struct ieee80211_channel *chan;
+
+			chan = ieee80211_get_channel(wiphy, nla_get_u32(attr));
+
+			if (!chan) {
+				err = -EINVAL;
+				goto out_free;
+			}
+
+			/* ignore disabled channels */
+			if (chan->flags & IEEE80211_CHAN_DISABLED)
+				continue;
+
+			request->channels[i] = chan;
+			i++;
+		}
+	} else {
+		/* all channels */
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+			int j;
+			if (!wiphy->bands[band])
+				continue;
+			for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+				struct ieee80211_channel *chan;
+
+				chan = &wiphy->bands[band]->channels[j];
+
+				if (chan->flags & IEEE80211_CHAN_DISABLED)
+					continue;
+
+				request->channels[i] = chan;
+				i++;
+			}
+		}
+	}
+
+	if (!i) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	request->n_channels = i;
+
+	i = 0;
+	if (info->attrs[NL80211_ATTR_SCAN_SSIDS]) {
+		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS],
+				    tmp) {
+			if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) {
+				err = -EINVAL;
+				goto out_free;
+			}
+			request->ssids[i].ssid_len = nla_len(attr);
+			memcpy(request->ssids[i].ssid, nla_data(attr),
+			       nla_len(attr));
+			i++;
+		}
+	}
+	/* A match set without an SSID attribute stays zeroed from kzalloc. */
+	i = 0;
+	if (info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) {
+		nla_for_each_nested(attr,
+				    info->attrs[NL80211_ATTR_SCHED_SCAN_MATCH],
+				    tmp) {
+			struct nlattr *ssid;
+			/* NOTE(review): the nla_parse() result is not checked. */
+			nla_parse(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
+				  nla_data(attr), nla_len(attr),
+				  nl80211_match_policy);
+			ssid = tb[NL80211_ATTR_SCHED_SCAN_MATCH_SSID];
+			if (ssid) {
+				if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) {
+					err = -EINVAL;
+					goto out_free;
+				}
+				memcpy(request->match_sets[i].ssid.ssid,
+				       nla_data(ssid), nla_len(ssid));
+				request->match_sets[i].ssid.ssid_len =
+					nla_len(ssid);
+			}
+			i++;
+		}
+	}
+
+	if (info->attrs[NL80211_ATTR_IE]) {
+		request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+		memcpy((void *)request->ie,
+		       nla_data(info->attrs[NL80211_ATTR_IE]),
+		       request->ie_len);
+	}
+
+	request->dev = dev;
+	request->wiphy = &rdev->wiphy;
+	request->interval = interval;
+
+	err = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request);
+	if (!err) {
+		rdev->sched_scan_req = request;	/* kept; not freed below */
+		nl80211_send_sched_scan(rdev, dev,
+					NL80211_CMD_START_SCHED_SCAN);
+		goto out;
+	}
+
+out_free:
+	kfree(request);
+out:
+	mutex_unlock(&rdev->sched_scan_mtx);
+	return err;
+}
+
+static int nl80211_stop_sched_scan(struct sk_buff *skb,
+				   struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	int err;
+	/* Calls __cfg80211_stop_sched_scan() while holding sched_scan_mtx. */
+	if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) ||
+	    !rdev->ops->sched_scan_stop)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&rdev->sched_scan_mtx);
+	err = __cfg80211_stop_sched_scan(rdev, false);
+	mutex_unlock(&rdev->sched_scan_mtx);
+
+	return err;
+}
+
+static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
+			    struct cfg80211_registered_device *rdev,
+			    struct wireless_dev *wdev,
+			    struct cfg80211_internal_bss *intbss)
+{
+	struct cfg80211_bss *res = &intbss->pub;
+	void *hdr;
+	struct nlattr *bss;
+	int i;
+	/* Append one NL80211_CMD_NEW_SCAN_RESULTS message for intbss to msg. */
+	ASSERT_WDEV_LOCK(wdev);
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags,
+			     NL80211_CMD_NEW_SCAN_RESULTS);
+	if (!hdr)
+		return -1;	/* header could not be added */
+
+	NLA_PUT_U32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex);
+
+	bss = nla_nest_start(msg, NL80211_ATTR_BSS);
+	if (!bss)
+		goto nla_put_failure;
+	if (!is_zero_ether_addr(res->bssid))
+		NLA_PUT(msg, NL80211_BSS_BSSID, ETH_ALEN, res->bssid);
+	if (res->information_elements && res->len_information_elements)
+		NLA_PUT(msg, NL80211_BSS_INFORMATION_ELEMENTS,
+			res->len_information_elements,
+			res->information_elements);
+	if (res->beacon_ies && res->len_beacon_ies &&
+	    res->beacon_ies != res->information_elements)
+		NLA_PUT(msg, NL80211_BSS_BEACON_IES,
+			res->len_beacon_ies, res->beacon_ies);
+	if (res->tsf)
+		NLA_PUT_U64(msg, NL80211_BSS_TSF, res->tsf);
+	if (res->beacon_interval)
+		NLA_PUT_U16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval);
+	NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability);
+	NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq);
+	NLA_PUT_U32(msg, NL80211_BSS_SEEN_MS_AGO,
+		jiffies_to_msecs(jiffies - intbss->ts));
+	/* The signal attribute depends on the wiphy's signal_type. */
+	switch (rdev->wiphy.signal_type) {
+	case CFG80211_SIGNAL_TYPE_MBM:
+		NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal);
+		break;
+	case CFG80211_SIGNAL_TYPE_UNSPEC:
+		NLA_PUT_U8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal);
+		break;
+	default:
+		break;
+	}
+	/* BSS_STATUS is only sent for station, P2P-client and ad-hoc types. */
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_STATION:
+		if (intbss == wdev->current_bss)
+			NLA_PUT_U32(msg, NL80211_BSS_STATUS,
+				    NL80211_BSS_STATUS_ASSOCIATED);
+		else for (i = 0; i < MAX_AUTH_BSSES; i++) {
+			if (intbss != wdev->auth_bsses[i])
+				continue;
+			NLA_PUT_U32(msg, NL80211_BSS_STATUS,
+				    NL80211_BSS_STATUS_AUTHENTICATED);
+			break;
+		}
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		if (intbss == wdev->current_bss)
+			NLA_PUT_U32(msg, NL80211_BSS_STATUS,
+				    NL80211_BSS_STATUS_IBSS_JOINED);
+		break;
+	default:
+		break;
+	}
+
+	nla_nest_end(msg, bss);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int nl80211_dump_scan(struct sk_buff *skb,
+			     struct netlink_callback *cb)
+{
+	struct cfg80211_registered_device *rdev;
+	struct net_device *dev;
+	struct cfg80211_internal_bss *scan;
+	struct wireless_dev *wdev;
+	int start = cb->args[1], idx = 0;
+	int err;
+	/* Dump callback: emit BSS entries, skipping the first cb->args[1]. */
+	err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev);
+	if (err)
+		return err;
+
+	wdev = dev->ieee80211_ptr;
+
+	wdev_lock(wdev);
+	spin_lock_bh(&rdev->bss_lock);
+	cfg80211_bss_expire(rdev);
+
+	list_for_each_entry(scan, &rdev->bss_list, list) {
+		if (++idx <= start)
+			continue;
+		if (nl80211_send_bss(skb,
+				NETLINK_CB(cb->skb).pid,
+				cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				rdev, wdev, scan) < 0) {
+			idx--;	/* not sent: keep it as the resume point */
+			break;
+		}
+	}
+
+	spin_unlock_bh(&rdev->bss_lock);
+	wdev_unlock(wdev);
+	/* Remember how many entries have been emitted so far. */
+	cb->args[1] = idx;
+	nl80211_finish_netdev_dump(rdev);
+
+	return skb->len;
+}
+
+static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
+				int flags, struct net_device *dev,
+				struct survey_info *survey)
+{
+	void *hdr;
+	struct nlattr *infoattr;
+	/* Append one NL80211_CMD_NEW_SURVEY_RESULTS message for survey to msg. */
+	/* Survey without a channel doesn't make sense */
+	if (!survey->channel)
+		return -EINVAL;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags,
+			     NL80211_CMD_NEW_SURVEY_RESULTS);
+	if (!hdr)
+		return -ENOMEM;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+
+	infoattr = nla_nest_start(msg, NL80211_ATTR_SURVEY_INFO);
+	if (!infoattr)
+		goto nla_put_failure;
+	/* Frequency is always sent; the rest only when flagged in filled. */
+	NLA_PUT_U32(msg, NL80211_SURVEY_INFO_FREQUENCY,
+		    survey->channel->center_freq);
+	if (survey->filled & SURVEY_INFO_NOISE_DBM)
+		NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE,
+			    survey->noise);
+	if (survey->filled & SURVEY_INFO_IN_USE)
+		NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME,
+			    survey->channel_time);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_BUSY)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY,
+			    survey->channel_time_busy);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY,
+			    survey->channel_time_ext_busy);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_RX)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_RX,
+			    survey->channel_time_rx);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_TX)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_TX,
+			    survey->channel_time_tx);
+
+	nla_nest_end(msg, infoattr);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int nl80211_dump_survey(struct sk_buff *skb,
+			struct netlink_callback *cb)
+{
+	struct survey_info survey;
+	struct cfg80211_registered_device *dev;
+	struct net_device *netdev;
+	int survey_idx = cb->args[1];
+	int res;
+	/* Dump callback: query ops->dump_survey starting at cb->args[1]. */
+	res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+	if (res)
+		return res;
+
+	if (!dev->ops->dump_survey) {
+		res = -EOPNOTSUPP;
+		goto out_err;
+	}
+
+	while (1) {
+		res = dev->ops->dump_survey(&dev->wiphy, netdev, survey_idx,
+					    &survey);
+		if (res == -ENOENT)	/* ends the dump without an error */
+			break;
+		if (res)
+			goto out_err;
+
+		if (nl80211_send_survey(skb,
+				NETLINK_CB(cb->skb).pid,
+				cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				netdev,
+				&survey) < 0)
+			goto out;
+		survey_idx++;
+	}
+	/* Save the index to resume from and return the bytes written so far. */
+ out:
+	cb->args[1] = survey_idx;
+	res = skb->len;
+ out_err:
+	nl80211_finish_netdev_dump(dev);
+	return res;
+}
+
+static bool nl80211_valid_auth_type(enum nl80211_auth_type auth_type)
+{	/* accepts every value up to and including NL80211_AUTHTYPE_MAX */
+	return auth_type <= NL80211_AUTHTYPE_MAX;
+}
+
+static bool nl80211_valid_wpa_versions(u32 wpa_versions)
+{	/* true when no bit other than WPA version 1 and 2 is set */
+	return !(wpa_versions & ~(NL80211_WPA_VERSION_1 |
+				  NL80211_WPA_VERSION_2));
+}
+
+static bool nl80211_valid_akm_suite(u32 akm)
+{	/* only the 802.1X and PSK AKM suite selectors are accepted */
+	return akm == WLAN_AKM_SUITE_8021X ||
+		akm == WLAN_AKM_SUITE_PSK;
+}
+
+static bool nl80211_valid_cipher_suite(u32 cipher)
+{	/* accepts WEP40, WEP104, TKIP, CCMP and AES-CMAC selectors only */
+	return cipher == WLAN_CIPHER_SUITE_WEP40 ||
+		cipher == WLAN_CIPHER_SUITE_WEP104 ||
+		cipher == WLAN_CIPHER_SUITE_TKIP ||
+		cipher == WLAN_CIPHER_SUITE_CCMP ||
+		cipher == WLAN_CIPHER_SUITE_AES_CMAC;
+}
+
+
+static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct ieee80211_channel *chan;
+	const u8 *bssid, *ssid, *ie = NULL;
+	int err, ssid_len, ie_len = 0;
+	enum nl80211_auth_type auth_type;
+	struct key_parse key;
+	bool local_state_change;
+	/* Validate the AUTHENTICATE request and pass it to cfg80211_mlme_auth(). */
+	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_AUTH_TYPE])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_SSID])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
+		return -EINVAL;
+
+	err = nl80211_parse_key(info, &key);
+	if (err)
+		return err;
+	/* A supplied key must be a WEP40/WEP104 key of the matching length. */
+	if (key.idx >= 0) {
+		if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP)
+			return -EINVAL;
+		if (!key.p.key || !key.p.key_len)
+			return -EINVAL;
+		if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 ||
+		     key.p.key_len != WLAN_KEY_LEN_WEP40) &&
+		    (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 ||
+		     key.p.key_len != WLAN_KEY_LEN_WEP104))
+			return -EINVAL;
+		if (key.idx > 3)	/* WEP key indices are 0..3 */
+			return -EINVAL;
+	} else {
+		key.p.key_len = 0;
+		key.p.key = NULL;
+	}
+	/* The key's cipher must also be one the wiphy lists as supported. */
+	if (key.idx >= 0) {
+		int i;
+		bool ok = false;
+		for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
+			if (key.p.cipher == rdev->wiphy.cipher_suites[i]) {
+				ok = true;
+				break;
+			}
+		}
+		if (!ok)
+			return -EINVAL;
+	}
+
+	if (!rdev->ops->auth)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
+
+	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+	chan = ieee80211_get_channel(&rdev->wiphy,
+		nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
+	if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
+		return -EINVAL;
+
+	ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+	ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+
+	if (info->attrs[NL80211_ATTR_IE]) {
+		ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+	}
+
+	auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
+	if (!nl80211_valid_auth_type(auth_type))
+		return -EINVAL;
+	/* Presence of the attribute alone sets local_state_change. */
+	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
+
+	return cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
+				  ssid, ssid_len, ie, ie_len,
+				  key.p.key, key.p.key_len, key.idx,
+				  local_state_change);
+}
+
+static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
+				   struct genl_info *info,
+				   struct cfg80211_crypto_settings *settings,
+				   int cipher_limit)
+{
+	memset(settings, 0, sizeof(*settings));
+	/* Fill *settings from the crypto attributes in info; 0 or -EINVAL. */
+	settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT];
+	/* Without an explicit ethertype the control port uses ETH_P_PAE. */
+	if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
+		u16 proto;
+		proto = nla_get_u16(
+			info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
+		settings->control_port_ethertype = cpu_to_be16(proto);
+		if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
+		    proto != ETH_P_PAE)
+			return -EINVAL;
+		if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT])
+			settings->control_port_no_encrypt = true;
+	} else
+		settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);
+
+	if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
+		void *data;
+		int len, i;
+
+		data = nla_data(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]);
+		len = nla_len(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]);
+		settings->n_ciphers_pairwise = len / sizeof(u32);
+
+		if (len % sizeof(u32))
+			return -EINVAL;
+
+		if (settings->n_ciphers_pairwise > cipher_limit)
+			return -EINVAL;
+		/* Copy first, then reject the list if any suite is unknown. */
+		memcpy(settings->ciphers_pairwise, data, len);
+
+		for (i = 0; i < settings->n_ciphers_pairwise; i++)
+			if (!nl80211_valid_cipher_suite(
+					settings->ciphers_pairwise[i]))
+				return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]) {
+		settings->cipher_group =
+			nla_get_u32(info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]);
+		if (!nl80211_valid_cipher_suite(settings->cipher_group))
+			return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_WPA_VERSIONS]) {
+		settings->wpa_versions =
+			nla_get_u32(info->attrs[NL80211_ATTR_WPA_VERSIONS]);
+		if (!nl80211_valid_wpa_versions(settings->wpa_versions))
+			return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_AKM_SUITES]) {
+		void *data;
+		int len, i;
+
+		data = nla_data(info->attrs[NL80211_ATTR_AKM_SUITES]);
+		len = nla_len(info->attrs[NL80211_ATTR_AKM_SUITES]);
+		settings->n_akm_suites = len / sizeof(u32);
+
+		if (len % sizeof(u32))
+			return -EINVAL;
+
+		if (settings->n_akm_suites > NL80211_MAX_NR_AKM_SUITES)
+			return -EINVAL;
+
+		memcpy(settings->akm_suites, data, len);
+
+		for (i = 0; i < settings->n_akm_suites; i++)
+			if (!nl80211_valid_akm_suite(settings->akm_suites[i]))
+				return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_crypto_settings crypto;
+	struct ieee80211_channel *chan;
+	const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL;
+	int err, ssid_len, ie_len = 0;
+	bool use_mfp = false;
+	/* Validate the ASSOCIATE request and pass it to cfg80211_mlme_assoc(). */
+	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_MAC] ||
+	    !info->attrs[NL80211_ATTR_SSID] ||
+	    !info->attrs[NL80211_ATTR_WIPHY_FREQ])
+		return -EINVAL;
+
+	if (!rdev->ops->assoc)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
+
+	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	chan = ieee80211_get_channel(&rdev->wiphy,
+		nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
+	if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
+		return -EINVAL;
+
+	ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
+	ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+
+	if (info->attrs[NL80211_ATTR_IE]) {
+		ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+	}
+	/* Only NL80211_MFP_REQUIRED and NL80211_MFP_NO are accepted. */
+	if (info->attrs[NL80211_ATTR_USE_MFP]) {
+		enum nl80211_mfp mfp =
+			nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
+		if (mfp == NL80211_MFP_REQUIRED)
+			use_mfp = true;
+		else if (mfp != NL80211_MFP_NO)
+			return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_PREV_BSSID])
+		prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
+	/* At most one pairwise cipher is accepted here (cipher_limit of 1). */
+	err = nl80211_crypto_settings(rdev, info, &crypto, 1);
+	if (!err)
+		err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid,
+					  ssid, ssid_len, ie, ie_len, use_mfp,
+					  &crypto);
+
+	return err;
+}
+
+static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	const u8 *ie = NULL, *bssid;
+	int ie_len = 0;
+	u16 reason_code;
+	bool local_state_change;
+	/* Validate the request and pass it to cfg80211_mlme_deauth(). */
+	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_REASON_CODE])
+		return -EINVAL;
+
+	if (!rdev->ops->deauth)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
+
+	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
+	if (reason_code == 0) {
+		/* Reason Code 0 is reserved */
+		return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_IE]) {
+		ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+	}
+	/* Presence of the attribute alone sets local_state_change. */
+	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
+
+	return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
+				    local_state_change);
+}
+
+static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	const u8 *ie = NULL, *bssid;
+	int ie_len = 0;
+	u16 reason_code;
+	bool local_state_change;
+	/* Validate the request and pass it to cfg80211_mlme_disassoc(). */
+	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_REASON_CODE])
+		return -EINVAL;
+
+	if (!rdev->ops->disassoc)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
+
+	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
+	if (reason_code == 0) {
+		/* Reason Code 0 is reserved */
+		return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_IE]) {
+		ie = nla_data(info->attrs[NL80211_ATTR_IE]);
+		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+	}
+	/* Presence of the attribute alone sets local_state_change. */
+	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
+
+	return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
+				      local_state_change);
+}
+
+/*
+ * Translate a multicast rate value into per-band bitrate indices.
+ *
+ * For every band the wiphy supports, the first entry of the band's
+ * bitrate table whose .bitrate equals @rateval is located and its
+ * index plus one is stored in mcast_rate[band]. Bands without a match
+ * are left untouched.
+ *
+ * Returns true if at least one band had a matching rate.
+ */
+static bool
+nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev,
+			 int mcast_rate[IEEE80211_NUM_BANDS],
+			 int rateval)
+{
+	bool matched = false;
+	int b;
+
+	for (b = 0; b < IEEE80211_NUM_BANDS; b++) {
+		struct ieee80211_supported_band *sband = rdev->wiphy.bands[b];
+		int idx = 0;
+
+		if (!sband)
+			continue;
+
+		/* advance to the first matching rate, if any */
+		while (idx < sband->n_bitrates &&
+		       sband->bitrates[idx].bitrate != rateval)
+			idx++;
+
+		if (idx < sband->n_bitrates) {
+			mcast_rate[b] = idx + 1;
+			matched = true;
+		}
+	}
+
+	return matched;
+}
+
+/*
+ * Handle a userspace request to join an IBSS.
+ *
+ * Builds a struct cfg80211_ibss_params from the netlink attributes and
+ * hands it to cfg80211_join_ibss(). A frequency and a non-empty SSID
+ * are mandatory; BSSID, extra IEs, beacon interval (default 100,
+ * accepted range 1..10000), basic rates, multicast rate and static
+ * keys are optional. The interface must be of type ADHOC.
+ *
+ * Returns 0 on success or a negative errno. Parsed connection keys are
+ * freed here if cfg80211_join_ibss() fails.
+ */
+static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wiphy *wiphy = &rdev->wiphy;
+	struct nlattr **attrs = info->attrs;
+	struct cfg80211_cached_keys *keys = NULL;
+	struct cfg80211_ibss_params ibss;
+	int err;
+
+	memset(&ibss, 0, sizeof(ibss));
+
+	if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!attrs[NL80211_ATTR_WIPHY_FREQ])
+		return -EINVAL;
+	if (!attrs[NL80211_ATTR_SSID] || !nla_len(attrs[NL80211_ATTR_SSID]))
+		return -EINVAL;
+
+	/* default beacon interval unless userspace overrides it */
+	ibss.beacon_interval = 100;
+	if (attrs[NL80211_ATTR_BEACON_INTERVAL]) {
+		ibss.beacon_interval =
+			nla_get_u32(attrs[NL80211_ATTR_BEACON_INTERVAL]);
+		if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000)
+			return -EINVAL;
+	}
+
+	if (!rdev->ops->join_ibss)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
+
+	if (attrs[NL80211_ATTR_MAC])
+		ibss.bssid = nla_data(attrs[NL80211_ATTR_MAC]);
+	ibss.ssid = nla_data(attrs[NL80211_ATTR_SSID]);
+	ibss.ssid_len = nla_len(attrs[NL80211_ATTR_SSID]);
+
+	if (attrs[NL80211_ATTR_IE]) {
+		ibss.ie = nla_data(attrs[NL80211_ATTR_IE]);
+		ibss.ie_len = nla_len(attrs[NL80211_ATTR_IE]);
+	}
+
+	ibss.channel = ieee80211_get_channel(wiphy,
+			nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]));
+	if (!ibss.channel)
+		return -EINVAL;
+	if (ibss.channel->flags &
+	    (IEEE80211_CHAN_NO_IBSS | IEEE80211_CHAN_DISABLED))
+		return -EINVAL;
+
+	ibss.channel_fixed = attrs[NL80211_ATTR_FREQ_FIXED] != NULL;
+	ibss.privacy = attrs[NL80211_ATTR_PRIVACY] != NULL;
+
+	if (attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+		struct nlattr *rates_attr = attrs[NL80211_ATTR_BSS_BASIC_RATES];
+		struct ieee80211_supported_band *sband =
+			wiphy->bands[ibss.channel->band];
+		u8 *rates = nla_data(rates_attr);
+		int n_rates = nla_len(rates_attr);
+		int r;
+
+		if (!n_rates)
+			return -EINVAL;
+
+		for (r = 0; r < n_rates; r++) {
+			/* low 7 bits of each byte, scaled by 5 */
+			int wanted = (rates[r] & 0x7f) * 5;
+			int idx = 0;
+
+			while (idx < sband->n_bitrates &&
+			       sband->bitrates[idx].bitrate != wanted)
+				idx++;
+
+			/* every requested rate must exist on this band */
+			if (idx == sband->n_bitrates)
+				return -EINVAL;
+
+			ibss.basic_rates |= BIT(idx);
+		}
+	}
+
+	if (attrs[NL80211_ATTR_MCAST_RATE]) {
+		int rate = nla_get_u32(attrs[NL80211_ATTR_MCAST_RATE]);
+
+		if (!nl80211_parse_mcast_rate(rdev, ibss.mcast_rate, rate))
+			return -EINVAL;
+	}
+
+	/* static keys are only looked at when privacy was requested */
+	if (ibss.privacy && attrs[NL80211_ATTR_KEYS]) {
+		keys = nl80211_parse_connkeys(rdev, attrs[NL80211_ATTR_KEYS]);
+		if (IS_ERR(keys))
+			return PTR_ERR(keys);
+	}
+
+	err = cfg80211_join_ibss(rdev, dev, &ibss, keys);
+	if (!err)
+		return 0;
+
+	kfree(keys);
+	return err;
+}
+
+/*
+ * Handle a userspace request to leave the current IBSS.
+ *
+ * Returns -EOPNOTSUPP if the driver lacks a leave_ibss op or the
+ * interface is not of type ADHOC, otherwise the result of
+ * cfg80211_leave_ibss().
+ */
+static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev;
+
+	if (!rdev->ops->leave_ibss)
+		return -EOPNOTSUPP;
+
+	wdev = dev->ieee80211_ptr;
+	if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+		return cfg80211_leave_ibss(rdev, dev, false);
+
+	return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_NL80211_TESTMODE
+/*
+ * Generic netlink multicast group named "testmode"; its id is used by
+ * cfg80211_testmode_event() as the destination group for testmode
+ * events.
+ */
+static struct genl_multicast_group nl80211_testmode_mcgrp = {
+	.name = "testmode",
+};
+
+/*
+ * Forward a testmode command to the driver.
+ *
+ * The payload of NL80211_ATTR_TESTDATA is passed verbatim to the
+ * driver's testmode_cmd op. While the op runs, rdev->testmode_info
+ * points at the request so that the reply helpers can address the
+ * sender; it is cleared again afterwards.
+ *
+ * Returns -EINVAL without test data, -EOPNOTSUPP without a driver op,
+ * otherwise the op's return value.
+ */
+static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct nlattr *payload = info->attrs[NL80211_ATTR_TESTDATA];
+	int ret;
+
+	if (!payload)
+		return -EINVAL;
+
+	if (!rdev->ops->testmode_cmd)
+		return -EOPNOTSUPP;
+
+	rdev->testmode_info = info;
+	ret = rdev->ops->testmode_cmd(&rdev->wiphy, nla_data(payload),
+				      nla_len(payload));
+	rdev->testmode_info = NULL;
+
+	return ret;
+}
+
+/*
+ * Allocate and pre-fill a netlink message for a testmode reply/event.
+ *
+ * The message gets an nl80211 header for NL80211_CMD_TESTMODE, the
+ * wiphy index, and an opened NL80211_ATTR_TESTDATA nest that the
+ * caller fills in. The rdev, header and nest pointers are stashed in
+ * skb->cb[0..2] so cfg80211_testmode_reply()/cfg80211_testmode_event()
+ * can close the nest and finish the message later.
+ *
+ * @approxlen: expected payload size; 100 bytes are added on top of it
+ *	for the allocation.
+ * @pid, @seq: passed through to nl80211hdr_put().
+ *
+ * Returns the skb, or NULL if allocation or any put operation fails
+ * (the skb is freed in that case).
+ */
+static struct sk_buff *
+__cfg80211_testmode_alloc_skb(struct cfg80211_registered_device *rdev,
+			      int approxlen, u32 pid, u32 seq, gfp_t gfp)
+{
+	struct sk_buff *skb;
+	void *hdr;
+	struct nlattr *data;
+
+	skb = nlmsg_new(approxlen + 100, gfp);
+	if (!skb)
+		return NULL;
+
+	hdr = nl80211hdr_put(skb, pid, seq, 0, NL80211_CMD_TESTMODE);
+	if (!hdr)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+
+	/*
+	 * The nest pointer is later handed to nla_nest_end(); never
+	 * stash a NULL one.
+	 */
+	data = nla_nest_start(skb, NL80211_ATTR_TESTDATA);
+	if (!data)
+		goto nla_put_failure;
+
+	((void **)skb->cb)[0] = rdev;
+	((void **)skb->cb)[1] = hdr;
+	((void **)skb->cb)[2] = data;
+
+	return skb;
+
+ nla_put_failure:
+	kfree_skb(skb);
+	return NULL;
+}
+
+/*
+ * Allocate a reply skb for the testmode command currently being
+ * processed.
+ *
+ * Only valid while rdev->testmode_info is set (a WARN is emitted and
+ * NULL returned otherwise). The reply is addressed using the request's
+ * snd_pid/snd_seq and allocated with GFP_KERNEL.
+ */
+struct sk_buff *cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy,
+						  int approxlen)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	u32 pid, seq;
+
+	if (WARN_ON(!rdev->testmode_info))
+		return NULL;
+
+	pid = rdev->testmode_info->snd_pid;
+	seq = rdev->testmode_info->snd_seq;
+
+	return __cfg80211_testmode_alloc_skb(rdev, approxlen, pid, seq,
+					     GFP_KERNEL);
+}
+EXPORT_SYMBOL(cfg80211_testmode_alloc_reply_skb);
+
+/*
+ * Finish and send a testmode reply.
+ *
+ * Reads the rdev, header and nest pointers back out of skb->cb,
+ * closes the NL80211_ATTR_TESTDATA nest, finalises the message and
+ * sends it to the requester recorded in rdev->testmode_info. If no
+ * request is in flight the skb is freed and -EINVAL returned.
+ */
+int cfg80211_testmode_reply(struct sk_buff *skb)
+{
+	void **cb = (void **)skb->cb;
+	struct cfg80211_registered_device *rdev = cb[0];
+	void *hdr = cb[1];
+	struct nlattr *nest = cb[2];
+
+	if (WARN_ON(!rdev->testmode_info)) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	nla_nest_end(skb, nest);
+	genlmsg_end(skb, hdr);
+
+	return genlmsg_reply(skb, rdev->testmode_info);
+}
+EXPORT_SYMBOL(cfg80211_testmode_reply);
+
+/*
+ * Allocate an skb for a testmode event. Pid and sequence number are
+ * both passed as 0 and the caller chooses the allocation flags.
+ */
+struct sk_buff *cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy,
+						  int approxlen, gfp_t gfp)
+{
+	return __cfg80211_testmode_alloc_skb(wiphy_to_dev(wiphy), approxlen,
+					     0, 0, gfp);
+}
+EXPORT_SYMBOL(cfg80211_testmode_alloc_event_skb);
+
+/*
+ * Finish a testmode event message and multicast it to the "testmode"
+ * group. The header and nest pointers are taken from skb->cb[1] and
+ * skb->cb[2].
+ */
+void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
+{
+	void **cb = (void **)skb->cb;
+	void *hdr = cb[1];
+	struct nlattr *nest = cb[2];
+
+	nla_nest_end(skb, nest);
+	genlmsg_end(skb, hdr);
+
+	genlmsg_multicast(skb, 0, nl80211_testmode_mcgrp.id, gfp);
+}
+EXPORT_SYMBOL(cfg80211_testmode_event);
+#endif
+
+/*
+ * Handle a userspace connect request.
+ *
+ * Builds a struct cfg80211_connect_params from the netlink attributes
+ * and passes it to cfg80211_connect(). A non-empty SSID is mandatory;
+ * auth type (defaults to NL80211_AUTHTYPE_AUTOMATIC), privacy flag,
+ * crypto settings, BSSID, extra IEs, frequency and static keys are
+ * optional. Only station and P2P client interfaces are accepted.
+ *
+ * Returns 0 on success or a negative errno. Parsed connection keys are
+ * freed here if cfg80211_connect() fails.
+ */
+static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr **attrs = info->attrs;
+	struct cfg80211_cached_keys *keys = NULL;
+	struct cfg80211_connect_params connect;
+	int err;
+
+	memset(&connect, 0, sizeof(connect));
+
+	if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE]))
+		return -EINVAL;
+
+	if (!attrs[NL80211_ATTR_SSID] || !nla_len(attrs[NL80211_ATTR_SSID]))
+		return -EINVAL;
+
+	if (!attrs[NL80211_ATTR_AUTH_TYPE]) {
+		connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+	} else {
+		connect.auth_type = nla_get_u32(attrs[NL80211_ATTR_AUTH_TYPE]);
+		if (!nl80211_valid_auth_type(connect.auth_type))
+			return -EINVAL;
+	}
+
+	connect.privacy = info->attrs[NL80211_ATTR_PRIVACY];
+
+	err = nl80211_crypto_settings(rdev, info, &connect.crypto,
+				      NL80211_MAX_NR_CIPHER_SUITES);
+	if (err)
+		return err;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (attrs[NL80211_ATTR_MAC])
+		connect.bssid = nla_data(attrs[NL80211_ATTR_MAC]);
+	connect.ssid = nla_data(attrs[NL80211_ATTR_SSID]);
+	connect.ssid_len = nla_len(attrs[NL80211_ATTR_SSID]);
+
+	if (attrs[NL80211_ATTR_IE]) {
+		connect.ie = nla_data(attrs[NL80211_ATTR_IE]);
+		connect.ie_len = nla_len(attrs[NL80211_ATTR_IE]);
+	}
+
+	if (attrs[NL80211_ATTR_WIPHY_FREQ]) {
+		u32 freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]);
+
+		connect.channel = ieee80211_get_channel(&rdev->wiphy, freq);
+		if (!connect.channel)
+			return -EINVAL;
+		if (connect.channel->flags & IEEE80211_CHAN_DISABLED)
+			return -EINVAL;
+	}
+
+	/* static keys are only looked at when privacy was requested */
+	if (connect.privacy && attrs[NL80211_ATTR_KEYS]) {
+		keys = nl80211_parse_connkeys(rdev, attrs[NL80211_ATTR_KEYS]);
+		if (IS_ERR(keys))
+			return PTR_ERR(keys);
+	}
+
+	err = cfg80211_connect(rdev, dev, &connect, keys);
+	if (!err)
+		return 0;
+
+	kfree(keys);
+	return err;
+}
+
+/*
+ * Handle a userspace disconnect request.
+ *
+ * The reason code comes from NL80211_ATTR_REASON_CODE when present and
+ * defaults to WLAN_REASON_DEAUTH_LEAVING otherwise; 0 is rejected with
+ * -EINVAL. Only station and P2P client interfaces are accepted
+ * (-EOPNOTSUPP otherwise).
+ */
+static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *reason_attr = info->attrs[NL80211_ATTR_REASON_CODE];
+	u16 reason = WLAN_REASON_DEAUTH_LEAVING;
+
+	if (reason_attr)
+		reason = nla_get_u16(reason_attr);
+
+	if (!reason)
+		return -EINVAL;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		return cfg80211_disconnect(rdev, dev, reason, true);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/*
+ * Move a wiphy into the network namespace of the process given by
+ * NL80211_ATTR_PID.
+ *
+ * The namespace reference obtained from get_net_ns_by_pid() is dropped
+ * again before returning. If the wiphy already lives in that namespace
+ * nothing is done and 0 is returned.
+ */
+static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct nlattr *pid_attr = info->attrs[NL80211_ATTR_PID];
+	struct net *target;
+	int err = 0;
+
+	if (!pid_attr)
+		return -EINVAL;
+
+	target = get_net_ns_by_pid(nla_get_u32(pid_attr));
+	if (IS_ERR(target))
+		return PTR_ERR(target);
+
+	/* only switch when the wiphy is not already there */
+	if (!net_eq(wiphy_net(&rdev->wiphy), target))
+		err = cfg80211_switch_netns(rdev, target);
+
+	put_net(target);
+
+	return err;
+}
+
+/*
+ * Shared handler for adding and deleting a PMKSA cache entry.
+ *
+ * NL80211_ATTR_MAC (BSSID) and NL80211_ATTR_PMKID are mandatory. The
+ * generic netlink command in the request header selects the driver op:
+ * NL80211_CMD_SET_PMKSA -> set_pmksa, NL80211_CMD_DEL_PMKSA ->
+ * del_pmksa; any other command triggers a WARN. Only station and P2P
+ * client interfaces are accepted.
+ *
+ * Returns -EINVAL for missing attributes, -EOPNOTSUPP for a wrong
+ * interface type or a missing driver op, otherwise the op's result.
+ */
+static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	int (*handler)(struct wiphy *wiphy, struct net_device *dev,
+		       struct cfg80211_pmksa *pmksa);
+	struct cfg80211_pmksa pmksa;
+
+	memset(&pmksa, 0, sizeof(pmksa));
+
+	if (!info->attrs[NL80211_ATTR_MAC] ||
+	    !info->attrs[NL80211_ATTR_PMKID])
+		return -EINVAL;
+
+	pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
+	pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (info->genlhdr->cmd == NL80211_CMD_SET_PMKSA) {
+		handler = rdev->ops->set_pmksa;
+	} else if (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA) {
+		handler = rdev->ops->del_pmksa;
+	} else {
+		/* this handler is not meant for any other command */
+		WARN_ON(1);
+		handler = NULL;
+	}
+
+	if (!handler)
+		return -EOPNOTSUPP;
+
+	return handler(&rdev->wiphy, dev, &pmksa);
+}
+
+/*
+ * Ask the driver to flush its PMKSA cache.
+ *
+ * Returns -EOPNOTSUPP unless the interface is a station or P2P client
+ * and the driver provides a flush_pmksa op; otherwise the op's result.
+ */
+static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (rdev->ops->flush_pmksa)
+		return rdev->ops->flush_pmksa(&rdev->wiphy, dev);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Handle a userspace remain-on-channel request.
+ *
+ * NL80211_ATTR_WIPHY_FREQ and NL80211_ATTR_DURATION are mandatory;
+ * NL80211_ATTR_WIPHY_CHANNEL_TYPE is optional and defaults to
+ * NL80211_CHAN_NO_HT. On success the cookie returned by the driver's
+ * remain_on_channel op is sent back in NL80211_ATTR_COOKIE.
+ *
+ * Returns -EINVAL for missing/invalid attributes, -EOPNOTSUPP without
+ * a driver op, -ENOMEM/-ENOBUFS if the reply cannot be built, the
+ * driver's error if the op fails, otherwise the result of
+ * genlmsg_reply().
+ */
+static int nl80211_remain_on_channel(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct ieee80211_channel *chan;
+	struct sk_buff *msg;
+	void *hdr;
+	u64 cookie;
+	enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
+	u32 freq, duration;
+	int err;
+
+	if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
+	    !info->attrs[NL80211_ATTR_DURATION])
+		return -EINVAL;
+
+	duration = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]);
+
+	/*
+	 * We should be on that channel for at least one jiffie, and no
+	 * longer than the maximum duration the wiphy advertises.
+	 */
+	if (!duration || !msecs_to_jiffies(duration) ||
+	    duration > rdev->wiphy.max_remain_on_channel_duration)
+		return -EINVAL;
+
+	if (!rdev->ops->remain_on_channel)
+		return -EOPNOTSUPP;
+
+	if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
+		channel_type = nla_get_u32(
+			info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
+		if (channel_type != NL80211_CHAN_NO_HT &&
+		    channel_type != NL80211_CHAN_HT20 &&
+		    channel_type != NL80211_CHAN_HT40PLUS &&
+		    channel_type != NL80211_CHAN_HT40MINUS)
+			return -EINVAL;
+	}
+
+	freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+	chan = rdev_freq_to_chan(rdev, freq, channel_type);
+	if (chan == NULL)
+		return -EINVAL;
+
+	/* build the reply first so the op is not started if we cannot answer */
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_REMAIN_ON_CHANNEL);
+
+	/*
+	 * Test for NULL, as the other nl80211hdr_put() callers in this
+	 * file do; IS_ERR() would let a NULL header through.
+	 */
+	if (!hdr) {
+		err = -ENOBUFS;
+		goto free_msg;
+	}
+
+	err = rdev->ops->remain_on_channel(&rdev->wiphy, dev, chan,
+					   channel_type, duration, &cookie);
+
+	if (err)
+		goto free_msg;
+
+	NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+	err = -ENOBUFS;
+ free_msg:
+	nlmsg_free(msg);
+	return err;
+}
+
+/*
+ * Cancel a pending remain-on-channel operation identified by the
+ * cookie in NL80211_ATTR_COOKIE.
+ *
+ * Returns -EINVAL without a cookie, -EOPNOTSUPP without a driver op,
+ * otherwise the result of the driver's cancel_remain_on_channel op.
+ */
+static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
+					    struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *cookie_attr = info->attrs[NL80211_ATTR_COOKIE];
+
+	if (!cookie_attr)
+		return -EINVAL;
+
+	if (!rdev->ops->cancel_remain_on_channel)
+		return -EOPNOTSUPP;
+
+	return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev,
+						   nla_get_u64(cookie_attr));
+}
+
+/*
+ * Convert a list of rate bytes into a bitmask of indices into the
+ * band's bitrate table.
+ *
+ * Each byte is masked with 0x7f and multiplied by 5 before being
+ * compared against sband->bitrates[].bitrate; bit N of the result is
+ * set when entry N matches.
+ *
+ * Returns 0 as soon as one rate is not found in the band (and also
+ * for an empty list), otherwise the accumulated mask.
+ */
+static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
+			   u8 *rates, u8 rates_len)
+{
+	u32 mask = 0;
+	u8 n;
+
+	for (n = 0; n < rates_len; n++) {
+		int wanted = (rates[n] & 0x7f) * 5;
+		int ridx = 0;
+
+		while (ridx < sband->n_bitrates &&
+		       sband->bitrates[ridx].bitrate != wanted)
+			ridx++;
+
+		if (ridx == sband->n_bitrates)
+			return 0; /* rate not found */
+
+		mask |= 1 << ridx;
+	}
+
+	return mask;
+}
+
+/*
+ * Policy for the per-band attributes nested inside
+ * NL80211_ATTR_TX_RATES: the legacy rate list is binary data of at
+ * most NL80211_MAX_SUPP_RATES bytes.
+ */
+static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
+	[NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
+				    .len = NL80211_MAX_SUPP_RATES },
+};
+
+/*
+ * Handle a userspace request to restrict the TX bitrates.
+ *
+ * The mask starts out with every legacy rate of every supported band
+ * enabled. Each entry nested in NL80211_ATTR_TX_RATES is indexed by
+ * band and may carry NL80211_TXRATE_LEGACY, a list of rates that
+ * replaces the band's default mask.
+ *
+ * Returns -EINVAL if the attribute is missing, a band is out of range
+ * or unsupported, or a rate list resolves to an empty mask;
+ * -EOPNOTSUPP without a driver op; the nla_parse() error if a nested
+ * entry violates nl80211_txattr_policy; otherwise the result of the
+ * driver's set_bitrate_mask op.
+ */
+static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct nlattr *tb[NL80211_TXRATE_MAX + 1];
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct cfg80211_bitrate_mask mask;
+	int rem, i, err;
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *tx_rates;
+	struct ieee80211_supported_band *sband;
+
+	if (info->attrs[NL80211_ATTR_TX_RATES] == NULL)
+		return -EINVAL;
+
+	if (!rdev->ops->set_bitrate_mask)
+		return -EOPNOTSUPP;
+
+	memset(&mask, 0, sizeof(mask));
+	/* Default to all rates enabled */
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+		sband = rdev->wiphy.bands[i];
+		mask.control[i].legacy =
+			sband ? (1 << sband->n_bitrates) - 1 : 0;
+	}
+
+	/*
+	 * The nested attribute uses enum nl80211_band as the index. This maps
+	 * directly to the enum ieee80211_band values used in cfg80211.
+	 */
+	nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem)
+	{
+		enum ieee80211_band band = nla_type(tx_rates);
+		if (band < 0 || band >= IEEE80211_NUM_BANDS)
+			return -EINVAL;
+		sband = rdev->wiphy.bands[band];
+		if (sband == NULL)
+			return -EINVAL;
+		/*
+		 * Reject malformed entries instead of silently ignoring
+		 * them and applying the default mask.
+		 */
+		err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
+				nla_len(tx_rates), nl80211_txattr_policy);
+		if (err)
+			return err;
+		if (tb[NL80211_TXRATE_LEGACY]) {
+			mask.control[band].legacy = rateset_to_mask(
+				sband,
+				nla_data(tb[NL80211_TXRATE_LEGACY]),
+				nla_len(tb[NL80211_TXRATE_LEGACY]));
+			if (mask.control[band].legacy == 0)
+				return -EINVAL;
+		}
+	}
+
+	return rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask);
+}
+
+/*
+ * Register the sending netlink socket for matching management frames.
+ *
+ * NL80211_ATTR_FRAME_MATCH is mandatory and supplies the match data;
+ * NL80211_ATTR_FRAME_TYPE is optional and defaults to a management
+ * action frame. The registration is recorded against info->snd_pid
+ * via cfg80211_mlme_register_mgmt().
+ *
+ * Returns -EINVAL without match data, -EOPNOTSUPP for unsupported
+ * interface types or when the driver has no mgmt_tx op.
+ */
+static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *match = info->attrs[NL80211_ATTR_FRAME_MATCH];
+	struct nlattr *type_attr = info->attrs[NL80211_ATTR_FRAME_TYPE];
+	u16 frame_type;
+
+	if (!match)
+		return -EINVAL;
+
+	if (type_attr)
+		frame_type = nla_get_u16(type_attr);
+	else
+		frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_P2P_GO:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* not much point in registering if we can't reply */
+	if (!rdev->ops->mgmt_tx)
+		return -EOPNOTSUPP;
+
+	return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid,
+					   frame_type, nla_data(match),
+					   nla_len(match));
+}
+
+/*
+ * Handle a userspace request to transmit a management frame.
+ *
+ * NL80211_ATTR_FRAME and NL80211_ATTR_WIPHY_FREQ are mandatory.
+ * NL80211_ATTR_DURATION (wait time, only allowed when the driver has
+ * an mgmt_tx_cancel_wait op), NL80211_ATTR_WIPHY_CHANNEL_TYPE and
+ * NL80211_ATTR_OFFCHANNEL_TX_OK are optional. On success the cookie
+ * produced by cfg80211_mlme_mgmt_tx() is returned in
+ * NL80211_ATTR_COOKIE.
+ *
+ * Returns -EINVAL for missing/invalid attributes, -EOPNOTSUPP without
+ * a driver mgmt_tx op or for unsupported interface types,
+ * -ENOMEM/-ENOBUFS if the reply cannot be built, the transmit error if
+ * sending fails, otherwise the result of genlmsg_reply().
+ */
+static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct ieee80211_channel *chan;
+	enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
+	bool channel_type_valid = false;
+	u32 freq;
+	int err;
+	void *hdr;
+	u64 cookie;
+	struct sk_buff *msg;
+	unsigned int wait = 0;
+	bool offchan;
+
+	if (!info->attrs[NL80211_ATTR_FRAME] ||
+	    !info->attrs[NL80211_ATTR_WIPHY_FREQ])
+		return -EINVAL;
+
+	if (!rdev->ops->mgmt_tx)
+		return -EOPNOTSUPP;
+
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
+
+	if (info->attrs[NL80211_ATTR_DURATION]) {
+		/* a wait time is refused if the driver cannot cancel it */
+		if (!rdev->ops->mgmt_tx_cancel_wait)
+			return -EINVAL;
+		wait = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]);
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
+		channel_type = nla_get_u32(
+			info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
+		if (channel_type != NL80211_CHAN_NO_HT &&
+		    channel_type != NL80211_CHAN_HT20 &&
+		    channel_type != NL80211_CHAN_HT40PLUS &&
+		    channel_type != NL80211_CHAN_HT40MINUS)
+			return -EINVAL;
+		channel_type_valid = true;
+	}
+
+	offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK];
+
+	freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+	chan = rdev_freq_to_chan(rdev, freq, channel_type);
+	if (chan == NULL)
+		return -EINVAL;
+
+	/* build the reply first so the frame is not sent if we cannot answer */
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_FRAME);
+
+	/*
+	 * Test for NULL, as the other nl80211hdr_put() callers in this
+	 * file do; IS_ERR() would let a NULL header through.
+	 */
+	if (!hdr) {
+		err = -ENOBUFS;
+		goto free_msg;
+	}
+	err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, offchan, channel_type,
+				    channel_type_valid, wait,
+				    nla_data(info->attrs[NL80211_ATTR_FRAME]),
+				    nla_len(info->attrs[NL80211_ATTR_FRAME]),
+				    &cookie);
+	if (err)
+		goto free_msg;
+
+	NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+ nla_put_failure:
+	err = -ENOBUFS;
+ free_msg:
+	nlmsg_free(msg);
+	return err;
+}
+
+/*
+ * Cancel the wait period of a previously transmitted management
+ * frame, identified by the cookie in NL80211_ATTR_COOKIE.
+ *
+ * Returns -EINVAL without a cookie, -EOPNOTSUPP without a driver op or
+ * for unsupported interface types, otherwise the result of the
+ * driver's mgmt_tx_cancel_wait op.
+ */
+static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *cookie_attr = info->attrs[NL80211_ATTR_COOKIE];
+
+	if (!cookie_attr)
+		return -EINVAL;
+
+	if (!rdev->ops->mgmt_tx_cancel_wait)
+		return -EOPNOTSUPP;
+
+	switch (dev->ieee80211_ptr->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, dev,
+					      nla_get_u64(cookie_attr));
+}
+
+/*
+ * Handle a userspace request to enable or disable power save.
+ *
+ * NL80211_ATTR_PS_STATE must be NL80211_PS_DISABLED or
+ * NL80211_PS_ENABLED. If the requested state equals the cached
+ * wdev->ps nothing is done; otherwise the driver's set_power_mgmt op
+ * is called with the cached ps_timeout and wdev->ps is updated on
+ * success.
+ *
+ * Returns -EINVAL for a missing/invalid state, -EOPNOTSUPP without a
+ * driver op, 0 if nothing changed, otherwise the op's result.
+ */
+static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct wireless_dev *wdev;
+	struct net_device *dev = info->user_ptr[1];
+	u32 ps_state;
+	bool state;
+	int err;
+
+	if (!info->attrs[NL80211_ATTR_PS_STATE])
+		return -EINVAL;
+
+	/*
+	 * Keep the full 32-bit value: narrowing it to u8 before the
+	 * check would accept values such as 0x100 as a valid state.
+	 */
+	ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
+
+	if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
+		return -EINVAL;
+
+	wdev = dev->ieee80211_ptr;
+
+	if (!rdev->ops->set_power_mgmt)
+		return -EOPNOTSUPP;
+
+	state = (ps_state == NL80211_PS_ENABLED);
+
+	if (state == wdev->ps)
+		return 0;
+
+	err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, state,
+					wdev->ps_timeout);
+	if (!err)
+		wdev->ps = state;
+	return err;
+}
+
+/*
+ * Report the cached power save state (wdev->ps) to userspace as
+ * NL80211_ATTR_PS_STATE.
+ *
+ * The request is refused with -EOPNOTSUPP when the driver has no
+ * set_power_mgmt op. Returns -ENOMEM/-ENOBUFS if the reply cannot be
+ * built, otherwise the result of genlmsg_reply().
+ */
+static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct sk_buff *reply;
+	void *hdr;
+
+	if (!rdev->ops->set_power_mgmt)
+		return -EOPNOTSUPP;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(reply, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_GET_POWER_SAVE);
+	if (!hdr)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(reply, NL80211_ATTR_PS_STATE,
+		    wdev->ps ? NL80211_PS_ENABLED : NL80211_PS_DISABLED);
+
+	genlmsg_end(reply, hdr);
+	return genlmsg_reply(reply, info);
+
+ nla_put_failure:
+	/* header and attribute failures both report -ENOBUFS */
+	nlmsg_free(reply);
+	return -ENOBUFS;
+}
+
+/*
+ * Policy for the attributes nested inside NL80211_ATTR_CQM; all three
+ * are 32-bit values. Used by nl80211_set_cqm() when parsing the nest.
+ */
+static struct nla_policy
+nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = {
+	[NL80211_ATTR_CQM_RSSI_THOLD] = { .type = NLA_U32 },
+	[NL80211_ATTR_CQM_RSSI_HYST] = { .type = NLA_U32 },
+	[NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT] = { .type = NLA_U32 },
+};
+
+/*
+ * Push an RSSI threshold/hysteresis pair to the driver.
+ *
+ * A positive @threshold is rejected with -EINVAL. Returns -EOPNOTSUPP
+ * without a set_cqm_rssi_config op or when the interface is neither a
+ * station nor a P2P client; otherwise the op's result.
+ */
+static int nl80211_set_cqm_rssi(struct genl_info *info,
+				s32 threshold, u32 hysteresis)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (threshold > 0)
+		return -EINVAL;
+
+	if (!rdev->ops->set_cqm_rssi_config)
+		return -EOPNOTSUPP;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
+					      threshold, hysteresis);
+}
+
+/*
+ * Handle a userspace connection quality monitor request.
+ *
+ * Parses the NL80211_ATTR_CQM nest with nl80211_attr_cqm_policy.
+ * Both the RSSI threshold and the hysteresis must be present; they
+ * are then passed to nl80211_set_cqm_rssi().
+ *
+ * Returns -EINVAL if the nest or either value is missing, the parse
+ * error if the nest is malformed, otherwise the helper's result.
+ */
+static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *tb[NL80211_ATTR_CQM_MAX + 1];
+	struct nlattr *cqm = info->attrs[NL80211_ATTR_CQM];
+	s32 threshold;
+	u32 hysteresis;
+	int err;
+
+	if (!cqm)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, NL80211_ATTR_CQM_MAX, cqm,
+			       nl80211_attr_cqm_policy);
+	if (err)
+		return err;
+
+	if (!tb[NL80211_ATTR_CQM_RSSI_THOLD] ||
+	    !tb[NL80211_ATTR_CQM_RSSI_HYST])
+		return -EINVAL;
+
+	/* the threshold travels as u32 but is interpreted as signed */
+	threshold = nla_get_u32(tb[NL80211_ATTR_CQM_RSSI_THOLD]);
+	hysteresis = nla_get_u32(tb[NL80211_ATTR_CQM_RSSI_HYST]);
+
+	return nl80211_set_cqm_rssi(info, threshold, hysteresis);
+}
+
+/*
+ * Handle a userspace request to join a mesh.
+ *
+ * Starts from default_mesh_config/default_mesh_setup, overrides them
+ * with NL80211_ATTR_MESH_CONFIG and NL80211_ATTR_MESH_SETUP when
+ * given, and requires a non-empty NL80211_ATTR_MESH_ID.
+ *
+ * Returns the parser's error, -EINVAL for a missing/empty mesh ID,
+ * otherwise the result of cfg80211_join_mesh().
+ */
+static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct nlattr *id_attr;
+	struct mesh_setup setup;
+	struct mesh_config cfg;
+	int err;
+
+	/* start with default */
+	memcpy(&cfg, &default_mesh_config, sizeof(cfg));
+	memcpy(&setup, &default_mesh_setup, sizeof(setup));
+
+	/* and parse parameters if given */
+	if (info->attrs[NL80211_ATTR_MESH_CONFIG]) {
+		err = nl80211_parse_mesh_config(info, &cfg, NULL);
+		if (err)
+			return err;
+	}
+
+	id_attr = info->attrs[NL80211_ATTR_MESH_ID];
+	if (!id_attr || !nla_len(id_attr))
+		return -EINVAL;
+
+	setup.mesh_id = nla_data(id_attr);
+	setup.mesh_id_len = nla_len(id_attr);
+
+	/* parse additional setup parameters if given */
+	if (info->attrs[NL80211_ATTR_MESH_SETUP]) {
+		err = nl80211_parse_mesh_setup(info, &setup);
+		if (err)
+			return err;
+	}
+
+	return cfg80211_join_mesh(rdev, dev, &setup, &cfg);
+}
+
+/*
+ * Handle a userspace request to leave the mesh; simply forwards the
+ * device pair stored in info->user_ptr[] to cfg80211_leave_mesh().
+ */
+static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+
+	return cfg80211_leave_mesh(rdev, dev);
+}
+
+/*
+ * Report the currently configured WoWLAN triggers to userspace.
+ *
+ * Refused with -EOPNOTSUPP when the wiphy advertises neither WoWLAN
+ * flags nor patterns. When triggers are configured (rdev->wowlan is
+ * set) the reply carries an NL80211_ATTR_WOWLAN_TRIGGERS nest with
+ * one flag per enabled trigger and, if any, the packet patterns as
+ * mask/pattern pairs numbered from 1. Without configured triggers the
+ * reply consists of the header only.
+ *
+ * Returns -ENOMEM/-ENOBUFS if the reply cannot be built, otherwise
+ * the result of genlmsg_reply().
+ */
+static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct nlattr *triggers, *pat_list, *pat_entry;
+	struct cfg80211_wowlan *wow;
+	struct sk_buff *msg;
+	void *hdr;
+	int idx, plen;
+
+	if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns)
+		return -EOPNOTSUPP;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_GET_WOWLAN);
+	if (!hdr)
+		goto nla_put_failure;
+
+	wow = rdev->wowlan;
+	if (!wow)
+		goto send;
+
+	triggers = nla_nest_start(msg, NL80211_ATTR_WOWLAN_TRIGGERS);
+	if (!triggers)
+		goto nla_put_failure;
+
+	if (wow->any)
+		NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_ANY);
+	if (wow->disconnect)
+		NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_DISCONNECT);
+	if (wow->magic_pkt)
+		NLA_PUT_FLAG(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT);
+
+	if (wow->n_patterns) {
+		pat_list = nla_nest_start(msg,
+					  NL80211_WOWLAN_TRIG_PKT_PATTERN);
+		if (!pat_list)
+			goto nla_put_failure;
+
+		for (idx = 0; idx < wow->n_patterns; idx++) {
+			/* nested entries are numbered starting at 1 */
+			pat_entry = nla_nest_start(msg, idx + 1);
+			if (!pat_entry)
+				goto nla_put_failure;
+
+			plen = wow->patterns[idx].pattern_len;
+			/* the mask holds one bit per pattern byte */
+			NLA_PUT(msg, NL80211_WOWLAN_PKTPAT_MASK,
+				DIV_ROUND_UP(plen, 8),
+				wow->patterns[idx].mask);
+			NLA_PUT(msg, NL80211_WOWLAN_PKTPAT_PATTERN,
+				plen,
+				wow->patterns[idx].pattern);
+			nla_nest_end(msg, pat_entry);
+		}
+		nla_nest_end(msg, pat_list);
+	}
+
+	nla_nest_end(msg, triggers);
+
+send:
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+/*
+ * Replace the WoWLAN trigger configuration of a wiphy.
+ *
+ * Refused with -EOPNOTSUPP when the wiphy advertises neither WoWLAN
+ * flags nor patterns. Without NL80211_ATTR_WOWLAN_TRIGGERS, or when
+ * the parsed triggers compare equal to an all-zero configuration, the
+ * stored configuration is freed and rdev->wowlan is set to NULL.
+ * Otherwise every requested trigger must be advertised in
+ * wiphy.wowlan.flags, and patterns must respect the advertised count
+ * and length limits.
+ *
+ * Returns 0 on success, -EINVAL for unsupported or malformed
+ * triggers, -ENOMEM on allocation failure, or the nla_parse() error.
+ * On failure everything allocated here is freed and the previous
+ * configuration is left in place.
+ */
+static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct nlattr *tb[NUM_NL80211_WOWLAN_TRIG];
+	/* all-zero reference used to detect an empty trigger set */
+	struct cfg80211_wowlan no_triggers = {};
+	struct cfg80211_wowlan new_triggers = {};
+	struct wiphy_wowlan_support *wowlan = &rdev->wiphy.wowlan;
+	int err, i;
+
+	if (!rdev->wiphy.wowlan.flags && !rdev->wiphy.wowlan.n_patterns)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS])
+		goto no_triggers;
+
+	err = nla_parse(tb, MAX_NL80211_WOWLAN_TRIG,
+			nla_data(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
+			nla_len(info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]),
+			nl80211_wowlan_policy);
+	if (err)
+		return err;
+
+	/* each flag trigger is only accepted if the wiphy advertises it */
+	if (tb[NL80211_WOWLAN_TRIG_ANY]) {
+		if (!(wowlan->flags & WIPHY_WOWLAN_ANY))
+			return -EINVAL;
+		new_triggers.any = true;
+	}
+
+	if (tb[NL80211_WOWLAN_TRIG_DISCONNECT]) {
+		if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT))
+			return -EINVAL;
+		new_triggers.disconnect = true;
+	}
+
+	if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) {
+		if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT))
+			return -EINVAL;
+		new_triggers.magic_pkt = true;
+	}
+
+	if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) {
+		struct nlattr *pat;
+		int n_patterns = 0;
+		int rem, pat_len, mask_len;
+		struct nlattr *pat_tb[NUM_NL80211_WOWLAN_PKTPAT];
+
+		/* first pass: count the patterns to size the array */
+		nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
+				    rem)
+			n_patterns++;
+		if (n_patterns > wowlan->n_patterns)
+			return -EINVAL;
+
+		new_triggers.patterns = kcalloc(n_patterns,
+						sizeof(new_triggers.patterns[0]),
+						GFP_KERNEL);
+		if (!new_triggers.patterns)
+			return -ENOMEM;
+
+		new_triggers.n_patterns = n_patterns;
+		i = 0;
+
+		/* second pass: validate and copy each mask/pattern pair */
+		nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
+				    rem) {
+			nla_parse(pat_tb, MAX_NL80211_WOWLAN_PKTPAT,
+				  nla_data(pat), nla_len(pat), NULL);
+			err = -EINVAL;
+			if (!pat_tb[NL80211_WOWLAN_PKTPAT_MASK] ||
+			    !pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN])
+				goto error;
+			pat_len = nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]);
+			/* the mask must hold exactly one bit per pattern byte */
+			mask_len = DIV_ROUND_UP(pat_len, 8);
+			if (nla_len(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]) !=
+			    mask_len)
+				goto error;
+			if (pat_len > wowlan->pattern_max_len ||
+			    pat_len < wowlan->pattern_min_len)
+				goto error;
+
+			/*
+			 * Mask and pattern share one allocation; only the
+			 * mask pointer is ever passed to kfree().
+			 */
+			new_triggers.patterns[i].mask =
+				kmalloc(mask_len + pat_len, GFP_KERNEL);
+			if (!new_triggers.patterns[i].mask) {
+				err = -ENOMEM;
+				goto error;
+			}
+			new_triggers.patterns[i].pattern =
+				new_triggers.patterns[i].mask + mask_len;
+			memcpy(new_triggers.patterns[i].mask,
+			       nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_MASK]),
+			       mask_len);
+			new_triggers.patterns[i].pattern_len = pat_len;
+			memcpy(new_triggers.patterns[i].pattern,
+			       nla_data(pat_tb[NL80211_WOWLAN_PKTPAT_PATTERN]),
+			       pat_len);
+			i++;
+		}
+	}
+
+	if (memcmp(&new_triggers, &no_triggers, sizeof(new_triggers))) {
+		struct cfg80211_wowlan *ntrig;
+		/* shallow copy: the pattern array is now referenced by ntrig */
+		ntrig = kmemdup(&new_triggers, sizeof(new_triggers),
+				GFP_KERNEL);
+		if (!ntrig) {
+			err = -ENOMEM;
+			goto error;
+		}
+		cfg80211_rdev_free_wowlan(rdev);
+		rdev->wowlan = ntrig;
+	} else {
+		/* also the jump target when no trigger attribute was sent */
+ no_triggers:
+		cfg80211_rdev_free_wowlan(rdev);
+		rdev->wowlan = NULL;
+	}
+
+	return 0;
+ error:
+	/* entries not yet filled are NULL thanks to kcalloc() */
+	for (i = 0; i < new_triggers.n_patterns; i++)
+		kfree(new_triggers.patterns[i].mask);
+	kfree(new_triggers.patterns);
+	return err;
+}
+
+/*
+ * Bits for genl_ops.internal_flags, evaluated by nl80211_pre_doit()
+ * and nl80211_post_doit().
+ */
+/* look up the rdev from the request and store it in user_ptr[0] */
+#define NL80211_FLAG_NEED_WIPHY		0x01
+/* look up rdev and netdev by ifindex; stored in user_ptr[0]/[1] */
+#define NL80211_FLAG_NEED_NETDEV	0x02
+/* hold the RTNL lock for the duration of the operation */
+#define NL80211_FLAG_NEED_RTNL		0x04
+/* fail with -ENETDOWN unless the netdev is running */
+#define NL80211_FLAG_CHECK_NETDEV_UP	0x08
+/* convenience combination: need a netdev, and it must be running */
+#define NL80211_FLAG_NEED_NETDEV_UP	(NL80211_FLAG_NEED_NETDEV |\
+					 NL80211_FLAG_CHECK_NETDEV_UP)
+
+/*
+ * Common setup run before an nl80211 operation's handler.
+ *
+ * Depending on ops->internal_flags this takes the RTNL lock and
+ * resolves either the rdev alone (NL80211_FLAG_NEED_WIPHY) or the
+ * rdev/netdev pair (NL80211_FLAG_NEED_NETDEV), storing them in
+ * info->user_ptr[0] and info->user_ptr[1]. With
+ * NL80211_FLAG_CHECK_NETDEV_UP a netdev that is not running is
+ * released again and -ENETDOWN returned.
+ *
+ * On any failure the RTNL lock, if taken here, is dropped before
+ * returning the error.
+ */
+static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
+			    struct genl_info *info)
+{
+	const bool need_rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL;
+	struct cfg80211_registered_device *rdev;
+	struct net_device *dev;
+	int err;
+
+	if (need_rtnl)
+		rtnl_lock();
+
+	if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) {
+		rdev = cfg80211_get_dev_from_info(info);
+		if (IS_ERR(rdev)) {
+			err = PTR_ERR(rdev);
+			goto fail;
+		}
+		info->user_ptr[0] = rdev;
+		return 0;
+	}
+
+	/* NEED_WIPHY takes precedence; NEED_NETDEV is only checked here */
+	if (!(ops->internal_flags & NL80211_FLAG_NEED_NETDEV))
+		return 0;
+
+	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
+	if (err)
+		goto fail;
+
+	if ((ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) &&
+	    !netif_running(dev)) {
+		cfg80211_unlock_rdev(rdev);
+		dev_put(dev);
+		err = -ENETDOWN;
+		goto fail;
+	}
+
+	info->user_ptr[0] = rdev;
+	info->user_ptr[1] = dev;
+	return 0;
+
+fail:
+	if (need_rtnl)
+		rtnl_unlock();
+	return err;
+}
+
+/*
+ * Release what nl80211_pre_doit() set up for an operation: unlock the
+ * registered device in info->user_ptr[0], drop the net_device reference in
+ * info->user_ptr[1] and, for NL80211_FLAG_NEED_RTNL operations, release
+ * the RTNL.  Unset pointers are skipped.
+ */
+static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
+			      struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+
+	if (rdev)
+		cfg80211_unlock_rdev(rdev);
+
+	if (dev)
+		dev_put(dev);
+
+	if (ops->internal_flags & NL80211_FLAG_NEED_RTNL)
+		rtnl_unlock();
+}
+
+/*
+ * Generic netlink operation table for nl80211.
+ *
+ * Each entry maps one NL80211_CMD_* command to its .doit and/or .dumpit
+ * handler; all entries validate attributes against nl80211_policy.
+ * Entries without GENL_ADMIN_PERM in .flags are the ones marked below as
+ * retrievable by unprivileged users (plus a few dump/get commands).
+ * .internal_flags carries the NL80211_FLAG_* bits that nl80211_pre_doit()
+ * and nl80211_post_doit() act on around the handler.
+ */
+static struct genl_ops nl80211_ops[] = {
+	{
+		.cmd = NL80211_CMD_GET_WIPHY,
+		.doit = nl80211_get_wiphy,
+		.dumpit = nl80211_dump_wiphy,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_WIPHY,
+	},
+	{
+		.cmd = NL80211_CMD_SET_WIPHY,
+		.doit = nl80211_set_wiphy,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_INTERFACE,
+		.doit = nl80211_get_interface,
+		.dumpit = nl80211_dump_interface,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_NETDEV,
+	},
+	{
+		.cmd = NL80211_CMD_SET_INTERFACE,
+		.doit = nl80211_set_interface,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_NEW_INTERFACE,
+		.doit = nl80211_new_interface,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_INTERFACE,
+		.doit = nl80211_del_interface,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_KEY,
+		.doit = nl80211_get_key,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_KEY,
+		.doit = nl80211_set_key,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_NEW_KEY,
+		.doit = nl80211_new_key,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_KEY,
+		.doit = nl80211_del_key,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_BEACON,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.doit = nl80211_addset_beacon,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_NEW_BEACON,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.doit = nl80211_addset_beacon,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_BEACON,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.doit = nl80211_del_beacon,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_STATION,
+		.doit = nl80211_get_station,
+		.dumpit = nl80211_dump_station,
+		.policy = nl80211_policy,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_STATION,
+		.doit = nl80211_set_station,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_NEW_STATION,
+		.doit = nl80211_new_station,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_STATION,
+		.doit = nl80211_del_station,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_MPATH,
+		.doit = nl80211_get_mpath,
+		.dumpit = nl80211_dump_mpath,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_MPATH,
+		.doit = nl80211_set_mpath,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_NEW_MPATH,
+		.doit = nl80211_new_mpath,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_MPATH,
+		.doit = nl80211_del_mpath,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_BSS,
+		.doit = nl80211_set_bss,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_REG,
+		.doit = nl80211_get_reg,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = NL80211_CMD_SET_REG,
+		.doit = nl80211_set_reg,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NL80211_CMD_REQ_SET_REG,
+		.doit = nl80211_req_set_reg,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NL80211_CMD_GET_MESH_CONFIG,
+		.doit = nl80211_get_mesh_config,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_MESH_CONFIG,
+		.doit = nl80211_update_mesh_config,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_TRIGGER_SCAN,
+		.doit = nl80211_trigger_scan,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		/* dump-only command: no .doit handler and no internal flags */
+		.cmd = NL80211_CMD_GET_SCAN,
+		.policy = nl80211_policy,
+		.dumpit = nl80211_dump_scan,
+	},
+	{
+		.cmd = NL80211_CMD_START_SCHED_SCAN,
+		.doit = nl80211_start_sched_scan,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_STOP_SCHED_SCAN,
+		.doit = nl80211_stop_sched_scan,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_AUTHENTICATE,
+		.doit = nl80211_authenticate,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_ASSOCIATE,
+		.doit = nl80211_associate,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEAUTHENTICATE,
+		.doit = nl80211_deauthenticate,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DISASSOCIATE,
+		.doit = nl80211_disassociate,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_JOIN_IBSS,
+		.doit = nl80211_join_ibss,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_LEAVE_IBSS,
+		.doit = nl80211_leave_ibss,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+#ifdef CONFIG_NL80211_TESTMODE
+	{
+		.cmd = NL80211_CMD_TESTMODE,
+		.doit = nl80211_testmode_do,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+#endif
+	{
+		.cmd = NL80211_CMD_CONNECT,
+		.doit = nl80211_connect,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DISCONNECT,
+		.doit = nl80211_disconnect,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_WIPHY_NETNS,
+		.doit = nl80211_wiphy_netns,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		/* dump-only command: no .doit handler and no internal flags */
+		.cmd = NL80211_CMD_GET_SURVEY,
+		.policy = nl80211_policy,
+		.dumpit = nl80211_dump_survey,
+	},
+	{
+		.cmd = NL80211_CMD_SET_PMKSA,
+		.doit = nl80211_setdel_pmksa,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_DEL_PMKSA,
+		.doit = nl80211_setdel_pmksa,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_FLUSH_PMKSA,
+		.doit = nl80211_flush_pmksa,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
+		.doit = nl80211_remain_on_channel,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
+		.doit = nl80211_cancel_remain_on_channel,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
+		.doit = nl80211_set_tx_bitrate_mask,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_REGISTER_FRAME,
+		.doit = nl80211_register_mgmt,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_FRAME,
+		.doit = nl80211_tx_mgmt,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_FRAME_WAIT_CANCEL,
+		.doit = nl80211_tx_mgmt_cancel_wait,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_POWER_SAVE,
+		.doit = nl80211_set_power_save,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_POWER_SAVE,
+		.doit = nl80211_get_power_save,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_CQM,
+		.doit = nl80211_set_cqm,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_CHANNEL,
+		.doit = nl80211_set_channel,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_WDS_PEER,
+		.doit = nl80211_set_wds_peer,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_JOIN_MESH,
+		.doit = nl80211_join_mesh,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_LEAVE_MESH,
+		.doit = nl80211_leave_mesh,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_WOWLAN,
+		.doit = nl80211_get_wowlan,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_WOWLAN,
+		.doit = nl80211_set_wowlan,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+};
+
+/* multicast group used for the MLME/connection event notifications below */
+static struct genl_multicast_group nl80211_mlme_mcgrp = {
+	.name = "mlme",
+};
+
+/* multicast groups for configuration, scan and regulatory notifications */
+static struct genl_multicast_group nl80211_config_mcgrp = {
+	.name = "config",
+};
+static struct genl_multicast_group nl80211_scan_mcgrp = {
+	.name = "scan",
+};
+static struct genl_multicast_group nl80211_regulatory_mcgrp = {
+	.name = "regulatory",
+};
+
+/* notification functions */
+
+/*
+ * Multicast the wiphy description of @rdev to the "config" group in the
+ * wiphy's network namespace.  Allocation or fill failures are silently
+ * dropped; no notification is sent in that case.
+ */
+void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+
+	if (!skb)
+		return;
+
+	if (nl80211_send_wiphy(skb, 0, 0, 0, rdev) >= 0) {
+		genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+					nl80211_config_mcgrp.id, GFP_KERNEL);
+		return;
+	}
+
+	/* message could not be built */
+	nlmsg_free(skb);
+}
+
+/*
+ * Append the parameters of the current scan request (rdev->scan_req) to
+ * @msg: a nested list of SSIDs, a nested list of channel center
+ * frequencies and, if present, the extra IEs.
+ *
+ * Must be called with the rdev lock held.  Returns 0 on success (or when
+ * there is unexpectedly no scan request) and -ENOBUFS when an attribute
+ * did not fit.
+ *
+ * The caller sends the message even if this fails, so a nest that was
+ * opened but could not be completed is cancelled here.  Otherwise the
+ * nest would stay unterminated and the entries already written into it
+ * would be parsed as top-level attributes by the receiver.
+ */
+static int nl80211_add_scan_req(struct sk_buff *msg,
+				struct cfg80211_registered_device *rdev)
+{
+	struct cfg80211_scan_request *req = rdev->scan_req;
+	struct nlattr *nest = NULL;
+	int i;
+
+	ASSERT_RDEV_LOCK(rdev);
+
+	if (WARN_ON(!req))
+		return 0;
+
+	nest = nla_nest_start(msg, NL80211_ATTR_SCAN_SSIDS);
+	if (!nest)
+		goto nla_put_failure;
+	for (i = 0; i < req->n_ssids; i++)
+		NLA_PUT(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid);
+	nla_nest_end(msg, nest);
+
+	nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES);
+	if (!nest)
+		goto nla_put_failure;
+	for (i = 0; i < req->n_channels; i++)
+		NLA_PUT_U32(msg, i, req->channels[i]->center_freq);
+	nla_nest_end(msg, nest);
+	/* no nest is open any more; nothing to cancel from here on */
+	nest = NULL;
+
+	if (req->ie)
+		NLA_PUT(msg, NL80211_ATTR_IE, req->ie_len, req->ie);
+
+	return 0;
+ nla_put_failure:
+	/* drop a partially filled nest so the message stays well-formed */
+	if (nest)
+		nla_nest_cancel(msg, nest);
+	return -ENOBUFS;
+}
+
+/*
+ * Build a scan event of type @cmd in @msg: nl80211 header, wiphy index,
+ * interface index and the parameters of the current scan request.
+ *
+ * Returns the result of genlmsg_end() on success, -1 if the header could
+ * not be added and -EMSGSIZE if the wiphy or ifindex attribute did not
+ * fit (the partial message is cancelled in that case).
+ */
+static int nl80211_send_scan_msg(struct sk_buff *msg,
+				 struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev,
+				 u32 pid, u32 seq, int flags,
+				 u32 cmd)
+{
+	void *hdr;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags, cmd);
+	if (!hdr)
+		return -1;
+
+	/* the NLA_PUT_* macros jump to nla_put_failure on error */
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+
+	/* ignore errors and send incomplete event anyway */
+	nl80211_add_scan_req(msg, rdev);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Build a scheduled-scan event of type @cmd in @msg, carrying only the
+ * wiphy index and the interface index.
+ *
+ * Returns the result of genlmsg_end() on success, -1 if the header could
+ * not be added and -EMSGSIZE if an attribute did not fit (the partial
+ * message is cancelled in that case).
+ */
+static int
+nl80211_send_sched_scan_msg(struct sk_buff *msg,
+			    struct cfg80211_registered_device *rdev,
+			    struct net_device *netdev,
+			    u32 pid, u32 seq, int flags, u32 cmd)
+{
+	void *hdr;
+
+	hdr = nl80211hdr_put(msg, pid, seq, flags, cmd);
+	if (!hdr)
+		return -1;
+
+	/* the NLA_PUT_* macros jump to nla_put_failure on error */
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+
+	return genlmsg_end(msg, hdr);
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Multicast an NL80211_CMD_TRIGGER_SCAN event for @netdev to the "scan"
+ * group.  Nothing is sent if the message cannot be allocated or built.
+ */
+void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	int ret;
+
+	if (!skb)
+		return;
+
+	ret = nl80211_send_scan_msg(skb, rdev, netdev, 0, 0, 0,
+				    NL80211_CMD_TRIGGER_SCAN);
+	if (ret < 0) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_scan_mcgrp.id, GFP_KERNEL);
+}
+
+/*
+ * Multicast an NL80211_CMD_NEW_SCAN_RESULTS event for @netdev to the
+ * "scan" group.  Nothing is sent if the message cannot be allocated or
+ * built.
+ */
+void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
+			    struct net_device *netdev)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	int ret;
+
+	if (!skb)
+		return;
+
+	ret = nl80211_send_scan_msg(skb, rdev, netdev, 0, 0, 0,
+				    NL80211_CMD_NEW_SCAN_RESULTS);
+	if (ret < 0) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_scan_mcgrp.id, GFP_KERNEL);
+}
+
+/*
+ * Multicast an NL80211_CMD_SCAN_ABORTED event for @netdev to the "scan"
+ * group.  Nothing is sent if the message cannot be allocated or built.
+ */
+void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	int ret;
+
+	if (!skb)
+		return;
+
+	ret = nl80211_send_scan_msg(skb, rdev, netdev, 0, 0, 0,
+				    NL80211_CMD_SCAN_ABORTED);
+	if (ret < 0) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_scan_mcgrp.id, GFP_KERNEL);
+}
+
+/*
+ * Multicast an NL80211_CMD_SCHED_SCAN_RESULTS event for @netdev to the
+ * "scan" group.  Nothing is sent if the message cannot be allocated or
+ * built.
+ */
+void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
+				     struct net_device *netdev)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	int ret;
+
+	if (!skb)
+		return;
+
+	ret = nl80211_send_sched_scan_msg(skb, rdev, netdev, 0, 0, 0,
+					  NL80211_CMD_SCHED_SCAN_RESULTS);
+	if (ret < 0) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_scan_mcgrp.id, GFP_KERNEL);
+}
+
+/*
+ * Multicast a scheduled-scan event of type @cmd for @netdev to the "scan"
+ * group.  Nothing is sent if the message cannot be allocated or built.
+ */
+void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev, u32 cmd)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	int ret;
+
+	if (!skb)
+		return;
+
+	ret = nl80211_send_sched_scan_msg(skb, rdev, netdev, 0, 0, 0, cmd);
+	if (ret < 0) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_scan_mcgrp.id, GFP_KERNEL);
+}
+
+/*
+ * This can happen on global regulatory changes or device specific settings
+ * based on custom world regulatory domains.
+ */
+/*
+ * Multicast an NL80211_CMD_REG_CHANGE event describing @request to the
+ * "regulatory" group in all network namespaces.
+ *
+ * The event always carries the initiator and the regulatory domain type;
+ * the type is derived from the alpha2 code ("00" world, "99" custom
+ * world, "98" or an intersect request intersection, anything else
+ * country).  The alpha2 string is only included for country domains, and
+ * the wiphy index only when it is valid.
+ *
+ * NOTE(review): NLA_PUT_STRING presumably relies on request->alpha2 being
+ * NUL-terminated -- confirm against the struct definition.
+ */
+void nl80211_send_reg_change_event(struct regulatory_request *request)
+{
+	struct sk_buff *msg;
+	void *hdr;
+	bool is_country = false;
+	u8 reg_type;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE);
+	if (!hdr)
+		goto free_msg;
+
+	/* Userspace can always count on this one being set */
+	NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator);
+
+	if (request->alpha2[0] == '0' && request->alpha2[1] == '0') {
+		reg_type = NL80211_REGDOM_TYPE_WORLD;
+	} else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') {
+		reg_type = NL80211_REGDOM_TYPE_CUSTOM_WORLD;
+	} else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
+		   request->intersect) {
+		reg_type = NL80211_REGDOM_TYPE_INTERSECTION;
+	} else {
+		reg_type = NL80211_REGDOM_TYPE_COUNTRY;
+		is_country = true;
+	}
+
+	NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE, reg_type);
+	if (is_country)
+		NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2);
+
+	if (wiphy_idx_valid(request->wiphy_idx))
+		NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx);
+
+	if (genlmsg_end(msg, hdr) < 0)
+		goto free_msg;
+
+	rcu_read_lock();
+	genlmsg_multicast_allns(msg, 0, nl80211_regulatory_mcgrp.id,
+				GFP_ATOMIC);
+	rcu_read_unlock();
+
+	return;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+}
+
+/*
+ * Multicast an MLME event of type @cmd to the "mlme" group.  The event
+ * carries the wiphy index, the interface index and the @len bytes at
+ * @buf as NL80211_ATTR_FRAME.  @gfp is used for both the allocation and
+ * the multicast.  On any failure the message is freed and nothing is
+ * sent.
+ */
+static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
+				    struct net_device *netdev,
+				    const u8 *buf, size_t len,
+				    enum nl80211_commands cmd, gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, cmd);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(skb, NL80211_ATTR_FRAME, len, buf);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/* Send the frame at @buf/@len as an NL80211_CMD_AUTHENTICATE MLME event. */
+void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
+			  struct net_device *netdev, const u8 *buf,
+			  size_t len, gfp_t gfp)
+{
+	nl80211_send_mlme_event(rdev, netdev, buf, len,
+				NL80211_CMD_AUTHENTICATE, gfp);
+}
+
+/* Send the frame at @buf/@len as an NL80211_CMD_ASSOCIATE MLME event. */
+void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
+			   struct net_device *netdev, const u8 *buf,
+			   size_t len, gfp_t gfp)
+{
+	nl80211_send_mlme_event(rdev, netdev, buf, len,
+				NL80211_CMD_ASSOCIATE, gfp);
+}
+
+/* Send the frame at @buf/@len as an NL80211_CMD_DEAUTHENTICATE MLME event. */
+void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
+			 struct net_device *netdev, const u8 *buf,
+			 size_t len, gfp_t gfp)
+{
+	nl80211_send_mlme_event(rdev, netdev, buf, len,
+				NL80211_CMD_DEAUTHENTICATE, gfp);
+}
+
+/* Send the frame at @buf/@len as an NL80211_CMD_DISASSOCIATE MLME event. */
+void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
+			   struct net_device *netdev, const u8 *buf,
+			   size_t len, gfp_t gfp)
+{
+	nl80211_send_mlme_event(rdev, netdev, buf, len,
+				NL80211_CMD_DISASSOCIATE, gfp);
+}
+
+/*
+ * Send the frame at @buf/@len as an NL80211_CMD_UNPROT_DEAUTHENTICATE
+ * MLME event.
+ */
+void nl80211_send_unprot_deauth(struct cfg80211_registered_device *rdev,
+				struct net_device *netdev, const u8 *buf,
+				size_t len, gfp_t gfp)
+{
+	nl80211_send_mlme_event(rdev, netdev, buf, len,
+				NL80211_CMD_UNPROT_DEAUTHENTICATE, gfp);
+}
+
+/*
+ * Send the frame at @buf/@len as an NL80211_CMD_UNPROT_DISASSOCIATE
+ * MLME event.
+ */
+void nl80211_send_unprot_disassoc(struct cfg80211_registered_device *rdev,
+				  struct net_device *netdev, const u8 *buf,
+				  size_t len, gfp_t gfp)
+{
+	nl80211_send_mlme_event(rdev, netdev, buf, len,
+				NL80211_CMD_UNPROT_DISASSOCIATE, gfp);
+}
+
+/*
+ * Multicast a timeout notification for the MLME command @cmd to the
+ * "mlme" group.  The event carries the wiphy index, the interface index,
+ * the NL80211_ATTR_TIMED_OUT flag and the ETH_ALEN-byte address @addr.
+ * On any failure the message is freed and nothing is sent.
+ */
+static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev,
+				      struct net_device *netdev, int cmd,
+				      const u8 *addr, gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, cmd);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT_FLAG(skb, NL80211_ATTR_TIMED_OUT);
+	NLA_PUT(skb, NL80211_ATTR_MAC, ETH_ALEN, addr);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/* Send a timed-out NL80211_CMD_AUTHENTICATE event for the peer @addr. */
+void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev, const u8 *addr,
+			       gfp_t gfp)
+{
+	nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_AUTHENTICATE,
+				  addr, gfp);
+}
+
+/* Send a timed-out NL80211_CMD_ASSOCIATE event for the peer @addr. */
+void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev,
+				struct net_device *netdev, const u8 *addr,
+				gfp_t gfp)
+{
+	nl80211_send_mlme_timeout(rdev, netdev, NL80211_CMD_ASSOCIATE,
+				  addr, gfp);
+}
+
+/*
+ * Multicast an NL80211_CMD_CONNECT event to the "mlme" group.  Besides the
+ * wiphy and interface index it always carries @status; @bssid, the
+ * request IEs and the response IEs are included only when the respective
+ * pointer is non-NULL.  On any failure the message is freed and nothing
+ * is sent.
+ */
+void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, const u8 *bssid,
+				 const u8 *req_ie, size_t req_ie_len,
+				 const u8 *resp_ie, size_t resp_ie_len,
+				 u16 status, gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_CONNECT);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	if (bssid)
+		NLA_PUT(skb, NL80211_ATTR_MAC, ETH_ALEN, bssid);
+	NLA_PUT_U16(skb, NL80211_ATTR_STATUS_CODE, status);
+	if (req_ie)
+		NLA_PUT(skb, NL80211_ATTR_REQ_IE, req_ie_len, req_ie);
+	if (resp_ie)
+		NLA_PUT(skb, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast an NL80211_CMD_ROAM event to the "mlme" group.  The event
+ * carries the wiphy index, the interface index and @bssid; the request
+ * and response IEs are included only when the respective pointer is
+ * non-NULL.  On any failure the message is freed and nothing is sent.
+ */
+void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
+			 struct net_device *netdev, const u8 *bssid,
+			 const u8 *req_ie, size_t req_ie_len,
+			 const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_ROAM);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(skb, NL80211_ATTR_MAC, ETH_ALEN, bssid);
+	if (req_ie)
+		NLA_PUT(skb, NL80211_ATTR_REQ_IE, req_ie_len, req_ie);
+	if (resp_ie)
+		NLA_PUT(skb, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast an NL80211_CMD_DISCONNECT event to the "mlme" group using
+ * GFP_KERNEL.  For a disconnect initiated by the AP (@from_ap) the
+ * NL80211_ATTR_DISCONNECTED_BY_AP flag is set and a non-zero @reason is
+ * reported; @ie is included when non-NULL.  On any failure the message
+ * is freed and nothing is sent.
+ */
+void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev, u16 reason,
+			       const u8 *ie, size_t ie_len, bool from_ap)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_DISCONNECT);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	if (from_ap) {
+		/* the reason code is only reported when it is non-zero */
+		if (reason)
+			NLA_PUT_U16(skb, NL80211_ATTR_REASON_CODE, reason);
+		NLA_PUT_FLAG(skb, NL80211_ATTR_DISCONNECTED_BY_AP);
+	}
+	if (ie)
+		NLA_PUT(skb, NL80211_ATTR_IE, ie_len, ie);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, GFP_KERNEL);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast an NL80211_CMD_JOIN_IBSS event carrying the wiphy index, the
+ * interface index and @bssid to the "mlme" group.  On any failure the
+ * message is freed and nothing is sent.
+ */
+void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev, const u8 *bssid,
+			     gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_JOIN_IBSS);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(skb, NL80211_ATTR_MAC, ETH_ALEN, bssid);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast an NL80211_CMD_NEW_PEER_CANDIDATE event to the "mlme" group.
+ * The event carries the wiphy index, the interface index and @macaddr;
+ * the IEs are included only when @ie is non-NULL and @ie_len is non-zero.
+ * On any failure the message is freed and nothing is sent.
+ */
+void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev,
+		struct net_device *netdev,
+		const u8 *macaddr, const u8* ie, u8 ie_len,
+		gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_NEW_PEER_CANDIDATE);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(skb, NL80211_ATTR_MAC, ETH_ALEN, macaddr);
+	if (ie_len && ie)
+		NLA_PUT(skb, NL80211_ATTR_IE, ie_len, ie);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast an NL80211_CMD_MICHAEL_MIC_FAILURE event to the "mlme" group.
+ * The wiphy index, interface index and @key_type are always present.
+ * @addr is included when non-NULL, @key_id unless it is -1, and the six
+ * bytes at @tsc when non-NULL.  On any failure the message is freed and
+ * nothing is sent.
+ */
+void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, const u8 *addr,
+				 enum nl80211_key_type key_type, int key_id,
+				 const u8 *tsc, gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_MICHAEL_MIC_FAILURE);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	if (addr)
+		NLA_PUT(skb, NL80211_ATTR_MAC, ETH_ALEN, addr);
+	NLA_PUT_U32(skb, NL80211_ATTR_KEY_TYPE, key_type);
+	if (key_id != -1)
+		NLA_PUT_U8(skb, NL80211_ATTR_KEY_IDX, key_id);
+	if (tsc)
+		NLA_PUT(skb, NL80211_ATTR_KEY_SEQ, 6, tsc);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast an NL80211_CMD_REG_BEACON_HINT event to the "regulatory"
+ * group in all network namespaces.  The event carries the wiphy index
+ * and two nested channel descriptions: @channel_before under
+ * NL80211_ATTR_FREQ_BEFORE and @channel_after under
+ * NL80211_ATTR_FREQ_AFTER.  The message is allocated with GFP_ATOMIC; on
+ * any failure it is freed and nothing is sent.
+ */
+void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
+				    struct ieee80211_channel *channel_before,
+				    struct ieee80211_channel *channel_after)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	struct nlattr *nest;
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_REG_BEACON_HINT);
+	if (!hdr)
+		goto free_skb;
+
+	/*
+	 * Since we are applying the beacon hint to a wiphy we know its
+	 * wiphy_idx is valid
+	 */
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy));
+
+	/* channel state before the hint was applied */
+	nest = nla_nest_start(skb, NL80211_ATTR_FREQ_BEFORE);
+	if (!nest)
+		goto nla_put_failure;
+	if (nl80211_msg_put_channel(skb, channel_before))
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+
+	/* channel state after the hint was applied */
+	nest = nla_nest_start(skb, NL80211_ATTR_FREQ_AFTER);
+	if (!nest)
+		goto nla_put_failure;
+	if (nl80211_msg_put_channel(skb, channel_after))
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	rcu_read_lock();
+	genlmsg_multicast_allns(skb, 0, nl80211_regulatory_mcgrp.id,
+				GFP_ATOMIC);
+	rcu_read_unlock();
+
+	return;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+free_skb:
+	nlmsg_free(skb);
+}
+
+/*
+ * Multicast a remain-on-channel event of type @cmd to the "mlme" group.
+ * The event carries the wiphy index, the interface index, the channel's
+ * center frequency, @channel_type and @cookie.  @duration is included
+ * only for NL80211_CMD_REMAIN_ON_CHANNEL.  On any failure the message is
+ * freed and nothing is sent.
+ */
+static void nl80211_send_remain_on_chan_event(
+	int cmd, struct cfg80211_registered_device *rdev,
+	struct net_device *netdev, u64 cookie,
+	struct ieee80211_channel *chan,
+	enum nl80211_channel_type channel_type,
+	unsigned int duration, gfp_t gfp)
+{
+	struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+
+	if (!skb)
+		return;
+
+	hdr = nl80211hdr_put(skb, 0, 0, 0, cmd);
+	if (!hdr)
+		goto free_skb;
+
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY_FREQ, chan->center_freq);
+	NLA_PUT_U32(skb, NL80211_ATTR_WIPHY_CHANNEL_TYPE, channel_type);
+	NLA_PUT_U64(skb, NL80211_ATTR_COOKIE, cookie);
+
+	if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL)
+		NLA_PUT_U32(skb, NL80211_ATTR_DURATION, duration);
+
+	if (genlmsg_end(skb, hdr) < 0)
+		goto free_skb;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), skb, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+ free_skb:
+	nlmsg_free(skb);
+}
+
+void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev,
+				    struct net_device *netdev, u64 cookie,
+				    struct ieee80211_channel *chan,
+				    enum nl80211_channel_type channel_type,
+				    unsigned int duration, gfp_t gfp)
+{
+	nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL,
+					  rdev, netdev, cookie, chan,
+					  channel_type, duration, gfp);
+}
+
+void nl80211_send_remain_on_channel_cancel(
+	struct cfg80211_registered_device *rdev, struct net_device *netdev,
+	u64 cookie, struct ieee80211_channel *chan,
+	enum nl80211_channel_type channel_type, gfp_t gfp)
+{
+	nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
+					  rdev, netdev, cookie, chan,
+					  channel_type, 0, gfp);
+}
+
+/*
+ * Multicast a message built by nl80211_send_station() to MLME listeners.
+ */
+void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
+			    struct net_device *dev, const u8 *mac_addr,
+			    struct station_info *sinfo, gfp_t gfp)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, gfp);
+
+	if (!msg)
+		return;
+
+	if (nl80211_send_station(msg, 0, 0, 0, dev, mac_addr, sinfo) < 0)
+		nlmsg_free(msg);
+	else
+		genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+					nl80211_mlme_mcgrp.id, gfp);
+}
+
+/*
+ * Multicast NL80211_CMD_DEL_STATION for @mac_addr on @dev to MLME listeners
+ * in the wiphy's network namespace.
+ */
+void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev,
+				struct net_device *dev, const u8 *mac_addr,
+				gfp_t gfp)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, gfp);
+	void *hdr;
+
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_STATION);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr);
+
+	if (genlmsg_end(msg, hdr) < 0)
+		goto free_msg;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+ free_msg:
+	nlmsg_free(msg);
+}
+
+/*
+ * Unicast NL80211_CMD_FRAME with @buf to port @nlpid; 0 or negative errno.
+ */
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+		      struct net_device *netdev, u32 nlpid,
+		      int freq, const u8 *buf, size_t len, gfp_t gfp)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	void *hdr;
+	int err;
+
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return -ENOMEM;
+	}
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY_FREQ, freq);
+	NLA_PUT(msg, NL80211_ATTR_FRAME, len, buf);
+
+	err = genlmsg_end(msg, hdr);
+	if (err < 0) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	err = genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlpid);
+	return err < 0 ? err : 0;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+/* Send NL80211_CMD_FRAME_TX_STATUS to MLME listeners in the wiphy's netns. */
+void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, u64 cookie,
+				 const u8 *buf, size_t len, bool ack,
+				 gfp_t gfp)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(msg, NL80211_ATTR_FRAME, len, buf);
+	NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
+	if (ack)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_ACK);
+
+	if (genlmsg_end(msg, hdr) < 0)
+		goto free_msg;
+
+	/* per-wiphy event: use the wiphy's netns like the other events here */
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+ free_msg:
+	nlmsg_free(msg);
+}
+
+/*
+ * Multicast NL80211_CMD_NOTIFY_CQM with @rssi_event nested in NL80211_ATTR_CQM
+ * to MLME listeners in the wiphy's network namespace.
+ */
+void
+nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev,
+			     enum nl80211_cqm_rssi_threshold_event rssi_event,
+			     gfp_t gfp)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, gfp);
+	struct nlattr *pinfoattr;
+	void *hdr;
+
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NOTIFY_CQM);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+
+	pinfoattr = nla_nest_start(msg, NL80211_ATTR_CQM);
+	if (!pinfoattr)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT,
+		    rssi_event);
+
+	nla_nest_end(msg, pinfoattr);
+
+	if (genlmsg_end(msg, hdr) < 0)
+		goto free_msg;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+ free_msg:
+	nlmsg_free(msg);
+}
+
+/*
+ * Multicast NL80211_CMD_NOTIFY_CQM for @peer with @num_packets nested in
+ * NL80211_ATTR_CQM to MLME listeners in the wiphy's network namespace.
+ */
+void
+nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev,
+				struct net_device *netdev, const u8 *peer,
+				u32 num_packets, gfp_t gfp)
+{
+	struct sk_buff *msg = nlmsg_new(NLMSG_GOODSIZE, gfp);
+	struct nlattr *pinfoattr;
+	void *hdr;
+
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NOTIFY_CQM);
+	if (!hdr)
+		goto free_msg;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
+	NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, peer);
+
+	pinfoattr = nla_nest_start(msg, NL80211_ATTR_CQM);
+	if (!pinfoattr)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(msg, NL80211_ATTR_CQM_PKT_LOSS_EVENT, num_packets);
+
+	nla_nest_end(msg, pinfoattr);
+
+	if (genlmsg_end(msg, hdr) < 0)
+		goto free_msg;
+
+	genlmsg_multicast_netns(wiphy_net(&rdev->wiphy), msg, 0,
+				nl80211_mlme_mcgrp.id, gfp);
+	return;
+
+ nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+ free_msg:
+	nlmsg_free(msg);
+}
+
+/*
+ * On NETLINK_URELEASE, unregister the released pid from every wireless_dev.
+ */
+static int nl80211_netlink_notify(struct notifier_block *nb,
+				  unsigned long state, void *_notify)
+{
+	struct netlink_notify *notify = _notify;
+	struct cfg80211_registered_device *rdev;
+	struct wireless_dev *wdev;
+
+	if (state != NETLINK_URELEASE)
+		return NOTIFY_DONE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list)
+		list_for_each_entry_rcu(wdev, &rdev->netdev_list, list)
+			cfg80211_mlme_unregister_socket(wdev, notify->pid);
+	rcu_read_unlock();
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nl80211_netlink_notifier = {
+	.notifier_call = nl80211_netlink_notify,
+};
+
+/* initialisation/exit functions */
+
+/*
+ * Register the nl80211 generic netlink family with its operations, then its
+ * multicast groups and the netlink notifier.  If a later step fails, the
+ * family is unregistered again and the error is returned.
+ */
+int nl80211_init(void)
+{
+	int err;
+
+	err = genl_register_family_with_ops(&nl80211_fam,
+		nl80211_ops, ARRAY_SIZE(nl80211_ops));
+	if (err)
+		return err;
+
+	/* register the groups in order, stopping at the first failure */
+	err = genl_register_mc_group(&nl80211_fam, &nl80211_config_mcgrp);
+	if (!err)
+		err = genl_register_mc_group(&nl80211_fam,
+					     &nl80211_scan_mcgrp);
+	if (!err)
+		err = genl_register_mc_group(&nl80211_fam,
+					     &nl80211_regulatory_mcgrp);
+	if (!err)
+		err = genl_register_mc_group(&nl80211_fam,
+					     &nl80211_mlme_mcgrp);
+#ifdef CONFIG_NL80211_TESTMODE
+	if (!err)
+		err = genl_register_mc_group(&nl80211_fam,
+					     &nl80211_testmode_mcgrp);
+#endif
+	if (!err)
+		err = netlink_register_notifier(&nl80211_netlink_notifier);
+	if (err)
+		goto err_out;
+
+	return 0;
+ err_out:
+	genl_unregister_family(&nl80211_fam);
+	return err;
+}
+
+void nl80211_exit(void)
+{
+	netlink_unregister_notifier(&nl80211_netlink_notifier);
+	genl_unregister_family(&nl80211_fam);
+}
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
new file mode 100644
index 00000000..2f1bfb87
--- /dev/null
+++ b/net/wireless/nl80211.h
@@ -0,0 +1,112 @@
+#ifndef __NET_WIRELESS_NL80211_H
+#define __NET_WIRELESS_NL80211_H
+
+#include "core.h"
+
+int nl80211_init(void);
+void nl80211_exit(void);
+void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev);
+void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev);
+void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
+			    struct net_device *netdev);
+void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev);
+void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev, u32 cmd);
+void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev,
+				     struct net_device *netdev);
+void nl80211_send_reg_change_event(struct regulatory_request *request);
+void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
+			  struct net_device *netdev,
+			  const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
+			   struct net_device *netdev,
+			   const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
+			 struct net_device *netdev,
+			 const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
+			   struct net_device *netdev,
+			   const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_unprot_deauth(struct cfg80211_registered_device *rdev,
+				struct net_device *netdev,
+				const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_unprot_disassoc(struct cfg80211_registered_device *rdev,
+				  struct net_device *netdev,
+				  const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev,
+			       const u8 *addr, gfp_t gfp);
+void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev,
+				struct net_device *netdev,
+				const u8 *addr, gfp_t gfp);
+void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, const u8 *bssid,
+				 const u8 *req_ie, size_t req_ie_len,
+				 const u8 *resp_ie, size_t resp_ie_len,
+				 u16 status, gfp_t gfp);
+void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
+			 struct net_device *netdev, const u8 *bssid,
+			 const u8 *req_ie, size_t req_ie_len,
+			 const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp);
+void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
+			       struct net_device *netdev, u16 reason,
+			       const u8 *ie, size_t ie_len, bool from_ap);
+
+void nl80211_send_new_peer_candidate(struct cfg80211_registered_device *rdev,
+				     struct net_device *netdev,
+				     const u8 *macaddr, const u8* ie, u8 ie_len,
+				     gfp_t gfp);
+void
+nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
+			    struct net_device *netdev, const u8 *addr,
+			    enum nl80211_key_type key_type,
+			    int key_id, const u8 *tsc, gfp_t gfp);
+
+void
+nl80211_send_beacon_hint_event(struct wiphy *wiphy,
+			       struct ieee80211_channel *channel_before,
+			       struct ieee80211_channel *channel_after);
+
+void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev, const u8 *bssid,
+			     gfp_t gfp);
+
+void nl80211_send_remain_on_channel(struct cfg80211_registered_device *rdev,
+				    struct net_device *netdev,
+				    u64 cookie,
+				    struct ieee80211_channel *chan,
+				    enum nl80211_channel_type channel_type,
+				    unsigned int duration, gfp_t gfp);
+void nl80211_send_remain_on_channel_cancel(
+	struct cfg80211_registered_device *rdev, struct net_device *netdev,
+	u64 cookie, struct ieee80211_channel *chan,
+	enum nl80211_channel_type channel_type, gfp_t gfp);
+
+void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
+			    struct net_device *dev, const u8 *mac_addr,
+			    struct station_info *sinfo, gfp_t gfp);
+void nl80211_send_sta_del_event(struct cfg80211_registered_device *rdev,
+				struct net_device *dev, const u8 *mac_addr,
+				gfp_t gfp);
+
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+		      struct net_device *netdev, u32 nlpid, int freq,
+		      const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, u64 cookie,
+				 const u8 *buf, size_t len, bool ack,
+				 gfp_t gfp);
+
+void
+nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev,
+			     struct net_device *netdev,
+			     enum nl80211_cqm_rssi_threshold_event rssi_event,
+			     gfp_t gfp);
+void
+nl80211_send_cqm_pktloss_notify(struct cfg80211_registered_device *rdev,
+				struct net_device *netdev, const u8 *peer,
+				u32 num_packets, gfp_t gfp);
+
+#endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
new file mode 100644
index 00000000..dbe35e13
--- /dev/null
+++ b/net/wireless/radiotap.c
@@ -0,0 +1,357 @@
+/*
+ * Radiotap parser
+ *
+ * Copyright 2007		Andy Green <andy@warmcat.com>
+ * Copyright 2009		Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Alternatively, this software may be distributed under the terms of BSD
+ * license.
+ *
+ * See COPYING for more details.
+ */
+
+#include <linux/kernel.h>
+#include <net/cfg80211.h>
+#include <net/ieee80211_radiotap.h>
+#include <asm/unaligned.h>
+
+/* function prototypes and related defs are in include/net/cfg80211.h */
+
+static const struct radiotap_align_size rtap_namespace_sizes[] = {
+	[IEEE80211_RADIOTAP_TSFT] = { .align = 8, .size = 8, },
+	[IEEE80211_RADIOTAP_FLAGS] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_RATE] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_CHANNEL] = { .align = 2, .size = 4, },
+	[IEEE80211_RADIOTAP_FHSS] = { .align = 2, .size = 2, },
+	[IEEE80211_RADIOTAP_DBM_ANTSIGNAL] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_DBM_ANTNOISE] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_LOCK_QUALITY] = { .align = 2, .size = 2, },
+	[IEEE80211_RADIOTAP_TX_ATTENUATION] = { .align = 2, .size = 2, },
+	[IEEE80211_RADIOTAP_DB_TX_ATTENUATION] = { .align = 2, .size = 2, },
+	[IEEE80211_RADIOTAP_DBM_TX_POWER] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_ANTENNA] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_DB_ANTSIGNAL] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_DB_ANTNOISE] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_RX_FLAGS] = { .align = 2, .size = 2, },
+	[IEEE80211_RADIOTAP_TX_FLAGS] = { .align = 2, .size = 2, },
+	[IEEE80211_RADIOTAP_RTS_RETRIES] = { .align = 1, .size = 1, },
+	[IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, },
+	/*
+	 * add more here as they are defined in radiotap.h
+	 */
+};
+
+static const struct ieee80211_radiotap_namespace radiotap_ns = {
+	.n_bits = ARRAY_SIZE(rtap_namespace_sizes),
+	.align_size = rtap_namespace_sizes,
+};
+
+/**
+ * ieee80211_radiotap_iterator_init - radiotap parser iterator initialization
+ * @iterator: radiotap_iterator to initialize
+ * @radiotap_header: radiotap header to parse
+ * @max_length: total length we can parse into (eg, whole packet length)
+ * @vns: vendor namespaces to parse, stored in the iterator; may be NULL
+ *
+ * Returns: 0 or a negative error code if there is a problem.
+ *
+ * This function initializes an opaque iterator struct which can then
+ * be passed to ieee80211_radiotap_iterator_next() to visit every radiotap
+ * argument which is present in the header.  It knows about extended
+ * present headers and handles them.
+ *
+ * How to use:
+ * call ieee80211_radiotap_iterator_init() to init a semi-opaque iterator
+ * struct ieee80211_radiotap_iterator (no need to init the struct beforehand)
+ * checking for a good 0 return code.  Then loop calling
+ * ieee80211_radiotap_iterator_next()... it returns either 0,
+ * -ENOENT if there are no more args to parse, or -EINVAL if there is a problem.
+ * The iterator's @this_arg member points to the start of the argument
+ * associated with the current argument index that is present, which can be
+ * found in the iterator's @this_arg_index member.  This arg index corresponds
+ * to the IEEE80211_RADIOTAP_... defines.
+ *
+ * Radiotap header length:
+ * You can find the CPU-endian total radiotap header length in
+ * iterator->_max_length after executing ieee80211_radiotap_iterator_init()
+ * successfully.
+ *
+ * Alignment Gotcha:
+ * You must take care when dereferencing iterator.this_arg
+ * for multibyte types... the pointer is not aligned.  Use
+ * get_unaligned((type *)iterator.this_arg) to dereference
+ * iterator.this_arg for type "type" safely on all arches.
+ *
+ * Example code:
+ * See Documentation/networking/radiotap-headers.txt
+ */
+
+int ieee80211_radiotap_iterator_init(
+	struct ieee80211_radiotap_iterator *iterator,
+	struct ieee80211_radiotap_header *radiotap_header,
+	int max_length, const struct ieee80211_radiotap_vendor_namespaces *vns)
+{
+	uint32_t ext;
+
+	/* check the radiotap header can actually be present */
+	if (max_length < (int)sizeof(struct ieee80211_radiotap_header))
+		return -EINVAL;
+
+	/* Linux only supports version 0 radiotap format */
+	if (radiotap_header->it_version)
+		return -EINVAL;
+
+	/* sanity check for allowed length and radiotap length field */
+	if (max_length < get_unaligned_le16(&radiotap_header->it_len))
+		return -EINVAL;
+
+	iterator->_rtheader = radiotap_header;
+	iterator->_max_length = get_unaligned_le16(&radiotap_header->it_len);
+	iterator->_arg_index = 0;
+	iterator->_bitmap_shifter = get_unaligned_le32(&radiotap_header->it_present);
+	iterator->_arg = (uint8_t *)radiotap_header + sizeof(*radiotap_header);
+	iterator->_reset_on_ext = 0;
+	iterator->_next_bitmap = &radiotap_header->it_present;
+	iterator->_next_bitmap++;
+	iterator->_vns = vns;
+	iterator->current_namespace = &radiotap_ns;
+	iterator->is_radiotap_ns = 1;
+
+	/* find payload start allowing for extended bitmap(s) */
+	if (iterator->_bitmap_shifter & (1<<IEEE80211_RADIOTAP_EXT)) {
+		do {
+			/*
+			 * check for insanity where the present bitmaps
+			 * keep claiming to extend up to or even beyond the
+			 * stated radiotap header length: every extended
+			 * bitmap word must fit before it is dereferenced
+			 */
+			if ((unsigned long)iterator->_arg -
+			    (unsigned long)iterator->_rtheader +
+			    sizeof(uint32_t) >
+			    (unsigned long)iterator->_max_length)
+				return -EINVAL;
+
+			ext = get_unaligned_le32(iterator->_arg) &
+				(1 << IEEE80211_RADIOTAP_EXT);
+			iterator->_arg += sizeof(uint32_t);
+		} while (ext);
+	}
+
+	iterator->this_arg = iterator->_arg;
+
+	/* we are all initialized happily */
+
+	return 0;
+}
+EXPORT_SYMBOL(ieee80211_radiotap_iterator_init);
+
+/*
+ * Set current_namespace to the entry of _vns matching @oui/@subns, else NULL.
+ */
+static void find_ns(struct ieee80211_radiotap_iterator *iterator,
+		    uint32_t oui, uint8_t subns)
+{
+	const struct ieee80211_radiotap_vendor_namespaces *vns = iterator->_vns;
+	int i;
+
+	iterator->current_namespace = NULL;
+	if (!vns)
+		return;
+
+	for (i = 0; i < vns->n_ns; i++) {
+		if (vns->ns[i].oui == oui && vns->ns[i].subns == subns) {
+			iterator->current_namespace = &vns->ns[i];
+			return;
+		}
+	}
+}
+
+
+
+/**
+ * ieee80211_radiotap_iterator_next - return next radiotap parser iterator arg
+ * @iterator: radiotap_iterator to move to next arg (if any)
+ *
+ * Returns: 0 if there is an argument to handle,
+ * -ENOENT if there are no more args or -EINVAL
+ * if there is something else wrong.
+ *
+ * This function provides the next radiotap arg index (IEEE80211_RADIOTAP_*)
+ * in @this_arg_index and sets @this_arg to point to the
+ * payload for the field.  It takes care of alignment handling and extended
+ * present fields.  @this_arg can be changed by the caller (eg,
+ * incremented to move inside a compound argument like
+ * IEEE80211_RADIOTAP_CHANNEL).  The args pointed to are in
+ * little-endian format whatever the endianness of your CPU.
+ *
+ * Alignment Gotcha:
+ * You must take care when dereferencing iterator.this_arg
+ * for multibyte types... the pointer is not aligned.  Use
+ * get_unaligned((type *)iterator.this_arg) to dereference
+ * iterator.this_arg for type "type" safely on all arches.
+ */
+
+int ieee80211_radiotap_iterator_next(
+	struct ieee80211_radiotap_iterator *iterator)
+{
+	while (1) {
+		int hit = 0;
+		int pad, align, size, subns;
+		uint32_t oui;
+
+		/* if no more EXT bits, that's it */
+		if ((iterator->_arg_index % 32) == IEEE80211_RADIOTAP_EXT &&
+		    !(iterator->_bitmap_shifter & 1))
+			return -ENOENT;
+
+		if (!(iterator->_bitmap_shifter & 1))
+			goto next_entry; /* arg not present */
+
+		/* get alignment/size of data */
+		switch (iterator->_arg_index % 32) {
+		case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
+		case IEEE80211_RADIOTAP_EXT:
+			align = 1;
+			size = 0;
+			break;
+		case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
+			align = 2;
+			size = 6;
+			break;
+		default:
+			if (!iterator->current_namespace ||
+			    iterator->_arg_index >= iterator->current_namespace->n_bits) {
+				if (iterator->current_namespace == &radiotap_ns)
+					return -ENOENT;
+				align = 0;
+			} else {
+				align = iterator->current_namespace->align_size[iterator->_arg_index].align;
+				size = iterator->current_namespace->align_size[iterator->_arg_index].size;
+			}
+			if (!align) {
+				/* skip all subsequent data */
+				iterator->_arg = iterator->_next_ns_data;
+				/* give up on this namespace */
+				iterator->current_namespace = NULL;
+				goto next_entry;
+			}
+			break;
+		}
+
+		/*
+		 * arg is present, account for alignment padding
+		 *
+		 * Note that these alignments are relative to the start
+		 * of the radiotap header.  There is no guarantee
+		 * that the radiotap header itself is aligned on any
+		 * kind of boundary.
+		 *
+		 * The above is why get_unaligned() is used to dereference
+		 * multibyte elements from the radiotap area.
+		 */
+
+		pad = ((unsigned long)iterator->_arg -
+		       (unsigned long)iterator->_rtheader) & (align - 1);
+
+		if (pad)
+			iterator->_arg += align - pad;
+
+		if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) {
+			int vnslen;
+
+			if ((unsigned long)iterator->_arg + size -
+			    (unsigned long)iterator->_rtheader >
+			    (unsigned long)iterator->_max_length)
+				return -EINVAL;
+
+			oui = (*iterator->_arg << 16) |
+				(*(iterator->_arg + 1) << 8) |
+				*(iterator->_arg + 2);
+			subns = *(iterator->_arg + 3);
+
+			find_ns(iterator, oui, subns);
+
+			vnslen = get_unaligned_le16(iterator->_arg + 4);
+			iterator->_next_ns_data = iterator->_arg + size + vnslen;
+			if (!iterator->current_namespace)
+				size += vnslen;
+		}
+
+		/*
+		 * this is what we will return to user, but we need to
+		 * move on first so next call has something fresh to test
+		 */
+		iterator->this_arg_index = iterator->_arg_index;
+		iterator->this_arg = iterator->_arg;
+		iterator->this_arg_size = size;
+
+		/* internally move on the size of this arg */
+		iterator->_arg += size;
+
+		/*
+		 * check for insanity where we are given a bitmap that
+		 * claims to have more arg content than the length of the
+		 * radiotap section.  We will normally end up equalling this
+		 * max_length on the last arg, never exceeding it.
+		 */
+
+		if ((unsigned long)iterator->_arg -
+		    (unsigned long)iterator->_rtheader >
+		    (unsigned long)iterator->_max_length)
+			return -EINVAL;
+
+		/* these special ones are valid in each bitmap word */
+		switch (iterator->_arg_index % 32) {
+		case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
+			iterator->_reset_on_ext = 1;
+
+			iterator->is_radiotap_ns = 0;
+			/*
+			 * If parser didn't register this vendor
+			 * namespace with us, allow it to show it
+			 * as 'raw'. To do that, set argument index
+			 * to vendor namespace.
+			 */
+			iterator->this_arg_index =
+				IEEE80211_RADIOTAP_VENDOR_NAMESPACE;
+			if (!iterator->current_namespace)
+				hit = 1;
+			goto next_entry;
+		case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
+			iterator->_reset_on_ext = 1;
+			iterator->current_namespace = &radiotap_ns;
+			iterator->is_radiotap_ns = 1;
+			goto next_entry;
+		case IEEE80211_RADIOTAP_EXT:
+			/*
+			 * bit 31 was set, there is more
+			 * -- move to next u32 bitmap
+			 */
+			iterator->_bitmap_shifter =
+				get_unaligned_le32(iterator->_next_bitmap);
+			iterator->_next_bitmap++;
+			if (iterator->_reset_on_ext)
+				iterator->_arg_index = 0;
+			else
+				iterator->_arg_index++;
+			iterator->_reset_on_ext = 0;
+			break;
+		default:
+			/* we've got a hit! */
+			hit = 1;
+ next_entry:
+			iterator->_bitmap_shifter >>= 1;
+			iterator->_arg_index++;
+		}
+
+		/* if we found a valid arg earlier, return it now */
+		if (hit)
+			return 0;
+	}
+}
+EXPORT_SYMBOL(ieee80211_radiotap_iterator_next);
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
new file mode 100644
index 00000000..9bcb6bc2
--- /dev/null
+++ b/net/wireless/reg.c
@@ -0,0 +1,2322 @@
+/*
+ * Copyright 2002-2005, Instant802 Networks, Inc.
+ * Copyright 2005-2006, Devicescape Software, Inc.
+ * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2008	Luis R. Rodriguez <lrodriguz@atheros.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/**
+ * DOC: Wireless regulatory infrastructure
+ *
+ * The usual implementation is for a driver to read a device EEPROM to
+ * determine which regulatory domain it should be operating under, then
+ * looking up the allowable channels in a driver-local table and finally
+ * registering those channels in the wiphy structure.
+ *
+ * Another set of compliance enforcement is for drivers to use their
+ * own compliance limits which can be stored on the EEPROM. The host
+ * driver or firmware may ensure these are used.
+ *
+ * In addition to all this we provide an extra layer of regulatory
+ * conformance. For drivers which do not have any regulatory
+ * information CRDA provides the complete regulatory solution.
+ * For others it provides a community effort on further restrictions
+ * to enhance compliance.
+ *
+ * Note: When number of rules --> infinity we will not be able to
+ * index on alpha2 any more, instead we'll probably have to
+ * rely on some SHA1 checksum of the regdomain for example.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/random.h>
+#include <linux/ctype.h>
+#include <linux/nl80211.h>
+#include <linux/platform_device.h>
+#include <net/cfg80211.h>
+#include "core.h"
+#include "reg.h"
+#include "regdb.h"
+#include "nl80211.h"
+
+#ifdef CONFIG_CFG80211_REG_DEBUG
+#define REG_DBG_PRINT(format, args...) \
+	do { \
+		printk(KERN_DEBUG pr_fmt(format), ##args);	\
+	} while (0)
+#else
+#define REG_DBG_PRINT(args...)
+#endif
+
+static struct regulatory_request core_request_world = {
+	.initiator = NL80211_REGDOM_SET_BY_CORE,
+	.alpha2[0] = '0',
+	.alpha2[1] = '0',
+	.intersect = false,
+	.processed = true,
+	.country_ie_env = ENVIRON_ANY,
+};
+
+/* Receipt of information from last regulatory request */
+static struct regulatory_request *last_request = &core_request_world;
+
+/* To trigger userspace events */
+static struct platform_device *reg_pdev;
+
+static struct device_type reg_device_type = {
+	.uevent = reg_device_uevent,
+};
+
+/*
+ * Central wireless core regulatory domains, we only need two,
+ * the current one and a world regulatory domain in case we have no
+ * information to give us an alpha2
+ */
+const struct ieee80211_regdomain *cfg80211_regdomain;
+
+/*
+ * Protects static reg.c components:
+ *     - cfg80211_world_regdom
+ *     - cfg80211_regdomain
+ *     - last_request
+ */
+static DEFINE_MUTEX(reg_mutex);
+
+static inline void assert_reg_lock(void)
+{
+	lockdep_assert_held(&reg_mutex);
+}
+
+/* Used to queue up regulatory hints */
+static LIST_HEAD(reg_requests_list);
+static spinlock_t reg_requests_lock;
+
+/* Used to queue up beacon hints for review */
+static LIST_HEAD(reg_pending_beacons);
+static spinlock_t reg_pending_beacons_lock;
+
+/* Used to keep track of processed beacon hints */
+static LIST_HEAD(reg_beacon_list);
+
+struct reg_beacon {
+	struct list_head list;
+	struct ieee80211_channel chan;
+};
+
+static void reg_todo(struct work_struct *work);
+static DECLARE_WORK(reg_work, reg_todo);
+
+static void reg_timeout_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reg_timeout, reg_timeout_work);
+
+/* We keep a static world regulatory domain in case of the absence of CRDA */
+static const struct ieee80211_regdomain world_regdom = {
+	.n_reg_rules = 5,
+	.alpha2 =  "00",
+	.reg_rules = {
+		/* IEEE 802.11b/g, channels 1..11 */
+		REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
+		/* IEEE 802.11b/g, channels 12..13. No HT40
+		 * channel fits here. */
+		REG_RULE(2467-10, 2472+10, 20, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN |
+			NL80211_RRF_NO_IBSS),
+		/* IEEE 802.11 channel 14 - Only JP enables
+		 * this and for 802.11b only */
+		REG_RULE(2484-10, 2484+10, 20, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN |
+			NL80211_RRF_NO_IBSS |
+			NL80211_RRF_NO_OFDM),
+		/* IEEE 802.11a, channel 36..48 */
+		REG_RULE(5180-10, 5240+10, 40, 6, 20,
+                        NL80211_RRF_PASSIVE_SCAN |
+                        NL80211_RRF_NO_IBSS),
+
+		/* NB: 5260 MHz - 5700 MHz requires DFS */
+
+		/* IEEE 802.11a, channel 149..165 */
+		REG_RULE(5745-10, 5825+10, 40, 6, 20,
+			NL80211_RRF_PASSIVE_SCAN |
+			NL80211_RRF_NO_IBSS),
+	}
+};
+
+static const struct ieee80211_regdomain *cfg80211_world_regdom =
+	&world_regdom;
+
+static char *ieee80211_regdom = "00";
+static char user_alpha2[2];
+
+module_param(ieee80211_regdom, charp, 0444);
+MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
+
+/*
+ * Free the regdomains unless they are the static world_regdom, fall back to
+ * world_regdom and, on a @full_reset, reset last_request to the core one.
+ */
+static void reset_regdomains(bool full_reset)
+{
+	const struct ieee80211_regdomain *cur = cfg80211_regdomain;
+	const struct ieee80211_regdomain *world = cfg80211_world_regdom;
+
+	/* avoid freeing static information or freeing something twice */
+	if (cur != world && cur != &world_regdom)
+		kfree(cur);
+	if (world != &world_regdom)
+		kfree(world);
+	cfg80211_world_regdom = &world_regdom;
+	cfg80211_regdomain = NULL;
+
+	if (full_reset) {
+		if (last_request != &core_request_world)
+			kfree(last_request);
+		last_request = &core_request_world;
+	}
+}
+
+/*
+ * Install @rd, the dynamic world regulatory domain requested by the wireless
+ * core upon initialization, as both the world and the current regdomain.
+ */
+static void update_world_regdomain(const struct ieee80211_regdomain *rd)
+{
+	BUG_ON(!last_request);
+
+	reset_regdomains(false);
+
+	cfg80211_world_regdom = rd;
+	cfg80211_regdomain = rd;
+}
+
+/* True only for a non-NULL @alpha2 equal to "00", the world regdomain code. */
+bool is_world_regdom(const char *alpha2)
+{
+	if (!alpha2)
+		return false;
+
+	return alpha2[0] == '0' && alpha2[1] == '0';
+}
+
+/* True when @alpha2 is non-NULL and neither of its two characters is NUL. */
+static bool is_alpha2_set(const char *alpha2)
+{
+	if (!alpha2)
+		return false;
+
+	return alpha2[0] != 0 && alpha2[1] != 0;
+}
+
+/*
+ * "99" is the special case where the regulatory domain was built by a driver
+ * but a specific alpha2 cannot be determined.
+ * Returns false for a NULL @alpha2.
+ */
+static bool is_unknown_alpha2(const char *alpha2)
+{
+	if (!alpha2)
+		return false;
+
+	return alpha2[0] == '9' && alpha2[1] == '9';
+}
+
+/*
+ * "98" is the special case where the regulatory domain is the result of an
+ * intersection between two regulatory domain structures.
+ *
+ * Returns false for a NULL @alpha2.
+ */
+static bool is_intersected_alpha2(const char *alpha2)
+{
+	if (!alpha2)
+		return false;
+
+	return alpha2[0] == '9' && alpha2[1] == '8';
+}
+
+/* True when @alpha2 is non-NULL and both characters satisfy isalpha(). */
+static bool is_an_alpha2(const char *alpha2)
+{
+	if (!alpha2)
+		return false;
+
+	return isalpha(alpha2[0]) && isalpha(alpha2[1]);
+}
+
+/* Two-character compare of the codes; NULL never compares equal. */
+static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
+{
+	if (!alpha2_x || !alpha2_y)
+		return false;
+
+	return alpha2_x[0] == alpha2_y[0] &&
+	       alpha2_x[1] == alpha2_y[1];
+}
+
+/*
+ * Would a request for @alpha2 change the regulatory domain currently in
+ * use? The caller must hold cfg80211_mutex.
+ */
+static bool regdom_changes(const char *alpha2)
+{
+	assert_cfg80211_lock();
+
+	/* No domain is set at all, so any alpha2 is a change */
+	if (!cfg80211_regdomain)
+		return true;
+
+	return !alpha2_equal(cfg80211_regdomain->alpha2, alpha2);
+}
+
+/*
+ * The alpha2 of NL80211_REGDOM_SET_BY_USER hints is cached in user_alpha2,
+ * which lets us tell whether a valid regulatory hint from the user has
+ * ever been issued. A cached "97" is reported as "nothing saved".
+ */
+static bool is_user_regdom_saved(void)
+{
+	bool unexpected;
+
+	if (user_alpha2[0] == '9' && user_alpha2[1] == '7')
+		return false;
+
+	/* Neither "00" nor two letters: this would be a design mistake */
+	unexpected = !is_world_regdom(user_alpha2) &&
+		     !is_an_alpha2(user_alpha2);
+	if (WARN(unexpected, "Unexpected user alpha2: %c%c\n",
+		 user_alpha2[0], user_alpha2[1]))
+		return false;
+
+	return true;
+}
+
+/*
+ * Duplicate @src_regd, header and all n_reg_rules rules, into a freshly
+ * kzalloc()ed regulatory domain and hand it back through @dst_regd.
+ *
+ * Returns 0 on success. On allocation failure -ENOMEM is returned and
+ * @dst_regd is not written.
+ */
+static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd,
+			 const struct ieee80211_regdomain *src_regd)
+{
+	struct ieee80211_regdomain *copy;
+	unsigned int idx;
+	int size_of_regd;
+
+	/* header, the rules themselves and one spare (zeroed) rule slot */
+	size_of_regd = sizeof(struct ieee80211_regdomain) +
+	  ((src_regd->n_reg_rules + 1) * sizeof(struct ieee80211_reg_rule));
+
+	copy = kzalloc(size_of_regd, GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	memcpy(copy, src_regd, sizeof(struct ieee80211_regdomain));
+
+	for (idx = 0; idx < src_regd->n_reg_rules; idx++) {
+		memcpy(&copy->reg_rules[idx], &src_regd->reg_rules[idx],
+		       sizeof(struct ieee80211_reg_rule));
+	}
+
+	*dst_regd = copy;
+	return 0;
+}
+
+#ifdef CONFIG_CFG80211_INTERNAL_REGDB
+struct reg_regdb_search_request {	/* one queued lookup in reg_regdb */
+	char alpha2[2];			/* alpha2 to search for, not NUL terminated */
+	struct list_head list;		/* link in reg_regdb_search_list */
+};
+
+static LIST_HEAD(reg_regdb_search_list);	/* filled by reg_regdb_query(), drained by reg_regdb_search() */
+static DEFINE_MUTEX(reg_regdb_search_mutex);	/* held around every access to reg_regdb_search_list */
+
+/*
+ * Work handler: drain reg_regdb_search_list and look every queued alpha2
+ * up in the built-in database (reg_regdb). The first matching entry is
+ * copied and handed to set_regdom(); a request without a match, or whose
+ * copy cannot be allocated, is dropped.
+ *
+ * Lock ordering: reg_regdb_query() is reached through call_crda() with
+ * cfg80211_mutex already held and then takes reg_regdb_search_mutex.
+ * This worker therefore has to take cfg80211_mutex first as well; taking
+ * the two in the opposite order can deadlock against that path.
+ * reg_regdb_search_mutex is only held while a request is unlinked from
+ * the list, so set_regdom() never runs nested inside it.
+ */
+static void reg_regdb_search(struct work_struct *work)
+{
+	struct reg_regdb_search_request *request;
+	const struct ieee80211_regdomain *curdom, *regdom;
+	int i;
+
+	mutex_lock(&cfg80211_mutex);
+
+	for (;;) {
+		request = NULL;
+
+		mutex_lock(&reg_regdb_search_mutex);
+		if (!list_empty(&reg_regdb_search_list)) {
+			request = list_first_entry(&reg_regdb_search_list,
+					struct reg_regdb_search_request,
+					list);
+			list_del(&request->list);
+		}
+		mutex_unlock(&reg_regdb_search_mutex);
+
+		if (!request)
+			break;
+
+		for (i = 0; i < reg_regdb_size; i++) {
+			curdom = reg_regdb[i];
+
+			if (memcmp(request->alpha2, curdom->alpha2, 2))
+				continue;
+
+			/* only the first match is considered */
+			if (!reg_copy_regd(&regdom, curdom))
+				set_regdom(regdom);
+			break;
+		}
+
+		kfree(request);
+	}
+
+	mutex_unlock(&cfg80211_mutex);
+}
+
+static DECLARE_WORK(reg_regdb_work, reg_regdb_search);
+
+/*
+ * Queue a lookup of @alpha2 in the built-in regulatory database and
+ * schedule the work item that performs it. A NULL @alpha2 or a failed
+ * allocation silently drops the query.
+ */
+static void reg_regdb_query(const char *alpha2)
+{
+	struct reg_regdb_search_request *req;
+
+	if (!alpha2)
+		return;
+
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return;
+
+	req->alpha2[0] = alpha2[0];
+	req->alpha2[1] = alpha2[1];
+
+	mutex_lock(&reg_regdb_search_mutex);
+	list_add_tail(&req->list, &reg_regdb_search_list);
+	mutex_unlock(&reg_regdb_search_mutex);
+
+	schedule_work(&reg_regdb_work);
+}
+
+/* Sanity checks on the built-in database; further checks can be added here */
+static void reg_regdb_size_check(void)
+{
+	/*
+	 * BUILD_BUG_ON() would be the ideal check, but random builds would
+	 * then fail, so an empty database only triggers a one-time warning.
+	 */
+	WARN_ONCE(!reg_regdb_size, "db.txt is empty, you should update it...");
+}
+#else
+static inline void reg_regdb_size_check(void) {}	/* no built-in database: nothing to check */
+static inline void reg_regdb_query(const char *alpha2) {}	/* no built-in database: nothing to query */
+#endif /* CONFIG_CFG80211_INTERNAL_REGDB */
+
+/*
+ * Ask userspace for the regulatory domain of @alpha2. Keeping the
+ * regulatory data in userspace lets it be updated as regulations change;
+ * the country information is filled in by reg_device_uevent.
+ *
+ * Returns whatever kobject_uevent() returns.
+ */
+static int call_crda(const char *alpha2)
+{
+	if (is_world_regdom(alpha2))
+		pr_info("Calling CRDA to update world regulatory domain\n");
+	else
+		pr_info("Calling CRDA for country: %c%c\n",
+			alpha2[0], alpha2[1]);
+
+	/* the internal regulatory database, if built in, is asked as well */
+	reg_regdb_query(alpha2);
+
+	return kobject_uevent(&reg_pdev->dev.kobj, KOBJ_CHANGE);
+}
+
+/*
+ * Used by nl80211 before kmalloc'ing our regulatory domain: the request
+ * is valid only when @alpha2 matches the alpha2 of last_request.
+ */
+bool reg_is_valid_request(const char *alpha2)
+{
+	assert_cfg80211_lock();
+
+	return last_request && alpha2_equal(last_request->alpha2, alpha2);
+}
+
+/*
+ * Sanity check on a regulatory rule: both band edges must be set, the
+ * range must not be empty or reversed, and the maximum bandwidth must
+ * fit inside the range.
+ */
+static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
+{
+	const struct ieee80211_freq_range *range = &rule->freq_range;
+	u32 span_khz;
+
+	if (range->start_freq_khz <= 0 || range->end_freq_khz <= 0)
+		return false;
+
+	/* rejects a reversed range as well as start == end */
+	if (range->end_freq_khz <= range->start_freq_khz)
+		return false;
+
+	span_khz = range->end_freq_khz - range->start_freq_khz;
+
+	return !(range->max_bandwidth_khz > span_khz);
+}
+
+/*
+ * A regulatory domain is valid when it carries at least one and at most
+ * NL80211_MAX_SUPP_REG_RULES rules and every rule passes
+ * is_valid_reg_rule().
+ */
+static bool is_valid_rd(const struct ieee80211_regdomain *rd)
+{
+	unsigned int idx;
+
+	if (!rd->n_reg_rules)
+		return false;
+
+	if (WARN_ON(rd->n_reg_rules > NL80211_MAX_SUPP_REG_RULES))
+		return false;
+
+	for (idx = 0; idx < rd->n_reg_rules; idx++) {
+		if (!is_valid_reg_rule(&rd->reg_rules[idx]))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Check that a channel @bw_khz wide and centered on @center_freq_khz
+ * lies completely inside @freq_range.
+ */
+static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range,
+			    u32 center_freq_khz,
+			    u32 bw_khz)
+{
+	u32 low_edge_khz = center_freq_khz - (bw_khz/2);
+	u32 high_edge_khz = center_freq_khz + (bw_khz/2);
+
+	return low_edge_khz >= freq_range->start_freq_khz &&
+	       high_edge_khz <= freq_range->end_freq_khz;
+}
+
+/**
+ * freq_in_rule_band - tells us if a frequency is in a frequency band
+ * @freq_range: frequency rule we want to query
+ * @freq_khz: frequency we are inquiring about
+ *
+ * Decides whether a frequency rule is relevant to the band @freq_khz
+ * belongs to. Bands (the "2.4 GHz band", the "5 GHz band") are artificial
+ * and device specific definitions, so for now a rule is taken to be part
+ * of the frequency's band when its start or its end frequency is off by
+ * no more than 2 GHz. This resolution can be lowered and should be
+ * reconsidered as regulatory rule support for other "bands" is added.
+ */
+static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
+	u32 freq_khz)
+{
+#define ONE_GHZ_IN_KHZ	1000000
+	return abs(freq_khz - freq_range->start_freq_khz) <=
+			(2 * ONE_GHZ_IN_KHZ) ||
+	       abs(freq_khz - freq_range->end_freq_khz) <=
+			(2 * ONE_GHZ_IN_KHZ);
+#undef ONE_GHZ_IN_KHZ
+}
+
+/*
+ * Helper for regdom_intersect() which does the actual arithmetic: the
+ * intersection of @rule1 and @rule2 is written to @intersected_rule.
+ *
+ * Returns 0 when the result passes is_valid_reg_rule() and -EINVAL when
+ * it does not.
+ */
+static int reg_rules_intersect(
+	const struct ieee80211_reg_rule *rule1,
+	const struct ieee80211_reg_rule *rule2,
+	struct ieee80211_reg_rule *intersected_rule)
+{
+	const struct ieee80211_freq_range *fr1 = &rule1->freq_range;
+	const struct ieee80211_freq_range *fr2 = &rule2->freq_range;
+	const struct ieee80211_power_rule *pr1 = &rule1->power_rule;
+	const struct ieee80211_power_rule *pr2 = &rule2->power_rule;
+	struct ieee80211_freq_range *fr_out = &intersected_rule->freq_range;
+	struct ieee80211_power_rule *pr_out = &intersected_rule->power_rule;
+	u32 overlap_khz;
+
+	/* latest start, earliest end, narrowest bandwidth */
+	fr_out->start_freq_khz = max(fr1->start_freq_khz,
+				     fr2->start_freq_khz);
+	fr_out->end_freq_khz = min(fr1->end_freq_khz,
+				   fr2->end_freq_khz);
+	fr_out->max_bandwidth_khz = min(fr1->max_bandwidth_khz,
+					fr2->max_bandwidth_khz);
+
+	/* the bandwidth cannot be wider than the overlap itself */
+	overlap_khz = fr_out->end_freq_khz - fr_out->start_freq_khz;
+	if (fr_out->max_bandwidth_khz > overlap_khz)
+		fr_out->max_bandwidth_khz = overlap_khz;
+
+	/* the lower of the two power limits wins */
+	pr_out->max_eirp = min(pr1->max_eirp, pr2->max_eirp);
+	pr_out->max_antenna_gain = min(pr1->max_antenna_gain,
+				       pr2->max_antenna_gain);
+
+	/* a flag set on either rule is set on the result */
+	intersected_rule->flags = (rule1->flags | rule2->flags);
+
+	return is_valid_reg_rule(intersected_rule) ? 0 : -EINVAL;
+}
+
+/**
+ * regdom_intersect - do the intersection between two regulatory domains
+ * @rd1: first regulatory domain
+ * @rd2: second regulatory domain
+ *
+ * Builds the intersection of @rd1 and @rd2 by intersecting every rule of
+ * the first with every rule of the second. As no single alpha2 can
+ * represent the result, its alpha2 is marked as intersected, "98".
+ *
+ * Returns a kzalloc()ed regulatory domain holding the intersected rules,
+ * or NULL if either argument is NULL, no pair of rules intersects, or
+ * the allocation fails.
+ */
+static struct ieee80211_regdomain *regdom_intersect(
+	const struct ieee80211_regdomain *rd1,
+	const struct ieee80211_regdomain *rd2)
+{
+	unsigned int i, j;
+	unsigned int wanted = 0, filled = 0;
+	int size_of_regd;
+	struct ieee80211_regdomain *rd;
+	/* Scratch rule on the stack, only needed while counting */
+	struct ieee80211_reg_rule scratch;
+
+	if (!rd1 || !rd2)
+		return NULL;
+
+	/*
+	 * Two passes, so that the regdomain is allocated exactly once:
+	 * first count the rule pairs that intersect, then build them.
+	 * reg_rules_intersect() reports -EINVAL for a pair whose result
+	 * makes no sense; every pair it accepts yields a valid rule.
+	 */
+	for (i = 0; i < rd1->n_reg_rules; i++) {
+		for (j = 0; j < rd2->n_reg_rules; j++) {
+			memset(&scratch, 0, sizeof(scratch));
+			if (!reg_rules_intersect(&rd1->reg_rules[i],
+						 &rd2->reg_rules[j],
+						 &scratch))
+				wanted++;
+		}
+	}
+
+	if (!wanted)
+		return NULL;
+
+	/*
+	 * One slot more than counted: a rejected pair is still written to
+	 * the next free slot, which may be the one after the last rule.
+	 */
+	size_of_regd = sizeof(struct ieee80211_regdomain) +
+		((wanted + 1) * sizeof(struct ieee80211_reg_rule));
+
+	rd = kzalloc(size_of_regd, GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	for (i = 0; i < rd1->n_reg_rules; i++) {
+		for (j = 0; j < rd2->n_reg_rules; j++) {
+			/*
+			 * Write straight into the target slot, which saves
+			 * a memcpy(). A rejected result is simply
+			 * overwritten by the next attempt.
+			 */
+			if (!reg_rules_intersect(&rd1->reg_rules[i],
+						 &rd2->reg_rules[j],
+						 &rd->reg_rules[filled]))
+				filled++;
+		}
+	}
+
+	if (filled != wanted) {
+		kfree(rd);
+		return NULL;
+	}
+
+	rd->n_reg_rules = wanted;
+	rd->alpha2[0] = '9';
+	rd->alpha2[1] = '8';
+
+	return rd;
+}
+
+/*
+ * Translate regulatory rule flags into channel flags.
+ *
+ * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
+ * want to just have the channel structure use these
+ */
+static u32 map_regdom_flags(u32 rd_flags)
+{
+	u32 channel_flags = 0;
+
+	channel_flags |= (rd_flags & NL80211_RRF_PASSIVE_SCAN) ?
+				IEEE80211_CHAN_PASSIVE_SCAN : 0;
+	channel_flags |= (rd_flags & NL80211_RRF_NO_IBSS) ?
+				IEEE80211_CHAN_NO_IBSS : 0;
+	channel_flags |= (rd_flags & NL80211_RRF_DFS) ?
+				IEEE80211_CHAN_RADAR : 0;
+
+	return channel_flags;
+}
+
+/*
+ * Find the regulatory rule that allows a @desired_bw_khz wide channel
+ * (20 MHz when 0 is passed) centered on @center_freq.
+ *
+ * The rules searched are those of @custom_regd when given. Otherwise the
+ * driver's own regulatory domain (wiphy->regd) is followed if present,
+ * unless the last request came from a country IE or from the user; in
+ * every remaining case cfg80211_regdomain is used.
+ *
+ * Returns 0 and sets @reg_rule on success, -ERANGE when no rule is in
+ * @center_freq's band at all, and -EINVAL when there is no regulatory
+ * domain or no rule fits the channel.
+ */
+static int freq_reg_info_regd(struct wiphy *wiphy,
+			      u32 center_freq,
+			      u32 desired_bw_khz,
+			      const struct ieee80211_reg_rule **reg_rule,
+			      const struct ieee80211_regdomain *custom_regd)
+{
+	const struct ieee80211_regdomain *regd;
+	bool in_band = false;
+	int idx;
+
+	if (!desired_bw_khz)
+		desired_bw_khz = MHZ_TO_KHZ(20);
+
+	if (custom_regd)
+		regd = custom_regd;
+	else if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+		 last_request->initiator != NL80211_REGDOM_SET_BY_USER &&
+		 wiphy->regd)
+		regd = wiphy->regd;
+	else
+		regd = cfg80211_regdomain;
+
+	if (!regd)
+		return -EINVAL;
+
+	for (idx = 0; idx < regd->n_reg_rules; idx++) {
+		const struct ieee80211_reg_rule *rr = &regd->reg_rules[idx];
+		const struct ieee80211_freq_range *fr = &rr->freq_range;
+
+		/*
+		 * One rule in center_freq's band is all we need to know
+		 * about, so the answer is kept once it has been found.
+		 */
+		if (!in_band)
+			in_band = freq_in_rule_band(fr, center_freq);
+
+		if (in_band &&
+		    reg_does_bw_fit(fr, center_freq, desired_bw_khz)) {
+			*reg_rule = rr;
+			return 0;
+		}
+	}
+
+	return in_band ? -EINVAL : -ERANGE;
+}
+
+/*
+ * Exported lookup of the regulatory rule for @center_freq; same as
+ * freq_reg_info_regd() without a custom regulatory domain. The caller
+ * must hold cfg80211_mutex.
+ */
+int freq_reg_info(struct wiphy *wiphy,
+		  u32 center_freq,
+		  u32 desired_bw_khz,
+		  const struct ieee80211_reg_rule **reg_rule)
+{
+	assert_cfg80211_lock();
+
+	return freq_reg_info_regd(wiphy, center_freq, desired_bw_khz,
+				  reg_rule, NULL);
+}
+EXPORT_SYMBOL(freq_reg_info);
+
+#ifdef CONFIG_CFG80211_REG_DEBUG
+/* Human readable name of @initiator for debug output; warns on unknown values */
+static const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
+{
+	const char *name = "Set by bug";
+
+	switch (initiator) {
+	case NL80211_REGDOM_SET_BY_CORE:
+		name = "Set by core";
+		break;
+	case NL80211_REGDOM_SET_BY_USER:
+		name = "Set by user";
+		break;
+	case NL80211_REGDOM_SET_BY_DRIVER:
+		name = "Set by driver";
+		break;
+	case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+		name = "Set by country IE";
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Debug helper: print which regulatory rule is about to be applied to
+ * @chan for a @desired_bw_khz wide channel. The rule is printed as
+ * "(start - end @ max bandwidth), (antenna gain, EIRP)"; an antenna gain
+ * of 0 is shown as "N/A".
+ */
+static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan,
+				    u32 desired_bw_khz,
+				    const struct ieee80211_reg_rule *reg_rule)
+{
+	const struct ieee80211_power_rule *power_rule;
+	const struct ieee80211_freq_range *freq_range;
+	char max_antenna_gain[32];
+
+	power_rule = &reg_rule->power_rule;
+	freq_range = &reg_rule->freq_range;
+
+	if (!power_rule->max_antenna_gain)
+		snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A");
+	else
+		snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d",
+			 power_rule->max_antenna_gain);
+
+	REG_DBG_PRINT("Updating information on frequency %d MHz "
+		      "for a %d MHz width channel with regulatory rule:\n",
+		      chan->center_freq,
+		      KHZ_TO_MHZ(desired_bw_khz));
+
+	/* the bandwidth conversion and its argument used to be missing */
+	REG_DBG_PRINT("(%d KHz - %d KHz @ %d KHz), (%s mBi, %d mBm)\n",
+		      freq_range->start_freq_khz,
+		      freq_range->end_freq_khz,
+		      freq_range->max_bandwidth_khz,
+		      max_antenna_gain,
+		      power_rule->max_eirp);
+}
+#else
+/* Built without CONFIG_CFG80211_REG_DEBUG: there is nothing to print */
+static void chan_reg_rule_print_dbg(struct ieee80211_channel *chan,
+				    u32 desired_bw_khz,
+				    const struct ieee80211_reg_rule *reg_rule)
+{
+}
+#endif
+
+/*
+ * Apply the current regulatory rules to one channel of @wiphy.
+ *
+ * Note that right now the desired channel bandwidth is assumed to be
+ * 20 MHz for each individual channel (HT40 uses 20 MHz per channel, the
+ * primary and the extension channel). Supporting smaller custom
+ * bandwidths such as 5 MHz or 10 MHz needs a new
+ * ieee80211_channel.target_bw and a re-run of the regulatory check on
+ * the wiphy with that target_bw, which can then simply be used as the
+ * desired bandwidth below.
+ */
+static void handle_channel(struct wiphy *wiphy,
+			   enum nl80211_reg_initiator initiator,
+			   enum ieee80211_band band,
+			   unsigned int chan_idx)
+{
+	const u32 desired_bw_khz = MHZ_TO_KHZ(20);
+	const struct ieee80211_reg_rule *reg_rule = NULL;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_channel *chan;
+	struct wiphy *request_wiphy;
+	u32 orig_flags, rule_flags;
+	int rule_gain, rule_power;
+	int err;
+
+	assert_cfg80211_lock();
+
+	request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+	sband = wiphy->bands[band];
+	BUG_ON(chan_idx >= sband->n_channels);
+	chan = &sband->channels[chan_idx];
+
+	orig_flags = chan->orig_flags;
+
+	err = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq),
+			    desired_bw_khz, &reg_rule);
+	if (err) {
+		/*
+		 * Channels that do not match the received regulatory rules
+		 * are disabled, with one exception: the hint comes from a
+		 * Country IE which had no information about the band. The
+		 * IEEE 802.11 spec allows an AP to send only a subset of
+		 * the regulatory rules allowed, so an AP in the US that
+		 * only supports 2.4 GHz may send a country IE covering
+		 * just the 2.4 GHz band while 5 GHz is still supported.
+		 */
+		if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+		    err == -ERANGE)
+			return;
+
+		REG_DBG_PRINT("Disabling freq %d MHz\n", chan->center_freq);
+		chan->flags = IEEE80211_CHAN_DISABLED;
+		return;
+	}
+
+	chan_reg_rule_print_dbg(chan, desired_bw_khz, reg_rule);
+
+	/* what the rule has to say about this channel */
+	rule_flags = map_regdom_flags(reg_rule->flags);
+	if (reg_rule->freq_range.max_bandwidth_khz < MHZ_TO_KHZ(40))
+		rule_flags |= IEEE80211_CHAN_NO_HT40;
+	rule_gain = (int) MBI_TO_DBI(reg_rule->power_rule.max_antenna_gain);
+	rule_power = (int) MBM_TO_DBM(reg_rule->power_rule.max_eirp);
+
+	if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
+	    request_wiphy && request_wiphy == wiphy &&
+	    request_wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) {
+		/*
+		 * The orig_* values are overwritten too, which guarantees
+		 * the driver's requested regulatory domain will always be
+		 * used as a base for further regulatory settings
+		 */
+		chan->flags = chan->orig_flags = rule_flags;
+		chan->max_antenna_gain = chan->orig_mag = rule_gain;
+		chan->max_power = chan->orig_mpwr = rule_power;
+		return;
+	}
+
+	chan->beacon_found = false;
+	chan->flags = orig_flags | rule_flags;
+	chan->max_antenna_gain = min(chan->orig_mag, rule_gain);
+	/* an original max power of 0 places no limit of its own */
+	if (chan->orig_mpwr)
+		chan->max_power = min(chan->orig_mpwr, rule_power);
+	else
+		chan->max_power = rule_power;
+}
+
+/* Run handle_channel() on every channel of @band, which @wiphy must have */
+static void handle_band(struct wiphy *wiphy,
+			enum ieee80211_band band,
+			enum nl80211_reg_initiator initiator)
+{
+	struct ieee80211_supported_band *sband = wiphy->bands[band];
+	unsigned int chan_idx;
+
+	BUG_ON(!sband);
+
+	for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
+		handle_channel(wiphy, initiator, band, chan_idx);
+}
+
+/*
+ * Decide whether a regulatory update from @initiator should leave @wiphy
+ * alone. Returns true when the update is to be ignored.
+ */
+static bool ignore_reg_update(struct wiphy *wiphy,
+			      enum nl80211_reg_initiator initiator)
+{
+	/* without a last request there is nothing to apply */
+	if (!last_request) {
+		REG_DBG_PRINT("Ignoring regulatory request %s since "
+			      "last_request is not set\n",
+			      reg_initiator_name(initiator));
+		return true;
+	}
+
+	/* core hints do not touch a driver's custom regulatory domain */
+	if ((wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY) &&
+	    initiator == NL80211_REGDOM_SET_BY_CORE) {
+		REG_DBG_PRINT("Ignoring regulatory request %s "
+			      "since the driver uses its own custom "
+			      "regulatory domain ",
+			      reg_initiator_name(initiator));
+		return true;
+	}
+
+	/*
+	 * A strict device waits for its own regulatory domain:
+	 * wiphy->regd will be set once the device has its own desired
+	 * regulatory domain set
+	 */
+	if ((wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) && !wiphy->regd &&
+	    initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+	    !is_world_regdom(last_request->alpha2)) {
+		REG_DBG_PRINT("Ignoring regulatory request %s "
+			      "since the driver requires its own regulaotry "
+			      "domain to be set first",
+			      reg_initiator_name(initiator));
+		return true;
+	}
+
+	return false;
+}
+
+/* Run wiphy_update_regulatory() on every device in cfg80211_rdev_list */
+static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
+{
+	struct cfg80211_registered_device *rdev;
+
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		wiphy_update_regulatory(&rdev->wiphy, initiator);
+	}
+}
+
+/*
+ * Apply a beacon hint to channel @chan_idx of the band @reg_beacon was
+ * seen on.
+ *
+ * Nothing happens unless the channel has the beacon's center frequency
+ * and no beacon was recorded on it before. Otherwise the channel is
+ * marked as beacon_found and, unless the wiphy disables beacon hints,
+ * its passive-scan and no-IBSS restrictions are lifted. If that changed
+ * any flag, a beacon hint event carrying the channel before and after
+ * the change is sent.
+ *
+ * The caller must hold cfg80211_mutex.
+ */
+static void handle_reg_beacon(struct wiphy *wiphy,
+			      unsigned int chan_idx,
+			      struct reg_beacon *reg_beacon)
+{
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_channel *chan;
+	bool channel_changed = false;
+	struct ieee80211_channel chan_before;
+
+	assert_cfg80211_lock();
+
+	sband = wiphy->bands[reg_beacon->chan.band];
+	chan = &sband->channels[chan_idx];
+
+	if (likely(chan->center_freq != reg_beacon->chan.center_freq))
+		return;
+
+	if (chan->beacon_found)
+		return;
+
+	chan->beacon_found = true;
+
+	if (wiphy->flags & WIPHY_FLAG_DISABLE_BEACON_HINTS)
+		return;
+
+	/*
+	 * Snapshot the whole channel. Setting only center_freq and flags
+	 * would hand a mostly uninitialized stack structure to the event
+	 * code below.
+	 */
+	chan_before = *chan;
+
+	if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) {
+		chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN;
+		channel_changed = true;
+	}
+
+	if (chan->flags & IEEE80211_CHAN_NO_IBSS) {
+		chan->flags &= ~IEEE80211_CHAN_NO_IBSS;
+		channel_changed = true;
+	}
+
+	if (channel_changed)
+		nl80211_send_beacon_hint_event(wiphy, &chan_before, chan);
+}
+
+/*
+ * Called when a scan on a wiphy finds a beacon on a new channel: the
+ * hint is offered to every channel of the matching band, if @wiphy has
+ * that band at all.
+ */
+static void wiphy_update_new_beacon(struct wiphy *wiphy,
+				    struct reg_beacon *reg_beacon)
+{
+	struct ieee80211_supported_band *sband;
+	unsigned int chan_idx;
+
+	assert_cfg80211_lock();
+
+	sband = wiphy->bands[reg_beacon->chan.band];
+	if (!sband)
+		return;
+
+	for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
+		handle_reg_beacon(wiphy, chan_idx, reg_beacon);
+}
+
+/*
+ * Called upon reg changes or when a new wiphy is added: every beacon
+ * hint in reg_beacon_list is replayed on the bands @wiphy supports.
+ */
+static void wiphy_update_beacon_reg(struct wiphy *wiphy)
+{
+	struct ieee80211_supported_band *sband;
+	struct reg_beacon *reg_beacon;
+	unsigned int chan_idx;
+
+	assert_cfg80211_lock();
+
+	/* an empty list simply makes this loop a no-op */
+	list_for_each_entry(reg_beacon, &reg_beacon_list, list) {
+		sband = wiphy->bands[reg_beacon->chan.band];
+		if (!sband)
+			continue;
+
+		for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
+			handle_reg_beacon(wiphy, chan_idx, reg_beacon);
+	}
+}
+
+/*
+ * Tell whether @wiphy counts as world roaming: the current regulatory
+ * domain or the wiphy's own one is the world regdomain, or the wiphy uses
+ * a custom regulatory domain and the last request did not come from a
+ * country IE.
+ *
+ * cfg80211_regdomain is NULL while no domain is set (reset_regdomains()
+ * clears it), so it is checked before being dereferenced.
+ */
+static bool reg_is_world_roaming(struct wiphy *wiphy)
+{
+	if (cfg80211_regdomain &&
+	    is_world_regdom(cfg80211_regdomain->alpha2))
+		return true;
+	if (wiphy->regd && is_world_regdom(wiphy->regd->alpha2))
+		return true;
+	if (last_request &&
+	    last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+	    wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY)
+		return true;
+	return false;
+}
+
+/*
+ * Reap the advantages of previously found beacons. Nothing is done while
+ * cfg80211 is just firing up (no last_request, so no beacons can have
+ * been processed yet) or when @wiphy is not world roaming.
+ */
+static void reg_process_beacons(struct wiphy *wiphy)
+{
+	if (last_request && reg_is_world_roaming(wiphy))
+		wiphy_update_beacon_reg(wiphy);
+}
+
+/*
+ * HT40 cannot be used on @chan when the channel does not exist, is
+ * disabled, or has every bit of IEEE80211_CHAN_NO_HT40 set, which is
+ * what happens when regulatory rules disallow HT40 completely.
+ */
+static bool is_ht40_not_allowed(struct ieee80211_channel *chan)
+{
+	if (!chan)
+		return true;
+
+	return (chan->flags & IEEE80211_CHAN_DISABLED) ||
+	       (chan->flags & (IEEE80211_CHAN_NO_HT40)) ==
+			IEEE80211_CHAN_NO_HT40;
+}
+
+/*
+ * Work out the HT40 flags of one channel: HT40- needs a usable channel
+ * 20 MHz below it, HT40+ one 20 MHz above it. The caller must hold
+ * cfg80211_mutex.
+ */
+static void reg_process_ht_flags_channel(struct wiphy *wiphy,
+					 enum ieee80211_band band,
+					 unsigned int chan_idx)
+{
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_channel *channel;
+	struct ieee80211_channel *below = NULL, *above = NULL;
+	unsigned int idx;
+
+	assert_cfg80211_lock();
+
+	sband = wiphy->bands[band];
+	BUG_ON(chan_idx >= sband->n_channels);
+	channel = &sband->channels[chan_idx];
+
+	if (is_ht40_not_allowed(channel)) {
+		channel->flags |= IEEE80211_CHAN_NO_HT40;
+		return;
+	}
+
+	/*
+	 * HT40- and HT40+ need their extension channel to exist, so look
+	 * both of them up (they may well be missing)
+	 */
+	for (idx = 0; idx < sband->n_channels; idx++) {
+		struct ieee80211_channel *candidate = &sband->channels[idx];
+
+		if (candidate->center_freq == (channel->center_freq - 20))
+			below = candidate;
+		if (candidate->center_freq == (channel->center_freq + 20))
+			above = candidate;
+	}
+
+	/*
+	 * Please note that this assumes the target bandwidth is 20 MHz;
+	 * if that ever changes the logic below has to change with it.
+	 */
+	if (!is_ht40_not_allowed(below))
+		channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS;
+	else
+		channel->flags |= IEEE80211_CHAN_NO_HT40MINUS;
+
+	if (!is_ht40_not_allowed(above))
+		channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS;
+	else
+		channel->flags |= IEEE80211_CHAN_NO_HT40PLUS;
+}
+
+/* Update the HT40 flags of every channel of @band, which @wiphy must have */
+static void reg_process_ht_flags_band(struct wiphy *wiphy,
+				      enum ieee80211_band band)
+{
+	struct ieee80211_supported_band *sband = wiphy->bands[band];
+	unsigned int chan_idx;
+
+	BUG_ON(!sband);
+
+	for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
+		reg_process_ht_flags_channel(wiphy, band, chan_idx);
+}
+
+/* Update the HT40 flags on every band @wiphy has; a NULL @wiphy is ignored */
+static void reg_process_ht_flags(struct wiphy *wiphy)
+{
+	enum ieee80211_band band;
+
+	if (!wiphy)
+		return;
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		if (!wiphy->bands[band])
+			continue;
+		reg_process_ht_flags_band(wiphy, band);
+	}
+}
+
+/*
+ * Bring @wiphy in line with the current regulatory settings, unless
+ * ignore_reg_update() says the update does not concern it: the channels
+ * of every band are updated, beacon hints and HT40 flags are reprocessed
+ * and finally the driver's reg_notifier, if any, is called with
+ * last_request.
+ */
+void wiphy_update_regulatory(struct wiphy *wiphy,
+			     enum nl80211_reg_initiator initiator)
+{
+	enum ieee80211_band band;
+
+	if (ignore_reg_update(wiphy, initiator))
+		return;
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		if (!wiphy->bands[band])
+			continue;
+		handle_band(wiphy, band, initiator);
+	}
+
+	reg_process_beacons(wiphy);
+	reg_process_ht_flags(wiphy);
+
+	if (wiphy->reg_notifier)
+		wiphy->reg_notifier(wiphy, last_request);
+}
+
+/*
+ * Apply the custom regulatory domain @regd to one channel of @wiphy. A
+ * channel for which @regd has no rule fitting a 20 MHz wide channel is
+ * disabled; otherwise the rule's flags are added to the channel and its
+ * antenna gain and power limits are taken from the rule. The caller must
+ * hold reg_mutex.
+ */
+static void handle_channel_custom(struct wiphy *wiphy,
+				  enum ieee80211_band band,
+				  unsigned int chan_idx,
+				  const struct ieee80211_regdomain *regd)
+{
+	const u32 desired_bw_khz = MHZ_TO_KHZ(20);
+	const struct ieee80211_reg_rule *reg_rule = NULL;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_channel *chan;
+	u32 rule_flags;
+	int err;
+
+	assert_reg_lock();
+
+	sband = wiphy->bands[band];
+	BUG_ON(chan_idx >= sband->n_channels);
+	chan = &sband->channels[chan_idx];
+
+	err = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq),
+				 desired_bw_khz, &reg_rule, regd);
+	if (err) {
+		REG_DBG_PRINT("Disabling freq %d MHz as custom "
+			      "regd has no rule that fits a %d MHz "
+			      "wide channel\n",
+			      chan->center_freq,
+			      KHZ_TO_MHZ(desired_bw_khz));
+		chan->flags = IEEE80211_CHAN_DISABLED;
+		return;
+	}
+
+	chan_reg_rule_print_dbg(chan, desired_bw_khz, reg_rule);
+
+	rule_flags = map_regdom_flags(reg_rule->flags);
+	if (reg_rule->freq_range.max_bandwidth_khz < MHZ_TO_KHZ(40))
+		rule_flags |= IEEE80211_CHAN_NO_HT40;
+
+	chan->flags |= rule_flags;
+	chan->max_antenna_gain =
+		(int) MBI_TO_DBI(reg_rule->power_rule.max_antenna_gain);
+	chan->max_power = (int) MBM_TO_DBM(reg_rule->power_rule.max_eirp);
+}
+
+/* Apply @regd to every channel of @band, which @wiphy must have */
+static void handle_band_custom(struct wiphy *wiphy, enum ieee80211_band band,
+			       const struct ieee80211_regdomain *regd)
+{
+	struct ieee80211_supported_band *sband = wiphy->bands[band];
+	unsigned int chan_idx;
+
+	BUG_ON(!sband);
+
+	for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
+		handle_channel_custom(wiphy, band, chan_idx, regd);
+}
+
+/*
+ * Used by drivers prior to wiphy registration: apply the custom
+ * regulatory domain @regd to every band @wiphy has. Warns if the wiphy
+ * has no band at all.
+ */
+void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
+				   const struct ieee80211_regdomain *regd)
+{
+	enum ieee80211_band band;
+	unsigned int bands_set = 0;
+
+	mutex_lock(&reg_mutex);
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		if (wiphy->bands[band]) {
+			handle_band_custom(wiphy, band, regd);
+			bands_set++;
+		}
+	}
+	mutex_unlock(&reg_mutex);
+
+	/*
+	 * There is no point in calling this if it cannot have any effect
+	 * on the device's supported bands.
+	 */
+	WARN_ON(!bands_set);
+}
+EXPORT_SYMBOL(wiphy_apply_custom_regulatory);
+
+/*
+ * Return value which can be used by ignore_request() to indicate
+ * it has been determined we should intersect two regulatory domains
+ */
+#define REG_INTERSECT	1
+
+/*
+ * This has the logic which determines when a new request should be
+ * ignored.
+ *
+ * Returns 0 when @pending_request is to be processed, REG_INTERSECT when
+ * it is to be processed by intersecting regulatory domains, and a
+ * negative error code when it is to be ignored. The caller must hold
+ * cfg80211_mutex.
+ */
+static int ignore_request(struct wiphy *wiphy,
+			  struct regulatory_request *pending_request)
+{
+	struct wiphy *last_wiphy = NULL;
+	enum nl80211_reg_initiator prev;
+
+	assert_cfg80211_lock();
+
+	/* All initial requests are respected */
+	if (!last_request)
+		return 0;
+
+	prev = last_request->initiator;
+
+	switch (pending_request->initiator) {
+	case NL80211_REGDOM_SET_BY_CORE:
+		return 0;
+	case NL80211_REGDOM_SET_BY_COUNTRY_IE:
+		last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+		if (unlikely(!is_an_alpha2(pending_request->alpha2)))
+			return -EINVAL;
+		if (prev != NL80211_REGDOM_SET_BY_COUNTRY_IE)
+			return 0;
+
+		if (last_wiphy != wiphy) {
+			/*
+			 * Two cards with two APs claiming different
+			 * Country IE alpha2s. We could intersect them,
+			 * but that seems unlikely to be correct. Reject
+			 * the second one for now.
+			 */
+			if (regdom_changes(pending_request->alpha2))
+				return -EOPNOTSUPP;
+			return -EALREADY;
+		}
+
+		/*
+		 * Two consecutive Country IE hints on the same wiphy.
+		 * This should be picked up early by the driver/stack
+		 */
+		if (WARN_ON(regdom_changes(pending_request->alpha2)))
+			return 0;
+		return -EALREADY;
+	case NL80211_REGDOM_SET_BY_DRIVER:
+		if (prev == NL80211_REGDOM_SET_BY_CORE)
+			return regdom_changes(pending_request->alpha2) ?
+				0 : -EALREADY;
+
+		/*
+		 * This would happen if you unplug and plug your card
+		 * back in or if you add a new device for which the
+		 * previously loaded card also agrees on the regulatory
+		 * domain.
+		 */
+		if (prev == NL80211_REGDOM_SET_BY_DRIVER &&
+		    !regdom_changes(pending_request->alpha2))
+			return -EALREADY;
+
+		return REG_INTERSECT;
+	case NL80211_REGDOM_SET_BY_USER:
+		if (prev == NL80211_REGDOM_SET_BY_COUNTRY_IE)
+			return REG_INTERSECT;
+
+		/*
+		 * If the user knows better the user should set the regdom
+		 * to their country before the IE is picked up
+		 */
+		if (prev == NL80211_REGDOM_SET_BY_USER &&
+		    last_request->intersect)
+			return -EOPNOTSUPP;
+
+		/*
+		 * Process user requests only after previous
+		 * user/driver/core requests have been processed
+		 */
+		if ((prev == NL80211_REGDOM_SET_BY_CORE ||
+		     prev == NL80211_REGDOM_SET_BY_DRIVER ||
+		     prev == NL80211_REGDOM_SET_BY_USER) &&
+		    regdom_changes(last_request->alpha2))
+			return -EAGAIN;
+
+		if (!regdom_changes(pending_request->alpha2))
+			return -EALREADY;
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Mark last_request as processed. The timeout work is cancelled when the
+ * request came from the user, and the regulatory work is scheduled again
+ * when more requests are waiting in reg_requests_list.
+ */
+static void reg_set_request_processed(void)
+{
+	bool more_pending;
+
+	last_request->processed = true;
+
+	spin_lock(&reg_requests_lock);
+	more_pending = !list_empty(&reg_requests_list);
+	spin_unlock(&reg_requests_lock);
+
+	if (last_request->initiator == NL80211_REGDOM_SET_BY_USER)
+		cancel_delayed_work_sync(&reg_timeout);
+
+	if (more_pending)
+		schedule_work(&reg_work);
+}
+
+/**
+ * __regulatory_hint - hint to the wireless core a regulatory domain
+ * @wiphy: if the hint comes from country information from an AP, this
+ *	is required to be set to the wiphy that received the information
+ * @pending_request: the regulatory request currently being processed
+ *
+ * The Wireless subsystem can use this function to hint to the wireless core
+ * what it believes should be the current regulatory domain.
+ *
+ * An accepted @pending_request becomes last_request; a rejected one is
+ * freed here.
+ *
+ * Returns zero if all went fine, %-EALREADY if a regulatory domain had
+ * already been set or other standard error codes.
+ *
+ * Caller must hold &cfg80211_mutex and &reg_mutex
+ */
+static int __regulatory_hint(struct wiphy *wiphy,
+			     struct regulatory_request *pending_request)
+{
+	bool intersect = false;
+	bool from_driver;
+	int r, err;
+
+	assert_cfg80211_lock();
+
+	r = ignore_request(wiphy, pending_request);
+	from_driver = pending_request->initiator ==
+		      NL80211_REGDOM_SET_BY_DRIVER;
+
+	if (r == REG_INTERSECT)
+		intersect = true;
+	else if (r && !(r == -EALREADY && from_driver))
+		goto drop_request;
+
+	/*
+	 * A driver hint which leads to an intersection, or which asks for
+	 * the regulatory domain that has already been set, gets the
+	 * current regulatory domain copied to the wiphy
+	 */
+	if (r && from_driver) {
+		err = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
+		if (err) {
+			r = err;
+			goto drop_request;
+		}
+	}
+
+	/* the pending request now takes the place of last_request */
+	if (last_request != &core_request_world)
+		kfree(last_request);
+
+	last_request = pending_request;
+	last_request->intersect = intersect;
+
+	if (last_request->initiator == NL80211_REGDOM_SET_BY_USER) {
+		user_alpha2[0] = last_request->alpha2[0];
+		user_alpha2[1] = last_request->alpha2[1];
+	}
+
+	/* When r == REG_INTERSECT we do need to call CRDA */
+	if (r >= 0)
+		return call_crda(last_request->alpha2);
+
+	/*
+	 * Since CRDA will not be called in this case as we already
+	 * have applied the requested regulatory domain before we just
+	 * inform userspace we have processed the request
+	 */
+	if (r == -EALREADY) {
+		nl80211_send_reg_change_event(last_request);
+		reg_set_request_processed();
+	}
+	return r;
+
+drop_request:
+	kfree(pending_request);
+	return r;
+}
+
+/* This processes *all* regulatory hints; it takes ownership of @reg_request */
+static void reg_process_hint(struct regulatory_request *reg_request)
+{
+	int r = 0;
+	struct wiphy *wiphy = NULL;
+	/* Saved up front: __regulatory_hint() may kfree() reg_request */
+	enum nl80211_reg_initiator initiator = reg_request->initiator;
+
+	BUG_ON(!reg_request->alpha2);
+
+	if (wiphy_idx_valid(reg_request->wiphy_idx))
+		wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
+
+	if (initiator == NL80211_REGDOM_SET_BY_DRIVER && !wiphy) {
+		kfree(reg_request);
+		return;
+	}
+
+	r = __regulatory_hint(wiphy, reg_request);
+	/* This is required so that the orig_* parameters are saved */
+	if (r == -EALREADY && wiphy &&
+	    wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY) {
+		wiphy_update_regulatory(wiphy, initiator);
+		return;
+	}
+
+	/*
+	 * We only time out user hints, given that they should be the only
+	 * source of bogus requests. Test the saved initiator: reg_request
+	 * may already have been freed by __regulatory_hint().
+	 */
+	if (r != -EALREADY && initiator == NL80211_REGDOM_SET_BY_USER)
+		schedule_delayed_work(&reg_timeout, msecs_to_jiffies(3142));
+}
+
+/*
+ * Pops the oldest queued regulatory hint (any NL80211_REGDOM_SET_BY_*
+ * initiator) and processes it. Hints are served first come first serve,
+ * one per invocation, with cfg80211_mutex and reg_mutex both held.
+ */
+static void reg_process_pending_hints(void)
+{
+	struct regulatory_request *next_req = NULL;
+
+	mutex_lock(&cfg80211_mutex);
+	mutex_lock(&reg_mutex);
+
+	/* When last_request->processed becomes true this will be rescheduled */
+	if (last_request && !last_request->processed) {
+		REG_DBG_PRINT("Pending regulatory request, waiting "
+			      "for it to be processed...");
+		goto out;
+	}
+
+	spin_lock(&reg_requests_lock);
+
+	/* Detach the head of the queue while the spinlock is held */
+	if (!list_empty(&reg_requests_list)) {
+		next_req = list_first_entry(&reg_requests_list,
+					    struct regulatory_request,
+					    list);
+		list_del_init(&next_req->list);
+	}
+
+	spin_unlock(&reg_requests_lock);
+
+	/* An empty queue leaves next_req NULL: nothing to process */
+	if (next_req)
+		reg_process_hint(next_req);
+
+out:
+	mutex_unlock(&reg_mutex);
+	mutex_unlock(&cfg80211_mutex);
+}
+
+/*
+ * Processes beacon hints -- this has nothing to do with country IEs.
+ *
+ * Each hint on reg_pending_beacons is applied to every registered wiphy
+ * and then moved over to reg_beacon_list.
+ */
+static void reg_process_pending_beacon_hints(void)
+{
+	struct cfg80211_registered_device *rdev;
+	struct reg_beacon *hint, *next;
+
+	/*
+	 * reg_mutex is not needed here: we only touch wiphys and do not
+	 * read or access regulatory variables.
+	 */
+	mutex_lock(&cfg80211_mutex);
+
+	/* Drain the _pending_ beacon list; the loop is a no-op if it is empty */
+	spin_lock_bh(&reg_pending_beacons_lock);
+
+	list_for_each_entry_safe(hint, next,
+				 &reg_pending_beacons, list) {
+		/* Take the hint off the pending list first */
+		list_del_init(&hint->list);
+
+		/* Applies the beacon hint to current wiphys */
+		list_for_each_entry(rdev, &cfg80211_rdev_list, list)
+			wiphy_update_new_beacon(&rdev->wiphy, hint);
+
+		/* Remembers the beacon hint for new wiphys or reg changes */
+		list_add_tail(&hint->list, &reg_beacon_list);
+	}
+
+	spin_unlock_bh(&reg_pending_beacons_lock);
+
+	mutex_unlock(&cfg80211_mutex);
+}
+
+static void reg_todo(struct work_struct *work)
+{
+	reg_process_pending_hints();
+	reg_process_pending_beacon_hints();
+}
+
+/* Upper-cases the alpha2, appends @request to the hint queue, kicks reg_work */
+static void queue_regulatory_request(struct regulatory_request *request)
+{
+	int i;
+
+	for (i = 0; i < 2; i++)
+		if (isalpha(request->alpha2[i]))
+			request->alpha2[i] = toupper(request->alpha2[i]);
+	spin_lock(&reg_requests_lock);
+	list_add_tail(&request->list, &reg_requests_list);
+	spin_unlock(&reg_requests_lock);
+	schedule_work(&reg_work);
+}
+
+/*
+ * Core regulatory hint -- queued during cfg80211_init() and whenever
+ * regulatory settings are restored.
+ *
+ * Returns 0 once the request is queued, -ENOMEM if it cannot be allocated.
+ */
+static int regulatory_hint_core(const char *alpha2)
+{
+	struct regulatory_request *core_req;
+
+	core_req = kzalloc(sizeof(*core_req), GFP_KERNEL);
+	if (!core_req)
+		return -ENOMEM;
+
+	core_req->initiator = NL80211_REGDOM_SET_BY_CORE;
+	core_req->alpha2[0] = alpha2[0];
+	core_req->alpha2[1] = alpha2[1];
+
+	queue_regulatory_request(core_req);
+	return 0;
+}
+
+/* User hints */
+int regulatory_hint_user(const char *alpha2)
+{
+	struct regulatory_request *request;
+
+	BUG_ON(!alpha2);
+
+	request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	request->wiphy_idx = WIPHY_IDX_STALE;
+	request->alpha2[0] = alpha2[0];
+	request->alpha2[1] = alpha2[1];
+	request->initiator = NL80211_REGDOM_SET_BY_USER;
+
+	queue_regulatory_request(request);
+
+	return 0;
+}
+
+/* Driver hints */
+int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
+{
+	struct regulatory_request *request;
+
+	BUG_ON(!alpha2);
+	BUG_ON(!wiphy);
+
+	request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	request->wiphy_idx = get_wiphy_idx(wiphy);
+
+	/* Must have registered wiphy first */
+	BUG_ON(!wiphy_idx_valid(request->wiphy_idx));
+
+	request->alpha2[0] = alpha2[0];
+	request->alpha2[1] = alpha2[1];
+	request->initiator = NL80211_REGDOM_SET_BY_DRIVER;
+
+	queue_regulatory_request(request);
+
+	return 0;
+}
+EXPORT_SYMBOL(regulatory_hint);
+
+/*
+ * We hold wdev_lock() here so we cannot hold cfg80211_mutex() and
+ * therefore cannot iterate over the rdev list here.
+ */
+void regulatory_hint_11d(struct wiphy *wiphy,
+			 enum ieee80211_band band,
+			 u8 *country_ie,
+			 u8 country_ie_len)
+{
+	char alpha2[2];
+	enum environment_cap env = ENVIRON_ANY;
+	struct regulatory_request *request;
+
+	mutex_lock(&reg_mutex);
+
+	if (unlikely(!last_request))
+		goto out;
+
+	/* IE len must be evenly divisible by 2 */
+	if (country_ie_len & 0x01)
+		goto out;
+
+	if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
+		goto out;
+
+	alpha2[0] = country_ie[0];
+	alpha2[1] = country_ie[1];
+
+	if (country_ie[2] == 'I')
+		env = ENVIRON_INDOOR;
+	else if (country_ie[2] == 'O')
+		env = ENVIRON_OUTDOOR;
+
+	/*
+	 * We will run this only upon a successful connection on cfg80211.
+	 * We leave conflict resolution to the workqueue, where we can hold
+	 * cfg80211_mutex.
+	 */
+	if (likely(last_request->initiator ==
+	    NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+	    wiphy_idx_valid(last_request->wiphy_idx)))
+		goto out;
+
+	request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
+	if (!request)
+		goto out;
+
+	request->wiphy_idx = get_wiphy_idx(wiphy);
+	request->alpha2[0] = alpha2[0];
+	request->alpha2[1] = alpha2[1];
+	request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
+	request->country_ie_env = env;
+
+	mutex_unlock(&reg_mutex);
+
+	queue_regulatory_request(request);
+
+	return;
+
+out:
+	mutex_unlock(&reg_mutex);
+}
+
+static void restore_alpha2(char *alpha2, bool reset_user)
+{
+	/* indicates there is no alpha2 to consider for restoration */
+	alpha2[0] = '9';
+	alpha2[1] = '7';
+
+	/* The user setting has precedence over the module parameter */
+	if (is_user_regdom_saved()) {
+		/* Unless we're asked to ignore it and reset it */
+		if (reset_user) {
+			REG_DBG_PRINT("Restoring regulatory settings "
+			       "including user preference\n");
+			user_alpha2[0] = '9';
+			user_alpha2[1] = '7';
+
+			/*
+			 * If we're ignoring user settings, we still need to
+			 * check the module parameter to ensure we put things
+			 * back as they were for a full restore.
+			 */
+			if (!is_world_regdom(ieee80211_regdom)) {
+				REG_DBG_PRINT("Keeping preference on "
+				       "module parameter ieee80211_regdom: %c%c\n",
+				       ieee80211_regdom[0],
+				       ieee80211_regdom[1]);
+				alpha2[0] = ieee80211_regdom[0];
+				alpha2[1] = ieee80211_regdom[1];
+			}
+		} else {
+			REG_DBG_PRINT("Restoring regulatory settings "
+			       "while preserving user preference for: %c%c\n",
+			       user_alpha2[0],
+			       user_alpha2[1]);
+			alpha2[0] = user_alpha2[0];
+			alpha2[1] = user_alpha2[1];
+		}
+	} else if (!is_world_regdom(ieee80211_regdom)) {
+		REG_DBG_PRINT("Keeping preference on "
+		       "module parameter ieee80211_regdom: %c%c\n",
+		       ieee80211_regdom[0],
+		       ieee80211_regdom[1]);
+		alpha2[0] = ieee80211_regdom[0];
+		alpha2[1] = ieee80211_regdom[1];
+	} else
+		REG_DBG_PRINT("Restoring regulatory settings\n");
+}
+
+/*
+ * Restoring regulatory settings involves ignoring any
+ * possibly stale country IE information and user regulatory
+ * settings if so desired, this includes any beacon hints
+ * learned as we could have traveled outside to another country
+ * after disconnection. To restore regulatory settings we do
+ * exactly what we did at bootup:
+ *
+ *   - send a core regulatory hint
+ *   - send a user regulatory hint if applicable
+ *
+ * Device drivers that send a regulatory hint for a specific country
+ * keep their own regulatory domain on wiphy->regd so that does
+ * not need to be remembered.
+ */
+static void restore_regulatory_settings(bool reset_user)
+{
+	char alpha2[2];
+	char world_alpha2[2];
+	struct reg_beacon *reg_beacon, *btmp;
+	struct regulatory_request *reg_request, *tmp;
+	LIST_HEAD(tmp_reg_req_list);
+
+	mutex_lock(&cfg80211_mutex);
+	mutex_lock(&reg_mutex);
+
+	reset_regdomains(true);
+	restore_alpha2(alpha2, reset_user);
+
+	/*
+	 * If there are any pending user requests we simply
+	 * stash them to a temporary pending queue and
+	 * add them after we've restored regulatory
+	 * settings.
+	 */
+	spin_lock(&reg_requests_lock);
+	if (!list_empty(&reg_requests_list)) {
+		list_for_each_entry_safe(reg_request, tmp,
+					 &reg_requests_list, list) {
+			if (reg_request->initiator !=
+			    NL80211_REGDOM_SET_BY_USER)
+				continue;
+			list_del(&reg_request->list);
+			list_add_tail(&reg_request->list, &tmp_reg_req_list);
+		}
+	}
+	spin_unlock(&reg_requests_lock);
+
+	/* Clear beacon hints */
+	spin_lock_bh(&reg_pending_beacons_lock);
+	if (!list_empty(&reg_pending_beacons)) {
+		list_for_each_entry_safe(reg_beacon, btmp,
+					 &reg_pending_beacons, list) {
+			list_del(&reg_beacon->list);
+			kfree(reg_beacon);
+		}
+	}
+	spin_unlock_bh(&reg_pending_beacons_lock);
+
+	if (!list_empty(&reg_beacon_list)) {
+		list_for_each_entry_safe(reg_beacon, btmp,
+					 &reg_beacon_list, list) {
+			list_del(&reg_beacon->list);
+			kfree(reg_beacon);
+		}
+	}
+
+	/* First restore to the basic regulatory settings */
+	cfg80211_regdomain = cfg80211_world_regdom;
+	world_alpha2[0] = cfg80211_regdomain->alpha2[0];
+	world_alpha2[1] = cfg80211_regdomain->alpha2[1];
+
+	mutex_unlock(&reg_mutex);
+	mutex_unlock(&cfg80211_mutex);
+
+	regulatory_hint_core(world_alpha2);
+
+	/*
+	 * Re-hint what restore_alpha2() picked: the last user setting or
+	 * the ieee80211_regdom module parameter. user_alpha2 itself may
+	 * hold the "97" placeholder here, so it must not be passed.
+	 */
+	if (is_an_alpha2(alpha2))
+		regulatory_hint_user(alpha2);
+
+	if (list_empty(&tmp_reg_req_list))
+		return;
+
+	mutex_lock(&cfg80211_mutex);
+	mutex_lock(&reg_mutex);
+
+	spin_lock(&reg_requests_lock);
+	list_for_each_entry_safe(reg_request, tmp, &tmp_reg_req_list, list) {
+		REG_DBG_PRINT("Adding request for country %c%c back "
+			      "into the queue\n",
+			      reg_request->alpha2[0],
+			      reg_request->alpha2[1]);
+		list_del(&reg_request->list);
+		list_add_tail(&reg_request->list, &reg_requests_list);
+	}
+	spin_unlock(&reg_requests_lock);
+
+	mutex_unlock(&reg_mutex);
+	mutex_unlock(&cfg80211_mutex);
+
+	REG_DBG_PRINT("Kicking the queue\n");
+
+	schedule_work(&reg_work);
+}
+
+void regulatory_hint_disconnect(void)
+{
+	REG_DBG_PRINT("All devices are disconnected, going to "
+		      "restore regulatory settings\n");
+	restore_regulatory_settings(false);
+}
+
+/* Tells whether @freq is what ieee80211_channel_to_frequency() yields for
+ * channel 12, 13 or 14 of the 2 GHz band. */
+static bool freq_is_chan_12_13_14(u16 freq)
+{
+	return freq == ieee80211_channel_to_frequency(12, IEEE80211_BAND_2GHZ) ||
+	       freq == ieee80211_channel_to_frequency(13, IEEE80211_BAND_2GHZ) ||
+	       freq == ieee80211_channel_to_frequency(14, IEEE80211_BAND_2GHZ);
+}
+
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+				 struct ieee80211_channel *beacon_chan,
+				 gfp_t gfp)
+{
+	struct reg_beacon *reg_beacon;
+
+	if (likely((beacon_chan->beacon_found ||
+	    (beacon_chan->flags & IEEE80211_CHAN_RADAR) ||
+	    (beacon_chan->band == IEEE80211_BAND_2GHZ &&
+	     !freq_is_chan_12_13_14(beacon_chan->center_freq)))))
+		return 0;
+
+	reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp);
+	if (!reg_beacon)
+		return -ENOMEM;
+
+	REG_DBG_PRINT("Found new beacon on "
+		      "frequency: %d MHz (Ch %d) on %s\n",
+		      beacon_chan->center_freq,
+		      ieee80211_frequency_to_channel(beacon_chan->center_freq),
+		      wiphy_name(wiphy));
+
+	memcpy(&reg_beacon->chan, beacon_chan,
+		sizeof(struct ieee80211_channel));
+
+
+	/*
+	 * Since we can be called from BH or non-BH context
+	 * we must use spin_lock_bh()
+	 */
+	spin_lock_bh(&reg_pending_beacons_lock);
+	list_add_tail(&reg_beacon->list, &reg_pending_beacons);
+	spin_unlock_bh(&reg_pending_beacons_lock);
+
+	schedule_work(&reg_work);
+
+	return 0;
+}
+
+static void print_rd_rules(const struct ieee80211_regdomain *rd)
+{
+	unsigned int i;
+	const struct ieee80211_reg_rule *reg_rule = NULL;
+	const struct ieee80211_freq_range *freq_range = NULL;
+	const struct ieee80211_power_rule *power_rule = NULL;
+
+	pr_info("    (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp)\n");
+
+	for (i = 0; i < rd->n_reg_rules; i++) {
+		reg_rule = &rd->reg_rules[i];
+		freq_range = &reg_rule->freq_range;
+		power_rule = &reg_rule->power_rule;
+
+		/*
+		 * There may not be documentation for max antenna gain
+		 * in certain regions
+		 */
+		if (power_rule->max_antenna_gain)
+			pr_info("    (%d KHz - %d KHz @ %d KHz), (%d mBi, %d mBm)\n",
+				freq_range->start_freq_khz,
+				freq_range->end_freq_khz,
+				freq_range->max_bandwidth_khz,
+				power_rule->max_antenna_gain,
+				power_rule->max_eirp);
+		else
+			pr_info("    (%d KHz - %d KHz @ %d KHz), (N/A, %d mBm)\n",
+				freq_range->start_freq_khz,
+				freq_range->end_freq_khz,
+				freq_range->max_bandwidth_khz,
+				power_rule->max_eirp);
+	}
+}
+
+static void print_regdomain(const struct ieee80211_regdomain *rd)
+{
+
+	if (is_intersected_alpha2(rd->alpha2)) {
+
+		if (last_request->initiator ==
+		    NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+			struct cfg80211_registered_device *rdev;
+			rdev = cfg80211_rdev_by_wiphy_idx(
+				last_request->wiphy_idx);
+			if (rdev) {
+				pr_info("Current regulatory domain updated by AP to: %c%c\n",
+					rdev->country_ie_alpha2[0],
+					rdev->country_ie_alpha2[1]);
+			} else
+				pr_info("Current regulatory domain intersected:\n");
+		} else
+			pr_info("Current regulatory domain intersected:\n");
+	} else if (is_world_regdom(rd->alpha2))
+		pr_info("World regulatory domain updated:\n");
+	else {
+		if (is_unknown_alpha2(rd->alpha2))
+			pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n");
+		else
+			pr_info("Regulatory domain changed to country: %c%c\n",
+				rd->alpha2[0], rd->alpha2[1]);
+	}
+	print_rd_rules(rd);
+}
+
+static void print_regdomain_info(const struct ieee80211_regdomain *rd)
+{
+	pr_info("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]);
+	print_rd_rules(rd);
+}
+
+/* Takes ownership of rd only if it doesn't fail */
+static int __set_regdom(const struct ieee80211_regdomain *rd)
+{
+	const struct ieee80211_regdomain *intersected_rd = NULL;
+	struct cfg80211_registered_device *rdev = NULL;
+	struct wiphy *request_wiphy;
+	/* Some basic sanity checks first */
+
+	if (is_world_regdom(rd->alpha2)) {
+		if (WARN_ON(!reg_is_valid_request(rd->alpha2)))
+			return -EINVAL;
+		update_world_regdomain(rd);
+		return 0;
+	}
+
+	if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) &&
+			!is_unknown_alpha2(rd->alpha2))
+		return -EINVAL;
+
+	if (!last_request)
+		return -EINVAL;
+
+	/*
+	 * Lets only bother proceeding on the same alpha2 if the current
+	 * rd is non static (it means CRDA was present and was used last)
+	 * and the pending request came in from a country IE
+	 */
+	if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+		/*
+		 * If someone else asked us to change the rd lets only bother
+		 * checking if the alpha2 changes if CRDA was already called
+		 */
+		if (!regdom_changes(rd->alpha2))
+			return -EINVAL;
+	}
+
+	/*
+	 * Now lets set the regulatory domain, update all driver channels
+	 * and finally inform them of what we have done, in case they want
+	 * to review or adjust their own settings based on their own
+	 * internal EEPROM data
+	 */
+
+	if (WARN_ON(!reg_is_valid_request(rd->alpha2)))
+		return -EINVAL;
+
+	if (!is_valid_rd(rd)) {
+		pr_err("Invalid regulatory domain detected:\n");
+		print_regdomain_info(rd);
+		return -EINVAL;
+	}
+
+	request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+	if (!request_wiphy &&
+	    (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER ||
+	     last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)) {
+		schedule_delayed_work(&reg_timeout, 0);
+		return -ENODEV;
+	}
+
+	if (!last_request->intersect) {
+		int r;
+
+		if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) {
+			reset_regdomains(false);
+			cfg80211_regdomain = rd;
+			return 0;
+		}
+
+		/*
+		 * For a driver hint, lets copy the regulatory domain the
+		 * driver wanted to the wiphy to deal with conflicts
+		 */
+
+		/*
+		 * Userspace could have sent two replies with only
+		 * one kernel request.
+		 */
+		if (request_wiphy->regd)
+			return -EALREADY;
+
+		r = reg_copy_regd(&request_wiphy->regd, rd);
+		if (r)
+			return r;
+
+		reset_regdomains(false);
+		cfg80211_regdomain = rd;
+		return 0;
+	}
+
+	/* Intersection requires a bit more work */
+
+	if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
+
+		intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
+		if (!intersected_rd)
+			return -EINVAL;
+
+		/*
+		 * We can trash what CRDA provided now.
+		 * However if a driver requested this specific regulatory
+		 * domain we keep it for its private use
+		 */
+		if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER)
+			request_wiphy->regd = rd;
+		else
+			kfree(rd);
+
+		rd = NULL;
+
+		reset_regdomains(false);
+		cfg80211_regdomain = intersected_rd;
+
+		return 0;
+	}
+
+	if (!intersected_rd)
+		return -EINVAL;
+
+	rdev = wiphy_to_dev(request_wiphy);
+
+	rdev->country_ie_alpha2[0] = rd->alpha2[0];
+	rdev->country_ie_alpha2[1] = rd->alpha2[1];
+	rdev->env = last_request->country_ie_env;
+
+	BUG_ON(intersected_rd == rd);
+
+	kfree(rd);
+	rd = NULL;
+
+	reset_regdomains(false);
+	cfg80211_regdomain = intersected_rd;
+
+	return 0;
+}
+
+
+/*
+ * Use this call to set the current regulatory domain. Conflicts with
+ * multiple drivers can be ironed out later. Caller must've already
+ * kmalloc'd the rd structure. Caller must hold cfg80211_mutex
+ */
+int set_regdom(const struct ieee80211_regdomain *rd)
+{
+	int r;
+
+	assert_cfg80211_lock();
+
+	mutex_lock(&reg_mutex);
+
+	/* Note that this doesn't update the wiphys, this is done below */
+	r = __set_regdom(rd);
+	if (r) {
+		kfree(rd);
+		mutex_unlock(&reg_mutex);
+		return r;
+	}
+
+	/* This would make this whole thing pointless */
+	if (!last_request->intersect)
+		BUG_ON(rd != cfg80211_regdomain);
+
+	/* update all wiphys now with the new established regulatory domain */
+	update_all_wiphy_regulatory(last_request->initiator);
+
+	print_regdomain(cfg80211_regdomain);
+
+	nl80211_send_reg_change_event(last_request);
+
+	reg_set_request_processed();
+
+	mutex_unlock(&reg_mutex);
+
+	return r;
+}
+
+#ifdef CONFIG_HOTPLUG
+int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	if (last_request && !last_request->processed) {
+		if (add_uevent_var(env, "COUNTRY=%c%c",
+				   last_request->alpha2[0],
+				   last_request->alpha2[1]))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+#else
+int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_HOTPLUG */
+
+/*
+ * Frees @wiphy's private regulatory domain and marks last_request's wiphy
+ * index stale if that request came from @wiphy. Needs cfg80211_mutex held.
+ */
+void reg_device_remove(struct wiphy *wiphy)
+{
+	struct wiphy *owner = NULL;
+
+	assert_cfg80211_lock();
+	mutex_lock(&reg_mutex);
+
+	kfree(wiphy->regd);
+
+	if (last_request)
+		owner = wiphy_idx_to_wiphy(last_request->wiphy_idx);
+
+	if (owner && owner == wiphy) {
+		last_request->wiphy_idx = WIPHY_IDX_STALE;
+		last_request->country_ie_env = ENVIRON_ANY;
+	}
+	mutex_unlock(&reg_mutex);
+}
+
+static void reg_timeout_work(struct work_struct *work)
+{
+	REG_DBG_PRINT("Timeout while waiting for CRDA to reply, "
+		      "restoring regulatory settings");
+	restore_regulatory_settings(true);
+}
+
+/* Register the "regulatory" platform device and queue the initial hints */
+int __init regulatory_init(void)
+{
+	int err = 0;
+
+	reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
+	if (IS_ERR(reg_pdev))
+		return PTR_ERR(reg_pdev);
+
+	reg_pdev->dev.type = &reg_device_type;
+
+	spin_lock_init(&reg_requests_lock);
+	spin_lock_init(&reg_pending_beacons_lock);
+
+	reg_regdb_size_check();
+	cfg80211_regdomain = cfg80211_world_regdom;
+
+	user_alpha2[0] = '9';
+	user_alpha2[1] = '7';
+
+	/* We always try to get an update for the static regdomain */
+	err = regulatory_hint_core(cfg80211_regdomain->alpha2);
+	if (err) {
+		if (err == -ENOMEM) {
+			platform_device_unregister(reg_pdev);
+			return err;
+		}
+		/*
+		 * N.B. kobject_uevent_env() can also fail during
+		 * netlink_broadcast() or, at early boot, call_usermodehelper().
+		 * Unlike -ENOMEM above, treat those errors as non-fatal.
+		 */
+		pr_err("kobject_uevent_env() was unable to call CRDA during init\n");
+#ifdef CONFIG_CFG80211_REG_DEBUG
+		/* We want to find out exactly why when debugging */
+		WARN_ON(err);
+#endif
+	}
+
+	/*
+	 * Finally, if the user set the module parameter treat it
+	 * as a user hint.
+	 */
+	if (!is_world_regdom(ieee80211_regdom))
+		regulatory_hint_user(ieee80211_regdom);
+
+	return 0;
+}
+
+void /* __init_or_exit */ regulatory_exit(void)
+{
+	struct regulatory_request *reg_request, *tmp;
+	struct reg_beacon *reg_beacon, *btmp;
+
+	cancel_work_sync(&reg_work);
+	cancel_delayed_work_sync(&reg_timeout);
+
+	mutex_lock(&cfg80211_mutex);
+	mutex_lock(&reg_mutex);
+
+	reset_regdomains(true);
+
+	dev_set_uevent_suppress(&reg_pdev->dev, true);
+
+	platform_device_unregister(reg_pdev);
+
+	spin_lock_bh(&reg_pending_beacons_lock);
+	if (!list_empty(&reg_pending_beacons)) {
+		list_for_each_entry_safe(reg_beacon, btmp,
+					 &reg_pending_beacons, list) {
+			list_del(&reg_beacon->list);
+			kfree(reg_beacon);
+		}
+	}
+	spin_unlock_bh(&reg_pending_beacons_lock);
+
+	if (!list_empty(&reg_beacon_list)) {
+		list_for_each_entry_safe(reg_beacon, btmp,
+					 &reg_beacon_list, list) {
+			list_del(&reg_beacon->list);
+			kfree(reg_beacon);
+		}
+	}
+
+	spin_lock(&reg_requests_lock);
+	if (!list_empty(&reg_requests_list)) {
+		list_for_each_entry_safe(reg_request, tmp,
+					 &reg_requests_list, list) {
+			list_del(&reg_request->list);
+			kfree(reg_request);
+		}
+	}
+	spin_unlock(&reg_requests_lock);
+
+	mutex_unlock(&reg_mutex);
+	mutex_unlock(&cfg80211_mutex);
+}
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
new file mode 100644
index 00000000..b67d1c3a
--- /dev/null
+++ b/net/wireless/reg.h
@@ -0,0 +1,85 @@
+#ifndef __NET_WIRELESS_REG_H
+#define __NET_WIRELESS_REG_H
+
+extern const struct ieee80211_regdomain *cfg80211_regdomain;
+
+bool is_world_regdom(const char *alpha2);
+bool reg_is_valid_request(const char *alpha2);
+
+int regulatory_hint_user(const char *alpha2);
+
+int reg_device_uevent(struct device *dev, struct kobj_uevent_env *env);
+void reg_device_remove(struct wiphy *wiphy);
+
+int __init regulatory_init(void);
+void regulatory_exit(void);
+
+int set_regdom(const struct ieee80211_regdomain *rd);
+
+/**
+ * regulatory_hint_found_beacon - hints a beacon was found on a channel
+ * @wiphy: the wireless device where the beacon was found on
+ * @beacon_chan: the channel on which the beacon was found on
+ * @gfp: context flags
+ *
+ * This informs the wireless core that a beacon from an AP was found on
+ * the channel provided. This allows the wireless core to make educated
+ * guesses on regulatory to help with world roaming. This is only used for
+ * world roaming -- when we do not know our current location. This is
+ * only useful on channels 12, 13 and 14 on the 2 GHz band as channels
+ * 1-11 are already enabled by the world regulatory domain; and on
+ * non-radar 5 GHz channels.
+ *
+ * Drivers do not need to call this, cfg80211 will do it after a scan
+ * on a newly found BSS. If you cannot make use of this feature you can
+ * set the wiphy->disable_beacon_hints to true.
+ */
+int regulatory_hint_found_beacon(struct wiphy *wiphy,
+					struct ieee80211_channel *beacon_chan,
+					gfp_t gfp);
+
+/**
+ * regulatory_hint_11d - hints a country IE as a regulatory domain
+ * @wiphy: the wireless device giving the hint (used only for reporting
+ *	conflicts)
+ * @band: the band on which the country IE was received on. This determines
+ *	the band we'll process the country IE channel triplets for.
+ * @country_ie: pointer to the country IE
+ * @country_ie_len: length of the country IE
+ *
+ * We will intersect the rd with what CRDA tells us should apply
+ * for the alpha2 this country IE belongs to, this prevents APs from
+ * sending us incorrect or outdated information against a country.
+ *
+ * The AP is expected to provide Country IE channel triplets for the
+ * band it is on. It is technically possible for APs to send channel
+ * country IE triplets even for channels outside of the band they are
+ * in but for that they would have to use the regulatory extension
+ * in combination with a triplet but this behaviour is currently
+ * not observed. For this reason if a triplet is seen with channel
+ * information for a band the BSS is not present in it will be ignored.
+ */
+void regulatory_hint_11d(struct wiphy *wiphy,
+			 enum ieee80211_band band,
+			 u8 *country_ie,
+			 u8 country_ie_len);
+
+/**
+ * regulatory_hint_disconnect - informs all devices have been disconnected
+ *
+ * Regulatory rules can be enhanced further upon scanning and upon
+ * connection to an AP. These rules become stale if we disconnect
+ * and go to another country, whether or not we suspend and resume.
+ * If we suspend, go to another country and resume we'll automatically
+ * get disconnected shortly after resuming and things will be reset as well.
+ * This routine is a helper to restore regulatory settings to how they were
+ * prior to our first connect attempt. This includes ignoring country IE and
+ * beacon regulatory hints. The ieee80211_regdom module parameter will always
+ * be respected but if a user had set the regulatory domain that will take
+ * precedence.
+ *
+ * Must be called from process context.
+ */
+void regulatory_hint_disconnect(void);
+
+#endif  /* __NET_WIRELESS_REG_H */
diff --git a/net/wireless/regdb.h b/net/wireless/regdb.h
new file mode 100644
index 00000000..818222c9
--- /dev/null
+++ b/net/wireless/regdb.h
@@ -0,0 +1,7 @@
+#ifndef __REGDB_H__
+#define __REGDB_H__
+
+extern const struct ieee80211_regdomain *reg_regdb[];
+extern int reg_regdb_size;
+
+#endif /* __REGDB_H__ */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
new file mode 100644
index 00000000..cbbc9273
--- /dev/null
+++ b/net/wireless/scan.c
@@ -0,0 +1,1197 @@
+/*
+ * cfg80211 scan result handling
+ *
+ * Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/wireless.h>
+#include <linux/nl80211.h>
+#include <linux/etherdevice.h>
+#include <net/arp.h>
+#include <net/cfg80211.h>
+#include <net/iw_handler.h>
+#include "core.h"
+#include "nl80211.h"
+#include "wext-compat.h"
+
+#define IEEE80211_SCAN_RESULT_EXPIRE	(3 * HZ)
+
+/*
+ * Finish the scan recorded in rdev->scan_req: notify the SME, send the
+ * nl80211 "scan done" or "scan aborted" event (plus, for a scan that was
+ * not aborted and with CONFIG_CFG80211_WEXT, an SIOCGIWSCAN wext event),
+ * drop the netdev reference and clear rdev->scan_req.
+ *
+ * @rdev: device whose scan is being completed; asserted to be locked
+ * @leak: if true, the request structure is deliberately not freed
+ *
+ * Returns silently if no scan request is pending.
+ */
+void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool leak)
+{
+	struct cfg80211_scan_request *request;
+	struct net_device *dev;
+#ifdef CONFIG_CFG80211_WEXT
+	union iwreq_data wrqu;
+#endif
+
+	ASSERT_RDEV_LOCK(rdev);
+
+	request = rdev->scan_req;
+
+	if (!request)
+		return;
+
+	dev = request->dev;
+
+	/*
+	 * This must be before sending the other events!
+	 * Otherwise, wpa_supplicant gets completely confused with
+	 * wext events.
+	 */
+	cfg80211_sme_scan_done(dev);
+
+	if (request->aborted)
+		nl80211_send_scan_aborted(rdev, dev);
+	else
+		nl80211_send_scan_done(rdev, dev);
+
+#ifdef CONFIG_CFG80211_WEXT
+	/* wext only gets an (empty) scan event when results are usable */
+	if (!request->aborted) {
+		memset(&wrqu, 0, sizeof(wrqu));
+
+		wireless_send_event(dev, SIOCGIWSCAN, &wrqu, NULL);
+	}
+#endif
+
+	dev_put(dev);
+
+	rdev->scan_req = NULL;
+
+	/*
+	 * OK. If this is invoked with "leak" then we can't
+	 * free this ... but we've cleaned it up anyway. The
+	 * driver failed to call the scan_done callback, so
+	 * all bets are off, it might still be trying to use
+	 * the scan request or not ... if it accesses the dev
+	 * in there (it shouldn't anyway) then it may crash.
+	 */
+	if (!leak)
+		kfree(request);
+}
+
+/*
+ * Work handler for scan_done_wk: completes the pending scan (freeing the
+ * request) with the rdev lock held.
+ */
+void __cfg80211_scan_done(struct work_struct *wk)
+{
+	struct cfg80211_registered_device *rdev =
+		container_of(wk, struct cfg80211_registered_device,
+			     scan_done_wk);
+
+	cfg80211_lock_rdev(rdev);
+	___cfg80211_scan_done(rdev, false);
+	cfg80211_unlock_rdev(rdev);
+}
+
+/*
+ * Exported scan-completion entry point. Records whether the scan was
+ * aborted and defers the actual completion work to scan_done_wk on
+ * cfg80211_wq. Warns if @request is not the device's current scan request.
+ */
+void cfg80211_scan_done(struct cfg80211_scan_request *request, bool aborted)
+{
+	struct cfg80211_registered_device *rdev =
+		wiphy_to_dev(request->wiphy);
+
+	WARN_ON(rdev->scan_req != request);
+
+	request->aborted = aborted;
+	queue_work(cfg80211_wq, &rdev->scan_done_wk);
+}
+EXPORT_SYMBOL(cfg80211_scan_done);
+
+/*
+ * Work handler for sched_scan_results_wk: sends the scheduled-scan results
+ * notification over nl80211, as long as a scheduled scan request still
+ * exists. Runs under sched_scan_mtx.
+ */
+void __cfg80211_sched_scan_results(struct work_struct *wk)
+{
+	struct cfg80211_registered_device *rdev =
+		container_of(wk, struct cfg80211_registered_device,
+			     sched_scan_results_wk);
+
+	mutex_lock(&rdev->sched_scan_mtx);
+
+	/* the request is already gone if the scan is stopping */
+	if (!rdev->sched_scan_req)
+		goto unlock;
+
+	nl80211_send_sched_scan_results(rdev, rdev->sched_scan_req->dev);
+
+ unlock:
+	mutex_unlock(&rdev->sched_scan_mtx);
+}
+
+/*
+ * Exported notification that scheduled-scan results are available for
+ * @wiphy. Queues sched_scan_results_wk; ignored when no scheduled scan
+ * request exists.
+ */
+void cfg80211_sched_scan_results(struct wiphy *wiphy)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+
+	/* nothing to report unless a scheduled scan is active */
+	if (!rdev->sched_scan_req)
+		return;
+
+	queue_work(cfg80211_wq, &rdev->sched_scan_results_wk);
+}
+EXPORT_SYMBOL(cfg80211_sched_scan_results);
+
+/*
+ * Exported notification that the scheduled scan on @wiphy has stopped.
+ * Tears the request down under sched_scan_mtx with driver_initiated set,
+ * so the driver's sched_scan_stop op is not called.
+ */
+void cfg80211_sched_scan_stopped(struct wiphy *wiphy)
+{
+	struct cfg80211_registered_device *regdev;
+
+	regdev = wiphy_to_dev(wiphy);
+
+	mutex_lock(&regdev->sched_scan_mtx);
+	__cfg80211_stop_sched_scan(regdev, true);
+	mutex_unlock(&regdev->sched_scan_mtx);
+}
+EXPORT_SYMBOL(cfg80211_sched_scan_stopped);
+
+/*
+ * Stop the scheduled scan on @rdev, if there is one, and notify userspace
+ * with NL80211_CMD_SCHED_SCAN_STOPPED.
+ *
+ * @rdev: device to act on; rdev->sched_scan_mtx must be held
+ * @driver_initiated: if true, the driver's sched_scan_stop op is not called
+ *
+ * Returns 0 on success (including when no scheduled scan exists), or the
+ * error returned by the driver's sched_scan_stop op, in which case the
+ * request is left in place.
+ */
+int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
+			       bool driver_initiated)
+{
+	struct net_device *dev;
+
+	lockdep_assert_held(&rdev->sched_scan_mtx);
+
+	if (!rdev->sched_scan_req)
+		return 0;
+
+	dev = rdev->sched_scan_req->dev;
+
+	if (!driver_initiated) {
+		int err = rdev->ops->sched_scan_stop(&rdev->wiphy, dev);
+
+		if (err)
+			return err;
+	}
+
+	nl80211_send_sched_scan(rdev, dev, NL80211_CMD_SCHED_SCAN_STOPPED);
+
+	kfree(rdev->sched_scan_req);
+	rdev->sched_scan_req = NULL;
+
+	/*
+	 * Reaching this point means the driver op either succeeded or was
+	 * skipped; return an explicit 0 rather than a variable that is
+	 * never assigned on the driver-initiated path.
+	 */
+	return 0;
+}
+
+/*
+ * kref release callback for a BSS entry: runs the optional free_priv
+ * hook, frees IE buffers that were allocated separately from the entry,
+ * and frees the entry itself. It is a bug for the entry to still be held.
+ */
+static void bss_release(struct kref *ref)
+{
+	struct cfg80211_internal_bss *entry =
+		container_of(ref, struct cfg80211_internal_bss, ref);
+
+	if (entry->pub.free_priv)
+		entry->pub.free_priv(&entry->pub);
+
+	/* only buffers flagged as separately allocated are freed here */
+	if (entry->beacon_ies_allocated)
+		kfree(entry->pub.beacon_ies);
+	if (entry->proberesp_ies_allocated)
+		kfree(entry->pub.proberesp_ies);
+
+	BUG_ON(atomic_read(&entry->hold));
+
+	kfree(entry);
+}
+
+/*
+ * Make every entry on the BSS list appear @age_secs seconds older by
+ * moving its timestamp back by the equivalent number of jiffies.
+ *
+ * must hold dev->bss_lock!
+ */
+void cfg80211_bss_age(struct cfg80211_registered_device *dev,
+		      unsigned long age_secs)
+{
+	unsigned long delta = msecs_to_jiffies(age_secs * MSEC_PER_SEC);
+	struct cfg80211_internal_bss *entry;
+
+	list_for_each_entry(entry, &dev->bss_list, list)
+		entry->ts = entry->ts - delta;
+}
+
+/*
+ * Remove @bss from the device's BSS list and rb-tree and drop one
+ * reference to it; bss_release() runs if that was the last reference.
+ * list_del_init() leaves the list node empty afterwards.
+ *
+ * must hold dev->bss_lock!
+ */
+static void __cfg80211_unlink_bss(struct cfg80211_registered_device *dev,
+				  struct cfg80211_internal_bss *bss)
+{
+	list_del_init(&bss->list);
+	rb_erase(&bss->rbn, &dev->bss_tree);
+	kref_put(&bss->ref, bss_release);
+}
+
+/*
+ * Unlink every BSS entry that is not held and whose timestamp is more
+ * than IEEE80211_SCAN_RESULT_EXPIRE in the past. The BSS generation
+ * counter is bumped if at least one entry was removed.
+ *
+ * must hold dev->bss_lock!
+ */
+void cfg80211_bss_expire(struct cfg80211_registered_device *dev)
+{
+	struct cfg80211_internal_bss *cur, *next;
+	bool removed_any = false;
+
+	list_for_each_entry_safe(cur, next, &dev->bss_list, list) {
+		/* held entries and entries that are still fresh stay */
+		if (atomic_read(&cur->hold) ||
+		    !time_after(jiffies,
+				cur->ts + IEEE80211_SCAN_RESULT_EXPIRE))
+			continue;
+
+		__cfg80211_unlink_bss(dev, cur);
+		removed_any = true;
+	}
+
+	if (removed_any)
+		dev->bss_generation++;
+}
+
+/*
+ * Find the first information element with ID @eid in the @len bytes at
+ * @ies. Elements are laid out as one ID byte, one length byte and then
+ * that many payload bytes.
+ *
+ * Returns a pointer to the matching element's ID byte, or NULL if no
+ * element with that ID exists or the matching element does not fit
+ * completely within the buffer.
+ */
+const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
+{
+	/*
+	 * Walk as long as a complete two-byte header remains. The bound
+	 * must be ">= 2": with "> 2" a final zero-length element carrying
+	 * a different ID would fall out of the loop unchecked and be
+	 * returned as if it matched.
+	 */
+	while (len >= 2 && ies[0] != eid) {
+		len -= ies[1] + 2;
+		ies += ies[1] + 2;
+	}
+	/* ran off the end (or a bogus length made len negative) */
+	if (len < 2)
+		return NULL;
+	/* matching header found, but its payload is truncated */
+	if (len < 2 + ies[1])
+		return NULL;
+	return ies;
+}
+EXPORT_SYMBOL(cfg80211_find_ie);
+
+/*
+ * Compare the element with ID @num found in two IE buffers.
+ *
+ * Returns 0 if neither buffer has the element or both payloads are
+ * identical, -1 if only one buffer has it, otherwise the memcmp() result
+ * over the common payload length or, if that is equal, the length
+ * difference (second minus first).
+ */
+static int cmp_ies(u8 num, u8 *ies1, size_t len1, u8 *ies2, size_t len2)
+{
+	const u8 *a = cfg80211_find_ie(num, ies1, len1);
+	const u8 *b = cfg80211_find_ie(num, ies2, len2);
+	int diff;
+
+	if (!a || !b)
+		return (a || b) ? -1 : 0;
+
+	diff = memcmp(a + 2, b + 2, min(a[1], b[1]));
+	if (diff)
+		return diff;
+
+	/* common prefix is equal: the payload lengths decide */
+	return b[1] - a[1];
+}
+
+/*
+ * Check whether BSS @a matches the given BSSID and/or SSID. A NULL
+ * @bssid or @ssid acts as a wildcard for that criterion.
+ */
+static bool is_bss(struct cfg80211_bss *a,
+		   const u8 *bssid,
+		   const u8 *ssid, size_t ssid_len)
+{
+	const u8 *elem;
+
+	if (bssid && compare_ether_addr(a->bssid, bssid) != 0)
+		return false;
+
+	/* no SSID requested: nothing further to compare */
+	if (!ssid)
+		return true;
+
+	elem = cfg80211_find_ie(WLAN_EID_SSID,
+				a->information_elements,
+				a->len_information_elements);
+
+	return elem && elem[1] == ssid_len &&
+	       !memcmp(elem + 2, ssid, ssid_len);
+}
+
+/*
+ * Return true if @a passes the WLAN_CAPABILITY_IS_STA_BSS() capability
+ * test and its IEs contain both a mesh ID and a mesh configuration
+ * element.
+ */
+static bool is_mesh_bss(struct cfg80211_bss *a)
+{
+	if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability))
+		return false;
+
+	if (!cfg80211_find_ie(WLAN_EID_MESH_ID,
+			      a->information_elements,
+			      a->len_information_elements))
+		return false;
+
+	return cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
+				a->information_elements,
+				a->len_information_elements) != NULL;
+}
+
+/*
+ * Check whether BSS @a belongs to the mesh described by @meshid
+ * (@meshidlen bytes) and the mesh configuration payload @meshcfg.
+ */
+static bool is_mesh(struct cfg80211_bss *a,
+		    const u8 *meshid, size_t meshidlen,
+		    const u8 *meshcfg)
+{
+	const u8 *id_ie, *cfg_ie;
+
+	if (!WLAN_CAPABILITY_IS_STA_BSS(a->capability))
+		return false;
+
+	id_ie = cfg80211_find_ie(WLAN_EID_MESH_ID,
+				 a->information_elements,
+				 a->len_information_elements);
+	if (!id_ie || id_ie[1] != meshidlen ||
+	    memcmp(id_ie + 2, meshid, meshidlen) != 0)
+		return false;
+
+	cfg_ie = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
+				  a->information_elements,
+				  a->len_information_elements);
+	if (!cfg_ie || cfg_ie[1] != sizeof(struct ieee80211_meshconf_ie))
+		return false;
+
+	/*
+	 * The mesh capability (last two bytes of the IE) is left out of
+	 * the comparison because stations taking part in the same mesh
+	 * may differ there.
+	 */
+	return !memcmp(cfg_ie + 2, meshcfg,
+		       sizeof(struct ieee80211_meshconf_ie) - 2);
+}
+
+/*
+ * Ordering function for the BSS rb-tree. Entries are ordered by channel
+ * first; two mesh BSSes are then ordered by mesh ID and mesh config IEs,
+ * everything else by BSSID and then SSID IE.
+ *
+ * Returns 0 when the two entries describe the same BSS.
+ */
+static int cmp_bss(struct cfg80211_bss *a,
+		   struct cfg80211_bss *b)
+{
+	int diff;
+
+	if (a->channel != b->channel)
+		return b->channel->center_freq - a->channel->center_freq;
+
+	if (!is_mesh_bss(a) || !is_mesh_bss(b)) {
+		diff = memcmp(a->bssid, b->bssid, ETH_ALEN);
+		if (diff)
+			return diff;
+
+		return cmp_ies(WLAN_EID_SSID,
+			       a->information_elements,
+			       a->len_information_elements,
+			       b->information_elements,
+			       b->len_information_elements);
+	}
+
+	/* both are mesh BSSes */
+	diff = cmp_ies(WLAN_EID_MESH_ID,
+		       a->information_elements,
+		       a->len_information_elements,
+		       b->information_elements,
+		       b->len_information_elements);
+	if (diff)
+		return diff;
+
+	return cmp_ies(WLAN_EID_MESH_CONFIG,
+		       a->information_elements,
+		       a->len_information_elements,
+		       b->information_elements,
+		       b->len_information_elements);
+}
+
+/*
+ * Look up a BSS entry on @wiphy.
+ *
+ * @channel, @bssid and @ssid are optional filters (NULL matches anything);
+ * an entry must also satisfy (capability & @capa_mask) == @capa_val.
+ * Entries older than IEEE80211_SCAN_RESULT_EXPIRE are skipped unless they
+ * are held.
+ *
+ * Returns the first matching entry with a reference taken, or NULL.
+ */
+struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
+				      struct ieee80211_channel *channel,
+				      const u8 *bssid,
+				      const u8 *ssid, size_t ssid_len,
+				      u16 capa_mask, u16 capa_val)
+{
+	struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy);
+	struct cfg80211_internal_bss *bss;
+	struct cfg80211_bss *match = NULL;
+	unsigned long now = jiffies;
+
+	spin_lock_bh(&dev->bss_lock);
+
+	list_for_each_entry(bss, &dev->bss_list, list) {
+		if ((bss->pub.capability & capa_mask) != capa_val)
+			continue;
+		if (channel && bss->pub.channel != channel)
+			continue;
+		/* expired entries only count while somebody holds them */
+		if (!atomic_read(&bss->hold) &&
+		    time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE))
+			continue;
+		if (!is_bss(&bss->pub, bssid, ssid, ssid_len))
+			continue;
+
+		kref_get(&bss->ref);
+		match = &bss->pub;
+		break;
+	}
+
+	spin_unlock_bh(&dev->bss_lock);
+
+	return match;
+}
+EXPORT_SYMBOL(cfg80211_get_bss);
+
+/*
+ * Look up a mesh BSS entry on @wiphy by mesh ID and mesh configuration,
+ * optionally restricted to @channel (NULL matches any channel).
+ *
+ * Returns the first matching entry with a reference taken, or NULL.
+ */
+struct cfg80211_bss *cfg80211_get_mesh(struct wiphy *wiphy,
+				       struct ieee80211_channel *channel,
+				       const u8 *meshid, size_t meshidlen,
+				       const u8 *meshcfg)
+{
+	struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy);
+	struct cfg80211_internal_bss *bss;
+	struct cfg80211_bss *match = NULL;
+
+	spin_lock_bh(&dev->bss_lock);
+
+	list_for_each_entry(bss, &dev->bss_list, list) {
+		if (channel && bss->pub.channel != channel)
+			continue;
+		if (!is_mesh(&bss->pub, meshid, meshidlen, meshcfg))
+			continue;
+
+		kref_get(&bss->ref);
+		match = &bss->pub;
+		break;
+	}
+
+	spin_unlock_bh(&dev->bss_lock);
+
+	return match;
+}
+EXPORT_SYMBOL(cfg80211_get_mesh);
+
+
+/*
+ * Insert @bss into the device's BSS rb-tree, ordered by cmp_bss().
+ *
+ * If an entry comparing equal is already present, a warning is emitted
+ * and @bss is not inserted into the tree.
+ */
+static void rb_insert_bss(struct cfg80211_registered_device *dev,
+			  struct cfg80211_internal_bss *bss)
+{
+	struct rb_node **p = &dev->bss_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfg80211_internal_bss *tbss;
+	int cmp;
+
+	/* descend to the empty link where the new node belongs */
+	while (*p) {
+		parent = *p;
+		tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn);
+
+		cmp = cmp_bss(&bss->pub, &tbss->pub);
+
+		if (WARN_ON(!cmp)) {
+			/* will sort of leak this BSS */
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&bss->rbn, parent, p);
+	rb_insert_color(&bss->rbn, &dev->bss_tree);
+}
+
+/*
+ * Search the device's BSS rb-tree for an entry that compares equal to
+ * @res according to cmp_bss(). Returns the entry or NULL.
+ */
+static struct cfg80211_internal_bss *
+rb_find_bss(struct cfg80211_registered_device *dev,
+	    struct cfg80211_internal_bss *res)
+{
+	struct rb_node *node = dev->bss_tree.rb_node;
+
+	while (node) {
+		struct cfg80211_internal_bss *cand =
+			rb_entry(node, struct cfg80211_internal_bss, rbn);
+		int order = cmp_bss(&res->pub, &cand->pub);
+
+		if (!order)
+			return cand;
+
+		node = (order < 0) ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Merge a freshly built BSS entry into the device's BSS table.
+ *
+ * The caller's reference to @res is donated to this function. If an entry
+ * comparing equal (see cmp_bss()) already exists, its signal, TSF,
+ * capability, beacon interval, timestamp and IE buffers are refreshed
+ * from @res and @res is released; otherwise @res itself is added to the
+ * list and rb-tree.
+ *
+ * Returns the table entry with an additional reference for the caller,
+ * or NULL (after releasing @res) if @res has no channel.
+ */
+static struct cfg80211_internal_bss *
+cfg80211_bss_update(struct cfg80211_registered_device *dev,
+		    struct cfg80211_internal_bss *res)
+{
+	struct cfg80211_internal_bss *found = NULL;
+
+	/*
+	 * The reference to "res" is donated to this function.
+	 */
+
+	if (WARN_ON(!res->pub.channel)) {
+		kref_put(&res->ref, bss_release);
+		return NULL;
+	}
+
+	res->ts = jiffies;
+
+	spin_lock_bh(&dev->bss_lock);
+
+	found = rb_find_bss(dev, res);
+
+	if (found) {
+		found->pub.beacon_interval = res->pub.beacon_interval;
+		found->pub.tsf = res->pub.tsf;
+		found->pub.signal = res->pub.signal;
+		found->pub.capability = res->pub.capability;
+		found->ts = res->ts;
+
+		/* Update IEs */
+		if (res->pub.proberesp_ies) {
+			size_t used = dev->wiphy.bss_priv_size + sizeof(*res);
+			size_t ielen = res->pub.len_proberesp_ies;
+
+			/*
+			 * Reuse the buffer embedded behind the entry when
+			 * the new IEs fit; otherwise (re)allocate a separate
+			 * buffer. On allocation failure the old IEs stay.
+			 */
+			if (found->pub.proberesp_ies &&
+			    !found->proberesp_ies_allocated &&
+			    ksize(found) >= used + ielen) {
+				memcpy(found->pub.proberesp_ies,
+				       res->pub.proberesp_ies, ielen);
+				found->pub.len_proberesp_ies = ielen;
+			} else {
+				u8 *ies = found->pub.proberesp_ies;
+
+				if (found->proberesp_ies_allocated)
+					ies = krealloc(ies, ielen, GFP_ATOMIC);
+				else
+					ies = kmalloc(ielen, GFP_ATOMIC);
+
+				if (ies) {
+					memcpy(ies, res->pub.proberesp_ies,
+					       ielen);
+					found->proberesp_ies_allocated = true;
+					found->pub.proberesp_ies = ies;
+					found->pub.len_proberesp_ies = ielen;
+				}
+			}
+
+			/* Override possible earlier Beacon frame IEs */
+			found->pub.information_elements =
+				found->pub.proberesp_ies;
+			found->pub.len_information_elements =
+				found->pub.len_proberesp_ies;
+		}
+		if (res->pub.beacon_ies) {
+			size_t used = dev->wiphy.bss_priv_size + sizeof(*res);
+			size_t ielen = res->pub.len_beacon_ies;
+			/* sampled before beacon_ies may be reallocated */
+			bool information_elements_is_beacon_ies =
+				(found->pub.information_elements ==
+				 found->pub.beacon_ies);
+
+			if (found->pub.beacon_ies &&
+			    !found->beacon_ies_allocated &&
+			    ksize(found) >= used + ielen) {
+				memcpy(found->pub.beacon_ies,
+				       res->pub.beacon_ies, ielen);
+				found->pub.len_beacon_ies = ielen;
+			} else {
+				u8 *ies = found->pub.beacon_ies;
+
+				if (found->beacon_ies_allocated)
+					ies = krealloc(ies, ielen, GFP_ATOMIC);
+				else
+					ies = kmalloc(ielen, GFP_ATOMIC);
+
+				if (ies) {
+					memcpy(ies, res->pub.beacon_ies,
+					       ielen);
+					found->beacon_ies_allocated = true;
+					found->pub.beacon_ies = ies;
+					found->pub.len_beacon_ies = ielen;
+				}
+			}
+
+			/* Override IEs if they were from a beacon before */
+			if (information_elements_is_beacon_ies) {
+				found->pub.information_elements =
+					found->pub.beacon_ies;
+				found->pub.len_information_elements =
+					found->pub.len_beacon_ies;
+			}
+		}
+
+		kref_put(&res->ref, bss_release);
+	} else {
+		/* this "consumes" the reference */
+		list_add_tail(&res->list, &dev->bss_list);
+		rb_insert_bss(dev, res);
+		found = res;
+	}
+
+	dev->bss_generation++;
+
+	/*
+	 * Take the caller's reference while bss_lock is still held. Once
+	 * the lock is dropped the entry can be expired or unlinked by
+	 * someone else, which may drop the table's reference and free it.
+	 */
+	kref_get(&found->ref);
+	spin_unlock_bh(&dev->bss_lock);
+
+	return found;
+}
+
+/*
+ * Report a BSS to cfg80211 from already-parsed fields and a raw IE
+ * buffer.
+ *
+ * Allocates an entry with room for the driver's private area and a copy
+ * of @ie, merges it into the BSS table and, for ESS networks, passes a
+ * beacon hint to the regulatory code.
+ *
+ * Returns a referenced BSS, or NULL on invalid input or allocation
+ * failure. With CFG80211_SIGNAL_TYPE_UNSPEC, @signal must be in 0..100.
+ */
+struct cfg80211_bss*
+cfg80211_inform_bss(struct wiphy *wiphy,
+		    struct ieee80211_channel *channel,
+		    const u8 *bssid,
+		    u64 timestamp, u16 capability, u16 beacon_interval,
+		    const u8 *ie, size_t ielen,
+		    s32 signal, gfp_t gfp)
+{
+	struct cfg80211_internal_bss *res;
+	size_t privsz;
+	u8 *ie_buf;
+
+	if (WARN_ON(!wiphy))
+		return NULL;
+
+	privsz = wiphy->bss_priv_size;
+
+	if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
+			(signal < 0 || signal > 100)))
+		return NULL;
+
+	res = kzalloc(sizeof(*res) + privsz + ielen, gfp);
+	if (!res)
+		return NULL;
+
+	/*
+	 * The initial buffer for the IEs is allocated with the BSS entry
+	 * and is located after the private area.
+	 */
+	ie_buf = (u8 *)res + sizeof(*res) + privsz;
+	memcpy(ie_buf, ie, ielen);
+
+	memcpy(res->pub.bssid, bssid, ETH_ALEN);
+	res->pub.channel = channel;
+	res->pub.signal = signal;
+	res->pub.tsf = timestamp;
+	res->pub.beacon_interval = beacon_interval;
+	res->pub.capability = capability;
+
+	/*
+	 * Since we do not know here whether the IEs are from a Beacon or
+	 * Probe Response frame, we need to pick one of the options and only
+	 * use it with the driver that does not provide the full Beacon/Probe
+	 * Response frame. Use Beacon frame pointer to avoid indicating that
+	 * this should override the information_elements pointer should we
+	 * have received an earlier indication of Probe Response data.
+	 */
+	res->pub.beacon_ies = ie_buf;
+	res->pub.len_beacon_ies = ielen;
+	res->pub.information_elements = ie_buf;
+	res->pub.len_information_elements = ielen;
+
+	kref_init(&res->ref);
+
+	res = cfg80211_bss_update(wiphy_to_dev(wiphy), res);
+	if (!res)
+		return NULL;
+
+	if (res->pub.capability & WLAN_CAPABILITY_ESS)
+		regulatory_hint_found_beacon(wiphy, channel, gfp);
+
+	/* cfg80211_bss_update gives us a referenced result */
+	return &res->pub;
+}
+EXPORT_SYMBOL(cfg80211_inform_bss);
+
+/*
+ * Report a BSS to cfg80211 from a received Beacon or Probe Response
+ * management frame of @len bytes.
+ *
+ * The fixed fields are taken from the frame, the IEs are copied into a
+ * buffer allocated together with the entry, and the entry is merged into
+ * the BSS table. For ESS networks a beacon hint is passed to the
+ * regulatory code.
+ *
+ * Returns a referenced BSS, or NULL on invalid input (including a frame
+ * shorter than the fixed header) or allocation failure.
+ */
+struct cfg80211_bss *
+cfg80211_inform_bss_frame(struct wiphy *wiphy,
+			  struct ieee80211_channel *channel,
+			  struct ieee80211_mgmt *mgmt, size_t len,
+			  s32 signal, gfp_t gfp)
+{
+	const size_t hdrlen = offsetof(struct ieee80211_mgmt,
+				       u.probe_resp.variable);
+	struct cfg80211_internal_bss *res;
+	size_t ielen, privsz;
+	u8 *ie_buf;
+
+	if (WARN_ON(!mgmt))
+		return NULL;
+
+	if (WARN_ON(!wiphy))
+		return NULL;
+
+	if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
+	            (signal < 0 || signal > 100)))
+		return NULL;
+
+	if (WARN_ON(len < hdrlen))
+		return NULL;
+
+	ielen = len - hdrlen;
+	privsz = wiphy->bss_priv_size;
+
+	res = kzalloc(sizeof(*res) + privsz + ielen, gfp);
+	if (!res)
+		return NULL;
+
+	memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN);
+	res->pub.channel = channel;
+	res->pub.signal = signal;
+	res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
+	res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
+	res->pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
+
+	/*
+	 * The initial buffer for the IEs is allocated with the BSS entry and
+	 * is located after the private area.
+	 */
+	ie_buf = (u8 *) res + sizeof(*res) + privsz;
+
+	if (ieee80211_is_probe_resp(mgmt->frame_control)) {
+		memcpy(ie_buf, mgmt->u.probe_resp.variable, ielen);
+		res->pub.proberesp_ies = ie_buf;
+		res->pub.len_proberesp_ies = ielen;
+	} else {
+		memcpy(ie_buf, mgmt->u.beacon.variable, ielen);
+		res->pub.beacon_ies = ie_buf;
+		res->pub.len_beacon_ies = ielen;
+	}
+	res->pub.information_elements = ie_buf;
+	res->pub.len_information_elements = ielen;
+
+	kref_init(&res->ref);
+
+	res = cfg80211_bss_update(wiphy_to_dev(wiphy), res);
+	if (!res)
+		return NULL;
+
+	if (res->pub.capability & WLAN_CAPABILITY_ESS)
+		regulatory_hint_found_beacon(wiphy, channel, gfp);
+
+	/* cfg80211_bss_update gives us a referenced result */
+	return &res->pub;
+}
+EXPORT_SYMBOL(cfg80211_inform_bss_frame);
+
+/*
+ * Drop one reference to a BSS entry; the entry is released through
+ * bss_release() when the last reference goes away. NULL is accepted and
+ * ignored.
+ */
+void cfg80211_put_bss(struct cfg80211_bss *pub)
+{
+	if (pub) {
+		struct cfg80211_internal_bss *bss =
+			container_of(pub, struct cfg80211_internal_bss, pub);
+
+		kref_put(&bss->ref, bss_release);
+	}
+}
+EXPORT_SYMBOL(cfg80211_put_bss);
+
+/*
+ * Remove a BSS entry from @wiphy's BSS table and bump the BSS generation
+ * counter. Entries whose list node is already empty are left alone.
+ * Warns and returns if @pub is NULL.
+ */
+void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
+{
+	struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy);
+	struct cfg80211_internal_bss *bss;
+
+	if (WARN_ON(!pub))
+		return;
+
+	bss = container_of(pub, struct cfg80211_internal_bss, pub);
+
+	spin_lock_bh(&dev->bss_lock);
+
+	/*
+	 * __cfg80211_unlink_bss() uses list_del_init(), so an empty node
+	 * means the entry has been unlinked before.
+	 */
+	if (list_empty(&bss->list))
+		goto unlock;
+
+	__cfg80211_unlink_bss(dev, bss);
+	dev->bss_generation++;
+
+ unlock:
+	spin_unlock_bh(&dev->bss_lock);
+}
+EXPORT_SYMBOL(cfg80211_unlink_bss);
+
+#ifdef CONFIG_CFG80211_WEXT
+/*
+ * Wireless-extensions SIOCSIWSCAN handler: build a cfg80211 scan request
+ * from the (optional) struct iw_scan_req in @extra and hand it to the
+ * driver's scan op.
+ *
+ * Without a wext request, or with one that lists no channels, all enabled
+ * channels of all bands are scanned; otherwise only hardware channels
+ * whose frequency matches an entry of the request's channel list.
+ *
+ * Returns 0 on success or a negative errno: -ENETDOWN if the interface is
+ * down, -EBUSY if a scan is already pending, -ENOMEM on allocation
+ * failure, -EINVAL for a bad channel count, no usable channel or an
+ * over-long ESSID, or the driver's error.
+ */
+int cfg80211_wext_siwscan(struct net_device *dev,
+			  struct iw_request_info *info,
+			  union iwreq_data *wrqu, char *extra)
+{
+	struct cfg80211_registered_device *rdev;
+	struct wiphy *wiphy;
+	struct iw_scan_req *wreq = NULL;
+	struct cfg80211_scan_request *creq = NULL;
+	int i, err, n_channels = 0;
+	enum ieee80211_band band;
+
+	if (!netif_running(dev))
+		return -ENETDOWN;
+
+	/* only trust @extra as a scan request if it has exactly that size */
+	if (wrqu->data.length == sizeof(struct iw_scan_req))
+		wreq = (struct iw_scan_req *)extra;
+
+	rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex);
+
+	if (IS_ERR(rdev))
+		return PTR_ERR(rdev);
+
+	if (rdev->scan_req) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	wiphy = &rdev->wiphy;
+
+	/* Determine number of channels, needed to allocate creq */
+	if (wreq && wreq->num_channels) {
+		/*
+		 * num_channels is supplied by userspace and is used below
+		 * to index channel_list[]; reject counts beyond the array.
+		 */
+		if (wreq->num_channels > ARRAY_SIZE(wreq->channel_list)) {
+			err = -EINVAL;
+			goto out;
+		}
+		n_channels = wreq->num_channels;
+	} else {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++)
+			if (wiphy->bands[band])
+				n_channels += wiphy->bands[band]->n_channels;
+	}
+
+	creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) +
+		       n_channels * sizeof(void *),
+		       GFP_ATOMIC);
+	if (!creq) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	creq->wiphy = wiphy;
+	creq->dev = dev;
+	/* SSIDs come after channels */
+	creq->ssids = (void *)&creq->channels[n_channels];
+	creq->n_channels = n_channels;
+	creq->n_ssids = 1;
+
+	/* translate "Scan on frequencies" request */
+	i = 0;
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		int j;
+
+		if (!wiphy->bands[band])
+			continue;
+
+		for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
+			/* ignore disabled channels */
+			if (wiphy->bands[band]->channels[j].flags &
+						IEEE80211_CHAN_DISABLED)
+				continue;
+
+			/* If we have a wireless request structure and the
+			 * wireless request specifies frequencies, then search
+			 * for the matching hardware channel.
+			 */
+			if (wreq && wreq->num_channels) {
+				int k;
+				int wiphy_freq = wiphy->bands[band]->channels[j].center_freq;
+				for (k = 0; k < wreq->num_channels; k++) {
+					int wext_freq = cfg80211_wext_freq(wiphy, &wreq->channel_list[k]);
+					if (wext_freq == wiphy_freq)
+						goto wext_freq_found;
+				}
+				goto wext_freq_not_found;
+			}
+
+		wext_freq_found:
+			creq->channels[i] = &wiphy->bands[band]->channels[j];
+			i++;
+		wext_freq_not_found: ;
+		}
+	}
+	/* No channels found? */
+	if (!i) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Set real number of channels specified in creq->channels[] */
+	creq->n_channels = i;
+
+	/* translate "Scan for SSID" request */
+	if (wreq) {
+		if (wrqu->data.flags & IW_SCAN_THIS_ESSID) {
+			if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) {
+				err = -EINVAL;
+				goto out;
+			}
+			memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len);
+			creq->ssids[0].ssid_len = wreq->essid_len;
+		}
+		if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE)
+			creq->n_ssids = 0;
+	}
+
+	rdev->scan_req = creq;
+	err = rdev->ops->scan(wiphy, dev, creq);
+	if (err) {
+		rdev->scan_req = NULL;
+		/* creq will be freed below */
+	} else {
+		nl80211_send_scan_start(rdev, dev);
+		/* creq now owned by driver */
+		creq = NULL;
+		dev_hold(dev);
+	}
+ out:
+	/* creq is NULL here unless the request was never handed over */
+	kfree(creq);
+	cfg80211_unlock_rdev(rdev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwscan);
+
+/*
+ * Append the information elements of @bss to the wext event stream as
+ * one or more IWEVGENIE events.
+ *
+ * @current_ev is updated in place to the new write position; @end_buf
+ * marks the end of the output buffer. Nothing is emitted if the BSS has
+ * no IEs.
+ */
+static void ieee80211_scan_add_ies(struct iw_request_info *info,
+				   struct cfg80211_bss *bss,
+				   char **current_ev, char *end_buf)
+{
+	u8 *pos, *end, *next;
+	struct iw_event iwe;
+
+	if (!bss->information_elements ||
+	    !bss->len_information_elements)
+		return;
+
+	/*
+	 * If needed, fragment the IEs buffer (at IE boundaries) into short
+	 * enough fragments to fit into IW_GENERIC_IE_MAX octet messages.
+	 */
+	pos = bss->information_elements;
+	end = pos + bss->len_information_elements;
+
+	while (end - pos > IW_GENERIC_IE_MAX) {
+		/* a fragment always contains at least one whole IE ... */
+		next = pos + 2 + pos[1];
+		/*
+		 * ... and is extended by further whole IEs while it stays
+		 * below IW_GENERIC_IE_MAX.
+		 * NOTE(review): next[1] is read without comparing next
+		 * against end; this looks like it relies on well-formed
+		 * IEs - confirm.
+		 */
+		while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX)
+			next = next + 2 + next[1];
+
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVGENIE;
+		iwe.u.data.length = next - pos;
+		*current_ev = iwe_stream_add_point(info, *current_ev,
+						   end_buf, &iwe, pos);
+
+		pos = next;
+	}
+
+	/* whatever is left fits into a single event */
+	if (end > pos) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVGENIE;
+		iwe.u.data.length = end - pos;
+		*current_ev = iwe_stream_add_point(info, *current_ev,
+						   end_buf, &iwe, pos);
+	}
+}
+
+/*
+ * Return the time elapsed since the jiffies value @start, in
+ * milliseconds.
+ */
+static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
+{
+	/*
+	 * Unsigned subtraction is performed modulo ULONG_MAX + 1, so this
+	 * yields the correct distance even when jiffies has wrapped past
+	 * @start. (Correcting by MAX_JIFFY_OFFSET, which is not the size
+	 * of the counter's range, gives a wrong result after a wrap.)
+	 */
+	return jiffies_to_msecs(jiffies - start);
+}
+
+/*
+ * Translate one BSS entry into wireless-extensions scan events appended
+ * at @current_ev: AP address, channel and frequency, signal quality (if
+ * the wiphy reports one), encryption flag, ESSID / mesh ID, mesh
+ * configuration details, supported bit rates, operating mode, TSF, age of
+ * the entry, and finally the raw IEs.
+ *
+ * @end_buf marks the end of the output buffer. Returns the new write
+ * position in the event stream.
+ */
+static char *
+ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
+	      struct cfg80211_internal_bss *bss, char *current_ev,
+	      char *end_buf)
+{
+	struct iw_event iwe;
+	u8 *buf, *cfg, *p;
+	u8 *ie = bss->pub.information_elements;
+	int rem = bss->pub.len_information_elements, i, sig;
+	bool ismesh = false;
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWAP;
+	iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
+	memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN);
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_ADDR_LEN);
+
+	/* first SIOCGIWFREQ event: channel number (exponent 0) */
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWFREQ;
+	iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq);
+	iwe.u.freq.e = 0;
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_FREQ_LEN);
+
+	/* second SIOCGIWFREQ event: center frequency with exponent 6 */
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWFREQ;
+	iwe.u.freq.m = bss->pub.channel->center_freq;
+	iwe.u.freq.e = 6;
+	current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
+					  IW_EV_FREQ_LEN);
+
+	if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVQUAL;
+		iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED |
+				     IW_QUAL_NOISE_INVALID |
+				     IW_QUAL_QUAL_UPDATED;
+		switch (wiphy->signal_type) {
+		case CFG80211_SIGNAL_TYPE_MBM:
+			sig = bss->pub.signal / 100;
+			iwe.u.qual.level = sig;
+			iwe.u.qual.updated |= IW_QUAL_DBM;
+			if (sig < -110)		/* rather bad */
+				sig = -110;
+			else if (sig > -40)	/* perfect */
+				sig = -40;
+			/* will give a range of 0 .. 70 */
+			iwe.u.qual.qual = sig + 110;
+			break;
+		case CFG80211_SIGNAL_TYPE_UNSPEC:
+			iwe.u.qual.level = bss->pub.signal;
+			/* will give range 0 .. 100 */
+			iwe.u.qual.qual = bss->pub.signal;
+			break;
+		default:
+			/* not reached */
+			break;
+		}
+		current_ev = iwe_stream_add_event(info, current_ev, end_buf,
+						  &iwe, IW_EV_QUAL_LEN);
+	}
+
+	memset(&iwe, 0, sizeof(iwe));
+	iwe.cmd = SIOCGIWENCODE;
+	if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY)
+		iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
+	else
+		iwe.u.data.flags = IW_ENCODE_DISABLED;
+	iwe.u.data.length = 0;
+	current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+					  &iwe, "");
+
+	/* walk the IEs and emit events for the ones wext knows about */
+	while (rem >= 2) {
+		/* invalid data */
+		if (ie[1] > rem - 2)
+			break;
+
+		switch (ie[0]) {
+		case WLAN_EID_SSID:
+			memset(&iwe, 0, sizeof(iwe));
+			iwe.cmd = SIOCGIWESSID;
+			iwe.u.data.length = ie[1];
+			iwe.u.data.flags = 1;
+			current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+							  &iwe, ie + 2);
+			break;
+		case WLAN_EID_MESH_ID:
+			memset(&iwe, 0, sizeof(iwe));
+			iwe.cmd = SIOCGIWESSID;
+			iwe.u.data.length = ie[1];
+			iwe.u.data.flags = 1;
+			current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+							  &iwe, ie + 2);
+			break;
+		case WLAN_EID_MESH_CONFIG:
+			ismesh = true;
+			if (ie[1] != sizeof(struct ieee80211_meshconf_ie))
+				break;
+			/* 50 bytes covers the longest line below (45 chars) */
+			buf = kmalloc(50, GFP_ATOMIC);
+			if (!buf)
+				break;
+			cfg = ie + 2;
+			memset(&iwe, 0, sizeof(iwe));
+			iwe.cmd = IWEVCUSTOM;
+			sprintf(buf, "Mesh Network Path Selection Protocol ID: "
+				"0x%02X", cfg[0]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Path Selection Metric ID: 0x%02X",
+				cfg[1]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Congestion Control Mode ID: 0x%02X",
+				cfg[2]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Authentication ID: 0x%02X", cfg[4]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Formation Info: 0x%02X", cfg[5]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			sprintf(buf, "Capabilities: 0x%02X", cfg[6]);
+			iwe.u.data.length = strlen(buf);
+			current_ev = iwe_stream_add_point(info, current_ev,
+							  end_buf,
+							  &iwe, buf);
+			kfree(buf);
+			break;
+		case WLAN_EID_SUPP_RATES:
+		case WLAN_EID_EXT_SUPP_RATES:
+			/* display all supported rates in readable format */
+			p = current_ev + iwe_stream_lcp_len(info);
+
+			memset(&iwe, 0, sizeof(iwe));
+			iwe.cmd = SIOCGIWRATE;
+			/* Those two flags are ignored... */
+			iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
+
+			for (i = 0; i < ie[1]; i++) {
+				iwe.u.bitrate.value =
+					((ie[i + 2] & 0x7f) * 500000);
+				p = iwe_stream_add_value(info, current_ev, p,
+						end_buf, &iwe, IW_EV_PARAM_LEN);
+			}
+			current_ev = p;
+			break;
+		}
+		rem -= ie[1] + 2;
+		ie += ie[1] + 2;
+	}
+
+	if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) ||
+	    ismesh) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = SIOCGIWMODE;
+		if (ismesh)
+			iwe.u.mode = IW_MODE_MESH;
+		else if (bss->pub.capability & WLAN_CAPABILITY_ESS)
+			iwe.u.mode = IW_MODE_MASTER;
+		else
+			iwe.u.mode = IW_MODE_ADHOC;
+		current_ev = iwe_stream_add_event(info, current_ev, end_buf,
+						  &iwe, IW_EV_UINT_LEN);
+	}
+
+	/*
+	 * " Last beacon: %ums ago" needs 14 + 10 + 6 characters for a
+	 * ten-digit value plus the terminating NUL, i.e. 31 bytes; a
+	 * 30-byte buffer would be overrun by one byte in that case.
+	 */
+	buf = kmalloc(31, GFP_ATOMIC);
+	if (buf) {
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		sprintf(buf, "tsf=%016llx", (unsigned long long)(bss->pub.tsf));
+		iwe.u.data.length = strlen(buf);
+		current_ev = iwe_stream_add_point(info, current_ev, end_buf,
+						  &iwe, buf);
+		memset(&iwe, 0, sizeof(iwe));
+		iwe.cmd = IWEVCUSTOM;
+		sprintf(buf, " Last beacon: %ums ago",
+			elapsed_jiffies_msecs(bss->ts));
+		iwe.u.data.length = strlen(buf);
+		current_ev = iwe_stream_add_point(info, current_ev,
+						  end_buf, &iwe, buf);
+		kfree(buf);
+	}
+
+	ieee80211_scan_add_ies(info, &bss->pub, &current_ev, end_buf);
+
+	return current_ev;
+}
+
+
+/*
+ * Fill @buf (of @len bytes) with wext scan events for every BSS entry of
+ * @dev, after expiring stale entries. Runs under dev->bss_lock.
+ *
+ * Returns the number of bytes written, or -E2BIG if the space left
+ * before an entry is not larger than IW_EV_ADDR_LEN.
+ */
+static int ieee80211_scan_results(struct cfg80211_registered_device *dev,
+				  struct iw_request_info *info,
+				  char *buf, size_t len)
+{
+	char *end_buf = buf + len;
+	char *pos = buf;
+	struct cfg80211_internal_bss *bss;
+	int ret;
+
+	spin_lock_bh(&dev->bss_lock);
+	cfg80211_bss_expire(dev);
+
+	list_for_each_entry(bss, &dev->bss_list, list) {
+		if (end_buf - pos <= IW_EV_ADDR_LEN) {
+			ret = -E2BIG;
+			goto unlock;
+		}
+		pos = ieee80211_bss(&dev->wiphy, info, bss, pos, end_buf);
+	}
+	ret = pos - buf;
+
+ unlock:
+	spin_unlock_bh(&dev->bss_lock);
+	return ret;
+}
+
+
+/*
+ * Wireless-extensions SIOCGIWSCAN handler: copy the current scan results
+ * into @extra and store the number of bytes written in @data->length.
+ *
+ * Returns 0 on success, -ENETDOWN if the interface is down, -EAGAIN while
+ * a scan is still pending, or the error from ieee80211_scan_results()
+ * (in which case @data->length is set to 0).
+ */
+int cfg80211_wext_giwscan(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_point *data, char *extra)
+{
+	struct cfg80211_registered_device *rdev;
+	int res;
+
+	if (!netif_running(dev))
+		return -ENETDOWN;
+
+	rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex);
+
+	if (IS_ERR(rdev))
+		return PTR_ERR(rdev);
+
+	if (rdev->scan_req) {
+		res = -EAGAIN;
+		goto out;
+	}
+
+	res = ieee80211_scan_results(rdev, info, extra, data->length);
+	if (res < 0) {
+		data->length = 0;
+	} else {
+		/* a non-negative result is the number of bytes produced */
+		data->length = res;
+		res = 0;
+	}
+
+ out:
+	cfg80211_unlock_rdev(rdev);
+	return res;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwscan);
+#endif
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
new file mode 100644
index 00000000..cf4be212
--- /dev/null
+++ b/net/wireless/sme.c
@@ -0,0 +1,1020 @@
+/*
+ * SME code for cfg80211's connect emulation.
+ *
+ * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2009   Intel Corporation. All rights reserved.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/wireless.h>
+#include <net/iw_handler.h>
+#include <net/cfg80211.h>
+#include <net/rtnetlink.h>
+#include "nl80211.h"
+#include "reg.h"
+
+struct cfg80211_conn {
+	struct cfg80211_connect_params params;	/* copy of the connect request */
+	/* these are sub-states of the _CONNECTING sme_state */
+	enum {
+		CFG80211_CONN_IDLE,
+		CFG80211_CONN_SCANNING,
+		CFG80211_CONN_SCAN_AGAIN,	/* scan (again) from conn_work */
+		CFG80211_CONN_AUTHENTICATE_NEXT,
+		CFG80211_CONN_AUTHENTICATING,
+		CFG80211_CONN_ASSOCIATE_NEXT,
+		CFG80211_CONN_ASSOCIATING,
+		CFG80211_CONN_DEAUTH_ASSOC_FAIL, /* deauth, then report failure */
+	} state;
+	u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; /* params.bssid points at bssid */
+	u8 *ie;			/* kmemdup()ed copy of the request IEs */
+	size_t ie_len;
+	bool auto_auth, prev_bssid_valid; /* auto_auth: try other auth types */
+};
+
+/*
+ * True iff every wireless_dev of every registered device is SME-idle.
+ */
+static bool cfg80211_is_all_idle(void)
+{
+	struct cfg80211_registered_device *rdev;
+	struct wireless_dev *wdev;
+	bool busy = false;
+
+	mutex_lock(&cfg80211_mutex);
+	/*
+	 * All devices must be idle as otherwise if you are actively
+	 * scanning some new beacon hints could be learned and would
+	 * count as new regulatory hints.
+	 */
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		cfg80211_lock_rdev(rdev);
+		list_for_each_entry(wdev, &rdev->netdev_list, list) {
+			wdev_lock(wdev);
+			busy |= (wdev->sme_state != CFG80211_SME_IDLE);
+			wdev_unlock(wdev);
+		}
+		cfg80211_unlock_rdev(rdev);
+	}
+	mutex_unlock(&cfg80211_mutex);
+
+	return !busy;
+}
+
+/* Deferred work: send the disconnect regulatory hint, but only when
+ * every wireless device is idle. */
+static void disconnect_work(struct work_struct *work)
+{
+	if (cfg80211_is_all_idle())
+		regulatory_hint_disconnect();
+}
+
+static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work);
+
+/*
+ * Scan for the pending connection's SSID (one channel if set, else all).
+ * Returns -EBUSY if a scan is outstanding, -ENOMEM, or the driver's result.
+ */
+static int cfg80211_conn_scan(struct wireless_dev *wdev)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_connect_params *params;
+	struct ieee80211_supported_band *sband;
+	struct cfg80211_scan_request *request;
+	enum ieee80211_band band;
+	int n_channels = 0, idx = 0, j, err;
+
+	ASSERT_RTNL();
+	ASSERT_RDEV_LOCK(rdev);
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (rdev->scan_req)
+		return -EBUSY;
+
+	params = &wdev->conn->params;
+	if (params->channel) {
+		n_channels = 1;
+	} else {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+			sband = wdev->wiphy->bands[band];
+			if (sband)
+				n_channels += sband->n_channels;
+		}
+	}
+
+	/* one allocation: request, channel pointer array, then one SSID */
+	request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) +
+			  sizeof(request->channels[0]) * n_channels,
+			  GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	if (params->channel) {
+		request->channels[0] = params->channel;
+	} else {
+		for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+			sband = wdev->wiphy->bands[band];
+			if (!sband)
+				continue;
+			for (j = 0; j < sband->n_channels; j++)
+				request->channels[idx++] = &sband->channels[j];
+		}
+	}
+	request->n_channels = n_channels;
+	request->ssids = (void *)&request->channels[n_channels];
+	request->n_ssids = 1;
+
+	memcpy(request->ssids[0].ssid, params->ssid, params->ssid_len);
+	request->ssids[0].ssid_len = params->ssid_len;
+	request->dev = wdev->netdev;
+	request->wiphy = &rdev->wiphy;
+	rdev->scan_req = request;
+
+	err = rdev->ops->scan(wdev->wiphy, wdev->netdev, request);
+	if (err) {
+		rdev->scan_req = NULL;
+		kfree(request);
+		return err;
+	}
+	wdev->conn->state = CFG80211_CONN_SCANNING;
+	nl80211_send_scan_start(rdev, wdev->netdev);
+	dev_hold(wdev->netdev);
+	return 0;
+}
+
+/*
+ * Advance the connection state machine of @wdev by one step, selected by
+ * wdev->conn->state.  Returns 0 when there is nothing to do or the step
+ * was started, otherwise a negative error code.
+ */
+static int cfg80211_conn_do_work(struct wireless_dev *wdev)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_connect_params *p;
+	const u8 *reassoc_from;
+	int ret;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (!wdev->conn)
+		return 0;
+
+	p = &wdev->conn->params;
+	switch (wdev->conn->state) {
+	case CFG80211_CONN_SCAN_AGAIN:
+		return cfg80211_conn_scan(wdev);
+	case CFG80211_CONN_AUTHENTICATE_NEXT:
+		BUG_ON(!rdev->ops->auth);
+		wdev->conn->state = CFG80211_CONN_AUTHENTICATING;
+		return __cfg80211_mlme_auth(rdev, wdev->netdev,
+					    p->channel, p->auth_type, p->bssid,
+					    p->ssid, p->ssid_len, NULL, 0,
+					    p->key, p->key_len, p->key_idx,
+					    false);
+	case CFG80211_CONN_ASSOCIATE_NEXT:
+		BUG_ON(!rdev->ops->assoc);
+		wdev->conn->state = CFG80211_CONN_ASSOCIATING;
+		reassoc_from = wdev->conn->prev_bssid_valid ?
+				wdev->conn->prev_bssid : NULL;
+		ret = __cfg80211_mlme_assoc(rdev, wdev->netdev,
+					    p->channel, p->bssid, reassoc_from,
+					    p->ssid, p->ssid_len,
+					    p->ie, p->ie_len,
+					    false, &p->crypto);
+		if (!ret)
+			return 0;
+		/* could not start the association: deauthenticate again */
+		__cfg80211_mlme_deauth(rdev, wdev->netdev, p->bssid, NULL, 0,
+				       WLAN_REASON_DEAUTH_LEAVING, false);
+		return ret;
+	case CFG80211_CONN_DEAUTH_ASSOC_FAIL:
+		__cfg80211_mlme_deauth(rdev, wdev->netdev, p->bssid, NULL, 0,
+				       WLAN_REASON_DEAUTH_LEAVING, false);
+		/* return an error so that we call __cfg80211_connect_result() */
+		return -EINVAL;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Work handler: step the connect state machine of each running, connecting
+ * interface; a failing step goes to __cfg80211_connect_result().
+ */
+void cfg80211_conn_work(struct work_struct *work)
+{
+	struct cfg80211_registered_device *rdev =
+		container_of(work, struct cfg80211_registered_device, conn_work);
+	struct wireless_dev *wdev;
+	u8 bssid_buf[ETH_ALEN], *bssid;
+
+	rtnl_lock();
+	cfg80211_lock_rdev(rdev);
+	mutex_lock(&rdev->devlist_mtx);
+
+	list_for_each_entry(wdev, &rdev->netdev_list, list) {
+		wdev_lock(wdev);
+		if (!netif_running(wdev->netdev) || !wdev->conn ||
+		    wdev->sme_state != CFG80211_SME_CONNECTING) {
+			wdev_unlock(wdev);
+			continue;
+		}
+		/* reset per interface: never report an earlier wdev's BSSID */
+		bssid = NULL;
+		if (wdev->conn->params.bssid) {
+			memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN);
+			bssid = bssid_buf;
+		}
+		if (cfg80211_conn_do_work(wdev))
+			__cfg80211_connect_result(wdev->netdev, bssid, NULL, 0,
+				NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE,
+				false, NULL);
+		wdev_unlock(wdev);
+	}
+
+	mutex_unlock(&rdev->devlist_mtx);
+	cfg80211_unlock_rdev(rdev);
+	rtnl_unlock();
+}
+
+/*
+ * Look up the BSS of the pending connection.  On a hit, store its BSSID and
+ * channel in the connection and queue authentication.  Returns it or NULL.
+ */
+static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_conn *conn = wdev->conn;
+	struct cfg80211_bss *found;
+	u16 want = WLAN_CAPABILITY_ESS;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (conn->params.privacy)
+		want |= WLAN_CAPABILITY_PRIVACY;
+	found = cfg80211_get_bss(wdev->wiphy, conn->params.channel,
+			conn->params.bssid, conn->params.ssid,
+			conn->params.ssid_len,
+			WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_PRIVACY, want);
+	if (found) {
+		memcpy(conn->bssid, found->bssid, ETH_ALEN);
+		conn->params.bssid = conn->bssid;
+		conn->params.channel = found->channel;
+		conn->state = CFG80211_CONN_AUTHENTICATE_NEXT;
+		schedule_work(&rdev->conn_work);
+	}
+	return found;
+}
+
+/*
+ * Scan completion hook for the in-kernel SME.  If this interface was
+ * waiting on a scan, look for the target BSS; when it is missing either
+ * queue another scan (SCAN_AGAIN) or fail the connection attempt.
+ */
+static void __cfg80211_sme_scan_done(struct net_device *dev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_bss *bss;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (wdev->sme_state != CFG80211_SME_CONNECTING || !wdev->conn)
+		return;
+
+	if (!(wdev->conn->state == CFG80211_CONN_SCANNING ||
+	      wdev->conn->state == CFG80211_CONN_SCAN_AGAIN))
+		return;
+
+	bss = cfg80211_get_conn_bss(wdev);
+	if (bss) {
+		cfg80211_put_bss(bss);
+		return;
+	}
+
+	/* not found */
+	if (wdev->conn->state == CFG80211_CONN_SCAN_AGAIN)
+		schedule_work(&rdev->conn_work);
+	else
+		__cfg80211_connect_result(wdev->netdev,
+				wdev->conn->params.bssid, NULL, 0, NULL, 0,
+				WLAN_STATUS_UNSPECIFIED_FAILURE, false, NULL);
+}
+
+void cfg80211_sme_scan_done(struct net_device *dev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	/* devlist_mtx is taken outside the wdev lock around the real handler */
+	mutex_lock(&wiphy_to_dev(wdev->wiphy)->devlist_mtx);
+	wdev_lock(wdev);
+	__cfg80211_sme_scan_done(dev);
+	wdev_unlock(wdev);
+	mutex_unlock(&wiphy_to_dev(wdev->wiphy)->devlist_mtx);
+}
+
+/*
+ * Handle a received authentication frame for the in-kernel SME: retry
+ * with the next automatically selected auth type, report a failure
+ * status, or move on to association when authentication succeeded.
+ */
+void cfg80211_sme_rx_auth(struct net_device *dev,
+			  const u8 *buf, size_t len)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
+	u16 status_code = le16_to_cpu(mgmt->u.auth.status_code);
+	enum nl80211_auth_type next;
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	/* should only RX auth frames when connecting */
+	if (wdev->sme_state != CFG80211_SME_CONNECTING)
+		return;
+
+	if (WARN_ON(!wdev->conn))
+		return;
+
+	if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG &&
+	    wdev->conn->auto_auth &&
+	    wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) {
+		/* select automatically between only open, shared, leap */
+		switch (wdev->conn->params.auth_type) {
+		case NL80211_AUTHTYPE_OPEN_SYSTEM:
+			next = wdev->connect_keys ?
+				NL80211_AUTHTYPE_SHARED_KEY :
+				NL80211_AUTHTYPE_NETWORK_EAP;
+			break;
+		case NL80211_AUTHTYPE_SHARED_KEY:
+			next = NL80211_AUTHTYPE_NETWORK_EAP;
+			break;
+		default:
+			/* huh? */
+			next = NL80211_AUTHTYPE_OPEN_SYSTEM;
+			break;
+		}
+		wdev->conn->params.auth_type = next;
+		wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT;
+		schedule_work(&rdev->conn_work);
+	} else if (status_code != WLAN_STATUS_SUCCESS) {
+		__cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, NULL, 0,
+					  status_code, false, NULL);
+	} else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) {
+		wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
+		schedule_work(&rdev->conn_work);
+	}
+}
+
+/*
+ * If the failed attempt was a reassociation, queue a plain association
+ * and return true; return false when there is nothing to fall back to.
+ */
+bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_conn *conn = wdev->conn;
+
+	if (WARN_ON(!conn) || !conn->prev_bssid_valid)
+		return false;
+
+	/*
+	 * Some stupid APs don't accept reassoc, so we
+	 * need to fall back to trying regular assoc.
+	 */
+	conn->prev_bssid_valid = false;
+	conn->state = CFG80211_CONN_ASSOCIATE_NEXT;
+	schedule_work(&rdev->conn_work);
+	return true;
+}
+
+/* Association failed: let the connection work deauthenticate and fail. */
+void cfg80211_sme_failed_assoc(struct wireless_dev *wdev)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+
+	wdev->conn->state = CFG80211_CONN_DEAUTH_ASSOC_FAIL;
+	schedule_work(&rdev->conn_work);
+}
+
+void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
+			       const u8 *req_ie, size_t req_ie_len,
+			       const u8 *resp_ie, size_t resp_ie_len,
+			       u16 status, bool wextev,
+			       struct cfg80211_bss *bss)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	u8 *country_ie;
+#ifdef CONFIG_CFG80211_WEXT
+	union iwreq_data wrqu;
+#endif
+	/* Finish a connection attempt: notify userspace, update SME state. */
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+		return;
+	/* a result is only acted upon while a connection attempt is pending */
+	if (wdev->sme_state != CFG80211_SME_CONNECTING)
+		return;
+
+	nl80211_send_connect_result(wiphy_to_dev(wdev->wiphy), dev,
+				    bssid, req_ie, req_ie_len,
+				    resp_ie, resp_ie_len,
+				    status, GFP_KERNEL);
+
+#ifdef CONFIG_CFG80211_WEXT
+	if (wextev) {
+		if (req_ie && status == WLAN_STATUS_SUCCESS) {
+			memset(&wrqu, 0, sizeof(wrqu));
+			wrqu.data.length = req_ie_len;
+			wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, req_ie);
+		}
+
+		if (resp_ie && status == WLAN_STATUS_SUCCESS) {
+			memset(&wrqu, 0, sizeof(wrqu));
+			wrqu.data.length = resp_ie_len;
+			wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, resp_ie);
+		}
+
+		memset(&wrqu, 0, sizeof(wrqu));
+		wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+		if (bssid && status == WLAN_STATUS_SUCCESS) {
+			memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+			memcpy(wdev->wext.prev_bssid, bssid, ETH_ALEN);
+			wdev->wext.prev_bssid_valid = true;
+		}
+		wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+	}
+#endif
+	/* drop the hold and reference on any BSS that is still current */
+	if (wdev->current_bss) {
+		cfg80211_unhold_bss(wdev->current_bss);
+		cfg80211_put_bss(&wdev->current_bss->pub);
+		wdev->current_bss = NULL;
+	}
+
+	if (wdev->conn)
+		wdev->conn->state = CFG80211_CONN_IDLE;
+	/* failure: free all connection state and go back to SME idle */
+	if (status != WLAN_STATUS_SUCCESS) {
+		wdev->sme_state = CFG80211_SME_IDLE;
+		if (wdev->conn)
+			kfree(wdev->conn->ie);
+		kfree(wdev->conn);
+		wdev->conn = NULL;
+		kfree(wdev->connect_keys);
+		wdev->connect_keys = NULL;
+		wdev->ssid_len = 0;
+		return;
+	}
+	/* success without a BSS from the caller: look it up by BSSID/SSID */
+	if (!bss)
+		bss = cfg80211_get_bss(wdev->wiphy,
+				       wdev->conn ? wdev->conn->params.channel :
+				       NULL,
+				       bssid,
+				       wdev->ssid, wdev->ssid_len,
+				       WLAN_CAPABILITY_ESS,
+				       WLAN_CAPABILITY_ESS);
+
+	if (WARN_ON(!bss))
+		return;
+
+	cfg80211_hold_bss(bss_from_pub(bss));
+	wdev->current_bss = bss_from_pub(bss);
+
+	wdev->sme_state = CFG80211_SME_CONNECTED;
+	cfg80211_upload_connect_keys(wdev);
+
+	country_ie = (u8 *) ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY);
+
+	if (!country_ie)
+		return;
+
+	/*
+	 * ieee80211_bss_get_ie() ensures we can access:
+	 * - country_ie + 2, the start of the country ie data, and
+	 * - and country_ie[1] which is the IE length
+	 */
+	regulatory_hint_11d(wdev->wiphy,
+			    bss->channel->band,
+			    country_ie + 2,
+			    country_ie[1]);
+}
+
+void cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
+			     const u8 *req_ie, size_t req_ie_len,
+			     const u8 *resp_ie, size_t resp_ie_len,
+			     u16 status, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_event *ev;
+	unsigned long flags;
+	u8 *tail;
+
+	CFG80211_DEV_WARN_ON(wdev->sme_state != CFG80211_SME_CONNECTING);
+
+	ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp);
+	if (!ev)
+		return;
+	/* queue a copy of the result; the IE copies sit right behind *ev */
+	tail = (u8 *)ev + sizeof(*ev);
+	ev->type = EVENT_CONNECT_RESULT;
+	ev->cr.status = status;
+	if (bssid)
+		memcpy(ev->cr.bssid, bssid, ETH_ALEN);
+	if (req_ie_len) {
+		ev->cr.req_ie = memcpy(tail, req_ie, req_ie_len);
+		ev->cr.req_ie_len = req_ie_len;
+	}
+	if (resp_ie_len) {
+		ev->cr.resp_ie = memcpy(tail + req_ie_len, resp_ie, resp_ie_len);
+		ev->cr.resp_ie_len = resp_ie_len;
+	}
+
+	spin_lock_irqsave(&wdev->event_lock, flags);
+	list_add_tail(&ev->list, &wdev->event_list);
+	spin_unlock_irqrestore(&wdev->event_lock, flags);
+	queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_connect_result);
+
+void __cfg80211_roamed(struct wireless_dev *wdev,
+		       struct ieee80211_channel *channel,
+		       const u8 *bssid,
+		       const u8 *req_ie, size_t req_ie_len,
+		       const u8 *resp_ie, size_t resp_ie_len)
+{
+	struct cfg80211_bss *bss;
+#ifdef CONFIG_CFG80211_WEXT
+	union iwreq_data wrqu;
+#endif
+	/* Switch current_bss to the BSS roamed to and notify nl80211/wext. */
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+		return;
+
+	if (wdev->sme_state != CFG80211_SME_CONNECTED)
+		return;
+
+	/* internal error -- how did we get to CONNECTED w/o BSS? */
+	if (WARN_ON(!wdev->current_bss)) {
+		return;
+	}
+	/* release the old BSS before looking up the one we roamed to */
+	cfg80211_unhold_bss(wdev->current_bss);
+	cfg80211_put_bss(&wdev->current_bss->pub);
+	wdev->current_bss = NULL;
+
+	bss = cfg80211_get_bss(wdev->wiphy, channel, bssid,
+			       wdev->ssid, wdev->ssid_len,
+			       WLAN_CAPABILITY_ESS, WLAN_CAPABILITY_ESS);
+
+	if (WARN_ON(!bss))
+		return;
+
+	cfg80211_hold_bss(bss_from_pub(bss));
+	wdev->current_bss = bss_from_pub(bss);
+
+	nl80211_send_roamed(wiphy_to_dev(wdev->wiphy), wdev->netdev, bssid,
+			    req_ie, req_ie_len, resp_ie, resp_ie_len,
+			    GFP_KERNEL);
+
+#ifdef CONFIG_CFG80211_WEXT
+	if (req_ie) {
+		memset(&wrqu, 0, sizeof(wrqu));
+		wrqu.data.length = req_ie_len;
+		wireless_send_event(wdev->netdev, IWEVASSOCREQIE,
+				    &wrqu, req_ie);
+	}
+
+	if (resp_ie) {
+		memset(&wrqu, 0, sizeof(wrqu));
+		wrqu.data.length = resp_ie_len;
+		wireless_send_event(wdev->netdev, IWEVASSOCRESPIE,
+				    &wrqu, resp_ie);
+	}
+
+	memset(&wrqu, 0, sizeof(wrqu));
+	wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+	memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
+	memcpy(wdev->wext.prev_bssid, bssid, ETH_ALEN);
+	wdev->wext.prev_bssid_valid = true;
+	wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL);
+#endif
+}
+
+/* Queue a roam notification (with copies of both IE buffers) as an event. */
+void cfg80211_roamed(struct net_device *dev,
+		     struct ieee80211_channel *channel, const u8 *bssid,
+		     const u8 *req_ie, size_t req_ie_len,
+		     const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_event *ev;
+	unsigned long flags;
+	u8 *tail;
+
+	CFG80211_DEV_WARN_ON(wdev->sme_state != CFG80211_SME_CONNECTED);
+
+	ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp);
+	if (!ev)
+		return;
+
+	tail = (u8 *)ev + sizeof(*ev);
+	ev->type = EVENT_ROAMED;
+	ev->rm.channel = channel;
+	memcpy(ev->rm.bssid, bssid, ETH_ALEN);
+	ev->rm.req_ie = memcpy(tail, req_ie, req_ie_len);
+	ev->rm.req_ie_len = req_ie_len;
+	ev->rm.resp_ie = memcpy(tail + req_ie_len, resp_ie, resp_ie_len);
+	ev->rm.resp_ie_len = resp_ie_len;
+
+	spin_lock_irqsave(&wdev->event_lock, flags);
+	list_add_tail(&ev->list, &wdev->event_list);
+	spin_unlock_irqrestore(&wdev->event_lock, flags);
+	queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_roamed);
+
+void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
+			     size_t ie_len, u16 reason, bool from_ap)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int i;
+#ifdef CONFIG_CFG80211_WEXT
+	union iwreq_data wrqu;
+#endif
+	/* Reset the SME state after a disconnect and notify nl80211/wext. */
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+		return;
+
+#ifndef CONFIG_CFG80211_ALLOW_RECONNECT
+	if (wdev->sme_state != CFG80211_SME_CONNECTED)
+		return;
+#endif
+	/* drop the hold and reference on the BSS we were connected to */
+	if (wdev->current_bss) {
+		cfg80211_unhold_bss(wdev->current_bss);
+		cfg80211_put_bss(&wdev->current_bss->pub);
+	}
+
+	wdev->current_bss = NULL;
+	wdev->sme_state = CFG80211_SME_IDLE;
+	wdev->ssid_len = 0;
+
+	if (wdev->conn) {
+		const u8 *bssid;
+		int ret;
+
+		kfree(wdev->conn->ie);
+		wdev->conn->ie = NULL;
+		kfree(wdev->conn);
+		wdev->conn = NULL;
+
+		/*
+		 * If this disconnect was due to a disassoc, we
+		 * might still have an auth BSS around. For
+		 * the userspace SME that's currently expected,
+		 * but for the kernel SME (nl80211 CONNECT or
+		 * wireless extensions) we want to clear up all
+		 * state.
+		 */
+		for (i = 0; i < MAX_AUTH_BSSES; i++) {
+			if (!wdev->auth_bsses[i])
+				continue;
+			bssid = wdev->auth_bsses[i]->pub.bssid;
+			ret = __cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0,
+						WLAN_REASON_DEAUTH_LEAVING,
+						false);
+			WARN(ret, "deauth failed: %d\n", ret);
+		}
+	}
+
+	nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
+
+	/*
+	 * Delete all the keys (indices 0 to 5) ... pairwise keys can't
+	 * really exist any more anyway, but default keys might.
+	 */
+	if (rdev->ops->del_key)
+		for (i = 0; i < 6; i++)
+			rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
+
+#ifdef CONFIG_CFG80211_WEXT
+	memset(&wrqu, 0, sizeof(wrqu));
+	wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+	wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
+	wdev->wext.connect.ssid_len = 0;
+#endif
+
+	schedule_work(&cfg80211_disconnect_work);
+}
+
+/* Queue a disconnect notification (with a copy of @ie) as an event. */
+void cfg80211_disconnected(struct net_device *dev, u16 reason,
+			   u8 *ie, size_t ie_len, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_event *ev;
+	unsigned long flags;
+
+	CFG80211_DEV_WARN_ON(wdev->sme_state != CFG80211_SME_CONNECTED);
+
+	ev = kzalloc(sizeof(*ev) + ie_len, gfp);
+	if (!ev)
+		return;
+
+	ev->type = EVENT_DISCONNECTED;
+	ev->dc.ie = memcpy(ev + 1, ie, ie_len);
+	ev->dc.ie_len = ie_len;
+	ev->dc.reason = reason;
+
+	spin_lock_irqsave(&wdev->event_lock, flags);
+	list_add_tail(&ev->list, &wdev->event_list);
+	spin_unlock_irqrestore(&wdev->event_lock, flags);
+	queue_work(cfg80211_wq, &rdev->event_work);
+}
+EXPORT_SYMBOL(cfg80211_disconnected);
+
+int __cfg80211_connect(struct cfg80211_registered_device *rdev,
+		       struct net_device *dev,
+		       struct cfg80211_connect_params *connect,
+		       struct cfg80211_cached_keys *connkeys,
+		       const u8 *prev_bssid)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_bss *bss = NULL;
+	int err;
+	/* Start a connection via ops->connect or the auth/assoc SME below. */
+	ASSERT_WDEV_LOCK(wdev);
+
+#ifndef CONFIG_CFG80211_ALLOW_RECONNECT
+	if (wdev->sme_state != CFG80211_SME_IDLE)
+		return -EALREADY;
+	/* leftover cached keys are unexpected here: warn, then free them */
+	if (WARN_ON(wdev->connect_keys)) {
+#else
+	if (wdev->connect_keys) {
+#endif
+		kfree(wdev->connect_keys);
+		wdev->connect_keys = NULL;
+	}
+
+	if (connkeys && connkeys->def >= 0) {
+		int idx;
+		u32 cipher;
+
+		idx = connkeys->def;
+		cipher = connkeys->params[idx].cipher;
+		/* If given a WEP key we may need it for shared key auth */
+		if (cipher == WLAN_CIPHER_SUITE_WEP40 ||
+		    cipher == WLAN_CIPHER_SUITE_WEP104) {
+			connect->key_idx = idx;
+			connect->key = connkeys->params[idx].key;
+			connect->key_len = connkeys->params[idx].key_len;
+
+			/*
+			 * If ciphers are not set (e.g. when going through
+			 * iwconfig), we have to set them appropriately here.
+			 */
+			if (connect->crypto.cipher_group == 0)
+				connect->crypto.cipher_group = cipher;
+
+			if (connect->crypto.n_ciphers_pairwise == 0) {
+				connect->crypto.n_ciphers_pairwise = 1;
+				connect->crypto.ciphers_pairwise[0] = cipher;
+			}
+		}
+	}
+	/* without a connect op the connection is run by the SME in this file */
+	if (!rdev->ops->connect) {
+		if (!rdev->ops->auth || !rdev->ops->assoc)
+			return -EOPNOTSUPP;
+
+		if (WARN_ON(wdev->conn))
+			return -EINPROGRESS;
+
+		wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL);
+		if (!wdev->conn)
+			return -ENOMEM;
+
+		/*
+		 * Copy all parameters, and treat explicitly IEs, BSSID, SSID.
+		 */
+		memcpy(&wdev->conn->params, connect, sizeof(*connect));
+		if (connect->bssid) {
+			wdev->conn->params.bssid = wdev->conn->bssid;
+			memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN);
+		}
+
+		if (connect->ie) {
+			wdev->conn->ie = kmemdup(connect->ie, connect->ie_len,
+						GFP_KERNEL);
+			wdev->conn->params.ie = wdev->conn->ie;
+			if (!wdev->conn->ie) {
+				kfree(wdev->conn);
+				wdev->conn = NULL;
+				return -ENOMEM;
+			}
+		}
+
+		if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) {
+			wdev->conn->auto_auth = true;
+			/* start with open system ... should mostly work */
+			wdev->conn->params.auth_type =
+				NL80211_AUTHTYPE_OPEN_SYSTEM;
+		} else {
+			wdev->conn->auto_auth = false;
+		}
+
+		memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
+		wdev->ssid_len = connect->ssid_len;
+		wdev->conn->params.ssid = wdev->ssid;
+		wdev->conn->params.ssid_len = connect->ssid_len;
+
+		/* see if we have the bss already */
+		bss = cfg80211_get_conn_bss(wdev);
+
+		wdev->sme_state = CFG80211_SME_CONNECTING;
+		wdev->connect_keys = connkeys;
+
+		if (prev_bssid) {
+			memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN);
+			wdev->conn->prev_bssid_valid = true;
+		}
+
+		/* we're good if we have a matching bss struct */
+		if (bss) {
+			wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT;
+			err = cfg80211_conn_do_work(wdev);
+			cfg80211_put_bss(bss);
+		} else {
+			/* otherwise we'll need to scan for the AP first */
+			err = cfg80211_conn_scan(wdev);
+			/*
+			 * If we can't scan right now, then we need to scan again
+			 * after the current scan finished, since the parameters
+			 * changed (unless we find a good AP anyway).
+			 */
+			if (err == -EBUSY) {
+				err = 0;
+				wdev->conn->state = CFG80211_CONN_SCAN_AGAIN;
+			}
+		}
+		if (err) {
+			kfree(wdev->conn->ie);
+			kfree(wdev->conn);
+			wdev->conn = NULL;
+			wdev->sme_state = CFG80211_SME_IDLE;
+			wdev->connect_keys = NULL;
+			wdev->ssid_len = 0;
+		}
+
+		return err;
+	} else {
+		wdev->sme_state = CFG80211_SME_CONNECTING;
+		wdev->connect_keys = connkeys;
+		err = rdev->ops->connect(&rdev->wiphy, dev, connect);
+		if (err) {
+			wdev->connect_keys = NULL;
+			wdev->sme_state = CFG80211_SME_IDLE;
+			return err;
+		}
+
+		memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
+		wdev->ssid_len = connect->ssid_len;
+
+		return 0;
+	}
+}
+
+int cfg80211_connect(struct cfg80211_registered_device *rdev,
+		     struct net_device *dev,
+		     struct cfg80211_connect_params *connect,
+		     struct cfg80211_cached_keys *connkeys)
+{
+	int err;
+	/* locked wrapper; NULL prev_bssid means no previous BSSID is passed */
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(dev->ieee80211_ptr);
+	err = __cfg80211_connect(rdev, dev, connect, connkeys, NULL);
+	wdev_unlock(dev->ieee80211_ptr);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return err;
+}
+
+int __cfg80211_disconnect(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev, u16 reason, bool wextev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+	/* Tear down the current connection (attempt) via driver op or SME. */
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (wdev->sme_state == CFG80211_SME_IDLE)
+		return -EINVAL;
+
+	kfree(wdev->connect_keys);
+	wdev->connect_keys = NULL;
+
+	if (!rdev->ops->disconnect) {
+		if (!rdev->ops->deauth)
+			return -EOPNOTSUPP;
+
+		/* was it connected by userspace SME? */
+		if (!wdev->conn) {
+			cfg80211_mlme_down(rdev, dev);
+			return 0;
+		}
+		/* still in a scanning sub-state: just drop the SME state */
+		if (wdev->sme_state == CFG80211_SME_CONNECTING &&
+		    (wdev->conn->state == CFG80211_CONN_SCANNING ||
+		     wdev->conn->state == CFG80211_CONN_SCAN_AGAIN)) {
+			wdev->sme_state = CFG80211_SME_IDLE;
+			kfree(wdev->conn->ie);
+			kfree(wdev->conn);
+			wdev->conn = NULL;
+			wdev->ssid_len = 0;
+			return 0;
+		}
+
+		/* wdev->conn->params.bssid must be set if > SCANNING */
+		err = __cfg80211_mlme_deauth(rdev, dev,
+					     wdev->conn->params.bssid,
+					     NULL, 0, reason, false);
+		if (err)
+			return err;
+	} else {
+		err = rdev->ops->disconnect(&rdev->wiphy, dev, reason);
+		if (err)
+			return err;
+	}
+	/* report the outcome that matches the state we are still in */
+	if (wdev->sme_state == CFG80211_SME_CONNECTED)
+		__cfg80211_disconnected(dev, NULL, 0, 0, false);
+	else if (wdev->sme_state == CFG80211_SME_CONNECTING)
+		__cfg80211_connect_result(dev, NULL, NULL, 0, NULL, 0,
+					  WLAN_STATUS_UNSPECIFIED_FAILURE,
+					  wextev, NULL);
+
+	return 0;
+}
+
+int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
+			struct net_device *dev,
+			u16 reason, bool wextev)
+{
+	int err;
+	/* __cfg80211_disconnect() asserts that the wdev lock is held */
+	wdev_lock(dev->ieee80211_ptr);
+	err = __cfg80211_disconnect(rdev, dev, reason, wextev);
+	wdev_unlock(dev->ieee80211_ptr);
+
+	return err;
+}
+
+/* After a disassociation of a connection owned by this SME, also
+ * deauthenticate from the authenticated BSS in slot @idx. */
+void cfg80211_sme_disassoc(struct net_device *dev, int idx)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_internal_bss *auth_bss;
+	u8 bssid[ETH_ALEN];
+
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (!wdev->conn || wdev->conn->state == CFG80211_CONN_IDLE)
+		return;
+
+	/*
+	 * Ok, so the association was made by this SME -- we don't
+	 * want it any more so deauthenticate too.
+	 */
+	auth_bss = wdev->auth_bsses[idx];
+	if (!auth_bss)
+		return;
+
+	memcpy(bssid, auth_bss->pub.bssid, ETH_ALEN);
+	if (!__cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0,
+				    WLAN_REASON_DEAUTH_LEAVING, false))
+		return;
+
+	/* whatever -- assume gone anyway */
+	cfg80211_unhold_bss(wdev->auth_bsses[idx]);
+	cfg80211_put_bss(&wdev->auth_bsses[idx]->pub);
+	wdev->auth_bsses[idx] = NULL;
+}
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
new file mode 100644
index 00000000..c6e4ca6a
--- /dev/null
+++ b/net/wireless/sysfs.c
@@ -0,0 +1,151 @@
+/*
+ * This file provides /sys/class/ieee80211/<wiphy name>/
+ * and some default attributes.
+ *
+ * Copyright 2005-2006	Jiri Benc <jbenc@suse.cz>
+ * Copyright 2006	Johannes Berg <johannes@sipsolutions.net>
+ *
+ * This file is GPLv2 as found in COPYING.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/nl80211.h>
+#include <linux/rtnetlink.h>
+#include <net/cfg80211.h>
+#include "sysfs.h"
+#include "core.h"
+
+/* Map the device embedded in a wiphy back to its registered device. */
+static inline struct cfg80211_registered_device *dev_to_rdev(struct device *dev)
+{
+	return container_of(dev, struct cfg80211_registered_device, wiphy.dev);
+}
+
+#define SHOW_FMT(name, fmt, member)					\
+static ssize_t name ## _show(struct device *dev,			\
+			      struct device_attribute *attr,		\
+			      char *buf)				\
+{									\
+	return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member);	\
+}
+/* each use defines <name>_show() printing one rdev member plus newline */
+SHOW_FMT(index, "%d", wiphy_idx);
+SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
+SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
+
+/* sysfs "name" attribute: the wiphy's device name followed by a newline. */
+static ssize_t name_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", dev_name(&dev_to_rdev(dev)->wiphy.dev));
+}
+
+
+/* sysfs "addresses": one MAC per line; perm_addr if there is no list. */
+static ssize_t addresses_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
+	ssize_t len = 0;
+	int i;
+
+	if (!wiphy->addresses)
+		return sprintf(buf, "%pM\n", wiphy->perm_addr);
+
+	for (i = 0; i < wiphy->n_addresses; i++)
+		len += sprintf(buf + len, "%pM\n", &wiphy->addresses[i].addr);
+
+	return len;
+}
+
+static struct device_attribute ieee80211_dev_attrs[] = {
+	__ATTR_RO(index),
+	__ATTR_RO(macaddress),
+	__ATTR_RO(address_mask),
+	__ATTR_RO(addresses),
+	__ATTR_RO(name),
+	{}
+};
+
+/* Class dev_release callback: hand the registered device that contains
+ * @dev to cfg80211_dev_free(). */
+static void wiphy_dev_release(struct device *dev)
+{
+	cfg80211_dev_free(dev_to_rdev(dev));
+}
+
+#ifdef CONFIG_HOTPLUG
+static int wiphy_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	/* TODO: nothing is added to @env yet; we probably need stuff here */
+	return 0;
+}
+#endif
+
+/* Class suspend hook: record the time, then call ops->suspend under RTNL. */
+static int wiphy_suspend(struct device *dev, pm_message_t state)
+{
+	struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
+	int ret;
+
+	rdev->suspend_at = get_seconds();
+	if (!rdev->ops->suspend)
+		return 0;
+
+	rtnl_lock();
+	ret = rdev->ops->suspend(&rdev->wiphy, rdev->wowlan);
+	rtnl_unlock();
+	return ret;
+}
+
+/* Class resume hook: age scan results by the suspend time, call ops->resume. */
+static int wiphy_resume(struct device *dev)
+{
+	struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
+	int ret;
+
+	spin_lock_bh(&rdev->bss_lock);
+	cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
+	spin_unlock_bh(&rdev->bss_lock);
+
+	if (!rdev->ops->resume)
+		return 0;
+
+	rtnl_lock();
+	ret = rdev->ops->resume(&rdev->wiphy);
+	rtnl_unlock();
+	return ret;
+}
+
+/* Class namespace callback: return wiphy_net() of the wiphy that
+ * embeds device @d. */
+static const void *wiphy_namespace(struct device *d)
+{
+	return wiphy_net(container_of(d, struct wiphy, dev));
+}
+
+struct class ieee80211_class = {
+	.name = "ieee80211",
+	.owner = THIS_MODULE,
+	.dev_release = wiphy_dev_release,
+	.dev_attrs = ieee80211_dev_attrs,
+#ifdef CONFIG_HOTPLUG
+	.dev_uevent = wiphy_uevent,
+#endif
+	.suspend = wiphy_suspend,
+	.resume = wiphy_resume,
+	.ns_type = &net_ns_type_operations,
+	.namespace = wiphy_namespace,
+};
+
+int wiphy_sysfs_init(void)
+{
+	return class_register(&ieee80211_class);
+}
+
+void wiphy_sysfs_exit(void)
+{
+	class_unregister(&ieee80211_class);
+}
diff --git a/net/wireless/sysfs.h b/net/wireless/sysfs.h
new file mode 100644
index 00000000..65acbebd
--- /dev/null
+++ b/net/wireless/sysfs.h
@@ -0,0 +1,9 @@
+#ifndef __WIRELESS_SYSFS_H
+#define __WIRELESS_SYSFS_H
+
+extern int wiphy_sysfs_init(void);
+extern void wiphy_sysfs_exit(void);
+
+extern struct class ieee80211_class;
+
+#endif /* __WIRELESS_SYSFS_H */
diff --git a/net/wireless/util.c b/net/wireless/util.c
new file mode 100644
index 00000000..30f68dc7
--- /dev/null
+++ b/net/wireless/util.c
@@ -0,0 +1,1025 @@
+/*
+ * Wireless utility functions
+ *
+ * Copyright 2007-2009	Johannes Berg <johannes@sipsolutions.net>
+ */
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <net/cfg80211.h>
+#include <net/ip.h>
+#include "core.h"
+
+/*
+ * Pick a response rate from @sband.
+ *
+ * Walks the band's rate table in order and returns the last entry whose
+ * index bit is set in @basic_rates and whose bitrate does not exceed
+ * @bitrate. If no entry qualifies, the first entry of the table is
+ * returned.
+ */
+struct ieee80211_rate *
+ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
+			    u32 basic_rates, int bitrate)
+{
+	struct ieee80211_rate *best = &sband->bitrates[0];
+	int idx;
+
+	for (idx = 0; idx < sband->n_bitrates; idx++) {
+		struct ieee80211_rate *cand = &sband->bitrates[idx];
+
+		if ((basic_rates & BIT(idx)) && !(cand->bitrate > bitrate))
+			best = cand;
+	}
+
+	return best;
+}
+EXPORT_SYMBOL(ieee80211_get_response_rate);
+
+/*
+ * Map channel number @chan in @band to its center frequency in MHz.
+ * Returns 0 for channel numbers above 14 outside the 5 GHz band.
+ */
+int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band)
+{
+	/* see 802.11 17.3.8.3.2 and Annex J
+	 * there are overlapping channel numbers in 5GHz and 2GHz bands */
+	if (band == IEEE80211_BAND_5GHZ) {
+		int base = (chan >= 182 && chan <= 196) ? 4000 : 5000;
+
+		return base + chan * 5;
+	}
+
+	/* IEEE80211_BAND_2GHZ */
+	if (chan == 14)
+		return 2484;
+	if (chan < 14)
+		return 2407 + chan * 5;
+
+	return 0; /* not supported */
+}
+EXPORT_SYMBOL(ieee80211_channel_to_frequency);
+
+/*
+ * Map a center frequency in MHz to its channel number. No range check is
+ * done: the formula for the matching range is applied to any input.
+ */
+int ieee80211_frequency_to_channel(int freq)
+{
+	int base;
+
+	/* see 802.11 17.3.8.3.2 and Annex J */
+	if (freq == 2484)
+		return 14;
+
+	if (freq < 2484)
+		base = 2407;
+	else if (freq >= 4910 && freq <= 4980)
+		base = 4000;
+	else
+		base = 5000;
+
+	return (freq - base) / 5;
+}
+EXPORT_SYMBOL(ieee80211_frequency_to_channel);
+
+/*
+ * Look up the channel with center frequency @freq in any band registered
+ * on @wiphy. Returns the first match, or NULL if no band has it.
+ */
+struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy,
+						  int freq)
+{
+	enum ieee80211_band band;
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		struct ieee80211_supported_band *sband = wiphy->bands[band];
+		int idx;
+
+		if (sband == NULL)
+			continue;
+
+		for (idx = 0; idx < sband->n_channels; idx++) {
+			struct ieee80211_channel *chan = &sband->channels[idx];
+
+			if (chan->center_freq == freq)
+				return chan;
+		}
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(__ieee80211_get_channel);
+
+/*
+ * Flag the mandatory rates of one band by OR-ing IEEE80211_RATE_* bits
+ * into the entries of sband->bitrates (bitrate values are compared
+ * against 10, 20, 55, 110, 60, 120 and 240).
+ *
+ * On 5 GHz three mandatory rates are expected. On 2.4 GHz every rate
+ * other than 10/20/55/110 is additionally flagged ERP_G, and finding
+ * all seven, four or one of the mandatory rates is accepted. Anything
+ * else triggers a WARN_ON.
+ */
+static void set_mandatory_flags_band(struct ieee80211_supported_band *sband,
+				     enum ieee80211_band band)
+{
+	struct ieee80211_rate *rate;
+	int idx, missing;
+
+	switch (band) {
+	case IEEE80211_BAND_5GHZ:
+		missing = 3;
+		for (idx = 0; idx < sband->n_bitrates; idx++) {
+			rate = &sband->bitrates[idx];
+
+			switch (rate->bitrate) {
+			case 60:
+			case 120:
+			case 240:
+				rate->flags |= IEEE80211_RATE_MANDATORY_A;
+				missing--;
+				break;
+			default:
+				break;
+			}
+		}
+		WARN_ON(missing);
+		break;
+	case IEEE80211_BAND_2GHZ:
+		missing = 7;
+		for (idx = 0; idx < sband->n_bitrates; idx++) {
+			rate = &sband->bitrates[idx];
+
+			switch (rate->bitrate) {
+			case 10:
+				rate->flags |= IEEE80211_RATE_MANDATORY_B |
+					       IEEE80211_RATE_MANDATORY_G;
+				missing--;
+				break;
+			case 20:
+			case 55:
+			case 110:
+				rate->flags |= IEEE80211_RATE_MANDATORY_G;
+				missing--;
+				break;
+			case 60:
+			case 120:
+			case 240:
+				rate->flags |= IEEE80211_RATE_MANDATORY_G;
+				rate->flags |= IEEE80211_RATE_ERP_G;
+				missing--;
+				break;
+			default:
+				rate->flags |= IEEE80211_RATE_ERP_G;
+				break;
+			}
+		}
+		WARN_ON(missing != 0 && missing != 3 && missing != 6);
+		break;
+	case IEEE80211_NUM_BANDS:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* Apply set_mandatory_flags_band() to every band present on @wiphy. */
+void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
+{
+	enum ieee80211_band band;
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		if (!wiphy->bands[band])
+			continue;
+		set_mandatory_flags_band(wiphy->bands[band], band);
+	}
+}
+
+/*
+ * Sanity-check key parameters before they are handed to a driver.
+ *
+ * @rdev:     device whose cipher suite list and flags are consulted
+ * @params:   cipher, key length and optional sequence counter to check
+ * @key_idx:  key index, must be within 0..5
+ * @pairwise: whether this is a pairwise key
+ * @mac_addr: peer address; required for pairwise keys, and only allowed
+ *            for group keys if the wiphy sets WIPHY_FLAG_IBSS_RSN
+ *
+ * Returns 0 if the settings are acceptable, -EINVAL otherwise.
+ */
+int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
+				   struct key_params *params, int key_idx,
+				   bool pairwise, const u8 *mac_addr)
+{
+	int i;
+
+	/* key_idx is signed, so the lower bound needs checking as well */
+	if (key_idx < 0 || key_idx > 5)
+		return -EINVAL;
+
+	if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+		return -EINVAL;
+
+	if (pairwise && !mac_addr)
+		return -EINVAL;
+
+	/*
+	 * Disallow pairwise keys with non-zero index unless it's WEP
+	 * or a vendor specific cipher (because current deployments use
+	 * pairwise WEP keys with non-zero indices and for vendor specific
+	 * ciphers this should be validated in the driver or hardware level
+	 * - but 802.11i clearly specifies to use zero)
+	 */
+	if (pairwise && key_idx &&
+	    ((params->cipher == WLAN_CIPHER_SUITE_TKIP) ||
+	     (params->cipher == WLAN_CIPHER_SUITE_CCMP) ||
+	     (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC)))
+		return -EINVAL;
+
+	/* Known ciphers must come with exactly their defined key length */
+	switch (params->cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+		if (params->key_len != WLAN_KEY_LEN_WEP40)
+			return -EINVAL;
+		break;
+	case WLAN_CIPHER_SUITE_TKIP:
+		if (params->key_len != WLAN_KEY_LEN_TKIP)
+			return -EINVAL;
+		break;
+	case WLAN_CIPHER_SUITE_CCMP:
+		if (params->key_len != WLAN_KEY_LEN_CCMP)
+			return -EINVAL;
+		break;
+	case WLAN_CIPHER_SUITE_WEP104:
+		if (params->key_len != WLAN_KEY_LEN_WEP104)
+			return -EINVAL;
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		if (params->key_len != WLAN_KEY_LEN_AES_CMAC)
+			return -EINVAL;
+		break;
+	default:
+		/*
+		 * We don't know anything about this algorithm,
+		 * allow using it -- but the driver must check
+		 * all parameters! We still check below whether
+		 * or not the driver supports this algorithm,
+		 * of course.
+		 */
+		break;
+	}
+
+	if (params->seq) {
+		switch (params->cipher) {
+		case WLAN_CIPHER_SUITE_WEP40:
+		case WLAN_CIPHER_SUITE_WEP104:
+			/* These ciphers do not use key sequence */
+			return -EINVAL;
+		case WLAN_CIPHER_SUITE_TKIP:
+		case WLAN_CIPHER_SUITE_CCMP:
+		case WLAN_CIPHER_SUITE_AES_CMAC:
+			if (params->seq_len != 6)
+				return -EINVAL;
+			break;
+		}
+	}
+
+	/* The cipher must be one the driver advertised */
+	for (i = 0; i < rdev->wiphy.n_cipher_suites; i++)
+		if (params->cipher == rdev->wiphy.cipher_suites[i])
+			break;
+	if (i == rdev->wiphy.n_cipher_suites)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
+/*
+ * Ethernet-II snap header (RFC1042 for most EtherTypes).
+ * Six bytes, 2-byte aligned; the conversion helpers in this file match it
+ * against frame payloads with compare_ether_addr().
+ */
+const unsigned char rfc1042_header[] __aligned(2) =
+	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
+EXPORT_SYMBOL(rfc1042_header);
+
+/*
+ * Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX).
+ * Same layout as rfc1042_header except for the last byte.
+ */
+const unsigned char bridge_tunnel_header[] __aligned(2) =
+	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
+EXPORT_SYMBOL(bridge_tunnel_header);
+
+/*
+ * Compute the 802.11 header length in bytes from the frame control
+ * field @fc.
+ *
+ * Data frames are 24 bytes (30 with a fourth address), plus the QoS
+ * control field for QoS data and the HT control field when the order
+ * bit is also set. Control frames are 10 bytes for ACK/CTS and 16
+ * otherwise. Everything else is 24 bytes.
+ */
+unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
+{
+	unsigned int len;
+
+	if (ieee80211_is_data(fc)) {
+		len = ieee80211_has_a4(fc) ? 30 : 24;
+		if (ieee80211_is_data_qos(fc)) {
+			len += IEEE80211_QOS_CTL_LEN;
+			if (ieee80211_has_order(fc))
+				len += IEEE80211_HT_CTL_LEN;
+		}
+		return len;
+	}
+
+	if (!ieee80211_is_ctl(fc))
+		return 24;
+
+	/*
+	 * ACK and CTS are 10 bytes, all others 16. To see how
+	 * to get this condition consider
+	 *   subtype mask:   0b0000000011110000 (0x00F0)
+	 *   ACK subtype:    0b0000000011010000 (0x00D0)
+	 *   CTS subtype:    0b0000000011000000 (0x00C0)
+	 *   bits that matter:         ^^^      (0x00E0)
+	 *   value of those: 0b0000000011000000 (0x00C0)
+	 */
+	if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0))
+		return 10;
+
+	return 16;
+}
+EXPORT_SYMBOL(ieee80211_hdrlen);
+
+/*
+ * Return the 802.11 header length of the frame in @skb, or 0 if the skb
+ * is shorter than 10 bytes or shorter than the header its frame control
+ * field calls for.
+ */
+unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb)
+{
+	const struct ieee80211_hdr *hdr;
+	unsigned int len;
+
+	if (unlikely(skb->len < 10))
+		return 0;
+
+	hdr = (const struct ieee80211_hdr *)skb->data;
+	len = ieee80211_hdrlen(hdr->frame_control);
+
+	return unlikely(len > skb->len) ? 0 : len;
+}
+EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb);
+
+/*
+ * Length in bytes of a mesh header, derived from the address-extension
+ * bits of its flags field: 6 with no extension, 12 with A4, 18 with
+ * A5/A6, 24 with both.
+ */
+static int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
+{
+	/* 7.1.3.5a.2 */
+	switch (meshhdr->flags & MESH_FLAGS_AE) {
+	case MESH_FLAGS_AE_A4:
+		return 12;
+	case MESH_FLAGS_AE_A5_A6:
+		return 18;
+	case (MESH_FLAGS_AE_A4 | MESH_FLAGS_AE_A5_A6):
+		return 24;
+	default:
+		/* covers the "no extension" value 0 as well */
+		return 6;
+	}
+}
+
+/*
+ * Convert the 802.11 data frame in @skb to an 802.3 frame, in place.
+ *
+ * @skb:    frame starting with the 802.11 header
+ * @addr:   address of the receiving interface; used to drop FromDS
+ *          multicast frames whose source address equals it
+ * @iftype: interface type, used to reject ToDS/FromDS combinations that
+ *          are not valid for it
+ *
+ * The 802.11 header (plus mesh header for mesh interfaces) is removed.
+ * If the payload starts with an RFC1042 or Bridge-Tunnel SNAP header,
+ * that is removed too and the encapsulated EtherType is kept; otherwise
+ * an Ethernet header carrying the payload length is prepended.
+ *
+ * Returns 0 on success, -1 if the frame is rejected or too short.
+ */
+int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
+			   enum nl80211_iftype iftype)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211s_hdr *meshdr;
+	u16 hdrlen, ethertype;
+	u8 *payload;
+	u8 dst[ETH_ALEN];
+	u8 src[ETH_ALEN] __aligned(2);
+
+	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
+		return -1;
+
+	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+
+	/* convert IEEE 802.11 header + possible LLC headers into Ethernet
+	 * header
+	 * IEEE 802.11 address fields:
+	 * ToDS FromDS Addr1 Addr2 Addr3 Addr4
+	 *   0     0   DA    SA    BSSID n/a
+	 *   0     1   DA    BSSID SA    n/a
+	 *   1     0   BSSID SA    DA    n/a
+	 *   1     1   RA    TA    DA    SA
+	 */
+	memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN);
+	memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN);
+
+	/*
+	 * Note: hdr must not be dereferenced after any pskb_may_pull()
+	 * below, since that call may reallocate the skb head.
+	 */
+	switch (hdr->frame_control &
+		cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
+	case cpu_to_le16(IEEE80211_FCTL_TODS):
+		if (unlikely(iftype != NL80211_IFTYPE_AP &&
+			     iftype != NL80211_IFTYPE_AP_VLAN &&
+			     iftype != NL80211_IFTYPE_P2P_GO))
+			return -1;
+		break;
+	case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
+		if (unlikely(iftype != NL80211_IFTYPE_WDS &&
+			     iftype != NL80211_IFTYPE_MESH_POINT &&
+			     iftype != NL80211_IFTYPE_AP_VLAN &&
+			     iftype != NL80211_IFTYPE_STATION))
+			return -1;
+		if (iftype == NL80211_IFTYPE_MESH_POINT) {
+			/* make sure meshdr->flags is on the linear part */
+			if (!pskb_may_pull(skb, hdrlen + 1))
+				return -1;
+			/*
+			 * Take the pointer only after the pull: it may have
+			 * moved skb->data.
+			 */
+			meshdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+			if (meshdr->flags & MESH_FLAGS_AE_A5_A6) {
+				skb_copy_bits(skb, hdrlen +
+					offsetof(struct ieee80211s_hdr, eaddr1),
+					dst, ETH_ALEN);
+				skb_copy_bits(skb, hdrlen +
+					offsetof(struct ieee80211s_hdr, eaddr2),
+					src, ETH_ALEN);
+			}
+			hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+		}
+		break;
+	case cpu_to_le16(IEEE80211_FCTL_FROMDS):
+		if ((iftype != NL80211_IFTYPE_STATION &&
+		     iftype != NL80211_IFTYPE_P2P_CLIENT &&
+		     iftype != NL80211_IFTYPE_MESH_POINT) ||
+		    (is_multicast_ether_addr(dst) &&
+		     !compare_ether_addr(src, addr)))
+			return -1;
+		if (iftype == NL80211_IFTYPE_MESH_POINT) {
+			/* make sure meshdr->flags is on the linear part */
+			if (!pskb_may_pull(skb, hdrlen + 1))
+				return -1;
+			/*
+			 * Take the pointer only after the pull: it may have
+			 * moved skb->data.
+			 */
+			meshdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
+			if (meshdr->flags & MESH_FLAGS_AE_A4)
+				skb_copy_bits(skb, hdrlen +
+					offsetof(struct ieee80211s_hdr, eaddr1),
+					src, ETH_ALEN);
+			hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+		}
+		break;
+	case cpu_to_le16(0):
+		if (iftype != NL80211_IFTYPE_ADHOC)
+			return -1;
+		break;
+	}
+
+	/* Need the 6-byte SNAP header and 2-byte EtherType in linear data */
+	if (!pskb_may_pull(skb, hdrlen + 8))
+		return -1;
+
+	payload = skb->data + hdrlen;
+	ethertype = (payload[6] << 8) | payload[7];
+
+	if (likely((compare_ether_addr(payload, rfc1042_header) == 0 &&
+		    ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
+		   compare_ether_addr(payload, bridge_tunnel_header) == 0)) {
+		/* remove RFC1042 or Bridge-Tunnel encapsulation and
+		 * replace EtherType */
+		skb_pull(skb, hdrlen + 6);
+		memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
+		memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
+	} else {
+		struct ethhdr *ehdr;
+		__be16 len;
+
+		/* No SNAP header: emit an 802.3 header with a length field */
+		skb_pull(skb, hdrlen);
+		len = htons(skb->len);
+		ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr));
+		memcpy(ehdr->h_dest, dst, ETH_ALEN);
+		memcpy(ehdr->h_source, src, ETH_ALEN);
+		ehdr->h_proto = len;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ieee80211_data_to_8023);
+
+/*
+ * Convert the 802.3 frame in @skb to an 802.11 data frame, in place.
+ *
+ * @skb:    frame starting with an Ethernet header
+ * @addr:   address placed in addr2 for the AP-type interfaces
+ * @iftype: interface type; selects the ToDS/FromDS bits and address order
+ * @bssid:  BSSID used for station, P2P client and ad-hoc interfaces
+ * @qos:    build a QoS data header (two extra header bytes)
+ *
+ * Returns 0 on success, -EINVAL if the skb is shorter than an Ethernet
+ * header, -EOPNOTSUPP for unhandled interface types and -ENOMEM if the
+ * headroom could not be expanded.
+ */
+int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
+			     enum nl80211_iftype iftype, u8 *bssid, bool qos)
+{
+	struct ieee80211_hdr hdr;
+	u16 hdrlen, ethertype;
+	__le16 fc;
+	const u8 *encaps_data;
+	int encaps_len, skip_header_bytes;
+	int nh_pos, h_pos;
+	int head_need;
+
+	if (unlikely(skb->len < ETH_HLEN))
+		return -EINVAL;
+
+	/*
+	 * With @qos set, hdrlen bytes copied out of hdr below reach past
+	 * the fields assigned in this function; zero the whole struct so
+	 * no uninitialised stack bytes end up in the frame.
+	 */
+	memset(&hdr, 0, sizeof(hdr));
+
+	nh_pos = skb_network_header(skb) - skb->data;
+	h_pos = skb_transport_header(skb) - skb->data;
+
+	/* convert Ethernet header to proper 802.11 header (based on
+	 * operation mode) */
+	ethertype = (skb->data[12] << 8) | skb->data[13];
+	fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA);
+
+	switch (iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
+		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
+		/* DA BSSID SA */
+		memcpy(hdr.addr1, skb->data, ETH_ALEN);
+		memcpy(hdr.addr2, addr, ETH_ALEN);
+		memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN);
+		hdrlen = 24;
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
+		/* BSSID SA DA */
+		memcpy(hdr.addr1, bssid, ETH_ALEN);
+		memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+		memcpy(hdr.addr3, skb->data, ETH_ALEN);
+		hdrlen = 24;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		/* DA SA BSSID */
+		memcpy(hdr.addr1, skb->data, ETH_ALEN);
+		memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
+		memcpy(hdr.addr3, bssid, ETH_ALEN);
+		hdrlen = 24;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (qos) {
+		fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA);
+		hdrlen += 2;
+	}
+
+	hdr.frame_control = fc;
+	hdr.duration_id = 0;
+	hdr.seq_ctrl = 0;
+
+	/*
+	 * Pick the SNAP encapsulation. When one is used, the last two
+	 * bytes of the Ethernet header (the EtherType) are kept in place
+	 * behind it. Values of 0x600 and above are EtherTypes; anything
+	 * smaller is an 802.3 length and gets no SNAP header.
+	 */
+	skip_header_bytes = ETH_HLEN;
+	if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) {
+		encaps_data = bridge_tunnel_header;
+		encaps_len = sizeof(bridge_tunnel_header);
+		skip_header_bytes -= 2;
+	} else if (ethertype >= 0x600) {
+		encaps_data = rfc1042_header;
+		encaps_len = sizeof(rfc1042_header);
+		skip_header_bytes -= 2;
+	} else {
+		encaps_data = NULL;
+		encaps_len = 0;
+	}
+
+	skb_pull(skb, skip_header_bytes);
+	nh_pos -= skip_header_bytes;
+	h_pos -= skip_header_bytes;
+
+	head_need = hdrlen + encaps_len - skb_headroom(skb);
+
+	/* Reallocate if headroom is short or the data is shared */
+	if (head_need > 0 || skb_cloned(skb)) {
+		head_need = max(head_need, 0);
+		if (head_need)
+			skb_orphan(skb);
+
+		if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) {
+			pr_err("failed to reallocate Tx buffer\n");
+			return -ENOMEM;
+		}
+		skb->truesize += head_need;
+	}
+
+	if (encaps_data) {
+		memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len);
+		nh_pos += encaps_len;
+		h_pos += encaps_len;
+	}
+
+	memcpy(skb_push(skb, hdrlen), &hdr, hdrlen);
+
+	nh_pos += hdrlen;
+	h_pos += hdrlen;
+
+	/* Update skb pointers to various headers since this modified frame
+	 * is going to go through Linux networking code that may potentially
+	 * need things like pointer to IP header. */
+	skb_set_mac_header(skb, 0);
+	skb_set_network_header(skb, nh_pos);
+	skb_set_transport_header(skb, h_pos);
+
+	return 0;
+}
+EXPORT_SYMBOL(ieee80211_data_from_8023);
+
+
+/*
+ * Split the A-MSDU in @skb into individual 802.3 frames appended to @list.
+ *
+ * @skb:              aggregate frame; consumed by this function
+ * @list:             queue receiving the resulting frames
+ * @addr, @iftype:    passed to ieee80211_data_to_8023() when
+ *                    @has_80211_header is set
+ * @extra_headroom:   headroom (rounded up to a multiple of 4) reserved in
+ *                    each newly allocated subframe
+ * @has_80211_header: @skb still starts with an 802.11 header that has to
+ *                    be converted and stripped first
+ *
+ * Every subframe but the last is copied into a freshly allocated skb; the
+ * last one reuses @skb itself. On a malformed subframe or allocation
+ * failure, everything already queued on @list is purged and @skb is freed.
+ */
+void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
+			      const u8 *addr, enum nl80211_iftype iftype,
+			      const unsigned int extra_headroom,
+			      bool has_80211_header)
+{
+	struct sk_buff *frame = NULL;
+	u16 ethertype;
+	u8 *payload;
+	const struct ethhdr *eth;
+	int remaining, err;
+	u8 dst[ETH_ALEN], src[ETH_ALEN];
+
+	if (has_80211_header) {
+		err = ieee80211_data_to_8023(skb, addr, iftype);
+		if (err)
+			goto out;
+
+		/* skip the wrapping header */
+		eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr));
+		if (!eth)
+			goto out;
+	} else {
+		eth = (struct ethhdr *) skb->data;
+	}
+
+	/* Loop ends once the original skb was reused for the last subframe */
+	while (skb != frame) {
+		u8 padding;
+		__be16 len = eth->h_proto;
+		unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len);
+
+		remaining = skb->len;
+		memcpy(dst, eth->h_dest, ETH_ALEN);
+		memcpy(src, eth->h_source, ETH_ALEN);
+
+		/* subframes are padded to a multiple of four bytes */
+		padding = (4 - subframe_len) & 0x3;
+		/* the last MSDU has no padding */
+		if (subframe_len > remaining)
+			goto purge;
+
+		skb_pull(skb, sizeof(struct ethhdr));
+		/* reuse skb for the last subframe */
+		if (remaining <= subframe_len + padding)
+			frame = skb;
+		else {
+			unsigned int hlen = ALIGN(extra_headroom, 4);
+			/*
+			 * Allocate and reserve two bytes more for payload
+			 * alignment since sizeof(struct ethhdr) is 14.
+			 */
+			frame = dev_alloc_skb(hlen + subframe_len + 2);
+			if (!frame)
+				goto purge;
+
+			skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
+			memcpy(skb_put(frame, ntohs(len)), skb->data,
+				ntohs(len));
+
+			/* advance to the next subframe header */
+			eth = (struct ethhdr *)skb_pull(skb, ntohs(len) +
+							padding);
+			if (!eth) {
+				dev_kfree_skb(frame);
+				goto purge;
+			}
+		}
+
+		skb_reset_network_header(frame);
+		frame->dev = skb->dev;
+		frame->priority = skb->priority;
+
+		payload = frame->data;
+		ethertype = (payload[6] << 8) | payload[7];
+
+		if (likely((compare_ether_addr(payload, rfc1042_header) == 0 &&
+			    ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
+			   compare_ether_addr(payload,
+					      bridge_tunnel_header) == 0)) {
+			/* remove RFC1042 or Bridge-Tunnel
+			 * encapsulation and replace EtherType */
+			skb_pull(frame, 6);
+			memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
+			memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
+		} else {
+			/* no SNAP header: keep the subframe length field */
+			memcpy(skb_push(frame, sizeof(__be16)), &len,
+				sizeof(__be16));
+			memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
+			memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
+		}
+		__skb_queue_tail(list, frame);
+	}
+
+	return;
+
+ purge:
+	__skb_queue_purge(list);
+ out:
+	dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_amsdu_to_8023s);
+
+/* Given a data frame determine the 802.1p/1d tag to use. */
+/*
+ * Determine the 802.1p/1d tag for a data frame.
+ *
+ * Returns skb->priority - 256 for the magic priority range 256..263,
+ * the top three bits of the IP TOS byte for IPv4 frames, and 0 for
+ * every other protocol.
+ */
+unsigned int cfg80211_classify8021d(struct sk_buff *skb)
+{
+	unsigned int dscp;
+
+	/* skb->priority values from 256->263 are magic values to
+	 * directly indicate a specific 802.1d priority.  This is used
+	 * to allow 802.1d priority to be passed directly in from VLAN
+	 * tags, etc.
+	 */
+	if (skb->priority >= 256 && skb->priority <= 263)
+		return skb->priority - 256;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return 0;
+
+	dscp = ip_hdr(skb)->tos & 0xfc;
+
+	return dscp >> 5;
+}
+EXPORT_SYMBOL(cfg80211_classify8021d);
+
+/*
+ * Find the first information element with ID @ie in the IE buffer of
+ * @bss. Elements are walked as (id, length, data) triples; the walk
+ * stops at the first element that would run past the buffer.
+ *
+ * Returns a pointer to the element's ID byte, or NULL if the BSS has no
+ * IE buffer or the element is not found.
+ */
+const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie)
+{
+	u8 *cur = bss->information_elements;
+	u8 *end;
+
+	if (!cur)
+		return NULL;
+
+	end = cur + bss->len_information_elements;
+
+	for (; cur + 1 < end; cur += 2 + cur[1]) {
+		if (cur + 2 + cur[1] > end)
+			break;
+		if (cur[0] == ie)
+			return cur;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(ieee80211_bss_get_ie);
+
+/*
+ * Hand the keys stored in wdev->connect_keys to the driver.
+ *
+ * Each of the six slots with a cipher set is added with add_key(); the
+ * slots named by ->def and ->defmgmt are then made the default key and
+ * default management key. Failures are logged and the remaining slots
+ * are still processed. The key store is freed and cleared afterwards.
+ */
+void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
+{
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct net_device *dev = wdev->netdev;
+	int idx;
+
+	if (!wdev->connect_keys)
+		return;
+
+	for (idx = 0; idx < 6; idx++) {
+		if (!wdev->connect_keys->params[idx].cipher)
+			continue;
+
+		if (rdev->ops->add_key(wdev->wiphy, dev, idx, false, NULL,
+				       &wdev->connect_keys->params[idx])) {
+			netdev_err(dev, "failed to set key %d\n", idx);
+			continue;
+		}
+
+		if (wdev->connect_keys->def == idx &&
+		    rdev->ops->set_default_key(wdev->wiphy, dev,
+					       idx, true, true)) {
+			netdev_err(dev, "failed to set defkey %d\n", idx);
+			continue;
+		}
+
+		if (wdev->connect_keys->defmgmt == idx &&
+		    rdev->ops->set_default_mgmt_key(wdev->wiphy, dev, idx))
+			netdev_err(dev, "failed to set mgtdef %d\n", idx);
+	}
+
+	kfree(wdev->connect_keys);
+	wdev->connect_keys = NULL;
+}
+
+/*
+ * Drain the pending event list of @wdev.
+ *
+ * Events are unlinked one at a time under event_lock; the spinlock is
+ * dropped while each event is dispatched under the wdev lock, and the
+ * event is freed afterwards.
+ */
+static void cfg80211_process_wdev_events(struct wireless_dev *wdev)
+{
+	struct cfg80211_event *ev;
+	unsigned long flags;
+	const u8 *bssid = NULL;
+
+	spin_lock_irqsave(&wdev->event_lock, flags);
+	while (!list_empty(&wdev->event_list)) {
+		ev = list_first_entry(&wdev->event_list,
+				      struct cfg80211_event, list);
+		list_del(&ev->list);
+		spin_unlock_irqrestore(&wdev->event_lock, flags);
+
+		wdev_lock(wdev);
+		switch (ev->type) {
+		case EVENT_CONNECT_RESULT:
+			/*
+			 * Decide per event: a zeroed BSSID is reported as
+			 * NULL, and nothing is carried over from an earlier
+			 * event whose storage has already been freed.
+			 */
+			bssid = is_zero_ether_addr(ev->cr.bssid) ?
+				NULL : ev->cr.bssid;
+			__cfg80211_connect_result(
+				wdev->netdev, bssid,
+				ev->cr.req_ie, ev->cr.req_ie_len,
+				ev->cr.resp_ie, ev->cr.resp_ie_len,
+				ev->cr.status,
+				ev->cr.status == WLAN_STATUS_SUCCESS,
+				NULL);
+			break;
+		case EVENT_ROAMED:
+			__cfg80211_roamed(wdev, ev->rm.channel, ev->rm.bssid,
+					  ev->rm.req_ie, ev->rm.req_ie_len,
+					  ev->rm.resp_ie, ev->rm.resp_ie_len);
+			break;
+		case EVENT_DISCONNECTED:
+			__cfg80211_disconnected(wdev->netdev,
+						ev->dc.ie, ev->dc.ie_len,
+						ev->dc.reason, true);
+			break;
+		case EVENT_IBSS_JOINED:
+			__cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid);
+			break;
+		}
+		wdev_unlock(wdev);
+
+		kfree(ev);
+
+		spin_lock_irqsave(&wdev->event_lock, flags);
+	}
+	spin_unlock_irqrestore(&wdev->event_lock, flags);
+}
+
+/*
+ * Process the pending events of every wireless device on @rdev.
+ * Asserts that the caller holds the RTNL and the rdev lock; the device
+ * list is walked under devlist_mtx.
+ */
+void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev)
+{
+	struct wireless_dev *wdev;
+
+	ASSERT_RTNL();
+	ASSERT_RDEV_LOCK(rdev);
+
+	mutex_lock(&rdev->devlist_mtx);
+
+	list_for_each_entry(wdev, &rdev->netdev_list, list)
+		cfg80211_process_wdev_events(wdev);
+
+	mutex_unlock(&rdev->devlist_mtx);
+}
+
+/*
+ * Change the interface type of @dev to @ntype.
+ *
+ * @rdev:   registered device owning @dev; the rdev lock must be held
+ * @dev:    network device whose type is changed
+ * @ntype:  requested new interface type
+ * @flags:  passed through to the driver's change_virtual_intf op
+ * @params: passed through to the driver; its use_4addr value is stored
+ *          on success unless it is -1
+ *
+ * Returns -EOPNOTSUPP if the old type is AP_VLAN, the driver has no
+ * change_virtual_intf op or does not list @ntype in interface_modes;
+ * -EBUSY if @dev is a bridge port and @ntype is ad-hoc, station or P2P
+ * client; an error from cfg80211_can_change_interface(); otherwise the
+ * driver's result.
+ */
+int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev, enum nl80211_iftype ntype,
+			  u32 *flags, struct vif_params *params)
+{
+	int err;
+	enum nl80211_iftype otype = dev->ieee80211_ptr->iftype;
+
+	ASSERT_RDEV_LOCK(rdev);
+
+	/* don't support changing VLANs, you just re-create them */
+	if (otype == NL80211_IFTYPE_AP_VLAN)
+		return -EOPNOTSUPP;
+
+	if (!rdev->ops->change_virtual_intf ||
+	    !(rdev->wiphy.interface_modes & (1 << ntype)))
+		return -EOPNOTSUPP;
+
+	/* if it's part of a bridge, reject changing type to station/ibss */
+	if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
+	    (ntype == NL80211_IFTYPE_ADHOC ||
+	     ntype == NL80211_IFTYPE_STATION ||
+	     ntype == NL80211_IFTYPE_P2P_CLIENT))
+		return -EBUSY;
+
+	if (ntype != otype) {
+		err = cfg80211_can_change_interface(rdev, dev->ieee80211_ptr,
+						    ntype);
+		if (err)
+			return err;
+
+		dev->ieee80211_ptr->use_4addr = false;
+		dev->ieee80211_ptr->mesh_id_up_len = 0;
+
+		/* tear down whatever the old type had going */
+		switch (otype) {
+		case NL80211_IFTYPE_ADHOC:
+			cfg80211_leave_ibss(rdev, dev, false);
+			break;
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
+			cfg80211_disconnect(rdev, dev,
+					    WLAN_REASON_DEAUTH_LEAVING, true);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			/* mesh should be handled? */
+			break;
+		default:
+			break;
+		}
+
+		cfg80211_process_rdev_events(rdev);
+	}
+
+	err = rdev->ops->change_virtual_intf(&rdev->wiphy, dev,
+					     ntype, flags, params);
+
+	/* on success the driver is expected to have updated the iftype */
+	WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype);
+
+	if (!err && params && params->use_4addr != -1)
+		dev->ieee80211_ptr->use_4addr = params->use_4addr;
+
+	if (!err) {
+		/* recompute IFF_DONT_BRIDGE for the new type */
+		dev->priv_flags &= ~IFF_DONT_BRIDGE;
+		switch (ntype) {
+		case NL80211_IFTYPE_STATION:
+			if (dev->ieee80211_ptr->use_4addr)
+				break;
+			/* fall through */
+		case NL80211_IFTYPE_P2P_CLIENT:
+		case NL80211_IFTYPE_ADHOC:
+			dev->priv_flags |= IFF_DONT_BRIDGE;
+			break;
+		case NL80211_IFTYPE_P2P_GO:
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_AP_VLAN:
+		case NL80211_IFTYPE_WDS:
+		case NL80211_IFTYPE_MESH_POINT:
+			/* bridging OK */
+			break;
+		case NL80211_IFTYPE_MONITOR:
+			/* monitor can't bridge anyway */
+			break;
+		case NL80211_IFTYPE_UNSPECIFIED:
+		case NUM_NL80211_IFTYPES:
+			/* not happening */
+			break;
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Compute the bitrate described by @rate.
+ *
+ * Without RATE_INFO_FLAGS_MCS the stored legacy rate is returned as is.
+ * For MCS rates the value is derived from the MCS index, the 40 MHz
+ * flag and the short-GI flag, and returned in units of 100 kbit/s,
+ * rounded to nearest. MCS indices of 32 and above yield 0.
+ */
+u16 cfg80211_calculate_bitrate(struct rate_info *rate)
+{
+	int modulation, streams, factor, bitrate;
+
+	if (!(rate->flags & RATE_INFO_FLAGS_MCS))
+		return rate->legacy;
+
+	/* the formula below only works for MCS values smaller than 32 */
+	if (rate->mcs >= 32)
+		return 0;
+
+	modulation = rate->mcs & 7;
+	streams = (rate->mcs >> 3) + 1;
+
+	if (rate->flags & RATE_INFO_FLAGS_40_MHZ_WIDTH)
+		bitrate = 13500000;
+	else
+		bitrate = 6500000;
+
+	/* multiplier per modulation index 0..7: 1, 2, 3, 4, 6, 8, 9, 10 */
+	factor = modulation + 1;
+	if (modulation >= 4)
+		factor++;
+	if (modulation > 4)
+		factor++;
+
+	bitrate *= factor;
+	bitrate *= streams;
+
+	if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
+		bitrate = (bitrate / 9) * 10;
+
+	/* do NOT round down here */
+	return (bitrate + 50000) / 100000;
+}
+
+/*
+ * Check that @beacon_int is non-zero and equal to the beacon interval
+ * of every interface on @rdev that already has one set.
+ *
+ * Returns 0 if acceptable, -EINVAL otherwise.
+ */
+int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
+				 u32 beacon_int)
+{
+	struct wireless_dev *wdev;
+	int ret = 0;
+
+	if (beacon_int == 0)
+		return -EINVAL;
+
+	mutex_lock(&rdev->devlist_mtx);
+
+	list_for_each_entry(wdev, &rdev->netdev_list, list) {
+		if (wdev->beacon_interval &&
+		    wdev->beacon_interval != beacon_int) {
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return ret;
+}
+
+/*
+ * Check whether @wdev may take interface type @iftype alongside the
+ * other running interfaces of @rdev.
+ *
+ * The check passes immediately for software iftypes and for drivers
+ * that do not set WIPHY_FLAG_ENFORCE_COMBINATIONS. Otherwise the running
+ * non-software interfaces (with @wdev counted as @iftype) are tallied
+ * per type and matched against each advertised interface combination.
+ *
+ * Returns 0 if some combination fits, -ENOMEM if a scratch copy of the
+ * limits could not be allocated, -EBUSY if no combination fits.
+ * The RTNL must be held.
+ */
+int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev,
+				  struct wireless_dev *wdev,
+				  enum nl80211_iftype iftype)
+{
+	struct wireless_dev *wdev_iter;
+	u32 used_iftypes = BIT(iftype);
+	int num[NUM_NL80211_IFTYPES];
+	int total = 1;
+	int i, j;
+
+	ASSERT_RTNL();
+
+	/* Always allow software iftypes */
+	if (rdev->wiphy.software_iftypes & BIT(iftype))
+		return 0;
+
+	/*
+	 * Drivers will gradually all set this flag, until all
+	 * have it we only enforce for those that set it.
+	 */
+	if (!(rdev->wiphy.flags & WIPHY_FLAG_ENFORCE_COMBINATIONS))
+		return 0;
+
+	memset(num, 0, sizeof(num));
+
+	/* count the interface being changed as its new type */
+	num[iftype] = 1;
+
+	mutex_lock(&rdev->devlist_mtx);
+	list_for_each_entry(wdev_iter, &rdev->netdev_list, list) {
+		if (wdev_iter == wdev)
+			continue;
+		if (!netif_running(wdev_iter->netdev))
+			continue;
+
+		if (rdev->wiphy.software_iftypes & BIT(wdev_iter->iftype))
+			continue;
+
+		num[wdev_iter->iftype]++;
+		total++;
+		used_iftypes |= BIT(wdev_iter->iftype);
+	}
+	mutex_unlock(&rdev->devlist_mtx);
+
+	for (i = 0; i < rdev->wiphy.n_iface_combinations; i++) {
+		const struct ieee80211_iface_combination *c;
+		struct ieee80211_iface_limit *limits;
+		u32 all_iftypes = 0;
+
+		c = &rdev->wiphy.iface_combinations[i];
+
+		/* private copy: the max counters are decremented below */
+		limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
+				 GFP_KERNEL);
+		if (!limits)
+			return -ENOMEM;
+		if (total > c->max_interfaces)
+			goto cont;
+
+		/* from here on iftype is reused as a loop variable */
+		for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
+			if (rdev->wiphy.software_iftypes & BIT(iftype))
+				continue;
+			for (j = 0; j < c->n_limits; j++) {
+				all_iftypes |= limits[j].types;
+				if (!(limits[j].types & BIT(iftype)))
+					continue;
+				if (limits[j].max < num[iftype])
+					goto cont;
+				limits[j].max -= num[iftype];
+			}
+		}
+
+		/*
+		 * Finally check that all iftypes that we're currently
+		 * using are actually part of this combination. If they
+		 * aren't then we can't use this combination and have
+		 * to continue to the next.
+		 */
+		if ((all_iftypes & used_iftypes) != used_iftypes)
+			goto cont;
+
+		/*
+		 * This combination covered all interface types and
+		 * supported the requested numbers, so we're good.
+		 */
+		kfree(limits);
+		return 0;
+ cont:
+		kfree(limits);
+	}
+
+	return -EBUSY;
+}
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
new file mode 100644
index 00000000..0bf169bb
--- /dev/null
+++ b/net/wireless/wext-compat.c
@@ -0,0 +1,1537 @@
+/*
+ * cfg80211 - wext compat code
+ *
+ * This is temporary code until all wireless functionality is migrated
+ * into cfg80211; when that happens, all the exports here go away and
+ * we directly assign the wireless handlers of wireless interfaces.
+ *
+ * Copyright 2008-2009	Johannes Berg <johannes@sipsolutions.net>
+ */
+
+#include <linux/wireless.h>
+#include <linux/nl80211.h>
+#include <linux/if_arp.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <net/iw_handler.h>
+#include <net/cfg80211.h>
+#include "wext-compat.h"
+#include "core.h"
+
+int cfg80211_wext_giwname(struct net_device *dev,
+			  struct iw_request_info *info,
+			  char *name, char *extra)
+{	/* Write "IEEE 802.11" plus a/b/g/n letters, per wiphy bands, to name */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct ieee80211_supported_band *sband;
+	bool is_ht = false, is_a = false, is_b = false, is_g = false;
+
+	if (!wdev)
+		return -EOPNOTSUPP;	/* netdev has no wireless_dev */
+
+	sband = wdev->wiphy->bands[IEEE80211_BAND_5GHZ];
+	if (sband) {
+		is_a = true;	/* a registered 5 GHz band is reported as "a" */
+		is_ht |= sband->ht_cap.ht_supported;
+	}
+
+	sband = wdev->wiphy->bands[IEEE80211_BAND_2GHZ];
+	if (sband) {
+		int i;
+		/* Check for mandatory rates: 10 marks "b", 60 marks "g" */
+		for (i = 0; i < sband->n_bitrates; i++) {
+			if (sband->bitrates[i].bitrate == 10)
+				is_b = true;
+			if (sband->bitrates[i].bitrate == 60)
+				is_g = true;
+		}
+		is_ht |= sband->ht_cap.ht_supported;
+	}
+
+	strcpy(name, "IEEE 802.11");	/* up to four letters are appended */
+	if (is_a)
+		strcat(name, "a");
+	if (is_b)
+		strcat(name, "b");
+	if (is_g)
+		strcat(name, "g");
+	if (is_ht)
+		strcat(name, "n");	/* HT on either band is reported as "n" */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwname);
+
+int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
+			  u32 *mode, char *extra)
+{	/* Switch the interface to the nl80211 type matching wext *mode */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev;
+	struct vif_params vifparams;
+	enum nl80211_iftype type;
+	int ret;
+
+	rdev = wiphy_to_dev(wdev->wiphy);	/* NOTE(review): no !wdev check */
+
+	switch (*mode) {
+	case IW_MODE_INFRA:
+		type = NL80211_IFTYPE_STATION;
+		break;
+	case IW_MODE_ADHOC:
+		type = NL80211_IFTYPE_ADHOC;
+		break;
+	case IW_MODE_REPEAT:
+		type = NL80211_IFTYPE_WDS;
+		break;
+	case IW_MODE_MONITOR:
+		type = NL80211_IFTYPE_MONITOR;
+		break;
+	default:
+		return -EINVAL;	/* other IW_MODE_* values cannot be set here */
+	}
+
+	if (type == wdev->iftype)
+		return 0;	/* already in the requested mode */
+
+	memset(&vifparams, 0, sizeof(vifparams));
+
+	cfg80211_lock_rdev(rdev);	/* the type change runs with rdev locked */
+	ret = cfg80211_change_iface(rdev, dev, type, NULL, &vifparams);
+	cfg80211_unlock_rdev(rdev);
+
+	return ret;	/* result of cfg80211_change_iface() */
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwmode);
+
+int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
+			  u32 *mode, char *extra)
+{	/* Report the current nl80211 interface type as a wext IW_MODE_* */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (!wdev)
+		return -EOPNOTSUPP;	/* netdev has no wireless_dev */
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+		*mode = IW_MODE_MASTER;
+		break;
+	case NL80211_IFTYPE_STATION:
+		*mode = IW_MODE_INFRA;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		*mode = IW_MODE_ADHOC;
+		break;
+	case NL80211_IFTYPE_MONITOR:
+		*mode = IW_MODE_MONITOR;
+		break;
+	case NL80211_IFTYPE_WDS:
+		*mode = IW_MODE_REPEAT;
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+		*mode = IW_MODE_SECOND;		/* FIXME */
+		break;
+	default:
+		*mode = IW_MODE_AUTO;	/* every other iftype reads as auto */
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwmode);
+
+
+int cfg80211_wext_giwrange(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_point *data, char *extra)
+{	/* Fill the struct iw_range at extra from fixed limits and the wiphy */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct iw_range *range = (struct iw_range *) extra;
+	enum ieee80211_band band;
+	int i, c = 0;	/* c counts the freq[] entries filled so far */
+
+	if (!wdev)
+		return -EOPNOTSUPP;	/* netdev has no wireless_dev */
+
+	data->length = sizeof(struct iw_range);
+	memset(range, 0, sizeof(struct iw_range));	/* unset fields stay 0 */
+
+	range->we_version_compiled = WIRELESS_EXT;
+	range->we_version_source = 21;
+	range->retry_capa = IW_RETRY_LIMIT;
+	range->retry_flags = IW_RETRY_LIMIT;
+	range->min_retry = 0;
+	range->max_retry = 255;
+	range->min_rts = 0;
+	range->max_rts = 2347;
+	range->min_frag = 256;
+	range->max_frag = 2346;
+
+	range->max_encoding_tokens = 4;
+
+	range->max_qual.updated = IW_QUAL_NOISE_INVALID;
+
+	switch (wdev->wiphy->signal_type) {
+	case CFG80211_SIGNAL_TYPE_NONE:
+		break;	/* quality limits stay zeroed */
+	case CFG80211_SIGNAL_TYPE_MBM:
+		range->max_qual.level = -110;
+		range->max_qual.qual = 70;
+		range->avg_qual.qual = 35;
+		range->max_qual.updated |= IW_QUAL_DBM;
+		range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+		range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+		break;
+	case CFG80211_SIGNAL_TYPE_UNSPEC:
+		range->max_qual.level = 100;
+		range->max_qual.qual = 100;
+		range->avg_qual.qual = 50;
+		range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
+		range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
+		break;
+	}
+
+	range->avg_qual.level = range->max_qual.level / 2;
+	range->avg_qual.noise = range->max_qual.noise / 2;
+	range->avg_qual.updated = range->max_qual.updated;
+
+	for (i = 0; i < wdev->wiphy->n_cipher_suites; i++) {
+		switch (wdev->wiphy->cipher_suites[i]) {
+		case WLAN_CIPHER_SUITE_TKIP:
+			range->enc_capa |= (IW_ENC_CAPA_CIPHER_TKIP |
+					    IW_ENC_CAPA_WPA);
+			break;
+
+		case WLAN_CIPHER_SUITE_CCMP:
+			range->enc_capa |= (IW_ENC_CAPA_CIPHER_CCMP |
+					    IW_ENC_CAPA_WPA2);
+			break;
+
+		case WLAN_CIPHER_SUITE_WEP40:
+			range->encoding_size[range->num_encoding_sizes++] =
+				WLAN_KEY_LEN_WEP40;
+			break;
+
+		case WLAN_CIPHER_SUITE_WEP104:
+			range->encoding_size[range->num_encoding_sizes++] =
+				WLAN_KEY_LEN_WEP104;
+			break;
+		}	/* other cipher suites add nothing to the range */
+	}
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
+		struct ieee80211_supported_band *sband;
+
+		sband = wdev->wiphy->bands[band];
+
+		if (!sband)
+			continue;	/* band not registered on this wiphy */
+
+		for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
+			struct ieee80211_channel *chan = &sband->channels[i];
+
+			if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
+				range->freq[c].i =
+					ieee80211_frequency_to_channel(
+						chan->center_freq);
+				range->freq[c].m = chan->center_freq;
+				range->freq[c].e = 6;	/* value is m * 10^6 */
+				c++;
+			}
+		}
+	}
+	range->num_channels = c;	/* both counts cover enabled channels only */
+	range->num_frequency = c;
+
+	IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
+	IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
+	IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
+
+	if (wdev->wiphy->max_scan_ssids > 0)
+		range->scan_capa |= IW_SCAN_CAPA_ESSID;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwrange);
+
+
+/**
+ * cfg80211_wext_freq - get wext frequency for non-"auto"
+ * @wiphy: the wiphy (not referenced by the conversion itself)
+ * @freq: the wext freq encoding (mantissa @m and decimal exponent @e)
+ *
+ * Returns a frequency, or a negative error code, or 0 for auto.
+ */
+int cfg80211_wext_freq(struct wiphy *wiphy, struct iw_freq *freq)
+{
+	/*
+	 * Parse frequency - return 0 for auto and
+	 * -EINVAL for impossible things.
+	 */
+	if (freq->e == 0) {	/* exponent 0: m is treated as a channel number */
+		enum ieee80211_band band = IEEE80211_BAND_2GHZ;
+		if (freq->m < 0)
+			return 0;	/* negative channel number means auto */
+		if (freq->m > 14)
+			band = IEEE80211_BAND_5GHZ;	/* above 14: look up in 5 GHz */
+		return ieee80211_channel_to_frequency(freq->m, band);
+	} else {
+		int i, div = 1000000;	/* divisor becomes 10^(6 - e) */
+		for (i = 0; i < freq->e; i++)
+			div /= 10;
+		if (div <= 0)
+			return -EINVAL;	/* e above 6 drives the divisor to 0 */
+		return freq->m / div;
+	}
+}
+
+int cfg80211_wext_siwrts(struct net_device *dev,
+			 struct iw_request_info *info,
+			 struct iw_param *rts, char *extra)
+{	/* Set the wiphy RTS threshold; roll back if the driver rejects it */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	u32 orts = wdev->wiphy->rts_threshold;	/* saved for the rollback */
+	int err;
+
+	if (rts->disabled || !rts->fixed)
+		wdev->wiphy->rts_threshold = (u32) -1;	/* all-ones: RTS off */
+	else if (rts->value < 0)
+		return -EINVAL;	/* nothing has been modified yet */
+	else
+		wdev->wiphy->rts_threshold = rts->value;
+
+	err = rdev->ops->set_wiphy_params(wdev->wiphy,
+					  WIPHY_PARAM_RTS_THRESHOLD);
+	if (err)
+		wdev->wiphy->rts_threshold = orts;
+
+	return err;	/* 0 or the driver's error code */
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwrts);
+
+int cfg80211_wext_giwrts(struct net_device *dev,
+			 struct iw_request_info *info,
+			 struct iw_param *rts, char *extra)
+{	/* Report the wiphy RTS threshold; always succeeds */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	rts->value = wdev->wiphy->rts_threshold;
+	rts->disabled = rts->value == (u32) -1;	/* all-ones reads as disabled */
+	rts->fixed = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwrts);
+
+int cfg80211_wext_siwfrag(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_param *frag, char *extra)
+{	/* Set the wiphy fragmentation threshold; roll back on driver error */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	u32 ofrag = wdev->wiphy->frag_threshold;	/* saved for the rollback */
+	int err;
+
+	if (frag->disabled || !frag->fixed)
+		wdev->wiphy->frag_threshold = (u32) -1;	/* all-ones: off */
+	else if (frag->value < 256)
+		return -EINVAL;	/* below the accepted minimum of 256 */
+	else {
+		/* Fragment length must be even, so strip LSB. */
+		wdev->wiphy->frag_threshold = frag->value & ~0x1;
+	}
+
+	err = rdev->ops->set_wiphy_params(wdev->wiphy,
+					  WIPHY_PARAM_FRAG_THRESHOLD);
+	if (err)
+		wdev->wiphy->frag_threshold = ofrag;
+
+	return err;	/* 0 or the driver's error code */
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwfrag);
+
+int cfg80211_wext_giwfrag(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_param *frag, char *extra)
+{	/* Report the wiphy fragmentation threshold; always succeeds */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	frag->value = wdev->wiphy->frag_threshold;
+	frag->disabled = frag->value == (u32) -1;	/* all-ones reads as off */
+	frag->fixed = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwfrag);
+
+int cfg80211_wext_siwretry(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_param *retry, char *extra)
+{	/* Set the short and/or long retry limit; roll back on driver error */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	u32 changed = 0;	/* WIPHY_PARAM_RETRY_* bits handed to the driver */
+	u8 olong = wdev->wiphy->retry_long;	/* saved for the rollback */
+	u8 oshort = wdev->wiphy->retry_short;
+	int err;
+
+	if (retry->disabled ||
+	    (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT)
+		return -EINVAL;	/* only plain retry limits are handled */
+
+	/* NOTE(review): retry->value is stored without a range check */
+	if (retry->flags & IW_RETRY_LONG) {
+		wdev->wiphy->retry_long = retry->value;
+		changed |= WIPHY_PARAM_RETRY_LONG;
+	} else if (retry->flags & IW_RETRY_SHORT) {
+		wdev->wiphy->retry_short = retry->value;
+		changed |= WIPHY_PARAM_RETRY_SHORT;
+	} else {	/* neither modifier given: set both limits */
+		wdev->wiphy->retry_short = retry->value;
+		wdev->wiphy->retry_long = retry->value;
+		changed |= WIPHY_PARAM_RETRY_LONG;
+		changed |= WIPHY_PARAM_RETRY_SHORT;
+	}
+
+	if (!changed)	/* every branch above sets a bit, so this never fires */
+		return 0;
+
+	err = rdev->ops->set_wiphy_params(wdev->wiphy, changed);
+	if (err) {
+		wdev->wiphy->retry_short = oshort;
+		wdev->wiphy->retry_long = olong;
+	}
+
+	return err;	/* 0 or the driver's error code */
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwretry);
+
+int cfg80211_wext_giwretry(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_param *retry, char *extra)
+{	/* Report the short or long retry limit, selected by retry->flags */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	retry->disabled = 0;
+
+	if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) {
+		/*
+		 * First return short value, iwconfig will ask long value
+		 * later if needed
+		 */
+		retry->flags |= IW_RETRY_LIMIT;
+		retry->value = wdev->wiphy->retry_short;
+		if (wdev->wiphy->retry_long != wdev->wiphy->retry_short)
+			retry->flags |= IW_RETRY_LONG;	/* long limit differs */
+
+		return 0;
+	}
+
+	if (retry->flags & IW_RETRY_LONG) {
+		retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG;
+		retry->value = wdev->wiphy->retry_long;
+	}
+
+	return 0;	/* other flag combinations leave value untouched */
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry);
+
+static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
+				     struct net_device *dev, bool pairwise,
+				     const u8 *addr, bool remove, bool tx_key,
+				     int idx, struct key_params *params)
+{	/* Add or remove one key in the driver and in the wext key cache */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err, i;
+	bool rejoin = false;	/* set when the IBSS was left and must be rejoined */
+
+	if (pairwise && !addr)
+		return -EINVAL;	/* a pairwise key needs a peer address */
+
+	if (!wdev->wext.keys) {	/* allocate the key cache on first use */
+		wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
+					      GFP_KERNEL);
+		if (!wdev->wext.keys)
+			return -ENOMEM;
+		for (i = 0; i < 6; i++)	/* point each slot at its own storage */
+			wdev->wext.keys->params[i].key =
+				wdev->wext.keys->data[i];
+	}
+
+	if (wdev->iftype != NL80211_IFTYPE_ADHOC &&
+	    wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;
+
+	if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) {
+		if (!wdev->current_bss)
+			return -ENOLINK;
+
+		if (!rdev->ops->set_default_mgmt_key)
+			return -EOPNOTSUPP;
+
+		if (idx < 4 || idx > 5)	/* AES-CMAC keys live in slots 4-5 */
+			return -EINVAL;
+	} else if (idx < 0 || idx > 3)	/* all other ciphers use slots 0-3 */
+		return -EINVAL;
+
+	if (remove) {
+		err = 0;
+		if (wdev->current_bss) {
+			/*
+			 * If removing the current TX key, we will need to
+			 * join a new IBSS with the privacy bit clear.
+			 */
+			if (idx == wdev->wext.default_key &&
+			    wdev->iftype == NL80211_IFTYPE_ADHOC) {
+				__cfg80211_leave_ibss(rdev, wdev->netdev, true);
+				rejoin = true;
+			}
+
+			if (!pairwise && addr &&
+			    !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+				err = -ENOENT;	/* turned into success below */
+			else
+				err = rdev->ops->del_key(&rdev->wiphy, dev, idx,
+							 pairwise, addr);
+		}
+		wdev->wext.connect.privacy = false;
+		/*
+		 * Applications using wireless extensions expect to be
+		 * able to delete keys that don't exist, so allow that.
+		 */
+		if (err == -ENOENT)
+			err = 0;
+		if (!err) {
+			if (!addr) {	/* clear the cached copy of the key */
+				wdev->wext.keys->params[idx].key_len = 0;
+				wdev->wext.keys->params[idx].cipher = 0;
+			}
+			if (idx == wdev->wext.default_key)
+				wdev->wext.default_key = -1;
+			else if (idx == wdev->wext.default_mgmt_key)
+				wdev->wext.default_mgmt_key = -1;
+		}
+
+		if (!err && rejoin)
+			err = cfg80211_ibss_wext_join(rdev, wdev);
+
+		return err;
+	}
+
+	if (addr)
+		tx_key = false;	/* keys with a peer address never become default */
+
+	if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr))
+		return -EINVAL;
+
+	err = 0;
+	if (wdev->current_bss)	/* without a current BSS the key is only cached */
+		err = rdev->ops->add_key(&rdev->wiphy, dev, idx,
+					 pairwise, addr, params);
+	if (err)
+		return err;
+
+	if (!addr) {	/* cache the key material and repoint key at the copy */
+		wdev->wext.keys->params[idx] = *params;
+		memcpy(wdev->wext.keys->data[idx],
+			params->key, params->key_len);
+		wdev->wext.keys->params[idx].key =
+			wdev->wext.keys->data[idx];
+	}
+
+	if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
+	     params->cipher == WLAN_CIPHER_SUITE_WEP104) &&
+	    (tx_key || (!addr && wdev->wext.default_key == -1))) {
+		if (wdev->current_bss) {
+			/*
+			 * If we are getting a new TX key from not having
+			 * had one before we need to join a new IBSS with
+			 * the privacy bit set.
+			 */
+			if (wdev->iftype == NL80211_IFTYPE_ADHOC &&
+			    wdev->wext.default_key == -1) {
+				__cfg80211_leave_ibss(rdev, wdev->netdev, true);
+				rejoin = true;
+			}
+			err = rdev->ops->set_default_key(&rdev->wiphy, dev,
+							 idx, true, true);
+		}
+		if (!err) {
+			wdev->wext.default_key = idx;
+			if (rejoin)
+				err = cfg80211_ibss_wext_join(rdev, wdev);
+		}
+		return err;
+	}
+
+	if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC &&
+	    (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) {
+		if (wdev->current_bss)
+			err = rdev->ops->set_default_mgmt_key(&rdev->wiphy,
+							      dev, idx);
+		if (!err)
+			wdev->wext.default_mgmt_key = idx;
+		return err;
+	}
+
+	return 0;	/* key stored; no default key index was changed */
+}
+
+static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
+				   struct net_device *dev, bool pairwise,
+				   const u8 *addr, bool remove, bool tx_key,
+				   int idx, struct key_params *params)
+{	/* Locked wrapper around __cfg80211_set_encryption() */
+	int err;
+
+	/* devlist mutex needed for possible IBSS re-join */
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(dev->ieee80211_ptr);	/* taken inside devlist_mtx */
+	err = __cfg80211_set_encryption(rdev, dev, pairwise, addr,
+					remove, tx_key, idx, params);
+	wdev_unlock(dev->ieee80211_ptr);
+	mutex_unlock(&rdev->devlist_mtx);
+
+	return err;	/* passed through unchanged */
+}
+
+int cfg80211_wext_siwencode(struct net_device *dev,
+			    struct iw_request_info *info,
+			    struct iw_point *erq, char *keybuf)
+{	/* Set, remove or select a WEP key given by erq and keybuf */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int idx, err;
+	bool remove = false;
+	struct key_params params;
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION &&
+	    wdev->iftype != NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
+
+	/* no use -- only MFP (set_default_mgmt_key) is optional */
+	if (!rdev->ops->del_key ||
+	    !rdev->ops->add_key ||
+	    !rdev->ops->set_default_key)
+		return -EOPNOTSUPP;
+
+	idx = erq->flags & IW_ENCODE_INDEX;	/* 1-based; 0 selects the default */
+	if (idx == 0) {
+		idx = wdev->wext.default_key;
+		if (idx < 0)
+			idx = 0;	/* no default key yet: use slot 0 */
+	} else if (idx < 1 || idx > 4)
+		return -EINVAL;
+	else
+		idx--;	/* convert to the 0-based key slot */
+
+	if (erq->flags & IW_ENCODE_DISABLED)
+		remove = true;
+	else if (erq->length == 0) {
+		/* No key data - just set the default TX key index */
+		err = 0;
+		wdev_lock(wdev);
+		if (wdev->current_bss)
+			err = rdev->ops->set_default_key(&rdev->wiphy, dev,
+							 idx, true, true);
+		if (!err)
+			wdev->wext.default_key = idx;
+		wdev_unlock(wdev);
+		return err;
+	}
+
+	memset(&params, 0, sizeof(params));
+	params.key = keybuf;
+	params.key_len = erq->length;
+	if (erq->length == 5)	/* the key length selects the WEP variant */
+		params.cipher = WLAN_CIPHER_SUITE_WEP40;
+	else if (erq->length == 13)
+		params.cipher = WLAN_CIPHER_SUITE_WEP104;
+	else if (!remove)
+		return -EINVAL;	/* any other length is only allowed on removal */
+
+	return cfg80211_set_encryption(rdev, dev, false, NULL, remove,
+				       wdev->wext.default_key == -1,
+				       idx, &params);
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwencode);
+
+int cfg80211_wext_siwencodeext(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_point *erq, char *extra)
+{	/* Set or remove the key described by the iw_encode_ext at extra */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct iw_encode_ext *ext = (struct iw_encode_ext *) extra;
+	const u8 *addr;
+	int idx;
+	bool remove = false;
+	struct key_params params;
+	u32 cipher;
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION &&
+	    wdev->iftype != NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
+
+	/* no use -- only MFP (set_default_mgmt_key) is optional */
+	if (!rdev->ops->del_key ||
+	    !rdev->ops->add_key ||
+	    !rdev->ops->set_default_key)
+		return -EOPNOTSUPP;
+
+	switch (ext->alg) {	/* map the wext algorithm to a cipher suite */
+	case IW_ENCODE_ALG_NONE:
+		remove = true;
+		cipher = 0;
+		break;
+	case IW_ENCODE_ALG_WEP:
+		if (ext->key_len == 5)
+			cipher = WLAN_CIPHER_SUITE_WEP40;
+		else if (ext->key_len == 13)
+			cipher = WLAN_CIPHER_SUITE_WEP104;
+		else
+			return -EINVAL;	/* WEP keys must be 5 or 13 bytes */
+		break;
+	case IW_ENCODE_ALG_TKIP:
+		cipher = WLAN_CIPHER_SUITE_TKIP;
+		break;
+	case IW_ENCODE_ALG_CCMP:
+		cipher = WLAN_CIPHER_SUITE_CCMP;
+		break;
+	case IW_ENCODE_ALG_AES_CMAC:
+		cipher = WLAN_CIPHER_SUITE_AES_CMAC;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (erq->flags & IW_ENCODE_DISABLED)
+		remove = true;
+
+	idx = erq->flags & IW_ENCODE_INDEX;	/* 1-based index from userspace */
+	if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) {
+		if (idx < 5 || idx > 6) {	/* 1-based 5-6 are key slots 4-5 */
+			idx = wdev->wext.default_mgmt_key;
+			if (idx < 0)
+				return -EINVAL;	/* no default to fall back to */
+		} else
+			idx--;	/* convert to the 0-based key slot */
+	} else {
+		if (idx < 1 || idx > 4) {	/* 1-based 1-4 are key slots 0-3 */
+			idx = wdev->wext.default_key;
+			if (idx < 0)
+				return -EINVAL;	/* no default to fall back to */
+		} else
+			idx--;	/* convert to the 0-based key slot */
+	}
+
+	addr = ext->addr.sa_data;
+	if (is_broadcast_ether_addr(addr))
+		addr = NULL;	/* broadcast address: key has no peer address */
+
+	memset(&params, 0, sizeof(params));
+	params.key = ext->key;
+	params.key_len = ext->key_len;
+	params.cipher = cipher;
+
+	if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) {
+		params.seq = ext->rx_seq;
+		params.seq_len = 6;
+	}
+
+	return cfg80211_set_encryption(
+			rdev, dev,
+			!(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
+			addr, remove,
+			ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
+			idx, &params);
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwencodeext);
+
+int cfg80211_wext_giwencode(struct net_device *dev,
+			    struct iw_request_info *info,
+			    struct iw_point *erq, char *keybuf)
+{	/* Copy one key from the wext key cache into keybuf */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int idx;
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION &&
+	    wdev->iftype != NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
+
+	idx = erq->flags & IW_ENCODE_INDEX;	/* 1-based; 0 selects the default */
+	if (idx == 0) {
+		idx = wdev->wext.default_key;
+		if (idx < 0)
+			idx = 0;	/* no default key: report slot 0 */
+	} else if (idx < 1 || idx > 4)
+		return -EINVAL;
+	else
+		idx--;	/* convert to the 0-based key slot */
+
+	erq->flags = idx + 1;	/* report the index 1-based again */
+
+	if (!wdev->wext.keys || !wdev->wext.keys->params[idx].cipher) {
+		erq->flags |= IW_ENCODE_DISABLED;	/* nothing cached in this slot */
+		erq->length = 0;
+		return 0;
+	}
+
+	erq->length = min_t(size_t, erq->length,
+			    wdev->wext.keys->params[idx].key_len);
+	memcpy(keybuf, wdev->wext.keys->params[idx].key, erq->length);
+	erq->flags |= IW_ENCODE_ENABLED;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwencode);
+
+int cfg80211_wext_siwfreq(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_freq *wextfreq, char *extra)
+{	/* Set the operating frequency; the handling depends on the iftype */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int freq, err;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_STATION:	/* handed to the managed-mode handler */
+		return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra);
+	case NL80211_IFTYPE_ADHOC:	/* handed to the IBSS handler */
+		return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra);
+	case NL80211_IFTYPE_MONITOR:
+	case NL80211_IFTYPE_WDS:
+	case NL80211_IFTYPE_MESH_POINT:
+		freq = cfg80211_wext_freq(wdev->wiphy, wextfreq);
+		if (freq < 0)
+			return freq;	/* conversion error */
+		if (freq == 0)
+			return -EINVAL;	/* "auto" is not accepted for these types */
+		mutex_lock(&rdev->devlist_mtx);
+		wdev_lock(wdev);
+		err = cfg80211_set_freq(rdev, wdev, freq, NL80211_CHAN_NO_HT);
+		wdev_unlock(wdev);
+		mutex_unlock(&rdev->devlist_mtx);
+		return err;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwfreq);
+
+int cfg80211_wext_giwfreq(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_freq *freq, char *extra)
+{	/* Report the current frequency; the handling depends on the iftype */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_STATION:	/* handed to the managed-mode handler */
+		return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra);
+	case NL80211_IFTYPE_ADHOC:	/* handed to the IBSS handler */
+		return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra);
+	default:
+		if (!wdev->channel)
+			return -EINVAL;	/* no channel recorded on the wdev */
+		freq->m = wdev->channel->center_freq;
+		freq->e = 6;	/* value is m * 10^6 */
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwfreq);
+
+int cfg80211_wext_siwtxpower(struct net_device *dev,
+			     struct iw_request_info *info,
+			     union iwreq_data *data, char *extra)
+{	/* Set TX power in dBm, or soft-block the radio when "disabled" is set */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	enum nl80211_tx_power_setting type;
+	int dbm = 0;
+
+	if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
+		return -EINVAL;	/* only dBm values are accepted */
+	if (data->txpower.flags & IW_TXPOW_RANGE)
+		return -EINVAL;
+
+	if (!rdev->ops->set_tx_power)
+		return -EOPNOTSUPP;
+
+	/* only change when not disabling */
+	if (!data->txpower.disabled) {
+		/* NOTE(review): unblocks rfkill before the value checks below */
+		rfkill_set_sw_state(rdev->rfkill, false);
+		if (data->txpower.fixed) {
+			/*
+			 * wext doesn't support negative values, see
+			 * below where it's for automatic
+			 */
+			if (data->txpower.value < 0)
+				return -EINVAL;
+			dbm = data->txpower.value;
+			type = NL80211_TX_POWER_FIXED;
+			/* TODO: do regulatory check! */
+		} else {
+			/*
+			 * Automatic power level setting, max being the value
+			 * passed in from userland.
+			 */
+			if (data->txpower.value < 0) {
+				type = NL80211_TX_POWER_AUTOMATIC;
+			} else {
+				dbm = data->txpower.value;
+				type = NL80211_TX_POWER_LIMITED;
+			}
+		}
+	} else {	/* disabling: soft-block only, the driver op is not called */
+		rfkill_set_sw_state(rdev->rfkill, true);
+		schedule_work(&rdev->rfkill_sync);
+		return 0;
+	}
+
+	return rdev->ops->set_tx_power(wdev->wiphy, type, DBM_TO_MBM(dbm));
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwtxpower);
+
+int cfg80211_wext_giwtxpower(struct net_device *dev,
+			     struct iw_request_info *info,
+			     union iwreq_data *data, char *extra)
+{	/* Report the driver's TX power and the rfkill state */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int err, val;
+
+	if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
+		return -EINVAL;	/* only dBm requests are served */
+	if (data->txpower.flags & IW_TXPOW_RANGE)
+		return -EINVAL;
+
+	if (!rdev->ops->get_tx_power)
+		return -EOPNOTSUPP;
+
+	err = rdev->ops->get_tx_power(wdev->wiphy, &val);
+	if (err)
+		return err;
+
+	/* well... oh well */
+	data->txpower.fixed = 1;	/* always reported as fixed */
+	data->txpower.disabled = rfkill_blocked(rdev->rfkill);
+	data->txpower.value = val;	/* passed on as returned by the driver */
+	data->txpower.flags = IW_TXPOW_DBM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwtxpower);
+
+static int cfg80211_set_auth_alg(struct wireless_dev *wdev,
+				 s32 auth_alg)
+{	/* Store the nl80211 auth type for the IW_AUTH_ALG_* bits in auth_alg */
+	int nr_alg = 0;	/* number of algorithm bits found */
+
+	if (!auth_alg)
+		return -EINVAL;	/* at least one algorithm is required */
+
+	if (auth_alg & ~(IW_AUTH_ALG_OPEN_SYSTEM |
+			 IW_AUTH_ALG_SHARED_KEY |
+			 IW_AUTH_ALG_LEAP))
+		return -EINVAL;	/* unknown algorithm bit */
+
+	if (auth_alg & IW_AUTH_ALG_OPEN_SYSTEM) {
+		nr_alg++;
+		wdev->wext.connect.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM;
+	}
+
+	if (auth_alg & IW_AUTH_ALG_SHARED_KEY) {
+		nr_alg++;
+		wdev->wext.connect.auth_type = NL80211_AUTHTYPE_SHARED_KEY;
+	}
+
+	if (auth_alg & IW_AUTH_ALG_LEAP) {
+		nr_alg++;
+		wdev->wext.connect.auth_type = NL80211_AUTHTYPE_NETWORK_EAP;
+	}
+
+	if (nr_alg > 1)	/* several algorithms allowed: choose automatically */
+		wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
+
+	return 0;
+}
+
+static int cfg80211_set_wpa_version(struct wireless_dev *wdev, u32 wpa_versions)
+{	/* Update the stored WPA version bits from IW_AUTH_WPA_VERSION_* flags */
+	if (wpa_versions & ~(IW_AUTH_WPA_VERSION_WPA |
+			     IW_AUTH_WPA_VERSION_WPA2|
+		             IW_AUTH_WPA_VERSION_DISABLED))
+		return -EINVAL;	/* unknown version bit */
+
+	if ((wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) &&
+	    (wpa_versions & (IW_AUTH_WPA_VERSION_WPA|
+			     IW_AUTH_WPA_VERSION_WPA2)))
+		return -EINVAL;	/* "disabled" cannot be combined with a version */
+
+	if (wpa_versions & IW_AUTH_WPA_VERSION_DISABLED)
+		wdev->wext.connect.crypto.wpa_versions &=
+			~(NL80211_WPA_VERSION_1|NL80211_WPA_VERSION_2);
+
+	if (wpa_versions & IW_AUTH_WPA_VERSION_WPA)	/* bits are only ever added */
+		wdev->wext.connect.crypto.wpa_versions |=
+			NL80211_WPA_VERSION_1;
+
+	if (wpa_versions & IW_AUTH_WPA_VERSION_WPA2)
+		wdev->wext.connect.crypto.wpa_versions |=
+			NL80211_WPA_VERSION_2;
+
+	return 0;
+}
+
+static int cfg80211_set_cipher_group(struct wireless_dev *wdev, u32 cipher)
+{	/* Store the group cipher suite for an IW_AUTH_CIPHER_* value */
+	if (cipher & IW_AUTH_CIPHER_WEP40)	/* first matching bit wins */
+		wdev->wext.connect.crypto.cipher_group =
+			WLAN_CIPHER_SUITE_WEP40;
+	else if (cipher & IW_AUTH_CIPHER_WEP104)
+		wdev->wext.connect.crypto.cipher_group =
+			WLAN_CIPHER_SUITE_WEP104;
+	else if (cipher & IW_AUTH_CIPHER_TKIP)
+		wdev->wext.connect.crypto.cipher_group =
+			WLAN_CIPHER_SUITE_TKIP;
+	else if (cipher & IW_AUTH_CIPHER_CCMP)
+		wdev->wext.connect.crypto.cipher_group =
+			WLAN_CIPHER_SUITE_CCMP;
+	else if (cipher & IW_AUTH_CIPHER_AES_CMAC)
+		wdev->wext.connect.crypto.cipher_group =
+			WLAN_CIPHER_SUITE_AES_CMAC;
+	else if (cipher & IW_AUTH_CIPHER_NONE)
+		wdev->wext.connect.crypto.cipher_group = 0;	/* no group cipher */
+	else
+		return -EINVAL;	/* none of the known cipher bits was set */
+
+	return 0;
+}
+
+static int cfg80211_set_cipher_pairwise(struct wireless_dev *wdev, u32 cipher)
+{	/* Rebuild the pairwise cipher list from IW_AUTH_CIPHER_* bits */
+	int nr_ciphers = 0;	/* entries written to ciphers_pairwise so far */
+	u32 *ciphers_pairwise = wdev->wext.connect.crypto.ciphers_pairwise;
+
+	if (cipher & IW_AUTH_CIPHER_WEP40) {
+		ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP40;
+		nr_ciphers++;
+	}
+
+	if (cipher & IW_AUTH_CIPHER_WEP104) {
+		ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP104;
+		nr_ciphers++;
+	}
+
+	if (cipher & IW_AUTH_CIPHER_TKIP) {
+		ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_TKIP;
+		nr_ciphers++;
+	}
+
+	if (cipher & IW_AUTH_CIPHER_CCMP) {
+		ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_CCMP;
+		nr_ciphers++;
+	}
+
+	if (cipher & IW_AUTH_CIPHER_AES_CMAC) {
+		ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_AES_CMAC;
+		nr_ciphers++;
+	}
+
+	BUILD_BUG_ON(NL80211_MAX_NR_CIPHER_SUITES < 5);	/* up to 5 stored above */
+
+	wdev->wext.connect.crypto.n_ciphers_pairwise = nr_ciphers;
+
+	return 0;	/* bits not tested above are ignored; never fails */
+}
+
+
+static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt)
+{	/* Rebuild the AKM suite list from IW_AUTH_KEY_MGMT_* bits */
+	int nr_akm_suites = 0;	/* entries written to akm_suites so far */
+
+	if (key_mgt & ~(IW_AUTH_KEY_MGMT_802_1X |
+			IW_AUTH_KEY_MGMT_PSK))
+		return -EINVAL;	/* unknown key management bit */
+
+	if (key_mgt & IW_AUTH_KEY_MGMT_802_1X) {
+		wdev->wext.connect.crypto.akm_suites[nr_akm_suites] =
+			WLAN_AKM_SUITE_8021X;
+		nr_akm_suites++;
+	}
+
+	if (key_mgt & IW_AUTH_KEY_MGMT_PSK) {
+		wdev->wext.connect.crypto.akm_suites[nr_akm_suites] =
+			WLAN_AKM_SUITE_PSK;
+		nr_akm_suites++;
+	}
+
+	wdev->wext.connect.crypto.n_akm_suites = nr_akm_suites;
+
+	return 0;
+}
+
+int cfg80211_wext_siwauth(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_param *data, char *extra)
+{	/* Dispatch one IW_AUTH_* parameter to the matching setter */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;	/* only station interfaces are handled */
+
+	switch (data->flags & IW_AUTH_INDEX) {
+	case IW_AUTH_PRIVACY_INVOKED:
+		wdev->wext.connect.privacy = data->value;
+		return 0;
+	case IW_AUTH_WPA_VERSION:
+		return cfg80211_set_wpa_version(wdev, data->value);
+	case IW_AUTH_CIPHER_GROUP:
+		return cfg80211_set_cipher_group(wdev, data->value);
+	case IW_AUTH_KEY_MGMT:
+		return cfg80211_set_key_mgt(wdev, data->value);
+	case IW_AUTH_CIPHER_PAIRWISE:
+		return cfg80211_set_cipher_pairwise(wdev, data->value);
+	case IW_AUTH_80211_AUTH_ALG:
+		return cfg80211_set_auth_alg(wdev, data->value);
+	case IW_AUTH_WPA_ENABLED:
+	case IW_AUTH_RX_UNENCRYPTED_EAPOL:
+	case IW_AUTH_DROP_UNENCRYPTED:
+	case IW_AUTH_MFP:
+		return 0;	/* accepted, but nothing is stored */
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwauth);
+
+int cfg80211_wext_giwauth(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_param *data, char *extra)
+{	/* Reading auth parameters is not implemented */
+	/* XXX: what do we need? */
+
+	return -EOPNOTSUPP;	/* unconditionally */
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwauth);
+
+int cfg80211_wext_siwpower(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_param *wrq, char *extra)
+{	/* Enable or disable power save and optionally set its timeout */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	bool ps = wdev->ps;	/* start from the current settings */
+	int timeout = wdev->ps_timeout;
+	int err;
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EINVAL;
+
+	if (!rdev->ops->set_power_mgmt)
+		return -EOPNOTSUPP;
+
+	if (wrq->disabled) {
+		ps = false;
+	} else {
+		switch (wrq->flags & IW_POWER_MODE) {
+		case IW_POWER_ON:       /* If not specified */
+		case IW_POWER_MODE:     /* If set all mask */
+		case IW_POWER_ALL_R:    /* If explicitly stating all */
+			ps = true;
+			break;
+		default:                /* Otherwise we ignore */
+			return -EINVAL;
+		}
+
+		if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT))
+			return -EINVAL;	/* no other power flags are accepted */
+
+		if (wrq->flags & IW_POWER_TIMEOUT)
+			timeout = wrq->value / 1000;	/* presumably us -> ms; confirm */
+	}
+
+	err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, ps, timeout);
+	if (err)
+		return err;	/* cached settings stay unchanged on failure */
+
+	wdev->ps = ps;
+	wdev->ps_timeout = timeout;
+
+	return 0;
+
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwpower);
+
+int cfg80211_wext_giwpower(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_param *wrq, char *extra)
+{	/* Report whether power save is enabled; always succeeds */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	wrq->disabled = !wdev->ps;	/* only the on/off state is reported */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwpower);
+
+static int cfg80211_wds_wext_siwap(struct net_device *dev,
+				   struct iw_request_info *info,
+				   struct sockaddr *addr, char *extra)
+{	/* Set the WDS peer address in the driver and remember it as bssid */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int err;
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS))
+		return -EINVAL;	/* callers must only use this for WDS */
+
+	if (addr->sa_family != ARPHRD_ETHER)
+		return -EINVAL;
+
+	if (netif_running(dev))
+		return -EBUSY;	/* the peer can only be set while the netdev is down */
+
+	if (!rdev->ops->set_wds_peer)
+		return -EOPNOTSUPP;
+
+	err = rdev->ops->set_wds_peer(wdev->wiphy, dev, (u8 *) &addr->sa_data);
+	if (err)
+		return err;	/* the stored bssid is left unchanged */
+
+	memcpy(&wdev->wext.bssid, (u8 *) &addr->sa_data, ETH_ALEN);
+
+	return 0;
+}
+
+static int cfg80211_wds_wext_giwap(struct net_device *dev,
+				   struct iw_request_info *info,
+				   struct sockaddr *addr, char *extra)
+{	/* Report the WDS peer address stored in wext.bssid */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_WDS))
+		return -EINVAL;	/* callers must only use this for WDS */
+
+	addr->sa_family = ARPHRD_ETHER;
+	memcpy(&addr->sa_data, wdev->wext.bssid, ETH_ALEN);
+
+	return 0;
+}
+
+int cfg80211_wext_siwrate(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_param *rate, char *extra)
+{	/* Build a legacy bitrate mask from a fixed or maximum rate and apply it */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct cfg80211_bitrate_mask mask;
+	u32 fixed, maxrate;
+	struct ieee80211_supported_band *sband;
+	int band, ridx;
+	bool match = false;	/* true once any rate has entered the mask */
+
+	if (!rdev->ops->set_bitrate_mask)
+		return -EOPNOTSUPP;
+
+	memset(&mask, 0, sizeof(mask));
+	fixed = 0;	/* 0: no fixed rate requested */
+	maxrate = (u32)-1;	/* all-ones: no upper limit */
+
+	if (rate->value < 0) {
+		/* nothing */
+	} else if (rate->fixed) {
+		fixed = rate->value / 100000;	/* compared with srate->bitrate */
+	} else {
+		maxrate = rate->value / 100000;
+	}
+
+	for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
+		sband = wdev->wiphy->bands[band];
+		if (sband == NULL)
+			continue;
+		for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
+			struct ieee80211_rate *srate = &sband->bitrates[ridx];
+			if (fixed == srate->bitrate) {
+				mask.control[band].legacy = 1 << ridx;
+				match = true;
+				break;	/* this band is limited to the fixed rate */
+			}
+			if (srate->bitrate <= maxrate) {
+				mask.control[band].legacy |= 1 << ridx;
+				match = true;
+			}
+		}	/* NOTE(review): a band lacking the fixed rate keeps all rates */
+	}
+
+	if (!match)
+		return -EINVAL;	/* no supported rate satisfied the request */
+
+	return rdev->ops->set_bitrate_mask(wdev->wiphy, dev, NULL, &mask);
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwrate);
+
+int cfg80211_wext_giwrate(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_param *rate, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	/* we are under RTNL - globally locked - so can use a static struct */
+	static struct station_info sinfo;
+	u8 addr[ETH_ALEN];
+	int err = 0;
+
+	/* Report the TX bitrate towards the current BSS; stations only. */
+	if (wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;
+	if (!rdev->ops->get_station)
+		return -EOPNOTSUPP;
+
+	wdev_lock(wdev);
+	if (wdev->current_bss)
+		memcpy(addr, wdev->current_bss->pub.bssid, ETH_ALEN);
+	else
+		err = -EOPNOTSUPP;
+	wdev_unlock(wdev);
+	if (err)
+		return err;
+
+	/* sinfo is static: wipe it so nothing from an earlier call survives */
+	memset(&sinfo, 0, sizeof(sinfo));
+	err = rdev->ops->get_station(&rdev->wiphy, dev, addr, &sinfo);
+	if (err)
+		return err;
+
+	if (!(sinfo.filled & STATION_INFO_TX_BITRATE))
+		return -EOPNOTSUPP;
+
+	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwrate);
+
+/* Get wireless statistics.  Called by /proc/net/wireless and by SIOCGIWSTATS */
+struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	/* we are under RTNL - globally locked - so can use static structs */
+	static struct iw_statistics wstats;
+	static struct station_info sinfo;
+	u8 bssid[ETH_ALEN];
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION || !rdev->ops->get_station)
+		return NULL;
+
+	/* Grab BSSID of current BSS, if any */
+	wdev_lock(wdev);
+	if (!wdev->current_bss) {
+		wdev_unlock(wdev);
+		return NULL;
+	}
+	memcpy(bssid, wdev->current_bss->pub.bssid, ETH_ALEN);
+	wdev_unlock(wdev);
+
+	/* sinfo is static: clear it so data from an earlier call (possibly
+	 * for another device) cannot survive a driver that fills few fields */
+	memset(&sinfo, 0, sizeof(sinfo));
+	if (rdev->ops->get_station(&rdev->wiphy, dev, bssid, &sinfo))
+		return NULL;
+
+	memset(&wstats, 0, sizeof(wstats));
+
+	switch (rdev->wiphy.signal_type) {
+	case CFG80211_SIGNAL_TYPE_MBM:
+		if (sinfo.filled & STATION_INFO_SIGNAL) {
+			int sig = sinfo.signal;
+			wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
+			wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
+			wstats.qual.updated |= IW_QUAL_DBM;
+			wstats.qual.level = sig;
+			if (sig < -110)	/* clamp to [-110, -40] so qual is 0..70 */
+				sig = -110;
+			else if (sig > -40)
+				sig = -40;
+			wstats.qual.qual = sig + 110;
+			break;
+		}	/* fall through */
+	case CFG80211_SIGNAL_TYPE_UNSPEC:
+		if (sinfo.filled & STATION_INFO_SIGNAL) {
+			wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
+			wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
+			wstats.qual.level = sinfo.signal;
+			wstats.qual.qual = sinfo.signal;
+			break;
+		}	/* fall through */
+	default:
+		wstats.qual.updated |= IW_QUAL_LEVEL_INVALID;
+		wstats.qual.updated |= IW_QUAL_QUAL_INVALID;
+	}
+
+	wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
+	if (sinfo.filled & STATION_INFO_RX_DROP_MISC)
+		wstats.discard.misc = sinfo.rx_dropped_misc;
+	if (sinfo.filled & STATION_INFO_TX_FAILED)
+		wstats.discard.retries = sinfo.tx_failed;
+
+	return &wstats;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wireless_stats);
+
+int cfg80211_wext_siwap(struct net_device *dev,
+			struct iw_request_info *info,
+			struct sockaddr *ap_addr, char *extra)
+{
+	/* SIOCSIWAP: route to the handler matching the interface type. */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+		return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra);
+	if (wdev->iftype == NL80211_IFTYPE_STATION)
+		return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra);
+	if (wdev->iftype == NL80211_IFTYPE_WDS)
+		return cfg80211_wds_wext_siwap(dev, info, ap_addr, extra);
+
+	/* every other interface type is rejected */
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwap);
+
+int cfg80211_wext_giwap(struct net_device *dev,
+			struct iw_request_info *info,
+			struct sockaddr *ap_addr, char *extra)
+{
+	/* SIOCGIWAP: route to the handler matching the interface type. */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+		return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra);
+	if (wdev->iftype == NL80211_IFTYPE_STATION)
+		return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra);
+	if (wdev->iftype == NL80211_IFTYPE_WDS)
+		return cfg80211_wds_wext_giwap(dev, info, ap_addr, extra);
+
+	/* every other interface type is rejected */
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwap);
+
+int cfg80211_wext_siwessid(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_point *data, char *ssid)
+{
+	/* SIOCSIWESSID: only ad-hoc and station interfaces have a handler. */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+		return cfg80211_ibss_wext_siwessid(dev, info, data, ssid);
+	if (wdev->iftype == NL80211_IFTYPE_STATION)
+		return cfg80211_mgd_wext_siwessid(dev, info, data, ssid);
+
+	/* every other interface type is rejected */
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwessid);
+
+int cfg80211_wext_giwessid(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_point *data, char *ssid)
+{
+	/* SIOCGIWESSID: reset length and flags, then delegate by iftype. */
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	data->length = 0;
+	data->flags = 0;
+
+	if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+		return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
+	if (wdev->iftype == NL80211_IFTYPE_STATION)
+		return cfg80211_mgd_wext_giwessid(dev, info, data, ssid);
+
+	/* other interface types: data stays zeroed and the call is rejected */
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_giwessid);
+
+int cfg80211_wext_siwpmksa(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_point *data, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct iw_pmksa *req = (struct iw_pmksa *)extra;
+	struct cfg80211_pmksa cfg_pmksa;
+
+	/* SIOCSIWPMKSA: PMKSA commands are accepted on stations only */
+	if (wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EINVAL;
+
+	/* both pointers refer into the caller's iw_pmksa, nothing is copied */
+	memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa));
+	cfg_pmksa.bssid = req->bssid.sa_data;
+	cfg_pmksa.pmkid = req->pmkid;
+
+	if (req->cmd == IW_PMKSA_ADD) {
+		if (!rdev->ops->set_pmksa)
+			return -EOPNOTSUPP;
+		return rdev->ops->set_pmksa(&rdev->wiphy, dev, &cfg_pmksa);
+	}
+
+	if (req->cmd == IW_PMKSA_REMOVE) {
+		if (!rdev->ops->del_pmksa)
+			return -EOPNOTSUPP;
+		return rdev->ops->del_pmksa(&rdev->wiphy, dev, &cfg_pmksa);
+	}
+
+	if (req->cmd == IW_PMKSA_FLUSH) {
+		/* flush takes no entry, so cfg_pmksa is not passed on */
+		if (!rdev->ops->flush_pmksa)
+			return -EOPNOTSUPP;
+		return rdev->ops->flush_pmksa(&rdev->wiphy, dev);
+	}
+
+	/* any other command value */
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwpmksa);
+
+static const iw_handler cfg80211_handlers[] = {	/* slots not listed stay NULL */
+	[IW_IOCTL_IDX(SIOCGIWNAME)]	= (iw_handler) cfg80211_wext_giwname,
+	[IW_IOCTL_IDX(SIOCSIWFREQ)]	= (iw_handler) cfg80211_wext_siwfreq,
+	[IW_IOCTL_IDX(SIOCGIWFREQ)]	= (iw_handler) cfg80211_wext_giwfreq,
+	[IW_IOCTL_IDX(SIOCSIWMODE)]	= (iw_handler) cfg80211_wext_siwmode,
+	[IW_IOCTL_IDX(SIOCGIWMODE)]	= (iw_handler) cfg80211_wext_giwmode,
+	[IW_IOCTL_IDX(SIOCGIWRANGE)]	= (iw_handler) cfg80211_wext_giwrange,
+	[IW_IOCTL_IDX(SIOCSIWAP)]	= (iw_handler) cfg80211_wext_siwap,
+	[IW_IOCTL_IDX(SIOCGIWAP)]	= (iw_handler) cfg80211_wext_giwap,
+	[IW_IOCTL_IDX(SIOCSIWMLME)]	= (iw_handler) cfg80211_wext_siwmlme,
+	[IW_IOCTL_IDX(SIOCSIWSCAN)]	= (iw_handler) cfg80211_wext_siwscan,
+	[IW_IOCTL_IDX(SIOCGIWSCAN)]	= (iw_handler) cfg80211_wext_giwscan,
+	[IW_IOCTL_IDX(SIOCSIWESSID)]	= (iw_handler) cfg80211_wext_siwessid,
+	[IW_IOCTL_IDX(SIOCGIWESSID)]	= (iw_handler) cfg80211_wext_giwessid,
+	[IW_IOCTL_IDX(SIOCSIWRATE)]	= (iw_handler) cfg80211_wext_siwrate,
+	[IW_IOCTL_IDX(SIOCGIWRATE)]	= (iw_handler) cfg80211_wext_giwrate,
+	[IW_IOCTL_IDX(SIOCSIWRTS)]	= (iw_handler) cfg80211_wext_siwrts,
+	[IW_IOCTL_IDX(SIOCGIWRTS)]	= (iw_handler) cfg80211_wext_giwrts,
+	[IW_IOCTL_IDX(SIOCSIWFRAG)]	= (iw_handler) cfg80211_wext_siwfrag,
+	[IW_IOCTL_IDX(SIOCGIWFRAG)]	= (iw_handler) cfg80211_wext_giwfrag,
+	[IW_IOCTL_IDX(SIOCSIWTXPOW)]	= (iw_handler) cfg80211_wext_siwtxpower,
+	[IW_IOCTL_IDX(SIOCGIWTXPOW)]	= (iw_handler) cfg80211_wext_giwtxpower,
+	[IW_IOCTL_IDX(SIOCSIWRETRY)]	= (iw_handler) cfg80211_wext_siwretry,
+	[IW_IOCTL_IDX(SIOCGIWRETRY)]	= (iw_handler) cfg80211_wext_giwretry,
+	[IW_IOCTL_IDX(SIOCSIWENCODE)]	= (iw_handler) cfg80211_wext_siwencode,
+	[IW_IOCTL_IDX(SIOCGIWENCODE)]	= (iw_handler) cfg80211_wext_giwencode,
+	[IW_IOCTL_IDX(SIOCSIWPOWER)]	= (iw_handler) cfg80211_wext_siwpower,
+	[IW_IOCTL_IDX(SIOCGIWPOWER)]	= (iw_handler) cfg80211_wext_giwpower,
+	[IW_IOCTL_IDX(SIOCSIWGENIE)]	= (iw_handler) cfg80211_wext_siwgenie,
+	[IW_IOCTL_IDX(SIOCSIWAUTH)]	= (iw_handler) cfg80211_wext_siwauth,
+	[IW_IOCTL_IDX(SIOCGIWAUTH)]	= (iw_handler) cfg80211_wext_giwauth,
+	[IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= (iw_handler) cfg80211_wext_siwencodeext,
+	[IW_IOCTL_IDX(SIOCSIWPMKSA)]	= (iw_handler) cfg80211_wext_siwpmksa,
+};
+
+const struct iw_handler_def cfg80211_wext_handler = {	/* handler table + stats hook */
+	.num_standard		= ARRAY_SIZE(cfg80211_handlers),
+	.standard		= cfg80211_handlers,
+	.get_wireless_stats = cfg80211_wireless_stats,
+};
diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h
new file mode 100644
index 00000000..20b3daef
--- /dev/null
+++ b/net/wireless/wext-compat.h
@@ -0,0 +1,49 @@
+#ifndef __WEXT_COMPAT
+#define __WEXT_COMPAT
+
+#include <net/iw_handler.h>
+#include <linux/wireless.h>
+/* wext handlers for IBSS interfaces (NL80211_IFTYPE_ADHOC in the dispatchers) */
+int cfg80211_ibss_wext_siwfreq(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_freq *freq, char *extra);
+int cfg80211_ibss_wext_giwfreq(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_freq *freq, char *extra);
+int cfg80211_ibss_wext_siwap(struct net_device *dev,
+			     struct iw_request_info *info,
+			     struct sockaddr *ap_addr, char *extra);
+int cfg80211_ibss_wext_giwap(struct net_device *dev,
+			     struct iw_request_info *info,
+			     struct sockaddr *ap_addr, char *extra);
+int cfg80211_ibss_wext_siwessid(struct net_device *dev,
+				struct iw_request_info *info,
+				struct iw_point *data, char *ssid);
+int cfg80211_ibss_wext_giwessid(struct net_device *dev,
+				struct iw_request_info *info,
+				struct iw_point *data, char *ssid);
+/* wext handlers the dispatchers use for NL80211_IFTYPE_STATION interfaces */
+int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
+			      struct iw_request_info *info,
+			      struct iw_freq *freq, char *extra);
+int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
+			      struct iw_request_info *info,
+			      struct iw_freq *freq, char *extra);
+int cfg80211_mgd_wext_siwap(struct net_device *dev,
+			    struct iw_request_info *info,
+			    struct sockaddr *ap_addr, char *extra);
+int cfg80211_mgd_wext_giwap(struct net_device *dev,
+			    struct iw_request_info *info,
+			    struct sockaddr *ap_addr, char *extra);
+int cfg80211_mgd_wext_siwessid(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_point *data, char *ssid);
+int cfg80211_mgd_wext_giwessid(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_point *data, char *ssid);
+/* NOTE(review): defined elsewhere; presumably converts a wext iw_freq - confirm */
+int cfg80211_wext_freq(struct wiphy *wiphy, struct iw_freq *freq);
+
+/* handler definition from wext-compat.c: standard handlers + statistics hook */
+extern const struct iw_handler_def cfg80211_wext_handler;
+#endif /* __WEXT_COMPAT */
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
new file mode 100644
index 00000000..fdbc23c1
--- /dev/null
+++ b/net/wireless/wext-core.c
@@ -0,0 +1,1084 @@
+/*
+ * This file implements the Wireless Extensions core API.
+ *
+ * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ * Copyright	2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * (Like every part of the Linux kernel, this file is GPL)
+ */
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/wireless.h>
+#include <linux/uaccess.h>
+#include <net/cfg80211.h>
+#include <net/iw_handler.h>
+#include <net/netlink.h>
+#include <net/wext.h>
+#include <net/net_namespace.h>
+/* Helper signature: device, request, command, request info, handler (roles inferred from the types - confirm) */
+typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *,
+			       unsigned int, struct iw_request_info *,
+			       iw_handler);
+
+
+/*
+ * Meta-data for every standard Wireless Extension request we know about,
+ * indexed by IW_IOCTL_IDX(cmd); payload limits count tokens of token_size.
+ */
+static const struct iw_ioctl_description standard_ioctl[] = {
+	[IW_IOCTL_IDX(SIOCSIWCOMMIT)] = {
+		.header_type	= IW_HEADER_TYPE_NULL,
+	},
+	[IW_IOCTL_IDX(SIOCGIWNAME)] = {
+		.header_type	= IW_HEADER_TYPE_CHAR,
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWNWID)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+		.flags		= IW_DESCR_FLAG_EVENT,
+	},
+	[IW_IOCTL_IDX(SIOCGIWNWID)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWFREQ)] = {
+		.header_type	= IW_HEADER_TYPE_FREQ,
+		.flags		= IW_DESCR_FLAG_EVENT,
+	},
+	[IW_IOCTL_IDX(SIOCGIWFREQ)] = {
+		.header_type	= IW_HEADER_TYPE_FREQ,
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWMODE)] = {
+		.header_type	= IW_HEADER_TYPE_UINT,
+		.flags		= IW_DESCR_FLAG_EVENT,
+	},
+	[IW_IOCTL_IDX(SIOCGIWMODE)] = {
+		.header_type	= IW_HEADER_TYPE_UINT,
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWSENS)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWSENS)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWRANGE)] = {
+		.header_type	= IW_HEADER_TYPE_NULL,
+	},
+	[IW_IOCTL_IDX(SIOCGIWRANGE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_range),
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWPRIV)] = {
+		.header_type	= IW_HEADER_TYPE_NULL,
+	},
+	[IW_IOCTL_IDX(SIOCGIWPRIV)] = { /* (handled directly by us) */
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct iw_priv_args),
+		.max_tokens	= 16,
+		.flags		= IW_DESCR_FLAG_NOMAX,
+	},
+	[IW_IOCTL_IDX(SIOCSIWSTATS)] = {
+		.header_type	= IW_HEADER_TYPE_NULL,
+	},
+	[IW_IOCTL_IDX(SIOCGIWSTATS)] = { /* (handled directly by us) */
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_statistics),
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWSPY)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct sockaddr),
+		.max_tokens	= IW_MAX_SPY,
+	},
+	[IW_IOCTL_IDX(SIOCGIWSPY)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct sockaddr) +
+				  sizeof(struct iw_quality),
+		.max_tokens	= IW_MAX_SPY,
+	},
+	[IW_IOCTL_IDX(SIOCSIWTHRSPY)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct iw_thrspy),
+		.min_tokens	= 1,
+		.max_tokens	= 1,
+	},
+	[IW_IOCTL_IDX(SIOCGIWTHRSPY)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct iw_thrspy),
+		.min_tokens	= 1,
+		.max_tokens	= 1,
+	},
+	[IW_IOCTL_IDX(SIOCSIWAP)] = {
+		.header_type	= IW_HEADER_TYPE_ADDR,
+	},
+	[IW_IOCTL_IDX(SIOCGIWAP)] = {
+		.header_type	= IW_HEADER_TYPE_ADDR,
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWMLME)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_mlme),
+		.max_tokens	= sizeof(struct iw_mlme),
+	},
+	[IW_IOCTL_IDX(SIOCGIWAPLIST)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= sizeof(struct sockaddr) +
+				  sizeof(struct iw_quality),
+		.max_tokens	= IW_MAX_AP,
+		.flags		= IW_DESCR_FLAG_NOMAX,
+	},
+	[IW_IOCTL_IDX(SIOCSIWSCAN)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= 0,
+		.max_tokens	= sizeof(struct iw_scan_req),
+	},
+	[IW_IOCTL_IDX(SIOCGIWSCAN)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_SCAN_MAX_DATA,
+		.flags		= IW_DESCR_FLAG_NOMAX,
+	},
+	[IW_IOCTL_IDX(SIOCSIWESSID)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_ESSID_MAX_SIZE,
+		.flags		= IW_DESCR_FLAG_EVENT,
+	},
+	[IW_IOCTL_IDX(SIOCGIWESSID)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_ESSID_MAX_SIZE,
+		.flags		= IW_DESCR_FLAG_DUMP,
+	},
+	[IW_IOCTL_IDX(SIOCSIWNICKN)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_ESSID_MAX_SIZE,
+	},
+	[IW_IOCTL_IDX(SIOCGIWNICKN)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_ESSID_MAX_SIZE,
+	},
+	[IW_IOCTL_IDX(SIOCSIWRATE)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWRATE)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWRTS)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWRTS)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWFRAG)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWFRAG)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWTXPOW)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWTXPOW)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWRETRY)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWRETRY)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWENCODE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_ENCODING_TOKEN_MAX,
+		.flags		= IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT,
+	},
+	[IW_IOCTL_IDX(SIOCGIWENCODE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_ENCODING_TOKEN_MAX,
+		.flags		= IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT,
+	},
+	[IW_IOCTL_IDX(SIOCSIWPOWER)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWPOWER)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWGENIE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IW_IOCTL_IDX(SIOCGIWGENIE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IW_IOCTL_IDX(SIOCSIWAUTH)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCGIWAUTH)] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[IW_IOCTL_IDX(SIOCSIWENCODEEXT)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_encode_ext),
+		.max_tokens	= sizeof(struct iw_encode_ext) +
+				  IW_ENCODING_TOKEN_MAX,
+	},
+	[IW_IOCTL_IDX(SIOCGIWENCODEEXT)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_encode_ext),
+		.max_tokens	= sizeof(struct iw_encode_ext) +
+				  IW_ENCODING_TOKEN_MAX,
+	},
+	[IW_IOCTL_IDX(SIOCSIWPMKSA)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_pmksa),
+		.max_tokens	= sizeof(struct iw_pmksa),
+	},
+};
+static const unsigned standard_ioctl_num = ARRAY_SIZE(standard_ioctl);
+
+/*
+ * Meta-data about all the additional standard Wireless Extension events
+ * we know about, indexed by IW_EVENT_IDX(cmd) for commands above SIOCIWLAST.
+ */
+static const struct iw_ioctl_description standard_event[] = {
+	[IW_EVENT_IDX(IWEVTXDROP)] = {
+		.header_type	= IW_HEADER_TYPE_ADDR,
+	},
+	[IW_EVENT_IDX(IWEVQUAL)] = {
+		.header_type	= IW_HEADER_TYPE_QUAL,
+	},
+	[IW_EVENT_IDX(IWEVCUSTOM)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_CUSTOM_MAX,
+	},
+	[IW_EVENT_IDX(IWEVREGISTERED)] = {
+		.header_type	= IW_HEADER_TYPE_ADDR,
+	},
+	[IW_EVENT_IDX(IWEVEXPIRED)] = {
+		.header_type	= IW_HEADER_TYPE_ADDR,
+	},
+	[IW_EVENT_IDX(IWEVGENIE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IW_EVENT_IDX(IWEVMICHAELMICFAILURE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_michaelmicfailure),
+	},
+	[IW_EVENT_IDX(IWEVASSOCREQIE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IW_EVENT_IDX(IWEVASSOCRESPIE)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IW_EVENT_IDX(IWEVPMKIDCAND)] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_pmkid_cand),
+	},
+};
+static const unsigned standard_event_num = ARRAY_SIZE(standard_event);
+
+/* Size (in bytes) of the fixed part of an event, indexed by header_type */
+static const int event_type_size[] = {
+	IW_EV_LCP_LEN,			/* IW_HEADER_TYPE_NULL */
+	0,
+	IW_EV_CHAR_LEN,			/* IW_HEADER_TYPE_CHAR */
+	0,
+	IW_EV_UINT_LEN,			/* IW_HEADER_TYPE_UINT */
+	IW_EV_FREQ_LEN,			/* IW_HEADER_TYPE_FREQ */
+	IW_EV_ADDR_LEN,			/* IW_HEADER_TYPE_ADDR */
+	0,
+	IW_EV_POINT_LEN,		/* Without variable payload */
+	IW_EV_PARAM_LEN,		/* IW_HEADER_TYPE_PARAM */
+	IW_EV_QUAL_LEN,			/* IW_HEADER_TYPE_QUAL */
+};
+
+#ifdef CONFIG_COMPAT
+static const int compat_event_type_size[] = {	/* compat sizes, same header_type index */
+	IW_EV_COMPAT_LCP_LEN,		/* IW_HEADER_TYPE_NULL */
+	0,
+	IW_EV_COMPAT_CHAR_LEN,		/* IW_HEADER_TYPE_CHAR */
+	0,
+	IW_EV_COMPAT_UINT_LEN,		/* IW_HEADER_TYPE_UINT */
+	IW_EV_COMPAT_FREQ_LEN,		/* IW_HEADER_TYPE_FREQ */
+	IW_EV_COMPAT_ADDR_LEN,		/* IW_HEADER_TYPE_ADDR */
+	0,
+	IW_EV_COMPAT_POINT_LEN,		/* Without variable payload */
+	IW_EV_COMPAT_PARAM_LEN,		/* IW_HEADER_TYPE_PARAM */
+	IW_EV_COMPAT_QUAL_LEN,		/* IW_HEADER_TYPE_QUAL */
+};
+#endif
+
+
+/* IW event code */
+
+static int __net_init wext_pernet_init(struct net *net)
+{
+	skb_queue_head_init(&net->wext_nlevents);	/* queue filled by wireless_send_event() */
+	return 0;
+}
+
+static void __net_exit wext_pernet_exit(struct net *net)
+{
+	skb_queue_purge(&net->wext_nlevents);	/* drop events still queued */
+}
+
+static struct pernet_operations wext_pernet_ops = {	/* registered by wireless_nlevent_init() */
+	.init = wext_pernet_init,
+	.exit = wext_pernet_exit,
+};
+
+static int __init wireless_nlevent_init(void)
+{
+	return register_pernet_subsys(&wext_pernet_ops);	/* result is the initcall status */
+}
+
+subsys_initcall(wireless_nlevent_init);
+
+/* Work handler: flush each netns' queued wext events to RTNLGRP_LINK. */
+static void wireless_nlevent_process(struct work_struct *work)
+{
+	struct net *net;
+
+	rtnl_lock();
+	for_each_net(net) {
+		struct sk_buff_head *pending = &net->wext_nlevents;
+		struct sk_buff *skb = skb_dequeue(pending);
+
+		for (; skb; skb = skb_dequeue(pending))
+			rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+				    GFP_KERNEL);
+	}
+	rtnl_unlock();
+}
+
+static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process);
+
+static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev,
+					      struct sk_buff *skb)
+{
+	/* Start an RTM_NEWLINK message for @dev in @skb; NULL on failure. */
+	struct nlmsghdr *hdr = nlmsg_put(skb, 0, 0, RTM_NEWLINK,
+					 sizeof(struct ifinfomsg), 0);
+	struct ifinfomsg *ifm;
+
+	if (!hdr)
+		return NULL;
+	ifm = nlmsg_data(hdr);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->__ifi_pad = 0;
+	ifm->ifi_type = dev->type;
+	ifm->ifi_index = dev->ifindex;
+	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_change = 0;	/* Wireless changes don't affect those flags */
+
+	if (nla_put_string(skb, IFLA_IFNAME, dev->name) < 0) {
+		/* undo the partially built message */
+		nlmsg_cancel(skb, hdr);
+		return NULL;
+	}
+	return hdr;
+}
+
+
+/*
+ * Main event dispatcher. Called from other parts and drivers.
+ * Builds an RTM_NEWLINK message carrying the event as IFLA_WIRELESS and
+ * queues it per netns. May be called from interrupt context.
+ */
+void wireless_send_event(struct net_device *	dev,
+			 unsigned int		cmd,
+			 union iwreq_data *	wrqu,
+			 const char *		extra)
+{
+	const struct iw_ioctl_description *	descr = NULL;
+	int extra_len = 0;
+	struct iw_event  *event;		/* Mallocated whole event */
+	int event_len;				/* Its size */
+	int hdr_len;				/* Size of the event header */
+	int wrqu_off = 0;			/* Offset in wrqu */
+	/* Don't "optimise" the following variable, it will crash */
+	unsigned	cmd_index;		/* *MUST* be unsigned */
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct nlattr *nla;
+#ifdef CONFIG_COMPAT
+	struct __compat_iw_event *compat_event;
+	struct compat_iw_point compat_wrqu;
+	struct sk_buff *compskb;
+#endif
+
+	/*
+	 * Nothing in the kernel sends scan events with data, be safe.
+	 * This is necessary because we cannot fix up scan event data
+	 * for compat, due to being contained in 'extra', but normally
+	 * applications are required to retrieve the scan data anyway
+	 * and no data is included in the event, this codifies that
+	 * practice.
+	 */
+	if (WARN_ON(cmd == SIOCGIWSCAN && extra))
+		extra = NULL;
+
+	/* Get the description of the Event */
+	if (cmd <= SIOCIWLAST) {
+		cmd_index = IW_IOCTL_IDX(cmd);
+		if (cmd_index < standard_ioctl_num)
+			descr = &(standard_ioctl[cmd_index]);
+	} else {
+		cmd_index = IW_EVENT_IDX(cmd);
+		if (cmd_index < standard_event_num)
+			descr = &(standard_event[cmd_index]);
+	}
+	/* Don't accept unknown events */
+	if (descr == NULL) {
+		/* Note : we don't return an error to the driver, because
+		 * the driver would not know what to do about it. It can't
+		 * return an error to the user, because the event is not
+		 * initiated by a user request.
+		 * The best the driver could do is to log an error message.
+		 * We will do it ourselves instead...
+		 */
+		netdev_err(dev, "(WE) : Invalid/Unknown Wireless Event (0x%04X)\n",
+			   cmd);
+		return;
+	}
+
+	/* Check extra parameters and set extra_len */
+	if (descr->header_type == IW_HEADER_TYPE_POINT) {
+		/* Check if number of token fits within bounds */
+		if (wrqu->data.length > descr->max_tokens) {
+			netdev_err(dev, "(WE) : Wireless Event too big (%d)\n",
+				   wrqu->data.length);
+			return;
+		}
+		if (wrqu->data.length < descr->min_tokens) {
+			netdev_err(dev, "(WE) : Wireless Event too small (%d)\n",
+				   wrqu->data.length);
+			return;
+		}
+		/* Calculate extra_len - extra is NULL for restricted events */
+		if (extra != NULL)
+			extra_len = wrqu->data.length * descr->token_size;
+		/* Always at an offset in wrqu */
+		wrqu_off = IW_EV_POINT_OFF;
+	}
+
+	/* Total length of the event */
+	hdr_len = event_type_size[descr->header_type];
+	event_len = hdr_len + extra_len;
+
+	/*
+	 * The problem for 64/32 bit.
+	 *
+	 * On 64-bit, a regular event is laid out as follows:
+	 *      |  0  |  1  |  2  |  3  |  4  |  5  |  6  |  7  |
+	 *      | event.len | event.cmd |     p a d d i n g     |
+	 *      | wrqu data ... (with the correct size)         |
+	 *
+	 * This padding exists because we manipulate event->u,
+	 * and 'event' is not packed.
+	 *
+	 * An iw_point event is laid out like this instead:
+	 *      |  0  |  1  |  2  |  3  |  4  |  5  |  6  |  7  |
+	 *      | event.len | event.cmd |     p a d d i n g     |
+	 *      | iwpnt.len | iwpnt.flg |     p a d d i n g     |
+	 *      | extra data  ...
+	 *
+	 * The second padding exists because struct iw_point is extended,
+	 * but this depends on the platform...
+	 *
+	 * On 32-bit, all the padding shouldn't be there.
+	 */
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	/* Send via the RtNetlink event channel */
+	nlh = rtnetlink_ifinfo_prep(dev, skb);
+	if (WARN_ON(!nlh)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* Add the wireless events in the netlink packet */
+	nla = nla_reserve(skb, IFLA_WIRELESS, event_len);
+	if (!nla) {
+		kfree_skb(skb);
+		return;
+	}
+	event = nla_data(nla);
+
+	/* Fill event - first clear to avoid data leaking */
+	memset(event, 0, hdr_len);
+	event->len = event_len;
+	event->cmd = cmd;
+	memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
+	if (extra_len)
+		memcpy(((char *) event) + hdr_len, extra, extra_len);
+
+	nlmsg_end(skb, nlh);
+#ifdef CONFIG_COMPAT
+	hdr_len = compat_event_type_size[descr->header_type];
+	event_len = hdr_len + extra_len;
+
+	compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!compskb) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* Send via the RtNetlink event channel */
+	nlh = rtnetlink_ifinfo_prep(dev, compskb);
+	if (WARN_ON(!nlh)) {
+		kfree_skb(skb);
+		kfree_skb(compskb);
+		return;
+	}
+
+	/* Add the wireless events in the netlink packet */
+	nla = nla_reserve(compskb, IFLA_WIRELESS, event_len);
+	if (!nla) {
+		kfree_skb(skb);
+		kfree_skb(compskb);
+		return;
+	}
+	compat_event = nla_data(nla);
+
+	compat_event->len = event_len;
+	compat_event->cmd = cmd;
+	if (descr->header_type == IW_HEADER_TYPE_POINT) {
+		compat_wrqu.length = wrqu->data.length;
+		compat_wrqu.flags = wrqu->data.flags;
+		memcpy(&compat_event->pointer,
+			((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF,
+			hdr_len - IW_EV_COMPAT_LCP_LEN);
+		if (extra_len)
+			memcpy(((char *) compat_event) + hdr_len,
+				extra, extra_len);
+	} else {
+		/* extra_len must be zero, so no if (extra) needed */
+		memcpy(&compat_event->pointer, wrqu,
+			hdr_len - IW_EV_COMPAT_LCP_LEN);
+	}
+
+	nlmsg_end(compskb, nlh);
+
+	skb_shinfo(skb)->frag_list = compskb;	/* compat copy is chained to the native skb */
+#endif
+	skb_queue_tail(&dev_net(dev)->wext_nlevents, skb);
+	schedule_work(&wireless_nlevent_work);	/* queue is drained by wireless_nlevent_process() */
+}
+EXPORT_SYMBOL(wireless_send_event);
+
+
+
+/* IW handlers */
+
+struct iw_statistics *get_wireless_stats(struct net_device *dev)
+{
+	/* The driver's own wext hook is tried before the cfg80211 one. */
+#ifdef CONFIG_WIRELESS_EXT
+	if (dev->wireless_handlers &&
+	    dev->wireless_handlers->get_wireless_stats)
+		return dev->wireless_handlers->get_wireless_stats(dev);
+#endif
+#ifdef CONFIG_CFG80211_WEXT
+	if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) {
+		const struct iw_handler_def *def = dev->ieee80211_ptr->wiphy->wext;
+
+		if (def && def->get_wireless_stats)
+			return def->get_wireless_stats(dev);
+	}
+#endif
+	/* neither source provides statistics */
+	return NULL;
+}
+
+static int iw_handler_get_iwstats(struct net_device *		dev,
+				  struct iw_request_info *	info,
+				  union iwreq_data *		wrqu,
+				  char *			extra)
+{
+	/* Fetch the device statistics and copy them into @extra. */
+	struct iw_statistics *stats = get_wireless_stats(dev);
+
+	if (!stats)
+		return -EOPNOTSUPP;
+
+	/* hand back the whole structure and report its size */
+	memcpy(extra, stats, sizeof(*stats));
+	wrqu->data.length = sizeof(*stats);
+
+	/* non-zero request flags ask us to clear the updated bits */
+	if (wrqu->data.flags)
+		stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
+
+	return 0;
+}
+
+static iw_handler get_handler(struct net_device *dev, unsigned int cmd)
+{
+	const struct iw_handler_def *def = NULL;
+	/* Offsets *MUST* be unsigned: a cmd below the range base then wraps
+	 * to a huge value and is rejected by the bound checks below. */
+	unsigned int std_idx = IW_IOCTL_IDX(cmd);
+#ifdef CONFIG_WEXT_PRIV
+	unsigned int priv_idx = cmd - SIOCIWFIRSTPRIV;
+#endif
+
+	/* a driver-provided table takes precedence over the cfg80211 one */
+#ifdef CONFIG_CFG80211_WEXT
+	if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy)
+		def = dev->ieee80211_ptr->wiphy->wext;
+#endif
+#ifdef CONFIG_WIRELESS_EXT
+	if (dev->wireless_handlers)
+		def = dev->wireless_handlers;
+#endif
+	if (!def)
+		return NULL;
+
+	/* standard ioctl range first */
+	if (std_idx < def->num_standard)
+		return def->standard[std_idx];
+#ifdef CONFIG_WEXT_PRIV
+	/* then the driver-private range */
+	if (priv_idx < def->num_private)
+		return def->private[priv_idx];
+#endif
+
+	return NULL;
+}
+
+/*
+ * Handle a standard ioctl whose payload sits behind an iw_point.
+ *
+ * Validates the user-supplied length against the ioctl description,
+ * allocates a zeroed kernel buffer, copies SET data in, calls the
+ * handler, copies GET data back out and sends the wireless event when
+ * the description asks for one.
+ *
+ * Returns the handler's result, or -EFAULT, -E2BIG, -EINVAL or -ENOMEM
+ * when the request is rejected by one of the checks in here.
+ */
+static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
+				   const struct iw_ioctl_description *descr,
+				   iw_handler handler, struct net_device *dev,
+				   struct iw_request_info *info)
+{
+	int err, extra_size, user_length = 0, essid_compat = 0;
+	char *extra;
+
+	/* Calculate space needed by arguments. Always allocate
+	 * for max space.
+	 */
+	extra_size = descr->max_tokens * descr->token_size;
+
+	/* Check need for ESSID compatibility for WE < 21 */
+	switch (cmd) {
+	case SIOCSIWESSID:
+	case SIOCGIWESSID:
+	case SIOCSIWNICKN:
+	case SIOCGIWNICKN:
+		if (iwp->length == descr->max_tokens + 1)
+			essid_compat = 1;
+		else if (IW_IS_SET(cmd) && (iwp->length != 0)) {
+			char essid[IW_ESSID_MAX_SIZE + 1];
+			unsigned int len;
+			len = iwp->length * descr->token_size;
+
+			if (len > IW_ESSID_MAX_SIZE)
+				return -EFAULT;
+
+			err = copy_from_user(essid, iwp->pointer, len);
+			if (err)
+				return -EFAULT;
+
+			/* A trailing NUL counted in the length means the
+			 * caller uses the old convention.
+			 */
+			if (essid[iwp->length - 1] == '\0')
+				essid_compat = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	iwp->length -= essid_compat;
+
+	/* Check what user space is giving us */
+	if (IW_IS_SET(cmd)) {
+		/* Check NULL pointer */
+		if (!iwp->pointer && iwp->length != 0)
+			return -EFAULT;
+		/* Check if number of token fits within bounds */
+		if (iwp->length > descr->max_tokens)
+			return -E2BIG;
+		if (iwp->length < descr->min_tokens)
+			return -EINVAL;
+	} else {
+		/* Check NULL pointer */
+		if (!iwp->pointer)
+			return -EFAULT;
+		/* Save user space buffer size for checking */
+		user_length = iwp->length;
+
+		/* Don't check if user_length > max to allow forward
+		 * compatibility. The test user_length < min is
+		 * implied by the test at the end.
+		 */
+
+		/* Support for very large requests */
+		if ((descr->flags & IW_DESCR_FLAG_NOMAX) &&
+		    (user_length > descr->max_tokens)) {
+			/* Allow userspace to GET more than max so
+			 * we can support any size GET requests.
+			 * There is still a limit : -ENOMEM.
+			 */
+			extra_size = user_length * descr->token_size;
+
+			/* Note : user_length is originally a __u16,
+			 * and token_size is controlled by us,
+			 * so extra_size won't get negative and
+			 * won't overflow...
+			 */
+		}
+	}
+
+	/* kzalloc() ensures NULL-termination for essid_compat. */
+	extra = kzalloc(extra_size, GFP_KERNEL);
+	if (!extra)
+		return -ENOMEM;
+
+	/* From here on every exit must go through 'out' to free 'extra' */
+
+	/* If it is a SET, get all the extra data in here */
+	if (IW_IS_SET(cmd) && (iwp->length != 0)) {
+		if (copy_from_user(extra, iwp->pointer,
+				   iwp->length *
+				   descr->token_size)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		if (cmd == SIOCSIWENCODEEXT) {
+			struct iw_encode_ext *ee = (void *) extra;
+
+			/* The key must fit inside what was copied in */
+			if (iwp->length < sizeof(*ee) + ee->key_len) {
+				err = -EFAULT;
+				goto out;
+			}
+		}
+	}
+
+	if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) {
+		/*
+		 * If this is a GET, but not NOMAX, it means that the extra
+		 * data is not bounded by userspace, but by max_tokens. Thus
+		 * set the length to max_tokens. This matches the extra data
+		 * allocation.
+		 * The driver should fill it with the number of tokens it
+		 * provided, and it may check iwp->length rather than having
+		 * knowledge of max_tokens. If the driver doesn't change the
+		 * iwp->length, this ioctl just copies back max_token tokens
+		 * filled with zeroes. Hopefully the driver isn't claiming
+		 * them to be valid data.
+		 */
+		iwp->length = descr->max_tokens;
+	}
+
+	err = handler(dev, info, (union iwreq_data *) iwp, extra);
+
+	iwp->length += essid_compat;
+
+	/* If we have something to return to the user */
+	if (!err && IW_IS_GET(cmd)) {
+		/* Check if there is enough buffer up there */
+		if (user_length < iwp->length) {
+			err = -E2BIG;
+			goto out;
+		}
+
+		if (copy_to_user(iwp->pointer, extra,
+				 iwp->length *
+				 descr->token_size)) {
+			err = -EFAULT;
+			goto out;
+		}
+	}
+
+	/* Generate an event to notify listeners of the change */
+	if ((descr->flags & IW_DESCR_FLAG_EVENT) &&
+	    ((err == 0) || (err == -EIWCOMMIT))) {
+		union iwreq_data *data = (union iwreq_data *) iwp;
+
+		if (descr->flags & IW_DESCR_FLAG_RESTRICT)
+			/* If the event is restricted, don't
+			 * export the payload.
+			 */
+			wireless_send_event(dev, cmd, data, NULL);
+		else
+			wireless_send_event(dev, cmd, data, extra);
+	}
+
+out:
+	kfree(extra);
+	return err;
+}
+
+/*
+ * Call the commit handler in the driver
+ * (if it exists and if conditions are right)
+ *
+ * Note : our current commit strategy is currently pretty dumb,
+ * but we will be able to improve on that...
+ * The goal is to try to aggregate as many changes as possible
+ * before doing the commit. Drivers that will define a commit handler
+ * are usually those that need a reset after changing parameters, so
+ * we want to minimise the number of resets.
+ * A cool idea is to use a timer : at each "set" command, we re-set the
+ * timer, when the timer eventually fires, we call the driver.
+ * Hopefully, more on that later.
+ *
+ * Also, I'm waiting to see how many people will complain about the
+ * netif_running(dev) test. I'm open on that one...
+ * Hopefully, the driver will remember to do a commit in "open()" ;-)
+ *
+ * Returns the commit handler's result, or 0 when no handler is called.
+ */
+int call_commit_handler(struct net_device *dev)
+{
+#ifdef CONFIG_WIRELESS_EXT
+	/* dev->wireless_handlers may be NULL (get_handler() can take the
+	 * table from the wiphy instead), so test it before slot 0. */
+	if (netif_running(dev) && dev->wireless_handlers &&
+	    dev->wireless_handlers->standard[0] != NULL)
+		/* Call the commit handler on the driver */
+		return dev->wireless_handlers->standard[0](dev, NULL,
+							   NULL, NULL);
+	else
+		return 0;		/* Command completed successfully */
+#else
+	/* cfg80211 has no commit */
+	return 0;
+#endif
+}
+
+/*
+ * Main ioctl dispatcher.
+ * Finds the device named in the request and routes the command: the
+ * two built-in handlers first, then the device's handler table, and
+ * last the driver's own ndo_do_ioctl hook.
+ */
+static int wireless_process_ioctl(struct net *net, struct ifreq *ifr,
+				  unsigned int cmd,
+				  struct iw_request_info *info,
+				  wext_ioctl_func standard,
+				  wext_ioctl_func private)
+{
+	struct iwreq *iwr = (struct iwreq *) ifr;
+	struct net_device *dev;
+	iw_handler handler;
+
+	/* Permissions are already checked in dev_ioctl() before calling us.
+	 * The copy_to/from_user() of ifr is also dealt with in there */
+
+	/* The device must exist */
+	dev = __dev_get_by_name(net, ifr->ifr_name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Special cases come first, the generic case follows.
+	 * Note that 'cmd' is already filtered in dev_ioctl() with
+	 * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */
+	if (cmd == SIOCGIWSTATS)
+		return standard(dev, iwr, cmd, info, &iw_handler_get_iwstats);
+
+#ifdef CONFIG_WEXT_PRIV
+	if (cmd == SIOCGIWPRIV && dev->wireless_handlers)
+		return standard(dev, iwr, cmd, info, iw_handler_get_private);
+#endif
+
+	/* Basic check */
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	/* New driver API : try to find the handler */
+	handler = get_handler(dev, cmd);
+
+	/* Standard and private commands use different wrappers */
+	if (handler && cmd < SIOCIWFIRSTPRIV)
+		return standard(dev, iwr, cmd, info, handler);
+	if (handler && private)
+		return private(dev, iwr, cmd, info, handler);
+
+	/* Old driver API : call driver ioctl handler */
+	if (dev->netdev_ops->ndo_do_ioctl)
+		return dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd);
+
+	return -EOPNOTSUPP;
+}
+
+/* Every SET command, and the two GET commands that return encoding
+ * parameters, require CAP_NET_ADMIN. Returns 0 if allowed, -EPERM if not.
+ */
+static int wext_permission_check(unsigned int cmd)
+{
+	bool restricted = IW_IS_SET(cmd) ||
+			  cmd == SIOCGIWENCODE ||
+			  cmd == SIOCGIWENCODEEXT;
+
+	if (!restricted)
+		return 0;
+
+	return capable(CAP_NET_ADMIN) ? 0 : -EPERM;
+}
+
+/* Entry point from dev ioctl: check permissions, then run the request
+ * through wireless_process_ioctl() with the rtnl lock held.
+ */
+static int wext_ioctl_dispatch(struct net *net, struct ifreq *ifr,
+			       unsigned int cmd, struct iw_request_info *info,
+			       wext_ioctl_func standard,
+			       wext_ioctl_func private)
+{
+	int ret;
+
+	ret = wext_permission_check(cmd);
+	if (ret != 0)
+		return ret;
+
+	/* dev_load() is called before the rtnl lock is taken */
+	dev_load(net, ifr->ifr_name);
+
+	rtnl_lock();
+	ret = wireless_process_ioctl(net, ifr, cmd, info, standard, private);
+	rtnl_unlock();
+
+	return ret;
+}
+
+/*
+ * Wrapper around a standard Wireless Extension handler.
+ * Looks up the ioctl description, calls the handler directly or via
+ * ioctl_standard_iw_point() when the payload is an iw_point, and runs
+ * the commit handler when the handler returned -EIWCOMMIT.
+ */
+static int ioctl_standard_call(struct net_device *	dev,
+			       struct iwreq		*iwr,
+			       unsigned int		cmd,
+			       struct iw_request_info	*info,
+			       iw_handler		handler)
+{
+	const struct iw_ioctl_description *desc;
+	unsigned int idx = IW_IOCTL_IDX(cmd);
+	int rc;
+
+	/* Commands without a description are not supported */
+	if (idx >= standard_ioctl_num)
+		return -EOPNOTSUPP;
+	desc = &standard_ioctl[idx];
+
+	if (desc->header_type == IW_HEADER_TYPE_POINT) {
+		/* Payload lives in user space behind a pointer */
+		rc = ioctl_standard_iw_point(&iwr->u.data, cmd, desc,
+					     handler, dev, info);
+	} else {
+		/* Everything fits in the request itself */
+		rc = handler(dev, info, &iwr->u, NULL);
+
+		/* Notify listeners of the change */
+		if ((desc->flags & IW_DESCR_FLAG_EVENT) &&
+		    (rc == 0 || rc == -EIWCOMMIT))
+			wireless_send_event(dev, cmd, &iwr->u, NULL);
+	}
+
+	/* The handler asked for a commit */
+	if (rc == -EIWCOMMIT)
+		rc = call_commit_handler(dev);
+
+	return rc;
+}
+
+
+/*
+ * Native ioctl entry point: dispatch the request and, for a successful
+ * GET, copy the updated iwreq back to 'arg'.
+ */
+int wext_handle_ioctl(struct net *net, struct ifreq *ifr, unsigned int cmd,
+		      void __user *arg)
+{
+	struct iw_request_info info = { .cmd = cmd, .flags = 0 };
+	int ret;
+
+	ret = wext_ioctl_dispatch(net, ifr, cmd, &info,
+				  ioctl_standard_call,
+				  ioctl_private_call);
+
+	/* Nothing to hand back on failure or for a SET */
+	if (ret < 0 || !IW_IS_GET(cmd))
+		return ret;
+
+	if (copy_to_user(arg, ifr, sizeof(struct iwreq)))
+		return -EFAULT;
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat wrapper for standard ioctls. Requests that do not carry an
+ * iw_point go straight to ioctl_standard_call(); for the others the
+ * compat_iw_point is converted to a native iw_point, processed, and
+ * converted back.
+ */
+static int compat_standard_call(struct net_device	*dev,
+				struct iwreq		*iwr,
+				unsigned int		cmd,
+				struct iw_request_info	*info,
+				iw_handler		handler)
+{
+	const struct iw_ioctl_description *descr;
+	struct compat_iw_point *iwp_compat;
+	struct iw_point iwp;
+	int err;
+
+	/* Same bound as in ioctl_standard_call(): commands that have no
+	 * entry in standard_ioctl[] must not index past the table. */
+	if (IW_IOCTL_IDX(cmd) >= standard_ioctl_num)
+		return -EOPNOTSUPP;
+	descr = standard_ioctl + IW_IOCTL_IDX(cmd);
+
+	if (descr->header_type != IW_HEADER_TYPE_POINT)
+		return ioctl_standard_call(dev, iwr, cmd, info, handler);
+
+	/* Build a native iw_point from the compat layout */
+	iwp_compat = (struct compat_iw_point *) &iwr->u.data;
+	iwp.pointer = compat_ptr(iwp_compat->pointer);
+	iwp.length = iwp_compat->length;
+	iwp.flags = iwp_compat->flags;
+
+	err = ioctl_standard_iw_point(&iwp, cmd, descr, handler, dev, info);
+
+	/* Write the (possibly updated) fields back in compat layout */
+	iwp_compat->pointer = ptr_to_compat(iwp.pointer);
+	iwp_compat->length = iwp.length;
+	iwp_compat->flags = iwp.flags;
+
+	return err;
+}
+
+/*
+ * Compat ioctl entry point: fetch the iwreq from user space, cut the
+ * interface name at the first ':', dispatch through the compat
+ * wrappers and, for a successful GET, copy the iwreq back.
+ */
+int compat_wext_handle_ioctl(struct net *net, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct iw_request_info info = {
+		.cmd = cmd,
+		.flags = IW_REQUEST_FLAG_COMPAT,
+	};
+	void __user *uptr = (void __user *)arg;
+	struct iwreq req;
+	char *sep;
+	int rc;
+
+	if (copy_from_user(&req, uptr, sizeof(struct iwreq)))
+		return -EFAULT;
+
+	/* Force termination, then drop anything from ':' onwards */
+	req.ifr_name[IFNAMSIZ-1] = 0;
+	sep = strchr(req.ifr_name, ':');
+	if (sep != NULL)
+		*sep = 0;
+
+	rc = wext_ioctl_dispatch(net, (struct ifreq *) &req, cmd, &info,
+				 compat_standard_call,
+				 compat_private_call);
+
+	/* Nothing to hand back on failure or for a SET */
+	if (rc < 0 || !IW_IS_GET(cmd))
+		return rc;
+
+	if (copy_to_user(uptr, &req, sizeof(struct iwreq)))
+		return -EFAULT;
+
+	return rc;
+}
+#endif
diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c
new file mode 100644
index 00000000..674d426a
--- /dev/null
+++ b/net/wireless/wext-priv.c
@@ -0,0 +1,249 @@
+/*
+ * This file implements the Wireless Extensions priv API.
+ *
+ * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ * Copyright	2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * (As all part of the Linux kernel, this file is GPL)
+ */
+#include <linux/slab.h>
+#include <linux/wireless.h>
+#include <linux/netdevice.h>
+#include <net/iw_handler.h>
+#include <net/wext.h>
+
+/*
+ * Copy the driver's private ioctl descriptions into 'extra'.
+ * Returns -EOPNOTSUPP if the driver exports none, and -E2BIG (with the
+ * required count stored in wrqu->data.length) if the caller's buffer
+ * holds fewer entries than the driver has.
+ */
+int iw_handler_get_private(struct net_device *dev,
+			   struct iw_request_info *info,
+			   union iwreq_data *wrqu,
+			   char *extra)
+{
+	const struct iw_handler_def *def = dev->wireless_handlers;
+
+	/* Does the driver export anything at all? */
+	if (def->num_private_args == 0 || def->private_args == NULL)
+		return -EOPNOTSUPP;
+
+	if (wrqu->data.length < def->num_private_args) {
+		/* User space can't know in advance how large the buffer
+		 * needs to be, so report the count it should retry with. */
+		wrqu->data.length = def->num_private_args;
+		return -E2BIG;
+	}
+
+	/* Report how many entries are returned, then copy them out */
+	wrqu->data.length = def->num_private_args;
+	memcpy(extra, def->private_args,
+	       sizeof(struct iw_priv_args) * wrqu->data.length);
+
+	return 0;
+}
+
+/* Size (in bytes) of one element of each private data type.
+ * Indexed by the type field of an argument descriptor, that is
+ * (args & IW_PRIV_TYPE_MASK) >> 12, as computed in get_priv_size()
+ * and adjust_priv_size(). */
+static const char iw_priv_type_size[] = {
+	0,				/* IW_PRIV_TYPE_NONE */
+	1,				/* IW_PRIV_TYPE_BYTE */
+	1,				/* IW_PRIV_TYPE_CHAR */
+	0,				/* index 3: not defined */
+	sizeof(__u32),			/* IW_PRIV_TYPE_INT */
+	sizeof(struct iw_freq),		/* IW_PRIV_TYPE_FLOAT */
+	sizeof(struct sockaddr),	/* IW_PRIV_TYPE_ADDR */
+	0,				/* index 7: not defined */
+};
+
+/* Byte size described by an argument descriptor: count times type size */
+static int get_priv_size(__u16 args)
+{
+	int type = (args & IW_PRIV_TYPE_MASK) >> 12;
+	int count = args & IW_PRIV_SIZE_MASK;
+
+	return count * iw_priv_type_size[type];
+}
+
+/* Byte size of what the handler reported in iwp->length, capped at the
+ * element count allowed by the argument descriptor.
+ */
+static int adjust_priv_size(__u16 args, struct iw_point *iwp)
+{
+	int type = (args & IW_PRIV_TYPE_MASK) >> 12;
+	int limit = args & IW_PRIV_SIZE_MASK;
+	int count = iwp->length;
+
+	/* Make sure the driver doesn't goof up */
+	if (count > limit)
+		count = limit;
+
+	return count * iw_priv_type_size[type];
+}
+
+/*
+ * Wrapper to call a private Wireless Extension handler.
+ * We do various checks and also take care of moving data between
+ * user space and kernel space.
+ * It's not as nice and slimline as the standard wrapper. The cause
+ * is struct iw_priv_args, which was not really designed for the
+ * job we are doing here.
+ *
+ * IMPORTANT : This function prevents setting and getting data on the
+ * same IOCTL and enforces the SET/GET convention. Not doing so would
+ * be far too hairy...
+ * If you need to set and get data at the same time, please don't use
+ * a iw_handler but process it in your ioctl handler (i.e. use the
+ * old driver API).
+ */
+static int get_priv_descr_and_size(struct net_device *dev, unsigned int cmd,
+				   const struct iw_priv_args **descrp)
+{
+	const struct iw_handler_def *def = dev->wireless_handlers;
+	const struct iw_priv_args *found = NULL;
+	int size, n;
+
+	/* Find the description of this private command, if any.
+	 * *descrp is left NULL when there is none. */
+	for (n = 0; n < def->num_private_args; n++) {
+		if (def->private_args[n].cmd == cmd) {
+			found = &def->private_args[n];
+			break;
+		}
+	}
+	*descrp = found;
+
+	if (found == NULL)
+		return 0;
+
+	if (IW_IS_SET(cmd)) {
+		/* An empty name marks a sub-ioctl handler:
+		 * reserve one int for the sub-ioctl index */
+		int offset = (found->name[0] == '\0') ? sizeof(__u32) : 0;
+
+		/* Size of set arguments */
+		size = get_priv_size(found->set_args);
+
+		/* Fixed-size arguments that fit in iwr need no buffer */
+		if ((found->set_args & IW_PRIV_SIZE_FIXED) &&
+		    (size + offset) <= IFNAMSIZ)
+			size = 0;
+	} else {
+		/* Size of get arguments */
+		size = get_priv_size(found->get_args);
+
+		/* Fixed-size arguments that fit in iwr need no buffer */
+		if ((found->get_args & IW_PRIV_SIZE_FIXED) &&
+		    size <= IFNAMSIZ)
+			size = 0;
+	}
+
+	return size;
+}
+
+/*
+ * Run a private handler whose arguments do not fit in the iwreq:
+ * allocate a zeroed buffer of extra_size bytes, copy SET data in, call
+ * the handler and copy GET data back out.
+ */
+static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd,
+				  const struct iw_priv_args *descr,
+				  iw_handler handler, struct net_device *dev,
+				  struct iw_request_info *info, int extra_size)
+{
+	char *buf;
+	int ret;
+
+	/* Check what user space is giving us */
+	if (IW_IS_SET(cmd)) {
+		if (iwp->length != 0 && !iwp->pointer)
+			return -EFAULT;
+
+		if (iwp->length > (descr->set_args & IW_PRIV_SIZE_MASK))
+			return -E2BIG;
+	} else if (!iwp->pointer) {
+		return -EFAULT;
+	}
+
+	buf = kzalloc(extra_size, GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	/* A SET with data: pull the arguments in */
+	if (IW_IS_SET(cmd) && iwp->length != 0) {
+		if (copy_from_user(buf, iwp->pointer, extra_size)) {
+			ret = -EFAULT;
+			goto free_buf;
+		}
+	}
+
+	ret = handler(dev, info, (union iwreq_data *) iwp, buf);
+
+	/* A successful GET: push the result out */
+	if (ret == 0 && IW_IS_GET(cmd)) {
+		/* Adjust for the actual length if it's variable,
+		 * avoid leaking kernel bits outside.
+		 */
+		if (!(descr->get_args & IW_PRIV_SIZE_FIXED))
+			extra_size = adjust_priv_size(descr->get_args, iwp);
+
+		if (copy_to_user(iwp->pointer, buf, extra_size))
+			ret = -EFAULT;
+	}
+
+free_buf:
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * Wrapper around a private handler: pass the iwreq union itself as the
+ * argument buffer when no extra space is needed, otherwise go through
+ * ioctl_private_iw_point(). Runs the commit handler on -EIWCOMMIT.
+ */
+int ioctl_private_call(struct net_device *dev, struct iwreq *iwr,
+		       unsigned int cmd, struct iw_request_info *info,
+		       iw_handler handler)
+{
+	const struct iw_priv_args *descr;
+	int size, rc;
+
+	size = get_priv_descr_and_size(dev, cmd, &descr);
+
+	if (size != 0)
+		rc = ioctl_private_iw_point(&iwr->u.data, cmd, descr,
+					    handler, dev, info, size);
+	else
+		/* No extra arguments. Trivial to handle */
+		rc = handler(dev, info, &(iwr->u), (char *) &(iwr->u));
+
+	/* Call commit handler if needed and defined */
+	if (rc == -EIWCOMMIT)
+		rc = call_commit_handler(dev);
+
+	return rc;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat variant of ioctl_private_call(): when an argument buffer is
+ * needed, the compat_iw_point is converted to a native iw_point
+ * around the call and converted back afterwards.
+ */
+int compat_private_call(struct net_device *dev, struct iwreq *iwr,
+			unsigned int cmd, struct iw_request_info *info,
+			iw_handler handler)
+{
+	const struct iw_priv_args *descr;
+	int size, rc;
+
+	size = get_priv_descr_and_size(dev, cmd, &descr);
+
+	if (size != 0) {
+		struct compat_iw_point *cpt =
+			(struct compat_iw_point *) &iwr->u.data;
+		struct iw_point native;
+
+		native.pointer = compat_ptr(cpt->pointer);
+		native.length = cpt->length;
+		native.flags = cpt->flags;
+
+		rc = ioctl_private_iw_point(&native, cmd, descr,
+					    handler, dev, info, size);
+
+		cpt->pointer = ptr_to_compat(native.pointer);
+		cpt->length = native.length;
+		cpt->flags = native.flags;
+	} else {
+		/* No extra arguments. Trivial to handle */
+		rc = handler(dev, info, &(iwr->u), (char *) &(iwr->u));
+	}
+
+	/* Call commit handler if needed and defined */
+	if (rc == -EIWCOMMIT)
+		rc = call_commit_handler(dev);
+
+	return rc;
+}
+#endif
diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c
new file mode 100644
index 00000000..8bafa31f
--- /dev/null
+++ b/net/wireless/wext-proc.c
@@ -0,0 +1,155 @@
+/*
+ * This file implements the Wireless Extensions proc API.
+ *
+ * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ *
+ * (As all part of the Linux kernel, this file is GPL)
+ */
+
+/*
+ * The /proc/net/wireless file is a human readable user-space interface
+ * exporting various wireless specific statistics from the wireless devices.
+ * This is the most popular part of the Wireless Extensions ;-)
+ *
+ * This interface is a pure clone of /proc/net/dev (in net/core/dev.c).
+ * The content of the file is basically the content of "struct iw_statistics".
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/wireless.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/iw_handler.h>
+#include <net/wext.h>
+
+
+/*
+ * Print one /proc/net/wireless line for 'dev'. Devices with no stats
+ * but with wireless handlers or an ieee80211_ptr are printed with
+ * all-zero values; other devices without stats print nothing.
+ */
+static void wireless_seq_printf_stats(struct seq_file *seq,
+				      struct net_device *dev)
+{
+	static struct iw_statistics nullstats = {};
+	struct iw_statistics *stats;
+	int dbm_bias;
+
+	/* Get stats from the driver */
+	stats = get_wireless_stats(dev);
+
+	/* show device if it's wireless regardless of current stats */
+	if (stats == NULL) {
+#ifdef CONFIG_WIRELESS_EXT
+		if (dev->wireless_handlers)
+			stats = &nullstats;
+#endif
+#ifdef CONFIG_CFG80211
+		if (dev->ieee80211_ptr)
+			stats = &nullstats;
+#endif
+	}
+
+	if (stats == NULL)
+		return;
+
+	/* Level and noise are shifted down by 0x100 when IW_QUAL_DBM is set */
+	dbm_bias = (stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0;
+
+	seq_printf(seq, "%6s: %04x  %3d%c  %3d%c  %3d%c  %6d %6d %6d "
+			"%6d %6d   %6d\n",
+		   dev->name, stats->status, stats->qual.qual,
+		   (stats->qual.updated & IW_QUAL_QUAL_UPDATED) ? '.' : ' ',
+		   ((__s32) stats->qual.level) - dbm_bias,
+		   (stats->qual.updated & IW_QUAL_LEVEL_UPDATED) ? '.' : ' ',
+		   ((__s32) stats->qual.noise) - dbm_bias,
+		   (stats->qual.updated & IW_QUAL_NOISE_UPDATED) ? '.' : ' ',
+		   stats->discard.nwid, stats->discard.code,
+		   stats->discard.fragment, stats->discard.retries,
+		   stats->discard.misc, stats->miss.beacon);
+
+	/* Real stats have been shown: clear their "updated" flags */
+	if (stats != &nullstats)
+		stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Print one /proc/net/wireless record: the two header lines for the
+ * start token, one statistics line for a device.
+ */
+static int wireless_dev_seq_show(struct seq_file *seq, void *v)
+{
+	might_sleep();
+
+	if (v != SEQ_START_TOKEN) {
+		wireless_seq_printf_stats(seq, v);
+		return 0;
+	}
+
+	seq_printf(seq, "Inter-| sta-|   Quality        |   Discarded "
+			"packets               | Missed | WE\n"
+			" face | tus | link level noise |  nwid  "
+			"crypt   frag  retry   misc | beacon | %d\n",
+		   WIRELESS_EXT);
+	return 0;
+}
+
+/*
+ * seq_file start: take the rtnl lock and return the record at *pos.
+ * Position 0 is the header token; position N (N >= 1) is the N-th
+ * device of the namespace, or NULL past the end.
+ */
+static void *wireless_dev_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+	struct net_device *dev;
+	loff_t idx = 0;
+
+	rtnl_lock();
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for_each_netdev(net, dev) {
+		if (++idx == *pos)
+			return dev;
+	}
+
+	return NULL;
+}
+
+/* seq_file next: advance *pos and step from the header token to the
+ * first device, or from one device to the one after it.
+ */
+static void *wireless_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct net *net = seq_file_net(seq);
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return first_net_device(net);
+
+	return next_net_device(v);
+}
+
+static void wireless_dev_seq_stop(struct seq_file *seq, void *v)
+{
+	rtnl_unlock();	/* pairs with rtnl_lock() in wireless_dev_seq_start() */
+}
+
+static const struct seq_operations wireless_seq_ops = {
+	.start = wireless_dev_seq_start,	/* takes rtnl_lock() */
+	.next  = wireless_dev_seq_next,
+	.stop  = wireless_dev_seq_stop,		/* calls rtnl_unlock() */
+	.show  = wireless_dev_seq_show,
+};
+
+/* Open handler: bind wireless_seq_ops to the file via seq_open_net() */
+static int seq_open_wireless(struct inode *inode, struct file *file)
+{
+	const int priv_size = sizeof(struct seq_net_private);
+
+	return seq_open_net(inode, file, &wireless_seq_ops, priv_size);
+}
+
+static const struct file_operations wireless_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = seq_open_wireless,	/* binds wireless_seq_ops */
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_net,
+};
+
+/* Create the "wireless" proc entry for 'net'; -ENOMEM on failure */
+int __net_init wext_proc_init(struct net *net)
+{
+	if (proc_net_fops_create(net, "wireless", S_IRUGO, &wireless_seq_fops))
+		return 0;
+
+	return -ENOMEM;
+}
+
+void __net_exit wext_proc_exit(struct net *net)
+{
+	proc_net_remove(net, "wireless");	/* entry created by wext_proc_init() */
+}
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
new file mode 100644
index 00000000..6fffe62d
--- /dev/null
+++ b/net/wireless/wext-sme.c
@@ -0,0 +1,405 @@
+/*
+ * cfg80211 wext compat for managed mode.
+ *
+ * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2009   Intel Corporation. All rights reserved.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/slab.h>
+#include <net/cfg80211.h>
+#include "wext-compat.h"
+#include "nl80211.h"
+
+/*
+ * Start a connection from the parameters collected in wdev->wext.
+ * Does nothing (returns 0) while the interface is down or no SSID is
+ * set. Expects the rdev and wdev locks to be held (asserted below).
+ */
+int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
+			      struct wireless_dev *wdev)
+{
+	struct cfg80211_cached_keys *keys_copy = NULL;
+	const u8 *old_bssid = NULL;
+	int ret, k;
+
+	ASSERT_RDEV_LOCK(rdev);
+	ASSERT_WDEV_LOCK(wdev);
+
+	if (!netif_running(wdev->netdev))
+		return 0;
+
+	/* Refresh the connect parameters from the stored wext state */
+	wdev->wext.connect.ie = wdev->wext.ie;
+	wdev->wext.connect.ie_len = wdev->wext.ie_len;
+
+	if (wdev->wext.keys != NULL) {
+		wdev->wext.keys->def = wdev->wext.default_key;
+		wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
+		if (wdev->wext.default_key != -1)
+			wdev->wext.connect.privacy = true;
+	}
+
+	/* Without an SSID there is nothing to connect to */
+	if (wdev->wext.connect.ssid_len == 0)
+		return 0;
+
+	if (wdev->wext.keys != NULL) {
+		keys_copy = kmemdup(wdev->wext.keys, sizeof(*keys_copy),
+				    GFP_KERNEL);
+		if (keys_copy == NULL)
+			return -ENOMEM;
+		/* Point the copied params at the copy's own key data */
+		for (k = 0; k < 6; k++)
+			keys_copy->params[k].key = keys_copy->data[k];
+	}
+
+	if (wdev->wext.prev_bssid_valid)
+		old_bssid = wdev->wext.prev_bssid;
+
+	ret = __cfg80211_connect(rdev, wdev->netdev,
+				 &wdev->wext.connect, keys_copy, old_bssid);
+	/* The copy is only freed here when the connect call failed */
+	if (ret != 0)
+		kfree(keys_copy);
+
+	return ret;
+}
+
+/*
+ * Set-frequency handler for station interfaces. Disconnects first if
+ * the SME is busy on a different channel, records the new channel, and
+ * then either just switches channel (no SSID set) or reconnects.
+ */
+int cfg80211_mgd_wext_siwfreq(struct net_device *dev,
+			      struct iw_request_info *info,
+			      struct iw_freq *wextfreq, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	struct ieee80211_channel *target = NULL;
+	int req_freq, ret;
+
+	/* call only for station! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return -EINVAL;
+
+	req_freq = cfg80211_wext_freq(wdev->wiphy, wextfreq);
+	if (req_freq < 0)
+		return req_freq;
+
+	/* A zero result leaves the target channel unset (NULL) */
+	if (req_freq != 0) {
+		target = ieee80211_get_channel(wdev->wiphy, req_freq);
+		if (!target || (target->flags & IEEE80211_CHAN_DISABLED))
+			return -EINVAL;
+	}
+
+	cfg80211_lock_rdev(rdev);
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+
+	if (wdev->sme_state != CFG80211_SME_IDLE) {
+		/* Already on the requested channel: nothing to do */
+		if (wdev->wext.connect.channel == target) {
+			ret = 0;
+			goto unlock;
+		}
+
+		/* if SSID set, we'll try right again, avoid event */
+		ret = __cfg80211_disconnect(rdev, dev,
+					    WLAN_REASON_DEAUTH_LEAVING,
+					    !wdev->wext.connect.ssid_len);
+		if (ret)
+			goto unlock;
+	}
+
+	wdev->wext.connect.channel = target;
+
+	if (target && !wdev->wext.connect.ssid_len)
+		/* SSID is not set, we just want to switch channel */
+		ret = cfg80211_set_freq(rdev, wdev, req_freq,
+					NL80211_CHAN_NO_HT);
+	else
+		ret = cfg80211_mgd_wext_connect(rdev, wdev);
+
+ unlock:
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+	cfg80211_unlock_rdev(rdev);
+	return ret;
+}
+
+/*
+ * Get-frequency handler for station interfaces: report the channel of
+ * the current BSS, or else the configured connect channel. Returns
+ * -EINVAL when neither is available.
+ */
+int cfg80211_mgd_wext_giwfreq(struct net_device *dev,
+			      struct iw_request_info *info,
+			      struct iw_freq *freq, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct ieee80211_channel *cur;
+
+	/* call only for station! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return -EINVAL;
+
+	wdev_lock(wdev);
+	if (wdev->current_bss)
+		cur = wdev->current_bss->pub.channel;
+	else
+		cur = wdev->wext.connect.channel;
+	wdev_unlock(wdev);
+
+	/* no channel if not joining */
+	if (cur == NULL)
+		return -EINVAL;
+
+	freq->m = cur->center_freq;
+	freq->e = 6;
+	return 0;
+}
+
+/*
+ * Set-ESSID handler for station interfaces. Stores the new SSID in
+ * wdev->wext (disconnecting first if the SME is busy with a different
+ * one) and then attempts to connect.
+ */
+int cfg80211_mgd_wext_siwessid(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_point *data, char *ssid)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	size_t ssid_len;
+	int ret = 0;
+
+	/* call only for station! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return -EINVAL;
+
+	/* Zero flags are treated as an empty SSID */
+	ssid_len = data->flags ? data->length : 0;
+
+	/* iwconfig uses nul termination in SSID.. */
+	if (ssid_len > 0 && ssid[ssid_len - 1] == '\0')
+		ssid_len--;
+
+	cfg80211_lock_rdev(rdev);
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+
+	if (wdev->sme_state != CFG80211_SME_IDLE) {
+		/* Same SSID as the one already in use: nothing to do */
+		if (wdev->wext.connect.ssid && ssid_len &&
+		    ssid_len == wdev->wext.connect.ssid_len &&
+		    memcmp(wdev->wext.connect.ssid, ssid, ssid_len) == 0)
+			goto unlock;
+
+		/* if SSID set now, we'll try to connect, avoid event */
+		ret = __cfg80211_disconnect(rdev, dev,
+					    WLAN_REASON_DEAUTH_LEAVING,
+					    ssid_len == 0);
+		if (ret)
+			goto unlock;
+	}
+
+	wdev->wext.prev_bssid_valid = false;
+	wdev->wext.connect.ssid = wdev->wext.ssid;
+	memcpy(wdev->wext.ssid, ssid, ssid_len);
+	wdev->wext.connect.ssid_len = ssid_len;
+
+	wdev->wext.connect.crypto.control_port = false;
+	wdev->wext.connect.crypto.control_port_ethertype =
+					cpu_to_be16(ETH_P_PAE);
+
+	ret = cfg80211_mgd_wext_connect(rdev, wdev);
+
+ unlock:
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+	cfg80211_unlock_rdev(rdev);
+	return ret;
+}
+
+/*
+ * Get-ESSID handler for station interfaces. Copies the SSID of the
+ * current BSS into 'ssid', or else the configured SSID; data->flags
+ * is 1 when an SSID was stored and 0 otherwise. Always returns 0 for
+ * a station interface.
+ */
+int cfg80211_mgd_wext_giwessid(struct net_device *dev,
+			       struct iw_request_info *info,
+			       struct iw_point *data, char *ssid)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	/* call only for station! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return -EINVAL;
+
+	data->flags = 0;
+
+	wdev_lock(wdev);
+	if (wdev->current_bss) {
+		const u8 *ie = ieee80211_bss_get_ie(&wdev->current_bss->pub,
+						    WLAN_EID_SSID);
+		if (ie) {
+			data->flags = 1;
+			data->length = ie[1];
+			/* ie[1] is a raw length byte from the element and
+			 * may exceed a valid SSID length; never copy more
+			 * than IW_ESSID_MAX_SIZE bytes into 'ssid'. */
+			if (data->length > IW_ESSID_MAX_SIZE)
+				data->length = IW_ESSID_MAX_SIZE;
+			memcpy(ssid, ie + 2, data->length);
+		}
+	} else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) {
+		data->flags = 1;
+		data->length = wdev->wext.connect.ssid_len;
+		memcpy(ssid, wdev->wext.connect.ssid, data->length);
+	}
+	wdev_unlock(wdev);
+
+	return 0;
+}
+
+/*
+ * Set-AP handler for station interfaces. An all-zero or broadcast
+ * address selects automatic mode (no fixed BSSID). Disconnects first
+ * if the SME is busy and the BSSID setting changes, then reconnects.
+ */
+int cfg80211_mgd_wext_siwap(struct net_device *dev,
+			    struct iw_request_info *info,
+			    struct sockaddr *ap_addr, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	u8 *want = ap_addr->sa_data;
+	int ret;
+
+	/* call only for station! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return -EINVAL;
+
+	if (ap_addr->sa_family != ARPHRD_ETHER)
+		return -EINVAL;
+
+	/* automatic mode */
+	if (is_zero_ether_addr(want) || is_broadcast_ether_addr(want))
+		want = NULL;
+
+	cfg80211_lock_rdev(rdev);
+	mutex_lock(&rdev->devlist_mtx);
+	wdev_lock(wdev);
+
+	if (wdev->sme_state != CFG80211_SME_IDLE) {
+		const u8 *have = wdev->wext.connect.bssid;
+
+		ret = 0;
+
+		/* both automatic */
+		if (want == NULL && have == NULL)
+			goto unlock;
+
+		/* fixed already - and no change */
+		if (have && want && compare_ether_addr(want, have) == 0)
+			goto unlock;
+
+		ret = __cfg80211_disconnect(rdev, dev,
+					    WLAN_REASON_DEAUTH_LEAVING, false);
+		if (ret)
+			goto unlock;
+	}
+
+	if (want == NULL) {
+		wdev->wext.connect.bssid = NULL;
+	} else {
+		memcpy(wdev->wext.bssid, want, ETH_ALEN);
+		wdev->wext.connect.bssid = wdev->wext.bssid;
+	}
+
+	ret = cfg80211_mgd_wext_connect(rdev, wdev);
+
+ unlock:
+	wdev_unlock(wdev);
+	mutex_unlock(&rdev->devlist_mtx);
+	cfg80211_unlock_rdev(rdev);
+	return ret;
+}
+
+/*
+ * Get-AP handler for station interfaces: report the BSSID of the
+ * current BSS, or an all-zero address when there is none.
+ */
+int cfg80211_mgd_wext_giwap(struct net_device *dev,
+			    struct iw_request_info *info,
+			    struct sockaddr *ap_addr, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+
+	/* call only for station! */
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+		return -EINVAL;
+
+	ap_addr->sa_family = ARPHRD_ETHER;
+
+	wdev_lock(wdev);
+	if (!wdev->current_bss)
+		memset(ap_addr->sa_data, 0, ETH_ALEN);
+	else
+		memcpy(ap_addr->sa_data, wdev->current_bss->pub.bssid,
+		       ETH_ALEN);
+	wdev_unlock(wdev);
+
+	return 0;
+}
+
+/*
+ * Set-generic-IE handler. Replaces the stored IE buffer with a copy of
+ * the one supplied (or clears it for a zero length) and, if the SME is
+ * not idle, disconnects. Returns 0 without touching anything when the
+ * new IEs equal the stored ones.
+ */
+int cfg80211_wext_siwgenie(struct net_device *dev,
+			   struct iw_request_info *info,
+			   struct iw_point *data, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
+	int ie_len = data->length, ret;
+	u8 *given = ie_len ? (u8 *)extra : NULL;
+	u8 *copy = NULL;
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EOPNOTSUPP;
+
+	wdev_lock(wdev);
+
+	/* no change */
+	ret = 0;
+	if (wdev->wext.ie_len == ie_len &&
+	    memcmp(wdev->wext.ie, given, ie_len) == 0)
+		goto unlock;
+
+	/* Duplicate the new IEs; 'copy' stays NULL for an empty set */
+	if (ie_len) {
+		copy = kmemdup(extra, ie_len, GFP_KERNEL);
+		if (copy == NULL) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
+	}
+
+	kfree(wdev->wext.ie);
+	wdev->wext.ie = copy;
+	wdev->wext.ie_len = ie_len;
+
+	/* userspace better not think we'll reconnect */
+	ret = 0;
+	if (wdev->sme_state != CFG80211_SME_IDLE)
+		ret = __cfg80211_disconnect(rdev, dev,
+					    WLAN_REASON_DEAUTH_LEAVING, false);
+
+ unlock:
+	wdev_unlock(wdev);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwgenie);
+
+/*
+ * MLME request handler: deauth and disassoc requests both trigger a
+ * disconnect with the supplied reason code; any other command yields
+ * -EOPNOTSUPP.
+ */
+int cfg80211_wext_siwmlme(struct net_device *dev,
+			  struct iw_request_info *info,
+			  struct iw_point *data, char *extra)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct iw_mlme *req = (struct iw_mlme *)extra;
+	struct cfg80211_registered_device *rdev;
+	int ret;
+
+	if (wdev == NULL)
+		return -EOPNOTSUPP;
+
+	rdev = wiphy_to_dev(wdev->wiphy);
+
+	if (wdev->iftype != NL80211_IFTYPE_STATION)
+		return -EINVAL;
+
+	if (req->addr.sa_family != ARPHRD_ETHER)
+		return -EINVAL;
+
+	wdev_lock(wdev);
+	if (req->cmd == IW_MLME_DEAUTH || req->cmd == IW_MLME_DISASSOC)
+		ret = __cfg80211_disconnect(rdev, dev, req->reason_code,
+					    true);
+	else
+		ret = -EOPNOTSUPP;
+	wdev_unlock(wdev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwmlme);
diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c
new file mode 100644
index 00000000..6dcfe65a
--- /dev/null
+++ b/net/wireless/wext-spy.c
@@ -0,0 +1,231 @@
+/*
+ * This file implements the Wireless Extensions spy API.
+ *
+ * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
+ * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved.
+ *
+ * (Like every part of the Linux kernel, this file is GPL)
+ */
+
+#include <linux/wireless.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/iw_handler.h>
+#include <net/arp.h>
+#include <net/wext.h>
+
+static inline struct iw_spy_data *get_spydata(struct net_device *dev)
+{
+	/* Spy state hangs off dev->wireless_data; without it there is none */
+	if (!dev->wireless_data)
+		return NULL;
+	return dev->wireless_data->spy_data;
+}
+
+int iw_handler_set_spy(struct net_device *	dev,
+		       struct iw_request_info *	info,
+		       union iwreq_data *	wrqu,
+		       char *			extra)
+{
+	struct iw_spy_data *	spydata = get_spydata(dev);
+	struct sockaddr *	address = (struct sockaddr *) extra;
+
+	/* Make sure driver is not buggy or using the old API */
+	if (!spydata)
+		return -EOPNOTSUPP;
+
+	/* Disable spy collection while we copy the addresses.
+	 * While we copy addresses, any call to wireless_spy_update()
+	 * will NOP. This is OK, as the addresses are changing anyway. */
+	spydata->spy_number = 0;
+
+	/* We want to operate without locking, because wireless_spy_update()
+	 * most likely will happen in the interrupt handler, and therefore
+	 * has its own locking constraints and needs performance.
+	 * The rtnl_lock() makes sure we don't race with the other iw_handlers.
+	 * This makes sure wireless_spy_update() "sees" that the spy list
+	 * is temporarily disabled. */
+	smp_wmb();
+
+	/* Are there any addresses to copy? */
+	if (wrqu->data.length > 0) {
+		int i;
+
+		/* Copy addresses. NOTE(review): assumes length <= IW_MAX_SPY */
+		for (i = 0; i < wrqu->data.length; i++)
+			memcpy(spydata->spy_address[i], address[i].sa_data,
+			       ETH_ALEN);
+		/* Reset stats for all IW_MAX_SPY slots */
+		memset(spydata->spy_stat, 0,
+		       sizeof(struct iw_quality) * IW_MAX_SPY);
+	}
+
+	/* Make sure above is updated before re-enabling */
+	smp_wmb();
+
+	/* Enable addresses */
+	spydata->spy_number = wrqu->data.length;
+
+	return 0;
+}
+EXPORT_SYMBOL(iw_handler_set_spy);
+
+int iw_handler_get_spy(struct net_device *	dev,
+		       struct iw_request_info *	info,
+		       union iwreq_data *	wrqu,
+		       char *			extra)
+{
+	struct iw_spy_data *	spydata = get_spydata(dev);
+	struct sockaddr *	address = (struct sockaddr *) extra;
+	int			i;
+
+	/* Make sure driver is not buggy or using the old API */
+	if (!spydata)
+		return -EOPNOTSUPP;
+	/* Report how many addresses are being returned */
+	wrqu->data.length = spydata->spy_number;
+
+	/* Copy addresses; each one is reported with family AF_UNIX. */
+	for (i = 0; i < spydata->spy_number; i++) 	{
+		memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN);
+		address[i].sa_family = AF_UNIX;
+	}
+	/* Copy stats into extra, right after the spy_number sockaddrs. */
+	if (spydata->spy_number > 0)
+		memcpy(extra  + (sizeof(struct sockaddr) *spydata->spy_number),
+		       spydata->spy_stat,
+		       sizeof(struct iw_quality) * spydata->spy_number);
+	/* Reset updated flags now that the stats have been reported. */
+	for (i = 0; i < spydata->spy_number; i++)
+		spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
+	return 0;
+}
+EXPORT_SYMBOL(iw_handler_get_spy);
+
+/*
+ * Standard Wireless Handler : set spy threshold
+ * Takes the low/high pair from the iw_thrspy in extra and re-arms events.
+ */
+int iw_handler_set_thrspy(struct net_device *	dev,
+			  struct iw_request_info *info,
+			  union iwreq_data *	wrqu,
+			  char *		extra)
+{
+	struct iw_thrspy *	request = (struct iw_thrspy *) extra;
+	struct iw_spy_data *	spy = get_spydata(dev);
+
+	/* No spy data: driver is buggy or still on the old API */
+	if (spy == NULL)
+		return -EOPNOTSUPP;
+
+	/* Two consecutive iw_quality records, starting at the low one */
+	memcpy(&spy->spy_thr_low, &request->low,
+	       sizeof(struct iw_quality) * 2);
+
+	/* Forget which addresses were recorded as under the threshold */
+	memset(spy->spy_thr_under, 0, sizeof(spy->spy_thr_under));
+
+	return 0;
+}
+EXPORT_SYMBOL(iw_handler_set_thrspy);
+
+/*
+ * Standard Wireless Handler : get spy threshold
+ * Reports the stored low/high pair through the iw_thrspy in extra.
+ */
+int iw_handler_get_thrspy(struct net_device *	dev,
+			  struct iw_request_info *info,
+			  union iwreq_data *	wrqu,
+			  char *		extra)
+{
+	struct iw_thrspy *	reply = (struct iw_thrspy *) extra;
+	struct iw_spy_data *	spy = get_spydata(dev);
+
+	/* No spy data: driver is buggy or still on the old API */
+	if (spy == NULL)
+		return -EOPNOTSUPP;
+
+	/* Two consecutive iw_quality records, starting at the low one */
+	memcpy(&reply->low, &spy->spy_thr_low,
+	       sizeof(struct iw_quality) * 2);
+
+	return 0;
+}
+EXPORT_SYMBOL(iw_handler_get_thrspy);
+
+/*
+ * Prepare and send a Spy Threshold event for one address.
+ * The payload is zeroed first so no stale stack bytes reach user space.
+ */
+static void iw_send_thrspy_event(struct net_device *	dev,
+				 struct iw_spy_data *	spydata,
+				 unsigned char *	address,
+				 struct iw_quality *	wstats)
+{
+	union iwreq_data	wrqu;
+	struct iw_thrspy	threshold;
+
+	/* Only ETH_ALEN bytes of sa_data are filled below: clear the rest */
+	memset(&threshold, 0, sizeof(threshold));
+	wrqu.data.length = 1;
+	wrqu.data.flags = 0;
+	memcpy(threshold.addr.sa_data, address, ETH_ALEN);
+	threshold.addr.sa_family = ARPHRD_ETHER;
+	/* Copy stats */
+	memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality));
+	/* Copy also thresholds */
+	memcpy(&(threshold.low), &(spydata->spy_thr_low),
+	       2 * sizeof(struct iw_quality));
+
+	/* Send event to user space */
+	wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold);
+}
+
+/*
+ * Called by the driver to record fresh quality stats for a peer address.
+ * Every spy entry matching the address is updated, and a threshold event
+ * may be sent for the last matching entry. The spy list is a plain array
+ * scanned linearly, which is good enough while the array stays small; a
+ * larger number of spy addresses would want something more efficient.
+ */
+void wireless_spy_update(struct net_device *	dev,
+			 unsigned char *	address,
+			 struct iw_quality *	wstats)
+{
+	struct iw_spy_data *	spydata = get_spydata(dev);
+	int			i;
+	int			match = -1;
+
+	/* Make sure driver is not buggy or using the old API */
+	if (!spydata)
+		return;
+
+	/* Update all records that match; remember the last matching index */
+	for (i = 0; i < spydata->spy_number; i++)
+		if (!compare_ether_addr(address, spydata->spy_address[i])) {
+			memcpy(&(spydata->spy_stat[i]), wstats,
+			       sizeof(struct iw_quality));
+			match = i;
+		}
+
+	/* Generate an event if we cross the spy threshold.
+	 * To avoid event storms, we have a simple hysteresis : we generate
+	 * an event only when the level goes under the low threshold or
+	 * above the high threshold. */
+	if (match >= 0) {
+		if (spydata->spy_thr_under[match]) {
+			if (wstats->level > spydata->spy_thr_high.level) {
+				spydata->spy_thr_under[match] = 0;
+				iw_send_thrspy_event(dev, spydata,
+						     address, wstats);
+			}
+		} else {
+			if (wstats->level < spydata->spy_thr_low.level) {
+				spydata->spy_thr_under[match] = 1;
+				iw_send_thrspy_event(dev, spydata,
+						     address, wstats);
+			}
+		}
+	}
+}
+EXPORT_SYMBOL(wireless_spy_update);
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 00000000..e6759c96
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
+#
+# CCITT X.25 Packet Layer
+#
+
+config X25
+	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  X.25 is a set of standardized network protocols, similar in scope to
+	  frame relay; the one physical line from your box to the X.25 network
+	  entry point can carry several logical point-to-point connections
+	  (called "virtual circuits") to other computers connected to the X.25
+	  network. Governments, banks, and other organizations tend to use it
+	  to connect to each other or to form Wide Area Networks (WANs). Many
+	  countries have public X.25 networks. X.25 consists of two
+	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
+	  if you want that) and the lower level data link layer protocol LAPB
+	  (say Y to "LAPB Data Link Driver" below if you want that).
+
+	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
+	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
+	  Information about X.25 for Linux is contained in the files
+	  <file:Documentation/networking/x25.txt> and
+	  <file:Documentation/networking/x25-iface.txt>.
+
+	  One connects to an X.25 network either with a dedicated network card
+	  using the X.21 protocol (not yet supported by Linux) or one can do
+	  X.25 over a standard telephone line using an ordinary modem (say Y
+	  to "X.25 async driver" below) or over Ethernet using an ordinary
+	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
+	  Driver" and "LAPB over Ethernet driver" below).
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called x25. If unsure, say N.
+
+
diff --git a/net/x25/Makefile b/net/x25/Makefile
new file mode 100644
index 00000000..a2c34ab6
--- /dev/null
+++ b/net/x25/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Linux X.25 Packet layer.
+#
+
+obj-$(CONFIG_X25) += x25.o
+
+x25-y			:= af_x25.o x25_dev.o x25_facilities.o x25_in.o \
+			   x25_link.o x25_out.o x25_route.o x25_subr.o \
+			   x25_timer.o x25_proc.o x25_forward.o
+x25-$(CONFIG_SYSCTL)	+= sysctl_net_x25.o
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
new file mode 100644
index 00000000..373e14f2
--- /dev/null
+++ b/net/x25/af_x25.c
@@ -0,0 +1,1829 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	Started coding.
+ *	X.25 002	Jonathan Naylor	Centralised disconnect handling.
+ *					New timer architecture.
+ *	2000-03-11	Henner Eisen	MSG_EOR handling more POSIX compliant.
+ *	2000-03-22	Daniela Squassoni Allowed disabling/enabling of
+ *					  facilities negotiation and increased
+ *					  the throughput upper limit.
+ *	2000-08-27	Arnaldo C. Melo s/suser/capable/ + micro cleanups
+ *	2000-09-04	Henner Eisen	Set sock->state in x25_accept().
+ *					Fixed x25_output() related skb leakage.
+ *	2000-10-02	Henner Eisen	Made x25_kick() single threaded per socket.
+ *	2000-10-27	Henner Eisen    MSG_DONTWAIT for fragment allocation.
+ *	2000-11-14	Henner Eisen    Closing datalink from NETDEV_GOING_DOWN
+ *	2002-10-06	Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
+ *					x25_proc.c, using seq_file
+ *	2005-04-02	Shaun Pereira	Selective sub address matching
+ *					with call user data
+ *	2005-04-15	Shaun Pereira	Fast select with no restriction on
+ *					response
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <asm/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/termios.h>	/* For TIOCINQ/OUTQ */
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
+
+#include <net/x25.h>
+#include <net/compat.h>
+
+int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20;
+int sysctl_x25_call_request_timeout    = X25_DEFAULT_T21;
+int sysctl_x25_reset_request_timeout   = X25_DEFAULT_T22;
+int sysctl_x25_clear_request_timeout   = X25_DEFAULT_T23;
+int sysctl_x25_ack_holdback_timeout    = X25_DEFAULT_T2;
+int sysctl_x25_forward                 = 0;	/* call forwarding off */
+
+HLIST_HEAD(x25_list);		/* sockets added by x25_insert_socket() */
+DEFINE_RWLOCK(x25_list_lock);	/* protects x25_list */
+
+static const struct proto_ops x25_proto_ops;	/* used by x25_create() */
+/* Blank address special-cased in x25_find_listener() and x25_connect() */
+static struct x25_address null_x25_address = {"               "};
+
+#ifdef CONFIG_COMPAT
+struct compat_x25_subscrip_struct {	/* layout built from compat_* types */
+	char device[200-sizeof(compat_ulong_t)];
+	compat_ulong_t global_facil_mask;
+	compat_uint_t extended;
+};
+#endif /* CONFIG_COMPAT */
+
+/* Decode the address block at skb->data; returns its length in bytes. */
+int x25_parse_address_block(struct sk_buff *skb,
+		struct x25_address *called_addr,
+		struct x25_address *calling_addr)
+{
+	unsigned char len;
+	int needed;
+	int rc;
+
+	if (skb->len < 1) {
+		/* packet has no address block: report 0 bytes consumed */
+		rc = 0;
+		goto empty;
+	}
+	/* Both lengths count 4-bit digits, packed two per byte (rounded up) */
+	len = *skb->data;
+	needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2;
+
+	if (skb->len < needed) {
+		/* packet is too short to hold the addresses it claims
+		   to hold: report -1 and hand back empty strings */
+		rc = -1;
+		goto empty;
+	}
+
+	return x25_addr_ntoa(skb->data, called_addr, calling_addr);
+
+empty:
+	*called_addr->x25_addr = 0;
+	*calling_addr->x25_addr = 0;
+
+	return rc;
+}
+
+/*
+ *	Unpack the address block at p: one length byte (calling length in the
+ *	high nibble, called length in the low nibble) followed by the digits,
+ *	packed two per byte, called address first. Both outputs are written as
+ *	NUL-terminated ASCII strings. Returns the bytes consumed from p.
+ */
+int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr,
+		  struct x25_address *calling_addr)
+{
+	const unsigned int n_called  = *p & 0x0F;
+	const unsigned int n_calling = (*p >> 4) & 0x0F;
+	const unsigned int total = n_called + n_calling;
+	char *called_out  = called_addr->x25_addr;
+	char *calling_out = calling_addr->x25_addr;
+	unsigned int idx;
+
+	p++;	/* step over the length byte */
+
+	for (idx = 0; idx < total; idx++) {
+		char digit;
+
+		if (idx & 1)
+			digit = (*p++ & 0x0F) + '0';	/* low nibble, advance */
+		else
+			digit = ((*p >> 4) & 0x0F) + '0';	/* high nibble */
+
+		if (idx < n_called)
+			*called_out++ = digit;
+		else
+			*calling_out++ = digit;
+	}
+
+	*called_out = '\0';
+	*calling_out = '\0';
+
+	return 1 + (total + 1) / 2;
+}
+
+/*
+ *	Pack two ASCII digit strings into the address block at p: a length
+ *	byte (calling length in the high nibble, called length in the low
+ *	nibble), then the digits two per byte, called address first.
+ *	Returns the number of bytes written.
+ */
+int x25_addr_aton(unsigned char *p, struct x25_address *called_addr,
+		  struct x25_address *calling_addr)
+{
+	const char *called_in  = called_addr->x25_addr;
+	const char *calling_in = calling_addr->x25_addr;
+	const unsigned int n_called  = strlen(called_in);
+	const unsigned int n_calling = strlen(calling_in);
+	const unsigned int total = n_called + n_calling;
+	unsigned int idx;
+
+	/* Length byte first */
+	*p++ = (n_calling << 4) | n_called;
+
+	for (idx = 0; idx < total; idx++) {
+		unsigned char nibble;
+
+		/* Digits come from called until it runs out, then calling */
+		if (idx < n_called)
+			nibble = *called_in++ - '0';
+		else
+			nibble = *calling_in++ - '0';
+
+		/* Even positions open a byte, odd positions complete it */
+		if (idx & 1)
+			*p++ |= nibble;
+		else
+			*p = nibble << 4;
+	}
+
+	return 1 + (total + 1) / 2;
+}
+
+/*
+ *	Unlink sk from x25_list; safe during an interrupt (lock is _bh).
+ */
+static void x25_remove_socket(struct sock *sk)
+{
+	write_lock_bh(&x25_list_lock);
+	sk_del_node_init(sk);
+	write_unlock_bh(&x25_list_lock);
+}
+
+/*
+ *	Disconnect (ENETUNREACH) every listed socket whose neighbour is on dev.
+ */
+static void x25_kill_by_device(struct net_device *dev)
+{
+	struct sock *s;
+	struct hlist_node *node;
+
+	write_lock_bh(&x25_list_lock);
+
+	sk_for_each(s, node, &x25_list)
+		if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev)
+			x25_disconnect(s, ENETUNREACH, 0, 0);
+
+	write_unlock_bh(&x25_list_lock);
+}
+
+/*
+ *	Handle up / going-down / down events for X.25 (with LLC: Ethernet) devs.
+ */
+static int x25_device_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct x25_neigh *nb;
+
+	if (!net_eq(dev_net(dev), &init_net))	/* init_net devices only */
+		return NOTIFY_DONE;
+
+	if (dev->type == ARPHRD_X25
+#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE)
+	 || dev->type == ARPHRD_ETHER
+#endif
+	 ) {
+		switch (event) {
+			case NETDEV_UP:
+				x25_link_device_up(dev);
+				break;
+			case NETDEV_GOING_DOWN:	/* terminate the link */
+				nb = x25_get_neigh(dev);
+				if (nb) {
+					x25_terminate_link(nb);
+					x25_neigh_put(nb);
+				}
+				break;
+			case NETDEV_DOWN:	/* sockets, routes, link */
+				x25_kill_by_device(dev);
+				x25_route_device_down(dev);
+				x25_link_device_down(dev);
+				break;
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ *	Add a socket to x25_list (called from x25_bind()), under the write lock.
+ */
+static void x25_insert_socket(struct sock *sk)
+{
+	write_lock_bh(&x25_list_lock);
+	sk_add_node(sk, &x25_list);
+	write_unlock_bh(&x25_list_lock);
+}
+
+/*
+ *	Find a listening socket for the Call Request we just received.
+ *	A listener whose call user data (cud) prefix matches skb->data wins
+ *	at once; failing that, the last fallback candidate seen is returned.
+ *	Takes a reference on the socket returned; NULL means no listener.
+ *	NOTE(review): null_x25_address is compared with addr, not the socket.
+ */
+static struct sock *x25_find_listener(struct x25_address *addr,
+					struct sk_buff *skb)
+{
+	struct sock *s;
+	struct sock *next_best;
+	struct hlist_node *node;
+
+	read_lock_bh(&x25_list_lock);
+	next_best = NULL;
+
+	sk_for_each(s, node, &x25_list)
+		if ((!strcmp(addr->x25_addr,
+			x25_sk(s)->source_addr.x25_addr) ||
+				!strcmp(addr->x25_addr,
+					null_x25_address.x25_addr)) &&
+					s->sk_state == TCP_LISTEN) {
+			/*
+			 * Listening socket with an acceptable address: compare
+			 * the incoming call user data with this socket's
+			 */
+			if (x25_sk(s)->cudmatchlength > 0 &&
+				skb->len >= x25_sk(s)->cudmatchlength) {
+				if((memcmp(x25_sk(s)->calluserdata.cuddata,
+					skb->data,
+					x25_sk(s)->cudmatchlength)) == 0) {
+					sock_hold(s);
+					goto found;
+				 }
+			} else
+				next_best = s;	/* no cud test was possible */
+		}
+	if (next_best) {
+		s = next_best;
+		sock_hold(s);
+		goto found;
+	}
+	s = NULL;
+found:
+	read_unlock_bh(&x25_list_lock);
+	return s;
+}
+
+/*
+ *	Look up the socket using the given LCI on the given neighbour.
+ *	A reference is taken on a hit; callers here hold x25_list_lock.
+ */
+static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb)
+{
+	struct hlist_node *pos;
+	struct sock *cur;
+
+	sk_for_each(cur, pos, &x25_list) {
+		if (x25_sk(cur)->lci != lci || x25_sk(cur)->neighbour != nb)
+			continue;
+		sock_hold(cur);
+		return cur;
+	}
+	return NULL;
+}
+
+struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb)
+{
+	struct sock *s;
+
+	read_lock_bh(&x25_list_lock);	/* locked wrapper around the lookup */
+	s = __x25_find_socket(lci, nb);	/* holds a reference on a hit */
+	read_unlock_bh(&x25_list_lock);
+	return s;
+}
+
+/*
+ *	Pick the lowest LCI in 1..4095 that no socket on this neighbour uses.
+ *	Returns 0 when every one of them is taken.
+ */
+static unsigned int x25_new_lci(struct x25_neigh *nb)
+{
+	unsigned int candidate;
+
+	read_lock_bh(&x25_list_lock);
+
+	for (candidate = 1; candidate < 4096; candidate++) {
+		struct sock *user = __x25_find_socket(candidate, nb);
+
+		if (!user)
+			break;
+		sock_put(user);	/* only probing: drop the lookup's reference */
+	}
+	read_unlock_bh(&x25_list_lock);
+
+	return candidate < 4096 ? candidate : 0;
+}
+
+/*
+ *	Deferred destroy: forward declaration, the body follows the timer.
+ */
+static void __x25_destroy_socket(struct sock *);
+
+/*
+ *	Timer handler for deferred kills; data carries the struct sock pointer.
+ */
+static void x25_destroy_timer(unsigned long data)
+{
+	x25_destroy_socket_from_timer((struct sock *)data);
+}
+
+/*
+ *	This is called from user mode and the timers. Thus it protects itself
+ *	against interrupt users but doesn't worry about being called during
+ *	work. Once it is removed from the queue no interrupt or bottom half
+ *	will touch it and we are (fairly 8-) ) safe.
+ *	Static: the timer path enters via x25_destroy_socket_from_timer().
+ */
+static void __x25_destroy_socket(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	x25_stop_heartbeat(sk);
+	x25_stop_timer(sk);
+
+	x25_remove_socket(sk);
+	x25_clear_queues(sk);		/* Flush the queues */
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		if (skb->sk != sk) {		/* A pending connection */
+			/*
+			 * Queue the unaccepted socket for death
+			 */
+			skb->sk->sk_state = TCP_LISTEN;
+			sock_set_flag(skb->sk, SOCK_DEAD);
+			x25_start_heartbeat(skb->sk);
+			x25_sk(skb->sk)->state = X25_STATE_0;
+		}
+
+		kfree_skb(skb);
+	}
+
+	if (sk_has_allocations(sk)) {
+		/* Defer: outstanding buffers, retry from the timer in 10s */
+		sk->sk_timer.expires  = jiffies + 10 * HZ;
+		sk->sk_timer.function = x25_destroy_timer;
+		sk->sk_timer.data = (unsigned long)sk;
+		add_timer(&sk->sk_timer);
+	} else {
+		/* drop last reference so sock_put will free */
+		__sock_put(sk);
+	}
+}
+
+void x25_destroy_socket_from_timer(struct sock *sk)
+{
+	sock_hold(sk);		/* paired with the sock_put() below */
+	bh_lock_sock(sk);
+	__x25_destroy_socket(sk);
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	Handling for system calls applied via the various interfaces to a
+ *	X.25 socket object.
+ */
+
+static int x25_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct x25_sock *x25 = x25_sk(sock->sk);
+	int enable;
+
+	/* X25_QBITINCL at SOL_X25 is the only option implemented here */
+	if (level != SOL_X25 || optname != X25_QBITINCL)
+		return -ENOPROTOOPT;
+
+	/* The value is an int; anything shorter cannot hold it */
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(enable, (int __user *)optval))
+		return -EFAULT;
+
+	/* Non-zero turns the Q-bit flag on, zero turns it off */
+	if (enable)
+		set_bit(X25_Q_BIT_FLAG, &x25->flags);
+	else
+		clear_bit(X25_Q_BIT_FLAG, &x25->flags);
+
+	return 0;
+}
+
+static int x25_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int val, len, rc = -ENOPROTOOPT;
+
+	if (level != SOL_X25 || optname != X25_QBITINCL)
+		goto out;
+
+	rc = -EFAULT;
+	if (get_user(len, optlen))
+		goto out;
+
+	/* Reject a negative length before clamping: the unsigned min_t()
+	 * would otherwise turn it into sizeof(int) and mask the error. */
+	rc = -EINVAL;
+	if (len < 0)
+		goto out;
+	len = min_t(unsigned int, len, sizeof(int));
+
+	rc = -EFAULT;
+	if (put_user(len, optlen))	/* tell the caller how much we return */
+		goto out;
+	val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags);
+	rc = copy_to_user(optval, &val, len) ? -EFAULT : 0;
+out:
+	return rc;
+}
+
+static int x25_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int rc = 0;
+
+	lock_sock(sk);
+	if (sk->sk_state == TCP_LISTEN) {
+		rc = -EOPNOTSUPP;	/* already listening */
+	} else {
+		memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN);
+		sk->sk_max_ack_backlog = backlog;
+		sk->sk_state = TCP_LISTEN;
+	}
+	release_sock(sk);
+	return rc;
+}
+
+static struct proto x25_proto = {	/* handed to sk_alloc() below */
+	.name	  = "X25",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct x25_sock),
+};
+
+static struct sock *x25_alloc_socket(struct net *net)
+{
+	struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto);
+	struct x25_sock *x25;
+
+	if (sk == NULL)
+		return NULL;
+
+	/* Generic socket state first, then the X.25 private queues */
+	sock_init_data(NULL, sk);
+	x25 = x25_sk(sk);
+	skb_queue_head_init(&x25->ack_queue);
+	skb_queue_head_init(&x25->fragment_queue);
+	skb_queue_head_init(&x25->interrupt_in_queue);
+	skb_queue_head_init(&x25->interrupt_out_queue);
+
+	return sk;
+}
+
+static int x25_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+	struct x25_sock *x25;
+	int rc = -EAFNOSUPPORT;
+
+	if (!net_eq(net, &init_net))		/* init_net only */
+		goto out;
+
+	rc = -ESOCKTNOSUPPORT;
+	if (sock->type != SOCK_SEQPACKET)	/* seqpacket only */
+		goto out;
+
+	rc = -EINVAL;
+	if (protocol)				/* only protocol 0 is accepted */
+		goto out;
+
+	rc = -ENOBUFS;
+	if ((sk = x25_alloc_socket(net)) == NULL)
+		goto out;
+
+	x25 = x25_sk(sk);
+	/* Second init pass, this time tying sk to the struct socket */
+	sock_init_data(sock, sk);
+
+	x25_init_timers(sk);
+
+	sock->ops    = &x25_proto_ops;
+	sk->sk_protocol = protocol;
+	sk->sk_backlog_rcv = x25_backlog_rcv;
+	/* Per-socket timers start from the current sysctl values */
+	x25->t21   = sysctl_x25_call_request_timeout;
+	x25->t22   = sysctl_x25_reset_request_timeout;
+	x25->t23   = sysctl_x25_clear_request_timeout;
+	x25->t2    = sysctl_x25_ack_holdback_timeout;
+	x25->state = X25_STATE_0;
+	x25->cudmatchlength = 0;
+	set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags);	/* normally no cud  */
+							/* on call accept   */
+
+	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
+	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
+	x25->facilities.pacsize_in  = X25_DEFAULT_PACKET_SIZE;
+	x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE;
+	x25->facilities.throughput  = 0;	/* by default don't negotiate
+						   throughput */
+	x25->facilities.reverse     = X25_DEFAULT_REVERSE;
+	x25->dte_facilities.calling_len = 0;
+	x25->dte_facilities.called_len = 0;
+	memset(x25->dte_facilities.called_ae, '\0',
+			sizeof(x25->dte_facilities.called_ae));
+	memset(x25->dte_facilities.calling_ae, '\0',
+			sizeof(x25->dte_facilities.calling_ae));
+
+	rc = 0;
+out:
+	return rc;
+}
+
+static struct sock *x25_make_new(struct sock *osk)
+{
+	struct sock *sk = NULL;
+	struct x25_sock *x25, *ox25;
+
+	if (osk->sk_type != SOCK_SEQPACKET)	/* returns NULL otherwise */
+		goto out;
+
+	if ((sk = x25_alloc_socket(sock_net(osk))) == NULL)
+		goto out;
+
+	x25 = x25_sk(sk);
+	/* Inherit the generic socket settings from osk */
+	sk->sk_type        = osk->sk_type;
+	sk->sk_priority    = osk->sk_priority;
+	sk->sk_protocol    = osk->sk_protocol;
+	sk->sk_rcvbuf      = osk->sk_rcvbuf;
+	sk->sk_sndbuf      = osk->sk_sndbuf;
+	sk->sk_state       = TCP_ESTABLISHED;
+	sk->sk_backlog_rcv = osk->sk_backlog_rcv;
+	sock_copy_flags(sk, osk);
+	/* ... and the X.25 timers, flags, facilities and cud match length */
+	ox25 = x25_sk(osk);
+	x25->t21        = ox25->t21;
+	x25->t22        = ox25->t22;
+	x25->t23        = ox25->t23;
+	x25->t2         = ox25->t2;
+	x25->flags	= ox25->flags;
+	x25->facilities = ox25->facilities;
+	x25->dte_facilities = ox25->dte_facilities;
+	x25->cudmatchlength = ox25->cudmatchlength;
+	/* X25_INTERRUPT_FLAG is not inherited by the new socket */
+	clear_bit(X25_INTERRUPT_FLAG, &x25->flags);
+	x25_init_timers(sk);
+out:
+	return sk;
+}
+
+static int x25_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct x25_sock *x25;
+
+	if (!sk)
+		return 0;
+
+	x25 = x25_sk(sk);
+
+	sock_hold(sk);		/* paired with the sock_put() at the end */
+	lock_sock(sk);
+	switch (x25->state) {
+		/* States 0 and 2: disconnect and destroy right away */
+		case X25_STATE_0:
+		case X25_STATE_2:
+			x25_disconnect(sk, 0, 0, 0);
+			__x25_destroy_socket(sk);
+			goto out;	/* note: skips sock_orphan() */
+		/* States 1, 3 and 4: send a Clear Request and start T23 */
+		case X25_STATE_1:
+		case X25_STATE_3:
+		case X25_STATE_4:
+			x25_clear_queues(sk);
+			x25_write_internal(sk, X25_CLEAR_REQUEST);
+			x25_start_t23timer(sk);
+			x25->state = X25_STATE_2;
+			sk->sk_state	= TCP_CLOSE;
+			sk->sk_shutdown	|= SEND_SHUTDOWN;
+			sk->sk_state_change(sk);
+			sock_set_flag(sk, SOCK_DEAD);
+			sock_set_flag(sk, SOCK_DESTROY);
+			break;
+	}
+
+	sock_orphan(sk);
+out:
+	release_sock(sk);
+	sock_put(sk);
+	return 0;
+}
+
+static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
+	int len, i;
+
+	if (!sock_flag(sk, SOCK_ZAPPED) ||
+	    addr_len != sizeof(struct sockaddr_x25) ||
+	    addr->sx25_family != AF_X25)
+		return -EINVAL;
+
+	/* The address comes from the caller and may lack a terminating NUL;
+	 * bound the scan so we never read past the fixed-size buffer. */
+	len = strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN);
+	if (len == X25_ADDR_LEN)
+		return -EINVAL;
+
+	/* Only decimal digits are accepted in the address */
+	for (i = 0; i < len; i++)
+		if (!isdigit(addr->sx25_addr.x25_addr[i]))
+			return -EINVAL;
+
+	lock_sock(sk);
+	x25_sk(sk)->source_addr = addr->sx25_addr;
+	x25_insert_socket(sk);
+	sock_reset_flag(sk, SOCK_ZAPPED);
+	release_sock(sk);
+	SOCK_DEBUG(sk, "x25_bind: socket is bound\n");
+	return 0;
+}
+
+static int x25_wait_for_connection_establishment(struct sock *sk)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int rc;
+
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	for (;;) {
+		__set_current_state(TASK_INTERRUPTIBLE);
+		rc = -ERESTARTSYS;
+		if (signal_pending(current))	/* interrupted by a signal */
+			break;
+		rc = sock_error(sk);
+		if (rc) {		/* pending socket error: give up */
+			sk->sk_socket->state = SS_UNCONNECTED;
+			break;
+		}
+		rc = 0;
+		if (sk->sk_state != TCP_ESTABLISHED) {
+			release_sock(sk);	/* sleep with the lock dropped */
+			schedule();
+			lock_sock(sk);
+		} else
+			break;		/* established: return 0 */
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return rc;
+}
+
+/* Place an outgoing call to uaddr; the socket must already be bound. */
+static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
+		       int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct x25_sock *x25 = x25_sk(sk);
+	struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
+	struct x25_route *rt;
+	int rc = 0;
+
+	lock_sock(sk);
+	if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
+		sock->state = SS_CONNECTED;
+		goto out; /* Connect completed during a ERESTARTSYS event */
+	}
+
+	rc = -ECONNREFUSED;
+	if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
+		sock->state = SS_UNCONNECTED;
+		goto out;
+	}
+
+	rc = -EISCONN;	/* No reconnect on a seqpacket socket */
+	if (sk->sk_state == TCP_ESTABLISHED)
+		goto out;
+
+	sk->sk_state   = TCP_CLOSE;
+	sock->state = SS_UNCONNECTED;
+
+	rc = -EINVAL;	/* bad size/family, or address not NUL-terminated */
+	if (addr_len != sizeof(struct sockaddr_x25) ||
+	    addr->sx25_family != AF_X25 ||
+	    strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN)
+		goto out;
+
+	rc = -ENETUNREACH;
+	rt = x25_get_route(&addr->sx25_addr);
+	if (!rt)
+		goto out;
+
+	x25->neighbour = x25_get_neigh(rt->dev);
+	if (!x25->neighbour)
+		goto out_put_route;
+
+	x25_limit_facilities(&x25->facilities, x25->neighbour);
+
+	x25->lci = x25_new_lci(x25->neighbour);
+	if (!x25->lci)
+		goto out_put_neigh;
+
+	rc = -EINVAL;
+	if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */
+		goto out_put_neigh;
+
+	if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr))
+		memset(&x25->source_addr, '\0', X25_ADDR_LEN);
+
+	x25->dest_addr = addr->sx25_addr;
+
+	/* Move to connecting socket, start sending Connect Requests */
+	sock->state   = SS_CONNECTING;
+	sk->sk_state  = TCP_SYN_SENT;
+	x25->state = X25_STATE_1;
+	x25_write_internal(sk, X25_CALL_REQUEST);
+	x25_start_heartbeat(sk);
+	x25_start_t21timer(sk);
+
+	/* Non-blocking callers return now; others sleep until connected */
+	rc = -EINPROGRESS;
+	if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
+		goto out_put_neigh;
+
+	rc = x25_wait_for_connection_establishment(sk);
+	if (rc)
+		goto out_put_neigh;
+
+	sock->state = SS_CONNECTED;
+	rc = 0;
+out_put_neigh:
+	/* A non-blocking connect still in progress keeps its neighbour ref */
+	if (rc && rc != -EINPROGRESS)
+		x25_neigh_put(x25->neighbour);
+out_put_route:
+	x25_route_put(rt);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+static int x25_wait_for_data(struct sock *sk, long timeout)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	int rc = 0;
+
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	for (;;) {
+		__set_current_state(TASK_INTERRUPTIBLE);
+		if (sk->sk_shutdown & RCV_SHUTDOWN)	/* rc stays as it was */
+			break;
+		rc = -ERESTARTSYS;
+		if (signal_pending(current))
+			break;
+		rc = -EAGAIN;
+		if (!timeout)		/* no time (left) to wait */
+			break;
+		rc = 0;
+		if (skb_queue_empty(&sk->sk_receive_queue)) {
+			release_sock(sk);	/* sleep with the lock dropped */
+			timeout = schedule_timeout(timeout);
+			lock_sock(sk);
+		} else
+			break;		/* something is queued: return 0 */
+	}
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return rc;
+}
+
+static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	int rc = -EINVAL;
+
+	if (!sk)
+		goto out;
+
+	rc = -EOPNOTSUPP;
+	if (sk->sk_type != SOCK_SEQPACKET)
+		goto out;
+
+	lock_sock(sk);
+	rc = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out2;
+
+	rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
+	if (rc)
+		goto out2;
+	skb = skb_dequeue(&sk->sk_receive_queue);
+	rc = -EINVAL;
+	if (!skb || !skb->sk) {	/* empty queue, or an skb with no socket */
+		kfree_skb(skb);	/* NULL-safe; do not leak a stray skb */
+		goto out2;
+	}
+	sock_graft(skb->sk, newsock);
+
+	/* Now attach up the new socket */
+	skb->sk = NULL;
+	kfree_skb(skb);
+	sk->sk_ack_backlog--;
+	newsock->state = SS_CONNECTED;
+	rc = 0;
+out2:
+	release_sock(sk);
+out:
+	return rc;
+}
+
+static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
+		       int *uaddr_len, int peer)
+{
+	struct sockaddr_x25 *dst = (struct sockaddr_x25 *)uaddr;
+	struct sock *sk = sock->sk;
+	const struct x25_sock *x25 = x25_sk(sk);
+	const struct x25_address *which;
+
+	/* The remote address is only reported for an established socket */
+	if (peer && sk->sk_state != TCP_ESTABLISHED)
+		return -ENOTCONN;
+
+	if (peer)
+		which = &x25->dest_addr;
+	else
+		which = &x25->source_addr;
+
+	dst->sx25_addr   = *which;
+	dst->sx25_family = AF_X25;
+	*uaddr_len = sizeof(*dst);
+
+	return 0;
+}
+
+/*
+ *	Handle a Call Request packet received from neighbour @nb on logical
+ *	channel @lci.
+ *
+ *	The address block and facilities are parsed, a listening socket is
+ *	looked up using the calling address and the call user data, and then
+ *	either a new socket is created and queued on the listener, or the
+ *	call is forwarded, or a Clear Request is transmitted.
+ *
+ *	Returns 1 when the skb has been consumed here (queued on the listener,
+ *	or freed after forwarding) and 0 when the call was cleared; in the
+ *	latter case the skb is not freed by this function.
+ */
+int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
+			unsigned int lci)
+{
+	struct sock *sk;
+	struct sock *make;
+	struct x25_sock *makex25;
+	struct x25_address source_addr, dest_addr;
+	struct x25_facilities facilities;
+	struct x25_dte_facilities dte_facilities;
+	int len, addr_len, rc;
+
+	/*
+	 *	Remove the LCI and frame type.
+	 */
+	skb_pull(skb, X25_STD_MIN_LEN);
+
+	/*
+	 *	Extract the X.25 addresses and convert them to ASCII strings,
+	 *	and remove them.
+	 *
+	 *	Address block is mandatory in call request packets
+	 */
+	addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr);
+	if (addr_len <= 0)
+		goto out_clear_request;
+	skb_pull(skb, addr_len);
+
+	/*
+	 *	Get the length of the facilities, skip past them for the moment
+	 *	get the call user data because this is needed to determine
+	 *	the correct listener
+	 *
+	 *	Facilities length is mandatory in call request packets
+	 */
+	if (skb->len < 1)
+		goto out_clear_request;
+	len = skb->data[0] + 1;
+	if (skb->len < len)
+		goto out_clear_request;
+	skb_pull(skb,len);
+
+	/*
+	 *	Ensure that the amount of call user data is valid.  What is
+	 *	left in the skb at this point is copied verbatim into
+	 *	calluserdata.cuddata further down, so a length taken from the
+	 *	wire must never exceed X25_MAX_CUD_LEN.
+	 */
+	if (skb->len > X25_MAX_CUD_LEN)
+		goto out_clear_request;
+
+	/*
+	 *	Find a listener for the particular address/cud pair.
+	 */
+	sk = x25_find_listener(&source_addr,skb);
+	skb_push(skb,len);
+
+	if (sk != NULL && sk_acceptq_is_full(sk)) {
+		goto out_sock_put;
+	}
+
+	/*
+	 *	We don't have any listeners for this incoming call.
+	 *	Try forwarding it.
+	 */
+	if (sk == NULL) {
+		skb_push(skb, addr_len + X25_STD_MIN_LEN);
+		if (sysctl_x25_forward &&
+				x25_forward_call(&dest_addr, nb, skb, lci) > 0)
+		{
+			/* Call was forwarded, don't process it any more */
+			kfree_skb(skb);
+			rc = 1;
+			goto out;
+		} else {
+			/* No listeners, can't forward, clear the call */
+			goto out_clear_request;
+		}
+	}
+
+	/*
+	 *	Try to reach a compromise on the requested facilities.
+	 */
+	len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities);
+	if (len == -1)
+		goto out_sock_put;
+
+	/*
+	 * current neighbour/link might impose additional limits
+	 * on certain facilities
+	 */
+
+	x25_limit_facilities(&facilities, nb);
+
+	/*
+	 *	Try to create a new socket.
+	 */
+	make = x25_make_new(sk);
+	if (!make)
+		goto out_sock_put;
+
+	/*
+	 *	Remove the facilities
+	 */
+	skb_pull(skb, len);
+
+	skb->sk     = make;
+	make->sk_state = TCP_ESTABLISHED;
+
+	makex25 = x25_sk(make);
+	makex25->lci           = lci;
+	makex25->dest_addr     = dest_addr;
+	makex25->source_addr   = source_addr;
+	makex25->neighbour     = nb;
+	makex25->facilities    = facilities;
+	makex25->dte_facilities= dte_facilities;
+	makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
+	/* ensure no reverse facil on accept */
+	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+	/* ensure no calling address extension on accept */
+	makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE;
+	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
+
+	/* Normally all calls are accepted immediately */
+	if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) {
+		x25_write_internal(make, X25_CALL_ACCEPTED);
+		makex25->state = X25_STATE_3;
+	}
+
+	/*
+	 *	Incoming Call User Data (length validated above).
+	 */
+	skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
+	makex25->calluserdata.cudlength = skb->len;
+
+	sk->sk_ack_backlog++;
+
+	x25_insert_socket(make);
+
+	/* The skb carries the new socket (skb->sk) to x25_accept() */
+	skb_queue_head(&sk->sk_receive_queue, skb);
+
+	x25_start_heartbeat(make);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb->len);
+	rc = 1;
+	sock_put(sk);
+out:
+	return rc;
+out_sock_put:
+	sock_put(sk);
+out_clear_request:
+	rc = 0;
+	x25_transmit_clear_request(nb, lci, 0x01);
+	goto out;
+}
+
+/*
+ *	Send one complete record on an X.25 socket.
+ *
+ *	Only whole records are supported, so the caller must pass MSG_EOR
+ *	(normal data) or MSG_OOB (interrupt data, truncated to 32 bytes).
+ *	When the X25_Q_BIT_FLAG option is set on the socket, the first byte of
+ *	the user data is not transmitted as data but supplies the Q bit.
+ *
+ *	Returns the number of bytes accepted or a negative errno.  The socket
+ *	lock is dropped while the send buffer is allocated, which is why the
+ *	connection state is checked again before the packet is queued.
+ */
+static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct x25_sock *x25 = x25_sk(sk);
+	struct sockaddr_x25 *usx25 = (struct sockaddr_x25 *)msg->msg_name;
+	struct sockaddr_x25 sx25;
+	struct sk_buff *skb;
+	unsigned char *asmptr;
+	int noblock = msg->msg_flags & MSG_DONTWAIT;
+	size_t size;
+	int qbit = 0, rc = -EINVAL;
+
+	lock_sock(sk);
+	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT))
+		goto out;
+
+	/* we currently don't support segmented records at the user interface */
+	if (!(msg->msg_flags & (MSG_EOR|MSG_OOB)))
+		goto out;
+
+	rc = -EADDRNOTAVAIL;
+	if (sock_flag(sk, SOCK_ZAPPED))
+		goto out;
+
+	rc = -EPIPE;
+	if (sk->sk_shutdown & SEND_SHUTDOWN) {
+		send_sig(SIGPIPE, current, 0);
+		goto out;
+	}
+
+	rc = -ENETUNREACH;
+	if (!x25->neighbour)
+		goto out;
+
+	if (usx25) {
+		rc = -EINVAL;
+		if (msg->msg_namelen < sizeof(sx25))
+			goto out;
+		memcpy(&sx25, usx25, sizeof(sx25));
+		rc = -EISCONN;
+		if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr))
+			goto out;
+		rc = -EINVAL;
+		if (sx25.sx25_family != AF_X25)
+			goto out;
+	} else {
+		/*
+		 *	FIXME 1003.1g - if the socket is like this because
+		 *	it has become closed (not started closed) we ought
+		 *	to SIGPIPE, EPIPE;
+		 */
+		rc = -ENOTCONN;
+		if (sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+
+		sx25.sx25_family = AF_X25;
+		sx25.sx25_addr   = x25->dest_addr;
+	}
+
+	/* Sanity check the packet size */
+	if (len > 65535) {
+		rc = -EMSGSIZE;
+		goto out;
+	}
+
+	SOCK_DEBUG(sk, "x25_sendmsg: sendto: Addresses built.\n");
+
+	/* Build a packet */
+	SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n");
+
+	if ((msg->msg_flags & MSG_OOB) && len > 32)
+		len = 32;
+
+	size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN;
+
+	/* The allocation may sleep, so do it without holding the socket lock */
+	release_sock(sk);
+	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+	lock_sock(sk);
+	if (!skb)
+		goto out;
+	X25_SKB_CB(skb)->flags = msg->msg_flags;
+
+	skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN);
+
+	/*
+	 *	Put the data on the end
+	 */
+	SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n");
+
+	skb_reset_transport_header(skb);
+	skb_put(skb, len);
+
+	rc = memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len);
+	if (rc)
+		goto out_kfree_skb;
+
+	/*
+	 *	If the Q BIT Include socket option is in force, the first
+	 *	byte of the user data is the logical value of the Q Bit.
+	 *
+	 *	An empty message has no such byte: bail out instead of reading
+	 *	past the end of the data.  rc is 0 here, so the caller sees
+	 *	zero bytes sent.
+	 */
+	if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
+		if (!pskb_may_pull(skb, 1))
+			goto out_kfree_skb;
+
+		qbit = skb->data[0];
+		skb_pull(skb, 1);
+	}
+
+	/*
+	 *	Push down the X.25 header
+	 */
+	SOCK_DEBUG(sk, "x25_sendmsg: Building X.25 Header.\n");
+
+	if (msg->msg_flags & MSG_OOB) {
+		if (x25->neighbour->extended) {
+			asmptr    = skb_push(skb, X25_STD_MIN_LEN);
+			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ;
+			*asmptr++ = (x25->lci >> 0) & 0xFF;
+			*asmptr++ = X25_INTERRUPT;
+		} else {
+			asmptr    = skb_push(skb, X25_STD_MIN_LEN);
+			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ;
+			*asmptr++ = (x25->lci >> 0) & 0xFF;
+			*asmptr++ = X25_INTERRUPT;
+		}
+	} else {
+		if (x25->neighbour->extended) {
+			/* Build an Extended X.25 header */
+			asmptr    = skb_push(skb, X25_EXT_MIN_LEN);
+			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ;
+			*asmptr++ = (x25->lci >> 0) & 0xFF;
+			*asmptr++ = X25_DATA;
+			*asmptr++ = X25_DATA;
+		} else {
+			/* Build a Standard X.25 header */
+			asmptr    = skb_push(skb, X25_STD_MIN_LEN);
+			*asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ;
+			*asmptr++ = (x25->lci >> 0) & 0xFF;
+			*asmptr++ = X25_DATA;
+		}
+
+		if (qbit)
+			skb->data[0] |= X25_Q_BIT;
+	}
+
+	SOCK_DEBUG(sk, "x25_sendmsg: Built header.\n");
+	SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n");
+
+	/* The lock was dropped above, so the call may have gone away */
+	rc = -ENOTCONN;
+	if (sk->sk_state != TCP_ESTABLISHED)
+		goto out_kfree_skb;
+
+	if (msg->msg_flags & MSG_OOB)
+		skb_queue_tail(&x25->interrupt_out_queue, skb);
+	else {
+		rc = x25_output(sk, skb);
+		len = rc;
+		if (rc < 0)
+			kfree_skb(skb);
+		else if (test_bit(X25_Q_BIT_FLAG, &x25->flags))
+			/* account for the Q-bit byte taken from the user data */
+			len++;
+	}
+
+	x25_kick(sk);
+	rc = len;
+out:
+	release_sock(sk);
+	return rc;
+out_kfree_skb:
+	kfree_skb(skb);
+	goto out;
+}
+
+
+/*
+ *	Receive one record from an X.25 socket.
+ *
+ *	With MSG_OOB the next packet on the interrupt queue is returned,
+ *	otherwise the next packet on the normal receive queue.  The X.25
+ *	header is stripped; if the X25_Q_BIT_FLAG option is set, one byte
+ *	holding the Q bit (always 0 for interrupt data) is put in front of
+ *	the data.  Returns the number of bytes copied or a negative errno.
+ */
+static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *msg, size_t size,
+		       int flags)
+{
+	struct sock *sk = sock->sk;
+	struct x25_sock *x25 = x25_sk(sk);
+	struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)msg->msg_name;
+	struct sk_buff *skb;
+	unsigned char *qbyte;
+	size_t copied;
+	int qbit;
+	int rc = -ENOTCONN;
+
+	lock_sock(sk);
+
+	/*
+	 * This works for seqpacket too. The receiver has ordered the queue for
+	 * us! We do one quick check first though
+	 */
+	if (sk->sk_state != TCP_ESTABLISHED)
+		goto out;
+
+	if (!(flags & MSG_OOB)) {
+		/* Normal data: wait for a packet without holding the lock */
+		release_sock(sk);
+		skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
+					flags & MSG_DONTWAIT, &rc);
+		lock_sock(sk);
+		if (!skb)
+			goto out;
+
+		qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT;
+
+		if (x25->neighbour->extended)
+			skb_pull(skb, X25_EXT_MIN_LEN);
+		else
+			skb_pull(skb, X25_STD_MIN_LEN);
+	} else {
+		rc = -EINVAL;
+		if (sock_flag(sk, SOCK_URGINLINE) ||
+		    !skb_peek(&x25->interrupt_in_queue))
+			goto out;
+
+		skb = skb_dequeue(&x25->interrupt_in_queue);
+		skb_pull(skb, X25_STD_MIN_LEN);
+
+		/* No Q bit information on Interrupt data. */
+		qbit = 0;
+		msg->msg_flags |= MSG_OOB;
+	}
+
+	/* Hand the Q bit to the user as a leading byte when asked to */
+	if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) {
+		qbyte  = skb_push(skb, 1);
+		*qbyte = qbit;
+	}
+
+	skb_reset_transport_header(skb);
+
+	copied = skb->len;
+	if (size < copied) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	/* Currently, each datagram always contains a complete record */
+	msg->msg_flags |= MSG_EOR;
+
+	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (rc)
+		goto out_free_dgram;
+
+	if (sx25) {
+		sx25->sx25_family = AF_X25;
+		sx25->sx25_addr   = x25->dest_addr;
+	}
+
+	msg->msg_namelen = sizeof(struct sockaddr_x25);
+
+	x25_check_rbuf(sk);
+	rc = copied;
+out_free_dgram:
+	skb_free_datagram(sk, skb);
+out:
+	release_sock(sk);
+	return rc;
+}
+
+
+/*
+ *	ioctl handler for X.25 sockets.
+ *
+ *	Handles the queue-size and timestamp requests, the routing and
+ *	subscription requests (which are passed on to x25_route_ioctl() and
+ *	x25_subscr_ioctl()) and the per-socket SIOCX25* get/set requests.
+ *	Returns 0 or a negative errno; requests not handled here yield
+ *	-ENOIOCTLCMD.
+ *
+ *	Every case that takes the socket lock must release it on all of its
+ *	exit paths before breaking out of the switch.
+ */
+static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	struct x25_sock *x25 = x25_sk(sk);
+	void __user *argp = (void __user *)arg;
+	int rc;
+
+	switch (cmd) {
+		case TIOCOUTQ: {
+			int amount;
+
+			amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
+			if (amount < 0)
+				amount = 0;
+			rc = put_user(amount, (unsigned int __user *)argp);
+			break;
+		}
+
+		case TIOCINQ: {
+			struct sk_buff *skb;
+			int amount = 0;
+			/*
+			 * These two are safe on a single CPU system as
+			 * only user tasks fiddle here
+			 */
+			lock_sock(sk);
+			if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
+				amount = skb->len;
+			release_sock(sk);
+			rc = put_user(amount, (unsigned int __user *)argp);
+			break;
+		}
+
+		case SIOCGSTAMP:
+			rc = -EINVAL;
+			if (sk)
+				rc = sock_get_timestamp(sk,
+						(struct timeval __user *)argp);
+			break;
+		case SIOCGSTAMPNS:
+			rc = -EINVAL;
+			if (sk)
+				rc = sock_get_timestampns(sk,
+						(struct timespec __user *)argp);
+			break;
+		case SIOCGIFADDR:
+		case SIOCSIFADDR:
+		case SIOCGIFDSTADDR:
+		case SIOCSIFDSTADDR:
+		case SIOCGIFBRDADDR:
+		case SIOCSIFBRDADDR:
+		case SIOCGIFNETMASK:
+		case SIOCSIFNETMASK:
+		case SIOCGIFMETRIC:
+		case SIOCSIFMETRIC:
+			rc = -EINVAL;
+			break;
+		case SIOCADDRT:
+		case SIOCDELRT:
+			rc = -EPERM;
+			if (!capable(CAP_NET_ADMIN))
+				break;
+			rc = x25_route_ioctl(cmd, argp);
+			break;
+		case SIOCX25GSUBSCRIP:
+			rc = x25_subscr_ioctl(cmd, argp);
+			break;
+		case SIOCX25SSUBSCRIP:
+			rc = -EPERM;
+			if (!capable(CAP_NET_ADMIN))
+				break;
+			rc = x25_subscr_ioctl(cmd, argp);
+			break;
+		case SIOCX25GFACILITIES: {
+			lock_sock(sk);
+			rc = copy_to_user(argp, &x25->facilities,
+						sizeof(x25->facilities))
+						? -EFAULT : 0;
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25SFACILITIES: {
+			struct x25_facilities facilities;
+			rc = -EFAULT;
+			if (copy_from_user(&facilities, argp,
+					   sizeof(facilities)))
+				break;
+			rc = -EINVAL;
+			lock_sock(sk);
+			/* facilities may only be changed before a call exists */
+			if (sk->sk_state != TCP_LISTEN &&
+			    sk->sk_state != TCP_CLOSE)
+				goto out_fac_release;
+			if (facilities.pacsize_in < X25_PS16 ||
+			    facilities.pacsize_in > X25_PS4096)
+				goto out_fac_release;
+			if (facilities.pacsize_out < X25_PS16 ||
+			    facilities.pacsize_out > X25_PS4096)
+				goto out_fac_release;
+			if (facilities.winsize_in < 1 ||
+			    facilities.winsize_in > 127)
+				goto out_fac_release;
+			if (facilities.throughput) {
+				/* high nibble and low nibble are checked separately */
+				int out = facilities.throughput & 0xf0;
+				int in  = facilities.throughput & 0x0f;
+				if (!out)
+					facilities.throughput |=
+						X25_DEFAULT_THROUGHPUT << 4;
+				else if (out < 0x30 || out > 0xD0)
+					goto out_fac_release;
+				if (!in)
+					facilities.throughput |=
+						X25_DEFAULT_THROUGHPUT;
+				else if (in < 0x03 || in > 0x0D)
+					goto out_fac_release;
+			}
+			if (facilities.reverse &&
+				(facilities.reverse & 0x81) != 0x81)
+				goto out_fac_release;
+			x25->facilities = facilities;
+			rc = 0;
+out_fac_release:
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25GDTEFACILITIES: {
+			lock_sock(sk);
+			rc = copy_to_user(argp, &x25->dte_facilities,
+						sizeof(x25->dte_facilities));
+			release_sock(sk);
+			if (rc)
+				rc = -EFAULT;
+			break;
+		}
+
+		case SIOCX25SDTEFACILITIES: {
+			struct x25_dte_facilities dtefacs;
+			rc = -EFAULT;
+			if (copy_from_user(&dtefacs, argp, sizeof(dtefacs)))
+				break;
+			rc = -EINVAL;
+			lock_sock(sk);
+			if (sk->sk_state != TCP_LISTEN &&
+					sk->sk_state != TCP_CLOSE)
+				goto out_dtefac_release;
+			/*
+			 * calling_ae and called_ae are arrays embedded in the
+			 * structure, so only their lengths need validating.
+			 */
+			if (dtefacs.calling_len > X25_MAX_AE_LEN)
+				goto out_dtefac_release;
+			if (dtefacs.called_len > X25_MAX_AE_LEN)
+				goto out_dtefac_release;
+			x25->dte_facilities = dtefacs;
+			rc = 0;
+out_dtefac_release:
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25GCALLUSERDATA: {
+			lock_sock(sk);
+			rc = copy_to_user(argp, &x25->calluserdata,
+					sizeof(x25->calluserdata))
+					? -EFAULT : 0;
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25SCALLUSERDATA: {
+			struct x25_calluserdata calluserdata;
+
+			rc = -EFAULT;
+			if (copy_from_user(&calluserdata, argp,
+					   sizeof(calluserdata)))
+				break;
+			rc = -EINVAL;
+			if (calluserdata.cudlength > X25_MAX_CUD_LEN)
+				break;
+			lock_sock(sk);
+			x25->calluserdata = calluserdata;
+			release_sock(sk);
+			rc = 0;
+			break;
+		}
+
+		case SIOCX25GCAUSEDIAG: {
+			lock_sock(sk);
+			rc = copy_to_user(argp, &x25->causediag,
+					sizeof(x25->causediag))
+					? -EFAULT : 0;
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25SCAUSEDIAG: {
+			struct x25_causediag causediag;
+			rc = -EFAULT;
+			if (copy_from_user(&causediag, argp, sizeof(causediag)))
+				break;
+			lock_sock(sk);
+			x25->causediag = causediag;
+			release_sock(sk);
+			rc = 0;
+			break;
+
+		}
+
+		case SIOCX25SCUDMATCHLEN: {
+			struct x25_subaddr sub_addr;
+			rc = -EINVAL;
+			lock_sock(sk);
+			if(sk->sk_state != TCP_CLOSE)
+				goto out_cud_release;
+			rc = -EFAULT;
+			if (copy_from_user(&sub_addr, argp,
+					sizeof(sub_addr)))
+				goto out_cud_release;
+			rc = -EINVAL;
+			if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
+				goto out_cud_release;
+			x25->cudmatchlength = sub_addr.cudmatchlength;
+			rc = 0;
+out_cud_release:
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25CALLACCPTAPPRV: {
+			rc = -EINVAL;
+			lock_sock(sk);
+			/* only allowed while the socket is still closed */
+			if (sk->sk_state == TCP_CLOSE) {
+				clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags);
+				rc = 0;
+			}
+			release_sock(sk);
+			break;
+		}
+
+		case SIOCX25SENDCALLACCPT:  {
+			rc = -EINVAL;
+			lock_sock(sk);
+			if (sk->sk_state != TCP_ESTABLISHED)
+				goto out_sendcallaccpt_release;
+			/* must call accptapprv above */
+			if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags))
+				goto out_sendcallaccpt_release;
+			x25_write_internal(sk, X25_CALL_ACCEPTED);
+			x25->state = X25_STATE_3;
+			rc = 0;
+out_sendcallaccpt_release:
+			release_sock(sk);
+			break;
+		}
+
+		default:
+			rc = -ENOIOCTLCMD;
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ *	Address family description for AF_X25; handed to sock_register() in
+ *	x25_init().  New sockets of this family are set up by x25_create().
+ */
+static const struct net_proto_family x25_family_ops = {
+	.family =	AF_X25,
+	.create =	x25_create,
+	.owner	=	THIS_MODULE,
+};
+
+#ifdef CONFIG_COMPAT
+/*
+ *	Compat variant of the subscription ioctl: read
+ *	(SIOCX25GSUBSCRIP) or update (any other @cmd) the "extended" setting
+ *	and the global facilities mask of the neighbour attached to the named
+ *	device, using the compat layout of the argument structure.
+ *
+ *	Returns 0 on success, -EFAULT if user memory cannot be accessed and
+ *	-EINVAL for an unknown device/neighbour or an invalid "extended"
+ *	value.
+ */
+static int compat_x25_subscr_ioctl(unsigned int cmd,
+		struct compat_x25_subscrip_struct __user *x25_subscr32)
+{
+	struct compat_x25_subscrip_struct x25_subscr;
+	struct net_device *dev;
+	struct x25_neigh *nb;
+	int rc;
+
+	if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32)))
+		return -EFAULT;
+
+	dev = x25_dev_get(x25_subscr.device);
+	if (dev == NULL)
+		return -EINVAL;
+
+	/* The device reference is only needed to look up the neighbour */
+	nb = x25_get_neigh(dev);
+	dev_put(dev);
+	if (nb == NULL)
+		return -EINVAL;
+
+	if (cmd == SIOCX25GSUBSCRIP) {
+		read_lock_bh(&x25_neigh_list_lock);
+		x25_subscr.extended = nb->extended;
+		x25_subscr.global_facil_mask = nb->global_facil_mask;
+		read_unlock_bh(&x25_neigh_list_lock);
+
+		if (copy_to_user(x25_subscr32, &x25_subscr,
+				 sizeof(*x25_subscr32)))
+			rc = -EFAULT;
+		else
+			rc = 0;
+	} else if (x25_subscr.extended == 0 || x25_subscr.extended == 1) {
+		write_lock_bh(&x25_neigh_list_lock);
+		nb->extended = x25_subscr.extended;
+		nb->global_facil_mask = x25_subscr.global_facil_mask;
+		write_unlock_bh(&x25_neigh_list_lock);
+		rc = 0;
+	} else {
+		rc = -EINVAL;
+	}
+
+	x25_neigh_put(nb);
+	return rc;
+}
+
+/*
+ *	ioctl entry point for compat callers.
+ *
+ *	The user pointer is converted with compat_ptr().  Timestamp and
+ *	subscription requests go to their compat-aware helpers; the remaining
+ *	supported requests are passed on to x25_ioctl() or x25_route_ioctl().
+ *	Unknown requests yield -ENOIOCTLCMD.
+ */
+static int compat_x25_ioctl(struct socket *sock, unsigned int cmd,
+				unsigned long arg)
+{
+	void __user *argp = compat_ptr(arg);
+	struct sock *sk = sock->sk;
+
+	switch (cmd) {
+	/* Requests handed unchanged to the native handler */
+	case TIOCOUTQ:
+	case TIOCINQ:
+	case SIOCX25GFACILITIES:
+	case SIOCX25SFACILITIES:
+	case SIOCX25GDTEFACILITIES:
+	case SIOCX25SDTEFACILITIES:
+	case SIOCX25GCALLUSERDATA:
+	case SIOCX25SCALLUSERDATA:
+	case SIOCX25GCAUSEDIAG:
+	case SIOCX25SCAUSEDIAG:
+	case SIOCX25SCUDMATCHLEN:
+	case SIOCX25CALLACCPTAPPRV:
+	case SIOCX25SENDCALLACCPT:
+		return x25_ioctl(sock, cmd, (unsigned long)argp);
+
+	case SIOCGSTAMP:
+		if (!sk)
+			return -EINVAL;
+		return compat_sock_get_timestamp(sk,
+				(struct timeval __user*)argp);
+
+	case SIOCGSTAMPNS:
+		if (!sk)
+			return -EINVAL;
+		return compat_sock_get_timestampns(sk,
+				(struct timespec __user*)argp);
+
+	/* Interface address requests are rejected */
+	case SIOCGIFADDR:
+	case SIOCSIFADDR:
+	case SIOCGIFDSTADDR:
+	case SIOCSIFDSTADDR:
+	case SIOCGIFBRDADDR:
+	case SIOCSIFBRDADDR:
+	case SIOCGIFNETMASK:
+	case SIOCSIFNETMASK:
+	case SIOCGIFMETRIC:
+	case SIOCSIFMETRIC:
+		return -EINVAL;
+
+	case SIOCADDRT:
+	case SIOCDELRT:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		return x25_route_ioctl(cmd, argp);
+
+	case SIOCX25SSUBSCRIP:
+		/* Changing the subscription needs CAP_NET_ADMIN */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		return compat_x25_subscr_ioctl(cmd, argp);
+
+	case SIOCX25GSUBSCRIP:
+		return compat_x25_subscr_ioctl(cmd, argp);
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+/*
+ *	Socket operations for AF_X25 sockets.  socketpair, shutdown, mmap and
+ *	sendpage are not implemented by X.25 and use the generic sock_no_*
+ *	stubs; poll uses the generic datagram_poll.  The compat ioctl entry
+ *	is only present when CONFIG_COMPAT is enabled.
+ */
+static const struct proto_ops x25_proto_ops = {
+	.family =	AF_X25,
+	.owner =	THIS_MODULE,
+	.release =	x25_release,
+	.bind =		x25_bind,
+	.connect =	x25_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	x25_accept,
+	.getname =	x25_getname,
+	.poll =		datagram_poll,
+	.ioctl =	x25_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = compat_x25_ioctl,
+#endif
+	.listen =	x25_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	x25_setsockopt,
+	.getsockopt =	x25_getsockopt,
+	.sendmsg =	x25_sendmsg,
+	.recvmsg =	x25_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/*
+ *	Packet handler for ETH_P_X25 frames: received frames are passed to
+ *	x25_lapb_receive_frame().  Added with dev_add_pack() in x25_init()
+ *	and removed with dev_remove_pack() in x25_exit().
+ */
+static struct packet_type x25_packet_type __read_mostly = {
+	.type =	cpu_to_be16(ETH_P_X25),
+	.func =	x25_lapb_receive_frame,
+};
+
+/*
+ *	Network device notifier; device events are delivered to
+ *	x25_device_event().  Registered in x25_init().
+ */
+static struct notifier_block x25_dev_notifier = {
+	.notifier_call = x25_device_event,
+};
+
+/*
+ *	Disconnect, with ENETUNREACH, every socket on x25_list that uses
+ *	neighbour @nb, then drop the call forwards that involve the
+ *	neighbour's device.
+ */
+void x25_kill_by_neigh(struct x25_neigh *nb)
+{
+	struct hlist_node *pos;
+	struct sock *sk;
+
+	write_lock_bh(&x25_list_lock);
+	sk_for_each(sk, pos, &x25_list) {
+		if (x25_sk(sk)->neighbour != nb)
+			continue;
+		x25_disconnect(sk, ENETUNREACH, 0, 0);
+	}
+	write_unlock_bh(&x25_list_lock);
+
+	/* Remove any related forwards */
+	x25_clear_forward_by_dev(nb->dev);
+}
+
+/*
+ *	Module initialisation.
+ *
+ *	Registers, in order: the protocol, the AF_X25 socket family, the
+ *	ETH_P_X25 packet handler, the netdevice notifier, the sysctl table
+ *	and the /proc entries.  If a step fails, everything registered before
+ *	it is undone in reverse order and the error is returned, so that
+ *	nothing keeps pointing into the module after a failed load.
+ */
+static int __init x25_init(void)
+{
+	int rc = proto_register(&x25_proto, 0);
+
+	if (rc != 0)
+		goto out;
+
+	rc = sock_register(&x25_family_ops);
+	if (rc != 0)
+		goto out_proto;
+
+	dev_add_pack(&x25_packet_type);
+
+	rc = register_netdevice_notifier(&x25_dev_notifier);
+	if (rc != 0)
+		goto out_sock;
+
+	printk(KERN_INFO "X.25 for Linux Version 0.2\n");
+
+	x25_register_sysctl();
+	rc = x25_proc_init();
+	if (rc != 0)
+		goto out_sysctl;
+out:
+	return rc;
+out_sysctl:
+	x25_unregister_sysctl();
+	unregister_netdevice_notifier(&x25_dev_notifier);
+out_sock:
+	/* the packet handler was added right after sock_register() */
+	dev_remove_pack(&x25_packet_type);
+	sock_unregister(AF_X25);
+out_proto:
+	proto_unregister(&x25_proto);
+	goto out;
+}
+module_init(x25_init);
+
+/*
+ *	Module unload: remove the /proc entries, free the link and route
+ *	state, then unregister the sysctl table, the netdevice notifier, the
+ *	packet handler, the socket family and the protocol.
+ */
+static void __exit x25_exit(void)
+{
+	x25_proc_exit();
+	x25_link_free();
+	x25_route_free();
+
+	x25_unregister_sysctl();
+
+	unregister_netdevice_notifier(&x25_dev_notifier);
+
+	dev_remove_pack(&x25_packet_type);
+
+	sock_unregister(AF_X25);
+	proto_unregister(&x25_proto);
+}
+module_exit(x25_exit);
+
+MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
+MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_X25);
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
new file mode 100644
index 00000000..d2efd29f
--- /dev/null
+++ b/net/x25/sysctl_net_x25.c
@@ -0,0 +1,90 @@
+/* -*- linux-c -*-
+ * sysctl_net_x25.c: sysctl interface to net X.25 subsystem.
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/x25 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/sysctl.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/init.h>
+#include <net/x25.h>
+
+/* Lower and upper bounds (1 * HZ and 300 * HZ) for the timeout sysctls below */
+static int min_timer[] = {   1 * HZ };
+static int max_timer[] = { 300 * HZ };
+
+/* Handle returned by register_sysctl_paths(), needed to unregister the table */
+static struct ctl_table_header *x25_table_header;
+
+/*
+ *	sysctl entries for X.25.  The five timeout entries are integers
+ *	handled by proc_dointvec_minmax with min_timer and max_timer as
+ *	bounds; x25_forward is a plain integer handled by proc_dointvec.
+ *	The table ends with a zeroed entry.
+ */
+static struct ctl_table x25_table[] = {
+	{
+		.procname =	"restart_request_timeout",
+		.data =		&sysctl_x25_restart_request_timeout,
+		.maxlen =	sizeof(int),
+		.mode =		0644,
+		.proc_handler =	proc_dointvec_minmax,
+		.extra1 =	&min_timer,
+		.extra2 =	&max_timer,
+	},
+	{
+		.procname =	"call_request_timeout",
+		.data =		&sysctl_x25_call_request_timeout,
+		.maxlen =	sizeof(int),
+		.mode =		0644,
+		.proc_handler =	proc_dointvec_minmax,
+		.extra1 =	&min_timer,
+		.extra2 =	&max_timer,
+	},
+	{
+		.procname =	"reset_request_timeout",
+		.data =		&sysctl_x25_reset_request_timeout,
+		.maxlen =	sizeof(int),
+		.mode =		0644,
+		.proc_handler =	proc_dointvec_minmax,
+		.extra1 =	&min_timer,
+		.extra2 =	&max_timer,
+	},
+	{
+		.procname =	"clear_request_timeout",
+		.data =		&sysctl_x25_clear_request_timeout,
+		.maxlen =	sizeof(int),
+		.mode =		0644,
+		.proc_handler =	proc_dointvec_minmax,
+		.extra1 =	&min_timer,
+		.extra2 =	&max_timer,
+	},
+	{
+		.procname =	"acknowledgement_hold_back_timeout",
+		.data =		&sysctl_x25_ack_holdback_timeout,
+		.maxlen =	sizeof(int),
+		.mode =		0644,
+		.proc_handler =	proc_dointvec_minmax,
+		.extra1 =	&min_timer,
+		.extra2 =	&max_timer,
+	},
+	{
+		.procname =	"x25_forward",
+		.data = 	&sysctl_x25_forward,
+		.maxlen = 	sizeof(int),
+		.mode = 	0644,
+		.proc_handler = proc_dointvec,
+	},
+	{ 0, },
+};
+
+/* sysctl path "net" / "x25" under which x25_table is registered */
+static struct ctl_path x25_path[] = {
+	{ .procname = "net", },
+	{ .procname = "x25", },
+	{ }
+};
+
+/*
+ *	Register x25_table under x25_path and remember the returned header
+ *	for x25_unregister_sysctl().  The result is not checked here.
+ */
+void __init x25_register_sysctl(void)
+{
+	x25_table_header = register_sysctl_paths(x25_path, x25_table);
+}
+
+/* Unregister the sysctl table registered by x25_register_sysctl() */
+void x25_unregister_sysctl(void)
+{
+	unregister_sysctl_table(x25_table_header);
+}
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
new file mode 100644
index 00000000..9005f6da
--- /dev/null
+++ b/net/x25/x25_dev.c
@@ -0,0 +1,224 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine, randomly fail to work with new
+ *	releases, misbehave and/or generally screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	Started coding.
+ *      2000-09-04	Henner Eisen	Prevent freeing a dangling skb.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/if_arp.h>
+#include <net/x25.h>
+#include <net/x25device.h>
+
+/*
+ *	Dispatch one X.25 packet received from neighbour @nb.
+ *
+ *	The packet goes, depending on its logical channel and frame type, to
+ *	the link control code, to the socket owning the channel, to the call
+ *	request handler or to the forwarding code.
+ *
+ *	Returns non-zero when the skb has been consumed (queued or freed) and
+ *	0 when the caller still owns it and has to free it.
+ */
+static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
+{
+	struct sock *sk;
+	unsigned short frametype;
+	unsigned int lci;
+
+	/*
+	 *	The LCI and frame type live in the first X25_STD_MIN_LEN
+	 *	bytes; a shorter packet cannot be parsed, so leave it to the
+	 *	caller to drop.
+	 */
+	if (!pskb_may_pull(skb, X25_STD_MIN_LEN))
+		return 0;
+
+	frametype = skb->data[2];
+	lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF);
+
+	/*
+	 *	LCI of zero is always for us, and its always a link control
+	 *	frame.
+	 */
+	if (lci == 0) {
+		x25_link_control(skb, nb, frametype);
+		return 0;
+	}
+
+	/*
+	 *	Find an existing socket.
+	 */
+	if ((sk = x25_find_socket(lci, nb)) != NULL) {
+		int queued = 1;
+
+		skb_reset_transport_header(skb);
+		bh_lock_sock(sk);
+		if (!sock_owned_by_user(sk)) {
+			queued = x25_process_rx_frame(sk, skb);
+		} else {
+			/* socket is busy in process context: defer the frame */
+			queued = !sk_add_backlog(sk, skb);
+		}
+		bh_unlock_sock(sk);
+		sock_put(sk);
+		return queued;
+	}
+
+	/*
+	 *	Is it a Call Request? If so process it.
+	 */
+	if (frametype == X25_CALL_REQUEST)
+		return x25_rx_call_request(skb, nb, lci);
+
+	/*
+	 * 	Its not a Call Request, nor is it a control frame.
+	 *	Can we forward it?
+	 */
+
+	if (x25_forward_data(lci, nb, skb)) {
+		if (frametype == X25_CLEAR_CONFIRMATION) {
+			x25_clear_forward_by_lci(lci);
+		}
+		kfree_skb(skb);
+		return 1;
+	}
+
+/*
+	x25_transmit_clear_request(nb, lci, 0x0D);
+*/
+
+	if (frametype != X25_CLEAR_CONFIRMATION)
+		printk(KERN_DEBUG "x25_receive_data(): unknown frame type %2x\n",frametype);
+
+	return 0;
+}
+
+/*
+ *	Packet handler for ETH_P_X25 frames.
+ *
+ *	Frames from outside init_net or from a device without an X.25
+ *	neighbour are dropped.  Otherwise a private copy of the frame is
+ *	made and its first byte selects the action: X25_IFACE_DATA passes
+ *	the rest of the frame to x25_receive_data(), X25_IFACE_CONNECT and
+ *	X25_IFACE_DISCONNECT report the link state change.
+ *
+ *	Always returns 0.  The skb is freed here unless x25_receive_data()
+ *	reported that it consumed it.
+ */
+int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
+			   struct packet_type *ptype, struct net_device *orig_dev)
+{
+	struct sk_buff *nskb;
+	struct x25_neigh *nb;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		goto drop;
+
+	nskb = skb_copy(skb, GFP_ATOMIC);
+	if (!nskb)
+		goto drop;
+	kfree_skb(skb);
+	skb = nskb;
+
+	/*
+	 * Packet received from unrecognised device, throw it away.
+	 */
+	nb = x25_get_neigh(dev);
+	if (!nb) {
+		printk(KERN_DEBUG "X.25: unknown neighbour - %s\n", dev->name);
+		goto drop;
+	}
+
+	/*
+	 * The interface type byte is read below; an empty frame has none,
+	 * so drop it.
+	 */
+	if (!pskb_may_pull(skb, 1)) {
+		x25_neigh_put(nb);
+		goto drop;
+	}
+
+	switch (skb->data[0]) {
+
+	case X25_IFACE_DATA:
+		skb_pull(skb, 1);
+		if (x25_receive_data(skb, nb)) {
+			/* skb was consumed, it must not be freed again */
+			x25_neigh_put(nb);
+			goto out;
+		}
+		break;
+
+	case X25_IFACE_CONNECT:
+		x25_link_established(nb);
+		break;
+
+	case X25_IFACE_DISCONNECT:
+		x25_link_terminated(nb);
+		break;
+	}
+	x25_neigh_put(nb);
+drop:
+	kfree_skb(skb);
+out:
+	return 0;
+}
+
+/*
+ *	Ask the device of neighbour @nb to bring the link up by queueing a
+ *	one byte X25_IFACE_CONNECT frame on it.  Nothing is sent for devices
+ *	that are not of type ARPHRD_X25.
+ */
+void x25_establish_link(struct x25_neigh *nb)
+{
+	struct sk_buff *skb;
+	unsigned char *ptr;
+
+	/* Only ARPHRD_X25 devices are sent a connect request */
+	if (nb->dev->type != ARPHRD_X25)
+		return;
+
+	skb = alloc_skb(1, GFP_ATOMIC);
+	if (skb == NULL) {
+		printk(KERN_ERR "x25_dev: out of memory\n");
+		return;
+	}
+
+	ptr  = skb_put(skb, 1);
+	*ptr = X25_IFACE_CONNECT;
+
+	skb->protocol = htons(ETH_P_X25);
+	skb->dev      = nb->dev;
+
+	dev_queue_xmit(skb);
+}
+
+/*
+ *	Ask the device of neighbour @nb to take the link down by queueing a
+ *	one byte X25_IFACE_DISCONNECT frame on it.  Nothing is sent for
+ *	devices that are not of type ARPHRD_X25.
+ */
+void x25_terminate_link(struct x25_neigh *nb)
+{
+	struct sk_buff *skb;
+	unsigned char *ptr;
+
+	switch (nb->dev->type) {
+	case ARPHRD_X25:
+		break;
+	default:
+		/* every other device type, ARPHRD_ETHER included */
+		return;
+	}
+
+	skb = alloc_skb(1, GFP_ATOMIC);
+	if (skb == NULL) {
+		printk(KERN_ERR "x25_dev: out of memory\n");
+		return;
+	}
+
+	ptr  = skb_put(skb, 1);
+	*ptr = X25_IFACE_DISCONNECT;
+
+	skb->dev      = nb->dev;
+	skb->protocol = htons(ETH_P_X25);
+
+	dev_queue_xmit(skb);
+}
+
+/*
+ *	Transmit an X.25 packet to neighbour @nb.  For ARPHRD_X25 devices an
+ *	X25_IFACE_DATA byte is put in front of the packet and the skb is
+ *	queued on the device; for any other device type the skb is freed.
+ */
+void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb)
+{
+	unsigned char *dptr;
+
+	skb_reset_network_header(skb);
+
+	/* Frames can only be sent on ARPHRD_X25 devices */
+	if (nb->dev->type != ARPHRD_X25) {
+		kfree_skb(skb);
+		return;
+	}
+
+	dptr  = skb_push(skb, 1);
+	*dptr = X25_IFACE_DATA;
+
+	skb->protocol = htons(ETH_P_X25);
+	skb->dev      = nb->dev;
+
+	dev_queue_xmit(skb);
+}
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
new file mode 100644
index 00000000..f77e4e75
--- /dev/null
+++ b/net/x25/x25_facilities.c
@@ -0,0 +1,346 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Split from x25_subr.c
+ *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
+ *					  negotiation.
+ *	apr/14/05	Shaun Pereira - Allow fast select with no restriction
+ *					on response.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/x25.h>
+
+/**
+ * x25_parse_facilities - Parse facilities from skb into the facilities structs
+ *
+ * @skb: sk_buff to parse
+ * @facilities: Regular facilities, updated as facilities are found
+ * @dte_facs: ITU DTE facilities, updated as DTE facilities are found
+ * @vc_fac_mask: mask is updated with all facilities found
+ *
+ * The first byte of @skb is the length of the facilities field and the
+ * facility entries follow it.  Every entry starts with a code byte whose
+ * class bits (X25_FAC_CLASS_MASK) give its size: class A has one parameter
+ * byte, class B two, class C three, and class D a length byte followed by
+ * that many parameter bytes.
+ *
+ * Return codes:
+ *  -1 - Parsing error, caller should drop call and clean up
+ *   0 - Parse OK, this skb has no facilities
+ *  >0 - Parse OK, returns the length of the facilities header
+ *
+ */
+int x25_parse_facilities(struct sk_buff *skb, struct x25_facilities *facilities,
+		struct x25_dte_facilities *dte_facs, unsigned long *vc_fac_mask)
+{
+	unsigned char *p = skb->data;
+	unsigned int len;
+
+	*vc_fac_mask = 0;
+
+	/*
+	 * The kernel knows which facilities were set on an incoming call but
+	 * currently this information is not available to userspace.  Here we
+	 * give userspace who read incoming call facilities 0 length to indicate
+	 * it wasn't set.
+	 */
+	dte_facs->calling_len = 0;
+	dte_facs->called_len = 0;
+	memset(dte_facs->called_ae, '\0', sizeof(dte_facs->called_ae));
+	memset(dte_facs->calling_ae, '\0', sizeof(dte_facs->calling_ae));
+
+	if (skb->len < 1)
+		return 0;
+
+	len = *p++;
+
+	/* The length byte plus the field itself must fit in the skb. */
+	if (len >= skb->len)
+		return -1;
+
+	while (len > 0) {
+		switch (*p & X25_FAC_CLASS_MASK) {
+		case X25_FAC_CLASS_A:
+			if (len < 2)
+				return -1;
+			switch (*p) {
+			case X25_FAC_REVERSE:
+				if((p[1] & 0x81) == 0x81) {
+					facilities->reverse = p[1] & 0x81;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x01) == 0x01) {
+					facilities->reverse = p[1] & 0x01;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x80) == 0x80) {
+					facilities->reverse = p[1] & 0x80;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if(p[1] == 0x00) {
+					facilities->reverse
+						= X25_DEFAULT_REVERSE;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				/*
+				 * NOTE(review): any other value falls through
+				 * and is recorded as a throughput facility.
+				 * Kept as is; confirm this is intended.
+				 */
+				/* fall through */
+			case X25_FAC_THROUGHPUT:
+				facilities->throughput = p[1];
+				*vc_fac_mask |= X25_MASK_THROUGHPUT;
+				break;
+			case X25_MARKER:
+				break;
+			default:
+				printk(KERN_DEBUG "X.25: unknown facility "
+				       "%02X, value %02X\n",
+				       p[0], p[1]);
+				break;
+			}
+			p   += 2;
+			len -= 2;
+			break;
+		case X25_FAC_CLASS_B:
+			if (len < 3)
+				return -1;
+			switch (*p) {
+			case X25_FAC_PACKET_SIZE:
+				facilities->pacsize_in  = p[1];
+				facilities->pacsize_out = p[2];
+				*vc_fac_mask |= X25_MASK_PACKET_SIZE;
+				break;
+			case X25_FAC_WINDOW_SIZE:
+				facilities->winsize_in  = p[1];
+				facilities->winsize_out = p[2];
+				*vc_fac_mask |= X25_MASK_WINDOW_SIZE;
+				break;
+			default:
+				printk(KERN_DEBUG "X.25: unknown facility "
+				       "%02X, values %02X, %02X\n",
+				       p[0], p[1], p[2]);
+				break;
+			}
+			p   += 3;
+			len -= 3;
+			break;
+		case X25_FAC_CLASS_C:
+			if (len < 4)
+				return -1;
+			printk(KERN_DEBUG "X.25: unknown facility %02X, "
+			       "values %02X, %02X, %02X\n",
+			       p[0], p[1], p[2], p[3]);
+			p   += 4;
+			len -= 4;
+			break;
+		case X25_FAC_CLASS_D:
+			/*
+			 * p[1] is the entry's own length byte, so make sure
+			 * it lies inside the facilities field before reading
+			 * it.
+			 */
+			if (len < 2 || len < p[1] + 2)
+				return -1;
+			switch (*p) {
+			case X25_FAC_CALLING_AE:
+				if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
+					return -1;
+				/*
+				 * p[2] counts semi-octets and is used by
+				 * x25_create_facilities() to size a copy out
+				 * of calling_ae, so it must not describe more
+				 * than the array can hold.
+				 */
+				if (p[2] > 2 * sizeof(dte_facs->calling_ae))
+					return -1;
+				dte_facs->calling_len = p[2];
+				memcpy(dte_facs->calling_ae, &p[3], p[1] - 1);
+				*vc_fac_mask |= X25_MASK_CALLING_AE;
+				break;
+			case X25_FAC_CALLED_AE:
+				if (p[1] > X25_MAX_DTE_FACIL_LEN || p[1] <= 1)
+					return -1;
+				/* Same bound as for the calling address. */
+				if (p[2] > 2 * sizeof(dte_facs->called_ae))
+					return -1;
+				dte_facs->called_len = p[2];
+				memcpy(dte_facs->called_ae, &p[3], p[1] - 1);
+				*vc_fac_mask |= X25_MASK_CALLED_AE;
+				break;
+			default:
+				printk(KERN_DEBUG "X.25: unknown facility %02X,"
+					"length %d\n", p[0], p[1]);
+				break;
+			}
+			len -= p[1] + 2;
+			p += p[1] + 2;
+			break;
+		}
+	}
+
+	return p - skb->data;
+}
+
+/*
+ *	Build a facilities field in @buffer from the given settings.
+ *
+ *	buffer[0] receives the length of the entries that follow it.  A
+ *	facility is written only when its bit is set in @facil_mask and its
+ *	value is non-zero.  Returns the number of bytes written, including
+ *	the leading length byte.
+ */
+int x25_create_facilities(unsigned char *buffer,
+		struct x25_facilities *facilities,
+		struct x25_dte_facilities *dte_facs, unsigned long facil_mask)
+{
+	unsigned char *out = buffer + 1;
+	unsigned int ae_bytes;
+	int total;
+
+	/*
+	 * Nothing enabled: the facilities field of the call_req or
+	 * call_accept packet is just its one byte length, set to zero.
+	 */
+	if (facil_mask == 0) {
+		buffer[0] = 0;
+		return 1;
+	}
+
+	if ((facil_mask & X25_MASK_REVERSE) && facilities->reverse) {
+		out[0] = X25_FAC_REVERSE;
+		out[1] = facilities->reverse;
+		out += 2;
+	}
+
+	if ((facil_mask & X25_MASK_THROUGHPUT) && facilities->throughput) {
+		out[0] = X25_FAC_THROUGHPUT;
+		out[1] = facilities->throughput;
+		out += 2;
+	}
+
+	/* A missing direction is filled in from the other direction. */
+	if ((facil_mask & X25_MASK_PACKET_SIZE) &&
+	    (facilities->pacsize_in || facilities->pacsize_out)) {
+		out[0] = X25_FAC_PACKET_SIZE;
+		out[1] = facilities->pacsize_in ? facilities->pacsize_in :
+						  facilities->pacsize_out;
+		out[2] = facilities->pacsize_out ? facilities->pacsize_out :
+						   facilities->pacsize_in;
+		out += 3;
+	}
+
+	if ((facil_mask & X25_MASK_WINDOW_SIZE) &&
+	    (facilities->winsize_in || facilities->winsize_out)) {
+		out[0] = X25_FAC_WINDOW_SIZE;
+		out[1] = facilities->winsize_in ? facilities->winsize_in :
+						  facilities->winsize_out;
+		out[2] = facilities->winsize_out ? facilities->winsize_out :
+						   facilities->winsize_in;
+		out += 3;
+	}
+
+	/* The DTE facilities are introduced by a marker entry. */
+	if (facil_mask & (X25_MASK_CALLING_AE|X25_MASK_CALLED_AE)) {
+		out[0] = X25_MARKER;
+		out[1] = X25_DTE_SERVICES;
+		out += 2;
+	}
+
+	if ((facil_mask & X25_MASK_CALLING_AE) && dte_facs->calling_len) {
+		/* Length is in semi-octets: two per byte, rounded up. */
+		ae_bytes = (dte_facs->calling_len + 1) / 2;
+		out[0] = X25_FAC_CALLING_AE;
+		out[1] = 1 + ae_bytes;
+		out[2] = dte_facs->calling_len;
+		memcpy(out + 3, dte_facs->calling_ae, ae_bytes);
+		out += 3 + ae_bytes;
+	}
+
+	if ((facil_mask & X25_MASK_CALLED_AE) && dte_facs->called_len) {
+		ae_bytes = (dte_facs->called_len + 1) / 2;
+		out[0] = X25_FAC_CALLED_AE;
+		out[1] = 1 + ae_bytes;
+		out[2] = dte_facs->called_len;
+		memcpy(out + 3, dte_facs->called_ae, ae_bytes);
+		out += 3 + ae_bytes;
+	}
+
+	total     = out - buffer;
+	buffer[0] = total - 1;
+
+	return total;
+}
+
+/*
+ *	Work out the facilities to use for a call from our own settings and
+ *	the ones the peer put in @skb.
+ *
+ *	@new starts as a copy of the socket's facilities and is then adjusted
+ *	by what the peer asked for.  Returns the value of
+ *	x25_parse_facilities() on success, or a negative value when parsing
+ *	fails or the reverse charging request is rejected.
+ */
+int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
+		struct x25_facilities *new, struct x25_dte_facilities *dte)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	struct x25_facilities *ours = &x25->facilities;
+	struct x25_facilities theirs;
+	int len;
+
+	memcpy(new, ours, sizeof(*new));
+	memset(&theirs, 0, sizeof(theirs));
+
+	len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask);
+	if (len < 0)
+		return len;
+
+	/*
+	 *	The call is refused when bit 0x01 is set in both their and
+	 *	our reverse field.
+	 */
+	if ((ours->reverse & 0x01) && (theirs.reverse & 0x01)) {
+		SOCK_DEBUG(sk, "X.25: rejecting reverse charging request\n");
+		return -1;
+	}
+
+	new->reverse = theirs.reverse;
+
+	/*
+	 *	Throughput: low nibble is inbound, high nibble outbound.
+	 *	Take their value where we have none or theirs is lower.
+	 */
+	if (theirs.throughput) {
+		int our_in    = ours->throughput & 0x0f;
+		int our_out   = ours->throughput & 0xf0;
+		int their_in  = theirs.throughput & 0x0f;
+		int their_out = theirs.throughput & 0xf0;
+
+		if (!our_in || their_in < our_in) {
+			SOCK_DEBUG(sk, "X.25: inbound throughput negotiated\n");
+			new->throughput = (new->throughput & 0xf0) | their_in;
+		}
+		if (!our_out || their_out < our_out) {
+			SOCK_DEBUG(sk,
+				"X.25: outbound throughput negotiated\n");
+			new->throughput = (new->throughput & 0x0f) | their_out;
+		}
+	}
+
+	/* Packet sizes may only be negotiated downwards. */
+	if (theirs.pacsize_in && theirs.pacsize_out) {
+		if (ours->pacsize_in > theirs.pacsize_in) {
+			SOCK_DEBUG(sk, "X.25: packet size inwards negotiated down\n");
+			new->pacsize_in = theirs.pacsize_in;
+		}
+		if (ours->pacsize_out > theirs.pacsize_out) {
+			SOCK_DEBUG(sk, "X.25: packet size outwards negotiated down\n");
+			new->pacsize_out = theirs.pacsize_out;
+		}
+	}
+
+	/* Likewise for window sizes. */
+	if (theirs.winsize_in && theirs.winsize_out) {
+		if (ours->winsize_in > theirs.winsize_in) {
+			SOCK_DEBUG(sk, "X.25: window size inwards negotiated down\n");
+			new->winsize_in = theirs.winsize_in;
+		}
+		if (ours->winsize_out > theirs.winsize_out) {
+			SOCK_DEBUG(sk, "X.25: window size outwards negotiated down\n");
+			new->winsize_out = theirs.winsize_out;
+		}
+	}
+
+	return len;
+}
+
+/*
+ *	Clamp facilities to what the attached x25 link can do: when the
+ *	neighbour is not in extended mode neither window size may exceed 7.
+ */
+void x25_limit_facilities(struct x25_facilities *facilities,
+			  struct x25_neigh *nb)
+{
+	/* Extended links need no clamping. */
+	if (nb->extended)
+		return;
+
+	if (facilities->winsize_in > 7) {
+		printk(KERN_DEBUG "X.25: incoming winsize limited to 7\n");
+		facilities->winsize_in = 7;
+	}
+
+	if (facilities->winsize_out > 7) {
+		facilities->winsize_out = 7;
+		printk(KERN_DEBUG "X.25: outgoing winsize limited to 7\n");
+	}
+}
diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c
new file mode 100644
index 00000000..c541b622
--- /dev/null
+++ b/net/x25/x25_forward.c
@@ -0,0 +1,167 @@
+/*
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	03-01-2007	Added forwarding for x.25	Andrew Hendry
+ */
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/x25.h>
+
+LIST_HEAD(x25_forward_list);
+DEFINE_RWLOCK(x25_forward_list_lock);
+
+/*
+ *	Try to forward a call request arriving from @from towards @dest_addr.
+ *
+ *	Returns 1 when a clone of @skb was passed on to the neighbour of the
+ *	routed device, -ENOMEM when the forwarding record cannot be
+ *	allocated, and 0 in every other case (no route, no neighbour, the
+ *	route leads back to the incoming device, or the clone failed).
+ *	@skb itself is never consumed.
+ */
+int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from,
+			struct sk_buff *skb, int lci)
+{
+	struct x25_route *rt;
+	struct x25_neigh *neigh_new = NULL;
+	struct x25_forward *cur, *new_frwd;
+	struct sk_buff *skbn;
+	short same_lci = 0;
+	int rc = 0;
+
+	rt = x25_get_route(dest_addr);
+	if (rt == NULL)
+		goto out_no_route;
+
+	/* This shouldn't happen, if it occurs somehow
+	 * do something sensible
+	 */
+	neigh_new = x25_get_neigh(rt->dev);
+	if (neigh_new == NULL)
+		goto out_put_route;
+
+	/* Avoid a loop. This is the normal exit path for a
+	 * system with only one x.25 iface and default route
+	 */
+	if (rt->dev == from->dev)
+		goto out_put_nb;
+
+	/* Remote end sending a call request on an already
+	 * established LCI? It shouldn't happen, just in case..
+	 */
+	read_lock_bh(&x25_forward_list_lock);
+	list_for_each_entry(cur, &x25_forward_list, node) {
+		if (cur->lci != lci)
+			continue;
+		printk(KERN_WARNING "X.25: call request for lci which is already registered!, transmitting but not registering new pair\n");
+		same_lci = 1;
+	}
+	read_unlock_bh(&x25_forward_list_lock);
+
+	/* Save the forwarding details for future traffic */
+	if (!same_lci) {
+		new_frwd = kmalloc(sizeof(struct x25_forward), GFP_ATOMIC);
+		if (new_frwd == NULL) {
+			rc = -ENOMEM;
+			goto out_put_nb;
+		}
+		new_frwd->lci  = lci;
+		new_frwd->dev1 = rt->dev;
+		new_frwd->dev2 = from->dev;
+
+		write_lock_bh(&x25_forward_list_lock);
+		list_add(&new_frwd->node, &x25_forward_list);
+		write_unlock_bh(&x25_forward_list_lock);
+	}
+
+	/* Forward the call request */
+	skbn = skb_clone(skb, GFP_ATOMIC);
+	if (skbn == NULL)
+		goto out_put_nb;
+
+	x25_transmit_link(skbn, neigh_new);
+	rc = 1;
+
+out_put_nb:
+	x25_neigh_put(neigh_new);
+
+out_put_route:
+	x25_route_put(rt);
+
+out_no_route:
+	return rc;
+}
+
+
+/*
+ *	Forward traffic for a call that was registered for forwarding.
+ *
+ *	Looks up the record for @lci, picks the device on the other side from
+ *	@from and passes a copy of @skb to that device's neighbour.  Returns
+ *	1 when a copy was handed on and 0 otherwise; @skb itself is never
+ *	consumed.
+ */
+int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb)
+{
+	struct x25_forward *frwd;
+	struct net_device *peer = NULL;
+	struct x25_neigh *nb;
+	struct sk_buff *skbn;
+	int rc = 0;
+
+	read_lock_bh(&x25_forward_list_lock);
+	list_for_each_entry(frwd, &x25_forward_list, node) {
+		if (frwd->lci != lci)
+			continue;
+		/* The call is established, either side can send */
+		peer = (from->dev == frwd->dev1) ? frwd->dev2 : frwd->dev1;
+		break;
+	}
+	read_unlock_bh(&x25_forward_list_lock);
+
+	nb = x25_get_neigh(peer);
+	if (nb == NULL)
+		return rc;
+
+	skbn = pskb_copy(skb, GFP_ATOMIC);
+	if (skbn != NULL) {
+		x25_transmit_link(skbn, nb);
+		rc = 1;
+	}
+
+	x25_neigh_put(nb);
+	return rc;
+}
+
+/*
+ *	Drop and free every forwarding record registered for @lci.
+ */
+void x25_clear_forward_by_lci(unsigned int lci)
+{
+	struct x25_forward *fwd, *next;
+
+	write_lock_bh(&x25_forward_list_lock);
+
+	/* The _safe iterator is needed because entries are unlinked. */
+	list_for_each_entry_safe(fwd, next, &x25_forward_list, node) {
+		if (fwd->lci != lci)
+			continue;
+		list_del(&fwd->node);
+		kfree(fwd);
+	}
+
+	write_unlock_bh(&x25_forward_list_lock);
+}
+
+
+/*
+ *	Drop and free every forwarding record that has @dev on either side.
+ */
+void x25_clear_forward_by_dev(struct net_device *dev)
+{
+	struct x25_forward *fwd, *next;
+
+	write_lock_bh(&x25_forward_list_lock);
+
+	/* The _safe iterator is needed because entries are unlinked. */
+	list_for_each_entry_safe(fwd, next, &x25_forward_list, node) {
+		if (fwd->dev1 != dev && fwd->dev2 != dev)
+			continue;
+		list_del(&fwd->node);
+		kfree(fwd);
+	}
+
+	write_unlock_bh(&x25_forward_list_lock);
+}
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
new file mode 100644
index 00000000..15de65f0
--- /dev/null
+++ b/net/x25/x25_in.c
@@ -0,0 +1,384 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	  Started coding.
+ *	X.25 002	Jonathan Naylor	  Centralised disconnection code.
+ *					  New timer architecture.
+ *	2000-03-20	Daniela Squassoni Disabling/enabling of facilities
+ *					  negotiation.
+ *	2000-11-10	Henner Eisen	  Check and reset for out-of-sequence
+ *					  i-frames.
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/x25.h>
+
+/*
+ *	Deliver one received data packet to the socket, reassembling packet
+ *	sequences first.
+ *
+ *	While @more is set the packet is parked on x25->fragment_queue.  The
+ *	packet that ends a sequence is merged with the parked fragments into
+ *	a single new skb: the first fragment is copied whole and the packet
+ *	header (extended or standard length) is pulled off each later one.
+ *
+ *	Returns 0 when @skb has been consumed.  Returns 1 when the merge
+ *	buffer cannot be allocated; @skb then still belongs to the caller.
+ */
+static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
+{
+	struct sk_buff *skbo, *skbn = skb;
+	struct x25_sock *x25 = x25_sk(sk);
+
+	if (more) {
+		x25->fraglen += skb->len;
+		skb_queue_tail(&x25->fragment_queue, skb);
+		skb_set_owner_r(skb, sk);
+		return 0;
+	}
+
+	if (x25->fraglen > 0) {	/* End of fragment */
+		int len = x25->fraglen + skb->len;
+
+		skbn = alloc_skb(len, GFP_ATOMIC);
+		if (skbn == NULL) {
+			/*
+			 * Do not free skb here.  A non-zero return makes
+			 * x25_state3_machine() report the frame as not
+			 * queued and x25_backlog_rcv() then frees it, so
+			 * freeing it here as well would be a double free.
+			 */
+			return 1;
+		}
+
+		skb_queue_tail(&x25->fragment_queue, skb);
+
+		skb_reset_transport_header(skbn);
+
+		/* The first fragment keeps its packet header. */
+		skbo = skb_dequeue(&x25->fragment_queue);
+		skb_copy_from_linear_data(skbo, skb_put(skbn, skbo->len),
+					  skbo->len);
+		kfree_skb(skbo);
+
+		/* Later fragments contribute their payload only. */
+		while ((skbo =
+			skb_dequeue(&x25->fragment_queue)) != NULL) {
+			skb_pull(skbo, (x25->neighbour->extended) ?
+					X25_EXT_MIN_LEN : X25_STD_MIN_LEN);
+			skb_copy_from_linear_data(skbo,
+						  skb_put(skbn, skbo->len),
+						  skbo->len);
+			kfree_skb(skbo);
+		}
+
+		x25->fraglen = 0;
+	}
+
+	skb_set_owner_r(skbn, sk);
+	skb_queue_tail(&sk->sk_receive_queue, skbn);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skbn->len);
+
+	return 0;
+}
+
+/*
+ * State machine for state 1, Awaiting Call Accepted State.
+ * The handling of the timer(s) is in file x25_timer.c.
+ * Handling of state 0 and connection release is in af_x25.c.
+ *
+ * A Call Accepted frame moves the socket to state 3 and its address block,
+ * facilities and call user data are parsed out of @skb.  If any of those
+ * cannot be parsed, or the call user data does not fit the socket's buffer,
+ * a Clear Request is sent and the socket moves to state 2 instead.
+ *
+ * Always returns 0: the frame is never queued.
+ */
+static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	struct x25_address source_addr, dest_addr;
+	int len;
+	struct x25_sock *x25 = x25_sk(sk);
+
+	switch (frametype) {
+		case X25_CALL_ACCEPTED: {
+
+			x25_stop_timer(sk);
+			x25->condition = 0x00;
+			x25->vs        = 0;
+			x25->va        = 0;
+			x25->vr        = 0;
+			x25->vl        = 0;
+			x25->state     = X25_STATE_3;
+			sk->sk_state   = TCP_ESTABLISHED;
+			/*
+			 *	Parse the data in the frame.
+			 */
+			skb_pull(skb, X25_STD_MIN_LEN);
+
+			len = x25_parse_address_block(skb, &source_addr,
+						&dest_addr);
+			if (len > 0)
+				skb_pull(skb, len);
+			else if (len < 0)
+				goto out_clear;
+
+			len = x25_parse_facilities(skb, &x25->facilities,
+						&x25->dte_facilities,
+						&x25->vc_facil_mask);
+			if (len > 0)
+				skb_pull(skb, len);
+			else if (len < 0)
+				goto out_clear;
+			/*
+			 *	Copy any Call User Data.
+			 */
+			if (skb->len > 0) {
+				/*
+				 * The length comes from the peer: refuse
+				 * anything that would overrun cuddata.
+				 */
+				if (skb->len >
+				    sizeof(x25->calluserdata.cuddata))
+					goto out_clear;
+
+				skb_copy_from_linear_data(skb,
+					      x25->calluserdata.cuddata,
+					      skb->len);
+				x25->calluserdata.cudlength = skb->len;
+			}
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_state_change(sk);
+			break;
+		}
+		case X25_CLEAR_REQUEST:
+			x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+			x25_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]);
+			break;
+
+		default:
+			break;
+	}
+
+	return 0;
+
+out_clear:
+	x25_write_internal(sk, X25_CLEAR_REQUEST);
+	x25->state = X25_STATE_2;
+	x25_start_t23timer(sk);
+	return 0;
+}
+
+/*
+ * State machine for state 2, Awaiting Clear Confirmation State.
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ *
+ * Either a Clear Request (which is confirmed first) or a Clear Confirmation
+ * disconnects the socket; every other frame is ignored.  Always returns 0.
+ */
+static int x25_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	if (frametype == X25_CLEAR_REQUEST) {
+		x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+		x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+	} else if (frametype == X25_CLEAR_CONFIRMATION) {
+		x25_disconnect(sk, 0, 0, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * Give up on the current data transfer: flush the queues, send a Reset
+ * Request, start T22, zero the sequence variables and move to state 4.
+ */
+static void x25_state3_request_reset(struct sock *sk, struct x25_sock *x25)
+{
+	x25_clear_queues(sk);
+	x25_write_internal(sk, X25_RESET_REQUEST);
+	x25_start_t22timer(sk);
+	x25->condition = 0x00;
+	x25->vs        = 0;
+	x25->vr        = 0;
+	x25->va        = 0;
+	x25->vl        = 0;
+	x25->state     = X25_STATE_4;
+}
+
+/*
+ * State machine for state 3, Connected State.
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ *
+ * Returns 1 when @skb has been queued on the socket and 0 when the caller
+ * still owns it.
+ */
+static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+	int queued = 0;
+
+	switch (frametype) {
+	case X25_RESET_REQUEST:
+		x25_write_internal(sk, X25_RESET_CONFIRMATION);
+		x25_stop_timer(sk);
+		x25->condition = 0x00;
+		x25->vs        = 0;
+		x25->vr        = 0;
+		x25->va        = 0;
+		x25->vl        = 0;
+		x25_requeue_frames(sk);
+		break;
+
+	case X25_CLEAR_REQUEST:
+		x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+		x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+		break;
+
+	case X25_RR:
+	case X25_RNR:
+		if (!x25_validate_nr(sk, nr)) {
+			x25_state3_request_reset(sk, x25);
+			break;
+		}
+		x25_frames_acked(sk, nr);
+		/* RNR marks the peer busy, RR clears that again. */
+		if (frametype == X25_RNR)
+			x25->condition |= X25_COND_PEER_RX_BUSY;
+		else
+			x25->condition &= ~X25_COND_PEER_RX_BUSY;
+		break;
+
+	case X25_DATA:	/* XXX */
+		x25->condition &= ~X25_COND_PEER_RX_BUSY;
+		/* Out of sequence or bad acknowledgement: reset. */
+		if (ns != x25->vr || !x25_validate_nr(sk, nr)) {
+			x25_state3_request_reset(sk, x25);
+			break;
+		}
+		x25_frames_acked(sk, nr);
+		if (ns == x25->vr) {
+			if (x25_queue_rx_frame(sk, skb, m) != 0) {
+				/* Should never happen */
+				x25_state3_request_reset(sk, x25);
+				break;
+			}
+			x25->vr = (x25->vr + 1) % modulus;
+			queued = 1;
+			if (atomic_read(&sk->sk_rmem_alloc) >
+			    (sk->sk_rcvbuf >> 1))
+				x25->condition |= X25_COND_OWN_RX_BUSY;
+		}
+		/*
+		 *	If the window is full Ack it immediately, else
+		 *	start the holdback timer.
+		 */
+		if (((x25->vl + x25->facilities.winsize_in) % modulus) == x25->vr) {
+			x25->condition &= ~X25_COND_ACK_PENDING;
+			x25_stop_timer(sk);
+			x25_enquiry_response(sk);
+		} else {
+			x25->condition |= X25_COND_ACK_PENDING;
+			x25_start_t2timer(sk);
+		}
+		break;
+
+	case X25_INTERRUPT_CONFIRMATION:
+		clear_bit(X25_INTERRUPT_FLAG, &x25->flags);
+		break;
+
+	case X25_INTERRUPT:
+		if (sock_flag(sk, SOCK_URGINLINE)) {
+			queued = !sock_queue_rcv_skb(sk, skb);
+		} else {
+			skb_set_owner_r(skb, sk);
+			skb_queue_tail(&x25->interrupt_in_queue, skb);
+			queued = 1;
+		}
+		sk_send_sigurg(sk);
+		x25_write_internal(sk, X25_INTERRUPT_CONFIRMATION);
+		break;
+
+	default:
+		printk(KERN_WARNING "x25: unknown %02X in state 3\n", frametype);
+		break;
+	}
+
+	return queued;
+}
+
+/*
+ * State machine for state 4, Awaiting Reset Confirmation State.
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ *
+ * A Reset Request is confirmed and then treated like a Reset Confirmation:
+ * the sequence variables are zeroed and the socket returns to state 3.
+ * Always returns 0.
+ */
+static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+	struct x25_sock *x25;
+
+	switch (frametype) {
+	case X25_RESET_REQUEST:
+		x25_write_internal(sk, X25_RESET_CONFIRMATION);
+		/* fall through */
+	case X25_RESET_CONFIRMATION:
+		x25 = x25_sk(sk);
+
+		x25_stop_timer(sk);
+		x25->condition = 0x00;
+		x25->va        = 0;
+		x25->vr        = 0;
+		x25->vs        = 0;
+		x25->vl        = 0;
+		x25->state     = X25_STATE_3;
+		x25_requeue_frames(sk);
+		break;
+
+	case X25_CLEAR_REQUEST:
+		x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+		x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Higher level upcall for a LAPB frame.
+ *
+ * Decodes the frame and runs the state machine that matches the socket's
+ * current state, then kicks the socket.  Returns non-zero when @skb was
+ * queued and 0 when the caller still owns it.  Nothing is done for a
+ * socket in state 0.
+ */
+int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	int frametype, ns, nr, q, d, m;
+	int queued;
+
+	if (x25->state == X25_STATE_0)
+		return 0;
+
+	frametype = x25_decode(sk, skb, &ns, &nr, &q, &d, &m);
+
+	switch (x25->state) {
+	case X25_STATE_1:
+		queued = x25_state1_machine(sk, skb, frametype);
+		break;
+	case X25_STATE_2:
+		queued = x25_state2_machine(sk, skb, frametype);
+		break;
+	case X25_STATE_3:
+		queued = x25_state3_machine(sk, skb, frametype, ns, nr, q, d, m);
+		break;
+	case X25_STATE_4:
+		queued = x25_state4_machine(sk, skb, frametype);
+		break;
+	default:
+		queued = 0;
+		break;
+	}
+
+	x25_kick(sk);
+
+	return queued;
+}
+
+/*
+ * Process a frame for the socket and free it unless the state machine
+ * queued it.  Always returns 0.
+ */
+int x25_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	if (x25_process_rx_frame(sk, skb) == 0)
+		kfree_skb(skb);
+
+	return 0;
+}
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
new file mode 100644
index 00000000..21306928
--- /dev/null
+++ b/net/x25/x25_link.c
@@ -0,0 +1,407 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	  Started coding.
+ *	X.25 002	Jonathan Naylor	  New timer architecture.
+ *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
+ *					  negotiation.
+ *	2000-09-04	Henner Eisen	  dev_hold() / dev_put() for x25_neigh.
+ */
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <net/x25.h>
+
+LIST_HEAD(x25_neigh_list);
+DEFINE_RWLOCK(x25_neigh_list_lock);
+
+static void x25_t20timer_expiry(unsigned long);
+
+static void x25_transmit_restart_confirmation(struct x25_neigh *nb);
+static void x25_transmit_restart_request(struct x25_neigh *nb);
+
+/*
+ *	Helpers around the per neighbour T20 timer.
+ */
+
+/* (Re)arm T20 to fire nb->t20 jiffies from now. */
+static inline void x25_start_t20timer(struct x25_neigh *nb)
+{
+	unsigned long expires = jiffies + nb->t20;
+
+	mod_timer(&nb->t20timer, expires);
+}
+
+/* T20 ran out: send another Restart Request and wait again. */
+static void x25_t20timer_expiry(unsigned long param)
+{
+	struct x25_neigh *neigh = (struct x25_neigh *)param;
+
+	x25_transmit_restart_request(neigh);
+	x25_start_t20timer(neigh);
+}
+
+/* Cancel T20 if it is armed. */
+static inline void x25_stop_t20timer(struct x25_neigh *nb)
+{
+	del_timer(&nb->t20timer);
+}
+
+/* Non-zero while T20 is armed. */
+static inline int x25_t20timer_pending(struct x25_neigh *nb)
+{
+	return timer_pending(&nb->t20timer);
+}
+
+/*
+ *	This handles all restart and diagnostic frames.
+ *
+ *	A Restart Request or Restart Confirmation stops T20 and moves the
+ *	link to state 3; a Restart Request is answered with a confirmation
+ *	only when T20 was not pending.  Whenever the link is in state 3 on
+ *	the way out, the frames held on nb->queue are transmitted.
+ */
+void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb,
+		      unsigned short frametype)
+{
+	struct sk_buff *skbn;
+	int confirm;
+
+	switch (frametype) {
+		case X25_RESTART_REQUEST:
+			confirm = !x25_t20timer_pending(nb);
+			x25_stop_t20timer(nb);
+			nb->state = X25_LINK_STATE_3;
+			if (confirm)
+				x25_transmit_restart_confirmation(nb);
+			break;
+
+		case X25_RESTART_CONFIRMATION:
+			x25_stop_t20timer(nb);
+			nb->state = X25_LINK_STATE_3;
+			break;
+
+		case X25_DIAGNOSTIC:
+			/*
+			 * The message prints data[3]..data[6]; skip it when
+			 * the frame is too short to contain those bytes.
+			 */
+			if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 4))
+				break;
+
+			printk(KERN_WARNING "x25: diagnostic #%d - "
+			       "%02X %02X %02X\n",
+			       skb->data[3], skb->data[4],
+			       skb->data[5], skb->data[6]);
+			break;
+
+		default:
+			printk(KERN_WARNING "x25: received unknown %02X "
+			       "with LCI 000\n", frametype);
+			break;
+	}
+
+	if (nb->state == X25_LINK_STATE_3)
+		while ((skbn = skb_dequeue(&nb->queue)) != NULL)
+			x25_send_frame(skbn, nb);
+}
+
+/*
+ *	Build a five byte Restart Request and send it to the neighbour.
+ *	Nothing is sent when no skb can be allocated.
+ */
+static void x25_transmit_restart_request(struct x25_neigh *nb)
+{
+	const int pkt_len = X25_STD_MIN_LEN + 2;
+	struct sk_buff *skb;
+	unsigned char *pkt;
+
+	skb = alloc_skb(X25_MAX_L2_LEN + pkt_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	/* Leave headroom for the layer 2 header. */
+	skb_reserve(skb, X25_MAX_L2_LEN);
+
+	pkt = skb_put(skb, pkt_len);
+
+	pkt[0] = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ;
+	pkt[1] = 0x00;
+	pkt[2] = X25_RESTART_REQUEST;
+	pkt[3] = 0x00;
+	pkt[4] = 0;
+
+	skb->sk = NULL;
+
+	x25_send_frame(skb, nb);
+}
+
+/*
+ * Build a three byte Restart Confirmation and send it to the neighbour.
+ * Nothing is sent when no skb can be allocated.
+ */
+static void x25_transmit_restart_confirmation(struct x25_neigh *nb)
+{
+	struct sk_buff *skb;
+	unsigned char *pkt;
+
+	skb = alloc_skb(X25_MAX_L2_LEN + X25_STD_MIN_LEN, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	/* Leave headroom for the layer 2 header. */
+	skb_reserve(skb, X25_MAX_L2_LEN);
+
+	pkt = skb_put(skb, X25_STD_MIN_LEN);
+
+	pkt[0] = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ;
+	pkt[1] = 0x00;
+	pkt[2] = X25_RESTART_CONFIRMATION;
+
+	skb->sk = NULL;
+
+	x25_send_frame(skb, nb);
+}
+
+/*
+ *	Send a Clear Request for @lci with the given @cause outside of the
+ *	context of a connected socket.  Nothing is sent when no skb can be
+ *	allocated.
+ */
+void x25_transmit_clear_request(struct x25_neigh *nb, unsigned int lci,
+				unsigned char cause)
+{
+	const int pkt_len = X25_STD_MIN_LEN + 2;
+	unsigned char gfi = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ;
+	struct sk_buff *skb;
+	unsigned char *pkt;
+
+	skb = alloc_skb(X25_MAX_L2_LEN + pkt_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	/* Leave headroom for the layer 2 header. */
+	skb_reserve(skb, X25_MAX_L2_LEN);
+
+	pkt = skb_put(skb, pkt_len);
+
+	/* Top four bits of the LCI share the first byte with the GFI. */
+	pkt[0] = ((lci >> 8) & 0x0F) | gfi;
+	pkt[1] = (lci >> 0) & 0xFF;
+	pkt[2] = X25_CLEAR_REQUEST;
+	pkt[3] = cause;
+	pkt[4] = 0x00;
+
+	skb->sk = NULL;
+
+	x25_send_frame(skb, nb);
+}
+
+/*
+ *	Send a frame over the link, or hold it on nb->queue until the link
+ *	is up.  A frame arriving in link state 0 also starts link
+ *	establishment.
+ */
+void x25_transmit_link(struct sk_buff *skb, struct x25_neigh *nb)
+{
+	unsigned int state = nb->state;
+
+	if (state == X25_LINK_STATE_3) {
+		x25_send_frame(skb, nb);
+	} else if (state == X25_LINK_STATE_1 || state == X25_LINK_STATE_2) {
+		skb_queue_tail(&nb->queue, skb);
+	} else if (state == X25_LINK_STATE_0) {
+		skb_queue_tail(&nb->queue, skb);
+		nb->state = X25_LINK_STATE_1;
+		x25_establish_link(nb);
+	}
+}
+
+/*
+ *	Called when the link layer has become established.  From state 0 the
+ *	link just moves to state 2; from state 1 a Restart Request is sent
+ *	and T20 started as well.  Other states are left untouched.
+ */
+void x25_link_established(struct x25_neigh *nb)
+{
+	if (nb->state == X25_LINK_STATE_0) {
+		nb->state = X25_LINK_STATE_2;
+	} else if (nb->state == X25_LINK_STATE_1) {
+		x25_transmit_restart_request(nb);
+		nb->state = X25_LINK_STATE_2;
+		x25_start_t20timer(nb);
+	}
+}
+
+/*
+ *	Called when the link layer has terminated, or an establishment
+ *	request has failed.  The link drops back to state 0 and the virtual
+ *	calls using this neighbour are cleared (X.25 03/93 4.6.3).
+ */
+void x25_link_terminated(struct x25_neigh *nb)
+{
+	nb->state = X25_LINK_STATE_0;
+
+	x25_kill_by_neigh(nb);
+}
+
+/*
+ *	Add a new device: allocate a neighbour for it, take a reference on
+ *	the device and put the neighbour on x25_neigh_list.  Does nothing
+ *	when the allocation fails.
+ */
+void x25_link_device_up(struct net_device *dev)
+{
+	struct x25_neigh *nb;
+
+	nb = kmalloc(sizeof(*nb), GFP_ATOMIC);
+	if (nb == NULL)
+		return;
+
+	skb_queue_head_init(&nb->queue);
+	setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb);
+
+	dev_hold(dev);
+	nb->dev      = dev;
+	nb->state    = X25_LINK_STATE_0;
+	nb->extended = 0;
+	nb->t20      = sysctl_x25_restart_request_timeout;
+
+	/* Enables negotiation of these facilities. */
+	nb->global_facil_mask = X25_MASK_REVERSE | X25_MASK_THROUGHPUT |
+				X25_MASK_PACKET_SIZE | X25_MASK_WINDOW_SIZE;
+
+	/* This reference belongs to the list. */
+	atomic_set(&nb->refcnt, 1);
+
+	write_lock_bh(&x25_neigh_list_lock);
+	list_add(&nb->node, &x25_neigh_list);
+	write_unlock_bh(&x25_neigh_list_lock);
+}
+
+/**
+ *	__x25_remove_neigh - remove neighbour from x25_neigh_list
+ *	@nb - neigh to remove
+ *
+ *	Purge the neighbour's queue, stop its T20 timer and, if it is on
+ *	x25_neigh_list, unlink it and drop a reference.
+ *	Caller must hold x25_neigh_list_lock.
+ */
+static void __x25_remove_neigh(struct x25_neigh *nb)
+{
+	skb_queue_purge(&nb->queue);
+	x25_stop_t20timer(nb);
+
+	/* Not on the list: nothing to unlink. */
+	if (nb->node.next == NULL)
+		return;
+
+	list_del(&nb->node);
+	x25_neigh_put(nb);
+}
+
+/*
+ *	A device has been removed: remove every neighbour that points at it
+ *	and drop the device reference each one held.
+ */
+void x25_link_device_down(struct net_device *dev)
+{
+	struct x25_neigh *nb, *next;
+
+	write_lock_bh(&x25_neigh_list_lock);
+
+	/* The _safe iterator is needed because entries are unlinked. */
+	list_for_each_entry_safe(nb, next, &x25_neigh_list, node) {
+		if (nb->dev != dev)
+			continue;
+		__x25_remove_neigh(nb);
+		dev_put(dev);
+	}
+
+	write_unlock_bh(&x25_neigh_list_lock);
+}
+
+/*
+ *	Given a device, return its neighbour with a reference held, or NULL
+ *	when the device has none.
+ */
+struct x25_neigh *x25_get_neigh(struct net_device *dev)
+{
+	struct x25_neigh *nb, *found = NULL;
+
+	read_lock_bh(&x25_neigh_list_lock);
+	list_for_each_entry(nb, &x25_neigh_list, node) {
+		if (nb->dev != dev)
+			continue;
+		/* Take the reference while the list lock is still held. */
+		x25_neigh_hold(nb);
+		found = nb;
+		break;
+	}
+	read_unlock_bh(&x25_neigh_list_lock);
+
+	return found;
+}
+
+/*
+ *	Handle the ioctls that control the subscription functions.
+ *
+ *	SIOCX25GSUBSCRIP copies the neighbour's extended flag and global
+ *	facilities mask to user space; SIOCX25SSUBSCRIP sets them, accepting
+ *	only 0 or 1 for the extended flag.
+ *
+ *	Returns 0 on success, -EFAULT when user memory cannot be accessed and
+ *	-EINVAL for an unknown command, device or neighbour, or a bad
+ *	extended value.
+ */
+int x25_subscr_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct x25_subscrip_struct x25_subscr;
+	struct x25_neigh *nb;
+	struct net_device *dev;
+	int rc;
+
+	if (cmd != SIOCX25GSUBSCRIP && cmd != SIOCX25SSUBSCRIP)
+		return -EINVAL;
+
+	if (copy_from_user(&x25_subscr, arg, sizeof(x25_subscr)))
+		return -EFAULT;
+
+	dev = x25_dev_get(x25_subscr.device);
+	if (dev == NULL)
+		return -EINVAL;
+
+	/* The device is only needed to find the neighbour. */
+	nb = x25_get_neigh(dev);
+	dev_put(dev);
+	if (nb == NULL)
+		return -EINVAL;
+
+	if (cmd == SIOCX25GSUBSCRIP) {
+		read_lock_bh(&x25_neigh_list_lock);
+		x25_subscr.extended	     = nb->extended;
+		x25_subscr.global_facil_mask = nb->global_facil_mask;
+		read_unlock_bh(&x25_neigh_list_lock);
+
+		if (copy_to_user(arg, &x25_subscr, sizeof(x25_subscr)))
+			rc = -EFAULT;
+		else
+			rc = 0;
+	} else if (x25_subscr.extended == 0 || x25_subscr.extended == 1) {
+		write_lock_bh(&x25_neigh_list_lock);
+		nb->extended	      = x25_subscr.extended;
+		nb->global_facil_mask = x25_subscr.global_facil_mask;
+		write_unlock_bh(&x25_neigh_list_lock);
+		rc = 0;
+	} else {
+		rc = -EINVAL;
+	}
+
+	x25_neigh_put(nb);
+	return rc;
+}
+
+
+/*
+ *	Release all memory associated with X.25 neighbour structures: every
+ *	neighbour is removed from the list and its device reference dropped.
+ */
+void __exit x25_link_free(void)
+{
+	struct x25_neigh *nb, *next;
+
+	write_lock_bh(&x25_neigh_list_lock);
+
+	list_for_each_entry_safe(nb, next, &x25_neigh_list, node) {
+		/* Read the device first: removal may free the neighbour. */
+		struct net_device *dev = nb->dev;
+
+		__x25_remove_neigh(nb);
+		dev_put(dev);
+	}
+
+	write_unlock_bh(&x25_neigh_list_lock);
+}
diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c
new file mode 100644
index 00000000..0144271d
--- /dev/null
+++ b/net/x25/x25_out.c
@@ -0,0 +1,231 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	Started coding.
+ *	X.25 002	Jonathan Naylor	New timer architecture.
+ *	2000-09-04	Henner Eisen	Prevented x25_output() skb leakage.
+ *	2000-10-27	Henner Eisen	MSG_DONTWAIT for fragment allocation.
+ *	2000-11-10	Henner Eisen	x25_send_iframe(): re-queued frames
+ *					needed cleaned seq-number fields.
+ */
+
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/x25.h>
+
+/* Convert a packet-size code (log2 of the size in bytes) to bytes; 0 means 128. */
+static int x25_pacsize_to_bytes(unsigned int pacsize)
+{
+	if (!pacsize)
+		return 128;
+
+	/* Saturate: the result is an int, so 2^31 and above cannot be held. */
+	if (pacsize > 30)
+		pacsize = 30;
+
+	return 1 << pacsize;
+}
+
+/*
+ *	Queue an information frame on sk->sk_write_queue, first splitting it
+ *	into M-bit chained fragments if it exceeds the outgoing packet size.
+ *	Returns the amount of user data bytes queued (a non-blocking send may
+ *	stop early) or a negative error code on failure.
+ */
+int x25_output(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *skbn;
+	unsigned char header[X25_EXT_MIN_LEN];
+	int err, frontlen, len;
+	int sent=0, noblock = X25_SKB_CB(skb)->flags & MSG_DONTWAIT;
+	struct x25_sock *x25 = x25_sk(sk);
+	int header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN :
+						    X25_STD_MIN_LEN;
+	int max_len = x25_pacsize_to_bytes(x25->facilities.pacsize_out);
+
+	if (skb->len - header_len > max_len) {
+		/* Save a copy of the header: every fragment gets its own */
+		skb_copy_from_linear_data(skb, header, header_len);
+		skb_pull(skb, header_len);
+
+		frontlen = skb_headroom(skb);
+
+		while (skb->len > 0) {
+			release_sock(sk);	/* allocate with the socket lock dropped */
+			skbn = sock_alloc_send_skb(sk, frontlen + max_len,
+						   noblock, &err);
+			lock_sock(sk);
+			if (!skbn) {
+				if (err == -EWOULDBLOCK && noblock){
+					kfree_skb(skb);
+					return sent;	/* partial send: report what was queued */
+				}
+				SOCK_DEBUG(sk, "x25_output: fragment alloc"
+					       " failed, err=%d, %d bytes "
+					       "sent\n", err, sent);
+				return err;	/* NOTE(review): skb is not freed here; presumably the caller does */
+			}
+
+			skb_reserve(skbn, frontlen);
+
+			len = max_len > skb->len ? skb->len : max_len;
+
+			/* Copy the next chunk of user data and consume it */
+			skb_copy_from_linear_data(skb, skb_put(skbn, len), len);
+			skb_pull(skb, len);
+
+			/* Put the saved header in front of the fragment */
+			skb_push(skbn, header_len);
+			skb_copy_to_linear_data(skbn, header, header_len);
+
+			if (skb->len > 0) {	/* data remains: flag this fragment with the M bit */
+				if (x25->neighbour->extended)
+					skbn->data[3] |= X25_EXT_M_BIT;
+				else
+					skbn->data[2] |= X25_STD_M_BIT;
+			}
+
+			skb_queue_tail(&sk->sk_write_queue, skbn);
+			sent += len;
+		}
+
+		kfree_skb(skb);
+	} else {
+		skb_queue_tail(&sk->sk_write_queue, skb);
+		sent = skb->len - header_len;
+	}
+	return sent;
+}
+
+/*
+ *	Stamp an I-frame with the current send and receive sequence numbers
+ *	(standard or extended layout), keeping only its M bit, and transmit it.
+ */
+static void x25_send_iframe(struct sock *sk, struct sk_buff *skb)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	unsigned char *hdr;
+
+	if (skb == NULL)
+		return;
+
+	hdr = skb->data;
+	if (!x25->neighbour->extended) {
+		hdr[2] = (hdr[2] & X25_STD_M_BIT) |
+			 ((x25->vs << 1) & 0x0E) | ((x25->vr << 5) & 0xE0);
+	} else {
+		hdr[2] = (x25->vs << 1) & 0xFE;
+		hdr[3] = (hdr[3] & X25_EXT_M_BIT) | ((x25->vr << 1) & 0xFE);
+	}
+
+	x25_transmit_link(skb, x25->neighbour);
+}
+
+void x25_kick(struct sock *sk)
+{
+	struct sk_buff *skb, *skbn;
+	unsigned short start, end;
+	int modulus;
+	struct x25_sock *x25 = x25_sk(sk);
+
+	if (x25->state != X25_STATE_3)	/* nothing is transmitted outside state 3 */
+		return;
+
+	/*
+	 *	Send one queued interrupt frame unless X25_INTERRUPT_FLAG was set.
+	 */
+	if (skb_peek(&x25->interrupt_out_queue) != NULL &&
+		!test_and_set_bit(X25_INTERRUPT_FLAG, &x25->flags)) {
+
+		skb = skb_dequeue(&x25->interrupt_out_queue);
+		x25_transmit_link(skb, x25->neighbour);
+	}
+
+	if (x25->condition & X25_COND_PEER_RX_BUSY)
+		return;
+
+	if (!skb_peek(&sk->sk_write_queue))
+		return;
+
+	modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+
+	start   = skb_peek(&x25->ack_queue) ? x25->vs : x25->va;
+	end     = (x25->va + x25->facilities.winsize_out) % modulus;
+
+	if (start == end)	/* the send window is already full */
+		return;
+
+	x25->vs = start;
+
+	/*
+	 * Transmit data until either we're out of data to send or
+	 * the window is full (vs has reached end).
+	 */
+
+	skb = skb_dequeue(&sk->sk_write_queue);
+
+	do {
+		if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+			skb_queue_head(&sk->sk_write_queue, skb);	/* no memory: put it back and stop */
+			break;
+		}
+
+		skb_set_owner_w(skbn, sk);
+
+		/*
+		 * Transmit the clone, stamped with the current vs and vr.
+		 */
+		x25_send_iframe(sk, skbn);
+
+		x25->vs = (x25->vs + 1) % modulus;
+
+		/*
+		 * Keep the original data frame on the ack queue.
+		 */
+		skb_queue_tail(&x25->ack_queue, skb);
+
+	} while (x25->vs != end &&
+		 (skb = skb_dequeue(&sk->sk_write_queue)) != NULL);
+
+	x25->vl         = x25->vr;
+	x25->condition &= ~X25_COND_ACK_PENDING;
+
+	x25_stop_timer(sk);
+}
+
+/*
+ * The following routines are taken from page 170 of the 7th ARRL Computer
+ * Networking Conference paper, as is the whole state machine.
+ */
+
+void x25_enquiry_response(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	int busy = x25->condition & X25_COND_OWN_RX_BUSY;
+
+	/* Report our receive state: RNR while we are busy, RR otherwise. */
+	x25_write_internal(sk, busy ? X25_RNR : X25_RR);
+
+	/* An RR/RNR frame carries vr, so no acknowledgement stays pending. */
+	x25->vl = x25->vr;
+	x25->condition &= ~X25_COND_ACK_PENDING;
+
+	x25_stop_timer(sk);
+}
diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c
new file mode 100644
index 00000000..7ff37379
--- /dev/null
+++ b/net/x25/x25_proc.c
@@ -0,0 +1,266 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.4 with seq_file support
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	2002/10/06	Arnaldo Carvalho de Melo  seq_file support
+ */
+
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/x25.h>
+
+#ifdef CONFIG_PROC_FS
+
+static void *x25_seq_route_start(struct seq_file *seq, loff_t *pos)
+	__acquires(x25_route_list_lock)
+{
+	read_lock_bh(&x25_route_list_lock);	/* dropped by x25_seq_route_stop() */
+	return seq_list_start_head(&x25_route_list, *pos);
+}
+
+static void *x25_seq_route_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &x25_route_list, pos);	/* step along x25_route_list */
+}
+
+static void x25_seq_route_stop(struct seq_file *seq, void *v)
+	__releases(x25_route_list_lock)
+{
+	read_unlock_bh(&x25_route_list_lock);	/* taken in x25_seq_route_start() */
+}
+
+/* Emit the header for the list-head token, else one line for the route. */
+static int x25_seq_route_show(struct seq_file *seq, void *v)
+{
+	struct x25_route *rt;
+
+	if (v == &x25_route_list) {
+		seq_puts(seq, "Address          Digits  Device\n");
+		return 0;
+	}
+	/* v is the embedded list node; recover the route that contains it. */
+	rt = list_entry(v, struct x25_route, node);
+	seq_printf(seq, "%-15s  %-6d  %-5s\n",
+		   rt->address.x25_addr, rt->sigdigits,
+		   rt->dev ? rt->dev->name : "???");
+	return 0;
+}
+
+static void *x25_seq_socket_start(struct seq_file *seq, loff_t *pos)
+	__acquires(x25_list_lock)
+{
+	read_lock_bh(&x25_list_lock);	/* dropped by x25_seq_socket_stop() */
+	return seq_hlist_start_head(&x25_list, *pos);
+}
+
+static void *x25_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_hlist_next(v, &x25_list, pos);	/* step along x25_list */
+}
+
+static void x25_seq_socket_stop(struct seq_file *seq, void *v)
+	__releases(x25_list_lock)
+{
+	read_unlock_bh(&x25_list_lock);	/* taken in x25_seq_socket_start() */
+}
+
+/* Emit the column header for SEQ_START_TOKEN, else one line for the socket. */
+static int x25_seq_socket_show(struct seq_file *seq, void *v)
+{
+	struct sock *s;
+	struct x25_sock *x25;
+	struct net_device *dev;
+	const char *devname;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "dest_addr  src_addr   dev   lci st vs vr "
+				"va   t  t2 t21 t22 t23 Snd-Q Rcv-Q inode\n");
+		return 0;
+	}
+
+	s = sk_entry(v);
+	x25 = x25_sk(s);
+	/* Use the pointer just checked rather than reading ->dev again. */
+	if (!x25->neighbour || (dev = x25->neighbour->dev) == NULL)
+		devname = "???";
+	else
+		devname = dev->name;
+
+	seq_printf(seq, "%-10s %-10s %-5s %3.3X  %d  %d  %d  %d %3lu %3lu "
+			"%3lu %3lu %3lu %5d %5d %lu\n",
+		   !x25->dest_addr.x25_addr[0] ? "*" : x25->dest_addr.x25_addr,
+		   !x25->source_addr.x25_addr[0] ? "*" : x25->source_addr.x25_addr,
+		   devname, x25->lci & 0x0FFF, x25->state, x25->vs, x25->vr,
+		   x25->va, x25_display_timer(s) / HZ, x25->t2  / HZ,
+		   x25->t21 / HZ, x25->t22 / HZ, x25->t23 / HZ,
+		   sk_wmem_alloc_get(s),
+		   sk_rmem_alloc_get(s),
+		   s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0UL);
+	return 0;
+}
+
+static void *x25_seq_forward_start(struct seq_file *seq, loff_t *pos)
+	__acquires(x25_forward_list_lock)
+{
+	read_lock_bh(&x25_forward_list_lock);	/* dropped by x25_seq_forward_stop() */
+	return seq_list_start_head(&x25_forward_list, *pos);
+}
+
+static void *x25_seq_forward_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &x25_forward_list, pos);	/* step along x25_forward_list */
+}
+
+static void x25_seq_forward_stop(struct seq_file *seq, void *v)
+	__releases(x25_forward_list_lock)
+{
+	read_unlock_bh(&x25_forward_list_lock);	/* taken in x25_seq_forward_start() */
+}
+
+/* Emit the header for the list-head token, else one line per forwarding. */
+static int x25_seq_forward_show(struct seq_file *seq, void *v)
+{
+	struct x25_forward *f;
+
+	if (v == &x25_forward_list) {
+		seq_printf(seq, "lci dev1       dev2\n");
+		return 0;
+	}
+
+	/* v is the embedded list node; recover the entry that contains it. */
+	f = list_entry(v, struct x25_forward, node);
+	seq_printf(seq, "%d %-10s %-10s\n",
+			f->lci, f->dev1->name, f->dev2->name);
+	return 0;
+}
+
+static const struct seq_operations x25_seq_route_ops = {	/* iterator bound by x25_seq_route_open() */
+	.start  = x25_seq_route_start,
+	.next   = x25_seq_route_next,
+	.stop   = x25_seq_route_stop,
+	.show   = x25_seq_route_show,
+};
+
+static const struct seq_operations x25_seq_socket_ops = {	/* iterator bound by x25_seq_socket_open() */
+	.start  = x25_seq_socket_start,
+	.next   = x25_seq_socket_next,
+	.stop   = x25_seq_socket_stop,
+	.show   = x25_seq_socket_show,
+};
+
+static const struct seq_operations x25_seq_forward_ops = {	/* iterator bound by x25_seq_forward_open() */
+	.start  = x25_seq_forward_start,
+	.next   = x25_seq_forward_next,
+	.stop   = x25_seq_forward_stop,
+	.show   = x25_seq_forward_show,
+};
+
+static int x25_seq_socket_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &x25_seq_socket_ops);	/* attach the socket iterator to this open file */
+}
+
+static int x25_seq_route_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &x25_seq_route_ops);	/* attach the route iterator to this open file */
+}
+
+static int x25_seq_forward_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &x25_seq_forward_ops);	/* attach the forward iterator to this open file */
+}
+
+static const struct file_operations x25_seq_socket_fops = {	/* registered for the "socket" proc entry */
+	.owner		= THIS_MODULE,
+	.open		= x25_seq_socket_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static const struct file_operations x25_seq_route_fops = {	/* registered for the "route" proc entry */
+	.owner		= THIS_MODULE,
+	.open		= x25_seq_route_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static const struct file_operations x25_seq_forward_fops = {	/* registered for the "forward" proc entry */
+	.owner		= THIS_MODULE,
+	.open		= x25_seq_forward_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct proc_dir_entry *x25_proc_dir;
+
+/*
+ *	Create the "x25" proc directory with its "route", "socket" and "forward"
+ *	entries; on failure undo what was created and return -ENOMEM.
+ */
+int __init x25_proc_init(void)
+{
+	x25_proc_dir = proc_mkdir("x25", init_net.proc_net);
+	if (!x25_proc_dir)
+		goto err;
+
+	if (!proc_create("route", S_IRUGO, x25_proc_dir, &x25_seq_route_fops))
+		goto err_dir;
+
+	if (!proc_create("socket", S_IRUGO, x25_proc_dir,
+			 &x25_seq_socket_fops))
+		goto err_route;
+
+	if (!proc_create("forward", S_IRUGO, x25_proc_dir,
+			 &x25_seq_forward_fops))
+		goto err_socket;
+
+	return 0;
+
+	/* Unwind in the reverse order of creation. */
+err_socket:
+	remove_proc_entry("socket", x25_proc_dir);
+err_route:
+	remove_proc_entry("route", x25_proc_dir);
+err_dir:
+	remove_proc_entry("x25", init_net.proc_net);
+err:
+	return -ENOMEM;
+}
+
+void __exit x25_proc_exit(void)
+{
+	remove_proc_entry("forward", x25_proc_dir);	/* the three entries made by x25_proc_init() */
+	remove_proc_entry("route", x25_proc_dir);
+	remove_proc_entry("socket", x25_proc_dir);
+	remove_proc_entry("x25", init_net.proc_net);	/* the directory itself goes last */
+}
+
+#else /* CONFIG_PROC_FS */
+
+int __init x25_proc_init(void)
+{
+	return 0;	/* built without CONFIG_PROC_FS: nothing to create */
+}
+
+void __exit x25_proc_exit(void)
+{	/* built without CONFIG_PROC_FS: nothing to remove */
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c
new file mode 100644
index 00000000..97d77c53
--- /dev/null
+++ b/net/x25/x25_route.c
@@ -0,0 +1,226 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	Started coding.
+ */
+
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <net/x25.h>
+
+LIST_HEAD(x25_route_list);		/* every struct x25_route, linked through ->node */
+DEFINE_RWLOCK(x25_route_list_lock);	/* taken around every walk or change of the list */
+
+/*
+ *	Add a route for the first sigdigits digits of address via dev. Returns
+ *	0, -EINVAL if that prefix is already routed, or -ENOMEM.
+ */
+static int x25_add_route(struct x25_address *address, unsigned int sigdigits,
+			 struct net_device *dev)
+{
+	struct x25_route *rt;
+	int rc;
+
+	write_lock_bh(&x25_route_list_lock);
+
+	/* Refuse a second route for the same prefix, whatever its device. */
+	rc = -EINVAL;
+	list_for_each_entry(rt, &x25_route_list, node) {
+		if (rt->sigdigits == sigdigits &&
+		    !memcmp(&rt->address, address, sigdigits))
+			goto out;
+	}
+
+	rc = -ENOMEM;
+	rt = kmalloc(sizeof(*rt), GFP_ATOMIC);
+	if (rt == NULL)
+		goto out;
+
+	/* Store the prefix over a template of fifteen '0' digits. */
+	strcpy(rt->address.x25_addr, "000000000000000");
+	memcpy(rt->address.x25_addr, address->x25_addr, sigdigits);
+
+	rt->dev       = dev;
+	rt->sigdigits = sigdigits;
+	atomic_set(&rt->refcnt, 1);
+	list_add(&rt->node, &x25_route_list);
+	rc = 0;
+out:
+	write_unlock_bh(&x25_route_list_lock);
+	return rc;
+}
+
+/**
+ * __x25_remove_route - remove route from x25_route_list
+ * @rt: route to remove
+ *
+ * Unlink @rt from x25_route_list, if it was there, and x25_route_put() it.
+ * Caller must hold x25_route_list_lock.
+ */
+static void __x25_remove_route(struct x25_route *rt)
+{
+	if (rt->node.next) {
+		list_del(&rt->node);
+		x25_route_put(rt);
+	}
+}
+
+/* Remove the route matching address, sigdigits and dev; -EINVAL if none. */
+static int x25_del_route(struct x25_address *address, unsigned int sigdigits,
+			 struct net_device *dev)
+{
+	struct x25_route *rt;
+	int rc = -EINVAL;
+
+	write_lock_bh(&x25_route_list_lock);
+
+	list_for_each_entry(rt, &x25_route_list, node) {
+		if (rt->dev != dev || rt->sigdigits != sigdigits ||
+		    memcmp(&rt->address, address, sigdigits))
+			continue;
+
+		/* Leaving the loop at once keeps the plain iterator safe. */
+		__x25_remove_route(rt);
+		rc = 0;
+		break;
+	}
+
+	write_unlock_bh(&x25_route_list_lock);
+	return rc;
+}
+
+/*
+ *	A device has been removed: drop its routes and any forwarding using it.
+ */
+void x25_route_device_down(struct net_device *dev)
+{
+	struct x25_route *rt, *next;
+
+	write_lock_bh(&x25_route_list_lock);
+
+	/* Entries are unlinked during the walk, hence the _safe iterator. */
+	list_for_each_entry_safe(rt, next, &x25_route_list, node) {
+		if (rt->dev != dev)
+			continue;
+
+		__x25_remove_route(rt);
+	}
+
+	write_unlock_bh(&x25_route_list_lock);
+
+	x25_clear_forward_by_dev(dev);
+}
+
+/*
+ *	Return the named device, with a reference held, if it is a valid X.25 interface that is "up"; NULL otherwise.
+ */
+struct net_device *x25_dev_get(char *devname)
+{
+	struct net_device *dev = dev_get_by_name(&init_net, devname);
+
+	if (dev &&
+	    (!(dev->flags & IFF_UP) || (dev->type != ARPHRD_X25
+#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE)
+					&& dev->type != ARPHRD_ETHER	/* with LLC, Ethernet is accepted too */
+#endif
+					))){
+		dev_put(dev);	/* unsuitable: give the lookup's reference back */
+		dev = NULL;
+	}
+
+	return dev;
+}
+
+/**
+ *	x25_get_route - find the most specific route for an X.25 address
+ *	@addr: address to find a route for
+ *
+ *	Of the routes whose significant digits all match @addr, returns the
+ *	one with the most significant digits, after taking a reference on it
+ *	with x25_route_hold(). Returns NULL if no route matches.
+ */
+struct x25_route *x25_get_route(struct x25_address *addr)
+{
+	struct x25_route *rt, *best = NULL;
+
+	read_lock_bh(&x25_route_list_lock);
+
+	list_for_each_entry(rt, &x25_route_list, node) {
+		/* Only the route's own prefix length is compared. */
+		if (memcmp(&rt->address, addr, rt->sigdigits))
+			continue;
+
+		/* On a tie the entry met first in the list is kept. */
+		if (best == NULL || rt->sigdigits > best->sigdigits)
+			best = rt;
+	}
+
+	if (best != NULL)
+		x25_route_hold(best);
+
+	read_unlock_bh(&x25_route_list_lock);
+	return best;
+}
+
+/*
+ *	Handle the ioctls that add (SIOCADDRT) and delete (SIOCDELRT) routes.
+ *	Returns 0 or a negative error code.
+ */
+int x25_route_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct x25_route_struct rt;
+	struct net_device *dev;
+	int rc;
+
+	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+		return -EINVAL;
+
+	if (copy_from_user(&rt, arg, sizeof(rt)))
+		return -EFAULT;
+
+	/* The route helpers copy and compare sigdigits bytes of the address. */
+	if (rt.sigdigits > 15)
+		return -EINVAL;
+
+	dev = x25_dev_get(rt.device);
+	if (dev == NULL)
+		return -EINVAL;
+
+	if (cmd != SIOCADDRT)
+		rc = x25_del_route(&rt.address, rt.sigdigits, dev);
+	else
+		rc = x25_add_route(&rt.address, rt.sigdigits, dev);
+
+	dev_put(dev);
+	return rc;
+}
+
+/*
+ *	Release all memory associated with X.25 routing structures.
+ */
+void __exit x25_route_free(void)
+{
+	struct x25_route *rt, *next;
+
+	write_lock_bh(&x25_route_list_lock);
+
+	/* Every entry is unlinked, so walk with the _safe iterator. */
+	list_for_each_entry_safe(rt, next, &x25_route_list, node)
+		__x25_remove_route(rt);
+
+	write_unlock_bh(&x25_route_list_lock);
+}
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
new file mode 100644
index 00000000..dc20cf12
--- /dev/null
+++ b/net/x25/x25_subr.c
@@ -0,0 +1,378 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	  Started coding.
+ *	X.25 002	Jonathan Naylor	  Centralised disconnection processing.
+ *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
+ *					  negotiation.
+ *	jun/24/01	Arnaldo C. Melo	  use skb_queue_purge, cleanups
+ *	apr/04/15	Shaun Pereira		Fast select with no
+ *						restriction on response.
+ */
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/x25.h>
+
+/*
+ *	Purge the socket's write queue and its four X.25 private frame queues.
+ */
+void x25_clear_queues(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	skb_queue_purge(&sk->sk_write_queue);
+	skb_queue_purge(&x25->ack_queue);
+	skb_queue_purge(&x25->interrupt_in_queue);
+	skb_queue_purge(&x25->interrupt_out_queue);
+	skb_queue_purge(&x25->fragment_queue);
+}
+
+
+/*
+ * Free the frames on the ack queue that the peer has acknowledged, advancing
+ * V(a) by one for each of them. This replaces the boxes labelled
+ * "V(a) <- N(r)" on the SDL diagram.
+ */
+void x25_frames_acked(struct sock *sk, unsigned short nr)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+
+	/* Stop when V(a) reaches nr or, failing that, when the queue runs dry. */
+	while (x25->va != nr) {
+		struct sk_buff *skb = skb_dequeue(&x25->ack_queue);
+
+		if (skb == NULL)
+			break;
+
+		kfree_skb(skb);
+		x25->va = (x25->va + 1) % modulus;
+	}
+}
+
+void x25_requeue_frames(struct sock *sk)
+{
+	struct sk_buff *skb, *skb_prev = NULL;
+
+	/*
+	 * Move all the un-ack-ed frames, in their original order, to the front
+	 * of the output queue to be picked up by x25_kick. The first one goes to
+	 * the queue head (the queue may be empty); the others follow it.
+	 */
+	while ((skb = skb_dequeue(&x25_sk(sk)->ack_queue)) != NULL) {
+		if (!skb_prev)
+			skb_queue_head(&sk->sk_write_queue, skb);
+		else
+			skb_append(skb_prev, skb, &sk->sk_write_queue);
+		skb_prev = skb;
+	}
+}
+
+/*
+ *	Validate that the value of nr lies in the window from va to vs, both
+ *	ends included. Returns 1 if it does and 0 if not.
+ */
+int x25_validate_nr(struct sock *sk, unsigned short nr)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS;
+	unsigned short seq;
+
+	/* Step from va up to, but not including, vs in sequence space. */
+	for (seq = x25->va; seq != x25->vs; seq = (seq + 1) % modulus) {
+		if (seq == nr)
+			return 1;
+	}
+
+	return nr == x25->vs;
+}
+
+/*
+ *  Build the control frame given by frametype from the socket's state and send
+ *  it on the link; an unknown type or a failed allocation sends nothing.
+ */
+void x25_write_internal(struct sock *sk, int frametype)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	struct sk_buff *skb;
+	unsigned char  *dptr;
+	unsigned char  facilities[X25_MAX_FAC_LEN];
+	unsigned char  addresses[1 + X25_ADDR_LEN];
+	unsigned char  lci1, lci2;
+	/*
+	 *	Default safe frame size.
+	 */
+	int len = X25_MAX_L2_LEN + X25_EXT_MIN_LEN;
+
+	/*
+	 *	Adjust frame size for the largest body this frame type can have.
+	 */
+	switch (frametype) {
+		case X25_CALL_REQUEST:
+			len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
+			       X25_MAX_CUD_LEN;
+			break;
+		case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
+			if(x25->facilities.reverse & 0x80) {
+				len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+			} else {
+				len += 1 + X25_MAX_FAC_LEN;
+			}
+			break;
+		case X25_CLEAR_REQUEST:
+		case X25_RESET_REQUEST:
+			len += 2;
+			break;
+		case X25_RR:
+		case X25_RNR:
+		case X25_REJ:
+		case X25_CLEAR_CONFIRMATION:
+		case X25_INTERRUPT_CONFIRMATION:
+		case X25_RESET_CONFIRMATION:
+			break;
+		default:
+			printk(KERN_ERR "X.25: invalid frame type %02X\n",
+			       frametype);
+			return;
+	}
+
+	if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL)
+		return;
+
+	/*
+	 *	Space for Ethernet and 802.2 LLC headers.
+	 */
+	skb_reserve(skb, X25_MAX_L2_LEN);
+
+	/*
+	 *	Make space for the GFI and LCI, and fill them in.
+	 */
+	dptr = skb_put(skb, 2);
+
+	lci1 = (x25->lci >> 8) & 0x0F;
+	lci2 = (x25->lci >> 0) & 0xFF;
+
+	if (x25->neighbour->extended) {
+		*dptr++ = lci1 | X25_GFI_EXTSEQ;
+		*dptr++ = lci2;
+	} else {
+		*dptr++ = lci1 | X25_GFI_STDSEQ;
+		*dptr++ = lci2;
+	}
+
+	/*
+	 *	Now fill in the frame type specific information.
+	 */
+	switch (frametype) {
+
+		case X25_CALL_REQUEST:
+			dptr    = skb_put(skb, 1);
+			*dptr++ = X25_CALL_REQUEST;
+			len     = x25_addr_aton(addresses, &x25->dest_addr,
+						&x25->source_addr);
+			dptr    = skb_put(skb, len);
+			memcpy(dptr, addresses, len);
+			len     = x25_create_facilities(facilities,
+					&x25->facilities,
+					&x25->dte_facilities,
+					x25->neighbour->global_facil_mask);
+			dptr    = skb_put(skb, len);
+			memcpy(dptr, facilities, len);
+			dptr = skb_put(skb, x25->calluserdata.cudlength);
+			memcpy(dptr, x25->calluserdata.cuddata,
+			       x25->calluserdata.cudlength);
+			x25->calluserdata.cudlength = 0;
+			break;
+
+		case X25_CALL_ACCEPTED:
+			dptr    = skb_put(skb, 2);
+			*dptr++ = X25_CALL_ACCEPTED;
+			*dptr++ = 0x00;		/* Address lengths */
+			len     = x25_create_facilities(facilities,
+							&x25->facilities,
+							&x25->dte_facilities,
+							x25->vc_facil_mask);
+			dptr    = skb_put(skb, len);
+			memcpy(dptr, facilities, len);
+
+			/* fast select with no restriction on response
+				allows call user data. Userland must
+				ensure it is ours and not theirs */
+			if(x25->facilities.reverse & 0x80) {
+				dptr = skb_put(skb,
+					x25->calluserdata.cudlength);
+				memcpy(dptr, x25->calluserdata.cuddata,
+				       x25->calluserdata.cudlength);
+			}
+			x25->calluserdata.cudlength = 0;
+			break;
+
+		case X25_CLEAR_REQUEST:
+			dptr    = skb_put(skb, 3);
+			*dptr++ = frametype;
+			*dptr++ = x25->causediag.cause;
+			*dptr++ = x25->causediag.diagnostic;
+			break;
+
+		case X25_RESET_REQUEST:
+			dptr    = skb_put(skb, 3);
+			*dptr++ = frametype;
+			*dptr++ = 0x00;		/* XXX */
+			*dptr++ = 0x00;		/* XXX */
+			break;
+
+		case X25_RR:
+		case X25_RNR:
+		case X25_REJ:
+			if (x25->neighbour->extended) {
+				dptr     = skb_put(skb, 2);
+				*dptr++  = frametype;
+				*dptr++  = (x25->vr << 1) & 0xFE;
+			} else {
+				dptr     = skb_put(skb, 1);
+				*dptr    = frametype;
+				*dptr++ |= (x25->vr << 5) & 0xE0;	/* vr shares the type octet */
+			}
+			break;
+
+		case X25_CLEAR_CONFIRMATION:
+		case X25_INTERRUPT_CONFIRMATION:
+		case X25_RESET_CONFIRMATION:
+			dptr  = skb_put(skb, 1);
+			*dptr = frametype;
+			break;
+	}
+
+	x25_transmit_link(skb, x25->neighbour);
+}
+
+/*
+ *	Unpick the passed X.25 PLP frame; X25_ILLEGAL if unknown or truncated.
+ */
+int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q,
+	       int *d, int *m)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+	unsigned char *frame;
+
+	*ns = *nr = *q = *d = *m = 0;	/* fields a frame type lacks stay 0 */
+	if (!pskb_may_pull(skb, X25_STD_MIN_LEN))	/* need frame[2] */
+		return X25_ILLEGAL;
+	frame = skb->data;
+
+	switch (frame[2]) {
+		case X25_CALL_REQUEST:
+		case X25_CALL_ACCEPTED:
+		case X25_CLEAR_REQUEST:
+		case X25_CLEAR_CONFIRMATION:
+		case X25_INTERRUPT:
+		case X25_INTERRUPT_CONFIRMATION:
+		case X25_RESET_REQUEST:
+		case X25_RESET_CONFIRMATION:
+		case X25_RESTART_REQUEST:
+		case X25_RESTART_CONFIRMATION:
+		case X25_REGISTRATION_REQUEST:
+		case X25_REGISTRATION_CONFIRMATION:
+		case X25_DIAGNOSTIC:
+			return frame[2];
+	}
+
+	if (x25->neighbour->extended) {
+		if (!pskb_may_pull(skb, X25_EXT_MIN_LEN))	/* need frame[3] */
+			return X25_ILLEGAL;
+		frame = skb->data;	/* the pull may have moved the data */
+		if (frame[2] == X25_RR || frame[2] == X25_RNR ||
+		    frame[2] == X25_REJ) {
+			*nr = (frame[3] >> 1) & 0x7F;
+			return frame[2];
+		}
+		if ((frame[2] & 0x01) == X25_DATA) {
+			*q  = (frame[0] & X25_Q_BIT) == X25_Q_BIT;
+			*d  = (frame[0] & X25_D_BIT) == X25_D_BIT;
+			*m  = (frame[3] & X25_EXT_M_BIT) == X25_EXT_M_BIT;
+			*nr = (frame[3] >> 1) & 0x7F;
+			*ns = (frame[2] >> 1) & 0x7F;
+			return X25_DATA;
+		}
+	} else {
+		if ((frame[2] & 0x1F) == X25_RR  ||
+		    (frame[2] & 0x1F) == X25_RNR ||
+		    (frame[2] & 0x1F) == X25_REJ) {
+			*nr = (frame[2] >> 5) & 0x07;
+			return frame[2] & 0x1F;
+		}
+		if ((frame[2] & 0x01) == X25_DATA) {
+			*q  = (frame[0] & X25_Q_BIT) == X25_Q_BIT;
+			*d  = (frame[0] & X25_D_BIT) == X25_D_BIT;
+			*m  = (frame[2] & X25_STD_M_BIT) == X25_STD_M_BIT;
+			*nr = (frame[2] >> 5) & 0x07;
+			*ns = (frame[2] >> 1) & 0x07;
+			return X25_DATA;
+		}
+	}
+
+	printk(KERN_DEBUG "X.25: invalid PLP frame %02X %02X %02X\n",
+	       frame[0], frame[1], frame[2]);
+	return X25_ILLEGAL;
+}
+
+void x25_disconnect(struct sock *sk, int reason, unsigned char cause,
+		    unsigned char diagnostic)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	x25_clear_queues(sk);	/* nothing queued survives the disconnect */
+	x25_stop_timer(sk);
+
+	x25->lci   = 0;
+	x25->state = X25_STATE_0;
+
+	x25->causediag.cause      = cause;	/* recorded for later clear requests */
+	x25->causediag.diagnostic = diagnostic;
+
+	sk->sk_state     = TCP_CLOSE;
+	sk->sk_err       = reason;	/* errno-style value, e.g. ETIMEDOUT from the T23 expiry */
+	sk->sk_shutdown |= SEND_SHUTDOWN;
+
+	if (!sock_flag(sk, SOCK_DEAD)) {	/* announce the change once, then mark the socket dead */
+		sk->sk_state_change(sk);
+		sock_set_flag(sk, SOCK_DEAD);
+	}
+}
+
+/*
+ * Clear an own-rx-busy condition and tell the peer about this, provided
+ * that less than half of the receive buffer (sk_rcvbuf) is in use.
+ */
+void x25_check_rbuf(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	if (!(x25->condition & X25_COND_OWN_RX_BUSY) ||
+	    atomic_read(&sk->sk_rmem_alloc) >= (sk->sk_rcvbuf >> 1))
+		return;
+
+	x25->condition &= ~(X25_COND_OWN_RX_BUSY | X25_COND_ACK_PENDING);
+	x25->vl = x25->vr;
+	x25_write_internal(sk, X25_RR);
+	x25_stop_timer(sk);
+}
+
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
new file mode 100644
index 00000000..5c5db1a3
--- /dev/null
+++ b/net/x25/x25_timer.c
@@ -0,0 +1,174 @@
+/*
+ *	X.25 Packet Layer release 002
+ *
+ *	This is ALPHA test software. This code may break your machine,
+ *	randomly fail to work with new releases, misbehave and/or generally
+ *	screw up. It might even work.
+ *
+ *	This code REQUIRES 2.1.15 or higher
+ *
+ *	This module:
+ *		This module is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	History
+ *	X.25 001	Jonathan Naylor	Started coding.
+ *	X.25 002	Jonathan Naylor	New timer architecture.
+ *					Centralised disconnection processing.
+ */
+
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/x25.h>
+
+static void x25_heartbeat_expiry(unsigned long);
+static void x25_timer_expiry(unsigned long);
+
+void x25_init_timers(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk);
+
+	/* sk_timer was initialized by sock_init_data; only aim it at the heartbeat */
+	sk->sk_timer.data     = (unsigned long)sk;
+	sk->sk_timer.function = &x25_heartbeat_expiry;
+}
+
+void x25_start_heartbeat(struct sock *sk)
+{
+	mod_timer(&sk->sk_timer, jiffies + 5 * HZ);	/* next beat in five seconds */
+}
+
+void x25_stop_heartbeat(struct sock *sk)
+{
+	del_timer(&sk->sk_timer);	/* cancel the pending heartbeat, if any */
+}
+
+void x25_start_t2timer(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	mod_timer(&x25->timer, jiffies + x25->t2);	/* (re)arm the one protocol timer with the T2 interval */
+}
+
+void x25_start_t21timer(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	mod_timer(&x25->timer, jiffies + x25->t21);	/* (re)arm the one protocol timer with the T21 interval */
+}
+
+void x25_start_t22timer(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	mod_timer(&x25->timer, jiffies + x25->t22);	/* (re)arm the one protocol timer with the T22 interval */
+}
+
+void x25_start_t23timer(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	mod_timer(&x25->timer, jiffies + x25->t23);	/* (re)arm the one protocol timer with the T23 interval */
+}
+
+void x25_stop_timer(struct sock *sk)
+{
+	del_timer(&x25_sk(sk)->timer);	/* cancel whichever of T2/T21/T22/T23 is pending */
+}
+
+unsigned long x25_display_timer(struct sock *sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	if (!timer_pending(&x25->timer))
+		return 0;	/* protocol timer not running */
+
+	return x25->timer.expires - jiffies;	/* jiffies left until it expires */
+}
+
+static void x25_heartbeat_expiry(unsigned long param)
+{
+	struct sock *sk = (struct sock *)param;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */
+		goto restart_heartbeat;	/* socket busy: skip the checks this beat */
+
+	switch (x25_sk(sk)->state) {
+
+		case X25_STATE_0:
+			/*
+			 * Magic here: If we listen() and a new link dies
+			 * before it is accepted() it isn't 'dead' so doesn't
+			 * get removed.
+			 */
+			if (sock_flag(sk, SOCK_DESTROY) ||
+			    (sk->sk_state == TCP_LISTEN &&
+			     sock_flag(sk, SOCK_DEAD))) {
+				bh_unlock_sock(sk);
+				x25_destroy_socket_from_timer(sk);
+				return;	/* the heartbeat is not re-armed on this path */
+			}
+			break;
+
+		case X25_STATE_3:
+			/*
+			 * Leave own-rx-busy if the receive buffer has drained.
+			 */
+			x25_check_rbuf(sk);
+			break;
+	}
+restart_heartbeat:
+	x25_start_heartbeat(sk);
+	bh_unlock_sock(sk);
+}
+
+/*
+ *	The shared protocol timer has expired; it may have been T2, T21, T22
+ *	or T23. We can tell which by the state machine state.
+ */
+static inline void x25_do_timer_expiry(struct sock * sk)
+{
+	struct x25_sock *x25 = x25_sk(sk);
+
+	switch (x25->state) {
+
+		case X25_STATE_3:	/* T2: send the acknowledgement that was held back */
+			if (x25->condition & X25_COND_ACK_PENDING) {
+				x25->condition &= ~X25_COND_ACK_PENDING;
+				x25_enquiry_response(sk);
+			}
+			break;
+
+		case X25_STATE_1:	/* T21 */
+		case X25_STATE_4:	/* T22: in both cases clear the call and wait under T23 */
+			x25_write_internal(sk, X25_CLEAR_REQUEST);
+			x25->state = X25_STATE_2;
+			x25_start_t23timer(sk);
+			break;
+
+		case X25_STATE_2:	/* T23: the clear went unanswered, disconnect */
+			x25_disconnect(sk, ETIMEDOUT, 0, 0);
+			break;
+	}
+}
+
+static void x25_timer_expiry(unsigned long param)
+{
+	struct sock *sk = (struct sock *)param;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */
+		if (x25_sk(sk)->state == X25_STATE_3)
+			x25_start_t2timer(sk);	/* socket busy: try again after another T2 interval */
+	} else
+		x25_do_timer_expiry(sk);
+	bh_unlock_sock(sk);
+}
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
new file mode 100644
index 00000000..6d081674
--- /dev/null
+++ b/net/xfrm/Kconfig
@@ -0,0 +1,80 @@
+#
+# XFRM configuration
+#
+config XFRM
+       bool
+       select CRYPTO
+       depends on NET
+
+config XFRM_USER
+	tristate "Transformation user configuration interface"
+	depends on INET && XFRM
+	---help---
+	  Support for the Transformation (XFRM) user configuration interface,
+	  as used by native Linux tools for features such as IPsec.
+
+	  If unsure, say Y.
+
+config XFRM_SUB_POLICY
+	bool "Transformation sub policy support (EXPERIMENTAL)"
+	depends on XFRM && EXPERIMENTAL
+	---help---
+	  Support sub policies, for developers. By using a sub policy with a
+	  main one, two policies can be applied to the same packet at once.
+	  The policy which lives in the kernel for less time should be the sub.
+
+	  If unsure, say N.
+
+config XFRM_MIGRATE
+	bool "Transformation migrate database (EXPERIMENTAL)"
+	depends on XFRM && EXPERIMENTAL
+	---help---
+	  A feature to update locator(s) of a given IPsec security
+	  association dynamically.  This feature is required, for
+	  instance, in a Mobile IPv6 environment with IPsec configuration
+	  where mobile nodes change their attachment point to the Internet.
+
+	  If unsure, say N.
+
+config XFRM_STATISTICS
+	bool "Transformation statistics (EXPERIMENTAL)"
+	depends on INET && XFRM && PROC_FS && EXPERIMENTAL
+	---help---
+	  These statistics are not part of any SNMP/MIB specification; they
+	  report transformation errors (and near-errors) encountered during
+	  packet processing, for use by developers.
+
+	  If unsure, say N.
+
+config XFRM_IPCOMP
+	tristate
+	select XFRM
+	select CRYPTO
+	select CRYPTO_DEFLATE
+
+config NET_KEY
+	tristate "PF_KEY sockets"
+	select XFRM
+	---help---
+	  PF_KEYv2 socket family, compatible with the KAME one.
+	  It is required if you are going to use IPsec tools ported
+	  from KAME.
+
+	  Say Y unless you know what you are doing.
+
+config NET_KEY_MIGRATE
+	bool "PF_KEY MIGRATE (EXPERIMENTAL)"
+	depends on NET_KEY && EXPERIMENTAL
+	select XFRM_MIGRATE
+	---help---
+	  Add a PF_KEY MIGRATE message to PF_KEYv2 socket family.
+	  The PF_KEY MIGRATE message is used to dynamically update
+	  locator(s) of a given IPsec security association.
+	  This feature is required, for instance, in a Mobile IPv6
+	  environment with IPsec configuration where mobile nodes
+	  change their attachment point to the Internet.  Detailed
+	  information can be found in the internet-draft
+	  <draft-sugimoto-mip6-pfkey-migrate>.
+
+	  If unsure, say N.
+
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
new file mode 100644
index 00000000..aa429eef
--- /dev/null
+++ b/net/xfrm/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the XFRM subsystem.
+#
+
+obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
+		      xfrm_input.o xfrm_output.o xfrm_algo.o \
+		      xfrm_sysctl.o xfrm_replay.o
+obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
+obj-$(CONFIG_XFRM_USER) += xfrm_user.o
+obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
new file mode 100644
index 00000000..791ab2e7
--- /dev/null
+++ b/net/xfrm/xfrm_algo.c
@@ -0,0 +1,754 @@
+/*
+ * xfrm algorithm interface
+ *
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/pfkeyv2.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <net/xfrm.h>
+#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE)
+#include <net/ah.h>
+#endif
+#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
+#include <net/esp.h>
+#endif
+
+/*
+ * Algorithms supported by IPsec.  These entries contain properties which
+ * are used in key negotiation and xfrm processing, and are used to verify
+ * that instantiated crypto transforms have correct parameters for IPsec
+ * purposes.
+ */
+static struct xfrm_algo_desc aead_list[] = {	/* searched through xfrm_aead_list */
+{	/* rfc4106 GCM with a 64-bit ICV */
+	.name = "rfc4106(gcm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 64,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AES_GCM_ICV8,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* rfc4106 GCM with a 96-bit ICV */
+	.name = "rfc4106(gcm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 96,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AES_GCM_ICV12,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* rfc4106 GCM with a 128-bit ICV */
+	.name = "rfc4106(gcm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AES_GCM_ICV16,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* rfc4309 CCM with a 64-bit ICV */
+	.name = "rfc4309(ccm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 64,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AES_CCM_ICV8,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* rfc4309 CCM with a 96-bit ICV */
+	.name = "rfc4309(ccm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 96,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AES_CCM_ICV12,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* rfc4309 CCM with a 128-bit ICV */
+	.name = "rfc4309(ccm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AES_CCM_ICV16,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* rfc4543 GMAC, 128-bit ICV; the only entry under this name */
+	.name = "rfc4543(gcm(aes))",
+
+	.uinfo = {
+		.aead = {
+			.icv_truncbits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+};
+
+static struct xfrm_algo_desc aalg_list[] = {	/* searched through xfrm_aalg_list */
+{	/* null digest: every size field is zero */
+	.name = "digest_null",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 0,
+			.icv_fullbits = 0,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_NULL,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 0,
+		.sadb_alg_maxbits = 0
+	}
+},
+{
+	.name = "hmac(md5)",
+	.compat = "md5",	/* alternate name accepted by xfrm_alg_name_match() */
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_AALG_MD5HMAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 128
+	}
+},
+{
+	.name = "hmac(sha1)",
+	.compat = "sha1",	/* alternate name accepted by xfrm_alg_name_match() */
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 160,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_AALG_SHA1HMAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 160,
+		.sadb_alg_maxbits = 160
+	}
+},
+{
+	.name = "hmac(sha256)",
+	.compat = "sha256",	/* alternate name accepted by xfrm_alg_name_match() */
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 256,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_SHA2_256HMAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 256,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* no compat name: matched by .name only */
+	.name = "hmac(sha384)",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 192,
+			.icv_fullbits = 384,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_SHA2_384HMAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 384,
+		.sadb_alg_maxbits = 384
+	}
+},
+{	/* no compat name: matched by .name only */
+	.name = "hmac(sha512)",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 256,
+			.icv_fullbits = 512,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_SHA2_512HMAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 512,
+		.sadb_alg_maxbits = 512
+	}
+},
+{
+	.name = "hmac(rmd160)",
+	.compat = "rmd160",	/* alternate name accepted by xfrm_alg_name_match() */
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 160,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 160,
+		.sadb_alg_maxbits = 160
+	}
+},
+{	/* no compat name: matched by .name only */
+	.name = "xcbc(aes)",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 128
+	}
+},
+};
+
+static struct xfrm_algo_desc ealg_list[] = {	/* searched through xfrm_ealg_list */
+{	/* null cipher: no IV and zero key sizes */
+	.name = "ecb(cipher_null)",
+	.compat = "cipher_null",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 8,
+			.defkeybits = 0,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id =	SADB_EALG_NULL,
+		.sadb_alg_ivlen = 0,
+		.sadb_alg_minbits = 0,
+		.sadb_alg_maxbits = 0
+	}
+},
+{
+	.name = "cbc(des)",
+	.compat = "des",	/* alternate name accepted by xfrm_alg_name_match() */
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 64,
+			.defkeybits = 64,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_EALG_DESCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 64,
+		.sadb_alg_maxbits = 64
+	}
+},
+{
+	.name = "cbc(des3_ede)",
+	.compat = "des3_ede",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 64,
+			.defkeybits = 192,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_EALG_3DESCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 192,
+		.sadb_alg_maxbits = 192
+	}
+},
+{	/* variable key size: 40 to 128 bits */
+	.name = "cbc(cast5)",
+	.compat = "cast5",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 64,
+			.defkeybits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_CASTCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 40,
+		.sadb_alg_maxbits = 128
+	}
+},
+{	/* variable key size: 40 to 448 bits */
+	.name = "cbc(blowfish)",
+	.compat = "blowfish",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 64,
+			.defkeybits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_BLOWFISHCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 40,
+		.sadb_alg_maxbits = 448
+	}
+},
+{
+	.name = "cbc(aes)",
+	.compat = "aes",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 128,
+			.defkeybits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AESCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{
+	.name = "cbc(serpent)",
+	.compat = "serpent",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 128,
+			.defkeybits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_SERPENTCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256,
+	}
+},
+{
+	.name = "cbc(camellia)",
+	.compat = "camellia",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 128,
+			.defkeybits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_CAMELLIACBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{
+	.name = "cbc(twofish)",
+	.compat = "twofish",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 128,
+			.defkeybits = 128,
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_TWOFISHCBC,
+		.sadb_alg_ivlen = 8,
+		.sadb_alg_minbits = 128,
+		.sadb_alg_maxbits = 256
+	}
+},
+{	/* no compat name; key sizes below include the 32-bit nonce */
+	.name = "rfc3686(ctr(aes))",
+
+	.uinfo = {
+		.encr = {
+			.blockbits = 128,
+			.defkeybits = 160, /* 128-bit key + 32-bit nonce */
+		}
+	},
+
+	.desc = {
+		.sadb_alg_id = SADB_X_EALG_AESCTR,
+		.sadb_alg_ivlen	= 8,
+		.sadb_alg_minbits = 160,
+		.sadb_alg_maxbits = 288
+	}
+},
+};
+
+static struct xfrm_algo_desc calg_list[] = {	/* searched through xfrm_calg_list */
+{
+	.name = "deflate",
+	.uinfo = {
+		.comp = {
+			.threshold = 90,
+		}
+	},
+	.desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE }	/* only the id is set in desc */
+},
+{
+	.name = "lzs",
+	.uinfo = {
+		.comp = {
+			.threshold = 90,
+		}
+	},
+	.desc = { .sadb_alg_id = SADB_X_CALG_LZS }
+},
+{
+	.name = "lzjh",
+	.uinfo = {
+		.comp = {
+			.threshold = 50,	/* differs from the 90 used by the other two */
+		}
+	},
+	.desc = { .sadb_alg_id = SADB_X_CALG_LZJH }
+},
+};
+
+static inline int aead_entries(void)
+{
+	return ARRAY_SIZE(aead_list);	/* element count of the AEAD table */
+}
+
+static inline int aalg_entries(void)
+{
+	return ARRAY_SIZE(aalg_list);	/* element count of the authentication table */
+}
+
+static inline int ealg_entries(void)
+{
+	return ARRAY_SIZE(ealg_list);	/* element count of the encryption table */
+}
+
+static inline int calg_entries(void)
+{
+	return ARRAY_SIZE(calg_list);	/* element count of the compression table */
+}
+
+struct xfrm_algo_list {	/* one searchable table plus its probe parameters */
+	struct xfrm_algo_desc *algs;	/* first descriptor of the table */
+	int entries;			/* number of descriptors in algs */
+	u32 type;			/* type argument for crypto_has_alg() */
+	u32 mask;			/* mask argument for crypto_has_alg() */
+};
+
+static const struct xfrm_algo_list xfrm_aead_list = {	/* used by xfrm_aead_get_byname() */
+	.algs = aead_list,
+	.entries = ARRAY_SIZE(aead_list),
+	.type = CRYPTO_ALG_TYPE_AEAD,	/* type/mask go to crypto_has_alg() in xfrm_find_algo() */
+	.mask = CRYPTO_ALG_TYPE_MASK,
+};
+
+static const struct xfrm_algo_list xfrm_aalg_list = {	/* used by xfrm_aalg_get_byid/byname() */
+	.algs = aalg_list,
+	.entries = ARRAY_SIZE(aalg_list),
+	.type = CRYPTO_ALG_TYPE_HASH,	/* type/mask go to crypto_has_alg() in xfrm_find_algo() */
+	.mask = CRYPTO_ALG_TYPE_HASH_MASK,
+};
+
+static const struct xfrm_algo_list xfrm_ealg_list = {	/* used by xfrm_ealg_get_byid/byname() */
+	.algs = ealg_list,
+	.entries = ARRAY_SIZE(ealg_list),
+	.type = CRYPTO_ALG_TYPE_BLKCIPHER,	/* type/mask go to crypto_has_alg() in xfrm_find_algo() */
+	.mask = CRYPTO_ALG_TYPE_BLKCIPHER_MASK,
+};
+
+static const struct xfrm_algo_list xfrm_calg_list = {	/* used by xfrm_calg_get_byid/byname() */
+	.algs = calg_list,
+	.entries = ARRAY_SIZE(calg_list),
+	.type = CRYPTO_ALG_TYPE_COMPRESS,	/* type/mask go to crypto_has_alg() in xfrm_find_algo() */
+	.mask = CRYPTO_ALG_TYPE_MASK,
+};
+
+static struct xfrm_algo_desc *xfrm_find_algo(
+	const struct xfrm_algo_list *algo_list,
+	int match(const struct xfrm_algo_desc *entry, const void *data),
+	const void *data, int probe)
+{
+	struct xfrm_algo_desc *alg = algo_list->algs;
+	struct xfrm_algo_desc *end = alg + algo_list->entries;
+	int found;
+
+	/* Only the first entry accepted by match() is ever considered. */
+	while (alg < end && !match(alg, data))
+		alg++;
+	if (alg >= end)
+		return NULL;
+
+	if (alg->available)
+		return alg;
+	if (!probe)
+		return NULL;
+
+	/* Not flagged available yet: ask the crypto layer and cache a hit. */
+	found = crypto_has_alg(alg->name, algo_list->type, algo_list->mask);
+	if (!found)
+		return NULL;
+
+	alg->available = found;
+	return alg;
+}
+
+static int xfrm_alg_id_match(const struct xfrm_algo_desc *entry,
+			     const void *data)	/* data is the id itself, cast to a pointer */
+{
+	return entry->desc.sadb_alg_id == (unsigned long)data;
+}
+
+struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id)
+{
+	const void *id = (void *)(unsigned long)alg_id;	/* id travels as the pointer value */
+	return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_id_match, id, 1);
+}
+EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid);
+
+struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id)
+{
+	const void *id = (void *)(unsigned long)alg_id;	/* id travels as the pointer value */
+	return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_id_match, id, 1);
+}
+EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid);
+
+struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id)
+{
+	const void *id = (void *)(unsigned long)alg_id;	/* id travels as the pointer value */
+	return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_id_match, id, 1);
+}
+EXPORT_SYMBOL_GPL(xfrm_calg_get_byid);
+
+static int xfrm_alg_name_match(const struct xfrm_algo_desc *entry,
+			       const void *data)
+{
+	if (!data)	/* data is the wanted name; a NULL name matches nothing */
+		return 0;
+	return !strcmp(data, entry->name) ||
+	       (entry->compat && !strcmp(data, entry->compat));
+}
+
+struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe)
+{
+	const struct xfrm_algo_list *tbl = &xfrm_aalg_list;	/* authentication table */
+	return xfrm_find_algo(tbl, xfrm_alg_name_match, name, probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname);
+
+struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe)
+{
+	const struct xfrm_algo_list *tbl = &xfrm_ealg_list;	/* encryption table */
+	return xfrm_find_algo(tbl, xfrm_alg_name_match, name, probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname);
+
+struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe)
+{
+	const struct xfrm_algo_list *tbl = &xfrm_calg_list;	/* compression table */
+	return xfrm_find_algo(tbl, xfrm_alg_name_match, name, probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_calg_get_byname);
+
+struct xfrm_aead_name {	/* search key handed to xfrm_aead_name_match() */
+	const char *name;	/* compared with the entry's .name via strcmp() */
+	int icvbits;		/* compared with the entry's uinfo.aead.icv_truncbits */
+};
+
+static int xfrm_aead_name_match(const struct xfrm_algo_desc *entry,
+				const void *data)
+{
+	const struct xfrm_aead_name *key = data;
+	/* Both the ICV length and the (non-NULL) name have to agree. */
+	if (key->icvbits != entry->uinfo.aead.icv_truncbits || !key->name)
+		return 0;
+	return !strcmp(key->name, entry->name);
+}
+
+struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe)
+{
+	/* Search key: name plus ICV length (matched against icv_truncbits). */
+	struct xfrm_aead_name key;
+
+	key.name = name;
+	key.icvbits = icv_len;
+
+	return xfrm_find_algo(&xfrm_aead_list, xfrm_aead_name_match, &key, probe);
+}
+EXPORT_SYMBOL_GPL(xfrm_aead_get_byname);
+
+struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx)
+{
+	/* NULL for an index past the end of the authentication table. */
+	if (idx < aalg_entries())
+		return aalg_list + idx;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx);
+
+struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx)
+{
+	/* NULL for an index past the end of the encryption table. */
+	if (idx < ealg_entries())
+		return ealg_list + idx;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx);
+
+/*
+ * Probe for the availability of crypto algorithms, and set the available
+ * flag for any algorithms found on the system.  This is typically called by
+ * pfkey during userspace SA add, update or register.
+ */
+void xfrm_probe_algs(void)
+{
+	struct xfrm_algo_desc *alg;
+	int status;
+
+	BUG_ON(in_softirq());
+
+	/* Walks the auth, encryption and compression tables (not aead_list). */
+	/* Each available flag is stored only when its value actually changes. */
+	for (alg = aalg_list; alg < aalg_list + aalg_entries(); alg++) {
+		status = crypto_has_hash(alg->name, 0, CRYPTO_ALG_ASYNC);
+		if (alg->available != status)
+			alg->available = status;
+	}
+
+	for (alg = ealg_list; alg < ealg_list + ealg_entries(); alg++) {
+		status = crypto_has_blkcipher(alg->name, 0, CRYPTO_ALG_ASYNC);
+		if (alg->available != status)
+			alg->available = status;
+	}
+
+	for (alg = calg_list; alg < calg_list + calg_entries(); alg++) {
+		status = crypto_has_comp(alg->name, 0, CRYPTO_ALG_ASYNC);
+		if (alg->available != status)
+			alg->available = status;
+	}
+}
+EXPORT_SYMBOL_GPL(xfrm_probe_algs);
+
+int xfrm_count_auth_supported(void)
+{
+	const struct xfrm_algo_desc *alg = aalg_list;
+	int left = aalg_entries(), count = 0;
+	/* Tally the authentication entries whose available flag is set. */
+	for (; left > 0; left--, alg++)
+		count += alg->available ? 1 : 0;
+	return count;
+}
+EXPORT_SYMBOL_GPL(xfrm_count_auth_supported);
+
+int xfrm_count_enc_supported(void)
+{
+	const struct xfrm_algo_desc *alg = ealg_list;
+	int left = ealg_entries(), count = 0;
+	/* Tally the encryption entries whose available flag is set. */
+	for (; left > 0; left--, alg++)
+		count += alg->available ? 1 : 0;
+	return count;
+}
+EXPORT_SYMBOL_GPL(xfrm_count_enc_supported);
+
+#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
+
+void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+{
+	if (skb == tail)	/* tail is skb itself: no extra length bookkeeping */
+		return skb_put(tail, len);
+	skb->len += len;	/* tail differs: add len to skb's totals as well */
+	skb->data_len += len;
+	return skb_put(tail, len);
+}
+EXPORT_SYMBOL_GPL(pskb_put);
+#endif
diff --git a/net/xfrm/xfrm_hash.c b/net/xfrm/xfrm_hash.c
new file mode 100644
index 00000000..1e98bc0f
--- /dev/null
+++ b/net/xfrm/xfrm_hash.c
@@ -0,0 +1,39 @@
+/* xfrm_hash.c: Common hash table code.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/xfrm.h>
+
+#include "xfrm_hash.h"
+
+struct hlist_head *xfrm_hash_alloc(unsigned int sz)
+{
+	unsigned long pages;
+
+	/* Up to one page: zeroed memory from kzalloc(). */
+	if (sz <= PAGE_SIZE)
+		return kzalloc(sz, GFP_KERNEL);
+	/* Larger tables: vzalloc() when hashdist is set ... */
+	if (hashdist)
+		return vzalloc(sz);
+	/* ... otherwise zeroed pages of the order that covers sz. */
+	pages = __get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+				 get_order(sz));
+	return (struct hlist_head *)pages;
+}
+
+void xfrm_hash_free(struct hlist_head *n, unsigned int sz)
+{
+	if (sz > PAGE_SIZE && hashdist)
+		vfree(n);	/* large table allocated with vzalloc() */
+	else if (sz > PAGE_SIZE)
+		free_pages((unsigned long)n, get_order(sz));
+	else
+		kfree(n);	/* at most one page: allocated with kzalloc() */
+}
diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
new file mode 100644
index 00000000..7199d78b
--- /dev/null
+++ b/net/xfrm/xfrm_hash.h
@@ -0,0 +1,136 @@
+#ifndef _XFRM_HASH_H
+#define _XFRM_HASH_H
+
+#include <linux/xfrm.h>
+#include <linux/socket.h>
+
+static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
+{
+	return ntohl(addr->a4);	/* the a4 word converted with ntohl() */
+}
+
+static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
+{
+	return ntohl(addr->a6[2] ^ addr->a6[3]);	/* only words 2 and 3 feed the hash */
+}
+
+static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
+						    const xfrm_address_t *saddr)
+{
+	/* Add the two raw a4 words first, then ntohl() the total. */
+	return ntohl((__force __be32)((__force u32)daddr->a4 + (__force u32)saddr->a4));
+}
+
+static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
+						    const xfrm_address_t *saddr)
+{
+	__be32 mix = daddr->a6[2] ^ daddr->a6[3] ^ saddr->a6[2] ^ saddr->a6[3];
+	return ntohl(mix);	/* only words 2 and 3 of each address are used */
+}
+
+static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr,
+					   const xfrm_address_t *saddr,
+					   u32 reqid, unsigned short family,
+					   unsigned int hmask)
+{
+	unsigned int h = family ^ reqid;
+
+	/* Mix in the address pair for IPv4/IPv6; other families add nothing. */
+	if (family == AF_INET)
+		h ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
+	else if (family == AF_INET6)
+		h ^= __xfrm6_daddr_saddr_hash(daddr, saddr);
+
+	h ^= h >> 16;		/* fold the upper half onto the lower */
+	return h & hmask;
+}
+
+static inline unsigned __xfrm_src_hash(const xfrm_address_t *daddr,
+				       const xfrm_address_t *saddr,
+				       unsigned short family,
+				       unsigned int hmask)
+{
+	unsigned int h = family;
+
+	/* Mix in the address pair for IPv4/IPv6; other families add nothing. */
+	if (family == AF_INET)
+		h ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
+	else if (family == AF_INET6)
+		h ^= __xfrm6_daddr_saddr_hash(daddr, saddr);
+
+	h ^= h >> 16;		/* fold the upper half onto the lower */
+	return h & hmask;
+}
+
+static inline unsigned int
+__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto,
+		unsigned short family, unsigned int hmask)
+{
+	unsigned int h = (__force u32)spi ^ proto;
+
+	/* Only IPv4 and IPv6 destinations contribute to the hash. */
+	if (family == AF_INET)
+		h ^= __xfrm4_addr_hash(daddr);
+	else if (family == AF_INET6)
+		h ^= __xfrm6_addr_hash(daddr);
+
+	h ^= (h >> 10) ^ (h >> 20);
+	return h & hmask;
+}
+
+static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
+{
+	return (index ^ (index >> 8)) & hmask;	/* xor in index >> 8, then apply hmask */
+}
+
+static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
+				      unsigned short family, unsigned int hmask)
+{
+	unsigned int h = 0;
+
+	/*
+	 * Only selectors with full-length prefixes on both sides (32 for
+	 * IPv4, 128 for IPv6) are hashed; anything else yields hmask + 1,
+	 * one more than the largest value "h & hmask" can produce.
+	 */
+	switch (family) {
+	case AF_INET:
+		if (!(sel->prefixlen_d == 32 && sel->prefixlen_s == 32))
+			return hmask + 1;
+		h = __xfrm4_daddr_saddr_hash(&sel->daddr, &sel->saddr);
+		break;
+	case AF_INET6:
+		if (!(sel->prefixlen_d == 128 && sel->prefixlen_s == 128))
+			return hmask + 1;
+		h = __xfrm6_daddr_saddr_hash(&sel->daddr, &sel->saddr);
+		break;
+	default:
+		break;		/* other families hash to 0 */
+	}
+
+	return (h ^ (h >> 16)) & hmask;
+}
+
+static inline unsigned int __addr_hash(const xfrm_address_t *daddr,
+				       const xfrm_address_t *saddr,
+				       unsigned short family, unsigned int hmask)
+{
+	unsigned int h;
+
+	/* Hash the address pair; families other than IPv4/IPv6 hash as 0. */
+	if (family == AF_INET)
+		h = __xfrm4_daddr_saddr_hash(daddr, saddr);
+	else if (family == AF_INET6)
+		h = __xfrm6_daddr_saddr_hash(daddr, saddr);
+	else
+		h = 0;
+
+	/* Fold the high 16 bits down before applying hmask. */
+	h ^= h >> 16;
+	return h & hmask;
+}
+
+extern struct hlist_head *xfrm_hash_alloc(unsigned int sz);
+extern void xfrm_hash_free(struct hlist_head *n, unsigned int sz);
+
+#endif /* _XFRM_HASH_H */
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
new file mode 100644
index 00000000..54a0dc2e
--- /dev/null
+++ b/net/xfrm/xfrm_input.c
@@ -0,0 +1,291 @@
+/*
+ * xfrm_input.c
+ *
+ * Changes:
+ * 	YOSHIFUJI Hideaki @USAGI
+ * 		Split up af-specific portion
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/dst.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+static struct kmem_cache *secpath_cachep __read_mostly;
+
+void __secpath_destroy(struct sec_path *sp)
+{
+	int i;
+	for (i = 0; i < sp->len; i++)	/* release each of the sp->len states */
+		xfrm_state_put(sp->xvec[i]);
+	kmem_cache_free(secpath_cachep, sp);	/* then return sp to its slab cache */
+}
+EXPORT_SYMBOL(__secpath_destroy);
+
+struct sec_path *secpath_dup(struct sec_path *src)
+{
+	struct sec_path *copy = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC);
+	int i;
+
+	if (!copy)
+		return NULL;
+
+	/* Without a source the new path starts out with no states. */
+	if (src)
+		memcpy(copy, src, sizeof(*copy));
+	else
+		copy->len = 0;
+	/* Every state pointer copied over gets a reference of its own. */
+	for (i = 0; i < copy->len; i++)
+		xfrm_state_hold(copy->xvec[i]);
+	atomic_set(&copy->refcnt, 1);
+	return copy;
+}
+EXPORT_SYMBOL(secpath_dup);
+
+/* Fetch spi and seq from ipsec header */
+
+int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq)
+{
+	int spi_off, seq_off, hdr_len;
+	unsigned char *hdr;
+
+	switch (nexthdr) {
+	case IPPROTO_COMP:
+		if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr)))
+			return -EINVAL;
+		/* 16-bit value at offset 2, widened into a 32-bit SPI */
+		*spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2)));
+		*seq = 0;
+		return 0;
+	case IPPROTO_AH:
+		hdr_len = sizeof(struct ip_auth_hdr);
+		spi_off = offsetof(struct ip_auth_hdr, spi);
+		seq_off = offsetof(struct ip_auth_hdr, seq_no);
+		break;
+	case IPPROTO_ESP:
+		hdr_len = sizeof(struct ip_esp_hdr);
+		spi_off = offsetof(struct ip_esp_hdr, spi);
+		seq_off = offsetof(struct ip_esp_hdr, seq_no);
+		break;
+	default:
+		return 1;	/* neither AH, ESP nor IPComp */
+	}
+	if (!pskb_may_pull(skb, hdr_len))
+		return -EINVAL;
+	hdr = skb_transport_header(skb);
+	*spi = *(__be32 *)(hdr + spi_off);
+	*seq = *(__be32 *)(hdr + seq_off);
+	return 0;
+}
+
+int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_mode *mode = x->inner_mode;
+	int err = x->outer_mode->afinfo->extract_input(x, skb);
+
+	if (err)
+		return err;
+
+	/* An AF_UNSPEC selector picks the inner mode from the packet protocol. */
+	if (x->sel.family == AF_UNSPEC) {
+		mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+		if (!mode)
+			return -EAFNOSUPPORT;
+	}
+
+	skb->protocol = mode->afinfo->eth_proto;
+	return mode->input2(x, skb);
+}
+EXPORT_SYMBOL(xfrm_prepare_input);
+
+/*
+ * Run skb through the inbound state(s) selected by SPI/nexthdr, looping
+ * while xfrm_parse_spi() finds another header.  Packets that fail are
+ * freed here and 0 is returned; a negative encap_type resumes after an
+ * asynchronous type->input() (see xfrm_input_resume()).
+ */
+int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+	struct net *net = dev_net(skb->dev);
+	int err;
+	__be32 seq, seq_hi;
+	struct xfrm_state *x;
+	xfrm_address_t *daddr;
+	struct xfrm_mode *inner_mode;
+	unsigned int family;
+	int decaps = 0, async = 0;
+
+	/* A negative encap_type indicates async resumption. */
+	if (encap_type < 0) {
+		async = 1;
+		x = xfrm_input_state(skb);
+		seq = XFRM_SKB_CB(skb)->seq.input.low;
+		goto resume;
+	}
+
+	/* Allocate new secpath or COW existing one. */
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		struct sec_path *sp;
+
+		sp = secpath_dup(skb->sp);
+		if (!sp) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
+			goto drop;
+		}
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+
+	daddr = (xfrm_address_t *)(skb_network_header(skb) +
+				   XFRM_SPI_SKB_CB(skb)->daddroff);
+	family = XFRM_SPI_SKB_CB(skb)->family;
+
+	seq = 0;
+	if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+		goto drop;
+	}
+
+	do {
+		if (skb->sp->len == XFRM_MAX_DEPTH) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+			goto drop;
+		}
+
+		x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, family);
+		if (x == NULL) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
+			xfrm_audit_state_notfound(skb, family, spi, seq);
+			goto drop;
+		}
+
+		skb->sp->xvec[skb->sp->len++] = x;	/* recorded before any check below */
+
+		spin_lock(&x->lock);
+		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID);
+			goto drop_unlock;
+		}
+
+		if ((x->encap ? x->encap->encap_type : 0) != encap_type) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
+			goto drop_unlock;
+		}
+
+		if (x->repl->check(x, skb, seq)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
+			goto drop_unlock;
+		}
+
+		if (xfrm_state_check_expire(x)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED);
+			goto drop_unlock;
+		}
+
+		spin_unlock(&x->lock);
+
+		seq_hi = htonl(xfrm_replay_seqhi(x, seq));
+		XFRM_SKB_CB(skb)->seq.input.low = seq;
+		XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
+
+		skb_dst_force(skb);
+		nexthdr = x->type->input(x, skb);
+		if (nexthdr == -EINPROGRESS)	/* finished later via xfrm_input_resume() */
+			return 0;
+
+resume:
+		spin_lock(&x->lock);
+		if (nexthdr <= 0) {
+			if (nexthdr == -EBADMSG) {
+				xfrm_audit_state_icvfail(x, skb,
+							 x->type->proto);
+				x->stats.integrity_failed++;
+			}
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR);
+			goto drop_unlock;
+		}
+
+		/* only the first xfrm gets the encap type */
+		encap_type = 0;
+
+		if (async && x->repl->check(x, skb, seq)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR);
+			goto drop_unlock;
+		}
+
+		x->repl->advance(x, seq);
+		x->curlft.bytes += skb->len;
+		x->curlft.packets++;
+
+		spin_unlock(&x->lock);
+
+		XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
+		inner_mode = x->inner_mode;
+		if (x->sel.family == AF_UNSPEC) {
+			inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+			if (inner_mode == NULL) {
+				/* count this drop too; every other drop path bumps a counter */
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
+				goto drop;
+			}
+		}
+
+		if (inner_mode->input(x, skb)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
+			goto drop;
+		}
+
+		if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) {
+			decaps = 1;
+			break;
+		}
+
+		/*
+		 * We need the inner address.  However, we only get here for
+		 * transport mode so the outer address is identical.
+		 */
+		daddr = &x->id.daddr;
+		family = x->outer_mode->afinfo->family;
+
+		err = xfrm_parse_spi(skb, nexthdr, &spi, &seq);
+		if (err < 0) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+			goto drop;
+		}
+	} while (!err);	/* err > 0: next header is not handled by xfrm_parse_spi() */
+
+	nf_reset(skb);
+
+	if (decaps) {
+		skb_dst_drop(skb);
+		netif_rx(skb);
+		return 0;
+	} else {
+		return x->inner_mode->afinfo->transport_finish(skb, async);
+	}
+
+drop_unlock:
+	spin_unlock(&x->lock);
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_input);
+
+int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
+{
+	return xfrm_input(skb, nexthdr, 0, -1);	/* encap_type -1 selects the async resume path */
+}
+EXPORT_SYMBOL(xfrm_input_resume);
+
+void __init xfrm_input_init(void)
+{
+	/* Slab cache that secpath_dup() allocates struct sec_path from. */
+	const unsigned long flags = SLAB_HWCACHE_ALIGN | SLAB_PANIC;
+	secpath_cachep = kmem_cache_create("secpath_cache",
+					   sizeof(struct sec_path), 0, flags, NULL);
+}
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
new file mode 100644
index 00000000..fc91ad7e
--- /dev/null
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -0,0 +1,383 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2003-2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Todo:
+ *   - Tunable compression parameters.
+ *   - Compression stats.
+ *   - Adaptive compression.
+ */
+
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/ipcomp.h>
+#include <net/xfrm.h>
+
+/*
+ * One set of per-CPU compression transforms for a single algorithm.
+ * Sets live on ipcomp_tfms_list and are shared between callers of
+ * ipcomp_alloc_tfms() that ask for the same algorithm name.
+ */
+struct ipcomp_tfms {
+	/* Linkage into ipcomp_tfms_list. */
+	struct list_head list;
+	/* Per-CPU array holding one crypto_comp pointer per possible CPU. */
+	struct crypto_comp * __percpu *tfms;
+	/* Reference count; the set is torn down when it drops to zero. */
+	int users;
+};
+
+/* Held by ipcomp_init_state()/ipcomp_destroy() around the shared state below. */
+static DEFINE_MUTEX(ipcomp_resource_mutex);
+/* Per-CPU pointers to vmalloc'ed buffers of IPCOMP_SCRATCH_SIZE bytes. */
+static void * __percpu *ipcomp_scratches;
+/* Number of users currently holding a reference on ipcomp_scratches. */
+static int ipcomp_scratch_users;
+/* Every live struct ipcomp_tfms, searched by algorithm name. */
+static LIST_HEAD(ipcomp_tfms_list);
+
+/*
+ * Decompress the payload of @skb in place.
+ *
+ * The whole of skb->data (skb->len bytes) is decompressed into this CPU's
+ * scratch buffer, then copied back: first into the linear area, as far as
+ * the existing tailroom allows, and the remainder into newly allocated
+ * page fragments.
+ *
+ * Returns 0 on success, the error from crypto_comp_decompress(), -EINVAL
+ * if the output is not at least sizeof(struct ip_comp_hdr) bytes larger
+ * than the input, -EMSGSIZE if the skb has no free fragment slot left, or
+ * -ENOMEM if a page cannot be allocated.
+ */
+static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipcomp_data *ipcd = x->data;
+	const int plen = skb->len;
+	int dlen = IPCOMP_SCRATCH_SIZE;
+	const u8 *start = skb->data;
+	/* get_cpu() pins us to this CPU until the put_cpu() at "out". */
+	const int cpu = get_cpu();
+	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+	int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+	int len;
+
+	if (err)
+		goto out;
+
+	/* Reject output that did not grow by at least one IPComp header. */
+	if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Extend the linear area by the growth, capped at the tailroom. */
+	len = dlen - plen;
+	if (len > skb_tailroom(skb))
+		len = skb_tailroom(skb);
+
+	__skb_put(skb, len);
+
+	/* len now covers the whole linear area: old payload plus extension. */
+	len += plen;
+	skb_copy_to_linear_data(skb, scratch, len);
+
+	/*
+	 * Each pass advances scratch past what was just copied and reduces
+	 * dlen by the same amount; whatever is left goes into one more page.
+	 */
+	while ((scratch += len, dlen -= len) > 0) {
+		skb_frag_t *frag;
+
+		err = -EMSGSIZE;
+		if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS))
+			goto out;
+
+		frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags;
+		frag->page = alloc_page(GFP_ATOMIC);
+
+		err = -ENOMEM;
+		if (!frag->page)
+			goto out;
+
+		/* Copy at most one page per fragment. */
+		len = PAGE_SIZE;
+		if (dlen < len)
+			len = dlen;
+
+		memcpy(page_address(frag->page), scratch, len);
+
+		frag->page_offset = 0;
+		frag->size = len;
+		skb->truesize += len;
+		skb->data_len += len;
+		skb->len += len;
+
+		/* Publish the fragment only once it is fully set up. */
+		skb_shinfo(skb)->nr_frags++;
+	}
+
+	err = 0;
+
+out:
+	put_cpu();
+	return err;
+}
+
+/*
+ * Receive-side handler: strip the IPComp header from @skb and decompress
+ * the payload behind it.
+ *
+ * Returns the next-header value taken from the IPComp header on success,
+ * -ENOMEM if the skb cannot be linearized, or the error reported by
+ * ipcomp_decompress().
+ */
+int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_comp_hdr *hdr;
+	int next_proto;
+	int rc;
+
+	if (skb_linearize_cow(skb))
+		return -ENOMEM;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Remember the inner protocol before the header is pulled off. */
+	hdr = (void *)skb->data;
+	next_proto = hdr->nexthdr;
+
+	skb->transport_header = skb->network_header + sizeof(*hdr);
+	__skb_pull(skb, sizeof(*hdr));
+
+	rc = ipcomp_decompress(x, skb);
+	if (rc)
+		return rc;
+
+	return next_proto;
+}
+EXPORT_SYMBOL_GPL(ipcomp_input);
+
+/*
+ * Compress the payload of @skb in place, leaving room for an IPComp
+ * header in front of the compressed data.
+ *
+ * The payload is compressed into this CPU's scratch buffer and copied back
+ * to skb->data + sizeof(struct ip_comp_hdr); the skb is then trimmed to
+ * the compressed length plus the header.
+ *
+ * Returns 0 on success, the error from crypto_comp_compress(), or
+ * -EMSGSIZE when compressed data plus header would not be smaller than
+ * the original payload.
+ *
+ * Bottom halves stay disabled from the compression call until the result
+ * has been copied out of the scratch buffer.  ipcomp_decompress() uses the
+ * same per-CPU scratch buffer; if BHs were re-enabled before the copy,
+ * code running in softirq context on this CPU could overwrite the buffer
+ * and corrupt the outgoing packet.
+ */
+static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipcomp_data *ipcd = x->data;
+	const int plen = skb->len;
+	int dlen = IPCOMP_SCRATCH_SIZE;
+	u8 *start = skb->data;
+	const int cpu = get_cpu();
+	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+	int err;
+
+	local_bh_disable();
+	err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
+	if (err)
+		goto out;
+
+	/* Not worth it unless the result, header included, is smaller. */
+	if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
+	/* The scratch buffer is no longer needed; softirqs may run again. */
+	local_bh_enable();
+	put_cpu();
+
+	pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr));
+	return 0;
+
+out:
+	local_bh_enable();
+	put_cpu();
+	return err;
+}
+
+/*
+ * Transmit-side handler: try to compress @skb and, if that worked, turn
+ * it into an IPComp datagram.
+ *
+ * Compression is skipped when the packet is shorter than the configured
+ * threshold, when it cannot be linearized, or when ipcomp_compress()
+ * reports an error; in all those cases the packet goes out unchanged.
+ * Always returns 0.
+ */
+int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipcomp_data *ipcd = x->data;
+	struct ip_comp_hdr *hdr;
+
+	/* The tests short-circuit in the same order as the original gotos. */
+	if (skb->len >= ipcd->threshold &&
+	    !skb_linearize_cow(skb) &&
+	    !ipcomp_compress(x, skb)) {
+		/* Install ipcomp header, convert into ipcomp datagram. */
+		hdr = ip_comp_hdr(skb);
+		hdr->nexthdr = *skb_mac_header(skb);
+		hdr->flags = 0;
+		hdr->cpi = htons((u16 )ntohl(x->id.spi));
+		*skb_mac_header(skb) = IPPROTO_COMP;
+	}
+
+	skb_push(skb, -skb_network_offset(skb));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ipcomp_output);
+
+/*
+ * Drop one reference on the shared per-CPU scratch buffers and free them
+ * when the last reference goes away.
+ *
+ * The global pointer is cleared before the memory is released.
+ * ipcomp_alloc_scratches() increments the user count before calling
+ * alloc_percpu() and does not touch ipcomp_scratches when that allocation
+ * fails; the caller's cleanup then ends up here with the count back at
+ * zero.  Without the reset, a stale pointer left over from an earlier
+ * teardown would be freed a second time.
+ */
+static void ipcomp_free_scratches(void)
+{
+	int i;
+	void * __percpu *scratches;
+
+	if (--ipcomp_scratch_users)
+		return;
+
+	scratches = ipcomp_scratches;
+	if (!scratches)
+		return;
+
+	ipcomp_scratches = NULL;
+
+	for_each_possible_cpu(i)
+		vfree(*per_cpu_ptr(scratches, i));
+
+	free_percpu(scratches);
+}
+
+/*
+ * Take a reference on the shared per-CPU scratch buffers, allocating them
+ * on first use.
+ *
+ * Returns the per-CPU pointer array, or NULL on allocation failure.  The
+ * user count is incremented even when NULL is returned, and a partially
+ * populated array is left in ipcomp_scratches; the caller is expected to
+ * undo both through ipcomp_free_scratches() (ipcomp_init_state() does so
+ * via ipcomp_free_data()).
+ */
+static void * __percpu *ipcomp_alloc_scratches(void)
+{
+	int i;
+	void * __percpu *scratches;
+
+	/* Someone already holds a reference: share the existing buffers. */
+	if (ipcomp_scratch_users++)
+		return ipcomp_scratches;
+
+	scratches = alloc_percpu(void *);
+	if (!scratches)
+		return NULL;
+
+	/* Publish before filling in, so the free path can find the array. */
+	ipcomp_scratches = scratches;
+
+	for_each_possible_cpu(i) {
+		void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
+		if (!scratch)
+			return NULL;
+		*per_cpu_ptr(scratches, i) = scratch;
+	}
+
+	return scratches;
+}
+
+/*
+ * Drop one reference on the transform set identified by @tfms.  When the
+ * last reference goes away the bookkeeping entry is unlinked and freed,
+ * and every per-CPU transform is released.
+ *
+ * @tfms may be NULL: ipcomp_alloc_tfms() calls this on its error path
+ * after linking an entry whose per-CPU array could not be allocated.
+ */
+static void ipcomp_free_tfms(struct crypto_comp * __percpu *tfms)
+{
+	struct ipcomp_tfms *pos;
+	int cpu;
+
+	list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+		if (pos->tfms == tfms)
+			break;
+	}
+
+	/*
+	 * list_for_each_entry() never leaves its cursor NULL: if the loop
+	 * runs to completion, pos is computed from the list head itself.
+	 * Check for that, and bail out rather than decrement and free
+	 * memory that is not a struct ipcomp_tfms.
+	 */
+	if (WARN_ON(&pos->list == &ipcomp_tfms_list))
+		return;
+
+	if (--pos->users)
+		return;
+
+	list_del(&pos->list);
+	kfree(pos);
+
+	if (!tfms)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu);
+		crypto_free_comp(tfm);
+	}
+	free_percpu(tfms);
+}
+
+/*
+ * Get a per-CPU set of compression transforms for @alg_name.
+ *
+ * An existing set on ipcomp_tfms_list whose transform reports the same
+ * name is reused with its user count bumped.  Otherwise a new entry is
+ * linked into the list and one transform is allocated per possible CPU.
+ *
+ * Returns the per-CPU array, or NULL if any allocation fails.
+ */
+static struct crypto_comp * __percpu *ipcomp_alloc_tfms(const char *alg_name)
+{
+	struct ipcomp_tfms *pos;
+	struct crypto_comp * __percpu *tfms;
+	int cpu;
+
+	/* This can be any valid CPU ID so we don't need locking. */
+	cpu = raw_smp_processor_id();
+
+	list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+		struct crypto_comp *tfm;
+
+		tfms = pos->tfms;
+		tfm = *per_cpu_ptr(tfms, cpu);
+
+		if (!strcmp(crypto_comp_name(tfm), alg_name)) {
+			pos->users++;
+			return tfms;
+		}
+	}
+
+	pos = kmalloc(sizeof(*pos), GFP_KERNEL);
+	if (!pos)
+		return NULL;
+
+	/*
+	 * Link the entry before allocating anything else so that the error
+	 * path below can release everything through ipcomp_free_tfms().
+	 */
+	pos->users = 1;
+	INIT_LIST_HEAD(&pos->list);
+	list_add(&pos->list, &ipcomp_tfms_list);
+
+	pos->tfms = tfms = alloc_percpu(struct crypto_comp *);
+	if (!tfms)
+		goto error;
+
+	for_each_possible_cpu(cpu) {
+		struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0,
+							    CRYPTO_ALG_ASYNC);
+		if (IS_ERR(tfm))
+			goto error;
+		*per_cpu_ptr(tfms, cpu) = tfm;
+	}
+
+	return tfms;
+
+error:
+	/* tfms may be NULL or only partly filled in at this point. */
+	ipcomp_free_tfms(tfms);
+	return NULL;
+}
+
+/*
+ * Release the shared resources referenced by @ipcd: its transform set,
+ * if one was attached, and one reference on the scratch buffers.
+ * Does not free @ipcd itself.
+ */
+static void ipcomp_free_data(struct ipcomp_data *ipcd)
+{
+	struct crypto_comp * __percpu *tfms = ipcd->tfms;
+
+	if (tfms)
+		ipcomp_free_tfms(tfms);
+
+	ipcomp_free_scratches();
+}
+
+/*
+ * Tear down the IPComp private data of state @x.  Does nothing when the
+ * state has no private data attached.
+ */
+void ipcomp_destroy(struct xfrm_state *x)
+{
+	struct ipcomp_data *data = x->data;
+
+	if (data) {
+		xfrm_state_delete_tunnel(x);
+
+		/* Shared transforms and scratches are guarded by the mutex. */
+		mutex_lock(&ipcomp_resource_mutex);
+		ipcomp_free_data(data);
+		mutex_unlock(&ipcomp_resource_mutex);
+
+		kfree(data);
+	}
+}
+EXPORT_SYMBOL_GPL(ipcomp_destroy);
+
+/*
+ * Set up the IPComp private data for state @x.
+ *
+ * Returns 0 on success, -EINVAL when the state has no compression
+ * algorithm or has encapsulation configured, or -ENOMEM when the private
+ * data, the scratch buffers or the transforms cannot be allocated.
+ */
+int ipcomp_init_state(struct xfrm_state *x)
+{
+	int err;
+	struct ipcomp_data *ipcd;
+	struct xfrm_algo_desc *calg_desc;
+
+	err = -EINVAL;
+	if (!x->calg)
+		goto out;
+
+	if (x->encap)
+		goto out;
+
+	err = -ENOMEM;
+	ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
+	if (!ipcd)
+		goto out;
+
+	/* Both helpers below modify state shared between all IPComp users. */
+	mutex_lock(&ipcomp_resource_mutex);
+	if (!ipcomp_alloc_scratches())
+		goto error;
+
+	ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
+	if (!ipcd->tfms)
+		goto error;
+	mutex_unlock(&ipcomp_resource_mutex);
+
+	calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
+	BUG_ON(!calg_desc);
+	/* Packets shorter than this are sent uncompressed by ipcomp_output(). */
+	ipcd->threshold = calg_desc->uinfo.comp.threshold;
+	x->data = ipcd;
+	err = 0;
+out:
+	return err;
+
+error:
+	/* Entered with the mutex held and err still set to -ENOMEM. */
+	ipcomp_free_data(ipcd);
+	mutex_unlock(&ipcomp_resource_mutex);
+	kfree(ipcd);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(ipcomp_init_state);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
new file mode 100644
index 00000000..47bacd8c
--- /dev/null
+++ b/net/xfrm/xfrm_output.c
@@ -0,0 +1,212 @@
+/*
+ * xfrm_output.c - Common IPsec encapsulation code.
+ *
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+
+static int xfrm_output2(struct sk_buff *skb);
+
+/*
+ * Make sure @skb has the headroom and tailroom needed for the next
+ * transform and the output device.
+ *
+ * Returns 0 when nothing has to be done, otherwise the result of
+ * pskb_expand_head().
+ */
+static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	int head_short = dst->header_len + LL_RESERVED_SPACE(dst->dev)
+		- skb_headroom(skb);
+	int tail_short = dst->dev->needed_tailroom - skb_tailroom(skb);
+
+	/* Enough room at both ends already. */
+	if (head_short <= 0 && tail_short <= 0)
+		return 0;
+
+	/* Only ever grow: a surplus at one end is not taken away. */
+	if (head_short < 0)
+		head_short = 0;
+	if (tail_short < 0)
+		tail_short = 0;
+
+	return pskb_expand_head(skb, head_short, tail_short, GFP_ATOMIC);
+}
+
+/*
+ * Apply transforms to @skb, starting with the one attached to its
+ * current dst and continuing until a tunnel-mode state (or the end of
+ * the bundle) is reached.
+ *
+ * @err selects the entry point: a positive value starts a fresh pass at
+ * the top of the loop, while zero or a negative value jumps to "resume",
+ * i.e. to the point right after x->type->output(), treating @err as that
+ * call's result.
+ *
+ * Returns 0 on success, -EINPROGRESS if the type's output handler
+ * returned that, or a negative error; on every other error the skb has
+ * been freed.
+ */
+static int xfrm_output_one(struct sk_buff *skb, int err)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+	struct net *net = xs_net(x);
+
+	if (err <= 0)
+		goto resume;
+
+	do {
+		err = xfrm_state_check_space(x, skb);
+		if (err) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			goto error_nolock;
+		}
+
+		err = x->outer_mode->output(x, skb);
+		if (err) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
+			goto error_nolock;
+		}
+
+		/* Expiry check, replay overflow check and accounting under x->lock. */
+		spin_lock_bh(&x->lock);
+		err = xfrm_state_check_expire(x);
+		if (err) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEEXPIRED);
+			goto error;
+		}
+
+		err = x->repl->overflow(x, skb);
+		if (err) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATESEQERROR);
+			goto error;
+		}
+
+		x->curlft.bytes += skb->len;
+		x->curlft.packets++;
+
+		spin_unlock_bh(&x->lock);
+
+		skb_dst_force(skb);
+
+		err = x->type->output(x, skb);
+		/* Returned to the caller as-is, without freeing the skb. */
+		if (err == -EINPROGRESS)
+			goto out_exit;
+
+resume:
+		if (err) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+			goto error_nolock;
+		}
+
+		/* Step to the next dst in the bundle and its state, if any. */
+		dst = skb_dst_pop(skb);
+		if (!dst) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			err = -EHOSTUNREACH;
+			goto error_nolock;
+		}
+		skb_dst_set(skb, dst);
+		x = dst->xfrm;
+	} while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
+
+	err = 0;
+
+out_exit:
+	return err;
+error:
+	spin_unlock_bh(&x->lock);
+error_nolock:
+	kfree_skb(skb);
+	goto out_exit;
+}
+
+/*
+ * Drive @skb through the remaining transforms of its bundle.
+ *
+ * Each iteration runs xfrm_output_one() and then the dst's local_out
+ * handler.  When no transform is left on the resulting dst the packet is
+ * passed to dst_output(); otherwise it goes through the POST_ROUTING
+ * netfilter hook with xfrm_output2() as continuation.  A result other
+ * than 1 from local_out or from the hook is returned to the caller.
+ *
+ * @err is forwarded to the first xfrm_output_one() call.  -EINPROGRESS
+ * from xfrm_output_one() is reported to the caller as 0.
+ */
+int xfrm_output_resume(struct sk_buff *skb, int err)
+{
+	while (likely((err = xfrm_output_one(skb, err)) == 0)) {
+		nf_reset(skb);
+
+		err = skb_dst(skb)->ops->local_out(skb);
+		if (unlikely(err != 1))
+			goto out;
+
+		/* No transform left on this dst: hand over to plain output. */
+		if (!skb_dst(skb)->xfrm)
+			return dst_output(skb);
+
+		err = nf_hook(skb_dst(skb)->ops->family,
+			      NF_INET_POST_ROUTING, skb,
+			      NULL, skb_dst(skb)->dev, xfrm_output2);
+		if (unlikely(err != 1))
+			goto out;
+	}
+
+	if (err == -EINPROGRESS)
+		err = 0;
+
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(xfrm_output_resume);
+
+/*
+ * Start transform processing for @skb from the beginning; also used as
+ * the netfilter continuation in xfrm_output_resume().
+ */
+static int xfrm_output2(struct sk_buff *skb)
+{
+	/* A positive err makes xfrm_output_one() begin a fresh pass. */
+	const int start_fresh = 1;
+
+	return xfrm_output_resume(skb, start_fresh);
+}
+
+/*
+ * Segment a GSO packet in software and send every resulting segment
+ * through xfrm_output2().
+ *
+ * The original @skb is always consumed.  Returns 0 when all segments
+ * were handed on, the error encoded in the pointer returned by
+ * skb_gso_segment(), -EINVAL if segmentation produced no segment list,
+ * or the first non-zero result of xfrm_output2(); in the last case the
+ * segments not yet sent are freed.
+ */
+static int xfrm_output_gso(struct sk_buff *skb)
+{
+	struct sk_buff *segs;
+
+	segs = skb_gso_segment(skb, 0);
+	kfree_skb(skb);
+	if (IS_ERR(segs))
+		return PTR_ERR(segs);
+	/* A NULL list is not an ERR_PTR; the loop below would dereference it. */
+	if (segs == NULL)
+		return -EINVAL;
+
+	do {
+		struct sk_buff *nskb = segs->next;
+		int err;
+
+		/* Detach the segment so it travels on as a single packet. */
+		segs->next = NULL;
+		err = xfrm_output2(segs);
+
+		if (unlikely(err)) {
+			/* Drop everything that has not been sent yet. */
+			while ((segs = nskb)) {
+				nskb = segs->next;
+				segs->next = NULL;
+				kfree_skb(segs);
+			}
+			return err;
+		}
+
+		segs = nskb;
+	} while (segs);
+
+	return 0;
+}
+
+/*
+ * Entry point of the xfrm output path.
+ *
+ * GSO packets are segmented first.  For other packets a pending partial
+ * checksum is completed in software before the transforms run; if that
+ * fails the packet is dropped and the error returned.
+ */
+int xfrm_output(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	int err;
+
+	if (skb_is_gso(skb))
+		return xfrm_output_gso(skb);
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return xfrm_output2(skb);
+
+	err = skb_checksum_help(skb);
+	if (!err)
+		return xfrm_output2(skb);
+
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+	kfree_skb(skb);
+	return err;
+}
+
+/*
+ * Run the extract_output handler of the inner mode that applies to @skb.
+ *
+ * For a state whose selector family is AF_UNSPEC the inner mode is looked
+ * up from the address family of the packet's dst; otherwise the state's
+ * own inner mode is used.  Returns -EAFNOSUPPORT when no inner mode is
+ * available, else the handler's result.
+ */
+int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_mode *mode = x->inner_mode;
+
+	if (x->sel.family == AF_UNSPEC)
+		mode = xfrm_ip2inner_mode(x,
+				xfrm_af2proto(skb_dst(skb)->ops->family));
+
+	if (!mode)
+		return -EAFNOSUPPORT;
+
+	return mode->afinfo->extract_output(x, skb);
+}
+
+EXPORT_SYMBOL_GPL(xfrm_output);
+EXPORT_SYMBOL_GPL(xfrm_inner_extract_output);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
new file mode 100644
index 00000000..0c0e40e9
--- /dev/null
+++ b/net/xfrm/xfrm_policy.c
@@ -0,0 +1,2978 @@
+/*
+ * xfrm_policy.c
+ *
+ * Changes:
+ *	Mitsuru KANDA @USAGI
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * 		IPv6 support
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	YOSHIFUJI Hideaki
+ * 		Split up af-specific portion
+ *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/audit.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#ifdef CONFIG_XFRM_STATISTICS
+#include <net/snmp.h>
+#endif
+
+#include "xfrm_hash.h"
+
+/* Exported mutex; its users are outside this part of the file. */
+DEFINE_MUTEX(xfrm_cfg_mutex);
+EXPORT_SYMBOL(xfrm_cfg_mutex);
+
+/* NOTE(review): presumably guards xfrm_policy_sk_bundles; confirm at its users. */
+static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
+static struct dst_entry *xfrm_policy_sk_bundles;
+/* Taken around lookups in, and changes to, the policy hash tables and counters. */
+static DEFINE_RWLOCK(xfrm_policy_lock);
+
+/* Per-address-family operations table; accessor prototypes follow below. */
+static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
+static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
+
+static struct kmem_cache *xfrm_dst_cache __read_mostly;
+
+/* Forward declarations for helpers defined later in the file. */
+static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
+static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
+static void xfrm_init_pmtu(struct dst_entry *dst);
+static int stale_bundle(struct dst_entry *dst);
+static int xfrm_bundle_ok(struct xfrm_dst *xdst);
+
+
+static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
+						int dir);
+
+/*
+ * Test an IPv4 flow against selector @sel.  Returns 1 when addresses
+ * (under the selector's prefix lengths), masked ports, protocol and
+ * output interface all match, 0 otherwise.  A zero protocol or ifindex
+ * in the selector matches anything.
+ */
+static inline int
+__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
+{
+	const struct flowi4 *fl4 = &fl->u.ip4;
+
+	if (!addr_match(&fl4->daddr, &sel->daddr, sel->prefixlen_d))
+		return 0;
+	if (!addr_match(&fl4->saddr, &sel->saddr, sel->prefixlen_s))
+		return 0;
+	if ((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask)
+		return 0;
+	if ((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask)
+		return 0;
+	if (sel->proto && fl4->flowi4_proto != sel->proto)
+		return 0;
+	if (sel->ifindex && fl4->flowi4_oif != sel->ifindex)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Test an IPv6 flow against selector @sel.  Returns 1 when addresses
+ * (under the selector's prefix lengths), masked ports, protocol and
+ * output interface all match, 0 otherwise.  A zero protocol or ifindex
+ * in the selector matches anything.
+ */
+static inline int
+__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
+{
+	const struct flowi6 *fl6 = &fl->u.ip6;
+
+	if (!addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d))
+		return 0;
+	if (!addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s))
+		return 0;
+	if ((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask)
+		return 0;
+	if ((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask)
+		return 0;
+	if (sel->proto && fl6->flowi6_proto != sel->proto)
+		return 0;
+	if (sel->ifindex && fl6->flowi6_oif != sel->ifindex)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Match flow @fl against selector @sel using the rules of @family.
+ * Returns non-zero on a match; families other than AF_INET and AF_INET6
+ * never match.
+ */
+int xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
+			unsigned short family)
+{
+	if (family == AF_INET)
+		return __xfrm4_selector_match(sel, fl);
+
+	if (family == AF_INET6)
+		return __xfrm6_selector_match(sel, fl);
+
+	return 0;
+}
+
+/*
+ * Look up a route through the dst_lookup handler registered for @family.
+ * Returns the handler's result, or ERR_PTR(-EAFNOSUPPORT) when no
+ * handler table is registered for the family.
+ */
+static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
+						  const xfrm_address_t *saddr,
+						  const xfrm_address_t *daddr,
+						  int family)
+{
+	struct xfrm_policy_afinfo *ops = xfrm_policy_get_afinfo(family);
+	struct dst_entry *route;
+
+	if (unlikely(!ops))
+		return ERR_PTR(-EAFNOSUPPORT);
+
+	route = ops->dst_lookup(net, tos, saddr, daddr);
+	xfrm_policy_put_afinfo(ops);
+
+	return route;
+}
+
+/*
+ * Look up the route for state @x.
+ *
+ * By default the state's own source and destination addresses are used.
+ * For types flagged XFRM_TYPE_LOCAL_COADDR the source is x->coaddr and the
+ * destination is taken from @prev_daddr; for XFRM_TYPE_REMOTE_COADDR the
+ * source is taken from @prev_saddr and the destination is x->coaddr.
+ *
+ * On success the addresses actually used are copied back into
+ * @prev_saddr/@prev_daddr.  Returns the dst or an ERR_PTR().
+ */
+static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
+						xfrm_address_t *prev_saddr,
+						xfrm_address_t *prev_daddr,
+						int family)
+{
+	struct net *net = xs_net(x);
+	xfrm_address_t *saddr = &x->props.saddr;
+	xfrm_address_t *daddr = &x->id.daddr;
+	struct dst_entry *dst;
+
+	if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
+		saddr = x->coaddr;
+		daddr = prev_daddr;
+	}
+	if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
+		saddr = prev_saddr;
+		daddr = x->coaddr;
+	}
+
+	dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family);
+
+	if (!IS_ERR(dst)) {
+		/* Skip the copy when the pointer already is the caller's buffer. */
+		if (prev_saddr != saddr)
+			memcpy(prev_saddr, saddr,  sizeof(*prev_saddr));
+		if (prev_daddr != daddr)
+			memcpy(prev_daddr, daddr,  sizeof(*prev_daddr));
+	}
+
+	return dst;
+}
+
+/*
+ * Convert @secs seconds to jiffies, saturating at MAX_SCHEDULE_TIMEOUT-1
+ * when the multiplication by HZ would reach or exceed that value.
+ */
+static inline unsigned long make_jiffies(long secs)
+{
+	if (secs < (MAX_SCHEDULE_TIMEOUT-1)/HZ)
+		return secs*HZ;
+
+	return MAX_SCHEDULE_TIMEOUT-1;
+}
+
+/*
+ * Lifetime timer of a policy; @data is the struct xfrm_policy pointer.
+ *
+ * Evaluates the four configured lifetimes against the current time.  An
+ * exceeded hard limit deletes the policy and reports a hard expiry.  An
+ * exceeded soft limit reports a soft expiry and schedules a re-check
+ * after XFRM_KM_TIMEOUT seconds.  Otherwise the timer is re-armed for the
+ * nearest pending limit.  One policy reference is dropped on exit.
+ */
+static void xfrm_policy_timer(unsigned long data)
+{
+	struct xfrm_policy *xp = (struct xfrm_policy*)data;
+	unsigned long now = get_seconds();
+	long next = LONG_MAX;
+	int warn = 0;
+	int dir;
+
+	read_lock(&xp->lock);
+
+	if (unlikely(xp->walk.dead))
+		goto out;
+
+	dir = xfrm_policy_id2dir(xp->index);
+
+	/* Hard limits: measured from add time, or from first use if used. */
+	if (xp->lft.hard_add_expires_seconds) {
+		long tmo = xp->lft.hard_add_expires_seconds +
+			xp->curlft.add_time - now;
+		if (tmo <= 0)
+			goto expired;
+		if (tmo < next)
+			next = tmo;
+	}
+	if (xp->lft.hard_use_expires_seconds) {
+		long tmo = xp->lft.hard_use_expires_seconds +
+			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
+		if (tmo <= 0)
+			goto expired;
+		if (tmo < next)
+			next = tmo;
+	}
+	/* Soft limits only raise a warning and keep the timer running. */
+	if (xp->lft.soft_add_expires_seconds) {
+		long tmo = xp->lft.soft_add_expires_seconds +
+			xp->curlft.add_time - now;
+		if (tmo <= 0) {
+			warn = 1;
+			tmo = XFRM_KM_TIMEOUT;
+		}
+		if (tmo < next)
+			next = tmo;
+	}
+	if (xp->lft.soft_use_expires_seconds) {
+		long tmo = xp->lft.soft_use_expires_seconds +
+			(xp->curlft.use_time ? : xp->curlft.add_time) - now;
+		if (tmo <= 0) {
+			warn = 1;
+			tmo = XFRM_KM_TIMEOUT;
+		}
+		if (tmo < next)
+			next = tmo;
+	}
+
+	if (warn)
+		km_policy_expired(xp, dir, 0, 0);
+	/* mod_timer() returning 0 means the timer was idle: take a ref for it. */
+	if (next != LONG_MAX &&
+	    !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
+		xfrm_pol_hold(xp);
+
+out:
+	read_unlock(&xp->lock);
+	xfrm_pol_put(xp);
+	return;
+
+expired:
+	read_unlock(&xp->lock);
+	/* Report the hard expiry only if the delete call returned 0. */
+	if (!xfrm_policy_delete(xp, dir))
+		km_policy_expired(xp, dir, 1, 0);
+	xfrm_pol_put(xp);
+}
+
+/*
+ * Flow cache "get" operation: take a reference on the policy embedding
+ * @flo and return @flo, or return NULL if the policy is already dead.
+ */
+static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *policy = container_of(flo, struct xfrm_policy, flo);
+
+	if (unlikely(policy->walk.dead))
+		return NULL;
+
+	xfrm_pol_hold(policy);
+	return flo;
+}
+
+/*
+ * Flow cache "check" operation: returns 1 while the policy embedding
+ * @flo is alive and 0 once it has been marked dead.
+ */
+static int xfrm_policy_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *policy = container_of(flo, struct xfrm_policy, flo);
+
+	if (policy->walk.dead)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Flow cache "delete" operation: drop one reference on the policy
+ * embedding @flo.
+ */
+static void xfrm_policy_flo_delete(struct flow_cache_object *flo)
+{
+	struct xfrm_policy *policy = container_of(flo, struct xfrm_policy, flo);
+
+	xfrm_pol_put(policy);
+}
+
+/*
+ * Flow cache callbacks for objects embedded in struct xfrm_policy;
+ * installed into policy->flo.ops by xfrm_policy_alloc().
+ */
+static const struct flow_cache_ops xfrm_policy_fc_ops = {
+	.get = xfrm_policy_flo_get,
+	.check = xfrm_policy_flo_check,
+	.delete = xfrm_policy_flo_delete,
+};
+
+/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
+ * SPD calls.
+ */
+
+struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
+{
+	struct xfrm_policy *xp;
+
+	xp = kzalloc(sizeof(struct xfrm_policy), gfp);
+	if (!xp)
+		return NULL;
+
+	write_pnet(&xp->xp_net, net);
+
+	/* List and hash linkage start out empty. */
+	INIT_LIST_HEAD(&xp->walk.all);
+	INIT_HLIST_NODE(&xp->bydst);
+	INIT_HLIST_NODE(&xp->byidx);
+
+	rwlock_init(&xp->lock);
+	/* The caller owns the initial reference. */
+	atomic_set(&xp->refcnt, 1);
+
+	/* The lifetime timer is set up here but not started. */
+	setup_timer(&xp->timer, xfrm_policy_timer,
+			(unsigned long)xp);
+	xp->flo.ops = &xfrm_policy_fc_ops;
+
+	return xp;
+}
+EXPORT_SYMBOL(xfrm_policy_alloc);
+
+/* Destroy xfrm_policy: descendant resources must be released by this moment. */
+
+void xfrm_policy_destroy(struct xfrm_policy *policy)
+{
+	int timer_was_pending;
+
+	/* Only policies already marked dead may be destroyed. */
+	BUG_ON(!policy->walk.dead);
+
+	/* A still-pending timer at this point is a bug. */
+	timer_was_pending = del_timer(&policy->timer);
+	if (timer_was_pending)
+		BUG();
+
+	security_xfrm_policy_free(policy->security);
+	kfree(policy);
+}
+EXPORT_SYMBOL(xfrm_policy_destroy);
+
+/* Rule must be locked. Release descendant resources, announce
+ * entry dead. The rule must be unlinked from lists by this moment.
+ */
+
+static void xfrm_policy_kill(struct xfrm_policy *policy)
+{
+	int timer_was_pending;
+
+	policy->walk.dead = 1;
+
+	atomic_inc(&policy->genid);
+
+	/* A pending timer held its own reference; drop it along with the timer. */
+	timer_was_pending = del_timer(&policy->timer);
+	if (timer_was_pending)
+		xfrm_pol_put(policy);
+
+	/* Drop one further reference in every case. */
+	xfrm_pol_put(policy);
+}
+
+/* Upper bound on hash table size, in buckets, used by the *_should_resize() tests. */
+static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
+
+/* Bucket number of policy index @index in @net's by-index hash table. */
+static inline unsigned int idx_hash(struct net *net, u32 index)
+{
+	unsigned int mask = net->xfrm.policy_idx_hmask;
+
+	return __idx_hash(index, mask);
+}
+
+/*
+ * Return the chain that holds policies with selector @sel for direction
+ * @dir.  A hash value of hmask + 1 from __sel_hash() selects the
+ * per-direction "inexact" list instead of a hash bucket.
+ */
+static struct hlist_head *policy_hash_bysel(struct net *net,
+					    const struct xfrm_selector *sel,
+					    unsigned short family, int dir)
+{
+	unsigned int mask = net->xfrm.policy_bydst[dir].hmask;
+	unsigned int bucket = __sel_hash(sel, family, mask);
+
+	if (bucket == mask + 1)
+		return &net->xfrm.policy_inexact[dir];
+
+	return net->xfrm.policy_bydst[dir].table + bucket;
+}
+
+/*
+ * Return the by-destination hash bucket for the address pair
+ * (@daddr, @saddr) in direction @dir.
+ */
+static struct hlist_head *policy_hash_direct(struct net *net,
+					     const xfrm_address_t *daddr,
+					     const xfrm_address_t *saddr,
+					     unsigned short family, int dir)
+{
+	unsigned int mask = net->xfrm.policy_bydst[dir].hmask;
+	unsigned int bucket;
+
+	bucket = __addr_hash(daddr, saddr, family, mask);
+
+	return net->xfrm.policy_bydst[dir].table + bucket;
+}
+
+/*
+ * Move every policy on the old chain @list into the new table
+ * @ndsttable, which is hashed with mask @nhashmask.
+ *
+ * Each pass over @list handles one destination bucket: the first entry
+ * found is put at the head of its new bucket, and every later entry that
+ * hashes to the same bucket is inserted right behind the previously moved
+ * one, so entries keep their relative order.  Entries belonging to other
+ * buckets are left for a later pass; passes repeat until @list is empty.
+ */
+static void xfrm_dst_hash_transfer(struct hlist_head *list,
+				   struct hlist_head *ndsttable,
+				   unsigned int nhashmask)
+{
+	struct hlist_node *entry, *tmp, *entry0 = NULL;
+	struct xfrm_policy *pol;
+	unsigned int h0 = 0;
+
+redo:
+	hlist_for_each_entry_safe(pol, entry, tmp, list, bydst) {
+		unsigned int h;
+
+		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
+				pol->family, nhashmask);
+		if (!entry0) {
+			/* First entry of this pass fixes the bucket (h0). */
+			hlist_del(entry);
+			hlist_add_head(&pol->bydst, ndsttable+h);
+			h0 = h;
+		} else {
+			if (h != h0)
+				continue;
+			hlist_del(entry);
+			hlist_add_after(entry0, &pol->bydst);
+		}
+		/* Remember the last moved node as the next insertion point. */
+		entry0 = entry;
+	}
+	if (!hlist_empty(list)) {
+		entry0 = NULL;
+		goto redo;
+	}
+}
+
+/*
+ * Re-hash every policy on the old by-index chain @list into the new
+ * table @nidxtable, which is hashed with mask @nhashmask.
+ *
+ * Entries are not unlinked from the old chain first; the _safe iterator
+ * fetches the next node before the current one is re-linked, and the
+ * caller (xfrm_byidx_resize()) frees the old table afterwards.
+ */
+static void xfrm_idx_hash_transfer(struct hlist_head *list,
+				   struct hlist_head *nidxtable,
+				   unsigned int nhashmask)
+{
+	struct hlist_node *entry, *tmp;
+	struct xfrm_policy *pol;
+
+	hlist_for_each_entry_safe(pol, entry, tmp, list, byidx) {
+		unsigned int h;
+
+		h = __idx_hash(pol->index, nhashmask);
+		hlist_add_head(&pol->byidx, nidxtable+h);
+	}
+}
+
+/*
+ * Return the hash mask of a table twice the size of the one described by
+ * @old_hmask (mask == number of buckets - 1).
+ */
+static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
+{
+	unsigned int new_size = (old_hmask + 1) * 2;
+
+	return new_size - 1;
+}
+
+/*
+ * Double the by-destination hash table of direction @dir.
+ *
+ * The new table is allocated before xfrm_policy_lock is taken; if the
+ * allocation fails the table is left as it is.  All chains are moved
+ * under the write lock, and the old table is freed after the lock has
+ * been released.
+ */
+static void xfrm_bydst_resize(struct net *net, int dir)
+{
+	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
+	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
+	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
+	struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
+	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
+	int i;
+
+	if (!ndst)
+		return;
+
+	write_lock_bh(&xfrm_policy_lock);
+
+	for (i = hmask; i >= 0; i--)
+		xfrm_dst_hash_transfer(odst + i, ndst, nhashmask);
+
+	net->xfrm.policy_bydst[dir].table = ndst;
+	net->xfrm.policy_bydst[dir].hmask = nhashmask;
+
+	write_unlock_bh(&xfrm_policy_lock);
+
+	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
+}
+
+/*
+ * Double the by-index hash table of @net.  @total is not used in the
+ * body.
+ *
+ * The new table is allocated before xfrm_policy_lock is taken; if the
+ * allocation fails the table is left as it is.  All chains are re-hashed
+ * under the write lock, and the old table is freed after the lock has
+ * been released.
+ */
+static void xfrm_byidx_resize(struct net *net, int total)
+{
+	unsigned int hmask = net->xfrm.policy_idx_hmask;
+	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
+	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
+	struct hlist_head *oidx = net->xfrm.policy_byidx;
+	struct hlist_head *nidx = xfrm_hash_alloc(nsize);
+	int i;
+
+	if (!nidx)
+		return;
+
+	write_lock_bh(&xfrm_policy_lock);
+
+	for (i = hmask; i >= 0; i--)
+		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
+
+	net->xfrm.policy_byidx = nidx;
+	net->xfrm.policy_idx_hmask = nhashmask;
+
+	write_unlock_bh(&xfrm_policy_lock);
+
+	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
+}
+
+/*
+ * Decide whether the by-destination table of direction @dir should grow:
+ * returns 1 when it holds more policies than its mask value and is still
+ * below xfrm_policy_hashmax buckets, 0 otherwise.
+ *
+ * If @total is not NULL the direction's policy count is added to it.
+ */
+static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
+{
+	unsigned int nr_policies = net->xfrm.policy_count[dir];
+	unsigned int mask = net->xfrm.policy_bydst[dir].hmask;
+
+	if (total)
+		*total += nr_policies;
+
+	if ((mask + 1) >= xfrm_policy_hashmax)
+		return 0;
+
+	return nr_policies > mask ? 1 : 0;
+}
+
+/*
+ * Decide whether the by-index table should grow: returns 1 when @total
+ * exceeds its mask value and the table is still below
+ * xfrm_policy_hashmax buckets, 0 otherwise.
+ */
+static inline int xfrm_byidx_should_resize(struct net *net, int total)
+{
+	unsigned int mask = net->xfrm.policy_idx_hmask;
+
+	if ((mask + 1) >= xfrm_policy_hashmax)
+		return 0;
+
+	return total > mask ? 1 : 0;
+}
+
+/*
+ * Fill @si with a snapshot of @net's policy database counters, taken
+ * under the read side of xfrm_policy_lock.
+ */
+void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
+{
+	read_lock_bh(&xfrm_policy_lock);
+	si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
+	si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
+	si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
+	/* The second half of policy_count[] is indexed by dir + XFRM_POLICY_MAX. */
+	si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX];
+	si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX];
+	si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
+	/* Reports the by-index hash mask and the global table size limit. */
+	si->spdhcnt = net->xfrm.policy_idx_hmask;
+	si->spdhmcnt = xfrm_policy_hashmax;
+	read_unlock_bh(&xfrm_policy_lock);
+}
+EXPORT_SYMBOL(xfrm_spd_getinfo);
+
+/* Serialises runs of xfrm_hash_resize(). */
+static DEFINE_MUTEX(hash_resize_mutex);
+/*
+ * Work handler behind net->xfrm.policy_hash_work: grow every
+ * by-destination table that needs it, then the by-index table if the
+ * summed policy count calls for it.
+ */
+static void xfrm_hash_resize(struct work_struct *work)
+{
+	struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
+	int dir, total;
+
+	mutex_lock(&hash_resize_mutex);
+
+	total = 0;
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		/* Also accumulates each direction's policy count into total. */
+		if (xfrm_bydst_should_resize(net, dir, &total))
+			xfrm_bydst_resize(net, dir);
+	}
+	if (xfrm_byidx_should_resize(net, total))
+		xfrm_byidx_resize(net, total);
+
+	mutex_unlock(&hash_resize_mutex);
+}
+
+/* Generate new index... KAME seems to generate them ordered by cost
+ * of an absolute unpredictability of ordering of rules. This will not pass. */
+static u32 xfrm_gen_index(struct net *net, int dir)
+{
+	/* Shared counter, advanced by 8 per attempt; @dir is OR-ed into the result. */
+	static u32 idx_generator;
+
+	for (;;) {
+		struct hlist_node *entry;
+		struct hlist_head *list;
+		struct xfrm_policy *p;
+		u32 idx;
+		int found;
+
+		idx = (idx_generator | dir);
+		idx_generator += 8;
+		/* Index 0 is never handed out; 8 is used in its place. */
+		if (idx == 0)
+			idx = 8;
+		/* Retry with the next candidate if the index is already in use. */
+		list = net->xfrm.policy_byidx + idx_hash(net, idx);
+		found = 0;
+		hlist_for_each_entry(p, entry, list, byidx) {
+			if (p->index == idx) {
+				found = 1;
+				break;
+			}
+		}
+		if (!found)
+			return idx;
+	}
+}
+
+/*
+ * Compare two selectors word by word, covering
+ * sizeof(struct xfrm_selector) / sizeof(u32) 32-bit words.
+ * Returns 0 when all compared words are equal, 1 otherwise.
+ */
+static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
+{
+	const int nwords = sizeof(struct xfrm_selector) / sizeof(u32);
+	u32 *lhs = (u32 *) s1;
+	u32 *rhs = (u32 *) s2;
+	int w = 0;
+
+	while (w < nwords) {
+		if (lhs[w] != rhs[w])
+			return 1;
+		w++;
+	}
+
+	return 0;
+}
+
+/*
+ * Insert @policy into the database for direction @dir.
+ *
+ * The policy is linked into its selector chain behind the last entry
+ * whose priority value is not greater than its own.  An existing policy
+ * with the same type, selector, matching mark and security context is
+ * treated as a duplicate: with @excl set the insert fails with -EEXIST,
+ * otherwise the old policy is unlinked, its index is reused and it is
+ * killed after the lock has been dropped.
+ *
+ * Returns 0 on success or -EEXIST.
+ */
+int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+{
+	struct net *net = xp_net(policy);
+	struct xfrm_policy *pol;
+	struct xfrm_policy *delpol;
+	struct hlist_head *chain;
+	struct hlist_node *entry, *newpos;
+	u32 mark = policy->mark.v & policy->mark.m;
+
+	write_lock_bh(&xfrm_policy_lock);
+	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+	delpol = NULL;
+	newpos = NULL;
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		if (pol->type == policy->type &&
+		    !selector_cmp(&pol->selector, &policy->selector) &&
+		    (mark & pol->mark.m) == pol->mark.v &&
+		    xfrm_sec_ctx_match(pol->security, policy->security) &&
+		    !WARN_ON(delpol)) {
+			if (excl) {
+				write_unlock_bh(&xfrm_policy_lock);
+				return -EEXIST;
+			}
+			delpol = pol;
+			if (policy->priority > pol->priority)
+				continue;
+		} else if (policy->priority >= pol->priority) {
+			/* Candidate insertion point: keep scanning forward. */
+			newpos = &pol->bydst;
+			continue;
+		}
+		/* Position is settled; stop once the duplicate has been found too. */
+		if (delpol)
+			break;
+	}
+	if (newpos)
+		hlist_add_after(newpos, &policy->bydst);
+	else
+		hlist_add_head(&policy->bydst, chain);
+	/* Reference held on behalf of the database. */
+	xfrm_pol_hold(policy);
+	net->xfrm.policy_count[dir]++;
+	atomic_inc(&flow_cache_genid);
+	if (delpol)
+		__xfrm_policy_unlink(delpol, dir);
+	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir);
+	hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
+	policy->curlft.add_time = get_seconds();
+	policy->curlft.use_time = 0;
+	/* Start the lifetime timer; it owns a reference if it was idle before. */
+	if (!mod_timer(&policy->timer, jiffies + HZ))
+		xfrm_pol_hold(policy);
+	list_add(&policy->walk.all, &net->xfrm.policy_all);
+	write_unlock_bh(&xfrm_policy_lock);
+
+	if (delpol)
+		xfrm_policy_kill(delpol);
+	else if (xfrm_bydst_should_resize(net, dir, NULL))
+		schedule_work(&net->xfrm.policy_hash_work);
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_policy_insert);
+
+/*
+ * Find the policy of @type in direction @dir whose selector equals @sel
+ * and whose mark and security context match @mark and @ctx.
+ *
+ * The policy is returned with a reference taken, or NULL if none matches.
+ * With @delete set, the security hook is consulted first: if it refuses,
+ * *@err holds its error and the policy is returned still linked;
+ * otherwise the policy is unlinked and killed before being returned.
+ * *@err is 0 in all other cases.
+ */
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
+					  int dir, struct xfrm_selector *sel,
+					  struct xfrm_sec_ctx *ctx, int delete,
+					  int *err)
+{
+	struct xfrm_policy *pol, *ret;
+	struct hlist_head *chain;
+	struct hlist_node *entry;
+
+	*err = 0;
+	write_lock_bh(&xfrm_policy_lock);
+	chain = policy_hash_bysel(net, sel, sel->family, dir);
+	ret = NULL;
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		if (pol->type == type &&
+		    (mark & pol->mark.m) == pol->mark.v &&
+		    !selector_cmp(sel, &pol->selector) &&
+		    xfrm_sec_ctx_match(ctx, pol->security)) {
+			xfrm_pol_hold(pol);
+			if (delete) {
+				*err = security_xfrm_policy_delete(
+								pol->security);
+				if (*err) {
+					write_unlock_bh(&xfrm_policy_lock);
+					return pol;
+				}
+				__xfrm_policy_unlink(pol, dir);
+			}
+			ret = pol;
+			break;
+		}
+	}
+	write_unlock_bh(&xfrm_policy_lock);
+
+	/* Kill outside the lock; the caller still holds the reference taken above. */
+	if (ret && delete)
+		xfrm_policy_kill(ret);
+	return ret;
+}
+EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
+
+/*
+ * Find the policy of @type with index @id whose mark matches @mark.
+ *
+ * If the direction encoded in @id differs from @dir, NULL is returned
+ * with *@err set to -ENOENT.  Otherwise the policy is returned with a
+ * reference taken, or NULL if none matches.  With @delete set, the
+ * security hook is consulted first: if it refuses, *@err holds its error
+ * and the policy is returned still linked; otherwise the policy is
+ * unlinked and killed before being returned.
+ */
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
+				     int dir, u32 id, int delete, int *err)
+{
+	struct xfrm_policy *pol, *ret;
+	struct hlist_head *chain;
+	struct hlist_node *entry;
+
+	*err = -ENOENT;
+	if (xfrm_policy_id2dir(id) != dir)
+		return NULL;
+
+	*err = 0;
+	write_lock_bh(&xfrm_policy_lock);
+	chain = net->xfrm.policy_byidx + idx_hash(net, id);
+	ret = NULL;
+	hlist_for_each_entry(pol, entry, chain, byidx) {
+		if (pol->type == type && pol->index == id &&
+		    (mark & pol->mark.m) == pol->mark.v) {
+			xfrm_pol_hold(pol);
+			if (delete) {
+				*err = security_xfrm_policy_delete(
+								pol->security);
+				if (*err) {
+					write_unlock_bh(&xfrm_policy_lock);
+					return pol;
+				}
+				__xfrm_policy_unlink(pol, dir);
+			}
+			ret = pol;
+			break;
+		}
+	}
+	write_unlock_bh(&xfrm_policy_lock);
+
+	/* Kill outside the lock; the caller still holds the reference taken above. */
+	if (ret && delete)
+		xfrm_policy_kill(ret);
+	return ret;
+}
+EXPORT_SYMBOL(xfrm_policy_byid);
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+/*
+ * Ask the security hook whether every policy of @type may be deleted,
+ * without deleting anything.  Both the inexact list and all hash buckets
+ * of each direction are checked.
+ *
+ * Returns 0 if all deletions are permitted.  On the first refusal the
+ * failed attempt is audited and the hook's error is returned.
+ */
+static inline int
+xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info)
+{
+	int dir, err = 0;
+
+	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+		struct xfrm_policy *pol;
+		struct hlist_node *entry;
+		int i;
+
+		hlist_for_each_entry(pol, entry,
+				     &net->xfrm.policy_inexact[dir], bydst) {
+			if (pol->type != type)
+				continue;
+			err = security_xfrm_policy_delete(pol->security);
+			if (err) {
+				xfrm_audit_policy_delete(pol, 0,
+							 audit_info->loginuid,
+							 audit_info->sessionid,
+							 audit_info->secid);
+				return err;
+			}
+		}
+		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
+			hlist_for_each_entry(pol, entry,
+					     net->xfrm.policy_bydst[dir].table + i,
+					     bydst) {
+				if (pol->type != type)
+					continue;
+				err = security_xfrm_policy_delete(
+								pol->security);
+				if (err) {
+					xfrm_audit_policy_delete(pol, 0,
+							audit_info->loginuid,
+							audit_info->sessionid,
+							audit_info->secid);
+					return err;
+				}
+			}
+		}
+	}
+	return err;
+}
+#else
+/* Without CONFIG_SECURITY_NETWORK_XFRM there is no hook to ask: always 0. */
+static inline int
+xfrm_policy_flush_secctx_check(struct net *net, u8 type, struct xfrm_audit *audit_info)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Remove every policy of @type from @net, in all directions.
+ *
+ * @net:        namespace to flush
+ * @type:       policy type to remove
+ * @audit_info: login uid / session id / secid recorded with each deletion
+ *
+ * Returns 0 when at least one policy was removed, the error from
+ * xfrm_policy_flush_secctx_check() when that check fails (nothing is
+ * removed in that case), or -ESRCH when no policy of @type was found.
+ */
+int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
+{
+	int dir, err = 0, cnt = 0;
+
+	write_lock_bh(&xfrm_policy_lock);
+
+	err = xfrm_policy_flush_secctx_check(net, type, audit_info);
+	if (err)
+		goto out;
+
+	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
+		struct xfrm_policy *pol;
+		struct hlist_node *entry;
+		int i;
+
+	again1:
+		hlist_for_each_entry(pol, entry,
+				     &net->xfrm.policy_inexact[dir], bydst) {
+			if (pol->type != type)
+				continue;
+			__xfrm_policy_unlink(pol, dir);
+			/* Audit and kill run without the policy lock; the
+			 * scan of this list then restarts from its head. */
+			write_unlock_bh(&xfrm_policy_lock);
+			cnt++;
+
+			xfrm_audit_policy_delete(pol, 1, audit_info->loginuid,
+						 audit_info->sessionid,
+						 audit_info->secid);
+
+			xfrm_policy_kill(pol);
+
+			write_lock_bh(&xfrm_policy_lock);
+			goto again1;
+		}
+
+		for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
+	again2:
+			hlist_for_each_entry(pol, entry,
+					     net->xfrm.policy_bydst[dir].table + i,
+					     bydst) {
+				if (pol->type != type)
+					continue;
+				__xfrm_policy_unlink(pol, dir);
+				/* Same unlock / kill / rescan pattern as above,
+				 * applied to hash bucket i. */
+				write_unlock_bh(&xfrm_policy_lock);
+				cnt++;
+
+				xfrm_audit_policy_delete(pol, 1,
+							 audit_info->loginuid,
+							 audit_info->sessionid,
+							 audit_info->secid);
+				xfrm_policy_kill(pol);
+
+				write_lock_bh(&xfrm_policy_lock);
+				goto again2;
+			}
+		}
+
+	}
+	if (!cnt)
+		err = -ESRCH;
+out:
+	write_unlock_bh(&xfrm_policy_lock);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_policy_flush);
+
+/*
+ * Iterate over the policies of @net, calling @func for each one that
+ * matches walk->type.  The walk can be resumed: when @func returns
+ * non-zero, the walker's own list entry is parked in the policy_all
+ * list just before the entry that failed, and the next call continues
+ * from there.
+ *
+ * @net:  namespace whose policy_all list is walked
+ * @walk: walker state (type filter, sequence counter, list cursor)
+ * @func: callback, given the policy, its direction, the current
+ *        sequence number and @data
+ * @data: opaque pointer passed through to @func
+ *
+ * Returns -EINVAL for an invalid walk->type, the non-zero value
+ * returned by @func when it stops the walk, -ENOENT when the walk
+ * finished without visiting any policy, and 0 otherwise.
+ */
+int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
+		     int (*func)(struct xfrm_policy *, int, int, void*),
+		     void *data)
+{
+	struct xfrm_policy *pol;
+	struct xfrm_policy_walk_entry *x;
+	int error = 0;
+
+	if (walk->type >= XFRM_POLICY_TYPE_MAX &&
+	    walk->type != XFRM_POLICY_TYPE_ANY)
+		return -EINVAL;
+
+	/* Walker is not linked but has already counted entries:
+	 * a previous call ran to completion. */
+	if (list_empty(&walk->walk.all) && walk->seq != 0)
+		return 0;
+
+	write_lock_bh(&xfrm_policy_lock);
+	if (list_empty(&walk->walk.all))
+		x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
+	else
+		x = list_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all);
+	list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
+		/* Entries flagged dead are skipped; xfrm_policy_walk_init()
+		 * sets that flag on walker cursors. */
+		if (x->dead)
+			continue;
+		pol = container_of(x, struct xfrm_policy, walk);
+		if (walk->type != XFRM_POLICY_TYPE_ANY &&
+		    walk->type != pol->type)
+			continue;
+		error = func(pol, xfrm_policy_id2dir(pol->index),
+			     walk->seq, data);
+		if (error) {
+			/* Park the cursor in front of the entry that was
+			 * not consumed so the next call retries it. */
+			list_move_tail(&walk->walk.all, &x->all);
+			goto out;
+		}
+		walk->seq++;
+	}
+	if (walk->seq == 0) {
+		error = -ENOENT;
+		goto out;
+	}
+	list_del_init(&walk->walk.all);
+out:
+	write_unlock_bh(&xfrm_policy_lock);
+	return error;
+}
+EXPORT_SYMBOL(xfrm_policy_walk);
+
+/*
+ * Prepare a walker for xfrm_policy_walk(): remember the type filter,
+ * reset the sequence counter and set up an unlinked list cursor that
+ * is flagged dead.
+ */
+void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
+{
+	walk->seq = 0;
+	walk->type = type;
+	walk->walk.dead = 1;
+	INIT_LIST_HEAD(&walk->walk.all);
+}
+EXPORT_SYMBOL(xfrm_policy_walk_init);
+
+/*
+ * Finish a walk: if the walker's cursor is still linked into a list,
+ * remove it under the policy lock.  An unlinked cursor needs no work.
+ */
+void xfrm_policy_walk_done(struct xfrm_policy_walk *walk)
+{
+	if (!list_empty(&walk->walk.all)) {
+		write_lock_bh(&xfrm_policy_lock);
+		list_del(&walk->walk.all);
+		write_unlock_bh(&xfrm_policy_lock);
+	}
+}
+EXPORT_SYMBOL(xfrm_policy_walk_done);
+
+/*
+ * Decide whether a policy applies to a flow.
+ *
+ * Returns -ESRCH when family, mark, type or selector do not match;
+ * otherwise returns the verdict of security_xfrm_policy_lookup().
+ */
+static int xfrm_policy_match(const struct xfrm_policy *pol,
+			     const struct flowi *fl,
+			     u8 type, u16 family, int dir)
+{
+	if (pol->family != family)
+		return -ESRCH;
+	if ((fl->flowi_mark & pol->mark.m) != pol->mark.v)
+		return -ESRCH;
+	if (pol->type != type)
+		return -ESRCH;
+	if (!xfrm_selector_match(&pol->selector, fl, family))
+		return -ESRCH;
+
+	return security_xfrm_policy_lookup(pol->security, fl->flowi_secid,
+					   dir);
+}
+
+/*
+ * Look up the policy of @type that applies to flow @fl in direction @dir.
+ *
+ * The hash chain chosen by policy_hash_direct() is searched first and
+ * the first match there is remembered together with its priority.  The
+ * inexact list is searched next; the first match whose priority value
+ * is strictly lower than the remembered one replaces it.
+ *
+ * Returns the chosen policy with a reference taken via xfrm_pol_hold(),
+ * NULL when nothing matched or the flow has no source/destination
+ * address for @family, or an ERR_PTR() when xfrm_policy_match()
+ * reported an error other than -ESRCH (no reference is taken then).
+ */
+static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
+						     const struct flowi *fl,
+						     u16 family, u8 dir)
+{
+	int err;
+	struct xfrm_policy *pol, *ret;
+	const xfrm_address_t *daddr, *saddr;
+	struct hlist_node *entry;
+	struct hlist_head *chain;
+	/* Start at the largest value so any inexact match can win
+	 * if the hashed chain yields nothing. */
+	u32 priority = ~0U;
+
+	daddr = xfrm_flowi_daddr(fl, family);
+	saddr = xfrm_flowi_saddr(fl, family);
+	if (unlikely(!daddr || !saddr))
+		return NULL;
+
+	read_lock_bh(&xfrm_policy_lock);
+	chain = policy_hash_direct(net, daddr, saddr, family, dir);
+	ret = NULL;
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		err = xfrm_policy_match(pol, fl, type, family, dir);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else {
+			ret = pol;
+			priority = ret->priority;
+			break;
+		}
+	}
+	chain = &net->xfrm.policy_inexact[dir];
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		err = xfrm_policy_match(pol, fl, type, family, dir);
+		if (err) {
+			if (err == -ESRCH)
+				continue;
+			else {
+				ret = ERR_PTR(err);
+				goto fail;
+			}
+		} else if (pol->priority < priority) {
+			ret = pol;
+			break;
+		}
+	}
+	if (ret)
+		xfrm_pol_hold(ret);
+fail:
+	read_unlock_bh(&xfrm_policy_lock);
+
+	return ret;
+}
+
+/*
+ * Policy lookup across types: with CONFIG_XFRM_SUB_POLICY the sub
+ * policies are consulted first and any non-NULL result (including an
+ * ERR_PTR) is returned as is; otherwise the main policies are searched.
+ */
+static struct xfrm_policy *
+__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_policy *sub;
+
+	sub = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	if (sub)
+		return sub;
+#endif
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+}
+
+/*
+ * Flow cache resolver for policies.  Drops the reference on the
+ * previously cached policy (if any), performs a fresh lookup and
+ * returns the embedded flow cache object of the result.  A NULL or
+ * error lookup result is passed back through ERR_CAST().
+ */
+static struct flow_cache_object *
+xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
+		   u8 dir, struct flow_cache_object *old_obj, void *ctx)
+{
+	struct xfrm_policy *found;
+
+	if (old_obj) {
+		struct xfrm_policy *stale;
+
+		stale = container_of(old_obj, struct xfrm_policy, flo);
+		xfrm_pol_put(stale);
+	}
+
+	found = __xfrm_policy_lookup(net, fl, family, dir);
+	if (IS_ERR_OR_NULL(found))
+		return ERR_CAST(found);
+
+	/* The lookup already holds one reference; take a second one so
+	 * that both the cache and the caller of flow_cache_lookup()
+	 * own a reference. */
+	xfrm_pol_hold(found);
+
+	return &found->flo;
+}
+
+/*
+ * Translate an XFRM_POLICY_* direction into the matching FLOW_DIR_*
+ * value.  When both enumerations use the same numbering the input is
+ * returned untouched; otherwise unknown directions map to FLOW_DIR_IN.
+ */
+static inline int policy_to_flow_dir(int dir)
+{
+	const int same_numbering = XFRM_POLICY_IN == FLOW_DIR_IN &&
+				   XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+				   XFRM_POLICY_FWD == FLOW_DIR_FWD;
+
+	if (same_numbering)
+		return dir;
+	if (dir == XFRM_POLICY_OUT)
+		return FLOW_DIR_OUT;
+	if (dir == XFRM_POLICY_FWD)
+		return FLOW_DIR_FWD;
+	return FLOW_DIR_IN;
+}
+
+/*
+ * Return the per-socket policy of @sk for direction @dir if it applies
+ * to flow @fl.
+ *
+ * Returns the policy with a reference held when selector, mark and the
+ * security hook all accept the flow; NULL when the socket has no policy
+ * for @dir, when selector or mark do not match, or when the security
+ * hook answers -ESRCH; ERR_PTR() for any other security hook error.
+ */
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir,
+						 const struct flowi *fl)
+{
+	struct xfrm_policy *pol;
+	int err;
+
+	read_lock_bh(&xfrm_policy_lock);
+	pol = sk->sk_policy[dir];
+	if (pol == NULL)
+		goto out;
+
+	if (!xfrm_selector_match(&pol->selector, fl, sk->sk_family) ||
+	    (sk->sk_mark & pol->mark.m) != pol->mark.v) {
+		pol = NULL;
+		goto out;
+	}
+
+	err = security_xfrm_policy_lookup(pol->security,
+					  fl->flowi_secid,
+					  policy_to_flow_dir(dir));
+	switch (err) {
+	case 0:
+		xfrm_pol_hold(pol);
+		break;
+	case -ESRCH:
+		pol = NULL;
+		break;
+	default:
+		pol = ERR_PTR(err);
+		break;
+	}
+out:
+	read_unlock_bh(&xfrm_policy_lock);
+	return pol;
+}
+
+/*
+ * Insert @pol into the global policy list, the selector hash and the
+ * index hash, bump the per-direction count and take a reference for
+ * the tables.  Schedules the hash resize work when
+ * xfrm_bydst_should_resize() asks for it.  The callers in this file
+ * invoke it with xfrm_policy_lock write-held.
+ */
+static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
+{
+	struct net *net = xp_net(pol);
+	struct hlist_head *sel_bucket;
+	struct hlist_head *idx_bucket;
+
+	sel_bucket = policy_hash_bysel(net, &pol->selector, pol->family, dir);
+	idx_bucket = net->xfrm.policy_byidx + idx_hash(net, pol->index);
+
+	list_add(&pol->walk.all, &net->xfrm.policy_all);
+	hlist_add_head(&pol->bydst, sel_bucket);
+	hlist_add_head(&pol->byidx, idx_bucket);
+	net->xfrm.policy_count[dir]++;
+	xfrm_pol_hold(pol);
+
+	if (xfrm_bydst_should_resize(net, dir, NULL))
+		schedule_work(&net->xfrm.policy_hash_work);
+}
+
+/*
+ * Remove @pol from the selector hash, the index hash and the global
+ * policy list, and decrement the per-direction count.
+ *
+ * Returns @pol when it was unlinked, or NULL when it was not hashed
+ * (nothing is changed in that case).
+ */
+static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
+						int dir)
+{
+	struct net *net;
+
+	if (hlist_unhashed(&pol->bydst))
+		return NULL;
+
+	net = xp_net(pol);
+	hlist_del(&pol->bydst);
+	hlist_del(&pol->byidx);
+	list_del(&pol->walk.all);
+	net->xfrm.policy_count[dir]--;
+
+	return pol;
+}
+
+/*
+ * Unlink @pol under the policy lock and kill it afterwards.
+ *
+ * Returns 0 on success, -ENOENT when the policy was not linked.
+ */
+int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
+{
+	struct xfrm_policy *unlinked;
+
+	write_lock_bh(&xfrm_policy_lock);
+	unlinked = __xfrm_policy_unlink(pol, dir);
+	write_unlock_bh(&xfrm_policy_lock);
+
+	if (!unlinked)
+		return -ENOENT;
+
+	xfrm_policy_kill(unlinked);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_policy_delete);
+
+/*
+ * Install @pol as the per-socket policy of @sk for direction @dir,
+ * replacing (and killing) any policy that was there before.
+ *
+ * @pol may be NULL, in which case the existing policy is only removed.
+ *
+ * Returns 0 on success, or -EINVAL when sub-policy support is enabled
+ * and @pol is not a main-type policy.
+ */
+int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
+{
+	struct xfrm_policy *old_pol;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
+		return -EINVAL;
+#endif
+
+	write_lock_bh(&xfrm_policy_lock);
+	old_pol = sk->sk_policy[dir];
+	sk->sk_policy[dir] = pol;
+	if (pol) {
+		/* Resolve the namespace only once pol is known to be
+		 * non-NULL; this function accepts a NULL pol, so it must
+		 * not be handed to xp_net() unconditionally. */
+		struct net *net = xp_net(pol);
+
+		pol->curlft.add_time = get_seconds();
+		pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir);
+		__xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
+	}
+	if (old_pol)
+		/* Unlinking succeeds always. This is the only function
+		 * allowed to delete or replace socket policy.
+		 */
+		__xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir);
+	write_unlock_bh(&xfrm_policy_lock);
+
+	/* Kill the replaced policy only after dropping the lock. */
+	if (old_pol) {
+		xfrm_policy_kill(old_pol);
+	}
+	return 0;
+}
+
+/*
+ * Duplicate socket policy @old for direction @dir and link the copy
+ * into the policy tables.
+ *
+ * Returns the new policy, or NULL when allocation or cloning of the
+ * security context fails.
+ */
+static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
+{
+	struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC);
+
+	if (newp) {
+		newp->selector = old->selector;
+		if (security_xfrm_policy_clone(old->security,
+					       &newp->security)) {
+			kfree(newp);
+			return NULL;  /* ENOMEM */
+		}
+		newp->lft = old->lft;
+		newp->curlft = old->curlft;
+		newp->mark = old->mark;
+		newp->action = old->action;
+		newp->flags = old->flags;
+		newp->xfrm_nr = old->xfrm_nr;
+		newp->index = old->index;
+		newp->type = old->type;
+		/* The family must be copied too: __xfrm_policy_link() hashes
+		 * on pol->family and xfrm_policy_match() compares it, so a
+		 * clone left without it would be linked with a family that
+		 * differs from the original's. */
+		newp->family = old->family;
+		memcpy(newp->xfrm_vec, old->xfrm_vec,
+		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
+		write_lock_bh(&xfrm_policy_lock);
+		__xfrm_policy_link(newp, XFRM_POLICY_MAX+dir);
+		write_unlock_bh(&xfrm_policy_lock);
+		/* __xfrm_policy_link() took its own reference. */
+		xfrm_pol_put(newp);
+	}
+	return newp;
+}
+
+/*
+ * Replace the two policy pointers of @sk with private clones.
+ * Both slots are cleared first; each policy that was present is then
+ * cloned in order.
+ *
+ * Returns 0 on success or -ENOMEM as soon as one clone fails.
+ */
+int __xfrm_sk_clone_policy(struct sock *sk)
+{
+	struct xfrm_policy *parent[2];
+	int dir;
+
+	parent[0] = sk->sk_policy[0];
+	parent[1] = sk->sk_policy[1];
+	sk->sk_policy[1] = NULL;
+	sk->sk_policy[0] = NULL;
+
+	for (dir = 0; dir < 2; dir++) {
+		if (!parent[dir])
+			continue;
+		sk->sk_policy[dir] = clone_policy(parent[dir], dir);
+		if (sk->sk_policy[dir] == NULL)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Ask the address-family backend for the local address to use towards
+ * @remote, storing it in @local.
+ *
+ * Returns -EINVAL when no backend is registered for @family, else the
+ * backend's result.
+ */
+static int
+xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote,
+	       unsigned short family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	int rc;
+
+	afinfo = xfrm_policy_get_afinfo(family);
+	if (unlikely(!afinfo))
+		return -EINVAL;
+
+	rc = afinfo->get_saddr(net, local, remote);
+	xfrm_policy_put_afinfo(afinfo);
+	return rc;
+}
+
+/*
+ * Resolve the templates of one policy into states for the given flow.
+ *
+ * @policy: policy whose xfrm_vec templates are resolved in order
+ * @fl:     flow being resolved
+ * @xfrm:   output array receiving the states that were found
+ * @family: address family of the flow
+ *
+ * Returns the number of states stored in @xfrm, or a negative errno.
+ * On failure every state stored so far has its reference dropped.
+ */
+
+static int
+xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
+		      struct xfrm_state **xfrm, unsigned short family)
+{
+	struct net *net = xp_net(policy);
+	int nx;
+	int i, error;
+	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
+	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
+	xfrm_address_t tmp;
+
+	for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
+		struct xfrm_state *x;
+		xfrm_address_t *remote = daddr;
+		xfrm_address_t *local  = saddr;
+		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
+
+		/* Tunnel and BEET templates carry their own endpoints;
+		 * a wildcard local address is filled in by a lookup. */
+		if (tmpl->mode == XFRM_MODE_TUNNEL ||
+		    tmpl->mode == XFRM_MODE_BEET) {
+			remote = &tmpl->id.daddr;
+			local = &tmpl->saddr;
+			if (xfrm_addr_any(local, tmpl->encap_family)) {
+				error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
+				if (error)
+					goto fail;
+				local = &tmp;
+			}
+		}
+
+		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
+
+		if (x && x->km.state == XFRM_STATE_VALID) {
+			xfrm[nx++] = x;
+			/* Later templates resolve relative to the
+			 * endpoints of this state. */
+			daddr = remote;
+			saddr = local;
+			continue;
+		}
+		if (x) {
+			error = (x->km.state == XFRM_STATE_ERROR ?
+				 -EINVAL : -EAGAIN);
+			xfrm_state_put(x);
+		}
+		else if (error == -ESRCH)
+			error = -EAGAIN;
+
+		/* A missing state is tolerated only for optional templates. */
+		if (!tmpl->optional)
+			goto fail;
+	}
+	return nx;
+
+fail:
+	for (nx--; nx>=0; nx--)
+		xfrm_state_put(xfrm[nx]);
+	return error;
+}
+
+/*
+ * Resolve the templates of @npols policies into one array of states.
+ *
+ * With a single policy the states are written straight into @xfrm;
+ * with several policies they are collected in a temporary array and
+ * then ordered into @xfrm by xfrm_state_sort().
+ *
+ * Returns the total number of states, -ENOBUFS when the policies'
+ * template counts would reach XFRM_MAX_DEPTH, or the error returned by
+ * xfrm_tmpl_resolve_one().  On failure the states collected from
+ * earlier policies have their references dropped.
+ */
+static int
+xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
+		  struct xfrm_state **xfrm, unsigned short family)
+{
+	struct xfrm_state *tp[XFRM_MAX_DEPTH];
+	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
+	int cnx = 0;
+	int error;
+	int ret;
+	int i;
+
+	for (i = 0; i < npols; i++) {
+		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
+			error = -ENOBUFS;
+			goto fail;
+		}
+
+		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
+		if (ret < 0) {
+			error = ret;
+			goto fail;
+		} else
+			cnx += ret;
+	}
+
+	/* found states are sorted for outbound processing */
+	if (npols > 1)
+		xfrm_state_sort(xfrm, tpp, cnx, family);
+
+	return cnx;
+
+ fail:
+	for (cnx--; cnx>=0; cnx--)
+		xfrm_state_put(tpp[cnx]);
+	return error;
+
+}
+
+/* Check that the bundle accepts the flow and its components are
+ * still valid.
+ */
+
+/*
+ * Fetch the TOS value of @fl through the address-family backend.
+ * Returns -EINVAL when no backend is registered for @family.
+ */
+static inline int xfrm_get_tos(const struct flowi *fl, int family)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	int value;
+
+	afinfo = xfrm_policy_get_afinfo(family);
+	if (afinfo == NULL)
+		return -EINVAL;
+
+	value = afinfo->get_tos(fl);
+	xfrm_policy_put_afinfo(afinfo);
+	return value;
+}
+
+/*
+ * Flow cache "get" hook for bundles.  Returns @flo with an extra dst
+ * reference when the cached bundle may be used, NULL when it has to
+ * be resolved again.
+ */
+static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
+	struct dst_entry *dst = &xdst->u.dst;
+	int unusable;
+
+	if (xdst->route == NULL)
+		/* Dummy bundle: carrying xfrms means template resolution
+		 * failed earlier and must be retried. */
+		unusable = xdst->num_xfrms > 0;
+	else
+		/* Real bundle: usable unless it went stale. */
+		unusable = stale_bundle(dst);
+
+	if (unusable)
+		return NULL;
+
+	dst_hold(dst);
+	return flo;
+}
+
+/*
+ * Flow cache "check" hook for bundles: 1 for a real bundle (it has a
+ * route) that is not stale, 0 otherwise.
+ */
+static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *bundle = container_of(flo, struct xfrm_dst, flo);
+
+	return bundle->route && !stale_bundle(&bundle->u.dst);
+}
+
+/* Flow cache "delete" hook for bundles: free the bundle's dst entry. */
+static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
+{
+	struct xfrm_dst *bundle = container_of(flo, struct xfrm_dst, flo);
+
+	dst_free(&bundle->u.dst);
+}
+
+/* Flow cache callbacks for bundle objects; installed on every xfrm_dst
+ * by xfrm_alloc_dst(). */
+static const struct flow_cache_ops xfrm_bundle_fc_ops = {
+	.get = xfrm_bundle_flo_get,
+	.check = xfrm_bundle_flo_check,
+	.delete = xfrm_bundle_flo_delete,
+};
+
+/*
+ * Allocate an xfrm_dst for @family from the per-namespace dst_ops.
+ *
+ * Everything following the embedded dst_entry is zeroed and the bundle
+ * flow cache callbacks are installed.
+ *
+ * Returns the new entry, ERR_PTR(-EINVAL) when no policy backend is
+ * registered for @family, or ERR_PTR(-ENOBUFS) when allocation fails.
+ * An unsupported @family hits BUG().
+ */
+static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
+{
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	struct dst_ops *dst_ops;
+	struct xfrm_dst *xdst;
+
+	if (!afinfo)
+		return ERR_PTR(-EINVAL);
+
+	switch (family) {
+	case AF_INET:
+		dst_ops = &net->xfrm.xfrm4_dst_ops;
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		dst_ops = &net->xfrm.xfrm6_dst_ops;
+		break;
+#endif
+	default:
+		BUG();
+	}
+	xdst = dst_alloc(dst_ops, NULL, 0, 0, 0);
+	xfrm_policy_put_afinfo(afinfo);
+
+	if (likely(xdst)) {
+		/* Only touch the object once the allocation is known to
+		 * have succeeded; clearing it before the NULL check would
+		 * write through a NULL pointer on allocation failure. */
+		memset(&xdst->u.rt6.rt6i_table, 0,
+		       sizeof(*xdst) - sizeof(struct dst_entry));
+		xdst->flo.ops = &xfrm_bundle_fc_ops;
+	} else
+		xdst = ERR_PTR(-ENOBUFS);
+
+	return xdst;
+}
+
+/*
+ * Let the address-family backend of @dst initialise @path.
+ * Returns -EINVAL when no backend is registered, else its result.
+ */
+static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
+				 int nfheader_len)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	int rc;
+
+	afinfo = xfrm_policy_get_afinfo(dst->ops->family);
+	if (afinfo == NULL)
+		return -EINVAL;
+
+	rc = afinfo->init_path(path, dst, nfheader_len);
+	xfrm_policy_put_afinfo(afinfo);
+	return rc;
+}
+
+/*
+ * Let the address-family backend of @xdst fill in its family-specific
+ * fields for device @dev and flow @fl.
+ * Returns -EINVAL when no backend is registered, else its result.
+ */
+static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+				const struct flowi *fl)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	int rc;
+
+	afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
+	if (afinfo == NULL)
+		return -EINVAL;
+
+	rc = afinfo->fill_dst(xdst, dev, fl);
+	xfrm_policy_put_afinfo(afinfo);
+	return rc;
+}
+
+
+/* Allocate chain of dst_entry's, attach known xfrm's, calculate
+ * all the metrics... Shortly, bundle a bundle.
+ *
+ * @policy: policy the bundle is built for (supplies namespace, family)
+ * @xfrm:   the nx states to attach, one per dst entry in the chain
+ * @nx:     number of states in @xfrm
+ * @fl:     flow the bundle is created for
+ * @dst:    original route; a reference is taken on it here
+ *
+ * Returns the first dst entry of the new chain, or an ERR_PTR().
+ * On the put_states error path the states from index i onwards have
+ * their references dropped here.
+ */
+
+static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
+					    struct xfrm_state **xfrm, int nx,
+					    const struct flowi *fl,
+					    struct dst_entry *dst)
+{
+	struct net *net = xp_net(policy);
+	unsigned long now = jiffies;
+	struct net_device *dev;
+	struct xfrm_mode *inner_mode;
+	struct dst_entry *dst_prev = NULL;
+	struct dst_entry *dst0 = NULL;
+	int i = 0;
+	int err;
+	int header_len = 0;
+	int nfheader_len = 0;
+	int trailer_len = 0;
+	int tos;
+	int family = policy->selector.family;
+	xfrm_address_t saddr, daddr;
+
+	xfrm_flowi_addr_get(fl, &saddr, &daddr, family);
+
+	tos = xfrm_get_tos(fl, family);
+	err = tos;
+	if (tos < 0)
+		goto put_states;
+
+	dst_hold(dst);
+
+	/* One xfrm_dst per state; entries are chained through ->child. */
+	for (; i < nx; i++) {
+		struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
+		struct dst_entry *dst1 = &xdst->u.dst;
+
+		err = PTR_ERR(xdst);
+		if (IS_ERR(xdst)) {
+			dst_release(dst);
+			goto put_states;
+		}
+
+		if (xfrm[i]->sel.family == AF_UNSPEC) {
+			inner_mode = xfrm_ip2inner_mode(xfrm[i],
+							xfrm_af2proto(family));
+			if (!inner_mode) {
+				/* NOTE(review): xdst has not been linked into
+				 * the dst0 chain yet at this point, so the
+				 * cleanup below does not appear to reach it -
+				 * confirm whether it is leaked here. */
+				err = -EAFNOSUPPORT;
+				dst_release(dst);
+				goto put_states;
+			}
+		} else
+			inner_mode = xfrm[i]->inner_mode;
+
+		if (!dst_prev)
+			dst0 = dst1;
+		else {
+			dst_prev->child = dst_clone(dst1);
+			dst1->flags |= DST_NOHASH;
+		}
+
+		xdst->route = dst;
+		dst_copy_metrics(dst1, dst);
+
+		/* Non-transport modes switch to the state's own family
+		 * and route lookup; transport mode keeps using dst. */
+		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+			family = xfrm[i]->props.family;
+			dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
+					      family);
+			err = PTR_ERR(dst);
+			if (IS_ERR(dst))
+				goto put_states;
+		} else
+			dst_hold(dst);
+
+		dst1->xfrm = xfrm[i];
+		xdst->xfrm_genid = xfrm[i]->genid;
+
+		dst1->obsolete = -1;
+		dst1->flags |= DST_HOST;
+		dst1->lastuse = now;
+
+		dst1->input = dst_discard;
+		dst1->output = inner_mode->afinfo->output;
+
+		dst1->next = dst_prev;
+		dst_prev = dst1;
+
+		header_len += xfrm[i]->props.header_len;
+		if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
+			nfheader_len += xfrm[i]->props.header_len;
+		trailer_len += xfrm[i]->props.trailer_len;
+	}
+
+	/* NOTE(review): dst_prev and dst0 are dereferenced unconditionally
+	 * here, so this relies on nx > 0 - confirm against callers. */
+	dst_prev->child = dst;
+	dst0->path = dst;
+
+	err = -ENODEV;
+	dev = dst->dev;
+	if (!dev)
+		goto free_dst;
+
+	/* Copy neighbour for reachability confirmation */
+	dst_set_neighbour(dst0, neigh_clone(dst_get_neighbour(dst)));
+
+	/* NOTE(review): the int result of xfrm_init_path() is not checked. */
+	xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);
+	xfrm_init_pmtu(dst_prev);
+
+	/* Each entry records the header/trailer room needed from its
+	 * position down to the end of the chain. */
+	for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) {
+		struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev;
+
+		err = xfrm_fill_dst(xdst, dev, fl);
+		if (err)
+			goto free_dst;
+
+		dst_prev->header_len = header_len;
+		dst_prev->trailer_len = trailer_len;
+		header_len -= xdst->u.dst.xfrm->props.header_len;
+		trailer_len -= xdst->u.dst.xfrm->props.trailer_len;
+	}
+
+out:
+	return dst0;
+
+put_states:
+	for (; i < nx; i++)
+		xfrm_state_put(xfrm[i]);
+free_dst:
+	if (dst0)
+		dst_free(dst0);
+	dst0 = ERR_PTR(err);
+	goto out;
+}
+
+/*
+ * Copy @size bytes from @src into the buffer at *@target, allocating
+ * the buffer with GFP_ATOMIC first when *@target is still NULL.
+ * Returns 0 on success, -ENOMEM when the allocation fails.
+ */
+static int inline
+xfrm_dst_alloc_copy(void **target, const void *src, int size)
+{
+	void *buf = *target;
+
+	if (buf == NULL) {
+		buf = kmalloc(size, GFP_ATOMIC);
+		if (buf == NULL)
+			return -ENOMEM;
+		*target = buf;
+	}
+	memcpy(buf, src, size);
+	return 0;
+}
+
+/*
+ * With CONFIG_XFRM_SUB_POLICY, store a copy of selector @sel in the
+ * bundle's "partner" slot; otherwise do nothing.  Returns 0 or -ENOMEM.
+ */
+static int inline
+xfrm_dst_update_parent(struct dst_entry *dst, const struct xfrm_selector *sel)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_dst *bundle = (struct xfrm_dst *)dst;
+	void **slot = (void **)&(bundle->partner);
+
+	return xfrm_dst_alloc_copy(slot, sel, sizeof(*sel));
+#else
+	return 0;
+#endif
+}
+
+/*
+ * With CONFIG_XFRM_SUB_POLICY, store a copy of flow @fl in the
+ * bundle's "origin" slot; otherwise do nothing.  Returns 0 or -ENOMEM.
+ */
+static int inline
+xfrm_dst_update_origin(struct dst_entry *dst, const struct flowi *fl)
+{
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_dst *bundle = (struct xfrm_dst *)dst;
+	void **slot = (void **)&(bundle->origin);
+
+	return xfrm_dst_alloc_copy(slot, fl, sizeof(*fl));
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Complete the policy array for a flow, starting from pols[0].
+ *
+ * @fl, @family: flow and address family used for the extra lookup
+ * @pols:        in: pols[0] as found by the caller (may be NULL or an
+ *               ERR_PTR); out: possibly extended with a main policy
+ * @num_pols:    in/out number of valid entries in @pols
+ * @num_xfrms:   out: total number of templates, 0 when there is no
+ *               policy, or -1 when any policy's action is not
+ *               XFRM_POLICY_ALLOW
+ *
+ * Returns 0 on success or the error encoded in pols[0] / pols[1].
+ */
+static int xfrm_expand_policies(const struct flowi *fl, u16 family,
+				struct xfrm_policy **pols,
+				int *num_pols, int *num_xfrms)
+{
+	int i;
+
+	if (*num_pols == 0 || !pols[0]) {
+		*num_pols = 0;
+		*num_xfrms = 0;
+		return 0;
+	}
+	if (IS_ERR(pols[0]))
+		return PTR_ERR(pols[0]);
+
+	*num_xfrms = pols[0]->xfrm_nr;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	/* An allowing non-main policy is complemented by the matching
+	 * main policy for the outbound direction, if there is one. */
+	if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
+	    pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
+						    XFRM_POLICY_TYPE_MAIN,
+						    fl, family,
+						    XFRM_POLICY_OUT);
+		if (pols[1]) {
+			if (IS_ERR(pols[1])) {
+				xfrm_pols_put(pols, *num_pols);
+				return PTR_ERR(pols[1]);
+			}
+			(*num_pols) ++;
+			(*num_xfrms) += pols[1]->xfrm_nr;
+		}
+	}
+#endif
+	for (i = 0; i < *num_pols; i++) {
+		if (pols[i]->action != XFRM_POLICY_ALLOW) {
+			*num_xfrms = -1;
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+/*
+ * Resolve the templates of @pols into states and build a bundle on
+ * top of @dst_orig.
+ *
+ * Returns the new bundle with the policy pointers copied into it,
+ * NULL when template resolution produced no states, or an ERR_PTR().
+ * Statistics counters are bumped for resolution errors other than
+ * -EAGAIN, for bundle creation errors and for failures to record the
+ * parent selector / originating flow.
+ */
+static struct xfrm_dst *
+xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
+			       const struct flowi *fl, u16 family,
+			       struct dst_entry *dst_orig)
+{
+	struct net *net = xp_net(pols[0]);
+	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+	struct dst_entry *dst;
+	struct xfrm_dst *xdst;
+	int err;
+
+	/* Try to instantiate a bundle */
+	err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
+	if (err <= 0) {
+		if (err != 0 && err != -EAGAIN)
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		/* err == 0 turns into a NULL return here. */
+		return ERR_PTR(err);
+	}
+
+	/* From here on err holds the number of resolved states. */
+	dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
+	if (IS_ERR(dst)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+		return ERR_CAST(dst);
+	}
+
+	xdst = (struct xfrm_dst *)dst;
+	xdst->num_xfrms = err;
+	if (num_pols > 1)
+		err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+	else
+		err = xfrm_dst_update_origin(dst, fl);
+	if (unlikely(err)) {
+		dst_free(dst);
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+		return ERR_PTR(err);
+	}
+
+	xdst->num_pols = num_pols;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+	xdst->policy_genid = atomic_read(&pols[0]->genid);
+
+	return xdst;
+}
+
+/*
+ * Flow cache resolver for bundles.
+ *
+ * @oldflo: previously cached bundle for this flow, or NULL
+ * @ctx:    the original dst_entry the bundle is built on
+ *
+ * Reuses the policies of @oldflo when none of them is dead, otherwise
+ * looks the policies up again, then tries to build a fresh bundle.
+ *
+ * Returns the flow cache object of the resulting bundle (real or
+ * dummy) with a dst reference for the caller, NULL when no policy
+ * applies, or an ERR_PTR().
+ */
+static struct flow_cache_object *
+xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
+		   struct flow_cache_object *oldflo, void *ctx)
+{
+	struct dst_entry *dst_orig = (struct dst_entry *)ctx;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	struct xfrm_dst *xdst, *new_xdst;
+	int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+
+	/* Check if the policies from old bundle are usable */
+	xdst = NULL;
+	if (oldflo) {
+		xdst = container_of(oldflo, struct xfrm_dst, flo);
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		pol_dead = 0;
+		for (i = 0; i < num_pols; i++) {
+			pols[i] = xdst->pols[i];
+			pol_dead |= pols[i]->walk.dead;
+		}
+		if (pol_dead) {
+			dst_free(&xdst->u.dst);
+			xdst = NULL;
+			num_pols = 0;
+			num_xfrms = 0;
+			oldflo = NULL;
+		}
+	}
+
+	/* Resolve policies to use if we couldn't get them from
+	 * previous cache entry */
+	if (xdst == NULL) {
+		num_pols = 1;
+		pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
+			goto inc_error;
+		if (num_pols == 0)
+			return NULL;
+		if (num_xfrms <= 0)
+			goto make_dummy_bundle;
+	}
+
+	new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
+	if (IS_ERR(new_xdst)) {
+		err = PTR_ERR(new_xdst);
+		if (err != -EAGAIN)
+			goto error;
+		/* -EAGAIN: keep the old entry if there is one,
+		 * else cache a dummy bundle. */
+		if (oldflo == NULL)
+			goto make_dummy_bundle;
+		dst_hold(&xdst->u.dst);
+		return oldflo;
+	} else if (new_xdst == NULL) {
+		/* No states were resolved. */
+		num_xfrms = 0;
+		if (oldflo == NULL)
+			goto make_dummy_bundle;
+		xdst->num_xfrms = 0;
+		dst_hold(&xdst->u.dst);
+		return oldflo;
+	}
+
+	/* Kill the previous bundle */
+	if (xdst) {
+		/* The policies were stolen for newly generated bundle */
+		xdst->num_pols = 0;
+		dst_free(&xdst->u.dst);
+	}
+
+	/* Flow cache does not have reference, it dst_free()'s,
+	 * but we do need to return one reference for original caller */
+	dst_hold(&new_xdst->u.dst);
+	return &new_xdst->flo;
+
+make_dummy_bundle:
+	/* We found policies, but there's no bundles to instantiate:
+	 * either because the policy blocks, has no transformations or
+	 * we could not build template (no xfrm_states).*/
+	xdst = xfrm_alloc_dst(net, family);
+	if (IS_ERR(xdst)) {
+		xfrm_pols_put(pols, num_pols);
+		return ERR_CAST(xdst);
+	}
+	xdst->num_pols = num_pols;
+	xdst->num_xfrms = num_xfrms;
+	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
+
+	dst_hold(&xdst->u.dst);
+	return &xdst->flo;
+
+inc_error:
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+error:
+	/* An old bundle is freed as a whole; freshly looked-up policies
+	 * only need their references dropped. */
+	if (xdst != NULL)
+		dst_free(&xdst->u.dst);
+	else
+		xfrm_pols_put(pols, num_pols);
+	return ERR_PTR(err);
+}
+
+/*
+ * Build a blackhole route for @dst_orig through the address-family
+ * backend.
+ *
+ * Returns the backend's blackhole route, or ERR_PTR(-EINVAL) after
+ * releasing @dst_orig when no backend is registered for @family.
+ */
+static struct dst_entry *make_blackhole(struct net *net, u16 family,
+					struct dst_entry *dst_orig)
+{
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	struct dst_entry *ret;
+
+	if (!afinfo) {
+		/* Leave without xfrm_policy_put_afinfo(): the put must
+		 * only be issued for a get that succeeded, which is how
+		 * every other xfrm_policy_get_afinfo() user in this file
+		 * handles a NULL result. */
+		dst_release(dst_orig);
+		return ERR_PTR(-EINVAL);
+	}
+
+	ret = afinfo->blackhole_route(net, dst_orig);
+	xfrm_policy_put_afinfo(afinfo);
+
+	return ret;
+}
+
+/* Main function: finds/creates a bundle for given flow.
+ *
+ * At the moment we eat a raw IP route. Mostly to speed up lookups
+ * on interfaces with disabled IPsec.
+ *
+ * @net:      network namespace
+ * @dst_orig: original route; its reference is released on the error
+ *            paths and when the flow ends up transformed
+ * @fl:       flow to look up
+ * @sk:       originating socket, may be NULL
+ * @flags:    XFRM_LOOKUP_* flags
+ *
+ * Returns the dst to use: the bundle when the flow is transformed,
+ * @dst_orig when it passes untransformed or no policy applies, a
+ * blackhole route when states are missing and larval drop is enabled,
+ * or an ERR_PTR().
+ */
+struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
+			      const struct flowi *fl,
+			      struct sock *sk, int flags)
+{
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	struct flow_cache_object *flo;
+	struct xfrm_dst *xdst;
+	struct dst_entry *dst, *route;
+	u16 family = dst_orig->ops->family;
+	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
+
+restart:
+	dst = NULL;
+	xdst = NULL;
+	route = NULL;
+
+	/* Per-socket policy takes precedence over the global tables. */
+	if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
+		num_pols = 1;
+		pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		err = xfrm_expand_policies(fl, family, pols,
+					   &num_pols, &num_xfrms);
+		if (err < 0)
+			goto dropdst;
+
+		if (num_pols) {
+			if (num_xfrms <= 0) {
+				drop_pols = num_pols;
+				goto no_transform;
+			}
+
+			xdst = xfrm_resolve_and_create_bundle(
+					pols, num_pols, fl,
+					family, dst_orig);
+			if (IS_ERR(xdst)) {
+				xfrm_pols_put(pols, num_pols);
+				err = PTR_ERR(xdst);
+				goto dropdst;
+			} else if (xdst == NULL) {
+				num_xfrms = 0;
+				drop_pols = num_pols;
+				goto no_transform;
+			}
+
+			dst_hold(&xdst->u.dst);
+
+			/* Socket bundles are not flow-cached; they are
+			 * chained on a separate list under its own lock. */
+			spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+			xdst->u.dst.next = xfrm_policy_sk_bundles;
+			xfrm_policy_sk_bundles = &xdst->u.dst;
+			spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+			route = xdst->route;
+		}
+	}
+
+	if (xdst == NULL) {
+		/* To accelerate a bit...  */
+		if ((dst_orig->flags & DST_NOXFRM) ||
+		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
+			goto nopol;
+
+		flo = flow_cache_lookup(net, fl, family, dir,
+					xfrm_bundle_lookup, dst_orig);
+		if (flo == NULL)
+			goto nopol;
+		if (IS_ERR(flo)) {
+			err = PTR_ERR(flo);
+			goto dropdst;
+		}
+		xdst = container_of(flo, struct xfrm_dst, flo);
+
+		num_pols = xdst->num_pols;
+		num_xfrms = xdst->num_xfrms;
+		memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
+		route = xdst->route;
+	}
+
+	dst = &xdst->u.dst;
+	if (route == NULL && num_xfrms > 0) {
+		/* The only case when xfrm_bundle_lookup() returns a
+		 * bundle with null route, is when the template could
+		 * not be resolved. It means policies are there, but
+		 * bundle could not be created, since we don't yet
+		 * have the xfrm_state's. We need to wait for KM to
+		 * negotiate new SA's or bail out with error.*/
+		if (net->xfrm.sysctl_larval_drop) {
+			/* EREMOTE tells the caller to generate
+			 * a one-shot blackhole route. */
+			dst_release(dst);
+			xfrm_pols_put(pols, drop_pols);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+
+			return make_blackhole(net, family, dst_orig);
+		}
+		if (fl->flowi_flags & FLOWI_FLAG_CAN_SLEEP) {
+			DECLARE_WAITQUEUE(wait, current);
+
+			/* Sleep on the km wait queue, then retry the
+			 * whole lookup unless a signal is pending. */
+			add_wait_queue(&net->xfrm.km_waitq, &wait);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&net->xfrm.km_waitq, &wait);
+
+			if (!signal_pending(current)) {
+				dst_release(dst);
+				goto restart;
+			}
+
+			err = -ERESTART;
+		} else
+			err = -EAGAIN;
+
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+		goto error;
+	}
+
+no_transform:
+	if (num_pols == 0)
+		goto nopol;
+
+	if ((flags & XFRM_LOOKUP_ICMP) &&
+	    !(pols[0]->flags & XFRM_POLICY_ICMP)) {
+		err = -ENOENT;
+		goto error;
+	}
+
+	for (i = 0; i < num_pols; i++)
+		pols[i]->curlft.use_time = get_seconds();
+
+	if (num_xfrms < 0) {
+		/* Prohibit the flow */
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
+		err = -EPERM;
+		goto error;
+	} else if (num_xfrms > 0) {
+		/* Flow transformed */
+		dst_release(dst_orig);
+	} else {
+		/* Flow passes untransformed */
+		dst_release(dst);
+		dst = dst_orig;
+	}
+ok:
+	/* drop_pols is non-zero only for socket policies that did not
+	 * end up owned by a bundle. */
+	xfrm_pols_put(pols, drop_pols);
+	if (dst && dst->xfrm &&
+	    dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
+		dst->flags |= DST_XFRM_TUNNEL;
+	return dst;
+
+nopol:
+	if (!(flags & XFRM_LOOKUP_ICMP)) {
+		dst = dst_orig;
+		goto ok;
+	}
+	err = -ENOENT;
+error:
+	dst_release(dst);
+dropdst:
+	dst_release(dst_orig);
+	xfrm_pols_put(pols, drop_pols);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(xfrm_lookup);
+
+/*
+ * Invoke the reject handler of the state at position @idx of the
+ * skb's secpath.  Returns 0 when there is no secpath, @idx is out of
+ * range or the state's type has no reject handler; otherwise returns
+ * the handler's result.
+ */
+static inline int
+xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
+{
+	struct xfrm_state *x;
+
+	if (!skb->sp)
+		return 0;
+	if (idx < 0 || idx >= skb->sp->len)
+		return 0;
+
+	x = skb->sp->xvec[idx];
+	return x->type->reject ? x->type->reject(x, skb, fl) : 0;
+}
+
+/* When skb is transformed back to its "native" form, we have to
+ * check policy restrictions. At the moment we make this in maximally
+ * stupid way. Shame on me. :-) Of course, connected sockets must
+ * have policy cached at them.
+ */
+
+/*
+ * Test whether state @x satisfies template @tmpl.
+ *
+ * For states flagged by xfrm_state_kern() the template must be
+ * optional and xfrm_state_addr_cmp() must return 0 for the template's
+ * encapsulation family.  For all other states the protocol and mode
+ * must be equal, SPI and reqid must be equal unless the template
+ * leaves them 0, the algorithm test must pass, and for non-transport
+ * modes xfrm_state_addr_cmp() must return 0 for @family.
+ *
+ * Returns non-zero when the state is acceptable.
+ */
+static inline int
+xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
+	      unsigned short family)
+{
+	if (xfrm_state_kern(x))
+		return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
+	return	x->id.proto == tmpl->id.proto &&
+		(x->id.spi == tmpl->id.spi || !tmpl->id.spi) &&
+		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
+		x->props.mode == tmpl->mode &&
+		(tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) ||
+		 !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) &&
+		!(x->props.mode != XFRM_MODE_TRANSPORT &&
+		  xfrm_state_addr_cmp(tmpl, x, family));
+}
+
+/*
+ * Match template @tmpl against the secpath @sp, starting at index
+ * @start.
+ *
+ * A value of 0 or more is returned when validation succeeds: either
+ * @start itself (bypass because of an optional transport-mode
+ * template, or an optional template that matched nothing), or the
+ * index following the secpath state that matched the template.
+ * -1 is returned when no matching state is found for a required
+ * template.
+ * Otherwise "-2 - errored_index" is returned, where errored_index is
+ * the position of the non-transport state that stopped the search.
+ */
+static inline int
+xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
+	       unsigned short family)
+{
+	int idx = start;
+
+	if (tmpl->optional) {
+		if (tmpl->mode == XFRM_MODE_TRANSPORT)
+			return start;
+	} else
+		start = -1;
+	for (; idx < sp->len; idx++) {
+		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
+			return ++idx;
+		/* Only transport-mode states may be skipped over. */
+		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
+			if (start == -1)
+				start = -2-idx;
+			break;
+		}
+	}
+	return start;
+}
+
+/*
+ * Fill flow @fl from packet @skb using the address-family backend,
+ * then let the security hook set the flow's secid.
+ *
+ * Returns -EAFNOSUPPORT when no backend is registered for @family,
+ * else the result of security_xfrm_decode_session().
+ */
+int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
+			  unsigned int family, int reverse)
+{
+	struct xfrm_policy_afinfo *afinfo;
+	int rc;
+
+	afinfo = xfrm_policy_get_afinfo(family);
+	if (unlikely(!afinfo))
+		return -EAFNOSUPPORT;
+
+	afinfo->decode_session(skb, fl, reverse);
+	rc = security_xfrm_decode_session(skb, &fl->flowi_secid);
+	xfrm_policy_put_afinfo(afinfo);
+	return rc;
+}
+EXPORT_SYMBOL(__xfrm_decode_session);
+
+/*
+ * Look for the first state in @sp, at index @k or later, whose mode is
+ * not XFRM_MODE_TRANSPORT.  When one is found its index is stored in
+ * *@idxp and 1 is returned; otherwise *@idxp is left alone and 0 is
+ * returned.
+ */
+static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
+{
+	while (k < sp->len) {
+		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
+			*idxp = k;
+			return 1;
+		}
+		k++;
+	}
+
+	return 0;
+}
+
+/*
+ * Check a received packet against the policy that applies to it.
+ *
+ * @sk:     socket whose per-socket policy is consulted first; may be NULL
+ * @dir:    policy direction in the XFRM_POLICY_MASK bits; the remaining
+ *          bits are handed to the session decoder as the "reverse" flag
+ * @skb:    packet to check; skb->sp, if set, lists the states already
+ *          applied to it
+ * @family: address family used for flow decoding and matching
+ *
+ * Returns 1 when the packet is acceptable and 0 when it must be dropped.
+ * Each rejection path bumps the corresponding LINUX_MIB_XFRMIN* counter.
+ */
+int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
+			unsigned short family)
+{
+	struct net *net = dev_net(skb->dev);
+	struct xfrm_policy *pol;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int npols = 0;
+	int xfrm_nr;
+	int pi;
+	int reverse;
+	struct flowi fl;
+	u8 fl_dir;
+	int xerr_idx = -1;
+
+	reverse = dir & ~XFRM_POLICY_MASK;
+	dir &= XFRM_POLICY_MASK;
+	fl_dir = policy_to_flow_dir(dir);
+
+	if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+		return 0;
+	}
+
+	nf_nat_decode_session(skb, &fl, family);
+
+	/* First, check used SA against their selectors. */
+	if (skb->sp) {
+		int i;
+
+		for (i=skb->sp->len-1; i>=0; i--) {
+			struct xfrm_state *x = skb->sp->xvec[i];
+			if (!xfrm_selector_match(&x->sel, &fl, family)) {
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
+				return 0;
+			}
+		}
+	}
+
+	/* A per-socket policy, when present, takes precedence. */
+	pol = NULL;
+	if (sk && sk->sk_policy[dir]) {
+		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+		if (IS_ERR(pol)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
+			return 0;
+		}
+	}
+
+	/* Otherwise fall back to the flow cache / global policy lookup. */
+	if (!pol) {
+		struct flow_cache_object *flo;
+
+		flo = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR_OR_NULL(flo))
+			pol = ERR_CAST(flo);
+		else
+			pol = container_of(flo, struct xfrm_policy, flo);
+	}
+
+	if (IS_ERR(pol)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
+		return 0;
+	}
+
+	/*
+	 * No policy at all: accept, unless the packet went through a
+	 * non-transport (e.g. tunnel) state that nothing asked for.
+	 */
+	if (!pol) {
+		if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) {
+			xfrm_secpath_reject(xerr_idx, skb, &fl);
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
+			return 0;
+		}
+		return 1;
+	}
+
+	pol->curlft.use_time = get_seconds();
+
+	pols[0] = pol;
+	npols ++;
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
+						    &fl, family,
+						    XFRM_POLICY_IN);
+		if (pols[1]) {
+			if (IS_ERR(pols[1])) {
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
+				/*
+				 * pols[0] is still referenced here; drop it
+				 * so this error return does not leak it.
+				 */
+				xfrm_pol_put(pols[0]);
+				return 0;
+			}
+			pols[1]->curlft.use_time = get_seconds();
+			npols ++;
+		}
+	}
+#endif
+
+	if (pol->action == XFRM_POLICY_ALLOW) {
+		struct sec_path *sp;
+		static struct sec_path dummy;
+		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
+		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
+		struct xfrm_tmpl **tpp = tp;
+		int ti = 0;
+		int i, k;
+
+		/* Packets without a secpath are checked against an empty one. */
+		if ((sp = skb->sp) == NULL)
+			sp = &dummy;
+
+		/* Gather the templates of every applicable policy into tpp[]. */
+		for (pi = 0; pi < npols; pi++) {
+			if (pols[pi] != pol &&
+			    pols[pi]->action != XFRM_POLICY_ALLOW) {
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
+				goto reject;
+			}
+			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+				goto reject_error;
+			}
+			for (i = 0; i < pols[pi]->xfrm_nr; i++)
+				tpp[ti++] = &pols[pi]->xfrm_vec[i];
+		}
+		xfrm_nr = ti;
+		if (npols > 1) {
+			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
+			tpp = stp;
+		}
+
+		/* For each tunnel xfrm, find the first matching tmpl.
+		 * For each tmpl before that, find corresponding xfrm.
+		 * Order is _important_. Later we will implement
+		 * some barriers, but at the moment barriers
+		 * are implied between each two transformations.
+		 */
+		for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
+			k = xfrm_policy_ok(tpp[i], sp, k, family);
+			if (k < 0) {
+				if (k < -1)
+					/* "-2 - errored_index" returned */
+					xerr_idx = -(2+k);
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
+				goto reject;
+			}
+		}
+
+		/* Leftover non-transport states were not asked for by any tmpl. */
+		if (secpath_has_nontransport(sp, k, &xerr_idx)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
+			goto reject;
+		}
+
+		xfrm_pols_put(pols, npols);
+		return 1;
+	}
+	XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
+
+reject:
+	xfrm_secpath_reject(xerr_idx, skb, &fl);
+reject_error:
+	xfrm_pols_put(pols, npols);
+	return 0;
+}
+EXPORT_SYMBOL(__xfrm_policy_check);
+
+/*
+ * Re-route a forwarded packet through xfrm_lookup().
+ *
+ * Returns 0 when the session cannot be decoded (counted as
+ * LINUX_MIB_XFRMFWDHDRERROR) or when the lookup fails, in which case the
+ * skb's dst is set to NULL.  Returns 1 after installing the looked-up
+ * dst on the skb.
+ */
+int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
+{
+	struct net *net = dev_net(skb->dev);
+	struct dst_entry *dst;
+	struct flowi fl;
+
+	if (xfrm_decode_session(skb, &fl, family) < 0) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
+		return 0;
+	}
+
+	skb_dst_force(skb);
+
+	dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, 0);
+	if (IS_ERR(dst)) {
+		skb_dst_set(skb, NULL);
+		return 0;
+	}
+
+	skb_dst_set(skb, dst);
+	return 1;
+}
+EXPORT_SYMBOL(__xfrm_route_forward);
+
+/* Optimize later using cookies and generation ids. */
+
+/*
+ * dst_ops->check hook for XFRM dsts: returns @dst while it is still
+ * usable, or NULL to force a new route lookup.  @cookie is not used.
+ */
+static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	/* Code (such as __xfrm4_bundle_create()) sets dst->obsolete
+	 * to "-1" so that every XFRM destination is validated through
+	 * dst_ops->check on each use.  This is needed because, when a
+	 * normal route referenced by an XFRM dst is obsoleted, nobody
+	 * walks the parent XFRM dsts referencing it to invalidate them;
+	 * that would be too much work.  The check is made here on every
+	 * use instead.  For example:
+	 *
+	 *	XFRM dst A --> IPv4 dst X
+	 *
+	 * X is the "xdst->route" of A (X is also the "dst->path" of A
+	 * in this example).  If X is marked obsolete, "A" will not
+	 * notice.  That is what the stale_bundle() check validates.
+	 *
+	 * When a policy's bundle is pruned, the XFRM dst is dst_free()d,
+	 * which sets its ->obsolete field to a positive non-zero
+	 * integer.  An XFRM dst pruned like this must force a new
+	 * route lookup.
+	 */
+	if (dst->obsolete >= 0 || stale_bundle(dst))
+		return NULL;
+
+	return dst;
+}
+
+/* Returns 1 when xfrm_bundle_ok() rejects the bundle at @dst, else 0. */
+static int stale_bundle(struct dst_entry *dst)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+	return xfrm_bundle_ok(xdst) ? 0 : 1;
+}
+
+/*
+ * Walk the children of @dst for as long as they are XFRM dsts bound to
+ * @dev, re-pointing each at the namespace's loopback device and moving
+ * the device reference from @dev to it.
+ */
+void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
+{
+	for (dst = dst->child;
+	     dst && dst->xfrm && dst->dev == dev;
+	     dst = dst->child) {
+		dst->dev = dev_net(dev)->loopback_dev;
+		dev_hold(dst->dev);
+		dev_put(dev);
+	}
+}
+EXPORT_SYMBOL(xfrm_dst_ifdown);
+
+/*
+ * Installed as dst_ops->link_failure by xfrm_policy_register_afinfo();
+ * intentionally does nothing.
+ */
+static void xfrm_link_failure(struct sk_buff *skb)
+{
+	/* Impossible. Such dst must be popped before reaches point of failure. */
+}
+
+/*
+ * dst_ops->negative_advice hook: releases @dst and returns NULL when it
+ * is marked obsolete; otherwise returns @dst unchanged (including NULL).
+ */
+static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
+{
+	if (dst && dst->obsolete) {
+		dst_release(dst);
+		return NULL;
+	}
+
+	return dst;
+}
+
+/*
+ * Flush the flow cache, then detach the whole xfrm_policy_sk_bundles
+ * list under its lock and free every entry outside the lock.  @net is
+ * not used by this implementation.
+ */
+static void __xfrm_garbage_collect(struct net *net)
+{
+	struct dst_entry *pending, *victim;
+
+	flow_cache_flush();
+
+	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
+	pending = xfrm_policy_sk_bundles;
+	xfrm_policy_sk_bundles = NULL;
+	spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+	while (pending) {
+		victim = pending;
+		/* read the link before the entry is freed */
+		pending = pending->next;
+		dst_free(victim);
+	}
+}
+
+/*
+ * Initialise the cached MTUs and the RTAX_MTU metric of every dst on the
+ * ->next chain starting at @dst.  Each entry's MTU is the child MTU
+ * adjusted by xfrm_state_mtu(), capped at the MTU of xdst->route.
+ */
+static void xfrm_init_pmtu(struct dst_entry *dst)
+{
+	do {
+		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+		u32 child_mtu, route_mtu, mtu;
+
+		child_mtu = dst_mtu(dst->child);
+		xdst->child_mtu_cached = child_mtu;
+
+		mtu = xfrm_state_mtu(dst->xfrm, child_mtu);
+
+		route_mtu = dst_mtu(xdst->route);
+		xdst->route_mtu_cached = route_mtu;
+
+		dst_metric_set(dst, RTAX_MTU,
+			       mtu > route_mtu ? route_mtu : mtu);
+
+		dst = dst->next;
+	} while (dst);
+}
+
+/* Check that the bundle accepts the flow and its components are
+ * still valid.
+ *
+ * Returns 1 when the bundle starting at @first may still be used and 0
+ * when it is stale.  As a side effect, cached child/route MTUs that have
+ * changed are refreshed and the RTAX_MTU metric is recomputed from the
+ * deepest changed entry back up to @first.
+ */
+
+static int xfrm_bundle_ok(struct xfrm_dst *first)
+{
+	struct dst_entry *dst = &first->u.dst;
+	struct xfrm_dst *last;
+	u32 mtu;
+
+	/* The path route must still pass dst_check() and its device be up. */
+	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
+	    (dst->dev && !netif_running(dst->dev)))
+		return 0;
+
+	/* Deepest entry whose cached MTU changed; NULL when none did. */
+	last = NULL;
+
+	/* Walk down the ->child chain while entries still carry a state. */
+	do {
+		struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+		if (dst->xfrm->km.state != XFRM_STATE_VALID)
+			return 0;
+		/* State or policy generation moved on since bundle creation. */
+		if (xdst->xfrm_genid != dst->xfrm->genid)
+			return 0;
+		if (xdst->num_pols > 0 &&
+		    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
+			return 0;
+
+		mtu = dst_mtu(dst->child);
+		if (xdst->child_mtu_cached != mtu) {
+			last = xdst;
+			xdst->child_mtu_cached = mtu;
+		}
+
+		if (!dst_check(xdst->route, xdst->route_cookie))
+			return 0;
+		mtu = dst_mtu(xdst->route);
+		if (xdst->route_mtu_cached != mtu) {
+			last = xdst;
+			xdst->route_mtu_cached = mtu;
+		}
+
+		dst = dst->child;
+	} while (dst->xfrm);
+
+	if (likely(!last))
+		return 1;
+
+	/* Recompute MTUs from @last upwards (via ->next) until @first. */
+	mtu = last->child_mtu_cached;
+	for (;;) {
+		dst = &last->u.dst;
+
+		mtu = xfrm_state_mtu(dst->xfrm, mtu);
+		if (mtu > last->route_mtu_cached)
+			mtu = last->route_mtu_cached;
+		dst_metric_set(dst, RTAX_MTU, mtu);
+
+		if (last == first)
+			break;
+
+		last = (struct xfrm_dst *)last->u.dst.next;
+		last->child_mtu_cached = mtu;
+	}
+
+	return 1;
+}
+
+/* Default dst_ops->default_advmss: the advmss metric of dst->path. */
+static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
+{
+	return dst_metric_advmss(dst->path);
+}
+
+/* Default dst_ops->default_mtu: the MTU of dst->path. */
+static unsigned int xfrm_default_mtu(const struct dst_entry *dst)
+{
+	return dst_mtu(dst->path);
+}
+
+/*
+ * Register the per-address-family policy operations @afinfo.
+ *
+ * Any dst_ops hook (and the garbage_collect hook) that the caller left
+ * NULL is filled in with the generic XFRM implementation, after which
+ * the resulting dst_ops template is copied into every existing network
+ * namespace.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @afinfo, -EAFNOSUPPORT when
+ * the family is out of range and -ENOBUFS when the family already has
+ * an afinfo registered.  On failure nothing is modified; in particular
+ * a rejected afinfo's dst_ops is not copied over the per-namespace ops
+ * that belong to the afinfo already registered.
+ */
+int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+	struct net *net;
+	int err = 0;
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+	if (unlikely(afinfo->family >= NPROTO))
+		return -EAFNOSUPPORT;
+	write_lock_bh(&xfrm_policy_afinfo_lock);
+	if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL))
+		err = -ENOBUFS;
+	else {
+		struct dst_ops *dst_ops = afinfo->dst_ops;
+		if (likely(dst_ops->kmem_cachep == NULL))
+			dst_ops->kmem_cachep = xfrm_dst_cache;
+		if (likely(dst_ops->check == NULL))
+			dst_ops->check = xfrm_dst_check;
+		if (likely(dst_ops->default_advmss == NULL))
+			dst_ops->default_advmss = xfrm_default_advmss;
+		if (likely(dst_ops->default_mtu == NULL))
+			dst_ops->default_mtu = xfrm_default_mtu;
+		if (likely(dst_ops->negative_advice == NULL))
+			dst_ops->negative_advice = xfrm_negative_advice;
+		if (likely(dst_ops->link_failure == NULL))
+			dst_ops->link_failure = xfrm_link_failure;
+		if (likely(afinfo->garbage_collect == NULL))
+			afinfo->garbage_collect = __xfrm_garbage_collect;
+		xfrm_policy_afinfo[afinfo->family] = afinfo;
+	}
+	write_unlock_bh(&xfrm_policy_afinfo_lock);
+
+	/*
+	 * A rejected registration must not touch the per-namespace
+	 * dst_ops: they belong to the afinfo already registered and the
+	 * rejected template never had its hooks filled in above.
+	 */
+	if (err)
+		return err;
+
+	rtnl_lock();
+	for_each_net(net) {
+		struct dst_ops *xfrm_dst_ops;
+
+		switch (afinfo->family) {
+		case AF_INET:
+			xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops;
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case AF_INET6:
+			xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops;
+			break;
+#endif
+		default:
+			BUG();
+		}
+		*xfrm_dst_ops = *afinfo->dst_ops;
+	}
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_policy_register_afinfo);
+
+/*
+ * Unregister @afinfo and clear the hooks kmem_cachep, check,
+ * negative_advice, link_failure and garbage_collect.
+ *
+ * Returns -EINVAL for a NULL @afinfo or when a different afinfo is
+ * registered for the family, -EAFNOSUPPORT when the family is out of
+ * range, and 0 otherwise (including when nothing was registered).
+ */
+int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+	struct xfrm_policy_afinfo *registered;
+	int err = 0;
+
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+	if (unlikely(afinfo->family >= NPROTO))
+		return -EAFNOSUPPORT;
+
+	write_lock_bh(&xfrm_policy_afinfo_lock);
+	registered = xfrm_policy_afinfo[afinfo->family];
+	if (registered == afinfo) {
+		struct dst_ops *dst_ops = afinfo->dst_ops;
+
+		xfrm_policy_afinfo[afinfo->family] = NULL;
+		dst_ops->kmem_cachep = NULL;
+		dst_ops->check = NULL;
+		dst_ops->negative_advice = NULL;
+		dst_ops->link_failure = NULL;
+		afinfo->garbage_collect = NULL;
+	} else if (registered != NULL) {
+		/* somebody else owns this family's slot */
+		err = -EINVAL;
+	}
+	write_unlock_bh(&xfrm_policy_afinfo_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
+
+/*
+ * Seed the dst_ops of a new namespace from whichever IPv4/IPv6 afinfo
+ * templates are registered at this point.
+ */
+static void __net_init xfrm_dst_ops_init(struct net *net)
+{
+	struct xfrm_policy_afinfo *registered;
+
+	read_lock_bh(&xfrm_policy_afinfo_lock);
+
+	registered = xfrm_policy_afinfo[AF_INET];
+	if (registered != NULL)
+		net->xfrm.xfrm4_dst_ops = *registered->dst_ops;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	registered = xfrm_policy_afinfo[AF_INET6];
+	if (registered != NULL)
+		net->xfrm.xfrm6_dst_ops = *registered->dst_ops;
+#endif
+
+	read_unlock_bh(&xfrm_policy_afinfo_lock);
+}
+
+/*
+ * Look up the afinfo registered for @family.
+ *
+ * On success the afinfo is returned with xfrm_policy_afinfo_lock still
+ * read-locked; the caller releases it with xfrm_policy_put_afinfo().
+ * NULL is returned, with the lock not held, when @family is out of
+ * range or has no afinfo registered.
+ */
+static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
+{
+	struct xfrm_policy_afinfo *found;
+
+	if (unlikely(family >= NPROTO))
+		return NULL;
+
+	read_lock(&xfrm_policy_afinfo_lock);
+	found = xfrm_policy_afinfo[family];
+	if (likely(found))
+		return found;	/* lock intentionally left held */
+
+	read_unlock(&xfrm_policy_afinfo_lock);
+	return NULL;
+}
+
+/*
+ * Counterpart of a successful xfrm_policy_get_afinfo(): drops the read
+ * lock on xfrm_policy_afinfo_lock.  @afinfo itself is not used.
+ */
+static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
+{
+	read_unlock(&xfrm_policy_afinfo_lock);
+}
+
+/*
+ * Netdevice notifier callback: runs the XFRM garbage collector for the
+ * device's namespace on NETDEV_DOWN.  Always returns NOTIFY_DONE.
+ */
+static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (event == NETDEV_DOWN)
+		__xfrm_garbage_collect(dev_net(dev));
+
+	return NOTIFY_DONE;
+}
+
+/* Registered by xfrm_policy_init() for init_net; calls xfrm_dev_event(). */
+static struct notifier_block xfrm_dev_notifier = {
+	.notifier_call	= xfrm_dev_event,
+};
+
+#ifdef CONFIG_XFRM_STATISTICS
+/*
+ * Allocate the per-namespace XFRM MIB counters and create their proc
+ * entry.  Returns -ENOMEM when the counters cannot be allocated;
+ * otherwise the result of xfrm_proc_init(), with the counters freed
+ * again if that result is negative.
+ */
+static int __net_init xfrm_statistics_init(struct net *net)
+{
+	void __percpu **mib = (void __percpu **)net->mib.xfrm_statistics;
+	int rv;
+
+	if (snmp_mib_init(mib,
+			  sizeof(struct linux_xfrm_mib),
+			  __alignof__(struct linux_xfrm_mib)) < 0)
+		return -ENOMEM;
+
+	rv = xfrm_proc_init(net);
+	if (rv >= 0)
+		return rv;
+
+	snmp_mib_free(mib);
+	return rv;
+}
+
+/* Undo xfrm_statistics_init(): remove the proc entry, then free the MIB. */
+static void xfrm_statistics_fini(struct net *net)
+{
+	xfrm_proc_fini(net);
+	snmp_mib_free((void __percpu **)net->mib.xfrm_statistics);
+}
+#else
+/* Stub used when CONFIG_XFRM_STATISTICS is off: nothing to set up. */
+static int __net_init xfrm_statistics_init(struct net *net)
+{
+	return 0;
+}
+
+/* Stub used when CONFIG_XFRM_STATISTICS is off: nothing to tear down. */
+static void xfrm_statistics_fini(struct net *net)
+{
+}
+#endif
+
+/*
+ * Set up the policy database of namespace @net: the by-index hash, one
+ * by-destination hash and inexact list per direction, the policy list
+ * and the hash-resize work item.  For init_net it also creates the
+ * xfrm_dst slab cache and registers the netdevice notifier.
+ *
+ * Returns 0 on success or -ENOMEM after freeing whatever hash tables
+ * had already been allocated.
+ */
+static int __net_init xfrm_policy_init(struct net *net)
+{
+	/* every table starts out with 8 buckets */
+	unsigned int hmask = 8 - 1;
+	unsigned int sz = (hmask+1) * sizeof(struct hlist_head);
+	int dir;
+
+	if (net_eq(net, &init_net))
+		xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
+					   sizeof(struct xfrm_dst),
+					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					   NULL);
+
+	net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
+	if (!net->xfrm.policy_byidx)
+		return -ENOMEM;
+	net->xfrm.policy_idx_hmask = hmask;
+
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		struct xfrm_policy_hash *htab = &net->xfrm.policy_bydst[dir];
+
+		net->xfrm.policy_count[dir] = 0;
+		INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);
+
+		htab->table = xfrm_hash_alloc(sz);
+		if (!htab->table)
+			goto out_bydst;
+		htab->hmask = hmask;
+	}
+
+	INIT_LIST_HEAD(&net->xfrm.policy_all);
+	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
+	if (net_eq(net, &init_net))
+		register_netdevice_notifier(&xfrm_dev_notifier);
+	return 0;
+
+out_bydst:
+	/* free only the directions that were allocated before the failure */
+	while (--dir >= 0)
+		xfrm_hash_free(net->xfrm.policy_bydst[dir].table, sz);
+	xfrm_hash_free(net->xfrm.policy_byidx, sz);
+	return -ENOMEM;
+}
+
+/*
+ * Tear down the policy database of namespace @net: wait for a pending
+ * hash resize, flush all policies (sub policies first when
+ * CONFIG_XFRM_SUB_POLICY is set), then free the per-direction
+ * by-destination tables and the by-index table.  Warns if anything is
+ * still linked into a list or into the first bucket of a table after
+ * the flush.
+ */
+static void xfrm_policy_fini(struct net *net)
+{
+	struct xfrm_audit audit_info;
+	unsigned int sz;
+	int dir;
+
+	flush_work(&net->xfrm.policy_hash_work);
+#ifdef CONFIG_XFRM_SUB_POLICY
+	audit_info.loginuid = -1;
+	audit_info.sessionid = -1;
+	audit_info.secid = 0;
+	xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, &audit_info);
+#endif
+	audit_info.loginuid = -1;
+	audit_info.sessionid = -1;
+	audit_info.secid = 0;
+	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
+
+	WARN_ON(!list_empty(&net->xfrm.policy_all));
+
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		struct xfrm_policy_hash *htab;
+
+		WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));
+
+		htab = &net->xfrm.policy_bydst[dir];
+		/*
+		 * xfrm_hash_free() takes the table size in bytes, exactly
+		 * as it was passed to xfrm_hash_alloc(); a bare bucket
+		 * count would describe a far smaller block.
+		 */
+		sz = (htab->hmask + 1) * sizeof(struct hlist_head);
+		WARN_ON(!hlist_empty(htab->table));
+		xfrm_hash_free(htab->table, sz);
+	}
+
+	sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
+	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
+	xfrm_hash_free(net->xfrm.policy_byidx, sz);
+}
+
+/*
+ * Per-namespace initialisation: statistics, state database, policy
+ * database, dst_ops and sysctls, in that order.  When a step fails, the
+ * steps already completed are undone in reverse order and the failing
+ * step's error code is returned.
+ */
+static int __net_init xfrm_net_init(struct net *net)
+{
+	int rv;
+
+	rv = xfrm_statistics_init(net);
+	if (rv < 0)
+		return rv;
+
+	rv = xfrm_state_init(net);
+	if (rv < 0)
+		goto undo_statistics;
+
+	rv = xfrm_policy_init(net);
+	if (rv < 0)
+		goto undo_state;
+
+	xfrm_dst_ops_init(net);
+
+	rv = xfrm_sysctl_init(net);
+	if (rv < 0)
+		goto undo_policy;
+
+	return 0;
+
+undo_policy:
+	xfrm_policy_fini(net);
+undo_state:
+	xfrm_state_fini(net);
+undo_statistics:
+	xfrm_statistics_fini(net);
+	return rv;
+}
+
+/*
+ * Per-namespace teardown: undoes xfrm_net_init() in reverse order
+ * (sysctls, policy database, state database, statistics).
+ */
+static void __net_exit xfrm_net_exit(struct net *net)
+{
+	xfrm_sysctl_fini(net);
+	xfrm_policy_fini(net);
+	xfrm_state_fini(net);
+	xfrm_statistics_fini(net);
+}
+
+/* Per-namespace init/exit hooks, registered from xfrm_init(). */
+static struct pernet_operations __net_initdata xfrm_net_ops = {
+	.init = xfrm_net_init,
+	.exit = xfrm_net_exit,
+};
+
+/*
+ * Boot-time entry point: registers the per-namespace operations and
+ * initialises the input path.
+ * NOTE(review): the return value of register_pernet_subsys() is ignored.
+ */
+void __init xfrm_init(void)
+{
+	register_pernet_subsys(&xfrm_net_ops);
+	xfrm_input_init();
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+/*
+ * Append the description of policy @xp to @audit_buf: its security
+ * context, if it has one, followed by the selector's source and
+ * destination addresses.  A prefix length is logged only when it is
+ * not the full address width (32 for IPv4, 128 for IPv6).  Selectors
+ * of any other family contribute no address fields.
+ */
+static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
+					 struct audit_buffer *audit_buf)
+{
+	struct xfrm_selector *sel = &xp->selector;
+	struct xfrm_sec_ctx *ctx = xp->security;
+
+	if (ctx)
+		audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
+				 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);
+
+	if (sel->family == AF_INET) {
+		audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
+		if (sel->prefixlen_s != 32)
+			audit_log_format(audit_buf, " src_prefixlen=%d",
+					 sel->prefixlen_s);
+		audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
+		if (sel->prefixlen_d != 32)
+			audit_log_format(audit_buf, " dst_prefixlen=%d",
+					 sel->prefixlen_d);
+	} else if (sel->family == AF_INET6) {
+		audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
+		if (sel->prefixlen_s != 128)
+			audit_log_format(audit_buf, " src_prefixlen=%d",
+					 sel->prefixlen_s);
+		audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
+		if (sel->prefixlen_d != 128)
+			audit_log_format(audit_buf, " dst_prefixlen=%d",
+					 sel->prefixlen_d);
+	}
+}
+
+/*
+ * Emit an "SPD-add" audit record for policy @xp carrying the caller's
+ * identity (@auid, @sessionid, @secid) and the operation @result.
+ * Nothing is logged when no audit buffer can be started.
+ */
+void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
+			   uid_t auid, u32 sessionid, u32 secid)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SPD-add");
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, ab);
+	audit_log_format(ab, " res=%u", result);
+	xfrm_audit_common_policyinfo(xp, ab);
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);
+
+/*
+ * Emit an "SPD-delete" audit record for policy @xp carrying the
+ * caller's identity (@auid, @sessionid, @secid) and the operation
+ * @result.  Nothing is logged when no audit buffer can be started.
+ */
+void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
+			      uid_t auid, u32 sessionid, u32 secid)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SPD-delete");
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, ab);
+	audit_log_format(ab, " res=%u", result);
+	xfrm_audit_common_policyinfo(xp, ab);
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
+#endif
+
+#ifdef CONFIG_XFRM_MIGRATE
+/*
+ * Compare the selector supplied with a migrate request (@sel_cmp)
+ * against a policy's selector (@sel_tgt).  Returns 1 on a match, else 0.
+ *
+ * When the request's proto is IPSEC_ULPROTO_ANY only family, addresses
+ * and prefix lengths have to agree; otherwise the two selectors must be
+ * byte-for-byte identical.
+ */
+static int xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
+				       const struct xfrm_selector *sel_tgt)
+{
+	if (sel_cmp->proto != IPSEC_ULPROTO_ANY)
+		return memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0;
+
+	return sel_tgt->family == sel_cmp->family &&
+	       xfrm_addr_cmp(&sel_tgt->daddr, &sel_cmp->daddr,
+			     sel_cmp->family) == 0 &&
+	       xfrm_addr_cmp(&sel_tgt->saddr, &sel_cmp->saddr,
+			     sel_cmp->family) == 0 &&
+	       sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
+	       sel_tgt->prefixlen_s == sel_cmp->prefixlen_s;
+}
+
+/*
+ * Find the init_net policy of the given @dir and @type whose selector
+ * matches @sel.  The direct hash chain is searched first; the first
+ * match on the inexact list replaces that result only if its priority
+ * value is lower.
+ *
+ * Returns the policy with a reference held, or NULL when none matches.
+ */
+static struct xfrm_policy * xfrm_migrate_policy_find(const struct xfrm_selector *sel,
+						     u8 dir, u8 type)
+{
+	struct xfrm_policy *pol, *found = NULL;
+	struct hlist_node *node;
+	struct hlist_head *bucket;
+	u32 best_prio = ~0U;
+
+	read_lock_bh(&xfrm_policy_lock);
+
+	bucket = policy_hash_direct(&init_net, &sel->daddr, &sel->saddr, sel->family, dir);
+	hlist_for_each_entry(pol, node, bucket, bydst) {
+		if (!xfrm_migrate_selector_match(sel, &pol->selector) ||
+		    pol->type != type)
+			continue;
+		found = pol;
+		best_prio = found->priority;
+		break;
+	}
+
+	bucket = &init_net.xfrm.policy_inexact[dir];
+	hlist_for_each_entry(pol, node, bucket, bydst) {
+		if (!xfrm_migrate_selector_match(sel, &pol->selector) ||
+		    pol->type != type ||
+		    pol->priority >= best_prio)
+			continue;
+		found = pol;
+		break;
+	}
+
+	if (found)
+		xfrm_pol_hold(found);
+
+	read_unlock_bh(&xfrm_policy_lock);
+
+	return found;
+}
+
+/*
+ * Does migrate entry @m apply to template @t?  Returns 1 if so, else 0.
+ *
+ * Mode and protocol must be equal, and the reqid too unless the
+ * entry's reqid is 0.  Tunnel and BEET templates must additionally
+ * carry the entry's old addresses.
+ */
+static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
+{
+	if (t->mode != m->mode || t->id.proto != m->proto)
+		return 0;
+	if (m->reqid != 0 && t->reqid != m->reqid)
+		return 0;
+
+	switch (t->mode) {
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_BEET:
+		return xfrm_addr_cmp(&t->id.daddr, &m->old_daddr,
+				     m->old_family) == 0 &&
+		       xfrm_addr_cmp(&t->saddr, &m->old_saddr,
+				     m->old_family) == 0;
+	case XFRM_MODE_TRANSPORT:
+		/* in case of transport mode, template does not store
+		   any IP addresses, hence we just compare mode and
+		   protocol */
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Update endpoint address(es) of the template(s) of @pol according to
+ * the @num_migrate entries in @m.
+ *
+ * Returns -ENOENT when the policy is already dead, -ENODATA when no
+ * template matched any entry, and 0 otherwise.
+ */
+static int xfrm_policy_migrate(struct xfrm_policy *pol,
+			       struct xfrm_migrate *m, int num_migrate)
+{
+	int matched = 0;
+	int i, j;
+
+	write_lock_bh(&pol->lock);
+	if (unlikely(pol->walk.dead)) {
+		/* target policy has been deleted */
+		write_unlock_bh(&pol->lock);
+		return -ENOENT;
+	}
+
+	for (i = 0; i < pol->xfrm_nr; i++) {
+		struct xfrm_tmpl *t = &pol->xfrm_vec[i];
+
+		for (j = 0; j < num_migrate; j++) {
+			struct xfrm_migrate *mp = &m[j];
+
+			if (!migrate_tmpl_match(mp, t))
+				continue;
+			matched++;
+
+			/* only tunnel and BEET templates store addresses */
+			if (t->mode != XFRM_MODE_TUNNEL &&
+			    t->mode != XFRM_MODE_BEET)
+				continue;
+
+			/* update endpoints */
+			memcpy(&t->id.daddr, &mp->new_daddr,
+			       sizeof(t->id.daddr));
+			memcpy(&t->saddr, &mp->new_saddr,
+			       sizeof(t->saddr));
+			t->encap_family = mp->new_family;
+			/* flush bundles */
+			atomic_inc(&pol->genid);
+		}
+	}
+
+	write_unlock_bh(&pol->lock);
+
+	return matched ? 0 : -ENODATA;
+}
+
+/*
+ * Sanity-check a migrate request.  Returns -EINVAL when the number of
+ * entries is outside 1..XFRM_MAX_DEPTH, when an entry leaves both
+ * addresses unchanged, when a new address is the "any" address, or
+ * when two entries describe the same old endpoint; returns 0 otherwise.
+ */
+static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate)
+{
+	const struct xfrm_migrate *cur, *other;
+	int i, j;
+
+	if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH)
+		return -EINVAL;
+
+	for (i = 0; i < num_migrate; i++) {
+		cur = &m[i];
+
+		/* an entry must change at least one of the two addresses */
+		if (xfrm_addr_cmp(&cur->old_daddr, &cur->new_daddr,
+				  cur->old_family) == 0 &&
+		    xfrm_addr_cmp(&cur->old_saddr, &cur->new_saddr,
+				  cur->old_family) == 0)
+			return -EINVAL;
+
+		if (xfrm_addr_any(&cur->new_daddr, cur->new_family) ||
+		    xfrm_addr_any(&cur->new_saddr, cur->new_family))
+			return -EINVAL;
+
+		/* check if there is any duplicated entry */
+		for (j = i + 1; j < num_migrate; j++) {
+			other = &m[j];
+
+			if (memcmp(&cur->old_daddr, &other->old_daddr,
+				   sizeof(cur->old_daddr)) != 0)
+				continue;
+			if (memcmp(&cur->old_saddr, &other->old_saddr,
+				   sizeof(cur->old_saddr)) != 0)
+				continue;
+			if (cur->proto != other->proto ||
+			    cur->mode != other->mode ||
+			    cur->reqid != other->reqid ||
+			    cur->old_family != other->old_family)
+				continue;
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Migrate the endpoints of the policy selected by @sel/@dir/@type, and
+ * of the states found for the entries in @m, to new addresses.
+ *
+ * @m / @num_migrate: the old -> new endpoint descriptions
+ * @k:                passed through unchanged to km_migrate()
+ *
+ * Returns 0 on success.  On failure returns the error from
+ * xfrm_migrate_check() or xfrm_policy_migrate(), -ENOENT when no policy
+ * matches, or -ENODATA when a state was found but could not be
+ * migrated; in the latter cases the newly created states are deleted
+ * again.
+ */
+int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+		 struct xfrm_migrate *m, int num_migrate,
+		 struct xfrm_kmaddress *k)
+{
+	int i, err, nx_cur = 0, nx_new = 0;
+	struct xfrm_policy *pol = NULL;
+	struct xfrm_state *x, *xc;
+	struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
+	struct xfrm_state *x_new[XFRM_MAX_DEPTH];
+	struct xfrm_migrate *mp;
+
+	/* Also bounds num_migrate to XFRM_MAX_DEPTH, the size of x_cur/x_new. */
+	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
+		goto out;
+
+	/* Stage 1 - find policy */
+	if ((pol = xfrm_migrate_policy_find(sel, dir, type)) == NULL) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	/* Stage 2 - find and update state(s) */
+	for (i = 0, mp = m; i < num_migrate; i++, mp++) {
+		/* Entries without a matching state are silently skipped. */
+		if ((x = xfrm_migrate_state_find(mp))) {
+			x_cur[nx_cur] = x;
+			nx_cur++;
+			if ((xc = xfrm_state_migrate(x, mp))) {
+				x_new[nx_new] = xc;
+				nx_new++;
+			} else {
+				err = -ENODATA;
+				goto restore_state;
+			}
+		}
+	}
+
+	/* Stage 3 - update policy */
+	if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0)
+		goto restore_state;
+
+	/* Stage 4 - delete old state(s) */
+	if (nx_cur) {
+		xfrm_states_put(x_cur, nx_cur);
+		xfrm_states_delete(x_cur, nx_cur);
+	}
+
+	/* Stage 5 - announce */
+	km_migrate(sel, dir, type, m, num_migrate, k);
+
+	xfrm_pol_put(pol);
+
+	return 0;
+out:
+	return err;
+
+restore_state:
+	/* Roll back: keep the old states, delete the new ones created so far. */
+	if (pol)
+		xfrm_pol_put(pol);
+	if (nx_cur)
+		xfrm_states_put(x_cur, nx_cur);
+	if (nx_new)
+		xfrm_states_delete(x_new, nx_new);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_migrate);
+#endif
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
new file mode 100644
index 00000000..58d9ae00
--- /dev/null
+++ b/net/xfrm/xfrm_proc.c
@@ -0,0 +1,84 @@
+/*
+ * xfrm_proc.c
+ *
+ * Copyright (C)2006-2007 USAGI/WIDE Project
+ *
+ * Authors:	Masahide NAKAMURA <nakam@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/snmp.h>
+#include <net/xfrm.h>
+
+/*
+ * Name -> counter mapping for the "xfrm_stat" proc file; entries are
+ * printed in this order by xfrm_statistics_seq_show().  The sentinel
+ * terminates the list.
+ */
+static const struct snmp_mib xfrm_mib_list[] = {
+	SNMP_MIB_ITEM("XfrmInError", LINUX_MIB_XFRMINERROR),
+	SNMP_MIB_ITEM("XfrmInBufferError", LINUX_MIB_XFRMINBUFFERERROR),
+	SNMP_MIB_ITEM("XfrmInHdrError", LINUX_MIB_XFRMINHDRERROR),
+	SNMP_MIB_ITEM("XfrmInNoStates", LINUX_MIB_XFRMINNOSTATES),
+	SNMP_MIB_ITEM("XfrmInStateProtoError", LINUX_MIB_XFRMINSTATEPROTOERROR),
+	SNMP_MIB_ITEM("XfrmInStateModeError", LINUX_MIB_XFRMINSTATEMODEERROR),
+	SNMP_MIB_ITEM("XfrmInStateSeqError", LINUX_MIB_XFRMINSTATESEQERROR),
+	SNMP_MIB_ITEM("XfrmInStateExpired", LINUX_MIB_XFRMINSTATEEXPIRED),
+	SNMP_MIB_ITEM("XfrmInStateMismatch", LINUX_MIB_XFRMINSTATEMISMATCH),
+	SNMP_MIB_ITEM("XfrmInStateInvalid", LINUX_MIB_XFRMINSTATEINVALID),
+	SNMP_MIB_ITEM("XfrmInTmplMismatch", LINUX_MIB_XFRMINTMPLMISMATCH),
+	SNMP_MIB_ITEM("XfrmInNoPols", LINUX_MIB_XFRMINNOPOLS),
+	SNMP_MIB_ITEM("XfrmInPolBlock", LINUX_MIB_XFRMINPOLBLOCK),
+	SNMP_MIB_ITEM("XfrmInPolError", LINUX_MIB_XFRMINPOLERROR),
+	SNMP_MIB_ITEM("XfrmOutError", LINUX_MIB_XFRMOUTERROR),
+	SNMP_MIB_ITEM("XfrmOutBundleGenError", LINUX_MIB_XFRMOUTBUNDLEGENERROR),
+	SNMP_MIB_ITEM("XfrmOutBundleCheckError", LINUX_MIB_XFRMOUTBUNDLECHECKERROR),
+	SNMP_MIB_ITEM("XfrmOutNoStates", LINUX_MIB_XFRMOUTNOSTATES),
+	SNMP_MIB_ITEM("XfrmOutStateProtoError", LINUX_MIB_XFRMOUTSTATEPROTOERROR),
+	SNMP_MIB_ITEM("XfrmOutStateModeError", LINUX_MIB_XFRMOUTSTATEMODEERROR),
+	SNMP_MIB_ITEM("XfrmOutStateSeqError", LINUX_MIB_XFRMOUTSTATESEQERROR),
+	SNMP_MIB_ITEM("XfrmOutStateExpired", LINUX_MIB_XFRMOUTSTATEEXPIRED),
+	SNMP_MIB_ITEM("XfrmOutPolBlock", LINUX_MIB_XFRMOUTPOLBLOCK),
+	SNMP_MIB_ITEM("XfrmOutPolDead", LINUX_MIB_XFRMOUTPOLDEAD),
+	SNMP_MIB_ITEM("XfrmOutPolError", LINUX_MIB_XFRMOUTPOLERROR),
+	SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR),
+	SNMP_MIB_SENTINEL
+};
+
+/*
+ * seq_file show callback: prints one "name<TAB>value" line for every
+ * entry of xfrm_mib_list, with the value folded over the namespace's
+ * per-cpu counters.  Always returns 0.
+ */
+static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+	const struct snmp_mib *item;
+
+	for (item = xfrm_mib_list; item->name; item++)
+		seq_printf(seq, "%-24s\t%lu\n", item->name,
+			   snmp_fold_field((void __percpu **)
+					   net->mib.xfrm_statistics,
+					   item->entry));
+	return 0;
+}
+
+/* open() handler: binds xfrm_statistics_seq_show via single_open_net(). */
+static int xfrm_statistics_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, xfrm_statistics_seq_show);
+}
+
+/* File operations for the "xfrm_stat" proc entry created by xfrm_proc_init(). */
+static const struct file_operations xfrm_statistics_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = xfrm_statistics_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
+/*
+ * Create the read-only "xfrm_stat" proc entry for namespace @net.
+ * Returns 0 on success or -ENOMEM when the entry cannot be created.
+ */
+int __net_init xfrm_proc_init(struct net *net)
+{
+	return proc_net_fops_create(net, "xfrm_stat", S_IRUGO,
+				    &xfrm_statistics_seq_fops) ? 0 : -ENOMEM;
+}
+
+/* Remove the "xfrm_stat" proc entry of namespace @net. */
+void xfrm_proc_fini(struct net *net)
+{
+	proc_net_remove(net, "xfrm_stat");
+}
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
new file mode 100644
index 00000000..3235023e
--- /dev/null
+++ b/net/xfrm/xfrm_replay.c
@@ -0,0 +1,550 @@
+/*
+ * xfrm_replay.c - xfrm replay detection, derived from xfrm_state.c.
+ *
+ * Copyright (C) 2010 secunet Security Networks AG
+ * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <net/xfrm.h>
+
+/*
+ * Work out the upper 32 bits of the extended sequence number that belongs to
+ * the received low 32 bits @net_seq (network byte order).
+ *
+ * Returns 0 when the state does not have XFRM_STATE_ESN set.  Otherwise the
+ * stored seq_hi is returned, adjusted by +1 or -1 depending on where the
+ * received value lies relative to the bottom edge of the replay window:
+ *
+ *  A. window lies within one 32-bit subspace: a value below the bottom edge
+ *     is taken to belong to the next subspace (seq_hi + 1);
+ *  B. window spans two subspaces: a value at or above the bottom edge is
+ *     taken to belong to the previous subspace (seq_hi - 1).
+ */
+u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+	u32 rx_seq, top, wsize, bottom;
+
+	if (!(x->props.flags & XFRM_STATE_ESN))
+		return 0;
+
+	rx_seq = ntohl(net_seq);
+	top = esn->seq;
+	wsize = esn->replay_window;
+	bottom = top - wsize + 1;
+
+	if (likely(top >= wsize - 1)) {
+		/* A. same subspace */
+		if (unlikely(rx_seq < bottom))
+			return esn->seq_hi + 1;
+	} else if (unlikely(rx_seq >= bottom)) {
+		/* B. window spans two subspaces */
+		return esn->seq_hi - 1;
+	}
+
+	return esn->seq_hi;
+}
+
+/*
+ * Decide whether the key manager must be told about a change of the legacy
+ * replay state (x->replay) and, if so, send an XFRM_MSG_NEWAE event.
+ *
+ * @event is XFRM_REPLAY_UPDATE or XFRM_REPLAY_TIMEOUT.  x->preplay holds the
+ * replay state as of the last notification and is refreshed whenever an
+ * event is actually sent.
+ */
+static void xfrm_replay_notify(struct xfrm_state *x, int event)
+{
+	struct km_event c;
+	/* we send notify messages in case
+	 *  1. we updated one of the sequence numbers, and the seqno difference
+	 *     is at least x->replay_maxdiff, in this case we also update the
+	 *     timeout of our timer function
+	 *  2. if x->replay_maxage has elapsed since last update,
+	 *     and there were changes
+	 *
+	 *  The state structure must be locked!
+	 */
+
+	switch (event) {
+	case XFRM_REPLAY_UPDATE:
+		/* Both counters moved by less than replay_maxdiff since the
+		 * last notification: stay quiet, unless a timeout was
+		 * deferred earlier, in which case report it now.
+		 */
+		if (x->replay_maxdiff &&
+		    (x->replay.seq - x->preplay.seq < x->replay_maxdiff) &&
+		    (x->replay.oseq - x->preplay.oseq < x->replay_maxdiff)) {
+			if (x->xflags & XFRM_TIME_DEFER)
+				event = XFRM_REPLAY_TIMEOUT;
+			else
+				return;
+		}
+
+		break;
+
+	case XFRM_REPLAY_TIMEOUT:
+		/* Nothing changed since the last notification: remember that
+		 * a timeout is pending and send nothing.
+		 */
+		if (memcmp(&x->replay, &x->preplay,
+			   sizeof(struct xfrm_replay_state)) == 0) {
+			x->xflags |= XFRM_TIME_DEFER;
+			return;
+		}
+
+		break;
+	}
+
+	/* Snapshot the state being reported, then notify the key managers. */
+	memcpy(&x->preplay, &x->replay, sizeof(struct xfrm_replay_state));
+	c.event = XFRM_MSG_NEWAE;
+	c.data.aevent = event;
+	km_state_notify(x, &c);
+
+	/* Re-arm the replay timer; the deferred flag is cleared when
+	 * mod_timer() returns 0.
+	 */
+	if (x->replay_maxage &&
+	    !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+		x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+/*
+ * Assign the next outbound sequence number (legacy replay state) to @skb.
+ *
+ * Does nothing and returns 0 when the transform type does not have
+ * XFRM_TYPE_REPLAY_PROT set.  Otherwise x->replay.oseq is incremented and
+ * stored in the skb control block.  If the counter wraps to 0 the increment
+ * is undone, the overflow is audited and -EOVERFLOW is returned.  On success
+ * the notify hook is invoked when async events are enabled for the namespace.
+ */
+static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
+{
+	if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
+		return 0;
+
+	x->replay.oseq++;
+	XFRM_SKB_CB(skb)->seq.output.low = x->replay.oseq;
+
+	if (unlikely(x->replay.oseq == 0)) {
+		/* Counter wrapped: step back so it stays saturated. */
+		x->replay.oseq--;
+		xfrm_audit_state_replay_overflow(x, skb);
+		return -EOVERFLOW;
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+
+	return 0;
+}
+
+/*
+ * Replay check for the legacy 32-bit bitmap (x->replay).
+ *
+ * Returns 0 when the packet may be accepted: replay protection is off
+ * (props.replay_window == 0), the sequence number is ahead of the highest
+ * one seen, or it lies inside the window and its bit is not yet set.
+ * Returns -EINVAL, after auditing, for sequence number 0, for a number that
+ * fell out of the window (stats.replay_window is bumped) and for a number
+ * already seen (stats.replay is bumped).
+ */
+static int xfrm_replay_check(struct xfrm_state *x,
+		      struct sk_buff *skb, __be32 net_seq)
+{
+	u32 seq = ntohl(net_seq);
+	u32 behind, limit;
+
+	if (!x->props.replay_window)
+		return 0;
+
+	if (unlikely(seq == 0))
+		goto reject;
+
+	if (likely(seq > x->replay.seq))
+		return 0;
+
+	/* The usable window can never exceed the width of the bitmap. */
+	limit = min_t(unsigned int, x->props.replay_window,
+		      sizeof(x->replay.bitmap) * 8);
+	behind = x->replay.seq - seq;
+	if (behind >= limit) {
+		x->stats.replay_window++;
+		goto reject;
+	}
+
+	if (!(x->replay.bitmap & (1U << behind)))
+		return 0;
+
+	x->stats.replay++;
+reject:
+	xfrm_audit_state_replay(x, skb, net_seq);
+	return -EINVAL;
+}
+
+/*
+ * Record an accepted inbound sequence number in the legacy 32-bit replay
+ * bitmap (x->replay).
+ *
+ * Bit n of the bitmap stands for sequence number x->replay.seq - n.  A
+ * number ahead of the current top shifts the bitmap and becomes the new top;
+ * a number at or behind the top just sets its bit.  Nothing happens when
+ * replay protection is off (props.replay_window == 0).
+ *
+ * The window used here is clamped to the width of the bitmap, exactly as
+ * xfrm_replay_check() does: props.replay_window may be configured larger
+ * than 32, and shifting a 32-bit value by 32 or more is undefined.
+ */
+static void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq)
+{
+	u32 diff;
+	u32 seq = ntohl(net_seq);
+	u32 window;
+
+	if (!x->props.replay_window)
+		return;
+
+	window = min_t(unsigned int, x->props.replay_window,
+		       sizeof(x->replay.bitmap) * 8);
+
+	if (seq > x->replay.seq) {
+		diff = seq - x->replay.seq;
+		if (diff < window)
+			x->replay.bitmap = ((x->replay.bitmap) << diff) | 1;
+		else
+			/* Jumped past the whole window: start afresh. */
+			x->replay.bitmap = 1;
+		x->replay.seq = seq;
+	} else {
+		diff = x->replay.seq - seq;
+		/* Bits outside the window are never consulted by the check
+		 * function, so skipping them loses nothing and keeps the
+		 * shift count in range.
+		 */
+		if (diff < window)
+			x->replay.bitmap |= (1U << diff);
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+/*
+ * Assign the next outbound sequence number to @skb using the 32-bit oseq
+ * counter kept in x->replay_esn (bitmap mode without ESN).
+ *
+ * Does nothing and returns 0 when the transform type does not have
+ * XFRM_TYPE_REPLAY_PROT set.  If the counter wraps to 0 the increment is
+ * undone, the overflow is audited and -EOVERFLOW is returned.  On success
+ * the notify hook is invoked when async events are enabled for the namespace.
+ */
+static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+
+	if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
+		return 0;
+
+	esn->oseq++;
+	XFRM_SKB_CB(skb)->seq.output.low = esn->oseq;
+
+	if (unlikely(esn->oseq == 0)) {
+		/* Counter wrapped: step back so it stays saturated. */
+		esn->oseq--;
+		xfrm_audit_state_replay_overflow(x, skb);
+		return -EOVERFLOW;
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+
+	return 0;
+}
+
+/*
+ * Replay check for the variable-size bitmap in x->replay_esn (no ESN).
+ *
+ * The bitmap is used as a ring of replay_window bits; the bit belonging to
+ * the highest sequence number seen sits at index (seq - 1) % replay_window.
+ *
+ * Returns 0 when the packet may be accepted (window size 0, number ahead of
+ * the highest one seen, or inside the window with its bit clear).  Returns
+ * -EINVAL, after auditing, for sequence number 0, for a number that fell out
+ * of the window (stats.replay_window is bumped) and for a number already
+ * seen (stats.replay is bumped).
+ */
+static int xfrm_replay_check_bmp(struct xfrm_state *x,
+				 struct sk_buff *skb, __be32 net_seq)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+	u32 wsize = esn->replay_window;
+	u32 seq = ntohl(net_seq);
+	u32 behind, pos;
+	unsigned int bit;
+
+	if (!wsize)
+		return 0;
+
+	if (unlikely(seq == 0))
+		goto err;
+
+	if (likely(seq > esn->seq))
+		return 0;
+
+	behind = esn->seq - seq;
+	if (behind >= wsize) {
+		x->stats.replay_window++;
+		goto err;
+	}
+
+	/* Locate the bit for this number, wrapping around the ring. */
+	pos = (esn->seq - 1) % wsize;
+	if (pos >= behind)
+		bit = (pos - behind) % wsize;
+	else
+		bit = wsize - (behind - pos);
+
+	if (!(esn->bmp[bit >> 5] & (1U << (bit & 0x1F))))
+		return 0;
+
+	x->stats.replay++;
+err:
+	xfrm_audit_state_replay(x, skb, net_seq);
+	return -EINVAL;
+}
+
+/*
+ * Record an accepted inbound sequence number in the variable-size bitmap
+ * kept in x->replay_esn (no ESN).
+ *
+ * The bitmap is used as a ring of replay_window bits; the bit belonging to
+ * the highest sequence number seen sits at index (seq - 1) % replay_window.
+ * A number ahead of the current top clears the bits that are skipped over
+ * (or the whole bitmap when the jump is at least one window), sets its own
+ * bit and becomes the new top.  A number at or behind the top just sets its
+ * bit.  Nothing happens when the window size is 0.
+ */
+static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq)
+{
+	unsigned int bitnr, nr, i;
+	u32 diff;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	u32 seq = ntohl(net_seq);
+	u32 pos;
+
+	if (!replay_esn->replay_window)
+		return;
+
+	/* Must come after the check above: a zero window would otherwise be
+	 * used as a divisor.
+	 */
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+	if (seq > replay_esn->seq) {
+		diff = seq - replay_esn->seq;
+
+		if (diff < replay_esn->replay_window) {
+			/* Clear the bits of the numbers that were skipped. */
+			for (i = 1; i < diff; i++) {
+				bitnr = (pos + i) % replay_esn->replay_window;
+				nr = bitnr >> 5;
+				bitnr = bitnr & 0x1F;
+				replay_esn->bmp[nr] &=  ~(1U << bitnr);
+			}
+
+			bitnr = (pos + diff) % replay_esn->replay_window;
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		} else {
+			/* Jumped past the whole window: wipe every word. */
+			nr = (replay_esn->replay_window - 1) >> 5;
+			for (i = 0; i <= nr; i++)
+				replay_esn->bmp[i] = 0;
+
+			bitnr = (pos + diff) % replay_esn->replay_window;
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		}
+
+		replay_esn->seq = seq;
+	} else {
+		diff = replay_esn->seq - seq;
+
+		if (pos >= diff) {
+			bitnr = (pos - diff) % replay_esn->replay_window;
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		} else {
+			/* The bit lies before index 0: wrap around the ring. */
+			bitnr = replay_esn->replay_window - (diff - pos);
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		}
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+/*
+ * Decide whether the key manager must be told about a change of the
+ * bitmap/ESN replay state (x->replay_esn) and, if so, send an
+ * XFRM_MSG_NEWAE event.
+ *
+ * @event is XFRM_REPLAY_UPDATE or XFRM_REPLAY_TIMEOUT.  x->preplay_esn holds
+ * the replay state as of the last notification and is refreshed whenever an
+ * event is actually sent.
+ */
+static void xfrm_replay_notify_bmp(struct xfrm_state *x, int event)
+{
+	struct km_event c;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	struct xfrm_replay_state_esn *preplay_esn = x->preplay_esn;
+
+	/* we send notify messages in case
+	 *  1. we updated one of the sequence numbers, and the seqno difference
+	 *     is at least x->replay_maxdiff, in this case we also update the
+	 *     timeout of our timer function
+	 *  2. if x->replay_maxage has elapsed since last update,
+	 *     and there were changes
+	 *
+	 *  The state structure must be locked!
+	 */
+
+	switch (event) {
+	case XFRM_REPLAY_UPDATE:
+		/* Both counters moved by less than replay_maxdiff since the
+		 * last notification: stay quiet, unless a timeout was
+		 * deferred earlier, in which case report it now.
+		 */
+		if (x->replay_maxdiff &&
+		    (replay_esn->seq - preplay_esn->seq < x->replay_maxdiff) &&
+		    (replay_esn->oseq - preplay_esn->oseq < x->replay_maxdiff)) {
+			if (x->xflags & XFRM_TIME_DEFER)
+				event = XFRM_REPLAY_TIMEOUT;
+			else
+				return;
+		}
+
+		break;
+
+	case XFRM_REPLAY_TIMEOUT:
+		/* Nothing changed since the last notification: remember that
+		 * a timeout is pending and send nothing.
+		 */
+		if (memcmp(x->replay_esn, x->preplay_esn,
+			   xfrm_replay_state_esn_len(replay_esn)) == 0) {
+			x->xflags |= XFRM_TIME_DEFER;
+			return;
+		}
+
+		break;
+	}
+
+	/* Snapshot the state being reported, then notify the key managers. */
+	memcpy(x->preplay_esn, x->replay_esn,
+	       xfrm_replay_state_esn_len(replay_esn));
+	c.event = XFRM_MSG_NEWAE;
+	c.data.aevent = event;
+	km_state_notify(x, &c);
+
+	/* Re-arm the replay timer; the deferred flag is cleared when
+	 * mod_timer() returns 0.
+	 */
+	if (x->replay_maxage &&
+	    !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
+		x->xflags &= ~XFRM_TIME_DEFER;
+}
+
+/*
+ * Assign the next outbound 64-bit extended sequence number to @skb.
+ *
+ * Does nothing and returns 0 when the transform type does not have
+ * XFRM_TYPE_REPLAY_PROT set.  Otherwise the low word (oseq) is incremented;
+ * when it wraps to 0 the high word (oseq_hi) is incremented as well.  Both
+ * words are stored in the skb control block.  If the high word wraps too,
+ * both increments are undone, the overflow is audited and -EOVERFLOW is
+ * returned.  On success the notify hook is invoked when async events are
+ * enabled for the namespace.
+ */
+static int xfrm_replay_overflow_esn(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+
+	if (!(x->type->flags & XFRM_TYPE_REPLAY_PROT))
+		return 0;
+
+	esn->oseq++;
+	XFRM_SKB_CB(skb)->seq.output.low = esn->oseq;
+	XFRM_SKB_CB(skb)->seq.output.hi = esn->oseq_hi;
+
+	if (unlikely(esn->oseq == 0)) {
+		/* Low word wrapped: carry into the high word. */
+		esn->oseq_hi++;
+		XFRM_SKB_CB(skb)->seq.output.hi = esn->oseq_hi;
+
+		if (esn->oseq_hi == 0) {
+			/* Full 64-bit space exhausted: stay saturated. */
+			esn->oseq--;
+			esn->oseq_hi--;
+			xfrm_audit_state_replay_overflow(x, skb);
+			return -EOVERFLOW;
+		}
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+
+	return 0;
+}
+
+/*
+ * Replay check for extended sequence numbers (x->replay_esn with ESN).
+ *
+ * Only the low 32 bits (@net_seq) are available here.  The window is
+ * [bottom, top] in 32-bit arithmetic and may either lie in one 32-bit
+ * subspace (case A) or straddle a wrap of the low word (case B).
+ *
+ * Returns 0 when the packet may be accepted (window size 0, number outside
+ * the window on the "new" side, or inside the window with its bit clear).
+ * Returns -EINVAL, after auditing, for a number that fell out of the window
+ * (stats.replay_window is bumped), for a number already seen (stats.replay
+ * is bumped), and for low word 0 while seq_hi is 0 and the top is still
+ * below replay_window - 1.
+ */
+static int xfrm_replay_check_esn(struct xfrm_state *x,
+				 struct sk_buff *skb, __be32 net_seq)
+{
+	unsigned int bitnr, nr;
+	u32 diff;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	u32 pos;
+	u32 seq = ntohl(net_seq);
+	u32 wsize = replay_esn->replay_window;
+	u32 top = replay_esn->seq;
+	u32 bottom = top - wsize + 1;
+
+	if (!wsize)
+		return 0;
+
+	/* Ring index of the bit that belongs to the current top. */
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+
+	if (unlikely(seq == 0 && replay_esn->seq_hi == 0 &&
+		     (replay_esn->seq < replay_esn->replay_window - 1)))
+		goto err;
+
+	diff = top - seq;
+
+	if (likely(top >= wsize - 1)) {
+		/* A. same subspace */
+		if (likely(seq > top) || seq < bottom)
+			return 0;
+	} else {
+		/* B. window spans two subspaces */
+		if (likely(seq > top && seq < bottom))
+			return 0;
+		/* seq is in the part of the window before the wrap */
+		if (seq >= bottom)
+			diff = ~seq + top + 1;
+	}
+
+	if (diff >= replay_esn->replay_window) {
+		x->stats.replay_window++;
+		goto err;
+	}
+
+	if (pos >= diff) {
+		bitnr = (pos - diff) % replay_esn->replay_window;
+		nr = bitnr >> 5;
+		bitnr = bitnr & 0x1F;
+		if (replay_esn->bmp[nr] & (1U << bitnr))
+			goto err_replay;
+	} else {
+		/* The bit lies before index 0: wrap around the ring. */
+		bitnr = replay_esn->replay_window - (diff - pos);
+		nr = bitnr >> 5;
+		bitnr = bitnr & 0x1F;
+		if (replay_esn->bmp[nr] & (1U << bitnr))
+			goto err_replay;
+	}
+	return 0;
+
+err_replay:
+	x->stats.replay++;
+err:
+	xfrm_audit_state_replay(x, skb, net_seq);
+	return -EINVAL;
+}
+
+/*
+ * Record an accepted inbound extended sequence number in x->replay_esn.
+ *
+ * The high word for @net_seq is obtained from xfrm_replay_seqhi(); "wrap" is
+ * its distance from the stored seq_hi.  When the number is ahead of the
+ * current top (same high word and larger low word, or a later high word) the
+ * window is moved forward: skipped bits are cleared (or the whole bitmap when
+ * the jump is at least one window), the new bit is set, seq is updated and
+ * seq_hi is incremented if the low word wrapped.  Otherwise only the bit of
+ * the number is set.  Nothing happens when the window size is 0.
+ */
+static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
+{
+	unsigned int bitnr, nr, i;
+	int wrap;
+	u32 diff, pos, seq, seq_hi;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+
+	if (!replay_esn->replay_window)
+		return;
+
+	seq = ntohl(net_seq);
+	/* Ring index of the bit that belongs to the current top. */
+	pos = (replay_esn->seq - 1) % replay_esn->replay_window;
+	seq_hi = xfrm_replay_seqhi(x, net_seq);
+	wrap = seq_hi - replay_esn->seq_hi;
+
+	if ((!wrap && seq > replay_esn->seq) || wrap > 0) {
+		if (likely(!wrap))
+			diff = seq - replay_esn->seq;
+		else
+			/* distance across the wrap of the low word */
+			diff = ~replay_esn->seq + seq + 1;
+
+		if (diff < replay_esn->replay_window) {
+			/* Clear the bits of the numbers that were skipped. */
+			for (i = 1; i < diff; i++) {
+				bitnr = (pos + i) % replay_esn->replay_window;
+				nr = bitnr >> 5;
+				bitnr = bitnr & 0x1F;
+				replay_esn->bmp[nr] &=  ~(1U << bitnr);
+			}
+
+			bitnr = (pos + diff) % replay_esn->replay_window;
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		} else {
+			/* Jumped past the whole window: wipe every word. */
+			nr = (replay_esn->replay_window - 1) >> 5;
+			for (i = 0; i <= nr; i++)
+				replay_esn->bmp[i] = 0;
+
+			bitnr = (pos + diff) % replay_esn->replay_window;
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		}
+
+		replay_esn->seq = seq;
+
+		if (unlikely(wrap > 0))
+			replay_esn->seq_hi++;
+	} else {
+		diff = replay_esn->seq - seq;
+
+		if (pos >= diff) {
+			bitnr = (pos - diff) % replay_esn->replay_window;
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		} else {
+			/* The bit lies before index 0: wrap around the ring. */
+			bitnr = replay_esn->replay_window - (diff - pos);
+			nr = bitnr >> 5;
+			bitnr = bitnr & 0x1F;
+			replay_esn->bmp[nr] |= (1U << bitnr);
+		}
+	}
+
+	if (xfrm_aevent_is_on(xs_net(x)))
+		x->repl->notify(x, XFRM_REPLAY_UPDATE);
+}
+
+/* Replay operations for states without x->replay_esn (32-bit bitmap). */
+static struct xfrm_replay xfrm_replay_legacy = {
+	.advance	= xfrm_replay_advance,
+	.check		= xfrm_replay_check,
+	.notify		= xfrm_replay_notify,
+	.overflow	= xfrm_replay_overflow,
+};
+
+/* Replay operations for states with x->replay_esn but without ESN. */
+static struct xfrm_replay xfrm_replay_bmp = {
+	.advance	= xfrm_replay_advance_bmp,
+	.check		= xfrm_replay_check_bmp,
+	.notify		= xfrm_replay_notify_bmp,
+	.overflow	= xfrm_replay_overflow_bmp,
+};
+
+/*
+ * Replay operations for states with x->replay_esn and XFRM_STATE_ESN set.
+ * Notification is shared with the plain bitmap variant.
+ */
+static struct xfrm_replay xfrm_replay_esn = {
+	.advance	= xfrm_replay_advance_esn,
+	.check		= xfrm_replay_check_esn,
+	.notify		= xfrm_replay_notify_bmp,
+	.overflow	= xfrm_replay_overflow_esn,
+};
+
+/*
+ * Select the replay implementation (x->repl) for a state.
+ *
+ *  - no x->replay_esn:            legacy 32-bit bitmap;
+ *  - x->replay_esn, ESN flag set: extended sequence numbers;
+ *  - x->replay_esn, no ESN flag:  variable-size bitmap.
+ *
+ * Returns 0 on success.  Returns -EINVAL when the configured window does not
+ * fit into the allocated bitmap (bmp_len 32-bit words), or when ESN is
+ * requested with a window size of 0.
+ */
+int xfrm_init_replay(struct xfrm_state *x)
+{
+	struct xfrm_replay_state_esn *esn = x->replay_esn;
+
+	if (!esn) {
+		x->repl = &xfrm_replay_legacy;
+		return 0;
+	}
+
+	if (esn->replay_window > esn->bmp_len * sizeof(__u32) * 8)
+		return -EINVAL;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		if (esn->replay_window == 0)
+			return -EINVAL;
+		x->repl = &xfrm_replay_esn;
+	} else {
+		x->repl = &xfrm_replay_bmp;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_init_replay);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
new file mode 100644
index 00000000..9414b9c5
--- /dev/null
+++ b/net/xfrm/xfrm_state.c
@@ -0,0 +1,2231 @@
+/*
+ * xfrm_state.c
+ *
+ * Changes:
+ *	Mitsuru KANDA @USAGI
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * 		IPv6 support
+ * 	YOSHIFUJI Hideaki @USAGI
+ * 		Split up af-specific functions
+ *	Derek Atkins <derek@ihtfp.com>
+ *		Add UDP Encapsulation
+ *
+ */
+
+#include <linux/workqueue.h>
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/audit.h>
+#include <asm/uaccess.h>
+#include <linux/ktime.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include "xfrm_hash.h"
+
+/* Each xfrm_state may be linked to two tables:
+
+   1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
+   2. Hash table by (daddr,family,reqid) to find what SAs exist for given
+      destination/tunnel endpoint. (output)
+ */
+
+static DEFINE_SPINLOCK(xfrm_state_lock);
+
+static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+
+static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
+static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
+
+/*
+ * Bucket index for the by-destination table of @net: __xfrm_dst_hash() over
+ * (daddr, saddr, reqid, family) with the namespace's current hash mask.
+ */
+static inline unsigned int xfrm_dst_hash(struct net *net,
+					 const xfrm_address_t *daddr,
+					 const xfrm_address_t *saddr,
+					 u32 reqid,
+					 unsigned short family)
+{
+	return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask);
+}
+
+/*
+ * Bucket index for the by-source table of @net: __xfrm_src_hash() over
+ * (daddr, saddr, family) with the namespace's current hash mask.
+ */
+static inline unsigned int xfrm_src_hash(struct net *net,
+					 const xfrm_address_t *daddr,
+					 const xfrm_address_t *saddr,
+					 unsigned short family)
+{
+	return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask);
+}
+
+/*
+ * Bucket index for the by-SPI table of @net: __xfrm_spi_hash() over
+ * (daddr, spi, proto, family) with the namespace's current hash mask.
+ */
+static inline unsigned int
+xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr,
+	      __be32 spi, u8 proto, unsigned short family)
+{
+	return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask);
+}
+
+/*
+ * Move every state found on the old by-destination chain @list into the
+ * three new tables, rehashing with @nhashmask.
+ *
+ * Each state is added to the new by-destination and by-source tables; it is
+ * added to the new by-SPI table only when it has a non-zero SPI.
+ */
+static void xfrm_hash_transfer(struct hlist_head *list,
+			       struct hlist_head *ndsttable,
+			       struct hlist_head *nsrctable,
+			       struct hlist_head *nspitable,
+			       unsigned int nhashmask)
+{
+	struct hlist_node *pos, *next;
+	struct xfrm_state *x;
+
+	hlist_for_each_entry_safe(x, pos, next, list, bydst) {
+		unsigned int dst_h, src_h, spi_h;
+
+		dst_h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
+					x->props.reqid, x->props.family,
+					nhashmask);
+		hlist_add_head(&x->bydst, ndsttable + dst_h);
+
+		src_h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
+					x->props.family, nhashmask);
+		hlist_add_head(&x->bysrc, nsrctable + src_h);
+
+		if (!x->id.spi)
+			continue;
+
+		spi_h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
+					x->id.proto, x->props.family,
+					nhashmask);
+		hlist_add_head(&x->byspi, nspitable + spi_h);
+	}
+}
+
+/*
+ * Size in bytes of a hash table with twice as many buckets as the table
+ * described by @state_hmask (mask = bucket count - 1).
+ */
+static unsigned long xfrm_hash_new_size(unsigned int state_hmask)
+{
+	unsigned int doubled = (state_hmask + 1) << 1;
+
+	return doubled * sizeof(struct hlist_head);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+/*
+ * Work handler that doubles the three state hash tables of a namespace.
+ *
+ * The new tables are allocated first, outside xfrm_state_lock; if any
+ * allocation fails the ones already obtained are freed and the resize is
+ * abandoned.  The rehash and the pointer/mask switch happen under
+ * xfrm_state_lock; the old tables are freed after the lock is dropped.
+ * hash_resize_mutex is held for the whole function.
+ */
+static void xfrm_hash_resize(struct work_struct *work)
+{
+	struct net *net = container_of(work, struct net, xfrm.state_hash_work);
+	struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi;
+	unsigned long nsize, osize;
+	unsigned int nhashmask, ohashmask;
+	int i;
+
+	mutex_lock(&hash_resize_mutex);
+
+	nsize = xfrm_hash_new_size(net->xfrm.state_hmask);
+	ndst = xfrm_hash_alloc(nsize);
+	if (!ndst)
+		goto out_unlock;
+	nsrc = xfrm_hash_alloc(nsize);
+	if (!nsrc) {
+		xfrm_hash_free(ndst, nsize);
+		goto out_unlock;
+	}
+	nspi = xfrm_hash_alloc(nsize);
+	if (!nspi) {
+		xfrm_hash_free(ndst, nsize);
+		xfrm_hash_free(nsrc, nsize);
+		goto out_unlock;
+	}
+
+	spin_lock_bh(&xfrm_state_lock);
+
+	nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+	/* Walking the by-destination buckets is enough: the transfer helper
+	 * re-adds each state to all three new tables.
+	 */
+	for (i = net->xfrm.state_hmask; i >= 0; i--)
+		xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
+				   nhashmask);
+
+	odst = net->xfrm.state_bydst;
+	osrc = net->xfrm.state_bysrc;
+	ospi = net->xfrm.state_byspi;
+	ohashmask = net->xfrm.state_hmask;
+
+	net->xfrm.state_bydst = ndst;
+	net->xfrm.state_bysrc = nsrc;
+	net->xfrm.state_byspi = nspi;
+	net->xfrm.state_hmask = nhashmask;
+
+	spin_unlock_bh(&xfrm_state_lock);
+
+	osize = (ohashmask + 1) * sizeof(struct hlist_head);
+	xfrm_hash_free(odst, osize);
+	xfrm_hash_free(osrc, osize);
+	xfrm_hash_free(ospi, osize);
+
+out_unlock:
+	mutex_unlock(&hash_resize_mutex);
+}
+
+static DEFINE_RWLOCK(xfrm_state_afinfo_lock);
+static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
+
+static DEFINE_SPINLOCK(xfrm_state_gc_lock);
+
+int __xfrm_state_delete(struct xfrm_state *x);
+
+int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol);
+void km_state_expired(struct xfrm_state *x, int hard, u32 pid);
+
+/*
+ * Look up the afinfo registered for @family and take xfrm_state_afinfo_lock
+ * for writing.
+ *
+ * On success the afinfo is returned with the write lock still held; the
+ * caller must release it with xfrm_state_unlock_afinfo().  Returns NULL,
+ * with the lock not held, when @family is out of range or nothing is
+ * registered for it.
+ */
+static struct xfrm_state_afinfo *xfrm_state_lock_afinfo(unsigned int family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	if (unlikely(family >= NPROTO))
+		return NULL;
+	write_lock_bh(&xfrm_state_afinfo_lock);
+	afinfo = xfrm_state_afinfo[family];
+	/* Nothing registered: drop the lock again before returning NULL. */
+	if (unlikely(!afinfo))
+		write_unlock_bh(&xfrm_state_afinfo_lock);
+	return afinfo;
+}
+
+/*
+ * Release the write lock taken by a successful xfrm_state_lock_afinfo().
+ * @afinfo itself is not used.
+ */
+static void xfrm_state_unlock_afinfo(struct xfrm_state_afinfo *afinfo)
+	__releases(xfrm_state_afinfo_lock)
+{
+	write_unlock_bh(&xfrm_state_afinfo_lock);
+}
+
+/*
+ * Register transform @type for address family @family.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT when no afinfo is available for the
+ * family, or -EEXIST when the slot for type->proto is already occupied.
+ */
+int xfrm_register_type(const struct xfrm_type *type, unsigned short family)
+{
+	struct xfrm_state_afinfo *afinfo = xfrm_state_lock_afinfo(family);
+	const struct xfrm_type **slot;
+	int err;
+
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	slot = &afinfo->type_map[type->proto];
+	if (unlikely(*slot != NULL)) {
+		err = -EEXIST;
+	} else {
+		*slot = type;
+		err = 0;
+	}
+
+	xfrm_state_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_register_type);
+
+/*
+ * Unregister transform @type for address family @family.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT when no afinfo is available for the
+ * family, or -ENOENT when the slot for type->proto does not hold @type.
+ */
+int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family)
+{
+	struct xfrm_state_afinfo *afinfo = xfrm_state_lock_afinfo(family);
+	const struct xfrm_type **slot;
+	int err;
+
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	slot = &afinfo->type_map[type->proto];
+	if (likely(*slot == type)) {
+		*slot = NULL;
+		err = 0;
+	} else {
+		err = -ENOENT;
+	}
+
+	xfrm_state_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_type);
+
+/*
+ * Look up the transform type registered for (@proto, @family) and take a
+ * reference on its owning module.
+ *
+ * If the lookup fails (nothing registered, or the module reference cannot
+ * be taken) the module "xfrm-type-<family>-<proto>" is requested once and
+ * the lookup is repeated.  Returns the type, or NULL when the family has no
+ * afinfo or the type is still unavailable after the second attempt.
+ */
+static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	const struct xfrm_type *type;
+	int attempt;
+
+	for (attempt = 0; ; attempt++) {
+		afinfo = xfrm_state_get_afinfo(family);
+		if (unlikely(afinfo == NULL))
+			return NULL;
+
+		type = afinfo->type_map[proto];
+		if (unlikely(type && !try_module_get(type->owner)))
+			type = NULL;
+
+		xfrm_state_put_afinfo(afinfo);
+
+		/* Found it, or the one module load was already tried. */
+		if (type || attempt)
+			return type;
+
+		request_module("xfrm-type-%d-%d", family, proto);
+	}
+}
+
+/* Drop the module reference taken by xfrm_get_type(). */
+static void xfrm_put_type(const struct xfrm_type *type)
+{
+	module_put(type->owner);
+}
+
+/*
+ * Register encapsulation @mode for address family @family.
+ *
+ * On success the mode is linked to the family's afinfo and a reference on
+ * the afinfo owner module is held until the mode is unregistered.
+ *
+ * Returns 0 on success, -EINVAL when mode->encap is out of range,
+ * -EAFNOSUPPORT when no afinfo is available for the family, -EEXIST when
+ * the slot is already occupied, or -ENOENT when the module reference on the
+ * afinfo owner cannot be taken.
+ */
+int xfrm_register_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_state_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	modemap = afinfo->mode_map;
+	if (modemap[mode->encap]) {
+		err = -EEXIST;
+	} else if (!try_module_get(afinfo->owner)) {
+		err = -ENOENT;
+	} else {
+		mode->afinfo = afinfo;
+		modemap[mode->encap] = mode;
+		err = 0;
+	}
+
+	xfrm_state_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_register_mode);
+
+/*
+ * Unregister encapsulation @mode for address family @family and drop the
+ * reference on the afinfo owner module taken at registration.
+ *
+ * Returns 0 on success, -EINVAL when mode->encap is out of range,
+ * -EAFNOSUPPORT when no afinfo is available for the family, or -ENOENT when
+ * the slot does not hold @mode.
+ */
+int xfrm_unregister_mode(struct xfrm_mode *mode, int family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	struct xfrm_mode **modemap;
+	int err;
+
+	if (unlikely(mode->encap >= XFRM_MODE_MAX))
+		return -EINVAL;
+
+	afinfo = xfrm_state_lock_afinfo(family);
+	if (unlikely(afinfo == NULL))
+		return -EAFNOSUPPORT;
+
+	modemap = afinfo->mode_map;
+	if (unlikely(modemap[mode->encap] != mode)) {
+		err = -ENOENT;
+	} else {
+		modemap[mode->encap] = NULL;
+		module_put(mode->afinfo->owner);
+		err = 0;
+	}
+
+	xfrm_state_unlock_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_unregister_mode);
+
+/*
+ * Look up the encapsulation mode registered for (@encap, @family) and take
+ * a reference on its owning module.
+ *
+ * If the lookup fails (nothing registered, or the module reference cannot
+ * be taken) the module "xfrm-mode-<family>-<encap>" is requested once and
+ * the lookup is repeated.  Returns the mode, or NULL when @encap is out of
+ * range, the family has no afinfo, or the mode is still unavailable after
+ * the second attempt.
+ */
+static struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	struct xfrm_mode *mode;
+	int attempt;
+
+	if (unlikely(encap >= XFRM_MODE_MAX))
+		return NULL;
+
+	for (attempt = 0; ; attempt++) {
+		afinfo = xfrm_state_get_afinfo(family);
+		if (unlikely(afinfo == NULL))
+			return NULL;
+
+		mode = afinfo->mode_map[encap];
+		if (unlikely(mode && !try_module_get(mode->owner)))
+			mode = NULL;
+
+		xfrm_state_put_afinfo(afinfo);
+
+		/* Found it, or the one module load was already tried. */
+		if (mode || attempt)
+			return mode;
+
+		request_module("xfrm-mode-%d-%d", family, encap);
+	}
+}
+
+/* Drop the module reference taken by xfrm_get_mode(). */
+static void xfrm_put_mode(struct xfrm_mode *mode)
+{
+	module_put(mode->owner);
+}
+
+/*
+ * Final teardown of one state, called from the GC work item.
+ *
+ * Stops both timers, frees the separately allocated members, drops the
+ * mode and type module references (running the type destructor first),
+ * releases the security context and finally frees the state itself.
+ */
+static void xfrm_state_gc_destroy(struct xfrm_state *x)
+{
+	/* Make sure no timer can fire on the state while it is torn down. */
+	tasklet_hrtimer_cancel(&x->mtimer);
+	del_timer_sync(&x->rtimer);
+	kfree(x->aalg);
+	kfree(x->ealg);
+	kfree(x->calg);
+	kfree(x->encap);
+	kfree(x->coaddr);
+	kfree(x->replay_esn);
+	kfree(x->preplay_esn);
+	if (x->inner_mode)
+		xfrm_put_mode(x->inner_mode);
+	if (x->inner_mode_iaf)
+		xfrm_put_mode(x->inner_mode_iaf);
+	if (x->outer_mode)
+		xfrm_put_mode(x->outer_mode);
+	if (x->type) {
+		x->type->destructor(x);
+		xfrm_put_type(x->type);
+	}
+	security_xfrm_state_free(x);
+	kfree(x);
+}
+
+/*
+ * GC work handler: destroys every state queued on the namespace's
+ * state_gc_list.
+ *
+ * The pending list is moved to a private list under xfrm_state_gc_lock so
+ * that the actual destruction runs without the lock held.  Waiters on
+ * km_waitq are woken once the batch is done.
+ */
+static void xfrm_state_gc_task(struct work_struct *work)
+{
+	struct net *net = container_of(work, struct net, xfrm.state_gc_work);
+	struct xfrm_state *x;
+	struct hlist_node *entry, *tmp;
+	struct hlist_head gc_list;
+
+	spin_lock_bh(&xfrm_state_gc_lock);
+	hlist_move_list(&net->xfrm.state_gc_list, &gc_list);
+	spin_unlock_bh(&xfrm_state_gc_lock);
+
+	hlist_for_each_entry_safe(x, entry, tmp, &gc_list, gclist)
+		xfrm_state_gc_destroy(x);
+
+	wake_up(&net->xfrm.km_waitq);
+}
+
+/*
+ * Convert @secs to jiffies, saturating at MAX_SCHEDULE_TIMEOUT - 1 so the
+ * multiplication by HZ cannot exceed that bound.
+ */
+static inline unsigned long make_jiffies(long secs)
+{
+	const long max_secs = (MAX_SCHEDULE_TIMEOUT-1)/HZ;
+
+	return secs >= max_secs ? MAX_SCHEDULE_TIMEOUT-1 : secs*HZ;
+}
+
+/*
+ * Lifetime timer of a state (x->mtimer).
+ *
+ * Under x->lock it evaluates the four time-based lifetime limits:
+ *  - a hard limit that has passed deletes the state (see "expired");
+ *  - a soft limit that has passed marks the state as dying and sends a soft
+ *    expire event to the key managers;
+ *  - otherwise the timer is re-armed for the nearest remaining limit.
+ *
+ * Always returns HRTIMER_NORESTART; re-arming is done explicitly through
+ * tasklet_hrtimer_start().
+ */
+static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me)
+{
+	struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
+	struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
+	struct net *net = xs_net(x);
+	unsigned long now = get_seconds();
+	long next = LONG_MAX;
+	int warn = 0;
+	int err = 0;
+
+	spin_lock(&x->lock);
+	if (x->km.state == XFRM_STATE_DEAD)
+		goto out;
+	if (x->km.state == XFRM_STATE_EXPIRED)
+		goto expired;
+	/* Hard limits: seconds since creation and since first use. */
+	if (x->lft.hard_add_expires_seconds) {
+		long tmo = x->lft.hard_add_expires_seconds +
+			x->curlft.add_time - now;
+		if (tmo <= 0)
+			goto expired;
+		if (tmo < next)
+			next = tmo;
+	}
+	if (x->lft.hard_use_expires_seconds) {
+		/* use_time == 0 is treated as "now", i.e. full time left */
+		long tmo = x->lft.hard_use_expires_seconds +
+			(x->curlft.use_time ? : now) - now;
+		if (tmo <= 0)
+			goto expired;
+		if (tmo < next)
+			next = tmo;
+	}
+	/* Already marked dying: skip the soft limits, just re-arm. */
+	if (x->km.dying)
+		goto resched;
+	/* Soft limits: same two clocks, but only trigger a warning. */
+	if (x->lft.soft_add_expires_seconds) {
+		long tmo = x->lft.soft_add_expires_seconds +
+			x->curlft.add_time - now;
+		if (tmo <= 0)
+			warn = 1;
+		else if (tmo < next)
+			next = tmo;
+	}
+	if (x->lft.soft_use_expires_seconds) {
+		long tmo = x->lft.soft_use_expires_seconds +
+			(x->curlft.use_time ? : now) - now;
+		if (tmo <= 0)
+			warn = 1;
+		else if (tmo < next)
+			next = tmo;
+	}
+
+	x->km.dying = warn;
+	if (warn)
+		km_state_expired(x, 0, 0);
+resched:
+	if (next != LONG_MAX){
+		tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
+	}
+
+	goto out;
+
+expired:
+	/* An ACQ state without SPI is first only marked EXPIRED and given
+	 * two more seconds; the next timer run then takes the delete path.
+	 */
+	if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) {
+		x->km.state = XFRM_STATE_EXPIRED;
+		wake_up(&net->xfrm.km_waitq);
+		next = 2;
+		goto resched;
+	}
+
+	err = __xfrm_state_delete(x);
+	/* Hard expire event, only for states that have an SPI. */
+	if (!err && x->id.spi)
+		km_state_expired(x, 1, 0);
+
+	xfrm_audit_state_delete(x, err ? 0 : 1,
+				audit_get_loginuid(current),
+				audit_get_sessionid(current), 0);
+
+out:
+	spin_unlock(&x->lock);
+	return HRTIMER_NORESTART;
+}
+
+static void xfrm_replay_timer_handler(unsigned long data);
+
+/*
+ * Allocate and initialise a new state for namespace @net.
+ *
+ * The state is zero-filled and allocated with GFP_ATOMIC.  It starts with a
+ * reference count of 1, unlinked list/hash nodes, both timers set up but
+ * not started, add_time set to the current time, and all byte/packet
+ * lifetime limits set to XFRM_INF.
+ *
+ * Returns the new state, or NULL when the allocation fails.
+ */
+struct xfrm_state *xfrm_state_alloc(struct net *net)
+{
+	struct xfrm_state *x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC);
+
+	if (!x)
+		return NULL;
+
+	write_pnet(&x->xs_net, net);
+	atomic_set(&x->refcnt, 1);
+	atomic_set(&x->tunnel_users, 0);
+
+	INIT_LIST_HEAD(&x->km.all);
+	INIT_HLIST_NODE(&x->bydst);
+	INIT_HLIST_NODE(&x->bysrc);
+	INIT_HLIST_NODE(&x->byspi);
+
+	tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	setup_timer(&x->rtimer, xfrm_replay_timer_handler, (unsigned long)x);
+
+	x->curlft.add_time = get_seconds();
+
+	x->lft.soft_byte_limit = XFRM_INF;
+	x->lft.soft_packet_limit = XFRM_INF;
+	x->lft.hard_byte_limit = XFRM_INF;
+	x->lft.hard_packet_limit = XFRM_INF;
+
+	x->replay_maxage = 0;
+	x->replay_maxdiff = 0;
+	x->inner_mode = NULL;
+	x->inner_mode_iaf = NULL;
+
+	spin_lock_init(&x->lock);
+
+	return x;
+}
+EXPORT_SYMBOL(xfrm_state_alloc);
+
+/*
+ * Queue a state for destruction.
+ *
+ * The state is expected to be XFRM_STATE_DEAD already (a warning is emitted
+ * otherwise).  It is put on the namespace's GC list under
+ * xfrm_state_gc_lock and the GC work item is scheduled; the actual teardown
+ * happens in xfrm_state_gc_task().
+ */
+void __xfrm_state_destroy(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+
+	WARN_ON(x->km.state != XFRM_STATE_DEAD);
+
+	spin_lock_bh(&xfrm_state_gc_lock);
+	hlist_add_head(&x->gclist, &net->xfrm.state_gc_list);
+	spin_unlock_bh(&xfrm_state_gc_lock);
+	schedule_work(&net->xfrm.state_gc_work);
+}
+EXPORT_SYMBOL(__xfrm_state_destroy);
+
+/*
+ * Mark a state dead and unlink it from the global list and hash tables.
+ *
+ * xfrm_state_lock is taken here without disabling bottom halves, and the
+ * in-file callers invoke this with x->lock held.  Returns 0 on success or
+ * -ESRCH when the state was already XFRM_STATE_DEAD.
+ */
+int __xfrm_state_delete(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+
+	if (x->km.state == XFRM_STATE_DEAD)
+		return -ESRCH;
+
+	x->km.state = XFRM_STATE_DEAD;
+
+	spin_lock(&xfrm_state_lock);
+	list_del(&x->km.all);
+	hlist_del(&x->bydst);
+	hlist_del(&x->bysrc);
+	/* Only states with an SPI are linked into the by-SPI table. */
+	if (x->id.spi)
+		hlist_del(&x->byspi);
+	net->xfrm.state_num--;
+	spin_unlock(&xfrm_state_lock);
+
+	/* All xfrm_state objects are created by xfrm_state_alloc.
+	 * The xfrm_state_alloc call gives a reference, and that
+	 * is what we are dropping here.
+	 */
+	xfrm_state_put(x);
+
+	return 0;
+}
+EXPORT_SYMBOL(__xfrm_state_delete);
+
+/*
+ * Locked wrapper around __xfrm_state_delete(): takes x->lock with bottom
+ * halves disabled for the duration of the call and returns its result
+ * (0 on success, -ESRCH if the state was already dead).
+ */
+int xfrm_state_delete(struct xfrm_state *x)
+{
+	int err;
+
+	spin_lock_bh(&x->lock);
+	err = __xfrm_state_delete(x);
+	spin_unlock_bh(&x->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_delete);
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+/*
+ * Ask the security module whether every state matching @proto may be
+ * deleted.
+ *
+ * Walks all by-destination buckets.  On the first state for which
+ * security_xfrm_state_delete() returns non-zero, a failed delete is audited
+ * and that error is returned.  Returns 0 when no state was refused.
+ */
+static inline int
+xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audit_info)
+{
+	int i, err = 0;
+
+	for (i = 0; i <= net->xfrm.state_hmask; i++) {
+		struct hlist_node *entry;
+		struct xfrm_state *x;
+
+		hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) {
+			if (!xfrm_id_proto_match(x->id.proto, proto))
+				continue;
+
+			err = security_xfrm_state_delete(x);
+			if (!err)
+				continue;
+
+			xfrm_audit_state_delete(x, 0,
+						audit_info->loginuid,
+						audit_info->sessionid,
+						audit_info->secid);
+			return err;
+		}
+	}
+
+	return err;
+}
+#else
+/* Without CONFIG_SECURITY_NETWORK_XFRM there is nothing to check. */
+static inline int
+xfrm_state_flush_secctx_check(struct net *net, u8 proto, struct xfrm_audit *audit_info)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Delete all states of namespace @net that match @proto and are not
+ * kernel-owned (xfrm_state_kern()).
+ *
+ * The security check is run over all candidates first; if it refuses any of
+ * them nothing is deleted and its error is returned.  Each deletion is
+ * audited with the identity in @audit_info.
+ *
+ * Returns 0 when at least one state was deleted, -ESRCH when the buckets
+ * were scanned and nothing was deleted, or the security check's error.
+ * Waiters on km_waitq are woken in every case.
+ */
+int xfrm_state_flush(struct net *net, u8 proto, struct xfrm_audit *audit_info)
+{
+	int i, err = 0, cnt = 0;
+
+	spin_lock_bh(&xfrm_state_lock);
+	err = xfrm_state_flush_secctx_check(net, proto, audit_info);
+	if (err)
+		goto out;
+
+	err = -ESRCH;
+	for (i = 0; i <= net->xfrm.state_hmask; i++) {
+		struct hlist_node *entry;
+		struct xfrm_state *x;
+restart:
+		hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) {
+			if (!xfrm_state_kern(x) &&
+			    xfrm_id_proto_match(x->id.proto, proto)) {
+				/* Pin the state, then drop the table lock:
+				 * xfrm_state_delete() takes xfrm_state_lock
+				 * itself.
+				 */
+				xfrm_state_hold(x);
+				spin_unlock_bh(&xfrm_state_lock);
+
+				err = xfrm_state_delete(x);
+				xfrm_audit_state_delete(x, err ? 0 : 1,
+							audit_info->loginuid,
+							audit_info->sessionid,
+							audit_info->secid);
+				xfrm_state_put(x);
+				if (!err)
+					cnt++;
+
+				/* The chain may have changed while the lock
+				 * was dropped: rescan this bucket.
+				 */
+				spin_lock_bh(&xfrm_state_lock);
+				goto restart;
+			}
+		}
+	}
+	if (cnt)
+		err = 0;
+
+out:
+	spin_unlock_bh(&xfrm_state_lock);
+	wake_up(&net->xfrm.km_waitq);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_flush);
+
+/*
+ * Fill @si with a snapshot of the state database counters of @net, taken
+ * under xfrm_state_lock: number of states, current hash mask and the
+ * configured maximum hash size.
+ */
+void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
+{
+	spin_lock_bh(&xfrm_state_lock);
+	si->sadcnt = net->xfrm.state_num;
+	si->sadhcnt = net->xfrm.state_hmask;
+	si->sadhmcnt = xfrm_state_hashmax;
+	spin_unlock_bh(&xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_sad_getinfo);
+
+/*
+ * Initialise a temporary state from a flow and a template.
+ *
+ * The selector is filled in by the afinfo of @family; the remaining
+ * properties are filled in by the afinfo of tmpl->encap_family, which is
+ * looked up separately when it differs from @family.
+ *
+ * Returns 0 on success or -1 when a required afinfo is not available.
+ */
+static int
+xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl,
+		    const struct xfrm_tmpl *tmpl,
+		    const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+		    unsigned short family)
+{
+	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		return -1;
+	afinfo->init_tempsel(&x->sel, fl);
+
+	if (family != tmpl->encap_family) {
+		/* Switch to the afinfo of the encapsulation family. */
+		xfrm_state_put_afinfo(afinfo);
+		afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+		if (!afinfo)
+			return -1;
+	}
+	afinfo->init_temprop(x, tmpl, daddr, saddr);
+	xfrm_state_put_afinfo(afinfo);
+	return 0;
+}
+
+/*
+ * Find a state by (daddr, spi, proto, family) in the by-SPI table of @net.
+ *
+ * A candidate must also satisfy (mark & x->mark.m) == x->mark.v.  The first
+ * match is returned with an extra reference held; NULL when nothing
+ * matches.
+ */
+static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
+					      const xfrm_address_t *daddr,
+					      __be32 spi, u8 proto,
+					      unsigned short family)
+{
+	unsigned int bucket = xfrm_spi_hash(net, daddr, spi, proto, family);
+	struct hlist_node *pos;
+	struct xfrm_state *x;
+
+	hlist_for_each_entry(x, pos, net->xfrm.state_byspi + bucket, byspi) {
+		if (x->props.family == family &&
+		    x->id.spi       == spi &&
+		    x->id.proto     == proto &&
+		    !xfrm_addr_cmp(&x->id.daddr, daddr, family) &&
+		    (mark & x->mark.m) == x->mark.v) {
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Find a state by (daddr, saddr, proto, family) in the by-source table of
+ * @net.
+ *
+ * A candidate must also satisfy (mark & x->mark.m) == x->mark.v.  The first
+ * match is returned with an extra reference held; NULL when nothing
+ * matches.
+ */
+static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
+						     const xfrm_address_t *daddr,
+						     const xfrm_address_t *saddr,
+						     u8 proto, unsigned short family)
+{
+	unsigned int bucket = xfrm_src_hash(net, daddr, saddr, family);
+	struct hlist_node *pos;
+	struct xfrm_state *x;
+
+	hlist_for_each_entry(x, pos, net->xfrm.state_bysrc + bucket, bysrc) {
+		if (x->props.family == family &&
+		    x->id.proto     == proto &&
+		    !xfrm_addr_cmp(&x->id.daddr, daddr, family) &&
+		    !xfrm_addr_cmp(&x->props.saddr, saddr, family) &&
+		    (mark & x->mark.m) == x->mark.v) {
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up an installed state with the same identity as @x: by SPI when
+ * @use_spi is non-zero, otherwise by the source/destination address pair.
+ */
+static inline struct xfrm_state *
+__xfrm_state_locate(struct xfrm_state *x, int use_spi, int family)
+{
+	u32 mark = x->mark.v & x->mark.m;
+	struct net *net = xs_net(x);
+
+	if (!use_spi)
+		return __xfrm_state_lookup_byaddr(net, mark,
+						  &x->id.daddr,
+						  &x->props.saddr,
+						  x->id.proto, family);
+
+	return __xfrm_state_lookup(net, mark, &x->id.daddr,
+				   x->id.spi, x->id.proto, family);
+}
+
+/*
+ * Schedule the hash resize work for @net when an insertion collided, the
+ * table is still smaller than xfrm_state_hashmax, and there are more states
+ * than the hash mask.
+ */
+static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
+{
+	if (!have_hash_collision)
+		return;
+
+	if ((net->xfrm.state_hmask + 1) >= xfrm_state_hashmax)
+		return;
+
+	if (net->xfrm.state_num <= net->xfrm.state_hmask)
+		return;
+
+	schedule_work(&net->xfrm.state_hash_work);
+}
+
+/*
+ * Evaluate one candidate state @x on behalf of xfrm_state_find().
+ *
+ * A VALID state that passes the selector and security checks replaces *@best
+ * when *@best is unset, when *@best is dying and @x is not, or when both
+ * have the same dying flag and @x was added more recently.  An ACQ state
+ * sets *@acq_in_progress.  An ERROR or EXPIRED state whose selector and
+ * security context match the flow sets *@error to -ESRCH.
+ *
+ * No reference is taken on @x here.
+ *
+ * NOTE(review): @family is not referenced in this body; selector matching
+ * uses x->sel.family.
+ */
+static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
+			       const struct flowi *fl, unsigned short family,
+			       struct xfrm_state **best, int *acq_in_progress,
+			       int *error)
+{
+	/* Resolution logic:
+	 * 1. There is a valid state with matching selector. Done.
+	 * 2. Valid state with inappropriate selector. Skip.
+	 *
+	 * Entering area of "sysdeps".
+	 *
+	 * 3. If state is not valid, selector is temporary, it selects
+	 *    only session which triggered previous resolution. Key
+	 *    manager will do something to install a state with proper
+	 *    selector.
+	 */
+	if (x->km.state == XFRM_STATE_VALID) {
+		/* A zero sel.family skips the selector match entirely. */
+		if ((x->sel.family &&
+		     !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
+		    !security_xfrm_state_pol_flow_match(x, pol, fl))
+			return;
+
+		/* Prefer non-dying states, then the most recently added. */
+		if (!*best ||
+		    (*best)->km.dying > x->km.dying ||
+		    ((*best)->km.dying == x->km.dying &&
+		     (*best)->curlft.add_time < x->curlft.add_time))
+			*best = x;
+	} else if (x->km.state == XFRM_STATE_ACQ) {
+		*acq_in_progress = 1;
+	} else if (x->km.state == XFRM_STATE_ERROR ||
+		   x->km.state == XFRM_STATE_EXPIRED) {
+		if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
+		    security_xfrm_state_pol_flow_match(x, pol, fl))
+			*error = -ESRCH;
+	}
+}
+
+/*
+ * Find, or start acquiring, a state for template @tmpl of policy @pol
+ * between @saddr and @daddr.
+ *
+ * Under xfrm_state_lock the bydst bucket hashed from (@daddr, @saddr) is
+ * scanned first; if that yields no best candidate, the bucket hashed from
+ * (@daddr, all-zero source address) is scanned as well.  If there is still
+ * no candidate, no error was recorded and no acquire is pending, a temporary
+ * state is allocated and passed to km_query(); on success it is linked into
+ * the hashes in XFRM_STATE_ACQ with a timer of sysctl_acq_expires seconds.
+ *
+ * Returns a state with an extra reference held, or NULL with *@err set to
+ * -EAGAIN (an acquire is in progress) or to the recorded error.  *@err is
+ * not written when a state is returned.
+ */
+struct xfrm_state *
+xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+		const struct flowi *fl, struct xfrm_tmpl *tmpl,
+		struct xfrm_policy *pol, int *err,
+		unsigned short family)
+{
+	static xfrm_address_t saddr_wildcard = { };
+	struct net *net = xp_net(pol);
+	unsigned int h, h_wildcard;
+	struct hlist_node *entry;
+	struct xfrm_state *x, *x0, *to_put;
+	int acquire_in_progress = 0;
+	int error = 0;
+	struct xfrm_state *best = NULL;
+	u32 mark = pol->mark.v & pol->mark.m;
+	unsigned short encap_family = tmpl->encap_family;
+
+	/* State whose reference is dropped after the lock is released. */
+	to_put = NULL;
+
+	spin_lock_bh(&xfrm_state_lock);
+	/* Pass 1: bucket for the exact source/destination pair. */
+	h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
+	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
+		if (x->props.family == encap_family &&
+		    x->props.reqid == tmpl->reqid &&
+		    (mark & x->mark.m) == x->mark.v &&
+		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
+		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
+		    tmpl->mode == x->props.mode &&
+		    tmpl->id.proto == x->id.proto &&
+		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+			xfrm_state_look_at(pol, x, fl, encap_family,
+					   &best, &acquire_in_progress, &error);
+	}
+	if (best)
+		goto found;
+
+	/* Pass 2: bucket for the same destination with a zero source. */
+	h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
+	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
+		if (x->props.family == encap_family &&
+		    x->props.reqid == tmpl->reqid &&
+		    (mark & x->mark.m) == x->mark.v &&
+		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
+		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
+		    tmpl->mode == x->props.mode &&
+		    tmpl->id.proto == x->id.proto &&
+		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
+			xfrm_state_look_at(pol, x, fl, encap_family,
+					   &best, &acquire_in_progress, &error);
+	}
+
+found:
+	x = best;
+	if (!x && !error && !acquire_in_progress) {
+		/* A template with a fixed SPI must not clash with an
+		 * already installed state using that SPI. */
+		if (tmpl->id.spi &&
+		    (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
+					      tmpl->id.proto, encap_family)) != NULL) {
+			to_put = x0;
+			error = -EEXIST;
+			goto out;
+		}
+		x = xfrm_state_alloc(net);
+		if (x == NULL) {
+			error = -ENOMEM;
+			goto out;
+		}
+		/* Initialize temporary state matching only
+		 * to current session. */
+		/* NOTE(review): the return value of xfrm_init_tempstate()
+		 * is not checked here. */
+		xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
+		memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+
+		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
+		if (error) {
+			x->km.state = XFRM_STATE_DEAD;
+			to_put = x;
+			x = NULL;
+			goto out;
+		}
+
+		if (km_query(x, tmpl, pol) == 0) {
+			/* Link the larval state; h still holds the pass 1
+			 * bydst bucket at this point. */
+			x->km.state = XFRM_STATE_ACQ;
+			list_add(&x->km.all, &net->xfrm.state_all);
+			hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+			h = xfrm_src_hash(net, daddr, saddr, encap_family);
+			hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+			if (x->id.spi) {
+				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
+				hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+			}
+			x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
+			tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
+			net->xfrm.state_num++;
+			xfrm_hash_grow_check(net, x->bydst.next != NULL);
+		} else {
+			x->km.state = XFRM_STATE_DEAD;
+			to_put = x;
+			x = NULL;
+			error = -ESRCH;
+		}
+	}
+out:
+	if (x)
+		xfrm_state_hold(x);
+	else
+		*err = acquire_in_progress ? -EAGAIN : error;
+	spin_unlock_bh(&xfrm_state_lock);
+	/* Dropped outside the lock. */
+	if (to_put)
+		xfrm_state_put(to_put);
+	return x;
+}
+
+/*
+ * Find an already installed, VALID state matching the given addresses,
+ * family, mode, protocol, reqid and mark.  Unlike xfrm_state_find() this
+ * never creates a state.
+ *
+ * Returns the first match in the bydst bucket with a reference held, or
+ * NULL.
+ *
+ * xfrm_state_lock is taken with the _bh variants, as everywhere else in
+ * this file, so that a softirq on this CPU cannot contend for the lock
+ * while it is held here.
+ */
+struct xfrm_state *
+xfrm_stateonly_find(struct net *net, u32 mark,
+		    xfrm_address_t *daddr, xfrm_address_t *saddr,
+		    unsigned short family, u8 mode, u8 proto, u32 reqid)
+{
+	unsigned int h;
+	struct xfrm_state *rx = NULL, *x = NULL;
+	struct hlist_node *entry;
+
+	spin_lock_bh(&xfrm_state_lock);
+	h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
+	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
+		if (x->props.family == family &&
+		    x->props.reqid == reqid &&
+		    (mark & x->mark.m) == x->mark.v &&
+		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
+		    xfrm_state_addr_check(x, daddr, saddr, family) &&
+		    mode == x->props.mode &&
+		    proto == x->id.proto &&
+		    x->km.state == XFRM_STATE_VALID) {
+			rx = x;
+			break;
+		}
+	}
+
+	if (rx)
+		xfrm_state_hold(rx);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	return rx;
+}
+EXPORT_SYMBOL(xfrm_stateonly_find);
+
+/*
+ * Link @x into the per-net state list and the bydst and bysrc hashes, plus
+ * the byspi hash when it has a non-zero SPI.  Also starts the state's timers,
+ * wakes km_waitq, bumps state_num and checks whether the hash should grow.
+ *
+ * Callers in this file invoke it with xfrm_state_lock held.
+ */
+static void __xfrm_state_insert(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+	unsigned int h;
+
+	list_add(&x->km.all, &net->xfrm.state_all);
+
+	h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
+			  x->props.reqid, x->props.family);
+	hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+
+	h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
+	hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+
+	/* States without an SPI are not reachable through the byspi hash. */
+	if (x->id.spi) {
+		h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
+				  x->props.family);
+
+		hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+	}
+
+	/* Arm the state timer to fire in one second. */
+	tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
+	if (x->replay_maxage)
+		mod_timer(&x->rtimer, jiffies + x->replay_maxage);
+
+	wake_up(&net->xfrm.km_waitq);
+
+	net->xfrm.state_num++;
+
+	/* A non-NULL next pointer means the bydst bucket was not empty. */
+	xfrm_hash_grow_check(net, x->bydst.next != NULL);
+}
+
+/*
+ * Increment the genid of every state in @xnew's bydst bucket that has the
+ * same family, reqid, destination and source address and whose mark
+ * matches @xnew's.
+ *
+ * xfrm_state_lock is held.
+ */
+static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
+{
+	struct net *net = xs_net(xnew);
+	u32 mark = xnew->mark.v & xnew->mark.m;
+	unsigned short family = xnew->props.family;
+	u32 reqid = xnew->props.reqid;
+	struct hlist_node *entry;
+	struct xfrm_state *cur;
+	unsigned int bucket;
+
+	bucket = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr,
+			       reqid, family);
+	hlist_for_each_entry(cur, entry, net->xfrm.state_bydst+bucket, bydst) {
+		if (cur->props.family != family || cur->props.reqid != reqid)
+			continue;
+		if ((mark & cur->mark.m) != cur->mark.v)
+			continue;
+		if (xfrm_addr_cmp(&cur->id.daddr, &xnew->id.daddr, family) ||
+		    xfrm_addr_cmp(&cur->props.saddr, &xnew->props.saddr, family))
+			continue;
+
+		cur->genid++;
+	}
+}
+
+/*
+ * Insert @x under xfrm_state_lock, first bumping the genid of matching
+ * states.  No check for an existing state with the same identity is done
+ * here.
+ */
+void xfrm_state_insert(struct xfrm_state *x)
+{
+	spin_lock_bh(&xfrm_state_lock);
+	__xfrm_state_bump_genids(x);
+	__xfrm_state_insert(x);
+	spin_unlock_bh(&xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_state_insert);
+
+/*
+ * Find an XFRM_STATE_ACQ state with no SPI that matches the given reqid,
+ * mode, family, protocol, mark and address pair.  If none exists and
+ * @create is non-zero, allocate one, link it into the state list and the
+ * bydst/bysrc hashes, and start its timer with sysctl_acq_expires seconds.
+ *
+ * Returns the state with a reference held for the caller, or NULL when
+ * nothing was found and @create is zero or allocation failed.
+ *
+ * xfrm_state_lock is held.
+ */
+static struct xfrm_state *__find_acq_core(struct net *net, struct xfrm_mark *m,
+					  unsigned short family, u8 mode,
+					  u32 reqid, u8 proto,
+					  const xfrm_address_t *daddr,
+					  const xfrm_address_t *saddr, int create)
+{
+	unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family);
+	struct hlist_node *entry;
+	struct xfrm_state *x;
+	u32 mark = m->v & m->m;
+
+	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
+		if (x->props.reqid  != reqid ||
+		    x->props.mode   != mode ||
+		    x->props.family != family ||
+		    x->km.state     != XFRM_STATE_ACQ ||
+		    x->id.spi       != 0 ||
+		    x->id.proto	    != proto ||
+		    (mark & x->mark.m) != x->mark.v ||
+		    xfrm_addr_cmp(&x->id.daddr, daddr, family) ||
+		    xfrm_addr_cmp(&x->props.saddr, saddr, family))
+			continue;
+
+		xfrm_state_hold(x);
+		return x;
+	}
+
+	if (!create)
+		return NULL;
+
+	x = xfrm_state_alloc(net);
+	if (likely(x)) {
+		/* Host-exact selector: full prefix length for the family.
+		 * Families other than AF_INET/AF_INET6 get no addresses
+		 * filled in here. */
+		switch (family) {
+		case AF_INET:
+			x->sel.daddr.a4 = daddr->a4;
+			x->sel.saddr.a4 = saddr->a4;
+			x->sel.prefixlen_d = 32;
+			x->sel.prefixlen_s = 32;
+			x->props.saddr.a4 = saddr->a4;
+			x->id.daddr.a4 = daddr->a4;
+			break;
+
+		case AF_INET6:
+			ipv6_addr_copy((struct in6_addr *)x->sel.daddr.a6,
+				       (const struct in6_addr *)daddr);
+			ipv6_addr_copy((struct in6_addr *)x->sel.saddr.a6,
+				       (const struct in6_addr *)saddr);
+			x->sel.prefixlen_d = 128;
+			x->sel.prefixlen_s = 128;
+			ipv6_addr_copy((struct in6_addr *)x->props.saddr.a6,
+				       (const struct in6_addr *)saddr);
+			ipv6_addr_copy((struct in6_addr *)x->id.daddr.a6,
+				       (const struct in6_addr *)daddr);
+			break;
+		}
+
+		x->km.state = XFRM_STATE_ACQ;
+		x->id.proto = proto;
+		x->props.family = family;
+		x->props.mode = mode;
+		x->props.reqid = reqid;
+		x->mark.v = m->v;
+		x->mark.m = m->m;
+		x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
+		/* Extra reference handed to the caller. */
+		xfrm_state_hold(x);
+		tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
+		list_add(&x->km.all, &net->xfrm.state_all);
+		/* h still holds the bydst bucket computed at entry. */
+		hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+		h = xfrm_src_hash(net, daddr, saddr, family);
+		hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+
+		net->xfrm.state_num++;
+
+		xfrm_hash_grow_check(net, x->bydst.next != NULL);
+	}
+
+	return x;
+}
+
+static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
+
+/*
+ * Install the new state @x.
+ *
+ * Fails with -EEXIST if a state with the same identity is already present.
+ * Otherwise @x is inserted, and a matching acquire state (found by km.seq
+ * or by __find_acq_core()) is deleted after the lock is released.
+ *
+ * Returns 0 on success or -EEXIST.
+ */
+int xfrm_state_add(struct xfrm_state *x)
+{
+	struct net *net = xs_net(x);
+	struct xfrm_state *x1, *to_put;
+	int family;
+	int err;
+	u32 mark = x->mark.v & x->mark.m;
+	int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
+
+	family = x->props.family;
+
+	/* State whose reference is dropped after the lock is released. */
+	to_put = NULL;
+
+	spin_lock_bh(&xfrm_state_lock);
+
+	x1 = __xfrm_state_locate(x, use_spi, family);
+	if (x1) {
+		to_put = x1;
+		x1 = NULL;
+		err = -EEXIST;
+		goto out;
+	}
+
+	if (use_spi && x->km.seq) {
+		x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq);
+		/* Ignore an acquire state with the same sequence number but
+		 * a different protocol or destination. */
+		if (x1 && ((x1->id.proto != x->id.proto) ||
+		    xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family))) {
+			to_put = x1;
+			x1 = NULL;
+		}
+	}
+
+	if (use_spi && !x1)
+		x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
+				     x->props.reqid, x->id.proto,
+				     &x->id.daddr, &x->props.saddr, 0);
+
+	__xfrm_state_bump_genids(x);
+	__xfrm_state_insert(x);
+	err = 0;
+
+out:
+	spin_unlock_bh(&xfrm_state_lock);
+
+	/* x1, if set, is the acquire state that @x replaces. */
+	if (x1) {
+		xfrm_state_delete(x1);
+		xfrm_state_put(x1);
+	}
+
+	if (to_put)
+		xfrm_state_put(to_put);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_add);
+
+#ifdef CONFIG_XFRM_MIGRATE
+/*
+ * Allocate a new state and copy @orig's id, selector, lifetime config,
+ * selected props, algorithms, encap, care-of address, replay data and mark
+ * into it, then run xfrm_init_state() on the copy.
+ *
+ * Returns the new state, or NULL on failure.  *@errp (when non-NULL) is
+ * written only on failure; it is -ENOMEM unless xfrm_replay_clone() or
+ * xfrm_init_state() reported another error.
+ */
+static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp)
+{
+	struct net *net = xs_net(orig);
+	int err = -ENOMEM;
+	struct xfrm_state *x = xfrm_state_alloc(net);
+	if (!x)
+		goto out;
+
+	memcpy(&x->id, &orig->id, sizeof(x->id));
+	memcpy(&x->sel, &orig->sel, sizeof(x->sel));
+	memcpy(&x->lft, &orig->lft, sizeof(x->lft));
+	x->props.mode = orig->props.mode;
+	x->props.replay_window = orig->props.replay_window;
+	x->props.reqid = orig->props.reqid;
+	x->props.family = orig->props.family;
+	x->props.saddr = orig->props.saddr;
+
+	/* Each optional sub-object is duplicated only if @orig has one. */
+	if (orig->aalg) {
+		x->aalg = xfrm_algo_auth_clone(orig->aalg);
+		if (!x->aalg)
+			goto error;
+	}
+	x->props.aalgo = orig->props.aalgo;
+
+	if (orig->ealg) {
+		x->ealg = xfrm_algo_clone(orig->ealg);
+		if (!x->ealg)
+			goto error;
+	}
+	x->props.ealgo = orig->props.ealgo;
+
+	if (orig->calg) {
+		x->calg = xfrm_algo_clone(orig->calg);
+		if (!x->calg)
+			goto error;
+	}
+	x->props.calgo = orig->props.calgo;
+
+	if (orig->encap) {
+		x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL);
+		if (!x->encap)
+			goto error;
+	}
+
+	if (orig->coaddr) {
+		x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr),
+				    GFP_KERNEL);
+		if (!x->coaddr)
+			goto error;
+	}
+
+	if (orig->replay_esn) {
+		err = xfrm_replay_clone(x, orig);
+		if (err)
+			goto error;
+	}
+
+	memcpy(&x->mark, &orig->mark, sizeof(x->mark));
+
+	err = xfrm_init_state(x);
+	if (err)
+		goto error;
+
+	/* These are copied after xfrm_init_state() has run. */
+	x->props.flags = orig->props.flags;
+
+	x->curlft.add_time = orig->curlft.add_time;
+	x->km.state = orig->km.state;
+	x->km.seq = orig->km.seq;
+
+	return x;
+
+ error:
+	xfrm_state_put(x);
+out:
+	if (errp)
+		*errp = err;
+	return NULL;
+}
+
+/*
+ * Find the state in init_net that migrate entry @m refers to, matching on
+ * mode, protocol and the old address pair.  The bydst hash is used (and the
+ * reqid compared) when m->reqid is non-zero, otherwise the bysrc hash.
+ * Returns the state with a reference held, or NULL.
+ *
+ * xfrm_state_lock is held.
+ */
+struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m)
+{
+	struct hlist_node *entry;
+	struct xfrm_state *x;
+	unsigned int h;
+
+	if (!m->reqid) {
+		h = xfrm_src_hash(&init_net, &m->old_daddr, &m->old_saddr,
+				  m->old_family);
+		hlist_for_each_entry(x, entry, init_net.xfrm.state_bysrc+h, bysrc) {
+			if (x->props.mode == m->mode &&
+			    x->id.proto == m->proto &&
+			    !xfrm_addr_cmp(&x->id.daddr, &m->old_daddr,
+					   m->old_family) &&
+			    !xfrm_addr_cmp(&x->props.saddr, &m->old_saddr,
+					   m->old_family)) {
+				xfrm_state_hold(x);
+				return x;
+			}
+		}
+		return NULL;
+	}
+
+	h = xfrm_dst_hash(&init_net, &m->old_daddr, &m->old_saddr,
+			  m->reqid, m->old_family);
+	hlist_for_each_entry(x, entry, init_net.xfrm.state_bydst+h, bydst) {
+		/* m->reqid is known to be non-zero on this path. */
+		if (x->props.mode == m->mode &&
+		    x->id.proto == m->proto &&
+		    x->props.reqid == m->reqid &&
+		    !xfrm_addr_cmp(&x->id.daddr, &m->old_daddr,
+				   m->old_family) &&
+		    !xfrm_addr_cmp(&x->props.saddr, &m->old_saddr,
+				   m->old_family)) {
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(xfrm_migrate_state_find);
+
+/*
+ * Clone @x, rewrite the clone's addresses to the new endpoints in @m and
+ * install it.  Returns the installed clone, or NULL if cloning or adding
+ * failed.
+ */
+struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x,
+				       struct xfrm_migrate *m)
+{
+	struct xfrm_state *xc;
+	int err;
+
+	xc = xfrm_state_clone(x, &err);
+	if (xc == NULL)
+		return NULL;
+
+	memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr));
+	memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr));
+
+	/* add state */
+	if (xfrm_addr_cmp(&x->id.daddr, &m->new_daddr, m->new_family) == 0) {
+		/* a care is needed when the destination address of the
+		   state is to be updated as it is a part of triplet */
+		xfrm_state_insert(xc);
+		return xc;
+	}
+
+	err = xfrm_state_add(xc);
+	if (err < 0) {
+		xfrm_state_put(xc);
+		return NULL;
+	}
+
+	return xc;
+}
+EXPORT_SYMBOL(xfrm_state_migrate);
+#endif
+
+/*
+ * Update the installed state that has the same identity as @x.
+ *
+ * If the installed state is still an acquire (ACQ) state, @x is inserted
+ * and the acquire state is deleted.  Otherwise, if the installed state is
+ * VALID, the encap, care-of address, selector (non-SPI protocols only) and
+ * lifetime settings are copied from @x into it, and @x is marked DEAD.
+ *
+ * Returns 0 on success, -ESRCH if no such state exists, -EEXIST if
+ * xfrm_state_kern() is true for the installed state, or -EINVAL if the
+ * installed state is neither ACQ nor VALID.
+ */
+int xfrm_state_update(struct xfrm_state *x)
+{
+	struct xfrm_state *x1, *to_put;
+	int err;
+	int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
+
+	to_put = NULL;
+
+	spin_lock_bh(&xfrm_state_lock);
+	x1 = __xfrm_state_locate(x, use_spi, x->props.family);
+
+	err = -ESRCH;
+	if (!x1)
+		goto out;
+
+	if (xfrm_state_kern(x1)) {
+		to_put = x1;
+		err = -EEXIST;
+		goto out;
+	}
+
+	if (x1->km.state == XFRM_STATE_ACQ) {
+		__xfrm_state_insert(x);
+		/* x == NULL tells the code below that @x was inserted. */
+		x = NULL;
+	}
+	err = 0;
+
+out:
+	spin_unlock_bh(&xfrm_state_lock);
+
+	if (to_put)
+		xfrm_state_put(to_put);
+
+	if (err)
+		return err;
+
+	if (!x) {
+		/* @x replaced the acquire state; remove the latter. */
+		xfrm_state_delete(x1);
+		xfrm_state_put(x1);
+		return 0;
+	}
+
+	err = -EINVAL;
+	spin_lock_bh(&x1->lock);
+	if (likely(x1->km.state == XFRM_STATE_VALID)) {
+		/* encap/coaddr are copied only when both states have one. */
+		if (x->encap && x1->encap)
+			memcpy(x1->encap, x->encap, sizeof(*x1->encap));
+		if (x->coaddr && x1->coaddr) {
+			memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr));
+		}
+		if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel)))
+			memcpy(&x1->sel, &x->sel, sizeof(x1->sel));
+		memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
+		x1->km.dying = 0;
+
+		tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
+		if (x1->curlft.use_time)
+			xfrm_state_check_expire(x1);
+
+		err = 0;
+		x->km.state = XFRM_STATE_DEAD;
+		__xfrm_state_put(x);
+	}
+	spin_unlock_bh(&x1->lock);
+
+	xfrm_state_put(x1);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_update);
+
+/*
+ * Check the byte and packet counters of @x against its limits.
+ *
+ * Records the first-use time if it is not yet set.  When a hard limit is
+ * reached the state becomes EXPIRED and its timer is started with a zero
+ * delay.  When a soft limit is reached for the first time the state is
+ * marked dying and the key managers are notified.
+ *
+ * Returns 0 if the state may be used, -EINVAL if it is not VALID or has
+ * reached a hard limit.
+ */
+int xfrm_state_check_expire(struct xfrm_state *x)
+{
+	if (x->curlft.use_time == 0)
+		x->curlft.use_time = get_seconds();
+
+	if (x->km.state != XFRM_STATE_VALID)
+		return -EINVAL;
+
+	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
+	    x->curlft.packets >= x->lft.hard_packet_limit) {
+		x->km.state = XFRM_STATE_EXPIRED;
+		tasklet_hrtimer_start(&x->mtimer, ktime_set(0,0), HRTIMER_MODE_REL);
+		return -EINVAL;
+	}
+
+	/* The soft-limit notification is sent once only. */
+	if (x->km.dying)
+		return 0;
+
+	if (x->curlft.bytes >= x->lft.soft_byte_limit ||
+	    x->curlft.packets >= x->lft.soft_packet_limit) {
+		x->km.dying = 1;
+		km_state_expired(x, 0, 0);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_state_check_expire);
+
+/*
+ * Locked wrapper around __xfrm_state_lookup(): find a state by SPI,
+ * destination address, protocol, family and mark.  Returns the state with a
+ * reference held, or NULL.
+ */
+struct xfrm_state *
+xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi,
+		  u8 proto, unsigned short family)
+{
+	struct xfrm_state *found;
+
+	spin_lock_bh(&xfrm_state_lock);
+	found = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(xfrm_state_lookup);
+
+/*
+ * Locked wrapper around __xfrm_state_lookup_byaddr(): find a state by its
+ * address pair, protocol, family and mark.  Returns the state with a
+ * reference held, or NULL.
+ */
+struct xfrm_state *
+xfrm_state_lookup_byaddr(struct net *net, u32 mark,
+			 const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+			 u8 proto, unsigned short family)
+{
+	struct xfrm_state *found;
+
+	spin_lock_bh(&xfrm_state_lock);
+	found = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
+
+/*
+ * Locked wrapper around __find_acq_core(): find, and optionally create when
+ * @create is non-zero, an acquire state for the given parameters.
+ */
+struct xfrm_state *
+xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto,
+	      const xfrm_address_t *daddr, const xfrm_address_t *saddr,
+	      int create, unsigned short family)
+{
+	struct xfrm_state *acq;
+
+	spin_lock_bh(&xfrm_state_lock);
+	acq = __find_acq_core(net, mark, family, mode, reqid, proto,
+			      daddr, saddr, create);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	return acq;
+}
+EXPORT_SYMBOL(xfrm_find_acq);
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+/*
+ * Sort @n templates from @src into @dst using the tmpl_sort hook of the
+ * afinfo for @family, called under xfrm_state_lock.
+ *
+ * Returns -EAFNOSUPPORT if no afinfo is registered for @family, the hook's
+ * result if the hook exists, and 0 otherwise.
+ */
+int
+xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
+	       unsigned short family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	int err = 0;
+
+	afinfo = xfrm_state_get_afinfo(family);
+	if (afinfo == NULL)
+		return -EAFNOSUPPORT;
+
+	spin_lock_bh(&xfrm_state_lock);
+	if (afinfo->tmpl_sort != NULL)
+		err = afinfo->tmpl_sort(dst, src, n);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	xfrm_state_put_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_tmpl_sort);
+
+/*
+ * Sort @n states from @src into @dst using the state_sort hook of the
+ * afinfo for @family, called under xfrm_state_lock.
+ *
+ * Returns -EAFNOSUPPORT if no afinfo is registered for @family, the hook's
+ * result if the hook exists, and 0 otherwise.
+ */
+int
+xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+		unsigned short family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	int err = 0;
+
+	afinfo = xfrm_state_get_afinfo(family);
+	if (afinfo == NULL)
+		return -EAFNOSUPPORT;
+
+	spin_lock_bh(&xfrm_state_lock);
+	if (afinfo->state_sort != NULL)
+		err = afinfo->state_sort(dst, src, n);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	xfrm_state_put_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_sort);
+#endif
+
+/*
+ * Silly, but no dedicated resolution list is maintained: acquire states
+ * are found by sequence number with a scan over every bydst bucket.
+ */
+
+/*
+ * Walk every bydst bucket of @net looking for an XFRM_STATE_ACQ state whose
+ * km.seq equals @seq and whose mark matches.  Returns it with a reference
+ * held, or NULL.
+ */
+static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq)
+{
+	struct hlist_node *entry;
+	struct xfrm_state *x;
+	int i;
+
+	for (i = 0; i <= net->xfrm.state_hmask; i++) {
+		hlist_for_each_entry(x, entry, net->xfrm.state_bydst+i, bydst) {
+			if (x->km.seq != seq)
+				continue;
+			if ((mark & x->mark.m) != x->mark.v)
+				continue;
+			if (x->km.state != XFRM_STATE_ACQ)
+				continue;
+
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __xfrm_find_acq_byseq(): find an acquire state by
+ * its sequence number.  Returns it with a reference held, or NULL.
+ */
+struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq)
+{
+	struct xfrm_state *acq;
+
+	spin_lock_bh(&xfrm_state_lock);
+	acq = __xfrm_find_acq_byseq(net, mark, seq);
+	spin_unlock_bh(&xfrm_state_lock);
+
+	return acq;
+}
+EXPORT_SYMBOL(xfrm_find_acq_byseq);
+
+/*
+ * Return the next acquire sequence number from a function-local atomic
+ * counter.  Zero is never returned: the counter is incremented again if it
+ * yields 0.
+ */
+u32 xfrm_get_acqseq(void)
+{
+	static atomic_t acqseq;
+	u32 res;
+
+	for (;;) {
+		res = atomic_inc_return(&acqseq);
+		if (res)
+			return res;
+	}
+}
+EXPORT_SYMBOL(xfrm_get_acqseq);
+
+/*
+ * Assign an SPI in the host-order range [@low, @high] to @x and link @x
+ * into the byspi hash.
+ *
+ * Returns 0 if @x already had an SPI or one was assigned, -ENOENT if @x is
+ * DEAD or no unused SPI was found.
+ */
+int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
+{
+	struct net *net = xs_net(x);
+	unsigned int h;
+	struct xfrm_state *x0;
+	int err = -ENOENT;
+	__be32 minspi = htonl(low);
+	__be32 maxspi = htonl(high);
+	u32 mark = x->mark.v & x->mark.m;
+
+	spin_lock_bh(&x->lock);
+	if (x->km.state == XFRM_STATE_DEAD)
+		goto unlock;
+
+	/* Already has an SPI: nothing to do, report success. */
+	err = 0;
+	if (x->id.spi)
+		goto unlock;
+
+	err = -ENOENT;
+
+	if (minspi == maxspi) {
+		/* Single candidate: take it only if no state uses it. */
+		x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family);
+		if (x0) {
+			xfrm_state_put(x0);
+			goto unlock;
+		}
+		x->id.spi = minspi;
+	} else {
+		u32 spi = 0;
+		/* Try up to (high - low + 1) random values from the range.
+		 * NOTE(review): high - low + 1 is computed in u32 and is 0
+		 * for the full 32-bit range, so no attempt is made then. */
+		for (h=0; h<high-low+1; h++) {
+			spi = low + net_random()%(high-low+1);
+			x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family);
+			if (x0 == NULL) {
+				x->id.spi = htonl(spi);
+				break;
+			}
+			xfrm_state_put(x0);
+		}
+	}
+	if (x->id.spi) {
+		spin_lock_bh(&xfrm_state_lock);
+		h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
+		hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+		spin_unlock_bh(&xfrm_state_lock);
+
+		err = 0;
+	}
+
+unlock:
+	spin_unlock_bh(&x->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_alloc_spi);
+
+/*
+ * Call @func(state, seq, @data) for each state of @net whose protocol
+ * matches walk->proto, resuming from where a previous call on @walk stopped.
+ *
+ * If @func returns non-zero, @walk is linked into the state list just before
+ * the current entry so that the next call resumes there, and that value is
+ * returned.  Returns 0 when the walk completes (or had already completed),
+ * and -ENOENT when it completes without @func ever having succeeded.
+ */
+int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
+		    int (*func)(struct xfrm_state *, int, void*),
+		    void *data)
+{
+	struct xfrm_state *state;
+	struct xfrm_state_walk *x;
+	int err = 0;
+
+	/* Progress was made and the walker is unlinked: already finished. */
+	if (walk->seq != 0 && list_empty(&walk->all))
+		return 0;
+
+	spin_lock_bh(&xfrm_state_lock);
+	if (list_empty(&walk->all))
+		x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all);
+	else
+		x = list_entry(&walk->all, struct xfrm_state_walk, all);
+	list_for_each_entry_from(x, &net->xfrm.state_all, all) {
+		/* Skips dead states; walker entries are initialised as
+		 * XFRM_STATE_DEAD by xfrm_state_walk_init(). */
+		if (x->state == XFRM_STATE_DEAD)
+			continue;
+		state = container_of(x, struct xfrm_state, km);
+		if (!xfrm_id_proto_match(state->id.proto, walk->proto))
+			continue;
+		err = func(state, walk->seq, data);
+		if (err) {
+			/* Park the walker in front of the failed entry. */
+			list_move_tail(&walk->all, &x->all);
+			goto out;
+		}
+		walk->seq++;
+	}
+	if (walk->seq == 0) {
+		err = -ENOENT;
+		goto out;
+	}
+	list_del_init(&walk->all);
+out:
+	spin_unlock_bh(&xfrm_state_lock);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_walk);
+
+/*
+ * Prepare @walk for use with xfrm_state_walk(): unlinked, sequence 0,
+ * filtering on @proto, and flagged XFRM_STATE_DEAD.
+ */
+void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
+{
+	walk->seq = 0;
+	walk->proto = proto;
+	walk->state = XFRM_STATE_DEAD;
+	INIT_LIST_HEAD(&walk->all);
+}
+EXPORT_SYMBOL(xfrm_state_walk_init);
+
+/*
+ * Finish a walk: unlink @walk from the state list if it is still linked.
+ */
+void xfrm_state_walk_done(struct xfrm_state_walk *walk)
+{
+	if (!list_empty(&walk->all)) {
+		spin_lock_bh(&xfrm_state_lock);
+		list_del(&walk->all);
+		spin_unlock_bh(&xfrm_state_lock);
+	}
+}
+EXPORT_SYMBOL(xfrm_state_walk_done);
+
+/*
+ * Replay timer callback; @data is the struct xfrm_state pointer.
+ *
+ * For a VALID state, send a replay-timeout notification when async events
+ * are enabled for the state's netns; otherwise set XFRM_TIME_DEFER in
+ * xflags.
+ */
+static void xfrm_replay_timer_handler(unsigned long data)
+{
+	struct xfrm_state *x = (struct xfrm_state*)data;
+
+	spin_lock(&x->lock);
+
+	if (x->km.state != XFRM_STATE_VALID)
+		goto unlock;
+
+	if (!xfrm_aevent_is_on(xs_net(x)))
+		x->xflags |= XFRM_TIME_DEFER;
+	else
+		x->repl->notify(x, XFRM_REPLAY_TIMEOUT);
+
+unlock:
+	spin_unlock(&x->lock);
+}
+
+/*
+ * Registered key managers (struct xfrm_mgr, linked through ->list).
+ * xfrm_km_lock is taken for reading by the km_* dispatchers and for
+ * writing by xfrm_register_km()/xfrm_unregister_km().
+ */
+static LIST_HEAD(xfrm_km_list);
+static DEFINE_RWLOCK(xfrm_km_lock);
+
+/*
+ * Deliver policy event @c for @xp/@dir to every registered key manager that
+ * implements notify_policy.
+ */
+void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+	struct xfrm_mgr *km;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		if (!km->notify_policy)
+			continue;
+		km->notify_policy(xp, dir, c);
+	}
+	read_unlock(&xfrm_km_lock);
+}
+
+/*
+ * Deliver state event @c for @x to every registered key manager that
+ * implements notify.
+ */
+void km_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+	struct xfrm_mgr *km;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		if (!km->notify)
+			continue;
+		km->notify(x, c);
+	}
+	read_unlock(&xfrm_km_lock);
+}
+
+EXPORT_SYMBOL(km_policy_notify);
+EXPORT_SYMBOL(km_state_notify);
+
+/*
+ * Send an XFRM_MSG_EXPIRE event for @x to the key managers.  For a hard
+ * expiry, waiters on the netns km_waitq are woken as well.
+ */
+void km_state_expired(struct xfrm_state *x, int hard, u32 pid)
+{
+	struct km_event c;
+	struct net *net = xs_net(x);
+
+	c.event = XFRM_MSG_EXPIRE;
+	c.pid = pid;
+	c.data.hard = hard;
+	km_state_notify(x, &c);
+
+	if (!hard)
+		return;
+
+	wake_up(&net->xfrm.km_waitq);
+}
+
+EXPORT_SYMBOL(km_state_expired);
+/*
+ * Ask every registered key manager to acquire a state for @x.  All managers
+ * are called regardless of individual failures; one success is enough.
+ *
+ * Returns 0 if at least one manager's acquire returned 0, else -EINVAL.
+ */
+int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
+{
+	struct xfrm_mgr *km;
+	int err = -EINVAL;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		if (km->acquire(x, t, pol, XFRM_POLICY_OUT) == 0)
+			err = 0;
+	}
+	read_unlock(&xfrm_km_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(km_query);
+
+/*
+ * Report a new address/port mapping for @x to the key managers, stopping at
+ * the first manager whose new_mapping hook returns 0.
+ *
+ * Returns 0 on the first success; otherwise the result of the last hook
+ * called, or -EINVAL if no manager implements the hook.
+ */
+int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
+{
+	struct xfrm_mgr *km;
+	int err = -EINVAL;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		/* err is non-zero here, so a manager without the hook
+		 * can simply be skipped. */
+		if (!km->new_mapping)
+			continue;
+
+		err = km->new_mapping(x, ipaddr, sport);
+		if (err == 0)
+			break;
+	}
+	read_unlock(&xfrm_km_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(km_new_mapping);
+
+/*
+ * Send an XFRM_MSG_POLEXPIRE event for @pol/@dir to the key managers.  For
+ * a hard expiry, waiters on the netns km_waitq are woken as well.
+ */
+void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 pid)
+{
+	struct km_event c;
+	struct net *net = xp_net(pol);
+
+	c.event = XFRM_MSG_POLEXPIRE;
+	c.pid = pid;
+	c.data.hard = hard;
+	km_policy_notify(pol, dir, &c);
+
+	if (!hard)
+		return;
+
+	wake_up(&net->xfrm.km_waitq);
+}
+EXPORT_SYMBOL(km_policy_expired);
+
+#ifdef CONFIG_XFRM_MIGRATE
+/*
+ * Forward a migrate request to every key manager that implements the
+ * migrate hook.
+ *
+ * Returns 0 if at least one hook returned 0, else -EINVAL.
+ */
+int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+	       const struct xfrm_migrate *m, int num_migrate,
+	       const struct xfrm_kmaddress *k)
+{
+	struct xfrm_mgr *km;
+	int err = -EINVAL;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		if (!km->migrate)
+			continue;
+		if (km->migrate(sel, dir, type, m, num_migrate, k) == 0)
+			err = 0;
+	}
+	read_unlock(&xfrm_km_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(km_migrate);
+#endif
+
+/*
+ * Forward a report to every key manager that implements the report hook.
+ *
+ * Returns 0 if at least one hook returned 0, else -EINVAL.
+ */
+int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+	struct xfrm_mgr *km;
+	int err = -EINVAL;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		if (!km->report)
+			continue;
+		if (km->report(net, proto, sel, addr) == 0)
+			err = 0;
+	}
+	read_unlock(&xfrm_km_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(km_report);
+
+/*
+ * Install a per-socket policy from a user-supplied option buffer.
+ *
+ * The buffer is copied into kernel memory and offered to each registered
+ * key manager's compile_policy hook until one reports err >= 0; that
+ * non-negative value is then passed as the second argument to
+ * xfrm_sk_policy_insert().
+ *
+ * Returns 0 on success, -EMSGSIZE if @optlen is not in (0, PAGE_SIZE],
+ * -ENOMEM, -EFAULT, or the last error reported by compile_policy
+ * (-EINVAL if no manager is registered).
+ */
+int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen)
+{
+	int err;
+	u8 *data;
+	struct xfrm_mgr *km;
+	struct xfrm_policy *pol = NULL;
+
+	if (optlen <= 0 || optlen > PAGE_SIZE)
+		return -EMSGSIZE;
+
+	data = kmalloc(optlen, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	err = -EFAULT;
+	if (copy_from_user(data, optval, optlen))
+		goto out;
+
+	err = -EINVAL;
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		/* compile_policy reports its status through err. */
+		pol = km->compile_policy(sk, optname, data,
+					 optlen, &err);
+		if (err >= 0)
+			break;
+	}
+	read_unlock(&xfrm_km_lock);
+
+	if (err >= 0) {
+		xfrm_sk_policy_insert(sk, err, pol);
+		xfrm_pol_put(pol);
+		err = 0;
+	}
+
+out:
+	kfree(data);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_user_policy);
+
+/*
+ * Append key manager @km to xfrm_km_list under the write lock.
+ * Always returns 0.
+ */
+int xfrm_register_km(struct xfrm_mgr *km)
+{
+	write_lock_bh(&xfrm_km_lock);
+	list_add_tail(&km->list, &xfrm_km_list);
+	write_unlock_bh(&xfrm_km_lock);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_register_km);
+
+/*
+ * Remove key manager @km from xfrm_km_list under the write lock.
+ * Always returns 0.
+ */
+int xfrm_unregister_km(struct xfrm_mgr *km)
+{
+	write_lock_bh(&xfrm_km_lock);
+	list_del(&km->list);
+	write_unlock_bh(&xfrm_km_lock);
+	return 0;
+}
+EXPORT_SYMBOL(xfrm_unregister_km);
+
+/*
+ * Register @afinfo as the state afinfo for afinfo->family.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @afinfo, -EAFNOSUPPORT if the
+ * family is out of range, or -ENOBUFS if the family's slot is already
+ * occupied.
+ */
+int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
+{
+	int err = 0;
+
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+	if (unlikely(afinfo->family >= NPROTO))
+		return -EAFNOSUPPORT;
+
+	write_lock_bh(&xfrm_state_afinfo_lock);
+	if (likely(xfrm_state_afinfo[afinfo->family] == NULL))
+		xfrm_state_afinfo[afinfo->family] = afinfo;
+	else
+		err = -ENOBUFS;
+	write_unlock_bh(&xfrm_state_afinfo_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_register_afinfo);
+
+/*
+ * Unregister @afinfo from the slot of afinfo->family.
+ *
+ * Returns 0 if the slot was cleared or was already empty, -EINVAL for a
+ * NULL @afinfo or when a different afinfo occupies the slot, and
+ * -EAFNOSUPPORT if the family is out of range.
+ */
+int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
+{
+	int err = 0;
+
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+	if (unlikely(afinfo->family >= NPROTO))
+		return -EAFNOSUPPORT;
+
+	write_lock_bh(&xfrm_state_afinfo_lock);
+	if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
+		if (likely(xfrm_state_afinfo[afinfo->family] == afinfo))
+			xfrm_state_afinfo[afinfo->family] = NULL;
+		else
+			err = -EINVAL;
+	}
+	write_unlock_bh(&xfrm_state_afinfo_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
+
+/*
+ * Look up the afinfo registered for @family.
+ *
+ * On success the afinfo is returned with xfrm_state_afinfo_lock still
+ * read-locked; the caller must release it with xfrm_state_put_afinfo().
+ * Returns NULL, with the lock not held, if @family is out of range or has
+ * no afinfo registered.
+ */
+static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family)
+{
+	struct xfrm_state_afinfo *afinfo;
+	if (unlikely(family >= NPROTO))
+		return NULL;
+	read_lock(&xfrm_state_afinfo_lock);
+	afinfo = xfrm_state_afinfo[family];
+	/* Only the failure path unlocks here. */
+	if (unlikely(!afinfo))
+		read_unlock(&xfrm_state_afinfo_lock);
+	return afinfo;
+}
+
+/*
+ * Release the read lock left held by a successful xfrm_state_get_afinfo().
+ * @afinfo itself is not used.
+ */
+static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
+	__releases(xfrm_state_afinfo_lock)
+{
+	read_unlock(&xfrm_state_afinfo_lock);
+}
+
+/*
+ * Detach @x from its tunnel state, if it has one: drop one tunnel user and
+ * one reference, deleting the tunnel state first when tunnel_users is 2.
+ *
+ * Temporarily located here until net/xfrm/xfrm_tunnel.c is created.
+ */
+void xfrm_state_delete_tunnel(struct xfrm_state *x)
+{
+	struct xfrm_state *t = x->tunnel;
+
+	if (!t)
+		return;
+
+	if (atomic_read(&t->tunnel_users) == 2)
+		xfrm_state_delete(t);
+	atomic_dec(&t->tunnel_users);
+	xfrm_state_put(t);
+	x->tunnel = NULL;
+}
+EXPORT_SYMBOL(xfrm_state_delete_tunnel);
+
+/*
+ * Compute the MTU for @x given an outer @mtu.  A VALID state whose type has
+ * a get_mtu hook delegates to it; otherwise the result is @mtu minus
+ * props.header_len.  Evaluated under x->lock.
+ */
+int xfrm_state_mtu(struct xfrm_state *x, int mtu)
+{
+	int res;
+
+	spin_lock_bh(&x->lock);
+	if (x->km.state != XFRM_STATE_VALID ||
+	    !x->type || !x->type->get_mtu)
+		res = mtu - x->props.header_len;
+	else
+		res = x->type->get_mtu(x, mtu);
+	spin_unlock_bh(&x->lock);
+
+	return res;
+}
+
+/*
+ * Finish the setup of @x: run the family's init_flags hook, resolve the
+ * inner mode(s), the type and the outer mode, run the type's init_state
+ * hook, optionally initialise replay handling (@init_replay), and mark the
+ * state VALID.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT if the family has no afinfo,
+ * -EPROTONOSUPPORT if a mode or the type cannot be resolved, or the error
+ * from init_flags, init_state or xfrm_init_replay().
+ *
+ * Nothing acquired here is released on the error path of this function.
+ */
+int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
+{
+	struct xfrm_state_afinfo *afinfo;
+	struct xfrm_mode *inner_mode;
+	int family = x->props.family;
+	int err;
+
+	err = -EAFNOSUPPORT;
+	afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		goto error;
+
+	err = 0;
+	if (afinfo->init_flags)
+		err = afinfo->init_flags(x);
+
+	xfrm_state_put_afinfo(afinfo);
+
+	if (err)
+		goto error;
+
+	err = -EPROTONOSUPPORT;
+
+	if (x->sel.family != AF_UNSPEC) {
+		/* Selector names a family: one inner mode, which must be a
+		 * tunnel mode if it differs from the outer family. */
+		inner_mode = xfrm_get_mode(x->props.mode, x->sel.family);
+		if (inner_mode == NULL)
+			goto error;
+
+		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) &&
+		    family != x->sel.family) {
+			xfrm_put_mode(inner_mode);
+			goto error;
+		}
+
+		x->inner_mode = inner_mode;
+	} else {
+		/* AF_UNSPEC selector: the mode must be a tunnel mode; also
+		 * try to set up the mode for the other IP family. */
+		struct xfrm_mode *inner_mode_iaf;
+		int iafamily = AF_INET;
+
+		inner_mode = xfrm_get_mode(x->props.mode, x->props.family);
+		if (inner_mode == NULL)
+			goto error;
+
+		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) {
+			xfrm_put_mode(inner_mode);
+			goto error;
+		}
+		x->inner_mode = inner_mode;
+
+		if (x->props.family == AF_INET)
+			iafamily = AF_INET6;
+
+		/* The other-family mode is optional; failure is not an error. */
+		inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily);
+		if (inner_mode_iaf) {
+			if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL)
+				x->inner_mode_iaf = inner_mode_iaf;
+			else
+				xfrm_put_mode(inner_mode_iaf);
+		}
+	}
+
+	x->type = xfrm_get_type(x->id.proto, family);
+	if (x->type == NULL)
+		goto error;
+
+	err = x->type->init_state(x);
+	if (err)
+		goto error;
+
+	/* err is 0 here, so a missing outer mode returns 0 without
+	 * marking the state VALID.  NOTE(review): confirm this is intended. */
+	x->outer_mode = xfrm_get_mode(x->props.mode, family);
+	if (x->outer_mode == NULL)
+		goto error;
+
+	if (init_replay) {
+		err = xfrm_init_replay(x);
+		if (err)
+			goto error;
+	}
+
+	x->km.state = XFRM_STATE_VALID;
+
+error:
+	return err;
+}
+
+EXPORT_SYMBOL(__xfrm_init_state);
+
+/*
+ * Convenience wrapper: initialise state x with replay initialisation
+ * enabled.  Returns whatever __xfrm_init_state() returns.
+ */
+int xfrm_init_state(struct xfrm_state *x)
+{
+	return __xfrm_init_state(x, true);
+}
+
+EXPORT_SYMBOL(xfrm_init_state);
+
+/*
+ * Per-namespace setup of the state database: the global state list, the
+ * three hash tables (by destination, by source, by SPI), the resize and
+ * garbage-collection work items and the key-manager wait queue.
+ *
+ * Returns 0 on success or -ENOMEM when a hash table cannot be allocated;
+ * tables allocated before the failure are freed again.
+ */
+int __net_init xfrm_state_init(struct net *net)
+{
+	/* Each table starts out with 8 buckets. */
+	unsigned int table_bytes = sizeof(struct hlist_head) * 8;
+
+	INIT_LIST_HEAD(&net->xfrm.state_all);
+
+	net->xfrm.state_bydst = xfrm_hash_alloc(table_bytes);
+	if (!net->xfrm.state_bydst)
+		return -ENOMEM;
+
+	net->xfrm.state_bysrc = xfrm_hash_alloc(table_bytes);
+	if (!net->xfrm.state_bysrc)
+		goto free_bydst;
+
+	net->xfrm.state_byspi = xfrm_hash_alloc(table_bytes);
+	if (!net->xfrm.state_byspi)
+		goto free_bysrc;
+
+	/* Mask is (bucket count - 1). */
+	net->xfrm.state_hmask = ((table_bytes / sizeof(struct hlist_head)) - 1);
+	net->xfrm.state_num = 0;
+
+	INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
+	INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
+	INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
+	init_waitqueue_head(&net->xfrm.km_waitq);
+	return 0;
+
+free_bysrc:
+	xfrm_hash_free(net->xfrm.state_bysrc, table_bytes);
+free_bydst:
+	xfrm_hash_free(net->xfrm.state_bydst, table_bytes);
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace teardown of the state database.  Pending hash-resize work
+ * is flushed, states are flushed for IPSEC_PROTO_ANY (audited with
+ * loginuid/sessionid of -1 and secid 0), the GC work is flushed, and the
+ * three hash tables are freed.  A warning is emitted if the state list or
+ * the first bucket of any table is still populated.
+ */
+void xfrm_state_fini(struct net *net)
+{
+	struct xfrm_audit audit;
+	unsigned int table_bytes;
+
+	flush_work(&net->xfrm.state_hash_work);
+
+	audit.secid = 0;
+	audit.loginuid = -1;
+	audit.sessionid = -1;
+	xfrm_state_flush(net, IPSEC_PROTO_ANY, &audit);
+	flush_work(&net->xfrm.state_gc_work);
+
+	WARN_ON(!list_empty(&net->xfrm.state_all));
+
+	/* Current table size is derived from the hash mask. */
+	table_bytes = sizeof(struct hlist_head) * (net->xfrm.state_hmask + 1);
+
+	WARN_ON(!hlist_empty(net->xfrm.state_byspi));
+	xfrm_hash_free(net->xfrm.state_byspi, table_bytes);
+
+	WARN_ON(!hlist_empty(net->xfrm.state_bysrc));
+	xfrm_hash_free(net->xfrm.state_bysrc, table_bytes);
+
+	WARN_ON(!hlist_empty(net->xfrm.state_bydst));
+	xfrm_hash_free(net->xfrm.state_bydst, table_bytes);
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+/*
+ * Append the identifying fields of state x to an audit record: the security
+ * context (when one is attached), the source/destination addresses for
+ * AF_INET or AF_INET6 states, and the SPI in host byte order.
+ */
+static void xfrm_audit_helper_sainfo(struct xfrm_state *x,
+				     struct audit_buffer *audit_buf)
+{
+	const u32 host_spi = ntohl(x->id.spi);
+	struct xfrm_sec_ctx *sec = x->security;
+
+	if (sec)
+		audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
+				 sec->ctx_alg, sec->ctx_doi, sec->ctx_str);
+
+	/* Other families contribute no address fields. */
+	if (x->props.family == AF_INET)
+		audit_log_format(audit_buf, " src=%pI4 dst=%pI4",
+				 &x->props.saddr.a4, &x->id.daddr.a4);
+	else if (x->props.family == AF_INET6)
+		audit_log_format(audit_buf, " src=%pI6 dst=%pI6",
+				 x->props.saddr.a6, x->id.daddr.a6);
+
+	audit_log_format(audit_buf, " spi=%u(0x%x)", host_spi, host_spi);
+}
+
+/*
+ * Append the packet's addresses to an audit record, taken from the IPv4 or
+ * IPv6 header of skb depending on family.  IPv6 packets also get their flow
+ * label logged.  Any other family adds nothing.
+ */
+static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
+				      struct audit_buffer *audit_buf)
+{
+	if (family == AF_INET) {
+		const struct iphdr *ip4 = ip_hdr(skb);
+
+		audit_log_format(audit_buf, " src=%pI4 dst=%pI4",
+				 &ip4->saddr, &ip4->daddr);
+	} else if (family == AF_INET6) {
+		const struct ipv6hdr *ip6 = ipv6_hdr(skb);
+
+		/* Only the low nibble of the first flow-label byte is logged. */
+		audit_log_format(audit_buf,
+				 " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x",
+				 &ip6->saddr, &ip6->daddr,
+				 ip6->flow_lbl[0] & 0x0f,
+				 ip6->flow_lbl[1],
+				 ip6->flow_lbl[2]);
+	}
+}
+
+/*
+ * Emit a "SAD-add" audit record for state x carrying the caller's audit
+ * identity, the state description and the result code.  Does nothing when
+ * no audit buffer can be started.
+ */
+void xfrm_audit_state_add(struct xfrm_state *x, int result,
+			  uid_t auid, u32 sessionid, u32 secid)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SAD-add");
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, ab);
+	xfrm_audit_helper_sainfo(x, ab);
+	audit_log_format(ab, " res=%u", result);
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_add);
+
+/*
+ * Emit a "SAD-delete" audit record for state x carrying the caller's audit
+ * identity, the state description and the result code.  Does nothing when
+ * no audit buffer can be started.
+ */
+void xfrm_audit_state_delete(struct xfrm_state *x, int result,
+			     uid_t auid, u32 sessionid, u32 secid)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SAD-delete");
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_usrinfo(auid, sessionid, secid, ab);
+	xfrm_audit_helper_sainfo(x, ab);
+	audit_log_format(ab, " res=%u", result);
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_delete);
+
+/*
+ * Emit an "SA-replay-overflow" audit record containing the packet's
+ * addresses and the state's SPI.
+ */
+void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
+				      struct sk_buff *skb)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SA-replay-overflow");
+	u32 host_spi;
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_pktinfo(skb, x->props.family, ab);
+	/* don't record the sequence number because it's inherent in this kind
+	 * of audit message */
+	host_spi = ntohl(x->id.spi);
+	audit_log_format(ab, " spi=%u(0x%x)", host_spi, host_spi);
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow);
+
+/*
+ * Emit an "SA-replayed-pkt" audit record containing the packet's addresses,
+ * the state's SPI and the offending sequence number (both converted from
+ * network byte order).
+ */
+void xfrm_audit_state_replay(struct xfrm_state *x,
+			     struct sk_buff *skb, __be32 net_seq)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SA-replayed-pkt");
+	u32 host_spi;
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_pktinfo(skb, x->props.family, ab);
+	host_spi = ntohl(x->id.spi);
+	audit_log_format(ab, " spi=%u(0x%x) seqno=%u",
+			 host_spi, host_spi, ntohl(net_seq));
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_replay);
+
+/*
+ * Emit an "SA-notfound" audit record that carries only the packet's
+ * addresses (no SPI or sequence number).
+ */
+void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SA-notfound");
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_pktinfo(skb, family, ab);
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple);
+
+/*
+ * Emit an "SA-notfound" audit record with the packet's addresses plus the
+ * SPI and sequence number supplied by the caller in network byte order.
+ */
+void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
+			       __be32 net_spi, __be32 net_seq)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SA-notfound");
+	u32 host_spi;
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_pktinfo(skb, family, ab);
+	host_spi = ntohl(net_spi);
+	audit_log_format(ab, " spi=%u(0x%x) seqno=%u",
+			 host_spi, host_spi, ntohl(net_seq));
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound);
+
+/*
+ * Emit an "SA-icv-failure" audit record.  The packet's addresses are always
+ * logged; the SPI and sequence number are added only when xfrm_parse_spi()
+ * succeeds (returns 0) on the packet.
+ */
+void xfrm_audit_state_icvfail(struct xfrm_state *x,
+			      struct sk_buff *skb, u8 proto)
+{
+	struct audit_buffer *ab = xfrm_audit_start("SA-icv-failure");
+	__be32 wire_spi;
+	__be32 wire_seq;
+
+	if (!ab)
+		return;
+
+	xfrm_audit_helper_pktinfo(skb, x->props.family, ab);
+	if (xfrm_parse_spi(skb, proto, &wire_spi, &wire_seq) == 0) {
+		u32 host_spi = ntohl(wire_spi);
+
+		audit_log_format(ab, " spi=%u(0x%x) seqno=%u",
+				 host_spi, host_spi, ntohl(wire_seq));
+	}
+	audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail);
+#endif /* CONFIG_AUDITSYSCALL */
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
new file mode 100644
index 00000000..05640bc9
--- /dev/null
+++ b/net/xfrm/xfrm_sysctl.c
@@ -0,0 +1,82 @@
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/xfrm.h>
+
+/*
+ * Load the per-namespace xfrm tunables with their defaults.  Used by both
+ * the CONFIG_SYSCTL and the !CONFIG_SYSCTL variants of xfrm_sysctl_init().
+ */
+static void __net_init __xfrm_sysctl_init(struct net *net)
+{
+	net->xfrm.sysctl_aevent_etime = XFRM_AE_ETIME;
+	net->xfrm.sysctl_aevent_rseqth = XFRM_AE_SEQT_SIZE;
+	net->xfrm.sysctl_larval_drop = 1;
+	net->xfrm.sysctl_acq_expires = 30;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Template for the per-namespace xfrm sysctl entries.  The .data pointers
+ * are left unset here; xfrm_sysctl_init() duplicates this table and fills
+ * them in by index, so the entry order below must stay in step with the
+ * table[0]..table[3] assignments there.
+ */
+static struct ctl_table xfrm_table[] = {
+	{
+		.procname	= "xfrm_aevent_etime",
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "xfrm_aevent_rseqth",
+		.maxlen		= sizeof(u32),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "xfrm_larval_drop",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "xfrm_acq_expires",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	/* Empty terminating entry. */
+	{}
+};
+
+/*
+ * Set the namespace's xfrm tunables to their defaults and expose them via
+ * sysctl.  A private copy of xfrm_table is made so that each entry's data
+ * pointer can refer to this namespace's own variables.
+ *
+ * Returns 0 on success, -ENOMEM if the copy or the registration fails (the
+ * copy is freed in the latter case).
+ */
+int __net_init xfrm_sysctl_init(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	__xfrm_sysctl_init(net);
+
+	tbl = kmemdup(xfrm_table, sizeof(xfrm_table), GFP_KERNEL);
+	if (!tbl)
+		return -ENOMEM;
+
+	/* Indices follow the entry order of xfrm_table. */
+	tbl[0].data = &net->xfrm.sysctl_aevent_etime;
+	tbl[1].data = &net->xfrm.sysctl_aevent_rseqth;
+	tbl[2].data = &net->xfrm.sysctl_larval_drop;
+	tbl[3].data = &net->xfrm.sysctl_acq_expires;
+
+	net->xfrm.sysctl_hdr = register_net_sysctl_table(net, net_core_path, tbl);
+	if (!net->xfrm.sysctl_hdr) {
+		kfree(tbl);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Unregister the namespace's xfrm sysctl entries and free the table copy
+ * that was handed to the registration.
+ */
+void __net_exit xfrm_sysctl_fini(struct net *net)
+{
+	/* Fetch the table pointer before unregistering the header. */
+	struct ctl_table *tbl = net->xfrm.sysctl_hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(net->xfrm.sysctl_hdr);
+	kfree(tbl);
+}
+#else
+/*
+ * !CONFIG_SYSCTL variant: only load the default tunable values; nothing is
+ * registered.  Always returns 0.
+ */
+int __net_init xfrm_sysctl_init(struct net *net)
+{
+	__xfrm_sysctl_init(net);
+	return 0;
+}
+#endif
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
new file mode 100644
index 00000000..c658cb3b
--- /dev/null
+++ b/net/xfrm/xfrm_user.c
@@ -0,0 +1,2973 @@
+/* xfrm_user.c: User interface to configure xfrm engine.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ * Changes:
+ *	Mitsuru KANDA @USAGI
+ * 	Kazunori MIYAZAWA @USAGI
+ * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
+ * 		IPv6 support
+ *
+ */
+
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/netlink.h>
+#include <net/ah.h>
+#include <asm/uaccess.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <linux/in6.h>
+#endif
+
+/*
+ * Size in bytes of an AEAD algorithm blob: the fixed header plus the key,
+ * where alg_key_len is divided by 8 and rounded up.
+ */
+static inline int aead_len(struct xfrm_algo_aead *alg)
+{
+	unsigned int key_bytes = (alg->alg_key_len + 7) / 8;
+
+	return sizeof(*alg) + key_bytes;
+}
+
+/*
+ * Validate an optional xfrm_algo attribute.  A missing attribute is fine.
+ * Otherwise the attribute must be at least as long as the algorithm blob it
+ * describes and type must be one of AUTH, CRYPT or COMP.  On success the
+ * algorithm name is forcibly NUL-terminated in place.
+ *
+ * Returns 0 or -EINVAL.
+ */
+static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
+{
+	struct nlattr *attr = attrs[type];
+	struct xfrm_algo *alg;
+
+	if (!attr)
+		return 0;
+
+	alg = nla_data(attr);
+	if (nla_len(attr) < xfrm_alg_len(alg))
+		return -EINVAL;
+
+	if (type != XFRMA_ALG_AUTH &&
+	    type != XFRMA_ALG_CRYPT &&
+	    type != XFRMA_ALG_COMP)
+		return -EINVAL;
+
+	alg->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+	return 0;
+}
+
+/*
+ * Validate the optional XFRMA_ALG_AUTH_TRUNC attribute: it must be at least
+ * as long as the xfrm_algo_auth blob it describes.  The algorithm name is
+ * NUL-terminated in place.  Returns 0 (also when absent) or -EINVAL.
+ */
+static int verify_auth_trunc(struct nlattr **attrs)
+{
+	struct nlattr *attr = attrs[XFRMA_ALG_AUTH_TRUNC];
+
+	if (attr) {
+		struct xfrm_algo_auth *auth = nla_data(attr);
+
+		if (nla_len(attr) < xfrm_alg_auth_len(auth))
+			return -EINVAL;
+
+		auth->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+	}
+
+	return 0;
+}
+
+/*
+ * Validate the optional XFRMA_ALG_AEAD attribute: it must be at least as
+ * long as the xfrm_algo_aead blob it describes.  The algorithm name is
+ * NUL-terminated in place.  Returns 0 (also when absent) or -EINVAL.
+ */
+static int verify_aead(struct nlattr **attrs)
+{
+	struct nlattr *attr = attrs[XFRMA_ALG_AEAD];
+
+	if (attr) {
+		struct xfrm_algo_aead *aead = nla_data(attr);
+
+		if (nla_len(attr) < aead_len(aead))
+			return -EINVAL;
+
+		aead->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0';
+	}
+
+	return 0;
+}
+
+/*
+ * If attribute 'type' is present and addrp is non-NULL, point *addrp at the
+ * attribute payload.  Otherwise *addrp is left unchanged.
+ */
+static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type,
+			   xfrm_address_t **addrp)
+{
+	struct nlattr *attr = attrs[type];
+
+	if (!attr || !addrp)
+		return;
+
+	*addrp = nla_data(attr);
+}
+
+/*
+ * Validate the optional XFRMA_SEC_CTX attribute.
+ *
+ * The user-supplied uctx->len must equal header size + ctx_len, and it must
+ * also fit inside the attribute actually received.  Without the second
+ * check a sender could declare a context longer than the attribute and have
+ * later consumers of uctx read past the end of the attribute payload.
+ *
+ * Returns 0 when the attribute is absent or consistent, -EINVAL otherwise.
+ */
+static inline int verify_sec_ctx_len(struct nlattr **attrs)
+{
+	struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+	struct xfrm_user_sec_ctx *uctx;
+
+	if (!rt)
+		return 0;
+
+	uctx = nla_data(rt);
+	if (uctx->len > nla_len(rt) ||
+	    uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Check the ESN replay attribute against the SA description:
+ *  - the XFRM_STATE_ESN flag requires XFRMA_REPLAY_ESN_VAL to be present;
+ *  - when the attribute is present the protocol must be ESP and the legacy
+ *    replay_window field must be zero.
+ * Returns 0 or -EINVAL.
+ */
+static inline int verify_replay(struct xfrm_usersa_info *p,
+				struct nlattr **attrs)
+{
+	struct nlattr *esn_attr = attrs[XFRMA_REPLAY_ESN_VAL];
+
+	if (!esn_attr)
+		return (p->flags & XFRM_STATE_ESN) ? -EINVAL : 0;
+
+	if (p->id.proto != IPPROTO_ESP || p->replay_window != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Sanity-check a new/updated SA request before any state is allocated.
+ *
+ * Checks, in order: the address family, the combination of algorithm
+ * attributes allowed for the SA's protocol, the length/consistency of each
+ * attribute payload, and finally the mode.
+ *
+ * Returns 0 when everything is acceptable, -EAFNOSUPPORT for AF_INET6 on a
+ * kernel built without IPv6, the helper's error for a bad attribute payload,
+ * and -EINVAL for everything else.
+ */
+static int verify_newsa_info(struct xfrm_usersa_info *p,
+			     struct nlattr **attrs)
+{
+	int err;
+
+	err = -EINVAL;
+	switch (p->family) {
+	case AF_INET:
+		break;
+
+	case AF_INET6:
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		break;
+#else
+		err = -EAFNOSUPPORT;
+		goto out;
+#endif
+
+	default:
+		goto out;
+	}
+
+	err = -EINVAL;
+	switch (p->id.proto) {
+	case IPPROTO_AH:
+		/* AH: needs an auth algorithm and nothing else. */
+		if ((!attrs[XFRMA_ALG_AUTH]	&&
+		     !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
+		    attrs[XFRMA_ALG_AEAD]	||
+		    attrs[XFRMA_ALG_CRYPT]	||
+		    attrs[XFRMA_ALG_COMP]	||
+		    attrs[XFRMA_TFCPAD])
+			goto out;
+		break;
+
+	case IPPROTO_ESP:
+		/*
+		 * ESP: no compression; at least one of auth/crypt/aead;
+		 * aead excludes separate auth/crypt; TFC padding only in
+		 * tunnel mode.
+		 */
+		if (attrs[XFRMA_ALG_COMP])
+			goto out;
+		if (!attrs[XFRMA_ALG_AUTH] &&
+		    !attrs[XFRMA_ALG_AUTH_TRUNC] &&
+		    !attrs[XFRMA_ALG_CRYPT] &&
+		    !attrs[XFRMA_ALG_AEAD])
+			goto out;
+		if ((attrs[XFRMA_ALG_AUTH] ||
+		     attrs[XFRMA_ALG_AUTH_TRUNC] ||
+		     attrs[XFRMA_ALG_CRYPT]) &&
+		    attrs[XFRMA_ALG_AEAD])
+			goto out;
+		if (attrs[XFRMA_TFCPAD] &&
+		    p->mode != XFRM_MODE_TUNNEL)
+			goto out;
+		break;
+
+	case IPPROTO_COMP:
+		/* IPComp: needs a compression algorithm and nothing else. */
+		if (!attrs[XFRMA_ALG_COMP]	||
+		    attrs[XFRMA_ALG_AEAD]	||
+		    attrs[XFRMA_ALG_AUTH]	||
+		    attrs[XFRMA_ALG_AUTH_TRUNC]	||
+		    attrs[XFRMA_ALG_CRYPT]	||
+		    attrs[XFRMA_TFCPAD])
+			goto out;
+		break;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case IPPROTO_DSTOPTS:
+	case IPPROTO_ROUTING:
+		/*
+		 * These take no algorithms, encapsulation, security context
+		 * or TFC padding, and require a care-of address.
+		 */
+		if (attrs[XFRMA_ALG_COMP]	||
+		    attrs[XFRMA_ALG_AUTH]	||
+		    attrs[XFRMA_ALG_AUTH_TRUNC]	||
+		    attrs[XFRMA_ALG_AEAD]	||
+		    attrs[XFRMA_ALG_CRYPT]	||
+		    attrs[XFRMA_ENCAP]		||
+		    attrs[XFRMA_SEC_CTX]	||
+		    attrs[XFRMA_TFCPAD]		||
+		    !attrs[XFRMA_COADDR])
+			goto out;
+		break;
+#endif
+
+	default:
+		goto out;
+	}
+
+	/* Per-attribute payload checks; each helper accepts a missing attr. */
+	if ((err = verify_aead(attrs)))
+		goto out;
+	if ((err = verify_auth_trunc(attrs)))
+		goto out;
+	if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH)))
+		goto out;
+	if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT)))
+		goto out;
+	if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP)))
+		goto out;
+	if ((err = verify_sec_ctx_len(attrs)))
+		goto out;
+	if ((err = verify_replay(p, attrs)))
+		goto out;
+
+	err = -EINVAL;
+	switch (p->mode) {
+	case XFRM_MODE_TRANSPORT:
+	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_ROUTEOPTIMIZATION:
+	case XFRM_MODE_BEET:
+		break;
+
+	default:
+		goto out;
+	}
+
+	err = 0;
+
+out:
+	return err;
+}
+
+/*
+ * Attach an encryption or compression algorithm taken from a netlink
+ * attribute.  The algorithm is looked up by the user-supplied name through
+ * get_byname(); its SADB id is stored in *props, and a kernel copy of the
+ * user blob - with the name replaced by the descriptor's name - is stored
+ * in *algpp.
+ *
+ * Returns 0 (also when rta is NULL), -ENOSYS if the lookup fails, or
+ * -ENOMEM.  *props is written before the allocation is attempted.
+ */
+static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
+			   struct xfrm_algo_desc *(*get_byname)(const char *, int),
+			   struct nlattr *rta)
+{
+	struct xfrm_algo_desc *desc;
+	struct xfrm_algo *from_user, *copy;
+
+	if (!rta)
+		return 0;
+
+	from_user = nla_data(rta);
+
+	desc = get_byname(from_user->alg_name, 1);
+	if (!desc)
+		return -ENOSYS;
+	*props = desc->desc.sadb_alg_id;
+
+	copy = kmemdup(from_user, xfrm_alg_len(from_user), GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	strcpy(copy->alg_name, desc->name);
+	*algpp = copy;
+	return 0;
+}
+
+/*
+ * Attach an authentication algorithm supplied in the xfrm_algo layout
+ * (XFRMA_ALG_AUTH).  The blob is converted to an xfrm_algo_auth: name taken
+ * from the looked-up descriptor, key copied from the user blob, truncation
+ * length set to the descriptor's icv_truncbits.
+ *
+ * Returns 0 (also when rta is NULL), -ENOSYS if the algorithm is unknown,
+ * or -ENOMEM.
+ */
+static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
+		       struct nlattr *rta)
+{
+	struct xfrm_algo_desc *desc;
+	struct xfrm_algo_auth *auth;
+	struct xfrm_algo *from_user;
+	unsigned int key_bytes;
+
+	if (!rta)
+		return 0;
+
+	from_user = nla_data(rta);
+
+	desc = xfrm_aalg_get_byname(from_user->alg_name, 1);
+	if (!desc)
+		return -ENOSYS;
+	*props = desc->desc.sadb_alg_id;
+
+	/* alg_key_len divided by 8, rounded up. */
+	key_bytes = (from_user->alg_key_len + 7) / 8;
+
+	auth = kmalloc(sizeof(*auth) + key_bytes, GFP_KERNEL);
+	if (!auth)
+		return -ENOMEM;
+
+	strcpy(auth->alg_name, desc->name);
+	auth->alg_key_len = from_user->alg_key_len;
+	auth->alg_trunc_len = desc->uinfo.auth.icv_truncbits;
+	memcpy(auth->alg_key, from_user->alg_key, key_bytes);
+
+	*algpp = auth;
+	return 0;
+}
+
+/*
+ * Attach an authentication algorithm supplied in the xfrm_algo_auth layout
+ * (XFRMA_ALG_AUTH_TRUNC).  The requested truncation length is rejected if
+ * it exceeds MAX_AH_AUTH_LEN bytes or the algorithm's full ICV size; a
+ * truncation length of zero is replaced by the descriptor's default.
+ *
+ * Returns 0 (also when rta is NULL), -ENOSYS if the algorithm is unknown,
+ * -EINVAL for a bad truncation length, or -ENOMEM.
+ */
+static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
+			     struct nlattr *rta)
+{
+	struct xfrm_algo_desc *desc;
+	struct xfrm_algo_auth *from_user, *copy;
+
+	if (!rta)
+		return 0;
+
+	from_user = nla_data(rta);
+
+	desc = xfrm_aalg_get_byname(from_user->alg_name, 1);
+	if (!desc)
+		return -ENOSYS;
+
+	if ((from_user->alg_trunc_len / 8) > MAX_AH_AUTH_LEN ||
+	    from_user->alg_trunc_len > desc->uinfo.auth.icv_fullbits)
+		return -EINVAL;
+	*props = desc->desc.sadb_alg_id;
+
+	copy = kmemdup(from_user, xfrm_alg_auth_len(from_user), GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	strcpy(copy->alg_name, desc->name);
+	if (!copy->alg_trunc_len)
+		copy->alg_trunc_len = desc->uinfo.auth.icv_truncbits;
+
+	*algpp = copy;
+	return 0;
+}
+
+/*
+ * Attach an AEAD algorithm taken from XFRMA_ALG_AEAD.  The algorithm is
+ * looked up by name and ICV length; its SADB id goes to *props and a kernel
+ * copy of the user blob, renamed to the descriptor's name, goes to *algpp.
+ *
+ * Returns 0 (also when rta is NULL), -ENOSYS if the lookup fails, or
+ * -ENOMEM.
+ */
+static int attach_aead(struct xfrm_algo_aead **algpp, u8 *props,
+		       struct nlattr *rta)
+{
+	struct xfrm_algo_desc *desc;
+	struct xfrm_algo_aead *from_user, *copy;
+
+	if (!rta)
+		return 0;
+
+	from_user = nla_data(rta);
+
+	desc = xfrm_aead_get_byname(from_user->alg_name,
+				    from_user->alg_icv_len, 1);
+	if (!desc)
+		return -ENOSYS;
+	*props = desc->desc.sadb_alg_id;
+
+	copy = kmemdup(from_user, aead_len(from_user), GFP_KERNEL);
+	if (!copy)
+		return -ENOMEM;
+
+	strcpy(copy->alg_name, desc->name);
+	*algpp = copy;
+	return 0;
+}
+
+/*
+ * Check that a user-supplied ESN replay state can safely overwrite the
+ * state's existing one.
+ *
+ * The length the user blob claims for itself (derived from its bmp_len)
+ * must match the length of the kernel's replay_esn, and the attribute must
+ * really contain that many bytes - otherwise a later copy of "claimed
+ * length" bytes would read past the end of the attribute.
+ *
+ * Returns 0 when either pointer is NULL or the lengths are consistent,
+ * -EINVAL otherwise.
+ */
+static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn,
+					 struct nlattr *rp)
+{
+	struct xfrm_replay_state_esn *up;
+	int ulen;
+
+	if (!replay_esn || !rp)
+		return 0;
+
+	up = nla_data(rp);
+	ulen = xfrm_replay_state_esn_len(up);
+
+	if (nla_len(rp) < ulen ||
+	    xfrm_replay_state_esn_len(replay_esn) != ulen)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Allocate the two ESN replay-state buffers of a new SA from the
+ * XFRMA_REPLAY_ESN_VAL attribute.
+ *
+ * The buffer size (klen) is what the user blob claims via its bmp_len, but
+ * only bytes that are really present in the attribute are copied: the full
+ * klen when the attribute is long enough, otherwise just the fixed header.
+ * The remainder of each buffer stays zeroed.  This avoids reading beyond
+ * the attribute when bmp_len is larger than the data supplied.
+ * NOTE(review): assumes the attribute policy guarantees at least
+ * sizeof(*up) bytes - confirm against the xfrma policy table.
+ *
+ * Returns 0 (also when rta is NULL) or -ENOMEM.
+ */
+static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn,
+				       struct xfrm_replay_state_esn **preplay_esn,
+				       struct nlattr *rta)
+{
+	struct xfrm_replay_state_esn *p, *pp, *up;
+	int klen, ulen;
+
+	if (!rta)
+		return 0;
+
+	up = nla_data(rta);
+	klen = xfrm_replay_state_esn_len(up);
+	ulen = nla_len(rta) >= klen ? klen : (int)sizeof(*up);
+
+	p = kzalloc(klen, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	pp = kzalloc(klen, GFP_KERNEL);
+	if (!pp) {
+		kfree(p);
+		return -ENOMEM;
+	}
+
+	memcpy(p, up, ulen);
+	memcpy(pp, up, ulen);
+
+	*replay_esn = p;
+	*preplay_esn = pp;
+
+	return 0;
+}
+
+/*
+ * Bytes needed to export a security context to user space: the
+ * xfrm_user_sec_ctx header plus the context data, or 0 when there is no
+ * context.
+ */
+static inline int xfrm_user_sec_ctx_size(struct xfrm_sec_ctx *xfrm_ctx)
+{
+	if (!xfrm_ctx)
+		return 0;
+
+	return sizeof(struct xfrm_user_sec_ctx) + xfrm_ctx->ctx_len;
+}
+
+/*
+ * Populate a freshly allocated state from the user-supplied SA description:
+ * id, selector, lifetime limits, source address and the scalar properties.
+ * When the selector carries no family and XFRM_STATE_AF_UNSPEC is not set,
+ * the selector inherits the SA's family.
+ */
+static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
+{
+	/* Scalar properties. */
+	x->props.family = p->family;
+	x->props.mode = p->mode;
+	x->props.reqid = p->reqid;
+	x->props.replay_window = p->replay_window;
+	x->props.flags = p->flags;
+
+	/* Aggregate members are copied wholesale. */
+	memcpy(&x->id, &p->id, sizeof(x->id));
+	memcpy(&x->sel, &p->sel, sizeof(x->sel));
+	memcpy(&x->lft, &p->lft, sizeof(x->lft));
+	memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr));
+
+	if (x->sel.family)
+		return;
+	if (p->flags & XFRM_STATE_AF_UNSPEC)
+		return;
+	x->sel.family = p->family;
+}
+
+/*
+ * someday when pfkey also has support, we could have the code
+ * somehow made shareable and move it to xfrm_state.c - JHS
+ *
+*/
+/*
+ * Override the state's async-event parameters with whichever of the
+ * following attributes are present: ESN replay state, legacy replay state,
+ * current lifetime counters, event timer threshold, replay threshold.
+ *
+ * The ESN blob is only applied when the state actually owns ESN buffers.
+ * xfrm_replay_verify_len() accepts an attribute for a state without them
+ * (it returns 0 for a NULL replay_esn), so copying unconditionally would
+ * write through a NULL pointer.
+ * NOTE(review): the copy length is taken from the attribute's own bmp_len;
+ * callers are expected to have validated it - confirm.
+ */
+static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs)
+{
+	struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
+	struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
+	struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
+	struct nlattr *et = attrs[XFRMA_ETIMER_THRESH];
+	struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH];
+
+	if (re && x->replay_esn && x->preplay_esn) {
+		struct xfrm_replay_state_esn *replay_esn;
+		replay_esn = nla_data(re);
+		memcpy(x->replay_esn, replay_esn,
+		       xfrm_replay_state_esn_len(replay_esn));
+		memcpy(x->preplay_esn, replay_esn,
+		       xfrm_replay_state_esn_len(replay_esn));
+	}
+
+	if (rp) {
+		struct xfrm_replay_state *replay;
+		replay = nla_data(rp);
+		memcpy(&x->replay, replay, sizeof(*replay));
+		memcpy(&x->preplay, replay, sizeof(*replay));
+	}
+
+	if (lt) {
+		struct xfrm_lifetime_cur *ltime;
+		ltime = nla_data(lt);
+		x->curlft.bytes = ltime->bytes;
+		x->curlft.packets = ltime->packets;
+		x->curlft.add_time = ltime->add_time;
+		x->curlft.use_time = ltime->use_time;
+	}
+
+	if (et)
+		x->replay_maxage = nla_get_u32(et);
+
+	if (rt)
+		x->replay_maxdiff = nla_get_u32(rt);
+}
+
+/*
+ * Build a new xfrm_state from a validated SA request.
+ *
+ * Allocates the state, copies the fixed fields, attaches the algorithms and
+ * optional encap/care-of-address/security-context/ESN data, initialises the
+ * state and its replay machinery, then applies any async-event overrides
+ * from the attributes.
+ *
+ * Returns the new state, or NULL with *errp set to a non-zero error code.
+ * Every failure path sets err explicitly: the algorithm attach calls leave
+ * err at 0 on success, so a later allocation or security-hook failure that
+ * jumped to "error" without setting err would be reported to the caller as
+ * success.
+ */
+static struct xfrm_state *xfrm_state_construct(struct net *net,
+					       struct xfrm_usersa_info *p,
+					       struct nlattr **attrs,
+					       int *errp)
+{
+	struct xfrm_state *x = xfrm_state_alloc(net);
+	int err = -ENOMEM;
+
+	if (!x)
+		goto error_no_put;
+
+	copy_from_user_state(x, p);
+
+	if ((err = attach_aead(&x->aead, &x->props.ealgo,
+			       attrs[XFRMA_ALG_AEAD])))
+		goto error;
+	if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo,
+				     attrs[XFRMA_ALG_AUTH_TRUNC])))
+		goto error;
+	/* The legacy auth attribute is only used if AUTH_TRUNC set nothing. */
+	if (!x->props.aalgo) {
+		if ((err = attach_auth(&x->aalg, &x->props.aalgo,
+				       attrs[XFRMA_ALG_AUTH])))
+			goto error;
+	}
+	if ((err = attach_one_algo(&x->ealg, &x->props.ealgo,
+				   xfrm_ealg_get_byname,
+				   attrs[XFRMA_ALG_CRYPT])))
+		goto error;
+	if ((err = attach_one_algo(&x->calg, &x->props.calgo,
+				   xfrm_calg_get_byname,
+				   attrs[XFRMA_ALG_COMP])))
+		goto error;
+
+	if (attrs[XFRMA_ENCAP]) {
+		x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]),
+				   sizeof(*x->encap), GFP_KERNEL);
+		if (x->encap == NULL) {
+			err = -ENOMEM;
+			goto error;
+		}
+	}
+
+	if (attrs[XFRMA_TFCPAD])
+		x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]);
+
+	if (attrs[XFRMA_COADDR]) {
+		x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]),
+				    sizeof(*x->coaddr), GFP_KERNEL);
+		if (x->coaddr == NULL) {
+			err = -ENOMEM;
+			goto error;
+		}
+	}
+
+	xfrm_mark_get(attrs, &x->mark);
+
+	/* Replay setup is done below, once the ESN buffers exist. */
+	err = __xfrm_init_state(x, false);
+	if (err)
+		goto error;
+
+	if (attrs[XFRMA_SEC_CTX]) {
+		err = security_xfrm_state_alloc(x,
+						nla_data(attrs[XFRMA_SEC_CTX]));
+		if (err)
+			goto error;
+	}
+
+	if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
+					       attrs[XFRMA_REPLAY_ESN_VAL])))
+		goto error;
+
+	x->km.seq = p->seq;
+	x->replay_maxdiff = net->xfrm.sysctl_aevent_rseqth;
+	/* sysctl_xfrm_aevent_etime is in 100ms units */
+	x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M;
+
+	if ((err = xfrm_init_replay(x)))
+		goto error;
+
+	/* override default values from above */
+	xfrm_update_ae_params(x, attrs);
+
+	return x;
+
+error:
+	x->km.state = XFRM_STATE_DEAD;
+	xfrm_state_put(x);
+error_no_put:
+	*errp = err;
+	return NULL;
+}
+
+/*
+ * Netlink handler for adding (XFRM_MSG_NEWSA) or updating an SA.
+ *
+ * Validates the request, constructs the state, inserts or updates it, emits
+ * an audit record with the outcome and, on success, notifies key managers.
+ * Returns 0 on success or the error from the failing step.
+ */
+static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_usersa_info *p = nlmsg_data(nlh);
+	struct xfrm_state *x;
+	int err;
+	struct km_event c;
+	uid_t loginuid = audit_get_loginuid(current);
+	u32 sessionid = audit_get_sessionid(current);
+	u32 sid;
+
+	err = verify_newsa_info(p, attrs);
+	if (err)
+		return err;
+
+	x = xfrm_state_construct(net, p, attrs, &err);
+	if (!x)
+		return err;
+
+	/* Extra reference held across add/update, audit and notification. */
+	xfrm_state_hold(x);
+	if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
+		err = xfrm_state_add(x);
+	else
+		err = xfrm_state_update(x);
+
+	security_task_getsecid(current, &sid);
+	xfrm_audit_state_add(x, err ? 0 : 1, loginuid, sessionid, sid);
+
+	if (err < 0) {
+		/*
+		 * Mark the state dead and drop one reference here; the
+		 * xfrm_state_put() at "out" drops the other.
+		 */
+		x->km.state = XFRM_STATE_DEAD;
+		__xfrm_state_put(x);
+		goto out;
+	}
+
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	c.event = nlh->nlmsg_type;
+
+	km_state_notify(x, &c);
+out:
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * Find the state a user request refers to.
+ *
+ * When p->proto matches IPSEC_PROTO_ANY the state is looked up by
+ * (daddr, spi, proto, family); otherwise it is looked up by address pair
+ * and an XFRMA_SRCADDR attribute is mandatory.
+ *
+ * Returns the state, or NULL.  On NULL, *errp (if errp is non-NULL) is set
+ * to -EINVAL when the source address attribute is missing and to -ESRCH
+ * when no state was found.
+ */
+static struct xfrm_state *xfrm_user_state_lookup(struct net *net,
+						 struct xfrm_usersa_id *p,
+						 struct nlattr **attrs,
+						 int *errp)
+{
+	struct xfrm_state *x = NULL;
+	struct xfrm_mark m;
+	u32 mark = xfrm_mark_get(attrs, &m);
+	int err = -ESRCH;
+
+	if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) {
+		x = xfrm_state_lookup(net, mark, &p->daddr, p->spi, p->proto, p->family);
+	} else {
+		xfrm_address_t *saddr = NULL;
+
+		verify_one_addr(attrs, XFRMA_SRCADDR, &saddr);
+		if (saddr)
+			x = xfrm_state_lookup_byaddr(net, mark,
+						     &p->daddr, saddr,
+						     p->proto, p->family);
+		else
+			err = -EINVAL;
+	}
+
+	if (!x && errp)
+		*errp = err;
+	return x;
+}
+
+/*
+ * Netlink handler for deleting an SA.
+ *
+ * Looks the state up, asks the security hook for permission, refuses with
+ * -EPERM when xfrm_state_kern() reports the state as kernel-owned, deletes
+ * it and notifies key managers.  Once a state has been found, an audit
+ * record is emitted and the lookup reference dropped on every path.
+ */
+static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_state *x;
+	int err = -ESRCH;
+	struct km_event c;
+	struct xfrm_usersa_id *p = nlmsg_data(nlh);
+	uid_t loginuid = audit_get_loginuid(current);
+	u32 sessionid = audit_get_sessionid(current);
+	u32 sid;
+
+	/* Nothing found: return without auditing. */
+	x = xfrm_user_state_lookup(net, p, attrs, &err);
+	if (x == NULL)
+		return err;
+
+	if ((err = security_xfrm_state_delete(x)) != 0)
+		goto out;
+
+	if (xfrm_state_kern(x)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = xfrm_state_delete(x);
+
+	if (err < 0)
+		goto out;
+
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	c.event = nlh->nlmsg_type;
+	km_state_notify(x, &c);
+
+out:
+	/* Audit result is 1 on success, 0 on any failure. */
+	security_task_getsecid(current, &sid);
+	xfrm_audit_state_delete(x, err ? 0 : 1, loginuid, sessionid, sid);
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * Fill the user-visible SA description p from state x.
+ *
+ * p usually points into a freshly reserved netlink message whose payload is
+ * not zeroed, and the assignments below do not cover every byte of the
+ * structure (compiler padding, plus any field not listed here).  Clearing
+ * the whole structure first keeps stale kernel memory from being sent to
+ * user space.
+ */
+static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
+{
+	memset(p, 0, sizeof(*p));
+	memcpy(&p->id, &x->id, sizeof(p->id));
+	memcpy(&p->sel, &x->sel, sizeof(p->sel));
+	memcpy(&p->lft, &x->lft, sizeof(p->lft));
+	memcpy(&p->curlft, &x->curlft, sizeof(p->curlft));
+	memcpy(&p->stats, &x->stats, sizeof(p->stats));
+	memcpy(&p->saddr, &x->props.saddr, sizeof(p->saddr));
+	p->mode = x->props.mode;
+	p->replay_window = x->props.replay_window;
+	p->reqid = x->props.reqid;
+	p->family = x->props.family;
+	p->flags = x->props.flags;
+	p->seq = x->km.seq;
+}
+
+/* Context handed to dump_one_state() for each state being serialised. */
+struct xfrm_dump_info {
+	struct sk_buff *in_skb;		/* request; supplies the reply's pid */
+	struct sk_buff *out_skb;	/* message buffer being filled */
+	u32 nlmsg_seq;			/* sequence number for the reply header */
+	u16 nlmsg_flags;		/* flags for the reply header */
+};
+
+/*
+ * Append security context s to skb as an XFRMA_SEC_CTX attribute: an
+ * xfrm_user_sec_ctx header immediately followed by the context string.
+ * Returns 0, or -EMSGSIZE when the attribute cannot be reserved.
+ */
+static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb)
+{
+	struct xfrm_user_sec_ctx *hdr;
+	struct nlattr *nla;
+	int total = sizeof(*hdr) + s->ctx_len;
+
+	nla = nla_reserve(skb, XFRMA_SEC_CTX, total);
+	if (!nla)
+		return -EMSGSIZE;
+
+	hdr = nla_data(nla);
+	hdr->len = total;
+	hdr->exttype = XFRMA_SEC_CTX;
+	hdr->ctx_alg = s->ctx_alg;
+	hdr->ctx_doi = s->ctx_doi;
+	hdr->ctx_len = s->ctx_len;
+
+	/* Context bytes go right after the header. */
+	memcpy(hdr + 1, s->ctx_str, s->ctx_len);
+
+	return 0;
+}
+
+/*
+ * Export the state's authentication algorithm in the legacy xfrm_algo
+ * layout as an XFRMA_ALG_AUTH attribute (name, key length, key).
+ *
+ * The attribute payload comes from nla_reserve() and is not cleared, so the
+ * name is written with strncpy() over the full alg_name field: strncpy()
+ * pads the remainder of the field with NUL bytes, which prevents whatever
+ * was in the buffer after the terminator from being sent to user space.
+ *
+ * Returns 0, or -EMSGSIZE when the attribute cannot be reserved.
+ */
+static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
+{
+	struct xfrm_algo *algo;
+	struct nlattr *nla;
+
+	nla = nla_reserve(skb, XFRMA_ALG_AUTH,
+			  sizeof(*algo) + (auth->alg_key_len + 7) / 8);
+	if (!nla)
+		return -EMSGSIZE;
+
+	algo = nla_data(nla);
+	strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name));
+	memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8);
+	algo->alg_key_len = auth->alg_key_len;
+
+	return 0;
+}
+
+/* Don't change this without updating xfrm_sa_len! */
+/*
+ * Serialise state x into skb: the fixed xfrm_usersa_info at p followed by
+ * one attribute per optional component the state has.
+ *
+ * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
+ * NOTE(review): the NLA_PUT*() macros presumably jump to nla_put_failure
+ * when the skb is full - confirm against their definition.
+ */
+static int copy_to_user_state_extra(struct xfrm_state *x,
+				    struct xfrm_usersa_info *p,
+				    struct sk_buff *skb)
+{
+	copy_to_user_state(x, p);
+
+	if (x->coaddr)
+		NLA_PUT(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr);
+
+	if (x->lastused)
+		NLA_PUT_U64(skb, XFRMA_LASTUSED, x->lastused);
+
+	if (x->aead)
+		NLA_PUT(skb, XFRMA_ALG_AEAD, aead_len(x->aead), x->aead);
+	if (x->aalg) {
+		/* Auth is exported twice: legacy layout, then the trunc layout. */
+		if (copy_to_user_auth(x->aalg, skb))
+			goto nla_put_failure;
+
+		NLA_PUT(skb, XFRMA_ALG_AUTH_TRUNC,
+			xfrm_alg_auth_len(x->aalg), x->aalg);
+	}
+	if (x->ealg)
+		NLA_PUT(skb, XFRMA_ALG_CRYPT, xfrm_alg_len(x->ealg), x->ealg);
+	/* Only the fixed xfrm_algo header is exported for compression. */
+	if (x->calg)
+		NLA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
+
+	if (x->encap)
+		NLA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+
+	if (x->tfcpad)
+		NLA_PUT_U32(skb, XFRMA_TFCPAD, x->tfcpad);
+
+	if (xfrm_mark_put(skb, &x->mark))
+		goto nla_put_failure;
+
+	if (x->replay_esn)
+		NLA_PUT(skb, XFRMA_REPLAY_ESN_VAL,
+			xfrm_replay_state_esn_len(x->replay_esn), x->replay_esn);
+
+	if (x->security && copy_sec_ctx(x->security, skb) < 0)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+/*
+ * Append one XFRM_MSG_NEWSA message describing x to the output skb named in
+ * the xfrm_dump_info passed via ptr.  If the attributes do not fit, the
+ * partially built message is cancelled.
+ *
+ * Returns 0, -EMSGSIZE when the header does not fit, or the error from
+ * copy_to_user_state_extra().  The count argument is unused.
+ */
+static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
+{
+	struct xfrm_dump_info *info = ptr;
+	struct sk_buff *out = info->out_skb;
+	struct nlmsghdr *nlh;
+	int err;
+
+	nlh = nlmsg_put(out, NETLINK_CB(info->in_skb).pid, info->nlmsg_seq,
+			XFRM_MSG_NEWSA, sizeof(struct xfrm_usersa_info),
+			info->nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	err = copy_to_user_state_extra(x, nlmsg_data(nlh), out);
+	if (err) {
+		nlmsg_cancel(out, nlh);
+		return err;
+	}
+
+	nlmsg_end(out, nlh);
+	return 0;
+}
+
+/*
+ * Dump completion callback: finish the state walk whose context is stored
+ * in cb->args[1] onwards.  Always returns 0.
+ */
+static int xfrm_dump_sa_done(struct netlink_callback *cb)
+{
+	struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
+	xfrm_state_walk_done(walk);
+	return 0;
+}
+
+/*
+ * Netlink dump callback for SAs.  cb->args[0] records whether the walk has
+ * been initialised; the walk context itself lives in cb->args[1] onwards
+ * (the BUILD_BUG_ON checks that it fits).  Each call continues the walk,
+ * appending states to skb, and returns the number of bytes now in skb.
+ */
+static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_state_walk *walk = (struct xfrm_state_walk *) &cb->args[1];
+	struct xfrm_dump_info info = {
+		.in_skb		= cb->skb,
+		.out_skb	= skb,
+		.nlmsg_seq	= cb->nlh->nlmsg_seq,
+		.nlmsg_flags	= NLM_F_MULTI,
+	};
+
+	BUILD_BUG_ON(sizeof(struct xfrm_state_walk) >
+		     sizeof(cb->args) - sizeof(cb->args[0]));
+
+	/* First invocation for this dump: set up the walk. */
+	if (cb->args[0] == 0) {
+		cb->args[0] = 1;
+		xfrm_state_walk_init(walk, 0);
+	}
+
+	/* The walk result is deliberately ignored. */
+	(void) xfrm_state_walk(net, walk, dump_one_state, &info);
+
+	return skb->len;
+}
+
+/*
+ * Build a stand-alone netlink message describing state x, addressed as a
+ * reply to in_skb with sequence number seq.
+ *
+ * Returns the new skb, or an ERR_PTR() on failure - never NULL.  Callers
+ * test the result with IS_ERR() only, so returning NULL for a serialisation
+ * failure would let a NULL skb through to the send path.
+ */
+static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
+					  struct xfrm_state *x, u32 seq)
+{
+	struct xfrm_dump_info info;
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	info.in_skb = in_skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = seq;
+	info.nlmsg_flags = 0;
+
+	err = dump_one_state(x, 0, &info);
+	if (err) {
+		kfree_skb(skb);
+		return ERR_PTR(err);
+	}
+
+	return skb;
+}
+
+/*
+ * Payload size for an SPD-info reply: the aligned 4-byte flags word plus
+ * the two attributes emitted by build_spdinfo().
+ */
+static inline size_t xfrm_spdinfo_msgsize(void)
+{
+	return NLMSG_ALIGN(4)
+	       + nla_total_size(sizeof(struct xfrmu_spdinfo))
+	       + nla_total_size(sizeof(struct xfrmu_spdhinfo));
+}
+
+/*
+ * Fill skb with an XFRM_MSG_NEWSPDINFO message: a u32 flags word followed
+ * by XFRMA_SPD_INFO (policy counts) and XFRMA_SPD_HINFO (hash table
+ * counts), taken from xfrm_spd_getinfo().
+ *
+ * Returns the value of nlmsg_end() on success, -EMSGSIZE if the header or
+ * an attribute does not fit (the partial message is cancelled).
+ */
+static int build_spdinfo(struct sk_buff *skb, struct net *net,
+			 u32 pid, u32 seq, u32 flags)
+{
+	struct xfrmk_spdinfo si;
+	struct xfrmu_spdinfo spc;
+	struct xfrmu_spdhinfo sph;
+	struct nlmsghdr *nlh;
+	u32 *f;
+
+	nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0);
+	if (nlh == NULL) /* shouldn't really happen ... */
+		return -EMSGSIZE;
+
+	f = nlmsg_data(nlh);
+	*f = flags;
+	xfrm_spd_getinfo(net, &si);
+	/* Translate the kernel-side structure into the two user-side ones. */
+	spc.incnt = si.incnt;
+	spc.outcnt = si.outcnt;
+	spc.fwdcnt = si.fwdcnt;
+	spc.inscnt = si.inscnt;
+	spc.outscnt = si.outscnt;
+	spc.fwdscnt = si.fwdscnt;
+	sph.spdhcnt = si.spdhcnt;
+	sph.spdhmcnt = si.spdhmcnt;
+
+	NLA_PUT(skb, XFRMA_SPD_INFO, sizeof(spc), &spc);
+	NLA_PUT(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink handler for an SPD-info query: build the reply in a new skb sized
+ * by xfrm_spdinfo_msgsize() and unicast it to the requester.  A failure to
+ * build the reply is treated as a kernel bug.
+ *
+ * Returns -ENOMEM if the reply skb cannot be allocated, otherwise the
+ * result of nlmsg_unicast().
+ */
+static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	u32 *flags = nlmsg_data(nlh);
+	u32 dst_pid = NETLINK_CB(skb).pid;
+	struct sk_buff *reply;
+
+	reply = nlmsg_new(xfrm_spdinfo_msgsize(), GFP_ATOMIC);
+	if (!reply)
+		return -ENOMEM;
+
+	if (build_spdinfo(reply, net, dst_pid, nlh->nlmsg_seq, *flags) < 0)
+		BUG();
+
+	return nlmsg_unicast(net->xfrm.nlsk, reply, dst_pid);
+}
+
+/*
+ * Payload size for an SAD-info reply: the aligned 4-byte flags word plus
+ * the two attributes emitted by build_sadinfo().
+ */
+static inline size_t xfrm_sadinfo_msgsize(void)
+{
+	return NLMSG_ALIGN(4)
+	       + nla_total_size(sizeof(struct xfrmu_sadhinfo))
+	       + nla_total_size(4); /* XFRMA_SAD_CNT */
+}
+
+/*
+ * Fill skb with an XFRM_MSG_NEWSADINFO message: a u32 flags word followed
+ * by XFRMA_SAD_CNT (state count) and XFRMA_SAD_HINFO (hash table counts),
+ * taken from xfrm_sad_getinfo().
+ *
+ * Returns the value of nlmsg_end() on success, -EMSGSIZE if the header or
+ * an attribute does not fit (the partial message is cancelled).
+ */
+static int build_sadinfo(struct sk_buff *skb, struct net *net,
+			 u32 pid, u32 seq, u32 flags)
+{
+	struct xfrmk_sadinfo si;
+	struct xfrmu_sadhinfo sh;
+	struct nlmsghdr *nlh;
+	u32 *f;
+
+	nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0);
+	if (nlh == NULL) /* shouldn't really happen ... */
+		return -EMSGSIZE;
+
+	f = nlmsg_data(nlh);
+	*f = flags;
+	xfrm_sad_getinfo(net, &si);
+
+	sh.sadhmcnt = si.sadhmcnt;
+	sh.sadhcnt = si.sadhcnt;
+
+	NLA_PUT_U32(skb, XFRMA_SAD_CNT, si.sadcnt);
+	NLA_PUT(skb, XFRMA_SAD_HINFO, sizeof(sh), &sh);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink handler for an SAD-info query: build the reply in a new skb sized
+ * by xfrm_sadinfo_msgsize() and unicast it to the requester.  A failure to
+ * build the reply is treated as a kernel bug.
+ *
+ * Returns -ENOMEM if the reply skb cannot be allocated, otherwise the
+ * result of nlmsg_unicast().
+ */
+static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	u32 *flags = nlmsg_data(nlh);
+	u32 dst_pid = NETLINK_CB(skb).pid;
+	struct sk_buff *reply;
+
+	reply = nlmsg_new(xfrm_sadinfo_msgsize(), GFP_ATOMIC);
+	if (!reply)
+		return -ENOMEM;
+
+	if (build_sadinfo(reply, net, dst_pid, nlh->nlmsg_seq, *flags) < 0)
+		BUG();
+
+	return nlmsg_unicast(net->xfrm.nlsk, reply, dst_pid);
+}
+
+/*
+ * XFRM_MSG_GETSA handler: look up the state named by the request and
+ * unicast its netlink representation back to the requester.
+ *
+ * Returns the error reported by the lookup (initialised to -ESRCH), the
+ * error encoded in the skb pointer, or the nlmsg_unicast() result.
+ */
+static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_usersa_id *sa_id = nlmsg_data(nlh);
+	struct sk_buff *reply;
+	struct xfrm_state *x;
+	int err = -ESRCH;
+
+	x = xfrm_user_state_lookup(net, sa_id, attrs, &err);
+	if (!x)
+		return err;
+
+	reply = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
+	if (IS_ERR(reply))
+		err = PTR_ERR(reply);
+	else
+		err = nlmsg_unicast(net->xfrm.nlsk, reply,
+				    NETLINK_CB(skb).pid);
+
+	/* drop the reference obtained from the lookup */
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * Sanity-check an SPI allocation request: the protocol must be AH, ESP or
+ * IPCOMP, an IPCOMP upper bound must fit in 16 bits, and min must not
+ * exceed max.  Returns 0 if acceptable, -EINVAL otherwise.
+ */
+static int verify_userspi_info(struct xfrm_userspi_info *p)
+{
+	int proto = p->info.id.proto;
+
+	if (proto != IPPROTO_AH && proto != IPPROTO_ESP &&
+	    proto != IPPROTO_COMP)
+		return -EINVAL;
+
+	/* IPCOMP spi is 16-bits. */
+	if (proto == IPPROTO_COMP && p->max >= 0x10000)
+		return -EINVAL;
+
+	if (p->min > p->max)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * XFRM_MSG_ALLOCSPI handler.
+ *
+ * Validates the request, locates a state via xfrm_find_acq_byseq() (when a
+ * sequence number is supplied) or xfrm_find_acq(), asks xfrm_alloc_spi() to
+ * pick an SPI using p->min and p->max, and unicasts the resulting state to
+ * the requester.
+ *
+ * Returns the verify_userspi_info()/xfrm_alloc_spi()/xfrm_state_netlink()
+ * error, -ENOENT when no state was found, or the nlmsg_unicast() result.
+ */
+static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_state *x;
+	struct xfrm_userspi_info *p;
+	struct sk_buff *resp_skb;
+	xfrm_address_t *daddr;
+	int family;
+	int err;
+	u32 mark;
+	struct xfrm_mark m;
+
+	p = nlmsg_data(nlh);
+	err = verify_userspi_info(p);
+	if (err)
+		goto out_noput;
+
+	family = p->info.family;
+	daddr = &p->info.id.daddr;
+
+	x = NULL;
+
+	mark = xfrm_mark_get(attrs, &m);
+	if (p->info.seq) {
+		x = xfrm_find_acq_byseq(net, mark, p->info.seq);
+		/*
+		 * Discard the by-sequence hit when xfrm_addr_cmp() returns
+		 * non-zero for its destination address (presumably: the
+		 * addresses differ) and fall back to the lookup below.
+		 */
+		if (x && xfrm_addr_cmp(&x->id.daddr, daddr, family)) {
+			xfrm_state_put(x);
+			x = NULL;
+		}
+	}
+
+	if (!x)
+		x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
+				  p->info.id.proto, daddr,
+				  &p->info.saddr, 1,
+				  family);
+	err = -ENOENT;
+	if (x == NULL)
+		goto out_noput;
+
+	err = xfrm_alloc_spi(x, p->min, p->max);
+	if (err)
+		goto out;
+
+	resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
+	if (IS_ERR(resp_skb)) {
+		err = PTR_ERR(resp_skb);
+		goto out;
+	}
+
+	err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).pid);
+
+out:
+	/* every path that found a state drops its reference here */
+	xfrm_state_put(x);
+out_noput:
+	return err;
+}
+
+/*
+ * Accept only the IN, OUT and FWD policy directions.
+ * Returns 0 for a valid direction, -EINVAL otherwise.
+ */
+static int verify_policy_dir(u8 dir)
+{
+	if (dir == XFRM_POLICY_IN ||
+	    dir == XFRM_POLICY_OUT ||
+	    dir == XFRM_POLICY_FWD)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * Accept XFRM_POLICY_TYPE_MAIN always and XFRM_POLICY_TYPE_SUB only when
+ * CONFIG_XFRM_SUB_POLICY is enabled.
+ * Returns 0 for a valid type, -EINVAL otherwise.
+ */
+static int verify_policy_type(u8 type)
+{
+	if (type == XFRM_POLICY_TYPE_MAIN)
+		return 0;
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (type == XFRM_POLICY_TYPE_SUB)
+		return 0;
+#endif
+
+	return -EINVAL;
+}
+
+/*
+ * Validate the share mode, action, selector family and direction of a
+ * user-supplied policy description.
+ *
+ * Returns 0 when everything is acceptable, -EAFNOSUPPORT for AF_INET6 on a
+ * kernel built without IPv6, and -EINVAL for any other rejected value.
+ */
+static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
+{
+	if (p->share != XFRM_SHARE_ANY &&
+	    p->share != XFRM_SHARE_SESSION &&
+	    p->share != XFRM_SHARE_USER &&
+	    p->share != XFRM_SHARE_UNIQUE)
+		return -EINVAL;
+
+	if (p->action != XFRM_POLICY_ALLOW &&
+	    p->action != XFRM_POLICY_BLOCK)
+		return -EINVAL;
+
+	switch (p->sel.family) {
+	case AF_INET:
+		break;
+
+	case AF_INET6:
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		break;
+#else
+		return  -EAFNOSUPPORT;
+#endif
+
+	default:
+		return -EINVAL;
+	}
+
+	/* direction is checked last, after share/action/family */
+	return verify_policy_dir(p->dir);
+}
+
+/*
+ * Attach a security context to @pol from the XFRMA_SEC_CTX attribute, if
+ * one was supplied.  Returns 0 when the attribute is absent, otherwise the
+ * result of security_xfrm_policy_alloc().
+ */
+static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct nlattr **attrs)
+{
+	struct nlattr *ctx_attr = attrs[XFRMA_SEC_CTX];
+	struct xfrm_user_sec_ctx *uctx;
+
+	if (ctx_attr == NULL)
+		return 0;
+
+	uctx = nla_data(ctx_attr);
+
+	return security_xfrm_policy_alloc(&pol->security, uctx);
+}
+
+/*
+ * Copy @nr user templates from @ut into xp->xfrm_vec[] and record the
+ * count in xp->xfrm_nr.  No bounds checking is done here.
+ */
+static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
+			   int nr)
+{
+	int idx;
+
+	xp->xfrm_nr = nr;
+
+	for (idx = 0; idx < nr; idx++) {
+		struct xfrm_user_tmpl *src = &ut[idx];
+		struct xfrm_tmpl *dst = &xp->xfrm_vec[idx];
+
+		memcpy(&dst->id, &src->id, sizeof(struct xfrm_id));
+		memcpy(&dst->saddr, &src->saddr, sizeof(xfrm_address_t));
+
+		dst->reqid = src->reqid;
+		dst->mode = src->mode;
+		dst->share = src->share;
+		dst->optional = src->optional;
+
+		dst->aalgos = src->aalgos;
+		dst->ealgos = src->ealgos;
+		dst->calgos = src->calgos;
+		/* If all masks are ~0, then we allow all algorithms. */
+		dst->allalgs = !~(dst->aalgos & dst->ealgos & dst->calgos);
+
+		dst->encap_family = src->family;
+	}
+}
+
+/*
+ * Validate @nr user templates.  A template whose family is zero inherits
+ * @family (written back into @ut); afterwards each family must be AF_INET
+ * or, when IPv6 is configured, AF_INET6.
+ *
+ * Returns 0 on success, -EINVAL when @nr exceeds XFRM_MAX_DEPTH or a
+ * template carries an unsupported family.
+ */
+static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
+{
+	int idx;
+
+	if (nr > XFRM_MAX_DEPTH)
+		return -EINVAL;
+
+	for (idx = 0; idx < nr; idx++) {
+		struct xfrm_user_tmpl *t = &ut[idx];
+
+		/* We never validated the ut->family value, so many
+		 * applications simply leave it at zero.  The check was
+		 * never made and ut->family was ignored because all
+		 * templates could be assumed to have the same family as
+		 * the policy itself.  Now that we will have ipv4-in-ipv6
+		 * and ipv6-in-ipv4 tunnels, this is no longer true.
+		 */
+		if (t->family == 0)
+			t->family = family;
+
+		if (t->family == AF_INET)
+			continue;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (t->family == AF_INET6)
+			continue;
+#endif
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Load the templates carried in XFRMA_TMPL into @pol.  Without the
+ * attribute the policy gets zero templates.  The template count is the
+ * attribute length divided by sizeof(struct xfrm_user_tmpl).
+ *
+ * Returns 0 on success or the validate_tmpl() error.
+ */
+static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs)
+{
+	struct nlattr *rt = attrs[XFRMA_TMPL];
+	struct xfrm_user_tmpl *utmpl;
+	int count;
+	int err;
+
+	if (!rt) {
+		pol->xfrm_nr = 0;
+		return 0;
+	}
+
+	utmpl = nla_data(rt);
+	count = nla_len(rt) / sizeof(*utmpl);
+
+	err = validate_tmpl(count, utmpl, pol->family);
+	if (err)
+		return err;
+
+	copy_templates(pol, utmpl, count);
+	return 0;
+}
+
+/*
+ * Determine the policy type for a request: the value from the
+ * XFRMA_POLICY_TYPE attribute if present, XFRM_POLICY_TYPE_MAIN otherwise.
+ * *tp is written only when the type passes verify_policy_type().
+ *
+ * Returns 0 on success or the verify_policy_type() error.
+ */
+static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs)
+{
+	struct nlattr *rt = attrs[XFRMA_POLICY_TYPE];
+	u8 type;
+	int err;
+
+	if (rt) {
+		struct xfrm_userpolicy_type *upt = nla_data(rt);
+
+		type = upt->type;
+	} else {
+		type = XFRM_POLICY_TYPE_MAIN;
+	}
+
+	err = verify_policy_type(type);
+	if (!err)
+		*tp = type;
+
+	return err;
+}
+
+/*
+ * Initialise the kernel policy @xp from the user-supplied description @p.
+ * The policy family is taken from the selector; p->share is not copied.
+ */
+static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p)
+{
+	memcpy(&xp->selector, &p->sel, sizeof(xp->selector));
+	memcpy(&xp->lft, &p->lft, sizeof(xp->lft));
+
+	xp->family = p->sel.family;
+	xp->index = p->index;
+	xp->priority = p->priority;
+	xp->flags = p->flags;
+	xp->action = p->action;
+	/* XXX xp->share = p->share; */
+}
+
+/*
+ * Fill the user-visible policy description @p from the kernel policy @xp
+ * for direction @dir.  The share mode is always reported as
+ * XFRM_SHARE_ANY.
+ *
+ * @p usually points into a freshly reserved netlink payload that has not
+ * been cleared, so the whole structure is zeroed first; otherwise padding
+ * and any member not assigned below would be sent to userspace with stale
+ * kernel memory in it.
+ */
+static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir)
+{
+	memset(p, 0, sizeof(*p));
+	memcpy(&p->sel, &xp->selector, sizeof(p->sel));
+	memcpy(&p->lft, &xp->lft, sizeof(p->lft));
+	memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft));
+	p->priority = xp->priority;
+	p->index = xp->index;
+	/* report the policy family, overriding the copied selector family */
+	p->sel.family = xp->family;
+	p->dir = dir;
+	p->action = xp->action;
+	p->flags = xp->flags;
+	p->share = XFRM_SHARE_ANY; /* XXX xp->share */
+}
+
+/*
+ * Allocate a policy and populate it from the user description @p and the
+ * netlink attributes (policy type, templates, security context, mark).
+ *
+ * Returns the new policy, or NULL with *errp set to -ENOMEM or to the
+ * error from the failing copy step.  On a copy failure the partially
+ * built policy is marked dead and destroyed.
+ */
+static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp)
+{
+	struct xfrm_policy *xp;
+	int err;
+
+	xp = xfrm_policy_alloc(net, GFP_KERNEL);
+	if (xp == NULL) {
+		*errp = -ENOMEM;
+		return NULL;
+	}
+
+	copy_from_user_policy(xp, p);
+
+	err = copy_from_user_policy_type(&xp->type, attrs);
+	if (err)
+		goto error;
+
+	err = copy_from_user_tmpl(xp, attrs);
+	if (err)
+		goto error;
+
+	err = copy_from_user_sec_ctx(xp, attrs);
+	if (err)
+		goto error;
+
+	xfrm_mark_get(attrs, &xp->mark);
+
+	return xp;
+
+ error:
+	*errp = err;
+	xp->walk.dead = 1;
+	xfrm_policy_destroy(xp);
+	return NULL;
+}
+
+/*
+ * Handler for XFRM_MSG_NEWPOLICY and XFRM_MSG_UPDPOLICY.
+ *
+ * Validates the request, constructs a policy from it, inserts it with
+ * xfrm_policy_insert() (exclusive insert for NEWPOLICY only), writes an
+ * audit record for the attempt and, on success, notifies key managers.
+ *
+ * Returns 0 on success or the error from validation, construction or
+ * insertion.
+ */
+static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_userpolicy_info *p = nlmsg_data(nlh);
+	struct xfrm_policy *xp;
+	struct km_event c;
+	int err;
+	int excl;
+	uid_t loginuid = audit_get_loginuid(current);
+	u32 sessionid = audit_get_sessionid(current);
+	u32 sid;
+
+	err = verify_newpolicy_info(p);
+	if (err)
+		return err;
+	/* check XFRMA_SEC_CTX before xfrm_policy_construct() consumes it */
+	err = verify_sec_ctx_len(attrs);
+	if (err)
+		return err;
+
+	xp = xfrm_policy_construct(net, p, attrs, &err);
+	if (!xp)
+		return err;
+
+	/* shouldn't excl be based on nlh flags??
+	 * Aha! this is anti-netlink really i.e  more pfkey derived
+	 * in netlink excl is a flag and you wouldnt need
+	 * a type XFRM_MSG_UPDPOLICY - JHS */
+	excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
+	err = xfrm_policy_insert(p->dir, xp, excl);
+	/* the audit record is written for failed insertions as well */
+	security_task_getsecid(current, &sid);
+	xfrm_audit_policy_add(xp, err ? 0 : 1, loginuid, sessionid, sid);
+
+	if (err) {
+		/* insertion failed: release the security context and the policy */
+		security_xfrm_policy_free(xp->security);
+		kfree(xp);
+		return err;
+	}
+
+	c.event = nlh->nlmsg_type;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_policy_notify(xp, p->dir, &c);
+
+	xfrm_pol_put(xp);
+
+	return 0;
+}
+
+/*
+ * Emit the templates of @xp as a single XFRMA_TMPL attribute on @skb.
+ *
+ * Returns 0 when the policy has no templates, otherwise the nla_put()
+ * result.
+ */
+static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH];
+	int i;
+
+	if (xp->xfrm_nr == 0)
+		return 0;
+
+	for (i = 0; i < xp->xfrm_nr; i++) {
+		struct xfrm_user_tmpl *up = &vec[i];
+		struct xfrm_tmpl *kp = &xp->xfrm_vec[i];
+
+		/*
+		 * vec[] is uninitialised stack memory that is copied to
+		 * userspace verbatim; clear each entry so padding and
+		 * members not assigned below cannot leak stack contents.
+		 */
+		memset(up, 0, sizeof(*up));
+		memcpy(&up->id, &kp->id, sizeof(up->id));
+		up->family = kp->encap_family;
+		memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr));
+		up->reqid = kp->reqid;
+		up->mode = kp->mode;
+		up->share = kp->share;
+		up->optional = kp->optional;
+		up->aalgos = kp->aalgos;
+		up->ealgos = kp->ealgos;
+		up->calgos = kp->calgos;
+	}
+
+	return nla_put(skb, XFRMA_TMPL,
+		       sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr, vec);
+}
+
+/*
+ * Append the security context of state @x to @skb, if it has one.
+ * Returns 0 when there is none, otherwise the copy_sec_ctx() result.
+ */
+static inline int copy_to_user_state_sec_ctx(struct xfrm_state *x, struct sk_buff *skb)
+{
+	if (!x->security)
+		return 0;
+
+	return copy_sec_ctx(x->security, skb);
+}
+
+/*
+ * Append the security context of policy @xp to @skb, if it has one.
+ * Returns 0 when there is none, otherwise the copy_sec_ctx() result.
+ */
+static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	if (!xp->security)
+		return 0;
+
+	return copy_sec_ctx(xp->security, skb);
+}
+/*
+ * Space taken by the XFRMA_POLICY_TYPE attribute: zero unless
+ * CONFIG_XFRM_SUB_POLICY is enabled.
+ */
+static inline size_t userpolicy_type_attrsize(void)
+{
+	size_t len = 0;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	len = nla_total_size(sizeof(struct xfrm_userpolicy_type));
+#endif
+	return len;
+}
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+/*
+ * Append an XFRMA_POLICY_TYPE attribute carrying @type to @skb.
+ * Returns the nla_put() result.
+ *
+ * The structure is cleared with memset() rather than a designated
+ * initialiser: an initialiser zeroes the unnamed members but is not
+ * guaranteed to clear padding, and the whole object is copied to
+ * userspace.
+ */
+static int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
+{
+	struct xfrm_userpolicy_type upt;
+
+	memset(&upt, 0, sizeof(upt));
+	upt.type = type;
+
+	return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt);
+}
+
+#else
+/* Without CONFIG_XFRM_SUB_POLICY no policy type attribute is emitted. */
+static inline int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Append one XFRM_MSG_NEWPOLICY message describing @xp to the output skb
+ * held in the struct xfrm_dump_info passed via @ptr.  Passed as the
+ * per-policy callback to xfrm_policy_walk() and also called directly by
+ * xfrm_policy_netlink().  @count is not used.
+ *
+ * Returns 0 on success, or -EMSGSIZE when the header or any attribute
+ * cannot be added; in that case the partial message is cancelled.
+ */
+static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+	struct xfrm_dump_info *sp = ptr;
+	struct xfrm_userpolicy_info *p;
+	struct sk_buff *in_skb = sp->in_skb;
+	struct sk_buff *skb = sp->out_skb;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(in_skb).pid, sp->nlmsg_seq,
+			XFRM_MSG_NEWPOLICY, sizeof(*p), sp->nlmsg_flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	p = nlmsg_data(nlh);
+	copy_to_user_policy(xp, p, dir);
+	/* optional attributes: templates, security context, type, mark */
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
+	if (copy_to_user_policy_type(xp->type, skb) < 0)
+		goto nlmsg_failure;
+	if (xfrm_mark_put(skb, &xp->mark))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+nlmsg_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Dump completion callback: finish the policy walk whose state is stored
+ * in cb->args[] starting at index 1.  Always returns 0.
+ */
+static int xfrm_dump_policy_done(struct netlink_callback *cb)
+{
+	xfrm_policy_walk_done((struct xfrm_policy_walk *) &cb->args[1]);
+
+	return 0;
+}
+
+/*
+ * Netlink dump callback for policies.
+ *
+ * The walker state is stored in place inside cb->args[1..]; cb->args[0]
+ * is a flag recording that the walker has been initialised.  Each call
+ * feeds policies to dump_one_policy() with NLM_F_MULTI set.
+ *
+ * Returns skb->len.
+ */
+static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_policy_walk *walk = (struct xfrm_policy_walk *) &cb->args[1];
+	struct xfrm_dump_info info;
+
+	/* the walker must fit in cb->args[] after the first slot */
+	BUILD_BUG_ON(sizeof(struct xfrm_policy_walk) >
+		     sizeof(cb->args) - sizeof(cb->args[0]));
+
+	info.in_skb = cb->skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = cb->nlh->nlmsg_seq;
+	info.nlmsg_flags = NLM_F_MULTI;
+
+	/* first invocation of this dump: set up the walker */
+	if (!cb->args[0]) {
+		cb->args[0] = 1;
+		xfrm_policy_walk_init(walk, XFRM_POLICY_TYPE_ANY);
+	}
+
+	/* the walk result is deliberately ignored */
+	(void) xfrm_policy_walk(net, walk, dump_one_policy, &info);
+
+	return skb->len;
+}
+
+/*
+ * Build a standalone netlink message describing policy @xp for direction
+ * @dir, using @seq as the sequence number and @in_skb to address the
+ * reply.
+ *
+ * Returns the new skb on success or an ERR_PTR() on failure, never NULL.
+ * The caller tests the result with IS_ERR() only, so a NULL return would
+ * be passed on to nlmsg_unicast() as if it were a valid skb.
+ */
+static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
+					  struct xfrm_policy *xp,
+					  int dir, u32 seq)
+{
+	struct xfrm_dump_info info;
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	info.in_skb = in_skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = seq;
+	info.nlmsg_flags = 0;
+
+	err = dump_one_policy(xp, dir, 0, &info);
+	if (err < 0) {
+		kfree_skb(skb);
+		return ERR_PTR(err);
+	}
+
+	return skb;
+}
+
+/*
+ * Handler for XFRM_MSG_GETPOLICY and XFRM_MSG_DELPOLICY.
+ *
+ * Looks the policy up by index when p->index is non-zero, otherwise by
+ * selector plus optional security context.  For DELPOLICY the lookup
+ * itself is asked to delete (the "delete" argument), an audit record is
+ * written and key managers are notified; for GETPOLICY the policy is
+ * unicast back to the requester.
+ *
+ * Returns 0 or a negative errno; -ENOENT when no policy was found.
+ */
+static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_policy *xp;
+	struct xfrm_userpolicy_id *p;
+	u8 type = XFRM_POLICY_TYPE_MAIN;
+	int err;
+	struct km_event c;
+	int delete;
+	struct xfrm_mark m;
+	u32 mark = xfrm_mark_get(attrs, &m);
+
+	p = nlmsg_data(nlh);
+	delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
+
+	err = copy_from_user_policy_type(&type, attrs);
+	if (err)
+		return err;
+
+	err = verify_policy_dir(p->dir);
+	if (err)
+		return err;
+
+	if (p->index)
+		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+	else {
+		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+		struct xfrm_sec_ctx *ctx;
+
+		err = verify_sec_ctx_len(attrs);
+		if (err)
+			return err;
+
+		ctx = NULL;
+		if (rt) {
+			struct xfrm_user_sec_ctx *uctx = nla_data(rt);
+
+			err = security_xfrm_policy_alloc(&ctx, uctx);
+			if (err)
+				return err;
+		}
+		xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+					   ctx, delete, &err);
+		/* the temporary context is only needed for the lookup */
+		security_xfrm_policy_free(ctx);
+	}
+	if (xp == NULL)
+		return -ENOENT;
+
+	if (!delete) {
+		struct sk_buff *resp_skb;
+
+		resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq);
+		if (IS_ERR(resp_skb)) {
+			err = PTR_ERR(resp_skb);
+		} else {
+			err = nlmsg_unicast(net->xfrm.nlsk, resp_skb,
+					    NETLINK_CB(skb).pid);
+		}
+	} else {
+		uid_t loginuid = audit_get_loginuid(current);
+		u32 sessionid = audit_get_sessionid(current);
+		u32 sid;
+
+		/* audit the deletion attempt whether or not it succeeded */
+		security_task_getsecid(current, &sid);
+		xfrm_audit_policy_delete(xp, err ? 0 : 1, loginuid, sessionid,
+					 sid);
+
+		if (err != 0)
+			goto out;
+
+		c.data.byid = p->index;
+		c.event = nlh->nlmsg_type;
+		c.seq = nlh->nlmsg_seq;
+		c.pid = nlh->nlmsg_pid;
+		km_policy_notify(xp, p->dir, &c);
+	}
+
+out:
+	/* drop the reference returned by the lookup */
+	xfrm_pol_put(xp);
+	return err;
+}
+
+/*
+ * XFRM_MSG_FLUSHSA handler: flush the states of protocol p->proto through
+ * xfrm_state_flush() and notify key managers once the flush succeeded.
+ *
+ * An -ESRCH result from the flush (empty table) is reported as success
+ * without a notification.  Other errors are returned unchanged.
+ */
+static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_usersa_flush *req = nlmsg_data(nlh);
+	struct xfrm_audit audit_info;
+	struct km_event ev;
+	int err;
+
+	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
+	security_task_getsecid(current, &audit_info.secid);
+
+	err = xfrm_state_flush(net, req->proto, &audit_info);
+	if (err == -ESRCH) /* empty table */
+		return 0;
+	if (err)
+		return err;
+
+	ev.data.proto = req->proto;
+	ev.event = nlh->nlmsg_type;
+	ev.seq = nlh->nlmsg_seq;
+	ev.pid = nlh->nlmsg_pid;
+	ev.net = net;
+	km_state_notify(NULL, &ev);
+
+	return 0;
+}
+
+/*
+ * Size of the message built by build_aevent() for state @x.  The replay
+ * attribute is sized for the ESN form when x->replay_esn is set and for
+ * struct xfrm_replay_state otherwise.
+ */
+static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x)
+{
+	size_t replay_size;
+
+	if (x->replay_esn)
+		replay_size = xfrm_replay_state_esn_len(x->replay_esn);
+	else
+		replay_size = sizeof(struct xfrm_replay_state);
+
+	return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
+	       + nla_total_size(replay_size)
+	       + nla_total_size(sizeof(struct xfrm_lifetime_cur))
+	       + nla_total_size(sizeof(struct xfrm_mark))
+	       + nla_total_size(4) /* XFRM_AE_RTHR */
+	       + nla_total_size(4); /* XFRM_AE_ETHR */
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_NEWAE message for state @x.  The header
+ * identifies the SA; attributes carry the replay state (ESN or legacy
+ * form), the current lifetime, the replay/timer thresholds requested via
+ * c->data.aevent, and the mark.
+ *
+ * Returns the nlmsg_end() result on success or -EMSGSIZE when something
+ * does not fit, in which case the partial message is cancelled.
+ */
+static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
+{
+	struct xfrm_aevent_id *id;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, c->pid, c->seq, XFRM_MSG_NEWAE, sizeof(*id), 0);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	id = nlmsg_data(nlh);
+	/*
+	 * sa_id is filled member by member in a payload that has not been
+	 * cleared; zero it first so no padding inside it reaches userspace
+	 * uninitialised.
+	 */
+	memset(&id->sa_id, 0, sizeof(id->sa_id));
+	memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
+	id->sa_id.spi = x->id.spi;
+	id->sa_id.family = x->props.family;
+	id->sa_id.proto = x->id.proto;
+	memcpy(&id->saddr, &x->props.saddr, sizeof(x->props.saddr));
+	id->reqid = x->props.reqid;
+	id->flags = c->data.aevent;
+
+	/* The NLA_PUT* macros jump to nla_put_failure when out of room. */
+	if (x->replay_esn)
+		NLA_PUT(skb, XFRMA_REPLAY_ESN_VAL,
+			xfrm_replay_state_esn_len(x->replay_esn),
+			x->replay_esn);
+	else
+		NLA_PUT(skb, XFRMA_REPLAY_VAL, sizeof(x->replay), &x->replay);
+
+	NLA_PUT(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft);
+
+	if (id->flags & XFRM_AE_RTHR)
+		NLA_PUT_U32(skb, XFRMA_REPLAY_THRESH, x->replay_maxdiff);
+
+	if (id->flags & XFRM_AE_ETHR)
+		NLA_PUT_U32(skb, XFRMA_ETIMER_THRESH,
+			    x->replay_maxage * 10 / HZ);
+
+	if (xfrm_mark_put(skb, &x->mark))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * XFRM_MSG_GETAE handler: look up the state named in the request and
+ * unicast an XFRM_MSG_NEWAE message describing it to the requester.
+ *
+ * Returns -ESRCH when the state does not exist, -ENOMEM when the reply
+ * cannot be allocated, otherwise the nlmsg_unicast() result.
+ */
+static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_state *x;
+	struct sk_buff *r_skb;
+	int err;
+	struct km_event c;
+	u32 mark;
+	struct xfrm_mark m;
+	struct xfrm_aevent_id *p = nlmsg_data(nlh);
+	struct xfrm_usersa_id *id = &p->sa_id;
+
+	mark = xfrm_mark_get(attrs, &m);
+
+	x = xfrm_state_lookup(net, mark, &id->daddr, id->spi, id->proto, id->family);
+	if (x == NULL)
+		return -ESRCH;
+
+	r_skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
+	if (r_skb == NULL) {
+		xfrm_state_put(x);
+		return -ENOMEM;
+	}
+
+	/*
+	 * XXX: is this lock really needed - none of the other
+	 * gets lock (the concern is things getting updated
+	 * while we are still reading) - jhs
+	*/
+	spin_lock_bh(&x->lock);
+	c.data.aevent = p->flags;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+
+	/* the skb was sized by xfrm_aevent_msgsize(); failing to fill it is fatal */
+	if (build_aevent(r_skb, x, &c) < 0)
+		BUG();
+	/* note: the reply is sent while x->lock is still held */
+	err = nlmsg_unicast(net->xfrm.nlsk, r_skb, NETLINK_CB(skb).pid);
+	spin_unlock_bh(&x->lock);
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * XFRM_MSG_NEWAE handler: update the replay and lifetime parameters of an
+ * existing state from the supplied attributes and notify key managers.
+ *
+ * The request must carry at least one of XFRMA_LTIME_VAL,
+ * XFRMA_REPLAY_VAL or XFRMA_REPLAY_ESN_VAL and must have NLM_F_REPLACE
+ * set.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request or a state that is
+ * not XFRM_STATE_VALID, -ESRCH when the state does not exist, or the error
+ * from xfrm_replay_verify_len().
+ */
+static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_state *x;
+	struct km_event c;
+	int err = -EINVAL;
+	u32 mark = 0;
+	struct xfrm_mark m;
+	struct xfrm_aevent_id *p = nlmsg_data(nlh);
+	struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];
+	struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];
+	struct nlattr *lt = attrs[XFRMA_LTIME_VAL];
+
+	if (!lt && !rp && !re)
+		return err;
+
+	/* pedantic mode - thou shalt sayeth replaceth */
+	if (!(nlh->nlmsg_flags&NLM_F_REPLACE))
+		return err;
+
+	mark = xfrm_mark_get(attrs, &m);
+
+	x = xfrm_state_lookup(net, mark, &p->sa_id.daddr, p->sa_id.spi, p->sa_id.proto, p->sa_id.family);
+	if (x == NULL)
+		return -ESRCH;
+
+	if (x->km.state != XFRM_STATE_VALID)
+		goto out;
+
+	/*
+	 * Check the ESN attribute against the state's existing ESN replay
+	 * data.  This must be the XFRMA_REPLAY_ESN_VAL attribute (re); the
+	 * legacy XFRMA_REPLAY_VAL attribute (rp) has a different layout and
+	 * passing it would leave the ESN payload unchecked.
+	 */
+	err = xfrm_replay_verify_len(x->replay_esn, re);
+	if (err)
+		goto out;
+
+	spin_lock_bh(&x->lock);
+	xfrm_update_ae_params(x, attrs);
+	spin_unlock_bh(&x->lock);
+
+	c.event = nlh->nlmsg_type;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	c.data.aevent = XFRM_AE_CU;
+	km_state_notify(x, &c);
+	err = 0;
+out:
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * XFRM_MSG_FLUSHPOLICY handler: flush the policies of the requested type
+ * (XFRM_POLICY_TYPE_MAIN unless XFRMA_POLICY_TYPE says otherwise) and
+ * notify key managers once the flush succeeded.
+ *
+ * An -ESRCH result from the flush (empty table) is reported as success
+ * without a notification.  Other errors are returned unchanged.
+ */
+static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_audit audit_info;
+	struct km_event ev;
+	u8 type = XFRM_POLICY_TYPE_MAIN;
+	int err;
+
+	err = copy_from_user_policy_type(&type, attrs);
+	if (err)
+		return err;
+
+	audit_info.loginuid = audit_get_loginuid(current);
+	audit_info.sessionid = audit_get_sessionid(current);
+	security_task_getsecid(current, &audit_info.secid);
+
+	err = xfrm_policy_flush(net, type, &audit_info);
+	if (err == -ESRCH) /* empty table */
+		return 0;
+	if (err)
+		return err;
+
+	ev.data.type = type;
+	ev.event = nlh->nlmsg_type;
+	ev.seq = nlh->nlmsg_seq;
+	ev.pid = nlh->nlmsg_pid;
+	ev.net = net;
+	km_policy_notify(NULL, 0, &ev);
+
+	return 0;
+}
+
+/*
+ * XFRM_MSG_POLEXPIRE handler: userspace reports that a policy expired.
+ *
+ * The policy is looked up by index when p->index is non-zero, otherwise by
+ * selector plus optional security context.  A hard expire deletes the
+ * policy and writes an audit record; a soft expire only triggers a
+ * warning.  In both cases km_policy_expired() is then called.
+ *
+ * Returns 0 or a negative errno; -ENOENT when no policy was found.
+ */
+static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_policy *xp;
+	struct xfrm_user_polexpire *up = nlmsg_data(nlh);
+	struct xfrm_userpolicy_info *p = &up->pol;
+	u8 type = XFRM_POLICY_TYPE_MAIN;
+	int err = -ENOENT;
+	struct xfrm_mark m;
+	u32 mark = xfrm_mark_get(attrs, &m);
+
+	err = copy_from_user_policy_type(&type, attrs);
+	if (err)
+		return err;
+
+	err = verify_policy_dir(p->dir);
+	if (err)
+		return err;
+
+	if (p->index)
+		xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+	else {
+		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
+		struct xfrm_sec_ctx *ctx;
+
+		err = verify_sec_ctx_len(attrs);
+		if (err)
+			return err;
+
+		ctx = NULL;
+		if (rt) {
+			struct xfrm_user_sec_ctx *uctx = nla_data(rt);
+
+			err = security_xfrm_policy_alloc(&ctx, uctx);
+			if (err)
+				return err;
+		}
+		xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+					   &p->sel, ctx, 0, &err);
+		/* the temporary context is only needed for the lookup */
+		security_xfrm_policy_free(ctx);
+	}
+	if (xp == NULL)
+		return -ENOENT;
+
+	/* nothing to do for a policy already marked dead; err is returned as left by the lookup */
+	if (unlikely(xp->walk.dead))
+		goto out;
+
+	err = 0;
+	if (up->hard) {
+		uid_t loginuid = audit_get_loginuid(current);
+		u32 sessionid = audit_get_sessionid(current);
+		u32 sid;
+
+		security_task_getsecid(current, &sid);
+		xfrm_policy_delete(xp, p->dir);
+		xfrm_audit_policy_delete(xp, 1, loginuid, sessionid, sid);
+
+	} else {
+		// reset the timers here?
+		WARN(1, "Dont know what to do with soft policy expire\n");
+	}
+	km_policy_expired(xp, p->dir, up->hard, current->pid);
+
+out:
+	/* drop the reference returned by the lookup */
+	xfrm_pol_put(xp);
+	return err;
+}
+
+/*
+ * XFRM_MSG_EXPIRE handler: userspace reports that a state expired.
+ *
+ * With x->lock held, a state that is XFRM_STATE_VALID is passed to
+ * km_state_expired(); for a hard expire it is additionally deleted and an
+ * audit record is written.
+ *
+ * Returns 0 on success, -ENOENT when the state does not exist, or -EINVAL
+ * when the state is not XFRM_STATE_VALID.
+ */
+static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_state *x;
+	int err;
+	struct xfrm_user_expire *ue = nlmsg_data(nlh);
+	struct xfrm_usersa_info *p = &ue->state;
+	struct xfrm_mark m;
+	u32 mark = xfrm_mark_get(attrs, &m);
+
+	x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family);
+
+	err = -ENOENT;
+	if (x == NULL)
+		return err;
+
+	spin_lock_bh(&x->lock);
+	err = -EINVAL;
+	if (x->km.state != XFRM_STATE_VALID)
+		goto out;
+	km_state_expired(x, ue->hard, current->pid);
+
+	if (ue->hard) {
+		uid_t loginuid = audit_get_loginuid(current);
+		u32 sessionid = audit_get_sessionid(current);
+		u32 sid;
+
+		security_task_getsecid(current, &sid);
+		/* x->lock is held here, hence the __ variant of the delete */
+		__xfrm_state_delete(x);
+		xfrm_audit_state_delete(x, 1, loginuid, sessionid, sid);
+	}
+	err = 0;
+out:
+	spin_unlock_bh(&x->lock);
+	/* drop the reference returned by the lookup */
+	xfrm_state_put(x);
+	return err;
+}
+
+/*
+ * XFRM_MSG_ACQUIRE handler: build a temporary state and policy from the
+ * request and call km_query() once per template.
+ *
+ * Returns 0 once the templates have been processed (individual km_query()
+ * results are not propagated), -ENOMEM when the state cannot be
+ * allocated, or the error from policy validation/construction.
+ */
+static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
+		struct nlattr **attrs)
+{
+	struct net *net = sock_net(skb->sk);
+	struct xfrm_policy *xp;
+	struct xfrm_user_tmpl *ut;
+	int i;
+	struct nlattr *rt = attrs[XFRMA_TMPL];
+	struct xfrm_mark mark;
+
+	struct xfrm_user_acquire *ua = nlmsg_data(nlh);
+	struct xfrm_state *x = xfrm_state_alloc(net);
+	int err = -ENOMEM;
+
+	if (!x)
+		goto nomem;
+
+	xfrm_mark_get(attrs, &mark);
+
+	err = verify_newpolicy_info(&ua->policy);
+	if (err)
+		goto bad_policy;
+
+	/*
+	 * Validate XFRMA_SEC_CTX before xfrm_policy_construct() hands it
+	 * to the security layer, as xfrm_add_policy() does.
+	 */
+	err = verify_sec_ctx_len(attrs);
+	if (err)
+		goto free_state;
+
+	/*   build an XP */
+	xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
+	if (!xp)
+		goto free_state;
+
+	memcpy(&x->id, &ua->id, sizeof(ua->id));
+	memcpy(&x->props.saddr, &ua->saddr, sizeof(ua->saddr));
+	memcpy(&x->sel, &ua->sel, sizeof(ua->sel));
+	xp->mark.m = x->mark.m = mark.m;
+	xp->mark.v = x->mark.v = mark.v;
+	/*
+	 * When XFRMA_TMPL is absent xp->xfrm_nr is 0, so ut is computed
+	 * but never dereferenced by the loop below.
+	 */
+	ut = nla_data(rt);
+	/* extract the templates and for each call km_key */
+	for (i = 0; i < xp->xfrm_nr; i++, ut++) {
+		struct xfrm_tmpl *t = &xp->xfrm_vec[i];
+		memcpy(&x->id, &t->id, sizeof(x->id));
+		x->props.mode = t->mode;
+		x->props.reqid = t->reqid;
+		x->props.family = ut->family;
+		t->aalgos = ua->aalgos;
+		t->ealgos = ua->ealgos;
+		t->calgos = ua->calgos;
+		err = km_query(x, t, xp);
+
+	}
+
+	kfree(x);
+	kfree(xp);
+
+	return 0;
+
+bad_policy:
+	WARN(1, "BAD policy passed\n");
+free_state:
+	kfree(x);
+nomem:
+	return err;
+}
+
+#ifdef CONFIG_XFRM_MIGRATE
+/*
+ * Decode the XFRMA_MIGRATE attribute into the array @ma and, when @k is
+ * non-NULL, the XFRMA_KMADDRESS attribute into @k.  The entry count is the
+ * attribute length divided by sizeof(struct xfrm_user_migrate).
+ *
+ * Returns 0 with *num set to the entry count, or -EINVAL when the count
+ * is not in the range 1..XFRM_MAX_DEPTH.
+ */
+static int copy_from_user_migrate(struct xfrm_migrate *ma,
+				  struct xfrm_kmaddress *k,
+				  struct nlattr **attrs, int *num)
+{
+	struct nlattr *rt = attrs[XFRMA_MIGRATE];
+	struct xfrm_user_migrate *um;
+	int idx, count;
+
+	if (k) {
+		struct xfrm_user_kmaddress *uk = nla_data(attrs[XFRMA_KMADDRESS]);
+
+		memcpy(&k->local, &uk->local, sizeof(k->local));
+		memcpy(&k->remote, &uk->remote, sizeof(k->remote));
+		k->family = uk->family;
+		k->reserved = uk->reserved;
+	}
+
+	um = nla_data(rt);
+	count = nla_len(rt) / sizeof(*um);
+
+	if (count <= 0 || count > XFRM_MAX_DEPTH)
+		return -EINVAL;
+
+	for (idx = 0; idx < count; idx++) {
+		struct xfrm_user_migrate *src = &um[idx];
+		struct xfrm_migrate *dst = &ma[idx];
+
+		memcpy(&dst->old_daddr, &src->old_daddr, sizeof(dst->old_daddr));
+		memcpy(&dst->old_saddr, &src->old_saddr, sizeof(dst->old_saddr));
+		memcpy(&dst->new_daddr, &src->new_daddr, sizeof(dst->new_daddr));
+		memcpy(&dst->new_saddr, &src->new_saddr, sizeof(dst->new_saddr));
+
+		dst->proto = src->proto;
+		dst->mode = src->mode;
+		dst->reqid = src->reqid;
+
+		dst->old_family = src->old_family;
+		dst->new_family = src->new_family;
+	}
+
+	*num = count;
+	return 0;
+}
+
+/*
+ * XFRM_MSG_MIGRATE handler (CONFIG_XFRM_MIGRATE build): decode the
+ * migrate entries and optional KM address, then call xfrm_migrate().
+ *
+ * Returns -EINVAL when XFRMA_MIGRATE is missing, the error from decoding
+ * the policy type or migrate entries, and 0 otherwise (the xfrm_migrate()
+ * result is not propagated).
+ */
+static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct nlattr **attrs)
+{
+	struct xfrm_userpolicy_id *pol_id = nlmsg_data(nlh);
+	struct xfrm_migrate entries[XFRM_MAX_DEPTH];
+	struct xfrm_kmaddress km;
+	struct xfrm_kmaddress *kmp = NULL;
+	int nr_entries = 0;
+	u8 type;
+	int err;
+
+	if (!attrs[XFRMA_MIGRATE])
+		return -EINVAL;
+
+	if (attrs[XFRMA_KMADDRESS])
+		kmp = &km;
+
+	err = copy_from_user_policy_type(&type, attrs);
+	if (err)
+		return err;
+
+	err = copy_from_user_migrate(entries, kmp, attrs, &nr_entries);
+	if (err)
+		return err;
+
+	if (nr_entries)
+		xfrm_migrate(&pol_id->sel, pol_id->dir, type, entries,
+			     nr_entries, kmp);
+
+	return 0;
+}
+#else
+/* Without CONFIG_XFRM_MIGRATE the migrate request is rejected. */
+static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct nlattr **attrs)
+{
+	return -ENOPROTOOPT;
+}
+#endif
+
+#ifdef CONFIG_XFRM_MIGRATE
+/*
+ * Append one XFRMA_MIGRATE attribute describing @m to @skb.  The user
+ * structure is cleared before it is filled.  Returns the nla_put()
+ * result.
+ */
+static int copy_to_user_migrate(const struct xfrm_migrate *m, struct sk_buff *skb)
+{
+	struct xfrm_user_migrate out;
+
+	memset(&out, 0, sizeof(out));
+
+	memcpy(&out.old_daddr, &m->old_daddr, sizeof(out.old_daddr));
+	memcpy(&out.old_saddr, &m->old_saddr, sizeof(out.old_saddr));
+	memcpy(&out.new_daddr, &m->new_daddr, sizeof(out.new_daddr));
+	memcpy(&out.new_saddr, &m->new_saddr, sizeof(out.new_saddr));
+
+	out.old_family = m->old_family;
+	out.new_family = m->new_family;
+	out.proto = m->proto;
+	out.mode = m->mode;
+	out.reqid = m->reqid;
+
+	return nla_put(skb, XFRMA_MIGRATE, sizeof(out), &out);
+}
+
+/*
+ * Append an XFRMA_KMADDRESS attribute describing @k to @skb.  The user
+ * structure is cleared before it is filled.  Returns the nla_put()
+ * result.
+ */
+static int copy_to_user_kmaddress(const struct xfrm_kmaddress *k, struct sk_buff *skb)
+{
+	struct xfrm_user_kmaddress out;
+
+	memset(&out, 0, sizeof(out));
+
+	memcpy(&out.local, &k->local, sizeof(out.local));
+	memcpy(&out.remote, &k->remote, sizeof(out.remote));
+	out.family = k->family;
+	out.reserved = k->reserved;
+
+	return nla_put(skb, XFRMA_KMADDRESS, sizeof(out), &out);
+}
+
+/*
+ * Size of the message produced by build_migrate() for @num_migrate
+ * entries, with or without a KM address attribute.
+ *
+ * build_migrate() emits one XFRMA_MIGRATE attribute per entry, so every
+ * entry needs its own attribute header.  Sizing a single attribute that
+ * holds all entries would come up short by one header per additional
+ * entry and could make build_migrate() fail on an skb of exactly this
+ * size.
+ */
+static inline size_t xfrm_migrate_msgsize(int num_migrate, int with_kma)
+{
+	return NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_id))
+	      + (with_kma ? nla_total_size(sizeof(struct xfrm_kmaddress)) : 0)
+	      + num_migrate * nla_total_size(sizeof(struct xfrm_user_migrate))
+	      + userpolicy_type_attrsize();
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_MIGRATE message: a policy id header built
+ * from @sel and @dir, an optional KM address, the policy type, and one
+ * XFRMA_MIGRATE attribute per entry of @m.
+ *
+ * Returns the nlmsg_end() result on success or -EMSGSIZE when something
+ * does not fit, in which case the partial message is cancelled.
+ */
+static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m,
+			 int num_migrate, const struct xfrm_kmaddress *k,
+			 const struct xfrm_selector *sel, u8 dir, u8 type)
+{
+	struct xfrm_userpolicy_id *pol_id;
+	struct nlmsghdr *nlh;
+	int idx;
+
+	nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MIGRATE, sizeof(*pol_id), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	/* copy data from selector, dir, and type to the pol_id */
+	pol_id = nlmsg_data(nlh);
+	memset(pol_id, 0, sizeof(*pol_id));
+	memcpy(&pol_id->sel, sel, sizeof(pol_id->sel));
+	pol_id->dir = dir;
+
+	if (k && copy_to_user_kmaddress(k, skb) < 0)
+		goto cancel;
+
+	if (copy_to_user_policy_type(type, skb) < 0)
+		goto cancel;
+
+	for (idx = 0; idx < num_migrate; idx++) {
+		if (copy_to_user_migrate(&m[idx], skb) < 0)
+			goto cancel;
+	}
+
+	return nlmsg_end(skb, nlh);
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Multicast an XFRM_MSG_MIGRATE message to the XFRMNLGRP_MIGRATE group
+ * on init_net's xfrm netlink socket.
+ *
+ * Returns -ENOMEM when the skb cannot be allocated, otherwise the
+ * nlmsg_multicast() result.
+ */
+static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+			     const struct xfrm_migrate *m, int num_migrate,
+			     const struct xfrm_kmaddress *k)
+{
+	struct net *net = &init_net;
+	struct sk_buff *msg;
+
+	msg = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC);
+	if (!msg)
+		return -ENOMEM;
+
+	/* the skb was sized by xfrm_migrate_msgsize(); failing to fill it is fatal */
+	if (build_migrate(msg, m, num_migrate, k, sel, dir, type) < 0)
+		BUG();
+
+	return nlmsg_multicast(net->xfrm.nlsk, msg, 0, XFRMNLGRP_MIGRATE,
+			       GFP_ATOMIC);
+}
+#else
+/* Without CONFIG_XFRM_MIGRATE no migrate message is sent. */
+static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
+			     const struct xfrm_migrate *m, int num_migrate,
+			     const struct xfrm_kmaddress *k)
+{
+	return -ENOPROTOOPT;
+}
+#endif
+
+/* Shorthand used only while defining xfrm_msg_min[]; undefined right after. */
+#define XMSGSIZE(type) sizeof(struct type)
+
+/*
+ * Size of the fixed payload structure for each message type, indexed by
+ * (message type - XFRM_MSG_BASE).  Presumably used as the minimum header
+ * length when parsing a request; confirm against the receive path.
+ * XFRM_MSG_FLUSHPOLICY has no fixed payload.
+ */
+static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
+	[XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
+	[XFRM_MSG_DELSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
+	[XFRM_MSG_GETSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_id),
+	[XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
+	[XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+	[XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+	[XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userspi_info),
+	[XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_acquire),
+	[XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_expire),
+	[XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_info),
+	[XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_info),
+	[XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_polexpire),
+	[XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_usersa_flush),
+	[XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = 0,
+	[XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+	[XFRM_MSG_GETAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+	[XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
+	[XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = sizeof(u32),
+	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = sizeof(u32),
+};
+
+#undef XMSGSIZE
+
+/*
+ * Netlink attribute policy for XFRMA_* attributes: a length (.len) for
+ * structure-valued attributes and a type for scalar ones.
+ */
+static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
+	[XFRMA_SA]		= { .len = sizeof(struct xfrm_usersa_info)},
+	[XFRMA_POLICY]		= { .len = sizeof(struct xfrm_userpolicy_info)},
+	[XFRMA_LASTUSED]	= { .type = NLA_U64},
+	[XFRMA_ALG_AUTH_TRUNC]	= { .len = sizeof(struct xfrm_algo_auth)},
+	[XFRMA_ALG_AEAD]	= { .len = sizeof(struct xfrm_algo_aead) },
+	[XFRMA_ALG_AUTH]	= { .len = sizeof(struct xfrm_algo) },
+	[XFRMA_ALG_CRYPT]	= { .len = sizeof(struct xfrm_algo) },
+	[XFRMA_ALG_COMP]	= { .len = sizeof(struct xfrm_algo) },
+	[XFRMA_ENCAP]		= { .len = sizeof(struct xfrm_encap_tmpl) },
+	[XFRMA_TMPL]		= { .len = sizeof(struct xfrm_user_tmpl) },
+	/*
+	 * The payload of XFRMA_SEC_CTX is read as struct xfrm_user_sec_ctx
+	 * by every handler in this file, so size it with that type rather
+	 * than the kernel-internal struct xfrm_sec_ctx.
+	 */
+	[XFRMA_SEC_CTX]		= { .len = sizeof(struct xfrm_user_sec_ctx) },
+	[XFRMA_LTIME_VAL]	= { .len = sizeof(struct xfrm_lifetime_cur) },
+	[XFRMA_REPLAY_VAL]	= { .len = sizeof(struct xfrm_replay_state) },
+	[XFRMA_REPLAY_THRESH]	= { .type = NLA_U32 },
+	[XFRMA_ETIMER_THRESH]	= { .type = NLA_U32 },
+	[XFRMA_SRCADDR]		= { .len = sizeof(xfrm_address_t) },
+	[XFRMA_COADDR]		= { .len = sizeof(xfrm_address_t) },
+	[XFRMA_POLICY_TYPE]	= { .len = sizeof(struct xfrm_userpolicy_type)},
+	[XFRMA_MIGRATE]		= { .len = sizeof(struct xfrm_user_migrate) },
+	[XFRMA_KMADDRESS]	= { .len = sizeof(struct xfrm_user_kmaddress) },
+	[XFRMA_MARK]		= { .len = sizeof(struct xfrm_mark) },
+	[XFRMA_TFCPAD]		= { .type = NLA_U32 },
+	[XFRMA_REPLAY_ESN_VAL]	= { .len = sizeof(struct xfrm_replay_state_esn) },
+};
+
+/*
+ * Per-message-type handlers, indexed by (message type - XFRM_MSG_BASE).
+ *   doit: handler for a single request
+ *   dump: netlink dump callback (set for GETSA and GETPOLICY only)
+ *   done: dump completion callback paired with dump
+ * NEWSA/UPDSA, NEWPOLICY/UPDPOLICY and DELPOLICY/GETPOLICY share a
+ * handler, which tells them apart by nlmsg_type.  XFRM_MSG_REPORT has no
+ * entry.
+ */
+static struct xfrm_link {
+	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
+	int (*dump)(struct sk_buff *, struct netlink_callback *);
+	int (*done)(struct netlink_callback *);
+} xfrm_dispatch[XFRM_NR_MSGTYPES] = {
+	[XFRM_MSG_NEWSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
+	[XFRM_MSG_DELSA       - XFRM_MSG_BASE] = { .doit = xfrm_del_sa        },
+	[XFRM_MSG_GETSA       - XFRM_MSG_BASE] = { .doit = xfrm_get_sa,
+						   .dump = xfrm_dump_sa,
+						   .done = xfrm_dump_sa_done  },
+	[XFRM_MSG_NEWPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
+	[XFRM_MSG_DELPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy    },
+	[XFRM_MSG_GETPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_get_policy,
+						   .dump = xfrm_dump_policy,
+						   .done = xfrm_dump_policy_done },
+	[XFRM_MSG_ALLOCSPI    - XFRM_MSG_BASE] = { .doit = xfrm_alloc_userspi },
+	[XFRM_MSG_ACQUIRE     - XFRM_MSG_BASE] = { .doit = xfrm_add_acquire   },
+	[XFRM_MSG_EXPIRE      - XFRM_MSG_BASE] = { .doit = xfrm_add_sa_expire },
+	[XFRM_MSG_UPDPOLICY   - XFRM_MSG_BASE] = { .doit = xfrm_add_policy    },
+	[XFRM_MSG_UPDSA       - XFRM_MSG_BASE] = { .doit = xfrm_add_sa        },
+	[XFRM_MSG_POLEXPIRE   - XFRM_MSG_BASE] = { .doit = xfrm_add_pol_expire},
+	[XFRM_MSG_FLUSHSA     - XFRM_MSG_BASE] = { .doit = xfrm_flush_sa      },
+	[XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy  },
+	[XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = { .doit = xfrm_new_ae  },
+	[XFRM_MSG_GETAE       - XFRM_MSG_BASE] = { .doit = xfrm_get_ae  },
+	[XFRM_MSG_MIGRATE     - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate    },
+	[XFRM_MSG_GETSADINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo   },
+	[XFRM_MSG_GETSPDINFO  - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo   },
+};
+
+/*
+ * Handle one XFRM netlink request.
+ *
+ * @skb: buffer the request arrived in (also identifies the sending socket)
+ * @nlh: header of the request being processed
+ *
+ * The message type selects an entry in xfrm_dispatch[].  GETSA/GETPOLICY
+ * requests carrying NLM_F_DUMP are handed to netlink_dump_start(); every
+ * other request has its attributes parsed against xfrma_policy and is
+ * passed to the entry's ->doit handler.
+ *
+ * Returns the handler's result, -EINVAL for an unknown or unsupported
+ * message type, -EPERM if the sender lacks CAP_NET_ADMIN, or the error
+ * reported by nlmsg_parse().
+ */
+static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *attrs[XFRMA_MAX+1];
+	struct xfrm_link *link;
+	int type, err;
+
+	type = nlh->nlmsg_type;
+	/*
+	 * The type indexes both xfrm_dispatch[] and xfrm_msg_min[] after
+	 * XFRM_MSG_BASE is subtracted, so bound it on both sides.  The
+	 * caller may already drop low-numbered control messages, but this
+	 * function must not depend on that to stay inside the tables.
+	 */
+	if (type < XFRM_MSG_BASE || type > XFRM_MSG_MAX)
+		return -EINVAL;
+
+	type -= XFRM_MSG_BASE;
+	link = &xfrm_dispatch[type];
+
+	/* All operations require privileges, even GET */
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Dump requests bypass attribute parsing entirely. */
+	if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
+	     type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
+	    (nlh->nlmsg_flags & NLM_F_DUMP)) {
+		if (link->dump == NULL)
+			return -EINVAL;
+
+		return netlink_dump_start(net->xfrm.nlsk, skb, nlh, link->dump, link->done);
+	}
+
+	/* xfrm_msg_min[type] is the fixed header size preceding attributes. */
+	err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX,
+			  xfrma_policy);
+	if (err < 0)
+		return err;
+
+	if (link->doit == NULL)
+		return -EINVAL;
+
+	return link->doit(skb, nlh, attrs);
+}
+
+/*
+ * Input callback of the XFRM netlink socket: every message in @skb is
+ * dispatched through xfrm_user_rcv_msg() while xfrm_cfg_mutex is held.
+ */
+static void xfrm_netlink_rcv(struct sk_buff *skb)
+{
+	mutex_lock(&xfrm_cfg_mutex);
+
+	netlink_rcv_skb(skb, xfrm_user_rcv_msg);
+
+	mutex_unlock(&xfrm_cfg_mutex);
+}
+
+/*
+ * Payload size for an XFRM_MSG_EXPIRE notification: the aligned
+ * xfrm_user_expire header plus room for one xfrm_mark attribute.
+ */
+static inline size_t xfrm_expire_msgsize(void)
+{
+	size_t hdr_len = NLMSG_ALIGN(sizeof(struct xfrm_user_expire));
+	size_t mark_len = nla_total_size(sizeof(struct xfrm_mark));
+
+	return hdr_len + mark_len;
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_EXPIRE message describing state @x.
+ *
+ * The "hard" flag is normalised to 0/1 from c->data.hard, and the state's
+ * mark is appended through xfrm_mark_put().
+ *
+ * Returns the nlmsg_end() result, or -EMSGSIZE if the header or the mark
+ * attribute could not be added.
+ */
+static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
+{
+	struct xfrm_user_expire *expire;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, c->pid, 0, XFRM_MSG_EXPIRE, sizeof(*expire), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	expire = nlmsg_data(nlh);
+	copy_to_user_state(x, &expire->state);
+	expire->hard = c->data.hard ? 1 : 0;
+
+	if (xfrm_mark_put(skb, &x->mark))
+		return -EMSGSIZE;
+
+	return nlmsg_end(skb, nlh);
+}
+
+/*
+ * Multicast an SA expiry notification for @x to XFRMNLGRP_EXPIRE.
+ *
+ * Returns the nlmsg_multicast() result, -ENOMEM if no buffer could be
+ * allocated, or -EMSGSIZE if the message could not be built (the buffer
+ * is freed in that case).
+ */
+static int xfrm_exp_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+	struct net *net = xs_net(x);
+	struct sk_buff *skb = nlmsg_new(xfrm_expire_msgsize(), GFP_ATOMIC);
+
+	if (!skb)
+		return -ENOMEM;
+
+	if (build_expire(skb, x, c) >= 0)
+		return nlmsg_multicast(net->xfrm.nlsk, skb, 0,
+				       XFRMNLGRP_EXPIRE, GFP_ATOMIC);
+
+	kfree_skb(skb);
+	return -EMSGSIZE;
+}
+
+/*
+ * Multicast an async-event notification for @x to XFRMNLGRP_AEVENTS.
+ *
+ * The buffer is sized by xfrm_aevent_msgsize(); failing to build the
+ * message into it is treated as a fatal sizing bug (BUG()).
+ *
+ * Returns the nlmsg_multicast() result, or -ENOMEM on allocation failure.
+ */
+static int xfrm_aevent_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+	struct net *net = xs_net(x);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(xfrm_aevent_msgsize(x), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	err = build_aevent(skb, x, c);
+	if (err < 0)
+		BUG();
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_AEVENTS, GFP_ATOMIC);
+}
+
+/*
+ * Multicast an XFRM_MSG_FLUSHSA notification to XFRMNLGRP_SA.
+ *
+ * The only payload is a struct xfrm_usersa_flush whose proto field is
+ * taken from c->data.proto; the namespace comes from c->net.
+ *
+ * Returns the nlmsg_multicast() result, -ENOMEM on allocation failure, or
+ * -EMSGSIZE if the header did not fit.
+ */
+static int xfrm_notify_sa_flush(const struct km_event *c)
+{
+	struct net *net = c->net;
+	struct xfrm_usersa_flush *flush;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+
+	skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct xfrm_usersa_flush)),
+			GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(skb, c->pid, c->seq, XFRM_MSG_FLUSHSA,
+			sizeof(*flush), 0);
+	if (!nlh) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	flush = nlmsg_data(nlh);
+	flush->proto = c->data.proto;
+	nlmsg_end(skb, nlh);
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
+}
+
+/*
+ * Compute the attribute space needed to describe state @x in an SA
+ * notification.  Each optional part of the state contributes only when
+ * it is present; the fixed message header is NOT included here.
+ */
+static inline size_t xfrm_sa_len(struct xfrm_state *x)
+{
+	/*
+	 * x->lastused is always budgeted: it may become non-zero behind
+	 * our back between sizing and building the message.
+	 */
+	size_t total = nla_total_size(sizeof(u64));
+
+	if (x->aead)
+		total += nla_total_size(aead_len(x->aead));
+
+	if (x->aalg) {
+		/*
+		 * Two attributes are budgeted for an auth algorithm: one
+		 * sized as struct xfrm_algo plus the key bytes, and one
+		 * sized by xfrm_alg_auth_len().
+		 */
+		total += nla_total_size(sizeof(struct xfrm_algo) +
+					(x->aalg->alg_key_len + 7) / 8);
+		total += nla_total_size(xfrm_alg_auth_len(x->aalg));
+	}
+
+	if (x->ealg)
+		total += nla_total_size(xfrm_alg_len(x->ealg));
+
+	if (x->calg)
+		total += nla_total_size(sizeof(*x->calg));
+
+	if (x->encap)
+		total += nla_total_size(sizeof(*x->encap));
+
+	if (x->tfcpad)
+		total += nla_total_size(sizeof(x->tfcpad));
+
+	if (x->replay_esn)
+		total += nla_total_size(xfrm_replay_state_esn_len(x->replay_esn));
+
+	if (x->security)
+		total += nla_total_size(sizeof(struct xfrm_user_sec_ctx) +
+					x->security->ctx_len);
+
+	if (x->coaddr)
+		total += nla_total_size(sizeof(*x->coaddr));
+
+	return total;
+}
+
+/*
+ * Multicast an SA add/update/delete notification to XFRMNLGRP_SA.
+ *
+ * For XFRM_MSG_DELSA the message header is a struct xfrm_usersa_id and
+ * the full struct xfrm_usersa_info travels in an XFRMA_SA attribute; for
+ * every other event the xfrm_usersa_info is the message header itself.
+ *
+ * Returns the nlmsg_multicast() result, -ENOMEM if no buffer could be
+ * allocated, or -1 (after a WARN) if the message did not fit.
+ */
+static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
+{
+	const int is_delete = (c->event == XFRM_MSG_DELSA);
+	struct net *net = xs_net(x);
+	struct xfrm_usersa_info *info;
+	struct xfrm_usersa_id *id;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	int headlen = sizeof(*info);
+	int len = xfrm_sa_len(x);
+
+	if (is_delete) {
+		/* Room for the XFRMA_SA attribute and for a mark attribute. */
+		len += nla_total_size(headlen);
+		len += nla_total_size(sizeof(struct xfrm_mark));
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_ALIGN(headlen);
+
+	skb = nlmsg_new(len, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(skb, c->pid, c->seq, c->event, headlen, 0);
+	if (!nlh)
+		goto fail;
+
+	if (is_delete) {
+		struct nlattr *attr;
+
+		/* Header identifies the deleted SA ... */
+		id = nlmsg_data(nlh);
+		memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
+		id->spi = x->id.spi;
+		id->family = x->props.family;
+		id->proto = x->id.proto;
+
+		/* ... and the full description goes into XFRMA_SA. */
+		attr = nla_reserve(skb, XFRMA_SA, sizeof(*info));
+		if (!attr)
+			goto fail;
+
+		info = nla_data(attr);
+	} else {
+		info = nlmsg_data(nlh);
+	}
+
+	if (copy_to_user_state_extra(x, info, skb))
+		goto fail;
+
+	nlmsg_end(skb, nlh);
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
+
+fail:
+	/* Somebody screwed up with xfrm_sa_len! */
+	WARN_ON(1);
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ * State-event entry point of the netlink key manager: route the event in
+ * @c to the matching notifier.  Unknown events are logged and reported
+ * as success (0).
+ */
+static int xfrm_send_state_notify(struct xfrm_state *x, const struct km_event *c)
+{
+	int rc = 0;
+
+	switch (c->event) {
+	case XFRM_MSG_EXPIRE:
+		rc = xfrm_exp_state_notify(x, c);
+		break;
+	case XFRM_MSG_NEWAE:
+		rc = xfrm_aevent_state_notify(x, c);
+		break;
+	case XFRM_MSG_DELSA:
+	case XFRM_MSG_UPDSA:
+	case XFRM_MSG_NEWSA:
+		rc = xfrm_notify_sa(x, c);
+		break;
+	case XFRM_MSG_FLUSHSA:
+		rc = xfrm_notify_sa_flush(c);
+		break;
+	default:
+		printk(KERN_NOTICE "xfrm_user: Unknown SA event %d\n",
+		       c->event);
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Payload size for an XFRM_MSG_ACQUIRE message: the aligned header plus
+ * the policy's templates, a mark, the state's security context and the
+ * policy-type attribute.
+ */
+static inline size_t xfrm_acquire_msgsize(struct xfrm_state *x,
+					  struct xfrm_policy *xp)
+{
+	size_t size = NLMSG_ALIGN(sizeof(struct xfrm_user_acquire));
+
+	size += nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	size += nla_total_size(sizeof(struct xfrm_mark));
+	size += nla_total_size(xfrm_user_sec_ctx_size(x->security));
+	size += userpolicy_type_attrsize();
+
+	return size;
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_ACQUIRE message for state @x, template @xt
+ * and policy @xp in direction @dir.
+ *
+ * A fresh acquire sequence number is taken from xfrm_get_acqseq() and
+ * stored both in the message and in x->km.seq.
+ *
+ * Returns the nlmsg_end() result, or -EMSGSIZE if the header or any
+ * attribute did not fit (the partial message is cancelled).
+ */
+static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
+			 struct xfrm_tmpl *xt, struct xfrm_policy *xp,
+			 int dir)
+{
+	__u32 seq = xfrm_get_acqseq();
+	struct xfrm_user_acquire *acq;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_ACQUIRE, sizeof(*acq), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	/* Fixed header. */
+	acq = nlmsg_data(nlh);
+	memcpy(&acq->id, &x->id, sizeof(acq->id));
+	memcpy(&acq->saddr, &x->props.saddr, sizeof(acq->saddr));
+	memcpy(&acq->sel, &x->sel, sizeof(acq->sel));
+	copy_to_user_policy(xp, &acq->policy, dir);
+	acq->aalgos = xt->aalgos;
+	acq->ealgos = xt->ealgos;
+	acq->calgos = xt->calgos;
+	x->km.seq = seq;
+	acq->seq = seq;
+
+	/* Attributes, in order; the first failure cancels the message. */
+	if (copy_to_user_tmpl(xp, skb) < 0 ||
+	    copy_to_user_state_sec_ctx(x, skb) ||
+	    copy_to_user_policy_type(xp->type, skb) < 0 ||
+	    xfrm_mark_put(skb, &xp->mark)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	return nlmsg_end(skb, nlh);
+}
+
+/*
+ * Acquire callback of the netlink key manager: build an XFRM_MSG_ACQUIRE
+ * message and multicast it to XFRMNLGRP_ACQUIRE.
+ *
+ * The buffer is sized by xfrm_acquire_msgsize(); failing to build the
+ * message into it is treated as a fatal sizing bug (BUG()).
+ *
+ * Returns the nlmsg_multicast() result, or -ENOMEM on allocation failure.
+ */
+static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
+			     struct xfrm_policy *xp, int dir)
+{
+	struct net *net = xs_net(x);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(xfrm_acquire_msgsize(x, xp), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	err = build_acquire(skb, x, xt, xp, dir);
+	if (err < 0)
+		BUG();
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC);
+}
+
+/*
+ * Compile a per-socket policy from a socket-option buffer.
+ *
+ * User gives us a struct xfrm_userpolicy_info followed by an array of 0
+ * or more struct xfrm_user_tmpl templates.
+ *
+ * @sk:   socket the option is being set on; its family selects which
+ *        option value is acceptable
+ * @opt:  must be IP_XFRM_POLICY for AF_INET and IPV6_XFRM_POLICY for
+ *        AF_INET6 (the latter only when IPv6 support is configured)
+ * @data: the user-supplied buffer described above
+ * @len:  total length of @data in bytes
+ * @dir:  out parameter, see below
+ *
+ * Returns the newly allocated policy with *@dir set to the policy
+ * direction, or NULL with *@dir set to a negative errno: -EOPNOTSUPP for
+ * a mismatched option, -ENOBUFS if allocation fails, -EINVAL otherwise.
+ */
+static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
+					       u8 *data, int len, int *dir)
+{
+	struct net *net = sock_net(sk);
+	struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data;
+	struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1);
+	struct xfrm_policy *xp;
+	int nr;
+
+	switch (sk->sk_family) {
+	case AF_INET:
+		if (opt != IP_XFRM_POLICY) {
+			*dir = -EOPNOTSUPP;
+			return NULL;
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		if (opt != IPV6_XFRM_POLICY) {
+			*dir = -EOPNOTSUPP;
+			return NULL;
+		}
+		break;
+#endif
+	default:
+		*dir = -EINVAL;
+		return NULL;
+	}
+
+	/* Every validation failure below reports -EINVAL. */
+	*dir = -EINVAL;
+
+	/*
+	 * len is signed but sizeof() is not: test the sign explicitly so a
+	 * negative length cannot be promoted to a huge unsigned value and
+	 * slip past the minimum-size check.
+	 */
+	if (len < 0 || (size_t)len < sizeof(*p) ||
+	    verify_newpolicy_info(p))
+		return NULL;
+
+	/* Whole templates that fit after the fixed header. */
+	nr = ((len - sizeof(*p)) / sizeof(*ut));
+	if (validate_tmpl(nr, ut, p->sel.family))
+		return NULL;
+
+	if (p->dir > XFRM_POLICY_OUT)
+		return NULL;
+
+	xp = xfrm_policy_alloc(net, GFP_ATOMIC);
+	if (xp == NULL) {
+		*dir = -ENOBUFS;
+		return NULL;
+	}
+
+	copy_from_user_policy(xp, p);
+	xp->type = XFRM_POLICY_TYPE_MAIN;
+	copy_templates(xp, ut, nr);
+
+	*dir = p->dir;
+
+	return xp;
+}
+
+/*
+ * Payload size for an XFRM_MSG_POLEXPIRE message: the aligned header plus
+ * the policy's templates, its security context, a mark and the
+ * policy-type attribute.
+ */
+static inline size_t xfrm_polexpire_msgsize(struct xfrm_policy *xp)
+{
+	size_t size = NLMSG_ALIGN(sizeof(struct xfrm_user_polexpire));
+
+	size += nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	size += nla_total_size(xfrm_user_sec_ctx_size(xp->security));
+	size += nla_total_size(sizeof(struct xfrm_mark));
+	size += userpolicy_type_attrsize();
+
+	return size;
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_POLEXPIRE message for policy @xp in
+ * direction @dir.  The "hard" flag is c->data.hard normalised to 0/1.
+ *
+ * Returns the nlmsg_end() result, or -EMSGSIZE if the header or any
+ * attribute did not fit (the partial message is cancelled).
+ */
+static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
+			   int dir, const struct km_event *c)
+{
+	int hard = c->data.hard;
+	struct xfrm_user_polexpire *expire;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, c->pid, 0, XFRM_MSG_POLEXPIRE, sizeof(*expire), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	expire = nlmsg_data(nlh);
+	copy_to_user_policy(xp, &expire->pol, dir);
+
+	/* Attributes, in order; the first failure cancels the message. */
+	if (copy_to_user_tmpl(xp, skb) < 0 ||
+	    copy_to_user_sec_ctx(xp, skb) ||
+	    copy_to_user_policy_type(xp->type, skb) < 0 ||
+	    xfrm_mark_put(skb, &xp->mark)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
+
+	expire->hard = !!hard;
+
+	return nlmsg_end(skb, nlh);
+}
+
+/*
+ * Multicast a policy expiry notification for @xp to XFRMNLGRP_EXPIRE.
+ *
+ * The buffer is sized by xfrm_polexpire_msgsize(); failing to build the
+ * message into it is treated as a fatal sizing bug (BUG()).
+ *
+ * Returns the nlmsg_multicast() result, or -ENOMEM on allocation failure.
+ */
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+	struct net *net = xp_net(xp);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(xfrm_polexpire_msgsize(xp), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	err = build_polexpire(skb, xp, dir, c);
+	if (err < 0)
+		BUG();
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
+}
+
+/*
+ * Multicast a policy add/update/delete notification to XFRMNLGRP_POLICY.
+ *
+ * For XFRM_MSG_DELPOLICY the message header is a struct
+ * xfrm_userpolicy_id (identified by index when c->data.byid is set, by
+ * selector otherwise) and the full struct xfrm_userpolicy_info travels in
+ * an XFRMA_POLICY attribute; for every other event the
+ * xfrm_userpolicy_info is the message header itself.
+ *
+ * Returns the nlmsg_multicast() result, -ENOMEM if no buffer could be
+ * allocated, or -1 if the message did not fit.
+ */
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+	const int is_delete = (c->event == XFRM_MSG_DELPOLICY);
+	struct net *net = xp_net(xp);
+	struct xfrm_userpolicy_info *info;
+	struct xfrm_userpolicy_id *id;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	int headlen = sizeof(*info);
+	int len;
+
+	/* Size the buffer: templates, optional XFRMA_POLICY, type, mark. */
+	len = nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	if (is_delete) {
+		len += nla_total_size(headlen);
+		headlen = sizeof(*id);
+	}
+	len += userpolicy_type_attrsize();
+	len += nla_total_size(sizeof(struct xfrm_mark));
+	len += NLMSG_ALIGN(headlen);
+
+	skb = nlmsg_new(len, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(skb, c->pid, c->seq, c->event, headlen, 0);
+	if (!nlh)
+		goto fail;
+
+	if (is_delete) {
+		struct nlattr *attr;
+
+		id = nlmsg_data(nlh);
+		memset(id, 0, sizeof(*id));
+		id->dir = dir;
+		if (c->data.byid)
+			id->index = xp->index;
+		else
+			memcpy(&id->sel, &xp->selector, sizeof(id->sel));
+
+		attr = nla_reserve(skb, XFRMA_POLICY, sizeof(*info));
+		if (!attr)
+			goto fail;
+
+		info = nla_data(attr);
+	} else {
+		info = nlmsg_data(nlh);
+	}
+
+	copy_to_user_policy(xp, info, dir);
+
+	if (copy_to_user_tmpl(xp, skb) < 0 ||
+	    copy_to_user_policy_type(xp->type, skb) < 0 ||
+	    xfrm_mark_put(skb, &xp->mark))
+		goto fail;
+
+	nlmsg_end(skb, nlh);
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
+
+fail:
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ * Multicast an XFRM_MSG_FLUSHPOLICY notification to XFRMNLGRP_POLICY.
+ *
+ * The message has an empty header; the flushed policy type
+ * (c->data.type) is added through copy_to_user_policy_type().
+ *
+ * Returns the nlmsg_multicast() result, -ENOMEM on allocation failure, or
+ * -1 if the message could not be built.
+ */
+static int xfrm_notify_policy_flush(const struct km_event *c)
+{
+	struct net *net = c->net;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+
+	skb = nlmsg_new(userpolicy_type_attrsize(), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0, 0);
+	if (nlh && copy_to_user_policy_type(c->data.type, skb) >= 0) {
+		nlmsg_end(skb, nlh);
+		return nlmsg_multicast(net->xfrm.nlsk, skb, 0,
+				       XFRMNLGRP_POLICY, GFP_ATOMIC);
+	}
+
+	kfree_skb(skb);
+	return -1;
+}
+
+/*
+ * Policy-event entry point of the netlink key manager: route the event
+ * in @c to the matching notifier.  Unknown events are logged and
+ * reported as success (0).
+ */
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
+{
+	int rc = 0;
+
+	switch (c->event) {
+	case XFRM_MSG_NEWPOLICY:
+	case XFRM_MSG_UPDPOLICY:
+	case XFRM_MSG_DELPOLICY:
+		rc = xfrm_notify_policy(xp, dir, c);
+		break;
+	case XFRM_MSG_FLUSHPOLICY:
+		rc = xfrm_notify_policy_flush(c);
+		break;
+	case XFRM_MSG_POLEXPIRE:
+		rc = xfrm_exp_policy_notify(xp, dir, c);
+		break;
+	default:
+		printk(KERN_NOTICE "xfrm_user: Unknown Policy event %d\n",
+		       c->event);
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Payload size for an XFRM_MSG_REPORT message.
+ *
+ * Besides the aligned xfrm_user_report header this budgets one
+ * xfrm_address_t attribute: build_report() appends XFRMA_COADDR whenever
+ * it is given an address, and xfrm_send_report() BUG()s if the message
+ * does not fit, so the worst case must be covered here rather than left
+ * to allocator slack.
+ */
+static inline size_t xfrm_report_msgsize(void)
+{
+	return NLMSG_ALIGN(sizeof(struct xfrm_user_report))
+	       + nla_total_size(sizeof(xfrm_address_t));
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_REPORT message carrying @proto and a copy of
+ * @sel.  When @addr is non-NULL it is appended as an XFRMA_COADDR
+ * attribute.
+ *
+ * Returns the nlmsg_end() result, or -EMSGSIZE if the header or the
+ * attribute did not fit (a partially built message is cancelled).
+ */
+static int build_report(struct sk_buff *skb, u8 proto,
+			struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+	struct xfrm_user_report *report;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*report), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	report = nlmsg_data(nlh);
+	report->proto = proto;
+	memcpy(&report->sel, sel, sizeof(report->sel));
+
+	/* NLA_PUT jumps to nla_put_failure when the attribute does not fit. */
+	if (addr != NULL)
+		NLA_PUT(skb, XFRMA_COADDR, sizeof(*addr), addr);
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/*
+ * Report callback of the netlink key manager: build an XFRM_MSG_REPORT
+ * message and multicast it to XFRMNLGRP_REPORT in namespace @net.
+ *
+ * Failing to build the message into the freshly sized buffer is treated
+ * as a fatal sizing bug (BUG()).
+ *
+ * Returns the nlmsg_multicast() result, or -ENOMEM on allocation failure.
+ */
+static int xfrm_send_report(struct net *net, u8 proto,
+			    struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+	struct sk_buff *skb = nlmsg_new(xfrm_report_msgsize(), GFP_ATOMIC);
+	int err;
+
+	if (!skb)
+		return -ENOMEM;
+
+	err = build_report(skb, proto, sel, addr);
+	if (err < 0)
+		BUG();
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_REPORT, GFP_ATOMIC);
+}
+
+/*
+ * Payload size for an XFRM_MSG_MAPPING message: just the aligned
+ * xfrm_user_mapping header, no attributes.
+ */
+static inline size_t xfrm_mapping_msgsize(void)
+{
+	size_t payload = sizeof(struct xfrm_user_mapping);
+
+	return NLMSG_ALIGN(payload);
+}
+
+/*
+ * Fill @skb with an XFRM_MSG_MAPPING message for state @x.
+ *
+ * The message identifies the SA (daddr/spi/family/proto), gives its
+ * request id, and pairs the current source address and encapsulation
+ * source port with the new ones (@new_saddr, @new_sport).  x->encap is
+ * dereferenced unconditionally, so the caller must ensure it is set.
+ *
+ * Returns the nlmsg_end() result, or -EMSGSIZE if the header did not fit.
+ */
+static int build_mapping(struct sk_buff *skb, struct xfrm_state *x,
+			 xfrm_address_t *new_saddr, __be16 new_sport)
+{
+	struct xfrm_user_mapping *map;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*map), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	map = nlmsg_data(nlh);
+
+	/* Which SA this is about. */
+	memcpy(&map->id.daddr, &x->id.daddr, sizeof(map->id.daddr));
+	map->id.spi = x->id.spi;
+	map->id.family = x->props.family;
+	map->id.proto = x->id.proto;
+
+	/* New versus old source address and port. */
+	memcpy(&map->new_saddr, new_saddr, sizeof(map->new_saddr));
+	memcpy(&map->old_saddr, &x->props.saddr, sizeof(map->old_saddr));
+	map->new_sport = new_sport;
+	map->old_sport = x->encap->encap_sport;
+
+	map->reqid = x->props.reqid;
+
+	return nlmsg_end(skb, nlh);
+}
+
+/*
+ * New-mapping callback of the netlink key manager: multicast an
+ * XFRM_MSG_MAPPING message for @x to XFRMNLGRP_MAPPING.
+ *
+ * Only ESP states that carry an encapsulation template are accepted;
+ * anything else yields -EINVAL.  Failing to build the message into the
+ * freshly sized buffer is treated as a fatal sizing bug (BUG()).
+ *
+ * Returns the nlmsg_multicast() result, -EINVAL as described above, or
+ * -ENOMEM on allocation failure.
+ */
+static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
+			     __be16 sport)
+{
+	struct net *net = xs_net(x);
+	struct sk_buff *skb;
+	int err;
+
+	if (x->id.proto != IPPROTO_ESP || !x->encap)
+		return -EINVAL;
+
+	skb = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	err = build_mapping(skb, x, ipaddr, sport);
+	if (err < 0)
+		BUG();
+
+	return nlmsg_multicast(net->xfrm.nlsk, skb, 0, XFRMNLGRP_MAPPING, GFP_ATOMIC);
+}
+
+/*
+ * Key-manager operations of the netlink interface.  The table is
+ * registered with xfrm_register_km() in xfrm_user_init() and removed with
+ * xfrm_unregister_km() in xfrm_user_exit().
+ */
+static struct xfrm_mgr netlink_mgr = {
+	.id		= "netlink",
+	.notify		= xfrm_send_state_notify,
+	.acquire	= xfrm_send_acquire,
+	.compile_policy	= xfrm_compile_policy,
+	.notify_policy	= xfrm_send_policy_notify,
+	.report		= xfrm_send_report,
+	.migrate	= xfrm_send_migrate,
+	.new_mapping	= xfrm_send_mapping,
+};
+
+/*
+ * Per-namespace init: create the NETLINK_XFRM kernel socket for @net.
+ *
+ * The socket is stored twice: in net->xfrm.nlsk_stash, which is never
+ * cleared and is what the exit path releases, and in net->xfrm.nlsk,
+ * which is published with rcu_assign_pointer().
+ *
+ * Returns 0 on success or -ENOMEM if the socket could not be created.
+ */
+static int __net_init xfrm_user_net_init(struct net *net)
+{
+	struct sock *sk = netlink_kernel_create(net, NETLINK_XFRM,
+						XFRMNLGRP_MAX,
+						xfrm_netlink_rcv, NULL,
+						THIS_MODULE);
+
+	if (!sk)
+		return -ENOMEM;
+
+	net->xfrm.nlsk_stash = sk; /* Don't set to NULL */
+	rcu_assign_pointer(net->xfrm.nlsk, sk);
+
+	return 0;
+}
+
+/*
+ * Batched per-namespace exit for every namespace on @net_exit_list.
+ *
+ * The published socket pointers are cleared first, synchronize_net() is
+ * called once for the whole batch, and only then are the sockets
+ * released through the stashed pointers -- presumably so that readers
+ * which already fetched net->xfrm.nlsk have finished before the socket
+ * goes away.
+ */
+static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
+{
+	struct net *net;
+
+	/* Pass 1: unpublish the socket of every exiting namespace. */
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		rcu_assign_pointer(net->xfrm.nlsk, NULL);
+	}
+
+	synchronize_net();
+
+	/* Pass 2: release the sockets via the stash. */
+	list_for_each_entry(net, net_exit_list, exit_list) {
+		netlink_kernel_release(net->xfrm.nlsk_stash);
+	}
+}
+
+/*
+ * Per-network-namespace hooks: one socket is created per namespace on
+ * init, and exiting namespaces are torn down in batches.
+ */
+static struct pernet_operations xfrm_user_net_ops = {
+	.init	    = xfrm_user_net_init,
+	.exit_batch = xfrm_user_net_exit,
+};
+
+/*
+ * Module init: register the per-namespace socket hooks, then the netlink
+ * key manager.  If the key manager cannot be registered the
+ * per-namespace registration is undone.
+ *
+ * Returns 0 on success or the negative error of the step that failed.
+ */
+static int __init xfrm_user_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "Initializing XFRM netlink socket\n");
+
+	err = register_pernet_subsys(&xfrm_user_net_ops);
+	if (err < 0)
+		return err;
+
+	err = xfrm_register_km(&netlink_mgr);
+	if (err >= 0)
+		return err;
+
+	/* Roll back the first step. */
+	unregister_pernet_subsys(&xfrm_user_net_ops);
+	return err;
+}
+
+/*
+ * Module exit: undo xfrm_user_init() in reverse order -- unregister the
+ * key manager first, then the per-namespace socket hooks.
+ */
+static void __exit xfrm_user_exit(void)
+{
+	xfrm_unregister_km(&netlink_mgr);
+	unregister_pernet_subsys(&xfrm_user_net_ops);
+}
+
+/* Module entry/exit points, licence and the NETLINK_XFRM protocol alias. */
+module_init(xfrm_user_init);
+module_exit(xfrm_user_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
+
author	root <root@artemis.panaceas.org>	2015-12-25 04:40:36 +0000
committer	root <root@artemis.panaceas.org>	2015-12-25 04:40:36 +0000
commit	849369d6c66d3054688672f97d31fceb8e8230fb (patch)
tree	6135abc790ca67dedbe07c39806591e70eda81ce /net
download	linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.gz linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.bz2 linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.zip